diff --git a/example/pubspec.lock b/example/pubspec.lock index c647b860..89a62f4c 100644 --- a/example/pubspec.lock +++ b/example/pubspec.lock @@ -198,10 +198,10 @@ packages: dependency: transitive description: name: matcher - sha256: dc0b7dc7651697ea4ff3e69ef44b0407ea32c487a39fff6a4004fa585e901861 + sha256: "31bd099b47c10cd1aeb55146a2d46ce0277630ecef3f7dae54ad7873f36696cd" url: "https://pub.dev" source: hosted - version: "0.12.19" + version: "0.12.20" material_color_utilities: dependency: transitive description: @@ -214,10 +214,10 @@ packages: dependency: transitive description: name: meta - sha256: "1741988757a65eb6b36abe716829688cf01910bbf91c34354ff7ec1c3de2b349" + sha256: df0c643f44ad098eb37988027a8e2b2b5a031fd3977f06bbfd3a76637e8df739 url: "https://pub.dev" source: hosted - version: "1.18.0" + version: "1.18.2" native_toolchain_cmake: dependency: transitive description: @@ -323,10 +323,10 @@ packages: dependency: transitive description: name: test_api - sha256: "8161c84903fd860b26bfdefb7963b3f0b68fee7adea0f59ef805ecca346f0c7a" + sha256: "2a122cbe059f8b610d3a5415f42e255b6c17b1f21eee1d960f31080237fb4f11" url: "https://pub.dev" source: hosted - version: "0.7.10" + version: "0.7.12" typed_data: dependency: transitive description: @@ -339,10 +339,10 @@ packages: dependency: transitive description: name: vector_math - sha256: d530bd74fea330e6e364cda7a85019c434070188383e1cd8d9777ee586914c5b + sha256: "47a1b32ee755c3fcffa33db52a7258c137f97bdb2209a1075be847809fac4ccf" url: "https://pub.dev" source: hosted - version: "2.2.0" + version: "2.3.0" vm_service: dependency: transitive description: diff --git a/lib/src/third_party/boringssl/generated_bindings.dart b/lib/src/third_party/boringssl/generated_bindings.dart index 429062dd..c3039024 100644 --- a/lib/src/third_party/boringssl/generated_bindings.dart +++ b/lib/src/third_party/boringssl/generated_bindings.dart @@ -221,6 +221,10 @@ class BoringSsl { /// BN_num_bytes returns the minimum number of bytes needed to represent 
the /// absolute value of |bn|. + /// + /// While |size_t| is the preferred type for byte counts, callers can assume that + /// |BIGNUM|s are bounded such that this value, and its corresponding bit count, + /// will always fit in |int|. int BN_num_bytes(ffi.Pointer bn) { return _BN_num_bytes(bn); } @@ -286,8 +290,9 @@ class BoringSsl { late final _BN_value_one = _BN_value_onePtr.asFunction Function()>(); - /// BORINGSSL_self_test triggers the FIPS KAT-based self tests. It returns one on - /// success and zero on error. + /// BORINGSSL_self_test triggers most of the FIPS KAT-based self tests. It + /// returns one on success and zero on error. It currently skips the SLH-DSA + /// tests, which take a really long time to run. int BORINGSSL_self_test() { return _BORINGSSL_self_test(); } @@ -394,23 +399,6 @@ class BoringSsl { late final _CBB_zero = _CBB_zeroPtr.asFunction)>(); - /// CBS_init sets |cbs| to point to |data|. It does not take ownership of - /// |data|. - void CBS_init(ffi.Pointer cbs, ffi.Pointer data, int len) { - return _CBS_init(cbs, data, len); - } - - late final _CBS_initPtr = - _lookup< - ffi.NativeFunction< - ffi.Void Function(ffi.Pointer, ffi.Pointer, ffi.Size) - > - >('CBS_init'); - late final _CBS_init = - _CBS_initPtr.asFunction< - void Function(ffi.Pointer, ffi.Pointer, int) - >(); - /// CRYPTO_memcmp returns zero iff the |len| bytes at |a| and |b| are equal. It /// takes an amount of time dependent on |len|, but independent of the contents /// of |a| and |b|. Unlike memcmp, it cannot be used to put elements into a @@ -587,7 +575,12 @@ class BoringSsl { ffi.Pointer Function(ffi.Pointer) >(); - /// EC_GROUP_free releases a reference to |group|. + /// EC_GROUP_free releases a reference to |group|, if |group| was created by + /// |EC_GROUP_new_curve_GFp|. If |group| is static, it does nothing. + /// + /// This function exists for OpenSSL compatibility, and to manage dynamic + /// |EC_GROUP|s constructed by |EC_GROUP_new_curve_GFp|. 
Callers that do not need + /// either may ignore this function. void EC_GROUP_free(ffi.Pointer group) { return _EC_GROUP_free(group); } @@ -641,14 +634,20 @@ class BoringSsl { late final _EC_GROUP_get_degree = _EC_GROUP_get_degreePtr.asFunction)>(); - /// EC_GROUP_new_by_curve_name returns a fresh EC_GROUP object for the elliptic - /// curve specified by |nid|, or NULL on unsupported NID or allocation failure. + /// EC_GROUP_new_by_curve_name returns the |EC_GROUP| object for the elliptic + /// curve specified by |nid|, or NULL on unsupported NID. For OpenSSL + /// compatibility, this function returns a non-const pointer which may be passed + /// to |EC_GROUP_free|. However, the resulting object is actually static and + /// calling |EC_GROUP_free| is optional. /// /// The supported NIDs are: - /// NID_secp224r1 (P-224), - /// NID_X9_62_prime256v1 (P-256), - /// NID_secp384r1 (P-384), - /// NID_secp521r1 (P-521) + /// - |NID_secp224r1| (P-224) + /// - |NID_X9_62_prime256v1| (P-256) + /// - |NID_secp384r1| (P-384) + /// - |NID_secp521r1| (P-521) + /// + /// Calling this function causes all four curves to be linked into the binary. + /// Prefer calling |EC_group_*| to allow the static linker to drop unused curves. /// /// If in doubt, use |NID_X9_62_prime256v1|, or see the curve25519.h header for /// more modern primitives. @@ -818,7 +817,7 @@ class BoringSsl { /// EC_KEY_set_public_key sets the public key of |key| to |pub|, by copying it. /// It returns one on success and zero otherwise. |key| must already have had a /// group configured (see |EC_KEY_set_group| and |EC_KEY_new_by_curve_name|), and - /// |pub| must also belong to that group. + /// |pub| must also belong to that group, and must not be the point at infinity. 
int EC_KEY_set_public_key( ffi.Pointer key, ffi.Pointer pub, @@ -880,8 +879,8 @@ class BoringSsl { _EC_POINT_freePtr.asFunction)>(); /// EC_POINT_get_affine_coordinates_GFp sets |x| and |y| to the affine value of - /// |point| using |ctx|, if it's not NULL. It returns one on success and zero - /// otherwise. + /// |point|. It returns one on success and zero otherwise. |ctx| is ignored and + /// may be NULL. /// /// Either |x| or |y| may be NULL to skip computing that coordinate. This is /// slightly faster in the common case where only the x-coordinate is needed. @@ -936,9 +935,9 @@ class BoringSsl { >(); /// EC_POINT_oct2point sets |point| from |len| bytes of X9.62 format - /// serialisation in |buf|. It returns one on success and zero on error. The - /// |ctx| argument may be used if not NULL. It's considered an error if |buf| - /// does not represent a point on the curve. + /// serialisation in |buf|. It returns one on success and zero on error. |ctx| + /// may be NULL. It's considered an error if |buf| does not represent a point on + /// the curve. int EC_POINT_oct2point( ffi.Pointer group, ffi.Pointer point, @@ -973,7 +972,8 @@ class BoringSsl { >(); /// EC_POINT_point2cbb behaves like |EC_POINT_point2oct| but appends the - /// serialised point to |cbb|. It returns one on success and zero on error. + /// serialised point to |cbb|. It returns one on success and zero on error. |ctx| + /// is ignored and may be NULL. int EC_POINT_point2cbb( ffi.Pointer out, ffi.Pointer group, @@ -1395,8 +1395,11 @@ class BoringSsl { int Function(ffi.Pointer) >(); - /// EVP_CipherFinal_ex calls either |EVP_EncryptFinal_ex| or - /// |EVP_DecryptFinal_ex| depending on how |ctx| has been setup. + /// EVP_CipherFinal_ex does the same as |EVP_CipherFinal_ex2|, except that no + /// output size is given and thus no bounds checking is performed. + /// + /// WARNING: This function does not check bounds on |out|, and correctly sizing + /// the output buffer is difficult. 
Use |EVP_CipherFinal_ex2| instead. int EVP_CipherFinal_ex( ffi.Pointer ctx, ffi.Pointer out, @@ -1467,8 +1470,15 @@ class BoringSsl { ) >(); - /// EVP_CipherUpdate calls either |EVP_EncryptUpdate| or |EVP_DecryptUpdate| - /// depending on how |ctx| has been setup. + /// EVP_CipherUpdate does the same as |EVP_CipherUpdate_ex|, except that no + /// output size is given and thus no bounds checking is performed. + /// + /// Additionally, if |ctx| is an AEAD cipher, e.g. |EVP_aes_128_gcm|, and |out| + /// is NULL, this function instead behaves like |EVP_CipherUpdateAAD|. + /// + /// WARNING: This function does not check bounds on |out|, and correctly sizing + /// the output buffer is difficult. Use |EVP_CipherUpdate_ex| or + /// |EVP_CipherUpdateAAD| instead. int EVP_CipherUpdate( ffi.Pointer ctx, ffi.Pointer out, @@ -1693,8 +1703,8 @@ class BoringSsl { /// |EVP_DigestVerifyUpdate|. It returns one on success and zero otherwise. /// /// This function performs streaming signature verification and will fail for - /// signature algorithms which do not support this. Use |EVP_PKEY_verify_message| - /// for a single-shot verification. + /// signature algorithms which do not support this. Use |EVP_DigestVerify| for a + /// single-shot verification. int EVP_DigestVerifyFinal( ffi.Pointer ctx, ffi.Pointer sig, @@ -1769,8 +1779,8 @@ class BoringSsl { /// will be verified by |EVP_DigestVerifyFinal|. It returns one. /// /// This function performs streaming signature verification and will fail for - /// signature algorithms which do not support this. Use |EVP_PKEY_verify_message| - /// for a single-shot verification. + /// signature algorithms which do not support this. Use |EVP_DigestVerify| for a + /// single-shot verification. int EVP_DigestVerifyUpdate( ffi.Pointer ctx, ffi.Pointer data, @@ -1977,16 +1987,16 @@ class BoringSsl { >(); /// EVP_PKEY_CTX_set_rsa_pss_saltlen sets the length of the salt in a PSS-padded - /// signature. 
A value of -1 cause the salt to be the same length as the digest - /// in the signature. A value of -2 causes the salt to be the maximum length - /// that will fit when signing and recovered from the signature when verifying. - /// Otherwise the value gives the size of the salt in bytes. + /// signature. A value of |RSA_PSS_SALTLEN_DIGEST| causes the salt to be the same + /// length as the digest in the signature. A value of |RSA_PSS_SALTLEN_AUTO| + /// causes the salt to be the maximum length that will fit when signing and + /// recovered from the signature when verifying. Otherwise the value gives the + /// size of the salt in bytes. /// - /// If unsure, use -1. + /// If unsure, use |RSA_PSS_SALTLEN_DIGEST|, which is the default. Note this + /// differs from OpenSSL, which defaults to |RSA_PSS_SALTLEN_AUTO|. /// /// Returns one on success or zero on error. - /// - /// TODO(davidben): The default is currently -2. Switch it to -1. int EVP_PKEY_CTX_set_rsa_pss_saltlen( ffi.Pointer ctx, int salt_len, @@ -2123,8 +2133,8 @@ class BoringSsl { int Function(ffi.Pointer) >(); - /// EVP_PKEY_free frees all data referenced by |pkey| and then frees |pkey| - /// itself. + /// EVP_PKEY_free decrements the reference count of |pkey| and frees it if the + /// reference count drops to zero. void EVP_PKEY_free(ffi.Pointer pkey) { return _EVP_PKEY_free(pkey); } @@ -2163,7 +2173,11 @@ class BoringSsl { >(); /// EVP_PKEY_id returns the type of |pkey|, which is one of the |EVP_PKEY_*| - /// values. + /// values above. These type values generally correspond to the algorithm OID, + /// but not the parameters, of a SubjectPublicKeyInfo (RFC 5280) or + /// PrivateKeyInfo (RFC 5208) AlgorithmIdentifier. Algorithm parameters can be + /// inspected with algorithm-specific accessors, e.g. + /// |EVP_PKEY_get_ec_curve_nid|. 
int EVP_PKEY_id(ffi.Pointer pkey) { return _EVP_PKEY_id(pkey); } @@ -2206,9 +2220,9 @@ class BoringSsl { int Function(ffi.Pointer, ffi.Pointer) >(); - /// Getting and setting concrete public key types. + /// Getting and setting concrete key types. /// - /// The following functions get and set the underlying public key in an + /// The following functions get and set the underlying key representation in an /// |EVP_PKEY| object. The |set1| functions take an additional reference to the /// underlying key and return one on success or zero if |key| is NULL. The /// |assign| functions adopt the caller's reference and return one on success or @@ -2220,6 +2234,18 @@ class BoringSsl { /// non-mutating for thread-safety purposes, but mutating functions on the /// returned lower-level objects are considered to also mutate the |EVP_PKEY| and /// may not be called concurrently with other operations on the |EVP_PKEY|. + /// + /// WARNING: Matching OpenSSL, the RSA functions behave non-uniformly. + /// |EVP_PKEY_set1_RSA| and |EVP_PKEY_assign_RSA| construct an |EVP_PKEY_RSA| + /// key, while the |EVP_PKEY_get0_RSA| and |EVP_PKEY_get1_RSA| will return + /// non-NULL for both |EVP_PKEY_RSA| and |EVP_PKEY_RSA_PSS|. + /// + /// This means callers risk misusing a key if they assume a non-NULL return from + /// |EVP_PKEY_get0_RSA| or |EVP_PKEY_get1_RSA| implies |EVP_PKEY_RSA|. Prefer + /// |EVP_PKEY_id| to check the type of a key. To reduce this risk, BoringSSL does + /// not make |EVP_PKEY_RSA_PSS| available by default, only when callers opt in + /// via |EVP_pkey_rsa_pss_sha256|. This differs from upstream OpenSSL, where + /// callers are exposed to |EVP_PKEY_RSA_PSS| by default. int EVP_PKEY_set1_RSA(ffi.Pointer pkey, ffi.Pointer key) { return _EVP_PKEY_set1_RSA(pkey, key); } @@ -2237,7 +2263,21 @@ class BoringSsl { /// EVP_PKEY_set_type sets the type of |pkey| to |type|. It returns one if /// successful or zero if the |type| argument is not one of the |EVP_PKEY_*| - /// values. 
If |pkey| is NULL, it simply reports whether the type is known. + /// values supported for use with this function. If |pkey| is NULL, it simply + /// reports whether the type is known. + /// + /// There are very few cases where this function is useful. Changing |pkey|'s + /// type clears any previously stored keys, so there is no benefit to loading a + /// key and then changing its type. Although |pkey| is left with a type + /// configured, it has no key, and functions which set a key, such as + /// |EVP_PKEY_set1_RSA|, will configure a type anyway. If writing unit tests that + /// are only sensitive to the type of a key, it is preferable to construct a real + /// key, so that tests are more representative of production code. + /// + /// The only API pattern which requires this function is + /// |EVP_PKEY_set1_tls_encodedpoint| with X25519, which requires a half-empty + /// |EVP_PKEY| that was first configured with |EVP_PKEY_X25519|. Currently, all + /// other values of |type| will result in an error. int EVP_PKEY_set_type(ffi.Pointer pkey, int type) { return _EVP_PKEY_set_type(pkey, type); } @@ -2371,15 +2411,26 @@ class BoringSsl { /// 5208) from |cbs| and advances |cbs|. It returns a newly-allocated |EVP_PKEY| /// or NULL on error. /// - /// The caller must check the type of the parsed private key to ensure it is - /// suitable and validate other desired key properties such as RSA modulus size - /// or EC curve. In particular, RSA private key operations scale cubicly, so + /// Prefer |EVP_PKEY_from_private_key_info| instead. This function has + /// several pitfalls: + /// + /// Callers are expected to handle trailing data returned from |cbs|, making more + /// common cases error-prone. + /// + /// There is also no way to pass in supported algorithms. This function instead + /// supports some default set of algorithms. Future versions of BoringSSL may add + /// to this list, based on the needs of the other callers. 
Conversely, some + /// algorithms may be intentionally omitted, if they cause too much risk to + /// existing callers. + /// + /// This means the caller must check the type of the parsed private key to ensure + /// it is suitable and validate other desired key properties such as RSA modulus + /// size or EC curve. In particular, RSA private key operations scale cubically, so /// applications accepting RSA private keys from external sources may need to /// bound key sizes (use |EVP_PKEY_bits| or |RSA_bits|) to avoid a DoS vector. /// - /// A PrivateKeyInfo ends with an optional set of attributes. These are not - /// processed and so this function will silently ignore any trailing data in the - /// structure. + /// A PrivateKeyInfo ends with an optional set of attributes. These are silently + /// ignored. ffi.Pointer EVP_parse_private_key(ffi.Pointer cbs) { return _EVP_parse_private_key(cbs); } @@ -2395,12 +2446,23 @@ class BoringSsl { /// EVP_parse_public_key decodes a DER-encoded SubjectPublicKeyInfo structure /// (RFC 5280) from |cbs| and advances |cbs|. It returns a newly-allocated - /// |EVP_PKEY| or NULL on error. If the key is an EC key, the curve is guaranteed - /// to be set. + /// |EVP_PKEY| or NULL on error. /// - /// The caller must check the type of the parsed public key to ensure it is - /// suitable and validate other desired key properties such as RSA modulus size - /// or EC curve. + /// Prefer |EVP_PKEY_from_subject_public_key_info| instead. This function has + /// several pitfalls: + /// + /// Callers are expected to handle trailing data returned from |cbs|, making more + /// common cases error-prone. + /// + /// There is also no way to pass in supported algorithms. This function instead + /// supports some default set of algorithms. Future versions of BoringSSL may add + /// to this list, based on the needs of the other callers. Conversely, some + /// algorithms may be intentionally omitted, if they cause too much risk to + /// existing callers. 
+ /// + /// This means callers must check the type of the parsed public key to ensure it + /// is suitable and validate other desired key properties such as RSA modulus + /// size or EC curve. ffi.Pointer EVP_parse_public_key(ffi.Pointer cbs) { return _EVP_parse_public_key(cbs); } @@ -2539,10 +2601,10 @@ class BoringSsl { _HMAC_CTX_newPtr.asFunction Function()>(); /// HMAC_Final completes the HMAC operation in |ctx| and writes the result to - /// |out| and the sets |*out_len| to the length of the result. On entry, |out| - /// must contain at least |HMAC_size| bytes of space. An output size of - /// |EVP_MAX_MD_SIZE| will always be large enough. It returns one on success or - /// zero on allocation failure. + /// |out|. If |out_len| is not |NULL| then it writes the length of the result to + /// |*out_len|. On entry, |out| must contain at least |HMAC_size| bytes of + /// space. An output size of |EVP_MAX_MD_SIZE| will always be large enough. It + /// returns one on success or zero on allocation failure. int HMAC_Final( ffi.Pointer ctx, ffi.Pointer out, @@ -2651,7 +2713,8 @@ class BoringSsl { _HMAC_sizePtr.asFunction)>(); /// OPENSSL_free does nothing if |ptr| is NULL. Otherwise it zeros out the - /// memory allocated at |ptr| and frees it. + /// memory allocated at |ptr| and frees it along with the private data. + /// It must only be used on |ptr| values obtained from |OPENSSL_malloc|. void OPENSSL_free(ffi.Pointer ptr) { return _OPENSSL_free(ptr); } @@ -2663,7 +2726,10 @@ class BoringSsl { late final _OPENSSL_free = _OPENSSL_freePtr.asFunction)>(); - /// OPENSSL_malloc acts like a regular |malloc|. + /// OPENSSL_malloc is similar to a regular |malloc|, but allocates additional + /// private data. The resulting pointer must be freed with |OPENSSL_free|. In + /// the case of a malloc failure, prior to returning NULL |OPENSSL_malloc| will + /// push |ERR_R_MALLOC_FAILURE| onto the openssl error stack. 
ffi.Pointer OPENSSL_malloc(int size) { return _OPENSSL_malloc(size); } @@ -2676,7 +2742,8 @@ class BoringSsl { _OPENSSL_mallocPtr.asFunction Function(int)>(); /// OPENSSL_memdup returns an allocated, duplicate of |size| bytes from |data| or - /// NULL on allocation failure. + /// NULL on allocation failure. The memory allocated must be freed with + /// |OPENSSL_free|. ffi.Pointer OPENSSL_memdup(ffi.Pointer data, int size) { return _OPENSSL_memdup(data, size); } @@ -2725,7 +2792,7 @@ class BoringSsl { ffi.Size, ffi.Pointer, ffi.Size, - ffi.UnsignedInt, + ffi.Uint32, ffi.Pointer, ffi.Size, ffi.Pointer, @@ -2746,7 +2813,8 @@ class BoringSsl { ) >(); - /// RAND_bytes writes |len| bytes of random data to |buf| and returns one. + /// RAND_bytes writes |len| bytes of random data to |buf| and returns one. In the + /// event that sufficient random data can not be obtained, |abort| is called. int RAND_bytes(ffi.Pointer buf, int len) { return _RAND_bytes(buf, len); } @@ -2934,7 +3002,8 @@ class BoringSsl { ) >(); - /// RSA_new returns a new, empty |RSA| object or NULL on error. + /// RSA_new returns a new, empty |RSA| object or NULL on error. Prefer using + /// |RSA_new_public_key| or |RSA_new_private_key| to import an RSA key. ffi.Pointer RSA_new() { return _RSA_new(); } @@ -3070,18 +3139,6 @@ typedef BN_ULONG = ffi.Uint64; typedef DartBN_ULONG = int; typedef CBB = cbb_st; typedef CBS = cbs_st; - -/// CRYPTO_refcount_t is the type of a reference count. -/// -/// Since some platforms use C11 atomics to access this, it should have the -/// _Atomic qualifier. However, this header is included by C++ programs as well -/// as C code that might not set -std=c11. So, in practice, it's not possible to -/// do that. Instead we statically assert that the size and native alignment of -/// a plain uint32_t and an _Atomic uint32_t are equal in refcount_c11.c. 
-typedef CRYPTO_refcount_t = ffi.Uint32; -typedef DartCRYPTO_refcount_t = int; -typedef DH = dh_st; -typedef DSA = dsa_st; typedef ECDSA_SIG = ecdsa_sig_st; typedef EC_GROUP = ec_group_st; typedef EC_KEY = ec_key_st; @@ -3100,7 +3157,6 @@ typedef EVP_CIPHER_CTX = evp_cipher_ctx_st; typedef EVP_MD = env_md_st; typedef EVP_MD_CTX = env_md_ctx_st; typedef EVP_PKEY = evp_pkey_st; -typedef EVP_PKEY_ASN1_METHOD = evp_pkey_asn1_method_st; typedef EVP_PKEY_CTX = evp_pkey_ctx_st; const int EVP_PKEY_EC = 408; @@ -3125,16 +3181,13 @@ const int RSA_PKCS1_PADDING = 1; const int RSA_PKCS1_PSS_PADDING = 6; -final class UnnamedUnion1 extends ffi.Union { - external ffi.Pointer ptr; - - external ffi.Pointer rsa; - - external ffi.Pointer dsa; +/// md_data contains the hash-specific context. +final class UnnamedUnion2 extends ffi.Union { + @ffi.Array.multi([208]) + external ffi.Array md_data; - external ffi.Pointer dh; - - external ffi.Pointer ec; + @ffi.Uint64() + external int alignment; } final class bignum_ctx extends ffi.Opaque {} @@ -3175,8 +3228,6 @@ final class bignum_st extends ffi.Struct { external int flags; } -final class bn_blinding_st extends ffi.Opaque {} - /// bn_gencb_st, or |BN_GENCB|, holds a callback function that is used by /// generation functions that can take a very long time to complete. Use /// |BN_GENCB_set| to initialise a |BN_GENCB| structure. @@ -3184,11 +3235,11 @@ final class bn_blinding_st extends ffi.Opaque {} /// The callback receives the address of that |BN_GENCB| structure as its last /// argument and the user is free to put an arbitrary pointer in |arg|. The other /// arguments are set as follows: -/// event=BN_GENCB_GENERATED, n=i: after generating the i'th possible prime +/// - event=BN_GENCB_GENERATED, n=i: after generating the i'th possible prime /// number. -/// event=BN_GENCB_PRIME_TEST, n=-1: when finished trial division primality +/// - event=BN_GENCB_PRIME_TEST, n=-1: when finished trial division primality /// checks. 
-/// event=BN_GENCB_PRIME_TEST, n=i: when the i'th primality test has finished. +/// - event=BN_GENCB_PRIME_TEST, n=i: when the i'th primality test has finished. /// /// The callback can return zero to abort the generation progress or one to /// allow it to continue. @@ -3197,20 +3248,6 @@ final class bn_blinding_st extends ffi.Opaque {} /// BN_GENCB argument and may call the function with other argument values. final class bn_gencb_st extends ffi.Opaque {} -final class bn_mont_ctx_st extends ffi.Struct { - /// RR is R^2, reduced modulo |N|. It is used to convert to Montgomery form. It - /// is guaranteed to have the same width as |N|. - external BIGNUM RR; - - /// N is the modulus. It is always stored in minimal form, so |N.width| - /// determines R. - external BIGNUM N; - - /// least significant words of (R*Ri-1)/N - @ffi.Array.multi([2]) - external ffi.Array n0; -} - /// CRYPTO ByteBuilder. /// /// |CBB| objects allow one to build length-prefixed serialisations. A |CBB| @@ -3225,52 +3262,11 @@ final class bn_mont_ctx_st extends ffi.Struct { /// If one needs to force a length prefix to be written out because a |CBB| is /// going out of scope, use |CBB_flush|. If an operation on a |CBB| fails, it is /// in an undefined state and must not be used except to call |CBB_cleanup|. -final class cbb_buffer_st extends ffi.Struct { - external ffi.Pointer buf; +final class cbb_buffer_st extends ffi.Opaque {} - /// The number of valid bytes. - @ffi.Size() - external int len; +final class cbb_child_st extends ffi.Opaque {} - /// The size of buf. - @ffi.Size() - external int cap; - - /// One iff |buf| is owned by this object. If not then |buf| - /// cannot be resized. - @ffi.Char() - external int can_resize; - - /// One iff there was an error writing to this CBB. All future - /// operations will fail. - @ffi.Char() - external int error; -} - -final class cbb_st extends ffi.Struct { - external ffi.Pointer base; - - /// child points to a child CBB if a length-prefix is pending. 
- external ffi.Pointer child; - - /// offset is the number of bytes from the start of |base->buf| to this |CBB|'s - /// pending length prefix. - @ffi.Size() - external int offset; - - /// pending_len_len contains the number of bytes in this |CBB|'s pending - /// length-prefix, or zero if no length-prefix is pending. - @ffi.Uint8() - external int pending_len_len; - - @ffi.Char() - external int pending_is_asn1; - - /// is_child is true iff this is a child |CBB| (as opposed to a top-level - /// |CBB|). Top-level objects are valid arguments for |CBB_finish|. - @ffi.Char() - external int is_child; -} +final class cbb_st extends ffi.Opaque {} /// CRYPTO ByteString final class cbs_st extends ffi.Struct { @@ -3280,26 +3276,6 @@ final class cbs_st extends ffi.Struct { external int len; } -final class crypto_ex_data_st extends ffi.Struct { - external ffi.Pointer sk; -} - -/// On glibc, |pthread_rwlock_t| is hidden under feature flags, and we can't -/// ensure that we'll be able to get it from a public header. It's statically -/// asserted that this structure is large enough to contain a |pthread_rwlock_t| -/// by thread_pthread.c. -final class crypto_mutex_st extends ffi.Union { - @ffi.Double() - external double alignment; - - @ffi.Array.multi([56]) - external ffi.Array padding; -} - -final class dh_st extends ffi.Opaque {} - -final class dsa_st extends ffi.Opaque {} - final class ec_group_st extends ffi.Opaque {} final class ec_key_st extends ffi.Opaque {} @@ -3318,14 +3294,16 @@ final class ecdsa_sig_st extends ffi.Struct { final class engine_st extends ffi.Opaque {} +/// env_md_ctx_st is typoed ("evp" -> "env"), but the typo comes from OpenSSL +/// and some consumers forward-declare these structures so we're leaving it +/// alone. final class env_md_ctx_st extends ffi.Struct { + /// md_data contains the hash-specific context. + external UnnamedUnion2 unnamed; + /// digest is the underlying digest function, or NULL if not set. 
external ffi.Pointer digest; - /// md_data points to a block of memory that contains the hash-specific - /// context. - external ffi.Pointer md_data; - /// pctx is an opaque (at this layer) pointer to additional context that /// EVP_PKEY functions may store in this object. external ffi.Pointer pctx; @@ -3352,7 +3330,7 @@ final class evp_aead_ctx_st extends ffi.Struct { /// AEAD operations. final class evp_aead_ctx_st_state extends ffi.Union { - @ffi.Array.multi([580]) + @ffi.Array.multi([560]) external ffi.Array opaque; @ffi.Uint64() @@ -3414,32 +3392,20 @@ final class evp_cipher_ctx_st extends ffi.Struct { /// possible final block @ffi.Array.multi([32]) external ffi.Array final1; + + /// Has this structure been rendered unusable by a failure. + @ffi.Int() + external int poisoned; } final class evp_cipher_st extends ffi.Opaque {} +/// Internal constants and structures (hidden). final class evp_md_pctx_ops extends ffi.Opaque {} -final class evp_pkey_asn1_method_st extends ffi.Opaque {} - final class evp_pkey_ctx_st extends ffi.Opaque {} -/// Private structures. -final class evp_pkey_st extends ffi.Struct { - @CRYPTO_refcount_t() - external int references; - - /// type contains one of the EVP_PKEY_* values or NID_undef and determines - /// which element (if any) of the |pkey| union is valid. - @ffi.Int() - external int type; - - external UnnamedUnion1 pkey; - - /// ameth contains a pointer to a method table that contains many ASN.1 - /// methods for the key type. - external ffi.Pointer ameth; -} +final class evp_pkey_st extends ffi.Opaque {} /// Private functions final class hmac_ctx_st extends ffi.Struct { @@ -3452,17 +3418,6 @@ final class hmac_ctx_st extends ffi.Struct { external EVP_MD_CTX o_ctx; } -/// openssl_method_common_st contains the common part of all method structures. -/// This must be the first member of all method structures. -final class openssl_method_common_st extends ffi.Struct { - /// dummy – not used. 
- @ffi.Int() - external int references; - - @ffi.Char() - external int is_static; -} - /// point_conversion_form_t enumerates forms, as defined in X9.62 (ECDSA), for /// the encoding of a elliptic curve point (x,y) abstract class point_conversion_form_t { @@ -3483,100 +3438,4 @@ abstract class point_conversion_form_t { static const int POINT_CONVERSION_HYBRID = 6; } -final class rsa_meth_st extends ffi.Struct { - external openssl_method_common_st common; - - external ffi.Pointer app_data; - - external ffi.Pointer< - ffi.NativeFunction rsa)> - > - init; - - external ffi.Pointer< - ffi.NativeFunction rsa)> - > - finish; - - /// size returns the size of the RSA modulus in bytes. - external ffi.Pointer< - ffi.NativeFunction rsa)> - > - size; - - external ffi.Pointer< - ffi.NativeFunction< - ffi.Int Function( - ffi.Int type, - ffi.Pointer m, - ffi.UnsignedInt m_length, - ffi.Pointer sigret, - ffi.Pointer siglen, - ffi.Pointer rsa, - ) - > - > - sign; - - /// These functions mirror the |RSA_*| functions of the same name. - external ffi.Pointer< - ffi.NativeFunction< - ffi.Int Function( - ffi.Pointer rsa, - ffi.Pointer out_len, - ffi.Pointer out, - ffi.Size max_out, - ffi.Pointer in1, - ffi.Size in_len, - ffi.Int padding, - ) - > - > - sign_raw; - - external ffi.Pointer< - ffi.NativeFunction< - ffi.Int Function( - ffi.Pointer rsa, - ffi.Pointer out_len, - ffi.Pointer out, - ffi.Size max_out, - ffi.Pointer in1, - ffi.Size in_len, - ffi.Int padding, - ) - > - > - decrypt; - - /// private_transform takes a big-endian integer from |in|, calculates the - /// d'th power of it, modulo the RSA modulus and writes the result as a - /// big-endian integer to |out|. Both |in| and |out| are |len| bytes long and - /// |len| is always equal to |RSA_size(rsa)|. If the result of the transform - /// can be represented in fewer than |len| bytes, then |out| must be zero - /// padded on the left. - /// - /// It returns one on success and zero otherwise. 
- /// - /// RSA decrypt and sign operations will call this, thus an ENGINE might wish - /// to override it in order to avoid having to implement the padding - /// functionality demanded by those, higher level, operations. - external ffi.Pointer< - ffi.NativeFunction< - ffi.Int Function( - ffi.Pointer rsa, - ffi.Pointer out, - ffi.Pointer in1, - ffi.Size len, - ) - > - > - private_transform; - - @ffi.Int() - external int flags; -} - final class rsa_st extends ffi.Opaque {} - -final class stack_st_void extends ffi.Opaque {} diff --git a/third_party/boringssl/INCORPORATING.md b/third_party/boringssl/INCORPORATING.md new file mode 100644 index 00000000..415e2dca --- /dev/null +++ b/third_party/boringssl/INCORPORATING.md @@ -0,0 +1,121 @@ +# Incorporating BoringSSL into a project + +**Note**: if your target project is not a Google project then first read the +[main README](./README.md) about the purpose of BoringSSL. + +If you are porting BoringSSL to a new platform see +["go/boringssl-on-new-platform"](https://goto.corp.google.com/boringssl-on-new-platform) (Google +Internal) for information about porting BoringSSL to a new platform for a Google +project. + +## Which branch to use + +BoringSSL usage typically follows a +["live at head"](https://abseil.io/about/philosophy#we-recommend-that-you-choose-to-live-at-head) +model. Projects pin to whatever the current latest of BoringSSL is at the time +of update, and regularly update it to pick up new changes. + +Some systems cannot consume git revisions and expect git tags. BoringSSL tags +periodic snapshots as "releases", to meet the needs of those systems. These +versions do not represent any kind of stability or development milestone. +BoringSSL does not branch at these releases and will not cherry-pick bugfixes to +them. Unless there is a technical constraint to use one of these revisions, +projects should simply use the latest untagged revision when updating. 
+ +While the BoringSSL repository may contain project-specific branches, e.g. +`chromium-2214`, those are _not_ supported release branches and must not be used as +such. In rare cases, BoringSSL will temporarily maintain a short-lived branch on +behalf of a project. Most such branches are no longer updated, because the +corresponding project no longer needs them, and we do not create new ones to +replace the ones that are no longer updated. E.g., not every Chromium release +branch has a corresponding BoringSSL `chromium-*` branch. Even while active, the +branch may not contain all changes relevant to a general BoringSSL consumer. + +## Bazel + +If you are using [Bazel](https://bazel.build) then you can use the [boringssl +module](https://registry.bazel.build/modules/boringssl) in the Bazel Central +Registry with bzlmod. Look up the latest version and add the following to your +`MODULE.bazel` file: + + bazel_dep(name = "boringssl", version = "INSERT_VERSION_HERE") + +Substitute the latest version in for `INSERT_VERSION_HERE`. + +BoringSSL will periodically ship snapshots to Bazel Central Registry. As with +other dependencies, periodically keep the referenced version up-to-date. + +## Directory layout + +Typically projects create a `third_party/boringssl` directory to put +BoringSSL-specific files into. The source code of BoringSSL itself goes into +`third_party/boringssl/src`, either by copying or as a +[submodule](https://git-scm.com/docs/git-submodule). + +It's generally a mistake to put BoringSSL's source code into +`third_party/boringssl` directly because custom build files need to go somewhere +and merging these with the BoringSSL source code makes updating things more +complex. + +## Build support + +BoringSSL is designed to work with many different build systems. The project +currently has [CMake](https://cmake.org/) and [Bazel](https://bazel.build/) +builds checked in.
Other build systems, and embedders with custom build needs, +are supported by separating the source list, maintained by BoringSSL, and the +top-level build logic, maintained by the embedder. + +Source lists for various build systems are pre-generated and live in the `gen` +directory. For example, source lists for +[GN](https://gn.googlesource.com/gn/+/main/docs/quick_start.md) live in +[gen/sources.gni](./gen/sources.gni). There is also a generic +[gen/sources.json](./gen/sources.json) file for projects to consume if needed. +[util/build/build.go](./util/build/build.go) describes what the various source +lists mean. Most projects should concatenate the `bcm` and `crypto` targets. + +If you don't use any of the supported build systems, you should augment the +[util/pregenerate](./util/pregenerate) tool to support it, or +consume [gen/sources.json](./gen/sources.json). + +Historically, source lists were generated at update time with the +[`util/generate_build_files.py`](./util/generate_build_files.py) script. We are +in the process of transitioning builds to the pre-generated files, so that +embedders do not need to run a custom script when updating BoringSSL. + +## Defines + +BoringSSL does not present a lot of configurability in order to reduce the +number of configurations that need to be tested. But there are a couple of +\#defines that you may wish to set: + +`OPENSSL_NO_ASM` prevents the use of assembly code (although it's up to you to +ensure that the build system doesn't link it in if you wish to reduce binary +size). This will have a significant performance impact but can be useful if you +wish to use tools like +[AddressSanitizer](http://clang.llvm.org/docs/AddressSanitizer.html) that +interact poorly with assembly code. + +`OPENSSL_SMALL` removes some code that is especially large at some performance +cost. + +## Symbols + +You cannot link multiple versions of BoringSSL or OpenSSL into a single binary +without dealing with symbol conflicts. 
If you are statically linking multiple +versions together, there's not a lot that can be done because C doesn't have a +module system. + +If you are using multiple versions in a single binary, in different shared +objects, ensure you build BoringSSL with `-fvisibility=hidden` and do not +export any of BoringSSL's symbols. This will prevent any collisions with other +versions that may be included in other shared objects. Note that this requires +that all callers of BoringSSL APIs live in the same shared object as BoringSSL. + +If you require that BoringSSL APIs be used across shared object boundaries, +continue to build with `-fvisibility=hidden` but define +`BORINGSSL_SHARED_LIBRARY` in both BoringSSL and consumers. BoringSSL's own +source files (but *not* consumers' source files) must also build with +`BORINGSSL_IMPLEMENTATION` defined. This will export BoringSSL's public symbols +in the resulting shared object while hiding private symbols. However note that, +as with a static link, this precludes dynamically linking with another version +of BoringSSL or OpenSSL. diff --git a/third_party/boringssl/LICENSE b/third_party/boringssl/LICENSE index 49c41fa7..37a5b743 100644 --- a/third_party/boringssl/LICENSE +++ b/third_party/boringssl/LICENSE @@ -1,184 +1,205 @@ -BoringSSL is a fork of OpenSSL. As such, large parts of it fall under OpenSSL -licensing. Files that are completely new have a Google copyright and an ISC -license. This license is reproduced at the bottom of this file. - -Contributors to BoringSSL are required to follow the CLA rules for Chromium: -https://cla.developers.google.com/clas - -Files in third_party/ have their own licenses, as described therein. The MIT -license, for third_party/fiat, which, unlike other third_party directories, is -compiled into non-test libraries, is included below. - -The OpenSSL toolkit stays under a dual license, i.e. both the conditions of the -OpenSSL License and the original SSLeay license apply to the toolkit. 
See below -for the actual license texts. Actually both licenses are BSD-style Open Source -licenses. In case of any license issues related to OpenSSL please contact -openssl-core@openssl.org. - -The following are Google-internal bug numbers where explicit permission from -some authors is recorded for use of their work. (This is purely for our own -record keeping.) - 27287199 - 27287880 - 27287883 - - OpenSSL License - --------------- - -/* ==================================================================== - * Copyright (c) 1998-2011 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. 
Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). - * - */ - - Original SSLeay License - ----------------------- - -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. 
The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] - */ - - -ISC license used for completely new code in BoringSSL: - -/* Copyright (c) 2015, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - - -The code in third_party/fiat carries the MIT license: - -Copyright (c) 2015-2016 the fiat-crypto authors (see -https://github.com/mit-plv/fiat-crypto/blob/master/AUTHORS). 
- -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. Licenses for support code @@ -215,37 +236,3 @@ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -BoringSSL uses the Chromium test infrastructure to run a continuous build, -trybots etc. The scripts which manage this, and the script for generating build -metadata, are under the Chromium license. Distributing code linked against -BoringSSL does not trigger this license. - -Copyright 2015 The Chromium Authors. All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. 
- -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/third_party/boringssl/apple-aarch64/crypto/chacha/chacha-armv8.S b/third_party/boringssl/apple-aarch64/crypto/chacha/chacha-armv8.S deleted file mode 100644 index dd992a2e..00000000 --- a/third_party/boringssl/apple-aarch64/crypto/chacha/chacha-armv8.S +++ /dev/null @@ -1,1992 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -#include - - -.private_extern _OPENSSL_armcap_P - -.section __TEXT,__const - -.align 5 -Lsigma: -.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral -Lone: -.long 1,0,0,0 -.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 - -.text - -.globl _ChaCha20_ctr32 -.private_extern _ChaCha20_ctr32 - -.align 5 -_ChaCha20_ctr32: - AARCH64_VALID_CALL_TARGET - cbz x2,Labort -#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10 - adrp x5,:pg_hi21_nc:_OPENSSL_armcap_P -#else - adrp x5,_OPENSSL_armcap_P@PAGE -#endif - cmp x2,#192 - b.lo Lshort - ldr w17,[x5,_OPENSSL_armcap_P@PAGEOFF] - tst w17,#ARMV7_NEON - b.ne ChaCha20_neon - -Lshort: - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-96]! 
- add x29,sp,#0 - - adrp x5,Lsigma@PAGE - add x5,x5,Lsigma@PAGEOFF - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - sub sp,sp,#64 - - ldp x22,x23,[x5] // load sigma - ldp x24,x25,[x3] // load key - ldp x26,x27,[x3,#16] - ldp x28,x30,[x4] // load counter -#ifdef __AARCH64EB__ - ror x24,x24,#32 - ror x25,x25,#32 - ror x26,x26,#32 - ror x27,x27,#32 - ror x28,x28,#32 - ror x30,x30,#32 -#endif - -Loop_outer: - mov w5,w22 // unpack key block - lsr x6,x22,#32 - mov w7,w23 - lsr x8,x23,#32 - mov w9,w24 - lsr x10,x24,#32 - mov w11,w25 - lsr x12,x25,#32 - mov w13,w26 - lsr x14,x26,#32 - mov w15,w27 - lsr x16,x27,#32 - mov w17,w28 - lsr x19,x28,#32 - mov w20,w30 - lsr x21,x30,#32 - - mov x4,#10 - subs x2,x2,#64 -Loop: - sub x4,x4,#1 - add w5,w5,w9 - add w6,w6,w10 - add w7,w7,w11 - add w8,w8,w12 - eor w17,w17,w5 - eor w19,w19,w6 - eor w20,w20,w7 - eor w21,w21,w8 - ror w17,w17,#16 - ror w19,w19,#16 - ror w20,w20,#16 - ror w21,w21,#16 - add w13,w13,w17 - add w14,w14,w19 - add w15,w15,w20 - add w16,w16,w21 - eor w9,w9,w13 - eor w10,w10,w14 - eor w11,w11,w15 - eor w12,w12,w16 - ror w9,w9,#20 - ror w10,w10,#20 - ror w11,w11,#20 - ror w12,w12,#20 - add w5,w5,w9 - add w6,w6,w10 - add w7,w7,w11 - add w8,w8,w12 - eor w17,w17,w5 - eor w19,w19,w6 - eor w20,w20,w7 - eor w21,w21,w8 - ror w17,w17,#24 - ror w19,w19,#24 - ror w20,w20,#24 - ror w21,w21,#24 - add w13,w13,w17 - add w14,w14,w19 - add w15,w15,w20 - add w16,w16,w21 - eor w9,w9,w13 - eor w10,w10,w14 - eor w11,w11,w15 - eor w12,w12,w16 - ror w9,w9,#25 - ror w10,w10,#25 - ror w11,w11,#25 - ror w12,w12,#25 - add w5,w5,w10 - add w6,w6,w11 - add w7,w7,w12 - add w8,w8,w9 - eor w21,w21,w5 - eor w17,w17,w6 - eor w19,w19,w7 - eor w20,w20,w8 - ror w21,w21,#16 - ror w17,w17,#16 - ror w19,w19,#16 - ror w20,w20,#16 - add w15,w15,w21 - add w16,w16,w17 - add w13,w13,w19 - add w14,w14,w20 - eor w10,w10,w15 - eor w11,w11,w16 - eor w12,w12,w13 - eor w9,w9,w14 - ror w10,w10,#20 - ror 
w11,w11,#20 - ror w12,w12,#20 - ror w9,w9,#20 - add w5,w5,w10 - add w6,w6,w11 - add w7,w7,w12 - add w8,w8,w9 - eor w21,w21,w5 - eor w17,w17,w6 - eor w19,w19,w7 - eor w20,w20,w8 - ror w21,w21,#24 - ror w17,w17,#24 - ror w19,w19,#24 - ror w20,w20,#24 - add w15,w15,w21 - add w16,w16,w17 - add w13,w13,w19 - add w14,w14,w20 - eor w10,w10,w15 - eor w11,w11,w16 - eor w12,w12,w13 - eor w9,w9,w14 - ror w10,w10,#25 - ror w11,w11,#25 - ror w12,w12,#25 - ror w9,w9,#25 - cbnz x4,Loop - - add w5,w5,w22 // accumulate key block - add x6,x6,x22,lsr#32 - add w7,w7,w23 - add x8,x8,x23,lsr#32 - add w9,w9,w24 - add x10,x10,x24,lsr#32 - add w11,w11,w25 - add x12,x12,x25,lsr#32 - add w13,w13,w26 - add x14,x14,x26,lsr#32 - add w15,w15,w27 - add x16,x16,x27,lsr#32 - add w17,w17,w28 - add x19,x19,x28,lsr#32 - add w20,w20,w30 - add x21,x21,x30,lsr#32 - - b.lo Ltail - - add x5,x5,x6,lsl#32 // pack - add x7,x7,x8,lsl#32 - ldp x6,x8,[x1,#0] // load input - add x9,x9,x10,lsl#32 - add x11,x11,x12,lsl#32 - ldp x10,x12,[x1,#16] - add x13,x13,x14,lsl#32 - add x15,x15,x16,lsl#32 - ldp x14,x16,[x1,#32] - add x17,x17,x19,lsl#32 - add x20,x20,x21,lsl#32 - ldp x19,x21,[x1,#48] - add x1,x1,#64 -#ifdef __AARCH64EB__ - rev x5,x5 - rev x7,x7 - rev x9,x9 - rev x11,x11 - rev x13,x13 - rev x15,x15 - rev x17,x17 - rev x20,x20 -#endif - eor x5,x5,x6 - eor x7,x7,x8 - eor x9,x9,x10 - eor x11,x11,x12 - eor x13,x13,x14 - eor x15,x15,x16 - eor x17,x17,x19 - eor x20,x20,x21 - - stp x5,x7,[x0,#0] // store output - add x28,x28,#1 // increment counter - stp x9,x11,[x0,#16] - stp x13,x15,[x0,#32] - stp x17,x20,[x0,#48] - add x0,x0,#64 - - b.hi Loop_outer - - ldp x19,x20,[x29,#16] - add sp,sp,#64 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 - AARCH64_VALIDATE_LINK_REGISTER -Labort: - ret - -.align 4 -Ltail: - add x2,x2,#64 -Less_than_64: - sub x0,x0,#1 - add x1,x1,x2 - add x0,x0,x2 - add x4,sp,x2 - neg x2,x2 - - add x5,x5,x6,lsl#32 // pack - add 
x7,x7,x8,lsl#32 - add x9,x9,x10,lsl#32 - add x11,x11,x12,lsl#32 - add x13,x13,x14,lsl#32 - add x15,x15,x16,lsl#32 - add x17,x17,x19,lsl#32 - add x20,x20,x21,lsl#32 -#ifdef __AARCH64EB__ - rev x5,x5 - rev x7,x7 - rev x9,x9 - rev x11,x11 - rev x13,x13 - rev x15,x15 - rev x17,x17 - rev x20,x20 -#endif - stp x5,x7,[sp,#0] - stp x9,x11,[sp,#16] - stp x13,x15,[sp,#32] - stp x17,x20,[sp,#48] - -Loop_tail: - ldrb w10,[x1,x2] - ldrb w11,[x4,x2] - add x2,x2,#1 - eor w10,w10,w11 - strb w10,[x0,x2] - cbnz x2,Loop_tail - - stp xzr,xzr,[sp,#0] - stp xzr,xzr,[sp,#16] - stp xzr,xzr,[sp,#32] - stp xzr,xzr,[sp,#48] - - ldp x19,x20,[x29,#16] - add sp,sp,#64 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 - AARCH64_VALIDATE_LINK_REGISTER - ret - - - -.align 5 -ChaCha20_neon: - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-96]! - add x29,sp,#0 - - adrp x5,Lsigma@PAGE - add x5,x5,Lsigma@PAGEOFF - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - cmp x2,#512 - b.hs L512_or_more_neon - - sub sp,sp,#64 - - ldp x22,x23,[x5] // load sigma - ld1 {v24.4s},[x5],#16 - ldp x24,x25,[x3] // load key - ldp x26,x27,[x3,#16] - ld1 {v25.4s,v26.4s},[x3] - ldp x28,x30,[x4] // load counter - ld1 {v27.4s},[x4] - ld1 {v31.4s},[x5] -#ifdef __AARCH64EB__ - rev64 v24.4s,v24.4s - ror x24,x24,#32 - ror x25,x25,#32 - ror x26,x26,#32 - ror x27,x27,#32 - ror x28,x28,#32 - ror x30,x30,#32 -#endif - add v27.4s,v27.4s,v31.4s // += 1 - add v28.4s,v27.4s,v31.4s - add v29.4s,v28.4s,v31.4s - shl v31.4s,v31.4s,#2 // 1 -> 4 - -Loop_outer_neon: - mov w5,w22 // unpack key block - lsr x6,x22,#32 - mov v0.16b,v24.16b - mov w7,w23 - lsr x8,x23,#32 - mov v4.16b,v24.16b - mov w9,w24 - lsr x10,x24,#32 - mov v16.16b,v24.16b - mov w11,w25 - mov v1.16b,v25.16b - lsr x12,x25,#32 - mov v5.16b,v25.16b - mov w13,w26 - mov v17.16b,v25.16b - lsr x14,x26,#32 - mov v3.16b,v27.16b - mov w15,w27 - mov 
v7.16b,v28.16b - lsr x16,x27,#32 - mov v19.16b,v29.16b - mov w17,w28 - mov v2.16b,v26.16b - lsr x19,x28,#32 - mov v6.16b,v26.16b - mov w20,w30 - mov v18.16b,v26.16b - lsr x21,x30,#32 - - mov x4,#10 - subs x2,x2,#256 -Loop_neon: - sub x4,x4,#1 - add v0.4s,v0.4s,v1.4s - add w5,w5,w9 - add v4.4s,v4.4s,v5.4s - add w6,w6,w10 - add v16.4s,v16.4s,v17.4s - add w7,w7,w11 - eor v3.16b,v3.16b,v0.16b - add w8,w8,w12 - eor v7.16b,v7.16b,v4.16b - eor w17,w17,w5 - eor v19.16b,v19.16b,v16.16b - eor w19,w19,w6 - rev32 v3.8h,v3.8h - eor w20,w20,w7 - rev32 v7.8h,v7.8h - eor w21,w21,w8 - rev32 v19.8h,v19.8h - ror w17,w17,#16 - add v2.4s,v2.4s,v3.4s - ror w19,w19,#16 - add v6.4s,v6.4s,v7.4s - ror w20,w20,#16 - add v18.4s,v18.4s,v19.4s - ror w21,w21,#16 - eor v20.16b,v1.16b,v2.16b - add w13,w13,w17 - eor v21.16b,v5.16b,v6.16b - add w14,w14,w19 - eor v22.16b,v17.16b,v18.16b - add w15,w15,w20 - ushr v1.4s,v20.4s,#20 - add w16,w16,w21 - ushr v5.4s,v21.4s,#20 - eor w9,w9,w13 - ushr v17.4s,v22.4s,#20 - eor w10,w10,w14 - sli v1.4s,v20.4s,#12 - eor w11,w11,w15 - sli v5.4s,v21.4s,#12 - eor w12,w12,w16 - sli v17.4s,v22.4s,#12 - ror w9,w9,#20 - add v0.4s,v0.4s,v1.4s - ror w10,w10,#20 - add v4.4s,v4.4s,v5.4s - ror w11,w11,#20 - add v16.4s,v16.4s,v17.4s - ror w12,w12,#20 - eor v20.16b,v3.16b,v0.16b - add w5,w5,w9 - eor v21.16b,v7.16b,v4.16b - add w6,w6,w10 - eor v22.16b,v19.16b,v16.16b - add w7,w7,w11 - ushr v3.4s,v20.4s,#24 - add w8,w8,w12 - ushr v7.4s,v21.4s,#24 - eor w17,w17,w5 - ushr v19.4s,v22.4s,#24 - eor w19,w19,w6 - sli v3.4s,v20.4s,#8 - eor w20,w20,w7 - sli v7.4s,v21.4s,#8 - eor w21,w21,w8 - sli v19.4s,v22.4s,#8 - ror w17,w17,#24 - add v2.4s,v2.4s,v3.4s - ror w19,w19,#24 - add v6.4s,v6.4s,v7.4s - ror w20,w20,#24 - add v18.4s,v18.4s,v19.4s - ror w21,w21,#24 - eor v20.16b,v1.16b,v2.16b - add w13,w13,w17 - eor v21.16b,v5.16b,v6.16b - add w14,w14,w19 - eor v22.16b,v17.16b,v18.16b - add w15,w15,w20 - ushr v1.4s,v20.4s,#25 - add w16,w16,w21 - ushr v5.4s,v21.4s,#25 - eor w9,w9,w13 - ushr 
v17.4s,v22.4s,#25 - eor w10,w10,w14 - sli v1.4s,v20.4s,#7 - eor w11,w11,w15 - sli v5.4s,v21.4s,#7 - eor w12,w12,w16 - sli v17.4s,v22.4s,#7 - ror w9,w9,#25 - ext v2.16b,v2.16b,v2.16b,#8 - ror w10,w10,#25 - ext v6.16b,v6.16b,v6.16b,#8 - ror w11,w11,#25 - ext v18.16b,v18.16b,v18.16b,#8 - ror w12,w12,#25 - ext v3.16b,v3.16b,v3.16b,#12 - ext v7.16b,v7.16b,v7.16b,#12 - ext v19.16b,v19.16b,v19.16b,#12 - ext v1.16b,v1.16b,v1.16b,#4 - ext v5.16b,v5.16b,v5.16b,#4 - ext v17.16b,v17.16b,v17.16b,#4 - add v0.4s,v0.4s,v1.4s - add w5,w5,w10 - add v4.4s,v4.4s,v5.4s - add w6,w6,w11 - add v16.4s,v16.4s,v17.4s - add w7,w7,w12 - eor v3.16b,v3.16b,v0.16b - add w8,w8,w9 - eor v7.16b,v7.16b,v4.16b - eor w21,w21,w5 - eor v19.16b,v19.16b,v16.16b - eor w17,w17,w6 - rev32 v3.8h,v3.8h - eor w19,w19,w7 - rev32 v7.8h,v7.8h - eor w20,w20,w8 - rev32 v19.8h,v19.8h - ror w21,w21,#16 - add v2.4s,v2.4s,v3.4s - ror w17,w17,#16 - add v6.4s,v6.4s,v7.4s - ror w19,w19,#16 - add v18.4s,v18.4s,v19.4s - ror w20,w20,#16 - eor v20.16b,v1.16b,v2.16b - add w15,w15,w21 - eor v21.16b,v5.16b,v6.16b - add w16,w16,w17 - eor v22.16b,v17.16b,v18.16b - add w13,w13,w19 - ushr v1.4s,v20.4s,#20 - add w14,w14,w20 - ushr v5.4s,v21.4s,#20 - eor w10,w10,w15 - ushr v17.4s,v22.4s,#20 - eor w11,w11,w16 - sli v1.4s,v20.4s,#12 - eor w12,w12,w13 - sli v5.4s,v21.4s,#12 - eor w9,w9,w14 - sli v17.4s,v22.4s,#12 - ror w10,w10,#20 - add v0.4s,v0.4s,v1.4s - ror w11,w11,#20 - add v4.4s,v4.4s,v5.4s - ror w12,w12,#20 - add v16.4s,v16.4s,v17.4s - ror w9,w9,#20 - eor v20.16b,v3.16b,v0.16b - add w5,w5,w10 - eor v21.16b,v7.16b,v4.16b - add w6,w6,w11 - eor v22.16b,v19.16b,v16.16b - add w7,w7,w12 - ushr v3.4s,v20.4s,#24 - add w8,w8,w9 - ushr v7.4s,v21.4s,#24 - eor w21,w21,w5 - ushr v19.4s,v22.4s,#24 - eor w17,w17,w6 - sli v3.4s,v20.4s,#8 - eor w19,w19,w7 - sli v7.4s,v21.4s,#8 - eor w20,w20,w8 - sli v19.4s,v22.4s,#8 - ror w21,w21,#24 - add v2.4s,v2.4s,v3.4s - ror w17,w17,#24 - add v6.4s,v6.4s,v7.4s - ror w19,w19,#24 - add v18.4s,v18.4s,v19.4s - ror 
w20,w20,#24 - eor v20.16b,v1.16b,v2.16b - add w15,w15,w21 - eor v21.16b,v5.16b,v6.16b - add w16,w16,w17 - eor v22.16b,v17.16b,v18.16b - add w13,w13,w19 - ushr v1.4s,v20.4s,#25 - add w14,w14,w20 - ushr v5.4s,v21.4s,#25 - eor w10,w10,w15 - ushr v17.4s,v22.4s,#25 - eor w11,w11,w16 - sli v1.4s,v20.4s,#7 - eor w12,w12,w13 - sli v5.4s,v21.4s,#7 - eor w9,w9,w14 - sli v17.4s,v22.4s,#7 - ror w10,w10,#25 - ext v2.16b,v2.16b,v2.16b,#8 - ror w11,w11,#25 - ext v6.16b,v6.16b,v6.16b,#8 - ror w12,w12,#25 - ext v18.16b,v18.16b,v18.16b,#8 - ror w9,w9,#25 - ext v3.16b,v3.16b,v3.16b,#4 - ext v7.16b,v7.16b,v7.16b,#4 - ext v19.16b,v19.16b,v19.16b,#4 - ext v1.16b,v1.16b,v1.16b,#12 - ext v5.16b,v5.16b,v5.16b,#12 - ext v17.16b,v17.16b,v17.16b,#12 - cbnz x4,Loop_neon - - add w5,w5,w22 // accumulate key block - add v0.4s,v0.4s,v24.4s - add x6,x6,x22,lsr#32 - add v4.4s,v4.4s,v24.4s - add w7,w7,w23 - add v16.4s,v16.4s,v24.4s - add x8,x8,x23,lsr#32 - add v2.4s,v2.4s,v26.4s - add w9,w9,w24 - add v6.4s,v6.4s,v26.4s - add x10,x10,x24,lsr#32 - add v18.4s,v18.4s,v26.4s - add w11,w11,w25 - add v3.4s,v3.4s,v27.4s - add x12,x12,x25,lsr#32 - add w13,w13,w26 - add v7.4s,v7.4s,v28.4s - add x14,x14,x26,lsr#32 - add w15,w15,w27 - add v19.4s,v19.4s,v29.4s - add x16,x16,x27,lsr#32 - add w17,w17,w28 - add v1.4s,v1.4s,v25.4s - add x19,x19,x28,lsr#32 - add w20,w20,w30 - add v5.4s,v5.4s,v25.4s - add x21,x21,x30,lsr#32 - add v17.4s,v17.4s,v25.4s - - b.lo Ltail_neon - - add x5,x5,x6,lsl#32 // pack - add x7,x7,x8,lsl#32 - ldp x6,x8,[x1,#0] // load input - add x9,x9,x10,lsl#32 - add x11,x11,x12,lsl#32 - ldp x10,x12,[x1,#16] - add x13,x13,x14,lsl#32 - add x15,x15,x16,lsl#32 - ldp x14,x16,[x1,#32] - add x17,x17,x19,lsl#32 - add x20,x20,x21,lsl#32 - ldp x19,x21,[x1,#48] - add x1,x1,#64 -#ifdef __AARCH64EB__ - rev x5,x5 - rev x7,x7 - rev x9,x9 - rev x11,x11 - rev x13,x13 - rev x15,x15 - rev x17,x17 - rev x20,x20 -#endif - ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 - eor x5,x5,x6 - eor x7,x7,x8 - eor x9,x9,x10 - eor 
x11,x11,x12 - eor x13,x13,x14 - eor v0.16b,v0.16b,v20.16b - eor x15,x15,x16 - eor v1.16b,v1.16b,v21.16b - eor x17,x17,x19 - eor v2.16b,v2.16b,v22.16b - eor x20,x20,x21 - eor v3.16b,v3.16b,v23.16b - ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 - - stp x5,x7,[x0,#0] // store output - add x28,x28,#4 // increment counter - stp x9,x11,[x0,#16] - add v27.4s,v27.4s,v31.4s // += 4 - stp x13,x15,[x0,#32] - add v28.4s,v28.4s,v31.4s - stp x17,x20,[x0,#48] - add v29.4s,v29.4s,v31.4s - add x0,x0,#64 - - st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 - ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 - - eor v4.16b,v4.16b,v20.16b - eor v5.16b,v5.16b,v21.16b - eor v6.16b,v6.16b,v22.16b - eor v7.16b,v7.16b,v23.16b - st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 - - eor v16.16b,v16.16b,v0.16b - eor v17.16b,v17.16b,v1.16b - eor v18.16b,v18.16b,v2.16b - eor v19.16b,v19.16b,v3.16b - st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 - - b.hi Loop_outer_neon - - ldp x19,x20,[x29,#16] - add sp,sp,#64 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 - AARCH64_VALIDATE_LINK_REGISTER - ret - -Ltail_neon: - add x2,x2,#256 - cmp x2,#64 - b.lo Less_than_64 - - add x5,x5,x6,lsl#32 // pack - add x7,x7,x8,lsl#32 - ldp x6,x8,[x1,#0] // load input - add x9,x9,x10,lsl#32 - add x11,x11,x12,lsl#32 - ldp x10,x12,[x1,#16] - add x13,x13,x14,lsl#32 - add x15,x15,x16,lsl#32 - ldp x14,x16,[x1,#32] - add x17,x17,x19,lsl#32 - add x20,x20,x21,lsl#32 - ldp x19,x21,[x1,#48] - add x1,x1,#64 -#ifdef __AARCH64EB__ - rev x5,x5 - rev x7,x7 - rev x9,x9 - rev x11,x11 - rev x13,x13 - rev x15,x15 - rev x17,x17 - rev x20,x20 -#endif - eor x5,x5,x6 - eor x7,x7,x8 - eor x9,x9,x10 - eor x11,x11,x12 - eor x13,x13,x14 - eor x15,x15,x16 - eor x17,x17,x19 - eor x20,x20,x21 - - stp x5,x7,[x0,#0] // store output - add x28,x28,#4 // increment counter - stp x9,x11,[x0,#16] - stp x13,x15,[x0,#32] - stp x17,x20,[x0,#48] - add x0,x0,#64 - b.eq Ldone_neon - sub x2,x2,#64 - cmp 
x2,#64 - b.lo Less_than_128 - - ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 - eor v0.16b,v0.16b,v20.16b - eor v1.16b,v1.16b,v21.16b - eor v2.16b,v2.16b,v22.16b - eor v3.16b,v3.16b,v23.16b - st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 - b.eq Ldone_neon - sub x2,x2,#64 - cmp x2,#64 - b.lo Less_than_192 - - ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 - eor v4.16b,v4.16b,v20.16b - eor v5.16b,v5.16b,v21.16b - eor v6.16b,v6.16b,v22.16b - eor v7.16b,v7.16b,v23.16b - st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 - b.eq Ldone_neon - sub x2,x2,#64 - - st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] - b Last_neon - -Less_than_128: - st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp] - b Last_neon -Less_than_192: - st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp] - b Last_neon - -.align 4 -Last_neon: - sub x0,x0,#1 - add x1,x1,x2 - add x0,x0,x2 - add x4,sp,x2 - neg x2,x2 - -Loop_tail_neon: - ldrb w10,[x1,x2] - ldrb w11,[x4,x2] - add x2,x2,#1 - eor w10,w10,w11 - strb w10,[x0,x2] - cbnz x2,Loop_tail_neon - - stp xzr,xzr,[sp,#0] - stp xzr,xzr,[sp,#16] - stp xzr,xzr,[sp,#32] - stp xzr,xzr,[sp,#48] - -Ldone_neon: - ldp x19,x20,[x29,#16] - add sp,sp,#64 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 - AARCH64_VALIDATE_LINK_REGISTER - ret - - -.align 5 -ChaCha20_512_neon: - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-96]! 
- add x29,sp,#0 - - adrp x5,Lsigma@PAGE - add x5,x5,Lsigma@PAGEOFF - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - -L512_or_more_neon: - sub sp,sp,#128+64 - - ldp x22,x23,[x5] // load sigma - ld1 {v24.4s},[x5],#16 - ldp x24,x25,[x3] // load key - ldp x26,x27,[x3,#16] - ld1 {v25.4s,v26.4s},[x3] - ldp x28,x30,[x4] // load counter - ld1 {v27.4s},[x4] - ld1 {v31.4s},[x5] -#ifdef __AARCH64EB__ - rev64 v24.4s,v24.4s - ror x24,x24,#32 - ror x25,x25,#32 - ror x26,x26,#32 - ror x27,x27,#32 - ror x28,x28,#32 - ror x30,x30,#32 -#endif - add v27.4s,v27.4s,v31.4s // += 1 - stp q24,q25,[sp,#0] // off-load key block, invariant part - add v27.4s,v27.4s,v31.4s // not typo - str q26,[sp,#32] - add v28.4s,v27.4s,v31.4s - add v29.4s,v28.4s,v31.4s - add v30.4s,v29.4s,v31.4s - shl v31.4s,v31.4s,#2 // 1 -> 4 - - stp d8,d9,[sp,#128+0] // meet ABI requirements - stp d10,d11,[sp,#128+16] - stp d12,d13,[sp,#128+32] - stp d14,d15,[sp,#128+48] - - sub x2,x2,#512 // not typo - -Loop_outer_512_neon: - mov v0.16b,v24.16b - mov v4.16b,v24.16b - mov v8.16b,v24.16b - mov v12.16b,v24.16b - mov v16.16b,v24.16b - mov v20.16b,v24.16b - mov v1.16b,v25.16b - mov w5,w22 // unpack key block - mov v5.16b,v25.16b - lsr x6,x22,#32 - mov v9.16b,v25.16b - mov w7,w23 - mov v13.16b,v25.16b - lsr x8,x23,#32 - mov v17.16b,v25.16b - mov w9,w24 - mov v21.16b,v25.16b - lsr x10,x24,#32 - mov v3.16b,v27.16b - mov w11,w25 - mov v7.16b,v28.16b - lsr x12,x25,#32 - mov v11.16b,v29.16b - mov w13,w26 - mov v15.16b,v30.16b - lsr x14,x26,#32 - mov v2.16b,v26.16b - mov w15,w27 - mov v6.16b,v26.16b - lsr x16,x27,#32 - add v19.4s,v3.4s,v31.4s // +4 - mov w17,w28 - add v23.4s,v7.4s,v31.4s // +4 - lsr x19,x28,#32 - mov v10.16b,v26.16b - mov w20,w30 - mov v14.16b,v26.16b - lsr x21,x30,#32 - mov v18.16b,v26.16b - stp q27,q28,[sp,#48] // off-load key block, variable part - mov v22.16b,v26.16b - str q29,[sp,#80] - - mov x4,#5 - subs x2,x2,#512 -Loop_upper_neon: - sub 
x4,x4,#1 - add v0.4s,v0.4s,v1.4s - add w5,w5,w9 - add v4.4s,v4.4s,v5.4s - add w6,w6,w10 - add v8.4s,v8.4s,v9.4s - add w7,w7,w11 - add v12.4s,v12.4s,v13.4s - add w8,w8,w12 - add v16.4s,v16.4s,v17.4s - eor w17,w17,w5 - add v20.4s,v20.4s,v21.4s - eor w19,w19,w6 - eor v3.16b,v3.16b,v0.16b - eor w20,w20,w7 - eor v7.16b,v7.16b,v4.16b - eor w21,w21,w8 - eor v11.16b,v11.16b,v8.16b - ror w17,w17,#16 - eor v15.16b,v15.16b,v12.16b - ror w19,w19,#16 - eor v19.16b,v19.16b,v16.16b - ror w20,w20,#16 - eor v23.16b,v23.16b,v20.16b - ror w21,w21,#16 - rev32 v3.8h,v3.8h - add w13,w13,w17 - rev32 v7.8h,v7.8h - add w14,w14,w19 - rev32 v11.8h,v11.8h - add w15,w15,w20 - rev32 v15.8h,v15.8h - add w16,w16,w21 - rev32 v19.8h,v19.8h - eor w9,w9,w13 - rev32 v23.8h,v23.8h - eor w10,w10,w14 - add v2.4s,v2.4s,v3.4s - eor w11,w11,w15 - add v6.4s,v6.4s,v7.4s - eor w12,w12,w16 - add v10.4s,v10.4s,v11.4s - ror w9,w9,#20 - add v14.4s,v14.4s,v15.4s - ror w10,w10,#20 - add v18.4s,v18.4s,v19.4s - ror w11,w11,#20 - add v22.4s,v22.4s,v23.4s - ror w12,w12,#20 - eor v24.16b,v1.16b,v2.16b - add w5,w5,w9 - eor v25.16b,v5.16b,v6.16b - add w6,w6,w10 - eor v26.16b,v9.16b,v10.16b - add w7,w7,w11 - eor v27.16b,v13.16b,v14.16b - add w8,w8,w12 - eor v28.16b,v17.16b,v18.16b - eor w17,w17,w5 - eor v29.16b,v21.16b,v22.16b - eor w19,w19,w6 - ushr v1.4s,v24.4s,#20 - eor w20,w20,w7 - ushr v5.4s,v25.4s,#20 - eor w21,w21,w8 - ushr v9.4s,v26.4s,#20 - ror w17,w17,#24 - ushr v13.4s,v27.4s,#20 - ror w19,w19,#24 - ushr v17.4s,v28.4s,#20 - ror w20,w20,#24 - ushr v21.4s,v29.4s,#20 - ror w21,w21,#24 - sli v1.4s,v24.4s,#12 - add w13,w13,w17 - sli v5.4s,v25.4s,#12 - add w14,w14,w19 - sli v9.4s,v26.4s,#12 - add w15,w15,w20 - sli v13.4s,v27.4s,#12 - add w16,w16,w21 - sli v17.4s,v28.4s,#12 - eor w9,w9,w13 - sli v21.4s,v29.4s,#12 - eor w10,w10,w14 - add v0.4s,v0.4s,v1.4s - eor w11,w11,w15 - add v4.4s,v4.4s,v5.4s - eor w12,w12,w16 - add v8.4s,v8.4s,v9.4s - ror w9,w9,#25 - add v12.4s,v12.4s,v13.4s - ror w10,w10,#25 - add 
v16.4s,v16.4s,v17.4s - ror w11,w11,#25 - add v20.4s,v20.4s,v21.4s - ror w12,w12,#25 - eor v24.16b,v3.16b,v0.16b - add w5,w5,w10 - eor v25.16b,v7.16b,v4.16b - add w6,w6,w11 - eor v26.16b,v11.16b,v8.16b - add w7,w7,w12 - eor v27.16b,v15.16b,v12.16b - add w8,w8,w9 - eor v28.16b,v19.16b,v16.16b - eor w21,w21,w5 - eor v29.16b,v23.16b,v20.16b - eor w17,w17,w6 - ushr v3.4s,v24.4s,#24 - eor w19,w19,w7 - ushr v7.4s,v25.4s,#24 - eor w20,w20,w8 - ushr v11.4s,v26.4s,#24 - ror w21,w21,#16 - ushr v15.4s,v27.4s,#24 - ror w17,w17,#16 - ushr v19.4s,v28.4s,#24 - ror w19,w19,#16 - ushr v23.4s,v29.4s,#24 - ror w20,w20,#16 - sli v3.4s,v24.4s,#8 - add w15,w15,w21 - sli v7.4s,v25.4s,#8 - add w16,w16,w17 - sli v11.4s,v26.4s,#8 - add w13,w13,w19 - sli v15.4s,v27.4s,#8 - add w14,w14,w20 - sli v19.4s,v28.4s,#8 - eor w10,w10,w15 - sli v23.4s,v29.4s,#8 - eor w11,w11,w16 - add v2.4s,v2.4s,v3.4s - eor w12,w12,w13 - add v6.4s,v6.4s,v7.4s - eor w9,w9,w14 - add v10.4s,v10.4s,v11.4s - ror w10,w10,#20 - add v14.4s,v14.4s,v15.4s - ror w11,w11,#20 - add v18.4s,v18.4s,v19.4s - ror w12,w12,#20 - add v22.4s,v22.4s,v23.4s - ror w9,w9,#20 - eor v24.16b,v1.16b,v2.16b - add w5,w5,w10 - eor v25.16b,v5.16b,v6.16b - add w6,w6,w11 - eor v26.16b,v9.16b,v10.16b - add w7,w7,w12 - eor v27.16b,v13.16b,v14.16b - add w8,w8,w9 - eor v28.16b,v17.16b,v18.16b - eor w21,w21,w5 - eor v29.16b,v21.16b,v22.16b - eor w17,w17,w6 - ushr v1.4s,v24.4s,#25 - eor w19,w19,w7 - ushr v5.4s,v25.4s,#25 - eor w20,w20,w8 - ushr v9.4s,v26.4s,#25 - ror w21,w21,#24 - ushr v13.4s,v27.4s,#25 - ror w17,w17,#24 - ushr v17.4s,v28.4s,#25 - ror w19,w19,#24 - ushr v21.4s,v29.4s,#25 - ror w20,w20,#24 - sli v1.4s,v24.4s,#7 - add w15,w15,w21 - sli v5.4s,v25.4s,#7 - add w16,w16,w17 - sli v9.4s,v26.4s,#7 - add w13,w13,w19 - sli v13.4s,v27.4s,#7 - add w14,w14,w20 - sli v17.4s,v28.4s,#7 - eor w10,w10,w15 - sli v21.4s,v29.4s,#7 - eor w11,w11,w16 - ext v2.16b,v2.16b,v2.16b,#8 - eor w12,w12,w13 - ext v6.16b,v6.16b,v6.16b,#8 - eor w9,w9,w14 - ext 
v10.16b,v10.16b,v10.16b,#8 - ror w10,w10,#25 - ext v14.16b,v14.16b,v14.16b,#8 - ror w11,w11,#25 - ext v18.16b,v18.16b,v18.16b,#8 - ror w12,w12,#25 - ext v22.16b,v22.16b,v22.16b,#8 - ror w9,w9,#25 - ext v3.16b,v3.16b,v3.16b,#12 - ext v7.16b,v7.16b,v7.16b,#12 - ext v11.16b,v11.16b,v11.16b,#12 - ext v15.16b,v15.16b,v15.16b,#12 - ext v19.16b,v19.16b,v19.16b,#12 - ext v23.16b,v23.16b,v23.16b,#12 - ext v1.16b,v1.16b,v1.16b,#4 - ext v5.16b,v5.16b,v5.16b,#4 - ext v9.16b,v9.16b,v9.16b,#4 - ext v13.16b,v13.16b,v13.16b,#4 - ext v17.16b,v17.16b,v17.16b,#4 - ext v21.16b,v21.16b,v21.16b,#4 - add v0.4s,v0.4s,v1.4s - add w5,w5,w9 - add v4.4s,v4.4s,v5.4s - add w6,w6,w10 - add v8.4s,v8.4s,v9.4s - add w7,w7,w11 - add v12.4s,v12.4s,v13.4s - add w8,w8,w12 - add v16.4s,v16.4s,v17.4s - eor w17,w17,w5 - add v20.4s,v20.4s,v21.4s - eor w19,w19,w6 - eor v3.16b,v3.16b,v0.16b - eor w20,w20,w7 - eor v7.16b,v7.16b,v4.16b - eor w21,w21,w8 - eor v11.16b,v11.16b,v8.16b - ror w17,w17,#16 - eor v15.16b,v15.16b,v12.16b - ror w19,w19,#16 - eor v19.16b,v19.16b,v16.16b - ror w20,w20,#16 - eor v23.16b,v23.16b,v20.16b - ror w21,w21,#16 - rev32 v3.8h,v3.8h - add w13,w13,w17 - rev32 v7.8h,v7.8h - add w14,w14,w19 - rev32 v11.8h,v11.8h - add w15,w15,w20 - rev32 v15.8h,v15.8h - add w16,w16,w21 - rev32 v19.8h,v19.8h - eor w9,w9,w13 - rev32 v23.8h,v23.8h - eor w10,w10,w14 - add v2.4s,v2.4s,v3.4s - eor w11,w11,w15 - add v6.4s,v6.4s,v7.4s - eor w12,w12,w16 - add v10.4s,v10.4s,v11.4s - ror w9,w9,#20 - add v14.4s,v14.4s,v15.4s - ror w10,w10,#20 - add v18.4s,v18.4s,v19.4s - ror w11,w11,#20 - add v22.4s,v22.4s,v23.4s - ror w12,w12,#20 - eor v24.16b,v1.16b,v2.16b - add w5,w5,w9 - eor v25.16b,v5.16b,v6.16b - add w6,w6,w10 - eor v26.16b,v9.16b,v10.16b - add w7,w7,w11 - eor v27.16b,v13.16b,v14.16b - add w8,w8,w12 - eor v28.16b,v17.16b,v18.16b - eor w17,w17,w5 - eor v29.16b,v21.16b,v22.16b - eor w19,w19,w6 - ushr v1.4s,v24.4s,#20 - eor w20,w20,w7 - ushr v5.4s,v25.4s,#20 - eor w21,w21,w8 - ushr v9.4s,v26.4s,#20 - ror 
w17,w17,#24 - ushr v13.4s,v27.4s,#20 - ror w19,w19,#24 - ushr v17.4s,v28.4s,#20 - ror w20,w20,#24 - ushr v21.4s,v29.4s,#20 - ror w21,w21,#24 - sli v1.4s,v24.4s,#12 - add w13,w13,w17 - sli v5.4s,v25.4s,#12 - add w14,w14,w19 - sli v9.4s,v26.4s,#12 - add w15,w15,w20 - sli v13.4s,v27.4s,#12 - add w16,w16,w21 - sli v17.4s,v28.4s,#12 - eor w9,w9,w13 - sli v21.4s,v29.4s,#12 - eor w10,w10,w14 - add v0.4s,v0.4s,v1.4s - eor w11,w11,w15 - add v4.4s,v4.4s,v5.4s - eor w12,w12,w16 - add v8.4s,v8.4s,v9.4s - ror w9,w9,#25 - add v12.4s,v12.4s,v13.4s - ror w10,w10,#25 - add v16.4s,v16.4s,v17.4s - ror w11,w11,#25 - add v20.4s,v20.4s,v21.4s - ror w12,w12,#25 - eor v24.16b,v3.16b,v0.16b - add w5,w5,w10 - eor v25.16b,v7.16b,v4.16b - add w6,w6,w11 - eor v26.16b,v11.16b,v8.16b - add w7,w7,w12 - eor v27.16b,v15.16b,v12.16b - add w8,w8,w9 - eor v28.16b,v19.16b,v16.16b - eor w21,w21,w5 - eor v29.16b,v23.16b,v20.16b - eor w17,w17,w6 - ushr v3.4s,v24.4s,#24 - eor w19,w19,w7 - ushr v7.4s,v25.4s,#24 - eor w20,w20,w8 - ushr v11.4s,v26.4s,#24 - ror w21,w21,#16 - ushr v15.4s,v27.4s,#24 - ror w17,w17,#16 - ushr v19.4s,v28.4s,#24 - ror w19,w19,#16 - ushr v23.4s,v29.4s,#24 - ror w20,w20,#16 - sli v3.4s,v24.4s,#8 - add w15,w15,w21 - sli v7.4s,v25.4s,#8 - add w16,w16,w17 - sli v11.4s,v26.4s,#8 - add w13,w13,w19 - sli v15.4s,v27.4s,#8 - add w14,w14,w20 - sli v19.4s,v28.4s,#8 - eor w10,w10,w15 - sli v23.4s,v29.4s,#8 - eor w11,w11,w16 - add v2.4s,v2.4s,v3.4s - eor w12,w12,w13 - add v6.4s,v6.4s,v7.4s - eor w9,w9,w14 - add v10.4s,v10.4s,v11.4s - ror w10,w10,#20 - add v14.4s,v14.4s,v15.4s - ror w11,w11,#20 - add v18.4s,v18.4s,v19.4s - ror w12,w12,#20 - add v22.4s,v22.4s,v23.4s - ror w9,w9,#20 - eor v24.16b,v1.16b,v2.16b - add w5,w5,w10 - eor v25.16b,v5.16b,v6.16b - add w6,w6,w11 - eor v26.16b,v9.16b,v10.16b - add w7,w7,w12 - eor v27.16b,v13.16b,v14.16b - add w8,w8,w9 - eor v28.16b,v17.16b,v18.16b - eor w21,w21,w5 - eor v29.16b,v21.16b,v22.16b - eor w17,w17,w6 - ushr v1.4s,v24.4s,#25 - eor w19,w19,w7 - ushr 
v5.4s,v25.4s,#25 - eor w20,w20,w8 - ushr v9.4s,v26.4s,#25 - ror w21,w21,#24 - ushr v13.4s,v27.4s,#25 - ror w17,w17,#24 - ushr v17.4s,v28.4s,#25 - ror w19,w19,#24 - ushr v21.4s,v29.4s,#25 - ror w20,w20,#24 - sli v1.4s,v24.4s,#7 - add w15,w15,w21 - sli v5.4s,v25.4s,#7 - add w16,w16,w17 - sli v9.4s,v26.4s,#7 - add w13,w13,w19 - sli v13.4s,v27.4s,#7 - add w14,w14,w20 - sli v17.4s,v28.4s,#7 - eor w10,w10,w15 - sli v21.4s,v29.4s,#7 - eor w11,w11,w16 - ext v2.16b,v2.16b,v2.16b,#8 - eor w12,w12,w13 - ext v6.16b,v6.16b,v6.16b,#8 - eor w9,w9,w14 - ext v10.16b,v10.16b,v10.16b,#8 - ror w10,w10,#25 - ext v14.16b,v14.16b,v14.16b,#8 - ror w11,w11,#25 - ext v18.16b,v18.16b,v18.16b,#8 - ror w12,w12,#25 - ext v22.16b,v22.16b,v22.16b,#8 - ror w9,w9,#25 - ext v3.16b,v3.16b,v3.16b,#4 - ext v7.16b,v7.16b,v7.16b,#4 - ext v11.16b,v11.16b,v11.16b,#4 - ext v15.16b,v15.16b,v15.16b,#4 - ext v19.16b,v19.16b,v19.16b,#4 - ext v23.16b,v23.16b,v23.16b,#4 - ext v1.16b,v1.16b,v1.16b,#12 - ext v5.16b,v5.16b,v5.16b,#12 - ext v9.16b,v9.16b,v9.16b,#12 - ext v13.16b,v13.16b,v13.16b,#12 - ext v17.16b,v17.16b,v17.16b,#12 - ext v21.16b,v21.16b,v21.16b,#12 - cbnz x4,Loop_upper_neon - - add w5,w5,w22 // accumulate key block - add x6,x6,x22,lsr#32 - add w7,w7,w23 - add x8,x8,x23,lsr#32 - add w9,w9,w24 - add x10,x10,x24,lsr#32 - add w11,w11,w25 - add x12,x12,x25,lsr#32 - add w13,w13,w26 - add x14,x14,x26,lsr#32 - add w15,w15,w27 - add x16,x16,x27,lsr#32 - add w17,w17,w28 - add x19,x19,x28,lsr#32 - add w20,w20,w30 - add x21,x21,x30,lsr#32 - - add x5,x5,x6,lsl#32 // pack - add x7,x7,x8,lsl#32 - ldp x6,x8,[x1,#0] // load input - add x9,x9,x10,lsl#32 - add x11,x11,x12,lsl#32 - ldp x10,x12,[x1,#16] - add x13,x13,x14,lsl#32 - add x15,x15,x16,lsl#32 - ldp x14,x16,[x1,#32] - add x17,x17,x19,lsl#32 - add x20,x20,x21,lsl#32 - ldp x19,x21,[x1,#48] - add x1,x1,#64 -#ifdef __AARCH64EB__ - rev x5,x5 - rev x7,x7 - rev x9,x9 - rev x11,x11 - rev x13,x13 - rev x15,x15 - rev x17,x17 - rev x20,x20 -#endif - eor x5,x5,x6 - eor 
x7,x7,x8 - eor x9,x9,x10 - eor x11,x11,x12 - eor x13,x13,x14 - eor x15,x15,x16 - eor x17,x17,x19 - eor x20,x20,x21 - - stp x5,x7,[x0,#0] // store output - add x28,x28,#1 // increment counter - mov w5,w22 // unpack key block - lsr x6,x22,#32 - stp x9,x11,[x0,#16] - mov w7,w23 - lsr x8,x23,#32 - stp x13,x15,[x0,#32] - mov w9,w24 - lsr x10,x24,#32 - stp x17,x20,[x0,#48] - add x0,x0,#64 - mov w11,w25 - lsr x12,x25,#32 - mov w13,w26 - lsr x14,x26,#32 - mov w15,w27 - lsr x16,x27,#32 - mov w17,w28 - lsr x19,x28,#32 - mov w20,w30 - lsr x21,x30,#32 - - mov x4,#5 -Loop_lower_neon: - sub x4,x4,#1 - add v0.4s,v0.4s,v1.4s - add w5,w5,w9 - add v4.4s,v4.4s,v5.4s - add w6,w6,w10 - add v8.4s,v8.4s,v9.4s - add w7,w7,w11 - add v12.4s,v12.4s,v13.4s - add w8,w8,w12 - add v16.4s,v16.4s,v17.4s - eor w17,w17,w5 - add v20.4s,v20.4s,v21.4s - eor w19,w19,w6 - eor v3.16b,v3.16b,v0.16b - eor w20,w20,w7 - eor v7.16b,v7.16b,v4.16b - eor w21,w21,w8 - eor v11.16b,v11.16b,v8.16b - ror w17,w17,#16 - eor v15.16b,v15.16b,v12.16b - ror w19,w19,#16 - eor v19.16b,v19.16b,v16.16b - ror w20,w20,#16 - eor v23.16b,v23.16b,v20.16b - ror w21,w21,#16 - rev32 v3.8h,v3.8h - add w13,w13,w17 - rev32 v7.8h,v7.8h - add w14,w14,w19 - rev32 v11.8h,v11.8h - add w15,w15,w20 - rev32 v15.8h,v15.8h - add w16,w16,w21 - rev32 v19.8h,v19.8h - eor w9,w9,w13 - rev32 v23.8h,v23.8h - eor w10,w10,w14 - add v2.4s,v2.4s,v3.4s - eor w11,w11,w15 - add v6.4s,v6.4s,v7.4s - eor w12,w12,w16 - add v10.4s,v10.4s,v11.4s - ror w9,w9,#20 - add v14.4s,v14.4s,v15.4s - ror w10,w10,#20 - add v18.4s,v18.4s,v19.4s - ror w11,w11,#20 - add v22.4s,v22.4s,v23.4s - ror w12,w12,#20 - eor v24.16b,v1.16b,v2.16b - add w5,w5,w9 - eor v25.16b,v5.16b,v6.16b - add w6,w6,w10 - eor v26.16b,v9.16b,v10.16b - add w7,w7,w11 - eor v27.16b,v13.16b,v14.16b - add w8,w8,w12 - eor v28.16b,v17.16b,v18.16b - eor w17,w17,w5 - eor v29.16b,v21.16b,v22.16b - eor w19,w19,w6 - ushr v1.4s,v24.4s,#20 - eor w20,w20,w7 - ushr v5.4s,v25.4s,#20 - eor w21,w21,w8 - ushr v9.4s,v26.4s,#20 - 
ror w17,w17,#24 - ushr v13.4s,v27.4s,#20 - ror w19,w19,#24 - ushr v17.4s,v28.4s,#20 - ror w20,w20,#24 - ushr v21.4s,v29.4s,#20 - ror w21,w21,#24 - sli v1.4s,v24.4s,#12 - add w13,w13,w17 - sli v5.4s,v25.4s,#12 - add w14,w14,w19 - sli v9.4s,v26.4s,#12 - add w15,w15,w20 - sli v13.4s,v27.4s,#12 - add w16,w16,w21 - sli v17.4s,v28.4s,#12 - eor w9,w9,w13 - sli v21.4s,v29.4s,#12 - eor w10,w10,w14 - add v0.4s,v0.4s,v1.4s - eor w11,w11,w15 - add v4.4s,v4.4s,v5.4s - eor w12,w12,w16 - add v8.4s,v8.4s,v9.4s - ror w9,w9,#25 - add v12.4s,v12.4s,v13.4s - ror w10,w10,#25 - add v16.4s,v16.4s,v17.4s - ror w11,w11,#25 - add v20.4s,v20.4s,v21.4s - ror w12,w12,#25 - eor v24.16b,v3.16b,v0.16b - add w5,w5,w10 - eor v25.16b,v7.16b,v4.16b - add w6,w6,w11 - eor v26.16b,v11.16b,v8.16b - add w7,w7,w12 - eor v27.16b,v15.16b,v12.16b - add w8,w8,w9 - eor v28.16b,v19.16b,v16.16b - eor w21,w21,w5 - eor v29.16b,v23.16b,v20.16b - eor w17,w17,w6 - ushr v3.4s,v24.4s,#24 - eor w19,w19,w7 - ushr v7.4s,v25.4s,#24 - eor w20,w20,w8 - ushr v11.4s,v26.4s,#24 - ror w21,w21,#16 - ushr v15.4s,v27.4s,#24 - ror w17,w17,#16 - ushr v19.4s,v28.4s,#24 - ror w19,w19,#16 - ushr v23.4s,v29.4s,#24 - ror w20,w20,#16 - sli v3.4s,v24.4s,#8 - add w15,w15,w21 - sli v7.4s,v25.4s,#8 - add w16,w16,w17 - sli v11.4s,v26.4s,#8 - add w13,w13,w19 - sli v15.4s,v27.4s,#8 - add w14,w14,w20 - sli v19.4s,v28.4s,#8 - eor w10,w10,w15 - sli v23.4s,v29.4s,#8 - eor w11,w11,w16 - add v2.4s,v2.4s,v3.4s - eor w12,w12,w13 - add v6.4s,v6.4s,v7.4s - eor w9,w9,w14 - add v10.4s,v10.4s,v11.4s - ror w10,w10,#20 - add v14.4s,v14.4s,v15.4s - ror w11,w11,#20 - add v18.4s,v18.4s,v19.4s - ror w12,w12,#20 - add v22.4s,v22.4s,v23.4s - ror w9,w9,#20 - eor v24.16b,v1.16b,v2.16b - add w5,w5,w10 - eor v25.16b,v5.16b,v6.16b - add w6,w6,w11 - eor v26.16b,v9.16b,v10.16b - add w7,w7,w12 - eor v27.16b,v13.16b,v14.16b - add w8,w8,w9 - eor v28.16b,v17.16b,v18.16b - eor w21,w21,w5 - eor v29.16b,v21.16b,v22.16b - eor w17,w17,w6 - ushr v1.4s,v24.4s,#25 - eor w19,w19,w7 - 
ushr v5.4s,v25.4s,#25 - eor w20,w20,w8 - ushr v9.4s,v26.4s,#25 - ror w21,w21,#24 - ushr v13.4s,v27.4s,#25 - ror w17,w17,#24 - ushr v17.4s,v28.4s,#25 - ror w19,w19,#24 - ushr v21.4s,v29.4s,#25 - ror w20,w20,#24 - sli v1.4s,v24.4s,#7 - add w15,w15,w21 - sli v5.4s,v25.4s,#7 - add w16,w16,w17 - sli v9.4s,v26.4s,#7 - add w13,w13,w19 - sli v13.4s,v27.4s,#7 - add w14,w14,w20 - sli v17.4s,v28.4s,#7 - eor w10,w10,w15 - sli v21.4s,v29.4s,#7 - eor w11,w11,w16 - ext v2.16b,v2.16b,v2.16b,#8 - eor w12,w12,w13 - ext v6.16b,v6.16b,v6.16b,#8 - eor w9,w9,w14 - ext v10.16b,v10.16b,v10.16b,#8 - ror w10,w10,#25 - ext v14.16b,v14.16b,v14.16b,#8 - ror w11,w11,#25 - ext v18.16b,v18.16b,v18.16b,#8 - ror w12,w12,#25 - ext v22.16b,v22.16b,v22.16b,#8 - ror w9,w9,#25 - ext v3.16b,v3.16b,v3.16b,#12 - ext v7.16b,v7.16b,v7.16b,#12 - ext v11.16b,v11.16b,v11.16b,#12 - ext v15.16b,v15.16b,v15.16b,#12 - ext v19.16b,v19.16b,v19.16b,#12 - ext v23.16b,v23.16b,v23.16b,#12 - ext v1.16b,v1.16b,v1.16b,#4 - ext v5.16b,v5.16b,v5.16b,#4 - ext v9.16b,v9.16b,v9.16b,#4 - ext v13.16b,v13.16b,v13.16b,#4 - ext v17.16b,v17.16b,v17.16b,#4 - ext v21.16b,v21.16b,v21.16b,#4 - add v0.4s,v0.4s,v1.4s - add w5,w5,w9 - add v4.4s,v4.4s,v5.4s - add w6,w6,w10 - add v8.4s,v8.4s,v9.4s - add w7,w7,w11 - add v12.4s,v12.4s,v13.4s - add w8,w8,w12 - add v16.4s,v16.4s,v17.4s - eor w17,w17,w5 - add v20.4s,v20.4s,v21.4s - eor w19,w19,w6 - eor v3.16b,v3.16b,v0.16b - eor w20,w20,w7 - eor v7.16b,v7.16b,v4.16b - eor w21,w21,w8 - eor v11.16b,v11.16b,v8.16b - ror w17,w17,#16 - eor v15.16b,v15.16b,v12.16b - ror w19,w19,#16 - eor v19.16b,v19.16b,v16.16b - ror w20,w20,#16 - eor v23.16b,v23.16b,v20.16b - ror w21,w21,#16 - rev32 v3.8h,v3.8h - add w13,w13,w17 - rev32 v7.8h,v7.8h - add w14,w14,w19 - rev32 v11.8h,v11.8h - add w15,w15,w20 - rev32 v15.8h,v15.8h - add w16,w16,w21 - rev32 v19.8h,v19.8h - eor w9,w9,w13 - rev32 v23.8h,v23.8h - eor w10,w10,w14 - add v2.4s,v2.4s,v3.4s - eor w11,w11,w15 - add v6.4s,v6.4s,v7.4s - eor w12,w12,w16 - add 
v10.4s,v10.4s,v11.4s - ror w9,w9,#20 - add v14.4s,v14.4s,v15.4s - ror w10,w10,#20 - add v18.4s,v18.4s,v19.4s - ror w11,w11,#20 - add v22.4s,v22.4s,v23.4s - ror w12,w12,#20 - eor v24.16b,v1.16b,v2.16b - add w5,w5,w9 - eor v25.16b,v5.16b,v6.16b - add w6,w6,w10 - eor v26.16b,v9.16b,v10.16b - add w7,w7,w11 - eor v27.16b,v13.16b,v14.16b - add w8,w8,w12 - eor v28.16b,v17.16b,v18.16b - eor w17,w17,w5 - eor v29.16b,v21.16b,v22.16b - eor w19,w19,w6 - ushr v1.4s,v24.4s,#20 - eor w20,w20,w7 - ushr v5.4s,v25.4s,#20 - eor w21,w21,w8 - ushr v9.4s,v26.4s,#20 - ror w17,w17,#24 - ushr v13.4s,v27.4s,#20 - ror w19,w19,#24 - ushr v17.4s,v28.4s,#20 - ror w20,w20,#24 - ushr v21.4s,v29.4s,#20 - ror w21,w21,#24 - sli v1.4s,v24.4s,#12 - add w13,w13,w17 - sli v5.4s,v25.4s,#12 - add w14,w14,w19 - sli v9.4s,v26.4s,#12 - add w15,w15,w20 - sli v13.4s,v27.4s,#12 - add w16,w16,w21 - sli v17.4s,v28.4s,#12 - eor w9,w9,w13 - sli v21.4s,v29.4s,#12 - eor w10,w10,w14 - add v0.4s,v0.4s,v1.4s - eor w11,w11,w15 - add v4.4s,v4.4s,v5.4s - eor w12,w12,w16 - add v8.4s,v8.4s,v9.4s - ror w9,w9,#25 - add v12.4s,v12.4s,v13.4s - ror w10,w10,#25 - add v16.4s,v16.4s,v17.4s - ror w11,w11,#25 - add v20.4s,v20.4s,v21.4s - ror w12,w12,#25 - eor v24.16b,v3.16b,v0.16b - add w5,w5,w10 - eor v25.16b,v7.16b,v4.16b - add w6,w6,w11 - eor v26.16b,v11.16b,v8.16b - add w7,w7,w12 - eor v27.16b,v15.16b,v12.16b - add w8,w8,w9 - eor v28.16b,v19.16b,v16.16b - eor w21,w21,w5 - eor v29.16b,v23.16b,v20.16b - eor w17,w17,w6 - ushr v3.4s,v24.4s,#24 - eor w19,w19,w7 - ushr v7.4s,v25.4s,#24 - eor w20,w20,w8 - ushr v11.4s,v26.4s,#24 - ror w21,w21,#16 - ushr v15.4s,v27.4s,#24 - ror w17,w17,#16 - ushr v19.4s,v28.4s,#24 - ror w19,w19,#16 - ushr v23.4s,v29.4s,#24 - ror w20,w20,#16 - sli v3.4s,v24.4s,#8 - add w15,w15,w21 - sli v7.4s,v25.4s,#8 - add w16,w16,w17 - sli v11.4s,v26.4s,#8 - add w13,w13,w19 - sli v15.4s,v27.4s,#8 - add w14,w14,w20 - sli v19.4s,v28.4s,#8 - eor w10,w10,w15 - sli v23.4s,v29.4s,#8 - eor w11,w11,w16 - add v2.4s,v2.4s,v3.4s - 
eor w12,w12,w13 - add v6.4s,v6.4s,v7.4s - eor w9,w9,w14 - add v10.4s,v10.4s,v11.4s - ror w10,w10,#20 - add v14.4s,v14.4s,v15.4s - ror w11,w11,#20 - add v18.4s,v18.4s,v19.4s - ror w12,w12,#20 - add v22.4s,v22.4s,v23.4s - ror w9,w9,#20 - eor v24.16b,v1.16b,v2.16b - add w5,w5,w10 - eor v25.16b,v5.16b,v6.16b - add w6,w6,w11 - eor v26.16b,v9.16b,v10.16b - add w7,w7,w12 - eor v27.16b,v13.16b,v14.16b - add w8,w8,w9 - eor v28.16b,v17.16b,v18.16b - eor w21,w21,w5 - eor v29.16b,v21.16b,v22.16b - eor w17,w17,w6 - ushr v1.4s,v24.4s,#25 - eor w19,w19,w7 - ushr v5.4s,v25.4s,#25 - eor w20,w20,w8 - ushr v9.4s,v26.4s,#25 - ror w21,w21,#24 - ushr v13.4s,v27.4s,#25 - ror w17,w17,#24 - ushr v17.4s,v28.4s,#25 - ror w19,w19,#24 - ushr v21.4s,v29.4s,#25 - ror w20,w20,#24 - sli v1.4s,v24.4s,#7 - add w15,w15,w21 - sli v5.4s,v25.4s,#7 - add w16,w16,w17 - sli v9.4s,v26.4s,#7 - add w13,w13,w19 - sli v13.4s,v27.4s,#7 - add w14,w14,w20 - sli v17.4s,v28.4s,#7 - eor w10,w10,w15 - sli v21.4s,v29.4s,#7 - eor w11,w11,w16 - ext v2.16b,v2.16b,v2.16b,#8 - eor w12,w12,w13 - ext v6.16b,v6.16b,v6.16b,#8 - eor w9,w9,w14 - ext v10.16b,v10.16b,v10.16b,#8 - ror w10,w10,#25 - ext v14.16b,v14.16b,v14.16b,#8 - ror w11,w11,#25 - ext v18.16b,v18.16b,v18.16b,#8 - ror w12,w12,#25 - ext v22.16b,v22.16b,v22.16b,#8 - ror w9,w9,#25 - ext v3.16b,v3.16b,v3.16b,#4 - ext v7.16b,v7.16b,v7.16b,#4 - ext v11.16b,v11.16b,v11.16b,#4 - ext v15.16b,v15.16b,v15.16b,#4 - ext v19.16b,v19.16b,v19.16b,#4 - ext v23.16b,v23.16b,v23.16b,#4 - ext v1.16b,v1.16b,v1.16b,#12 - ext v5.16b,v5.16b,v5.16b,#12 - ext v9.16b,v9.16b,v9.16b,#12 - ext v13.16b,v13.16b,v13.16b,#12 - ext v17.16b,v17.16b,v17.16b,#12 - ext v21.16b,v21.16b,v21.16b,#12 - cbnz x4,Loop_lower_neon - - add w5,w5,w22 // accumulate key block - ldp q24,q25,[sp,#0] - add x6,x6,x22,lsr#32 - ldp q26,q27,[sp,#32] - add w7,w7,w23 - ldp q28,q29,[sp,#64] - add x8,x8,x23,lsr#32 - add v0.4s,v0.4s,v24.4s - add w9,w9,w24 - add v4.4s,v4.4s,v24.4s - add x10,x10,x24,lsr#32 - add v8.4s,v8.4s,v24.4s 
- add w11,w11,w25 - add v12.4s,v12.4s,v24.4s - add x12,x12,x25,lsr#32 - add v16.4s,v16.4s,v24.4s - add w13,w13,w26 - add v20.4s,v20.4s,v24.4s - add x14,x14,x26,lsr#32 - add v2.4s,v2.4s,v26.4s - add w15,w15,w27 - add v6.4s,v6.4s,v26.4s - add x16,x16,x27,lsr#32 - add v10.4s,v10.4s,v26.4s - add w17,w17,w28 - add v14.4s,v14.4s,v26.4s - add x19,x19,x28,lsr#32 - add v18.4s,v18.4s,v26.4s - add w20,w20,w30 - add v22.4s,v22.4s,v26.4s - add x21,x21,x30,lsr#32 - add v19.4s,v19.4s,v31.4s // +4 - add x5,x5,x6,lsl#32 // pack - add v23.4s,v23.4s,v31.4s // +4 - add x7,x7,x8,lsl#32 - add v3.4s,v3.4s,v27.4s - ldp x6,x8,[x1,#0] // load input - add v7.4s,v7.4s,v28.4s - add x9,x9,x10,lsl#32 - add v11.4s,v11.4s,v29.4s - add x11,x11,x12,lsl#32 - add v15.4s,v15.4s,v30.4s - ldp x10,x12,[x1,#16] - add v19.4s,v19.4s,v27.4s - add x13,x13,x14,lsl#32 - add v23.4s,v23.4s,v28.4s - add x15,x15,x16,lsl#32 - add v1.4s,v1.4s,v25.4s - ldp x14,x16,[x1,#32] - add v5.4s,v5.4s,v25.4s - add x17,x17,x19,lsl#32 - add v9.4s,v9.4s,v25.4s - add x20,x20,x21,lsl#32 - add v13.4s,v13.4s,v25.4s - ldp x19,x21,[x1,#48] - add v17.4s,v17.4s,v25.4s - add x1,x1,#64 - add v21.4s,v21.4s,v25.4s - -#ifdef __AARCH64EB__ - rev x5,x5 - rev x7,x7 - rev x9,x9 - rev x11,x11 - rev x13,x13 - rev x15,x15 - rev x17,x17 - rev x20,x20 -#endif - ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 - eor x5,x5,x6 - eor x7,x7,x8 - eor x9,x9,x10 - eor x11,x11,x12 - eor x13,x13,x14 - eor v0.16b,v0.16b,v24.16b - eor x15,x15,x16 - eor v1.16b,v1.16b,v25.16b - eor x17,x17,x19 - eor v2.16b,v2.16b,v26.16b - eor x20,x20,x21 - eor v3.16b,v3.16b,v27.16b - ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 - - stp x5,x7,[x0,#0] // store output - add x28,x28,#7 // increment counter - stp x9,x11,[x0,#16] - stp x13,x15,[x0,#32] - stp x17,x20,[x0,#48] - add x0,x0,#64 - st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 - - ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 - eor v4.16b,v4.16b,v24.16b - eor v5.16b,v5.16b,v25.16b - eor v6.16b,v6.16b,v26.16b - eor 
v7.16b,v7.16b,v27.16b - st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 - - ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 - eor v8.16b,v8.16b,v0.16b - ldp q24,q25,[sp,#0] - eor v9.16b,v9.16b,v1.16b - ldp q26,q27,[sp,#32] - eor v10.16b,v10.16b,v2.16b - eor v11.16b,v11.16b,v3.16b - st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 - - ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 - eor v12.16b,v12.16b,v4.16b - eor v13.16b,v13.16b,v5.16b - eor v14.16b,v14.16b,v6.16b - eor v15.16b,v15.16b,v7.16b - st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 - - ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 - eor v16.16b,v16.16b,v8.16b - eor v17.16b,v17.16b,v9.16b - eor v18.16b,v18.16b,v10.16b - eor v19.16b,v19.16b,v11.16b - st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 - - shl v0.4s,v31.4s,#1 // 4 -> 8 - eor v20.16b,v20.16b,v12.16b - eor v21.16b,v21.16b,v13.16b - eor v22.16b,v22.16b,v14.16b - eor v23.16b,v23.16b,v15.16b - st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 - - add v27.4s,v27.4s,v0.4s // += 8 - add v28.4s,v28.4s,v0.4s - add v29.4s,v29.4s,v0.4s - add v30.4s,v30.4s,v0.4s - - b.hs Loop_outer_512_neon - - adds x2,x2,#512 - ushr v0.4s,v31.4s,#2 // 4 -> 1 - - ldp d8,d9,[sp,#128+0] // meet ABI requirements - ldp d10,d11,[sp,#128+16] - ldp d12,d13,[sp,#128+32] - ldp d14,d15,[sp,#128+48] - - stp q24,q31,[sp,#0] // wipe off-load area - stp q24,q31,[sp,#32] - stp q24,q31,[sp,#64] - - b.eq Ldone_512_neon - - cmp x2,#192 - sub v27.4s,v27.4s,v0.4s // -= 1 - sub v28.4s,v28.4s,v0.4s - sub v29.4s,v29.4s,v0.4s - add sp,sp,#128 - b.hs Loop_outer_neon - - eor v25.16b,v25.16b,v25.16b - eor v26.16b,v26.16b,v26.16b - eor v27.16b,v27.16b,v27.16b - eor v28.16b,v28.16b,v28.16b - eor v29.16b,v29.16b,v29.16b - eor v30.16b,v30.16b,v30.16b - b Loop_outer - -Ldone_512_neon: - ldp x19,x20,[x29,#16] - add sp,sp,#128+64 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 - AARCH64_VALIDATE_LINK_REGISTER - ret - -#endif // !OPENSSL_NO_ASM diff --git 
a/third_party/boringssl/apple-aarch64/crypto/cipher_extra/chacha20_poly1305_armv8.S b/third_party/boringssl/apple-aarch64/crypto/cipher_extra/chacha20_poly1305_armv8.S deleted file mode 100644 index eea0722f..00000000 --- a/third_party/boringssl/apple-aarch64/crypto/cipher_extra/chacha20_poly1305_armv8.S +++ /dev/null @@ -1,3017 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -#include -.section __TEXT,__const - -.align 7 -Lchacha20_consts: -.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' -Linc: -.long 1,2,3,4 -Lrol8: -.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 -Lclamp: -.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC - -.text - - -.align 6 -Lpoly_hash_ad_internal: -.cfi_startproc - cbnz x4, Lpoly_hash_intro - ret - -Lpoly_hash_intro: - cmp x4, #16 - b.lt Lpoly_hash_ad_tail - ldp x11, x12, [x3], 16 - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - sub x4, x4, #16 - b 
Lpoly_hash_ad_internal - -Lpoly_hash_ad_tail: - cbz x4, Lpoly_hash_ad_ret - - eor v20.16b, v20.16b, v20.16b // Use T0 to load the AAD - sub x4, x4, #1 - -Lpoly_hash_tail_16_compose: - ext v20.16b, v20.16b, v20.16b, #15 - ldrb w11, [x3, x4] - mov v20.b[0], w11 - subs x4, x4, #1 - b.ge Lpoly_hash_tail_16_compose - mov x11, v20.d[0] - mov x12, v20.d[1] - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - -Lpoly_hash_ad_ret: - ret -.cfi_endproc - - -///////////////////////////////// -// -// void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *seal_data); -// -.globl _chacha20_poly1305_seal -.private_extern _chacha20_poly1305_seal - -.align 6 -_chacha20_poly1305_seal: - AARCH64_SIGN_LINK_REGISTER -.cfi_startproc - stp x29, x30, [sp, #-80]! -.cfi_def_cfa_offset 80 -.cfi_offset w30, -72 -.cfi_offset w29, -80 - mov x29, sp - // We probably could do .cfi_def_cfa w29, 80 at this point, but since - // we don't actually use the frame pointer like that, it's probably not - // worth bothering. 
- stp d8, d9, [sp, #16] - stp d10, d11, [sp, #32] - stp d12, d13, [sp, #48] - stp d14, d15, [sp, #64] -.cfi_offset b15, -8 -.cfi_offset b14, -16 -.cfi_offset b13, -24 -.cfi_offset b12, -32 -.cfi_offset b11, -40 -.cfi_offset b10, -48 -.cfi_offset b9, -56 -.cfi_offset b8, -64 - - adrp x11, Lchacha20_consts@PAGE - add x11, x11, Lchacha20_consts@PAGEOFF - - ld1 {v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values - ld1 {v28.16b - v30.16b}, [x5] - - mov x15, #1 // Prepare the Poly1305 state - mov x8, #0 - mov x9, #0 - mov x10, #0 - - ldr x12, [x5, #56] // The total cipher text length includes extra_in_len - add x12, x12, x2 - mov v31.d[0], x4 // Store the input and aad lengths - mov v31.d[1], x12 - - cmp x2, #128 - b.le Lseal_128 // Optimization for smaller buffers - - // Initially we prepare 5 ChaCha20 blocks. Four to encrypt up to 4 blocks (256 bytes) of plaintext, - // and one for the Poly1305 R and S keys. The first four blocks (A0-A3..D0-D3) are computed vertically, - // the fifth block (A4-D4) horizontally. 
- ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] - mov v4.16b, v24.16b - - ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 - mov v9.16b, v28.16b - - ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 - mov v14.16b, v29.16b - - ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] - add v15.4s, v15.4s, v25.4s - mov v19.16b, v30.16b - - sub x5, x5, #32 - - mov x6, #10 - -.align 5 -Lseal_init_rounds: - add v0.4s, v0.4s, v5.4s - add v1.4s, v1.4s, v6.4s - add v2.4s, v2.4s, v7.4s - add v3.4s, v3.4s, v8.4s - add v4.4s, v4.4s, v9.4s - - eor v15.16b, v15.16b, v0.16b - eor v16.16b, v16.16b, v1.16b - eor v17.16b, v17.16b, v2.16b - eor v18.16b, v18.16b, v3.16b - eor v19.16b, v19.16b, v4.16b - - rev32 v15.8h, v15.8h - rev32 v16.8h, v16.8h - rev32 v17.8h, v17.8h - rev32 v18.8h, v18.8h - rev32 v19.8h, v19.8h - - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v16.4s - add v12.4s, v12.4s, v17.4s - add v13.4s, v13.4s, v18.4s - add v14.4s, v14.4s, v19.4s - - eor v5.16b, v5.16b, v10.16b - eor v6.16b, v6.16b, v11.16b - eor v7.16b, v7.16b, v12.16b - eor v8.16b, v8.16b, v13.16b - eor v9.16b, v9.16b, v14.16b - - ushr v20.4s, v5.4s, #20 - sli v20.4s, v5.4s, #12 - ushr v5.4s, v6.4s, #20 - sli v5.4s, v6.4s, #12 - ushr v6.4s, v7.4s, #20 - sli v6.4s, v7.4s, #12 - ushr v7.4s, v8.4s, #20 - sli v7.4s, v8.4s, #12 - ushr v8.4s, v9.4s, #20 - sli v8.4s, v9.4s, #12 - - add v0.4s, v0.4s, v20.4s - add v1.4s, v1.4s, v5.4s - add v2.4s, v2.4s, v6.4s - add v3.4s, v3.4s, v7.4s - add v4.4s, v4.4s, v8.4s - - eor v15.16b, v15.16b, v0.16b - eor v16.16b, v16.16b, v1.16b - eor v17.16b, v17.16b, v2.16b - eor v18.16b, v18.16b, v3.16b - eor v19.16b, v19.16b, v4.16b - - tbl v15.16b, {v15.16b}, v26.16b - tbl v16.16b, {v16.16b}, v26.16b - tbl v17.16b, {v17.16b}, v26.16b - tbl v18.16b, {v18.16b}, v26.16b - tbl v19.16b, {v19.16b}, v26.16b - - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v16.4s - add v12.4s, v12.4s, v17.4s - add v13.4s, v13.4s, v18.4s - add v14.4s, v14.4s, v19.4s - - eor v20.16b, v20.16b, v10.16b - eor v5.16b, v5.16b, v11.16b - eor 
v6.16b, v6.16b, v12.16b - eor v7.16b, v7.16b, v13.16b - eor v8.16b, v8.16b, v14.16b - - ushr v9.4s, v8.4s, #25 - sli v9.4s, v8.4s, #7 - ushr v8.4s, v7.4s, #25 - sli v8.4s, v7.4s, #7 - ushr v7.4s, v6.4s, #25 - sli v7.4s, v6.4s, #7 - ushr v6.4s, v5.4s, #25 - sli v6.4s, v5.4s, #7 - ushr v5.4s, v20.4s, #25 - sli v5.4s, v20.4s, #7 - - ext v9.16b, v9.16b, v9.16b, #4 - ext v14.16b, v14.16b, v14.16b, #8 - ext v19.16b, v19.16b, v19.16b, #12 - add v0.4s, v0.4s, v6.4s - add v1.4s, v1.4s, v7.4s - add v2.4s, v2.4s, v8.4s - add v3.4s, v3.4s, v5.4s - add v4.4s, v4.4s, v9.4s - - eor v18.16b, v18.16b, v0.16b - eor v15.16b, v15.16b, v1.16b - eor v16.16b, v16.16b, v2.16b - eor v17.16b, v17.16b, v3.16b - eor v19.16b, v19.16b, v4.16b - - rev32 v18.8h, v18.8h - rev32 v15.8h, v15.8h - rev32 v16.8h, v16.8h - rev32 v17.8h, v17.8h - rev32 v19.8h, v19.8h - - add v12.4s, v12.4s, v18.4s - add v13.4s, v13.4s, v15.4s - add v10.4s, v10.4s, v16.4s - add v11.4s, v11.4s, v17.4s - add v14.4s, v14.4s, v19.4s - - eor v6.16b, v6.16b, v12.16b - eor v7.16b, v7.16b, v13.16b - eor v8.16b, v8.16b, v10.16b - eor v5.16b, v5.16b, v11.16b - eor v9.16b, v9.16b, v14.16b - - ushr v20.4s, v6.4s, #20 - sli v20.4s, v6.4s, #12 - ushr v6.4s, v7.4s, #20 - sli v6.4s, v7.4s, #12 - ushr v7.4s, v8.4s, #20 - sli v7.4s, v8.4s, #12 - ushr v8.4s, v5.4s, #20 - sli v8.4s, v5.4s, #12 - ushr v5.4s, v9.4s, #20 - sli v5.4s, v9.4s, #12 - - add v0.4s, v0.4s, v20.4s - add v1.4s, v1.4s, v6.4s - add v2.4s, v2.4s, v7.4s - add v3.4s, v3.4s, v8.4s - add v4.4s, v4.4s, v5.4s - - eor v18.16b, v18.16b, v0.16b - eor v15.16b, v15.16b, v1.16b - eor v16.16b, v16.16b, v2.16b - eor v17.16b, v17.16b, v3.16b - eor v19.16b, v19.16b, v4.16b - - tbl v18.16b, {v18.16b}, v26.16b - tbl v15.16b, {v15.16b}, v26.16b - tbl v16.16b, {v16.16b}, v26.16b - tbl v17.16b, {v17.16b}, v26.16b - tbl v19.16b, {v19.16b}, v26.16b - - add v12.4s, v12.4s, v18.4s - add v13.4s, v13.4s, v15.4s - add v10.4s, v10.4s, v16.4s - add v11.4s, v11.4s, v17.4s - add v14.4s, v14.4s, v19.4s - 
- eor v20.16b, v20.16b, v12.16b - eor v6.16b, v6.16b, v13.16b - eor v7.16b, v7.16b, v10.16b - eor v8.16b, v8.16b, v11.16b - eor v5.16b, v5.16b, v14.16b - - ushr v9.4s, v5.4s, #25 - sli v9.4s, v5.4s, #7 - ushr v5.4s, v8.4s, #25 - sli v5.4s, v8.4s, #7 - ushr v8.4s, v7.4s, #25 - sli v8.4s, v7.4s, #7 - ushr v7.4s, v6.4s, #25 - sli v7.4s, v6.4s, #7 - ushr v6.4s, v20.4s, #25 - sli v6.4s, v20.4s, #7 - - ext v9.16b, v9.16b, v9.16b, #12 - ext v14.16b, v14.16b, v14.16b, #8 - ext v19.16b, v19.16b, v19.16b, #4 - subs x6, x6, #1 - b.hi Lseal_init_rounds - - add v15.4s, v15.4s, v25.4s - mov x11, #4 - dup v20.4s, w11 - add v25.4s, v25.4s, v20.4s - - zip1 v20.4s, v0.4s, v1.4s - zip2 v21.4s, v0.4s, v1.4s - zip1 v22.4s, v2.4s, v3.4s - zip2 v23.4s, v2.4s, v3.4s - - zip1 v0.2d, v20.2d, v22.2d - zip2 v1.2d, v20.2d, v22.2d - zip1 v2.2d, v21.2d, v23.2d - zip2 v3.2d, v21.2d, v23.2d - - zip1 v20.4s, v5.4s, v6.4s - zip2 v21.4s, v5.4s, v6.4s - zip1 v22.4s, v7.4s, v8.4s - zip2 v23.4s, v7.4s, v8.4s - - zip1 v5.2d, v20.2d, v22.2d - zip2 v6.2d, v20.2d, v22.2d - zip1 v7.2d, v21.2d, v23.2d - zip2 v8.2d, v21.2d, v23.2d - - zip1 v20.4s, v10.4s, v11.4s - zip2 v21.4s, v10.4s, v11.4s - zip1 v22.4s, v12.4s, v13.4s - zip2 v23.4s, v12.4s, v13.4s - - zip1 v10.2d, v20.2d, v22.2d - zip2 v11.2d, v20.2d, v22.2d - zip1 v12.2d, v21.2d, v23.2d - zip2 v13.2d, v21.2d, v23.2d - - zip1 v20.4s, v15.4s, v16.4s - zip2 v21.4s, v15.4s, v16.4s - zip1 v22.4s, v17.4s, v18.4s - zip2 v23.4s, v17.4s, v18.4s - - zip1 v15.2d, v20.2d, v22.2d - zip2 v16.2d, v20.2d, v22.2d - zip1 v17.2d, v21.2d, v23.2d - zip2 v18.2d, v21.2d, v23.2d - - add v4.4s, v4.4s, v24.4s - add v9.4s, v9.4s, v28.4s - and v4.16b, v4.16b, v27.16b - - add v0.4s, v0.4s, v24.4s - add v5.4s, v5.4s, v28.4s - add v10.4s, v10.4s, v29.4s - add v15.4s, v15.4s, v30.4s - - add v1.4s, v1.4s, v24.4s - add v6.4s, v6.4s, v28.4s - add v11.4s, v11.4s, v29.4s - add v16.4s, v16.4s, v30.4s - - add v2.4s, v2.4s, v24.4s - add v7.4s, v7.4s, v28.4s - add v12.4s, v12.4s, v29.4s - add 
v17.4s, v17.4s, v30.4s - - add v3.4s, v3.4s, v24.4s - add v8.4s, v8.4s, v28.4s - add v13.4s, v13.4s, v29.4s - add v18.4s, v18.4s, v30.4s - - mov x16, v4.d[0] // Move the R key to GPRs - mov x17, v4.d[1] - mov v27.16b, v9.16b // Store the S key - - bl Lpoly_hash_ad_internal - - mov x3, x0 - cmp x2, #256 - b.le Lseal_tail - - ld1 {v20.16b - v23.16b}, [x1], #64 - eor v20.16b, v20.16b, v0.16b - eor v21.16b, v21.16b, v5.16b - eor v22.16b, v22.16b, v10.16b - eor v23.16b, v23.16b, v15.16b - st1 {v20.16b - v23.16b}, [x0], #64 - - ld1 {v20.16b - v23.16b}, [x1], #64 - eor v20.16b, v20.16b, v1.16b - eor v21.16b, v21.16b, v6.16b - eor v22.16b, v22.16b, v11.16b - eor v23.16b, v23.16b, v16.16b - st1 {v20.16b - v23.16b}, [x0], #64 - - ld1 {v20.16b - v23.16b}, [x1], #64 - eor v20.16b, v20.16b, v2.16b - eor v21.16b, v21.16b, v7.16b - eor v22.16b, v22.16b, v12.16b - eor v23.16b, v23.16b, v17.16b - st1 {v20.16b - v23.16b}, [x0], #64 - - ld1 {v20.16b - v23.16b}, [x1], #64 - eor v20.16b, v20.16b, v3.16b - eor v21.16b, v21.16b, v8.16b - eor v22.16b, v22.16b, v13.16b - eor v23.16b, v23.16b, v18.16b - st1 {v20.16b - v23.16b}, [x0], #64 - - sub x2, x2, #256 - - mov x6, #4 // In the first run of the loop we need to hash 256 bytes, therefore we hash one block for the first 4 rounds - mov x7, #6 // and two blocks for the remaining 6, for a total of (1 * 4 + 2 * 6) * 16 = 256 - -Lseal_main_loop: - adrp x11, Lchacha20_consts@PAGE - add x11, x11, Lchacha20_consts@PAGEOFF - - ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] - mov v4.16b, v24.16b - - ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 - mov v9.16b, v28.16b - - ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 - mov v14.16b, v29.16b - - ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] - add v15.4s, v15.4s, v25.4s - mov v19.16b, v30.16b - - eor v20.16b, v20.16b, v20.16b //zero - not v21.16b, v20.16b // -1 - sub v21.4s, v25.4s, v21.4s // Add +1 - ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) - add v19.4s, v19.4s, v20.4s - - sub x5, x5, #32 
-.align 5 -Lseal_main_loop_rounds: - add v0.4s, v0.4s, v5.4s - add v1.4s, v1.4s, v6.4s - add v2.4s, v2.4s, v7.4s - add v3.4s, v3.4s, v8.4s - add v4.4s, v4.4s, v9.4s - - eor v15.16b, v15.16b, v0.16b - eor v16.16b, v16.16b, v1.16b - eor v17.16b, v17.16b, v2.16b - eor v18.16b, v18.16b, v3.16b - eor v19.16b, v19.16b, v4.16b - - rev32 v15.8h, v15.8h - rev32 v16.8h, v16.8h - rev32 v17.8h, v17.8h - rev32 v18.8h, v18.8h - rev32 v19.8h, v19.8h - - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v16.4s - add v12.4s, v12.4s, v17.4s - add v13.4s, v13.4s, v18.4s - add v14.4s, v14.4s, v19.4s - - eor v5.16b, v5.16b, v10.16b - eor v6.16b, v6.16b, v11.16b - eor v7.16b, v7.16b, v12.16b - eor v8.16b, v8.16b, v13.16b - eor v9.16b, v9.16b, v14.16b - - ushr v20.4s, v5.4s, #20 - sli v20.4s, v5.4s, #12 - ushr v5.4s, v6.4s, #20 - sli v5.4s, v6.4s, #12 - ushr v6.4s, v7.4s, #20 - sli v6.4s, v7.4s, #12 - ushr v7.4s, v8.4s, #20 - sli v7.4s, v8.4s, #12 - ushr v8.4s, v9.4s, #20 - sli v8.4s, v9.4s, #12 - - add v0.4s, v0.4s, v20.4s - add v1.4s, v1.4s, v5.4s - add v2.4s, v2.4s, v6.4s - add v3.4s, v3.4s, v7.4s - add v4.4s, v4.4s, v8.4s - - eor v15.16b, v15.16b, v0.16b - eor v16.16b, v16.16b, v1.16b - eor v17.16b, v17.16b, v2.16b - eor v18.16b, v18.16b, v3.16b - eor v19.16b, v19.16b, v4.16b - - tbl v15.16b, {v15.16b}, v26.16b - tbl v16.16b, {v16.16b}, v26.16b - tbl v17.16b, {v17.16b}, v26.16b - tbl v18.16b, {v18.16b}, v26.16b - tbl v19.16b, {v19.16b}, v26.16b - - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v16.4s - add v12.4s, v12.4s, v17.4s - add v13.4s, v13.4s, v18.4s - add v14.4s, v14.4s, v19.4s - - eor v20.16b, v20.16b, v10.16b - eor v5.16b, v5.16b, v11.16b - eor v6.16b, v6.16b, v12.16b - eor v7.16b, v7.16b, v13.16b - eor v8.16b, v8.16b, v14.16b - - ushr v9.4s, v8.4s, #25 - sli v9.4s, v8.4s, #7 - ushr v8.4s, v7.4s, #25 - sli v8.4s, v7.4s, #7 - ushr v7.4s, v6.4s, #25 - sli v7.4s, v6.4s, #7 - ushr v6.4s, v5.4s, #25 - sli v6.4s, v5.4s, #7 - ushr v5.4s, v20.4s, #25 - sli v5.4s, v20.4s, #7 - - 
ext v9.16b, v9.16b, v9.16b, #4 - ext v14.16b, v14.16b, v14.16b, #8 - ext v19.16b, v19.16b, v19.16b, #12 - ldp x11, x12, [x3], 16 - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - add v0.4s, v0.4s, v6.4s - add v1.4s, v1.4s, v7.4s - add v2.4s, v2.4s, v8.4s - add v3.4s, v3.4s, v5.4s - add v4.4s, v4.4s, v9.4s - - eor v18.16b, v18.16b, v0.16b - eor v15.16b, v15.16b, v1.16b - eor v16.16b, v16.16b, v2.16b - eor v17.16b, v17.16b, v3.16b - eor v19.16b, v19.16b, v4.16b - - rev32 v18.8h, v18.8h - rev32 v15.8h, v15.8h - rev32 v16.8h, v16.8h - rev32 v17.8h, v17.8h - rev32 v19.8h, v19.8h - - add v12.4s, v12.4s, v18.4s - add v13.4s, v13.4s, v15.4s - add v10.4s, v10.4s, v16.4s - add v11.4s, v11.4s, v17.4s - add v14.4s, v14.4s, v19.4s - - eor v6.16b, v6.16b, v12.16b - eor v7.16b, v7.16b, v13.16b - eor v8.16b, v8.16b, v10.16b - eor v5.16b, v5.16b, v11.16b - eor v9.16b, v9.16b, v14.16b - - ushr v20.4s, v6.4s, #20 - sli v20.4s, v6.4s, #12 - ushr v6.4s, v7.4s, #20 - sli v6.4s, v7.4s, #12 - ushr v7.4s, v8.4s, #20 - sli v7.4s, v8.4s, #12 - ushr v8.4s, v5.4s, #20 - sli v8.4s, v5.4s, #12 - ushr v5.4s, v9.4s, #20 - sli v5.4s, v9.4s, #12 - - add v0.4s, v0.4s, v20.4s - add v1.4s, v1.4s, v6.4s - add v2.4s, v2.4s, v7.4s - add v3.4s, v3.4s, 
v8.4s - add v4.4s, v4.4s, v5.4s - - eor v18.16b, v18.16b, v0.16b - eor v15.16b, v15.16b, v1.16b - eor v16.16b, v16.16b, v2.16b - eor v17.16b, v17.16b, v3.16b - eor v19.16b, v19.16b, v4.16b - - tbl v18.16b, {v18.16b}, v26.16b - tbl v15.16b, {v15.16b}, v26.16b - tbl v16.16b, {v16.16b}, v26.16b - tbl v17.16b, {v17.16b}, v26.16b - tbl v19.16b, {v19.16b}, v26.16b - - add v12.4s, v12.4s, v18.4s - add v13.4s, v13.4s, v15.4s - add v10.4s, v10.4s, v16.4s - add v11.4s, v11.4s, v17.4s - add v14.4s, v14.4s, v19.4s - - eor v20.16b, v20.16b, v12.16b - eor v6.16b, v6.16b, v13.16b - eor v7.16b, v7.16b, v10.16b - eor v8.16b, v8.16b, v11.16b - eor v5.16b, v5.16b, v14.16b - - ushr v9.4s, v5.4s, #25 - sli v9.4s, v5.4s, #7 - ushr v5.4s, v8.4s, #25 - sli v5.4s, v8.4s, #7 - ushr v8.4s, v7.4s, #25 - sli v8.4s, v7.4s, #7 - ushr v7.4s, v6.4s, #25 - sli v7.4s, v6.4s, #7 - ushr v6.4s, v20.4s, #25 - sli v6.4s, v20.4s, #7 - - ext v9.16b, v9.16b, v9.16b, #12 - ext v14.16b, v14.16b, v14.16b, #8 - ext v19.16b, v19.16b, v19.16b, #4 - subs x6, x6, #1 - b.ge Lseal_main_loop_rounds - ldp x11, x12, [x3], 16 - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - subs x7, x7, #1 - b.gt Lseal_main_loop_rounds - - eor v20.16b, v20.16b, v20.16b //zero - not 
v21.16b, v20.16b // -1 - sub v21.4s, v25.4s, v21.4s // Add +1 - ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) - add v19.4s, v19.4s, v20.4s - - add v15.4s, v15.4s, v25.4s - mov x11, #5 - dup v20.4s, w11 - add v25.4s, v25.4s, v20.4s - - zip1 v20.4s, v0.4s, v1.4s - zip2 v21.4s, v0.4s, v1.4s - zip1 v22.4s, v2.4s, v3.4s - zip2 v23.4s, v2.4s, v3.4s - - zip1 v0.2d, v20.2d, v22.2d - zip2 v1.2d, v20.2d, v22.2d - zip1 v2.2d, v21.2d, v23.2d - zip2 v3.2d, v21.2d, v23.2d - - zip1 v20.4s, v5.4s, v6.4s - zip2 v21.4s, v5.4s, v6.4s - zip1 v22.4s, v7.4s, v8.4s - zip2 v23.4s, v7.4s, v8.4s - - zip1 v5.2d, v20.2d, v22.2d - zip2 v6.2d, v20.2d, v22.2d - zip1 v7.2d, v21.2d, v23.2d - zip2 v8.2d, v21.2d, v23.2d - - zip1 v20.4s, v10.4s, v11.4s - zip2 v21.4s, v10.4s, v11.4s - zip1 v22.4s, v12.4s, v13.4s - zip2 v23.4s, v12.4s, v13.4s - - zip1 v10.2d, v20.2d, v22.2d - zip2 v11.2d, v20.2d, v22.2d - zip1 v12.2d, v21.2d, v23.2d - zip2 v13.2d, v21.2d, v23.2d - - zip1 v20.4s, v15.4s, v16.4s - zip2 v21.4s, v15.4s, v16.4s - zip1 v22.4s, v17.4s, v18.4s - zip2 v23.4s, v17.4s, v18.4s - - zip1 v15.2d, v20.2d, v22.2d - zip2 v16.2d, v20.2d, v22.2d - zip1 v17.2d, v21.2d, v23.2d - zip2 v18.2d, v21.2d, v23.2d - - add v0.4s, v0.4s, v24.4s - add v5.4s, v5.4s, v28.4s - add v10.4s, v10.4s, v29.4s - add v15.4s, v15.4s, v30.4s - - add v1.4s, v1.4s, v24.4s - add v6.4s, v6.4s, v28.4s - add v11.4s, v11.4s, v29.4s - add v16.4s, v16.4s, v30.4s - - add v2.4s, v2.4s, v24.4s - add v7.4s, v7.4s, v28.4s - add v12.4s, v12.4s, v29.4s - add v17.4s, v17.4s, v30.4s - - add v3.4s, v3.4s, v24.4s - add v8.4s, v8.4s, v28.4s - add v13.4s, v13.4s, v29.4s - add v18.4s, v18.4s, v30.4s - - add v4.4s, v4.4s, v24.4s - add v9.4s, v9.4s, v28.4s - add v14.4s, v14.4s, v29.4s - add v19.4s, v19.4s, v30.4s - - cmp x2, #320 - b.le Lseal_tail - - ld1 {v20.16b - v23.16b}, [x1], #64 - eor v20.16b, v20.16b, v0.16b - eor v21.16b, v21.16b, v5.16b - eor v22.16b, v22.16b, v10.16b - eor v23.16b, v23.16b, v15.16b - st1 {v20.16b - 
v23.16b}, [x0], #64 - - ld1 {v20.16b - v23.16b}, [x1], #64 - eor v20.16b, v20.16b, v1.16b - eor v21.16b, v21.16b, v6.16b - eor v22.16b, v22.16b, v11.16b - eor v23.16b, v23.16b, v16.16b - st1 {v20.16b - v23.16b}, [x0], #64 - - ld1 {v20.16b - v23.16b}, [x1], #64 - eor v20.16b, v20.16b, v2.16b - eor v21.16b, v21.16b, v7.16b - eor v22.16b, v22.16b, v12.16b - eor v23.16b, v23.16b, v17.16b - st1 {v20.16b - v23.16b}, [x0], #64 - - ld1 {v20.16b - v23.16b}, [x1], #64 - eor v20.16b, v20.16b, v3.16b - eor v21.16b, v21.16b, v8.16b - eor v22.16b, v22.16b, v13.16b - eor v23.16b, v23.16b, v18.16b - st1 {v20.16b - v23.16b}, [x0], #64 - - ld1 {v20.16b - v23.16b}, [x1], #64 - eor v20.16b, v20.16b, v4.16b - eor v21.16b, v21.16b, v9.16b - eor v22.16b, v22.16b, v14.16b - eor v23.16b, v23.16b, v19.16b - st1 {v20.16b - v23.16b}, [x0], #64 - - sub x2, x2, #320 - - mov x6, #0 - mov x7, #10 // For the remainder of the loop we always hash and encrypt 320 bytes per iteration - - b Lseal_main_loop - -Lseal_tail: - // This part of the function handles the storage and authentication of the last [0,320) bytes - // We assume A0-A4 ... D0-D4 hold at least inl (320 max) bytes of the stream data. 
- cmp x2, #64 - b.lt Lseal_tail_64 - - // Store and authenticate 64B blocks per iteration - ld1 {v20.16b - v23.16b}, [x1], #64 - - eor v20.16b, v20.16b, v0.16b - eor v21.16b, v21.16b, v5.16b - eor v22.16b, v22.16b, v10.16b - eor v23.16b, v23.16b, v15.16b - mov x11, v20.d[0] - mov x12, v20.d[1] - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - mov x11, v21.d[0] - mov x12, v21.d[1] - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - mov x11, v22.d[0] - mov x12, 
v22.d[1] - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - mov x11, v23.d[0] - mov x12, v23.d[1] - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - st1 {v20.16b - v23.16b}, [x0], #64 - sub x2, x2, #64 - - // Shift the state left by 64 bytes for the next iteration of the loop - mov v0.16b, v1.16b - mov v5.16b, v6.16b - mov v10.16b, v11.16b - mov v15.16b, v16.16b - - mov v1.16b, v2.16b - mov v6.16b, v7.16b - mov v11.16b, v12.16b - mov v16.16b, v17.16b - - mov 
v2.16b, v3.16b - mov v7.16b, v8.16b - mov v12.16b, v13.16b - mov v17.16b, v18.16b - - mov v3.16b, v4.16b - mov v8.16b, v9.16b - mov v13.16b, v14.16b - mov v18.16b, v19.16b - - b Lseal_tail - -Lseal_tail_64: - ldp x3, x4, [x5, #48] // extra_in_len and extra_in_ptr - - // Here we handle the last [0,64) bytes of plaintext - cmp x2, #16 - b.lt Lseal_tail_16 - // Each iteration encrypt and authenticate a 16B block - ld1 {v20.16b}, [x1], #16 - eor v20.16b, v20.16b, v0.16b - mov x11, v20.d[0] - mov x12, v20.d[1] - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - st1 {v20.16b}, [x0], #16 - - sub x2, x2, #16 - - // Shift the state left by 16 bytes for the next iteration of the loop - mov v0.16b, v5.16b - mov v5.16b, v10.16b - mov v10.16b, v15.16b - - b Lseal_tail_64 - -Lseal_tail_16: - // Here we handle the last [0,16) bytes of ciphertext that require a padded block - cbz x2, Lseal_hash_extra - - eor v20.16b, v20.16b, v20.16b // Use T0 to load the plaintext/extra in - eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask that will only mask the ciphertext bytes - not v22.16b, v20.16b - - mov x6, x2 - add x1, x1, x2 - - cbz x4, Lseal_tail_16_compose // No extra data to pad with, zero padding - - mov x7, #16 // We 
need to load some extra_in first for padding - sub x7, x7, x2 - cmp x4, x7 - csel x7, x4, x7, lt // Load the minimum of extra_in_len and the amount needed to fill the register - mov x12, x7 - add x3, x3, x7 - sub x4, x4, x7 - -Lseal_tail16_compose_extra_in: - ext v20.16b, v20.16b, v20.16b, #15 - ldrb w11, [x3, #-1]! - mov v20.b[0], w11 - subs x7, x7, #1 - b.gt Lseal_tail16_compose_extra_in - - add x3, x3, x12 - -Lseal_tail_16_compose: - ext v20.16b, v20.16b, v20.16b, #15 - ldrb w11, [x1, #-1]! - mov v20.b[0], w11 - ext v21.16b, v22.16b, v21.16b, #15 - subs x2, x2, #1 - b.gt Lseal_tail_16_compose - - and v0.16b, v0.16b, v21.16b - eor v20.16b, v20.16b, v0.16b - mov v21.16b, v20.16b - -Lseal_tail_16_store: - umov w11, v20.b[0] - strb w11, [x0], #1 - ext v20.16b, v20.16b, v20.16b, #1 - subs x6, x6, #1 - b.gt Lseal_tail_16_store - - // Hash in the final ct block concatenated with extra_in - mov x11, v21.d[0] - mov x12, v21.d[1] - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - -Lseal_hash_extra: - cbz x4, Lseal_finalize - -Lseal_hash_extra_loop: - cmp x4, #16 - b.lt Lseal_hash_extra_tail - ld1 {v20.16b}, [x3], #16 - mov x11, v20.d[0] - mov x12, v20.d[1] - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul 
x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - sub x4, x4, #16 - b Lseal_hash_extra_loop - -Lseal_hash_extra_tail: - cbz x4, Lseal_finalize - eor v20.16b, v20.16b, v20.16b // Use T0 to load the remaining extra ciphertext - add x3, x3, x4 - -Lseal_hash_extra_load: - ext v20.16b, v20.16b, v20.16b, #15 - ldrb w11, [x3, #-1]! 
- mov v20.b[0], w11 - subs x4, x4, #1 - b.gt Lseal_hash_extra_load - - // Hash in the final padded extra_in blcok - mov x11, v20.d[0] - mov x12, v20.d[1] - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - -Lseal_finalize: - mov x11, v31.d[0] - mov x12, v31.d[1] - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - // Final reduction step - sub x12, xzr, x15 - orr x13, xzr, #3 - subs x11, x8, #-5 - sbcs x12, x9, x12 - sbcs x13, x10, x13 - csel x8, x11, x8, cs - 
csel x9, x12, x9, cs - csel x10, x13, x10, cs - mov x11, v27.d[0] - mov x12, v27.d[1] - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - - stp x8, x9, [x5] - - ldp d8, d9, [sp, #16] - ldp d10, d11, [sp, #32] - ldp d12, d13, [sp, #48] - ldp d14, d15, [sp, #64] -.cfi_restore b15 -.cfi_restore b14 -.cfi_restore b13 -.cfi_restore b12 -.cfi_restore b11 -.cfi_restore b10 -.cfi_restore b9 -.cfi_restore b8 - ldp x29, x30, [sp], 80 -.cfi_restore w29 -.cfi_restore w30 -.cfi_def_cfa_offset 0 - AARCH64_VALIDATE_LINK_REGISTER - ret - -Lseal_128: - // On some architectures preparing 5 blocks for small buffers is wasteful - eor v25.16b, v25.16b, v25.16b - mov x11, #1 - mov v25.s[0], w11 - mov v0.16b, v24.16b - mov v1.16b, v24.16b - mov v2.16b, v24.16b - mov v5.16b, v28.16b - mov v6.16b, v28.16b - mov v7.16b, v28.16b - mov v10.16b, v29.16b - mov v11.16b, v29.16b - mov v12.16b, v29.16b - mov v17.16b, v30.16b - add v15.4s, v17.4s, v25.4s - add v16.4s, v15.4s, v25.4s - - mov x6, #10 - -Lseal_128_rounds: - add v0.4s, v0.4s, v5.4s - add v1.4s, v1.4s, v6.4s - add v2.4s, v2.4s, v7.4s - eor v15.16b, v15.16b, v0.16b - eor v16.16b, v16.16b, v1.16b - eor v17.16b, v17.16b, v2.16b - rev32 v15.8h, v15.8h - rev32 v16.8h, v16.8h - rev32 v17.8h, v17.8h - - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v16.4s - add v12.4s, v12.4s, v17.4s - eor v5.16b, v5.16b, v10.16b - eor v6.16b, v6.16b, v11.16b - eor v7.16b, v7.16b, v12.16b - ushr v20.4s, v5.4s, #20 - sli v20.4s, v5.4s, #12 - ushr v5.4s, v6.4s, #20 - sli v5.4s, v6.4s, #12 - ushr v6.4s, v7.4s, #20 - sli v6.4s, v7.4s, #12 - - add v0.4s, v0.4s, v20.4s - add v1.4s, v1.4s, v5.4s - add v2.4s, v2.4s, v6.4s - eor v15.16b, v15.16b, v0.16b - eor v16.16b, v16.16b, v1.16b - eor v17.16b, v17.16b, v2.16b - tbl v15.16b, {v15.16b}, v26.16b - tbl v16.16b, {v16.16b}, v26.16b - tbl v17.16b, {v17.16b}, v26.16b - - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v16.4s - add v12.4s, v12.4s, v17.4s - eor v20.16b, v20.16b, v10.16b - eor v5.16b, v5.16b, 
v11.16b - eor v6.16b, v6.16b, v12.16b - ushr v7.4s, v6.4s, #25 - sli v7.4s, v6.4s, #7 - ushr v6.4s, v5.4s, #25 - sli v6.4s, v5.4s, #7 - ushr v5.4s, v20.4s, #25 - sli v5.4s, v20.4s, #7 - - ext v5.16b, v5.16b, v5.16b, #4 - ext v6.16b, v6.16b, v6.16b, #4 - ext v7.16b, v7.16b, v7.16b, #4 - - ext v10.16b, v10.16b, v10.16b, #8 - ext v11.16b, v11.16b, v11.16b, #8 - ext v12.16b, v12.16b, v12.16b, #8 - - ext v15.16b, v15.16b, v15.16b, #12 - ext v16.16b, v16.16b, v16.16b, #12 - ext v17.16b, v17.16b, v17.16b, #12 - add v0.4s, v0.4s, v5.4s - add v1.4s, v1.4s, v6.4s - add v2.4s, v2.4s, v7.4s - eor v15.16b, v15.16b, v0.16b - eor v16.16b, v16.16b, v1.16b - eor v17.16b, v17.16b, v2.16b - rev32 v15.8h, v15.8h - rev32 v16.8h, v16.8h - rev32 v17.8h, v17.8h - - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v16.4s - add v12.4s, v12.4s, v17.4s - eor v5.16b, v5.16b, v10.16b - eor v6.16b, v6.16b, v11.16b - eor v7.16b, v7.16b, v12.16b - ushr v20.4s, v5.4s, #20 - sli v20.4s, v5.4s, #12 - ushr v5.4s, v6.4s, #20 - sli v5.4s, v6.4s, #12 - ushr v6.4s, v7.4s, #20 - sli v6.4s, v7.4s, #12 - - add v0.4s, v0.4s, v20.4s - add v1.4s, v1.4s, v5.4s - add v2.4s, v2.4s, v6.4s - eor v15.16b, v15.16b, v0.16b - eor v16.16b, v16.16b, v1.16b - eor v17.16b, v17.16b, v2.16b - tbl v15.16b, {v15.16b}, v26.16b - tbl v16.16b, {v16.16b}, v26.16b - tbl v17.16b, {v17.16b}, v26.16b - - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v16.4s - add v12.4s, v12.4s, v17.4s - eor v20.16b, v20.16b, v10.16b - eor v5.16b, v5.16b, v11.16b - eor v6.16b, v6.16b, v12.16b - ushr v7.4s, v6.4s, #25 - sli v7.4s, v6.4s, #7 - ushr v6.4s, v5.4s, #25 - sli v6.4s, v5.4s, #7 - ushr v5.4s, v20.4s, #25 - sli v5.4s, v20.4s, #7 - - ext v5.16b, v5.16b, v5.16b, #12 - ext v6.16b, v6.16b, v6.16b, #12 - ext v7.16b, v7.16b, v7.16b, #12 - - ext v10.16b, v10.16b, v10.16b, #8 - ext v11.16b, v11.16b, v11.16b, #8 - ext v12.16b, v12.16b, v12.16b, #8 - - ext v15.16b, v15.16b, v15.16b, #4 - ext v16.16b, v16.16b, v16.16b, #4 - ext v17.16b, v17.16b, v17.16b, 
#4 - subs x6, x6, #1 - b.hi Lseal_128_rounds - - add v0.4s, v0.4s, v24.4s - add v1.4s, v1.4s, v24.4s - add v2.4s, v2.4s, v24.4s - - add v5.4s, v5.4s, v28.4s - add v6.4s, v6.4s, v28.4s - add v7.4s, v7.4s, v28.4s - - // Only the first 32 bytes of the third block (counter = 0) are needed, - // so skip updating v12 and v17. - add v10.4s, v10.4s, v29.4s - add v11.4s, v11.4s, v29.4s - - add v30.4s, v30.4s, v25.4s - add v15.4s, v15.4s, v30.4s - add v30.4s, v30.4s, v25.4s - add v16.4s, v16.4s, v30.4s - - and v2.16b, v2.16b, v27.16b - mov x16, v2.d[0] // Move the R key to GPRs - mov x17, v2.d[1] - mov v27.16b, v7.16b // Store the S key - - bl Lpoly_hash_ad_internal - b Lseal_tail -.cfi_endproc - - -///////////////////////////////// -// -// void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *aead_data); -// -.globl _chacha20_poly1305_open -.private_extern _chacha20_poly1305_open - -.align 6 -_chacha20_poly1305_open: - AARCH64_SIGN_LINK_REGISTER -.cfi_startproc - stp x29, x30, [sp, #-80]! -.cfi_def_cfa_offset 80 -.cfi_offset w30, -72 -.cfi_offset w29, -80 - mov x29, sp - // We probably could do .cfi_def_cfa w29, 80 at this point, but since - // we don't actually use the frame pointer like that, it's probably not - // worth bothering. 
- stp d8, d9, [sp, #16] - stp d10, d11, [sp, #32] - stp d12, d13, [sp, #48] - stp d14, d15, [sp, #64] -.cfi_offset b15, -8 -.cfi_offset b14, -16 -.cfi_offset b13, -24 -.cfi_offset b12, -32 -.cfi_offset b11, -40 -.cfi_offset b10, -48 -.cfi_offset b9, -56 -.cfi_offset b8, -64 - - adrp x11, Lchacha20_consts@PAGE - add x11, x11, Lchacha20_consts@PAGEOFF - - ld1 {v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values - ld1 {v28.16b - v30.16b}, [x5] - - mov x15, #1 // Prepare the Poly1305 state - mov x8, #0 - mov x9, #0 - mov x10, #0 - - mov v31.d[0], x4 // Store the input and aad lengths - mov v31.d[1], x2 - - cmp x2, #128 - b.le Lopen_128 // Optimization for smaller buffers - - // Initially we prepare a single ChaCha20 block for the Poly1305 R and S keys - mov v0.16b, v24.16b - mov v5.16b, v28.16b - mov v10.16b, v29.16b - mov v15.16b, v30.16b - - mov x6, #10 - -.align 5 -Lopen_init_rounds: - add v0.4s, v0.4s, v5.4s - eor v15.16b, v15.16b, v0.16b - rev32 v15.8h, v15.8h - - add v10.4s, v10.4s, v15.4s - eor v5.16b, v5.16b, v10.16b - ushr v20.4s, v5.4s, #20 - sli v20.4s, v5.4s, #12 - add v0.4s, v0.4s, v20.4s - eor v15.16b, v15.16b, v0.16b - tbl v15.16b, {v15.16b}, v26.16b - - add v10.4s, v10.4s, v15.4s - eor v20.16b, v20.16b, v10.16b - ushr v5.4s, v20.4s, #25 - sli v5.4s, v20.4s, #7 - ext v5.16b, v5.16b, v5.16b, #4 - ext v10.16b, v10.16b, v10.16b, #8 - ext v15.16b, v15.16b, v15.16b, #12 - add v0.4s, v0.4s, v5.4s - eor v15.16b, v15.16b, v0.16b - rev32 v15.8h, v15.8h - - add v10.4s, v10.4s, v15.4s - eor v5.16b, v5.16b, v10.16b - ushr v20.4s, v5.4s, #20 - sli v20.4s, v5.4s, #12 - add v0.4s, v0.4s, v20.4s - eor v15.16b, v15.16b, v0.16b - tbl v15.16b, {v15.16b}, v26.16b - - add v10.4s, v10.4s, v15.4s - eor v20.16b, v20.16b, v10.16b - ushr v5.4s, v20.4s, #25 - sli v5.4s, v20.4s, #7 - ext v5.16b, v5.16b, v5.16b, #12 - ext v10.16b, v10.16b, v10.16b, #8 - ext v15.16b, v15.16b, v15.16b, #4 - subs x6, x6, #1 - b.hi Lopen_init_rounds - - add v0.4s, v0.4s, v24.4s - 
add v5.4s, v5.4s, v28.4s - - and v0.16b, v0.16b, v27.16b - mov x16, v0.d[0] // Move the R key to GPRs - mov x17, v0.d[1] - mov v27.16b, v5.16b // Store the S key - - bl Lpoly_hash_ad_internal - -Lopen_ad_done: - mov x3, x1 - -// Each iteration of the loop hash 320 bytes, and prepare stream for 320 bytes -Lopen_main_loop: - - cmp x2, #192 - b.lt Lopen_tail - - adrp x11, Lchacha20_consts@PAGE - add x11, x11, Lchacha20_consts@PAGEOFF - - ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] - mov v4.16b, v24.16b - - ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 - mov v9.16b, v28.16b - - ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 - mov v14.16b, v29.16b - - ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] - sub x5, x5, #32 - add v15.4s, v15.4s, v25.4s - mov v19.16b, v30.16b - - eor v20.16b, v20.16b, v20.16b //zero - not v21.16b, v20.16b // -1 - sub v21.4s, v25.4s, v21.4s // Add +1 - ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) - add v19.4s, v19.4s, v20.4s - - lsr x4, x2, #4 // How many whole blocks we have to hash, will always be at least 12 - sub x4, x4, #10 - - mov x7, #10 - subs x6, x7, x4 - subs x6, x7, x4 // itr1 can be negative if we have more than 320 bytes to hash - csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are full - - cbz x7, Lopen_main_loop_rounds_short - -.align 5 -Lopen_main_loop_rounds: - ldp x11, x12, [x3], 16 - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, 
x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most -Lopen_main_loop_rounds_short: - add v0.4s, v0.4s, v5.4s - add v1.4s, v1.4s, v6.4s - add v2.4s, v2.4s, v7.4s - add v3.4s, v3.4s, v8.4s - add v4.4s, v4.4s, v9.4s - - eor v15.16b, v15.16b, v0.16b - eor v16.16b, v16.16b, v1.16b - eor v17.16b, v17.16b, v2.16b - eor v18.16b, v18.16b, v3.16b - eor v19.16b, v19.16b, v4.16b - - rev32 v15.8h, v15.8h - rev32 v16.8h, v16.8h - rev32 v17.8h, v17.8h - rev32 v18.8h, v18.8h - rev32 v19.8h, v19.8h - - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v16.4s - add v12.4s, v12.4s, v17.4s - add v13.4s, v13.4s, v18.4s - add v14.4s, v14.4s, v19.4s - - eor v5.16b, v5.16b, v10.16b - eor v6.16b, v6.16b, v11.16b - eor v7.16b, v7.16b, v12.16b - eor v8.16b, v8.16b, v13.16b - eor v9.16b, v9.16b, v14.16b - - ushr v20.4s, v5.4s, #20 - sli v20.4s, v5.4s, #12 - ushr v5.4s, v6.4s, #20 - sli v5.4s, v6.4s, #12 - ushr v6.4s, v7.4s, #20 - sli v6.4s, v7.4s, #12 - ushr v7.4s, v8.4s, #20 - sli v7.4s, v8.4s, #12 - ushr v8.4s, v9.4s, #20 - sli v8.4s, v9.4s, #12 - - add v0.4s, v0.4s, v20.4s - add v1.4s, v1.4s, v5.4s - add v2.4s, v2.4s, v6.4s - add v3.4s, v3.4s, v7.4s - add v4.4s, v4.4s, v8.4s - - eor v15.16b, v15.16b, v0.16b - eor v16.16b, v16.16b, v1.16b - eor v17.16b, v17.16b, v2.16b - eor v18.16b, v18.16b, v3.16b - eor v19.16b, v19.16b, v4.16b - - tbl v15.16b, {v15.16b}, v26.16b - tbl v16.16b, {v16.16b}, v26.16b - tbl v17.16b, {v17.16b}, v26.16b - tbl v18.16b, {v18.16b}, v26.16b - tbl v19.16b, {v19.16b}, v26.16b - - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v16.4s - add v12.4s, v12.4s, v17.4s - add v13.4s, v13.4s, v18.4s - add v14.4s, v14.4s, v19.4s - - eor v20.16b, v20.16b, v10.16b - eor v5.16b, v5.16b, v11.16b - eor v6.16b, v6.16b, v12.16b - eor v7.16b, v7.16b, v13.16b - eor v8.16b, v8.16b, v14.16b - - ushr v9.4s, v8.4s, #25 - sli v9.4s, v8.4s, 
#7 - ushr v8.4s, v7.4s, #25 - sli v8.4s, v7.4s, #7 - ushr v7.4s, v6.4s, #25 - sli v7.4s, v6.4s, #7 - ushr v6.4s, v5.4s, #25 - sli v6.4s, v5.4s, #7 - ushr v5.4s, v20.4s, #25 - sli v5.4s, v20.4s, #7 - - ext v9.16b, v9.16b, v9.16b, #4 - ext v14.16b, v14.16b, v14.16b, #8 - ext v19.16b, v19.16b, v19.16b, #12 - ldp x11, x12, [x3], 16 - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - add v0.4s, v0.4s, v6.4s - add v1.4s, v1.4s, v7.4s - add v2.4s, v2.4s, v8.4s - add v3.4s, v3.4s, v5.4s - add v4.4s, v4.4s, v9.4s - - eor v18.16b, v18.16b, v0.16b - eor v15.16b, v15.16b, v1.16b - eor v16.16b, v16.16b, v2.16b - eor v17.16b, v17.16b, v3.16b - eor v19.16b, v19.16b, v4.16b - - rev32 v18.8h, v18.8h - rev32 v15.8h, v15.8h - rev32 v16.8h, v16.8h - rev32 v17.8h, v17.8h - rev32 v19.8h, v19.8h - - add v12.4s, v12.4s, v18.4s - add v13.4s, v13.4s, v15.4s - add v10.4s, v10.4s, v16.4s - add v11.4s, v11.4s, v17.4s - add v14.4s, v14.4s, v19.4s - - eor v6.16b, v6.16b, v12.16b - eor v7.16b, v7.16b, v13.16b - eor v8.16b, v8.16b, v10.16b - eor v5.16b, v5.16b, v11.16b - eor v9.16b, v9.16b, v14.16b - - ushr v20.4s, v6.4s, #20 - sli v20.4s, v6.4s, #12 - ushr v6.4s, v7.4s, #20 - sli v6.4s, v7.4s, #12 - ushr v7.4s, v8.4s, #20 - sli v7.4s, v8.4s, #12 
- ushr v8.4s, v5.4s, #20 - sli v8.4s, v5.4s, #12 - ushr v5.4s, v9.4s, #20 - sli v5.4s, v9.4s, #12 - - add v0.4s, v0.4s, v20.4s - add v1.4s, v1.4s, v6.4s - add v2.4s, v2.4s, v7.4s - add v3.4s, v3.4s, v8.4s - add v4.4s, v4.4s, v5.4s - - eor v18.16b, v18.16b, v0.16b - eor v15.16b, v15.16b, v1.16b - eor v16.16b, v16.16b, v2.16b - eor v17.16b, v17.16b, v3.16b - eor v19.16b, v19.16b, v4.16b - - tbl v18.16b, {v18.16b}, v26.16b - tbl v15.16b, {v15.16b}, v26.16b - tbl v16.16b, {v16.16b}, v26.16b - tbl v17.16b, {v17.16b}, v26.16b - tbl v19.16b, {v19.16b}, v26.16b - - add v12.4s, v12.4s, v18.4s - add v13.4s, v13.4s, v15.4s - add v10.4s, v10.4s, v16.4s - add v11.4s, v11.4s, v17.4s - add v14.4s, v14.4s, v19.4s - - eor v20.16b, v20.16b, v12.16b - eor v6.16b, v6.16b, v13.16b - eor v7.16b, v7.16b, v10.16b - eor v8.16b, v8.16b, v11.16b - eor v5.16b, v5.16b, v14.16b - - ushr v9.4s, v5.4s, #25 - sli v9.4s, v5.4s, #7 - ushr v5.4s, v8.4s, #25 - sli v5.4s, v8.4s, #7 - ushr v8.4s, v7.4s, #25 - sli v8.4s, v7.4s, #7 - ushr v7.4s, v6.4s, #25 - sli v7.4s, v6.4s, #7 - ushr v6.4s, v20.4s, #25 - sli v6.4s, v20.4s, #7 - - ext v9.16b, v9.16b, v9.16b, #12 - ext v14.16b, v14.16b, v14.16b, #8 - ext v19.16b, v19.16b, v19.16b, #4 - subs x7, x7, #1 - b.gt Lopen_main_loop_rounds - subs x6, x6, #1 - b.ge Lopen_main_loop_rounds_short - - eor v20.16b, v20.16b, v20.16b //zero - not v21.16b, v20.16b // -1 - sub v21.4s, v25.4s, v21.4s // Add +1 - ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) - add v19.4s, v19.4s, v20.4s - - add v15.4s, v15.4s, v25.4s - mov x11, #5 - dup v20.4s, w11 - add v25.4s, v25.4s, v20.4s - - zip1 v20.4s, v0.4s, v1.4s - zip2 v21.4s, v0.4s, v1.4s - zip1 v22.4s, v2.4s, v3.4s - zip2 v23.4s, v2.4s, v3.4s - - zip1 v0.2d, v20.2d, v22.2d - zip2 v1.2d, v20.2d, v22.2d - zip1 v2.2d, v21.2d, v23.2d - zip2 v3.2d, v21.2d, v23.2d - - zip1 v20.4s, v5.4s, v6.4s - zip2 v21.4s, v5.4s, v6.4s - zip1 v22.4s, v7.4s, v8.4s - zip2 v23.4s, v7.4s, v8.4s - - zip1 v5.2d, v20.2d, v22.2d - zip2 
v6.2d, v20.2d, v22.2d - zip1 v7.2d, v21.2d, v23.2d - zip2 v8.2d, v21.2d, v23.2d - - zip1 v20.4s, v10.4s, v11.4s - zip2 v21.4s, v10.4s, v11.4s - zip1 v22.4s, v12.4s, v13.4s - zip2 v23.4s, v12.4s, v13.4s - - zip1 v10.2d, v20.2d, v22.2d - zip2 v11.2d, v20.2d, v22.2d - zip1 v12.2d, v21.2d, v23.2d - zip2 v13.2d, v21.2d, v23.2d - - zip1 v20.4s, v15.4s, v16.4s - zip2 v21.4s, v15.4s, v16.4s - zip1 v22.4s, v17.4s, v18.4s - zip2 v23.4s, v17.4s, v18.4s - - zip1 v15.2d, v20.2d, v22.2d - zip2 v16.2d, v20.2d, v22.2d - zip1 v17.2d, v21.2d, v23.2d - zip2 v18.2d, v21.2d, v23.2d - - add v0.4s, v0.4s, v24.4s - add v5.4s, v5.4s, v28.4s - add v10.4s, v10.4s, v29.4s - add v15.4s, v15.4s, v30.4s - - add v1.4s, v1.4s, v24.4s - add v6.4s, v6.4s, v28.4s - add v11.4s, v11.4s, v29.4s - add v16.4s, v16.4s, v30.4s - - add v2.4s, v2.4s, v24.4s - add v7.4s, v7.4s, v28.4s - add v12.4s, v12.4s, v29.4s - add v17.4s, v17.4s, v30.4s - - add v3.4s, v3.4s, v24.4s - add v8.4s, v8.4s, v28.4s - add v13.4s, v13.4s, v29.4s - add v18.4s, v18.4s, v30.4s - - add v4.4s, v4.4s, v24.4s - add v9.4s, v9.4s, v28.4s - add v14.4s, v14.4s, v29.4s - add v19.4s, v19.4s, v30.4s - - // We can always safely store 192 bytes - ld1 {v20.16b - v23.16b}, [x1], #64 - eor v20.16b, v20.16b, v0.16b - eor v21.16b, v21.16b, v5.16b - eor v22.16b, v22.16b, v10.16b - eor v23.16b, v23.16b, v15.16b - st1 {v20.16b - v23.16b}, [x0], #64 - - ld1 {v20.16b - v23.16b}, [x1], #64 - eor v20.16b, v20.16b, v1.16b - eor v21.16b, v21.16b, v6.16b - eor v22.16b, v22.16b, v11.16b - eor v23.16b, v23.16b, v16.16b - st1 {v20.16b - v23.16b}, [x0], #64 - - ld1 {v20.16b - v23.16b}, [x1], #64 - eor v20.16b, v20.16b, v2.16b - eor v21.16b, v21.16b, v7.16b - eor v22.16b, v22.16b, v12.16b - eor v23.16b, v23.16b, v17.16b - st1 {v20.16b - v23.16b}, [x0], #64 - - sub x2, x2, #192 - - mov v0.16b, v3.16b - mov v5.16b, v8.16b - mov v10.16b, v13.16b - mov v15.16b, v18.16b - - cmp x2, #64 - b.lt Lopen_tail_64_store - - ld1 {v20.16b - v23.16b}, [x1], #64 - eor v20.16b, 
v20.16b, v3.16b - eor v21.16b, v21.16b, v8.16b - eor v22.16b, v22.16b, v13.16b - eor v23.16b, v23.16b, v18.16b - st1 {v20.16b - v23.16b}, [x0], #64 - - sub x2, x2, #64 - - mov v0.16b, v4.16b - mov v5.16b, v9.16b - mov v10.16b, v14.16b - mov v15.16b, v19.16b - - cmp x2, #64 - b.lt Lopen_tail_64_store - - ld1 {v20.16b - v23.16b}, [x1], #64 - eor v20.16b, v20.16b, v4.16b - eor v21.16b, v21.16b, v9.16b - eor v22.16b, v22.16b, v14.16b - eor v23.16b, v23.16b, v19.16b - st1 {v20.16b - v23.16b}, [x0], #64 - - sub x2, x2, #64 - b Lopen_main_loop - -Lopen_tail: - - cbz x2, Lopen_finalize - - lsr x4, x2, #4 // How many whole blocks we have to hash - - cmp x2, #64 - b.le Lopen_tail_64 - cmp x2, #128 - b.le Lopen_tail_128 - -Lopen_tail_192: - // We need three more blocks - mov v0.16b, v24.16b - mov v1.16b, v24.16b - mov v2.16b, v24.16b - mov v5.16b, v28.16b - mov v6.16b, v28.16b - mov v7.16b, v28.16b - mov v10.16b, v29.16b - mov v11.16b, v29.16b - mov v12.16b, v29.16b - mov v15.16b, v30.16b - mov v16.16b, v30.16b - mov v17.16b, v30.16b - eor v23.16b, v23.16b, v23.16b - eor v21.16b, v21.16b, v21.16b - ins v23.s[0], v25.s[0] - ins v21.d[0], x15 - - add v22.4s, v23.4s, v21.4s - add v21.4s, v22.4s, v21.4s - - add v15.4s, v15.4s, v21.4s - add v16.4s, v16.4s, v23.4s - add v17.4s, v17.4s, v22.4s - - mov x7, #10 - subs x6, x7, x4 // itr1 can be negative if we have more than 160 bytes to hash - csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are hashing - sub x4, x4, x7 - - cbz x7, Lopen_tail_192_rounds_no_hash - -Lopen_tail_192_rounds: - ldp x11, x12, [x3], 16 - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs 
x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most -Lopen_tail_192_rounds_no_hash: - add v0.4s, v0.4s, v5.4s - add v1.4s, v1.4s, v6.4s - add v2.4s, v2.4s, v7.4s - eor v15.16b, v15.16b, v0.16b - eor v16.16b, v16.16b, v1.16b - eor v17.16b, v17.16b, v2.16b - rev32 v15.8h, v15.8h - rev32 v16.8h, v16.8h - rev32 v17.8h, v17.8h - - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v16.4s - add v12.4s, v12.4s, v17.4s - eor v5.16b, v5.16b, v10.16b - eor v6.16b, v6.16b, v11.16b - eor v7.16b, v7.16b, v12.16b - ushr v20.4s, v5.4s, #20 - sli v20.4s, v5.4s, #12 - ushr v5.4s, v6.4s, #20 - sli v5.4s, v6.4s, #12 - ushr v6.4s, v7.4s, #20 - sli v6.4s, v7.4s, #12 - - add v0.4s, v0.4s, v20.4s - add v1.4s, v1.4s, v5.4s - add v2.4s, v2.4s, v6.4s - eor v15.16b, v15.16b, v0.16b - eor v16.16b, v16.16b, v1.16b - eor v17.16b, v17.16b, v2.16b - tbl v15.16b, {v15.16b}, v26.16b - tbl v16.16b, {v16.16b}, v26.16b - tbl v17.16b, {v17.16b}, v26.16b - - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v16.4s - add v12.4s, v12.4s, v17.4s - eor v20.16b, v20.16b, v10.16b - eor v5.16b, v5.16b, v11.16b - eor v6.16b, v6.16b, v12.16b - ushr v7.4s, v6.4s, #25 - sli v7.4s, v6.4s, #7 - ushr v6.4s, v5.4s, #25 - sli v6.4s, v5.4s, #7 - ushr v5.4s, v20.4s, #25 - sli v5.4s, v20.4s, #7 - - ext v5.16b, v5.16b, v5.16b, #4 - ext v6.16b, v6.16b, v6.16b, #4 - ext v7.16b, v7.16b, v7.16b, #4 - - ext v10.16b, v10.16b, v10.16b, #8 - ext v11.16b, v11.16b, v11.16b, #8 - ext v12.16b, v12.16b, v12.16b, #8 - - ext v15.16b, v15.16b, v15.16b, #12 - ext v16.16b, v16.16b, v16.16b, #12 - ext v17.16b, v17.16b, v17.16b, #12 - add v0.4s, v0.4s, v5.4s - add v1.4s, v1.4s, 
v6.4s - add v2.4s, v2.4s, v7.4s - eor v15.16b, v15.16b, v0.16b - eor v16.16b, v16.16b, v1.16b - eor v17.16b, v17.16b, v2.16b - rev32 v15.8h, v15.8h - rev32 v16.8h, v16.8h - rev32 v17.8h, v17.8h - - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v16.4s - add v12.4s, v12.4s, v17.4s - eor v5.16b, v5.16b, v10.16b - eor v6.16b, v6.16b, v11.16b - eor v7.16b, v7.16b, v12.16b - ushr v20.4s, v5.4s, #20 - sli v20.4s, v5.4s, #12 - ushr v5.4s, v6.4s, #20 - sli v5.4s, v6.4s, #12 - ushr v6.4s, v7.4s, #20 - sli v6.4s, v7.4s, #12 - - add v0.4s, v0.4s, v20.4s - add v1.4s, v1.4s, v5.4s - add v2.4s, v2.4s, v6.4s - eor v15.16b, v15.16b, v0.16b - eor v16.16b, v16.16b, v1.16b - eor v17.16b, v17.16b, v2.16b - tbl v15.16b, {v15.16b}, v26.16b - tbl v16.16b, {v16.16b}, v26.16b - tbl v17.16b, {v17.16b}, v26.16b - - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v16.4s - add v12.4s, v12.4s, v17.4s - eor v20.16b, v20.16b, v10.16b - eor v5.16b, v5.16b, v11.16b - eor v6.16b, v6.16b, v12.16b - ushr v7.4s, v6.4s, #25 - sli v7.4s, v6.4s, #7 - ushr v6.4s, v5.4s, #25 - sli v6.4s, v5.4s, #7 - ushr v5.4s, v20.4s, #25 - sli v5.4s, v20.4s, #7 - - ext v5.16b, v5.16b, v5.16b, #12 - ext v6.16b, v6.16b, v6.16b, #12 - ext v7.16b, v7.16b, v7.16b, #12 - - ext v10.16b, v10.16b, v10.16b, #8 - ext v11.16b, v11.16b, v11.16b, #8 - ext v12.16b, v12.16b, v12.16b, #8 - - ext v15.16b, v15.16b, v15.16b, #4 - ext v16.16b, v16.16b, v16.16b, #4 - ext v17.16b, v17.16b, v17.16b, #4 - subs x7, x7, #1 - b.gt Lopen_tail_192_rounds - subs x6, x6, #1 - b.ge Lopen_tail_192_rounds_no_hash - - // We hashed 160 bytes at most, may still have 32 bytes left -Lopen_tail_192_hash: - cbz x4, Lopen_tail_192_hash_done - ldp x11, x12, [x3], 16 - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] 
- umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - sub x4, x4, #1 - b Lopen_tail_192_hash - -Lopen_tail_192_hash_done: - - add v0.4s, v0.4s, v24.4s - add v1.4s, v1.4s, v24.4s - add v2.4s, v2.4s, v24.4s - add v5.4s, v5.4s, v28.4s - add v6.4s, v6.4s, v28.4s - add v7.4s, v7.4s, v28.4s - add v10.4s, v10.4s, v29.4s - add v11.4s, v11.4s, v29.4s - add v12.4s, v12.4s, v29.4s - add v15.4s, v15.4s, v30.4s - add v16.4s, v16.4s, v30.4s - add v17.4s, v17.4s, v30.4s - - add v15.4s, v15.4s, v21.4s - add v16.4s, v16.4s, v23.4s - add v17.4s, v17.4s, v22.4s - - ld1 {v20.16b - v23.16b}, [x1], #64 - - eor v20.16b, v20.16b, v1.16b - eor v21.16b, v21.16b, v6.16b - eor v22.16b, v22.16b, v11.16b - eor v23.16b, v23.16b, v16.16b - - st1 {v20.16b - v23.16b}, [x0], #64 - - ld1 {v20.16b - v23.16b}, [x1], #64 - - eor v20.16b, v20.16b, v2.16b - eor v21.16b, v21.16b, v7.16b - eor v22.16b, v22.16b, v12.16b - eor v23.16b, v23.16b, v17.16b - - st1 {v20.16b - v23.16b}, [x0], #64 - - sub x2, x2, #128 - b Lopen_tail_64_store - -Lopen_tail_128: - // We need two more blocks - mov v0.16b, v24.16b - mov v1.16b, v24.16b - mov v5.16b, v28.16b - mov v6.16b, v28.16b - mov v10.16b, v29.16b - mov v11.16b, v29.16b - mov v15.16b, v30.16b - mov v16.16b, v30.16b - eor v23.16b, v23.16b, v23.16b - eor v22.16b, v22.16b, v22.16b - ins v23.s[0], v25.s[0] - ins v22.d[0], x15 - add v22.4s, v22.4s, v23.4s - - add v15.4s, v15.4s, v22.4s - add v16.4s, v16.4s, v23.4s - - mov x6, #10 - sub x6, x6, x4 - -Lopen_tail_128_rounds: - add v0.4s, v0.4s, v5.4s - eor v15.16b, 
v15.16b, v0.16b - rev32 v15.8h, v15.8h - - add v10.4s, v10.4s, v15.4s - eor v5.16b, v5.16b, v10.16b - ushr v20.4s, v5.4s, #20 - sli v20.4s, v5.4s, #12 - add v0.4s, v0.4s, v20.4s - eor v15.16b, v15.16b, v0.16b - tbl v15.16b, {v15.16b}, v26.16b - - add v10.4s, v10.4s, v15.4s - eor v20.16b, v20.16b, v10.16b - ushr v5.4s, v20.4s, #25 - sli v5.4s, v20.4s, #7 - ext v5.16b, v5.16b, v5.16b, #4 - ext v10.16b, v10.16b, v10.16b, #8 - ext v15.16b, v15.16b, v15.16b, #12 - add v1.4s, v1.4s, v6.4s - eor v16.16b, v16.16b, v1.16b - rev32 v16.8h, v16.8h - - add v11.4s, v11.4s, v16.4s - eor v6.16b, v6.16b, v11.16b - ushr v20.4s, v6.4s, #20 - sli v20.4s, v6.4s, #12 - add v1.4s, v1.4s, v20.4s - eor v16.16b, v16.16b, v1.16b - tbl v16.16b, {v16.16b}, v26.16b - - add v11.4s, v11.4s, v16.4s - eor v20.16b, v20.16b, v11.16b - ushr v6.4s, v20.4s, #25 - sli v6.4s, v20.4s, #7 - ext v6.16b, v6.16b, v6.16b, #4 - ext v11.16b, v11.16b, v11.16b, #8 - ext v16.16b, v16.16b, v16.16b, #12 - add v0.4s, v0.4s, v5.4s - eor v15.16b, v15.16b, v0.16b - rev32 v15.8h, v15.8h - - add v10.4s, v10.4s, v15.4s - eor v5.16b, v5.16b, v10.16b - ushr v20.4s, v5.4s, #20 - sli v20.4s, v5.4s, #12 - add v0.4s, v0.4s, v20.4s - eor v15.16b, v15.16b, v0.16b - tbl v15.16b, {v15.16b}, v26.16b - - add v10.4s, v10.4s, v15.4s - eor v20.16b, v20.16b, v10.16b - ushr v5.4s, v20.4s, #25 - sli v5.4s, v20.4s, #7 - ext v5.16b, v5.16b, v5.16b, #12 - ext v10.16b, v10.16b, v10.16b, #8 - ext v15.16b, v15.16b, v15.16b, #4 - add v1.4s, v1.4s, v6.4s - eor v16.16b, v16.16b, v1.16b - rev32 v16.8h, v16.8h - - add v11.4s, v11.4s, v16.4s - eor v6.16b, v6.16b, v11.16b - ushr v20.4s, v6.4s, #20 - sli v20.4s, v6.4s, #12 - add v1.4s, v1.4s, v20.4s - eor v16.16b, v16.16b, v1.16b - tbl v16.16b, {v16.16b}, v26.16b - - add v11.4s, v11.4s, v16.4s - eor v20.16b, v20.16b, v11.16b - ushr v6.4s, v20.4s, #25 - sli v6.4s, v20.4s, #7 - ext v6.16b, v6.16b, v6.16b, #12 - ext v11.16b, v11.16b, v11.16b, #8 - ext v16.16b, v16.16b, v16.16b, #4 - subs x6, x6, #1 - b.gt 
Lopen_tail_128_rounds - cbz x4, Lopen_tail_128_rounds_done - subs x4, x4, #1 - ldp x11, x12, [x3], 16 - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - b Lopen_tail_128_rounds - -Lopen_tail_128_rounds_done: - add v0.4s, v0.4s, v24.4s - add v1.4s, v1.4s, v24.4s - add v5.4s, v5.4s, v28.4s - add v6.4s, v6.4s, v28.4s - add v10.4s, v10.4s, v29.4s - add v11.4s, v11.4s, v29.4s - add v15.4s, v15.4s, v30.4s - add v16.4s, v16.4s, v30.4s - add v15.4s, v15.4s, v22.4s - add v16.4s, v16.4s, v23.4s - - ld1 {v20.16b - v23.16b}, [x1], #64 - - eor v20.16b, v20.16b, v1.16b - eor v21.16b, v21.16b, v6.16b - eor v22.16b, v22.16b, v11.16b - eor v23.16b, v23.16b, v16.16b - - st1 {v20.16b - v23.16b}, [x0], #64 - sub x2, x2, #64 - - b Lopen_tail_64_store - -Lopen_tail_64: - // We just need a single block - mov v0.16b, v24.16b - mov v5.16b, v28.16b - mov v10.16b, v29.16b - mov v15.16b, v30.16b - eor v23.16b, v23.16b, v23.16b - ins v23.s[0], v25.s[0] - add v15.4s, v15.4s, v23.4s - - mov x6, #10 - sub x6, x6, x4 - -Lopen_tail_64_rounds: - add v0.4s, v0.4s, v5.4s - eor v15.16b, v15.16b, v0.16b - rev32 v15.8h, v15.8h - - add v10.4s, v10.4s, v15.4s - eor v5.16b, v5.16b, v10.16b - ushr v20.4s, v5.4s, #20 - sli v20.4s, v5.4s, #12 - add v0.4s, 
v0.4s, v20.4s - eor v15.16b, v15.16b, v0.16b - tbl v15.16b, {v15.16b}, v26.16b - - add v10.4s, v10.4s, v15.4s - eor v20.16b, v20.16b, v10.16b - ushr v5.4s, v20.4s, #25 - sli v5.4s, v20.4s, #7 - ext v5.16b, v5.16b, v5.16b, #4 - ext v10.16b, v10.16b, v10.16b, #8 - ext v15.16b, v15.16b, v15.16b, #12 - add v0.4s, v0.4s, v5.4s - eor v15.16b, v15.16b, v0.16b - rev32 v15.8h, v15.8h - - add v10.4s, v10.4s, v15.4s - eor v5.16b, v5.16b, v10.16b - ushr v20.4s, v5.4s, #20 - sli v20.4s, v5.4s, #12 - add v0.4s, v0.4s, v20.4s - eor v15.16b, v15.16b, v0.16b - tbl v15.16b, {v15.16b}, v26.16b - - add v10.4s, v10.4s, v15.4s - eor v20.16b, v20.16b, v10.16b - ushr v5.4s, v20.4s, #25 - sli v5.4s, v20.4s, #7 - ext v5.16b, v5.16b, v5.16b, #12 - ext v10.16b, v10.16b, v10.16b, #8 - ext v15.16b, v15.16b, v15.16b, #4 - subs x6, x6, #1 - b.gt Lopen_tail_64_rounds - cbz x4, Lopen_tail_64_rounds_done - subs x4, x4, #1 - ldp x11, x12, [x3], 16 - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - b Lopen_tail_64_rounds - -Lopen_tail_64_rounds_done: - add v0.4s, v0.4s, v24.4s - add v5.4s, v5.4s, v28.4s - add v10.4s, v10.4s, v29.4s - add v15.4s, v15.4s, v30.4s - add v15.4s, v15.4s, v23.4s - -Lopen_tail_64_store: - cmp x2, #16 - b.lt Lopen_tail_16 - 
- ld1 {v20.16b}, [x1], #16 - eor v20.16b, v20.16b, v0.16b - st1 {v20.16b}, [x0], #16 - mov v0.16b, v5.16b - mov v5.16b, v10.16b - mov v10.16b, v15.16b - sub x2, x2, #16 - b Lopen_tail_64_store - -Lopen_tail_16: - // Here we handle the last [0,16) bytes that require a padded block - cbz x2, Lopen_finalize - - eor v20.16b, v20.16b, v20.16b // Use T0 to load the ciphertext - eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask - not v22.16b, v20.16b - - add x7, x1, x2 - mov x6, x2 - -Lopen_tail_16_compose: - ext v20.16b, v20.16b, v20.16b, #15 - ldrb w11, [x7, #-1]! - mov v20.b[0], w11 - ext v21.16b, v22.16b, v21.16b, #15 - subs x2, x2, #1 - b.gt Lopen_tail_16_compose - - and v20.16b, v20.16b, v21.16b - // Hash in the final padded block - mov x11, v20.d[0] - mov x12, v20.d[1] - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - eor v20.16b, v20.16b, v0.16b - -Lopen_tail_16_store: - umov w11, v20.b[0] - strb w11, [x0], #1 - ext v20.16b, v20.16b, v20.16b, #1 - subs x6, x6, #1 - b.gt Lopen_tail_16_store - -Lopen_finalize: - mov x11, v31.d[0] - mov x12, v31.d[1] - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, 
x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - // Final reduction step - sub x12, xzr, x15 - orr x13, xzr, #3 - subs x11, x8, #-5 - sbcs x12, x9, x12 - sbcs x13, x10, x13 - csel x8, x11, x8, cs - csel x9, x12, x9, cs - csel x10, x13, x10, cs - mov x11, v27.d[0] - mov x12, v27.d[1] - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - - stp x8, x9, [x5] - - ldp d8, d9, [sp, #16] - ldp d10, d11, [sp, #32] - ldp d12, d13, [sp, #48] - ldp d14, d15, [sp, #64] -.cfi_restore b15 -.cfi_restore b14 -.cfi_restore b13 -.cfi_restore b12 -.cfi_restore b11 -.cfi_restore b10 -.cfi_restore b9 -.cfi_restore b8 - ldp x29, x30, [sp], 80 -.cfi_restore w29 -.cfi_restore w30 -.cfi_def_cfa_offset 0 - AARCH64_VALIDATE_LINK_REGISTER - ret - -Lopen_128: - // On some architectures preparing 5 blocks for small buffers is wasteful - eor v25.16b, v25.16b, v25.16b - mov x11, #1 - mov v25.s[0], w11 - mov v0.16b, v24.16b - mov v1.16b, v24.16b - mov v2.16b, v24.16b - mov v5.16b, v28.16b - mov v6.16b, v28.16b - mov v7.16b, v28.16b - mov v10.16b, v29.16b - mov v11.16b, v29.16b - mov v12.16b, v29.16b - mov v17.16b, v30.16b - add v15.4s, v17.4s, v25.4s - add v16.4s, v15.4s, v25.4s - - mov x6, #10 - -Lopen_128_rounds: - add v0.4s, v0.4s, v5.4s - add v1.4s, v1.4s, v6.4s - add v2.4s, v2.4s, v7.4s - eor v15.16b, v15.16b, v0.16b - eor v16.16b, v16.16b, v1.16b - eor v17.16b, v17.16b, v2.16b - 
rev32 v15.8h, v15.8h - rev32 v16.8h, v16.8h - rev32 v17.8h, v17.8h - - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v16.4s - add v12.4s, v12.4s, v17.4s - eor v5.16b, v5.16b, v10.16b - eor v6.16b, v6.16b, v11.16b - eor v7.16b, v7.16b, v12.16b - ushr v20.4s, v5.4s, #20 - sli v20.4s, v5.4s, #12 - ushr v5.4s, v6.4s, #20 - sli v5.4s, v6.4s, #12 - ushr v6.4s, v7.4s, #20 - sli v6.4s, v7.4s, #12 - - add v0.4s, v0.4s, v20.4s - add v1.4s, v1.4s, v5.4s - add v2.4s, v2.4s, v6.4s - eor v15.16b, v15.16b, v0.16b - eor v16.16b, v16.16b, v1.16b - eor v17.16b, v17.16b, v2.16b - tbl v15.16b, {v15.16b}, v26.16b - tbl v16.16b, {v16.16b}, v26.16b - tbl v17.16b, {v17.16b}, v26.16b - - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v16.4s - add v12.4s, v12.4s, v17.4s - eor v20.16b, v20.16b, v10.16b - eor v5.16b, v5.16b, v11.16b - eor v6.16b, v6.16b, v12.16b - ushr v7.4s, v6.4s, #25 - sli v7.4s, v6.4s, #7 - ushr v6.4s, v5.4s, #25 - sli v6.4s, v5.4s, #7 - ushr v5.4s, v20.4s, #25 - sli v5.4s, v20.4s, #7 - - ext v5.16b, v5.16b, v5.16b, #4 - ext v6.16b, v6.16b, v6.16b, #4 - ext v7.16b, v7.16b, v7.16b, #4 - - ext v10.16b, v10.16b, v10.16b, #8 - ext v11.16b, v11.16b, v11.16b, #8 - ext v12.16b, v12.16b, v12.16b, #8 - - ext v15.16b, v15.16b, v15.16b, #12 - ext v16.16b, v16.16b, v16.16b, #12 - ext v17.16b, v17.16b, v17.16b, #12 - add v0.4s, v0.4s, v5.4s - add v1.4s, v1.4s, v6.4s - add v2.4s, v2.4s, v7.4s - eor v15.16b, v15.16b, v0.16b - eor v16.16b, v16.16b, v1.16b - eor v17.16b, v17.16b, v2.16b - rev32 v15.8h, v15.8h - rev32 v16.8h, v16.8h - rev32 v17.8h, v17.8h - - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v16.4s - add v12.4s, v12.4s, v17.4s - eor v5.16b, v5.16b, v10.16b - eor v6.16b, v6.16b, v11.16b - eor v7.16b, v7.16b, v12.16b - ushr v20.4s, v5.4s, #20 - sli v20.4s, v5.4s, #12 - ushr v5.4s, v6.4s, #20 - sli v5.4s, v6.4s, #12 - ushr v6.4s, v7.4s, #20 - sli v6.4s, v7.4s, #12 - - add v0.4s, v0.4s, v20.4s - add v1.4s, v1.4s, v5.4s - add v2.4s, v2.4s, v6.4s - eor v15.16b, v15.16b, 
v0.16b - eor v16.16b, v16.16b, v1.16b - eor v17.16b, v17.16b, v2.16b - tbl v15.16b, {v15.16b}, v26.16b - tbl v16.16b, {v16.16b}, v26.16b - tbl v17.16b, {v17.16b}, v26.16b - - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v16.4s - add v12.4s, v12.4s, v17.4s - eor v20.16b, v20.16b, v10.16b - eor v5.16b, v5.16b, v11.16b - eor v6.16b, v6.16b, v12.16b - ushr v7.4s, v6.4s, #25 - sli v7.4s, v6.4s, #7 - ushr v6.4s, v5.4s, #25 - sli v6.4s, v5.4s, #7 - ushr v5.4s, v20.4s, #25 - sli v5.4s, v20.4s, #7 - - ext v5.16b, v5.16b, v5.16b, #12 - ext v6.16b, v6.16b, v6.16b, #12 - ext v7.16b, v7.16b, v7.16b, #12 - - ext v10.16b, v10.16b, v10.16b, #8 - ext v11.16b, v11.16b, v11.16b, #8 - ext v12.16b, v12.16b, v12.16b, #8 - - ext v15.16b, v15.16b, v15.16b, #4 - ext v16.16b, v16.16b, v16.16b, #4 - ext v17.16b, v17.16b, v17.16b, #4 - subs x6, x6, #1 - b.hi Lopen_128_rounds - - add v0.4s, v0.4s, v24.4s - add v1.4s, v1.4s, v24.4s - add v2.4s, v2.4s, v24.4s - - add v5.4s, v5.4s, v28.4s - add v6.4s, v6.4s, v28.4s - add v7.4s, v7.4s, v28.4s - - add v10.4s, v10.4s, v29.4s - add v11.4s, v11.4s, v29.4s - - add v30.4s, v30.4s, v25.4s - add v15.4s, v15.4s, v30.4s - add v30.4s, v30.4s, v25.4s - add v16.4s, v16.4s, v30.4s - - and v2.16b, v2.16b, v27.16b - mov x16, v2.d[0] // Move the R key to GPRs - mov x17, v2.d[1] - mov v27.16b, v7.16b // Store the S key - - bl Lpoly_hash_ad_internal - -Lopen_128_store: - cmp x2, #64 - b.lt Lopen_128_store_64 - - ld1 {v20.16b - v23.16b}, [x1], #64 - - mov x11, v20.d[0] - mov x12, v20.d[1] - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - 
and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - mov x11, v21.d[0] - mov x12, v21.d[1] - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - mov x11, v22.d[0] - mov x12, v22.d[1] - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // 
At this point acc2 has the value of 4 at most - mov x11, v23.d[0] - mov x12, v23.d[1] - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - - eor v20.16b, v20.16b, v0.16b - eor v21.16b, v21.16b, v5.16b - eor v22.16b, v22.16b, v10.16b - eor v23.16b, v23.16b, v15.16b - - st1 {v20.16b - v23.16b}, [x0], #64 - - sub x2, x2, #64 - - mov v0.16b, v1.16b - mov v5.16b, v6.16b - mov v10.16b, v11.16b - mov v15.16b, v16.16b - -Lopen_128_store_64: - - lsr x4, x2, #4 - mov x3, x1 - -Lopen_128_hash_64: - cbz x4, Lopen_tail_64_store - ldp x11, x12, [x3], 16 - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 
bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - sub x4, x4, #1 - b Lopen_128_hash_64 -.cfi_endproc - -#endif // !OPENSSL_NO_ASM diff --git a/third_party/boringssl/apple-aarch64/crypto/fipsmodule/aesv8-armx64.S b/third_party/boringssl/apple-aarch64/crypto/fipsmodule/aesv8-armx64.S deleted file mode 100644 index 50d7deaa..00000000 --- a/third_party/boringssl/apple-aarch64/crypto/fipsmodule/aesv8-armx64.S +++ /dev/null @@ -1,799 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -#include - -#if __ARM_MAX_ARCH__>=7 -.text - -.section __TEXT,__const -.align 5 -Lrcon: -.long 0x01,0x01,0x01,0x01 -.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat -.long 0x1b,0x1b,0x1b,0x1b - -.text - -.globl _aes_hw_set_encrypt_key -.private_extern _aes_hw_set_encrypt_key - -.align 5 -_aes_hw_set_encrypt_key: -Lenc_key: - // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. - AARCH64_VALID_CALL_TARGET - stp x29,x30,[sp,#-16]! 
- add x29,sp,#0 - mov x3,#-1 - cmp x0,#0 - b.eq Lenc_key_abort - cmp x2,#0 - b.eq Lenc_key_abort - mov x3,#-2 - cmp w1,#128 - b.lt Lenc_key_abort - cmp w1,#256 - b.gt Lenc_key_abort - tst w1,#0x3f - b.ne Lenc_key_abort - - adrp x3,Lrcon@PAGE - add x3,x3,Lrcon@PAGEOFF - cmp w1,#192 - - eor v0.16b,v0.16b,v0.16b - ld1 {v3.16b},[x0],#16 - mov w1,#8 // reuse w1 - ld1 {v1.4s,v2.4s},[x3],#32 - - b.lt Loop128 - b.eq L192 - b L256 - -.align 4 -Loop128: - tbl v6.16b,{v3.16b},v2.16b - ext v5.16b,v0.16b,v3.16b,#12 - st1 {v3.4s},[x2],#16 - aese v6.16b,v0.16b - subs w1,w1,#1 - - eor v3.16b,v3.16b,v5.16b - ext v5.16b,v0.16b,v5.16b,#12 - eor v3.16b,v3.16b,v5.16b - ext v5.16b,v0.16b,v5.16b,#12 - eor v6.16b,v6.16b,v1.16b - eor v3.16b,v3.16b,v5.16b - shl v1.16b,v1.16b,#1 - eor v3.16b,v3.16b,v6.16b - b.ne Loop128 - - ld1 {v1.4s},[x3] - - tbl v6.16b,{v3.16b},v2.16b - ext v5.16b,v0.16b,v3.16b,#12 - st1 {v3.4s},[x2],#16 - aese v6.16b,v0.16b - - eor v3.16b,v3.16b,v5.16b - ext v5.16b,v0.16b,v5.16b,#12 - eor v3.16b,v3.16b,v5.16b - ext v5.16b,v0.16b,v5.16b,#12 - eor v6.16b,v6.16b,v1.16b - eor v3.16b,v3.16b,v5.16b - shl v1.16b,v1.16b,#1 - eor v3.16b,v3.16b,v6.16b - - tbl v6.16b,{v3.16b},v2.16b - ext v5.16b,v0.16b,v3.16b,#12 - st1 {v3.4s},[x2],#16 - aese v6.16b,v0.16b - - eor v3.16b,v3.16b,v5.16b - ext v5.16b,v0.16b,v5.16b,#12 - eor v3.16b,v3.16b,v5.16b - ext v5.16b,v0.16b,v5.16b,#12 - eor v6.16b,v6.16b,v1.16b - eor v3.16b,v3.16b,v5.16b - eor v3.16b,v3.16b,v6.16b - st1 {v3.4s},[x2] - add x2,x2,#0x50 - - mov w12,#10 - b Ldone - -.align 4 -L192: - ld1 {v4.8b},[x0],#8 - movi v6.16b,#8 // borrow v6.16b - st1 {v3.4s},[x2],#16 - sub v2.16b,v2.16b,v6.16b // adjust the mask - -Loop192: - tbl v6.16b,{v4.16b},v2.16b - ext v5.16b,v0.16b,v3.16b,#12 - st1 {v4.8b},[x2],#8 - aese v6.16b,v0.16b - subs w1,w1,#1 - - eor v3.16b,v3.16b,v5.16b - ext v5.16b,v0.16b,v5.16b,#12 - eor v3.16b,v3.16b,v5.16b - ext v5.16b,v0.16b,v5.16b,#12 - eor v3.16b,v3.16b,v5.16b - - dup v5.4s,v3.s[3] - eor v5.16b,v5.16b,v4.16b - eor 
v6.16b,v6.16b,v1.16b - ext v4.16b,v0.16b,v4.16b,#12 - shl v1.16b,v1.16b,#1 - eor v4.16b,v4.16b,v5.16b - eor v3.16b,v3.16b,v6.16b - eor v4.16b,v4.16b,v6.16b - st1 {v3.4s},[x2],#16 - b.ne Loop192 - - mov w12,#12 - add x2,x2,#0x20 - b Ldone - -.align 4 -L256: - ld1 {v4.16b},[x0] - mov w1,#7 - mov w12,#14 - st1 {v3.4s},[x2],#16 - -Loop256: - tbl v6.16b,{v4.16b},v2.16b - ext v5.16b,v0.16b,v3.16b,#12 - st1 {v4.4s},[x2],#16 - aese v6.16b,v0.16b - subs w1,w1,#1 - - eor v3.16b,v3.16b,v5.16b - ext v5.16b,v0.16b,v5.16b,#12 - eor v3.16b,v3.16b,v5.16b - ext v5.16b,v0.16b,v5.16b,#12 - eor v6.16b,v6.16b,v1.16b - eor v3.16b,v3.16b,v5.16b - shl v1.16b,v1.16b,#1 - eor v3.16b,v3.16b,v6.16b - st1 {v3.4s},[x2],#16 - b.eq Ldone - - dup v6.4s,v3.s[3] // just splat - ext v5.16b,v0.16b,v4.16b,#12 - aese v6.16b,v0.16b - - eor v4.16b,v4.16b,v5.16b - ext v5.16b,v0.16b,v5.16b,#12 - eor v4.16b,v4.16b,v5.16b - ext v5.16b,v0.16b,v5.16b,#12 - eor v4.16b,v4.16b,v5.16b - - eor v4.16b,v4.16b,v6.16b - b Loop256 - -Ldone: - str w12,[x2] - mov x3,#0 - -Lenc_key_abort: - mov x0,x3 // return value - ldr x29,[sp],#16 - ret - - -.globl _aes_hw_set_decrypt_key -.private_extern _aes_hw_set_decrypt_key - -.align 5 -_aes_hw_set_decrypt_key: - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! 
- add x29,sp,#0 - bl Lenc_key - - cmp x0,#0 - b.ne Ldec_key_abort - - sub x2,x2,#240 // restore original x2 - mov x4,#-16 - add x0,x2,x12,lsl#4 // end of key schedule - - ld1 {v0.4s},[x2] - ld1 {v1.4s},[x0] - st1 {v0.4s},[x0],x4 - st1 {v1.4s},[x2],#16 - -Loop_imc: - ld1 {v0.4s},[x2] - ld1 {v1.4s},[x0] - aesimc v0.16b,v0.16b - aesimc v1.16b,v1.16b - st1 {v0.4s},[x0],x4 - st1 {v1.4s},[x2],#16 - cmp x0,x2 - b.hi Loop_imc - - ld1 {v0.4s},[x2] - aesimc v0.16b,v0.16b - st1 {v0.4s},[x0] - - eor x0,x0,x0 // return value -Ldec_key_abort: - ldp x29,x30,[sp],#16 - AARCH64_VALIDATE_LINK_REGISTER - ret - -.globl _aes_hw_encrypt -.private_extern _aes_hw_encrypt - -.align 5 -_aes_hw_encrypt: - AARCH64_VALID_CALL_TARGET - ldr w3,[x2,#240] - ld1 {v0.4s},[x2],#16 - ld1 {v2.16b},[x0] - sub w3,w3,#2 - ld1 {v1.4s},[x2],#16 - -Loop_enc: - aese v2.16b,v0.16b - aesmc v2.16b,v2.16b - ld1 {v0.4s},[x2],#16 - subs w3,w3,#2 - aese v2.16b,v1.16b - aesmc v2.16b,v2.16b - ld1 {v1.4s},[x2],#16 - b.gt Loop_enc - - aese v2.16b,v0.16b - aesmc v2.16b,v2.16b - ld1 {v0.4s},[x2] - aese v2.16b,v1.16b - eor v2.16b,v2.16b,v0.16b - - st1 {v2.16b},[x1] - ret - -.globl _aes_hw_decrypt -.private_extern _aes_hw_decrypt - -.align 5 -_aes_hw_decrypt: - AARCH64_VALID_CALL_TARGET - ldr w3,[x2,#240] - ld1 {v0.4s},[x2],#16 - ld1 {v2.16b},[x0] - sub w3,w3,#2 - ld1 {v1.4s},[x2],#16 - -Loop_dec: - aesd v2.16b,v0.16b - aesimc v2.16b,v2.16b - ld1 {v0.4s},[x2],#16 - subs w3,w3,#2 - aesd v2.16b,v1.16b - aesimc v2.16b,v2.16b - ld1 {v1.4s},[x2],#16 - b.gt Loop_dec - - aesd v2.16b,v0.16b - aesimc v2.16b,v2.16b - ld1 {v0.4s},[x2] - aesd v2.16b,v1.16b - eor v2.16b,v2.16b,v0.16b - - st1 {v2.16b},[x1] - ret - -.globl _aes_hw_cbc_encrypt -.private_extern _aes_hw_cbc_encrypt - -.align 5 -_aes_hw_cbc_encrypt: - // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. - AARCH64_VALID_CALL_TARGET - stp x29,x30,[sp,#-16]! 
- add x29,sp,#0 - subs x2,x2,#16 - mov x8,#16 - b.lo Lcbc_abort - csel x8,xzr,x8,eq - - cmp w5,#0 // en- or decrypting? - ldr w5,[x3,#240] - and x2,x2,#-16 - ld1 {v6.16b},[x4] - ld1 {v0.16b},[x0],x8 - - ld1 {v16.4s,v17.4s},[x3] // load key schedule... - sub w5,w5,#6 - add x7,x3,x5,lsl#4 // pointer to last 7 round keys - sub w5,w5,#2 - ld1 {v18.4s,v19.4s},[x7],#32 - ld1 {v20.4s,v21.4s},[x7],#32 - ld1 {v22.4s,v23.4s},[x7],#32 - ld1 {v7.4s},[x7] - - add x7,x3,#32 - mov w6,w5 - b.eq Lcbc_dec - - cmp w5,#2 - eor v0.16b,v0.16b,v6.16b - eor v5.16b,v16.16b,v7.16b - b.eq Lcbc_enc128 - - ld1 {v2.4s,v3.4s},[x7] - add x7,x3,#16 - add x6,x3,#16*4 - add x12,x3,#16*5 - aese v0.16b,v16.16b - aesmc v0.16b,v0.16b - add x14,x3,#16*6 - add x3,x3,#16*7 - b Lenter_cbc_enc - -.align 4 -Loop_cbc_enc: - aese v0.16b,v16.16b - aesmc v0.16b,v0.16b - st1 {v6.16b},[x1],#16 -Lenter_cbc_enc: - aese v0.16b,v17.16b - aesmc v0.16b,v0.16b - aese v0.16b,v2.16b - aesmc v0.16b,v0.16b - ld1 {v16.4s},[x6] - cmp w5,#4 - aese v0.16b,v3.16b - aesmc v0.16b,v0.16b - ld1 {v17.4s},[x12] - b.eq Lcbc_enc192 - - aese v0.16b,v16.16b - aesmc v0.16b,v0.16b - ld1 {v16.4s},[x14] - aese v0.16b,v17.16b - aesmc v0.16b,v0.16b - ld1 {v17.4s},[x3] - nop - -Lcbc_enc192: - aese v0.16b,v16.16b - aesmc v0.16b,v0.16b - subs x2,x2,#16 - aese v0.16b,v17.16b - aesmc v0.16b,v0.16b - csel x8,xzr,x8,eq - aese v0.16b,v18.16b - aesmc v0.16b,v0.16b - aese v0.16b,v19.16b - aesmc v0.16b,v0.16b - ld1 {v16.16b},[x0],x8 - aese v0.16b,v20.16b - aesmc v0.16b,v0.16b - eor v16.16b,v16.16b,v5.16b - aese v0.16b,v21.16b - aesmc v0.16b,v0.16b - ld1 {v17.4s},[x7] // re-pre-load rndkey[1] - aese v0.16b,v22.16b - aesmc v0.16b,v0.16b - aese v0.16b,v23.16b - eor v6.16b,v0.16b,v7.16b - b.hs Loop_cbc_enc - - st1 {v6.16b},[x1],#16 - b Lcbc_done - -.align 5 -Lcbc_enc128: - ld1 {v2.4s,v3.4s},[x7] - aese v0.16b,v16.16b - aesmc v0.16b,v0.16b - b Lenter_cbc_enc128 -Loop_cbc_enc128: - aese v0.16b,v16.16b - aesmc v0.16b,v0.16b - st1 {v6.16b},[x1],#16 
-Lenter_cbc_enc128: - aese v0.16b,v17.16b - aesmc v0.16b,v0.16b - subs x2,x2,#16 - aese v0.16b,v2.16b - aesmc v0.16b,v0.16b - csel x8,xzr,x8,eq - aese v0.16b,v3.16b - aesmc v0.16b,v0.16b - aese v0.16b,v18.16b - aesmc v0.16b,v0.16b - aese v0.16b,v19.16b - aesmc v0.16b,v0.16b - ld1 {v16.16b},[x0],x8 - aese v0.16b,v20.16b - aesmc v0.16b,v0.16b - aese v0.16b,v21.16b - aesmc v0.16b,v0.16b - aese v0.16b,v22.16b - aesmc v0.16b,v0.16b - eor v16.16b,v16.16b,v5.16b - aese v0.16b,v23.16b - eor v6.16b,v0.16b,v7.16b - b.hs Loop_cbc_enc128 - - st1 {v6.16b},[x1],#16 - b Lcbc_done -.align 5 -Lcbc_dec: - ld1 {v18.16b},[x0],#16 - subs x2,x2,#32 // bias - add w6,w5,#2 - orr v3.16b,v0.16b,v0.16b - orr v1.16b,v0.16b,v0.16b - orr v19.16b,v18.16b,v18.16b - b.lo Lcbc_dec_tail - - orr v1.16b,v18.16b,v18.16b - ld1 {v18.16b},[x0],#16 - orr v2.16b,v0.16b,v0.16b - orr v3.16b,v1.16b,v1.16b - orr v19.16b,v18.16b,v18.16b - -Loop3x_cbc_dec: - aesd v0.16b,v16.16b - aesimc v0.16b,v0.16b - aesd v1.16b,v16.16b - aesimc v1.16b,v1.16b - aesd v18.16b,v16.16b - aesimc v18.16b,v18.16b - ld1 {v16.4s},[x7],#16 - subs w6,w6,#2 - aesd v0.16b,v17.16b - aesimc v0.16b,v0.16b - aesd v1.16b,v17.16b - aesimc v1.16b,v1.16b - aesd v18.16b,v17.16b - aesimc v18.16b,v18.16b - ld1 {v17.4s},[x7],#16 - b.gt Loop3x_cbc_dec - - aesd v0.16b,v16.16b - aesimc v0.16b,v0.16b - aesd v1.16b,v16.16b - aesimc v1.16b,v1.16b - aesd v18.16b,v16.16b - aesimc v18.16b,v18.16b - eor v4.16b,v6.16b,v7.16b - subs x2,x2,#0x30 - eor v5.16b,v2.16b,v7.16b - csel x6,x2,x6,lo // x6, w6, is zero at this point - aesd v0.16b,v17.16b - aesimc v0.16b,v0.16b - aesd v1.16b,v17.16b - aesimc v1.16b,v1.16b - aesd v18.16b,v17.16b - aesimc v18.16b,v18.16b - eor v17.16b,v3.16b,v7.16b - add x0,x0,x6 // x0 is adjusted in such way that - // at exit from the loop v1.16b-v18.16b - // are loaded with last "words" - orr v6.16b,v19.16b,v19.16b - mov x7,x3 - aesd v0.16b,v20.16b - aesimc v0.16b,v0.16b - aesd v1.16b,v20.16b - aesimc v1.16b,v1.16b - aesd v18.16b,v20.16b - 
aesimc v18.16b,v18.16b - ld1 {v2.16b},[x0],#16 - aesd v0.16b,v21.16b - aesimc v0.16b,v0.16b - aesd v1.16b,v21.16b - aesimc v1.16b,v1.16b - aesd v18.16b,v21.16b - aesimc v18.16b,v18.16b - ld1 {v3.16b},[x0],#16 - aesd v0.16b,v22.16b - aesimc v0.16b,v0.16b - aesd v1.16b,v22.16b - aesimc v1.16b,v1.16b - aesd v18.16b,v22.16b - aesimc v18.16b,v18.16b - ld1 {v19.16b},[x0],#16 - aesd v0.16b,v23.16b - aesd v1.16b,v23.16b - aesd v18.16b,v23.16b - ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] - add w6,w5,#2 - eor v4.16b,v4.16b,v0.16b - eor v5.16b,v5.16b,v1.16b - eor v18.16b,v18.16b,v17.16b - ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] - st1 {v4.16b},[x1],#16 - orr v0.16b,v2.16b,v2.16b - st1 {v5.16b},[x1],#16 - orr v1.16b,v3.16b,v3.16b - st1 {v18.16b},[x1],#16 - orr v18.16b,v19.16b,v19.16b - b.hs Loop3x_cbc_dec - - cmn x2,#0x30 - b.eq Lcbc_done - nop - -Lcbc_dec_tail: - aesd v1.16b,v16.16b - aesimc v1.16b,v1.16b - aesd v18.16b,v16.16b - aesimc v18.16b,v18.16b - ld1 {v16.4s},[x7],#16 - subs w6,w6,#2 - aesd v1.16b,v17.16b - aesimc v1.16b,v1.16b - aesd v18.16b,v17.16b - aesimc v18.16b,v18.16b - ld1 {v17.4s},[x7],#16 - b.gt Lcbc_dec_tail - - aesd v1.16b,v16.16b - aesimc v1.16b,v1.16b - aesd v18.16b,v16.16b - aesimc v18.16b,v18.16b - aesd v1.16b,v17.16b - aesimc v1.16b,v1.16b - aesd v18.16b,v17.16b - aesimc v18.16b,v18.16b - aesd v1.16b,v20.16b - aesimc v1.16b,v1.16b - aesd v18.16b,v20.16b - aesimc v18.16b,v18.16b - cmn x2,#0x20 - aesd v1.16b,v21.16b - aesimc v1.16b,v1.16b - aesd v18.16b,v21.16b - aesimc v18.16b,v18.16b - eor v5.16b,v6.16b,v7.16b - aesd v1.16b,v22.16b - aesimc v1.16b,v1.16b - aesd v18.16b,v22.16b - aesimc v18.16b,v18.16b - eor v17.16b,v3.16b,v7.16b - aesd v1.16b,v23.16b - aesd v18.16b,v23.16b - b.eq Lcbc_dec_one - eor v5.16b,v5.16b,v1.16b - eor v17.16b,v17.16b,v18.16b - orr v6.16b,v19.16b,v19.16b - st1 {v5.16b},[x1],#16 - st1 {v17.16b},[x1],#16 - b Lcbc_done - -Lcbc_dec_one: - eor v5.16b,v5.16b,v18.16b - orr v6.16b,v19.16b,v19.16b - st1 {v5.16b},[x1],#16 - 
-Lcbc_done: - st1 {v6.16b},[x4] -Lcbc_abort: - ldr x29,[sp],#16 - ret - -.globl _aes_hw_ctr32_encrypt_blocks -.private_extern _aes_hw_ctr32_encrypt_blocks - -.align 5 -_aes_hw_ctr32_encrypt_blocks: - // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. - AARCH64_VALID_CALL_TARGET - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - ldr w5,[x3,#240] - - ldr w8, [x4, #12] - ld1 {v0.4s},[x4] - - ld1 {v16.4s,v17.4s},[x3] // load key schedule... - sub w5,w5,#4 - mov x12,#16 - cmp x2,#2 - add x7,x3,x5,lsl#4 // pointer to last 5 round keys - sub w5,w5,#2 - ld1 {v20.4s,v21.4s},[x7],#32 - ld1 {v22.4s,v23.4s},[x7],#32 - ld1 {v7.4s},[x7] - add x7,x3,#32 - mov w6,w5 - csel x12,xzr,x12,lo - - // ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are - // affected by silicon errata #1742098 [0] and #1655431 [1], - // respectively, where the second instruction of an aese/aesmc - // instruction pair may execute twice if an interrupt is taken right - // after the first instruction consumes an input register of which a - // single 32-bit lane has been updated the last time it was modified. - // - // This function uses a counter in one 32-bit lane. The vmov lines - // could write to v1.16b and v18.16b directly, but that trips this bugs. - // We write to v6.16b and copy to the final register as a workaround. 
- // - // [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice - // [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice -#ifndef __AARCH64EB__ - rev w8, w8 -#endif - add w10, w8, #1 - orr v6.16b,v0.16b,v0.16b - rev w10, w10 - mov v6.s[3],w10 - add w8, w8, #2 - orr v1.16b,v6.16b,v6.16b - b.ls Lctr32_tail - rev w12, w8 - mov v6.s[3],w12 - sub x2,x2,#3 // bias - orr v18.16b,v6.16b,v6.16b - b Loop3x_ctr32 - -.align 4 -Loop3x_ctr32: - aese v0.16b,v16.16b - aesmc v0.16b,v0.16b - aese v1.16b,v16.16b - aesmc v1.16b,v1.16b - aese v18.16b,v16.16b - aesmc v18.16b,v18.16b - ld1 {v16.4s},[x7],#16 - subs w6,w6,#2 - aese v0.16b,v17.16b - aesmc v0.16b,v0.16b - aese v1.16b,v17.16b - aesmc v1.16b,v1.16b - aese v18.16b,v17.16b - aesmc v18.16b,v18.16b - ld1 {v17.4s},[x7],#16 - b.gt Loop3x_ctr32 - - aese v0.16b,v16.16b - aesmc v4.16b,v0.16b - aese v1.16b,v16.16b - aesmc v5.16b,v1.16b - ld1 {v2.16b},[x0],#16 - add w9,w8,#1 - aese v18.16b,v16.16b - aesmc v18.16b,v18.16b - ld1 {v3.16b},[x0],#16 - rev w9,w9 - aese v4.16b,v17.16b - aesmc v4.16b,v4.16b - aese v5.16b,v17.16b - aesmc v5.16b,v5.16b - ld1 {v19.16b},[x0],#16 - mov x7,x3 - aese v18.16b,v17.16b - aesmc v17.16b,v18.16b - aese v4.16b,v20.16b - aesmc v4.16b,v4.16b - aese v5.16b,v20.16b - aesmc v5.16b,v5.16b - eor v2.16b,v2.16b,v7.16b - add w10,w8,#2 - aese v17.16b,v20.16b - aesmc v17.16b,v17.16b - eor v3.16b,v3.16b,v7.16b - add w8,w8,#3 - aese v4.16b,v21.16b - aesmc v4.16b,v4.16b - aese v5.16b,v21.16b - aesmc v5.16b,v5.16b - // Note the logic to update v0.16b, v1.16b, and v1.16b is written to work - // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in - // 32-bit mode. See the comment above. 
- eor v19.16b,v19.16b,v7.16b - mov v6.s[3], w9 - aese v17.16b,v21.16b - aesmc v17.16b,v17.16b - orr v0.16b,v6.16b,v6.16b - rev w10,w10 - aese v4.16b,v22.16b - aesmc v4.16b,v4.16b - mov v6.s[3], w10 - rev w12,w8 - aese v5.16b,v22.16b - aesmc v5.16b,v5.16b - orr v1.16b,v6.16b,v6.16b - mov v6.s[3], w12 - aese v17.16b,v22.16b - aesmc v17.16b,v17.16b - orr v18.16b,v6.16b,v6.16b - subs x2,x2,#3 - aese v4.16b,v23.16b - aese v5.16b,v23.16b - aese v17.16b,v23.16b - - eor v2.16b,v2.16b,v4.16b - ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] - st1 {v2.16b},[x1],#16 - eor v3.16b,v3.16b,v5.16b - mov w6,w5 - st1 {v3.16b},[x1],#16 - eor v19.16b,v19.16b,v17.16b - ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] - st1 {v19.16b},[x1],#16 - b.hs Loop3x_ctr32 - - adds x2,x2,#3 - b.eq Lctr32_done - cmp x2,#1 - mov x12,#16 - csel x12,xzr,x12,eq - -Lctr32_tail: - aese v0.16b,v16.16b - aesmc v0.16b,v0.16b - aese v1.16b,v16.16b - aesmc v1.16b,v1.16b - ld1 {v16.4s},[x7],#16 - subs w6,w6,#2 - aese v0.16b,v17.16b - aesmc v0.16b,v0.16b - aese v1.16b,v17.16b - aesmc v1.16b,v1.16b - ld1 {v17.4s},[x7],#16 - b.gt Lctr32_tail - - aese v0.16b,v16.16b - aesmc v0.16b,v0.16b - aese v1.16b,v16.16b - aesmc v1.16b,v1.16b - aese v0.16b,v17.16b - aesmc v0.16b,v0.16b - aese v1.16b,v17.16b - aesmc v1.16b,v1.16b - ld1 {v2.16b},[x0],x12 - aese v0.16b,v20.16b - aesmc v0.16b,v0.16b - aese v1.16b,v20.16b - aesmc v1.16b,v1.16b - ld1 {v3.16b},[x0] - aese v0.16b,v21.16b - aesmc v0.16b,v0.16b - aese v1.16b,v21.16b - aesmc v1.16b,v1.16b - eor v2.16b,v2.16b,v7.16b - aese v0.16b,v22.16b - aesmc v0.16b,v0.16b - aese v1.16b,v22.16b - aesmc v1.16b,v1.16b - eor v3.16b,v3.16b,v7.16b - aese v0.16b,v23.16b - aese v1.16b,v23.16b - - cmp x2,#1 - eor v2.16b,v2.16b,v0.16b - eor v3.16b,v3.16b,v1.16b - st1 {v2.16b},[x1],#16 - b.eq Lctr32_done - st1 {v3.16b},[x1] - -Lctr32_done: - ldr x29,[sp],#16 - ret - -#endif -#endif // !OPENSSL_NO_ASM diff --git a/third_party/boringssl/apple-aarch64/crypto/fipsmodule/armv8-mont.S 
b/third_party/boringssl/apple-aarch64/crypto/fipsmodule/armv8-mont.S deleted file mode 100644 index 2493ae08..00000000 --- a/third_party/boringssl/apple-aarch64/crypto/fipsmodule/armv8-mont.S +++ /dev/null @@ -1,1433 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -#include - -.text - -.globl _bn_mul_mont -.private_extern _bn_mul_mont - -.align 5 -_bn_mul_mont: - AARCH64_SIGN_LINK_REGISTER - tst x5,#7 - b.eq __bn_sqr8x_mont - tst x5,#3 - b.eq __bn_mul4x_mont -Lmul_mont: - stp x29,x30,[sp,#-64]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - - ldr x9,[x2],#8 // bp[0] - sub x22,sp,x5,lsl#3 - ldp x7,x8,[x1],#16 // ap[0..1] - lsl x5,x5,#3 - ldr x4,[x4] // *n0 - and x22,x22,#-16 // ABI says so - ldp x13,x14,[x3],#16 // np[0..1] - - mul x6,x7,x9 // ap[0]*bp[0] - sub x21,x5,#16 // j=num-2 - umulh x7,x7,x9 - mul x10,x8,x9 // ap[1]*bp[0] - umulh x11,x8,x9 - - mul x15,x6,x4 // "tp[0]"*n0 - mov sp,x22 // alloca - - // (*) mul x12,x13,x15 // np[0]*m1 - umulh x13,x13,x15 - mul x16,x14,x15 // np[1]*m1 - // (*) adds x12,x12,x6 // discarded - // (*) As for removal of first multiplication and addition - // instructions. The outcome of first addition is - // guaranteed to be zero, which leaves two computationally - // significant outcomes: it either carries or not. Then - // question is when does it carry? Is there alternative - // way to deduce it? If you follow operations, you can - // observe that condition for carry is quite simple: - // x6 being non-zero. So that carry can be calculated - // by adding -1 to x6. That's what next instruction does. 
- subs xzr,x6,#1 // (*) - umulh x17,x14,x15 - adc x13,x13,xzr - cbz x21,L1st_skip - -L1st: - ldr x8,[x1],#8 - adds x6,x10,x7 - sub x21,x21,#8 // j-- - adc x7,x11,xzr - - ldr x14,[x3],#8 - adds x12,x16,x13 - mul x10,x8,x9 // ap[j]*bp[0] - adc x13,x17,xzr - umulh x11,x8,x9 - - adds x12,x12,x6 - mul x16,x14,x15 // np[j]*m1 - adc x13,x13,xzr - umulh x17,x14,x15 - str x12,[x22],#8 // tp[j-1] - cbnz x21,L1st - -L1st_skip: - adds x6,x10,x7 - sub x1,x1,x5 // rewind x1 - adc x7,x11,xzr - - adds x12,x16,x13 - sub x3,x3,x5 // rewind x3 - adc x13,x17,xzr - - adds x12,x12,x6 - sub x20,x5,#8 // i=num-1 - adcs x13,x13,x7 - - adc x19,xzr,xzr // upmost overflow bit - stp x12,x13,[x22] - -Louter: - ldr x9,[x2],#8 // bp[i] - ldp x7,x8,[x1],#16 - ldr x23,[sp] // tp[0] - add x22,sp,#8 - - mul x6,x7,x9 // ap[0]*bp[i] - sub x21,x5,#16 // j=num-2 - umulh x7,x7,x9 - ldp x13,x14,[x3],#16 - mul x10,x8,x9 // ap[1]*bp[i] - adds x6,x6,x23 - umulh x11,x8,x9 - adc x7,x7,xzr - - mul x15,x6,x4 - sub x20,x20,#8 // i-- - - // (*) mul x12,x13,x15 // np[0]*m1 - umulh x13,x13,x15 - mul x16,x14,x15 // np[1]*m1 - // (*) adds x12,x12,x6 - subs xzr,x6,#1 // (*) - umulh x17,x14,x15 - cbz x21,Linner_skip - -Linner: - ldr x8,[x1],#8 - adc x13,x13,xzr - ldr x23,[x22],#8 // tp[j] - adds x6,x10,x7 - sub x21,x21,#8 // j-- - adc x7,x11,xzr - - adds x12,x16,x13 - ldr x14,[x3],#8 - adc x13,x17,xzr - - mul x10,x8,x9 // ap[j]*bp[i] - adds x6,x6,x23 - umulh x11,x8,x9 - adc x7,x7,xzr - - mul x16,x14,x15 // np[j]*m1 - adds x12,x12,x6 - umulh x17,x14,x15 - str x12,[x22,#-16] // tp[j-1] - cbnz x21,Linner - -Linner_skip: - ldr x23,[x22],#8 // tp[j] - adc x13,x13,xzr - adds x6,x10,x7 - sub x1,x1,x5 // rewind x1 - adc x7,x11,xzr - - adds x12,x16,x13 - sub x3,x3,x5 // rewind x3 - adcs x13,x17,x19 - adc x19,xzr,xzr - - adds x6,x6,x23 - adc x7,x7,xzr - - adds x12,x12,x6 - adcs x13,x13,x7 - adc x19,x19,xzr // upmost overflow bit - stp x12,x13,[x22,#-16] - - cbnz x20,Louter - - // Final step. 
We see if result is larger than modulus, and - // if it is, subtract the modulus. But comparison implies - // subtraction. So we subtract modulus, see if it borrowed, - // and conditionally copy original value. - ldr x23,[sp] // tp[0] - add x22,sp,#8 - ldr x14,[x3],#8 // np[0] - subs x21,x5,#8 // j=num-1 and clear borrow - mov x1,x0 -Lsub: - sbcs x8,x23,x14 // tp[j]-np[j] - ldr x23,[x22],#8 - sub x21,x21,#8 // j-- - ldr x14,[x3],#8 - str x8,[x1],#8 // rp[j]=tp[j]-np[j] - cbnz x21,Lsub - - sbcs x8,x23,x14 - sbcs x19,x19,xzr // did it borrow? - str x8,[x1],#8 // rp[num-1] - - ldr x23,[sp] // tp[0] - add x22,sp,#8 - ldr x8,[x0],#8 // rp[0] - sub x5,x5,#8 // num-- - nop -Lcond_copy: - sub x5,x5,#8 // num-- - csel x14,x23,x8,lo // did it borrow? - ldr x23,[x22],#8 - ldr x8,[x0],#8 - str xzr,[x22,#-16] // wipe tp - str x14,[x0,#-16] - cbnz x5,Lcond_copy - - csel x14,x23,x8,lo - str xzr,[x22,#-8] // wipe tp - str x14,[x0,#-8] - - ldp x19,x20,[x29,#16] - mov sp,x29 - ldp x21,x22,[x29,#32] - mov x0,#1 - ldp x23,x24,[x29,#48] - ldr x29,[sp],#64 - AARCH64_VALIDATE_LINK_REGISTER - ret - - -.align 5 -__bn_sqr8x_mont: - // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to - // only from bn_mul_mont which has already signed the return address. - cmp x1,x2 - b.ne __bn_mul4x_mont -Lsqr8x_mont: - stp x29,x30,[sp,#-128]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - stp x0,x3,[sp,#96] // offload rp and np - - ldp x6,x7,[x1,#8*0] - ldp x8,x9,[x1,#8*2] - ldp x10,x11,[x1,#8*4] - ldp x12,x13,[x1,#8*6] - - sub x2,sp,x5,lsl#4 - lsl x5,x5,#3 - ldr x4,[x4] // *n0 - mov sp,x2 // alloca - sub x27,x5,#8*8 - b Lsqr8x_zero_start - -Lsqr8x_zero: - sub x27,x27,#8*8 - stp xzr,xzr,[x2,#8*0] - stp xzr,xzr,[x2,#8*2] - stp xzr,xzr,[x2,#8*4] - stp xzr,xzr,[x2,#8*6] -Lsqr8x_zero_start: - stp xzr,xzr,[x2,#8*8] - stp xzr,xzr,[x2,#8*10] - stp xzr,xzr,[x2,#8*12] - stp xzr,xzr,[x2,#8*14] - add x2,x2,#8*16 - cbnz x27,Lsqr8x_zero - - add x3,x1,x5 - add x1,x1,#8*8 - mov x19,xzr - mov x20,xzr - mov x21,xzr - mov x22,xzr - mov x23,xzr - mov x24,xzr - mov x25,xzr - mov x26,xzr - mov x2,sp - str x4,[x29,#112] // offload n0 - - // Multiply everything but a[i]*a[i] -.align 4 -Lsqr8x_outer_loop: - // a[1]a[0] (i) - // a[2]a[0] - // a[3]a[0] - // a[4]a[0] - // a[5]a[0] - // a[6]a[0] - // a[7]a[0] - // a[2]a[1] (ii) - // a[3]a[1] - // a[4]a[1] - // a[5]a[1] - // a[6]a[1] - // a[7]a[1] - // a[3]a[2] (iii) - // a[4]a[2] - // a[5]a[2] - // a[6]a[2] - // a[7]a[2] - // a[4]a[3] (iv) - // a[5]a[3] - // a[6]a[3] - // a[7]a[3] - // a[5]a[4] (v) - // a[6]a[4] - // a[7]a[4] - // a[6]a[5] (vi) - // a[7]a[5] - // a[7]a[6] (vii) - - mul x14,x7,x6 // lo(a[1..7]*a[0]) (i) - mul x15,x8,x6 - mul x16,x9,x6 - mul x17,x10,x6 - adds x20,x20,x14 // t[1]+lo(a[1]*a[0]) - mul x14,x11,x6 - adcs x21,x21,x15 - mul x15,x12,x6 - adcs x22,x22,x16 - mul x16,x13,x6 - adcs x23,x23,x17 - umulh x17,x7,x6 // hi(a[1..7]*a[0]) - adcs x24,x24,x14 - umulh x14,x8,x6 - adcs x25,x25,x15 - umulh x15,x9,x6 - adcs x26,x26,x16 - umulh x16,x10,x6 - stp x19,x20,[x2],#8*2 // t[0..1] - adc x19,xzr,xzr // t[8] - adds x21,x21,x17 // t[2]+lo(a[1]*a[0]) - umulh x17,x11,x6 - adcs x22,x22,x14 - umulh x14,x12,x6 - adcs x23,x23,x15 - umulh x15,x13,x6 - adcs x24,x24,x16 - mul x16,x8,x7 // 
lo(a[2..7]*a[1]) (ii) - adcs x25,x25,x17 - mul x17,x9,x7 - adcs x26,x26,x14 - mul x14,x10,x7 - adc x19,x19,x15 - - mul x15,x11,x7 - adds x22,x22,x16 - mul x16,x12,x7 - adcs x23,x23,x17 - mul x17,x13,x7 - adcs x24,x24,x14 - umulh x14,x8,x7 // hi(a[2..7]*a[1]) - adcs x25,x25,x15 - umulh x15,x9,x7 - adcs x26,x26,x16 - umulh x16,x10,x7 - adcs x19,x19,x17 - umulh x17,x11,x7 - stp x21,x22,[x2],#8*2 // t[2..3] - adc x20,xzr,xzr // t[9] - adds x23,x23,x14 - umulh x14,x12,x7 - adcs x24,x24,x15 - umulh x15,x13,x7 - adcs x25,x25,x16 - mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii) - adcs x26,x26,x17 - mul x17,x10,x8 - adcs x19,x19,x14 - mul x14,x11,x8 - adc x20,x20,x15 - - mul x15,x12,x8 - adds x24,x24,x16 - mul x16,x13,x8 - adcs x25,x25,x17 - umulh x17,x9,x8 // hi(a[3..7]*a[2]) - adcs x26,x26,x14 - umulh x14,x10,x8 - adcs x19,x19,x15 - umulh x15,x11,x8 - adcs x20,x20,x16 - umulh x16,x12,x8 - stp x23,x24,[x2],#8*2 // t[4..5] - adc x21,xzr,xzr // t[10] - adds x25,x25,x17 - umulh x17,x13,x8 - adcs x26,x26,x14 - mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv) - adcs x19,x19,x15 - mul x15,x11,x9 - adcs x20,x20,x16 - mul x16,x12,x9 - adc x21,x21,x17 - - mul x17,x13,x9 - adds x26,x26,x14 - umulh x14,x10,x9 // hi(a[4..7]*a[3]) - adcs x19,x19,x15 - umulh x15,x11,x9 - adcs x20,x20,x16 - umulh x16,x12,x9 - adcs x21,x21,x17 - umulh x17,x13,x9 - stp x25,x26,[x2],#8*2 // t[6..7] - adc x22,xzr,xzr // t[11] - adds x19,x19,x14 - mul x14,x11,x10 // lo(a[5..7]*a[4]) (v) - adcs x20,x20,x15 - mul x15,x12,x10 - adcs x21,x21,x16 - mul x16,x13,x10 - adc x22,x22,x17 - - umulh x17,x11,x10 // hi(a[5..7]*a[4]) - adds x20,x20,x14 - umulh x14,x12,x10 - adcs x21,x21,x15 - umulh x15,x13,x10 - adcs x22,x22,x16 - mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi) - adc x23,xzr,xzr // t[12] - adds x21,x21,x17 - mul x17,x13,x11 - adcs x22,x22,x14 - umulh x14,x12,x11 // hi(a[6..7]*a[5]) - adc x23,x23,x15 - - umulh x15,x13,x11 - adds x22,x22,x16 - mul x16,x13,x12 // lo(a[7]*a[6]) (vii) - adcs x23,x23,x17 - umulh x17,x13,x12 // 
hi(a[7]*a[6]) - adc x24,xzr,xzr // t[13] - adds x23,x23,x14 - sub x27,x3,x1 // done yet? - adc x24,x24,x15 - - adds x24,x24,x16 - sub x14,x3,x5 // rewinded ap - adc x25,xzr,xzr // t[14] - add x25,x25,x17 - - cbz x27,Lsqr8x_outer_break - - mov x4,x6 - ldp x6,x7,[x2,#8*0] - ldp x8,x9,[x2,#8*2] - ldp x10,x11,[x2,#8*4] - ldp x12,x13,[x2,#8*6] - adds x19,x19,x6 - adcs x20,x20,x7 - ldp x6,x7,[x1,#8*0] - adcs x21,x21,x8 - adcs x22,x22,x9 - ldp x8,x9,[x1,#8*2] - adcs x23,x23,x10 - adcs x24,x24,x11 - ldp x10,x11,[x1,#8*4] - adcs x25,x25,x12 - mov x0,x1 - adcs x26,xzr,x13 - ldp x12,x13,[x1,#8*6] - add x1,x1,#8*8 - //adc x28,xzr,xzr // moved below - mov x27,#-8*8 - - // a[8]a[0] - // a[9]a[0] - // a[a]a[0] - // a[b]a[0] - // a[c]a[0] - // a[d]a[0] - // a[e]a[0] - // a[f]a[0] - // a[8]a[1] - // a[f]a[1]........................ - // a[8]a[2] - // a[f]a[2]........................ - // a[8]a[3] - // a[f]a[3]........................ - // a[8]a[4] - // a[f]a[4]........................ - // a[8]a[5] - // a[f]a[5]........................ - // a[8]a[6] - // a[f]a[6]........................ - // a[8]a[7] - // a[f]a[7]........................ 
-Lsqr8x_mul: - mul x14,x6,x4 - adc x28,xzr,xzr // carry bit, modulo-scheduled - mul x15,x7,x4 - add x27,x27,#8 - mul x16,x8,x4 - mul x17,x9,x4 - adds x19,x19,x14 - mul x14,x10,x4 - adcs x20,x20,x15 - mul x15,x11,x4 - adcs x21,x21,x16 - mul x16,x12,x4 - adcs x22,x22,x17 - mul x17,x13,x4 - adcs x23,x23,x14 - umulh x14,x6,x4 - adcs x24,x24,x15 - umulh x15,x7,x4 - adcs x25,x25,x16 - umulh x16,x8,x4 - adcs x26,x26,x17 - umulh x17,x9,x4 - adc x28,x28,xzr - str x19,[x2],#8 - adds x19,x20,x14 - umulh x14,x10,x4 - adcs x20,x21,x15 - umulh x15,x11,x4 - adcs x21,x22,x16 - umulh x16,x12,x4 - adcs x22,x23,x17 - umulh x17,x13,x4 - ldr x4,[x0,x27] - adcs x23,x24,x14 - adcs x24,x25,x15 - adcs x25,x26,x16 - adcs x26,x28,x17 - //adc x28,xzr,xzr // moved above - cbnz x27,Lsqr8x_mul - // note that carry flag is guaranteed - // to be zero at this point - cmp x1,x3 // done yet? - b.eq Lsqr8x_break - - ldp x6,x7,[x2,#8*0] - ldp x8,x9,[x2,#8*2] - ldp x10,x11,[x2,#8*4] - ldp x12,x13,[x2,#8*6] - adds x19,x19,x6 - ldr x4,[x0,#-8*8] - adcs x20,x20,x7 - ldp x6,x7,[x1,#8*0] - adcs x21,x21,x8 - adcs x22,x22,x9 - ldp x8,x9,[x1,#8*2] - adcs x23,x23,x10 - adcs x24,x24,x11 - ldp x10,x11,[x1,#8*4] - adcs x25,x25,x12 - mov x27,#-8*8 - adcs x26,x26,x13 - ldp x12,x13,[x1,#8*6] - add x1,x1,#8*8 - //adc x28,xzr,xzr // moved above - b Lsqr8x_mul - -.align 4 -Lsqr8x_break: - ldp x6,x7,[x0,#8*0] - add x1,x0,#8*8 - ldp x8,x9,[x0,#8*2] - sub x14,x3,x1 // is it last iteration? 
- ldp x10,x11,[x0,#8*4] - sub x15,x2,x14 - ldp x12,x13,[x0,#8*6] - cbz x14,Lsqr8x_outer_loop - - stp x19,x20,[x2,#8*0] - ldp x19,x20,[x15,#8*0] - stp x21,x22,[x2,#8*2] - ldp x21,x22,[x15,#8*2] - stp x23,x24,[x2,#8*4] - ldp x23,x24,[x15,#8*4] - stp x25,x26,[x2,#8*6] - mov x2,x15 - ldp x25,x26,[x15,#8*6] - b Lsqr8x_outer_loop - -.align 4 -Lsqr8x_outer_break: - // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0] - ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0] - ldp x15,x16,[sp,#8*1] - ldp x11,x13,[x14,#8*2] - add x1,x14,#8*4 - ldp x17,x14,[sp,#8*3] - - stp x19,x20,[x2,#8*0] - mul x19,x7,x7 - stp x21,x22,[x2,#8*2] - umulh x7,x7,x7 - stp x23,x24,[x2,#8*4] - mul x8,x9,x9 - stp x25,x26,[x2,#8*6] - mov x2,sp - umulh x9,x9,x9 - adds x20,x7,x15,lsl#1 - extr x15,x16,x15,#63 - sub x27,x5,#8*4 - -Lsqr4x_shift_n_add: - adcs x21,x8,x15 - extr x16,x17,x16,#63 - sub x27,x27,#8*4 - adcs x22,x9,x16 - ldp x15,x16,[x2,#8*5] - mul x10,x11,x11 - ldp x7,x9,[x1],#8*2 - umulh x11,x11,x11 - mul x12,x13,x13 - umulh x13,x13,x13 - extr x17,x14,x17,#63 - stp x19,x20,[x2,#8*0] - adcs x23,x10,x17 - extr x14,x15,x14,#63 - stp x21,x22,[x2,#8*2] - adcs x24,x11,x14 - ldp x17,x14,[x2,#8*7] - extr x15,x16,x15,#63 - adcs x25,x12,x15 - extr x16,x17,x16,#63 - adcs x26,x13,x16 - ldp x15,x16,[x2,#8*9] - mul x6,x7,x7 - ldp x11,x13,[x1],#8*2 - umulh x7,x7,x7 - mul x8,x9,x9 - umulh x9,x9,x9 - stp x23,x24,[x2,#8*4] - extr x17,x14,x17,#63 - stp x25,x26,[x2,#8*6] - add x2,x2,#8*8 - adcs x19,x6,x17 - extr x14,x15,x14,#63 - adcs x20,x7,x14 - ldp x17,x14,[x2,#8*3] - extr x15,x16,x15,#63 - cbnz x27,Lsqr4x_shift_n_add - ldp x1,x4,[x29,#104] // pull np and n0 - - adcs x21,x8,x15 - extr x16,x17,x16,#63 - adcs x22,x9,x16 - ldp x15,x16,[x2,#8*5] - mul x10,x11,x11 - umulh x11,x11,x11 - stp x19,x20,[x2,#8*0] - mul x12,x13,x13 - umulh x13,x13,x13 - stp x21,x22,[x2,#8*2] - extr x17,x14,x17,#63 - adcs x23,x10,x17 - extr x14,x15,x14,#63 - ldp x19,x20,[sp,#8*0] - adcs x24,x11,x14 - extr x15,x16,x15,#63 - ldp 
x6,x7,[x1,#8*0] - adcs x25,x12,x15 - extr x16,xzr,x16,#63 - ldp x8,x9,[x1,#8*2] - adc x26,x13,x16 - ldp x10,x11,[x1,#8*4] - - // Reduce by 512 bits per iteration - mul x28,x4,x19 // t[0]*n0 - ldp x12,x13,[x1,#8*6] - add x3,x1,x5 - ldp x21,x22,[sp,#8*2] - stp x23,x24,[x2,#8*4] - ldp x23,x24,[sp,#8*4] - stp x25,x26,[x2,#8*6] - ldp x25,x26,[sp,#8*6] - add x1,x1,#8*8 - mov x30,xzr // initial top-most carry - mov x2,sp - mov x27,#8 - -Lsqr8x_reduction: - // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0) - mul x15,x7,x28 - sub x27,x27,#1 - mul x16,x8,x28 - str x28,[x2],#8 // put aside t[0]*n0 for tail processing - mul x17,x9,x28 - // (*) adds xzr,x19,x14 - subs xzr,x19,#1 // (*) - mul x14,x10,x28 - adcs x19,x20,x15 - mul x15,x11,x28 - adcs x20,x21,x16 - mul x16,x12,x28 - adcs x21,x22,x17 - mul x17,x13,x28 - adcs x22,x23,x14 - umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0) - adcs x23,x24,x15 - umulh x15,x7,x28 - adcs x24,x25,x16 - umulh x16,x8,x28 - adcs x25,x26,x17 - umulh x17,x9,x28 - adc x26,xzr,xzr - adds x19,x19,x14 - umulh x14,x10,x28 - adcs x20,x20,x15 - umulh x15,x11,x28 - adcs x21,x21,x16 - umulh x16,x12,x28 - adcs x22,x22,x17 - umulh x17,x13,x28 - mul x28,x4,x19 // next t[0]*n0 - adcs x23,x23,x14 - adcs x24,x24,x15 - adcs x25,x25,x16 - adc x26,x26,x17 - cbnz x27,Lsqr8x_reduction - - ldp x14,x15,[x2,#8*0] - ldp x16,x17,[x2,#8*2] - mov x0,x2 - sub x27,x3,x1 // done yet? 
- adds x19,x19,x14 - adcs x20,x20,x15 - ldp x14,x15,[x2,#8*4] - adcs x21,x21,x16 - adcs x22,x22,x17 - ldp x16,x17,[x2,#8*6] - adcs x23,x23,x14 - adcs x24,x24,x15 - adcs x25,x25,x16 - adcs x26,x26,x17 - //adc x28,xzr,xzr // moved below - cbz x27,Lsqr8x8_post_condition - - ldr x4,[x2,#-8*8] - ldp x6,x7,[x1,#8*0] - ldp x8,x9,[x1,#8*2] - ldp x10,x11,[x1,#8*4] - mov x27,#-8*8 - ldp x12,x13,[x1,#8*6] - add x1,x1,#8*8 - -Lsqr8x_tail: - mul x14,x6,x4 - adc x28,xzr,xzr // carry bit, modulo-scheduled - mul x15,x7,x4 - add x27,x27,#8 - mul x16,x8,x4 - mul x17,x9,x4 - adds x19,x19,x14 - mul x14,x10,x4 - adcs x20,x20,x15 - mul x15,x11,x4 - adcs x21,x21,x16 - mul x16,x12,x4 - adcs x22,x22,x17 - mul x17,x13,x4 - adcs x23,x23,x14 - umulh x14,x6,x4 - adcs x24,x24,x15 - umulh x15,x7,x4 - adcs x25,x25,x16 - umulh x16,x8,x4 - adcs x26,x26,x17 - umulh x17,x9,x4 - adc x28,x28,xzr - str x19,[x2],#8 - adds x19,x20,x14 - umulh x14,x10,x4 - adcs x20,x21,x15 - umulh x15,x11,x4 - adcs x21,x22,x16 - umulh x16,x12,x4 - adcs x22,x23,x17 - umulh x17,x13,x4 - ldr x4,[x0,x27] - adcs x23,x24,x14 - adcs x24,x25,x15 - adcs x25,x26,x16 - adcs x26,x28,x17 - //adc x28,xzr,xzr // moved above - cbnz x27,Lsqr8x_tail - // note that carry flag is guaranteed - // to be zero at this point - ldp x6,x7,[x2,#8*0] - sub x27,x3,x1 // done yet? 
- sub x16,x3,x5 // rewinded np - ldp x8,x9,[x2,#8*2] - ldp x10,x11,[x2,#8*4] - ldp x12,x13,[x2,#8*6] - cbz x27,Lsqr8x_tail_break - - ldr x4,[x0,#-8*8] - adds x19,x19,x6 - adcs x20,x20,x7 - ldp x6,x7,[x1,#8*0] - adcs x21,x21,x8 - adcs x22,x22,x9 - ldp x8,x9,[x1,#8*2] - adcs x23,x23,x10 - adcs x24,x24,x11 - ldp x10,x11,[x1,#8*4] - adcs x25,x25,x12 - mov x27,#-8*8 - adcs x26,x26,x13 - ldp x12,x13,[x1,#8*6] - add x1,x1,#8*8 - //adc x28,xzr,xzr // moved above - b Lsqr8x_tail - -.align 4 -Lsqr8x_tail_break: - ldr x4,[x29,#112] // pull n0 - add x27,x2,#8*8 // end of current t[num] window - - subs xzr,x30,#1 // "move" top-most carry to carry bit - adcs x14,x19,x6 - adcs x15,x20,x7 - ldp x19,x20,[x0,#8*0] - adcs x21,x21,x8 - ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0] - adcs x22,x22,x9 - ldp x8,x9,[x16,#8*2] - adcs x23,x23,x10 - adcs x24,x24,x11 - ldp x10,x11,[x16,#8*4] - adcs x25,x25,x12 - adcs x26,x26,x13 - ldp x12,x13,[x16,#8*6] - add x1,x16,#8*8 - adc x30,xzr,xzr // top-most carry - mul x28,x4,x19 - stp x14,x15,[x2,#8*0] - stp x21,x22,[x2,#8*2] - ldp x21,x22,[x0,#8*2] - stp x23,x24,[x2,#8*4] - ldp x23,x24,[x0,#8*4] - cmp x27,x29 // did we hit the bottom? - stp x25,x26,[x2,#8*6] - mov x2,x0 // slide the window - ldp x25,x26,[x0,#8*6] - mov x27,#8 - b.ne Lsqr8x_reduction - - // Final step. We see if result is larger than modulus, and - // if it is, subtract the modulus. But comparison implies - // subtraction. So we subtract modulus, see if it borrowed, - // and conditionally copy original value. 
- ldr x0,[x29,#96] // pull rp - add x2,x2,#8*8 - subs x14,x19,x6 - sbcs x15,x20,x7 - sub x27,x5,#8*8 - mov x3,x0 // x0 copy - -Lsqr8x_sub: - sbcs x16,x21,x8 - ldp x6,x7,[x1,#8*0] - sbcs x17,x22,x9 - stp x14,x15,[x0,#8*0] - sbcs x14,x23,x10 - ldp x8,x9,[x1,#8*2] - sbcs x15,x24,x11 - stp x16,x17,[x0,#8*2] - sbcs x16,x25,x12 - ldp x10,x11,[x1,#8*4] - sbcs x17,x26,x13 - ldp x12,x13,[x1,#8*6] - add x1,x1,#8*8 - ldp x19,x20,[x2,#8*0] - sub x27,x27,#8*8 - ldp x21,x22,[x2,#8*2] - ldp x23,x24,[x2,#8*4] - ldp x25,x26,[x2,#8*6] - add x2,x2,#8*8 - stp x14,x15,[x0,#8*4] - sbcs x14,x19,x6 - stp x16,x17,[x0,#8*6] - add x0,x0,#8*8 - sbcs x15,x20,x7 - cbnz x27,Lsqr8x_sub - - sbcs x16,x21,x8 - mov x2,sp - add x1,sp,x5 - ldp x6,x7,[x3,#8*0] - sbcs x17,x22,x9 - stp x14,x15,[x0,#8*0] - sbcs x14,x23,x10 - ldp x8,x9,[x3,#8*2] - sbcs x15,x24,x11 - stp x16,x17,[x0,#8*2] - sbcs x16,x25,x12 - ldp x19,x20,[x1,#8*0] - sbcs x17,x26,x13 - ldp x21,x22,[x1,#8*2] - sbcs xzr,x30,xzr // did it borrow? - ldr x30,[x29,#8] // pull return address - stp x14,x15,[x0,#8*4] - stp x16,x17,[x0,#8*6] - - sub x27,x5,#8*4 -Lsqr4x_cond_copy: - sub x27,x27,#8*4 - csel x14,x19,x6,lo - stp xzr,xzr,[x2,#8*0] - csel x15,x20,x7,lo - ldp x6,x7,[x3,#8*4] - ldp x19,x20,[x1,#8*4] - csel x16,x21,x8,lo - stp xzr,xzr,[x2,#8*2] - add x2,x2,#8*4 - csel x17,x22,x9,lo - ldp x8,x9,[x3,#8*6] - ldp x21,x22,[x1,#8*6] - add x1,x1,#8*4 - stp x14,x15,[x3,#8*0] - stp x16,x17,[x3,#8*2] - add x3,x3,#8*4 - stp xzr,xzr,[x1,#8*0] - stp xzr,xzr,[x1,#8*2] - cbnz x27,Lsqr4x_cond_copy - - csel x14,x19,x6,lo - stp xzr,xzr,[x2,#8*0] - csel x15,x20,x7,lo - stp xzr,xzr,[x2,#8*2] - csel x16,x21,x8,lo - csel x17,x22,x9,lo - stp x14,x15,[x3,#8*0] - stp x16,x17,[x3,#8*2] - - b Lsqr8x_done - -.align 4 -Lsqr8x8_post_condition: - adc x28,xzr,xzr - ldr x30,[x29,#8] // pull return address - // x19-7,x28 hold result, x6-7 hold modulus - subs x6,x19,x6 - ldr x1,[x29,#96] // pull rp - sbcs x7,x20,x7 - stp xzr,xzr,[sp,#8*0] - sbcs x8,x21,x8 - stp xzr,xzr,[sp,#8*2] 
- sbcs x9,x22,x9 - stp xzr,xzr,[sp,#8*4] - sbcs x10,x23,x10 - stp xzr,xzr,[sp,#8*6] - sbcs x11,x24,x11 - stp xzr,xzr,[sp,#8*8] - sbcs x12,x25,x12 - stp xzr,xzr,[sp,#8*10] - sbcs x13,x26,x13 - stp xzr,xzr,[sp,#8*12] - sbcs x28,x28,xzr // did it borrow? - stp xzr,xzr,[sp,#8*14] - - // x6-7 hold result-modulus - csel x6,x19,x6,lo - csel x7,x20,x7,lo - csel x8,x21,x8,lo - csel x9,x22,x9,lo - stp x6,x7,[x1,#8*0] - csel x10,x23,x10,lo - csel x11,x24,x11,lo - stp x8,x9,[x1,#8*2] - csel x12,x25,x12,lo - csel x13,x26,x13,lo - stp x10,x11,[x1,#8*4] - stp x12,x13,[x1,#8*6] - -Lsqr8x_done: - ldp x19,x20,[x29,#16] - mov sp,x29 - ldp x21,x22,[x29,#32] - mov x0,#1 - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - // x30 is popped earlier - AARCH64_VALIDATE_LINK_REGISTER - ret - - -.align 5 -__bn_mul4x_mont: - // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to - // only from bn_mul_mont or __bn_mul8x_mont which have already signed the - // return address. - stp x29,x30,[sp,#-128]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - - sub x26,sp,x5,lsl#3 - lsl x5,x5,#3 - ldr x4,[x4] // *n0 - sub sp,x26,#8*4 // alloca - - add x10,x2,x5 - add x27,x1,x5 - stp x0,x10,[x29,#96] // offload rp and &b[num] - - ldr x24,[x2,#8*0] // b[0] - ldp x6,x7,[x1,#8*0] // a[0..3] - ldp x8,x9,[x1,#8*2] - add x1,x1,#8*4 - mov x19,xzr - mov x20,xzr - mov x21,xzr - mov x22,xzr - ldp x14,x15,[x3,#8*0] // n[0..3] - ldp x16,x17,[x3,#8*2] - adds x3,x3,#8*4 // clear carry bit - mov x0,xzr - mov x28,#0 - mov x26,sp - -Loop_mul4x_1st_reduction: - mul x10,x6,x24 // lo(a[0..3]*b[0]) - adc x0,x0,xzr // modulo-scheduled - mul x11,x7,x24 - add x28,x28,#8 - mul x12,x8,x24 - and x28,x28,#31 - mul x13,x9,x24 - adds x19,x19,x10 - umulh x10,x6,x24 // hi(a[0..3]*b[0]) - adcs x20,x20,x11 - mul x25,x19,x4 // t[0]*n0 - adcs x21,x21,x12 - umulh x11,x7,x24 - adcs x22,x22,x13 - umulh x12,x8,x24 - adc x23,xzr,xzr - umulh x13,x9,x24 - ldr x24,[x2,x28] // next b[i] (or b[0]) - adds x20,x20,x10 - // (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0) - str x25,[x26],#8 // put aside t[0]*n0 for tail processing - adcs x21,x21,x11 - mul x11,x15,x25 - adcs x22,x22,x12 - mul x12,x16,x25 - adc x23,x23,x13 // can't overflow - mul x13,x17,x25 - // (*) adds xzr,x19,x10 - subs xzr,x19,#1 // (*) - umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0) - adcs x19,x20,x11 - umulh x11,x15,x25 - adcs x20,x21,x12 - umulh x12,x16,x25 - adcs x21,x22,x13 - umulh x13,x17,x25 - adcs x22,x23,x0 - adc x0,xzr,xzr - adds x19,x19,x10 - sub x10,x27,x1 - adcs x20,x20,x11 - adcs x21,x21,x12 - adcs x22,x22,x13 - //adc x0,x0,xzr - cbnz x28,Loop_mul4x_1st_reduction - - cbz x10,Lmul4x4_post_condition - - ldp x6,x7,[x1,#8*0] // a[4..7] - ldp x8,x9,[x1,#8*2] - add x1,x1,#8*4 - ldr x25,[sp] // a[0]*n0 - ldp x14,x15,[x3,#8*0] // n[4..7] - ldp x16,x17,[x3,#8*2] - add x3,x3,#8*4 - -Loop_mul4x_1st_tail: - mul x10,x6,x24 // lo(a[4..7]*b[i]) - adc x0,x0,xzr // modulo-scheduled 
- mul x11,x7,x24 - add x28,x28,#8 - mul x12,x8,x24 - and x28,x28,#31 - mul x13,x9,x24 - adds x19,x19,x10 - umulh x10,x6,x24 // hi(a[4..7]*b[i]) - adcs x20,x20,x11 - umulh x11,x7,x24 - adcs x21,x21,x12 - umulh x12,x8,x24 - adcs x22,x22,x13 - umulh x13,x9,x24 - adc x23,xzr,xzr - ldr x24,[x2,x28] // next b[i] (or b[0]) - adds x20,x20,x10 - mul x10,x14,x25 // lo(n[4..7]*a[0]*n0) - adcs x21,x21,x11 - mul x11,x15,x25 - adcs x22,x22,x12 - mul x12,x16,x25 - adc x23,x23,x13 // can't overflow - mul x13,x17,x25 - adds x19,x19,x10 - umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0) - adcs x20,x20,x11 - umulh x11,x15,x25 - adcs x21,x21,x12 - umulh x12,x16,x25 - adcs x22,x22,x13 - adcs x23,x23,x0 - umulh x13,x17,x25 - adc x0,xzr,xzr - ldr x25,[sp,x28] // next t[0]*n0 - str x19,[x26],#8 // result!!! - adds x19,x20,x10 - sub x10,x27,x1 // done yet? - adcs x20,x21,x11 - adcs x21,x22,x12 - adcs x22,x23,x13 - //adc x0,x0,xzr - cbnz x28,Loop_mul4x_1st_tail - - sub x11,x27,x5 // rewinded x1 - cbz x10,Lmul4x_proceed - - ldp x6,x7,[x1,#8*0] - ldp x8,x9,[x1,#8*2] - add x1,x1,#8*4 - ldp x14,x15,[x3,#8*0] - ldp x16,x17,[x3,#8*2] - add x3,x3,#8*4 - b Loop_mul4x_1st_tail - -.align 5 -Lmul4x_proceed: - ldr x24,[x2,#8*4]! // *++b - adc x30,x0,xzr - ldp x6,x7,[x11,#8*0] // a[0..3] - sub x3,x3,x5 // rewind np - ldp x8,x9,[x11,#8*2] - add x1,x11,#8*4 - - stp x19,x20,[x26,#8*0] // result!!! - ldp x19,x20,[sp,#8*4] // t[0..3] - stp x21,x22,[x26,#8*2] // result!!! 
- ldp x21,x22,[sp,#8*6] - - ldp x14,x15,[x3,#8*0] // n[0..3] - mov x26,sp - ldp x16,x17,[x3,#8*2] - adds x3,x3,#8*4 // clear carry bit - mov x0,xzr - -.align 4 -Loop_mul4x_reduction: - mul x10,x6,x24 // lo(a[0..3]*b[4]) - adc x0,x0,xzr // modulo-scheduled - mul x11,x7,x24 - add x28,x28,#8 - mul x12,x8,x24 - and x28,x28,#31 - mul x13,x9,x24 - adds x19,x19,x10 - umulh x10,x6,x24 // hi(a[0..3]*b[4]) - adcs x20,x20,x11 - mul x25,x19,x4 // t[0]*n0 - adcs x21,x21,x12 - umulh x11,x7,x24 - adcs x22,x22,x13 - umulh x12,x8,x24 - adc x23,xzr,xzr - umulh x13,x9,x24 - ldr x24,[x2,x28] // next b[i] - adds x20,x20,x10 - // (*) mul x10,x14,x25 - str x25,[x26],#8 // put aside t[0]*n0 for tail processing - adcs x21,x21,x11 - mul x11,x15,x25 // lo(n[0..3]*t[0]*n0 - adcs x22,x22,x12 - mul x12,x16,x25 - adc x23,x23,x13 // can't overflow - mul x13,x17,x25 - // (*) adds xzr,x19,x10 - subs xzr,x19,#1 // (*) - umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0 - adcs x19,x20,x11 - umulh x11,x15,x25 - adcs x20,x21,x12 - umulh x12,x16,x25 - adcs x21,x22,x13 - umulh x13,x17,x25 - adcs x22,x23,x0 - adc x0,xzr,xzr - adds x19,x19,x10 - adcs x20,x20,x11 - adcs x21,x21,x12 - adcs x22,x22,x13 - //adc x0,x0,xzr - cbnz x28,Loop_mul4x_reduction - - adc x0,x0,xzr - ldp x10,x11,[x26,#8*4] // t[4..7] - ldp x12,x13,[x26,#8*6] - ldp x6,x7,[x1,#8*0] // a[4..7] - ldp x8,x9,[x1,#8*2] - add x1,x1,#8*4 - adds x19,x19,x10 - adcs x20,x20,x11 - adcs x21,x21,x12 - adcs x22,x22,x13 - //adc x0,x0,xzr - - ldr x25,[sp] // t[0]*n0 - ldp x14,x15,[x3,#8*0] // n[4..7] - ldp x16,x17,[x3,#8*2] - add x3,x3,#8*4 - -.align 4 -Loop_mul4x_tail: - mul x10,x6,x24 // lo(a[4..7]*b[4]) - adc x0,x0,xzr // modulo-scheduled - mul x11,x7,x24 - add x28,x28,#8 - mul x12,x8,x24 - and x28,x28,#31 - mul x13,x9,x24 - adds x19,x19,x10 - umulh x10,x6,x24 // hi(a[4..7]*b[4]) - adcs x20,x20,x11 - umulh x11,x7,x24 - adcs x21,x21,x12 - umulh x12,x8,x24 - adcs x22,x22,x13 - umulh x13,x9,x24 - adc x23,xzr,xzr - ldr x24,[x2,x28] // next b[i] - adds x20,x20,x10 - 
mul x10,x14,x25 // lo(n[4..7]*t[0]*n0) - adcs x21,x21,x11 - mul x11,x15,x25 - adcs x22,x22,x12 - mul x12,x16,x25 - adc x23,x23,x13 // can't overflow - mul x13,x17,x25 - adds x19,x19,x10 - umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0) - adcs x20,x20,x11 - umulh x11,x15,x25 - adcs x21,x21,x12 - umulh x12,x16,x25 - adcs x22,x22,x13 - umulh x13,x17,x25 - adcs x23,x23,x0 - ldr x25,[sp,x28] // next a[0]*n0 - adc x0,xzr,xzr - str x19,[x26],#8 // result!!! - adds x19,x20,x10 - sub x10,x27,x1 // done yet? - adcs x20,x21,x11 - adcs x21,x22,x12 - adcs x22,x23,x13 - //adc x0,x0,xzr - cbnz x28,Loop_mul4x_tail - - sub x11,x3,x5 // rewinded np? - adc x0,x0,xzr - cbz x10,Loop_mul4x_break - - ldp x10,x11,[x26,#8*4] - ldp x12,x13,[x26,#8*6] - ldp x6,x7,[x1,#8*0] - ldp x8,x9,[x1,#8*2] - add x1,x1,#8*4 - adds x19,x19,x10 - adcs x20,x20,x11 - adcs x21,x21,x12 - adcs x22,x22,x13 - //adc x0,x0,xzr - ldp x14,x15,[x3,#8*0] - ldp x16,x17,[x3,#8*2] - add x3,x3,#8*4 - b Loop_mul4x_tail - -.align 4 -Loop_mul4x_break: - ldp x12,x13,[x29,#96] // pull rp and &b[num] - adds x19,x19,x30 - add x2,x2,#8*4 // bp++ - adcs x20,x20,xzr - sub x1,x1,x5 // rewind ap - adcs x21,x21,xzr - stp x19,x20,[x26,#8*0] // result!!! - adcs x22,x22,xzr - ldp x19,x20,[sp,#8*4] // t[0..3] - adc x30,x0,xzr - stp x21,x22,[x26,#8*2] // result!!! - cmp x2,x13 // done yet? - ldp x21,x22,[sp,#8*6] - ldp x14,x15,[x11,#8*0] // n[0..3] - ldp x16,x17,[x11,#8*2] - add x3,x11,#8*4 - b.eq Lmul4x_post - - ldr x24,[x2] - ldp x6,x7,[x1,#8*0] // a[0..3] - ldp x8,x9,[x1,#8*2] - adds x1,x1,#8*4 // clear carry bit - mov x0,xzr - mov x26,sp - b Loop_mul4x_reduction - -.align 4 -Lmul4x_post: - // Final step. We see if result is larger than modulus, and - // if it is, subtract the modulus. But comparison implies - // subtraction. So we subtract modulus, see if it borrowed, - // and conditionally copy original value. 
- mov x0,x12 - mov x27,x12 // x0 copy - subs x10,x19,x14 - add x26,sp,#8*8 - sbcs x11,x20,x15 - sub x28,x5,#8*4 - -Lmul4x_sub: - sbcs x12,x21,x16 - ldp x14,x15,[x3,#8*0] - sub x28,x28,#8*4 - ldp x19,x20,[x26,#8*0] - sbcs x13,x22,x17 - ldp x16,x17,[x3,#8*2] - add x3,x3,#8*4 - ldp x21,x22,[x26,#8*2] - add x26,x26,#8*4 - stp x10,x11,[x0,#8*0] - sbcs x10,x19,x14 - stp x12,x13,[x0,#8*2] - add x0,x0,#8*4 - sbcs x11,x20,x15 - cbnz x28,Lmul4x_sub - - sbcs x12,x21,x16 - mov x26,sp - add x1,sp,#8*4 - ldp x6,x7,[x27,#8*0] - sbcs x13,x22,x17 - stp x10,x11,[x0,#8*0] - ldp x8,x9,[x27,#8*2] - stp x12,x13,[x0,#8*2] - ldp x19,x20,[x1,#8*0] - ldp x21,x22,[x1,#8*2] - sbcs xzr,x30,xzr // did it borrow? - ldr x30,[x29,#8] // pull return address - - sub x28,x5,#8*4 -Lmul4x_cond_copy: - sub x28,x28,#8*4 - csel x10,x19,x6,lo - stp xzr,xzr,[x26,#8*0] - csel x11,x20,x7,lo - ldp x6,x7,[x27,#8*4] - ldp x19,x20,[x1,#8*4] - csel x12,x21,x8,lo - stp xzr,xzr,[x26,#8*2] - add x26,x26,#8*4 - csel x13,x22,x9,lo - ldp x8,x9,[x27,#8*6] - ldp x21,x22,[x1,#8*6] - add x1,x1,#8*4 - stp x10,x11,[x27,#8*0] - stp x12,x13,[x27,#8*2] - add x27,x27,#8*4 - cbnz x28,Lmul4x_cond_copy - - csel x10,x19,x6,lo - stp xzr,xzr,[x26,#8*0] - csel x11,x20,x7,lo - stp xzr,xzr,[x26,#8*2] - csel x12,x21,x8,lo - stp xzr,xzr,[x26,#8*3] - csel x13,x22,x9,lo - stp xzr,xzr,[x26,#8*4] - stp x10,x11,[x27,#8*0] - stp x12,x13,[x27,#8*2] - - b Lmul4x_done - -.align 4 -Lmul4x4_post_condition: - adc x0,x0,xzr - ldr x1,[x29,#96] // pull rp - // x19-3,x0 hold result, x14-7 hold modulus - subs x6,x19,x14 - ldr x30,[x29,#8] // pull return address - sbcs x7,x20,x15 - stp xzr,xzr,[sp,#8*0] - sbcs x8,x21,x16 - stp xzr,xzr,[sp,#8*2] - sbcs x9,x22,x17 - stp xzr,xzr,[sp,#8*4] - sbcs xzr,x0,xzr // did it borrow? 
- stp xzr,xzr,[sp,#8*6] - - // x6-3 hold result-modulus - csel x6,x19,x6,lo - csel x7,x20,x7,lo - csel x8,x21,x8,lo - csel x9,x22,x9,lo - stp x6,x7,[x1,#8*0] - stp x8,x9,[x1,#8*2] - -Lmul4x_done: - ldp x19,x20,[x29,#16] - mov sp,x29 - ldp x21,x22,[x29,#32] - mov x0,#1 - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - // x30 is popped earlier - AARCH64_VALIDATE_LINK_REGISTER - ret - -.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 -.align 4 -#endif // !OPENSSL_NO_ASM diff --git a/third_party/boringssl/apple-aarch64/crypto/fipsmodule/ghash-neon-armv8.S b/third_party/boringssl/apple-aarch64/crypto/fipsmodule/ghash-neon-armv8.S deleted file mode 100644 index 5441afc0..00000000 --- a/third_party/boringssl/apple-aarch64/crypto/fipsmodule/ghash-neon-armv8.S +++ /dev/null @@ -1,343 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -#include - -.text - -.globl _gcm_init_neon -.private_extern _gcm_init_neon - -.align 4 -_gcm_init_neon: - AARCH64_VALID_CALL_TARGET - // This function is adapted from gcm_init_v8. xC2 is t3. 
- ld1 {v17.2d}, [x1] // load H - movi v19.16b, #0xe1 - shl v19.2d, v19.2d, #57 // 0xc2.0 - ext v3.16b, v17.16b, v17.16b, #8 - ushr v18.2d, v19.2d, #63 - dup v17.4s, v17.s[1] - ext v16.16b, v18.16b, v19.16b, #8 // t0=0xc2....01 - ushr v18.2d, v3.2d, #63 - sshr v17.4s, v17.4s, #31 // broadcast carry bit - and v18.16b, v18.16b, v16.16b - shl v3.2d, v3.2d, #1 - ext v18.16b, v18.16b, v18.16b, #8 - and v16.16b, v16.16b, v17.16b - orr v3.16b, v3.16b, v18.16b // H<<<=1 - eor v5.16b, v3.16b, v16.16b // twisted H - st1 {v5.2d}, [x0] // store Htable[0] - ret - - -.globl _gcm_gmult_neon -.private_extern _gcm_gmult_neon - -.align 4 -_gcm_gmult_neon: - AARCH64_VALID_CALL_TARGET - ld1 {v3.16b}, [x0] // load Xi - ld1 {v5.1d}, [x1], #8 // load twisted H - ld1 {v6.1d}, [x1] - adrp x9, Lmasks@PAGE // load constants - add x9, x9, Lmasks@PAGEOFF - ld1 {v24.2d, v25.2d}, [x9] - rev64 v3.16b, v3.16b // byteswap Xi - ext v3.16b, v3.16b, v3.16b, #8 - eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing - - mov x3, #16 - b Lgmult_neon - - -.globl _gcm_ghash_neon -.private_extern _gcm_ghash_neon - -.align 4 -_gcm_ghash_neon: - AARCH64_VALID_CALL_TARGET - ld1 {v0.16b}, [x0] // load Xi - ld1 {v5.1d}, [x1], #8 // load twisted H - ld1 {v6.1d}, [x1] - adrp x9, Lmasks@PAGE // load constants - add x9, x9, Lmasks@PAGEOFF - ld1 {v24.2d, v25.2d}, [x9] - rev64 v0.16b, v0.16b // byteswap Xi - ext v0.16b, v0.16b, v0.16b, #8 - eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing - -Loop_neon: - ld1 {v3.16b}, [x2], #16 // load inp - rev64 v3.16b, v3.16b // byteswap inp - ext v3.16b, v3.16b, v3.16b, #8 - eor v3.16b, v3.16b, v0.16b // inp ^= Xi - -Lgmult_neon: - // Split the input into v3 and v4. (The upper halves are unused, - // so it is okay to leave them alone.) 
- ins v4.d[0], v3.d[1] - ext v16.8b, v5.8b, v5.8b, #1 // A1 - pmull v16.8h, v16.8b, v3.8b // F = A1*B - ext v0.8b, v3.8b, v3.8b, #1 // B1 - pmull v0.8h, v5.8b, v0.8b // E = A*B1 - ext v17.8b, v5.8b, v5.8b, #2 // A2 - pmull v17.8h, v17.8b, v3.8b // H = A2*B - ext v19.8b, v3.8b, v3.8b, #2 // B2 - pmull v19.8h, v5.8b, v19.8b // G = A*B2 - ext v18.8b, v5.8b, v5.8b, #3 // A3 - eor v16.16b, v16.16b, v0.16b // L = E + F - pmull v18.8h, v18.8b, v3.8b // J = A3*B - ext v0.8b, v3.8b, v3.8b, #3 // B3 - eor v17.16b, v17.16b, v19.16b // M = G + H - pmull v0.8h, v5.8b, v0.8b // I = A*B3 - - // Here we diverge from the 32-bit version. It computes the following - // (instructions reordered for clarity): - // - // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) - // vand $t0#hi, $t0#hi, $k48 - // veor $t0#lo, $t0#lo, $t0#hi - // - // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) - // vand $t1#hi, $t1#hi, $k32 - // veor $t1#lo, $t1#lo, $t1#hi - // - // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) - // vand $t2#hi, $t2#hi, $k16 - // veor $t2#lo, $t2#lo, $t2#hi - // - // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) - // vmov.i64 $t3#hi, #0 - // - // $kN is a mask with the bottom N bits set. AArch64 cannot compute on - // upper halves of SIMD registers, so we must split each half into - // separate registers. To compensate, we pair computations up and - // parallelize. - - ext v19.8b, v3.8b, v3.8b, #4 // B4 - eor v18.16b, v18.16b, v0.16b // N = I + J - pmull v19.8h, v5.8b, v19.8b // K = A*B4 - - // This can probably be scheduled more efficiently. For now, we just - // pair up independent instructions. 
- zip1 v20.2d, v16.2d, v17.2d - zip1 v22.2d, v18.2d, v19.2d - zip2 v21.2d, v16.2d, v17.2d - zip2 v23.2d, v18.2d, v19.2d - eor v20.16b, v20.16b, v21.16b - eor v22.16b, v22.16b, v23.16b - and v21.16b, v21.16b, v24.16b - and v23.16b, v23.16b, v25.16b - eor v20.16b, v20.16b, v21.16b - eor v22.16b, v22.16b, v23.16b - zip1 v16.2d, v20.2d, v21.2d - zip1 v18.2d, v22.2d, v23.2d - zip2 v17.2d, v20.2d, v21.2d - zip2 v19.2d, v22.2d, v23.2d - - ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 - ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 - pmull v0.8h, v5.8b, v3.8b // D = A*B - ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 - ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 - eor v16.16b, v16.16b, v17.16b - eor v18.16b, v18.16b, v19.16b - eor v0.16b, v0.16b, v16.16b - eor v0.16b, v0.16b, v18.16b - eor v3.8b, v3.8b, v4.8b // Karatsuba pre-processing - ext v16.8b, v7.8b, v7.8b, #1 // A1 - pmull v16.8h, v16.8b, v3.8b // F = A1*B - ext v1.8b, v3.8b, v3.8b, #1 // B1 - pmull v1.8h, v7.8b, v1.8b // E = A*B1 - ext v17.8b, v7.8b, v7.8b, #2 // A2 - pmull v17.8h, v17.8b, v3.8b // H = A2*B - ext v19.8b, v3.8b, v3.8b, #2 // B2 - pmull v19.8h, v7.8b, v19.8b // G = A*B2 - ext v18.8b, v7.8b, v7.8b, #3 // A3 - eor v16.16b, v16.16b, v1.16b // L = E + F - pmull v18.8h, v18.8b, v3.8b // J = A3*B - ext v1.8b, v3.8b, v3.8b, #3 // B3 - eor v17.16b, v17.16b, v19.16b // M = G + H - pmull v1.8h, v7.8b, v1.8b // I = A*B3 - - // Here we diverge from the 32-bit version. 
It computes the following - // (instructions reordered for clarity): - // - // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) - // vand $t0#hi, $t0#hi, $k48 - // veor $t0#lo, $t0#lo, $t0#hi - // - // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) - // vand $t1#hi, $t1#hi, $k32 - // veor $t1#lo, $t1#lo, $t1#hi - // - // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) - // vand $t2#hi, $t2#hi, $k16 - // veor $t2#lo, $t2#lo, $t2#hi - // - // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) - // vmov.i64 $t3#hi, #0 - // - // $kN is a mask with the bottom N bits set. AArch64 cannot compute on - // upper halves of SIMD registers, so we must split each half into - // separate registers. To compensate, we pair computations up and - // parallelize. - - ext v19.8b, v3.8b, v3.8b, #4 // B4 - eor v18.16b, v18.16b, v1.16b // N = I + J - pmull v19.8h, v7.8b, v19.8b // K = A*B4 - - // This can probably be scheduled more efficiently. For now, we just - // pair up independent instructions. - zip1 v20.2d, v16.2d, v17.2d - zip1 v22.2d, v18.2d, v19.2d - zip2 v21.2d, v16.2d, v17.2d - zip2 v23.2d, v18.2d, v19.2d - eor v20.16b, v20.16b, v21.16b - eor v22.16b, v22.16b, v23.16b - and v21.16b, v21.16b, v24.16b - and v23.16b, v23.16b, v25.16b - eor v20.16b, v20.16b, v21.16b - eor v22.16b, v22.16b, v23.16b - zip1 v16.2d, v20.2d, v21.2d - zip1 v18.2d, v22.2d, v23.2d - zip2 v17.2d, v20.2d, v21.2d - zip2 v19.2d, v22.2d, v23.2d - - ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 - ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 - pmull v1.8h, v7.8b, v3.8b // D = A*B - ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 - ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 - eor v16.16b, v16.16b, v17.16b - eor v18.16b, v18.16b, v19.16b - eor v1.16b, v1.16b, v16.16b - eor v1.16b, v1.16b, v18.16b - ext v16.8b, v6.8b, v6.8b, #1 // A1 - pmull v16.8h, v16.8b, v4.8b // F = A1*B - ext v2.8b, v4.8b, v4.8b, #1 // B1 - pmull v2.8h, v6.8b, v2.8b // E = A*B1 - ext v17.8b, v6.8b, v6.8b, #2 // A2 - pmull 
v17.8h, v17.8b, v4.8b // H = A2*B - ext v19.8b, v4.8b, v4.8b, #2 // B2 - pmull v19.8h, v6.8b, v19.8b // G = A*B2 - ext v18.8b, v6.8b, v6.8b, #3 // A3 - eor v16.16b, v16.16b, v2.16b // L = E + F - pmull v18.8h, v18.8b, v4.8b // J = A3*B - ext v2.8b, v4.8b, v4.8b, #3 // B3 - eor v17.16b, v17.16b, v19.16b // M = G + H - pmull v2.8h, v6.8b, v2.8b // I = A*B3 - - // Here we diverge from the 32-bit version. It computes the following - // (instructions reordered for clarity): - // - // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) - // vand $t0#hi, $t0#hi, $k48 - // veor $t0#lo, $t0#lo, $t0#hi - // - // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) - // vand $t1#hi, $t1#hi, $k32 - // veor $t1#lo, $t1#lo, $t1#hi - // - // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) - // vand $t2#hi, $t2#hi, $k16 - // veor $t2#lo, $t2#lo, $t2#hi - // - // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) - // vmov.i64 $t3#hi, #0 - // - // $kN is a mask with the bottom N bits set. AArch64 cannot compute on - // upper halves of SIMD registers, so we must split each half into - // separate registers. To compensate, we pair computations up and - // parallelize. - - ext v19.8b, v4.8b, v4.8b, #4 // B4 - eor v18.16b, v18.16b, v2.16b // N = I + J - pmull v19.8h, v6.8b, v19.8b // K = A*B4 - - // This can probably be scheduled more efficiently. For now, we just - // pair up independent instructions. 
- zip1 v20.2d, v16.2d, v17.2d - zip1 v22.2d, v18.2d, v19.2d - zip2 v21.2d, v16.2d, v17.2d - zip2 v23.2d, v18.2d, v19.2d - eor v20.16b, v20.16b, v21.16b - eor v22.16b, v22.16b, v23.16b - and v21.16b, v21.16b, v24.16b - and v23.16b, v23.16b, v25.16b - eor v20.16b, v20.16b, v21.16b - eor v22.16b, v22.16b, v23.16b - zip1 v16.2d, v20.2d, v21.2d - zip1 v18.2d, v22.2d, v23.2d - zip2 v17.2d, v20.2d, v21.2d - zip2 v19.2d, v22.2d, v23.2d - - ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 - ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 - pmull v2.8h, v6.8b, v4.8b // D = A*B - ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 - ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 - eor v16.16b, v16.16b, v17.16b - eor v18.16b, v18.16b, v19.16b - eor v2.16b, v2.16b, v16.16b - eor v2.16b, v2.16b, v18.16b - ext v16.16b, v0.16b, v2.16b, #8 - eor v1.16b, v1.16b, v0.16b // Karatsuba post-processing - eor v1.16b, v1.16b, v2.16b - eor v1.16b, v1.16b, v16.16b // Xm overlaps Xh.lo and Xl.hi - ins v0.d[1], v1.d[0] // Xh|Xl - 256-bit result - // This is a no-op due to the ins instruction below. - // ins v2.d[0], v1.d[1] - - // equivalent of reduction_avx from ghash-x86_64.pl - shl v17.2d, v0.2d, #57 // 1st phase - shl v18.2d, v0.2d, #62 - eor v18.16b, v18.16b, v17.16b // - shl v17.2d, v0.2d, #63 - eor v18.16b, v18.16b, v17.16b // - // Note Xm contains {Xl.d[1], Xh.d[0]}. 
- eor v18.16b, v18.16b, v1.16b - ins v0.d[1], v18.d[0] // Xl.d[1] ^= t2.d[0] - ins v2.d[0], v18.d[1] // Xh.d[0] ^= t2.d[1] - - ushr v18.2d, v0.2d, #1 // 2nd phase - eor v2.16b, v2.16b,v0.16b - eor v0.16b, v0.16b,v18.16b // - ushr v18.2d, v18.2d, #6 - ushr v0.2d, v0.2d, #1 // - eor v0.16b, v0.16b, v2.16b // - eor v0.16b, v0.16b, v18.16b // - - subs x3, x3, #16 - bne Loop_neon - - rev64 v0.16b, v0.16b // byteswap Xi and write - ext v0.16b, v0.16b, v0.16b, #8 - st1 {v0.16b}, [x0] - - ret - - -.section __TEXT,__const -.align 4 -Lmasks: -.quad 0x0000ffffffffffff // k48 -.quad 0x00000000ffffffff // k32 -.quad 0x000000000000ffff // k16 -.quad 0x0000000000000000 // k0 -.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 -.align 2 -#endif // !OPENSSL_NO_ASM diff --git a/third_party/boringssl/apple-aarch64/crypto/fipsmodule/ghashv8-armx64.S b/third_party/boringssl/apple-aarch64/crypto/fipsmodule/ghashv8-armx64.S deleted file mode 100644 index 0ba0cdd1..00000000 --- a/third_party/boringssl/apple-aarch64/crypto/fipsmodule/ghashv8-armx64.S +++ /dev/null @@ -1,573 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -#include - -#if __ARM_MAX_ARCH__>=7 -.text - -.globl _gcm_init_v8 -.private_extern _gcm_init_v8 - -.align 4 -_gcm_init_v8: - AARCH64_VALID_CALL_TARGET - ld1 {v17.2d},[x1] //load input H - movi v19.16b,#0xe1 - shl v19.2d,v19.2d,#57 //0xc2.0 - ext v3.16b,v17.16b,v17.16b,#8 - ushr v18.2d,v19.2d,#63 - dup v17.4s,v17.s[1] - ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01 - ushr v18.2d,v3.2d,#63 - sshr v17.4s,v17.4s,#31 //broadcast carry bit - and v18.16b,v18.16b,v16.16b - shl v3.2d,v3.2d,#1 - ext v18.16b,v18.16b,v18.16b,#8 - and v16.16b,v16.16b,v17.16b - orr v3.16b,v3.16b,v18.16b //H<<<=1 - eor v20.16b,v3.16b,v16.16b //twisted H - st1 {v20.2d},[x0],#16 //store Htable[0] - - //calculate H^2 - ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing - pmull v0.1q,v20.1d,v20.1d - eor v16.16b,v16.16b,v20.16b - pmull2 v2.1q,v20.2d,v20.2d - pmull v1.1q,v16.1d,v16.1d - - ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing - eor v18.16b,v0.16b,v2.16b - eor v1.16b,v1.16b,v17.16b - eor v1.16b,v1.16b,v18.16b - pmull v18.1q,v0.1d,v19.1d //1st phase - - ins v2.d[0],v1.d[1] - ins v1.d[1],v0.d[0] - eor v0.16b,v1.16b,v18.16b - - ext v18.16b,v0.16b,v0.16b,#8 //2nd phase - pmull v0.1q,v0.1d,v19.1d - eor v18.16b,v18.16b,v2.16b - eor v22.16b,v0.16b,v18.16b - - ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing - eor v17.16b,v17.16b,v22.16b - ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed - st1 {v21.2d,v22.2d},[x0],#32 //store Htable[1..2] - //calculate H^3 and H^4 - pmull v0.1q,v20.1d, v22.1d - pmull v5.1q,v22.1d,v22.1d - pmull2 v2.1q,v20.2d, v22.2d - pmull2 v7.1q,v22.2d,v22.2d - pmull v1.1q,v16.1d,v17.1d - pmull v6.1q,v17.1d,v17.1d - - ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing - ext 
v17.16b,v5.16b,v7.16b,#8 - eor v18.16b,v0.16b,v2.16b - eor v1.16b,v1.16b,v16.16b - eor v4.16b,v5.16b,v7.16b - eor v6.16b,v6.16b,v17.16b - eor v1.16b,v1.16b,v18.16b - pmull v18.1q,v0.1d,v19.1d //1st phase - eor v6.16b,v6.16b,v4.16b - pmull v4.1q,v5.1d,v19.1d - - ins v2.d[0],v1.d[1] - ins v7.d[0],v6.d[1] - ins v1.d[1],v0.d[0] - ins v6.d[1],v5.d[0] - eor v0.16b,v1.16b,v18.16b - eor v5.16b,v6.16b,v4.16b - - ext v18.16b,v0.16b,v0.16b,#8 //2nd phase - ext v4.16b,v5.16b,v5.16b,#8 - pmull v0.1q,v0.1d,v19.1d - pmull v5.1q,v5.1d,v19.1d - eor v18.16b,v18.16b,v2.16b - eor v4.16b,v4.16b,v7.16b - eor v20.16b, v0.16b,v18.16b //H^3 - eor v22.16b,v5.16b,v4.16b //H^4 - - ext v16.16b,v20.16b, v20.16b,#8 //Karatsuba pre-processing - ext v17.16b,v22.16b,v22.16b,#8 - eor v16.16b,v16.16b,v20.16b - eor v17.16b,v17.16b,v22.16b - ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed - st1 {v20.2d,v21.2d,v22.2d},[x0] //store Htable[3..5] - ret - -.globl _gcm_gmult_v8 -.private_extern _gcm_gmult_v8 - -.align 4 -_gcm_gmult_v8: - AARCH64_VALID_CALL_TARGET - ld1 {v17.2d},[x0] //load Xi - movi v19.16b,#0xe1 - ld1 {v20.2d,v21.2d},[x1] //load twisted H, ... 
- shl v19.2d,v19.2d,#57 -#ifndef __AARCH64EB__ - rev64 v17.16b,v17.16b -#endif - ext v3.16b,v17.16b,v17.16b,#8 - - pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo - eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing - pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi - pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) - - ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing - eor v18.16b,v0.16b,v2.16b - eor v1.16b,v1.16b,v17.16b - eor v1.16b,v1.16b,v18.16b - pmull v18.1q,v0.1d,v19.1d //1st phase of reduction - - ins v2.d[0],v1.d[1] - ins v1.d[1],v0.d[0] - eor v0.16b,v1.16b,v18.16b - - ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction - pmull v0.1q,v0.1d,v19.1d - eor v18.16b,v18.16b,v2.16b - eor v0.16b,v0.16b,v18.16b - -#ifndef __AARCH64EB__ - rev64 v0.16b,v0.16b -#endif - ext v0.16b,v0.16b,v0.16b,#8 - st1 {v0.2d},[x0] //write out Xi - - ret - -.globl _gcm_ghash_v8 -.private_extern _gcm_ghash_v8 - -.align 4 -_gcm_ghash_v8: - AARCH64_VALID_CALL_TARGET - cmp x3,#64 - b.hs Lgcm_ghash_v8_4x - ld1 {v0.2d},[x0] //load [rotated] Xi - //"[rotated]" means that - //loaded value would have - //to be rotated in order to - //make it appear as in - //algorithm specification - subs x3,x3,#32 //see if x3 is 32 or larger - mov x12,#16 //x12 is used as post- - //increment for input pointer; - //as loop is modulo-scheduled - //x12 is zeroed just in time - //to preclude overstepping - //inp[len], which means that - //last block[s] are actually - //loaded twice, but last - //copy is not processed - ld1 {v20.2d,v21.2d},[x1],#32 //load twisted H, ..., H^2 - movi v19.16b,#0xe1 - ld1 {v22.2d},[x1] - csel x12,xzr,x12,eq //is it time to zero x12? 
- ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi - ld1 {v16.2d},[x2],#16 //load [rotated] I[0] - shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant -#ifndef __AARCH64EB__ - rev64 v16.16b,v16.16b - rev64 v0.16b,v0.16b -#endif - ext v3.16b,v16.16b,v16.16b,#8 //rotate I[0] - b.lo Lodd_tail_v8 //x3 was less than 32 - ld1 {v17.2d},[x2],x12 //load [rotated] I[1] -#ifndef __AARCH64EB__ - rev64 v17.16b,v17.16b -#endif - ext v7.16b,v17.16b,v17.16b,#8 - eor v3.16b,v3.16b,v0.16b //I[i]^=Xi - pmull v4.1q,v20.1d,v7.1d //H·Ii+1 - eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing - pmull2 v6.1q,v20.2d,v7.2d - b Loop_mod2x_v8 - -.align 4 -Loop_mod2x_v8: - ext v18.16b,v3.16b,v3.16b,#8 - subs x3,x3,#32 //is there more data? - pmull v0.1q,v22.1d,v3.1d //H^2.lo·Xi.lo - csel x12,xzr,x12,lo //is it time to zero x12? - - pmull v5.1q,v21.1d,v17.1d - eor v18.16b,v18.16b,v3.16b //Karatsuba pre-processing - pmull2 v2.1q,v22.2d,v3.2d //H^2.hi·Xi.hi - eor v0.16b,v0.16b,v4.16b //accumulate - pmull2 v1.1q,v21.2d,v18.2d //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi) - ld1 {v16.2d},[x2],x12 //load [rotated] I[i+2] - - eor v2.16b,v2.16b,v6.16b - csel x12,xzr,x12,eq //is it time to zero x12? 
- eor v1.16b,v1.16b,v5.16b - - ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing - eor v18.16b,v0.16b,v2.16b - eor v1.16b,v1.16b,v17.16b - ld1 {v17.2d},[x2],x12 //load [rotated] I[i+3] -#ifndef __AARCH64EB__ - rev64 v16.16b,v16.16b -#endif - eor v1.16b,v1.16b,v18.16b - pmull v18.1q,v0.1d,v19.1d //1st phase of reduction - -#ifndef __AARCH64EB__ - rev64 v17.16b,v17.16b -#endif - ins v2.d[0],v1.d[1] - ins v1.d[1],v0.d[0] - ext v7.16b,v17.16b,v17.16b,#8 - ext v3.16b,v16.16b,v16.16b,#8 - eor v0.16b,v1.16b,v18.16b - pmull v4.1q,v20.1d,v7.1d //H·Ii+1 - eor v3.16b,v3.16b,v2.16b //accumulate v3.16b early - - ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction - pmull v0.1q,v0.1d,v19.1d - eor v3.16b,v3.16b,v18.16b - eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing - eor v3.16b,v3.16b,v0.16b - pmull2 v6.1q,v20.2d,v7.2d - b.hs Loop_mod2x_v8 //there was at least 32 more bytes - - eor v2.16b,v2.16b,v18.16b - ext v3.16b,v16.16b,v16.16b,#8 //re-construct v3.16b - adds x3,x3,#32 //re-construct x3 - eor v0.16b,v0.16b,v2.16b //re-construct v0.16b - b.eq Ldone_v8 //is x3 zero? 
-Lodd_tail_v8: - ext v18.16b,v0.16b,v0.16b,#8 - eor v3.16b,v3.16b,v0.16b //inp^=Xi - eor v17.16b,v16.16b,v18.16b //v17.16b is rotated inp^Xi - - pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo - eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing - pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi - pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) - - ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing - eor v18.16b,v0.16b,v2.16b - eor v1.16b,v1.16b,v17.16b - eor v1.16b,v1.16b,v18.16b - pmull v18.1q,v0.1d,v19.1d //1st phase of reduction - - ins v2.d[0],v1.d[1] - ins v1.d[1],v0.d[0] - eor v0.16b,v1.16b,v18.16b - - ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction - pmull v0.1q,v0.1d,v19.1d - eor v18.16b,v18.16b,v2.16b - eor v0.16b,v0.16b,v18.16b - -Ldone_v8: -#ifndef __AARCH64EB__ - rev64 v0.16b,v0.16b -#endif - ext v0.16b,v0.16b,v0.16b,#8 - st1 {v0.2d},[x0] //write out Xi - - ret - - -.align 4 -gcm_ghash_v8_4x: -Lgcm_ghash_v8_4x: - ld1 {v0.2d},[x0] //load [rotated] Xi - ld1 {v20.2d,v21.2d,v22.2d},[x1],#48 //load twisted H, ..., H^2 - movi v19.16b,#0xe1 - ld1 {v26.2d,v27.2d,v28.2d},[x1] //load twisted H^3, ..., H^4 - shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant - - ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64 -#ifndef __AARCH64EB__ - rev64 v0.16b,v0.16b - rev64 v5.16b,v5.16b - rev64 v6.16b,v6.16b - rev64 v7.16b,v7.16b - rev64 v4.16b,v4.16b -#endif - ext v25.16b,v7.16b,v7.16b,#8 - ext v24.16b,v6.16b,v6.16b,#8 - ext v23.16b,v5.16b,v5.16b,#8 - - pmull v29.1q,v20.1d,v25.1d //H·Ii+3 - eor v7.16b,v7.16b,v25.16b - pmull2 v31.1q,v20.2d,v25.2d - pmull v30.1q,v21.1d,v7.1d - - pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2 - eor v6.16b,v6.16b,v24.16b - pmull2 v24.1q,v22.2d,v24.2d - pmull2 v6.1q,v21.2d,v6.2d - - eor v29.16b,v29.16b,v16.16b - eor v31.16b,v31.16b,v24.16b - eor v30.16b,v30.16b,v6.16b - - pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1 - eor v5.16b,v5.16b,v23.16b - pmull2 v23.1q,v26.2d,v23.2d - pmull v5.1q,v27.1d,v5.1d - - eor v29.16b,v29.16b,v7.16b - eor v31.16b,v31.16b,v23.16b - eor 
v30.16b,v30.16b,v5.16b - - subs x3,x3,#128 - b.lo Ltail4x - - b Loop4x - -.align 4 -Loop4x: - eor v16.16b,v4.16b,v0.16b - ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64 - ext v3.16b,v16.16b,v16.16b,#8 -#ifndef __AARCH64EB__ - rev64 v5.16b,v5.16b - rev64 v6.16b,v6.16b - rev64 v7.16b,v7.16b - rev64 v4.16b,v4.16b -#endif - - pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii) - eor v16.16b,v16.16b,v3.16b - pmull2 v2.1q,v28.2d,v3.2d - ext v25.16b,v7.16b,v7.16b,#8 - pmull2 v1.1q,v27.2d,v16.2d - - eor v0.16b,v0.16b,v29.16b - eor v2.16b,v2.16b,v31.16b - ext v24.16b,v6.16b,v6.16b,#8 - eor v1.16b,v1.16b,v30.16b - ext v23.16b,v5.16b,v5.16b,#8 - - ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing - eor v18.16b,v0.16b,v2.16b - pmull v29.1q,v20.1d,v25.1d //H·Ii+3 - eor v7.16b,v7.16b,v25.16b - eor v1.16b,v1.16b,v17.16b - pmull2 v31.1q,v20.2d,v25.2d - eor v1.16b,v1.16b,v18.16b - pmull v30.1q,v21.1d,v7.1d - - pmull v18.1q,v0.1d,v19.1d //1st phase of reduction - ins v2.d[0],v1.d[1] - ins v1.d[1],v0.d[0] - pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2 - eor v6.16b,v6.16b,v24.16b - pmull2 v24.1q,v22.2d,v24.2d - eor v0.16b,v1.16b,v18.16b - pmull2 v6.1q,v21.2d,v6.2d - - eor v29.16b,v29.16b,v16.16b - eor v31.16b,v31.16b,v24.16b - eor v30.16b,v30.16b,v6.16b - - ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction - pmull v0.1q,v0.1d,v19.1d - pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1 - eor v5.16b,v5.16b,v23.16b - eor v18.16b,v18.16b,v2.16b - pmull2 v23.1q,v26.2d,v23.2d - pmull v5.1q,v27.1d,v5.1d - - eor v0.16b,v0.16b,v18.16b - eor v29.16b,v29.16b,v7.16b - eor v31.16b,v31.16b,v23.16b - ext v0.16b,v0.16b,v0.16b,#8 - eor v30.16b,v30.16b,v5.16b - - subs x3,x3,#64 - b.hs Loop4x - -Ltail4x: - eor v16.16b,v4.16b,v0.16b - ext v3.16b,v16.16b,v16.16b,#8 - - pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii) - eor v16.16b,v16.16b,v3.16b - pmull2 v2.1q,v28.2d,v3.2d - pmull2 v1.1q,v27.2d,v16.2d - - eor v0.16b,v0.16b,v29.16b - eor v2.16b,v2.16b,v31.16b - eor v1.16b,v1.16b,v30.16b - - adds x3,x3,#64 - b.eq Ldone4x - - cmp x3,#32 - b.lo Lone 
- b.eq Ltwo -Lthree: - ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing - eor v18.16b,v0.16b,v2.16b - eor v1.16b,v1.16b,v17.16b - ld1 {v4.2d,v5.2d,v6.2d},[x2] - eor v1.16b,v1.16b,v18.16b -#ifndef __AARCH64EB__ - rev64 v5.16b,v5.16b - rev64 v6.16b,v6.16b - rev64 v4.16b,v4.16b -#endif - - pmull v18.1q,v0.1d,v19.1d //1st phase of reduction - ins v2.d[0],v1.d[1] - ins v1.d[1],v0.d[0] - ext v24.16b,v6.16b,v6.16b,#8 - ext v23.16b,v5.16b,v5.16b,#8 - eor v0.16b,v1.16b,v18.16b - - pmull v29.1q,v20.1d,v24.1d //H·Ii+2 - eor v6.16b,v6.16b,v24.16b - - ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction - pmull v0.1q,v0.1d,v19.1d - eor v18.16b,v18.16b,v2.16b - pmull2 v31.1q,v20.2d,v24.2d - pmull v30.1q,v21.1d,v6.1d - eor v0.16b,v0.16b,v18.16b - pmull v7.1q,v22.1d,v23.1d //H^2·Ii+1 - eor v5.16b,v5.16b,v23.16b - ext v0.16b,v0.16b,v0.16b,#8 - - pmull2 v23.1q,v22.2d,v23.2d - eor v16.16b,v4.16b,v0.16b - pmull2 v5.1q,v21.2d,v5.2d - ext v3.16b,v16.16b,v16.16b,#8 - - eor v29.16b,v29.16b,v7.16b - eor v31.16b,v31.16b,v23.16b - eor v30.16b,v30.16b,v5.16b - - pmull v0.1q,v26.1d,v3.1d //H^3·(Xi+Ii) - eor v16.16b,v16.16b,v3.16b - pmull2 v2.1q,v26.2d,v3.2d - pmull v1.1q,v27.1d,v16.1d - - eor v0.16b,v0.16b,v29.16b - eor v2.16b,v2.16b,v31.16b - eor v1.16b,v1.16b,v30.16b - b Ldone4x - -.align 4 -Ltwo: - ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing - eor v18.16b,v0.16b,v2.16b - eor v1.16b,v1.16b,v17.16b - ld1 {v4.2d,v5.2d},[x2] - eor v1.16b,v1.16b,v18.16b -#ifndef __AARCH64EB__ - rev64 v5.16b,v5.16b - rev64 v4.16b,v4.16b -#endif - - pmull v18.1q,v0.1d,v19.1d //1st phase of reduction - ins v2.d[0],v1.d[1] - ins v1.d[1],v0.d[0] - ext v23.16b,v5.16b,v5.16b,#8 - eor v0.16b,v1.16b,v18.16b - - ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction - pmull v0.1q,v0.1d,v19.1d - eor v18.16b,v18.16b,v2.16b - eor v0.16b,v0.16b,v18.16b - ext v0.16b,v0.16b,v0.16b,#8 - - pmull v29.1q,v20.1d,v23.1d //H·Ii+1 - eor v5.16b,v5.16b,v23.16b - - eor v16.16b,v4.16b,v0.16b - ext 
v3.16b,v16.16b,v16.16b,#8 - - pmull2 v31.1q,v20.2d,v23.2d - pmull v30.1q,v21.1d,v5.1d - - pmull v0.1q,v22.1d,v3.1d //H^2·(Xi+Ii) - eor v16.16b,v16.16b,v3.16b - pmull2 v2.1q,v22.2d,v3.2d - pmull2 v1.1q,v21.2d,v16.2d - - eor v0.16b,v0.16b,v29.16b - eor v2.16b,v2.16b,v31.16b - eor v1.16b,v1.16b,v30.16b - b Ldone4x - -.align 4 -Lone: - ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing - eor v18.16b,v0.16b,v2.16b - eor v1.16b,v1.16b,v17.16b - ld1 {v4.2d},[x2] - eor v1.16b,v1.16b,v18.16b -#ifndef __AARCH64EB__ - rev64 v4.16b,v4.16b -#endif - - pmull v18.1q,v0.1d,v19.1d //1st phase of reduction - ins v2.d[0],v1.d[1] - ins v1.d[1],v0.d[0] - eor v0.16b,v1.16b,v18.16b - - ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction - pmull v0.1q,v0.1d,v19.1d - eor v18.16b,v18.16b,v2.16b - eor v0.16b,v0.16b,v18.16b - ext v0.16b,v0.16b,v0.16b,#8 - - eor v16.16b,v4.16b,v0.16b - ext v3.16b,v16.16b,v16.16b,#8 - - pmull v0.1q,v20.1d,v3.1d - eor v16.16b,v16.16b,v3.16b - pmull2 v2.1q,v20.2d,v3.2d - pmull v1.1q,v21.1d,v16.1d - -Ldone4x: - ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing - eor v18.16b,v0.16b,v2.16b - eor v1.16b,v1.16b,v17.16b - eor v1.16b,v1.16b,v18.16b - - pmull v18.1q,v0.1d,v19.1d //1st phase of reduction - ins v2.d[0],v1.d[1] - ins v1.d[1],v0.d[0] - eor v0.16b,v1.16b,v18.16b - - ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction - pmull v0.1q,v0.1d,v19.1d - eor v18.16b,v18.16b,v2.16b - eor v0.16b,v0.16b,v18.16b - ext v0.16b,v0.16b,v0.16b,#8 - -#ifndef __AARCH64EB__ - rev64 v0.16b,v0.16b -#endif - st1 {v0.2d},[x0] //write out Xi - - ret - -.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 -.align 2 -#endif -#endif // !OPENSSL_NO_ASM diff --git a/third_party/boringssl/apple-aarch64/crypto/fipsmodule/p256-armv8-asm.S b/third_party/boringssl/apple-aarch64/crypto/fipsmodule/p256-armv8-asm.S deleted file mode 100644 index 
7a5202dd..00000000 --- a/third_party/boringssl/apple-aarch64/crypto/fipsmodule/p256-armv8-asm.S +++ /dev/null @@ -1,1710 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -#include "openssl/arm_arch.h" - -.text -.align 5 -Lpoly: -.quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001 -LRR: // 2^512 mod P precomputed for NIST P256 polynomial -.quad 0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd -Lone_mont: -.quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe -Lone: -.quad 1,0,0,0 -Lord: -.quad 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000 -LordK: -.quad 0xccd1c8aaee00bc4f -.byte 69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 - -// void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4], -// const BN_ULONG x2[4]); -.globl _ecp_nistz256_mul_mont -.private_extern _ecp_nistz256_mul_mont - -.align 4 -_ecp_nistz256_mul_mont: - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-32]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - - ldr x3,[x2] // bp[0] - ldp x4,x5,[x1] - ldp x6,x7,[x1,#16] - ldr x12,Lpoly+8 - ldr x13,Lpoly+24 - - bl __ecp_nistz256_mul_mont - - ldp x19,x20,[sp,#16] - ldp x29,x30,[sp],#32 - AARCH64_VALIDATE_LINK_REGISTER - ret - - -// void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); -.globl _ecp_nistz256_sqr_mont -.private_extern _ecp_nistz256_sqr_mont - -.align 4 -_ecp_nistz256_sqr_mont: - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-32]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - - ldp x4,x5,[x1] - ldp x6,x7,[x1,#16] - ldr x12,Lpoly+8 - ldr x13,Lpoly+24 - - bl __ecp_nistz256_sqr_mont - - ldp x19,x20,[sp,#16] - ldp x29,x30,[sp],#32 - AARCH64_VALIDATE_LINK_REGISTER - ret - - -// void ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]); -.globl _ecp_nistz256_div_by_2 -.private_extern _ecp_nistz256_div_by_2 - -.align 4 -_ecp_nistz256_div_by_2: - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - - ldp x14,x15,[x1] - ldp x16,x17,[x1,#16] - ldr x12,Lpoly+8 - ldr x13,Lpoly+24 - - bl __ecp_nistz256_div_by_2 - - ldp x29,x30,[sp],#16 - AARCH64_VALIDATE_LINK_REGISTER - ret - - -// void ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]); -.globl _ecp_nistz256_mul_by_2 -.private_extern _ecp_nistz256_mul_by_2 - -.align 4 -_ecp_nistz256_mul_by_2: - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - - ldp x14,x15,[x1] - ldp x16,x17,[x1,#16] - ldr x12,Lpoly+8 - ldr x13,Lpoly+24 - mov x8,x14 - mov x9,x15 - mov x10,x16 - mov x11,x17 - - bl __ecp_nistz256_add_to // ret = a+a // 2*a - - ldp x29,x30,[sp],#16 - AARCH64_VALIDATE_LINK_REGISTER - ret - - -// void ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]); -.globl _ecp_nistz256_mul_by_3 -.private_extern _ecp_nistz256_mul_by_3 - -.align 4 -_ecp_nistz256_mul_by_3: - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! 
- add x29,sp,#0 - - ldp x14,x15,[x1] - ldp x16,x17,[x1,#16] - ldr x12,Lpoly+8 - ldr x13,Lpoly+24 - mov x8,x14 - mov x9,x15 - mov x10,x16 - mov x11,x17 - mov x4,x14 - mov x5,x15 - mov x6,x16 - mov x7,x17 - - bl __ecp_nistz256_add_to // ret = a+a // 2*a - - mov x8,x4 - mov x9,x5 - mov x10,x6 - mov x11,x7 - - bl __ecp_nistz256_add_to // ret += a // 2*a+a=3*a - - ldp x29,x30,[sp],#16 - AARCH64_VALIDATE_LINK_REGISTER - ret - - -// void ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4], -// const BN_ULONG x2[4]); -.globl _ecp_nistz256_sub -.private_extern _ecp_nistz256_sub - -.align 4 -_ecp_nistz256_sub: - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - - ldp x14,x15,[x1] - ldp x16,x17,[x1,#16] - ldr x12,Lpoly+8 - ldr x13,Lpoly+24 - - bl __ecp_nistz256_sub_from - - ldp x29,x30,[sp],#16 - AARCH64_VALIDATE_LINK_REGISTER - ret - - -// void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]); -.globl _ecp_nistz256_neg -.private_extern _ecp_nistz256_neg - -.align 4 -_ecp_nistz256_neg: - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! 
- add x29,sp,#0 - - mov x2,x1 - mov x14,xzr // a = 0 - mov x15,xzr - mov x16,xzr - mov x17,xzr - ldr x12,Lpoly+8 - ldr x13,Lpoly+24 - - bl __ecp_nistz256_sub_from - - ldp x29,x30,[sp],#16 - AARCH64_VALIDATE_LINK_REGISTER - ret - - -// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded -// to x4-x7 and b[0] - to x3 - -.align 4 -__ecp_nistz256_mul_mont: - mul x14,x4,x3 // a[0]*b[0] - umulh x8,x4,x3 - - mul x15,x5,x3 // a[1]*b[0] - umulh x9,x5,x3 - - mul x16,x6,x3 // a[2]*b[0] - umulh x10,x6,x3 - - mul x17,x7,x3 // a[3]*b[0] - umulh x11,x7,x3 - ldr x3,[x2,#8] // b[1] - - adds x15,x15,x8 // accumulate high parts of multiplication - lsl x8,x14,#32 - adcs x16,x16,x9 - lsr x9,x14,#32 - adcs x17,x17,x10 - adc x19,xzr,x11 - mov x20,xzr - subs x10,x14,x8 // "*0xffff0001" - sbc x11,x14,x9 - adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] - mul x8,x4,x3 // lo(a[0]*b[i]) - adcs x15,x16,x9 - mul x9,x5,x3 // lo(a[1]*b[i]) - adcs x16,x17,x10 // +=acc[0]*0xffff0001 - mul x10,x6,x3 // lo(a[2]*b[i]) - adcs x17,x19,x11 - mul x11,x7,x3 // lo(a[3]*b[i]) - adc x19,x20,xzr - - adds x14,x14,x8 // accumulate low parts of multiplication - umulh x8,x4,x3 // hi(a[0]*b[i]) - adcs x15,x15,x9 - umulh x9,x5,x3 // hi(a[1]*b[i]) - adcs x16,x16,x10 - umulh x10,x6,x3 // hi(a[2]*b[i]) - adcs x17,x17,x11 - umulh x11,x7,x3 // hi(a[3]*b[i]) - adc x19,x19,xzr - ldr x3,[x2,#8*(1+1)] // b[1+1] - adds x15,x15,x8 // accumulate high parts of multiplication - lsl x8,x14,#32 - adcs x16,x16,x9 - lsr x9,x14,#32 - adcs x17,x17,x10 - adcs x19,x19,x11 - adc x20,xzr,xzr - subs x10,x14,x8 // "*0xffff0001" - sbc x11,x14,x9 - adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] - mul x8,x4,x3 // lo(a[0]*b[i]) - adcs x15,x16,x9 - mul x9,x5,x3 // lo(a[1]*b[i]) - adcs x16,x17,x10 // +=acc[0]*0xffff0001 - mul x10,x6,x3 // lo(a[2]*b[i]) - adcs x17,x19,x11 - mul x11,x7,x3 // lo(a[3]*b[i]) - adc x19,x20,xzr - - adds x14,x14,x8 // accumulate low parts of multiplication - umulh x8,x4,x3 // hi(a[0]*b[i]) - adcs x15,x15,x9 
- umulh x9,x5,x3 // hi(a[1]*b[i]) - adcs x16,x16,x10 - umulh x10,x6,x3 // hi(a[2]*b[i]) - adcs x17,x17,x11 - umulh x11,x7,x3 // hi(a[3]*b[i]) - adc x19,x19,xzr - ldr x3,[x2,#8*(2+1)] // b[2+1] - adds x15,x15,x8 // accumulate high parts of multiplication - lsl x8,x14,#32 - adcs x16,x16,x9 - lsr x9,x14,#32 - adcs x17,x17,x10 - adcs x19,x19,x11 - adc x20,xzr,xzr - subs x10,x14,x8 // "*0xffff0001" - sbc x11,x14,x9 - adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] - mul x8,x4,x3 // lo(a[0]*b[i]) - adcs x15,x16,x9 - mul x9,x5,x3 // lo(a[1]*b[i]) - adcs x16,x17,x10 // +=acc[0]*0xffff0001 - mul x10,x6,x3 // lo(a[2]*b[i]) - adcs x17,x19,x11 - mul x11,x7,x3 // lo(a[3]*b[i]) - adc x19,x20,xzr - - adds x14,x14,x8 // accumulate low parts of multiplication - umulh x8,x4,x3 // hi(a[0]*b[i]) - adcs x15,x15,x9 - umulh x9,x5,x3 // hi(a[1]*b[i]) - adcs x16,x16,x10 - umulh x10,x6,x3 // hi(a[2]*b[i]) - adcs x17,x17,x11 - umulh x11,x7,x3 // hi(a[3]*b[i]) - adc x19,x19,xzr - adds x15,x15,x8 // accumulate high parts of multiplication - lsl x8,x14,#32 - adcs x16,x16,x9 - lsr x9,x14,#32 - adcs x17,x17,x10 - adcs x19,x19,x11 - adc x20,xzr,xzr - // last reduction - subs x10,x14,x8 // "*0xffff0001" - sbc x11,x14,x9 - adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] - adcs x15,x16,x9 - adcs x16,x17,x10 // +=acc[0]*0xffff0001 - adcs x17,x19,x11 - adc x19,x20,xzr - - adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus - sbcs x9,x15,x12 - sbcs x10,x16,xzr - sbcs x11,x17,x13 - sbcs xzr,x19,xzr // did it borrow? - - csel x14,x14,x8,lo // ret = borrow ? 
ret : ret-modulus - csel x15,x15,x9,lo - csel x16,x16,x10,lo - stp x14,x15,[x0] - csel x17,x17,x11,lo - stp x16,x17,[x0,#16] - - ret - - -// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded -// to x4-x7 - -.align 4 -__ecp_nistz256_sqr_mont: - // | | | | | |a1*a0| | - // | | | | |a2*a0| | | - // | |a3*a2|a3*a0| | | | - // | | | |a2*a1| | | | - // | | |a3*a1| | | | | - // *| | | | | | | | 2| - // +|a3*a3|a2*a2|a1*a1|a0*a0| - // |--+--+--+--+--+--+--+--| - // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is , i.e. follow - // - // "can't overflow" below mark carrying into high part of - // multiplication result, which can't overflow, because it - // can never be all ones. - - mul x15,x5,x4 // a[1]*a[0] - umulh x9,x5,x4 - mul x16,x6,x4 // a[2]*a[0] - umulh x10,x6,x4 - mul x17,x7,x4 // a[3]*a[0] - umulh x19,x7,x4 - - adds x16,x16,x9 // accumulate high parts of multiplication - mul x8,x6,x5 // a[2]*a[1] - umulh x9,x6,x5 - adcs x17,x17,x10 - mul x10,x7,x5 // a[3]*a[1] - umulh x11,x7,x5 - adc x19,x19,xzr // can't overflow - - mul x20,x7,x6 // a[3]*a[2] - umulh x1,x7,x6 - - adds x9,x9,x10 // accumulate high parts of multiplication - mul x14,x4,x4 // a[0]*a[0] - adc x10,x11,xzr // can't overflow - - adds x17,x17,x8 // accumulate low parts of multiplication - umulh x4,x4,x4 - adcs x19,x19,x9 - mul x9,x5,x5 // a[1]*a[1] - adcs x20,x20,x10 - umulh x5,x5,x5 - adc x1,x1,xzr // can't overflow - - adds x15,x15,x15 // acc[1-6]*=2 - mul x10,x6,x6 // a[2]*a[2] - adcs x16,x16,x16 - umulh x6,x6,x6 - adcs x17,x17,x17 - mul x11,x7,x7 // a[3]*a[3] - adcs x19,x19,x19 - umulh x7,x7,x7 - adcs x20,x20,x20 - adcs x1,x1,x1 - adc x2,xzr,xzr - - adds x15,x15,x4 // +a[i]*a[i] - adcs x16,x16,x9 - adcs x17,x17,x5 - adcs x19,x19,x10 - adcs x20,x20,x6 - lsl x8,x14,#32 - adcs x1,x1,x11 - lsr x9,x14,#32 - adc x2,x2,x7 - subs x10,x14,x8 // "*0xffff0001" - sbc x11,x14,x9 - adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] - adcs x15,x16,x9 - lsl x8,x14,#32 - adcs x16,x17,x10 // +=acc[0]*0xffff0001 - 
lsr x9,x14,#32 - adc x17,x11,xzr // can't overflow - subs x10,x14,x8 // "*0xffff0001" - sbc x11,x14,x9 - adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] - adcs x15,x16,x9 - lsl x8,x14,#32 - adcs x16,x17,x10 // +=acc[0]*0xffff0001 - lsr x9,x14,#32 - adc x17,x11,xzr // can't overflow - subs x10,x14,x8 // "*0xffff0001" - sbc x11,x14,x9 - adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] - adcs x15,x16,x9 - lsl x8,x14,#32 - adcs x16,x17,x10 // +=acc[0]*0xffff0001 - lsr x9,x14,#32 - adc x17,x11,xzr // can't overflow - subs x10,x14,x8 // "*0xffff0001" - sbc x11,x14,x9 - adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] - adcs x15,x16,x9 - adcs x16,x17,x10 // +=acc[0]*0xffff0001 - adc x17,x11,xzr // can't overflow - - adds x14,x14,x19 // accumulate upper half - adcs x15,x15,x20 - adcs x16,x16,x1 - adcs x17,x17,x2 - adc x19,xzr,xzr - - adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus - sbcs x9,x15,x12 - sbcs x10,x16,xzr - sbcs x11,x17,x13 - sbcs xzr,x19,xzr // did it borrow? - - csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus - csel x15,x15,x9,lo - csel x16,x16,x10,lo - stp x14,x15,[x0] - csel x17,x17,x11,lo - stp x16,x17,[x0,#16] - - ret - - -// Note that __ecp_nistz256_add_to expects both input vectors pre-loaded to -// x4-x7 and x8-x11. This is done because it's used in multiple -// contexts, e.g. in multiplication by 2 and 3... - -.align 4 -__ecp_nistz256_add_to: - adds x14,x14,x8 // ret = a+b - adcs x15,x15,x9 - adcs x16,x16,x10 - adcs x17,x17,x11 - adc x1,xzr,xzr // zap x1 - - adds x8,x14,#1 // subs x8,x4,#-1 // tmp = ret-modulus - sbcs x9,x15,x12 - sbcs x10,x16,xzr - sbcs x11,x17,x13 - sbcs xzr,x1,xzr // did subtraction borrow? - - csel x14,x14,x8,lo // ret = borrow ? 
ret : ret-modulus - csel x15,x15,x9,lo - csel x16,x16,x10,lo - stp x14,x15,[x0] - csel x17,x17,x11,lo - stp x16,x17,[x0,#16] - - ret - - - -.align 4 -__ecp_nistz256_sub_from: - ldp x8,x9,[x2] - ldp x10,x11,[x2,#16] - subs x14,x14,x8 // ret = a-b - sbcs x15,x15,x9 - sbcs x16,x16,x10 - sbcs x17,x17,x11 - sbc x1,xzr,xzr // zap x1 - - subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus - adcs x9,x15,x12 - adcs x10,x16,xzr - adc x11,x17,x13 - cmp x1,xzr // did subtraction borrow? - - csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret - csel x15,x15,x9,eq - csel x16,x16,x10,eq - stp x14,x15,[x0] - csel x17,x17,x11,eq - stp x16,x17,[x0,#16] - - ret - - - -.align 4 -__ecp_nistz256_sub_morf: - ldp x8,x9,[x2] - ldp x10,x11,[x2,#16] - subs x14,x8,x14 // ret = b-a - sbcs x15,x9,x15 - sbcs x16,x10,x16 - sbcs x17,x11,x17 - sbc x1,xzr,xzr // zap x1 - - subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus - adcs x9,x15,x12 - adcs x10,x16,xzr - adc x11,x17,x13 - cmp x1,xzr // did subtraction borrow? - - csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret - csel x15,x15,x9,eq - csel x16,x16,x10,eq - stp x14,x15,[x0] - csel x17,x17,x11,eq - stp x16,x17,[x0,#16] - - ret - - - -.align 4 -__ecp_nistz256_div_by_2: - subs x8,x14,#1 // adds x8,x4,#-1 // tmp = a+modulus - adcs x9,x15,x12 - adcs x10,x16,xzr - adcs x11,x17,x13 - adc x1,xzr,xzr // zap x1 - tst x14,#1 // is a even? - - csel x14,x14,x8,eq // ret = even ? a : a+modulus - csel x15,x15,x9,eq - csel x16,x16,x10,eq - csel x17,x17,x11,eq - csel x1,xzr,x1,eq - - lsr x14,x14,#1 // ret >>= 1 - orr x14,x14,x15,lsl#63 - lsr x15,x15,#1 - orr x15,x15,x16,lsl#63 - lsr x16,x16,#1 - orr x16,x16,x17,lsl#63 - lsr x17,x17,#1 - stp x14,x15,[x0] - orr x17,x17,x1,lsl#63 - stp x16,x17,[x0,#16] - - ret - -.globl _ecp_nistz256_point_double -.private_extern _ecp_nistz256_point_double - -.align 5 -_ecp_nistz256_point_double: - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-96]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - sub sp,sp,#32*4 - -Ldouble_shortcut: - ldp x14,x15,[x1,#32] - mov x21,x0 - ldp x16,x17,[x1,#48] - mov x22,x1 - ldr x12,Lpoly+8 - mov x8,x14 - ldr x13,Lpoly+24 - mov x9,x15 - ldp x4,x5,[x22,#64] // forward load for p256_sqr_mont - mov x10,x16 - mov x11,x17 - ldp x6,x7,[x22,#64+16] - add x0,sp,#0 - bl __ecp_nistz256_add_to // p256_mul_by_2(S, in_y); - - add x0,sp,#64 - bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z); - - ldp x8,x9,[x22] - ldp x10,x11,[x22,#16] - mov x4,x14 // put Zsqr aside for p256_sub - mov x5,x15 - mov x6,x16 - mov x7,x17 - add x0,sp,#32 - bl __ecp_nistz256_add_to // p256_add(M, Zsqr, in_x); - - add x2,x22,#0 - mov x14,x4 // restore Zsqr - mov x15,x5 - ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont - mov x16,x6 - mov x17,x7 - ldp x6,x7,[sp,#0+16] - add x0,sp,#64 - bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr); - - add x0,sp,#0 - bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S); - - ldr x3,[x22,#32] - ldp x4,x5,[x22,#64] - ldp x6,x7,[x22,#64+16] - add x2,x22,#32 - add x0,sp,#96 - bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y); - - mov x8,x14 - mov x9,x15 - ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont - mov x10,x16 - mov x11,x17 - ldp x6,x7,[sp,#0+16] - add x0,x21,#64 - bl __ecp_nistz256_add_to // p256_mul_by_2(res_z, tmp0); - - add x0,sp,#96 - bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S); - - ldr x3,[sp,#64] // forward load for p256_mul_mont - ldp x4,x5,[sp,#32] - ldp x6,x7,[sp,#32+16] - add x0,x21,#32 - bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0); - - add x2,sp,#64 - add x0,sp,#32 - bl __ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr); - - mov x8,x14 // duplicate M - mov x9,x15 - mov x10,x16 - mov x11,x17 - mov x4,x14 // put M aside - mov x5,x15 - mov x6,x16 - mov x7,x17 - add x0,sp,#32 - bl __ecp_nistz256_add_to - mov x8,x4 // restore M - mov x9,x5 - ldr x3,[x22] // forward load for p256_mul_mont - mov x10,x6 - 
ldp x4,x5,[sp,#0] - mov x11,x7 - ldp x6,x7,[sp,#0+16] - bl __ecp_nistz256_add_to // p256_mul_by_3(M, M); - - add x2,x22,#0 - add x0,sp,#0 - bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x); - - mov x8,x14 - mov x9,x15 - ldp x4,x5,[sp,#32] // forward load for p256_sqr_mont - mov x10,x16 - mov x11,x17 - ldp x6,x7,[sp,#32+16] - add x0,sp,#96 - bl __ecp_nistz256_add_to // p256_mul_by_2(tmp0, S); - - add x0,x21,#0 - bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M); - - add x2,sp,#96 - bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0); - - add x2,sp,#0 - add x0,sp,#0 - bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x); - - ldr x3,[sp,#32] - mov x4,x14 // copy S - mov x5,x15 - mov x6,x16 - mov x7,x17 - add x2,sp,#32 - bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M); - - add x2,x21,#32 - add x0,x21,#32 - bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y); - - add sp,x29,#0 // destroy frame - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x29,x30,[sp],#96 - AARCH64_VALIDATE_LINK_REGISTER - ret - -.globl _ecp_nistz256_point_add -.private_extern _ecp_nistz256_point_add - -.align 5 -_ecp_nistz256_point_add: - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-96]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - sub sp,sp,#32*12 - - ldp x4,x5,[x2,#64] // in2_z - ldp x6,x7,[x2,#64+16] - mov x21,x0 - mov x22,x1 - mov x23,x2 - ldr x12,Lpoly+8 - ldr x13,Lpoly+24 - orr x8,x4,x5 - orr x10,x6,x7 - orr x25,x8,x10 - cmp x25,#0 - csetm x25,ne // ~in2infty - add x0,sp,#192 - bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z); - - ldp x4,x5,[x22,#64] // in1_z - ldp x6,x7,[x22,#64+16] - orr x8,x4,x5 - orr x10,x6,x7 - orr x24,x8,x10 - cmp x24,#0 - csetm x24,ne // ~in1infty - add x0,sp,#128 - bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); - - ldr x3,[x23,#64] - ldp x4,x5,[sp,#192] - ldp x6,x7,[sp,#192+16] - add x2,x23,#64 - add x0,sp,#320 - bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, Z2sqr, in2_z); - - ldr x3,[x22,#64] - ldp x4,x5,[sp,#128] - ldp x6,x7,[sp,#128+16] - add x2,x22,#64 - add x0,sp,#352 - bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); - - ldr x3,[x22,#32] - ldp x4,x5,[sp,#320] - ldp x6,x7,[sp,#320+16] - add x2,x22,#32 - add x0,sp,#320 - bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, S1, in1_y); - - ldr x3,[x23,#32] - ldp x4,x5,[sp,#352] - ldp x6,x7,[sp,#352+16] - add x2,x23,#32 - add x0,sp,#352 - bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); - - add x2,sp,#320 - ldr x3,[sp,#192] // forward load for p256_mul_mont - ldp x4,x5,[x22] - ldp x6,x7,[x22,#16] - add x0,sp,#160 - bl __ecp_nistz256_sub_from // p256_sub(R, S2, S1); - - orr x14,x14,x15 // see if result is zero - orr x16,x16,x17 - orr x26,x14,x16 // ~is_equal(S1,S2) - - add x2,sp,#192 - add x0,sp,#256 - bl __ecp_nistz256_mul_mont // p256_mul_mont(U1, in1_x, Z2sqr); - - ldr x3,[sp,#128] - ldp x4,x5,[x23] - ldp x6,x7,[x23,#16] - add x2,sp,#128 - add x0,sp,#288 - bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in2_x, Z1sqr); - - add x2,sp,#256 - ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont - ldp x6,x7,[sp,#160+16] - add x0,sp,#96 - 
bl __ecp_nistz256_sub_from // p256_sub(H, U2, U1); - - orr x14,x14,x15 // see if result is zero - orr x16,x16,x17 - orr x14,x14,x16 // ~is_equal(U1,U2) - - mvn x27,x24 // -1/0 -> 0/-1 - mvn x28,x25 // -1/0 -> 0/-1 - orr x14,x14,x27 - orr x14,x14,x28 - orr x14,x14,x26 - cbnz x14,Ladd_proceed // if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2)) - -Ladd_double: - mov x1,x22 - mov x0,x21 - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - add sp,sp,#256 // #256 is from #32*(12-4). difference in stack frames - b Ldouble_shortcut - -.align 4 -Ladd_proceed: - add x0,sp,#192 - bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); - - ldr x3,[x22,#64] - ldp x4,x5,[sp,#96] - ldp x6,x7,[sp,#96+16] - add x2,x22,#64 - add x0,sp,#64 - bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); - - ldp x4,x5,[sp,#96] - ldp x6,x7,[sp,#96+16] - add x0,sp,#128 - bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); - - ldr x3,[x23,#64] - ldp x4,x5,[sp,#64] - ldp x6,x7,[sp,#64+16] - add x2,x23,#64 - add x0,sp,#64 - bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, res_z, in2_z); - - ldr x3,[sp,#96] - ldp x4,x5,[sp,#128] - ldp x6,x7,[sp,#128+16] - add x2,sp,#96 - add x0,sp,#224 - bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); - - ldr x3,[sp,#128] - ldp x4,x5,[sp,#256] - ldp x6,x7,[sp,#256+16] - add x2,sp,#128 - add x0,sp,#288 - bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, U1, Hsqr); - - mov x8,x14 - mov x9,x15 - mov x10,x16 - mov x11,x17 - add x0,sp,#128 - bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2); - - add x2,sp,#192 - add x0,sp,#0 - bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); - - add x2,sp,#224 - bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); - - add x2,sp,#288 - ldr x3,[sp,#224] // forward load for p256_mul_mont - ldp x4,x5,[sp,#320] - ldp x6,x7,[sp,#320+16] - add x0,sp,#32 - bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); - - add x2,sp,#224 - add x0,sp,#352 - bl 
__ecp_nistz256_mul_mont // p256_mul_mont(S2, S1, Hcub); - - ldr x3,[sp,#160] - ldp x4,x5,[sp,#32] - ldp x6,x7,[sp,#32+16] - add x2,sp,#160 - add x0,sp,#32 - bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); - - add x2,sp,#352 - bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); - - ldp x4,x5,[sp,#0] // res - ldp x6,x7,[sp,#0+16] - ldp x8,x9,[x23] // in2 - ldp x10,x11,[x23,#16] - ldp x14,x15,[x22,#0] // in1 - cmp x24,#0 // ~, remember? - ldp x16,x17,[x22,#0+16] - csel x8,x4,x8,ne - csel x9,x5,x9,ne - ldp x4,x5,[sp,#0+0+32] // res - csel x10,x6,x10,ne - csel x11,x7,x11,ne - cmp x25,#0 // ~, remember? - ldp x6,x7,[sp,#0+0+48] - csel x14,x8,x14,ne - csel x15,x9,x15,ne - ldp x8,x9,[x23,#0+32] // in2 - csel x16,x10,x16,ne - csel x17,x11,x17,ne - ldp x10,x11,[x23,#0+48] - stp x14,x15,[x21,#0] - stp x16,x17,[x21,#0+16] - ldp x14,x15,[x22,#32] // in1 - cmp x24,#0 // ~, remember? - ldp x16,x17,[x22,#32+16] - csel x8,x4,x8,ne - csel x9,x5,x9,ne - ldp x4,x5,[sp,#0+32+32] // res - csel x10,x6,x10,ne - csel x11,x7,x11,ne - cmp x25,#0 // ~, remember? - ldp x6,x7,[sp,#0+32+48] - csel x14,x8,x14,ne - csel x15,x9,x15,ne - ldp x8,x9,[x23,#32+32] // in2 - csel x16,x10,x16,ne - csel x17,x11,x17,ne - ldp x10,x11,[x23,#32+48] - stp x14,x15,[x21,#32] - stp x16,x17,[x21,#32+16] - ldp x14,x15,[x22,#64] // in1 - cmp x24,#0 // ~, remember? - ldp x16,x17,[x22,#64+16] - csel x8,x4,x8,ne - csel x9,x5,x9,ne - csel x10,x6,x10,ne - csel x11,x7,x11,ne - cmp x25,#0 // ~, remember? 
- csel x14,x8,x14,ne - csel x15,x9,x15,ne - csel x16,x10,x16,ne - csel x17,x11,x17,ne - stp x14,x15,[x21,#64] - stp x16,x17,[x21,#64+16] - -Ladd_done: - add sp,x29,#0 // destroy frame - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 - AARCH64_VALIDATE_LINK_REGISTER - ret - -.globl _ecp_nistz256_point_add_affine -.private_extern _ecp_nistz256_point_add_affine - -.align 5 -_ecp_nistz256_point_add_affine: - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-80]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - sub sp,sp,#32*10 - - mov x21,x0 - mov x22,x1 - mov x23,x2 - ldr x12,Lpoly+8 - ldr x13,Lpoly+24 - - ldp x4,x5,[x1,#64] // in1_z - ldp x6,x7,[x1,#64+16] - orr x8,x4,x5 - orr x10,x6,x7 - orr x24,x8,x10 - cmp x24,#0 - csetm x24,ne // ~in1infty - - ldp x14,x15,[x2] // in2_x - ldp x16,x17,[x2,#16] - ldp x8,x9,[x2,#32] // in2_y - ldp x10,x11,[x2,#48] - orr x14,x14,x15 - orr x16,x16,x17 - orr x8,x8,x9 - orr x10,x10,x11 - orr x14,x14,x16 - orr x8,x8,x10 - orr x25,x14,x8 - cmp x25,#0 - csetm x25,ne // ~in2infty - - add x0,sp,#128 - bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); - - mov x4,x14 - mov x5,x15 - mov x6,x16 - mov x7,x17 - ldr x3,[x23] - add x2,x23,#0 - add x0,sp,#96 - bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, Z1sqr, in2_x); - - add x2,x22,#0 - ldr x3,[x22,#64] // forward load for p256_mul_mont - ldp x4,x5,[sp,#128] - ldp x6,x7,[sp,#128+16] - add x0,sp,#160 - bl __ecp_nistz256_sub_from // p256_sub(H, U2, in1_x); - - add x2,x22,#64 - add x0,sp,#128 - bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); - - ldr x3,[x22,#64] - ldp x4,x5,[sp,#160] - ldp x6,x7,[sp,#160+16] - add x2,x22,#64 - add x0,sp,#64 - bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); - - ldr x3,[x23,#32] - ldp x4,x5,[sp,#128] - ldp x6,x7,[sp,#128+16] - add x2,x23,#32 - add x0,sp,#128 - bl 
__ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); - - add x2,x22,#32 - ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont - ldp x6,x7,[sp,#160+16] - add x0,sp,#192 - bl __ecp_nistz256_sub_from // p256_sub(R, S2, in1_y); - - add x0,sp,#224 - bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); - - ldp x4,x5,[sp,#192] - ldp x6,x7,[sp,#192+16] - add x0,sp,#288 - bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); - - ldr x3,[sp,#160] - ldp x4,x5,[sp,#224] - ldp x6,x7,[sp,#224+16] - add x2,sp,#160 - add x0,sp,#256 - bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); - - ldr x3,[x22] - ldp x4,x5,[sp,#224] - ldp x6,x7,[sp,#224+16] - add x2,x22,#0 - add x0,sp,#96 - bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in1_x, Hsqr); - - mov x8,x14 - mov x9,x15 - mov x10,x16 - mov x11,x17 - add x0,sp,#224 - bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2); - - add x2,sp,#288 - add x0,sp,#0 - bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); - - add x2,sp,#256 - bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); - - add x2,sp,#96 - ldr x3,[x22,#32] // forward load for p256_mul_mont - ldp x4,x5,[sp,#256] - ldp x6,x7,[sp,#256+16] - add x0,sp,#32 - bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); - - add x2,x22,#32 - add x0,sp,#128 - bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, in1_y, Hcub); - - ldr x3,[sp,#192] - ldp x4,x5,[sp,#32] - ldp x6,x7,[sp,#32+16] - add x2,sp,#192 - add x0,sp,#32 - bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); - - add x2,sp,#128 - bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); - - ldp x4,x5,[sp,#0] // res - ldp x6,x7,[sp,#0+16] - ldp x8,x9,[x23] // in2 - ldp x10,x11,[x23,#16] - ldp x14,x15,[x22,#0] // in1 - cmp x24,#0 // ~, remember? - ldp x16,x17,[x22,#0+16] - csel x8,x4,x8,ne - csel x9,x5,x9,ne - ldp x4,x5,[sp,#0+0+32] // res - csel x10,x6,x10,ne - csel x11,x7,x11,ne - cmp x25,#0 // ~, remember? 
- ldp x6,x7,[sp,#0+0+48] - csel x14,x8,x14,ne - csel x15,x9,x15,ne - ldp x8,x9,[x23,#0+32] // in2 - csel x16,x10,x16,ne - csel x17,x11,x17,ne - ldp x10,x11,[x23,#0+48] - stp x14,x15,[x21,#0] - stp x16,x17,[x21,#0+16] - adr x23,Lone_mont-64 - ldp x14,x15,[x22,#32] // in1 - cmp x24,#0 // ~, remember? - ldp x16,x17,[x22,#32+16] - csel x8,x4,x8,ne - csel x9,x5,x9,ne - ldp x4,x5,[sp,#0+32+32] // res - csel x10,x6,x10,ne - csel x11,x7,x11,ne - cmp x25,#0 // ~, remember? - ldp x6,x7,[sp,#0+32+48] - csel x14,x8,x14,ne - csel x15,x9,x15,ne - ldp x8,x9,[x23,#32+32] // in2 - csel x16,x10,x16,ne - csel x17,x11,x17,ne - ldp x10,x11,[x23,#32+48] - stp x14,x15,[x21,#32] - stp x16,x17,[x21,#32+16] - ldp x14,x15,[x22,#64] // in1 - cmp x24,#0 // ~, remember? - ldp x16,x17,[x22,#64+16] - csel x8,x4,x8,ne - csel x9,x5,x9,ne - csel x10,x6,x10,ne - csel x11,x7,x11,ne - cmp x25,#0 // ~, remember? - csel x14,x8,x14,ne - csel x15,x9,x15,ne - csel x16,x10,x16,ne - csel x17,x11,x17,ne - stp x14,x15,[x21,#64] - stp x16,x17,[x21,#64+16] - - add sp,x29,#0 // destroy frame - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x29,x30,[sp],#80 - AARCH64_VALIDATE_LINK_REGISTER - ret - -//////////////////////////////////////////////////////////////////////// -// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4], -// uint64_t b[4]); -.globl _ecp_nistz256_ord_mul_mont -.private_extern _ecp_nistz256_ord_mul_mont - -.align 4 -_ecp_nistz256_ord_mul_mont: - AARCH64_VALID_CALL_TARGET - // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. - stp x29,x30,[sp,#-64]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - - adr x23,Lord - ldr x3,[x2] // bp[0] - ldp x4,x5,[x1] - ldp x6,x7,[x1,#16] - - ldp x12,x13,[x23,#0] - ldp x21,x22,[x23,#16] - ldr x23,[x23,#32] - - mul x14,x4,x3 // a[0]*b[0] - umulh x8,x4,x3 - - mul x15,x5,x3 // a[1]*b[0] - umulh x9,x5,x3 - - mul x16,x6,x3 // a[2]*b[0] - umulh x10,x6,x3 - - mul x17,x7,x3 // a[3]*b[0] - umulh x19,x7,x3 - - mul x24,x14,x23 - - adds x15,x15,x8 // accumulate high parts of multiplication - adcs x16,x16,x9 - adcs x17,x17,x10 - adc x19,x19,xzr - mov x20,xzr - ldr x3,[x2,#8*1] // b[i] - - lsl x8,x24,#32 - subs x16,x16,x24 - lsr x9,x24,#32 - sbcs x17,x17,x8 - sbcs x19,x19,x9 - sbc x20,x20,xzr - - subs xzr,x14,#1 - umulh x9,x12,x24 - mul x10,x13,x24 - umulh x11,x13,x24 - - adcs x10,x10,x9 - mul x8,x4,x3 - adc x11,x11,xzr - mul x9,x5,x3 - - adds x14,x15,x10 - mul x10,x6,x3 - adcs x15,x16,x11 - mul x11,x7,x3 - adcs x16,x17,x24 - adcs x17,x19,x24 - adc x19,x20,xzr - - adds x14,x14,x8 // accumulate low parts - umulh x8,x4,x3 - adcs x15,x15,x9 - umulh x9,x5,x3 - adcs x16,x16,x10 - umulh x10,x6,x3 - adcs x17,x17,x11 - umulh x11,x7,x3 - adc x19,x19,xzr - mul x24,x14,x23 - adds x15,x15,x8 // accumulate high parts - adcs x16,x16,x9 - adcs x17,x17,x10 - adcs x19,x19,x11 - adc x20,xzr,xzr - ldr x3,[x2,#8*2] // b[i] - - lsl x8,x24,#32 - subs x16,x16,x24 - lsr x9,x24,#32 - sbcs x17,x17,x8 - sbcs x19,x19,x9 - sbc x20,x20,xzr - - subs xzr,x14,#1 - umulh x9,x12,x24 - mul x10,x13,x24 - umulh x11,x13,x24 - - adcs x10,x10,x9 - mul x8,x4,x3 - adc x11,x11,xzr - mul x9,x5,x3 - - adds x14,x15,x10 - mul x10,x6,x3 - adcs x15,x16,x11 - mul x11,x7,x3 - adcs x16,x17,x24 - adcs x17,x19,x24 - adc x19,x20,xzr - - adds x14,x14,x8 // accumulate low parts - umulh x8,x4,x3 - adcs x15,x15,x9 - umulh x9,x5,x3 - adcs x16,x16,x10 - umulh x10,x6,x3 - adcs x17,x17,x11 - umulh x11,x7,x3 - adc x19,x19,xzr - mul x24,x14,x23 - adds x15,x15,x8 // accumulate high parts - adcs x16,x16,x9 - adcs 
x17,x17,x10 - adcs x19,x19,x11 - adc x20,xzr,xzr - ldr x3,[x2,#8*3] // b[i] - - lsl x8,x24,#32 - subs x16,x16,x24 - lsr x9,x24,#32 - sbcs x17,x17,x8 - sbcs x19,x19,x9 - sbc x20,x20,xzr - - subs xzr,x14,#1 - umulh x9,x12,x24 - mul x10,x13,x24 - umulh x11,x13,x24 - - adcs x10,x10,x9 - mul x8,x4,x3 - adc x11,x11,xzr - mul x9,x5,x3 - - adds x14,x15,x10 - mul x10,x6,x3 - adcs x15,x16,x11 - mul x11,x7,x3 - adcs x16,x17,x24 - adcs x17,x19,x24 - adc x19,x20,xzr - - adds x14,x14,x8 // accumulate low parts - umulh x8,x4,x3 - adcs x15,x15,x9 - umulh x9,x5,x3 - adcs x16,x16,x10 - umulh x10,x6,x3 - adcs x17,x17,x11 - umulh x11,x7,x3 - adc x19,x19,xzr - mul x24,x14,x23 - adds x15,x15,x8 // accumulate high parts - adcs x16,x16,x9 - adcs x17,x17,x10 - adcs x19,x19,x11 - adc x20,xzr,xzr - lsl x8,x24,#32 // last reduction - subs x16,x16,x24 - lsr x9,x24,#32 - sbcs x17,x17,x8 - sbcs x19,x19,x9 - sbc x20,x20,xzr - - subs xzr,x14,#1 - umulh x9,x12,x24 - mul x10,x13,x24 - umulh x11,x13,x24 - - adcs x10,x10,x9 - adc x11,x11,xzr - - adds x14,x15,x10 - adcs x15,x16,x11 - adcs x16,x17,x24 - adcs x17,x19,x24 - adc x19,x20,xzr - - subs x8,x14,x12 // ret -= modulus - sbcs x9,x15,x13 - sbcs x10,x16,x21 - sbcs x11,x17,x22 - sbcs xzr,x19,xzr - - csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus - csel x15,x15,x9,lo - csel x16,x16,x10,lo - stp x14,x15,[x0] - csel x17,x17,x11,lo - stp x16,x17,[x0,#16] - - ldp x19,x20,[sp,#16] - ldp x21,x22,[sp,#32] - ldp x23,x24,[sp,#48] - ldr x29,[sp],#64 - ret - - -//////////////////////////////////////////////////////////////////////// -// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4], -// int rep); -.globl _ecp_nistz256_ord_sqr_mont -.private_extern _ecp_nistz256_ord_sqr_mont - -.align 4 -_ecp_nistz256_ord_sqr_mont: - AARCH64_VALID_CALL_TARGET - // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. - stp x29,x30,[sp,#-64]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - - adr x23,Lord - ldp x4,x5,[x1] - ldp x6,x7,[x1,#16] - - ldp x12,x13,[x23,#0] - ldp x21,x22,[x23,#16] - ldr x23,[x23,#32] - b Loop_ord_sqr - -.align 4 -Loop_ord_sqr: - sub x2,x2,#1 - //////////////////////////////////////////////////////////////// - // | | | | | |a1*a0| | - // | | | | |a2*a0| | | - // | |a3*a2|a3*a0| | | | - // | | | |a2*a1| | | | - // | | |a3*a1| | | | | - // *| | | | | | | | 2| - // +|a3*a3|a2*a2|a1*a1|a0*a0| - // |--+--+--+--+--+--+--+--| - // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is , i.e. follow - // - // "can't overflow" below mark carrying into high part of - // multiplication result, which can't overflow, because it - // can never be all ones. - - mul x15,x5,x4 // a[1]*a[0] - umulh x9,x5,x4 - mul x16,x6,x4 // a[2]*a[0] - umulh x10,x6,x4 - mul x17,x7,x4 // a[3]*a[0] - umulh x19,x7,x4 - - adds x16,x16,x9 // accumulate high parts of multiplication - mul x8,x6,x5 // a[2]*a[1] - umulh x9,x6,x5 - adcs x17,x17,x10 - mul x10,x7,x5 // a[3]*a[1] - umulh x11,x7,x5 - adc x19,x19,xzr // can't overflow - - mul x20,x7,x6 // a[3]*a[2] - umulh x1,x7,x6 - - adds x9,x9,x10 // accumulate high parts of multiplication - mul x14,x4,x4 // a[0]*a[0] - adc x10,x11,xzr // can't overflow - - adds x17,x17,x8 // accumulate low parts of multiplication - umulh x4,x4,x4 - adcs x19,x19,x9 - mul x9,x5,x5 // a[1]*a[1] - adcs x20,x20,x10 - umulh x5,x5,x5 - adc x1,x1,xzr // can't overflow - - adds x15,x15,x15 // acc[1-6]*=2 - mul x10,x6,x6 // a[2]*a[2] - adcs x16,x16,x16 - umulh x6,x6,x6 - adcs x17,x17,x17 - mul x11,x7,x7 // a[3]*a[3] - adcs x19,x19,x19 - umulh x7,x7,x7 - adcs x20,x20,x20 - adcs x1,x1,x1 - adc x3,xzr,xzr - - adds x15,x15,x4 // +a[i]*a[i] - mul x24,x14,x23 - adcs x16,x16,x9 - adcs x17,x17,x5 - adcs x19,x19,x10 - adcs x20,x20,x6 - adcs x1,x1,x11 - adc x3,x3,x7 - subs xzr,x14,#1 - umulh x9,x12,x24 - mul x10,x13,x24 - umulh x11,x13,x24 - - adcs x10,x10,x9 - adc x11,x11,xzr - - adds 
x14,x15,x10 - adcs x15,x16,x11 - adcs x16,x17,x24 - adc x17,xzr,x24 // can't overflow - mul x11,x14,x23 - lsl x8,x24,#32 - subs x15,x15,x24 - lsr x9,x24,#32 - sbcs x16,x16,x8 - sbc x17,x17,x9 // can't borrow - subs xzr,x14,#1 - umulh x9,x12,x11 - mul x10,x13,x11 - umulh x24,x13,x11 - - adcs x10,x10,x9 - adc x24,x24,xzr - - adds x14,x15,x10 - adcs x15,x16,x24 - adcs x16,x17,x11 - adc x17,xzr,x11 // can't overflow - mul x24,x14,x23 - lsl x8,x11,#32 - subs x15,x15,x11 - lsr x9,x11,#32 - sbcs x16,x16,x8 - sbc x17,x17,x9 // can't borrow - subs xzr,x14,#1 - umulh x9,x12,x24 - mul x10,x13,x24 - umulh x11,x13,x24 - - adcs x10,x10,x9 - adc x11,x11,xzr - - adds x14,x15,x10 - adcs x15,x16,x11 - adcs x16,x17,x24 - adc x17,xzr,x24 // can't overflow - mul x11,x14,x23 - lsl x8,x24,#32 - subs x15,x15,x24 - lsr x9,x24,#32 - sbcs x16,x16,x8 - sbc x17,x17,x9 // can't borrow - subs xzr,x14,#1 - umulh x9,x12,x11 - mul x10,x13,x11 - umulh x24,x13,x11 - - adcs x10,x10,x9 - adc x24,x24,xzr - - adds x14,x15,x10 - adcs x15,x16,x24 - adcs x16,x17,x11 - adc x17,xzr,x11 // can't overflow - lsl x8,x11,#32 - subs x15,x15,x11 - lsr x9,x11,#32 - sbcs x16,x16,x8 - sbc x17,x17,x9 // can't borrow - adds x14,x14,x19 // accumulate upper half - adcs x15,x15,x20 - adcs x16,x16,x1 - adcs x17,x17,x3 - adc x19,xzr,xzr - - subs x8,x14,x12 // ret -= modulus - sbcs x9,x15,x13 - sbcs x10,x16,x21 - sbcs x11,x17,x22 - sbcs xzr,x19,xzr - - csel x4,x14,x8,lo // ret = borrow ? 
ret : ret-modulus - csel x5,x15,x9,lo - csel x6,x16,x10,lo - csel x7,x17,x11,lo - - cbnz x2,Loop_ord_sqr - - stp x4,x5,[x0] - stp x6,x7,[x0,#16] - - ldp x19,x20,[sp,#16] - ldp x21,x22,[sp,#32] - ldp x23,x24,[sp,#48] - ldr x29,[sp],#64 - ret - -//////////////////////////////////////////////////////////////////////// -// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index); -.globl _ecp_nistz256_select_w5 -.private_extern _ecp_nistz256_select_w5 - -.align 4 -_ecp_nistz256_select_w5: - AARCH64_VALID_CALL_TARGET - - // x10 := x0 - // w9 := 0; loop counter and incremented internal index - mov x10, x0 - mov w9, #0 - - // [v16-v21] := 0 - movi v16.16b, #0 - movi v17.16b, #0 - movi v18.16b, #0 - movi v19.16b, #0 - movi v20.16b, #0 - movi v21.16b, #0 - -Lselect_w5_loop: - // Loop 16 times. - - // Increment index (loop counter); tested at the end of the loop - add w9, w9, #1 - - // [v22-v27] := Load a (3*256-bit = 6*128-bit) table entry starting at x1 - // and advance x1 to point to the next entry - ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64 - - // x11 := (w9 == w2)? All 1s : All 0s - cmp w9, w2 - csetm x11, eq - - // continue loading ... - ld1 {v26.2d, v27.2d}, [x1],#32 - - // duplicate mask_64 into Mask (all 0s or all 1s) - dup v3.2d, x11 - - // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19] - // i.e., values in output registers will remain the same if w9 != w2 - bit v16.16b, v22.16b, v3.16b - bit v17.16b, v23.16b, v3.16b - - bit v18.16b, v24.16b, v3.16b - bit v19.16b, v25.16b, v3.16b - - bit v20.16b, v26.16b, v3.16b - bit v21.16b, v27.16b, v3.16b - - // If bit #4 is not 0 (i.e. 
idx_ctr < 16) loop back - tbz w9, #4, Lselect_w5_loop - - // Write [v16-v21] to memory at the output pointer - st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x10],#64 - st1 {v20.2d, v21.2d}, [x10] - - ret - - - -//////////////////////////////////////////////////////////////////////// -// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index); -.globl _ecp_nistz256_select_w7 -.private_extern _ecp_nistz256_select_w7 - -.align 4 -_ecp_nistz256_select_w7: - AARCH64_VALID_CALL_TARGET - - // w9 := 0; loop counter and incremented internal index - mov w9, #0 - - // [v16-v21] := 0 - movi v16.16b, #0 - movi v17.16b, #0 - movi v18.16b, #0 - movi v19.16b, #0 - -Lselect_w7_loop: - // Loop 64 times. - - // Increment index (loop counter); tested at the end of the loop - add w9, w9, #1 - - // [v22-v25] := Load a (2*256-bit = 4*128-bit) table entry starting at x1 - // and advance x1 to point to the next entry - ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64 - - // x11 := (w9 == w2)? All 1s : All 0s - cmp w9, w2 - csetm x11, eq - - // duplicate mask_64 into Mask (all 0s or all 1s) - dup v3.2d, x11 - - // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19] - // i.e., values in output registers will remain the same if w9 != w2 - bit v16.16b, v22.16b, v3.16b - bit v17.16b, v23.16b, v3.16b - - bit v18.16b, v24.16b, v3.16b - bit v19.16b, v25.16b, v3.16b - - // If bit #6 is not 0 (i.e. 
idx_ctr < 64) loop back - tbz w9, #6, Lselect_w7_loop - - // Write [v16-v19] to memory at the output pointer - st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0] - - ret - -#endif // !OPENSSL_NO_ASM diff --git a/third_party/boringssl/apple-aarch64/crypto/fipsmodule/p256_beeu-armv8-asm.S b/third_party/boringssl/apple-aarch64/crypto/fipsmodule/p256_beeu-armv8-asm.S deleted file mode 100644 index 317b8138..00000000 --- a/third_party/boringssl/apple-aarch64/crypto/fipsmodule/p256_beeu-armv8-asm.S +++ /dev/null @@ -1,317 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -#include "openssl/arm_arch.h" - -.text -.globl _beeu_mod_inverse_vartime -.private_extern _beeu_mod_inverse_vartime - -.align 4 -_beeu_mod_inverse_vartime: - // Reserve enough space for 14 8-byte registers on the stack - // in the first stp call for x29, x30. - // Then store the remaining callee-saved registers. - // - // | x29 | x30 | x19 | x20 | ... | x27 | x28 | x0 | x2 | - // ^ ^ - // sp <------------------- 112 bytes ----------------> old sp - // x29 (FP) - // - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-112]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - stp x0,x2,[sp,#96] - - // B = b3..b0 := a - ldp x25,x26,[x1] - ldp x27,x28,[x1,#16] - - // n3..n0 := n - // Note: the value of input params are changed in the following. 
- ldp x0,x1,[x2] - ldp x2,x30,[x2,#16] - - // A = a3..a0 := n - mov x21, x0 - mov x22, x1 - mov x23, x2 - mov x24, x30 - - // X = x4..x0 := 1 - mov x3, #1 - eor x4, x4, x4 - eor x5, x5, x5 - eor x6, x6, x6 - eor x7, x7, x7 - - // Y = y4..y0 := 0 - eor x8, x8, x8 - eor x9, x9, x9 - eor x10, x10, x10 - eor x11, x11, x11 - eor x12, x12, x12 - -Lbeeu_loop: - // if B == 0, jump to .Lbeeu_loop_end - orr x14, x25, x26 - orr x14, x14, x27 - - // reverse the bit order of x25. This is needed for clz after this macro - rbit x15, x25 - - orr x14, x14, x28 - cbz x14,Lbeeu_loop_end - - - // 0 < B < |n|, - // 0 < A <= |n|, - // (1) X*a == B (mod |n|), - // (2) (-1)*Y*a == A (mod |n|) - - // Now divide B by the maximum possible power of two in the - // integers, and divide X by the same value mod |n|. - // When we're done, (1) still holds. - - // shift := number of trailing 0s in x25 - // ( = number of leading 0s in x15; see the "rbit" instruction in TEST_B_ZERO) - clz x13, x15 - - // If there is no shift, goto shift_A_Y - cbz x13, Lbeeu_shift_A_Y - - // Shift B right by "x13" bits - neg x14, x13 - lsr x25, x25, x13 - lsl x15, x26, x14 - - lsr x26, x26, x13 - lsl x19, x27, x14 - - orr x25, x25, x15 - - lsr x27, x27, x13 - lsl x20, x28, x14 - - orr x26, x26, x19 - - lsr x28, x28, x13 - - orr x27, x27, x20 - - - // Shift X right by "x13" bits, adding n whenever X becomes odd. - // x13--; - // x14 := 0; needed in the addition to the most significant word in SHIFT1 - eor x14, x14, x14 -Lbeeu_shift_loop_X: - tbz x3, #0, Lshift1_0 - adds x3, x3, x0 - adcs x4, x4, x1 - adcs x5, x5, x2 - adcs x6, x6, x30 - adc x7, x7, x14 -Lshift1_0: - // var0 := [var1|var0]<64..1>; - // i.e. 
concatenate var1 and var0, - // extract bits <64..1> from the resulting 128-bit value - // and put them in var0 - extr x3, x4, x3, #1 - extr x4, x5, x4, #1 - extr x5, x6, x5, #1 - extr x6, x7, x6, #1 - lsr x7, x7, #1 - - subs x13, x13, #1 - bne Lbeeu_shift_loop_X - - // Note: the steps above perform the same sequence as in p256_beeu-x86_64-asm.pl - // with the following differences: - // - "x13" is set directly to the number of trailing 0s in B - // (using rbit and clz instructions) - // - The loop is only used to call SHIFT1(X) - // and x13 is decreased while executing the X loop. - // - SHIFT256(B, x13) is performed before right-shifting X; they are independent - -Lbeeu_shift_A_Y: - // Same for A and Y. - // Afterwards, (2) still holds. - // Reverse the bit order of x21 - // x13 := number of trailing 0s in x21 (= number of leading 0s in x15) - rbit x15, x21 - clz x13, x15 - - // If there is no shift, goto |B-A|, X+Y update - cbz x13, Lbeeu_update_B_X_or_A_Y - - // Shift A right by "x13" bits - neg x14, x13 - lsr x21, x21, x13 - lsl x15, x22, x14 - - lsr x22, x22, x13 - lsl x19, x23, x14 - - orr x21, x21, x15 - - lsr x23, x23, x13 - lsl x20, x24, x14 - - orr x22, x22, x19 - - lsr x24, x24, x13 - - orr x23, x23, x20 - - - // Shift Y right by "x13" bits, adding n whenever Y becomes odd. - // x13--; - // x14 := 0; needed in the addition to the most significant word in SHIFT1 - eor x14, x14, x14 -Lbeeu_shift_loop_Y: - tbz x8, #0, Lshift1_1 - adds x8, x8, x0 - adcs x9, x9, x1 - adcs x10, x10, x2 - adcs x11, x11, x30 - adc x12, x12, x14 -Lshift1_1: - // var0 := [var1|var0]<64..1>; - // i.e. 
concatenate var1 and var0, - // extract bits <64..1> from the resulting 128-bit value - // and put them in var0 - extr x8, x9, x8, #1 - extr x9, x10, x9, #1 - extr x10, x11, x10, #1 - extr x11, x12, x11, #1 - lsr x12, x12, #1 - - subs x13, x13, #1 - bne Lbeeu_shift_loop_Y - -Lbeeu_update_B_X_or_A_Y: - // Try T := B - A; if cs, continue with B > A (cs: carry set = no borrow) - // Note: this is a case of unsigned arithmetic, where T fits in 4 64-bit words - // without taking a sign bit if generated. The lack of a carry would - // indicate a negative result. See, for example, - // https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/condition-codes-1-condition-flags-and-codes - subs x14, x25, x21 - sbcs x15, x26, x22 - sbcs x19, x27, x23 - sbcs x20, x28, x24 - bcs Lbeeu_B_greater_than_A - - // Else A > B => - // A := A - B; Y := Y + X; goto beginning of the loop - subs x21, x21, x25 - sbcs x22, x22, x26 - sbcs x23, x23, x27 - sbcs x24, x24, x28 - - adds x8, x8, x3 - adcs x9, x9, x4 - adcs x10, x10, x5 - adcs x11, x11, x6 - adc x12, x12, x7 - b Lbeeu_loop - -Lbeeu_B_greater_than_A: - // Continue with B > A => - // B := B - A; X := X + Y; goto beginning of the loop - mov x25, x14 - mov x26, x15 - mov x27, x19 - mov x28, x20 - - adds x3, x3, x8 - adcs x4, x4, x9 - adcs x5, x5, x10 - adcs x6, x6, x11 - adc x7, x7, x12 - b Lbeeu_loop - -Lbeeu_loop_end: - // The Euclid's algorithm loop ends when A == gcd(a,n); - // this would be 1, when a and n are co-prime (i.e. do not have a common factor). - // Since (-1)*Y*a == A (mod |n|), Y>0 - // then out = -Y mod n - - // Verify that A = 1 ==> (-1)*Y*a = A = 1 (mod |n|) - // Is A-1 == 0? - // If not, fail. 
- sub x14, x21, #1 - orr x14, x14, x22 - orr x14, x14, x23 - orr x14, x14, x24 - cbnz x14, Lbeeu_err - - // If Y>n ==> Y:=Y-n -Lbeeu_reduction_loop: - // x_i := y_i - n_i (X is no longer needed, use it as temp) - // (x14 = 0 from above) - subs x3, x8, x0 - sbcs x4, x9, x1 - sbcs x5, x10, x2 - sbcs x6, x11, x30 - sbcs x7, x12, x14 - - // If result is non-negative (i.e., cs = carry set = no borrow), - // y_i := x_i; goto reduce again - // else - // y_i := y_i; continue - csel x8, x3, x8, cs - csel x9, x4, x9, cs - csel x10, x5, x10, cs - csel x11, x6, x11, cs - csel x12, x7, x12, cs - bcs Lbeeu_reduction_loop - - // Now Y < n (Y cannot be equal to n, since the inverse cannot be 0) - // out = -Y = n-Y - subs x8, x0, x8 - sbcs x9, x1, x9 - sbcs x10, x2, x10 - sbcs x11, x30, x11 - - // Save Y in output (out (x0) was saved on the stack) - ldr x3, [sp,#96] - stp x8, x9, [x3] - stp x10, x11, [x3,#16] - // return 1 (success) - mov x0, #1 - b Lbeeu_finish - -Lbeeu_err: - // return 0 (error) - eor x0, x0, x0 - -Lbeeu_finish: - // Restore callee-saved registers, except x0, x2 - add sp,x29,#0 - ldp x19,x20,[sp,#16] - ldp x21,x22,[sp,#32] - ldp x23,x24,[sp,#48] - ldp x25,x26,[sp,#64] - ldp x27,x28,[sp,#80] - ldp x29,x30,[sp],#112 - - AARCH64_VALIDATE_LINK_REGISTER - ret - -#endif // !OPENSSL_NO_ASM diff --git a/third_party/boringssl/apple-aarch64/crypto/fipsmodule/sha1-armv8.S b/third_party/boringssl/apple-aarch64/crypto/fipsmodule/sha1-armv8.S deleted file mode 100644 index 62ba800f..00000000 --- a/third_party/boringssl/apple-aarch64/crypto/fipsmodule/sha1-armv8.S +++ /dev/null @@ -1,1235 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -#include - -.text - - -.private_extern _OPENSSL_armcap_P -.globl _sha1_block_data_order -.private_extern _sha1_block_data_order - -.align 6 -_sha1_block_data_order: - // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. - AARCH64_VALID_CALL_TARGET -#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10 - adrp x16,:pg_hi21_nc:_OPENSSL_armcap_P -#else - adrp x16,_OPENSSL_armcap_P@PAGE -#endif - ldr w16,[x16,_OPENSSL_armcap_P@PAGEOFF] - tst w16,#ARMV8_SHA1 - b.ne Lv8_entry - - stp x29,x30,[sp,#-96]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - - ldp w20,w21,[x0] - ldp w22,w23,[x0,#8] - ldr w24,[x0,#16] - -Loop: - ldr x3,[x1],#64 - movz w28,#0x7999 - sub x2,x2,#1 - movk w28,#0x5a82,lsl#16 -#ifdef __AARCH64EB__ - ror x3,x3,#32 -#else - rev32 x3,x3 -#endif - add w24,w24,w28 // warm it up - add w24,w24,w3 - lsr x4,x3,#32 - ldr x5,[x1,#-56] - bic w25,w23,w21 - and w26,w22,w21 - ror w27,w20,#27 - add w23,w23,w28 // future e+=K - orr w25,w25,w26 - add w24,w24,w27 // e+=rot(a,5) - ror w21,w21,#2 - add w23,w23,w4 // future e+=X[i] - add w24,w24,w25 // e+=F(b,c,d) -#ifdef __AARCH64EB__ - ror x5,x5,#32 -#else - rev32 x5,x5 -#endif - bic w25,w22,w20 - and w26,w21,w20 - ror w27,w24,#27 - add w22,w22,w28 // future e+=K - orr w25,w25,w26 - add w23,w23,w27 // e+=rot(a,5) - ror w20,w20,#2 - add w22,w22,w5 // future e+=X[i] - add w23,w23,w25 // e+=F(b,c,d) - lsr x6,x5,#32 - ldr x7,[x1,#-48] - bic w25,w21,w24 - and w26,w20,w24 - ror w27,w23,#27 - add w21,w21,w28 // future e+=K - orr w25,w25,w26 - add w22,w22,w27 // e+=rot(a,5) - ror w24,w24,#2 - add w21,w21,w6 // future e+=X[i] - add w22,w22,w25 // e+=F(b,c,d) -#ifdef 
__AARCH64EB__ - ror x7,x7,#32 -#else - rev32 x7,x7 -#endif - bic w25,w20,w23 - and w26,w24,w23 - ror w27,w22,#27 - add w20,w20,w28 // future e+=K - orr w25,w25,w26 - add w21,w21,w27 // e+=rot(a,5) - ror w23,w23,#2 - add w20,w20,w7 // future e+=X[i] - add w21,w21,w25 // e+=F(b,c,d) - lsr x8,x7,#32 - ldr x9,[x1,#-40] - bic w25,w24,w22 - and w26,w23,w22 - ror w27,w21,#27 - add w24,w24,w28 // future e+=K - orr w25,w25,w26 - add w20,w20,w27 // e+=rot(a,5) - ror w22,w22,#2 - add w24,w24,w8 // future e+=X[i] - add w20,w20,w25 // e+=F(b,c,d) -#ifdef __AARCH64EB__ - ror x9,x9,#32 -#else - rev32 x9,x9 -#endif - bic w25,w23,w21 - and w26,w22,w21 - ror w27,w20,#27 - add w23,w23,w28 // future e+=K - orr w25,w25,w26 - add w24,w24,w27 // e+=rot(a,5) - ror w21,w21,#2 - add w23,w23,w9 // future e+=X[i] - add w24,w24,w25 // e+=F(b,c,d) - lsr x10,x9,#32 - ldr x11,[x1,#-32] - bic w25,w22,w20 - and w26,w21,w20 - ror w27,w24,#27 - add w22,w22,w28 // future e+=K - orr w25,w25,w26 - add w23,w23,w27 // e+=rot(a,5) - ror w20,w20,#2 - add w22,w22,w10 // future e+=X[i] - add w23,w23,w25 // e+=F(b,c,d) -#ifdef __AARCH64EB__ - ror x11,x11,#32 -#else - rev32 x11,x11 -#endif - bic w25,w21,w24 - and w26,w20,w24 - ror w27,w23,#27 - add w21,w21,w28 // future e+=K - orr w25,w25,w26 - add w22,w22,w27 // e+=rot(a,5) - ror w24,w24,#2 - add w21,w21,w11 // future e+=X[i] - add w22,w22,w25 // e+=F(b,c,d) - lsr x12,x11,#32 - ldr x13,[x1,#-24] - bic w25,w20,w23 - and w26,w24,w23 - ror w27,w22,#27 - add w20,w20,w28 // future e+=K - orr w25,w25,w26 - add w21,w21,w27 // e+=rot(a,5) - ror w23,w23,#2 - add w20,w20,w12 // future e+=X[i] - add w21,w21,w25 // e+=F(b,c,d) -#ifdef __AARCH64EB__ - ror x13,x13,#32 -#else - rev32 x13,x13 -#endif - bic w25,w24,w22 - and w26,w23,w22 - ror w27,w21,#27 - add w24,w24,w28 // future e+=K - orr w25,w25,w26 - add w20,w20,w27 // e+=rot(a,5) - ror w22,w22,#2 - add w24,w24,w13 // future e+=X[i] - add w20,w20,w25 // e+=F(b,c,d) - lsr x14,x13,#32 - ldr x15,[x1,#-16] - bic w25,w23,w21 
- and w26,w22,w21 - ror w27,w20,#27 - add w23,w23,w28 // future e+=K - orr w25,w25,w26 - add w24,w24,w27 // e+=rot(a,5) - ror w21,w21,#2 - add w23,w23,w14 // future e+=X[i] - add w24,w24,w25 // e+=F(b,c,d) -#ifdef __AARCH64EB__ - ror x15,x15,#32 -#else - rev32 x15,x15 -#endif - bic w25,w22,w20 - and w26,w21,w20 - ror w27,w24,#27 - add w22,w22,w28 // future e+=K - orr w25,w25,w26 - add w23,w23,w27 // e+=rot(a,5) - ror w20,w20,#2 - add w22,w22,w15 // future e+=X[i] - add w23,w23,w25 // e+=F(b,c,d) - lsr x16,x15,#32 - ldr x17,[x1,#-8] - bic w25,w21,w24 - and w26,w20,w24 - ror w27,w23,#27 - add w21,w21,w28 // future e+=K - orr w25,w25,w26 - add w22,w22,w27 // e+=rot(a,5) - ror w24,w24,#2 - add w21,w21,w16 // future e+=X[i] - add w22,w22,w25 // e+=F(b,c,d) -#ifdef __AARCH64EB__ - ror x17,x17,#32 -#else - rev32 x17,x17 -#endif - bic w25,w20,w23 - and w26,w24,w23 - ror w27,w22,#27 - add w20,w20,w28 // future e+=K - orr w25,w25,w26 - add w21,w21,w27 // e+=rot(a,5) - ror w23,w23,#2 - add w20,w20,w17 // future e+=X[i] - add w21,w21,w25 // e+=F(b,c,d) - lsr x19,x17,#32 - eor w3,w3,w5 - bic w25,w24,w22 - and w26,w23,w22 - ror w27,w21,#27 - eor w3,w3,w11 - add w24,w24,w28 // future e+=K - orr w25,w25,w26 - add w20,w20,w27 // e+=rot(a,5) - eor w3,w3,w16 - ror w22,w22,#2 - add w24,w24,w19 // future e+=X[i] - add w20,w20,w25 // e+=F(b,c,d) - ror w3,w3,#31 - eor w4,w4,w6 - bic w25,w23,w21 - and w26,w22,w21 - ror w27,w20,#27 - eor w4,w4,w12 - add w23,w23,w28 // future e+=K - orr w25,w25,w26 - add w24,w24,w27 // e+=rot(a,5) - eor w4,w4,w17 - ror w21,w21,#2 - add w23,w23,w3 // future e+=X[i] - add w24,w24,w25 // e+=F(b,c,d) - ror w4,w4,#31 - eor w5,w5,w7 - bic w25,w22,w20 - and w26,w21,w20 - ror w27,w24,#27 - eor w5,w5,w13 - add w22,w22,w28 // future e+=K - orr w25,w25,w26 - add w23,w23,w27 // e+=rot(a,5) - eor w5,w5,w19 - ror w20,w20,#2 - add w22,w22,w4 // future e+=X[i] - add w23,w23,w25 // e+=F(b,c,d) - ror w5,w5,#31 - eor w6,w6,w8 - bic w25,w21,w24 - and w26,w20,w24 - ror 
w27,w23,#27 - eor w6,w6,w14 - add w21,w21,w28 // future e+=K - orr w25,w25,w26 - add w22,w22,w27 // e+=rot(a,5) - eor w6,w6,w3 - ror w24,w24,#2 - add w21,w21,w5 // future e+=X[i] - add w22,w22,w25 // e+=F(b,c,d) - ror w6,w6,#31 - eor w7,w7,w9 - bic w25,w20,w23 - and w26,w24,w23 - ror w27,w22,#27 - eor w7,w7,w15 - add w20,w20,w28 // future e+=K - orr w25,w25,w26 - add w21,w21,w27 // e+=rot(a,5) - eor w7,w7,w4 - ror w23,w23,#2 - add w20,w20,w6 // future e+=X[i] - add w21,w21,w25 // e+=F(b,c,d) - ror w7,w7,#31 - movz w28,#0xeba1 - movk w28,#0x6ed9,lsl#16 - eor w8,w8,w10 - bic w25,w24,w22 - and w26,w23,w22 - ror w27,w21,#27 - eor w8,w8,w16 - add w24,w24,w28 // future e+=K - orr w25,w25,w26 - add w20,w20,w27 // e+=rot(a,5) - eor w8,w8,w5 - ror w22,w22,#2 - add w24,w24,w7 // future e+=X[i] - add w20,w20,w25 // e+=F(b,c,d) - ror w8,w8,#31 - eor w9,w9,w11 - eor w25,w23,w21 - ror w27,w20,#27 - add w23,w23,w28 // future e+=K - eor w9,w9,w17 - eor w25,w25,w22 - add w24,w24,w27 // e+=rot(a,5) - ror w21,w21,#2 - eor w9,w9,w6 - add w23,w23,w8 // future e+=X[i] - add w24,w24,w25 // e+=F(b,c,d) - ror w9,w9,#31 - eor w10,w10,w12 - eor w25,w22,w20 - ror w27,w24,#27 - add w22,w22,w28 // future e+=K - eor w10,w10,w19 - eor w25,w25,w21 - add w23,w23,w27 // e+=rot(a,5) - ror w20,w20,#2 - eor w10,w10,w7 - add w22,w22,w9 // future e+=X[i] - add w23,w23,w25 // e+=F(b,c,d) - ror w10,w10,#31 - eor w11,w11,w13 - eor w25,w21,w24 - ror w27,w23,#27 - add w21,w21,w28 // future e+=K - eor w11,w11,w3 - eor w25,w25,w20 - add w22,w22,w27 // e+=rot(a,5) - ror w24,w24,#2 - eor w11,w11,w8 - add w21,w21,w10 // future e+=X[i] - add w22,w22,w25 // e+=F(b,c,d) - ror w11,w11,#31 - eor w12,w12,w14 - eor w25,w20,w23 - ror w27,w22,#27 - add w20,w20,w28 // future e+=K - eor w12,w12,w4 - eor w25,w25,w24 - add w21,w21,w27 // e+=rot(a,5) - ror w23,w23,#2 - eor w12,w12,w9 - add w20,w20,w11 // future e+=X[i] - add w21,w21,w25 // e+=F(b,c,d) - ror w12,w12,#31 - eor w13,w13,w15 - eor w25,w24,w22 - ror w27,w21,#27 - add 
w24,w24,w28 // future e+=K - eor w13,w13,w5 - eor w25,w25,w23 - add w20,w20,w27 // e+=rot(a,5) - ror w22,w22,#2 - eor w13,w13,w10 - add w24,w24,w12 // future e+=X[i] - add w20,w20,w25 // e+=F(b,c,d) - ror w13,w13,#31 - eor w14,w14,w16 - eor w25,w23,w21 - ror w27,w20,#27 - add w23,w23,w28 // future e+=K - eor w14,w14,w6 - eor w25,w25,w22 - add w24,w24,w27 // e+=rot(a,5) - ror w21,w21,#2 - eor w14,w14,w11 - add w23,w23,w13 // future e+=X[i] - add w24,w24,w25 // e+=F(b,c,d) - ror w14,w14,#31 - eor w15,w15,w17 - eor w25,w22,w20 - ror w27,w24,#27 - add w22,w22,w28 // future e+=K - eor w15,w15,w7 - eor w25,w25,w21 - add w23,w23,w27 // e+=rot(a,5) - ror w20,w20,#2 - eor w15,w15,w12 - add w22,w22,w14 // future e+=X[i] - add w23,w23,w25 // e+=F(b,c,d) - ror w15,w15,#31 - eor w16,w16,w19 - eor w25,w21,w24 - ror w27,w23,#27 - add w21,w21,w28 // future e+=K - eor w16,w16,w8 - eor w25,w25,w20 - add w22,w22,w27 // e+=rot(a,5) - ror w24,w24,#2 - eor w16,w16,w13 - add w21,w21,w15 // future e+=X[i] - add w22,w22,w25 // e+=F(b,c,d) - ror w16,w16,#31 - eor w17,w17,w3 - eor w25,w20,w23 - ror w27,w22,#27 - add w20,w20,w28 // future e+=K - eor w17,w17,w9 - eor w25,w25,w24 - add w21,w21,w27 // e+=rot(a,5) - ror w23,w23,#2 - eor w17,w17,w14 - add w20,w20,w16 // future e+=X[i] - add w21,w21,w25 // e+=F(b,c,d) - ror w17,w17,#31 - eor w19,w19,w4 - eor w25,w24,w22 - ror w27,w21,#27 - add w24,w24,w28 // future e+=K - eor w19,w19,w10 - eor w25,w25,w23 - add w20,w20,w27 // e+=rot(a,5) - ror w22,w22,#2 - eor w19,w19,w15 - add w24,w24,w17 // future e+=X[i] - add w20,w20,w25 // e+=F(b,c,d) - ror w19,w19,#31 - eor w3,w3,w5 - eor w25,w23,w21 - ror w27,w20,#27 - add w23,w23,w28 // future e+=K - eor w3,w3,w11 - eor w25,w25,w22 - add w24,w24,w27 // e+=rot(a,5) - ror w21,w21,#2 - eor w3,w3,w16 - add w23,w23,w19 // future e+=X[i] - add w24,w24,w25 // e+=F(b,c,d) - ror w3,w3,#31 - eor w4,w4,w6 - eor w25,w22,w20 - ror w27,w24,#27 - add w22,w22,w28 // future e+=K - eor w4,w4,w12 - eor w25,w25,w21 - add 
w23,w23,w27 // e+=rot(a,5) - ror w20,w20,#2 - eor w4,w4,w17 - add w22,w22,w3 // future e+=X[i] - add w23,w23,w25 // e+=F(b,c,d) - ror w4,w4,#31 - eor w5,w5,w7 - eor w25,w21,w24 - ror w27,w23,#27 - add w21,w21,w28 // future e+=K - eor w5,w5,w13 - eor w25,w25,w20 - add w22,w22,w27 // e+=rot(a,5) - ror w24,w24,#2 - eor w5,w5,w19 - add w21,w21,w4 // future e+=X[i] - add w22,w22,w25 // e+=F(b,c,d) - ror w5,w5,#31 - eor w6,w6,w8 - eor w25,w20,w23 - ror w27,w22,#27 - add w20,w20,w28 // future e+=K - eor w6,w6,w14 - eor w25,w25,w24 - add w21,w21,w27 // e+=rot(a,5) - ror w23,w23,#2 - eor w6,w6,w3 - add w20,w20,w5 // future e+=X[i] - add w21,w21,w25 // e+=F(b,c,d) - ror w6,w6,#31 - eor w7,w7,w9 - eor w25,w24,w22 - ror w27,w21,#27 - add w24,w24,w28 // future e+=K - eor w7,w7,w15 - eor w25,w25,w23 - add w20,w20,w27 // e+=rot(a,5) - ror w22,w22,#2 - eor w7,w7,w4 - add w24,w24,w6 // future e+=X[i] - add w20,w20,w25 // e+=F(b,c,d) - ror w7,w7,#31 - eor w8,w8,w10 - eor w25,w23,w21 - ror w27,w20,#27 - add w23,w23,w28 // future e+=K - eor w8,w8,w16 - eor w25,w25,w22 - add w24,w24,w27 // e+=rot(a,5) - ror w21,w21,#2 - eor w8,w8,w5 - add w23,w23,w7 // future e+=X[i] - add w24,w24,w25 // e+=F(b,c,d) - ror w8,w8,#31 - eor w9,w9,w11 - eor w25,w22,w20 - ror w27,w24,#27 - add w22,w22,w28 // future e+=K - eor w9,w9,w17 - eor w25,w25,w21 - add w23,w23,w27 // e+=rot(a,5) - ror w20,w20,#2 - eor w9,w9,w6 - add w22,w22,w8 // future e+=X[i] - add w23,w23,w25 // e+=F(b,c,d) - ror w9,w9,#31 - eor w10,w10,w12 - eor w25,w21,w24 - ror w27,w23,#27 - add w21,w21,w28 // future e+=K - eor w10,w10,w19 - eor w25,w25,w20 - add w22,w22,w27 // e+=rot(a,5) - ror w24,w24,#2 - eor w10,w10,w7 - add w21,w21,w9 // future e+=X[i] - add w22,w22,w25 // e+=F(b,c,d) - ror w10,w10,#31 - eor w11,w11,w13 - eor w25,w20,w23 - ror w27,w22,#27 - add w20,w20,w28 // future e+=K - eor w11,w11,w3 - eor w25,w25,w24 - add w21,w21,w27 // e+=rot(a,5) - ror w23,w23,#2 - eor w11,w11,w8 - add w20,w20,w10 // future e+=X[i] - add 
w21,w21,w25 // e+=F(b,c,d) - ror w11,w11,#31 - movz w28,#0xbcdc - movk w28,#0x8f1b,lsl#16 - eor w12,w12,w14 - eor w25,w24,w22 - ror w27,w21,#27 - add w24,w24,w28 // future e+=K - eor w12,w12,w4 - eor w25,w25,w23 - add w20,w20,w27 // e+=rot(a,5) - ror w22,w22,#2 - eor w12,w12,w9 - add w24,w24,w11 // future e+=X[i] - add w20,w20,w25 // e+=F(b,c,d) - ror w12,w12,#31 - orr w25,w21,w22 - and w26,w21,w22 - eor w13,w13,w15 - ror w27,w20,#27 - and w25,w25,w23 - add w23,w23,w28 // future e+=K - eor w13,w13,w5 - add w24,w24,w27 // e+=rot(a,5) - orr w25,w25,w26 - ror w21,w21,#2 - eor w13,w13,w10 - add w23,w23,w12 // future e+=X[i] - add w24,w24,w25 // e+=F(b,c,d) - ror w13,w13,#31 - orr w25,w20,w21 - and w26,w20,w21 - eor w14,w14,w16 - ror w27,w24,#27 - and w25,w25,w22 - add w22,w22,w28 // future e+=K - eor w14,w14,w6 - add w23,w23,w27 // e+=rot(a,5) - orr w25,w25,w26 - ror w20,w20,#2 - eor w14,w14,w11 - add w22,w22,w13 // future e+=X[i] - add w23,w23,w25 // e+=F(b,c,d) - ror w14,w14,#31 - orr w25,w24,w20 - and w26,w24,w20 - eor w15,w15,w17 - ror w27,w23,#27 - and w25,w25,w21 - add w21,w21,w28 // future e+=K - eor w15,w15,w7 - add w22,w22,w27 // e+=rot(a,5) - orr w25,w25,w26 - ror w24,w24,#2 - eor w15,w15,w12 - add w21,w21,w14 // future e+=X[i] - add w22,w22,w25 // e+=F(b,c,d) - ror w15,w15,#31 - orr w25,w23,w24 - and w26,w23,w24 - eor w16,w16,w19 - ror w27,w22,#27 - and w25,w25,w20 - add w20,w20,w28 // future e+=K - eor w16,w16,w8 - add w21,w21,w27 // e+=rot(a,5) - orr w25,w25,w26 - ror w23,w23,#2 - eor w16,w16,w13 - add w20,w20,w15 // future e+=X[i] - add w21,w21,w25 // e+=F(b,c,d) - ror w16,w16,#31 - orr w25,w22,w23 - and w26,w22,w23 - eor w17,w17,w3 - ror w27,w21,#27 - and w25,w25,w24 - add w24,w24,w28 // future e+=K - eor w17,w17,w9 - add w20,w20,w27 // e+=rot(a,5) - orr w25,w25,w26 - ror w22,w22,#2 - eor w17,w17,w14 - add w24,w24,w16 // future e+=X[i] - add w20,w20,w25 // e+=F(b,c,d) - ror w17,w17,#31 - orr w25,w21,w22 - and w26,w21,w22 - eor w19,w19,w4 - ror 
w27,w20,#27 - and w25,w25,w23 - add w23,w23,w28 // future e+=K - eor w19,w19,w10 - add w24,w24,w27 // e+=rot(a,5) - orr w25,w25,w26 - ror w21,w21,#2 - eor w19,w19,w15 - add w23,w23,w17 // future e+=X[i] - add w24,w24,w25 // e+=F(b,c,d) - ror w19,w19,#31 - orr w25,w20,w21 - and w26,w20,w21 - eor w3,w3,w5 - ror w27,w24,#27 - and w25,w25,w22 - add w22,w22,w28 // future e+=K - eor w3,w3,w11 - add w23,w23,w27 // e+=rot(a,5) - orr w25,w25,w26 - ror w20,w20,#2 - eor w3,w3,w16 - add w22,w22,w19 // future e+=X[i] - add w23,w23,w25 // e+=F(b,c,d) - ror w3,w3,#31 - orr w25,w24,w20 - and w26,w24,w20 - eor w4,w4,w6 - ror w27,w23,#27 - and w25,w25,w21 - add w21,w21,w28 // future e+=K - eor w4,w4,w12 - add w22,w22,w27 // e+=rot(a,5) - orr w25,w25,w26 - ror w24,w24,#2 - eor w4,w4,w17 - add w21,w21,w3 // future e+=X[i] - add w22,w22,w25 // e+=F(b,c,d) - ror w4,w4,#31 - orr w25,w23,w24 - and w26,w23,w24 - eor w5,w5,w7 - ror w27,w22,#27 - and w25,w25,w20 - add w20,w20,w28 // future e+=K - eor w5,w5,w13 - add w21,w21,w27 // e+=rot(a,5) - orr w25,w25,w26 - ror w23,w23,#2 - eor w5,w5,w19 - add w20,w20,w4 // future e+=X[i] - add w21,w21,w25 // e+=F(b,c,d) - ror w5,w5,#31 - orr w25,w22,w23 - and w26,w22,w23 - eor w6,w6,w8 - ror w27,w21,#27 - and w25,w25,w24 - add w24,w24,w28 // future e+=K - eor w6,w6,w14 - add w20,w20,w27 // e+=rot(a,5) - orr w25,w25,w26 - ror w22,w22,#2 - eor w6,w6,w3 - add w24,w24,w5 // future e+=X[i] - add w20,w20,w25 // e+=F(b,c,d) - ror w6,w6,#31 - orr w25,w21,w22 - and w26,w21,w22 - eor w7,w7,w9 - ror w27,w20,#27 - and w25,w25,w23 - add w23,w23,w28 // future e+=K - eor w7,w7,w15 - add w24,w24,w27 // e+=rot(a,5) - orr w25,w25,w26 - ror w21,w21,#2 - eor w7,w7,w4 - add w23,w23,w6 // future e+=X[i] - add w24,w24,w25 // e+=F(b,c,d) - ror w7,w7,#31 - orr w25,w20,w21 - and w26,w20,w21 - eor w8,w8,w10 - ror w27,w24,#27 - and w25,w25,w22 - add w22,w22,w28 // future e+=K - eor w8,w8,w16 - add w23,w23,w27 // e+=rot(a,5) - orr w25,w25,w26 - ror w20,w20,#2 - eor w8,w8,w5 - add 
w22,w22,w7 // future e+=X[i] - add w23,w23,w25 // e+=F(b,c,d) - ror w8,w8,#31 - orr w25,w24,w20 - and w26,w24,w20 - eor w9,w9,w11 - ror w27,w23,#27 - and w25,w25,w21 - add w21,w21,w28 // future e+=K - eor w9,w9,w17 - add w22,w22,w27 // e+=rot(a,5) - orr w25,w25,w26 - ror w24,w24,#2 - eor w9,w9,w6 - add w21,w21,w8 // future e+=X[i] - add w22,w22,w25 // e+=F(b,c,d) - ror w9,w9,#31 - orr w25,w23,w24 - and w26,w23,w24 - eor w10,w10,w12 - ror w27,w22,#27 - and w25,w25,w20 - add w20,w20,w28 // future e+=K - eor w10,w10,w19 - add w21,w21,w27 // e+=rot(a,5) - orr w25,w25,w26 - ror w23,w23,#2 - eor w10,w10,w7 - add w20,w20,w9 // future e+=X[i] - add w21,w21,w25 // e+=F(b,c,d) - ror w10,w10,#31 - orr w25,w22,w23 - and w26,w22,w23 - eor w11,w11,w13 - ror w27,w21,#27 - and w25,w25,w24 - add w24,w24,w28 // future e+=K - eor w11,w11,w3 - add w20,w20,w27 // e+=rot(a,5) - orr w25,w25,w26 - ror w22,w22,#2 - eor w11,w11,w8 - add w24,w24,w10 // future e+=X[i] - add w20,w20,w25 // e+=F(b,c,d) - ror w11,w11,#31 - orr w25,w21,w22 - and w26,w21,w22 - eor w12,w12,w14 - ror w27,w20,#27 - and w25,w25,w23 - add w23,w23,w28 // future e+=K - eor w12,w12,w4 - add w24,w24,w27 // e+=rot(a,5) - orr w25,w25,w26 - ror w21,w21,#2 - eor w12,w12,w9 - add w23,w23,w11 // future e+=X[i] - add w24,w24,w25 // e+=F(b,c,d) - ror w12,w12,#31 - orr w25,w20,w21 - and w26,w20,w21 - eor w13,w13,w15 - ror w27,w24,#27 - and w25,w25,w22 - add w22,w22,w28 // future e+=K - eor w13,w13,w5 - add w23,w23,w27 // e+=rot(a,5) - orr w25,w25,w26 - ror w20,w20,#2 - eor w13,w13,w10 - add w22,w22,w12 // future e+=X[i] - add w23,w23,w25 // e+=F(b,c,d) - ror w13,w13,#31 - orr w25,w24,w20 - and w26,w24,w20 - eor w14,w14,w16 - ror w27,w23,#27 - and w25,w25,w21 - add w21,w21,w28 // future e+=K - eor w14,w14,w6 - add w22,w22,w27 // e+=rot(a,5) - orr w25,w25,w26 - ror w24,w24,#2 - eor w14,w14,w11 - add w21,w21,w13 // future e+=X[i] - add w22,w22,w25 // e+=F(b,c,d) - ror w14,w14,#31 - orr w25,w23,w24 - and w26,w23,w24 - eor w15,w15,w17 - 
ror w27,w22,#27 - and w25,w25,w20 - add w20,w20,w28 // future e+=K - eor w15,w15,w7 - add w21,w21,w27 // e+=rot(a,5) - orr w25,w25,w26 - ror w23,w23,#2 - eor w15,w15,w12 - add w20,w20,w14 // future e+=X[i] - add w21,w21,w25 // e+=F(b,c,d) - ror w15,w15,#31 - movz w28,#0xc1d6 - movk w28,#0xca62,lsl#16 - orr w25,w22,w23 - and w26,w22,w23 - eor w16,w16,w19 - ror w27,w21,#27 - and w25,w25,w24 - add w24,w24,w28 // future e+=K - eor w16,w16,w8 - add w20,w20,w27 // e+=rot(a,5) - orr w25,w25,w26 - ror w22,w22,#2 - eor w16,w16,w13 - add w24,w24,w15 // future e+=X[i] - add w20,w20,w25 // e+=F(b,c,d) - ror w16,w16,#31 - eor w17,w17,w3 - eor w25,w23,w21 - ror w27,w20,#27 - add w23,w23,w28 // future e+=K - eor w17,w17,w9 - eor w25,w25,w22 - add w24,w24,w27 // e+=rot(a,5) - ror w21,w21,#2 - eor w17,w17,w14 - add w23,w23,w16 // future e+=X[i] - add w24,w24,w25 // e+=F(b,c,d) - ror w17,w17,#31 - eor w19,w19,w4 - eor w25,w22,w20 - ror w27,w24,#27 - add w22,w22,w28 // future e+=K - eor w19,w19,w10 - eor w25,w25,w21 - add w23,w23,w27 // e+=rot(a,5) - ror w20,w20,#2 - eor w19,w19,w15 - add w22,w22,w17 // future e+=X[i] - add w23,w23,w25 // e+=F(b,c,d) - ror w19,w19,#31 - eor w3,w3,w5 - eor w25,w21,w24 - ror w27,w23,#27 - add w21,w21,w28 // future e+=K - eor w3,w3,w11 - eor w25,w25,w20 - add w22,w22,w27 // e+=rot(a,5) - ror w24,w24,#2 - eor w3,w3,w16 - add w21,w21,w19 // future e+=X[i] - add w22,w22,w25 // e+=F(b,c,d) - ror w3,w3,#31 - eor w4,w4,w6 - eor w25,w20,w23 - ror w27,w22,#27 - add w20,w20,w28 // future e+=K - eor w4,w4,w12 - eor w25,w25,w24 - add w21,w21,w27 // e+=rot(a,5) - ror w23,w23,#2 - eor w4,w4,w17 - add w20,w20,w3 // future e+=X[i] - add w21,w21,w25 // e+=F(b,c,d) - ror w4,w4,#31 - eor w5,w5,w7 - eor w25,w24,w22 - ror w27,w21,#27 - add w24,w24,w28 // future e+=K - eor w5,w5,w13 - eor w25,w25,w23 - add w20,w20,w27 // e+=rot(a,5) - ror w22,w22,#2 - eor w5,w5,w19 - add w24,w24,w4 // future e+=X[i] - add w20,w20,w25 // e+=F(b,c,d) - ror w5,w5,#31 - eor w6,w6,w8 - eor 
w25,w23,w21 - ror w27,w20,#27 - add w23,w23,w28 // future e+=K - eor w6,w6,w14 - eor w25,w25,w22 - add w24,w24,w27 // e+=rot(a,5) - ror w21,w21,#2 - eor w6,w6,w3 - add w23,w23,w5 // future e+=X[i] - add w24,w24,w25 // e+=F(b,c,d) - ror w6,w6,#31 - eor w7,w7,w9 - eor w25,w22,w20 - ror w27,w24,#27 - add w22,w22,w28 // future e+=K - eor w7,w7,w15 - eor w25,w25,w21 - add w23,w23,w27 // e+=rot(a,5) - ror w20,w20,#2 - eor w7,w7,w4 - add w22,w22,w6 // future e+=X[i] - add w23,w23,w25 // e+=F(b,c,d) - ror w7,w7,#31 - eor w8,w8,w10 - eor w25,w21,w24 - ror w27,w23,#27 - add w21,w21,w28 // future e+=K - eor w8,w8,w16 - eor w25,w25,w20 - add w22,w22,w27 // e+=rot(a,5) - ror w24,w24,#2 - eor w8,w8,w5 - add w21,w21,w7 // future e+=X[i] - add w22,w22,w25 // e+=F(b,c,d) - ror w8,w8,#31 - eor w9,w9,w11 - eor w25,w20,w23 - ror w27,w22,#27 - add w20,w20,w28 // future e+=K - eor w9,w9,w17 - eor w25,w25,w24 - add w21,w21,w27 // e+=rot(a,5) - ror w23,w23,#2 - eor w9,w9,w6 - add w20,w20,w8 // future e+=X[i] - add w21,w21,w25 // e+=F(b,c,d) - ror w9,w9,#31 - eor w10,w10,w12 - eor w25,w24,w22 - ror w27,w21,#27 - add w24,w24,w28 // future e+=K - eor w10,w10,w19 - eor w25,w25,w23 - add w20,w20,w27 // e+=rot(a,5) - ror w22,w22,#2 - eor w10,w10,w7 - add w24,w24,w9 // future e+=X[i] - add w20,w20,w25 // e+=F(b,c,d) - ror w10,w10,#31 - eor w11,w11,w13 - eor w25,w23,w21 - ror w27,w20,#27 - add w23,w23,w28 // future e+=K - eor w11,w11,w3 - eor w25,w25,w22 - add w24,w24,w27 // e+=rot(a,5) - ror w21,w21,#2 - eor w11,w11,w8 - add w23,w23,w10 // future e+=X[i] - add w24,w24,w25 // e+=F(b,c,d) - ror w11,w11,#31 - eor w12,w12,w14 - eor w25,w22,w20 - ror w27,w24,#27 - add w22,w22,w28 // future e+=K - eor w12,w12,w4 - eor w25,w25,w21 - add w23,w23,w27 // e+=rot(a,5) - ror w20,w20,#2 - eor w12,w12,w9 - add w22,w22,w11 // future e+=X[i] - add w23,w23,w25 // e+=F(b,c,d) - ror w12,w12,#31 - eor w13,w13,w15 - eor w25,w21,w24 - ror w27,w23,#27 - add w21,w21,w28 // future e+=K - eor w13,w13,w5 - eor w25,w25,w20 
- add w22,w22,w27 // e+=rot(a,5) - ror w24,w24,#2 - eor w13,w13,w10 - add w21,w21,w12 // future e+=X[i] - add w22,w22,w25 // e+=F(b,c,d) - ror w13,w13,#31 - eor w14,w14,w16 - eor w25,w20,w23 - ror w27,w22,#27 - add w20,w20,w28 // future e+=K - eor w14,w14,w6 - eor w25,w25,w24 - add w21,w21,w27 // e+=rot(a,5) - ror w23,w23,#2 - eor w14,w14,w11 - add w20,w20,w13 // future e+=X[i] - add w21,w21,w25 // e+=F(b,c,d) - ror w14,w14,#31 - eor w15,w15,w17 - eor w25,w24,w22 - ror w27,w21,#27 - add w24,w24,w28 // future e+=K - eor w15,w15,w7 - eor w25,w25,w23 - add w20,w20,w27 // e+=rot(a,5) - ror w22,w22,#2 - eor w15,w15,w12 - add w24,w24,w14 // future e+=X[i] - add w20,w20,w25 // e+=F(b,c,d) - ror w15,w15,#31 - eor w16,w16,w19 - eor w25,w23,w21 - ror w27,w20,#27 - add w23,w23,w28 // future e+=K - eor w16,w16,w8 - eor w25,w25,w22 - add w24,w24,w27 // e+=rot(a,5) - ror w21,w21,#2 - eor w16,w16,w13 - add w23,w23,w15 // future e+=X[i] - add w24,w24,w25 // e+=F(b,c,d) - ror w16,w16,#31 - eor w17,w17,w3 - eor w25,w22,w20 - ror w27,w24,#27 - add w22,w22,w28 // future e+=K - eor w17,w17,w9 - eor w25,w25,w21 - add w23,w23,w27 // e+=rot(a,5) - ror w20,w20,#2 - eor w17,w17,w14 - add w22,w22,w16 // future e+=X[i] - add w23,w23,w25 // e+=F(b,c,d) - ror w17,w17,#31 - eor w19,w19,w4 - eor w25,w21,w24 - ror w27,w23,#27 - add w21,w21,w28 // future e+=K - eor w19,w19,w10 - eor w25,w25,w20 - add w22,w22,w27 // e+=rot(a,5) - ror w24,w24,#2 - eor w19,w19,w15 - add w21,w21,w17 // future e+=X[i] - add w22,w22,w25 // e+=F(b,c,d) - ror w19,w19,#31 - ldp w4,w5,[x0] - eor w25,w20,w23 - ror w27,w22,#27 - add w20,w20,w28 // future e+=K - eor w25,w25,w24 - add w21,w21,w27 // e+=rot(a,5) - ror w23,w23,#2 - add w20,w20,w19 // future e+=X[i] - add w21,w21,w25 // e+=F(b,c,d) - ldp w6,w7,[x0,#8] - eor w25,w24,w22 - ror w27,w21,#27 - eor w25,w25,w23 - add w20,w20,w27 // e+=rot(a,5) - ror w22,w22,#2 - ldr w8,[x0,#16] - add w20,w20,w25 // e+=F(b,c,d) - add w21,w21,w5 - add w22,w22,w6 - add w20,w20,w4 - add 
w23,w23,w7 - add w24,w24,w8 - stp w20,w21,[x0] - stp w22,w23,[x0,#8] - str w24,[x0,#16] - cbnz x2,Loop - - ldp x19,x20,[sp,#16] - ldp x21,x22,[sp,#32] - ldp x23,x24,[sp,#48] - ldp x25,x26,[sp,#64] - ldp x27,x28,[sp,#80] - ldr x29,[sp],#96 - ret - - -.align 6 -sha1_block_armv8: - // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. - AARCH64_VALID_CALL_TARGET -Lv8_entry: - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - - adrp x4,Lconst@PAGE - add x4,x4,Lconst@PAGEOFF - eor v1.16b,v1.16b,v1.16b - ld1 {v0.4s},[x0],#16 - ld1 {v1.s}[0],[x0] - sub x0,x0,#16 - ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x4] - -Loop_hw: - ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 - sub x2,x2,#1 - rev32 v4.16b,v4.16b - rev32 v5.16b,v5.16b - - add v20.4s,v16.4s,v4.4s - rev32 v6.16b,v6.16b - orr v22.16b,v0.16b,v0.16b // offload - - add v21.4s,v16.4s,v5.4s - rev32 v7.16b,v7.16b -.long 0x5e280803 //sha1h v3.16b,v0.16b -.long 0x5e140020 //sha1c v0.16b,v1.16b,v20.4s // 0 - add v20.4s,v16.4s,v6.4s -.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b -.long 0x5e280802 //sha1h v2.16b,v0.16b // 1 -.long 0x5e150060 //sha1c v0.16b,v3.16b,v21.4s - add v21.4s,v16.4s,v7.4s -.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b -.long 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b -.long 0x5e280803 //sha1h v3.16b,v0.16b // 2 -.long 0x5e140040 //sha1c v0.16b,v2.16b,v20.4s - add v20.4s,v16.4s,v4.4s -.long 0x5e281885 //sha1su1 v5.16b,v4.16b -.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b -.long 0x5e280802 //sha1h v2.16b,v0.16b // 3 -.long 0x5e150060 //sha1c v0.16b,v3.16b,v21.4s - add v21.4s,v17.4s,v5.4s -.long 0x5e2818a6 //sha1su1 v6.16b,v5.16b -.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b -.long 0x5e280803 //sha1h v3.16b,v0.16b // 4 -.long 0x5e140040 //sha1c v0.16b,v2.16b,v20.4s - add v20.4s,v17.4s,v6.4s -.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b -.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b -.long 0x5e280802 //sha1h v2.16b,v0.16b // 5 -.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s - add v21.4s,v17.4s,v7.4s 
-.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b -.long 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b -.long 0x5e280803 //sha1h v3.16b,v0.16b // 6 -.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s - add v20.4s,v17.4s,v4.4s -.long 0x5e281885 //sha1su1 v5.16b,v4.16b -.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b -.long 0x5e280802 //sha1h v2.16b,v0.16b // 7 -.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s - add v21.4s,v17.4s,v5.4s -.long 0x5e2818a6 //sha1su1 v6.16b,v5.16b -.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b -.long 0x5e280803 //sha1h v3.16b,v0.16b // 8 -.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s - add v20.4s,v18.4s,v6.4s -.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b -.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b -.long 0x5e280802 //sha1h v2.16b,v0.16b // 9 -.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s - add v21.4s,v18.4s,v7.4s -.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b -.long 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b -.long 0x5e280803 //sha1h v3.16b,v0.16b // 10 -.long 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s - add v20.4s,v18.4s,v4.4s -.long 0x5e281885 //sha1su1 v5.16b,v4.16b -.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b -.long 0x5e280802 //sha1h v2.16b,v0.16b // 11 -.long 0x5e152060 //sha1m v0.16b,v3.16b,v21.4s - add v21.4s,v18.4s,v5.4s -.long 0x5e2818a6 //sha1su1 v6.16b,v5.16b -.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b -.long 0x5e280803 //sha1h v3.16b,v0.16b // 12 -.long 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s - add v20.4s,v18.4s,v6.4s -.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b -.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b -.long 0x5e280802 //sha1h v2.16b,v0.16b // 13 -.long 0x5e152060 //sha1m v0.16b,v3.16b,v21.4s - add v21.4s,v19.4s,v7.4s -.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b -.long 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b -.long 0x5e280803 //sha1h v3.16b,v0.16b // 14 -.long 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s - add v20.4s,v19.4s,v4.4s -.long 0x5e281885 //sha1su1 v5.16b,v4.16b -.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b -.long 0x5e280802 
//sha1h v2.16b,v0.16b // 15 -.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s - add v21.4s,v19.4s,v5.4s -.long 0x5e2818a6 //sha1su1 v6.16b,v5.16b -.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b -.long 0x5e280803 //sha1h v3.16b,v0.16b // 16 -.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s - add v20.4s,v19.4s,v6.4s -.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b -.long 0x5e280802 //sha1h v2.16b,v0.16b // 17 -.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s - add v21.4s,v19.4s,v7.4s - -.long 0x5e280803 //sha1h v3.16b,v0.16b // 18 -.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s - -.long 0x5e280802 //sha1h v2.16b,v0.16b // 19 -.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s - - add v1.4s,v1.4s,v2.4s - add v0.4s,v0.4s,v22.4s - - cbnz x2,Loop_hw - - st1 {v0.4s},[x0],#16 - st1 {v1.s}[0],[x0] - - ldr x29,[sp],#16 - ret - -.section __TEXT,__const -.align 6 -Lconst: -.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 //K_00_19 -.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 //K_20_39 -.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc //K_40_59 -.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79 -.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 -.align 2 -#endif // !OPENSSL_NO_ASM diff --git a/third_party/boringssl/apple-aarch64/crypto/fipsmodule/sha256-armv8.S b/third_party/boringssl/apple-aarch64/crypto/fipsmodule/sha256-armv8.S deleted file mode 100644 index b40b260f..00000000 --- a/third_party/boringssl/apple-aarch64/crypto/fipsmodule/sha256-armv8.S +++ /dev/null @@ -1,1212 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. -// -// Licensed under the OpenSSL license (the "License"). You may not use -// this file except in compliance with the License. You can obtain a copy -// in the file LICENSE in the source distribution or at -// https://www.openssl.org/source/license.html - -// ==================================================================== -// Written by Andy Polyakov for the OpenSSL -// project. The module is, however, dual licensed under OpenSSL and -// CRYPTOGAMS licenses depending on where you obtain it. For further -// details see http://www.openssl.org/~appro/cryptogams/. -// -// Permission to use under GPLv2 terms is granted. -// ==================================================================== -// -// SHA256/512 for ARMv8. -// -// Performance in cycles per processed byte and improvement coefficient -// over code generated with "default" compiler: -// -// SHA256-hw SHA256(*) SHA512 -// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) -// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) -// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) -// Denver 2.01 10.5 (+26%) 6.70 (+8%) -// X-Gene 20.0 (+100%) 12.8 (+300%(***)) -// Mongoose 2.36 13.0 (+50%) 8.36 (+33%) -// Kryo 1.92 17.4 (+30%) 11.2 (+8%) -// -// (*) Software SHA256 results are of lesser relevance, presented -// mostly for informational purposes. -// (**) The result is a trade-off: it's possible to improve it by -// 10% (or by 1 cycle per round), but at the cost of 20% loss -// on Cortex-A53 (or by 4 cycles per round). 
-// (***) Super-impressive coefficients over gcc-generated code are -// indication of some compiler "pathology", most notably code -// generated with -mgeneral-regs-only is significantly faster -// and the gap is only 40-90%. - -#ifndef __KERNEL__ -# include -#endif - -.text - - -.private_extern _OPENSSL_armcap_P -.globl _sha256_block_data_order -.private_extern _sha256_block_data_order - -.align 6 -_sha256_block_data_order: - AARCH64_VALID_CALL_TARGET -#ifndef __KERNEL__ -#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10 - adrp x16,:pg_hi21_nc:_OPENSSL_armcap_P -#else - adrp x16,_OPENSSL_armcap_P@PAGE -#endif - ldr w16,[x16,_OPENSSL_armcap_P@PAGEOFF] - tst w16,#ARMV8_SHA256 - b.ne Lv8_entry -#endif - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - sub sp,sp,#4*4 - - ldp w20,w21,[x0] // load context - ldp w22,w23,[x0,#2*4] - ldp w24,w25,[x0,#4*4] - add x2,x1,x2,lsl#6 // end of input - ldp w26,w27,[x0,#6*4] - adrp x30,LK256@PAGE - add x30,x30,LK256@PAGEOFF - stp x0,x2,[x29,#96] - -Loop: - ldp w3,w4,[x1],#2*4 - ldr w19,[x30],#4 // *K++ - eor w28,w21,w22 // magic seed - str x1,[x29,#112] -#ifndef __AARCH64EB__ - rev w3,w3 // 0 -#endif - ror w16,w24,#6 - add w27,w27,w19 // h+=K[i] - eor w6,w24,w24,ror#14 - and w17,w25,w24 - bic w19,w26,w24 - add w27,w27,w3 // h+=X[i] - orr w17,w17,w19 // Ch(e,f,g) - eor w19,w20,w21 // a^b, b^c in next round - eor w16,w16,w6,ror#11 // Sigma1(e) - ror w6,w20,#2 - add w27,w27,w17 // h+=Ch(e,f,g) - eor w17,w20,w20,ror#9 - add w27,w27,w16 // h+=Sigma1(e) - and w28,w28,w19 // (b^c)&=(a^b) - add w23,w23,w27 // d+=h - eor w28,w28,w21 // Maj(a,b,c) - eor w17,w6,w17,ror#13 // Sigma0(a) - add w27,w27,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round - //add w27,w27,w17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev w4,w4 // 1 -#endif - ldp w5,w6,[x1],#2*4 - add w27,w27,w17 // 
h+=Sigma0(a) - ror w16,w23,#6 - add w26,w26,w28 // h+=K[i] - eor w7,w23,w23,ror#14 - and w17,w24,w23 - bic w28,w25,w23 - add w26,w26,w4 // h+=X[i] - orr w17,w17,w28 // Ch(e,f,g) - eor w28,w27,w20 // a^b, b^c in next round - eor w16,w16,w7,ror#11 // Sigma1(e) - ror w7,w27,#2 - add w26,w26,w17 // h+=Ch(e,f,g) - eor w17,w27,w27,ror#9 - add w26,w26,w16 // h+=Sigma1(e) - and w19,w19,w28 // (b^c)&=(a^b) - add w22,w22,w26 // d+=h - eor w19,w19,w20 // Maj(a,b,c) - eor w17,w7,w17,ror#13 // Sigma0(a) - add w26,w26,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round - //add w26,w26,w17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev w5,w5 // 2 -#endif - add w26,w26,w17 // h+=Sigma0(a) - ror w16,w22,#6 - add w25,w25,w19 // h+=K[i] - eor w8,w22,w22,ror#14 - and w17,w23,w22 - bic w19,w24,w22 - add w25,w25,w5 // h+=X[i] - orr w17,w17,w19 // Ch(e,f,g) - eor w19,w26,w27 // a^b, b^c in next round - eor w16,w16,w8,ror#11 // Sigma1(e) - ror w8,w26,#2 - add w25,w25,w17 // h+=Ch(e,f,g) - eor w17,w26,w26,ror#9 - add w25,w25,w16 // h+=Sigma1(e) - and w28,w28,w19 // (b^c)&=(a^b) - add w21,w21,w25 // d+=h - eor w28,w28,w27 // Maj(a,b,c) - eor w17,w8,w17,ror#13 // Sigma0(a) - add w25,w25,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round - //add w25,w25,w17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev w6,w6 // 3 -#endif - ldp w7,w8,[x1],#2*4 - add w25,w25,w17 // h+=Sigma0(a) - ror w16,w21,#6 - add w24,w24,w28 // h+=K[i] - eor w9,w21,w21,ror#14 - and w17,w22,w21 - bic w28,w23,w21 - add w24,w24,w6 // h+=X[i] - orr w17,w17,w28 // Ch(e,f,g) - eor w28,w25,w26 // a^b, b^c in next round - eor w16,w16,w9,ror#11 // Sigma1(e) - ror w9,w25,#2 - add w24,w24,w17 // h+=Ch(e,f,g) - eor w17,w25,w25,ror#9 - add w24,w24,w16 // h+=Sigma1(e) - and w19,w19,w28 // (b^c)&=(a^b) - add w20,w20,w24 // d+=h - eor w19,w19,w26 // Maj(a,b,c) - eor w17,w9,w17,ror#13 // Sigma0(a) - add w24,w24,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round - //add w24,w24,w17 // h+=Sigma0(a) 
-#ifndef __AARCH64EB__ - rev w7,w7 // 4 -#endif - add w24,w24,w17 // h+=Sigma0(a) - ror w16,w20,#6 - add w23,w23,w19 // h+=K[i] - eor w10,w20,w20,ror#14 - and w17,w21,w20 - bic w19,w22,w20 - add w23,w23,w7 // h+=X[i] - orr w17,w17,w19 // Ch(e,f,g) - eor w19,w24,w25 // a^b, b^c in next round - eor w16,w16,w10,ror#11 // Sigma1(e) - ror w10,w24,#2 - add w23,w23,w17 // h+=Ch(e,f,g) - eor w17,w24,w24,ror#9 - add w23,w23,w16 // h+=Sigma1(e) - and w28,w28,w19 // (b^c)&=(a^b) - add w27,w27,w23 // d+=h - eor w28,w28,w25 // Maj(a,b,c) - eor w17,w10,w17,ror#13 // Sigma0(a) - add w23,w23,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round - //add w23,w23,w17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev w8,w8 // 5 -#endif - ldp w9,w10,[x1],#2*4 - add w23,w23,w17 // h+=Sigma0(a) - ror w16,w27,#6 - add w22,w22,w28 // h+=K[i] - eor w11,w27,w27,ror#14 - and w17,w20,w27 - bic w28,w21,w27 - add w22,w22,w8 // h+=X[i] - orr w17,w17,w28 // Ch(e,f,g) - eor w28,w23,w24 // a^b, b^c in next round - eor w16,w16,w11,ror#11 // Sigma1(e) - ror w11,w23,#2 - add w22,w22,w17 // h+=Ch(e,f,g) - eor w17,w23,w23,ror#9 - add w22,w22,w16 // h+=Sigma1(e) - and w19,w19,w28 // (b^c)&=(a^b) - add w26,w26,w22 // d+=h - eor w19,w19,w24 // Maj(a,b,c) - eor w17,w11,w17,ror#13 // Sigma0(a) - add w22,w22,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round - //add w22,w22,w17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev w9,w9 // 6 -#endif - add w22,w22,w17 // h+=Sigma0(a) - ror w16,w26,#6 - add w21,w21,w19 // h+=K[i] - eor w12,w26,w26,ror#14 - and w17,w27,w26 - bic w19,w20,w26 - add w21,w21,w9 // h+=X[i] - orr w17,w17,w19 // Ch(e,f,g) - eor w19,w22,w23 // a^b, b^c in next round - eor w16,w16,w12,ror#11 // Sigma1(e) - ror w12,w22,#2 - add w21,w21,w17 // h+=Ch(e,f,g) - eor w17,w22,w22,ror#9 - add w21,w21,w16 // h+=Sigma1(e) - and w28,w28,w19 // (b^c)&=(a^b) - add w25,w25,w21 // d+=h - eor w28,w28,w23 // Maj(a,b,c) - eor w17,w12,w17,ror#13 // Sigma0(a) - add w21,w21,w28 // h+=Maj(a,b,c) 
- ldr w28,[x30],#4 // *K++, w19 in next round - //add w21,w21,w17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev w10,w10 // 7 -#endif - ldp w11,w12,[x1],#2*4 - add w21,w21,w17 // h+=Sigma0(a) - ror w16,w25,#6 - add w20,w20,w28 // h+=K[i] - eor w13,w25,w25,ror#14 - and w17,w26,w25 - bic w28,w27,w25 - add w20,w20,w10 // h+=X[i] - orr w17,w17,w28 // Ch(e,f,g) - eor w28,w21,w22 // a^b, b^c in next round - eor w16,w16,w13,ror#11 // Sigma1(e) - ror w13,w21,#2 - add w20,w20,w17 // h+=Ch(e,f,g) - eor w17,w21,w21,ror#9 - add w20,w20,w16 // h+=Sigma1(e) - and w19,w19,w28 // (b^c)&=(a^b) - add w24,w24,w20 // d+=h - eor w19,w19,w22 // Maj(a,b,c) - eor w17,w13,w17,ror#13 // Sigma0(a) - add w20,w20,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round - //add w20,w20,w17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev w11,w11 // 8 -#endif - add w20,w20,w17 // h+=Sigma0(a) - ror w16,w24,#6 - add w27,w27,w19 // h+=K[i] - eor w14,w24,w24,ror#14 - and w17,w25,w24 - bic w19,w26,w24 - add w27,w27,w11 // h+=X[i] - orr w17,w17,w19 // Ch(e,f,g) - eor w19,w20,w21 // a^b, b^c in next round - eor w16,w16,w14,ror#11 // Sigma1(e) - ror w14,w20,#2 - add w27,w27,w17 // h+=Ch(e,f,g) - eor w17,w20,w20,ror#9 - add w27,w27,w16 // h+=Sigma1(e) - and w28,w28,w19 // (b^c)&=(a^b) - add w23,w23,w27 // d+=h - eor w28,w28,w21 // Maj(a,b,c) - eor w17,w14,w17,ror#13 // Sigma0(a) - add w27,w27,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round - //add w27,w27,w17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev w12,w12 // 9 -#endif - ldp w13,w14,[x1],#2*4 - add w27,w27,w17 // h+=Sigma0(a) - ror w16,w23,#6 - add w26,w26,w28 // h+=K[i] - eor w15,w23,w23,ror#14 - and w17,w24,w23 - bic w28,w25,w23 - add w26,w26,w12 // h+=X[i] - orr w17,w17,w28 // Ch(e,f,g) - eor w28,w27,w20 // a^b, b^c in next round - eor w16,w16,w15,ror#11 // Sigma1(e) - ror w15,w27,#2 - add w26,w26,w17 // h+=Ch(e,f,g) - eor w17,w27,w27,ror#9 - add w26,w26,w16 // h+=Sigma1(e) - and w19,w19,w28 // (b^c)&=(a^b) - add 
w22,w22,w26 // d+=h - eor w19,w19,w20 // Maj(a,b,c) - eor w17,w15,w17,ror#13 // Sigma0(a) - add w26,w26,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round - //add w26,w26,w17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev w13,w13 // 10 -#endif - add w26,w26,w17 // h+=Sigma0(a) - ror w16,w22,#6 - add w25,w25,w19 // h+=K[i] - eor w0,w22,w22,ror#14 - and w17,w23,w22 - bic w19,w24,w22 - add w25,w25,w13 // h+=X[i] - orr w17,w17,w19 // Ch(e,f,g) - eor w19,w26,w27 // a^b, b^c in next round - eor w16,w16,w0,ror#11 // Sigma1(e) - ror w0,w26,#2 - add w25,w25,w17 // h+=Ch(e,f,g) - eor w17,w26,w26,ror#9 - add w25,w25,w16 // h+=Sigma1(e) - and w28,w28,w19 // (b^c)&=(a^b) - add w21,w21,w25 // d+=h - eor w28,w28,w27 // Maj(a,b,c) - eor w17,w0,w17,ror#13 // Sigma0(a) - add w25,w25,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round - //add w25,w25,w17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev w14,w14 // 11 -#endif - ldp w15,w0,[x1],#2*4 - add w25,w25,w17 // h+=Sigma0(a) - str w6,[sp,#12] - ror w16,w21,#6 - add w24,w24,w28 // h+=K[i] - eor w6,w21,w21,ror#14 - and w17,w22,w21 - bic w28,w23,w21 - add w24,w24,w14 // h+=X[i] - orr w17,w17,w28 // Ch(e,f,g) - eor w28,w25,w26 // a^b, b^c in next round - eor w16,w16,w6,ror#11 // Sigma1(e) - ror w6,w25,#2 - add w24,w24,w17 // h+=Ch(e,f,g) - eor w17,w25,w25,ror#9 - add w24,w24,w16 // h+=Sigma1(e) - and w19,w19,w28 // (b^c)&=(a^b) - add w20,w20,w24 // d+=h - eor w19,w19,w26 // Maj(a,b,c) - eor w17,w6,w17,ror#13 // Sigma0(a) - add w24,w24,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round - //add w24,w24,w17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev w15,w15 // 12 -#endif - add w24,w24,w17 // h+=Sigma0(a) - str w7,[sp,#0] - ror w16,w20,#6 - add w23,w23,w19 // h+=K[i] - eor w7,w20,w20,ror#14 - and w17,w21,w20 - bic w19,w22,w20 - add w23,w23,w15 // h+=X[i] - orr w17,w17,w19 // Ch(e,f,g) - eor w19,w24,w25 // a^b, b^c in next round - eor w16,w16,w7,ror#11 // Sigma1(e) - ror w7,w24,#2 - add 
w23,w23,w17 // h+=Ch(e,f,g) - eor w17,w24,w24,ror#9 - add w23,w23,w16 // h+=Sigma1(e) - and w28,w28,w19 // (b^c)&=(a^b) - add w27,w27,w23 // d+=h - eor w28,w28,w25 // Maj(a,b,c) - eor w17,w7,w17,ror#13 // Sigma0(a) - add w23,w23,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round - //add w23,w23,w17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev w0,w0 // 13 -#endif - ldp w1,w2,[x1] - add w23,w23,w17 // h+=Sigma0(a) - str w8,[sp,#4] - ror w16,w27,#6 - add w22,w22,w28 // h+=K[i] - eor w8,w27,w27,ror#14 - and w17,w20,w27 - bic w28,w21,w27 - add w22,w22,w0 // h+=X[i] - orr w17,w17,w28 // Ch(e,f,g) - eor w28,w23,w24 // a^b, b^c in next round - eor w16,w16,w8,ror#11 // Sigma1(e) - ror w8,w23,#2 - add w22,w22,w17 // h+=Ch(e,f,g) - eor w17,w23,w23,ror#9 - add w22,w22,w16 // h+=Sigma1(e) - and w19,w19,w28 // (b^c)&=(a^b) - add w26,w26,w22 // d+=h - eor w19,w19,w24 // Maj(a,b,c) - eor w17,w8,w17,ror#13 // Sigma0(a) - add w22,w22,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round - //add w22,w22,w17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev w1,w1 // 14 -#endif - ldr w6,[sp,#12] - add w22,w22,w17 // h+=Sigma0(a) - str w9,[sp,#8] - ror w16,w26,#6 - add w21,w21,w19 // h+=K[i] - eor w9,w26,w26,ror#14 - and w17,w27,w26 - bic w19,w20,w26 - add w21,w21,w1 // h+=X[i] - orr w17,w17,w19 // Ch(e,f,g) - eor w19,w22,w23 // a^b, b^c in next round - eor w16,w16,w9,ror#11 // Sigma1(e) - ror w9,w22,#2 - add w21,w21,w17 // h+=Ch(e,f,g) - eor w17,w22,w22,ror#9 - add w21,w21,w16 // h+=Sigma1(e) - and w28,w28,w19 // (b^c)&=(a^b) - add w25,w25,w21 // d+=h - eor w28,w28,w23 // Maj(a,b,c) - eor w17,w9,w17,ror#13 // Sigma0(a) - add w21,w21,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round - //add w21,w21,w17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev w2,w2 // 15 -#endif - ldr w7,[sp,#0] - add w21,w21,w17 // h+=Sigma0(a) - str w10,[sp,#12] - ror w16,w25,#6 - add w20,w20,w28 // h+=K[i] - ror w9,w4,#7 - and w17,w26,w25 - ror w8,w1,#17 - bic w28,w27,w25 
- ror w10,w21,#2 - add w20,w20,w2 // h+=X[i] - eor w16,w16,w25,ror#11 - eor w9,w9,w4,ror#18 - orr w17,w17,w28 // Ch(e,f,g) - eor w28,w21,w22 // a^b, b^c in next round - eor w16,w16,w25,ror#25 // Sigma1(e) - eor w10,w10,w21,ror#13 - add w20,w20,w17 // h+=Ch(e,f,g) - and w19,w19,w28 // (b^c)&=(a^b) - eor w8,w8,w1,ror#19 - eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) - add w20,w20,w16 // h+=Sigma1(e) - eor w19,w19,w22 // Maj(a,b,c) - eor w17,w10,w21,ror#22 // Sigma0(a) - eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) - add w3,w3,w12 - add w24,w24,w20 // d+=h - add w20,w20,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round - add w3,w3,w9 - add w20,w20,w17 // h+=Sigma0(a) - add w3,w3,w8 -Loop_16_xx: - ldr w8,[sp,#4] - str w11,[sp,#0] - ror w16,w24,#6 - add w27,w27,w19 // h+=K[i] - ror w10,w5,#7 - and w17,w25,w24 - ror w9,w2,#17 - bic w19,w26,w24 - ror w11,w20,#2 - add w27,w27,w3 // h+=X[i] - eor w16,w16,w24,ror#11 - eor w10,w10,w5,ror#18 - orr w17,w17,w19 // Ch(e,f,g) - eor w19,w20,w21 // a^b, b^c in next round - eor w16,w16,w24,ror#25 // Sigma1(e) - eor w11,w11,w20,ror#13 - add w27,w27,w17 // h+=Ch(e,f,g) - and w28,w28,w19 // (b^c)&=(a^b) - eor w9,w9,w2,ror#19 - eor w10,w10,w5,lsr#3 // sigma0(X[i+1]) - add w27,w27,w16 // h+=Sigma1(e) - eor w28,w28,w21 // Maj(a,b,c) - eor w17,w11,w20,ror#22 // Sigma0(a) - eor w9,w9,w2,lsr#10 // sigma1(X[i+14]) - add w4,w4,w13 - add w23,w23,w27 // d+=h - add w27,w27,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round - add w4,w4,w10 - add w27,w27,w17 // h+=Sigma0(a) - add w4,w4,w9 - ldr w9,[sp,#8] - str w12,[sp,#4] - ror w16,w23,#6 - add w26,w26,w28 // h+=K[i] - ror w11,w6,#7 - and w17,w24,w23 - ror w10,w3,#17 - bic w28,w25,w23 - ror w12,w27,#2 - add w26,w26,w4 // h+=X[i] - eor w16,w16,w23,ror#11 - eor w11,w11,w6,ror#18 - orr w17,w17,w28 // Ch(e,f,g) - eor w28,w27,w20 // a^b, b^c in next round - eor w16,w16,w23,ror#25 // Sigma1(e) - eor w12,w12,w27,ror#13 - add w26,w26,w17 // h+=Ch(e,f,g) - and w19,w19,w28 // (b^c)&=(a^b) 
- eor w10,w10,w3,ror#19 - eor w11,w11,w6,lsr#3 // sigma0(X[i+1]) - add w26,w26,w16 // h+=Sigma1(e) - eor w19,w19,w20 // Maj(a,b,c) - eor w17,w12,w27,ror#22 // Sigma0(a) - eor w10,w10,w3,lsr#10 // sigma1(X[i+14]) - add w5,w5,w14 - add w22,w22,w26 // d+=h - add w26,w26,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round - add w5,w5,w11 - add w26,w26,w17 // h+=Sigma0(a) - add w5,w5,w10 - ldr w10,[sp,#12] - str w13,[sp,#8] - ror w16,w22,#6 - add w25,w25,w19 // h+=K[i] - ror w12,w7,#7 - and w17,w23,w22 - ror w11,w4,#17 - bic w19,w24,w22 - ror w13,w26,#2 - add w25,w25,w5 // h+=X[i] - eor w16,w16,w22,ror#11 - eor w12,w12,w7,ror#18 - orr w17,w17,w19 // Ch(e,f,g) - eor w19,w26,w27 // a^b, b^c in next round - eor w16,w16,w22,ror#25 // Sigma1(e) - eor w13,w13,w26,ror#13 - add w25,w25,w17 // h+=Ch(e,f,g) - and w28,w28,w19 // (b^c)&=(a^b) - eor w11,w11,w4,ror#19 - eor w12,w12,w7,lsr#3 // sigma0(X[i+1]) - add w25,w25,w16 // h+=Sigma1(e) - eor w28,w28,w27 // Maj(a,b,c) - eor w17,w13,w26,ror#22 // Sigma0(a) - eor w11,w11,w4,lsr#10 // sigma1(X[i+14]) - add w6,w6,w15 - add w21,w21,w25 // d+=h - add w25,w25,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round - add w6,w6,w12 - add w25,w25,w17 // h+=Sigma0(a) - add w6,w6,w11 - ldr w11,[sp,#0] - str w14,[sp,#12] - ror w16,w21,#6 - add w24,w24,w28 // h+=K[i] - ror w13,w8,#7 - and w17,w22,w21 - ror w12,w5,#17 - bic w28,w23,w21 - ror w14,w25,#2 - add w24,w24,w6 // h+=X[i] - eor w16,w16,w21,ror#11 - eor w13,w13,w8,ror#18 - orr w17,w17,w28 // Ch(e,f,g) - eor w28,w25,w26 // a^b, b^c in next round - eor w16,w16,w21,ror#25 // Sigma1(e) - eor w14,w14,w25,ror#13 - add w24,w24,w17 // h+=Ch(e,f,g) - and w19,w19,w28 // (b^c)&=(a^b) - eor w12,w12,w5,ror#19 - eor w13,w13,w8,lsr#3 // sigma0(X[i+1]) - add w24,w24,w16 // h+=Sigma1(e) - eor w19,w19,w26 // Maj(a,b,c) - eor w17,w14,w25,ror#22 // Sigma0(a) - eor w12,w12,w5,lsr#10 // sigma1(X[i+14]) - add w7,w7,w0 - add w20,w20,w24 // d+=h - add w24,w24,w19 // h+=Maj(a,b,c) - ldr 
w19,[x30],#4 // *K++, w28 in next round - add w7,w7,w13 - add w24,w24,w17 // h+=Sigma0(a) - add w7,w7,w12 - ldr w12,[sp,#4] - str w15,[sp,#0] - ror w16,w20,#6 - add w23,w23,w19 // h+=K[i] - ror w14,w9,#7 - and w17,w21,w20 - ror w13,w6,#17 - bic w19,w22,w20 - ror w15,w24,#2 - add w23,w23,w7 // h+=X[i] - eor w16,w16,w20,ror#11 - eor w14,w14,w9,ror#18 - orr w17,w17,w19 // Ch(e,f,g) - eor w19,w24,w25 // a^b, b^c in next round - eor w16,w16,w20,ror#25 // Sigma1(e) - eor w15,w15,w24,ror#13 - add w23,w23,w17 // h+=Ch(e,f,g) - and w28,w28,w19 // (b^c)&=(a^b) - eor w13,w13,w6,ror#19 - eor w14,w14,w9,lsr#3 // sigma0(X[i+1]) - add w23,w23,w16 // h+=Sigma1(e) - eor w28,w28,w25 // Maj(a,b,c) - eor w17,w15,w24,ror#22 // Sigma0(a) - eor w13,w13,w6,lsr#10 // sigma1(X[i+14]) - add w8,w8,w1 - add w27,w27,w23 // d+=h - add w23,w23,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round - add w8,w8,w14 - add w23,w23,w17 // h+=Sigma0(a) - add w8,w8,w13 - ldr w13,[sp,#8] - str w0,[sp,#4] - ror w16,w27,#6 - add w22,w22,w28 // h+=K[i] - ror w15,w10,#7 - and w17,w20,w27 - ror w14,w7,#17 - bic w28,w21,w27 - ror w0,w23,#2 - add w22,w22,w8 // h+=X[i] - eor w16,w16,w27,ror#11 - eor w15,w15,w10,ror#18 - orr w17,w17,w28 // Ch(e,f,g) - eor w28,w23,w24 // a^b, b^c in next round - eor w16,w16,w27,ror#25 // Sigma1(e) - eor w0,w0,w23,ror#13 - add w22,w22,w17 // h+=Ch(e,f,g) - and w19,w19,w28 // (b^c)&=(a^b) - eor w14,w14,w7,ror#19 - eor w15,w15,w10,lsr#3 // sigma0(X[i+1]) - add w22,w22,w16 // h+=Sigma1(e) - eor w19,w19,w24 // Maj(a,b,c) - eor w17,w0,w23,ror#22 // Sigma0(a) - eor w14,w14,w7,lsr#10 // sigma1(X[i+14]) - add w9,w9,w2 - add w26,w26,w22 // d+=h - add w22,w22,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round - add w9,w9,w15 - add w22,w22,w17 // h+=Sigma0(a) - add w9,w9,w14 - ldr w14,[sp,#12] - str w1,[sp,#8] - ror w16,w26,#6 - add w21,w21,w19 // h+=K[i] - ror w0,w11,#7 - and w17,w27,w26 - ror w15,w8,#17 - bic w19,w20,w26 - ror w1,w22,#2 - add w21,w21,w9 // 
h+=X[i] - eor w16,w16,w26,ror#11 - eor w0,w0,w11,ror#18 - orr w17,w17,w19 // Ch(e,f,g) - eor w19,w22,w23 // a^b, b^c in next round - eor w16,w16,w26,ror#25 // Sigma1(e) - eor w1,w1,w22,ror#13 - add w21,w21,w17 // h+=Ch(e,f,g) - and w28,w28,w19 // (b^c)&=(a^b) - eor w15,w15,w8,ror#19 - eor w0,w0,w11,lsr#3 // sigma0(X[i+1]) - add w21,w21,w16 // h+=Sigma1(e) - eor w28,w28,w23 // Maj(a,b,c) - eor w17,w1,w22,ror#22 // Sigma0(a) - eor w15,w15,w8,lsr#10 // sigma1(X[i+14]) - add w10,w10,w3 - add w25,w25,w21 // d+=h - add w21,w21,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round - add w10,w10,w0 - add w21,w21,w17 // h+=Sigma0(a) - add w10,w10,w15 - ldr w15,[sp,#0] - str w2,[sp,#12] - ror w16,w25,#6 - add w20,w20,w28 // h+=K[i] - ror w1,w12,#7 - and w17,w26,w25 - ror w0,w9,#17 - bic w28,w27,w25 - ror w2,w21,#2 - add w20,w20,w10 // h+=X[i] - eor w16,w16,w25,ror#11 - eor w1,w1,w12,ror#18 - orr w17,w17,w28 // Ch(e,f,g) - eor w28,w21,w22 // a^b, b^c in next round - eor w16,w16,w25,ror#25 // Sigma1(e) - eor w2,w2,w21,ror#13 - add w20,w20,w17 // h+=Ch(e,f,g) - and w19,w19,w28 // (b^c)&=(a^b) - eor w0,w0,w9,ror#19 - eor w1,w1,w12,lsr#3 // sigma0(X[i+1]) - add w20,w20,w16 // h+=Sigma1(e) - eor w19,w19,w22 // Maj(a,b,c) - eor w17,w2,w21,ror#22 // Sigma0(a) - eor w0,w0,w9,lsr#10 // sigma1(X[i+14]) - add w11,w11,w4 - add w24,w24,w20 // d+=h - add w20,w20,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round - add w11,w11,w1 - add w20,w20,w17 // h+=Sigma0(a) - add w11,w11,w0 - ldr w0,[sp,#4] - str w3,[sp,#0] - ror w16,w24,#6 - add w27,w27,w19 // h+=K[i] - ror w2,w13,#7 - and w17,w25,w24 - ror w1,w10,#17 - bic w19,w26,w24 - ror w3,w20,#2 - add w27,w27,w11 // h+=X[i] - eor w16,w16,w24,ror#11 - eor w2,w2,w13,ror#18 - orr w17,w17,w19 // Ch(e,f,g) - eor w19,w20,w21 // a^b, b^c in next round - eor w16,w16,w24,ror#25 // Sigma1(e) - eor w3,w3,w20,ror#13 - add w27,w27,w17 // h+=Ch(e,f,g) - and w28,w28,w19 // (b^c)&=(a^b) - eor w1,w1,w10,ror#19 - eor w2,w2,w13,lsr#3 
// sigma0(X[i+1]) - add w27,w27,w16 // h+=Sigma1(e) - eor w28,w28,w21 // Maj(a,b,c) - eor w17,w3,w20,ror#22 // Sigma0(a) - eor w1,w1,w10,lsr#10 // sigma1(X[i+14]) - add w12,w12,w5 - add w23,w23,w27 // d+=h - add w27,w27,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round - add w12,w12,w2 - add w27,w27,w17 // h+=Sigma0(a) - add w12,w12,w1 - ldr w1,[sp,#8] - str w4,[sp,#4] - ror w16,w23,#6 - add w26,w26,w28 // h+=K[i] - ror w3,w14,#7 - and w17,w24,w23 - ror w2,w11,#17 - bic w28,w25,w23 - ror w4,w27,#2 - add w26,w26,w12 // h+=X[i] - eor w16,w16,w23,ror#11 - eor w3,w3,w14,ror#18 - orr w17,w17,w28 // Ch(e,f,g) - eor w28,w27,w20 // a^b, b^c in next round - eor w16,w16,w23,ror#25 // Sigma1(e) - eor w4,w4,w27,ror#13 - add w26,w26,w17 // h+=Ch(e,f,g) - and w19,w19,w28 // (b^c)&=(a^b) - eor w2,w2,w11,ror#19 - eor w3,w3,w14,lsr#3 // sigma0(X[i+1]) - add w26,w26,w16 // h+=Sigma1(e) - eor w19,w19,w20 // Maj(a,b,c) - eor w17,w4,w27,ror#22 // Sigma0(a) - eor w2,w2,w11,lsr#10 // sigma1(X[i+14]) - add w13,w13,w6 - add w22,w22,w26 // d+=h - add w26,w26,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round - add w13,w13,w3 - add w26,w26,w17 // h+=Sigma0(a) - add w13,w13,w2 - ldr w2,[sp,#12] - str w5,[sp,#8] - ror w16,w22,#6 - add w25,w25,w19 // h+=K[i] - ror w4,w15,#7 - and w17,w23,w22 - ror w3,w12,#17 - bic w19,w24,w22 - ror w5,w26,#2 - add w25,w25,w13 // h+=X[i] - eor w16,w16,w22,ror#11 - eor w4,w4,w15,ror#18 - orr w17,w17,w19 // Ch(e,f,g) - eor w19,w26,w27 // a^b, b^c in next round - eor w16,w16,w22,ror#25 // Sigma1(e) - eor w5,w5,w26,ror#13 - add w25,w25,w17 // h+=Ch(e,f,g) - and w28,w28,w19 // (b^c)&=(a^b) - eor w3,w3,w12,ror#19 - eor w4,w4,w15,lsr#3 // sigma0(X[i+1]) - add w25,w25,w16 // h+=Sigma1(e) - eor w28,w28,w27 // Maj(a,b,c) - eor w17,w5,w26,ror#22 // Sigma0(a) - eor w3,w3,w12,lsr#10 // sigma1(X[i+14]) - add w14,w14,w7 - add w21,w21,w25 // d+=h - add w25,w25,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round - add w14,w14,w4 - 
add w25,w25,w17 // h+=Sigma0(a) - add w14,w14,w3 - ldr w3,[sp,#0] - str w6,[sp,#12] - ror w16,w21,#6 - add w24,w24,w28 // h+=K[i] - ror w5,w0,#7 - and w17,w22,w21 - ror w4,w13,#17 - bic w28,w23,w21 - ror w6,w25,#2 - add w24,w24,w14 // h+=X[i] - eor w16,w16,w21,ror#11 - eor w5,w5,w0,ror#18 - orr w17,w17,w28 // Ch(e,f,g) - eor w28,w25,w26 // a^b, b^c in next round - eor w16,w16,w21,ror#25 // Sigma1(e) - eor w6,w6,w25,ror#13 - add w24,w24,w17 // h+=Ch(e,f,g) - and w19,w19,w28 // (b^c)&=(a^b) - eor w4,w4,w13,ror#19 - eor w5,w5,w0,lsr#3 // sigma0(X[i+1]) - add w24,w24,w16 // h+=Sigma1(e) - eor w19,w19,w26 // Maj(a,b,c) - eor w17,w6,w25,ror#22 // Sigma0(a) - eor w4,w4,w13,lsr#10 // sigma1(X[i+14]) - add w15,w15,w8 - add w20,w20,w24 // d+=h - add w24,w24,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round - add w15,w15,w5 - add w24,w24,w17 // h+=Sigma0(a) - add w15,w15,w4 - ldr w4,[sp,#4] - str w7,[sp,#0] - ror w16,w20,#6 - add w23,w23,w19 // h+=K[i] - ror w6,w1,#7 - and w17,w21,w20 - ror w5,w14,#17 - bic w19,w22,w20 - ror w7,w24,#2 - add w23,w23,w15 // h+=X[i] - eor w16,w16,w20,ror#11 - eor w6,w6,w1,ror#18 - orr w17,w17,w19 // Ch(e,f,g) - eor w19,w24,w25 // a^b, b^c in next round - eor w16,w16,w20,ror#25 // Sigma1(e) - eor w7,w7,w24,ror#13 - add w23,w23,w17 // h+=Ch(e,f,g) - and w28,w28,w19 // (b^c)&=(a^b) - eor w5,w5,w14,ror#19 - eor w6,w6,w1,lsr#3 // sigma0(X[i+1]) - add w23,w23,w16 // h+=Sigma1(e) - eor w28,w28,w25 // Maj(a,b,c) - eor w17,w7,w24,ror#22 // Sigma0(a) - eor w5,w5,w14,lsr#10 // sigma1(X[i+14]) - add w0,w0,w9 - add w27,w27,w23 // d+=h - add w23,w23,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round - add w0,w0,w6 - add w23,w23,w17 // h+=Sigma0(a) - add w0,w0,w5 - ldr w5,[sp,#8] - str w8,[sp,#4] - ror w16,w27,#6 - add w22,w22,w28 // h+=K[i] - ror w7,w2,#7 - and w17,w20,w27 - ror w6,w15,#17 - bic w28,w21,w27 - ror w8,w23,#2 - add w22,w22,w0 // h+=X[i] - eor w16,w16,w27,ror#11 - eor w7,w7,w2,ror#18 - orr w17,w17,w28 // Ch(e,f,g) 
- eor w28,w23,w24 // a^b, b^c in next round - eor w16,w16,w27,ror#25 // Sigma1(e) - eor w8,w8,w23,ror#13 - add w22,w22,w17 // h+=Ch(e,f,g) - and w19,w19,w28 // (b^c)&=(a^b) - eor w6,w6,w15,ror#19 - eor w7,w7,w2,lsr#3 // sigma0(X[i+1]) - add w22,w22,w16 // h+=Sigma1(e) - eor w19,w19,w24 // Maj(a,b,c) - eor w17,w8,w23,ror#22 // Sigma0(a) - eor w6,w6,w15,lsr#10 // sigma1(X[i+14]) - add w1,w1,w10 - add w26,w26,w22 // d+=h - add w22,w22,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round - add w1,w1,w7 - add w22,w22,w17 // h+=Sigma0(a) - add w1,w1,w6 - ldr w6,[sp,#12] - str w9,[sp,#8] - ror w16,w26,#6 - add w21,w21,w19 // h+=K[i] - ror w8,w3,#7 - and w17,w27,w26 - ror w7,w0,#17 - bic w19,w20,w26 - ror w9,w22,#2 - add w21,w21,w1 // h+=X[i] - eor w16,w16,w26,ror#11 - eor w8,w8,w3,ror#18 - orr w17,w17,w19 // Ch(e,f,g) - eor w19,w22,w23 // a^b, b^c in next round - eor w16,w16,w26,ror#25 // Sigma1(e) - eor w9,w9,w22,ror#13 - add w21,w21,w17 // h+=Ch(e,f,g) - and w28,w28,w19 // (b^c)&=(a^b) - eor w7,w7,w0,ror#19 - eor w8,w8,w3,lsr#3 // sigma0(X[i+1]) - add w21,w21,w16 // h+=Sigma1(e) - eor w28,w28,w23 // Maj(a,b,c) - eor w17,w9,w22,ror#22 // Sigma0(a) - eor w7,w7,w0,lsr#10 // sigma1(X[i+14]) - add w2,w2,w11 - add w25,w25,w21 // d+=h - add w21,w21,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round - add w2,w2,w8 - add w21,w21,w17 // h+=Sigma0(a) - add w2,w2,w7 - ldr w7,[sp,#0] - str w10,[sp,#12] - ror w16,w25,#6 - add w20,w20,w28 // h+=K[i] - ror w9,w4,#7 - and w17,w26,w25 - ror w8,w1,#17 - bic w28,w27,w25 - ror w10,w21,#2 - add w20,w20,w2 // h+=X[i] - eor w16,w16,w25,ror#11 - eor w9,w9,w4,ror#18 - orr w17,w17,w28 // Ch(e,f,g) - eor w28,w21,w22 // a^b, b^c in next round - eor w16,w16,w25,ror#25 // Sigma1(e) - eor w10,w10,w21,ror#13 - add w20,w20,w17 // h+=Ch(e,f,g) - and w19,w19,w28 // (b^c)&=(a^b) - eor w8,w8,w1,ror#19 - eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) - add w20,w20,w16 // h+=Sigma1(e) - eor w19,w19,w22 // Maj(a,b,c) - eor 
w17,w10,w21,ror#22 // Sigma0(a) - eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) - add w3,w3,w12 - add w24,w24,w20 // d+=h - add w20,w20,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round - add w3,w3,w9 - add w20,w20,w17 // h+=Sigma0(a) - add w3,w3,w8 - cbnz w19,Loop_16_xx - - ldp x0,x2,[x29,#96] - ldr x1,[x29,#112] - sub x30,x30,#260 // rewind - - ldp w3,w4,[x0] - ldp w5,w6,[x0,#2*4] - add x1,x1,#14*4 // advance input pointer - ldp w7,w8,[x0,#4*4] - add w20,w20,w3 - ldp w9,w10,[x0,#6*4] - add w21,w21,w4 - add w22,w22,w5 - add w23,w23,w6 - stp w20,w21,[x0] - add w24,w24,w7 - add w25,w25,w8 - stp w22,w23,[x0,#2*4] - add w26,w26,w9 - add w27,w27,w10 - cmp x1,x2 - stp w24,w25,[x0,#4*4] - stp w26,w27,[x0,#6*4] - b.ne Loop - - ldp x19,x20,[x29,#16] - add sp,sp,#4*4 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#128 - AARCH64_VALIDATE_LINK_REGISTER - ret - - -.section __TEXT,__const -.align 6 - -LK256: -.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 -.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 -.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 -.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 -.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc -.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da -.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 -.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 -.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 -.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 -.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 -.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 -.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 -.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 -.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 -.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 -.long 0 //terminator - -.byte 
83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 -.align 2 -.text -#ifndef __KERNEL__ - -.align 6 -sha256_block_armv8: -Lv8_entry: - // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - - ld1 {v0.4s,v1.4s},[x0] - adrp x3,LK256@PAGE - add x3,x3,LK256@PAGEOFF - -Loop_hw: - ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 - sub x2,x2,#1 - ld1 {v16.4s},[x3],#16 - rev32 v4.16b,v4.16b - rev32 v5.16b,v5.16b - rev32 v6.16b,v6.16b - rev32 v7.16b,v7.16b - orr v18.16b,v0.16b,v0.16b // offload - orr v19.16b,v1.16b,v1.16b - ld1 {v17.4s},[x3],#16 - add v16.4s,v16.4s,v4.4s -.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b - orr v2.16b,v0.16b,v0.16b -.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s -.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s -.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b - ld1 {v16.4s},[x3],#16 - add v17.4s,v17.4s,v5.4s -.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b - orr v2.16b,v0.16b,v0.16b -.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s -.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s -.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b - ld1 {v17.4s},[x3],#16 - add v16.4s,v16.4s,v6.4s -.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b - orr v2.16b,v0.16b,v0.16b -.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s -.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s -.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b - ld1 {v16.4s},[x3],#16 - add v17.4s,v17.4s,v7.4s -.long 0x5e282887 //sha256su0 v7.16b,v4.16b - orr v2.16b,v0.16b,v0.16b -.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s -.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s -.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b - ld1 {v17.4s},[x3],#16 - add v16.4s,v16.4s,v4.4s -.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b - orr v2.16b,v0.16b,v0.16b -.long 0x5e104020 
//sha256h v0.16b,v1.16b,v16.4s -.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s -.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b - ld1 {v16.4s},[x3],#16 - add v17.4s,v17.4s,v5.4s -.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b - orr v2.16b,v0.16b,v0.16b -.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s -.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s -.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b - ld1 {v17.4s},[x3],#16 - add v16.4s,v16.4s,v6.4s -.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b - orr v2.16b,v0.16b,v0.16b -.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s -.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s -.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b - ld1 {v16.4s},[x3],#16 - add v17.4s,v17.4s,v7.4s -.long 0x5e282887 //sha256su0 v7.16b,v4.16b - orr v2.16b,v0.16b,v0.16b -.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s -.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s -.long 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b - ld1 {v17.4s},[x3],#16 - add v16.4s,v16.4s,v4.4s -.long 0x5e2828a4 //sha256su0 v4.16b,v5.16b - orr v2.16b,v0.16b,v0.16b -.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s -.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s -.long 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b - ld1 {v16.4s},[x3],#16 - add v17.4s,v17.4s,v5.4s -.long 0x5e2828c5 //sha256su0 v5.16b,v6.16b - orr v2.16b,v0.16b,v0.16b -.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s -.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s -.long 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b - ld1 {v17.4s},[x3],#16 - add v16.4s,v16.4s,v6.4s -.long 0x5e2828e6 //sha256su0 v6.16b,v7.16b - orr v2.16b,v0.16b,v0.16b -.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s -.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s -.long 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b - ld1 {v16.4s},[x3],#16 - add v17.4s,v17.4s,v7.4s -.long 0x5e282887 //sha256su0 v7.16b,v4.16b - orr v2.16b,v0.16b,v0.16b -.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s -.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s -.long 0x5e0660a7 
//sha256su1 v7.16b,v5.16b,v6.16b - ld1 {v17.4s},[x3],#16 - add v16.4s,v16.4s,v4.4s - orr v2.16b,v0.16b,v0.16b -.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s -.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s - - ld1 {v16.4s},[x3],#16 - add v17.4s,v17.4s,v5.4s - orr v2.16b,v0.16b,v0.16b -.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s -.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s - - ld1 {v17.4s},[x3] - add v16.4s,v16.4s,v6.4s - sub x3,x3,#64*4-16 // rewind - orr v2.16b,v0.16b,v0.16b -.long 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s -.long 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s - - add v17.4s,v17.4s,v7.4s - orr v2.16b,v0.16b,v0.16b -.long 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s -.long 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s - - add v0.4s,v0.4s,v18.4s - add v1.4s,v1.4s,v19.4s - - cbnz x2,Loop_hw - - st1 {v0.4s,v1.4s},[x0] - - ldr x29,[sp],#16 - ret - -#endif -#endif // !OPENSSL_NO_ASM diff --git a/third_party/boringssl/apple-aarch64/crypto/fipsmodule/sha512-armv8.S b/third_party/boringssl/apple-aarch64/crypto/fipsmodule/sha512-armv8.S deleted file mode 100644 index b2d366d7..00000000 --- a/third_party/boringssl/apple-aarch64/crypto/fipsmodule/sha512-armv8.S +++ /dev/null @@ -1,1614 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. -// -// Licensed under the OpenSSL license (the "License"). You may not use -// this file except in compliance with the License. 
You can obtain a copy -// in the file LICENSE in the source distribution or at -// https://www.openssl.org/source/license.html - -// ==================================================================== -// Written by Andy Polyakov for the OpenSSL -// project. The module is, however, dual licensed under OpenSSL and -// CRYPTOGAMS licenses depending on where you obtain it. For further -// details see http://www.openssl.org/~appro/cryptogams/. -// -// Permission to use under GPLv2 terms is granted. -// ==================================================================== -// -// SHA256/512 for ARMv8. -// -// Performance in cycles per processed byte and improvement coefficient -// over code generated with "default" compiler: -// -// SHA256-hw SHA256(*) SHA512 -// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) -// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) -// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) -// Denver 2.01 10.5 (+26%) 6.70 (+8%) -// X-Gene 20.0 (+100%) 12.8 (+300%(***)) -// Mongoose 2.36 13.0 (+50%) 8.36 (+33%) -// Kryo 1.92 17.4 (+30%) 11.2 (+8%) -// -// (*) Software SHA256 results are of lesser relevance, presented -// mostly for informational purposes. -// (**) The result is a trade-off: it's possible to improve it by -// 10% (or by 1 cycle per round), but at the cost of 20% loss -// on Cortex-A53 (or by 4 cycles per round). -// (***) Super-impressive coefficients over gcc-generated code are -// indication of some compiler "pathology", most notably code -// generated with -mgeneral-regs-only is significantly faster -// and the gap is only 40-90%. 
- -#ifndef __KERNEL__ -# include -#endif - -.text - - -.private_extern _OPENSSL_armcap_P -.globl _sha512_block_data_order -.private_extern _sha512_block_data_order - -.align 6 -_sha512_block_data_order: - AARCH64_VALID_CALL_TARGET -#ifndef __KERNEL__ -#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10 - adrp x16,:pg_hi21_nc:_OPENSSL_armcap_P -#else - adrp x16,_OPENSSL_armcap_P@PAGE -#endif - ldr w16,[x16,_OPENSSL_armcap_P@PAGEOFF] - tst w16,#ARMV8_SHA512 - b.ne Lv8_entry -#endif - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - sub sp,sp,#4*8 - - ldp x20,x21,[x0] // load context - ldp x22,x23,[x0,#2*8] - ldp x24,x25,[x0,#4*8] - add x2,x1,x2,lsl#7 // end of input - ldp x26,x27,[x0,#6*8] - adrp x30,LK512@PAGE - add x30,x30,LK512@PAGEOFF - stp x0,x2,[x29,#96] - -Loop: - ldp x3,x4,[x1],#2*8 - ldr x19,[x30],#8 // *K++ - eor x28,x21,x22 // magic seed - str x1,[x29,#112] -#ifndef __AARCH64EB__ - rev x3,x3 // 0 -#endif - ror x16,x24,#14 - add x27,x27,x19 // h+=K[i] - eor x6,x24,x24,ror#23 - and x17,x25,x24 - bic x19,x26,x24 - add x27,x27,x3 // h+=X[i] - orr x17,x17,x19 // Ch(e,f,g) - eor x19,x20,x21 // a^b, b^c in next round - eor x16,x16,x6,ror#18 // Sigma1(e) - ror x6,x20,#28 - add x27,x27,x17 // h+=Ch(e,f,g) - eor x17,x20,x20,ror#5 - add x27,x27,x16 // h+=Sigma1(e) - and x28,x28,x19 // (b^c)&=(a^b) - add x23,x23,x27 // d+=h - eor x28,x28,x21 // Maj(a,b,c) - eor x17,x6,x17,ror#34 // Sigma0(a) - add x27,x27,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round - //add x27,x27,x17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev x4,x4 // 1 -#endif - ldp x5,x6,[x1],#2*8 - add x27,x27,x17 // h+=Sigma0(a) - ror x16,x23,#14 - add x26,x26,x28 // h+=K[i] - eor x7,x23,x23,ror#23 - and x17,x24,x23 - bic x28,x25,x23 - add x26,x26,x4 // h+=X[i] - orr x17,x17,x28 // Ch(e,f,g) - eor x28,x27,x20 // a^b, b^c in next round - eor 
x16,x16,x7,ror#18 // Sigma1(e) - ror x7,x27,#28 - add x26,x26,x17 // h+=Ch(e,f,g) - eor x17,x27,x27,ror#5 - add x26,x26,x16 // h+=Sigma1(e) - and x19,x19,x28 // (b^c)&=(a^b) - add x22,x22,x26 // d+=h - eor x19,x19,x20 // Maj(a,b,c) - eor x17,x7,x17,ror#34 // Sigma0(a) - add x26,x26,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round - //add x26,x26,x17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev x5,x5 // 2 -#endif - add x26,x26,x17 // h+=Sigma0(a) - ror x16,x22,#14 - add x25,x25,x19 // h+=K[i] - eor x8,x22,x22,ror#23 - and x17,x23,x22 - bic x19,x24,x22 - add x25,x25,x5 // h+=X[i] - orr x17,x17,x19 // Ch(e,f,g) - eor x19,x26,x27 // a^b, b^c in next round - eor x16,x16,x8,ror#18 // Sigma1(e) - ror x8,x26,#28 - add x25,x25,x17 // h+=Ch(e,f,g) - eor x17,x26,x26,ror#5 - add x25,x25,x16 // h+=Sigma1(e) - and x28,x28,x19 // (b^c)&=(a^b) - add x21,x21,x25 // d+=h - eor x28,x28,x27 // Maj(a,b,c) - eor x17,x8,x17,ror#34 // Sigma0(a) - add x25,x25,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round - //add x25,x25,x17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev x6,x6 // 3 -#endif - ldp x7,x8,[x1],#2*8 - add x25,x25,x17 // h+=Sigma0(a) - ror x16,x21,#14 - add x24,x24,x28 // h+=K[i] - eor x9,x21,x21,ror#23 - and x17,x22,x21 - bic x28,x23,x21 - add x24,x24,x6 // h+=X[i] - orr x17,x17,x28 // Ch(e,f,g) - eor x28,x25,x26 // a^b, b^c in next round - eor x16,x16,x9,ror#18 // Sigma1(e) - ror x9,x25,#28 - add x24,x24,x17 // h+=Ch(e,f,g) - eor x17,x25,x25,ror#5 - add x24,x24,x16 // h+=Sigma1(e) - and x19,x19,x28 // (b^c)&=(a^b) - add x20,x20,x24 // d+=h - eor x19,x19,x26 // Maj(a,b,c) - eor x17,x9,x17,ror#34 // Sigma0(a) - add x24,x24,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round - //add x24,x24,x17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev x7,x7 // 4 -#endif - add x24,x24,x17 // h+=Sigma0(a) - ror x16,x20,#14 - add x23,x23,x19 // h+=K[i] - eor x10,x20,x20,ror#23 - and x17,x21,x20 - bic x19,x22,x20 - add x23,x23,x7 // h+=X[i] - orr 
x17,x17,x19 // Ch(e,f,g) - eor x19,x24,x25 // a^b, b^c in next round - eor x16,x16,x10,ror#18 // Sigma1(e) - ror x10,x24,#28 - add x23,x23,x17 // h+=Ch(e,f,g) - eor x17,x24,x24,ror#5 - add x23,x23,x16 // h+=Sigma1(e) - and x28,x28,x19 // (b^c)&=(a^b) - add x27,x27,x23 // d+=h - eor x28,x28,x25 // Maj(a,b,c) - eor x17,x10,x17,ror#34 // Sigma0(a) - add x23,x23,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round - //add x23,x23,x17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev x8,x8 // 5 -#endif - ldp x9,x10,[x1],#2*8 - add x23,x23,x17 // h+=Sigma0(a) - ror x16,x27,#14 - add x22,x22,x28 // h+=K[i] - eor x11,x27,x27,ror#23 - and x17,x20,x27 - bic x28,x21,x27 - add x22,x22,x8 // h+=X[i] - orr x17,x17,x28 // Ch(e,f,g) - eor x28,x23,x24 // a^b, b^c in next round - eor x16,x16,x11,ror#18 // Sigma1(e) - ror x11,x23,#28 - add x22,x22,x17 // h+=Ch(e,f,g) - eor x17,x23,x23,ror#5 - add x22,x22,x16 // h+=Sigma1(e) - and x19,x19,x28 // (b^c)&=(a^b) - add x26,x26,x22 // d+=h - eor x19,x19,x24 // Maj(a,b,c) - eor x17,x11,x17,ror#34 // Sigma0(a) - add x22,x22,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round - //add x22,x22,x17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev x9,x9 // 6 -#endif - add x22,x22,x17 // h+=Sigma0(a) - ror x16,x26,#14 - add x21,x21,x19 // h+=K[i] - eor x12,x26,x26,ror#23 - and x17,x27,x26 - bic x19,x20,x26 - add x21,x21,x9 // h+=X[i] - orr x17,x17,x19 // Ch(e,f,g) - eor x19,x22,x23 // a^b, b^c in next round - eor x16,x16,x12,ror#18 // Sigma1(e) - ror x12,x22,#28 - add x21,x21,x17 // h+=Ch(e,f,g) - eor x17,x22,x22,ror#5 - add x21,x21,x16 // h+=Sigma1(e) - and x28,x28,x19 // (b^c)&=(a^b) - add x25,x25,x21 // d+=h - eor x28,x28,x23 // Maj(a,b,c) - eor x17,x12,x17,ror#34 // Sigma0(a) - add x21,x21,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round - //add x21,x21,x17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev x10,x10 // 7 -#endif - ldp x11,x12,[x1],#2*8 - add x21,x21,x17 // h+=Sigma0(a) - ror x16,x25,#14 - add 
x20,x20,x28 // h+=K[i] - eor x13,x25,x25,ror#23 - and x17,x26,x25 - bic x28,x27,x25 - add x20,x20,x10 // h+=X[i] - orr x17,x17,x28 // Ch(e,f,g) - eor x28,x21,x22 // a^b, b^c in next round - eor x16,x16,x13,ror#18 // Sigma1(e) - ror x13,x21,#28 - add x20,x20,x17 // h+=Ch(e,f,g) - eor x17,x21,x21,ror#5 - add x20,x20,x16 // h+=Sigma1(e) - and x19,x19,x28 // (b^c)&=(a^b) - add x24,x24,x20 // d+=h - eor x19,x19,x22 // Maj(a,b,c) - eor x17,x13,x17,ror#34 // Sigma0(a) - add x20,x20,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round - //add x20,x20,x17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev x11,x11 // 8 -#endif - add x20,x20,x17 // h+=Sigma0(a) - ror x16,x24,#14 - add x27,x27,x19 // h+=K[i] - eor x14,x24,x24,ror#23 - and x17,x25,x24 - bic x19,x26,x24 - add x27,x27,x11 // h+=X[i] - orr x17,x17,x19 // Ch(e,f,g) - eor x19,x20,x21 // a^b, b^c in next round - eor x16,x16,x14,ror#18 // Sigma1(e) - ror x14,x20,#28 - add x27,x27,x17 // h+=Ch(e,f,g) - eor x17,x20,x20,ror#5 - add x27,x27,x16 // h+=Sigma1(e) - and x28,x28,x19 // (b^c)&=(a^b) - add x23,x23,x27 // d+=h - eor x28,x28,x21 // Maj(a,b,c) - eor x17,x14,x17,ror#34 // Sigma0(a) - add x27,x27,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round - //add x27,x27,x17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev x12,x12 // 9 -#endif - ldp x13,x14,[x1],#2*8 - add x27,x27,x17 // h+=Sigma0(a) - ror x16,x23,#14 - add x26,x26,x28 // h+=K[i] - eor x15,x23,x23,ror#23 - and x17,x24,x23 - bic x28,x25,x23 - add x26,x26,x12 // h+=X[i] - orr x17,x17,x28 // Ch(e,f,g) - eor x28,x27,x20 // a^b, b^c in next round - eor x16,x16,x15,ror#18 // Sigma1(e) - ror x15,x27,#28 - add x26,x26,x17 // h+=Ch(e,f,g) - eor x17,x27,x27,ror#5 - add x26,x26,x16 // h+=Sigma1(e) - and x19,x19,x28 // (b^c)&=(a^b) - add x22,x22,x26 // d+=h - eor x19,x19,x20 // Maj(a,b,c) - eor x17,x15,x17,ror#34 // Sigma0(a) - add x26,x26,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round - //add x26,x26,x17 // h+=Sigma0(a) -#ifndef 
__AARCH64EB__ - rev x13,x13 // 10 -#endif - add x26,x26,x17 // h+=Sigma0(a) - ror x16,x22,#14 - add x25,x25,x19 // h+=K[i] - eor x0,x22,x22,ror#23 - and x17,x23,x22 - bic x19,x24,x22 - add x25,x25,x13 // h+=X[i] - orr x17,x17,x19 // Ch(e,f,g) - eor x19,x26,x27 // a^b, b^c in next round - eor x16,x16,x0,ror#18 // Sigma1(e) - ror x0,x26,#28 - add x25,x25,x17 // h+=Ch(e,f,g) - eor x17,x26,x26,ror#5 - add x25,x25,x16 // h+=Sigma1(e) - and x28,x28,x19 // (b^c)&=(a^b) - add x21,x21,x25 // d+=h - eor x28,x28,x27 // Maj(a,b,c) - eor x17,x0,x17,ror#34 // Sigma0(a) - add x25,x25,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round - //add x25,x25,x17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev x14,x14 // 11 -#endif - ldp x15,x0,[x1],#2*8 - add x25,x25,x17 // h+=Sigma0(a) - str x6,[sp,#24] - ror x16,x21,#14 - add x24,x24,x28 // h+=K[i] - eor x6,x21,x21,ror#23 - and x17,x22,x21 - bic x28,x23,x21 - add x24,x24,x14 // h+=X[i] - orr x17,x17,x28 // Ch(e,f,g) - eor x28,x25,x26 // a^b, b^c in next round - eor x16,x16,x6,ror#18 // Sigma1(e) - ror x6,x25,#28 - add x24,x24,x17 // h+=Ch(e,f,g) - eor x17,x25,x25,ror#5 - add x24,x24,x16 // h+=Sigma1(e) - and x19,x19,x28 // (b^c)&=(a^b) - add x20,x20,x24 // d+=h - eor x19,x19,x26 // Maj(a,b,c) - eor x17,x6,x17,ror#34 // Sigma0(a) - add x24,x24,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round - //add x24,x24,x17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev x15,x15 // 12 -#endif - add x24,x24,x17 // h+=Sigma0(a) - str x7,[sp,#0] - ror x16,x20,#14 - add x23,x23,x19 // h+=K[i] - eor x7,x20,x20,ror#23 - and x17,x21,x20 - bic x19,x22,x20 - add x23,x23,x15 // h+=X[i] - orr x17,x17,x19 // Ch(e,f,g) - eor x19,x24,x25 // a^b, b^c in next round - eor x16,x16,x7,ror#18 // Sigma1(e) - ror x7,x24,#28 - add x23,x23,x17 // h+=Ch(e,f,g) - eor x17,x24,x24,ror#5 - add x23,x23,x16 // h+=Sigma1(e) - and x28,x28,x19 // (b^c)&=(a^b) - add x27,x27,x23 // d+=h - eor x28,x28,x25 // Maj(a,b,c) - eor x17,x7,x17,ror#34 // Sigma0(a) - 
add x23,x23,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round - //add x23,x23,x17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev x0,x0 // 13 -#endif - ldp x1,x2,[x1] - add x23,x23,x17 // h+=Sigma0(a) - str x8,[sp,#8] - ror x16,x27,#14 - add x22,x22,x28 // h+=K[i] - eor x8,x27,x27,ror#23 - and x17,x20,x27 - bic x28,x21,x27 - add x22,x22,x0 // h+=X[i] - orr x17,x17,x28 // Ch(e,f,g) - eor x28,x23,x24 // a^b, b^c in next round - eor x16,x16,x8,ror#18 // Sigma1(e) - ror x8,x23,#28 - add x22,x22,x17 // h+=Ch(e,f,g) - eor x17,x23,x23,ror#5 - add x22,x22,x16 // h+=Sigma1(e) - and x19,x19,x28 // (b^c)&=(a^b) - add x26,x26,x22 // d+=h - eor x19,x19,x24 // Maj(a,b,c) - eor x17,x8,x17,ror#34 // Sigma0(a) - add x22,x22,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round - //add x22,x22,x17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev x1,x1 // 14 -#endif - ldr x6,[sp,#24] - add x22,x22,x17 // h+=Sigma0(a) - str x9,[sp,#16] - ror x16,x26,#14 - add x21,x21,x19 // h+=K[i] - eor x9,x26,x26,ror#23 - and x17,x27,x26 - bic x19,x20,x26 - add x21,x21,x1 // h+=X[i] - orr x17,x17,x19 // Ch(e,f,g) - eor x19,x22,x23 // a^b, b^c in next round - eor x16,x16,x9,ror#18 // Sigma1(e) - ror x9,x22,#28 - add x21,x21,x17 // h+=Ch(e,f,g) - eor x17,x22,x22,ror#5 - add x21,x21,x16 // h+=Sigma1(e) - and x28,x28,x19 // (b^c)&=(a^b) - add x25,x25,x21 // d+=h - eor x28,x28,x23 // Maj(a,b,c) - eor x17,x9,x17,ror#34 // Sigma0(a) - add x21,x21,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round - //add x21,x21,x17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev x2,x2 // 15 -#endif - ldr x7,[sp,#0] - add x21,x21,x17 // h+=Sigma0(a) - str x10,[sp,#24] - ror x16,x25,#14 - add x20,x20,x28 // h+=K[i] - ror x9,x4,#1 - and x17,x26,x25 - ror x8,x1,#19 - bic x28,x27,x25 - ror x10,x21,#28 - add x20,x20,x2 // h+=X[i] - eor x16,x16,x25,ror#18 - eor x9,x9,x4,ror#8 - orr x17,x17,x28 // Ch(e,f,g) - eor x28,x21,x22 // a^b, b^c in next round - eor x16,x16,x25,ror#41 // Sigma1(e) - eor 
x10,x10,x21,ror#34 - add x20,x20,x17 // h+=Ch(e,f,g) - and x19,x19,x28 // (b^c)&=(a^b) - eor x8,x8,x1,ror#61 - eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) - add x20,x20,x16 // h+=Sigma1(e) - eor x19,x19,x22 // Maj(a,b,c) - eor x17,x10,x21,ror#39 // Sigma0(a) - eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) - add x3,x3,x12 - add x24,x24,x20 // d+=h - add x20,x20,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round - add x3,x3,x9 - add x20,x20,x17 // h+=Sigma0(a) - add x3,x3,x8 -Loop_16_xx: - ldr x8,[sp,#8] - str x11,[sp,#0] - ror x16,x24,#14 - add x27,x27,x19 // h+=K[i] - ror x10,x5,#1 - and x17,x25,x24 - ror x9,x2,#19 - bic x19,x26,x24 - ror x11,x20,#28 - add x27,x27,x3 // h+=X[i] - eor x16,x16,x24,ror#18 - eor x10,x10,x5,ror#8 - orr x17,x17,x19 // Ch(e,f,g) - eor x19,x20,x21 // a^b, b^c in next round - eor x16,x16,x24,ror#41 // Sigma1(e) - eor x11,x11,x20,ror#34 - add x27,x27,x17 // h+=Ch(e,f,g) - and x28,x28,x19 // (b^c)&=(a^b) - eor x9,x9,x2,ror#61 - eor x10,x10,x5,lsr#7 // sigma0(X[i+1]) - add x27,x27,x16 // h+=Sigma1(e) - eor x28,x28,x21 // Maj(a,b,c) - eor x17,x11,x20,ror#39 // Sigma0(a) - eor x9,x9,x2,lsr#6 // sigma1(X[i+14]) - add x4,x4,x13 - add x23,x23,x27 // d+=h - add x27,x27,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round - add x4,x4,x10 - add x27,x27,x17 // h+=Sigma0(a) - add x4,x4,x9 - ldr x9,[sp,#16] - str x12,[sp,#8] - ror x16,x23,#14 - add x26,x26,x28 // h+=K[i] - ror x11,x6,#1 - and x17,x24,x23 - ror x10,x3,#19 - bic x28,x25,x23 - ror x12,x27,#28 - add x26,x26,x4 // h+=X[i] - eor x16,x16,x23,ror#18 - eor x11,x11,x6,ror#8 - orr x17,x17,x28 // Ch(e,f,g) - eor x28,x27,x20 // a^b, b^c in next round - eor x16,x16,x23,ror#41 // Sigma1(e) - eor x12,x12,x27,ror#34 - add x26,x26,x17 // h+=Ch(e,f,g) - and x19,x19,x28 // (b^c)&=(a^b) - eor x10,x10,x3,ror#61 - eor x11,x11,x6,lsr#7 // sigma0(X[i+1]) - add x26,x26,x16 // h+=Sigma1(e) - eor x19,x19,x20 // Maj(a,b,c) - eor x17,x12,x27,ror#39 // Sigma0(a) - eor x10,x10,x3,lsr#6 // sigma1(X[i+14]) 
- add x5,x5,x14 - add x22,x22,x26 // d+=h - add x26,x26,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round - add x5,x5,x11 - add x26,x26,x17 // h+=Sigma0(a) - add x5,x5,x10 - ldr x10,[sp,#24] - str x13,[sp,#16] - ror x16,x22,#14 - add x25,x25,x19 // h+=K[i] - ror x12,x7,#1 - and x17,x23,x22 - ror x11,x4,#19 - bic x19,x24,x22 - ror x13,x26,#28 - add x25,x25,x5 // h+=X[i] - eor x16,x16,x22,ror#18 - eor x12,x12,x7,ror#8 - orr x17,x17,x19 // Ch(e,f,g) - eor x19,x26,x27 // a^b, b^c in next round - eor x16,x16,x22,ror#41 // Sigma1(e) - eor x13,x13,x26,ror#34 - add x25,x25,x17 // h+=Ch(e,f,g) - and x28,x28,x19 // (b^c)&=(a^b) - eor x11,x11,x4,ror#61 - eor x12,x12,x7,lsr#7 // sigma0(X[i+1]) - add x25,x25,x16 // h+=Sigma1(e) - eor x28,x28,x27 // Maj(a,b,c) - eor x17,x13,x26,ror#39 // Sigma0(a) - eor x11,x11,x4,lsr#6 // sigma1(X[i+14]) - add x6,x6,x15 - add x21,x21,x25 // d+=h - add x25,x25,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round - add x6,x6,x12 - add x25,x25,x17 // h+=Sigma0(a) - add x6,x6,x11 - ldr x11,[sp,#0] - str x14,[sp,#24] - ror x16,x21,#14 - add x24,x24,x28 // h+=K[i] - ror x13,x8,#1 - and x17,x22,x21 - ror x12,x5,#19 - bic x28,x23,x21 - ror x14,x25,#28 - add x24,x24,x6 // h+=X[i] - eor x16,x16,x21,ror#18 - eor x13,x13,x8,ror#8 - orr x17,x17,x28 // Ch(e,f,g) - eor x28,x25,x26 // a^b, b^c in next round - eor x16,x16,x21,ror#41 // Sigma1(e) - eor x14,x14,x25,ror#34 - add x24,x24,x17 // h+=Ch(e,f,g) - and x19,x19,x28 // (b^c)&=(a^b) - eor x12,x12,x5,ror#61 - eor x13,x13,x8,lsr#7 // sigma0(X[i+1]) - add x24,x24,x16 // h+=Sigma1(e) - eor x19,x19,x26 // Maj(a,b,c) - eor x17,x14,x25,ror#39 // Sigma0(a) - eor x12,x12,x5,lsr#6 // sigma1(X[i+14]) - add x7,x7,x0 - add x20,x20,x24 // d+=h - add x24,x24,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round - add x7,x7,x13 - add x24,x24,x17 // h+=Sigma0(a) - add x7,x7,x12 - ldr x12,[sp,#8] - str x15,[sp,#0] - ror x16,x20,#14 - add x23,x23,x19 // h+=K[i] - ror x14,x9,#1 - and 
x17,x21,x20 - ror x13,x6,#19 - bic x19,x22,x20 - ror x15,x24,#28 - add x23,x23,x7 // h+=X[i] - eor x16,x16,x20,ror#18 - eor x14,x14,x9,ror#8 - orr x17,x17,x19 // Ch(e,f,g) - eor x19,x24,x25 // a^b, b^c in next round - eor x16,x16,x20,ror#41 // Sigma1(e) - eor x15,x15,x24,ror#34 - add x23,x23,x17 // h+=Ch(e,f,g) - and x28,x28,x19 // (b^c)&=(a^b) - eor x13,x13,x6,ror#61 - eor x14,x14,x9,lsr#7 // sigma0(X[i+1]) - add x23,x23,x16 // h+=Sigma1(e) - eor x28,x28,x25 // Maj(a,b,c) - eor x17,x15,x24,ror#39 // Sigma0(a) - eor x13,x13,x6,lsr#6 // sigma1(X[i+14]) - add x8,x8,x1 - add x27,x27,x23 // d+=h - add x23,x23,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round - add x8,x8,x14 - add x23,x23,x17 // h+=Sigma0(a) - add x8,x8,x13 - ldr x13,[sp,#16] - str x0,[sp,#8] - ror x16,x27,#14 - add x22,x22,x28 // h+=K[i] - ror x15,x10,#1 - and x17,x20,x27 - ror x14,x7,#19 - bic x28,x21,x27 - ror x0,x23,#28 - add x22,x22,x8 // h+=X[i] - eor x16,x16,x27,ror#18 - eor x15,x15,x10,ror#8 - orr x17,x17,x28 // Ch(e,f,g) - eor x28,x23,x24 // a^b, b^c in next round - eor x16,x16,x27,ror#41 // Sigma1(e) - eor x0,x0,x23,ror#34 - add x22,x22,x17 // h+=Ch(e,f,g) - and x19,x19,x28 // (b^c)&=(a^b) - eor x14,x14,x7,ror#61 - eor x15,x15,x10,lsr#7 // sigma0(X[i+1]) - add x22,x22,x16 // h+=Sigma1(e) - eor x19,x19,x24 // Maj(a,b,c) - eor x17,x0,x23,ror#39 // Sigma0(a) - eor x14,x14,x7,lsr#6 // sigma1(X[i+14]) - add x9,x9,x2 - add x26,x26,x22 // d+=h - add x22,x22,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round - add x9,x9,x15 - add x22,x22,x17 // h+=Sigma0(a) - add x9,x9,x14 - ldr x14,[sp,#24] - str x1,[sp,#16] - ror x16,x26,#14 - add x21,x21,x19 // h+=K[i] - ror x0,x11,#1 - and x17,x27,x26 - ror x15,x8,#19 - bic x19,x20,x26 - ror x1,x22,#28 - add x21,x21,x9 // h+=X[i] - eor x16,x16,x26,ror#18 - eor x0,x0,x11,ror#8 - orr x17,x17,x19 // Ch(e,f,g) - eor x19,x22,x23 // a^b, b^c in next round - eor x16,x16,x26,ror#41 // Sigma1(e) - eor x1,x1,x22,ror#34 - add x21,x21,x17 // 
h+=Ch(e,f,g) - and x28,x28,x19 // (b^c)&=(a^b) - eor x15,x15,x8,ror#61 - eor x0,x0,x11,lsr#7 // sigma0(X[i+1]) - add x21,x21,x16 // h+=Sigma1(e) - eor x28,x28,x23 // Maj(a,b,c) - eor x17,x1,x22,ror#39 // Sigma0(a) - eor x15,x15,x8,lsr#6 // sigma1(X[i+14]) - add x10,x10,x3 - add x25,x25,x21 // d+=h - add x21,x21,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round - add x10,x10,x0 - add x21,x21,x17 // h+=Sigma0(a) - add x10,x10,x15 - ldr x15,[sp,#0] - str x2,[sp,#24] - ror x16,x25,#14 - add x20,x20,x28 // h+=K[i] - ror x1,x12,#1 - and x17,x26,x25 - ror x0,x9,#19 - bic x28,x27,x25 - ror x2,x21,#28 - add x20,x20,x10 // h+=X[i] - eor x16,x16,x25,ror#18 - eor x1,x1,x12,ror#8 - orr x17,x17,x28 // Ch(e,f,g) - eor x28,x21,x22 // a^b, b^c in next round - eor x16,x16,x25,ror#41 // Sigma1(e) - eor x2,x2,x21,ror#34 - add x20,x20,x17 // h+=Ch(e,f,g) - and x19,x19,x28 // (b^c)&=(a^b) - eor x0,x0,x9,ror#61 - eor x1,x1,x12,lsr#7 // sigma0(X[i+1]) - add x20,x20,x16 // h+=Sigma1(e) - eor x19,x19,x22 // Maj(a,b,c) - eor x17,x2,x21,ror#39 // Sigma0(a) - eor x0,x0,x9,lsr#6 // sigma1(X[i+14]) - add x11,x11,x4 - add x24,x24,x20 // d+=h - add x20,x20,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round - add x11,x11,x1 - add x20,x20,x17 // h+=Sigma0(a) - add x11,x11,x0 - ldr x0,[sp,#8] - str x3,[sp,#0] - ror x16,x24,#14 - add x27,x27,x19 // h+=K[i] - ror x2,x13,#1 - and x17,x25,x24 - ror x1,x10,#19 - bic x19,x26,x24 - ror x3,x20,#28 - add x27,x27,x11 // h+=X[i] - eor x16,x16,x24,ror#18 - eor x2,x2,x13,ror#8 - orr x17,x17,x19 // Ch(e,f,g) - eor x19,x20,x21 // a^b, b^c in next round - eor x16,x16,x24,ror#41 // Sigma1(e) - eor x3,x3,x20,ror#34 - add x27,x27,x17 // h+=Ch(e,f,g) - and x28,x28,x19 // (b^c)&=(a^b) - eor x1,x1,x10,ror#61 - eor x2,x2,x13,lsr#7 // sigma0(X[i+1]) - add x27,x27,x16 // h+=Sigma1(e) - eor x28,x28,x21 // Maj(a,b,c) - eor x17,x3,x20,ror#39 // Sigma0(a) - eor x1,x1,x10,lsr#6 // sigma1(X[i+14]) - add x12,x12,x5 - add x23,x23,x27 // d+=h - add 
x27,x27,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round - add x12,x12,x2 - add x27,x27,x17 // h+=Sigma0(a) - add x12,x12,x1 - ldr x1,[sp,#16] - str x4,[sp,#8] - ror x16,x23,#14 - add x26,x26,x28 // h+=K[i] - ror x3,x14,#1 - and x17,x24,x23 - ror x2,x11,#19 - bic x28,x25,x23 - ror x4,x27,#28 - add x26,x26,x12 // h+=X[i] - eor x16,x16,x23,ror#18 - eor x3,x3,x14,ror#8 - orr x17,x17,x28 // Ch(e,f,g) - eor x28,x27,x20 // a^b, b^c in next round - eor x16,x16,x23,ror#41 // Sigma1(e) - eor x4,x4,x27,ror#34 - add x26,x26,x17 // h+=Ch(e,f,g) - and x19,x19,x28 // (b^c)&=(a^b) - eor x2,x2,x11,ror#61 - eor x3,x3,x14,lsr#7 // sigma0(X[i+1]) - add x26,x26,x16 // h+=Sigma1(e) - eor x19,x19,x20 // Maj(a,b,c) - eor x17,x4,x27,ror#39 // Sigma0(a) - eor x2,x2,x11,lsr#6 // sigma1(X[i+14]) - add x13,x13,x6 - add x22,x22,x26 // d+=h - add x26,x26,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round - add x13,x13,x3 - add x26,x26,x17 // h+=Sigma0(a) - add x13,x13,x2 - ldr x2,[sp,#24] - str x5,[sp,#16] - ror x16,x22,#14 - add x25,x25,x19 // h+=K[i] - ror x4,x15,#1 - and x17,x23,x22 - ror x3,x12,#19 - bic x19,x24,x22 - ror x5,x26,#28 - add x25,x25,x13 // h+=X[i] - eor x16,x16,x22,ror#18 - eor x4,x4,x15,ror#8 - orr x17,x17,x19 // Ch(e,f,g) - eor x19,x26,x27 // a^b, b^c in next round - eor x16,x16,x22,ror#41 // Sigma1(e) - eor x5,x5,x26,ror#34 - add x25,x25,x17 // h+=Ch(e,f,g) - and x28,x28,x19 // (b^c)&=(a^b) - eor x3,x3,x12,ror#61 - eor x4,x4,x15,lsr#7 // sigma0(X[i+1]) - add x25,x25,x16 // h+=Sigma1(e) - eor x28,x28,x27 // Maj(a,b,c) - eor x17,x5,x26,ror#39 // Sigma0(a) - eor x3,x3,x12,lsr#6 // sigma1(X[i+14]) - add x14,x14,x7 - add x21,x21,x25 // d+=h - add x25,x25,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round - add x14,x14,x4 - add x25,x25,x17 // h+=Sigma0(a) - add x14,x14,x3 - ldr x3,[sp,#0] - str x6,[sp,#24] - ror x16,x21,#14 - add x24,x24,x28 // h+=K[i] - ror x5,x0,#1 - and x17,x22,x21 - ror x4,x13,#19 - bic x28,x23,x21 - ror 
x6,x25,#28 - add x24,x24,x14 // h+=X[i] - eor x16,x16,x21,ror#18 - eor x5,x5,x0,ror#8 - orr x17,x17,x28 // Ch(e,f,g) - eor x28,x25,x26 // a^b, b^c in next round - eor x16,x16,x21,ror#41 // Sigma1(e) - eor x6,x6,x25,ror#34 - add x24,x24,x17 // h+=Ch(e,f,g) - and x19,x19,x28 // (b^c)&=(a^b) - eor x4,x4,x13,ror#61 - eor x5,x5,x0,lsr#7 // sigma0(X[i+1]) - add x24,x24,x16 // h+=Sigma1(e) - eor x19,x19,x26 // Maj(a,b,c) - eor x17,x6,x25,ror#39 // Sigma0(a) - eor x4,x4,x13,lsr#6 // sigma1(X[i+14]) - add x15,x15,x8 - add x20,x20,x24 // d+=h - add x24,x24,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round - add x15,x15,x5 - add x24,x24,x17 // h+=Sigma0(a) - add x15,x15,x4 - ldr x4,[sp,#8] - str x7,[sp,#0] - ror x16,x20,#14 - add x23,x23,x19 // h+=K[i] - ror x6,x1,#1 - and x17,x21,x20 - ror x5,x14,#19 - bic x19,x22,x20 - ror x7,x24,#28 - add x23,x23,x15 // h+=X[i] - eor x16,x16,x20,ror#18 - eor x6,x6,x1,ror#8 - orr x17,x17,x19 // Ch(e,f,g) - eor x19,x24,x25 // a^b, b^c in next round - eor x16,x16,x20,ror#41 // Sigma1(e) - eor x7,x7,x24,ror#34 - add x23,x23,x17 // h+=Ch(e,f,g) - and x28,x28,x19 // (b^c)&=(a^b) - eor x5,x5,x14,ror#61 - eor x6,x6,x1,lsr#7 // sigma0(X[i+1]) - add x23,x23,x16 // h+=Sigma1(e) - eor x28,x28,x25 // Maj(a,b,c) - eor x17,x7,x24,ror#39 // Sigma0(a) - eor x5,x5,x14,lsr#6 // sigma1(X[i+14]) - add x0,x0,x9 - add x27,x27,x23 // d+=h - add x23,x23,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round - add x0,x0,x6 - add x23,x23,x17 // h+=Sigma0(a) - add x0,x0,x5 - ldr x5,[sp,#16] - str x8,[sp,#8] - ror x16,x27,#14 - add x22,x22,x28 // h+=K[i] - ror x7,x2,#1 - and x17,x20,x27 - ror x6,x15,#19 - bic x28,x21,x27 - ror x8,x23,#28 - add x22,x22,x0 // h+=X[i] - eor x16,x16,x27,ror#18 - eor x7,x7,x2,ror#8 - orr x17,x17,x28 // Ch(e,f,g) - eor x28,x23,x24 // a^b, b^c in next round - eor x16,x16,x27,ror#41 // Sigma1(e) - eor x8,x8,x23,ror#34 - add x22,x22,x17 // h+=Ch(e,f,g) - and x19,x19,x28 // (b^c)&=(a^b) - eor x6,x6,x15,ror#61 - eor 
x7,x7,x2,lsr#7 // sigma0(X[i+1]) - add x22,x22,x16 // h+=Sigma1(e) - eor x19,x19,x24 // Maj(a,b,c) - eor x17,x8,x23,ror#39 // Sigma0(a) - eor x6,x6,x15,lsr#6 // sigma1(X[i+14]) - add x1,x1,x10 - add x26,x26,x22 // d+=h - add x22,x22,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round - add x1,x1,x7 - add x22,x22,x17 // h+=Sigma0(a) - add x1,x1,x6 - ldr x6,[sp,#24] - str x9,[sp,#16] - ror x16,x26,#14 - add x21,x21,x19 // h+=K[i] - ror x8,x3,#1 - and x17,x27,x26 - ror x7,x0,#19 - bic x19,x20,x26 - ror x9,x22,#28 - add x21,x21,x1 // h+=X[i] - eor x16,x16,x26,ror#18 - eor x8,x8,x3,ror#8 - orr x17,x17,x19 // Ch(e,f,g) - eor x19,x22,x23 // a^b, b^c in next round - eor x16,x16,x26,ror#41 // Sigma1(e) - eor x9,x9,x22,ror#34 - add x21,x21,x17 // h+=Ch(e,f,g) - and x28,x28,x19 // (b^c)&=(a^b) - eor x7,x7,x0,ror#61 - eor x8,x8,x3,lsr#7 // sigma0(X[i+1]) - add x21,x21,x16 // h+=Sigma1(e) - eor x28,x28,x23 // Maj(a,b,c) - eor x17,x9,x22,ror#39 // Sigma0(a) - eor x7,x7,x0,lsr#6 // sigma1(X[i+14]) - add x2,x2,x11 - add x25,x25,x21 // d+=h - add x21,x21,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round - add x2,x2,x8 - add x21,x21,x17 // h+=Sigma0(a) - add x2,x2,x7 - ldr x7,[sp,#0] - str x10,[sp,#24] - ror x16,x25,#14 - add x20,x20,x28 // h+=K[i] - ror x9,x4,#1 - and x17,x26,x25 - ror x8,x1,#19 - bic x28,x27,x25 - ror x10,x21,#28 - add x20,x20,x2 // h+=X[i] - eor x16,x16,x25,ror#18 - eor x9,x9,x4,ror#8 - orr x17,x17,x28 // Ch(e,f,g) - eor x28,x21,x22 // a^b, b^c in next round - eor x16,x16,x25,ror#41 // Sigma1(e) - eor x10,x10,x21,ror#34 - add x20,x20,x17 // h+=Ch(e,f,g) - and x19,x19,x28 // (b^c)&=(a^b) - eor x8,x8,x1,ror#61 - eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) - add x20,x20,x16 // h+=Sigma1(e) - eor x19,x19,x22 // Maj(a,b,c) - eor x17,x10,x21,ror#39 // Sigma0(a) - eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) - add x3,x3,x12 - add x24,x24,x20 // d+=h - add x20,x20,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round - add x3,x3,x9 - add 
x20,x20,x17 // h+=Sigma0(a) - add x3,x3,x8 - cbnz x19,Loop_16_xx - - ldp x0,x2,[x29,#96] - ldr x1,[x29,#112] - sub x30,x30,#648 // rewind - - ldp x3,x4,[x0] - ldp x5,x6,[x0,#2*8] - add x1,x1,#14*8 // advance input pointer - ldp x7,x8,[x0,#4*8] - add x20,x20,x3 - ldp x9,x10,[x0,#6*8] - add x21,x21,x4 - add x22,x22,x5 - add x23,x23,x6 - stp x20,x21,[x0] - add x24,x24,x7 - add x25,x25,x8 - stp x22,x23,[x0,#2*8] - add x26,x26,x9 - add x27,x27,x10 - cmp x1,x2 - stp x24,x25,[x0,#4*8] - stp x26,x27,[x0,#6*8] - b.ne Loop - - ldp x19,x20,[x29,#16] - add sp,sp,#4*8 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#128 - AARCH64_VALIDATE_LINK_REGISTER - ret - - -.section __TEXT,__const -.align 6 - -LK512: -.quad 0x428a2f98d728ae22,0x7137449123ef65cd -.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc -.quad 0x3956c25bf348b538,0x59f111f1b605d019 -.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 -.quad 0xd807aa98a3030242,0x12835b0145706fbe -.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 -.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 -.quad 0x9bdc06a725c71235,0xc19bf174cf692694 -.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 -.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 -.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 -.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 -.quad 0x983e5152ee66dfab,0xa831c66d2db43210 -.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 -.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 -.quad 0x06ca6351e003826f,0x142929670a0e6e70 -.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 -.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df -.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 -.quad 0x81c2c92e47edaee6,0x92722c851482353b -.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 -.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 -.quad 0xd192e819d6ef5218,0xd69906245565a910 -.quad 0xf40e35855771202a,0x106aa07032bbd1b8 -.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 -.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 -.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb -.quad 
0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 -.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 -.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec -.quad 0x90befffa23631e28,0xa4506cebde82bde9 -.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b -.quad 0xca273eceea26619c,0xd186b8c721c0c207 -.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 -.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 -.quad 0x113f9804bef90dae,0x1b710b35131c471b -.quad 0x28db77f523047d84,0x32caab7b40c72493 -.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c -.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a -.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 -.quad 0 // terminator - -.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 -.align 2 -.text -#ifndef __KERNEL__ - -.align 6 -sha512_block_armv8: -Lv8_entry: - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - - ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 // load input - ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 - - ld1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // load context - adrp x3,LK512@PAGE - add x3,x3,LK512@PAGEOFF - - rev64 v16.16b,v16.16b - rev64 v17.16b,v17.16b - rev64 v18.16b,v18.16b - rev64 v19.16b,v19.16b - rev64 v20.16b,v20.16b - rev64 v21.16b,v21.16b - rev64 v22.16b,v22.16b - rev64 v23.16b,v23.16b - b Loop_hw - -.align 4 -Loop_hw: - ld1 {v24.2d},[x3],#16 - subs x2,x2,#1 - sub x4,x1,#128 - orr v26.16b,v0.16b,v0.16b // offload - orr v27.16b,v1.16b,v1.16b - orr v28.16b,v2.16b,v2.16b - orr v29.16b,v3.16b,v3.16b - csel x1,x1,x4,ne // conditional rewind - add v24.2d,v24.2d,v16.2d - ld1 {v25.2d},[x3],#16 - ext v24.16b,v24.16b,v24.16b,#8 - ext v5.16b,v2.16b,v3.16b,#8 - ext v6.16b,v1.16b,v2.16b,#8 - add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" -.long 0xcec08230 //sha512su0 v16.16b,v17.16b - ext v7.16b,v20.16b,v21.16b,#8 -.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b -.long 0xce678af0 //sha512su1 
v16.16b,v23.16b,v7.16b - add v4.2d,v1.2d,v3.2d // "D + T1" -.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b - add v25.2d,v25.2d,v17.2d - ld1 {v24.2d},[x3],#16 - ext v25.16b,v25.16b,v25.16b,#8 - ext v5.16b,v4.16b,v2.16b,#8 - ext v6.16b,v0.16b,v4.16b,#8 - add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" -.long 0xcec08251 //sha512su0 v17.16b,v18.16b - ext v7.16b,v21.16b,v22.16b,#8 -.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b -.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b - add v1.2d,v0.2d,v2.2d // "D + T1" -.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b - add v24.2d,v24.2d,v18.2d - ld1 {v25.2d},[x3],#16 - ext v24.16b,v24.16b,v24.16b,#8 - ext v5.16b,v1.16b,v4.16b,#8 - ext v6.16b,v3.16b,v1.16b,#8 - add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" -.long 0xcec08272 //sha512su0 v18.16b,v19.16b - ext v7.16b,v22.16b,v23.16b,#8 -.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b -.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b - add v0.2d,v3.2d,v4.2d // "D + T1" -.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b - add v25.2d,v25.2d,v19.2d - ld1 {v24.2d},[x3],#16 - ext v25.16b,v25.16b,v25.16b,#8 - ext v5.16b,v0.16b,v1.16b,#8 - ext v6.16b,v2.16b,v0.16b,#8 - add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" -.long 0xcec08293 //sha512su0 v19.16b,v20.16b - ext v7.16b,v23.16b,v16.16b,#8 -.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b -.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b - add v3.2d,v2.2d,v1.2d // "D + T1" -.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b - add v24.2d,v24.2d,v20.2d - ld1 {v25.2d},[x3],#16 - ext v24.16b,v24.16b,v24.16b,#8 - ext v5.16b,v3.16b,v0.16b,#8 - ext v6.16b,v4.16b,v3.16b,#8 - add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" -.long 0xcec082b4 //sha512su0 v20.16b,v21.16b - ext v7.16b,v16.16b,v17.16b,#8 -.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b -.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b - add v2.2d,v4.2d,v0.2d // "D + T1" -.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b - add v25.2d,v25.2d,v21.2d - ld1 {v24.2d},[x3],#16 - ext 
v25.16b,v25.16b,v25.16b,#8 - ext v5.16b,v2.16b,v3.16b,#8 - ext v6.16b,v1.16b,v2.16b,#8 - add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" -.long 0xcec082d5 //sha512su0 v21.16b,v22.16b - ext v7.16b,v17.16b,v18.16b,#8 -.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b -.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b - add v4.2d,v1.2d,v3.2d // "D + T1" -.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b - add v24.2d,v24.2d,v22.2d - ld1 {v25.2d},[x3],#16 - ext v24.16b,v24.16b,v24.16b,#8 - ext v5.16b,v4.16b,v2.16b,#8 - ext v6.16b,v0.16b,v4.16b,#8 - add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" -.long 0xcec082f6 //sha512su0 v22.16b,v23.16b - ext v7.16b,v18.16b,v19.16b,#8 -.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b -.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b - add v1.2d,v0.2d,v2.2d // "D + T1" -.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b - add v25.2d,v25.2d,v23.2d - ld1 {v24.2d},[x3],#16 - ext v25.16b,v25.16b,v25.16b,#8 - ext v5.16b,v1.16b,v4.16b,#8 - ext v6.16b,v3.16b,v1.16b,#8 - add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" -.long 0xcec08217 //sha512su0 v23.16b,v16.16b - ext v7.16b,v19.16b,v20.16b,#8 -.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b -.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b - add v0.2d,v3.2d,v4.2d // "D + T1" -.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b - add v24.2d,v24.2d,v16.2d - ld1 {v25.2d},[x3],#16 - ext v24.16b,v24.16b,v24.16b,#8 - ext v5.16b,v0.16b,v1.16b,#8 - ext v6.16b,v2.16b,v0.16b,#8 - add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" -.long 0xcec08230 //sha512su0 v16.16b,v17.16b - ext v7.16b,v20.16b,v21.16b,#8 -.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b -.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b - add v3.2d,v2.2d,v1.2d // "D + T1" -.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b - add v25.2d,v25.2d,v17.2d - ld1 {v24.2d},[x3],#16 - ext v25.16b,v25.16b,v25.16b,#8 - ext v5.16b,v3.16b,v0.16b,#8 - ext v6.16b,v4.16b,v3.16b,#8 - add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" -.long 0xcec08251 //sha512su0 
v17.16b,v18.16b - ext v7.16b,v21.16b,v22.16b,#8 -.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b -.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b - add v2.2d,v4.2d,v0.2d // "D + T1" -.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b - add v24.2d,v24.2d,v18.2d - ld1 {v25.2d},[x3],#16 - ext v24.16b,v24.16b,v24.16b,#8 - ext v5.16b,v2.16b,v3.16b,#8 - ext v6.16b,v1.16b,v2.16b,#8 - add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" -.long 0xcec08272 //sha512su0 v18.16b,v19.16b - ext v7.16b,v22.16b,v23.16b,#8 -.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b -.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b - add v4.2d,v1.2d,v3.2d // "D + T1" -.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b - add v25.2d,v25.2d,v19.2d - ld1 {v24.2d},[x3],#16 - ext v25.16b,v25.16b,v25.16b,#8 - ext v5.16b,v4.16b,v2.16b,#8 - ext v6.16b,v0.16b,v4.16b,#8 - add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" -.long 0xcec08293 //sha512su0 v19.16b,v20.16b - ext v7.16b,v23.16b,v16.16b,#8 -.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b -.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b - add v1.2d,v0.2d,v2.2d // "D + T1" -.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b - add v24.2d,v24.2d,v20.2d - ld1 {v25.2d},[x3],#16 - ext v24.16b,v24.16b,v24.16b,#8 - ext v5.16b,v1.16b,v4.16b,#8 - ext v6.16b,v3.16b,v1.16b,#8 - add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" -.long 0xcec082b4 //sha512su0 v20.16b,v21.16b - ext v7.16b,v16.16b,v17.16b,#8 -.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b -.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b - add v0.2d,v3.2d,v4.2d // "D + T1" -.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b - add v25.2d,v25.2d,v21.2d - ld1 {v24.2d},[x3],#16 - ext v25.16b,v25.16b,v25.16b,#8 - ext v5.16b,v0.16b,v1.16b,#8 - ext v6.16b,v2.16b,v0.16b,#8 - add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" -.long 0xcec082d5 //sha512su0 v21.16b,v22.16b - ext v7.16b,v17.16b,v18.16b,#8 -.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b -.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b - add 
v3.2d,v2.2d,v1.2d // "D + T1" -.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b - add v24.2d,v24.2d,v22.2d - ld1 {v25.2d},[x3],#16 - ext v24.16b,v24.16b,v24.16b,#8 - ext v5.16b,v3.16b,v0.16b,#8 - ext v6.16b,v4.16b,v3.16b,#8 - add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" -.long 0xcec082f6 //sha512su0 v22.16b,v23.16b - ext v7.16b,v18.16b,v19.16b,#8 -.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b -.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b - add v2.2d,v4.2d,v0.2d // "D + T1" -.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b - add v25.2d,v25.2d,v23.2d - ld1 {v24.2d},[x3],#16 - ext v25.16b,v25.16b,v25.16b,#8 - ext v5.16b,v2.16b,v3.16b,#8 - ext v6.16b,v1.16b,v2.16b,#8 - add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" -.long 0xcec08217 //sha512su0 v23.16b,v16.16b - ext v7.16b,v19.16b,v20.16b,#8 -.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b -.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b - add v4.2d,v1.2d,v3.2d // "D + T1" -.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b - add v24.2d,v24.2d,v16.2d - ld1 {v25.2d},[x3],#16 - ext v24.16b,v24.16b,v24.16b,#8 - ext v5.16b,v4.16b,v2.16b,#8 - ext v6.16b,v0.16b,v4.16b,#8 - add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" -.long 0xcec08230 //sha512su0 v16.16b,v17.16b - ext v7.16b,v20.16b,v21.16b,#8 -.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b -.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b - add v1.2d,v0.2d,v2.2d // "D + T1" -.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b - add v25.2d,v25.2d,v17.2d - ld1 {v24.2d},[x3],#16 - ext v25.16b,v25.16b,v25.16b,#8 - ext v5.16b,v1.16b,v4.16b,#8 - ext v6.16b,v3.16b,v1.16b,#8 - add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" -.long 0xcec08251 //sha512su0 v17.16b,v18.16b - ext v7.16b,v21.16b,v22.16b,#8 -.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b -.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b - add v0.2d,v3.2d,v4.2d // "D + T1" -.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b - add v24.2d,v24.2d,v18.2d - ld1 {v25.2d},[x3],#16 - ext v24.16b,v24.16b,v24.16b,#8 - ext 
v5.16b,v0.16b,v1.16b,#8 - ext v6.16b,v2.16b,v0.16b,#8 - add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" -.long 0xcec08272 //sha512su0 v18.16b,v19.16b - ext v7.16b,v22.16b,v23.16b,#8 -.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b -.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b - add v3.2d,v2.2d,v1.2d // "D + T1" -.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b - add v25.2d,v25.2d,v19.2d - ld1 {v24.2d},[x3],#16 - ext v25.16b,v25.16b,v25.16b,#8 - ext v5.16b,v3.16b,v0.16b,#8 - ext v6.16b,v4.16b,v3.16b,#8 - add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" -.long 0xcec08293 //sha512su0 v19.16b,v20.16b - ext v7.16b,v23.16b,v16.16b,#8 -.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b -.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b - add v2.2d,v4.2d,v0.2d // "D + T1" -.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b - add v24.2d,v24.2d,v20.2d - ld1 {v25.2d},[x3],#16 - ext v24.16b,v24.16b,v24.16b,#8 - ext v5.16b,v2.16b,v3.16b,#8 - ext v6.16b,v1.16b,v2.16b,#8 - add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" -.long 0xcec082b4 //sha512su0 v20.16b,v21.16b - ext v7.16b,v16.16b,v17.16b,#8 -.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b -.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b - add v4.2d,v1.2d,v3.2d // "D + T1" -.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b - add v25.2d,v25.2d,v21.2d - ld1 {v24.2d},[x3],#16 - ext v25.16b,v25.16b,v25.16b,#8 - ext v5.16b,v4.16b,v2.16b,#8 - ext v6.16b,v0.16b,v4.16b,#8 - add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" -.long 0xcec082d5 //sha512su0 v21.16b,v22.16b - ext v7.16b,v17.16b,v18.16b,#8 -.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b -.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b - add v1.2d,v0.2d,v2.2d // "D + T1" -.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b - add v24.2d,v24.2d,v22.2d - ld1 {v25.2d},[x3],#16 - ext v24.16b,v24.16b,v24.16b,#8 - ext v5.16b,v1.16b,v4.16b,#8 - ext v6.16b,v3.16b,v1.16b,#8 - add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" -.long 0xcec082f6 //sha512su0 v22.16b,v23.16b - ext 
v7.16b,v18.16b,v19.16b,#8 -.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b -.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b - add v0.2d,v3.2d,v4.2d // "D + T1" -.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b - add v25.2d,v25.2d,v23.2d - ld1 {v24.2d},[x3],#16 - ext v25.16b,v25.16b,v25.16b,#8 - ext v5.16b,v0.16b,v1.16b,#8 - ext v6.16b,v2.16b,v0.16b,#8 - add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" -.long 0xcec08217 //sha512su0 v23.16b,v16.16b - ext v7.16b,v19.16b,v20.16b,#8 -.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b -.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b - add v3.2d,v2.2d,v1.2d // "D + T1" -.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b - add v24.2d,v24.2d,v16.2d - ld1 {v25.2d},[x3],#16 - ext v24.16b,v24.16b,v24.16b,#8 - ext v5.16b,v3.16b,v0.16b,#8 - ext v6.16b,v4.16b,v3.16b,#8 - add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" -.long 0xcec08230 //sha512su0 v16.16b,v17.16b - ext v7.16b,v20.16b,v21.16b,#8 -.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b -.long 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b - add v2.2d,v4.2d,v0.2d // "D + T1" -.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b - add v25.2d,v25.2d,v17.2d - ld1 {v24.2d},[x3],#16 - ext v25.16b,v25.16b,v25.16b,#8 - ext v5.16b,v2.16b,v3.16b,#8 - ext v6.16b,v1.16b,v2.16b,#8 - add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" -.long 0xcec08251 //sha512su0 v17.16b,v18.16b - ext v7.16b,v21.16b,v22.16b,#8 -.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b -.long 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b - add v4.2d,v1.2d,v3.2d // "D + T1" -.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b - add v24.2d,v24.2d,v18.2d - ld1 {v25.2d},[x3],#16 - ext v24.16b,v24.16b,v24.16b,#8 - ext v5.16b,v4.16b,v2.16b,#8 - ext v6.16b,v0.16b,v4.16b,#8 - add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" -.long 0xcec08272 //sha512su0 v18.16b,v19.16b - ext v7.16b,v22.16b,v23.16b,#8 -.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b -.long 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b - add v1.2d,v0.2d,v2.2d // "D + T1" -.long 
0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b - add v25.2d,v25.2d,v19.2d - ld1 {v24.2d},[x3],#16 - ext v25.16b,v25.16b,v25.16b,#8 - ext v5.16b,v1.16b,v4.16b,#8 - ext v6.16b,v3.16b,v1.16b,#8 - add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" -.long 0xcec08293 //sha512su0 v19.16b,v20.16b - ext v7.16b,v23.16b,v16.16b,#8 -.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b -.long 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b - add v0.2d,v3.2d,v4.2d // "D + T1" -.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b - add v24.2d,v24.2d,v20.2d - ld1 {v25.2d},[x3],#16 - ext v24.16b,v24.16b,v24.16b,#8 - ext v5.16b,v0.16b,v1.16b,#8 - ext v6.16b,v2.16b,v0.16b,#8 - add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" -.long 0xcec082b4 //sha512su0 v20.16b,v21.16b - ext v7.16b,v16.16b,v17.16b,#8 -.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b -.long 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b - add v3.2d,v2.2d,v1.2d // "D + T1" -.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b - add v25.2d,v25.2d,v21.2d - ld1 {v24.2d},[x3],#16 - ext v25.16b,v25.16b,v25.16b,#8 - ext v5.16b,v3.16b,v0.16b,#8 - ext v6.16b,v4.16b,v3.16b,#8 - add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" -.long 0xcec082d5 //sha512su0 v21.16b,v22.16b - ext v7.16b,v17.16b,v18.16b,#8 -.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b -.long 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b - add v2.2d,v4.2d,v0.2d // "D + T1" -.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b - add v24.2d,v24.2d,v22.2d - ld1 {v25.2d},[x3],#16 - ext v24.16b,v24.16b,v24.16b,#8 - ext v5.16b,v2.16b,v3.16b,#8 - ext v6.16b,v1.16b,v2.16b,#8 - add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" -.long 0xcec082f6 //sha512su0 v22.16b,v23.16b - ext v7.16b,v18.16b,v19.16b,#8 -.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b -.long 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b - add v4.2d,v1.2d,v3.2d // "D + T1" -.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b - add v25.2d,v25.2d,v23.2d - ld1 {v24.2d},[x3],#16 - ext v25.16b,v25.16b,v25.16b,#8 - ext v5.16b,v4.16b,v2.16b,#8 - ext 
v6.16b,v0.16b,v4.16b,#8 - add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" -.long 0xcec08217 //sha512su0 v23.16b,v16.16b - ext v7.16b,v19.16b,v20.16b,#8 -.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b -.long 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b - add v1.2d,v0.2d,v2.2d // "D + T1" -.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b - ld1 {v25.2d},[x3],#16 - add v24.2d,v24.2d,v16.2d - ld1 {v16.16b},[x1],#16 // load next input - ext v24.16b,v24.16b,v24.16b,#8 - ext v5.16b,v1.16b,v4.16b,#8 - ext v6.16b,v3.16b,v1.16b,#8 - add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" -.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b - rev64 v16.16b,v16.16b - add v0.2d,v3.2d,v4.2d // "D + T1" -.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b - ld1 {v24.2d},[x3],#16 - add v25.2d,v25.2d,v17.2d - ld1 {v17.16b},[x1],#16 // load next input - ext v25.16b,v25.16b,v25.16b,#8 - ext v5.16b,v0.16b,v1.16b,#8 - ext v6.16b,v2.16b,v0.16b,#8 - add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" -.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b - rev64 v17.16b,v17.16b - add v3.2d,v2.2d,v1.2d // "D + T1" -.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b - ld1 {v25.2d},[x3],#16 - add v24.2d,v24.2d,v18.2d - ld1 {v18.16b},[x1],#16 // load next input - ext v24.16b,v24.16b,v24.16b,#8 - ext v5.16b,v3.16b,v0.16b,#8 - ext v6.16b,v4.16b,v3.16b,#8 - add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" -.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b - rev64 v18.16b,v18.16b - add v2.2d,v4.2d,v0.2d // "D + T1" -.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b - ld1 {v24.2d},[x3],#16 - add v25.2d,v25.2d,v19.2d - ld1 {v19.16b},[x1],#16 // load next input - ext v25.16b,v25.16b,v25.16b,#8 - ext v5.16b,v2.16b,v3.16b,#8 - ext v6.16b,v1.16b,v2.16b,#8 - add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" -.long 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b - rev64 v19.16b,v19.16b - add v4.2d,v1.2d,v3.2d // "D + T1" -.long 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b - ld1 {v25.2d},[x3],#16 - add v24.2d,v24.2d,v20.2d - ld1 {v20.16b},[x1],#16 // load 
next input - ext v24.16b,v24.16b,v24.16b,#8 - ext v5.16b,v4.16b,v2.16b,#8 - ext v6.16b,v0.16b,v4.16b,#8 - add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" -.long 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b - rev64 v20.16b,v20.16b - add v1.2d,v0.2d,v2.2d // "D + T1" -.long 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b - ld1 {v24.2d},[x3],#16 - add v25.2d,v25.2d,v21.2d - ld1 {v21.16b},[x1],#16 // load next input - ext v25.16b,v25.16b,v25.16b,#8 - ext v5.16b,v1.16b,v4.16b,#8 - ext v6.16b,v3.16b,v1.16b,#8 - add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" -.long 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b - rev64 v21.16b,v21.16b - add v0.2d,v3.2d,v4.2d // "D + T1" -.long 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b - ld1 {v25.2d},[x3],#16 - add v24.2d,v24.2d,v22.2d - ld1 {v22.16b},[x1],#16 // load next input - ext v24.16b,v24.16b,v24.16b,#8 - ext v5.16b,v0.16b,v1.16b,#8 - ext v6.16b,v2.16b,v0.16b,#8 - add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" -.long 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b - rev64 v22.16b,v22.16b - add v3.2d,v2.2d,v1.2d // "D + T1" -.long 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b - sub x3,x3,#80*8 // rewind - add v25.2d,v25.2d,v23.2d - ld1 {v23.16b},[x1],#16 // load next input - ext v25.16b,v25.16b,v25.16b,#8 - ext v5.16b,v3.16b,v0.16b,#8 - ext v6.16b,v4.16b,v3.16b,#8 - add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" -.long 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b - rev64 v23.16b,v23.16b - add v2.2d,v4.2d,v0.2d // "D + T1" -.long 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b - add v0.2d,v0.2d,v26.2d // accumulate - add v1.2d,v1.2d,v27.2d - add v2.2d,v2.2d,v28.2d - add v3.2d,v3.2d,v29.2d - - cbnz x2,Loop_hw - - st1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // store context - - ldr x29,[sp],#16 - ret - -#endif -#endif // !OPENSSL_NO_ASM diff --git a/third_party/boringssl/apple-aarch64/crypto/fipsmodule/vpaes-armv8.S b/third_party/boringssl/apple-aarch64/crypto/fipsmodule/vpaes-armv8.S deleted file mode 100644 index 6dfc25d9..00000000 --- 
a/third_party/boringssl/apple-aarch64/crypto/fipsmodule/vpaes-armv8.S +++ /dev/null @@ -1,1232 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -#include - -.section __TEXT,__const - - -.align 7 // totally strategic alignment -_vpaes_consts: -Lk_mc_forward: // mc_forward -.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 -.quad 0x080B0A0904070605, 0x000302010C0F0E0D -.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 -.quad 0x000302010C0F0E0D, 0x080B0A0904070605 -Lk_mc_backward: // mc_backward -.quad 0x0605040702010003, 0x0E0D0C0F0A09080B -.quad 0x020100030E0D0C0F, 0x0A09080B06050407 -.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 -.quad 0x0A09080B06050407, 0x020100030E0D0C0F -Lk_sr: // sr -.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 -.quad 0x030E09040F0A0500, 0x0B06010C07020D08 -.quad 0x0F060D040B020900, 0x070E050C030A0108 -.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 - -// -// "Hot" constants -// -Lk_inv: // inv, inva -.quad 0x0E05060F0D080180, 0x040703090A0B0C02 -.quad 0x01040A060F0B0780, 0x030D0E0C02050809 -Lk_ipt: // input transform (lo, hi) -.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 -.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 -Lk_sbo: // sbou, sbot -.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 -.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA -Lk_sb1: // sb1u, sb1t -.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF -.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 -Lk_sb2: // sb2u, sb2t -.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A -.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD - -// -// Decryption stuff -// -Lk_dipt: // decryption input transform -.quad 0x0F505B040B545F00, 0x154A411E114E451A -.quad 0x86E383E660056500, 0x12771772F491F194 -Lk_dsbo: // decryption 
sbox final output -.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D -.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C -Lk_dsb9: // decryption sbox output *9*u, *9*t -.quad 0x851C03539A86D600, 0xCAD51F504F994CC9 -.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 -Lk_dsbd: // decryption sbox output *D*u, *D*t -.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 -.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 -Lk_dsbb: // decryption sbox output *B*u, *B*t -.quad 0xD022649296B44200, 0x602646F6B0F2D404 -.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B -Lk_dsbe: // decryption sbox output *E*u, *E*t -.quad 0x46F2929626D4D000, 0x2242600464B4F6B0 -.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 - -// -// Key schedule constants -// -Lk_dksd: // decryption key schedule: invskew x*D -.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 -.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E -Lk_dksb: // decryption key schedule: invskew x*B -.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 -.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 -Lk_dkse: // decryption key schedule: invskew x*E + 0x63 -.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 -.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 -Lk_dks9: // decryption key schedule: invskew x*9 -.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC -.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE - -Lk_rcon: // rcon -.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 - -Lk_opt: // output transform -.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 -.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 -Lk_deskew: // deskew tables: inverts the sbox's "skew" -.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A -.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 - -.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 -.align 2 - -.align 6 - -.text -## -## _aes_preheat -## -## Fills register %r10 -> .aes_consts (so you can -fPIC) -## and 
%xmm9-%xmm15 as specified below. -## - -.align 4 -_vpaes_encrypt_preheat: - adrp x10, Lk_inv@PAGE - add x10, x10, Lk_inv@PAGEOFF - movi v17.16b, #0x0f - ld1 {v18.2d,v19.2d}, [x10],#32 // Lk_inv - ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64 // Lk_ipt, Lk_sbo - ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10] // Lk_sb1, Lk_sb2 - ret - - -## -## _aes_encrypt_core -## -## AES-encrypt %xmm0. -## -## Inputs: -## %xmm0 = input -## %xmm9-%xmm15 as in _vpaes_preheat -## (%rdx) = scheduled keys -## -## Output in %xmm0 -## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax -## Preserves %xmm6 - %xmm8 so you get some local vectors -## -## - -.align 4 -_vpaes_encrypt_core: - mov x9, x2 - ldr w8, [x2,#240] // pull rounds - adrp x11, Lk_mc_forward@PAGE+16 - add x11, x11, Lk_mc_forward@PAGEOFF+16 - // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo - ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key - and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 - ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0 - tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 - // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi - tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 - eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 - eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 - b Lenc_entry - -.align 4 -Lenc_loop: - // middle of middle round - add x10, x11, #0x40 - tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u - ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[] - tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t - eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k - tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u - eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A - tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t - ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[] - tbl v3.16b, {v0.16b}, v1.16b // 
vpshufb %xmm1, %xmm0, %xmm3 # 0 = B - eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A - tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D - eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B - tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C - eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D - and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4 - eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D - sub w8, w8, #1 // nr-- - -Lenc_entry: - // top of round - and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k - ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i - tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k - eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j - tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i - tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j - eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k - eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k - tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak - tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak - eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io - eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo - ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 - cbnz w8, Lenc_loop - - // middle of last round - add x10, x11, #0x80 - // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo - // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 - tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou - ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[] - tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t - eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k - eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, 
%xmm0 # 0 = A - tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 - ret - - -.globl _vpaes_encrypt -.private_extern _vpaes_encrypt - -.align 4 -_vpaes_encrypt: - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - - ld1 {v7.16b}, [x0] - bl _vpaes_encrypt_preheat - bl _vpaes_encrypt_core - st1 {v0.16b}, [x1] - - ldp x29,x30,[sp],#16 - AARCH64_VALIDATE_LINK_REGISTER - ret - - - -.align 4 -_vpaes_encrypt_2x: - mov x9, x2 - ldr w8, [x2,#240] // pull rounds - adrp x11, Lk_mc_forward@PAGE+16 - add x11, x11, Lk_mc_forward@PAGEOFF+16 - // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo - ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key - and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 - ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 - and v9.16b, v15.16b, v17.16b - ushr v8.16b, v15.16b, #4 - tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 - tbl v9.16b, {v20.16b}, v9.16b - // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi - tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 - tbl v10.16b, {v21.16b}, v8.16b - eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 - eor v8.16b, v9.16b, v16.16b - eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 - eor v8.16b, v8.16b, v10.16b - b Lenc_2x_entry - -.align 4 -Lenc_2x_loop: - // middle of middle round - add x10, x11, #0x40 - tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u - tbl v12.16b, {v25.16b}, v10.16b - ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[] - tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t - tbl v8.16b, {v24.16b}, v11.16b - eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k - eor v12.16b, v12.16b, v16.16b - tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u - tbl v13.16b, {v27.16b}, v10.16b - eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A - eor v8.16b, v8.16b, v12.16b - tbl v2.16b, {v26.16b}, v3.16b // vpshufb 
%xmm3, %xmm14, %xmm2 # 2 = sb2t - tbl v10.16b, {v26.16b}, v11.16b - ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[] - tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B - tbl v11.16b, {v8.16b}, v1.16b - eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A - eor v10.16b, v10.16b, v13.16b - tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D - tbl v8.16b, {v8.16b}, v4.16b - eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B - eor v11.16b, v11.16b, v10.16b - tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C - tbl v12.16b, {v11.16b},v1.16b - eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D - eor v8.16b, v8.16b, v11.16b - and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4 - eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D - eor v8.16b, v8.16b, v12.16b - sub w8, w8, #1 // nr-- - -Lenc_2x_entry: - // top of round - and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k - ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i - and v9.16b, v8.16b, v17.16b - ushr v8.16b, v8.16b, #4 - tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k - tbl v13.16b, {v19.16b},v9.16b - eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j - eor v9.16b, v9.16b, v8.16b - tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i - tbl v11.16b, {v18.16b},v8.16b - tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j - tbl v12.16b, {v18.16b},v9.16b - eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k - eor v11.16b, v11.16b, v13.16b - eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k - eor v12.16b, v12.16b, v13.16b - tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak - tbl v10.16b, {v18.16b},v11.16b - tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak - tbl v11.16b, {v18.16b},v12.16b 
- eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io - eor v10.16b, v10.16b, v9.16b - eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo - eor v11.16b, v11.16b, v8.16b - ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 - cbnz w8, Lenc_2x_loop - - // middle of last round - add x10, x11, #0x80 - // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo - // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 - tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou - tbl v12.16b, {v22.16b}, v10.16b - ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[] - tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t - tbl v8.16b, {v23.16b}, v11.16b - eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k - eor v12.16b, v12.16b, v16.16b - eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A - eor v8.16b, v8.16b, v12.16b - tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0 - tbl v1.16b, {v8.16b},v1.16b - ret - - - -.align 4 -_vpaes_decrypt_preheat: - adrp x10, Lk_inv@PAGE - add x10, x10, Lk_inv@PAGEOFF - movi v17.16b, #0x0f - adrp x11, Lk_dipt@PAGE - add x11, x11, Lk_dipt@PAGEOFF - ld1 {v18.2d,v19.2d}, [x10],#32 // Lk_inv - ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64 // Lk_dipt, Lk_dsbo - ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64 // Lk_dsb9, Lk_dsbd - ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x11] // Lk_dsbb, Lk_dsbe - ret - - -## -## Decryption core -## -## Same API as encryption core. 
-## - -.align 4 -_vpaes_decrypt_core: - mov x9, x2 - ldr w8, [x2,#240] // pull rounds - - // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo - lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11 - eor x11, x11, #0x30 // xor $0x30, %r11 - adrp x10, Lk_sr@PAGE - add x10, x10, Lk_sr@PAGEOFF - and x11, x11, #0x30 // and $0x30, %r11 - add x11, x11, x10 - adrp x10, Lk_mc_forward@PAGE+48 - add x10, x10, Lk_mc_forward@PAGEOFF+48 - - ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key - and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 - ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0 - tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 - ld1 {v5.2d}, [x10] // vmovdqa Lk_mc_forward+48(%rip), %xmm5 - // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi - tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 - eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 - eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 - b Ldec_entry - -.align 4 -Ldec_loop: -// -// Inverse mix columns -// - // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u - // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t - tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u - tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t - eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 - // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu - eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch - // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt - - tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu - tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch - tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt - eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch - // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu - eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch - // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt - - tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu - tbl v0.16b, {v0.16b}, v5.16b // 
vpshufb %xmm5, %xmm0, %xmm0 # MC ch - tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt - eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch - // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu - eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch - // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet - - tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu - tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch - tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet - eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch - ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5 - eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch - sub w8, w8, #1 // sub $1,%rax # nr-- - -Ldec_entry: - // top of round - and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k - ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i - tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k - eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j - tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i - tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j - eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k - eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k - tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak - tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak - eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io - eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo - ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 - cbnz w8, Ldec_loop - - // middle of last round - // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou - tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou - // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot - ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # 
Lk_sr-Lk_dsbd=-0x160 - tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t - eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k - eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A - tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0 - ret - - -.globl _vpaes_decrypt -.private_extern _vpaes_decrypt - -.align 4 -_vpaes_decrypt: - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - - ld1 {v7.16b}, [x0] - bl _vpaes_decrypt_preheat - bl _vpaes_decrypt_core - st1 {v0.16b}, [x1] - - ldp x29,x30,[sp],#16 - AARCH64_VALIDATE_LINK_REGISTER - ret - - -// v14-v15 input, v0-v1 output - -.align 4 -_vpaes_decrypt_2x: - mov x9, x2 - ldr w8, [x2,#240] // pull rounds - - // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo - lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11 - eor x11, x11, #0x30 // xor $0x30, %r11 - adrp x10, Lk_sr@PAGE - add x10, x10, Lk_sr@PAGEOFF - and x11, x11, #0x30 // and $0x30, %r11 - add x11, x11, x10 - adrp x10, Lk_mc_forward@PAGE+48 - add x10, x10, Lk_mc_forward@PAGEOFF+48 - - ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key - and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 - ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 - and v9.16b, v15.16b, v17.16b - ushr v8.16b, v15.16b, #4 - tbl v2.16b, {v20.16b},v1.16b // vpshufb %xmm1, %xmm2, %xmm2 - tbl v10.16b, {v20.16b},v9.16b - ld1 {v5.2d}, [x10] // vmovdqa Lk_mc_forward+48(%rip), %xmm5 - // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi - tbl v0.16b, {v21.16b},v0.16b // vpshufb %xmm0, %xmm1, %xmm0 - tbl v8.16b, {v21.16b},v8.16b - eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 - eor v10.16b, v10.16b, v16.16b - eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 - eor v8.16b, v8.16b, v10.16b - b Ldec_2x_entry - -.align 4 -Ldec_2x_loop: -// -// Inverse mix columns -// - // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u - // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t - tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 
4 = sb9u - tbl v12.16b, {v24.16b}, v10.16b - tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t - tbl v9.16b, {v25.16b}, v11.16b - eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 - eor v8.16b, v12.16b, v16.16b - // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu - eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch - eor v8.16b, v8.16b, v9.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch - // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt - - tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu - tbl v12.16b, {v26.16b}, v10.16b - tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch - tbl v8.16b, {v8.16b},v5.16b - tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt - tbl v9.16b, {v27.16b}, v11.16b - eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch - eor v8.16b, v8.16b, v12.16b - // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu - eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch - eor v8.16b, v8.16b, v9.16b - // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt - - tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu - tbl v12.16b, {v28.16b}, v10.16b - tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch - tbl v8.16b, {v8.16b},v5.16b - tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt - tbl v9.16b, {v29.16b}, v11.16b - eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch - eor v8.16b, v8.16b, v12.16b - // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu - eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch - eor v8.16b, v8.16b, v9.16b - // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet - - tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu - tbl v12.16b, {v30.16b}, v10.16b - tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch - tbl v8.16b, {v8.16b},v5.16b - tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet - tbl v9.16b, {v31.16b}, v11.16b - eor v0.16b, v0.16b, v4.16b // vpxor 
%xmm4, %xmm0, %xmm0 # 4 = ch - eor v8.16b, v8.16b, v12.16b - ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5 - eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch - eor v8.16b, v8.16b, v9.16b - sub w8, w8, #1 // sub $1,%rax # nr-- - -Ldec_2x_entry: - // top of round - and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k - ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i - and v9.16b, v8.16b, v17.16b - ushr v8.16b, v8.16b, #4 - tbl v2.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k - tbl v10.16b, {v19.16b},v9.16b - eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j - eor v9.16b, v9.16b, v8.16b - tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i - tbl v11.16b, {v18.16b},v8.16b - tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j - tbl v12.16b, {v18.16b},v9.16b - eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k - eor v11.16b, v11.16b, v10.16b - eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k - eor v12.16b, v12.16b, v10.16b - tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak - tbl v10.16b, {v18.16b},v11.16b - tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak - tbl v11.16b, {v18.16b},v12.16b - eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io - eor v10.16b, v10.16b, v9.16b - eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo - eor v11.16b, v11.16b, v8.16b - ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 - cbnz w8, Ldec_2x_loop - - // middle of last round - // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou - tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou - tbl v12.16b, {v22.16b}, v10.16b - // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot - tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t - tbl v9.16b, {v23.16b}, v11.16b - ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # 
Lk_sr-Lk_dsbd=-0x160 - eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k - eor v12.16b, v12.16b, v16.16b - eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A - eor v8.16b, v9.16b, v12.16b - tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0 - tbl v1.16b, {v8.16b},v2.16b - ret - -######################################################## -## ## -## AES key schedule ## -## ## -######################################################## - -.align 4 -_vpaes_key_preheat: - adrp x10, Lk_inv@PAGE - add x10, x10, Lk_inv@PAGEOFF - movi v16.16b, #0x5b // Lk_s63 - adrp x11, Lk_sb1@PAGE - add x11, x11, Lk_sb1@PAGEOFF - movi v17.16b, #0x0f // Lk_s0F - ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10] // Lk_inv, Lk_ipt - adrp x10, Lk_dksd@PAGE - add x10, x10, Lk_dksd@PAGEOFF - ld1 {v22.2d,v23.2d}, [x11] // Lk_sb1 - adrp x11, Lk_mc_forward@PAGE - add x11, x11, Lk_mc_forward@PAGEOFF - ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64 // Lk_dksd, Lk_dksb - ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64 // Lk_dkse, Lk_dks9 - ld1 {v8.2d}, [x10] // Lk_rcon - ld1 {v9.2d}, [x11] // Lk_mc_forward[0] - ret - - - -.align 4 -_vpaes_schedule_core: - AARCH64_SIGN_LINK_REGISTER - stp x29, x30, [sp,#-16]! 
- add x29,sp,#0 - - bl _vpaes_key_preheat // load the tables - - ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned) - - // input transform - mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3 - bl _vpaes_schedule_transform - mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7 - - adrp x10, Lk_sr@PAGE // lea Lk_sr(%rip),%r10 - add x10, x10, Lk_sr@PAGEOFF - - add x8, x8, x10 - cbnz w3, Lschedule_am_decrypting - - // encrypting, output zeroth round key after transform - st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) - b Lschedule_go - -Lschedule_am_decrypting: - // decrypting, output zeroth round key after shiftrows - ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 - tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 - st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx) - eor x8, x8, #0x30 // xor $0x30, %r8 - -Lschedule_go: - cmp w1, #192 // cmp $192, %esi - b.hi Lschedule_256 - b.eq Lschedule_192 - // 128: fall though - -## -## .schedule_128 -## -## 128-bit specific part of key schedule. -## -## This schedule is really simple, because all its parts -## are accomplished by the subroutines. -## -Lschedule_128: - mov x0, #10 // mov $10, %esi - -Loop_schedule_128: - sub x0, x0, #1 // dec %esi - bl _vpaes_schedule_round - cbz x0, Lschedule_mangle_last - bl _vpaes_schedule_mangle // write output - b Loop_schedule_128 - -## -## .aes_schedule_192 -## -## 192-bit specific part of key schedule. -## -## The main body of this schedule is the same as the 128-bit -## schedule, but with more smearing. The long, high side is -## stored in %xmm7 as before, and the short, low side is in -## the high bits of %xmm6. -## -## This schedule is somewhat nastier, however, because each -## round produces 192 bits of key material, or 1.5 round keys. -## Therefore, on each cycle we do 2 rounds and produce 3 round -## keys. 
-## -.align 4 -Lschedule_192: - sub x0, x0, #8 - ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) - bl _vpaes_schedule_transform // input transform - mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part - eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4 - ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros - mov x0, #4 // mov $4, %esi - -Loop_schedule_192: - sub x0, x0, #1 // dec %esi - bl _vpaes_schedule_round - ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0 - bl _vpaes_schedule_mangle // save key n - bl _vpaes_schedule_192_smear - bl _vpaes_schedule_mangle // save key n+1 - bl _vpaes_schedule_round - cbz x0, Lschedule_mangle_last - bl _vpaes_schedule_mangle // save key n+2 - bl _vpaes_schedule_192_smear - b Loop_schedule_192 - -## -## .aes_schedule_256 -## -## 256-bit specific part of key schedule. -## -## The structure here is very similar to the 128-bit -## schedule, but with an additional "low side" in -## %xmm6. The low side's rounds are the same as the -## high side's, except no rcon and no rotation. -## -.align 4 -Lschedule_256: - ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) - bl _vpaes_schedule_transform // input transform - mov x0, #7 // mov $7, %esi - -Loop_schedule_256: - sub x0, x0, #1 // dec %esi - bl _vpaes_schedule_mangle // output low result - mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 - - // high round - bl _vpaes_schedule_round - cbz x0, Lschedule_mangle_last - bl _vpaes_schedule_mangle - - // low round. 
swap xmm7 and xmm6 - dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 - movi v4.16b, #0 - mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5 - mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7 - bl _vpaes_schedule_low_round - mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7 - - b Loop_schedule_256 - -## -## .aes_schedule_mangle_last -## -## Mangler for last round of key schedule -## Mangles %xmm0 -## when encrypting, outputs out(%xmm0) ^ 63 -## when decrypting, outputs unskew(%xmm0) -## -## Always called right before return... jumps to cleanup and exits -## -.align 4 -Lschedule_mangle_last: - // schedule last round key from xmm0 - adrp x11, Lk_deskew@PAGE // lea Lk_deskew(%rip),%r11 # prepare to deskew - add x11, x11, Lk_deskew@PAGEOFF - - cbnz w3, Lschedule_mangle_last_dec - - // encrypting - ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1 - adrp x11, Lk_opt@PAGE // lea Lk_opt(%rip), %r11 # prepare to output transform - add x11, x11, Lk_opt@PAGEOFF - add x2, x2, #32 // add $32, %rdx - tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute - -Lschedule_mangle_last_dec: - ld1 {v20.2d,v21.2d}, [x11] // reload constants - sub x2, x2, #16 // add $-16, %rdx - eor v0.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm0 - bl _vpaes_schedule_transform // output transform - st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) # save last key - - // cleanup - eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0 - eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 - eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2 - eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3 - eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 - eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5 - eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6 - eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7 - ldp x29, x30, [sp],#16 - AARCH64_VALIDATE_LINK_REGISTER - ret - - -## -## .aes_schedule_192_smear -## -## Smear the short, low side in the 192-bit key schedule. 
-## -## Inputs: -## %xmm7: high side, b a x y -## %xmm6: low side, d c 0 0 -## %xmm13: 0 -## -## Outputs: -## %xmm6: b+c+d b+c 0 0 -## %xmm0: b+c+d b+c b a -## - -.align 4 -_vpaes_schedule_192_smear: - movi v1.16b, #0 - dup v0.4s, v7.s[3] - ins v1.s[3], v6.s[2] // vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 - ins v0.s[0], v7.s[2] // vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a - eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 - eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 - eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a - mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0 - ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros - ret - - -## -## .aes_schedule_round -## -## Runs one main round of the key schedule on %xmm0, %xmm7 -## -## Specifically, runs subbytes on the high dword of %xmm0 -## then rotates it by one byte and xors into the low dword of -## %xmm7. -## -## Adds rcon from low byte of %xmm8, then rotates %xmm8 for -## next rcon. -## -## Smears the dwords of %xmm7 by xoring the low into the -## second low, result into third, result into highest. -## -## Returns results in %xmm7 = %xmm0. -## Clobbers %xmm1-%xmm4, %r11. -## - -.align 4 -_vpaes_schedule_round: - // extract rcon from xmm8 - movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4 - ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1 - ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8 - eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 - - // rotate - dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 - ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0 - - // fall through... - - // low round: same as high round, but no rotation and no rcon. 
-_vpaes_schedule_low_round: - // smear xmm7 - ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1 - eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 - ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4 - - // subbytes - and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k - ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i - eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7 - tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k - eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j - tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i - eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k - tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j - eor v7.16b, v7.16b, v16.16b // vpxor Lk_s63(%rip), %xmm7, %xmm7 - tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak - eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k - tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak - eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io - eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo - tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou - tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t - eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output - - // add in smeared stuff - eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0 - eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7 - ret - - -## -## .aes_schedule_transform -## -## Linear-transform %xmm0 according to tables at (%r11) -## -## Requires that %xmm9 = 0x0F0F... 
as in preheat -## Output in %xmm0 -## Clobbers %xmm1, %xmm2 -## - -.align 4 -_vpaes_schedule_transform: - and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 - ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 - // vmovdqa (%r11), %xmm2 # lo - tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 - // vmovdqa 16(%r11), %xmm1 # hi - tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 - eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 - ret - - -## -## .aes_schedule_mangle -## -## Mangle xmm0 from (basis-transformed) standard version -## to our version. -## -## On encrypt, -## xor with 0x63 -## multiply by circulant 0,1,1,1 -## apply shiftrows transform -## -## On decrypt, -## xor with 0x63 -## multiply by "inverse mixcolumns" circulant E,B,D,9 -## deskew -## apply shiftrows transform -## -## -## Writes out to (%rdx), and increments or decrements it -## Keeps track of round number mod 4 in %r8 -## Preserves xmm0 -## Clobbers xmm1-xmm5 -## - -.align 4 -_vpaes_schedule_mangle: - mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later - // vmovdqa .Lk_mc_forward(%rip),%xmm5 - cbnz w3, Lschedule_mangle_dec - - // encrypting - eor v4.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm4 - add x2, x2, #16 // add $16, %rdx - tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4 - tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1 - tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3 - eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4 - ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 - eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3 - - b Lschedule_mangle_both -.align 4 -Lschedule_mangle_dec: - // inverse mix columns - // lea .Lk_dksd(%rip),%r11 - ushr v1.16b, v4.16b, #4 // vpsrlb $4, %xmm4, %xmm1 # 1 = hi - and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo - - // vmovdqa 0x00(%r11), %xmm2 - tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 - // vmovdqa 0x10(%r11), %xmm3 - 
tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 - eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 - tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 - - // vmovdqa 0x20(%r11), %xmm2 - tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 - eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 - // vmovdqa 0x30(%r11), %xmm3 - tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 - eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 - tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 - - // vmovdqa 0x40(%r11), %xmm2 - tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 - eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 - // vmovdqa 0x50(%r11), %xmm3 - tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 - eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 - - // vmovdqa 0x60(%r11), %xmm2 - tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 - tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 - // vmovdqa 0x70(%r11), %xmm4 - tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4 - ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 - eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 - eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3 - - sub x2, x2, #16 // add $-16, %rdx - -Lschedule_mangle_both: - tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 - add x8, x8, #48 // add $-16, %r8 - and x8, x8, #~(1<<6) // and $0x30, %r8 - st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx) - ret - - -.globl _vpaes_set_encrypt_key -.private_extern _vpaes_set_encrypt_key - -.align 4 -_vpaes_set_encrypt_key: - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - stp d8,d9,[sp,#-16]! 
// ABI spec says so - - lsr w9, w1, #5 // shr $5,%eax - add w9, w9, #5 // $5,%eax - str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; - - mov w3, #0 // mov $0,%ecx - mov x8, #0x30 // mov $0x30,%r8d - bl _vpaes_schedule_core - eor x0, x0, x0 - - ldp d8,d9,[sp],#16 - ldp x29,x30,[sp],#16 - AARCH64_VALIDATE_LINK_REGISTER - ret - - -.globl _vpaes_set_decrypt_key -.private_extern _vpaes_set_decrypt_key - -.align 4 -_vpaes_set_decrypt_key: - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - stp d8,d9,[sp,#-16]! // ABI spec says so - - lsr w9, w1, #5 // shr $5,%eax - add w9, w9, #5 // $5,%eax - str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; - lsl w9, w9, #4 // shl $4,%eax - add x2, x2, #16 // lea 16(%rdx,%rax),%rdx - add x2, x2, x9 - - mov w3, #1 // mov $1,%ecx - lsr w8, w1, #1 // shr $1,%r8d - and x8, x8, #32 // and $32,%r8d - eor x8, x8, #32 // xor $32,%r8d # nbits==192?0:32 - bl _vpaes_schedule_core - - ldp d8,d9,[sp],#16 - ldp x29,x30,[sp],#16 - AARCH64_VALIDATE_LINK_REGISTER - ret - -.globl _vpaes_cbc_encrypt -.private_extern _vpaes_cbc_encrypt - -.align 4 -_vpaes_cbc_encrypt: - AARCH64_SIGN_LINK_REGISTER - cbz x2, Lcbc_abort - cmp w5, #0 // check direction - b.eq vpaes_cbc_decrypt - - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - - mov x17, x2 // reassign - mov x2, x3 // reassign - - ld1 {v0.16b}, [x4] // load ivec - bl _vpaes_encrypt_preheat - b Lcbc_enc_loop - -.align 4 -Lcbc_enc_loop: - ld1 {v7.16b}, [x0],#16 // load input - eor v7.16b, v7.16b, v0.16b // xor with ivec - bl _vpaes_encrypt_core - st1 {v0.16b}, [x1],#16 // save output - subs x17, x17, #16 - b.hi Lcbc_enc_loop - - st1 {v0.16b}, [x4] // write ivec - - ldp x29,x30,[sp],#16 -Lcbc_abort: - AARCH64_VALIDATE_LINK_REGISTER - ret - - - -.align 4 -vpaes_cbc_decrypt: - // Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to - // only from vpaes_cbc_encrypt which has already signed the return address. 
- stp x29,x30,[sp,#-16]! - add x29,sp,#0 - stp d8,d9,[sp,#-16]! // ABI spec says so - stp d10,d11,[sp,#-16]! - stp d12,d13,[sp,#-16]! - stp d14,d15,[sp,#-16]! - - mov x17, x2 // reassign - mov x2, x3 // reassign - ld1 {v6.16b}, [x4] // load ivec - bl _vpaes_decrypt_preheat - tst x17, #16 - b.eq Lcbc_dec_loop2x - - ld1 {v7.16b}, [x0], #16 // load input - bl _vpaes_decrypt_core - eor v0.16b, v0.16b, v6.16b // xor with ivec - orr v6.16b, v7.16b, v7.16b // next ivec value - st1 {v0.16b}, [x1], #16 - subs x17, x17, #16 - b.ls Lcbc_dec_done - -.align 4 -Lcbc_dec_loop2x: - ld1 {v14.16b,v15.16b}, [x0], #32 - bl _vpaes_decrypt_2x - eor v0.16b, v0.16b, v6.16b // xor with ivec - eor v1.16b, v1.16b, v14.16b - orr v6.16b, v15.16b, v15.16b - st1 {v0.16b,v1.16b}, [x1], #32 - subs x17, x17, #32 - b.hi Lcbc_dec_loop2x - -Lcbc_dec_done: - st1 {v6.16b}, [x4] - - ldp d14,d15,[sp],#16 - ldp d12,d13,[sp],#16 - ldp d10,d11,[sp],#16 - ldp d8,d9,[sp],#16 - ldp x29,x30,[sp],#16 - AARCH64_VALIDATE_LINK_REGISTER - ret - -.globl _vpaes_ctr32_encrypt_blocks -.private_extern _vpaes_ctr32_encrypt_blocks - -.align 4 -_vpaes_ctr32_encrypt_blocks: - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - stp d8,d9,[sp,#-16]! // ABI spec says so - stp d10,d11,[sp,#-16]! - stp d12,d13,[sp,#-16]! - stp d14,d15,[sp,#-16]! - - cbz x2, Lctr32_done - - // Note, unlike the other functions, x2 here is measured in blocks, - // not bytes. - mov x17, x2 - mov x2, x3 - - // Load the IV and counter portion. - ldr w6, [x4, #12] - ld1 {v7.16b}, [x4] - - bl _vpaes_encrypt_preheat - tst x17, #1 - rev w6, w6 // The counter is big-endian. - b.eq Lctr32_prep_loop - - // Handle one block so the remaining block count is even for - // _vpaes_encrypt_2x. - ld1 {v6.16b}, [x0], #16 // Load input ahead of time - bl _vpaes_encrypt_core - eor v0.16b, v0.16b, v6.16b // XOR input and result - st1 {v0.16b}, [x1], #16 - subs x17, x17, #1 - // Update the counter. 
- add w6, w6, #1 - rev w7, w6 - mov v7.s[3], w7 - b.ls Lctr32_done - -Lctr32_prep_loop: - // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x - // uses v14 and v15. - mov v15.16b, v7.16b - mov v14.16b, v7.16b - add w6, w6, #1 - rev w7, w6 - mov v15.s[3], w7 - -Lctr32_loop: - ld1 {v6.16b,v7.16b}, [x0], #32 // Load input ahead of time - bl _vpaes_encrypt_2x - eor v0.16b, v0.16b, v6.16b // XOR input and result - eor v1.16b, v1.16b, v7.16b // XOR input and result (#2) - st1 {v0.16b,v1.16b}, [x1], #32 - subs x17, x17, #2 - // Update the counter. - add w7, w6, #1 - add w6, w6, #2 - rev w7, w7 - mov v14.s[3], w7 - rev w7, w6 - mov v15.s[3], w7 - b.hi Lctr32_loop - -Lctr32_done: - ldp d14,d15,[sp],#16 - ldp d12,d13,[sp],#16 - ldp d10,d11,[sp],#16 - ldp d8,d9,[sp],#16 - ldp x29,x30,[sp],#16 - AARCH64_VALIDATE_LINK_REGISTER - ret - -#endif // !OPENSSL_NO_ASM diff --git a/third_party/boringssl/apple-aarch64/crypto/test/trampoline-armv8.S b/third_party/boringssl/apple-aarch64/crypto/test/trampoline-armv8.S deleted file mode 100644 index 325da9b1..00000000 --- a/third_party/boringssl/apple-aarch64/crypto/test/trampoline-armv8.S +++ /dev/null @@ -1,758 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -#include - -.text - -// abi_test_trampoline loads callee-saved registers from |state|, calls |func| -// with |argv|, then saves the callee-saved registers into |state|. It returns -// the result of |func|. The |unwind| argument is unused. 
-// uint64_t abi_test_trampoline(void (*func)(...), CallerState *state, -// const uint64_t *argv, size_t argc, -// uint64_t unwind); - -.globl _abi_test_trampoline -.private_extern _abi_test_trampoline -.align 4 -_abi_test_trampoline: -Labi_test_trampoline_begin: - AARCH64_SIGN_LINK_REGISTER - // Stack layout (low to high addresses) - // x29,x30 (16 bytes) - // d8-d15 (64 bytes) - // x19-x28 (80 bytes) - // x1 (8 bytes) - // padding (8 bytes) - stp x29, x30, [sp, #-176]! - mov x29, sp - - // Saved callee-saved registers and |state|. - stp d8, d9, [sp, #16] - stp d10, d11, [sp, #32] - stp d12, d13, [sp, #48] - stp d14, d15, [sp, #64] - stp x19, x20, [sp, #80] - stp x21, x22, [sp, #96] - stp x23, x24, [sp, #112] - stp x25, x26, [sp, #128] - stp x27, x28, [sp, #144] - str x1, [sp, #160] - - // Load registers from |state|, with the exception of x29. x29 is the - // frame pointer and also callee-saved, but AAPCS64 allows platforms to - // mandate that x29 always point to a frame. iOS64 does so, which means - // we cannot fill x29 with entropy without violating ABI rules - // ourselves. x29 is tested separately below. - ldp d8, d9, [x1], #16 - ldp d10, d11, [x1], #16 - ldp d12, d13, [x1], #16 - ldp d14, d15, [x1], #16 - ldp x19, x20, [x1], #16 - ldp x21, x22, [x1], #16 - ldp x23, x24, [x1], #16 - ldp x25, x26, [x1], #16 - ldp x27, x28, [x1], #16 - - // Move parameters into temporary registers. - mov x9, x0 - mov x10, x2 - mov x11, x3 - - // Load parameters into registers. 
- cbz x11, Largs_done - ldr x0, [x10], #8 - subs x11, x11, #1 - b.eq Largs_done - ldr x1, [x10], #8 - subs x11, x11, #1 - b.eq Largs_done - ldr x2, [x10], #8 - subs x11, x11, #1 - b.eq Largs_done - ldr x3, [x10], #8 - subs x11, x11, #1 - b.eq Largs_done - ldr x4, [x10], #8 - subs x11, x11, #1 - b.eq Largs_done - ldr x5, [x10], #8 - subs x11, x11, #1 - b.eq Largs_done - ldr x6, [x10], #8 - subs x11, x11, #1 - b.eq Largs_done - ldr x7, [x10], #8 - -Largs_done: - blr x9 - - // Reload |state| and store registers. - ldr x1, [sp, #160] - stp d8, d9, [x1], #16 - stp d10, d11, [x1], #16 - stp d12, d13, [x1], #16 - stp d14, d15, [x1], #16 - stp x19, x20, [x1], #16 - stp x21, x22, [x1], #16 - stp x23, x24, [x1], #16 - stp x25, x26, [x1], #16 - stp x27, x28, [x1], #16 - - // |func| is required to preserve x29, the frame pointer. We cannot load - // random values into x29 (see comment above), so compare it against the - // expected value and zero the field of |state| if corrupted. - mov x9, sp - cmp x29, x9 - b.eq Lx29_ok - str xzr, [x1] - -Lx29_ok: - // Restore callee-saved registers. 
- ldp d8, d9, [sp, #16] - ldp d10, d11, [sp, #32] - ldp d12, d13, [sp, #48] - ldp d14, d15, [sp, #64] - ldp x19, x20, [sp, #80] - ldp x21, x22, [sp, #96] - ldp x23, x24, [sp, #112] - ldp x25, x26, [sp, #128] - ldp x27, x28, [sp, #144] - - ldp x29, x30, [sp], #176 - AARCH64_VALIDATE_LINK_REGISTER - ret - - -.globl _abi_test_clobber_x0 -.private_extern _abi_test_clobber_x0 -.align 4 -_abi_test_clobber_x0: - AARCH64_VALID_CALL_TARGET - mov x0, xzr - ret - - -.globl _abi_test_clobber_x1 -.private_extern _abi_test_clobber_x1 -.align 4 -_abi_test_clobber_x1: - AARCH64_VALID_CALL_TARGET - mov x1, xzr - ret - - -.globl _abi_test_clobber_x2 -.private_extern _abi_test_clobber_x2 -.align 4 -_abi_test_clobber_x2: - AARCH64_VALID_CALL_TARGET - mov x2, xzr - ret - - -.globl _abi_test_clobber_x3 -.private_extern _abi_test_clobber_x3 -.align 4 -_abi_test_clobber_x3: - AARCH64_VALID_CALL_TARGET - mov x3, xzr - ret - - -.globl _abi_test_clobber_x4 -.private_extern _abi_test_clobber_x4 -.align 4 -_abi_test_clobber_x4: - AARCH64_VALID_CALL_TARGET - mov x4, xzr - ret - - -.globl _abi_test_clobber_x5 -.private_extern _abi_test_clobber_x5 -.align 4 -_abi_test_clobber_x5: - AARCH64_VALID_CALL_TARGET - mov x5, xzr - ret - - -.globl _abi_test_clobber_x6 -.private_extern _abi_test_clobber_x6 -.align 4 -_abi_test_clobber_x6: - AARCH64_VALID_CALL_TARGET - mov x6, xzr - ret - - -.globl _abi_test_clobber_x7 -.private_extern _abi_test_clobber_x7 -.align 4 -_abi_test_clobber_x7: - AARCH64_VALID_CALL_TARGET - mov x7, xzr - ret - - -.globl _abi_test_clobber_x8 -.private_extern _abi_test_clobber_x8 -.align 4 -_abi_test_clobber_x8: - AARCH64_VALID_CALL_TARGET - mov x8, xzr - ret - - -.globl _abi_test_clobber_x9 -.private_extern _abi_test_clobber_x9 -.align 4 -_abi_test_clobber_x9: - AARCH64_VALID_CALL_TARGET - mov x9, xzr - ret - - -.globl _abi_test_clobber_x10 -.private_extern _abi_test_clobber_x10 -.align 4 -_abi_test_clobber_x10: - AARCH64_VALID_CALL_TARGET - mov x10, xzr - ret - - -.globl 
_abi_test_clobber_x11 -.private_extern _abi_test_clobber_x11 -.align 4 -_abi_test_clobber_x11: - AARCH64_VALID_CALL_TARGET - mov x11, xzr - ret - - -.globl _abi_test_clobber_x12 -.private_extern _abi_test_clobber_x12 -.align 4 -_abi_test_clobber_x12: - AARCH64_VALID_CALL_TARGET - mov x12, xzr - ret - - -.globl _abi_test_clobber_x13 -.private_extern _abi_test_clobber_x13 -.align 4 -_abi_test_clobber_x13: - AARCH64_VALID_CALL_TARGET - mov x13, xzr - ret - - -.globl _abi_test_clobber_x14 -.private_extern _abi_test_clobber_x14 -.align 4 -_abi_test_clobber_x14: - AARCH64_VALID_CALL_TARGET - mov x14, xzr - ret - - -.globl _abi_test_clobber_x15 -.private_extern _abi_test_clobber_x15 -.align 4 -_abi_test_clobber_x15: - AARCH64_VALID_CALL_TARGET - mov x15, xzr - ret - - -.globl _abi_test_clobber_x16 -.private_extern _abi_test_clobber_x16 -.align 4 -_abi_test_clobber_x16: - AARCH64_VALID_CALL_TARGET - mov x16, xzr - ret - - -.globl _abi_test_clobber_x17 -.private_extern _abi_test_clobber_x17 -.align 4 -_abi_test_clobber_x17: - AARCH64_VALID_CALL_TARGET - mov x17, xzr - ret - - -.globl _abi_test_clobber_x19 -.private_extern _abi_test_clobber_x19 -.align 4 -_abi_test_clobber_x19: - AARCH64_VALID_CALL_TARGET - mov x19, xzr - ret - - -.globl _abi_test_clobber_x20 -.private_extern _abi_test_clobber_x20 -.align 4 -_abi_test_clobber_x20: - AARCH64_VALID_CALL_TARGET - mov x20, xzr - ret - - -.globl _abi_test_clobber_x21 -.private_extern _abi_test_clobber_x21 -.align 4 -_abi_test_clobber_x21: - AARCH64_VALID_CALL_TARGET - mov x21, xzr - ret - - -.globl _abi_test_clobber_x22 -.private_extern _abi_test_clobber_x22 -.align 4 -_abi_test_clobber_x22: - AARCH64_VALID_CALL_TARGET - mov x22, xzr - ret - - -.globl _abi_test_clobber_x23 -.private_extern _abi_test_clobber_x23 -.align 4 -_abi_test_clobber_x23: - AARCH64_VALID_CALL_TARGET - mov x23, xzr - ret - - -.globl _abi_test_clobber_x24 -.private_extern _abi_test_clobber_x24 -.align 4 -_abi_test_clobber_x24: - AARCH64_VALID_CALL_TARGET - 
mov x24, xzr - ret - - -.globl _abi_test_clobber_x25 -.private_extern _abi_test_clobber_x25 -.align 4 -_abi_test_clobber_x25: - AARCH64_VALID_CALL_TARGET - mov x25, xzr - ret - - -.globl _abi_test_clobber_x26 -.private_extern _abi_test_clobber_x26 -.align 4 -_abi_test_clobber_x26: - AARCH64_VALID_CALL_TARGET - mov x26, xzr - ret - - -.globl _abi_test_clobber_x27 -.private_extern _abi_test_clobber_x27 -.align 4 -_abi_test_clobber_x27: - AARCH64_VALID_CALL_TARGET - mov x27, xzr - ret - - -.globl _abi_test_clobber_x28 -.private_extern _abi_test_clobber_x28 -.align 4 -_abi_test_clobber_x28: - AARCH64_VALID_CALL_TARGET - mov x28, xzr - ret - - -.globl _abi_test_clobber_x29 -.private_extern _abi_test_clobber_x29 -.align 4 -_abi_test_clobber_x29: - AARCH64_VALID_CALL_TARGET - mov x29, xzr - ret - - -.globl _abi_test_clobber_d0 -.private_extern _abi_test_clobber_d0 -.align 4 -_abi_test_clobber_d0: - AARCH64_VALID_CALL_TARGET - fmov d0, xzr - ret - - -.globl _abi_test_clobber_d1 -.private_extern _abi_test_clobber_d1 -.align 4 -_abi_test_clobber_d1: - AARCH64_VALID_CALL_TARGET - fmov d1, xzr - ret - - -.globl _abi_test_clobber_d2 -.private_extern _abi_test_clobber_d2 -.align 4 -_abi_test_clobber_d2: - AARCH64_VALID_CALL_TARGET - fmov d2, xzr - ret - - -.globl _abi_test_clobber_d3 -.private_extern _abi_test_clobber_d3 -.align 4 -_abi_test_clobber_d3: - AARCH64_VALID_CALL_TARGET - fmov d3, xzr - ret - - -.globl _abi_test_clobber_d4 -.private_extern _abi_test_clobber_d4 -.align 4 -_abi_test_clobber_d4: - AARCH64_VALID_CALL_TARGET - fmov d4, xzr - ret - - -.globl _abi_test_clobber_d5 -.private_extern _abi_test_clobber_d5 -.align 4 -_abi_test_clobber_d5: - AARCH64_VALID_CALL_TARGET - fmov d5, xzr - ret - - -.globl _abi_test_clobber_d6 -.private_extern _abi_test_clobber_d6 -.align 4 -_abi_test_clobber_d6: - AARCH64_VALID_CALL_TARGET - fmov d6, xzr - ret - - -.globl _abi_test_clobber_d7 -.private_extern _abi_test_clobber_d7 -.align 4 -_abi_test_clobber_d7: - 
AARCH64_VALID_CALL_TARGET - fmov d7, xzr - ret - - -.globl _abi_test_clobber_d8 -.private_extern _abi_test_clobber_d8 -.align 4 -_abi_test_clobber_d8: - AARCH64_VALID_CALL_TARGET - fmov d8, xzr - ret - - -.globl _abi_test_clobber_d9 -.private_extern _abi_test_clobber_d9 -.align 4 -_abi_test_clobber_d9: - AARCH64_VALID_CALL_TARGET - fmov d9, xzr - ret - - -.globl _abi_test_clobber_d10 -.private_extern _abi_test_clobber_d10 -.align 4 -_abi_test_clobber_d10: - AARCH64_VALID_CALL_TARGET - fmov d10, xzr - ret - - -.globl _abi_test_clobber_d11 -.private_extern _abi_test_clobber_d11 -.align 4 -_abi_test_clobber_d11: - AARCH64_VALID_CALL_TARGET - fmov d11, xzr - ret - - -.globl _abi_test_clobber_d12 -.private_extern _abi_test_clobber_d12 -.align 4 -_abi_test_clobber_d12: - AARCH64_VALID_CALL_TARGET - fmov d12, xzr - ret - - -.globl _abi_test_clobber_d13 -.private_extern _abi_test_clobber_d13 -.align 4 -_abi_test_clobber_d13: - AARCH64_VALID_CALL_TARGET - fmov d13, xzr - ret - - -.globl _abi_test_clobber_d14 -.private_extern _abi_test_clobber_d14 -.align 4 -_abi_test_clobber_d14: - AARCH64_VALID_CALL_TARGET - fmov d14, xzr - ret - - -.globl _abi_test_clobber_d15 -.private_extern _abi_test_clobber_d15 -.align 4 -_abi_test_clobber_d15: - AARCH64_VALID_CALL_TARGET - fmov d15, xzr - ret - - -.globl _abi_test_clobber_d16 -.private_extern _abi_test_clobber_d16 -.align 4 -_abi_test_clobber_d16: - AARCH64_VALID_CALL_TARGET - fmov d16, xzr - ret - - -.globl _abi_test_clobber_d17 -.private_extern _abi_test_clobber_d17 -.align 4 -_abi_test_clobber_d17: - AARCH64_VALID_CALL_TARGET - fmov d17, xzr - ret - - -.globl _abi_test_clobber_d18 -.private_extern _abi_test_clobber_d18 -.align 4 -_abi_test_clobber_d18: - AARCH64_VALID_CALL_TARGET - fmov d18, xzr - ret - - -.globl _abi_test_clobber_d19 -.private_extern _abi_test_clobber_d19 -.align 4 -_abi_test_clobber_d19: - AARCH64_VALID_CALL_TARGET - fmov d19, xzr - ret - - -.globl _abi_test_clobber_d20 -.private_extern _abi_test_clobber_d20 
-.align 4 -_abi_test_clobber_d20: - AARCH64_VALID_CALL_TARGET - fmov d20, xzr - ret - - -.globl _abi_test_clobber_d21 -.private_extern _abi_test_clobber_d21 -.align 4 -_abi_test_clobber_d21: - AARCH64_VALID_CALL_TARGET - fmov d21, xzr - ret - - -.globl _abi_test_clobber_d22 -.private_extern _abi_test_clobber_d22 -.align 4 -_abi_test_clobber_d22: - AARCH64_VALID_CALL_TARGET - fmov d22, xzr - ret - - -.globl _abi_test_clobber_d23 -.private_extern _abi_test_clobber_d23 -.align 4 -_abi_test_clobber_d23: - AARCH64_VALID_CALL_TARGET - fmov d23, xzr - ret - - -.globl _abi_test_clobber_d24 -.private_extern _abi_test_clobber_d24 -.align 4 -_abi_test_clobber_d24: - AARCH64_VALID_CALL_TARGET - fmov d24, xzr - ret - - -.globl _abi_test_clobber_d25 -.private_extern _abi_test_clobber_d25 -.align 4 -_abi_test_clobber_d25: - AARCH64_VALID_CALL_TARGET - fmov d25, xzr - ret - - -.globl _abi_test_clobber_d26 -.private_extern _abi_test_clobber_d26 -.align 4 -_abi_test_clobber_d26: - AARCH64_VALID_CALL_TARGET - fmov d26, xzr - ret - - -.globl _abi_test_clobber_d27 -.private_extern _abi_test_clobber_d27 -.align 4 -_abi_test_clobber_d27: - AARCH64_VALID_CALL_TARGET - fmov d27, xzr - ret - - -.globl _abi_test_clobber_d28 -.private_extern _abi_test_clobber_d28 -.align 4 -_abi_test_clobber_d28: - AARCH64_VALID_CALL_TARGET - fmov d28, xzr - ret - - -.globl _abi_test_clobber_d29 -.private_extern _abi_test_clobber_d29 -.align 4 -_abi_test_clobber_d29: - AARCH64_VALID_CALL_TARGET - fmov d29, xzr - ret - - -.globl _abi_test_clobber_d30 -.private_extern _abi_test_clobber_d30 -.align 4 -_abi_test_clobber_d30: - AARCH64_VALID_CALL_TARGET - fmov d30, xzr - ret - - -.globl _abi_test_clobber_d31 -.private_extern _abi_test_clobber_d31 -.align 4 -_abi_test_clobber_d31: - AARCH64_VALID_CALL_TARGET - fmov d31, xzr - ret - - -.globl _abi_test_clobber_v8_upper -.private_extern _abi_test_clobber_v8_upper -.align 4 -_abi_test_clobber_v8_upper: - AARCH64_VALID_CALL_TARGET - fmov v8.d[1], xzr - ret - - -.globl 
_abi_test_clobber_v9_upper -.private_extern _abi_test_clobber_v9_upper -.align 4 -_abi_test_clobber_v9_upper: - AARCH64_VALID_CALL_TARGET - fmov v9.d[1], xzr - ret - - -.globl _abi_test_clobber_v10_upper -.private_extern _abi_test_clobber_v10_upper -.align 4 -_abi_test_clobber_v10_upper: - AARCH64_VALID_CALL_TARGET - fmov v10.d[1], xzr - ret - - -.globl _abi_test_clobber_v11_upper -.private_extern _abi_test_clobber_v11_upper -.align 4 -_abi_test_clobber_v11_upper: - AARCH64_VALID_CALL_TARGET - fmov v11.d[1], xzr - ret - - -.globl _abi_test_clobber_v12_upper -.private_extern _abi_test_clobber_v12_upper -.align 4 -_abi_test_clobber_v12_upper: - AARCH64_VALID_CALL_TARGET - fmov v12.d[1], xzr - ret - - -.globl _abi_test_clobber_v13_upper -.private_extern _abi_test_clobber_v13_upper -.align 4 -_abi_test_clobber_v13_upper: - AARCH64_VALID_CALL_TARGET - fmov v13.d[1], xzr - ret - - -.globl _abi_test_clobber_v14_upper -.private_extern _abi_test_clobber_v14_upper -.align 4 -_abi_test_clobber_v14_upper: - AARCH64_VALID_CALL_TARGET - fmov v14.d[1], xzr - ret - - -.globl _abi_test_clobber_v15_upper -.private_extern _abi_test_clobber_v15_upper -.align 4 -_abi_test_clobber_v15_upper: - AARCH64_VALID_CALL_TARGET - fmov v15.d[1], xzr - ret - -#endif // !OPENSSL_NO_ASM diff --git a/third_party/boringssl/apple-arm/crypto/chacha/chacha-armv4.S b/third_party/boringssl/apple-arm/crypto/chacha/chacha-armv4.S deleted file mode 100644 index cadf2b62..00000000 --- a/third_party/boringssl/apple-arm/crypto/chacha/chacha-armv4.S +++ /dev/null @@ -1,1498 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -#include - -@ Silence ARMv8 deprecated IT instruction warnings. 
This file is used by both -@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. - - -.text -#if defined(__thumb2__) || defined(__clang__) -.syntax unified -#endif -#if defined(__thumb2__) -.thumb -#else -.code 32 -#endif - -#if defined(__thumb2__) || defined(__clang__) -#define ldrhsb ldrbhs -#endif - -.align 5 -Lsigma: -.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral -Lone: -.long 1,0,0,0 -#if __ARM_MAX_ARCH__>=7 -LOPENSSL_armcap: -.word OPENSSL_armcap_P-LChaCha20_ctr32 -#else -.word -1 -#endif - -.globl _ChaCha20_ctr32 -.private_extern _ChaCha20_ctr32 -#ifdef __thumb2__ -.thumb_func _ChaCha20_ctr32 -#endif -.align 5 -_ChaCha20_ctr32: -LChaCha20_ctr32: - ldr r12,[sp,#0] @ pull pointer to counter and nonce - stmdb sp!,{r0,r1,r2,r4-r11,lr} -#if __ARM_ARCH__<7 && !defined(__thumb2__) - sub r14,pc,#16 @ _ChaCha20_ctr32 -#else - adr r14,LChaCha20_ctr32 -#endif - cmp r2,#0 @ len==0? -#ifdef __thumb2__ - itt eq -#endif - addeq sp,sp,#4*3 - beq Lno_data -#if __ARM_MAX_ARCH__>=7 - cmp r2,#192 @ test len - bls Lshort - ldr r4,[r14,#-32] - ldr r4,[r14,r4] -# ifdef __APPLE__ - ldr r4,[r4] -# endif - tst r4,#ARMV7_NEON - bne LChaCha20_neon -Lshort: -#endif - ldmia r12,{r4,r5,r6,r7} @ load counter and nonce - sub sp,sp,#4*(16) @ off-load area - sub r14,r14,#64 @ Lsigma - stmdb sp!,{r4,r5,r6,r7} @ copy counter and nonce - ldmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} @ load key - ldmia r14,{r0,r1,r2,r3} @ load sigma - stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11} @ copy key - stmdb sp!,{r0,r1,r2,r3} @ copy sigma - str r10,[sp,#4*(16+10)] @ off-load "rx" - str r11,[sp,#4*(16+11)] @ off-load "rx" - b Loop_outer_enter - -.align 4 -Loop_outer: - ldmia sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} @ load key material - str r11,[sp,#4*(32+2)] @ save len - str r12, [sp,#4*(32+1)] @ save inp - str r14, [sp,#4*(32+0)] @ save out -Loop_outer_enter: - ldr r11, [sp,#4*(15)] - ldr r12,[sp,#4*(12)] @ modulo-scheduled load - ldr r10, [sp,#4*(13)] - ldr r14,[sp,#4*(14)] - str r11, 
[sp,#4*(16+15)] - mov r11,#10 - b Loop - -.align 4 -Loop: - subs r11,r11,#1 - add r0,r0,r4 - mov r12,r12,ror#16 - add r1,r1,r5 - mov r10,r10,ror#16 - eor r12,r12,r0,ror#16 - eor r10,r10,r1,ror#16 - add r8,r8,r12 - mov r4,r4,ror#20 - add r9,r9,r10 - mov r5,r5,ror#20 - eor r4,r4,r8,ror#20 - eor r5,r5,r9,ror#20 - add r0,r0,r4 - mov r12,r12,ror#24 - add r1,r1,r5 - mov r10,r10,ror#24 - eor r12,r12,r0,ror#24 - eor r10,r10,r1,ror#24 - add r8,r8,r12 - mov r4,r4,ror#25 - add r9,r9,r10 - mov r5,r5,ror#25 - str r10,[sp,#4*(16+13)] - ldr r10,[sp,#4*(16+15)] - eor r4,r4,r8,ror#25 - eor r5,r5,r9,ror#25 - str r8,[sp,#4*(16+8)] - ldr r8,[sp,#4*(16+10)] - add r2,r2,r6 - mov r14,r14,ror#16 - str r9,[sp,#4*(16+9)] - ldr r9,[sp,#4*(16+11)] - add r3,r3,r7 - mov r10,r10,ror#16 - eor r14,r14,r2,ror#16 - eor r10,r10,r3,ror#16 - add r8,r8,r14 - mov r6,r6,ror#20 - add r9,r9,r10 - mov r7,r7,ror#20 - eor r6,r6,r8,ror#20 - eor r7,r7,r9,ror#20 - add r2,r2,r6 - mov r14,r14,ror#24 - add r3,r3,r7 - mov r10,r10,ror#24 - eor r14,r14,r2,ror#24 - eor r10,r10,r3,ror#24 - add r8,r8,r14 - mov r6,r6,ror#25 - add r9,r9,r10 - mov r7,r7,ror#25 - eor r6,r6,r8,ror#25 - eor r7,r7,r9,ror#25 - add r0,r0,r5 - mov r10,r10,ror#16 - add r1,r1,r6 - mov r12,r12,ror#16 - eor r10,r10,r0,ror#16 - eor r12,r12,r1,ror#16 - add r8,r8,r10 - mov r5,r5,ror#20 - add r9,r9,r12 - mov r6,r6,ror#20 - eor r5,r5,r8,ror#20 - eor r6,r6,r9,ror#20 - add r0,r0,r5 - mov r10,r10,ror#24 - add r1,r1,r6 - mov r12,r12,ror#24 - eor r10,r10,r0,ror#24 - eor r12,r12,r1,ror#24 - add r8,r8,r10 - mov r5,r5,ror#25 - str r10,[sp,#4*(16+15)] - ldr r10,[sp,#4*(16+13)] - add r9,r9,r12 - mov r6,r6,ror#25 - eor r5,r5,r8,ror#25 - eor r6,r6,r9,ror#25 - str r8,[sp,#4*(16+10)] - ldr r8,[sp,#4*(16+8)] - add r2,r2,r7 - mov r10,r10,ror#16 - str r9,[sp,#4*(16+11)] - ldr r9,[sp,#4*(16+9)] - add r3,r3,r4 - mov r14,r14,ror#16 - eor r10,r10,r2,ror#16 - eor r14,r14,r3,ror#16 - add r8,r8,r10 - mov r7,r7,ror#20 - add r9,r9,r14 - mov r4,r4,ror#20 - eor r7,r7,r8,ror#20 - eor 
r4,r4,r9,ror#20 - add r2,r2,r7 - mov r10,r10,ror#24 - add r3,r3,r4 - mov r14,r14,ror#24 - eor r10,r10,r2,ror#24 - eor r14,r14,r3,ror#24 - add r8,r8,r10 - mov r7,r7,ror#25 - add r9,r9,r14 - mov r4,r4,ror#25 - eor r7,r7,r8,ror#25 - eor r4,r4,r9,ror#25 - bne Loop - - ldr r11,[sp,#4*(32+2)] @ load len - - str r8, [sp,#4*(16+8)] @ modulo-scheduled store - str r9, [sp,#4*(16+9)] - str r12,[sp,#4*(16+12)] - str r10, [sp,#4*(16+13)] - str r14,[sp,#4*(16+14)] - - @ at this point we have first half of 512-bit result in - @ rx and second half at sp+4*(16+8) - - cmp r11,#64 @ done yet? -#ifdef __thumb2__ - itete lo -#endif - addlo r12,sp,#4*(0) @ shortcut or ... - ldrhs r12,[sp,#4*(32+1)] @ ... load inp - addlo r14,sp,#4*(0) @ shortcut or ... - ldrhs r14,[sp,#4*(32+0)] @ ... load out - - ldr r8,[sp,#4*(0)] @ load key material - ldr r9,[sp,#4*(1)] - -#if __ARM_ARCH__>=6 || !defined(__ARMEB__) -# if __ARM_ARCH__<7 - orr r10,r12,r14 - tst r10,#3 @ are input and output aligned? - ldr r10,[sp,#4*(2)] - bne Lunaligned - cmp r11,#64 @ restore flags -# else - ldr r10,[sp,#4*(2)] -# endif - ldr r11,[sp,#4*(3)] - - add r0,r0,r8 @ accumulate key material - add r1,r1,r9 -# ifdef __thumb2__ - itt hs -# endif - ldrhs r8,[r12],#16 @ load input - ldrhs r9,[r12,#-12] - - add r2,r2,r10 - add r3,r3,r11 -# ifdef __thumb2__ - itt hs -# endif - ldrhs r10,[r12,#-8] - ldrhs r11,[r12,#-4] -# if __ARM_ARCH__>=6 && defined(__ARMEB__) - rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 -# endif -# ifdef __thumb2__ - itt hs -# endif - eorhs r0,r0,r8 @ xor with input - eorhs r1,r1,r9 - add r8,sp,#4*(4) - str r0,[r14],#16 @ store output -# ifdef __thumb2__ - itt hs -# endif - eorhs r2,r2,r10 - eorhs r3,r3,r11 - ldmia r8,{r8,r9,r10,r11} @ load key material - str r1,[r14,#-12] - str r2,[r14,#-8] - str r3,[r14,#-4] - - add r4,r4,r8 @ accumulate key material - add r5,r5,r9 -# ifdef __thumb2__ - itt hs -# endif - ldrhs r8,[r12],#16 @ load input - ldrhs r9,[r12,#-12] - add r6,r6,r10 - add r7,r7,r11 -# ifdef 
__thumb2__ - itt hs -# endif - ldrhs r10,[r12,#-8] - ldrhs r11,[r12,#-4] -# if __ARM_ARCH__>=6 && defined(__ARMEB__) - rev r4,r4 - rev r5,r5 - rev r6,r6 - rev r7,r7 -# endif -# ifdef __thumb2__ - itt hs -# endif - eorhs r4,r4,r8 - eorhs r5,r5,r9 - add r8,sp,#4*(8) - str r4,[r14],#16 @ store output -# ifdef __thumb2__ - itt hs -# endif - eorhs r6,r6,r10 - eorhs r7,r7,r11 - str r5,[r14,#-12] - ldmia r8,{r8,r9,r10,r11} @ load key material - str r6,[r14,#-8] - add r0,sp,#4*(16+8) - str r7,[r14,#-4] - - ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half - - add r0,r0,r8 @ accumulate key material - add r1,r1,r9 -# ifdef __thumb2__ - itt hs -# endif - ldrhs r8,[r12],#16 @ load input - ldrhs r9,[r12,#-12] -# ifdef __thumb2__ - itt hi -# endif - strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it - strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it - add r2,r2,r10 - add r3,r3,r11 -# ifdef __thumb2__ - itt hs -# endif - ldrhs r10,[r12,#-8] - ldrhs r11,[r12,#-4] -# if __ARM_ARCH__>=6 && defined(__ARMEB__) - rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 -# endif -# ifdef __thumb2__ - itt hs -# endif - eorhs r0,r0,r8 - eorhs r1,r1,r9 - add r8,sp,#4*(12) - str r0,[r14],#16 @ store output -# ifdef __thumb2__ - itt hs -# endif - eorhs r2,r2,r10 - eorhs r3,r3,r11 - str r1,[r14,#-12] - ldmia r8,{r8,r9,r10,r11} @ load key material - str r2,[r14,#-8] - str r3,[r14,#-4] - - add r4,r4,r8 @ accumulate key material - add r5,r5,r9 -# ifdef __thumb2__ - itt hi -# endif - addhi r8,r8,#1 @ next counter value - strhi r8,[sp,#4*(12)] @ save next counter value -# ifdef __thumb2__ - itt hs -# endif - ldrhs r8,[r12],#16 @ load input - ldrhs r9,[r12,#-12] - add r6,r6,r10 - add r7,r7,r11 -# ifdef __thumb2__ - itt hs -# endif - ldrhs r10,[r12,#-8] - ldrhs r11,[r12,#-4] -# if __ARM_ARCH__>=6 && defined(__ARMEB__) - rev r4,r4 - rev r5,r5 - rev r6,r6 - rev r7,r7 -# endif -# ifdef __thumb2__ - itt hs -# endif - eorhs r4,r4,r8 - eorhs r5,r5,r9 -# ifdef __thumb2__ - it ne -# endif - ldrne r8,[sp,#4*(32+2)] 
@ re-load len -# ifdef __thumb2__ - itt hs -# endif - eorhs r6,r6,r10 - eorhs r7,r7,r11 - str r4,[r14],#16 @ store output - str r5,[r14,#-12] -# ifdef __thumb2__ - it hs -# endif - subhs r11,r8,#64 @ len-=64 - str r6,[r14,#-8] - str r7,[r14,#-4] - bhi Loop_outer - - beq Ldone -# if __ARM_ARCH__<7 - b Ltail - -.align 4 -Lunaligned:@ unaligned endian-neutral path - cmp r11,#64 @ restore flags -# endif -#endif -#if __ARM_ARCH__<7 - ldr r11,[sp,#4*(3)] - add r0,r0,r8 @ accumulate key material - add r1,r1,r9 - add r2,r2,r10 -# ifdef __thumb2__ - itete lo -# endif - eorlo r8,r8,r8 @ zero or ... - ldrhsb r8,[r12],#16 @ ... load input - eorlo r9,r9,r9 - ldrhsb r9,[r12,#-12] - - add r3,r3,r11 -# ifdef __thumb2__ - itete lo -# endif - eorlo r10,r10,r10 - ldrhsb r10,[r12,#-8] - eorlo r11,r11,r11 - ldrhsb r11,[r12,#-4] - - eor r0,r8,r0 @ xor with input (or zero) - eor r1,r9,r1 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-15] @ load more input - ldrhsb r9,[r12,#-11] - eor r2,r10,r2 - strb r0,[r14],#16 @ store output - eor r3,r11,r3 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-7] - ldrhsb r11,[r12,#-3] - strb r1,[r14,#-12] - eor r0,r8,r0,lsr#8 - strb r2,[r14,#-8] - eor r1,r9,r1,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-14] @ load more input - ldrhsb r9,[r12,#-10] - strb r3,[r14,#-4] - eor r2,r10,r2,lsr#8 - strb r0,[r14,#-15] - eor r3,r11,r3,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-6] - ldrhsb r11,[r12,#-2] - strb r1,[r14,#-11] - eor r0,r8,r0,lsr#8 - strb r2,[r14,#-7] - eor r1,r9,r1,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-13] @ load more input - ldrhsb r9,[r12,#-9] - strb r3,[r14,#-3] - eor r2,r10,r2,lsr#8 - strb r0,[r14,#-14] - eor r3,r11,r3,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-5] - ldrhsb r11,[r12,#-1] - strb r1,[r14,#-10] - strb r2,[r14,#-6] - eor r0,r8,r0,lsr#8 - strb r3,[r14,#-2] - eor r1,r9,r1,lsr#8 - strb r0,[r14,#-13] - eor r2,r10,r2,lsr#8 - strb 
r1,[r14,#-9] - eor r3,r11,r3,lsr#8 - strb r2,[r14,#-5] - strb r3,[r14,#-1] - add r8,sp,#4*(4+0) - ldmia r8,{r8,r9,r10,r11} @ load key material - add r0,sp,#4*(16+8) - add r4,r4,r8 @ accumulate key material - add r5,r5,r9 - add r6,r6,r10 -# ifdef __thumb2__ - itete lo -# endif - eorlo r8,r8,r8 @ zero or ... - ldrhsb r8,[r12],#16 @ ... load input - eorlo r9,r9,r9 - ldrhsb r9,[r12,#-12] - - add r7,r7,r11 -# ifdef __thumb2__ - itete lo -# endif - eorlo r10,r10,r10 - ldrhsb r10,[r12,#-8] - eorlo r11,r11,r11 - ldrhsb r11,[r12,#-4] - - eor r4,r8,r4 @ xor with input (or zero) - eor r5,r9,r5 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-15] @ load more input - ldrhsb r9,[r12,#-11] - eor r6,r10,r6 - strb r4,[r14],#16 @ store output - eor r7,r11,r7 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-7] - ldrhsb r11,[r12,#-3] - strb r5,[r14,#-12] - eor r4,r8,r4,lsr#8 - strb r6,[r14,#-8] - eor r5,r9,r5,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-14] @ load more input - ldrhsb r9,[r12,#-10] - strb r7,[r14,#-4] - eor r6,r10,r6,lsr#8 - strb r4,[r14,#-15] - eor r7,r11,r7,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-6] - ldrhsb r11,[r12,#-2] - strb r5,[r14,#-11] - eor r4,r8,r4,lsr#8 - strb r6,[r14,#-7] - eor r5,r9,r5,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-13] @ load more input - ldrhsb r9,[r12,#-9] - strb r7,[r14,#-3] - eor r6,r10,r6,lsr#8 - strb r4,[r14,#-14] - eor r7,r11,r7,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-5] - ldrhsb r11,[r12,#-1] - strb r5,[r14,#-10] - strb r6,[r14,#-6] - eor r4,r8,r4,lsr#8 - strb r7,[r14,#-2] - eor r5,r9,r5,lsr#8 - strb r4,[r14,#-13] - eor r6,r10,r6,lsr#8 - strb r5,[r14,#-9] - eor r7,r11,r7,lsr#8 - strb r6,[r14,#-5] - strb r7,[r14,#-1] - add r8,sp,#4*(4+4) - ldmia r8,{r8,r9,r10,r11} @ load key material - ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half -# ifdef __thumb2__ - itt hi -# endif - strhi r10,[sp,#4*(16+10)] @ copy "rx" - strhi 
r11,[sp,#4*(16+11)] @ copy "rx" - add r0,r0,r8 @ accumulate key material - add r1,r1,r9 - add r2,r2,r10 -# ifdef __thumb2__ - itete lo -# endif - eorlo r8,r8,r8 @ zero or ... - ldrhsb r8,[r12],#16 @ ... load input - eorlo r9,r9,r9 - ldrhsb r9,[r12,#-12] - - add r3,r3,r11 -# ifdef __thumb2__ - itete lo -# endif - eorlo r10,r10,r10 - ldrhsb r10,[r12,#-8] - eorlo r11,r11,r11 - ldrhsb r11,[r12,#-4] - - eor r0,r8,r0 @ xor with input (or zero) - eor r1,r9,r1 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-15] @ load more input - ldrhsb r9,[r12,#-11] - eor r2,r10,r2 - strb r0,[r14],#16 @ store output - eor r3,r11,r3 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-7] - ldrhsb r11,[r12,#-3] - strb r1,[r14,#-12] - eor r0,r8,r0,lsr#8 - strb r2,[r14,#-8] - eor r1,r9,r1,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-14] @ load more input - ldrhsb r9,[r12,#-10] - strb r3,[r14,#-4] - eor r2,r10,r2,lsr#8 - strb r0,[r14,#-15] - eor r3,r11,r3,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-6] - ldrhsb r11,[r12,#-2] - strb r1,[r14,#-11] - eor r0,r8,r0,lsr#8 - strb r2,[r14,#-7] - eor r1,r9,r1,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-13] @ load more input - ldrhsb r9,[r12,#-9] - strb r3,[r14,#-3] - eor r2,r10,r2,lsr#8 - strb r0,[r14,#-14] - eor r3,r11,r3,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-5] - ldrhsb r11,[r12,#-1] - strb r1,[r14,#-10] - strb r2,[r14,#-6] - eor r0,r8,r0,lsr#8 - strb r3,[r14,#-2] - eor r1,r9,r1,lsr#8 - strb r0,[r14,#-13] - eor r2,r10,r2,lsr#8 - strb r1,[r14,#-9] - eor r3,r11,r3,lsr#8 - strb r2,[r14,#-5] - strb r3,[r14,#-1] - add r8,sp,#4*(4+8) - ldmia r8,{r8,r9,r10,r11} @ load key material - add r4,r4,r8 @ accumulate key material -# ifdef __thumb2__ - itt hi -# endif - addhi r8,r8,#1 @ next counter value - strhi r8,[sp,#4*(12)] @ save next counter value - add r5,r5,r9 - add r6,r6,r10 -# ifdef __thumb2__ - itete lo -# endif - eorlo r8,r8,r8 @ zero or ... 
- ldrhsb r8,[r12],#16 @ ... load input - eorlo r9,r9,r9 - ldrhsb r9,[r12,#-12] - - add r7,r7,r11 -# ifdef __thumb2__ - itete lo -# endif - eorlo r10,r10,r10 - ldrhsb r10,[r12,#-8] - eorlo r11,r11,r11 - ldrhsb r11,[r12,#-4] - - eor r4,r8,r4 @ xor with input (or zero) - eor r5,r9,r5 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-15] @ load more input - ldrhsb r9,[r12,#-11] - eor r6,r10,r6 - strb r4,[r14],#16 @ store output - eor r7,r11,r7 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-7] - ldrhsb r11,[r12,#-3] - strb r5,[r14,#-12] - eor r4,r8,r4,lsr#8 - strb r6,[r14,#-8] - eor r5,r9,r5,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-14] @ load more input - ldrhsb r9,[r12,#-10] - strb r7,[r14,#-4] - eor r6,r10,r6,lsr#8 - strb r4,[r14,#-15] - eor r7,r11,r7,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-6] - ldrhsb r11,[r12,#-2] - strb r5,[r14,#-11] - eor r4,r8,r4,lsr#8 - strb r6,[r14,#-7] - eor r5,r9,r5,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-13] @ load more input - ldrhsb r9,[r12,#-9] - strb r7,[r14,#-3] - eor r6,r10,r6,lsr#8 - strb r4,[r14,#-14] - eor r7,r11,r7,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-5] - ldrhsb r11,[r12,#-1] - strb r5,[r14,#-10] - strb r6,[r14,#-6] - eor r4,r8,r4,lsr#8 - strb r7,[r14,#-2] - eor r5,r9,r5,lsr#8 - strb r4,[r14,#-13] - eor r6,r10,r6,lsr#8 - strb r5,[r14,#-9] - eor r7,r11,r7,lsr#8 - strb r6,[r14,#-5] - strb r7,[r14,#-1] -# ifdef __thumb2__ - it ne -# endif - ldrne r8,[sp,#4*(32+2)] @ re-load len -# ifdef __thumb2__ - it hs -# endif - subhs r11,r8,#64 @ len-=64 - bhi Loop_outer - - beq Ldone -#endif - -Ltail: - ldr r12,[sp,#4*(32+1)] @ load inp - add r9,sp,#4*(0) - ldr r14,[sp,#4*(32+0)] @ load out - -Loop_tail: - ldrb r10,[r9],#1 @ read buffer on stack - ldrb r11,[r12],#1 @ read input - subs r8,r8,#1 - eor r11,r11,r10 - strb r11,[r14],#1 @ store output - bne Loop_tail - -Ldone: - add sp,sp,#4*(32+3) -Lno_data: - ldmia 
sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc} - -#if __ARM_MAX_ARCH__>=7 - - - -#ifdef __thumb2__ -.thumb_func ChaCha20_neon -#endif -.align 5 -ChaCha20_neon: - ldr r12,[sp,#0] @ pull pointer to counter and nonce - stmdb sp!,{r0,r1,r2,r4-r11,lr} -LChaCha20_neon: - adr r14,Lsigma - vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI spec says so - stmdb sp!,{r0,r1,r2,r3} - - vld1.32 {q1,q2},[r3] @ load key - ldmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} @ load key - - sub sp,sp,#4*(16+16) - vld1.32 {q3},[r12] @ load counter and nonce - add r12,sp,#4*8 - ldmia r14,{r0,r1,r2,r3} @ load sigma - vld1.32 {q0},[r14]! @ load sigma - vld1.32 {q12},[r14] @ one - vst1.32 {q2,q3},[r12] @ copy 1/2key|counter|nonce - vst1.32 {q0,q1},[sp] @ copy sigma|1/2key - - str r10,[sp,#4*(16+10)] @ off-load "rx" - str r11,[sp,#4*(16+11)] @ off-load "rx" - vshl.i32 d26,d24,#1 @ two - vstr d24,[sp,#4*(16+0)] - vshl.i32 d28,d24,#2 @ four - vstr d26,[sp,#4*(16+2)] - vmov q4,q0 - vstr d28,[sp,#4*(16+4)] - vmov q8,q0 - vmov q5,q1 - vmov q9,q1 - b Loop_neon_enter - -.align 4 -Loop_neon_outer: - ldmia sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} @ load key material - cmp r11,#64*2 @ if len<=64*2 - bls Lbreak_neon @ switch to integer-only - vmov q4,q0 - str r11,[sp,#4*(32+2)] @ save len - vmov q8,q0 - str r12, [sp,#4*(32+1)] @ save inp - vmov q5,q1 - str r14, [sp,#4*(32+0)] @ save out - vmov q9,q1 -Loop_neon_enter: - ldr r11, [sp,#4*(15)] - vadd.i32 q7,q3,q12 @ counter+1 - ldr r12,[sp,#4*(12)] @ modulo-scheduled load - vmov q6,q2 - ldr r10, [sp,#4*(13)] - vmov q10,q2 - ldr r14,[sp,#4*(14)] - vadd.i32 q11,q7,q12 @ counter+2 - str r11, [sp,#4*(16+15)] - mov r11,#10 - add r12,r12,#3 @ counter+3 - b Loop_neon - -.align 4 -Loop_neon: - subs r11,r11,#1 - vadd.i32 q0,q0,q1 - add r0,r0,r4 - vadd.i32 q4,q4,q5 - mov r12,r12,ror#16 - vadd.i32 q8,q8,q9 - add r1,r1,r5 - veor q3,q3,q0 - mov r10,r10,ror#16 - veor q7,q7,q4 - eor r12,r12,r0,ror#16 - veor q11,q11,q8 - eor r10,r10,r1,ror#16 - vrev32.16 q3,q3 - add r8,r8,r12 - vrev32.16 q7,q7 - mov 
r4,r4,ror#20 - vrev32.16 q11,q11 - add r9,r9,r10 - vadd.i32 q2,q2,q3 - mov r5,r5,ror#20 - vadd.i32 q6,q6,q7 - eor r4,r4,r8,ror#20 - vadd.i32 q10,q10,q11 - eor r5,r5,r9,ror#20 - veor q12,q1,q2 - add r0,r0,r4 - veor q13,q5,q6 - mov r12,r12,ror#24 - veor q14,q9,q10 - add r1,r1,r5 - vshr.u32 q1,q12,#20 - mov r10,r10,ror#24 - vshr.u32 q5,q13,#20 - eor r12,r12,r0,ror#24 - vshr.u32 q9,q14,#20 - eor r10,r10,r1,ror#24 - vsli.32 q1,q12,#12 - add r8,r8,r12 - vsli.32 q5,q13,#12 - mov r4,r4,ror#25 - vsli.32 q9,q14,#12 - add r9,r9,r10 - vadd.i32 q0,q0,q1 - mov r5,r5,ror#25 - vadd.i32 q4,q4,q5 - str r10,[sp,#4*(16+13)] - vadd.i32 q8,q8,q9 - ldr r10,[sp,#4*(16+15)] - veor q12,q3,q0 - eor r4,r4,r8,ror#25 - veor q13,q7,q4 - eor r5,r5,r9,ror#25 - veor q14,q11,q8 - str r8,[sp,#4*(16+8)] - vshr.u32 q3,q12,#24 - ldr r8,[sp,#4*(16+10)] - vshr.u32 q7,q13,#24 - add r2,r2,r6 - vshr.u32 q11,q14,#24 - mov r14,r14,ror#16 - vsli.32 q3,q12,#8 - str r9,[sp,#4*(16+9)] - vsli.32 q7,q13,#8 - ldr r9,[sp,#4*(16+11)] - vsli.32 q11,q14,#8 - add r3,r3,r7 - vadd.i32 q2,q2,q3 - mov r10,r10,ror#16 - vadd.i32 q6,q6,q7 - eor r14,r14,r2,ror#16 - vadd.i32 q10,q10,q11 - eor r10,r10,r3,ror#16 - veor q12,q1,q2 - add r8,r8,r14 - veor q13,q5,q6 - mov r6,r6,ror#20 - veor q14,q9,q10 - add r9,r9,r10 - vshr.u32 q1,q12,#25 - mov r7,r7,ror#20 - vshr.u32 q5,q13,#25 - eor r6,r6,r8,ror#20 - vshr.u32 q9,q14,#25 - eor r7,r7,r9,ror#20 - vsli.32 q1,q12,#7 - add r2,r2,r6 - vsli.32 q5,q13,#7 - mov r14,r14,ror#24 - vsli.32 q9,q14,#7 - add r3,r3,r7 - vext.8 q2,q2,q2,#8 - mov r10,r10,ror#24 - vext.8 q6,q6,q6,#8 - eor r14,r14,r2,ror#24 - vext.8 q10,q10,q10,#8 - eor r10,r10,r3,ror#24 - vext.8 q1,q1,q1,#4 - add r8,r8,r14 - vext.8 q5,q5,q5,#4 - mov r6,r6,ror#25 - vext.8 q9,q9,q9,#4 - add r9,r9,r10 - vext.8 q3,q3,q3,#12 - mov r7,r7,ror#25 - vext.8 q7,q7,q7,#12 - eor r6,r6,r8,ror#25 - vext.8 q11,q11,q11,#12 - eor r7,r7,r9,ror#25 - vadd.i32 q0,q0,q1 - add r0,r0,r5 - vadd.i32 q4,q4,q5 - mov r10,r10,ror#16 - vadd.i32 q8,q8,q9 - add r1,r1,r6 - 
veor q3,q3,q0 - mov r12,r12,ror#16 - veor q7,q7,q4 - eor r10,r10,r0,ror#16 - veor q11,q11,q8 - eor r12,r12,r1,ror#16 - vrev32.16 q3,q3 - add r8,r8,r10 - vrev32.16 q7,q7 - mov r5,r5,ror#20 - vrev32.16 q11,q11 - add r9,r9,r12 - vadd.i32 q2,q2,q3 - mov r6,r6,ror#20 - vadd.i32 q6,q6,q7 - eor r5,r5,r8,ror#20 - vadd.i32 q10,q10,q11 - eor r6,r6,r9,ror#20 - veor q12,q1,q2 - add r0,r0,r5 - veor q13,q5,q6 - mov r10,r10,ror#24 - veor q14,q9,q10 - add r1,r1,r6 - vshr.u32 q1,q12,#20 - mov r12,r12,ror#24 - vshr.u32 q5,q13,#20 - eor r10,r10,r0,ror#24 - vshr.u32 q9,q14,#20 - eor r12,r12,r1,ror#24 - vsli.32 q1,q12,#12 - add r8,r8,r10 - vsli.32 q5,q13,#12 - mov r5,r5,ror#25 - vsli.32 q9,q14,#12 - str r10,[sp,#4*(16+15)] - vadd.i32 q0,q0,q1 - ldr r10,[sp,#4*(16+13)] - vadd.i32 q4,q4,q5 - add r9,r9,r12 - vadd.i32 q8,q8,q9 - mov r6,r6,ror#25 - veor q12,q3,q0 - eor r5,r5,r8,ror#25 - veor q13,q7,q4 - eor r6,r6,r9,ror#25 - veor q14,q11,q8 - str r8,[sp,#4*(16+10)] - vshr.u32 q3,q12,#24 - ldr r8,[sp,#4*(16+8)] - vshr.u32 q7,q13,#24 - add r2,r2,r7 - vshr.u32 q11,q14,#24 - mov r10,r10,ror#16 - vsli.32 q3,q12,#8 - str r9,[sp,#4*(16+11)] - vsli.32 q7,q13,#8 - ldr r9,[sp,#4*(16+9)] - vsli.32 q11,q14,#8 - add r3,r3,r4 - vadd.i32 q2,q2,q3 - mov r14,r14,ror#16 - vadd.i32 q6,q6,q7 - eor r10,r10,r2,ror#16 - vadd.i32 q10,q10,q11 - eor r14,r14,r3,ror#16 - veor q12,q1,q2 - add r8,r8,r10 - veor q13,q5,q6 - mov r7,r7,ror#20 - veor q14,q9,q10 - add r9,r9,r14 - vshr.u32 q1,q12,#25 - mov r4,r4,ror#20 - vshr.u32 q5,q13,#25 - eor r7,r7,r8,ror#20 - vshr.u32 q9,q14,#25 - eor r4,r4,r9,ror#20 - vsli.32 q1,q12,#7 - add r2,r2,r7 - vsli.32 q5,q13,#7 - mov r10,r10,ror#24 - vsli.32 q9,q14,#7 - add r3,r3,r4 - vext.8 q2,q2,q2,#8 - mov r14,r14,ror#24 - vext.8 q6,q6,q6,#8 - eor r10,r10,r2,ror#24 - vext.8 q10,q10,q10,#8 - eor r14,r14,r3,ror#24 - vext.8 q1,q1,q1,#12 - add r8,r8,r10 - vext.8 q5,q5,q5,#12 - mov r7,r7,ror#25 - vext.8 q9,q9,q9,#12 - add r9,r9,r14 - vext.8 q3,q3,q3,#4 - mov r4,r4,ror#25 - vext.8 q7,q7,q7,#4 - eor 
r7,r7,r8,ror#25 - vext.8 q11,q11,q11,#4 - eor r4,r4,r9,ror#25 - bne Loop_neon - - add r11,sp,#32 - vld1.32 {q12,q13},[sp] @ load key material - vld1.32 {q14,q15},[r11] - - ldr r11,[sp,#4*(32+2)] @ load len - - str r8, [sp,#4*(16+8)] @ modulo-scheduled store - str r9, [sp,#4*(16+9)] - str r12,[sp,#4*(16+12)] - str r10, [sp,#4*(16+13)] - str r14,[sp,#4*(16+14)] - - @ at this point we have first half of 512-bit result in - @ rx and second half at sp+4*(16+8) - - ldr r12,[sp,#4*(32+1)] @ load inp - ldr r14,[sp,#4*(32+0)] @ load out - - vadd.i32 q0,q0,q12 @ accumulate key material - vadd.i32 q4,q4,q12 - vadd.i32 q8,q8,q12 - vldr d24,[sp,#4*(16+0)] @ one - - vadd.i32 q1,q1,q13 - vadd.i32 q5,q5,q13 - vadd.i32 q9,q9,q13 - vldr d26,[sp,#4*(16+2)] @ two - - vadd.i32 q2,q2,q14 - vadd.i32 q6,q6,q14 - vadd.i32 q10,q10,q14 - vadd.i32 d14,d14,d24 @ counter+1 - vadd.i32 d22,d22,d26 @ counter+2 - - vadd.i32 q3,q3,q15 - vadd.i32 q7,q7,q15 - vadd.i32 q11,q11,q15 - - cmp r11,#64*4 - blo Ltail_neon - - vld1.8 {q12,q13},[r12]! @ load input - mov r11,sp - vld1.8 {q14,q15},[r12]! - veor q0,q0,q12 @ xor with input - veor q1,q1,q13 - vld1.8 {q12,q13},[r12]! - veor q2,q2,q14 - veor q3,q3,q15 - vld1.8 {q14,q15},[r12]! - - veor q4,q4,q12 - vst1.8 {q0,q1},[r14]! @ store output - veor q5,q5,q13 - vld1.8 {q12,q13},[r12]! - veor q6,q6,q14 - vst1.8 {q2,q3},[r14]! - veor q7,q7,q15 - vld1.8 {q14,q15},[r12]! - - veor q8,q8,q12 - vld1.32 {q0,q1},[r11]! @ load for next iteration - veor d25,d25,d25 - vldr d24,[sp,#4*(16+4)] @ four - veor q9,q9,q13 - vld1.32 {q2,q3},[r11] - veor q10,q10,q14 - vst1.8 {q4,q5},[r14]! - veor q11,q11,q15 - vst1.8 {q6,q7},[r14]! - - vadd.i32 d6,d6,d24 @ next counter value - vldr d24,[sp,#4*(16+0)] @ one - - ldmia sp,{r8,r9,r10,r11} @ load key material - add r0,r0,r8 @ accumulate key material - ldr r8,[r12],#16 @ load input - vst1.8 {q8,q9},[r14]! - add r1,r1,r9 - ldr r9,[r12,#-12] - vst1.8 {q10,q11},[r14]! 
- add r2,r2,r10 - ldr r10,[r12,#-8] - add r3,r3,r11 - ldr r11,[r12,#-4] -# ifdef __ARMEB__ - rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 -# endif - eor r0,r0,r8 @ xor with input - add r8,sp,#4*(4) - eor r1,r1,r9 - str r0,[r14],#16 @ store output - eor r2,r2,r10 - str r1,[r14,#-12] - eor r3,r3,r11 - ldmia r8,{r8,r9,r10,r11} @ load key material - str r2,[r14,#-8] - str r3,[r14,#-4] - - add r4,r4,r8 @ accumulate key material - ldr r8,[r12],#16 @ load input - add r5,r5,r9 - ldr r9,[r12,#-12] - add r6,r6,r10 - ldr r10,[r12,#-8] - add r7,r7,r11 - ldr r11,[r12,#-4] -# ifdef __ARMEB__ - rev r4,r4 - rev r5,r5 - rev r6,r6 - rev r7,r7 -# endif - eor r4,r4,r8 - add r8,sp,#4*(8) - eor r5,r5,r9 - str r4,[r14],#16 @ store output - eor r6,r6,r10 - str r5,[r14,#-12] - eor r7,r7,r11 - ldmia r8,{r8,r9,r10,r11} @ load key material - str r6,[r14,#-8] - add r0,sp,#4*(16+8) - str r7,[r14,#-4] - - ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half - - add r0,r0,r8 @ accumulate key material - ldr r8,[r12],#16 @ load input - add r1,r1,r9 - ldr r9,[r12,#-12] -# ifdef __thumb2__ - it hi -# endif - strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it - add r2,r2,r10 - ldr r10,[r12,#-8] -# ifdef __thumb2__ - it hi -# endif - strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it - add r3,r3,r11 - ldr r11,[r12,#-4] -# ifdef __ARMEB__ - rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 -# endif - eor r0,r0,r8 - add r8,sp,#4*(12) - eor r1,r1,r9 - str r0,[r14],#16 @ store output - eor r2,r2,r10 - str r1,[r14,#-12] - eor r3,r3,r11 - ldmia r8,{r8,r9,r10,r11} @ load key material - str r2,[r14,#-8] - str r3,[r14,#-4] - - add r4,r4,r8 @ accumulate key material - add r8,r8,#4 @ next counter value - add r5,r5,r9 - str r8,[sp,#4*(12)] @ save next counter value - ldr r8,[r12],#16 @ load input - add r6,r6,r10 - add r4,r4,#3 @ counter+3 - ldr r9,[r12,#-12] - add r7,r7,r11 - ldr r10,[r12,#-8] - ldr r11,[r12,#-4] -# ifdef __ARMEB__ - rev r4,r4 - rev r5,r5 - rev r6,r6 - rev r7,r7 -# endif - eor r4,r4,r8 -# ifdef 
__thumb2__ - it hi -# endif - ldrhi r8,[sp,#4*(32+2)] @ re-load len - eor r5,r5,r9 - eor r6,r6,r10 - str r4,[r14],#16 @ store output - eor r7,r7,r11 - str r5,[r14,#-12] - sub r11,r8,#64*4 @ len-=64*4 - str r6,[r14,#-8] - str r7,[r14,#-4] - bhi Loop_neon_outer - - b Ldone_neon - -.align 4 -Lbreak_neon: - @ harmonize NEON and integer-only stack frames: load data - @ from NEON frame, but save to integer-only one; distance - @ between the two is 4*(32+4+16-32)=4*(20). - - str r11, [sp,#4*(20+32+2)] @ save len - add r11,sp,#4*(32+4) - str r12, [sp,#4*(20+32+1)] @ save inp - str r14, [sp,#4*(20+32+0)] @ save out - - ldr r12,[sp,#4*(16+10)] - ldr r14,[sp,#4*(16+11)] - vldmia r11,{d8,d9,d10,d11,d12,d13,d14,d15} @ fulfill ABI requirement - str r12,[sp,#4*(20+16+10)] @ copy "rx" - str r14,[sp,#4*(20+16+11)] @ copy "rx" - - ldr r11, [sp,#4*(15)] - ldr r12,[sp,#4*(12)] @ modulo-scheduled load - ldr r10, [sp,#4*(13)] - ldr r14,[sp,#4*(14)] - str r11, [sp,#4*(20+16+15)] - add r11,sp,#4*(20) - vst1.32 {q0,q1},[r11]! @ copy key - add sp,sp,#4*(20) @ switch frame - vst1.32 {q2,q3},[r11] - mov r11,#10 - b Loop @ go integer-only - -.align 4 -Ltail_neon: - cmp r11,#64*3 - bhs L192_or_more_neon - cmp r11,#64*2 - bhs L128_or_more_neon - cmp r11,#64*1 - bhs L64_or_more_neon - - add r8,sp,#4*(8) - vst1.8 {q0,q1},[sp] - add r10,sp,#4*(0) - vst1.8 {q2,q3},[r8] - b Loop_tail_neon - -.align 4 -L64_or_more_neon: - vld1.8 {q12,q13},[r12]! - vld1.8 {q14,q15},[r12]! - veor q0,q0,q12 - veor q1,q1,q13 - veor q2,q2,q14 - veor q3,q3,q15 - vst1.8 {q0,q1},[r14]! - vst1.8 {q2,q3},[r14]! - - beq Ldone_neon - - add r8,sp,#4*(8) - vst1.8 {q4,q5},[sp] - add r10,sp,#4*(0) - vst1.8 {q6,q7},[r8] - sub r11,r11,#64*1 @ len-=64*1 - b Loop_tail_neon - -.align 4 -L128_or_more_neon: - vld1.8 {q12,q13},[r12]! - vld1.8 {q14,q15},[r12]! - veor q0,q0,q12 - veor q1,q1,q13 - vld1.8 {q12,q13},[r12]! - veor q2,q2,q14 - veor q3,q3,q15 - vld1.8 {q14,q15},[r12]! - - veor q4,q4,q12 - veor q5,q5,q13 - vst1.8 {q0,q1},[r14]! 
- veor q6,q6,q14 - vst1.8 {q2,q3},[r14]! - veor q7,q7,q15 - vst1.8 {q4,q5},[r14]! - vst1.8 {q6,q7},[r14]! - - beq Ldone_neon - - add r8,sp,#4*(8) - vst1.8 {q8,q9},[sp] - add r10,sp,#4*(0) - vst1.8 {q10,q11},[r8] - sub r11,r11,#64*2 @ len-=64*2 - b Loop_tail_neon - -.align 4 -L192_or_more_neon: - vld1.8 {q12,q13},[r12]! - vld1.8 {q14,q15},[r12]! - veor q0,q0,q12 - veor q1,q1,q13 - vld1.8 {q12,q13},[r12]! - veor q2,q2,q14 - veor q3,q3,q15 - vld1.8 {q14,q15},[r12]! - - veor q4,q4,q12 - veor q5,q5,q13 - vld1.8 {q12,q13},[r12]! - veor q6,q6,q14 - vst1.8 {q0,q1},[r14]! - veor q7,q7,q15 - vld1.8 {q14,q15},[r12]! - - veor q8,q8,q12 - vst1.8 {q2,q3},[r14]! - veor q9,q9,q13 - vst1.8 {q4,q5},[r14]! - veor q10,q10,q14 - vst1.8 {q6,q7},[r14]! - veor q11,q11,q15 - vst1.8 {q8,q9},[r14]! - vst1.8 {q10,q11},[r14]! - - beq Ldone_neon - - ldmia sp,{r8,r9,r10,r11} @ load key material - add r0,r0,r8 @ accumulate key material - add r8,sp,#4*(4) - add r1,r1,r9 - add r2,r2,r10 - add r3,r3,r11 - ldmia r8,{r8,r9,r10,r11} @ load key material - - add r4,r4,r8 @ accumulate key material - add r8,sp,#4*(8) - add r5,r5,r9 - add r6,r6,r10 - add r7,r7,r11 - ldmia r8,{r8,r9,r10,r11} @ load key material -# ifdef __ARMEB__ - rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 - rev r4,r4 - rev r5,r5 - rev r6,r6 - rev r7,r7 -# endif - stmia sp,{r0,r1,r2,r3,r4,r5,r6,r7} - add r0,sp,#4*(16+8) - - ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half - - add r0,r0,r8 @ accumulate key material - add r8,sp,#4*(12) - add r1,r1,r9 - add r2,r2,r10 - add r3,r3,r11 - ldmia r8,{r8,r9,r10,r11} @ load key material - - add r4,r4,r8 @ accumulate key material - add r8,sp,#4*(8) - add r5,r5,r9 - add r4,r4,#3 @ counter+3 - add r6,r6,r10 - add r7,r7,r11 - ldr r11,[sp,#4*(32+2)] @ re-load len -# ifdef __ARMEB__ - rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 - rev r4,r4 - rev r5,r5 - rev r6,r6 - rev r7,r7 -# endif - stmia r8,{r0,r1,r2,r3,r4,r5,r6,r7} - add r10,sp,#4*(0) - sub r11,r11,#64*3 @ len-=64*3 - -Loop_tail_neon: - ldrb 
r8,[r10],#1 @ read buffer on stack - ldrb r9,[r12],#1 @ read input - subs r11,r11,#1 - eor r8,r8,r9 - strb r8,[r14],#1 @ store output - bne Loop_tail_neon - -Ldone_neon: - add sp,sp,#4*(32+4) - vldmia sp,{d8,d9,d10,d11,d12,d13,d14,d15} - add sp,sp,#4*(16+3) - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc} - -.comm _OPENSSL_armcap_P,4 -.non_lazy_symbol_pointer -OPENSSL_armcap_P: -.indirect_symbol _OPENSSL_armcap_P -.long 0 -#endif -#endif // !OPENSSL_NO_ASM diff --git a/third_party/boringssl/apple-arm/crypto/fipsmodule/aesv8-armx32.S b/third_party/boringssl/apple-arm/crypto/fipsmodule/aesv8-armx32.S deleted file mode 100644 index 87b4b0ae..00000000 --- a/third_party/boringssl/apple-arm/crypto/fipsmodule/aesv8-armx32.S +++ /dev/null @@ -1,809 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -#include - -#if __ARM_MAX_ARCH__>=7 -.text - - -.code 32 -#undef __thumb2__ -.align 5 -Lrcon: -.long 0x01,0x01,0x01,0x01 -.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d @ rotate-n-splat -.long 0x1b,0x1b,0x1b,0x1b - -.text - -.globl _aes_hw_set_encrypt_key -.private_extern _aes_hw_set_encrypt_key -#ifdef __thumb2__ -.thumb_func _aes_hw_set_encrypt_key -#endif -.align 5 -_aes_hw_set_encrypt_key: -Lenc_key: - mov r3,#-1 - cmp r0,#0 - beq Lenc_key_abort - cmp r2,#0 - beq Lenc_key_abort - mov r3,#-2 - cmp r1,#128 - blt Lenc_key_abort - cmp r1,#256 - bgt Lenc_key_abort - tst r1,#0x3f - bne Lenc_key_abort - - adr r3,Lrcon - cmp r1,#192 - - veor q0,q0,q0 - vld1.8 {q3},[r0]! - mov r1,#8 @ reuse r1 - vld1.32 {q1,q2},[r3]! - - blt Loop128 - beq L192 - b L256 - -.align 4 -Loop128: - vtbl.8 d20,{q3},d4 - vtbl.8 d21,{q3},d5 - vext.8 q9,q0,q3,#12 - vst1.32 {q3},[r2]! 
-.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 - subs r1,r1,#1 - - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q10,q10,q1 - veor q3,q3,q9 - vshl.u8 q1,q1,#1 - veor q3,q3,q10 - bne Loop128 - - vld1.32 {q1},[r3] - - vtbl.8 d20,{q3},d4 - vtbl.8 d21,{q3},d5 - vext.8 q9,q0,q3,#12 - vst1.32 {q3},[r2]! -.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 - - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q10,q10,q1 - veor q3,q3,q9 - vshl.u8 q1,q1,#1 - veor q3,q3,q10 - - vtbl.8 d20,{q3},d4 - vtbl.8 d21,{q3},d5 - vext.8 q9,q0,q3,#12 - vst1.32 {q3},[r2]! -.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 - - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q10,q10,q1 - veor q3,q3,q9 - veor q3,q3,q10 - vst1.32 {q3},[r2] - add r2,r2,#0x50 - - mov r12,#10 - b Ldone - -.align 4 -L192: - vld1.8 {d16},[r0]! - vmov.i8 q10,#8 @ borrow q10 - vst1.32 {q3},[r2]! - vsub.i8 q2,q2,q10 @ adjust the mask - -Loop192: - vtbl.8 d20,{q8},d4 - vtbl.8 d21,{q8},d5 - vext.8 q9,q0,q3,#12 - vst1.32 {d16},[r2]! -.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 - subs r1,r1,#1 - - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q3,q3,q9 - - vdup.32 q9,d7[1] - veor q9,q9,q8 - veor q10,q10,q1 - vext.8 q8,q0,q8,#12 - vshl.u8 q1,q1,#1 - veor q8,q8,q9 - veor q3,q3,q10 - veor q8,q8,q10 - vst1.32 {q3},[r2]! - bne Loop192 - - mov r12,#12 - add r2,r2,#0x20 - b Ldone - -.align 4 -L256: - vld1.8 {q8},[r0] - mov r1,#7 - mov r12,#14 - vst1.32 {q3},[r2]! - -Loop256: - vtbl.8 d20,{q8},d4 - vtbl.8 d21,{q8},d5 - vext.8 q9,q0,q3,#12 - vst1.32 {q8},[r2]! -.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 - subs r1,r1,#1 - - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q10,q10,q1 - veor q3,q3,q9 - vshl.u8 q1,q1,#1 - veor q3,q3,q10 - vst1.32 {q3},[r2]! 
- beq Ldone - - vdup.32 q10,d7[1] - vext.8 q9,q0,q8,#12 -.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 - - veor q8,q8,q9 - vext.8 q9,q0,q9,#12 - veor q8,q8,q9 - vext.8 q9,q0,q9,#12 - veor q8,q8,q9 - - veor q8,q8,q10 - b Loop256 - -Ldone: - str r12,[r2] - mov r3,#0 - -Lenc_key_abort: - mov r0,r3 @ return value - - bx lr - - -.globl _aes_hw_set_decrypt_key -.private_extern _aes_hw_set_decrypt_key -#ifdef __thumb2__ -.thumb_func _aes_hw_set_decrypt_key -#endif -.align 5 -_aes_hw_set_decrypt_key: - stmdb sp!,{r4,lr} - bl Lenc_key - - cmp r0,#0 - bne Ldec_key_abort - - sub r2,r2,#240 @ restore original r2 - mov r4,#-16 - add r0,r2,r12,lsl#4 @ end of key schedule - - vld1.32 {q0},[r2] - vld1.32 {q1},[r0] - vst1.32 {q0},[r0],r4 - vst1.32 {q1},[r2]! - -Loop_imc: - vld1.32 {q0},[r2] - vld1.32 {q1},[r0] -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 - vst1.32 {q0},[r0],r4 - vst1.32 {q1},[r2]! - cmp r0,r2 - bhi Loop_imc - - vld1.32 {q0},[r2] -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 - vst1.32 {q0},[r0] - - eor r0,r0,r0 @ return value -Ldec_key_abort: - ldmia sp!,{r4,pc} - -.globl _aes_hw_encrypt -.private_extern _aes_hw_encrypt -#ifdef __thumb2__ -.thumb_func _aes_hw_encrypt -#endif -.align 5 -_aes_hw_encrypt: - AARCH64_VALID_CALL_TARGET - ldr r3,[r2,#240] - vld1.32 {q0},[r2]! - vld1.8 {q2},[r0] - sub r3,r3,#2 - vld1.32 {q1},[r2]! - -Loop_enc: -.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0 -.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 - vld1.32 {q0},[r2]! - subs r3,r3,#2 -.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1 -.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 - vld1.32 {q1},[r2]! 
- bgt Loop_enc - -.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0 -.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 - vld1.32 {q0},[r2] -.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1 - veor q2,q2,q0 - - vst1.8 {q2},[r1] - bx lr - -.globl _aes_hw_decrypt -.private_extern _aes_hw_decrypt -#ifdef __thumb2__ -.thumb_func _aes_hw_decrypt -#endif -.align 5 -_aes_hw_decrypt: - AARCH64_VALID_CALL_TARGET - ldr r3,[r2,#240] - vld1.32 {q0},[r2]! - vld1.8 {q2},[r0] - sub r3,r3,#2 - vld1.32 {q1},[r2]! - -Loop_dec: -.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0 -.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 - vld1.32 {q0},[r2]! - subs r3,r3,#2 -.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1 -.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 - vld1.32 {q1},[r2]! - bgt Loop_dec - -.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0 -.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 - vld1.32 {q0},[r2] -.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1 - veor q2,q2,q0 - - vst1.8 {q2},[r1] - bx lr - -.globl _aes_hw_cbc_encrypt -.private_extern _aes_hw_cbc_encrypt -#ifdef __thumb2__ -.thumb_func _aes_hw_cbc_encrypt -#endif -.align 5 -_aes_hw_cbc_encrypt: - mov ip,sp - stmdb sp!,{r4,r5,r6,r7,r8,lr} - vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so - ldmia ip,{r4,r5} @ load remaining args - subs r2,r2,#16 - mov r8,#16 - blo Lcbc_abort - moveq r8,#0 - - cmp r5,#0 @ en- or decrypting? - ldr r5,[r3,#240] - and r2,r2,#-16 - vld1.8 {q6},[r4] - vld1.8 {q0},[r0],r8 - - vld1.32 {q8,q9},[r3] @ load key schedule... - sub r5,r5,#6 - add r7,r3,r5,lsl#4 @ pointer to last 7 round keys - sub r5,r5,#2 - vld1.32 {q10,q11},[r7]! - vld1.32 {q12,q13},[r7]! - vld1.32 {q14,q15},[r7]! 
- vld1.32 {q7},[r7] - - add r7,r3,#32 - mov r6,r5 - beq Lcbc_dec - - cmp r5,#2 - veor q0,q0,q6 - veor q5,q8,q7 - beq Lcbc_enc128 - - vld1.32 {q2,q3},[r7] - add r7,r3,#16 - add r6,r3,#16*4 - add r12,r3,#16*5 -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - add r14,r3,#16*6 - add r3,r3,#16*7 - b Lenter_cbc_enc - -.align 4 -Loop_cbc_enc: -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vst1.8 {q6},[r1]! -Lenter_cbc_enc: -.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vld1.32 {q8},[r6] - cmp r5,#4 -.byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vld1.32 {q9},[r12] - beq Lcbc_enc192 - -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vld1.32 {q8},[r14] -.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vld1.32 {q9},[r3] - nop - -Lcbc_enc192: -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - subs r2,r2,#16 -.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - moveq r8,#0 -.byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vld1.8 {q8},[r0],r8 -.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - veor q8,q8,q5 -.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vld1.32 {q9},[r7] @ re-pre-load rndkey[1] -.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 - veor q6,q0,q7 - bhs Loop_cbc_enc - - vst1.8 {q6},[r1]! 
- b Lcbc_done - -.align 5 -Lcbc_enc128: - vld1.32 {q2,q3},[r7] -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - b Lenter_cbc_enc128 -Loop_cbc_enc128: -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vst1.8 {q6},[r1]! -Lenter_cbc_enc128: -.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - subs r2,r2,#16 -.byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - moveq r8,#0 -.byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vld1.8 {q8},[r0],r8 -.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - veor q8,q8,q5 -.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 - veor q6,q0,q7 - bhs Loop_cbc_enc128 - - vst1.8 {q6},[r1]! - b Lcbc_done -.align 5 -Lcbc_dec: - vld1.8 {q10},[r0]! - subs r2,r2,#32 @ bias - add r6,r5,#2 - vorr q3,q0,q0 - vorr q1,q0,q0 - vorr q11,q10,q10 - blo Lcbc_dec_tail - - vorr q1,q10,q10 - vld1.8 {q10},[r0]! - vorr q2,q0,q0 - vorr q3,q1,q1 - vorr q11,q10,q10 - -Loop3x_cbc_dec: -.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8 -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - vld1.32 {q8},[r7]! - subs r6,r6,#2 -.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9 -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - vld1.32 {q9},[r7]! 
- bgt Loop3x_cbc_dec - -.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8 -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - veor q4,q6,q7 - subs r2,r2,#0x30 - veor q5,q2,q7 - movlo r6,r2 @ r6, r6, is zero at this point -.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9 -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - veor q9,q3,q7 - add r0,r0,r6 @ r0 is adjusted in such way that - @ at exit from the loop q1-q10 - @ are loaded with last "words" - vorr q6,q11,q11 - mov r7,r3 -.byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12 -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - vld1.8 {q2},[r0]! -.byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13 -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - vld1.8 {q3},[r0]! -.byte 0x6c,0x03,0xb0,0xf3 @ aesd q0,q14 -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - vld1.8 {q11},[r0]! -.byte 0x6e,0x03,0xb0,0xf3 @ aesd q0,q15 -.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15 -.byte 0x6e,0x43,0xf0,0xf3 @ aesd q10,q15 - vld1.32 {q8},[r7]! @ re-pre-load rndkey[0] - add r6,r5,#2 - veor q4,q4,q0 - veor q5,q5,q1 - veor q10,q10,q9 - vld1.32 {q9},[r7]! @ re-pre-load rndkey[1] - vst1.8 {q4},[r1]! - vorr q0,q2,q2 - vst1.8 {q5},[r1]! - vorr q1,q3,q3 - vst1.8 {q10},[r1]! 
- vorr q10,q11,q11 - bhs Loop3x_cbc_dec - - cmn r2,#0x30 - beq Lcbc_done - nop - -Lcbc_dec_tail: -.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - vld1.32 {q8},[r7]! - subs r6,r6,#2 -.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - vld1.32 {q9},[r7]! - bgt Lcbc_dec_tail - -.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 -.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 -.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - cmn r2,#0x20 -.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - veor q5,q6,q7 -.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - veor q9,q3,q7 -.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15 -.byte 0x6e,0x43,0xf0,0xf3 @ aesd q10,q15 - beq Lcbc_dec_one - veor q5,q5,q1 - veor q9,q9,q10 - vorr q6,q11,q11 - vst1.8 {q5},[r1]! - vst1.8 {q9},[r1]! - b Lcbc_done - -Lcbc_dec_one: - veor q5,q5,q10 - vorr q6,q11,q11 - vst1.8 {q5},[r1]! 
- -Lcbc_done: - vst1.8 {q6},[r4] -Lcbc_abort: - vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} - ldmia sp!,{r4,r5,r6,r7,r8,pc} - -.globl _aes_hw_ctr32_encrypt_blocks -.private_extern _aes_hw_ctr32_encrypt_blocks -#ifdef __thumb2__ -.thumb_func _aes_hw_ctr32_encrypt_blocks -#endif -.align 5 -_aes_hw_ctr32_encrypt_blocks: - mov ip,sp - stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,lr} - vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so - ldr r4, [ip] @ load remaining arg - ldr r5,[r3,#240] - - ldr r8, [r4, #12] - vld1.32 {q0},[r4] - - vld1.32 {q8,q9},[r3] @ load key schedule... - sub r5,r5,#4 - mov r12,#16 - cmp r2,#2 - add r7,r3,r5,lsl#4 @ pointer to last 5 round keys - sub r5,r5,#2 - vld1.32 {q12,q13},[r7]! - vld1.32 {q14,q15},[r7]! - vld1.32 {q7},[r7] - add r7,r3,#32 - mov r6,r5 - movlo r12,#0 - - @ ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are - @ affected by silicon errata #1742098 [0] and #1655431 [1], - @ respectively, where the second instruction of an aese/aesmc - @ instruction pair may execute twice if an interrupt is taken right - @ after the first instruction consumes an input register of which a - @ single 32-bit lane has been updated the last time it was modified. - @ - @ This function uses a counter in one 32-bit lane. The - @ could write to q1 and q10 directly, but that trips this bugs. - @ We write to q6 and copy to the final register as a workaround. 
- @ - @ [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice - @ [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice -#ifndef __ARMEB__ - rev r8, r8 -#endif - add r10, r8, #1 - vorr q6,q0,q0 - rev r10, r10 - vmov.32 d13[1],r10 - add r8, r8, #2 - vorr q1,q6,q6 - bls Lctr32_tail - rev r12, r8 - vmov.32 d13[1],r12 - sub r2,r2,#3 @ bias - vorr q10,q6,q6 - b Loop3x_ctr32 - -.align 4 -Loop3x_ctr32: -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 -.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8 -.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10 - vld1.32 {q8},[r7]! - subs r6,r6,#2 -.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 -.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9 -.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10 - vld1.32 {q9},[r7]! - bgt Loop3x_ctr32 - -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x83,0xb0,0xf3 @ aesmc q4,q0 -.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 -.byte 0x82,0xa3,0xb0,0xf3 @ aesmc q5,q1 - vld1.8 {q2},[r0]! - add r9,r8,#1 -.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8 -.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10 - vld1.8 {q3},[r0]! - rev r9,r9 -.byte 0x22,0x83,0xb0,0xf3 @ aese q4,q9 -.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 -.byte 0x22,0xa3,0xb0,0xf3 @ aese q5,q9 -.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 - vld1.8 {q11},[r0]! 
- mov r7,r3 -.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9 -.byte 0xa4,0x23,0xf0,0xf3 @ aesmc q9,q10 -.byte 0x28,0x83,0xb0,0xf3 @ aese q4,q12 -.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 -.byte 0x28,0xa3,0xb0,0xf3 @ aese q5,q12 -.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 - veor q2,q2,q7 - add r10,r8,#2 -.byte 0x28,0x23,0xf0,0xf3 @ aese q9,q12 -.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9 - veor q3,q3,q7 - add r8,r8,#3 -.byte 0x2a,0x83,0xb0,0xf3 @ aese q4,q13 -.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 -.byte 0x2a,0xa3,0xb0,0xf3 @ aese q5,q13 -.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 - @ Note the logic to update q0, q1, and q1 is written to work - @ around a bug in ARM Cortex-A57 and Cortex-A72 cores running in - @ 32-bit mode. See the comment above. - veor q11,q11,q7 - vmov.32 d13[1], r9 -.byte 0x2a,0x23,0xf0,0xf3 @ aese q9,q13 -.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9 - vorr q0,q6,q6 - rev r10,r10 -.byte 0x2c,0x83,0xb0,0xf3 @ aese q4,q14 -.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 - vmov.32 d13[1], r10 - rev r12,r8 -.byte 0x2c,0xa3,0xb0,0xf3 @ aese q5,q14 -.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 - vorr q1,q6,q6 - vmov.32 d13[1], r12 -.byte 0x2c,0x23,0xf0,0xf3 @ aese q9,q14 -.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9 - vorr q10,q6,q6 - subs r2,r2,#3 -.byte 0x2e,0x83,0xb0,0xf3 @ aese q4,q15 -.byte 0x2e,0xa3,0xb0,0xf3 @ aese q5,q15 -.byte 0x2e,0x23,0xf0,0xf3 @ aese q9,q15 - - veor q2,q2,q4 - vld1.32 {q8},[r7]! @ re-pre-load rndkey[0] - vst1.8 {q2},[r1]! - veor q3,q3,q5 - mov r6,r5 - vst1.8 {q3},[r1]! - veor q11,q11,q9 - vld1.32 {q9},[r7]! @ re-pre-load rndkey[1] - vst1.8 {q11},[r1]! - bhs Loop3x_ctr32 - - adds r2,r2,#3 - beq Lctr32_done - cmp r2,#1 - mov r12,#16 - moveq r12,#0 - -Lctr32_tail: -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 - vld1.32 {q8},[r7]! 
- subs r6,r6,#2 -.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 - vld1.32 {q9},[r7]! - bgt Lctr32_tail - -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 -.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 - vld1.8 {q2},[r0],r12 -.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x28,0x23,0xb0,0xf3 @ aese q1,q12 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 - vld1.8 {q3},[r0] -.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x2a,0x23,0xb0,0xf3 @ aese q1,q13 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 - veor q2,q2,q7 -.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x2c,0x23,0xb0,0xf3 @ aese q1,q14 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 - veor q3,q3,q7 -.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 -.byte 0x2e,0x23,0xb0,0xf3 @ aese q1,q15 - - cmp r2,#1 - veor q2,q2,q0 - veor q3,q3,q1 - vst1.8 {q2},[r1]! - beq Lctr32_done - vst1.8 {q3},[r1] - -Lctr32_done: - vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,pc} - -#endif -#endif // !OPENSSL_NO_ASM diff --git a/third_party/boringssl/apple-arm/crypto/fipsmodule/armv4-mont.S b/third_party/boringssl/apple-arm/crypto/fipsmodule/armv4-mont.S deleted file mode 100644 index e549d1f1..00000000 --- a/third_party/boringssl/apple-arm/crypto/fipsmodule/armv4-mont.S +++ /dev/null @@ -1,982 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -#include - -@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both -@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. - - -.text -#if defined(__thumb2__) -.syntax unified -.thumb -#else -.code 32 -#endif - -#if __ARM_MAX_ARCH__>=7 -.align 5 -LOPENSSL_armcap: -.word OPENSSL_armcap_P-Lbn_mul_mont -#endif - -.globl _bn_mul_mont -.private_extern _bn_mul_mont -#ifdef __thumb2__ -.thumb_func _bn_mul_mont -#endif - -.align 5 -_bn_mul_mont: -Lbn_mul_mont: - ldr ip,[sp,#4] @ load num - stmdb sp!,{r0,r2} @ sp points at argument block -#if __ARM_MAX_ARCH__>=7 - tst ip,#7 - bne Lialu - adr r0,Lbn_mul_mont - ldr r2,LOPENSSL_armcap - ldr r0,[r0,r2] -#ifdef __APPLE__ - ldr r0,[r0] -#endif - tst r0,#ARMV7_NEON @ NEON available? 
- ldmia sp, {r0,r2} - beq Lialu - add sp,sp,#8 - b bn_mul8x_mont_neon -.align 4 -Lialu: -#endif - cmp ip,#2 - mov r0,ip @ load num -#ifdef __thumb2__ - ittt lt -#endif - movlt r0,#0 - addlt sp,sp,#2*4 - blt Labrt - - stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} @ save 10 registers - - mov r0,r0,lsl#2 @ rescale r0 for byte count - sub sp,sp,r0 @ alloca(4*num) - sub sp,sp,#4 @ +extra dword - sub r0,r0,#4 @ "num=num-1" - add r4,r2,r0 @ &bp[num-1] - - add r0,sp,r0 @ r0 to point at &tp[num-1] - ldr r8,[r0,#14*4] @ &n0 - ldr r2,[r2] @ bp[0] - ldr r5,[r1],#4 @ ap[0],ap++ - ldr r6,[r3],#4 @ np[0],np++ - ldr r8,[r8] @ *n0 - str r4,[r0,#15*4] @ save &bp[num] - - umull r10,r11,r5,r2 @ ap[0]*bp[0] - str r8,[r0,#14*4] @ save n0 value - mul r8,r10,r8 @ "tp[0]"*n0 - mov r12,#0 - umlal r10,r12,r6,r8 @ np[0]*n0+"t[0]" - mov r4,sp - -L1st: - ldr r5,[r1],#4 @ ap[j],ap++ - mov r10,r11 - ldr r6,[r3],#4 @ np[j],np++ - mov r11,#0 - umlal r10,r11,r5,r2 @ ap[j]*bp[0] - mov r14,#0 - umlal r12,r14,r6,r8 @ np[j]*n0 - adds r12,r12,r10 - str r12,[r4],#4 @ tp[j-1]=,tp++ - adc r12,r14,#0 - cmp r4,r0 - bne L1st - - adds r12,r12,r11 - ldr r4,[r0,#13*4] @ restore bp - mov r14,#0 - ldr r8,[r0,#14*4] @ restore n0 - adc r14,r14,#0 - str r12,[r0] @ tp[num-1]= - mov r7,sp - str r14,[r0,#4] @ tp[num]= - -Louter: - sub r7,r0,r7 @ "original" r0-1 value - sub r1,r1,r7 @ "rewind" ap to &ap[1] - ldr r2,[r4,#4]! 
@ *(++bp) - sub r3,r3,r7 @ "rewind" np to &np[1] - ldr r5,[r1,#-4] @ ap[0] - ldr r10,[sp] @ tp[0] - ldr r6,[r3,#-4] @ np[0] - ldr r7,[sp,#4] @ tp[1] - - mov r11,#0 - umlal r10,r11,r5,r2 @ ap[0]*bp[i]+tp[0] - str r4,[r0,#13*4] @ save bp - mul r8,r10,r8 - mov r12,#0 - umlal r10,r12,r6,r8 @ np[0]*n0+"tp[0]" - mov r4,sp - -Linner: - ldr r5,[r1],#4 @ ap[j],ap++ - adds r10,r11,r7 @ +=tp[j] - ldr r6,[r3],#4 @ np[j],np++ - mov r11,#0 - umlal r10,r11,r5,r2 @ ap[j]*bp[i] - mov r14,#0 - umlal r12,r14,r6,r8 @ np[j]*n0 - adc r11,r11,#0 - ldr r7,[r4,#8] @ tp[j+1] - adds r12,r12,r10 - str r12,[r4],#4 @ tp[j-1]=,tp++ - adc r12,r14,#0 - cmp r4,r0 - bne Linner - - adds r12,r12,r11 - mov r14,#0 - ldr r4,[r0,#13*4] @ restore bp - adc r14,r14,#0 - ldr r8,[r0,#14*4] @ restore n0 - adds r12,r12,r7 - ldr r7,[r0,#15*4] @ restore &bp[num] - adc r14,r14,#0 - str r12,[r0] @ tp[num-1]= - str r14,[r0,#4] @ tp[num]= - - cmp r4,r7 -#ifdef __thumb2__ - itt ne -#endif - movne r7,sp - bne Louter - - ldr r2,[r0,#12*4] @ pull rp - mov r5,sp - add r0,r0,#4 @ r0 to point at &tp[num] - sub r5,r0,r5 @ "original" num value - mov r4,sp @ "rewind" r4 - mov r1,r4 @ "borrow" r1 - sub r3,r3,r5 @ "rewind" r3 to &np[0] - - subs r7,r7,r7 @ "clear" carry flag -Lsub: ldr r7,[r4],#4 - ldr r6,[r3],#4 - sbcs r7,r7,r6 @ tp[j]-np[j] - str r7,[r2],#4 @ rp[j]= - teq r4,r0 @ preserve carry - bne Lsub - sbcs r14,r14,#0 @ upmost carry - mov r4,sp @ "rewind" r4 - sub r2,r2,r5 @ "rewind" r2 - -Lcopy: ldr r7,[r4] @ conditional copy - ldr r5,[r2] - str sp,[r4],#4 @ zap tp -#ifdef __thumb2__ - it cc -#endif - movcc r5,r7 - str r5,[r2],#4 - teq r4,r0 @ preserve carry - bne Lcopy - - mov sp,r0 - add sp,sp,#4 @ skip over tp[num+1] - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} @ restore registers - add sp,sp,#2*4 @ skip over {r0,r2} - mov r0,#1 -Labrt: -#if __ARM_ARCH__>=5 - bx lr @ bx lr -#else - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet -.word 0xe12fff1e @ interoperable with Thumb ISA:-) -#endif - -#if 
__ARM_MAX_ARCH__>=7 - - - -#ifdef __thumb2__ -.thumb_func bn_mul8x_mont_neon -#endif -.align 5 -bn_mul8x_mont_neon: - mov ip,sp - stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11} - vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so - ldmia ip,{r4,r5} @ load rest of parameter block - mov ip,sp - - cmp r5,#8 - bhi LNEON_8n - - @ special case for r5==8, everything is in register bank... - - vld1.32 {d28[0]}, [r2,:32]! - veor d8,d8,d8 - sub r7,sp,r5,lsl#4 - vld1.32 {d0,d1,d2,d3}, [r1]! @ can't specify :32 :-( - and r7,r7,#-64 - vld1.32 {d30[0]}, [r4,:32] - mov sp,r7 @ alloca - vzip.16 d28,d8 - - vmull.u32 q6,d28,d0[0] - vmull.u32 q7,d28,d0[1] - vmull.u32 q8,d28,d1[0] - vshl.i64 d29,d13,#16 - vmull.u32 q9,d28,d1[1] - - vadd.u64 d29,d29,d12 - veor d8,d8,d8 - vmul.u32 d29,d29,d30 - - vmull.u32 q10,d28,d2[0] - vld1.32 {d4,d5,d6,d7}, [r3]! - vmull.u32 q11,d28,d2[1] - vmull.u32 q12,d28,d3[0] - vzip.16 d29,d8 - vmull.u32 q13,d28,d3[1] - - vmlal.u32 q6,d29,d4[0] - sub r9,r5,#1 - vmlal.u32 q7,d29,d4[1] - vmlal.u32 q8,d29,d5[0] - vmlal.u32 q9,d29,d5[1] - - vmlal.u32 q10,d29,d6[0] - vmov q5,q6 - vmlal.u32 q11,d29,d6[1] - vmov q6,q7 - vmlal.u32 q12,d29,d7[0] - vmov q7,q8 - vmlal.u32 q13,d29,d7[1] - vmov q8,q9 - vmov q9,q10 - vshr.u64 d10,d10,#16 - vmov q10,q11 - vmov q11,q12 - vadd.u64 d10,d10,d11 - vmov q12,q13 - veor q13,q13 - vshr.u64 d10,d10,#16 - - b LNEON_outer8 - -.align 4 -LNEON_outer8: - vld1.32 {d28[0]}, [r2,:32]! 
- veor d8,d8,d8 - vzip.16 d28,d8 - vadd.u64 d12,d12,d10 - - vmlal.u32 q6,d28,d0[0] - vmlal.u32 q7,d28,d0[1] - vmlal.u32 q8,d28,d1[0] - vshl.i64 d29,d13,#16 - vmlal.u32 q9,d28,d1[1] - - vadd.u64 d29,d29,d12 - veor d8,d8,d8 - subs r9,r9,#1 - vmul.u32 d29,d29,d30 - - vmlal.u32 q10,d28,d2[0] - vmlal.u32 q11,d28,d2[1] - vmlal.u32 q12,d28,d3[0] - vzip.16 d29,d8 - vmlal.u32 q13,d28,d3[1] - - vmlal.u32 q6,d29,d4[0] - vmlal.u32 q7,d29,d4[1] - vmlal.u32 q8,d29,d5[0] - vmlal.u32 q9,d29,d5[1] - - vmlal.u32 q10,d29,d6[0] - vmov q5,q6 - vmlal.u32 q11,d29,d6[1] - vmov q6,q7 - vmlal.u32 q12,d29,d7[0] - vmov q7,q8 - vmlal.u32 q13,d29,d7[1] - vmov q8,q9 - vmov q9,q10 - vshr.u64 d10,d10,#16 - vmov q10,q11 - vmov q11,q12 - vadd.u64 d10,d10,d11 - vmov q12,q13 - veor q13,q13 - vshr.u64 d10,d10,#16 - - bne LNEON_outer8 - - vadd.u64 d12,d12,d10 - mov r7,sp - vshr.u64 d10,d12,#16 - mov r8,r5 - vadd.u64 d13,d13,d10 - add r6,sp,#96 - vshr.u64 d10,d13,#16 - vzip.16 d12,d13 - - b LNEON_tail_entry - -.align 4 -LNEON_8n: - veor q6,q6,q6 - sub r7,sp,#128 - veor q7,q7,q7 - sub r7,r7,r5,lsl#4 - veor q8,q8,q8 - and r7,r7,#-64 - veor q9,q9,q9 - mov sp,r7 @ alloca - veor q10,q10,q10 - add r7,r7,#256 - veor q11,q11,q11 - sub r8,r5,#8 - veor q12,q12,q12 - veor q13,q13,q13 - -LNEON_8n_init: - vst1.64 {q6,q7},[r7,:256]! - subs r8,r8,#8 - vst1.64 {q8,q9},[r7,:256]! - vst1.64 {q10,q11},[r7,:256]! - vst1.64 {q12,q13},[r7,:256]! - bne LNEON_8n_init - - add r6,sp,#256 - vld1.32 {d0,d1,d2,d3},[r1]! - add r10,sp,#8 - vld1.32 {d30[0]},[r4,:32] - mov r9,r5 - b LNEON_8n_outer - -.align 4 -LNEON_8n_outer: - vld1.32 {d28[0]},[r2,:32]! @ *b++ - veor d8,d8,d8 - vzip.16 d28,d8 - add r7,sp,#128 - vld1.32 {d4,d5,d6,d7},[r3]! 
- - vmlal.u32 q6,d28,d0[0] - vmlal.u32 q7,d28,d0[1] - veor d8,d8,d8 - vmlal.u32 q8,d28,d1[0] - vshl.i64 d29,d13,#16 - vmlal.u32 q9,d28,d1[1] - vadd.u64 d29,d29,d12 - vmlal.u32 q10,d28,d2[0] - vmul.u32 d29,d29,d30 - vmlal.u32 q11,d28,d2[1] - vst1.32 {d28},[sp,:64] @ put aside smashed b[8*i+0] - vmlal.u32 q12,d28,d3[0] - vzip.16 d29,d8 - vmlal.u32 q13,d28,d3[1] - vld1.32 {d28[0]},[r2,:32]! @ *b++ - vmlal.u32 q6,d29,d4[0] - veor d10,d10,d10 - vmlal.u32 q7,d29,d4[1] - vzip.16 d28,d10 - vmlal.u32 q8,d29,d5[0] - vshr.u64 d12,d12,#16 - vmlal.u32 q9,d29,d5[1] - vmlal.u32 q10,d29,d6[0] - vadd.u64 d12,d12,d13 - vmlal.u32 q11,d29,d6[1] - vshr.u64 d12,d12,#16 - vmlal.u32 q12,d29,d7[0] - vmlal.u32 q13,d29,d7[1] - vadd.u64 d14,d14,d12 - vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+0] - vmlal.u32 q7,d28,d0[0] - vld1.64 {q6},[r6,:128]! - vmlal.u32 q8,d28,d0[1] - veor d8,d8,d8 - vmlal.u32 q9,d28,d1[0] - vshl.i64 d29,d15,#16 - vmlal.u32 q10,d28,d1[1] - vadd.u64 d29,d29,d14 - vmlal.u32 q11,d28,d2[0] - vmul.u32 d29,d29,d30 - vmlal.u32 q12,d28,d2[1] - vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+1] - vmlal.u32 q13,d28,d3[0] - vzip.16 d29,d8 - vmlal.u32 q6,d28,d3[1] - vld1.32 {d28[0]},[r2,:32]! @ *b++ - vmlal.u32 q7,d29,d4[0] - veor d10,d10,d10 - vmlal.u32 q8,d29,d4[1] - vzip.16 d28,d10 - vmlal.u32 q9,d29,d5[0] - vshr.u64 d14,d14,#16 - vmlal.u32 q10,d29,d5[1] - vmlal.u32 q11,d29,d6[0] - vadd.u64 d14,d14,d15 - vmlal.u32 q12,d29,d6[1] - vshr.u64 d14,d14,#16 - vmlal.u32 q13,d29,d7[0] - vmlal.u32 q6,d29,d7[1] - vadd.u64 d16,d16,d14 - vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+1] - vmlal.u32 q8,d28,d0[0] - vld1.64 {q7},[r6,:128]! - vmlal.u32 q9,d28,d0[1] - veor d8,d8,d8 - vmlal.u32 q10,d28,d1[0] - vshl.i64 d29,d17,#16 - vmlal.u32 q11,d28,d1[1] - vadd.u64 d29,d29,d16 - vmlal.u32 q12,d28,d2[0] - vmul.u32 d29,d29,d30 - vmlal.u32 q13,d28,d2[1] - vst1.32 {d28},[r10,:64]! 
@ put aside smashed b[8*i+2] - vmlal.u32 q6,d28,d3[0] - vzip.16 d29,d8 - vmlal.u32 q7,d28,d3[1] - vld1.32 {d28[0]},[r2,:32]! @ *b++ - vmlal.u32 q8,d29,d4[0] - veor d10,d10,d10 - vmlal.u32 q9,d29,d4[1] - vzip.16 d28,d10 - vmlal.u32 q10,d29,d5[0] - vshr.u64 d16,d16,#16 - vmlal.u32 q11,d29,d5[1] - vmlal.u32 q12,d29,d6[0] - vadd.u64 d16,d16,d17 - vmlal.u32 q13,d29,d6[1] - vshr.u64 d16,d16,#16 - vmlal.u32 q6,d29,d7[0] - vmlal.u32 q7,d29,d7[1] - vadd.u64 d18,d18,d16 - vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+2] - vmlal.u32 q9,d28,d0[0] - vld1.64 {q8},[r6,:128]! - vmlal.u32 q10,d28,d0[1] - veor d8,d8,d8 - vmlal.u32 q11,d28,d1[0] - vshl.i64 d29,d19,#16 - vmlal.u32 q12,d28,d1[1] - vadd.u64 d29,d29,d18 - vmlal.u32 q13,d28,d2[0] - vmul.u32 d29,d29,d30 - vmlal.u32 q6,d28,d2[1] - vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+3] - vmlal.u32 q7,d28,d3[0] - vzip.16 d29,d8 - vmlal.u32 q8,d28,d3[1] - vld1.32 {d28[0]},[r2,:32]! @ *b++ - vmlal.u32 q9,d29,d4[0] - veor d10,d10,d10 - vmlal.u32 q10,d29,d4[1] - vzip.16 d28,d10 - vmlal.u32 q11,d29,d5[0] - vshr.u64 d18,d18,#16 - vmlal.u32 q12,d29,d5[1] - vmlal.u32 q13,d29,d6[0] - vadd.u64 d18,d18,d19 - vmlal.u32 q6,d29,d6[1] - vshr.u64 d18,d18,#16 - vmlal.u32 q7,d29,d7[0] - vmlal.u32 q8,d29,d7[1] - vadd.u64 d20,d20,d18 - vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+3] - vmlal.u32 q10,d28,d0[0] - vld1.64 {q9},[r6,:128]! - vmlal.u32 q11,d28,d0[1] - veor d8,d8,d8 - vmlal.u32 q12,d28,d1[0] - vshl.i64 d29,d21,#16 - vmlal.u32 q13,d28,d1[1] - vadd.u64 d29,d29,d20 - vmlal.u32 q6,d28,d2[0] - vmul.u32 d29,d29,d30 - vmlal.u32 q7,d28,d2[1] - vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+4] - vmlal.u32 q8,d28,d3[0] - vzip.16 d29,d8 - vmlal.u32 q9,d28,d3[1] - vld1.32 {d28[0]},[r2,:32]! 
@ *b++ - vmlal.u32 q10,d29,d4[0] - veor d10,d10,d10 - vmlal.u32 q11,d29,d4[1] - vzip.16 d28,d10 - vmlal.u32 q12,d29,d5[0] - vshr.u64 d20,d20,#16 - vmlal.u32 q13,d29,d5[1] - vmlal.u32 q6,d29,d6[0] - vadd.u64 d20,d20,d21 - vmlal.u32 q7,d29,d6[1] - vshr.u64 d20,d20,#16 - vmlal.u32 q8,d29,d7[0] - vmlal.u32 q9,d29,d7[1] - vadd.u64 d22,d22,d20 - vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+4] - vmlal.u32 q11,d28,d0[0] - vld1.64 {q10},[r6,:128]! - vmlal.u32 q12,d28,d0[1] - veor d8,d8,d8 - vmlal.u32 q13,d28,d1[0] - vshl.i64 d29,d23,#16 - vmlal.u32 q6,d28,d1[1] - vadd.u64 d29,d29,d22 - vmlal.u32 q7,d28,d2[0] - vmul.u32 d29,d29,d30 - vmlal.u32 q8,d28,d2[1] - vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+5] - vmlal.u32 q9,d28,d3[0] - vzip.16 d29,d8 - vmlal.u32 q10,d28,d3[1] - vld1.32 {d28[0]},[r2,:32]! @ *b++ - vmlal.u32 q11,d29,d4[0] - veor d10,d10,d10 - vmlal.u32 q12,d29,d4[1] - vzip.16 d28,d10 - vmlal.u32 q13,d29,d5[0] - vshr.u64 d22,d22,#16 - vmlal.u32 q6,d29,d5[1] - vmlal.u32 q7,d29,d6[0] - vadd.u64 d22,d22,d23 - vmlal.u32 q8,d29,d6[1] - vshr.u64 d22,d22,#16 - vmlal.u32 q9,d29,d7[0] - vmlal.u32 q10,d29,d7[1] - vadd.u64 d24,d24,d22 - vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+5] - vmlal.u32 q12,d28,d0[0] - vld1.64 {q11},[r6,:128]! - vmlal.u32 q13,d28,d0[1] - veor d8,d8,d8 - vmlal.u32 q6,d28,d1[0] - vshl.i64 d29,d25,#16 - vmlal.u32 q7,d28,d1[1] - vadd.u64 d29,d29,d24 - vmlal.u32 q8,d28,d2[0] - vmul.u32 d29,d29,d30 - vmlal.u32 q9,d28,d2[1] - vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+6] - vmlal.u32 q10,d28,d3[0] - vzip.16 d29,d8 - vmlal.u32 q11,d28,d3[1] - vld1.32 {d28[0]},[r2,:32]! @ *b++ - vmlal.u32 q12,d29,d4[0] - veor d10,d10,d10 - vmlal.u32 q13,d29,d4[1] - vzip.16 d28,d10 - vmlal.u32 q6,d29,d5[0] - vshr.u64 d24,d24,#16 - vmlal.u32 q7,d29,d5[1] - vmlal.u32 q8,d29,d6[0] - vadd.u64 d24,d24,d25 - vmlal.u32 q9,d29,d6[1] - vshr.u64 d24,d24,#16 - vmlal.u32 q10,d29,d7[0] - vmlal.u32 q11,d29,d7[1] - vadd.u64 d26,d26,d24 - vst1.32 {d29},[r10,:64]! 
@ put aside smashed m[8*i+6] - vmlal.u32 q13,d28,d0[0] - vld1.64 {q12},[r6,:128]! - vmlal.u32 q6,d28,d0[1] - veor d8,d8,d8 - vmlal.u32 q7,d28,d1[0] - vshl.i64 d29,d27,#16 - vmlal.u32 q8,d28,d1[1] - vadd.u64 d29,d29,d26 - vmlal.u32 q9,d28,d2[0] - vmul.u32 d29,d29,d30 - vmlal.u32 q10,d28,d2[1] - vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+7] - vmlal.u32 q11,d28,d3[0] - vzip.16 d29,d8 - vmlal.u32 q12,d28,d3[1] - vld1.32 {d28},[sp,:64] @ pull smashed b[8*i+0] - vmlal.u32 q13,d29,d4[0] - vld1.32 {d0,d1,d2,d3},[r1]! - vmlal.u32 q6,d29,d4[1] - vmlal.u32 q7,d29,d5[0] - vshr.u64 d26,d26,#16 - vmlal.u32 q8,d29,d5[1] - vmlal.u32 q9,d29,d6[0] - vadd.u64 d26,d26,d27 - vmlal.u32 q10,d29,d6[1] - vshr.u64 d26,d26,#16 - vmlal.u32 q11,d29,d7[0] - vmlal.u32 q12,d29,d7[1] - vadd.u64 d12,d12,d26 - vst1.32 {d29},[r10,:64] @ put aside smashed m[8*i+7] - add r10,sp,#8 @ rewind - sub r8,r5,#8 - b LNEON_8n_inner - -.align 4 -LNEON_8n_inner: - subs r8,r8,#8 - vmlal.u32 q6,d28,d0[0] - vld1.64 {q13},[r6,:128] - vmlal.u32 q7,d28,d0[1] - vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+0] - vmlal.u32 q8,d28,d1[0] - vld1.32 {d4,d5,d6,d7},[r3]! - vmlal.u32 q9,d28,d1[1] - it ne - addne r6,r6,#16 @ don't advance in last iteration - vmlal.u32 q10,d28,d2[0] - vmlal.u32 q11,d28,d2[1] - vmlal.u32 q12,d28,d3[0] - vmlal.u32 q13,d28,d3[1] - vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+1] - vmlal.u32 q6,d29,d4[0] - vmlal.u32 q7,d29,d4[1] - vmlal.u32 q8,d29,d5[0] - vmlal.u32 q9,d29,d5[1] - vmlal.u32 q10,d29,d6[0] - vmlal.u32 q11,d29,d6[1] - vmlal.u32 q12,d29,d7[0] - vmlal.u32 q13,d29,d7[1] - vst1.64 {q6},[r7,:128]! - vmlal.u32 q7,d28,d0[0] - vld1.64 {q6},[r6,:128] - vmlal.u32 q8,d28,d0[1] - vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+1] - vmlal.u32 q9,d28,d1[0] - it ne - addne r6,r6,#16 @ don't advance in last iteration - vmlal.u32 q10,d28,d1[1] - vmlal.u32 q11,d28,d2[0] - vmlal.u32 q12,d28,d2[1] - vmlal.u32 q13,d28,d3[0] - vmlal.u32 q6,d28,d3[1] - vld1.32 {d28},[r10,:64]! 
@ pull smashed b[8*i+2] - vmlal.u32 q7,d29,d4[0] - vmlal.u32 q8,d29,d4[1] - vmlal.u32 q9,d29,d5[0] - vmlal.u32 q10,d29,d5[1] - vmlal.u32 q11,d29,d6[0] - vmlal.u32 q12,d29,d6[1] - vmlal.u32 q13,d29,d7[0] - vmlal.u32 q6,d29,d7[1] - vst1.64 {q7},[r7,:128]! - vmlal.u32 q8,d28,d0[0] - vld1.64 {q7},[r6,:128] - vmlal.u32 q9,d28,d0[1] - vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+2] - vmlal.u32 q10,d28,d1[0] - it ne - addne r6,r6,#16 @ don't advance in last iteration - vmlal.u32 q11,d28,d1[1] - vmlal.u32 q12,d28,d2[0] - vmlal.u32 q13,d28,d2[1] - vmlal.u32 q6,d28,d3[0] - vmlal.u32 q7,d28,d3[1] - vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+3] - vmlal.u32 q8,d29,d4[0] - vmlal.u32 q9,d29,d4[1] - vmlal.u32 q10,d29,d5[0] - vmlal.u32 q11,d29,d5[1] - vmlal.u32 q12,d29,d6[0] - vmlal.u32 q13,d29,d6[1] - vmlal.u32 q6,d29,d7[0] - vmlal.u32 q7,d29,d7[1] - vst1.64 {q8},[r7,:128]! - vmlal.u32 q9,d28,d0[0] - vld1.64 {q8},[r6,:128] - vmlal.u32 q10,d28,d0[1] - vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+3] - vmlal.u32 q11,d28,d1[0] - it ne - addne r6,r6,#16 @ don't advance in last iteration - vmlal.u32 q12,d28,d1[1] - vmlal.u32 q13,d28,d2[0] - vmlal.u32 q6,d28,d2[1] - vmlal.u32 q7,d28,d3[0] - vmlal.u32 q8,d28,d3[1] - vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+4] - vmlal.u32 q9,d29,d4[0] - vmlal.u32 q10,d29,d4[1] - vmlal.u32 q11,d29,d5[0] - vmlal.u32 q12,d29,d5[1] - vmlal.u32 q13,d29,d6[0] - vmlal.u32 q6,d29,d6[1] - vmlal.u32 q7,d29,d7[0] - vmlal.u32 q8,d29,d7[1] - vst1.64 {q9},[r7,:128]! - vmlal.u32 q10,d28,d0[0] - vld1.64 {q9},[r6,:128] - vmlal.u32 q11,d28,d0[1] - vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+4] - vmlal.u32 q12,d28,d1[0] - it ne - addne r6,r6,#16 @ don't advance in last iteration - vmlal.u32 q13,d28,d1[1] - vmlal.u32 q6,d28,d2[0] - vmlal.u32 q7,d28,d2[1] - vmlal.u32 q8,d28,d3[0] - vmlal.u32 q9,d28,d3[1] - vld1.32 {d28},[r10,:64]! 
@ pull smashed b[8*i+5] - vmlal.u32 q10,d29,d4[0] - vmlal.u32 q11,d29,d4[1] - vmlal.u32 q12,d29,d5[0] - vmlal.u32 q13,d29,d5[1] - vmlal.u32 q6,d29,d6[0] - vmlal.u32 q7,d29,d6[1] - vmlal.u32 q8,d29,d7[0] - vmlal.u32 q9,d29,d7[1] - vst1.64 {q10},[r7,:128]! - vmlal.u32 q11,d28,d0[0] - vld1.64 {q10},[r6,:128] - vmlal.u32 q12,d28,d0[1] - vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+5] - vmlal.u32 q13,d28,d1[0] - it ne - addne r6,r6,#16 @ don't advance in last iteration - vmlal.u32 q6,d28,d1[1] - vmlal.u32 q7,d28,d2[0] - vmlal.u32 q8,d28,d2[1] - vmlal.u32 q9,d28,d3[0] - vmlal.u32 q10,d28,d3[1] - vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+6] - vmlal.u32 q11,d29,d4[0] - vmlal.u32 q12,d29,d4[1] - vmlal.u32 q13,d29,d5[0] - vmlal.u32 q6,d29,d5[1] - vmlal.u32 q7,d29,d6[0] - vmlal.u32 q8,d29,d6[1] - vmlal.u32 q9,d29,d7[0] - vmlal.u32 q10,d29,d7[1] - vst1.64 {q11},[r7,:128]! - vmlal.u32 q12,d28,d0[0] - vld1.64 {q11},[r6,:128] - vmlal.u32 q13,d28,d0[1] - vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+6] - vmlal.u32 q6,d28,d1[0] - it ne - addne r6,r6,#16 @ don't advance in last iteration - vmlal.u32 q7,d28,d1[1] - vmlal.u32 q8,d28,d2[0] - vmlal.u32 q9,d28,d2[1] - vmlal.u32 q10,d28,d3[0] - vmlal.u32 q11,d28,d3[1] - vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+7] - vmlal.u32 q12,d29,d4[0] - vmlal.u32 q13,d29,d4[1] - vmlal.u32 q6,d29,d5[0] - vmlal.u32 q7,d29,d5[1] - vmlal.u32 q8,d29,d6[0] - vmlal.u32 q9,d29,d6[1] - vmlal.u32 q10,d29,d7[0] - vmlal.u32 q11,d29,d7[1] - vst1.64 {q12},[r7,:128]! - vmlal.u32 q13,d28,d0[0] - vld1.64 {q12},[r6,:128] - vmlal.u32 q6,d28,d0[1] - vld1.32 {d29},[r10,:64]! 
@ pull smashed m[8*i+7] - vmlal.u32 q7,d28,d1[0] - it ne - addne r6,r6,#16 @ don't advance in last iteration - vmlal.u32 q8,d28,d1[1] - vmlal.u32 q9,d28,d2[0] - vmlal.u32 q10,d28,d2[1] - vmlal.u32 q11,d28,d3[0] - vmlal.u32 q12,d28,d3[1] - it eq - subeq r1,r1,r5,lsl#2 @ rewind - vmlal.u32 q13,d29,d4[0] - vld1.32 {d28},[sp,:64] @ pull smashed b[8*i+0] - vmlal.u32 q6,d29,d4[1] - vld1.32 {d0,d1,d2,d3},[r1]! - vmlal.u32 q7,d29,d5[0] - add r10,sp,#8 @ rewind - vmlal.u32 q8,d29,d5[1] - vmlal.u32 q9,d29,d6[0] - vmlal.u32 q10,d29,d6[1] - vmlal.u32 q11,d29,d7[0] - vst1.64 {q13},[r7,:128]! - vmlal.u32 q12,d29,d7[1] - - bne LNEON_8n_inner - add r6,sp,#128 - vst1.64 {q6,q7},[r7,:256]! - veor q2,q2,q2 @ d4-d5 - vst1.64 {q8,q9},[r7,:256]! - veor q3,q3,q3 @ d6-d7 - vst1.64 {q10,q11},[r7,:256]! - vst1.64 {q12},[r7,:128] - - subs r9,r9,#8 - vld1.64 {q6,q7},[r6,:256]! - vld1.64 {q8,q9},[r6,:256]! - vld1.64 {q10,q11},[r6,:256]! - vld1.64 {q12,q13},[r6,:256]! - - itt ne - subne r3,r3,r5,lsl#2 @ rewind - bne LNEON_8n_outer - - add r7,sp,#128 - vst1.64 {q2,q3}, [sp,:256]! @ start wiping stack frame - vshr.u64 d10,d12,#16 - vst1.64 {q2,q3},[sp,:256]! - vadd.u64 d13,d13,d10 - vst1.64 {q2,q3}, [sp,:256]! - vshr.u64 d10,d13,#16 - vst1.64 {q2,q3}, [sp,:256]! - vzip.16 d12,d13 - - mov r8,r5 - b LNEON_tail_entry - -.align 4 -LNEON_tail: - vadd.u64 d12,d12,d10 - vshr.u64 d10,d12,#16 - vld1.64 {q8,q9}, [r6, :256]! - vadd.u64 d13,d13,d10 - vld1.64 {q10,q11}, [r6, :256]! - vshr.u64 d10,d13,#16 - vld1.64 {q12,q13}, [r6, :256]! - vzip.16 d12,d13 - -LNEON_tail_entry: - vadd.u64 d14,d14,d10 - vst1.32 {d12[0]}, [r7, :32]! - vshr.u64 d10,d14,#16 - vadd.u64 d15,d15,d10 - vshr.u64 d10,d15,#16 - vzip.16 d14,d15 - vadd.u64 d16,d16,d10 - vst1.32 {d14[0]}, [r7, :32]! - vshr.u64 d10,d16,#16 - vadd.u64 d17,d17,d10 - vshr.u64 d10,d17,#16 - vzip.16 d16,d17 - vadd.u64 d18,d18,d10 - vst1.32 {d16[0]}, [r7, :32]! 
- vshr.u64 d10,d18,#16 - vadd.u64 d19,d19,d10 - vshr.u64 d10,d19,#16 - vzip.16 d18,d19 - vadd.u64 d20,d20,d10 - vst1.32 {d18[0]}, [r7, :32]! - vshr.u64 d10,d20,#16 - vadd.u64 d21,d21,d10 - vshr.u64 d10,d21,#16 - vzip.16 d20,d21 - vadd.u64 d22,d22,d10 - vst1.32 {d20[0]}, [r7, :32]! - vshr.u64 d10,d22,#16 - vadd.u64 d23,d23,d10 - vshr.u64 d10,d23,#16 - vzip.16 d22,d23 - vadd.u64 d24,d24,d10 - vst1.32 {d22[0]}, [r7, :32]! - vshr.u64 d10,d24,#16 - vadd.u64 d25,d25,d10 - vshr.u64 d10,d25,#16 - vzip.16 d24,d25 - vadd.u64 d26,d26,d10 - vst1.32 {d24[0]}, [r7, :32]! - vshr.u64 d10,d26,#16 - vadd.u64 d27,d27,d10 - vshr.u64 d10,d27,#16 - vzip.16 d26,d27 - vld1.64 {q6,q7}, [r6, :256]! - subs r8,r8,#8 - vst1.32 {d26[0]}, [r7, :32]! - bne LNEON_tail - - vst1.32 {d10[0]}, [r7, :32] @ top-most bit - sub r3,r3,r5,lsl#2 @ rewind r3 - subs r1,sp,#0 @ clear carry flag - add r2,sp,r5,lsl#2 - -LNEON_sub: - ldmia r1!, {r4,r5,r6,r7} - ldmia r3!, {r8,r9,r10,r11} - sbcs r8, r4,r8 - sbcs r9, r5,r9 - sbcs r10,r6,r10 - sbcs r11,r7,r11 - teq r1,r2 @ preserves carry - stmia r0!, {r8,r9,r10,r11} - bne LNEON_sub - - ldr r10, [r1] @ load top-most bit - mov r11,sp - veor q0,q0,q0 - sub r11,r2,r11 @ this is num*4 - veor q1,q1,q1 - mov r1,sp - sub r0,r0,r11 @ rewind r0 - mov r3,r2 @ second 3/4th of frame - sbcs r10,r10,#0 @ result is carry flag - -LNEON_copy_n_zap: - ldmia r1!, {r4,r5,r6,r7} - ldmia r0, {r8,r9,r10,r11} - it cc - movcc r8, r4 - vst1.64 {q0,q1}, [r3,:256]! @ wipe - itt cc - movcc r9, r5 - movcc r10,r6 - vst1.64 {q0,q1}, [r3,:256]! @ wipe - it cc - movcc r11,r7 - ldmia r1, {r4,r5,r6,r7} - stmia r0!, {r8,r9,r10,r11} - sub r1,r1,#16 - ldmia r0, {r8,r9,r10,r11} - it cc - movcc r8, r4 - vst1.64 {q0,q1}, [r1,:256]! @ wipe - itt cc - movcc r9, r5 - movcc r10,r6 - vst1.64 {q0,q1}, [r3,:256]! 
@ wipe - it cc - movcc r11,r7 - teq r1,r2 @ preserves carry - stmia r0!, {r8,r9,r10,r11} - bne LNEON_copy_n_zap - - mov sp,ip - vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11} - bx lr @ bx lr - -#endif -.byte 77,111,110,116,103,111,109,101,114,121,32,109,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 -.align 2 -#if __ARM_MAX_ARCH__>=7 -.comm _OPENSSL_armcap_P,4 -.non_lazy_symbol_pointer -OPENSSL_armcap_P: -.indirect_symbol _OPENSSL_armcap_P -.long 0 -.private_extern _OPENSSL_armcap_P -#endif -#endif // !OPENSSL_NO_ASM diff --git a/third_party/boringssl/apple-arm/crypto/fipsmodule/bsaes-armv7.S b/third_party/boringssl/apple-arm/crypto/fipsmodule/bsaes-armv7.S deleted file mode 100644 index 8329a8c2..00000000 --- a/third_party/boringssl/apple-arm/crypto/fipsmodule/bsaes-armv7.S +++ /dev/null @@ -1,1536 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -@ Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved. -@ -@ Licensed under the OpenSSL license (the "License"). You may not use -@ this file except in compliance with the License. You can obtain a copy -@ in the file LICENSE in the source distribution or at -@ https://www.openssl.org/source/license.html - - -@ ==================================================================== -@ Written by Andy Polyakov for the OpenSSL -@ project. The module is, however, dual licensed under OpenSSL and -@ CRYPTOGAMS licenses depending on where you obtain it. 
For further -@ details see http://www.openssl.org/~appro/cryptogams/. -@ -@ Specific modes and adaptation for Linux kernel by Ard Biesheuvel -@ of Linaro. Permission to use under GPL terms is granted. -@ ==================================================================== - -@ Bit-sliced AES for ARM NEON -@ -@ February 2012. -@ -@ This implementation is direct adaptation of bsaes-x86_64 module for -@ ARM NEON. Except that this module is endian-neutral [in sense that -@ it can be compiled for either endianness] by courtesy of vld1.8's -@ neutrality. Initial version doesn't implement interface to OpenSSL, -@ only low-level primitives and unsupported entry points, just enough -@ to collect performance results, which for Cortex-A8 core are: -@ -@ encrypt 19.5 cycles per byte processed with 128-bit key -@ decrypt 22.1 cycles per byte processed with 128-bit key -@ key conv. 440 cycles per 128-bit key/0.18 of 8x block -@ -@ Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7, -@ which is [much] worse than anticipated (for further details see -@ http://www.openssl.org/~appro/Snapdragon-S4.html). -@ -@ Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code -@ manages in 20.0 cycles]. -@ -@ When comparing to x86_64 results keep in mind that NEON unit is -@ [mostly] single-issue and thus can't [fully] benefit from -@ instruction-level parallelism. And when comparing to aes-armv4 -@ results keep in mind key schedule conversion overhead (see -@ bsaes-x86_64.pl for further details)... -@ -@ - -@ April-August 2013 -@ Add CBC, CTR and XTS subroutines and adapt for kernel use; courtesy of Ard. 
- -#ifndef __KERNEL__ -# include - -# define VFP_ABI_PUSH vstmdb sp!,{d8-d15} -# define VFP_ABI_POP vldmia sp!,{d8-d15} -# define VFP_ABI_FRAME 0x40 -#else -# define VFP_ABI_PUSH -# define VFP_ABI_POP -# define VFP_ABI_FRAME 0 -# define BSAES_ASM_EXTENDED_KEY -# define XTS_CHAIN_TWEAK -# define __ARM_ARCH__ __LINUX_ARM_ARCH__ -# define __ARM_MAX_ARCH__ 7 -#endif - -#ifdef __thumb__ -# define adrl adr -#endif - -#if __ARM_MAX_ARCH__>=7 - - - -.text -.syntax unified @ ARMv7-capable assembler is expected to handle this -#if defined(__thumb2__) && !defined(__APPLE__) -.thumb -#else -.code 32 -# undef __thumb2__ -#endif - -#ifdef __thumb2__ -.thumb_func _bsaes_decrypt8 -#endif -.align 4 -_bsaes_decrypt8: - adr r6,. - vldmia r4!, {q9} @ round 0 key -#if defined(__thumb2__) || defined(__APPLE__) - adr r6,LM0ISR -#else - add r6,r6,#LM0ISR-_bsaes_decrypt8 -#endif - - vldmia r6!, {q8} @ LM0ISR - veor q10, q0, q9 @ xor with round0 key - veor q11, q1, q9 - vtbl.8 d0, {q10}, d16 - vtbl.8 d1, {q10}, d17 - veor q12, q2, q9 - vtbl.8 d2, {q11}, d16 - vtbl.8 d3, {q11}, d17 - veor q13, q3, q9 - vtbl.8 d4, {q12}, d16 - vtbl.8 d5, {q12}, d17 - veor q14, q4, q9 - vtbl.8 d6, {q13}, d16 - vtbl.8 d7, {q13}, d17 - veor q15, q5, q9 - vtbl.8 d8, {q14}, d16 - vtbl.8 d9, {q14}, d17 - veor q10, q6, q9 - vtbl.8 d10, {q15}, d16 - vtbl.8 d11, {q15}, d17 - veor q11, q7, q9 - vtbl.8 d12, {q10}, d16 - vtbl.8 d13, {q10}, d17 - vtbl.8 d14, {q11}, d16 - vtbl.8 d15, {q11}, d17 - vmov.i8 q8,#0x55 @ compose LBS0 - vmov.i8 q9,#0x33 @ compose LBS1 - vshr.u64 q10, q6, #1 - vshr.u64 q11, q4, #1 - veor q10, q10, q7 - veor q11, q11, q5 - vand q10, q10, q8 - vand q11, q11, q8 - veor q7, q7, q10 - vshl.u64 q10, q10, #1 - veor q5, q5, q11 - vshl.u64 q11, q11, #1 - veor q6, q6, q10 - veor q4, q4, q11 - vshr.u64 q10, q2, #1 - vshr.u64 q11, q0, #1 - veor q10, q10, q3 - veor q11, q11, q1 - vand q10, q10, q8 - vand q11, q11, q8 - veor q3, q3, q10 - vshl.u64 q10, q10, #1 - veor q1, q1, q11 - vshl.u64 q11, q11, #1 - veor 
q2, q2, q10 - veor q0, q0, q11 - vmov.i8 q8,#0x0f @ compose LBS2 - vshr.u64 q10, q5, #2 - vshr.u64 q11, q4, #2 - veor q10, q10, q7 - veor q11, q11, q6 - vand q10, q10, q9 - vand q11, q11, q9 - veor q7, q7, q10 - vshl.u64 q10, q10, #2 - veor q6, q6, q11 - vshl.u64 q11, q11, #2 - veor q5, q5, q10 - veor q4, q4, q11 - vshr.u64 q10, q1, #2 - vshr.u64 q11, q0, #2 - veor q10, q10, q3 - veor q11, q11, q2 - vand q10, q10, q9 - vand q11, q11, q9 - veor q3, q3, q10 - vshl.u64 q10, q10, #2 - veor q2, q2, q11 - vshl.u64 q11, q11, #2 - veor q1, q1, q10 - veor q0, q0, q11 - vshr.u64 q10, q3, #4 - vshr.u64 q11, q2, #4 - veor q10, q10, q7 - veor q11, q11, q6 - vand q10, q10, q8 - vand q11, q11, q8 - veor q7, q7, q10 - vshl.u64 q10, q10, #4 - veor q6, q6, q11 - vshl.u64 q11, q11, #4 - veor q3, q3, q10 - veor q2, q2, q11 - vshr.u64 q10, q1, #4 - vshr.u64 q11, q0, #4 - veor q10, q10, q5 - veor q11, q11, q4 - vand q10, q10, q8 - vand q11, q11, q8 - veor q5, q5, q10 - vshl.u64 q10, q10, #4 - veor q4, q4, q11 - vshl.u64 q11, q11, #4 - veor q1, q1, q10 - veor q0, q0, q11 - sub r5,r5,#1 - b Ldec_sbox -.align 4 -Ldec_loop: - vldmia r4!, {q8,q9,q10,q11} - veor q8, q8, q0 - veor q9, q9, q1 - vtbl.8 d0, {q8}, d24 - vtbl.8 d1, {q8}, d25 - vldmia r4!, {q8} - veor q10, q10, q2 - vtbl.8 d2, {q9}, d24 - vtbl.8 d3, {q9}, d25 - vldmia r4!, {q9} - veor q11, q11, q3 - vtbl.8 d4, {q10}, d24 - vtbl.8 d5, {q10}, d25 - vldmia r4!, {q10} - vtbl.8 d6, {q11}, d24 - vtbl.8 d7, {q11}, d25 - vldmia r4!, {q11} - veor q8, q8, q4 - veor q9, q9, q5 - vtbl.8 d8, {q8}, d24 - vtbl.8 d9, {q8}, d25 - veor q10, q10, q6 - vtbl.8 d10, {q9}, d24 - vtbl.8 d11, {q9}, d25 - veor q11, q11, q7 - vtbl.8 d12, {q10}, d24 - vtbl.8 d13, {q10}, d25 - vtbl.8 d14, {q11}, d24 - vtbl.8 d15, {q11}, d25 -Ldec_sbox: - veor q1, q1, q4 - veor q3, q3, q4 - - veor q4, q4, q7 - veor q1, q1, q6 - veor q2, q2, q7 - veor q6, q6, q4 - - veor q0, q0, q1 - veor q2, q2, q5 - veor q7, q7, q6 - veor q3, q3, q0 - veor q5, q5, q0 - veor q1, q1, q3 - veor 
q11, q3, q0 - veor q10, q7, q4 - veor q9, q1, q6 - veor q13, q4, q0 - vmov q8, q10 - veor q12, q5, q2 - - vorr q10, q10, q9 - veor q15, q11, q8 - vand q14, q11, q12 - vorr q11, q11, q12 - veor q12, q12, q9 - vand q8, q8, q9 - veor q9, q6, q2 - vand q15, q15, q12 - vand q13, q13, q9 - veor q9, q3, q7 - veor q12, q1, q5 - veor q11, q11, q13 - veor q10, q10, q13 - vand q13, q9, q12 - vorr q9, q9, q12 - veor q11, q11, q15 - veor q8, q8, q13 - veor q10, q10, q14 - veor q9, q9, q15 - veor q8, q8, q14 - vand q12, q4, q6 - veor q9, q9, q14 - vand q13, q0, q2 - vand q14, q7, q1 - vorr q15, q3, q5 - veor q11, q11, q12 - veor q9, q9, q14 - veor q8, q8, q15 - veor q10, q10, q13 - - @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3 - - @ new smaller inversion - - vand q14, q11, q9 - vmov q12, q8 - - veor q13, q10, q14 - veor q15, q8, q14 - veor q14, q8, q14 @ q14=q15 - - vbsl q13, q9, q8 - vbsl q15, q11, q10 - veor q11, q11, q10 - - vbsl q12, q13, q14 - vbsl q8, q14, q13 - - vand q14, q12, q15 - veor q9, q9, q8 - - veor q14, q14, q11 - veor q12, q5, q2 - veor q8, q1, q6 - veor q10, q15, q14 - vand q10, q10, q5 - veor q5, q5, q1 - vand q11, q1, q15 - vand q5, q5, q14 - veor q1, q11, q10 - veor q5, q5, q11 - veor q15, q15, q13 - veor q14, q14, q9 - veor q11, q15, q14 - veor q10, q13, q9 - vand q11, q11, q12 - vand q10, q10, q2 - veor q12, q12, q8 - veor q2, q2, q6 - vand q8, q8, q15 - vand q6, q6, q13 - vand q12, q12, q14 - vand q2, q2, q9 - veor q8, q8, q12 - veor q2, q2, q6 - veor q12, q12, q11 - veor q6, q6, q10 - veor q5, q5, q12 - veor q2, q2, q12 - veor q1, q1, q8 - veor q6, q6, q8 - - veor q12, q3, q0 - veor q8, q7, q4 - veor q11, q15, q14 - veor q10, q13, q9 - vand q11, q11, q12 - vand q10, q10, q0 - veor q12, q12, q8 - veor q0, q0, q4 - vand q8, q8, q15 - vand q4, q4, q13 - vand q12, q12, q14 - vand q0, q0, q9 - veor q8, q8, q12 - veor q0, q0, q4 - veor q12, q12, q11 - veor q4, q4, q10 - veor q15, q15, q13 - veor q14, q14, q9 - veor q10, q15, q14 - vand q10, q10, q3 - veor q3, q3, 
q7 - vand q11, q7, q15 - vand q3, q3, q14 - veor q7, q11, q10 - veor q3, q3, q11 - veor q3, q3, q12 - veor q0, q0, q12 - veor q7, q7, q8 - veor q4, q4, q8 - veor q1, q1, q7 - veor q6, q6, q5 - - veor q4, q4, q1 - veor q2, q2, q7 - veor q5, q5, q7 - veor q4, q4, q2 - veor q7, q7, q0 - veor q4, q4, q5 - veor q3, q3, q6 - veor q6, q6, q1 - veor q3, q3, q4 - - veor q4, q4, q0 - veor q7, q7, q3 - subs r5,r5,#1 - bcc Ldec_done - @ multiplication by 0x05-0x00-0x04-0x00 - vext.8 q8, q0, q0, #8 - vext.8 q14, q3, q3, #8 - vext.8 q15, q5, q5, #8 - veor q8, q8, q0 - vext.8 q9, q1, q1, #8 - veor q14, q14, q3 - vext.8 q10, q6, q6, #8 - veor q15, q15, q5 - vext.8 q11, q4, q4, #8 - veor q9, q9, q1 - vext.8 q12, q2, q2, #8 - veor q10, q10, q6 - vext.8 q13, q7, q7, #8 - veor q11, q11, q4 - veor q12, q12, q2 - veor q13, q13, q7 - - veor q0, q0, q14 - veor q1, q1, q14 - veor q6, q6, q8 - veor q2, q2, q10 - veor q4, q4, q9 - veor q1, q1, q15 - veor q6, q6, q15 - veor q2, q2, q14 - veor q7, q7, q11 - veor q4, q4, q14 - veor q3, q3, q12 - veor q2, q2, q15 - veor q7, q7, q15 - veor q5, q5, q13 - vext.8 q8, q0, q0, #12 @ x0 <<< 32 - vext.8 q9, q1, q1, #12 - veor q0, q0, q8 @ x0 ^ (x0 <<< 32) - vext.8 q10, q6, q6, #12 - veor q1, q1, q9 - vext.8 q11, q4, q4, #12 - veor q6, q6, q10 - vext.8 q12, q2, q2, #12 - veor q4, q4, q11 - vext.8 q13, q7, q7, #12 - veor q2, q2, q12 - vext.8 q14, q3, q3, #12 - veor q7, q7, q13 - vext.8 q15, q5, q5, #12 - veor q3, q3, q14 - - veor q9, q9, q0 - veor q5, q5, q15 - vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64) - veor q10, q10, q1 - veor q8, q8, q5 - veor q9, q9, q5 - vext.8 q1, q1, q1, #8 - veor q13, q13, q2 - veor q0, q0, q8 - veor q14, q14, q7 - veor q1, q1, q9 - vext.8 q8, q2, q2, #8 - veor q12, q12, q4 - vext.8 q9, q7, q7, #8 - veor q15, q15, q3 - vext.8 q2, q4, q4, #8 - veor q11, q11, q6 - vext.8 q7, q5, q5, #8 - veor q12, q12, q5 - vext.8 q4, q3, q3, #8 - veor q11, q11, q5 - vext.8 q3, q6, q6, #8 - veor q5, q9, q13 - veor q11, q11, q2 - veor q7, 
q7, q15 - veor q6, q4, q14 - veor q4, q8, q12 - veor q2, q3, q10 - vmov q3, q11 - @ vmov q5, q9 - vldmia r6, {q12} @ LISR - ite eq @ Thumb2 thing, sanity check in ARM - addeq r6,r6,#0x10 - bne Ldec_loop - vldmia r6, {q12} @ LISRM0 - b Ldec_loop -.align 4 -Ldec_done: - vmov.i8 q8,#0x55 @ compose LBS0 - vmov.i8 q9,#0x33 @ compose LBS1 - vshr.u64 q10, q3, #1 - vshr.u64 q11, q2, #1 - veor q10, q10, q5 - veor q11, q11, q7 - vand q10, q10, q8 - vand q11, q11, q8 - veor q5, q5, q10 - vshl.u64 q10, q10, #1 - veor q7, q7, q11 - vshl.u64 q11, q11, #1 - veor q3, q3, q10 - veor q2, q2, q11 - vshr.u64 q10, q6, #1 - vshr.u64 q11, q0, #1 - veor q10, q10, q4 - veor q11, q11, q1 - vand q10, q10, q8 - vand q11, q11, q8 - veor q4, q4, q10 - vshl.u64 q10, q10, #1 - veor q1, q1, q11 - vshl.u64 q11, q11, #1 - veor q6, q6, q10 - veor q0, q0, q11 - vmov.i8 q8,#0x0f @ compose LBS2 - vshr.u64 q10, q7, #2 - vshr.u64 q11, q2, #2 - veor q10, q10, q5 - veor q11, q11, q3 - vand q10, q10, q9 - vand q11, q11, q9 - veor q5, q5, q10 - vshl.u64 q10, q10, #2 - veor q3, q3, q11 - vshl.u64 q11, q11, #2 - veor q7, q7, q10 - veor q2, q2, q11 - vshr.u64 q10, q1, #2 - vshr.u64 q11, q0, #2 - veor q10, q10, q4 - veor q11, q11, q6 - vand q10, q10, q9 - vand q11, q11, q9 - veor q4, q4, q10 - vshl.u64 q10, q10, #2 - veor q6, q6, q11 - vshl.u64 q11, q11, #2 - veor q1, q1, q10 - veor q0, q0, q11 - vshr.u64 q10, q4, #4 - vshr.u64 q11, q6, #4 - veor q10, q10, q5 - veor q11, q11, q3 - vand q10, q10, q8 - vand q11, q11, q8 - veor q5, q5, q10 - vshl.u64 q10, q10, #4 - veor q3, q3, q11 - vshl.u64 q11, q11, #4 - veor q4, q4, q10 - veor q6, q6, q11 - vshr.u64 q10, q1, #4 - vshr.u64 q11, q0, #4 - veor q10, q10, q7 - veor q11, q11, q2 - vand q10, q10, q8 - vand q11, q11, q8 - veor q7, q7, q10 - vshl.u64 q10, q10, #4 - veor q2, q2, q11 - vshl.u64 q11, q11, #4 - veor q1, q1, q10 - veor q0, q0, q11 - vldmia r4, {q8} @ last round key - veor q6, q6, q8 - veor q4, q4, q8 - veor q2, q2, q8 - veor q7, q7, q8 - veor q3, q3, q8 - 
veor q5, q5, q8 - veor q0, q0, q8 - veor q1, q1, q8 - bx lr - - - -.align 6 -_bsaes_const: -LM0ISR:@ InvShiftRows constants -.quad 0x0a0e0206070b0f03, 0x0004080c0d010509 -LISR: -.quad 0x0504070602010003, 0x0f0e0d0c080b0a09 -LISRM0: -.quad 0x01040b0e0205080f, 0x0306090c00070a0d -LM0SR:@ ShiftRows constants -.quad 0x0a0e02060f03070b, 0x0004080c05090d01 -LSR: -.quad 0x0504070600030201, 0x0f0e0d0c0a09080b -LSRM0: -.quad 0x0304090e00050a0f, 0x01060b0c0207080d -LM0: -.quad 0x02060a0e03070b0f, 0x0004080c0105090d -LREVM0SR: -.quad 0x090d01050c000408, 0x03070b0f060a0e02 -.byte 66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102,111,114,32,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 -.align 6 - - -#ifdef __thumb2__ -.thumb_func _bsaes_encrypt8 -#endif -.align 4 -_bsaes_encrypt8: - adr r6,. - vldmia r4!, {q9} @ round 0 key -#if defined(__thumb2__) || defined(__APPLE__) - adr r6,LM0SR -#else - sub r6,r6,#_bsaes_encrypt8-LM0SR -#endif - - vldmia r6!, {q8} @ LM0SR -_bsaes_encrypt8_alt: - veor q10, q0, q9 @ xor with round0 key - veor q11, q1, q9 - vtbl.8 d0, {q10}, d16 - vtbl.8 d1, {q10}, d17 - veor q12, q2, q9 - vtbl.8 d2, {q11}, d16 - vtbl.8 d3, {q11}, d17 - veor q13, q3, q9 - vtbl.8 d4, {q12}, d16 - vtbl.8 d5, {q12}, d17 - veor q14, q4, q9 - vtbl.8 d6, {q13}, d16 - vtbl.8 d7, {q13}, d17 - veor q15, q5, q9 - vtbl.8 d8, {q14}, d16 - vtbl.8 d9, {q14}, d17 - veor q10, q6, q9 - vtbl.8 d10, {q15}, d16 - vtbl.8 d11, {q15}, d17 - veor q11, q7, q9 - vtbl.8 d12, {q10}, d16 - vtbl.8 d13, {q10}, d17 - vtbl.8 d14, {q11}, d16 - vtbl.8 d15, {q11}, d17 -_bsaes_encrypt8_bitslice: - vmov.i8 q8,#0x55 @ compose LBS0 - vmov.i8 q9,#0x33 @ compose LBS1 - vshr.u64 q10, q6, #1 - vshr.u64 q11, q4, #1 - veor q10, q10, q7 - veor q11, q11, q5 - vand q10, q10, q8 - vand q11, q11, q8 - veor q7, q7, q10 - vshl.u64 q10, q10, #1 - veor q5, q5, q11 - vshl.u64 q11, q11, #1 - veor q6, q6, q10 - veor q4, q4, q11 
- vshr.u64 q10, q2, #1 - vshr.u64 q11, q0, #1 - veor q10, q10, q3 - veor q11, q11, q1 - vand q10, q10, q8 - vand q11, q11, q8 - veor q3, q3, q10 - vshl.u64 q10, q10, #1 - veor q1, q1, q11 - vshl.u64 q11, q11, #1 - veor q2, q2, q10 - veor q0, q0, q11 - vmov.i8 q8,#0x0f @ compose LBS2 - vshr.u64 q10, q5, #2 - vshr.u64 q11, q4, #2 - veor q10, q10, q7 - veor q11, q11, q6 - vand q10, q10, q9 - vand q11, q11, q9 - veor q7, q7, q10 - vshl.u64 q10, q10, #2 - veor q6, q6, q11 - vshl.u64 q11, q11, #2 - veor q5, q5, q10 - veor q4, q4, q11 - vshr.u64 q10, q1, #2 - vshr.u64 q11, q0, #2 - veor q10, q10, q3 - veor q11, q11, q2 - vand q10, q10, q9 - vand q11, q11, q9 - veor q3, q3, q10 - vshl.u64 q10, q10, #2 - veor q2, q2, q11 - vshl.u64 q11, q11, #2 - veor q1, q1, q10 - veor q0, q0, q11 - vshr.u64 q10, q3, #4 - vshr.u64 q11, q2, #4 - veor q10, q10, q7 - veor q11, q11, q6 - vand q10, q10, q8 - vand q11, q11, q8 - veor q7, q7, q10 - vshl.u64 q10, q10, #4 - veor q6, q6, q11 - vshl.u64 q11, q11, #4 - veor q3, q3, q10 - veor q2, q2, q11 - vshr.u64 q10, q1, #4 - vshr.u64 q11, q0, #4 - veor q10, q10, q5 - veor q11, q11, q4 - vand q10, q10, q8 - vand q11, q11, q8 - veor q5, q5, q10 - vshl.u64 q10, q10, #4 - veor q4, q4, q11 - vshl.u64 q11, q11, #4 - veor q1, q1, q10 - veor q0, q0, q11 - sub r5,r5,#1 - b Lenc_sbox -.align 4 -Lenc_loop: - vldmia r4!, {q8,q9,q10,q11} - veor q8, q8, q0 - veor q9, q9, q1 - vtbl.8 d0, {q8}, d24 - vtbl.8 d1, {q8}, d25 - vldmia r4!, {q8} - veor q10, q10, q2 - vtbl.8 d2, {q9}, d24 - vtbl.8 d3, {q9}, d25 - vldmia r4!, {q9} - veor q11, q11, q3 - vtbl.8 d4, {q10}, d24 - vtbl.8 d5, {q10}, d25 - vldmia r4!, {q10} - vtbl.8 d6, {q11}, d24 - vtbl.8 d7, {q11}, d25 - vldmia r4!, {q11} - veor q8, q8, q4 - veor q9, q9, q5 - vtbl.8 d8, {q8}, d24 - vtbl.8 d9, {q8}, d25 - veor q10, q10, q6 - vtbl.8 d10, {q9}, d24 - vtbl.8 d11, {q9}, d25 - veor q11, q11, q7 - vtbl.8 d12, {q10}, d24 - vtbl.8 d13, {q10}, d25 - vtbl.8 d14, {q11}, d24 - vtbl.8 d15, {q11}, d25 -Lenc_sbox: - veor q2, 
q2, q1 - veor q5, q5, q6 - veor q3, q3, q0 - veor q6, q6, q2 - veor q5, q5, q0 - - veor q6, q6, q3 - veor q3, q3, q7 - veor q7, q7, q5 - veor q3, q3, q4 - veor q4, q4, q5 - - veor q2, q2, q7 - veor q3, q3, q1 - veor q1, q1, q5 - veor q11, q7, q4 - veor q10, q1, q2 - veor q9, q5, q3 - veor q13, q2, q4 - vmov q8, q10 - veor q12, q6, q0 - - vorr q10, q10, q9 - veor q15, q11, q8 - vand q14, q11, q12 - vorr q11, q11, q12 - veor q12, q12, q9 - vand q8, q8, q9 - veor q9, q3, q0 - vand q15, q15, q12 - vand q13, q13, q9 - veor q9, q7, q1 - veor q12, q5, q6 - veor q11, q11, q13 - veor q10, q10, q13 - vand q13, q9, q12 - vorr q9, q9, q12 - veor q11, q11, q15 - veor q8, q8, q13 - veor q10, q10, q14 - veor q9, q9, q15 - veor q8, q8, q14 - vand q12, q2, q3 - veor q9, q9, q14 - vand q13, q4, q0 - vand q14, q1, q5 - vorr q15, q7, q6 - veor q11, q11, q12 - veor q9, q9, q14 - veor q8, q8, q15 - veor q10, q10, q13 - - @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3 - - @ new smaller inversion - - vand q14, q11, q9 - vmov q12, q8 - - veor q13, q10, q14 - veor q15, q8, q14 - veor q14, q8, q14 @ q14=q15 - - vbsl q13, q9, q8 - vbsl q15, q11, q10 - veor q11, q11, q10 - - vbsl q12, q13, q14 - vbsl q8, q14, q13 - - vand q14, q12, q15 - veor q9, q9, q8 - - veor q14, q14, q11 - veor q12, q6, q0 - veor q8, q5, q3 - veor q10, q15, q14 - vand q10, q10, q6 - veor q6, q6, q5 - vand q11, q5, q15 - vand q6, q6, q14 - veor q5, q11, q10 - veor q6, q6, q11 - veor q15, q15, q13 - veor q14, q14, q9 - veor q11, q15, q14 - veor q10, q13, q9 - vand q11, q11, q12 - vand q10, q10, q0 - veor q12, q12, q8 - veor q0, q0, q3 - vand q8, q8, q15 - vand q3, q3, q13 - vand q12, q12, q14 - vand q0, q0, q9 - veor q8, q8, q12 - veor q0, q0, q3 - veor q12, q12, q11 - veor q3, q3, q10 - veor q6, q6, q12 - veor q0, q0, q12 - veor q5, q5, q8 - veor q3, q3, q8 - - veor q12, q7, q4 - veor q8, q1, q2 - veor q11, q15, q14 - veor q10, q13, q9 - vand q11, q11, q12 - vand q10, q10, q4 - veor q12, q12, q8 - veor q4, q4, q2 - vand q8, q8, q15 
- vand q2, q2, q13 - vand q12, q12, q14 - vand q4, q4, q9 - veor q8, q8, q12 - veor q4, q4, q2 - veor q12, q12, q11 - veor q2, q2, q10 - veor q15, q15, q13 - veor q14, q14, q9 - veor q10, q15, q14 - vand q10, q10, q7 - veor q7, q7, q1 - vand q11, q1, q15 - vand q7, q7, q14 - veor q1, q11, q10 - veor q7, q7, q11 - veor q7, q7, q12 - veor q4, q4, q12 - veor q1, q1, q8 - veor q2, q2, q8 - veor q7, q7, q0 - veor q1, q1, q6 - veor q6, q6, q0 - veor q4, q4, q7 - veor q0, q0, q1 - - veor q1, q1, q5 - veor q5, q5, q2 - veor q2, q2, q3 - veor q3, q3, q5 - veor q4, q4, q5 - - veor q6, q6, q3 - subs r5,r5,#1 - bcc Lenc_done - vext.8 q8, q0, q0, #12 @ x0 <<< 32 - vext.8 q9, q1, q1, #12 - veor q0, q0, q8 @ x0 ^ (x0 <<< 32) - vext.8 q10, q4, q4, #12 - veor q1, q1, q9 - vext.8 q11, q6, q6, #12 - veor q4, q4, q10 - vext.8 q12, q3, q3, #12 - veor q6, q6, q11 - vext.8 q13, q7, q7, #12 - veor q3, q3, q12 - vext.8 q14, q2, q2, #12 - veor q7, q7, q13 - vext.8 q15, q5, q5, #12 - veor q2, q2, q14 - - veor q9, q9, q0 - veor q5, q5, q15 - vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64) - veor q10, q10, q1 - veor q8, q8, q5 - veor q9, q9, q5 - vext.8 q1, q1, q1, #8 - veor q13, q13, q3 - veor q0, q0, q8 - veor q14, q14, q7 - veor q1, q1, q9 - vext.8 q8, q3, q3, #8 - veor q12, q12, q6 - vext.8 q9, q7, q7, #8 - veor q15, q15, q2 - vext.8 q3, q6, q6, #8 - veor q11, q11, q4 - vext.8 q7, q5, q5, #8 - veor q12, q12, q5 - vext.8 q6, q2, q2, #8 - veor q11, q11, q5 - vext.8 q2, q4, q4, #8 - veor q5, q9, q13 - veor q4, q8, q12 - veor q3, q3, q11 - veor q7, q7, q15 - veor q6, q6, q14 - @ vmov q4, q8 - veor q2, q2, q10 - @ vmov q5, q9 - vldmia r6, {q12} @ LSR - ite eq @ Thumb2 thing, samity check in ARM - addeq r6,r6,#0x10 - bne Lenc_loop - vldmia r6, {q12} @ LSRM0 - b Lenc_loop -.align 4 -Lenc_done: - vmov.i8 q8,#0x55 @ compose LBS0 - vmov.i8 q9,#0x33 @ compose LBS1 - vshr.u64 q10, q2, #1 - vshr.u64 q11, q3, #1 - veor q10, q10, q5 - veor q11, q11, q7 - vand q10, q10, q8 - vand q11, q11, q8 - veor 
q5, q5, q10 - vshl.u64 q10, q10, #1 - veor q7, q7, q11 - vshl.u64 q11, q11, #1 - veor q2, q2, q10 - veor q3, q3, q11 - vshr.u64 q10, q4, #1 - vshr.u64 q11, q0, #1 - veor q10, q10, q6 - veor q11, q11, q1 - vand q10, q10, q8 - vand q11, q11, q8 - veor q6, q6, q10 - vshl.u64 q10, q10, #1 - veor q1, q1, q11 - vshl.u64 q11, q11, #1 - veor q4, q4, q10 - veor q0, q0, q11 - vmov.i8 q8,#0x0f @ compose LBS2 - vshr.u64 q10, q7, #2 - vshr.u64 q11, q3, #2 - veor q10, q10, q5 - veor q11, q11, q2 - vand q10, q10, q9 - vand q11, q11, q9 - veor q5, q5, q10 - vshl.u64 q10, q10, #2 - veor q2, q2, q11 - vshl.u64 q11, q11, #2 - veor q7, q7, q10 - veor q3, q3, q11 - vshr.u64 q10, q1, #2 - vshr.u64 q11, q0, #2 - veor q10, q10, q6 - veor q11, q11, q4 - vand q10, q10, q9 - vand q11, q11, q9 - veor q6, q6, q10 - vshl.u64 q10, q10, #2 - veor q4, q4, q11 - vshl.u64 q11, q11, #2 - veor q1, q1, q10 - veor q0, q0, q11 - vshr.u64 q10, q6, #4 - vshr.u64 q11, q4, #4 - veor q10, q10, q5 - veor q11, q11, q2 - vand q10, q10, q8 - vand q11, q11, q8 - veor q5, q5, q10 - vshl.u64 q10, q10, #4 - veor q2, q2, q11 - vshl.u64 q11, q11, #4 - veor q6, q6, q10 - veor q4, q4, q11 - vshr.u64 q10, q1, #4 - vshr.u64 q11, q0, #4 - veor q10, q10, q7 - veor q11, q11, q3 - vand q10, q10, q8 - vand q11, q11, q8 - veor q7, q7, q10 - vshl.u64 q10, q10, #4 - veor q3, q3, q11 - vshl.u64 q11, q11, #4 - veor q1, q1, q10 - veor q0, q0, q11 - vldmia r4, {q8} @ last round key - veor q4, q4, q8 - veor q6, q6, q8 - veor q3, q3, q8 - veor q7, q7, q8 - veor q2, q2, q8 - veor q5, q5, q8 - veor q0, q0, q8 - veor q1, q1, q8 - bx lr - -#ifdef __thumb2__ -.thumb_func _bsaes_key_convert -#endif -.align 4 -_bsaes_key_convert: - adr r6,. - vld1.8 {q7}, [r4]! @ load round 0 key -#if defined(__thumb2__) || defined(__APPLE__) - adr r6,LM0 -#else - sub r6,r6,#_bsaes_key_convert-LM0 -#endif - vld1.8 {q15}, [r4]! 
@ load round 1 key - - vmov.i8 q8, #0x01 @ bit masks - vmov.i8 q9, #0x02 - vmov.i8 q10, #0x04 - vmov.i8 q11, #0x08 - vmov.i8 q12, #0x10 - vmov.i8 q13, #0x20 - vldmia r6, {q14} @ LM0 - -#ifdef __ARMEL__ - vrev32.8 q7, q7 - vrev32.8 q15, q15 -#endif - sub r5,r5,#1 - vstmia r12!, {q7} @ save round 0 key - b Lkey_loop - -.align 4 -Lkey_loop: - vtbl.8 d14,{q15},d28 - vtbl.8 d15,{q15},d29 - vmov.i8 q6, #0x40 - vmov.i8 q15, #0x80 - - vtst.8 q0, q7, q8 - vtst.8 q1, q7, q9 - vtst.8 q2, q7, q10 - vtst.8 q3, q7, q11 - vtst.8 q4, q7, q12 - vtst.8 q5, q7, q13 - vtst.8 q6, q7, q6 - vtst.8 q7, q7, q15 - vld1.8 {q15}, [r4]! @ load next round key - vmvn q0, q0 @ "pnot" - vmvn q1, q1 - vmvn q5, q5 - vmvn q6, q6 -#ifdef __ARMEL__ - vrev32.8 q15, q15 -#endif - subs r5,r5,#1 - vstmia r12!,{q0,q1,q2,q3,q4,q5,q6,q7} @ write bit-sliced round key - bne Lkey_loop - - vmov.i8 q7,#0x63 @ compose L63 - @ don't save last round key - bx lr - -.globl _bsaes_cbc_encrypt -.private_extern _bsaes_cbc_encrypt -#ifdef __thumb2__ -.thumb_func _bsaes_cbc_encrypt -#endif -.align 5 -_bsaes_cbc_encrypt: - @ In OpenSSL, this function had a fallback to aes_nohw_cbc_encrypt for - @ short inputs. We patch this out, using bsaes for all input sizes. 
- - @ it is up to the caller to make sure we are called with enc == 0 - - mov ip, sp - stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr} - VFP_ABI_PUSH - ldr r8, [ip] @ IV is 1st arg on the stack - mov r2, r2, lsr#4 @ len in 16 byte blocks - sub sp, #0x10 @ scratch space to carry over the IV - mov r9, sp @ save sp - - ldr r10, [r3, #240] @ get # of rounds -#ifndef BSAES_ASM_EXTENDED_KEY - @ allocate the key schedule on the stack - sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key - add r12, #96 @ sifze of bit-slices key schedule - - @ populate the key schedule - mov r4, r3 @ pass key - mov r5, r10 @ pass # of rounds - mov sp, r12 @ sp is sp - bl _bsaes_key_convert - vldmia sp, {q6} - vstmia r12, {q15} @ save last round key - veor q7, q7, q6 @ fix up round 0 key - vstmia sp, {q7} -#else - ldr r12, [r3, #244] - eors r12, #1 - beq 0f - - @ populate the key schedule - str r12, [r3, #244] - mov r4, r3 @ pass key - mov r5, r10 @ pass # of rounds - add r12, r3, #248 @ pass key schedule - bl _bsaes_key_convert - add r4, r3, #248 - vldmia r4, {q6} - vstmia r12, {q15} @ save last round key - veor q7, q7, q6 @ fix up round 0 key - vstmia r4, {q7} - -.align 2 - -#endif - - vld1.8 {q15}, [r8] @ load IV - b Lcbc_dec_loop - -.align 4 -Lcbc_dec_loop: - subs r2, r2, #0x8 - bmi Lcbc_dec_loop_finish - - vld1.8 {q0,q1}, [r0]! @ load input - vld1.8 {q2,q3}, [r0]! -#ifndef BSAES_ASM_EXTENDED_KEY - mov r4, sp @ pass the key -#else - add r4, r3, #248 -#endif - vld1.8 {q4,q5}, [r0]! - mov r5, r10 - vld1.8 {q6,q7}, [r0] - sub r0, r0, #0x60 - vstmia r9, {q15} @ put aside IV - - bl _bsaes_decrypt8 - - vldmia r9, {q14} @ reload IV - vld1.8 {q8,q9}, [r0]! @ reload input - veor q0, q0, q14 @ ^= IV - vld1.8 {q10,q11}, [r0]! - veor q1, q1, q8 - veor q6, q6, q9 - vld1.8 {q12,q13}, [r0]! - veor q4, q4, q10 - veor q2, q2, q11 - vld1.8 {q14,q15}, [r0]! - veor q7, q7, q12 - vst1.8 {q0,q1}, [r1]! @ write output - veor q3, q3, q13 - vst1.8 {q6}, [r1]! - veor q5, q5, q14 - vst1.8 {q4}, [r1]! 
- vst1.8 {q2}, [r1]! - vst1.8 {q7}, [r1]! - vst1.8 {q3}, [r1]! - vst1.8 {q5}, [r1]! - - b Lcbc_dec_loop - -Lcbc_dec_loop_finish: - adds r2, r2, #8 - beq Lcbc_dec_done - - @ Set up most parameters for the _bsaes_decrypt8 call. -#ifndef BSAES_ASM_EXTENDED_KEY - mov r4, sp @ pass the key -#else - add r4, r3, #248 -#endif - mov r5, r10 - vstmia r9, {q15} @ put aside IV - - vld1.8 {q0}, [r0]! @ load input - cmp r2, #2 - blo Lcbc_dec_one - vld1.8 {q1}, [r0]! - beq Lcbc_dec_two - vld1.8 {q2}, [r0]! - cmp r2, #4 - blo Lcbc_dec_three - vld1.8 {q3}, [r0]! - beq Lcbc_dec_four - vld1.8 {q4}, [r0]! - cmp r2, #6 - blo Lcbc_dec_five - vld1.8 {q5}, [r0]! - beq Lcbc_dec_six - vld1.8 {q6}, [r0]! - sub r0, r0, #0x70 - - bl _bsaes_decrypt8 - - vldmia r9, {q14} @ reload IV - vld1.8 {q8,q9}, [r0]! @ reload input - veor q0, q0, q14 @ ^= IV - vld1.8 {q10,q11}, [r0]! - veor q1, q1, q8 - veor q6, q6, q9 - vld1.8 {q12,q13}, [r0]! - veor q4, q4, q10 - veor q2, q2, q11 - vld1.8 {q15}, [r0]! - veor q7, q7, q12 - vst1.8 {q0,q1}, [r1]! @ write output - veor q3, q3, q13 - vst1.8 {q6}, [r1]! - vst1.8 {q4}, [r1]! - vst1.8 {q2}, [r1]! - vst1.8 {q7}, [r1]! - vst1.8 {q3}, [r1]! - b Lcbc_dec_done -.align 4 -Lcbc_dec_six: - sub r0, r0, #0x60 - bl _bsaes_decrypt8 - vldmia r9,{q14} @ reload IV - vld1.8 {q8,q9}, [r0]! @ reload input - veor q0, q0, q14 @ ^= IV - vld1.8 {q10,q11}, [r0]! - veor q1, q1, q8 - veor q6, q6, q9 - vld1.8 {q12}, [r0]! - veor q4, q4, q10 - veor q2, q2, q11 - vld1.8 {q15}, [r0]! - veor q7, q7, q12 - vst1.8 {q0,q1}, [r1]! @ write output - vst1.8 {q6}, [r1]! - vst1.8 {q4}, [r1]! - vst1.8 {q2}, [r1]! - vst1.8 {q7}, [r1]! - b Lcbc_dec_done -.align 4 -Lcbc_dec_five: - sub r0, r0, #0x50 - bl _bsaes_decrypt8 - vldmia r9, {q14} @ reload IV - vld1.8 {q8,q9}, [r0]! @ reload input - veor q0, q0, q14 @ ^= IV - vld1.8 {q10,q11}, [r0]! - veor q1, q1, q8 - veor q6, q6, q9 - vld1.8 {q15}, [r0]! - veor q4, q4, q10 - vst1.8 {q0,q1}, [r1]! @ write output - veor q2, q2, q11 - vst1.8 {q6}, [r1]! 
- vst1.8 {q4}, [r1]! - vst1.8 {q2}, [r1]! - b Lcbc_dec_done -.align 4 -Lcbc_dec_four: - sub r0, r0, #0x40 - bl _bsaes_decrypt8 - vldmia r9, {q14} @ reload IV - vld1.8 {q8,q9}, [r0]! @ reload input - veor q0, q0, q14 @ ^= IV - vld1.8 {q10}, [r0]! - veor q1, q1, q8 - veor q6, q6, q9 - vld1.8 {q15}, [r0]! - veor q4, q4, q10 - vst1.8 {q0,q1}, [r1]! @ write output - vst1.8 {q6}, [r1]! - vst1.8 {q4}, [r1]! - b Lcbc_dec_done -.align 4 -Lcbc_dec_three: - sub r0, r0, #0x30 - bl _bsaes_decrypt8 - vldmia r9, {q14} @ reload IV - vld1.8 {q8,q9}, [r0]! @ reload input - veor q0, q0, q14 @ ^= IV - vld1.8 {q15}, [r0]! - veor q1, q1, q8 - veor q6, q6, q9 - vst1.8 {q0,q1}, [r1]! @ write output - vst1.8 {q6}, [r1]! - b Lcbc_dec_done -.align 4 -Lcbc_dec_two: - sub r0, r0, #0x20 - bl _bsaes_decrypt8 - vldmia r9, {q14} @ reload IV - vld1.8 {q8}, [r0]! @ reload input - veor q0, q0, q14 @ ^= IV - vld1.8 {q15}, [r0]! @ reload input - veor q1, q1, q8 - vst1.8 {q0,q1}, [r1]! @ write output - b Lcbc_dec_done -.align 4 -Lcbc_dec_one: - sub r0, r0, #0x10 - bl _bsaes_decrypt8 - vldmia r9, {q14} @ reload IV - vld1.8 {q15}, [r0]! @ reload input - veor q0, q0, q14 @ ^= IV - vst1.8 {q0}, [r1]! @ write output - -Lcbc_dec_done: -#ifndef BSAES_ASM_EXTENDED_KEY - vmov.i32 q0, #0 - vmov.i32 q1, #0 -Lcbc_dec_bzero:@ wipe key schedule [if any] - vstmia sp!, {q0,q1} - cmp sp, r9 - bne Lcbc_dec_bzero -#endif - - mov sp, r9 - add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb - vst1.8 {q15}, [r8] @ return IV - VFP_ABI_POP - ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} - -.globl _bsaes_ctr32_encrypt_blocks -.private_extern _bsaes_ctr32_encrypt_blocks -#ifdef __thumb2__ -.thumb_func _bsaes_ctr32_encrypt_blocks -#endif -.align 5 -_bsaes_ctr32_encrypt_blocks: - @ In OpenSSL, short inputs fall back to aes_nohw_* here. We patch this - @ out to retain a constant-time implementation. 
- mov ip, sp - stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr} - VFP_ABI_PUSH - ldr r8, [ip] @ ctr is 1st arg on the stack - sub sp, sp, #0x10 @ scratch space to carry over the ctr - mov r9, sp @ save sp - - ldr r10, [r3, #240] @ get # of rounds -#ifndef BSAES_ASM_EXTENDED_KEY - @ allocate the key schedule on the stack - sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key - add r12, #96 @ size of bit-sliced key schedule - - @ populate the key schedule - mov r4, r3 @ pass key - mov r5, r10 @ pass # of rounds - mov sp, r12 @ sp is sp - bl _bsaes_key_convert - veor q7,q7,q15 @ fix up last round key - vstmia r12, {q7} @ save last round key - - vld1.8 {q0}, [r8] @ load counter -#ifdef __APPLE__ - mov r8, #:lower16:(LREVM0SR-LM0) - add r8, r6, r8 -#else - add r8, r6, #LREVM0SR-LM0 @ borrow r8 -#endif - vldmia sp, {q4} @ load round0 key -#else - ldr r12, [r3, #244] - eors r12, #1 - beq 0f - - @ populate the key schedule - str r12, [r3, #244] - mov r4, r3 @ pass key - mov r5, r10 @ pass # of rounds - add r12, r3, #248 @ pass key schedule - bl _bsaes_key_convert - veor q7,q7,q15 @ fix up last round key - vstmia r12, {q7} @ save last round key - -.align 2 - add r12, r3, #248 - vld1.8 {q0}, [r8] @ load counter - adrl r8, LREVM0SR @ borrow r8 - vldmia r12, {q4} @ load round0 key - sub sp, #0x10 @ place for adjusted round0 key -#endif - - vmov.i32 q8,#1 @ compose 1<<96 - veor q9,q9,q9 - vrev32.8 q0,q0 - vext.8 q8,q9,q8,#4 - vrev32.8 q4,q4 - vadd.u32 q9,q8,q8 @ compose 2<<96 - vstmia sp, {q4} @ save adjusted round0 key - b Lctr_enc_loop - -.align 4 -Lctr_enc_loop: - vadd.u32 q10, q8, q9 @ compose 3<<96 - vadd.u32 q1, q0, q8 @ +1 - vadd.u32 q2, q0, q9 @ +2 - vadd.u32 q3, q0, q10 @ +3 - vadd.u32 q4, q1, q10 - vadd.u32 q5, q2, q10 - vadd.u32 q6, q3, q10 - vadd.u32 q7, q4, q10 - vadd.u32 q10, q5, q10 @ next counter - - @ Borrow prologue from _bsaes_encrypt8 to use the opportunity - @ to flip byte order in 32-bit counter - - vldmia sp, {q9} @ load round0 key -#ifndef 
BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x10 @ pass next round key -#else - add r4, r3, #264 -#endif - vldmia r8, {q8} @ LREVM0SR - mov r5, r10 @ pass rounds - vstmia r9, {q10} @ save next counter -#ifdef __APPLE__ - mov r6, #:lower16:(LREVM0SR-LSR) - sub r6, r8, r6 -#else - sub r6, r8, #LREVM0SR-LSR @ pass constants -#endif - - bl _bsaes_encrypt8_alt - - subs r2, r2, #8 - blo Lctr_enc_loop_done - - vld1.8 {q8,q9}, [r0]! @ load input - vld1.8 {q10,q11}, [r0]! - veor q0, q8 - veor q1, q9 - vld1.8 {q12,q13}, [r0]! - veor q4, q10 - veor q6, q11 - vld1.8 {q14,q15}, [r0]! - veor q3, q12 - vst1.8 {q0,q1}, [r1]! @ write output - veor q7, q13 - veor q2, q14 - vst1.8 {q4}, [r1]! - veor q5, q15 - vst1.8 {q6}, [r1]! - vmov.i32 q8, #1 @ compose 1<<96 - vst1.8 {q3}, [r1]! - veor q9, q9, q9 - vst1.8 {q7}, [r1]! - vext.8 q8, q9, q8, #4 - vst1.8 {q2}, [r1]! - vadd.u32 q9,q8,q8 @ compose 2<<96 - vst1.8 {q5}, [r1]! - vldmia r9, {q0} @ load counter - - bne Lctr_enc_loop - b Lctr_enc_done - -.align 4 -Lctr_enc_loop_done: - add r2, r2, #8 - vld1.8 {q8}, [r0]! @ load input - veor q0, q8 - vst1.8 {q0}, [r1]! @ write output - cmp r2, #2 - blo Lctr_enc_done - vld1.8 {q9}, [r0]! - veor q1, q9 - vst1.8 {q1}, [r1]! - beq Lctr_enc_done - vld1.8 {q10}, [r0]! - veor q4, q10 - vst1.8 {q4}, [r1]! - cmp r2, #4 - blo Lctr_enc_done - vld1.8 {q11}, [r0]! - veor q6, q11 - vst1.8 {q6}, [r1]! - beq Lctr_enc_done - vld1.8 {q12}, [r0]! - veor q3, q12 - vst1.8 {q3}, [r1]! - cmp r2, #6 - blo Lctr_enc_done - vld1.8 {q13}, [r0]! - veor q7, q13 - vst1.8 {q7}, [r1]! - beq Lctr_enc_done - vld1.8 {q14}, [r0] - veor q2, q14 - vst1.8 {q2}, [r1]! 
- -Lctr_enc_done: - vmov.i32 q0, #0 - vmov.i32 q1, #0 -#ifndef BSAES_ASM_EXTENDED_KEY -Lctr_enc_bzero:@ wipe key schedule [if any] - vstmia sp!, {q0,q1} - cmp sp, r9 - bne Lctr_enc_bzero -#else - vstmia sp, {q0,q1} -#endif - - mov sp, r9 - add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb - VFP_ABI_POP - ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} @ return - - @ OpenSSL contains aes_nohw_* fallback code here. We patch this - @ out to retain a constant-time implementation. - -#endif -#endif // !OPENSSL_NO_ASM diff --git a/third_party/boringssl/apple-arm/crypto/fipsmodule/ghash-armv4.S b/third_party/boringssl/apple-arm/crypto/fipsmodule/ghash-armv4.S deleted file mode 100644 index 36f4cceb..00000000 --- a/third_party/boringssl/apple-arm/crypto/fipsmodule/ghash-armv4.S +++ /dev/null @@ -1,258 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -#include - -@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both -@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 PMULL -@ instructions are in aesv8-armx.pl.) - - -.text -#if defined(__thumb2__) || defined(__clang__) -.syntax unified -#define ldrplb ldrbpl -#define ldrneb ldrbne -#endif -#if defined(__thumb2__) -.thumb -#else -.code 32 -#endif -#if __ARM_MAX_ARCH__>=7 - - - -.globl _gcm_init_neon -.private_extern _gcm_init_neon -#ifdef __thumb2__ -.thumb_func _gcm_init_neon -#endif -.align 4 -_gcm_init_neon: - vld1.64 d7,[r1]! 
@ load H - vmov.i8 q8,#0xe1 - vld1.64 d6,[r1] - vshl.i64 d17,#57 - vshr.u64 d16,#63 @ t0=0xc2....01 - vdup.8 q9,d7[7] - vshr.u64 d26,d6,#63 - vshr.s8 q9,#7 @ broadcast carry bit - vshl.i64 q3,q3,#1 - vand q8,q8,q9 - vorr d7,d26 @ H<<<=1 - veor q3,q3,q8 @ twisted H - vstmia r0,{q3} - - bx lr @ bx lr - - -.globl _gcm_gmult_neon -.private_extern _gcm_gmult_neon -#ifdef __thumb2__ -.thumb_func _gcm_gmult_neon -#endif -.align 4 -_gcm_gmult_neon: - vld1.64 d7,[r0]! @ load Xi - vld1.64 d6,[r0]! - vmov.i64 d29,#0x0000ffffffffffff - vldmia r1,{d26,d27} @ load twisted H - vmov.i64 d30,#0x00000000ffffffff -#ifdef __ARMEL__ - vrev64.8 q3,q3 -#endif - vmov.i64 d31,#0x000000000000ffff - veor d28,d26,d27 @ Karatsuba pre-processing - mov r3,#16 - b Lgmult_neon - - -.globl _gcm_ghash_neon -.private_extern _gcm_ghash_neon -#ifdef __thumb2__ -.thumb_func _gcm_ghash_neon -#endif -.align 4 -_gcm_ghash_neon: - vld1.64 d1,[r0]! @ load Xi - vld1.64 d0,[r0]! - vmov.i64 d29,#0x0000ffffffffffff - vldmia r1,{d26,d27} @ load twisted H - vmov.i64 d30,#0x00000000ffffffff -#ifdef __ARMEL__ - vrev64.8 q0,q0 -#endif - vmov.i64 d31,#0x000000000000ffff - veor d28,d26,d27 @ Karatsuba pre-processing - -Loop_neon: - vld1.64 d7,[r2]! @ load inp - vld1.64 d6,[r2]! 
-#ifdef __ARMEL__ - vrev64.8 q3,q3 -#endif - veor q3,q0 @ inp^=Xi -Lgmult_neon: - vext.8 d16, d26, d26, #1 @ A1 - vmull.p8 q8, d16, d6 @ F = A1*B - vext.8 d0, d6, d6, #1 @ B1 - vmull.p8 q0, d26, d0 @ E = A*B1 - vext.8 d18, d26, d26, #2 @ A2 - vmull.p8 q9, d18, d6 @ H = A2*B - vext.8 d22, d6, d6, #2 @ B2 - vmull.p8 q11, d26, d22 @ G = A*B2 - vext.8 d20, d26, d26, #3 @ A3 - veor q8, q8, q0 @ L = E + F - vmull.p8 q10, d20, d6 @ J = A3*B - vext.8 d0, d6, d6, #3 @ B3 - veor q9, q9, q11 @ M = G + H - vmull.p8 q0, d26, d0 @ I = A*B3 - veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8 - vand d17, d17, d29 - vext.8 d22, d6, d6, #4 @ B4 - veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16 - vand d19, d19, d30 - vmull.p8 q11, d26, d22 @ K = A*B4 - veor q10, q10, q0 @ N = I + J - veor d16, d16, d17 - veor d18, d18, d19 - veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24 - vand d21, d21, d31 - vext.8 q8, q8, q8, #15 - veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32 - vmov.i64 d23, #0 - vext.8 q9, q9, q9, #14 - veor d20, d20, d21 - vmull.p8 q0, d26, d6 @ D = A*B - vext.8 q11, q11, q11, #12 - vext.8 q10, q10, q10, #13 - veor q8, q8, q9 - veor q10, q10, q11 - veor q0, q0, q8 - veor q0, q0, q10 - veor d6,d6,d7 @ Karatsuba pre-processing - vext.8 d16, d28, d28, #1 @ A1 - vmull.p8 q8, d16, d6 @ F = A1*B - vext.8 d2, d6, d6, #1 @ B1 - vmull.p8 q1, d28, d2 @ E = A*B1 - vext.8 d18, d28, d28, #2 @ A2 - vmull.p8 q9, d18, d6 @ H = A2*B - vext.8 d22, d6, d6, #2 @ B2 - vmull.p8 q11, d28, d22 @ G = A*B2 - vext.8 d20, d28, d28, #3 @ A3 - veor q8, q8, q1 @ L = E + F - vmull.p8 q10, d20, d6 @ J = A3*B - vext.8 d2, d6, d6, #3 @ B3 - veor q9, q9, q11 @ M = G + H - vmull.p8 q1, d28, d2 @ I = A*B3 - veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8 - vand d17, d17, d29 - vext.8 d22, d6, d6, #4 @ B4 - veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16 - vand d19, d19, d30 - vmull.p8 q11, d28, d22 @ K = A*B4 - veor q10, q10, q1 @ N = I + J - veor d16, d16, d17 - veor d18, d18, d19 - veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 
24 - vand d21, d21, d31 - vext.8 q8, q8, q8, #15 - veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32 - vmov.i64 d23, #0 - vext.8 q9, q9, q9, #14 - veor d20, d20, d21 - vmull.p8 q1, d28, d6 @ D = A*B - vext.8 q11, q11, q11, #12 - vext.8 q10, q10, q10, #13 - veor q8, q8, q9 - veor q10, q10, q11 - veor q1, q1, q8 - veor q1, q1, q10 - vext.8 d16, d27, d27, #1 @ A1 - vmull.p8 q8, d16, d7 @ F = A1*B - vext.8 d4, d7, d7, #1 @ B1 - vmull.p8 q2, d27, d4 @ E = A*B1 - vext.8 d18, d27, d27, #2 @ A2 - vmull.p8 q9, d18, d7 @ H = A2*B - vext.8 d22, d7, d7, #2 @ B2 - vmull.p8 q11, d27, d22 @ G = A*B2 - vext.8 d20, d27, d27, #3 @ A3 - veor q8, q8, q2 @ L = E + F - vmull.p8 q10, d20, d7 @ J = A3*B - vext.8 d4, d7, d7, #3 @ B3 - veor q9, q9, q11 @ M = G + H - vmull.p8 q2, d27, d4 @ I = A*B3 - veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8 - vand d17, d17, d29 - vext.8 d22, d7, d7, #4 @ B4 - veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16 - vand d19, d19, d30 - vmull.p8 q11, d27, d22 @ K = A*B4 - veor q10, q10, q2 @ N = I + J - veor d16, d16, d17 - veor d18, d18, d19 - veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24 - vand d21, d21, d31 - vext.8 q8, q8, q8, #15 - veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32 - vmov.i64 d23, #0 - vext.8 q9, q9, q9, #14 - veor d20, d20, d21 - vmull.p8 q2, d27, d7 @ D = A*B - vext.8 q11, q11, q11, #12 - vext.8 q10, q10, q10, #13 - veor q8, q8, q9 - veor q10, q10, q11 - veor q2, q2, q8 - veor q2, q2, q10 - veor q1,q1,q0 @ Karatsuba post-processing - veor q1,q1,q2 - veor d1,d1,d2 - veor d4,d4,d3 @ Xh|Xl - 256-bit result - - @ equivalent of reduction_avx from ghash-x86_64.pl - vshl.i64 q9,q0,#57 @ 1st phase - vshl.i64 q10,q0,#62 - veor q10,q10,q9 @ - vshl.i64 q9,q0,#63 - veor q10, q10, q9 @ - veor d1,d1,d20 @ - veor d4,d4,d21 - - vshr.u64 q10,q0,#1 @ 2nd phase - veor q2,q2,q0 - veor q0,q0,q10 @ - vshr.u64 q10,q10,#6 - vshr.u64 q0,q0,#1 @ - veor q0,q0,q2 @ - veor q0,q0,q10 @ - - subs r3,#16 - bne Loop_neon - -#ifdef __ARMEL__ - vrev64.8 q0,q0 -#endif - sub r0,#16 - 
vst1.64 d1,[r0]! @ write out Xi - vst1.64 d0,[r0] - - bx lr @ bx lr - -#endif -.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 -.align 2 -#endif // !OPENSSL_NO_ASM diff --git a/third_party/boringssl/apple-arm/crypto/fipsmodule/ghashv8-armx32.S b/third_party/boringssl/apple-arm/crypto/fipsmodule/ghashv8-armx32.S deleted file mode 100644 index dcac580e..00000000 --- a/third_party/boringssl/apple-arm/crypto/fipsmodule/ghashv8-armx32.S +++ /dev/null @@ -1,260 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -#include - -#if __ARM_MAX_ARCH__>=7 -.text - -.code 32 -#undef __thumb2__ -.globl _gcm_init_v8 -.private_extern _gcm_init_v8 -#ifdef __thumb2__ -.thumb_func _gcm_init_v8 -#endif -.align 4 -_gcm_init_v8: - AARCH64_VALID_CALL_TARGET - vld1.64 {q9},[r1] @ load input H - vmov.i8 q11,#0xe1 - vshl.i64 q11,q11,#57 @ 0xc2.0 - vext.8 q3,q9,q9,#8 - vshr.u64 q10,q11,#63 - vdup.32 q9,d18[1] - vext.8 q8,q10,q11,#8 @ t0=0xc2....01 - vshr.u64 q10,q3,#63 - vshr.s32 q9,q9,#31 @ broadcast carry bit - vand q10,q10,q8 - vshl.i64 q3,q3,#1 - vext.8 q10,q10,q10,#8 - vand q8,q8,q9 - vorr q3,q3,q10 @ H<<<=1 - veor q12,q3,q8 @ twisted H - vst1.64 {q12},[r0]! 
@ store Htable[0] - - @ calculate H^2 - vext.8 q8,q12,q12,#8 @ Karatsuba pre-processing -.byte 0xa8,0x0e,0xa8,0xf2 @ pmull q0,q12,q12 - veor q8,q8,q12 -.byte 0xa9,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q12 -.byte 0xa0,0x2e,0xa0,0xf2 @ pmull q1,q8,q8 - - vext.8 q9,q0,q2,#8 @ Karatsuba post-processing - veor q10,q0,q2 - veor q1,q1,q9 - veor q1,q1,q10 -.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase - - vmov d4,d3 @ Xh|Xm - 256-bit result - vmov d3,d0 @ Xm is rotated Xl - veor q0,q1,q10 - - vext.8 q10,q0,q0,#8 @ 2nd phase -.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 - veor q10,q10,q2 - veor q14,q0,q10 - - vext.8 q9,q14,q14,#8 @ Karatsuba pre-processing - veor q9,q9,q14 - vext.8 q13,q8,q9,#8 @ pack Karatsuba pre-processed - vst1.64 {q13,q14},[r0]! @ store Htable[1..2] - bx lr - -.globl _gcm_gmult_v8 -.private_extern _gcm_gmult_v8 -#ifdef __thumb2__ -.thumb_func _gcm_gmult_v8 -#endif -.align 4 -_gcm_gmult_v8: - AARCH64_VALID_CALL_TARGET - vld1.64 {q9},[r0] @ load Xi - vmov.i8 q11,#0xe1 - vld1.64 {q12,q13},[r1] @ load twisted H, ... 
- vshl.u64 q11,q11,#57 -#ifndef __ARMEB__ - vrev64.8 q9,q9 -#endif - vext.8 q3,q9,q9,#8 - -.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo - veor q9,q9,q3 @ Karatsuba pre-processing -.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi -.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) - - vext.8 q9,q0,q2,#8 @ Karatsuba post-processing - veor q10,q0,q2 - veor q1,q1,q9 - veor q1,q1,q10 -.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction - - vmov d4,d3 @ Xh|Xm - 256-bit result - vmov d3,d0 @ Xm is rotated Xl - veor q0,q1,q10 - - vext.8 q10,q0,q0,#8 @ 2nd phase of reduction -.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 - veor q10,q10,q2 - veor q0,q0,q10 - -#ifndef __ARMEB__ - vrev64.8 q0,q0 -#endif - vext.8 q0,q0,q0,#8 - vst1.64 {q0},[r0] @ write out Xi - - bx lr - -.globl _gcm_ghash_v8 -.private_extern _gcm_ghash_v8 -#ifdef __thumb2__ -.thumb_func _gcm_ghash_v8 -#endif -.align 4 -_gcm_ghash_v8: - AARCH64_VALID_CALL_TARGET - vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ 32-bit ABI says so - vld1.64 {q0},[r0] @ load [rotated] Xi - @ "[rotated]" means that - @ loaded value would have - @ to be rotated in order to - @ make it appear as in - @ algorithm specification - subs r3,r3,#32 @ see if r3 is 32 or larger - mov r12,#16 @ r12 is used as post- - @ increment for input pointer; - @ as loop is modulo-scheduled - @ r12 is zeroed just in time - @ to preclude overstepping - @ inp[len], which means that - @ last block[s] are actually - @ loaded twice, but last - @ copy is not processed - vld1.64 {q12,q13},[r1]! @ load twisted H, ..., H^2 - vmov.i8 q11,#0xe1 - vld1.64 {q14},[r1] - moveq r12,#0 @ is it time to zero r12? - vext.8 q0,q0,q0,#8 @ rotate Xi - vld1.64 {q8},[r2]! 
@ load [rotated] I[0] - vshl.u64 q11,q11,#57 @ compose 0xc2.0 constant -#ifndef __ARMEB__ - vrev64.8 q8,q8 - vrev64.8 q0,q0 -#endif - vext.8 q3,q8,q8,#8 @ rotate I[0] - blo Lodd_tail_v8 @ r3 was less than 32 - vld1.64 {q9},[r2],r12 @ load [rotated] I[1] -#ifndef __ARMEB__ - vrev64.8 q9,q9 -#endif - vext.8 q7,q9,q9,#8 - veor q3,q3,q0 @ I[i]^=Xi -.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1 - veor q9,q9,q7 @ Karatsuba pre-processing -.byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7 - b Loop_mod2x_v8 - -.align 4 -Loop_mod2x_v8: - vext.8 q10,q3,q3,#8 - subs r3,r3,#32 @ is there more data? -.byte 0x86,0x0e,0xac,0xf2 @ pmull q0,q14,q3 @ H^2.lo·Xi.lo - movlo r12,#0 @ is it time to zero r12? - -.byte 0xa2,0xae,0xaa,0xf2 @ pmull q5,q13,q9 - veor q10,q10,q3 @ Karatsuba pre-processing -.byte 0x87,0x4e,0xad,0xf2 @ pmull2 q2,q14,q3 @ H^2.hi·Xi.hi - veor q0,q0,q4 @ accumulate -.byte 0xa5,0x2e,0xab,0xf2 @ pmull2 q1,q13,q10 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi) - vld1.64 {q8},[r2],r12 @ load [rotated] I[i+2] - - veor q2,q2,q6 - moveq r12,#0 @ is it time to zero r12? 
- veor q1,q1,q5 - - vext.8 q9,q0,q2,#8 @ Karatsuba post-processing - veor q10,q0,q2 - veor q1,q1,q9 - vld1.64 {q9},[r2],r12 @ load [rotated] I[i+3] -#ifndef __ARMEB__ - vrev64.8 q8,q8 -#endif - veor q1,q1,q10 -.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction - -#ifndef __ARMEB__ - vrev64.8 q9,q9 -#endif - vmov d4,d3 @ Xh|Xm - 256-bit result - vmov d3,d0 @ Xm is rotated Xl - vext.8 q7,q9,q9,#8 - vext.8 q3,q8,q8,#8 - veor q0,q1,q10 -.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1 - veor q3,q3,q2 @ accumulate q3 early - - vext.8 q10,q0,q0,#8 @ 2nd phase of reduction -.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 - veor q3,q3,q10 - veor q9,q9,q7 @ Karatsuba pre-processing - veor q3,q3,q0 -.byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7 - bhs Loop_mod2x_v8 @ there was at least 32 more bytes - - veor q2,q2,q10 - vext.8 q3,q8,q8,#8 @ re-construct q3 - adds r3,r3,#32 @ re-construct r3 - veor q0,q0,q2 @ re-construct q0 - beq Ldone_v8 @ is r3 zero? -Lodd_tail_v8: - vext.8 q10,q0,q0,#8 - veor q3,q3,q0 @ inp^=Xi - veor q9,q8,q10 @ q9 is rotated inp^Xi - -.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo - veor q9,q9,q3 @ Karatsuba pre-processing -.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi -.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) - - vext.8 q9,q0,q2,#8 @ Karatsuba post-processing - veor q10,q0,q2 - veor q1,q1,q9 - veor q1,q1,q10 -.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction - - vmov d4,d3 @ Xh|Xm - 256-bit result - vmov d3,d0 @ Xm is rotated Xl - veor q0,q1,q10 - - vext.8 q10,q0,q0,#8 @ 2nd phase of reduction -.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 - veor q10,q10,q2 - veor q0,q0,q10 - -Ldone_v8: -#ifndef __ARMEB__ - vrev64.8 q0,q0 -#endif - vext.8 q0,q0,q0,#8 - vst1.64 {q0},[r0] @ write out Xi - - vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ 32-bit ABI says so - bx lr - -.byte 
71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 -.align 2 -#endif -#endif // !OPENSSL_NO_ASM diff --git a/third_party/boringssl/apple-arm/crypto/fipsmodule/sha1-armv4-large.S b/third_party/boringssl/apple-arm/crypto/fipsmodule/sha1-armv4-large.S deleted file mode 100644 index 82ac8df4..00000000 --- a/third_party/boringssl/apple-arm/crypto/fipsmodule/sha1-armv4-large.S +++ /dev/null @@ -1,1518 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -#include - -.text -#if defined(__thumb2__) -.syntax unified -.thumb -#else -.code 32 -#endif - -.globl _sha1_block_data_order -.private_extern _sha1_block_data_order -#ifdef __thumb2__ -.thumb_func _sha1_block_data_order -#endif - -.align 5 -_sha1_block_data_order: -#if __ARM_MAX_ARCH__>=7 -Lsha1_block: - adr r3,Lsha1_block - ldr r12,LOPENSSL_armcap - ldr r12,[r3,r12] @ OPENSSL_armcap_P -#ifdef __APPLE__ - ldr r12,[r12] -#endif - tst r12,#ARMV8_SHA1 - bne LARMv8 - tst r12,#ARMV7_NEON - bne LNEON -#endif - stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} - add r2,r1,r2,lsl#6 @ r2 to point at the end of r1 - ldmia r0,{r3,r4,r5,r6,r7} -Lloop: - ldr r8,LK_00_19 - mov r14,sp - sub sp,sp,#15*4 - mov r5,r5,ror#30 - mov r6,r6,ror#30 - mov r7,r7,ror#30 @ [6] -L_00_15: -#if __ARM_ARCH__<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r7,r8,r7,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r5,r6 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r7,r7,r3,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r7,r8,r7,ror#2 @ 
E+=K_00_19 - eor r10,r5,r6 @ F_xx_xx - add r7,r7,r3,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r4,r10,ror#2 - add r7,r7,r9 @ E+=X[i] - eor r10,r10,r6,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! - add r7,r7,r10 @ E+=F_00_19(B,C,D) -#if __ARM_ARCH__<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r6,r8,r6,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r4,r5 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r6,r6,r7,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r6,r8,r6,ror#2 @ E+=K_00_19 - eor r10,r4,r5 @ F_xx_xx - add r6,r6,r7,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r3,r10,ror#2 - add r6,r6,r9 @ E+=X[i] - eor r10,r10,r5,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! - add r6,r6,r10 @ E+=F_00_19(B,C,D) -#if __ARM_ARCH__<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r5,r8,r5,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r3,r4 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r5,r5,r6,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r5,r8,r5,ror#2 @ E+=K_00_19 - eor r10,r3,r4 @ F_xx_xx - add r5,r5,r6,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r7,r10,ror#2 - add r5,r5,r9 @ E+=X[i] - eor r10,r10,r4,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! 
- add r5,r5,r10 @ E+=F_00_19(B,C,D) -#if __ARM_ARCH__<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r4,r8,r4,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r7,r3 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r4,r4,r5,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r4,r8,r4,ror#2 @ E+=K_00_19 - eor r10,r7,r3 @ F_xx_xx - add r4,r4,r5,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r6,r10,ror#2 - add r4,r4,r9 @ E+=X[i] - eor r10,r10,r3,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! - add r4,r4,r10 @ E+=F_00_19(B,C,D) -#if __ARM_ARCH__<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r3,r8,r3,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r6,r7 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r3,r3,r4,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r3,r8,r3,ror#2 @ E+=K_00_19 - eor r10,r6,r7 @ F_xx_xx - add r3,r3,r4,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r5,r10,ror#2 - add r3,r3,r9 @ E+=X[i] - eor r10,r10,r7,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! - add r3,r3,r10 @ E+=F_00_19(B,C,D) -#if defined(__thumb2__) - mov r12,sp - teq r14,r12 -#else - teq r14,sp -#endif - bne L_00_15 @ [((11+4)*5+2)*3] - sub sp,sp,#25*4 -#if __ARM_ARCH__<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r7,r8,r7,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r5,r6 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r7,r7,r3,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r7,r8,r7,ror#2 @ E+=K_00_19 - eor r10,r5,r6 @ F_xx_xx - add r7,r7,r3,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r4,r10,ror#2 - add r7,r7,r9 @ E+=X[i] - eor r10,r10,r6,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! 
- add r7,r7,r10 @ E+=F_00_19(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r6,r8,r6,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r4,r5 @ F_xx_xx - mov r9,r9,ror#31 - add r6,r6,r7,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r3,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r6,r6,r9 @ E+=X[i] - eor r10,r10,r5,ror#2 @ F_00_19(B,C,D) - add r6,r6,r10 @ E+=F_00_19(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r5,r8,r5,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r3,r4 @ F_xx_xx - mov r9,r9,ror#31 - add r5,r5,r6,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r7,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r5,r5,r9 @ E+=X[i] - eor r10,r10,r4,ror#2 @ F_00_19(B,C,D) - add r5,r5,r10 @ E+=F_00_19(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r4,r8,r4,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r7,r3 @ F_xx_xx - mov r9,r9,ror#31 - add r4,r4,r5,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r6,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r4,r4,r9 @ E+=X[i] - eor r10,r10,r3,ror#2 @ F_00_19(B,C,D) - add r4,r4,r10 @ E+=F_00_19(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r3,r8,r3,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r6,r7 @ F_xx_xx - mov r9,r9,ror#31 - add r3,r3,r4,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! 
- and r10,r5,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r3,r3,r9 @ E+=X[i] - eor r10,r10,r7,ror#2 @ F_00_19(B,C,D) - add r3,r3,r10 @ E+=F_00_19(B,C,D) - - ldr r8,LK_20_39 @ [+15+16*4] - cmn sp,#0 @ [+3], clear carry to denote 20_39 -L_20_39_or_60_79: - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r7,r8,r7,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r5,r6 @ F_xx_xx - mov r9,r9,ror#31 - add r7,r7,r3,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - eor r10,r4,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r7,r7,r9 @ E+=X[i] - add r7,r7,r10 @ E+=F_20_39(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r6,r8,r6,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r4,r5 @ F_xx_xx - mov r9,r9,ror#31 - add r6,r6,r7,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - eor r10,r3,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r6,r6,r9 @ E+=X[i] - add r6,r6,r10 @ E+=F_20_39(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r5,r8,r5,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r3,r4 @ F_xx_xx - mov r9,r9,ror#31 - add r5,r5,r6,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - eor r10,r7,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r5,r5,r9 @ E+=X[i] - add r5,r5,r10 @ E+=F_20_39(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r4,r8,r4,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r7,r3 @ F_xx_xx - mov r9,r9,ror#31 - add r4,r4,r5,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! 
- eor r10,r6,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r4,r4,r9 @ E+=X[i] - add r4,r4,r10 @ E+=F_20_39(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r3,r8,r3,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r6,r7 @ F_xx_xx - mov r9,r9,ror#31 - add r3,r3,r4,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - eor r10,r5,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r3,r3,r9 @ E+=X[i] - add r3,r3,r10 @ E+=F_20_39(B,C,D) -#if defined(__thumb2__) - mov r12,sp - teq r14,r12 -#else - teq r14,sp @ preserve carry -#endif - bne L_20_39_or_60_79 @ [+((12+3)*5+2)*4] - bcs L_done @ [+((12+3)*5+2)*4], spare 300 bytes - - ldr r8,LK_40_59 - sub sp,sp,#20*4 @ [+2] -L_40_59: - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r7,r8,r7,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r5,r6 @ F_xx_xx - mov r9,r9,ror#31 - add r7,r7,r3,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r4,r10,ror#2 @ F_xx_xx - and r11,r5,r6 @ F_xx_xx - add r7,r7,r9 @ E+=X[i] - add r7,r7,r10 @ E+=F_40_59(B,C,D) - add r7,r7,r11,ror#2 - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r6,r8,r6,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r4,r5 @ F_xx_xx - mov r9,r9,ror#31 - add r6,r6,r7,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r3,r10,ror#2 @ F_xx_xx - and r11,r4,r5 @ F_xx_xx - add r6,r6,r9 @ E+=X[i] - add r6,r6,r10 @ E+=F_40_59(B,C,D) - add r6,r6,r11,ror#2 - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r5,r8,r5,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r3,r4 @ F_xx_xx - mov r9,r9,ror#31 - add r5,r5,r6,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! 
- and r10,r7,r10,ror#2 @ F_xx_xx - and r11,r3,r4 @ F_xx_xx - add r5,r5,r9 @ E+=X[i] - add r5,r5,r10 @ E+=F_40_59(B,C,D) - add r5,r5,r11,ror#2 - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r4,r8,r4,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r7,r3 @ F_xx_xx - mov r9,r9,ror#31 - add r4,r4,r5,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r6,r10,ror#2 @ F_xx_xx - and r11,r7,r3 @ F_xx_xx - add r4,r4,r9 @ E+=X[i] - add r4,r4,r10 @ E+=F_40_59(B,C,D) - add r4,r4,r11,ror#2 - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r3,r8,r3,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r6,r7 @ F_xx_xx - mov r9,r9,ror#31 - add r3,r3,r4,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r5,r10,ror#2 @ F_xx_xx - and r11,r6,r7 @ F_xx_xx - add r3,r3,r9 @ E+=X[i] - add r3,r3,r10 @ E+=F_40_59(B,C,D) - add r3,r3,r11,ror#2 -#if defined(__thumb2__) - mov r12,sp - teq r14,r12 -#else - teq r14,sp -#endif - bne L_40_59 @ [+((12+5)*5+2)*4] - - ldr r8,LK_60_79 - sub sp,sp,#20*4 - cmp sp,#0 @ set carry to denote 60_79 - b L_20_39_or_60_79 @ [+4], spare 300 bytes -L_done: - add sp,sp,#80*4 @ "deallocate" stack frame - ldmia r0,{r8,r9,r10,r11,r12} - add r3,r8,r3 - add r4,r9,r4 - add r5,r10,r5,ror#2 - add r6,r11,r6,ror#2 - add r7,r12,r7,ror#2 - stmia r0,{r3,r4,r5,r6,r7} - teq r1,r2 - bne Lloop @ [+18], total 1307 - -#if __ARM_ARCH__>=5 - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} -#else - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet -.word 0xe12fff1e @ interoperable with Thumb ISA:-) -#endif - - -.align 5 -LK_00_19:.word 0x5a827999 -LK_20_39:.word 0x6ed9eba1 -LK_40_59:.word 0x8f1bbcdc -LK_60_79:.word 0xca62c1d6 -#if __ARM_MAX_ARCH__>=7 -LOPENSSL_armcap: -.word OPENSSL_armcap_P-Lsha1_block -#endif -.byte 
83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,47,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 -.align 5 -#if __ARM_MAX_ARCH__>=7 - - - -#ifdef __thumb2__ -.thumb_func sha1_block_data_order_neon -#endif -.align 4 -sha1_block_data_order_neon: -LNEON: - stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} - add r2,r1,r2,lsl#6 @ r2 to point at the end of r1 - @ dmb @ errata #451034 on early Cortex A8 - @ vstmdb sp!,{d8-d15} @ ABI specification says so - mov r14,sp - sub r12,sp,#64 - adr r8,LK_00_19 - bic r12,r12,#15 @ align for 128-bit stores - - ldmia r0,{r3,r4,r5,r6,r7} @ load context - mov sp,r12 @ alloca - - vld1.8 {q0,q1},[r1]! @ handles unaligned - veor q15,q15,q15 - vld1.8 {q2,q3},[r1]! - vld1.32 {d28[],d29[]},[r8,:32]! @ load K_00_19 - vrev32.8 q0,q0 @ yes, even on - vrev32.8 q1,q1 @ big-endian... - vrev32.8 q2,q2 - vadd.i32 q8,q0,q14 - vrev32.8 q3,q3 - vadd.i32 q9,q1,q14 - vst1.32 {q8},[r12,:128]! - vadd.i32 q10,q2,q14 - vst1.32 {q9},[r12,:128]! - vst1.32 {q10},[r12,:128]! - ldr r9,[sp] @ big RAW stall - -Loop_neon: - vext.8 q8,q0,q1,#8 - bic r10,r6,r4 - add r7,r7,r9 - and r11,r5,r4 - vadd.i32 q13,q3,q14 - ldr r9,[sp,#4] - add r7,r7,r3,ror#27 - vext.8 q12,q3,q15,#4 - eor r11,r11,r10 - mov r4,r4,ror#2 - add r7,r7,r11 - veor q8,q8,q0 - bic r10,r5,r3 - add r6,r6,r9 - veor q12,q12,q2 - and r11,r4,r3 - ldr r9,[sp,#8] - veor q12,q12,q8 - add r6,r6,r7,ror#27 - eor r11,r11,r10 - vst1.32 {q13},[r12,:128]! 
- sub r12,r12,#64 - mov r3,r3,ror#2 - add r6,r6,r11 - vext.8 q13,q15,q12,#4 - bic r10,r4,r7 - add r5,r5,r9 - vadd.i32 q8,q12,q12 - and r11,r3,r7 - ldr r9,[sp,#12] - vsri.32 q8,q12,#31 - add r5,r5,r6,ror#27 - eor r11,r11,r10 - mov r7,r7,ror#2 - vshr.u32 q12,q13,#30 - add r5,r5,r11 - bic r10,r3,r6 - vshl.u32 q13,q13,#2 - add r4,r4,r9 - and r11,r7,r6 - veor q8,q8,q12 - ldr r9,[sp,#16] - add r4,r4,r5,ror#27 - veor q8,q8,q13 - eor r11,r11,r10 - mov r6,r6,ror#2 - add r4,r4,r11 - vext.8 q9,q1,q2,#8 - bic r10,r7,r5 - add r3,r3,r9 - and r11,r6,r5 - vadd.i32 q13,q8,q14 - ldr r9,[sp,#20] - vld1.32 {d28[],d29[]},[r8,:32]! - add r3,r3,r4,ror#27 - vext.8 q12,q8,q15,#4 - eor r11,r11,r10 - mov r5,r5,ror#2 - add r3,r3,r11 - veor q9,q9,q1 - bic r10,r6,r4 - add r7,r7,r9 - veor q12,q12,q3 - and r11,r5,r4 - ldr r9,[sp,#24] - veor q12,q12,q9 - add r7,r7,r3,ror#27 - eor r11,r11,r10 - vst1.32 {q13},[r12,:128]! - mov r4,r4,ror#2 - add r7,r7,r11 - vext.8 q13,q15,q12,#4 - bic r10,r5,r3 - add r6,r6,r9 - vadd.i32 q9,q12,q12 - and r11,r4,r3 - ldr r9,[sp,#28] - vsri.32 q9,q12,#31 - add r6,r6,r7,ror#27 - eor r11,r11,r10 - mov r3,r3,ror#2 - vshr.u32 q12,q13,#30 - add r6,r6,r11 - bic r10,r4,r7 - vshl.u32 q13,q13,#2 - add r5,r5,r9 - and r11,r3,r7 - veor q9,q9,q12 - ldr r9,[sp,#32] - add r5,r5,r6,ror#27 - veor q9,q9,q13 - eor r11,r11,r10 - mov r7,r7,ror#2 - add r5,r5,r11 - vext.8 q10,q2,q3,#8 - bic r10,r3,r6 - add r4,r4,r9 - and r11,r7,r6 - vadd.i32 q13,q9,q14 - ldr r9,[sp,#36] - add r4,r4,r5,ror#27 - vext.8 q12,q9,q15,#4 - eor r11,r11,r10 - mov r6,r6,ror#2 - add r4,r4,r11 - veor q10,q10,q2 - bic r10,r7,r5 - add r3,r3,r9 - veor q12,q12,q8 - and r11,r6,r5 - ldr r9,[sp,#40] - veor q12,q12,q10 - add r3,r3,r4,ror#27 - eor r11,r11,r10 - vst1.32 {q13},[r12,:128]! 
- mov r5,r5,ror#2 - add r3,r3,r11 - vext.8 q13,q15,q12,#4 - bic r10,r6,r4 - add r7,r7,r9 - vadd.i32 q10,q12,q12 - and r11,r5,r4 - ldr r9,[sp,#44] - vsri.32 q10,q12,#31 - add r7,r7,r3,ror#27 - eor r11,r11,r10 - mov r4,r4,ror#2 - vshr.u32 q12,q13,#30 - add r7,r7,r11 - bic r10,r5,r3 - vshl.u32 q13,q13,#2 - add r6,r6,r9 - and r11,r4,r3 - veor q10,q10,q12 - ldr r9,[sp,#48] - add r6,r6,r7,ror#27 - veor q10,q10,q13 - eor r11,r11,r10 - mov r3,r3,ror#2 - add r6,r6,r11 - vext.8 q11,q3,q8,#8 - bic r10,r4,r7 - add r5,r5,r9 - and r11,r3,r7 - vadd.i32 q13,q10,q14 - ldr r9,[sp,#52] - add r5,r5,r6,ror#27 - vext.8 q12,q10,q15,#4 - eor r11,r11,r10 - mov r7,r7,ror#2 - add r5,r5,r11 - veor q11,q11,q3 - bic r10,r3,r6 - add r4,r4,r9 - veor q12,q12,q9 - and r11,r7,r6 - ldr r9,[sp,#56] - veor q12,q12,q11 - add r4,r4,r5,ror#27 - eor r11,r11,r10 - vst1.32 {q13},[r12,:128]! - mov r6,r6,ror#2 - add r4,r4,r11 - vext.8 q13,q15,q12,#4 - bic r10,r7,r5 - add r3,r3,r9 - vadd.i32 q11,q12,q12 - and r11,r6,r5 - ldr r9,[sp,#60] - vsri.32 q11,q12,#31 - add r3,r3,r4,ror#27 - eor r11,r11,r10 - mov r5,r5,ror#2 - vshr.u32 q12,q13,#30 - add r3,r3,r11 - bic r10,r6,r4 - vshl.u32 q13,q13,#2 - add r7,r7,r9 - and r11,r5,r4 - veor q11,q11,q12 - ldr r9,[sp,#0] - add r7,r7,r3,ror#27 - veor q11,q11,q13 - eor r11,r11,r10 - mov r4,r4,ror#2 - add r7,r7,r11 - vext.8 q12,q10,q11,#8 - bic r10,r5,r3 - add r6,r6,r9 - and r11,r4,r3 - veor q0,q0,q8 - ldr r9,[sp,#4] - add r6,r6,r7,ror#27 - veor q0,q0,q1 - eor r11,r11,r10 - mov r3,r3,ror#2 - vadd.i32 q13,q11,q14 - add r6,r6,r11 - bic r10,r4,r7 - veor q12,q12,q0 - add r5,r5,r9 - and r11,r3,r7 - vshr.u32 q0,q12,#30 - ldr r9,[sp,#8] - add r5,r5,r6,ror#27 - vst1.32 {q13},[r12,:128]! 
- sub r12,r12,#64 - eor r11,r11,r10 - mov r7,r7,ror#2 - vsli.32 q0,q12,#2 - add r5,r5,r11 - bic r10,r3,r6 - add r4,r4,r9 - and r11,r7,r6 - ldr r9,[sp,#12] - add r4,r4,r5,ror#27 - eor r11,r11,r10 - mov r6,r6,ror#2 - add r4,r4,r11 - bic r10,r7,r5 - add r3,r3,r9 - and r11,r6,r5 - ldr r9,[sp,#16] - add r3,r3,r4,ror#27 - eor r11,r11,r10 - mov r5,r5,ror#2 - add r3,r3,r11 - vext.8 q12,q11,q0,#8 - eor r10,r4,r6 - add r7,r7,r9 - ldr r9,[sp,#20] - veor q1,q1,q9 - eor r11,r10,r5 - add r7,r7,r3,ror#27 - veor q1,q1,q2 - mov r4,r4,ror#2 - add r7,r7,r11 - vadd.i32 q13,q0,q14 - eor r10,r3,r5 - add r6,r6,r9 - veor q12,q12,q1 - ldr r9,[sp,#24] - eor r11,r10,r4 - vshr.u32 q1,q12,#30 - add r6,r6,r7,ror#27 - mov r3,r3,ror#2 - vst1.32 {q13},[r12,:128]! - add r6,r6,r11 - eor r10,r7,r4 - vsli.32 q1,q12,#2 - add r5,r5,r9 - ldr r9,[sp,#28] - eor r11,r10,r3 - add r5,r5,r6,ror#27 - mov r7,r7,ror#2 - add r5,r5,r11 - eor r10,r6,r3 - add r4,r4,r9 - ldr r9,[sp,#32] - eor r11,r10,r7 - add r4,r4,r5,ror#27 - mov r6,r6,ror#2 - add r4,r4,r11 - vext.8 q12,q0,q1,#8 - eor r10,r5,r7 - add r3,r3,r9 - ldr r9,[sp,#36] - veor q2,q2,q10 - eor r11,r10,r6 - add r3,r3,r4,ror#27 - veor q2,q2,q3 - mov r5,r5,ror#2 - add r3,r3,r11 - vadd.i32 q13,q1,q14 - eor r10,r4,r6 - vld1.32 {d28[],d29[]},[r8,:32]! - add r7,r7,r9 - veor q12,q12,q2 - ldr r9,[sp,#40] - eor r11,r10,r5 - vshr.u32 q2,q12,#30 - add r7,r7,r3,ror#27 - mov r4,r4,ror#2 - vst1.32 {q13},[r12,:128]! 
- add r7,r7,r11 - eor r10,r3,r5 - vsli.32 q2,q12,#2 - add r6,r6,r9 - ldr r9,[sp,#44] - eor r11,r10,r4 - add r6,r6,r7,ror#27 - mov r3,r3,ror#2 - add r6,r6,r11 - eor r10,r7,r4 - add r5,r5,r9 - ldr r9,[sp,#48] - eor r11,r10,r3 - add r5,r5,r6,ror#27 - mov r7,r7,ror#2 - add r5,r5,r11 - vext.8 q12,q1,q2,#8 - eor r10,r6,r3 - add r4,r4,r9 - ldr r9,[sp,#52] - veor q3,q3,q11 - eor r11,r10,r7 - add r4,r4,r5,ror#27 - veor q3,q3,q8 - mov r6,r6,ror#2 - add r4,r4,r11 - vadd.i32 q13,q2,q14 - eor r10,r5,r7 - add r3,r3,r9 - veor q12,q12,q3 - ldr r9,[sp,#56] - eor r11,r10,r6 - vshr.u32 q3,q12,#30 - add r3,r3,r4,ror#27 - mov r5,r5,ror#2 - vst1.32 {q13},[r12,:128]! - add r3,r3,r11 - eor r10,r4,r6 - vsli.32 q3,q12,#2 - add r7,r7,r9 - ldr r9,[sp,#60] - eor r11,r10,r5 - add r7,r7,r3,ror#27 - mov r4,r4,ror#2 - add r7,r7,r11 - eor r10,r3,r5 - add r6,r6,r9 - ldr r9,[sp,#0] - eor r11,r10,r4 - add r6,r6,r7,ror#27 - mov r3,r3,ror#2 - add r6,r6,r11 - vext.8 q12,q2,q3,#8 - eor r10,r7,r4 - add r5,r5,r9 - ldr r9,[sp,#4] - veor q8,q8,q0 - eor r11,r10,r3 - add r5,r5,r6,ror#27 - veor q8,q8,q9 - mov r7,r7,ror#2 - add r5,r5,r11 - vadd.i32 q13,q3,q14 - eor r10,r6,r3 - add r4,r4,r9 - veor q12,q12,q8 - ldr r9,[sp,#8] - eor r11,r10,r7 - vshr.u32 q8,q12,#30 - add r4,r4,r5,ror#27 - mov r6,r6,ror#2 - vst1.32 {q13},[r12,:128]! - sub r12,r12,#64 - add r4,r4,r11 - eor r10,r5,r7 - vsli.32 q8,q12,#2 - add r3,r3,r9 - ldr r9,[sp,#12] - eor r11,r10,r6 - add r3,r3,r4,ror#27 - mov r5,r5,ror#2 - add r3,r3,r11 - eor r10,r4,r6 - add r7,r7,r9 - ldr r9,[sp,#16] - eor r11,r10,r5 - add r7,r7,r3,ror#27 - mov r4,r4,ror#2 - add r7,r7,r11 - vext.8 q12,q3,q8,#8 - eor r10,r3,r5 - add r6,r6,r9 - ldr r9,[sp,#20] - veor q9,q9,q1 - eor r11,r10,r4 - add r6,r6,r7,ror#27 - veor q9,q9,q10 - mov r3,r3,ror#2 - add r6,r6,r11 - vadd.i32 q13,q8,q14 - eor r10,r7,r4 - add r5,r5,r9 - veor q12,q12,q9 - ldr r9,[sp,#24] - eor r11,r10,r3 - vshr.u32 q9,q12,#30 - add r5,r5,r6,ror#27 - mov r7,r7,ror#2 - vst1.32 {q13},[r12,:128]! 
- add r5,r5,r11 - eor r10,r6,r3 - vsli.32 q9,q12,#2 - add r4,r4,r9 - ldr r9,[sp,#28] - eor r11,r10,r7 - add r4,r4,r5,ror#27 - mov r6,r6,ror#2 - add r4,r4,r11 - eor r10,r5,r7 - add r3,r3,r9 - ldr r9,[sp,#32] - eor r11,r10,r6 - add r3,r3,r4,ror#27 - mov r5,r5,ror#2 - add r3,r3,r11 - vext.8 q12,q8,q9,#8 - add r7,r7,r9 - and r10,r5,r6 - ldr r9,[sp,#36] - veor q10,q10,q2 - add r7,r7,r3,ror#27 - eor r11,r5,r6 - veor q10,q10,q11 - add r7,r7,r10 - and r11,r11,r4 - vadd.i32 q13,q9,q14 - mov r4,r4,ror#2 - add r7,r7,r11 - veor q12,q12,q10 - add r6,r6,r9 - and r10,r4,r5 - vshr.u32 q10,q12,#30 - ldr r9,[sp,#40] - add r6,r6,r7,ror#27 - vst1.32 {q13},[r12,:128]! - eor r11,r4,r5 - add r6,r6,r10 - vsli.32 q10,q12,#2 - and r11,r11,r3 - mov r3,r3,ror#2 - add r6,r6,r11 - add r5,r5,r9 - and r10,r3,r4 - ldr r9,[sp,#44] - add r5,r5,r6,ror#27 - eor r11,r3,r4 - add r5,r5,r10 - and r11,r11,r7 - mov r7,r7,ror#2 - add r5,r5,r11 - add r4,r4,r9 - and r10,r7,r3 - ldr r9,[sp,#48] - add r4,r4,r5,ror#27 - eor r11,r7,r3 - add r4,r4,r10 - and r11,r11,r6 - mov r6,r6,ror#2 - add r4,r4,r11 - vext.8 q12,q9,q10,#8 - add r3,r3,r9 - and r10,r6,r7 - ldr r9,[sp,#52] - veor q11,q11,q3 - add r3,r3,r4,ror#27 - eor r11,r6,r7 - veor q11,q11,q0 - add r3,r3,r10 - and r11,r11,r5 - vadd.i32 q13,q10,q14 - mov r5,r5,ror#2 - vld1.32 {d28[],d29[]},[r8,:32]! - add r3,r3,r11 - veor q12,q12,q11 - add r7,r7,r9 - and r10,r5,r6 - vshr.u32 q11,q12,#30 - ldr r9,[sp,#56] - add r7,r7,r3,ror#27 - vst1.32 {q13},[r12,:128]! 
- eor r11,r5,r6 - add r7,r7,r10 - vsli.32 q11,q12,#2 - and r11,r11,r4 - mov r4,r4,ror#2 - add r7,r7,r11 - add r6,r6,r9 - and r10,r4,r5 - ldr r9,[sp,#60] - add r6,r6,r7,ror#27 - eor r11,r4,r5 - add r6,r6,r10 - and r11,r11,r3 - mov r3,r3,ror#2 - add r6,r6,r11 - add r5,r5,r9 - and r10,r3,r4 - ldr r9,[sp,#0] - add r5,r5,r6,ror#27 - eor r11,r3,r4 - add r5,r5,r10 - and r11,r11,r7 - mov r7,r7,ror#2 - add r5,r5,r11 - vext.8 q12,q10,q11,#8 - add r4,r4,r9 - and r10,r7,r3 - ldr r9,[sp,#4] - veor q0,q0,q8 - add r4,r4,r5,ror#27 - eor r11,r7,r3 - veor q0,q0,q1 - add r4,r4,r10 - and r11,r11,r6 - vadd.i32 q13,q11,q14 - mov r6,r6,ror#2 - add r4,r4,r11 - veor q12,q12,q0 - add r3,r3,r9 - and r10,r6,r7 - vshr.u32 q0,q12,#30 - ldr r9,[sp,#8] - add r3,r3,r4,ror#27 - vst1.32 {q13},[r12,:128]! - sub r12,r12,#64 - eor r11,r6,r7 - add r3,r3,r10 - vsli.32 q0,q12,#2 - and r11,r11,r5 - mov r5,r5,ror#2 - add r3,r3,r11 - add r7,r7,r9 - and r10,r5,r6 - ldr r9,[sp,#12] - add r7,r7,r3,ror#27 - eor r11,r5,r6 - add r7,r7,r10 - and r11,r11,r4 - mov r4,r4,ror#2 - add r7,r7,r11 - add r6,r6,r9 - and r10,r4,r5 - ldr r9,[sp,#16] - add r6,r6,r7,ror#27 - eor r11,r4,r5 - add r6,r6,r10 - and r11,r11,r3 - mov r3,r3,ror#2 - add r6,r6,r11 - vext.8 q12,q11,q0,#8 - add r5,r5,r9 - and r10,r3,r4 - ldr r9,[sp,#20] - veor q1,q1,q9 - add r5,r5,r6,ror#27 - eor r11,r3,r4 - veor q1,q1,q2 - add r5,r5,r10 - and r11,r11,r7 - vadd.i32 q13,q0,q14 - mov r7,r7,ror#2 - add r5,r5,r11 - veor q12,q12,q1 - add r4,r4,r9 - and r10,r7,r3 - vshr.u32 q1,q12,#30 - ldr r9,[sp,#24] - add r4,r4,r5,ror#27 - vst1.32 {q13},[r12,:128]! 
- eor r11,r7,r3 - add r4,r4,r10 - vsli.32 q1,q12,#2 - and r11,r11,r6 - mov r6,r6,ror#2 - add r4,r4,r11 - add r3,r3,r9 - and r10,r6,r7 - ldr r9,[sp,#28] - add r3,r3,r4,ror#27 - eor r11,r6,r7 - add r3,r3,r10 - and r11,r11,r5 - mov r5,r5,ror#2 - add r3,r3,r11 - add r7,r7,r9 - and r10,r5,r6 - ldr r9,[sp,#32] - add r7,r7,r3,ror#27 - eor r11,r5,r6 - add r7,r7,r10 - and r11,r11,r4 - mov r4,r4,ror#2 - add r7,r7,r11 - vext.8 q12,q0,q1,#8 - add r6,r6,r9 - and r10,r4,r5 - ldr r9,[sp,#36] - veor q2,q2,q10 - add r6,r6,r7,ror#27 - eor r11,r4,r5 - veor q2,q2,q3 - add r6,r6,r10 - and r11,r11,r3 - vadd.i32 q13,q1,q14 - mov r3,r3,ror#2 - add r6,r6,r11 - veor q12,q12,q2 - add r5,r5,r9 - and r10,r3,r4 - vshr.u32 q2,q12,#30 - ldr r9,[sp,#40] - add r5,r5,r6,ror#27 - vst1.32 {q13},[r12,:128]! - eor r11,r3,r4 - add r5,r5,r10 - vsli.32 q2,q12,#2 - and r11,r11,r7 - mov r7,r7,ror#2 - add r5,r5,r11 - add r4,r4,r9 - and r10,r7,r3 - ldr r9,[sp,#44] - add r4,r4,r5,ror#27 - eor r11,r7,r3 - add r4,r4,r10 - and r11,r11,r6 - mov r6,r6,ror#2 - add r4,r4,r11 - add r3,r3,r9 - and r10,r6,r7 - ldr r9,[sp,#48] - add r3,r3,r4,ror#27 - eor r11,r6,r7 - add r3,r3,r10 - and r11,r11,r5 - mov r5,r5,ror#2 - add r3,r3,r11 - vext.8 q12,q1,q2,#8 - eor r10,r4,r6 - add r7,r7,r9 - ldr r9,[sp,#52] - veor q3,q3,q11 - eor r11,r10,r5 - add r7,r7,r3,ror#27 - veor q3,q3,q8 - mov r4,r4,ror#2 - add r7,r7,r11 - vadd.i32 q13,q2,q14 - eor r10,r3,r5 - add r6,r6,r9 - veor q12,q12,q3 - ldr r9,[sp,#56] - eor r11,r10,r4 - vshr.u32 q3,q12,#30 - add r6,r6,r7,ror#27 - mov r3,r3,ror#2 - vst1.32 {q13},[r12,:128]! - add r6,r6,r11 - eor r10,r7,r4 - vsli.32 q3,q12,#2 - add r5,r5,r9 - ldr r9,[sp,#60] - eor r11,r10,r3 - add r5,r5,r6,ror#27 - mov r7,r7,ror#2 - add r5,r5,r11 - eor r10,r6,r3 - add r4,r4,r9 - ldr r9,[sp,#0] - eor r11,r10,r7 - add r4,r4,r5,ror#27 - mov r6,r6,ror#2 - add r4,r4,r11 - vadd.i32 q13,q3,q14 - eor r10,r5,r7 - add r3,r3,r9 - vst1.32 {q13},[r12,:128]! 
- sub r12,r12,#64 - teq r1,r2 - sub r8,r8,#16 - it eq - subeq r1,r1,#64 - vld1.8 {q0,q1},[r1]! - ldr r9,[sp,#4] - eor r11,r10,r6 - vld1.8 {q2,q3},[r1]! - add r3,r3,r4,ror#27 - mov r5,r5,ror#2 - vld1.32 {d28[],d29[]},[r8,:32]! - add r3,r3,r11 - eor r10,r4,r6 - vrev32.8 q0,q0 - add r7,r7,r9 - ldr r9,[sp,#8] - eor r11,r10,r5 - add r7,r7,r3,ror#27 - mov r4,r4,ror#2 - add r7,r7,r11 - eor r10,r3,r5 - add r6,r6,r9 - ldr r9,[sp,#12] - eor r11,r10,r4 - add r6,r6,r7,ror#27 - mov r3,r3,ror#2 - add r6,r6,r11 - eor r10,r7,r4 - add r5,r5,r9 - ldr r9,[sp,#16] - eor r11,r10,r3 - add r5,r5,r6,ror#27 - mov r7,r7,ror#2 - add r5,r5,r11 - vrev32.8 q1,q1 - eor r10,r6,r3 - add r4,r4,r9 - vadd.i32 q8,q0,q14 - ldr r9,[sp,#20] - eor r11,r10,r7 - vst1.32 {q8},[r12,:128]! - add r4,r4,r5,ror#27 - mov r6,r6,ror#2 - add r4,r4,r11 - eor r10,r5,r7 - add r3,r3,r9 - ldr r9,[sp,#24] - eor r11,r10,r6 - add r3,r3,r4,ror#27 - mov r5,r5,ror#2 - add r3,r3,r11 - eor r10,r4,r6 - add r7,r7,r9 - ldr r9,[sp,#28] - eor r11,r10,r5 - add r7,r7,r3,ror#27 - mov r4,r4,ror#2 - add r7,r7,r11 - eor r10,r3,r5 - add r6,r6,r9 - ldr r9,[sp,#32] - eor r11,r10,r4 - add r6,r6,r7,ror#27 - mov r3,r3,ror#2 - add r6,r6,r11 - vrev32.8 q2,q2 - eor r10,r7,r4 - add r5,r5,r9 - vadd.i32 q9,q1,q14 - ldr r9,[sp,#36] - eor r11,r10,r3 - vst1.32 {q9},[r12,:128]! - add r5,r5,r6,ror#27 - mov r7,r7,ror#2 - add r5,r5,r11 - eor r10,r6,r3 - add r4,r4,r9 - ldr r9,[sp,#40] - eor r11,r10,r7 - add r4,r4,r5,ror#27 - mov r6,r6,ror#2 - add r4,r4,r11 - eor r10,r5,r7 - add r3,r3,r9 - ldr r9,[sp,#44] - eor r11,r10,r6 - add r3,r3,r4,ror#27 - mov r5,r5,ror#2 - add r3,r3,r11 - eor r10,r4,r6 - add r7,r7,r9 - ldr r9,[sp,#48] - eor r11,r10,r5 - add r7,r7,r3,ror#27 - mov r4,r4,ror#2 - add r7,r7,r11 - vrev32.8 q3,q3 - eor r10,r3,r5 - add r6,r6,r9 - vadd.i32 q10,q2,q14 - ldr r9,[sp,#52] - eor r11,r10,r4 - vst1.32 {q10},[r12,:128]! 
- add r6,r6,r7,ror#27 - mov r3,r3,ror#2 - add r6,r6,r11 - eor r10,r7,r4 - add r5,r5,r9 - ldr r9,[sp,#56] - eor r11,r10,r3 - add r5,r5,r6,ror#27 - mov r7,r7,ror#2 - add r5,r5,r11 - eor r10,r6,r3 - add r4,r4,r9 - ldr r9,[sp,#60] - eor r11,r10,r7 - add r4,r4,r5,ror#27 - mov r6,r6,ror#2 - add r4,r4,r11 - eor r10,r5,r7 - add r3,r3,r9 - eor r11,r10,r6 - add r3,r3,r4,ror#27 - mov r5,r5,ror#2 - add r3,r3,r11 - ldmia r0,{r9,r10,r11,r12} @ accumulate context - add r3,r3,r9 - ldr r9,[r0,#16] - add r4,r4,r10 - add r5,r5,r11 - add r6,r6,r12 - it eq - moveq sp,r14 - add r7,r7,r9 - it ne - ldrne r9,[sp] - stmia r0,{r3,r4,r5,r6,r7} - itt ne - addne r12,sp,#3*16 - bne Loop_neon - - @ vldmia sp!,{d8-d15} - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} - -#endif -#if __ARM_MAX_ARCH__>=7 - -# if defined(__thumb2__) -# define INST(a,b,c,d) .byte c,d|0xf,a,b -# else -# define INST(a,b,c,d) .byte a,b,c,d|0x10 -# endif - -#ifdef __thumb2__ -.thumb_func sha1_block_data_order_armv8 -#endif -.align 5 -sha1_block_data_order_armv8: -LARMv8: - vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so - - veor q1,q1,q1 - adr r3,LK_00_19 - vld1.32 {q0},[r0]! - vld1.32 {d2[0]},[r0] - sub r0,r0,#16 - vld1.32 {d16[],d17[]},[r3,:32]! - vld1.32 {d18[],d19[]},[r3,:32]! - vld1.32 {d20[],d21[]},[r3,:32]! - vld1.32 {d22[],d23[]},[r3,:32] - -Loop_v8: - vld1.8 {q4,q5},[r1]! - vld1.8 {q6,q7},[r1]! 
- vrev32.8 q4,q4 - vrev32.8 q5,q5 - - vadd.i32 q12,q8,q4 - vrev32.8 q6,q6 - vmov q14,q0 @ offload - subs r2,r2,#1 - - vadd.i32 q13,q8,q5 - vrev32.8 q7,q7 - INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 0 - INST(0x68,0x0c,0x02,0xe2) @ sha1c q0,q1,q12 - vadd.i32 q12,q8,q6 - INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6 - INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 1 - INST(0x6a,0x0c,0x06,0xe2) @ sha1c q0,q3,q13 - vadd.i32 q13,q8,q7 - INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7 - INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7 - INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 2 - INST(0x68,0x0c,0x04,0xe2) @ sha1c q0,q2,q12 - vadd.i32 q12,q8,q4 - INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4 - INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4 - INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 3 - INST(0x6a,0x0c,0x06,0xe2) @ sha1c q0,q3,q13 - vadd.i32 q13,q9,q5 - INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5 - INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5 - INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 4 - INST(0x68,0x0c,0x04,0xe2) @ sha1c q0,q2,q12 - vadd.i32 q12,q9,q6 - INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6 - INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6 - INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 5 - INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 - vadd.i32 q13,q9,q7 - INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7 - INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7 - INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 6 - INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12 - vadd.i32 q12,q9,q4 - INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4 - INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4 - INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 7 - INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 - vadd.i32 q13,q9,q5 - INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5 - INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5 - INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 8 - INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12 - vadd.i32 q12,q10,q6 - INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6 - INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6 - INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 9 - 
INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 - vadd.i32 q13,q10,q7 - INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7 - INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7 - INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 10 - INST(0x68,0x0c,0x24,0xe2) @ sha1m q0,q2,q12 - vadd.i32 q12,q10,q4 - INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4 - INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4 - INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 11 - INST(0x6a,0x0c,0x26,0xe2) @ sha1m q0,q3,q13 - vadd.i32 q13,q10,q5 - INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5 - INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5 - INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 12 - INST(0x68,0x0c,0x24,0xe2) @ sha1m q0,q2,q12 - vadd.i32 q12,q10,q6 - INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6 - INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6 - INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 13 - INST(0x6a,0x0c,0x26,0xe2) @ sha1m q0,q3,q13 - vadd.i32 q13,q11,q7 - INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7 - INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7 - INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 14 - INST(0x68,0x0c,0x24,0xe2) @ sha1m q0,q2,q12 - vadd.i32 q12,q11,q4 - INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4 - INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4 - INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 15 - INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 - vadd.i32 q13,q11,q5 - INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5 - INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5 - INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 16 - INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12 - vadd.i32 q12,q11,q6 - INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6 - INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 17 - INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 - vadd.i32 q13,q11,q7 - - INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 18 - INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12 - - INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 19 - INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 - - vadd.i32 q1,q1,q2 - vadd.i32 q0,q0,q14 - bne Loop_v8 - - vst1.32 {q0},[r0]! 
- vst1.32 {d2[0]},[r0] - - vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} - bx lr @ bx lr - -#endif -#if __ARM_MAX_ARCH__>=7 -.comm _OPENSSL_armcap_P,4 -.non_lazy_symbol_pointer -OPENSSL_armcap_P: -.indirect_symbol _OPENSSL_armcap_P -.long 0 -.private_extern _OPENSSL_armcap_P -#endif -#endif // !OPENSSL_NO_ASM diff --git a/third_party/boringssl/apple-arm/crypto/fipsmodule/sha256-armv4.S b/third_party/boringssl/apple-arm/crypto/fipsmodule/sha256-armv4.S deleted file mode 100644 index 0cf36482..00000000 --- a/third_party/boringssl/apple-arm/crypto/fipsmodule/sha256-armv4.S +++ /dev/null @@ -1,2846 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -@ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. -@ -@ Licensed under the OpenSSL license (the "License"). You may not use -@ this file except in compliance with the License. You can obtain a copy -@ in the file LICENSE in the source distribution or at -@ https://www.openssl.org/source/license.html - - -@ ==================================================================== -@ Written by Andy Polyakov for the OpenSSL -@ project. The module is, however, dual licensed under OpenSSL and -@ CRYPTOGAMS licenses depending on where you obtain it. For further -@ details see http://www.openssl.org/~appro/cryptogams/. -@ -@ Permission to use under GPL terms is granted. -@ ==================================================================== - -@ SHA256 block procedure for ARMv4. May 2007. - -@ Performance is ~2x better than gcc 3.4 generated code and in "abso- -@ lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per -@ byte [on single-issue Xscale PXA250 core]. - -@ July 2010. 
-@ -@ Rescheduling for dual-issue pipeline resulted in 22% improvement on -@ Cortex A8 core and ~20 cycles per processed byte. - -@ February 2011. -@ -@ Profiler-assisted and platform-specific optimization resulted in 16% -@ improvement on Cortex A8 core and ~15.4 cycles per processed byte. - -@ September 2013. -@ -@ Add NEON implementation. On Cortex A8 it was measured to process one -@ byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon -@ S4 does it in 12.5 cycles too, but it's 50% faster than integer-only -@ code (meaning that latter performs sub-optimally, nothing was done -@ about it). - -@ May 2014. -@ -@ Add ARMv8 code path performing at 2.0 cpb on Apple A7. - -#ifndef __KERNEL__ -# include -#else -# define __ARM_ARCH__ __LINUX_ARM_ARCH__ -# define __ARM_MAX_ARCH__ 7 -#endif - -@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both -@ ARMv7 and ARMv8 processors. It does have ARMv8-only code, but those -@ instructions are manually-encoded. (See unsha256.) 
- - -.text -#if defined(__thumb2__) -.syntax unified -.thumb -#else -.code 32 -#endif - - -.align 5 -K256: -.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 -.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 -.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 -.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 -.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc -.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da -.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 -.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 -.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 -.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 -.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 -.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 -.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 -.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 -.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 -.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - -.word 0 @ terminator -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) -LOPENSSL_armcap: -.word OPENSSL_armcap_P-Lsha256_block_data_order -#endif -.align 5 - -.globl _sha256_block_data_order -.private_extern _sha256_block_data_order -#ifdef __thumb2__ -.thumb_func _sha256_block_data_order -#endif -_sha256_block_data_order: -Lsha256_block_data_order: -#if __ARM_ARCH__<7 && !defined(__thumb2__) - sub r3,pc,#8 @ _sha256_block_data_order -#else - adr r3,Lsha256_block_data_order -#endif -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) - ldr r12,LOPENSSL_armcap - ldr r12,[r3,r12] @ OPENSSL_armcap_P -#ifdef __APPLE__ - ldr r12,[r12] -#endif - tst r12,#ARMV8_SHA256 - bne LARMv8 - tst r12,#ARMV7_NEON - bne LNEON -#endif - add r2,r1,r2,lsl#6 @ len to point at the end of inp - stmdb sp!,{r0,r1,r2,r4-r11,lr} - ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11} - sub r14,r3,#256+32 @ K256 - sub sp,sp,#16*4 @ alloca(X[16]) -Loop: -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 -# else - ldrb r2,[r1,#3] -# endif - eor r3,r5,r6 @ magic - eor r12,r12,r12 -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 0 
-# if 0==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r8,r8,ror#5 - add r4,r4,r12 @ h+=Maj(a,b,c) from the past - eor r0,r0,r8,ror#19 @ Sigma1(e) -# ifndef __ARMEB__ - rev r2,r2 -# endif -#else - @ ldrb r2,[r1,#3] @ 0 - add r4,r4,r12 @ h+=Maj(a,b,c) from the past - ldrb r12,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r12,lsl#8 - ldrb r12,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 0==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r8,r8,ror#5 - orr r2,r2,r12,lsl#24 - eor r0,r0,r8,ror#19 @ Sigma1(e) -#endif - ldr r12,[r14],#4 @ *K256++ - add r11,r11,r2 @ h+=X[i] - str r2,[sp,#0*4] - eor r2,r9,r10 - add r11,r11,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r8 - add r11,r11,r12 @ h+=K256[i] - eor r2,r2,r10 @ Ch(e,f,g) - eor r0,r4,r4,ror#11 - add r11,r11,r2 @ h+=Ch(e,f,g) -#if 0==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? -#endif -#if 0<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r4,r5 @ a^b, b^c in next round -#else - ldr r2,[sp,#2*4] @ from future BODY_16_xx - eor r12,r4,r5 @ a^b, b^c in next round - ldr r1,[sp,#15*4] @ from future BODY_16_xx -#endif - eor r0,r0,r4,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r7,r7,r11 @ d+=h - eor r3,r3,r5 @ Maj(a,b,c) - add r11,r11,r0,ror#2 @ h+=Sigma0(a) - @ add r11,r11,r3 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 1 -# if 1==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r7,r7,ror#5 - add r11,r11,r3 @ h+=Maj(a,b,c) from the past - eor r0,r0,r7,ror#19 @ Sigma1(e) -# ifndef __ARMEB__ - rev r2,r2 -# endif -#else - @ ldrb r2,[r1,#3] @ 1 - add r11,r11,r3 @ h+=Maj(a,b,c) from the past - ldrb r3,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r3,lsl#8 - ldrb r3,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 1==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r7,r7,ror#5 - orr r2,r2,r3,lsl#24 - eor r0,r0,r7,ror#19 @ Sigma1(e) -#endif - ldr r3,[r14],#4 @ *K256++ - add r10,r10,r2 @ h+=X[i] - str r2,[sp,#1*4] - eor r2,r8,r9 - add r10,r10,r0,ror#6 @ 
h+=Sigma1(e) - and r2,r2,r7 - add r10,r10,r3 @ h+=K256[i] - eor r2,r2,r9 @ Ch(e,f,g) - eor r0,r11,r11,ror#11 - add r10,r10,r2 @ h+=Ch(e,f,g) -#if 1==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? -#endif -#if 1<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r11,r4 @ a^b, b^c in next round -#else - ldr r2,[sp,#3*4] @ from future BODY_16_xx - eor r3,r11,r4 @ a^b, b^c in next round - ldr r1,[sp,#0*4] @ from future BODY_16_xx -#endif - eor r0,r0,r11,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r6,r6,r10 @ d+=h - eor r12,r12,r4 @ Maj(a,b,c) - add r10,r10,r0,ror#2 @ h+=Sigma0(a) - @ add r10,r10,r12 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 2 -# if 2==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r6,r6,ror#5 - add r10,r10,r12 @ h+=Maj(a,b,c) from the past - eor r0,r0,r6,ror#19 @ Sigma1(e) -# ifndef __ARMEB__ - rev r2,r2 -# endif -#else - @ ldrb r2,[r1,#3] @ 2 - add r10,r10,r12 @ h+=Maj(a,b,c) from the past - ldrb r12,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r12,lsl#8 - ldrb r12,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 2==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r6,r6,ror#5 - orr r2,r2,r12,lsl#24 - eor r0,r0,r6,ror#19 @ Sigma1(e) -#endif - ldr r12,[r14],#4 @ *K256++ - add r9,r9,r2 @ h+=X[i] - str r2,[sp,#2*4] - eor r2,r7,r8 - add r9,r9,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r6 - add r9,r9,r12 @ h+=K256[i] - eor r2,r2,r8 @ Ch(e,f,g) - eor r0,r10,r10,ror#11 - add r9,r9,r2 @ h+=Ch(e,f,g) -#if 2==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 2<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r10,r11 @ a^b, b^c in next round -#else - ldr r2,[sp,#4*4] @ from future BODY_16_xx - eor r12,r10,r11 @ a^b, b^c in next round - ldr r1,[sp,#1*4] @ from future BODY_16_xx -#endif - eor r0,r0,r10,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r5,r5,r9 @ d+=h - eor r3,r3,r11 @ Maj(a,b,c) - add r9,r9,r0,ror#2 @ h+=Sigma0(a) - @ add r9,r9,r3 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 3 -# if 3==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r5,r5,ror#5 - add r9,r9,r3 @ h+=Maj(a,b,c) from the past - eor r0,r0,r5,ror#19 @ Sigma1(e) -# ifndef __ARMEB__ - rev r2,r2 -# endif -#else - @ ldrb r2,[r1,#3] @ 3 - add r9,r9,r3 @ h+=Maj(a,b,c) from the past - ldrb r3,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r3,lsl#8 - ldrb r3,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 3==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r5,r5,ror#5 - orr r2,r2,r3,lsl#24 - eor r0,r0,r5,ror#19 @ Sigma1(e) -#endif - ldr r3,[r14],#4 @ *K256++ - add r8,r8,r2 @ h+=X[i] - str r2,[sp,#3*4] - eor r2,r6,r7 - add r8,r8,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r5 - add r8,r8,r3 @ h+=K256[i] - eor r2,r2,r7 @ Ch(e,f,g) - eor r0,r9,r9,ror#11 - add r8,r8,r2 @ h+=Ch(e,f,g) -#if 3==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? 
-#endif -#if 3<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r9,r10 @ a^b, b^c in next round -#else - ldr r2,[sp,#5*4] @ from future BODY_16_xx - eor r3,r9,r10 @ a^b, b^c in next round - ldr r1,[sp,#2*4] @ from future BODY_16_xx -#endif - eor r0,r0,r9,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r4,r4,r8 @ d+=h - eor r12,r12,r10 @ Maj(a,b,c) - add r8,r8,r0,ror#2 @ h+=Sigma0(a) - @ add r8,r8,r12 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 4 -# if 4==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r4,r4,ror#5 - add r8,r8,r12 @ h+=Maj(a,b,c) from the past - eor r0,r0,r4,ror#19 @ Sigma1(e) -# ifndef __ARMEB__ - rev r2,r2 -# endif -#else - @ ldrb r2,[r1,#3] @ 4 - add r8,r8,r12 @ h+=Maj(a,b,c) from the past - ldrb r12,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r12,lsl#8 - ldrb r12,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 4==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r4,r4,ror#5 - orr r2,r2,r12,lsl#24 - eor r0,r0,r4,ror#19 @ Sigma1(e) -#endif - ldr r12,[r14],#4 @ *K256++ - add r7,r7,r2 @ h+=X[i] - str r2,[sp,#4*4] - eor r2,r5,r6 - add r7,r7,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r4 - add r7,r7,r12 @ h+=K256[i] - eor r2,r2,r6 @ Ch(e,f,g) - eor r0,r8,r8,ror#11 - add r7,r7,r2 @ h+=Ch(e,f,g) -#if 4==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 4<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r8,r9 @ a^b, b^c in next round -#else - ldr r2,[sp,#6*4] @ from future BODY_16_xx - eor r12,r8,r9 @ a^b, b^c in next round - ldr r1,[sp,#3*4] @ from future BODY_16_xx -#endif - eor r0,r0,r8,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r11,r11,r7 @ d+=h - eor r3,r3,r9 @ Maj(a,b,c) - add r7,r7,r0,ror#2 @ h+=Sigma0(a) - @ add r7,r7,r3 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 5 -# if 5==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r11,r11,ror#5 - add r7,r7,r3 @ h+=Maj(a,b,c) from the past - eor r0,r0,r11,ror#19 @ Sigma1(e) -# ifndef __ARMEB__ - rev r2,r2 -# endif -#else - @ ldrb r2,[r1,#3] @ 5 - add r7,r7,r3 @ h+=Maj(a,b,c) from the past - ldrb r3,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r3,lsl#8 - ldrb r3,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 5==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r11,r11,ror#5 - orr r2,r2,r3,lsl#24 - eor r0,r0,r11,ror#19 @ Sigma1(e) -#endif - ldr r3,[r14],#4 @ *K256++ - add r6,r6,r2 @ h+=X[i] - str r2,[sp,#5*4] - eor r2,r4,r5 - add r6,r6,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r11 - add r6,r6,r3 @ h+=K256[i] - eor r2,r2,r5 @ Ch(e,f,g) - eor r0,r7,r7,ror#11 - add r6,r6,r2 @ h+=Ch(e,f,g) -#if 5==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? 
-#endif -#if 5<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r7,r8 @ a^b, b^c in next round -#else - ldr r2,[sp,#7*4] @ from future BODY_16_xx - eor r3,r7,r8 @ a^b, b^c in next round - ldr r1,[sp,#4*4] @ from future BODY_16_xx -#endif - eor r0,r0,r7,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r10,r10,r6 @ d+=h - eor r12,r12,r8 @ Maj(a,b,c) - add r6,r6,r0,ror#2 @ h+=Sigma0(a) - @ add r6,r6,r12 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 6 -# if 6==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r10,r10,ror#5 - add r6,r6,r12 @ h+=Maj(a,b,c) from the past - eor r0,r0,r10,ror#19 @ Sigma1(e) -# ifndef __ARMEB__ - rev r2,r2 -# endif -#else - @ ldrb r2,[r1,#3] @ 6 - add r6,r6,r12 @ h+=Maj(a,b,c) from the past - ldrb r12,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r12,lsl#8 - ldrb r12,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 6==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r10,r10,ror#5 - orr r2,r2,r12,lsl#24 - eor r0,r0,r10,ror#19 @ Sigma1(e) -#endif - ldr r12,[r14],#4 @ *K256++ - add r5,r5,r2 @ h+=X[i] - str r2,[sp,#6*4] - eor r2,r11,r4 - add r5,r5,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r10 - add r5,r5,r12 @ h+=K256[i] - eor r2,r2,r4 @ Ch(e,f,g) - eor r0,r6,r6,ror#11 - add r5,r5,r2 @ h+=Ch(e,f,g) -#if 6==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 6<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r6,r7 @ a^b, b^c in next round -#else - ldr r2,[sp,#8*4] @ from future BODY_16_xx - eor r12,r6,r7 @ a^b, b^c in next round - ldr r1,[sp,#5*4] @ from future BODY_16_xx -#endif - eor r0,r0,r6,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r9,r9,r5 @ d+=h - eor r3,r3,r7 @ Maj(a,b,c) - add r5,r5,r0,ror#2 @ h+=Sigma0(a) - @ add r5,r5,r3 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 7 -# if 7==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r9,r9,ror#5 - add r5,r5,r3 @ h+=Maj(a,b,c) from the past - eor r0,r0,r9,ror#19 @ Sigma1(e) -# ifndef __ARMEB__ - rev r2,r2 -# endif -#else - @ ldrb r2,[r1,#3] @ 7 - add r5,r5,r3 @ h+=Maj(a,b,c) from the past - ldrb r3,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r3,lsl#8 - ldrb r3,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 7==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r9,r9,ror#5 - orr r2,r2,r3,lsl#24 - eor r0,r0,r9,ror#19 @ Sigma1(e) -#endif - ldr r3,[r14],#4 @ *K256++ - add r4,r4,r2 @ h+=X[i] - str r2,[sp,#7*4] - eor r2,r10,r11 - add r4,r4,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r9 - add r4,r4,r3 @ h+=K256[i] - eor r2,r2,r11 @ Ch(e,f,g) - eor r0,r5,r5,ror#11 - add r4,r4,r2 @ h+=Ch(e,f,g) -#if 7==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? 
-#endif -#if 7<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r5,r6 @ a^b, b^c in next round -#else - ldr r2,[sp,#9*4] @ from future BODY_16_xx - eor r3,r5,r6 @ a^b, b^c in next round - ldr r1,[sp,#6*4] @ from future BODY_16_xx -#endif - eor r0,r0,r5,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r8,r8,r4 @ d+=h - eor r12,r12,r6 @ Maj(a,b,c) - add r4,r4,r0,ror#2 @ h+=Sigma0(a) - @ add r4,r4,r12 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 8 -# if 8==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r8,r8,ror#5 - add r4,r4,r12 @ h+=Maj(a,b,c) from the past - eor r0,r0,r8,ror#19 @ Sigma1(e) -# ifndef __ARMEB__ - rev r2,r2 -# endif -#else - @ ldrb r2,[r1,#3] @ 8 - add r4,r4,r12 @ h+=Maj(a,b,c) from the past - ldrb r12,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r12,lsl#8 - ldrb r12,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 8==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r8,r8,ror#5 - orr r2,r2,r12,lsl#24 - eor r0,r0,r8,ror#19 @ Sigma1(e) -#endif - ldr r12,[r14],#4 @ *K256++ - add r11,r11,r2 @ h+=X[i] - str r2,[sp,#8*4] - eor r2,r9,r10 - add r11,r11,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r8 - add r11,r11,r12 @ h+=K256[i] - eor r2,r2,r10 @ Ch(e,f,g) - eor r0,r4,r4,ror#11 - add r11,r11,r2 @ h+=Ch(e,f,g) -#if 8==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 8<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r4,r5 @ a^b, b^c in next round -#else - ldr r2,[sp,#10*4] @ from future BODY_16_xx - eor r12,r4,r5 @ a^b, b^c in next round - ldr r1,[sp,#7*4] @ from future BODY_16_xx -#endif - eor r0,r0,r4,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r7,r7,r11 @ d+=h - eor r3,r3,r5 @ Maj(a,b,c) - add r11,r11,r0,ror#2 @ h+=Sigma0(a) - @ add r11,r11,r3 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 9 -# if 9==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r7,r7,ror#5 - add r11,r11,r3 @ h+=Maj(a,b,c) from the past - eor r0,r0,r7,ror#19 @ Sigma1(e) -# ifndef __ARMEB__ - rev r2,r2 -# endif -#else - @ ldrb r2,[r1,#3] @ 9 - add r11,r11,r3 @ h+=Maj(a,b,c) from the past - ldrb r3,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r3,lsl#8 - ldrb r3,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 9==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r7,r7,ror#5 - orr r2,r2,r3,lsl#24 - eor r0,r0,r7,ror#19 @ Sigma1(e) -#endif - ldr r3,[r14],#4 @ *K256++ - add r10,r10,r2 @ h+=X[i] - str r2,[sp,#9*4] - eor r2,r8,r9 - add r10,r10,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r7 - add r10,r10,r3 @ h+=K256[i] - eor r2,r2,r9 @ Ch(e,f,g) - eor r0,r11,r11,ror#11 - add r10,r10,r2 @ h+=Ch(e,f,g) -#if 9==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? 
-#endif -#if 9<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r11,r4 @ a^b, b^c in next round -#else - ldr r2,[sp,#11*4] @ from future BODY_16_xx - eor r3,r11,r4 @ a^b, b^c in next round - ldr r1,[sp,#8*4] @ from future BODY_16_xx -#endif - eor r0,r0,r11,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r6,r6,r10 @ d+=h - eor r12,r12,r4 @ Maj(a,b,c) - add r10,r10,r0,ror#2 @ h+=Sigma0(a) - @ add r10,r10,r12 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 10 -# if 10==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r6,r6,ror#5 - add r10,r10,r12 @ h+=Maj(a,b,c) from the past - eor r0,r0,r6,ror#19 @ Sigma1(e) -# ifndef __ARMEB__ - rev r2,r2 -# endif -#else - @ ldrb r2,[r1,#3] @ 10 - add r10,r10,r12 @ h+=Maj(a,b,c) from the past - ldrb r12,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r12,lsl#8 - ldrb r12,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 10==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r6,r6,ror#5 - orr r2,r2,r12,lsl#24 - eor r0,r0,r6,ror#19 @ Sigma1(e) -#endif - ldr r12,[r14],#4 @ *K256++ - add r9,r9,r2 @ h+=X[i] - str r2,[sp,#10*4] - eor r2,r7,r8 - add r9,r9,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r6 - add r9,r9,r12 @ h+=K256[i] - eor r2,r2,r8 @ Ch(e,f,g) - eor r0,r10,r10,ror#11 - add r9,r9,r2 @ h+=Ch(e,f,g) -#if 10==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 10<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r10,r11 @ a^b, b^c in next round -#else - ldr r2,[sp,#12*4] @ from future BODY_16_xx - eor r12,r10,r11 @ a^b, b^c in next round - ldr r1,[sp,#9*4] @ from future BODY_16_xx -#endif - eor r0,r0,r10,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r5,r5,r9 @ d+=h - eor r3,r3,r11 @ Maj(a,b,c) - add r9,r9,r0,ror#2 @ h+=Sigma0(a) - @ add r9,r9,r3 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 11 -# if 11==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r5,r5,ror#5 - add r9,r9,r3 @ h+=Maj(a,b,c) from the past - eor r0,r0,r5,ror#19 @ Sigma1(e) -# ifndef __ARMEB__ - rev r2,r2 -# endif -#else - @ ldrb r2,[r1,#3] @ 11 - add r9,r9,r3 @ h+=Maj(a,b,c) from the past - ldrb r3,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r3,lsl#8 - ldrb r3,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 11==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r5,r5,ror#5 - orr r2,r2,r3,lsl#24 - eor r0,r0,r5,ror#19 @ Sigma1(e) -#endif - ldr r3,[r14],#4 @ *K256++ - add r8,r8,r2 @ h+=X[i] - str r2,[sp,#11*4] - eor r2,r6,r7 - add r8,r8,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r5 - add r8,r8,r3 @ h+=K256[i] - eor r2,r2,r7 @ Ch(e,f,g) - eor r0,r9,r9,ror#11 - add r8,r8,r2 @ h+=Ch(e,f,g) -#if 11==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? 
-#endif -#if 11<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r9,r10 @ a^b, b^c in next round -#else - ldr r2,[sp,#13*4] @ from future BODY_16_xx - eor r3,r9,r10 @ a^b, b^c in next round - ldr r1,[sp,#10*4] @ from future BODY_16_xx -#endif - eor r0,r0,r9,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r4,r4,r8 @ d+=h - eor r12,r12,r10 @ Maj(a,b,c) - add r8,r8,r0,ror#2 @ h+=Sigma0(a) - @ add r8,r8,r12 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 12 -# if 12==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r4,r4,ror#5 - add r8,r8,r12 @ h+=Maj(a,b,c) from the past - eor r0,r0,r4,ror#19 @ Sigma1(e) -# ifndef __ARMEB__ - rev r2,r2 -# endif -#else - @ ldrb r2,[r1,#3] @ 12 - add r8,r8,r12 @ h+=Maj(a,b,c) from the past - ldrb r12,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r12,lsl#8 - ldrb r12,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 12==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r4,r4,ror#5 - orr r2,r2,r12,lsl#24 - eor r0,r0,r4,ror#19 @ Sigma1(e) -#endif - ldr r12,[r14],#4 @ *K256++ - add r7,r7,r2 @ h+=X[i] - str r2,[sp,#12*4] - eor r2,r5,r6 - add r7,r7,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r4 - add r7,r7,r12 @ h+=K256[i] - eor r2,r2,r6 @ Ch(e,f,g) - eor r0,r8,r8,ror#11 - add r7,r7,r2 @ h+=Ch(e,f,g) -#if 12==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 12<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r8,r9 @ a^b, b^c in next round -#else - ldr r2,[sp,#14*4] @ from future BODY_16_xx - eor r12,r8,r9 @ a^b, b^c in next round - ldr r1,[sp,#11*4] @ from future BODY_16_xx -#endif - eor r0,r0,r8,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r11,r11,r7 @ d+=h - eor r3,r3,r9 @ Maj(a,b,c) - add r7,r7,r0,ror#2 @ h+=Sigma0(a) - @ add r7,r7,r3 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 13 -# if 13==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r11,r11,ror#5 - add r7,r7,r3 @ h+=Maj(a,b,c) from the past - eor r0,r0,r11,ror#19 @ Sigma1(e) -# ifndef __ARMEB__ - rev r2,r2 -# endif -#else - @ ldrb r2,[r1,#3] @ 13 - add r7,r7,r3 @ h+=Maj(a,b,c) from the past - ldrb r3,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r3,lsl#8 - ldrb r3,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 13==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r11,r11,ror#5 - orr r2,r2,r3,lsl#24 - eor r0,r0,r11,ror#19 @ Sigma1(e) -#endif - ldr r3,[r14],#4 @ *K256++ - add r6,r6,r2 @ h+=X[i] - str r2,[sp,#13*4] - eor r2,r4,r5 - add r6,r6,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r11 - add r6,r6,r3 @ h+=K256[i] - eor r2,r2,r5 @ Ch(e,f,g) - eor r0,r7,r7,ror#11 - add r6,r6,r2 @ h+=Ch(e,f,g) -#if 13==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? 
-#endif -#if 13<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r7,r8 @ a^b, b^c in next round -#else - ldr r2,[sp,#15*4] @ from future BODY_16_xx - eor r3,r7,r8 @ a^b, b^c in next round - ldr r1,[sp,#12*4] @ from future BODY_16_xx -#endif - eor r0,r0,r7,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r10,r10,r6 @ d+=h - eor r12,r12,r8 @ Maj(a,b,c) - add r6,r6,r0,ror#2 @ h+=Sigma0(a) - @ add r6,r6,r12 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 14 -# if 14==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r10,r10,ror#5 - add r6,r6,r12 @ h+=Maj(a,b,c) from the past - eor r0,r0,r10,ror#19 @ Sigma1(e) -# ifndef __ARMEB__ - rev r2,r2 -# endif -#else - @ ldrb r2,[r1,#3] @ 14 - add r6,r6,r12 @ h+=Maj(a,b,c) from the past - ldrb r12,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r12,lsl#8 - ldrb r12,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 14==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r10,r10,ror#5 - orr r2,r2,r12,lsl#24 - eor r0,r0,r10,ror#19 @ Sigma1(e) -#endif - ldr r12,[r14],#4 @ *K256++ - add r5,r5,r2 @ h+=X[i] - str r2,[sp,#14*4] - eor r2,r11,r4 - add r5,r5,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r10 - add r5,r5,r12 @ h+=K256[i] - eor r2,r2,r4 @ Ch(e,f,g) - eor r0,r6,r6,ror#11 - add r5,r5,r2 @ h+=Ch(e,f,g) -#if 14==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 14<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r6,r7 @ a^b, b^c in next round -#else - ldr r2,[sp,#0*4] @ from future BODY_16_xx - eor r12,r6,r7 @ a^b, b^c in next round - ldr r1,[sp,#13*4] @ from future BODY_16_xx -#endif - eor r0,r0,r6,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r9,r9,r5 @ d+=h - eor r3,r3,r7 @ Maj(a,b,c) - add r5,r5,r0,ror#2 @ h+=Sigma0(a) - @ add r5,r5,r3 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 15 -# if 15==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r9,r9,ror#5 - add r5,r5,r3 @ h+=Maj(a,b,c) from the past - eor r0,r0,r9,ror#19 @ Sigma1(e) -# ifndef __ARMEB__ - rev r2,r2 -# endif -#else - @ ldrb r2,[r1,#3] @ 15 - add r5,r5,r3 @ h+=Maj(a,b,c) from the past - ldrb r3,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r3,lsl#8 - ldrb r3,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 15==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r9,r9,ror#5 - orr r2,r2,r3,lsl#24 - eor r0,r0,r9,ror#19 @ Sigma1(e) -#endif - ldr r3,[r14],#4 @ *K256++ - add r4,r4,r2 @ h+=X[i] - str r2,[sp,#15*4] - eor r2,r10,r11 - add r4,r4,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r9 - add r4,r4,r3 @ h+=K256[i] - eor r2,r2,r11 @ Ch(e,f,g) - eor r0,r5,r5,ror#11 - add r4,r4,r2 @ h+=Ch(e,f,g) -#if 15==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? 
-#endif -#if 15<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r5,r6 @ a^b, b^c in next round -#else - ldr r2,[sp,#1*4] @ from future BODY_16_xx - eor r3,r5,r6 @ a^b, b^c in next round - ldr r1,[sp,#14*4] @ from future BODY_16_xx -#endif - eor r0,r0,r5,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r8,r8,r4 @ d+=h - eor r12,r12,r6 @ Maj(a,b,c) - add r4,r4,r0,ror#2 @ h+=Sigma0(a) - @ add r4,r4,r12 @ h+=Maj(a,b,c) -Lrounds_16_xx: - @ ldr r2,[sp,#1*4] @ 16 - @ ldr r1,[sp,#14*4] - mov r0,r2,ror#7 - add r4,r4,r12 @ h+=Maj(a,b,c) from the past - mov r12,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r12,r12,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#0*4] - eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#9*4] - - add r12,r12,r0 - eor r0,r8,r8,ror#5 @ from BODY_00_15 - add r2,r2,r12 - eor r0,r0,r8,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r12,[r14],#4 @ *K256++ - add r11,r11,r2 @ h+=X[i] - str r2,[sp,#0*4] - eor r2,r9,r10 - add r11,r11,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r8 - add r11,r11,r12 @ h+=K256[i] - eor r2,r2,r10 @ Ch(e,f,g) - eor r0,r4,r4,ror#11 - add r11,r11,r2 @ h+=Ch(e,f,g) -#if 16==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 16<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r4,r5 @ a^b, b^c in next round -#else - ldr r2,[sp,#2*4] @ from future BODY_16_xx - eor r12,r4,r5 @ a^b, b^c in next round - ldr r1,[sp,#15*4] @ from future BODY_16_xx -#endif - eor r0,r0,r4,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r7,r7,r11 @ d+=h - eor r3,r3,r5 @ Maj(a,b,c) - add r11,r11,r0,ror#2 @ h+=Sigma0(a) - @ add r11,r11,r3 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#2*4] @ 17 - @ ldr r1,[sp,#15*4] - mov r0,r2,ror#7 - add r11,r11,r3 @ h+=Maj(a,b,c) from the past - mov r3,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r3,r3,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#1*4] - eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#10*4] - - add r3,r3,r0 - eor r0,r7,r7,ror#5 @ from BODY_00_15 - add r2,r2,r3 - eor r0,r0,r7,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r3,[r14],#4 @ *K256++ - add r10,r10,r2 @ h+=X[i] - str r2,[sp,#1*4] - eor r2,r8,r9 - add r10,r10,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r7 - add r10,r10,r3 @ h+=K256[i] - eor r2,r2,r9 @ Ch(e,f,g) - eor r0,r11,r11,ror#11 - add r10,r10,r2 @ h+=Ch(e,f,g) -#if 17==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? 
-#endif -#if 17<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r11,r4 @ a^b, b^c in next round -#else - ldr r2,[sp,#3*4] @ from future BODY_16_xx - eor r3,r11,r4 @ a^b, b^c in next round - ldr r1,[sp,#0*4] @ from future BODY_16_xx -#endif - eor r0,r0,r11,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r6,r6,r10 @ d+=h - eor r12,r12,r4 @ Maj(a,b,c) - add r10,r10,r0,ror#2 @ h+=Sigma0(a) - @ add r10,r10,r12 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#3*4] @ 18 - @ ldr r1,[sp,#0*4] - mov r0,r2,ror#7 - add r10,r10,r12 @ h+=Maj(a,b,c) from the past - mov r12,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r12,r12,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#2*4] - eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#11*4] - - add r12,r12,r0 - eor r0,r6,r6,ror#5 @ from BODY_00_15 - add r2,r2,r12 - eor r0,r0,r6,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r12,[r14],#4 @ *K256++ - add r9,r9,r2 @ h+=X[i] - str r2,[sp,#2*4] - eor r2,r7,r8 - add r9,r9,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r6 - add r9,r9,r12 @ h+=K256[i] - eor r2,r2,r8 @ Ch(e,f,g) - eor r0,r10,r10,ror#11 - add r9,r9,r2 @ h+=Ch(e,f,g) -#if 18==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 18<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r10,r11 @ a^b, b^c in next round -#else - ldr r2,[sp,#4*4] @ from future BODY_16_xx - eor r12,r10,r11 @ a^b, b^c in next round - ldr r1,[sp,#1*4] @ from future BODY_16_xx -#endif - eor r0,r0,r10,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r5,r5,r9 @ d+=h - eor r3,r3,r11 @ Maj(a,b,c) - add r9,r9,r0,ror#2 @ h+=Sigma0(a) - @ add r9,r9,r3 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#4*4] @ 19 - @ ldr r1,[sp,#1*4] - mov r0,r2,ror#7 - add r9,r9,r3 @ h+=Maj(a,b,c) from the past - mov r3,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r3,r3,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#3*4] - eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#12*4] - - add r3,r3,r0 - eor r0,r5,r5,ror#5 @ from BODY_00_15 - add r2,r2,r3 - eor r0,r0,r5,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r3,[r14],#4 @ *K256++ - add r8,r8,r2 @ h+=X[i] - str r2,[sp,#3*4] - eor r2,r6,r7 - add r8,r8,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r5 - add r8,r8,r3 @ h+=K256[i] - eor r2,r2,r7 @ Ch(e,f,g) - eor r0,r9,r9,ror#11 - add r8,r8,r2 @ h+=Ch(e,f,g) -#if 19==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? 
-#endif -#if 19<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r9,r10 @ a^b, b^c in next round -#else - ldr r2,[sp,#5*4] @ from future BODY_16_xx - eor r3,r9,r10 @ a^b, b^c in next round - ldr r1,[sp,#2*4] @ from future BODY_16_xx -#endif - eor r0,r0,r9,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r4,r4,r8 @ d+=h - eor r12,r12,r10 @ Maj(a,b,c) - add r8,r8,r0,ror#2 @ h+=Sigma0(a) - @ add r8,r8,r12 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#5*4] @ 20 - @ ldr r1,[sp,#2*4] - mov r0,r2,ror#7 - add r8,r8,r12 @ h+=Maj(a,b,c) from the past - mov r12,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r12,r12,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#4*4] - eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#13*4] - - add r12,r12,r0 - eor r0,r4,r4,ror#5 @ from BODY_00_15 - add r2,r2,r12 - eor r0,r0,r4,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r12,[r14],#4 @ *K256++ - add r7,r7,r2 @ h+=X[i] - str r2,[sp,#4*4] - eor r2,r5,r6 - add r7,r7,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r4 - add r7,r7,r12 @ h+=K256[i] - eor r2,r2,r6 @ Ch(e,f,g) - eor r0,r8,r8,ror#11 - add r7,r7,r2 @ h+=Ch(e,f,g) -#if 20==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 20<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r8,r9 @ a^b, b^c in next round -#else - ldr r2,[sp,#6*4] @ from future BODY_16_xx - eor r12,r8,r9 @ a^b, b^c in next round - ldr r1,[sp,#3*4] @ from future BODY_16_xx -#endif - eor r0,r0,r8,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r11,r11,r7 @ d+=h - eor r3,r3,r9 @ Maj(a,b,c) - add r7,r7,r0,ror#2 @ h+=Sigma0(a) - @ add r7,r7,r3 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#6*4] @ 21 - @ ldr r1,[sp,#3*4] - mov r0,r2,ror#7 - add r7,r7,r3 @ h+=Maj(a,b,c) from the past - mov r3,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r3,r3,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#5*4] - eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#14*4] - - add r3,r3,r0 - eor r0,r11,r11,ror#5 @ from BODY_00_15 - add r2,r2,r3 - eor r0,r0,r11,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r3,[r14],#4 @ *K256++ - add r6,r6,r2 @ h+=X[i] - str r2,[sp,#5*4] - eor r2,r4,r5 - add r6,r6,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r11 - add r6,r6,r3 @ h+=K256[i] - eor r2,r2,r5 @ Ch(e,f,g) - eor r0,r7,r7,ror#11 - add r6,r6,r2 @ h+=Ch(e,f,g) -#if 21==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? 
-#endif -#if 21<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r7,r8 @ a^b, b^c in next round -#else - ldr r2,[sp,#7*4] @ from future BODY_16_xx - eor r3,r7,r8 @ a^b, b^c in next round - ldr r1,[sp,#4*4] @ from future BODY_16_xx -#endif - eor r0,r0,r7,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r10,r10,r6 @ d+=h - eor r12,r12,r8 @ Maj(a,b,c) - add r6,r6,r0,ror#2 @ h+=Sigma0(a) - @ add r6,r6,r12 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#7*4] @ 22 - @ ldr r1,[sp,#4*4] - mov r0,r2,ror#7 - add r6,r6,r12 @ h+=Maj(a,b,c) from the past - mov r12,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r12,r12,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#6*4] - eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#15*4] - - add r12,r12,r0 - eor r0,r10,r10,ror#5 @ from BODY_00_15 - add r2,r2,r12 - eor r0,r0,r10,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r12,[r14],#4 @ *K256++ - add r5,r5,r2 @ h+=X[i] - str r2,[sp,#6*4] - eor r2,r11,r4 - add r5,r5,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r10 - add r5,r5,r12 @ h+=K256[i] - eor r2,r2,r4 @ Ch(e,f,g) - eor r0,r6,r6,ror#11 - add r5,r5,r2 @ h+=Ch(e,f,g) -#if 22==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 22<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r6,r7 @ a^b, b^c in next round -#else - ldr r2,[sp,#8*4] @ from future BODY_16_xx - eor r12,r6,r7 @ a^b, b^c in next round - ldr r1,[sp,#5*4] @ from future BODY_16_xx -#endif - eor r0,r0,r6,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r9,r9,r5 @ d+=h - eor r3,r3,r7 @ Maj(a,b,c) - add r5,r5,r0,ror#2 @ h+=Sigma0(a) - @ add r5,r5,r3 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#8*4] @ 23 - @ ldr r1,[sp,#5*4] - mov r0,r2,ror#7 - add r5,r5,r3 @ h+=Maj(a,b,c) from the past - mov r3,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r3,r3,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#7*4] - eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#0*4] - - add r3,r3,r0 - eor r0,r9,r9,ror#5 @ from BODY_00_15 - add r2,r2,r3 - eor r0,r0,r9,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r3,[r14],#4 @ *K256++ - add r4,r4,r2 @ h+=X[i] - str r2,[sp,#7*4] - eor r2,r10,r11 - add r4,r4,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r9 - add r4,r4,r3 @ h+=K256[i] - eor r2,r2,r11 @ Ch(e,f,g) - eor r0,r5,r5,ror#11 - add r4,r4,r2 @ h+=Ch(e,f,g) -#if 23==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? 
-#endif -#if 23<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r5,r6 @ a^b, b^c in next round -#else - ldr r2,[sp,#9*4] @ from future BODY_16_xx - eor r3,r5,r6 @ a^b, b^c in next round - ldr r1,[sp,#6*4] @ from future BODY_16_xx -#endif - eor r0,r0,r5,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r8,r8,r4 @ d+=h - eor r12,r12,r6 @ Maj(a,b,c) - add r4,r4,r0,ror#2 @ h+=Sigma0(a) - @ add r4,r4,r12 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#9*4] @ 24 - @ ldr r1,[sp,#6*4] - mov r0,r2,ror#7 - add r4,r4,r12 @ h+=Maj(a,b,c) from the past - mov r12,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r12,r12,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#8*4] - eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#1*4] - - add r12,r12,r0 - eor r0,r8,r8,ror#5 @ from BODY_00_15 - add r2,r2,r12 - eor r0,r0,r8,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r12,[r14],#4 @ *K256++ - add r11,r11,r2 @ h+=X[i] - str r2,[sp,#8*4] - eor r2,r9,r10 - add r11,r11,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r8 - add r11,r11,r12 @ h+=K256[i] - eor r2,r2,r10 @ Ch(e,f,g) - eor r0,r4,r4,ror#11 - add r11,r11,r2 @ h+=Ch(e,f,g) -#if 24==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 24<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r4,r5 @ a^b, b^c in next round -#else - ldr r2,[sp,#10*4] @ from future BODY_16_xx - eor r12,r4,r5 @ a^b, b^c in next round - ldr r1,[sp,#7*4] @ from future BODY_16_xx -#endif - eor r0,r0,r4,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r7,r7,r11 @ d+=h - eor r3,r3,r5 @ Maj(a,b,c) - add r11,r11,r0,ror#2 @ h+=Sigma0(a) - @ add r11,r11,r3 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#10*4] @ 25 - @ ldr r1,[sp,#7*4] - mov r0,r2,ror#7 - add r11,r11,r3 @ h+=Maj(a,b,c) from the past - mov r3,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r3,r3,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#9*4] - eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#2*4] - - add r3,r3,r0 - eor r0,r7,r7,ror#5 @ from BODY_00_15 - add r2,r2,r3 - eor r0,r0,r7,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r3,[r14],#4 @ *K256++ - add r10,r10,r2 @ h+=X[i] - str r2,[sp,#9*4] - eor r2,r8,r9 - add r10,r10,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r7 - add r10,r10,r3 @ h+=K256[i] - eor r2,r2,r9 @ Ch(e,f,g) - eor r0,r11,r11,ror#11 - add r10,r10,r2 @ h+=Ch(e,f,g) -#if 25==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? 
-#endif -#if 25<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r11,r4 @ a^b, b^c in next round -#else - ldr r2,[sp,#11*4] @ from future BODY_16_xx - eor r3,r11,r4 @ a^b, b^c in next round - ldr r1,[sp,#8*4] @ from future BODY_16_xx -#endif - eor r0,r0,r11,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r6,r6,r10 @ d+=h - eor r12,r12,r4 @ Maj(a,b,c) - add r10,r10,r0,ror#2 @ h+=Sigma0(a) - @ add r10,r10,r12 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#11*4] @ 26 - @ ldr r1,[sp,#8*4] - mov r0,r2,ror#7 - add r10,r10,r12 @ h+=Maj(a,b,c) from the past - mov r12,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r12,r12,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#10*4] - eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#3*4] - - add r12,r12,r0 - eor r0,r6,r6,ror#5 @ from BODY_00_15 - add r2,r2,r12 - eor r0,r0,r6,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r12,[r14],#4 @ *K256++ - add r9,r9,r2 @ h+=X[i] - str r2,[sp,#10*4] - eor r2,r7,r8 - add r9,r9,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r6 - add r9,r9,r12 @ h+=K256[i] - eor r2,r2,r8 @ Ch(e,f,g) - eor r0,r10,r10,ror#11 - add r9,r9,r2 @ h+=Ch(e,f,g) -#if 26==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 26<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r10,r11 @ a^b, b^c in next round -#else - ldr r2,[sp,#12*4] @ from future BODY_16_xx - eor r12,r10,r11 @ a^b, b^c in next round - ldr r1,[sp,#9*4] @ from future BODY_16_xx -#endif - eor r0,r0,r10,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r5,r5,r9 @ d+=h - eor r3,r3,r11 @ Maj(a,b,c) - add r9,r9,r0,ror#2 @ h+=Sigma0(a) - @ add r9,r9,r3 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#12*4] @ 27 - @ ldr r1,[sp,#9*4] - mov r0,r2,ror#7 - add r9,r9,r3 @ h+=Maj(a,b,c) from the past - mov r3,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r3,r3,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#11*4] - eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#4*4] - - add r3,r3,r0 - eor r0,r5,r5,ror#5 @ from BODY_00_15 - add r2,r2,r3 - eor r0,r0,r5,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r3,[r14],#4 @ *K256++ - add r8,r8,r2 @ h+=X[i] - str r2,[sp,#11*4] - eor r2,r6,r7 - add r8,r8,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r5 - add r8,r8,r3 @ h+=K256[i] - eor r2,r2,r7 @ Ch(e,f,g) - eor r0,r9,r9,ror#11 - add r8,r8,r2 @ h+=Ch(e,f,g) -#if 27==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? 
-#endif -#if 27<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r9,r10 @ a^b, b^c in next round -#else - ldr r2,[sp,#13*4] @ from future BODY_16_xx - eor r3,r9,r10 @ a^b, b^c in next round - ldr r1,[sp,#10*4] @ from future BODY_16_xx -#endif - eor r0,r0,r9,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r4,r4,r8 @ d+=h - eor r12,r12,r10 @ Maj(a,b,c) - add r8,r8,r0,ror#2 @ h+=Sigma0(a) - @ add r8,r8,r12 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#13*4] @ 28 - @ ldr r1,[sp,#10*4] - mov r0,r2,ror#7 - add r8,r8,r12 @ h+=Maj(a,b,c) from the past - mov r12,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r12,r12,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#12*4] - eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#5*4] - - add r12,r12,r0 - eor r0,r4,r4,ror#5 @ from BODY_00_15 - add r2,r2,r12 - eor r0,r0,r4,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r12,[r14],#4 @ *K256++ - add r7,r7,r2 @ h+=X[i] - str r2,[sp,#12*4] - eor r2,r5,r6 - add r7,r7,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r4 - add r7,r7,r12 @ h+=K256[i] - eor r2,r2,r6 @ Ch(e,f,g) - eor r0,r8,r8,ror#11 - add r7,r7,r2 @ h+=Ch(e,f,g) -#if 28==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 28<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r8,r9 @ a^b, b^c in next round -#else - ldr r2,[sp,#14*4] @ from future BODY_16_xx - eor r12,r8,r9 @ a^b, b^c in next round - ldr r1,[sp,#11*4] @ from future BODY_16_xx -#endif - eor r0,r0,r8,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r11,r11,r7 @ d+=h - eor r3,r3,r9 @ Maj(a,b,c) - add r7,r7,r0,ror#2 @ h+=Sigma0(a) - @ add r7,r7,r3 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#14*4] @ 29 - @ ldr r1,[sp,#11*4] - mov r0,r2,ror#7 - add r7,r7,r3 @ h+=Maj(a,b,c) from the past - mov r3,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r3,r3,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#13*4] - eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#6*4] - - add r3,r3,r0 - eor r0,r11,r11,ror#5 @ from BODY_00_15 - add r2,r2,r3 - eor r0,r0,r11,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r3,[r14],#4 @ *K256++ - add r6,r6,r2 @ h+=X[i] - str r2,[sp,#13*4] - eor r2,r4,r5 - add r6,r6,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r11 - add r6,r6,r3 @ h+=K256[i] - eor r2,r2,r5 @ Ch(e,f,g) - eor r0,r7,r7,ror#11 - add r6,r6,r2 @ h+=Ch(e,f,g) -#if 29==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? 
-#endif -#if 29<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r7,r8 @ a^b, b^c in next round -#else - ldr r2,[sp,#15*4] @ from future BODY_16_xx - eor r3,r7,r8 @ a^b, b^c in next round - ldr r1,[sp,#12*4] @ from future BODY_16_xx -#endif - eor r0,r0,r7,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r10,r10,r6 @ d+=h - eor r12,r12,r8 @ Maj(a,b,c) - add r6,r6,r0,ror#2 @ h+=Sigma0(a) - @ add r6,r6,r12 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#15*4] @ 30 - @ ldr r1,[sp,#12*4] - mov r0,r2,ror#7 - add r6,r6,r12 @ h+=Maj(a,b,c) from the past - mov r12,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r12,r12,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#14*4] - eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#7*4] - - add r12,r12,r0 - eor r0,r10,r10,ror#5 @ from BODY_00_15 - add r2,r2,r12 - eor r0,r0,r10,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r12,[r14],#4 @ *K256++ - add r5,r5,r2 @ h+=X[i] - str r2,[sp,#14*4] - eor r2,r11,r4 - add r5,r5,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r10 - add r5,r5,r12 @ h+=K256[i] - eor r2,r2,r4 @ Ch(e,f,g) - eor r0,r6,r6,ror#11 - add r5,r5,r2 @ h+=Ch(e,f,g) -#if 30==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 30<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r6,r7 @ a^b, b^c in next round -#else - ldr r2,[sp,#0*4] @ from future BODY_16_xx - eor r12,r6,r7 @ a^b, b^c in next round - ldr r1,[sp,#13*4] @ from future BODY_16_xx -#endif - eor r0,r0,r6,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r9,r9,r5 @ d+=h - eor r3,r3,r7 @ Maj(a,b,c) - add r5,r5,r0,ror#2 @ h+=Sigma0(a) - @ add r5,r5,r3 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#0*4] @ 31 - @ ldr r1,[sp,#13*4] - mov r0,r2,ror#7 - add r5,r5,r3 @ h+=Maj(a,b,c) from the past - mov r3,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r3,r3,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#15*4] - eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#8*4] - - add r3,r3,r0 - eor r0,r9,r9,ror#5 @ from BODY_00_15 - add r2,r2,r3 - eor r0,r0,r9,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r3,[r14],#4 @ *K256++ - add r4,r4,r2 @ h+=X[i] - str r2,[sp,#15*4] - eor r2,r10,r11 - add r4,r4,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r9 - add r4,r4,r3 @ h+=K256[i] - eor r2,r2,r11 @ Ch(e,f,g) - eor r0,r5,r5,ror#11 - add r4,r4,r2 @ h+=Ch(e,f,g) -#if 31==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? 
-#endif -#if 31<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r5,r6 @ a^b, b^c in next round -#else - ldr r2,[sp,#1*4] @ from future BODY_16_xx - eor r3,r5,r6 @ a^b, b^c in next round - ldr r1,[sp,#14*4] @ from future BODY_16_xx -#endif - eor r0,r0,r5,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r8,r8,r4 @ d+=h - eor r12,r12,r6 @ Maj(a,b,c) - add r4,r4,r0,ror#2 @ h+=Sigma0(a) - @ add r4,r4,r12 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - ite eq @ Thumb2 thing, sanity check in ARM -#endif - ldreq r3,[sp,#16*4] @ pull ctx - bne Lrounds_16_xx - - add r4,r4,r12 @ h+=Maj(a,b,c) from the past - ldr r0,[r3,#0] - ldr r2,[r3,#4] - ldr r12,[r3,#8] - add r4,r4,r0 - ldr r0,[r3,#12] - add r5,r5,r2 - ldr r2,[r3,#16] - add r6,r6,r12 - ldr r12,[r3,#20] - add r7,r7,r0 - ldr r0,[r3,#24] - add r8,r8,r2 - ldr r2,[r3,#28] - add r9,r9,r12 - ldr r1,[sp,#17*4] @ pull inp - ldr r12,[sp,#18*4] @ pull inp+len - add r10,r10,r0 - add r11,r11,r2 - stmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} - cmp r1,r12 - sub r14,r14,#256 @ rewind Ktbl - bne Loop - - add sp,sp,#19*4 @ destroy frame -#if __ARM_ARCH__>=5 - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc} -#else - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet -.word 0xe12fff1e @ interoperable with Thumb ISA:-) -#endif - -#if __ARM_MAX_ARCH__>=7 - - - -.globl _sha256_block_data_order_neon -.private_extern _sha256_block_data_order_neon -#ifdef __thumb2__ -.thumb_func _sha256_block_data_order_neon -#endif -.align 5 -.skip 16 -_sha256_block_data_order_neon: -LNEON: - stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} - - sub r11,sp,#16*4+16 - adr r14,K256 - bic r11,r11,#15 @ align for 128-bit stores - mov r12,sp - mov sp,r11 @ alloca - add r2,r1,r2,lsl#6 @ len to point at the end of inp - - vld1.8 {q0},[r1]! - vld1.8 {q1},[r1]! - vld1.8 {q2},[r1]! - vld1.8 {q3},[r1]! - vld1.32 {q8},[r14,:128]! - vld1.32 {q9},[r14,:128]! - vld1.32 {q10},[r14,:128]! 
- vld1.32 {q11},[r14,:128]! - vrev32.8 q0,q0 @ yes, even on - str r0,[sp,#64] - vrev32.8 q1,q1 @ big-endian - str r1,[sp,#68] - mov r1,sp - vrev32.8 q2,q2 - str r2,[sp,#72] - vrev32.8 q3,q3 - str r12,[sp,#76] @ save original sp - vadd.i32 q8,q8,q0 - vadd.i32 q9,q9,q1 - vst1.32 {q8},[r1,:128]! - vadd.i32 q10,q10,q2 - vst1.32 {q9},[r1,:128]! - vadd.i32 q11,q11,q3 - vst1.32 {q10},[r1,:128]! - vst1.32 {q11},[r1,:128]! - - ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11} - sub r1,r1,#64 - ldr r2,[sp,#0] - eor r12,r12,r12 - eor r3,r5,r6 - b L_00_48 - -.align 4 -L_00_48: - vext.8 q8,q0,q1,#4 - add r11,r11,r2 - eor r2,r9,r10 - eor r0,r8,r8,ror#5 - vext.8 q9,q2,q3,#4 - add r4,r4,r12 - and r2,r2,r8 - eor r12,r0,r8,ror#19 - vshr.u32 q10,q8,#7 - eor r0,r4,r4,ror#11 - eor r2,r2,r10 - vadd.i32 q0,q0,q9 - add r11,r11,r12,ror#6 - eor r12,r4,r5 - vshr.u32 q9,q8,#3 - eor r0,r0,r4,ror#20 - add r11,r11,r2 - vsli.32 q10,q8,#25 - ldr r2,[sp,#4] - and r3,r3,r12 - vshr.u32 q11,q8,#18 - add r7,r7,r11 - add r11,r11,r0,ror#2 - eor r3,r3,r5 - veor q9,q9,q10 - add r10,r10,r2 - vsli.32 q11,q8,#14 - eor r2,r8,r9 - eor r0,r7,r7,ror#5 - vshr.u32 d24,d7,#17 - add r11,r11,r3 - and r2,r2,r7 - veor q9,q9,q11 - eor r3,r0,r7,ror#19 - eor r0,r11,r11,ror#11 - vsli.32 d24,d7,#15 - eor r2,r2,r9 - add r10,r10,r3,ror#6 - vshr.u32 d25,d7,#10 - eor r3,r11,r4 - eor r0,r0,r11,ror#20 - vadd.i32 q0,q0,q9 - add r10,r10,r2 - ldr r2,[sp,#8] - veor d25,d25,d24 - and r12,r12,r3 - add r6,r6,r10 - vshr.u32 d24,d7,#19 - add r10,r10,r0,ror#2 - eor r12,r12,r4 - vsli.32 d24,d7,#13 - add r9,r9,r2 - eor r2,r7,r8 - veor d25,d25,d24 - eor r0,r6,r6,ror#5 - add r10,r10,r12 - vadd.i32 d0,d0,d25 - and r2,r2,r6 - eor r12,r0,r6,ror#19 - vshr.u32 d24,d0,#17 - eor r0,r10,r10,ror#11 - eor r2,r2,r8 - vsli.32 d24,d0,#15 - add r9,r9,r12,ror#6 - eor r12,r10,r11 - vshr.u32 d25,d0,#10 - eor r0,r0,r10,ror#20 - add r9,r9,r2 - veor d25,d25,d24 - ldr r2,[sp,#12] - and r3,r3,r12 - vshr.u32 d24,d0,#19 - add r5,r5,r9 - add r9,r9,r0,ror#2 - eor r3,r3,r11 - vld1.32 
{q8},[r14,:128]! - add r8,r8,r2 - vsli.32 d24,d0,#13 - eor r2,r6,r7 - eor r0,r5,r5,ror#5 - veor d25,d25,d24 - add r9,r9,r3 - and r2,r2,r5 - vadd.i32 d1,d1,d25 - eor r3,r0,r5,ror#19 - eor r0,r9,r9,ror#11 - vadd.i32 q8,q8,q0 - eor r2,r2,r7 - add r8,r8,r3,ror#6 - eor r3,r9,r10 - eor r0,r0,r9,ror#20 - add r8,r8,r2 - ldr r2,[sp,#16] - and r12,r12,r3 - add r4,r4,r8 - vst1.32 {q8},[r1,:128]! - add r8,r8,r0,ror#2 - eor r12,r12,r10 - vext.8 q8,q1,q2,#4 - add r7,r7,r2 - eor r2,r5,r6 - eor r0,r4,r4,ror#5 - vext.8 q9,q3,q0,#4 - add r8,r8,r12 - and r2,r2,r4 - eor r12,r0,r4,ror#19 - vshr.u32 q10,q8,#7 - eor r0,r8,r8,ror#11 - eor r2,r2,r6 - vadd.i32 q1,q1,q9 - add r7,r7,r12,ror#6 - eor r12,r8,r9 - vshr.u32 q9,q8,#3 - eor r0,r0,r8,ror#20 - add r7,r7,r2 - vsli.32 q10,q8,#25 - ldr r2,[sp,#20] - and r3,r3,r12 - vshr.u32 q11,q8,#18 - add r11,r11,r7 - add r7,r7,r0,ror#2 - eor r3,r3,r9 - veor q9,q9,q10 - add r6,r6,r2 - vsli.32 q11,q8,#14 - eor r2,r4,r5 - eor r0,r11,r11,ror#5 - vshr.u32 d24,d1,#17 - add r7,r7,r3 - and r2,r2,r11 - veor q9,q9,q11 - eor r3,r0,r11,ror#19 - eor r0,r7,r7,ror#11 - vsli.32 d24,d1,#15 - eor r2,r2,r5 - add r6,r6,r3,ror#6 - vshr.u32 d25,d1,#10 - eor r3,r7,r8 - eor r0,r0,r7,ror#20 - vadd.i32 q1,q1,q9 - add r6,r6,r2 - ldr r2,[sp,#24] - veor d25,d25,d24 - and r12,r12,r3 - add r10,r10,r6 - vshr.u32 d24,d1,#19 - add r6,r6,r0,ror#2 - eor r12,r12,r8 - vsli.32 d24,d1,#13 - add r5,r5,r2 - eor r2,r11,r4 - veor d25,d25,d24 - eor r0,r10,r10,ror#5 - add r6,r6,r12 - vadd.i32 d2,d2,d25 - and r2,r2,r10 - eor r12,r0,r10,ror#19 - vshr.u32 d24,d2,#17 - eor r0,r6,r6,ror#11 - eor r2,r2,r4 - vsli.32 d24,d2,#15 - add r5,r5,r12,ror#6 - eor r12,r6,r7 - vshr.u32 d25,d2,#10 - eor r0,r0,r6,ror#20 - add r5,r5,r2 - veor d25,d25,d24 - ldr r2,[sp,#28] - and r3,r3,r12 - vshr.u32 d24,d2,#19 - add r9,r9,r5 - add r5,r5,r0,ror#2 - eor r3,r3,r7 - vld1.32 {q8},[r14,:128]! 
- add r4,r4,r2 - vsli.32 d24,d2,#13 - eor r2,r10,r11 - eor r0,r9,r9,ror#5 - veor d25,d25,d24 - add r5,r5,r3 - and r2,r2,r9 - vadd.i32 d3,d3,d25 - eor r3,r0,r9,ror#19 - eor r0,r5,r5,ror#11 - vadd.i32 q8,q8,q1 - eor r2,r2,r11 - add r4,r4,r3,ror#6 - eor r3,r5,r6 - eor r0,r0,r5,ror#20 - add r4,r4,r2 - ldr r2,[sp,#32] - and r12,r12,r3 - add r8,r8,r4 - vst1.32 {q8},[r1,:128]! - add r4,r4,r0,ror#2 - eor r12,r12,r6 - vext.8 q8,q2,q3,#4 - add r11,r11,r2 - eor r2,r9,r10 - eor r0,r8,r8,ror#5 - vext.8 q9,q0,q1,#4 - add r4,r4,r12 - and r2,r2,r8 - eor r12,r0,r8,ror#19 - vshr.u32 q10,q8,#7 - eor r0,r4,r4,ror#11 - eor r2,r2,r10 - vadd.i32 q2,q2,q9 - add r11,r11,r12,ror#6 - eor r12,r4,r5 - vshr.u32 q9,q8,#3 - eor r0,r0,r4,ror#20 - add r11,r11,r2 - vsli.32 q10,q8,#25 - ldr r2,[sp,#36] - and r3,r3,r12 - vshr.u32 q11,q8,#18 - add r7,r7,r11 - add r11,r11,r0,ror#2 - eor r3,r3,r5 - veor q9,q9,q10 - add r10,r10,r2 - vsli.32 q11,q8,#14 - eor r2,r8,r9 - eor r0,r7,r7,ror#5 - vshr.u32 d24,d3,#17 - add r11,r11,r3 - and r2,r2,r7 - veor q9,q9,q11 - eor r3,r0,r7,ror#19 - eor r0,r11,r11,ror#11 - vsli.32 d24,d3,#15 - eor r2,r2,r9 - add r10,r10,r3,ror#6 - vshr.u32 d25,d3,#10 - eor r3,r11,r4 - eor r0,r0,r11,ror#20 - vadd.i32 q2,q2,q9 - add r10,r10,r2 - ldr r2,[sp,#40] - veor d25,d25,d24 - and r12,r12,r3 - add r6,r6,r10 - vshr.u32 d24,d3,#19 - add r10,r10,r0,ror#2 - eor r12,r12,r4 - vsli.32 d24,d3,#13 - add r9,r9,r2 - eor r2,r7,r8 - veor d25,d25,d24 - eor r0,r6,r6,ror#5 - add r10,r10,r12 - vadd.i32 d4,d4,d25 - and r2,r2,r6 - eor r12,r0,r6,ror#19 - vshr.u32 d24,d4,#17 - eor r0,r10,r10,ror#11 - eor r2,r2,r8 - vsli.32 d24,d4,#15 - add r9,r9,r12,ror#6 - eor r12,r10,r11 - vshr.u32 d25,d4,#10 - eor r0,r0,r10,ror#20 - add r9,r9,r2 - veor d25,d25,d24 - ldr r2,[sp,#44] - and r3,r3,r12 - vshr.u32 d24,d4,#19 - add r5,r5,r9 - add r9,r9,r0,ror#2 - eor r3,r3,r11 - vld1.32 {q8},[r14,:128]! 
- add r8,r8,r2 - vsli.32 d24,d4,#13 - eor r2,r6,r7 - eor r0,r5,r5,ror#5 - veor d25,d25,d24 - add r9,r9,r3 - and r2,r2,r5 - vadd.i32 d5,d5,d25 - eor r3,r0,r5,ror#19 - eor r0,r9,r9,ror#11 - vadd.i32 q8,q8,q2 - eor r2,r2,r7 - add r8,r8,r3,ror#6 - eor r3,r9,r10 - eor r0,r0,r9,ror#20 - add r8,r8,r2 - ldr r2,[sp,#48] - and r12,r12,r3 - add r4,r4,r8 - vst1.32 {q8},[r1,:128]! - add r8,r8,r0,ror#2 - eor r12,r12,r10 - vext.8 q8,q3,q0,#4 - add r7,r7,r2 - eor r2,r5,r6 - eor r0,r4,r4,ror#5 - vext.8 q9,q1,q2,#4 - add r8,r8,r12 - and r2,r2,r4 - eor r12,r0,r4,ror#19 - vshr.u32 q10,q8,#7 - eor r0,r8,r8,ror#11 - eor r2,r2,r6 - vadd.i32 q3,q3,q9 - add r7,r7,r12,ror#6 - eor r12,r8,r9 - vshr.u32 q9,q8,#3 - eor r0,r0,r8,ror#20 - add r7,r7,r2 - vsli.32 q10,q8,#25 - ldr r2,[sp,#52] - and r3,r3,r12 - vshr.u32 q11,q8,#18 - add r11,r11,r7 - add r7,r7,r0,ror#2 - eor r3,r3,r9 - veor q9,q9,q10 - add r6,r6,r2 - vsli.32 q11,q8,#14 - eor r2,r4,r5 - eor r0,r11,r11,ror#5 - vshr.u32 d24,d5,#17 - add r7,r7,r3 - and r2,r2,r11 - veor q9,q9,q11 - eor r3,r0,r11,ror#19 - eor r0,r7,r7,ror#11 - vsli.32 d24,d5,#15 - eor r2,r2,r5 - add r6,r6,r3,ror#6 - vshr.u32 d25,d5,#10 - eor r3,r7,r8 - eor r0,r0,r7,ror#20 - vadd.i32 q3,q3,q9 - add r6,r6,r2 - ldr r2,[sp,#56] - veor d25,d25,d24 - and r12,r12,r3 - add r10,r10,r6 - vshr.u32 d24,d5,#19 - add r6,r6,r0,ror#2 - eor r12,r12,r8 - vsli.32 d24,d5,#13 - add r5,r5,r2 - eor r2,r11,r4 - veor d25,d25,d24 - eor r0,r10,r10,ror#5 - add r6,r6,r12 - vadd.i32 d6,d6,d25 - and r2,r2,r10 - eor r12,r0,r10,ror#19 - vshr.u32 d24,d6,#17 - eor r0,r6,r6,ror#11 - eor r2,r2,r4 - vsli.32 d24,d6,#15 - add r5,r5,r12,ror#6 - eor r12,r6,r7 - vshr.u32 d25,d6,#10 - eor r0,r0,r6,ror#20 - add r5,r5,r2 - veor d25,d25,d24 - ldr r2,[sp,#60] - and r3,r3,r12 - vshr.u32 d24,d6,#19 - add r9,r9,r5 - add r5,r5,r0,ror#2 - eor r3,r3,r7 - vld1.32 {q8},[r14,:128]! 
- add r4,r4,r2 - vsli.32 d24,d6,#13 - eor r2,r10,r11 - eor r0,r9,r9,ror#5 - veor d25,d25,d24 - add r5,r5,r3 - and r2,r2,r9 - vadd.i32 d7,d7,d25 - eor r3,r0,r9,ror#19 - eor r0,r5,r5,ror#11 - vadd.i32 q8,q8,q3 - eor r2,r2,r11 - add r4,r4,r3,ror#6 - eor r3,r5,r6 - eor r0,r0,r5,ror#20 - add r4,r4,r2 - ldr r2,[r14] - and r12,r12,r3 - add r8,r8,r4 - vst1.32 {q8},[r1,:128]! - add r4,r4,r0,ror#2 - eor r12,r12,r6 - teq r2,#0 @ check for K256 terminator - ldr r2,[sp,#0] - sub r1,r1,#64 - bne L_00_48 - - ldr r1,[sp,#68] - ldr r0,[sp,#72] - sub r14,r14,#256 @ rewind r14 - teq r1,r0 - it eq - subeq r1,r1,#64 @ avoid SEGV - vld1.8 {q0},[r1]! @ load next input block - vld1.8 {q1},[r1]! - vld1.8 {q2},[r1]! - vld1.8 {q3},[r1]! - it ne - strne r1,[sp,#68] - mov r1,sp - add r11,r11,r2 - eor r2,r9,r10 - eor r0,r8,r8,ror#5 - add r4,r4,r12 - vld1.32 {q8},[r14,:128]! - and r2,r2,r8 - eor r12,r0,r8,ror#19 - eor r0,r4,r4,ror#11 - eor r2,r2,r10 - vrev32.8 q0,q0 - add r11,r11,r12,ror#6 - eor r12,r4,r5 - eor r0,r0,r4,ror#20 - add r11,r11,r2 - vadd.i32 q8,q8,q0 - ldr r2,[sp,#4] - and r3,r3,r12 - add r7,r7,r11 - add r11,r11,r0,ror#2 - eor r3,r3,r5 - add r10,r10,r2 - eor r2,r8,r9 - eor r0,r7,r7,ror#5 - add r11,r11,r3 - and r2,r2,r7 - eor r3,r0,r7,ror#19 - eor r0,r11,r11,ror#11 - eor r2,r2,r9 - add r10,r10,r3,ror#6 - eor r3,r11,r4 - eor r0,r0,r11,ror#20 - add r10,r10,r2 - ldr r2,[sp,#8] - and r12,r12,r3 - add r6,r6,r10 - add r10,r10,r0,ror#2 - eor r12,r12,r4 - add r9,r9,r2 - eor r2,r7,r8 - eor r0,r6,r6,ror#5 - add r10,r10,r12 - and r2,r2,r6 - eor r12,r0,r6,ror#19 - eor r0,r10,r10,ror#11 - eor r2,r2,r8 - add r9,r9,r12,ror#6 - eor r12,r10,r11 - eor r0,r0,r10,ror#20 - add r9,r9,r2 - ldr r2,[sp,#12] - and r3,r3,r12 - add r5,r5,r9 - add r9,r9,r0,ror#2 - eor r3,r3,r11 - add r8,r8,r2 - eor r2,r6,r7 - eor r0,r5,r5,ror#5 - add r9,r9,r3 - and r2,r2,r5 - eor r3,r0,r5,ror#19 - eor r0,r9,r9,ror#11 - eor r2,r2,r7 - add r8,r8,r3,ror#6 - eor r3,r9,r10 - eor r0,r0,r9,ror#20 - add r8,r8,r2 - ldr r2,[sp,#16] - and 
r12,r12,r3 - add r4,r4,r8 - add r8,r8,r0,ror#2 - eor r12,r12,r10 - vst1.32 {q8},[r1,:128]! - add r7,r7,r2 - eor r2,r5,r6 - eor r0,r4,r4,ror#5 - add r8,r8,r12 - vld1.32 {q8},[r14,:128]! - and r2,r2,r4 - eor r12,r0,r4,ror#19 - eor r0,r8,r8,ror#11 - eor r2,r2,r6 - vrev32.8 q1,q1 - add r7,r7,r12,ror#6 - eor r12,r8,r9 - eor r0,r0,r8,ror#20 - add r7,r7,r2 - vadd.i32 q8,q8,q1 - ldr r2,[sp,#20] - and r3,r3,r12 - add r11,r11,r7 - add r7,r7,r0,ror#2 - eor r3,r3,r9 - add r6,r6,r2 - eor r2,r4,r5 - eor r0,r11,r11,ror#5 - add r7,r7,r3 - and r2,r2,r11 - eor r3,r0,r11,ror#19 - eor r0,r7,r7,ror#11 - eor r2,r2,r5 - add r6,r6,r3,ror#6 - eor r3,r7,r8 - eor r0,r0,r7,ror#20 - add r6,r6,r2 - ldr r2,[sp,#24] - and r12,r12,r3 - add r10,r10,r6 - add r6,r6,r0,ror#2 - eor r12,r12,r8 - add r5,r5,r2 - eor r2,r11,r4 - eor r0,r10,r10,ror#5 - add r6,r6,r12 - and r2,r2,r10 - eor r12,r0,r10,ror#19 - eor r0,r6,r6,ror#11 - eor r2,r2,r4 - add r5,r5,r12,ror#6 - eor r12,r6,r7 - eor r0,r0,r6,ror#20 - add r5,r5,r2 - ldr r2,[sp,#28] - and r3,r3,r12 - add r9,r9,r5 - add r5,r5,r0,ror#2 - eor r3,r3,r7 - add r4,r4,r2 - eor r2,r10,r11 - eor r0,r9,r9,ror#5 - add r5,r5,r3 - and r2,r2,r9 - eor r3,r0,r9,ror#19 - eor r0,r5,r5,ror#11 - eor r2,r2,r11 - add r4,r4,r3,ror#6 - eor r3,r5,r6 - eor r0,r0,r5,ror#20 - add r4,r4,r2 - ldr r2,[sp,#32] - and r12,r12,r3 - add r8,r8,r4 - add r4,r4,r0,ror#2 - eor r12,r12,r6 - vst1.32 {q8},[r1,:128]! - add r11,r11,r2 - eor r2,r9,r10 - eor r0,r8,r8,ror#5 - add r4,r4,r12 - vld1.32 {q8},[r14,:128]! 
- and r2,r2,r8 - eor r12,r0,r8,ror#19 - eor r0,r4,r4,ror#11 - eor r2,r2,r10 - vrev32.8 q2,q2 - add r11,r11,r12,ror#6 - eor r12,r4,r5 - eor r0,r0,r4,ror#20 - add r11,r11,r2 - vadd.i32 q8,q8,q2 - ldr r2,[sp,#36] - and r3,r3,r12 - add r7,r7,r11 - add r11,r11,r0,ror#2 - eor r3,r3,r5 - add r10,r10,r2 - eor r2,r8,r9 - eor r0,r7,r7,ror#5 - add r11,r11,r3 - and r2,r2,r7 - eor r3,r0,r7,ror#19 - eor r0,r11,r11,ror#11 - eor r2,r2,r9 - add r10,r10,r3,ror#6 - eor r3,r11,r4 - eor r0,r0,r11,ror#20 - add r10,r10,r2 - ldr r2,[sp,#40] - and r12,r12,r3 - add r6,r6,r10 - add r10,r10,r0,ror#2 - eor r12,r12,r4 - add r9,r9,r2 - eor r2,r7,r8 - eor r0,r6,r6,ror#5 - add r10,r10,r12 - and r2,r2,r6 - eor r12,r0,r6,ror#19 - eor r0,r10,r10,ror#11 - eor r2,r2,r8 - add r9,r9,r12,ror#6 - eor r12,r10,r11 - eor r0,r0,r10,ror#20 - add r9,r9,r2 - ldr r2,[sp,#44] - and r3,r3,r12 - add r5,r5,r9 - add r9,r9,r0,ror#2 - eor r3,r3,r11 - add r8,r8,r2 - eor r2,r6,r7 - eor r0,r5,r5,ror#5 - add r9,r9,r3 - and r2,r2,r5 - eor r3,r0,r5,ror#19 - eor r0,r9,r9,ror#11 - eor r2,r2,r7 - add r8,r8,r3,ror#6 - eor r3,r9,r10 - eor r0,r0,r9,ror#20 - add r8,r8,r2 - ldr r2,[sp,#48] - and r12,r12,r3 - add r4,r4,r8 - add r8,r8,r0,ror#2 - eor r12,r12,r10 - vst1.32 {q8},[r1,:128]! - add r7,r7,r2 - eor r2,r5,r6 - eor r0,r4,r4,ror#5 - add r8,r8,r12 - vld1.32 {q8},[r14,:128]! 
- and r2,r2,r4 - eor r12,r0,r4,ror#19 - eor r0,r8,r8,ror#11 - eor r2,r2,r6 - vrev32.8 q3,q3 - add r7,r7,r12,ror#6 - eor r12,r8,r9 - eor r0,r0,r8,ror#20 - add r7,r7,r2 - vadd.i32 q8,q8,q3 - ldr r2,[sp,#52] - and r3,r3,r12 - add r11,r11,r7 - add r7,r7,r0,ror#2 - eor r3,r3,r9 - add r6,r6,r2 - eor r2,r4,r5 - eor r0,r11,r11,ror#5 - add r7,r7,r3 - and r2,r2,r11 - eor r3,r0,r11,ror#19 - eor r0,r7,r7,ror#11 - eor r2,r2,r5 - add r6,r6,r3,ror#6 - eor r3,r7,r8 - eor r0,r0,r7,ror#20 - add r6,r6,r2 - ldr r2,[sp,#56] - and r12,r12,r3 - add r10,r10,r6 - add r6,r6,r0,ror#2 - eor r12,r12,r8 - add r5,r5,r2 - eor r2,r11,r4 - eor r0,r10,r10,ror#5 - add r6,r6,r12 - and r2,r2,r10 - eor r12,r0,r10,ror#19 - eor r0,r6,r6,ror#11 - eor r2,r2,r4 - add r5,r5,r12,ror#6 - eor r12,r6,r7 - eor r0,r0,r6,ror#20 - add r5,r5,r2 - ldr r2,[sp,#60] - and r3,r3,r12 - add r9,r9,r5 - add r5,r5,r0,ror#2 - eor r3,r3,r7 - add r4,r4,r2 - eor r2,r10,r11 - eor r0,r9,r9,ror#5 - add r5,r5,r3 - and r2,r2,r9 - eor r3,r0,r9,ror#19 - eor r0,r5,r5,ror#11 - eor r2,r2,r11 - add r4,r4,r3,ror#6 - eor r3,r5,r6 - eor r0,r0,r5,ror#20 - add r4,r4,r2 - ldr r2,[sp,#64] - and r12,r12,r3 - add r8,r8,r4 - add r4,r4,r0,ror#2 - eor r12,r12,r6 - vst1.32 {q8},[r1,:128]! 
- ldr r0,[r2,#0] - add r4,r4,r12 @ h+=Maj(a,b,c) from the past - ldr r12,[r2,#4] - ldr r3,[r2,#8] - ldr r1,[r2,#12] - add r4,r4,r0 @ accumulate - ldr r0,[r2,#16] - add r5,r5,r12 - ldr r12,[r2,#20] - add r6,r6,r3 - ldr r3,[r2,#24] - add r7,r7,r1 - ldr r1,[r2,#28] - add r8,r8,r0 - str r4,[r2],#4 - add r9,r9,r12 - str r5,[r2],#4 - add r10,r10,r3 - str r6,[r2],#4 - add r11,r11,r1 - str r7,[r2],#4 - stmia r2,{r8,r9,r10,r11} - - ittte ne - movne r1,sp - ldrne r2,[sp,#0] - eorne r12,r12,r12 - ldreq sp,[sp,#76] @ restore original sp - itt ne - eorne r3,r5,r6 - bne L_00_48 - - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} - -#endif -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) - -# if defined(__thumb2__) -# define INST(a,b,c,d) .byte c,d|0xc,a,b -# else -# define INST(a,b,c,d) .byte a,b,c,d -# endif - -#ifdef __thumb2__ -.thumb_func sha256_block_data_order_armv8 -#endif -.align 5 -sha256_block_data_order_armv8: -LARMv8: - vld1.32 {q0,q1},[r0] - sub r3,r3,#256+32 - add r2,r1,r2,lsl#6 @ len to point at the end of inp - b Loop_v8 - -.align 4 -Loop_v8: - vld1.8 {q8,q9},[r1]! - vld1.8 {q10,q11},[r1]! - vld1.32 {q12},[r3]! - vrev32.8 q8,q8 - vrev32.8 q9,q9 - vrev32.8 q10,q10 - vrev32.8 q11,q11 - vmov q14,q0 @ offload - vmov q15,q1 - teq r1,r2 - vld1.32 {q13},[r3]! - vadd.i32 q12,q12,q8 - INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 - vmov q2,q0 - INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 - INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 - INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 - vld1.32 {q12},[r3]! - vadd.i32 q13,q13,q9 - INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 - vmov q2,q0 - INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 - INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 - INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 - vld1.32 {q13},[r3]! 
- vadd.i32 q12,q12,q10 - INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 - vmov q2,q0 - INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 - INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 - INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 - vld1.32 {q12},[r3]! - vadd.i32 q13,q13,q11 - INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 - vmov q2,q0 - INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 - INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 - INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 - vld1.32 {q13},[r3]! - vadd.i32 q12,q12,q8 - INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 - vmov q2,q0 - INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 - INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 - INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 - vld1.32 {q12},[r3]! - vadd.i32 q13,q13,q9 - INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 - vmov q2,q0 - INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 - INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 - INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 - vld1.32 {q13},[r3]! - vadd.i32 q12,q12,q10 - INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 - vmov q2,q0 - INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 - INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 - INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 - vld1.32 {q12},[r3]! - vadd.i32 q13,q13,q11 - INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 - vmov q2,q0 - INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 - INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 - INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 - vld1.32 {q13},[r3]! - vadd.i32 q12,q12,q8 - INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 - vmov q2,q0 - INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 - INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 - INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 - vld1.32 {q12},[r3]! 
- vadd.i32 q13,q13,q9 - INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 - vmov q2,q0 - INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 - INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 - INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 - vld1.32 {q13},[r3]! - vadd.i32 q12,q12,q10 - INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 - vmov q2,q0 - INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 - INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 - INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 - vld1.32 {q12},[r3]! - vadd.i32 q13,q13,q11 - INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 - vmov q2,q0 - INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 - INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 - INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 - vld1.32 {q13},[r3]! - vadd.i32 q12,q12,q8 - vmov q2,q0 - INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 - INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 - - vld1.32 {q12},[r3]! - vadd.i32 q13,q13,q9 - vmov q2,q0 - INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 - INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 - - vld1.32 {q13},[r3] - vadd.i32 q12,q12,q10 - sub r3,r3,#256-16 @ rewind - vmov q2,q0 - INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 - INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 - - vadd.i32 q13,q13,q11 - vmov q2,q0 - INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 - INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 - - vadd.i32 q0,q0,q14 - vadd.i32 q1,q1,q15 - it ne - bne Loop_v8 - - vst1.32 {q0,q1},[r0] - - bx lr @ bx lr - -#endif -.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,47,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 -.align 2 -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) -.comm _OPENSSL_armcap_P,4 -.non_lazy_symbol_pointer -OPENSSL_armcap_P: -.indirect_symbol _OPENSSL_armcap_P -.long 0 -.private_extern _OPENSSL_armcap_P -#endif -#endif // 
!OPENSSL_NO_ASM diff --git a/third_party/boringssl/apple-arm/crypto/fipsmodule/sha512-armv4.S b/third_party/boringssl/apple-arm/crypto/fipsmodule/sha512-armv4.S deleted file mode 100644 index 21913cb2..00000000 --- a/third_party/boringssl/apple-arm/crypto/fipsmodule/sha512-armv4.S +++ /dev/null @@ -1,1899 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -@ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. -@ -@ Licensed under the OpenSSL license (the "License"). You may not use -@ this file except in compliance with the License. You can obtain a copy -@ in the file LICENSE in the source distribution or at -@ https://www.openssl.org/source/license.html - - -@ ==================================================================== -@ Written by Andy Polyakov for the OpenSSL -@ project. The module is, however, dual licensed under OpenSSL and -@ CRYPTOGAMS licenses depending on where you obtain it. For further -@ details see http://www.openssl.org/~appro/cryptogams/. -@ -@ Permission to use under GPL terms is granted. -@ ==================================================================== - -@ SHA512 block procedure for ARMv4. September 2007. - -@ This code is ~4.5 (four and a half) times faster than code generated -@ by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue -@ Xscale PXA250 core]. -@ -@ July 2010. -@ -@ Rescheduling for dual-issue pipeline resulted in 6% improvement on -@ Cortex A8 core and ~40 cycles per processed byte. - -@ February 2011. -@ -@ Profiler-assisted and platform-specific optimization resulted in 7% -@ improvement on Coxtex A8 core and ~38 cycles per byte. - -@ March 2011. 
-@ -@ Add NEON implementation. On Cortex A8 it was measured to process -@ one byte in 23.3 cycles or ~60% faster than integer-only code. - -@ August 2012. -@ -@ Improve NEON performance by 12% on Snapdragon S4. In absolute -@ terms it's 22.6 cycles per byte, which is disappointing result. -@ Technical writers asserted that 3-way S4 pipeline can sustain -@ multiple NEON instructions per cycle, but dual NEON issue could -@ not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html -@ for further details. On side note Cortex-A15 processes one byte in -@ 16 cycles. - -@ Byte order [in]dependence. ========================================= -@ -@ Originally caller was expected to maintain specific *dword* order in -@ h[0-7], namely with most significant dword at *lower* address, which -@ was reflected in below two parameters as 0 and 4. Now caller is -@ expected to maintain native byte order for whole 64-bit values. -#ifndef __KERNEL__ -# include -# define VFP_ABI_PUSH vstmdb sp!,{d8-d15} -# define VFP_ABI_POP vldmia sp!,{d8-d15} -#else -# define __ARM_ARCH__ __LINUX_ARM_ARCH__ -# define __ARM_MAX_ARCH__ 7 -# define VFP_ABI_PUSH -# define VFP_ABI_POP -#endif - -@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both -@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. 
- - -#ifdef __ARMEL__ -# define LO 0 -# define HI 4 -# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1 -#else -# define HI 0 -# define LO 4 -# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1 -#endif - -.text -#if defined(__thumb2__) -.syntax unified -.thumb -# define adrl adr -#else -.code 32 -#endif - - -.align 5 -K512: - WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd) - WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc) - WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019) - WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118) - WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe) - WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2) - WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1) - WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694) - WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3) - WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65) - WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483) - WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5) - WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210) - WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4) - WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725) - WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70) - WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926) - WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df) - WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8) - WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b) - WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001) - WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30) - WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910) - WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8) - WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53) - WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8) - WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb) - WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3) - WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60) - WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec) - 
WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9) - WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b) - WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207) - WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178) - WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6) - WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b) - WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493) - WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c) - WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a) - WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) - -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) -LOPENSSL_armcap: -.word OPENSSL_armcap_P-Lsha512_block_data_order -.skip 32-4 -#else -.skip 32 -#endif - -.globl _sha512_block_data_order -.private_extern _sha512_block_data_order -#ifdef __thumb2__ -.thumb_func _sha512_block_data_order -#endif -_sha512_block_data_order: -Lsha512_block_data_order: -#if __ARM_ARCH__<7 && !defined(__thumb2__) - sub r3,pc,#8 @ _sha512_block_data_order -#else - adr r3,Lsha512_block_data_order -#endif -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) - ldr r12,LOPENSSL_armcap - ldr r12,[r3,r12] @ OPENSSL_armcap_P -#ifdef __APPLE__ - ldr r12,[r12] -#endif - tst r12,#ARMV7_NEON - bne LNEON -#endif - add r2,r1,r2,lsl#7 @ len to point at the end of inp - stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} - sub r14,r3,#672 @ K512 - sub sp,sp,#9*8 - - ldr r7,[r0,#32+LO] - ldr r8,[r0,#32+HI] - ldr r9, [r0,#48+LO] - ldr r10, [r0,#48+HI] - ldr r11, [r0,#56+LO] - ldr r12, [r0,#56+HI] -Loop: - str r9, [sp,#48+0] - str r10, [sp,#48+4] - str r11, [sp,#56+0] - str r12, [sp,#56+4] - ldr r5,[r0,#0+LO] - ldr r6,[r0,#0+HI] - ldr r3,[r0,#8+LO] - ldr r4,[r0,#8+HI] - ldr r9, [r0,#16+LO] - ldr r10, [r0,#16+HI] - ldr r11, [r0,#24+LO] - ldr r12, [r0,#24+HI] - str r3,[sp,#8+0] - str r4,[sp,#8+4] - str r9, [sp,#16+0] - str r10, [sp,#16+4] - str r11, [sp,#24+0] - str r12, [sp,#24+4] - ldr r3,[r0,#40+LO] - ldr r4,[r0,#40+HI] - str r3,[sp,#40+0] - str r4,[sp,#40+4] - -L00_15: -#if 
__ARM_ARCH__<7 - ldrb r3,[r1,#7] - ldrb r9, [r1,#6] - ldrb r10, [r1,#5] - ldrb r11, [r1,#4] - ldrb r4,[r1,#3] - ldrb r12, [r1,#2] - orr r3,r3,r9,lsl#8 - ldrb r9, [r1,#1] - orr r3,r3,r10,lsl#16 - ldrb r10, [r1],#8 - orr r3,r3,r11,lsl#24 - orr r4,r4,r12,lsl#8 - orr r4,r4,r9,lsl#16 - orr r4,r4,r10,lsl#24 -#else - ldr r3,[r1,#4] - ldr r4,[r1],#8 -#ifdef __ARMEL__ - rev r3,r3 - rev r4,r4 -#endif -#endif - @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) - @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 - @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 - mov r9,r7,lsr#14 - str r3,[sp,#64+0] - mov r10,r8,lsr#14 - str r4,[sp,#64+4] - eor r9,r9,r8,lsl#18 - ldr r11,[sp,#56+0] @ h.lo - eor r10,r10,r7,lsl#18 - ldr r12,[sp,#56+4] @ h.hi - eor r9,r9,r7,lsr#18 - eor r10,r10,r8,lsr#18 - eor r9,r9,r8,lsl#14 - eor r10,r10,r7,lsl#14 - eor r9,r9,r8,lsr#9 - eor r10,r10,r7,lsr#9 - eor r9,r9,r7,lsl#23 - eor r10,r10,r8,lsl#23 @ Sigma1(e) - adds r3,r3,r9 - ldr r9,[sp,#40+0] @ f.lo - adc r4,r4,r10 @ T += Sigma1(e) - ldr r10,[sp,#40+4] @ f.hi - adds r3,r3,r11 - ldr r11,[sp,#48+0] @ g.lo - adc r4,r4,r12 @ T += h - ldr r12,[sp,#48+4] @ g.hi - - eor r9,r9,r11 - str r7,[sp,#32+0] - eor r10,r10,r12 - str r8,[sp,#32+4] - and r9,r9,r7 - str r5,[sp,#0+0] - and r10,r10,r8 - str r6,[sp,#0+4] - eor r9,r9,r11 - ldr r11,[r14,#LO] @ K[i].lo - eor r10,r10,r12 @ Ch(e,f,g) - ldr r12,[r14,#HI] @ K[i].hi - - adds r3,r3,r9 - ldr r7,[sp,#24+0] @ d.lo - adc r4,r4,r10 @ T += Ch(e,f,g) - ldr r8,[sp,#24+4] @ d.hi - adds r3,r3,r11 - and r9,r11,#0xff - adc r4,r4,r12 @ T += K[i] - adds r7,r7,r3 - ldr r11,[sp,#8+0] @ b.lo - adc r8,r8,r4 @ d += T - teq r9,#148 - - ldr r12,[sp,#16+0] @ c.lo -#if __ARM_ARCH__>=7 - it eq @ Thumb2 thing, sanity check in ARM -#endif - orreq r14,r14,#1 - @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) - @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 - @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 - mov r9,r5,lsr#28 - mov r10,r6,lsr#28 - eor r9,r9,r6,lsl#4 - eor 
r10,r10,r5,lsl#4 - eor r9,r9,r6,lsr#2 - eor r10,r10,r5,lsr#2 - eor r9,r9,r5,lsl#30 - eor r10,r10,r6,lsl#30 - eor r9,r9,r6,lsr#7 - eor r10,r10,r5,lsr#7 - eor r9,r9,r5,lsl#25 - eor r10,r10,r6,lsl#25 @ Sigma0(a) - adds r3,r3,r9 - and r9,r5,r11 - adc r4,r4,r10 @ T += Sigma0(a) - - ldr r10,[sp,#8+4] @ b.hi - orr r5,r5,r11 - ldr r11,[sp,#16+4] @ c.hi - and r5,r5,r12 - and r12,r6,r10 - orr r6,r6,r10 - orr r5,r5,r9 @ Maj(a,b,c).lo - and r6,r6,r11 - adds r5,r5,r3 - orr r6,r6,r12 @ Maj(a,b,c).hi - sub sp,sp,#8 - adc r6,r6,r4 @ h += T - tst r14,#1 - add r14,r14,#8 - tst r14,#1 - beq L00_15 - ldr r9,[sp,#184+0] - ldr r10,[sp,#184+4] - bic r14,r14,#1 -L16_79: - @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) - @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25 - @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7 - mov r3,r9,lsr#1 - ldr r11,[sp,#80+0] - mov r4,r10,lsr#1 - ldr r12,[sp,#80+4] - eor r3,r3,r10,lsl#31 - eor r4,r4,r9,lsl#31 - eor r3,r3,r9,lsr#8 - eor r4,r4,r10,lsr#8 - eor r3,r3,r10,lsl#24 - eor r4,r4,r9,lsl#24 - eor r3,r3,r9,lsr#7 - eor r4,r4,r10,lsr#7 - eor r3,r3,r10,lsl#25 - - @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6)) - @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26 - @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6 - mov r9,r11,lsr#19 - mov r10,r12,lsr#19 - eor r9,r9,r12,lsl#13 - eor r10,r10,r11,lsl#13 - eor r9,r9,r12,lsr#29 - eor r10,r10,r11,lsr#29 - eor r9,r9,r11,lsl#3 - eor r10,r10,r12,lsl#3 - eor r9,r9,r11,lsr#6 - eor r10,r10,r12,lsr#6 - ldr r11,[sp,#120+0] - eor r9,r9,r12,lsl#26 - - ldr r12,[sp,#120+4] - adds r3,r3,r9 - ldr r9,[sp,#192+0] - adc r4,r4,r10 - - ldr r10,[sp,#192+4] - adds r3,r3,r11 - adc r4,r4,r12 - adds r3,r3,r9 - adc r4,r4,r10 - @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) - @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 - @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 - mov r9,r7,lsr#14 - str r3,[sp,#64+0] - mov r10,r8,lsr#14 - str r4,[sp,#64+4] - eor r9,r9,r8,lsl#18 - ldr r11,[sp,#56+0] @ h.lo - eor r10,r10,r7,lsl#18 - ldr 
r12,[sp,#56+4] @ h.hi - eor r9,r9,r7,lsr#18 - eor r10,r10,r8,lsr#18 - eor r9,r9,r8,lsl#14 - eor r10,r10,r7,lsl#14 - eor r9,r9,r8,lsr#9 - eor r10,r10,r7,lsr#9 - eor r9,r9,r7,lsl#23 - eor r10,r10,r8,lsl#23 @ Sigma1(e) - adds r3,r3,r9 - ldr r9,[sp,#40+0] @ f.lo - adc r4,r4,r10 @ T += Sigma1(e) - ldr r10,[sp,#40+4] @ f.hi - adds r3,r3,r11 - ldr r11,[sp,#48+0] @ g.lo - adc r4,r4,r12 @ T += h - ldr r12,[sp,#48+4] @ g.hi - - eor r9,r9,r11 - str r7,[sp,#32+0] - eor r10,r10,r12 - str r8,[sp,#32+4] - and r9,r9,r7 - str r5,[sp,#0+0] - and r10,r10,r8 - str r6,[sp,#0+4] - eor r9,r9,r11 - ldr r11,[r14,#LO] @ K[i].lo - eor r10,r10,r12 @ Ch(e,f,g) - ldr r12,[r14,#HI] @ K[i].hi - - adds r3,r3,r9 - ldr r7,[sp,#24+0] @ d.lo - adc r4,r4,r10 @ T += Ch(e,f,g) - ldr r8,[sp,#24+4] @ d.hi - adds r3,r3,r11 - and r9,r11,#0xff - adc r4,r4,r12 @ T += K[i] - adds r7,r7,r3 - ldr r11,[sp,#8+0] @ b.lo - adc r8,r8,r4 @ d += T - teq r9,#23 - - ldr r12,[sp,#16+0] @ c.lo -#if __ARM_ARCH__>=7 - it eq @ Thumb2 thing, sanity check in ARM -#endif - orreq r14,r14,#1 - @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) - @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 - @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 - mov r9,r5,lsr#28 - mov r10,r6,lsr#28 - eor r9,r9,r6,lsl#4 - eor r10,r10,r5,lsl#4 - eor r9,r9,r6,lsr#2 - eor r10,r10,r5,lsr#2 - eor r9,r9,r5,lsl#30 - eor r10,r10,r6,lsl#30 - eor r9,r9,r6,lsr#7 - eor r10,r10,r5,lsr#7 - eor r9,r9,r5,lsl#25 - eor r10,r10,r6,lsl#25 @ Sigma0(a) - adds r3,r3,r9 - and r9,r5,r11 - adc r4,r4,r10 @ T += Sigma0(a) - - ldr r10,[sp,#8+4] @ b.hi - orr r5,r5,r11 - ldr r11,[sp,#16+4] @ c.hi - and r5,r5,r12 - and r12,r6,r10 - orr r6,r6,r10 - orr r5,r5,r9 @ Maj(a,b,c).lo - and r6,r6,r11 - adds r5,r5,r3 - orr r6,r6,r12 @ Maj(a,b,c).hi - sub sp,sp,#8 - adc r6,r6,r4 @ h += T - tst r14,#1 - add r14,r14,#8 -#if __ARM_ARCH__>=7 - ittt eq @ Thumb2 thing, sanity check in ARM -#endif - ldreq r9,[sp,#184+0] - ldreq r10,[sp,#184+4] - beq L16_79 - bic r14,r14,#1 - - ldr r3,[sp,#8+0] 
- ldr r4,[sp,#8+4] - ldr r9, [r0,#0+LO] - ldr r10, [r0,#0+HI] - ldr r11, [r0,#8+LO] - ldr r12, [r0,#8+HI] - adds r9,r5,r9 - str r9, [r0,#0+LO] - adc r10,r6,r10 - str r10, [r0,#0+HI] - adds r11,r3,r11 - str r11, [r0,#8+LO] - adc r12,r4,r12 - str r12, [r0,#8+HI] - - ldr r5,[sp,#16+0] - ldr r6,[sp,#16+4] - ldr r3,[sp,#24+0] - ldr r4,[sp,#24+4] - ldr r9, [r0,#16+LO] - ldr r10, [r0,#16+HI] - ldr r11, [r0,#24+LO] - ldr r12, [r0,#24+HI] - adds r9,r5,r9 - str r9, [r0,#16+LO] - adc r10,r6,r10 - str r10, [r0,#16+HI] - adds r11,r3,r11 - str r11, [r0,#24+LO] - adc r12,r4,r12 - str r12, [r0,#24+HI] - - ldr r3,[sp,#40+0] - ldr r4,[sp,#40+4] - ldr r9, [r0,#32+LO] - ldr r10, [r0,#32+HI] - ldr r11, [r0,#40+LO] - ldr r12, [r0,#40+HI] - adds r7,r7,r9 - str r7,[r0,#32+LO] - adc r8,r8,r10 - str r8,[r0,#32+HI] - adds r11,r3,r11 - str r11, [r0,#40+LO] - adc r12,r4,r12 - str r12, [r0,#40+HI] - - ldr r5,[sp,#48+0] - ldr r6,[sp,#48+4] - ldr r3,[sp,#56+0] - ldr r4,[sp,#56+4] - ldr r9, [r0,#48+LO] - ldr r10, [r0,#48+HI] - ldr r11, [r0,#56+LO] - ldr r12, [r0,#56+HI] - adds r9,r5,r9 - str r9, [r0,#48+LO] - adc r10,r6,r10 - str r10, [r0,#48+HI] - adds r11,r3,r11 - str r11, [r0,#56+LO] - adc r12,r4,r12 - str r12, [r0,#56+HI] - - add sp,sp,#640 - sub r14,r14,#640 - - teq r1,r2 - bne Loop - - add sp,sp,#8*9 @ destroy frame -#if __ARM_ARCH__>=5 - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} -#else - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet -.word 0xe12fff1e @ interoperable with Thumb ISA:-) -#endif - -#if __ARM_MAX_ARCH__>=7 - - - -.globl _sha512_block_data_order_neon -.private_extern _sha512_block_data_order_neon -#ifdef __thumb2__ -.thumb_func _sha512_block_data_order_neon -#endif -.align 4 -_sha512_block_data_order_neon: -LNEON: - dmb @ errata #451034 on early Cortex A8 - add r2,r1,r2,lsl#7 @ len to point at the end of inp - adr r3,K512 - VFP_ABI_PUSH - vldmia r0,{d16,d17,d18,d19,d20,d21,d22,d23} @ load context -Loop_neon: - 
vshr.u64 d24,d20,#14 @ 0 -#if 0<16 - vld1.64 {d0},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d20,#18 -#if 0>0 - vadd.i64 d16,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d20,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d20,#50 - vsli.64 d25,d20,#46 - vmov d29,d20 - vsli.64 d26,d20,#23 -#if 0<16 && defined(__ARMEL__) - vrev64.8 d0,d0 -#endif - veor d25,d24 - vbsl d29,d21,d22 @ Ch(e,f,g) - vshr.u64 d24,d16,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d23 - vshr.u64 d25,d16,#34 - vsli.64 d24,d16,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d16,#39 - vadd.i64 d28,d0 - vsli.64 d25,d16,#30 - veor d30,d16,d17 - vsli.64 d26,d16,#25 - veor d23,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d18,d17 @ Maj(a,b,c) - veor d23,d26 @ Sigma0(a) - vadd.i64 d19,d27 - vadd.i64 d30,d27 - @ vadd.i64 d23,d30 - vshr.u64 d24,d19,#14 @ 1 -#if 1<16 - vld1.64 {d1},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d19,#18 -#if 1>0 - vadd.i64 d23,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d19,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d19,#50 - vsli.64 d25,d19,#46 - vmov d29,d19 - vsli.64 d26,d19,#23 -#if 1<16 && defined(__ARMEL__) - vrev64.8 d1,d1 -#endif - veor d25,d24 - vbsl d29,d20,d21 @ Ch(e,f,g) - vshr.u64 d24,d23,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d22 - vshr.u64 d25,d23,#34 - vsli.64 d24,d23,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d23,#39 - vadd.i64 d28,d1 - vsli.64 d25,d23,#30 - veor d30,d23,d16 - vsli.64 d26,d23,#25 - veor d22,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d17,d16 @ Maj(a,b,c) - veor d22,d26 @ Sigma0(a) - vadd.i64 d18,d27 - vadd.i64 d30,d27 - @ vadd.i64 d22,d30 - vshr.u64 d24,d18,#14 @ 2 -#if 2<16 - vld1.64 {d2},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d18,#18 -#if 2>0 - vadd.i64 d22,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d18,#41 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d18,#50 - vsli.64 d25,d18,#46 - vmov d29,d18 - vsli.64 d26,d18,#23 -#if 2<16 && defined(__ARMEL__) - vrev64.8 d2,d2 -#endif - veor d25,d24 - vbsl d29,d19,d20 @ Ch(e,f,g) - vshr.u64 d24,d22,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d21 - vshr.u64 d25,d22,#34 - vsli.64 d24,d22,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d22,#39 - vadd.i64 d28,d2 - vsli.64 d25,d22,#30 - veor d30,d22,d23 - vsli.64 d26,d22,#25 - veor d21,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d16,d23 @ Maj(a,b,c) - veor d21,d26 @ Sigma0(a) - vadd.i64 d17,d27 - vadd.i64 d30,d27 - @ vadd.i64 d21,d30 - vshr.u64 d24,d17,#14 @ 3 -#if 3<16 - vld1.64 {d3},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d17,#18 -#if 3>0 - vadd.i64 d21,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d17,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d17,#50 - vsli.64 d25,d17,#46 - vmov d29,d17 - vsli.64 d26,d17,#23 -#if 3<16 && defined(__ARMEL__) - vrev64.8 d3,d3 -#endif - veor d25,d24 - vbsl d29,d18,d19 @ Ch(e,f,g) - vshr.u64 d24,d21,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d20 - vshr.u64 d25,d21,#34 - vsli.64 d24,d21,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d21,#39 - vadd.i64 d28,d3 - vsli.64 d25,d21,#30 - veor d30,d21,d22 - vsli.64 d26,d21,#25 - veor d20,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d23,d22 @ Maj(a,b,c) - veor d20,d26 @ Sigma0(a) - vadd.i64 d16,d27 - vadd.i64 d30,d27 - @ vadd.i64 d20,d30 - vshr.u64 d24,d16,#14 @ 4 -#if 4<16 - vld1.64 {d4},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d16,#18 -#if 4>0 - vadd.i64 d20,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d16,#41 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d16,#50 - vsli.64 d25,d16,#46 - vmov d29,d16 - vsli.64 d26,d16,#23 -#if 4<16 && defined(__ARMEL__) - vrev64.8 d4,d4 -#endif - veor d25,d24 - vbsl d29,d17,d18 @ Ch(e,f,g) - vshr.u64 d24,d20,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d19 - vshr.u64 d25,d20,#34 - vsli.64 d24,d20,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d20,#39 - vadd.i64 d28,d4 - vsli.64 d25,d20,#30 - veor d30,d20,d21 - vsli.64 d26,d20,#25 - veor d19,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d22,d21 @ Maj(a,b,c) - veor d19,d26 @ Sigma0(a) - vadd.i64 d23,d27 - vadd.i64 d30,d27 - @ vadd.i64 d19,d30 - vshr.u64 d24,d23,#14 @ 5 -#if 5<16 - vld1.64 {d5},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d23,#18 -#if 5>0 - vadd.i64 d19,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d23,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d23,#50 - vsli.64 d25,d23,#46 - vmov d29,d23 - vsli.64 d26,d23,#23 -#if 5<16 && defined(__ARMEL__) - vrev64.8 d5,d5 -#endif - veor d25,d24 - vbsl d29,d16,d17 @ Ch(e,f,g) - vshr.u64 d24,d19,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d18 - vshr.u64 d25,d19,#34 - vsli.64 d24,d19,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d19,#39 - vadd.i64 d28,d5 - vsli.64 d25,d19,#30 - veor d30,d19,d20 - vsli.64 d26,d19,#25 - veor d18,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d21,d20 @ Maj(a,b,c) - veor d18,d26 @ Sigma0(a) - vadd.i64 d22,d27 - vadd.i64 d30,d27 - @ vadd.i64 d18,d30 - vshr.u64 d24,d22,#14 @ 6 -#if 6<16 - vld1.64 {d6},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d22,#18 -#if 6>0 - vadd.i64 d18,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d22,#41 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d22,#50 - vsli.64 d25,d22,#46 - vmov d29,d22 - vsli.64 d26,d22,#23 -#if 6<16 && defined(__ARMEL__) - vrev64.8 d6,d6 -#endif - veor d25,d24 - vbsl d29,d23,d16 @ Ch(e,f,g) - vshr.u64 d24,d18,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d17 - vshr.u64 d25,d18,#34 - vsli.64 d24,d18,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d18,#39 - vadd.i64 d28,d6 - vsli.64 d25,d18,#30 - veor d30,d18,d19 - vsli.64 d26,d18,#25 - veor d17,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d20,d19 @ Maj(a,b,c) - veor d17,d26 @ Sigma0(a) - vadd.i64 d21,d27 - vadd.i64 d30,d27 - @ vadd.i64 d17,d30 - vshr.u64 d24,d21,#14 @ 7 -#if 7<16 - vld1.64 {d7},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d21,#18 -#if 7>0 - vadd.i64 d17,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d21,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d21,#50 - vsli.64 d25,d21,#46 - vmov d29,d21 - vsli.64 d26,d21,#23 -#if 7<16 && defined(__ARMEL__) - vrev64.8 d7,d7 -#endif - veor d25,d24 - vbsl d29,d22,d23 @ Ch(e,f,g) - vshr.u64 d24,d17,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d16 - vshr.u64 d25,d17,#34 - vsli.64 d24,d17,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d17,#39 - vadd.i64 d28,d7 - vsli.64 d25,d17,#30 - veor d30,d17,d18 - vsli.64 d26,d17,#25 - veor d16,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d19,d18 @ Maj(a,b,c) - veor d16,d26 @ Sigma0(a) - vadd.i64 d20,d27 - vadd.i64 d30,d27 - @ vadd.i64 d16,d30 - vshr.u64 d24,d20,#14 @ 8 -#if 8<16 - vld1.64 {d8},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d20,#18 -#if 8>0 - vadd.i64 d16,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d20,#41 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d20,#50 - vsli.64 d25,d20,#46 - vmov d29,d20 - vsli.64 d26,d20,#23 -#if 8<16 && defined(__ARMEL__) - vrev64.8 d8,d8 -#endif - veor d25,d24 - vbsl d29,d21,d22 @ Ch(e,f,g) - vshr.u64 d24,d16,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d23 - vshr.u64 d25,d16,#34 - vsli.64 d24,d16,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d16,#39 - vadd.i64 d28,d8 - vsli.64 d25,d16,#30 - veor d30,d16,d17 - vsli.64 d26,d16,#25 - veor d23,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d18,d17 @ Maj(a,b,c) - veor d23,d26 @ Sigma0(a) - vadd.i64 d19,d27 - vadd.i64 d30,d27 - @ vadd.i64 d23,d30 - vshr.u64 d24,d19,#14 @ 9 -#if 9<16 - vld1.64 {d9},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d19,#18 -#if 9>0 - vadd.i64 d23,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d19,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d19,#50 - vsli.64 d25,d19,#46 - vmov d29,d19 - vsli.64 d26,d19,#23 -#if 9<16 && defined(__ARMEL__) - vrev64.8 d9,d9 -#endif - veor d25,d24 - vbsl d29,d20,d21 @ Ch(e,f,g) - vshr.u64 d24,d23,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d22 - vshr.u64 d25,d23,#34 - vsli.64 d24,d23,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d23,#39 - vadd.i64 d28,d9 - vsli.64 d25,d23,#30 - veor d30,d23,d16 - vsli.64 d26,d23,#25 - veor d22,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d17,d16 @ Maj(a,b,c) - veor d22,d26 @ Sigma0(a) - vadd.i64 d18,d27 - vadd.i64 d30,d27 - @ vadd.i64 d22,d30 - vshr.u64 d24,d18,#14 @ 10 -#if 10<16 - vld1.64 {d10},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d18,#18 -#if 10>0 - vadd.i64 d22,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d18,#41 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d18,#50 - vsli.64 d25,d18,#46 - vmov d29,d18 - vsli.64 d26,d18,#23 -#if 10<16 && defined(__ARMEL__) - vrev64.8 d10,d10 -#endif - veor d25,d24 - vbsl d29,d19,d20 @ Ch(e,f,g) - vshr.u64 d24,d22,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d21 - vshr.u64 d25,d22,#34 - vsli.64 d24,d22,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d22,#39 - vadd.i64 d28,d10 - vsli.64 d25,d22,#30 - veor d30,d22,d23 - vsli.64 d26,d22,#25 - veor d21,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d16,d23 @ Maj(a,b,c) - veor d21,d26 @ Sigma0(a) - vadd.i64 d17,d27 - vadd.i64 d30,d27 - @ vadd.i64 d21,d30 - vshr.u64 d24,d17,#14 @ 11 -#if 11<16 - vld1.64 {d11},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d17,#18 -#if 11>0 - vadd.i64 d21,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d17,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d17,#50 - vsli.64 d25,d17,#46 - vmov d29,d17 - vsli.64 d26,d17,#23 -#if 11<16 && defined(__ARMEL__) - vrev64.8 d11,d11 -#endif - veor d25,d24 - vbsl d29,d18,d19 @ Ch(e,f,g) - vshr.u64 d24,d21,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d20 - vshr.u64 d25,d21,#34 - vsli.64 d24,d21,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d21,#39 - vadd.i64 d28,d11 - vsli.64 d25,d21,#30 - veor d30,d21,d22 - vsli.64 d26,d21,#25 - veor d20,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d23,d22 @ Maj(a,b,c) - veor d20,d26 @ Sigma0(a) - vadd.i64 d16,d27 - vadd.i64 d30,d27 - @ vadd.i64 d20,d30 - vshr.u64 d24,d16,#14 @ 12 -#if 12<16 - vld1.64 {d12},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d16,#18 -#if 12>0 - vadd.i64 d20,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d16,#41 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d16,#50 - vsli.64 d25,d16,#46 - vmov d29,d16 - vsli.64 d26,d16,#23 -#if 12<16 && defined(__ARMEL__) - vrev64.8 d12,d12 -#endif - veor d25,d24 - vbsl d29,d17,d18 @ Ch(e,f,g) - vshr.u64 d24,d20,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d19 - vshr.u64 d25,d20,#34 - vsli.64 d24,d20,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d20,#39 - vadd.i64 d28,d12 - vsli.64 d25,d20,#30 - veor d30,d20,d21 - vsli.64 d26,d20,#25 - veor d19,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d22,d21 @ Maj(a,b,c) - veor d19,d26 @ Sigma0(a) - vadd.i64 d23,d27 - vadd.i64 d30,d27 - @ vadd.i64 d19,d30 - vshr.u64 d24,d23,#14 @ 13 -#if 13<16 - vld1.64 {d13},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d23,#18 -#if 13>0 - vadd.i64 d19,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d23,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d23,#50 - vsli.64 d25,d23,#46 - vmov d29,d23 - vsli.64 d26,d23,#23 -#if 13<16 && defined(__ARMEL__) - vrev64.8 d13,d13 -#endif - veor d25,d24 - vbsl d29,d16,d17 @ Ch(e,f,g) - vshr.u64 d24,d19,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d18 - vshr.u64 d25,d19,#34 - vsli.64 d24,d19,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d19,#39 - vadd.i64 d28,d13 - vsli.64 d25,d19,#30 - veor d30,d19,d20 - vsli.64 d26,d19,#25 - veor d18,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d21,d20 @ Maj(a,b,c) - veor d18,d26 @ Sigma0(a) - vadd.i64 d22,d27 - vadd.i64 d30,d27 - @ vadd.i64 d18,d30 - vshr.u64 d24,d22,#14 @ 14 -#if 14<16 - vld1.64 {d14},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d22,#18 -#if 14>0 - vadd.i64 d18,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d22,#41 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d22,#50 - vsli.64 d25,d22,#46 - vmov d29,d22 - vsli.64 d26,d22,#23 -#if 14<16 && defined(__ARMEL__) - vrev64.8 d14,d14 -#endif - veor d25,d24 - vbsl d29,d23,d16 @ Ch(e,f,g) - vshr.u64 d24,d18,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d17 - vshr.u64 d25,d18,#34 - vsli.64 d24,d18,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d18,#39 - vadd.i64 d28,d14 - vsli.64 d25,d18,#30 - veor d30,d18,d19 - vsli.64 d26,d18,#25 - veor d17,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d20,d19 @ Maj(a,b,c) - veor d17,d26 @ Sigma0(a) - vadd.i64 d21,d27 - vadd.i64 d30,d27 - @ vadd.i64 d17,d30 - vshr.u64 d24,d21,#14 @ 15 -#if 15<16 - vld1.64 {d15},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d21,#18 -#if 15>0 - vadd.i64 d17,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d21,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d21,#50 - vsli.64 d25,d21,#46 - vmov d29,d21 - vsli.64 d26,d21,#23 -#if 15<16 && defined(__ARMEL__) - vrev64.8 d15,d15 -#endif - veor d25,d24 - vbsl d29,d22,d23 @ Ch(e,f,g) - vshr.u64 d24,d17,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d16 - vshr.u64 d25,d17,#34 - vsli.64 d24,d17,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d17,#39 - vadd.i64 d28,d15 - vsli.64 d25,d17,#30 - veor d30,d17,d18 - vsli.64 d26,d17,#25 - veor d16,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d19,d18 @ Maj(a,b,c) - veor d16,d26 @ Sigma0(a) - vadd.i64 d20,d27 - vadd.i64 d30,d27 - @ vadd.i64 d16,d30 - mov r12,#4 -L16_79_neon: - subs r12,#1 - vshr.u64 q12,q7,#19 - vshr.u64 q13,q7,#61 - vadd.i64 d16,d30 @ h+=Maj from the past - vshr.u64 q15,q7,#6 - vsli.64 q12,q7,#45 - vext.8 q14,q0,q1,#8 @ X[i+1] - vsli.64 q13,q7,#3 - veor q15,q12 - vshr.u64 q12,q14,#1 - veor q15,q13 @ sigma1(X[i+14]) - vshr.u64 q13,q14,#8 - vadd.i64 q0,q15 - vshr.u64 q15,q14,#7 - vsli.64 q12,q14,#63 - vsli.64 q13,q14,#56 - vext.8 q14,q4,q5,#8 @ X[i+9] - veor q15,q12 - vshr.u64 d24,d20,#14 @ from NEON_00_15 - vadd.i64 q0,q14 - vshr.u64 d25,d20,#18 @ from NEON_00_15 - veor q15,q13 @ sigma0(X[i+1]) - vshr.u64 
d26,d20,#41 @ from NEON_00_15 - vadd.i64 q0,q15 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d20,#50 - vsli.64 d25,d20,#46 - vmov d29,d20 - vsli.64 d26,d20,#23 -#if 16<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d21,d22 @ Ch(e,f,g) - vshr.u64 d24,d16,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d23 - vshr.u64 d25,d16,#34 - vsli.64 d24,d16,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d16,#39 - vadd.i64 d28,d0 - vsli.64 d25,d16,#30 - veor d30,d16,d17 - vsli.64 d26,d16,#25 - veor d23,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d18,d17 @ Maj(a,b,c) - veor d23,d26 @ Sigma0(a) - vadd.i64 d19,d27 - vadd.i64 d30,d27 - @ vadd.i64 d23,d30 - vshr.u64 d24,d19,#14 @ 17 -#if 17<16 - vld1.64 {d1},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d19,#18 -#if 17>0 - vadd.i64 d23,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d19,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d19,#50 - vsli.64 d25,d19,#46 - vmov d29,d19 - vsli.64 d26,d19,#23 -#if 17<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d20,d21 @ Ch(e,f,g) - vshr.u64 d24,d23,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d22 - vshr.u64 d25,d23,#34 - vsli.64 d24,d23,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d23,#39 - vadd.i64 d28,d1 - vsli.64 d25,d23,#30 - veor d30,d23,d16 - vsli.64 d26,d23,#25 - veor d22,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d17,d16 @ Maj(a,b,c) - veor d22,d26 @ Sigma0(a) - vadd.i64 d18,d27 - vadd.i64 d30,d27 - @ vadd.i64 d22,d30 - vshr.u64 q12,q0,#19 - vshr.u64 q13,q0,#61 - vadd.i64 d22,d30 @ h+=Maj from the past - vshr.u64 q15,q0,#6 - vsli.64 q12,q0,#45 - vext.8 q14,q1,q2,#8 @ X[i+1] - vsli.64 q13,q0,#3 - veor q15,q12 - vshr.u64 q12,q14,#1 - veor q15,q13 @ sigma1(X[i+14]) - vshr.u64 q13,q14,#8 - vadd.i64 q1,q15 - vshr.u64 q15,q14,#7 - vsli.64 q12,q14,#63 - vsli.64 q13,q14,#56 - vext.8 q14,q5,q6,#8 @ X[i+9] - veor q15,q12 - vshr.u64 d24,d18,#14 @ from NEON_00_15 - vadd.i64 q1,q14 - vshr.u64 d25,d18,#18 @ from NEON_00_15 - veor q15,q13 @ 
sigma0(X[i+1]) - vshr.u64 d26,d18,#41 @ from NEON_00_15 - vadd.i64 q1,q15 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d18,#50 - vsli.64 d25,d18,#46 - vmov d29,d18 - vsli.64 d26,d18,#23 -#if 18<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d19,d20 @ Ch(e,f,g) - vshr.u64 d24,d22,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d21 - vshr.u64 d25,d22,#34 - vsli.64 d24,d22,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d22,#39 - vadd.i64 d28,d2 - vsli.64 d25,d22,#30 - veor d30,d22,d23 - vsli.64 d26,d22,#25 - veor d21,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d16,d23 @ Maj(a,b,c) - veor d21,d26 @ Sigma0(a) - vadd.i64 d17,d27 - vadd.i64 d30,d27 - @ vadd.i64 d21,d30 - vshr.u64 d24,d17,#14 @ 19 -#if 19<16 - vld1.64 {d3},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d17,#18 -#if 19>0 - vadd.i64 d21,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d17,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d17,#50 - vsli.64 d25,d17,#46 - vmov d29,d17 - vsli.64 d26,d17,#23 -#if 19<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d18,d19 @ Ch(e,f,g) - vshr.u64 d24,d21,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d20 - vshr.u64 d25,d21,#34 - vsli.64 d24,d21,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d21,#39 - vadd.i64 d28,d3 - vsli.64 d25,d21,#30 - veor d30,d21,d22 - vsli.64 d26,d21,#25 - veor d20,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d23,d22 @ Maj(a,b,c) - veor d20,d26 @ Sigma0(a) - vadd.i64 d16,d27 - vadd.i64 d30,d27 - @ vadd.i64 d20,d30 - vshr.u64 q12,q1,#19 - vshr.u64 q13,q1,#61 - vadd.i64 d20,d30 @ h+=Maj from the past - vshr.u64 q15,q1,#6 - vsli.64 q12,q1,#45 - vext.8 q14,q2,q3,#8 @ X[i+1] - vsli.64 q13,q1,#3 - veor q15,q12 - vshr.u64 q12,q14,#1 - veor q15,q13 @ sigma1(X[i+14]) - vshr.u64 q13,q14,#8 - vadd.i64 q2,q15 - vshr.u64 q15,q14,#7 - vsli.64 q12,q14,#63 - vsli.64 q13,q14,#56 - vext.8 q14,q6,q7,#8 @ X[i+9] - veor q15,q12 - vshr.u64 d24,d16,#14 @ from NEON_00_15 - vadd.i64 q2,q14 - vshr.u64 d25,d16,#18 @ from NEON_00_15 - 
veor q15,q13 @ sigma0(X[i+1]) - vshr.u64 d26,d16,#41 @ from NEON_00_15 - vadd.i64 q2,q15 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d16,#50 - vsli.64 d25,d16,#46 - vmov d29,d16 - vsli.64 d26,d16,#23 -#if 20<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d17,d18 @ Ch(e,f,g) - vshr.u64 d24,d20,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d19 - vshr.u64 d25,d20,#34 - vsli.64 d24,d20,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d20,#39 - vadd.i64 d28,d4 - vsli.64 d25,d20,#30 - veor d30,d20,d21 - vsli.64 d26,d20,#25 - veor d19,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d22,d21 @ Maj(a,b,c) - veor d19,d26 @ Sigma0(a) - vadd.i64 d23,d27 - vadd.i64 d30,d27 - @ vadd.i64 d19,d30 - vshr.u64 d24,d23,#14 @ 21 -#if 21<16 - vld1.64 {d5},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d23,#18 -#if 21>0 - vadd.i64 d19,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d23,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d23,#50 - vsli.64 d25,d23,#46 - vmov d29,d23 - vsli.64 d26,d23,#23 -#if 21<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d16,d17 @ Ch(e,f,g) - vshr.u64 d24,d19,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d18 - vshr.u64 d25,d19,#34 - vsli.64 d24,d19,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d19,#39 - vadd.i64 d28,d5 - vsli.64 d25,d19,#30 - veor d30,d19,d20 - vsli.64 d26,d19,#25 - veor d18,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d21,d20 @ Maj(a,b,c) - veor d18,d26 @ Sigma0(a) - vadd.i64 d22,d27 - vadd.i64 d30,d27 - @ vadd.i64 d18,d30 - vshr.u64 q12,q2,#19 - vshr.u64 q13,q2,#61 - vadd.i64 d18,d30 @ h+=Maj from the past - vshr.u64 q15,q2,#6 - vsli.64 q12,q2,#45 - vext.8 q14,q3,q4,#8 @ X[i+1] - vsli.64 q13,q2,#3 - veor q15,q12 - vshr.u64 q12,q14,#1 - veor q15,q13 @ sigma1(X[i+14]) - vshr.u64 q13,q14,#8 - vadd.i64 q3,q15 - vshr.u64 q15,q14,#7 - vsli.64 q12,q14,#63 - vsli.64 q13,q14,#56 - vext.8 q14,q7,q0,#8 @ X[i+9] - veor q15,q12 - vshr.u64 d24,d22,#14 @ from NEON_00_15 - vadd.i64 q3,q14 - vshr.u64 d25,d22,#18 @ from 
NEON_00_15 - veor q15,q13 @ sigma0(X[i+1]) - vshr.u64 d26,d22,#41 @ from NEON_00_15 - vadd.i64 q3,q15 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d22,#50 - vsli.64 d25,d22,#46 - vmov d29,d22 - vsli.64 d26,d22,#23 -#if 22<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d23,d16 @ Ch(e,f,g) - vshr.u64 d24,d18,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d17 - vshr.u64 d25,d18,#34 - vsli.64 d24,d18,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d18,#39 - vadd.i64 d28,d6 - vsli.64 d25,d18,#30 - veor d30,d18,d19 - vsli.64 d26,d18,#25 - veor d17,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d20,d19 @ Maj(a,b,c) - veor d17,d26 @ Sigma0(a) - vadd.i64 d21,d27 - vadd.i64 d30,d27 - @ vadd.i64 d17,d30 - vshr.u64 d24,d21,#14 @ 23 -#if 23<16 - vld1.64 {d7},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d21,#18 -#if 23>0 - vadd.i64 d17,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d21,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d21,#50 - vsli.64 d25,d21,#46 - vmov d29,d21 - vsli.64 d26,d21,#23 -#if 23<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d22,d23 @ Ch(e,f,g) - vshr.u64 d24,d17,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d16 - vshr.u64 d25,d17,#34 - vsli.64 d24,d17,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d17,#39 - vadd.i64 d28,d7 - vsli.64 d25,d17,#30 - veor d30,d17,d18 - vsli.64 d26,d17,#25 - veor d16,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d19,d18 @ Maj(a,b,c) - veor d16,d26 @ Sigma0(a) - vadd.i64 d20,d27 - vadd.i64 d30,d27 - @ vadd.i64 d16,d30 - vshr.u64 q12,q3,#19 - vshr.u64 q13,q3,#61 - vadd.i64 d16,d30 @ h+=Maj from the past - vshr.u64 q15,q3,#6 - vsli.64 q12,q3,#45 - vext.8 q14,q4,q5,#8 @ X[i+1] - vsli.64 q13,q3,#3 - veor q15,q12 - vshr.u64 q12,q14,#1 - veor q15,q13 @ sigma1(X[i+14]) - vshr.u64 q13,q14,#8 - vadd.i64 q4,q15 - vshr.u64 q15,q14,#7 - vsli.64 q12,q14,#63 - vsli.64 q13,q14,#56 - vext.8 q14,q0,q1,#8 @ X[i+9] - veor q15,q12 - vshr.u64 d24,d20,#14 @ from NEON_00_15 - vadd.i64 q4,q14 - vshr.u64 
d25,d20,#18 @ from NEON_00_15 - veor q15,q13 @ sigma0(X[i+1]) - vshr.u64 d26,d20,#41 @ from NEON_00_15 - vadd.i64 q4,q15 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d20,#50 - vsli.64 d25,d20,#46 - vmov d29,d20 - vsli.64 d26,d20,#23 -#if 24<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d21,d22 @ Ch(e,f,g) - vshr.u64 d24,d16,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d23 - vshr.u64 d25,d16,#34 - vsli.64 d24,d16,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d16,#39 - vadd.i64 d28,d8 - vsli.64 d25,d16,#30 - veor d30,d16,d17 - vsli.64 d26,d16,#25 - veor d23,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d18,d17 @ Maj(a,b,c) - veor d23,d26 @ Sigma0(a) - vadd.i64 d19,d27 - vadd.i64 d30,d27 - @ vadd.i64 d23,d30 - vshr.u64 d24,d19,#14 @ 25 -#if 25<16 - vld1.64 {d9},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d19,#18 -#if 25>0 - vadd.i64 d23,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d19,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d19,#50 - vsli.64 d25,d19,#46 - vmov d29,d19 - vsli.64 d26,d19,#23 -#if 25<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d20,d21 @ Ch(e,f,g) - vshr.u64 d24,d23,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d22 - vshr.u64 d25,d23,#34 - vsli.64 d24,d23,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d23,#39 - vadd.i64 d28,d9 - vsli.64 d25,d23,#30 - veor d30,d23,d16 - vsli.64 d26,d23,#25 - veor d22,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d17,d16 @ Maj(a,b,c) - veor d22,d26 @ Sigma0(a) - vadd.i64 d18,d27 - vadd.i64 d30,d27 - @ vadd.i64 d22,d30 - vshr.u64 q12,q4,#19 - vshr.u64 q13,q4,#61 - vadd.i64 d22,d30 @ h+=Maj from the past - vshr.u64 q15,q4,#6 - vsli.64 q12,q4,#45 - vext.8 q14,q5,q6,#8 @ X[i+1] - vsli.64 q13,q4,#3 - veor q15,q12 - vshr.u64 q12,q14,#1 - veor q15,q13 @ sigma1(X[i+14]) - vshr.u64 q13,q14,#8 - vadd.i64 q5,q15 - vshr.u64 q15,q14,#7 - vsli.64 q12,q14,#63 - vsli.64 q13,q14,#56 - vext.8 q14,q1,q2,#8 @ X[i+9] - veor q15,q12 - vshr.u64 d24,d18,#14 @ from NEON_00_15 - vadd.i64 
q5,q14 - vshr.u64 d25,d18,#18 @ from NEON_00_15 - veor q15,q13 @ sigma0(X[i+1]) - vshr.u64 d26,d18,#41 @ from NEON_00_15 - vadd.i64 q5,q15 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d18,#50 - vsli.64 d25,d18,#46 - vmov d29,d18 - vsli.64 d26,d18,#23 -#if 26<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d19,d20 @ Ch(e,f,g) - vshr.u64 d24,d22,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d21 - vshr.u64 d25,d22,#34 - vsli.64 d24,d22,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d22,#39 - vadd.i64 d28,d10 - vsli.64 d25,d22,#30 - veor d30,d22,d23 - vsli.64 d26,d22,#25 - veor d21,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d16,d23 @ Maj(a,b,c) - veor d21,d26 @ Sigma0(a) - vadd.i64 d17,d27 - vadd.i64 d30,d27 - @ vadd.i64 d21,d30 - vshr.u64 d24,d17,#14 @ 27 -#if 27<16 - vld1.64 {d11},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d17,#18 -#if 27>0 - vadd.i64 d21,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d17,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d17,#50 - vsli.64 d25,d17,#46 - vmov d29,d17 - vsli.64 d26,d17,#23 -#if 27<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d18,d19 @ Ch(e,f,g) - vshr.u64 d24,d21,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d20 - vshr.u64 d25,d21,#34 - vsli.64 d24,d21,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d21,#39 - vadd.i64 d28,d11 - vsli.64 d25,d21,#30 - veor d30,d21,d22 - vsli.64 d26,d21,#25 - veor d20,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d23,d22 @ Maj(a,b,c) - veor d20,d26 @ Sigma0(a) - vadd.i64 d16,d27 - vadd.i64 d30,d27 - @ vadd.i64 d20,d30 - vshr.u64 q12,q5,#19 - vshr.u64 q13,q5,#61 - vadd.i64 d20,d30 @ h+=Maj from the past - vshr.u64 q15,q5,#6 - vsli.64 q12,q5,#45 - vext.8 q14,q6,q7,#8 @ X[i+1] - vsli.64 q13,q5,#3 - veor q15,q12 - vshr.u64 q12,q14,#1 - veor q15,q13 @ sigma1(X[i+14]) - vshr.u64 q13,q14,#8 - vadd.i64 q6,q15 - vshr.u64 q15,q14,#7 - vsli.64 q12,q14,#63 - vsli.64 q13,q14,#56 - vext.8 q14,q2,q3,#8 @ X[i+9] - veor q15,q12 - vshr.u64 d24,d16,#14 @ from 
NEON_00_15 - vadd.i64 q6,q14 - vshr.u64 d25,d16,#18 @ from NEON_00_15 - veor q15,q13 @ sigma0(X[i+1]) - vshr.u64 d26,d16,#41 @ from NEON_00_15 - vadd.i64 q6,q15 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d16,#50 - vsli.64 d25,d16,#46 - vmov d29,d16 - vsli.64 d26,d16,#23 -#if 28<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d17,d18 @ Ch(e,f,g) - vshr.u64 d24,d20,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d19 - vshr.u64 d25,d20,#34 - vsli.64 d24,d20,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d20,#39 - vadd.i64 d28,d12 - vsli.64 d25,d20,#30 - veor d30,d20,d21 - vsli.64 d26,d20,#25 - veor d19,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d22,d21 @ Maj(a,b,c) - veor d19,d26 @ Sigma0(a) - vadd.i64 d23,d27 - vadd.i64 d30,d27 - @ vadd.i64 d19,d30 - vshr.u64 d24,d23,#14 @ 29 -#if 29<16 - vld1.64 {d13},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d23,#18 -#if 29>0 - vadd.i64 d19,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d23,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d23,#50 - vsli.64 d25,d23,#46 - vmov d29,d23 - vsli.64 d26,d23,#23 -#if 29<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d16,d17 @ Ch(e,f,g) - vshr.u64 d24,d19,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d18 - vshr.u64 d25,d19,#34 - vsli.64 d24,d19,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d19,#39 - vadd.i64 d28,d13 - vsli.64 d25,d19,#30 - veor d30,d19,d20 - vsli.64 d26,d19,#25 - veor d18,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d21,d20 @ Maj(a,b,c) - veor d18,d26 @ Sigma0(a) - vadd.i64 d22,d27 - vadd.i64 d30,d27 - @ vadd.i64 d18,d30 - vshr.u64 q12,q6,#19 - vshr.u64 q13,q6,#61 - vadd.i64 d18,d30 @ h+=Maj from the past - vshr.u64 q15,q6,#6 - vsli.64 q12,q6,#45 - vext.8 q14,q7,q0,#8 @ X[i+1] - vsli.64 q13,q6,#3 - veor q15,q12 - vshr.u64 q12,q14,#1 - veor q15,q13 @ sigma1(X[i+14]) - vshr.u64 q13,q14,#8 - vadd.i64 q7,q15 - vshr.u64 q15,q14,#7 - vsli.64 q12,q14,#63 - vsli.64 q13,q14,#56 - vext.8 q14,q3,q4,#8 @ X[i+9] - veor q15,q12 - vshr.u64 
d24,d22,#14 @ from NEON_00_15 - vadd.i64 q7,q14 - vshr.u64 d25,d22,#18 @ from NEON_00_15 - veor q15,q13 @ sigma0(X[i+1]) - vshr.u64 d26,d22,#41 @ from NEON_00_15 - vadd.i64 q7,q15 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d22,#50 - vsli.64 d25,d22,#46 - vmov d29,d22 - vsli.64 d26,d22,#23 -#if 30<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d23,d16 @ Ch(e,f,g) - vshr.u64 d24,d18,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d17 - vshr.u64 d25,d18,#34 - vsli.64 d24,d18,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d18,#39 - vadd.i64 d28,d14 - vsli.64 d25,d18,#30 - veor d30,d18,d19 - vsli.64 d26,d18,#25 - veor d17,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d20,d19 @ Maj(a,b,c) - veor d17,d26 @ Sigma0(a) - vadd.i64 d21,d27 - vadd.i64 d30,d27 - @ vadd.i64 d17,d30 - vshr.u64 d24,d21,#14 @ 31 -#if 31<16 - vld1.64 {d15},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d21,#18 -#if 31>0 - vadd.i64 d17,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d21,#41 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d21,#50 - vsli.64 d25,d21,#46 - vmov d29,d21 - vsli.64 d26,d21,#23 -#if 31<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d22,d23 @ Ch(e,f,g) - vshr.u64 d24,d17,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d16 - vshr.u64 d25,d17,#34 - vsli.64 d24,d17,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d17,#39 - vadd.i64 d28,d15 - vsli.64 d25,d17,#30 - veor d30,d17,d18 - vsli.64 d26,d17,#25 - veor d16,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d19,d18 @ Maj(a,b,c) - veor d16,d26 @ Sigma0(a) - vadd.i64 d20,d27 - vadd.i64 d30,d27 - @ vadd.i64 d16,d30 - bne L16_79_neon - - vadd.i64 d16,d30 @ h+=Maj from the past - vldmia r0,{d24,d25,d26,d27,d28,d29,d30,d31} @ load context to temp - vadd.i64 q8,q12 @ vectorized accumulate - vadd.i64 q9,q13 - vadd.i64 q10,q14 - vadd.i64 q11,q15 - vstmia r0,{d16,d17,d18,d19,d20,d21,d22,d23} @ save context - teq r1,r2 - sub r3,#640 @ rewind K512 - bne Loop_neon - - VFP_ABI_POP - bx lr @ .word 0xe12fff1e - -#endif -.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 -.align 2 -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) -.comm _OPENSSL_armcap_P,4 -.non_lazy_symbol_pointer -OPENSSL_armcap_P: -.indirect_symbol _OPENSSL_armcap_P -.long 0 -.private_extern _OPENSSL_armcap_P -#endif -#endif // !OPENSSL_NO_ASM diff --git a/third_party/boringssl/apple-arm/crypto/fipsmodule/vpaes-armv7.S b/third_party/boringssl/apple-arm/crypto/fipsmodule/vpaes-armv7.S deleted file mode 100644 index 6aead7ca..00000000 --- a/third_party/boringssl/apple-arm/crypto/fipsmodule/vpaes-armv7.S +++ /dev/null @@ -1,1265 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.syntax unified - - - - -#if defined(__thumb2__) -.thumb -#else -.code 32 -#endif - -.text - - -.align 7 @ totally strategic alignment -_vpaes_consts: -Lk_mc_forward:@ mc_forward -.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 -.quad 0x080B0A0904070605, 0x000302010C0F0E0D -.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 -.quad 0x000302010C0F0E0D, 0x080B0A0904070605 -Lk_mc_backward:@ mc_backward -.quad 0x0605040702010003, 0x0E0D0C0F0A09080B -.quad 0x020100030E0D0C0F, 0x0A09080B06050407 -.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 -.quad 0x0A09080B06050407, 0x020100030E0D0C0F -Lk_sr:@ sr -.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 -.quad 0x030E09040F0A0500, 0x0B06010C07020D08 -.quad 0x0F060D040B020900, 0x070E050C030A0108 -.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 - -@ -@ "Hot" constants -@ -Lk_inv:@ inv, inva -.quad 0x0E05060F0D080180, 0x040703090A0B0C02 -.quad 0x01040A060F0B0780, 0x030D0E0C02050809 -Lk_ipt:@ input transform (lo, hi) -.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 -.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 -Lk_sbo:@ sbou, sbot -.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 -.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA -Lk_sb1:@ sb1u, sb1t -.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF -.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 -Lk_sb2:@ sb2u, sb2t -.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A -.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD - -.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,55,32,78,69,79,78,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 -.align 2 - -.align 6 -@@ -@@ _aes_preheat -@@ -@@ Fills q9-q15 as specified below. 
-@@ -#ifdef __thumb2__ -.thumb_func _vpaes_preheat -#endif -.align 4 -_vpaes_preheat: - adr r10, Lk_inv - vmov.i8 q9, #0x0f @ Lk_s0F - vld1.64 {q10,q11}, [r10]! @ Lk_inv - add r10, r10, #64 @ Skip Lk_ipt, Lk_sbo - vld1.64 {q12,q13}, [r10]! @ Lk_sb1 - vld1.64 {q14,q15}, [r10] @ Lk_sb2 - bx lr - -@@ -@@ _aes_encrypt_core -@@ -@@ AES-encrypt q0. -@@ -@@ Inputs: -@@ q0 = input -@@ q9-q15 as in _vpaes_preheat -@@ [r2] = scheduled keys -@@ -@@ Output in q0 -@@ Clobbers q1-q5, r8-r11 -@@ Preserves q6-q8 so you get some local vectors -@@ -@@ -#ifdef __thumb2__ -.thumb_func _vpaes_encrypt_core -#endif -.align 4 -_vpaes_encrypt_core: - mov r9, r2 - ldr r8, [r2,#240] @ pull rounds - adr r11, Lk_ipt - @ vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo - @ vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi - vld1.64 {q2, q3}, [r11] - adr r11, Lk_mc_forward+16 - vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5 # round0 key - vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 - vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 - vtbl.8 d2, {q2}, d2 @ vpshufb %xmm1, %xmm2, %xmm1 - vtbl.8 d3, {q2}, d3 - vtbl.8 d4, {q3}, d0 @ vpshufb %xmm0, %xmm3, %xmm2 - vtbl.8 d5, {q3}, d1 - veor q0, q1, q5 @ vpxor %xmm5, %xmm1, %xmm0 - veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0 - - @ .Lenc_entry ends with a bnz instruction which is normally paired with - @ subs in .Lenc_loop. - tst r8, r8 - b Lenc_entry - -.align 4 -Lenc_loop: - @ middle of middle round - add r10, r11, #0x40 - vtbl.8 d8, {q13}, d4 @ vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u - vtbl.8 d9, {q13}, d5 - vld1.64 {q1}, [r11]! 
@ vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[] - vtbl.8 d0, {q12}, d6 @ vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t - vtbl.8 d1, {q12}, d7 - veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k - vtbl.8 d10, {q15}, d4 @ vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u - vtbl.8 d11, {q15}, d5 - veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A - vtbl.8 d4, {q14}, d6 @ vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t - vtbl.8 d5, {q14}, d7 - vld1.64 {q4}, [r10] @ vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[] - vtbl.8 d6, {q0}, d2 @ vpshufb %xmm1, %xmm0, %xmm3 # 0 = B - vtbl.8 d7, {q0}, d3 - veor q2, q2, q5 @ vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A - @ Write to q5 instead of q0, so the table and destination registers do - @ not overlap. - vtbl.8 d10, {q0}, d8 @ vpshufb %xmm4, %xmm0, %xmm0 # 3 = D - vtbl.8 d11, {q0}, d9 - veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B - vtbl.8 d8, {q3}, d2 @ vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C - vtbl.8 d9, {q3}, d3 - @ Here we restore the original q0/q5 usage. - veor q0, q5, q3 @ vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D - and r11, r11, #~(1<<6) @ and $0x30, %r11 # ... 
mod 4 - veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D - subs r8, r8, #1 @ nr-- - -Lenc_entry: - @ top of round - vand q1, q0, q9 @ vpand %xmm0, %xmm9, %xmm1 # 0 = k - vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i - vtbl.8 d10, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k - vtbl.8 d11, {q11}, d3 - veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j - vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i - vtbl.8 d7, {q10}, d1 - vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j - vtbl.8 d9, {q10}, d3 - veor q3, q3, q5 @ vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k - veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k - vtbl.8 d4, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak - vtbl.8 d5, {q10}, d7 - vtbl.8 d6, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak - vtbl.8 d7, {q10}, d9 - veor q2, q2, q1 @ vpxor %xmm1, %xmm2, %xmm2 # 2 = io - veor q3, q3, q0 @ vpxor %xmm0, %xmm3, %xmm3 # 3 = jo - vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5 - bne Lenc_loop - - @ middle of last round - add r10, r11, #0x80 - - adr r11, Lk_sbo - @ Read to q1 instead of q4, so the vtbl.8 instruction below does not - @ overlap table and destination registers. - vld1.64 {q1}, [r11]! @ vmovdqa -0x60(%r10), %xmm4 # 3 : sbou - vld1.64 {q0}, [r11] @ vmovdqa -0x50(%r10), %xmm0 # 0 : sbot Lk_sbo+16 - vtbl.8 d8, {q1}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou - vtbl.8 d9, {q1}, d5 - vld1.64 {q1}, [r10] @ vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[] - @ Write to q2 instead of q0 below, to avoid overlapping table and - @ destination registers. - vtbl.8 d4, {q0}, d6 @ vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t - vtbl.8 d5, {q0}, d7 - veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k - veor q2, q2, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A - @ Here we restore the original q0/q2 usage. 
- vtbl.8 d0, {q2}, d2 @ vpshufb %xmm1, %xmm0, %xmm0 - vtbl.8 d1, {q2}, d3 - bx lr - - -.globl _vpaes_encrypt -.private_extern _vpaes_encrypt -#ifdef __thumb2__ -.thumb_func _vpaes_encrypt -#endif -.align 4 -_vpaes_encrypt: - @ _vpaes_encrypt_core uses r8-r11. Round up to r7-r11 to maintain stack - @ alignment. - stmdb sp!, {r7,r8,r9,r10,r11,lr} - @ _vpaes_encrypt_core uses q4-q5 (d8-d11), which are callee-saved. - vstmdb sp!, {d8,d9,d10,d11} - - vld1.64 {q0}, [r0] - bl _vpaes_preheat - bl _vpaes_encrypt_core - vst1.64 {q0}, [r1] - - vldmia sp!, {d8,d9,d10,d11} - ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return - - -@ -@ Decryption stuff -@ - -.align 4 -_vpaes_decrypt_consts: -Lk_dipt:@ decryption input transform -.quad 0x0F505B040B545F00, 0x154A411E114E451A -.quad 0x86E383E660056500, 0x12771772F491F194 -Lk_dsbo:@ decryption sbox final output -.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D -.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C -Lk_dsb9:@ decryption sbox output *9*u, *9*t -.quad 0x851C03539A86D600, 0xCAD51F504F994CC9 -.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 -Lk_dsbd:@ decryption sbox output *D*u, *D*t -.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 -.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 -Lk_dsbb:@ decryption sbox output *B*u, *B*t -.quad 0xD022649296B44200, 0x602646F6B0F2D404 -.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B -Lk_dsbe:@ decryption sbox output *E*u, *E*t -.quad 0x46F2929626D4D000, 0x2242600464B4F6B0 -.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 - - -@@ -@@ Decryption core -@@ -@@ Same API as encryption core, except it clobbers q12-q15 rather than using -@@ the values from _vpaes_preheat. q9-q11 must still be set from -@@ _vpaes_preheat. -@@ -#ifdef __thumb2__ -.thumb_func _vpaes_decrypt_core -#endif -.align 4 -_vpaes_decrypt_core: - mov r9, r2 - ldr r8, [r2,#240] @ pull rounds - - @ This function performs shuffles with various constants. The x86_64 - @ version loads them on-demand into %xmm0-%xmm5. 
This does not work well - @ for ARMv7 because those registers are shuffle destinations. The ARMv8 - @ version preloads those constants into registers, but ARMv7 has half - @ the registers to work with. Instead, we load them on-demand into - @ q12-q15, registers normally use for preloaded constants. This is fine - @ because decryption doesn't use those constants. The values are - @ constant, so this does not interfere with potential 2x optimizations. - adr r7, Lk_dipt - - vld1.64 {q12,q13}, [r7] @ vmovdqa Lk_dipt(%rip), %xmm2 # iptlo - lsl r11, r8, #4 @ mov %rax, %r11; shl $4, %r11 - eor r11, r11, #0x30 @ xor $0x30, %r11 - adr r10, Lk_sr - and r11, r11, #0x30 @ and $0x30, %r11 - add r11, r11, r10 - adr r10, Lk_mc_forward+48 - - vld1.64 {q4}, [r9]! @ vmovdqu (%r9), %xmm4 # round0 key - vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 - vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 - vtbl.8 d4, {q12}, d2 @ vpshufb %xmm1, %xmm2, %xmm2 - vtbl.8 d5, {q12}, d3 - vld1.64 {q5}, [r10] @ vmovdqa Lk_mc_forward+48(%rip), %xmm5 - @ vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi - vtbl.8 d0, {q13}, d0 @ vpshufb %xmm0, %xmm1, %xmm0 - vtbl.8 d1, {q13}, d1 - veor q2, q2, q4 @ vpxor %xmm4, %xmm2, %xmm2 - veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0 - - @ .Ldec_entry ends with a bnz instruction which is normally paired with - @ subs in .Ldec_loop. - tst r8, r8 - b Ldec_entry - -.align 4 -Ldec_loop: -@ -@ Inverse mix columns -@ - - @ We load .Lk_dsb* into q12-q15 on-demand. See the comment at the top of - @ the function. - adr r10, Lk_dsb9 - vld1.64 {q12,q13}, [r10]! @ vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u - @ vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t - @ Load sbd* ahead of time. - vld1.64 {q14,q15}, [r10]! 
@ vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu - @ vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt - vtbl.8 d8, {q12}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u - vtbl.8 d9, {q12}, d5 - vtbl.8 d2, {q13}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t - vtbl.8 d3, {q13}, d7 - veor q0, q4, q0 @ vpxor %xmm4, %xmm0, %xmm0 - - veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch - - @ Load sbb* ahead of time. - vld1.64 {q12,q13}, [r10]! @ vmovdqa 0x20(%r10),%xmm4 # 4 : sbbu - @ vmovdqa 0x30(%r10),%xmm1 # 0 : sbbt - - vtbl.8 d8, {q14}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu - vtbl.8 d9, {q14}, d5 - @ Write to q1 instead of q0, so the table and destination registers do - @ not overlap. - vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch - vtbl.8 d3, {q0}, d11 - @ Here we restore the original q0/q1 usage. This instruction is - @ reordered from the ARMv8 version so we do not clobber the vtbl.8 - @ below. - veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch - vtbl.8 d2, {q15}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt - vtbl.8 d3, {q15}, d7 - @ vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu - veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch - @ vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt - - @ Load sbd* ahead of time. - vld1.64 {q14,q15}, [r10]! @ vmovdqa 0x40(%r10),%xmm4 # 4 : sbeu - @ vmovdqa 0x50(%r10),%xmm1 # 0 : sbet - - vtbl.8 d8, {q12}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu - vtbl.8 d9, {q12}, d5 - @ Write to q1 instead of q0, so the table and destination registers do - @ not overlap. - vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch - vtbl.8 d3, {q0}, d11 - @ Here we restore the original q0/q1 usage. This instruction is - @ reordered from the ARMv8 version so we do not clobber the vtbl.8 - @ below. 
- veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch - vtbl.8 d2, {q13}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt - vtbl.8 d3, {q13}, d7 - veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch - - vtbl.8 d8, {q14}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu - vtbl.8 d9, {q14}, d5 - @ Write to q1 instead of q0, so the table and destination registers do - @ not overlap. - vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch - vtbl.8 d3, {q0}, d11 - @ Here we restore the original q0/q1 usage. This instruction is - @ reordered from the ARMv8 version so we do not clobber the vtbl.8 - @ below. - veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch - vtbl.8 d2, {q15}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet - vtbl.8 d3, {q15}, d7 - vext.8 q5, q5, q5, #12 @ vpalignr $12, %xmm5, %xmm5, %xmm5 - veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch - subs r8, r8, #1 @ sub $1,%rax # nr-- - -Ldec_entry: - @ top of round - vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 # 0 = k - vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i - vtbl.8 d4, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k - vtbl.8 d5, {q11}, d3 - veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j - vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i - vtbl.8 d7, {q10}, d1 - vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j - vtbl.8 d9, {q10}, d3 - veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k - veor q4, q4, q2 @ vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k - vtbl.8 d4, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak - vtbl.8 d5, {q10}, d7 - vtbl.8 d6, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak - vtbl.8 d7, {q10}, d9 - veor q2, q2, q1 @ vpxor %xmm1, %xmm2, %xmm2 # 2 = io - veor q3, q3, q0 @ vpxor %xmm0, %xmm3, %xmm3 # 3 = jo - vld1.64 {q0}, [r9]! @ vmovdqu (%r9), %xmm0 - bne Ldec_loop - - @ middle of last round - - adr r10, Lk_dsbo - - @ Write to q1 rather than q4 to avoid overlapping table and destination. - vld1.64 {q1}, [r10]! 
@ vmovdqa 0x60(%r10), %xmm4 # 3 : sbou - vtbl.8 d8, {q1}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou - vtbl.8 d9, {q1}, d5 - @ Write to q2 rather than q1 to avoid overlapping table and destination. - vld1.64 {q2}, [r10] @ vmovdqa 0x70(%r10), %xmm1 # 0 : sbot - vtbl.8 d2, {q2}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t - vtbl.8 d3, {q2}, d7 - vld1.64 {q2}, [r11] @ vmovdqa -0x160(%r11), %xmm2 # Lk_sr-Lk_dsbd=-0x160 - veor q4, q4, q0 @ vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k - @ Write to q1 rather than q0 so the table and destination registers - @ below do not overlap. - veor q1, q1, q4 @ vpxor %xmm4, %xmm1, %xmm0 # 0 = A - vtbl.8 d0, {q1}, d4 @ vpshufb %xmm2, %xmm0, %xmm0 - vtbl.8 d1, {q1}, d5 - bx lr - - -.globl _vpaes_decrypt -.private_extern _vpaes_decrypt -#ifdef __thumb2__ -.thumb_func _vpaes_decrypt -#endif -.align 4 -_vpaes_decrypt: - @ _vpaes_decrypt_core uses r7-r11. - stmdb sp!, {r7,r8,r9,r10,r11,lr} - @ _vpaes_decrypt_core uses q4-q5 (d8-d11), which are callee-saved. - vstmdb sp!, {d8,d9,d10,d11} - - vld1.64 {q0}, [r0] - bl _vpaes_preheat - bl _vpaes_decrypt_core - vst1.64 {q0}, [r1] - - vldmia sp!, {d8,d9,d10,d11} - ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return - -@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ -@@ @@ -@@ AES key schedule @@ -@@ @@ -@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - -@ This function diverges from both x86_64 and armv7 in which constants are -@ pinned. x86_64 has a common preheat function for all operations. aarch64 -@ separates them because it has enough registers to pin nearly all constants. -@ armv7 does not have enough registers, but needing explicit loads and stores -@ also complicates using x86_64's register allocation directly. -@ -@ We pin some constants for convenience and leave q14 and q15 free to load -@ others on demand. 
- -@ -@ Key schedule constants -@ - -.align 4 -_vpaes_key_consts: -Lk_dksd:@ decryption key schedule: invskew x*D -.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 -.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E -Lk_dksb:@ decryption key schedule: invskew x*B -.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 -.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 -Lk_dkse:@ decryption key schedule: invskew x*E + 0x63 -.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 -.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 -Lk_dks9:@ decryption key schedule: invskew x*9 -.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC -.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE - -Lk_rcon:@ rcon -.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 - -Lk_opt:@ output transform -.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 -.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 -Lk_deskew:@ deskew tables: inverts the sbox's "skew" -.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A -.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 - - -#ifdef __thumb2__ -.thumb_func _vpaes_key_preheat -#endif -.align 4 -_vpaes_key_preheat: - adr r11, Lk_rcon - vmov.i8 q12, #0x5b @ Lk_s63 - adr r10, Lk_inv @ Must be aligned to 8 mod 16. - vmov.i8 q9, #0x0f @ Lk_s0F - vld1.64 {q10,q11}, [r10] @ Lk_inv - vld1.64 {q8}, [r11] @ Lk_rcon - bx lr - - -#ifdef __thumb2__ -.thumb_func _vpaes_schedule_core -#endif -.align 4 -_vpaes_schedule_core: - @ We only need to save lr, but ARM requires an 8-byte stack alignment, - @ so save an extra register. - stmdb sp!, {r3,lr} - - bl _vpaes_key_preheat @ load the tables - - adr r11, Lk_ipt @ Must be aligned to 8 mod 16. - vld1.64 {q0}, [r0]! @ vmovdqu (%rdi), %xmm0 # load key (unaligned) - - @ input transform - @ Use q4 here rather than q3 so .Lschedule_am_decrypting does not - @ overlap table and destination. - vmov q4, q0 @ vmovdqa %xmm0, %xmm3 - bl _vpaes_schedule_transform - adr r10, Lk_sr @ Must be aligned to 8 mod 16. 
- vmov q7, q0 @ vmovdqa %xmm0, %xmm7 - - add r8, r8, r10 - tst r3, r3 - bne Lschedule_am_decrypting - - @ encrypting, output zeroth round key after transform - vst1.64 {q0}, [r2] @ vmovdqu %xmm0, (%rdx) - b Lschedule_go - -Lschedule_am_decrypting: - @ decrypting, output zeroth round key after shiftrows - vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1 - vtbl.8 d6, {q4}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 - vtbl.8 d7, {q4}, d3 - vst1.64 {q3}, [r2] @ vmovdqu %xmm3, (%rdx) - eor r8, r8, #0x30 @ xor $0x30, %r8 - -Lschedule_go: - cmp r1, #192 @ cmp $192, %esi - bhi Lschedule_256 - beq Lschedule_192 - @ 128: fall though - -@@ -@@ .schedule_128 -@@ -@@ 128-bit specific part of key schedule. -@@ -@@ This schedule is really simple, because all its parts -@@ are accomplished by the subroutines. -@@ -Lschedule_128: - mov r0, #10 @ mov $10, %esi - -Loop_schedule_128: - bl _vpaes_schedule_round - subs r0, r0, #1 @ dec %esi - beq Lschedule_mangle_last - bl _vpaes_schedule_mangle @ write output - b Loop_schedule_128 - -@@ -@@ .aes_schedule_192 -@@ -@@ 192-bit specific part of key schedule. -@@ -@@ The main body of this schedule is the same as the 128-bit -@@ schedule, but with more smearing. The long, high side is -@@ stored in q7 as before, and the short, low side is in -@@ the high bits of q6. -@@ -@@ This schedule is somewhat nastier, however, because each -@@ round produces 192 bits of key material, or 1.5 round keys. -@@ Therefore, on each cycle we do 2 rounds and produce 3 round -@@ keys. 
-@@ -.align 4 -Lschedule_192: - sub r0, r0, #8 - vld1.64 {q0}, [r0] @ vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) - bl _vpaes_schedule_transform @ input transform - vmov q6, q0 @ vmovdqa %xmm0, %xmm6 # save short part - vmov.i8 d12, #0 @ vpxor %xmm4, %xmm4, %xmm4 # clear 4 - @ vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros - mov r0, #4 @ mov $4, %esi - -Loop_schedule_192: - bl _vpaes_schedule_round - vext.8 q0, q6, q0, #8 @ vpalignr $8,%xmm6,%xmm0,%xmm0 - bl _vpaes_schedule_mangle @ save key n - bl _vpaes_schedule_192_smear - bl _vpaes_schedule_mangle @ save key n+1 - bl _vpaes_schedule_round - subs r0, r0, #1 @ dec %esi - beq Lschedule_mangle_last - bl _vpaes_schedule_mangle @ save key n+2 - bl _vpaes_schedule_192_smear - b Loop_schedule_192 - -@@ -@@ .aes_schedule_256 -@@ -@@ 256-bit specific part of key schedule. -@@ -@@ The structure here is very similar to the 128-bit -@@ schedule, but with an additional "low side" in -@@ q6. The low side's rounds are the same as the -@@ high side's, except no rcon and no rotation. -@@ -.align 4 -Lschedule_256: - vld1.64 {q0}, [r0] @ vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) - bl _vpaes_schedule_transform @ input transform - mov r0, #7 @ mov $7, %esi - -Loop_schedule_256: - bl _vpaes_schedule_mangle @ output low result - vmov q6, q0 @ vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 - - @ high round - bl _vpaes_schedule_round - subs r0, r0, #1 @ dec %esi - beq Lschedule_mangle_last - bl _vpaes_schedule_mangle - - @ low round. swap xmm7 and xmm6 - vdup.32 q0, d1[1] @ vpshufd $0xFF, %xmm0, %xmm0 - vmov.i8 q4, #0 - vmov q5, q7 @ vmovdqa %xmm7, %xmm5 - vmov q7, q6 @ vmovdqa %xmm6, %xmm7 - bl _vpaes_schedule_low_round - vmov q7, q5 @ vmovdqa %xmm5, %xmm7 - - b Loop_schedule_256 - -@@ -@@ .aes_schedule_mangle_last -@@ -@@ Mangler for last round of key schedule -@@ Mangles q0 -@@ when encrypting, outputs out(q0) ^ 63 -@@ when decrypting, outputs unskew(q0) -@@ -@@ Always called right before return... 
jumps to cleanup and exits -@@ -.align 4 -Lschedule_mangle_last: - @ schedule last round key from xmm0 - adr r11, Lk_deskew @ lea Lk_deskew(%rip),%r11 # prepare to deskew - tst r3, r3 - bne Lschedule_mangle_last_dec - - @ encrypting - vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10),%xmm1 - adr r11, Lk_opt @ lea Lk_opt(%rip), %r11 # prepare to output transform - add r2, r2, #32 @ add $32, %rdx - vmov q2, q0 - vtbl.8 d0, {q2}, d2 @ vpshufb %xmm1, %xmm0, %xmm0 # output permute - vtbl.8 d1, {q2}, d3 - -Lschedule_mangle_last_dec: - sub r2, r2, #16 @ add $-16, %rdx - veor q0, q0, q12 @ vpxor Lk_s63(%rip), %xmm0, %xmm0 - bl _vpaes_schedule_transform @ output transform - vst1.64 {q0}, [r2] @ vmovdqu %xmm0, (%rdx) # save last key - - @ cleanup - veor q0, q0, q0 @ vpxor %xmm0, %xmm0, %xmm0 - veor q1, q1, q1 @ vpxor %xmm1, %xmm1, %xmm1 - veor q2, q2, q2 @ vpxor %xmm2, %xmm2, %xmm2 - veor q3, q3, q3 @ vpxor %xmm3, %xmm3, %xmm3 - veor q4, q4, q4 @ vpxor %xmm4, %xmm4, %xmm4 - veor q5, q5, q5 @ vpxor %xmm5, %xmm5, %xmm5 - veor q6, q6, q6 @ vpxor %xmm6, %xmm6, %xmm6 - veor q7, q7, q7 @ vpxor %xmm7, %xmm7, %xmm7 - ldmia sp!, {r3,pc} @ return - - -@@ -@@ .aes_schedule_192_smear -@@ -@@ Smear the short, low side in the 192-bit key schedule. 
-@@ -@@ Inputs: -@@ q7: high side, b a x y -@@ q6: low side, d c 0 0 -@@ -@@ Outputs: -@@ q6: b+c+d b+c 0 0 -@@ q0: b+c+d b+c b a -@@ -#ifdef __thumb2__ -.thumb_func _vpaes_schedule_192_smear -#endif -.align 4 -_vpaes_schedule_192_smear: - vmov.i8 q1, #0 - vdup.32 q0, d15[1] - vshl.i64 q1, q6, #32 @ vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 - vmov d0, d15 @ vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a - veor q6, q6, q1 @ vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 - veor q1, q1, q1 @ vpxor %xmm1, %xmm1, %xmm1 - veor q6, q6, q0 @ vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a - vmov q0, q6 @ vmovdqa %xmm6, %xmm0 - vmov d12, d2 @ vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros - bx lr - - -@@ -@@ .aes_schedule_round -@@ -@@ Runs one main round of the key schedule on q0, q7 -@@ -@@ Specifically, runs subbytes on the high dword of q0 -@@ then rotates it by one byte and xors into the low dword of -@@ q7. -@@ -@@ Adds rcon from low byte of q8, then rotates q8 for -@@ next rcon. -@@ -@@ Smears the dwords of q7 by xoring the low into the -@@ second low, result into third, result into highest. -@@ -@@ Returns results in q7 = q0. -@@ Clobbers q1-q4, r11. -@@ -#ifdef __thumb2__ -.thumb_func _vpaes_schedule_round -#endif -.align 4 -_vpaes_schedule_round: - @ extract rcon from xmm8 - vmov.i8 q4, #0 @ vpxor %xmm4, %xmm4, %xmm4 - vext.8 q1, q8, q4, #15 @ vpalignr $15, %xmm8, %xmm4, %xmm1 - vext.8 q8, q8, q8, #15 @ vpalignr $15, %xmm8, %xmm8, %xmm8 - veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7 - - @ rotate - vdup.32 q0, d1[1] @ vpshufd $0xFF, %xmm0, %xmm0 - vext.8 q0, q0, q0, #1 @ vpalignr $1, %xmm0, %xmm0, %xmm0 - - @ fall through... - - @ low round: same as high round, but no rotation and no rcon. -_vpaes_schedule_low_round: - @ The x86_64 version pins .Lk_sb1 in %xmm13 and .Lk_sb1+16 in %xmm12. - @ We pin other values in _vpaes_key_preheat, so load them now. 
- adr r11, Lk_sb1 - vld1.64 {q14,q15}, [r11] - - @ smear xmm7 - vext.8 q1, q4, q7, #12 @ vpslldq $4, %xmm7, %xmm1 - veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7 - vext.8 q4, q4, q7, #8 @ vpslldq $8, %xmm7, %xmm4 - - @ subbytes - vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 # 0 = k - vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i - veor q7, q7, q4 @ vpxor %xmm4, %xmm7, %xmm7 - vtbl.8 d4, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k - vtbl.8 d5, {q11}, d3 - veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j - vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i - vtbl.8 d7, {q10}, d1 - veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k - vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j - vtbl.8 d9, {q10}, d3 - veor q7, q7, q12 @ vpxor Lk_s63(%rip), %xmm7, %xmm7 - vtbl.8 d6, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak - vtbl.8 d7, {q10}, d7 - veor q4, q4, q2 @ vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k - vtbl.8 d4, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak - vtbl.8 d5, {q10}, d9 - veor q3, q3, q1 @ vpxor %xmm1, %xmm3, %xmm3 # 2 = io - veor q2, q2, q0 @ vpxor %xmm0, %xmm2, %xmm2 # 3 = jo - vtbl.8 d8, {q15}, d6 @ vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou - vtbl.8 d9, {q15}, d7 - vtbl.8 d2, {q14}, d4 @ vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t - vtbl.8 d3, {q14}, d5 - veor q1, q1, q4 @ vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output - - @ add in smeared stuff - veor q0, q1, q7 @ vpxor %xmm7, %xmm1, %xmm0 - veor q7, q1, q7 @ vmovdqa %xmm0, %xmm7 - bx lr - - -@@ -@@ .aes_schedule_transform -@@ -@@ Linear-transform q0 according to tables at [r11] -@@ -@@ Requires that q9 = 0x0F0F... 
as in preheat -@@ Output in q0 -@@ Clobbers q1, q2, q14, q15 -@@ -#ifdef __thumb2__ -.thumb_func _vpaes_schedule_transform -#endif -.align 4 -_vpaes_schedule_transform: - vld1.64 {q14,q15}, [r11] @ vmovdqa (%r11), %xmm2 # lo - @ vmovdqa 16(%r11), %xmm1 # hi - vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 - vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 - vtbl.8 d4, {q14}, d2 @ vpshufb %xmm1, %xmm2, %xmm2 - vtbl.8 d5, {q14}, d3 - vtbl.8 d0, {q15}, d0 @ vpshufb %xmm0, %xmm1, %xmm0 - vtbl.8 d1, {q15}, d1 - veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0 - bx lr - - -@@ -@@ .aes_schedule_mangle -@@ -@@ Mangles q0 from (basis-transformed) standard version -@@ to our version. -@@ -@@ On encrypt, -@@ xor with 0x63 -@@ multiply by circulant 0,1,1,1 -@@ apply shiftrows transform -@@ -@@ On decrypt, -@@ xor with 0x63 -@@ multiply by "inverse mixcolumns" circulant E,B,D,9 -@@ deskew -@@ apply shiftrows transform -@@ -@@ -@@ Writes out to [r2], and increments or decrements it -@@ Keeps track of round number mod 4 in r8 -@@ Preserves q0 -@@ Clobbers q1-q5 -@@ -#ifdef __thumb2__ -.thumb_func _vpaes_schedule_mangle -#endif -.align 4 -_vpaes_schedule_mangle: - tst r3, r3 - vmov q4, q0 @ vmovdqa %xmm0, %xmm4 # save xmm0 for later - adr r11, Lk_mc_forward @ Must be aligned to 8 mod 16. - vld1.64 {q5}, [r11] @ vmovdqa Lk_mc_forward(%rip),%xmm5 - bne Lschedule_mangle_dec - - @ encrypting - @ Write to q2 so we do not overlap table and destination below. 
- veor q2, q0, q12 @ vpxor Lk_s63(%rip), %xmm0, %xmm4 - add r2, r2, #16 @ add $16, %rdx - vtbl.8 d8, {q2}, d10 @ vpshufb %xmm5, %xmm4, %xmm4 - vtbl.8 d9, {q2}, d11 - vtbl.8 d2, {q4}, d10 @ vpshufb %xmm5, %xmm4, %xmm1 - vtbl.8 d3, {q4}, d11 - vtbl.8 d6, {q1}, d10 @ vpshufb %xmm5, %xmm1, %xmm3 - vtbl.8 d7, {q1}, d11 - veor q4, q4, q1 @ vpxor %xmm1, %xmm4, %xmm4 - vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1 - veor q3, q3, q4 @ vpxor %xmm4, %xmm3, %xmm3 - - b Lschedule_mangle_both -.align 4 -Lschedule_mangle_dec: - @ inverse mix columns - adr r11, Lk_dksd @ lea Lk_dksd(%rip),%r11 - vshr.u8 q1, q4, #4 @ vpsrlb $4, %xmm4, %xmm1 # 1 = hi - vand q4, q4, q9 @ vpand %xmm9, %xmm4, %xmm4 # 4 = lo - - vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x00(%r11), %xmm2 - @ vmovdqa 0x10(%r11), %xmm3 - vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2 - vtbl.8 d5, {q14}, d9 - vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 - vtbl.8 d7, {q15}, d3 - @ Load .Lk_dksb ahead of time. - vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x20(%r11), %xmm2 - @ vmovdqa 0x30(%r11), %xmm3 - @ Write to q13 so we do not overlap table and destination. - veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 - vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3 - vtbl.8 d7, {q13}, d11 - - vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2 - vtbl.8 d5, {q14}, d9 - veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2 - vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 - vtbl.8 d7, {q15}, d3 - @ Load .Lk_dkse ahead of time. - vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x40(%r11), %xmm2 - @ vmovdqa 0x50(%r11), %xmm3 - @ Write to q13 so we do not overlap table and destination. - veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 - vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3 - vtbl.8 d7, {q13}, d11 - - vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2 - vtbl.8 d5, {q14}, d9 - veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2 - vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 - vtbl.8 d7, {q15}, d3 - @ Load .Lk_dkse ahead of time. 
- vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x60(%r11), %xmm2 - @ vmovdqa 0x70(%r11), %xmm4 - @ Write to q13 so we do not overlap table and destination. - veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 - - vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2 - vtbl.8 d5, {q14}, d9 - vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3 - vtbl.8 d7, {q13}, d11 - vtbl.8 d8, {q15}, d2 @ vpshufb %xmm1, %xmm4, %xmm4 - vtbl.8 d9, {q15}, d3 - vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1 - veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2 - veor q3, q4, q2 @ vpxor %xmm2, %xmm4, %xmm3 - - sub r2, r2, #16 @ add $-16, %rdx - -Lschedule_mangle_both: - @ Write to q2 so table and destination do not overlap. - vtbl.8 d4, {q3}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 - vtbl.8 d5, {q3}, d3 - add r8, r8, #64-16 @ add $-16, %r8 - and r8, r8, #~(1<<6) @ and $0x30, %r8 - vst1.64 {q2}, [r2] @ vmovdqu %xmm3, (%rdx) - bx lr - - -.globl _vpaes_set_encrypt_key -.private_extern _vpaes_set_encrypt_key -#ifdef __thumb2__ -.thumb_func _vpaes_set_encrypt_key -#endif -.align 4 -_vpaes_set_encrypt_key: - stmdb sp!, {r7,r8,r9,r10,r11, lr} - vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15} - - lsr r9, r1, #5 @ shr $5,%eax - add r9, r9, #5 @ $5,%eax - str r9, [r2,#240] @ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; - - mov r3, #0 @ mov $0,%ecx - mov r8, #0x30 @ mov $0x30,%r8d - bl _vpaes_schedule_core - eor r0, r0, r0 - - vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15} - ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return - - -.globl _vpaes_set_decrypt_key -.private_extern _vpaes_set_decrypt_key -#ifdef __thumb2__ -.thumb_func _vpaes_set_decrypt_key -#endif -.align 4 -_vpaes_set_decrypt_key: - stmdb sp!, {r7,r8,r9,r10,r11, lr} - vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15} - - lsr r9, r1, #5 @ shr $5,%eax - add r9, r9, #5 @ $5,%eax - str r9, [r2,#240] @ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; - lsl r9, r9, #4 @ shl $4,%eax - add r2, r2, #16 @ lea 16(%rdx,%rax),%rdx - add r2, r2, r9 - - mov r3, #1 @ mov $1,%ecx - lsr r8, r1, 
#1 @ shr $1,%r8d - and r8, r8, #32 @ and $32,%r8d - eor r8, r8, #32 @ xor $32,%r8d # nbits==192?0:32 - bl _vpaes_schedule_core - - vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15} - ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return - - -@ Additional constants for converting to bsaes. - -.align 4 -_vpaes_convert_consts: -@ .Lk_opt_then_skew applies skew(opt(x)) XOR 0x63, where skew is the linear -@ transform in the AES S-box. 0x63 is incorporated into the low half of the -@ table. This was computed with the following script: -@ -@ def u64s_to_u128(x, y): -@ return x | (y << 64) -@ def u128_to_u64s(w): -@ return w & ((1<<64)-1), w >> 64 -@ def get_byte(w, i): -@ return (w >> (i*8)) & 0xff -@ def apply_table(table, b): -@ lo = b & 0xf -@ hi = b >> 4 -@ return get_byte(table[0], lo) ^ get_byte(table[1], hi) -@ def opt(b): -@ table = [ -@ u64s_to_u128(0xFF9F4929D6B66000, 0xF7974121DEBE6808), -@ u64s_to_u128(0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0), -@ ] -@ return apply_table(table, b) -@ def rot_byte(b, n): -@ return 0xff & ((b << n) | (b >> (8-n))) -@ def skew(x): -@ return (x ^ rot_byte(x, 1) ^ rot_byte(x, 2) ^ rot_byte(x, 3) ^ -@ rot_byte(x, 4)) -@ table = [0, 0] -@ for i in range(16): -@ table[0] |= (skew(opt(i)) ^ 0x63) << (i*8) -@ table[1] |= skew(opt(i<<4)) << (i*8) -@ print(" .quad 0x%016x, 0x%016x" % u128_to_u64s(table[0])) -@ print(" .quad 0x%016x, 0x%016x" % u128_to_u64s(table[1])) -Lk_opt_then_skew: -.quad 0x9cb8436798bc4763, 0x6440bb9f6044bf9b -.quad 0x1f30062936192f00, 0xb49bad829db284ab - -@ .Lk_decrypt_transform is a permutation which performs an 8-bit left-rotation -@ followed by a byte-swap on each 32-bit word of a vector. E.g., 0x11223344 -@ becomes 0x22334411 and then 0x11443322. 
-Lk_decrypt_transform: -.quad 0x0704050603000102, 0x0f0c0d0e0b08090a - - -@ void vpaes_encrypt_key_to_bsaes(AES_KEY *bsaes, const AES_KEY *vpaes); -.globl _vpaes_encrypt_key_to_bsaes -.private_extern _vpaes_encrypt_key_to_bsaes -#ifdef __thumb2__ -.thumb_func _vpaes_encrypt_key_to_bsaes -#endif -.align 4 -_vpaes_encrypt_key_to_bsaes: - stmdb sp!, {r11, lr} - - @ See _vpaes_schedule_core for the key schedule logic. In particular, - @ _vpaes_schedule_transform(.Lk_ipt) (section 2.2 of the paper), - @ _vpaes_schedule_mangle (section 4.3), and .Lschedule_mangle_last - @ contain the transformations not in the bsaes representation. This - @ function inverts those transforms. - @ - @ Note also that bsaes-armv7.pl expects aes-armv4.pl's key - @ representation, which does not match the other aes_nohw_* - @ implementations. The ARM aes_nohw_* stores each 32-bit word - @ byteswapped, as a convenience for (unsupported) big-endian ARM, at the - @ cost of extra REV and VREV32 operations in little-endian ARM. - - vmov.i8 q9, #0x0f @ Required by _vpaes_schedule_transform - adr r2, Lk_mc_forward @ Must be aligned to 8 mod 16. - add r3, r2, 0x90 @ Lk_sr+0x10-Lk_mc_forward = 0x90 (Apple's toolchain doesn't support the expression) - - vld1.64 {q12}, [r2] - vmov.i8 q10, #0x5b @ Lk_s63 from vpaes-x86_64 - adr r11, Lk_opt @ Must be aligned to 8 mod 16. - vmov.i8 q11, #0x63 @ LK_s63 without Lk_ipt applied - - @ vpaes stores one fewer round count than bsaes, but the number of keys - @ is the same. - ldr r2, [r1,#240] - add r2, r2, #1 - str r2, [r0,#240] - - @ The first key is transformed with _vpaes_schedule_transform(.Lk_ipt). - @ Invert this with .Lk_opt. - vld1.64 {q0}, [r1]! - bl _vpaes_schedule_transform - vrev32.8 q0, q0 - vst1.64 {q0}, [r0]! - - @ The middle keys have _vpaes_schedule_transform(.Lk_ipt) applied, - @ followed by _vpaes_schedule_mangle. _vpaes_schedule_mangle XORs 0x63, - @ multiplies by the circulant 0,1,1,1, then applies ShiftRows. 
-Loop_enc_key_to_bsaes: - vld1.64 {q0}, [r1]! - - @ Invert the ShiftRows step (see .Lschedule_mangle_both). Note we cycle - @ r3 in the opposite direction and start at .Lk_sr+0x10 instead of 0x30. - @ We use r3 rather than r8 to avoid a callee-saved register. - vld1.64 {q1}, [r3] - vtbl.8 d4, {q0}, d2 - vtbl.8 d5, {q0}, d3 - add r3, r3, #16 - and r3, r3, #~(1<<6) - vmov q0, q2 - - @ Handle the last key differently. - subs r2, r2, #1 - beq Loop_enc_key_to_bsaes_last - - @ Multiply by the circulant. This is its own inverse. - vtbl.8 d2, {q0}, d24 - vtbl.8 d3, {q0}, d25 - vmov q0, q1 - vtbl.8 d4, {q1}, d24 - vtbl.8 d5, {q1}, d25 - veor q0, q0, q2 - vtbl.8 d2, {q2}, d24 - vtbl.8 d3, {q2}, d25 - veor q0, q0, q1 - - @ XOR and finish. - veor q0, q0, q10 - bl _vpaes_schedule_transform - vrev32.8 q0, q0 - vst1.64 {q0}, [r0]! - b Loop_enc_key_to_bsaes - -Loop_enc_key_to_bsaes_last: - @ The final key does not have a basis transform (note - @ .Lschedule_mangle_last inverts the original transform). It only XORs - @ 0x63 and applies ShiftRows. The latter was already inverted in the - @ loop. Note that, because we act on the original representation, we use - @ q11, not q10. - veor q0, q0, q11 - vrev32.8 q0, q0 - vst1.64 {q0}, [r0] - - @ Wipe registers which contained key material. - veor q0, q0, q0 - veor q1, q1, q1 - veor q2, q2, q2 - - ldmia sp!, {r11, pc} @ return - - -@ void vpaes_decrypt_key_to_bsaes(AES_KEY *vpaes, const AES_KEY *bsaes); -.globl _vpaes_decrypt_key_to_bsaes -.private_extern _vpaes_decrypt_key_to_bsaes -#ifdef __thumb2__ -.thumb_func _vpaes_decrypt_key_to_bsaes -#endif -.align 4 -_vpaes_decrypt_key_to_bsaes: - stmdb sp!, {r11, lr} - - @ See _vpaes_schedule_core for the key schedule logic. Note vpaes - @ computes the decryption key schedule in reverse. Additionally, - @ aes-x86_64.pl shares some transformations, so we must only partially - @ invert vpaes's transformations. 
In general, vpaes computes in a - @ different basis (.Lk_ipt and .Lk_opt) and applies the inverses of - @ MixColumns, ShiftRows, and the affine part of the AES S-box (which is - @ split into a linear skew and XOR of 0x63). We undo all but MixColumns. - @ - @ Note also that bsaes-armv7.pl expects aes-armv4.pl's key - @ representation, which does not match the other aes_nohw_* - @ implementations. The ARM aes_nohw_* stores each 32-bit word - @ byteswapped, as a convenience for (unsupported) big-endian ARM, at the - @ cost of extra REV and VREV32 operations in little-endian ARM. - - adr r2, Lk_decrypt_transform - adr r3, Lk_sr+0x30 - adr r11, Lk_opt_then_skew @ Input to _vpaes_schedule_transform. - vld1.64 {q12}, [r2] @ Reuse q12 from encryption. - vmov.i8 q9, #0x0f @ Required by _vpaes_schedule_transform - - @ vpaes stores one fewer round count than bsaes, but the number of keys - @ is the same. - ldr r2, [r1,#240] - add r2, r2, #1 - str r2, [r0,#240] - - @ Undo the basis change and reapply the S-box affine transform. See - @ .Lschedule_mangle_last. - vld1.64 {q0}, [r1]! - bl _vpaes_schedule_transform - vrev32.8 q0, q0 - vst1.64 {q0}, [r0]! - - @ See _vpaes_schedule_mangle for the transform on the middle keys. Note - @ it simultaneously inverts MixColumns and the S-box affine transform. - @ See .Lk_dksd through .Lk_dks9. -Loop_dec_key_to_bsaes: - vld1.64 {q0}, [r1]! - - @ Invert the ShiftRows step (see .Lschedule_mangle_both). Note going - @ forwards cancels inverting for which direction we cycle r3. We use r3 - @ rather than r8 to avoid a callee-saved register. - vld1.64 {q1}, [r3] - vtbl.8 d4, {q0}, d2 - vtbl.8 d5, {q0}, d3 - add r3, r3, #64-16 - and r3, r3, #~(1<<6) - vmov q0, q2 - - @ Handle the last key differently. - subs r2, r2, #1 - beq Loop_dec_key_to_bsaes_last - - @ Undo the basis change and reapply the S-box affine transform. - bl _vpaes_schedule_transform - - @ Rotate each word by 8 bytes (cycle the rows) and then byte-swap. 
We - @ combine the two operations in .Lk_decrypt_transform. - @ - @ TODO(davidben): Where does the rotation come from? - vtbl.8 d2, {q0}, d24 - vtbl.8 d3, {q0}, d25 - - vst1.64 {q1}, [r0]! - b Loop_dec_key_to_bsaes - -Loop_dec_key_to_bsaes_last: - @ The final key only inverts ShiftRows (already done in the loop). See - @ .Lschedule_am_decrypting. Its basis is not transformed. - vrev32.8 q0, q0 - vst1.64 {q0}, [r0]! - - @ Wipe registers which contained key material. - veor q0, q0, q0 - veor q1, q1, q1 - veor q2, q2, q2 - - ldmia sp!, {r11, pc} @ return - -.globl _vpaes_ctr32_encrypt_blocks -.private_extern _vpaes_ctr32_encrypt_blocks -#ifdef __thumb2__ -.thumb_func _vpaes_ctr32_encrypt_blocks -#endif -.align 4 -_vpaes_ctr32_encrypt_blocks: - mov ip, sp - stmdb sp!, {r7,r8,r9,r10,r11, lr} - @ This function uses q4-q7 (d8-d15), which are callee-saved. - vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15} - - cmp r2, #0 - @ r8 is passed on the stack. - ldr r8, [ip] - beq Lctr32_done - - @ _vpaes_encrypt_core expects the key in r2, so swap r2 and r3. - mov r9, r3 - mov r3, r2 - mov r2, r9 - - @ Load the IV and counter portion. - ldr r7, [r8, #12] - vld1.8 {q7}, [r8] - - bl _vpaes_preheat - rev r7, r7 @ The counter is big-endian. - -Lctr32_loop: - vmov q0, q7 - vld1.8 {q6}, [r0]! @ Load input ahead of time - bl _vpaes_encrypt_core - veor q0, q0, q6 @ XOR input and result - vst1.8 {q0}, [r1]! - subs r3, r3, #1 - @ Update the counter. 
- add r7, r7, #1 - rev r9, r7 - vmov.32 d15[1], r9 - bne Lctr32_loop - -Lctr32_done: - vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15} - ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return - -#endif // !OPENSSL_NO_ASM diff --git a/third_party/boringssl/apple-arm/crypto/test/trampoline-armv4.S b/third_party/boringssl/apple-arm/crypto/test/trampoline-armv4.S deleted file mode 100644 index 9d74f553..00000000 --- a/third_party/boringssl/apple-arm/crypto/test/trampoline-armv4.S +++ /dev/null @@ -1,376 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.syntax unified - - - - -.text - -@ abi_test_trampoline loads callee-saved registers from |state|, calls |func| -@ with |argv|, then saves the callee-saved registers into |state|. It returns -@ the result of |func|. The |unwind| argument is unused. -@ uint32_t abi_test_trampoline(void (*func)(...), CallerState *state, -@ const uint32_t *argv, size_t argc, -@ int unwind); - -.globl _abi_test_trampoline -.private_extern _abi_test_trampoline -.align 4 -_abi_test_trampoline: - @ Save parameters and all callee-saved registers. For convenience, we - @ save r9 on iOS even though it's volatile. - vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15} - stmdb sp!, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,lr} - - @ Reserve stack space for six (10-4) stack parameters, plus an extra 4 - @ bytes to keep it 8-byte-aligned (see AAPCS, section 5.3). - sub sp, sp, #28 - - @ Every register in AAPCS is either non-volatile or a parameter (except - @ r9 on iOS), so this code, by the actual call, loses all its scratch - @ registers. First fill in stack parameters while there are registers - @ to spare. 
- cmp r3, #4 - bls Lstack_args_done - mov r4, sp @ r4 is the output pointer. - add r5, r2, r3, lsl #2 @ Set r5 to the end of argv. - add r2, r2, #16 @ Skip four arguments. -Lstack_args_loop: - ldr r6, [r2], #4 - cmp r2, r5 - str r6, [r4], #4 - bne Lstack_args_loop - -Lstack_args_done: - @ Load registers from |r1|. - vldmia r1!, {d8,d9,d10,d11,d12,d13,d14,d15} -#if defined(__APPLE__) - @ r9 is not volatile on iOS. - ldmia r1!, {r4,r5,r6,r7,r8,r10-r11} -#else - ldmia r1!, {r4,r5,r6,r7,r8,r9,r10,r11} -#endif - - @ Load register parameters. This uses up our remaining registers, so we - @ repurpose lr as scratch space. - ldr r3, [sp, #40] @ Reload argc. - ldr lr, [sp, #36] @ Load argv into lr. - cmp r3, #3 - bhi Larg_r3 - beq Larg_r2 - cmp r3, #1 - bhi Larg_r1 - beq Larg_r0 - b Largs_done - -Larg_r3: - ldr r3, [lr, #12] @ argv[3] -Larg_r2: - ldr r2, [lr, #8] @ argv[2] -Larg_r1: - ldr r1, [lr, #4] @ argv[1] -Larg_r0: - ldr r0, [lr] @ argv[0] -Largs_done: - - @ With every other register in use, load the function pointer into lr - @ and call the function. - ldr lr, [sp, #28] - blx lr - - @ r1-r3 are free for use again. The trampoline only supports - @ single-return functions. Pass r4-r11 to the caller. - ldr r1, [sp, #32] - vstmia r1!, {d8,d9,d10,d11,d12,d13,d14,d15} -#if defined(__APPLE__) - @ r9 is not volatile on iOS. - stmia r1!, {r4,r5,r6,r7,r8,r10-r11} -#else - stmia r1!, {r4,r5,r6,r7,r8,r9,r10,r11} -#endif - - @ Unwind the stack and restore registers. - add sp, sp, #44 @ 44 = 28+16 - ldmia sp!, {r4,r5,r6,r7,r8,r9,r10,r11,lr} @ Skip r0-r3 (see +16 above). 
- vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15} - - bx lr - - -.globl _abi_test_clobber_r0 -.private_extern _abi_test_clobber_r0 -.align 4 -_abi_test_clobber_r0: - mov r0, #0 - bx lr - - -.globl _abi_test_clobber_r1 -.private_extern _abi_test_clobber_r1 -.align 4 -_abi_test_clobber_r1: - mov r1, #0 - bx lr - - -.globl _abi_test_clobber_r2 -.private_extern _abi_test_clobber_r2 -.align 4 -_abi_test_clobber_r2: - mov r2, #0 - bx lr - - -.globl _abi_test_clobber_r3 -.private_extern _abi_test_clobber_r3 -.align 4 -_abi_test_clobber_r3: - mov r3, #0 - bx lr - - -.globl _abi_test_clobber_r4 -.private_extern _abi_test_clobber_r4 -.align 4 -_abi_test_clobber_r4: - mov r4, #0 - bx lr - - -.globl _abi_test_clobber_r5 -.private_extern _abi_test_clobber_r5 -.align 4 -_abi_test_clobber_r5: - mov r5, #0 - bx lr - - -.globl _abi_test_clobber_r6 -.private_extern _abi_test_clobber_r6 -.align 4 -_abi_test_clobber_r6: - mov r6, #0 - bx lr - - -.globl _abi_test_clobber_r7 -.private_extern _abi_test_clobber_r7 -.align 4 -_abi_test_clobber_r7: - mov r7, #0 - bx lr - - -.globl _abi_test_clobber_r8 -.private_extern _abi_test_clobber_r8 -.align 4 -_abi_test_clobber_r8: - mov r8, #0 - bx lr - - -.globl _abi_test_clobber_r9 -.private_extern _abi_test_clobber_r9 -.align 4 -_abi_test_clobber_r9: - mov r9, #0 - bx lr - - -.globl _abi_test_clobber_r10 -.private_extern _abi_test_clobber_r10 -.align 4 -_abi_test_clobber_r10: - mov r10, #0 - bx lr - - -.globl _abi_test_clobber_r11 -.private_extern _abi_test_clobber_r11 -.align 4 -_abi_test_clobber_r11: - mov r11, #0 - bx lr - - -.globl _abi_test_clobber_r12 -.private_extern _abi_test_clobber_r12 -.align 4 -_abi_test_clobber_r12: - mov r12, #0 - bx lr - - -.globl _abi_test_clobber_d0 -.private_extern _abi_test_clobber_d0 -.align 4 -_abi_test_clobber_d0: - mov r0, #0 - vmov s0, r0 - vmov s1, r0 - bx lr - - -.globl _abi_test_clobber_d1 -.private_extern _abi_test_clobber_d1 -.align 4 -_abi_test_clobber_d1: - mov r0, #0 - vmov s2, r0 - vmov s3, r0 - bx lr 
- - -.globl _abi_test_clobber_d2 -.private_extern _abi_test_clobber_d2 -.align 4 -_abi_test_clobber_d2: - mov r0, #0 - vmov s4, r0 - vmov s5, r0 - bx lr - - -.globl _abi_test_clobber_d3 -.private_extern _abi_test_clobber_d3 -.align 4 -_abi_test_clobber_d3: - mov r0, #0 - vmov s6, r0 - vmov s7, r0 - bx lr - - -.globl _abi_test_clobber_d4 -.private_extern _abi_test_clobber_d4 -.align 4 -_abi_test_clobber_d4: - mov r0, #0 - vmov s8, r0 - vmov s9, r0 - bx lr - - -.globl _abi_test_clobber_d5 -.private_extern _abi_test_clobber_d5 -.align 4 -_abi_test_clobber_d5: - mov r0, #0 - vmov s10, r0 - vmov s11, r0 - bx lr - - -.globl _abi_test_clobber_d6 -.private_extern _abi_test_clobber_d6 -.align 4 -_abi_test_clobber_d6: - mov r0, #0 - vmov s12, r0 - vmov s13, r0 - bx lr - - -.globl _abi_test_clobber_d7 -.private_extern _abi_test_clobber_d7 -.align 4 -_abi_test_clobber_d7: - mov r0, #0 - vmov s14, r0 - vmov s15, r0 - bx lr - - -.globl _abi_test_clobber_d8 -.private_extern _abi_test_clobber_d8 -.align 4 -_abi_test_clobber_d8: - mov r0, #0 - vmov s16, r0 - vmov s17, r0 - bx lr - - -.globl _abi_test_clobber_d9 -.private_extern _abi_test_clobber_d9 -.align 4 -_abi_test_clobber_d9: - mov r0, #0 - vmov s18, r0 - vmov s19, r0 - bx lr - - -.globl _abi_test_clobber_d10 -.private_extern _abi_test_clobber_d10 -.align 4 -_abi_test_clobber_d10: - mov r0, #0 - vmov s20, r0 - vmov s21, r0 - bx lr - - -.globl _abi_test_clobber_d11 -.private_extern _abi_test_clobber_d11 -.align 4 -_abi_test_clobber_d11: - mov r0, #0 - vmov s22, r0 - vmov s23, r0 - bx lr - - -.globl _abi_test_clobber_d12 -.private_extern _abi_test_clobber_d12 -.align 4 -_abi_test_clobber_d12: - mov r0, #0 - vmov s24, r0 - vmov s25, r0 - bx lr - - -.globl _abi_test_clobber_d13 -.private_extern _abi_test_clobber_d13 -.align 4 -_abi_test_clobber_d13: - mov r0, #0 - vmov s26, r0 - vmov s27, r0 - bx lr - - -.globl _abi_test_clobber_d14 -.private_extern _abi_test_clobber_d14 -.align 4 -_abi_test_clobber_d14: - mov r0, #0 - vmov s28, 
r0 - vmov s29, r0 - bx lr - - -.globl _abi_test_clobber_d15 -.private_extern _abi_test_clobber_d15 -.align 4 -_abi_test_clobber_d15: - mov r0, #0 - vmov s30, r0 - vmov s31, r0 - bx lr - -#endif // !OPENSSL_NO_ASM diff --git a/third_party/boringssl/apple-x86/crypto/chacha/chacha-x86.S b/third_party/boringssl/apple-x86/crypto/chacha/chacha-x86.S deleted file mode 100644 index ef535b21..00000000 --- a/third_party/boringssl/apple-x86/crypto/chacha/chacha-x86.S +++ /dev/null @@ -1,974 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if defined(__i386__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text -.globl _ChaCha20_ctr32 -.private_extern _ChaCha20_ctr32 -.align 4 -_ChaCha20_ctr32: -L_ChaCha20_ctr32_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - xorl %eax,%eax - cmpl 28(%esp),%eax - je L000no_data - call Lpic_point -Lpic_point: - popl %eax - movl L_OPENSSL_ia32cap_P$non_lazy_ptr-Lpic_point(%eax),%ebp - testl $16777216,(%ebp) - jz L001x86 - testl $512,4(%ebp) - jz L001x86 - jmp Lssse3_shortcut -L001x86: - movl 32(%esp),%esi - movl 36(%esp),%edi - subl $132,%esp - movl (%esi),%eax - movl 4(%esi),%ebx - movl 8(%esi),%ecx - movl 12(%esi),%edx - movl %eax,80(%esp) - movl %ebx,84(%esp) - movl %ecx,88(%esp) - movl %edx,92(%esp) - movl 16(%esi),%eax - movl 20(%esi),%ebx - movl 24(%esi),%ecx - movl 28(%esi),%edx - movl %eax,96(%esp) - movl %ebx,100(%esp) - movl %ecx,104(%esp) - movl %edx,108(%esp) - movl (%edi),%eax - movl 4(%edi),%ebx - movl 8(%edi),%ecx - movl 12(%edi),%edx - subl $1,%eax - movl %eax,112(%esp) - movl %ebx,116(%esp) - movl %ecx,120(%esp) - movl %edx,124(%esp) - jmp L002entry -.align 4,0x90 -L003outer_loop: - movl %ebx,156(%esp) - movl %eax,152(%esp) - movl %ecx,160(%esp) -L002entry: - movl $1634760805,%eax - movl $857760878,4(%esp) - movl $2036477234,8(%esp) - movl $1797285236,12(%esp) - movl 84(%esp),%ebx - movl 88(%esp),%ebp - movl 104(%esp),%ecx - movl 
108(%esp),%esi - movl 116(%esp),%edx - movl 120(%esp),%edi - movl %ebx,20(%esp) - movl %ebp,24(%esp) - movl %ecx,40(%esp) - movl %esi,44(%esp) - movl %edx,52(%esp) - movl %edi,56(%esp) - movl 92(%esp),%ebx - movl 124(%esp),%edi - movl 112(%esp),%edx - movl 80(%esp),%ebp - movl 96(%esp),%ecx - movl 100(%esp),%esi - addl $1,%edx - movl %ebx,28(%esp) - movl %edi,60(%esp) - movl %edx,112(%esp) - movl $10,%ebx - jmp L004loop -.align 4,0x90 -L004loop: - addl %ebp,%eax - movl %ebx,128(%esp) - movl %ebp,%ebx - xorl %eax,%edx - roll $16,%edx - addl %edx,%ecx - xorl %ecx,%ebx - movl 52(%esp),%edi - roll $12,%ebx - movl 20(%esp),%ebp - addl %ebx,%eax - xorl %eax,%edx - movl %eax,(%esp) - roll $8,%edx - movl 4(%esp),%eax - addl %edx,%ecx - movl %edx,48(%esp) - xorl %ecx,%ebx - addl %ebp,%eax - roll $7,%ebx - xorl %eax,%edi - movl %ecx,32(%esp) - roll $16,%edi - movl %ebx,16(%esp) - addl %edi,%esi - movl 40(%esp),%ecx - xorl %esi,%ebp - movl 56(%esp),%edx - roll $12,%ebp - movl 24(%esp),%ebx - addl %ebp,%eax - xorl %eax,%edi - movl %eax,4(%esp) - roll $8,%edi - movl 8(%esp),%eax - addl %edi,%esi - movl %edi,52(%esp) - xorl %esi,%ebp - addl %ebx,%eax - roll $7,%ebp - xorl %eax,%edx - movl %esi,36(%esp) - roll $16,%edx - movl %ebp,20(%esp) - addl %edx,%ecx - movl 44(%esp),%esi - xorl %ecx,%ebx - movl 60(%esp),%edi - roll $12,%ebx - movl 28(%esp),%ebp - addl %ebx,%eax - xorl %eax,%edx - movl %eax,8(%esp) - roll $8,%edx - movl 12(%esp),%eax - addl %edx,%ecx - movl %edx,56(%esp) - xorl %ecx,%ebx - addl %ebp,%eax - roll $7,%ebx - xorl %eax,%edi - roll $16,%edi - movl %ebx,24(%esp) - addl %edi,%esi - xorl %esi,%ebp - roll $12,%ebp - movl 20(%esp),%ebx - addl %ebp,%eax - xorl %eax,%edi - movl %eax,12(%esp) - roll $8,%edi - movl (%esp),%eax - addl %edi,%esi - movl %edi,%edx - xorl %esi,%ebp - addl %ebx,%eax - roll $7,%ebp - xorl %eax,%edx - roll $16,%edx - movl %ebp,28(%esp) - addl %edx,%ecx - xorl %ecx,%ebx - movl 48(%esp),%edi - roll $12,%ebx - movl 24(%esp),%ebp - addl %ebx,%eax - 
xorl %eax,%edx - movl %eax,(%esp) - roll $8,%edx - movl 4(%esp),%eax - addl %edx,%ecx - movl %edx,60(%esp) - xorl %ecx,%ebx - addl %ebp,%eax - roll $7,%ebx - xorl %eax,%edi - movl %ecx,40(%esp) - roll $16,%edi - movl %ebx,20(%esp) - addl %edi,%esi - movl 32(%esp),%ecx - xorl %esi,%ebp - movl 52(%esp),%edx - roll $12,%ebp - movl 28(%esp),%ebx - addl %ebp,%eax - xorl %eax,%edi - movl %eax,4(%esp) - roll $8,%edi - movl 8(%esp),%eax - addl %edi,%esi - movl %edi,48(%esp) - xorl %esi,%ebp - addl %ebx,%eax - roll $7,%ebp - xorl %eax,%edx - movl %esi,44(%esp) - roll $16,%edx - movl %ebp,24(%esp) - addl %edx,%ecx - movl 36(%esp),%esi - xorl %ecx,%ebx - movl 56(%esp),%edi - roll $12,%ebx - movl 16(%esp),%ebp - addl %ebx,%eax - xorl %eax,%edx - movl %eax,8(%esp) - roll $8,%edx - movl 12(%esp),%eax - addl %edx,%ecx - movl %edx,52(%esp) - xorl %ecx,%ebx - addl %ebp,%eax - roll $7,%ebx - xorl %eax,%edi - roll $16,%edi - movl %ebx,28(%esp) - addl %edi,%esi - xorl %esi,%ebp - movl 48(%esp),%edx - roll $12,%ebp - movl 128(%esp),%ebx - addl %ebp,%eax - xorl %eax,%edi - movl %eax,12(%esp) - roll $8,%edi - movl (%esp),%eax - addl %edi,%esi - movl %edi,56(%esp) - xorl %esi,%ebp - roll $7,%ebp - decl %ebx - jnz L004loop - movl 160(%esp),%ebx - addl $1634760805,%eax - addl 80(%esp),%ebp - addl 96(%esp),%ecx - addl 100(%esp),%esi - cmpl $64,%ebx - jb L005tail - movl 156(%esp),%ebx - addl 112(%esp),%edx - addl 120(%esp),%edi - xorl (%ebx),%eax - xorl 16(%ebx),%ebp - movl %eax,(%esp) - movl 152(%esp),%eax - xorl 32(%ebx),%ecx - xorl 36(%ebx),%esi - xorl 48(%ebx),%edx - xorl 56(%ebx),%edi - movl %ebp,16(%eax) - movl %ecx,32(%eax) - movl %esi,36(%eax) - movl %edx,48(%eax) - movl %edi,56(%eax) - movl 4(%esp),%ebp - movl 8(%esp),%ecx - movl 12(%esp),%esi - movl 20(%esp),%edx - movl 24(%esp),%edi - addl $857760878,%ebp - addl $2036477234,%ecx - addl $1797285236,%esi - addl 84(%esp),%edx - addl 88(%esp),%edi - xorl 4(%ebx),%ebp - xorl 8(%ebx),%ecx - xorl 12(%ebx),%esi - xorl 20(%ebx),%edx - xorl 
24(%ebx),%edi - movl %ebp,4(%eax) - movl %ecx,8(%eax) - movl %esi,12(%eax) - movl %edx,20(%eax) - movl %edi,24(%eax) - movl 28(%esp),%ebp - movl 40(%esp),%ecx - movl 44(%esp),%esi - movl 52(%esp),%edx - movl 60(%esp),%edi - addl 92(%esp),%ebp - addl 104(%esp),%ecx - addl 108(%esp),%esi - addl 116(%esp),%edx - addl 124(%esp),%edi - xorl 28(%ebx),%ebp - xorl 40(%ebx),%ecx - xorl 44(%ebx),%esi - xorl 52(%ebx),%edx - xorl 60(%ebx),%edi - leal 64(%ebx),%ebx - movl %ebp,28(%eax) - movl (%esp),%ebp - movl %ecx,40(%eax) - movl 160(%esp),%ecx - movl %esi,44(%eax) - movl %edx,52(%eax) - movl %edi,60(%eax) - movl %ebp,(%eax) - leal 64(%eax),%eax - subl $64,%ecx - jnz L003outer_loop - jmp L006done -L005tail: - addl 112(%esp),%edx - addl 120(%esp),%edi - movl %eax,(%esp) - movl %ebp,16(%esp) - movl %ecx,32(%esp) - movl %esi,36(%esp) - movl %edx,48(%esp) - movl %edi,56(%esp) - movl 4(%esp),%ebp - movl 8(%esp),%ecx - movl 12(%esp),%esi - movl 20(%esp),%edx - movl 24(%esp),%edi - addl $857760878,%ebp - addl $2036477234,%ecx - addl $1797285236,%esi - addl 84(%esp),%edx - addl 88(%esp),%edi - movl %ebp,4(%esp) - movl %ecx,8(%esp) - movl %esi,12(%esp) - movl %edx,20(%esp) - movl %edi,24(%esp) - movl 28(%esp),%ebp - movl 40(%esp),%ecx - movl 44(%esp),%esi - movl 52(%esp),%edx - movl 60(%esp),%edi - addl 92(%esp),%ebp - addl 104(%esp),%ecx - addl 108(%esp),%esi - addl 116(%esp),%edx - addl 124(%esp),%edi - movl %ebp,28(%esp) - movl 156(%esp),%ebp - movl %ecx,40(%esp) - movl 152(%esp),%ecx - movl %esi,44(%esp) - xorl %esi,%esi - movl %edx,52(%esp) - movl %edi,60(%esp) - xorl %eax,%eax - xorl %edx,%edx -L007tail_loop: - movb (%esi,%ebp,1),%al - movb (%esp,%esi,1),%dl - leal 1(%esi),%esi - xorb %dl,%al - movb %al,-1(%ecx,%esi,1) - decl %ebx - jnz L007tail_loop -L006done: - addl $132,%esp -L000no_data: - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.globl _ChaCha20_ssse3 -.private_extern _ChaCha20_ssse3 -.align 4 -_ChaCha20_ssse3: -L_ChaCha20_ssse3_begin: - pushl %ebp - pushl %ebx 
- pushl %esi - pushl %edi -Lssse3_shortcut: - movl 20(%esp),%edi - movl 24(%esp),%esi - movl 28(%esp),%ecx - movl 32(%esp),%edx - movl 36(%esp),%ebx - movl %esp,%ebp - subl $524,%esp - andl $-64,%esp - movl %ebp,512(%esp) - leal Lssse3_data-Lpic_point(%eax),%eax - movdqu (%ebx),%xmm3 - cmpl $256,%ecx - jb L0081x - movl %edx,516(%esp) - movl %ebx,520(%esp) - subl $256,%ecx - leal 384(%esp),%ebp - movdqu (%edx),%xmm7 - pshufd $0,%xmm3,%xmm0 - pshufd $85,%xmm3,%xmm1 - pshufd $170,%xmm3,%xmm2 - pshufd $255,%xmm3,%xmm3 - paddd 48(%eax),%xmm0 - pshufd $0,%xmm7,%xmm4 - pshufd $85,%xmm7,%xmm5 - psubd 64(%eax),%xmm0 - pshufd $170,%xmm7,%xmm6 - pshufd $255,%xmm7,%xmm7 - movdqa %xmm0,64(%ebp) - movdqa %xmm1,80(%ebp) - movdqa %xmm2,96(%ebp) - movdqa %xmm3,112(%ebp) - movdqu 16(%edx),%xmm3 - movdqa %xmm4,-64(%ebp) - movdqa %xmm5,-48(%ebp) - movdqa %xmm6,-32(%ebp) - movdqa %xmm7,-16(%ebp) - movdqa 32(%eax),%xmm7 - leal 128(%esp),%ebx - pshufd $0,%xmm3,%xmm0 - pshufd $85,%xmm3,%xmm1 - pshufd $170,%xmm3,%xmm2 - pshufd $255,%xmm3,%xmm3 - pshufd $0,%xmm7,%xmm4 - pshufd $85,%xmm7,%xmm5 - pshufd $170,%xmm7,%xmm6 - pshufd $255,%xmm7,%xmm7 - movdqa %xmm0,(%ebp) - movdqa %xmm1,16(%ebp) - movdqa %xmm2,32(%ebp) - movdqa %xmm3,48(%ebp) - movdqa %xmm4,-128(%ebp) - movdqa %xmm5,-112(%ebp) - movdqa %xmm6,-96(%ebp) - movdqa %xmm7,-80(%ebp) - leal 128(%esi),%esi - leal 128(%edi),%edi - jmp L009outer_loop -.align 4,0x90 -L009outer_loop: - movdqa -112(%ebp),%xmm1 - movdqa -96(%ebp),%xmm2 - movdqa -80(%ebp),%xmm3 - movdqa -48(%ebp),%xmm5 - movdqa -32(%ebp),%xmm6 - movdqa -16(%ebp),%xmm7 - movdqa %xmm1,-112(%ebx) - movdqa %xmm2,-96(%ebx) - movdqa %xmm3,-80(%ebx) - movdqa %xmm5,-48(%ebx) - movdqa %xmm6,-32(%ebx) - movdqa %xmm7,-16(%ebx) - movdqa 32(%ebp),%xmm2 - movdqa 48(%ebp),%xmm3 - movdqa 64(%ebp),%xmm4 - movdqa 80(%ebp),%xmm5 - movdqa 96(%ebp),%xmm6 - movdqa 112(%ebp),%xmm7 - paddd 64(%eax),%xmm4 - movdqa %xmm2,32(%ebx) - movdqa %xmm3,48(%ebx) - movdqa %xmm4,64(%ebx) - movdqa %xmm5,80(%ebx) - 
movdqa %xmm6,96(%ebx) - movdqa %xmm7,112(%ebx) - movdqa %xmm4,64(%ebp) - movdqa -128(%ebp),%xmm0 - movdqa %xmm4,%xmm6 - movdqa -64(%ebp),%xmm3 - movdqa (%ebp),%xmm4 - movdqa 16(%ebp),%xmm5 - movl $10,%edx - nop -.align 4,0x90 -L010loop: - paddd %xmm3,%xmm0 - movdqa %xmm3,%xmm2 - pxor %xmm0,%xmm6 - pshufb (%eax),%xmm6 - paddd %xmm6,%xmm4 - pxor %xmm4,%xmm2 - movdqa -48(%ebx),%xmm3 - movdqa %xmm2,%xmm1 - pslld $12,%xmm2 - psrld $20,%xmm1 - por %xmm1,%xmm2 - movdqa -112(%ebx),%xmm1 - paddd %xmm2,%xmm0 - movdqa 80(%ebx),%xmm7 - pxor %xmm0,%xmm6 - movdqa %xmm0,-128(%ebx) - pshufb 16(%eax),%xmm6 - paddd %xmm6,%xmm4 - movdqa %xmm6,64(%ebx) - pxor %xmm4,%xmm2 - paddd %xmm3,%xmm1 - movdqa %xmm2,%xmm0 - pslld $7,%xmm2 - psrld $25,%xmm0 - pxor %xmm1,%xmm7 - por %xmm0,%xmm2 - movdqa %xmm4,(%ebx) - pshufb (%eax),%xmm7 - movdqa %xmm2,-64(%ebx) - paddd %xmm7,%xmm5 - movdqa 32(%ebx),%xmm4 - pxor %xmm5,%xmm3 - movdqa -32(%ebx),%xmm2 - movdqa %xmm3,%xmm0 - pslld $12,%xmm3 - psrld $20,%xmm0 - por %xmm0,%xmm3 - movdqa -96(%ebx),%xmm0 - paddd %xmm3,%xmm1 - movdqa 96(%ebx),%xmm6 - pxor %xmm1,%xmm7 - movdqa %xmm1,-112(%ebx) - pshufb 16(%eax),%xmm7 - paddd %xmm7,%xmm5 - movdqa %xmm7,80(%ebx) - pxor %xmm5,%xmm3 - paddd %xmm2,%xmm0 - movdqa %xmm3,%xmm1 - pslld $7,%xmm3 - psrld $25,%xmm1 - pxor %xmm0,%xmm6 - por %xmm1,%xmm3 - movdqa %xmm5,16(%ebx) - pshufb (%eax),%xmm6 - movdqa %xmm3,-48(%ebx) - paddd %xmm6,%xmm4 - movdqa 48(%ebx),%xmm5 - pxor %xmm4,%xmm2 - movdqa -16(%ebx),%xmm3 - movdqa %xmm2,%xmm1 - pslld $12,%xmm2 - psrld $20,%xmm1 - por %xmm1,%xmm2 - movdqa -80(%ebx),%xmm1 - paddd %xmm2,%xmm0 - movdqa 112(%ebx),%xmm7 - pxor %xmm0,%xmm6 - movdqa %xmm0,-96(%ebx) - pshufb 16(%eax),%xmm6 - paddd %xmm6,%xmm4 - movdqa %xmm6,96(%ebx) - pxor %xmm4,%xmm2 - paddd %xmm3,%xmm1 - movdqa %xmm2,%xmm0 - pslld $7,%xmm2 - psrld $25,%xmm0 - pxor %xmm1,%xmm7 - por %xmm0,%xmm2 - pshufb (%eax),%xmm7 - movdqa %xmm2,-32(%ebx) - paddd %xmm7,%xmm5 - pxor %xmm5,%xmm3 - movdqa -48(%ebx),%xmm2 - movdqa %xmm3,%xmm0 
- pslld $12,%xmm3 - psrld $20,%xmm0 - por %xmm0,%xmm3 - movdqa -128(%ebx),%xmm0 - paddd %xmm3,%xmm1 - pxor %xmm1,%xmm7 - movdqa %xmm1,-80(%ebx) - pshufb 16(%eax),%xmm7 - paddd %xmm7,%xmm5 - movdqa %xmm7,%xmm6 - pxor %xmm5,%xmm3 - paddd %xmm2,%xmm0 - movdqa %xmm3,%xmm1 - pslld $7,%xmm3 - psrld $25,%xmm1 - pxor %xmm0,%xmm6 - por %xmm1,%xmm3 - pshufb (%eax),%xmm6 - movdqa %xmm3,-16(%ebx) - paddd %xmm6,%xmm4 - pxor %xmm4,%xmm2 - movdqa -32(%ebx),%xmm3 - movdqa %xmm2,%xmm1 - pslld $12,%xmm2 - psrld $20,%xmm1 - por %xmm1,%xmm2 - movdqa -112(%ebx),%xmm1 - paddd %xmm2,%xmm0 - movdqa 64(%ebx),%xmm7 - pxor %xmm0,%xmm6 - movdqa %xmm0,-128(%ebx) - pshufb 16(%eax),%xmm6 - paddd %xmm6,%xmm4 - movdqa %xmm6,112(%ebx) - pxor %xmm4,%xmm2 - paddd %xmm3,%xmm1 - movdqa %xmm2,%xmm0 - pslld $7,%xmm2 - psrld $25,%xmm0 - pxor %xmm1,%xmm7 - por %xmm0,%xmm2 - movdqa %xmm4,32(%ebx) - pshufb (%eax),%xmm7 - movdqa %xmm2,-48(%ebx) - paddd %xmm7,%xmm5 - movdqa (%ebx),%xmm4 - pxor %xmm5,%xmm3 - movdqa -16(%ebx),%xmm2 - movdqa %xmm3,%xmm0 - pslld $12,%xmm3 - psrld $20,%xmm0 - por %xmm0,%xmm3 - movdqa -96(%ebx),%xmm0 - paddd %xmm3,%xmm1 - movdqa 80(%ebx),%xmm6 - pxor %xmm1,%xmm7 - movdqa %xmm1,-112(%ebx) - pshufb 16(%eax),%xmm7 - paddd %xmm7,%xmm5 - movdqa %xmm7,64(%ebx) - pxor %xmm5,%xmm3 - paddd %xmm2,%xmm0 - movdqa %xmm3,%xmm1 - pslld $7,%xmm3 - psrld $25,%xmm1 - pxor %xmm0,%xmm6 - por %xmm1,%xmm3 - movdqa %xmm5,48(%ebx) - pshufb (%eax),%xmm6 - movdqa %xmm3,-32(%ebx) - paddd %xmm6,%xmm4 - movdqa 16(%ebx),%xmm5 - pxor %xmm4,%xmm2 - movdqa -64(%ebx),%xmm3 - movdqa %xmm2,%xmm1 - pslld $12,%xmm2 - psrld $20,%xmm1 - por %xmm1,%xmm2 - movdqa -80(%ebx),%xmm1 - paddd %xmm2,%xmm0 - movdqa 96(%ebx),%xmm7 - pxor %xmm0,%xmm6 - movdqa %xmm0,-96(%ebx) - pshufb 16(%eax),%xmm6 - paddd %xmm6,%xmm4 - movdqa %xmm6,80(%ebx) - pxor %xmm4,%xmm2 - paddd %xmm3,%xmm1 - movdqa %xmm2,%xmm0 - pslld $7,%xmm2 - psrld $25,%xmm0 - pxor %xmm1,%xmm7 - por %xmm0,%xmm2 - pshufb (%eax),%xmm7 - movdqa %xmm2,-16(%ebx) - paddd 
%xmm7,%xmm5 - pxor %xmm5,%xmm3 - movdqa %xmm3,%xmm0 - pslld $12,%xmm3 - psrld $20,%xmm0 - por %xmm0,%xmm3 - movdqa -128(%ebx),%xmm0 - paddd %xmm3,%xmm1 - movdqa 64(%ebx),%xmm6 - pxor %xmm1,%xmm7 - movdqa %xmm1,-80(%ebx) - pshufb 16(%eax),%xmm7 - paddd %xmm7,%xmm5 - movdqa %xmm7,96(%ebx) - pxor %xmm5,%xmm3 - movdqa %xmm3,%xmm1 - pslld $7,%xmm3 - psrld $25,%xmm1 - por %xmm1,%xmm3 - decl %edx - jnz L010loop - movdqa %xmm3,-64(%ebx) - movdqa %xmm4,(%ebx) - movdqa %xmm5,16(%ebx) - movdqa %xmm6,64(%ebx) - movdqa %xmm7,96(%ebx) - movdqa -112(%ebx),%xmm1 - movdqa -96(%ebx),%xmm2 - movdqa -80(%ebx),%xmm3 - paddd -128(%ebp),%xmm0 - paddd -112(%ebp),%xmm1 - paddd -96(%ebp),%xmm2 - paddd -80(%ebp),%xmm3 - movdqa %xmm0,%xmm6 - punpckldq %xmm1,%xmm0 - movdqa %xmm2,%xmm7 - punpckldq %xmm3,%xmm2 - punpckhdq %xmm1,%xmm6 - punpckhdq %xmm3,%xmm7 - movdqa %xmm0,%xmm1 - punpcklqdq %xmm2,%xmm0 - movdqa %xmm6,%xmm3 - punpcklqdq %xmm7,%xmm6 - punpckhqdq %xmm2,%xmm1 - punpckhqdq %xmm7,%xmm3 - movdqu -128(%esi),%xmm4 - movdqu -64(%esi),%xmm5 - movdqu (%esi),%xmm2 - movdqu 64(%esi),%xmm7 - leal 16(%esi),%esi - pxor %xmm0,%xmm4 - movdqa -64(%ebx),%xmm0 - pxor %xmm1,%xmm5 - movdqa -48(%ebx),%xmm1 - pxor %xmm2,%xmm6 - movdqa -32(%ebx),%xmm2 - pxor %xmm3,%xmm7 - movdqa -16(%ebx),%xmm3 - movdqu %xmm4,-128(%edi) - movdqu %xmm5,-64(%edi) - movdqu %xmm6,(%edi) - movdqu %xmm7,64(%edi) - leal 16(%edi),%edi - paddd -64(%ebp),%xmm0 - paddd -48(%ebp),%xmm1 - paddd -32(%ebp),%xmm2 - paddd -16(%ebp),%xmm3 - movdqa %xmm0,%xmm6 - punpckldq %xmm1,%xmm0 - movdqa %xmm2,%xmm7 - punpckldq %xmm3,%xmm2 - punpckhdq %xmm1,%xmm6 - punpckhdq %xmm3,%xmm7 - movdqa %xmm0,%xmm1 - punpcklqdq %xmm2,%xmm0 - movdqa %xmm6,%xmm3 - punpcklqdq %xmm7,%xmm6 - punpckhqdq %xmm2,%xmm1 - punpckhqdq %xmm7,%xmm3 - movdqu -128(%esi),%xmm4 - movdqu -64(%esi),%xmm5 - movdqu (%esi),%xmm2 - movdqu 64(%esi),%xmm7 - leal 16(%esi),%esi - pxor %xmm0,%xmm4 - movdqa (%ebx),%xmm0 - pxor %xmm1,%xmm5 - movdqa 16(%ebx),%xmm1 - pxor %xmm2,%xmm6 - movdqa 
32(%ebx),%xmm2 - pxor %xmm3,%xmm7 - movdqa 48(%ebx),%xmm3 - movdqu %xmm4,-128(%edi) - movdqu %xmm5,-64(%edi) - movdqu %xmm6,(%edi) - movdqu %xmm7,64(%edi) - leal 16(%edi),%edi - paddd (%ebp),%xmm0 - paddd 16(%ebp),%xmm1 - paddd 32(%ebp),%xmm2 - paddd 48(%ebp),%xmm3 - movdqa %xmm0,%xmm6 - punpckldq %xmm1,%xmm0 - movdqa %xmm2,%xmm7 - punpckldq %xmm3,%xmm2 - punpckhdq %xmm1,%xmm6 - punpckhdq %xmm3,%xmm7 - movdqa %xmm0,%xmm1 - punpcklqdq %xmm2,%xmm0 - movdqa %xmm6,%xmm3 - punpcklqdq %xmm7,%xmm6 - punpckhqdq %xmm2,%xmm1 - punpckhqdq %xmm7,%xmm3 - movdqu -128(%esi),%xmm4 - movdqu -64(%esi),%xmm5 - movdqu (%esi),%xmm2 - movdqu 64(%esi),%xmm7 - leal 16(%esi),%esi - pxor %xmm0,%xmm4 - movdqa 64(%ebx),%xmm0 - pxor %xmm1,%xmm5 - movdqa 80(%ebx),%xmm1 - pxor %xmm2,%xmm6 - movdqa 96(%ebx),%xmm2 - pxor %xmm3,%xmm7 - movdqa 112(%ebx),%xmm3 - movdqu %xmm4,-128(%edi) - movdqu %xmm5,-64(%edi) - movdqu %xmm6,(%edi) - movdqu %xmm7,64(%edi) - leal 16(%edi),%edi - paddd 64(%ebp),%xmm0 - paddd 80(%ebp),%xmm1 - paddd 96(%ebp),%xmm2 - paddd 112(%ebp),%xmm3 - movdqa %xmm0,%xmm6 - punpckldq %xmm1,%xmm0 - movdqa %xmm2,%xmm7 - punpckldq %xmm3,%xmm2 - punpckhdq %xmm1,%xmm6 - punpckhdq %xmm3,%xmm7 - movdqa %xmm0,%xmm1 - punpcklqdq %xmm2,%xmm0 - movdqa %xmm6,%xmm3 - punpcklqdq %xmm7,%xmm6 - punpckhqdq %xmm2,%xmm1 - punpckhqdq %xmm7,%xmm3 - movdqu -128(%esi),%xmm4 - movdqu -64(%esi),%xmm5 - movdqu (%esi),%xmm2 - movdqu 64(%esi),%xmm7 - leal 208(%esi),%esi - pxor %xmm0,%xmm4 - pxor %xmm1,%xmm5 - pxor %xmm2,%xmm6 - pxor %xmm3,%xmm7 - movdqu %xmm4,-128(%edi) - movdqu %xmm5,-64(%edi) - movdqu %xmm6,(%edi) - movdqu %xmm7,64(%edi) - leal 208(%edi),%edi - subl $256,%ecx - jnc L009outer_loop - addl $256,%ecx - jz L011done - movl 520(%esp),%ebx - leal -128(%esi),%esi - movl 516(%esp),%edx - leal -128(%edi),%edi - movd 64(%ebp),%xmm2 - movdqu (%ebx),%xmm3 - paddd 96(%eax),%xmm2 - pand 112(%eax),%xmm3 - por %xmm2,%xmm3 -L0081x: - movdqa 32(%eax),%xmm0 - movdqu (%edx),%xmm1 - movdqu 16(%edx),%xmm2 - movdqa 
(%eax),%xmm6 - movdqa 16(%eax),%xmm7 - movl %ebp,48(%esp) - movdqa %xmm0,(%esp) - movdqa %xmm1,16(%esp) - movdqa %xmm2,32(%esp) - movdqa %xmm3,48(%esp) - movl $10,%edx - jmp L012loop1x -.align 4,0x90 -L013outer1x: - movdqa 80(%eax),%xmm3 - movdqa (%esp),%xmm0 - movdqa 16(%esp),%xmm1 - movdqa 32(%esp),%xmm2 - paddd 48(%esp),%xmm3 - movl $10,%edx - movdqa %xmm3,48(%esp) - jmp L012loop1x -.align 4,0x90 -L012loop1x: - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 -.byte 102,15,56,0,222 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm4 - psrld $20,%xmm1 - pslld $12,%xmm4 - por %xmm4,%xmm1 - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 -.byte 102,15,56,0,223 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm4 - psrld $25,%xmm1 - pslld $7,%xmm4 - por %xmm4,%xmm1 - pshufd $78,%xmm2,%xmm2 - pshufd $57,%xmm1,%xmm1 - pshufd $147,%xmm3,%xmm3 - nop - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 -.byte 102,15,56,0,222 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm4 - psrld $20,%xmm1 - pslld $12,%xmm4 - por %xmm4,%xmm1 - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 -.byte 102,15,56,0,223 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm4 - psrld $25,%xmm1 - pslld $7,%xmm4 - por %xmm4,%xmm1 - pshufd $78,%xmm2,%xmm2 - pshufd $147,%xmm1,%xmm1 - pshufd $57,%xmm3,%xmm3 - decl %edx - jnz L012loop1x - paddd (%esp),%xmm0 - paddd 16(%esp),%xmm1 - paddd 32(%esp),%xmm2 - paddd 48(%esp),%xmm3 - cmpl $64,%ecx - jb L014tail - movdqu (%esi),%xmm4 - movdqu 16(%esi),%xmm5 - pxor %xmm4,%xmm0 - movdqu 32(%esi),%xmm4 - pxor %xmm5,%xmm1 - movdqu 48(%esi),%xmm5 - pxor %xmm4,%xmm2 - pxor %xmm5,%xmm3 - leal 64(%esi),%esi - movdqu %xmm0,(%edi) - movdqu %xmm1,16(%edi) - movdqu %xmm2,32(%edi) - movdqu %xmm3,48(%edi) - leal 64(%edi),%edi - subl $64,%ecx - jnz L013outer1x - jmp L011done -L014tail: - movdqa %xmm0,(%esp) - movdqa %xmm1,16(%esp) - movdqa %xmm2,32(%esp) - movdqa %xmm3,48(%esp) - xorl %eax,%eax - xorl %edx,%edx - xorl %ebp,%ebp -L015tail_loop: - movb (%esp,%ebp,1),%al - movb 
(%esi,%ebp,1),%dl - leal 1(%ebp),%ebp - xorb %dl,%al - movb %al,-1(%edi,%ebp,1) - decl %ecx - jnz L015tail_loop -L011done: - movl 512(%esp),%esp - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.align 6,0x90 -Lssse3_data: -.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 -.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 -.long 1634760805,857760878,2036477234,1797285236 -.long 0,1,2,3 -.long 4,4,4,4 -.long 1,0,0,0 -.long 4,0,0,0 -.long 0,-1,-1,-1 -.align 6,0x90 -.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54 -.byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32 -.byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 -.byte 114,103,62,0 -.section __IMPORT,__pointers,non_lazy_symbol_pointers -L_OPENSSL_ia32cap_P$non_lazy_ptr: -.indirect_symbol _OPENSSL_ia32cap_P -.long 0 -#endif diff --git a/third_party/boringssl/apple-x86/crypto/fipsmodule/aesni-x86.S b/third_party/boringssl/apple-x86/crypto/fipsmodule/aesni-x86.S deleted file mode 100644 index 00f6003d..00000000 --- a/third_party/boringssl/apple-x86/crypto/fipsmodule/aesni-x86.S +++ /dev/null @@ -1,2476 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#if defined(__i386__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text -#ifdef BORINGSSL_DISPATCH_TEST -#endif -.globl _aes_hw_encrypt -.private_extern _aes_hw_encrypt -.align 4 -_aes_hw_encrypt: -L_aes_hw_encrypt_begin: -#ifdef BORINGSSL_DISPATCH_TEST - pushl %ebx - pushl %edx - call L000pic -L000pic: - popl %ebx - leal _BORINGSSL_function_hit+1-L000pic(%ebx),%ebx - movl $1,%edx - movb %dl,(%ebx) - popl %edx - popl %ebx -#endif - movl 4(%esp),%eax - movl 12(%esp),%edx - movups (%eax),%xmm2 - movl 240(%edx),%ecx - movl 8(%esp),%eax - movups (%edx),%xmm0 - movups 16(%edx),%xmm1 - leal 32(%edx),%edx - xorps %xmm0,%xmm2 -L001enc1_loop_1: -.byte 102,15,56,220,209 - decl %ecx - movups (%edx),%xmm1 - leal 16(%edx),%edx - jnz L001enc1_loop_1 -.byte 102,15,56,221,209 - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - movups %xmm2,(%eax) - pxor %xmm2,%xmm2 - ret -.globl _aes_hw_decrypt -.private_extern _aes_hw_decrypt -.align 4 -_aes_hw_decrypt: -L_aes_hw_decrypt_begin: - movl 4(%esp),%eax - movl 12(%esp),%edx - movups (%eax),%xmm2 - movl 240(%edx),%ecx - movl 8(%esp),%eax - movups (%edx),%xmm0 - movups 16(%edx),%xmm1 - leal 32(%edx),%edx - xorps %xmm0,%xmm2 -L002dec1_loop_2: -.byte 102,15,56,222,209 - decl %ecx - movups (%edx),%xmm1 - leal 16(%edx),%edx - jnz L002dec1_loop_2 -.byte 102,15,56,223,209 - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - movups %xmm2,(%eax) - pxor %xmm2,%xmm2 - ret -.private_extern __aesni_encrypt2 -.align 4 -__aesni_encrypt2: - movups (%edx),%xmm0 - shll $4,%ecx - movups 16(%edx),%xmm1 - xorps %xmm0,%xmm2 - pxor %xmm0,%xmm3 - movups 32(%edx),%xmm0 - leal 32(%edx,%ecx,1),%edx - negl %ecx - addl $16,%ecx -L003enc2_loop: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 - movups (%edx,%ecx,1),%xmm1 - addl $32,%ecx -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 - movups -16(%edx,%ecx,1),%xmm0 - jnz L003enc2_loop -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,221,208 -.byte 102,15,56,221,216 - ret -.private_extern __aesni_decrypt2 
-.align 4 -__aesni_decrypt2: - movups (%edx),%xmm0 - shll $4,%ecx - movups 16(%edx),%xmm1 - xorps %xmm0,%xmm2 - pxor %xmm0,%xmm3 - movups 32(%edx),%xmm0 - leal 32(%edx,%ecx,1),%edx - negl %ecx - addl $16,%ecx -L004dec2_loop: -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 - movups (%edx,%ecx,1),%xmm1 - addl $32,%ecx -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 - movups -16(%edx,%ecx,1),%xmm0 - jnz L004dec2_loop -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,223,208 -.byte 102,15,56,223,216 - ret -.private_extern __aesni_encrypt3 -.align 4 -__aesni_encrypt3: - movups (%edx),%xmm0 - shll $4,%ecx - movups 16(%edx),%xmm1 - xorps %xmm0,%xmm2 - pxor %xmm0,%xmm3 - pxor %xmm0,%xmm4 - movups 32(%edx),%xmm0 - leal 32(%edx,%ecx,1),%edx - negl %ecx - addl $16,%ecx -L005enc3_loop: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 - movups (%edx,%ecx,1),%xmm1 - addl $32,%ecx -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 - movups -16(%edx,%ecx,1),%xmm0 - jnz L005enc3_loop -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,221,208 -.byte 102,15,56,221,216 -.byte 102,15,56,221,224 - ret -.private_extern __aesni_decrypt3 -.align 4 -__aesni_decrypt3: - movups (%edx),%xmm0 - shll $4,%ecx - movups 16(%edx),%xmm1 - xorps %xmm0,%xmm2 - pxor %xmm0,%xmm3 - pxor %xmm0,%xmm4 - movups 32(%edx),%xmm0 - leal 32(%edx,%ecx,1),%edx - negl %ecx - addl $16,%ecx -L006dec3_loop: -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 - movups (%edx,%ecx,1),%xmm1 - addl $32,%ecx -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 - movups -16(%edx,%ecx,1),%xmm0 - jnz L006dec3_loop -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,223,208 -.byte 102,15,56,223,216 -.byte 102,15,56,223,224 - ret -.private_extern __aesni_encrypt4 -.align 4 -__aesni_encrypt4: - movups (%edx),%xmm0 - movups 16(%edx),%xmm1 
- shll $4,%ecx - xorps %xmm0,%xmm2 - pxor %xmm0,%xmm3 - pxor %xmm0,%xmm4 - pxor %xmm0,%xmm5 - movups 32(%edx),%xmm0 - leal 32(%edx,%ecx,1),%edx - negl %ecx -.byte 15,31,64,0 - addl $16,%ecx -L007enc4_loop: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - movups (%edx,%ecx,1),%xmm1 - addl $32,%ecx -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 - movups -16(%edx,%ecx,1),%xmm0 - jnz L007enc4_loop -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,221,208 -.byte 102,15,56,221,216 -.byte 102,15,56,221,224 -.byte 102,15,56,221,232 - ret -.private_extern __aesni_decrypt4 -.align 4 -__aesni_decrypt4: - movups (%edx),%xmm0 - movups 16(%edx),%xmm1 - shll $4,%ecx - xorps %xmm0,%xmm2 - pxor %xmm0,%xmm3 - pxor %xmm0,%xmm4 - pxor %xmm0,%xmm5 - movups 32(%edx),%xmm0 - leal 32(%edx,%ecx,1),%edx - negl %ecx -.byte 15,31,64,0 - addl $16,%ecx -L008dec4_loop: -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 - movups (%edx,%ecx,1),%xmm1 - addl $32,%ecx -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 - movups -16(%edx,%ecx,1),%xmm0 - jnz L008dec4_loop -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,223,208 -.byte 102,15,56,223,216 -.byte 102,15,56,223,224 -.byte 102,15,56,223,232 - ret -.private_extern __aesni_encrypt6 -.align 4 -__aesni_encrypt6: - movups (%edx),%xmm0 - shll $4,%ecx - movups 16(%edx),%xmm1 - xorps %xmm0,%xmm2 - pxor %xmm0,%xmm3 - pxor %xmm0,%xmm4 -.byte 102,15,56,220,209 - pxor %xmm0,%xmm5 - pxor %xmm0,%xmm6 -.byte 102,15,56,220,217 - leal 32(%edx,%ecx,1),%edx - negl %ecx -.byte 102,15,56,220,225 - pxor %xmm0,%xmm7 - movups (%edx,%ecx,1),%xmm0 - addl $16,%ecx - jmp L009_aesni_encrypt6_inner -.align 4,0x90 -L010enc6_loop: -.byte 
102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -L009_aesni_encrypt6_inner: -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 -L_aesni_encrypt6_enter: - movups (%edx,%ecx,1),%xmm1 - addl $32,%ecx -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 -.byte 102,15,56,220,240 -.byte 102,15,56,220,248 - movups -16(%edx,%ecx,1),%xmm0 - jnz L010enc6_loop -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 -.byte 102,15,56,221,208 -.byte 102,15,56,221,216 -.byte 102,15,56,221,224 -.byte 102,15,56,221,232 -.byte 102,15,56,221,240 -.byte 102,15,56,221,248 - ret -.private_extern __aesni_decrypt6 -.align 4 -__aesni_decrypt6: - movups (%edx),%xmm0 - shll $4,%ecx - movups 16(%edx),%xmm1 - xorps %xmm0,%xmm2 - pxor %xmm0,%xmm3 - pxor %xmm0,%xmm4 -.byte 102,15,56,222,209 - pxor %xmm0,%xmm5 - pxor %xmm0,%xmm6 -.byte 102,15,56,222,217 - leal 32(%edx,%ecx,1),%edx - negl %ecx -.byte 102,15,56,222,225 - pxor %xmm0,%xmm7 - movups (%edx,%ecx,1),%xmm0 - addl $16,%ecx - jmp L011_aesni_decrypt6_inner -.align 4,0x90 -L012dec6_loop: -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -L011_aesni_decrypt6_inner: -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 -L_aesni_decrypt6_enter: - movups (%edx,%ecx,1),%xmm1 - addl $32,%ecx -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 - movups -16(%edx,%ecx,1),%xmm0 - jnz L012dec6_loop -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 -.byte 102,15,56,223,208 -.byte 102,15,56,223,216 -.byte 102,15,56,223,224 -.byte 102,15,56,223,232 -.byte 102,15,56,223,240 -.byte 102,15,56,223,248 - ret -.globl _aes_hw_ecb_encrypt 
-.private_extern _aes_hw_ecb_encrypt -.align 4 -_aes_hw_ecb_encrypt: -L_aes_hw_ecb_encrypt_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - movl 20(%esp),%esi - movl 24(%esp),%edi - movl 28(%esp),%eax - movl 32(%esp),%edx - movl 36(%esp),%ebx - andl $-16,%eax - jz L013ecb_ret - movl 240(%edx),%ecx - testl %ebx,%ebx - jz L014ecb_decrypt - movl %edx,%ebp - movl %ecx,%ebx - cmpl $96,%eax - jb L015ecb_enc_tail - movdqu (%esi),%xmm2 - movdqu 16(%esi),%xmm3 - movdqu 32(%esi),%xmm4 - movdqu 48(%esi),%xmm5 - movdqu 64(%esi),%xmm6 - movdqu 80(%esi),%xmm7 - leal 96(%esi),%esi - subl $96,%eax - jmp L016ecb_enc_loop6_enter -.align 4,0x90 -L017ecb_enc_loop6: - movups %xmm2,(%edi) - movdqu (%esi),%xmm2 - movups %xmm3,16(%edi) - movdqu 16(%esi),%xmm3 - movups %xmm4,32(%edi) - movdqu 32(%esi),%xmm4 - movups %xmm5,48(%edi) - movdqu 48(%esi),%xmm5 - movups %xmm6,64(%edi) - movdqu 64(%esi),%xmm6 - movups %xmm7,80(%edi) - leal 96(%edi),%edi - movdqu 80(%esi),%xmm7 - leal 96(%esi),%esi -L016ecb_enc_loop6_enter: - call __aesni_encrypt6 - movl %ebp,%edx - movl %ebx,%ecx - subl $96,%eax - jnc L017ecb_enc_loop6 - movups %xmm2,(%edi) - movups %xmm3,16(%edi) - movups %xmm4,32(%edi) - movups %xmm5,48(%edi) - movups %xmm6,64(%edi) - movups %xmm7,80(%edi) - leal 96(%edi),%edi - addl $96,%eax - jz L013ecb_ret -L015ecb_enc_tail: - movups (%esi),%xmm2 - cmpl $32,%eax - jb L018ecb_enc_one - movups 16(%esi),%xmm3 - je L019ecb_enc_two - movups 32(%esi),%xmm4 - cmpl $64,%eax - jb L020ecb_enc_three - movups 48(%esi),%xmm5 - je L021ecb_enc_four - movups 64(%esi),%xmm6 - xorps %xmm7,%xmm7 - call __aesni_encrypt6 - movups %xmm2,(%edi) - movups %xmm3,16(%edi) - movups %xmm4,32(%edi) - movups %xmm5,48(%edi) - movups %xmm6,64(%edi) - jmp L013ecb_ret -.align 4,0x90 -L018ecb_enc_one: - movups (%edx),%xmm0 - movups 16(%edx),%xmm1 - leal 32(%edx),%edx - xorps %xmm0,%xmm2 -L022enc1_loop_3: -.byte 102,15,56,220,209 - decl %ecx - movups (%edx),%xmm1 - leal 16(%edx),%edx - jnz L022enc1_loop_3 -.byte 
102,15,56,221,209 - movups %xmm2,(%edi) - jmp L013ecb_ret -.align 4,0x90 -L019ecb_enc_two: - call __aesni_encrypt2 - movups %xmm2,(%edi) - movups %xmm3,16(%edi) - jmp L013ecb_ret -.align 4,0x90 -L020ecb_enc_three: - call __aesni_encrypt3 - movups %xmm2,(%edi) - movups %xmm3,16(%edi) - movups %xmm4,32(%edi) - jmp L013ecb_ret -.align 4,0x90 -L021ecb_enc_four: - call __aesni_encrypt4 - movups %xmm2,(%edi) - movups %xmm3,16(%edi) - movups %xmm4,32(%edi) - movups %xmm5,48(%edi) - jmp L013ecb_ret -.align 4,0x90 -L014ecb_decrypt: - movl %edx,%ebp - movl %ecx,%ebx - cmpl $96,%eax - jb L023ecb_dec_tail - movdqu (%esi),%xmm2 - movdqu 16(%esi),%xmm3 - movdqu 32(%esi),%xmm4 - movdqu 48(%esi),%xmm5 - movdqu 64(%esi),%xmm6 - movdqu 80(%esi),%xmm7 - leal 96(%esi),%esi - subl $96,%eax - jmp L024ecb_dec_loop6_enter -.align 4,0x90 -L025ecb_dec_loop6: - movups %xmm2,(%edi) - movdqu (%esi),%xmm2 - movups %xmm3,16(%edi) - movdqu 16(%esi),%xmm3 - movups %xmm4,32(%edi) - movdqu 32(%esi),%xmm4 - movups %xmm5,48(%edi) - movdqu 48(%esi),%xmm5 - movups %xmm6,64(%edi) - movdqu 64(%esi),%xmm6 - movups %xmm7,80(%edi) - leal 96(%edi),%edi - movdqu 80(%esi),%xmm7 - leal 96(%esi),%esi -L024ecb_dec_loop6_enter: - call __aesni_decrypt6 - movl %ebp,%edx - movl %ebx,%ecx - subl $96,%eax - jnc L025ecb_dec_loop6 - movups %xmm2,(%edi) - movups %xmm3,16(%edi) - movups %xmm4,32(%edi) - movups %xmm5,48(%edi) - movups %xmm6,64(%edi) - movups %xmm7,80(%edi) - leal 96(%edi),%edi - addl $96,%eax - jz L013ecb_ret -L023ecb_dec_tail: - movups (%esi),%xmm2 - cmpl $32,%eax - jb L026ecb_dec_one - movups 16(%esi),%xmm3 - je L027ecb_dec_two - movups 32(%esi),%xmm4 - cmpl $64,%eax - jb L028ecb_dec_three - movups 48(%esi),%xmm5 - je L029ecb_dec_four - movups 64(%esi),%xmm6 - xorps %xmm7,%xmm7 - call __aesni_decrypt6 - movups %xmm2,(%edi) - movups %xmm3,16(%edi) - movups %xmm4,32(%edi) - movups %xmm5,48(%edi) - movups %xmm6,64(%edi) - jmp L013ecb_ret -.align 4,0x90 -L026ecb_dec_one: - movups (%edx),%xmm0 - movups 
16(%edx),%xmm1 - leal 32(%edx),%edx - xorps %xmm0,%xmm2 -L030dec1_loop_4: -.byte 102,15,56,222,209 - decl %ecx - movups (%edx),%xmm1 - leal 16(%edx),%edx - jnz L030dec1_loop_4 -.byte 102,15,56,223,209 - movups %xmm2,(%edi) - jmp L013ecb_ret -.align 4,0x90 -L027ecb_dec_two: - call __aesni_decrypt2 - movups %xmm2,(%edi) - movups %xmm3,16(%edi) - jmp L013ecb_ret -.align 4,0x90 -L028ecb_dec_three: - call __aesni_decrypt3 - movups %xmm2,(%edi) - movups %xmm3,16(%edi) - movups %xmm4,32(%edi) - jmp L013ecb_ret -.align 4,0x90 -L029ecb_dec_four: - call __aesni_decrypt4 - movups %xmm2,(%edi) - movups %xmm3,16(%edi) - movups %xmm4,32(%edi) - movups %xmm5,48(%edi) -L013ecb_ret: - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.globl _aes_hw_ccm64_encrypt_blocks -.private_extern _aes_hw_ccm64_encrypt_blocks -.align 4 -_aes_hw_ccm64_encrypt_blocks: -L_aes_hw_ccm64_encrypt_blocks_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - movl 20(%esp),%esi - movl 24(%esp),%edi - movl 28(%esp),%eax - movl 32(%esp),%edx - movl 36(%esp),%ebx - movl 40(%esp),%ecx - movl %esp,%ebp - subl $60,%esp - andl $-16,%esp - movl %ebp,48(%esp) - movdqu (%ebx),%xmm7 - movdqu (%ecx),%xmm3 - movl 240(%edx),%ecx - movl $202182159,(%esp) - movl $134810123,4(%esp) - movl $67438087,8(%esp) - movl $66051,12(%esp) - movl $1,%ebx - xorl %ebp,%ebp - movl %ebx,16(%esp) - movl %ebp,20(%esp) - movl %ebp,24(%esp) - movl %ebp,28(%esp) - shll $4,%ecx - movl $16,%ebx - leal (%edx),%ebp - movdqa (%esp),%xmm5 - movdqa %xmm7,%xmm2 - leal 32(%edx,%ecx,1),%edx - subl %ecx,%ebx -.byte 102,15,56,0,253 -L031ccm64_enc_outer: - movups (%ebp),%xmm0 - movl %ebx,%ecx - movups (%esi),%xmm6 - xorps %xmm0,%xmm2 - movups 16(%ebp),%xmm1 - xorps %xmm6,%xmm0 - xorps %xmm0,%xmm3 - movups 32(%ebp),%xmm0 -L032ccm64_enc2_loop: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 - 
movups (%edx,%ecx,1),%xmm1 - addl $32,%ecx -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 - movups -16(%edx,%ecx,1),%xmm0 - jnz L032ccm64_enc2_loop -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 - paddq 16(%esp),%xmm7 - decl %eax -.byte 102,15,56,221,208 -.byte 102,15,56,221,216 - leal 16(%esi),%esi - xorps %xmm2,%xmm6 - movdqa %xmm7,%xmm2 - movups %xmm6,(%edi) -.byte 102,15,56,0,213 - leal 16(%edi),%edi - jnz L031ccm64_enc_outer - movl 48(%esp),%esp - movl 40(%esp),%edi - movups %xmm3,(%edi) - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.globl _aes_hw_ccm64_decrypt_blocks -.private_extern _aes_hw_ccm64_decrypt_blocks -.align 4 -_aes_hw_ccm64_decrypt_blocks: -L_aes_hw_ccm64_decrypt_blocks_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - movl 20(%esp),%esi - movl 24(%esp),%edi - movl 28(%esp),%eax - movl 32(%esp),%edx - movl 36(%esp),%ebx - movl 40(%esp),%ecx - movl %esp,%ebp - subl $60,%esp - andl $-16,%esp - movl %ebp,48(%esp) - movdqu (%ebx),%xmm7 - movdqu (%ecx),%xmm3 - movl 240(%edx),%ecx - movl $202182159,(%esp) - movl $134810123,4(%esp) - movl $67438087,8(%esp) - movl $66051,12(%esp) - movl $1,%ebx - xorl %ebp,%ebp - movl %ebx,16(%esp) - movl %ebp,20(%esp) - movl %ebp,24(%esp) - movl %ebp,28(%esp) - movdqa (%esp),%xmm5 - movdqa %xmm7,%xmm2 - movl %edx,%ebp - movl %ecx,%ebx -.byte 102,15,56,0,253 - movups (%edx),%xmm0 - movups 16(%edx),%xmm1 - leal 32(%edx),%edx - xorps %xmm0,%xmm2 -L033enc1_loop_5: -.byte 102,15,56,220,209 - decl %ecx - movups (%edx),%xmm1 - leal 16(%edx),%edx - jnz L033enc1_loop_5 -.byte 102,15,56,221,209 - shll $4,%ebx - movl $16,%ecx - movups (%esi),%xmm6 - paddq 16(%esp),%xmm7 - leal 16(%esi),%esi - subl %ebx,%ecx - leal 32(%ebp,%ebx,1),%edx - movl %ecx,%ebx - jmp L034ccm64_dec_outer -.align 4,0x90 -L034ccm64_dec_outer: - xorps %xmm2,%xmm6 - movdqa %xmm7,%xmm2 - 
movups %xmm6,(%edi) - leal 16(%edi),%edi -.byte 102,15,56,0,213 - subl $1,%eax - jz L035ccm64_dec_break - movups (%ebp),%xmm0 - movl %ebx,%ecx - movups 16(%ebp),%xmm1 - xorps %xmm0,%xmm6 - xorps %xmm0,%xmm2 - xorps %xmm6,%xmm3 - movups 32(%ebp),%xmm0 -L036ccm64_dec2_loop: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 - movups (%edx,%ecx,1),%xmm1 - addl $32,%ecx -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 - movups -16(%edx,%ecx,1),%xmm0 - jnz L036ccm64_dec2_loop - movups (%esi),%xmm6 - paddq 16(%esp),%xmm7 -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,221,208 -.byte 102,15,56,221,216 - leal 16(%esi),%esi - jmp L034ccm64_dec_outer -.align 4,0x90 -L035ccm64_dec_break: - movl 240(%ebp),%ecx - movl %ebp,%edx - movups (%edx),%xmm0 - movups 16(%edx),%xmm1 - xorps %xmm0,%xmm6 - leal 32(%edx),%edx - xorps %xmm6,%xmm3 -L037enc1_loop_6: -.byte 102,15,56,220,217 - decl %ecx - movups (%edx),%xmm1 - leal 16(%edx),%edx - jnz L037enc1_loop_6 -.byte 102,15,56,221,217 - movl 48(%esp),%esp - movl 40(%esp),%edi - movups %xmm3,(%edi) - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.globl _aes_hw_ctr32_encrypt_blocks -.private_extern _aes_hw_ctr32_encrypt_blocks -.align 4 -_aes_hw_ctr32_encrypt_blocks: -L_aes_hw_ctr32_encrypt_blocks_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi -#ifdef BORINGSSL_DISPATCH_TEST - pushl %ebx - pushl %edx - call L038pic -L038pic: - popl %ebx - leal _BORINGSSL_function_hit+0-L038pic(%ebx),%ebx - movl $1,%edx - movb %dl,(%ebx) - popl %edx - popl %ebx -#endif - movl 20(%esp),%esi - movl 24(%esp),%edi - movl 28(%esp),%eax - movl 32(%esp),%edx - movl 36(%esp),%ebx - movl %esp,%ebp - subl $88,%esp - andl $-16,%esp - movl %ebp,80(%esp) - cmpl $1,%eax - je L039ctr32_one_shortcut - movdqu (%ebx),%xmm7 - movl $202182159,(%esp) - movl $134810123,4(%esp) - movl 
$67438087,8(%esp) - movl $66051,12(%esp) - movl $6,%ecx - xorl %ebp,%ebp - movl %ecx,16(%esp) - movl %ecx,20(%esp) - movl %ecx,24(%esp) - movl %ebp,28(%esp) -.byte 102,15,58,22,251,3 -.byte 102,15,58,34,253,3 - movl 240(%edx),%ecx - bswap %ebx - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - movdqa (%esp),%xmm2 -.byte 102,15,58,34,195,0 - leal 3(%ebx),%ebp -.byte 102,15,58,34,205,0 - incl %ebx -.byte 102,15,58,34,195,1 - incl %ebp -.byte 102,15,58,34,205,1 - incl %ebx -.byte 102,15,58,34,195,2 - incl %ebp -.byte 102,15,58,34,205,2 - movdqa %xmm0,48(%esp) -.byte 102,15,56,0,194 - movdqu (%edx),%xmm6 - movdqa %xmm1,64(%esp) -.byte 102,15,56,0,202 - pshufd $192,%xmm0,%xmm2 - pshufd $128,%xmm0,%xmm3 - cmpl $6,%eax - jb L040ctr32_tail - pxor %xmm6,%xmm7 - shll $4,%ecx - movl $16,%ebx - movdqa %xmm7,32(%esp) - movl %edx,%ebp - subl %ecx,%ebx - leal 32(%edx,%ecx,1),%edx - subl $6,%eax - jmp L041ctr32_loop6 -.align 4,0x90 -L041ctr32_loop6: - pshufd $64,%xmm0,%xmm4 - movdqa 32(%esp),%xmm0 - pshufd $192,%xmm1,%xmm5 - pxor %xmm0,%xmm2 - pshufd $128,%xmm1,%xmm6 - pxor %xmm0,%xmm3 - pshufd $64,%xmm1,%xmm7 - movups 16(%ebp),%xmm1 - pxor %xmm0,%xmm4 - pxor %xmm0,%xmm5 -.byte 102,15,56,220,209 - pxor %xmm0,%xmm6 - pxor %xmm0,%xmm7 -.byte 102,15,56,220,217 - movups 32(%ebp),%xmm0 - movl %ebx,%ecx -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 - call L_aesni_encrypt6_enter - movups (%esi),%xmm1 - movups 16(%esi),%xmm0 - xorps %xmm1,%xmm2 - movups 32(%esi),%xmm1 - xorps %xmm0,%xmm3 - movups %xmm2,(%edi) - movdqa 16(%esp),%xmm0 - xorps %xmm1,%xmm4 - movdqa 64(%esp),%xmm1 - movups %xmm3,16(%edi) - movups %xmm4,32(%edi) - paddd %xmm0,%xmm1 - paddd 48(%esp),%xmm0 - movdqa (%esp),%xmm2 - movups 48(%esi),%xmm3 - movups 64(%esi),%xmm4 - xorps %xmm3,%xmm5 - movups 80(%esi),%xmm3 - leal 96(%esi),%esi - movdqa %xmm0,48(%esp) -.byte 102,15,56,0,194 - xorps %xmm4,%xmm6 - movups %xmm5,48(%edi) - xorps %xmm3,%xmm7 - movdqa %xmm1,64(%esp) -.byte 
102,15,56,0,202 - movups %xmm6,64(%edi) - pshufd $192,%xmm0,%xmm2 - movups %xmm7,80(%edi) - leal 96(%edi),%edi - pshufd $128,%xmm0,%xmm3 - subl $6,%eax - jnc L041ctr32_loop6 - addl $6,%eax - jz L042ctr32_ret - movdqu (%ebp),%xmm7 - movl %ebp,%edx - pxor 32(%esp),%xmm7 - movl 240(%ebp),%ecx -L040ctr32_tail: - por %xmm7,%xmm2 - cmpl $2,%eax - jb L043ctr32_one - pshufd $64,%xmm0,%xmm4 - por %xmm7,%xmm3 - je L044ctr32_two - pshufd $192,%xmm1,%xmm5 - por %xmm7,%xmm4 - cmpl $4,%eax - jb L045ctr32_three - pshufd $128,%xmm1,%xmm6 - por %xmm7,%xmm5 - je L046ctr32_four - por %xmm7,%xmm6 - call __aesni_encrypt6 - movups (%esi),%xmm1 - movups 16(%esi),%xmm0 - xorps %xmm1,%xmm2 - movups 32(%esi),%xmm1 - xorps %xmm0,%xmm3 - movups 48(%esi),%xmm0 - xorps %xmm1,%xmm4 - movups 64(%esi),%xmm1 - xorps %xmm0,%xmm5 - movups %xmm2,(%edi) - xorps %xmm1,%xmm6 - movups %xmm3,16(%edi) - movups %xmm4,32(%edi) - movups %xmm5,48(%edi) - movups %xmm6,64(%edi) - jmp L042ctr32_ret -.align 4,0x90 -L039ctr32_one_shortcut: - movups (%ebx),%xmm2 - movl 240(%edx),%ecx -L043ctr32_one: - movups (%edx),%xmm0 - movups 16(%edx),%xmm1 - leal 32(%edx),%edx - xorps %xmm0,%xmm2 -L047enc1_loop_7: -.byte 102,15,56,220,209 - decl %ecx - movups (%edx),%xmm1 - leal 16(%edx),%edx - jnz L047enc1_loop_7 -.byte 102,15,56,221,209 - movups (%esi),%xmm6 - xorps %xmm2,%xmm6 - movups %xmm6,(%edi) - jmp L042ctr32_ret -.align 4,0x90 -L044ctr32_two: - call __aesni_encrypt2 - movups (%esi),%xmm5 - movups 16(%esi),%xmm6 - xorps %xmm5,%xmm2 - xorps %xmm6,%xmm3 - movups %xmm2,(%edi) - movups %xmm3,16(%edi) - jmp L042ctr32_ret -.align 4,0x90 -L045ctr32_three: - call __aesni_encrypt3 - movups (%esi),%xmm5 - movups 16(%esi),%xmm6 - xorps %xmm5,%xmm2 - movups 32(%esi),%xmm7 - xorps %xmm6,%xmm3 - movups %xmm2,(%edi) - xorps %xmm7,%xmm4 - movups %xmm3,16(%edi) - movups %xmm4,32(%edi) - jmp L042ctr32_ret -.align 4,0x90 -L046ctr32_four: - call __aesni_encrypt4 - movups (%esi),%xmm6 - movups 16(%esi),%xmm7 - movups 32(%esi),%xmm1 - xorps 
%xmm6,%xmm2 - movups 48(%esi),%xmm0 - xorps %xmm7,%xmm3 - movups %xmm2,(%edi) - xorps %xmm1,%xmm4 - movups %xmm3,16(%edi) - xorps %xmm0,%xmm5 - movups %xmm4,32(%edi) - movups %xmm5,48(%edi) -L042ctr32_ret: - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - movdqa %xmm0,32(%esp) - pxor %xmm5,%xmm5 - movdqa %xmm0,48(%esp) - pxor %xmm6,%xmm6 - movdqa %xmm0,64(%esp) - pxor %xmm7,%xmm7 - movl 80(%esp),%esp - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.globl _aes_hw_xts_encrypt -.private_extern _aes_hw_xts_encrypt -.align 4 -_aes_hw_xts_encrypt: -L_aes_hw_xts_encrypt_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - movl 36(%esp),%edx - movl 40(%esp),%esi - movl 240(%edx),%ecx - movups (%esi),%xmm2 - movups (%edx),%xmm0 - movups 16(%edx),%xmm1 - leal 32(%edx),%edx - xorps %xmm0,%xmm2 -L048enc1_loop_8: -.byte 102,15,56,220,209 - decl %ecx - movups (%edx),%xmm1 - leal 16(%edx),%edx - jnz L048enc1_loop_8 -.byte 102,15,56,221,209 - movl 20(%esp),%esi - movl 24(%esp),%edi - movl 28(%esp),%eax - movl 32(%esp),%edx - movl %esp,%ebp - subl $120,%esp - movl 240(%edx),%ecx - andl $-16,%esp - movl $135,96(%esp) - movl $0,100(%esp) - movl $1,104(%esp) - movl $0,108(%esp) - movl %eax,112(%esp) - movl %ebp,116(%esp) - movdqa %xmm2,%xmm1 - pxor %xmm0,%xmm0 - movdqa 96(%esp),%xmm3 - pcmpgtd %xmm1,%xmm0 - andl $-16,%eax - movl %edx,%ebp - movl %ecx,%ebx - subl $96,%eax - jc L049xts_enc_short - shll $4,%ecx - movl $16,%ebx - subl %ecx,%ebx - leal 32(%edx,%ecx,1),%edx - jmp L050xts_enc_loop6 -.align 4,0x90 -L050xts_enc_loop6: - pshufd $19,%xmm0,%xmm2 - pxor %xmm0,%xmm0 - movdqa %xmm1,(%esp) - paddq %xmm1,%xmm1 - pand %xmm3,%xmm2 - pcmpgtd %xmm1,%xmm0 - pxor %xmm2,%xmm1 - pshufd $19,%xmm0,%xmm2 - pxor %xmm0,%xmm0 - movdqa %xmm1,16(%esp) - paddq %xmm1,%xmm1 - pand %xmm3,%xmm2 - pcmpgtd %xmm1,%xmm0 - pxor %xmm2,%xmm1 - pshufd $19,%xmm0,%xmm2 - pxor %xmm0,%xmm0 - movdqa %xmm1,32(%esp) - paddq %xmm1,%xmm1 - pand %xmm3,%xmm2 - 
pcmpgtd %xmm1,%xmm0 - pxor %xmm2,%xmm1 - pshufd $19,%xmm0,%xmm2 - pxor %xmm0,%xmm0 - movdqa %xmm1,48(%esp) - paddq %xmm1,%xmm1 - pand %xmm3,%xmm2 - pcmpgtd %xmm1,%xmm0 - pxor %xmm2,%xmm1 - pshufd $19,%xmm0,%xmm7 - movdqa %xmm1,64(%esp) - paddq %xmm1,%xmm1 - movups (%ebp),%xmm0 - pand %xmm3,%xmm7 - movups (%esi),%xmm2 - pxor %xmm1,%xmm7 - movl %ebx,%ecx - movdqu 16(%esi),%xmm3 - xorps %xmm0,%xmm2 - movdqu 32(%esi),%xmm4 - pxor %xmm0,%xmm3 - movdqu 48(%esi),%xmm5 - pxor %xmm0,%xmm4 - movdqu 64(%esi),%xmm6 - pxor %xmm0,%xmm5 - movdqu 80(%esi),%xmm1 - pxor %xmm0,%xmm6 - leal 96(%esi),%esi - pxor (%esp),%xmm2 - movdqa %xmm7,80(%esp) - pxor %xmm1,%xmm7 - movups 16(%ebp),%xmm1 - pxor 16(%esp),%xmm3 - pxor 32(%esp),%xmm4 -.byte 102,15,56,220,209 - pxor 48(%esp),%xmm5 - pxor 64(%esp),%xmm6 -.byte 102,15,56,220,217 - pxor %xmm0,%xmm7 - movups 32(%ebp),%xmm0 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 - call L_aesni_encrypt6_enter - movdqa 80(%esp),%xmm1 - pxor %xmm0,%xmm0 - xorps (%esp),%xmm2 - pcmpgtd %xmm1,%xmm0 - xorps 16(%esp),%xmm3 - movups %xmm2,(%edi) - xorps 32(%esp),%xmm4 - movups %xmm3,16(%edi) - xorps 48(%esp),%xmm5 - movups %xmm4,32(%edi) - xorps 64(%esp),%xmm6 - movups %xmm5,48(%edi) - xorps %xmm1,%xmm7 - movups %xmm6,64(%edi) - pshufd $19,%xmm0,%xmm2 - movups %xmm7,80(%edi) - leal 96(%edi),%edi - movdqa 96(%esp),%xmm3 - pxor %xmm0,%xmm0 - paddq %xmm1,%xmm1 - pand %xmm3,%xmm2 - pcmpgtd %xmm1,%xmm0 - pxor %xmm2,%xmm1 - subl $96,%eax - jnc L050xts_enc_loop6 - movl 240(%ebp),%ecx - movl %ebp,%edx - movl %ecx,%ebx -L049xts_enc_short: - addl $96,%eax - jz L051xts_enc_done6x - movdqa %xmm1,%xmm5 - cmpl $32,%eax - jb L052xts_enc_one - pshufd $19,%xmm0,%xmm2 - pxor %xmm0,%xmm0 - paddq %xmm1,%xmm1 - pand %xmm3,%xmm2 - pcmpgtd %xmm1,%xmm0 - pxor %xmm2,%xmm1 - je L053xts_enc_two - pshufd $19,%xmm0,%xmm2 - pxor %xmm0,%xmm0 - movdqa %xmm1,%xmm6 - paddq %xmm1,%xmm1 - pand %xmm3,%xmm2 - pcmpgtd %xmm1,%xmm0 - pxor 
%xmm2,%xmm1 - cmpl $64,%eax - jb L054xts_enc_three - pshufd $19,%xmm0,%xmm2 - pxor %xmm0,%xmm0 - movdqa %xmm1,%xmm7 - paddq %xmm1,%xmm1 - pand %xmm3,%xmm2 - pcmpgtd %xmm1,%xmm0 - pxor %xmm2,%xmm1 - movdqa %xmm5,(%esp) - movdqa %xmm6,16(%esp) - je L055xts_enc_four - movdqa %xmm7,32(%esp) - pshufd $19,%xmm0,%xmm7 - movdqa %xmm1,48(%esp) - paddq %xmm1,%xmm1 - pand %xmm3,%xmm7 - pxor %xmm1,%xmm7 - movdqu (%esi),%xmm2 - movdqu 16(%esi),%xmm3 - movdqu 32(%esi),%xmm4 - pxor (%esp),%xmm2 - movdqu 48(%esi),%xmm5 - pxor 16(%esp),%xmm3 - movdqu 64(%esi),%xmm6 - pxor 32(%esp),%xmm4 - leal 80(%esi),%esi - pxor 48(%esp),%xmm5 - movdqa %xmm7,64(%esp) - pxor %xmm7,%xmm6 - call __aesni_encrypt6 - movaps 64(%esp),%xmm1 - xorps (%esp),%xmm2 - xorps 16(%esp),%xmm3 - xorps 32(%esp),%xmm4 - movups %xmm2,(%edi) - xorps 48(%esp),%xmm5 - movups %xmm3,16(%edi) - xorps %xmm1,%xmm6 - movups %xmm4,32(%edi) - movups %xmm5,48(%edi) - movups %xmm6,64(%edi) - leal 80(%edi),%edi - jmp L056xts_enc_done -.align 4,0x90 -L052xts_enc_one: - movups (%esi),%xmm2 - leal 16(%esi),%esi - xorps %xmm5,%xmm2 - movups (%edx),%xmm0 - movups 16(%edx),%xmm1 - leal 32(%edx),%edx - xorps %xmm0,%xmm2 -L057enc1_loop_9: -.byte 102,15,56,220,209 - decl %ecx - movups (%edx),%xmm1 - leal 16(%edx),%edx - jnz L057enc1_loop_9 -.byte 102,15,56,221,209 - xorps %xmm5,%xmm2 - movups %xmm2,(%edi) - leal 16(%edi),%edi - movdqa %xmm5,%xmm1 - jmp L056xts_enc_done -.align 4,0x90 -L053xts_enc_two: - movaps %xmm1,%xmm6 - movups (%esi),%xmm2 - movups 16(%esi),%xmm3 - leal 32(%esi),%esi - xorps %xmm5,%xmm2 - xorps %xmm6,%xmm3 - call __aesni_encrypt2 - xorps %xmm5,%xmm2 - xorps %xmm6,%xmm3 - movups %xmm2,(%edi) - movups %xmm3,16(%edi) - leal 32(%edi),%edi - movdqa %xmm6,%xmm1 - jmp L056xts_enc_done -.align 4,0x90 -L054xts_enc_three: - movaps %xmm1,%xmm7 - movups (%esi),%xmm2 - movups 16(%esi),%xmm3 - movups 32(%esi),%xmm4 - leal 48(%esi),%esi - xorps %xmm5,%xmm2 - xorps %xmm6,%xmm3 - xorps %xmm7,%xmm4 - call __aesni_encrypt3 - xorps 
%xmm5,%xmm2 - xorps %xmm6,%xmm3 - xorps %xmm7,%xmm4 - movups %xmm2,(%edi) - movups %xmm3,16(%edi) - movups %xmm4,32(%edi) - leal 48(%edi),%edi - movdqa %xmm7,%xmm1 - jmp L056xts_enc_done -.align 4,0x90 -L055xts_enc_four: - movaps %xmm1,%xmm6 - movups (%esi),%xmm2 - movups 16(%esi),%xmm3 - movups 32(%esi),%xmm4 - xorps (%esp),%xmm2 - movups 48(%esi),%xmm5 - leal 64(%esi),%esi - xorps 16(%esp),%xmm3 - xorps %xmm7,%xmm4 - xorps %xmm6,%xmm5 - call __aesni_encrypt4 - xorps (%esp),%xmm2 - xorps 16(%esp),%xmm3 - xorps %xmm7,%xmm4 - movups %xmm2,(%edi) - xorps %xmm6,%xmm5 - movups %xmm3,16(%edi) - movups %xmm4,32(%edi) - movups %xmm5,48(%edi) - leal 64(%edi),%edi - movdqa %xmm6,%xmm1 - jmp L056xts_enc_done -.align 4,0x90 -L051xts_enc_done6x: - movl 112(%esp),%eax - andl $15,%eax - jz L058xts_enc_ret - movdqa %xmm1,%xmm5 - movl %eax,112(%esp) - jmp L059xts_enc_steal -.align 4,0x90 -L056xts_enc_done: - movl 112(%esp),%eax - pxor %xmm0,%xmm0 - andl $15,%eax - jz L058xts_enc_ret - pcmpgtd %xmm1,%xmm0 - movl %eax,112(%esp) - pshufd $19,%xmm0,%xmm5 - paddq %xmm1,%xmm1 - pand 96(%esp),%xmm5 - pxor %xmm1,%xmm5 -L059xts_enc_steal: - movzbl (%esi),%ecx - movzbl -16(%edi),%edx - leal 1(%esi),%esi - movb %cl,-16(%edi) - movb %dl,(%edi) - leal 1(%edi),%edi - subl $1,%eax - jnz L059xts_enc_steal - subl 112(%esp),%edi - movl %ebp,%edx - movl %ebx,%ecx - movups -16(%edi),%xmm2 - xorps %xmm5,%xmm2 - movups (%edx),%xmm0 - movups 16(%edx),%xmm1 - leal 32(%edx),%edx - xorps %xmm0,%xmm2 -L060enc1_loop_10: -.byte 102,15,56,220,209 - decl %ecx - movups (%edx),%xmm1 - leal 16(%edx),%edx - jnz L060enc1_loop_10 -.byte 102,15,56,221,209 - xorps %xmm5,%xmm2 - movups %xmm2,-16(%edi) -L058xts_enc_ret: - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - movdqa %xmm0,(%esp) - pxor %xmm3,%xmm3 - movdqa %xmm0,16(%esp) - pxor %xmm4,%xmm4 - movdqa %xmm0,32(%esp) - pxor %xmm5,%xmm5 - movdqa %xmm0,48(%esp) - pxor %xmm6,%xmm6 - movdqa %xmm0,64(%esp) - pxor %xmm7,%xmm7 - movdqa %xmm0,80(%esp) - movl 
116(%esp),%esp - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.globl _aes_hw_xts_decrypt -.private_extern _aes_hw_xts_decrypt -.align 4 -_aes_hw_xts_decrypt: -L_aes_hw_xts_decrypt_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - movl 36(%esp),%edx - movl 40(%esp),%esi - movl 240(%edx),%ecx - movups (%esi),%xmm2 - movups (%edx),%xmm0 - movups 16(%edx),%xmm1 - leal 32(%edx),%edx - xorps %xmm0,%xmm2 -L061enc1_loop_11: -.byte 102,15,56,220,209 - decl %ecx - movups (%edx),%xmm1 - leal 16(%edx),%edx - jnz L061enc1_loop_11 -.byte 102,15,56,221,209 - movl 20(%esp),%esi - movl 24(%esp),%edi - movl 28(%esp),%eax - movl 32(%esp),%edx - movl %esp,%ebp - subl $120,%esp - andl $-16,%esp - xorl %ebx,%ebx - testl $15,%eax - setnz %bl - shll $4,%ebx - subl %ebx,%eax - movl $135,96(%esp) - movl $0,100(%esp) - movl $1,104(%esp) - movl $0,108(%esp) - movl %eax,112(%esp) - movl %ebp,116(%esp) - movl 240(%edx),%ecx - movl %edx,%ebp - movl %ecx,%ebx - movdqa %xmm2,%xmm1 - pxor %xmm0,%xmm0 - movdqa 96(%esp),%xmm3 - pcmpgtd %xmm1,%xmm0 - andl $-16,%eax - subl $96,%eax - jc L062xts_dec_short - shll $4,%ecx - movl $16,%ebx - subl %ecx,%ebx - leal 32(%edx,%ecx,1),%edx - jmp L063xts_dec_loop6 -.align 4,0x90 -L063xts_dec_loop6: - pshufd $19,%xmm0,%xmm2 - pxor %xmm0,%xmm0 - movdqa %xmm1,(%esp) - paddq %xmm1,%xmm1 - pand %xmm3,%xmm2 - pcmpgtd %xmm1,%xmm0 - pxor %xmm2,%xmm1 - pshufd $19,%xmm0,%xmm2 - pxor %xmm0,%xmm0 - movdqa %xmm1,16(%esp) - paddq %xmm1,%xmm1 - pand %xmm3,%xmm2 - pcmpgtd %xmm1,%xmm0 - pxor %xmm2,%xmm1 - pshufd $19,%xmm0,%xmm2 - pxor %xmm0,%xmm0 - movdqa %xmm1,32(%esp) - paddq %xmm1,%xmm1 - pand %xmm3,%xmm2 - pcmpgtd %xmm1,%xmm0 - pxor %xmm2,%xmm1 - pshufd $19,%xmm0,%xmm2 - pxor %xmm0,%xmm0 - movdqa %xmm1,48(%esp) - paddq %xmm1,%xmm1 - pand %xmm3,%xmm2 - pcmpgtd %xmm1,%xmm0 - pxor %xmm2,%xmm1 - pshufd $19,%xmm0,%xmm7 - movdqa %xmm1,64(%esp) - paddq %xmm1,%xmm1 - movups (%ebp),%xmm0 - pand %xmm3,%xmm7 - movups (%esi),%xmm2 - pxor %xmm1,%xmm7 - movl %ebx,%ecx - 
movdqu 16(%esi),%xmm3 - xorps %xmm0,%xmm2 - movdqu 32(%esi),%xmm4 - pxor %xmm0,%xmm3 - movdqu 48(%esi),%xmm5 - pxor %xmm0,%xmm4 - movdqu 64(%esi),%xmm6 - pxor %xmm0,%xmm5 - movdqu 80(%esi),%xmm1 - pxor %xmm0,%xmm6 - leal 96(%esi),%esi - pxor (%esp),%xmm2 - movdqa %xmm7,80(%esp) - pxor %xmm1,%xmm7 - movups 16(%ebp),%xmm1 - pxor 16(%esp),%xmm3 - pxor 32(%esp),%xmm4 -.byte 102,15,56,222,209 - pxor 48(%esp),%xmm5 - pxor 64(%esp),%xmm6 -.byte 102,15,56,222,217 - pxor %xmm0,%xmm7 - movups 32(%ebp),%xmm0 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 - call L_aesni_decrypt6_enter - movdqa 80(%esp),%xmm1 - pxor %xmm0,%xmm0 - xorps (%esp),%xmm2 - pcmpgtd %xmm1,%xmm0 - xorps 16(%esp),%xmm3 - movups %xmm2,(%edi) - xorps 32(%esp),%xmm4 - movups %xmm3,16(%edi) - xorps 48(%esp),%xmm5 - movups %xmm4,32(%edi) - xorps 64(%esp),%xmm6 - movups %xmm5,48(%edi) - xorps %xmm1,%xmm7 - movups %xmm6,64(%edi) - pshufd $19,%xmm0,%xmm2 - movups %xmm7,80(%edi) - leal 96(%edi),%edi - movdqa 96(%esp),%xmm3 - pxor %xmm0,%xmm0 - paddq %xmm1,%xmm1 - pand %xmm3,%xmm2 - pcmpgtd %xmm1,%xmm0 - pxor %xmm2,%xmm1 - subl $96,%eax - jnc L063xts_dec_loop6 - movl 240(%ebp),%ecx - movl %ebp,%edx - movl %ecx,%ebx -L062xts_dec_short: - addl $96,%eax - jz L064xts_dec_done6x - movdqa %xmm1,%xmm5 - cmpl $32,%eax - jb L065xts_dec_one - pshufd $19,%xmm0,%xmm2 - pxor %xmm0,%xmm0 - paddq %xmm1,%xmm1 - pand %xmm3,%xmm2 - pcmpgtd %xmm1,%xmm0 - pxor %xmm2,%xmm1 - je L066xts_dec_two - pshufd $19,%xmm0,%xmm2 - pxor %xmm0,%xmm0 - movdqa %xmm1,%xmm6 - paddq %xmm1,%xmm1 - pand %xmm3,%xmm2 - pcmpgtd %xmm1,%xmm0 - pxor %xmm2,%xmm1 - cmpl $64,%eax - jb L067xts_dec_three - pshufd $19,%xmm0,%xmm2 - pxor %xmm0,%xmm0 - movdqa %xmm1,%xmm7 - paddq %xmm1,%xmm1 - pand %xmm3,%xmm2 - pcmpgtd %xmm1,%xmm0 - pxor %xmm2,%xmm1 - movdqa %xmm5,(%esp) - movdqa %xmm6,16(%esp) - je L068xts_dec_four - movdqa %xmm7,32(%esp) - pshufd $19,%xmm0,%xmm7 - movdqa %xmm1,48(%esp) - paddq %xmm1,%xmm1 - pand 
%xmm3,%xmm7 - pxor %xmm1,%xmm7 - movdqu (%esi),%xmm2 - movdqu 16(%esi),%xmm3 - movdqu 32(%esi),%xmm4 - pxor (%esp),%xmm2 - movdqu 48(%esi),%xmm5 - pxor 16(%esp),%xmm3 - movdqu 64(%esi),%xmm6 - pxor 32(%esp),%xmm4 - leal 80(%esi),%esi - pxor 48(%esp),%xmm5 - movdqa %xmm7,64(%esp) - pxor %xmm7,%xmm6 - call __aesni_decrypt6 - movaps 64(%esp),%xmm1 - xorps (%esp),%xmm2 - xorps 16(%esp),%xmm3 - xorps 32(%esp),%xmm4 - movups %xmm2,(%edi) - xorps 48(%esp),%xmm5 - movups %xmm3,16(%edi) - xorps %xmm1,%xmm6 - movups %xmm4,32(%edi) - movups %xmm5,48(%edi) - movups %xmm6,64(%edi) - leal 80(%edi),%edi - jmp L069xts_dec_done -.align 4,0x90 -L065xts_dec_one: - movups (%esi),%xmm2 - leal 16(%esi),%esi - xorps %xmm5,%xmm2 - movups (%edx),%xmm0 - movups 16(%edx),%xmm1 - leal 32(%edx),%edx - xorps %xmm0,%xmm2 -L070dec1_loop_12: -.byte 102,15,56,222,209 - decl %ecx - movups (%edx),%xmm1 - leal 16(%edx),%edx - jnz L070dec1_loop_12 -.byte 102,15,56,223,209 - xorps %xmm5,%xmm2 - movups %xmm2,(%edi) - leal 16(%edi),%edi - movdqa %xmm5,%xmm1 - jmp L069xts_dec_done -.align 4,0x90 -L066xts_dec_two: - movaps %xmm1,%xmm6 - movups (%esi),%xmm2 - movups 16(%esi),%xmm3 - leal 32(%esi),%esi - xorps %xmm5,%xmm2 - xorps %xmm6,%xmm3 - call __aesni_decrypt2 - xorps %xmm5,%xmm2 - xorps %xmm6,%xmm3 - movups %xmm2,(%edi) - movups %xmm3,16(%edi) - leal 32(%edi),%edi - movdqa %xmm6,%xmm1 - jmp L069xts_dec_done -.align 4,0x90 -L067xts_dec_three: - movaps %xmm1,%xmm7 - movups (%esi),%xmm2 - movups 16(%esi),%xmm3 - movups 32(%esi),%xmm4 - leal 48(%esi),%esi - xorps %xmm5,%xmm2 - xorps %xmm6,%xmm3 - xorps %xmm7,%xmm4 - call __aesni_decrypt3 - xorps %xmm5,%xmm2 - xorps %xmm6,%xmm3 - xorps %xmm7,%xmm4 - movups %xmm2,(%edi) - movups %xmm3,16(%edi) - movups %xmm4,32(%edi) - leal 48(%edi),%edi - movdqa %xmm7,%xmm1 - jmp L069xts_dec_done -.align 4,0x90 -L068xts_dec_four: - movaps %xmm1,%xmm6 - movups (%esi),%xmm2 - movups 16(%esi),%xmm3 - movups 32(%esi),%xmm4 - xorps (%esp),%xmm2 - movups 48(%esi),%xmm5 - leal 
64(%esi),%esi - xorps 16(%esp),%xmm3 - xorps %xmm7,%xmm4 - xorps %xmm6,%xmm5 - call __aesni_decrypt4 - xorps (%esp),%xmm2 - xorps 16(%esp),%xmm3 - xorps %xmm7,%xmm4 - movups %xmm2,(%edi) - xorps %xmm6,%xmm5 - movups %xmm3,16(%edi) - movups %xmm4,32(%edi) - movups %xmm5,48(%edi) - leal 64(%edi),%edi - movdqa %xmm6,%xmm1 - jmp L069xts_dec_done -.align 4,0x90 -L064xts_dec_done6x: - movl 112(%esp),%eax - andl $15,%eax - jz L071xts_dec_ret - movl %eax,112(%esp) - jmp L072xts_dec_only_one_more -.align 4,0x90 -L069xts_dec_done: - movl 112(%esp),%eax - pxor %xmm0,%xmm0 - andl $15,%eax - jz L071xts_dec_ret - pcmpgtd %xmm1,%xmm0 - movl %eax,112(%esp) - pshufd $19,%xmm0,%xmm2 - pxor %xmm0,%xmm0 - movdqa 96(%esp),%xmm3 - paddq %xmm1,%xmm1 - pand %xmm3,%xmm2 - pcmpgtd %xmm1,%xmm0 - pxor %xmm2,%xmm1 -L072xts_dec_only_one_more: - pshufd $19,%xmm0,%xmm5 - movdqa %xmm1,%xmm6 - paddq %xmm1,%xmm1 - pand %xmm3,%xmm5 - pxor %xmm1,%xmm5 - movl %ebp,%edx - movl %ebx,%ecx - movups (%esi),%xmm2 - xorps %xmm5,%xmm2 - movups (%edx),%xmm0 - movups 16(%edx),%xmm1 - leal 32(%edx),%edx - xorps %xmm0,%xmm2 -L073dec1_loop_13: -.byte 102,15,56,222,209 - decl %ecx - movups (%edx),%xmm1 - leal 16(%edx),%edx - jnz L073dec1_loop_13 -.byte 102,15,56,223,209 - xorps %xmm5,%xmm2 - movups %xmm2,(%edi) -L074xts_dec_steal: - movzbl 16(%esi),%ecx - movzbl (%edi),%edx - leal 1(%esi),%esi - movb %cl,(%edi) - movb %dl,16(%edi) - leal 1(%edi),%edi - subl $1,%eax - jnz L074xts_dec_steal - subl 112(%esp),%edi - movl %ebp,%edx - movl %ebx,%ecx - movups (%edi),%xmm2 - xorps %xmm6,%xmm2 - movups (%edx),%xmm0 - movups 16(%edx),%xmm1 - leal 32(%edx),%edx - xorps %xmm0,%xmm2 -L075dec1_loop_14: -.byte 102,15,56,222,209 - decl %ecx - movups (%edx),%xmm1 - leal 16(%edx),%edx - jnz L075dec1_loop_14 -.byte 102,15,56,223,209 - xorps %xmm6,%xmm2 - movups %xmm2,(%edi) -L071xts_dec_ret: - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - movdqa %xmm0,(%esp) - pxor %xmm3,%xmm3 - movdqa %xmm0,16(%esp) - pxor %xmm4,%xmm4 - 
movdqa %xmm0,32(%esp) - pxor %xmm5,%xmm5 - movdqa %xmm0,48(%esp) - pxor %xmm6,%xmm6 - movdqa %xmm0,64(%esp) - pxor %xmm7,%xmm7 - movdqa %xmm0,80(%esp) - movl 116(%esp),%esp - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.globl _aes_hw_cbc_encrypt -.private_extern _aes_hw_cbc_encrypt -.align 4 -_aes_hw_cbc_encrypt: -L_aes_hw_cbc_encrypt_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - movl 20(%esp),%esi - movl %esp,%ebx - movl 24(%esp),%edi - subl $24,%ebx - movl 28(%esp),%eax - andl $-16,%ebx - movl 32(%esp),%edx - movl 36(%esp),%ebp - testl %eax,%eax - jz L076cbc_abort - cmpl $0,40(%esp) - xchgl %esp,%ebx - movups (%ebp),%xmm7 - movl 240(%edx),%ecx - movl %edx,%ebp - movl %ebx,16(%esp) - movl %ecx,%ebx - je L077cbc_decrypt - movaps %xmm7,%xmm2 - cmpl $16,%eax - jb L078cbc_enc_tail - subl $16,%eax - jmp L079cbc_enc_loop -.align 4,0x90 -L079cbc_enc_loop: - movups (%esi),%xmm7 - leal 16(%esi),%esi - movups (%edx),%xmm0 - movups 16(%edx),%xmm1 - xorps %xmm0,%xmm7 - leal 32(%edx),%edx - xorps %xmm7,%xmm2 -L080enc1_loop_15: -.byte 102,15,56,220,209 - decl %ecx - movups (%edx),%xmm1 - leal 16(%edx),%edx - jnz L080enc1_loop_15 -.byte 102,15,56,221,209 - movl %ebx,%ecx - movl %ebp,%edx - movups %xmm2,(%edi) - leal 16(%edi),%edi - subl $16,%eax - jnc L079cbc_enc_loop - addl $16,%eax - jnz L078cbc_enc_tail - movaps %xmm2,%xmm7 - pxor %xmm2,%xmm2 - jmp L081cbc_ret -L078cbc_enc_tail: - movl %eax,%ecx -.long 2767451785 - movl $16,%ecx - subl %eax,%ecx - xorl %eax,%eax -.long 2868115081 - leal -16(%edi),%edi - movl %ebx,%ecx - movl %edi,%esi - movl %ebp,%edx - jmp L079cbc_enc_loop -.align 4,0x90 -L077cbc_decrypt: - cmpl $80,%eax - jbe L082cbc_dec_tail - movaps %xmm7,(%esp) - subl $80,%eax - jmp L083cbc_dec_loop6_enter -.align 4,0x90 -L084cbc_dec_loop6: - movaps %xmm0,(%esp) - movups %xmm7,(%edi) - leal 16(%edi),%edi -L083cbc_dec_loop6_enter: - movdqu (%esi),%xmm2 - movdqu 16(%esi),%xmm3 - movdqu 32(%esi),%xmm4 - movdqu 48(%esi),%xmm5 - movdqu 64(%esi),%xmm6 - 
movdqu 80(%esi),%xmm7 - call __aesni_decrypt6 - movups (%esi),%xmm1 - movups 16(%esi),%xmm0 - xorps (%esp),%xmm2 - xorps %xmm1,%xmm3 - movups 32(%esi),%xmm1 - xorps %xmm0,%xmm4 - movups 48(%esi),%xmm0 - xorps %xmm1,%xmm5 - movups 64(%esi),%xmm1 - xorps %xmm0,%xmm6 - movups 80(%esi),%xmm0 - xorps %xmm1,%xmm7 - movups %xmm2,(%edi) - movups %xmm3,16(%edi) - leal 96(%esi),%esi - movups %xmm4,32(%edi) - movl %ebx,%ecx - movups %xmm5,48(%edi) - movl %ebp,%edx - movups %xmm6,64(%edi) - leal 80(%edi),%edi - subl $96,%eax - ja L084cbc_dec_loop6 - movaps %xmm7,%xmm2 - movaps %xmm0,%xmm7 - addl $80,%eax - jle L085cbc_dec_clear_tail_collected - movups %xmm2,(%edi) - leal 16(%edi),%edi -L082cbc_dec_tail: - movups (%esi),%xmm2 - movaps %xmm2,%xmm6 - cmpl $16,%eax - jbe L086cbc_dec_one - movups 16(%esi),%xmm3 - movaps %xmm3,%xmm5 - cmpl $32,%eax - jbe L087cbc_dec_two - movups 32(%esi),%xmm4 - cmpl $48,%eax - jbe L088cbc_dec_three - movups 48(%esi),%xmm5 - cmpl $64,%eax - jbe L089cbc_dec_four - movups 64(%esi),%xmm6 - movaps %xmm7,(%esp) - movups (%esi),%xmm2 - xorps %xmm7,%xmm7 - call __aesni_decrypt6 - movups (%esi),%xmm1 - movups 16(%esi),%xmm0 - xorps (%esp),%xmm2 - xorps %xmm1,%xmm3 - movups 32(%esi),%xmm1 - xorps %xmm0,%xmm4 - movups 48(%esi),%xmm0 - xorps %xmm1,%xmm5 - movups 64(%esi),%xmm7 - xorps %xmm0,%xmm6 - movups %xmm2,(%edi) - movups %xmm3,16(%edi) - pxor %xmm3,%xmm3 - movups %xmm4,32(%edi) - pxor %xmm4,%xmm4 - movups %xmm5,48(%edi) - pxor %xmm5,%xmm5 - leal 64(%edi),%edi - movaps %xmm6,%xmm2 - pxor %xmm6,%xmm6 - subl $80,%eax - jmp L090cbc_dec_tail_collected -.align 4,0x90 -L086cbc_dec_one: - movups (%edx),%xmm0 - movups 16(%edx),%xmm1 - leal 32(%edx),%edx - xorps %xmm0,%xmm2 -L091dec1_loop_16: -.byte 102,15,56,222,209 - decl %ecx - movups (%edx),%xmm1 - leal 16(%edx),%edx - jnz L091dec1_loop_16 -.byte 102,15,56,223,209 - xorps %xmm7,%xmm2 - movaps %xmm6,%xmm7 - subl $16,%eax - jmp L090cbc_dec_tail_collected -.align 4,0x90 -L087cbc_dec_two: - call __aesni_decrypt2 - 
xorps %xmm7,%xmm2 - xorps %xmm6,%xmm3 - movups %xmm2,(%edi) - movaps %xmm3,%xmm2 - pxor %xmm3,%xmm3 - leal 16(%edi),%edi - movaps %xmm5,%xmm7 - subl $32,%eax - jmp L090cbc_dec_tail_collected -.align 4,0x90 -L088cbc_dec_three: - call __aesni_decrypt3 - xorps %xmm7,%xmm2 - xorps %xmm6,%xmm3 - xorps %xmm5,%xmm4 - movups %xmm2,(%edi) - movaps %xmm4,%xmm2 - pxor %xmm4,%xmm4 - movups %xmm3,16(%edi) - pxor %xmm3,%xmm3 - leal 32(%edi),%edi - movups 32(%esi),%xmm7 - subl $48,%eax - jmp L090cbc_dec_tail_collected -.align 4,0x90 -L089cbc_dec_four: - call __aesni_decrypt4 - movups 16(%esi),%xmm1 - movups 32(%esi),%xmm0 - xorps %xmm7,%xmm2 - movups 48(%esi),%xmm7 - xorps %xmm6,%xmm3 - movups %xmm2,(%edi) - xorps %xmm1,%xmm4 - movups %xmm3,16(%edi) - pxor %xmm3,%xmm3 - xorps %xmm0,%xmm5 - movups %xmm4,32(%edi) - pxor %xmm4,%xmm4 - leal 48(%edi),%edi - movaps %xmm5,%xmm2 - pxor %xmm5,%xmm5 - subl $64,%eax - jmp L090cbc_dec_tail_collected -.align 4,0x90 -L085cbc_dec_clear_tail_collected: - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 -L090cbc_dec_tail_collected: - andl $15,%eax - jnz L092cbc_dec_tail_partial - movups %xmm2,(%edi) - pxor %xmm0,%xmm0 - jmp L081cbc_ret -.align 4,0x90 -L092cbc_dec_tail_partial: - movaps %xmm2,(%esp) - pxor %xmm0,%xmm0 - movl $16,%ecx - movl %esp,%esi - subl %eax,%ecx -.long 2767451785 - movdqa %xmm2,(%esp) -L081cbc_ret: - movl 16(%esp),%esp - movl 36(%esp),%ebp - pxor %xmm2,%xmm2 - pxor %xmm1,%xmm1 - movups %xmm7,(%ebp) - pxor %xmm7,%xmm7 -L076cbc_abort: - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.private_extern __aesni_set_encrypt_key -.align 4 -__aesni_set_encrypt_key: - pushl %ebp - pushl %ebx - testl %eax,%eax - jz L093bad_pointer - testl %edx,%edx - jz L093bad_pointer - call L094pic -L094pic: - popl %ebx - leal Lkey_const-L094pic(%ebx),%ebx - movl L_OPENSSL_ia32cap_P$non_lazy_ptr-Lkey_const(%ebx),%ebp - movups (%eax),%xmm0 - xorps %xmm4,%xmm4 - movl 4(%ebp),%ebp - leal 16(%edx),%edx - andl $268437504,%ebp 
- cmpl $256,%ecx - je L09514rounds - cmpl $192,%ecx - je L09612rounds - cmpl $128,%ecx - jne L097bad_keybits -.align 4,0x90 -L09810rounds: - cmpl $268435456,%ebp - je L09910rounds_alt - movl $9,%ecx - movups %xmm0,-16(%edx) -.byte 102,15,58,223,200,1 - call L100key_128_cold -.byte 102,15,58,223,200,2 - call L101key_128 -.byte 102,15,58,223,200,4 - call L101key_128 -.byte 102,15,58,223,200,8 - call L101key_128 -.byte 102,15,58,223,200,16 - call L101key_128 -.byte 102,15,58,223,200,32 - call L101key_128 -.byte 102,15,58,223,200,64 - call L101key_128 -.byte 102,15,58,223,200,128 - call L101key_128 -.byte 102,15,58,223,200,27 - call L101key_128 -.byte 102,15,58,223,200,54 - call L101key_128 - movups %xmm0,(%edx) - movl %ecx,80(%edx) - jmp L102good_key -.align 4,0x90 -L101key_128: - movups %xmm0,(%edx) - leal 16(%edx),%edx -L100key_128_cold: - shufps $16,%xmm0,%xmm4 - xorps %xmm4,%xmm0 - shufps $140,%xmm0,%xmm4 - xorps %xmm4,%xmm0 - shufps $255,%xmm1,%xmm1 - xorps %xmm1,%xmm0 - ret -.align 4,0x90 -L09910rounds_alt: - movdqa (%ebx),%xmm5 - movl $8,%ecx - movdqa 32(%ebx),%xmm4 - movdqa %xmm0,%xmm2 - movdqu %xmm0,-16(%edx) -L103loop_key128: -.byte 102,15,56,0,197 -.byte 102,15,56,221,196 - pslld $1,%xmm4 - leal 16(%edx),%edx - movdqa %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm3,%xmm2 - pxor %xmm2,%xmm0 - movdqu %xmm0,-16(%edx) - movdqa %xmm0,%xmm2 - decl %ecx - jnz L103loop_key128 - movdqa 48(%ebx),%xmm4 -.byte 102,15,56,0,197 -.byte 102,15,56,221,196 - pslld $1,%xmm4 - movdqa %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm3,%xmm2 - pxor %xmm2,%xmm0 - movdqu %xmm0,(%edx) - movdqa %xmm0,%xmm2 -.byte 102,15,56,0,197 -.byte 102,15,56,221,196 - movdqa %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm3,%xmm2 - pxor %xmm2,%xmm0 - movdqu %xmm0,16(%edx) - movl $9,%ecx - movl 
%ecx,96(%edx) - jmp L102good_key -.align 4,0x90 -L09612rounds: - movq 16(%eax),%xmm2 - cmpl $268435456,%ebp - je L10412rounds_alt - movl $11,%ecx - movups %xmm0,-16(%edx) -.byte 102,15,58,223,202,1 - call L105key_192a_cold -.byte 102,15,58,223,202,2 - call L106key_192b -.byte 102,15,58,223,202,4 - call L107key_192a -.byte 102,15,58,223,202,8 - call L106key_192b -.byte 102,15,58,223,202,16 - call L107key_192a -.byte 102,15,58,223,202,32 - call L106key_192b -.byte 102,15,58,223,202,64 - call L107key_192a -.byte 102,15,58,223,202,128 - call L106key_192b - movups %xmm0,(%edx) - movl %ecx,48(%edx) - jmp L102good_key -.align 4,0x90 -L107key_192a: - movups %xmm0,(%edx) - leal 16(%edx),%edx -.align 4,0x90 -L105key_192a_cold: - movaps %xmm2,%xmm5 -L108key_192b_warm: - shufps $16,%xmm0,%xmm4 - movdqa %xmm2,%xmm3 - xorps %xmm4,%xmm0 - shufps $140,%xmm0,%xmm4 - pslldq $4,%xmm3 - xorps %xmm4,%xmm0 - pshufd $85,%xmm1,%xmm1 - pxor %xmm3,%xmm2 - pxor %xmm1,%xmm0 - pshufd $255,%xmm0,%xmm3 - pxor %xmm3,%xmm2 - ret -.align 4,0x90 -L106key_192b: - movaps %xmm0,%xmm3 - shufps $68,%xmm0,%xmm5 - movups %xmm5,(%edx) - shufps $78,%xmm2,%xmm3 - movups %xmm3,16(%edx) - leal 32(%edx),%edx - jmp L108key_192b_warm -.align 4,0x90 -L10412rounds_alt: - movdqa 16(%ebx),%xmm5 - movdqa 32(%ebx),%xmm4 - movl $8,%ecx - movdqu %xmm0,-16(%edx) -L109loop_key192: - movq %xmm2,(%edx) - movdqa %xmm2,%xmm1 -.byte 102,15,56,0,213 -.byte 102,15,56,221,212 - pslld $1,%xmm4 - leal 24(%edx),%edx - movdqa %xmm0,%xmm3 - pslldq $4,%xmm0 - pxor %xmm0,%xmm3 - pslldq $4,%xmm0 - pxor %xmm0,%xmm3 - pslldq $4,%xmm0 - pxor %xmm3,%xmm0 - pshufd $255,%xmm0,%xmm3 - pxor %xmm1,%xmm3 - pslldq $4,%xmm1 - pxor %xmm1,%xmm3 - pxor %xmm2,%xmm0 - pxor %xmm3,%xmm2 - movdqu %xmm0,-16(%edx) - decl %ecx - jnz L109loop_key192 - movl $11,%ecx - movl %ecx,32(%edx) - jmp L102good_key -.align 4,0x90 -L09514rounds: - movups 16(%eax),%xmm2 - leal 16(%edx),%edx - cmpl $268435456,%ebp - je L11014rounds_alt - movl $13,%ecx - movups %xmm0,-32(%edx) 
- movups %xmm2,-16(%edx) -.byte 102,15,58,223,202,1 - call L111key_256a_cold -.byte 102,15,58,223,200,1 - call L112key_256b -.byte 102,15,58,223,202,2 - call L113key_256a -.byte 102,15,58,223,200,2 - call L112key_256b -.byte 102,15,58,223,202,4 - call L113key_256a -.byte 102,15,58,223,200,4 - call L112key_256b -.byte 102,15,58,223,202,8 - call L113key_256a -.byte 102,15,58,223,200,8 - call L112key_256b -.byte 102,15,58,223,202,16 - call L113key_256a -.byte 102,15,58,223,200,16 - call L112key_256b -.byte 102,15,58,223,202,32 - call L113key_256a -.byte 102,15,58,223,200,32 - call L112key_256b -.byte 102,15,58,223,202,64 - call L113key_256a - movups %xmm0,(%edx) - movl %ecx,16(%edx) - xorl %eax,%eax - jmp L102good_key -.align 4,0x90 -L113key_256a: - movups %xmm2,(%edx) - leal 16(%edx),%edx -L111key_256a_cold: - shufps $16,%xmm0,%xmm4 - xorps %xmm4,%xmm0 - shufps $140,%xmm0,%xmm4 - xorps %xmm4,%xmm0 - shufps $255,%xmm1,%xmm1 - xorps %xmm1,%xmm0 - ret -.align 4,0x90 -L112key_256b: - movups %xmm0,(%edx) - leal 16(%edx),%edx - shufps $16,%xmm2,%xmm4 - xorps %xmm4,%xmm2 - shufps $140,%xmm2,%xmm4 - xorps %xmm4,%xmm2 - shufps $170,%xmm1,%xmm1 - xorps %xmm1,%xmm2 - ret -.align 4,0x90 -L11014rounds_alt: - movdqa (%ebx),%xmm5 - movdqa 32(%ebx),%xmm4 - movl $7,%ecx - movdqu %xmm0,-32(%edx) - movdqa %xmm2,%xmm1 - movdqu %xmm2,-16(%edx) -L114loop_key256: -.byte 102,15,56,0,213 -.byte 102,15,56,221,212 - movdqa %xmm0,%xmm3 - pslldq $4,%xmm0 - pxor %xmm0,%xmm3 - pslldq $4,%xmm0 - pxor %xmm0,%xmm3 - pslldq $4,%xmm0 - pxor %xmm3,%xmm0 - pslld $1,%xmm4 - pxor %xmm2,%xmm0 - movdqu %xmm0,(%edx) - decl %ecx - jz L115done_key256 - pshufd $255,%xmm0,%xmm2 - pxor %xmm3,%xmm3 -.byte 102,15,56,221,211 - movdqa %xmm1,%xmm3 - pslldq $4,%xmm1 - pxor %xmm1,%xmm3 - pslldq $4,%xmm1 - pxor %xmm1,%xmm3 - pslldq $4,%xmm1 - pxor %xmm3,%xmm1 - pxor %xmm1,%xmm2 - movdqu %xmm2,16(%edx) - leal 32(%edx),%edx - movdqa %xmm2,%xmm1 - jmp L114loop_key256 -L115done_key256: - movl $13,%ecx - movl %ecx,16(%edx) 
-L102good_key: - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - xorl %eax,%eax - popl %ebx - popl %ebp - ret -.align 2,0x90 -L093bad_pointer: - movl $-1,%eax - popl %ebx - popl %ebp - ret -.align 2,0x90 -L097bad_keybits: - pxor %xmm0,%xmm0 - movl $-2,%eax - popl %ebx - popl %ebp - ret -.globl _aes_hw_set_encrypt_key -.private_extern _aes_hw_set_encrypt_key -.align 4 -_aes_hw_set_encrypt_key: -L_aes_hw_set_encrypt_key_begin: -#ifdef BORINGSSL_DISPATCH_TEST - pushl %ebx - pushl %edx - call L116pic -L116pic: - popl %ebx - leal _BORINGSSL_function_hit+3-L116pic(%ebx),%ebx - movl $1,%edx - movb %dl,(%ebx) - popl %edx - popl %ebx -#endif - movl 4(%esp),%eax - movl 8(%esp),%ecx - movl 12(%esp),%edx - call __aesni_set_encrypt_key - ret -.globl _aes_hw_set_decrypt_key -.private_extern _aes_hw_set_decrypt_key -.align 4 -_aes_hw_set_decrypt_key: -L_aes_hw_set_decrypt_key_begin: - movl 4(%esp),%eax - movl 8(%esp),%ecx - movl 12(%esp),%edx - call __aesni_set_encrypt_key - movl 12(%esp),%edx - shll $4,%ecx - testl %eax,%eax - jnz L117dec_key_ret - leal 16(%edx,%ecx,1),%eax - movups (%edx),%xmm0 - movups (%eax),%xmm1 - movups %xmm0,(%eax) - movups %xmm1,(%edx) - leal 16(%edx),%edx - leal -16(%eax),%eax -L118dec_key_inverse: - movups (%edx),%xmm0 - movups (%eax),%xmm1 -.byte 102,15,56,219,192 -.byte 102,15,56,219,201 - leal 16(%edx),%edx - leal -16(%eax),%eax - movups %xmm0,16(%eax) - movups %xmm1,-16(%edx) - cmpl %edx,%eax - ja L118dec_key_inverse - movups (%edx),%xmm0 -.byte 102,15,56,219,192 - movups %xmm0,(%edx) - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - xorl %eax,%eax -L117dec_key_ret: - ret -.align 6,0x90 -Lkey_const: -.long 202313229,202313229,202313229,202313229 -.long 67569157,67569157,67569157,67569157 -.long 1,1,1,1 -.long 27,27,27,27 -.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69 -.byte 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83 -.byte 
32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 -.byte 115,108,46,111,114,103,62,0 -.section __IMPORT,__pointers,non_lazy_symbol_pointers -L_OPENSSL_ia32cap_P$non_lazy_ptr: -.indirect_symbol _OPENSSL_ia32cap_P -.long 0 -#endif diff --git a/third_party/boringssl/apple-x86/crypto/fipsmodule/bn-586.S b/third_party/boringssl/apple-x86/crypto/fipsmodule/bn-586.S deleted file mode 100644 index ede2e76d..00000000 --- a/third_party/boringssl/apple-x86/crypto/fipsmodule/bn-586.S +++ /dev/null @@ -1,988 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if defined(__i386__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text -.globl _bn_mul_add_words -.private_extern _bn_mul_add_words -.align 4 -_bn_mul_add_words: -L_bn_mul_add_words_begin: - call L000PIC_me_up -L000PIC_me_up: - popl %eax - movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L000PIC_me_up(%eax),%eax - btl $26,(%eax) - jnc L001maw_non_sse2 - movl 4(%esp),%eax - movl 8(%esp),%edx - movl 12(%esp),%ecx - movd 16(%esp),%mm0 - pxor %mm1,%mm1 - jmp L002maw_sse2_entry -.align 4,0x90 -L003maw_sse2_unrolled: - movd (%eax),%mm3 - paddq %mm3,%mm1 - movd (%edx),%mm2 - pmuludq %mm0,%mm2 - movd 4(%edx),%mm4 - pmuludq %mm0,%mm4 - movd 8(%edx),%mm6 - pmuludq %mm0,%mm6 - movd 12(%edx),%mm7 - pmuludq %mm0,%mm7 - paddq %mm2,%mm1 - movd 4(%eax),%mm3 - paddq %mm4,%mm3 - movd 8(%eax),%mm5 - paddq %mm6,%mm5 - movd 12(%eax),%mm4 - paddq %mm4,%mm7 - movd %mm1,(%eax) - movd 16(%edx),%mm2 - pmuludq %mm0,%mm2 - psrlq $32,%mm1 - movd 20(%edx),%mm4 - pmuludq %mm0,%mm4 - paddq %mm3,%mm1 - movd 24(%edx),%mm6 - pmuludq %mm0,%mm6 - movd %mm1,4(%eax) - psrlq $32,%mm1 - movd 28(%edx),%mm3 - addl $32,%edx - pmuludq %mm0,%mm3 - paddq %mm5,%mm1 - movd 16(%eax),%mm5 - paddq %mm5,%mm2 - movd %mm1,8(%eax) - psrlq $32,%mm1 - paddq %mm7,%mm1 - movd 20(%eax),%mm5 - paddq %mm5,%mm4 - movd %mm1,12(%eax) - psrlq $32,%mm1 - paddq %mm2,%mm1 - movd 24(%eax),%mm5 - paddq %mm5,%mm6 - 
movd %mm1,16(%eax) - psrlq $32,%mm1 - paddq %mm4,%mm1 - movd 28(%eax),%mm5 - paddq %mm5,%mm3 - movd %mm1,20(%eax) - psrlq $32,%mm1 - paddq %mm6,%mm1 - movd %mm1,24(%eax) - psrlq $32,%mm1 - paddq %mm3,%mm1 - movd %mm1,28(%eax) - leal 32(%eax),%eax - psrlq $32,%mm1 - subl $8,%ecx - jz L004maw_sse2_exit -L002maw_sse2_entry: - testl $4294967288,%ecx - jnz L003maw_sse2_unrolled -.align 2,0x90 -L005maw_sse2_loop: - movd (%edx),%mm2 - movd (%eax),%mm3 - pmuludq %mm0,%mm2 - leal 4(%edx),%edx - paddq %mm3,%mm1 - paddq %mm2,%mm1 - movd %mm1,(%eax) - subl $1,%ecx - psrlq $32,%mm1 - leal 4(%eax),%eax - jnz L005maw_sse2_loop -L004maw_sse2_exit: - movd %mm1,%eax - emms - ret -.align 4,0x90 -L001maw_non_sse2: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - - xorl %esi,%esi - movl 20(%esp),%edi - movl 28(%esp),%ecx - movl 24(%esp),%ebx - andl $4294967288,%ecx - movl 32(%esp),%ebp - pushl %ecx - jz L006maw_finish -.align 4,0x90 -L007maw_loop: - # Round 0 - movl (%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl (%edi),%eax - adcl $0,%edx - movl %eax,(%edi) - movl %edx,%esi - # Round 4 - movl 4(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 4(%edi),%eax - adcl $0,%edx - movl %eax,4(%edi) - movl %edx,%esi - # Round 8 - movl 8(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 8(%edi),%eax - adcl $0,%edx - movl %eax,8(%edi) - movl %edx,%esi - # Round 12 - movl 12(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 12(%edi),%eax - adcl $0,%edx - movl %eax,12(%edi) - movl %edx,%esi - # Round 16 - movl 16(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 16(%edi),%eax - adcl $0,%edx - movl %eax,16(%edi) - movl %edx,%esi - # Round 20 - movl 20(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 20(%edi),%eax - adcl $0,%edx - movl %eax,20(%edi) - movl %edx,%esi - # Round 24 - movl 24(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 24(%edi),%eax - adcl $0,%edx - movl %eax,24(%edi) - movl %edx,%esi - # 
Round 28 - movl 28(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 28(%edi),%eax - adcl $0,%edx - movl %eax,28(%edi) - movl %edx,%esi - - subl $8,%ecx - leal 32(%ebx),%ebx - leal 32(%edi),%edi - jnz L007maw_loop -L006maw_finish: - movl 32(%esp),%ecx - andl $7,%ecx - jnz L008maw_finish2 - jmp L009maw_end -L008maw_finish2: - # Tail Round 0 - movl (%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl (%edi),%eax - adcl $0,%edx - decl %ecx - movl %eax,(%edi) - movl %edx,%esi - jz L009maw_end - # Tail Round 1 - movl 4(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 4(%edi),%eax - adcl $0,%edx - decl %ecx - movl %eax,4(%edi) - movl %edx,%esi - jz L009maw_end - # Tail Round 2 - movl 8(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 8(%edi),%eax - adcl $0,%edx - decl %ecx - movl %eax,8(%edi) - movl %edx,%esi - jz L009maw_end - # Tail Round 3 - movl 12(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 12(%edi),%eax - adcl $0,%edx - decl %ecx - movl %eax,12(%edi) - movl %edx,%esi - jz L009maw_end - # Tail Round 4 - movl 16(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 16(%edi),%eax - adcl $0,%edx - decl %ecx - movl %eax,16(%edi) - movl %edx,%esi - jz L009maw_end - # Tail Round 5 - movl 20(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 20(%edi),%eax - adcl $0,%edx - decl %ecx - movl %eax,20(%edi) - movl %edx,%esi - jz L009maw_end - # Tail Round 6 - movl 24(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 24(%edi),%eax - adcl $0,%edx - movl %eax,24(%edi) - movl %edx,%esi -L009maw_end: - movl %esi,%eax - popl %ecx - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.globl _bn_mul_words -.private_extern _bn_mul_words -.align 4 -_bn_mul_words: -L_bn_mul_words_begin: - call L010PIC_me_up -L010PIC_me_up: - popl %eax - movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L010PIC_me_up(%eax),%eax - btl $26,(%eax) - jnc L011mw_non_sse2 - movl 4(%esp),%eax - movl 8(%esp),%edx - movl 
12(%esp),%ecx - movd 16(%esp),%mm0 - pxor %mm1,%mm1 -.align 4,0x90 -L012mw_sse2_loop: - movd (%edx),%mm2 - pmuludq %mm0,%mm2 - leal 4(%edx),%edx - paddq %mm2,%mm1 - movd %mm1,(%eax) - subl $1,%ecx - psrlq $32,%mm1 - leal 4(%eax),%eax - jnz L012mw_sse2_loop - movd %mm1,%eax - emms - ret -.align 4,0x90 -L011mw_non_sse2: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - - xorl %esi,%esi - movl 20(%esp),%edi - movl 24(%esp),%ebx - movl 28(%esp),%ebp - movl 32(%esp),%ecx - andl $4294967288,%ebp - jz L013mw_finish -L014mw_loop: - # Round 0 - movl (%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,(%edi) - movl %edx,%esi - # Round 4 - movl 4(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,4(%edi) - movl %edx,%esi - # Round 8 - movl 8(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,8(%edi) - movl %edx,%esi - # Round 12 - movl 12(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,12(%edi) - movl %edx,%esi - # Round 16 - movl 16(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,16(%edi) - movl %edx,%esi - # Round 20 - movl 20(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,20(%edi) - movl %edx,%esi - # Round 24 - movl 24(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,24(%edi) - movl %edx,%esi - # Round 28 - movl 28(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,28(%edi) - movl %edx,%esi - - addl $32,%ebx - addl $32,%edi - subl $8,%ebp - jz L013mw_finish - jmp L014mw_loop -L013mw_finish: - movl 28(%esp),%ebp - andl $7,%ebp - jnz L015mw_finish2 - jmp L016mw_end -L015mw_finish2: - # Tail Round 0 - movl (%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,(%edi) - movl %edx,%esi - decl %ebp - jz L016mw_end - # Tail Round 1 - movl 4(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,4(%edi) - movl %edx,%esi - decl %ebp - jz L016mw_end - # Tail Round 2 - movl 8(%ebx),%eax - mull %ecx - addl %esi,%eax - 
adcl $0,%edx - movl %eax,8(%edi) - movl %edx,%esi - decl %ebp - jz L016mw_end - # Tail Round 3 - movl 12(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,12(%edi) - movl %edx,%esi - decl %ebp - jz L016mw_end - # Tail Round 4 - movl 16(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,16(%edi) - movl %edx,%esi - decl %ebp - jz L016mw_end - # Tail Round 5 - movl 20(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,20(%edi) - movl %edx,%esi - decl %ebp - jz L016mw_end - # Tail Round 6 - movl 24(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,24(%edi) - movl %edx,%esi -L016mw_end: - movl %esi,%eax - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.globl _bn_sqr_words -.private_extern _bn_sqr_words -.align 4 -_bn_sqr_words: -L_bn_sqr_words_begin: - call L017PIC_me_up -L017PIC_me_up: - popl %eax - movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L017PIC_me_up(%eax),%eax - btl $26,(%eax) - jnc L018sqr_non_sse2 - movl 4(%esp),%eax - movl 8(%esp),%edx - movl 12(%esp),%ecx -.align 4,0x90 -L019sqr_sse2_loop: - movd (%edx),%mm0 - pmuludq %mm0,%mm0 - leal 4(%edx),%edx - movq %mm0,(%eax) - subl $1,%ecx - leal 8(%eax),%eax - jnz L019sqr_sse2_loop - emms - ret -.align 4,0x90 -L018sqr_non_sse2: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - - movl 20(%esp),%esi - movl 24(%esp),%edi - movl 28(%esp),%ebx - andl $4294967288,%ebx - jz L020sw_finish -L021sw_loop: - # Round 0 - movl (%edi),%eax - mull %eax - movl %eax,(%esi) - movl %edx,4(%esi) - # Round 4 - movl 4(%edi),%eax - mull %eax - movl %eax,8(%esi) - movl %edx,12(%esi) - # Round 8 - movl 8(%edi),%eax - mull %eax - movl %eax,16(%esi) - movl %edx,20(%esi) - # Round 12 - movl 12(%edi),%eax - mull %eax - movl %eax,24(%esi) - movl %edx,28(%esi) - # Round 16 - movl 16(%edi),%eax - mull %eax - movl %eax,32(%esi) - movl %edx,36(%esi) - # Round 20 - movl 20(%edi),%eax - mull %eax - movl %eax,40(%esi) - movl %edx,44(%esi) - # Round 24 - movl 24(%edi),%eax - mull %eax 
- movl %eax,48(%esi) - movl %edx,52(%esi) - # Round 28 - movl 28(%edi),%eax - mull %eax - movl %eax,56(%esi) - movl %edx,60(%esi) - - addl $32,%edi - addl $64,%esi - subl $8,%ebx - jnz L021sw_loop -L020sw_finish: - movl 28(%esp),%ebx - andl $7,%ebx - jz L022sw_end - # Tail Round 0 - movl (%edi),%eax - mull %eax - movl %eax,(%esi) - decl %ebx - movl %edx,4(%esi) - jz L022sw_end - # Tail Round 1 - movl 4(%edi),%eax - mull %eax - movl %eax,8(%esi) - decl %ebx - movl %edx,12(%esi) - jz L022sw_end - # Tail Round 2 - movl 8(%edi),%eax - mull %eax - movl %eax,16(%esi) - decl %ebx - movl %edx,20(%esi) - jz L022sw_end - # Tail Round 3 - movl 12(%edi),%eax - mull %eax - movl %eax,24(%esi) - decl %ebx - movl %edx,28(%esi) - jz L022sw_end - # Tail Round 4 - movl 16(%edi),%eax - mull %eax - movl %eax,32(%esi) - decl %ebx - movl %edx,36(%esi) - jz L022sw_end - # Tail Round 5 - movl 20(%edi),%eax - mull %eax - movl %eax,40(%esi) - decl %ebx - movl %edx,44(%esi) - jz L022sw_end - # Tail Round 6 - movl 24(%edi),%eax - mull %eax - movl %eax,48(%esi) - movl %edx,52(%esi) -L022sw_end: - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.globl _bn_div_words -.private_extern _bn_div_words -.align 4 -_bn_div_words: -L_bn_div_words_begin: - movl 4(%esp),%edx - movl 8(%esp),%eax - movl 12(%esp),%ecx - divl %ecx - ret -.globl _bn_add_words -.private_extern _bn_add_words -.align 4 -_bn_add_words: -L_bn_add_words_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - - movl 20(%esp),%ebx - movl 24(%esp),%esi - movl 28(%esp),%edi - movl 32(%esp),%ebp - xorl %eax,%eax - andl $4294967288,%ebp - jz L023aw_finish -L024aw_loop: - # Round 0 - movl (%esi),%ecx - movl (%edi),%edx - addl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - addl %edx,%ecx - adcl $0,%eax - movl %ecx,(%ebx) - # Round 1 - movl 4(%esi),%ecx - movl 4(%edi),%edx - addl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - addl %edx,%ecx - adcl $0,%eax - movl %ecx,4(%ebx) - # Round 2 - movl 8(%esi),%ecx - movl 8(%edi),%edx - addl 
%eax,%ecx - movl $0,%eax - adcl %eax,%eax - addl %edx,%ecx - adcl $0,%eax - movl %ecx,8(%ebx) - # Round 3 - movl 12(%esi),%ecx - movl 12(%edi),%edx - addl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - addl %edx,%ecx - adcl $0,%eax - movl %ecx,12(%ebx) - # Round 4 - movl 16(%esi),%ecx - movl 16(%edi),%edx - addl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - addl %edx,%ecx - adcl $0,%eax - movl %ecx,16(%ebx) - # Round 5 - movl 20(%esi),%ecx - movl 20(%edi),%edx - addl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - addl %edx,%ecx - adcl $0,%eax - movl %ecx,20(%ebx) - # Round 6 - movl 24(%esi),%ecx - movl 24(%edi),%edx - addl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - addl %edx,%ecx - adcl $0,%eax - movl %ecx,24(%ebx) - # Round 7 - movl 28(%esi),%ecx - movl 28(%edi),%edx - addl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - addl %edx,%ecx - adcl $0,%eax - movl %ecx,28(%ebx) - - addl $32,%esi - addl $32,%edi - addl $32,%ebx - subl $8,%ebp - jnz L024aw_loop -L023aw_finish: - movl 32(%esp),%ebp - andl $7,%ebp - jz L025aw_end - # Tail Round 0 - movl (%esi),%ecx - movl (%edi),%edx - addl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - addl %edx,%ecx - adcl $0,%eax - decl %ebp - movl %ecx,(%ebx) - jz L025aw_end - # Tail Round 1 - movl 4(%esi),%ecx - movl 4(%edi),%edx - addl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - addl %edx,%ecx - adcl $0,%eax - decl %ebp - movl %ecx,4(%ebx) - jz L025aw_end - # Tail Round 2 - movl 8(%esi),%ecx - movl 8(%edi),%edx - addl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - addl %edx,%ecx - adcl $0,%eax - decl %ebp - movl %ecx,8(%ebx) - jz L025aw_end - # Tail Round 3 - movl 12(%esi),%ecx - movl 12(%edi),%edx - addl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - addl %edx,%ecx - adcl $0,%eax - decl %ebp - movl %ecx,12(%ebx) - jz L025aw_end - # Tail Round 4 - movl 16(%esi),%ecx - movl 16(%edi),%edx - addl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - addl %edx,%ecx - adcl $0,%eax - decl %ebp - movl %ecx,16(%ebx) - jz L025aw_end - # Tail Round 5 - movl 20(%esi),%ecx - 
movl 20(%edi),%edx - addl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - addl %edx,%ecx - adcl $0,%eax - decl %ebp - movl %ecx,20(%ebx) - jz L025aw_end - # Tail Round 6 - movl 24(%esi),%ecx - movl 24(%edi),%edx - addl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - addl %edx,%ecx - adcl $0,%eax - movl %ecx,24(%ebx) -L025aw_end: - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.globl _bn_sub_words -.private_extern _bn_sub_words -.align 4 -_bn_sub_words: -L_bn_sub_words_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - - movl 20(%esp),%ebx - movl 24(%esp),%esi - movl 28(%esp),%edi - movl 32(%esp),%ebp - xorl %eax,%eax - andl $4294967288,%ebp - jz L026aw_finish -L027aw_loop: - # Round 0 - movl (%esi),%ecx - movl (%edi),%edx - subl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - subl %edx,%ecx - adcl $0,%eax - movl %ecx,(%ebx) - # Round 1 - movl 4(%esi),%ecx - movl 4(%edi),%edx - subl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - subl %edx,%ecx - adcl $0,%eax - movl %ecx,4(%ebx) - # Round 2 - movl 8(%esi),%ecx - movl 8(%edi),%edx - subl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - subl %edx,%ecx - adcl $0,%eax - movl %ecx,8(%ebx) - # Round 3 - movl 12(%esi),%ecx - movl 12(%edi),%edx - subl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - subl %edx,%ecx - adcl $0,%eax - movl %ecx,12(%ebx) - # Round 4 - movl 16(%esi),%ecx - movl 16(%edi),%edx - subl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - subl %edx,%ecx - adcl $0,%eax - movl %ecx,16(%ebx) - # Round 5 - movl 20(%esi),%ecx - movl 20(%edi),%edx - subl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - subl %edx,%ecx - adcl $0,%eax - movl %ecx,20(%ebx) - # Round 6 - movl 24(%esi),%ecx - movl 24(%edi),%edx - subl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - subl %edx,%ecx - adcl $0,%eax - movl %ecx,24(%ebx) - # Round 7 - movl 28(%esi),%ecx - movl 28(%edi),%edx - subl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - subl %edx,%ecx - adcl $0,%eax - movl %ecx,28(%ebx) - - addl $32,%esi - addl $32,%edi - addl $32,%ebx - subl $8,%ebp - jnz 
L027aw_loop -L026aw_finish: - movl 32(%esp),%ebp - andl $7,%ebp - jz L028aw_end - # Tail Round 0 - movl (%esi),%ecx - movl (%edi),%edx - subl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - subl %edx,%ecx - adcl $0,%eax - decl %ebp - movl %ecx,(%ebx) - jz L028aw_end - # Tail Round 1 - movl 4(%esi),%ecx - movl 4(%edi),%edx - subl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - subl %edx,%ecx - adcl $0,%eax - decl %ebp - movl %ecx,4(%ebx) - jz L028aw_end - # Tail Round 2 - movl 8(%esi),%ecx - movl 8(%edi),%edx - subl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - subl %edx,%ecx - adcl $0,%eax - decl %ebp - movl %ecx,8(%ebx) - jz L028aw_end - # Tail Round 3 - movl 12(%esi),%ecx - movl 12(%edi),%edx - subl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - subl %edx,%ecx - adcl $0,%eax - decl %ebp - movl %ecx,12(%ebx) - jz L028aw_end - # Tail Round 4 - movl 16(%esi),%ecx - movl 16(%edi),%edx - subl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - subl %edx,%ecx - adcl $0,%eax - decl %ebp - movl %ecx,16(%ebx) - jz L028aw_end - # Tail Round 5 - movl 20(%esi),%ecx - movl 20(%edi),%edx - subl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - subl %edx,%ecx - adcl $0,%eax - decl %ebp - movl %ecx,20(%ebx) - jz L028aw_end - # Tail Round 6 - movl 24(%esi),%ecx - movl 24(%edi),%edx - subl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - subl %edx,%ecx - adcl $0,%eax - movl %ecx,24(%ebx) -L028aw_end: - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.section __IMPORT,__pointers,non_lazy_symbol_pointers -L_OPENSSL_ia32cap_P$non_lazy_ptr: -.indirect_symbol _OPENSSL_ia32cap_P -.long 0 -#endif diff --git a/third_party/boringssl/apple-x86/crypto/fipsmodule/co-586.S b/third_party/boringssl/apple-x86/crypto/fipsmodule/co-586.S deleted file mode 100644 index 015dffaa..00000000 --- a/third_party/boringssl/apple-x86/crypto/fipsmodule/co-586.S +++ /dev/null @@ -1,1257 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#if defined(__i386__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text -.globl _bn_mul_comba8 -.private_extern _bn_mul_comba8 -.align 4 -_bn_mul_comba8: -L_bn_mul_comba8_begin: - pushl %esi - movl 12(%esp),%esi - pushl %edi - movl 20(%esp),%edi - pushl %ebp - pushl %ebx - xorl %ebx,%ebx - movl (%esi),%eax - xorl %ecx,%ecx - movl (%edi),%edx - # ################## Calculate word 0 - xorl %ebp,%ebp - # mul a[0]*b[0] - mull %edx - addl %eax,%ebx - movl 20(%esp),%eax - adcl %edx,%ecx - movl (%edi),%edx - adcl $0,%ebp - movl %ebx,(%eax) - movl 4(%esi),%eax - # saved r[0] - # ################## Calculate word 1 - xorl %ebx,%ebx - # mul a[1]*b[0] - mull %edx - addl %eax,%ecx - movl (%esi),%eax - adcl %edx,%ebp - movl 4(%edi),%edx - adcl $0,%ebx - # mul a[0]*b[1] - mull %edx - addl %eax,%ecx - movl 20(%esp),%eax - adcl %edx,%ebp - movl (%edi),%edx - adcl $0,%ebx - movl %ecx,4(%eax) - movl 8(%esi),%eax - # saved r[1] - # ################## Calculate word 2 - xorl %ecx,%ecx - # mul a[2]*b[0] - mull %edx - addl %eax,%ebp - movl 4(%esi),%eax - adcl %edx,%ebx - movl 4(%edi),%edx - adcl $0,%ecx - # mul a[1]*b[1] - mull %edx - addl %eax,%ebp - movl (%esi),%eax - adcl %edx,%ebx - movl 8(%edi),%edx - adcl $0,%ecx - # mul a[0]*b[2] - mull %edx - addl %eax,%ebp - movl 20(%esp),%eax - adcl %edx,%ebx - movl (%edi),%edx - adcl $0,%ecx - movl %ebp,8(%eax) - movl 12(%esi),%eax - # saved r[2] - # ################## Calculate word 3 - xorl %ebp,%ebp - # mul a[3]*b[0] - mull %edx - addl %eax,%ebx - movl 8(%esi),%eax - adcl %edx,%ecx - movl 4(%edi),%edx - adcl $0,%ebp - # mul a[2]*b[1] - mull %edx - addl %eax,%ebx - movl 4(%esi),%eax - adcl %edx,%ecx - movl 8(%edi),%edx - adcl $0,%ebp - # mul a[1]*b[2] - mull %edx - addl %eax,%ebx - movl (%esi),%eax - adcl %edx,%ecx - movl 12(%edi),%edx - adcl $0,%ebp - # mul a[0]*b[3] - mull %edx - addl %eax,%ebx - movl 20(%esp),%eax - adcl %edx,%ecx - movl (%edi),%edx - adcl $0,%ebp - movl %ebx,12(%eax) - movl 16(%esi),%eax - # saved r[3] - # 
################## Calculate word 4 - xorl %ebx,%ebx - # mul a[4]*b[0] - mull %edx - addl %eax,%ecx - movl 12(%esi),%eax - adcl %edx,%ebp - movl 4(%edi),%edx - adcl $0,%ebx - # mul a[3]*b[1] - mull %edx - addl %eax,%ecx - movl 8(%esi),%eax - adcl %edx,%ebp - movl 8(%edi),%edx - adcl $0,%ebx - # mul a[2]*b[2] - mull %edx - addl %eax,%ecx - movl 4(%esi),%eax - adcl %edx,%ebp - movl 12(%edi),%edx - adcl $0,%ebx - # mul a[1]*b[3] - mull %edx - addl %eax,%ecx - movl (%esi),%eax - adcl %edx,%ebp - movl 16(%edi),%edx - adcl $0,%ebx - # mul a[0]*b[4] - mull %edx - addl %eax,%ecx - movl 20(%esp),%eax - adcl %edx,%ebp - movl (%edi),%edx - adcl $0,%ebx - movl %ecx,16(%eax) - movl 20(%esi),%eax - # saved r[4] - # ################## Calculate word 5 - xorl %ecx,%ecx - # mul a[5]*b[0] - mull %edx - addl %eax,%ebp - movl 16(%esi),%eax - adcl %edx,%ebx - movl 4(%edi),%edx - adcl $0,%ecx - # mul a[4]*b[1] - mull %edx - addl %eax,%ebp - movl 12(%esi),%eax - adcl %edx,%ebx - movl 8(%edi),%edx - adcl $0,%ecx - # mul a[3]*b[2] - mull %edx - addl %eax,%ebp - movl 8(%esi),%eax - adcl %edx,%ebx - movl 12(%edi),%edx - adcl $0,%ecx - # mul a[2]*b[3] - mull %edx - addl %eax,%ebp - movl 4(%esi),%eax - adcl %edx,%ebx - movl 16(%edi),%edx - adcl $0,%ecx - # mul a[1]*b[4] - mull %edx - addl %eax,%ebp - movl (%esi),%eax - adcl %edx,%ebx - movl 20(%edi),%edx - adcl $0,%ecx - # mul a[0]*b[5] - mull %edx - addl %eax,%ebp - movl 20(%esp),%eax - adcl %edx,%ebx - movl (%edi),%edx - adcl $0,%ecx - movl %ebp,20(%eax) - movl 24(%esi),%eax - # saved r[5] - # ################## Calculate word 6 - xorl %ebp,%ebp - # mul a[6]*b[0] - mull %edx - addl %eax,%ebx - movl 20(%esi),%eax - adcl %edx,%ecx - movl 4(%edi),%edx - adcl $0,%ebp - # mul a[5]*b[1] - mull %edx - addl %eax,%ebx - movl 16(%esi),%eax - adcl %edx,%ecx - movl 8(%edi),%edx - adcl $0,%ebp - # mul a[4]*b[2] - mull %edx - addl %eax,%ebx - movl 12(%esi),%eax - adcl %edx,%ecx - movl 12(%edi),%edx - adcl $0,%ebp - # mul a[3]*b[3] - mull %edx - addl 
%eax,%ebx - movl 8(%esi),%eax - adcl %edx,%ecx - movl 16(%edi),%edx - adcl $0,%ebp - # mul a[2]*b[4] - mull %edx - addl %eax,%ebx - movl 4(%esi),%eax - adcl %edx,%ecx - movl 20(%edi),%edx - adcl $0,%ebp - # mul a[1]*b[5] - mull %edx - addl %eax,%ebx - movl (%esi),%eax - adcl %edx,%ecx - movl 24(%edi),%edx - adcl $0,%ebp - # mul a[0]*b[6] - mull %edx - addl %eax,%ebx - movl 20(%esp),%eax - adcl %edx,%ecx - movl (%edi),%edx - adcl $0,%ebp - movl %ebx,24(%eax) - movl 28(%esi),%eax - # saved r[6] - # ################## Calculate word 7 - xorl %ebx,%ebx - # mul a[7]*b[0] - mull %edx - addl %eax,%ecx - movl 24(%esi),%eax - adcl %edx,%ebp - movl 4(%edi),%edx - adcl $0,%ebx - # mul a[6]*b[1] - mull %edx - addl %eax,%ecx - movl 20(%esi),%eax - adcl %edx,%ebp - movl 8(%edi),%edx - adcl $0,%ebx - # mul a[5]*b[2] - mull %edx - addl %eax,%ecx - movl 16(%esi),%eax - adcl %edx,%ebp - movl 12(%edi),%edx - adcl $0,%ebx - # mul a[4]*b[3] - mull %edx - addl %eax,%ecx - movl 12(%esi),%eax - adcl %edx,%ebp - movl 16(%edi),%edx - adcl $0,%ebx - # mul a[3]*b[4] - mull %edx - addl %eax,%ecx - movl 8(%esi),%eax - adcl %edx,%ebp - movl 20(%edi),%edx - adcl $0,%ebx - # mul a[2]*b[5] - mull %edx - addl %eax,%ecx - movl 4(%esi),%eax - adcl %edx,%ebp - movl 24(%edi),%edx - adcl $0,%ebx - # mul a[1]*b[6] - mull %edx - addl %eax,%ecx - movl (%esi),%eax - adcl %edx,%ebp - movl 28(%edi),%edx - adcl $0,%ebx - # mul a[0]*b[7] - mull %edx - addl %eax,%ecx - movl 20(%esp),%eax - adcl %edx,%ebp - movl 4(%edi),%edx - adcl $0,%ebx - movl %ecx,28(%eax) - movl 28(%esi),%eax - # saved r[7] - # ################## Calculate word 8 - xorl %ecx,%ecx - # mul a[7]*b[1] - mull %edx - addl %eax,%ebp - movl 24(%esi),%eax - adcl %edx,%ebx - movl 8(%edi),%edx - adcl $0,%ecx - # mul a[6]*b[2] - mull %edx - addl %eax,%ebp - movl 20(%esi),%eax - adcl %edx,%ebx - movl 12(%edi),%edx - adcl $0,%ecx - # mul a[5]*b[3] - mull %edx - addl %eax,%ebp - movl 16(%esi),%eax - adcl %edx,%ebx - movl 16(%edi),%edx - adcl $0,%ecx - # mul 
a[4]*b[4] - mull %edx - addl %eax,%ebp - movl 12(%esi),%eax - adcl %edx,%ebx - movl 20(%edi),%edx - adcl $0,%ecx - # mul a[3]*b[5] - mull %edx - addl %eax,%ebp - movl 8(%esi),%eax - adcl %edx,%ebx - movl 24(%edi),%edx - adcl $0,%ecx - # mul a[2]*b[6] - mull %edx - addl %eax,%ebp - movl 4(%esi),%eax - adcl %edx,%ebx - movl 28(%edi),%edx - adcl $0,%ecx - # mul a[1]*b[7] - mull %edx - addl %eax,%ebp - movl 20(%esp),%eax - adcl %edx,%ebx - movl 8(%edi),%edx - adcl $0,%ecx - movl %ebp,32(%eax) - movl 28(%esi),%eax - # saved r[8] - # ################## Calculate word 9 - xorl %ebp,%ebp - # mul a[7]*b[2] - mull %edx - addl %eax,%ebx - movl 24(%esi),%eax - adcl %edx,%ecx - movl 12(%edi),%edx - adcl $0,%ebp - # mul a[6]*b[3] - mull %edx - addl %eax,%ebx - movl 20(%esi),%eax - adcl %edx,%ecx - movl 16(%edi),%edx - adcl $0,%ebp - # mul a[5]*b[4] - mull %edx - addl %eax,%ebx - movl 16(%esi),%eax - adcl %edx,%ecx - movl 20(%edi),%edx - adcl $0,%ebp - # mul a[4]*b[5] - mull %edx - addl %eax,%ebx - movl 12(%esi),%eax - adcl %edx,%ecx - movl 24(%edi),%edx - adcl $0,%ebp - # mul a[3]*b[6] - mull %edx - addl %eax,%ebx - movl 8(%esi),%eax - adcl %edx,%ecx - movl 28(%edi),%edx - adcl $0,%ebp - # mul a[2]*b[7] - mull %edx - addl %eax,%ebx - movl 20(%esp),%eax - adcl %edx,%ecx - movl 12(%edi),%edx - adcl $0,%ebp - movl %ebx,36(%eax) - movl 28(%esi),%eax - # saved r[9] - # ################## Calculate word 10 - xorl %ebx,%ebx - # mul a[7]*b[3] - mull %edx - addl %eax,%ecx - movl 24(%esi),%eax - adcl %edx,%ebp - movl 16(%edi),%edx - adcl $0,%ebx - # mul a[6]*b[4] - mull %edx - addl %eax,%ecx - movl 20(%esi),%eax - adcl %edx,%ebp - movl 20(%edi),%edx - adcl $0,%ebx - # mul a[5]*b[5] - mull %edx - addl %eax,%ecx - movl 16(%esi),%eax - adcl %edx,%ebp - movl 24(%edi),%edx - adcl $0,%ebx - # mul a[4]*b[6] - mull %edx - addl %eax,%ecx - movl 12(%esi),%eax - adcl %edx,%ebp - movl 28(%edi),%edx - adcl $0,%ebx - # mul a[3]*b[7] - mull %edx - addl %eax,%ecx - movl 20(%esp),%eax - adcl %edx,%ebp - 
movl 16(%edi),%edx - adcl $0,%ebx - movl %ecx,40(%eax) - movl 28(%esi),%eax - # saved r[10] - # ################## Calculate word 11 - xorl %ecx,%ecx - # mul a[7]*b[4] - mull %edx - addl %eax,%ebp - movl 24(%esi),%eax - adcl %edx,%ebx - movl 20(%edi),%edx - adcl $0,%ecx - # mul a[6]*b[5] - mull %edx - addl %eax,%ebp - movl 20(%esi),%eax - adcl %edx,%ebx - movl 24(%edi),%edx - adcl $0,%ecx - # mul a[5]*b[6] - mull %edx - addl %eax,%ebp - movl 16(%esi),%eax - adcl %edx,%ebx - movl 28(%edi),%edx - adcl $0,%ecx - # mul a[4]*b[7] - mull %edx - addl %eax,%ebp - movl 20(%esp),%eax - adcl %edx,%ebx - movl 20(%edi),%edx - adcl $0,%ecx - movl %ebp,44(%eax) - movl 28(%esi),%eax - # saved r[11] - # ################## Calculate word 12 - xorl %ebp,%ebp - # mul a[7]*b[5] - mull %edx - addl %eax,%ebx - movl 24(%esi),%eax - adcl %edx,%ecx - movl 24(%edi),%edx - adcl $0,%ebp - # mul a[6]*b[6] - mull %edx - addl %eax,%ebx - movl 20(%esi),%eax - adcl %edx,%ecx - movl 28(%edi),%edx - adcl $0,%ebp - # mul a[5]*b[7] - mull %edx - addl %eax,%ebx - movl 20(%esp),%eax - adcl %edx,%ecx - movl 24(%edi),%edx - adcl $0,%ebp - movl %ebx,48(%eax) - movl 28(%esi),%eax - # saved r[12] - # ################## Calculate word 13 - xorl %ebx,%ebx - # mul a[7]*b[6] - mull %edx - addl %eax,%ecx - movl 24(%esi),%eax - adcl %edx,%ebp - movl 28(%edi),%edx - adcl $0,%ebx - # mul a[6]*b[7] - mull %edx - addl %eax,%ecx - movl 20(%esp),%eax - adcl %edx,%ebp - movl 28(%edi),%edx - adcl $0,%ebx - movl %ecx,52(%eax) - movl 28(%esi),%eax - # saved r[13] - # ################## Calculate word 14 - xorl %ecx,%ecx - # mul a[7]*b[7] - mull %edx - addl %eax,%ebp - movl 20(%esp),%eax - adcl %edx,%ebx - adcl $0,%ecx - movl %ebp,56(%eax) - # saved r[14] - # save r[15] - movl %ebx,60(%eax) - popl %ebx - popl %ebp - popl %edi - popl %esi - ret -.globl _bn_mul_comba4 -.private_extern _bn_mul_comba4 -.align 4 -_bn_mul_comba4: -L_bn_mul_comba4_begin: - pushl %esi - movl 12(%esp),%esi - pushl %edi - movl 20(%esp),%edi - pushl 
%ebp - pushl %ebx - xorl %ebx,%ebx - movl (%esi),%eax - xorl %ecx,%ecx - movl (%edi),%edx - # ################## Calculate word 0 - xorl %ebp,%ebp - # mul a[0]*b[0] - mull %edx - addl %eax,%ebx - movl 20(%esp),%eax - adcl %edx,%ecx - movl (%edi),%edx - adcl $0,%ebp - movl %ebx,(%eax) - movl 4(%esi),%eax - # saved r[0] - # ################## Calculate word 1 - xorl %ebx,%ebx - # mul a[1]*b[0] - mull %edx - addl %eax,%ecx - movl (%esi),%eax - adcl %edx,%ebp - movl 4(%edi),%edx - adcl $0,%ebx - # mul a[0]*b[1] - mull %edx - addl %eax,%ecx - movl 20(%esp),%eax - adcl %edx,%ebp - movl (%edi),%edx - adcl $0,%ebx - movl %ecx,4(%eax) - movl 8(%esi),%eax - # saved r[1] - # ################## Calculate word 2 - xorl %ecx,%ecx - # mul a[2]*b[0] - mull %edx - addl %eax,%ebp - movl 4(%esi),%eax - adcl %edx,%ebx - movl 4(%edi),%edx - adcl $0,%ecx - # mul a[1]*b[1] - mull %edx - addl %eax,%ebp - movl (%esi),%eax - adcl %edx,%ebx - movl 8(%edi),%edx - adcl $0,%ecx - # mul a[0]*b[2] - mull %edx - addl %eax,%ebp - movl 20(%esp),%eax - adcl %edx,%ebx - movl (%edi),%edx - adcl $0,%ecx - movl %ebp,8(%eax) - movl 12(%esi),%eax - # saved r[2] - # ################## Calculate word 3 - xorl %ebp,%ebp - # mul a[3]*b[0] - mull %edx - addl %eax,%ebx - movl 8(%esi),%eax - adcl %edx,%ecx - movl 4(%edi),%edx - adcl $0,%ebp - # mul a[2]*b[1] - mull %edx - addl %eax,%ebx - movl 4(%esi),%eax - adcl %edx,%ecx - movl 8(%edi),%edx - adcl $0,%ebp - # mul a[1]*b[2] - mull %edx - addl %eax,%ebx - movl (%esi),%eax - adcl %edx,%ecx - movl 12(%edi),%edx - adcl $0,%ebp - # mul a[0]*b[3] - mull %edx - addl %eax,%ebx - movl 20(%esp),%eax - adcl %edx,%ecx - movl 4(%edi),%edx - adcl $0,%ebp - movl %ebx,12(%eax) - movl 12(%esi),%eax - # saved r[3] - # ################## Calculate word 4 - xorl %ebx,%ebx - # mul a[3]*b[1] - mull %edx - addl %eax,%ecx - movl 8(%esi),%eax - adcl %edx,%ebp - movl 8(%edi),%edx - adcl $0,%ebx - # mul a[2]*b[2] - mull %edx - addl %eax,%ecx - movl 4(%esi),%eax - adcl %edx,%ebp - movl 
12(%edi),%edx - adcl $0,%ebx - # mul a[1]*b[3] - mull %edx - addl %eax,%ecx - movl 20(%esp),%eax - adcl %edx,%ebp - movl 8(%edi),%edx - adcl $0,%ebx - movl %ecx,16(%eax) - movl 12(%esi),%eax - # saved r[4] - # ################## Calculate word 5 - xorl %ecx,%ecx - # mul a[3]*b[2] - mull %edx - addl %eax,%ebp - movl 8(%esi),%eax - adcl %edx,%ebx - movl 12(%edi),%edx - adcl $0,%ecx - # mul a[2]*b[3] - mull %edx - addl %eax,%ebp - movl 20(%esp),%eax - adcl %edx,%ebx - movl 12(%edi),%edx - adcl $0,%ecx - movl %ebp,20(%eax) - movl 12(%esi),%eax - # saved r[5] - # ################## Calculate word 6 - xorl %ebp,%ebp - # mul a[3]*b[3] - mull %edx - addl %eax,%ebx - movl 20(%esp),%eax - adcl %edx,%ecx - adcl $0,%ebp - movl %ebx,24(%eax) - # saved r[6] - # save r[7] - movl %ecx,28(%eax) - popl %ebx - popl %ebp - popl %edi - popl %esi - ret -.globl _bn_sqr_comba8 -.private_extern _bn_sqr_comba8 -.align 4 -_bn_sqr_comba8: -L_bn_sqr_comba8_begin: - pushl %esi - pushl %edi - pushl %ebp - pushl %ebx - movl 20(%esp),%edi - movl 24(%esp),%esi - xorl %ebx,%ebx - xorl %ecx,%ecx - movl (%esi),%eax - # ############### Calculate word 0 - xorl %ebp,%ebp - # sqr a[0]*a[0] - mull %eax - addl %eax,%ebx - adcl %edx,%ecx - movl (%esi),%edx - adcl $0,%ebp - movl %ebx,(%edi) - movl 4(%esi),%eax - # saved r[0] - # ############### Calculate word 1 - xorl %ebx,%ebx - # sqr a[1]*a[0] - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebx - addl %eax,%ecx - adcl %edx,%ebp - movl 8(%esi),%eax - adcl $0,%ebx - movl %ecx,4(%edi) - movl (%esi),%edx - # saved r[1] - # ############### Calculate word 2 - xorl %ecx,%ecx - # sqr a[2]*a[0] - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ecx - addl %eax,%ebp - adcl %edx,%ebx - movl 4(%esi),%eax - adcl $0,%ecx - # sqr a[1]*a[1] - mull %eax - addl %eax,%ebp - adcl %edx,%ebx - movl (%esi),%edx - adcl $0,%ecx - movl %ebp,8(%edi) - movl 12(%esi),%eax - # saved r[2] - # ############### Calculate word 3 - xorl %ebp,%ebp - # sqr a[3]*a[0] - mull %edx - 
addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebp - addl %eax,%ebx - adcl %edx,%ecx - movl 8(%esi),%eax - adcl $0,%ebp - movl 4(%esi),%edx - # sqr a[2]*a[1] - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebp - addl %eax,%ebx - adcl %edx,%ecx - movl 16(%esi),%eax - adcl $0,%ebp - movl %ebx,12(%edi) - movl (%esi),%edx - # saved r[3] - # ############### Calculate word 4 - xorl %ebx,%ebx - # sqr a[4]*a[0] - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebx - addl %eax,%ecx - adcl %edx,%ebp - movl 12(%esi),%eax - adcl $0,%ebx - movl 4(%esi),%edx - # sqr a[3]*a[1] - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebx - addl %eax,%ecx - adcl %edx,%ebp - movl 8(%esi),%eax - adcl $0,%ebx - # sqr a[2]*a[2] - mull %eax - addl %eax,%ecx - adcl %edx,%ebp - movl (%esi),%edx - adcl $0,%ebx - movl %ecx,16(%edi) - movl 20(%esi),%eax - # saved r[4] - # ############### Calculate word 5 - xorl %ecx,%ecx - # sqr a[5]*a[0] - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ecx - addl %eax,%ebp - adcl %edx,%ebx - movl 16(%esi),%eax - adcl $0,%ecx - movl 4(%esi),%edx - # sqr a[4]*a[1] - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ecx - addl %eax,%ebp - adcl %edx,%ebx - movl 12(%esi),%eax - adcl $0,%ecx - movl 8(%esi),%edx - # sqr a[3]*a[2] - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ecx - addl %eax,%ebp - adcl %edx,%ebx - movl 24(%esi),%eax - adcl $0,%ecx - movl %ebp,20(%edi) - movl (%esi),%edx - # saved r[5] - # ############### Calculate word 6 - xorl %ebp,%ebp - # sqr a[6]*a[0] - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebp - addl %eax,%ebx - adcl %edx,%ecx - movl 20(%esi),%eax - adcl $0,%ebp - movl 4(%esi),%edx - # sqr a[5]*a[1] - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebp - addl %eax,%ebx - adcl %edx,%ecx - movl 16(%esi),%eax - adcl $0,%ebp - movl 8(%esi),%edx - # sqr a[4]*a[2] - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebp - addl %eax,%ebx - adcl %edx,%ecx - movl 12(%esi),%eax - adcl 
$0,%ebp - # sqr a[3]*a[3] - mull %eax - addl %eax,%ebx - adcl %edx,%ecx - movl (%esi),%edx - adcl $0,%ebp - movl %ebx,24(%edi) - movl 28(%esi),%eax - # saved r[6] - # ############### Calculate word 7 - xorl %ebx,%ebx - # sqr a[7]*a[0] - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebx - addl %eax,%ecx - adcl %edx,%ebp - movl 24(%esi),%eax - adcl $0,%ebx - movl 4(%esi),%edx - # sqr a[6]*a[1] - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebx - addl %eax,%ecx - adcl %edx,%ebp - movl 20(%esi),%eax - adcl $0,%ebx - movl 8(%esi),%edx - # sqr a[5]*a[2] - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebx - addl %eax,%ecx - adcl %edx,%ebp - movl 16(%esi),%eax - adcl $0,%ebx - movl 12(%esi),%edx - # sqr a[4]*a[3] - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebx - addl %eax,%ecx - adcl %edx,%ebp - movl 28(%esi),%eax - adcl $0,%ebx - movl %ecx,28(%edi) - movl 4(%esi),%edx - # saved r[7] - # ############### Calculate word 8 - xorl %ecx,%ecx - # sqr a[7]*a[1] - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ecx - addl %eax,%ebp - adcl %edx,%ebx - movl 24(%esi),%eax - adcl $0,%ecx - movl 8(%esi),%edx - # sqr a[6]*a[2] - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ecx - addl %eax,%ebp - adcl %edx,%ebx - movl 20(%esi),%eax - adcl $0,%ecx - movl 12(%esi),%edx - # sqr a[5]*a[3] - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ecx - addl %eax,%ebp - adcl %edx,%ebx - movl 16(%esi),%eax - adcl $0,%ecx - # sqr a[4]*a[4] - mull %eax - addl %eax,%ebp - adcl %edx,%ebx - movl 8(%esi),%edx - adcl $0,%ecx - movl %ebp,32(%edi) - movl 28(%esi),%eax - # saved r[8] - # ############### Calculate word 9 - xorl %ebp,%ebp - # sqr a[7]*a[2] - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebp - addl %eax,%ebx - adcl %edx,%ecx - movl 24(%esi),%eax - adcl $0,%ebp - movl 12(%esi),%edx - # sqr a[6]*a[3] - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebp - addl %eax,%ebx - adcl %edx,%ecx - movl 20(%esi),%eax - adcl 
$0,%ebp - movl 16(%esi),%edx - # sqr a[5]*a[4] - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebp - addl %eax,%ebx - adcl %edx,%ecx - movl 28(%esi),%eax - adcl $0,%ebp - movl %ebx,36(%edi) - movl 12(%esi),%edx - # saved r[9] - # ############### Calculate word 10 - xorl %ebx,%ebx - # sqr a[7]*a[3] - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebx - addl %eax,%ecx - adcl %edx,%ebp - movl 24(%esi),%eax - adcl $0,%ebx - movl 16(%esi),%edx - # sqr a[6]*a[4] - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebx - addl %eax,%ecx - adcl %edx,%ebp - movl 20(%esi),%eax - adcl $0,%ebx - # sqr a[5]*a[5] - mull %eax - addl %eax,%ecx - adcl %edx,%ebp - movl 16(%esi),%edx - adcl $0,%ebx - movl %ecx,40(%edi) - movl 28(%esi),%eax - # saved r[10] - # ############### Calculate word 11 - xorl %ecx,%ecx - # sqr a[7]*a[4] - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ecx - addl %eax,%ebp - adcl %edx,%ebx - movl 24(%esi),%eax - adcl $0,%ecx - movl 20(%esi),%edx - # sqr a[6]*a[5] - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ecx - addl %eax,%ebp - adcl %edx,%ebx - movl 28(%esi),%eax - adcl $0,%ecx - movl %ebp,44(%edi) - movl 20(%esi),%edx - # saved r[11] - # ############### Calculate word 12 - xorl %ebp,%ebp - # sqr a[7]*a[5] - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebp - addl %eax,%ebx - adcl %edx,%ecx - movl 24(%esi),%eax - adcl $0,%ebp - # sqr a[6]*a[6] - mull %eax - addl %eax,%ebx - adcl %edx,%ecx - movl 24(%esi),%edx - adcl $0,%ebp - movl %ebx,48(%edi) - movl 28(%esi),%eax - # saved r[12] - # ############### Calculate word 13 - xorl %ebx,%ebx - # sqr a[7]*a[6] - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebx - addl %eax,%ecx - adcl %edx,%ebp - movl 28(%esi),%eax - adcl $0,%ebx - movl %ecx,52(%edi) - # saved r[13] - # ############### Calculate word 14 - xorl %ecx,%ecx - # sqr a[7]*a[7] - mull %eax - addl %eax,%ebp - adcl %edx,%ebx - adcl $0,%ecx - movl %ebp,56(%edi) - # saved r[14] - movl %ebx,60(%edi) - 
popl %ebx - popl %ebp - popl %edi - popl %esi - ret -.globl _bn_sqr_comba4 -.private_extern _bn_sqr_comba4 -.align 4 -_bn_sqr_comba4: -L_bn_sqr_comba4_begin: - pushl %esi - pushl %edi - pushl %ebp - pushl %ebx - movl 20(%esp),%edi - movl 24(%esp),%esi - xorl %ebx,%ebx - xorl %ecx,%ecx - movl (%esi),%eax - # ############### Calculate word 0 - xorl %ebp,%ebp - # sqr a[0]*a[0] - mull %eax - addl %eax,%ebx - adcl %edx,%ecx - movl (%esi),%edx - adcl $0,%ebp - movl %ebx,(%edi) - movl 4(%esi),%eax - # saved r[0] - # ############### Calculate word 1 - xorl %ebx,%ebx - # sqr a[1]*a[0] - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebx - addl %eax,%ecx - adcl %edx,%ebp - movl 8(%esi),%eax - adcl $0,%ebx - movl %ecx,4(%edi) - movl (%esi),%edx - # saved r[1] - # ############### Calculate word 2 - xorl %ecx,%ecx - # sqr a[2]*a[0] - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ecx - addl %eax,%ebp - adcl %edx,%ebx - movl 4(%esi),%eax - adcl $0,%ecx - # sqr a[1]*a[1] - mull %eax - addl %eax,%ebp - adcl %edx,%ebx - movl (%esi),%edx - adcl $0,%ecx - movl %ebp,8(%edi) - movl 12(%esi),%eax - # saved r[2] - # ############### Calculate word 3 - xorl %ebp,%ebp - # sqr a[3]*a[0] - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebp - addl %eax,%ebx - adcl %edx,%ecx - movl 8(%esi),%eax - adcl $0,%ebp - movl 4(%esi),%edx - # sqr a[2]*a[1] - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebp - addl %eax,%ebx - adcl %edx,%ecx - movl 12(%esi),%eax - adcl $0,%ebp - movl %ebx,12(%edi) - movl 4(%esi),%edx - # saved r[3] - # ############### Calculate word 4 - xorl %ebx,%ebx - # sqr a[3]*a[1] - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebx - addl %eax,%ecx - adcl %edx,%ebp - movl 8(%esi),%eax - adcl $0,%ebx - # sqr a[2]*a[2] - mull %eax - addl %eax,%ecx - adcl %edx,%ebp - movl 8(%esi),%edx - adcl $0,%ebx - movl %ecx,16(%edi) - movl 12(%esi),%eax - # saved r[4] - # ############### Calculate word 5 - xorl %ecx,%ecx - # sqr a[3]*a[2] - mull %edx - addl 
%eax,%eax - adcl %edx,%edx - adcl $0,%ecx - addl %eax,%ebp - adcl %edx,%ebx - movl 12(%esi),%eax - adcl $0,%ecx - movl %ebp,20(%edi) - # saved r[5] - # ############### Calculate word 6 - xorl %ebp,%ebp - # sqr a[3]*a[3] - mull %eax - addl %eax,%ebx - adcl %edx,%ecx - adcl $0,%ebp - movl %ebx,24(%edi) - # saved r[6] - movl %ecx,28(%edi) - popl %ebx - popl %ebp - popl %edi - popl %esi - ret -#endif diff --git a/third_party/boringssl/apple-x86/crypto/fipsmodule/ghash-ssse3-x86.S b/third_party/boringssl/apple-x86/crypto/fipsmodule/ghash-ssse3-x86.S deleted file mode 100644 index 86566790..00000000 --- a/third_party/boringssl/apple-x86/crypto/fipsmodule/ghash-ssse3-x86.S +++ /dev/null @@ -1,289 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if defined(__i386__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text -.globl _gcm_gmult_ssse3 -.private_extern _gcm_gmult_ssse3 -.align 4 -_gcm_gmult_ssse3: -L_gcm_gmult_ssse3_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - movl 20(%esp),%edi - movl 24(%esp),%esi - movdqu (%edi),%xmm0 - call L000pic_point -L000pic_point: - popl %eax - movdqa Lreverse_bytes-L000pic_point(%eax),%xmm7 - movdqa Llow4_mask-L000pic_point(%eax),%xmm2 -.byte 102,15,56,0,199 - movdqa %xmm2,%xmm1 - pandn %xmm0,%xmm1 - psrld $4,%xmm1 - pand %xmm2,%xmm0 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - movl $5,%eax -L001loop_row_1: - movdqa (%esi),%xmm4 - leal 16(%esi),%esi - movdqa %xmm2,%xmm6 -.byte 102,15,58,15,243,1 - movdqa %xmm6,%xmm3 - psrldq $1,%xmm2 - movdqa %xmm4,%xmm5 -.byte 102,15,56,0,224 -.byte 102,15,56,0,233 - pxor %xmm5,%xmm2 - movdqa %xmm4,%xmm5 - psllq $60,%xmm5 - movdqa %xmm5,%xmm6 - pslldq $8,%xmm6 - pxor %xmm6,%xmm3 - psrldq $8,%xmm5 - pxor %xmm5,%xmm2 - psrlq $4,%xmm4 - pxor %xmm4,%xmm2 - subl $1,%eax - jnz L001loop_row_1 - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $5,%xmm3 - pxor %xmm3,%xmm2 - 
pxor %xmm3,%xmm3 - movl $5,%eax -L002loop_row_2: - movdqa (%esi),%xmm4 - leal 16(%esi),%esi - movdqa %xmm2,%xmm6 -.byte 102,15,58,15,243,1 - movdqa %xmm6,%xmm3 - psrldq $1,%xmm2 - movdqa %xmm4,%xmm5 -.byte 102,15,56,0,224 -.byte 102,15,56,0,233 - pxor %xmm5,%xmm2 - movdqa %xmm4,%xmm5 - psllq $60,%xmm5 - movdqa %xmm5,%xmm6 - pslldq $8,%xmm6 - pxor %xmm6,%xmm3 - psrldq $8,%xmm5 - pxor %xmm5,%xmm2 - psrlq $4,%xmm4 - pxor %xmm4,%xmm2 - subl $1,%eax - jnz L002loop_row_2 - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $5,%xmm3 - pxor %xmm3,%xmm2 - pxor %xmm3,%xmm3 - movl $6,%eax -L003loop_row_3: - movdqa (%esi),%xmm4 - leal 16(%esi),%esi - movdqa %xmm2,%xmm6 -.byte 102,15,58,15,243,1 - movdqa %xmm6,%xmm3 - psrldq $1,%xmm2 - movdqa %xmm4,%xmm5 -.byte 102,15,56,0,224 -.byte 102,15,56,0,233 - pxor %xmm5,%xmm2 - movdqa %xmm4,%xmm5 - psllq $60,%xmm5 - movdqa %xmm5,%xmm6 - pslldq $8,%xmm6 - pxor %xmm6,%xmm3 - psrldq $8,%xmm5 - pxor %xmm5,%xmm2 - psrlq $4,%xmm4 - pxor %xmm4,%xmm2 - subl $1,%eax - jnz L003loop_row_3 - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $5,%xmm3 - pxor %xmm3,%xmm2 - pxor %xmm3,%xmm3 -.byte 102,15,56,0,215 - movdqu %xmm2,(%edi) - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.globl _gcm_ghash_ssse3 -.private_extern _gcm_ghash_ssse3 -.align 4 -_gcm_ghash_ssse3: -L_gcm_ghash_ssse3_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - movl 20(%esp),%edi - movl 24(%esp),%esi - movl 28(%esp),%edx - movl 32(%esp),%ecx - movdqu (%edi),%xmm0 - call L004pic_point -L004pic_point: - popl %ebx - movdqa Lreverse_bytes-L004pic_point(%ebx),%xmm7 - andl $-16,%ecx -.byte 102,15,56,0,199 - pxor %xmm3,%xmm3 -L005loop_ghash: - movdqa Llow4_mask-L004pic_point(%ebx),%xmm2 - movdqu (%edx),%xmm1 -.byte 102,15,56,0,207 - pxor 
%xmm1,%xmm0 - movdqa %xmm2,%xmm1 - pandn %xmm0,%xmm1 - psrld $4,%xmm1 - pand %xmm2,%xmm0 - pxor %xmm2,%xmm2 - movl $5,%eax -L006loop_row_4: - movdqa (%esi),%xmm4 - leal 16(%esi),%esi - movdqa %xmm2,%xmm6 -.byte 102,15,58,15,243,1 - movdqa %xmm6,%xmm3 - psrldq $1,%xmm2 - movdqa %xmm4,%xmm5 -.byte 102,15,56,0,224 -.byte 102,15,56,0,233 - pxor %xmm5,%xmm2 - movdqa %xmm4,%xmm5 - psllq $60,%xmm5 - movdqa %xmm5,%xmm6 - pslldq $8,%xmm6 - pxor %xmm6,%xmm3 - psrldq $8,%xmm5 - pxor %xmm5,%xmm2 - psrlq $4,%xmm4 - pxor %xmm4,%xmm2 - subl $1,%eax - jnz L006loop_row_4 - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $5,%xmm3 - pxor %xmm3,%xmm2 - pxor %xmm3,%xmm3 - movl $5,%eax -L007loop_row_5: - movdqa (%esi),%xmm4 - leal 16(%esi),%esi - movdqa %xmm2,%xmm6 -.byte 102,15,58,15,243,1 - movdqa %xmm6,%xmm3 - psrldq $1,%xmm2 - movdqa %xmm4,%xmm5 -.byte 102,15,56,0,224 -.byte 102,15,56,0,233 - pxor %xmm5,%xmm2 - movdqa %xmm4,%xmm5 - psllq $60,%xmm5 - movdqa %xmm5,%xmm6 - pslldq $8,%xmm6 - pxor %xmm6,%xmm3 - psrldq $8,%xmm5 - pxor %xmm5,%xmm2 - psrlq $4,%xmm4 - pxor %xmm4,%xmm2 - subl $1,%eax - jnz L007loop_row_5 - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $5,%xmm3 - pxor %xmm3,%xmm2 - pxor %xmm3,%xmm3 - movl $6,%eax -L008loop_row_6: - movdqa (%esi),%xmm4 - leal 16(%esi),%esi - movdqa %xmm2,%xmm6 -.byte 102,15,58,15,243,1 - movdqa %xmm6,%xmm3 - psrldq $1,%xmm2 - movdqa %xmm4,%xmm5 -.byte 102,15,56,0,224 -.byte 102,15,56,0,233 - pxor %xmm5,%xmm2 - movdqa %xmm4,%xmm5 - psllq $60,%xmm5 - movdqa %xmm5,%xmm6 - pslldq $8,%xmm6 - pxor %xmm6,%xmm3 - psrldq $8,%xmm5 - pxor %xmm5,%xmm2 - psrlq $4,%xmm4 - pxor %xmm4,%xmm2 - subl $1,%eax - jnz L008loop_row_6 - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $5,%xmm3 - pxor %xmm3,%xmm2 - pxor %xmm3,%xmm3 - movdqa %xmm2,%xmm0 - leal -256(%esi),%esi - leal 16(%edx),%edx - subl $16,%ecx - jnz 
L005loop_ghash -.byte 102,15,56,0,199 - movdqu %xmm0,(%edi) - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.align 4,0x90 -Lreverse_bytes: -.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 -.align 4,0x90 -Llow4_mask: -.long 252645135,252645135,252645135,252645135 -#endif diff --git a/third_party/boringssl/apple-x86/crypto/fipsmodule/ghash-x86.S b/third_party/boringssl/apple-x86/crypto/fipsmodule/ghash-x86.S deleted file mode 100644 index c1e0d539..00000000 --- a/third_party/boringssl/apple-x86/crypto/fipsmodule/ghash-x86.S +++ /dev/null @@ -1,323 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if defined(__i386__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text -.globl _gcm_init_clmul -.private_extern _gcm_init_clmul -.align 4 -_gcm_init_clmul: -L_gcm_init_clmul_begin: - movl 4(%esp),%edx - movl 8(%esp),%eax - call L000pic -L000pic: - popl %ecx - leal Lbswap-L000pic(%ecx),%ecx - movdqu (%eax),%xmm2 - pshufd $78,%xmm2,%xmm2 - pshufd $255,%xmm2,%xmm4 - movdqa %xmm2,%xmm3 - psllq $1,%xmm2 - pxor %xmm5,%xmm5 - psrlq $63,%xmm3 - pcmpgtd %xmm4,%xmm5 - pslldq $8,%xmm3 - por %xmm3,%xmm2 - pand 16(%ecx),%xmm5 - pxor %xmm5,%xmm2 - movdqa %xmm2,%xmm0 - movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 - pshufd $78,%xmm2,%xmm4 - pxor %xmm0,%xmm3 - pxor %xmm2,%xmm4 -.byte 102,15,58,68,194,0 -.byte 102,15,58,68,202,17 -.byte 102,15,58,68,220,0 - xorps %xmm0,%xmm3 - xorps %xmm1,%xmm3 - movdqa %xmm3,%xmm4 - psrldq $8,%xmm3 - pslldq $8,%xmm4 - pxor %xmm3,%xmm1 - pxor %xmm4,%xmm0 - movdqa %xmm0,%xmm4 - movdqa %xmm0,%xmm3 - psllq $5,%xmm0 - pxor %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 - psllq $57,%xmm0 - movdqa %xmm0,%xmm3 - pslldq $8,%xmm0 - psrldq $8,%xmm3 - pxor %xmm4,%xmm0 - pxor %xmm3,%xmm1 - movdqa %xmm0,%xmm4 - psrlq $1,%xmm0 - pxor %xmm4,%xmm1 - pxor 
%xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 - psrlq $1,%xmm0 - pxor %xmm1,%xmm0 - pshufd $78,%xmm2,%xmm3 - pshufd $78,%xmm0,%xmm4 - pxor %xmm2,%xmm3 - movdqu %xmm2,(%edx) - pxor %xmm0,%xmm4 - movdqu %xmm0,16(%edx) -.byte 102,15,58,15,227,8 - movdqu %xmm4,32(%edx) - ret -.globl _gcm_gmult_clmul -.private_extern _gcm_gmult_clmul -.align 4 -_gcm_gmult_clmul: -L_gcm_gmult_clmul_begin: - movl 4(%esp),%eax - movl 8(%esp),%edx - call L001pic -L001pic: - popl %ecx - leal Lbswap-L001pic(%ecx),%ecx - movdqu (%eax),%xmm0 - movdqa (%ecx),%xmm5 - movups (%edx),%xmm2 -.byte 102,15,56,0,197 - movups 32(%edx),%xmm4 - movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 - pxor %xmm0,%xmm3 -.byte 102,15,58,68,194,0 -.byte 102,15,58,68,202,17 -.byte 102,15,58,68,220,0 - xorps %xmm0,%xmm3 - xorps %xmm1,%xmm3 - movdqa %xmm3,%xmm4 - psrldq $8,%xmm3 - pslldq $8,%xmm4 - pxor %xmm3,%xmm1 - pxor %xmm4,%xmm0 - movdqa %xmm0,%xmm4 - movdqa %xmm0,%xmm3 - psllq $5,%xmm0 - pxor %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 - psllq $57,%xmm0 - movdqa %xmm0,%xmm3 - pslldq $8,%xmm0 - psrldq $8,%xmm3 - pxor %xmm4,%xmm0 - pxor %xmm3,%xmm1 - movdqa %xmm0,%xmm4 - psrlq $1,%xmm0 - pxor %xmm4,%xmm1 - pxor %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 - psrlq $1,%xmm0 - pxor %xmm1,%xmm0 -.byte 102,15,56,0,197 - movdqu %xmm0,(%eax) - ret -.globl _gcm_ghash_clmul -.private_extern _gcm_ghash_clmul -.align 4 -_gcm_ghash_clmul: -L_gcm_ghash_clmul_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - movl 20(%esp),%eax - movl 24(%esp),%edx - movl 28(%esp),%esi - movl 32(%esp),%ebx - call L002pic -L002pic: - popl %ecx - leal Lbswap-L002pic(%ecx),%ecx - movdqu (%eax),%xmm0 - movdqa (%ecx),%xmm5 - movdqu (%edx),%xmm2 -.byte 102,15,56,0,197 - subl $16,%ebx - jz L003odd_tail - movdqu (%esi),%xmm3 - movdqu 16(%esi),%xmm6 -.byte 102,15,56,0,221 -.byte 102,15,56,0,245 - movdqu 32(%edx),%xmm5 - pxor %xmm3,%xmm0 - pshufd $78,%xmm6,%xmm3 - movdqa %xmm6,%xmm7 - pxor %xmm6,%xmm3 - leal 32(%esi),%esi -.byte 
102,15,58,68,242,0 -.byte 102,15,58,68,250,17 -.byte 102,15,58,68,221,0 - movups 16(%edx),%xmm2 - nop - subl $32,%ebx - jbe L004even_tail - jmp L005mod_loop -.align 5,0x90 -L005mod_loop: - pshufd $78,%xmm0,%xmm4 - movdqa %xmm0,%xmm1 - pxor %xmm0,%xmm4 - nop -.byte 102,15,58,68,194,0 -.byte 102,15,58,68,202,17 -.byte 102,15,58,68,229,16 - movups (%edx),%xmm2 - xorps %xmm6,%xmm0 - movdqa (%ecx),%xmm5 - xorps %xmm7,%xmm1 - movdqu (%esi),%xmm7 - pxor %xmm0,%xmm3 - movdqu 16(%esi),%xmm6 - pxor %xmm1,%xmm3 -.byte 102,15,56,0,253 - pxor %xmm3,%xmm4 - movdqa %xmm4,%xmm3 - psrldq $8,%xmm4 - pslldq $8,%xmm3 - pxor %xmm4,%xmm1 - pxor %xmm3,%xmm0 -.byte 102,15,56,0,245 - pxor %xmm7,%xmm1 - movdqa %xmm6,%xmm7 - movdqa %xmm0,%xmm4 - movdqa %xmm0,%xmm3 - psllq $5,%xmm0 - pxor %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 -.byte 102,15,58,68,242,0 - movups 32(%edx),%xmm5 - psllq $57,%xmm0 - movdqa %xmm0,%xmm3 - pslldq $8,%xmm0 - psrldq $8,%xmm3 - pxor %xmm4,%xmm0 - pxor %xmm3,%xmm1 - pshufd $78,%xmm7,%xmm3 - movdqa %xmm0,%xmm4 - psrlq $1,%xmm0 - pxor %xmm7,%xmm3 - pxor %xmm4,%xmm1 -.byte 102,15,58,68,250,17 - movups 16(%edx),%xmm2 - pxor %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 - psrlq $1,%xmm0 - pxor %xmm1,%xmm0 -.byte 102,15,58,68,221,0 - leal 32(%esi),%esi - subl $32,%ebx - ja L005mod_loop -L004even_tail: - pshufd $78,%xmm0,%xmm4 - movdqa %xmm0,%xmm1 - pxor %xmm0,%xmm4 -.byte 102,15,58,68,194,0 -.byte 102,15,58,68,202,17 -.byte 102,15,58,68,229,16 - movdqa (%ecx),%xmm5 - xorps %xmm6,%xmm0 - xorps %xmm7,%xmm1 - pxor %xmm0,%xmm3 - pxor %xmm1,%xmm3 - pxor %xmm3,%xmm4 - movdqa %xmm4,%xmm3 - psrldq $8,%xmm4 - pslldq $8,%xmm3 - pxor %xmm4,%xmm1 - pxor %xmm3,%xmm0 - movdqa %xmm0,%xmm4 - movdqa %xmm0,%xmm3 - psllq $5,%xmm0 - pxor %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 - psllq $57,%xmm0 - movdqa %xmm0,%xmm3 - pslldq $8,%xmm0 - psrldq $8,%xmm3 - pxor %xmm4,%xmm0 - pxor %xmm3,%xmm1 - movdqa %xmm0,%xmm4 - psrlq $1,%xmm0 - pxor %xmm4,%xmm1 - pxor %xmm0,%xmm4 - psrlq 
$5,%xmm0 - pxor %xmm4,%xmm0 - psrlq $1,%xmm0 - pxor %xmm1,%xmm0 - testl %ebx,%ebx - jnz L006done - movups (%edx),%xmm2 -L003odd_tail: - movdqu (%esi),%xmm3 -.byte 102,15,56,0,221 - pxor %xmm3,%xmm0 - movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 - pshufd $78,%xmm2,%xmm4 - pxor %xmm0,%xmm3 - pxor %xmm2,%xmm4 -.byte 102,15,58,68,194,0 -.byte 102,15,58,68,202,17 -.byte 102,15,58,68,220,0 - xorps %xmm0,%xmm3 - xorps %xmm1,%xmm3 - movdqa %xmm3,%xmm4 - psrldq $8,%xmm3 - pslldq $8,%xmm4 - pxor %xmm3,%xmm1 - pxor %xmm4,%xmm0 - movdqa %xmm0,%xmm4 - movdqa %xmm0,%xmm3 - psllq $5,%xmm0 - pxor %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 - psllq $57,%xmm0 - movdqa %xmm0,%xmm3 - pslldq $8,%xmm0 - psrldq $8,%xmm3 - pxor %xmm4,%xmm0 - pxor %xmm3,%xmm1 - movdqa %xmm0,%xmm4 - psrlq $1,%xmm0 - pxor %xmm4,%xmm1 - pxor %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 - psrlq $1,%xmm0 - pxor %xmm1,%xmm0 -L006done: -.byte 102,15,56,0,197 - movdqu %xmm0,(%eax) - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.align 6,0x90 -Lbswap: -.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 -.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,194 -.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,44,32,67 -.byte 82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112 -.byte 112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62 -.byte 0 -#endif diff --git a/third_party/boringssl/apple-x86/crypto/fipsmodule/md5-586.S b/third_party/boringssl/apple-x86/crypto/fipsmodule/md5-586.S deleted file mode 100644 index f4c4b50c..00000000 --- a/third_party/boringssl/apple-x86/crypto/fipsmodule/md5-586.S +++ /dev/null @@ -1,685 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#if defined(__i386__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text -.globl _md5_block_asm_data_order -.private_extern _md5_block_asm_data_order -.align 4 -_md5_block_asm_data_order: -L_md5_block_asm_data_order_begin: - pushl %esi - pushl %edi - movl 12(%esp),%edi - movl 16(%esp),%esi - movl 20(%esp),%ecx - pushl %ebp - shll $6,%ecx - pushl %ebx - addl %esi,%ecx - subl $64,%ecx - movl (%edi),%eax - pushl %ecx - movl 4(%edi),%ebx - movl 8(%edi),%ecx - movl 12(%edi),%edx -L000start: - - # R0 section - movl %ecx,%edi - movl (%esi),%ebp - # R0 0 - xorl %edx,%edi - andl %ebx,%edi - leal 3614090360(%eax,%ebp,1),%eax - xorl %edx,%edi - addl %edi,%eax - movl %ebx,%edi - roll $7,%eax - movl 4(%esi),%ebp - addl %ebx,%eax - # R0 1 - xorl %ecx,%edi - andl %eax,%edi - leal 3905402710(%edx,%ebp,1),%edx - xorl %ecx,%edi - addl %edi,%edx - movl %eax,%edi - roll $12,%edx - movl 8(%esi),%ebp - addl %eax,%edx - # R0 2 - xorl %ebx,%edi - andl %edx,%edi - leal 606105819(%ecx,%ebp,1),%ecx - xorl %ebx,%edi - addl %edi,%ecx - movl %edx,%edi - roll $17,%ecx - movl 12(%esi),%ebp - addl %edx,%ecx - # R0 3 - xorl %eax,%edi - andl %ecx,%edi - leal 3250441966(%ebx,%ebp,1),%ebx - xorl %eax,%edi - addl %edi,%ebx - movl %ecx,%edi - roll $22,%ebx - movl 16(%esi),%ebp - addl %ecx,%ebx - # R0 4 - xorl %edx,%edi - andl %ebx,%edi - leal 4118548399(%eax,%ebp,1),%eax - xorl %edx,%edi - addl %edi,%eax - movl %ebx,%edi - roll $7,%eax - movl 20(%esi),%ebp - addl %ebx,%eax - # R0 5 - xorl %ecx,%edi - andl %eax,%edi - leal 1200080426(%edx,%ebp,1),%edx - xorl %ecx,%edi - addl %edi,%edx - movl %eax,%edi - roll $12,%edx - movl 24(%esi),%ebp - addl %eax,%edx - # R0 6 - xorl %ebx,%edi - andl %edx,%edi - leal 2821735955(%ecx,%ebp,1),%ecx - xorl %ebx,%edi - addl %edi,%ecx - movl %edx,%edi - roll $17,%ecx - movl 28(%esi),%ebp - addl %edx,%ecx - # R0 7 - xorl %eax,%edi - andl %ecx,%edi - leal 4249261313(%ebx,%ebp,1),%ebx - xorl %eax,%edi - addl %edi,%ebx - movl %ecx,%edi - roll $22,%ebx - movl 
32(%esi),%ebp - addl %ecx,%ebx - # R0 8 - xorl %edx,%edi - andl %ebx,%edi - leal 1770035416(%eax,%ebp,1),%eax - xorl %edx,%edi - addl %edi,%eax - movl %ebx,%edi - roll $7,%eax - movl 36(%esi),%ebp - addl %ebx,%eax - # R0 9 - xorl %ecx,%edi - andl %eax,%edi - leal 2336552879(%edx,%ebp,1),%edx - xorl %ecx,%edi - addl %edi,%edx - movl %eax,%edi - roll $12,%edx - movl 40(%esi),%ebp - addl %eax,%edx - # R0 10 - xorl %ebx,%edi - andl %edx,%edi - leal 4294925233(%ecx,%ebp,1),%ecx - xorl %ebx,%edi - addl %edi,%ecx - movl %edx,%edi - roll $17,%ecx - movl 44(%esi),%ebp - addl %edx,%ecx - # R0 11 - xorl %eax,%edi - andl %ecx,%edi - leal 2304563134(%ebx,%ebp,1),%ebx - xorl %eax,%edi - addl %edi,%ebx - movl %ecx,%edi - roll $22,%ebx - movl 48(%esi),%ebp - addl %ecx,%ebx - # R0 12 - xorl %edx,%edi - andl %ebx,%edi - leal 1804603682(%eax,%ebp,1),%eax - xorl %edx,%edi - addl %edi,%eax - movl %ebx,%edi - roll $7,%eax - movl 52(%esi),%ebp - addl %ebx,%eax - # R0 13 - xorl %ecx,%edi - andl %eax,%edi - leal 4254626195(%edx,%ebp,1),%edx - xorl %ecx,%edi - addl %edi,%edx - movl %eax,%edi - roll $12,%edx - movl 56(%esi),%ebp - addl %eax,%edx - # R0 14 - xorl %ebx,%edi - andl %edx,%edi - leal 2792965006(%ecx,%ebp,1),%ecx - xorl %ebx,%edi - addl %edi,%ecx - movl %edx,%edi - roll $17,%ecx - movl 60(%esi),%ebp - addl %edx,%ecx - # R0 15 - xorl %eax,%edi - andl %ecx,%edi - leal 1236535329(%ebx,%ebp,1),%ebx - xorl %eax,%edi - addl %edi,%ebx - movl %ecx,%edi - roll $22,%ebx - movl 4(%esi),%ebp - addl %ecx,%ebx - - # R1 section - # R1 16 - leal 4129170786(%eax,%ebp,1),%eax - xorl %ebx,%edi - andl %edx,%edi - movl 24(%esi),%ebp - xorl %ecx,%edi - addl %edi,%eax - movl %ebx,%edi - roll $5,%eax - addl %ebx,%eax - # R1 17 - leal 3225465664(%edx,%ebp,1),%edx - xorl %eax,%edi - andl %ecx,%edi - movl 44(%esi),%ebp - xorl %ebx,%edi - addl %edi,%edx - movl %eax,%edi - roll $9,%edx - addl %eax,%edx - # R1 18 - leal 643717713(%ecx,%ebp,1),%ecx - xorl %edx,%edi - andl %ebx,%edi - movl (%esi),%ebp - xorl 
%eax,%edi - addl %edi,%ecx - movl %edx,%edi - roll $14,%ecx - addl %edx,%ecx - # R1 19 - leal 3921069994(%ebx,%ebp,1),%ebx - xorl %ecx,%edi - andl %eax,%edi - movl 20(%esi),%ebp - xorl %edx,%edi - addl %edi,%ebx - movl %ecx,%edi - roll $20,%ebx - addl %ecx,%ebx - # R1 20 - leal 3593408605(%eax,%ebp,1),%eax - xorl %ebx,%edi - andl %edx,%edi - movl 40(%esi),%ebp - xorl %ecx,%edi - addl %edi,%eax - movl %ebx,%edi - roll $5,%eax - addl %ebx,%eax - # R1 21 - leal 38016083(%edx,%ebp,1),%edx - xorl %eax,%edi - andl %ecx,%edi - movl 60(%esi),%ebp - xorl %ebx,%edi - addl %edi,%edx - movl %eax,%edi - roll $9,%edx - addl %eax,%edx - # R1 22 - leal 3634488961(%ecx,%ebp,1),%ecx - xorl %edx,%edi - andl %ebx,%edi - movl 16(%esi),%ebp - xorl %eax,%edi - addl %edi,%ecx - movl %edx,%edi - roll $14,%ecx - addl %edx,%ecx - # R1 23 - leal 3889429448(%ebx,%ebp,1),%ebx - xorl %ecx,%edi - andl %eax,%edi - movl 36(%esi),%ebp - xorl %edx,%edi - addl %edi,%ebx - movl %ecx,%edi - roll $20,%ebx - addl %ecx,%ebx - # R1 24 - leal 568446438(%eax,%ebp,1),%eax - xorl %ebx,%edi - andl %edx,%edi - movl 56(%esi),%ebp - xorl %ecx,%edi - addl %edi,%eax - movl %ebx,%edi - roll $5,%eax - addl %ebx,%eax - # R1 25 - leal 3275163606(%edx,%ebp,1),%edx - xorl %eax,%edi - andl %ecx,%edi - movl 12(%esi),%ebp - xorl %ebx,%edi - addl %edi,%edx - movl %eax,%edi - roll $9,%edx - addl %eax,%edx - # R1 26 - leal 4107603335(%ecx,%ebp,1),%ecx - xorl %edx,%edi - andl %ebx,%edi - movl 32(%esi),%ebp - xorl %eax,%edi - addl %edi,%ecx - movl %edx,%edi - roll $14,%ecx - addl %edx,%ecx - # R1 27 - leal 1163531501(%ebx,%ebp,1),%ebx - xorl %ecx,%edi - andl %eax,%edi - movl 52(%esi),%ebp - xorl %edx,%edi - addl %edi,%ebx - movl %ecx,%edi - roll $20,%ebx - addl %ecx,%ebx - # R1 28 - leal 2850285829(%eax,%ebp,1),%eax - xorl %ebx,%edi - andl %edx,%edi - movl 8(%esi),%ebp - xorl %ecx,%edi - addl %edi,%eax - movl %ebx,%edi - roll $5,%eax - addl %ebx,%eax - # R1 29 - leal 4243563512(%edx,%ebp,1),%edx - xorl %eax,%edi - andl %ecx,%edi - 
movl 28(%esi),%ebp - xorl %ebx,%edi - addl %edi,%edx - movl %eax,%edi - roll $9,%edx - addl %eax,%edx - # R1 30 - leal 1735328473(%ecx,%ebp,1),%ecx - xorl %edx,%edi - andl %ebx,%edi - movl 48(%esi),%ebp - xorl %eax,%edi - addl %edi,%ecx - movl %edx,%edi - roll $14,%ecx - addl %edx,%ecx - # R1 31 - leal 2368359562(%ebx,%ebp,1),%ebx - xorl %ecx,%edi - andl %eax,%edi - movl 20(%esi),%ebp - xorl %edx,%edi - addl %edi,%ebx - movl %ecx,%edi - roll $20,%ebx - addl %ecx,%ebx - - # R2 section - # R2 32 - xorl %edx,%edi - xorl %ebx,%edi - leal 4294588738(%eax,%ebp,1),%eax - addl %edi,%eax - roll $4,%eax - movl 32(%esi),%ebp - movl %ebx,%edi - # R2 33 - leal 2272392833(%edx,%ebp,1),%edx - addl %ebx,%eax - xorl %ecx,%edi - xorl %eax,%edi - movl 44(%esi),%ebp - addl %edi,%edx - movl %eax,%edi - roll $11,%edx - addl %eax,%edx - # R2 34 - xorl %ebx,%edi - xorl %edx,%edi - leal 1839030562(%ecx,%ebp,1),%ecx - addl %edi,%ecx - roll $16,%ecx - movl 56(%esi),%ebp - movl %edx,%edi - # R2 35 - leal 4259657740(%ebx,%ebp,1),%ebx - addl %edx,%ecx - xorl %eax,%edi - xorl %ecx,%edi - movl 4(%esi),%ebp - addl %edi,%ebx - movl %ecx,%edi - roll $23,%ebx - addl %ecx,%ebx - # R2 36 - xorl %edx,%edi - xorl %ebx,%edi - leal 2763975236(%eax,%ebp,1),%eax - addl %edi,%eax - roll $4,%eax - movl 16(%esi),%ebp - movl %ebx,%edi - # R2 37 - leal 1272893353(%edx,%ebp,1),%edx - addl %ebx,%eax - xorl %ecx,%edi - xorl %eax,%edi - movl 28(%esi),%ebp - addl %edi,%edx - movl %eax,%edi - roll $11,%edx - addl %eax,%edx - # R2 38 - xorl %ebx,%edi - xorl %edx,%edi - leal 4139469664(%ecx,%ebp,1),%ecx - addl %edi,%ecx - roll $16,%ecx - movl 40(%esi),%ebp - movl %edx,%edi - # R2 39 - leal 3200236656(%ebx,%ebp,1),%ebx - addl %edx,%ecx - xorl %eax,%edi - xorl %ecx,%edi - movl 52(%esi),%ebp - addl %edi,%ebx - movl %ecx,%edi - roll $23,%ebx - addl %ecx,%ebx - # R2 40 - xorl %edx,%edi - xorl %ebx,%edi - leal 681279174(%eax,%ebp,1),%eax - addl %edi,%eax - roll $4,%eax - movl (%esi),%ebp - movl %ebx,%edi - # R2 41 - leal 
3936430074(%edx,%ebp,1),%edx - addl %ebx,%eax - xorl %ecx,%edi - xorl %eax,%edi - movl 12(%esi),%ebp - addl %edi,%edx - movl %eax,%edi - roll $11,%edx - addl %eax,%edx - # R2 42 - xorl %ebx,%edi - xorl %edx,%edi - leal 3572445317(%ecx,%ebp,1),%ecx - addl %edi,%ecx - roll $16,%ecx - movl 24(%esi),%ebp - movl %edx,%edi - # R2 43 - leal 76029189(%ebx,%ebp,1),%ebx - addl %edx,%ecx - xorl %eax,%edi - xorl %ecx,%edi - movl 36(%esi),%ebp - addl %edi,%ebx - movl %ecx,%edi - roll $23,%ebx - addl %ecx,%ebx - # R2 44 - xorl %edx,%edi - xorl %ebx,%edi - leal 3654602809(%eax,%ebp,1),%eax - addl %edi,%eax - roll $4,%eax - movl 48(%esi),%ebp - movl %ebx,%edi - # R2 45 - leal 3873151461(%edx,%ebp,1),%edx - addl %ebx,%eax - xorl %ecx,%edi - xorl %eax,%edi - movl 60(%esi),%ebp - addl %edi,%edx - movl %eax,%edi - roll $11,%edx - addl %eax,%edx - # R2 46 - xorl %ebx,%edi - xorl %edx,%edi - leal 530742520(%ecx,%ebp,1),%ecx - addl %edi,%ecx - roll $16,%ecx - movl 8(%esi),%ebp - movl %edx,%edi - # R2 47 - leal 3299628645(%ebx,%ebp,1),%ebx - addl %edx,%ecx - xorl %eax,%edi - xorl %ecx,%edi - movl (%esi),%ebp - addl %edi,%ebx - movl $-1,%edi - roll $23,%ebx - addl %ecx,%ebx - - # R3 section - # R3 48 - xorl %edx,%edi - orl %ebx,%edi - leal 4096336452(%eax,%ebp,1),%eax - xorl %ecx,%edi - movl 28(%esi),%ebp - addl %edi,%eax - movl $-1,%edi - roll $6,%eax - xorl %ecx,%edi - addl %ebx,%eax - # R3 49 - orl %eax,%edi - leal 1126891415(%edx,%ebp,1),%edx - xorl %ebx,%edi - movl 56(%esi),%ebp - addl %edi,%edx - movl $-1,%edi - roll $10,%edx - xorl %ebx,%edi - addl %eax,%edx - # R3 50 - orl %edx,%edi - leal 2878612391(%ecx,%ebp,1),%ecx - xorl %eax,%edi - movl 20(%esi),%ebp - addl %edi,%ecx - movl $-1,%edi - roll $15,%ecx - xorl %eax,%edi - addl %edx,%ecx - # R3 51 - orl %ecx,%edi - leal 4237533241(%ebx,%ebp,1),%ebx - xorl %edx,%edi - movl 48(%esi),%ebp - addl %edi,%ebx - movl $-1,%edi - roll $21,%ebx - xorl %edx,%edi - addl %ecx,%ebx - # R3 52 - orl %ebx,%edi - leal 1700485571(%eax,%ebp,1),%eax - 
xorl %ecx,%edi - movl 12(%esi),%ebp - addl %edi,%eax - movl $-1,%edi - roll $6,%eax - xorl %ecx,%edi - addl %ebx,%eax - # R3 53 - orl %eax,%edi - leal 2399980690(%edx,%ebp,1),%edx - xorl %ebx,%edi - movl 40(%esi),%ebp - addl %edi,%edx - movl $-1,%edi - roll $10,%edx - xorl %ebx,%edi - addl %eax,%edx - # R3 54 - orl %edx,%edi - leal 4293915773(%ecx,%ebp,1),%ecx - xorl %eax,%edi - movl 4(%esi),%ebp - addl %edi,%ecx - movl $-1,%edi - roll $15,%ecx - xorl %eax,%edi - addl %edx,%ecx - # R3 55 - orl %ecx,%edi - leal 2240044497(%ebx,%ebp,1),%ebx - xorl %edx,%edi - movl 32(%esi),%ebp - addl %edi,%ebx - movl $-1,%edi - roll $21,%ebx - xorl %edx,%edi - addl %ecx,%ebx - # R3 56 - orl %ebx,%edi - leal 1873313359(%eax,%ebp,1),%eax - xorl %ecx,%edi - movl 60(%esi),%ebp - addl %edi,%eax - movl $-1,%edi - roll $6,%eax - xorl %ecx,%edi - addl %ebx,%eax - # R3 57 - orl %eax,%edi - leal 4264355552(%edx,%ebp,1),%edx - xorl %ebx,%edi - movl 24(%esi),%ebp - addl %edi,%edx - movl $-1,%edi - roll $10,%edx - xorl %ebx,%edi - addl %eax,%edx - # R3 58 - orl %edx,%edi - leal 2734768916(%ecx,%ebp,1),%ecx - xorl %eax,%edi - movl 52(%esi),%ebp - addl %edi,%ecx - movl $-1,%edi - roll $15,%ecx - xorl %eax,%edi - addl %edx,%ecx - # R3 59 - orl %ecx,%edi - leal 1309151649(%ebx,%ebp,1),%ebx - xorl %edx,%edi - movl 16(%esi),%ebp - addl %edi,%ebx - movl $-1,%edi - roll $21,%ebx - xorl %edx,%edi - addl %ecx,%ebx - # R3 60 - orl %ebx,%edi - leal 4149444226(%eax,%ebp,1),%eax - xorl %ecx,%edi - movl 44(%esi),%ebp - addl %edi,%eax - movl $-1,%edi - roll $6,%eax - xorl %ecx,%edi - addl %ebx,%eax - # R3 61 - orl %eax,%edi - leal 3174756917(%edx,%ebp,1),%edx - xorl %ebx,%edi - movl 8(%esi),%ebp - addl %edi,%edx - movl $-1,%edi - roll $10,%edx - xorl %ebx,%edi - addl %eax,%edx - # R3 62 - orl %edx,%edi - leal 718787259(%ecx,%ebp,1),%ecx - xorl %eax,%edi - movl 36(%esi),%ebp - addl %edi,%ecx - movl $-1,%edi - roll $15,%ecx - xorl %eax,%edi - addl %edx,%ecx - # R3 63 - orl %ecx,%edi - leal 
3951481745(%ebx,%ebp,1),%ebx - xorl %edx,%edi - movl 24(%esp),%ebp - addl %edi,%ebx - addl $64,%esi - roll $21,%ebx - movl (%ebp),%edi - addl %ecx,%ebx - addl %edi,%eax - movl 4(%ebp),%edi - addl %edi,%ebx - movl 8(%ebp),%edi - addl %edi,%ecx - movl 12(%ebp),%edi - addl %edi,%edx - movl %eax,(%ebp) - movl %ebx,4(%ebp) - movl (%esp),%edi - movl %ecx,8(%ebp) - movl %edx,12(%ebp) - cmpl %esi,%edi - jae L000start - popl %eax - popl %ebx - popl %ebp - popl %edi - popl %esi - ret -#endif diff --git a/third_party/boringssl/apple-x86/crypto/fipsmodule/sha1-586.S b/third_party/boringssl/apple-x86/crypto/fipsmodule/sha1-586.S deleted file mode 100644 index 3213a621..00000000 --- a/third_party/boringssl/apple-x86/crypto/fipsmodule/sha1-586.S +++ /dev/null @@ -1,3805 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if defined(__i386__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text -.globl _sha1_block_data_order -.private_extern _sha1_block_data_order -.align 4 -_sha1_block_data_order: -L_sha1_block_data_order_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - call L000pic_point -L000pic_point: - popl %ebp - movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L000pic_point(%ebp),%esi - leal LK_XX_XX-L000pic_point(%ebp),%ebp - movl (%esi),%eax - movl 4(%esi),%edx - testl $512,%edx - jz L001x86 - movl 8(%esi),%ecx - testl $16777216,%eax - jz L001x86 - andl $268435456,%edx - andl $1073741824,%eax - orl %edx,%eax - cmpl $1342177280,%eax - je Lavx_shortcut - jmp Lssse3_shortcut -.align 4,0x90 -L001x86: - movl 20(%esp),%ebp - movl 24(%esp),%esi - movl 28(%esp),%eax - subl $76,%esp - shll $6,%eax - addl %esi,%eax - movl %eax,104(%esp) - movl 16(%ebp),%edi - jmp L002loop -.align 4,0x90 -L002loop: - movl (%esi),%eax - movl 4(%esi),%ebx - movl 8(%esi),%ecx - movl 12(%esi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx - movl %eax,(%esp) - movl %ebx,4(%esp) - movl %ecx,8(%esp) - movl 
%edx,12(%esp) - movl 16(%esi),%eax - movl 20(%esi),%ebx - movl 24(%esi),%ecx - movl 28(%esi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx - movl %eax,16(%esp) - movl %ebx,20(%esp) - movl %ecx,24(%esp) - movl %edx,28(%esp) - movl 32(%esi),%eax - movl 36(%esi),%ebx - movl 40(%esi),%ecx - movl 44(%esi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx - movl %eax,32(%esp) - movl %ebx,36(%esp) - movl %ecx,40(%esp) - movl %edx,44(%esp) - movl 48(%esi),%eax - movl 52(%esi),%ebx - movl 56(%esi),%ecx - movl 60(%esi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx - movl %eax,48(%esp) - movl %ebx,52(%esp) - movl %ecx,56(%esp) - movl %edx,60(%esp) - movl %esi,100(%esp) - movl (%ebp),%eax - movl 4(%ebp),%ebx - movl 8(%ebp),%ecx - movl 12(%ebp),%edx - # 00_15 0 - movl %ecx,%esi - movl %eax,%ebp - roll $5,%ebp - xorl %edx,%esi - addl %edi,%ebp - movl (%esp),%edi - andl %ebx,%esi - rorl $2,%ebx - xorl %edx,%esi - leal 1518500249(%ebp,%edi,1),%ebp - addl %esi,%ebp - # 00_15 1 - movl %ebx,%edi - movl %ebp,%esi - roll $5,%ebp - xorl %ecx,%edi - addl %edx,%ebp - movl 4(%esp),%edx - andl %eax,%edi - rorl $2,%eax - xorl %ecx,%edi - leal 1518500249(%ebp,%edx,1),%ebp - addl %edi,%ebp - # 00_15 2 - movl %eax,%edx - movl %ebp,%edi - roll $5,%ebp - xorl %ebx,%edx - addl %ecx,%ebp - movl 8(%esp),%ecx - andl %esi,%edx - rorl $2,%esi - xorl %ebx,%edx - leal 1518500249(%ebp,%ecx,1),%ebp - addl %edx,%ebp - # 00_15 3 - movl %esi,%ecx - movl %ebp,%edx - roll $5,%ebp - xorl %eax,%ecx - addl %ebx,%ebp - movl 12(%esp),%ebx - andl %edi,%ecx - rorl $2,%edi - xorl %eax,%ecx - leal 1518500249(%ebp,%ebx,1),%ebp - addl %ecx,%ebp - # 00_15 4 - movl %edi,%ebx - movl %ebp,%ecx - roll $5,%ebp - xorl %esi,%ebx - addl %eax,%ebp - movl 16(%esp),%eax - andl %edx,%ebx - rorl $2,%edx - xorl %esi,%ebx - leal 1518500249(%ebp,%eax,1),%ebp - addl %ebx,%ebp - # 00_15 5 - movl %edx,%eax - movl %ebp,%ebx - roll $5,%ebp - xorl %edi,%eax - addl %esi,%ebp - movl 20(%esp),%esi - andl %ecx,%eax - 
rorl $2,%ecx - xorl %edi,%eax - leal 1518500249(%ebp,%esi,1),%ebp - addl %eax,%ebp - # 00_15 6 - movl %ecx,%esi - movl %ebp,%eax - roll $5,%ebp - xorl %edx,%esi - addl %edi,%ebp - movl 24(%esp),%edi - andl %ebx,%esi - rorl $2,%ebx - xorl %edx,%esi - leal 1518500249(%ebp,%edi,1),%ebp - addl %esi,%ebp - # 00_15 7 - movl %ebx,%edi - movl %ebp,%esi - roll $5,%ebp - xorl %ecx,%edi - addl %edx,%ebp - movl 28(%esp),%edx - andl %eax,%edi - rorl $2,%eax - xorl %ecx,%edi - leal 1518500249(%ebp,%edx,1),%ebp - addl %edi,%ebp - # 00_15 8 - movl %eax,%edx - movl %ebp,%edi - roll $5,%ebp - xorl %ebx,%edx - addl %ecx,%ebp - movl 32(%esp),%ecx - andl %esi,%edx - rorl $2,%esi - xorl %ebx,%edx - leal 1518500249(%ebp,%ecx,1),%ebp - addl %edx,%ebp - # 00_15 9 - movl %esi,%ecx - movl %ebp,%edx - roll $5,%ebp - xorl %eax,%ecx - addl %ebx,%ebp - movl 36(%esp),%ebx - andl %edi,%ecx - rorl $2,%edi - xorl %eax,%ecx - leal 1518500249(%ebp,%ebx,1),%ebp - addl %ecx,%ebp - # 00_15 10 - movl %edi,%ebx - movl %ebp,%ecx - roll $5,%ebp - xorl %esi,%ebx - addl %eax,%ebp - movl 40(%esp),%eax - andl %edx,%ebx - rorl $2,%edx - xorl %esi,%ebx - leal 1518500249(%ebp,%eax,1),%ebp - addl %ebx,%ebp - # 00_15 11 - movl %edx,%eax - movl %ebp,%ebx - roll $5,%ebp - xorl %edi,%eax - addl %esi,%ebp - movl 44(%esp),%esi - andl %ecx,%eax - rorl $2,%ecx - xorl %edi,%eax - leal 1518500249(%ebp,%esi,1),%ebp - addl %eax,%ebp - # 00_15 12 - movl %ecx,%esi - movl %ebp,%eax - roll $5,%ebp - xorl %edx,%esi - addl %edi,%ebp - movl 48(%esp),%edi - andl %ebx,%esi - rorl $2,%ebx - xorl %edx,%esi - leal 1518500249(%ebp,%edi,1),%ebp - addl %esi,%ebp - # 00_15 13 - movl %ebx,%edi - movl %ebp,%esi - roll $5,%ebp - xorl %ecx,%edi - addl %edx,%ebp - movl 52(%esp),%edx - andl %eax,%edi - rorl $2,%eax - xorl %ecx,%edi - leal 1518500249(%ebp,%edx,1),%ebp - addl %edi,%ebp - # 00_15 14 - movl %eax,%edx - movl %ebp,%edi - roll $5,%ebp - xorl %ebx,%edx - addl %ecx,%ebp - movl 56(%esp),%ecx - andl %esi,%edx - rorl $2,%esi - xorl %ebx,%edx - 
leal 1518500249(%ebp,%ecx,1),%ebp - addl %edx,%ebp - # 00_15 15 - movl %esi,%ecx - movl %ebp,%edx - roll $5,%ebp - xorl %eax,%ecx - addl %ebx,%ebp - movl 60(%esp),%ebx - andl %edi,%ecx - rorl $2,%edi - xorl %eax,%ecx - leal 1518500249(%ebp,%ebx,1),%ebp - movl (%esp),%ebx - addl %ebp,%ecx - # 16_19 16 - movl %edi,%ebp - xorl 8(%esp),%ebx - xorl %esi,%ebp - xorl 32(%esp),%ebx - andl %edx,%ebp - xorl 52(%esp),%ebx - roll $1,%ebx - xorl %esi,%ebp - addl %ebp,%eax - movl %ecx,%ebp - rorl $2,%edx - movl %ebx,(%esp) - roll $5,%ebp - leal 1518500249(%ebx,%eax,1),%ebx - movl 4(%esp),%eax - addl %ebp,%ebx - # 16_19 17 - movl %edx,%ebp - xorl 12(%esp),%eax - xorl %edi,%ebp - xorl 36(%esp),%eax - andl %ecx,%ebp - xorl 56(%esp),%eax - roll $1,%eax - xorl %edi,%ebp - addl %ebp,%esi - movl %ebx,%ebp - rorl $2,%ecx - movl %eax,4(%esp) - roll $5,%ebp - leal 1518500249(%eax,%esi,1),%eax - movl 8(%esp),%esi - addl %ebp,%eax - # 16_19 18 - movl %ecx,%ebp - xorl 16(%esp),%esi - xorl %edx,%ebp - xorl 40(%esp),%esi - andl %ebx,%ebp - xorl 60(%esp),%esi - roll $1,%esi - xorl %edx,%ebp - addl %ebp,%edi - movl %eax,%ebp - rorl $2,%ebx - movl %esi,8(%esp) - roll $5,%ebp - leal 1518500249(%esi,%edi,1),%esi - movl 12(%esp),%edi - addl %ebp,%esi - # 16_19 19 - movl %ebx,%ebp - xorl 20(%esp),%edi - xorl %ecx,%ebp - xorl 44(%esp),%edi - andl %eax,%ebp - xorl (%esp),%edi - roll $1,%edi - xorl %ecx,%ebp - addl %ebp,%edx - movl %esi,%ebp - rorl $2,%eax - movl %edi,12(%esp) - roll $5,%ebp - leal 1518500249(%edi,%edx,1),%edi - movl 16(%esp),%edx - addl %ebp,%edi - # 20_39 20 - movl %esi,%ebp - xorl 24(%esp),%edx - xorl %eax,%ebp - xorl 48(%esp),%edx - xorl %ebx,%ebp - xorl 4(%esp),%edx - roll $1,%edx - addl %ebp,%ecx - rorl $2,%esi - movl %edi,%ebp - roll $5,%ebp - movl %edx,16(%esp) - leal 1859775393(%edx,%ecx,1),%edx - movl 20(%esp),%ecx - addl %ebp,%edx - # 20_39 21 - movl %edi,%ebp - xorl 28(%esp),%ecx - xorl %esi,%ebp - xorl 52(%esp),%ecx - xorl %eax,%ebp - xorl 8(%esp),%ecx - roll $1,%ecx - addl 
%ebp,%ebx - rorl $2,%edi - movl %edx,%ebp - roll $5,%ebp - movl %ecx,20(%esp) - leal 1859775393(%ecx,%ebx,1),%ecx - movl 24(%esp),%ebx - addl %ebp,%ecx - # 20_39 22 - movl %edx,%ebp - xorl 32(%esp),%ebx - xorl %edi,%ebp - xorl 56(%esp),%ebx - xorl %esi,%ebp - xorl 12(%esp),%ebx - roll $1,%ebx - addl %ebp,%eax - rorl $2,%edx - movl %ecx,%ebp - roll $5,%ebp - movl %ebx,24(%esp) - leal 1859775393(%ebx,%eax,1),%ebx - movl 28(%esp),%eax - addl %ebp,%ebx - # 20_39 23 - movl %ecx,%ebp - xorl 36(%esp),%eax - xorl %edx,%ebp - xorl 60(%esp),%eax - xorl %edi,%ebp - xorl 16(%esp),%eax - roll $1,%eax - addl %ebp,%esi - rorl $2,%ecx - movl %ebx,%ebp - roll $5,%ebp - movl %eax,28(%esp) - leal 1859775393(%eax,%esi,1),%eax - movl 32(%esp),%esi - addl %ebp,%eax - # 20_39 24 - movl %ebx,%ebp - xorl 40(%esp),%esi - xorl %ecx,%ebp - xorl (%esp),%esi - xorl %edx,%ebp - xorl 20(%esp),%esi - roll $1,%esi - addl %ebp,%edi - rorl $2,%ebx - movl %eax,%ebp - roll $5,%ebp - movl %esi,32(%esp) - leal 1859775393(%esi,%edi,1),%esi - movl 36(%esp),%edi - addl %ebp,%esi - # 20_39 25 - movl %eax,%ebp - xorl 44(%esp),%edi - xorl %ebx,%ebp - xorl 4(%esp),%edi - xorl %ecx,%ebp - xorl 24(%esp),%edi - roll $1,%edi - addl %ebp,%edx - rorl $2,%eax - movl %esi,%ebp - roll $5,%ebp - movl %edi,36(%esp) - leal 1859775393(%edi,%edx,1),%edi - movl 40(%esp),%edx - addl %ebp,%edi - # 20_39 26 - movl %esi,%ebp - xorl 48(%esp),%edx - xorl %eax,%ebp - xorl 8(%esp),%edx - xorl %ebx,%ebp - xorl 28(%esp),%edx - roll $1,%edx - addl %ebp,%ecx - rorl $2,%esi - movl %edi,%ebp - roll $5,%ebp - movl %edx,40(%esp) - leal 1859775393(%edx,%ecx,1),%edx - movl 44(%esp),%ecx - addl %ebp,%edx - # 20_39 27 - movl %edi,%ebp - xorl 52(%esp),%ecx - xorl %esi,%ebp - xorl 12(%esp),%ecx - xorl %eax,%ebp - xorl 32(%esp),%ecx - roll $1,%ecx - addl %ebp,%ebx - rorl $2,%edi - movl %edx,%ebp - roll $5,%ebp - movl %ecx,44(%esp) - leal 1859775393(%ecx,%ebx,1),%ecx - movl 48(%esp),%ebx - addl %ebp,%ecx - # 20_39 28 - movl %edx,%ebp - xorl 
56(%esp),%ebx - xorl %edi,%ebp - xorl 16(%esp),%ebx - xorl %esi,%ebp - xorl 36(%esp),%ebx - roll $1,%ebx - addl %ebp,%eax - rorl $2,%edx - movl %ecx,%ebp - roll $5,%ebp - movl %ebx,48(%esp) - leal 1859775393(%ebx,%eax,1),%ebx - movl 52(%esp),%eax - addl %ebp,%ebx - # 20_39 29 - movl %ecx,%ebp - xorl 60(%esp),%eax - xorl %edx,%ebp - xorl 20(%esp),%eax - xorl %edi,%ebp - xorl 40(%esp),%eax - roll $1,%eax - addl %ebp,%esi - rorl $2,%ecx - movl %ebx,%ebp - roll $5,%ebp - movl %eax,52(%esp) - leal 1859775393(%eax,%esi,1),%eax - movl 56(%esp),%esi - addl %ebp,%eax - # 20_39 30 - movl %ebx,%ebp - xorl (%esp),%esi - xorl %ecx,%ebp - xorl 24(%esp),%esi - xorl %edx,%ebp - xorl 44(%esp),%esi - roll $1,%esi - addl %ebp,%edi - rorl $2,%ebx - movl %eax,%ebp - roll $5,%ebp - movl %esi,56(%esp) - leal 1859775393(%esi,%edi,1),%esi - movl 60(%esp),%edi - addl %ebp,%esi - # 20_39 31 - movl %eax,%ebp - xorl 4(%esp),%edi - xorl %ebx,%ebp - xorl 28(%esp),%edi - xorl %ecx,%ebp - xorl 48(%esp),%edi - roll $1,%edi - addl %ebp,%edx - rorl $2,%eax - movl %esi,%ebp - roll $5,%ebp - movl %edi,60(%esp) - leal 1859775393(%edi,%edx,1),%edi - movl (%esp),%edx - addl %ebp,%edi - # 20_39 32 - movl %esi,%ebp - xorl 8(%esp),%edx - xorl %eax,%ebp - xorl 32(%esp),%edx - xorl %ebx,%ebp - xorl 52(%esp),%edx - roll $1,%edx - addl %ebp,%ecx - rorl $2,%esi - movl %edi,%ebp - roll $5,%ebp - movl %edx,(%esp) - leal 1859775393(%edx,%ecx,1),%edx - movl 4(%esp),%ecx - addl %ebp,%edx - # 20_39 33 - movl %edi,%ebp - xorl 12(%esp),%ecx - xorl %esi,%ebp - xorl 36(%esp),%ecx - xorl %eax,%ebp - xorl 56(%esp),%ecx - roll $1,%ecx - addl %ebp,%ebx - rorl $2,%edi - movl %edx,%ebp - roll $5,%ebp - movl %ecx,4(%esp) - leal 1859775393(%ecx,%ebx,1),%ecx - movl 8(%esp),%ebx - addl %ebp,%ecx - # 20_39 34 - movl %edx,%ebp - xorl 16(%esp),%ebx - xorl %edi,%ebp - xorl 40(%esp),%ebx - xorl %esi,%ebp - xorl 60(%esp),%ebx - roll $1,%ebx - addl %ebp,%eax - rorl $2,%edx - movl %ecx,%ebp - roll $5,%ebp - movl %ebx,8(%esp) - leal 
1859775393(%ebx,%eax,1),%ebx - movl 12(%esp),%eax - addl %ebp,%ebx - # 20_39 35 - movl %ecx,%ebp - xorl 20(%esp),%eax - xorl %edx,%ebp - xorl 44(%esp),%eax - xorl %edi,%ebp - xorl (%esp),%eax - roll $1,%eax - addl %ebp,%esi - rorl $2,%ecx - movl %ebx,%ebp - roll $5,%ebp - movl %eax,12(%esp) - leal 1859775393(%eax,%esi,1),%eax - movl 16(%esp),%esi - addl %ebp,%eax - # 20_39 36 - movl %ebx,%ebp - xorl 24(%esp),%esi - xorl %ecx,%ebp - xorl 48(%esp),%esi - xorl %edx,%ebp - xorl 4(%esp),%esi - roll $1,%esi - addl %ebp,%edi - rorl $2,%ebx - movl %eax,%ebp - roll $5,%ebp - movl %esi,16(%esp) - leal 1859775393(%esi,%edi,1),%esi - movl 20(%esp),%edi - addl %ebp,%esi - # 20_39 37 - movl %eax,%ebp - xorl 28(%esp),%edi - xorl %ebx,%ebp - xorl 52(%esp),%edi - xorl %ecx,%ebp - xorl 8(%esp),%edi - roll $1,%edi - addl %ebp,%edx - rorl $2,%eax - movl %esi,%ebp - roll $5,%ebp - movl %edi,20(%esp) - leal 1859775393(%edi,%edx,1),%edi - movl 24(%esp),%edx - addl %ebp,%edi - # 20_39 38 - movl %esi,%ebp - xorl 32(%esp),%edx - xorl %eax,%ebp - xorl 56(%esp),%edx - xorl %ebx,%ebp - xorl 12(%esp),%edx - roll $1,%edx - addl %ebp,%ecx - rorl $2,%esi - movl %edi,%ebp - roll $5,%ebp - movl %edx,24(%esp) - leal 1859775393(%edx,%ecx,1),%edx - movl 28(%esp),%ecx - addl %ebp,%edx - # 20_39 39 - movl %edi,%ebp - xorl 36(%esp),%ecx - xorl %esi,%ebp - xorl 60(%esp),%ecx - xorl %eax,%ebp - xorl 16(%esp),%ecx - roll $1,%ecx - addl %ebp,%ebx - rorl $2,%edi - movl %edx,%ebp - roll $5,%ebp - movl %ecx,28(%esp) - leal 1859775393(%ecx,%ebx,1),%ecx - movl 32(%esp),%ebx - addl %ebp,%ecx - # 40_59 40 - movl %edi,%ebp - xorl 40(%esp),%ebx - xorl %esi,%ebp - xorl (%esp),%ebx - andl %edx,%ebp - xorl 20(%esp),%ebx - roll $1,%ebx - addl %eax,%ebp - rorl $2,%edx - movl %ecx,%eax - roll $5,%eax - movl %ebx,32(%esp) - leal 2400959708(%ebx,%ebp,1),%ebx - movl %edi,%ebp - addl %eax,%ebx - andl %esi,%ebp - movl 36(%esp),%eax - addl %ebp,%ebx - # 40_59 41 - movl %edx,%ebp - xorl 44(%esp),%eax - xorl %edi,%ebp - xorl 
4(%esp),%eax - andl %ecx,%ebp - xorl 24(%esp),%eax - roll $1,%eax - addl %esi,%ebp - rorl $2,%ecx - movl %ebx,%esi - roll $5,%esi - movl %eax,36(%esp) - leal 2400959708(%eax,%ebp,1),%eax - movl %edx,%ebp - addl %esi,%eax - andl %edi,%ebp - movl 40(%esp),%esi - addl %ebp,%eax - # 40_59 42 - movl %ecx,%ebp - xorl 48(%esp),%esi - xorl %edx,%ebp - xorl 8(%esp),%esi - andl %ebx,%ebp - xorl 28(%esp),%esi - roll $1,%esi - addl %edi,%ebp - rorl $2,%ebx - movl %eax,%edi - roll $5,%edi - movl %esi,40(%esp) - leal 2400959708(%esi,%ebp,1),%esi - movl %ecx,%ebp - addl %edi,%esi - andl %edx,%ebp - movl 44(%esp),%edi - addl %ebp,%esi - # 40_59 43 - movl %ebx,%ebp - xorl 52(%esp),%edi - xorl %ecx,%ebp - xorl 12(%esp),%edi - andl %eax,%ebp - xorl 32(%esp),%edi - roll $1,%edi - addl %edx,%ebp - rorl $2,%eax - movl %esi,%edx - roll $5,%edx - movl %edi,44(%esp) - leal 2400959708(%edi,%ebp,1),%edi - movl %ebx,%ebp - addl %edx,%edi - andl %ecx,%ebp - movl 48(%esp),%edx - addl %ebp,%edi - # 40_59 44 - movl %eax,%ebp - xorl 56(%esp),%edx - xorl %ebx,%ebp - xorl 16(%esp),%edx - andl %esi,%ebp - xorl 36(%esp),%edx - roll $1,%edx - addl %ecx,%ebp - rorl $2,%esi - movl %edi,%ecx - roll $5,%ecx - movl %edx,48(%esp) - leal 2400959708(%edx,%ebp,1),%edx - movl %eax,%ebp - addl %ecx,%edx - andl %ebx,%ebp - movl 52(%esp),%ecx - addl %ebp,%edx - # 40_59 45 - movl %esi,%ebp - xorl 60(%esp),%ecx - xorl %eax,%ebp - xorl 20(%esp),%ecx - andl %edi,%ebp - xorl 40(%esp),%ecx - roll $1,%ecx - addl %ebx,%ebp - rorl $2,%edi - movl %edx,%ebx - roll $5,%ebx - movl %ecx,52(%esp) - leal 2400959708(%ecx,%ebp,1),%ecx - movl %esi,%ebp - addl %ebx,%ecx - andl %eax,%ebp - movl 56(%esp),%ebx - addl %ebp,%ecx - # 40_59 46 - movl %edi,%ebp - xorl (%esp),%ebx - xorl %esi,%ebp - xorl 24(%esp),%ebx - andl %edx,%ebp - xorl 44(%esp),%ebx - roll $1,%ebx - addl %eax,%ebp - rorl $2,%edx - movl %ecx,%eax - roll $5,%eax - movl %ebx,56(%esp) - leal 2400959708(%ebx,%ebp,1),%ebx - movl %edi,%ebp - addl %eax,%ebx - andl %esi,%ebp - 
movl 60(%esp),%eax - addl %ebp,%ebx - # 40_59 47 - movl %edx,%ebp - xorl 4(%esp),%eax - xorl %edi,%ebp - xorl 28(%esp),%eax - andl %ecx,%ebp - xorl 48(%esp),%eax - roll $1,%eax - addl %esi,%ebp - rorl $2,%ecx - movl %ebx,%esi - roll $5,%esi - movl %eax,60(%esp) - leal 2400959708(%eax,%ebp,1),%eax - movl %edx,%ebp - addl %esi,%eax - andl %edi,%ebp - movl (%esp),%esi - addl %ebp,%eax - # 40_59 48 - movl %ecx,%ebp - xorl 8(%esp),%esi - xorl %edx,%ebp - xorl 32(%esp),%esi - andl %ebx,%ebp - xorl 52(%esp),%esi - roll $1,%esi - addl %edi,%ebp - rorl $2,%ebx - movl %eax,%edi - roll $5,%edi - movl %esi,(%esp) - leal 2400959708(%esi,%ebp,1),%esi - movl %ecx,%ebp - addl %edi,%esi - andl %edx,%ebp - movl 4(%esp),%edi - addl %ebp,%esi - # 40_59 49 - movl %ebx,%ebp - xorl 12(%esp),%edi - xorl %ecx,%ebp - xorl 36(%esp),%edi - andl %eax,%ebp - xorl 56(%esp),%edi - roll $1,%edi - addl %edx,%ebp - rorl $2,%eax - movl %esi,%edx - roll $5,%edx - movl %edi,4(%esp) - leal 2400959708(%edi,%ebp,1),%edi - movl %ebx,%ebp - addl %edx,%edi - andl %ecx,%ebp - movl 8(%esp),%edx - addl %ebp,%edi - # 40_59 50 - movl %eax,%ebp - xorl 16(%esp),%edx - xorl %ebx,%ebp - xorl 40(%esp),%edx - andl %esi,%ebp - xorl 60(%esp),%edx - roll $1,%edx - addl %ecx,%ebp - rorl $2,%esi - movl %edi,%ecx - roll $5,%ecx - movl %edx,8(%esp) - leal 2400959708(%edx,%ebp,1),%edx - movl %eax,%ebp - addl %ecx,%edx - andl %ebx,%ebp - movl 12(%esp),%ecx - addl %ebp,%edx - # 40_59 51 - movl %esi,%ebp - xorl 20(%esp),%ecx - xorl %eax,%ebp - xorl 44(%esp),%ecx - andl %edi,%ebp - xorl (%esp),%ecx - roll $1,%ecx - addl %ebx,%ebp - rorl $2,%edi - movl %edx,%ebx - roll $5,%ebx - movl %ecx,12(%esp) - leal 2400959708(%ecx,%ebp,1),%ecx - movl %esi,%ebp - addl %ebx,%ecx - andl %eax,%ebp - movl 16(%esp),%ebx - addl %ebp,%ecx - # 40_59 52 - movl %edi,%ebp - xorl 24(%esp),%ebx - xorl %esi,%ebp - xorl 48(%esp),%ebx - andl %edx,%ebp - xorl 4(%esp),%ebx - roll $1,%ebx - addl %eax,%ebp - rorl $2,%edx - movl %ecx,%eax - roll $5,%eax - movl 
%ebx,16(%esp) - leal 2400959708(%ebx,%ebp,1),%ebx - movl %edi,%ebp - addl %eax,%ebx - andl %esi,%ebp - movl 20(%esp),%eax - addl %ebp,%ebx - # 40_59 53 - movl %edx,%ebp - xorl 28(%esp),%eax - xorl %edi,%ebp - xorl 52(%esp),%eax - andl %ecx,%ebp - xorl 8(%esp),%eax - roll $1,%eax - addl %esi,%ebp - rorl $2,%ecx - movl %ebx,%esi - roll $5,%esi - movl %eax,20(%esp) - leal 2400959708(%eax,%ebp,1),%eax - movl %edx,%ebp - addl %esi,%eax - andl %edi,%ebp - movl 24(%esp),%esi - addl %ebp,%eax - # 40_59 54 - movl %ecx,%ebp - xorl 32(%esp),%esi - xorl %edx,%ebp - xorl 56(%esp),%esi - andl %ebx,%ebp - xorl 12(%esp),%esi - roll $1,%esi - addl %edi,%ebp - rorl $2,%ebx - movl %eax,%edi - roll $5,%edi - movl %esi,24(%esp) - leal 2400959708(%esi,%ebp,1),%esi - movl %ecx,%ebp - addl %edi,%esi - andl %edx,%ebp - movl 28(%esp),%edi - addl %ebp,%esi - # 40_59 55 - movl %ebx,%ebp - xorl 36(%esp),%edi - xorl %ecx,%ebp - xorl 60(%esp),%edi - andl %eax,%ebp - xorl 16(%esp),%edi - roll $1,%edi - addl %edx,%ebp - rorl $2,%eax - movl %esi,%edx - roll $5,%edx - movl %edi,28(%esp) - leal 2400959708(%edi,%ebp,1),%edi - movl %ebx,%ebp - addl %edx,%edi - andl %ecx,%ebp - movl 32(%esp),%edx - addl %ebp,%edi - # 40_59 56 - movl %eax,%ebp - xorl 40(%esp),%edx - xorl %ebx,%ebp - xorl (%esp),%edx - andl %esi,%ebp - xorl 20(%esp),%edx - roll $1,%edx - addl %ecx,%ebp - rorl $2,%esi - movl %edi,%ecx - roll $5,%ecx - movl %edx,32(%esp) - leal 2400959708(%edx,%ebp,1),%edx - movl %eax,%ebp - addl %ecx,%edx - andl %ebx,%ebp - movl 36(%esp),%ecx - addl %ebp,%edx - # 40_59 57 - movl %esi,%ebp - xorl 44(%esp),%ecx - xorl %eax,%ebp - xorl 4(%esp),%ecx - andl %edi,%ebp - xorl 24(%esp),%ecx - roll $1,%ecx - addl %ebx,%ebp - rorl $2,%edi - movl %edx,%ebx - roll $5,%ebx - movl %ecx,36(%esp) - leal 2400959708(%ecx,%ebp,1),%ecx - movl %esi,%ebp - addl %ebx,%ecx - andl %eax,%ebp - movl 40(%esp),%ebx - addl %ebp,%ecx - # 40_59 58 - movl %edi,%ebp - xorl 48(%esp),%ebx - xorl %esi,%ebp - xorl 8(%esp),%ebx - andl %edx,%ebp 
- xorl 28(%esp),%ebx - roll $1,%ebx - addl %eax,%ebp - rorl $2,%edx - movl %ecx,%eax - roll $5,%eax - movl %ebx,40(%esp) - leal 2400959708(%ebx,%ebp,1),%ebx - movl %edi,%ebp - addl %eax,%ebx - andl %esi,%ebp - movl 44(%esp),%eax - addl %ebp,%ebx - # 40_59 59 - movl %edx,%ebp - xorl 52(%esp),%eax - xorl %edi,%ebp - xorl 12(%esp),%eax - andl %ecx,%ebp - xorl 32(%esp),%eax - roll $1,%eax - addl %esi,%ebp - rorl $2,%ecx - movl %ebx,%esi - roll $5,%esi - movl %eax,44(%esp) - leal 2400959708(%eax,%ebp,1),%eax - movl %edx,%ebp - addl %esi,%eax - andl %edi,%ebp - movl 48(%esp),%esi - addl %ebp,%eax - # 20_39 60 - movl %ebx,%ebp - xorl 56(%esp),%esi - xorl %ecx,%ebp - xorl 16(%esp),%esi - xorl %edx,%ebp - xorl 36(%esp),%esi - roll $1,%esi - addl %ebp,%edi - rorl $2,%ebx - movl %eax,%ebp - roll $5,%ebp - movl %esi,48(%esp) - leal 3395469782(%esi,%edi,1),%esi - movl 52(%esp),%edi - addl %ebp,%esi - # 20_39 61 - movl %eax,%ebp - xorl 60(%esp),%edi - xorl %ebx,%ebp - xorl 20(%esp),%edi - xorl %ecx,%ebp - xorl 40(%esp),%edi - roll $1,%edi - addl %ebp,%edx - rorl $2,%eax - movl %esi,%ebp - roll $5,%ebp - movl %edi,52(%esp) - leal 3395469782(%edi,%edx,1),%edi - movl 56(%esp),%edx - addl %ebp,%edi - # 20_39 62 - movl %esi,%ebp - xorl (%esp),%edx - xorl %eax,%ebp - xorl 24(%esp),%edx - xorl %ebx,%ebp - xorl 44(%esp),%edx - roll $1,%edx - addl %ebp,%ecx - rorl $2,%esi - movl %edi,%ebp - roll $5,%ebp - movl %edx,56(%esp) - leal 3395469782(%edx,%ecx,1),%edx - movl 60(%esp),%ecx - addl %ebp,%edx - # 20_39 63 - movl %edi,%ebp - xorl 4(%esp),%ecx - xorl %esi,%ebp - xorl 28(%esp),%ecx - xorl %eax,%ebp - xorl 48(%esp),%ecx - roll $1,%ecx - addl %ebp,%ebx - rorl $2,%edi - movl %edx,%ebp - roll $5,%ebp - movl %ecx,60(%esp) - leal 3395469782(%ecx,%ebx,1),%ecx - movl (%esp),%ebx - addl %ebp,%ecx - # 20_39 64 - movl %edx,%ebp - xorl 8(%esp),%ebx - xorl %edi,%ebp - xorl 32(%esp),%ebx - xorl %esi,%ebp - xorl 52(%esp),%ebx - roll $1,%ebx - addl %ebp,%eax - rorl $2,%edx - movl %ecx,%ebp - roll 
$5,%ebp - movl %ebx,(%esp) - leal 3395469782(%ebx,%eax,1),%ebx - movl 4(%esp),%eax - addl %ebp,%ebx - # 20_39 65 - movl %ecx,%ebp - xorl 12(%esp),%eax - xorl %edx,%ebp - xorl 36(%esp),%eax - xorl %edi,%ebp - xorl 56(%esp),%eax - roll $1,%eax - addl %ebp,%esi - rorl $2,%ecx - movl %ebx,%ebp - roll $5,%ebp - movl %eax,4(%esp) - leal 3395469782(%eax,%esi,1),%eax - movl 8(%esp),%esi - addl %ebp,%eax - # 20_39 66 - movl %ebx,%ebp - xorl 16(%esp),%esi - xorl %ecx,%ebp - xorl 40(%esp),%esi - xorl %edx,%ebp - xorl 60(%esp),%esi - roll $1,%esi - addl %ebp,%edi - rorl $2,%ebx - movl %eax,%ebp - roll $5,%ebp - movl %esi,8(%esp) - leal 3395469782(%esi,%edi,1),%esi - movl 12(%esp),%edi - addl %ebp,%esi - # 20_39 67 - movl %eax,%ebp - xorl 20(%esp),%edi - xorl %ebx,%ebp - xorl 44(%esp),%edi - xorl %ecx,%ebp - xorl (%esp),%edi - roll $1,%edi - addl %ebp,%edx - rorl $2,%eax - movl %esi,%ebp - roll $5,%ebp - movl %edi,12(%esp) - leal 3395469782(%edi,%edx,1),%edi - movl 16(%esp),%edx - addl %ebp,%edi - # 20_39 68 - movl %esi,%ebp - xorl 24(%esp),%edx - xorl %eax,%ebp - xorl 48(%esp),%edx - xorl %ebx,%ebp - xorl 4(%esp),%edx - roll $1,%edx - addl %ebp,%ecx - rorl $2,%esi - movl %edi,%ebp - roll $5,%ebp - movl %edx,16(%esp) - leal 3395469782(%edx,%ecx,1),%edx - movl 20(%esp),%ecx - addl %ebp,%edx - # 20_39 69 - movl %edi,%ebp - xorl 28(%esp),%ecx - xorl %esi,%ebp - xorl 52(%esp),%ecx - xorl %eax,%ebp - xorl 8(%esp),%ecx - roll $1,%ecx - addl %ebp,%ebx - rorl $2,%edi - movl %edx,%ebp - roll $5,%ebp - movl %ecx,20(%esp) - leal 3395469782(%ecx,%ebx,1),%ecx - movl 24(%esp),%ebx - addl %ebp,%ecx - # 20_39 70 - movl %edx,%ebp - xorl 32(%esp),%ebx - xorl %edi,%ebp - xorl 56(%esp),%ebx - xorl %esi,%ebp - xorl 12(%esp),%ebx - roll $1,%ebx - addl %ebp,%eax - rorl $2,%edx - movl %ecx,%ebp - roll $5,%ebp - movl %ebx,24(%esp) - leal 3395469782(%ebx,%eax,1),%ebx - movl 28(%esp),%eax - addl %ebp,%ebx - # 20_39 71 - movl %ecx,%ebp - xorl 36(%esp),%eax - xorl %edx,%ebp - xorl 60(%esp),%eax - xorl 
%edi,%ebp - xorl 16(%esp),%eax - roll $1,%eax - addl %ebp,%esi - rorl $2,%ecx - movl %ebx,%ebp - roll $5,%ebp - movl %eax,28(%esp) - leal 3395469782(%eax,%esi,1),%eax - movl 32(%esp),%esi - addl %ebp,%eax - # 20_39 72 - movl %ebx,%ebp - xorl 40(%esp),%esi - xorl %ecx,%ebp - xorl (%esp),%esi - xorl %edx,%ebp - xorl 20(%esp),%esi - roll $1,%esi - addl %ebp,%edi - rorl $2,%ebx - movl %eax,%ebp - roll $5,%ebp - movl %esi,32(%esp) - leal 3395469782(%esi,%edi,1),%esi - movl 36(%esp),%edi - addl %ebp,%esi - # 20_39 73 - movl %eax,%ebp - xorl 44(%esp),%edi - xorl %ebx,%ebp - xorl 4(%esp),%edi - xorl %ecx,%ebp - xorl 24(%esp),%edi - roll $1,%edi - addl %ebp,%edx - rorl $2,%eax - movl %esi,%ebp - roll $5,%ebp - movl %edi,36(%esp) - leal 3395469782(%edi,%edx,1),%edi - movl 40(%esp),%edx - addl %ebp,%edi - # 20_39 74 - movl %esi,%ebp - xorl 48(%esp),%edx - xorl %eax,%ebp - xorl 8(%esp),%edx - xorl %ebx,%ebp - xorl 28(%esp),%edx - roll $1,%edx - addl %ebp,%ecx - rorl $2,%esi - movl %edi,%ebp - roll $5,%ebp - movl %edx,40(%esp) - leal 3395469782(%edx,%ecx,1),%edx - movl 44(%esp),%ecx - addl %ebp,%edx - # 20_39 75 - movl %edi,%ebp - xorl 52(%esp),%ecx - xorl %esi,%ebp - xorl 12(%esp),%ecx - xorl %eax,%ebp - xorl 32(%esp),%ecx - roll $1,%ecx - addl %ebp,%ebx - rorl $2,%edi - movl %edx,%ebp - roll $5,%ebp - movl %ecx,44(%esp) - leal 3395469782(%ecx,%ebx,1),%ecx - movl 48(%esp),%ebx - addl %ebp,%ecx - # 20_39 76 - movl %edx,%ebp - xorl 56(%esp),%ebx - xorl %edi,%ebp - xorl 16(%esp),%ebx - xorl %esi,%ebp - xorl 36(%esp),%ebx - roll $1,%ebx - addl %ebp,%eax - rorl $2,%edx - movl %ecx,%ebp - roll $5,%ebp - movl %ebx,48(%esp) - leal 3395469782(%ebx,%eax,1),%ebx - movl 52(%esp),%eax - addl %ebp,%ebx - # 20_39 77 - movl %ecx,%ebp - xorl 60(%esp),%eax - xorl %edx,%ebp - xorl 20(%esp),%eax - xorl %edi,%ebp - xorl 40(%esp),%eax - roll $1,%eax - addl %ebp,%esi - rorl $2,%ecx - movl %ebx,%ebp - roll $5,%ebp - leal 3395469782(%eax,%esi,1),%eax - movl 56(%esp),%esi - addl %ebp,%eax - # 20_39 78 
- movl %ebx,%ebp - xorl (%esp),%esi - xorl %ecx,%ebp - xorl 24(%esp),%esi - xorl %edx,%ebp - xorl 44(%esp),%esi - roll $1,%esi - addl %ebp,%edi - rorl $2,%ebx - movl %eax,%ebp - roll $5,%ebp - leal 3395469782(%esi,%edi,1),%esi - movl 60(%esp),%edi - addl %ebp,%esi - # 20_39 79 - movl %eax,%ebp - xorl 4(%esp),%edi - xorl %ebx,%ebp - xorl 28(%esp),%edi - xorl %ecx,%ebp - xorl 48(%esp),%edi - roll $1,%edi - addl %ebp,%edx - rorl $2,%eax - movl %esi,%ebp - roll $5,%ebp - leal 3395469782(%edi,%edx,1),%edi - addl %ebp,%edi - movl 96(%esp),%ebp - movl 100(%esp),%edx - addl (%ebp),%edi - addl 4(%ebp),%esi - addl 8(%ebp),%eax - addl 12(%ebp),%ebx - addl 16(%ebp),%ecx - movl %edi,(%ebp) - addl $64,%edx - movl %esi,4(%ebp) - cmpl 104(%esp),%edx - movl %eax,8(%ebp) - movl %ecx,%edi - movl %ebx,12(%ebp) - movl %edx,%esi - movl %ecx,16(%ebp) - jb L002loop - addl $76,%esp - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.private_extern __sha1_block_data_order_ssse3 -.align 4 -__sha1_block_data_order_ssse3: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - call L003pic_point -L003pic_point: - popl %ebp - leal LK_XX_XX-L003pic_point(%ebp),%ebp -Lssse3_shortcut: - movdqa (%ebp),%xmm7 - movdqa 16(%ebp),%xmm0 - movdqa 32(%ebp),%xmm1 - movdqa 48(%ebp),%xmm2 - movdqa 64(%ebp),%xmm6 - movl 20(%esp),%edi - movl 24(%esp),%ebp - movl 28(%esp),%edx - movl %esp,%esi - subl $208,%esp - andl $-64,%esp - movdqa %xmm0,112(%esp) - movdqa %xmm1,128(%esp) - movdqa %xmm2,144(%esp) - shll $6,%edx - movdqa %xmm7,160(%esp) - addl %ebp,%edx - movdqa %xmm6,176(%esp) - addl $64,%ebp - movl %edi,192(%esp) - movl %ebp,196(%esp) - movl %edx,200(%esp) - movl %esi,204(%esp) - movl (%edi),%eax - movl 4(%edi),%ebx - movl 8(%edi),%ecx - movl 12(%edi),%edx - movl 16(%edi),%edi - movl %ebx,%esi - movdqu -64(%ebp),%xmm0 - movdqu -48(%ebp),%xmm1 - movdqu -32(%ebp),%xmm2 - movdqu -16(%ebp),%xmm3 -.byte 102,15,56,0,198 -.byte 102,15,56,0,206 -.byte 102,15,56,0,214 - movdqa %xmm7,96(%esp) -.byte 102,15,56,0,222 
- paddd %xmm7,%xmm0 - paddd %xmm7,%xmm1 - paddd %xmm7,%xmm2 - movdqa %xmm0,(%esp) - psubd %xmm7,%xmm0 - movdqa %xmm1,16(%esp) - psubd %xmm7,%xmm1 - movdqa %xmm2,32(%esp) - movl %ecx,%ebp - psubd %xmm7,%xmm2 - xorl %edx,%ebp - pshufd $238,%xmm0,%xmm4 - andl %ebp,%esi - jmp L004loop -.align 4,0x90 -L004loop: - rorl $2,%ebx - xorl %edx,%esi - movl %eax,%ebp - punpcklqdq %xmm1,%xmm4 - movdqa %xmm3,%xmm6 - addl (%esp),%edi - xorl %ecx,%ebx - paddd %xmm3,%xmm7 - movdqa %xmm0,64(%esp) - roll $5,%eax - addl %esi,%edi - psrldq $4,%xmm6 - andl %ebx,%ebp - xorl %ecx,%ebx - pxor %xmm0,%xmm4 - addl %eax,%edi - rorl $7,%eax - pxor %xmm2,%xmm6 - xorl %ecx,%ebp - movl %edi,%esi - addl 4(%esp),%edx - pxor %xmm6,%xmm4 - xorl %ebx,%eax - roll $5,%edi - movdqa %xmm7,48(%esp) - addl %ebp,%edx - andl %eax,%esi - movdqa %xmm4,%xmm0 - xorl %ebx,%eax - addl %edi,%edx - rorl $7,%edi - movdqa %xmm4,%xmm6 - xorl %ebx,%esi - pslldq $12,%xmm0 - paddd %xmm4,%xmm4 - movl %edx,%ebp - addl 8(%esp),%ecx - psrld $31,%xmm6 - xorl %eax,%edi - roll $5,%edx - movdqa %xmm0,%xmm7 - addl %esi,%ecx - andl %edi,%ebp - xorl %eax,%edi - psrld $30,%xmm0 - addl %edx,%ecx - rorl $7,%edx - por %xmm6,%xmm4 - xorl %eax,%ebp - movl %ecx,%esi - addl 12(%esp),%ebx - pslld $2,%xmm7 - xorl %edi,%edx - roll $5,%ecx - pxor %xmm0,%xmm4 - movdqa 96(%esp),%xmm0 - addl %ebp,%ebx - andl %edx,%esi - pxor %xmm7,%xmm4 - pshufd $238,%xmm1,%xmm5 - xorl %edi,%edx - addl %ecx,%ebx - rorl $7,%ecx - xorl %edi,%esi - movl %ebx,%ebp - punpcklqdq %xmm2,%xmm5 - movdqa %xmm4,%xmm7 - addl 16(%esp),%eax - xorl %edx,%ecx - paddd %xmm4,%xmm0 - movdqa %xmm1,80(%esp) - roll $5,%ebx - addl %esi,%eax - psrldq $4,%xmm7 - andl %ecx,%ebp - xorl %edx,%ecx - pxor %xmm1,%xmm5 - addl %ebx,%eax - rorl $7,%ebx - pxor %xmm3,%xmm7 - xorl %edx,%ebp - movl %eax,%esi - addl 20(%esp),%edi - pxor %xmm7,%xmm5 - xorl %ecx,%ebx - roll $5,%eax - movdqa %xmm0,(%esp) - addl %ebp,%edi - andl %ebx,%esi - movdqa %xmm5,%xmm1 - xorl %ecx,%ebx - addl %eax,%edi - rorl $7,%eax - 
movdqa %xmm5,%xmm7 - xorl %ecx,%esi - pslldq $12,%xmm1 - paddd %xmm5,%xmm5 - movl %edi,%ebp - addl 24(%esp),%edx - psrld $31,%xmm7 - xorl %ebx,%eax - roll $5,%edi - movdqa %xmm1,%xmm0 - addl %esi,%edx - andl %eax,%ebp - xorl %ebx,%eax - psrld $30,%xmm1 - addl %edi,%edx - rorl $7,%edi - por %xmm7,%xmm5 - xorl %ebx,%ebp - movl %edx,%esi - addl 28(%esp),%ecx - pslld $2,%xmm0 - xorl %eax,%edi - roll $5,%edx - pxor %xmm1,%xmm5 - movdqa 112(%esp),%xmm1 - addl %ebp,%ecx - andl %edi,%esi - pxor %xmm0,%xmm5 - pshufd $238,%xmm2,%xmm6 - xorl %eax,%edi - addl %edx,%ecx - rorl $7,%edx - xorl %eax,%esi - movl %ecx,%ebp - punpcklqdq %xmm3,%xmm6 - movdqa %xmm5,%xmm0 - addl 32(%esp),%ebx - xorl %edi,%edx - paddd %xmm5,%xmm1 - movdqa %xmm2,96(%esp) - roll $5,%ecx - addl %esi,%ebx - psrldq $4,%xmm0 - andl %edx,%ebp - xorl %edi,%edx - pxor %xmm2,%xmm6 - addl %ecx,%ebx - rorl $7,%ecx - pxor %xmm4,%xmm0 - xorl %edi,%ebp - movl %ebx,%esi - addl 36(%esp),%eax - pxor %xmm0,%xmm6 - xorl %edx,%ecx - roll $5,%ebx - movdqa %xmm1,16(%esp) - addl %ebp,%eax - andl %ecx,%esi - movdqa %xmm6,%xmm2 - xorl %edx,%ecx - addl %ebx,%eax - rorl $7,%ebx - movdqa %xmm6,%xmm0 - xorl %edx,%esi - pslldq $12,%xmm2 - paddd %xmm6,%xmm6 - movl %eax,%ebp - addl 40(%esp),%edi - psrld $31,%xmm0 - xorl %ecx,%ebx - roll $5,%eax - movdqa %xmm2,%xmm1 - addl %esi,%edi - andl %ebx,%ebp - xorl %ecx,%ebx - psrld $30,%xmm2 - addl %eax,%edi - rorl $7,%eax - por %xmm0,%xmm6 - xorl %ecx,%ebp - movdqa 64(%esp),%xmm0 - movl %edi,%esi - addl 44(%esp),%edx - pslld $2,%xmm1 - xorl %ebx,%eax - roll $5,%edi - pxor %xmm2,%xmm6 - movdqa 112(%esp),%xmm2 - addl %ebp,%edx - andl %eax,%esi - pxor %xmm1,%xmm6 - pshufd $238,%xmm3,%xmm7 - xorl %ebx,%eax - addl %edi,%edx - rorl $7,%edi - xorl %ebx,%esi - movl %edx,%ebp - punpcklqdq %xmm4,%xmm7 - movdqa %xmm6,%xmm1 - addl 48(%esp),%ecx - xorl %eax,%edi - paddd %xmm6,%xmm2 - movdqa %xmm3,64(%esp) - roll $5,%edx - addl %esi,%ecx - psrldq $4,%xmm1 - andl %edi,%ebp - xorl %eax,%edi - pxor %xmm3,%xmm7 
- addl %edx,%ecx - rorl $7,%edx - pxor %xmm5,%xmm1 - xorl %eax,%ebp - movl %ecx,%esi - addl 52(%esp),%ebx - pxor %xmm1,%xmm7 - xorl %edi,%edx - roll $5,%ecx - movdqa %xmm2,32(%esp) - addl %ebp,%ebx - andl %edx,%esi - movdqa %xmm7,%xmm3 - xorl %edi,%edx - addl %ecx,%ebx - rorl $7,%ecx - movdqa %xmm7,%xmm1 - xorl %edi,%esi - pslldq $12,%xmm3 - paddd %xmm7,%xmm7 - movl %ebx,%ebp - addl 56(%esp),%eax - psrld $31,%xmm1 - xorl %edx,%ecx - roll $5,%ebx - movdqa %xmm3,%xmm2 - addl %esi,%eax - andl %ecx,%ebp - xorl %edx,%ecx - psrld $30,%xmm3 - addl %ebx,%eax - rorl $7,%ebx - por %xmm1,%xmm7 - xorl %edx,%ebp - movdqa 80(%esp),%xmm1 - movl %eax,%esi - addl 60(%esp),%edi - pslld $2,%xmm2 - xorl %ecx,%ebx - roll $5,%eax - pxor %xmm3,%xmm7 - movdqa 112(%esp),%xmm3 - addl %ebp,%edi - andl %ebx,%esi - pxor %xmm2,%xmm7 - pshufd $238,%xmm6,%xmm2 - xorl %ecx,%ebx - addl %eax,%edi - rorl $7,%eax - pxor %xmm4,%xmm0 - punpcklqdq %xmm7,%xmm2 - xorl %ecx,%esi - movl %edi,%ebp - addl (%esp),%edx - pxor %xmm1,%xmm0 - movdqa %xmm4,80(%esp) - xorl %ebx,%eax - roll $5,%edi - movdqa %xmm3,%xmm4 - addl %esi,%edx - paddd %xmm7,%xmm3 - andl %eax,%ebp - pxor %xmm2,%xmm0 - xorl %ebx,%eax - addl %edi,%edx - rorl $7,%edi - xorl %ebx,%ebp - movdqa %xmm0,%xmm2 - movdqa %xmm3,48(%esp) - movl %edx,%esi - addl 4(%esp),%ecx - xorl %eax,%edi - roll $5,%edx - pslld $2,%xmm0 - addl %ebp,%ecx - andl %edi,%esi - psrld $30,%xmm2 - xorl %eax,%edi - addl %edx,%ecx - rorl $7,%edx - xorl %eax,%esi - movl %ecx,%ebp - addl 8(%esp),%ebx - xorl %edi,%edx - roll $5,%ecx - por %xmm2,%xmm0 - addl %esi,%ebx - andl %edx,%ebp - movdqa 96(%esp),%xmm2 - xorl %edi,%edx - addl %ecx,%ebx - addl 12(%esp),%eax - xorl %edi,%ebp - movl %ebx,%esi - pshufd $238,%xmm7,%xmm3 - roll $5,%ebx - addl %ebp,%eax - xorl %edx,%esi - rorl $7,%ecx - addl %ebx,%eax - addl 16(%esp),%edi - pxor %xmm5,%xmm1 - punpcklqdq %xmm0,%xmm3 - xorl %ecx,%esi - movl %eax,%ebp - roll $5,%eax - pxor %xmm2,%xmm1 - movdqa %xmm5,96(%esp) - addl %esi,%edi - xorl 
%ecx,%ebp - movdqa %xmm4,%xmm5 - rorl $7,%ebx - paddd %xmm0,%xmm4 - addl %eax,%edi - pxor %xmm3,%xmm1 - addl 20(%esp),%edx - xorl %ebx,%ebp - movl %edi,%esi - roll $5,%edi - movdqa %xmm1,%xmm3 - movdqa %xmm4,(%esp) - addl %ebp,%edx - xorl %ebx,%esi - rorl $7,%eax - addl %edi,%edx - pslld $2,%xmm1 - addl 24(%esp),%ecx - xorl %eax,%esi - psrld $30,%xmm3 - movl %edx,%ebp - roll $5,%edx - addl %esi,%ecx - xorl %eax,%ebp - rorl $7,%edi - addl %edx,%ecx - por %xmm3,%xmm1 - addl 28(%esp),%ebx - xorl %edi,%ebp - movdqa 64(%esp),%xmm3 - movl %ecx,%esi - roll $5,%ecx - addl %ebp,%ebx - xorl %edi,%esi - rorl $7,%edx - pshufd $238,%xmm0,%xmm4 - addl %ecx,%ebx - addl 32(%esp),%eax - pxor %xmm6,%xmm2 - punpcklqdq %xmm1,%xmm4 - xorl %edx,%esi - movl %ebx,%ebp - roll $5,%ebx - pxor %xmm3,%xmm2 - movdqa %xmm6,64(%esp) - addl %esi,%eax - xorl %edx,%ebp - movdqa 128(%esp),%xmm6 - rorl $7,%ecx - paddd %xmm1,%xmm5 - addl %ebx,%eax - pxor %xmm4,%xmm2 - addl 36(%esp),%edi - xorl %ecx,%ebp - movl %eax,%esi - roll $5,%eax - movdqa %xmm2,%xmm4 - movdqa %xmm5,16(%esp) - addl %ebp,%edi - xorl %ecx,%esi - rorl $7,%ebx - addl %eax,%edi - pslld $2,%xmm2 - addl 40(%esp),%edx - xorl %ebx,%esi - psrld $30,%xmm4 - movl %edi,%ebp - roll $5,%edi - addl %esi,%edx - xorl %ebx,%ebp - rorl $7,%eax - addl %edi,%edx - por %xmm4,%xmm2 - addl 44(%esp),%ecx - xorl %eax,%ebp - movdqa 80(%esp),%xmm4 - movl %edx,%esi - roll $5,%edx - addl %ebp,%ecx - xorl %eax,%esi - rorl $7,%edi - pshufd $238,%xmm1,%xmm5 - addl %edx,%ecx - addl 48(%esp),%ebx - pxor %xmm7,%xmm3 - punpcklqdq %xmm2,%xmm5 - xorl %edi,%esi - movl %ecx,%ebp - roll $5,%ecx - pxor %xmm4,%xmm3 - movdqa %xmm7,80(%esp) - addl %esi,%ebx - xorl %edi,%ebp - movdqa %xmm6,%xmm7 - rorl $7,%edx - paddd %xmm2,%xmm6 - addl %ecx,%ebx - pxor %xmm5,%xmm3 - addl 52(%esp),%eax - xorl %edx,%ebp - movl %ebx,%esi - roll $5,%ebx - movdqa %xmm3,%xmm5 - movdqa %xmm6,32(%esp) - addl %ebp,%eax - xorl %edx,%esi - rorl $7,%ecx - addl %ebx,%eax - pslld $2,%xmm3 - addl 
56(%esp),%edi - xorl %ecx,%esi - psrld $30,%xmm5 - movl %eax,%ebp - roll $5,%eax - addl %esi,%edi - xorl %ecx,%ebp - rorl $7,%ebx - addl %eax,%edi - por %xmm5,%xmm3 - addl 60(%esp),%edx - xorl %ebx,%ebp - movdqa 96(%esp),%xmm5 - movl %edi,%esi - roll $5,%edi - addl %ebp,%edx - xorl %ebx,%esi - rorl $7,%eax - pshufd $238,%xmm2,%xmm6 - addl %edi,%edx - addl (%esp),%ecx - pxor %xmm0,%xmm4 - punpcklqdq %xmm3,%xmm6 - xorl %eax,%esi - movl %edx,%ebp - roll $5,%edx - pxor %xmm5,%xmm4 - movdqa %xmm0,96(%esp) - addl %esi,%ecx - xorl %eax,%ebp - movdqa %xmm7,%xmm0 - rorl $7,%edi - paddd %xmm3,%xmm7 - addl %edx,%ecx - pxor %xmm6,%xmm4 - addl 4(%esp),%ebx - xorl %edi,%ebp - movl %ecx,%esi - roll $5,%ecx - movdqa %xmm4,%xmm6 - movdqa %xmm7,48(%esp) - addl %ebp,%ebx - xorl %edi,%esi - rorl $7,%edx - addl %ecx,%ebx - pslld $2,%xmm4 - addl 8(%esp),%eax - xorl %edx,%esi - psrld $30,%xmm6 - movl %ebx,%ebp - roll $5,%ebx - addl %esi,%eax - xorl %edx,%ebp - rorl $7,%ecx - addl %ebx,%eax - por %xmm6,%xmm4 - addl 12(%esp),%edi - xorl %ecx,%ebp - movdqa 64(%esp),%xmm6 - movl %eax,%esi - roll $5,%eax - addl %ebp,%edi - xorl %ecx,%esi - rorl $7,%ebx - pshufd $238,%xmm3,%xmm7 - addl %eax,%edi - addl 16(%esp),%edx - pxor %xmm1,%xmm5 - punpcklqdq %xmm4,%xmm7 - xorl %ebx,%esi - movl %edi,%ebp - roll $5,%edi - pxor %xmm6,%xmm5 - movdqa %xmm1,64(%esp) - addl %esi,%edx - xorl %ebx,%ebp - movdqa %xmm0,%xmm1 - rorl $7,%eax - paddd %xmm4,%xmm0 - addl %edi,%edx - pxor %xmm7,%xmm5 - addl 20(%esp),%ecx - xorl %eax,%ebp - movl %edx,%esi - roll $5,%edx - movdqa %xmm5,%xmm7 - movdqa %xmm0,(%esp) - addl %ebp,%ecx - xorl %eax,%esi - rorl $7,%edi - addl %edx,%ecx - pslld $2,%xmm5 - addl 24(%esp),%ebx - xorl %edi,%esi - psrld $30,%xmm7 - movl %ecx,%ebp - roll $5,%ecx - addl %esi,%ebx - xorl %edi,%ebp - rorl $7,%edx - addl %ecx,%ebx - por %xmm7,%xmm5 - addl 28(%esp),%eax - movdqa 80(%esp),%xmm7 - rorl $7,%ecx - movl %ebx,%esi - xorl %edx,%ebp - roll $5,%ebx - pshufd $238,%xmm4,%xmm0 - addl %ebp,%eax - xorl 
%ecx,%esi - xorl %edx,%ecx - addl %ebx,%eax - addl 32(%esp),%edi - pxor %xmm2,%xmm6 - punpcklqdq %xmm5,%xmm0 - andl %ecx,%esi - xorl %edx,%ecx - rorl $7,%ebx - pxor %xmm7,%xmm6 - movdqa %xmm2,80(%esp) - movl %eax,%ebp - xorl %ecx,%esi - roll $5,%eax - movdqa %xmm1,%xmm2 - addl %esi,%edi - paddd %xmm5,%xmm1 - xorl %ebx,%ebp - pxor %xmm0,%xmm6 - xorl %ecx,%ebx - addl %eax,%edi - addl 36(%esp),%edx - andl %ebx,%ebp - movdqa %xmm6,%xmm0 - movdqa %xmm1,16(%esp) - xorl %ecx,%ebx - rorl $7,%eax - movl %edi,%esi - xorl %ebx,%ebp - roll $5,%edi - pslld $2,%xmm6 - addl %ebp,%edx - xorl %eax,%esi - psrld $30,%xmm0 - xorl %ebx,%eax - addl %edi,%edx - addl 40(%esp),%ecx - andl %eax,%esi - xorl %ebx,%eax - rorl $7,%edi - por %xmm0,%xmm6 - movl %edx,%ebp - xorl %eax,%esi - movdqa 96(%esp),%xmm0 - roll $5,%edx - addl %esi,%ecx - xorl %edi,%ebp - xorl %eax,%edi - addl %edx,%ecx - pshufd $238,%xmm5,%xmm1 - addl 44(%esp),%ebx - andl %edi,%ebp - xorl %eax,%edi - rorl $7,%edx - movl %ecx,%esi - xorl %edi,%ebp - roll $5,%ecx - addl %ebp,%ebx - xorl %edx,%esi - xorl %edi,%edx - addl %ecx,%ebx - addl 48(%esp),%eax - pxor %xmm3,%xmm7 - punpcklqdq %xmm6,%xmm1 - andl %edx,%esi - xorl %edi,%edx - rorl $7,%ecx - pxor %xmm0,%xmm7 - movdqa %xmm3,96(%esp) - movl %ebx,%ebp - xorl %edx,%esi - roll $5,%ebx - movdqa 144(%esp),%xmm3 - addl %esi,%eax - paddd %xmm6,%xmm2 - xorl %ecx,%ebp - pxor %xmm1,%xmm7 - xorl %edx,%ecx - addl %ebx,%eax - addl 52(%esp),%edi - andl %ecx,%ebp - movdqa %xmm7,%xmm1 - movdqa %xmm2,32(%esp) - xorl %edx,%ecx - rorl $7,%ebx - movl %eax,%esi - xorl %ecx,%ebp - roll $5,%eax - pslld $2,%xmm7 - addl %ebp,%edi - xorl %ebx,%esi - psrld $30,%xmm1 - xorl %ecx,%ebx - addl %eax,%edi - addl 56(%esp),%edx - andl %ebx,%esi - xorl %ecx,%ebx - rorl $7,%eax - por %xmm1,%xmm7 - movl %edi,%ebp - xorl %ebx,%esi - movdqa 64(%esp),%xmm1 - roll $5,%edi - addl %esi,%edx - xorl %eax,%ebp - xorl %ebx,%eax - addl %edi,%edx - pshufd $238,%xmm6,%xmm2 - addl 60(%esp),%ecx - andl %eax,%ebp - xorl 
%ebx,%eax - rorl $7,%edi - movl %edx,%esi - xorl %eax,%ebp - roll $5,%edx - addl %ebp,%ecx - xorl %edi,%esi - xorl %eax,%edi - addl %edx,%ecx - addl (%esp),%ebx - pxor %xmm4,%xmm0 - punpcklqdq %xmm7,%xmm2 - andl %edi,%esi - xorl %eax,%edi - rorl $7,%edx - pxor %xmm1,%xmm0 - movdqa %xmm4,64(%esp) - movl %ecx,%ebp - xorl %edi,%esi - roll $5,%ecx - movdqa %xmm3,%xmm4 - addl %esi,%ebx - paddd %xmm7,%xmm3 - xorl %edx,%ebp - pxor %xmm2,%xmm0 - xorl %edi,%edx - addl %ecx,%ebx - addl 4(%esp),%eax - andl %edx,%ebp - movdqa %xmm0,%xmm2 - movdqa %xmm3,48(%esp) - xorl %edi,%edx - rorl $7,%ecx - movl %ebx,%esi - xorl %edx,%ebp - roll $5,%ebx - pslld $2,%xmm0 - addl %ebp,%eax - xorl %ecx,%esi - psrld $30,%xmm2 - xorl %edx,%ecx - addl %ebx,%eax - addl 8(%esp),%edi - andl %ecx,%esi - xorl %edx,%ecx - rorl $7,%ebx - por %xmm2,%xmm0 - movl %eax,%ebp - xorl %ecx,%esi - movdqa 80(%esp),%xmm2 - roll $5,%eax - addl %esi,%edi - xorl %ebx,%ebp - xorl %ecx,%ebx - addl %eax,%edi - pshufd $238,%xmm7,%xmm3 - addl 12(%esp),%edx - andl %ebx,%ebp - xorl %ecx,%ebx - rorl $7,%eax - movl %edi,%esi - xorl %ebx,%ebp - roll $5,%edi - addl %ebp,%edx - xorl %eax,%esi - xorl %ebx,%eax - addl %edi,%edx - addl 16(%esp),%ecx - pxor %xmm5,%xmm1 - punpcklqdq %xmm0,%xmm3 - andl %eax,%esi - xorl %ebx,%eax - rorl $7,%edi - pxor %xmm2,%xmm1 - movdqa %xmm5,80(%esp) - movl %edx,%ebp - xorl %eax,%esi - roll $5,%edx - movdqa %xmm4,%xmm5 - addl %esi,%ecx - paddd %xmm0,%xmm4 - xorl %edi,%ebp - pxor %xmm3,%xmm1 - xorl %eax,%edi - addl %edx,%ecx - addl 20(%esp),%ebx - andl %edi,%ebp - movdqa %xmm1,%xmm3 - movdqa %xmm4,(%esp) - xorl %eax,%edi - rorl $7,%edx - movl %ecx,%esi - xorl %edi,%ebp - roll $5,%ecx - pslld $2,%xmm1 - addl %ebp,%ebx - xorl %edx,%esi - psrld $30,%xmm3 - xorl %edi,%edx - addl %ecx,%ebx - addl 24(%esp),%eax - andl %edx,%esi - xorl %edi,%edx - rorl $7,%ecx - por %xmm3,%xmm1 - movl %ebx,%ebp - xorl %edx,%esi - movdqa 96(%esp),%xmm3 - roll $5,%ebx - addl %esi,%eax - xorl %ecx,%ebp - xorl %edx,%ecx - addl 
%ebx,%eax - pshufd $238,%xmm0,%xmm4 - addl 28(%esp),%edi - andl %ecx,%ebp - xorl %edx,%ecx - rorl $7,%ebx - movl %eax,%esi - xorl %ecx,%ebp - roll $5,%eax - addl %ebp,%edi - xorl %ebx,%esi - xorl %ecx,%ebx - addl %eax,%edi - addl 32(%esp),%edx - pxor %xmm6,%xmm2 - punpcklqdq %xmm1,%xmm4 - andl %ebx,%esi - xorl %ecx,%ebx - rorl $7,%eax - pxor %xmm3,%xmm2 - movdqa %xmm6,96(%esp) - movl %edi,%ebp - xorl %ebx,%esi - roll $5,%edi - movdqa %xmm5,%xmm6 - addl %esi,%edx - paddd %xmm1,%xmm5 - xorl %eax,%ebp - pxor %xmm4,%xmm2 - xorl %ebx,%eax - addl %edi,%edx - addl 36(%esp),%ecx - andl %eax,%ebp - movdqa %xmm2,%xmm4 - movdqa %xmm5,16(%esp) - xorl %ebx,%eax - rorl $7,%edi - movl %edx,%esi - xorl %eax,%ebp - roll $5,%edx - pslld $2,%xmm2 - addl %ebp,%ecx - xorl %edi,%esi - psrld $30,%xmm4 - xorl %eax,%edi - addl %edx,%ecx - addl 40(%esp),%ebx - andl %edi,%esi - xorl %eax,%edi - rorl $7,%edx - por %xmm4,%xmm2 - movl %ecx,%ebp - xorl %edi,%esi - movdqa 64(%esp),%xmm4 - roll $5,%ecx - addl %esi,%ebx - xorl %edx,%ebp - xorl %edi,%edx - addl %ecx,%ebx - pshufd $238,%xmm1,%xmm5 - addl 44(%esp),%eax - andl %edx,%ebp - xorl %edi,%edx - rorl $7,%ecx - movl %ebx,%esi - xorl %edx,%ebp - roll $5,%ebx - addl %ebp,%eax - xorl %edx,%esi - addl %ebx,%eax - addl 48(%esp),%edi - pxor %xmm7,%xmm3 - punpcklqdq %xmm2,%xmm5 - xorl %ecx,%esi - movl %eax,%ebp - roll $5,%eax - pxor %xmm4,%xmm3 - movdqa %xmm7,64(%esp) - addl %esi,%edi - xorl %ecx,%ebp - movdqa %xmm6,%xmm7 - rorl $7,%ebx - paddd %xmm2,%xmm6 - addl %eax,%edi - pxor %xmm5,%xmm3 - addl 52(%esp),%edx - xorl %ebx,%ebp - movl %edi,%esi - roll $5,%edi - movdqa %xmm3,%xmm5 - movdqa %xmm6,32(%esp) - addl %ebp,%edx - xorl %ebx,%esi - rorl $7,%eax - addl %edi,%edx - pslld $2,%xmm3 - addl 56(%esp),%ecx - xorl %eax,%esi - psrld $30,%xmm5 - movl %edx,%ebp - roll $5,%edx - addl %esi,%ecx - xorl %eax,%ebp - rorl $7,%edi - addl %edx,%ecx - por %xmm5,%xmm3 - addl 60(%esp),%ebx - xorl %edi,%ebp - movl %ecx,%esi - roll $5,%ecx - addl %ebp,%ebx - xorl 
%edi,%esi - rorl $7,%edx - addl %ecx,%ebx - addl (%esp),%eax - xorl %edx,%esi - movl %ebx,%ebp - roll $5,%ebx - addl %esi,%eax - xorl %edx,%ebp - rorl $7,%ecx - paddd %xmm3,%xmm7 - addl %ebx,%eax - addl 4(%esp),%edi - xorl %ecx,%ebp - movl %eax,%esi - movdqa %xmm7,48(%esp) - roll $5,%eax - addl %ebp,%edi - xorl %ecx,%esi - rorl $7,%ebx - addl %eax,%edi - addl 8(%esp),%edx - xorl %ebx,%esi - movl %edi,%ebp - roll $5,%edi - addl %esi,%edx - xorl %ebx,%ebp - rorl $7,%eax - addl %edi,%edx - addl 12(%esp),%ecx - xorl %eax,%ebp - movl %edx,%esi - roll $5,%edx - addl %ebp,%ecx - xorl %eax,%esi - rorl $7,%edi - addl %edx,%ecx - movl 196(%esp),%ebp - cmpl 200(%esp),%ebp - je L005done - movdqa 160(%esp),%xmm7 - movdqa 176(%esp),%xmm6 - movdqu (%ebp),%xmm0 - movdqu 16(%ebp),%xmm1 - movdqu 32(%ebp),%xmm2 - movdqu 48(%ebp),%xmm3 - addl $64,%ebp -.byte 102,15,56,0,198 - movl %ebp,196(%esp) - movdqa %xmm7,96(%esp) - addl 16(%esp),%ebx - xorl %edi,%esi - movl %ecx,%ebp - roll $5,%ecx - addl %esi,%ebx - xorl %edi,%ebp - rorl $7,%edx -.byte 102,15,56,0,206 - addl %ecx,%ebx - addl 20(%esp),%eax - xorl %edx,%ebp - movl %ebx,%esi - paddd %xmm7,%xmm0 - roll $5,%ebx - addl %ebp,%eax - xorl %edx,%esi - rorl $7,%ecx - movdqa %xmm0,(%esp) - addl %ebx,%eax - addl 24(%esp),%edi - xorl %ecx,%esi - movl %eax,%ebp - psubd %xmm7,%xmm0 - roll $5,%eax - addl %esi,%edi - xorl %ecx,%ebp - rorl $7,%ebx - addl %eax,%edi - addl 28(%esp),%edx - xorl %ebx,%ebp - movl %edi,%esi - roll $5,%edi - addl %ebp,%edx - xorl %ebx,%esi - rorl $7,%eax - addl %edi,%edx - addl 32(%esp),%ecx - xorl %eax,%esi - movl %edx,%ebp - roll $5,%edx - addl %esi,%ecx - xorl %eax,%ebp - rorl $7,%edi -.byte 102,15,56,0,214 - addl %edx,%ecx - addl 36(%esp),%ebx - xorl %edi,%ebp - movl %ecx,%esi - paddd %xmm7,%xmm1 - roll $5,%ecx - addl %ebp,%ebx - xorl %edi,%esi - rorl $7,%edx - movdqa %xmm1,16(%esp) - addl %ecx,%ebx - addl 40(%esp),%eax - xorl %edx,%esi - movl %ebx,%ebp - psubd %xmm7,%xmm1 - roll $5,%ebx - addl %esi,%eax - xorl 
%edx,%ebp - rorl $7,%ecx - addl %ebx,%eax - addl 44(%esp),%edi - xorl %ecx,%ebp - movl %eax,%esi - roll $5,%eax - addl %ebp,%edi - xorl %ecx,%esi - rorl $7,%ebx - addl %eax,%edi - addl 48(%esp),%edx - xorl %ebx,%esi - movl %edi,%ebp - roll $5,%edi - addl %esi,%edx - xorl %ebx,%ebp - rorl $7,%eax -.byte 102,15,56,0,222 - addl %edi,%edx - addl 52(%esp),%ecx - xorl %eax,%ebp - movl %edx,%esi - paddd %xmm7,%xmm2 - roll $5,%edx - addl %ebp,%ecx - xorl %eax,%esi - rorl $7,%edi - movdqa %xmm2,32(%esp) - addl %edx,%ecx - addl 56(%esp),%ebx - xorl %edi,%esi - movl %ecx,%ebp - psubd %xmm7,%xmm2 - roll $5,%ecx - addl %esi,%ebx - xorl %edi,%ebp - rorl $7,%edx - addl %ecx,%ebx - addl 60(%esp),%eax - xorl %edx,%ebp - movl %ebx,%esi - roll $5,%ebx - addl %ebp,%eax - rorl $7,%ecx - addl %ebx,%eax - movl 192(%esp),%ebp - addl (%ebp),%eax - addl 4(%ebp),%esi - addl 8(%ebp),%ecx - movl %eax,(%ebp) - addl 12(%ebp),%edx - movl %esi,4(%ebp) - addl 16(%ebp),%edi - movl %ecx,8(%ebp) - movl %ecx,%ebx - movl %edx,12(%ebp) - xorl %edx,%ebx - movl %edi,16(%ebp) - movl %esi,%ebp - pshufd $238,%xmm0,%xmm4 - andl %ebx,%esi - movl %ebp,%ebx - jmp L004loop -.align 4,0x90 -L005done: - addl 16(%esp),%ebx - xorl %edi,%esi - movl %ecx,%ebp - roll $5,%ecx - addl %esi,%ebx - xorl %edi,%ebp - rorl $7,%edx - addl %ecx,%ebx - addl 20(%esp),%eax - xorl %edx,%ebp - movl %ebx,%esi - roll $5,%ebx - addl %ebp,%eax - xorl %edx,%esi - rorl $7,%ecx - addl %ebx,%eax - addl 24(%esp),%edi - xorl %ecx,%esi - movl %eax,%ebp - roll $5,%eax - addl %esi,%edi - xorl %ecx,%ebp - rorl $7,%ebx - addl %eax,%edi - addl 28(%esp),%edx - xorl %ebx,%ebp - movl %edi,%esi - roll $5,%edi - addl %ebp,%edx - xorl %ebx,%esi - rorl $7,%eax - addl %edi,%edx - addl 32(%esp),%ecx - xorl %eax,%esi - movl %edx,%ebp - roll $5,%edx - addl %esi,%ecx - xorl %eax,%ebp - rorl $7,%edi - addl %edx,%ecx - addl 36(%esp),%ebx - xorl %edi,%ebp - movl %ecx,%esi - roll $5,%ecx - addl %ebp,%ebx - xorl %edi,%esi - rorl $7,%edx - addl %ecx,%ebx - addl 
40(%esp),%eax - xorl %edx,%esi - movl %ebx,%ebp - roll $5,%ebx - addl %esi,%eax - xorl %edx,%ebp - rorl $7,%ecx - addl %ebx,%eax - addl 44(%esp),%edi - xorl %ecx,%ebp - movl %eax,%esi - roll $5,%eax - addl %ebp,%edi - xorl %ecx,%esi - rorl $7,%ebx - addl %eax,%edi - addl 48(%esp),%edx - xorl %ebx,%esi - movl %edi,%ebp - roll $5,%edi - addl %esi,%edx - xorl %ebx,%ebp - rorl $7,%eax - addl %edi,%edx - addl 52(%esp),%ecx - xorl %eax,%ebp - movl %edx,%esi - roll $5,%edx - addl %ebp,%ecx - xorl %eax,%esi - rorl $7,%edi - addl %edx,%ecx - addl 56(%esp),%ebx - xorl %edi,%esi - movl %ecx,%ebp - roll $5,%ecx - addl %esi,%ebx - xorl %edi,%ebp - rorl $7,%edx - addl %ecx,%ebx - addl 60(%esp),%eax - xorl %edx,%ebp - movl %ebx,%esi - roll $5,%ebx - addl %ebp,%eax - rorl $7,%ecx - addl %ebx,%eax - movl 192(%esp),%ebp - addl (%ebp),%eax - movl 204(%esp),%esp - addl 4(%ebp),%esi - addl 8(%ebp),%ecx - movl %eax,(%ebp) - addl 12(%ebp),%edx - movl %esi,4(%ebp) - addl 16(%ebp),%edi - movl %ecx,8(%ebp) - movl %edx,12(%ebp) - movl %edi,16(%ebp) - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.private_extern __sha1_block_data_order_avx -.align 4 -__sha1_block_data_order_avx: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - call L006pic_point -L006pic_point: - popl %ebp - leal LK_XX_XX-L006pic_point(%ebp),%ebp -Lavx_shortcut: - vzeroall - vmovdqa (%ebp),%xmm7 - vmovdqa 16(%ebp),%xmm0 - vmovdqa 32(%ebp),%xmm1 - vmovdqa 48(%ebp),%xmm2 - vmovdqa 64(%ebp),%xmm6 - movl 20(%esp),%edi - movl 24(%esp),%ebp - movl 28(%esp),%edx - movl %esp,%esi - subl $208,%esp - andl $-64,%esp - vmovdqa %xmm0,112(%esp) - vmovdqa %xmm1,128(%esp) - vmovdqa %xmm2,144(%esp) - shll $6,%edx - vmovdqa %xmm7,160(%esp) - addl %ebp,%edx - vmovdqa %xmm6,176(%esp) - addl $64,%ebp - movl %edi,192(%esp) - movl %ebp,196(%esp) - movl %edx,200(%esp) - movl %esi,204(%esp) - movl (%edi),%eax - movl 4(%edi),%ebx - movl 8(%edi),%ecx - movl 12(%edi),%edx - movl 16(%edi),%edi - movl %ebx,%esi - vmovdqu -64(%ebp),%xmm0 - 
vmovdqu -48(%ebp),%xmm1 - vmovdqu -32(%ebp),%xmm2 - vmovdqu -16(%ebp),%xmm3 - vpshufb %xmm6,%xmm0,%xmm0 - vpshufb %xmm6,%xmm1,%xmm1 - vpshufb %xmm6,%xmm2,%xmm2 - vmovdqa %xmm7,96(%esp) - vpshufb %xmm6,%xmm3,%xmm3 - vpaddd %xmm7,%xmm0,%xmm4 - vpaddd %xmm7,%xmm1,%xmm5 - vpaddd %xmm7,%xmm2,%xmm6 - vmovdqa %xmm4,(%esp) - movl %ecx,%ebp - vmovdqa %xmm5,16(%esp) - xorl %edx,%ebp - vmovdqa %xmm6,32(%esp) - andl %ebp,%esi - jmp L007loop -.align 4,0x90 -L007loop: - shrdl $2,%ebx,%ebx - xorl %edx,%esi - vpalignr $8,%xmm0,%xmm1,%xmm4 - movl %eax,%ebp - addl (%esp),%edi - vpaddd %xmm3,%xmm7,%xmm7 - vmovdqa %xmm0,64(%esp) - xorl %ecx,%ebx - shldl $5,%eax,%eax - vpsrldq $4,%xmm3,%xmm6 - addl %esi,%edi - andl %ebx,%ebp - vpxor %xmm0,%xmm4,%xmm4 - xorl %ecx,%ebx - addl %eax,%edi - vpxor %xmm2,%xmm6,%xmm6 - shrdl $7,%eax,%eax - xorl %ecx,%ebp - vmovdqa %xmm7,48(%esp) - movl %edi,%esi - addl 4(%esp),%edx - vpxor %xmm6,%xmm4,%xmm4 - xorl %ebx,%eax - shldl $5,%edi,%edi - addl %ebp,%edx - andl %eax,%esi - vpsrld $31,%xmm4,%xmm6 - xorl %ebx,%eax - addl %edi,%edx - shrdl $7,%edi,%edi - xorl %ebx,%esi - vpslldq $12,%xmm4,%xmm0 - vpaddd %xmm4,%xmm4,%xmm4 - movl %edx,%ebp - addl 8(%esp),%ecx - xorl %eax,%edi - shldl $5,%edx,%edx - vpsrld $30,%xmm0,%xmm7 - vpor %xmm6,%xmm4,%xmm4 - addl %esi,%ecx - andl %edi,%ebp - xorl %eax,%edi - addl %edx,%ecx - vpslld $2,%xmm0,%xmm0 - shrdl $7,%edx,%edx - xorl %eax,%ebp - vpxor %xmm7,%xmm4,%xmm4 - movl %ecx,%esi - addl 12(%esp),%ebx - xorl %edi,%edx - shldl $5,%ecx,%ecx - vpxor %xmm0,%xmm4,%xmm4 - addl %ebp,%ebx - andl %edx,%esi - vmovdqa 96(%esp),%xmm0 - xorl %edi,%edx - addl %ecx,%ebx - shrdl $7,%ecx,%ecx - xorl %edi,%esi - vpalignr $8,%xmm1,%xmm2,%xmm5 - movl %ebx,%ebp - addl 16(%esp),%eax - vpaddd %xmm4,%xmm0,%xmm0 - vmovdqa %xmm1,80(%esp) - xorl %edx,%ecx - shldl $5,%ebx,%ebx - vpsrldq $4,%xmm4,%xmm7 - addl %esi,%eax - andl %ecx,%ebp - vpxor %xmm1,%xmm5,%xmm5 - xorl %edx,%ecx - addl %ebx,%eax - vpxor %xmm3,%xmm7,%xmm7 - shrdl $7,%ebx,%ebx - xorl 
%edx,%ebp - vmovdqa %xmm0,(%esp) - movl %eax,%esi - addl 20(%esp),%edi - vpxor %xmm7,%xmm5,%xmm5 - xorl %ecx,%ebx - shldl $5,%eax,%eax - addl %ebp,%edi - andl %ebx,%esi - vpsrld $31,%xmm5,%xmm7 - xorl %ecx,%ebx - addl %eax,%edi - shrdl $7,%eax,%eax - xorl %ecx,%esi - vpslldq $12,%xmm5,%xmm1 - vpaddd %xmm5,%xmm5,%xmm5 - movl %edi,%ebp - addl 24(%esp),%edx - xorl %ebx,%eax - shldl $5,%edi,%edi - vpsrld $30,%xmm1,%xmm0 - vpor %xmm7,%xmm5,%xmm5 - addl %esi,%edx - andl %eax,%ebp - xorl %ebx,%eax - addl %edi,%edx - vpslld $2,%xmm1,%xmm1 - shrdl $7,%edi,%edi - xorl %ebx,%ebp - vpxor %xmm0,%xmm5,%xmm5 - movl %edx,%esi - addl 28(%esp),%ecx - xorl %eax,%edi - shldl $5,%edx,%edx - vpxor %xmm1,%xmm5,%xmm5 - addl %ebp,%ecx - andl %edi,%esi - vmovdqa 112(%esp),%xmm1 - xorl %eax,%edi - addl %edx,%ecx - shrdl $7,%edx,%edx - xorl %eax,%esi - vpalignr $8,%xmm2,%xmm3,%xmm6 - movl %ecx,%ebp - addl 32(%esp),%ebx - vpaddd %xmm5,%xmm1,%xmm1 - vmovdqa %xmm2,96(%esp) - xorl %edi,%edx - shldl $5,%ecx,%ecx - vpsrldq $4,%xmm5,%xmm0 - addl %esi,%ebx - andl %edx,%ebp - vpxor %xmm2,%xmm6,%xmm6 - xorl %edi,%edx - addl %ecx,%ebx - vpxor %xmm4,%xmm0,%xmm0 - shrdl $7,%ecx,%ecx - xorl %edi,%ebp - vmovdqa %xmm1,16(%esp) - movl %ebx,%esi - addl 36(%esp),%eax - vpxor %xmm0,%xmm6,%xmm6 - xorl %edx,%ecx - shldl $5,%ebx,%ebx - addl %ebp,%eax - andl %ecx,%esi - vpsrld $31,%xmm6,%xmm0 - xorl %edx,%ecx - addl %ebx,%eax - shrdl $7,%ebx,%ebx - xorl %edx,%esi - vpslldq $12,%xmm6,%xmm2 - vpaddd %xmm6,%xmm6,%xmm6 - movl %eax,%ebp - addl 40(%esp),%edi - xorl %ecx,%ebx - shldl $5,%eax,%eax - vpsrld $30,%xmm2,%xmm1 - vpor %xmm0,%xmm6,%xmm6 - addl %esi,%edi - andl %ebx,%ebp - xorl %ecx,%ebx - addl %eax,%edi - vpslld $2,%xmm2,%xmm2 - vmovdqa 64(%esp),%xmm0 - shrdl $7,%eax,%eax - xorl %ecx,%ebp - vpxor %xmm1,%xmm6,%xmm6 - movl %edi,%esi - addl 44(%esp),%edx - xorl %ebx,%eax - shldl $5,%edi,%edi - vpxor %xmm2,%xmm6,%xmm6 - addl %ebp,%edx - andl %eax,%esi - vmovdqa 112(%esp),%xmm2 - xorl %ebx,%eax - addl %edi,%edx - shrdl 
$7,%edi,%edi - xorl %ebx,%esi - vpalignr $8,%xmm3,%xmm4,%xmm7 - movl %edx,%ebp - addl 48(%esp),%ecx - vpaddd %xmm6,%xmm2,%xmm2 - vmovdqa %xmm3,64(%esp) - xorl %eax,%edi - shldl $5,%edx,%edx - vpsrldq $4,%xmm6,%xmm1 - addl %esi,%ecx - andl %edi,%ebp - vpxor %xmm3,%xmm7,%xmm7 - xorl %eax,%edi - addl %edx,%ecx - vpxor %xmm5,%xmm1,%xmm1 - shrdl $7,%edx,%edx - xorl %eax,%ebp - vmovdqa %xmm2,32(%esp) - movl %ecx,%esi - addl 52(%esp),%ebx - vpxor %xmm1,%xmm7,%xmm7 - xorl %edi,%edx - shldl $5,%ecx,%ecx - addl %ebp,%ebx - andl %edx,%esi - vpsrld $31,%xmm7,%xmm1 - xorl %edi,%edx - addl %ecx,%ebx - shrdl $7,%ecx,%ecx - xorl %edi,%esi - vpslldq $12,%xmm7,%xmm3 - vpaddd %xmm7,%xmm7,%xmm7 - movl %ebx,%ebp - addl 56(%esp),%eax - xorl %edx,%ecx - shldl $5,%ebx,%ebx - vpsrld $30,%xmm3,%xmm2 - vpor %xmm1,%xmm7,%xmm7 - addl %esi,%eax - andl %ecx,%ebp - xorl %edx,%ecx - addl %ebx,%eax - vpslld $2,%xmm3,%xmm3 - vmovdqa 80(%esp),%xmm1 - shrdl $7,%ebx,%ebx - xorl %edx,%ebp - vpxor %xmm2,%xmm7,%xmm7 - movl %eax,%esi - addl 60(%esp),%edi - xorl %ecx,%ebx - shldl $5,%eax,%eax - vpxor %xmm3,%xmm7,%xmm7 - addl %ebp,%edi - andl %ebx,%esi - vmovdqa 112(%esp),%xmm3 - xorl %ecx,%ebx - addl %eax,%edi - vpalignr $8,%xmm6,%xmm7,%xmm2 - vpxor %xmm4,%xmm0,%xmm0 - shrdl $7,%eax,%eax - xorl %ecx,%esi - movl %edi,%ebp - addl (%esp),%edx - vpxor %xmm1,%xmm0,%xmm0 - vmovdqa %xmm4,80(%esp) - xorl %ebx,%eax - shldl $5,%edi,%edi - vmovdqa %xmm3,%xmm4 - vpaddd %xmm7,%xmm3,%xmm3 - addl %esi,%edx - andl %eax,%ebp - vpxor %xmm2,%xmm0,%xmm0 - xorl %ebx,%eax - addl %edi,%edx - shrdl $7,%edi,%edi - xorl %ebx,%ebp - vpsrld $30,%xmm0,%xmm2 - vmovdqa %xmm3,48(%esp) - movl %edx,%esi - addl 4(%esp),%ecx - xorl %eax,%edi - shldl $5,%edx,%edx - vpslld $2,%xmm0,%xmm0 - addl %ebp,%ecx - andl %edi,%esi - xorl %eax,%edi - addl %edx,%ecx - shrdl $7,%edx,%edx - xorl %eax,%esi - movl %ecx,%ebp - addl 8(%esp),%ebx - vpor %xmm2,%xmm0,%xmm0 - xorl %edi,%edx - shldl $5,%ecx,%ecx - vmovdqa 96(%esp),%xmm2 - addl %esi,%ebx - andl 
%edx,%ebp - xorl %edi,%edx - addl %ecx,%ebx - addl 12(%esp),%eax - xorl %edi,%ebp - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %ebp,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpalignr $8,%xmm7,%xmm0,%xmm3 - vpxor %xmm5,%xmm1,%xmm1 - addl 16(%esp),%edi - xorl %ecx,%esi - movl %eax,%ebp - shldl $5,%eax,%eax - vpxor %xmm2,%xmm1,%xmm1 - vmovdqa %xmm5,96(%esp) - addl %esi,%edi - xorl %ecx,%ebp - vmovdqa %xmm4,%xmm5 - vpaddd %xmm0,%xmm4,%xmm4 - shrdl $7,%ebx,%ebx - addl %eax,%edi - vpxor %xmm3,%xmm1,%xmm1 - addl 20(%esp),%edx - xorl %ebx,%ebp - movl %edi,%esi - shldl $5,%edi,%edi - vpsrld $30,%xmm1,%xmm3 - vmovdqa %xmm4,(%esp) - addl %ebp,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %edi,%edx - vpslld $2,%xmm1,%xmm1 - addl 24(%esp),%ecx - xorl %eax,%esi - movl %edx,%ebp - shldl $5,%edx,%edx - addl %esi,%ecx - xorl %eax,%ebp - shrdl $7,%edi,%edi - addl %edx,%ecx - vpor %xmm3,%xmm1,%xmm1 - addl 28(%esp),%ebx - xorl %edi,%ebp - vmovdqa 64(%esp),%xmm3 - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %ebp,%ebx - xorl %edi,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpalignr $8,%xmm0,%xmm1,%xmm4 - vpxor %xmm6,%xmm2,%xmm2 - addl 32(%esp),%eax - xorl %edx,%esi - movl %ebx,%ebp - shldl $5,%ebx,%ebx - vpxor %xmm3,%xmm2,%xmm2 - vmovdqa %xmm6,64(%esp) - addl %esi,%eax - xorl %edx,%ebp - vmovdqa 128(%esp),%xmm6 - vpaddd %xmm1,%xmm5,%xmm5 - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpxor %xmm4,%xmm2,%xmm2 - addl 36(%esp),%edi - xorl %ecx,%ebp - movl %eax,%esi - shldl $5,%eax,%eax - vpsrld $30,%xmm2,%xmm4 - vmovdqa %xmm5,16(%esp) - addl %ebp,%edi - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%edi - vpslld $2,%xmm2,%xmm2 - addl 40(%esp),%edx - xorl %ebx,%esi - movl %edi,%ebp - shldl $5,%edi,%edi - addl %esi,%edx - xorl %ebx,%ebp - shrdl $7,%eax,%eax - addl %edi,%edx - vpor %xmm4,%xmm2,%xmm2 - addl 44(%esp),%ecx - xorl %eax,%ebp - vmovdqa 80(%esp),%xmm4 - movl %edx,%esi - shldl $5,%edx,%edx - addl %ebp,%ecx - xorl %eax,%esi - shrdl $7,%edi,%edi - addl %edx,%ecx - 
vpalignr $8,%xmm1,%xmm2,%xmm5 - vpxor %xmm7,%xmm3,%xmm3 - addl 48(%esp),%ebx - xorl %edi,%esi - movl %ecx,%ebp - shldl $5,%ecx,%ecx - vpxor %xmm4,%xmm3,%xmm3 - vmovdqa %xmm7,80(%esp) - addl %esi,%ebx - xorl %edi,%ebp - vmovdqa %xmm6,%xmm7 - vpaddd %xmm2,%xmm6,%xmm6 - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpxor %xmm5,%xmm3,%xmm3 - addl 52(%esp),%eax - xorl %edx,%ebp - movl %ebx,%esi - shldl $5,%ebx,%ebx - vpsrld $30,%xmm3,%xmm5 - vmovdqa %xmm6,32(%esp) - addl %ebp,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpslld $2,%xmm3,%xmm3 - addl 56(%esp),%edi - xorl %ecx,%esi - movl %eax,%ebp - shldl $5,%eax,%eax - addl %esi,%edi - xorl %ecx,%ebp - shrdl $7,%ebx,%ebx - addl %eax,%edi - vpor %xmm5,%xmm3,%xmm3 - addl 60(%esp),%edx - xorl %ebx,%ebp - vmovdqa 96(%esp),%xmm5 - movl %edi,%esi - shldl $5,%edi,%edi - addl %ebp,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %edi,%edx - vpalignr $8,%xmm2,%xmm3,%xmm6 - vpxor %xmm0,%xmm4,%xmm4 - addl (%esp),%ecx - xorl %eax,%esi - movl %edx,%ebp - shldl $5,%edx,%edx - vpxor %xmm5,%xmm4,%xmm4 - vmovdqa %xmm0,96(%esp) - addl %esi,%ecx - xorl %eax,%ebp - vmovdqa %xmm7,%xmm0 - vpaddd %xmm3,%xmm7,%xmm7 - shrdl $7,%edi,%edi - addl %edx,%ecx - vpxor %xmm6,%xmm4,%xmm4 - addl 4(%esp),%ebx - xorl %edi,%ebp - movl %ecx,%esi - shldl $5,%ecx,%ecx - vpsrld $30,%xmm4,%xmm6 - vmovdqa %xmm7,48(%esp) - addl %ebp,%ebx - xorl %edi,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpslld $2,%xmm4,%xmm4 - addl 8(%esp),%eax - xorl %edx,%esi - movl %ebx,%ebp - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %edx,%ebp - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpor %xmm6,%xmm4,%xmm4 - addl 12(%esp),%edi - xorl %ecx,%ebp - vmovdqa 64(%esp),%xmm6 - movl %eax,%esi - shldl $5,%eax,%eax - addl %ebp,%edi - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%edi - vpalignr $8,%xmm3,%xmm4,%xmm7 - vpxor %xmm1,%xmm5,%xmm5 - addl 16(%esp),%edx - xorl %ebx,%esi - movl %edi,%ebp - shldl $5,%edi,%edi - vpxor %xmm6,%xmm5,%xmm5 - vmovdqa %xmm1,64(%esp) - addl %esi,%edx - 
xorl %ebx,%ebp - vmovdqa %xmm0,%xmm1 - vpaddd %xmm4,%xmm0,%xmm0 - shrdl $7,%eax,%eax - addl %edi,%edx - vpxor %xmm7,%xmm5,%xmm5 - addl 20(%esp),%ecx - xorl %eax,%ebp - movl %edx,%esi - shldl $5,%edx,%edx - vpsrld $30,%xmm5,%xmm7 - vmovdqa %xmm0,(%esp) - addl %ebp,%ecx - xorl %eax,%esi - shrdl $7,%edi,%edi - addl %edx,%ecx - vpslld $2,%xmm5,%xmm5 - addl 24(%esp),%ebx - xorl %edi,%esi - movl %ecx,%ebp - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %edi,%ebp - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpor %xmm7,%xmm5,%xmm5 - addl 28(%esp),%eax - vmovdqa 80(%esp),%xmm7 - shrdl $7,%ecx,%ecx - movl %ebx,%esi - xorl %edx,%ebp - shldl $5,%ebx,%ebx - addl %ebp,%eax - xorl %ecx,%esi - xorl %edx,%ecx - addl %ebx,%eax - vpalignr $8,%xmm4,%xmm5,%xmm0 - vpxor %xmm2,%xmm6,%xmm6 - addl 32(%esp),%edi - andl %ecx,%esi - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - vpxor %xmm7,%xmm6,%xmm6 - vmovdqa %xmm2,80(%esp) - movl %eax,%ebp - xorl %ecx,%esi - vmovdqa %xmm1,%xmm2 - vpaddd %xmm5,%xmm1,%xmm1 - shldl $5,%eax,%eax - addl %esi,%edi - vpxor %xmm0,%xmm6,%xmm6 - xorl %ebx,%ebp - xorl %ecx,%ebx - addl %eax,%edi - addl 36(%esp),%edx - vpsrld $30,%xmm6,%xmm0 - vmovdqa %xmm1,16(%esp) - andl %ebx,%ebp - xorl %ecx,%ebx - shrdl $7,%eax,%eax - movl %edi,%esi - vpslld $2,%xmm6,%xmm6 - xorl %ebx,%ebp - shldl $5,%edi,%edi - addl %ebp,%edx - xorl %eax,%esi - xorl %ebx,%eax - addl %edi,%edx - addl 40(%esp),%ecx - andl %eax,%esi - vpor %xmm0,%xmm6,%xmm6 - xorl %ebx,%eax - shrdl $7,%edi,%edi - vmovdqa 96(%esp),%xmm0 - movl %edx,%ebp - xorl %eax,%esi - shldl $5,%edx,%edx - addl %esi,%ecx - xorl %edi,%ebp - xorl %eax,%edi - addl %edx,%ecx - addl 44(%esp),%ebx - andl %edi,%ebp - xorl %eax,%edi - shrdl $7,%edx,%edx - movl %ecx,%esi - xorl %edi,%ebp - shldl $5,%ecx,%ecx - addl %ebp,%ebx - xorl %edx,%esi - xorl %edi,%edx - addl %ecx,%ebx - vpalignr $8,%xmm5,%xmm6,%xmm1 - vpxor %xmm3,%xmm7,%xmm7 - addl 48(%esp),%eax - andl %edx,%esi - xorl %edi,%edx - shrdl $7,%ecx,%ecx - vpxor %xmm0,%xmm7,%xmm7 - vmovdqa %xmm3,96(%esp) 
- movl %ebx,%ebp - xorl %edx,%esi - vmovdqa 144(%esp),%xmm3 - vpaddd %xmm6,%xmm2,%xmm2 - shldl $5,%ebx,%ebx - addl %esi,%eax - vpxor %xmm1,%xmm7,%xmm7 - xorl %ecx,%ebp - xorl %edx,%ecx - addl %ebx,%eax - addl 52(%esp),%edi - vpsrld $30,%xmm7,%xmm1 - vmovdqa %xmm2,32(%esp) - andl %ecx,%ebp - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - movl %eax,%esi - vpslld $2,%xmm7,%xmm7 - xorl %ecx,%ebp - shldl $5,%eax,%eax - addl %ebp,%edi - xorl %ebx,%esi - xorl %ecx,%ebx - addl %eax,%edi - addl 56(%esp),%edx - andl %ebx,%esi - vpor %xmm1,%xmm7,%xmm7 - xorl %ecx,%ebx - shrdl $7,%eax,%eax - vmovdqa 64(%esp),%xmm1 - movl %edi,%ebp - xorl %ebx,%esi - shldl $5,%edi,%edi - addl %esi,%edx - xorl %eax,%ebp - xorl %ebx,%eax - addl %edi,%edx - addl 60(%esp),%ecx - andl %eax,%ebp - xorl %ebx,%eax - shrdl $7,%edi,%edi - movl %edx,%esi - xorl %eax,%ebp - shldl $5,%edx,%edx - addl %ebp,%ecx - xorl %edi,%esi - xorl %eax,%edi - addl %edx,%ecx - vpalignr $8,%xmm6,%xmm7,%xmm2 - vpxor %xmm4,%xmm0,%xmm0 - addl (%esp),%ebx - andl %edi,%esi - xorl %eax,%edi - shrdl $7,%edx,%edx - vpxor %xmm1,%xmm0,%xmm0 - vmovdqa %xmm4,64(%esp) - movl %ecx,%ebp - xorl %edi,%esi - vmovdqa %xmm3,%xmm4 - vpaddd %xmm7,%xmm3,%xmm3 - shldl $5,%ecx,%ecx - addl %esi,%ebx - vpxor %xmm2,%xmm0,%xmm0 - xorl %edx,%ebp - xorl %edi,%edx - addl %ecx,%ebx - addl 4(%esp),%eax - vpsrld $30,%xmm0,%xmm2 - vmovdqa %xmm3,48(%esp) - andl %edx,%ebp - xorl %edi,%edx - shrdl $7,%ecx,%ecx - movl %ebx,%esi - vpslld $2,%xmm0,%xmm0 - xorl %edx,%ebp - shldl $5,%ebx,%ebx - addl %ebp,%eax - xorl %ecx,%esi - xorl %edx,%ecx - addl %ebx,%eax - addl 8(%esp),%edi - andl %ecx,%esi - vpor %xmm2,%xmm0,%xmm0 - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - vmovdqa 80(%esp),%xmm2 - movl %eax,%ebp - xorl %ecx,%esi - shldl $5,%eax,%eax - addl %esi,%edi - xorl %ebx,%ebp - xorl %ecx,%ebx - addl %eax,%edi - addl 12(%esp),%edx - andl %ebx,%ebp - xorl %ecx,%ebx - shrdl $7,%eax,%eax - movl %edi,%esi - xorl %ebx,%ebp - shldl $5,%edi,%edi - addl %ebp,%edx - xorl %eax,%esi - xorl 
%ebx,%eax - addl %edi,%edx - vpalignr $8,%xmm7,%xmm0,%xmm3 - vpxor %xmm5,%xmm1,%xmm1 - addl 16(%esp),%ecx - andl %eax,%esi - xorl %ebx,%eax - shrdl $7,%edi,%edi - vpxor %xmm2,%xmm1,%xmm1 - vmovdqa %xmm5,80(%esp) - movl %edx,%ebp - xorl %eax,%esi - vmovdqa %xmm4,%xmm5 - vpaddd %xmm0,%xmm4,%xmm4 - shldl $5,%edx,%edx - addl %esi,%ecx - vpxor %xmm3,%xmm1,%xmm1 - xorl %edi,%ebp - xorl %eax,%edi - addl %edx,%ecx - addl 20(%esp),%ebx - vpsrld $30,%xmm1,%xmm3 - vmovdqa %xmm4,(%esp) - andl %edi,%ebp - xorl %eax,%edi - shrdl $7,%edx,%edx - movl %ecx,%esi - vpslld $2,%xmm1,%xmm1 - xorl %edi,%ebp - shldl $5,%ecx,%ecx - addl %ebp,%ebx - xorl %edx,%esi - xorl %edi,%edx - addl %ecx,%ebx - addl 24(%esp),%eax - andl %edx,%esi - vpor %xmm3,%xmm1,%xmm1 - xorl %edi,%edx - shrdl $7,%ecx,%ecx - vmovdqa 96(%esp),%xmm3 - movl %ebx,%ebp - xorl %edx,%esi - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %ecx,%ebp - xorl %edx,%ecx - addl %ebx,%eax - addl 28(%esp),%edi - andl %ecx,%ebp - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - movl %eax,%esi - xorl %ecx,%ebp - shldl $5,%eax,%eax - addl %ebp,%edi - xorl %ebx,%esi - xorl %ecx,%ebx - addl %eax,%edi - vpalignr $8,%xmm0,%xmm1,%xmm4 - vpxor %xmm6,%xmm2,%xmm2 - addl 32(%esp),%edx - andl %ebx,%esi - xorl %ecx,%ebx - shrdl $7,%eax,%eax - vpxor %xmm3,%xmm2,%xmm2 - vmovdqa %xmm6,96(%esp) - movl %edi,%ebp - xorl %ebx,%esi - vmovdqa %xmm5,%xmm6 - vpaddd %xmm1,%xmm5,%xmm5 - shldl $5,%edi,%edi - addl %esi,%edx - vpxor %xmm4,%xmm2,%xmm2 - xorl %eax,%ebp - xorl %ebx,%eax - addl %edi,%edx - addl 36(%esp),%ecx - vpsrld $30,%xmm2,%xmm4 - vmovdqa %xmm5,16(%esp) - andl %eax,%ebp - xorl %ebx,%eax - shrdl $7,%edi,%edi - movl %edx,%esi - vpslld $2,%xmm2,%xmm2 - xorl %eax,%ebp - shldl $5,%edx,%edx - addl %ebp,%ecx - xorl %edi,%esi - xorl %eax,%edi - addl %edx,%ecx - addl 40(%esp),%ebx - andl %edi,%esi - vpor %xmm4,%xmm2,%xmm2 - xorl %eax,%edi - shrdl $7,%edx,%edx - vmovdqa 64(%esp),%xmm4 - movl %ecx,%ebp - xorl %edi,%esi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %edx,%ebp 
- xorl %edi,%edx - addl %ecx,%ebx - addl 44(%esp),%eax - andl %edx,%ebp - xorl %edi,%edx - shrdl $7,%ecx,%ecx - movl %ebx,%esi - xorl %edx,%ebp - shldl $5,%ebx,%ebx - addl %ebp,%eax - xorl %edx,%esi - addl %ebx,%eax - vpalignr $8,%xmm1,%xmm2,%xmm5 - vpxor %xmm7,%xmm3,%xmm3 - addl 48(%esp),%edi - xorl %ecx,%esi - movl %eax,%ebp - shldl $5,%eax,%eax - vpxor %xmm4,%xmm3,%xmm3 - vmovdqa %xmm7,64(%esp) - addl %esi,%edi - xorl %ecx,%ebp - vmovdqa %xmm6,%xmm7 - vpaddd %xmm2,%xmm6,%xmm6 - shrdl $7,%ebx,%ebx - addl %eax,%edi - vpxor %xmm5,%xmm3,%xmm3 - addl 52(%esp),%edx - xorl %ebx,%ebp - movl %edi,%esi - shldl $5,%edi,%edi - vpsrld $30,%xmm3,%xmm5 - vmovdqa %xmm6,32(%esp) - addl %ebp,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %edi,%edx - vpslld $2,%xmm3,%xmm3 - addl 56(%esp),%ecx - xorl %eax,%esi - movl %edx,%ebp - shldl $5,%edx,%edx - addl %esi,%ecx - xorl %eax,%ebp - shrdl $7,%edi,%edi - addl %edx,%ecx - vpor %xmm5,%xmm3,%xmm3 - addl 60(%esp),%ebx - xorl %edi,%ebp - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %ebp,%ebx - xorl %edi,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl (%esp),%eax - vpaddd %xmm3,%xmm7,%xmm7 - xorl %edx,%esi - movl %ebx,%ebp - shldl $5,%ebx,%ebx - addl %esi,%eax - vmovdqa %xmm7,48(%esp) - xorl %edx,%ebp - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 4(%esp),%edi - xorl %ecx,%ebp - movl %eax,%esi - shldl $5,%eax,%eax - addl %ebp,%edi - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%edi - addl 8(%esp),%edx - xorl %ebx,%esi - movl %edi,%ebp - shldl $5,%edi,%edi - addl %esi,%edx - xorl %ebx,%ebp - shrdl $7,%eax,%eax - addl %edi,%edx - addl 12(%esp),%ecx - xorl %eax,%ebp - movl %edx,%esi - shldl $5,%edx,%edx - addl %ebp,%ecx - xorl %eax,%esi - shrdl $7,%edi,%edi - addl %edx,%ecx - movl 196(%esp),%ebp - cmpl 200(%esp),%ebp - je L008done - vmovdqa 160(%esp),%xmm7 - vmovdqa 176(%esp),%xmm6 - vmovdqu (%ebp),%xmm0 - vmovdqu 16(%ebp),%xmm1 - vmovdqu 32(%ebp),%xmm2 - vmovdqu 48(%ebp),%xmm3 - addl $64,%ebp - vpshufb %xmm6,%xmm0,%xmm0 - movl 
%ebp,196(%esp) - vmovdqa %xmm7,96(%esp) - addl 16(%esp),%ebx - xorl %edi,%esi - vpshufb %xmm6,%xmm1,%xmm1 - movl %ecx,%ebp - shldl $5,%ecx,%ecx - vpaddd %xmm7,%xmm0,%xmm4 - addl %esi,%ebx - xorl %edi,%ebp - shrdl $7,%edx,%edx - addl %ecx,%ebx - vmovdqa %xmm4,(%esp) - addl 20(%esp),%eax - xorl %edx,%ebp - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %ebp,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 24(%esp),%edi - xorl %ecx,%esi - movl %eax,%ebp - shldl $5,%eax,%eax - addl %esi,%edi - xorl %ecx,%ebp - shrdl $7,%ebx,%ebx - addl %eax,%edi - addl 28(%esp),%edx - xorl %ebx,%ebp - movl %edi,%esi - shldl $5,%edi,%edi - addl %ebp,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %edi,%edx - addl 32(%esp),%ecx - xorl %eax,%esi - vpshufb %xmm6,%xmm2,%xmm2 - movl %edx,%ebp - shldl $5,%edx,%edx - vpaddd %xmm7,%xmm1,%xmm5 - addl %esi,%ecx - xorl %eax,%ebp - shrdl $7,%edi,%edi - addl %edx,%ecx - vmovdqa %xmm5,16(%esp) - addl 36(%esp),%ebx - xorl %edi,%ebp - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %ebp,%ebx - xorl %edi,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 40(%esp),%eax - xorl %edx,%esi - movl %ebx,%ebp - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %edx,%ebp - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 44(%esp),%edi - xorl %ecx,%ebp - movl %eax,%esi - shldl $5,%eax,%eax - addl %ebp,%edi - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%edi - addl 48(%esp),%edx - xorl %ebx,%esi - vpshufb %xmm6,%xmm3,%xmm3 - movl %edi,%ebp - shldl $5,%edi,%edi - vpaddd %xmm7,%xmm2,%xmm6 - addl %esi,%edx - xorl %ebx,%ebp - shrdl $7,%eax,%eax - addl %edi,%edx - vmovdqa %xmm6,32(%esp) - addl 52(%esp),%ecx - xorl %eax,%ebp - movl %edx,%esi - shldl $5,%edx,%edx - addl %ebp,%ecx - xorl %eax,%esi - shrdl $7,%edi,%edi - addl %edx,%ecx - addl 56(%esp),%ebx - xorl %edi,%esi - movl %ecx,%ebp - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %edi,%ebp - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 60(%esp),%eax - xorl %edx,%ebp - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %ebp,%eax 
- shrdl $7,%ecx,%ecx - addl %ebx,%eax - movl 192(%esp),%ebp - addl (%ebp),%eax - addl 4(%ebp),%esi - addl 8(%ebp),%ecx - movl %eax,(%ebp) - addl 12(%ebp),%edx - movl %esi,4(%ebp) - addl 16(%ebp),%edi - movl %ecx,%ebx - movl %ecx,8(%ebp) - xorl %edx,%ebx - movl %edx,12(%ebp) - movl %edi,16(%ebp) - movl %esi,%ebp - andl %ebx,%esi - movl %ebp,%ebx - jmp L007loop -.align 4,0x90 -L008done: - addl 16(%esp),%ebx - xorl %edi,%esi - movl %ecx,%ebp - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %edi,%ebp - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 20(%esp),%eax - xorl %edx,%ebp - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %ebp,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 24(%esp),%edi - xorl %ecx,%esi - movl %eax,%ebp - shldl $5,%eax,%eax - addl %esi,%edi - xorl %ecx,%ebp - shrdl $7,%ebx,%ebx - addl %eax,%edi - addl 28(%esp),%edx - xorl %ebx,%ebp - movl %edi,%esi - shldl $5,%edi,%edi - addl %ebp,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %edi,%edx - addl 32(%esp),%ecx - xorl %eax,%esi - movl %edx,%ebp - shldl $5,%edx,%edx - addl %esi,%ecx - xorl %eax,%ebp - shrdl $7,%edi,%edi - addl %edx,%ecx - addl 36(%esp),%ebx - xorl %edi,%ebp - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %ebp,%ebx - xorl %edi,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 40(%esp),%eax - xorl %edx,%esi - movl %ebx,%ebp - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %edx,%ebp - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 44(%esp),%edi - xorl %ecx,%ebp - movl %eax,%esi - shldl $5,%eax,%eax - addl %ebp,%edi - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%edi - addl 48(%esp),%edx - xorl %ebx,%esi - movl %edi,%ebp - shldl $5,%edi,%edi - addl %esi,%edx - xorl %ebx,%ebp - shrdl $7,%eax,%eax - addl %edi,%edx - addl 52(%esp),%ecx - xorl %eax,%ebp - movl %edx,%esi - shldl $5,%edx,%edx - addl %ebp,%ecx - xorl %eax,%esi - shrdl $7,%edi,%edi - addl %edx,%ecx - addl 56(%esp),%ebx - xorl %edi,%esi - movl %ecx,%ebp - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %edi,%ebp - shrdl $7,%edx,%edx 
- addl %ecx,%ebx - addl 60(%esp),%eax - xorl %edx,%ebp - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %ebp,%eax - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vzeroall - movl 192(%esp),%ebp - addl (%ebp),%eax - movl 204(%esp),%esp - addl 4(%ebp),%esi - addl 8(%ebp),%ecx - movl %eax,(%ebp) - addl 12(%ebp),%edx - movl %esi,4(%ebp) - addl 16(%ebp),%edi - movl %ecx,8(%ebp) - movl %edx,12(%ebp) - movl %edi,16(%ebp) - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.align 6,0x90 -LK_XX_XX: -.long 1518500249,1518500249,1518500249,1518500249 -.long 1859775393,1859775393,1859775393,1859775393 -.long 2400959708,2400959708,2400959708,2400959708 -.long 3395469782,3395469782,3395469782,3395469782 -.long 66051,67438087,134810123,202182159 -.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 -.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115 -.byte 102,111,114,109,32,102,111,114,32,120,56,54,44,32,67,82 -.byte 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112 -.byte 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.section __IMPORT,__pointers,non_lazy_symbol_pointers -L_OPENSSL_ia32cap_P$non_lazy_ptr: -.indirect_symbol _OPENSSL_ia32cap_P -.long 0 -#endif diff --git a/third_party/boringssl/apple-x86/crypto/fipsmodule/sha256-586.S b/third_party/boringssl/apple-x86/crypto/fipsmodule/sha256-586.S deleted file mode 100644 index c81cb9af..00000000 --- a/third_party/boringssl/apple-x86/crypto/fipsmodule/sha256-586.S +++ /dev/null @@ -1,5568 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#if defined(__i386__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text -.globl _sha256_block_data_order -.private_extern _sha256_block_data_order -.align 4 -_sha256_block_data_order: -L_sha256_block_data_order_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - movl 20(%esp),%esi - movl 24(%esp),%edi - movl 28(%esp),%eax - movl %esp,%ebx - call L000pic_point -L000pic_point: - popl %ebp - leal L001K256-L000pic_point(%ebp),%ebp - subl $16,%esp - andl $-64,%esp - shll $6,%eax - addl %edi,%eax - movl %esi,(%esp) - movl %edi,4(%esp) - movl %eax,8(%esp) - movl %ebx,12(%esp) - movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L001K256(%ebp),%edx - movl (%edx),%ecx - movl 4(%edx),%ebx - testl $1048576,%ecx - jnz L002loop - movl 8(%edx),%edx - testl $16777216,%ecx - jz L003no_xmm - andl $1073741824,%ecx - andl $268435968,%ebx - orl %ebx,%ecx - andl $1342177280,%ecx - cmpl $1342177280,%ecx - je L004AVX - testl $512,%ebx - jnz L005SSSE3 -L003no_xmm: - subl %edi,%eax - cmpl $256,%eax - jae L006unrolled - jmp L002loop -.align 4,0x90 -L002loop: - movl (%edi),%eax - movl 4(%edi),%ebx - movl 8(%edi),%ecx - bswap %eax - movl 12(%edi),%edx - bswap %ebx - pushl %eax - bswap %ecx - pushl %ebx - bswap %edx - pushl %ecx - pushl %edx - movl 16(%edi),%eax - movl 20(%edi),%ebx - movl 24(%edi),%ecx - bswap %eax - movl 28(%edi),%edx - bswap %ebx - pushl %eax - bswap %ecx - pushl %ebx - bswap %edx - pushl %ecx - pushl %edx - movl 32(%edi),%eax - movl 36(%edi),%ebx - movl 40(%edi),%ecx - bswap %eax - movl 44(%edi),%edx - bswap %ebx - pushl %eax - bswap %ecx - pushl %ebx - bswap %edx - pushl %ecx - pushl %edx - movl 48(%edi),%eax - movl 52(%edi),%ebx - movl 56(%edi),%ecx - bswap %eax - movl 60(%edi),%edx - bswap %ebx - pushl %eax - bswap %ecx - pushl %ebx - bswap %edx - pushl %ecx - pushl %edx - addl $64,%edi - leal -36(%esp),%esp - movl %edi,104(%esp) - movl (%esi),%eax - movl 4(%esi),%ebx - movl 8(%esi),%ecx - movl 12(%esi),%edi - movl %ebx,8(%esp) - xorl %ecx,%ebx - movl %ecx,12(%esp) 
- movl %edi,16(%esp) - movl %ebx,(%esp) - movl 16(%esi),%edx - movl 20(%esi),%ebx - movl 24(%esi),%ecx - movl 28(%esi),%edi - movl %ebx,24(%esp) - movl %ecx,28(%esp) - movl %edi,32(%esp) -.align 4,0x90 -L00700_15: - movl %edx,%ecx - movl 24(%esp),%esi - rorl $14,%ecx - movl 28(%esp),%edi - xorl %edx,%ecx - xorl %edi,%esi - movl 96(%esp),%ebx - rorl $5,%ecx - andl %edx,%esi - movl %edx,20(%esp) - xorl %ecx,%edx - addl 32(%esp),%ebx - xorl %edi,%esi - rorl $6,%edx - movl %eax,%ecx - addl %esi,%ebx - rorl $9,%ecx - addl %edx,%ebx - movl 8(%esp),%edi - xorl %eax,%ecx - movl %eax,4(%esp) - leal -4(%esp),%esp - rorl $11,%ecx - movl (%ebp),%esi - xorl %eax,%ecx - movl 20(%esp),%edx - xorl %edi,%eax - rorl $2,%ecx - addl %esi,%ebx - movl %eax,(%esp) - addl %ebx,%edx - andl 4(%esp),%eax - addl %ecx,%ebx - xorl %edi,%eax - addl $4,%ebp - addl %ebx,%eax - cmpl $3248222580,%esi - jne L00700_15 - movl 156(%esp),%ecx - jmp L00816_63 -.align 4,0x90 -L00816_63: - movl %ecx,%ebx - movl 104(%esp),%esi - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 160(%esp),%ebx - shrl $10,%edi - addl 124(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 24(%esp),%esi - rorl $14,%ecx - addl %edi,%ebx - movl 28(%esp),%edi - xorl %edx,%ecx - xorl %edi,%esi - movl %ebx,96(%esp) - rorl $5,%ecx - andl %edx,%esi - movl %edx,20(%esp) - xorl %ecx,%edx - addl 32(%esp),%ebx - xorl %edi,%esi - rorl $6,%edx - movl %eax,%ecx - addl %esi,%ebx - rorl $9,%ecx - addl %edx,%ebx - movl 8(%esp),%edi - xorl %eax,%ecx - movl %eax,4(%esp) - leal -4(%esp),%esp - rorl $11,%ecx - movl (%ebp),%esi - xorl %eax,%ecx - movl 20(%esp),%edx - xorl %edi,%eax - rorl $2,%ecx - addl %esi,%ebx - movl %eax,(%esp) - addl %ebx,%edx - andl 4(%esp),%eax - addl %ecx,%ebx - xorl %edi,%eax - movl 156(%esp),%ecx - addl $4,%ebp - addl %ebx,%eax - cmpl $3329325298,%esi - jne L00816_63 - movl 356(%esp),%esi - movl 8(%esp),%ebx - movl 
16(%esp),%ecx - addl (%esi),%eax - addl 4(%esi),%ebx - addl 8(%esi),%edi - addl 12(%esi),%ecx - movl %eax,(%esi) - movl %ebx,4(%esi) - movl %edi,8(%esi) - movl %ecx,12(%esi) - movl 24(%esp),%eax - movl 28(%esp),%ebx - movl 32(%esp),%ecx - movl 360(%esp),%edi - addl 16(%esi),%edx - addl 20(%esi),%eax - addl 24(%esi),%ebx - addl 28(%esi),%ecx - movl %edx,16(%esi) - movl %eax,20(%esi) - movl %ebx,24(%esi) - movl %ecx,28(%esi) - leal 356(%esp),%esp - subl $256,%ebp - cmpl 8(%esp),%edi - jb L002loop - movl 12(%esp),%esp - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.align 6,0x90 -L001K256: -.long 1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298 -.long 66051,67438087,134810123,202182159 -.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97 -.byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32 -.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97 -.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 -.byte 62,0 -.align 4,0x90 -L006unrolled: - leal -96(%esp),%esp - movl (%esi),%eax - movl 4(%esi),%ebp - movl 8(%esi),%ecx - movl 12(%esi),%ebx - movl %ebp,4(%esp) - xorl %ecx,%ebp - movl %ecx,8(%esp) - movl %ebx,12(%esp) - movl 16(%esi),%edx - movl 20(%esi),%ebx - movl 24(%esi),%ecx - movl 28(%esi),%esi - movl %ebx,20(%esp) - movl %ecx,24(%esp) - movl %esi,28(%esp) - jmp L009grand_loop -.align 4,0x90 
-L009grand_loop: - movl (%edi),%ebx - movl 4(%edi),%ecx - bswap %ebx - movl 8(%edi),%esi - bswap %ecx - movl %ebx,32(%esp) - bswap %esi - movl %ecx,36(%esp) - movl %esi,40(%esp) - movl 12(%edi),%ebx - movl 16(%edi),%ecx - bswap %ebx - movl 20(%edi),%esi - bswap %ecx - movl %ebx,44(%esp) - bswap %esi - movl %ecx,48(%esp) - movl %esi,52(%esp) - movl 24(%edi),%ebx - movl 28(%edi),%ecx - bswap %ebx - movl 32(%edi),%esi - bswap %ecx - movl %ebx,56(%esp) - bswap %esi - movl %ecx,60(%esp) - movl %esi,64(%esp) - movl 36(%edi),%ebx - movl 40(%edi),%ecx - bswap %ebx - movl 44(%edi),%esi - bswap %ecx - movl %ebx,68(%esp) - bswap %esi - movl %ecx,72(%esp) - movl %esi,76(%esp) - movl 48(%edi),%ebx - movl 52(%edi),%ecx - bswap %ebx - movl 56(%edi),%esi - bswap %ecx - movl %ebx,80(%esp) - bswap %esi - movl %ecx,84(%esp) - movl %esi,88(%esp) - movl 60(%edi),%ebx - addl $64,%edi - bswap %ebx - movl %edi,100(%esp) - movl %ebx,92(%esp) - movl %edx,%ecx - movl 20(%esp),%esi - rorl $14,%edx - movl 24(%esp),%edi - xorl %ecx,%edx - movl 32(%esp),%ebx - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - xorl %ecx,%edx - addl 28(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 4(%esp),%edi - xorl %eax,%ecx - movl %eax,(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 1116352408(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - rorl $2,%ecx - addl %edx,%ebp - addl 12(%esp),%edx - addl %ecx,%ebp - movl %edx,%esi - movl 16(%esp),%ecx - rorl $14,%edx - movl 20(%esp),%edi - xorl %esi,%edx - movl 36(%esp),%ebx - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,12(%esp) - xorl %esi,%edx - addl 24(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl (%esp),%edi - xorl %ebp,%esi - movl %ebp,28(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 1899447441(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - rorl 
$2,%esi - addl %edx,%eax - addl 8(%esp),%edx - addl %esi,%eax - movl %edx,%ecx - movl 12(%esp),%esi - rorl $14,%edx - movl 16(%esp),%edi - xorl %ecx,%edx - movl 40(%esp),%ebx - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - xorl %ecx,%edx - addl 20(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 28(%esp),%edi - xorl %eax,%ecx - movl %eax,24(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 3049323471(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - rorl $2,%ecx - addl %edx,%ebp - addl 4(%esp),%edx - addl %ecx,%ebp - movl %edx,%esi - movl 8(%esp),%ecx - rorl $14,%edx - movl 12(%esp),%edi - xorl %esi,%edx - movl 44(%esp),%ebx - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,4(%esp) - xorl %esi,%edx - addl 16(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 24(%esp),%edi - xorl %ebp,%esi - movl %ebp,20(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 3921009573(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - rorl $2,%esi - addl %edx,%eax - addl (%esp),%edx - addl %esi,%eax - movl %edx,%ecx - movl 4(%esp),%esi - rorl $14,%edx - movl 8(%esp),%edi - xorl %ecx,%edx - movl 48(%esp),%ebx - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,(%esp) - xorl %ecx,%edx - addl 12(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 20(%esp),%edi - xorl %eax,%ecx - movl %eax,16(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 961987163(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - rorl $2,%ecx - addl %edx,%ebp - addl 28(%esp),%edx - addl %ecx,%ebp - movl %edx,%esi - movl (%esp),%ecx - rorl $14,%edx - movl 4(%esp),%edi - xorl %esi,%edx - movl 52(%esp),%ebx - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,28(%esp) - xorl %esi,%edx - addl 8(%esp),%ebx - xorl %ecx,%edi - rorl 
$6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 16(%esp),%edi - xorl %ebp,%esi - movl %ebp,12(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 1508970993(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - rorl $2,%esi - addl %edx,%eax - addl 24(%esp),%edx - addl %esi,%eax - movl %edx,%ecx - movl 28(%esp),%esi - rorl $14,%edx - movl (%esp),%edi - xorl %ecx,%edx - movl 56(%esp),%ebx - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - xorl %ecx,%edx - addl 4(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 12(%esp),%edi - xorl %eax,%ecx - movl %eax,8(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 2453635748(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - rorl $2,%ecx - addl %edx,%ebp - addl 20(%esp),%edx - addl %ecx,%ebp - movl %edx,%esi - movl 24(%esp),%ecx - rorl $14,%edx - movl 28(%esp),%edi - xorl %esi,%edx - movl 60(%esp),%ebx - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,20(%esp) - xorl %esi,%edx - addl (%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 8(%esp),%edi - xorl %ebp,%esi - movl %ebp,4(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 2870763221(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - rorl $2,%esi - addl %edx,%eax - addl 16(%esp),%edx - addl %esi,%eax - movl %edx,%ecx - movl 20(%esp),%esi - rorl $14,%edx - movl 24(%esp),%edi - xorl %ecx,%edx - movl 64(%esp),%ebx - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - xorl %ecx,%edx - addl 28(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 4(%esp),%edi - xorl %eax,%ecx - movl %eax,(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 3624381080(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - rorl $2,%ecx - addl %edx,%ebp - addl 12(%esp),%edx 
- addl %ecx,%ebp - movl %edx,%esi - movl 16(%esp),%ecx - rorl $14,%edx - movl 20(%esp),%edi - xorl %esi,%edx - movl 68(%esp),%ebx - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,12(%esp) - xorl %esi,%edx - addl 24(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl (%esp),%edi - xorl %ebp,%esi - movl %ebp,28(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 310598401(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - rorl $2,%esi - addl %edx,%eax - addl 8(%esp),%edx - addl %esi,%eax - movl %edx,%ecx - movl 12(%esp),%esi - rorl $14,%edx - movl 16(%esp),%edi - xorl %ecx,%edx - movl 72(%esp),%ebx - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - xorl %ecx,%edx - addl 20(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 28(%esp),%edi - xorl %eax,%ecx - movl %eax,24(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 607225278(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - rorl $2,%ecx - addl %edx,%ebp - addl 4(%esp),%edx - addl %ecx,%ebp - movl %edx,%esi - movl 8(%esp),%ecx - rorl $14,%edx - movl 12(%esp),%edi - xorl %esi,%edx - movl 76(%esp),%ebx - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,4(%esp) - xorl %esi,%edx - addl 16(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 24(%esp),%edi - xorl %ebp,%esi - movl %ebp,20(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 1426881987(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - rorl $2,%esi - addl %edx,%eax - addl (%esp),%edx - addl %esi,%eax - movl %edx,%ecx - movl 4(%esp),%esi - rorl $14,%edx - movl 8(%esp),%edi - xorl %ecx,%edx - movl 80(%esp),%ebx - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,(%esp) - xorl %ecx,%edx - addl 12(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl 
$9,%ecx - movl %eax,%esi - movl 20(%esp),%edi - xorl %eax,%ecx - movl %eax,16(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 1925078388(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - rorl $2,%ecx - addl %edx,%ebp - addl 28(%esp),%edx - addl %ecx,%ebp - movl %edx,%esi - movl (%esp),%ecx - rorl $14,%edx - movl 4(%esp),%edi - xorl %esi,%edx - movl 84(%esp),%ebx - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,28(%esp) - xorl %esi,%edx - addl 8(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 16(%esp),%edi - xorl %ebp,%esi - movl %ebp,12(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 2162078206(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - rorl $2,%esi - addl %edx,%eax - addl 24(%esp),%edx - addl %esi,%eax - movl %edx,%ecx - movl 28(%esp),%esi - rorl $14,%edx - movl (%esp),%edi - xorl %ecx,%edx - movl 88(%esp),%ebx - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - xorl %ecx,%edx - addl 4(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 12(%esp),%edi - xorl %eax,%ecx - movl %eax,8(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 2614888103(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - rorl $2,%ecx - addl %edx,%ebp - addl 20(%esp),%edx - addl %ecx,%ebp - movl %edx,%esi - movl 24(%esp),%ecx - rorl $14,%edx - movl 28(%esp),%edi - xorl %esi,%edx - movl 92(%esp),%ebx - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,20(%esp) - xorl %esi,%edx - addl (%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 8(%esp),%edi - xorl %ebp,%esi - movl %ebp,4(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 3248222580(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 36(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl 16(%esp),%edx - addl %esi,%eax - movl 
88(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 32(%esp),%ebx - shrl $10,%edi - addl 68(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 20(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 24(%esp),%edi - xorl %ecx,%edx - movl %ebx,32(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - xorl %ecx,%edx - addl 28(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 4(%esp),%edi - xorl %eax,%ecx - movl %eax,(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 3835390401(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 40(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 12(%esp),%edx - addl %ecx,%ebp - movl 92(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 36(%esp),%ebx - shrl $10,%edi - addl 72(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 16(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 20(%esp),%edi - xorl %esi,%edx - movl %ebx,36(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,12(%esp) - xorl %esi,%edx - addl 24(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl (%esp),%edi - xorl %ebp,%esi - movl %ebp,28(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 4022224774(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 44(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl 8(%esp),%edx - addl %esi,%eax - movl 32(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 40(%esp),%ebx - shrl $10,%edi - addl 76(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 
12(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 16(%esp),%edi - xorl %ecx,%edx - movl %ebx,40(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - xorl %ecx,%edx - addl 20(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 28(%esp),%edi - xorl %eax,%ecx - movl %eax,24(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 264347078(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 48(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 4(%esp),%edx - addl %ecx,%ebp - movl 36(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 44(%esp),%ebx - shrl $10,%edi - addl 80(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 8(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 12(%esp),%edi - xorl %esi,%edx - movl %ebx,44(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,4(%esp) - xorl %esi,%edx - addl 16(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 24(%esp),%edi - xorl %ebp,%esi - movl %ebp,20(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 604807628(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 52(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl (%esp),%edx - addl %esi,%eax - movl 40(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 48(%esp),%ebx - shrl $10,%edi - addl 84(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 4(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 8(%esp),%edi - xorl %ecx,%edx - movl %ebx,48(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,(%esp) - xorl %ecx,%edx - addl 12(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - 
rorl $9,%ecx - movl %eax,%esi - movl 20(%esp),%edi - xorl %eax,%ecx - movl %eax,16(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 770255983(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 56(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 28(%esp),%edx - addl %ecx,%ebp - movl 44(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 52(%esp),%ebx - shrl $10,%edi - addl 88(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl (%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 4(%esp),%edi - xorl %esi,%edx - movl %ebx,52(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,28(%esp) - xorl %esi,%edx - addl 8(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 16(%esp),%edi - xorl %ebp,%esi - movl %ebp,12(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 1249150122(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 60(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl 24(%esp),%edx - addl %esi,%eax - movl 48(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 56(%esp),%ebx - shrl $10,%edi - addl 92(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 28(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl (%esp),%edi - xorl %ecx,%edx - movl %ebx,56(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - xorl %ecx,%edx - addl 4(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 12(%esp),%edi - xorl %eax,%ecx - movl %eax,8(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 1555081692(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 64(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 
20(%esp),%edx - addl %ecx,%ebp - movl 52(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 60(%esp),%ebx - shrl $10,%edi - addl 32(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 24(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 28(%esp),%edi - xorl %esi,%edx - movl %ebx,60(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,20(%esp) - xorl %esi,%edx - addl (%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 8(%esp),%edi - xorl %ebp,%esi - movl %ebp,4(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 1996064986(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 68(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl 16(%esp),%edx - addl %esi,%eax - movl 56(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 64(%esp),%ebx - shrl $10,%edi - addl 36(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 20(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 24(%esp),%edi - xorl %ecx,%edx - movl %ebx,64(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - xorl %ecx,%edx - addl 28(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 4(%esp),%edi - xorl %eax,%ecx - movl %eax,(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 2554220882(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 72(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 12(%esp),%edx - addl %ecx,%ebp - movl 60(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 68(%esp),%ebx - shrl $10,%edi - addl 40(%esp),%ebx - movl 
%edx,%esi - xorl %ecx,%edi - movl 16(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 20(%esp),%edi - xorl %esi,%edx - movl %ebx,68(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,12(%esp) - xorl %esi,%edx - addl 24(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl (%esp),%edi - xorl %ebp,%esi - movl %ebp,28(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 2821834349(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 76(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl 8(%esp),%edx - addl %esi,%eax - movl 64(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 72(%esp),%ebx - shrl $10,%edi - addl 44(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 12(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 16(%esp),%edi - xorl %ecx,%edx - movl %ebx,72(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - xorl %ecx,%edx - addl 20(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 28(%esp),%edi - xorl %eax,%ecx - movl %eax,24(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 2952996808(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 80(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 4(%esp),%edx - addl %ecx,%ebp - movl 68(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 76(%esp),%ebx - shrl $10,%edi - addl 48(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 8(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 12(%esp),%edi - xorl %esi,%edx - movl %ebx,76(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,4(%esp) - xorl %esi,%edx - addl 16(%esp),%ebx - xorl %ecx,%edi - rorl 
$6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 24(%esp),%edi - xorl %ebp,%esi - movl %ebp,20(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 3210313671(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 84(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl (%esp),%edx - addl %esi,%eax - movl 72(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 80(%esp),%ebx - shrl $10,%edi - addl 52(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 4(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 8(%esp),%edi - xorl %ecx,%edx - movl %ebx,80(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,(%esp) - xorl %ecx,%edx - addl 12(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 20(%esp),%edi - xorl %eax,%ecx - movl %eax,16(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 3336571891(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 88(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 28(%esp),%edx - addl %ecx,%ebp - movl 76(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 84(%esp),%ebx - shrl $10,%edi - addl 56(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl (%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 4(%esp),%edi - xorl %esi,%edx - movl %ebx,84(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,28(%esp) - xorl %esi,%edx - addl 8(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 16(%esp),%edi - xorl %ebp,%esi - movl %ebp,12(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 3584528711(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 92(%esp),%ecx - rorl 
$2,%esi - addl %edx,%eax - addl 24(%esp),%edx - addl %esi,%eax - movl 80(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 88(%esp),%ebx - shrl $10,%edi - addl 60(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 28(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl (%esp),%edi - xorl %ecx,%edx - movl %ebx,88(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - xorl %ecx,%edx - addl 4(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 12(%esp),%edi - xorl %eax,%ecx - movl %eax,8(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 113926993(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 32(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 20(%esp),%edx - addl %ecx,%ebp - movl 84(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 92(%esp),%ebx - shrl $10,%edi - addl 64(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 24(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 28(%esp),%edi - xorl %esi,%edx - movl %ebx,92(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,20(%esp) - xorl %esi,%edx - addl (%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 8(%esp),%edi - xorl %ebp,%esi - movl %ebp,4(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 338241895(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 36(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl 16(%esp),%edx - addl %esi,%eax - movl 88(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 32(%esp),%ebx - shrl 
$10,%edi - addl 68(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 20(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 24(%esp),%edi - xorl %ecx,%edx - movl %ebx,32(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - xorl %ecx,%edx - addl 28(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 4(%esp),%edi - xorl %eax,%ecx - movl %eax,(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 666307205(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 40(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 12(%esp),%edx - addl %ecx,%ebp - movl 92(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 36(%esp),%ebx - shrl $10,%edi - addl 72(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 16(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 20(%esp),%edi - xorl %esi,%edx - movl %ebx,36(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,12(%esp) - xorl %esi,%edx - addl 24(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl (%esp),%edi - xorl %ebp,%esi - movl %ebp,28(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 773529912(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 44(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl 8(%esp),%edx - addl %esi,%eax - movl 32(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 40(%esp),%ebx - shrl $10,%edi - addl 76(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 12(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 16(%esp),%edi - xorl %ecx,%edx - movl %ebx,40(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - xorl %ecx,%edx - addl 
20(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 28(%esp),%edi - xorl %eax,%ecx - movl %eax,24(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 1294757372(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 48(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 4(%esp),%edx - addl %ecx,%ebp - movl 36(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 44(%esp),%ebx - shrl $10,%edi - addl 80(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 8(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 12(%esp),%edi - xorl %esi,%edx - movl %ebx,44(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,4(%esp) - xorl %esi,%edx - addl 16(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 24(%esp),%edi - xorl %ebp,%esi - movl %ebp,20(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 1396182291(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 52(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl (%esp),%edx - addl %esi,%eax - movl 40(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 48(%esp),%ebx - shrl $10,%edi - addl 84(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 4(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 8(%esp),%edi - xorl %ecx,%edx - movl %ebx,48(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,(%esp) - xorl %ecx,%edx - addl 12(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 20(%esp),%edi - xorl %eax,%ecx - movl %eax,16(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 1695183700(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl 
%edi,%ebp - movl 56(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 28(%esp),%edx - addl %ecx,%ebp - movl 44(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 52(%esp),%ebx - shrl $10,%edi - addl 88(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl (%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 4(%esp),%edi - xorl %esi,%edx - movl %ebx,52(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,28(%esp) - xorl %esi,%edx - addl 8(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 16(%esp),%edi - xorl %ebp,%esi - movl %ebp,12(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 1986661051(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 60(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl 24(%esp),%edx - addl %esi,%eax - movl 48(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 56(%esp),%ebx - shrl $10,%edi - addl 92(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 28(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl (%esp),%edi - xorl %ecx,%edx - movl %ebx,56(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - xorl %ecx,%edx - addl 4(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 12(%esp),%edi - xorl %eax,%ecx - movl %eax,8(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 2177026350(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 64(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 20(%esp),%edx - addl %ecx,%ebp - movl 52(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl 
$17,%ecx - addl 60(%esp),%ebx - shrl $10,%edi - addl 32(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 24(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 28(%esp),%edi - xorl %esi,%edx - movl %ebx,60(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,20(%esp) - xorl %esi,%edx - addl (%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 8(%esp),%edi - xorl %ebp,%esi - movl %ebp,4(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 2456956037(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 68(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl 16(%esp),%edx - addl %esi,%eax - movl 56(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 64(%esp),%ebx - shrl $10,%edi - addl 36(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 20(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 24(%esp),%edi - xorl %ecx,%edx - movl %ebx,64(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - xorl %ecx,%edx - addl 28(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 4(%esp),%edi - xorl %eax,%ecx - movl %eax,(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 2730485921(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 72(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 12(%esp),%edx - addl %ecx,%ebp - movl 60(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 68(%esp),%ebx - shrl $10,%edi - addl 40(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 16(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 20(%esp),%edi - xorl %esi,%edx - movl %ebx,68(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl 
%esi,12(%esp) - xorl %esi,%edx - addl 24(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl (%esp),%edi - xorl %ebp,%esi - movl %ebp,28(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 2820302411(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 76(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl 8(%esp),%edx - addl %esi,%eax - movl 64(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 72(%esp),%ebx - shrl $10,%edi - addl 44(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 12(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 16(%esp),%edi - xorl %ecx,%edx - movl %ebx,72(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - xorl %ecx,%edx - addl 20(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 28(%esp),%edi - xorl %eax,%ecx - movl %eax,24(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 3259730800(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 80(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 4(%esp),%edx - addl %ecx,%ebp - movl 68(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 76(%esp),%ebx - shrl $10,%edi - addl 48(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 8(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 12(%esp),%edi - xorl %esi,%edx - movl %ebx,76(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,4(%esp) - xorl %esi,%edx - addl 16(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 24(%esp),%edi - xorl %ebp,%esi - movl %ebp,20(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 
3345764771(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 84(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl (%esp),%edx - addl %esi,%eax - movl 72(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 80(%esp),%ebx - shrl $10,%edi - addl 52(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 4(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 8(%esp),%edi - xorl %ecx,%edx - movl %ebx,80(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,(%esp) - xorl %ecx,%edx - addl 12(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 20(%esp),%edi - xorl %eax,%ecx - movl %eax,16(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 3516065817(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 88(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 28(%esp),%edx - addl %ecx,%ebp - movl 76(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 84(%esp),%ebx - shrl $10,%edi - addl 56(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl (%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 4(%esp),%edi - xorl %esi,%edx - movl %ebx,84(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,28(%esp) - xorl %esi,%edx - addl 8(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 16(%esp),%edi - xorl %ebp,%esi - movl %ebp,12(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 3600352804(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 92(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl 24(%esp),%edx - addl %esi,%eax - movl 80(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl 
$7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 88(%esp),%ebx - shrl $10,%edi - addl 60(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 28(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl (%esp),%edi - xorl %ecx,%edx - movl %ebx,88(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - xorl %ecx,%edx - addl 4(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 12(%esp),%edi - xorl %eax,%ecx - movl %eax,8(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 4094571909(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 32(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 20(%esp),%edx - addl %ecx,%ebp - movl 84(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 92(%esp),%ebx - shrl $10,%edi - addl 64(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 24(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 28(%esp),%edi - xorl %esi,%edx - movl %ebx,92(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,20(%esp) - xorl %esi,%edx - addl (%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 8(%esp),%edi - xorl %ebp,%esi - movl %ebp,4(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 275423344(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 36(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl 16(%esp),%edx - addl %esi,%eax - movl 88(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 32(%esp),%ebx - shrl $10,%edi - addl 68(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 20(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 24(%esp),%edi - xorl %ecx,%edx - movl %ebx,32(%esp) - xorl %edi,%esi 
- rorl $5,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - xorl %ecx,%edx - addl 28(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 4(%esp),%edi - xorl %eax,%ecx - movl %eax,(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 430227734(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 40(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 12(%esp),%edx - addl %ecx,%ebp - movl 92(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 36(%esp),%ebx - shrl $10,%edi - addl 72(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 16(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 20(%esp),%edi - xorl %esi,%edx - movl %ebx,36(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,12(%esp) - xorl %esi,%edx - addl 24(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl (%esp),%edi - xorl %ebp,%esi - movl %ebp,28(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 506948616(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 44(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl 8(%esp),%edx - addl %esi,%eax - movl 32(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 40(%esp),%ebx - shrl $10,%edi - addl 76(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 12(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 16(%esp),%edi - xorl %ecx,%edx - movl %ebx,40(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - xorl %ecx,%edx - addl 20(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 28(%esp),%edi - xorl %eax,%ecx - movl %eax,24(%esp) - xorl %edi,%eax - rorl $11,%ecx 
- andl %eax,%ebp - leal 659060556(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 48(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 4(%esp),%edx - addl %ecx,%ebp - movl 36(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 44(%esp),%ebx - shrl $10,%edi - addl 80(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 8(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 12(%esp),%edi - xorl %esi,%edx - movl %ebx,44(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,4(%esp) - xorl %esi,%edx - addl 16(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 24(%esp),%edi - xorl %ebp,%esi - movl %ebp,20(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 883997877(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 52(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl (%esp),%edx - addl %esi,%eax - movl 40(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 48(%esp),%ebx - shrl $10,%edi - addl 84(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 4(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 8(%esp),%edi - xorl %ecx,%edx - movl %ebx,48(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,(%esp) - xorl %ecx,%edx - addl 12(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 20(%esp),%edi - xorl %eax,%ecx - movl %eax,16(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 958139571(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 56(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 28(%esp),%edx - addl %ecx,%ebp - movl 44(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - 
shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 52(%esp),%ebx - shrl $10,%edi - addl 88(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl (%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 4(%esp),%edi - xorl %esi,%edx - movl %ebx,52(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,28(%esp) - xorl %esi,%edx - addl 8(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 16(%esp),%edi - xorl %ebp,%esi - movl %ebp,12(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 1322822218(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 60(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl 24(%esp),%edx - addl %esi,%eax - movl 48(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 56(%esp),%ebx - shrl $10,%edi - addl 92(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 28(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl (%esp),%edi - xorl %ecx,%edx - movl %ebx,56(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - xorl %ecx,%edx - addl 4(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 12(%esp),%edi - xorl %eax,%ecx - movl %eax,8(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 1537002063(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 64(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 20(%esp),%edx - addl %ecx,%ebp - movl 52(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 60(%esp),%ebx - shrl $10,%edi - addl 32(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 24(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 28(%esp),%edi - xorl %esi,%edx - movl 
%ebx,60(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,20(%esp) - xorl %esi,%edx - addl (%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 8(%esp),%edi - xorl %ebp,%esi - movl %ebp,4(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 1747873779(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 68(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl 16(%esp),%edx - addl %esi,%eax - movl 56(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 64(%esp),%ebx - shrl $10,%edi - addl 36(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 20(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 24(%esp),%edi - xorl %ecx,%edx - movl %ebx,64(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - xorl %ecx,%edx - addl 28(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 4(%esp),%edi - xorl %eax,%ecx - movl %eax,(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 1955562222(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 72(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 12(%esp),%edx - addl %ecx,%ebp - movl 60(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 68(%esp),%ebx - shrl $10,%edi - addl 40(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 16(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 20(%esp),%edi - xorl %esi,%edx - movl %ebx,68(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,12(%esp) - xorl %esi,%edx - addl 24(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl (%esp),%edi - xorl %ebp,%esi - movl %ebp,28(%esp) - 
xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 2024104815(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 76(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl 8(%esp),%edx - addl %esi,%eax - movl 64(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 72(%esp),%ebx - shrl $10,%edi - addl 44(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 12(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 16(%esp),%edi - xorl %ecx,%edx - movl %ebx,72(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - xorl %ecx,%edx - addl 20(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 28(%esp),%edi - xorl %eax,%ecx - movl %eax,24(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 2227730452(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 80(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 4(%esp),%edx - addl %ecx,%ebp - movl 68(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 76(%esp),%ebx - shrl $10,%edi - addl 48(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 8(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 12(%esp),%edi - xorl %esi,%edx - movl %ebx,76(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,4(%esp) - xorl %esi,%edx - addl 16(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 24(%esp),%edi - xorl %ebp,%esi - movl %ebp,20(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 2361852424(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 84(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl (%esp),%edx - addl %esi,%eax - movl 72(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl 
%esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 80(%esp),%ebx - shrl $10,%edi - addl 52(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 4(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 8(%esp),%edi - xorl %ecx,%edx - movl %ebx,80(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,(%esp) - xorl %ecx,%edx - addl 12(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 20(%esp),%edi - xorl %eax,%ecx - movl %eax,16(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 2428436474(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 88(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 28(%esp),%edx - addl %ecx,%ebp - movl 76(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 84(%esp),%ebx - shrl $10,%edi - addl 56(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl (%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 4(%esp),%edi - xorl %esi,%edx - movl %ebx,84(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,28(%esp) - xorl %esi,%edx - addl 8(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 16(%esp),%edi - xorl %ebp,%esi - movl %ebp,12(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 2756734187(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 92(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl 24(%esp),%edx - addl %esi,%eax - movl 80(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 88(%esp),%ebx - shrl $10,%edi - addl 60(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 28(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 
(%esp),%edi - xorl %ecx,%edx - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - xorl %ecx,%edx - addl 4(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 12(%esp),%edi - xorl %eax,%ecx - movl %eax,8(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 3204031479(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 32(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 20(%esp),%edx - addl %ecx,%ebp - movl 84(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 92(%esp),%ebx - shrl $10,%edi - addl 64(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 24(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 28(%esp),%edi - xorl %esi,%edx - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,20(%esp) - xorl %esi,%edx - addl (%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 8(%esp),%edi - xorl %ebp,%esi - movl %ebp,4(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 3329325298(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - rorl $2,%esi - addl %edx,%eax - addl 16(%esp),%edx - addl %esi,%eax - movl 96(%esp),%esi - xorl %edi,%ebp - movl 12(%esp),%ecx - addl (%esi),%eax - addl 4(%esi),%ebp - addl 8(%esi),%edi - addl 12(%esi),%ecx - movl %eax,(%esi) - movl %ebp,4(%esi) - movl %edi,8(%esi) - movl %ecx,12(%esi) - movl %ebp,4(%esp) - xorl %edi,%ebp - movl %edi,8(%esp) - movl %ecx,12(%esp) - movl 20(%esp),%edi - movl 24(%esp),%ebx - movl 28(%esp),%ecx - addl 16(%esi),%edx - addl 20(%esi),%edi - addl 24(%esi),%ebx - addl 28(%esi),%ecx - movl %edx,16(%esi) - movl %edi,20(%esi) - movl %ebx,24(%esi) - movl %ecx,28(%esi) - movl %edi,20(%esp) - movl 100(%esp),%edi - movl %ebx,24(%esp) - movl %ecx,28(%esp) - cmpl 104(%esp),%edi - jb L009grand_loop - movl 108(%esp),%esp - 
popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.align 5,0x90 -L005SSSE3: - leal -96(%esp),%esp - movl (%esi),%eax - movl 4(%esi),%ebx - movl 8(%esi),%ecx - movl 12(%esi),%edi - movl %ebx,4(%esp) - xorl %ecx,%ebx - movl %ecx,8(%esp) - movl %edi,12(%esp) - movl 16(%esi),%edx - movl 20(%esi),%edi - movl 24(%esi),%ecx - movl 28(%esi),%esi - movl %edi,20(%esp) - movl 100(%esp),%edi - movl %ecx,24(%esp) - movl %esi,28(%esp) - movdqa 256(%ebp),%xmm7 - jmp L010grand_ssse3 -.align 4,0x90 -L010grand_ssse3: - movdqu (%edi),%xmm0 - movdqu 16(%edi),%xmm1 - movdqu 32(%edi),%xmm2 - movdqu 48(%edi),%xmm3 - addl $64,%edi -.byte 102,15,56,0,199 - movl %edi,100(%esp) -.byte 102,15,56,0,207 - movdqa (%ebp),%xmm4 -.byte 102,15,56,0,215 - movdqa 16(%ebp),%xmm5 - paddd %xmm0,%xmm4 -.byte 102,15,56,0,223 - movdqa 32(%ebp),%xmm6 - paddd %xmm1,%xmm5 - movdqa 48(%ebp),%xmm7 - movdqa %xmm4,32(%esp) - paddd %xmm2,%xmm6 - movdqa %xmm5,48(%esp) - paddd %xmm3,%xmm7 - movdqa %xmm6,64(%esp) - movdqa %xmm7,80(%esp) - jmp L011ssse3_00_47 -.align 4,0x90 -L011ssse3_00_47: - addl $64,%ebp - movl %edx,%ecx - movdqa %xmm1,%xmm4 - rorl $14,%edx - movl 20(%esp),%esi - movdqa %xmm3,%xmm7 - xorl %ecx,%edx - movl 24(%esp),%edi -.byte 102,15,58,15,224,4 - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi -.byte 102,15,58,15,250,4 - movl %ecx,16(%esp) - xorl %ecx,%edx - xorl %esi,%edi - movdqa %xmm4,%xmm5 - rorl $6,%edx - movl %eax,%ecx - movdqa %xmm4,%xmm6 - addl %edi,%edx - movl 4(%esp),%edi - psrld $3,%xmm4 - movl %eax,%esi - rorl $9,%ecx - paddd %xmm7,%xmm0 - movl %eax,(%esp) - xorl %eax,%ecx - psrld $7,%xmm6 - xorl %edi,%eax - addl 28(%esp),%edx - rorl $11,%ecx - andl %eax,%ebx - pshufd $250,%xmm3,%xmm7 - xorl %esi,%ecx - addl 32(%esp),%edx - pslld $14,%xmm5 - xorl %edi,%ebx - rorl $2,%ecx - pxor %xmm6,%xmm4 - addl %edx,%ebx - addl 12(%esp),%edx - psrld $11,%xmm6 - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - pxor %xmm5,%xmm4 - movl 16(%esp),%esi - xorl %ecx,%edx - pslld $11,%xmm5 - movl 
20(%esp),%edi - xorl %edi,%esi - rorl $5,%edx - pxor %xmm6,%xmm4 - andl %ecx,%esi - movl %ecx,12(%esp) - movdqa %xmm7,%xmm6 - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - pxor %xmm5,%xmm4 - movl %ebx,%ecx - addl %edi,%edx - psrld $10,%xmm7 - movl (%esp),%edi - movl %ebx,%esi - rorl $9,%ecx - paddd %xmm4,%xmm0 - movl %ebx,28(%esp) - xorl %ebx,%ecx - psrlq $17,%xmm6 - xorl %edi,%ebx - addl 24(%esp),%edx - rorl $11,%ecx - pxor %xmm6,%xmm7 - andl %ebx,%eax - xorl %esi,%ecx - psrlq $2,%xmm6 - addl 36(%esp),%edx - xorl %edi,%eax - rorl $2,%ecx - pxor %xmm6,%xmm7 - addl %edx,%eax - addl 8(%esp),%edx - pshufd $128,%xmm7,%xmm7 - addl %ecx,%eax - movl %edx,%ecx - rorl $14,%edx - movl 12(%esp),%esi - xorl %ecx,%edx - movl 16(%esp),%edi - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - psrldq $8,%xmm7 - movl %ecx,8(%esp) - xorl %ecx,%edx - xorl %esi,%edi - paddd %xmm7,%xmm0 - rorl $6,%edx - movl %eax,%ecx - addl %edi,%edx - movl 28(%esp),%edi - movl %eax,%esi - rorl $9,%ecx - movl %eax,24(%esp) - pshufd $80,%xmm0,%xmm7 - xorl %eax,%ecx - xorl %edi,%eax - addl 20(%esp),%edx - movdqa %xmm7,%xmm6 - rorl $11,%ecx - psrld $10,%xmm7 - andl %eax,%ebx - psrlq $17,%xmm6 - xorl %esi,%ecx - addl 40(%esp),%edx - xorl %edi,%ebx - rorl $2,%ecx - pxor %xmm6,%xmm7 - addl %edx,%ebx - addl 4(%esp),%edx - psrlq $2,%xmm6 - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - pxor %xmm6,%xmm7 - movl 8(%esp),%esi - xorl %ecx,%edx - movl 12(%esp),%edi - pshufd $8,%xmm7,%xmm7 - xorl %edi,%esi - rorl $5,%edx - movdqa (%ebp),%xmm6 - andl %ecx,%esi - movl %ecx,4(%esp) - pslldq $8,%xmm7 - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 24(%esp),%edi - movl %ebx,%esi - rorl $9,%ecx - paddd %xmm7,%xmm0 - movl %ebx,20(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 16(%esp),%edx - paddd %xmm0,%xmm6 - rorl $11,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 44(%esp),%edx - xorl %edi,%eax - rorl $2,%ecx - addl %edx,%eax - addl (%esp),%edx - addl %ecx,%eax - movdqa 
%xmm6,32(%esp) - movl %edx,%ecx - movdqa %xmm2,%xmm4 - rorl $14,%edx - movl 4(%esp),%esi - movdqa %xmm0,%xmm7 - xorl %ecx,%edx - movl 8(%esp),%edi -.byte 102,15,58,15,225,4 - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi -.byte 102,15,58,15,251,4 - movl %ecx,(%esp) - xorl %ecx,%edx - xorl %esi,%edi - movdqa %xmm4,%xmm5 - rorl $6,%edx - movl %eax,%ecx - movdqa %xmm4,%xmm6 - addl %edi,%edx - movl 20(%esp),%edi - psrld $3,%xmm4 - movl %eax,%esi - rorl $9,%ecx - paddd %xmm7,%xmm1 - movl %eax,16(%esp) - xorl %eax,%ecx - psrld $7,%xmm6 - xorl %edi,%eax - addl 12(%esp),%edx - rorl $11,%ecx - andl %eax,%ebx - pshufd $250,%xmm0,%xmm7 - xorl %esi,%ecx - addl 48(%esp),%edx - pslld $14,%xmm5 - xorl %edi,%ebx - rorl $2,%ecx - pxor %xmm6,%xmm4 - addl %edx,%ebx - addl 28(%esp),%edx - psrld $11,%xmm6 - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - pxor %xmm5,%xmm4 - movl (%esp),%esi - xorl %ecx,%edx - pslld $11,%xmm5 - movl 4(%esp),%edi - xorl %edi,%esi - rorl $5,%edx - pxor %xmm6,%xmm4 - andl %ecx,%esi - movl %ecx,28(%esp) - movdqa %xmm7,%xmm6 - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - pxor %xmm5,%xmm4 - movl %ebx,%ecx - addl %edi,%edx - psrld $10,%xmm7 - movl 16(%esp),%edi - movl %ebx,%esi - rorl $9,%ecx - paddd %xmm4,%xmm1 - movl %ebx,12(%esp) - xorl %ebx,%ecx - psrlq $17,%xmm6 - xorl %edi,%ebx - addl 8(%esp),%edx - rorl $11,%ecx - pxor %xmm6,%xmm7 - andl %ebx,%eax - xorl %esi,%ecx - psrlq $2,%xmm6 - addl 52(%esp),%edx - xorl %edi,%eax - rorl $2,%ecx - pxor %xmm6,%xmm7 - addl %edx,%eax - addl 24(%esp),%edx - pshufd $128,%xmm7,%xmm7 - addl %ecx,%eax - movl %edx,%ecx - rorl $14,%edx - movl 28(%esp),%esi - xorl %ecx,%edx - movl (%esp),%edi - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - psrldq $8,%xmm7 - movl %ecx,24(%esp) - xorl %ecx,%edx - xorl %esi,%edi - paddd %xmm7,%xmm1 - rorl $6,%edx - movl %eax,%ecx - addl %edi,%edx - movl 12(%esp),%edi - movl %eax,%esi - rorl $9,%ecx - movl %eax,8(%esp) - pshufd $80,%xmm1,%xmm7 - xorl %eax,%ecx - xorl %edi,%eax - addl 
4(%esp),%edx - movdqa %xmm7,%xmm6 - rorl $11,%ecx - psrld $10,%xmm7 - andl %eax,%ebx - psrlq $17,%xmm6 - xorl %esi,%ecx - addl 56(%esp),%edx - xorl %edi,%ebx - rorl $2,%ecx - pxor %xmm6,%xmm7 - addl %edx,%ebx - addl 20(%esp),%edx - psrlq $2,%xmm6 - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - pxor %xmm6,%xmm7 - movl 24(%esp),%esi - xorl %ecx,%edx - movl 28(%esp),%edi - pshufd $8,%xmm7,%xmm7 - xorl %edi,%esi - rorl $5,%edx - movdqa 16(%ebp),%xmm6 - andl %ecx,%esi - movl %ecx,20(%esp) - pslldq $8,%xmm7 - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 8(%esp),%edi - movl %ebx,%esi - rorl $9,%ecx - paddd %xmm7,%xmm1 - movl %ebx,4(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl (%esp),%edx - paddd %xmm1,%xmm6 - rorl $11,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 60(%esp),%edx - xorl %edi,%eax - rorl $2,%ecx - addl %edx,%eax - addl 16(%esp),%edx - addl %ecx,%eax - movdqa %xmm6,48(%esp) - movl %edx,%ecx - movdqa %xmm3,%xmm4 - rorl $14,%edx - movl 20(%esp),%esi - movdqa %xmm1,%xmm7 - xorl %ecx,%edx - movl 24(%esp),%edi -.byte 102,15,58,15,226,4 - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi -.byte 102,15,58,15,248,4 - movl %ecx,16(%esp) - xorl %ecx,%edx - xorl %esi,%edi - movdqa %xmm4,%xmm5 - rorl $6,%edx - movl %eax,%ecx - movdqa %xmm4,%xmm6 - addl %edi,%edx - movl 4(%esp),%edi - psrld $3,%xmm4 - movl %eax,%esi - rorl $9,%ecx - paddd %xmm7,%xmm2 - movl %eax,(%esp) - xorl %eax,%ecx - psrld $7,%xmm6 - xorl %edi,%eax - addl 28(%esp),%edx - rorl $11,%ecx - andl %eax,%ebx - pshufd $250,%xmm1,%xmm7 - xorl %esi,%ecx - addl 64(%esp),%edx - pslld $14,%xmm5 - xorl %edi,%ebx - rorl $2,%ecx - pxor %xmm6,%xmm4 - addl %edx,%ebx - addl 12(%esp),%edx - psrld $11,%xmm6 - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - pxor %xmm5,%xmm4 - movl 16(%esp),%esi - xorl %ecx,%edx - pslld $11,%xmm5 - movl 20(%esp),%edi - xorl %edi,%esi - rorl $5,%edx - pxor %xmm6,%xmm4 - andl %ecx,%esi - movl %ecx,12(%esp) - movdqa %xmm7,%xmm6 - xorl %ecx,%edx - 
xorl %esi,%edi - rorl $6,%edx - pxor %xmm5,%xmm4 - movl %ebx,%ecx - addl %edi,%edx - psrld $10,%xmm7 - movl (%esp),%edi - movl %ebx,%esi - rorl $9,%ecx - paddd %xmm4,%xmm2 - movl %ebx,28(%esp) - xorl %ebx,%ecx - psrlq $17,%xmm6 - xorl %edi,%ebx - addl 24(%esp),%edx - rorl $11,%ecx - pxor %xmm6,%xmm7 - andl %ebx,%eax - xorl %esi,%ecx - psrlq $2,%xmm6 - addl 68(%esp),%edx - xorl %edi,%eax - rorl $2,%ecx - pxor %xmm6,%xmm7 - addl %edx,%eax - addl 8(%esp),%edx - pshufd $128,%xmm7,%xmm7 - addl %ecx,%eax - movl %edx,%ecx - rorl $14,%edx - movl 12(%esp),%esi - xorl %ecx,%edx - movl 16(%esp),%edi - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - psrldq $8,%xmm7 - movl %ecx,8(%esp) - xorl %ecx,%edx - xorl %esi,%edi - paddd %xmm7,%xmm2 - rorl $6,%edx - movl %eax,%ecx - addl %edi,%edx - movl 28(%esp),%edi - movl %eax,%esi - rorl $9,%ecx - movl %eax,24(%esp) - pshufd $80,%xmm2,%xmm7 - xorl %eax,%ecx - xorl %edi,%eax - addl 20(%esp),%edx - movdqa %xmm7,%xmm6 - rorl $11,%ecx - psrld $10,%xmm7 - andl %eax,%ebx - psrlq $17,%xmm6 - xorl %esi,%ecx - addl 72(%esp),%edx - xorl %edi,%ebx - rorl $2,%ecx - pxor %xmm6,%xmm7 - addl %edx,%ebx - addl 4(%esp),%edx - psrlq $2,%xmm6 - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - pxor %xmm6,%xmm7 - movl 8(%esp),%esi - xorl %ecx,%edx - movl 12(%esp),%edi - pshufd $8,%xmm7,%xmm7 - xorl %edi,%esi - rorl $5,%edx - movdqa 32(%ebp),%xmm6 - andl %ecx,%esi - movl %ecx,4(%esp) - pslldq $8,%xmm7 - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 24(%esp),%edi - movl %ebx,%esi - rorl $9,%ecx - paddd %xmm7,%xmm2 - movl %ebx,20(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 16(%esp),%edx - paddd %xmm2,%xmm6 - rorl $11,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 76(%esp),%edx - xorl %edi,%eax - rorl $2,%ecx - addl %edx,%eax - addl (%esp),%edx - addl %ecx,%eax - movdqa %xmm6,64(%esp) - movl %edx,%ecx - movdqa %xmm0,%xmm4 - rorl $14,%edx - movl 4(%esp),%esi - movdqa %xmm2,%xmm7 - xorl %ecx,%edx - movl 8(%esp),%edi 
-.byte 102,15,58,15,227,4 - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi -.byte 102,15,58,15,249,4 - movl %ecx,(%esp) - xorl %ecx,%edx - xorl %esi,%edi - movdqa %xmm4,%xmm5 - rorl $6,%edx - movl %eax,%ecx - movdqa %xmm4,%xmm6 - addl %edi,%edx - movl 20(%esp),%edi - psrld $3,%xmm4 - movl %eax,%esi - rorl $9,%ecx - paddd %xmm7,%xmm3 - movl %eax,16(%esp) - xorl %eax,%ecx - psrld $7,%xmm6 - xorl %edi,%eax - addl 12(%esp),%edx - rorl $11,%ecx - andl %eax,%ebx - pshufd $250,%xmm2,%xmm7 - xorl %esi,%ecx - addl 80(%esp),%edx - pslld $14,%xmm5 - xorl %edi,%ebx - rorl $2,%ecx - pxor %xmm6,%xmm4 - addl %edx,%ebx - addl 28(%esp),%edx - psrld $11,%xmm6 - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - pxor %xmm5,%xmm4 - movl (%esp),%esi - xorl %ecx,%edx - pslld $11,%xmm5 - movl 4(%esp),%edi - xorl %edi,%esi - rorl $5,%edx - pxor %xmm6,%xmm4 - andl %ecx,%esi - movl %ecx,28(%esp) - movdqa %xmm7,%xmm6 - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - pxor %xmm5,%xmm4 - movl %ebx,%ecx - addl %edi,%edx - psrld $10,%xmm7 - movl 16(%esp),%edi - movl %ebx,%esi - rorl $9,%ecx - paddd %xmm4,%xmm3 - movl %ebx,12(%esp) - xorl %ebx,%ecx - psrlq $17,%xmm6 - xorl %edi,%ebx - addl 8(%esp),%edx - rorl $11,%ecx - pxor %xmm6,%xmm7 - andl %ebx,%eax - xorl %esi,%ecx - psrlq $2,%xmm6 - addl 84(%esp),%edx - xorl %edi,%eax - rorl $2,%ecx - pxor %xmm6,%xmm7 - addl %edx,%eax - addl 24(%esp),%edx - pshufd $128,%xmm7,%xmm7 - addl %ecx,%eax - movl %edx,%ecx - rorl $14,%edx - movl 28(%esp),%esi - xorl %ecx,%edx - movl (%esp),%edi - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - psrldq $8,%xmm7 - movl %ecx,24(%esp) - xorl %ecx,%edx - xorl %esi,%edi - paddd %xmm7,%xmm3 - rorl $6,%edx - movl %eax,%ecx - addl %edi,%edx - movl 12(%esp),%edi - movl %eax,%esi - rorl $9,%ecx - movl %eax,8(%esp) - pshufd $80,%xmm3,%xmm7 - xorl %eax,%ecx - xorl %edi,%eax - addl 4(%esp),%edx - movdqa %xmm7,%xmm6 - rorl $11,%ecx - psrld $10,%xmm7 - andl %eax,%ebx - psrlq $17,%xmm6 - xorl %esi,%ecx - addl 88(%esp),%edx - xorl 
%edi,%ebx - rorl $2,%ecx - pxor %xmm6,%xmm7 - addl %edx,%ebx - addl 20(%esp),%edx - psrlq $2,%xmm6 - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - pxor %xmm6,%xmm7 - movl 24(%esp),%esi - xorl %ecx,%edx - movl 28(%esp),%edi - pshufd $8,%xmm7,%xmm7 - xorl %edi,%esi - rorl $5,%edx - movdqa 48(%ebp),%xmm6 - andl %ecx,%esi - movl %ecx,20(%esp) - pslldq $8,%xmm7 - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 8(%esp),%edi - movl %ebx,%esi - rorl $9,%ecx - paddd %xmm7,%xmm3 - movl %ebx,4(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl (%esp),%edx - paddd %xmm3,%xmm6 - rorl $11,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 92(%esp),%edx - xorl %edi,%eax - rorl $2,%ecx - addl %edx,%eax - addl 16(%esp),%edx - addl %ecx,%eax - movdqa %xmm6,80(%esp) - cmpl $66051,64(%ebp) - jne L011ssse3_00_47 - movl %edx,%ecx - rorl $14,%edx - movl 20(%esp),%esi - xorl %ecx,%edx - movl 24(%esp),%edi - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%edx - movl 4(%esp),%edi - movl %eax,%esi - rorl $9,%ecx - movl %eax,(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 28(%esp),%edx - rorl $11,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 32(%esp),%edx - xorl %edi,%ebx - rorl $2,%ecx - addl %edx,%ebx - addl 12(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - movl 16(%esp),%esi - xorl %ecx,%edx - movl 20(%esp),%edi - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,12(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %ebx,%ecx - addl %edi,%edx - movl (%esp),%edi - movl %ebx,%esi - rorl $9,%ecx - movl %ebx,28(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 24(%esp),%edx - rorl $11,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 36(%esp),%edx - xorl %edi,%eax - rorl $2,%ecx - addl %edx,%eax - addl 8(%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - rorl $14,%edx - movl 12(%esp),%esi - xorl %ecx,%edx - movl 16(%esp),%edi - xorl 
%edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%edx - movl 28(%esp),%edi - movl %eax,%esi - rorl $9,%ecx - movl %eax,24(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 20(%esp),%edx - rorl $11,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 40(%esp),%edx - xorl %edi,%ebx - rorl $2,%ecx - addl %edx,%ebx - addl 4(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - movl 8(%esp),%esi - xorl %ecx,%edx - movl 12(%esp),%edi - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,4(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 24(%esp),%edi - movl %ebx,%esi - rorl $9,%ecx - movl %ebx,20(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 16(%esp),%edx - rorl $11,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 44(%esp),%edx - xorl %edi,%eax - rorl $2,%ecx - addl %edx,%eax - addl (%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - rorl $14,%edx - movl 4(%esp),%esi - xorl %ecx,%edx - movl 8(%esp),%edi - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%edx - movl 20(%esp),%edi - movl %eax,%esi - rorl $9,%ecx - movl %eax,16(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 12(%esp),%edx - rorl $11,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 48(%esp),%edx - xorl %edi,%ebx - rorl $2,%ecx - addl %edx,%ebx - addl 28(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - movl (%esp),%esi - xorl %ecx,%edx - movl 4(%esp),%edi - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,28(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 16(%esp),%edi - movl %ebx,%esi - rorl $9,%ecx - movl %ebx,12(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 8(%esp),%edx - rorl $11,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 52(%esp),%edx - xorl %edi,%eax - rorl $2,%ecx - addl %edx,%eax - addl 24(%esp),%edx - 
addl %ecx,%eax - movl %edx,%ecx - rorl $14,%edx - movl 28(%esp),%esi - xorl %ecx,%edx - movl (%esp),%edi - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%edx - movl 12(%esp),%edi - movl %eax,%esi - rorl $9,%ecx - movl %eax,8(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 4(%esp),%edx - rorl $11,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 56(%esp),%edx - xorl %edi,%ebx - rorl $2,%ecx - addl %edx,%ebx - addl 20(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - movl 24(%esp),%esi - xorl %ecx,%edx - movl 28(%esp),%edi - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,20(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 8(%esp),%edi - movl %ebx,%esi - rorl $9,%ecx - movl %ebx,4(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl (%esp),%edx - rorl $11,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 60(%esp),%edx - xorl %edi,%eax - rorl $2,%ecx - addl %edx,%eax - addl 16(%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - rorl $14,%edx - movl 20(%esp),%esi - xorl %ecx,%edx - movl 24(%esp),%edi - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%edx - movl 4(%esp),%edi - movl %eax,%esi - rorl $9,%ecx - movl %eax,(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 28(%esp),%edx - rorl $11,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 64(%esp),%edx - xorl %edi,%ebx - rorl $2,%ecx - addl %edx,%ebx - addl 12(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - movl 16(%esp),%esi - xorl %ecx,%edx - movl 20(%esp),%edi - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,12(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %ebx,%ecx - addl %edi,%edx - movl (%esp),%edi - movl %ebx,%esi - rorl $9,%ecx - movl %ebx,28(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 24(%esp),%edx - rorl $11,%ecx - andl 
%ebx,%eax - xorl %esi,%ecx - addl 68(%esp),%edx - xorl %edi,%eax - rorl $2,%ecx - addl %edx,%eax - addl 8(%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - rorl $14,%edx - movl 12(%esp),%esi - xorl %ecx,%edx - movl 16(%esp),%edi - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%edx - movl 28(%esp),%edi - movl %eax,%esi - rorl $9,%ecx - movl %eax,24(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 20(%esp),%edx - rorl $11,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 72(%esp),%edx - xorl %edi,%ebx - rorl $2,%ecx - addl %edx,%ebx - addl 4(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - movl 8(%esp),%esi - xorl %ecx,%edx - movl 12(%esp),%edi - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,4(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 24(%esp),%edi - movl %ebx,%esi - rorl $9,%ecx - movl %ebx,20(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 16(%esp),%edx - rorl $11,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 76(%esp),%edx - xorl %edi,%eax - rorl $2,%ecx - addl %edx,%eax - addl (%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - rorl $14,%edx - movl 4(%esp),%esi - xorl %ecx,%edx - movl 8(%esp),%edi - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%edx - movl 20(%esp),%edi - movl %eax,%esi - rorl $9,%ecx - movl %eax,16(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 12(%esp),%edx - rorl $11,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 80(%esp),%edx - xorl %edi,%ebx - rorl $2,%ecx - addl %edx,%ebx - addl 28(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - movl (%esp),%esi - xorl %ecx,%edx - movl 4(%esp),%edi - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,28(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 16(%esp),%edi - movl %ebx,%esi 
- rorl $9,%ecx - movl %ebx,12(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 8(%esp),%edx - rorl $11,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 84(%esp),%edx - xorl %edi,%eax - rorl $2,%ecx - addl %edx,%eax - addl 24(%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - rorl $14,%edx - movl 28(%esp),%esi - xorl %ecx,%edx - movl (%esp),%edi - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%edx - movl 12(%esp),%edi - movl %eax,%esi - rorl $9,%ecx - movl %eax,8(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 4(%esp),%edx - rorl $11,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 88(%esp),%edx - xorl %edi,%ebx - rorl $2,%ecx - addl %edx,%ebx - addl 20(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - movl 24(%esp),%esi - xorl %ecx,%edx - movl 28(%esp),%edi - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,20(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 8(%esp),%edi - movl %ebx,%esi - rorl $9,%ecx - movl %ebx,4(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl (%esp),%edx - rorl $11,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 92(%esp),%edx - xorl %edi,%eax - rorl $2,%ecx - addl %edx,%eax - addl 16(%esp),%edx - addl %ecx,%eax - movl 96(%esp),%esi - xorl %edi,%ebx - movl 12(%esp),%ecx - addl (%esi),%eax - addl 4(%esi),%ebx - addl 8(%esi),%edi - addl 12(%esi),%ecx - movl %eax,(%esi) - movl %ebx,4(%esi) - movl %edi,8(%esi) - movl %ecx,12(%esi) - movl %ebx,4(%esp) - xorl %edi,%ebx - movl %edi,8(%esp) - movl %ecx,12(%esp) - movl 20(%esp),%edi - movl 24(%esp),%ecx - addl 16(%esi),%edx - addl 20(%esi),%edi - addl 24(%esi),%ecx - movl %edx,16(%esi) - movl %edi,20(%esi) - movl %edi,20(%esp) - movl 28(%esp),%edi - movl %ecx,24(%esi) - addl 28(%esi),%edi - movl %ecx,24(%esp) - movl %edi,28(%esi) - movl %edi,28(%esp) - movl 100(%esp),%edi - movdqa 64(%ebp),%xmm7 - subl $192,%ebp - cmpl 104(%esp),%edi - jb 
L010grand_ssse3 - movl 108(%esp),%esp - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.align 5,0x90 -L004AVX: - leal -96(%esp),%esp - vzeroall - movl (%esi),%eax - movl 4(%esi),%ebx - movl 8(%esi),%ecx - movl 12(%esi),%edi - movl %ebx,4(%esp) - xorl %ecx,%ebx - movl %ecx,8(%esp) - movl %edi,12(%esp) - movl 16(%esi),%edx - movl 20(%esi),%edi - movl 24(%esi),%ecx - movl 28(%esi),%esi - movl %edi,20(%esp) - movl 100(%esp),%edi - movl %ecx,24(%esp) - movl %esi,28(%esp) - vmovdqa 256(%ebp),%xmm7 - jmp L012grand_avx -.align 5,0x90 -L012grand_avx: - vmovdqu (%edi),%xmm0 - vmovdqu 16(%edi),%xmm1 - vmovdqu 32(%edi),%xmm2 - vmovdqu 48(%edi),%xmm3 - addl $64,%edi - vpshufb %xmm7,%xmm0,%xmm0 - movl %edi,100(%esp) - vpshufb %xmm7,%xmm1,%xmm1 - vpshufb %xmm7,%xmm2,%xmm2 - vpaddd (%ebp),%xmm0,%xmm4 - vpshufb %xmm7,%xmm3,%xmm3 - vpaddd 16(%ebp),%xmm1,%xmm5 - vpaddd 32(%ebp),%xmm2,%xmm6 - vpaddd 48(%ebp),%xmm3,%xmm7 - vmovdqa %xmm4,32(%esp) - vmovdqa %xmm5,48(%esp) - vmovdqa %xmm6,64(%esp) - vmovdqa %xmm7,80(%esp) - jmp L013avx_00_47 -.align 4,0x90 -L013avx_00_47: - addl $64,%ebp - vpalignr $4,%xmm0,%xmm1,%xmm4 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 20(%esp),%esi - vpalignr $4,%xmm2,%xmm3,%xmm7 - xorl %ecx,%edx - movl 24(%esp),%edi - xorl %edi,%esi - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - vpaddd %xmm7,%xmm0,%xmm0 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrld $3,%xmm4,%xmm7 - movl %eax,%ecx - addl %edi,%edx - movl 4(%esp),%edi - vpslld $14,%xmm4,%xmm5 - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,(%esp) - vpxor %xmm6,%xmm7,%xmm4 - xorl %eax,%ecx - xorl %edi,%eax - addl 28(%esp),%edx - vpshufd $250,%xmm3,%xmm7 - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - vpsrld $11,%xmm6,%xmm6 - addl 32(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - vpxor %xmm5,%xmm4,%xmm4 - addl %edx,%ebx - addl 12(%esp),%edx - addl %ecx,%ebx - vpslld $11,%xmm5,%xmm5 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 
16(%esp),%esi - vpxor %xmm6,%xmm4,%xmm4 - xorl %ecx,%edx - movl 20(%esp),%edi - xorl %edi,%esi - vpsrld $10,%xmm7,%xmm6 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,12(%esp) - vpxor %xmm5,%xmm4,%xmm4 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrlq $17,%xmm7,%xmm5 - movl %ebx,%ecx - addl %edi,%edx - movl (%esp),%edi - vpaddd %xmm4,%xmm0,%xmm0 - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,28(%esp) - vpxor %xmm5,%xmm6,%xmm6 - xorl %ebx,%ecx - xorl %edi,%ebx - addl 24(%esp),%edx - vpsrlq $19,%xmm7,%xmm7 - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - vpxor %xmm7,%xmm6,%xmm6 - addl 36(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - vpshufd $132,%xmm6,%xmm7 - addl %edx,%eax - addl 8(%esp),%edx - addl %ecx,%eax - vpsrldq $8,%xmm7,%xmm7 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 12(%esp),%esi - vpaddd %xmm7,%xmm0,%xmm0 - xorl %ecx,%edx - movl 16(%esp),%edi - xorl %edi,%esi - vpshufd $80,%xmm0,%xmm7 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - vpsrld $10,%xmm7,%xmm6 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrlq $17,%xmm7,%xmm5 - movl %eax,%ecx - addl %edi,%edx - movl 28(%esp),%edi - vpxor %xmm5,%xmm6,%xmm6 - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,24(%esp) - vpsrlq $19,%xmm7,%xmm7 - xorl %eax,%ecx - xorl %edi,%eax - addl 20(%esp),%edx - vpxor %xmm7,%xmm6,%xmm6 - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - vpshufd $232,%xmm6,%xmm7 - addl 40(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - vpslldq $8,%xmm7,%xmm7 - addl %edx,%ebx - addl 4(%esp),%edx - addl %ecx,%ebx - vpaddd %xmm7,%xmm0,%xmm0 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 8(%esp),%esi - vpaddd (%ebp),%xmm0,%xmm6 - xorl %ecx,%edx - movl 12(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,4(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 24(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,20(%esp) - xorl %ebx,%ecx - xorl 
%edi,%ebx - addl 16(%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 44(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl (%esp),%edx - addl %ecx,%eax - vmovdqa %xmm6,32(%esp) - vpalignr $4,%xmm1,%xmm2,%xmm4 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 4(%esp),%esi - vpalignr $4,%xmm3,%xmm0,%xmm7 - xorl %ecx,%edx - movl 8(%esp),%edi - xorl %edi,%esi - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,(%esp) - vpaddd %xmm7,%xmm1,%xmm1 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrld $3,%xmm4,%xmm7 - movl %eax,%ecx - addl %edi,%edx - movl 20(%esp),%edi - vpslld $14,%xmm4,%xmm5 - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,16(%esp) - vpxor %xmm6,%xmm7,%xmm4 - xorl %eax,%ecx - xorl %edi,%eax - addl 12(%esp),%edx - vpshufd $250,%xmm0,%xmm7 - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - vpsrld $11,%xmm6,%xmm6 - addl 48(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - vpxor %xmm5,%xmm4,%xmm4 - addl %edx,%ebx - addl 28(%esp),%edx - addl %ecx,%ebx - vpslld $11,%xmm5,%xmm5 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl (%esp),%esi - vpxor %xmm6,%xmm4,%xmm4 - xorl %ecx,%edx - movl 4(%esp),%edi - xorl %edi,%esi - vpsrld $10,%xmm7,%xmm6 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,28(%esp) - vpxor %xmm5,%xmm4,%xmm4 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrlq $17,%xmm7,%xmm5 - movl %ebx,%ecx - addl %edi,%edx - movl 16(%esp),%edi - vpaddd %xmm4,%xmm1,%xmm1 - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,12(%esp) - vpxor %xmm5,%xmm6,%xmm6 - xorl %ebx,%ecx - xorl %edi,%ebx - addl 8(%esp),%edx - vpsrlq $19,%xmm7,%xmm7 - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - vpxor %xmm7,%xmm6,%xmm6 - addl 52(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - vpshufd $132,%xmm6,%xmm7 - addl %edx,%eax - addl 24(%esp),%edx - addl %ecx,%eax - vpsrldq $8,%xmm7,%xmm7 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 28(%esp),%esi - vpaddd %xmm7,%xmm1,%xmm1 - xorl 
%ecx,%edx - movl (%esp),%edi - xorl %edi,%esi - vpshufd $80,%xmm1,%xmm7 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - vpsrld $10,%xmm7,%xmm6 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrlq $17,%xmm7,%xmm5 - movl %eax,%ecx - addl %edi,%edx - movl 12(%esp),%edi - vpxor %xmm5,%xmm6,%xmm6 - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,8(%esp) - vpsrlq $19,%xmm7,%xmm7 - xorl %eax,%ecx - xorl %edi,%eax - addl 4(%esp),%edx - vpxor %xmm7,%xmm6,%xmm6 - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - vpshufd $232,%xmm6,%xmm7 - addl 56(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - vpslldq $8,%xmm7,%xmm7 - addl %edx,%ebx - addl 20(%esp),%edx - addl %ecx,%ebx - vpaddd %xmm7,%xmm1,%xmm1 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 24(%esp),%esi - vpaddd 16(%ebp),%xmm1,%xmm6 - xorl %ecx,%edx - movl 28(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,20(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 8(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,4(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl (%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 60(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl 16(%esp),%edx - addl %ecx,%eax - vmovdqa %xmm6,48(%esp) - vpalignr $4,%xmm2,%xmm3,%xmm4 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 20(%esp),%esi - vpalignr $4,%xmm0,%xmm1,%xmm7 - xorl %ecx,%edx - movl 24(%esp),%edi - xorl %edi,%esi - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - vpaddd %xmm7,%xmm2,%xmm2 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrld $3,%xmm4,%xmm7 - movl %eax,%ecx - addl %edi,%edx - movl 4(%esp),%edi - vpslld $14,%xmm4,%xmm5 - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,(%esp) - vpxor %xmm6,%xmm7,%xmm4 - xorl %eax,%ecx - xorl %edi,%eax - addl 28(%esp),%edx - vpshufd $250,%xmm1,%xmm7 - shrdl $11,%ecx,%ecx - andl %eax,%ebx 
- xorl %esi,%ecx - vpsrld $11,%xmm6,%xmm6 - addl 64(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - vpxor %xmm5,%xmm4,%xmm4 - addl %edx,%ebx - addl 12(%esp),%edx - addl %ecx,%ebx - vpslld $11,%xmm5,%xmm5 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 16(%esp),%esi - vpxor %xmm6,%xmm4,%xmm4 - xorl %ecx,%edx - movl 20(%esp),%edi - xorl %edi,%esi - vpsrld $10,%xmm7,%xmm6 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,12(%esp) - vpxor %xmm5,%xmm4,%xmm4 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrlq $17,%xmm7,%xmm5 - movl %ebx,%ecx - addl %edi,%edx - movl (%esp),%edi - vpaddd %xmm4,%xmm2,%xmm2 - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,28(%esp) - vpxor %xmm5,%xmm6,%xmm6 - xorl %ebx,%ecx - xorl %edi,%ebx - addl 24(%esp),%edx - vpsrlq $19,%xmm7,%xmm7 - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - vpxor %xmm7,%xmm6,%xmm6 - addl 68(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - vpshufd $132,%xmm6,%xmm7 - addl %edx,%eax - addl 8(%esp),%edx - addl %ecx,%eax - vpsrldq $8,%xmm7,%xmm7 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 12(%esp),%esi - vpaddd %xmm7,%xmm2,%xmm2 - xorl %ecx,%edx - movl 16(%esp),%edi - xorl %edi,%esi - vpshufd $80,%xmm2,%xmm7 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - vpsrld $10,%xmm7,%xmm6 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrlq $17,%xmm7,%xmm5 - movl %eax,%ecx - addl %edi,%edx - movl 28(%esp),%edi - vpxor %xmm5,%xmm6,%xmm6 - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,24(%esp) - vpsrlq $19,%xmm7,%xmm7 - xorl %eax,%ecx - xorl %edi,%eax - addl 20(%esp),%edx - vpxor %xmm7,%xmm6,%xmm6 - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - vpshufd $232,%xmm6,%xmm7 - addl 72(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - vpslldq $8,%xmm7,%xmm7 - addl %edx,%ebx - addl 4(%esp),%edx - addl %ecx,%ebx - vpaddd %xmm7,%xmm2,%xmm2 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 8(%esp),%esi - vpaddd 32(%ebp),%xmm2,%xmm6 - xorl %ecx,%edx - movl 12(%esp),%edi - xorl %edi,%esi 
- shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,4(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 24(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,20(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 16(%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 76(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl (%esp),%edx - addl %ecx,%eax - vmovdqa %xmm6,64(%esp) - vpalignr $4,%xmm3,%xmm0,%xmm4 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 4(%esp),%esi - vpalignr $4,%xmm1,%xmm2,%xmm7 - xorl %ecx,%edx - movl 8(%esp),%edi - xorl %edi,%esi - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,(%esp) - vpaddd %xmm7,%xmm3,%xmm3 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrld $3,%xmm4,%xmm7 - movl %eax,%ecx - addl %edi,%edx - movl 20(%esp),%edi - vpslld $14,%xmm4,%xmm5 - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,16(%esp) - vpxor %xmm6,%xmm7,%xmm4 - xorl %eax,%ecx - xorl %edi,%eax - addl 12(%esp),%edx - vpshufd $250,%xmm2,%xmm7 - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - vpsrld $11,%xmm6,%xmm6 - addl 80(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - vpxor %xmm5,%xmm4,%xmm4 - addl %edx,%ebx - addl 28(%esp),%edx - addl %ecx,%ebx - vpslld $11,%xmm5,%xmm5 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl (%esp),%esi - vpxor %xmm6,%xmm4,%xmm4 - xorl %ecx,%edx - movl 4(%esp),%edi - xorl %edi,%esi - vpsrld $10,%xmm7,%xmm6 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,28(%esp) - vpxor %xmm5,%xmm4,%xmm4 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrlq $17,%xmm7,%xmm5 - movl %ebx,%ecx - addl %edi,%edx - movl 16(%esp),%edi - vpaddd %xmm4,%xmm3,%xmm3 - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,12(%esp) - vpxor %xmm5,%xmm6,%xmm6 - xorl %ebx,%ecx - xorl %edi,%ebx - addl 8(%esp),%edx - vpsrlq $19,%xmm7,%xmm7 - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - vpxor %xmm7,%xmm6,%xmm6 - addl 
84(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - vpshufd $132,%xmm6,%xmm7 - addl %edx,%eax - addl 24(%esp),%edx - addl %ecx,%eax - vpsrldq $8,%xmm7,%xmm7 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 28(%esp),%esi - vpaddd %xmm7,%xmm3,%xmm3 - xorl %ecx,%edx - movl (%esp),%edi - xorl %edi,%esi - vpshufd $80,%xmm3,%xmm7 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - vpsrld $10,%xmm7,%xmm6 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrlq $17,%xmm7,%xmm5 - movl %eax,%ecx - addl %edi,%edx - movl 12(%esp),%edi - vpxor %xmm5,%xmm6,%xmm6 - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,8(%esp) - vpsrlq $19,%xmm7,%xmm7 - xorl %eax,%ecx - xorl %edi,%eax - addl 4(%esp),%edx - vpxor %xmm7,%xmm6,%xmm6 - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - vpshufd $232,%xmm6,%xmm7 - addl 88(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - vpslldq $8,%xmm7,%xmm7 - addl %edx,%ebx - addl 20(%esp),%edx - addl %ecx,%ebx - vpaddd %xmm7,%xmm3,%xmm3 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 24(%esp),%esi - vpaddd 48(%ebp),%xmm3,%xmm6 - xorl %ecx,%edx - movl 28(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,20(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 8(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,4(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl (%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 92(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl 16(%esp),%edx - addl %ecx,%eax - vmovdqa %xmm6,80(%esp) - cmpl $66051,64(%ebp) - jne L013avx_00_47 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 20(%esp),%esi - xorl %ecx,%edx - movl 24(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %eax,%ecx - addl %edi,%edx - movl 4(%esp),%edi - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,(%esp) - xorl 
%eax,%ecx - xorl %edi,%eax - addl 28(%esp),%edx - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 32(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - addl %edx,%ebx - addl 12(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 16(%esp),%esi - xorl %ecx,%edx - movl 20(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,12(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl (%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,28(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 24(%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 36(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl 8(%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 12(%esp),%esi - xorl %ecx,%edx - movl 16(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %eax,%ecx - addl %edi,%edx - movl 28(%esp),%edi - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,24(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 20(%esp),%edx - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 40(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - addl %edx,%ebx - addl 4(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 8(%esp),%esi - xorl %ecx,%edx - movl 12(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,4(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 24(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,20(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 16(%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 44(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl (%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 4(%esp),%esi - xorl %ecx,%edx - movl 
8(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %eax,%ecx - addl %edi,%edx - movl 20(%esp),%edi - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,16(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 12(%esp),%edx - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 48(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - addl %edx,%ebx - addl 28(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - shrdl $14,%edx,%edx - movl (%esp),%esi - xorl %ecx,%edx - movl 4(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,28(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 16(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,12(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 8(%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 52(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl 24(%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 28(%esp),%esi - xorl %ecx,%edx - movl (%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %eax,%ecx - addl %edi,%edx - movl 12(%esp),%edi - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,8(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 4(%esp),%edx - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 56(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - addl %edx,%ebx - addl 20(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 24(%esp),%esi - xorl %ecx,%edx - movl 28(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,20(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 8(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,4(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 
(%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 60(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl 16(%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 20(%esp),%esi - xorl %ecx,%edx - movl 24(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %eax,%ecx - addl %edi,%edx - movl 4(%esp),%edi - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 28(%esp),%edx - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 64(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - addl %edx,%ebx - addl 12(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 16(%esp),%esi - xorl %ecx,%edx - movl 20(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,12(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl (%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,28(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 24(%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 68(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl 8(%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 12(%esp),%esi - xorl %ecx,%edx - movl 16(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %eax,%ecx - addl %edi,%edx - movl 28(%esp),%edi - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,24(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 20(%esp),%edx - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 72(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - addl %edx,%ebx - addl 4(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 8(%esp),%esi - xorl %ecx,%edx - movl 12(%esp),%edi - xorl %edi,%esi - shrdl 
$5,%edx,%edx - andl %ecx,%esi - movl %ecx,4(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 24(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,20(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 16(%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 76(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl (%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 4(%esp),%esi - xorl %ecx,%edx - movl 8(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %eax,%ecx - addl %edi,%edx - movl 20(%esp),%edi - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,16(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 12(%esp),%edx - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 80(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - addl %edx,%ebx - addl 28(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - shrdl $14,%edx,%edx - movl (%esp),%esi - xorl %ecx,%edx - movl 4(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,28(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 16(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,12(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 8(%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 84(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl 24(%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 28(%esp),%esi - xorl %ecx,%edx - movl (%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %eax,%ecx - addl %edi,%edx - movl 12(%esp),%edi - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,8(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 4(%esp),%edx - shrdl $11,%ecx,%ecx - andl 
%eax,%ebx - xorl %esi,%ecx - addl 88(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - addl %edx,%ebx - addl 20(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 24(%esp),%esi - xorl %ecx,%edx - movl 28(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,20(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 8(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,4(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl (%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 92(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl 16(%esp),%edx - addl %ecx,%eax - movl 96(%esp),%esi - xorl %edi,%ebx - movl 12(%esp),%ecx - addl (%esi),%eax - addl 4(%esi),%ebx - addl 8(%esi),%edi - addl 12(%esi),%ecx - movl %eax,(%esi) - movl %ebx,4(%esi) - movl %edi,8(%esi) - movl %ecx,12(%esi) - movl %ebx,4(%esp) - xorl %edi,%ebx - movl %edi,8(%esp) - movl %ecx,12(%esp) - movl 20(%esp),%edi - movl 24(%esp),%ecx - addl 16(%esi),%edx - addl 20(%esi),%edi - addl 24(%esi),%ecx - movl %edx,16(%esi) - movl %edi,20(%esi) - movl %edi,20(%esp) - movl 28(%esp),%edi - movl %ecx,24(%esi) - addl 28(%esi),%edi - movl %ecx,24(%esp) - movl %edi,28(%esi) - movl %edi,28(%esp) - movl 100(%esp),%edi - vmovdqa 64(%ebp),%xmm7 - subl $192,%ebp - cmpl 104(%esp),%edi - jb L012grand_avx - movl 108(%esp),%esp - vzeroall - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.section __IMPORT,__pointers,non_lazy_symbol_pointers -L_OPENSSL_ia32cap_P$non_lazy_ptr: -.indirect_symbol _OPENSSL_ia32cap_P -.long 0 -#endif diff --git a/third_party/boringssl/apple-x86/crypto/fipsmodule/sha512-586.S b/third_party/boringssl/apple-x86/crypto/fipsmodule/sha512-586.S deleted file mode 100644 index 8c33cf59..00000000 --- a/third_party/boringssl/apple-x86/crypto/fipsmodule/sha512-586.S +++ /dev/null @@ -1,2838 +0,0 @@ -// This file is generated from a similarly-named Perl script in the 
BoringSSL -// source tree. Do not edit by hand. - -#if defined(__i386__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text -.globl _sha512_block_data_order -.private_extern _sha512_block_data_order -.align 4 -_sha512_block_data_order: -L_sha512_block_data_order_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - movl 20(%esp),%esi - movl 24(%esp),%edi - movl 28(%esp),%eax - movl %esp,%ebx - call L000pic_point -L000pic_point: - popl %ebp - leal L001K512-L000pic_point(%ebp),%ebp - subl $16,%esp - andl $-64,%esp - shll $7,%eax - addl %edi,%eax - movl %esi,(%esp) - movl %edi,4(%esp) - movl %eax,8(%esp) - movl %ebx,12(%esp) - movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L001K512(%ebp),%edx - movl (%edx),%ecx - testl $67108864,%ecx - jz L002loop_x86 - movl 4(%edx),%edx - movq (%esi),%mm0 - andl $16777216,%ecx - movq 8(%esi),%mm1 - andl $512,%edx - movq 16(%esi),%mm2 - orl %edx,%ecx - movq 24(%esi),%mm3 - movq 32(%esi),%mm4 - movq 40(%esi),%mm5 - movq 48(%esi),%mm6 - movq 56(%esi),%mm7 - cmpl $16777728,%ecx - je L003SSSE3 - subl $80,%esp - jmp L004loop_sse2 -.align 4,0x90 -L004loop_sse2: - movq %mm1,8(%esp) - movq %mm2,16(%esp) - movq %mm3,24(%esp) - movq %mm5,40(%esp) - movq %mm6,48(%esp) - pxor %mm1,%mm2 - movq %mm7,56(%esp) - movq %mm0,%mm3 - movl (%edi),%eax - movl 4(%edi),%ebx - addl $8,%edi - movl $15,%edx - bswap %eax - bswap %ebx - jmp L00500_14_sse2 -.align 4,0x90 -L00500_14_sse2: - movd %eax,%mm1 - movl (%edi),%eax - movd %ebx,%mm7 - movl 4(%edi),%ebx - addl $8,%edi - bswap %eax - bswap %ebx - punpckldq %mm1,%mm7 - movq %mm4,%mm1 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,32(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - movq %mm3,%mm0 - movq %mm7,72(%esp) - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm0,(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 56(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - paddq (%ebp),%mm7 - pxor %mm4,%mm3 - movq 24(%esp),%mm4 - paddq %mm7,%mm3 - 
movq %mm0,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm0,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 8(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - subl $8,%esp - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm0,%mm2 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - pxor %mm7,%mm6 - movq 40(%esp),%mm5 - paddq %mm2,%mm3 - movq %mm0,%mm2 - addl $8,%ebp - paddq %mm6,%mm3 - movq 48(%esp),%mm6 - decl %edx - jnz L00500_14_sse2 - movd %eax,%mm1 - movd %ebx,%mm7 - punpckldq %mm1,%mm7 - movq %mm4,%mm1 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,32(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - movq %mm3,%mm0 - movq %mm7,72(%esp) - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm0,(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 56(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - paddq (%ebp),%mm7 - pxor %mm4,%mm3 - movq 24(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm0,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm0,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 8(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - subl $8,%esp - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm0,%mm2 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - pxor %mm7,%mm6 - movq 192(%esp),%mm7 - paddq %mm2,%mm3 - movq %mm0,%mm2 - addl $8,%ebp - paddq %mm6,%mm3 - pxor %mm0,%mm0 - movl $32,%edx - jmp L00616_79_sse2 -.align 4,0x90 -L00616_79_sse2: - movq 88(%esp),%mm5 - movq %mm7,%mm1 - psrlq $1,%mm7 - movq %mm5,%mm6 - psrlq $6,%mm5 - psllq $56,%mm1 - paddq %mm3,%mm0 - movq %mm7,%mm3 - psrlq $6,%mm7 - pxor %mm1,%mm3 - psllq $7,%mm1 - pxor %mm7,%mm3 - psrlq $1,%mm7 - pxor %mm1,%mm3 - movq %mm5,%mm1 - psrlq $13,%mm5 - pxor %mm3,%mm7 - psllq $3,%mm6 - pxor %mm5,%mm1 - paddq 200(%esp),%mm7 - pxor %mm6,%mm1 - psrlq $42,%mm5 - paddq 128(%esp),%mm7 - pxor %mm5,%mm1 - psllq $42,%mm6 - movq 40(%esp),%mm5 - pxor %mm6,%mm1 - movq 48(%esp),%mm6 - paddq 
%mm1,%mm7 - movq %mm4,%mm1 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,32(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - movq %mm7,72(%esp) - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm0,(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 56(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - paddq (%ebp),%mm7 - pxor %mm4,%mm3 - movq 24(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm0,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm0,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 8(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - subl $8,%esp - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm0,%mm2 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - pxor %mm7,%mm6 - movq 192(%esp),%mm7 - paddq %mm6,%mm2 - addl $8,%ebp - movq 88(%esp),%mm5 - movq %mm7,%mm1 - psrlq $1,%mm7 - movq %mm5,%mm6 - psrlq $6,%mm5 - psllq $56,%mm1 - paddq %mm3,%mm2 - movq %mm7,%mm3 - psrlq $6,%mm7 - pxor %mm1,%mm3 - psllq $7,%mm1 - pxor %mm7,%mm3 - psrlq $1,%mm7 - pxor %mm1,%mm3 - movq %mm5,%mm1 - psrlq $13,%mm5 - pxor %mm3,%mm7 - psllq $3,%mm6 - pxor %mm5,%mm1 - paddq 200(%esp),%mm7 - pxor %mm6,%mm1 - psrlq $42,%mm5 - paddq 128(%esp),%mm7 - pxor %mm5,%mm1 - psllq $42,%mm6 - movq 40(%esp),%mm5 - pxor %mm6,%mm1 - movq 48(%esp),%mm6 - paddq %mm1,%mm7 - movq %mm4,%mm1 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,32(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - movq %mm7,72(%esp) - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm2,(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 56(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - paddq (%ebp),%mm7 - pxor %mm4,%mm3 - movq 24(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm2,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm2,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 8(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - subl $8,%esp - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - 
psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm2,%mm0 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - pxor %mm7,%mm6 - movq 192(%esp),%mm7 - paddq %mm6,%mm0 - addl $8,%ebp - decl %edx - jnz L00616_79_sse2 - paddq %mm3,%mm0 - movq 8(%esp),%mm1 - movq 24(%esp),%mm3 - movq 40(%esp),%mm5 - movq 48(%esp),%mm6 - movq 56(%esp),%mm7 - pxor %mm1,%mm2 - paddq (%esi),%mm0 - paddq 8(%esi),%mm1 - paddq 16(%esi),%mm2 - paddq 24(%esi),%mm3 - paddq 32(%esi),%mm4 - paddq 40(%esi),%mm5 - paddq 48(%esi),%mm6 - paddq 56(%esi),%mm7 - movl $640,%eax - movq %mm0,(%esi) - movq %mm1,8(%esi) - movq %mm2,16(%esi) - movq %mm3,24(%esi) - movq %mm4,32(%esi) - movq %mm5,40(%esi) - movq %mm6,48(%esi) - movq %mm7,56(%esi) - leal (%esp,%eax,1),%esp - subl %eax,%ebp - cmpl 88(%esp),%edi - jb L004loop_sse2 - movl 92(%esp),%esp - emms - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.align 5,0x90 -L003SSSE3: - leal -64(%esp),%edx - subl $256,%esp - movdqa 640(%ebp),%xmm1 - movdqu (%edi),%xmm0 -.byte 102,15,56,0,193 - movdqa (%ebp),%xmm3 - movdqa %xmm1,%xmm2 - movdqu 16(%edi),%xmm1 - paddq %xmm0,%xmm3 -.byte 102,15,56,0,202 - movdqa %xmm3,-128(%edx) - movdqa 16(%ebp),%xmm4 - movdqa %xmm2,%xmm3 - movdqu 32(%edi),%xmm2 - paddq %xmm1,%xmm4 -.byte 102,15,56,0,211 - movdqa %xmm4,-112(%edx) - movdqa 32(%ebp),%xmm5 - movdqa %xmm3,%xmm4 - movdqu 48(%edi),%xmm3 - paddq %xmm2,%xmm5 -.byte 102,15,56,0,220 - movdqa %xmm5,-96(%edx) - movdqa 48(%ebp),%xmm6 - movdqa %xmm4,%xmm5 - movdqu 64(%edi),%xmm4 - paddq %xmm3,%xmm6 -.byte 102,15,56,0,229 - movdqa %xmm6,-80(%edx) - movdqa 64(%ebp),%xmm7 - movdqa %xmm5,%xmm6 - movdqu 80(%edi),%xmm5 - paddq %xmm4,%xmm7 -.byte 102,15,56,0,238 - movdqa %xmm7,-64(%edx) - movdqa %xmm0,(%edx) - movdqa 80(%ebp),%xmm0 - movdqa %xmm6,%xmm7 - movdqu 96(%edi),%xmm6 - paddq %xmm5,%xmm0 -.byte 102,15,56,0,247 - movdqa %xmm0,-48(%edx) - movdqa %xmm1,16(%edx) - movdqa 96(%ebp),%xmm1 - movdqa %xmm7,%xmm0 - movdqu 112(%edi),%xmm7 - paddq %xmm6,%xmm1 -.byte 102,15,56,0,248 - movdqa 
%xmm1,-32(%edx) - movdqa %xmm2,32(%edx) - movdqa 112(%ebp),%xmm2 - movdqa (%edx),%xmm0 - paddq %xmm7,%xmm2 - movdqa %xmm2,-16(%edx) - nop -.align 5,0x90 -L007loop_ssse3: - movdqa 16(%edx),%xmm2 - movdqa %xmm3,48(%edx) - leal 128(%ebp),%ebp - movq %mm1,8(%esp) - movl %edi,%ebx - movq %mm2,16(%esp) - leal 128(%edi),%edi - movq %mm3,24(%esp) - cmpl %eax,%edi - movq %mm5,40(%esp) - cmovbl %edi,%ebx - movq %mm6,48(%esp) - movl $4,%ecx - pxor %mm1,%mm2 - movq %mm7,56(%esp) - pxor %mm3,%mm3 - jmp L00800_47_ssse3 -.align 5,0x90 -L00800_47_ssse3: - movdqa %xmm5,%xmm3 - movdqa %xmm2,%xmm1 -.byte 102,15,58,15,208,8 - movdqa %xmm4,(%edx) -.byte 102,15,58,15,220,8 - movdqa %xmm2,%xmm4 - psrlq $7,%xmm2 - paddq %xmm3,%xmm0 - movdqa %xmm4,%xmm3 - psrlq $1,%xmm4 - psllq $56,%xmm3 - pxor %xmm4,%xmm2 - psrlq $7,%xmm4 - pxor %xmm3,%xmm2 - psllq $7,%xmm3 - pxor %xmm4,%xmm2 - movdqa %xmm7,%xmm4 - pxor %xmm3,%xmm2 - movdqa %xmm7,%xmm3 - psrlq $6,%xmm4 - paddq %xmm2,%xmm0 - movdqa %xmm7,%xmm2 - psrlq $19,%xmm3 - psllq $3,%xmm2 - pxor %xmm3,%xmm4 - psrlq $42,%xmm3 - pxor %xmm2,%xmm4 - psllq $42,%xmm2 - pxor %xmm3,%xmm4 - movdqa 32(%edx),%xmm3 - pxor %xmm2,%xmm4 - movdqa (%ebp),%xmm2 - movq %mm4,%mm1 - paddq %xmm4,%xmm0 - movq -128(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,32(%esp) - paddq %xmm0,%xmm2 - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm0 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm0,(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 56(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 24(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm0,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm0,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 8(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm0,%mm2 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - pxor %mm7,%mm6 - movq 32(%esp),%mm5 - paddq 
%mm6,%mm2 - movq 40(%esp),%mm6 - movq %mm4,%mm1 - movq -120(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,24(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm2 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm2,56(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 48(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 16(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm2,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm2,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq (%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm2,%mm0 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - pxor %mm7,%mm6 - movq 24(%esp),%mm5 - paddq %mm6,%mm0 - movq 32(%esp),%mm6 - movdqa %xmm2,-128(%edx) - movdqa %xmm6,%xmm4 - movdqa %xmm3,%xmm2 -.byte 102,15,58,15,217,8 - movdqa %xmm5,16(%edx) -.byte 102,15,58,15,229,8 - movdqa %xmm3,%xmm5 - psrlq $7,%xmm3 - paddq %xmm4,%xmm1 - movdqa %xmm5,%xmm4 - psrlq $1,%xmm5 - psllq $56,%xmm4 - pxor %xmm5,%xmm3 - psrlq $7,%xmm5 - pxor %xmm4,%xmm3 - psllq $7,%xmm4 - pxor %xmm5,%xmm3 - movdqa %xmm0,%xmm5 - pxor %xmm4,%xmm3 - movdqa %xmm0,%xmm4 - psrlq $6,%xmm5 - paddq %xmm3,%xmm1 - movdqa %xmm0,%xmm3 - psrlq $19,%xmm4 - psllq $3,%xmm3 - pxor %xmm4,%xmm5 - psrlq $42,%xmm4 - pxor %xmm3,%xmm5 - psllq $42,%xmm3 - pxor %xmm4,%xmm5 - movdqa 48(%edx),%xmm4 - pxor %xmm3,%xmm5 - movdqa 16(%ebp),%xmm3 - movq %mm4,%mm1 - paddq %xmm5,%xmm1 - movq -112(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,16(%esp) - paddq %xmm1,%xmm3 - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm0 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm0,48(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 40(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 8(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm0,%mm5 - 
psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm0,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 56(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm0,%mm2 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - pxor %mm7,%mm6 - movq 16(%esp),%mm5 - paddq %mm6,%mm2 - movq 24(%esp),%mm6 - movq %mm4,%mm1 - movq -104(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,8(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm2 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm2,40(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 32(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq (%esp),%mm4 - paddq %mm7,%mm3 - movq %mm2,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm2,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 48(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm2,%mm0 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - pxor %mm7,%mm6 - movq 8(%esp),%mm5 - paddq %mm6,%mm0 - movq 16(%esp),%mm6 - movdqa %xmm3,-112(%edx) - movdqa %xmm7,%xmm5 - movdqa %xmm4,%xmm3 -.byte 102,15,58,15,226,8 - movdqa %xmm6,32(%edx) -.byte 102,15,58,15,238,8 - movdqa %xmm4,%xmm6 - psrlq $7,%xmm4 - paddq %xmm5,%xmm2 - movdqa %xmm6,%xmm5 - psrlq $1,%xmm6 - psllq $56,%xmm5 - pxor %xmm6,%xmm4 - psrlq $7,%xmm6 - pxor %xmm5,%xmm4 - psllq $7,%xmm5 - pxor %xmm6,%xmm4 - movdqa %xmm1,%xmm6 - pxor %xmm5,%xmm4 - movdqa %xmm1,%xmm5 - psrlq $6,%xmm6 - paddq %xmm4,%xmm2 - movdqa %xmm1,%xmm4 - psrlq $19,%xmm5 - psllq $3,%xmm4 - pxor %xmm5,%xmm6 - psrlq $42,%xmm5 - pxor %xmm4,%xmm6 - psllq $42,%xmm4 - pxor %xmm5,%xmm6 - movdqa (%edx),%xmm5 - pxor %xmm4,%xmm6 - movdqa 32(%ebp),%xmm4 - movq %mm4,%mm1 - paddq %xmm6,%xmm2 - movq -96(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,(%esp) - paddq %xmm2,%xmm4 - pand %mm4,%mm5 - psllq 
$23,%mm4 - paddq %mm3,%mm0 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm0,32(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 24(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 56(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm0,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm0,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 40(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm0,%mm2 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - pxor %mm7,%mm6 - movq (%esp),%mm5 - paddq %mm6,%mm2 - movq 8(%esp),%mm6 - movq %mm4,%mm1 - movq -88(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,56(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm2 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm2,24(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 16(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 48(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm2,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm2,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 32(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm2,%mm0 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - pxor %mm7,%mm6 - movq 56(%esp),%mm5 - paddq %mm6,%mm0 - movq (%esp),%mm6 - movdqa %xmm4,-96(%edx) - movdqa %xmm0,%xmm6 - movdqa %xmm5,%xmm4 -.byte 102,15,58,15,235,8 - movdqa %xmm7,48(%edx) -.byte 102,15,58,15,247,8 - movdqa %xmm5,%xmm7 - psrlq $7,%xmm5 - paddq %xmm6,%xmm3 - movdqa %xmm7,%xmm6 - psrlq $1,%xmm7 - psllq $56,%xmm6 - pxor %xmm7,%xmm5 - psrlq $7,%xmm7 - pxor %xmm6,%xmm5 - psllq $7,%xmm6 - pxor %xmm7,%xmm5 - movdqa %xmm2,%xmm7 - pxor %xmm6,%xmm5 - movdqa %xmm2,%xmm6 - psrlq $6,%xmm7 - paddq %xmm5,%xmm3 - movdqa %xmm2,%xmm5 - psrlq $19,%xmm6 - psllq 
$3,%xmm5 - pxor %xmm6,%xmm7 - psrlq $42,%xmm6 - pxor %xmm5,%xmm7 - psllq $42,%xmm5 - pxor %xmm6,%xmm7 - movdqa 16(%edx),%xmm6 - pxor %xmm5,%xmm7 - movdqa 48(%ebp),%xmm5 - movq %mm4,%mm1 - paddq %xmm7,%xmm3 - movq -80(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,48(%esp) - paddq %xmm3,%xmm5 - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm0 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm0,16(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 8(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 40(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm0,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm0,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 24(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm0,%mm2 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - pxor %mm7,%mm6 - movq 48(%esp),%mm5 - paddq %mm6,%mm2 - movq 56(%esp),%mm6 - movq %mm4,%mm1 - movq -72(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,40(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm2 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm2,8(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq (%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 32(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm2,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm2,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 16(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm2,%mm0 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - pxor %mm7,%mm6 - movq 40(%esp),%mm5 - paddq %mm6,%mm0 - movq 48(%esp),%mm6 - movdqa %xmm5,-80(%edx) - movdqa %xmm1,%xmm7 - movdqa %xmm6,%xmm5 -.byte 102,15,58,15,244,8 - movdqa %xmm0,(%edx) -.byte 102,15,58,15,248,8 - movdqa %xmm6,%xmm0 - 
psrlq $7,%xmm6 - paddq %xmm7,%xmm4 - movdqa %xmm0,%xmm7 - psrlq $1,%xmm0 - psllq $56,%xmm7 - pxor %xmm0,%xmm6 - psrlq $7,%xmm0 - pxor %xmm7,%xmm6 - psllq $7,%xmm7 - pxor %xmm0,%xmm6 - movdqa %xmm3,%xmm0 - pxor %xmm7,%xmm6 - movdqa %xmm3,%xmm7 - psrlq $6,%xmm0 - paddq %xmm6,%xmm4 - movdqa %xmm3,%xmm6 - psrlq $19,%xmm7 - psllq $3,%xmm6 - pxor %xmm7,%xmm0 - psrlq $42,%xmm7 - pxor %xmm6,%xmm0 - psllq $42,%xmm6 - pxor %xmm7,%xmm0 - movdqa 32(%edx),%xmm7 - pxor %xmm6,%xmm0 - movdqa 64(%ebp),%xmm6 - movq %mm4,%mm1 - paddq %xmm0,%xmm4 - movq -64(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,32(%esp) - paddq %xmm4,%xmm6 - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm0 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm0,(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 56(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 24(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm0,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm0,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 8(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm0,%mm2 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - pxor %mm7,%mm6 - movq 32(%esp),%mm5 - paddq %mm6,%mm2 - movq 40(%esp),%mm6 - movq %mm4,%mm1 - movq -56(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,24(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm2 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm2,56(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 48(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 16(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm2,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm2,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq (%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - psrlq $5,%mm5 
- pxor %mm6,%mm7 - pand %mm2,%mm0 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - pxor %mm7,%mm6 - movq 24(%esp),%mm5 - paddq %mm6,%mm0 - movq 32(%esp),%mm6 - movdqa %xmm6,-64(%edx) - movdqa %xmm2,%xmm0 - movdqa %xmm7,%xmm6 -.byte 102,15,58,15,253,8 - movdqa %xmm1,16(%edx) -.byte 102,15,58,15,193,8 - movdqa %xmm7,%xmm1 - psrlq $7,%xmm7 - paddq %xmm0,%xmm5 - movdqa %xmm1,%xmm0 - psrlq $1,%xmm1 - psllq $56,%xmm0 - pxor %xmm1,%xmm7 - psrlq $7,%xmm1 - pxor %xmm0,%xmm7 - psllq $7,%xmm0 - pxor %xmm1,%xmm7 - movdqa %xmm4,%xmm1 - pxor %xmm0,%xmm7 - movdqa %xmm4,%xmm0 - psrlq $6,%xmm1 - paddq %xmm7,%xmm5 - movdqa %xmm4,%xmm7 - psrlq $19,%xmm0 - psllq $3,%xmm7 - pxor %xmm0,%xmm1 - psrlq $42,%xmm0 - pxor %xmm7,%xmm1 - psllq $42,%xmm7 - pxor %xmm0,%xmm1 - movdqa 48(%edx),%xmm0 - pxor %xmm7,%xmm1 - movdqa 80(%ebp),%xmm7 - movq %mm4,%mm1 - paddq %xmm1,%xmm5 - movq -48(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,16(%esp) - paddq %xmm5,%xmm7 - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm0 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm0,48(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 40(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 8(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm0,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm0,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 56(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm0,%mm2 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - pxor %mm7,%mm6 - movq 16(%esp),%mm5 - paddq %mm6,%mm2 - movq 24(%esp),%mm6 - movq %mm4,%mm1 - movq -40(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,8(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm2 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm2,40(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - 
paddq 32(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq (%esp),%mm4 - paddq %mm7,%mm3 - movq %mm2,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm2,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 48(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm2,%mm0 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - pxor %mm7,%mm6 - movq 8(%esp),%mm5 - paddq %mm6,%mm0 - movq 16(%esp),%mm6 - movdqa %xmm7,-48(%edx) - movdqa %xmm3,%xmm1 - movdqa %xmm0,%xmm7 -.byte 102,15,58,15,198,8 - movdqa %xmm2,32(%edx) -.byte 102,15,58,15,202,8 - movdqa %xmm0,%xmm2 - psrlq $7,%xmm0 - paddq %xmm1,%xmm6 - movdqa %xmm2,%xmm1 - psrlq $1,%xmm2 - psllq $56,%xmm1 - pxor %xmm2,%xmm0 - psrlq $7,%xmm2 - pxor %xmm1,%xmm0 - psllq $7,%xmm1 - pxor %xmm2,%xmm0 - movdqa %xmm5,%xmm2 - pxor %xmm1,%xmm0 - movdqa %xmm5,%xmm1 - psrlq $6,%xmm2 - paddq %xmm0,%xmm6 - movdqa %xmm5,%xmm0 - psrlq $19,%xmm1 - psllq $3,%xmm0 - pxor %xmm1,%xmm2 - psrlq $42,%xmm1 - pxor %xmm0,%xmm2 - psllq $42,%xmm0 - pxor %xmm1,%xmm2 - movdqa (%edx),%xmm1 - pxor %xmm0,%xmm2 - movdqa 96(%ebp),%xmm0 - movq %mm4,%mm1 - paddq %xmm2,%xmm6 - movq -32(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,(%esp) - paddq %xmm6,%xmm0 - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm0 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm0,32(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 24(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 56(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm0,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm0,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 40(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm0,%mm2 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - pxor %mm7,%mm6 - movq (%esp),%mm5 - paddq %mm6,%mm2 - movq 8(%esp),%mm6 
- movq %mm4,%mm1 - movq -24(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,56(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm2 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm2,24(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 16(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 48(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm2,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm2,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 32(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm2,%mm0 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - pxor %mm7,%mm6 - movq 56(%esp),%mm5 - paddq %mm6,%mm0 - movq (%esp),%mm6 - movdqa %xmm0,-32(%edx) - movdqa %xmm4,%xmm2 - movdqa %xmm1,%xmm0 -.byte 102,15,58,15,207,8 - movdqa %xmm3,48(%edx) -.byte 102,15,58,15,211,8 - movdqa %xmm1,%xmm3 - psrlq $7,%xmm1 - paddq %xmm2,%xmm7 - movdqa %xmm3,%xmm2 - psrlq $1,%xmm3 - psllq $56,%xmm2 - pxor %xmm3,%xmm1 - psrlq $7,%xmm3 - pxor %xmm2,%xmm1 - psllq $7,%xmm2 - pxor %xmm3,%xmm1 - movdqa %xmm6,%xmm3 - pxor %xmm2,%xmm1 - movdqa %xmm6,%xmm2 - psrlq $6,%xmm3 - paddq %xmm1,%xmm7 - movdqa %xmm6,%xmm1 - psrlq $19,%xmm2 - psllq $3,%xmm1 - pxor %xmm2,%xmm3 - psrlq $42,%xmm2 - pxor %xmm1,%xmm3 - psllq $42,%xmm1 - pxor %xmm2,%xmm3 - movdqa 16(%edx),%xmm2 - pxor %xmm1,%xmm3 - movdqa 112(%ebp),%xmm1 - movq %mm4,%mm1 - paddq %xmm3,%xmm7 - movq -16(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,48(%esp) - paddq %xmm7,%xmm1 - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm0 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm0,16(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 8(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 40(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm0,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - 
movq %mm0,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 24(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm0,%mm2 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - pxor %mm7,%mm6 - movq 48(%esp),%mm5 - paddq %mm6,%mm2 - movq 56(%esp),%mm6 - movq %mm4,%mm1 - movq -8(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,40(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm2 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm2,8(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq (%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 32(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm2,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm2,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 16(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm2,%mm0 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - pxor %mm7,%mm6 - movq 40(%esp),%mm5 - paddq %mm6,%mm0 - movq 48(%esp),%mm6 - movdqa %xmm1,-16(%edx) - leal 128(%ebp),%ebp - decl %ecx - jnz L00800_47_ssse3 - movdqa (%ebp),%xmm1 - leal -640(%ebp),%ebp - movdqu (%ebx),%xmm0 -.byte 102,15,56,0,193 - movdqa (%ebp),%xmm3 - movdqa %xmm1,%xmm2 - movdqu 16(%ebx),%xmm1 - paddq %xmm0,%xmm3 -.byte 102,15,56,0,202 - movq %mm4,%mm1 - movq -128(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,32(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm0 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm0,(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 56(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 24(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm0,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm0,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 8(%esp),%mm1 - psrlq $6,%mm5 - pxor 
%mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm0,%mm2 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - pxor %mm7,%mm6 - movq 32(%esp),%mm5 - paddq %mm6,%mm2 - movq 40(%esp),%mm6 - movq %mm4,%mm1 - movq -120(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,24(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm2 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm2,56(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 48(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 16(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm2,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm2,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq (%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm2,%mm0 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - pxor %mm7,%mm6 - movq 24(%esp),%mm5 - paddq %mm6,%mm0 - movq 32(%esp),%mm6 - movdqa %xmm3,-128(%edx) - movdqa 16(%ebp),%xmm4 - movdqa %xmm2,%xmm3 - movdqu 32(%ebx),%xmm2 - paddq %xmm1,%xmm4 -.byte 102,15,56,0,211 - movq %mm4,%mm1 - movq -112(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,16(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm0 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm0,48(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 40(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 8(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm0,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm0,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 56(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm0,%mm2 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - pxor %mm7,%mm6 - movq 16(%esp),%mm5 - paddq %mm6,%mm2 - movq 
24(%esp),%mm6 - movq %mm4,%mm1 - movq -104(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,8(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm2 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm2,40(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 32(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq (%esp),%mm4 - paddq %mm7,%mm3 - movq %mm2,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm2,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 48(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm2,%mm0 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - pxor %mm7,%mm6 - movq 8(%esp),%mm5 - paddq %mm6,%mm0 - movq 16(%esp),%mm6 - movdqa %xmm4,-112(%edx) - movdqa 32(%ebp),%xmm5 - movdqa %xmm3,%xmm4 - movdqu 48(%ebx),%xmm3 - paddq %xmm2,%xmm5 -.byte 102,15,56,0,220 - movq %mm4,%mm1 - movq -96(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm0 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm0,32(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 24(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 56(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm0,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm0,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 40(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm0,%mm2 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - pxor %mm7,%mm6 - movq (%esp),%mm5 - paddq %mm6,%mm2 - movq 8(%esp),%mm6 - movq %mm4,%mm1 - movq -88(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,56(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm2 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq 
$23,%mm4 - pxor %mm1,%mm3 - movq %mm2,24(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 16(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 48(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm2,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm2,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 32(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm2,%mm0 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - pxor %mm7,%mm6 - movq 56(%esp),%mm5 - paddq %mm6,%mm0 - movq (%esp),%mm6 - movdqa %xmm5,-96(%edx) - movdqa 48(%ebp),%xmm6 - movdqa %xmm4,%xmm5 - movdqu 64(%ebx),%xmm4 - paddq %xmm3,%xmm6 -.byte 102,15,56,0,229 - movq %mm4,%mm1 - movq -80(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,48(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm0 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm0,16(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 8(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 40(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm0,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm0,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 24(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm0,%mm2 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - pxor %mm7,%mm6 - movq 48(%esp),%mm5 - paddq %mm6,%mm2 - movq 56(%esp),%mm6 - movq %mm4,%mm1 - movq -72(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,40(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm2 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm2,8(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq (%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 32(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm2,%mm5 - psrlq 
$28,%mm5 - paddq %mm3,%mm4 - movq %mm2,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 16(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm2,%mm0 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - pxor %mm7,%mm6 - movq 40(%esp),%mm5 - paddq %mm6,%mm0 - movq 48(%esp),%mm6 - movdqa %xmm6,-80(%edx) - movdqa 64(%ebp),%xmm7 - movdqa %xmm5,%xmm6 - movdqu 80(%ebx),%xmm5 - paddq %xmm4,%xmm7 -.byte 102,15,56,0,238 - movq %mm4,%mm1 - movq -64(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,32(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm0 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm0,(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 56(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 24(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm0,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm0,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 8(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm0,%mm2 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - pxor %mm7,%mm6 - movq 32(%esp),%mm5 - paddq %mm6,%mm2 - movq 40(%esp),%mm6 - movq %mm4,%mm1 - movq -56(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,24(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm2 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm2,56(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 48(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 16(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm2,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm2,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq (%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm2,%mm0 - 
psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - pxor %mm7,%mm6 - movq 24(%esp),%mm5 - paddq %mm6,%mm0 - movq 32(%esp),%mm6 - movdqa %xmm7,-64(%edx) - movdqa %xmm0,(%edx) - movdqa 80(%ebp),%xmm0 - movdqa %xmm6,%xmm7 - movdqu 96(%ebx),%xmm6 - paddq %xmm5,%xmm0 -.byte 102,15,56,0,247 - movq %mm4,%mm1 - movq -48(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,16(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm0 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm0,48(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 40(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 8(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm0,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm0,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 56(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm0,%mm2 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - pxor %mm7,%mm6 - movq 16(%esp),%mm5 - paddq %mm6,%mm2 - movq 24(%esp),%mm6 - movq %mm4,%mm1 - movq -40(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,8(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm2 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm2,40(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 32(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq (%esp),%mm4 - paddq %mm7,%mm3 - movq %mm2,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm2,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 48(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm2,%mm0 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - pxor %mm7,%mm6 - movq 8(%esp),%mm5 - paddq %mm6,%mm0 - movq 16(%esp),%mm6 - movdqa %xmm0,-48(%edx) - movdqa %xmm1,16(%edx) - movdqa 96(%ebp),%xmm1 - movdqa 
%xmm7,%xmm0 - movdqu 112(%ebx),%xmm7 - paddq %xmm6,%xmm1 -.byte 102,15,56,0,248 - movq %mm4,%mm1 - movq -32(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm0 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm0,32(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 24(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 56(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm0,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm0,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 40(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm0,%mm2 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - pxor %mm7,%mm6 - movq (%esp),%mm5 - paddq %mm6,%mm2 - movq 8(%esp),%mm6 - movq %mm4,%mm1 - movq -24(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,56(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm2 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm2,24(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 16(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 48(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm2,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm2,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 32(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm2,%mm0 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - pxor %mm7,%mm6 - movq 56(%esp),%mm5 - paddq %mm6,%mm0 - movq (%esp),%mm6 - movdqa %xmm1,-32(%edx) - movdqa %xmm2,32(%edx) - movdqa 112(%ebp),%xmm2 - movdqa (%edx),%xmm0 - paddq %xmm7,%xmm2 - movq %mm4,%mm1 - movq -16(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,48(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm0 - movq %mm1,%mm3 - psrlq $4,%mm1 - 
pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm0,16(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 8(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 40(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm0,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm0,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 24(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm0,%mm2 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - pxor %mm7,%mm6 - movq 48(%esp),%mm5 - paddq %mm6,%mm2 - movq 56(%esp),%mm6 - movq %mm4,%mm1 - movq -8(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,40(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm2 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm2,8(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq (%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 32(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm2,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm2,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 16(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm2,%mm0 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - pxor %mm7,%mm6 - movq 40(%esp),%mm5 - paddq %mm6,%mm0 - movq 48(%esp),%mm6 - movdqa %xmm2,-16(%edx) - movq 8(%esp),%mm1 - paddq %mm3,%mm0 - movq 24(%esp),%mm3 - movq 56(%esp),%mm7 - pxor %mm1,%mm2 - paddq (%esi),%mm0 - paddq 8(%esi),%mm1 - paddq 16(%esi),%mm2 - paddq 24(%esi),%mm3 - paddq 32(%esi),%mm4 - paddq 40(%esi),%mm5 - paddq 48(%esi),%mm6 - paddq 56(%esi),%mm7 - movq %mm0,(%esi) - movq %mm1,8(%esi) - movq %mm2,16(%esi) - movq %mm3,24(%esi) - movq %mm4,32(%esi) - movq %mm5,40(%esi) - movq %mm6,48(%esi) - movq %mm7,56(%esi) - cmpl %eax,%edi - jb L007loop_ssse3 - movl 76(%edx),%esp - emms - popl %edi - popl %esi - popl 
%ebx - popl %ebp - ret -.align 4,0x90 -L002loop_x86: - movl (%edi),%eax - movl 4(%edi),%ebx - movl 8(%edi),%ecx - movl 12(%edi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - movl 16(%edi),%eax - movl 20(%edi),%ebx - movl 24(%edi),%ecx - movl 28(%edi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - movl 32(%edi),%eax - movl 36(%edi),%ebx - movl 40(%edi),%ecx - movl 44(%edi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - movl 48(%edi),%eax - movl 52(%edi),%ebx - movl 56(%edi),%ecx - movl 60(%edi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - movl 64(%edi),%eax - movl 68(%edi),%ebx - movl 72(%edi),%ecx - movl 76(%edi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - movl 80(%edi),%eax - movl 84(%edi),%ebx - movl 88(%edi),%ecx - movl 92(%edi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - movl 96(%edi),%eax - movl 100(%edi),%ebx - movl 104(%edi),%ecx - movl 108(%edi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - movl 112(%edi),%eax - movl 116(%edi),%ebx - movl 120(%edi),%ecx - movl 124(%edi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - addl $128,%edi - subl $72,%esp - movl %edi,204(%esp) - leal 8(%esp),%edi - movl $16,%ecx -.long 2784229001 -.align 4,0x90 -L00900_15_x86: - movl 40(%esp),%ecx - movl 44(%esp),%edx - movl %ecx,%esi - shrl $9,%ecx - movl %edx,%edi - shrl $9,%edx - movl %ecx,%ebx - shll $14,%esi - movl %edx,%eax - shll $14,%edi - xorl %esi,%ebx - shrl $5,%ecx - xorl %edi,%eax - shrl $5,%edx - xorl %ecx,%eax - shll $4,%esi - xorl %edx,%ebx - shll $4,%edi 
- xorl %esi,%ebx - shrl $4,%ecx - xorl %edi,%eax - shrl $4,%edx - xorl %ecx,%eax - shll $5,%esi - xorl %edx,%ebx - shll $5,%edi - xorl %esi,%eax - xorl %edi,%ebx - movl 48(%esp),%ecx - movl 52(%esp),%edx - movl 56(%esp),%esi - movl 60(%esp),%edi - addl 64(%esp),%eax - adcl 68(%esp),%ebx - xorl %esi,%ecx - xorl %edi,%edx - andl 40(%esp),%ecx - andl 44(%esp),%edx - addl 192(%esp),%eax - adcl 196(%esp),%ebx - xorl %esi,%ecx - xorl %edi,%edx - movl (%ebp),%esi - movl 4(%ebp),%edi - addl %ecx,%eax - adcl %edx,%ebx - movl 32(%esp),%ecx - movl 36(%esp),%edx - addl %esi,%eax - adcl %edi,%ebx - movl %eax,(%esp) - movl %ebx,4(%esp) - addl %ecx,%eax - adcl %edx,%ebx - movl 8(%esp),%ecx - movl 12(%esp),%edx - movl %eax,32(%esp) - movl %ebx,36(%esp) - movl %ecx,%esi - shrl $2,%ecx - movl %edx,%edi - shrl $2,%edx - movl %ecx,%ebx - shll $4,%esi - movl %edx,%eax - shll $4,%edi - xorl %esi,%ebx - shrl $5,%ecx - xorl %edi,%eax - shrl $5,%edx - xorl %ecx,%ebx - shll $21,%esi - xorl %edx,%eax - shll $21,%edi - xorl %esi,%eax - shrl $21,%ecx - xorl %edi,%ebx - shrl $21,%edx - xorl %ecx,%eax - shll $5,%esi - xorl %edx,%ebx - shll $5,%edi - xorl %esi,%eax - xorl %edi,%ebx - movl 8(%esp),%ecx - movl 12(%esp),%edx - movl 16(%esp),%esi - movl 20(%esp),%edi - addl (%esp),%eax - adcl 4(%esp),%ebx - orl %esi,%ecx - orl %edi,%edx - andl 24(%esp),%ecx - andl 28(%esp),%edx - andl 8(%esp),%esi - andl 12(%esp),%edi - orl %esi,%ecx - orl %edi,%edx - addl %ecx,%eax - adcl %edx,%ebx - movl %eax,(%esp) - movl %ebx,4(%esp) - movb (%ebp),%dl - subl $8,%esp - leal 8(%ebp),%ebp - cmpb $148,%dl - jne L00900_15_x86 -.align 4,0x90 -L01016_79_x86: - movl 312(%esp),%ecx - movl 316(%esp),%edx - movl %ecx,%esi - shrl $1,%ecx - movl %edx,%edi - shrl $1,%edx - movl %ecx,%eax - shll $24,%esi - movl %edx,%ebx - shll $24,%edi - xorl %esi,%ebx - shrl $6,%ecx - xorl %edi,%eax - shrl $6,%edx - xorl %ecx,%eax - shll $7,%esi - xorl %edx,%ebx - shll $1,%edi - xorl %esi,%ebx - shrl $1,%ecx - xorl %edi,%eax - shrl $1,%edx - 
xorl %ecx,%eax - shll $6,%edi - xorl %edx,%ebx - xorl %edi,%eax - movl %eax,(%esp) - movl %ebx,4(%esp) - movl 208(%esp),%ecx - movl 212(%esp),%edx - movl %ecx,%esi - shrl $6,%ecx - movl %edx,%edi - shrl $6,%edx - movl %ecx,%eax - shll $3,%esi - movl %edx,%ebx - shll $3,%edi - xorl %esi,%eax - shrl $13,%ecx - xorl %edi,%ebx - shrl $13,%edx - xorl %ecx,%eax - shll $10,%esi - xorl %edx,%ebx - shll $10,%edi - xorl %esi,%ebx - shrl $10,%ecx - xorl %edi,%eax - shrl $10,%edx - xorl %ecx,%ebx - shll $13,%edi - xorl %edx,%eax - xorl %edi,%eax - movl 320(%esp),%ecx - movl 324(%esp),%edx - addl (%esp),%eax - adcl 4(%esp),%ebx - movl 248(%esp),%esi - movl 252(%esp),%edi - addl %ecx,%eax - adcl %edx,%ebx - addl %esi,%eax - adcl %edi,%ebx - movl %eax,192(%esp) - movl %ebx,196(%esp) - movl 40(%esp),%ecx - movl 44(%esp),%edx - movl %ecx,%esi - shrl $9,%ecx - movl %edx,%edi - shrl $9,%edx - movl %ecx,%ebx - shll $14,%esi - movl %edx,%eax - shll $14,%edi - xorl %esi,%ebx - shrl $5,%ecx - xorl %edi,%eax - shrl $5,%edx - xorl %ecx,%eax - shll $4,%esi - xorl %edx,%ebx - shll $4,%edi - xorl %esi,%ebx - shrl $4,%ecx - xorl %edi,%eax - shrl $4,%edx - xorl %ecx,%eax - shll $5,%esi - xorl %edx,%ebx - shll $5,%edi - xorl %esi,%eax - xorl %edi,%ebx - movl 48(%esp),%ecx - movl 52(%esp),%edx - movl 56(%esp),%esi - movl 60(%esp),%edi - addl 64(%esp),%eax - adcl 68(%esp),%ebx - xorl %esi,%ecx - xorl %edi,%edx - andl 40(%esp),%ecx - andl 44(%esp),%edx - addl 192(%esp),%eax - adcl 196(%esp),%ebx - xorl %esi,%ecx - xorl %edi,%edx - movl (%ebp),%esi - movl 4(%ebp),%edi - addl %ecx,%eax - adcl %edx,%ebx - movl 32(%esp),%ecx - movl 36(%esp),%edx - addl %esi,%eax - adcl %edi,%ebx - movl %eax,(%esp) - movl %ebx,4(%esp) - addl %ecx,%eax - adcl %edx,%ebx - movl 8(%esp),%ecx - movl 12(%esp),%edx - movl %eax,32(%esp) - movl %ebx,36(%esp) - movl %ecx,%esi - shrl $2,%ecx - movl %edx,%edi - shrl $2,%edx - movl %ecx,%ebx - shll $4,%esi - movl %edx,%eax - shll $4,%edi - xorl %esi,%ebx - shrl $5,%ecx - xorl 
%edi,%eax - shrl $5,%edx - xorl %ecx,%ebx - shll $21,%esi - xorl %edx,%eax - shll $21,%edi - xorl %esi,%eax - shrl $21,%ecx - xorl %edi,%ebx - shrl $21,%edx - xorl %ecx,%eax - shll $5,%esi - xorl %edx,%ebx - shll $5,%edi - xorl %esi,%eax - xorl %edi,%ebx - movl 8(%esp),%ecx - movl 12(%esp),%edx - movl 16(%esp),%esi - movl 20(%esp),%edi - addl (%esp),%eax - adcl 4(%esp),%ebx - orl %esi,%ecx - orl %edi,%edx - andl 24(%esp),%ecx - andl 28(%esp),%edx - andl 8(%esp),%esi - andl 12(%esp),%edi - orl %esi,%ecx - orl %edi,%edx - addl %ecx,%eax - adcl %edx,%ebx - movl %eax,(%esp) - movl %ebx,4(%esp) - movb (%ebp),%dl - subl $8,%esp - leal 8(%ebp),%ebp - cmpb $23,%dl - jne L01016_79_x86 - movl 840(%esp),%esi - movl 844(%esp),%edi - movl (%esi),%eax - movl 4(%esi),%ebx - movl 8(%esi),%ecx - movl 12(%esi),%edx - addl 8(%esp),%eax - adcl 12(%esp),%ebx - movl %eax,(%esi) - movl %ebx,4(%esi) - addl 16(%esp),%ecx - adcl 20(%esp),%edx - movl %ecx,8(%esi) - movl %edx,12(%esi) - movl 16(%esi),%eax - movl 20(%esi),%ebx - movl 24(%esi),%ecx - movl 28(%esi),%edx - addl 24(%esp),%eax - adcl 28(%esp),%ebx - movl %eax,16(%esi) - movl %ebx,20(%esi) - addl 32(%esp),%ecx - adcl 36(%esp),%edx - movl %ecx,24(%esi) - movl %edx,28(%esi) - movl 32(%esi),%eax - movl 36(%esi),%ebx - movl 40(%esi),%ecx - movl 44(%esi),%edx - addl 40(%esp),%eax - adcl 44(%esp),%ebx - movl %eax,32(%esi) - movl %ebx,36(%esi) - addl 48(%esp),%ecx - adcl 52(%esp),%edx - movl %ecx,40(%esi) - movl %edx,44(%esi) - movl 48(%esi),%eax - movl 52(%esi),%ebx - movl 56(%esi),%ecx - movl 60(%esi),%edx - addl 56(%esp),%eax - adcl 60(%esp),%ebx - movl %eax,48(%esi) - movl %ebx,52(%esi) - addl 64(%esp),%ecx - adcl 68(%esp),%edx - movl %ecx,56(%esi) - movl %edx,60(%esi) - addl $840,%esp - subl $640,%ebp - cmpl 8(%esp),%edi - jb L002loop_x86 - movl 12(%esp),%esp - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.align 6,0x90 -L001K512: -.long 3609767458,1116352408 -.long 602891725,1899447441 -.long 3964484399,3049323471 -.long 
2173295548,3921009573 -.long 4081628472,961987163 -.long 3053834265,1508970993 -.long 2937671579,2453635748 -.long 3664609560,2870763221 -.long 2734883394,3624381080 -.long 1164996542,310598401 -.long 1323610764,607225278 -.long 3590304994,1426881987 -.long 4068182383,1925078388 -.long 991336113,2162078206 -.long 633803317,2614888103 -.long 3479774868,3248222580 -.long 2666613458,3835390401 -.long 944711139,4022224774 -.long 2341262773,264347078 -.long 2007800933,604807628 -.long 1495990901,770255983 -.long 1856431235,1249150122 -.long 3175218132,1555081692 -.long 2198950837,1996064986 -.long 3999719339,2554220882 -.long 766784016,2821834349 -.long 2566594879,2952996808 -.long 3203337956,3210313671 -.long 1034457026,3336571891 -.long 2466948901,3584528711 -.long 3758326383,113926993 -.long 168717936,338241895 -.long 1188179964,666307205 -.long 1546045734,773529912 -.long 1522805485,1294757372 -.long 2643833823,1396182291 -.long 2343527390,1695183700 -.long 1014477480,1986661051 -.long 1206759142,2177026350 -.long 344077627,2456956037 -.long 1290863460,2730485921 -.long 3158454273,2820302411 -.long 3505952657,3259730800 -.long 106217008,3345764771 -.long 3606008344,3516065817 -.long 1432725776,3600352804 -.long 1467031594,4094571909 -.long 851169720,275423344 -.long 3100823752,430227734 -.long 1363258195,506948616 -.long 3750685593,659060556 -.long 3785050280,883997877 -.long 3318307427,958139571 -.long 3812723403,1322822218 -.long 2003034995,1537002063 -.long 3602036899,1747873779 -.long 1575990012,1955562222 -.long 1125592928,2024104815 -.long 2716904306,2227730452 -.long 442776044,2361852424 -.long 593698344,2428436474 -.long 3733110249,2756734187 -.long 2999351573,3204031479 -.long 3815920427,3329325298 -.long 3928383900,3391569614 -.long 566280711,3515267271 -.long 3454069534,3940187606 -.long 4000239992,4118630271 -.long 1914138554,116418474 -.long 2731055270,174292421 -.long 3203993006,289380356 -.long 320620315,460393269 -.long 587496836,685471733 -.long 
1086792851,852142971 -.long 365543100,1017036298 -.long 2618297676,1126000580 -.long 3409855158,1288033470 -.long 4234509866,1501505948 -.long 987167468,1607167915 -.long 1246189591,1816402316 -.long 67438087,66051 -.long 202182159,134810123 -.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97 -.byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32 -.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97 -.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 -.byte 62,0 -.section __IMPORT,__pointers,non_lazy_symbol_pointers -L_OPENSSL_ia32cap_P$non_lazy_ptr: -.indirect_symbol _OPENSSL_ia32cap_P -.long 0 -#endif diff --git a/third_party/boringssl/apple-x86/crypto/fipsmodule/vpaes-x86.S b/third_party/boringssl/apple-x86/crypto/fipsmodule/vpaes-x86.S deleted file mode 100644 index 00c0190d..00000000 --- a/third_party/boringssl/apple-x86/crypto/fipsmodule/vpaes-x86.S +++ /dev/null @@ -1,681 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#if defined(__i386__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text -#ifdef BORINGSSL_DISPATCH_TEST -#endif -.align 6,0x90 -L_vpaes_consts: -.long 218628480,235210255,168496130,67568393 -.long 252381056,17041926,33884169,51187212 -.long 252645135,252645135,252645135,252645135 -.long 1512730624,3266504856,1377990664,3401244816 -.long 830229760,1275146365,2969422977,3447763452 -.long 3411033600,2979783055,338359620,2782886510 -.long 4209124096,907596821,221174255,1006095553 -.long 191964160,3799684038,3164090317,1589111125 -.long 182528256,1777043520,2877432650,3265356744 -.long 1874708224,3503451415,3305285752,363511674 -.long 1606117888,3487855781,1093350906,2384367825 -.long 197121,67569157,134941193,202313229 -.long 67569157,134941193,202313229,197121 -.long 134941193,202313229,197121,67569157 -.long 202313229,197121,67569157,134941193 -.long 33619971,100992007,168364043,235736079 -.long 235736079,33619971,100992007,168364043 -.long 168364043,235736079,33619971,100992007 -.long 100992007,168364043,235736079,33619971 -.long 50462976,117835012,185207048,252579084 -.long 252314880,51251460,117574920,184942860 -.long 184682752,252054788,50987272,118359308 -.long 118099200,185467140,251790600,50727180 -.long 2946363062,528716217,1300004225,1881839624 -.long 1532713819,1532713819,1532713819,1532713819 -.long 3602276352,4288629033,3737020424,4153884961 -.long 1354558464,32357713,2958822624,3775749553 -.long 1201988352,132424512,1572796698,503232858 -.long 2213177600,1597421020,4103937655,675398315 -.long 2749646592,4273543773,1511898873,121693092 -.long 3040248576,1103263732,2871565598,1608280554 -.long 2236667136,2588920351,482954393,64377734 -.long 3069987328,291237287,2117370568,3650299247 -.long 533321216,3573750986,2572112006,1401264716 -.long 1339849704,2721158661,548607111,3445553514 -.long 2128193280,3054596040,2183486460,1257083700 -.long 655635200,1165381986,3923443150,2344132524 -.long 190078720,256924420,290342170,357187870 -.long 
1610966272,2263057382,4103205268,309794674 -.long 2592527872,2233205587,1335446729,3402964816 -.long 3973531904,3225098121,3002836325,1918774430 -.long 3870401024,2102906079,2284471353,4117666579 -.long 617007872,1021508343,366931923,691083277 -.long 2528395776,3491914898,2968704004,1613121270 -.long 3445188352,3247741094,844474987,4093578302 -.long 651481088,1190302358,1689581232,574775300 -.long 4289380608,206939853,2555985458,2489840491 -.long 2130264064,327674451,3566485037,3349835193 -.long 2470714624,316102159,3636825756,3393945945 -.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105 -.byte 111,110,32,65,69,83,32,102,111,114,32,120,56,54,47,83 -.byte 83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117 -.byte 114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105 -.byte 118,101,114,115,105,116,121,41,0 -.align 6,0x90 -.private_extern __vpaes_preheat -.align 4 -__vpaes_preheat: - addl (%esp),%ebp - movdqa -48(%ebp),%xmm7 - movdqa -16(%ebp),%xmm6 - ret -.private_extern __vpaes_encrypt_core -.align 4 -__vpaes_encrypt_core: - movl $16,%ecx - movl 240(%edx),%eax - movdqa %xmm6,%xmm1 - movdqa (%ebp),%xmm2 - pandn %xmm0,%xmm1 - pand %xmm6,%xmm0 - movdqu (%edx),%xmm5 -.byte 102,15,56,0,208 - movdqa 16(%ebp),%xmm0 - pxor %xmm5,%xmm2 - psrld $4,%xmm1 - addl $16,%edx -.byte 102,15,56,0,193 - leal 192(%ebp),%ebx - pxor %xmm2,%xmm0 - jmp L000enc_entry -.align 4,0x90 -L001enc_loop: - movdqa 32(%ebp),%xmm4 - movdqa 48(%ebp),%xmm0 -.byte 102,15,56,0,226 -.byte 102,15,56,0,195 - pxor %xmm5,%xmm4 - movdqa 64(%ebp),%xmm5 - pxor %xmm4,%xmm0 - movdqa -64(%ebx,%ecx,1),%xmm1 -.byte 102,15,56,0,234 - movdqa 80(%ebp),%xmm2 - movdqa (%ebx,%ecx,1),%xmm4 -.byte 102,15,56,0,211 - movdqa %xmm0,%xmm3 - pxor %xmm5,%xmm2 -.byte 102,15,56,0,193 - addl $16,%edx - pxor %xmm2,%xmm0 -.byte 102,15,56,0,220 - addl $16,%ecx - pxor %xmm0,%xmm3 -.byte 102,15,56,0,193 - andl $48,%ecx - subl $1,%eax - pxor %xmm3,%xmm0 -L000enc_entry: - movdqa %xmm6,%xmm1 - movdqa -32(%ebp),%xmm5 - pandn 
%xmm0,%xmm1 - psrld $4,%xmm1 - pand %xmm6,%xmm0 -.byte 102,15,56,0,232 - movdqa %xmm7,%xmm3 - pxor %xmm1,%xmm0 -.byte 102,15,56,0,217 - movdqa %xmm7,%xmm4 - pxor %xmm5,%xmm3 -.byte 102,15,56,0,224 - movdqa %xmm7,%xmm2 - pxor %xmm5,%xmm4 -.byte 102,15,56,0,211 - movdqa %xmm7,%xmm3 - pxor %xmm0,%xmm2 -.byte 102,15,56,0,220 - movdqu (%edx),%xmm5 - pxor %xmm1,%xmm3 - jnz L001enc_loop - movdqa 96(%ebp),%xmm4 - movdqa 112(%ebp),%xmm0 -.byte 102,15,56,0,226 - pxor %xmm5,%xmm4 -.byte 102,15,56,0,195 - movdqa 64(%ebx,%ecx,1),%xmm1 - pxor %xmm4,%xmm0 -.byte 102,15,56,0,193 - ret -.private_extern __vpaes_decrypt_core -.align 4 -__vpaes_decrypt_core: - leal 608(%ebp),%ebx - movl 240(%edx),%eax - movdqa %xmm6,%xmm1 - movdqa -64(%ebx),%xmm2 - pandn %xmm0,%xmm1 - movl %eax,%ecx - psrld $4,%xmm1 - movdqu (%edx),%xmm5 - shll $4,%ecx - pand %xmm6,%xmm0 -.byte 102,15,56,0,208 - movdqa -48(%ebx),%xmm0 - xorl $48,%ecx -.byte 102,15,56,0,193 - andl $48,%ecx - pxor %xmm5,%xmm2 - movdqa 176(%ebp),%xmm5 - pxor %xmm2,%xmm0 - addl $16,%edx - leal -352(%ebx,%ecx,1),%ecx - jmp L002dec_entry -.align 4,0x90 -L003dec_loop: - movdqa -32(%ebx),%xmm4 - movdqa -16(%ebx),%xmm1 -.byte 102,15,56,0,226 -.byte 102,15,56,0,203 - pxor %xmm4,%xmm0 - movdqa (%ebx),%xmm4 - pxor %xmm1,%xmm0 - movdqa 16(%ebx),%xmm1 -.byte 102,15,56,0,226 -.byte 102,15,56,0,197 -.byte 102,15,56,0,203 - pxor %xmm4,%xmm0 - movdqa 32(%ebx),%xmm4 - pxor %xmm1,%xmm0 - movdqa 48(%ebx),%xmm1 -.byte 102,15,56,0,226 -.byte 102,15,56,0,197 -.byte 102,15,56,0,203 - pxor %xmm4,%xmm0 - movdqa 64(%ebx),%xmm4 - pxor %xmm1,%xmm0 - movdqa 80(%ebx),%xmm1 -.byte 102,15,56,0,226 -.byte 102,15,56,0,197 -.byte 102,15,56,0,203 - pxor %xmm4,%xmm0 - addl $16,%edx -.byte 102,15,58,15,237,12 - pxor %xmm1,%xmm0 - subl $1,%eax -L002dec_entry: - movdqa %xmm6,%xmm1 - movdqa -32(%ebp),%xmm2 - pandn %xmm0,%xmm1 - pand %xmm6,%xmm0 - psrld $4,%xmm1 -.byte 102,15,56,0,208 - movdqa %xmm7,%xmm3 - pxor %xmm1,%xmm0 -.byte 102,15,56,0,217 - movdqa %xmm7,%xmm4 - pxor 
%xmm2,%xmm3 -.byte 102,15,56,0,224 - pxor %xmm2,%xmm4 - movdqa %xmm7,%xmm2 -.byte 102,15,56,0,211 - movdqa %xmm7,%xmm3 - pxor %xmm0,%xmm2 -.byte 102,15,56,0,220 - movdqu (%edx),%xmm0 - pxor %xmm1,%xmm3 - jnz L003dec_loop - movdqa 96(%ebx),%xmm4 -.byte 102,15,56,0,226 - pxor %xmm0,%xmm4 - movdqa 112(%ebx),%xmm0 - movdqa (%ecx),%xmm2 -.byte 102,15,56,0,195 - pxor %xmm4,%xmm0 -.byte 102,15,56,0,194 - ret -.private_extern __vpaes_schedule_core -.align 4 -__vpaes_schedule_core: - addl (%esp),%ebp - movdqu (%esi),%xmm0 - movdqa 320(%ebp),%xmm2 - movdqa %xmm0,%xmm3 - leal (%ebp),%ebx - movdqa %xmm2,4(%esp) - call __vpaes_schedule_transform - movdqa %xmm0,%xmm7 - testl %edi,%edi - jnz L004schedule_am_decrypting - movdqu %xmm0,(%edx) - jmp L005schedule_go -L004schedule_am_decrypting: - movdqa 256(%ebp,%ecx,1),%xmm1 -.byte 102,15,56,0,217 - movdqu %xmm3,(%edx) - xorl $48,%ecx -L005schedule_go: - cmpl $192,%eax - ja L006schedule_256 - je L007schedule_192 -L008schedule_128: - movl $10,%eax -L009loop_schedule_128: - call __vpaes_schedule_round - decl %eax - jz L010schedule_mangle_last - call __vpaes_schedule_mangle - jmp L009loop_schedule_128 -.align 4,0x90 -L007schedule_192: - movdqu 8(%esi),%xmm0 - call __vpaes_schedule_transform - movdqa %xmm0,%xmm6 - pxor %xmm4,%xmm4 - movhlps %xmm4,%xmm6 - movl $4,%eax -L011loop_schedule_192: - call __vpaes_schedule_round -.byte 102,15,58,15,198,8 - call __vpaes_schedule_mangle - call __vpaes_schedule_192_smear - call __vpaes_schedule_mangle - call __vpaes_schedule_round - decl %eax - jz L010schedule_mangle_last - call __vpaes_schedule_mangle - call __vpaes_schedule_192_smear - jmp L011loop_schedule_192 -.align 4,0x90 -L006schedule_256: - movdqu 16(%esi),%xmm0 - call __vpaes_schedule_transform - movl $7,%eax -L012loop_schedule_256: - call __vpaes_schedule_mangle - movdqa %xmm0,%xmm6 - call __vpaes_schedule_round - decl %eax - jz L010schedule_mangle_last - call __vpaes_schedule_mangle - pshufd $255,%xmm0,%xmm0 - movdqa %xmm7,20(%esp) - 
movdqa %xmm6,%xmm7 - call L_vpaes_schedule_low_round - movdqa 20(%esp),%xmm7 - jmp L012loop_schedule_256 -.align 4,0x90 -L010schedule_mangle_last: - leal 384(%ebp),%ebx - testl %edi,%edi - jnz L013schedule_mangle_last_dec - movdqa 256(%ebp,%ecx,1),%xmm1 -.byte 102,15,56,0,193 - leal 352(%ebp),%ebx - addl $32,%edx -L013schedule_mangle_last_dec: - addl $-16,%edx - pxor 336(%ebp),%xmm0 - call __vpaes_schedule_transform - movdqu %xmm0,(%edx) - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - ret -.private_extern __vpaes_schedule_192_smear -.align 4 -__vpaes_schedule_192_smear: - pshufd $128,%xmm6,%xmm1 - pshufd $254,%xmm7,%xmm0 - pxor %xmm1,%xmm6 - pxor %xmm1,%xmm1 - pxor %xmm0,%xmm6 - movdqa %xmm6,%xmm0 - movhlps %xmm1,%xmm6 - ret -.private_extern __vpaes_schedule_round -.align 4 -__vpaes_schedule_round: - movdqa 8(%esp),%xmm2 - pxor %xmm1,%xmm1 -.byte 102,15,58,15,202,15 -.byte 102,15,58,15,210,15 - pxor %xmm1,%xmm7 - pshufd $255,%xmm0,%xmm0 -.byte 102,15,58,15,192,1 - movdqa %xmm2,8(%esp) -L_vpaes_schedule_low_round: - movdqa %xmm7,%xmm1 - pslldq $4,%xmm7 - pxor %xmm1,%xmm7 - movdqa %xmm7,%xmm1 - pslldq $8,%xmm7 - pxor %xmm1,%xmm7 - pxor 336(%ebp),%xmm7 - movdqa -16(%ebp),%xmm4 - movdqa -48(%ebp),%xmm5 - movdqa %xmm4,%xmm1 - pandn %xmm0,%xmm1 - psrld $4,%xmm1 - pand %xmm4,%xmm0 - movdqa -32(%ebp),%xmm2 -.byte 102,15,56,0,208 - pxor %xmm1,%xmm0 - movdqa %xmm5,%xmm3 -.byte 102,15,56,0,217 - pxor %xmm2,%xmm3 - movdqa %xmm5,%xmm4 -.byte 102,15,56,0,224 - pxor %xmm2,%xmm4 - movdqa %xmm5,%xmm2 -.byte 102,15,56,0,211 - pxor %xmm0,%xmm2 - movdqa %xmm5,%xmm3 -.byte 102,15,56,0,220 - pxor %xmm1,%xmm3 - movdqa 32(%ebp),%xmm4 -.byte 102,15,56,0,226 - movdqa 48(%ebp),%xmm0 -.byte 102,15,56,0,195 - pxor %xmm4,%xmm0 - pxor %xmm7,%xmm0 - movdqa %xmm0,%xmm7 - ret -.private_extern __vpaes_schedule_transform -.align 4 -__vpaes_schedule_transform: - movdqa -16(%ebp),%xmm2 - movdqa 
%xmm2,%xmm1 - pandn %xmm0,%xmm1 - psrld $4,%xmm1 - pand %xmm2,%xmm0 - movdqa (%ebx),%xmm2 -.byte 102,15,56,0,208 - movdqa 16(%ebx),%xmm0 -.byte 102,15,56,0,193 - pxor %xmm2,%xmm0 - ret -.private_extern __vpaes_schedule_mangle -.align 4 -__vpaes_schedule_mangle: - movdqa %xmm0,%xmm4 - movdqa 128(%ebp),%xmm5 - testl %edi,%edi - jnz L014schedule_mangle_dec - addl $16,%edx - pxor 336(%ebp),%xmm4 -.byte 102,15,56,0,229 - movdqa %xmm4,%xmm3 -.byte 102,15,56,0,229 - pxor %xmm4,%xmm3 -.byte 102,15,56,0,229 - pxor %xmm4,%xmm3 - jmp L015schedule_mangle_both -.align 4,0x90 -L014schedule_mangle_dec: - movdqa -16(%ebp),%xmm2 - leal 416(%ebp),%esi - movdqa %xmm2,%xmm1 - pandn %xmm4,%xmm1 - psrld $4,%xmm1 - pand %xmm2,%xmm4 - movdqa (%esi),%xmm2 -.byte 102,15,56,0,212 - movdqa 16(%esi),%xmm3 -.byte 102,15,56,0,217 - pxor %xmm2,%xmm3 -.byte 102,15,56,0,221 - movdqa 32(%esi),%xmm2 -.byte 102,15,56,0,212 - pxor %xmm3,%xmm2 - movdqa 48(%esi),%xmm3 -.byte 102,15,56,0,217 - pxor %xmm2,%xmm3 -.byte 102,15,56,0,221 - movdqa 64(%esi),%xmm2 -.byte 102,15,56,0,212 - pxor %xmm3,%xmm2 - movdqa 80(%esi),%xmm3 -.byte 102,15,56,0,217 - pxor %xmm2,%xmm3 -.byte 102,15,56,0,221 - movdqa 96(%esi),%xmm2 -.byte 102,15,56,0,212 - pxor %xmm3,%xmm2 - movdqa 112(%esi),%xmm3 -.byte 102,15,56,0,217 - pxor %xmm2,%xmm3 - addl $-16,%edx -L015schedule_mangle_both: - movdqa 256(%ebp,%ecx,1),%xmm1 -.byte 102,15,56,0,217 - addl $-16,%ecx - andl $48,%ecx - movdqu %xmm3,(%edx) - ret -.globl _vpaes_set_encrypt_key -.private_extern _vpaes_set_encrypt_key -.align 4 -_vpaes_set_encrypt_key: -L_vpaes_set_encrypt_key_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi -#ifdef BORINGSSL_DISPATCH_TEST - pushl %ebx - pushl %edx - call L016pic -L016pic: - popl %ebx - leal _BORINGSSL_function_hit+5-L016pic(%ebx),%ebx - movl $1,%edx - movb %dl,(%ebx) - popl %edx - popl %ebx -#endif - movl 20(%esp),%esi - leal -56(%esp),%ebx - movl 24(%esp),%eax - andl $-16,%ebx - movl 28(%esp),%edx - xchgl %esp,%ebx - movl %ebx,48(%esp) 
- movl %eax,%ebx - shrl $5,%ebx - addl $5,%ebx - movl %ebx,240(%edx) - movl $48,%ecx - movl $0,%edi - leal L_vpaes_consts+0x30-L017pic_point,%ebp - call __vpaes_schedule_core -L017pic_point: - movl 48(%esp),%esp - xorl %eax,%eax - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.globl _vpaes_set_decrypt_key -.private_extern _vpaes_set_decrypt_key -.align 4 -_vpaes_set_decrypt_key: -L_vpaes_set_decrypt_key_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - movl 20(%esp),%esi - leal -56(%esp),%ebx - movl 24(%esp),%eax - andl $-16,%ebx - movl 28(%esp),%edx - xchgl %esp,%ebx - movl %ebx,48(%esp) - movl %eax,%ebx - shrl $5,%ebx - addl $5,%ebx - movl %ebx,240(%edx) - shll $4,%ebx - leal 16(%edx,%ebx,1),%edx - movl $1,%edi - movl %eax,%ecx - shrl $1,%ecx - andl $32,%ecx - xorl $32,%ecx - leal L_vpaes_consts+0x30-L018pic_point,%ebp - call __vpaes_schedule_core -L018pic_point: - movl 48(%esp),%esp - xorl %eax,%eax - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.globl _vpaes_encrypt -.private_extern _vpaes_encrypt -.align 4 -_vpaes_encrypt: -L_vpaes_encrypt_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi -#ifdef BORINGSSL_DISPATCH_TEST - pushl %ebx - pushl %edx - call L019pic -L019pic: - popl %ebx - leal _BORINGSSL_function_hit+4-L019pic(%ebx),%ebx - movl $1,%edx - movb %dl,(%ebx) - popl %edx - popl %ebx -#endif - leal L_vpaes_consts+0x30-L020pic_point,%ebp - call __vpaes_preheat -L020pic_point: - movl 20(%esp),%esi - leal -56(%esp),%ebx - movl 24(%esp),%edi - andl $-16,%ebx - movl 28(%esp),%edx - xchgl %esp,%ebx - movl %ebx,48(%esp) - movdqu (%esi),%xmm0 - call __vpaes_encrypt_core - movdqu %xmm0,(%edi) - movl 48(%esp),%esp - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.globl _vpaes_decrypt -.private_extern _vpaes_decrypt -.align 4 -_vpaes_decrypt: -L_vpaes_decrypt_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - leal L_vpaes_consts+0x30-L021pic_point,%ebp - call __vpaes_preheat -L021pic_point: - movl 20(%esp),%esi - 
leal -56(%esp),%ebx - movl 24(%esp),%edi - andl $-16,%ebx - movl 28(%esp),%edx - xchgl %esp,%ebx - movl %ebx,48(%esp) - movdqu (%esi),%xmm0 - call __vpaes_decrypt_core - movdqu %xmm0,(%edi) - movl 48(%esp),%esp - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.globl _vpaes_cbc_encrypt -.private_extern _vpaes_cbc_encrypt -.align 4 -_vpaes_cbc_encrypt: -L_vpaes_cbc_encrypt_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - movl 20(%esp),%esi - movl 24(%esp),%edi - movl 28(%esp),%eax - movl 32(%esp),%edx - subl $16,%eax - jc L022cbc_abort - leal -56(%esp),%ebx - movl 36(%esp),%ebp - andl $-16,%ebx - movl 40(%esp),%ecx - xchgl %esp,%ebx - movdqu (%ebp),%xmm1 - subl %esi,%edi - movl %ebx,48(%esp) - movl %edi,(%esp) - movl %edx,4(%esp) - movl %ebp,8(%esp) - movl %eax,%edi - leal L_vpaes_consts+0x30-L023pic_point,%ebp - call __vpaes_preheat -L023pic_point: - cmpl $0,%ecx - je L024cbc_dec_loop - jmp L025cbc_enc_loop -.align 4,0x90 -L025cbc_enc_loop: - movdqu (%esi),%xmm0 - pxor %xmm1,%xmm0 - call __vpaes_encrypt_core - movl (%esp),%ebx - movl 4(%esp),%edx - movdqa %xmm0,%xmm1 - movdqu %xmm0,(%ebx,%esi,1) - leal 16(%esi),%esi - subl $16,%edi - jnc L025cbc_enc_loop - jmp L026cbc_done -.align 4,0x90 -L024cbc_dec_loop: - movdqu (%esi),%xmm0 - movdqa %xmm1,16(%esp) - movdqa %xmm0,32(%esp) - call __vpaes_decrypt_core - movl (%esp),%ebx - movl 4(%esp),%edx - pxor 16(%esp),%xmm0 - movdqa 32(%esp),%xmm1 - movdqu %xmm0,(%ebx,%esi,1) - leal 16(%esi),%esi - subl $16,%edi - jnc L024cbc_dec_loop -L026cbc_done: - movl 8(%esp),%ebx - movl 48(%esp),%esp - movdqu %xmm1,(%ebx) -L022cbc_abort: - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -#endif diff --git a/third_party/boringssl/apple-x86/crypto/fipsmodule/x86-mont.S b/third_party/boringssl/apple-x86/crypto/fipsmodule/x86-mont.S deleted file mode 100644 index 7850a37a..00000000 --- a/third_party/boringssl/apple-x86/crypto/fipsmodule/x86-mont.S +++ /dev/null @@ -1,485 +0,0 @@ -// This file is generated from a 
similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if defined(__i386__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text -.globl _bn_mul_mont -.private_extern _bn_mul_mont -.align 4 -_bn_mul_mont: -L_bn_mul_mont_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - xorl %eax,%eax - movl 40(%esp),%edi - cmpl $4,%edi - jl L000just_leave - leal 20(%esp),%esi - leal 24(%esp),%edx - addl $2,%edi - negl %edi - leal -32(%esp,%edi,4),%ebp - negl %edi - movl %ebp,%eax - subl %edx,%eax - andl $2047,%eax - subl %eax,%ebp - xorl %ebp,%edx - andl $2048,%edx - xorl $2048,%edx - subl %edx,%ebp - andl $-64,%ebp - movl %esp,%eax - subl %ebp,%eax - andl $-4096,%eax - movl %esp,%edx - leal (%ebp,%eax,1),%esp - movl (%esp),%eax - cmpl %ebp,%esp - ja L001page_walk - jmp L002page_walk_done -.align 4,0x90 -L001page_walk: - leal -4096(%esp),%esp - movl (%esp),%eax - cmpl %ebp,%esp - ja L001page_walk -L002page_walk_done: - movl (%esi),%eax - movl 4(%esi),%ebx - movl 8(%esi),%ecx - movl 12(%esi),%ebp - movl 16(%esi),%esi - movl (%esi),%esi - movl %eax,4(%esp) - movl %ebx,8(%esp) - movl %ecx,12(%esp) - movl %ebp,16(%esp) - movl %esi,20(%esp) - leal -3(%edi),%ebx - movl %edx,24(%esp) - call L003PIC_me_up -L003PIC_me_up: - popl %eax - movl L_OPENSSL_ia32cap_P$non_lazy_ptr-L003PIC_me_up(%eax),%eax - btl $26,(%eax) - jnc L004non_sse2 - movl $-1,%eax - movd %eax,%mm7 - movl 8(%esp),%esi - movl 12(%esp),%edi - movl 16(%esp),%ebp - xorl %edx,%edx - xorl %ecx,%ecx - movd (%edi),%mm4 - movd (%esi),%mm5 - movd (%ebp),%mm3 - pmuludq %mm4,%mm5 - movq %mm5,%mm2 - movq %mm5,%mm0 - pand %mm7,%mm0 - pmuludq 20(%esp),%mm5 - pmuludq %mm5,%mm3 - paddq %mm0,%mm3 - movd 4(%ebp),%mm1 - movd 4(%esi),%mm0 - psrlq $32,%mm2 - psrlq $32,%mm3 - incl %ecx -.align 4,0x90 -L0051st: - pmuludq %mm4,%mm0 - pmuludq %mm5,%mm1 - paddq %mm0,%mm2 - paddq %mm1,%mm3 - movq %mm2,%mm0 - pand %mm7,%mm0 - movd 4(%ebp,%ecx,4),%mm1 - paddq %mm0,%mm3 - movd 4(%esi,%ecx,4),%mm0 - psrlq 
$32,%mm2 - movd %mm3,28(%esp,%ecx,4) - psrlq $32,%mm3 - leal 1(%ecx),%ecx - cmpl %ebx,%ecx - jl L0051st - pmuludq %mm4,%mm0 - pmuludq %mm5,%mm1 - paddq %mm0,%mm2 - paddq %mm1,%mm3 - movq %mm2,%mm0 - pand %mm7,%mm0 - paddq %mm0,%mm3 - movd %mm3,28(%esp,%ecx,4) - psrlq $32,%mm2 - psrlq $32,%mm3 - paddq %mm2,%mm3 - movq %mm3,32(%esp,%ebx,4) - incl %edx -L006outer: - xorl %ecx,%ecx - movd (%edi,%edx,4),%mm4 - movd (%esi),%mm5 - movd 32(%esp),%mm6 - movd (%ebp),%mm3 - pmuludq %mm4,%mm5 - paddq %mm6,%mm5 - movq %mm5,%mm0 - movq %mm5,%mm2 - pand %mm7,%mm0 - pmuludq 20(%esp),%mm5 - pmuludq %mm5,%mm3 - paddq %mm0,%mm3 - movd 36(%esp),%mm6 - movd 4(%ebp),%mm1 - movd 4(%esi),%mm0 - psrlq $32,%mm2 - psrlq $32,%mm3 - paddq %mm6,%mm2 - incl %ecx - decl %ebx -L007inner: - pmuludq %mm4,%mm0 - pmuludq %mm5,%mm1 - paddq %mm0,%mm2 - paddq %mm1,%mm3 - movq %mm2,%mm0 - movd 36(%esp,%ecx,4),%mm6 - pand %mm7,%mm0 - movd 4(%ebp,%ecx,4),%mm1 - paddq %mm0,%mm3 - movd 4(%esi,%ecx,4),%mm0 - psrlq $32,%mm2 - movd %mm3,28(%esp,%ecx,4) - psrlq $32,%mm3 - paddq %mm6,%mm2 - decl %ebx - leal 1(%ecx),%ecx - jnz L007inner - movl %ecx,%ebx - pmuludq %mm4,%mm0 - pmuludq %mm5,%mm1 - paddq %mm0,%mm2 - paddq %mm1,%mm3 - movq %mm2,%mm0 - pand %mm7,%mm0 - paddq %mm0,%mm3 - movd %mm3,28(%esp,%ecx,4) - psrlq $32,%mm2 - psrlq $32,%mm3 - movd 36(%esp,%ebx,4),%mm6 - paddq %mm2,%mm3 - paddq %mm6,%mm3 - movq %mm3,32(%esp,%ebx,4) - leal 1(%edx),%edx - cmpl %ebx,%edx - jle L006outer - emms - jmp L008common_tail -.align 4,0x90 -L004non_sse2: - movl 8(%esp),%esi - leal 1(%ebx),%ebp - movl 12(%esp),%edi - xorl %ecx,%ecx - movl %esi,%edx - andl $1,%ebp - subl %edi,%edx - leal 4(%edi,%ebx,4),%eax - orl %edx,%ebp - movl (%edi),%edi - jz L009bn_sqr_mont - movl %eax,28(%esp) - movl (%esi),%eax - xorl %edx,%edx -.align 4,0x90 -L010mull: - movl %edx,%ebp - mull %edi - addl %eax,%ebp - leal 1(%ecx),%ecx - adcl $0,%edx - movl (%esi,%ecx,4),%eax - cmpl %ebx,%ecx - movl %ebp,28(%esp,%ecx,4) - jl L010mull - movl %edx,%ebp - mull 
%edi - movl 20(%esp),%edi - addl %ebp,%eax - movl 16(%esp),%esi - adcl $0,%edx - imull 32(%esp),%edi - movl %eax,32(%esp,%ebx,4) - xorl %ecx,%ecx - movl %edx,36(%esp,%ebx,4) - movl %ecx,40(%esp,%ebx,4) - movl (%esi),%eax - mull %edi - addl 32(%esp),%eax - movl 4(%esi),%eax - adcl $0,%edx - incl %ecx - jmp L0112ndmadd -.align 4,0x90 -L0121stmadd: - movl %edx,%ebp - mull %edi - addl 32(%esp,%ecx,4),%ebp - leal 1(%ecx),%ecx - adcl $0,%edx - addl %eax,%ebp - movl (%esi,%ecx,4),%eax - adcl $0,%edx - cmpl %ebx,%ecx - movl %ebp,28(%esp,%ecx,4) - jl L0121stmadd - movl %edx,%ebp - mull %edi - addl 32(%esp,%ebx,4),%eax - movl 20(%esp),%edi - adcl $0,%edx - movl 16(%esp),%esi - addl %eax,%ebp - adcl $0,%edx - imull 32(%esp),%edi - xorl %ecx,%ecx - addl 36(%esp,%ebx,4),%edx - movl %ebp,32(%esp,%ebx,4) - adcl $0,%ecx - movl (%esi),%eax - movl %edx,36(%esp,%ebx,4) - movl %ecx,40(%esp,%ebx,4) - mull %edi - addl 32(%esp),%eax - movl 4(%esi),%eax - adcl $0,%edx - movl $1,%ecx -.align 4,0x90 -L0112ndmadd: - movl %edx,%ebp - mull %edi - addl 32(%esp,%ecx,4),%ebp - leal 1(%ecx),%ecx - adcl $0,%edx - addl %eax,%ebp - movl (%esi,%ecx,4),%eax - adcl $0,%edx - cmpl %ebx,%ecx - movl %ebp,24(%esp,%ecx,4) - jl L0112ndmadd - movl %edx,%ebp - mull %edi - addl 32(%esp,%ebx,4),%ebp - adcl $0,%edx - addl %eax,%ebp - adcl $0,%edx - movl %ebp,28(%esp,%ebx,4) - xorl %eax,%eax - movl 12(%esp),%ecx - addl 36(%esp,%ebx,4),%edx - adcl 40(%esp,%ebx,4),%eax - leal 4(%ecx),%ecx - movl %edx,32(%esp,%ebx,4) - cmpl 28(%esp),%ecx - movl %eax,36(%esp,%ebx,4) - je L008common_tail - movl (%ecx),%edi - movl 8(%esp),%esi - movl %ecx,12(%esp) - xorl %ecx,%ecx - xorl %edx,%edx - movl (%esi),%eax - jmp L0121stmadd -.align 4,0x90 -L009bn_sqr_mont: - movl %ebx,(%esp) - movl %ecx,12(%esp) - movl %edi,%eax - mull %edi - movl %eax,32(%esp) - movl %edx,%ebx - shrl $1,%edx - andl $1,%ebx - incl %ecx -.align 4,0x90 -L013sqr: - movl (%esi,%ecx,4),%eax - movl %edx,%ebp - mull %edi - addl %ebp,%eax - leal 1(%ecx),%ecx - adcl 
$0,%edx - leal (%ebx,%eax,2),%ebp - shrl $31,%eax - cmpl (%esp),%ecx - movl %eax,%ebx - movl %ebp,28(%esp,%ecx,4) - jl L013sqr - movl (%esi,%ecx,4),%eax - movl %edx,%ebp - mull %edi - addl %ebp,%eax - movl 20(%esp),%edi - adcl $0,%edx - movl 16(%esp),%esi - leal (%ebx,%eax,2),%ebp - imull 32(%esp),%edi - shrl $31,%eax - movl %ebp,32(%esp,%ecx,4) - leal (%eax,%edx,2),%ebp - movl (%esi),%eax - shrl $31,%edx - movl %ebp,36(%esp,%ecx,4) - movl %edx,40(%esp,%ecx,4) - mull %edi - addl 32(%esp),%eax - movl %ecx,%ebx - adcl $0,%edx - movl 4(%esi),%eax - movl $1,%ecx -.align 4,0x90 -L0143rdmadd: - movl %edx,%ebp - mull %edi - addl 32(%esp,%ecx,4),%ebp - adcl $0,%edx - addl %eax,%ebp - movl 4(%esi,%ecx,4),%eax - adcl $0,%edx - movl %ebp,28(%esp,%ecx,4) - movl %edx,%ebp - mull %edi - addl 36(%esp,%ecx,4),%ebp - leal 2(%ecx),%ecx - adcl $0,%edx - addl %eax,%ebp - movl (%esi,%ecx,4),%eax - adcl $0,%edx - cmpl %ebx,%ecx - movl %ebp,24(%esp,%ecx,4) - jl L0143rdmadd - movl %edx,%ebp - mull %edi - addl 32(%esp,%ebx,4),%ebp - adcl $0,%edx - addl %eax,%ebp - adcl $0,%edx - movl %ebp,28(%esp,%ebx,4) - movl 12(%esp),%ecx - xorl %eax,%eax - movl 8(%esp),%esi - addl 36(%esp,%ebx,4),%edx - adcl 40(%esp,%ebx,4),%eax - movl %edx,32(%esp,%ebx,4) - cmpl %ebx,%ecx - movl %eax,36(%esp,%ebx,4) - je L008common_tail - movl 4(%esi,%ecx,4),%edi - leal 1(%ecx),%ecx - movl %edi,%eax - movl %ecx,12(%esp) - mull %edi - addl 32(%esp,%ecx,4),%eax - adcl $0,%edx - movl %eax,32(%esp,%ecx,4) - xorl %ebp,%ebp - cmpl %ebx,%ecx - leal 1(%ecx),%ecx - je L015sqrlast - movl %edx,%ebx - shrl $1,%edx - andl $1,%ebx -.align 4,0x90 -L016sqradd: - movl (%esi,%ecx,4),%eax - movl %edx,%ebp - mull %edi - addl %ebp,%eax - leal (%eax,%eax,1),%ebp - adcl $0,%edx - shrl $31,%eax - addl 32(%esp,%ecx,4),%ebp - leal 1(%ecx),%ecx - adcl $0,%eax - addl %ebx,%ebp - adcl $0,%eax - cmpl (%esp),%ecx - movl %ebp,28(%esp,%ecx,4) - movl %eax,%ebx - jle L016sqradd - movl %edx,%ebp - addl %edx,%edx - shrl $31,%ebp - addl %ebx,%edx - adcl 
$0,%ebp -L015sqrlast: - movl 20(%esp),%edi - movl 16(%esp),%esi - imull 32(%esp),%edi - addl 32(%esp,%ecx,4),%edx - movl (%esi),%eax - adcl $0,%ebp - movl %edx,32(%esp,%ecx,4) - movl %ebp,36(%esp,%ecx,4) - mull %edi - addl 32(%esp),%eax - leal -1(%ecx),%ebx - adcl $0,%edx - movl $1,%ecx - movl 4(%esi),%eax - jmp L0143rdmadd -.align 4,0x90 -L008common_tail: - movl 16(%esp),%ebp - movl 4(%esp),%edi - leal 32(%esp),%esi - movl (%esi),%eax - movl %ebx,%ecx - xorl %edx,%edx -.align 4,0x90 -L017sub: - sbbl (%ebp,%edx,4),%eax - movl %eax,(%edi,%edx,4) - decl %ecx - movl 4(%esi,%edx,4),%eax - leal 1(%edx),%edx - jge L017sub - sbbl $0,%eax - movl $-1,%edx - xorl %eax,%edx - jmp L018copy -.align 4,0x90 -L018copy: - movl 32(%esp,%ebx,4),%esi - movl (%edi,%ebx,4),%ebp - movl %ecx,32(%esp,%ebx,4) - andl %eax,%esi - andl %edx,%ebp - orl %esi,%ebp - movl %ebp,(%edi,%ebx,4) - decl %ebx - jge L018copy - movl 24(%esp),%esp - movl $1,%eax -L000just_leave: - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 -.byte 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56 -.byte 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 -.byte 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 -.byte 111,114,103,62,0 -.section __IMPORT,__pointers,non_lazy_symbol_pointers -L_OPENSSL_ia32cap_P$non_lazy_ptr: -.indirect_symbol _OPENSSL_ia32cap_P -.long 0 -#endif diff --git a/third_party/boringssl/apple-x86/crypto/test/trampoline-x86.S b/third_party/boringssl/apple-x86/crypto/test/trampoline-x86.S deleted file mode 100644 index fd40b957..00000000 --- a/third_party/boringssl/apple-x86/crypto/test/trampoline-x86.S +++ /dev/null @@ -1,169 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#if defined(__i386__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text -.globl _abi_test_trampoline -.private_extern _abi_test_trampoline -.align 4 -_abi_test_trampoline: -L_abi_test_trampoline_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - movl 24(%esp),%ecx - movl (%ecx),%esi - movl 4(%ecx),%edi - movl 8(%ecx),%ebx - movl 12(%ecx),%ebp - subl $44,%esp - movl 72(%esp),%eax - xorl %ecx,%ecx -L000loop: - cmpl 76(%esp),%ecx - jae L001loop_done - movl (%eax,%ecx,4),%edx - movl %edx,(%esp,%ecx,4) - addl $1,%ecx - jmp L000loop -L001loop_done: - call *64(%esp) - addl $44,%esp - movl 24(%esp),%ecx - movl %esi,(%ecx) - movl %edi,4(%ecx) - movl %ebx,8(%ecx) - movl %ebp,12(%ecx) - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.globl _abi_test_get_and_clear_direction_flag -.private_extern _abi_test_get_and_clear_direction_flag -.align 4 -_abi_test_get_and_clear_direction_flag: -L_abi_test_get_and_clear_direction_flag_begin: - pushfl - popl %eax - andl $1024,%eax - shrl $10,%eax - cld - ret -.globl _abi_test_set_direction_flag -.private_extern _abi_test_set_direction_flag -.align 4 -_abi_test_set_direction_flag: -L_abi_test_set_direction_flag_begin: - std - ret -.globl _abi_test_clobber_eax -.private_extern _abi_test_clobber_eax -.align 4 -_abi_test_clobber_eax: -L_abi_test_clobber_eax_begin: - xorl %eax,%eax - ret -.globl _abi_test_clobber_ebx -.private_extern _abi_test_clobber_ebx -.align 4 -_abi_test_clobber_ebx: -L_abi_test_clobber_ebx_begin: - xorl %ebx,%ebx - ret -.globl _abi_test_clobber_ecx -.private_extern _abi_test_clobber_ecx -.align 4 -_abi_test_clobber_ecx: -L_abi_test_clobber_ecx_begin: - xorl %ecx,%ecx - ret -.globl _abi_test_clobber_edx -.private_extern _abi_test_clobber_edx -.align 4 -_abi_test_clobber_edx: -L_abi_test_clobber_edx_begin: - xorl %edx,%edx - ret -.globl _abi_test_clobber_edi -.private_extern _abi_test_clobber_edi -.align 4 -_abi_test_clobber_edi: -L_abi_test_clobber_edi_begin: - xorl %edi,%edi - ret -.globl 
_abi_test_clobber_esi -.private_extern _abi_test_clobber_esi -.align 4 -_abi_test_clobber_esi: -L_abi_test_clobber_esi_begin: - xorl %esi,%esi - ret -.globl _abi_test_clobber_ebp -.private_extern _abi_test_clobber_ebp -.align 4 -_abi_test_clobber_ebp: -L_abi_test_clobber_ebp_begin: - xorl %ebp,%ebp - ret -.globl _abi_test_clobber_xmm0 -.private_extern _abi_test_clobber_xmm0 -.align 4 -_abi_test_clobber_xmm0: -L_abi_test_clobber_xmm0_begin: - pxor %xmm0,%xmm0 - ret -.globl _abi_test_clobber_xmm1 -.private_extern _abi_test_clobber_xmm1 -.align 4 -_abi_test_clobber_xmm1: -L_abi_test_clobber_xmm1_begin: - pxor %xmm1,%xmm1 - ret -.globl _abi_test_clobber_xmm2 -.private_extern _abi_test_clobber_xmm2 -.align 4 -_abi_test_clobber_xmm2: -L_abi_test_clobber_xmm2_begin: - pxor %xmm2,%xmm2 - ret -.globl _abi_test_clobber_xmm3 -.private_extern _abi_test_clobber_xmm3 -.align 4 -_abi_test_clobber_xmm3: -L_abi_test_clobber_xmm3_begin: - pxor %xmm3,%xmm3 - ret -.globl _abi_test_clobber_xmm4 -.private_extern _abi_test_clobber_xmm4 -.align 4 -_abi_test_clobber_xmm4: -L_abi_test_clobber_xmm4_begin: - pxor %xmm4,%xmm4 - ret -.globl _abi_test_clobber_xmm5 -.private_extern _abi_test_clobber_xmm5 -.align 4 -_abi_test_clobber_xmm5: -L_abi_test_clobber_xmm5_begin: - pxor %xmm5,%xmm5 - ret -.globl _abi_test_clobber_xmm6 -.private_extern _abi_test_clobber_xmm6 -.align 4 -_abi_test_clobber_xmm6: -L_abi_test_clobber_xmm6_begin: - pxor %xmm6,%xmm6 - ret -.globl _abi_test_clobber_xmm7 -.private_extern _abi_test_clobber_xmm7 -.align 4 -_abi_test_clobber_xmm7: -L_abi_test_clobber_xmm7_begin: - pxor %xmm7,%xmm7 - ret -#endif diff --git a/third_party/boringssl/apple-x86_64/crypto/chacha/chacha-x86_64.S b/third_party/boringssl/apple-x86_64/crypto/chacha/chacha-x86_64.S deleted file mode 100644 index 782ddf4b..00000000 --- a/third_party/boringssl/apple-x86_64/crypto/chacha/chacha-x86_64.S +++ /dev/null @@ -1,1625 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// 
source tree. Do not edit by hand. - -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text - - - -.p2align 6 -L$zero: -.long 0,0,0,0 -L$one: -.long 1,0,0,0 -L$inc: -.long 0,1,2,3 -L$four: -.long 4,4,4,4 -L$incy: -.long 0,2,4,6,1,3,5,7 -L$eight: -.long 8,8,8,8,8,8,8,8 -L$rot16: -.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd -L$rot24: -.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe -L$sigma: -.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0 -.p2align 6 -L$zeroz: -.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0 -L$fourz: -.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0 -L$incz: -.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 -L$sixteen: -.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 -.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.globl _ChaCha20_ctr32 -.private_extern _ChaCha20_ctr32 - -.p2align 6 -_ChaCha20_ctr32: - - cmpq $0,%rdx - je L$no_data - movq _OPENSSL_ia32cap_P+4(%rip),%r10 - testl $512,%r10d - jnz L$ChaCha20_ssse3 - - pushq %rbx - - pushq %rbp - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - subq $64+24,%rsp - -L$ctr32_body: - - - movdqu (%rcx),%xmm1 - movdqu 16(%rcx),%xmm2 - movdqu (%r8),%xmm3 - movdqa L$one(%rip),%xmm4 - - - movdqa %xmm1,16(%rsp) - movdqa %xmm2,32(%rsp) - movdqa %xmm3,48(%rsp) - movq %rdx,%rbp - jmp L$oop_outer - -.p2align 5 -L$oop_outer: - movl $0x61707865,%eax - movl $0x3320646e,%ebx - movl $0x79622d32,%ecx - movl $0x6b206574,%edx - movl 16(%rsp),%r8d - movl 20(%rsp),%r9d - movl 24(%rsp),%r10d - movl 28(%rsp),%r11d - movd %xmm3,%r12d - movl 52(%rsp),%r13d - movl 56(%rsp),%r14d - movl 60(%rsp),%r15d - - movq %rbp,64+0(%rsp) - movl 
$10,%ebp - movq %rsi,64+8(%rsp) -.byte 102,72,15,126,214 - movq %rdi,64+16(%rsp) - movq %rsi,%rdi - shrq $32,%rdi - jmp L$oop - -.p2align 5 -L$oop: - addl %r8d,%eax - xorl %eax,%r12d - roll $16,%r12d - addl %r9d,%ebx - xorl %ebx,%r13d - roll $16,%r13d - addl %r12d,%esi - xorl %esi,%r8d - roll $12,%r8d - addl %r13d,%edi - xorl %edi,%r9d - roll $12,%r9d - addl %r8d,%eax - xorl %eax,%r12d - roll $8,%r12d - addl %r9d,%ebx - xorl %ebx,%r13d - roll $8,%r13d - addl %r12d,%esi - xorl %esi,%r8d - roll $7,%r8d - addl %r13d,%edi - xorl %edi,%r9d - roll $7,%r9d - movl %esi,32(%rsp) - movl %edi,36(%rsp) - movl 40(%rsp),%esi - movl 44(%rsp),%edi - addl %r10d,%ecx - xorl %ecx,%r14d - roll $16,%r14d - addl %r11d,%edx - xorl %edx,%r15d - roll $16,%r15d - addl %r14d,%esi - xorl %esi,%r10d - roll $12,%r10d - addl %r15d,%edi - xorl %edi,%r11d - roll $12,%r11d - addl %r10d,%ecx - xorl %ecx,%r14d - roll $8,%r14d - addl %r11d,%edx - xorl %edx,%r15d - roll $8,%r15d - addl %r14d,%esi - xorl %esi,%r10d - roll $7,%r10d - addl %r15d,%edi - xorl %edi,%r11d - roll $7,%r11d - addl %r9d,%eax - xorl %eax,%r15d - roll $16,%r15d - addl %r10d,%ebx - xorl %ebx,%r12d - roll $16,%r12d - addl %r15d,%esi - xorl %esi,%r9d - roll $12,%r9d - addl %r12d,%edi - xorl %edi,%r10d - roll $12,%r10d - addl %r9d,%eax - xorl %eax,%r15d - roll $8,%r15d - addl %r10d,%ebx - xorl %ebx,%r12d - roll $8,%r12d - addl %r15d,%esi - xorl %esi,%r9d - roll $7,%r9d - addl %r12d,%edi - xorl %edi,%r10d - roll $7,%r10d - movl %esi,40(%rsp) - movl %edi,44(%rsp) - movl 32(%rsp),%esi - movl 36(%rsp),%edi - addl %r11d,%ecx - xorl %ecx,%r13d - roll $16,%r13d - addl %r8d,%edx - xorl %edx,%r14d - roll $16,%r14d - addl %r13d,%esi - xorl %esi,%r11d - roll $12,%r11d - addl %r14d,%edi - xorl %edi,%r8d - roll $12,%r8d - addl %r11d,%ecx - xorl %ecx,%r13d - roll $8,%r13d - addl %r8d,%edx - xorl %edx,%r14d - roll $8,%r14d - addl %r13d,%esi - xorl %esi,%r11d - roll $7,%r11d - addl %r14d,%edi - xorl %edi,%r8d - roll $7,%r8d - decl %ebp - jnz L$oop - 
movl %edi,36(%rsp) - movl %esi,32(%rsp) - movq 64(%rsp),%rbp - movdqa %xmm2,%xmm1 - movq 64+8(%rsp),%rsi - paddd %xmm4,%xmm3 - movq 64+16(%rsp),%rdi - - addl $0x61707865,%eax - addl $0x3320646e,%ebx - addl $0x79622d32,%ecx - addl $0x6b206574,%edx - addl 16(%rsp),%r8d - addl 20(%rsp),%r9d - addl 24(%rsp),%r10d - addl 28(%rsp),%r11d - addl 48(%rsp),%r12d - addl 52(%rsp),%r13d - addl 56(%rsp),%r14d - addl 60(%rsp),%r15d - paddd 32(%rsp),%xmm1 - - cmpq $64,%rbp - jb L$tail - - xorl 0(%rsi),%eax - xorl 4(%rsi),%ebx - xorl 8(%rsi),%ecx - xorl 12(%rsi),%edx - xorl 16(%rsi),%r8d - xorl 20(%rsi),%r9d - xorl 24(%rsi),%r10d - xorl 28(%rsi),%r11d - movdqu 32(%rsi),%xmm0 - xorl 48(%rsi),%r12d - xorl 52(%rsi),%r13d - xorl 56(%rsi),%r14d - xorl 60(%rsi),%r15d - leaq 64(%rsi),%rsi - pxor %xmm1,%xmm0 - - movdqa %xmm2,32(%rsp) - movd %xmm3,48(%rsp) - - movl %eax,0(%rdi) - movl %ebx,4(%rdi) - movl %ecx,8(%rdi) - movl %edx,12(%rdi) - movl %r8d,16(%rdi) - movl %r9d,20(%rdi) - movl %r10d,24(%rdi) - movl %r11d,28(%rdi) - movdqu %xmm0,32(%rdi) - movl %r12d,48(%rdi) - movl %r13d,52(%rdi) - movl %r14d,56(%rdi) - movl %r15d,60(%rdi) - leaq 64(%rdi),%rdi - - subq $64,%rbp - jnz L$oop_outer - - jmp L$done - -.p2align 4 -L$tail: - movl %eax,0(%rsp) - movl %ebx,4(%rsp) - xorq %rbx,%rbx - movl %ecx,8(%rsp) - movl %edx,12(%rsp) - movl %r8d,16(%rsp) - movl %r9d,20(%rsp) - movl %r10d,24(%rsp) - movl %r11d,28(%rsp) - movdqa %xmm1,32(%rsp) - movl %r12d,48(%rsp) - movl %r13d,52(%rsp) - movl %r14d,56(%rsp) - movl %r15d,60(%rsp) - -L$oop_tail: - movzbl (%rsi,%rbx,1),%eax - movzbl (%rsp,%rbx,1),%edx - leaq 1(%rbx),%rbx - xorl %edx,%eax - movb %al,-1(%rdi,%rbx,1) - decq %rbp - jnz L$oop_tail - -L$done: - leaq 64+24+48(%rsp),%rsi - movq -48(%rsi),%r15 - - movq -40(%rsi),%r14 - - movq -32(%rsi),%r13 - - movq -24(%rsi),%r12 - - movq -16(%rsi),%rbp - - movq -8(%rsi),%rbx - - leaq (%rsi),%rsp - -L$no_data: - .byte 0xf3,0xc3 - - - -.p2align 5 -ChaCha20_ssse3: -L$ChaCha20_ssse3: - - movq %rsp,%r9 - - cmpq 
$128,%rdx - ja L$ChaCha20_4x - -L$do_sse3_after_all: - subq $64+8,%rsp - movdqa L$sigma(%rip),%xmm0 - movdqu (%rcx),%xmm1 - movdqu 16(%rcx),%xmm2 - movdqu (%r8),%xmm3 - movdqa L$rot16(%rip),%xmm6 - movdqa L$rot24(%rip),%xmm7 - - movdqa %xmm0,0(%rsp) - movdqa %xmm1,16(%rsp) - movdqa %xmm2,32(%rsp) - movdqa %xmm3,48(%rsp) - movq $10,%r8 - jmp L$oop_ssse3 - -.p2align 5 -L$oop_outer_ssse3: - movdqa L$one(%rip),%xmm3 - movdqa 0(%rsp),%xmm0 - movdqa 16(%rsp),%xmm1 - movdqa 32(%rsp),%xmm2 - paddd 48(%rsp),%xmm3 - movq $10,%r8 - movdqa %xmm3,48(%rsp) - jmp L$oop_ssse3 - -.p2align 5 -L$oop_ssse3: - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 -.byte 102,15,56,0,222 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm4 - psrld $20,%xmm1 - pslld $12,%xmm4 - por %xmm4,%xmm1 - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 -.byte 102,15,56,0,223 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm4 - psrld $25,%xmm1 - pslld $7,%xmm4 - por %xmm4,%xmm1 - pshufd $78,%xmm2,%xmm2 - pshufd $57,%xmm1,%xmm1 - pshufd $147,%xmm3,%xmm3 - nop - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 -.byte 102,15,56,0,222 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm4 - psrld $20,%xmm1 - pslld $12,%xmm4 - por %xmm4,%xmm1 - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 -.byte 102,15,56,0,223 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm4 - psrld $25,%xmm1 - pslld $7,%xmm4 - por %xmm4,%xmm1 - pshufd $78,%xmm2,%xmm2 - pshufd $147,%xmm1,%xmm1 - pshufd $57,%xmm3,%xmm3 - decq %r8 - jnz L$oop_ssse3 - paddd 0(%rsp),%xmm0 - paddd 16(%rsp),%xmm1 - paddd 32(%rsp),%xmm2 - paddd 48(%rsp),%xmm3 - - cmpq $64,%rdx - jb L$tail_ssse3 - - movdqu 0(%rsi),%xmm4 - movdqu 16(%rsi),%xmm5 - pxor %xmm4,%xmm0 - movdqu 32(%rsi),%xmm4 - pxor %xmm5,%xmm1 - movdqu 48(%rsi),%xmm5 - leaq 64(%rsi),%rsi - pxor %xmm4,%xmm2 - pxor %xmm5,%xmm3 - - movdqu %xmm0,0(%rdi) - movdqu %xmm1,16(%rdi) - movdqu %xmm2,32(%rdi) - movdqu %xmm3,48(%rdi) - leaq 64(%rdi),%rdi - - subq $64,%rdx - jnz L$oop_outer_ssse3 - - jmp L$done_ssse3 - -.p2align 4 
-L$tail_ssse3: - movdqa %xmm0,0(%rsp) - movdqa %xmm1,16(%rsp) - movdqa %xmm2,32(%rsp) - movdqa %xmm3,48(%rsp) - xorq %r8,%r8 - -L$oop_tail_ssse3: - movzbl (%rsi,%r8,1),%eax - movzbl (%rsp,%r8,1),%ecx - leaq 1(%r8),%r8 - xorl %ecx,%eax - movb %al,-1(%rdi,%r8,1) - decq %rdx - jnz L$oop_tail_ssse3 - -L$done_ssse3: - leaq (%r9),%rsp - -L$ssse3_epilogue: - .byte 0xf3,0xc3 - - - -.p2align 5 -ChaCha20_4x: -L$ChaCha20_4x: - - movq %rsp,%r9 - - movq %r10,%r11 - shrq $32,%r10 - testq $32,%r10 - jnz L$ChaCha20_8x - cmpq $192,%rdx - ja L$proceed4x - - andq $71303168,%r11 - cmpq $4194304,%r11 - je L$do_sse3_after_all - -L$proceed4x: - subq $0x140+8,%rsp - movdqa L$sigma(%rip),%xmm11 - movdqu (%rcx),%xmm15 - movdqu 16(%rcx),%xmm7 - movdqu (%r8),%xmm3 - leaq 256(%rsp),%rcx - leaq L$rot16(%rip),%r10 - leaq L$rot24(%rip),%r11 - - pshufd $0x00,%xmm11,%xmm8 - pshufd $0x55,%xmm11,%xmm9 - movdqa %xmm8,64(%rsp) - pshufd $0xaa,%xmm11,%xmm10 - movdqa %xmm9,80(%rsp) - pshufd $0xff,%xmm11,%xmm11 - movdqa %xmm10,96(%rsp) - movdqa %xmm11,112(%rsp) - - pshufd $0x00,%xmm15,%xmm12 - pshufd $0x55,%xmm15,%xmm13 - movdqa %xmm12,128-256(%rcx) - pshufd $0xaa,%xmm15,%xmm14 - movdqa %xmm13,144-256(%rcx) - pshufd $0xff,%xmm15,%xmm15 - movdqa %xmm14,160-256(%rcx) - movdqa %xmm15,176-256(%rcx) - - pshufd $0x00,%xmm7,%xmm4 - pshufd $0x55,%xmm7,%xmm5 - movdqa %xmm4,192-256(%rcx) - pshufd $0xaa,%xmm7,%xmm6 - movdqa %xmm5,208-256(%rcx) - pshufd $0xff,%xmm7,%xmm7 - movdqa %xmm6,224-256(%rcx) - movdqa %xmm7,240-256(%rcx) - - pshufd $0x00,%xmm3,%xmm0 - pshufd $0x55,%xmm3,%xmm1 - paddd L$inc(%rip),%xmm0 - pshufd $0xaa,%xmm3,%xmm2 - movdqa %xmm1,272-256(%rcx) - pshufd $0xff,%xmm3,%xmm3 - movdqa %xmm2,288-256(%rcx) - movdqa %xmm3,304-256(%rcx) - - jmp L$oop_enter4x - -.p2align 5 -L$oop_outer4x: - movdqa 64(%rsp),%xmm8 - movdqa 80(%rsp),%xmm9 - movdqa 96(%rsp),%xmm10 - movdqa 112(%rsp),%xmm11 - movdqa 128-256(%rcx),%xmm12 - movdqa 144-256(%rcx),%xmm13 - movdqa 160-256(%rcx),%xmm14 - movdqa 176-256(%rcx),%xmm15 - 
movdqa 192-256(%rcx),%xmm4 - movdqa 208-256(%rcx),%xmm5 - movdqa 224-256(%rcx),%xmm6 - movdqa 240-256(%rcx),%xmm7 - movdqa 256-256(%rcx),%xmm0 - movdqa 272-256(%rcx),%xmm1 - movdqa 288-256(%rcx),%xmm2 - movdqa 304-256(%rcx),%xmm3 - paddd L$four(%rip),%xmm0 - -L$oop_enter4x: - movdqa %xmm6,32(%rsp) - movdqa %xmm7,48(%rsp) - movdqa (%r10),%xmm7 - movl $10,%eax - movdqa %xmm0,256-256(%rcx) - jmp L$oop4x - -.p2align 5 -L$oop4x: - paddd %xmm12,%xmm8 - paddd %xmm13,%xmm9 - pxor %xmm8,%xmm0 - pxor %xmm9,%xmm1 -.byte 102,15,56,0,199 -.byte 102,15,56,0,207 - paddd %xmm0,%xmm4 - paddd %xmm1,%xmm5 - pxor %xmm4,%xmm12 - pxor %xmm5,%xmm13 - movdqa %xmm12,%xmm6 - pslld $12,%xmm12 - psrld $20,%xmm6 - movdqa %xmm13,%xmm7 - pslld $12,%xmm13 - por %xmm6,%xmm12 - psrld $20,%xmm7 - movdqa (%r11),%xmm6 - por %xmm7,%xmm13 - paddd %xmm12,%xmm8 - paddd %xmm13,%xmm9 - pxor %xmm8,%xmm0 - pxor %xmm9,%xmm1 -.byte 102,15,56,0,198 -.byte 102,15,56,0,206 - paddd %xmm0,%xmm4 - paddd %xmm1,%xmm5 - pxor %xmm4,%xmm12 - pxor %xmm5,%xmm13 - movdqa %xmm12,%xmm7 - pslld $7,%xmm12 - psrld $25,%xmm7 - movdqa %xmm13,%xmm6 - pslld $7,%xmm13 - por %xmm7,%xmm12 - psrld $25,%xmm6 - movdqa (%r10),%xmm7 - por %xmm6,%xmm13 - movdqa %xmm4,0(%rsp) - movdqa %xmm5,16(%rsp) - movdqa 32(%rsp),%xmm4 - movdqa 48(%rsp),%xmm5 - paddd %xmm14,%xmm10 - paddd %xmm15,%xmm11 - pxor %xmm10,%xmm2 - pxor %xmm11,%xmm3 -.byte 102,15,56,0,215 -.byte 102,15,56,0,223 - paddd %xmm2,%xmm4 - paddd %xmm3,%xmm5 - pxor %xmm4,%xmm14 - pxor %xmm5,%xmm15 - movdqa %xmm14,%xmm6 - pslld $12,%xmm14 - psrld $20,%xmm6 - movdqa %xmm15,%xmm7 - pslld $12,%xmm15 - por %xmm6,%xmm14 - psrld $20,%xmm7 - movdqa (%r11),%xmm6 - por %xmm7,%xmm15 - paddd %xmm14,%xmm10 - paddd %xmm15,%xmm11 - pxor %xmm10,%xmm2 - pxor %xmm11,%xmm3 -.byte 102,15,56,0,214 -.byte 102,15,56,0,222 - paddd %xmm2,%xmm4 - paddd %xmm3,%xmm5 - pxor %xmm4,%xmm14 - pxor %xmm5,%xmm15 - movdqa %xmm14,%xmm7 - pslld $7,%xmm14 - psrld $25,%xmm7 - movdqa %xmm15,%xmm6 - pslld $7,%xmm15 - por 
%xmm7,%xmm14 - psrld $25,%xmm6 - movdqa (%r10),%xmm7 - por %xmm6,%xmm15 - paddd %xmm13,%xmm8 - paddd %xmm14,%xmm9 - pxor %xmm8,%xmm3 - pxor %xmm9,%xmm0 -.byte 102,15,56,0,223 -.byte 102,15,56,0,199 - paddd %xmm3,%xmm4 - paddd %xmm0,%xmm5 - pxor %xmm4,%xmm13 - pxor %xmm5,%xmm14 - movdqa %xmm13,%xmm6 - pslld $12,%xmm13 - psrld $20,%xmm6 - movdqa %xmm14,%xmm7 - pslld $12,%xmm14 - por %xmm6,%xmm13 - psrld $20,%xmm7 - movdqa (%r11),%xmm6 - por %xmm7,%xmm14 - paddd %xmm13,%xmm8 - paddd %xmm14,%xmm9 - pxor %xmm8,%xmm3 - pxor %xmm9,%xmm0 -.byte 102,15,56,0,222 -.byte 102,15,56,0,198 - paddd %xmm3,%xmm4 - paddd %xmm0,%xmm5 - pxor %xmm4,%xmm13 - pxor %xmm5,%xmm14 - movdqa %xmm13,%xmm7 - pslld $7,%xmm13 - psrld $25,%xmm7 - movdqa %xmm14,%xmm6 - pslld $7,%xmm14 - por %xmm7,%xmm13 - psrld $25,%xmm6 - movdqa (%r10),%xmm7 - por %xmm6,%xmm14 - movdqa %xmm4,32(%rsp) - movdqa %xmm5,48(%rsp) - movdqa 0(%rsp),%xmm4 - movdqa 16(%rsp),%xmm5 - paddd %xmm15,%xmm10 - paddd %xmm12,%xmm11 - pxor %xmm10,%xmm1 - pxor %xmm11,%xmm2 -.byte 102,15,56,0,207 -.byte 102,15,56,0,215 - paddd %xmm1,%xmm4 - paddd %xmm2,%xmm5 - pxor %xmm4,%xmm15 - pxor %xmm5,%xmm12 - movdqa %xmm15,%xmm6 - pslld $12,%xmm15 - psrld $20,%xmm6 - movdqa %xmm12,%xmm7 - pslld $12,%xmm12 - por %xmm6,%xmm15 - psrld $20,%xmm7 - movdqa (%r11),%xmm6 - por %xmm7,%xmm12 - paddd %xmm15,%xmm10 - paddd %xmm12,%xmm11 - pxor %xmm10,%xmm1 - pxor %xmm11,%xmm2 -.byte 102,15,56,0,206 -.byte 102,15,56,0,214 - paddd %xmm1,%xmm4 - paddd %xmm2,%xmm5 - pxor %xmm4,%xmm15 - pxor %xmm5,%xmm12 - movdqa %xmm15,%xmm7 - pslld $7,%xmm15 - psrld $25,%xmm7 - movdqa %xmm12,%xmm6 - pslld $7,%xmm12 - por %xmm7,%xmm15 - psrld $25,%xmm6 - movdqa (%r10),%xmm7 - por %xmm6,%xmm12 - decl %eax - jnz L$oop4x - - paddd 64(%rsp),%xmm8 - paddd 80(%rsp),%xmm9 - paddd 96(%rsp),%xmm10 - paddd 112(%rsp),%xmm11 - - movdqa %xmm8,%xmm6 - punpckldq %xmm9,%xmm8 - movdqa %xmm10,%xmm7 - punpckldq %xmm11,%xmm10 - punpckhdq %xmm9,%xmm6 - punpckhdq %xmm11,%xmm7 - movdqa %xmm8,%xmm9 - 
punpcklqdq %xmm10,%xmm8 - movdqa %xmm6,%xmm11 - punpcklqdq %xmm7,%xmm6 - punpckhqdq %xmm10,%xmm9 - punpckhqdq %xmm7,%xmm11 - paddd 128-256(%rcx),%xmm12 - paddd 144-256(%rcx),%xmm13 - paddd 160-256(%rcx),%xmm14 - paddd 176-256(%rcx),%xmm15 - - movdqa %xmm8,0(%rsp) - movdqa %xmm9,16(%rsp) - movdqa 32(%rsp),%xmm8 - movdqa 48(%rsp),%xmm9 - - movdqa %xmm12,%xmm10 - punpckldq %xmm13,%xmm12 - movdqa %xmm14,%xmm7 - punpckldq %xmm15,%xmm14 - punpckhdq %xmm13,%xmm10 - punpckhdq %xmm15,%xmm7 - movdqa %xmm12,%xmm13 - punpcklqdq %xmm14,%xmm12 - movdqa %xmm10,%xmm15 - punpcklqdq %xmm7,%xmm10 - punpckhqdq %xmm14,%xmm13 - punpckhqdq %xmm7,%xmm15 - paddd 192-256(%rcx),%xmm4 - paddd 208-256(%rcx),%xmm5 - paddd 224-256(%rcx),%xmm8 - paddd 240-256(%rcx),%xmm9 - - movdqa %xmm6,32(%rsp) - movdqa %xmm11,48(%rsp) - - movdqa %xmm4,%xmm14 - punpckldq %xmm5,%xmm4 - movdqa %xmm8,%xmm7 - punpckldq %xmm9,%xmm8 - punpckhdq %xmm5,%xmm14 - punpckhdq %xmm9,%xmm7 - movdqa %xmm4,%xmm5 - punpcklqdq %xmm8,%xmm4 - movdqa %xmm14,%xmm9 - punpcklqdq %xmm7,%xmm14 - punpckhqdq %xmm8,%xmm5 - punpckhqdq %xmm7,%xmm9 - paddd 256-256(%rcx),%xmm0 - paddd 272-256(%rcx),%xmm1 - paddd 288-256(%rcx),%xmm2 - paddd 304-256(%rcx),%xmm3 - - movdqa %xmm0,%xmm8 - punpckldq %xmm1,%xmm0 - movdqa %xmm2,%xmm7 - punpckldq %xmm3,%xmm2 - punpckhdq %xmm1,%xmm8 - punpckhdq %xmm3,%xmm7 - movdqa %xmm0,%xmm1 - punpcklqdq %xmm2,%xmm0 - movdqa %xmm8,%xmm3 - punpcklqdq %xmm7,%xmm8 - punpckhqdq %xmm2,%xmm1 - punpckhqdq %xmm7,%xmm3 - cmpq $256,%rdx - jb L$tail4x - - movdqu 0(%rsi),%xmm6 - movdqu 16(%rsi),%xmm11 - movdqu 32(%rsi),%xmm2 - movdqu 48(%rsi),%xmm7 - pxor 0(%rsp),%xmm6 - pxor %xmm12,%xmm11 - pxor %xmm4,%xmm2 - pxor %xmm0,%xmm7 - - movdqu %xmm6,0(%rdi) - movdqu 64(%rsi),%xmm6 - movdqu %xmm11,16(%rdi) - movdqu 80(%rsi),%xmm11 - movdqu %xmm2,32(%rdi) - movdqu 96(%rsi),%xmm2 - movdqu %xmm7,48(%rdi) - movdqu 112(%rsi),%xmm7 - leaq 128(%rsi),%rsi - pxor 16(%rsp),%xmm6 - pxor %xmm13,%xmm11 - pxor %xmm5,%xmm2 - pxor %xmm1,%xmm7 - - movdqu 
%xmm6,64(%rdi) - movdqu 0(%rsi),%xmm6 - movdqu %xmm11,80(%rdi) - movdqu 16(%rsi),%xmm11 - movdqu %xmm2,96(%rdi) - movdqu 32(%rsi),%xmm2 - movdqu %xmm7,112(%rdi) - leaq 128(%rdi),%rdi - movdqu 48(%rsi),%xmm7 - pxor 32(%rsp),%xmm6 - pxor %xmm10,%xmm11 - pxor %xmm14,%xmm2 - pxor %xmm8,%xmm7 - - movdqu %xmm6,0(%rdi) - movdqu 64(%rsi),%xmm6 - movdqu %xmm11,16(%rdi) - movdqu 80(%rsi),%xmm11 - movdqu %xmm2,32(%rdi) - movdqu 96(%rsi),%xmm2 - movdqu %xmm7,48(%rdi) - movdqu 112(%rsi),%xmm7 - leaq 128(%rsi),%rsi - pxor 48(%rsp),%xmm6 - pxor %xmm15,%xmm11 - pxor %xmm9,%xmm2 - pxor %xmm3,%xmm7 - movdqu %xmm6,64(%rdi) - movdqu %xmm11,80(%rdi) - movdqu %xmm2,96(%rdi) - movdqu %xmm7,112(%rdi) - leaq 128(%rdi),%rdi - - subq $256,%rdx - jnz L$oop_outer4x - - jmp L$done4x - -L$tail4x: - cmpq $192,%rdx - jae L$192_or_more4x - cmpq $128,%rdx - jae L$128_or_more4x - cmpq $64,%rdx - jae L$64_or_more4x - - - xorq %r10,%r10 - - movdqa %xmm12,16(%rsp) - movdqa %xmm4,32(%rsp) - movdqa %xmm0,48(%rsp) - jmp L$oop_tail4x - -.p2align 5 -L$64_or_more4x: - movdqu 0(%rsi),%xmm6 - movdqu 16(%rsi),%xmm11 - movdqu 32(%rsi),%xmm2 - movdqu 48(%rsi),%xmm7 - pxor 0(%rsp),%xmm6 - pxor %xmm12,%xmm11 - pxor %xmm4,%xmm2 - pxor %xmm0,%xmm7 - movdqu %xmm6,0(%rdi) - movdqu %xmm11,16(%rdi) - movdqu %xmm2,32(%rdi) - movdqu %xmm7,48(%rdi) - je L$done4x - - movdqa 16(%rsp),%xmm6 - leaq 64(%rsi),%rsi - xorq %r10,%r10 - movdqa %xmm6,0(%rsp) - movdqa %xmm13,16(%rsp) - leaq 64(%rdi),%rdi - movdqa %xmm5,32(%rsp) - subq $64,%rdx - movdqa %xmm1,48(%rsp) - jmp L$oop_tail4x - -.p2align 5 -L$128_or_more4x: - movdqu 0(%rsi),%xmm6 - movdqu 16(%rsi),%xmm11 - movdqu 32(%rsi),%xmm2 - movdqu 48(%rsi),%xmm7 - pxor 0(%rsp),%xmm6 - pxor %xmm12,%xmm11 - pxor %xmm4,%xmm2 - pxor %xmm0,%xmm7 - - movdqu %xmm6,0(%rdi) - movdqu 64(%rsi),%xmm6 - movdqu %xmm11,16(%rdi) - movdqu 80(%rsi),%xmm11 - movdqu %xmm2,32(%rdi) - movdqu 96(%rsi),%xmm2 - movdqu %xmm7,48(%rdi) - movdqu 112(%rsi),%xmm7 - pxor 16(%rsp),%xmm6 - pxor %xmm13,%xmm11 - pxor 
%xmm5,%xmm2 - pxor %xmm1,%xmm7 - movdqu %xmm6,64(%rdi) - movdqu %xmm11,80(%rdi) - movdqu %xmm2,96(%rdi) - movdqu %xmm7,112(%rdi) - je L$done4x - - movdqa 32(%rsp),%xmm6 - leaq 128(%rsi),%rsi - xorq %r10,%r10 - movdqa %xmm6,0(%rsp) - movdqa %xmm10,16(%rsp) - leaq 128(%rdi),%rdi - movdqa %xmm14,32(%rsp) - subq $128,%rdx - movdqa %xmm8,48(%rsp) - jmp L$oop_tail4x - -.p2align 5 -L$192_or_more4x: - movdqu 0(%rsi),%xmm6 - movdqu 16(%rsi),%xmm11 - movdqu 32(%rsi),%xmm2 - movdqu 48(%rsi),%xmm7 - pxor 0(%rsp),%xmm6 - pxor %xmm12,%xmm11 - pxor %xmm4,%xmm2 - pxor %xmm0,%xmm7 - - movdqu %xmm6,0(%rdi) - movdqu 64(%rsi),%xmm6 - movdqu %xmm11,16(%rdi) - movdqu 80(%rsi),%xmm11 - movdqu %xmm2,32(%rdi) - movdqu 96(%rsi),%xmm2 - movdqu %xmm7,48(%rdi) - movdqu 112(%rsi),%xmm7 - leaq 128(%rsi),%rsi - pxor 16(%rsp),%xmm6 - pxor %xmm13,%xmm11 - pxor %xmm5,%xmm2 - pxor %xmm1,%xmm7 - - movdqu %xmm6,64(%rdi) - movdqu 0(%rsi),%xmm6 - movdqu %xmm11,80(%rdi) - movdqu 16(%rsi),%xmm11 - movdqu %xmm2,96(%rdi) - movdqu 32(%rsi),%xmm2 - movdqu %xmm7,112(%rdi) - leaq 128(%rdi),%rdi - movdqu 48(%rsi),%xmm7 - pxor 32(%rsp),%xmm6 - pxor %xmm10,%xmm11 - pxor %xmm14,%xmm2 - pxor %xmm8,%xmm7 - movdqu %xmm6,0(%rdi) - movdqu %xmm11,16(%rdi) - movdqu %xmm2,32(%rdi) - movdqu %xmm7,48(%rdi) - je L$done4x - - movdqa 48(%rsp),%xmm6 - leaq 64(%rsi),%rsi - xorq %r10,%r10 - movdqa %xmm6,0(%rsp) - movdqa %xmm15,16(%rsp) - leaq 64(%rdi),%rdi - movdqa %xmm9,32(%rsp) - subq $192,%rdx - movdqa %xmm3,48(%rsp) - -L$oop_tail4x: - movzbl (%rsi,%r10,1),%eax - movzbl (%rsp,%r10,1),%ecx - leaq 1(%r10),%r10 - xorl %ecx,%eax - movb %al,-1(%rdi,%r10,1) - decq %rdx - jnz L$oop_tail4x - -L$done4x: - leaq (%r9),%rsp - -L$4x_epilogue: - .byte 0xf3,0xc3 - - - -.p2align 5 -ChaCha20_8x: -L$ChaCha20_8x: - - movq %rsp,%r9 - - subq $0x280+8,%rsp - andq $-32,%rsp - vzeroupper - - - - - - - - - - - vbroadcasti128 L$sigma(%rip),%ymm11 - vbroadcasti128 (%rcx),%ymm3 - vbroadcasti128 16(%rcx),%ymm15 - vbroadcasti128 (%r8),%ymm7 - leaq 
256(%rsp),%rcx - leaq 512(%rsp),%rax - leaq L$rot16(%rip),%r10 - leaq L$rot24(%rip),%r11 - - vpshufd $0x00,%ymm11,%ymm8 - vpshufd $0x55,%ymm11,%ymm9 - vmovdqa %ymm8,128-256(%rcx) - vpshufd $0xaa,%ymm11,%ymm10 - vmovdqa %ymm9,160-256(%rcx) - vpshufd $0xff,%ymm11,%ymm11 - vmovdqa %ymm10,192-256(%rcx) - vmovdqa %ymm11,224-256(%rcx) - - vpshufd $0x00,%ymm3,%ymm0 - vpshufd $0x55,%ymm3,%ymm1 - vmovdqa %ymm0,256-256(%rcx) - vpshufd $0xaa,%ymm3,%ymm2 - vmovdqa %ymm1,288-256(%rcx) - vpshufd $0xff,%ymm3,%ymm3 - vmovdqa %ymm2,320-256(%rcx) - vmovdqa %ymm3,352-256(%rcx) - - vpshufd $0x00,%ymm15,%ymm12 - vpshufd $0x55,%ymm15,%ymm13 - vmovdqa %ymm12,384-512(%rax) - vpshufd $0xaa,%ymm15,%ymm14 - vmovdqa %ymm13,416-512(%rax) - vpshufd $0xff,%ymm15,%ymm15 - vmovdqa %ymm14,448-512(%rax) - vmovdqa %ymm15,480-512(%rax) - - vpshufd $0x00,%ymm7,%ymm4 - vpshufd $0x55,%ymm7,%ymm5 - vpaddd L$incy(%rip),%ymm4,%ymm4 - vpshufd $0xaa,%ymm7,%ymm6 - vmovdqa %ymm5,544-512(%rax) - vpshufd $0xff,%ymm7,%ymm7 - vmovdqa %ymm6,576-512(%rax) - vmovdqa %ymm7,608-512(%rax) - - jmp L$oop_enter8x - -.p2align 5 -L$oop_outer8x: - vmovdqa 128-256(%rcx),%ymm8 - vmovdqa 160-256(%rcx),%ymm9 - vmovdqa 192-256(%rcx),%ymm10 - vmovdqa 224-256(%rcx),%ymm11 - vmovdqa 256-256(%rcx),%ymm0 - vmovdqa 288-256(%rcx),%ymm1 - vmovdqa 320-256(%rcx),%ymm2 - vmovdqa 352-256(%rcx),%ymm3 - vmovdqa 384-512(%rax),%ymm12 - vmovdqa 416-512(%rax),%ymm13 - vmovdqa 448-512(%rax),%ymm14 - vmovdqa 480-512(%rax),%ymm15 - vmovdqa 512-512(%rax),%ymm4 - vmovdqa 544-512(%rax),%ymm5 - vmovdqa 576-512(%rax),%ymm6 - vmovdqa 608-512(%rax),%ymm7 - vpaddd L$eight(%rip),%ymm4,%ymm4 - -L$oop_enter8x: - vmovdqa %ymm14,64(%rsp) - vmovdqa %ymm15,96(%rsp) - vbroadcasti128 (%r10),%ymm15 - vmovdqa %ymm4,512-512(%rax) - movl $10,%eax - jmp L$oop8x - -.p2align 5 -L$oop8x: - vpaddd %ymm0,%ymm8,%ymm8 - vpxor %ymm4,%ymm8,%ymm4 - vpshufb %ymm15,%ymm4,%ymm4 - vpaddd %ymm1,%ymm9,%ymm9 - vpxor %ymm5,%ymm9,%ymm5 - vpshufb %ymm15,%ymm5,%ymm5 - vpaddd %ymm4,%ymm12,%ymm12 
- vpxor %ymm0,%ymm12,%ymm0 - vpslld $12,%ymm0,%ymm14 - vpsrld $20,%ymm0,%ymm0 - vpor %ymm0,%ymm14,%ymm0 - vbroadcasti128 (%r11),%ymm14 - vpaddd %ymm5,%ymm13,%ymm13 - vpxor %ymm1,%ymm13,%ymm1 - vpslld $12,%ymm1,%ymm15 - vpsrld $20,%ymm1,%ymm1 - vpor %ymm1,%ymm15,%ymm1 - vpaddd %ymm0,%ymm8,%ymm8 - vpxor %ymm4,%ymm8,%ymm4 - vpshufb %ymm14,%ymm4,%ymm4 - vpaddd %ymm1,%ymm9,%ymm9 - vpxor %ymm5,%ymm9,%ymm5 - vpshufb %ymm14,%ymm5,%ymm5 - vpaddd %ymm4,%ymm12,%ymm12 - vpxor %ymm0,%ymm12,%ymm0 - vpslld $7,%ymm0,%ymm15 - vpsrld $25,%ymm0,%ymm0 - vpor %ymm0,%ymm15,%ymm0 - vbroadcasti128 (%r10),%ymm15 - vpaddd %ymm5,%ymm13,%ymm13 - vpxor %ymm1,%ymm13,%ymm1 - vpslld $7,%ymm1,%ymm14 - vpsrld $25,%ymm1,%ymm1 - vpor %ymm1,%ymm14,%ymm1 - vmovdqa %ymm12,0(%rsp) - vmovdqa %ymm13,32(%rsp) - vmovdqa 64(%rsp),%ymm12 - vmovdqa 96(%rsp),%ymm13 - vpaddd %ymm2,%ymm10,%ymm10 - vpxor %ymm6,%ymm10,%ymm6 - vpshufb %ymm15,%ymm6,%ymm6 - vpaddd %ymm3,%ymm11,%ymm11 - vpxor %ymm7,%ymm11,%ymm7 - vpshufb %ymm15,%ymm7,%ymm7 - vpaddd %ymm6,%ymm12,%ymm12 - vpxor %ymm2,%ymm12,%ymm2 - vpslld $12,%ymm2,%ymm14 - vpsrld $20,%ymm2,%ymm2 - vpor %ymm2,%ymm14,%ymm2 - vbroadcasti128 (%r11),%ymm14 - vpaddd %ymm7,%ymm13,%ymm13 - vpxor %ymm3,%ymm13,%ymm3 - vpslld $12,%ymm3,%ymm15 - vpsrld $20,%ymm3,%ymm3 - vpor %ymm3,%ymm15,%ymm3 - vpaddd %ymm2,%ymm10,%ymm10 - vpxor %ymm6,%ymm10,%ymm6 - vpshufb %ymm14,%ymm6,%ymm6 - vpaddd %ymm3,%ymm11,%ymm11 - vpxor %ymm7,%ymm11,%ymm7 - vpshufb %ymm14,%ymm7,%ymm7 - vpaddd %ymm6,%ymm12,%ymm12 - vpxor %ymm2,%ymm12,%ymm2 - vpslld $7,%ymm2,%ymm15 - vpsrld $25,%ymm2,%ymm2 - vpor %ymm2,%ymm15,%ymm2 - vbroadcasti128 (%r10),%ymm15 - vpaddd %ymm7,%ymm13,%ymm13 - vpxor %ymm3,%ymm13,%ymm3 - vpslld $7,%ymm3,%ymm14 - vpsrld $25,%ymm3,%ymm3 - vpor %ymm3,%ymm14,%ymm3 - vpaddd %ymm1,%ymm8,%ymm8 - vpxor %ymm7,%ymm8,%ymm7 - vpshufb %ymm15,%ymm7,%ymm7 - vpaddd %ymm2,%ymm9,%ymm9 - vpxor %ymm4,%ymm9,%ymm4 - vpshufb %ymm15,%ymm4,%ymm4 - vpaddd %ymm7,%ymm12,%ymm12 - vpxor %ymm1,%ymm12,%ymm1 - vpslld 
$12,%ymm1,%ymm14 - vpsrld $20,%ymm1,%ymm1 - vpor %ymm1,%ymm14,%ymm1 - vbroadcasti128 (%r11),%ymm14 - vpaddd %ymm4,%ymm13,%ymm13 - vpxor %ymm2,%ymm13,%ymm2 - vpslld $12,%ymm2,%ymm15 - vpsrld $20,%ymm2,%ymm2 - vpor %ymm2,%ymm15,%ymm2 - vpaddd %ymm1,%ymm8,%ymm8 - vpxor %ymm7,%ymm8,%ymm7 - vpshufb %ymm14,%ymm7,%ymm7 - vpaddd %ymm2,%ymm9,%ymm9 - vpxor %ymm4,%ymm9,%ymm4 - vpshufb %ymm14,%ymm4,%ymm4 - vpaddd %ymm7,%ymm12,%ymm12 - vpxor %ymm1,%ymm12,%ymm1 - vpslld $7,%ymm1,%ymm15 - vpsrld $25,%ymm1,%ymm1 - vpor %ymm1,%ymm15,%ymm1 - vbroadcasti128 (%r10),%ymm15 - vpaddd %ymm4,%ymm13,%ymm13 - vpxor %ymm2,%ymm13,%ymm2 - vpslld $7,%ymm2,%ymm14 - vpsrld $25,%ymm2,%ymm2 - vpor %ymm2,%ymm14,%ymm2 - vmovdqa %ymm12,64(%rsp) - vmovdqa %ymm13,96(%rsp) - vmovdqa 0(%rsp),%ymm12 - vmovdqa 32(%rsp),%ymm13 - vpaddd %ymm3,%ymm10,%ymm10 - vpxor %ymm5,%ymm10,%ymm5 - vpshufb %ymm15,%ymm5,%ymm5 - vpaddd %ymm0,%ymm11,%ymm11 - vpxor %ymm6,%ymm11,%ymm6 - vpshufb %ymm15,%ymm6,%ymm6 - vpaddd %ymm5,%ymm12,%ymm12 - vpxor %ymm3,%ymm12,%ymm3 - vpslld $12,%ymm3,%ymm14 - vpsrld $20,%ymm3,%ymm3 - vpor %ymm3,%ymm14,%ymm3 - vbroadcasti128 (%r11),%ymm14 - vpaddd %ymm6,%ymm13,%ymm13 - vpxor %ymm0,%ymm13,%ymm0 - vpslld $12,%ymm0,%ymm15 - vpsrld $20,%ymm0,%ymm0 - vpor %ymm0,%ymm15,%ymm0 - vpaddd %ymm3,%ymm10,%ymm10 - vpxor %ymm5,%ymm10,%ymm5 - vpshufb %ymm14,%ymm5,%ymm5 - vpaddd %ymm0,%ymm11,%ymm11 - vpxor %ymm6,%ymm11,%ymm6 - vpshufb %ymm14,%ymm6,%ymm6 - vpaddd %ymm5,%ymm12,%ymm12 - vpxor %ymm3,%ymm12,%ymm3 - vpslld $7,%ymm3,%ymm15 - vpsrld $25,%ymm3,%ymm3 - vpor %ymm3,%ymm15,%ymm3 - vbroadcasti128 (%r10),%ymm15 - vpaddd %ymm6,%ymm13,%ymm13 - vpxor %ymm0,%ymm13,%ymm0 - vpslld $7,%ymm0,%ymm14 - vpsrld $25,%ymm0,%ymm0 - vpor %ymm0,%ymm14,%ymm0 - decl %eax - jnz L$oop8x - - leaq 512(%rsp),%rax - vpaddd 128-256(%rcx),%ymm8,%ymm8 - vpaddd 160-256(%rcx),%ymm9,%ymm9 - vpaddd 192-256(%rcx),%ymm10,%ymm10 - vpaddd 224-256(%rcx),%ymm11,%ymm11 - - vpunpckldq %ymm9,%ymm8,%ymm14 - vpunpckldq %ymm11,%ymm10,%ymm15 - 
vpunpckhdq %ymm9,%ymm8,%ymm8 - vpunpckhdq %ymm11,%ymm10,%ymm10 - vpunpcklqdq %ymm15,%ymm14,%ymm9 - vpunpckhqdq %ymm15,%ymm14,%ymm14 - vpunpcklqdq %ymm10,%ymm8,%ymm11 - vpunpckhqdq %ymm10,%ymm8,%ymm8 - vpaddd 256-256(%rcx),%ymm0,%ymm0 - vpaddd 288-256(%rcx),%ymm1,%ymm1 - vpaddd 320-256(%rcx),%ymm2,%ymm2 - vpaddd 352-256(%rcx),%ymm3,%ymm3 - - vpunpckldq %ymm1,%ymm0,%ymm10 - vpunpckldq %ymm3,%ymm2,%ymm15 - vpunpckhdq %ymm1,%ymm0,%ymm0 - vpunpckhdq %ymm3,%ymm2,%ymm2 - vpunpcklqdq %ymm15,%ymm10,%ymm1 - vpunpckhqdq %ymm15,%ymm10,%ymm10 - vpunpcklqdq %ymm2,%ymm0,%ymm3 - vpunpckhqdq %ymm2,%ymm0,%ymm0 - vperm2i128 $0x20,%ymm1,%ymm9,%ymm15 - vperm2i128 $0x31,%ymm1,%ymm9,%ymm1 - vperm2i128 $0x20,%ymm10,%ymm14,%ymm9 - vperm2i128 $0x31,%ymm10,%ymm14,%ymm10 - vperm2i128 $0x20,%ymm3,%ymm11,%ymm14 - vperm2i128 $0x31,%ymm3,%ymm11,%ymm3 - vperm2i128 $0x20,%ymm0,%ymm8,%ymm11 - vperm2i128 $0x31,%ymm0,%ymm8,%ymm0 - vmovdqa %ymm15,0(%rsp) - vmovdqa %ymm9,32(%rsp) - vmovdqa 64(%rsp),%ymm15 - vmovdqa 96(%rsp),%ymm9 - - vpaddd 384-512(%rax),%ymm12,%ymm12 - vpaddd 416-512(%rax),%ymm13,%ymm13 - vpaddd 448-512(%rax),%ymm15,%ymm15 - vpaddd 480-512(%rax),%ymm9,%ymm9 - - vpunpckldq %ymm13,%ymm12,%ymm2 - vpunpckldq %ymm9,%ymm15,%ymm8 - vpunpckhdq %ymm13,%ymm12,%ymm12 - vpunpckhdq %ymm9,%ymm15,%ymm15 - vpunpcklqdq %ymm8,%ymm2,%ymm13 - vpunpckhqdq %ymm8,%ymm2,%ymm2 - vpunpcklqdq %ymm15,%ymm12,%ymm9 - vpunpckhqdq %ymm15,%ymm12,%ymm12 - vpaddd 512-512(%rax),%ymm4,%ymm4 - vpaddd 544-512(%rax),%ymm5,%ymm5 - vpaddd 576-512(%rax),%ymm6,%ymm6 - vpaddd 608-512(%rax),%ymm7,%ymm7 - - vpunpckldq %ymm5,%ymm4,%ymm15 - vpunpckldq %ymm7,%ymm6,%ymm8 - vpunpckhdq %ymm5,%ymm4,%ymm4 - vpunpckhdq %ymm7,%ymm6,%ymm6 - vpunpcklqdq %ymm8,%ymm15,%ymm5 - vpunpckhqdq %ymm8,%ymm15,%ymm15 - vpunpcklqdq %ymm6,%ymm4,%ymm7 - vpunpckhqdq %ymm6,%ymm4,%ymm4 - vperm2i128 $0x20,%ymm5,%ymm13,%ymm8 - vperm2i128 $0x31,%ymm5,%ymm13,%ymm5 - vperm2i128 $0x20,%ymm15,%ymm2,%ymm13 - vperm2i128 $0x31,%ymm15,%ymm2,%ymm15 - vperm2i128 
$0x20,%ymm7,%ymm9,%ymm2 - vperm2i128 $0x31,%ymm7,%ymm9,%ymm7 - vperm2i128 $0x20,%ymm4,%ymm12,%ymm9 - vperm2i128 $0x31,%ymm4,%ymm12,%ymm4 - vmovdqa 0(%rsp),%ymm6 - vmovdqa 32(%rsp),%ymm12 - - cmpq $512,%rdx - jb L$tail8x - - vpxor 0(%rsi),%ymm6,%ymm6 - vpxor 32(%rsi),%ymm8,%ymm8 - vpxor 64(%rsi),%ymm1,%ymm1 - vpxor 96(%rsi),%ymm5,%ymm5 - leaq 128(%rsi),%rsi - vmovdqu %ymm6,0(%rdi) - vmovdqu %ymm8,32(%rdi) - vmovdqu %ymm1,64(%rdi) - vmovdqu %ymm5,96(%rdi) - leaq 128(%rdi),%rdi - - vpxor 0(%rsi),%ymm12,%ymm12 - vpxor 32(%rsi),%ymm13,%ymm13 - vpxor 64(%rsi),%ymm10,%ymm10 - vpxor 96(%rsi),%ymm15,%ymm15 - leaq 128(%rsi),%rsi - vmovdqu %ymm12,0(%rdi) - vmovdqu %ymm13,32(%rdi) - vmovdqu %ymm10,64(%rdi) - vmovdqu %ymm15,96(%rdi) - leaq 128(%rdi),%rdi - - vpxor 0(%rsi),%ymm14,%ymm14 - vpxor 32(%rsi),%ymm2,%ymm2 - vpxor 64(%rsi),%ymm3,%ymm3 - vpxor 96(%rsi),%ymm7,%ymm7 - leaq 128(%rsi),%rsi - vmovdqu %ymm14,0(%rdi) - vmovdqu %ymm2,32(%rdi) - vmovdqu %ymm3,64(%rdi) - vmovdqu %ymm7,96(%rdi) - leaq 128(%rdi),%rdi - - vpxor 0(%rsi),%ymm11,%ymm11 - vpxor 32(%rsi),%ymm9,%ymm9 - vpxor 64(%rsi),%ymm0,%ymm0 - vpxor 96(%rsi),%ymm4,%ymm4 - leaq 128(%rsi),%rsi - vmovdqu %ymm11,0(%rdi) - vmovdqu %ymm9,32(%rdi) - vmovdqu %ymm0,64(%rdi) - vmovdqu %ymm4,96(%rdi) - leaq 128(%rdi),%rdi - - subq $512,%rdx - jnz L$oop_outer8x - - jmp L$done8x - -L$tail8x: - cmpq $448,%rdx - jae L$448_or_more8x - cmpq $384,%rdx - jae L$384_or_more8x - cmpq $320,%rdx - jae L$320_or_more8x - cmpq $256,%rdx - jae L$256_or_more8x - cmpq $192,%rdx - jae L$192_or_more8x - cmpq $128,%rdx - jae L$128_or_more8x - cmpq $64,%rdx - jae L$64_or_more8x - - xorq %r10,%r10 - vmovdqa %ymm6,0(%rsp) - vmovdqa %ymm8,32(%rsp) - jmp L$oop_tail8x - -.p2align 5 -L$64_or_more8x: - vpxor 0(%rsi),%ymm6,%ymm6 - vpxor 32(%rsi),%ymm8,%ymm8 - vmovdqu %ymm6,0(%rdi) - vmovdqu %ymm8,32(%rdi) - je L$done8x - - leaq 64(%rsi),%rsi - xorq %r10,%r10 - vmovdqa %ymm1,0(%rsp) - leaq 64(%rdi),%rdi - subq $64,%rdx - vmovdqa %ymm5,32(%rsp) - jmp 
L$oop_tail8x - -.p2align 5 -L$128_or_more8x: - vpxor 0(%rsi),%ymm6,%ymm6 - vpxor 32(%rsi),%ymm8,%ymm8 - vpxor 64(%rsi),%ymm1,%ymm1 - vpxor 96(%rsi),%ymm5,%ymm5 - vmovdqu %ymm6,0(%rdi) - vmovdqu %ymm8,32(%rdi) - vmovdqu %ymm1,64(%rdi) - vmovdqu %ymm5,96(%rdi) - je L$done8x - - leaq 128(%rsi),%rsi - xorq %r10,%r10 - vmovdqa %ymm12,0(%rsp) - leaq 128(%rdi),%rdi - subq $128,%rdx - vmovdqa %ymm13,32(%rsp) - jmp L$oop_tail8x - -.p2align 5 -L$192_or_more8x: - vpxor 0(%rsi),%ymm6,%ymm6 - vpxor 32(%rsi),%ymm8,%ymm8 - vpxor 64(%rsi),%ymm1,%ymm1 - vpxor 96(%rsi),%ymm5,%ymm5 - vpxor 128(%rsi),%ymm12,%ymm12 - vpxor 160(%rsi),%ymm13,%ymm13 - vmovdqu %ymm6,0(%rdi) - vmovdqu %ymm8,32(%rdi) - vmovdqu %ymm1,64(%rdi) - vmovdqu %ymm5,96(%rdi) - vmovdqu %ymm12,128(%rdi) - vmovdqu %ymm13,160(%rdi) - je L$done8x - - leaq 192(%rsi),%rsi - xorq %r10,%r10 - vmovdqa %ymm10,0(%rsp) - leaq 192(%rdi),%rdi - subq $192,%rdx - vmovdqa %ymm15,32(%rsp) - jmp L$oop_tail8x - -.p2align 5 -L$256_or_more8x: - vpxor 0(%rsi),%ymm6,%ymm6 - vpxor 32(%rsi),%ymm8,%ymm8 - vpxor 64(%rsi),%ymm1,%ymm1 - vpxor 96(%rsi),%ymm5,%ymm5 - vpxor 128(%rsi),%ymm12,%ymm12 - vpxor 160(%rsi),%ymm13,%ymm13 - vpxor 192(%rsi),%ymm10,%ymm10 - vpxor 224(%rsi),%ymm15,%ymm15 - vmovdqu %ymm6,0(%rdi) - vmovdqu %ymm8,32(%rdi) - vmovdqu %ymm1,64(%rdi) - vmovdqu %ymm5,96(%rdi) - vmovdqu %ymm12,128(%rdi) - vmovdqu %ymm13,160(%rdi) - vmovdqu %ymm10,192(%rdi) - vmovdqu %ymm15,224(%rdi) - je L$done8x - - leaq 256(%rsi),%rsi - xorq %r10,%r10 - vmovdqa %ymm14,0(%rsp) - leaq 256(%rdi),%rdi - subq $256,%rdx - vmovdqa %ymm2,32(%rsp) - jmp L$oop_tail8x - -.p2align 5 -L$320_or_more8x: - vpxor 0(%rsi),%ymm6,%ymm6 - vpxor 32(%rsi),%ymm8,%ymm8 - vpxor 64(%rsi),%ymm1,%ymm1 - vpxor 96(%rsi),%ymm5,%ymm5 - vpxor 128(%rsi),%ymm12,%ymm12 - vpxor 160(%rsi),%ymm13,%ymm13 - vpxor 192(%rsi),%ymm10,%ymm10 - vpxor 224(%rsi),%ymm15,%ymm15 - vpxor 256(%rsi),%ymm14,%ymm14 - vpxor 288(%rsi),%ymm2,%ymm2 - vmovdqu %ymm6,0(%rdi) - vmovdqu %ymm8,32(%rdi) - vmovdqu 
%ymm1,64(%rdi) - vmovdqu %ymm5,96(%rdi) - vmovdqu %ymm12,128(%rdi) - vmovdqu %ymm13,160(%rdi) - vmovdqu %ymm10,192(%rdi) - vmovdqu %ymm15,224(%rdi) - vmovdqu %ymm14,256(%rdi) - vmovdqu %ymm2,288(%rdi) - je L$done8x - - leaq 320(%rsi),%rsi - xorq %r10,%r10 - vmovdqa %ymm3,0(%rsp) - leaq 320(%rdi),%rdi - subq $320,%rdx - vmovdqa %ymm7,32(%rsp) - jmp L$oop_tail8x - -.p2align 5 -L$384_or_more8x: - vpxor 0(%rsi),%ymm6,%ymm6 - vpxor 32(%rsi),%ymm8,%ymm8 - vpxor 64(%rsi),%ymm1,%ymm1 - vpxor 96(%rsi),%ymm5,%ymm5 - vpxor 128(%rsi),%ymm12,%ymm12 - vpxor 160(%rsi),%ymm13,%ymm13 - vpxor 192(%rsi),%ymm10,%ymm10 - vpxor 224(%rsi),%ymm15,%ymm15 - vpxor 256(%rsi),%ymm14,%ymm14 - vpxor 288(%rsi),%ymm2,%ymm2 - vpxor 320(%rsi),%ymm3,%ymm3 - vpxor 352(%rsi),%ymm7,%ymm7 - vmovdqu %ymm6,0(%rdi) - vmovdqu %ymm8,32(%rdi) - vmovdqu %ymm1,64(%rdi) - vmovdqu %ymm5,96(%rdi) - vmovdqu %ymm12,128(%rdi) - vmovdqu %ymm13,160(%rdi) - vmovdqu %ymm10,192(%rdi) - vmovdqu %ymm15,224(%rdi) - vmovdqu %ymm14,256(%rdi) - vmovdqu %ymm2,288(%rdi) - vmovdqu %ymm3,320(%rdi) - vmovdqu %ymm7,352(%rdi) - je L$done8x - - leaq 384(%rsi),%rsi - xorq %r10,%r10 - vmovdqa %ymm11,0(%rsp) - leaq 384(%rdi),%rdi - subq $384,%rdx - vmovdqa %ymm9,32(%rsp) - jmp L$oop_tail8x - -.p2align 5 -L$448_or_more8x: - vpxor 0(%rsi),%ymm6,%ymm6 - vpxor 32(%rsi),%ymm8,%ymm8 - vpxor 64(%rsi),%ymm1,%ymm1 - vpxor 96(%rsi),%ymm5,%ymm5 - vpxor 128(%rsi),%ymm12,%ymm12 - vpxor 160(%rsi),%ymm13,%ymm13 - vpxor 192(%rsi),%ymm10,%ymm10 - vpxor 224(%rsi),%ymm15,%ymm15 - vpxor 256(%rsi),%ymm14,%ymm14 - vpxor 288(%rsi),%ymm2,%ymm2 - vpxor 320(%rsi),%ymm3,%ymm3 - vpxor 352(%rsi),%ymm7,%ymm7 - vpxor 384(%rsi),%ymm11,%ymm11 - vpxor 416(%rsi),%ymm9,%ymm9 - vmovdqu %ymm6,0(%rdi) - vmovdqu %ymm8,32(%rdi) - vmovdqu %ymm1,64(%rdi) - vmovdqu %ymm5,96(%rdi) - vmovdqu %ymm12,128(%rdi) - vmovdqu %ymm13,160(%rdi) - vmovdqu %ymm10,192(%rdi) - vmovdqu %ymm15,224(%rdi) - vmovdqu %ymm14,256(%rdi) - vmovdqu %ymm2,288(%rdi) - vmovdqu %ymm3,320(%rdi) - vmovdqu 
%ymm7,352(%rdi) - vmovdqu %ymm11,384(%rdi) - vmovdqu %ymm9,416(%rdi) - je L$done8x - - leaq 448(%rsi),%rsi - xorq %r10,%r10 - vmovdqa %ymm0,0(%rsp) - leaq 448(%rdi),%rdi - subq $448,%rdx - vmovdqa %ymm4,32(%rsp) - -L$oop_tail8x: - movzbl (%rsi,%r10,1),%eax - movzbl (%rsp,%r10,1),%ecx - leaq 1(%r10),%r10 - xorl %ecx,%eax - movb %al,-1(%rdi,%r10,1) - decq %rdx - jnz L$oop_tail8x - -L$done8x: - vzeroall - leaq (%r9),%rsp - -L$8x_epilogue: - .byte 0xf3,0xc3 - - -#endif diff --git a/third_party/boringssl/apple-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S b/third_party/boringssl/apple-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S deleted file mode 100644 index f988089d..00000000 --- a/third_party/boringssl/apple-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S +++ /dev/null @@ -1,3068 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.data - -.p2align 4 -one: -.quad 1,0 -two: -.quad 2,0 -three: -.quad 3,0 -four: -.quad 4,0 -five: -.quad 5,0 -six: -.quad 6,0 -seven: -.quad 7,0 -eight: -.quad 8,0 - -OR_MASK: -.long 0x00000000,0x00000000,0x00000000,0x80000000 -poly: -.quad 0x1, 0xc200000000000000 -mask: -.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d -con1: -.long 1,1,1,1 -con2: -.long 0x1b,0x1b,0x1b,0x1b -con3: -.byte -1,-1,-1,-1,-1,-1,-1,-1,4,5,6,7,4,5,6,7 -and_mask: -.long 0,0xffffffff, 0xffffffff, 0xffffffff -.text - -.p2align 4 -GFMUL: - - vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2 - vpclmulqdq $0x11,%xmm1,%xmm0,%xmm5 - vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3 - vpclmulqdq $0x01,%xmm1,%xmm0,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpslldq $8,%xmm3,%xmm4 - vpsrldq $8,%xmm3,%xmm3 - vpxor %xmm4,%xmm2,%xmm2 - vpxor %xmm3,%xmm5,%xmm5 - - vpclmulqdq 
$0x10,poly(%rip),%xmm2,%xmm3 - vpshufd $78,%xmm2,%xmm4 - vpxor %xmm4,%xmm3,%xmm2 - - vpclmulqdq $0x10,poly(%rip),%xmm2,%xmm3 - vpshufd $78,%xmm2,%xmm4 - vpxor %xmm4,%xmm3,%xmm2 - - vpxor %xmm5,%xmm2,%xmm0 - .byte 0xf3,0xc3 - - -.globl _aesgcmsiv_htable_init -.private_extern _aesgcmsiv_htable_init - -.p2align 4 -_aesgcmsiv_htable_init: - - vmovdqa (%rsi),%xmm0 - vmovdqa %xmm0,%xmm1 - vmovdqa %xmm0,(%rdi) - call GFMUL - vmovdqa %xmm0,16(%rdi) - call GFMUL - vmovdqa %xmm0,32(%rdi) - call GFMUL - vmovdqa %xmm0,48(%rdi) - call GFMUL - vmovdqa %xmm0,64(%rdi) - call GFMUL - vmovdqa %xmm0,80(%rdi) - call GFMUL - vmovdqa %xmm0,96(%rdi) - call GFMUL - vmovdqa %xmm0,112(%rdi) - .byte 0xf3,0xc3 - - -.globl _aesgcmsiv_htable6_init -.private_extern _aesgcmsiv_htable6_init - -.p2align 4 -_aesgcmsiv_htable6_init: - - vmovdqa (%rsi),%xmm0 - vmovdqa %xmm0,%xmm1 - vmovdqa %xmm0,(%rdi) - call GFMUL - vmovdqa %xmm0,16(%rdi) - call GFMUL - vmovdqa %xmm0,32(%rdi) - call GFMUL - vmovdqa %xmm0,48(%rdi) - call GFMUL - vmovdqa %xmm0,64(%rdi) - call GFMUL - vmovdqa %xmm0,80(%rdi) - .byte 0xf3,0xc3 - - -.globl _aesgcmsiv_htable_polyval -.private_extern _aesgcmsiv_htable_polyval - -.p2align 4 -_aesgcmsiv_htable_polyval: - - testq %rdx,%rdx - jnz L$htable_polyval_start - .byte 0xf3,0xc3 - -L$htable_polyval_start: - vzeroall - - - - movq %rdx,%r11 - andq $127,%r11 - - jz L$htable_polyval_no_prefix - - vpxor %xmm9,%xmm9,%xmm9 - vmovdqa (%rcx),%xmm1 - subq %r11,%rdx - - subq $16,%r11 - - - vmovdqu (%rsi),%xmm0 - vpxor %xmm1,%xmm0,%xmm0 - - vpclmulqdq $0x01,(%rdi,%r11,1),%xmm0,%xmm5 - vpclmulqdq $0x00,(%rdi,%r11,1),%xmm0,%xmm3 - vpclmulqdq $0x11,(%rdi,%r11,1),%xmm0,%xmm4 - vpclmulqdq $0x10,(%rdi,%r11,1),%xmm0,%xmm6 - vpxor %xmm6,%xmm5,%xmm5 - - leaq 16(%rsi),%rsi - testq %r11,%r11 - jnz L$htable_polyval_prefix_loop - jmp L$htable_polyval_prefix_complete - - -.p2align 6 -L$htable_polyval_prefix_loop: - subq $16,%r11 - - vmovdqu (%rsi),%xmm0 - - vpclmulqdq $0x00,(%rdi,%r11,1),%xmm0,%xmm6 - vpxor 
%xmm6,%xmm3,%xmm3 - vpclmulqdq $0x11,(%rdi,%r11,1),%xmm0,%xmm6 - vpxor %xmm6,%xmm4,%xmm4 - vpclmulqdq $0x01,(%rdi,%r11,1),%xmm0,%xmm6 - vpxor %xmm6,%xmm5,%xmm5 - vpclmulqdq $0x10,(%rdi,%r11,1),%xmm0,%xmm6 - vpxor %xmm6,%xmm5,%xmm5 - - testq %r11,%r11 - - leaq 16(%rsi),%rsi - - jnz L$htable_polyval_prefix_loop - -L$htable_polyval_prefix_complete: - vpsrldq $8,%xmm5,%xmm6 - vpslldq $8,%xmm5,%xmm5 - - vpxor %xmm6,%xmm4,%xmm9 - vpxor %xmm5,%xmm3,%xmm1 - - jmp L$htable_polyval_main_loop - -L$htable_polyval_no_prefix: - - - - - vpxor %xmm1,%xmm1,%xmm1 - vmovdqa (%rcx),%xmm9 - -.p2align 6 -L$htable_polyval_main_loop: - subq $0x80,%rdx - jb L$htable_polyval_out - - vmovdqu 112(%rsi),%xmm0 - - vpclmulqdq $0x01,(%rdi),%xmm0,%xmm5 - vpclmulqdq $0x00,(%rdi),%xmm0,%xmm3 - vpclmulqdq $0x11,(%rdi),%xmm0,%xmm4 - vpclmulqdq $0x10,(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm5,%xmm5 - - - vmovdqu 96(%rsi),%xmm0 - vpclmulqdq $0x01,16(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm5,%xmm5 - vpclmulqdq $0x00,16(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm3,%xmm3 - vpclmulqdq $0x11,16(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm4,%xmm4 - vpclmulqdq $0x10,16(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm5,%xmm5 - - - - vmovdqu 80(%rsi),%xmm0 - - vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm7 - vpalignr $8,%xmm1,%xmm1,%xmm1 - - vpclmulqdq $0x01,32(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm5,%xmm5 - vpclmulqdq $0x00,32(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm3,%xmm3 - vpclmulqdq $0x11,32(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm4,%xmm4 - vpclmulqdq $0x10,32(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm5,%xmm5 - - - vpxor %xmm7,%xmm1,%xmm1 - - vmovdqu 64(%rsi),%xmm0 - - vpclmulqdq $0x01,48(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm5,%xmm5 - vpclmulqdq $0x00,48(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm3,%xmm3 - vpclmulqdq $0x11,48(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm4,%xmm4 - vpclmulqdq $0x10,48(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm5,%xmm5 - - - vmovdqu 48(%rsi),%xmm0 - - vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm7 - vpalignr $8,%xmm1,%xmm1,%xmm1 - - vpclmulqdq 
$0x01,64(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm5,%xmm5 - vpclmulqdq $0x00,64(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm3,%xmm3 - vpclmulqdq $0x11,64(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm4,%xmm4 - vpclmulqdq $0x10,64(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm5,%xmm5 - - - vpxor %xmm7,%xmm1,%xmm1 - - vmovdqu 32(%rsi),%xmm0 - - vpclmulqdq $0x01,80(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm5,%xmm5 - vpclmulqdq $0x00,80(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm3,%xmm3 - vpclmulqdq $0x11,80(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm4,%xmm4 - vpclmulqdq $0x10,80(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm5,%xmm5 - - - vpxor %xmm9,%xmm1,%xmm1 - - vmovdqu 16(%rsi),%xmm0 - - vpclmulqdq $0x01,96(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm5,%xmm5 - vpclmulqdq $0x00,96(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm3,%xmm3 - vpclmulqdq $0x11,96(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm4,%xmm4 - vpclmulqdq $0x10,96(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm5,%xmm5 - - - vmovdqu 0(%rsi),%xmm0 - vpxor %xmm1,%xmm0,%xmm0 - - vpclmulqdq $0x01,112(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm5,%xmm5 - vpclmulqdq $0x00,112(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm3,%xmm3 - vpclmulqdq $0x11,112(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm4,%xmm4 - vpclmulqdq $0x10,112(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm5,%xmm5 - - - vpsrldq $8,%xmm5,%xmm6 - vpslldq $8,%xmm5,%xmm5 - - vpxor %xmm6,%xmm4,%xmm9 - vpxor %xmm5,%xmm3,%xmm1 - - leaq 128(%rsi),%rsi - jmp L$htable_polyval_main_loop - - - -L$htable_polyval_out: - vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm6 - vpalignr $8,%xmm1,%xmm1,%xmm1 - vpxor %xmm6,%xmm1,%xmm1 - - vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm6 - vpalignr $8,%xmm1,%xmm1,%xmm1 - vpxor %xmm6,%xmm1,%xmm1 - vpxor %xmm9,%xmm1,%xmm1 - - vmovdqu %xmm1,(%rcx) - vzeroupper - .byte 0xf3,0xc3 - - -.globl _aesgcmsiv_polyval_horner -.private_extern _aesgcmsiv_polyval_horner - -.p2align 4 -_aesgcmsiv_polyval_horner: - - testq %rcx,%rcx - jnz L$polyval_horner_start - .byte 0xf3,0xc3 - -L$polyval_horner_start: - - - - xorq %r10,%r10 - shlq $4,%rcx - - vmovdqa (%rsi),%xmm1 - vmovdqa 
(%rdi),%xmm0 - -L$polyval_horner_loop: - vpxor (%rdx,%r10,1),%xmm0,%xmm0 - call GFMUL - - addq $16,%r10 - cmpq %r10,%rcx - jne L$polyval_horner_loop - - - vmovdqa %xmm0,(%rdi) - .byte 0xf3,0xc3 - - -.globl _aes128gcmsiv_aes_ks -.private_extern _aes128gcmsiv_aes_ks - -.p2align 4 -_aes128gcmsiv_aes_ks: - - vmovdqu (%rdi),%xmm1 - vmovdqa %xmm1,(%rsi) - - vmovdqa con1(%rip),%xmm0 - vmovdqa mask(%rip),%xmm15 - - movq $8,%rax - -L$ks128_loop: - addq $16,%rsi - subq $1,%rax - vpshufb %xmm15,%xmm1,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - vpslld $1,%xmm0,%xmm0 - vpslldq $4,%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpslldq $4,%xmm3,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpslldq $4,%xmm3,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - vmovdqa %xmm1,(%rsi) - jne L$ks128_loop - - vmovdqa con2(%rip),%xmm0 - vpshufb %xmm15,%xmm1,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - vpslld $1,%xmm0,%xmm0 - vpslldq $4,%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpslldq $4,%xmm3,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpslldq $4,%xmm3,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - vmovdqa %xmm1,16(%rsi) - - vpshufb %xmm15,%xmm1,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - vpslldq $4,%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpslldq $4,%xmm3,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpslldq $4,%xmm3,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - vmovdqa %xmm1,32(%rsi) - .byte 0xf3,0xc3 - - -.globl _aes256gcmsiv_aes_ks -.private_extern _aes256gcmsiv_aes_ks - -.p2align 4 -_aes256gcmsiv_aes_ks: - - vmovdqu (%rdi),%xmm1 - vmovdqu 16(%rdi),%xmm3 - vmovdqa %xmm1,(%rsi) - vmovdqa %xmm3,16(%rsi) - vmovdqa con1(%rip),%xmm0 - vmovdqa mask(%rip),%xmm15 - vpxor %xmm14,%xmm14,%xmm14 - movq $6,%rax - -L$ks256_loop: - addq $32,%rsi - subq $1,%rax - vpshufb %xmm15,%xmm3,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - vpslld $1,%xmm0,%xmm0 - vpsllq $32,%xmm1,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpshufb con3(%rip),%xmm1,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - vmovdqa %xmm1,(%rsi) - vpshufd 
$0xff,%xmm1,%xmm2 - vaesenclast %xmm14,%xmm2,%xmm2 - vpsllq $32,%xmm3,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpshufb con3(%rip),%xmm3,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpxor %xmm2,%xmm3,%xmm3 - vmovdqa %xmm3,16(%rsi) - jne L$ks256_loop - - vpshufb %xmm15,%xmm3,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - vpsllq $32,%xmm1,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpshufb con3(%rip),%xmm1,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - vmovdqa %xmm1,32(%rsi) - .byte 0xf3,0xc3 - -.globl _aes128gcmsiv_aes_ks_enc_x1 -.private_extern _aes128gcmsiv_aes_ks_enc_x1 - -.p2align 4 -_aes128gcmsiv_aes_ks_enc_x1: - - vmovdqa (%rcx),%xmm1 - vmovdqa 0(%rdi),%xmm4 - - vmovdqa %xmm1,(%rdx) - vpxor %xmm1,%xmm4,%xmm4 - - vmovdqa con1(%rip),%xmm0 - vmovdqa mask(%rip),%xmm15 - - vpshufb %xmm15,%xmm1,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - vpslld $1,%xmm0,%xmm0 - vpsllq $32,%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpshufb con3(%rip),%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - - vaesenc %xmm1,%xmm4,%xmm4 - vmovdqa %xmm1,16(%rdx) - - vpshufb %xmm15,%xmm1,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - vpslld $1,%xmm0,%xmm0 - vpsllq $32,%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpshufb con3(%rip),%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - - vaesenc %xmm1,%xmm4,%xmm4 - vmovdqa %xmm1,32(%rdx) - - vpshufb %xmm15,%xmm1,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - vpslld $1,%xmm0,%xmm0 - vpsllq $32,%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpshufb con3(%rip),%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - - vaesenc %xmm1,%xmm4,%xmm4 - vmovdqa %xmm1,48(%rdx) - - vpshufb %xmm15,%xmm1,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - vpslld $1,%xmm0,%xmm0 - vpsllq $32,%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpshufb con3(%rip),%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - - vaesenc %xmm1,%xmm4,%xmm4 - vmovdqa %xmm1,64(%rdx) - - vpshufb %xmm15,%xmm1,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - vpslld $1,%xmm0,%xmm0 - vpsllq $32,%xmm1,%xmm3 - vpxor 
%xmm3,%xmm1,%xmm1 - vpshufb con3(%rip),%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - - vaesenc %xmm1,%xmm4,%xmm4 - vmovdqa %xmm1,80(%rdx) - - vpshufb %xmm15,%xmm1,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - vpslld $1,%xmm0,%xmm0 - vpsllq $32,%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpshufb con3(%rip),%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - - vaesenc %xmm1,%xmm4,%xmm4 - vmovdqa %xmm1,96(%rdx) - - vpshufb %xmm15,%xmm1,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - vpslld $1,%xmm0,%xmm0 - vpsllq $32,%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpshufb con3(%rip),%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - - vaesenc %xmm1,%xmm4,%xmm4 - vmovdqa %xmm1,112(%rdx) - - vpshufb %xmm15,%xmm1,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - vpslld $1,%xmm0,%xmm0 - vpsllq $32,%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpshufb con3(%rip),%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - - vaesenc %xmm1,%xmm4,%xmm4 - vmovdqa %xmm1,128(%rdx) - - - vmovdqa con2(%rip),%xmm0 - - vpshufb %xmm15,%xmm1,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - vpslld $1,%xmm0,%xmm0 - vpsllq $32,%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpshufb con3(%rip),%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - - vaesenc %xmm1,%xmm4,%xmm4 - vmovdqa %xmm1,144(%rdx) - - vpshufb %xmm15,%xmm1,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - vpsllq $32,%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpshufb con3(%rip),%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - - vaesenclast %xmm1,%xmm4,%xmm4 - vmovdqa %xmm1,160(%rdx) - - - vmovdqa %xmm4,0(%rsi) - .byte 0xf3,0xc3 - - -.globl _aes128gcmsiv_kdf -.private_extern _aes128gcmsiv_kdf - -.p2align 4 -_aes128gcmsiv_kdf: - - - - - - vmovdqa (%rdx),%xmm1 - vmovdqa 0(%rdi),%xmm9 - vmovdqa and_mask(%rip),%xmm12 - vmovdqa one(%rip),%xmm13 - vpshufd $0x90,%xmm9,%xmm9 - vpand %xmm12,%xmm9,%xmm9 - vpaddd %xmm13,%xmm9,%xmm10 - vpaddd %xmm13,%xmm10,%xmm11 - vpaddd %xmm13,%xmm11,%xmm12 - - vpxor %xmm1,%xmm9,%xmm9 - 
vpxor %xmm1,%xmm10,%xmm10 - vpxor %xmm1,%xmm11,%xmm11 - vpxor %xmm1,%xmm12,%xmm12 - - vmovdqa 16(%rdx),%xmm1 - vaesenc %xmm1,%xmm9,%xmm9 - vaesenc %xmm1,%xmm10,%xmm10 - vaesenc %xmm1,%xmm11,%xmm11 - vaesenc %xmm1,%xmm12,%xmm12 - - vmovdqa 32(%rdx),%xmm2 - vaesenc %xmm2,%xmm9,%xmm9 - vaesenc %xmm2,%xmm10,%xmm10 - vaesenc %xmm2,%xmm11,%xmm11 - vaesenc %xmm2,%xmm12,%xmm12 - - vmovdqa 48(%rdx),%xmm1 - vaesenc %xmm1,%xmm9,%xmm9 - vaesenc %xmm1,%xmm10,%xmm10 - vaesenc %xmm1,%xmm11,%xmm11 - vaesenc %xmm1,%xmm12,%xmm12 - - vmovdqa 64(%rdx),%xmm2 - vaesenc %xmm2,%xmm9,%xmm9 - vaesenc %xmm2,%xmm10,%xmm10 - vaesenc %xmm2,%xmm11,%xmm11 - vaesenc %xmm2,%xmm12,%xmm12 - - vmovdqa 80(%rdx),%xmm1 - vaesenc %xmm1,%xmm9,%xmm9 - vaesenc %xmm1,%xmm10,%xmm10 - vaesenc %xmm1,%xmm11,%xmm11 - vaesenc %xmm1,%xmm12,%xmm12 - - vmovdqa 96(%rdx),%xmm2 - vaesenc %xmm2,%xmm9,%xmm9 - vaesenc %xmm2,%xmm10,%xmm10 - vaesenc %xmm2,%xmm11,%xmm11 - vaesenc %xmm2,%xmm12,%xmm12 - - vmovdqa 112(%rdx),%xmm1 - vaesenc %xmm1,%xmm9,%xmm9 - vaesenc %xmm1,%xmm10,%xmm10 - vaesenc %xmm1,%xmm11,%xmm11 - vaesenc %xmm1,%xmm12,%xmm12 - - vmovdqa 128(%rdx),%xmm2 - vaesenc %xmm2,%xmm9,%xmm9 - vaesenc %xmm2,%xmm10,%xmm10 - vaesenc %xmm2,%xmm11,%xmm11 - vaesenc %xmm2,%xmm12,%xmm12 - - vmovdqa 144(%rdx),%xmm1 - vaesenc %xmm1,%xmm9,%xmm9 - vaesenc %xmm1,%xmm10,%xmm10 - vaesenc %xmm1,%xmm11,%xmm11 - vaesenc %xmm1,%xmm12,%xmm12 - - vmovdqa 160(%rdx),%xmm2 - vaesenclast %xmm2,%xmm9,%xmm9 - vaesenclast %xmm2,%xmm10,%xmm10 - vaesenclast %xmm2,%xmm11,%xmm11 - vaesenclast %xmm2,%xmm12,%xmm12 - - - vmovdqa %xmm9,0(%rsi) - vmovdqa %xmm10,16(%rsi) - vmovdqa %xmm11,32(%rsi) - vmovdqa %xmm12,48(%rsi) - .byte 0xf3,0xc3 - - -.globl _aes128gcmsiv_enc_msg_x4 -.private_extern _aes128gcmsiv_enc_msg_x4 - -.p2align 4 -_aes128gcmsiv_enc_msg_x4: - - testq %r8,%r8 - jnz L$128_enc_msg_x4_start - .byte 0xf3,0xc3 - -L$128_enc_msg_x4_start: - pushq %r12 - - pushq %r13 - - - shrq $4,%r8 - movq %r8,%r10 - shlq $62,%r10 - shrq $62,%r10 - - - vmovdqa 
(%rdx),%xmm15 - vpor OR_MASK(%rip),%xmm15,%xmm15 - - vmovdqu four(%rip),%xmm4 - vmovdqa %xmm15,%xmm0 - vpaddd one(%rip),%xmm15,%xmm1 - vpaddd two(%rip),%xmm15,%xmm2 - vpaddd three(%rip),%xmm15,%xmm3 - - shrq $2,%r8 - je L$128_enc_msg_x4_check_remainder - - subq $64,%rsi - subq $64,%rdi - -L$128_enc_msg_x4_loop1: - addq $64,%rsi - addq $64,%rdi - - vmovdqa %xmm0,%xmm5 - vmovdqa %xmm1,%xmm6 - vmovdqa %xmm2,%xmm7 - vmovdqa %xmm3,%xmm8 - - vpxor (%rcx),%xmm5,%xmm5 - vpxor (%rcx),%xmm6,%xmm6 - vpxor (%rcx),%xmm7,%xmm7 - vpxor (%rcx),%xmm8,%xmm8 - - vmovdqu 16(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc %xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vpaddd %xmm4,%xmm0,%xmm0 - vmovdqu 32(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc %xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vpaddd %xmm4,%xmm1,%xmm1 - vmovdqu 48(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc %xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vpaddd %xmm4,%xmm2,%xmm2 - vmovdqu 64(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc %xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vpaddd %xmm4,%xmm3,%xmm3 - - vmovdqu 80(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc %xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vmovdqu 96(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc %xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vmovdqu 112(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc %xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vmovdqu 128(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc %xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vmovdqu 144(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc %xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vmovdqu 160(%rcx),%xmm12 - vaesenclast 
%xmm12,%xmm5,%xmm5 - vaesenclast %xmm12,%xmm6,%xmm6 - vaesenclast %xmm12,%xmm7,%xmm7 - vaesenclast %xmm12,%xmm8,%xmm8 - - - - vpxor 0(%rdi),%xmm5,%xmm5 - vpxor 16(%rdi),%xmm6,%xmm6 - vpxor 32(%rdi),%xmm7,%xmm7 - vpxor 48(%rdi),%xmm8,%xmm8 - - subq $1,%r8 - - vmovdqu %xmm5,0(%rsi) - vmovdqu %xmm6,16(%rsi) - vmovdqu %xmm7,32(%rsi) - vmovdqu %xmm8,48(%rsi) - - jne L$128_enc_msg_x4_loop1 - - addq $64,%rsi - addq $64,%rdi - -L$128_enc_msg_x4_check_remainder: - cmpq $0,%r10 - je L$128_enc_msg_x4_out - -L$128_enc_msg_x4_loop2: - - - vmovdqa %xmm0,%xmm5 - vpaddd one(%rip),%xmm0,%xmm0 - - vpxor (%rcx),%xmm5,%xmm5 - vaesenc 16(%rcx),%xmm5,%xmm5 - vaesenc 32(%rcx),%xmm5,%xmm5 - vaesenc 48(%rcx),%xmm5,%xmm5 - vaesenc 64(%rcx),%xmm5,%xmm5 - vaesenc 80(%rcx),%xmm5,%xmm5 - vaesenc 96(%rcx),%xmm5,%xmm5 - vaesenc 112(%rcx),%xmm5,%xmm5 - vaesenc 128(%rcx),%xmm5,%xmm5 - vaesenc 144(%rcx),%xmm5,%xmm5 - vaesenclast 160(%rcx),%xmm5,%xmm5 - - - vpxor (%rdi),%xmm5,%xmm5 - vmovdqu %xmm5,(%rsi) - - addq $16,%rdi - addq $16,%rsi - - subq $1,%r10 - jne L$128_enc_msg_x4_loop2 - -L$128_enc_msg_x4_out: - popq %r13 - - popq %r12 - - .byte 0xf3,0xc3 - - -.globl _aes128gcmsiv_enc_msg_x8 -.private_extern _aes128gcmsiv_enc_msg_x8 - -.p2align 4 -_aes128gcmsiv_enc_msg_x8: - - testq %r8,%r8 - jnz L$128_enc_msg_x8_start - .byte 0xf3,0xc3 - -L$128_enc_msg_x8_start: - pushq %r12 - - pushq %r13 - - pushq %rbp - - movq %rsp,%rbp - - - - subq $128,%rsp - andq $-64,%rsp - - shrq $4,%r8 - movq %r8,%r10 - shlq $61,%r10 - shrq $61,%r10 - - - vmovdqu (%rdx),%xmm1 - vpor OR_MASK(%rip),%xmm1,%xmm1 - - - vpaddd seven(%rip),%xmm1,%xmm0 - vmovdqu %xmm0,(%rsp) - vpaddd one(%rip),%xmm1,%xmm9 - vpaddd two(%rip),%xmm1,%xmm10 - vpaddd three(%rip),%xmm1,%xmm11 - vpaddd four(%rip),%xmm1,%xmm12 - vpaddd five(%rip),%xmm1,%xmm13 - vpaddd six(%rip),%xmm1,%xmm14 - vmovdqa %xmm1,%xmm0 - - shrq $3,%r8 - je L$128_enc_msg_x8_check_remainder - - subq $128,%rsi - subq $128,%rdi - -L$128_enc_msg_x8_loop1: - addq $128,%rsi - addq 
$128,%rdi - - vmovdqa %xmm0,%xmm1 - vmovdqa %xmm9,%xmm2 - vmovdqa %xmm10,%xmm3 - vmovdqa %xmm11,%xmm4 - vmovdqa %xmm12,%xmm5 - vmovdqa %xmm13,%xmm6 - vmovdqa %xmm14,%xmm7 - - vmovdqu (%rsp),%xmm8 - - vpxor (%rcx),%xmm1,%xmm1 - vpxor (%rcx),%xmm2,%xmm2 - vpxor (%rcx),%xmm3,%xmm3 - vpxor (%rcx),%xmm4,%xmm4 - vpxor (%rcx),%xmm5,%xmm5 - vpxor (%rcx),%xmm6,%xmm6 - vpxor (%rcx),%xmm7,%xmm7 - vpxor (%rcx),%xmm8,%xmm8 - - vmovdqu 16(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc %xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 - vaesenc %xmm15,%xmm7,%xmm7 - vaesenc %xmm15,%xmm8,%xmm8 - - vmovdqu (%rsp),%xmm14 - vpaddd eight(%rip),%xmm14,%xmm14 - vmovdqu %xmm14,(%rsp) - vmovdqu 32(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc %xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 - vaesenc %xmm15,%xmm7,%xmm7 - vaesenc %xmm15,%xmm8,%xmm8 - - vpsubd one(%rip),%xmm14,%xmm14 - vmovdqu 48(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc %xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 - vaesenc %xmm15,%xmm7,%xmm7 - vaesenc %xmm15,%xmm8,%xmm8 - - vpaddd eight(%rip),%xmm0,%xmm0 - vmovdqu 64(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc %xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 - vaesenc %xmm15,%xmm7,%xmm7 - vaesenc %xmm15,%xmm8,%xmm8 - - vpaddd eight(%rip),%xmm9,%xmm9 - vmovdqu 80(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc %xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 - vaesenc %xmm15,%xmm7,%xmm7 - vaesenc %xmm15,%xmm8,%xmm8 - - vpaddd eight(%rip),%xmm10,%xmm10 - vmovdqu 96(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc 
%xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 - vaesenc %xmm15,%xmm7,%xmm7 - vaesenc %xmm15,%xmm8,%xmm8 - - vpaddd eight(%rip),%xmm11,%xmm11 - vmovdqu 112(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc %xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 - vaesenc %xmm15,%xmm7,%xmm7 - vaesenc %xmm15,%xmm8,%xmm8 - - vpaddd eight(%rip),%xmm12,%xmm12 - vmovdqu 128(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc %xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 - vaesenc %xmm15,%xmm7,%xmm7 - vaesenc %xmm15,%xmm8,%xmm8 - - vpaddd eight(%rip),%xmm13,%xmm13 - vmovdqu 144(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc %xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 - vaesenc %xmm15,%xmm7,%xmm7 - vaesenc %xmm15,%xmm8,%xmm8 - - vmovdqu 160(%rcx),%xmm15 - vaesenclast %xmm15,%xmm1,%xmm1 - vaesenclast %xmm15,%xmm2,%xmm2 - vaesenclast %xmm15,%xmm3,%xmm3 - vaesenclast %xmm15,%xmm4,%xmm4 - vaesenclast %xmm15,%xmm5,%xmm5 - vaesenclast %xmm15,%xmm6,%xmm6 - vaesenclast %xmm15,%xmm7,%xmm7 - vaesenclast %xmm15,%xmm8,%xmm8 - - - - vpxor 0(%rdi),%xmm1,%xmm1 - vpxor 16(%rdi),%xmm2,%xmm2 - vpxor 32(%rdi),%xmm3,%xmm3 - vpxor 48(%rdi),%xmm4,%xmm4 - vpxor 64(%rdi),%xmm5,%xmm5 - vpxor 80(%rdi),%xmm6,%xmm6 - vpxor 96(%rdi),%xmm7,%xmm7 - vpxor 112(%rdi),%xmm8,%xmm8 - - decq %r8 - - vmovdqu %xmm1,0(%rsi) - vmovdqu %xmm2,16(%rsi) - vmovdqu %xmm3,32(%rsi) - vmovdqu %xmm4,48(%rsi) - vmovdqu %xmm5,64(%rsi) - vmovdqu %xmm6,80(%rsi) - vmovdqu %xmm7,96(%rsi) - vmovdqu %xmm8,112(%rsi) - - jne L$128_enc_msg_x8_loop1 - - addq $128,%rsi - addq $128,%rdi - -L$128_enc_msg_x8_check_remainder: - cmpq $0,%r10 - je L$128_enc_msg_x8_out - -L$128_enc_msg_x8_loop2: 
- - - vmovdqa %xmm0,%xmm1 - vpaddd one(%rip),%xmm0,%xmm0 - - vpxor (%rcx),%xmm1,%xmm1 - vaesenc 16(%rcx),%xmm1,%xmm1 - vaesenc 32(%rcx),%xmm1,%xmm1 - vaesenc 48(%rcx),%xmm1,%xmm1 - vaesenc 64(%rcx),%xmm1,%xmm1 - vaesenc 80(%rcx),%xmm1,%xmm1 - vaesenc 96(%rcx),%xmm1,%xmm1 - vaesenc 112(%rcx),%xmm1,%xmm1 - vaesenc 128(%rcx),%xmm1,%xmm1 - vaesenc 144(%rcx),%xmm1,%xmm1 - vaesenclast 160(%rcx),%xmm1,%xmm1 - - - vpxor (%rdi),%xmm1,%xmm1 - - vmovdqu %xmm1,(%rsi) - - addq $16,%rdi - addq $16,%rsi - - decq %r10 - jne L$128_enc_msg_x8_loop2 - -L$128_enc_msg_x8_out: - movq %rbp,%rsp - - popq %rbp - - popq %r13 - - popq %r12 - - .byte 0xf3,0xc3 - - -.globl _aes128gcmsiv_dec -.private_extern _aes128gcmsiv_dec - -.p2align 4 -_aes128gcmsiv_dec: - - testq $~15,%r9 - jnz L$128_dec_start - .byte 0xf3,0xc3 - -L$128_dec_start: - vzeroupper - vmovdqa (%rdx),%xmm0 - movq %rdx,%rax - - leaq 32(%rax),%rax - leaq 32(%rcx),%rcx - - - vmovdqu (%rdi,%r9,1),%xmm15 - vpor OR_MASK(%rip),%xmm15,%xmm15 - andq $~15,%r9 - - - cmpq $96,%r9 - jb L$128_dec_loop2 - - - subq $96,%r9 - vmovdqa %xmm15,%xmm7 - vpaddd one(%rip),%xmm7,%xmm8 - vpaddd two(%rip),%xmm7,%xmm9 - vpaddd one(%rip),%xmm9,%xmm10 - vpaddd two(%rip),%xmm9,%xmm11 - vpaddd one(%rip),%xmm11,%xmm12 - vpaddd two(%rip),%xmm11,%xmm15 - - vpxor (%r8),%xmm7,%xmm7 - vpxor (%r8),%xmm8,%xmm8 - vpxor (%r8),%xmm9,%xmm9 - vpxor (%r8),%xmm10,%xmm10 - vpxor (%r8),%xmm11,%xmm11 - vpxor (%r8),%xmm12,%xmm12 - - vmovdqu 16(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 32(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 48(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc 
%xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 64(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 80(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 96(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 112(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 128(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 144(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 160(%r8),%xmm4 - vaesenclast %xmm4,%xmm7,%xmm7 - vaesenclast %xmm4,%xmm8,%xmm8 - vaesenclast %xmm4,%xmm9,%xmm9 - vaesenclast %xmm4,%xmm10,%xmm10 - vaesenclast %xmm4,%xmm11,%xmm11 - vaesenclast %xmm4,%xmm12,%xmm12 - - - vpxor 0(%rdi),%xmm7,%xmm7 - vpxor 16(%rdi),%xmm8,%xmm8 - vpxor 32(%rdi),%xmm9,%xmm9 - vpxor 48(%rdi),%xmm10,%xmm10 - vpxor 64(%rdi),%xmm11,%xmm11 - vpxor 80(%rdi),%xmm12,%xmm12 - - vmovdqu %xmm7,0(%rsi) - vmovdqu %xmm8,16(%rsi) - vmovdqu %xmm9,32(%rsi) - vmovdqu %xmm10,48(%rsi) - vmovdqu %xmm11,64(%rsi) - vmovdqu %xmm12,80(%rsi) - - addq $96,%rdi - addq $96,%rsi - jmp L$128_dec_loop1 - - -.p2align 6 -L$128_dec_loop1: - cmpq $96,%r9 - jb L$128_dec_finish_96 - subq $96,%r9 - - vmovdqa %xmm12,%xmm6 - vmovdqa 
%xmm11,16-32(%rax) - vmovdqa %xmm10,32-32(%rax) - vmovdqa %xmm9,48-32(%rax) - vmovdqa %xmm8,64-32(%rax) - vmovdqa %xmm7,80-32(%rax) - - vmovdqa %xmm15,%xmm7 - vpaddd one(%rip),%xmm7,%xmm8 - vpaddd two(%rip),%xmm7,%xmm9 - vpaddd one(%rip),%xmm9,%xmm10 - vpaddd two(%rip),%xmm9,%xmm11 - vpaddd one(%rip),%xmm11,%xmm12 - vpaddd two(%rip),%xmm11,%xmm15 - - vmovdqa (%r8),%xmm4 - vpxor %xmm4,%xmm7,%xmm7 - vpxor %xmm4,%xmm8,%xmm8 - vpxor %xmm4,%xmm9,%xmm9 - vpxor %xmm4,%xmm10,%xmm10 - vpxor %xmm4,%xmm11,%xmm11 - vpxor %xmm4,%xmm12,%xmm12 - - vmovdqu 0-32(%rcx),%xmm4 - vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2 - vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3 - vpclmulqdq $0x01,%xmm4,%xmm6,%xmm1 - vpclmulqdq $0x10,%xmm4,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - vmovdqu 16(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu -16(%rax),%xmm6 - vmovdqu -16(%rcx),%xmm13 - - vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - - vmovdqu 32(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 0(%rax),%xmm6 - vmovdqu 0(%rcx),%xmm13 - - vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - - vmovdqu 48(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 16(%rax),%xmm6 - 
vmovdqu 16(%rcx),%xmm13 - - vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - - vmovdqu 64(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 32(%rax),%xmm6 - vmovdqu 32(%rcx),%xmm13 - - vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - - vmovdqu 80(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 96(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 112(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - - vmovdqa 80-32(%rax),%xmm6 - vpxor %xmm0,%xmm6,%xmm6 - vmovdqu 80-32(%rcx),%xmm5 - - vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4 - vpxor %xmm4,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - vmovdqu 128(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - - vpsrldq $8,%xmm1,%xmm4 - vpxor %xmm4,%xmm2,%xmm5 - 
vpslldq $8,%xmm1,%xmm4 - vpxor %xmm4,%xmm3,%xmm0 - - vmovdqa poly(%rip),%xmm3 - - vmovdqu 144(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 160(%r8),%xmm6 - vpalignr $8,%xmm0,%xmm0,%xmm2 - vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 - vpxor %xmm0,%xmm2,%xmm0 - - vpxor 0(%rdi),%xmm6,%xmm4 - vaesenclast %xmm4,%xmm7,%xmm7 - vpxor 16(%rdi),%xmm6,%xmm4 - vaesenclast %xmm4,%xmm8,%xmm8 - vpxor 32(%rdi),%xmm6,%xmm4 - vaesenclast %xmm4,%xmm9,%xmm9 - vpxor 48(%rdi),%xmm6,%xmm4 - vaesenclast %xmm4,%xmm10,%xmm10 - vpxor 64(%rdi),%xmm6,%xmm4 - vaesenclast %xmm4,%xmm11,%xmm11 - vpxor 80(%rdi),%xmm6,%xmm4 - vaesenclast %xmm4,%xmm12,%xmm12 - - vpalignr $8,%xmm0,%xmm0,%xmm2 - vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 - vpxor %xmm0,%xmm2,%xmm0 - - vmovdqu %xmm7,0(%rsi) - vmovdqu %xmm8,16(%rsi) - vmovdqu %xmm9,32(%rsi) - vmovdqu %xmm10,48(%rsi) - vmovdqu %xmm11,64(%rsi) - vmovdqu %xmm12,80(%rsi) - - vpxor %xmm5,%xmm0,%xmm0 - - leaq 96(%rdi),%rdi - leaq 96(%rsi),%rsi - jmp L$128_dec_loop1 - -L$128_dec_finish_96: - vmovdqa %xmm12,%xmm6 - vmovdqa %xmm11,16-32(%rax) - vmovdqa %xmm10,32-32(%rax) - vmovdqa %xmm9,48-32(%rax) - vmovdqa %xmm8,64-32(%rax) - vmovdqa %xmm7,80-32(%rax) - - vmovdqu 0-32(%rcx),%xmm4 - vpclmulqdq $0x10,%xmm4,%xmm6,%xmm1 - vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2 - vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3 - vpclmulqdq $0x01,%xmm4,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - vmovdqu -16(%rax),%xmm6 - vmovdqu -16(%rcx),%xmm13 - - vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - vmovdqu 0(%rax),%xmm6 - vmovdqu 0(%rcx),%xmm13 - - vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 - vpxor 
%xmm4,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - vmovdqu 16(%rax),%xmm6 - vmovdqu 16(%rcx),%xmm13 - - vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - vmovdqu 32(%rax),%xmm6 - vmovdqu 32(%rcx),%xmm13 - - vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - - vmovdqu 80-32(%rax),%xmm6 - vpxor %xmm0,%xmm6,%xmm6 - vmovdqu 80-32(%rcx),%xmm5 - vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4 - vpxor %xmm4,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - vpsrldq $8,%xmm1,%xmm4 - vpxor %xmm4,%xmm2,%xmm5 - vpslldq $8,%xmm1,%xmm4 - vpxor %xmm4,%xmm3,%xmm0 - - vmovdqa poly(%rip),%xmm3 - - vpalignr $8,%xmm0,%xmm0,%xmm2 - vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 - vpxor %xmm0,%xmm2,%xmm0 - - vpalignr $8,%xmm0,%xmm0,%xmm2 - vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 - vpxor %xmm0,%xmm2,%xmm0 - - vpxor %xmm5,%xmm0,%xmm0 - -L$128_dec_loop2: - - - - cmpq $16,%r9 - jb L$128_dec_out - subq $16,%r9 - - vmovdqa %xmm15,%xmm2 - vpaddd one(%rip),%xmm15,%xmm15 - - vpxor 0(%r8),%xmm2,%xmm2 - vaesenc 16(%r8),%xmm2,%xmm2 - vaesenc 32(%r8),%xmm2,%xmm2 - vaesenc 48(%r8),%xmm2,%xmm2 - vaesenc 64(%r8),%xmm2,%xmm2 - vaesenc 80(%r8),%xmm2,%xmm2 - vaesenc 96(%r8),%xmm2,%xmm2 - vaesenc 112(%r8),%xmm2,%xmm2 - vaesenc 128(%r8),%xmm2,%xmm2 - vaesenc 144(%r8),%xmm2,%xmm2 - vaesenclast 160(%r8),%xmm2,%xmm2 - vpxor (%rdi),%xmm2,%xmm2 - vmovdqu %xmm2,(%rsi) - addq $16,%rdi - 
addq $16,%rsi - - vpxor %xmm2,%xmm0,%xmm0 - vmovdqa -32(%rcx),%xmm1 - call GFMUL - - jmp L$128_dec_loop2 - -L$128_dec_out: - vmovdqu %xmm0,(%rdx) - .byte 0xf3,0xc3 - - -.globl _aes128gcmsiv_ecb_enc_block -.private_extern _aes128gcmsiv_ecb_enc_block - -.p2align 4 -_aes128gcmsiv_ecb_enc_block: - - vmovdqa (%rdi),%xmm1 - - vpxor (%rdx),%xmm1,%xmm1 - vaesenc 16(%rdx),%xmm1,%xmm1 - vaesenc 32(%rdx),%xmm1,%xmm1 - vaesenc 48(%rdx),%xmm1,%xmm1 - vaesenc 64(%rdx),%xmm1,%xmm1 - vaesenc 80(%rdx),%xmm1,%xmm1 - vaesenc 96(%rdx),%xmm1,%xmm1 - vaesenc 112(%rdx),%xmm1,%xmm1 - vaesenc 128(%rdx),%xmm1,%xmm1 - vaesenc 144(%rdx),%xmm1,%xmm1 - vaesenclast 160(%rdx),%xmm1,%xmm1 - - vmovdqa %xmm1,(%rsi) - - .byte 0xf3,0xc3 - - -.globl _aes256gcmsiv_aes_ks_enc_x1 -.private_extern _aes256gcmsiv_aes_ks_enc_x1 - -.p2align 4 -_aes256gcmsiv_aes_ks_enc_x1: - - vmovdqa con1(%rip),%xmm0 - vmovdqa mask(%rip),%xmm15 - vmovdqa (%rdi),%xmm8 - vmovdqa (%rcx),%xmm1 - vmovdqa 16(%rcx),%xmm3 - vpxor %xmm1,%xmm8,%xmm8 - vaesenc %xmm3,%xmm8,%xmm8 - vmovdqu %xmm1,(%rdx) - vmovdqu %xmm3,16(%rdx) - vpxor %xmm14,%xmm14,%xmm14 - - vpshufb %xmm15,%xmm3,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - vpslld $1,%xmm0,%xmm0 - vpslldq $4,%xmm1,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - vaesenc %xmm1,%xmm8,%xmm8 - vmovdqu %xmm1,32(%rdx) - - vpshufd $0xff,%xmm1,%xmm2 - vaesenclast %xmm14,%xmm2,%xmm2 - vpslldq $4,%xmm3,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpxor %xmm2,%xmm3,%xmm3 - vaesenc %xmm3,%xmm8,%xmm8 - vmovdqu %xmm3,48(%rdx) - - vpshufb %xmm15,%xmm3,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - vpslld $1,%xmm0,%xmm0 - vpslldq $4,%xmm1,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - vaesenc 
%xmm1,%xmm8,%xmm8 - vmovdqu %xmm1,64(%rdx) - - vpshufd $0xff,%xmm1,%xmm2 - vaesenclast %xmm14,%xmm2,%xmm2 - vpslldq $4,%xmm3,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpxor %xmm2,%xmm3,%xmm3 - vaesenc %xmm3,%xmm8,%xmm8 - vmovdqu %xmm3,80(%rdx) - - vpshufb %xmm15,%xmm3,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - vpslld $1,%xmm0,%xmm0 - vpslldq $4,%xmm1,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - vaesenc %xmm1,%xmm8,%xmm8 - vmovdqu %xmm1,96(%rdx) - - vpshufd $0xff,%xmm1,%xmm2 - vaesenclast %xmm14,%xmm2,%xmm2 - vpslldq $4,%xmm3,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpxor %xmm2,%xmm3,%xmm3 - vaesenc %xmm3,%xmm8,%xmm8 - vmovdqu %xmm3,112(%rdx) - - vpshufb %xmm15,%xmm3,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - vpslld $1,%xmm0,%xmm0 - vpslldq $4,%xmm1,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - vaesenc %xmm1,%xmm8,%xmm8 - vmovdqu %xmm1,128(%rdx) - - vpshufd $0xff,%xmm1,%xmm2 - vaesenclast %xmm14,%xmm2,%xmm2 - vpslldq $4,%xmm3,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpxor %xmm2,%xmm3,%xmm3 - vaesenc %xmm3,%xmm8,%xmm8 - vmovdqu %xmm3,144(%rdx) - - vpshufb %xmm15,%xmm3,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - vpslld $1,%xmm0,%xmm0 - vpslldq $4,%xmm1,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - vaesenc %xmm1,%xmm8,%xmm8 - vmovdqu %xmm1,160(%rdx) - - vpshufd $0xff,%xmm1,%xmm2 - vaesenclast %xmm14,%xmm2,%xmm2 - vpslldq $4,%xmm3,%xmm4 - vpxor 
%xmm4,%xmm3,%xmm3 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpxor %xmm2,%xmm3,%xmm3 - vaesenc %xmm3,%xmm8,%xmm8 - vmovdqu %xmm3,176(%rdx) - - vpshufb %xmm15,%xmm3,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - vpslld $1,%xmm0,%xmm0 - vpslldq $4,%xmm1,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - vaesenc %xmm1,%xmm8,%xmm8 - vmovdqu %xmm1,192(%rdx) - - vpshufd $0xff,%xmm1,%xmm2 - vaesenclast %xmm14,%xmm2,%xmm2 - vpslldq $4,%xmm3,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpxor %xmm2,%xmm3,%xmm3 - vaesenc %xmm3,%xmm8,%xmm8 - vmovdqu %xmm3,208(%rdx) - - vpshufb %xmm15,%xmm3,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - vpslldq $4,%xmm1,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - vaesenclast %xmm1,%xmm8,%xmm8 - vmovdqu %xmm1,224(%rdx) - - vmovdqa %xmm8,(%rsi) - .byte 0xf3,0xc3 - - -.globl _aes256gcmsiv_ecb_enc_block -.private_extern _aes256gcmsiv_ecb_enc_block - -.p2align 4 -_aes256gcmsiv_ecb_enc_block: - - vmovdqa (%rdi),%xmm1 - vpxor (%rdx),%xmm1,%xmm1 - vaesenc 16(%rdx),%xmm1,%xmm1 - vaesenc 32(%rdx),%xmm1,%xmm1 - vaesenc 48(%rdx),%xmm1,%xmm1 - vaesenc 64(%rdx),%xmm1,%xmm1 - vaesenc 80(%rdx),%xmm1,%xmm1 - vaesenc 96(%rdx),%xmm1,%xmm1 - vaesenc 112(%rdx),%xmm1,%xmm1 - vaesenc 128(%rdx),%xmm1,%xmm1 - vaesenc 144(%rdx),%xmm1,%xmm1 - vaesenc 160(%rdx),%xmm1,%xmm1 - vaesenc 176(%rdx),%xmm1,%xmm1 - vaesenc 192(%rdx),%xmm1,%xmm1 - vaesenc 208(%rdx),%xmm1,%xmm1 - vaesenclast 224(%rdx),%xmm1,%xmm1 - vmovdqa %xmm1,(%rsi) - .byte 0xf3,0xc3 - - -.globl _aes256gcmsiv_enc_msg_x4 -.private_extern _aes256gcmsiv_enc_msg_x4 - -.p2align 4 -_aes256gcmsiv_enc_msg_x4: - - testq %r8,%r8 - jnz L$256_enc_msg_x4_start - 
.byte 0xf3,0xc3 - -L$256_enc_msg_x4_start: - movq %r8,%r10 - shrq $4,%r8 - shlq $60,%r10 - jz L$256_enc_msg_x4_start2 - addq $1,%r8 - -L$256_enc_msg_x4_start2: - movq %r8,%r10 - shlq $62,%r10 - shrq $62,%r10 - - - vmovdqa (%rdx),%xmm15 - vpor OR_MASK(%rip),%xmm15,%xmm15 - - vmovdqa four(%rip),%xmm4 - vmovdqa %xmm15,%xmm0 - vpaddd one(%rip),%xmm15,%xmm1 - vpaddd two(%rip),%xmm15,%xmm2 - vpaddd three(%rip),%xmm15,%xmm3 - - shrq $2,%r8 - je L$256_enc_msg_x4_check_remainder - - subq $64,%rsi - subq $64,%rdi - -L$256_enc_msg_x4_loop1: - addq $64,%rsi - addq $64,%rdi - - vmovdqa %xmm0,%xmm5 - vmovdqa %xmm1,%xmm6 - vmovdqa %xmm2,%xmm7 - vmovdqa %xmm3,%xmm8 - - vpxor (%rcx),%xmm5,%xmm5 - vpxor (%rcx),%xmm6,%xmm6 - vpxor (%rcx),%xmm7,%xmm7 - vpxor (%rcx),%xmm8,%xmm8 - - vmovdqu 16(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc %xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vpaddd %xmm4,%xmm0,%xmm0 - vmovdqu 32(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc %xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vpaddd %xmm4,%xmm1,%xmm1 - vmovdqu 48(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc %xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vpaddd %xmm4,%xmm2,%xmm2 - vmovdqu 64(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc %xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vpaddd %xmm4,%xmm3,%xmm3 - - vmovdqu 80(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc %xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vmovdqu 96(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc %xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vmovdqu 112(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc %xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vmovdqu 128(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc 
%xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vmovdqu 144(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc %xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vmovdqu 160(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc %xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vmovdqu 176(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc %xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vmovdqu 192(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc %xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vmovdqu 208(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc %xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vmovdqu 224(%rcx),%xmm12 - vaesenclast %xmm12,%xmm5,%xmm5 - vaesenclast %xmm12,%xmm6,%xmm6 - vaesenclast %xmm12,%xmm7,%xmm7 - vaesenclast %xmm12,%xmm8,%xmm8 - - - - vpxor 0(%rdi),%xmm5,%xmm5 - vpxor 16(%rdi),%xmm6,%xmm6 - vpxor 32(%rdi),%xmm7,%xmm7 - vpxor 48(%rdi),%xmm8,%xmm8 - - subq $1,%r8 - - vmovdqu %xmm5,0(%rsi) - vmovdqu %xmm6,16(%rsi) - vmovdqu %xmm7,32(%rsi) - vmovdqu %xmm8,48(%rsi) - - jne L$256_enc_msg_x4_loop1 - - addq $64,%rsi - addq $64,%rdi - -L$256_enc_msg_x4_check_remainder: - cmpq $0,%r10 - je L$256_enc_msg_x4_out - -L$256_enc_msg_x4_loop2: - - - - vmovdqa %xmm0,%xmm5 - vpaddd one(%rip),%xmm0,%xmm0 - vpxor (%rcx),%xmm5,%xmm5 - vaesenc 16(%rcx),%xmm5,%xmm5 - vaesenc 32(%rcx),%xmm5,%xmm5 - vaesenc 48(%rcx),%xmm5,%xmm5 - vaesenc 64(%rcx),%xmm5,%xmm5 - vaesenc 80(%rcx),%xmm5,%xmm5 - vaesenc 96(%rcx),%xmm5,%xmm5 - vaesenc 112(%rcx),%xmm5,%xmm5 - vaesenc 128(%rcx),%xmm5,%xmm5 - vaesenc 144(%rcx),%xmm5,%xmm5 - vaesenc 160(%rcx),%xmm5,%xmm5 - vaesenc 176(%rcx),%xmm5,%xmm5 - vaesenc 192(%rcx),%xmm5,%xmm5 - vaesenc 208(%rcx),%xmm5,%xmm5 - vaesenclast 224(%rcx),%xmm5,%xmm5 - - - vpxor (%rdi),%xmm5,%xmm5 - - vmovdqu %xmm5,(%rsi) - - addq $16,%rdi - addq $16,%rsi - - subq $1,%r10 - jne 
L$256_enc_msg_x4_loop2 - -L$256_enc_msg_x4_out: - .byte 0xf3,0xc3 - - -.globl _aes256gcmsiv_enc_msg_x8 -.private_extern _aes256gcmsiv_enc_msg_x8 - -.p2align 4 -_aes256gcmsiv_enc_msg_x8: - - testq %r8,%r8 - jnz L$256_enc_msg_x8_start - .byte 0xf3,0xc3 - -L$256_enc_msg_x8_start: - - movq %rsp,%r11 - subq $16,%r11 - andq $-64,%r11 - - movq %r8,%r10 - shrq $4,%r8 - shlq $60,%r10 - jz L$256_enc_msg_x8_start2 - addq $1,%r8 - -L$256_enc_msg_x8_start2: - movq %r8,%r10 - shlq $61,%r10 - shrq $61,%r10 - - - vmovdqa (%rdx),%xmm1 - vpor OR_MASK(%rip),%xmm1,%xmm1 - - - vpaddd seven(%rip),%xmm1,%xmm0 - vmovdqa %xmm0,(%r11) - vpaddd one(%rip),%xmm1,%xmm9 - vpaddd two(%rip),%xmm1,%xmm10 - vpaddd three(%rip),%xmm1,%xmm11 - vpaddd four(%rip),%xmm1,%xmm12 - vpaddd five(%rip),%xmm1,%xmm13 - vpaddd six(%rip),%xmm1,%xmm14 - vmovdqa %xmm1,%xmm0 - - shrq $3,%r8 - jz L$256_enc_msg_x8_check_remainder - - subq $128,%rsi - subq $128,%rdi - -L$256_enc_msg_x8_loop1: - addq $128,%rsi - addq $128,%rdi - - vmovdqa %xmm0,%xmm1 - vmovdqa %xmm9,%xmm2 - vmovdqa %xmm10,%xmm3 - vmovdqa %xmm11,%xmm4 - vmovdqa %xmm12,%xmm5 - vmovdqa %xmm13,%xmm6 - vmovdqa %xmm14,%xmm7 - - vmovdqa (%r11),%xmm8 - - vpxor (%rcx),%xmm1,%xmm1 - vpxor (%rcx),%xmm2,%xmm2 - vpxor (%rcx),%xmm3,%xmm3 - vpxor (%rcx),%xmm4,%xmm4 - vpxor (%rcx),%xmm5,%xmm5 - vpxor (%rcx),%xmm6,%xmm6 - vpxor (%rcx),%xmm7,%xmm7 - vpxor (%rcx),%xmm8,%xmm8 - - vmovdqu 16(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc %xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 - vaesenc %xmm15,%xmm7,%xmm7 - vaesenc %xmm15,%xmm8,%xmm8 - - vmovdqa (%r11),%xmm14 - vpaddd eight(%rip),%xmm14,%xmm14 - vmovdqa %xmm14,(%r11) - vmovdqu 32(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc %xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 - vaesenc %xmm15,%xmm7,%xmm7 - vaesenc %xmm15,%xmm8,%xmm8 - - 
vpsubd one(%rip),%xmm14,%xmm14 - vmovdqu 48(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc %xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 - vaesenc %xmm15,%xmm7,%xmm7 - vaesenc %xmm15,%xmm8,%xmm8 - - vpaddd eight(%rip),%xmm0,%xmm0 - vmovdqu 64(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc %xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 - vaesenc %xmm15,%xmm7,%xmm7 - vaesenc %xmm15,%xmm8,%xmm8 - - vpaddd eight(%rip),%xmm9,%xmm9 - vmovdqu 80(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc %xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 - vaesenc %xmm15,%xmm7,%xmm7 - vaesenc %xmm15,%xmm8,%xmm8 - - vpaddd eight(%rip),%xmm10,%xmm10 - vmovdqu 96(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc %xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 - vaesenc %xmm15,%xmm7,%xmm7 - vaesenc %xmm15,%xmm8,%xmm8 - - vpaddd eight(%rip),%xmm11,%xmm11 - vmovdqu 112(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc %xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 - vaesenc %xmm15,%xmm7,%xmm7 - vaesenc %xmm15,%xmm8,%xmm8 - - vpaddd eight(%rip),%xmm12,%xmm12 - vmovdqu 128(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc %xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 - vaesenc %xmm15,%xmm7,%xmm7 - vaesenc %xmm15,%xmm8,%xmm8 - - vpaddd eight(%rip),%xmm13,%xmm13 - vmovdqu 144(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc %xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 
- vaesenc %xmm15,%xmm7,%xmm7 - vaesenc %xmm15,%xmm8,%xmm8 - - vmovdqu 160(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc %xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 - vaesenc %xmm15,%xmm7,%xmm7 - vaesenc %xmm15,%xmm8,%xmm8 - - vmovdqu 176(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc %xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 - vaesenc %xmm15,%xmm7,%xmm7 - vaesenc %xmm15,%xmm8,%xmm8 - - vmovdqu 192(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc %xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 - vaesenc %xmm15,%xmm7,%xmm7 - vaesenc %xmm15,%xmm8,%xmm8 - - vmovdqu 208(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc %xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 - vaesenc %xmm15,%xmm7,%xmm7 - vaesenc %xmm15,%xmm8,%xmm8 - - vmovdqu 224(%rcx),%xmm15 - vaesenclast %xmm15,%xmm1,%xmm1 - vaesenclast %xmm15,%xmm2,%xmm2 - vaesenclast %xmm15,%xmm3,%xmm3 - vaesenclast %xmm15,%xmm4,%xmm4 - vaesenclast %xmm15,%xmm5,%xmm5 - vaesenclast %xmm15,%xmm6,%xmm6 - vaesenclast %xmm15,%xmm7,%xmm7 - vaesenclast %xmm15,%xmm8,%xmm8 - - - - vpxor 0(%rdi),%xmm1,%xmm1 - vpxor 16(%rdi),%xmm2,%xmm2 - vpxor 32(%rdi),%xmm3,%xmm3 - vpxor 48(%rdi),%xmm4,%xmm4 - vpxor 64(%rdi),%xmm5,%xmm5 - vpxor 80(%rdi),%xmm6,%xmm6 - vpxor 96(%rdi),%xmm7,%xmm7 - vpxor 112(%rdi),%xmm8,%xmm8 - - subq $1,%r8 - - vmovdqu %xmm1,0(%rsi) - vmovdqu %xmm2,16(%rsi) - vmovdqu %xmm3,32(%rsi) - vmovdqu %xmm4,48(%rsi) - vmovdqu %xmm5,64(%rsi) - vmovdqu %xmm6,80(%rsi) - vmovdqu %xmm7,96(%rsi) - vmovdqu %xmm8,112(%rsi) - - jne L$256_enc_msg_x8_loop1 - - addq $128,%rsi - addq $128,%rdi - -L$256_enc_msg_x8_check_remainder: - cmpq $0,%r10 - je L$256_enc_msg_x8_out - 
-L$256_enc_msg_x8_loop2: - - - vmovdqa %xmm0,%xmm1 - vpaddd one(%rip),%xmm0,%xmm0 - - vpxor (%rcx),%xmm1,%xmm1 - vaesenc 16(%rcx),%xmm1,%xmm1 - vaesenc 32(%rcx),%xmm1,%xmm1 - vaesenc 48(%rcx),%xmm1,%xmm1 - vaesenc 64(%rcx),%xmm1,%xmm1 - vaesenc 80(%rcx),%xmm1,%xmm1 - vaesenc 96(%rcx),%xmm1,%xmm1 - vaesenc 112(%rcx),%xmm1,%xmm1 - vaesenc 128(%rcx),%xmm1,%xmm1 - vaesenc 144(%rcx),%xmm1,%xmm1 - vaesenc 160(%rcx),%xmm1,%xmm1 - vaesenc 176(%rcx),%xmm1,%xmm1 - vaesenc 192(%rcx),%xmm1,%xmm1 - vaesenc 208(%rcx),%xmm1,%xmm1 - vaesenclast 224(%rcx),%xmm1,%xmm1 - - - vpxor (%rdi),%xmm1,%xmm1 - - vmovdqu %xmm1,(%rsi) - - addq $16,%rdi - addq $16,%rsi - subq $1,%r10 - jnz L$256_enc_msg_x8_loop2 - -L$256_enc_msg_x8_out: - .byte 0xf3,0xc3 - - - -.globl _aes256gcmsiv_dec -.private_extern _aes256gcmsiv_dec - -.p2align 4 -_aes256gcmsiv_dec: - - testq $~15,%r9 - jnz L$256_dec_start - .byte 0xf3,0xc3 - -L$256_dec_start: - vzeroupper - vmovdqa (%rdx),%xmm0 - movq %rdx,%rax - - leaq 32(%rax),%rax - leaq 32(%rcx),%rcx - - - vmovdqu (%rdi,%r9,1),%xmm15 - vpor OR_MASK(%rip),%xmm15,%xmm15 - andq $~15,%r9 - - - cmpq $96,%r9 - jb L$256_dec_loop2 - - - subq $96,%r9 - vmovdqa %xmm15,%xmm7 - vpaddd one(%rip),%xmm7,%xmm8 - vpaddd two(%rip),%xmm7,%xmm9 - vpaddd one(%rip),%xmm9,%xmm10 - vpaddd two(%rip),%xmm9,%xmm11 - vpaddd one(%rip),%xmm11,%xmm12 - vpaddd two(%rip),%xmm11,%xmm15 - - vpxor (%r8),%xmm7,%xmm7 - vpxor (%r8),%xmm8,%xmm8 - vpxor (%r8),%xmm9,%xmm9 - vpxor (%r8),%xmm10,%xmm10 - vpxor (%r8),%xmm11,%xmm11 - vpxor (%r8),%xmm12,%xmm12 - - vmovdqu 16(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 32(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 48(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc 
%xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 64(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 80(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 96(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 112(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 128(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 144(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 160(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 176(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 192(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 208(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - 
vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 224(%r8),%xmm4 - vaesenclast %xmm4,%xmm7,%xmm7 - vaesenclast %xmm4,%xmm8,%xmm8 - vaesenclast %xmm4,%xmm9,%xmm9 - vaesenclast %xmm4,%xmm10,%xmm10 - vaesenclast %xmm4,%xmm11,%xmm11 - vaesenclast %xmm4,%xmm12,%xmm12 - - - vpxor 0(%rdi),%xmm7,%xmm7 - vpxor 16(%rdi),%xmm8,%xmm8 - vpxor 32(%rdi),%xmm9,%xmm9 - vpxor 48(%rdi),%xmm10,%xmm10 - vpxor 64(%rdi),%xmm11,%xmm11 - vpxor 80(%rdi),%xmm12,%xmm12 - - vmovdqu %xmm7,0(%rsi) - vmovdqu %xmm8,16(%rsi) - vmovdqu %xmm9,32(%rsi) - vmovdqu %xmm10,48(%rsi) - vmovdqu %xmm11,64(%rsi) - vmovdqu %xmm12,80(%rsi) - - addq $96,%rdi - addq $96,%rsi - jmp L$256_dec_loop1 - - -.p2align 6 -L$256_dec_loop1: - cmpq $96,%r9 - jb L$256_dec_finish_96 - subq $96,%r9 - - vmovdqa %xmm12,%xmm6 - vmovdqa %xmm11,16-32(%rax) - vmovdqa %xmm10,32-32(%rax) - vmovdqa %xmm9,48-32(%rax) - vmovdqa %xmm8,64-32(%rax) - vmovdqa %xmm7,80-32(%rax) - - vmovdqa %xmm15,%xmm7 - vpaddd one(%rip),%xmm7,%xmm8 - vpaddd two(%rip),%xmm7,%xmm9 - vpaddd one(%rip),%xmm9,%xmm10 - vpaddd two(%rip),%xmm9,%xmm11 - vpaddd one(%rip),%xmm11,%xmm12 - vpaddd two(%rip),%xmm11,%xmm15 - - vmovdqa (%r8),%xmm4 - vpxor %xmm4,%xmm7,%xmm7 - vpxor %xmm4,%xmm8,%xmm8 - vpxor %xmm4,%xmm9,%xmm9 - vpxor %xmm4,%xmm10,%xmm10 - vpxor %xmm4,%xmm11,%xmm11 - vpxor %xmm4,%xmm12,%xmm12 - - vmovdqu 0-32(%rcx),%xmm4 - vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2 - vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3 - vpclmulqdq $0x01,%xmm4,%xmm6,%xmm1 - vpclmulqdq $0x10,%xmm4,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - vmovdqu 16(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu -16(%rax),%xmm6 - vmovdqu -16(%rcx),%xmm13 - - vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 - vpxor 
%xmm4,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - - vmovdqu 32(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 0(%rax),%xmm6 - vmovdqu 0(%rcx),%xmm13 - - vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - - vmovdqu 48(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 16(%rax),%xmm6 - vmovdqu 16(%rcx),%xmm13 - - vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - - vmovdqu 64(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 32(%rax),%xmm6 - vmovdqu 32(%rcx),%xmm13 - - vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - - vmovdqu 80(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 96(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - 
vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 112(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - - vmovdqa 80-32(%rax),%xmm6 - vpxor %xmm0,%xmm6,%xmm6 - vmovdqu 80-32(%rcx),%xmm5 - - vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4 - vpxor %xmm4,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - vmovdqu 128(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - - vpsrldq $8,%xmm1,%xmm4 - vpxor %xmm4,%xmm2,%xmm5 - vpslldq $8,%xmm1,%xmm4 - vpxor %xmm4,%xmm3,%xmm0 - - vmovdqa poly(%rip),%xmm3 - - vmovdqu 144(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 160(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 176(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 192(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 208(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 224(%r8),%xmm6 - 
vpalignr $8,%xmm0,%xmm0,%xmm2 - vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 - vpxor %xmm0,%xmm2,%xmm0 - - vpxor 0(%rdi),%xmm6,%xmm4 - vaesenclast %xmm4,%xmm7,%xmm7 - vpxor 16(%rdi),%xmm6,%xmm4 - vaesenclast %xmm4,%xmm8,%xmm8 - vpxor 32(%rdi),%xmm6,%xmm4 - vaesenclast %xmm4,%xmm9,%xmm9 - vpxor 48(%rdi),%xmm6,%xmm4 - vaesenclast %xmm4,%xmm10,%xmm10 - vpxor 64(%rdi),%xmm6,%xmm4 - vaesenclast %xmm4,%xmm11,%xmm11 - vpxor 80(%rdi),%xmm6,%xmm4 - vaesenclast %xmm4,%xmm12,%xmm12 - - vpalignr $8,%xmm0,%xmm0,%xmm2 - vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 - vpxor %xmm0,%xmm2,%xmm0 - - vmovdqu %xmm7,0(%rsi) - vmovdqu %xmm8,16(%rsi) - vmovdqu %xmm9,32(%rsi) - vmovdqu %xmm10,48(%rsi) - vmovdqu %xmm11,64(%rsi) - vmovdqu %xmm12,80(%rsi) - - vpxor %xmm5,%xmm0,%xmm0 - - leaq 96(%rdi),%rdi - leaq 96(%rsi),%rsi - jmp L$256_dec_loop1 - -L$256_dec_finish_96: - vmovdqa %xmm12,%xmm6 - vmovdqa %xmm11,16-32(%rax) - vmovdqa %xmm10,32-32(%rax) - vmovdqa %xmm9,48-32(%rax) - vmovdqa %xmm8,64-32(%rax) - vmovdqa %xmm7,80-32(%rax) - - vmovdqu 0-32(%rcx),%xmm4 - vpclmulqdq $0x10,%xmm4,%xmm6,%xmm1 - vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2 - vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3 - vpclmulqdq $0x01,%xmm4,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - vmovdqu -16(%rax),%xmm6 - vmovdqu -16(%rcx),%xmm13 - - vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - vmovdqu 0(%rax),%xmm6 - vmovdqu 0(%rcx),%xmm13 - - vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - vmovdqu 16(%rax),%xmm6 - vmovdqu 16(%rcx),%xmm13 - - vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 - vpxor 
%xmm4,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - vmovdqu 32(%rax),%xmm6 - vmovdqu 32(%rcx),%xmm13 - - vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - - vmovdqu 80-32(%rax),%xmm6 - vpxor %xmm0,%xmm6,%xmm6 - vmovdqu 80-32(%rcx),%xmm5 - vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4 - vpxor %xmm4,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - vpsrldq $8,%xmm1,%xmm4 - vpxor %xmm4,%xmm2,%xmm5 - vpslldq $8,%xmm1,%xmm4 - vpxor %xmm4,%xmm3,%xmm0 - - vmovdqa poly(%rip),%xmm3 - - vpalignr $8,%xmm0,%xmm0,%xmm2 - vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 - vpxor %xmm0,%xmm2,%xmm0 - - vpalignr $8,%xmm0,%xmm0,%xmm2 - vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 - vpxor %xmm0,%xmm2,%xmm0 - - vpxor %xmm5,%xmm0,%xmm0 - -L$256_dec_loop2: - - - - cmpq $16,%r9 - jb L$256_dec_out - subq $16,%r9 - - vmovdqa %xmm15,%xmm2 - vpaddd one(%rip),%xmm15,%xmm15 - - vpxor 0(%r8),%xmm2,%xmm2 - vaesenc 16(%r8),%xmm2,%xmm2 - vaesenc 32(%r8),%xmm2,%xmm2 - vaesenc 48(%r8),%xmm2,%xmm2 - vaesenc 64(%r8),%xmm2,%xmm2 - vaesenc 80(%r8),%xmm2,%xmm2 - vaesenc 96(%r8),%xmm2,%xmm2 - vaesenc 112(%r8),%xmm2,%xmm2 - vaesenc 128(%r8),%xmm2,%xmm2 - vaesenc 144(%r8),%xmm2,%xmm2 - vaesenc 160(%r8),%xmm2,%xmm2 - vaesenc 176(%r8),%xmm2,%xmm2 - vaesenc 192(%r8),%xmm2,%xmm2 - vaesenc 208(%r8),%xmm2,%xmm2 - vaesenclast 224(%r8),%xmm2,%xmm2 - vpxor (%rdi),%xmm2,%xmm2 - vmovdqu %xmm2,(%rsi) - addq $16,%rdi - addq $16,%rsi - - vpxor %xmm2,%xmm0,%xmm0 - vmovdqa -32(%rcx),%xmm1 - call GFMUL - - jmp L$256_dec_loop2 - -L$256_dec_out: - vmovdqu %xmm0,(%rdx) - .byte 0xf3,0xc3 - - -.globl 
_aes256gcmsiv_kdf -.private_extern _aes256gcmsiv_kdf - -.p2align 4 -_aes256gcmsiv_kdf: - - - - - - vmovdqa (%rdx),%xmm1 - vmovdqa 0(%rdi),%xmm4 - vmovdqa and_mask(%rip),%xmm11 - vmovdqa one(%rip),%xmm8 - vpshufd $0x90,%xmm4,%xmm4 - vpand %xmm11,%xmm4,%xmm4 - vpaddd %xmm8,%xmm4,%xmm6 - vpaddd %xmm8,%xmm6,%xmm7 - vpaddd %xmm8,%xmm7,%xmm11 - vpaddd %xmm8,%xmm11,%xmm12 - vpaddd %xmm8,%xmm12,%xmm13 - - vpxor %xmm1,%xmm4,%xmm4 - vpxor %xmm1,%xmm6,%xmm6 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm1,%xmm11,%xmm11 - vpxor %xmm1,%xmm12,%xmm12 - vpxor %xmm1,%xmm13,%xmm13 - - vmovdqa 16(%rdx),%xmm1 - vaesenc %xmm1,%xmm4,%xmm4 - vaesenc %xmm1,%xmm6,%xmm6 - vaesenc %xmm1,%xmm7,%xmm7 - vaesenc %xmm1,%xmm11,%xmm11 - vaesenc %xmm1,%xmm12,%xmm12 - vaesenc %xmm1,%xmm13,%xmm13 - - vmovdqa 32(%rdx),%xmm2 - vaesenc %xmm2,%xmm4,%xmm4 - vaesenc %xmm2,%xmm6,%xmm6 - vaesenc %xmm2,%xmm7,%xmm7 - vaesenc %xmm2,%xmm11,%xmm11 - vaesenc %xmm2,%xmm12,%xmm12 - vaesenc %xmm2,%xmm13,%xmm13 - - vmovdqa 48(%rdx),%xmm1 - vaesenc %xmm1,%xmm4,%xmm4 - vaesenc %xmm1,%xmm6,%xmm6 - vaesenc %xmm1,%xmm7,%xmm7 - vaesenc %xmm1,%xmm11,%xmm11 - vaesenc %xmm1,%xmm12,%xmm12 - vaesenc %xmm1,%xmm13,%xmm13 - - vmovdqa 64(%rdx),%xmm2 - vaesenc %xmm2,%xmm4,%xmm4 - vaesenc %xmm2,%xmm6,%xmm6 - vaesenc %xmm2,%xmm7,%xmm7 - vaesenc %xmm2,%xmm11,%xmm11 - vaesenc %xmm2,%xmm12,%xmm12 - vaesenc %xmm2,%xmm13,%xmm13 - - vmovdqa 80(%rdx),%xmm1 - vaesenc %xmm1,%xmm4,%xmm4 - vaesenc %xmm1,%xmm6,%xmm6 - vaesenc %xmm1,%xmm7,%xmm7 - vaesenc %xmm1,%xmm11,%xmm11 - vaesenc %xmm1,%xmm12,%xmm12 - vaesenc %xmm1,%xmm13,%xmm13 - - vmovdqa 96(%rdx),%xmm2 - vaesenc %xmm2,%xmm4,%xmm4 - vaesenc %xmm2,%xmm6,%xmm6 - vaesenc %xmm2,%xmm7,%xmm7 - vaesenc %xmm2,%xmm11,%xmm11 - vaesenc %xmm2,%xmm12,%xmm12 - vaesenc %xmm2,%xmm13,%xmm13 - - vmovdqa 112(%rdx),%xmm1 - vaesenc %xmm1,%xmm4,%xmm4 - vaesenc %xmm1,%xmm6,%xmm6 - vaesenc %xmm1,%xmm7,%xmm7 - vaesenc %xmm1,%xmm11,%xmm11 - vaesenc %xmm1,%xmm12,%xmm12 - vaesenc %xmm1,%xmm13,%xmm13 - - vmovdqa 128(%rdx),%xmm2 - 
vaesenc %xmm2,%xmm4,%xmm4 - vaesenc %xmm2,%xmm6,%xmm6 - vaesenc %xmm2,%xmm7,%xmm7 - vaesenc %xmm2,%xmm11,%xmm11 - vaesenc %xmm2,%xmm12,%xmm12 - vaesenc %xmm2,%xmm13,%xmm13 - - vmovdqa 144(%rdx),%xmm1 - vaesenc %xmm1,%xmm4,%xmm4 - vaesenc %xmm1,%xmm6,%xmm6 - vaesenc %xmm1,%xmm7,%xmm7 - vaesenc %xmm1,%xmm11,%xmm11 - vaesenc %xmm1,%xmm12,%xmm12 - vaesenc %xmm1,%xmm13,%xmm13 - - vmovdqa 160(%rdx),%xmm2 - vaesenc %xmm2,%xmm4,%xmm4 - vaesenc %xmm2,%xmm6,%xmm6 - vaesenc %xmm2,%xmm7,%xmm7 - vaesenc %xmm2,%xmm11,%xmm11 - vaesenc %xmm2,%xmm12,%xmm12 - vaesenc %xmm2,%xmm13,%xmm13 - - vmovdqa 176(%rdx),%xmm1 - vaesenc %xmm1,%xmm4,%xmm4 - vaesenc %xmm1,%xmm6,%xmm6 - vaesenc %xmm1,%xmm7,%xmm7 - vaesenc %xmm1,%xmm11,%xmm11 - vaesenc %xmm1,%xmm12,%xmm12 - vaesenc %xmm1,%xmm13,%xmm13 - - vmovdqa 192(%rdx),%xmm2 - vaesenc %xmm2,%xmm4,%xmm4 - vaesenc %xmm2,%xmm6,%xmm6 - vaesenc %xmm2,%xmm7,%xmm7 - vaesenc %xmm2,%xmm11,%xmm11 - vaesenc %xmm2,%xmm12,%xmm12 - vaesenc %xmm2,%xmm13,%xmm13 - - vmovdqa 208(%rdx),%xmm1 - vaesenc %xmm1,%xmm4,%xmm4 - vaesenc %xmm1,%xmm6,%xmm6 - vaesenc %xmm1,%xmm7,%xmm7 - vaesenc %xmm1,%xmm11,%xmm11 - vaesenc %xmm1,%xmm12,%xmm12 - vaesenc %xmm1,%xmm13,%xmm13 - - vmovdqa 224(%rdx),%xmm2 - vaesenclast %xmm2,%xmm4,%xmm4 - vaesenclast %xmm2,%xmm6,%xmm6 - vaesenclast %xmm2,%xmm7,%xmm7 - vaesenclast %xmm2,%xmm11,%xmm11 - vaesenclast %xmm2,%xmm12,%xmm12 - vaesenclast %xmm2,%xmm13,%xmm13 - - - vmovdqa %xmm4,0(%rsi) - vmovdqa %xmm6,16(%rsi) - vmovdqa %xmm7,32(%rsi) - vmovdqa %xmm11,48(%rsi) - vmovdqa %xmm12,64(%rsi) - vmovdqa %xmm13,80(%rsi) - .byte 0xf3,0xc3 - - -#endif diff --git a/third_party/boringssl/apple-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S b/third_party/boringssl/apple-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S deleted file mode 100644 index 6813510c..00000000 --- a/third_party/boringssl/apple-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S +++ /dev/null @@ -1,8878 +0,0 @@ -// This file is generated from a similarly-named Perl 
script in the BoringSSL -// source tree. Do not edit by hand. - -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text - - -chacha20_poly1305_constants: - -.p2align 6 -L$chacha20_consts: -.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' -.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' -L$rol8: -.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 -.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 -L$rol16: -.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 -.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 -L$avx2_init: -.long 0,0,0,0 -L$sse_inc: -.long 1,0,0,0 -L$avx2_inc: -.long 2,0,0,0,2,0,0,0 -L$clamp: -.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC -.quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF -.p2align 4 -L$and_masks: -.byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00 -.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00 -.byte 
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00 -.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00 -.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00 -.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff - - -.p2align 6 -poly_hash_ad_internal: - - - xorq %r10,%r10 - xorq %r11,%r11 - xorq %r12,%r12 - cmpq $13,%r8 - jne L$hash_ad_loop -L$poly_fast_tls_ad: - - movq (%rcx),%r10 - movq 5(%rcx),%r11 - shrq $24,%r11 - movq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - .byte 0xf3,0xc3 -L$hash_ad_loop: - - cmpq $16,%r8 - jb L$hash_ad_tail - addq 0+0(%rcx),%r10 - adcq 8+0(%rcx),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 16(%rcx),%rcx - subq $16,%r8 - jmp 
L$hash_ad_loop -L$hash_ad_tail: - cmpq $0,%r8 - je L$hash_ad_done - - xorq %r13,%r13 - xorq %r14,%r14 - xorq %r15,%r15 - addq %r8,%rcx -L$hash_ad_tail_loop: - shldq $8,%r13,%r14 - shlq $8,%r13 - movzbq -1(%rcx),%r15 - xorq %r15,%r13 - decq %rcx - decq %r8 - jne L$hash_ad_tail_loop - - addq %r13,%r10 - adcq %r14,%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - -L$hash_ad_done: - .byte 0xf3,0xc3 - - - -.globl _chacha20_poly1305_open -.private_extern _chacha20_poly1305_open - -.p2align 6 -_chacha20_poly1305_open: - - pushq %rbp - - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - - - pushq %r9 - - subq $288 + 0 + 32,%rsp - - - leaq 32(%rsp),%rbp - andq $-32,%rbp - - movq %rdx,%rbx - movq %r8,0+0+32(%rbp) - movq %rbx,8+0+32(%rbp) - - movl _OPENSSL_ia32cap_P+8(%rip),%eax - andl $288,%eax - xorl $288,%eax - jz chacha20_poly1305_open_avx2 - - cmpq $128,%rbx - jbe L$open_sse_128 - - movdqa L$chacha20_consts(%rip),%xmm0 - movdqu 0(%r9),%xmm4 - movdqu 16(%r9),%xmm8 - movdqu 32(%r9),%xmm12 - - movdqa %xmm12,%xmm7 - - movdqa %xmm4,0+48(%rbp) - movdqa %xmm8,0+64(%rbp) - movdqa %xmm12,0+96(%rbp) - movq $10,%r10 -L$open_sse_init_rounds: - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb L$rol16(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm4 - pxor %xmm3,%xmm4 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb 
L$rol8(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm4 - pxor %xmm3,%xmm4 -.byte 102,15,58,15,228,4 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,12 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb L$rol16(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm4 - pxor %xmm3,%xmm4 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb L$rol8(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm4 - pxor %xmm3,%xmm4 -.byte 102,15,58,15,228,12 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,4 - - decq %r10 - jne L$open_sse_init_rounds - - paddd L$chacha20_consts(%rip),%xmm0 - paddd 0+48(%rbp),%xmm4 - - pand L$clamp(%rip),%xmm0 - movdqa %xmm0,0+0(%rbp) - movdqa %xmm4,0+16(%rbp) - - movq %r8,%r8 - call poly_hash_ad_internal -L$open_sse_main_loop: - cmpq $256,%rbx - jb L$open_sse_tail - - movdqa L$chacha20_consts(%rip),%xmm0 - movdqa 0+48(%rbp),%xmm4 - movdqa 0+64(%rbp),%xmm8 - movdqa %xmm0,%xmm1 - movdqa %xmm4,%xmm5 - movdqa %xmm8,%xmm9 - movdqa %xmm0,%xmm2 - movdqa %xmm4,%xmm6 - movdqa %xmm8,%xmm10 - movdqa %xmm0,%xmm3 - movdqa %xmm4,%xmm7 - movdqa %xmm8,%xmm11 - movdqa 0+96(%rbp),%xmm15 - paddd L$sse_inc(%rip),%xmm15 - movdqa %xmm15,%xmm14 - paddd L$sse_inc(%rip),%xmm14 - movdqa %xmm14,%xmm13 - paddd L$sse_inc(%rip),%xmm13 - movdqa %xmm13,%xmm12 - paddd L$sse_inc(%rip),%xmm12 - movdqa %xmm12,0+96(%rbp) - movdqa %xmm13,0+112(%rbp) - movdqa %xmm14,0+128(%rbp) - movdqa %xmm15,0+144(%rbp) - - - - movq $4,%rcx - movq %rsi,%r8 -L$open_sse_main_loop_rounds: - movdqa %xmm8,0+80(%rbp) - movdqa L$rol16(%rip),%xmm8 - paddd %xmm7,%xmm3 - paddd %xmm6,%xmm2 - paddd %xmm5,%xmm1 - paddd %xmm4,%xmm0 - pxor %xmm3,%xmm15 - pxor %xmm2,%xmm14 - pxor %xmm1,%xmm13 - pxor %xmm0,%xmm12 -.byte 102,69,15,56,0,248 -.byte 102,69,15,56,0,240 -.byte 102,69,15,56,0,232 -.byte 102,69,15,56,0,224 - movdqa 0+80(%rbp),%xmm8 - 
paddd %xmm15,%xmm11 - paddd %xmm14,%xmm10 - paddd %xmm13,%xmm9 - paddd %xmm12,%xmm8 - pxor %xmm11,%xmm7 - addq 0+0(%r8),%r10 - adcq 8+0(%r8),%r11 - adcq $1,%r12 - - leaq 16(%r8),%r8 - pxor %xmm10,%xmm6 - pxor %xmm9,%xmm5 - pxor %xmm8,%xmm4 - movdqa %xmm8,0+80(%rbp) - movdqa %xmm7,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm7 - pxor %xmm8,%xmm7 - movdqa %xmm6,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm6 - pxor %xmm8,%xmm6 - movdqa %xmm5,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm5 - pxor %xmm8,%xmm5 - movdqa %xmm4,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm4 - pxor %xmm8,%xmm4 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movdqa L$rol8(%rip),%xmm8 - paddd %xmm7,%xmm3 - paddd %xmm6,%xmm2 - paddd %xmm5,%xmm1 - paddd %xmm4,%xmm0 - pxor %xmm3,%xmm15 - pxor %xmm2,%xmm14 - pxor %xmm1,%xmm13 - pxor %xmm0,%xmm12 -.byte 102,69,15,56,0,248 -.byte 102,69,15,56,0,240 -.byte 102,69,15,56,0,232 -.byte 102,69,15,56,0,224 - movdqa 0+80(%rbp),%xmm8 - paddd %xmm15,%xmm11 - paddd %xmm14,%xmm10 - paddd %xmm13,%xmm9 - paddd %xmm12,%xmm8 - pxor %xmm11,%xmm7 - pxor %xmm10,%xmm6 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - pxor %xmm9,%xmm5 - pxor %xmm8,%xmm4 - movdqa %xmm8,0+80(%rbp) - movdqa %xmm7,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm7 - pxor %xmm8,%xmm7 - movdqa %xmm6,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm6 - pxor %xmm8,%xmm6 - movdqa %xmm5,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm5 - pxor %xmm8,%xmm5 - movdqa %xmm4,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm4 - pxor %xmm8,%xmm4 - movdqa 0+80(%rbp),%xmm8 - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 -.byte 102,15,58,15,255,4 -.byte 102,69,15,58,15,219,8 -.byte 102,69,15,58,15,255,12 -.byte 102,15,58,15,246,4 -.byte 102,69,15,58,15,210,8 -.byte 102,69,15,58,15,246,12 
-.byte 102,15,58,15,237,4 -.byte 102,69,15,58,15,201,8 -.byte 102,69,15,58,15,237,12 -.byte 102,15,58,15,228,4 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,12 - movdqa %xmm8,0+80(%rbp) - movdqa L$rol16(%rip),%xmm8 - paddd %xmm7,%xmm3 - paddd %xmm6,%xmm2 - paddd %xmm5,%xmm1 - paddd %xmm4,%xmm0 - pxor %xmm3,%xmm15 - pxor %xmm2,%xmm14 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - pxor %xmm1,%xmm13 - pxor %xmm0,%xmm12 -.byte 102,69,15,56,0,248 -.byte 102,69,15,56,0,240 -.byte 102,69,15,56,0,232 -.byte 102,69,15,56,0,224 - movdqa 0+80(%rbp),%xmm8 - paddd %xmm15,%xmm11 - paddd %xmm14,%xmm10 - paddd %xmm13,%xmm9 - paddd %xmm12,%xmm8 - pxor %xmm11,%xmm7 - pxor %xmm10,%xmm6 - pxor %xmm9,%xmm5 - pxor %xmm8,%xmm4 - movdqa %xmm8,0+80(%rbp) - movdqa %xmm7,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm7 - pxor %xmm8,%xmm7 - movdqa %xmm6,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm6 - pxor %xmm8,%xmm6 - movdqa %xmm5,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm5 - pxor %xmm8,%xmm5 - movdqa %xmm4,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm4 - pxor %xmm8,%xmm4 - movdqa L$rol8(%rip),%xmm8 - paddd %xmm7,%xmm3 - paddd %xmm6,%xmm2 - paddd %xmm5,%xmm1 - paddd %xmm4,%xmm0 - pxor %xmm3,%xmm15 - pxor %xmm2,%xmm14 - pxor %xmm1,%xmm13 - pxor %xmm0,%xmm12 -.byte 102,69,15,56,0,248 -.byte 102,69,15,56,0,240 -.byte 102,69,15,56,0,232 -.byte 102,69,15,56,0,224 - movdqa 0+80(%rbp),%xmm8 - paddd %xmm15,%xmm11 - paddd %xmm14,%xmm10 - paddd %xmm13,%xmm9 - paddd %xmm12,%xmm8 - pxor %xmm11,%xmm7 - pxor %xmm10,%xmm6 - pxor %xmm9,%xmm5 - pxor %xmm8,%xmm4 - movdqa %xmm8,0+80(%rbp) - movdqa %xmm7,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm7 - pxor %xmm8,%xmm7 - movdqa %xmm6,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm6 - pxor %xmm8,%xmm6 - movdqa %xmm5,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm5 - pxor %xmm8,%xmm5 - 
movdqa %xmm4,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm4 - pxor %xmm8,%xmm4 - movdqa 0+80(%rbp),%xmm8 -.byte 102,15,58,15,255,12 -.byte 102,69,15,58,15,219,8 -.byte 102,69,15,58,15,255,4 -.byte 102,15,58,15,246,12 -.byte 102,69,15,58,15,210,8 -.byte 102,69,15,58,15,246,4 -.byte 102,15,58,15,237,12 -.byte 102,69,15,58,15,201,8 -.byte 102,69,15,58,15,237,4 -.byte 102,15,58,15,228,12 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,4 - - decq %rcx - jge L$open_sse_main_loop_rounds - addq 0+0(%r8),%r10 - adcq 8+0(%r8),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 16(%r8),%r8 - cmpq $-6,%rcx - jg L$open_sse_main_loop_rounds - paddd L$chacha20_consts(%rip),%xmm3 - paddd 0+48(%rbp),%xmm7 - paddd 0+64(%rbp),%xmm11 - paddd 0+144(%rbp),%xmm15 - paddd L$chacha20_consts(%rip),%xmm2 - paddd 0+48(%rbp),%xmm6 - paddd 0+64(%rbp),%xmm10 - paddd 0+128(%rbp),%xmm14 - paddd L$chacha20_consts(%rip),%xmm1 - paddd 0+48(%rbp),%xmm5 - paddd 0+64(%rbp),%xmm9 - paddd 0+112(%rbp),%xmm13 - paddd L$chacha20_consts(%rip),%xmm0 - paddd 0+48(%rbp),%xmm4 - paddd 0+64(%rbp),%xmm8 - paddd 0+96(%rbp),%xmm12 - movdqa %xmm12,0+80(%rbp) - movdqu 0 + 0(%rsi),%xmm12 - pxor %xmm3,%xmm12 - movdqu %xmm12,0 + 0(%rdi) - movdqu 16 + 0(%rsi),%xmm12 - pxor %xmm7,%xmm12 - movdqu %xmm12,16 + 0(%rdi) - movdqu 32 + 0(%rsi),%xmm12 - pxor %xmm11,%xmm12 - movdqu %xmm12,32 + 0(%rdi) - movdqu 48 + 0(%rsi),%xmm12 - pxor 
%xmm15,%xmm12 - movdqu %xmm12,48 + 0(%rdi) - movdqu 0 + 64(%rsi),%xmm3 - movdqu 16 + 64(%rsi),%xmm7 - movdqu 32 + 64(%rsi),%xmm11 - movdqu 48 + 64(%rsi),%xmm15 - pxor %xmm3,%xmm2 - pxor %xmm7,%xmm6 - pxor %xmm11,%xmm10 - pxor %xmm14,%xmm15 - movdqu %xmm2,0 + 64(%rdi) - movdqu %xmm6,16 + 64(%rdi) - movdqu %xmm10,32 + 64(%rdi) - movdqu %xmm15,48 + 64(%rdi) - movdqu 0 + 128(%rsi),%xmm3 - movdqu 16 + 128(%rsi),%xmm7 - movdqu 32 + 128(%rsi),%xmm11 - movdqu 48 + 128(%rsi),%xmm15 - pxor %xmm3,%xmm1 - pxor %xmm7,%xmm5 - pxor %xmm11,%xmm9 - pxor %xmm13,%xmm15 - movdqu %xmm1,0 + 128(%rdi) - movdqu %xmm5,16 + 128(%rdi) - movdqu %xmm9,32 + 128(%rdi) - movdqu %xmm15,48 + 128(%rdi) - movdqu 0 + 192(%rsi),%xmm3 - movdqu 16 + 192(%rsi),%xmm7 - movdqu 32 + 192(%rsi),%xmm11 - movdqu 48 + 192(%rsi),%xmm15 - pxor %xmm3,%xmm0 - pxor %xmm7,%xmm4 - pxor %xmm11,%xmm8 - pxor 0+80(%rbp),%xmm15 - movdqu %xmm0,0 + 192(%rdi) - movdqu %xmm4,16 + 192(%rdi) - movdqu %xmm8,32 + 192(%rdi) - movdqu %xmm15,48 + 192(%rdi) - - leaq 256(%rsi),%rsi - leaq 256(%rdi),%rdi - subq $256,%rbx - jmp L$open_sse_main_loop -L$open_sse_tail: - - testq %rbx,%rbx - jz L$open_sse_finalize - cmpq $192,%rbx - ja L$open_sse_tail_256 - cmpq $128,%rbx - ja L$open_sse_tail_192 - cmpq $64,%rbx - ja L$open_sse_tail_128 - movdqa L$chacha20_consts(%rip),%xmm0 - movdqa 0+48(%rbp),%xmm4 - movdqa 0+64(%rbp),%xmm8 - movdqa 0+96(%rbp),%xmm12 - paddd L$sse_inc(%rip),%xmm12 - movdqa %xmm12,0+96(%rbp) - - xorq %r8,%r8 - movq %rbx,%rcx - cmpq $16,%rcx - jb L$open_sse_tail_64_rounds -L$open_sse_tail_64_rounds_and_x1hash: - addq 0+0(%rsi,%r8,1),%r10 - adcq 8+0(%rsi,%r8,1),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq 
%r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - subq $16,%rcx -L$open_sse_tail_64_rounds: - addq $16,%r8 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb L$rol16(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm4 - pxor %xmm3,%xmm4 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb L$rol8(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm4 - pxor %xmm3,%xmm4 -.byte 102,15,58,15,228,4 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,12 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb L$rol16(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm4 - pxor %xmm3,%xmm4 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb L$rol8(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm4 - pxor %xmm3,%xmm4 -.byte 102,15,58,15,228,12 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,4 - - cmpq $16,%rcx - jae L$open_sse_tail_64_rounds_and_x1hash - cmpq $160,%r8 - jne L$open_sse_tail_64_rounds - paddd L$chacha20_consts(%rip),%xmm0 - paddd 0+48(%rbp),%xmm4 - paddd 0+64(%rbp),%xmm8 - paddd 0+96(%rbp),%xmm12 - - jmp L$open_sse_tail_64_dec_loop - -L$open_sse_tail_128: - movdqa L$chacha20_consts(%rip),%xmm0 - movdqa 0+48(%rbp),%xmm4 - movdqa 0+64(%rbp),%xmm8 - movdqa %xmm0,%xmm1 - movdqa %xmm4,%xmm5 - movdqa %xmm8,%xmm9 - movdqa 0+96(%rbp),%xmm13 - paddd L$sse_inc(%rip),%xmm13 - movdqa %xmm13,%xmm12 - paddd L$sse_inc(%rip),%xmm12 - movdqa %xmm12,0+96(%rbp) - movdqa %xmm13,0+112(%rbp) - - movq %rbx,%rcx - andq $-16,%rcx - xorq %r8,%r8 -L$open_sse_tail_128_rounds_and_x1hash: - addq 0+0(%rsi,%r8,1),%r10 - adcq 8+0(%rsi,%r8,1),%r11 - 
adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - -L$open_sse_tail_128_rounds: - addq $16,%r8 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb L$rol16(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm4 - pxor %xmm3,%xmm4 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb L$rol8(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm4 - pxor %xmm3,%xmm4 -.byte 102,15,58,15,228,4 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,12 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb L$rol16(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm5 - pxor %xmm3,%xmm5 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb L$rol8(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm5 - pxor %xmm3,%xmm5 -.byte 102,15,58,15,237,4 -.byte 102,69,15,58,15,201,8 -.byte 102,69,15,58,15,237,12 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb L$rol16(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm4 - pxor %xmm3,%xmm4 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb L$rol8(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm4 - pxor %xmm3,%xmm4 -.byte 102,15,58,15,228,12 -.byte 
102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,4 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb L$rol16(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm5 - pxor %xmm3,%xmm5 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb L$rol8(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm5 - pxor %xmm3,%xmm5 -.byte 102,15,58,15,237,12 -.byte 102,69,15,58,15,201,8 -.byte 102,69,15,58,15,237,4 - - cmpq %rcx,%r8 - jb L$open_sse_tail_128_rounds_and_x1hash - cmpq $160,%r8 - jne L$open_sse_tail_128_rounds - paddd L$chacha20_consts(%rip),%xmm1 - paddd 0+48(%rbp),%xmm5 - paddd 0+64(%rbp),%xmm9 - paddd 0+112(%rbp),%xmm13 - paddd L$chacha20_consts(%rip),%xmm0 - paddd 0+48(%rbp),%xmm4 - paddd 0+64(%rbp),%xmm8 - paddd 0+96(%rbp),%xmm12 - movdqu 0 + 0(%rsi),%xmm3 - movdqu 16 + 0(%rsi),%xmm7 - movdqu 32 + 0(%rsi),%xmm11 - movdqu 48 + 0(%rsi),%xmm15 - pxor %xmm3,%xmm1 - pxor %xmm7,%xmm5 - pxor %xmm11,%xmm9 - pxor %xmm13,%xmm15 - movdqu %xmm1,0 + 0(%rdi) - movdqu %xmm5,16 + 0(%rdi) - movdqu %xmm9,32 + 0(%rdi) - movdqu %xmm15,48 + 0(%rdi) - - subq $64,%rbx - leaq 64(%rsi),%rsi - leaq 64(%rdi),%rdi - jmp L$open_sse_tail_64_dec_loop - -L$open_sse_tail_192: - movdqa L$chacha20_consts(%rip),%xmm0 - movdqa 0+48(%rbp),%xmm4 - movdqa 0+64(%rbp),%xmm8 - movdqa %xmm0,%xmm1 - movdqa %xmm4,%xmm5 - movdqa %xmm8,%xmm9 - movdqa %xmm0,%xmm2 - movdqa %xmm4,%xmm6 - movdqa %xmm8,%xmm10 - movdqa 0+96(%rbp),%xmm14 - paddd L$sse_inc(%rip),%xmm14 - movdqa %xmm14,%xmm13 - paddd L$sse_inc(%rip),%xmm13 - movdqa %xmm13,%xmm12 - paddd L$sse_inc(%rip),%xmm12 - movdqa %xmm12,0+96(%rbp) - movdqa %xmm13,0+112(%rbp) - movdqa %xmm14,0+128(%rbp) - - movq %rbx,%rcx - movq $160,%r8 - cmpq $160,%rcx - cmovgq %r8,%rcx - andq $-16,%rcx - xorq %r8,%r8 -L$open_sse_tail_192_rounds_and_x1hash: - addq 0+0(%rsi,%r8,1),%r10 - adcq 8+0(%rsi,%r8,1),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 
- movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - -L$open_sse_tail_192_rounds: - addq $16,%r8 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb L$rol16(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm4 - pxor %xmm3,%xmm4 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb L$rol8(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm4 - pxor %xmm3,%xmm4 -.byte 102,15,58,15,228,4 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,12 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb L$rol16(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm5 - pxor %xmm3,%xmm5 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb L$rol8(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm5 - pxor %xmm3,%xmm5 -.byte 102,15,58,15,237,4 -.byte 102,69,15,58,15,201,8 -.byte 102,69,15,58,15,237,12 - paddd %xmm6,%xmm2 - pxor %xmm2,%xmm14 - pshufb L$rol16(%rip),%xmm14 - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm6 - pxor %xmm3,%xmm6 - paddd %xmm6,%xmm2 - pxor %xmm2,%xmm14 - pshufb L$rol8(%rip),%xmm14 - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm6 - pxor %xmm3,%xmm6 -.byte 102,15,58,15,246,4 -.byte 102,69,15,58,15,210,8 -.byte 102,69,15,58,15,246,12 - paddd 
%xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb L$rol16(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm4 - pxor %xmm3,%xmm4 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb L$rol8(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm4 - pxor %xmm3,%xmm4 -.byte 102,15,58,15,228,12 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,4 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb L$rol16(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm5 - pxor %xmm3,%xmm5 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb L$rol8(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm5 - pxor %xmm3,%xmm5 -.byte 102,15,58,15,237,12 -.byte 102,69,15,58,15,201,8 -.byte 102,69,15,58,15,237,4 - paddd %xmm6,%xmm2 - pxor %xmm2,%xmm14 - pshufb L$rol16(%rip),%xmm14 - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm6 - pxor %xmm3,%xmm6 - paddd %xmm6,%xmm2 - pxor %xmm2,%xmm14 - pshufb L$rol8(%rip),%xmm14 - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm6 - pxor %xmm3,%xmm6 -.byte 102,15,58,15,246,12 -.byte 102,69,15,58,15,210,8 -.byte 102,69,15,58,15,246,4 - - cmpq %rcx,%r8 - jb L$open_sse_tail_192_rounds_and_x1hash - cmpq $160,%r8 - jne L$open_sse_tail_192_rounds - cmpq $176,%rbx - jb L$open_sse_tail_192_finish - addq 0+160(%rsi),%r10 - adcq 8+160(%rsi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - 
movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - cmpq $192,%rbx - jb L$open_sse_tail_192_finish - addq 0+176(%rsi),%r10 - adcq 8+176(%rsi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - -L$open_sse_tail_192_finish: - paddd L$chacha20_consts(%rip),%xmm2 - paddd 0+48(%rbp),%xmm6 - paddd 0+64(%rbp),%xmm10 - paddd 0+128(%rbp),%xmm14 - paddd L$chacha20_consts(%rip),%xmm1 - paddd 0+48(%rbp),%xmm5 - paddd 0+64(%rbp),%xmm9 - paddd 0+112(%rbp),%xmm13 - paddd L$chacha20_consts(%rip),%xmm0 - paddd 0+48(%rbp),%xmm4 - paddd 0+64(%rbp),%xmm8 - paddd 0+96(%rbp),%xmm12 - movdqu 0 + 0(%rsi),%xmm3 - movdqu 16 + 0(%rsi),%xmm7 - movdqu 32 + 0(%rsi),%xmm11 - movdqu 48 + 0(%rsi),%xmm15 - pxor %xmm3,%xmm2 - pxor %xmm7,%xmm6 - pxor %xmm11,%xmm10 - pxor %xmm14,%xmm15 - movdqu %xmm2,0 + 0(%rdi) - movdqu %xmm6,16 + 0(%rdi) - movdqu %xmm10,32 + 0(%rdi) - movdqu %xmm15,48 + 0(%rdi) - movdqu 0 + 64(%rsi),%xmm3 - movdqu 16 + 64(%rsi),%xmm7 - movdqu 32 + 64(%rsi),%xmm11 - movdqu 48 + 64(%rsi),%xmm15 - pxor %xmm3,%xmm1 - pxor %xmm7,%xmm5 - pxor %xmm11,%xmm9 - pxor %xmm13,%xmm15 - movdqu %xmm1,0 + 64(%rdi) - movdqu %xmm5,16 + 64(%rdi) - movdqu %xmm9,32 + 64(%rdi) - movdqu %xmm15,48 + 64(%rdi) - - subq $128,%rbx - leaq 128(%rsi),%rsi - leaq 
128(%rdi),%rdi - jmp L$open_sse_tail_64_dec_loop - -L$open_sse_tail_256: - movdqa L$chacha20_consts(%rip),%xmm0 - movdqa 0+48(%rbp),%xmm4 - movdqa 0+64(%rbp),%xmm8 - movdqa %xmm0,%xmm1 - movdqa %xmm4,%xmm5 - movdqa %xmm8,%xmm9 - movdqa %xmm0,%xmm2 - movdqa %xmm4,%xmm6 - movdqa %xmm8,%xmm10 - movdqa %xmm0,%xmm3 - movdqa %xmm4,%xmm7 - movdqa %xmm8,%xmm11 - movdqa 0+96(%rbp),%xmm15 - paddd L$sse_inc(%rip),%xmm15 - movdqa %xmm15,%xmm14 - paddd L$sse_inc(%rip),%xmm14 - movdqa %xmm14,%xmm13 - paddd L$sse_inc(%rip),%xmm13 - movdqa %xmm13,%xmm12 - paddd L$sse_inc(%rip),%xmm12 - movdqa %xmm12,0+96(%rbp) - movdqa %xmm13,0+112(%rbp) - movdqa %xmm14,0+128(%rbp) - movdqa %xmm15,0+144(%rbp) - - xorq %r8,%r8 -L$open_sse_tail_256_rounds_and_x1hash: - addq 0+0(%rsi,%r8,1),%r10 - adcq 8+0(%rsi,%r8,1),%r11 - adcq $1,%r12 - movdqa %xmm11,0+80(%rbp) - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb L$rol16(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm11 - pslld $12,%xmm11 - psrld $20,%xmm4 - pxor %xmm11,%xmm4 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb L$rol8(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm11 - pslld $7,%xmm11 - psrld $25,%xmm4 - pxor %xmm11,%xmm4 -.byte 102,15,58,15,228,4 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,12 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb L$rol16(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm11 - pslld $12,%xmm11 - psrld $20,%xmm5 - pxor %xmm11,%xmm5 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb L$rol8(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm11 - pslld $7,%xmm11 - psrld $25,%xmm5 - pxor %xmm11,%xmm5 -.byte 102,15,58,15,237,4 -.byte 102,69,15,58,15,201,8 -.byte 102,69,15,58,15,237,12 - paddd %xmm6,%xmm2 - pxor %xmm2,%xmm14 - pshufb L$rol16(%rip),%xmm14 - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm11 - pslld $12,%xmm11 - psrld $20,%xmm6 - pxor %xmm11,%xmm6 - paddd %xmm6,%xmm2 - pxor %xmm2,%xmm14 - 
pshufb L$rol8(%rip),%xmm14 - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm11 - pslld $7,%xmm11 - psrld $25,%xmm6 - pxor %xmm11,%xmm6 -.byte 102,15,58,15,246,4 -.byte 102,69,15,58,15,210,8 -.byte 102,69,15,58,15,246,12 - movdqa 0+80(%rbp),%xmm11 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movdqa %xmm9,0+80(%rbp) - paddd %xmm7,%xmm3 - pxor %xmm3,%xmm15 - pshufb L$rol16(%rip),%xmm15 - paddd %xmm15,%xmm11 - pxor %xmm11,%xmm7 - movdqa %xmm7,%xmm9 - pslld $12,%xmm9 - psrld $20,%xmm7 - pxor %xmm9,%xmm7 - paddd %xmm7,%xmm3 - pxor %xmm3,%xmm15 - pshufb L$rol8(%rip),%xmm15 - paddd %xmm15,%xmm11 - pxor %xmm11,%xmm7 - movdqa %xmm7,%xmm9 - pslld $7,%xmm9 - psrld $25,%xmm7 - pxor %xmm9,%xmm7 -.byte 102,15,58,15,255,4 -.byte 102,69,15,58,15,219,8 -.byte 102,69,15,58,15,255,12 - movdqa 0+80(%rbp),%xmm9 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - movdqa %xmm11,0+80(%rbp) - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb L$rol16(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm11 - pslld $12,%xmm11 - psrld $20,%xmm4 - pxor %xmm11,%xmm4 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb L$rol8(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm11 - pslld $7,%xmm11 - psrld $25,%xmm4 - pxor %xmm11,%xmm4 -.byte 102,15,58,15,228,12 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,4 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb L$rol16(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm11 - pslld $12,%xmm11 - psrld $20,%xmm5 - pxor %xmm11,%xmm5 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb L$rol8(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm11 - pslld $7,%xmm11 - psrld $25,%xmm5 - pxor %xmm11,%xmm5 -.byte 
102,15,58,15,237,12 -.byte 102,69,15,58,15,201,8 -.byte 102,69,15,58,15,237,4 - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - paddd %xmm6,%xmm2 - pxor %xmm2,%xmm14 - pshufb L$rol16(%rip),%xmm14 - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm11 - pslld $12,%xmm11 - psrld $20,%xmm6 - pxor %xmm11,%xmm6 - paddd %xmm6,%xmm2 - pxor %xmm2,%xmm14 - pshufb L$rol8(%rip),%xmm14 - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm11 - pslld $7,%xmm11 - psrld $25,%xmm6 - pxor %xmm11,%xmm6 -.byte 102,15,58,15,246,12 -.byte 102,69,15,58,15,210,8 -.byte 102,69,15,58,15,246,4 - movdqa 0+80(%rbp),%xmm11 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - movdqa %xmm9,0+80(%rbp) - paddd %xmm7,%xmm3 - pxor %xmm3,%xmm15 - pshufb L$rol16(%rip),%xmm15 - paddd %xmm15,%xmm11 - pxor %xmm11,%xmm7 - movdqa %xmm7,%xmm9 - pslld $12,%xmm9 - psrld $20,%xmm7 - pxor %xmm9,%xmm7 - paddd %xmm7,%xmm3 - pxor %xmm3,%xmm15 - pshufb L$rol8(%rip),%xmm15 - paddd %xmm15,%xmm11 - pxor %xmm11,%xmm7 - movdqa %xmm7,%xmm9 - pslld $7,%xmm9 - psrld $25,%xmm7 - pxor %xmm9,%xmm7 -.byte 102,15,58,15,255,12 -.byte 102,69,15,58,15,219,8 -.byte 102,69,15,58,15,255,4 - movdqa 0+80(%rbp),%xmm9 - - addq $16,%r8 - cmpq $160,%r8 - jb L$open_sse_tail_256_rounds_and_x1hash - - movq %rbx,%rcx - andq $-16,%rcx -L$open_sse_tail_256_hash: - addq 0+0(%rsi,%r8,1),%r10 - adcq 8+0(%rsi,%r8,1),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 
- movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - addq $16,%r8 - cmpq %rcx,%r8 - jb L$open_sse_tail_256_hash - paddd L$chacha20_consts(%rip),%xmm3 - paddd 0+48(%rbp),%xmm7 - paddd 0+64(%rbp),%xmm11 - paddd 0+144(%rbp),%xmm15 - paddd L$chacha20_consts(%rip),%xmm2 - paddd 0+48(%rbp),%xmm6 - paddd 0+64(%rbp),%xmm10 - paddd 0+128(%rbp),%xmm14 - paddd L$chacha20_consts(%rip),%xmm1 - paddd 0+48(%rbp),%xmm5 - paddd 0+64(%rbp),%xmm9 - paddd 0+112(%rbp),%xmm13 - paddd L$chacha20_consts(%rip),%xmm0 - paddd 0+48(%rbp),%xmm4 - paddd 0+64(%rbp),%xmm8 - paddd 0+96(%rbp),%xmm12 - movdqa %xmm12,0+80(%rbp) - movdqu 0 + 0(%rsi),%xmm12 - pxor %xmm3,%xmm12 - movdqu %xmm12,0 + 0(%rdi) - movdqu 16 + 0(%rsi),%xmm12 - pxor %xmm7,%xmm12 - movdqu %xmm12,16 + 0(%rdi) - movdqu 32 + 0(%rsi),%xmm12 - pxor %xmm11,%xmm12 - movdqu %xmm12,32 + 0(%rdi) - movdqu 48 + 0(%rsi),%xmm12 - pxor %xmm15,%xmm12 - movdqu %xmm12,48 + 0(%rdi) - movdqu 0 + 64(%rsi),%xmm3 - movdqu 16 + 64(%rsi),%xmm7 - movdqu 32 + 64(%rsi),%xmm11 - movdqu 48 + 64(%rsi),%xmm15 - pxor %xmm3,%xmm2 - pxor %xmm7,%xmm6 - pxor %xmm11,%xmm10 - pxor %xmm14,%xmm15 - movdqu %xmm2,0 + 64(%rdi) - movdqu %xmm6,16 + 64(%rdi) - movdqu %xmm10,32 + 64(%rdi) - movdqu %xmm15,48 + 64(%rdi) - movdqu 0 + 128(%rsi),%xmm3 - movdqu 16 + 128(%rsi),%xmm7 - movdqu 32 + 128(%rsi),%xmm11 - movdqu 48 + 128(%rsi),%xmm15 - pxor %xmm3,%xmm1 - pxor %xmm7,%xmm5 - pxor %xmm11,%xmm9 - pxor %xmm13,%xmm15 - movdqu %xmm1,0 + 128(%rdi) - movdqu %xmm5,16 + 128(%rdi) - movdqu %xmm9,32 + 128(%rdi) - movdqu %xmm15,48 + 128(%rdi) - - movdqa 0+80(%rbp),%xmm12 - subq $192,%rbx - leaq 192(%rsi),%rsi - leaq 192(%rdi),%rdi - - -L$open_sse_tail_64_dec_loop: - cmpq $16,%rbx - jb L$open_sse_tail_16_init - subq $16,%rbx - movdqu (%rsi),%xmm3 - pxor %xmm3,%xmm0 - movdqu %xmm0,(%rdi) - leaq 16(%rsi),%rsi - leaq 16(%rdi),%rdi - movdqa 
%xmm4,%xmm0 - movdqa %xmm8,%xmm4 - movdqa %xmm12,%xmm8 - jmp L$open_sse_tail_64_dec_loop -L$open_sse_tail_16_init: - movdqa %xmm0,%xmm1 - - -L$open_sse_tail_16: - testq %rbx,%rbx - jz L$open_sse_finalize - - - - pxor %xmm3,%xmm3 - leaq -1(%rsi,%rbx,1),%rsi - movq %rbx,%r8 -L$open_sse_tail_16_compose: - pslldq $1,%xmm3 - pinsrb $0,(%rsi),%xmm3 - subq $1,%rsi - subq $1,%r8 - jnz L$open_sse_tail_16_compose - -.byte 102,73,15,126,221 - pextrq $1,%xmm3,%r14 - - pxor %xmm1,%xmm3 - - -L$open_sse_tail_16_extract: - pextrb $0,%xmm3,(%rdi) - psrldq $1,%xmm3 - addq $1,%rdi - subq $1,%rbx - jne L$open_sse_tail_16_extract - - addq %r13,%r10 - adcq %r14,%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - -L$open_sse_finalize: - addq 0+0+32(%rbp),%r10 - adcq 8+0+32(%rbp),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq 
$0,%r12 - - - movq %r10,%r13 - movq %r11,%r14 - movq %r12,%r15 - subq $-5,%r10 - sbbq $-1,%r11 - sbbq $3,%r12 - cmovcq %r13,%r10 - cmovcq %r14,%r11 - cmovcq %r15,%r12 - - addq 0+0+16(%rbp),%r10 - adcq 8+0+16(%rbp),%r11 - - - addq $288 + 0 + 32,%rsp - - - popq %r9 - - movq %r10,(%r9) - movq %r11,8(%r9) - popq %r15 - - popq %r14 - - popq %r13 - - popq %r12 - - popq %rbx - - popq %rbp - - .byte 0xf3,0xc3 - -L$open_sse_128: - - movdqu L$chacha20_consts(%rip),%xmm0 - movdqa %xmm0,%xmm1 - movdqa %xmm0,%xmm2 - movdqu 0(%r9),%xmm4 - movdqa %xmm4,%xmm5 - movdqa %xmm4,%xmm6 - movdqu 16(%r9),%xmm8 - movdqa %xmm8,%xmm9 - movdqa %xmm8,%xmm10 - movdqu 32(%r9),%xmm12 - movdqa %xmm12,%xmm13 - paddd L$sse_inc(%rip),%xmm13 - movdqa %xmm13,%xmm14 - paddd L$sse_inc(%rip),%xmm14 - movdqa %xmm4,%xmm7 - movdqa %xmm8,%xmm11 - movdqa %xmm13,%xmm15 - movq $10,%r10 - -L$open_sse_128_rounds: - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb L$rol16(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm4 - pxor %xmm3,%xmm4 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb L$rol8(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm4 - pxor %xmm3,%xmm4 -.byte 102,15,58,15,228,4 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,12 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb L$rol16(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm5 - pxor %xmm3,%xmm5 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb L$rol8(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm5 - pxor %xmm3,%xmm5 -.byte 102,15,58,15,237,4 -.byte 102,69,15,58,15,201,8 -.byte 102,69,15,58,15,237,12 - paddd %xmm6,%xmm2 - pxor %xmm2,%xmm14 - pshufb L$rol16(%rip),%xmm14 - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm6 - pxor %xmm3,%xmm6 - paddd %xmm6,%xmm2 - 
pxor %xmm2,%xmm14 - pshufb L$rol8(%rip),%xmm14 - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm6 - pxor %xmm3,%xmm6 -.byte 102,15,58,15,246,4 -.byte 102,69,15,58,15,210,8 -.byte 102,69,15,58,15,246,12 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb L$rol16(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm4 - pxor %xmm3,%xmm4 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb L$rol8(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm4 - pxor %xmm3,%xmm4 -.byte 102,15,58,15,228,12 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,4 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb L$rol16(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm5 - pxor %xmm3,%xmm5 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb L$rol8(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm5 - pxor %xmm3,%xmm5 -.byte 102,15,58,15,237,12 -.byte 102,69,15,58,15,201,8 -.byte 102,69,15,58,15,237,4 - paddd %xmm6,%xmm2 - pxor %xmm2,%xmm14 - pshufb L$rol16(%rip),%xmm14 - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm6 - pxor %xmm3,%xmm6 - paddd %xmm6,%xmm2 - pxor %xmm2,%xmm14 - pshufb L$rol8(%rip),%xmm14 - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm6 - pxor %xmm3,%xmm6 -.byte 102,15,58,15,246,12 -.byte 102,69,15,58,15,210,8 -.byte 102,69,15,58,15,246,4 - - decq %r10 - jnz L$open_sse_128_rounds - paddd L$chacha20_consts(%rip),%xmm0 - paddd L$chacha20_consts(%rip),%xmm1 - paddd L$chacha20_consts(%rip),%xmm2 - paddd %xmm7,%xmm4 - paddd %xmm7,%xmm5 - paddd %xmm7,%xmm6 - paddd %xmm11,%xmm9 - paddd %xmm11,%xmm10 - paddd %xmm15,%xmm13 - paddd L$sse_inc(%rip),%xmm15 - paddd %xmm15,%xmm14 - - pand L$clamp(%rip),%xmm0 - movdqa 
%xmm0,0+0(%rbp) - movdqa %xmm4,0+16(%rbp) - - movq %r8,%r8 - call poly_hash_ad_internal -L$open_sse_128_xor_hash: - cmpq $16,%rbx - jb L$open_sse_tail_16 - subq $16,%rbx - addq 0+0(%rsi),%r10 - adcq 8+0(%rsi),%r11 - adcq $1,%r12 - - - movdqu 0(%rsi),%xmm3 - pxor %xmm3,%xmm1 - movdqu %xmm1,0(%rdi) - leaq 16(%rsi),%rsi - leaq 16(%rdi),%rdi - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - - movdqa %xmm5,%xmm1 - movdqa %xmm9,%xmm5 - movdqa %xmm13,%xmm9 - movdqa %xmm2,%xmm13 - movdqa %xmm6,%xmm2 - movdqa %xmm10,%xmm6 - movdqa %xmm14,%xmm10 - jmp L$open_sse_128_xor_hash - - - - - - - - - -.globl _chacha20_poly1305_seal -.private_extern _chacha20_poly1305_seal - -.p2align 6 -_chacha20_poly1305_seal: - - pushq %rbp - - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - - - pushq %r9 - - subq $288 + 0 + 32,%rsp - - leaq 32(%rsp),%rbp - andq $-32,%rbp - - movq 56(%r9),%rbx - addq %rdx,%rbx - movq %r8,0+0+32(%rbp) - movq %rbx,8+0+32(%rbp) - movq %rdx,%rbx - - movl _OPENSSL_ia32cap_P+8(%rip),%eax - andl $288,%eax - xorl $288,%eax - jz chacha20_poly1305_seal_avx2 - - cmpq $128,%rbx - jbe L$seal_sse_128 - - movdqa L$chacha20_consts(%rip),%xmm0 - movdqu 0(%r9),%xmm4 - movdqu 16(%r9),%xmm8 - movdqu 32(%r9),%xmm12 - - movdqa %xmm0,%xmm1 - movdqa %xmm0,%xmm2 - movdqa %xmm0,%xmm3 - movdqa %xmm4,%xmm5 - movdqa %xmm4,%xmm6 - movdqa %xmm4,%xmm7 - movdqa %xmm8,%xmm9 - movdqa %xmm8,%xmm10 - movdqa 
%xmm8,%xmm11 - movdqa %xmm12,%xmm15 - paddd L$sse_inc(%rip),%xmm12 - movdqa %xmm12,%xmm14 - paddd L$sse_inc(%rip),%xmm12 - movdqa %xmm12,%xmm13 - paddd L$sse_inc(%rip),%xmm12 - - movdqa %xmm4,0+48(%rbp) - movdqa %xmm8,0+64(%rbp) - movdqa %xmm12,0+96(%rbp) - movdqa %xmm13,0+112(%rbp) - movdqa %xmm14,0+128(%rbp) - movdqa %xmm15,0+144(%rbp) - movq $10,%r10 -L$seal_sse_init_rounds: - movdqa %xmm8,0+80(%rbp) - movdqa L$rol16(%rip),%xmm8 - paddd %xmm7,%xmm3 - paddd %xmm6,%xmm2 - paddd %xmm5,%xmm1 - paddd %xmm4,%xmm0 - pxor %xmm3,%xmm15 - pxor %xmm2,%xmm14 - pxor %xmm1,%xmm13 - pxor %xmm0,%xmm12 -.byte 102,69,15,56,0,248 -.byte 102,69,15,56,0,240 -.byte 102,69,15,56,0,232 -.byte 102,69,15,56,0,224 - movdqa 0+80(%rbp),%xmm8 - paddd %xmm15,%xmm11 - paddd %xmm14,%xmm10 - paddd %xmm13,%xmm9 - paddd %xmm12,%xmm8 - pxor %xmm11,%xmm7 - pxor %xmm10,%xmm6 - pxor %xmm9,%xmm5 - pxor %xmm8,%xmm4 - movdqa %xmm8,0+80(%rbp) - movdqa %xmm7,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm7 - pxor %xmm8,%xmm7 - movdqa %xmm6,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm6 - pxor %xmm8,%xmm6 - movdqa %xmm5,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm5 - pxor %xmm8,%xmm5 - movdqa %xmm4,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm4 - pxor %xmm8,%xmm4 - movdqa L$rol8(%rip),%xmm8 - paddd %xmm7,%xmm3 - paddd %xmm6,%xmm2 - paddd %xmm5,%xmm1 - paddd %xmm4,%xmm0 - pxor %xmm3,%xmm15 - pxor %xmm2,%xmm14 - pxor %xmm1,%xmm13 - pxor %xmm0,%xmm12 -.byte 102,69,15,56,0,248 -.byte 102,69,15,56,0,240 -.byte 102,69,15,56,0,232 -.byte 102,69,15,56,0,224 - movdqa 0+80(%rbp),%xmm8 - paddd %xmm15,%xmm11 - paddd %xmm14,%xmm10 - paddd %xmm13,%xmm9 - paddd %xmm12,%xmm8 - pxor %xmm11,%xmm7 - pxor %xmm10,%xmm6 - pxor %xmm9,%xmm5 - pxor %xmm8,%xmm4 - movdqa %xmm8,0+80(%rbp) - movdqa %xmm7,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm7 - pxor %xmm8,%xmm7 - movdqa %xmm6,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm6 - pxor %xmm8,%xmm6 - movdqa %xmm5,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm5 - pxor %xmm8,%xmm5 - movdqa %xmm4,%xmm8 - 
psrld $25,%xmm8 - pslld $32-25,%xmm4 - pxor %xmm8,%xmm4 - movdqa 0+80(%rbp),%xmm8 -.byte 102,15,58,15,255,4 -.byte 102,69,15,58,15,219,8 -.byte 102,69,15,58,15,255,12 -.byte 102,15,58,15,246,4 -.byte 102,69,15,58,15,210,8 -.byte 102,69,15,58,15,246,12 -.byte 102,15,58,15,237,4 -.byte 102,69,15,58,15,201,8 -.byte 102,69,15,58,15,237,12 -.byte 102,15,58,15,228,4 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,12 - movdqa %xmm8,0+80(%rbp) - movdqa L$rol16(%rip),%xmm8 - paddd %xmm7,%xmm3 - paddd %xmm6,%xmm2 - paddd %xmm5,%xmm1 - paddd %xmm4,%xmm0 - pxor %xmm3,%xmm15 - pxor %xmm2,%xmm14 - pxor %xmm1,%xmm13 - pxor %xmm0,%xmm12 -.byte 102,69,15,56,0,248 -.byte 102,69,15,56,0,240 -.byte 102,69,15,56,0,232 -.byte 102,69,15,56,0,224 - movdqa 0+80(%rbp),%xmm8 - paddd %xmm15,%xmm11 - paddd %xmm14,%xmm10 - paddd %xmm13,%xmm9 - paddd %xmm12,%xmm8 - pxor %xmm11,%xmm7 - pxor %xmm10,%xmm6 - pxor %xmm9,%xmm5 - pxor %xmm8,%xmm4 - movdqa %xmm8,0+80(%rbp) - movdqa %xmm7,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm7 - pxor %xmm8,%xmm7 - movdqa %xmm6,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm6 - pxor %xmm8,%xmm6 - movdqa %xmm5,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm5 - pxor %xmm8,%xmm5 - movdqa %xmm4,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm4 - pxor %xmm8,%xmm4 - movdqa L$rol8(%rip),%xmm8 - paddd %xmm7,%xmm3 - paddd %xmm6,%xmm2 - paddd %xmm5,%xmm1 - paddd %xmm4,%xmm0 - pxor %xmm3,%xmm15 - pxor %xmm2,%xmm14 - pxor %xmm1,%xmm13 - pxor %xmm0,%xmm12 -.byte 102,69,15,56,0,248 -.byte 102,69,15,56,0,240 -.byte 102,69,15,56,0,232 -.byte 102,69,15,56,0,224 - movdqa 0+80(%rbp),%xmm8 - paddd %xmm15,%xmm11 - paddd %xmm14,%xmm10 - paddd %xmm13,%xmm9 - paddd %xmm12,%xmm8 - pxor %xmm11,%xmm7 - pxor %xmm10,%xmm6 - pxor %xmm9,%xmm5 - pxor %xmm8,%xmm4 - movdqa %xmm8,0+80(%rbp) - movdqa %xmm7,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm7 - pxor %xmm8,%xmm7 - movdqa %xmm6,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm6 - pxor %xmm8,%xmm6 - movdqa %xmm5,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm5 
- pxor %xmm8,%xmm5 - movdqa %xmm4,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm4 - pxor %xmm8,%xmm4 - movdqa 0+80(%rbp),%xmm8 -.byte 102,15,58,15,255,12 -.byte 102,69,15,58,15,219,8 -.byte 102,69,15,58,15,255,4 -.byte 102,15,58,15,246,12 -.byte 102,69,15,58,15,210,8 -.byte 102,69,15,58,15,246,4 -.byte 102,15,58,15,237,12 -.byte 102,69,15,58,15,201,8 -.byte 102,69,15,58,15,237,4 -.byte 102,15,58,15,228,12 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,4 - - decq %r10 - jnz L$seal_sse_init_rounds - paddd L$chacha20_consts(%rip),%xmm3 - paddd 0+48(%rbp),%xmm7 - paddd 0+64(%rbp),%xmm11 - paddd 0+144(%rbp),%xmm15 - paddd L$chacha20_consts(%rip),%xmm2 - paddd 0+48(%rbp),%xmm6 - paddd 0+64(%rbp),%xmm10 - paddd 0+128(%rbp),%xmm14 - paddd L$chacha20_consts(%rip),%xmm1 - paddd 0+48(%rbp),%xmm5 - paddd 0+64(%rbp),%xmm9 - paddd 0+112(%rbp),%xmm13 - paddd L$chacha20_consts(%rip),%xmm0 - paddd 0+48(%rbp),%xmm4 - paddd 0+64(%rbp),%xmm8 - paddd 0+96(%rbp),%xmm12 - - - pand L$clamp(%rip),%xmm3 - movdqa %xmm3,0+0(%rbp) - movdqa %xmm7,0+16(%rbp) - - movq %r8,%r8 - call poly_hash_ad_internal - movdqu 0 + 0(%rsi),%xmm3 - movdqu 16 + 0(%rsi),%xmm7 - movdqu 32 + 0(%rsi),%xmm11 - movdqu 48 + 0(%rsi),%xmm15 - pxor %xmm3,%xmm2 - pxor %xmm7,%xmm6 - pxor %xmm11,%xmm10 - pxor %xmm14,%xmm15 - movdqu %xmm2,0 + 0(%rdi) - movdqu %xmm6,16 + 0(%rdi) - movdqu %xmm10,32 + 0(%rdi) - movdqu %xmm15,48 + 0(%rdi) - movdqu 0 + 64(%rsi),%xmm3 - movdqu 16 + 64(%rsi),%xmm7 - movdqu 32 + 64(%rsi),%xmm11 - movdqu 48 + 64(%rsi),%xmm15 - pxor %xmm3,%xmm1 - pxor %xmm7,%xmm5 - pxor %xmm11,%xmm9 - pxor %xmm13,%xmm15 - movdqu %xmm1,0 + 64(%rdi) - movdqu %xmm5,16 + 64(%rdi) - movdqu %xmm9,32 + 64(%rdi) - movdqu %xmm15,48 + 64(%rdi) - - cmpq $192,%rbx - ja L$seal_sse_main_init - movq $128,%rcx - subq $128,%rbx - leaq 128(%rsi),%rsi - jmp L$seal_sse_128_tail_hash -L$seal_sse_main_init: - movdqu 0 + 128(%rsi),%xmm3 - movdqu 16 + 128(%rsi),%xmm7 - movdqu 32 + 128(%rsi),%xmm11 - movdqu 48 + 128(%rsi),%xmm15 - pxor 
%xmm3,%xmm0 - pxor %xmm7,%xmm4 - pxor %xmm11,%xmm8 - pxor %xmm12,%xmm15 - movdqu %xmm0,0 + 128(%rdi) - movdqu %xmm4,16 + 128(%rdi) - movdqu %xmm8,32 + 128(%rdi) - movdqu %xmm15,48 + 128(%rdi) - - movq $192,%rcx - subq $192,%rbx - leaq 192(%rsi),%rsi - movq $2,%rcx - movq $8,%r8 - cmpq $64,%rbx - jbe L$seal_sse_tail_64 - cmpq $128,%rbx - jbe L$seal_sse_tail_128 - cmpq $192,%rbx - jbe L$seal_sse_tail_192 - -L$seal_sse_main_loop: - movdqa L$chacha20_consts(%rip),%xmm0 - movdqa 0+48(%rbp),%xmm4 - movdqa 0+64(%rbp),%xmm8 - movdqa %xmm0,%xmm1 - movdqa %xmm4,%xmm5 - movdqa %xmm8,%xmm9 - movdqa %xmm0,%xmm2 - movdqa %xmm4,%xmm6 - movdqa %xmm8,%xmm10 - movdqa %xmm0,%xmm3 - movdqa %xmm4,%xmm7 - movdqa %xmm8,%xmm11 - movdqa 0+96(%rbp),%xmm15 - paddd L$sse_inc(%rip),%xmm15 - movdqa %xmm15,%xmm14 - paddd L$sse_inc(%rip),%xmm14 - movdqa %xmm14,%xmm13 - paddd L$sse_inc(%rip),%xmm13 - movdqa %xmm13,%xmm12 - paddd L$sse_inc(%rip),%xmm12 - movdqa %xmm12,0+96(%rbp) - movdqa %xmm13,0+112(%rbp) - movdqa %xmm14,0+128(%rbp) - movdqa %xmm15,0+144(%rbp) - -.p2align 5 -L$seal_sse_main_rounds: - movdqa %xmm8,0+80(%rbp) - movdqa L$rol16(%rip),%xmm8 - paddd %xmm7,%xmm3 - paddd %xmm6,%xmm2 - paddd %xmm5,%xmm1 - paddd %xmm4,%xmm0 - pxor %xmm3,%xmm15 - pxor %xmm2,%xmm14 - pxor %xmm1,%xmm13 - pxor %xmm0,%xmm12 -.byte 102,69,15,56,0,248 -.byte 102,69,15,56,0,240 -.byte 102,69,15,56,0,232 -.byte 102,69,15,56,0,224 - movdqa 0+80(%rbp),%xmm8 - paddd %xmm15,%xmm11 - paddd %xmm14,%xmm10 - paddd %xmm13,%xmm9 - paddd %xmm12,%xmm8 - pxor %xmm11,%xmm7 - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - pxor %xmm10,%xmm6 - pxor %xmm9,%xmm5 - pxor %xmm8,%xmm4 - movdqa %xmm8,0+80(%rbp) - movdqa %xmm7,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm7 - pxor %xmm8,%xmm7 - movdqa %xmm6,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm6 - pxor %xmm8,%xmm6 - movdqa %xmm5,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm5 - pxor %xmm8,%xmm5 - movdqa %xmm4,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm4 - pxor %xmm8,%xmm4 - movq 
0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movdqa L$rol8(%rip),%xmm8 - paddd %xmm7,%xmm3 - paddd %xmm6,%xmm2 - paddd %xmm5,%xmm1 - paddd %xmm4,%xmm0 - pxor %xmm3,%xmm15 - pxor %xmm2,%xmm14 - pxor %xmm1,%xmm13 - pxor %xmm0,%xmm12 -.byte 102,69,15,56,0,248 -.byte 102,69,15,56,0,240 -.byte 102,69,15,56,0,232 -.byte 102,69,15,56,0,224 - movdqa 0+80(%rbp),%xmm8 - paddd %xmm15,%xmm11 - paddd %xmm14,%xmm10 - paddd %xmm13,%xmm9 - paddd %xmm12,%xmm8 - pxor %xmm11,%xmm7 - pxor %xmm10,%xmm6 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - pxor %xmm9,%xmm5 - pxor %xmm8,%xmm4 - movdqa %xmm8,0+80(%rbp) - movdqa %xmm7,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm7 - pxor %xmm8,%xmm7 - movdqa %xmm6,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm6 - pxor %xmm8,%xmm6 - movdqa %xmm5,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm5 - pxor %xmm8,%xmm5 - movdqa %xmm4,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm4 - pxor %xmm8,%xmm4 - movdqa 0+80(%rbp),%xmm8 - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 -.byte 102,15,58,15,255,4 -.byte 102,69,15,58,15,219,8 -.byte 102,69,15,58,15,255,12 -.byte 102,15,58,15,246,4 -.byte 102,69,15,58,15,210,8 -.byte 102,69,15,58,15,246,12 -.byte 102,15,58,15,237,4 -.byte 102,69,15,58,15,201,8 -.byte 102,69,15,58,15,237,12 -.byte 102,15,58,15,228,4 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,12 - movdqa %xmm8,0+80(%rbp) - movdqa L$rol16(%rip),%xmm8 - paddd %xmm7,%xmm3 - paddd %xmm6,%xmm2 - paddd %xmm5,%xmm1 - paddd %xmm4,%xmm0 - pxor %xmm3,%xmm15 - pxor %xmm2,%xmm14 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - pxor 
%xmm1,%xmm13 - pxor %xmm0,%xmm12 -.byte 102,69,15,56,0,248 -.byte 102,69,15,56,0,240 -.byte 102,69,15,56,0,232 -.byte 102,69,15,56,0,224 - movdqa 0+80(%rbp),%xmm8 - paddd %xmm15,%xmm11 - paddd %xmm14,%xmm10 - paddd %xmm13,%xmm9 - paddd %xmm12,%xmm8 - pxor %xmm11,%xmm7 - pxor %xmm10,%xmm6 - pxor %xmm9,%xmm5 - pxor %xmm8,%xmm4 - movdqa %xmm8,0+80(%rbp) - movdqa %xmm7,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm7 - pxor %xmm8,%xmm7 - movdqa %xmm6,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm6 - pxor %xmm8,%xmm6 - movdqa %xmm5,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm5 - pxor %xmm8,%xmm5 - movdqa %xmm4,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm4 - pxor %xmm8,%xmm4 - movdqa L$rol8(%rip),%xmm8 - paddd %xmm7,%xmm3 - paddd %xmm6,%xmm2 - paddd %xmm5,%xmm1 - paddd %xmm4,%xmm0 - pxor %xmm3,%xmm15 - pxor %xmm2,%xmm14 - pxor %xmm1,%xmm13 - pxor %xmm0,%xmm12 -.byte 102,69,15,56,0,248 -.byte 102,69,15,56,0,240 -.byte 102,69,15,56,0,232 -.byte 102,69,15,56,0,224 - movdqa 0+80(%rbp),%xmm8 - paddd %xmm15,%xmm11 - paddd %xmm14,%xmm10 - paddd %xmm13,%xmm9 - paddd %xmm12,%xmm8 - pxor %xmm11,%xmm7 - pxor %xmm10,%xmm6 - pxor %xmm9,%xmm5 - pxor %xmm8,%xmm4 - movdqa %xmm8,0+80(%rbp) - movdqa %xmm7,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm7 - pxor %xmm8,%xmm7 - movdqa %xmm6,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm6 - pxor %xmm8,%xmm6 - movdqa %xmm5,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm5 - pxor %xmm8,%xmm5 - movdqa %xmm4,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm4 - pxor %xmm8,%xmm4 - movdqa 0+80(%rbp),%xmm8 -.byte 102,15,58,15,255,12 -.byte 102,69,15,58,15,219,8 -.byte 102,69,15,58,15,255,4 -.byte 102,15,58,15,246,12 -.byte 102,69,15,58,15,210,8 -.byte 102,69,15,58,15,246,4 -.byte 102,15,58,15,237,12 -.byte 102,69,15,58,15,201,8 -.byte 102,69,15,58,15,237,4 -.byte 102,15,58,15,228,12 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,4 - - leaq 16(%rdi),%rdi - decq %r8 - jge L$seal_sse_main_rounds - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - movq 
0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 16(%rdi),%rdi - decq %rcx - jg L$seal_sse_main_rounds - paddd L$chacha20_consts(%rip),%xmm3 - paddd 0+48(%rbp),%xmm7 - paddd 0+64(%rbp),%xmm11 - paddd 0+144(%rbp),%xmm15 - paddd L$chacha20_consts(%rip),%xmm2 - paddd 0+48(%rbp),%xmm6 - paddd 0+64(%rbp),%xmm10 - paddd 0+128(%rbp),%xmm14 - paddd L$chacha20_consts(%rip),%xmm1 - paddd 0+48(%rbp),%xmm5 - paddd 0+64(%rbp),%xmm9 - paddd 0+112(%rbp),%xmm13 - paddd L$chacha20_consts(%rip),%xmm0 - paddd 0+48(%rbp),%xmm4 - paddd 0+64(%rbp),%xmm8 - paddd 0+96(%rbp),%xmm12 - - movdqa %xmm14,0+80(%rbp) - movdqa %xmm14,0+80(%rbp) - movdqu 0 + 0(%rsi),%xmm14 - pxor %xmm3,%xmm14 - movdqu %xmm14,0 + 0(%rdi) - movdqu 16 + 0(%rsi),%xmm14 - pxor %xmm7,%xmm14 - movdqu %xmm14,16 + 0(%rdi) - movdqu 32 + 0(%rsi),%xmm14 - pxor %xmm11,%xmm14 - movdqu %xmm14,32 + 0(%rdi) - movdqu 48 + 0(%rsi),%xmm14 - pxor %xmm15,%xmm14 - movdqu %xmm14,48 + 0(%rdi) - - movdqa 0+80(%rbp),%xmm14 - movdqu 0 + 64(%rsi),%xmm3 - movdqu 16 + 64(%rsi),%xmm7 - movdqu 32 + 64(%rsi),%xmm11 - movdqu 48 + 64(%rsi),%xmm15 - pxor %xmm3,%xmm2 - pxor %xmm7,%xmm6 - pxor %xmm11,%xmm10 - pxor %xmm14,%xmm15 - movdqu %xmm2,0 + 64(%rdi) - movdqu %xmm6,16 + 64(%rdi) - movdqu %xmm10,32 + 64(%rdi) - movdqu %xmm15,48 + 64(%rdi) - movdqu 0 + 128(%rsi),%xmm3 - movdqu 16 + 128(%rsi),%xmm7 - movdqu 32 + 128(%rsi),%xmm11 - movdqu 48 + 128(%rsi),%xmm15 - pxor %xmm3,%xmm1 - pxor 
%xmm7,%xmm5 - pxor %xmm11,%xmm9 - pxor %xmm13,%xmm15 - movdqu %xmm1,0 + 128(%rdi) - movdqu %xmm5,16 + 128(%rdi) - movdqu %xmm9,32 + 128(%rdi) - movdqu %xmm15,48 + 128(%rdi) - - cmpq $256,%rbx - ja L$seal_sse_main_loop_xor - - movq $192,%rcx - subq $192,%rbx - leaq 192(%rsi),%rsi - jmp L$seal_sse_128_tail_hash -L$seal_sse_main_loop_xor: - movdqu 0 + 192(%rsi),%xmm3 - movdqu 16 + 192(%rsi),%xmm7 - movdqu 32 + 192(%rsi),%xmm11 - movdqu 48 + 192(%rsi),%xmm15 - pxor %xmm3,%xmm0 - pxor %xmm7,%xmm4 - pxor %xmm11,%xmm8 - pxor %xmm12,%xmm15 - movdqu %xmm0,0 + 192(%rdi) - movdqu %xmm4,16 + 192(%rdi) - movdqu %xmm8,32 + 192(%rdi) - movdqu %xmm15,48 + 192(%rdi) - - leaq 256(%rsi),%rsi - subq $256,%rbx - movq $6,%rcx - movq $4,%r8 - cmpq $192,%rbx - jg L$seal_sse_main_loop - movq %rbx,%rcx - testq %rbx,%rbx - je L$seal_sse_128_tail_hash - movq $6,%rcx - cmpq $128,%rbx - ja L$seal_sse_tail_192 - cmpq $64,%rbx - ja L$seal_sse_tail_128 - -L$seal_sse_tail_64: - movdqa L$chacha20_consts(%rip),%xmm0 - movdqa 0+48(%rbp),%xmm4 - movdqa 0+64(%rbp),%xmm8 - movdqa 0+96(%rbp),%xmm12 - paddd L$sse_inc(%rip),%xmm12 - movdqa %xmm12,0+96(%rbp) - -L$seal_sse_tail_64_rounds_and_x2hash: - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 16(%rdi),%rdi -L$seal_sse_tail_64_rounds_and_x1hash: - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb L$rol16(%rip),%xmm12 - paddd 
%xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm4 - pxor %xmm3,%xmm4 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb L$rol8(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm4 - pxor %xmm3,%xmm4 -.byte 102,15,58,15,228,4 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,12 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb L$rol16(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm4 - pxor %xmm3,%xmm4 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb L$rol8(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm4 - pxor %xmm3,%xmm4 -.byte 102,15,58,15,228,12 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,4 - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 16(%rdi),%rdi - decq %rcx - jg L$seal_sse_tail_64_rounds_and_x2hash - decq %r8 - jge L$seal_sse_tail_64_rounds_and_x1hash - paddd L$chacha20_consts(%rip),%xmm0 - paddd 0+48(%rbp),%xmm4 - paddd 0+64(%rbp),%xmm8 - paddd 0+96(%rbp),%xmm12 - - jmp L$seal_sse_128_tail_xor - -L$seal_sse_tail_128: - movdqa L$chacha20_consts(%rip),%xmm0 - movdqa 0+48(%rbp),%xmm4 - movdqa 0+64(%rbp),%xmm8 - movdqa %xmm0,%xmm1 - movdqa %xmm4,%xmm5 - movdqa %xmm8,%xmm9 - movdqa 0+96(%rbp),%xmm13 - 
paddd L$sse_inc(%rip),%xmm13 - movdqa %xmm13,%xmm12 - paddd L$sse_inc(%rip),%xmm12 - movdqa %xmm12,0+96(%rbp) - movdqa %xmm13,0+112(%rbp) - -L$seal_sse_tail_128_rounds_and_x2hash: - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 16(%rdi),%rdi -L$seal_sse_tail_128_rounds_and_x1hash: - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb L$rol16(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm4 - pxor %xmm3,%xmm4 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb L$rol8(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm4 - pxor %xmm3,%xmm4 -.byte 102,15,58,15,228,4 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,12 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb L$rol16(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm5 - pxor %xmm3,%xmm5 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb L$rol8(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm5 - pxor %xmm3,%xmm5 -.byte 102,15,58,15,237,4 -.byte 102,69,15,58,15,201,8 -.byte 102,69,15,58,15,237,12 - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 
0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb L$rol16(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm4 - pxor %xmm3,%xmm4 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb L$rol8(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm4 - pxor %xmm3,%xmm4 -.byte 102,15,58,15,228,12 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,4 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb L$rol16(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm5 - pxor %xmm3,%xmm5 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb L$rol8(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm5 - pxor %xmm3,%xmm5 -.byte 102,15,58,15,237,12 -.byte 102,69,15,58,15,201,8 -.byte 102,69,15,58,15,237,4 - - leaq 16(%rdi),%rdi - decq %rcx - jg L$seal_sse_tail_128_rounds_and_x2hash - decq %r8 - jge L$seal_sse_tail_128_rounds_and_x1hash - paddd L$chacha20_consts(%rip),%xmm1 - paddd 0+48(%rbp),%xmm5 - paddd 0+64(%rbp),%xmm9 - paddd 0+112(%rbp),%xmm13 - paddd L$chacha20_consts(%rip),%xmm0 - paddd 0+48(%rbp),%xmm4 - paddd 0+64(%rbp),%xmm8 - paddd 0+96(%rbp),%xmm12 - movdqu 0 + 0(%rsi),%xmm3 - movdqu 16 + 0(%rsi),%xmm7 - movdqu 32 + 0(%rsi),%xmm11 - movdqu 48 + 0(%rsi),%xmm15 - pxor %xmm3,%xmm1 - pxor %xmm7,%xmm5 - pxor %xmm11,%xmm9 - pxor %xmm13,%xmm15 - 
movdqu %xmm1,0 + 0(%rdi) - movdqu %xmm5,16 + 0(%rdi) - movdqu %xmm9,32 + 0(%rdi) - movdqu %xmm15,48 + 0(%rdi) - - movq $64,%rcx - subq $64,%rbx - leaq 64(%rsi),%rsi - jmp L$seal_sse_128_tail_hash - -L$seal_sse_tail_192: - movdqa L$chacha20_consts(%rip),%xmm0 - movdqa 0+48(%rbp),%xmm4 - movdqa 0+64(%rbp),%xmm8 - movdqa %xmm0,%xmm1 - movdqa %xmm4,%xmm5 - movdqa %xmm8,%xmm9 - movdqa %xmm0,%xmm2 - movdqa %xmm4,%xmm6 - movdqa %xmm8,%xmm10 - movdqa 0+96(%rbp),%xmm14 - paddd L$sse_inc(%rip),%xmm14 - movdqa %xmm14,%xmm13 - paddd L$sse_inc(%rip),%xmm13 - movdqa %xmm13,%xmm12 - paddd L$sse_inc(%rip),%xmm12 - movdqa %xmm12,0+96(%rbp) - movdqa %xmm13,0+112(%rbp) - movdqa %xmm14,0+128(%rbp) - -L$seal_sse_tail_192_rounds_and_x2hash: - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 16(%rdi),%rdi -L$seal_sse_tail_192_rounds_and_x1hash: - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb L$rol16(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm4 - pxor %xmm3,%xmm4 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb L$rol8(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm4 - pxor %xmm3,%xmm4 -.byte 102,15,58,15,228,4 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,12 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb L$rol16(%rip),%xmm13 
- paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm5 - pxor %xmm3,%xmm5 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb L$rol8(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm5 - pxor %xmm3,%xmm5 -.byte 102,15,58,15,237,4 -.byte 102,69,15,58,15,201,8 -.byte 102,69,15,58,15,237,12 - paddd %xmm6,%xmm2 - pxor %xmm2,%xmm14 - pshufb L$rol16(%rip),%xmm14 - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm6 - pxor %xmm3,%xmm6 - paddd %xmm6,%xmm2 - pxor %xmm2,%xmm14 - pshufb L$rol8(%rip),%xmm14 - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm6 - pxor %xmm3,%xmm6 -.byte 102,15,58,15,246,4 -.byte 102,69,15,58,15,210,8 -.byte 102,69,15,58,15,246,12 - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb L$rol16(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm4 - pxor %xmm3,%xmm4 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb L$rol8(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm4 - pxor %xmm3,%xmm4 -.byte 102,15,58,15,228,12 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,4 - paddd 
%xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb L$rol16(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm5 - pxor %xmm3,%xmm5 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb L$rol8(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm5 - pxor %xmm3,%xmm5 -.byte 102,15,58,15,237,12 -.byte 102,69,15,58,15,201,8 -.byte 102,69,15,58,15,237,4 - paddd %xmm6,%xmm2 - pxor %xmm2,%xmm14 - pshufb L$rol16(%rip),%xmm14 - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm6 - pxor %xmm3,%xmm6 - paddd %xmm6,%xmm2 - pxor %xmm2,%xmm14 - pshufb L$rol8(%rip),%xmm14 - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm6 - pxor %xmm3,%xmm6 -.byte 102,15,58,15,246,12 -.byte 102,69,15,58,15,210,8 -.byte 102,69,15,58,15,246,4 - - leaq 16(%rdi),%rdi - decq %rcx - jg L$seal_sse_tail_192_rounds_and_x2hash - decq %r8 - jge L$seal_sse_tail_192_rounds_and_x1hash - paddd L$chacha20_consts(%rip),%xmm2 - paddd 0+48(%rbp),%xmm6 - paddd 0+64(%rbp),%xmm10 - paddd 0+128(%rbp),%xmm14 - paddd L$chacha20_consts(%rip),%xmm1 - paddd 0+48(%rbp),%xmm5 - paddd 0+64(%rbp),%xmm9 - paddd 0+112(%rbp),%xmm13 - paddd L$chacha20_consts(%rip),%xmm0 - paddd 0+48(%rbp),%xmm4 - paddd 0+64(%rbp),%xmm8 - paddd 0+96(%rbp),%xmm12 - movdqu 0 + 0(%rsi),%xmm3 - movdqu 16 + 0(%rsi),%xmm7 - movdqu 32 + 0(%rsi),%xmm11 - movdqu 48 + 0(%rsi),%xmm15 - pxor %xmm3,%xmm2 - pxor %xmm7,%xmm6 - pxor %xmm11,%xmm10 - pxor %xmm14,%xmm15 - movdqu %xmm2,0 + 0(%rdi) - movdqu %xmm6,16 + 0(%rdi) - movdqu %xmm10,32 + 0(%rdi) - movdqu %xmm15,48 + 0(%rdi) - movdqu 0 + 64(%rsi),%xmm3 - movdqu 16 + 64(%rsi),%xmm7 - movdqu 32 + 64(%rsi),%xmm11 - movdqu 48 + 64(%rsi),%xmm15 - pxor %xmm3,%xmm1 - pxor %xmm7,%xmm5 - pxor %xmm11,%xmm9 - pxor %xmm13,%xmm15 - movdqu %xmm1,0 + 64(%rdi) - movdqu %xmm5,16 + 64(%rdi) - movdqu %xmm9,32 + 64(%rdi) - movdqu %xmm15,48 + 
64(%rdi) - - movq $128,%rcx - subq $128,%rbx - leaq 128(%rsi),%rsi - -L$seal_sse_128_tail_hash: - cmpq $16,%rcx - jb L$seal_sse_128_tail_xor - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - subq $16,%rcx - leaq 16(%rdi),%rdi - jmp L$seal_sse_128_tail_hash - -L$seal_sse_128_tail_xor: - cmpq $16,%rbx - jb L$seal_sse_tail_16 - subq $16,%rbx - - movdqu 0(%rsi),%xmm3 - pxor %xmm3,%xmm0 - movdqu %xmm0,0(%rdi) - - addq 0(%rdi),%r10 - adcq 8(%rdi),%r11 - adcq $1,%r12 - leaq 16(%rsi),%rsi - leaq 16(%rdi),%rdi - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - - movdqa %xmm4,%xmm0 - movdqa %xmm8,%xmm4 - movdqa %xmm12,%xmm8 - movdqa %xmm1,%xmm12 - movdqa %xmm5,%xmm1 - movdqa %xmm9,%xmm5 - movdqa %xmm13,%xmm9 - jmp L$seal_sse_128_tail_xor - -L$seal_sse_tail_16: - testq %rbx,%rbx - jz 
L$process_blocks_of_extra_in - - movq %rbx,%r8 - movq %rbx,%rcx - leaq -1(%rsi,%rbx,1),%rsi - pxor %xmm15,%xmm15 -L$seal_sse_tail_16_compose: - pslldq $1,%xmm15 - pinsrb $0,(%rsi),%xmm15 - leaq -1(%rsi),%rsi - decq %rcx - jne L$seal_sse_tail_16_compose - - - pxor %xmm0,%xmm15 - - - movq %rbx,%rcx - movdqu %xmm15,%xmm0 -L$seal_sse_tail_16_extract: - pextrb $0,%xmm0,(%rdi) - psrldq $1,%xmm0 - addq $1,%rdi - subq $1,%rcx - jnz L$seal_sse_tail_16_extract - - - - - - - - - movq 288 + 0 + 32(%rsp),%r9 - movq 56(%r9),%r14 - movq 48(%r9),%r13 - testq %r14,%r14 - jz L$process_partial_block - - movq $16,%r15 - subq %rbx,%r15 - cmpq %r15,%r14 - - jge L$load_extra_in - movq %r14,%r15 - -L$load_extra_in: - - - leaq -1(%r13,%r15,1),%rsi - - - addq %r15,%r13 - subq %r15,%r14 - movq %r13,48(%r9) - movq %r14,56(%r9) - - - - addq %r15,%r8 - - - pxor %xmm11,%xmm11 -L$load_extra_load_loop: - pslldq $1,%xmm11 - pinsrb $0,(%rsi),%xmm11 - leaq -1(%rsi),%rsi - subq $1,%r15 - jnz L$load_extra_load_loop - - - - - movq %rbx,%r15 - -L$load_extra_shift_loop: - pslldq $1,%xmm11 - subq $1,%r15 - jnz L$load_extra_shift_loop - - - - - leaq L$and_masks(%rip),%r15 - shlq $4,%rbx - pand -16(%r15,%rbx,1),%xmm15 - - - por %xmm11,%xmm15 - - - -.byte 102,77,15,126,253 - pextrq $1,%xmm15,%r14 - addq %r13,%r10 - adcq %r14,%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - -L$process_blocks_of_extra_in: - - movq 288+32+0 
(%rsp),%r9 - movq 48(%r9),%rsi - movq 56(%r9),%r8 - movq %r8,%rcx - shrq $4,%r8 - -L$process_extra_hash_loop: - jz process_extra_in_trailer - addq 0+0(%rsi),%r10 - adcq 8+0(%rsi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 16(%rsi),%rsi - subq $1,%r8 - jmp L$process_extra_hash_loop -process_extra_in_trailer: - andq $15,%rcx - movq %rcx,%rbx - jz L$do_length_block - leaq -1(%rsi,%rcx,1),%rsi - -L$process_extra_in_trailer_load: - pslldq $1,%xmm15 - pinsrb $0,(%rsi),%xmm15 - leaq -1(%rsi),%rsi - subq $1,%rcx - jnz L$process_extra_in_trailer_load - -L$process_partial_block: - - leaq L$and_masks(%rip),%r15 - shlq $4,%rbx - pand -16(%r15,%rbx,1),%xmm15 -.byte 102,77,15,126,253 - pextrq $1,%xmm15,%r14 - addq %r13,%r10 - adcq %r14,%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - 
-L$do_length_block: - addq 0+0+32(%rbp),%r10 - adcq 8+0+32(%rbp),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - - movq %r10,%r13 - movq %r11,%r14 - movq %r12,%r15 - subq $-5,%r10 - sbbq $-1,%r11 - sbbq $3,%r12 - cmovcq %r13,%r10 - cmovcq %r14,%r11 - cmovcq %r15,%r12 - - addq 0+0+16(%rbp),%r10 - adcq 8+0+16(%rbp),%r11 - - - addq $288 + 0 + 32,%rsp - - - popq %r9 - - movq %r10,(%r9) - movq %r11,8(%r9) - popq %r15 - - popq %r14 - - popq %r13 - - popq %r12 - - popq %rbx - - popq %rbp - - .byte 0xf3,0xc3 - -L$seal_sse_128: - - movdqu L$chacha20_consts(%rip),%xmm0 - movdqa %xmm0,%xmm1 - movdqa %xmm0,%xmm2 - movdqu 0(%r9),%xmm4 - movdqa %xmm4,%xmm5 - movdqa %xmm4,%xmm6 - movdqu 16(%r9),%xmm8 - movdqa %xmm8,%xmm9 - movdqa %xmm8,%xmm10 - movdqu 32(%r9),%xmm14 - movdqa %xmm14,%xmm12 - paddd L$sse_inc(%rip),%xmm12 - movdqa %xmm12,%xmm13 - paddd L$sse_inc(%rip),%xmm13 - movdqa %xmm4,%xmm7 - movdqa %xmm8,%xmm11 - movdqa %xmm12,%xmm15 - movq $10,%r10 - -L$seal_sse_128_rounds: - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb L$rol16(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm4 - pxor %xmm3,%xmm4 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb L$rol8(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm4 - pxor %xmm3,%xmm4 -.byte 102,15,58,15,228,4 -.byte 
102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,12 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb L$rol16(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm5 - pxor %xmm3,%xmm5 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb L$rol8(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm5 - pxor %xmm3,%xmm5 -.byte 102,15,58,15,237,4 -.byte 102,69,15,58,15,201,8 -.byte 102,69,15,58,15,237,12 - paddd %xmm6,%xmm2 - pxor %xmm2,%xmm14 - pshufb L$rol16(%rip),%xmm14 - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm6 - pxor %xmm3,%xmm6 - paddd %xmm6,%xmm2 - pxor %xmm2,%xmm14 - pshufb L$rol8(%rip),%xmm14 - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm6 - pxor %xmm3,%xmm6 -.byte 102,15,58,15,246,4 -.byte 102,69,15,58,15,210,8 -.byte 102,69,15,58,15,246,12 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb L$rol16(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm4 - pxor %xmm3,%xmm4 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb L$rol8(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm4 - pxor %xmm3,%xmm4 -.byte 102,15,58,15,228,12 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,4 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb L$rol16(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm5 - pxor %xmm3,%xmm5 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb L$rol8(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm5 - pxor %xmm3,%xmm5 -.byte 102,15,58,15,237,12 -.byte 102,69,15,58,15,201,8 -.byte 102,69,15,58,15,237,4 - paddd %xmm6,%xmm2 - pxor %xmm2,%xmm14 - pshufb L$rol16(%rip),%xmm14 - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa 
%xmm6,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm6 - pxor %xmm3,%xmm6 - paddd %xmm6,%xmm2 - pxor %xmm2,%xmm14 - pshufb L$rol8(%rip),%xmm14 - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm6 - pxor %xmm3,%xmm6 -.byte 102,15,58,15,246,12 -.byte 102,69,15,58,15,210,8 -.byte 102,69,15,58,15,246,4 - - decq %r10 - jnz L$seal_sse_128_rounds - paddd L$chacha20_consts(%rip),%xmm0 - paddd L$chacha20_consts(%rip),%xmm1 - paddd L$chacha20_consts(%rip),%xmm2 - paddd %xmm7,%xmm4 - paddd %xmm7,%xmm5 - paddd %xmm7,%xmm6 - paddd %xmm11,%xmm8 - paddd %xmm11,%xmm9 - paddd %xmm15,%xmm12 - paddd L$sse_inc(%rip),%xmm15 - paddd %xmm15,%xmm13 - - pand L$clamp(%rip),%xmm2 - movdqa %xmm2,0+0(%rbp) - movdqa %xmm6,0+16(%rbp) - - movq %r8,%r8 - call poly_hash_ad_internal - jmp L$seal_sse_128_tail_xor - - - - - -.p2align 6 -chacha20_poly1305_open_avx2: - - - - - - - - - - - - - vzeroupper - vmovdqa L$chacha20_consts(%rip),%ymm0 - vbroadcasti128 0(%r9),%ymm4 - vbroadcasti128 16(%r9),%ymm8 - vbroadcasti128 32(%r9),%ymm12 - vpaddd L$avx2_init(%rip),%ymm12,%ymm12 - cmpq $192,%rbx - jbe L$open_avx2_192 - cmpq $320,%rbx - jbe L$open_avx2_320 - - vmovdqa %ymm4,0+64(%rbp) - vmovdqa %ymm8,0+96(%rbp) - vmovdqa %ymm12,0+160(%rbp) - movq $10,%r10 -L$open_avx2_init_rounds: - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb L$rol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $20,%ymm4,%ymm3 - vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb L$rol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $12,%ymm12,%ymm12,%ymm12 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $4,%ymm4,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb L$rol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld 
$20,%ymm4,%ymm3 - vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb L$rol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $4,%ymm12,%ymm12,%ymm12 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $12,%ymm4,%ymm4,%ymm4 - - decq %r10 - jne L$open_avx2_init_rounds - vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 - vpaddd 0+64(%rbp),%ymm4,%ymm4 - vpaddd 0+96(%rbp),%ymm8,%ymm8 - vpaddd 0+160(%rbp),%ymm12,%ymm12 - - vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 - - vpand L$clamp(%rip),%ymm3,%ymm3 - vmovdqa %ymm3,0+0(%rbp) - - vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 - vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 - - movq %r8,%r8 - call poly_hash_ad_internal - - xorq %rcx,%rcx -L$open_avx2_init_hash: - addq 0+0(%rsi,%rcx,1),%r10 - adcq 8+0(%rsi,%rcx,1),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - addq $16,%rcx - cmpq $64,%rcx - jne L$open_avx2_init_hash - - vpxor 0(%rsi),%ymm0,%ymm0 - vpxor 32(%rsi),%ymm4,%ymm4 - - vmovdqu %ymm0,0(%rdi) - vmovdqu %ymm4,32(%rdi) - leaq 64(%rsi),%rsi - leaq 64(%rdi),%rdi - subq $64,%rbx -L$open_avx2_main_loop: - - cmpq $512,%rbx - jb L$open_avx2_main_loop_done - vmovdqa L$chacha20_consts(%rip),%ymm0 - vmovdqa 0+64(%rbp),%ymm4 - vmovdqa 0+96(%rbp),%ymm8 - vmovdqa %ymm0,%ymm1 - vmovdqa %ymm4,%ymm5 - vmovdqa 
%ymm8,%ymm9 - vmovdqa %ymm0,%ymm2 - vmovdqa %ymm4,%ymm6 - vmovdqa %ymm8,%ymm10 - vmovdqa %ymm0,%ymm3 - vmovdqa %ymm4,%ymm7 - vmovdqa %ymm8,%ymm11 - vmovdqa L$avx2_inc(%rip),%ymm12 - vpaddd 0+160(%rbp),%ymm12,%ymm15 - vpaddd %ymm15,%ymm12,%ymm14 - vpaddd %ymm14,%ymm12,%ymm13 - vpaddd %ymm13,%ymm12,%ymm12 - vmovdqa %ymm15,0+256(%rbp) - vmovdqa %ymm14,0+224(%rbp) - vmovdqa %ymm13,0+192(%rbp) - vmovdqa %ymm12,0+160(%rbp) - - xorq %rcx,%rcx -L$open_avx2_main_loop_rounds: - addq 0+0(%rsi,%rcx,1),%r10 - adcq 8+0(%rsi,%rcx,1),%r11 - adcq $1,%r12 - vmovdqa %ymm8,0+128(%rbp) - vmovdqa L$rol16(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $20,%ymm7,%ymm8 - vpslld $32-20,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $20,%ymm6,%ymm8 - vpslld $32-20,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $20,%ymm5,%ymm8 - vpslld $32-20,%ymm5,%ymm5 - addq %rax,%r15 - adcq %rdx,%r9 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $20,%ymm4,%ymm8 - vpslld $32-20,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa L$rol8(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - movq 
%r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - addq 0+16(%rsi,%rcx,1),%r10 - adcq 8+16(%rsi,%rcx,1),%r11 - adcq $1,%r12 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $25,%ymm7,%ymm8 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - vpslld $32-25,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $25,%ymm6,%ymm8 - vpslld $32-25,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $25,%ymm5,%ymm8 - vpslld $32-25,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $25,%ymm4,%ymm8 - vpslld $32-25,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa 0+128(%rbp),%ymm8 - vpalignr $4,%ymm7,%ymm7,%ymm7 - vpalignr $8,%ymm11,%ymm11,%ymm11 - vpalignr $12,%ymm15,%ymm15,%ymm15 - vpalignr $4,%ymm6,%ymm6,%ymm6 - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $12,%ymm14,%ymm14,%ymm14 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - vpalignr $4,%ymm5,%ymm5,%ymm5 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $12,%ymm13,%ymm13,%ymm13 - vpalignr $4,%ymm4,%ymm4,%ymm4 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $12,%ymm12,%ymm12,%ymm12 - vmovdqa %ymm8,0+128(%rbp) - vmovdqa L$rol16(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor 
%ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - addq %rax,%r15 - adcq %rdx,%r9 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $20,%ymm7,%ymm8 - vpslld $32-20,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $20,%ymm6,%ymm8 - vpslld $32-20,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - addq 0+32(%rsi,%rcx,1),%r10 - adcq 8+32(%rsi,%rcx,1),%r11 - adcq $1,%r12 - - leaq 48(%rcx),%rcx - vpsrld $20,%ymm5,%ymm8 - vpslld $32-20,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $20,%ymm4,%ymm8 - vpslld $32-20,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa L$rol8(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - vpxor %ymm8,%ymm4,%ymm4 
- vmovdqa %ymm8,0+128(%rbp) - vpsrld $25,%ymm7,%ymm8 - vpslld $32-25,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $25,%ymm6,%ymm8 - vpslld $32-25,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - addq %rax,%r15 - adcq %rdx,%r9 - vpsrld $25,%ymm5,%ymm8 - vpslld $32-25,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $25,%ymm4,%ymm8 - vpslld $32-25,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa 0+128(%rbp),%ymm8 - vpalignr $12,%ymm7,%ymm7,%ymm7 - vpalignr $8,%ymm11,%ymm11,%ymm11 - vpalignr $4,%ymm15,%ymm15,%ymm15 - vpalignr $12,%ymm6,%ymm6,%ymm6 - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $4,%ymm14,%ymm14,%ymm14 - vpalignr $12,%ymm5,%ymm5,%ymm5 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $4,%ymm13,%ymm13,%ymm13 - vpalignr $12,%ymm4,%ymm4,%ymm4 - vpalignr $8,%ymm8,%ymm8,%ymm8 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - vpalignr $4,%ymm12,%ymm12,%ymm12 - - cmpq $60*8,%rcx - jne L$open_avx2_main_loop_rounds - vpaddd L$chacha20_consts(%rip),%ymm3,%ymm3 - vpaddd 0+64(%rbp),%ymm7,%ymm7 - vpaddd 0+96(%rbp),%ymm11,%ymm11 - vpaddd 0+256(%rbp),%ymm15,%ymm15 - vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 - vpaddd 0+64(%rbp),%ymm6,%ymm6 - vpaddd 0+96(%rbp),%ymm10,%ymm10 - vpaddd 0+224(%rbp),%ymm14,%ymm14 - vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 - vpaddd 0+64(%rbp),%ymm5,%ymm5 - vpaddd 0+96(%rbp),%ymm9,%ymm9 - vpaddd 0+192(%rbp),%ymm13,%ymm13 - vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 - vpaddd 0+64(%rbp),%ymm4,%ymm4 - vpaddd 0+96(%rbp),%ymm8,%ymm8 - vpaddd 0+160(%rbp),%ymm12,%ymm12 - - vmovdqa %ymm0,0+128(%rbp) - addq 0+60*8(%rsi),%r10 - adcq 8+60*8(%rsi),%r11 - adcq $1,%r12 - vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 - vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 - vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 - vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 - vpxor 0+0(%rsi),%ymm0,%ymm0 - vpxor 32+0(%rsi),%ymm3,%ymm3 - vpxor 
64+0(%rsi),%ymm7,%ymm7 - vpxor 96+0(%rsi),%ymm11,%ymm11 - vmovdqu %ymm0,0+0(%rdi) - vmovdqu %ymm3,32+0(%rdi) - vmovdqu %ymm7,64+0(%rdi) - vmovdqu %ymm11,96+0(%rdi) - - vmovdqa 0+128(%rbp),%ymm0 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 - vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 - vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 - vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 - vpxor 0+128(%rsi),%ymm3,%ymm3 - vpxor 32+128(%rsi),%ymm2,%ymm2 - vpxor 64+128(%rsi),%ymm6,%ymm6 - vpxor 96+128(%rsi),%ymm10,%ymm10 - vmovdqu %ymm3,0+128(%rdi) - vmovdqu %ymm2,32+128(%rdi) - vmovdqu %ymm6,64+128(%rdi) - vmovdqu %ymm10,96+128(%rdi) - addq 0+60*8+16(%rsi),%r10 - adcq 8+60*8+16(%rsi),%r11 - adcq $1,%r12 - vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 - vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 - vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 - vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 - vpxor 0+256(%rsi),%ymm3,%ymm3 - vpxor 32+256(%rsi),%ymm1,%ymm1 - vpxor 64+256(%rsi),%ymm5,%ymm5 - vpxor 96+256(%rsi),%ymm9,%ymm9 - vmovdqu %ymm3,0+256(%rdi) - vmovdqu %ymm1,32+256(%rdi) - vmovdqu %ymm5,64+256(%rdi) - vmovdqu %ymm9,96+256(%rdi) - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 
- addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 - vperm2i128 $0x13,%ymm0,%ymm4,%ymm4 - vperm2i128 $0x02,%ymm8,%ymm12,%ymm0 - vperm2i128 $0x13,%ymm8,%ymm12,%ymm8 - vpxor 0+384(%rsi),%ymm3,%ymm3 - vpxor 32+384(%rsi),%ymm0,%ymm0 - vpxor 64+384(%rsi),%ymm4,%ymm4 - vpxor 96+384(%rsi),%ymm8,%ymm8 - vmovdqu %ymm3,0+384(%rdi) - vmovdqu %ymm0,32+384(%rdi) - vmovdqu %ymm4,64+384(%rdi) - vmovdqu %ymm8,96+384(%rdi) - - leaq 512(%rsi),%rsi - leaq 512(%rdi),%rdi - subq $512,%rbx - jmp L$open_avx2_main_loop -L$open_avx2_main_loop_done: - testq %rbx,%rbx - vzeroupper - je L$open_sse_finalize - - cmpq $384,%rbx - ja L$open_avx2_tail_512 - cmpq $256,%rbx - ja L$open_avx2_tail_384 - cmpq $128,%rbx - ja L$open_avx2_tail_256 - vmovdqa L$chacha20_consts(%rip),%ymm0 - vmovdqa 0+64(%rbp),%ymm4 - vmovdqa 0+96(%rbp),%ymm8 - vmovdqa L$avx2_inc(%rip),%ymm12 - vpaddd 0+160(%rbp),%ymm12,%ymm12 - vmovdqa %ymm12,0+160(%rbp) - - xorq %r8,%r8 - movq %rbx,%rcx - andq $-16,%rcx - testq %rcx,%rcx - je L$open_avx2_tail_128_rounds -L$open_avx2_tail_128_rounds_and_x1hash: - addq 0+0(%rsi,%r8,1),%r10 - adcq 8+0(%rsi,%r8,1),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - 
addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - -L$open_avx2_tail_128_rounds: - addq $16,%r8 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb L$rol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $20,%ymm4,%ymm3 - vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb L$rol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $12,%ymm12,%ymm12,%ymm12 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $4,%ymm4,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb L$rol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $20,%ymm4,%ymm3 - vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb L$rol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $4,%ymm12,%ymm12,%ymm12 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $12,%ymm4,%ymm4,%ymm4 - - cmpq %rcx,%r8 - jb L$open_avx2_tail_128_rounds_and_x1hash - cmpq $160,%r8 - jne L$open_avx2_tail_128_rounds - vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 - vpaddd 0+64(%rbp),%ymm4,%ymm4 - vpaddd 0+96(%rbp),%ymm8,%ymm8 - vpaddd 0+160(%rbp),%ymm12,%ymm12 - vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 - vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 - vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 - vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 - vmovdqa %ymm3,%ymm8 - - jmp L$open_avx2_tail_128_xor - -L$open_avx2_tail_256: - vmovdqa L$chacha20_consts(%rip),%ymm0 - vmovdqa 0+64(%rbp),%ymm4 - vmovdqa 0+96(%rbp),%ymm8 - vmovdqa %ymm0,%ymm1 - vmovdqa %ymm4,%ymm5 - vmovdqa %ymm8,%ymm9 - vmovdqa L$avx2_inc(%rip),%ymm12 - vpaddd 0+160(%rbp),%ymm12,%ymm13 - vpaddd %ymm13,%ymm12,%ymm12 - vmovdqa %ymm12,0+160(%rbp) - vmovdqa %ymm13,0+192(%rbp) - - movq 
%rbx,0+128(%rbp) - movq %rbx,%rcx - subq $128,%rcx - shrq $4,%rcx - movq $10,%r8 - cmpq $10,%rcx - cmovgq %r8,%rcx - movq %rsi,%rbx - xorq %r8,%r8 -L$open_avx2_tail_256_rounds_and_x1hash: - addq 0+0(%rbx),%r10 - adcq 8+0(%rbx),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - addq %rax,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 16(%rbx),%rbx -L$open_avx2_tail_256_rounds: - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb L$rol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $20,%ymm4,%ymm3 - vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb L$rol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $12,%ymm12,%ymm12,%ymm12 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $4,%ymm4,%ymm4,%ymm4 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb L$rol16(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpsrld $20,%ymm5,%ymm3 - vpslld $12,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb L$rol8(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm3 - vpsrld $25,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpalignr $12,%ymm13,%ymm13,%ymm13 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $4,%ymm5,%ymm5,%ymm5 - - incq %r8 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - 
vpshufb L$rol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $20,%ymm4,%ymm3 - vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb L$rol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $4,%ymm12,%ymm12,%ymm12 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $12,%ymm4,%ymm4,%ymm4 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb L$rol16(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpsrld $20,%ymm5,%ymm3 - vpslld $12,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb L$rol8(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm3 - vpsrld $25,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpalignr $4,%ymm13,%ymm13,%ymm13 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $12,%ymm5,%ymm5,%ymm5 - vpaddd %ymm6,%ymm2,%ymm2 - vpxor %ymm2,%ymm14,%ymm14 - vpshufb L$rol16(%rip),%ymm14,%ymm14 - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpsrld $20,%ymm6,%ymm3 - vpslld $12,%ymm6,%ymm6 - vpxor %ymm3,%ymm6,%ymm6 - vpaddd %ymm6,%ymm2,%ymm2 - vpxor %ymm2,%ymm14,%ymm14 - vpshufb L$rol8(%rip),%ymm14,%ymm14 - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpslld $7,%ymm6,%ymm3 - vpsrld $25,%ymm6,%ymm6 - vpxor %ymm3,%ymm6,%ymm6 - vpalignr $4,%ymm14,%ymm14,%ymm14 - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $12,%ymm6,%ymm6,%ymm6 - - cmpq %rcx,%r8 - jb L$open_avx2_tail_256_rounds_and_x1hash - cmpq $10,%r8 - jne L$open_avx2_tail_256_rounds - movq %rbx,%r8 - subq %rsi,%rbx - movq %rbx,%rcx - movq 0+128(%rbp),%rbx -L$open_avx2_tail_256_hash: - addq $16,%rcx - cmpq %rbx,%rcx - jg L$open_avx2_tail_256_done - addq 0+0(%r8),%r10 - adcq 8+0(%r8),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq 
%r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - addq %rax,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 16(%r8),%r8 - jmp L$open_avx2_tail_256_hash -L$open_avx2_tail_256_done: - vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 - vpaddd 0+64(%rbp),%ymm5,%ymm5 - vpaddd 0+96(%rbp),%ymm9,%ymm9 - vpaddd 0+192(%rbp),%ymm13,%ymm13 - vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 - vpaddd 0+64(%rbp),%ymm4,%ymm4 - vpaddd 0+96(%rbp),%ymm8,%ymm8 - vpaddd 0+160(%rbp),%ymm12,%ymm12 - vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 - vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 - vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 - vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 - vpxor 0+0(%rsi),%ymm3,%ymm3 - vpxor 32+0(%rsi),%ymm1,%ymm1 - vpxor 64+0(%rsi),%ymm5,%ymm5 - vpxor 96+0(%rsi),%ymm9,%ymm9 - vmovdqu %ymm3,0+0(%rdi) - vmovdqu %ymm1,32+0(%rdi) - vmovdqu %ymm5,64+0(%rdi) - vmovdqu %ymm9,96+0(%rdi) - vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 - vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 - vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 - vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 - vmovdqa %ymm3,%ymm8 - - leaq 128(%rsi),%rsi - leaq 128(%rdi),%rdi - subq $128,%rbx - jmp L$open_avx2_tail_128_xor - -L$open_avx2_tail_384: - vmovdqa L$chacha20_consts(%rip),%ymm0 - vmovdqa 0+64(%rbp),%ymm4 - vmovdqa 0+96(%rbp),%ymm8 - vmovdqa %ymm0,%ymm1 - vmovdqa %ymm4,%ymm5 - vmovdqa %ymm8,%ymm9 - vmovdqa %ymm0,%ymm2 - vmovdqa %ymm4,%ymm6 - vmovdqa %ymm8,%ymm10 - vmovdqa L$avx2_inc(%rip),%ymm12 - vpaddd 0+160(%rbp),%ymm12,%ymm14 - vpaddd %ymm14,%ymm12,%ymm13 - vpaddd %ymm13,%ymm12,%ymm12 - vmovdqa %ymm12,0+160(%rbp) - vmovdqa %ymm13,0+192(%rbp) - vmovdqa %ymm14,0+224(%rbp) - - movq %rbx,0+128(%rbp) - movq %rbx,%rcx - subq 
$256,%rcx - shrq $4,%rcx - addq $6,%rcx - movq $10,%r8 - cmpq $10,%rcx - cmovgq %r8,%rcx - movq %rsi,%rbx - xorq %r8,%r8 -L$open_avx2_tail_384_rounds_and_x2hash: - addq 0+0(%rbx),%r10 - adcq 8+0(%rbx),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - addq %rax,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 16(%rbx),%rbx -L$open_avx2_tail_384_rounds_and_x1hash: - vpaddd %ymm6,%ymm2,%ymm2 - vpxor %ymm2,%ymm14,%ymm14 - vpshufb L$rol16(%rip),%ymm14,%ymm14 - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpsrld $20,%ymm6,%ymm3 - vpslld $12,%ymm6,%ymm6 - vpxor %ymm3,%ymm6,%ymm6 - vpaddd %ymm6,%ymm2,%ymm2 - vpxor %ymm2,%ymm14,%ymm14 - vpshufb L$rol8(%rip),%ymm14,%ymm14 - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpslld $7,%ymm6,%ymm3 - vpsrld $25,%ymm6,%ymm6 - vpxor %ymm3,%ymm6,%ymm6 - vpalignr $12,%ymm14,%ymm14,%ymm14 - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $4,%ymm6,%ymm6,%ymm6 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb L$rol16(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpsrld $20,%ymm5,%ymm3 - vpslld $12,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb L$rol8(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm3 - vpsrld $25,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpalignr $12,%ymm13,%ymm13,%ymm13 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $4,%ymm5,%ymm5,%ymm5 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb 
L$rol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $20,%ymm4,%ymm3 - vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb L$rol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $12,%ymm12,%ymm12,%ymm12 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $4,%ymm4,%ymm4,%ymm4 - addq 0+0(%rbx),%r10 - adcq 8+0(%rbx),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 16(%rbx),%rbx - incq %r8 - vpaddd %ymm6,%ymm2,%ymm2 - vpxor %ymm2,%ymm14,%ymm14 - vpshufb L$rol16(%rip),%ymm14,%ymm14 - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpsrld $20,%ymm6,%ymm3 - vpslld $12,%ymm6,%ymm6 - vpxor %ymm3,%ymm6,%ymm6 - vpaddd %ymm6,%ymm2,%ymm2 - vpxor %ymm2,%ymm14,%ymm14 - vpshufb L$rol8(%rip),%ymm14,%ymm14 - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpslld $7,%ymm6,%ymm3 - vpsrld $25,%ymm6,%ymm6 - vpxor %ymm3,%ymm6,%ymm6 - vpalignr $4,%ymm14,%ymm14,%ymm14 - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $12,%ymm6,%ymm6,%ymm6 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb L$rol16(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpsrld $20,%ymm5,%ymm3 - vpslld $12,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpaddd %ymm5,%ymm1,%ymm1 
- vpxor %ymm1,%ymm13,%ymm13 - vpshufb L$rol8(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm3 - vpsrld $25,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpalignr $4,%ymm13,%ymm13,%ymm13 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $12,%ymm5,%ymm5,%ymm5 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb L$rol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $20,%ymm4,%ymm3 - vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb L$rol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $4,%ymm12,%ymm12,%ymm12 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $12,%ymm4,%ymm4,%ymm4 - - cmpq %rcx,%r8 - jb L$open_avx2_tail_384_rounds_and_x2hash - cmpq $10,%r8 - jne L$open_avx2_tail_384_rounds_and_x1hash - movq %rbx,%r8 - subq %rsi,%rbx - movq %rbx,%rcx - movq 0+128(%rbp),%rbx -L$open_avx2_384_tail_hash: - addq $16,%rcx - cmpq %rbx,%rcx - jg L$open_avx2_384_tail_done - addq 0+0(%r8),%r10 - adcq 8+0(%r8),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - addq %rax,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 16(%r8),%r8 - jmp L$open_avx2_384_tail_hash -L$open_avx2_384_tail_done: - vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 - vpaddd 0+64(%rbp),%ymm6,%ymm6 - vpaddd 0+96(%rbp),%ymm10,%ymm10 - vpaddd 0+224(%rbp),%ymm14,%ymm14 - vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 - 
vpaddd 0+64(%rbp),%ymm5,%ymm5 - vpaddd 0+96(%rbp),%ymm9,%ymm9 - vpaddd 0+192(%rbp),%ymm13,%ymm13 - vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 - vpaddd 0+64(%rbp),%ymm4,%ymm4 - vpaddd 0+96(%rbp),%ymm8,%ymm8 - vpaddd 0+160(%rbp),%ymm12,%ymm12 - vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 - vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 - vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 - vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 - vpxor 0+0(%rsi),%ymm3,%ymm3 - vpxor 32+0(%rsi),%ymm2,%ymm2 - vpxor 64+0(%rsi),%ymm6,%ymm6 - vpxor 96+0(%rsi),%ymm10,%ymm10 - vmovdqu %ymm3,0+0(%rdi) - vmovdqu %ymm2,32+0(%rdi) - vmovdqu %ymm6,64+0(%rdi) - vmovdqu %ymm10,96+0(%rdi) - vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 - vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 - vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 - vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 - vpxor 0+128(%rsi),%ymm3,%ymm3 - vpxor 32+128(%rsi),%ymm1,%ymm1 - vpxor 64+128(%rsi),%ymm5,%ymm5 - vpxor 96+128(%rsi),%ymm9,%ymm9 - vmovdqu %ymm3,0+128(%rdi) - vmovdqu %ymm1,32+128(%rdi) - vmovdqu %ymm5,64+128(%rdi) - vmovdqu %ymm9,96+128(%rdi) - vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 - vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 - vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 - vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 - vmovdqa %ymm3,%ymm8 - - leaq 256(%rsi),%rsi - leaq 256(%rdi),%rdi - subq $256,%rbx - jmp L$open_avx2_tail_128_xor - -L$open_avx2_tail_512: - vmovdqa L$chacha20_consts(%rip),%ymm0 - vmovdqa 0+64(%rbp),%ymm4 - vmovdqa 0+96(%rbp),%ymm8 - vmovdqa %ymm0,%ymm1 - vmovdqa %ymm4,%ymm5 - vmovdqa %ymm8,%ymm9 - vmovdqa %ymm0,%ymm2 - vmovdqa %ymm4,%ymm6 - vmovdqa %ymm8,%ymm10 - vmovdqa %ymm0,%ymm3 - vmovdqa %ymm4,%ymm7 - vmovdqa %ymm8,%ymm11 - vmovdqa L$avx2_inc(%rip),%ymm12 - vpaddd 0+160(%rbp),%ymm12,%ymm15 - vpaddd %ymm15,%ymm12,%ymm14 - vpaddd %ymm14,%ymm12,%ymm13 - vpaddd %ymm13,%ymm12,%ymm12 - vmovdqa %ymm15,0+256(%rbp) - vmovdqa %ymm14,0+224(%rbp) - vmovdqa %ymm13,0+192(%rbp) - vmovdqa %ymm12,0+160(%rbp) - - xorq %rcx,%rcx - movq %rsi,%r8 -L$open_avx2_tail_512_rounds_and_x2hash: - addq 0+0(%r8),%r10 - adcq 
8+0(%r8),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 16(%r8),%r8 -L$open_avx2_tail_512_rounds_and_x1hash: - vmovdqa %ymm8,0+128(%rbp) - vmovdqa L$rol16(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $20,%ymm7,%ymm8 - vpslld $32-20,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $20,%ymm6,%ymm8 - vpslld $32-20,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $20,%ymm5,%ymm8 - vpslld $32-20,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $20,%ymm4,%ymm8 - vpslld $32-20,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa L$rol8(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - addq 0+0(%r8),%r10 - adcq 8+0(%r8),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq 
%r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - addq %rax,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $25,%ymm7,%ymm8 - vpslld $32-25,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $25,%ymm6,%ymm8 - vpslld $32-25,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $25,%ymm5,%ymm8 - vpslld $32-25,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $25,%ymm4,%ymm8 - vpslld $32-25,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa 0+128(%rbp),%ymm8 - vpalignr $4,%ymm7,%ymm7,%ymm7 - vpalignr $8,%ymm11,%ymm11,%ymm11 - vpalignr $12,%ymm15,%ymm15,%ymm15 - vpalignr $4,%ymm6,%ymm6,%ymm6 - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $12,%ymm14,%ymm14,%ymm14 - vpalignr $4,%ymm5,%ymm5,%ymm5 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $12,%ymm13,%ymm13,%ymm13 - vpalignr $4,%ymm4,%ymm4,%ymm4 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $12,%ymm12,%ymm12,%ymm12 - vmovdqa %ymm8,0+128(%rbp) - vmovdqa L$rol16(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - addq 0+16(%r8),%r10 - adcq 8+16(%r8),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq 
%r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - addq %rax,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 32(%r8),%r8 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $20,%ymm7,%ymm8 - vpslld $32-20,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $20,%ymm6,%ymm8 - vpslld $32-20,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $20,%ymm5,%ymm8 - vpslld $32-20,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $20,%ymm4,%ymm8 - vpslld $32-20,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa L$rol8(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $25,%ymm7,%ymm8 - vpslld $32-25,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $25,%ymm6,%ymm8 - vpslld 
$32-25,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $25,%ymm5,%ymm8 - vpslld $32-25,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $25,%ymm4,%ymm8 - vpslld $32-25,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa 0+128(%rbp),%ymm8 - vpalignr $12,%ymm7,%ymm7,%ymm7 - vpalignr $8,%ymm11,%ymm11,%ymm11 - vpalignr $4,%ymm15,%ymm15,%ymm15 - vpalignr $12,%ymm6,%ymm6,%ymm6 - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $4,%ymm14,%ymm14,%ymm14 - vpalignr $12,%ymm5,%ymm5,%ymm5 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $4,%ymm13,%ymm13,%ymm13 - vpalignr $12,%ymm4,%ymm4,%ymm4 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $4,%ymm12,%ymm12,%ymm12 - - incq %rcx - cmpq $4,%rcx - jl L$open_avx2_tail_512_rounds_and_x2hash - cmpq $10,%rcx - jne L$open_avx2_tail_512_rounds_and_x1hash - movq %rbx,%rcx - subq $384,%rcx - andq $-16,%rcx -L$open_avx2_tail_512_hash: - testq %rcx,%rcx - je L$open_avx2_tail_512_done - addq 0+0(%r8),%r10 - adcq 8+0(%r8),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - addq %rax,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 16(%r8),%r8 - subq $16,%rcx - jmp L$open_avx2_tail_512_hash -L$open_avx2_tail_512_done: - vpaddd L$chacha20_consts(%rip),%ymm3,%ymm3 - vpaddd 0+64(%rbp),%ymm7,%ymm7 - vpaddd 0+96(%rbp),%ymm11,%ymm11 - vpaddd 0+256(%rbp),%ymm15,%ymm15 - vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 - vpaddd 0+64(%rbp),%ymm6,%ymm6 - vpaddd 0+96(%rbp),%ymm10,%ymm10 - vpaddd 0+224(%rbp),%ymm14,%ymm14 - vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 - vpaddd 0+64(%rbp),%ymm5,%ymm5 - vpaddd 0+96(%rbp),%ymm9,%ymm9 - vpaddd 
0+192(%rbp),%ymm13,%ymm13 - vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 - vpaddd 0+64(%rbp),%ymm4,%ymm4 - vpaddd 0+96(%rbp),%ymm8,%ymm8 - vpaddd 0+160(%rbp),%ymm12,%ymm12 - - vmovdqa %ymm0,0+128(%rbp) - vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 - vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 - vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 - vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 - vpxor 0+0(%rsi),%ymm0,%ymm0 - vpxor 32+0(%rsi),%ymm3,%ymm3 - vpxor 64+0(%rsi),%ymm7,%ymm7 - vpxor 96+0(%rsi),%ymm11,%ymm11 - vmovdqu %ymm0,0+0(%rdi) - vmovdqu %ymm3,32+0(%rdi) - vmovdqu %ymm7,64+0(%rdi) - vmovdqu %ymm11,96+0(%rdi) - - vmovdqa 0+128(%rbp),%ymm0 - vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 - vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 - vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 - vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 - vpxor 0+128(%rsi),%ymm3,%ymm3 - vpxor 32+128(%rsi),%ymm2,%ymm2 - vpxor 64+128(%rsi),%ymm6,%ymm6 - vpxor 96+128(%rsi),%ymm10,%ymm10 - vmovdqu %ymm3,0+128(%rdi) - vmovdqu %ymm2,32+128(%rdi) - vmovdqu %ymm6,64+128(%rdi) - vmovdqu %ymm10,96+128(%rdi) - vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 - vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 - vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 - vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 - vpxor 0+256(%rsi),%ymm3,%ymm3 - vpxor 32+256(%rsi),%ymm1,%ymm1 - vpxor 64+256(%rsi),%ymm5,%ymm5 - vpxor 96+256(%rsi),%ymm9,%ymm9 - vmovdqu %ymm3,0+256(%rdi) - vmovdqu %ymm1,32+256(%rdi) - vmovdqu %ymm5,64+256(%rdi) - vmovdqu %ymm9,96+256(%rdi) - vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 - vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 - vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 - vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 - vmovdqa %ymm3,%ymm8 - - leaq 384(%rsi),%rsi - leaq 384(%rdi),%rdi - subq $384,%rbx -L$open_avx2_tail_128_xor: - cmpq $32,%rbx - jb L$open_avx2_tail_32_xor - subq $32,%rbx - vpxor (%rsi),%ymm0,%ymm0 - vmovdqu %ymm0,(%rdi) - leaq 32(%rsi),%rsi - leaq 32(%rdi),%rdi - vmovdqa %ymm4,%ymm0 - vmovdqa %ymm8,%ymm4 - vmovdqa %ymm12,%ymm8 - jmp L$open_avx2_tail_128_xor -L$open_avx2_tail_32_xor: - cmpq $16,%rbx - vmovdqa %xmm0,%xmm1 - jb 
L$open_avx2_exit - subq $16,%rbx - - vpxor (%rsi),%xmm0,%xmm1 - vmovdqu %xmm1,(%rdi) - leaq 16(%rsi),%rsi - leaq 16(%rdi),%rdi - vperm2i128 $0x11,%ymm0,%ymm0,%ymm0 - vmovdqa %xmm0,%xmm1 -L$open_avx2_exit: - vzeroupper - jmp L$open_sse_tail_16 - -L$open_avx2_192: - vmovdqa %ymm0,%ymm1 - vmovdqa %ymm0,%ymm2 - vmovdqa %ymm4,%ymm5 - vmovdqa %ymm4,%ymm6 - vmovdqa %ymm8,%ymm9 - vmovdqa %ymm8,%ymm10 - vpaddd L$avx2_inc(%rip),%ymm12,%ymm13 - vmovdqa %ymm12,%ymm11 - vmovdqa %ymm13,%ymm15 - movq $10,%r10 -L$open_avx2_192_rounds: - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb L$rol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $20,%ymm4,%ymm3 - vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb L$rol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $12,%ymm12,%ymm12,%ymm12 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $4,%ymm4,%ymm4,%ymm4 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb L$rol16(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpsrld $20,%ymm5,%ymm3 - vpslld $12,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb L$rol8(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm3 - vpsrld $25,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpalignr $12,%ymm13,%ymm13,%ymm13 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $4,%ymm5,%ymm5,%ymm5 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb L$rol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $20,%ymm4,%ymm3 - vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb L$rol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld 
$7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $4,%ymm12,%ymm12,%ymm12 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $12,%ymm4,%ymm4,%ymm4 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb L$rol16(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpsrld $20,%ymm5,%ymm3 - vpslld $12,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb L$rol8(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm3 - vpsrld $25,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpalignr $4,%ymm13,%ymm13,%ymm13 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $12,%ymm5,%ymm5,%ymm5 - - decq %r10 - jne L$open_avx2_192_rounds - vpaddd %ymm2,%ymm0,%ymm0 - vpaddd %ymm2,%ymm1,%ymm1 - vpaddd %ymm6,%ymm4,%ymm4 - vpaddd %ymm6,%ymm5,%ymm5 - vpaddd %ymm10,%ymm8,%ymm8 - vpaddd %ymm10,%ymm9,%ymm9 - vpaddd %ymm11,%ymm12,%ymm12 - vpaddd %ymm15,%ymm13,%ymm13 - vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 - - vpand L$clamp(%rip),%ymm3,%ymm3 - vmovdqa %ymm3,0+0(%rbp) - - vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 - vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 - vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 - vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 - vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 - vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 -L$open_avx2_short: - movq %r8,%r8 - call poly_hash_ad_internal -L$open_avx2_short_hash_and_xor_loop: - cmpq $32,%rbx - jb L$open_avx2_short_tail_32 - subq $32,%rbx - addq 0+0(%rsi),%r10 - adcq 8+0(%rsi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq 
%r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - addq 0+16(%rsi),%r10 - adcq 8+16(%rsi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - - vpxor (%rsi),%ymm0,%ymm0 - vmovdqu %ymm0,(%rdi) - leaq 32(%rsi),%rsi - leaq 32(%rdi),%rdi - - vmovdqa %ymm4,%ymm0 - vmovdqa %ymm8,%ymm4 - vmovdqa %ymm12,%ymm8 - vmovdqa %ymm1,%ymm12 - vmovdqa %ymm5,%ymm1 - vmovdqa %ymm9,%ymm5 - vmovdqa %ymm13,%ymm9 - vmovdqa %ymm2,%ymm13 - vmovdqa %ymm6,%ymm2 - jmp L$open_avx2_short_hash_and_xor_loop -L$open_avx2_short_tail_32: - cmpq $16,%rbx - vmovdqa %xmm0,%xmm1 - jb L$open_avx2_short_tail_32_exit - subq $16,%rbx - addq 0+0(%rsi),%r10 - adcq 8+0(%rsi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - vpxor 
(%rsi),%xmm0,%xmm3 - vmovdqu %xmm3,(%rdi) - leaq 16(%rsi),%rsi - leaq 16(%rdi),%rdi - vextracti128 $1,%ymm0,%xmm1 -L$open_avx2_short_tail_32_exit: - vzeroupper - jmp L$open_sse_tail_16 - -L$open_avx2_320: - vmovdqa %ymm0,%ymm1 - vmovdqa %ymm0,%ymm2 - vmovdqa %ymm4,%ymm5 - vmovdqa %ymm4,%ymm6 - vmovdqa %ymm8,%ymm9 - vmovdqa %ymm8,%ymm10 - vpaddd L$avx2_inc(%rip),%ymm12,%ymm13 - vpaddd L$avx2_inc(%rip),%ymm13,%ymm14 - vmovdqa %ymm4,%ymm7 - vmovdqa %ymm8,%ymm11 - vmovdqa %ymm12,0+160(%rbp) - vmovdqa %ymm13,0+192(%rbp) - vmovdqa %ymm14,0+224(%rbp) - movq $10,%r10 -L$open_avx2_320_rounds: - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb L$rol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $20,%ymm4,%ymm3 - vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb L$rol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $12,%ymm12,%ymm12,%ymm12 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $4,%ymm4,%ymm4,%ymm4 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb L$rol16(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpsrld $20,%ymm5,%ymm3 - vpslld $12,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb L$rol8(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm3 - vpsrld $25,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpalignr $12,%ymm13,%ymm13,%ymm13 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $4,%ymm5,%ymm5,%ymm5 - vpaddd %ymm6,%ymm2,%ymm2 - vpxor %ymm2,%ymm14,%ymm14 - vpshufb L$rol16(%rip),%ymm14,%ymm14 - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpsrld $20,%ymm6,%ymm3 - vpslld $12,%ymm6,%ymm6 - vpxor %ymm3,%ymm6,%ymm6 - vpaddd %ymm6,%ymm2,%ymm2 - vpxor %ymm2,%ymm14,%ymm14 - vpshufb L$rol8(%rip),%ymm14,%ymm14 - 
vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpslld $7,%ymm6,%ymm3 - vpsrld $25,%ymm6,%ymm6 - vpxor %ymm3,%ymm6,%ymm6 - vpalignr $12,%ymm14,%ymm14,%ymm14 - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $4,%ymm6,%ymm6,%ymm6 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb L$rol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $20,%ymm4,%ymm3 - vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb L$rol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $4,%ymm12,%ymm12,%ymm12 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $12,%ymm4,%ymm4,%ymm4 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb L$rol16(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpsrld $20,%ymm5,%ymm3 - vpslld $12,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb L$rol8(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm3 - vpsrld $25,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpalignr $4,%ymm13,%ymm13,%ymm13 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $12,%ymm5,%ymm5,%ymm5 - vpaddd %ymm6,%ymm2,%ymm2 - vpxor %ymm2,%ymm14,%ymm14 - vpshufb L$rol16(%rip),%ymm14,%ymm14 - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpsrld $20,%ymm6,%ymm3 - vpslld $12,%ymm6,%ymm6 - vpxor %ymm3,%ymm6,%ymm6 - vpaddd %ymm6,%ymm2,%ymm2 - vpxor %ymm2,%ymm14,%ymm14 - vpshufb L$rol8(%rip),%ymm14,%ymm14 - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpslld $7,%ymm6,%ymm3 - vpsrld $25,%ymm6,%ymm6 - vpxor %ymm3,%ymm6,%ymm6 - vpalignr $4,%ymm14,%ymm14,%ymm14 - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $12,%ymm6,%ymm6,%ymm6 - - decq %r10 - jne L$open_avx2_320_rounds - vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 - vpaddd 
L$chacha20_consts(%rip),%ymm1,%ymm1 - vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 - vpaddd %ymm7,%ymm4,%ymm4 - vpaddd %ymm7,%ymm5,%ymm5 - vpaddd %ymm7,%ymm6,%ymm6 - vpaddd %ymm11,%ymm8,%ymm8 - vpaddd %ymm11,%ymm9,%ymm9 - vpaddd %ymm11,%ymm10,%ymm10 - vpaddd 0+160(%rbp),%ymm12,%ymm12 - vpaddd 0+192(%rbp),%ymm13,%ymm13 - vpaddd 0+224(%rbp),%ymm14,%ymm14 - vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 - - vpand L$clamp(%rip),%ymm3,%ymm3 - vmovdqa %ymm3,0+0(%rbp) - - vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 - vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 - vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 - vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 - vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 - vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 - vperm2i128 $0x02,%ymm2,%ymm6,%ymm9 - vperm2i128 $0x02,%ymm10,%ymm14,%ymm13 - vperm2i128 $0x13,%ymm2,%ymm6,%ymm2 - vperm2i128 $0x13,%ymm10,%ymm14,%ymm6 - jmp L$open_avx2_short - - - - - -.p2align 6 -chacha20_poly1305_seal_avx2: - - - - - - - - - - - - - vzeroupper - vmovdqa L$chacha20_consts(%rip),%ymm0 - vbroadcasti128 0(%r9),%ymm4 - vbroadcasti128 16(%r9),%ymm8 - vbroadcasti128 32(%r9),%ymm12 - vpaddd L$avx2_init(%rip),%ymm12,%ymm12 - cmpq $192,%rbx - jbe L$seal_avx2_192 - cmpq $320,%rbx - jbe L$seal_avx2_320 - vmovdqa %ymm0,%ymm1 - vmovdqa %ymm0,%ymm2 - vmovdqa %ymm0,%ymm3 - vmovdqa %ymm4,%ymm5 - vmovdqa %ymm4,%ymm6 - vmovdqa %ymm4,%ymm7 - vmovdqa %ymm4,0+64(%rbp) - vmovdqa %ymm8,%ymm9 - vmovdqa %ymm8,%ymm10 - vmovdqa %ymm8,%ymm11 - vmovdqa %ymm8,0+96(%rbp) - vmovdqa %ymm12,%ymm15 - vpaddd L$avx2_inc(%rip),%ymm15,%ymm14 - vpaddd L$avx2_inc(%rip),%ymm14,%ymm13 - vpaddd L$avx2_inc(%rip),%ymm13,%ymm12 - vmovdqa %ymm12,0+160(%rbp) - vmovdqa %ymm13,0+192(%rbp) - vmovdqa %ymm14,0+224(%rbp) - vmovdqa %ymm15,0+256(%rbp) - movq $10,%r10 -L$seal_avx2_init_rounds: - vmovdqa %ymm8,0+128(%rbp) - vmovdqa L$rol16(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor 
%ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $20,%ymm7,%ymm8 - vpslld $32-20,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $20,%ymm6,%ymm8 - vpslld $32-20,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $20,%ymm5,%ymm8 - vpslld $32-20,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $20,%ymm4,%ymm8 - vpslld $32-20,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa L$rol8(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $25,%ymm7,%ymm8 - vpslld $32-25,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $25,%ymm6,%ymm8 - vpslld $32-25,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $25,%ymm5,%ymm8 - vpslld $32-25,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $25,%ymm4,%ymm8 - vpslld $32-25,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa 0+128(%rbp),%ymm8 - vpalignr $4,%ymm7,%ymm7,%ymm7 - vpalignr $8,%ymm11,%ymm11,%ymm11 - vpalignr $12,%ymm15,%ymm15,%ymm15 - vpalignr $4,%ymm6,%ymm6,%ymm6 - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $12,%ymm14,%ymm14,%ymm14 - vpalignr $4,%ymm5,%ymm5,%ymm5 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr 
$12,%ymm13,%ymm13,%ymm13 - vpalignr $4,%ymm4,%ymm4,%ymm4 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $12,%ymm12,%ymm12,%ymm12 - vmovdqa %ymm8,0+128(%rbp) - vmovdqa L$rol16(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $20,%ymm7,%ymm8 - vpslld $32-20,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $20,%ymm6,%ymm8 - vpslld $32-20,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $20,%ymm5,%ymm8 - vpslld $32-20,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $20,%ymm4,%ymm8 - vpslld $32-20,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa L$rol8(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $25,%ymm7,%ymm8 - vpslld $32-25,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $25,%ymm6,%ymm8 - vpslld $32-25,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $25,%ymm5,%ymm8 - vpslld $32-25,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $25,%ymm4,%ymm8 - vpslld 
$32-25,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa 0+128(%rbp),%ymm8 - vpalignr $12,%ymm7,%ymm7,%ymm7 - vpalignr $8,%ymm11,%ymm11,%ymm11 - vpalignr $4,%ymm15,%ymm15,%ymm15 - vpalignr $12,%ymm6,%ymm6,%ymm6 - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $4,%ymm14,%ymm14,%ymm14 - vpalignr $12,%ymm5,%ymm5,%ymm5 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $4,%ymm13,%ymm13,%ymm13 - vpalignr $12,%ymm4,%ymm4,%ymm4 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $4,%ymm12,%ymm12,%ymm12 - - decq %r10 - jnz L$seal_avx2_init_rounds - vpaddd L$chacha20_consts(%rip),%ymm3,%ymm3 - vpaddd 0+64(%rbp),%ymm7,%ymm7 - vpaddd 0+96(%rbp),%ymm11,%ymm11 - vpaddd 0+256(%rbp),%ymm15,%ymm15 - vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 - vpaddd 0+64(%rbp),%ymm6,%ymm6 - vpaddd 0+96(%rbp),%ymm10,%ymm10 - vpaddd 0+224(%rbp),%ymm14,%ymm14 - vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 - vpaddd 0+64(%rbp),%ymm5,%ymm5 - vpaddd 0+96(%rbp),%ymm9,%ymm9 - vpaddd 0+192(%rbp),%ymm13,%ymm13 - vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 - vpaddd 0+64(%rbp),%ymm4,%ymm4 - vpaddd 0+96(%rbp),%ymm8,%ymm8 - vpaddd 0+160(%rbp),%ymm12,%ymm12 - - vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 - vperm2i128 $0x02,%ymm3,%ymm7,%ymm15 - vperm2i128 $0x13,%ymm3,%ymm7,%ymm3 - vpand L$clamp(%rip),%ymm15,%ymm15 - vmovdqa %ymm15,0+0(%rbp) - movq %r8,%r8 - call poly_hash_ad_internal - - vpxor 0(%rsi),%ymm3,%ymm3 - vpxor 32(%rsi),%ymm11,%ymm11 - vmovdqu %ymm3,0(%rdi) - vmovdqu %ymm11,32(%rdi) - vperm2i128 $0x02,%ymm2,%ymm6,%ymm15 - vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 - vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 - vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 - vpxor 0+64(%rsi),%ymm15,%ymm15 - vpxor 32+64(%rsi),%ymm2,%ymm2 - vpxor 64+64(%rsi),%ymm6,%ymm6 - vpxor 96+64(%rsi),%ymm10,%ymm10 - vmovdqu %ymm15,0+64(%rdi) - vmovdqu %ymm2,32+64(%rdi) - vmovdqu %ymm6,64+64(%rdi) - vmovdqu %ymm10,96+64(%rdi) - vperm2i128 $0x02,%ymm1,%ymm5,%ymm15 - vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 - vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 - vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 - vpxor 
0+192(%rsi),%ymm15,%ymm15 - vpxor 32+192(%rsi),%ymm1,%ymm1 - vpxor 64+192(%rsi),%ymm5,%ymm5 - vpxor 96+192(%rsi),%ymm9,%ymm9 - vmovdqu %ymm15,0+192(%rdi) - vmovdqu %ymm1,32+192(%rdi) - vmovdqu %ymm5,64+192(%rdi) - vmovdqu %ymm9,96+192(%rdi) - vperm2i128 $0x13,%ymm0,%ymm4,%ymm15 - vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 - vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 - vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 - vmovdqa %ymm15,%ymm8 - - leaq 320(%rsi),%rsi - subq $320,%rbx - movq $320,%rcx - cmpq $128,%rbx - jbe L$seal_avx2_short_hash_remainder - vpxor 0(%rsi),%ymm0,%ymm0 - vpxor 32(%rsi),%ymm4,%ymm4 - vpxor 64(%rsi),%ymm8,%ymm8 - vpxor 96(%rsi),%ymm12,%ymm12 - vmovdqu %ymm0,320(%rdi) - vmovdqu %ymm4,352(%rdi) - vmovdqu %ymm8,384(%rdi) - vmovdqu %ymm12,416(%rdi) - leaq 128(%rsi),%rsi - subq $128,%rbx - movq $8,%rcx - movq $2,%r8 - cmpq $128,%rbx - jbe L$seal_avx2_tail_128 - cmpq $256,%rbx - jbe L$seal_avx2_tail_256 - cmpq $384,%rbx - jbe L$seal_avx2_tail_384 - cmpq $512,%rbx - jbe L$seal_avx2_tail_512 - vmovdqa L$chacha20_consts(%rip),%ymm0 - vmovdqa 0+64(%rbp),%ymm4 - vmovdqa 0+96(%rbp),%ymm8 - vmovdqa %ymm0,%ymm1 - vmovdqa %ymm4,%ymm5 - vmovdqa %ymm8,%ymm9 - vmovdqa %ymm0,%ymm2 - vmovdqa %ymm4,%ymm6 - vmovdqa %ymm8,%ymm10 - vmovdqa %ymm0,%ymm3 - vmovdqa %ymm4,%ymm7 - vmovdqa %ymm8,%ymm11 - vmovdqa L$avx2_inc(%rip),%ymm12 - vpaddd 0+160(%rbp),%ymm12,%ymm15 - vpaddd %ymm15,%ymm12,%ymm14 - vpaddd %ymm14,%ymm12,%ymm13 - vpaddd %ymm13,%ymm12,%ymm12 - vmovdqa %ymm15,0+256(%rbp) - vmovdqa %ymm14,0+224(%rbp) - vmovdqa %ymm13,0+192(%rbp) - vmovdqa %ymm12,0+160(%rbp) - vmovdqa %ymm8,0+128(%rbp) - vmovdqa L$rol16(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - 
vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $20,%ymm7,%ymm8 - vpslld $32-20,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $20,%ymm6,%ymm8 - vpslld $32-20,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $20,%ymm5,%ymm8 - vpslld $32-20,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $20,%ymm4,%ymm8 - vpslld $32-20,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa L$rol8(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $25,%ymm7,%ymm8 - vpslld $32-25,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $25,%ymm6,%ymm8 - vpslld $32-25,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $25,%ymm5,%ymm8 - vpslld $32-25,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $25,%ymm4,%ymm8 - vpslld $32-25,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa 0+128(%rbp),%ymm8 - vpalignr $4,%ymm7,%ymm7,%ymm7 - vpalignr $8,%ymm11,%ymm11,%ymm11 - vpalignr $12,%ymm15,%ymm15,%ymm15 - vpalignr $4,%ymm6,%ymm6,%ymm6 - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $12,%ymm14,%ymm14,%ymm14 - vpalignr $4,%ymm5,%ymm5,%ymm5 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $12,%ymm13,%ymm13,%ymm13 - vpalignr $4,%ymm4,%ymm4,%ymm4 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $12,%ymm12,%ymm12,%ymm12 - vmovdqa %ymm8,0+128(%rbp) - vmovdqa L$rol16(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 
- vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $20,%ymm7,%ymm8 - vpslld $32-20,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $20,%ymm6,%ymm8 - vpslld $32-20,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $20,%ymm5,%ymm8 - vpslld $32-20,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $20,%ymm4,%ymm8 - vpslld $32-20,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa L$rol8(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $25,%ymm7,%ymm8 - vpslld $32-25,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $25,%ymm6,%ymm8 - vpslld $32-25,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $25,%ymm5,%ymm8 - vpslld $32-25,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $25,%ymm4,%ymm8 - vpslld $32-25,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa 0+128(%rbp),%ymm8 - vpalignr $12,%ymm7,%ymm7,%ymm7 - vpalignr $8,%ymm11,%ymm11,%ymm11 - vpalignr $4,%ymm15,%ymm15,%ymm15 - vpalignr $12,%ymm6,%ymm6,%ymm6 - 
vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $4,%ymm14,%ymm14,%ymm14 - vpalignr $12,%ymm5,%ymm5,%ymm5 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $4,%ymm13,%ymm13,%ymm13 - vpalignr $12,%ymm4,%ymm4,%ymm4 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $4,%ymm12,%ymm12,%ymm12 - vmovdqa %ymm8,0+128(%rbp) - vmovdqa L$rol16(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $20,%ymm7,%ymm8 - vpslld $32-20,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $20,%ymm6,%ymm8 - vpslld $32-20,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $20,%ymm5,%ymm8 - vpslld $32-20,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $20,%ymm4,%ymm8 - vpslld $32-20,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa L$rol8(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - - subq $16,%rdi - movq $9,%rcx - jmp L$seal_avx2_main_loop_rounds_entry -.p2align 5 -L$seal_avx2_main_loop: - vmovdqa L$chacha20_consts(%rip),%ymm0 - vmovdqa 0+64(%rbp),%ymm4 - vmovdqa 0+96(%rbp),%ymm8 - vmovdqa %ymm0,%ymm1 - vmovdqa %ymm4,%ymm5 - vmovdqa %ymm8,%ymm9 - vmovdqa %ymm0,%ymm2 - vmovdqa %ymm4,%ymm6 - vmovdqa %ymm8,%ymm10 - vmovdqa %ymm0,%ymm3 - vmovdqa %ymm4,%ymm7 - vmovdqa %ymm8,%ymm11 - vmovdqa L$avx2_inc(%rip),%ymm12 - vpaddd 0+160(%rbp),%ymm12,%ymm15 - vpaddd %ymm15,%ymm12,%ymm14 - vpaddd %ymm14,%ymm12,%ymm13 - vpaddd %ymm13,%ymm12,%ymm12 - vmovdqa %ymm15,0+256(%rbp) - 
vmovdqa %ymm14,0+224(%rbp) - vmovdqa %ymm13,0+192(%rbp) - vmovdqa %ymm12,0+160(%rbp) - - movq $10,%rcx -.p2align 5 -L$seal_avx2_main_loop_rounds: - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - vmovdqa %ymm8,0+128(%rbp) - vmovdqa L$rol16(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $20,%ymm7,%ymm8 - vpslld $32-20,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $20,%ymm6,%ymm8 - vpslld $32-20,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $20,%ymm5,%ymm8 - vpslld $32-20,%ymm5,%ymm5 - addq %rax,%r15 - adcq %rdx,%r9 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $20,%ymm4,%ymm8 - vpslld $32-20,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa L$rol8(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - -L$seal_avx2_main_loop_rounds_entry: - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 
- vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - addq 0+16(%rdi),%r10 - adcq 8+16(%rdi),%r11 - adcq $1,%r12 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $25,%ymm7,%ymm8 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - vpslld $32-25,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $25,%ymm6,%ymm8 - vpslld $32-25,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $25,%ymm5,%ymm8 - vpslld $32-25,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $25,%ymm4,%ymm8 - vpslld $32-25,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa 0+128(%rbp),%ymm8 - vpalignr $4,%ymm7,%ymm7,%ymm7 - vpalignr $8,%ymm11,%ymm11,%ymm11 - vpalignr $12,%ymm15,%ymm15,%ymm15 - vpalignr $4,%ymm6,%ymm6,%ymm6 - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $12,%ymm14,%ymm14,%ymm14 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - vpalignr $4,%ymm5,%ymm5,%ymm5 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $12,%ymm13,%ymm13,%ymm13 - vpalignr $4,%ymm4,%ymm4,%ymm4 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $12,%ymm12,%ymm12,%ymm12 - vmovdqa %ymm8,0+128(%rbp) - vmovdqa L$rol16(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - addq %rax,%r15 - adcq %rdx,%r9 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 
0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $20,%ymm7,%ymm8 - vpslld $32-20,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $20,%ymm6,%ymm8 - vpslld $32-20,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - addq 0+32(%rdi),%r10 - adcq 8+32(%rdi),%r11 - adcq $1,%r12 - - leaq 48(%rdi),%rdi - vpsrld $20,%ymm5,%ymm8 - vpslld $32-20,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $20,%ymm4,%ymm8 - vpslld $32-20,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa L$rol8(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $25,%ymm7,%ymm8 - vpslld $32-25,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $25,%ymm6,%ymm8 - vpslld $32-25,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - addq %rax,%r15 - adcq %rdx,%r9 - vpsrld $25,%ymm5,%ymm8 - vpslld $32-25,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $25,%ymm4,%ymm8 - vpslld 
$32-25,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa 0+128(%rbp),%ymm8 - vpalignr $12,%ymm7,%ymm7,%ymm7 - vpalignr $8,%ymm11,%ymm11,%ymm11 - vpalignr $4,%ymm15,%ymm15,%ymm15 - vpalignr $12,%ymm6,%ymm6,%ymm6 - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $4,%ymm14,%ymm14,%ymm14 - vpalignr $12,%ymm5,%ymm5,%ymm5 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $4,%ymm13,%ymm13,%ymm13 - vpalignr $12,%ymm4,%ymm4,%ymm4 - vpalignr $8,%ymm8,%ymm8,%ymm8 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - vpalignr $4,%ymm12,%ymm12,%ymm12 - - decq %rcx - jne L$seal_avx2_main_loop_rounds - vpaddd L$chacha20_consts(%rip),%ymm3,%ymm3 - vpaddd 0+64(%rbp),%ymm7,%ymm7 - vpaddd 0+96(%rbp),%ymm11,%ymm11 - vpaddd 0+256(%rbp),%ymm15,%ymm15 - vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 - vpaddd 0+64(%rbp),%ymm6,%ymm6 - vpaddd 0+96(%rbp),%ymm10,%ymm10 - vpaddd 0+224(%rbp),%ymm14,%ymm14 - vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 - vpaddd 0+64(%rbp),%ymm5,%ymm5 - vpaddd 0+96(%rbp),%ymm9,%ymm9 - vpaddd 0+192(%rbp),%ymm13,%ymm13 - vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 - vpaddd 0+64(%rbp),%ymm4,%ymm4 - vpaddd 0+96(%rbp),%ymm8,%ymm8 - vpaddd 0+160(%rbp),%ymm12,%ymm12 - - vmovdqa %ymm0,0+128(%rbp) - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - addq %rax,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - addq 0+16(%rdi),%r10 - adcq 
8+16(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - addq %rax,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 32(%rdi),%rdi - vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 - vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 - vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 - vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 - vpxor 0+0(%rsi),%ymm0,%ymm0 - vpxor 32+0(%rsi),%ymm3,%ymm3 - vpxor 64+0(%rsi),%ymm7,%ymm7 - vpxor 96+0(%rsi),%ymm11,%ymm11 - vmovdqu %ymm0,0+0(%rdi) - vmovdqu %ymm3,32+0(%rdi) - vmovdqu %ymm7,64+0(%rdi) - vmovdqu %ymm11,96+0(%rdi) - - vmovdqa 0+128(%rbp),%ymm0 - vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 - vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 - vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 - vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 - vpxor 0+128(%rsi),%ymm3,%ymm3 - vpxor 32+128(%rsi),%ymm2,%ymm2 - vpxor 64+128(%rsi),%ymm6,%ymm6 - vpxor 96+128(%rsi),%ymm10,%ymm10 - vmovdqu %ymm3,0+128(%rdi) - vmovdqu %ymm2,32+128(%rdi) - vmovdqu %ymm6,64+128(%rdi) - vmovdqu %ymm10,96+128(%rdi) - vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 - vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 - vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 - vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 - vpxor 0+256(%rsi),%ymm3,%ymm3 - vpxor 32+256(%rsi),%ymm1,%ymm1 - vpxor 64+256(%rsi),%ymm5,%ymm5 - vpxor 96+256(%rsi),%ymm9,%ymm9 - vmovdqu %ymm3,0+256(%rdi) - vmovdqu %ymm1,32+256(%rdi) - vmovdqu %ymm5,64+256(%rdi) - vmovdqu %ymm9,96+256(%rdi) - vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 - vperm2i128 $0x13,%ymm0,%ymm4,%ymm4 - vperm2i128 $0x02,%ymm8,%ymm12,%ymm0 - vperm2i128 $0x13,%ymm8,%ymm12,%ymm8 - vpxor 0+384(%rsi),%ymm3,%ymm3 - vpxor 
32+384(%rsi),%ymm0,%ymm0 - vpxor 64+384(%rsi),%ymm4,%ymm4 - vpxor 96+384(%rsi),%ymm8,%ymm8 - vmovdqu %ymm3,0+384(%rdi) - vmovdqu %ymm0,32+384(%rdi) - vmovdqu %ymm4,64+384(%rdi) - vmovdqu %ymm8,96+384(%rdi) - - leaq 512(%rsi),%rsi - subq $512,%rbx - cmpq $512,%rbx - jg L$seal_avx2_main_loop - - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - addq %rax,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - addq 0+16(%rdi),%r10 - adcq 8+16(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - addq %rax,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 32(%rdi),%rdi - movq $10,%rcx - xorq %r8,%r8 - - cmpq $384,%rbx - ja L$seal_avx2_tail_512 - cmpq $256,%rbx - ja L$seal_avx2_tail_384 - cmpq $128,%rbx - ja L$seal_avx2_tail_256 - -L$seal_avx2_tail_128: - vmovdqa L$chacha20_consts(%rip),%ymm0 - vmovdqa 0+64(%rbp),%ymm4 - vmovdqa 0+96(%rbp),%ymm8 - vmovdqa L$avx2_inc(%rip),%ymm12 - vpaddd 0+160(%rbp),%ymm12,%ymm12 - vmovdqa %ymm12,0+160(%rbp) - -L$seal_avx2_tail_128_rounds_and_3xhash: - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - movq 
0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - addq %rax,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 16(%rdi),%rdi -L$seal_avx2_tail_128_rounds_and_2xhash: - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb L$rol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $20,%ymm4,%ymm3 - vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb L$rol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $12,%ymm12,%ymm12,%ymm12 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $4,%ymm4,%ymm4,%ymm4 - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - addq %rax,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb L$rol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $20,%ymm4,%ymm3 - vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor 
%ymm0,%ymm12,%ymm12 - vpshufb L$rol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $4,%ymm12,%ymm12,%ymm12 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $12,%ymm4,%ymm4,%ymm4 - addq 0+16(%rdi),%r10 - adcq 8+16(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - addq %rax,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 32(%rdi),%rdi - decq %rcx - jg L$seal_avx2_tail_128_rounds_and_3xhash - decq %r8 - jge L$seal_avx2_tail_128_rounds_and_2xhash - vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 - vpaddd 0+64(%rbp),%ymm4,%ymm4 - vpaddd 0+96(%rbp),%ymm8,%ymm8 - vpaddd 0+160(%rbp),%ymm12,%ymm12 - vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 - vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 - vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 - vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 - vmovdqa %ymm3,%ymm8 - - jmp L$seal_avx2_short_loop - -L$seal_avx2_tail_256: - vmovdqa L$chacha20_consts(%rip),%ymm0 - vmovdqa 0+64(%rbp),%ymm4 - vmovdqa 0+96(%rbp),%ymm8 - vmovdqa %ymm0,%ymm1 - vmovdqa %ymm4,%ymm5 - vmovdqa %ymm8,%ymm9 - vmovdqa L$avx2_inc(%rip),%ymm12 - vpaddd 0+160(%rbp),%ymm12,%ymm13 - vpaddd %ymm13,%ymm12,%ymm12 - vmovdqa %ymm12,0+160(%rbp) - vmovdqa %ymm13,0+192(%rbp) - -L$seal_avx2_tail_256_rounds_and_3xhash: - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 
8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 16(%rdi),%rdi -L$seal_avx2_tail_256_rounds_and_2xhash: - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb L$rol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $20,%ymm4,%ymm3 - vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb L$rol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $12,%ymm12,%ymm12,%ymm12 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $4,%ymm4,%ymm4,%ymm4 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb L$rol16(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpsrld $20,%ymm5,%ymm3 - vpslld $12,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb L$rol8(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm3 - vpsrld $25,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpalignr $12,%ymm13,%ymm13,%ymm13 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $4,%ymm5,%ymm5,%ymm5 - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - 
imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb L$rol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $20,%ymm4,%ymm3 - vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb L$rol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $4,%ymm12,%ymm12,%ymm12 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $12,%ymm4,%ymm4,%ymm4 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb L$rol16(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpsrld $20,%ymm5,%ymm3 - vpslld $12,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb L$rol8(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm3 - vpsrld $25,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpalignr $4,%ymm13,%ymm13,%ymm13 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $12,%ymm5,%ymm5,%ymm5 - addq 0+16(%rdi),%r10 - adcq 8+16(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq 
%r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 32(%rdi),%rdi - decq %rcx - jg L$seal_avx2_tail_256_rounds_and_3xhash - decq %r8 - jge L$seal_avx2_tail_256_rounds_and_2xhash - vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 - vpaddd 0+64(%rbp),%ymm5,%ymm5 - vpaddd 0+96(%rbp),%ymm9,%ymm9 - vpaddd 0+192(%rbp),%ymm13,%ymm13 - vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 - vpaddd 0+64(%rbp),%ymm4,%ymm4 - vpaddd 0+96(%rbp),%ymm8,%ymm8 - vpaddd 0+160(%rbp),%ymm12,%ymm12 - vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 - vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 - vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 - vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 - vpxor 0+0(%rsi),%ymm3,%ymm3 - vpxor 32+0(%rsi),%ymm1,%ymm1 - vpxor 64+0(%rsi),%ymm5,%ymm5 - vpxor 96+0(%rsi),%ymm9,%ymm9 - vmovdqu %ymm3,0+0(%rdi) - vmovdqu %ymm1,32+0(%rdi) - vmovdqu %ymm5,64+0(%rdi) - vmovdqu %ymm9,96+0(%rdi) - vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 - vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 - vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 - vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 - vmovdqa %ymm3,%ymm8 - - movq $128,%rcx - leaq 128(%rsi),%rsi - subq $128,%rbx - jmp L$seal_avx2_short_hash_remainder - -L$seal_avx2_tail_384: - vmovdqa L$chacha20_consts(%rip),%ymm0 - vmovdqa 0+64(%rbp),%ymm4 - vmovdqa 0+96(%rbp),%ymm8 - vmovdqa %ymm0,%ymm1 - vmovdqa %ymm4,%ymm5 - vmovdqa %ymm8,%ymm9 - vmovdqa %ymm0,%ymm2 - vmovdqa %ymm4,%ymm6 - vmovdqa %ymm8,%ymm10 - vmovdqa L$avx2_inc(%rip),%ymm12 - vpaddd 0+160(%rbp),%ymm12,%ymm14 - vpaddd %ymm14,%ymm12,%ymm13 - vpaddd %ymm13,%ymm12,%ymm12 - vmovdqa %ymm12,0+160(%rbp) - vmovdqa %ymm13,0+192(%rbp) - vmovdqa %ymm14,0+224(%rbp) - -L$seal_avx2_tail_384_rounds_and_3xhash: - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax 
- mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 16(%rdi),%rdi -L$seal_avx2_tail_384_rounds_and_2xhash: - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb L$rol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $20,%ymm4,%ymm3 - vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb L$rol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $12,%ymm12,%ymm12,%ymm12 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $4,%ymm4,%ymm4,%ymm4 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb L$rol16(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpsrld $20,%ymm5,%ymm3 - vpslld $12,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb L$rol8(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm3 - vpsrld $25,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpalignr $12,%ymm13,%ymm13,%ymm13 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $4,%ymm5,%ymm5,%ymm5 - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - 
movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - vpaddd %ymm6,%ymm2,%ymm2 - vpxor %ymm2,%ymm14,%ymm14 - vpshufb L$rol16(%rip),%ymm14,%ymm14 - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpsrld $20,%ymm6,%ymm3 - vpslld $12,%ymm6,%ymm6 - vpxor %ymm3,%ymm6,%ymm6 - vpaddd %ymm6,%ymm2,%ymm2 - vpxor %ymm2,%ymm14,%ymm14 - vpshufb L$rol8(%rip),%ymm14,%ymm14 - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpslld $7,%ymm6,%ymm3 - vpsrld $25,%ymm6,%ymm6 - vpxor %ymm3,%ymm6,%ymm6 - vpalignr $12,%ymm14,%ymm14,%ymm14 - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $4,%ymm6,%ymm6,%ymm6 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb L$rol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $20,%ymm4,%ymm3 - vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb L$rol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $4,%ymm12,%ymm12,%ymm12 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $12,%ymm4,%ymm4,%ymm4 - addq 0+16(%rdi),%r10 - adcq 8+16(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 
- vpshufb L$rol16(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpsrld $20,%ymm5,%ymm3 - vpslld $12,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb L$rol8(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm3 - vpsrld $25,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpalignr $4,%ymm13,%ymm13,%ymm13 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $12,%ymm5,%ymm5,%ymm5 - vpaddd %ymm6,%ymm2,%ymm2 - vpxor %ymm2,%ymm14,%ymm14 - vpshufb L$rol16(%rip),%ymm14,%ymm14 - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpsrld $20,%ymm6,%ymm3 - vpslld $12,%ymm6,%ymm6 - vpxor %ymm3,%ymm6,%ymm6 - vpaddd %ymm6,%ymm2,%ymm2 - vpxor %ymm2,%ymm14,%ymm14 - vpshufb L$rol8(%rip),%ymm14,%ymm14 - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpslld $7,%ymm6,%ymm3 - vpsrld $25,%ymm6,%ymm6 - vpxor %ymm3,%ymm6,%ymm6 - vpalignr $4,%ymm14,%ymm14,%ymm14 - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $12,%ymm6,%ymm6,%ymm6 - - leaq 32(%rdi),%rdi - decq %rcx - jg L$seal_avx2_tail_384_rounds_and_3xhash - decq %r8 - jge L$seal_avx2_tail_384_rounds_and_2xhash - vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 - vpaddd 0+64(%rbp),%ymm6,%ymm6 - vpaddd 0+96(%rbp),%ymm10,%ymm10 - vpaddd 0+224(%rbp),%ymm14,%ymm14 - vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 - vpaddd 0+64(%rbp),%ymm5,%ymm5 - vpaddd 0+96(%rbp),%ymm9,%ymm9 - vpaddd 0+192(%rbp),%ymm13,%ymm13 - vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 - vpaddd 0+64(%rbp),%ymm4,%ymm4 - vpaddd 0+96(%rbp),%ymm8,%ymm8 - vpaddd 0+160(%rbp),%ymm12,%ymm12 - vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 - vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 - vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 - vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 - vpxor 0+0(%rsi),%ymm3,%ymm3 - vpxor 32+0(%rsi),%ymm2,%ymm2 - vpxor 64+0(%rsi),%ymm6,%ymm6 - vpxor 96+0(%rsi),%ymm10,%ymm10 - vmovdqu %ymm3,0+0(%rdi) - vmovdqu %ymm2,32+0(%rdi) - vmovdqu %ymm6,64+0(%rdi) - vmovdqu %ymm10,96+0(%rdi) - 
vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 - vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 - vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 - vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 - vpxor 0+128(%rsi),%ymm3,%ymm3 - vpxor 32+128(%rsi),%ymm1,%ymm1 - vpxor 64+128(%rsi),%ymm5,%ymm5 - vpxor 96+128(%rsi),%ymm9,%ymm9 - vmovdqu %ymm3,0+128(%rdi) - vmovdqu %ymm1,32+128(%rdi) - vmovdqu %ymm5,64+128(%rdi) - vmovdqu %ymm9,96+128(%rdi) - vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 - vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 - vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 - vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 - vmovdqa %ymm3,%ymm8 - - movq $256,%rcx - leaq 256(%rsi),%rsi - subq $256,%rbx - jmp L$seal_avx2_short_hash_remainder - -L$seal_avx2_tail_512: - vmovdqa L$chacha20_consts(%rip),%ymm0 - vmovdqa 0+64(%rbp),%ymm4 - vmovdqa 0+96(%rbp),%ymm8 - vmovdqa %ymm0,%ymm1 - vmovdqa %ymm4,%ymm5 - vmovdqa %ymm8,%ymm9 - vmovdqa %ymm0,%ymm2 - vmovdqa %ymm4,%ymm6 - vmovdqa %ymm8,%ymm10 - vmovdqa %ymm0,%ymm3 - vmovdqa %ymm4,%ymm7 - vmovdqa %ymm8,%ymm11 - vmovdqa L$avx2_inc(%rip),%ymm12 - vpaddd 0+160(%rbp),%ymm12,%ymm15 - vpaddd %ymm15,%ymm12,%ymm14 - vpaddd %ymm14,%ymm12,%ymm13 - vpaddd %ymm13,%ymm12,%ymm12 - vmovdqa %ymm15,0+256(%rbp) - vmovdqa %ymm14,0+224(%rbp) - vmovdqa %ymm13,0+192(%rbp) - vmovdqa %ymm12,0+160(%rbp) - -L$seal_avx2_tail_512_rounds_and_3xhash: - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - addq %rax,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 16(%rdi),%rdi -L$seal_avx2_tail_512_rounds_and_2xhash: - vmovdqa %ymm8,0+128(%rbp) - vmovdqa 
L$rol16(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $20,%ymm7,%ymm8 - vpslld $32-20,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $20,%ymm6,%ymm8 - vpslld $32-20,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $20,%ymm5,%ymm8 - vpslld $32-20,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $20,%ymm4,%ymm8 - vpslld $32-20,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa L$rol8(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $25,%ymm7,%ymm8 - vpslld $32-25,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - vpsrld $25,%ymm6,%ymm8 - 
vpslld $32-25,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $25,%ymm5,%ymm8 - vpslld $32-25,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $25,%ymm4,%ymm8 - vpslld $32-25,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa 0+128(%rbp),%ymm8 - vpalignr $4,%ymm7,%ymm7,%ymm7 - vpalignr $8,%ymm11,%ymm11,%ymm11 - vpalignr $12,%ymm15,%ymm15,%ymm15 - vpalignr $4,%ymm6,%ymm6,%ymm6 - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $12,%ymm14,%ymm14,%ymm14 - vpalignr $4,%ymm5,%ymm5,%ymm5 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $12,%ymm13,%ymm13,%ymm13 - vpalignr $4,%ymm4,%ymm4,%ymm4 - addq %rax,%r15 - adcq %rdx,%r9 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $12,%ymm12,%ymm12,%ymm12 - vmovdqa %ymm8,0+128(%rbp) - vmovdqa L$rol16(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $20,%ymm7,%ymm8 - vpslld $32-20,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $20,%ymm6,%ymm8 - vpslld $32-20,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $20,%ymm5,%ymm8 - vpslld $32-20,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $20,%ymm4,%ymm8 - vpslld $32-20,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa L$rol8(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - addq 0+16(%rdi),%r10 - adcq 
8+16(%rdi),%r11 - adcq $1,%r12 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $25,%ymm7,%ymm8 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - vpslld $32-25,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $25,%ymm6,%ymm8 - vpslld $32-25,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $25,%ymm5,%ymm8 - vpslld $32-25,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $25,%ymm4,%ymm8 - vpslld $32-25,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa 0+128(%rbp),%ymm8 - vpalignr $12,%ymm7,%ymm7,%ymm7 - vpalignr $8,%ymm11,%ymm11,%ymm11 - vpalignr $4,%ymm15,%ymm15,%ymm15 - vpalignr $12,%ymm6,%ymm6,%ymm6 - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $4,%ymm14,%ymm14,%ymm14 - vpalignr $12,%ymm5,%ymm5,%ymm5 - vpalignr $8,%ymm9,%ymm9,%ymm9 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - vpalignr $4,%ymm13,%ymm13,%ymm13 - vpalignr $12,%ymm4,%ymm4,%ymm4 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $4,%ymm12,%ymm12,%ymm12 - - - - - - - - - - - - - - - - - addq %rax,%r15 - adcq %rdx,%r9 - - - - - - - - - - - - - - - - - - - - - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 32(%rdi),%rdi - decq %rcx - jg 
L$seal_avx2_tail_512_rounds_and_3xhash - decq %r8 - jge L$seal_avx2_tail_512_rounds_and_2xhash - vpaddd L$chacha20_consts(%rip),%ymm3,%ymm3 - vpaddd 0+64(%rbp),%ymm7,%ymm7 - vpaddd 0+96(%rbp),%ymm11,%ymm11 - vpaddd 0+256(%rbp),%ymm15,%ymm15 - vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 - vpaddd 0+64(%rbp),%ymm6,%ymm6 - vpaddd 0+96(%rbp),%ymm10,%ymm10 - vpaddd 0+224(%rbp),%ymm14,%ymm14 - vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 - vpaddd 0+64(%rbp),%ymm5,%ymm5 - vpaddd 0+96(%rbp),%ymm9,%ymm9 - vpaddd 0+192(%rbp),%ymm13,%ymm13 - vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 - vpaddd 0+64(%rbp),%ymm4,%ymm4 - vpaddd 0+96(%rbp),%ymm8,%ymm8 - vpaddd 0+160(%rbp),%ymm12,%ymm12 - - vmovdqa %ymm0,0+128(%rbp) - vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 - vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 - vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 - vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 - vpxor 0+0(%rsi),%ymm0,%ymm0 - vpxor 32+0(%rsi),%ymm3,%ymm3 - vpxor 64+0(%rsi),%ymm7,%ymm7 - vpxor 96+0(%rsi),%ymm11,%ymm11 - vmovdqu %ymm0,0+0(%rdi) - vmovdqu %ymm3,32+0(%rdi) - vmovdqu %ymm7,64+0(%rdi) - vmovdqu %ymm11,96+0(%rdi) - - vmovdqa 0+128(%rbp),%ymm0 - vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 - vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 - vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 - vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 - vpxor 0+128(%rsi),%ymm3,%ymm3 - vpxor 32+128(%rsi),%ymm2,%ymm2 - vpxor 64+128(%rsi),%ymm6,%ymm6 - vpxor 96+128(%rsi),%ymm10,%ymm10 - vmovdqu %ymm3,0+128(%rdi) - vmovdqu %ymm2,32+128(%rdi) - vmovdqu %ymm6,64+128(%rdi) - vmovdqu %ymm10,96+128(%rdi) - vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 - vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 - vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 - vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 - vpxor 0+256(%rsi),%ymm3,%ymm3 - vpxor 32+256(%rsi),%ymm1,%ymm1 - vpxor 64+256(%rsi),%ymm5,%ymm5 - vpxor 96+256(%rsi),%ymm9,%ymm9 - vmovdqu %ymm3,0+256(%rdi) - vmovdqu %ymm1,32+256(%rdi) - vmovdqu %ymm5,64+256(%rdi) - vmovdqu %ymm9,96+256(%rdi) - vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 - vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 - 
vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 - vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 - vmovdqa %ymm3,%ymm8 - - movq $384,%rcx - leaq 384(%rsi),%rsi - subq $384,%rbx - jmp L$seal_avx2_short_hash_remainder - -L$seal_avx2_320: - vmovdqa %ymm0,%ymm1 - vmovdqa %ymm0,%ymm2 - vmovdqa %ymm4,%ymm5 - vmovdqa %ymm4,%ymm6 - vmovdqa %ymm8,%ymm9 - vmovdqa %ymm8,%ymm10 - vpaddd L$avx2_inc(%rip),%ymm12,%ymm13 - vpaddd L$avx2_inc(%rip),%ymm13,%ymm14 - vmovdqa %ymm4,%ymm7 - vmovdqa %ymm8,%ymm11 - vmovdqa %ymm12,0+160(%rbp) - vmovdqa %ymm13,0+192(%rbp) - vmovdqa %ymm14,0+224(%rbp) - movq $10,%r10 -L$seal_avx2_320_rounds: - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb L$rol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $20,%ymm4,%ymm3 - vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb L$rol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $12,%ymm12,%ymm12,%ymm12 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $4,%ymm4,%ymm4,%ymm4 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb L$rol16(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpsrld $20,%ymm5,%ymm3 - vpslld $12,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb L$rol8(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm3 - vpsrld $25,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpalignr $12,%ymm13,%ymm13,%ymm13 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $4,%ymm5,%ymm5,%ymm5 - vpaddd %ymm6,%ymm2,%ymm2 - vpxor %ymm2,%ymm14,%ymm14 - vpshufb L$rol16(%rip),%ymm14,%ymm14 - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpsrld $20,%ymm6,%ymm3 - vpslld $12,%ymm6,%ymm6 - vpxor %ymm3,%ymm6,%ymm6 - vpaddd %ymm6,%ymm2,%ymm2 - vpxor %ymm2,%ymm14,%ymm14 - vpshufb 
L$rol8(%rip),%ymm14,%ymm14 - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpslld $7,%ymm6,%ymm3 - vpsrld $25,%ymm6,%ymm6 - vpxor %ymm3,%ymm6,%ymm6 - vpalignr $12,%ymm14,%ymm14,%ymm14 - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $4,%ymm6,%ymm6,%ymm6 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb L$rol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $20,%ymm4,%ymm3 - vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb L$rol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $4,%ymm12,%ymm12,%ymm12 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $12,%ymm4,%ymm4,%ymm4 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb L$rol16(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpsrld $20,%ymm5,%ymm3 - vpslld $12,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb L$rol8(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm3 - vpsrld $25,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpalignr $4,%ymm13,%ymm13,%ymm13 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $12,%ymm5,%ymm5,%ymm5 - vpaddd %ymm6,%ymm2,%ymm2 - vpxor %ymm2,%ymm14,%ymm14 - vpshufb L$rol16(%rip),%ymm14,%ymm14 - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpsrld $20,%ymm6,%ymm3 - vpslld $12,%ymm6,%ymm6 - vpxor %ymm3,%ymm6,%ymm6 - vpaddd %ymm6,%ymm2,%ymm2 - vpxor %ymm2,%ymm14,%ymm14 - vpshufb L$rol8(%rip),%ymm14,%ymm14 - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpslld $7,%ymm6,%ymm3 - vpsrld $25,%ymm6,%ymm6 - vpxor %ymm3,%ymm6,%ymm6 - vpalignr $4,%ymm14,%ymm14,%ymm14 - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $12,%ymm6,%ymm6,%ymm6 - - decq %r10 - jne L$seal_avx2_320_rounds - vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 - vpaddd 
L$chacha20_consts(%rip),%ymm1,%ymm1 - vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 - vpaddd %ymm7,%ymm4,%ymm4 - vpaddd %ymm7,%ymm5,%ymm5 - vpaddd %ymm7,%ymm6,%ymm6 - vpaddd %ymm11,%ymm8,%ymm8 - vpaddd %ymm11,%ymm9,%ymm9 - vpaddd %ymm11,%ymm10,%ymm10 - vpaddd 0+160(%rbp),%ymm12,%ymm12 - vpaddd 0+192(%rbp),%ymm13,%ymm13 - vpaddd 0+224(%rbp),%ymm14,%ymm14 - vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 - - vpand L$clamp(%rip),%ymm3,%ymm3 - vmovdqa %ymm3,0+0(%rbp) - - vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 - vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 - vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 - vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 - vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 - vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 - vperm2i128 $0x02,%ymm2,%ymm6,%ymm9 - vperm2i128 $0x02,%ymm10,%ymm14,%ymm13 - vperm2i128 $0x13,%ymm2,%ymm6,%ymm2 - vperm2i128 $0x13,%ymm10,%ymm14,%ymm6 - jmp L$seal_avx2_short - -L$seal_avx2_192: - vmovdqa %ymm0,%ymm1 - vmovdqa %ymm0,%ymm2 - vmovdqa %ymm4,%ymm5 - vmovdqa %ymm4,%ymm6 - vmovdqa %ymm8,%ymm9 - vmovdqa %ymm8,%ymm10 - vpaddd L$avx2_inc(%rip),%ymm12,%ymm13 - vmovdqa %ymm12,%ymm11 - vmovdqa %ymm13,%ymm15 - movq $10,%r10 -L$seal_avx2_192_rounds: - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb L$rol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $20,%ymm4,%ymm3 - vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb L$rol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $12,%ymm12,%ymm12,%ymm12 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $4,%ymm4,%ymm4,%ymm4 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb L$rol16(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpsrld $20,%ymm5,%ymm3 - vpslld $12,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb L$rol8(%rip),%ymm13,%ymm13 - 
vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm3 - vpsrld $25,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpalignr $12,%ymm13,%ymm13,%ymm13 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $4,%ymm5,%ymm5,%ymm5 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb L$rol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $20,%ymm4,%ymm3 - vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb L$rol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $4,%ymm12,%ymm12,%ymm12 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $12,%ymm4,%ymm4,%ymm4 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb L$rol16(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpsrld $20,%ymm5,%ymm3 - vpslld $12,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb L$rol8(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm3 - vpsrld $25,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpalignr $4,%ymm13,%ymm13,%ymm13 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $12,%ymm5,%ymm5,%ymm5 - - decq %r10 - jne L$seal_avx2_192_rounds - vpaddd %ymm2,%ymm0,%ymm0 - vpaddd %ymm2,%ymm1,%ymm1 - vpaddd %ymm6,%ymm4,%ymm4 - vpaddd %ymm6,%ymm5,%ymm5 - vpaddd %ymm10,%ymm8,%ymm8 - vpaddd %ymm10,%ymm9,%ymm9 - vpaddd %ymm11,%ymm12,%ymm12 - vpaddd %ymm15,%ymm13,%ymm13 - vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 - - vpand L$clamp(%rip),%ymm3,%ymm3 - vmovdqa %ymm3,0+0(%rbp) - - vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 - vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 - vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 - vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 - vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 - vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 -L$seal_avx2_short: - movq %r8,%r8 - call poly_hash_ad_internal - xorq %rcx,%rcx 
-L$seal_avx2_short_hash_remainder: - cmpq $16,%rcx - jb L$seal_avx2_short_loop - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - subq $16,%rcx - addq $16,%rdi - jmp L$seal_avx2_short_hash_remainder -L$seal_avx2_short_loop: - cmpq $32,%rbx - jb L$seal_avx2_short_tail - subq $32,%rbx - - vpxor (%rsi),%ymm0,%ymm0 - vmovdqu %ymm0,(%rdi) - leaq 32(%rsi),%rsi - - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - addq 0+16(%rdi),%r10 - adcq 8+16(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - 
movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 32(%rdi),%rdi - - vmovdqa %ymm4,%ymm0 - vmovdqa %ymm8,%ymm4 - vmovdqa %ymm12,%ymm8 - vmovdqa %ymm1,%ymm12 - vmovdqa %ymm5,%ymm1 - vmovdqa %ymm9,%ymm5 - vmovdqa %ymm13,%ymm9 - vmovdqa %ymm2,%ymm13 - vmovdqa %ymm6,%ymm2 - jmp L$seal_avx2_short_loop -L$seal_avx2_short_tail: - cmpq $16,%rbx - jb L$seal_avx2_exit - subq $16,%rbx - vpxor (%rsi),%xmm0,%xmm3 - vmovdqu %xmm3,(%rdi) - leaq 16(%rsi),%rsi - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 16(%rdi),%rdi - vextracti128 $1,%ymm0,%xmm0 -L$seal_avx2_exit: - vzeroupper - jmp L$seal_sse_tail_16 - - -#endif diff --git a/third_party/boringssl/apple-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S b/third_party/boringssl/apple-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S deleted file mode 100644 index e497c35f..00000000 --- a/third_party/boringssl/apple-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S +++ /dev/null @@ -1,850 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. 
Do not edit by hand. - -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text - - -.p2align 5 -_aesni_ctr32_ghash_6x: - - vmovdqu 32(%r11),%xmm2 - subq $6,%rdx - vpxor %xmm4,%xmm4,%xmm4 - vmovdqu 0-128(%rcx),%xmm15 - vpaddb %xmm2,%xmm1,%xmm10 - vpaddb %xmm2,%xmm10,%xmm11 - vpaddb %xmm2,%xmm11,%xmm12 - vpaddb %xmm2,%xmm12,%xmm13 - vpaddb %xmm2,%xmm13,%xmm14 - vpxor %xmm15,%xmm1,%xmm9 - vmovdqu %xmm4,16+8(%rsp) - jmp L$oop6x - -.p2align 5 -L$oop6x: - addl $100663296,%ebx - jc L$handle_ctr32 - vmovdqu 0-32(%r9),%xmm3 - vpaddb %xmm2,%xmm14,%xmm1 - vpxor %xmm15,%xmm10,%xmm10 - vpxor %xmm15,%xmm11,%xmm11 - -L$resume_ctr32: - vmovdqu %xmm1,(%r8) - vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5 - vpxor %xmm15,%xmm12,%xmm12 - vmovups 16-128(%rcx),%xmm2 - vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6 - - - - - - - - - - - - - - - - - - xorq %r12,%r12 - cmpq %r14,%r15 - - vaesenc %xmm2,%xmm9,%xmm9 - vmovdqu 48+8(%rsp),%xmm0 - vpxor %xmm15,%xmm13,%xmm13 - vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1 - vaesenc %xmm2,%xmm10,%xmm10 - vpxor %xmm15,%xmm14,%xmm14 - setnc %r12b - vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 - vaesenc %xmm2,%xmm11,%xmm11 - vmovdqu 16-32(%r9),%xmm3 - negq %r12 - vaesenc %xmm2,%xmm12,%xmm12 - vpxor %xmm5,%xmm6,%xmm6 - vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5 - vpxor %xmm4,%xmm8,%xmm8 - vaesenc %xmm2,%xmm13,%xmm13 - vpxor %xmm5,%xmm1,%xmm4 - andq $0x60,%r12 - vmovups 32-128(%rcx),%xmm15 - vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1 - vaesenc %xmm2,%xmm14,%xmm14 - - vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2 - leaq (%r14,%r12,1),%r14 - vaesenc %xmm15,%xmm9,%xmm9 - vpxor 16+8(%rsp),%xmm8,%xmm8 - vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3 - vmovdqu 64+8(%rsp),%xmm0 - vaesenc %xmm15,%xmm10,%xmm10 - movbeq 88(%r14),%r13 - vaesenc %xmm15,%xmm11,%xmm11 - movbeq 80(%r14),%r12 - vaesenc %xmm15,%xmm12,%xmm12 - movq %r13,32+8(%rsp) - vaesenc 
%xmm15,%xmm13,%xmm13 - movq %r12,40+8(%rsp) - vmovdqu 48-32(%r9),%xmm5 - vaesenc %xmm15,%xmm14,%xmm14 - - vmovups 48-128(%rcx),%xmm15 - vpxor %xmm1,%xmm6,%xmm6 - vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1 - vaesenc %xmm15,%xmm9,%xmm9 - vpxor %xmm2,%xmm6,%xmm6 - vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2 - vaesenc %xmm15,%xmm10,%xmm10 - vpxor %xmm3,%xmm7,%xmm7 - vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3 - vaesenc %xmm15,%xmm11,%xmm11 - vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5 - vmovdqu 80+8(%rsp),%xmm0 - vaesenc %xmm15,%xmm12,%xmm12 - vaesenc %xmm15,%xmm13,%xmm13 - vpxor %xmm1,%xmm4,%xmm4 - vmovdqu 64-32(%r9),%xmm1 - vaesenc %xmm15,%xmm14,%xmm14 - - vmovups 64-128(%rcx),%xmm15 - vpxor %xmm2,%xmm6,%xmm6 - vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2 - vaesenc %xmm15,%xmm9,%xmm9 - vpxor %xmm3,%xmm6,%xmm6 - vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3 - vaesenc %xmm15,%xmm10,%xmm10 - movbeq 72(%r14),%r13 - vpxor %xmm5,%xmm7,%xmm7 - vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5 - vaesenc %xmm15,%xmm11,%xmm11 - movbeq 64(%r14),%r12 - vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1 - vmovdqu 96+8(%rsp),%xmm0 - vaesenc %xmm15,%xmm12,%xmm12 - movq %r13,48+8(%rsp) - vaesenc %xmm15,%xmm13,%xmm13 - movq %r12,56+8(%rsp) - vpxor %xmm2,%xmm4,%xmm4 - vmovdqu 96-32(%r9),%xmm2 - vaesenc %xmm15,%xmm14,%xmm14 - - vmovups 80-128(%rcx),%xmm15 - vpxor %xmm3,%xmm6,%xmm6 - vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3 - vaesenc %xmm15,%xmm9,%xmm9 - vpxor %xmm5,%xmm6,%xmm6 - vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5 - vaesenc %xmm15,%xmm10,%xmm10 - movbeq 56(%r14),%r13 - vpxor %xmm1,%xmm7,%xmm7 - vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1 - vpxor 112+8(%rsp),%xmm8,%xmm8 - vaesenc %xmm15,%xmm11,%xmm11 - movbeq 48(%r14),%r12 - vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2 - vaesenc %xmm15,%xmm12,%xmm12 - movq %r13,64+8(%rsp) - vaesenc %xmm15,%xmm13,%xmm13 - movq %r12,72+8(%rsp) - vpxor %xmm3,%xmm4,%xmm4 - vmovdqu 112-32(%r9),%xmm3 - vaesenc %xmm15,%xmm14,%xmm14 - - vmovups 96-128(%rcx),%xmm15 - vpxor %xmm5,%xmm6,%xmm6 - vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5 - vaesenc %xmm15,%xmm9,%xmm9 - vpxor 
%xmm1,%xmm6,%xmm6 - vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1 - vaesenc %xmm15,%xmm10,%xmm10 - movbeq 40(%r14),%r13 - vpxor %xmm2,%xmm7,%xmm7 - vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2 - vaesenc %xmm15,%xmm11,%xmm11 - movbeq 32(%r14),%r12 - vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8 - vaesenc %xmm15,%xmm12,%xmm12 - movq %r13,80+8(%rsp) - vaesenc %xmm15,%xmm13,%xmm13 - movq %r12,88+8(%rsp) - vpxor %xmm5,%xmm6,%xmm6 - vaesenc %xmm15,%xmm14,%xmm14 - vpxor %xmm1,%xmm6,%xmm6 - - vmovups 112-128(%rcx),%xmm15 - vpslldq $8,%xmm6,%xmm5 - vpxor %xmm2,%xmm4,%xmm4 - vmovdqu 16(%r11),%xmm3 - - vaesenc %xmm15,%xmm9,%xmm9 - vpxor %xmm8,%xmm7,%xmm7 - vaesenc %xmm15,%xmm10,%xmm10 - vpxor %xmm5,%xmm4,%xmm4 - movbeq 24(%r14),%r13 - vaesenc %xmm15,%xmm11,%xmm11 - movbeq 16(%r14),%r12 - vpalignr $8,%xmm4,%xmm4,%xmm0 - vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 - movq %r13,96+8(%rsp) - vaesenc %xmm15,%xmm12,%xmm12 - movq %r12,104+8(%rsp) - vaesenc %xmm15,%xmm13,%xmm13 - vmovups 128-128(%rcx),%xmm1 - vaesenc %xmm15,%xmm14,%xmm14 - - vaesenc %xmm1,%xmm9,%xmm9 - vmovups 144-128(%rcx),%xmm15 - vaesenc %xmm1,%xmm10,%xmm10 - vpsrldq $8,%xmm6,%xmm6 - vaesenc %xmm1,%xmm11,%xmm11 - vpxor %xmm6,%xmm7,%xmm7 - vaesenc %xmm1,%xmm12,%xmm12 - vpxor %xmm0,%xmm4,%xmm4 - movbeq 8(%r14),%r13 - vaesenc %xmm1,%xmm13,%xmm13 - movbeq 0(%r14),%r12 - vaesenc %xmm1,%xmm14,%xmm14 - vmovups 160-128(%rcx),%xmm1 - cmpl $11,%ebp - jb L$enc_tail - - vaesenc %xmm15,%xmm9,%xmm9 - vaesenc %xmm15,%xmm10,%xmm10 - vaesenc %xmm15,%xmm11,%xmm11 - vaesenc %xmm15,%xmm12,%xmm12 - vaesenc %xmm15,%xmm13,%xmm13 - vaesenc %xmm15,%xmm14,%xmm14 - - vaesenc %xmm1,%xmm9,%xmm9 - vaesenc %xmm1,%xmm10,%xmm10 - vaesenc %xmm1,%xmm11,%xmm11 - vaesenc %xmm1,%xmm12,%xmm12 - vaesenc %xmm1,%xmm13,%xmm13 - vmovups 176-128(%rcx),%xmm15 - vaesenc %xmm1,%xmm14,%xmm14 - vmovups 192-128(%rcx),%xmm1 - je L$enc_tail - - vaesenc %xmm15,%xmm9,%xmm9 - vaesenc %xmm15,%xmm10,%xmm10 - vaesenc %xmm15,%xmm11,%xmm11 - vaesenc %xmm15,%xmm12,%xmm12 - vaesenc %xmm15,%xmm13,%xmm13 - vaesenc 
%xmm15,%xmm14,%xmm14 - - vaesenc %xmm1,%xmm9,%xmm9 - vaesenc %xmm1,%xmm10,%xmm10 - vaesenc %xmm1,%xmm11,%xmm11 - vaesenc %xmm1,%xmm12,%xmm12 - vaesenc %xmm1,%xmm13,%xmm13 - vmovups 208-128(%rcx),%xmm15 - vaesenc %xmm1,%xmm14,%xmm14 - vmovups 224-128(%rcx),%xmm1 - jmp L$enc_tail - -.p2align 5 -L$handle_ctr32: - vmovdqu (%r11),%xmm0 - vpshufb %xmm0,%xmm1,%xmm6 - vmovdqu 48(%r11),%xmm5 - vpaddd 64(%r11),%xmm6,%xmm10 - vpaddd %xmm5,%xmm6,%xmm11 - vmovdqu 0-32(%r9),%xmm3 - vpaddd %xmm5,%xmm10,%xmm12 - vpshufb %xmm0,%xmm10,%xmm10 - vpaddd %xmm5,%xmm11,%xmm13 - vpshufb %xmm0,%xmm11,%xmm11 - vpxor %xmm15,%xmm10,%xmm10 - vpaddd %xmm5,%xmm12,%xmm14 - vpshufb %xmm0,%xmm12,%xmm12 - vpxor %xmm15,%xmm11,%xmm11 - vpaddd %xmm5,%xmm13,%xmm1 - vpshufb %xmm0,%xmm13,%xmm13 - vpshufb %xmm0,%xmm14,%xmm14 - vpshufb %xmm0,%xmm1,%xmm1 - jmp L$resume_ctr32 - -.p2align 5 -L$enc_tail: - vaesenc %xmm15,%xmm9,%xmm9 - vmovdqu %xmm7,16+8(%rsp) - vpalignr $8,%xmm4,%xmm4,%xmm8 - vaesenc %xmm15,%xmm10,%xmm10 - vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 - vpxor 0(%rdi),%xmm1,%xmm2 - vaesenc %xmm15,%xmm11,%xmm11 - vpxor 16(%rdi),%xmm1,%xmm0 - vaesenc %xmm15,%xmm12,%xmm12 - vpxor 32(%rdi),%xmm1,%xmm5 - vaesenc %xmm15,%xmm13,%xmm13 - vpxor 48(%rdi),%xmm1,%xmm6 - vaesenc %xmm15,%xmm14,%xmm14 - vpxor 64(%rdi),%xmm1,%xmm7 - vpxor 80(%rdi),%xmm1,%xmm3 - vmovdqu (%r8),%xmm1 - - vaesenclast %xmm2,%xmm9,%xmm9 - vmovdqu 32(%r11),%xmm2 - vaesenclast %xmm0,%xmm10,%xmm10 - vpaddb %xmm2,%xmm1,%xmm0 - movq %r13,112+8(%rsp) - leaq 96(%rdi),%rdi - vaesenclast %xmm5,%xmm11,%xmm11 - vpaddb %xmm2,%xmm0,%xmm5 - movq %r12,120+8(%rsp) - leaq 96(%rsi),%rsi - vmovdqu 0-128(%rcx),%xmm15 - vaesenclast %xmm6,%xmm12,%xmm12 - vpaddb %xmm2,%xmm5,%xmm6 - vaesenclast %xmm7,%xmm13,%xmm13 - vpaddb %xmm2,%xmm6,%xmm7 - vaesenclast %xmm3,%xmm14,%xmm14 - vpaddb %xmm2,%xmm7,%xmm3 - - addq $0x60,%r10 - subq $0x6,%rdx - jc L$6x_done - - vmovups %xmm9,-96(%rsi) - vpxor %xmm15,%xmm1,%xmm9 - vmovups %xmm10,-80(%rsi) - vmovdqa %xmm0,%xmm10 - vmovups 
%xmm11,-64(%rsi) - vmovdqa %xmm5,%xmm11 - vmovups %xmm12,-48(%rsi) - vmovdqa %xmm6,%xmm12 - vmovups %xmm13,-32(%rsi) - vmovdqa %xmm7,%xmm13 - vmovups %xmm14,-16(%rsi) - vmovdqa %xmm3,%xmm14 - vmovdqu 32+8(%rsp),%xmm7 - jmp L$oop6x - -L$6x_done: - vpxor 16+8(%rsp),%xmm8,%xmm8 - vpxor %xmm4,%xmm8,%xmm8 - - .byte 0xf3,0xc3 - - -.globl _aesni_gcm_decrypt -.private_extern _aesni_gcm_decrypt - -.p2align 5 -_aesni_gcm_decrypt: - - xorq %r10,%r10 - - - - cmpq $0x60,%rdx - jb L$gcm_dec_abort - - leaq (%rsp),%rax - - pushq %rbx - - pushq %rbp - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - vzeroupper - - vmovdqu (%r8),%xmm1 - addq $-128,%rsp - movl 12(%r8),%ebx - leaq L$bswap_mask(%rip),%r11 - leaq -128(%rcx),%r14 - movq $0xf80,%r15 - vmovdqu (%r9),%xmm8 - andq $-128,%rsp - vmovdqu (%r11),%xmm0 - leaq 128(%rcx),%rcx - leaq 32+32(%r9),%r9 - movl 240-128(%rcx),%ebp - vpshufb %xmm0,%xmm8,%xmm8 - - andq %r15,%r14 - andq %rsp,%r15 - subq %r14,%r15 - jc L$dec_no_key_aliasing - cmpq $768,%r15 - jnc L$dec_no_key_aliasing - subq %r15,%rsp -L$dec_no_key_aliasing: - - vmovdqu 80(%rdi),%xmm7 - leaq (%rdi),%r14 - vmovdqu 64(%rdi),%xmm4 - - - - - - - - leaq -192(%rdi,%rdx,1),%r15 - - vmovdqu 48(%rdi),%xmm5 - shrq $4,%rdx - xorq %r10,%r10 - vmovdqu 32(%rdi),%xmm6 - vpshufb %xmm0,%xmm7,%xmm7 - vmovdqu 16(%rdi),%xmm2 - vpshufb %xmm0,%xmm4,%xmm4 - vmovdqu (%rdi),%xmm3 - vpshufb %xmm0,%xmm5,%xmm5 - vmovdqu %xmm4,48(%rsp) - vpshufb %xmm0,%xmm6,%xmm6 - vmovdqu %xmm5,64(%rsp) - vpshufb %xmm0,%xmm2,%xmm2 - vmovdqu %xmm6,80(%rsp) - vpshufb %xmm0,%xmm3,%xmm3 - vmovdqu %xmm2,96(%rsp) - vmovdqu %xmm3,112(%rsp) - - call _aesni_ctr32_ghash_6x - - vmovups %xmm9,-96(%rsi) - vmovups %xmm10,-80(%rsi) - vmovups %xmm11,-64(%rsi) - vmovups %xmm12,-48(%rsi) - vmovups %xmm13,-32(%rsi) - vmovups %xmm14,-16(%rsi) - - vpshufb (%r11),%xmm8,%xmm8 - vmovdqu %xmm8,-64(%r9) - - vzeroupper - movq -48(%rax),%r15 - - movq -40(%rax),%r14 - - movq -32(%rax),%r13 - - movq -24(%rax),%r12 - - movq -16(%rax),%rbp 
- - movq -8(%rax),%rbx - - leaq (%rax),%rsp - -L$gcm_dec_abort: - movq %r10,%rax - .byte 0xf3,0xc3 - - - -.p2align 5 -_aesni_ctr32_6x: - - vmovdqu 0-128(%rcx),%xmm4 - vmovdqu 32(%r11),%xmm2 - leaq -1(%rbp),%r13 - vmovups 16-128(%rcx),%xmm15 - leaq 32-128(%rcx),%r12 - vpxor %xmm4,%xmm1,%xmm9 - addl $100663296,%ebx - jc L$handle_ctr32_2 - vpaddb %xmm2,%xmm1,%xmm10 - vpaddb %xmm2,%xmm10,%xmm11 - vpxor %xmm4,%xmm10,%xmm10 - vpaddb %xmm2,%xmm11,%xmm12 - vpxor %xmm4,%xmm11,%xmm11 - vpaddb %xmm2,%xmm12,%xmm13 - vpxor %xmm4,%xmm12,%xmm12 - vpaddb %xmm2,%xmm13,%xmm14 - vpxor %xmm4,%xmm13,%xmm13 - vpaddb %xmm2,%xmm14,%xmm1 - vpxor %xmm4,%xmm14,%xmm14 - jmp L$oop_ctr32 - -.p2align 4 -L$oop_ctr32: - vaesenc %xmm15,%xmm9,%xmm9 - vaesenc %xmm15,%xmm10,%xmm10 - vaesenc %xmm15,%xmm11,%xmm11 - vaesenc %xmm15,%xmm12,%xmm12 - vaesenc %xmm15,%xmm13,%xmm13 - vaesenc %xmm15,%xmm14,%xmm14 - vmovups (%r12),%xmm15 - leaq 16(%r12),%r12 - decl %r13d - jnz L$oop_ctr32 - - vmovdqu (%r12),%xmm3 - vaesenc %xmm15,%xmm9,%xmm9 - vpxor 0(%rdi),%xmm3,%xmm4 - vaesenc %xmm15,%xmm10,%xmm10 - vpxor 16(%rdi),%xmm3,%xmm5 - vaesenc %xmm15,%xmm11,%xmm11 - vpxor 32(%rdi),%xmm3,%xmm6 - vaesenc %xmm15,%xmm12,%xmm12 - vpxor 48(%rdi),%xmm3,%xmm8 - vaesenc %xmm15,%xmm13,%xmm13 - vpxor 64(%rdi),%xmm3,%xmm2 - vaesenc %xmm15,%xmm14,%xmm14 - vpxor 80(%rdi),%xmm3,%xmm3 - leaq 96(%rdi),%rdi - - vaesenclast %xmm4,%xmm9,%xmm9 - vaesenclast %xmm5,%xmm10,%xmm10 - vaesenclast %xmm6,%xmm11,%xmm11 - vaesenclast %xmm8,%xmm12,%xmm12 - vaesenclast %xmm2,%xmm13,%xmm13 - vaesenclast %xmm3,%xmm14,%xmm14 - vmovups %xmm9,0(%rsi) - vmovups %xmm10,16(%rsi) - vmovups %xmm11,32(%rsi) - vmovups %xmm12,48(%rsi) - vmovups %xmm13,64(%rsi) - vmovups %xmm14,80(%rsi) - leaq 96(%rsi),%rsi - - .byte 0xf3,0xc3 -.p2align 5 -L$handle_ctr32_2: - vpshufb %xmm0,%xmm1,%xmm6 - vmovdqu 48(%r11),%xmm5 - vpaddd 64(%r11),%xmm6,%xmm10 - vpaddd %xmm5,%xmm6,%xmm11 - vpaddd %xmm5,%xmm10,%xmm12 - vpshufb %xmm0,%xmm10,%xmm10 - vpaddd %xmm5,%xmm11,%xmm13 - vpshufb 
%xmm0,%xmm11,%xmm11 - vpxor %xmm4,%xmm10,%xmm10 - vpaddd %xmm5,%xmm12,%xmm14 - vpshufb %xmm0,%xmm12,%xmm12 - vpxor %xmm4,%xmm11,%xmm11 - vpaddd %xmm5,%xmm13,%xmm1 - vpshufb %xmm0,%xmm13,%xmm13 - vpxor %xmm4,%xmm12,%xmm12 - vpshufb %xmm0,%xmm14,%xmm14 - vpxor %xmm4,%xmm13,%xmm13 - vpshufb %xmm0,%xmm1,%xmm1 - vpxor %xmm4,%xmm14,%xmm14 - jmp L$oop_ctr32 - - - -.globl _aesni_gcm_encrypt -.private_extern _aesni_gcm_encrypt - -.p2align 5 -_aesni_gcm_encrypt: - -#ifdef BORINGSSL_DISPATCH_TEST - - movb $1,_BORINGSSL_function_hit+2(%rip) -#endif - xorq %r10,%r10 - - - - - cmpq $288,%rdx - jb L$gcm_enc_abort - - leaq (%rsp),%rax - - pushq %rbx - - pushq %rbp - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - vzeroupper - - vmovdqu (%r8),%xmm1 - addq $-128,%rsp - movl 12(%r8),%ebx - leaq L$bswap_mask(%rip),%r11 - leaq -128(%rcx),%r14 - movq $0xf80,%r15 - leaq 128(%rcx),%rcx - vmovdqu (%r11),%xmm0 - andq $-128,%rsp - movl 240-128(%rcx),%ebp - - andq %r15,%r14 - andq %rsp,%r15 - subq %r14,%r15 - jc L$enc_no_key_aliasing - cmpq $768,%r15 - jnc L$enc_no_key_aliasing - subq %r15,%rsp -L$enc_no_key_aliasing: - - leaq (%rsi),%r14 - - - - - - - - - leaq -192(%rsi,%rdx,1),%r15 - - shrq $4,%rdx - - call _aesni_ctr32_6x - vpshufb %xmm0,%xmm9,%xmm8 - vpshufb %xmm0,%xmm10,%xmm2 - vmovdqu %xmm8,112(%rsp) - vpshufb %xmm0,%xmm11,%xmm4 - vmovdqu %xmm2,96(%rsp) - vpshufb %xmm0,%xmm12,%xmm5 - vmovdqu %xmm4,80(%rsp) - vpshufb %xmm0,%xmm13,%xmm6 - vmovdqu %xmm5,64(%rsp) - vpshufb %xmm0,%xmm14,%xmm7 - vmovdqu %xmm6,48(%rsp) - - call _aesni_ctr32_6x - - vmovdqu (%r9),%xmm8 - leaq 32+32(%r9),%r9 - subq $12,%rdx - movq $192,%r10 - vpshufb %xmm0,%xmm8,%xmm8 - - call _aesni_ctr32_ghash_6x - vmovdqu 32(%rsp),%xmm7 - vmovdqu (%r11),%xmm0 - vmovdqu 0-32(%r9),%xmm3 - vpunpckhqdq %xmm7,%xmm7,%xmm1 - vmovdqu 32-32(%r9),%xmm15 - vmovups %xmm9,-96(%rsi) - vpshufb %xmm0,%xmm9,%xmm9 - vpxor %xmm7,%xmm1,%xmm1 - vmovups %xmm10,-80(%rsi) - vpshufb %xmm0,%xmm10,%xmm10 - vmovups %xmm11,-64(%rsi) - 
vpshufb %xmm0,%xmm11,%xmm11 - vmovups %xmm12,-48(%rsi) - vpshufb %xmm0,%xmm12,%xmm12 - vmovups %xmm13,-32(%rsi) - vpshufb %xmm0,%xmm13,%xmm13 - vmovups %xmm14,-16(%rsi) - vpshufb %xmm0,%xmm14,%xmm14 - vmovdqu %xmm9,16(%rsp) - vmovdqu 48(%rsp),%xmm6 - vmovdqu 16-32(%r9),%xmm0 - vpunpckhqdq %xmm6,%xmm6,%xmm2 - vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5 - vpxor %xmm6,%xmm2,%xmm2 - vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 - vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 - - vmovdqu 64(%rsp),%xmm9 - vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4 - vmovdqu 48-32(%r9),%xmm3 - vpxor %xmm5,%xmm4,%xmm4 - vpunpckhqdq %xmm9,%xmm9,%xmm5 - vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6 - vpxor %xmm9,%xmm5,%xmm5 - vpxor %xmm7,%xmm6,%xmm6 - vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 - vmovdqu 80-32(%r9),%xmm15 - vpxor %xmm1,%xmm2,%xmm2 - - vmovdqu 80(%rsp),%xmm1 - vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7 - vmovdqu 64-32(%r9),%xmm0 - vpxor %xmm4,%xmm7,%xmm7 - vpunpckhqdq %xmm1,%xmm1,%xmm4 - vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9 - vpxor %xmm1,%xmm4,%xmm4 - vpxor %xmm6,%xmm9,%xmm9 - vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5 - vpxor %xmm2,%xmm5,%xmm5 - - vmovdqu 96(%rsp),%xmm2 - vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6 - vmovdqu 96-32(%r9),%xmm3 - vpxor %xmm7,%xmm6,%xmm6 - vpunpckhqdq %xmm2,%xmm2,%xmm7 - vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpxor %xmm9,%xmm1,%xmm1 - vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4 - vmovdqu 128-32(%r9),%xmm15 - vpxor %xmm5,%xmm4,%xmm4 - - vpxor 112(%rsp),%xmm8,%xmm8 - vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5 - vmovdqu 112-32(%r9),%xmm0 - vpunpckhqdq %xmm8,%xmm8,%xmm9 - vpxor %xmm6,%xmm5,%xmm5 - vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2 - vpxor %xmm8,%xmm9,%xmm9 - vpxor %xmm1,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7 - vpxor %xmm4,%xmm7,%xmm4 - - vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6 - vmovdqu 0-32(%r9),%xmm3 - vpunpckhqdq %xmm14,%xmm14,%xmm1 - vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8 - vpxor %xmm14,%xmm1,%xmm1 - vpxor %xmm5,%xmm6,%xmm5 - vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9 - vmovdqu 32-32(%r9),%xmm15 - vpxor %xmm2,%xmm8,%xmm7 - 
vpxor %xmm4,%xmm9,%xmm6 - - vmovdqu 16-32(%r9),%xmm0 - vpxor %xmm5,%xmm7,%xmm9 - vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4 - vpxor %xmm9,%xmm6,%xmm6 - vpunpckhqdq %xmm13,%xmm13,%xmm2 - vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14 - vpxor %xmm13,%xmm2,%xmm2 - vpslldq $8,%xmm6,%xmm9 - vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 - vpxor %xmm9,%xmm5,%xmm8 - vpsrldq $8,%xmm6,%xmm6 - vpxor %xmm6,%xmm7,%xmm7 - - vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5 - vmovdqu 48-32(%r9),%xmm3 - vpxor %xmm4,%xmm5,%xmm5 - vpunpckhqdq %xmm12,%xmm12,%xmm9 - vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13 - vpxor %xmm12,%xmm9,%xmm9 - vpxor %xmm14,%xmm13,%xmm13 - vpalignr $8,%xmm8,%xmm8,%xmm14 - vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 - vmovdqu 80-32(%r9),%xmm15 - vpxor %xmm1,%xmm2,%xmm2 - - vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4 - vmovdqu 64-32(%r9),%xmm0 - vpxor %xmm5,%xmm4,%xmm4 - vpunpckhqdq %xmm11,%xmm11,%xmm1 - vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12 - vpxor %xmm11,%xmm1,%xmm1 - vpxor %xmm13,%xmm12,%xmm12 - vxorps 16(%rsp),%xmm7,%xmm7 - vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9 - vpxor %xmm2,%xmm9,%xmm9 - - vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 - vxorps %xmm14,%xmm8,%xmm8 - - vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5 - vmovdqu 96-32(%r9),%xmm3 - vpxor %xmm4,%xmm5,%xmm5 - vpunpckhqdq %xmm10,%xmm10,%xmm2 - vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11 - vpxor %xmm10,%xmm2,%xmm2 - vpalignr $8,%xmm8,%xmm8,%xmm14 - vpxor %xmm12,%xmm11,%xmm11 - vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1 - vmovdqu 128-32(%r9),%xmm15 - vpxor %xmm9,%xmm1,%xmm1 - - vxorps %xmm7,%xmm14,%xmm14 - vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 - vxorps %xmm14,%xmm8,%xmm8 - - vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4 - vmovdqu 112-32(%r9),%xmm0 - vpxor %xmm5,%xmm4,%xmm4 - vpunpckhqdq %xmm8,%xmm8,%xmm9 - vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10 - vpxor %xmm8,%xmm9,%xmm9 - vpxor %xmm11,%xmm10,%xmm10 - vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2 - vpxor %xmm1,%xmm2,%xmm2 - - vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5 - vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7 - vpxor %xmm4,%xmm5,%xmm5 - vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6 - vpxor 
%xmm10,%xmm7,%xmm7 - vpxor %xmm2,%xmm6,%xmm6 - - vpxor %xmm5,%xmm7,%xmm4 - vpxor %xmm4,%xmm6,%xmm6 - vpslldq $8,%xmm6,%xmm1 - vmovdqu 16(%r11),%xmm3 - vpsrldq $8,%xmm6,%xmm6 - vpxor %xmm1,%xmm5,%xmm8 - vpxor %xmm6,%xmm7,%xmm7 - - vpalignr $8,%xmm8,%xmm8,%xmm2 - vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 - vpxor %xmm2,%xmm8,%xmm8 - - vpalignr $8,%xmm8,%xmm8,%xmm2 - vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 - vpxor %xmm7,%xmm2,%xmm2 - vpxor %xmm2,%xmm8,%xmm8 - vpshufb (%r11),%xmm8,%xmm8 - vmovdqu %xmm8,-64(%r9) - - vzeroupper - movq -48(%rax),%r15 - - movq -40(%rax),%r14 - - movq -32(%rax),%r13 - - movq -24(%rax),%r12 - - movq -16(%rax),%rbp - - movq -8(%rax),%rbx - - leaq (%rax),%rsp - -L$gcm_enc_abort: - movq %r10,%rax - .byte 0xf3,0xc3 - - -.p2align 6 -L$bswap_mask: -.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 -L$poly: -.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 -L$one_msb: -.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 -L$two_lsb: -.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -L$one_lsb: -.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.p2align 6 -#endif diff --git a/third_party/boringssl/apple-x86_64/crypto/fipsmodule/aesni-x86_64.S b/third_party/boringssl/apple-x86_64/crypto/fipsmodule/aesni-x86_64.S deleted file mode 100644 index 7633880e..00000000 --- a/third_party/boringssl/apple-x86_64/crypto/fipsmodule/aesni-x86_64.S +++ /dev/null @@ -1,2503 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text - -.globl _aes_hw_encrypt -.private_extern _aes_hw_encrypt - -.p2align 4 -_aes_hw_encrypt: - -#ifdef BORINGSSL_DISPATCH_TEST - - movb $1,_BORINGSSL_function_hit+1(%rip) -#endif - movups (%rdi),%xmm2 - movl 240(%rdx),%eax - movups (%rdx),%xmm0 - movups 16(%rdx),%xmm1 - leaq 32(%rdx),%rdx - xorps %xmm0,%xmm2 -L$oop_enc1_1: -.byte 102,15,56,220,209 - decl %eax - movups (%rdx),%xmm1 - leaq 16(%rdx),%rdx - jnz L$oop_enc1_1 -.byte 102,15,56,221,209 - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - movups %xmm2,(%rsi) - pxor %xmm2,%xmm2 - .byte 0xf3,0xc3 - - - -.globl _aes_hw_decrypt -.private_extern _aes_hw_decrypt - -.p2align 4 -_aes_hw_decrypt: - - movups (%rdi),%xmm2 - movl 240(%rdx),%eax - movups (%rdx),%xmm0 - movups 16(%rdx),%xmm1 - leaq 32(%rdx),%rdx - xorps %xmm0,%xmm2 -L$oop_dec1_2: -.byte 102,15,56,222,209 - decl %eax - movups (%rdx),%xmm1 - leaq 16(%rdx),%rdx - jnz L$oop_dec1_2 -.byte 102,15,56,223,209 - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - movups %xmm2,(%rsi) - pxor %xmm2,%xmm2 - .byte 0xf3,0xc3 - - - -.p2align 4 -_aesni_encrypt2: - - movups (%rcx),%xmm0 - shll $4,%eax - movups 16(%rcx),%xmm1 - xorps %xmm0,%xmm2 - xorps %xmm0,%xmm3 - movups 32(%rcx),%xmm0 - leaq 32(%rcx,%rax,1),%rcx - negq %rax - addq $16,%rax - -L$enc_loop2: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 - movups -16(%rcx,%rax,1),%xmm0 - jnz L$enc_loop2 - -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,221,208 -.byte 102,15,56,221,216 - .byte 0xf3,0xc3 - - - -.p2align 4 -_aesni_decrypt2: - - movups (%rcx),%xmm0 - shll $4,%eax - movups 16(%rcx),%xmm1 - xorps %xmm0,%xmm2 - xorps %xmm0,%xmm3 - movups 32(%rcx),%xmm0 - leaq 32(%rcx,%rax,1),%rcx - 
negq %rax - addq $16,%rax - -L$dec_loop2: -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 - movups -16(%rcx,%rax,1),%xmm0 - jnz L$dec_loop2 - -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,223,208 -.byte 102,15,56,223,216 - .byte 0xf3,0xc3 - - - -.p2align 4 -_aesni_encrypt3: - - movups (%rcx),%xmm0 - shll $4,%eax - movups 16(%rcx),%xmm1 - xorps %xmm0,%xmm2 - xorps %xmm0,%xmm3 - xorps %xmm0,%xmm4 - movups 32(%rcx),%xmm0 - leaq 32(%rcx,%rax,1),%rcx - negq %rax - addq $16,%rax - -L$enc_loop3: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 - movups -16(%rcx,%rax,1),%xmm0 - jnz L$enc_loop3 - -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,221,208 -.byte 102,15,56,221,216 -.byte 102,15,56,221,224 - .byte 0xf3,0xc3 - - - -.p2align 4 -_aesni_decrypt3: - - movups (%rcx),%xmm0 - shll $4,%eax - movups 16(%rcx),%xmm1 - xorps %xmm0,%xmm2 - xorps %xmm0,%xmm3 - xorps %xmm0,%xmm4 - movups 32(%rcx),%xmm0 - leaq 32(%rcx,%rax,1),%rcx - negq %rax - addq $16,%rax - -L$dec_loop3: -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 - movups -16(%rcx,%rax,1),%xmm0 - jnz L$dec_loop3 - -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,223,208 -.byte 102,15,56,223,216 -.byte 102,15,56,223,224 - .byte 0xf3,0xc3 - - - -.p2align 4 -_aesni_encrypt4: - - movups (%rcx),%xmm0 - shll $4,%eax - movups 16(%rcx),%xmm1 - xorps %xmm0,%xmm2 - xorps %xmm0,%xmm3 - xorps %xmm0,%xmm4 - xorps %xmm0,%xmm5 - movups 32(%rcx),%xmm0 - leaq 32(%rcx,%rax,1),%rcx - negq %rax -.byte 0x0f,0x1f,0x00 - addq $16,%rax - -L$enc_loop4: 
-.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 - movups -16(%rcx,%rax,1),%xmm0 - jnz L$enc_loop4 - -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,221,208 -.byte 102,15,56,221,216 -.byte 102,15,56,221,224 -.byte 102,15,56,221,232 - .byte 0xf3,0xc3 - - - -.p2align 4 -_aesni_decrypt4: - - movups (%rcx),%xmm0 - shll $4,%eax - movups 16(%rcx),%xmm1 - xorps %xmm0,%xmm2 - xorps %xmm0,%xmm3 - xorps %xmm0,%xmm4 - xorps %xmm0,%xmm5 - movups 32(%rcx),%xmm0 - leaq 32(%rcx,%rax,1),%rcx - negq %rax -.byte 0x0f,0x1f,0x00 - addq $16,%rax - -L$dec_loop4: -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 - movups -16(%rcx,%rax,1),%xmm0 - jnz L$dec_loop4 - -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,223,208 -.byte 102,15,56,223,216 -.byte 102,15,56,223,224 -.byte 102,15,56,223,232 - .byte 0xf3,0xc3 - - - -.p2align 4 -_aesni_encrypt6: - - movups (%rcx),%xmm0 - shll $4,%eax - movups 16(%rcx),%xmm1 - xorps %xmm0,%xmm2 - pxor %xmm0,%xmm3 - pxor %xmm0,%xmm4 -.byte 102,15,56,220,209 - leaq 32(%rcx,%rax,1),%rcx - negq %rax -.byte 102,15,56,220,217 - pxor %xmm0,%xmm5 - pxor %xmm0,%xmm6 -.byte 102,15,56,220,225 - pxor %xmm0,%xmm7 - movups (%rcx,%rax,1),%xmm0 - addq $16,%rax - jmp L$enc_loop6_enter -.p2align 4 -L$enc_loop6: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -L$enc_loop6_enter: -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax -.byte 102,15,56,220,208 -.byte 
102,15,56,220,216 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 -.byte 102,15,56,220,240 -.byte 102,15,56,220,248 - movups -16(%rcx,%rax,1),%xmm0 - jnz L$enc_loop6 - -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 -.byte 102,15,56,221,208 -.byte 102,15,56,221,216 -.byte 102,15,56,221,224 -.byte 102,15,56,221,232 -.byte 102,15,56,221,240 -.byte 102,15,56,221,248 - .byte 0xf3,0xc3 - - - -.p2align 4 -_aesni_decrypt6: - - movups (%rcx),%xmm0 - shll $4,%eax - movups 16(%rcx),%xmm1 - xorps %xmm0,%xmm2 - pxor %xmm0,%xmm3 - pxor %xmm0,%xmm4 -.byte 102,15,56,222,209 - leaq 32(%rcx,%rax,1),%rcx - negq %rax -.byte 102,15,56,222,217 - pxor %xmm0,%xmm5 - pxor %xmm0,%xmm6 -.byte 102,15,56,222,225 - pxor %xmm0,%xmm7 - movups (%rcx,%rax,1),%xmm0 - addq $16,%rax - jmp L$dec_loop6_enter -.p2align 4 -L$dec_loop6: -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -L$dec_loop6_enter: -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 - movups -16(%rcx,%rax,1),%xmm0 - jnz L$dec_loop6 - -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 -.byte 102,15,56,223,208 -.byte 102,15,56,223,216 -.byte 102,15,56,223,224 -.byte 102,15,56,223,232 -.byte 102,15,56,223,240 -.byte 102,15,56,223,248 - .byte 0xf3,0xc3 - - - -.p2align 4 -_aesni_encrypt8: - - movups (%rcx),%xmm0 - shll $4,%eax - movups 16(%rcx),%xmm1 - xorps %xmm0,%xmm2 - xorps %xmm0,%xmm3 - pxor %xmm0,%xmm4 - pxor %xmm0,%xmm5 - pxor %xmm0,%xmm6 - leaq 32(%rcx,%rax,1),%rcx - negq %rax -.byte 102,15,56,220,209 - pxor %xmm0,%xmm7 - pxor %xmm0,%xmm8 -.byte 102,15,56,220,217 - pxor %xmm0,%xmm9 - movups 
(%rcx,%rax,1),%xmm0 - addq $16,%rax - jmp L$enc_loop8_inner -.p2align 4 -L$enc_loop8: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -L$enc_loop8_inner: -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 -.byte 102,68,15,56,220,193 -.byte 102,68,15,56,220,201 -L$enc_loop8_enter: - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 -.byte 102,15,56,220,240 -.byte 102,15,56,220,248 -.byte 102,68,15,56,220,192 -.byte 102,68,15,56,220,200 - movups -16(%rcx,%rax,1),%xmm0 - jnz L$enc_loop8 - -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 -.byte 102,68,15,56,220,193 -.byte 102,68,15,56,220,201 -.byte 102,15,56,221,208 -.byte 102,15,56,221,216 -.byte 102,15,56,221,224 -.byte 102,15,56,221,232 -.byte 102,15,56,221,240 -.byte 102,15,56,221,248 -.byte 102,68,15,56,221,192 -.byte 102,68,15,56,221,200 - .byte 0xf3,0xc3 - - - -.p2align 4 -_aesni_decrypt8: - - movups (%rcx),%xmm0 - shll $4,%eax - movups 16(%rcx),%xmm1 - xorps %xmm0,%xmm2 - xorps %xmm0,%xmm3 - pxor %xmm0,%xmm4 - pxor %xmm0,%xmm5 - pxor %xmm0,%xmm6 - leaq 32(%rcx,%rax,1),%rcx - negq %rax -.byte 102,15,56,222,209 - pxor %xmm0,%xmm7 - pxor %xmm0,%xmm8 -.byte 102,15,56,222,217 - pxor %xmm0,%xmm9 - movups (%rcx,%rax,1),%xmm0 - addq $16,%rax - jmp L$dec_loop8_inner -.p2align 4 -L$dec_loop8: -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -L$dec_loop8_inner: -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 -.byte 102,68,15,56,222,193 -.byte 102,68,15,56,222,201 -L$dec_loop8_enter: - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 -.byte 102,68,15,56,222,192 -.byte 102,68,15,56,222,200 
- movups -16(%rcx,%rax,1),%xmm0 - jnz L$dec_loop8 - -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 -.byte 102,68,15,56,222,193 -.byte 102,68,15,56,222,201 -.byte 102,15,56,223,208 -.byte 102,15,56,223,216 -.byte 102,15,56,223,224 -.byte 102,15,56,223,232 -.byte 102,15,56,223,240 -.byte 102,15,56,223,248 -.byte 102,68,15,56,223,192 -.byte 102,68,15,56,223,200 - .byte 0xf3,0xc3 - - -.globl _aes_hw_ecb_encrypt -.private_extern _aes_hw_ecb_encrypt - -.p2align 4 -_aes_hw_ecb_encrypt: - - andq $-16,%rdx - jz L$ecb_ret - - movl 240(%rcx),%eax - movups (%rcx),%xmm0 - movq %rcx,%r11 - movl %eax,%r10d - testl %r8d,%r8d - jz L$ecb_decrypt - - cmpq $0x80,%rdx - jb L$ecb_enc_tail - - movdqu (%rdi),%xmm2 - movdqu 16(%rdi),%xmm3 - movdqu 32(%rdi),%xmm4 - movdqu 48(%rdi),%xmm5 - movdqu 64(%rdi),%xmm6 - movdqu 80(%rdi),%xmm7 - movdqu 96(%rdi),%xmm8 - movdqu 112(%rdi),%xmm9 - leaq 128(%rdi),%rdi - subq $0x80,%rdx - jmp L$ecb_enc_loop8_enter -.p2align 4 -L$ecb_enc_loop8: - movups %xmm2,(%rsi) - movq %r11,%rcx - movdqu (%rdi),%xmm2 - movl %r10d,%eax - movups %xmm3,16(%rsi) - movdqu 16(%rdi),%xmm3 - movups %xmm4,32(%rsi) - movdqu 32(%rdi),%xmm4 - movups %xmm5,48(%rsi) - movdqu 48(%rdi),%xmm5 - movups %xmm6,64(%rsi) - movdqu 64(%rdi),%xmm6 - movups %xmm7,80(%rsi) - movdqu 80(%rdi),%xmm7 - movups %xmm8,96(%rsi) - movdqu 96(%rdi),%xmm8 - movups %xmm9,112(%rsi) - leaq 128(%rsi),%rsi - movdqu 112(%rdi),%xmm9 - leaq 128(%rdi),%rdi -L$ecb_enc_loop8_enter: - - call _aesni_encrypt8 - - subq $0x80,%rdx - jnc L$ecb_enc_loop8 - - movups %xmm2,(%rsi) - movq %r11,%rcx - movups %xmm3,16(%rsi) - movl %r10d,%eax - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - movups %xmm6,64(%rsi) - movups %xmm7,80(%rsi) - movups %xmm8,96(%rsi) - movups %xmm9,112(%rsi) - leaq 128(%rsi),%rsi - addq $0x80,%rdx - jz L$ecb_ret - -L$ecb_enc_tail: - movups (%rdi),%xmm2 - cmpq $0x20,%rdx - jb L$ecb_enc_one - movups 
16(%rdi),%xmm3 - je L$ecb_enc_two - movups 32(%rdi),%xmm4 - cmpq $0x40,%rdx - jb L$ecb_enc_three - movups 48(%rdi),%xmm5 - je L$ecb_enc_four - movups 64(%rdi),%xmm6 - cmpq $0x60,%rdx - jb L$ecb_enc_five - movups 80(%rdi),%xmm7 - je L$ecb_enc_six - movdqu 96(%rdi),%xmm8 - xorps %xmm9,%xmm9 - call _aesni_encrypt8 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - movups %xmm6,64(%rsi) - movups %xmm7,80(%rsi) - movups %xmm8,96(%rsi) - jmp L$ecb_ret -.p2align 4 -L$ecb_enc_one: - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -L$oop_enc1_3: -.byte 102,15,56,220,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz L$oop_enc1_3 -.byte 102,15,56,221,209 - movups %xmm2,(%rsi) - jmp L$ecb_ret -.p2align 4 -L$ecb_enc_two: - call _aesni_encrypt2 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - jmp L$ecb_ret -.p2align 4 -L$ecb_enc_three: - call _aesni_encrypt3 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - jmp L$ecb_ret -.p2align 4 -L$ecb_enc_four: - call _aesni_encrypt4 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - jmp L$ecb_ret -.p2align 4 -L$ecb_enc_five: - xorps %xmm7,%xmm7 - call _aesni_encrypt6 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - movups %xmm6,64(%rsi) - jmp L$ecb_ret -.p2align 4 -L$ecb_enc_six: - call _aesni_encrypt6 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - movups %xmm6,64(%rsi) - movups %xmm7,80(%rsi) - jmp L$ecb_ret - -.p2align 4 -L$ecb_decrypt: - cmpq $0x80,%rdx - jb L$ecb_dec_tail - - movdqu (%rdi),%xmm2 - movdqu 16(%rdi),%xmm3 - movdqu 32(%rdi),%xmm4 - movdqu 48(%rdi),%xmm5 - movdqu 64(%rdi),%xmm6 - movdqu 80(%rdi),%xmm7 - movdqu 96(%rdi),%xmm8 - movdqu 112(%rdi),%xmm9 - leaq 128(%rdi),%rdi - subq $0x80,%rdx - jmp L$ecb_dec_loop8_enter -.p2align 4 -L$ecb_dec_loop8: - movups %xmm2,(%rsi) - 
movq %r11,%rcx - movdqu (%rdi),%xmm2 - movl %r10d,%eax - movups %xmm3,16(%rsi) - movdqu 16(%rdi),%xmm3 - movups %xmm4,32(%rsi) - movdqu 32(%rdi),%xmm4 - movups %xmm5,48(%rsi) - movdqu 48(%rdi),%xmm5 - movups %xmm6,64(%rsi) - movdqu 64(%rdi),%xmm6 - movups %xmm7,80(%rsi) - movdqu 80(%rdi),%xmm7 - movups %xmm8,96(%rsi) - movdqu 96(%rdi),%xmm8 - movups %xmm9,112(%rsi) - leaq 128(%rsi),%rsi - movdqu 112(%rdi),%xmm9 - leaq 128(%rdi),%rdi -L$ecb_dec_loop8_enter: - - call _aesni_decrypt8 - - movups (%r11),%xmm0 - subq $0x80,%rdx - jnc L$ecb_dec_loop8 - - movups %xmm2,(%rsi) - pxor %xmm2,%xmm2 - movq %r11,%rcx - movups %xmm3,16(%rsi) - pxor %xmm3,%xmm3 - movl %r10d,%eax - movups %xmm4,32(%rsi) - pxor %xmm4,%xmm4 - movups %xmm5,48(%rsi) - pxor %xmm5,%xmm5 - movups %xmm6,64(%rsi) - pxor %xmm6,%xmm6 - movups %xmm7,80(%rsi) - pxor %xmm7,%xmm7 - movups %xmm8,96(%rsi) - pxor %xmm8,%xmm8 - movups %xmm9,112(%rsi) - pxor %xmm9,%xmm9 - leaq 128(%rsi),%rsi - addq $0x80,%rdx - jz L$ecb_ret - -L$ecb_dec_tail: - movups (%rdi),%xmm2 - cmpq $0x20,%rdx - jb L$ecb_dec_one - movups 16(%rdi),%xmm3 - je L$ecb_dec_two - movups 32(%rdi),%xmm4 - cmpq $0x40,%rdx - jb L$ecb_dec_three - movups 48(%rdi),%xmm5 - je L$ecb_dec_four - movups 64(%rdi),%xmm6 - cmpq $0x60,%rdx - jb L$ecb_dec_five - movups 80(%rdi),%xmm7 - je L$ecb_dec_six - movups 96(%rdi),%xmm8 - movups (%rcx),%xmm0 - xorps %xmm9,%xmm9 - call _aesni_decrypt8 - movups %xmm2,(%rsi) - pxor %xmm2,%xmm2 - movups %xmm3,16(%rsi) - pxor %xmm3,%xmm3 - movups %xmm4,32(%rsi) - pxor %xmm4,%xmm4 - movups %xmm5,48(%rsi) - pxor %xmm5,%xmm5 - movups %xmm6,64(%rsi) - pxor %xmm6,%xmm6 - movups %xmm7,80(%rsi) - pxor %xmm7,%xmm7 - movups %xmm8,96(%rsi) - pxor %xmm8,%xmm8 - pxor %xmm9,%xmm9 - jmp L$ecb_ret -.p2align 4 -L$ecb_dec_one: - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -L$oop_dec1_4: -.byte 102,15,56,222,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz L$oop_dec1_4 -.byte 102,15,56,223,209 - 
movups %xmm2,(%rsi) - pxor %xmm2,%xmm2 - jmp L$ecb_ret -.p2align 4 -L$ecb_dec_two: - call _aesni_decrypt2 - movups %xmm2,(%rsi) - pxor %xmm2,%xmm2 - movups %xmm3,16(%rsi) - pxor %xmm3,%xmm3 - jmp L$ecb_ret -.p2align 4 -L$ecb_dec_three: - call _aesni_decrypt3 - movups %xmm2,(%rsi) - pxor %xmm2,%xmm2 - movups %xmm3,16(%rsi) - pxor %xmm3,%xmm3 - movups %xmm4,32(%rsi) - pxor %xmm4,%xmm4 - jmp L$ecb_ret -.p2align 4 -L$ecb_dec_four: - call _aesni_decrypt4 - movups %xmm2,(%rsi) - pxor %xmm2,%xmm2 - movups %xmm3,16(%rsi) - pxor %xmm3,%xmm3 - movups %xmm4,32(%rsi) - pxor %xmm4,%xmm4 - movups %xmm5,48(%rsi) - pxor %xmm5,%xmm5 - jmp L$ecb_ret -.p2align 4 -L$ecb_dec_five: - xorps %xmm7,%xmm7 - call _aesni_decrypt6 - movups %xmm2,(%rsi) - pxor %xmm2,%xmm2 - movups %xmm3,16(%rsi) - pxor %xmm3,%xmm3 - movups %xmm4,32(%rsi) - pxor %xmm4,%xmm4 - movups %xmm5,48(%rsi) - pxor %xmm5,%xmm5 - movups %xmm6,64(%rsi) - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - jmp L$ecb_ret -.p2align 4 -L$ecb_dec_six: - call _aesni_decrypt6 - movups %xmm2,(%rsi) - pxor %xmm2,%xmm2 - movups %xmm3,16(%rsi) - pxor %xmm3,%xmm3 - movups %xmm4,32(%rsi) - pxor %xmm4,%xmm4 - movups %xmm5,48(%rsi) - pxor %xmm5,%xmm5 - movups %xmm6,64(%rsi) - pxor %xmm6,%xmm6 - movups %xmm7,80(%rsi) - pxor %xmm7,%xmm7 - -L$ecb_ret: - xorps %xmm0,%xmm0 - pxor %xmm1,%xmm1 - .byte 0xf3,0xc3 - - -.globl _aes_hw_ctr32_encrypt_blocks -.private_extern _aes_hw_ctr32_encrypt_blocks - -.p2align 4 -_aes_hw_ctr32_encrypt_blocks: - -#ifdef BORINGSSL_DISPATCH_TEST - movb $1,_BORINGSSL_function_hit(%rip) -#endif - cmpq $1,%rdx - jne L$ctr32_bulk - - - - movups (%r8),%xmm2 - movups (%rdi),%xmm3 - movl 240(%rcx),%edx - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -L$oop_enc1_5: -.byte 102,15,56,220,209 - decl %edx - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz L$oop_enc1_5 -.byte 102,15,56,221,209 - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - xorps %xmm3,%xmm2 - pxor %xmm3,%xmm3 - movups %xmm2,(%rsi) - xorps 
%xmm2,%xmm2 - jmp L$ctr32_epilogue - -.p2align 4 -L$ctr32_bulk: - leaq (%rsp),%r11 - - pushq %rbp - - subq $128,%rsp - andq $-16,%rsp - - - - - movdqu (%r8),%xmm2 - movdqu (%rcx),%xmm0 - movl 12(%r8),%r8d - pxor %xmm0,%xmm2 - movl 12(%rcx),%ebp - movdqa %xmm2,0(%rsp) - bswapl %r8d - movdqa %xmm2,%xmm3 - movdqa %xmm2,%xmm4 - movdqa %xmm2,%xmm5 - movdqa %xmm2,64(%rsp) - movdqa %xmm2,80(%rsp) - movdqa %xmm2,96(%rsp) - movq %rdx,%r10 - movdqa %xmm2,112(%rsp) - - leaq 1(%r8),%rax - leaq 2(%r8),%rdx - bswapl %eax - bswapl %edx - xorl %ebp,%eax - xorl %ebp,%edx -.byte 102,15,58,34,216,3 - leaq 3(%r8),%rax - movdqa %xmm3,16(%rsp) -.byte 102,15,58,34,226,3 - bswapl %eax - movq %r10,%rdx - leaq 4(%r8),%r10 - movdqa %xmm4,32(%rsp) - xorl %ebp,%eax - bswapl %r10d -.byte 102,15,58,34,232,3 - xorl %ebp,%r10d - movdqa %xmm5,48(%rsp) - leaq 5(%r8),%r9 - movl %r10d,64+12(%rsp) - bswapl %r9d - leaq 6(%r8),%r10 - movl 240(%rcx),%eax - xorl %ebp,%r9d - bswapl %r10d - movl %r9d,80+12(%rsp) - xorl %ebp,%r10d - leaq 7(%r8),%r9 - movl %r10d,96+12(%rsp) - bswapl %r9d - leaq _OPENSSL_ia32cap_P(%rip),%r10 - movl 4(%r10),%r10d - xorl %ebp,%r9d - andl $71303168,%r10d - movl %r9d,112+12(%rsp) - - movups 16(%rcx),%xmm1 - - movdqa 64(%rsp),%xmm6 - movdqa 80(%rsp),%xmm7 - - cmpq $8,%rdx - jb L$ctr32_tail - - subq $6,%rdx - cmpl $4194304,%r10d - je L$ctr32_6x - - leaq 128(%rcx),%rcx - subq $2,%rdx - jmp L$ctr32_loop8 - -.p2align 4 -L$ctr32_6x: - shll $4,%eax - movl $48,%r10d - bswapl %ebp - leaq 32(%rcx,%rax,1),%rcx - subq %rax,%r10 - jmp L$ctr32_loop6 - -.p2align 4 -L$ctr32_loop6: - addl $6,%r8d - movups -48(%rcx,%r10,1),%xmm0 -.byte 102,15,56,220,209 - movl %r8d,%eax - xorl %ebp,%eax -.byte 102,15,56,220,217 -.byte 0x0f,0x38,0xf1,0x44,0x24,12 - leal 1(%r8),%eax -.byte 102,15,56,220,225 - xorl %ebp,%eax -.byte 0x0f,0x38,0xf1,0x44,0x24,28 -.byte 102,15,56,220,233 - leal 2(%r8),%eax - xorl %ebp,%eax -.byte 102,15,56,220,241 -.byte 0x0f,0x38,0xf1,0x44,0x24,44 - leal 3(%r8),%eax -.byte 
102,15,56,220,249 - movups -32(%rcx,%r10,1),%xmm1 - xorl %ebp,%eax - -.byte 102,15,56,220,208 -.byte 0x0f,0x38,0xf1,0x44,0x24,60 - leal 4(%r8),%eax -.byte 102,15,56,220,216 - xorl %ebp,%eax -.byte 0x0f,0x38,0xf1,0x44,0x24,76 -.byte 102,15,56,220,224 - leal 5(%r8),%eax - xorl %ebp,%eax -.byte 102,15,56,220,232 -.byte 0x0f,0x38,0xf1,0x44,0x24,92 - movq %r10,%rax -.byte 102,15,56,220,240 -.byte 102,15,56,220,248 - movups -16(%rcx,%r10,1),%xmm0 - - call L$enc_loop6 - - movdqu (%rdi),%xmm8 - movdqu 16(%rdi),%xmm9 - movdqu 32(%rdi),%xmm10 - movdqu 48(%rdi),%xmm11 - movdqu 64(%rdi),%xmm12 - movdqu 80(%rdi),%xmm13 - leaq 96(%rdi),%rdi - movups -64(%rcx,%r10,1),%xmm1 - pxor %xmm2,%xmm8 - movaps 0(%rsp),%xmm2 - pxor %xmm3,%xmm9 - movaps 16(%rsp),%xmm3 - pxor %xmm4,%xmm10 - movaps 32(%rsp),%xmm4 - pxor %xmm5,%xmm11 - movaps 48(%rsp),%xmm5 - pxor %xmm6,%xmm12 - movaps 64(%rsp),%xmm6 - pxor %xmm7,%xmm13 - movaps 80(%rsp),%xmm7 - movdqu %xmm8,(%rsi) - movdqu %xmm9,16(%rsi) - movdqu %xmm10,32(%rsi) - movdqu %xmm11,48(%rsi) - movdqu %xmm12,64(%rsi) - movdqu %xmm13,80(%rsi) - leaq 96(%rsi),%rsi - - subq $6,%rdx - jnc L$ctr32_loop6 - - addq $6,%rdx - jz L$ctr32_done - - leal -48(%r10),%eax - leaq -80(%rcx,%r10,1),%rcx - negl %eax - shrl $4,%eax - jmp L$ctr32_tail - -.p2align 5 -L$ctr32_loop8: - addl $8,%r8d - movdqa 96(%rsp),%xmm8 -.byte 102,15,56,220,209 - movl %r8d,%r9d - movdqa 112(%rsp),%xmm9 -.byte 102,15,56,220,217 - bswapl %r9d - movups 32-128(%rcx),%xmm0 -.byte 102,15,56,220,225 - xorl %ebp,%r9d - nop -.byte 102,15,56,220,233 - movl %r9d,0+12(%rsp) - leaq 1(%r8),%r9 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 -.byte 102,68,15,56,220,193 -.byte 102,68,15,56,220,201 - movups 48-128(%rcx),%xmm1 - bswapl %r9d -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 - xorl %ebp,%r9d -.byte 0x66,0x90 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 - movl %r9d,16+12(%rsp) - leaq 2(%r8),%r9 -.byte 102,15,56,220,240 -.byte 102,15,56,220,248 -.byte 102,68,15,56,220,192 -.byte 
102,68,15,56,220,200 - movups 64-128(%rcx),%xmm0 - bswapl %r9d -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 - xorl %ebp,%r9d -.byte 0x66,0x90 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - movl %r9d,32+12(%rsp) - leaq 3(%r8),%r9 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 -.byte 102,68,15,56,220,193 -.byte 102,68,15,56,220,201 - movups 80-128(%rcx),%xmm1 - bswapl %r9d -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 - xorl %ebp,%r9d -.byte 0x66,0x90 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 - movl %r9d,48+12(%rsp) - leaq 4(%r8),%r9 -.byte 102,15,56,220,240 -.byte 102,15,56,220,248 -.byte 102,68,15,56,220,192 -.byte 102,68,15,56,220,200 - movups 96-128(%rcx),%xmm0 - bswapl %r9d -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 - xorl %ebp,%r9d -.byte 0x66,0x90 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - movl %r9d,64+12(%rsp) - leaq 5(%r8),%r9 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 -.byte 102,68,15,56,220,193 -.byte 102,68,15,56,220,201 - movups 112-128(%rcx),%xmm1 - bswapl %r9d -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 - xorl %ebp,%r9d -.byte 0x66,0x90 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 - movl %r9d,80+12(%rsp) - leaq 6(%r8),%r9 -.byte 102,15,56,220,240 -.byte 102,15,56,220,248 -.byte 102,68,15,56,220,192 -.byte 102,68,15,56,220,200 - movups 128-128(%rcx),%xmm0 - bswapl %r9d -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 - xorl %ebp,%r9d -.byte 0x66,0x90 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - movl %r9d,96+12(%rsp) - leaq 7(%r8),%r9 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 -.byte 102,68,15,56,220,193 -.byte 102,68,15,56,220,201 - movups 144-128(%rcx),%xmm1 - bswapl %r9d -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 - xorl %ebp,%r9d - movdqu 0(%rdi),%xmm10 -.byte 102,15,56,220,232 - movl %r9d,112+12(%rsp) - cmpl $11,%eax -.byte 102,15,56,220,240 -.byte 102,15,56,220,248 -.byte 102,68,15,56,220,192 -.byte 102,68,15,56,220,200 - movups 
160-128(%rcx),%xmm0 - - jb L$ctr32_enc_done - -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 -.byte 102,68,15,56,220,193 -.byte 102,68,15,56,220,201 - movups 176-128(%rcx),%xmm1 - -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 -.byte 102,15,56,220,240 -.byte 102,15,56,220,248 -.byte 102,68,15,56,220,192 -.byte 102,68,15,56,220,200 - movups 192-128(%rcx),%xmm0 - je L$ctr32_enc_done - -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 -.byte 102,68,15,56,220,193 -.byte 102,68,15,56,220,201 - movups 208-128(%rcx),%xmm1 - -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 -.byte 102,15,56,220,240 -.byte 102,15,56,220,248 -.byte 102,68,15,56,220,192 -.byte 102,68,15,56,220,200 - movups 224-128(%rcx),%xmm0 - jmp L$ctr32_enc_done - -.p2align 4 -L$ctr32_enc_done: - movdqu 16(%rdi),%xmm11 - pxor %xmm0,%xmm10 - movdqu 32(%rdi),%xmm12 - pxor %xmm0,%xmm11 - movdqu 48(%rdi),%xmm13 - pxor %xmm0,%xmm12 - movdqu 64(%rdi),%xmm14 - pxor %xmm0,%xmm13 - movdqu 80(%rdi),%xmm15 - pxor %xmm0,%xmm14 - pxor %xmm0,%xmm15 -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 -.byte 102,68,15,56,220,193 -.byte 102,68,15,56,220,201 - movdqu 96(%rdi),%xmm1 - leaq 128(%rdi),%rdi - -.byte 102,65,15,56,221,210 - pxor %xmm0,%xmm1 - movdqu 112-128(%rdi),%xmm10 -.byte 102,65,15,56,221,219 - pxor %xmm0,%xmm10 - movdqa 0(%rsp),%xmm11 -.byte 102,65,15,56,221,228 -.byte 102,65,15,56,221,237 - movdqa 16(%rsp),%xmm12 - movdqa 32(%rsp),%xmm13 -.byte 102,65,15,56,221,246 -.byte 102,65,15,56,221,255 - movdqa 48(%rsp),%xmm14 - movdqa 64(%rsp),%xmm15 -.byte 102,68,15,56,221,193 - movdqa 80(%rsp),%xmm0 - movups 
16-128(%rcx),%xmm1 -.byte 102,69,15,56,221,202 - - movups %xmm2,(%rsi) - movdqa %xmm11,%xmm2 - movups %xmm3,16(%rsi) - movdqa %xmm12,%xmm3 - movups %xmm4,32(%rsi) - movdqa %xmm13,%xmm4 - movups %xmm5,48(%rsi) - movdqa %xmm14,%xmm5 - movups %xmm6,64(%rsi) - movdqa %xmm15,%xmm6 - movups %xmm7,80(%rsi) - movdqa %xmm0,%xmm7 - movups %xmm8,96(%rsi) - movups %xmm9,112(%rsi) - leaq 128(%rsi),%rsi - - subq $8,%rdx - jnc L$ctr32_loop8 - - addq $8,%rdx - jz L$ctr32_done - leaq -128(%rcx),%rcx - -L$ctr32_tail: - - - leaq 16(%rcx),%rcx - cmpq $4,%rdx - jb L$ctr32_loop3 - je L$ctr32_loop4 - - - shll $4,%eax - movdqa 96(%rsp),%xmm8 - pxor %xmm9,%xmm9 - - movups 16(%rcx),%xmm0 -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 - leaq 32-16(%rcx,%rax,1),%rcx - negq %rax -.byte 102,15,56,220,225 - addq $16,%rax - movups (%rdi),%xmm10 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 - movups 16(%rdi),%xmm11 - movups 32(%rdi),%xmm12 -.byte 102,15,56,220,249 -.byte 102,68,15,56,220,193 - - call L$enc_loop8_enter - - movdqu 48(%rdi),%xmm13 - pxor %xmm10,%xmm2 - movdqu 64(%rdi),%xmm10 - pxor %xmm11,%xmm3 - movdqu %xmm2,(%rsi) - pxor %xmm12,%xmm4 - movdqu %xmm3,16(%rsi) - pxor %xmm13,%xmm5 - movdqu %xmm4,32(%rsi) - pxor %xmm10,%xmm6 - movdqu %xmm5,48(%rsi) - movdqu %xmm6,64(%rsi) - cmpq $6,%rdx - jb L$ctr32_done - - movups 80(%rdi),%xmm11 - xorps %xmm11,%xmm7 - movups %xmm7,80(%rsi) - je L$ctr32_done - - movups 96(%rdi),%xmm12 - xorps %xmm12,%xmm8 - movups %xmm8,96(%rsi) - jmp L$ctr32_done - -.p2align 5 -L$ctr32_loop4: -.byte 102,15,56,220,209 - leaq 16(%rcx),%rcx - decl %eax -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - movups (%rcx),%xmm1 - jnz L$ctr32_loop4 -.byte 102,15,56,221,209 -.byte 102,15,56,221,217 - movups (%rdi),%xmm10 - movups 16(%rdi),%xmm11 -.byte 102,15,56,221,225 -.byte 102,15,56,221,233 - movups 32(%rdi),%xmm12 - movups 48(%rdi),%xmm13 - - xorps %xmm10,%xmm2 - movups %xmm2,(%rsi) - xorps %xmm11,%xmm3 - movups %xmm3,16(%rsi) - pxor 
%xmm12,%xmm4 - movdqu %xmm4,32(%rsi) - pxor %xmm13,%xmm5 - movdqu %xmm5,48(%rsi) - jmp L$ctr32_done - -.p2align 5 -L$ctr32_loop3: -.byte 102,15,56,220,209 - leaq 16(%rcx),%rcx - decl %eax -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 - movups (%rcx),%xmm1 - jnz L$ctr32_loop3 -.byte 102,15,56,221,209 -.byte 102,15,56,221,217 -.byte 102,15,56,221,225 - - movups (%rdi),%xmm10 - xorps %xmm10,%xmm2 - movups %xmm2,(%rsi) - cmpq $2,%rdx - jb L$ctr32_done - - movups 16(%rdi),%xmm11 - xorps %xmm11,%xmm3 - movups %xmm3,16(%rsi) - je L$ctr32_done - - movups 32(%rdi),%xmm12 - xorps %xmm12,%xmm4 - movups %xmm4,32(%rsi) - -L$ctr32_done: - xorps %xmm0,%xmm0 - xorl %ebp,%ebp - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - movaps %xmm0,0(%rsp) - pxor %xmm8,%xmm8 - movaps %xmm0,16(%rsp) - pxor %xmm9,%xmm9 - movaps %xmm0,32(%rsp) - pxor %xmm10,%xmm10 - movaps %xmm0,48(%rsp) - pxor %xmm11,%xmm11 - movaps %xmm0,64(%rsp) - pxor %xmm12,%xmm12 - movaps %xmm0,80(%rsp) - pxor %xmm13,%xmm13 - movaps %xmm0,96(%rsp) - pxor %xmm14,%xmm14 - movaps %xmm0,112(%rsp) - pxor %xmm15,%xmm15 - movq -8(%r11),%rbp - - leaq (%r11),%rsp - -L$ctr32_epilogue: - .byte 0xf3,0xc3 - - -.globl _aes_hw_cbc_encrypt -.private_extern _aes_hw_cbc_encrypt - -.p2align 4 -_aes_hw_cbc_encrypt: - - testq %rdx,%rdx - jz L$cbc_ret - - movl 240(%rcx),%r10d - movq %rcx,%r11 - testl %r9d,%r9d - jz L$cbc_decrypt - - movups (%r8),%xmm2 - movl %r10d,%eax - cmpq $16,%rdx - jb L$cbc_enc_tail - subq $16,%rdx - jmp L$cbc_enc_loop -.p2align 4 -L$cbc_enc_loop: - movups (%rdi),%xmm3 - leaq 16(%rdi),%rdi - - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - xorps %xmm0,%xmm3 - leaq 32(%rcx),%rcx - xorps %xmm3,%xmm2 -L$oop_enc1_6: -.byte 102,15,56,220,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz L$oop_enc1_6 -.byte 102,15,56,221,209 - movl %r10d,%eax - movq %r11,%rcx - movups %xmm2,0(%rsi) - leaq 16(%rsi),%rsi - subq $16,%rdx - jnc 
L$cbc_enc_loop - addq $16,%rdx - jnz L$cbc_enc_tail - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - movups %xmm2,(%r8) - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - jmp L$cbc_ret - -L$cbc_enc_tail: - movq %rdx,%rcx - xchgq %rdi,%rsi -.long 0x9066A4F3 - movl $16,%ecx - subq %rdx,%rcx - xorl %eax,%eax -.long 0x9066AAF3 - leaq -16(%rdi),%rdi - movl %r10d,%eax - movq %rdi,%rsi - movq %r11,%rcx - xorq %rdx,%rdx - jmp L$cbc_enc_loop - -.p2align 4 -L$cbc_decrypt: - cmpq $16,%rdx - jne L$cbc_decrypt_bulk - - - - movdqu (%rdi),%xmm2 - movdqu (%r8),%xmm3 - movdqa %xmm2,%xmm4 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -L$oop_dec1_7: -.byte 102,15,56,222,209 - decl %r10d - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz L$oop_dec1_7 -.byte 102,15,56,223,209 - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - movdqu %xmm4,(%r8) - xorps %xmm3,%xmm2 - pxor %xmm3,%xmm3 - movups %xmm2,(%rsi) - pxor %xmm2,%xmm2 - jmp L$cbc_ret -.p2align 4 -L$cbc_decrypt_bulk: - leaq (%rsp),%r11 - - pushq %rbp - - subq $16,%rsp - andq $-16,%rsp - movq %rcx,%rbp - movups (%r8),%xmm10 - movl %r10d,%eax - cmpq $0x50,%rdx - jbe L$cbc_dec_tail - - movups (%rcx),%xmm0 - movdqu 0(%rdi),%xmm2 - movdqu 16(%rdi),%xmm3 - movdqa %xmm2,%xmm11 - movdqu 32(%rdi),%xmm4 - movdqa %xmm3,%xmm12 - movdqu 48(%rdi),%xmm5 - movdqa %xmm4,%xmm13 - movdqu 64(%rdi),%xmm6 - movdqa %xmm5,%xmm14 - movdqu 80(%rdi),%xmm7 - movdqa %xmm6,%xmm15 - leaq _OPENSSL_ia32cap_P(%rip),%r9 - movl 4(%r9),%r9d - cmpq $0x70,%rdx - jbe L$cbc_dec_six_or_seven - - andl $71303168,%r9d - subq $0x50,%rdx - cmpl $4194304,%r9d - je L$cbc_dec_loop6_enter - subq $0x20,%rdx - leaq 112(%rcx),%rcx - jmp L$cbc_dec_loop8_enter -.p2align 4 -L$cbc_dec_loop8: - movups %xmm9,(%rsi) - leaq 16(%rsi),%rsi -L$cbc_dec_loop8_enter: - movdqu 96(%rdi),%xmm8 - pxor %xmm0,%xmm2 - movdqu 112(%rdi),%xmm9 - pxor %xmm0,%xmm3 - movups 16-112(%rcx),%xmm1 - pxor %xmm0,%xmm4 - movq $-1,%rbp - cmpq $0x70,%rdx - pxor %xmm0,%xmm5 - pxor %xmm0,%xmm6 - pxor %xmm0,%xmm7 - 
pxor %xmm0,%xmm8 - -.byte 102,15,56,222,209 - pxor %xmm0,%xmm9 - movups 32-112(%rcx),%xmm0 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 -.byte 102,68,15,56,222,193 - adcq $0,%rbp - andq $128,%rbp -.byte 102,68,15,56,222,201 - addq %rdi,%rbp - movups 48-112(%rcx),%xmm1 -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 -.byte 102,68,15,56,222,192 -.byte 102,68,15,56,222,200 - movups 64-112(%rcx),%xmm0 - nop -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 -.byte 102,68,15,56,222,193 -.byte 102,68,15,56,222,201 - movups 80-112(%rcx),%xmm1 - nop -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 -.byte 102,68,15,56,222,192 -.byte 102,68,15,56,222,200 - movups 96-112(%rcx),%xmm0 - nop -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 -.byte 102,68,15,56,222,193 -.byte 102,68,15,56,222,201 - movups 112-112(%rcx),%xmm1 - nop -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 -.byte 102,68,15,56,222,192 -.byte 102,68,15,56,222,200 - movups 128-112(%rcx),%xmm0 - nop -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 -.byte 102,68,15,56,222,193 -.byte 102,68,15,56,222,201 - movups 144-112(%rcx),%xmm1 - cmpl $11,%eax -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 -.byte 102,68,15,56,222,192 -.byte 
102,68,15,56,222,200 - movups 160-112(%rcx),%xmm0 - jb L$cbc_dec_done -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 -.byte 102,68,15,56,222,193 -.byte 102,68,15,56,222,201 - movups 176-112(%rcx),%xmm1 - nop -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 -.byte 102,68,15,56,222,192 -.byte 102,68,15,56,222,200 - movups 192-112(%rcx),%xmm0 - je L$cbc_dec_done -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 -.byte 102,68,15,56,222,193 -.byte 102,68,15,56,222,201 - movups 208-112(%rcx),%xmm1 - nop -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 -.byte 102,68,15,56,222,192 -.byte 102,68,15,56,222,200 - movups 224-112(%rcx),%xmm0 - jmp L$cbc_dec_done -.p2align 4 -L$cbc_dec_done: -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 - pxor %xmm0,%xmm10 - pxor %xmm0,%xmm11 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 - pxor %xmm0,%xmm12 - pxor %xmm0,%xmm13 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 - pxor %xmm0,%xmm14 - pxor %xmm0,%xmm15 -.byte 102,68,15,56,222,193 -.byte 102,68,15,56,222,201 - movdqu 80(%rdi),%xmm1 - -.byte 102,65,15,56,223,210 - movdqu 96(%rdi),%xmm10 - pxor %xmm0,%xmm1 -.byte 102,65,15,56,223,219 - pxor %xmm0,%xmm10 - movdqu 112(%rdi),%xmm0 -.byte 102,65,15,56,223,228 - leaq 128(%rdi),%rdi - movdqu 0(%rbp),%xmm11 -.byte 102,65,15,56,223,237 -.byte 102,65,15,56,223,246 - movdqu 16(%rbp),%xmm12 - movdqu 32(%rbp),%xmm13 -.byte 102,65,15,56,223,255 -.byte 102,68,15,56,223,193 - movdqu 48(%rbp),%xmm14 - movdqu 64(%rbp),%xmm15 -.byte 102,69,15,56,223,202 - movdqa %xmm0,%xmm10 - movdqu 80(%rbp),%xmm1 - movups -112(%rcx),%xmm0 - - movups %xmm2,(%rsi) 
- movdqa %xmm11,%xmm2 - movups %xmm3,16(%rsi) - movdqa %xmm12,%xmm3 - movups %xmm4,32(%rsi) - movdqa %xmm13,%xmm4 - movups %xmm5,48(%rsi) - movdqa %xmm14,%xmm5 - movups %xmm6,64(%rsi) - movdqa %xmm15,%xmm6 - movups %xmm7,80(%rsi) - movdqa %xmm1,%xmm7 - movups %xmm8,96(%rsi) - leaq 112(%rsi),%rsi - - subq $0x80,%rdx - ja L$cbc_dec_loop8 - - movaps %xmm9,%xmm2 - leaq -112(%rcx),%rcx - addq $0x70,%rdx - jle L$cbc_dec_clear_tail_collected - movups %xmm9,(%rsi) - leaq 16(%rsi),%rsi - cmpq $0x50,%rdx - jbe L$cbc_dec_tail - - movaps %xmm11,%xmm2 -L$cbc_dec_six_or_seven: - cmpq $0x60,%rdx - ja L$cbc_dec_seven - - movaps %xmm7,%xmm8 - call _aesni_decrypt6 - pxor %xmm10,%xmm2 - movaps %xmm8,%xmm10 - pxor %xmm11,%xmm3 - movdqu %xmm2,(%rsi) - pxor %xmm12,%xmm4 - movdqu %xmm3,16(%rsi) - pxor %xmm3,%xmm3 - pxor %xmm13,%xmm5 - movdqu %xmm4,32(%rsi) - pxor %xmm4,%xmm4 - pxor %xmm14,%xmm6 - movdqu %xmm5,48(%rsi) - pxor %xmm5,%xmm5 - pxor %xmm15,%xmm7 - movdqu %xmm6,64(%rsi) - pxor %xmm6,%xmm6 - leaq 80(%rsi),%rsi - movdqa %xmm7,%xmm2 - pxor %xmm7,%xmm7 - jmp L$cbc_dec_tail_collected - -.p2align 4 -L$cbc_dec_seven: - movups 96(%rdi),%xmm8 - xorps %xmm9,%xmm9 - call _aesni_decrypt8 - movups 80(%rdi),%xmm9 - pxor %xmm10,%xmm2 - movups 96(%rdi),%xmm10 - pxor %xmm11,%xmm3 - movdqu %xmm2,(%rsi) - pxor %xmm12,%xmm4 - movdqu %xmm3,16(%rsi) - pxor %xmm3,%xmm3 - pxor %xmm13,%xmm5 - movdqu %xmm4,32(%rsi) - pxor %xmm4,%xmm4 - pxor %xmm14,%xmm6 - movdqu %xmm5,48(%rsi) - pxor %xmm5,%xmm5 - pxor %xmm15,%xmm7 - movdqu %xmm6,64(%rsi) - pxor %xmm6,%xmm6 - pxor %xmm9,%xmm8 - movdqu %xmm7,80(%rsi) - pxor %xmm7,%xmm7 - leaq 96(%rsi),%rsi - movdqa %xmm8,%xmm2 - pxor %xmm8,%xmm8 - pxor %xmm9,%xmm9 - jmp L$cbc_dec_tail_collected - -.p2align 4 -L$cbc_dec_loop6: - movups %xmm7,(%rsi) - leaq 16(%rsi),%rsi - movdqu 0(%rdi),%xmm2 - movdqu 16(%rdi),%xmm3 - movdqa %xmm2,%xmm11 - movdqu 32(%rdi),%xmm4 - movdqa %xmm3,%xmm12 - movdqu 48(%rdi),%xmm5 - movdqa %xmm4,%xmm13 - movdqu 64(%rdi),%xmm6 - movdqa %xmm5,%xmm14 
- movdqu 80(%rdi),%xmm7 - movdqa %xmm6,%xmm15 -L$cbc_dec_loop6_enter: - leaq 96(%rdi),%rdi - movdqa %xmm7,%xmm8 - - call _aesni_decrypt6 - - pxor %xmm10,%xmm2 - movdqa %xmm8,%xmm10 - pxor %xmm11,%xmm3 - movdqu %xmm2,(%rsi) - pxor %xmm12,%xmm4 - movdqu %xmm3,16(%rsi) - pxor %xmm13,%xmm5 - movdqu %xmm4,32(%rsi) - pxor %xmm14,%xmm6 - movq %rbp,%rcx - movdqu %xmm5,48(%rsi) - pxor %xmm15,%xmm7 - movl %r10d,%eax - movdqu %xmm6,64(%rsi) - leaq 80(%rsi),%rsi - subq $0x60,%rdx - ja L$cbc_dec_loop6 - - movdqa %xmm7,%xmm2 - addq $0x50,%rdx - jle L$cbc_dec_clear_tail_collected - movups %xmm7,(%rsi) - leaq 16(%rsi),%rsi - -L$cbc_dec_tail: - movups (%rdi),%xmm2 - subq $0x10,%rdx - jbe L$cbc_dec_one - - movups 16(%rdi),%xmm3 - movaps %xmm2,%xmm11 - subq $0x10,%rdx - jbe L$cbc_dec_two - - movups 32(%rdi),%xmm4 - movaps %xmm3,%xmm12 - subq $0x10,%rdx - jbe L$cbc_dec_three - - movups 48(%rdi),%xmm5 - movaps %xmm4,%xmm13 - subq $0x10,%rdx - jbe L$cbc_dec_four - - movups 64(%rdi),%xmm6 - movaps %xmm5,%xmm14 - movaps %xmm6,%xmm15 - xorps %xmm7,%xmm7 - call _aesni_decrypt6 - pxor %xmm10,%xmm2 - movaps %xmm15,%xmm10 - pxor %xmm11,%xmm3 - movdqu %xmm2,(%rsi) - pxor %xmm12,%xmm4 - movdqu %xmm3,16(%rsi) - pxor %xmm3,%xmm3 - pxor %xmm13,%xmm5 - movdqu %xmm4,32(%rsi) - pxor %xmm4,%xmm4 - pxor %xmm14,%xmm6 - movdqu %xmm5,48(%rsi) - pxor %xmm5,%xmm5 - leaq 64(%rsi),%rsi - movdqa %xmm6,%xmm2 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - subq $0x10,%rdx - jmp L$cbc_dec_tail_collected - -.p2align 4 -L$cbc_dec_one: - movaps %xmm2,%xmm11 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -L$oop_dec1_8: -.byte 102,15,56,222,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz L$oop_dec1_8 -.byte 102,15,56,223,209 - xorps %xmm10,%xmm2 - movaps %xmm11,%xmm10 - jmp L$cbc_dec_tail_collected -.p2align 4 -L$cbc_dec_two: - movaps %xmm3,%xmm12 - call _aesni_decrypt2 - pxor %xmm10,%xmm2 - movaps %xmm12,%xmm10 - pxor %xmm11,%xmm3 - movdqu %xmm2,(%rsi) - movdqa 
%xmm3,%xmm2 - pxor %xmm3,%xmm3 - leaq 16(%rsi),%rsi - jmp L$cbc_dec_tail_collected -.p2align 4 -L$cbc_dec_three: - movaps %xmm4,%xmm13 - call _aesni_decrypt3 - pxor %xmm10,%xmm2 - movaps %xmm13,%xmm10 - pxor %xmm11,%xmm3 - movdqu %xmm2,(%rsi) - pxor %xmm12,%xmm4 - movdqu %xmm3,16(%rsi) - pxor %xmm3,%xmm3 - movdqa %xmm4,%xmm2 - pxor %xmm4,%xmm4 - leaq 32(%rsi),%rsi - jmp L$cbc_dec_tail_collected -.p2align 4 -L$cbc_dec_four: - movaps %xmm5,%xmm14 - call _aesni_decrypt4 - pxor %xmm10,%xmm2 - movaps %xmm14,%xmm10 - pxor %xmm11,%xmm3 - movdqu %xmm2,(%rsi) - pxor %xmm12,%xmm4 - movdqu %xmm3,16(%rsi) - pxor %xmm3,%xmm3 - pxor %xmm13,%xmm5 - movdqu %xmm4,32(%rsi) - pxor %xmm4,%xmm4 - movdqa %xmm5,%xmm2 - pxor %xmm5,%xmm5 - leaq 48(%rsi),%rsi - jmp L$cbc_dec_tail_collected - -.p2align 4 -L$cbc_dec_clear_tail_collected: - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - pxor %xmm8,%xmm8 - pxor %xmm9,%xmm9 -L$cbc_dec_tail_collected: - movups %xmm10,(%r8) - andq $15,%rdx - jnz L$cbc_dec_tail_partial - movups %xmm2,(%rsi) - pxor %xmm2,%xmm2 - jmp L$cbc_dec_ret -.p2align 4 -L$cbc_dec_tail_partial: - movaps %xmm2,(%rsp) - pxor %xmm2,%xmm2 - movq $16,%rcx - movq %rsi,%rdi - subq %rdx,%rcx - leaq (%rsp),%rsi -.long 0x9066A4F3 - movdqa %xmm2,(%rsp) - -L$cbc_dec_ret: - xorps %xmm0,%xmm0 - pxor %xmm1,%xmm1 - movq -8(%r11),%rbp - - leaq (%r11),%rsp - -L$cbc_ret: - .byte 0xf3,0xc3 - - -.globl _aes_hw_set_decrypt_key -.private_extern _aes_hw_set_decrypt_key - -.p2align 4 -_aes_hw_set_decrypt_key: - -.byte 0x48,0x83,0xEC,0x08 - - call __aesni_set_encrypt_key - shll $4,%esi - testl %eax,%eax - jnz L$dec_key_ret - leaq 16(%rdx,%rsi,1),%rdi - - movups (%rdx),%xmm0 - movups (%rdi),%xmm1 - movups %xmm0,(%rdi) - movups %xmm1,(%rdx) - leaq 16(%rdx),%rdx - leaq -16(%rdi),%rdi - -L$dec_key_inverse: - movups (%rdx),%xmm0 - movups (%rdi),%xmm1 -.byte 102,15,56,219,192 -.byte 102,15,56,219,201 - leaq 16(%rdx),%rdx - leaq -16(%rdi),%rdi - movups 
%xmm0,16(%rdi) - movups %xmm1,-16(%rdx) - cmpq %rdx,%rdi - ja L$dec_key_inverse - - movups (%rdx),%xmm0 -.byte 102,15,56,219,192 - pxor %xmm1,%xmm1 - movups %xmm0,(%rdi) - pxor %xmm0,%xmm0 -L$dec_key_ret: - addq $8,%rsp - - .byte 0xf3,0xc3 - -L$SEH_end_set_decrypt_key: - -.globl _aes_hw_set_encrypt_key -.private_extern _aes_hw_set_encrypt_key - -.p2align 4 -_aes_hw_set_encrypt_key: -__aesni_set_encrypt_key: - -#ifdef BORINGSSL_DISPATCH_TEST - movb $1,_BORINGSSL_function_hit+3(%rip) -#endif -.byte 0x48,0x83,0xEC,0x08 - - movq $-1,%rax - testq %rdi,%rdi - jz L$enc_key_ret - testq %rdx,%rdx - jz L$enc_key_ret - - movups (%rdi),%xmm0 - xorps %xmm4,%xmm4 - leaq _OPENSSL_ia32cap_P(%rip),%r10 - movl 4(%r10),%r10d - andl $268437504,%r10d - leaq 16(%rdx),%rax - cmpl $256,%esi - je L$14rounds - cmpl $192,%esi - je L$12rounds - cmpl $128,%esi - jne L$bad_keybits - -L$10rounds: - movl $9,%esi - cmpl $268435456,%r10d - je L$10rounds_alt - - movups %xmm0,(%rdx) -.byte 102,15,58,223,200,1 - call L$key_expansion_128_cold -.byte 102,15,58,223,200,2 - call L$key_expansion_128 -.byte 102,15,58,223,200,4 - call L$key_expansion_128 -.byte 102,15,58,223,200,8 - call L$key_expansion_128 -.byte 102,15,58,223,200,16 - call L$key_expansion_128 -.byte 102,15,58,223,200,32 - call L$key_expansion_128 -.byte 102,15,58,223,200,64 - call L$key_expansion_128 -.byte 102,15,58,223,200,128 - call L$key_expansion_128 -.byte 102,15,58,223,200,27 - call L$key_expansion_128 -.byte 102,15,58,223,200,54 - call L$key_expansion_128 - movups %xmm0,(%rax) - movl %esi,80(%rax) - xorl %eax,%eax - jmp L$enc_key_ret - -.p2align 4 -L$10rounds_alt: - movdqa L$key_rotate(%rip),%xmm5 - movl $8,%r10d - movdqa L$key_rcon1(%rip),%xmm4 - movdqa %xmm0,%xmm2 - movdqu %xmm0,(%rdx) - jmp L$oop_key128 - -.p2align 4 -L$oop_key128: -.byte 102,15,56,0,197 -.byte 102,15,56,221,196 - pslld $1,%xmm4 - leaq 16(%rax),%rax - - movdqa %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm2,%xmm3 - pslldq $4,%xmm2 
- pxor %xmm3,%xmm2 - - pxor %xmm2,%xmm0 - movdqu %xmm0,-16(%rax) - movdqa %xmm0,%xmm2 - - decl %r10d - jnz L$oop_key128 - - movdqa L$key_rcon1b(%rip),%xmm4 - -.byte 102,15,56,0,197 -.byte 102,15,56,221,196 - pslld $1,%xmm4 - - movdqa %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm3,%xmm2 - - pxor %xmm2,%xmm0 - movdqu %xmm0,(%rax) - - movdqa %xmm0,%xmm2 -.byte 102,15,56,0,197 -.byte 102,15,56,221,196 - - movdqa %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm3,%xmm2 - - pxor %xmm2,%xmm0 - movdqu %xmm0,16(%rax) - - movl %esi,96(%rax) - xorl %eax,%eax - jmp L$enc_key_ret - -.p2align 4 -L$12rounds: - movq 16(%rdi),%xmm2 - movl $11,%esi - cmpl $268435456,%r10d - je L$12rounds_alt - - movups %xmm0,(%rdx) -.byte 102,15,58,223,202,1 - call L$key_expansion_192a_cold -.byte 102,15,58,223,202,2 - call L$key_expansion_192b -.byte 102,15,58,223,202,4 - call L$key_expansion_192a -.byte 102,15,58,223,202,8 - call L$key_expansion_192b -.byte 102,15,58,223,202,16 - call L$key_expansion_192a -.byte 102,15,58,223,202,32 - call L$key_expansion_192b -.byte 102,15,58,223,202,64 - call L$key_expansion_192a -.byte 102,15,58,223,202,128 - call L$key_expansion_192b - movups %xmm0,(%rax) - movl %esi,48(%rax) - xorq %rax,%rax - jmp L$enc_key_ret - -.p2align 4 -L$12rounds_alt: - movdqa L$key_rotate192(%rip),%xmm5 - movdqa L$key_rcon1(%rip),%xmm4 - movl $8,%r10d - movdqu %xmm0,(%rdx) - jmp L$oop_key192 - -.p2align 4 -L$oop_key192: - movq %xmm2,0(%rax) - movdqa %xmm2,%xmm1 -.byte 102,15,56,0,213 -.byte 102,15,56,221,212 - pslld $1,%xmm4 - leaq 24(%rax),%rax - - movdqa %xmm0,%xmm3 - pslldq $4,%xmm0 - pxor %xmm0,%xmm3 - pslldq $4,%xmm0 - pxor %xmm0,%xmm3 - pslldq $4,%xmm0 - pxor %xmm3,%xmm0 - - pshufd $0xff,%xmm0,%xmm3 - pxor %xmm1,%xmm3 - pslldq $4,%xmm1 - pxor %xmm1,%xmm3 - - pxor %xmm2,%xmm0 - pxor %xmm3,%xmm2 - movdqu %xmm0,-16(%rax) - - decl %r10d - jnz 
L$oop_key192 - - movl %esi,32(%rax) - xorl %eax,%eax - jmp L$enc_key_ret - -.p2align 4 -L$14rounds: - movups 16(%rdi),%xmm2 - movl $13,%esi - leaq 16(%rax),%rax - cmpl $268435456,%r10d - je L$14rounds_alt - - movups %xmm0,(%rdx) - movups %xmm2,16(%rdx) -.byte 102,15,58,223,202,1 - call L$key_expansion_256a_cold -.byte 102,15,58,223,200,1 - call L$key_expansion_256b -.byte 102,15,58,223,202,2 - call L$key_expansion_256a -.byte 102,15,58,223,200,2 - call L$key_expansion_256b -.byte 102,15,58,223,202,4 - call L$key_expansion_256a -.byte 102,15,58,223,200,4 - call L$key_expansion_256b -.byte 102,15,58,223,202,8 - call L$key_expansion_256a -.byte 102,15,58,223,200,8 - call L$key_expansion_256b -.byte 102,15,58,223,202,16 - call L$key_expansion_256a -.byte 102,15,58,223,200,16 - call L$key_expansion_256b -.byte 102,15,58,223,202,32 - call L$key_expansion_256a -.byte 102,15,58,223,200,32 - call L$key_expansion_256b -.byte 102,15,58,223,202,64 - call L$key_expansion_256a - movups %xmm0,(%rax) - movl %esi,16(%rax) - xorq %rax,%rax - jmp L$enc_key_ret - -.p2align 4 -L$14rounds_alt: - movdqa L$key_rotate(%rip),%xmm5 - movdqa L$key_rcon1(%rip),%xmm4 - movl $7,%r10d - movdqu %xmm0,0(%rdx) - movdqa %xmm2,%xmm1 - movdqu %xmm2,16(%rdx) - jmp L$oop_key256 - -.p2align 4 -L$oop_key256: -.byte 102,15,56,0,213 -.byte 102,15,56,221,212 - - movdqa %xmm0,%xmm3 - pslldq $4,%xmm0 - pxor %xmm0,%xmm3 - pslldq $4,%xmm0 - pxor %xmm0,%xmm3 - pslldq $4,%xmm0 - pxor %xmm3,%xmm0 - pslld $1,%xmm4 - - pxor %xmm2,%xmm0 - movdqu %xmm0,(%rax) - - decl %r10d - jz L$done_key256 - - pshufd $0xff,%xmm0,%xmm2 - pxor %xmm3,%xmm3 -.byte 102,15,56,221,211 - - movdqa %xmm1,%xmm3 - pslldq $4,%xmm1 - pxor %xmm1,%xmm3 - pslldq $4,%xmm1 - pxor %xmm1,%xmm3 - pslldq $4,%xmm1 - pxor %xmm3,%xmm1 - - pxor %xmm1,%xmm2 - movdqu %xmm2,16(%rax) - leaq 32(%rax),%rax - movdqa %xmm2,%xmm1 - - jmp L$oop_key256 - -L$done_key256: - movl %esi,16(%rax) - xorl %eax,%eax - jmp L$enc_key_ret - -.p2align 4 -L$bad_keybits: - movq 
$-2,%rax -L$enc_key_ret: - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - addq $8,%rsp - - .byte 0xf3,0xc3 - -L$SEH_end_set_encrypt_key: - -.p2align 4 -L$key_expansion_128: - movups %xmm0,(%rax) - leaq 16(%rax),%rax -L$key_expansion_128_cold: - shufps $16,%xmm0,%xmm4 - xorps %xmm4,%xmm0 - shufps $140,%xmm0,%xmm4 - xorps %xmm4,%xmm0 - shufps $255,%xmm1,%xmm1 - xorps %xmm1,%xmm0 - .byte 0xf3,0xc3 - -.p2align 4 -L$key_expansion_192a: - movups %xmm0,(%rax) - leaq 16(%rax),%rax -L$key_expansion_192a_cold: - movaps %xmm2,%xmm5 -L$key_expansion_192b_warm: - shufps $16,%xmm0,%xmm4 - movdqa %xmm2,%xmm3 - xorps %xmm4,%xmm0 - shufps $140,%xmm0,%xmm4 - pslldq $4,%xmm3 - xorps %xmm4,%xmm0 - pshufd $85,%xmm1,%xmm1 - pxor %xmm3,%xmm2 - pxor %xmm1,%xmm0 - pshufd $255,%xmm0,%xmm3 - pxor %xmm3,%xmm2 - .byte 0xf3,0xc3 - -.p2align 4 -L$key_expansion_192b: - movaps %xmm0,%xmm3 - shufps $68,%xmm0,%xmm5 - movups %xmm5,(%rax) - shufps $78,%xmm2,%xmm3 - movups %xmm3,16(%rax) - leaq 32(%rax),%rax - jmp L$key_expansion_192b_warm - -.p2align 4 -L$key_expansion_256a: - movups %xmm2,(%rax) - leaq 16(%rax),%rax -L$key_expansion_256a_cold: - shufps $16,%xmm0,%xmm4 - xorps %xmm4,%xmm0 - shufps $140,%xmm0,%xmm4 - xorps %xmm4,%xmm0 - shufps $255,%xmm1,%xmm1 - xorps %xmm1,%xmm0 - .byte 0xf3,0xc3 - -.p2align 4 -L$key_expansion_256b: - movups %xmm0,(%rax) - leaq 16(%rax),%rax - - shufps $16,%xmm2,%xmm4 - xorps %xmm4,%xmm2 - shufps $140,%xmm2,%xmm4 - xorps %xmm4,%xmm2 - shufps $170,%xmm1,%xmm1 - xorps %xmm1,%xmm2 - .byte 0xf3,0xc3 - - -.p2align 6 -L$bswap_mask: -.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 -L$increment32: -.long 6,6,6,0 -L$increment64: -.long 1,0,0,0 -L$xts_magic: -.long 0x87,0,1,0 -L$increment1: -.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 -L$key_rotate: -.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d -L$key_rotate192: -.long 0x04070605,0x04070605,0x04070605,0x04070605 -L$key_rcon1: -.long 1,1,1,1 -L$key_rcon1b: -.long 
0x1b,0x1b,0x1b,0x1b - -.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.p2align 6 -#endif diff --git a/third_party/boringssl/apple-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S b/third_party/boringssl/apple-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S deleted file mode 100644 index 7f92fc51..00000000 --- a/third_party/boringssl/apple-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S +++ /dev/null @@ -1,426 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text - - - - - - -.globl _gcm_gmult_ssse3 -.private_extern _gcm_gmult_ssse3 -.p2align 4 -_gcm_gmult_ssse3: - -L$gmult_seh_begin: - movdqu (%rdi),%xmm0 - movdqa L$reverse_bytes(%rip),%xmm10 - movdqa L$low4_mask(%rip),%xmm2 - - -.byte 102,65,15,56,0,194 - - - movdqa %xmm2,%xmm1 - pandn %xmm0,%xmm1 - psrld $4,%xmm1 - pand %xmm2,%xmm0 - - - - - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - movq $5,%rax -L$oop_row_1: - movdqa (%rsi),%xmm4 - leaq 16(%rsi),%rsi - - - movdqa %xmm2,%xmm6 -.byte 102,15,58,15,243,1 - movdqa %xmm6,%xmm3 - psrldq $1,%xmm2 - - - - - movdqa %xmm4,%xmm5 -.byte 102,15,56,0,224 -.byte 102,15,56,0,233 - - - pxor %xmm5,%xmm2 - - - - movdqa %xmm4,%xmm5 - psllq $60,%xmm5 - movdqa %xmm5,%xmm6 - pslldq $8,%xmm6 - pxor %xmm6,%xmm3 - - - psrldq $8,%xmm5 - pxor %xmm5,%xmm2 - psrlq $4,%xmm4 - pxor %xmm4,%xmm2 - - subq $1,%rax - jnz L$oop_row_1 - - - - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $5,%xmm3 - pxor %xmm3,%xmm2 - pxor %xmm3,%xmm3 - movq $5,%rax -L$oop_row_2: - movdqa (%rsi),%xmm4 - leaq 16(%rsi),%rsi 
- - - movdqa %xmm2,%xmm6 -.byte 102,15,58,15,243,1 - movdqa %xmm6,%xmm3 - psrldq $1,%xmm2 - - - - - movdqa %xmm4,%xmm5 -.byte 102,15,56,0,224 -.byte 102,15,56,0,233 - - - pxor %xmm5,%xmm2 - - - - movdqa %xmm4,%xmm5 - psllq $60,%xmm5 - movdqa %xmm5,%xmm6 - pslldq $8,%xmm6 - pxor %xmm6,%xmm3 - - - psrldq $8,%xmm5 - pxor %xmm5,%xmm2 - psrlq $4,%xmm4 - pxor %xmm4,%xmm2 - - subq $1,%rax - jnz L$oop_row_2 - - - - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $5,%xmm3 - pxor %xmm3,%xmm2 - pxor %xmm3,%xmm3 - movq $6,%rax -L$oop_row_3: - movdqa (%rsi),%xmm4 - leaq 16(%rsi),%rsi - - - movdqa %xmm2,%xmm6 -.byte 102,15,58,15,243,1 - movdqa %xmm6,%xmm3 - psrldq $1,%xmm2 - - - - - movdqa %xmm4,%xmm5 -.byte 102,15,56,0,224 -.byte 102,15,56,0,233 - - - pxor %xmm5,%xmm2 - - - - movdqa %xmm4,%xmm5 - psllq $60,%xmm5 - movdqa %xmm5,%xmm6 - pslldq $8,%xmm6 - pxor %xmm6,%xmm3 - - - psrldq $8,%xmm5 - pxor %xmm5,%xmm2 - psrlq $4,%xmm4 - pxor %xmm4,%xmm2 - - subq $1,%rax - jnz L$oop_row_3 - - - - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $5,%xmm3 - pxor %xmm3,%xmm2 - pxor %xmm3,%xmm3 - -.byte 102,65,15,56,0,210 - movdqu %xmm2,(%rdi) - - - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - .byte 0xf3,0xc3 -L$gmult_seh_end: - - - - - - - - -.globl _gcm_ghash_ssse3 -.private_extern _gcm_ghash_ssse3 -.p2align 4 -_gcm_ghash_ssse3: -L$ghash_seh_begin: - - movdqu (%rdi),%xmm0 - movdqa L$reverse_bytes(%rip),%xmm10 - movdqa L$low4_mask(%rip),%xmm11 - - - andq $-16,%rcx - - - -.byte 102,65,15,56,0,194 - - - pxor %xmm3,%xmm3 -L$oop_ghash: - - movdqu (%rdx),%xmm1 -.byte 102,65,15,56,0,202 - pxor %xmm1,%xmm0 - - - movdqa %xmm11,%xmm1 - pandn %xmm0,%xmm1 - psrld $4,%xmm1 - pand %xmm11,%xmm0 - - - - - pxor %xmm2,%xmm2 - - movq $5,%rax -L$oop_row_4: - movdqa (%rsi),%xmm4 - leaq 16(%rsi),%rsi - - - movdqa %xmm2,%xmm6 
-.byte 102,15,58,15,243,1 - movdqa %xmm6,%xmm3 - psrldq $1,%xmm2 - - - - - movdqa %xmm4,%xmm5 -.byte 102,15,56,0,224 -.byte 102,15,56,0,233 - - - pxor %xmm5,%xmm2 - - - - movdqa %xmm4,%xmm5 - psllq $60,%xmm5 - movdqa %xmm5,%xmm6 - pslldq $8,%xmm6 - pxor %xmm6,%xmm3 - - - psrldq $8,%xmm5 - pxor %xmm5,%xmm2 - psrlq $4,%xmm4 - pxor %xmm4,%xmm2 - - subq $1,%rax - jnz L$oop_row_4 - - - - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $5,%xmm3 - pxor %xmm3,%xmm2 - pxor %xmm3,%xmm3 - movq $5,%rax -L$oop_row_5: - movdqa (%rsi),%xmm4 - leaq 16(%rsi),%rsi - - - movdqa %xmm2,%xmm6 -.byte 102,15,58,15,243,1 - movdqa %xmm6,%xmm3 - psrldq $1,%xmm2 - - - - - movdqa %xmm4,%xmm5 -.byte 102,15,56,0,224 -.byte 102,15,56,0,233 - - - pxor %xmm5,%xmm2 - - - - movdqa %xmm4,%xmm5 - psllq $60,%xmm5 - movdqa %xmm5,%xmm6 - pslldq $8,%xmm6 - pxor %xmm6,%xmm3 - - - psrldq $8,%xmm5 - pxor %xmm5,%xmm2 - psrlq $4,%xmm4 - pxor %xmm4,%xmm2 - - subq $1,%rax - jnz L$oop_row_5 - - - - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $5,%xmm3 - pxor %xmm3,%xmm2 - pxor %xmm3,%xmm3 - movq $6,%rax -L$oop_row_6: - movdqa (%rsi),%xmm4 - leaq 16(%rsi),%rsi - - - movdqa %xmm2,%xmm6 -.byte 102,15,58,15,243,1 - movdqa %xmm6,%xmm3 - psrldq $1,%xmm2 - - - - - movdqa %xmm4,%xmm5 -.byte 102,15,56,0,224 -.byte 102,15,56,0,233 - - - pxor %xmm5,%xmm2 - - - - movdqa %xmm4,%xmm5 - psllq $60,%xmm5 - movdqa %xmm5,%xmm6 - pslldq $8,%xmm6 - pxor %xmm6,%xmm3 - - - psrldq $8,%xmm5 - pxor %xmm5,%xmm2 - psrlq $4,%xmm4 - pxor %xmm4,%xmm2 - - subq $1,%rax - jnz L$oop_row_6 - - - - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $5,%xmm3 - pxor %xmm3,%xmm2 - pxor %xmm3,%xmm3 - movdqa %xmm2,%xmm0 - - - leaq -256(%rsi),%rsi - - - leaq 16(%rdx),%rdx - subq $16,%rcx - jnz L$oop_ghash - - -.byte 102,65,15,56,0,194 - movdqu %xmm0,(%rdi) - - - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor 
%xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - .byte 0xf3,0xc3 -L$ghash_seh_end: - - - -.p2align 4 - - -L$reverse_bytes: -.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 - -L$low4_mask: -.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f -#endif diff --git a/third_party/boringssl/apple-x86_64/crypto/fipsmodule/ghash-x86_64.S b/third_party/boringssl/apple-x86_64/crypto/fipsmodule/ghash-x86_64.S deleted file mode 100644 index fd767a05..00000000 --- a/third_party/boringssl/apple-x86_64/crypto/fipsmodule/ghash-x86_64.S +++ /dev/null @@ -1,1125 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text - -.globl _gcm_init_clmul -.private_extern _gcm_init_clmul - -.p2align 4 -_gcm_init_clmul: - -L$_init_clmul: - movdqu (%rsi),%xmm2 - pshufd $78,%xmm2,%xmm2 - - - pshufd $255,%xmm2,%xmm4 - movdqa %xmm2,%xmm3 - psllq $1,%xmm2 - pxor %xmm5,%xmm5 - psrlq $63,%xmm3 - pcmpgtd %xmm4,%xmm5 - pslldq $8,%xmm3 - por %xmm3,%xmm2 - - - pand L$0x1c2_polynomial(%rip),%xmm5 - pxor %xmm5,%xmm2 - - - pshufd $78,%xmm2,%xmm6 - movdqa %xmm2,%xmm0 - pxor %xmm2,%xmm6 - movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 - pxor %xmm0,%xmm3 -.byte 102,15,58,68,194,0 -.byte 102,15,58,68,202,17 -.byte 102,15,58,68,222,0 - pxor %xmm0,%xmm3 - pxor %xmm1,%xmm3 - - movdqa %xmm3,%xmm4 - psrldq $8,%xmm3 - pslldq $8,%xmm4 - pxor %xmm3,%xmm1 - pxor %xmm4,%xmm0 - - movdqa %xmm0,%xmm4 - movdqa %xmm0,%xmm3 - psllq $5,%xmm0 - pxor %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 - psllq $57,%xmm0 - movdqa %xmm0,%xmm3 - pslldq $8,%xmm0 - psrldq $8,%xmm3 - pxor %xmm4,%xmm0 - pxor %xmm3,%xmm1 - - - movdqa %xmm0,%xmm4 - psrlq $1,%xmm0 - pxor %xmm4,%xmm1 - pxor 
%xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 - psrlq $1,%xmm0 - pxor %xmm1,%xmm0 - pshufd $78,%xmm2,%xmm3 - pshufd $78,%xmm0,%xmm4 - pxor %xmm2,%xmm3 - movdqu %xmm2,0(%rdi) - pxor %xmm0,%xmm4 - movdqu %xmm0,16(%rdi) -.byte 102,15,58,15,227,8 - movdqu %xmm4,32(%rdi) - movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 - pxor %xmm0,%xmm3 -.byte 102,15,58,68,194,0 -.byte 102,15,58,68,202,17 -.byte 102,15,58,68,222,0 - pxor %xmm0,%xmm3 - pxor %xmm1,%xmm3 - - movdqa %xmm3,%xmm4 - psrldq $8,%xmm3 - pslldq $8,%xmm4 - pxor %xmm3,%xmm1 - pxor %xmm4,%xmm0 - - movdqa %xmm0,%xmm4 - movdqa %xmm0,%xmm3 - psllq $5,%xmm0 - pxor %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 - psllq $57,%xmm0 - movdqa %xmm0,%xmm3 - pslldq $8,%xmm0 - psrldq $8,%xmm3 - pxor %xmm4,%xmm0 - pxor %xmm3,%xmm1 - - - movdqa %xmm0,%xmm4 - psrlq $1,%xmm0 - pxor %xmm4,%xmm1 - pxor %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 - psrlq $1,%xmm0 - pxor %xmm1,%xmm0 - movdqa %xmm0,%xmm5 - movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 - pxor %xmm0,%xmm3 -.byte 102,15,58,68,194,0 -.byte 102,15,58,68,202,17 -.byte 102,15,58,68,222,0 - pxor %xmm0,%xmm3 - pxor %xmm1,%xmm3 - - movdqa %xmm3,%xmm4 - psrldq $8,%xmm3 - pslldq $8,%xmm4 - pxor %xmm3,%xmm1 - pxor %xmm4,%xmm0 - - movdqa %xmm0,%xmm4 - movdqa %xmm0,%xmm3 - psllq $5,%xmm0 - pxor %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 - psllq $57,%xmm0 - movdqa %xmm0,%xmm3 - pslldq $8,%xmm0 - psrldq $8,%xmm3 - pxor %xmm4,%xmm0 - pxor %xmm3,%xmm1 - - - movdqa %xmm0,%xmm4 - psrlq $1,%xmm0 - pxor %xmm4,%xmm1 - pxor %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 - psrlq $1,%xmm0 - pxor %xmm1,%xmm0 - pshufd $78,%xmm5,%xmm3 - pshufd $78,%xmm0,%xmm4 - pxor %xmm5,%xmm3 - movdqu %xmm5,48(%rdi) - pxor %xmm0,%xmm4 - movdqu %xmm0,64(%rdi) -.byte 102,15,58,15,227,8 - movdqu %xmm4,80(%rdi) - .byte 0xf3,0xc3 - - -.globl _gcm_gmult_clmul -.private_extern _gcm_gmult_clmul - -.p2align 4 -_gcm_gmult_clmul: - -L$_gmult_clmul: - movdqu (%rdi),%xmm0 - movdqa L$bswap_mask(%rip),%xmm5 - movdqu 
(%rsi),%xmm2 - movdqu 32(%rsi),%xmm4 -.byte 102,15,56,0,197 - movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 - pxor %xmm0,%xmm3 -.byte 102,15,58,68,194,0 -.byte 102,15,58,68,202,17 -.byte 102,15,58,68,220,0 - pxor %xmm0,%xmm3 - pxor %xmm1,%xmm3 - - movdqa %xmm3,%xmm4 - psrldq $8,%xmm3 - pslldq $8,%xmm4 - pxor %xmm3,%xmm1 - pxor %xmm4,%xmm0 - - movdqa %xmm0,%xmm4 - movdqa %xmm0,%xmm3 - psllq $5,%xmm0 - pxor %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 - psllq $57,%xmm0 - movdqa %xmm0,%xmm3 - pslldq $8,%xmm0 - psrldq $8,%xmm3 - pxor %xmm4,%xmm0 - pxor %xmm3,%xmm1 - - - movdqa %xmm0,%xmm4 - psrlq $1,%xmm0 - pxor %xmm4,%xmm1 - pxor %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 - psrlq $1,%xmm0 - pxor %xmm1,%xmm0 -.byte 102,15,56,0,197 - movdqu %xmm0,(%rdi) - .byte 0xf3,0xc3 - - -.globl _gcm_ghash_clmul -.private_extern _gcm_ghash_clmul - -.p2align 5 -_gcm_ghash_clmul: - -L$_ghash_clmul: - movdqa L$bswap_mask(%rip),%xmm10 - - movdqu (%rdi),%xmm0 - movdqu (%rsi),%xmm2 - movdqu 32(%rsi),%xmm7 -.byte 102,65,15,56,0,194 - - subq $0x10,%rcx - jz L$odd_tail - - movdqu 16(%rsi),%xmm6 - leaq _OPENSSL_ia32cap_P(%rip),%rax - movl 4(%rax),%eax - cmpq $0x30,%rcx - jb L$skip4x - - andl $71303168,%eax - cmpl $4194304,%eax - je L$skip4x - - subq $0x30,%rcx - movq $0xA040608020C0E000,%rax - movdqu 48(%rsi),%xmm14 - movdqu 64(%rsi),%xmm15 - - - - - movdqu 48(%rdx),%xmm3 - movdqu 32(%rdx),%xmm11 -.byte 102,65,15,56,0,218 -.byte 102,69,15,56,0,218 - movdqa %xmm3,%xmm5 - pshufd $78,%xmm3,%xmm4 - pxor %xmm3,%xmm4 -.byte 102,15,58,68,218,0 -.byte 102,15,58,68,234,17 -.byte 102,15,58,68,231,0 - - movdqa %xmm11,%xmm13 - pshufd $78,%xmm11,%xmm12 - pxor %xmm11,%xmm12 -.byte 102,68,15,58,68,222,0 -.byte 102,68,15,58,68,238,17 -.byte 102,68,15,58,68,231,16 - xorps %xmm11,%xmm3 - xorps %xmm13,%xmm5 - movups 80(%rsi),%xmm7 - xorps %xmm12,%xmm4 - - movdqu 16(%rdx),%xmm11 - movdqu 0(%rdx),%xmm8 -.byte 102,69,15,56,0,218 -.byte 102,69,15,56,0,194 - movdqa %xmm11,%xmm13 - pshufd $78,%xmm11,%xmm12 - 
pxor %xmm8,%xmm0 - pxor %xmm11,%xmm12 -.byte 102,69,15,58,68,222,0 - movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm8 - pxor %xmm0,%xmm8 -.byte 102,69,15,58,68,238,17 -.byte 102,68,15,58,68,231,0 - xorps %xmm11,%xmm3 - xorps %xmm13,%xmm5 - - leaq 64(%rdx),%rdx - subq $0x40,%rcx - jc L$tail4x - - jmp L$mod4_loop -.p2align 5 -L$mod4_loop: -.byte 102,65,15,58,68,199,0 - xorps %xmm12,%xmm4 - movdqu 48(%rdx),%xmm11 -.byte 102,69,15,56,0,218 -.byte 102,65,15,58,68,207,17 - xorps %xmm3,%xmm0 - movdqu 32(%rdx),%xmm3 - movdqa %xmm11,%xmm13 -.byte 102,68,15,58,68,199,16 - pshufd $78,%xmm11,%xmm12 - xorps %xmm5,%xmm1 - pxor %xmm11,%xmm12 -.byte 102,65,15,56,0,218 - movups 32(%rsi),%xmm7 - xorps %xmm4,%xmm8 -.byte 102,68,15,58,68,218,0 - pshufd $78,%xmm3,%xmm4 - - pxor %xmm0,%xmm8 - movdqa %xmm3,%xmm5 - pxor %xmm1,%xmm8 - pxor %xmm3,%xmm4 - movdqa %xmm8,%xmm9 -.byte 102,68,15,58,68,234,17 - pslldq $8,%xmm8 - psrldq $8,%xmm9 - pxor %xmm8,%xmm0 - movdqa L$7_mask(%rip),%xmm8 - pxor %xmm9,%xmm1 -.byte 102,76,15,110,200 - - pand %xmm0,%xmm8 -.byte 102,69,15,56,0,200 - pxor %xmm0,%xmm9 -.byte 102,68,15,58,68,231,0 - psllq $57,%xmm9 - movdqa %xmm9,%xmm8 - pslldq $8,%xmm9 -.byte 102,15,58,68,222,0 - psrldq $8,%xmm8 - pxor %xmm9,%xmm0 - pxor %xmm8,%xmm1 - movdqu 0(%rdx),%xmm8 - - movdqa %xmm0,%xmm9 - psrlq $1,%xmm0 -.byte 102,15,58,68,238,17 - xorps %xmm11,%xmm3 - movdqu 16(%rdx),%xmm11 -.byte 102,69,15,56,0,218 -.byte 102,15,58,68,231,16 - xorps %xmm13,%xmm5 - movups 80(%rsi),%xmm7 -.byte 102,69,15,56,0,194 - pxor %xmm9,%xmm1 - pxor %xmm0,%xmm9 - psrlq $5,%xmm0 - - movdqa %xmm11,%xmm13 - pxor %xmm12,%xmm4 - pshufd $78,%xmm11,%xmm12 - pxor %xmm9,%xmm0 - pxor %xmm8,%xmm1 - pxor %xmm11,%xmm12 -.byte 102,69,15,58,68,222,0 - psrlq $1,%xmm0 - pxor %xmm1,%xmm0 - movdqa %xmm0,%xmm1 -.byte 102,69,15,58,68,238,17 - xorps %xmm11,%xmm3 - pshufd $78,%xmm0,%xmm8 - pxor %xmm0,%xmm8 - -.byte 102,68,15,58,68,231,0 - xorps %xmm13,%xmm5 - - leaq 64(%rdx),%rdx - subq $0x40,%rcx - jnc L$mod4_loop - -L$tail4x: 
-.byte 102,65,15,58,68,199,0 -.byte 102,65,15,58,68,207,17 -.byte 102,68,15,58,68,199,16 - xorps %xmm12,%xmm4 - xorps %xmm3,%xmm0 - xorps %xmm5,%xmm1 - pxor %xmm0,%xmm1 - pxor %xmm4,%xmm8 - - pxor %xmm1,%xmm8 - pxor %xmm0,%xmm1 - - movdqa %xmm8,%xmm9 - psrldq $8,%xmm8 - pslldq $8,%xmm9 - pxor %xmm8,%xmm1 - pxor %xmm9,%xmm0 - - movdqa %xmm0,%xmm4 - movdqa %xmm0,%xmm3 - psllq $5,%xmm0 - pxor %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 - psllq $57,%xmm0 - movdqa %xmm0,%xmm3 - pslldq $8,%xmm0 - psrldq $8,%xmm3 - pxor %xmm4,%xmm0 - pxor %xmm3,%xmm1 - - - movdqa %xmm0,%xmm4 - psrlq $1,%xmm0 - pxor %xmm4,%xmm1 - pxor %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 - psrlq $1,%xmm0 - pxor %xmm1,%xmm0 - addq $0x40,%rcx - jz L$done - movdqu 32(%rsi),%xmm7 - subq $0x10,%rcx - jz L$odd_tail -L$skip4x: - - - - - - movdqu (%rdx),%xmm8 - movdqu 16(%rdx),%xmm3 -.byte 102,69,15,56,0,194 -.byte 102,65,15,56,0,218 - pxor %xmm8,%xmm0 - - movdqa %xmm3,%xmm5 - pshufd $78,%xmm3,%xmm4 - pxor %xmm3,%xmm4 -.byte 102,15,58,68,218,0 -.byte 102,15,58,68,234,17 -.byte 102,15,58,68,231,0 - - leaq 32(%rdx),%rdx - nop - subq $0x20,%rcx - jbe L$even_tail - nop - jmp L$mod_loop - -.p2align 5 -L$mod_loop: - movdqa %xmm0,%xmm1 - movdqa %xmm4,%xmm8 - pshufd $78,%xmm0,%xmm4 - pxor %xmm0,%xmm4 - -.byte 102,15,58,68,198,0 -.byte 102,15,58,68,206,17 -.byte 102,15,58,68,231,16 - - pxor %xmm3,%xmm0 - pxor %xmm5,%xmm1 - movdqu (%rdx),%xmm9 - pxor %xmm0,%xmm8 -.byte 102,69,15,56,0,202 - movdqu 16(%rdx),%xmm3 - - pxor %xmm1,%xmm8 - pxor %xmm9,%xmm1 - pxor %xmm8,%xmm4 -.byte 102,65,15,56,0,218 - movdqa %xmm4,%xmm8 - psrldq $8,%xmm8 - pslldq $8,%xmm4 - pxor %xmm8,%xmm1 - pxor %xmm4,%xmm0 - - movdqa %xmm3,%xmm5 - - movdqa %xmm0,%xmm9 - movdqa %xmm0,%xmm8 - psllq $5,%xmm0 - pxor %xmm0,%xmm8 -.byte 102,15,58,68,218,0 - psllq $1,%xmm0 - pxor %xmm8,%xmm0 - psllq $57,%xmm0 - movdqa %xmm0,%xmm8 - pslldq $8,%xmm0 - psrldq $8,%xmm8 - pxor %xmm9,%xmm0 - pshufd $78,%xmm5,%xmm4 - pxor %xmm8,%xmm1 - pxor %xmm5,%xmm4 - - 
movdqa %xmm0,%xmm9 - psrlq $1,%xmm0 -.byte 102,15,58,68,234,17 - pxor %xmm9,%xmm1 - pxor %xmm0,%xmm9 - psrlq $5,%xmm0 - pxor %xmm9,%xmm0 - leaq 32(%rdx),%rdx - psrlq $1,%xmm0 -.byte 102,15,58,68,231,0 - pxor %xmm1,%xmm0 - - subq $0x20,%rcx - ja L$mod_loop - -L$even_tail: - movdqa %xmm0,%xmm1 - movdqa %xmm4,%xmm8 - pshufd $78,%xmm0,%xmm4 - pxor %xmm0,%xmm4 - -.byte 102,15,58,68,198,0 -.byte 102,15,58,68,206,17 -.byte 102,15,58,68,231,16 - - pxor %xmm3,%xmm0 - pxor %xmm5,%xmm1 - pxor %xmm0,%xmm8 - pxor %xmm1,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm8 - psrldq $8,%xmm8 - pslldq $8,%xmm4 - pxor %xmm8,%xmm1 - pxor %xmm4,%xmm0 - - movdqa %xmm0,%xmm4 - movdqa %xmm0,%xmm3 - psllq $5,%xmm0 - pxor %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 - psllq $57,%xmm0 - movdqa %xmm0,%xmm3 - pslldq $8,%xmm0 - psrldq $8,%xmm3 - pxor %xmm4,%xmm0 - pxor %xmm3,%xmm1 - - - movdqa %xmm0,%xmm4 - psrlq $1,%xmm0 - pxor %xmm4,%xmm1 - pxor %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 - psrlq $1,%xmm0 - pxor %xmm1,%xmm0 - testq %rcx,%rcx - jnz L$done - -L$odd_tail: - movdqu (%rdx),%xmm8 -.byte 102,69,15,56,0,194 - pxor %xmm8,%xmm0 - movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 - pxor %xmm0,%xmm3 -.byte 102,15,58,68,194,0 -.byte 102,15,58,68,202,17 -.byte 102,15,58,68,223,0 - pxor %xmm0,%xmm3 - pxor %xmm1,%xmm3 - - movdqa %xmm3,%xmm4 - psrldq $8,%xmm3 - pslldq $8,%xmm4 - pxor %xmm3,%xmm1 - pxor %xmm4,%xmm0 - - movdqa %xmm0,%xmm4 - movdqa %xmm0,%xmm3 - psllq $5,%xmm0 - pxor %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 - psllq $57,%xmm0 - movdqa %xmm0,%xmm3 - pslldq $8,%xmm0 - psrldq $8,%xmm3 - pxor %xmm4,%xmm0 - pxor %xmm3,%xmm1 - - - movdqa %xmm0,%xmm4 - psrlq $1,%xmm0 - pxor %xmm4,%xmm1 - pxor %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 - psrlq $1,%xmm0 - pxor %xmm1,%xmm0 -L$done: -.byte 102,65,15,56,0,194 - movdqu %xmm0,(%rdi) - .byte 0xf3,0xc3 - - -.globl _gcm_init_avx -.private_extern _gcm_init_avx - -.p2align 5 -_gcm_init_avx: - - vzeroupper - - vmovdqu (%rsi),%xmm2 - vpshufd 
$78,%xmm2,%xmm2 - - - vpshufd $255,%xmm2,%xmm4 - vpsrlq $63,%xmm2,%xmm3 - vpsllq $1,%xmm2,%xmm2 - vpxor %xmm5,%xmm5,%xmm5 - vpcmpgtd %xmm4,%xmm5,%xmm5 - vpslldq $8,%xmm3,%xmm3 - vpor %xmm3,%xmm2,%xmm2 - - - vpand L$0x1c2_polynomial(%rip),%xmm5,%xmm5 - vpxor %xmm5,%xmm2,%xmm2 - - vpunpckhqdq %xmm2,%xmm2,%xmm6 - vmovdqa %xmm2,%xmm0 - vpxor %xmm2,%xmm6,%xmm6 - movq $4,%r10 - jmp L$init_start_avx -.p2align 5 -L$init_loop_avx: - vpalignr $8,%xmm3,%xmm4,%xmm5 - vmovdqu %xmm5,-16(%rdi) - vpunpckhqdq %xmm0,%xmm0,%xmm3 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 - vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 - vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 - vpxor %xmm0,%xmm1,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - - vpslldq $8,%xmm3,%xmm4 - vpsrldq $8,%xmm3,%xmm3 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm3,%xmm1,%xmm1 - vpsllq $57,%xmm0,%xmm3 - vpsllq $62,%xmm0,%xmm4 - vpxor %xmm3,%xmm4,%xmm4 - vpsllq $63,%xmm0,%xmm3 - vpxor %xmm3,%xmm4,%xmm4 - vpslldq $8,%xmm4,%xmm3 - vpsrldq $8,%xmm4,%xmm4 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm4,%xmm1,%xmm1 - - vpsrlq $1,%xmm0,%xmm4 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpsrlq $5,%xmm4,%xmm4 - vpxor %xmm4,%xmm0,%xmm0 - vpsrlq $1,%xmm0,%xmm0 - vpxor %xmm1,%xmm0,%xmm0 -L$init_start_avx: - vmovdqa %xmm0,%xmm5 - vpunpckhqdq %xmm0,%xmm0,%xmm3 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 - vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 - vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 - vpxor %xmm0,%xmm1,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - - vpslldq $8,%xmm3,%xmm4 - vpsrldq $8,%xmm3,%xmm3 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm3,%xmm1,%xmm1 - vpsllq $57,%xmm0,%xmm3 - vpsllq $62,%xmm0,%xmm4 - vpxor %xmm3,%xmm4,%xmm4 - vpsllq $63,%xmm0,%xmm3 - vpxor %xmm3,%xmm4,%xmm4 - vpslldq $8,%xmm4,%xmm3 - vpsrldq $8,%xmm4,%xmm4 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm4,%xmm1,%xmm1 - - vpsrlq $1,%xmm0,%xmm4 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpsrlq $5,%xmm4,%xmm4 - vpxor %xmm4,%xmm0,%xmm0 - vpsrlq $1,%xmm0,%xmm0 - vpxor %xmm1,%xmm0,%xmm0 - vpshufd 
$78,%xmm5,%xmm3 - vpshufd $78,%xmm0,%xmm4 - vpxor %xmm5,%xmm3,%xmm3 - vmovdqu %xmm5,0(%rdi) - vpxor %xmm0,%xmm4,%xmm4 - vmovdqu %xmm0,16(%rdi) - leaq 48(%rdi),%rdi - subq $1,%r10 - jnz L$init_loop_avx - - vpalignr $8,%xmm4,%xmm3,%xmm5 - vmovdqu %xmm5,-16(%rdi) - - vzeroupper - .byte 0xf3,0xc3 - - -.globl _gcm_gmult_avx -.private_extern _gcm_gmult_avx - -.p2align 5 -_gcm_gmult_avx: - - jmp L$_gmult_clmul - - -.globl _gcm_ghash_avx -.private_extern _gcm_ghash_avx - -.p2align 5 -_gcm_ghash_avx: - - vzeroupper - - vmovdqu (%rdi),%xmm10 - leaq L$0x1c2_polynomial(%rip),%r10 - leaq 64(%rsi),%rsi - vmovdqu L$bswap_mask(%rip),%xmm13 - vpshufb %xmm13,%xmm10,%xmm10 - cmpq $0x80,%rcx - jb L$short_avx - subq $0x80,%rcx - - vmovdqu 112(%rdx),%xmm14 - vmovdqu 0-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm14 - vmovdqu 32-64(%rsi),%xmm7 - - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vmovdqu 96(%rdx),%xmm15 - vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 - vpxor %xmm14,%xmm9,%xmm9 - vpshufb %xmm13,%xmm15,%xmm15 - vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 - vmovdqu 16-64(%rsi),%xmm6 - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vmovdqu 80(%rdx),%xmm14 - vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 - vpxor %xmm15,%xmm8,%xmm8 - - vpshufb %xmm13,%xmm14,%xmm14 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 - vmovdqu 48-64(%rsi),%xmm6 - vpxor %xmm14,%xmm9,%xmm9 - vmovdqu 64(%rdx),%xmm15 - vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 - vmovdqu 80-64(%rsi),%xmm7 - - vpshufb %xmm13,%xmm15,%xmm15 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 - vpxor %xmm1,%xmm4,%xmm4 - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 - vmovdqu 64-64(%rsi),%xmm6 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 - vpxor %xmm15,%xmm8,%xmm8 - - vmovdqu 48(%rdx),%xmm14 - vpxor %xmm3,%xmm0,%xmm0 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 - vpxor %xmm4,%xmm1,%xmm1 - vpshufb %xmm13,%xmm14,%xmm14 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 - vmovdqu 96-64(%rsi),%xmm6 - 
vpxor %xmm5,%xmm2,%xmm2 - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 - vmovdqu 128-64(%rsi),%xmm7 - vpxor %xmm14,%xmm9,%xmm9 - - vmovdqu 32(%rdx),%xmm15 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 - vpxor %xmm1,%xmm4,%xmm4 - vpshufb %xmm13,%xmm15,%xmm15 - vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 - vmovdqu 112-64(%rsi),%xmm6 - vpxor %xmm2,%xmm5,%xmm5 - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 - vpxor %xmm15,%xmm8,%xmm8 - - vmovdqu 16(%rdx),%xmm14 - vpxor %xmm3,%xmm0,%xmm0 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 - vpxor %xmm4,%xmm1,%xmm1 - vpshufb %xmm13,%xmm14,%xmm14 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 - vmovdqu 144-64(%rsi),%xmm6 - vpxor %xmm5,%xmm2,%xmm2 - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 - vmovdqu 176-64(%rsi),%xmm7 - vpxor %xmm14,%xmm9,%xmm9 - - vmovdqu (%rdx),%xmm15 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 - vpxor %xmm1,%xmm4,%xmm4 - vpshufb %xmm13,%xmm15,%xmm15 - vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 - vmovdqu 160-64(%rsi),%xmm6 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 - - leaq 128(%rdx),%rdx - cmpq $0x80,%rcx - jb L$tail_avx - - vpxor %xmm10,%xmm15,%xmm15 - subq $0x80,%rcx - jmp L$oop8x_avx - -.p2align 5 -L$oop8x_avx: - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vmovdqu 112(%rdx),%xmm14 - vpxor %xmm0,%xmm3,%xmm3 - vpxor %xmm15,%xmm8,%xmm8 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10 - vpshufb %xmm13,%xmm14,%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11 - vmovdqu 0-64(%rsi),%xmm6 - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12 - vmovdqu 32-64(%rsi),%xmm7 - vpxor %xmm14,%xmm9,%xmm9 - - vmovdqu 96(%rdx),%xmm15 - vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 - vpxor %xmm3,%xmm10,%xmm10 - vpshufb %xmm13,%xmm15,%xmm15 - vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 - vxorps %xmm4,%xmm11,%xmm11 - vmovdqu 16-64(%rsi),%xmm6 - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpclmulqdq 
$0x00,%xmm7,%xmm9,%xmm2 - vpxor %xmm5,%xmm12,%xmm12 - vxorps %xmm15,%xmm8,%xmm8 - - vmovdqu 80(%rdx),%xmm14 - vpxor %xmm10,%xmm12,%xmm12 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 - vpxor %xmm11,%xmm12,%xmm12 - vpslldq $8,%xmm12,%xmm9 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 - vpsrldq $8,%xmm12,%xmm12 - vpxor %xmm9,%xmm10,%xmm10 - vmovdqu 48-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm14 - vxorps %xmm12,%xmm11,%xmm11 - vpxor %xmm1,%xmm4,%xmm4 - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 - vmovdqu 80-64(%rsi),%xmm7 - vpxor %xmm14,%xmm9,%xmm9 - vpxor %xmm2,%xmm5,%xmm5 - - vmovdqu 64(%rdx),%xmm15 - vpalignr $8,%xmm10,%xmm10,%xmm12 - vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 - vpshufb %xmm13,%xmm15,%xmm15 - vpxor %xmm3,%xmm0,%xmm0 - vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 - vmovdqu 64-64(%rsi),%xmm6 - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 - vxorps %xmm15,%xmm8,%xmm8 - vpxor %xmm5,%xmm2,%xmm2 - - vmovdqu 48(%rdx),%xmm14 - vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 - vpshufb %xmm13,%xmm14,%xmm14 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 - vmovdqu 96-64(%rsi),%xmm6 - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 - vmovdqu 128-64(%rsi),%xmm7 - vpxor %xmm14,%xmm9,%xmm9 - vpxor %xmm2,%xmm5,%xmm5 - - vmovdqu 32(%rdx),%xmm15 - vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 - vpshufb %xmm13,%xmm15,%xmm15 - vpxor %xmm3,%xmm0,%xmm0 - vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 - vmovdqu 112-64(%rsi),%xmm6 - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 - vpxor %xmm15,%xmm8,%xmm8 - vpxor %xmm5,%xmm2,%xmm2 - vxorps %xmm12,%xmm10,%xmm10 - - vmovdqu 16(%rdx),%xmm14 - vpalignr $8,%xmm10,%xmm10,%xmm12 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 - vpshufb %xmm13,%xmm14,%xmm14 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 - vmovdqu 144-64(%rsi),%xmm6 - 
vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 - vxorps %xmm11,%xmm12,%xmm12 - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 - vmovdqu 176-64(%rsi),%xmm7 - vpxor %xmm14,%xmm9,%xmm9 - vpxor %xmm2,%xmm5,%xmm5 - - vmovdqu (%rdx),%xmm15 - vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 - vpshufb %xmm13,%xmm15,%xmm15 - vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 - vmovdqu 160-64(%rsi),%xmm6 - vpxor %xmm12,%xmm15,%xmm15 - vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 - vpxor %xmm10,%xmm15,%xmm15 - - leaq 128(%rdx),%rdx - subq $0x80,%rcx - jnc L$oop8x_avx - - addq $0x80,%rcx - jmp L$tail_no_xor_avx - -.p2align 5 -L$short_avx: - vmovdqu -16(%rdx,%rcx,1),%xmm14 - leaq (%rdx,%rcx,1),%rdx - vmovdqu 0-64(%rsi),%xmm6 - vmovdqu 32-64(%rsi),%xmm7 - vpshufb %xmm13,%xmm14,%xmm15 - - vmovdqa %xmm0,%xmm3 - vmovdqa %xmm1,%xmm4 - vmovdqa %xmm2,%xmm5 - subq $0x10,%rcx - jz L$tail_avx - - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 - vpxor %xmm15,%xmm8,%xmm8 - vmovdqu -32(%rdx),%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 - vmovdqu 16-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm15 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 - vpsrldq $8,%xmm7,%xmm7 - subq $0x10,%rcx - jz L$tail_avx - - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 - vpxor %xmm15,%xmm8,%xmm8 - vmovdqu -48(%rdx),%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 - vmovdqu 48-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm15 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 - vmovdqu 80-64(%rsi),%xmm7 - subq $0x10,%rcx - jz L$tail_avx - - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 - vpxor %xmm15,%xmm8,%xmm8 - vmovdqu -64(%rdx),%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 - vmovdqu 64-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm15 - vpxor %xmm2,%xmm5,%xmm5 - 
vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 - vpsrldq $8,%xmm7,%xmm7 - subq $0x10,%rcx - jz L$tail_avx - - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 - vpxor %xmm15,%xmm8,%xmm8 - vmovdqu -80(%rdx),%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 - vmovdqu 96-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm15 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 - vmovdqu 128-64(%rsi),%xmm7 - subq $0x10,%rcx - jz L$tail_avx - - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 - vpxor %xmm15,%xmm8,%xmm8 - vmovdqu -96(%rdx),%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 - vmovdqu 112-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm15 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 - vpsrldq $8,%xmm7,%xmm7 - subq $0x10,%rcx - jz L$tail_avx - - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 - vpxor %xmm15,%xmm8,%xmm8 - vmovdqu -112(%rdx),%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 - vmovdqu 144-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm15 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 - vmovq 184-64(%rsi),%xmm7 - subq $0x10,%rcx - jmp L$tail_avx - -.p2align 5 -L$tail_avx: - vpxor %xmm10,%xmm15,%xmm15 -L$tail_no_xor_avx: - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 - vpxor %xmm15,%xmm8,%xmm8 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 - - vmovdqu (%r10),%xmm12 - - vpxor %xmm0,%xmm3,%xmm10 - vpxor %xmm1,%xmm4,%xmm11 - vpxor %xmm2,%xmm5,%xmm5 - - vpxor %xmm10,%xmm5,%xmm5 - vpxor %xmm11,%xmm5,%xmm5 - vpslldq $8,%xmm5,%xmm9 - vpsrldq $8,%xmm5,%xmm5 - vpxor %xmm9,%xmm10,%xmm10 - vpxor %xmm5,%xmm11,%xmm11 - - vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 - vpalignr $8,%xmm10,%xmm10,%xmm10 - vpxor 
%xmm9,%xmm10,%xmm10 - - vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 - vpalignr $8,%xmm10,%xmm10,%xmm10 - vpxor %xmm11,%xmm10,%xmm10 - vpxor %xmm9,%xmm10,%xmm10 - - cmpq $0,%rcx - jne L$short_avx - - vpshufb %xmm13,%xmm10,%xmm10 - vmovdqu %xmm10,(%rdi) - vzeroupper - .byte 0xf3,0xc3 - - -.p2align 6 -L$bswap_mask: -.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 -L$0x1c2_polynomial: -.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 -L$7_mask: -.long 7,0,7,0 -.p2align 6 - -.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.p2align 6 -#endif diff --git a/third_party/boringssl/apple-x86_64/crypto/fipsmodule/md5-x86_64.S b/third_party/boringssl/apple-x86_64/crypto/fipsmodule/md5-x86_64.S deleted file mode 100644 index 06e3ba06..00000000 --- a/third_party/boringssl/apple-x86_64/crypto/fipsmodule/md5-x86_64.S +++ /dev/null @@ -1,696 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text -.p2align 4 - -.globl _md5_block_asm_data_order -.private_extern _md5_block_asm_data_order - -_md5_block_asm_data_order: - - pushq %rbp - - pushq %rbx - - pushq %r12 - - pushq %r14 - - pushq %r15 - -L$prologue: - - - - - movq %rdi,%rbp - shlq $6,%rdx - leaq (%rsi,%rdx,1),%rdi - movl 0(%rbp),%eax - movl 4(%rbp),%ebx - movl 8(%rbp),%ecx - movl 12(%rbp),%edx - - - - - - - - cmpq %rdi,%rsi - je L$end - - -L$loop: - movl %eax,%r8d - movl %ebx,%r9d - movl %ecx,%r14d - movl %edx,%r15d - movl 0(%rsi),%r10d - movl %edx,%r11d - xorl %ecx,%r11d - leal -680876936(%rax,%r10,1),%eax - andl %ebx,%r11d - xorl %edx,%r11d - movl 4(%rsi),%r10d - addl %r11d,%eax - roll $7,%eax - movl %ecx,%r11d - addl %ebx,%eax - xorl %ebx,%r11d - leal -389564586(%rdx,%r10,1),%edx - andl %eax,%r11d - xorl %ecx,%r11d - movl 8(%rsi),%r10d - addl %r11d,%edx - roll $12,%edx - movl %ebx,%r11d - addl %eax,%edx - xorl %eax,%r11d - leal 606105819(%rcx,%r10,1),%ecx - andl %edx,%r11d - xorl %ebx,%r11d - movl 12(%rsi),%r10d - addl %r11d,%ecx - roll $17,%ecx - movl %eax,%r11d - addl %edx,%ecx - xorl %edx,%r11d - leal -1044525330(%rbx,%r10,1),%ebx - andl %ecx,%r11d - xorl %eax,%r11d - movl 16(%rsi),%r10d - addl %r11d,%ebx - roll $22,%ebx - movl %edx,%r11d - addl %ecx,%ebx - xorl %ecx,%r11d - leal -176418897(%rax,%r10,1),%eax - andl %ebx,%r11d - xorl %edx,%r11d - movl 20(%rsi),%r10d - addl %r11d,%eax - roll $7,%eax - movl %ecx,%r11d - addl %ebx,%eax - xorl %ebx,%r11d - leal 1200080426(%rdx,%r10,1),%edx - andl %eax,%r11d - xorl %ecx,%r11d - movl 24(%rsi),%r10d - addl %r11d,%edx - roll $12,%edx - movl %ebx,%r11d - addl %eax,%edx - xorl %eax,%r11d - leal -1473231341(%rcx,%r10,1),%ecx - andl %edx,%r11d - xorl %ebx,%r11d - movl 28(%rsi),%r10d - addl %r11d,%ecx - roll $17,%ecx - 
movl %eax,%r11d - addl %edx,%ecx - xorl %edx,%r11d - leal -45705983(%rbx,%r10,1),%ebx - andl %ecx,%r11d - xorl %eax,%r11d - movl 32(%rsi),%r10d - addl %r11d,%ebx - roll $22,%ebx - movl %edx,%r11d - addl %ecx,%ebx - xorl %ecx,%r11d - leal 1770035416(%rax,%r10,1),%eax - andl %ebx,%r11d - xorl %edx,%r11d - movl 36(%rsi),%r10d - addl %r11d,%eax - roll $7,%eax - movl %ecx,%r11d - addl %ebx,%eax - xorl %ebx,%r11d - leal -1958414417(%rdx,%r10,1),%edx - andl %eax,%r11d - xorl %ecx,%r11d - movl 40(%rsi),%r10d - addl %r11d,%edx - roll $12,%edx - movl %ebx,%r11d - addl %eax,%edx - xorl %eax,%r11d - leal -42063(%rcx,%r10,1),%ecx - andl %edx,%r11d - xorl %ebx,%r11d - movl 44(%rsi),%r10d - addl %r11d,%ecx - roll $17,%ecx - movl %eax,%r11d - addl %edx,%ecx - xorl %edx,%r11d - leal -1990404162(%rbx,%r10,1),%ebx - andl %ecx,%r11d - xorl %eax,%r11d - movl 48(%rsi),%r10d - addl %r11d,%ebx - roll $22,%ebx - movl %edx,%r11d - addl %ecx,%ebx - xorl %ecx,%r11d - leal 1804603682(%rax,%r10,1),%eax - andl %ebx,%r11d - xorl %edx,%r11d - movl 52(%rsi),%r10d - addl %r11d,%eax - roll $7,%eax - movl %ecx,%r11d - addl %ebx,%eax - xorl %ebx,%r11d - leal -40341101(%rdx,%r10,1),%edx - andl %eax,%r11d - xorl %ecx,%r11d - movl 56(%rsi),%r10d - addl %r11d,%edx - roll $12,%edx - movl %ebx,%r11d - addl %eax,%edx - xorl %eax,%r11d - leal -1502002290(%rcx,%r10,1),%ecx - andl %edx,%r11d - xorl %ebx,%r11d - movl 60(%rsi),%r10d - addl %r11d,%ecx - roll $17,%ecx - movl %eax,%r11d - addl %edx,%ecx - xorl %edx,%r11d - leal 1236535329(%rbx,%r10,1),%ebx - andl %ecx,%r11d - xorl %eax,%r11d - movl 0(%rsi),%r10d - addl %r11d,%ebx - roll $22,%ebx - movl %edx,%r11d - addl %ecx,%ebx - movl 4(%rsi),%r10d - movl %edx,%r11d - movl %edx,%r12d - notl %r11d - leal -165796510(%rax,%r10,1),%eax - andl %ebx,%r12d - andl %ecx,%r11d - movl 24(%rsi),%r10d - orl %r11d,%r12d - movl %ecx,%r11d - addl %r12d,%eax - movl %ecx,%r12d - roll $5,%eax - addl %ebx,%eax - notl %r11d - leal -1069501632(%rdx,%r10,1),%edx - andl %eax,%r12d - andl 
%ebx,%r11d - movl 44(%rsi),%r10d - orl %r11d,%r12d - movl %ebx,%r11d - addl %r12d,%edx - movl %ebx,%r12d - roll $9,%edx - addl %eax,%edx - notl %r11d - leal 643717713(%rcx,%r10,1),%ecx - andl %edx,%r12d - andl %eax,%r11d - movl 0(%rsi),%r10d - orl %r11d,%r12d - movl %eax,%r11d - addl %r12d,%ecx - movl %eax,%r12d - roll $14,%ecx - addl %edx,%ecx - notl %r11d - leal -373897302(%rbx,%r10,1),%ebx - andl %ecx,%r12d - andl %edx,%r11d - movl 20(%rsi),%r10d - orl %r11d,%r12d - movl %edx,%r11d - addl %r12d,%ebx - movl %edx,%r12d - roll $20,%ebx - addl %ecx,%ebx - notl %r11d - leal -701558691(%rax,%r10,1),%eax - andl %ebx,%r12d - andl %ecx,%r11d - movl 40(%rsi),%r10d - orl %r11d,%r12d - movl %ecx,%r11d - addl %r12d,%eax - movl %ecx,%r12d - roll $5,%eax - addl %ebx,%eax - notl %r11d - leal 38016083(%rdx,%r10,1),%edx - andl %eax,%r12d - andl %ebx,%r11d - movl 60(%rsi),%r10d - orl %r11d,%r12d - movl %ebx,%r11d - addl %r12d,%edx - movl %ebx,%r12d - roll $9,%edx - addl %eax,%edx - notl %r11d - leal -660478335(%rcx,%r10,1),%ecx - andl %edx,%r12d - andl %eax,%r11d - movl 16(%rsi),%r10d - orl %r11d,%r12d - movl %eax,%r11d - addl %r12d,%ecx - movl %eax,%r12d - roll $14,%ecx - addl %edx,%ecx - notl %r11d - leal -405537848(%rbx,%r10,1),%ebx - andl %ecx,%r12d - andl %edx,%r11d - movl 36(%rsi),%r10d - orl %r11d,%r12d - movl %edx,%r11d - addl %r12d,%ebx - movl %edx,%r12d - roll $20,%ebx - addl %ecx,%ebx - notl %r11d - leal 568446438(%rax,%r10,1),%eax - andl %ebx,%r12d - andl %ecx,%r11d - movl 56(%rsi),%r10d - orl %r11d,%r12d - movl %ecx,%r11d - addl %r12d,%eax - movl %ecx,%r12d - roll $5,%eax - addl %ebx,%eax - notl %r11d - leal -1019803690(%rdx,%r10,1),%edx - andl %eax,%r12d - andl %ebx,%r11d - movl 12(%rsi),%r10d - orl %r11d,%r12d - movl %ebx,%r11d - addl %r12d,%edx - movl %ebx,%r12d - roll $9,%edx - addl %eax,%edx - notl %r11d - leal -187363961(%rcx,%r10,1),%ecx - andl %edx,%r12d - andl %eax,%r11d - movl 32(%rsi),%r10d - orl %r11d,%r12d - movl %eax,%r11d - addl %r12d,%ecx - movl 
%eax,%r12d - roll $14,%ecx - addl %edx,%ecx - notl %r11d - leal 1163531501(%rbx,%r10,1),%ebx - andl %ecx,%r12d - andl %edx,%r11d - movl 52(%rsi),%r10d - orl %r11d,%r12d - movl %edx,%r11d - addl %r12d,%ebx - movl %edx,%r12d - roll $20,%ebx - addl %ecx,%ebx - notl %r11d - leal -1444681467(%rax,%r10,1),%eax - andl %ebx,%r12d - andl %ecx,%r11d - movl 8(%rsi),%r10d - orl %r11d,%r12d - movl %ecx,%r11d - addl %r12d,%eax - movl %ecx,%r12d - roll $5,%eax - addl %ebx,%eax - notl %r11d - leal -51403784(%rdx,%r10,1),%edx - andl %eax,%r12d - andl %ebx,%r11d - movl 28(%rsi),%r10d - orl %r11d,%r12d - movl %ebx,%r11d - addl %r12d,%edx - movl %ebx,%r12d - roll $9,%edx - addl %eax,%edx - notl %r11d - leal 1735328473(%rcx,%r10,1),%ecx - andl %edx,%r12d - andl %eax,%r11d - movl 48(%rsi),%r10d - orl %r11d,%r12d - movl %eax,%r11d - addl %r12d,%ecx - movl %eax,%r12d - roll $14,%ecx - addl %edx,%ecx - notl %r11d - leal -1926607734(%rbx,%r10,1),%ebx - andl %ecx,%r12d - andl %edx,%r11d - movl 0(%rsi),%r10d - orl %r11d,%r12d - movl %edx,%r11d - addl %r12d,%ebx - movl %edx,%r12d - roll $20,%ebx - addl %ecx,%ebx - movl 20(%rsi),%r10d - movl %ecx,%r11d - leal -378558(%rax,%r10,1),%eax - movl 32(%rsi),%r10d - xorl %edx,%r11d - xorl %ebx,%r11d - addl %r11d,%eax - roll $4,%eax - movl %ebx,%r11d - addl %ebx,%eax - leal -2022574463(%rdx,%r10,1),%edx - movl 44(%rsi),%r10d - xorl %ecx,%r11d - xorl %eax,%r11d - addl %r11d,%edx - roll $11,%edx - movl %eax,%r11d - addl %eax,%edx - leal 1839030562(%rcx,%r10,1),%ecx - movl 56(%rsi),%r10d - xorl %ebx,%r11d - xorl %edx,%r11d - addl %r11d,%ecx - roll $16,%ecx - movl %edx,%r11d - addl %edx,%ecx - leal -35309556(%rbx,%r10,1),%ebx - movl 4(%rsi),%r10d - xorl %eax,%r11d - xorl %ecx,%r11d - addl %r11d,%ebx - roll $23,%ebx - movl %ecx,%r11d - addl %ecx,%ebx - leal -1530992060(%rax,%r10,1),%eax - movl 16(%rsi),%r10d - xorl %edx,%r11d - xorl %ebx,%r11d - addl %r11d,%eax - roll $4,%eax - movl %ebx,%r11d - addl %ebx,%eax - leal 1272893353(%rdx,%r10,1),%edx - movl 
28(%rsi),%r10d - xorl %ecx,%r11d - xorl %eax,%r11d - addl %r11d,%edx - roll $11,%edx - movl %eax,%r11d - addl %eax,%edx - leal -155497632(%rcx,%r10,1),%ecx - movl 40(%rsi),%r10d - xorl %ebx,%r11d - xorl %edx,%r11d - addl %r11d,%ecx - roll $16,%ecx - movl %edx,%r11d - addl %edx,%ecx - leal -1094730640(%rbx,%r10,1),%ebx - movl 52(%rsi),%r10d - xorl %eax,%r11d - xorl %ecx,%r11d - addl %r11d,%ebx - roll $23,%ebx - movl %ecx,%r11d - addl %ecx,%ebx - leal 681279174(%rax,%r10,1),%eax - movl 0(%rsi),%r10d - xorl %edx,%r11d - xorl %ebx,%r11d - addl %r11d,%eax - roll $4,%eax - movl %ebx,%r11d - addl %ebx,%eax - leal -358537222(%rdx,%r10,1),%edx - movl 12(%rsi),%r10d - xorl %ecx,%r11d - xorl %eax,%r11d - addl %r11d,%edx - roll $11,%edx - movl %eax,%r11d - addl %eax,%edx - leal -722521979(%rcx,%r10,1),%ecx - movl 24(%rsi),%r10d - xorl %ebx,%r11d - xorl %edx,%r11d - addl %r11d,%ecx - roll $16,%ecx - movl %edx,%r11d - addl %edx,%ecx - leal 76029189(%rbx,%r10,1),%ebx - movl 36(%rsi),%r10d - xorl %eax,%r11d - xorl %ecx,%r11d - addl %r11d,%ebx - roll $23,%ebx - movl %ecx,%r11d - addl %ecx,%ebx - leal -640364487(%rax,%r10,1),%eax - movl 48(%rsi),%r10d - xorl %edx,%r11d - xorl %ebx,%r11d - addl %r11d,%eax - roll $4,%eax - movl %ebx,%r11d - addl %ebx,%eax - leal -421815835(%rdx,%r10,1),%edx - movl 60(%rsi),%r10d - xorl %ecx,%r11d - xorl %eax,%r11d - addl %r11d,%edx - roll $11,%edx - movl %eax,%r11d - addl %eax,%edx - leal 530742520(%rcx,%r10,1),%ecx - movl 8(%rsi),%r10d - xorl %ebx,%r11d - xorl %edx,%r11d - addl %r11d,%ecx - roll $16,%ecx - movl %edx,%r11d - addl %edx,%ecx - leal -995338651(%rbx,%r10,1),%ebx - movl 0(%rsi),%r10d - xorl %eax,%r11d - xorl %ecx,%r11d - addl %r11d,%ebx - roll $23,%ebx - movl %ecx,%r11d - addl %ecx,%ebx - movl 0(%rsi),%r10d - movl $0xffffffff,%r11d - xorl %edx,%r11d - leal -198630844(%rax,%r10,1),%eax - orl %ebx,%r11d - xorl %ecx,%r11d - addl %r11d,%eax - movl 28(%rsi),%r10d - movl $0xffffffff,%r11d - roll $6,%eax - xorl %ecx,%r11d - addl %ebx,%eax - leal 
1126891415(%rdx,%r10,1),%edx - orl %eax,%r11d - xorl %ebx,%r11d - addl %r11d,%edx - movl 56(%rsi),%r10d - movl $0xffffffff,%r11d - roll $10,%edx - xorl %ebx,%r11d - addl %eax,%edx - leal -1416354905(%rcx,%r10,1),%ecx - orl %edx,%r11d - xorl %eax,%r11d - addl %r11d,%ecx - movl 20(%rsi),%r10d - movl $0xffffffff,%r11d - roll $15,%ecx - xorl %eax,%r11d - addl %edx,%ecx - leal -57434055(%rbx,%r10,1),%ebx - orl %ecx,%r11d - xorl %edx,%r11d - addl %r11d,%ebx - movl 48(%rsi),%r10d - movl $0xffffffff,%r11d - roll $21,%ebx - xorl %edx,%r11d - addl %ecx,%ebx - leal 1700485571(%rax,%r10,1),%eax - orl %ebx,%r11d - xorl %ecx,%r11d - addl %r11d,%eax - movl 12(%rsi),%r10d - movl $0xffffffff,%r11d - roll $6,%eax - xorl %ecx,%r11d - addl %ebx,%eax - leal -1894986606(%rdx,%r10,1),%edx - orl %eax,%r11d - xorl %ebx,%r11d - addl %r11d,%edx - movl 40(%rsi),%r10d - movl $0xffffffff,%r11d - roll $10,%edx - xorl %ebx,%r11d - addl %eax,%edx - leal -1051523(%rcx,%r10,1),%ecx - orl %edx,%r11d - xorl %eax,%r11d - addl %r11d,%ecx - movl 4(%rsi),%r10d - movl $0xffffffff,%r11d - roll $15,%ecx - xorl %eax,%r11d - addl %edx,%ecx - leal -2054922799(%rbx,%r10,1),%ebx - orl %ecx,%r11d - xorl %edx,%r11d - addl %r11d,%ebx - movl 32(%rsi),%r10d - movl $0xffffffff,%r11d - roll $21,%ebx - xorl %edx,%r11d - addl %ecx,%ebx - leal 1873313359(%rax,%r10,1),%eax - orl %ebx,%r11d - xorl %ecx,%r11d - addl %r11d,%eax - movl 60(%rsi),%r10d - movl $0xffffffff,%r11d - roll $6,%eax - xorl %ecx,%r11d - addl %ebx,%eax - leal -30611744(%rdx,%r10,1),%edx - orl %eax,%r11d - xorl %ebx,%r11d - addl %r11d,%edx - movl 24(%rsi),%r10d - movl $0xffffffff,%r11d - roll $10,%edx - xorl %ebx,%r11d - addl %eax,%edx - leal -1560198380(%rcx,%r10,1),%ecx - orl %edx,%r11d - xorl %eax,%r11d - addl %r11d,%ecx - movl 52(%rsi),%r10d - movl $0xffffffff,%r11d - roll $15,%ecx - xorl %eax,%r11d - addl %edx,%ecx - leal 1309151649(%rbx,%r10,1),%ebx - orl %ecx,%r11d - xorl %edx,%r11d - addl %r11d,%ebx - movl 16(%rsi),%r10d - movl $0xffffffff,%r11d - 
roll $21,%ebx - xorl %edx,%r11d - addl %ecx,%ebx - leal -145523070(%rax,%r10,1),%eax - orl %ebx,%r11d - xorl %ecx,%r11d - addl %r11d,%eax - movl 44(%rsi),%r10d - movl $0xffffffff,%r11d - roll $6,%eax - xorl %ecx,%r11d - addl %ebx,%eax - leal -1120210379(%rdx,%r10,1),%edx - orl %eax,%r11d - xorl %ebx,%r11d - addl %r11d,%edx - movl 8(%rsi),%r10d - movl $0xffffffff,%r11d - roll $10,%edx - xorl %ebx,%r11d - addl %eax,%edx - leal 718787259(%rcx,%r10,1),%ecx - orl %edx,%r11d - xorl %eax,%r11d - addl %r11d,%ecx - movl 36(%rsi),%r10d - movl $0xffffffff,%r11d - roll $15,%ecx - xorl %eax,%r11d - addl %edx,%ecx - leal -343485551(%rbx,%r10,1),%ebx - orl %ecx,%r11d - xorl %edx,%r11d - addl %r11d,%ebx - movl 0(%rsi),%r10d - movl $0xffffffff,%r11d - roll $21,%ebx - xorl %edx,%r11d - addl %ecx,%ebx - - addl %r8d,%eax - addl %r9d,%ebx - addl %r14d,%ecx - addl %r15d,%edx - - - addq $64,%rsi - cmpq %rdi,%rsi - jb L$loop - - -L$end: - movl %eax,0(%rbp) - movl %ebx,4(%rbp) - movl %ecx,8(%rbp) - movl %edx,12(%rbp) - - movq (%rsp),%r15 - - movq 8(%rsp),%r14 - - movq 16(%rsp),%r12 - - movq 24(%rsp),%rbx - - movq 32(%rsp),%rbp - - addq $40,%rsp - -L$epilogue: - .byte 0xf3,0xc3 - - -#endif diff --git a/third_party/boringssl/apple-x86_64/crypto/fipsmodule/p256-x86_64-asm.S b/third_party/boringssl/apple-x86_64/crypto/fipsmodule/p256-x86_64-asm.S deleted file mode 100644 index 36057aa1..00000000 --- a/third_party/boringssl/apple-x86_64/crypto/fipsmodule/p256-x86_64-asm.S +++ /dev/null @@ -1,4467 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text - - - -.p2align 6 -L$poly: -.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001 - -L$One: -.long 1,1,1,1,1,1,1,1 -L$Two: -.long 2,2,2,2,2,2,2,2 -L$Three: -.long 3,3,3,3,3,3,3,3 -L$ONE_mont: -.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe - - -L$ord: -.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000 -L$ordK: -.quad 0xccd1c8aaee00bc4f - - - -.globl _ecp_nistz256_neg -.private_extern _ecp_nistz256_neg - -.p2align 5 -_ecp_nistz256_neg: - - pushq %r12 - - pushq %r13 - -L$neg_body: - - xorq %r8,%r8 - xorq %r9,%r9 - xorq %r10,%r10 - xorq %r11,%r11 - xorq %r13,%r13 - - subq 0(%rsi),%r8 - sbbq 8(%rsi),%r9 - sbbq 16(%rsi),%r10 - movq %r8,%rax - sbbq 24(%rsi),%r11 - leaq L$poly(%rip),%rsi - movq %r9,%rdx - sbbq $0,%r13 - - addq 0(%rsi),%r8 - movq %r10,%rcx - adcq 8(%rsi),%r9 - adcq 16(%rsi),%r10 - movq %r11,%r12 - adcq 24(%rsi),%r11 - testq %r13,%r13 - - cmovzq %rax,%r8 - cmovzq %rdx,%r9 - movq %r8,0(%rdi) - cmovzq %rcx,%r10 - movq %r9,8(%rdi) - cmovzq %r12,%r11 - movq %r10,16(%rdi) - movq %r11,24(%rdi) - - movq 0(%rsp),%r13 - - movq 8(%rsp),%r12 - - leaq 16(%rsp),%rsp - -L$neg_epilogue: - .byte 0xf3,0xc3 - - - - - - - - -.globl _ecp_nistz256_ord_mul_mont -.private_extern _ecp_nistz256_ord_mul_mont - -.p2align 5 -_ecp_nistz256_ord_mul_mont: - - leaq _OPENSSL_ia32cap_P(%rip),%rcx - movq 8(%rcx),%rcx - andl $0x80100,%ecx - cmpl $0x80100,%ecx - je L$ecp_nistz256_ord_mul_montx - pushq %rbp - - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - -L$ord_mul_body: - - movq 0(%rdx),%rax - movq %rdx,%rbx - leaq L$ord(%rip),%r14 - movq L$ordK(%rip),%r15 - - - movq %rax,%rcx - mulq 0(%rsi) - movq %rax,%r8 - movq 
%rcx,%rax - movq %rdx,%r9 - - mulq 8(%rsi) - addq %rax,%r9 - movq %rcx,%rax - adcq $0,%rdx - movq %rdx,%r10 - - mulq 16(%rsi) - addq %rax,%r10 - movq %rcx,%rax - adcq $0,%rdx - - movq %r8,%r13 - imulq %r15,%r8 - - movq %rdx,%r11 - mulq 24(%rsi) - addq %rax,%r11 - movq %r8,%rax - adcq $0,%rdx - movq %rdx,%r12 - - - mulq 0(%r14) - movq %r8,%rbp - addq %rax,%r13 - movq %r8,%rax - adcq $0,%rdx - movq %rdx,%rcx - - subq %r8,%r10 - sbbq $0,%r8 - - mulq 8(%r14) - addq %rcx,%r9 - adcq $0,%rdx - addq %rax,%r9 - movq %rbp,%rax - adcq %rdx,%r10 - movq %rbp,%rdx - adcq $0,%r8 - - shlq $32,%rax - shrq $32,%rdx - subq %rax,%r11 - movq 8(%rbx),%rax - sbbq %rdx,%rbp - - addq %r8,%r11 - adcq %rbp,%r12 - adcq $0,%r13 - - - movq %rax,%rcx - mulq 0(%rsi) - addq %rax,%r9 - movq %rcx,%rax - adcq $0,%rdx - movq %rdx,%rbp - - mulq 8(%rsi) - addq %rbp,%r10 - adcq $0,%rdx - addq %rax,%r10 - movq %rcx,%rax - adcq $0,%rdx - movq %rdx,%rbp - - mulq 16(%rsi) - addq %rbp,%r11 - adcq $0,%rdx - addq %rax,%r11 - movq %rcx,%rax - adcq $0,%rdx - - movq %r9,%rcx - imulq %r15,%r9 - - movq %rdx,%rbp - mulq 24(%rsi) - addq %rbp,%r12 - adcq $0,%rdx - xorq %r8,%r8 - addq %rax,%r12 - movq %r9,%rax - adcq %rdx,%r13 - adcq $0,%r8 - - - mulq 0(%r14) - movq %r9,%rbp - addq %rax,%rcx - movq %r9,%rax - adcq %rdx,%rcx - - subq %r9,%r11 - sbbq $0,%r9 - - mulq 8(%r14) - addq %rcx,%r10 - adcq $0,%rdx - addq %rax,%r10 - movq %rbp,%rax - adcq %rdx,%r11 - movq %rbp,%rdx - adcq $0,%r9 - - shlq $32,%rax - shrq $32,%rdx - subq %rax,%r12 - movq 16(%rbx),%rax - sbbq %rdx,%rbp - - addq %r9,%r12 - adcq %rbp,%r13 - adcq $0,%r8 - - - movq %rax,%rcx - mulq 0(%rsi) - addq %rax,%r10 - movq %rcx,%rax - adcq $0,%rdx - movq %rdx,%rbp - - mulq 8(%rsi) - addq %rbp,%r11 - adcq $0,%rdx - addq %rax,%r11 - movq %rcx,%rax - adcq $0,%rdx - movq %rdx,%rbp - - mulq 16(%rsi) - addq %rbp,%r12 - adcq $0,%rdx - addq %rax,%r12 - movq %rcx,%rax - adcq $0,%rdx - - movq %r10,%rcx - imulq %r15,%r10 - - movq %rdx,%rbp - mulq 24(%rsi) - addq %rbp,%r13 - 
adcq $0,%rdx - xorq %r9,%r9 - addq %rax,%r13 - movq %r10,%rax - adcq %rdx,%r8 - adcq $0,%r9 - - - mulq 0(%r14) - movq %r10,%rbp - addq %rax,%rcx - movq %r10,%rax - adcq %rdx,%rcx - - subq %r10,%r12 - sbbq $0,%r10 - - mulq 8(%r14) - addq %rcx,%r11 - adcq $0,%rdx - addq %rax,%r11 - movq %rbp,%rax - adcq %rdx,%r12 - movq %rbp,%rdx - adcq $0,%r10 - - shlq $32,%rax - shrq $32,%rdx - subq %rax,%r13 - movq 24(%rbx),%rax - sbbq %rdx,%rbp - - addq %r10,%r13 - adcq %rbp,%r8 - adcq $0,%r9 - - - movq %rax,%rcx - mulq 0(%rsi) - addq %rax,%r11 - movq %rcx,%rax - adcq $0,%rdx - movq %rdx,%rbp - - mulq 8(%rsi) - addq %rbp,%r12 - adcq $0,%rdx - addq %rax,%r12 - movq %rcx,%rax - adcq $0,%rdx - movq %rdx,%rbp - - mulq 16(%rsi) - addq %rbp,%r13 - adcq $0,%rdx - addq %rax,%r13 - movq %rcx,%rax - adcq $0,%rdx - - movq %r11,%rcx - imulq %r15,%r11 - - movq %rdx,%rbp - mulq 24(%rsi) - addq %rbp,%r8 - adcq $0,%rdx - xorq %r10,%r10 - addq %rax,%r8 - movq %r11,%rax - adcq %rdx,%r9 - adcq $0,%r10 - - - mulq 0(%r14) - movq %r11,%rbp - addq %rax,%rcx - movq %r11,%rax - adcq %rdx,%rcx - - subq %r11,%r13 - sbbq $0,%r11 - - mulq 8(%r14) - addq %rcx,%r12 - adcq $0,%rdx - addq %rax,%r12 - movq %rbp,%rax - adcq %rdx,%r13 - movq %rbp,%rdx - adcq $0,%r11 - - shlq $32,%rax - shrq $32,%rdx - subq %rax,%r8 - sbbq %rdx,%rbp - - addq %r11,%r8 - adcq %rbp,%r9 - adcq $0,%r10 - - - movq %r12,%rsi - subq 0(%r14),%r12 - movq %r13,%r11 - sbbq 8(%r14),%r13 - movq %r8,%rcx - sbbq 16(%r14),%r8 - movq %r9,%rbp - sbbq 24(%r14),%r9 - sbbq $0,%r10 - - cmovcq %rsi,%r12 - cmovcq %r11,%r13 - cmovcq %rcx,%r8 - cmovcq %rbp,%r9 - - movq %r12,0(%rdi) - movq %r13,8(%rdi) - movq %r8,16(%rdi) - movq %r9,24(%rdi) - - movq 0(%rsp),%r15 - - movq 8(%rsp),%r14 - - movq 16(%rsp),%r13 - - movq 24(%rsp),%r12 - - movq 32(%rsp),%rbx - - movq 40(%rsp),%rbp - - leaq 48(%rsp),%rsp - -L$ord_mul_epilogue: - .byte 0xf3,0xc3 - - - - - - - - - -.globl _ecp_nistz256_ord_sqr_mont -.private_extern _ecp_nistz256_ord_sqr_mont - -.p2align 5 
-_ecp_nistz256_ord_sqr_mont: - - leaq _OPENSSL_ia32cap_P(%rip),%rcx - movq 8(%rcx),%rcx - andl $0x80100,%ecx - cmpl $0x80100,%ecx - je L$ecp_nistz256_ord_sqr_montx - pushq %rbp - - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - -L$ord_sqr_body: - - movq 0(%rsi),%r8 - movq 8(%rsi),%rax - movq 16(%rsi),%r14 - movq 24(%rsi),%r15 - leaq L$ord(%rip),%rsi - movq %rdx,%rbx - jmp L$oop_ord_sqr - -.p2align 5 -L$oop_ord_sqr: - - movq %rax,%rbp - mulq %r8 - movq %rax,%r9 -.byte 102,72,15,110,205 - movq %r14,%rax - movq %rdx,%r10 - - mulq %r8 - addq %rax,%r10 - movq %r15,%rax -.byte 102,73,15,110,214 - adcq $0,%rdx - movq %rdx,%r11 - - mulq %r8 - addq %rax,%r11 - movq %r15,%rax -.byte 102,73,15,110,223 - adcq $0,%rdx - movq %rdx,%r12 - - - mulq %r14 - movq %rax,%r13 - movq %r14,%rax - movq %rdx,%r14 - - - mulq %rbp - addq %rax,%r11 - movq %r15,%rax - adcq $0,%rdx - movq %rdx,%r15 - - mulq %rbp - addq %rax,%r12 - adcq $0,%rdx - - addq %r15,%r12 - adcq %rdx,%r13 - adcq $0,%r14 - - - xorq %r15,%r15 - movq %r8,%rax - addq %r9,%r9 - adcq %r10,%r10 - adcq %r11,%r11 - adcq %r12,%r12 - adcq %r13,%r13 - adcq %r14,%r14 - adcq $0,%r15 - - - mulq %rax - movq %rax,%r8 -.byte 102,72,15,126,200 - movq %rdx,%rbp - - mulq %rax - addq %rbp,%r9 - adcq %rax,%r10 -.byte 102,72,15,126,208 - adcq $0,%rdx - movq %rdx,%rbp - - mulq %rax - addq %rbp,%r11 - adcq %rax,%r12 -.byte 102,72,15,126,216 - adcq $0,%rdx - movq %rdx,%rbp - - movq %r8,%rcx - imulq 32(%rsi),%r8 - - mulq %rax - addq %rbp,%r13 - adcq %rax,%r14 - movq 0(%rsi),%rax - adcq %rdx,%r15 - - - mulq %r8 - movq %r8,%rbp - addq %rax,%rcx - movq 8(%rsi),%rax - adcq %rdx,%rcx - - subq %r8,%r10 - sbbq $0,%rbp - - mulq %r8 - addq %rcx,%r9 - adcq $0,%rdx - addq %rax,%r9 - movq %r8,%rax - adcq %rdx,%r10 - movq %r8,%rdx - adcq $0,%rbp - - movq %r9,%rcx - imulq 32(%rsi),%r9 - - shlq $32,%rax - shrq $32,%rdx - subq %rax,%r11 - movq 0(%rsi),%rax - sbbq %rdx,%r8 - - addq %rbp,%r11 - adcq $0,%r8 - - - mulq %r9 - movq %r9,%rbp - 
addq %rax,%rcx - movq 8(%rsi),%rax - adcq %rdx,%rcx - - subq %r9,%r11 - sbbq $0,%rbp - - mulq %r9 - addq %rcx,%r10 - adcq $0,%rdx - addq %rax,%r10 - movq %r9,%rax - adcq %rdx,%r11 - movq %r9,%rdx - adcq $0,%rbp - - movq %r10,%rcx - imulq 32(%rsi),%r10 - - shlq $32,%rax - shrq $32,%rdx - subq %rax,%r8 - movq 0(%rsi),%rax - sbbq %rdx,%r9 - - addq %rbp,%r8 - adcq $0,%r9 - - - mulq %r10 - movq %r10,%rbp - addq %rax,%rcx - movq 8(%rsi),%rax - adcq %rdx,%rcx - - subq %r10,%r8 - sbbq $0,%rbp - - mulq %r10 - addq %rcx,%r11 - adcq $0,%rdx - addq %rax,%r11 - movq %r10,%rax - adcq %rdx,%r8 - movq %r10,%rdx - adcq $0,%rbp - - movq %r11,%rcx - imulq 32(%rsi),%r11 - - shlq $32,%rax - shrq $32,%rdx - subq %rax,%r9 - movq 0(%rsi),%rax - sbbq %rdx,%r10 - - addq %rbp,%r9 - adcq $0,%r10 - - - mulq %r11 - movq %r11,%rbp - addq %rax,%rcx - movq 8(%rsi),%rax - adcq %rdx,%rcx - - subq %r11,%r9 - sbbq $0,%rbp - - mulq %r11 - addq %rcx,%r8 - adcq $0,%rdx - addq %rax,%r8 - movq %r11,%rax - adcq %rdx,%r9 - movq %r11,%rdx - adcq $0,%rbp - - shlq $32,%rax - shrq $32,%rdx - subq %rax,%r10 - sbbq %rdx,%r11 - - addq %rbp,%r10 - adcq $0,%r11 - - - xorq %rdx,%rdx - addq %r12,%r8 - adcq %r13,%r9 - movq %r8,%r12 - adcq %r14,%r10 - adcq %r15,%r11 - movq %r9,%rax - adcq $0,%rdx - - - subq 0(%rsi),%r8 - movq %r10,%r14 - sbbq 8(%rsi),%r9 - sbbq 16(%rsi),%r10 - movq %r11,%r15 - sbbq 24(%rsi),%r11 - sbbq $0,%rdx - - cmovcq %r12,%r8 - cmovncq %r9,%rax - cmovncq %r10,%r14 - cmovncq %r11,%r15 - - decq %rbx - jnz L$oop_ord_sqr - - movq %r8,0(%rdi) - movq %rax,8(%rdi) - pxor %xmm1,%xmm1 - movq %r14,16(%rdi) - pxor %xmm2,%xmm2 - movq %r15,24(%rdi) - pxor %xmm3,%xmm3 - - movq 0(%rsp),%r15 - - movq 8(%rsp),%r14 - - movq 16(%rsp),%r13 - - movq 24(%rsp),%r12 - - movq 32(%rsp),%rbx - - movq 40(%rsp),%rbp - - leaq 48(%rsp),%rsp - -L$ord_sqr_epilogue: - .byte 0xf3,0xc3 - - - - -.p2align 5 -ecp_nistz256_ord_mul_montx: - -L$ecp_nistz256_ord_mul_montx: - pushq %rbp - - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq 
%r14 - - pushq %r15 - -L$ord_mulx_body: - - movq %rdx,%rbx - movq 0(%rdx),%rdx - movq 0(%rsi),%r9 - movq 8(%rsi),%r10 - movq 16(%rsi),%r11 - movq 24(%rsi),%r12 - leaq -128(%rsi),%rsi - leaq L$ord-128(%rip),%r14 - movq L$ordK(%rip),%r15 - - - mulxq %r9,%r8,%r9 - mulxq %r10,%rcx,%r10 - mulxq %r11,%rbp,%r11 - addq %rcx,%r9 - mulxq %r12,%rcx,%r12 - movq %r8,%rdx - mulxq %r15,%rdx,%rax - adcq %rbp,%r10 - adcq %rcx,%r11 - adcq $0,%r12 - - - xorq %r13,%r13 - mulxq 0+128(%r14),%rcx,%rbp - adcxq %rcx,%r8 - adoxq %rbp,%r9 - - mulxq 8+128(%r14),%rcx,%rbp - adcxq %rcx,%r9 - adoxq %rbp,%r10 - - mulxq 16+128(%r14),%rcx,%rbp - adcxq %rcx,%r10 - adoxq %rbp,%r11 - - mulxq 24+128(%r14),%rcx,%rbp - movq 8(%rbx),%rdx - adcxq %rcx,%r11 - adoxq %rbp,%r12 - adcxq %r8,%r12 - adoxq %r8,%r13 - adcq $0,%r13 - - - mulxq 0+128(%rsi),%rcx,%rbp - adcxq %rcx,%r9 - adoxq %rbp,%r10 - - mulxq 8+128(%rsi),%rcx,%rbp - adcxq %rcx,%r10 - adoxq %rbp,%r11 - - mulxq 16+128(%rsi),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq 24+128(%rsi),%rcx,%rbp - movq %r9,%rdx - mulxq %r15,%rdx,%rax - adcxq %rcx,%r12 - adoxq %rbp,%r13 - - adcxq %r8,%r13 - adoxq %r8,%r8 - adcq $0,%r8 - - - mulxq 0+128(%r14),%rcx,%rbp - adcxq %rcx,%r9 - adoxq %rbp,%r10 - - mulxq 8+128(%r14),%rcx,%rbp - adcxq %rcx,%r10 - adoxq %rbp,%r11 - - mulxq 16+128(%r14),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq 24+128(%r14),%rcx,%rbp - movq 16(%rbx),%rdx - adcxq %rcx,%r12 - adoxq %rbp,%r13 - adcxq %r9,%r13 - adoxq %r9,%r8 - adcq $0,%r8 - - - mulxq 0+128(%rsi),%rcx,%rbp - adcxq %rcx,%r10 - adoxq %rbp,%r11 - - mulxq 8+128(%rsi),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq 16+128(%rsi),%rcx,%rbp - adcxq %rcx,%r12 - adoxq %rbp,%r13 - - mulxq 24+128(%rsi),%rcx,%rbp - movq %r10,%rdx - mulxq %r15,%rdx,%rax - adcxq %rcx,%r13 - adoxq %rbp,%r8 - - adcxq %r9,%r8 - adoxq %r9,%r9 - adcq $0,%r9 - - - mulxq 0+128(%r14),%rcx,%rbp - adcxq %rcx,%r10 - adoxq %rbp,%r11 - - mulxq 8+128(%r14),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 
- - mulxq 16+128(%r14),%rcx,%rbp - adcxq %rcx,%r12 - adoxq %rbp,%r13 - - mulxq 24+128(%r14),%rcx,%rbp - movq 24(%rbx),%rdx - adcxq %rcx,%r13 - adoxq %rbp,%r8 - adcxq %r10,%r8 - adoxq %r10,%r9 - adcq $0,%r9 - - - mulxq 0+128(%rsi),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq 8+128(%rsi),%rcx,%rbp - adcxq %rcx,%r12 - adoxq %rbp,%r13 - - mulxq 16+128(%rsi),%rcx,%rbp - adcxq %rcx,%r13 - adoxq %rbp,%r8 - - mulxq 24+128(%rsi),%rcx,%rbp - movq %r11,%rdx - mulxq %r15,%rdx,%rax - adcxq %rcx,%r8 - adoxq %rbp,%r9 - - adcxq %r10,%r9 - adoxq %r10,%r10 - adcq $0,%r10 - - - mulxq 0+128(%r14),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq 8+128(%r14),%rcx,%rbp - adcxq %rcx,%r12 - adoxq %rbp,%r13 - - mulxq 16+128(%r14),%rcx,%rbp - adcxq %rcx,%r13 - adoxq %rbp,%r8 - - mulxq 24+128(%r14),%rcx,%rbp - leaq 128(%r14),%r14 - movq %r12,%rbx - adcxq %rcx,%r8 - adoxq %rbp,%r9 - movq %r13,%rdx - adcxq %r11,%r9 - adoxq %r11,%r10 - adcq $0,%r10 - - - - movq %r8,%rcx - subq 0(%r14),%r12 - sbbq 8(%r14),%r13 - sbbq 16(%r14),%r8 - movq %r9,%rbp - sbbq 24(%r14),%r9 - sbbq $0,%r10 - - cmovcq %rbx,%r12 - cmovcq %rdx,%r13 - cmovcq %rcx,%r8 - cmovcq %rbp,%r9 - - movq %r12,0(%rdi) - movq %r13,8(%rdi) - movq %r8,16(%rdi) - movq %r9,24(%rdi) - - movq 0(%rsp),%r15 - - movq 8(%rsp),%r14 - - movq 16(%rsp),%r13 - - movq 24(%rsp),%r12 - - movq 32(%rsp),%rbx - - movq 40(%rsp),%rbp - - leaq 48(%rsp),%rsp - -L$ord_mulx_epilogue: - .byte 0xf3,0xc3 - - - - -.p2align 5 -ecp_nistz256_ord_sqr_montx: - -L$ecp_nistz256_ord_sqr_montx: - pushq %rbp - - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - -L$ord_sqrx_body: - - movq %rdx,%rbx - movq 0(%rsi),%rdx - movq 8(%rsi),%r14 - movq 16(%rsi),%r15 - movq 24(%rsi),%r8 - leaq L$ord(%rip),%rsi - jmp L$oop_ord_sqrx - -.p2align 5 -L$oop_ord_sqrx: - mulxq %r14,%r9,%r10 - mulxq %r15,%rcx,%r11 - movq %rdx,%rax -.byte 102,73,15,110,206 - mulxq %r8,%rbp,%r12 - movq %r14,%rdx - addq %rcx,%r10 -.byte 102,73,15,110,215 - adcq %rbp,%r11 - adcq 
$0,%r12 - xorq %r13,%r13 - - mulxq %r15,%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq %r8,%rcx,%rbp - movq %r15,%rdx - adcxq %rcx,%r12 - adoxq %rbp,%r13 - adcq $0,%r13 - - mulxq %r8,%rcx,%r14 - movq %rax,%rdx -.byte 102,73,15,110,216 - xorq %r15,%r15 - adcxq %r9,%r9 - adoxq %rcx,%r13 - adcxq %r10,%r10 - adoxq %r15,%r14 - - - mulxq %rdx,%r8,%rbp -.byte 102,72,15,126,202 - adcxq %r11,%r11 - adoxq %rbp,%r9 - adcxq %r12,%r12 - mulxq %rdx,%rcx,%rax -.byte 102,72,15,126,210 - adcxq %r13,%r13 - adoxq %rcx,%r10 - adcxq %r14,%r14 - mulxq %rdx,%rcx,%rbp -.byte 0x67 -.byte 102,72,15,126,218 - adoxq %rax,%r11 - adcxq %r15,%r15 - adoxq %rcx,%r12 - adoxq %rbp,%r13 - mulxq %rdx,%rcx,%rax - adoxq %rcx,%r14 - adoxq %rax,%r15 - - - movq %r8,%rdx - mulxq 32(%rsi),%rdx,%rcx - - xorq %rax,%rax - mulxq 0(%rsi),%rcx,%rbp - adcxq %rcx,%r8 - adoxq %rbp,%r9 - mulxq 8(%rsi),%rcx,%rbp - adcxq %rcx,%r9 - adoxq %rbp,%r10 - mulxq 16(%rsi),%rcx,%rbp - adcxq %rcx,%r10 - adoxq %rbp,%r11 - mulxq 24(%rsi),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r8 - adcxq %rax,%r8 - - - movq %r9,%rdx - mulxq 32(%rsi),%rdx,%rcx - - mulxq 0(%rsi),%rcx,%rbp - adoxq %rcx,%r9 - adcxq %rbp,%r10 - mulxq 8(%rsi),%rcx,%rbp - adoxq %rcx,%r10 - adcxq %rbp,%r11 - mulxq 16(%rsi),%rcx,%rbp - adoxq %rcx,%r11 - adcxq %rbp,%r8 - mulxq 24(%rsi),%rcx,%rbp - adoxq %rcx,%r8 - adcxq %rbp,%r9 - adoxq %rax,%r9 - - - movq %r10,%rdx - mulxq 32(%rsi),%rdx,%rcx - - mulxq 0(%rsi),%rcx,%rbp - adcxq %rcx,%r10 - adoxq %rbp,%r11 - mulxq 8(%rsi),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r8 - mulxq 16(%rsi),%rcx,%rbp - adcxq %rcx,%r8 - adoxq %rbp,%r9 - mulxq 24(%rsi),%rcx,%rbp - adcxq %rcx,%r9 - adoxq %rbp,%r10 - adcxq %rax,%r10 - - - movq %r11,%rdx - mulxq 32(%rsi),%rdx,%rcx - - mulxq 0(%rsi),%rcx,%rbp - adoxq %rcx,%r11 - adcxq %rbp,%r8 - mulxq 8(%rsi),%rcx,%rbp - adoxq %rcx,%r8 - adcxq %rbp,%r9 - mulxq 16(%rsi),%rcx,%rbp - adoxq %rcx,%r9 - adcxq %rbp,%r10 - mulxq 24(%rsi),%rcx,%rbp - adoxq %rcx,%r10 - adcxq %rbp,%r11 - adoxq %rax,%r11 - 
- - addq %r8,%r12 - adcq %r13,%r9 - movq %r12,%rdx - adcq %r14,%r10 - adcq %r15,%r11 - movq %r9,%r14 - adcq $0,%rax - - - subq 0(%rsi),%r12 - movq %r10,%r15 - sbbq 8(%rsi),%r9 - sbbq 16(%rsi),%r10 - movq %r11,%r8 - sbbq 24(%rsi),%r11 - sbbq $0,%rax - - cmovncq %r12,%rdx - cmovncq %r9,%r14 - cmovncq %r10,%r15 - cmovncq %r11,%r8 - - decq %rbx - jnz L$oop_ord_sqrx - - movq %rdx,0(%rdi) - movq %r14,8(%rdi) - pxor %xmm1,%xmm1 - movq %r15,16(%rdi) - pxor %xmm2,%xmm2 - movq %r8,24(%rdi) - pxor %xmm3,%xmm3 - - movq 0(%rsp),%r15 - - movq 8(%rsp),%r14 - - movq 16(%rsp),%r13 - - movq 24(%rsp),%r12 - - movq 32(%rsp),%rbx - - movq 40(%rsp),%rbp - - leaq 48(%rsp),%rsp - -L$ord_sqrx_epilogue: - .byte 0xf3,0xc3 - - - - - - - - -.globl _ecp_nistz256_mul_mont -.private_extern _ecp_nistz256_mul_mont - -.p2align 5 -_ecp_nistz256_mul_mont: - - leaq _OPENSSL_ia32cap_P(%rip),%rcx - movq 8(%rcx),%rcx - andl $0x80100,%ecx -L$mul_mont: - pushq %rbp - - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - -L$mul_body: - cmpl $0x80100,%ecx - je L$mul_montx - movq %rdx,%rbx - movq 0(%rdx),%rax - movq 0(%rsi),%r9 - movq 8(%rsi),%r10 - movq 16(%rsi),%r11 - movq 24(%rsi),%r12 - - call __ecp_nistz256_mul_montq - jmp L$mul_mont_done - -.p2align 5 -L$mul_montx: - movq %rdx,%rbx - movq 0(%rdx),%rdx - movq 0(%rsi),%r9 - movq 8(%rsi),%r10 - movq 16(%rsi),%r11 - movq 24(%rsi),%r12 - leaq -128(%rsi),%rsi - - call __ecp_nistz256_mul_montx -L$mul_mont_done: - movq 0(%rsp),%r15 - - movq 8(%rsp),%r14 - - movq 16(%rsp),%r13 - - movq 24(%rsp),%r12 - - movq 32(%rsp),%rbx - - movq 40(%rsp),%rbp - - leaq 48(%rsp),%rsp - -L$mul_epilogue: - .byte 0xf3,0xc3 - - - - -.p2align 5 -__ecp_nistz256_mul_montq: - - - - movq %rax,%rbp - mulq %r9 - movq L$poly+8(%rip),%r14 - movq %rax,%r8 - movq %rbp,%rax - movq %rdx,%r9 - - mulq %r10 - movq L$poly+24(%rip),%r15 - addq %rax,%r9 - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%r10 - - mulq %r11 - addq %rax,%r10 - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%r11 - 
- mulq %r12 - addq %rax,%r11 - movq %r8,%rax - adcq $0,%rdx - xorq %r13,%r13 - movq %rdx,%r12 - - - - - - - - - - - movq %r8,%rbp - shlq $32,%r8 - mulq %r15 - shrq $32,%rbp - addq %r8,%r9 - adcq %rbp,%r10 - adcq %rax,%r11 - movq 8(%rbx),%rax - adcq %rdx,%r12 - adcq $0,%r13 - xorq %r8,%r8 - - - - movq %rax,%rbp - mulq 0(%rsi) - addq %rax,%r9 - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%rcx - - mulq 8(%rsi) - addq %rcx,%r10 - adcq $0,%rdx - addq %rax,%r10 - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%rcx - - mulq 16(%rsi) - addq %rcx,%r11 - adcq $0,%rdx - addq %rax,%r11 - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%rcx - - mulq 24(%rsi) - addq %rcx,%r12 - adcq $0,%rdx - addq %rax,%r12 - movq %r9,%rax - adcq %rdx,%r13 - adcq $0,%r8 - - - - movq %r9,%rbp - shlq $32,%r9 - mulq %r15 - shrq $32,%rbp - addq %r9,%r10 - adcq %rbp,%r11 - adcq %rax,%r12 - movq 16(%rbx),%rax - adcq %rdx,%r13 - adcq $0,%r8 - xorq %r9,%r9 - - - - movq %rax,%rbp - mulq 0(%rsi) - addq %rax,%r10 - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%rcx - - mulq 8(%rsi) - addq %rcx,%r11 - adcq $0,%rdx - addq %rax,%r11 - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%rcx - - mulq 16(%rsi) - addq %rcx,%r12 - adcq $0,%rdx - addq %rax,%r12 - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%rcx - - mulq 24(%rsi) - addq %rcx,%r13 - adcq $0,%rdx - addq %rax,%r13 - movq %r10,%rax - adcq %rdx,%r8 - adcq $0,%r9 - - - - movq %r10,%rbp - shlq $32,%r10 - mulq %r15 - shrq $32,%rbp - addq %r10,%r11 - adcq %rbp,%r12 - adcq %rax,%r13 - movq 24(%rbx),%rax - adcq %rdx,%r8 - adcq $0,%r9 - xorq %r10,%r10 - - - - movq %rax,%rbp - mulq 0(%rsi) - addq %rax,%r11 - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%rcx - - mulq 8(%rsi) - addq %rcx,%r12 - adcq $0,%rdx - addq %rax,%r12 - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%rcx - - mulq 16(%rsi) - addq %rcx,%r13 - adcq $0,%rdx - addq %rax,%r13 - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%rcx - - mulq 24(%rsi) - addq %rcx,%r8 - adcq $0,%rdx - addq %rax,%r8 - movq %r11,%rax - adcq %rdx,%r9 - adcq 
$0,%r10 - - - - movq %r11,%rbp - shlq $32,%r11 - mulq %r15 - shrq $32,%rbp - addq %r11,%r12 - adcq %rbp,%r13 - movq %r12,%rcx - adcq %rax,%r8 - adcq %rdx,%r9 - movq %r13,%rbp - adcq $0,%r10 - - - - subq $-1,%r12 - movq %r8,%rbx - sbbq %r14,%r13 - sbbq $0,%r8 - movq %r9,%rdx - sbbq %r15,%r9 - sbbq $0,%r10 - - cmovcq %rcx,%r12 - cmovcq %rbp,%r13 - movq %r12,0(%rdi) - cmovcq %rbx,%r8 - movq %r13,8(%rdi) - cmovcq %rdx,%r9 - movq %r8,16(%rdi) - movq %r9,24(%rdi) - - .byte 0xf3,0xc3 - - - - - - - - - - -.globl _ecp_nistz256_sqr_mont -.private_extern _ecp_nistz256_sqr_mont - -.p2align 5 -_ecp_nistz256_sqr_mont: - - leaq _OPENSSL_ia32cap_P(%rip),%rcx - movq 8(%rcx),%rcx - andl $0x80100,%ecx - pushq %rbp - - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - -L$sqr_body: - cmpl $0x80100,%ecx - je L$sqr_montx - movq 0(%rsi),%rax - movq 8(%rsi),%r14 - movq 16(%rsi),%r15 - movq 24(%rsi),%r8 - - call __ecp_nistz256_sqr_montq - jmp L$sqr_mont_done - -.p2align 5 -L$sqr_montx: - movq 0(%rsi),%rdx - movq 8(%rsi),%r14 - movq 16(%rsi),%r15 - movq 24(%rsi),%r8 - leaq -128(%rsi),%rsi - - call __ecp_nistz256_sqr_montx -L$sqr_mont_done: - movq 0(%rsp),%r15 - - movq 8(%rsp),%r14 - - movq 16(%rsp),%r13 - - movq 24(%rsp),%r12 - - movq 32(%rsp),%rbx - - movq 40(%rsp),%rbp - - leaq 48(%rsp),%rsp - -L$sqr_epilogue: - .byte 0xf3,0xc3 - - - - -.p2align 5 -__ecp_nistz256_sqr_montq: - - movq %rax,%r13 - mulq %r14 - movq %rax,%r9 - movq %r15,%rax - movq %rdx,%r10 - - mulq %r13 - addq %rax,%r10 - movq %r8,%rax - adcq $0,%rdx - movq %rdx,%r11 - - mulq %r13 - addq %rax,%r11 - movq %r15,%rax - adcq $0,%rdx - movq %rdx,%r12 - - - mulq %r14 - addq %rax,%r11 - movq %r8,%rax - adcq $0,%rdx - movq %rdx,%rbp - - mulq %r14 - addq %rax,%r12 - movq %r8,%rax - adcq $0,%rdx - addq %rbp,%r12 - movq %rdx,%r13 - adcq $0,%r13 - - - mulq %r15 - xorq %r15,%r15 - addq %rax,%r13 - movq 0(%rsi),%rax - movq %rdx,%r14 - adcq $0,%r14 - - addq %r9,%r9 - adcq %r10,%r10 - adcq %r11,%r11 - adcq %r12,%r12 - 
adcq %r13,%r13 - adcq %r14,%r14 - adcq $0,%r15 - - mulq %rax - movq %rax,%r8 - movq 8(%rsi),%rax - movq %rdx,%rcx - - mulq %rax - addq %rcx,%r9 - adcq %rax,%r10 - movq 16(%rsi),%rax - adcq $0,%rdx - movq %rdx,%rcx - - mulq %rax - addq %rcx,%r11 - adcq %rax,%r12 - movq 24(%rsi),%rax - adcq $0,%rdx - movq %rdx,%rcx - - mulq %rax - addq %rcx,%r13 - adcq %rax,%r14 - movq %r8,%rax - adcq %rdx,%r15 - - movq L$poly+8(%rip),%rsi - movq L$poly+24(%rip),%rbp - - - - - movq %r8,%rcx - shlq $32,%r8 - mulq %rbp - shrq $32,%rcx - addq %r8,%r9 - adcq %rcx,%r10 - adcq %rax,%r11 - movq %r9,%rax - adcq $0,%rdx - - - - movq %r9,%rcx - shlq $32,%r9 - movq %rdx,%r8 - mulq %rbp - shrq $32,%rcx - addq %r9,%r10 - adcq %rcx,%r11 - adcq %rax,%r8 - movq %r10,%rax - adcq $0,%rdx - - - - movq %r10,%rcx - shlq $32,%r10 - movq %rdx,%r9 - mulq %rbp - shrq $32,%rcx - addq %r10,%r11 - adcq %rcx,%r8 - adcq %rax,%r9 - movq %r11,%rax - adcq $0,%rdx - - - - movq %r11,%rcx - shlq $32,%r11 - movq %rdx,%r10 - mulq %rbp - shrq $32,%rcx - addq %r11,%r8 - adcq %rcx,%r9 - adcq %rax,%r10 - adcq $0,%rdx - xorq %r11,%r11 - - - - addq %r8,%r12 - adcq %r9,%r13 - movq %r12,%r8 - adcq %r10,%r14 - adcq %rdx,%r15 - movq %r13,%r9 - adcq $0,%r11 - - subq $-1,%r12 - movq %r14,%r10 - sbbq %rsi,%r13 - sbbq $0,%r14 - movq %r15,%rcx - sbbq %rbp,%r15 - sbbq $0,%r11 - - cmovcq %r8,%r12 - cmovcq %r9,%r13 - movq %r12,0(%rdi) - cmovcq %r10,%r14 - movq %r13,8(%rdi) - cmovcq %rcx,%r15 - movq %r14,16(%rdi) - movq %r15,24(%rdi) - - .byte 0xf3,0xc3 - - - -.p2align 5 -__ecp_nistz256_mul_montx: - - - - mulxq %r9,%r8,%r9 - mulxq %r10,%rcx,%r10 - movq $32,%r14 - xorq %r13,%r13 - mulxq %r11,%rbp,%r11 - movq L$poly+24(%rip),%r15 - adcq %rcx,%r9 - mulxq %r12,%rcx,%r12 - movq %r8,%rdx - adcq %rbp,%r10 - shlxq %r14,%r8,%rbp - adcq %rcx,%r11 - shrxq %r14,%r8,%rcx - adcq $0,%r12 - - - - addq %rbp,%r9 - adcq %rcx,%r10 - - mulxq %r15,%rcx,%rbp - movq 8(%rbx),%rdx - adcq %rcx,%r11 - adcq %rbp,%r12 - adcq $0,%r13 - xorq %r8,%r8 - - - - mulxq 
0+128(%rsi),%rcx,%rbp - adcxq %rcx,%r9 - adoxq %rbp,%r10 - - mulxq 8+128(%rsi),%rcx,%rbp - adcxq %rcx,%r10 - adoxq %rbp,%r11 - - mulxq 16+128(%rsi),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq 24+128(%rsi),%rcx,%rbp - movq %r9,%rdx - adcxq %rcx,%r12 - shlxq %r14,%r9,%rcx - adoxq %rbp,%r13 - shrxq %r14,%r9,%rbp - - adcxq %r8,%r13 - adoxq %r8,%r8 - adcq $0,%r8 - - - - addq %rcx,%r10 - adcq %rbp,%r11 - - mulxq %r15,%rcx,%rbp - movq 16(%rbx),%rdx - adcq %rcx,%r12 - adcq %rbp,%r13 - adcq $0,%r8 - xorq %r9,%r9 - - - - mulxq 0+128(%rsi),%rcx,%rbp - adcxq %rcx,%r10 - adoxq %rbp,%r11 - - mulxq 8+128(%rsi),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq 16+128(%rsi),%rcx,%rbp - adcxq %rcx,%r12 - adoxq %rbp,%r13 - - mulxq 24+128(%rsi),%rcx,%rbp - movq %r10,%rdx - adcxq %rcx,%r13 - shlxq %r14,%r10,%rcx - adoxq %rbp,%r8 - shrxq %r14,%r10,%rbp - - adcxq %r9,%r8 - adoxq %r9,%r9 - adcq $0,%r9 - - - - addq %rcx,%r11 - adcq %rbp,%r12 - - mulxq %r15,%rcx,%rbp - movq 24(%rbx),%rdx - adcq %rcx,%r13 - adcq %rbp,%r8 - adcq $0,%r9 - xorq %r10,%r10 - - - - mulxq 0+128(%rsi),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq 8+128(%rsi),%rcx,%rbp - adcxq %rcx,%r12 - adoxq %rbp,%r13 - - mulxq 16+128(%rsi),%rcx,%rbp - adcxq %rcx,%r13 - adoxq %rbp,%r8 - - mulxq 24+128(%rsi),%rcx,%rbp - movq %r11,%rdx - adcxq %rcx,%r8 - shlxq %r14,%r11,%rcx - adoxq %rbp,%r9 - shrxq %r14,%r11,%rbp - - adcxq %r10,%r9 - adoxq %r10,%r10 - adcq $0,%r10 - - - - addq %rcx,%r12 - adcq %rbp,%r13 - - mulxq %r15,%rcx,%rbp - movq %r12,%rbx - movq L$poly+8(%rip),%r14 - adcq %rcx,%r8 - movq %r13,%rdx - adcq %rbp,%r9 - adcq $0,%r10 - - - - xorl %eax,%eax - movq %r8,%rcx - sbbq $-1,%r12 - sbbq %r14,%r13 - sbbq $0,%r8 - movq %r9,%rbp - sbbq %r15,%r9 - sbbq $0,%r10 - - cmovcq %rbx,%r12 - cmovcq %rdx,%r13 - movq %r12,0(%rdi) - cmovcq %rcx,%r8 - movq %r13,8(%rdi) - cmovcq %rbp,%r9 - movq %r8,16(%rdi) - movq %r9,24(%rdi) - - .byte 0xf3,0xc3 - - - - -.p2align 5 -__ecp_nistz256_sqr_montx: - - mulxq 
%r14,%r9,%r10 - mulxq %r15,%rcx,%r11 - xorl %eax,%eax - adcq %rcx,%r10 - mulxq %r8,%rbp,%r12 - movq %r14,%rdx - adcq %rbp,%r11 - adcq $0,%r12 - xorq %r13,%r13 - - - mulxq %r15,%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq %r8,%rcx,%rbp - movq %r15,%rdx - adcxq %rcx,%r12 - adoxq %rbp,%r13 - adcq $0,%r13 - - - mulxq %r8,%rcx,%r14 - movq 0+128(%rsi),%rdx - xorq %r15,%r15 - adcxq %r9,%r9 - adoxq %rcx,%r13 - adcxq %r10,%r10 - adoxq %r15,%r14 - - mulxq %rdx,%r8,%rbp - movq 8+128(%rsi),%rdx - adcxq %r11,%r11 - adoxq %rbp,%r9 - adcxq %r12,%r12 - mulxq %rdx,%rcx,%rax - movq 16+128(%rsi),%rdx - adcxq %r13,%r13 - adoxq %rcx,%r10 - adcxq %r14,%r14 -.byte 0x67 - mulxq %rdx,%rcx,%rbp - movq 24+128(%rsi),%rdx - adoxq %rax,%r11 - adcxq %r15,%r15 - adoxq %rcx,%r12 - movq $32,%rsi - adoxq %rbp,%r13 -.byte 0x67,0x67 - mulxq %rdx,%rcx,%rax - movq L$poly+24(%rip),%rdx - adoxq %rcx,%r14 - shlxq %rsi,%r8,%rcx - adoxq %rax,%r15 - shrxq %rsi,%r8,%rax - movq %rdx,%rbp - - - addq %rcx,%r9 - adcq %rax,%r10 - - mulxq %r8,%rcx,%r8 - adcq %rcx,%r11 - shlxq %rsi,%r9,%rcx - adcq $0,%r8 - shrxq %rsi,%r9,%rax - - - addq %rcx,%r10 - adcq %rax,%r11 - - mulxq %r9,%rcx,%r9 - adcq %rcx,%r8 - shlxq %rsi,%r10,%rcx - adcq $0,%r9 - shrxq %rsi,%r10,%rax - - - addq %rcx,%r11 - adcq %rax,%r8 - - mulxq %r10,%rcx,%r10 - adcq %rcx,%r9 - shlxq %rsi,%r11,%rcx - adcq $0,%r10 - shrxq %rsi,%r11,%rax - - - addq %rcx,%r8 - adcq %rax,%r9 - - mulxq %r11,%rcx,%r11 - adcq %rcx,%r10 - adcq $0,%r11 - - xorq %rdx,%rdx - addq %r8,%r12 - movq L$poly+8(%rip),%rsi - adcq %r9,%r13 - movq %r12,%r8 - adcq %r10,%r14 - adcq %r11,%r15 - movq %r13,%r9 - adcq $0,%rdx - - subq $-1,%r12 - movq %r14,%r10 - sbbq %rsi,%r13 - sbbq $0,%r14 - movq %r15,%r11 - sbbq %rbp,%r15 - sbbq $0,%rdx - - cmovcq %r8,%r12 - cmovcq %r9,%r13 - movq %r12,0(%rdi) - cmovcq %r10,%r14 - movq %r13,8(%rdi) - cmovcq %r11,%r15 - movq %r14,16(%rdi) - movq %r15,24(%rdi) - - .byte 0xf3,0xc3 - - - - -.globl _ecp_nistz256_select_w5 -.private_extern 
_ecp_nistz256_select_w5 - -.p2align 5 -_ecp_nistz256_select_w5: - - leaq _OPENSSL_ia32cap_P(%rip),%rax - movq 8(%rax),%rax - testl $32,%eax - jnz L$avx2_select_w5 - movdqa L$One(%rip),%xmm0 - movd %edx,%xmm1 - - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - - movdqa %xmm0,%xmm8 - pshufd $0,%xmm1,%xmm1 - - movq $16,%rax -L$select_loop_sse_w5: - - movdqa %xmm8,%xmm15 - paddd %xmm0,%xmm8 - pcmpeqd %xmm1,%xmm15 - - movdqa 0(%rsi),%xmm9 - movdqa 16(%rsi),%xmm10 - movdqa 32(%rsi),%xmm11 - movdqa 48(%rsi),%xmm12 - movdqa 64(%rsi),%xmm13 - movdqa 80(%rsi),%xmm14 - leaq 96(%rsi),%rsi - - pand %xmm15,%xmm9 - pand %xmm15,%xmm10 - por %xmm9,%xmm2 - pand %xmm15,%xmm11 - por %xmm10,%xmm3 - pand %xmm15,%xmm12 - por %xmm11,%xmm4 - pand %xmm15,%xmm13 - por %xmm12,%xmm5 - pand %xmm15,%xmm14 - por %xmm13,%xmm6 - por %xmm14,%xmm7 - - decq %rax - jnz L$select_loop_sse_w5 - - movdqu %xmm2,0(%rdi) - movdqu %xmm3,16(%rdi) - movdqu %xmm4,32(%rdi) - movdqu %xmm5,48(%rdi) - movdqu %xmm6,64(%rdi) - movdqu %xmm7,80(%rdi) - .byte 0xf3,0xc3 - -L$SEH_end_ecp_nistz256_select_w5: - - - - -.globl _ecp_nistz256_select_w7 -.private_extern _ecp_nistz256_select_w7 - -.p2align 5 -_ecp_nistz256_select_w7: - - leaq _OPENSSL_ia32cap_P(%rip),%rax - movq 8(%rax),%rax - testl $32,%eax - jnz L$avx2_select_w7 - movdqa L$One(%rip),%xmm8 - movd %edx,%xmm1 - - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - - movdqa %xmm8,%xmm0 - pshufd $0,%xmm1,%xmm1 - movq $64,%rax - -L$select_loop_sse_w7: - movdqa %xmm8,%xmm15 - paddd %xmm0,%xmm8 - movdqa 0(%rsi),%xmm9 - movdqa 16(%rsi),%xmm10 - pcmpeqd %xmm1,%xmm15 - movdqa 32(%rsi),%xmm11 - movdqa 48(%rsi),%xmm12 - leaq 64(%rsi),%rsi - - pand %xmm15,%xmm9 - pand %xmm15,%xmm10 - por %xmm9,%xmm2 - pand %xmm15,%xmm11 - por %xmm10,%xmm3 - pand %xmm15,%xmm12 - por %xmm11,%xmm4 - prefetcht0 255(%rsi) - por %xmm12,%xmm5 - - decq %rax - jnz L$select_loop_sse_w7 - - movdqu %xmm2,0(%rdi) - 
movdqu %xmm3,16(%rdi) - movdqu %xmm4,32(%rdi) - movdqu %xmm5,48(%rdi) - .byte 0xf3,0xc3 - -L$SEH_end_ecp_nistz256_select_w7: - - - - -.p2align 5 -ecp_nistz256_avx2_select_w5: - -L$avx2_select_w5: - vzeroupper - vmovdqa L$Two(%rip),%ymm0 - - vpxor %ymm2,%ymm2,%ymm2 - vpxor %ymm3,%ymm3,%ymm3 - vpxor %ymm4,%ymm4,%ymm4 - - vmovdqa L$One(%rip),%ymm5 - vmovdqa L$Two(%rip),%ymm10 - - vmovd %edx,%xmm1 - vpermd %ymm1,%ymm2,%ymm1 - - movq $8,%rax -L$select_loop_avx2_w5: - - vmovdqa 0(%rsi),%ymm6 - vmovdqa 32(%rsi),%ymm7 - vmovdqa 64(%rsi),%ymm8 - - vmovdqa 96(%rsi),%ymm11 - vmovdqa 128(%rsi),%ymm12 - vmovdqa 160(%rsi),%ymm13 - - vpcmpeqd %ymm1,%ymm5,%ymm9 - vpcmpeqd %ymm1,%ymm10,%ymm14 - - vpaddd %ymm0,%ymm5,%ymm5 - vpaddd %ymm0,%ymm10,%ymm10 - leaq 192(%rsi),%rsi - - vpand %ymm9,%ymm6,%ymm6 - vpand %ymm9,%ymm7,%ymm7 - vpand %ymm9,%ymm8,%ymm8 - vpand %ymm14,%ymm11,%ymm11 - vpand %ymm14,%ymm12,%ymm12 - vpand %ymm14,%ymm13,%ymm13 - - vpxor %ymm6,%ymm2,%ymm2 - vpxor %ymm7,%ymm3,%ymm3 - vpxor %ymm8,%ymm4,%ymm4 - vpxor %ymm11,%ymm2,%ymm2 - vpxor %ymm12,%ymm3,%ymm3 - vpxor %ymm13,%ymm4,%ymm4 - - decq %rax - jnz L$select_loop_avx2_w5 - - vmovdqu %ymm2,0(%rdi) - vmovdqu %ymm3,32(%rdi) - vmovdqu %ymm4,64(%rdi) - vzeroupper - .byte 0xf3,0xc3 - -L$SEH_end_ecp_nistz256_avx2_select_w5: - - - - -.globl _ecp_nistz256_avx2_select_w7 -.private_extern _ecp_nistz256_avx2_select_w7 - -.p2align 5 -_ecp_nistz256_avx2_select_w7: - -L$avx2_select_w7: - vzeroupper - vmovdqa L$Three(%rip),%ymm0 - - vpxor %ymm2,%ymm2,%ymm2 - vpxor %ymm3,%ymm3,%ymm3 - - vmovdqa L$One(%rip),%ymm4 - vmovdqa L$Two(%rip),%ymm8 - vmovdqa L$Three(%rip),%ymm12 - - vmovd %edx,%xmm1 - vpermd %ymm1,%ymm2,%ymm1 - - - movq $21,%rax -L$select_loop_avx2_w7: - - vmovdqa 0(%rsi),%ymm5 - vmovdqa 32(%rsi),%ymm6 - - vmovdqa 64(%rsi),%ymm9 - vmovdqa 96(%rsi),%ymm10 - - vmovdqa 128(%rsi),%ymm13 - vmovdqa 160(%rsi),%ymm14 - - vpcmpeqd %ymm1,%ymm4,%ymm7 - vpcmpeqd %ymm1,%ymm8,%ymm11 - vpcmpeqd %ymm1,%ymm12,%ymm15 - - vpaddd %ymm0,%ymm4,%ymm4 
- vpaddd %ymm0,%ymm8,%ymm8 - vpaddd %ymm0,%ymm12,%ymm12 - leaq 192(%rsi),%rsi - - vpand %ymm7,%ymm5,%ymm5 - vpand %ymm7,%ymm6,%ymm6 - vpand %ymm11,%ymm9,%ymm9 - vpand %ymm11,%ymm10,%ymm10 - vpand %ymm15,%ymm13,%ymm13 - vpand %ymm15,%ymm14,%ymm14 - - vpxor %ymm5,%ymm2,%ymm2 - vpxor %ymm6,%ymm3,%ymm3 - vpxor %ymm9,%ymm2,%ymm2 - vpxor %ymm10,%ymm3,%ymm3 - vpxor %ymm13,%ymm2,%ymm2 - vpxor %ymm14,%ymm3,%ymm3 - - decq %rax - jnz L$select_loop_avx2_w7 - - - vmovdqa 0(%rsi),%ymm5 - vmovdqa 32(%rsi),%ymm6 - - vpcmpeqd %ymm1,%ymm4,%ymm7 - - vpand %ymm7,%ymm5,%ymm5 - vpand %ymm7,%ymm6,%ymm6 - - vpxor %ymm5,%ymm2,%ymm2 - vpxor %ymm6,%ymm3,%ymm3 - - vmovdqu %ymm2,0(%rdi) - vmovdqu %ymm3,32(%rdi) - vzeroupper - .byte 0xf3,0xc3 - -L$SEH_end_ecp_nistz256_avx2_select_w7: - - -.p2align 5 -__ecp_nistz256_add_toq: - - xorq %r11,%r11 - addq 0(%rbx),%r12 - adcq 8(%rbx),%r13 - movq %r12,%rax - adcq 16(%rbx),%r8 - adcq 24(%rbx),%r9 - movq %r13,%rbp - adcq $0,%r11 - - subq $-1,%r12 - movq %r8,%rcx - sbbq %r14,%r13 - sbbq $0,%r8 - movq %r9,%r10 - sbbq %r15,%r9 - sbbq $0,%r11 - - cmovcq %rax,%r12 - cmovcq %rbp,%r13 - movq %r12,0(%rdi) - cmovcq %rcx,%r8 - movq %r13,8(%rdi) - cmovcq %r10,%r9 - movq %r8,16(%rdi) - movq %r9,24(%rdi) - - .byte 0xf3,0xc3 - - - - -.p2align 5 -__ecp_nistz256_sub_fromq: - - subq 0(%rbx),%r12 - sbbq 8(%rbx),%r13 - movq %r12,%rax - sbbq 16(%rbx),%r8 - sbbq 24(%rbx),%r9 - movq %r13,%rbp - sbbq %r11,%r11 - - addq $-1,%r12 - movq %r8,%rcx - adcq %r14,%r13 - adcq $0,%r8 - movq %r9,%r10 - adcq %r15,%r9 - testq %r11,%r11 - - cmovzq %rax,%r12 - cmovzq %rbp,%r13 - movq %r12,0(%rdi) - cmovzq %rcx,%r8 - movq %r13,8(%rdi) - cmovzq %r10,%r9 - movq %r8,16(%rdi) - movq %r9,24(%rdi) - - .byte 0xf3,0xc3 - - - - -.p2align 5 -__ecp_nistz256_subq: - - subq %r12,%rax - sbbq %r13,%rbp - movq %rax,%r12 - sbbq %r8,%rcx - sbbq %r9,%r10 - movq %rbp,%r13 - sbbq %r11,%r11 - - addq $-1,%rax - movq %rcx,%r8 - adcq %r14,%rbp - adcq $0,%rcx - movq %r10,%r9 - adcq %r15,%r10 - testq %r11,%r11 - - 
cmovnzq %rax,%r12 - cmovnzq %rbp,%r13 - cmovnzq %rcx,%r8 - cmovnzq %r10,%r9 - - .byte 0xf3,0xc3 - - - - -.p2align 5 -__ecp_nistz256_mul_by_2q: - - xorq %r11,%r11 - addq %r12,%r12 - adcq %r13,%r13 - movq %r12,%rax - adcq %r8,%r8 - adcq %r9,%r9 - movq %r13,%rbp - adcq $0,%r11 - - subq $-1,%r12 - movq %r8,%rcx - sbbq %r14,%r13 - sbbq $0,%r8 - movq %r9,%r10 - sbbq %r15,%r9 - sbbq $0,%r11 - - cmovcq %rax,%r12 - cmovcq %rbp,%r13 - movq %r12,0(%rdi) - cmovcq %rcx,%r8 - movq %r13,8(%rdi) - cmovcq %r10,%r9 - movq %r8,16(%rdi) - movq %r9,24(%rdi) - - .byte 0xf3,0xc3 - - -.globl _ecp_nistz256_point_double -.private_extern _ecp_nistz256_point_double - -.p2align 5 -_ecp_nistz256_point_double: - - leaq _OPENSSL_ia32cap_P(%rip),%rcx - movq 8(%rcx),%rcx - andl $0x80100,%ecx - cmpl $0x80100,%ecx - je L$point_doublex - pushq %rbp - - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - subq $160+8,%rsp - -L$point_doubleq_body: - -L$point_double_shortcutq: - movdqu 0(%rsi),%xmm0 - movq %rsi,%rbx - movdqu 16(%rsi),%xmm1 - movq 32+0(%rsi),%r12 - movq 32+8(%rsi),%r13 - movq 32+16(%rsi),%r8 - movq 32+24(%rsi),%r9 - movq L$poly+8(%rip),%r14 - movq L$poly+24(%rip),%r15 - movdqa %xmm0,96(%rsp) - movdqa %xmm1,96+16(%rsp) - leaq 32(%rdi),%r10 - leaq 64(%rdi),%r11 -.byte 102,72,15,110,199 -.byte 102,73,15,110,202 -.byte 102,73,15,110,211 - - leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_by_2q - - movq 64+0(%rsi),%rax - movq 64+8(%rsi),%r14 - movq 64+16(%rsi),%r15 - movq 64+24(%rsi),%r8 - leaq 64-0(%rsi),%rsi - leaq 64(%rsp),%rdi - call __ecp_nistz256_sqr_montq - - movq 0+0(%rsp),%rax - movq 8+0(%rsp),%r14 - leaq 0+0(%rsp),%rsi - movq 16+0(%rsp),%r15 - movq 24+0(%rsp),%r8 - leaq 0(%rsp),%rdi - call __ecp_nistz256_sqr_montq - - movq 32(%rbx),%rax - movq 64+0(%rbx),%r9 - movq 64+8(%rbx),%r10 - movq 64+16(%rbx),%r11 - movq 64+24(%rbx),%r12 - leaq 64-0(%rbx),%rsi - leaq 32(%rbx),%rbx -.byte 102,72,15,126,215 - call __ecp_nistz256_mul_montq - call __ecp_nistz256_mul_by_2q - - 
movq 96+0(%rsp),%r12 - movq 96+8(%rsp),%r13 - leaq 64(%rsp),%rbx - movq 96+16(%rsp),%r8 - movq 96+24(%rsp),%r9 - leaq 32(%rsp),%rdi - call __ecp_nistz256_add_toq - - movq 96+0(%rsp),%r12 - movq 96+8(%rsp),%r13 - leaq 64(%rsp),%rbx - movq 96+16(%rsp),%r8 - movq 96+24(%rsp),%r9 - leaq 64(%rsp),%rdi - call __ecp_nistz256_sub_fromq - - movq 0+0(%rsp),%rax - movq 8+0(%rsp),%r14 - leaq 0+0(%rsp),%rsi - movq 16+0(%rsp),%r15 - movq 24+0(%rsp),%r8 -.byte 102,72,15,126,207 - call __ecp_nistz256_sqr_montq - xorq %r9,%r9 - movq %r12,%rax - addq $-1,%r12 - movq %r13,%r10 - adcq %rsi,%r13 - movq %r14,%rcx - adcq $0,%r14 - movq %r15,%r8 - adcq %rbp,%r15 - adcq $0,%r9 - xorq %rsi,%rsi - testq $1,%rax - - cmovzq %rax,%r12 - cmovzq %r10,%r13 - cmovzq %rcx,%r14 - cmovzq %r8,%r15 - cmovzq %rsi,%r9 - - movq %r13,%rax - shrq $1,%r12 - shlq $63,%rax - movq %r14,%r10 - shrq $1,%r13 - orq %rax,%r12 - shlq $63,%r10 - movq %r15,%rcx - shrq $1,%r14 - orq %r10,%r13 - shlq $63,%rcx - movq %r12,0(%rdi) - shrq $1,%r15 - movq %r13,8(%rdi) - shlq $63,%r9 - orq %rcx,%r14 - orq %r9,%r15 - movq %r14,16(%rdi) - movq %r15,24(%rdi) - movq 64(%rsp),%rax - leaq 64(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq 0+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 32(%rsp),%rdi - call __ecp_nistz256_mul_montq - - leaq 128(%rsp),%rdi - call __ecp_nistz256_mul_by_2q - - leaq 32(%rsp),%rbx - leaq 32(%rsp),%rdi - call __ecp_nistz256_add_toq - - movq 96(%rsp),%rax - leaq 96(%rsp),%rbx - movq 0+0(%rsp),%r9 - movq 8+0(%rsp),%r10 - leaq 0+0(%rsp),%rsi - movq 16+0(%rsp),%r11 - movq 24+0(%rsp),%r12 - leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_montq - - leaq 128(%rsp),%rdi - call __ecp_nistz256_mul_by_2q - - movq 0+32(%rsp),%rax - movq 8+32(%rsp),%r14 - leaq 0+32(%rsp),%rsi - movq 16+32(%rsp),%r15 - movq 24+32(%rsp),%r8 -.byte 102,72,15,126,199 - call __ecp_nistz256_sqr_montq - - leaq 128(%rsp),%rbx - movq %r14,%r8 - movq %r15,%r9 - movq %rsi,%r14 - movq %rbp,%r15 - call 
__ecp_nistz256_sub_fromq - - movq 0+0(%rsp),%rax - movq 0+8(%rsp),%rbp - movq 0+16(%rsp),%rcx - movq 0+24(%rsp),%r10 - leaq 0(%rsp),%rdi - call __ecp_nistz256_subq - - movq 32(%rsp),%rax - leaq 32(%rsp),%rbx - movq %r12,%r14 - xorl %ecx,%ecx - movq %r12,0+0(%rsp) - movq %r13,%r10 - movq %r13,0+8(%rsp) - cmovzq %r8,%r11 - movq %r8,0+16(%rsp) - leaq 0-0(%rsp),%rsi - cmovzq %r9,%r12 - movq %r9,0+24(%rsp) - movq %r14,%r9 - leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_montq - -.byte 102,72,15,126,203 -.byte 102,72,15,126,207 - call __ecp_nistz256_sub_fromq - - leaq 160+56(%rsp),%rsi - - movq -48(%rsi),%r15 - - movq -40(%rsi),%r14 - - movq -32(%rsi),%r13 - - movq -24(%rsi),%r12 - - movq -16(%rsi),%rbx - - movq -8(%rsi),%rbp - - leaq (%rsi),%rsp - -L$point_doubleq_epilogue: - .byte 0xf3,0xc3 - - -.globl _ecp_nistz256_point_add -.private_extern _ecp_nistz256_point_add - -.p2align 5 -_ecp_nistz256_point_add: - - leaq _OPENSSL_ia32cap_P(%rip),%rcx - movq 8(%rcx),%rcx - andl $0x80100,%ecx - cmpl $0x80100,%ecx - je L$point_addx - pushq %rbp - - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - subq $576+8,%rsp - -L$point_addq_body: - - movdqu 0(%rsi),%xmm0 - movdqu 16(%rsi),%xmm1 - movdqu 32(%rsi),%xmm2 - movdqu 48(%rsi),%xmm3 - movdqu 64(%rsi),%xmm4 - movdqu 80(%rsi),%xmm5 - movq %rsi,%rbx - movq %rdx,%rsi - movdqa %xmm0,384(%rsp) - movdqa %xmm1,384+16(%rsp) - movdqa %xmm2,416(%rsp) - movdqa %xmm3,416+16(%rsp) - movdqa %xmm4,448(%rsp) - movdqa %xmm5,448+16(%rsp) - por %xmm4,%xmm5 - - movdqu 0(%rsi),%xmm0 - pshufd $0xb1,%xmm5,%xmm3 - movdqu 16(%rsi),%xmm1 - movdqu 32(%rsi),%xmm2 - por %xmm3,%xmm5 - movdqu 48(%rsi),%xmm3 - movq 64+0(%rsi),%rax - movq 64+8(%rsi),%r14 - movq 64+16(%rsi),%r15 - movq 64+24(%rsi),%r8 - movdqa %xmm0,480(%rsp) - pshufd $0x1e,%xmm5,%xmm4 - movdqa %xmm1,480+16(%rsp) - movdqu 64(%rsi),%xmm0 - movdqu 80(%rsi),%xmm1 - movdqa %xmm2,512(%rsp) - movdqa %xmm3,512+16(%rsp) - por %xmm4,%xmm5 - pxor %xmm4,%xmm4 - por %xmm0,%xmm1 -.byte 
102,72,15,110,199 - - leaq 64-0(%rsi),%rsi - movq %rax,544+0(%rsp) - movq %r14,544+8(%rsp) - movq %r15,544+16(%rsp) - movq %r8,544+24(%rsp) - leaq 96(%rsp),%rdi - call __ecp_nistz256_sqr_montq - - pcmpeqd %xmm4,%xmm5 - pshufd $0xb1,%xmm1,%xmm4 - por %xmm1,%xmm4 - pshufd $0,%xmm5,%xmm5 - pshufd $0x1e,%xmm4,%xmm3 - por %xmm3,%xmm4 - pxor %xmm3,%xmm3 - pcmpeqd %xmm3,%xmm4 - pshufd $0,%xmm4,%xmm4 - movq 64+0(%rbx),%rax - movq 64+8(%rbx),%r14 - movq 64+16(%rbx),%r15 - movq 64+24(%rbx),%r8 -.byte 102,72,15,110,203 - - leaq 64-0(%rbx),%rsi - leaq 32(%rsp),%rdi - call __ecp_nistz256_sqr_montq - - movq 544(%rsp),%rax - leaq 544(%rsp),%rbx - movq 0+96(%rsp),%r9 - movq 8+96(%rsp),%r10 - leaq 0+96(%rsp),%rsi - movq 16+96(%rsp),%r11 - movq 24+96(%rsp),%r12 - leaq 224(%rsp),%rdi - call __ecp_nistz256_mul_montq - - movq 448(%rsp),%rax - leaq 448(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq 0+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 256(%rsp),%rdi - call __ecp_nistz256_mul_montq - - movq 416(%rsp),%rax - leaq 416(%rsp),%rbx - movq 0+224(%rsp),%r9 - movq 8+224(%rsp),%r10 - leaq 0+224(%rsp),%rsi - movq 16+224(%rsp),%r11 - movq 24+224(%rsp),%r12 - leaq 224(%rsp),%rdi - call __ecp_nistz256_mul_montq - - movq 512(%rsp),%rax - leaq 512(%rsp),%rbx - movq 0+256(%rsp),%r9 - movq 8+256(%rsp),%r10 - leaq 0+256(%rsp),%rsi - movq 16+256(%rsp),%r11 - movq 24+256(%rsp),%r12 - leaq 256(%rsp),%rdi - call __ecp_nistz256_mul_montq - - leaq 224(%rsp),%rbx - leaq 64(%rsp),%rdi - call __ecp_nistz256_sub_fromq - - orq %r13,%r12 - movdqa %xmm4,%xmm2 - orq %r8,%r12 - orq %r9,%r12 - por %xmm5,%xmm2 -.byte 102,73,15,110,220 - - movq 384(%rsp),%rax - leaq 384(%rsp),%rbx - movq 0+96(%rsp),%r9 - movq 8+96(%rsp),%r10 - leaq 0+96(%rsp),%rsi - movq 16+96(%rsp),%r11 - movq 24+96(%rsp),%r12 - leaq 160(%rsp),%rdi - call __ecp_nistz256_mul_montq - - movq 480(%rsp),%rax - leaq 480(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq 0+32(%rsp),%rsi - movq 
16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 192(%rsp),%rdi - call __ecp_nistz256_mul_montq - - leaq 160(%rsp),%rbx - leaq 0(%rsp),%rdi - call __ecp_nistz256_sub_fromq - - orq %r13,%r12 - orq %r8,%r12 - orq %r9,%r12 - -.byte 102,73,15,126,208 -.byte 102,73,15,126,217 - orq %r8,%r12 -.byte 0x3e - jnz L$add_proceedq - - - - testq %r9,%r9 - jz L$add_doubleq - - - - - - -.byte 102,72,15,126,199 - pxor %xmm0,%xmm0 - movdqu %xmm0,0(%rdi) - movdqu %xmm0,16(%rdi) - movdqu %xmm0,32(%rdi) - movdqu %xmm0,48(%rdi) - movdqu %xmm0,64(%rdi) - movdqu %xmm0,80(%rdi) - jmp L$add_doneq - -.p2align 5 -L$add_doubleq: -.byte 102,72,15,126,206 -.byte 102,72,15,126,199 - addq $416,%rsp - - jmp L$point_double_shortcutq - - -.p2align 5 -L$add_proceedq: - movq 0+64(%rsp),%rax - movq 8+64(%rsp),%r14 - leaq 0+64(%rsp),%rsi - movq 16+64(%rsp),%r15 - movq 24+64(%rsp),%r8 - leaq 96(%rsp),%rdi - call __ecp_nistz256_sqr_montq - - movq 448(%rsp),%rax - leaq 448(%rsp),%rbx - movq 0+0(%rsp),%r9 - movq 8+0(%rsp),%r10 - leaq 0+0(%rsp),%rsi - movq 16+0(%rsp),%r11 - movq 24+0(%rsp),%r12 - leaq 352(%rsp),%rdi - call __ecp_nistz256_mul_montq - - movq 0+0(%rsp),%rax - movq 8+0(%rsp),%r14 - leaq 0+0(%rsp),%rsi - movq 16+0(%rsp),%r15 - movq 24+0(%rsp),%r8 - leaq 32(%rsp),%rdi - call __ecp_nistz256_sqr_montq - - movq 544(%rsp),%rax - leaq 544(%rsp),%rbx - movq 0+352(%rsp),%r9 - movq 8+352(%rsp),%r10 - leaq 0+352(%rsp),%rsi - movq 16+352(%rsp),%r11 - movq 24+352(%rsp),%r12 - leaq 352(%rsp),%rdi - call __ecp_nistz256_mul_montq - - movq 0(%rsp),%rax - leaq 0(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq 0+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 128(%rsp),%rdi - call __ecp_nistz256_mul_montq - - movq 160(%rsp),%rax - leaq 160(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq 0+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 192(%rsp),%rdi - call __ecp_nistz256_mul_montq - - - - - xorq %r11,%r11 - addq %r12,%r12 - leaq 96(%rsp),%rsi - 
adcq %r13,%r13 - movq %r12,%rax - adcq %r8,%r8 - adcq %r9,%r9 - movq %r13,%rbp - adcq $0,%r11 - - subq $-1,%r12 - movq %r8,%rcx - sbbq %r14,%r13 - sbbq $0,%r8 - movq %r9,%r10 - sbbq %r15,%r9 - sbbq $0,%r11 - - cmovcq %rax,%r12 - movq 0(%rsi),%rax - cmovcq %rbp,%r13 - movq 8(%rsi),%rbp - cmovcq %rcx,%r8 - movq 16(%rsi),%rcx - cmovcq %r10,%r9 - movq 24(%rsi),%r10 - - call __ecp_nistz256_subq - - leaq 128(%rsp),%rbx - leaq 288(%rsp),%rdi - call __ecp_nistz256_sub_fromq - - movq 192+0(%rsp),%rax - movq 192+8(%rsp),%rbp - movq 192+16(%rsp),%rcx - movq 192+24(%rsp),%r10 - leaq 320(%rsp),%rdi - - call __ecp_nistz256_subq - - movq %r12,0(%rdi) - movq %r13,8(%rdi) - movq %r8,16(%rdi) - movq %r9,24(%rdi) - movq 128(%rsp),%rax - leaq 128(%rsp),%rbx - movq 0+224(%rsp),%r9 - movq 8+224(%rsp),%r10 - leaq 0+224(%rsp),%rsi - movq 16+224(%rsp),%r11 - movq 24+224(%rsp),%r12 - leaq 256(%rsp),%rdi - call __ecp_nistz256_mul_montq - - movq 320(%rsp),%rax - leaq 320(%rsp),%rbx - movq 0+64(%rsp),%r9 - movq 8+64(%rsp),%r10 - leaq 0+64(%rsp),%rsi - movq 16+64(%rsp),%r11 - movq 24+64(%rsp),%r12 - leaq 320(%rsp),%rdi - call __ecp_nistz256_mul_montq - - leaq 256(%rsp),%rbx - leaq 320(%rsp),%rdi - call __ecp_nistz256_sub_fromq - -.byte 102,72,15,126,199 - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 352(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 352+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand 544(%rsp),%xmm2 - pand 544+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 448(%rsp),%xmm2 - pand 448+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,64(%rdi) - movdqu %xmm3,80(%rdi) - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 288(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 288+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand 480(%rsp),%xmm2 - pand 480+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn 
%xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 384(%rsp),%xmm2 - pand 384+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,0(%rdi) - movdqu %xmm3,16(%rdi) - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 320(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 320+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand 512(%rsp),%xmm2 - pand 512+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 416(%rsp),%xmm2 - pand 416+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,32(%rdi) - movdqu %xmm3,48(%rdi) - -L$add_doneq: - leaq 576+56(%rsp),%rsi - - movq -48(%rsi),%r15 - - movq -40(%rsi),%r14 - - movq -32(%rsi),%r13 - - movq -24(%rsi),%r12 - - movq -16(%rsi),%rbx - - movq -8(%rsi),%rbp - - leaq (%rsi),%rsp - -L$point_addq_epilogue: - .byte 0xf3,0xc3 - - -.globl _ecp_nistz256_point_add_affine -.private_extern _ecp_nistz256_point_add_affine - -.p2align 5 -_ecp_nistz256_point_add_affine: - - leaq _OPENSSL_ia32cap_P(%rip),%rcx - movq 8(%rcx),%rcx - andl $0x80100,%ecx - cmpl $0x80100,%ecx - je L$point_add_affinex - pushq %rbp - - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - subq $480+8,%rsp - -L$add_affineq_body: - - movdqu 0(%rsi),%xmm0 - movq %rdx,%rbx - movdqu 16(%rsi),%xmm1 - movdqu 32(%rsi),%xmm2 - movdqu 48(%rsi),%xmm3 - movdqu 64(%rsi),%xmm4 - movdqu 80(%rsi),%xmm5 - movq 64+0(%rsi),%rax - movq 64+8(%rsi),%r14 - movq 64+16(%rsi),%r15 - movq 64+24(%rsi),%r8 - movdqa %xmm0,320(%rsp) - movdqa %xmm1,320+16(%rsp) - movdqa %xmm2,352(%rsp) - movdqa %xmm3,352+16(%rsp) - movdqa %xmm4,384(%rsp) - movdqa %xmm5,384+16(%rsp) - por %xmm4,%xmm5 - - movdqu 0(%rbx),%xmm0 - pshufd $0xb1,%xmm5,%xmm3 - movdqu 16(%rbx),%xmm1 - movdqu 32(%rbx),%xmm2 - por %xmm3,%xmm5 - movdqu 48(%rbx),%xmm3 - movdqa %xmm0,416(%rsp) - pshufd $0x1e,%xmm5,%xmm4 - movdqa %xmm1,416+16(%rsp) - por 
%xmm0,%xmm1 -.byte 102,72,15,110,199 - movdqa %xmm2,448(%rsp) - movdqa %xmm3,448+16(%rsp) - por %xmm2,%xmm3 - por %xmm4,%xmm5 - pxor %xmm4,%xmm4 - por %xmm1,%xmm3 - - leaq 64-0(%rsi),%rsi - leaq 32(%rsp),%rdi - call __ecp_nistz256_sqr_montq - - pcmpeqd %xmm4,%xmm5 - pshufd $0xb1,%xmm3,%xmm4 - movq 0(%rbx),%rax - - movq %r12,%r9 - por %xmm3,%xmm4 - pshufd $0,%xmm5,%xmm5 - pshufd $0x1e,%xmm4,%xmm3 - movq %r13,%r10 - por %xmm3,%xmm4 - pxor %xmm3,%xmm3 - movq %r14,%r11 - pcmpeqd %xmm3,%xmm4 - pshufd $0,%xmm4,%xmm4 - - leaq 32-0(%rsp),%rsi - movq %r15,%r12 - leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_montq - - leaq 320(%rsp),%rbx - leaq 64(%rsp),%rdi - call __ecp_nistz256_sub_fromq - - movq 384(%rsp),%rax - leaq 384(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq 0+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 32(%rsp),%rdi - call __ecp_nistz256_mul_montq - - movq 384(%rsp),%rax - leaq 384(%rsp),%rbx - movq 0+64(%rsp),%r9 - movq 8+64(%rsp),%r10 - leaq 0+64(%rsp),%rsi - movq 16+64(%rsp),%r11 - movq 24+64(%rsp),%r12 - leaq 288(%rsp),%rdi - call __ecp_nistz256_mul_montq - - movq 448(%rsp),%rax - leaq 448(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq 0+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 32(%rsp),%rdi - call __ecp_nistz256_mul_montq - - leaq 352(%rsp),%rbx - leaq 96(%rsp),%rdi - call __ecp_nistz256_sub_fromq - - movq 0+64(%rsp),%rax - movq 8+64(%rsp),%r14 - leaq 0+64(%rsp),%rsi - movq 16+64(%rsp),%r15 - movq 24+64(%rsp),%r8 - leaq 128(%rsp),%rdi - call __ecp_nistz256_sqr_montq - - movq 0+96(%rsp),%rax - movq 8+96(%rsp),%r14 - leaq 0+96(%rsp),%rsi - movq 16+96(%rsp),%r15 - movq 24+96(%rsp),%r8 - leaq 192(%rsp),%rdi - call __ecp_nistz256_sqr_montq - - movq 128(%rsp),%rax - leaq 128(%rsp),%rbx - movq 0+64(%rsp),%r9 - movq 8+64(%rsp),%r10 - leaq 0+64(%rsp),%rsi - movq 16+64(%rsp),%r11 - movq 24+64(%rsp),%r12 - leaq 160(%rsp),%rdi - call __ecp_nistz256_mul_montq - - movq 320(%rsp),%rax - 
leaq 320(%rsp),%rbx - movq 0+128(%rsp),%r9 - movq 8+128(%rsp),%r10 - leaq 0+128(%rsp),%rsi - movq 16+128(%rsp),%r11 - movq 24+128(%rsp),%r12 - leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_montq - - - - - xorq %r11,%r11 - addq %r12,%r12 - leaq 192(%rsp),%rsi - adcq %r13,%r13 - movq %r12,%rax - adcq %r8,%r8 - adcq %r9,%r9 - movq %r13,%rbp - adcq $0,%r11 - - subq $-1,%r12 - movq %r8,%rcx - sbbq %r14,%r13 - sbbq $0,%r8 - movq %r9,%r10 - sbbq %r15,%r9 - sbbq $0,%r11 - - cmovcq %rax,%r12 - movq 0(%rsi),%rax - cmovcq %rbp,%r13 - movq 8(%rsi),%rbp - cmovcq %rcx,%r8 - movq 16(%rsi),%rcx - cmovcq %r10,%r9 - movq 24(%rsi),%r10 - - call __ecp_nistz256_subq - - leaq 160(%rsp),%rbx - leaq 224(%rsp),%rdi - call __ecp_nistz256_sub_fromq - - movq 0+0(%rsp),%rax - movq 0+8(%rsp),%rbp - movq 0+16(%rsp),%rcx - movq 0+24(%rsp),%r10 - leaq 64(%rsp),%rdi - - call __ecp_nistz256_subq - - movq %r12,0(%rdi) - movq %r13,8(%rdi) - movq %r8,16(%rdi) - movq %r9,24(%rdi) - movq 352(%rsp),%rax - leaq 352(%rsp),%rbx - movq 0+160(%rsp),%r9 - movq 8+160(%rsp),%r10 - leaq 0+160(%rsp),%rsi - movq 16+160(%rsp),%r11 - movq 24+160(%rsp),%r12 - leaq 32(%rsp),%rdi - call __ecp_nistz256_mul_montq - - movq 96(%rsp),%rax - leaq 96(%rsp),%rbx - movq 0+64(%rsp),%r9 - movq 8+64(%rsp),%r10 - leaq 0+64(%rsp),%rsi - movq 16+64(%rsp),%r11 - movq 24+64(%rsp),%r12 - leaq 64(%rsp),%rdi - call __ecp_nistz256_mul_montq - - leaq 32(%rsp),%rbx - leaq 256(%rsp),%rdi - call __ecp_nistz256_sub_fromq - -.byte 102,72,15,126,199 - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 288(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 288+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand L$ONE_mont(%rip),%xmm2 - pand L$ONE_mont+16(%rip),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 384(%rsp),%xmm2 - pand 384+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,64(%rdi) - movdqu %xmm3,80(%rdi) - - movdqa 
%xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 224(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 224+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand 416(%rsp),%xmm2 - pand 416+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 320(%rsp),%xmm2 - pand 320+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,0(%rdi) - movdqu %xmm3,16(%rdi) - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 256(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 256+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand 448(%rsp),%xmm2 - pand 448+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 352(%rsp),%xmm2 - pand 352+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,32(%rdi) - movdqu %xmm3,48(%rdi) - - leaq 480+56(%rsp),%rsi - - movq -48(%rsi),%r15 - - movq -40(%rsi),%r14 - - movq -32(%rsi),%r13 - - movq -24(%rsi),%r12 - - movq -16(%rsi),%rbx - - movq -8(%rsi),%rbp - - leaq (%rsi),%rsp - -L$add_affineq_epilogue: - .byte 0xf3,0xc3 - - - -.p2align 5 -__ecp_nistz256_add_tox: - - xorq %r11,%r11 - adcq 0(%rbx),%r12 - adcq 8(%rbx),%r13 - movq %r12,%rax - adcq 16(%rbx),%r8 - adcq 24(%rbx),%r9 - movq %r13,%rbp - adcq $0,%r11 - - xorq %r10,%r10 - sbbq $-1,%r12 - movq %r8,%rcx - sbbq %r14,%r13 - sbbq $0,%r8 - movq %r9,%r10 - sbbq %r15,%r9 - sbbq $0,%r11 - - cmovcq %rax,%r12 - cmovcq %rbp,%r13 - movq %r12,0(%rdi) - cmovcq %rcx,%r8 - movq %r13,8(%rdi) - cmovcq %r10,%r9 - movq %r8,16(%rdi) - movq %r9,24(%rdi) - - .byte 0xf3,0xc3 - - - - -.p2align 5 -__ecp_nistz256_sub_fromx: - - xorq %r11,%r11 - sbbq 0(%rbx),%r12 - sbbq 8(%rbx),%r13 - movq %r12,%rax - sbbq 16(%rbx),%r8 - sbbq 24(%rbx),%r9 - movq %r13,%rbp - sbbq $0,%r11 - - xorq %r10,%r10 - adcq $-1,%r12 - movq %r8,%rcx - adcq %r14,%r13 - adcq $0,%r8 - movq %r9,%r10 - adcq %r15,%r9 - - btq 
$0,%r11 - cmovncq %rax,%r12 - cmovncq %rbp,%r13 - movq %r12,0(%rdi) - cmovncq %rcx,%r8 - movq %r13,8(%rdi) - cmovncq %r10,%r9 - movq %r8,16(%rdi) - movq %r9,24(%rdi) - - .byte 0xf3,0xc3 - - - - -.p2align 5 -__ecp_nistz256_subx: - - xorq %r11,%r11 - sbbq %r12,%rax - sbbq %r13,%rbp - movq %rax,%r12 - sbbq %r8,%rcx - sbbq %r9,%r10 - movq %rbp,%r13 - sbbq $0,%r11 - - xorq %r9,%r9 - adcq $-1,%rax - movq %rcx,%r8 - adcq %r14,%rbp - adcq $0,%rcx - movq %r10,%r9 - adcq %r15,%r10 - - btq $0,%r11 - cmovcq %rax,%r12 - cmovcq %rbp,%r13 - cmovcq %rcx,%r8 - cmovcq %r10,%r9 - - .byte 0xf3,0xc3 - - - - -.p2align 5 -__ecp_nistz256_mul_by_2x: - - xorq %r11,%r11 - adcq %r12,%r12 - adcq %r13,%r13 - movq %r12,%rax - adcq %r8,%r8 - adcq %r9,%r9 - movq %r13,%rbp - adcq $0,%r11 - - xorq %r10,%r10 - sbbq $-1,%r12 - movq %r8,%rcx - sbbq %r14,%r13 - sbbq $0,%r8 - movq %r9,%r10 - sbbq %r15,%r9 - sbbq $0,%r11 - - cmovcq %rax,%r12 - cmovcq %rbp,%r13 - movq %r12,0(%rdi) - cmovcq %rcx,%r8 - movq %r13,8(%rdi) - cmovcq %r10,%r9 - movq %r8,16(%rdi) - movq %r9,24(%rdi) - - .byte 0xf3,0xc3 - - - -.p2align 5 -ecp_nistz256_point_doublex: - -L$point_doublex: - pushq %rbp - - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - subq $160+8,%rsp - -L$point_doublex_body: - -L$point_double_shortcutx: - movdqu 0(%rsi),%xmm0 - movq %rsi,%rbx - movdqu 16(%rsi),%xmm1 - movq 32+0(%rsi),%r12 - movq 32+8(%rsi),%r13 - movq 32+16(%rsi),%r8 - movq 32+24(%rsi),%r9 - movq L$poly+8(%rip),%r14 - movq L$poly+24(%rip),%r15 - movdqa %xmm0,96(%rsp) - movdqa %xmm1,96+16(%rsp) - leaq 32(%rdi),%r10 - leaq 64(%rdi),%r11 -.byte 102,72,15,110,199 -.byte 102,73,15,110,202 -.byte 102,73,15,110,211 - - leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_by_2x - - movq 64+0(%rsi),%rdx - movq 64+8(%rsi),%r14 - movq 64+16(%rsi),%r15 - movq 64+24(%rsi),%r8 - leaq 64-128(%rsi),%rsi - leaq 64(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - movq 0+0(%rsp),%rdx - movq 8+0(%rsp),%r14 - leaq -128+0(%rsp),%rsi - movq 16+0(%rsp),%r15 
- movq 24+0(%rsp),%r8 - leaq 0(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - movq 32(%rbx),%rdx - movq 64+0(%rbx),%r9 - movq 64+8(%rbx),%r10 - movq 64+16(%rbx),%r11 - movq 64+24(%rbx),%r12 - leaq 64-128(%rbx),%rsi - leaq 32(%rbx),%rbx -.byte 102,72,15,126,215 - call __ecp_nistz256_mul_montx - call __ecp_nistz256_mul_by_2x - - movq 96+0(%rsp),%r12 - movq 96+8(%rsp),%r13 - leaq 64(%rsp),%rbx - movq 96+16(%rsp),%r8 - movq 96+24(%rsp),%r9 - leaq 32(%rsp),%rdi - call __ecp_nistz256_add_tox - - movq 96+0(%rsp),%r12 - movq 96+8(%rsp),%r13 - leaq 64(%rsp),%rbx - movq 96+16(%rsp),%r8 - movq 96+24(%rsp),%r9 - leaq 64(%rsp),%rdi - call __ecp_nistz256_sub_fromx - - movq 0+0(%rsp),%rdx - movq 8+0(%rsp),%r14 - leaq -128+0(%rsp),%rsi - movq 16+0(%rsp),%r15 - movq 24+0(%rsp),%r8 -.byte 102,72,15,126,207 - call __ecp_nistz256_sqr_montx - xorq %r9,%r9 - movq %r12,%rax - addq $-1,%r12 - movq %r13,%r10 - adcq %rsi,%r13 - movq %r14,%rcx - adcq $0,%r14 - movq %r15,%r8 - adcq %rbp,%r15 - adcq $0,%r9 - xorq %rsi,%rsi - testq $1,%rax - - cmovzq %rax,%r12 - cmovzq %r10,%r13 - cmovzq %rcx,%r14 - cmovzq %r8,%r15 - cmovzq %rsi,%r9 - - movq %r13,%rax - shrq $1,%r12 - shlq $63,%rax - movq %r14,%r10 - shrq $1,%r13 - orq %rax,%r12 - shlq $63,%r10 - movq %r15,%rcx - shrq $1,%r14 - orq %r10,%r13 - shlq $63,%rcx - movq %r12,0(%rdi) - shrq $1,%r15 - movq %r13,8(%rdi) - shlq $63,%r9 - orq %rcx,%r14 - orq %r9,%r15 - movq %r14,16(%rdi) - movq %r15,24(%rdi) - movq 64(%rsp),%rdx - leaq 64(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq -128+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 32(%rsp),%rdi - call __ecp_nistz256_mul_montx - - leaq 128(%rsp),%rdi - call __ecp_nistz256_mul_by_2x - - leaq 32(%rsp),%rbx - leaq 32(%rsp),%rdi - call __ecp_nistz256_add_tox - - movq 96(%rsp),%rdx - leaq 96(%rsp),%rbx - movq 0+0(%rsp),%r9 - movq 8+0(%rsp),%r10 - leaq -128+0(%rsp),%rsi - movq 16+0(%rsp),%r11 - movq 24+0(%rsp),%r12 - leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_montx - - 
leaq 128(%rsp),%rdi - call __ecp_nistz256_mul_by_2x - - movq 0+32(%rsp),%rdx - movq 8+32(%rsp),%r14 - leaq -128+32(%rsp),%rsi - movq 16+32(%rsp),%r15 - movq 24+32(%rsp),%r8 -.byte 102,72,15,126,199 - call __ecp_nistz256_sqr_montx - - leaq 128(%rsp),%rbx - movq %r14,%r8 - movq %r15,%r9 - movq %rsi,%r14 - movq %rbp,%r15 - call __ecp_nistz256_sub_fromx - - movq 0+0(%rsp),%rax - movq 0+8(%rsp),%rbp - movq 0+16(%rsp),%rcx - movq 0+24(%rsp),%r10 - leaq 0(%rsp),%rdi - call __ecp_nistz256_subx - - movq 32(%rsp),%rdx - leaq 32(%rsp),%rbx - movq %r12,%r14 - xorl %ecx,%ecx - movq %r12,0+0(%rsp) - movq %r13,%r10 - movq %r13,0+8(%rsp) - cmovzq %r8,%r11 - movq %r8,0+16(%rsp) - leaq 0-128(%rsp),%rsi - cmovzq %r9,%r12 - movq %r9,0+24(%rsp) - movq %r14,%r9 - leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_montx - -.byte 102,72,15,126,203 -.byte 102,72,15,126,207 - call __ecp_nistz256_sub_fromx - - leaq 160+56(%rsp),%rsi - - movq -48(%rsi),%r15 - - movq -40(%rsi),%r14 - - movq -32(%rsi),%r13 - - movq -24(%rsi),%r12 - - movq -16(%rsi),%rbx - - movq -8(%rsi),%rbp - - leaq (%rsi),%rsp - -L$point_doublex_epilogue: - .byte 0xf3,0xc3 - - - -.p2align 5 -ecp_nistz256_point_addx: - -L$point_addx: - pushq %rbp - - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - subq $576+8,%rsp - -L$point_addx_body: - - movdqu 0(%rsi),%xmm0 - movdqu 16(%rsi),%xmm1 - movdqu 32(%rsi),%xmm2 - movdqu 48(%rsi),%xmm3 - movdqu 64(%rsi),%xmm4 - movdqu 80(%rsi),%xmm5 - movq %rsi,%rbx - movq %rdx,%rsi - movdqa %xmm0,384(%rsp) - movdqa %xmm1,384+16(%rsp) - movdqa %xmm2,416(%rsp) - movdqa %xmm3,416+16(%rsp) - movdqa %xmm4,448(%rsp) - movdqa %xmm5,448+16(%rsp) - por %xmm4,%xmm5 - - movdqu 0(%rsi),%xmm0 - pshufd $0xb1,%xmm5,%xmm3 - movdqu 16(%rsi),%xmm1 - movdqu 32(%rsi),%xmm2 - por %xmm3,%xmm5 - movdqu 48(%rsi),%xmm3 - movq 64+0(%rsi),%rdx - movq 64+8(%rsi),%r14 - movq 64+16(%rsi),%r15 - movq 64+24(%rsi),%r8 - movdqa %xmm0,480(%rsp) - pshufd $0x1e,%xmm5,%xmm4 - movdqa %xmm1,480+16(%rsp) - movdqu 
64(%rsi),%xmm0 - movdqu 80(%rsi),%xmm1 - movdqa %xmm2,512(%rsp) - movdqa %xmm3,512+16(%rsp) - por %xmm4,%xmm5 - pxor %xmm4,%xmm4 - por %xmm0,%xmm1 -.byte 102,72,15,110,199 - - leaq 64-128(%rsi),%rsi - movq %rdx,544+0(%rsp) - movq %r14,544+8(%rsp) - movq %r15,544+16(%rsp) - movq %r8,544+24(%rsp) - leaq 96(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - pcmpeqd %xmm4,%xmm5 - pshufd $0xb1,%xmm1,%xmm4 - por %xmm1,%xmm4 - pshufd $0,%xmm5,%xmm5 - pshufd $0x1e,%xmm4,%xmm3 - por %xmm3,%xmm4 - pxor %xmm3,%xmm3 - pcmpeqd %xmm3,%xmm4 - pshufd $0,%xmm4,%xmm4 - movq 64+0(%rbx),%rdx - movq 64+8(%rbx),%r14 - movq 64+16(%rbx),%r15 - movq 64+24(%rbx),%r8 -.byte 102,72,15,110,203 - - leaq 64-128(%rbx),%rsi - leaq 32(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - movq 544(%rsp),%rdx - leaq 544(%rsp),%rbx - movq 0+96(%rsp),%r9 - movq 8+96(%rsp),%r10 - leaq -128+96(%rsp),%rsi - movq 16+96(%rsp),%r11 - movq 24+96(%rsp),%r12 - leaq 224(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 448(%rsp),%rdx - leaq 448(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq -128+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 256(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 416(%rsp),%rdx - leaq 416(%rsp),%rbx - movq 0+224(%rsp),%r9 - movq 8+224(%rsp),%r10 - leaq -128+224(%rsp),%rsi - movq 16+224(%rsp),%r11 - movq 24+224(%rsp),%r12 - leaq 224(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 512(%rsp),%rdx - leaq 512(%rsp),%rbx - movq 0+256(%rsp),%r9 - movq 8+256(%rsp),%r10 - leaq -128+256(%rsp),%rsi - movq 16+256(%rsp),%r11 - movq 24+256(%rsp),%r12 - leaq 256(%rsp),%rdi - call __ecp_nistz256_mul_montx - - leaq 224(%rsp),%rbx - leaq 64(%rsp),%rdi - call __ecp_nistz256_sub_fromx - - orq %r13,%r12 - movdqa %xmm4,%xmm2 - orq %r8,%r12 - orq %r9,%r12 - por %xmm5,%xmm2 -.byte 102,73,15,110,220 - - movq 384(%rsp),%rdx - leaq 384(%rsp),%rbx - movq 0+96(%rsp),%r9 - movq 8+96(%rsp),%r10 - leaq -128+96(%rsp),%rsi - movq 16+96(%rsp),%r11 - movq 24+96(%rsp),%r12 - leaq 
160(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 480(%rsp),%rdx - leaq 480(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq -128+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 192(%rsp),%rdi - call __ecp_nistz256_mul_montx - - leaq 160(%rsp),%rbx - leaq 0(%rsp),%rdi - call __ecp_nistz256_sub_fromx - - orq %r13,%r12 - orq %r8,%r12 - orq %r9,%r12 - -.byte 102,73,15,126,208 -.byte 102,73,15,126,217 - orq %r8,%r12 -.byte 0x3e - jnz L$add_proceedx - - - - testq %r9,%r9 - jz L$add_doublex - - - - - - -.byte 102,72,15,126,199 - pxor %xmm0,%xmm0 - movdqu %xmm0,0(%rdi) - movdqu %xmm0,16(%rdi) - movdqu %xmm0,32(%rdi) - movdqu %xmm0,48(%rdi) - movdqu %xmm0,64(%rdi) - movdqu %xmm0,80(%rdi) - jmp L$add_donex - -.p2align 5 -L$add_doublex: -.byte 102,72,15,126,206 -.byte 102,72,15,126,199 - addq $416,%rsp - - jmp L$point_double_shortcutx - - -.p2align 5 -L$add_proceedx: - movq 0+64(%rsp),%rdx - movq 8+64(%rsp),%r14 - leaq -128+64(%rsp),%rsi - movq 16+64(%rsp),%r15 - movq 24+64(%rsp),%r8 - leaq 96(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - movq 448(%rsp),%rdx - leaq 448(%rsp),%rbx - movq 0+0(%rsp),%r9 - movq 8+0(%rsp),%r10 - leaq -128+0(%rsp),%rsi - movq 16+0(%rsp),%r11 - movq 24+0(%rsp),%r12 - leaq 352(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 0+0(%rsp),%rdx - movq 8+0(%rsp),%r14 - leaq -128+0(%rsp),%rsi - movq 16+0(%rsp),%r15 - movq 24+0(%rsp),%r8 - leaq 32(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - movq 544(%rsp),%rdx - leaq 544(%rsp),%rbx - movq 0+352(%rsp),%r9 - movq 8+352(%rsp),%r10 - leaq -128+352(%rsp),%rsi - movq 16+352(%rsp),%r11 - movq 24+352(%rsp),%r12 - leaq 352(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 0(%rsp),%rdx - leaq 0(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq -128+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 128(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 160(%rsp),%rdx - leaq 160(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - 
leaq -128+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 192(%rsp),%rdi - call __ecp_nistz256_mul_montx - - - - - xorq %r11,%r11 - addq %r12,%r12 - leaq 96(%rsp),%rsi - adcq %r13,%r13 - movq %r12,%rax - adcq %r8,%r8 - adcq %r9,%r9 - movq %r13,%rbp - adcq $0,%r11 - - subq $-1,%r12 - movq %r8,%rcx - sbbq %r14,%r13 - sbbq $0,%r8 - movq %r9,%r10 - sbbq %r15,%r9 - sbbq $0,%r11 - - cmovcq %rax,%r12 - movq 0(%rsi),%rax - cmovcq %rbp,%r13 - movq 8(%rsi),%rbp - cmovcq %rcx,%r8 - movq 16(%rsi),%rcx - cmovcq %r10,%r9 - movq 24(%rsi),%r10 - - call __ecp_nistz256_subx - - leaq 128(%rsp),%rbx - leaq 288(%rsp),%rdi - call __ecp_nistz256_sub_fromx - - movq 192+0(%rsp),%rax - movq 192+8(%rsp),%rbp - movq 192+16(%rsp),%rcx - movq 192+24(%rsp),%r10 - leaq 320(%rsp),%rdi - - call __ecp_nistz256_subx - - movq %r12,0(%rdi) - movq %r13,8(%rdi) - movq %r8,16(%rdi) - movq %r9,24(%rdi) - movq 128(%rsp),%rdx - leaq 128(%rsp),%rbx - movq 0+224(%rsp),%r9 - movq 8+224(%rsp),%r10 - leaq -128+224(%rsp),%rsi - movq 16+224(%rsp),%r11 - movq 24+224(%rsp),%r12 - leaq 256(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 320(%rsp),%rdx - leaq 320(%rsp),%rbx - movq 0+64(%rsp),%r9 - movq 8+64(%rsp),%r10 - leaq -128+64(%rsp),%rsi - movq 16+64(%rsp),%r11 - movq 24+64(%rsp),%r12 - leaq 320(%rsp),%rdi - call __ecp_nistz256_mul_montx - - leaq 256(%rsp),%rbx - leaq 320(%rsp),%rdi - call __ecp_nistz256_sub_fromx - -.byte 102,72,15,126,199 - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 352(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 352+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand 544(%rsp),%xmm2 - pand 544+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 448(%rsp),%xmm2 - pand 448+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,64(%rdi) - movdqu %xmm3,80(%rdi) - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 288(%rsp),%xmm0 - movdqa 
%xmm5,%xmm2 - pandn 288+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand 480(%rsp),%xmm2 - pand 480+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 384(%rsp),%xmm2 - pand 384+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,0(%rdi) - movdqu %xmm3,16(%rdi) - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 320(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 320+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand 512(%rsp),%xmm2 - pand 512+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 416(%rsp),%xmm2 - pand 416+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,32(%rdi) - movdqu %xmm3,48(%rdi) - -L$add_donex: - leaq 576+56(%rsp),%rsi - - movq -48(%rsi),%r15 - - movq -40(%rsi),%r14 - - movq -32(%rsi),%r13 - - movq -24(%rsi),%r12 - - movq -16(%rsi),%rbx - - movq -8(%rsi),%rbp - - leaq (%rsi),%rsp - -L$point_addx_epilogue: - .byte 0xf3,0xc3 - - - -.p2align 5 -ecp_nistz256_point_add_affinex: - -L$point_add_affinex: - pushq %rbp - - pushq %rbx - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - subq $480+8,%rsp - -L$add_affinex_body: - - movdqu 0(%rsi),%xmm0 - movq %rdx,%rbx - movdqu 16(%rsi),%xmm1 - movdqu 32(%rsi),%xmm2 - movdqu 48(%rsi),%xmm3 - movdqu 64(%rsi),%xmm4 - movdqu 80(%rsi),%xmm5 - movq 64+0(%rsi),%rdx - movq 64+8(%rsi),%r14 - movq 64+16(%rsi),%r15 - movq 64+24(%rsi),%r8 - movdqa %xmm0,320(%rsp) - movdqa %xmm1,320+16(%rsp) - movdqa %xmm2,352(%rsp) - movdqa %xmm3,352+16(%rsp) - movdqa %xmm4,384(%rsp) - movdqa %xmm5,384+16(%rsp) - por %xmm4,%xmm5 - - movdqu 0(%rbx),%xmm0 - pshufd $0xb1,%xmm5,%xmm3 - movdqu 16(%rbx),%xmm1 - movdqu 32(%rbx),%xmm2 - por %xmm3,%xmm5 - movdqu 48(%rbx),%xmm3 - movdqa %xmm0,416(%rsp) - pshufd $0x1e,%xmm5,%xmm4 - movdqa %xmm1,416+16(%rsp) - por 
%xmm0,%xmm1 -.byte 102,72,15,110,199 - movdqa %xmm2,448(%rsp) - movdqa %xmm3,448+16(%rsp) - por %xmm2,%xmm3 - por %xmm4,%xmm5 - pxor %xmm4,%xmm4 - por %xmm1,%xmm3 - - leaq 64-128(%rsi),%rsi - leaq 32(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - pcmpeqd %xmm4,%xmm5 - pshufd $0xb1,%xmm3,%xmm4 - movq 0(%rbx),%rdx - - movq %r12,%r9 - por %xmm3,%xmm4 - pshufd $0,%xmm5,%xmm5 - pshufd $0x1e,%xmm4,%xmm3 - movq %r13,%r10 - por %xmm3,%xmm4 - pxor %xmm3,%xmm3 - movq %r14,%r11 - pcmpeqd %xmm3,%xmm4 - pshufd $0,%xmm4,%xmm4 - - leaq 32-128(%rsp),%rsi - movq %r15,%r12 - leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_montx - - leaq 320(%rsp),%rbx - leaq 64(%rsp),%rdi - call __ecp_nistz256_sub_fromx - - movq 384(%rsp),%rdx - leaq 384(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq -128+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 32(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 384(%rsp),%rdx - leaq 384(%rsp),%rbx - movq 0+64(%rsp),%r9 - movq 8+64(%rsp),%r10 - leaq -128+64(%rsp),%rsi - movq 16+64(%rsp),%r11 - movq 24+64(%rsp),%r12 - leaq 288(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 448(%rsp),%rdx - leaq 448(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq -128+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 32(%rsp),%rdi - call __ecp_nistz256_mul_montx - - leaq 352(%rsp),%rbx - leaq 96(%rsp),%rdi - call __ecp_nistz256_sub_fromx - - movq 0+64(%rsp),%rdx - movq 8+64(%rsp),%r14 - leaq -128+64(%rsp),%rsi - movq 16+64(%rsp),%r15 - movq 24+64(%rsp),%r8 - leaq 128(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - movq 0+96(%rsp),%rdx - movq 8+96(%rsp),%r14 - leaq -128+96(%rsp),%rsi - movq 16+96(%rsp),%r15 - movq 24+96(%rsp),%r8 - leaq 192(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - movq 128(%rsp),%rdx - leaq 128(%rsp),%rbx - movq 0+64(%rsp),%r9 - movq 8+64(%rsp),%r10 - leaq -128+64(%rsp),%rsi - movq 16+64(%rsp),%r11 - movq 24+64(%rsp),%r12 - leaq 160(%rsp),%rdi - call __ecp_nistz256_mul_montx - - 
movq 320(%rsp),%rdx - leaq 320(%rsp),%rbx - movq 0+128(%rsp),%r9 - movq 8+128(%rsp),%r10 - leaq -128+128(%rsp),%rsi - movq 16+128(%rsp),%r11 - movq 24+128(%rsp),%r12 - leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_montx - - - - - xorq %r11,%r11 - addq %r12,%r12 - leaq 192(%rsp),%rsi - adcq %r13,%r13 - movq %r12,%rax - adcq %r8,%r8 - adcq %r9,%r9 - movq %r13,%rbp - adcq $0,%r11 - - subq $-1,%r12 - movq %r8,%rcx - sbbq %r14,%r13 - sbbq $0,%r8 - movq %r9,%r10 - sbbq %r15,%r9 - sbbq $0,%r11 - - cmovcq %rax,%r12 - movq 0(%rsi),%rax - cmovcq %rbp,%r13 - movq 8(%rsi),%rbp - cmovcq %rcx,%r8 - movq 16(%rsi),%rcx - cmovcq %r10,%r9 - movq 24(%rsi),%r10 - - call __ecp_nistz256_subx - - leaq 160(%rsp),%rbx - leaq 224(%rsp),%rdi - call __ecp_nistz256_sub_fromx - - movq 0+0(%rsp),%rax - movq 0+8(%rsp),%rbp - movq 0+16(%rsp),%rcx - movq 0+24(%rsp),%r10 - leaq 64(%rsp),%rdi - - call __ecp_nistz256_subx - - movq %r12,0(%rdi) - movq %r13,8(%rdi) - movq %r8,16(%rdi) - movq %r9,24(%rdi) - movq 352(%rsp),%rdx - leaq 352(%rsp),%rbx - movq 0+160(%rsp),%r9 - movq 8+160(%rsp),%r10 - leaq -128+160(%rsp),%rsi - movq 16+160(%rsp),%r11 - movq 24+160(%rsp),%r12 - leaq 32(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 96(%rsp),%rdx - leaq 96(%rsp),%rbx - movq 0+64(%rsp),%r9 - movq 8+64(%rsp),%r10 - leaq -128+64(%rsp),%rsi - movq 16+64(%rsp),%r11 - movq 24+64(%rsp),%r12 - leaq 64(%rsp),%rdi - call __ecp_nistz256_mul_montx - - leaq 32(%rsp),%rbx - leaq 256(%rsp),%rdi - call __ecp_nistz256_sub_fromx - -.byte 102,72,15,126,199 - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 288(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 288+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand L$ONE_mont(%rip),%xmm2 - pand L$ONE_mont+16(%rip),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 384(%rsp),%xmm2 - pand 384+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,64(%rdi) - movdqu 
%xmm3,80(%rdi) - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 224(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 224+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand 416(%rsp),%xmm2 - pand 416+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 320(%rsp),%xmm2 - pand 320+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,0(%rdi) - movdqu %xmm3,16(%rdi) - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 256(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 256+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand 448(%rsp),%xmm2 - pand 448+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 352(%rsp),%xmm2 - pand 352+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,32(%rdi) - movdqu %xmm3,48(%rdi) - - leaq 480+56(%rsp),%rsi - - movq -48(%rsi),%r15 - - movq -40(%rsi),%r14 - - movq -32(%rsi),%r13 - - movq -24(%rsi),%r12 - - movq -16(%rsi),%rbx - - movq -8(%rsi),%rbp - - leaq (%rsi),%rsp - -L$add_affinex_epilogue: - .byte 0xf3,0xc3 - - -#endif diff --git a/third_party/boringssl/apple-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S b/third_party/boringssl/apple-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S deleted file mode 100644 index ae7293ac..00000000 --- a/third_party/boringssl/apple-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S +++ /dev/null @@ -1,328 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text - - -.private_extern _beeu_mod_inverse_vartime -.globl _beeu_mod_inverse_vartime -.private_extern _beeu_mod_inverse_vartime -.p2align 5 -_beeu_mod_inverse_vartime: - - pushq %rbp - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - pushq %rbx - - pushq %rsi - - - subq $80,%rsp - - movq %rdi,0(%rsp) - - - movq $1,%r8 - xorq %r9,%r9 - xorq %r10,%r10 - xorq %r11,%r11 - xorq %rdi,%rdi - - xorq %r12,%r12 - xorq %r13,%r13 - xorq %r14,%r14 - xorq %r15,%r15 - xorq %rbp,%rbp - - - vmovdqu 0(%rsi),%xmm0 - vmovdqu 16(%rsi),%xmm1 - vmovdqu %xmm0,48(%rsp) - vmovdqu %xmm1,64(%rsp) - - vmovdqu 0(%rdx),%xmm0 - vmovdqu 16(%rdx),%xmm1 - vmovdqu %xmm0,16(%rsp) - vmovdqu %xmm1,32(%rsp) - -L$beeu_loop: - xorq %rbx,%rbx - orq 48(%rsp),%rbx - orq 56(%rsp),%rbx - orq 64(%rsp),%rbx - orq 72(%rsp),%rbx - jz L$beeu_loop_end - - - - - - - - - - - movq $1,%rcx - - -L$beeu_shift_loop_XB: - movq %rcx,%rbx - andq 48(%rsp),%rbx - jnz L$beeu_shift_loop_end_XB - - - movq $1,%rbx - andq %r8,%rbx - jz L$shift1_0 - addq 0(%rdx),%r8 - adcq 8(%rdx),%r9 - adcq 16(%rdx),%r10 - adcq 24(%rdx),%r11 - adcq $0,%rdi - -L$shift1_0: - shrdq $1,%r9,%r8 - shrdq $1,%r10,%r9 - shrdq $1,%r11,%r10 - shrdq $1,%rdi,%r11 - shrq $1,%rdi - - shlq $1,%rcx - - - - - - cmpq $0x8000000,%rcx - jne L$beeu_shift_loop_XB - -L$beeu_shift_loop_end_XB: - bsfq %rcx,%rcx - testq %rcx,%rcx - jz L$beeu_no_shift_XB - - - - movq 8+48(%rsp),%rax - movq 16+48(%rsp),%rbx - movq 24+48(%rsp),%rsi - - shrdq %cl,%rax,0+48(%rsp) - shrdq %cl,%rbx,8+48(%rsp) - shrdq %cl,%rsi,16+48(%rsp) - - shrq %cl,%rsi - movq %rsi,24+48(%rsp) - - -L$beeu_no_shift_XB: - - movq $1,%rcx - - -L$beeu_shift_loop_YA: - movq %rcx,%rbx - andq 16(%rsp),%rbx - jnz L$beeu_shift_loop_end_YA - - - movq $1,%rbx - andq %r12,%rbx 
- jz L$shift1_1 - addq 0(%rdx),%r12 - adcq 8(%rdx),%r13 - adcq 16(%rdx),%r14 - adcq 24(%rdx),%r15 - adcq $0,%rbp - -L$shift1_1: - shrdq $1,%r13,%r12 - shrdq $1,%r14,%r13 - shrdq $1,%r15,%r14 - shrdq $1,%rbp,%r15 - shrq $1,%rbp - - shlq $1,%rcx - - - - - - cmpq $0x8000000,%rcx - jne L$beeu_shift_loop_YA - -L$beeu_shift_loop_end_YA: - bsfq %rcx,%rcx - testq %rcx,%rcx - jz L$beeu_no_shift_YA - - - - movq 8+16(%rsp),%rax - movq 16+16(%rsp),%rbx - movq 24+16(%rsp),%rsi - - shrdq %cl,%rax,0+16(%rsp) - shrdq %cl,%rbx,8+16(%rsp) - shrdq %cl,%rsi,16+16(%rsp) - - shrq %cl,%rsi - movq %rsi,24+16(%rsp) - - -L$beeu_no_shift_YA: - - movq 48(%rsp),%rax - movq 56(%rsp),%rbx - movq 64(%rsp),%rsi - movq 72(%rsp),%rcx - subq 16(%rsp),%rax - sbbq 24(%rsp),%rbx - sbbq 32(%rsp),%rsi - sbbq 40(%rsp),%rcx - jnc L$beeu_B_bigger_than_A - - - movq 16(%rsp),%rax - movq 24(%rsp),%rbx - movq 32(%rsp),%rsi - movq 40(%rsp),%rcx - subq 48(%rsp),%rax - sbbq 56(%rsp),%rbx - sbbq 64(%rsp),%rsi - sbbq 72(%rsp),%rcx - movq %rax,16(%rsp) - movq %rbx,24(%rsp) - movq %rsi,32(%rsp) - movq %rcx,40(%rsp) - - - addq %r8,%r12 - adcq %r9,%r13 - adcq %r10,%r14 - adcq %r11,%r15 - adcq %rdi,%rbp - jmp L$beeu_loop - -L$beeu_B_bigger_than_A: - - movq %rax,48(%rsp) - movq %rbx,56(%rsp) - movq %rsi,64(%rsp) - movq %rcx,72(%rsp) - - - addq %r12,%r8 - adcq %r13,%r9 - adcq %r14,%r10 - adcq %r15,%r11 - adcq %rbp,%rdi - - jmp L$beeu_loop - -L$beeu_loop_end: - - - - - movq 16(%rsp),%rbx - subq $1,%rbx - orq 24(%rsp),%rbx - orq 32(%rsp),%rbx - orq 40(%rsp),%rbx - - jnz L$beeu_err - - - - - movq 0(%rdx),%r8 - movq 8(%rdx),%r9 - movq 16(%rdx),%r10 - movq 24(%rdx),%r11 - xorq %rdi,%rdi - -L$beeu_reduction_loop: - movq %r12,16(%rsp) - movq %r13,24(%rsp) - movq %r14,32(%rsp) - movq %r15,40(%rsp) - movq %rbp,48(%rsp) - - - subq %r8,%r12 - sbbq %r9,%r13 - sbbq %r10,%r14 - sbbq %r11,%r15 - sbbq $0,%rbp - - - cmovcq 16(%rsp),%r12 - cmovcq 24(%rsp),%r13 - cmovcq 32(%rsp),%r14 - cmovcq 40(%rsp),%r15 - jnc L$beeu_reduction_loop - - - 
subq %r12,%r8 - sbbq %r13,%r9 - sbbq %r14,%r10 - sbbq %r15,%r11 - -L$beeu_save: - - movq 0(%rsp),%rdi - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - - - movq $1,%rax - jmp L$beeu_finish - -L$beeu_err: - - xorq %rax,%rax - -L$beeu_finish: - addq $80,%rsp - - popq %rsi - - popq %rbx - - popq %r15 - - popq %r14 - - popq %r13 - - popq %r12 - - popq %rbp - - .byte 0xf3,0xc3 - - - -#endif diff --git a/third_party/boringssl/apple-x86_64/crypto/fipsmodule/rdrand-x86_64.S b/third_party/boringssl/apple-x86_64/crypto/fipsmodule/rdrand-x86_64.S deleted file mode 100644 index 664c0674..00000000 --- a/third_party/boringssl/apple-x86_64/crypto/fipsmodule/rdrand-x86_64.S +++ /dev/null @@ -1,62 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text - - - - -.globl _CRYPTO_rdrand -.private_extern _CRYPTO_rdrand - -.p2align 4 -_CRYPTO_rdrand: - - xorq %rax,%rax -.byte 72,15,199,242 - - adcq %rax,%rax - movq %rdx,0(%rdi) - .byte 0xf3,0xc3 - - - - - - - -.globl _CRYPTO_rdrand_multiple8_buf -.private_extern _CRYPTO_rdrand_multiple8_buf - -.p2align 4 -_CRYPTO_rdrand_multiple8_buf: - - testq %rsi,%rsi - jz L$out - movq $8,%rdx -L$loop: -.byte 72,15,199,241 - jnc L$err - movq %rcx,0(%rdi) - addq %rdx,%rdi - subq %rdx,%rsi - jnz L$loop -L$out: - movq $1,%rax - .byte 0xf3,0xc3 -L$err: - xorq %rax,%rax - .byte 0xf3,0xc3 - - -#endif diff --git a/third_party/boringssl/apple-x86_64/crypto/fipsmodule/rsaz-avx2.S b/third_party/boringssl/apple-x86_64/crypto/fipsmodule/rsaz-avx2.S deleted file mode 100644 index bebc699a..00000000 --- a/third_party/boringssl/apple-x86_64/crypto/fipsmodule/rsaz-avx2.S +++ /dev/null @@ -1,1748 +0,0 @@ -// This file is 
generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text - -.globl _rsaz_1024_sqr_avx2 -.private_extern _rsaz_1024_sqr_avx2 - -.p2align 6 -_rsaz_1024_sqr_avx2: - - leaq (%rsp),%rax - - pushq %rbx - - pushq %rbp - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - vzeroupper - movq %rax,%rbp - - movq %rdx,%r13 - subq $832,%rsp - movq %r13,%r15 - subq $-128,%rdi - subq $-128,%rsi - subq $-128,%r13 - - andq $4095,%r15 - addq $320,%r15 - shrq $12,%r15 - vpxor %ymm9,%ymm9,%ymm9 - jz L$sqr_1024_no_n_copy - - - - - - subq $320,%rsp - vmovdqu 0-128(%r13),%ymm0 - andq $-2048,%rsp - vmovdqu 32-128(%r13),%ymm1 - vmovdqu 64-128(%r13),%ymm2 - vmovdqu 96-128(%r13),%ymm3 - vmovdqu 128-128(%r13),%ymm4 - vmovdqu 160-128(%r13),%ymm5 - vmovdqu 192-128(%r13),%ymm6 - vmovdqu 224-128(%r13),%ymm7 - vmovdqu 256-128(%r13),%ymm8 - leaq 832+128(%rsp),%r13 - vmovdqu %ymm0,0-128(%r13) - vmovdqu %ymm1,32-128(%r13) - vmovdqu %ymm2,64-128(%r13) - vmovdqu %ymm3,96-128(%r13) - vmovdqu %ymm4,128-128(%r13) - vmovdqu %ymm5,160-128(%r13) - vmovdqu %ymm6,192-128(%r13) - vmovdqu %ymm7,224-128(%r13) - vmovdqu %ymm8,256-128(%r13) - vmovdqu %ymm9,288-128(%r13) - -L$sqr_1024_no_n_copy: - andq $-1024,%rsp - - vmovdqu 32-128(%rsi),%ymm1 - vmovdqu 64-128(%rsi),%ymm2 - vmovdqu 96-128(%rsi),%ymm3 - vmovdqu 128-128(%rsi),%ymm4 - vmovdqu 160-128(%rsi),%ymm5 - vmovdqu 192-128(%rsi),%ymm6 - vmovdqu 224-128(%rsi),%ymm7 - vmovdqu 256-128(%rsi),%ymm8 - - leaq 192(%rsp),%rbx - vmovdqu L$and_mask(%rip),%ymm15 - jmp L$OOP_GRANDE_SQR_1024 - -.p2align 5 -L$OOP_GRANDE_SQR_1024: - leaq 576+128(%rsp),%r9 - leaq 448(%rsp),%r12 - - - - - vpaddq %ymm1,%ymm1,%ymm1 - vpbroadcastq 0-128(%rsi),%ymm10 - vpaddq %ymm2,%ymm2,%ymm2 - vmovdqa 
%ymm1,0-128(%r9) - vpaddq %ymm3,%ymm3,%ymm3 - vmovdqa %ymm2,32-128(%r9) - vpaddq %ymm4,%ymm4,%ymm4 - vmovdqa %ymm3,64-128(%r9) - vpaddq %ymm5,%ymm5,%ymm5 - vmovdqa %ymm4,96-128(%r9) - vpaddq %ymm6,%ymm6,%ymm6 - vmovdqa %ymm5,128-128(%r9) - vpaddq %ymm7,%ymm7,%ymm7 - vmovdqa %ymm6,160-128(%r9) - vpaddq %ymm8,%ymm8,%ymm8 - vmovdqa %ymm7,192-128(%r9) - vpxor %ymm9,%ymm9,%ymm9 - vmovdqa %ymm8,224-128(%r9) - - vpmuludq 0-128(%rsi),%ymm10,%ymm0 - vpbroadcastq 32-128(%rsi),%ymm11 - vmovdqu %ymm9,288-192(%rbx) - vpmuludq %ymm10,%ymm1,%ymm1 - vmovdqu %ymm9,320-448(%r12) - vpmuludq %ymm10,%ymm2,%ymm2 - vmovdqu %ymm9,352-448(%r12) - vpmuludq %ymm10,%ymm3,%ymm3 - vmovdqu %ymm9,384-448(%r12) - vpmuludq %ymm10,%ymm4,%ymm4 - vmovdqu %ymm9,416-448(%r12) - vpmuludq %ymm10,%ymm5,%ymm5 - vmovdqu %ymm9,448-448(%r12) - vpmuludq %ymm10,%ymm6,%ymm6 - vmovdqu %ymm9,480-448(%r12) - vpmuludq %ymm10,%ymm7,%ymm7 - vmovdqu %ymm9,512-448(%r12) - vpmuludq %ymm10,%ymm8,%ymm8 - vpbroadcastq 64-128(%rsi),%ymm10 - vmovdqu %ymm9,544-448(%r12) - - movq %rsi,%r15 - movl $4,%r14d - jmp L$sqr_entry_1024 -.p2align 5 -L$OOP_SQR_1024: - vpbroadcastq 32-128(%r15),%ymm11 - vpmuludq 0-128(%rsi),%ymm10,%ymm0 - vpaddq 0-192(%rbx),%ymm0,%ymm0 - vpmuludq 0-128(%r9),%ymm10,%ymm1 - vpaddq 32-192(%rbx),%ymm1,%ymm1 - vpmuludq 32-128(%r9),%ymm10,%ymm2 - vpaddq 64-192(%rbx),%ymm2,%ymm2 - vpmuludq 64-128(%r9),%ymm10,%ymm3 - vpaddq 96-192(%rbx),%ymm3,%ymm3 - vpmuludq 96-128(%r9),%ymm10,%ymm4 - vpaddq 128-192(%rbx),%ymm4,%ymm4 - vpmuludq 128-128(%r9),%ymm10,%ymm5 - vpaddq 160-192(%rbx),%ymm5,%ymm5 - vpmuludq 160-128(%r9),%ymm10,%ymm6 - vpaddq 192-192(%rbx),%ymm6,%ymm6 - vpmuludq 192-128(%r9),%ymm10,%ymm7 - vpaddq 224-192(%rbx),%ymm7,%ymm7 - vpmuludq 224-128(%r9),%ymm10,%ymm8 - vpbroadcastq 64-128(%r15),%ymm10 - vpaddq 256-192(%rbx),%ymm8,%ymm8 -L$sqr_entry_1024: - vmovdqu %ymm0,0-192(%rbx) - vmovdqu %ymm1,32-192(%rbx) - - vpmuludq 32-128(%rsi),%ymm11,%ymm12 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq 32-128(%r9),%ymm11,%ymm14 - 
vpaddq %ymm14,%ymm3,%ymm3 - vpmuludq 64-128(%r9),%ymm11,%ymm13 - vpaddq %ymm13,%ymm4,%ymm4 - vpmuludq 96-128(%r9),%ymm11,%ymm12 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq 128-128(%r9),%ymm11,%ymm14 - vpaddq %ymm14,%ymm6,%ymm6 - vpmuludq 160-128(%r9),%ymm11,%ymm13 - vpaddq %ymm13,%ymm7,%ymm7 - vpmuludq 192-128(%r9),%ymm11,%ymm12 - vpaddq %ymm12,%ymm8,%ymm8 - vpmuludq 224-128(%r9),%ymm11,%ymm0 - vpbroadcastq 96-128(%r15),%ymm11 - vpaddq 288-192(%rbx),%ymm0,%ymm0 - - vmovdqu %ymm2,64-192(%rbx) - vmovdqu %ymm3,96-192(%rbx) - - vpmuludq 64-128(%rsi),%ymm10,%ymm13 - vpaddq %ymm13,%ymm4,%ymm4 - vpmuludq 64-128(%r9),%ymm10,%ymm12 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq 96-128(%r9),%ymm10,%ymm14 - vpaddq %ymm14,%ymm6,%ymm6 - vpmuludq 128-128(%r9),%ymm10,%ymm13 - vpaddq %ymm13,%ymm7,%ymm7 - vpmuludq 160-128(%r9),%ymm10,%ymm12 - vpaddq %ymm12,%ymm8,%ymm8 - vpmuludq 192-128(%r9),%ymm10,%ymm14 - vpaddq %ymm14,%ymm0,%ymm0 - vpmuludq 224-128(%r9),%ymm10,%ymm1 - vpbroadcastq 128-128(%r15),%ymm10 - vpaddq 320-448(%r12),%ymm1,%ymm1 - - vmovdqu %ymm4,128-192(%rbx) - vmovdqu %ymm5,160-192(%rbx) - - vpmuludq 96-128(%rsi),%ymm11,%ymm12 - vpaddq %ymm12,%ymm6,%ymm6 - vpmuludq 96-128(%r9),%ymm11,%ymm14 - vpaddq %ymm14,%ymm7,%ymm7 - vpmuludq 128-128(%r9),%ymm11,%ymm13 - vpaddq %ymm13,%ymm8,%ymm8 - vpmuludq 160-128(%r9),%ymm11,%ymm12 - vpaddq %ymm12,%ymm0,%ymm0 - vpmuludq 192-128(%r9),%ymm11,%ymm14 - vpaddq %ymm14,%ymm1,%ymm1 - vpmuludq 224-128(%r9),%ymm11,%ymm2 - vpbroadcastq 160-128(%r15),%ymm11 - vpaddq 352-448(%r12),%ymm2,%ymm2 - - vmovdqu %ymm6,192-192(%rbx) - vmovdqu %ymm7,224-192(%rbx) - - vpmuludq 128-128(%rsi),%ymm10,%ymm12 - vpaddq %ymm12,%ymm8,%ymm8 - vpmuludq 128-128(%r9),%ymm10,%ymm14 - vpaddq %ymm14,%ymm0,%ymm0 - vpmuludq 160-128(%r9),%ymm10,%ymm13 - vpaddq %ymm13,%ymm1,%ymm1 - vpmuludq 192-128(%r9),%ymm10,%ymm12 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq 224-128(%r9),%ymm10,%ymm3 - vpbroadcastq 192-128(%r15),%ymm10 - vpaddq 384-448(%r12),%ymm3,%ymm3 - - vmovdqu %ymm8,256-192(%rbx) - 
vmovdqu %ymm0,288-192(%rbx) - leaq 8(%rbx),%rbx - - vpmuludq 160-128(%rsi),%ymm11,%ymm13 - vpaddq %ymm13,%ymm1,%ymm1 - vpmuludq 160-128(%r9),%ymm11,%ymm12 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq 192-128(%r9),%ymm11,%ymm14 - vpaddq %ymm14,%ymm3,%ymm3 - vpmuludq 224-128(%r9),%ymm11,%ymm4 - vpbroadcastq 224-128(%r15),%ymm11 - vpaddq 416-448(%r12),%ymm4,%ymm4 - - vmovdqu %ymm1,320-448(%r12) - vmovdqu %ymm2,352-448(%r12) - - vpmuludq 192-128(%rsi),%ymm10,%ymm12 - vpaddq %ymm12,%ymm3,%ymm3 - vpmuludq 192-128(%r9),%ymm10,%ymm14 - vpbroadcastq 256-128(%r15),%ymm0 - vpaddq %ymm14,%ymm4,%ymm4 - vpmuludq 224-128(%r9),%ymm10,%ymm5 - vpbroadcastq 0+8-128(%r15),%ymm10 - vpaddq 448-448(%r12),%ymm5,%ymm5 - - vmovdqu %ymm3,384-448(%r12) - vmovdqu %ymm4,416-448(%r12) - leaq 8(%r15),%r15 - - vpmuludq 224-128(%rsi),%ymm11,%ymm12 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq 224-128(%r9),%ymm11,%ymm6 - vpaddq 480-448(%r12),%ymm6,%ymm6 - - vpmuludq 256-128(%rsi),%ymm0,%ymm7 - vmovdqu %ymm5,448-448(%r12) - vpaddq 512-448(%r12),%ymm7,%ymm7 - vmovdqu %ymm6,480-448(%r12) - vmovdqu %ymm7,512-448(%r12) - leaq 8(%r12),%r12 - - decl %r14d - jnz L$OOP_SQR_1024 - - vmovdqu 256(%rsp),%ymm8 - vmovdqu 288(%rsp),%ymm1 - vmovdqu 320(%rsp),%ymm2 - leaq 192(%rsp),%rbx - - vpsrlq $29,%ymm8,%ymm14 - vpand %ymm15,%ymm8,%ymm8 - vpsrlq $29,%ymm1,%ymm11 - vpand %ymm15,%ymm1,%ymm1 - - vpermq $0x93,%ymm14,%ymm14 - vpxor %ymm9,%ymm9,%ymm9 - vpermq $0x93,%ymm11,%ymm11 - - vpblendd $3,%ymm9,%ymm14,%ymm10 - vpblendd $3,%ymm14,%ymm11,%ymm14 - vpaddq %ymm10,%ymm8,%ymm8 - vpblendd $3,%ymm11,%ymm9,%ymm11 - vpaddq %ymm14,%ymm1,%ymm1 - vpaddq %ymm11,%ymm2,%ymm2 - vmovdqu %ymm1,288-192(%rbx) - vmovdqu %ymm2,320-192(%rbx) - - movq (%rsp),%rax - movq 8(%rsp),%r10 - movq 16(%rsp),%r11 - movq 24(%rsp),%r12 - vmovdqu 32(%rsp),%ymm1 - vmovdqu 64-192(%rbx),%ymm2 - vmovdqu 96-192(%rbx),%ymm3 - vmovdqu 128-192(%rbx),%ymm4 - vmovdqu 160-192(%rbx),%ymm5 - vmovdqu 192-192(%rbx),%ymm6 - vmovdqu 224-192(%rbx),%ymm7 - - movq %rax,%r9 - imull 
%ecx,%eax - andl $0x1fffffff,%eax - vmovd %eax,%xmm12 - - movq %rax,%rdx - imulq -128(%r13),%rax - vpbroadcastq %xmm12,%ymm12 - addq %rax,%r9 - movq %rdx,%rax - imulq 8-128(%r13),%rax - shrq $29,%r9 - addq %rax,%r10 - movq %rdx,%rax - imulq 16-128(%r13),%rax - addq %r9,%r10 - addq %rax,%r11 - imulq 24-128(%r13),%rdx - addq %rdx,%r12 - - movq %r10,%rax - imull %ecx,%eax - andl $0x1fffffff,%eax - - movl $9,%r14d - jmp L$OOP_REDUCE_1024 - -.p2align 5 -L$OOP_REDUCE_1024: - vmovd %eax,%xmm13 - vpbroadcastq %xmm13,%ymm13 - - vpmuludq 32-128(%r13),%ymm12,%ymm10 - movq %rax,%rdx - imulq -128(%r13),%rax - vpaddq %ymm10,%ymm1,%ymm1 - addq %rax,%r10 - vpmuludq 64-128(%r13),%ymm12,%ymm14 - movq %rdx,%rax - imulq 8-128(%r13),%rax - vpaddq %ymm14,%ymm2,%ymm2 - vpmuludq 96-128(%r13),%ymm12,%ymm11 -.byte 0x67 - addq %rax,%r11 -.byte 0x67 - movq %rdx,%rax - imulq 16-128(%r13),%rax - shrq $29,%r10 - vpaddq %ymm11,%ymm3,%ymm3 - vpmuludq 128-128(%r13),%ymm12,%ymm10 - addq %rax,%r12 - addq %r10,%r11 - vpaddq %ymm10,%ymm4,%ymm4 - vpmuludq 160-128(%r13),%ymm12,%ymm14 - movq %r11,%rax - imull %ecx,%eax - vpaddq %ymm14,%ymm5,%ymm5 - vpmuludq 192-128(%r13),%ymm12,%ymm11 - andl $0x1fffffff,%eax - vpaddq %ymm11,%ymm6,%ymm6 - vpmuludq 224-128(%r13),%ymm12,%ymm10 - vpaddq %ymm10,%ymm7,%ymm7 - vpmuludq 256-128(%r13),%ymm12,%ymm14 - vmovd %eax,%xmm12 - - vpaddq %ymm14,%ymm8,%ymm8 - - vpbroadcastq %xmm12,%ymm12 - - vpmuludq 32-8-128(%r13),%ymm13,%ymm11 - vmovdqu 96-8-128(%r13),%ymm14 - movq %rax,%rdx - imulq -128(%r13),%rax - vpaddq %ymm11,%ymm1,%ymm1 - vpmuludq 64-8-128(%r13),%ymm13,%ymm10 - vmovdqu 128-8-128(%r13),%ymm11 - addq %rax,%r11 - movq %rdx,%rax - imulq 8-128(%r13),%rax - vpaddq %ymm10,%ymm2,%ymm2 - addq %r12,%rax - shrq $29,%r11 - vpmuludq %ymm13,%ymm14,%ymm14 - vmovdqu 160-8-128(%r13),%ymm10 - addq %r11,%rax - vpaddq %ymm14,%ymm3,%ymm3 - vpmuludq %ymm13,%ymm11,%ymm11 - vmovdqu 192-8-128(%r13),%ymm14 -.byte 0x67 - movq %rax,%r12 - imull %ecx,%eax - vpaddq %ymm11,%ymm4,%ymm4 - vpmuludq 
%ymm13,%ymm10,%ymm10 -.byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00 - andl $0x1fffffff,%eax - vpaddq %ymm10,%ymm5,%ymm5 - vpmuludq %ymm13,%ymm14,%ymm14 - vmovdqu 256-8-128(%r13),%ymm10 - vpaddq %ymm14,%ymm6,%ymm6 - vpmuludq %ymm13,%ymm11,%ymm11 - vmovdqu 288-8-128(%r13),%ymm9 - vmovd %eax,%xmm0 - imulq -128(%r13),%rax - vpaddq %ymm11,%ymm7,%ymm7 - vpmuludq %ymm13,%ymm10,%ymm10 - vmovdqu 32-16-128(%r13),%ymm14 - vpbroadcastq %xmm0,%ymm0 - vpaddq %ymm10,%ymm8,%ymm8 - vpmuludq %ymm13,%ymm9,%ymm9 - vmovdqu 64-16-128(%r13),%ymm11 - addq %rax,%r12 - - vmovdqu 32-24-128(%r13),%ymm13 - vpmuludq %ymm12,%ymm14,%ymm14 - vmovdqu 96-16-128(%r13),%ymm10 - vpaddq %ymm14,%ymm1,%ymm1 - vpmuludq %ymm0,%ymm13,%ymm13 - vpmuludq %ymm12,%ymm11,%ymm11 -.byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff - vpaddq %ymm1,%ymm13,%ymm13 - vpaddq %ymm11,%ymm2,%ymm2 - vpmuludq %ymm12,%ymm10,%ymm10 - vmovdqu 160-16-128(%r13),%ymm11 -.byte 0x67 - vmovq %xmm13,%rax - vmovdqu %ymm13,(%rsp) - vpaddq %ymm10,%ymm3,%ymm3 - vpmuludq %ymm12,%ymm14,%ymm14 - vmovdqu 192-16-128(%r13),%ymm10 - vpaddq %ymm14,%ymm4,%ymm4 - vpmuludq %ymm12,%ymm11,%ymm11 - vmovdqu 224-16-128(%r13),%ymm14 - vpaddq %ymm11,%ymm5,%ymm5 - vpmuludq %ymm12,%ymm10,%ymm10 - vmovdqu 256-16-128(%r13),%ymm11 - vpaddq %ymm10,%ymm6,%ymm6 - vpmuludq %ymm12,%ymm14,%ymm14 - shrq $29,%r12 - vmovdqu 288-16-128(%r13),%ymm10 - addq %r12,%rax - vpaddq %ymm14,%ymm7,%ymm7 - vpmuludq %ymm12,%ymm11,%ymm11 - - movq %rax,%r9 - imull %ecx,%eax - vpaddq %ymm11,%ymm8,%ymm8 - vpmuludq %ymm12,%ymm10,%ymm10 - andl $0x1fffffff,%eax - vmovd %eax,%xmm12 - vmovdqu 96-24-128(%r13),%ymm11 -.byte 0x67 - vpaddq %ymm10,%ymm9,%ymm9 - vpbroadcastq %xmm12,%ymm12 - - vpmuludq 64-24-128(%r13),%ymm0,%ymm14 - vmovdqu 128-24-128(%r13),%ymm10 - movq %rax,%rdx - imulq -128(%r13),%rax - movq 8(%rsp),%r10 - vpaddq %ymm14,%ymm2,%ymm1 - vpmuludq %ymm0,%ymm11,%ymm11 - vmovdqu 160-24-128(%r13),%ymm14 - addq %rax,%r9 - movq %rdx,%rax - imulq 8-128(%r13),%rax -.byte 0x67 - shrq 
$29,%r9 - movq 16(%rsp),%r11 - vpaddq %ymm11,%ymm3,%ymm2 - vpmuludq %ymm0,%ymm10,%ymm10 - vmovdqu 192-24-128(%r13),%ymm11 - addq %rax,%r10 - movq %rdx,%rax - imulq 16-128(%r13),%rax - vpaddq %ymm10,%ymm4,%ymm3 - vpmuludq %ymm0,%ymm14,%ymm14 - vmovdqu 224-24-128(%r13),%ymm10 - imulq 24-128(%r13),%rdx - addq %rax,%r11 - leaq (%r9,%r10,1),%rax - vpaddq %ymm14,%ymm5,%ymm4 - vpmuludq %ymm0,%ymm11,%ymm11 - vmovdqu 256-24-128(%r13),%ymm14 - movq %rax,%r10 - imull %ecx,%eax - vpmuludq %ymm0,%ymm10,%ymm10 - vpaddq %ymm11,%ymm6,%ymm5 - vmovdqu 288-24-128(%r13),%ymm11 - andl $0x1fffffff,%eax - vpaddq %ymm10,%ymm7,%ymm6 - vpmuludq %ymm0,%ymm14,%ymm14 - addq 24(%rsp),%rdx - vpaddq %ymm14,%ymm8,%ymm7 - vpmuludq %ymm0,%ymm11,%ymm11 - vpaddq %ymm11,%ymm9,%ymm8 - vmovq %r12,%xmm9 - movq %rdx,%r12 - - decl %r14d - jnz L$OOP_REDUCE_1024 - leaq 448(%rsp),%r12 - vpaddq %ymm9,%ymm13,%ymm0 - vpxor %ymm9,%ymm9,%ymm9 - - vpaddq 288-192(%rbx),%ymm0,%ymm0 - vpaddq 320-448(%r12),%ymm1,%ymm1 - vpaddq 352-448(%r12),%ymm2,%ymm2 - vpaddq 384-448(%r12),%ymm3,%ymm3 - vpaddq 416-448(%r12),%ymm4,%ymm4 - vpaddq 448-448(%r12),%ymm5,%ymm5 - vpaddq 480-448(%r12),%ymm6,%ymm6 - vpaddq 512-448(%r12),%ymm7,%ymm7 - vpaddq 544-448(%r12),%ymm8,%ymm8 - - vpsrlq $29,%ymm0,%ymm14 - vpand %ymm15,%ymm0,%ymm0 - vpsrlq $29,%ymm1,%ymm11 - vpand %ymm15,%ymm1,%ymm1 - vpsrlq $29,%ymm2,%ymm12 - vpermq $0x93,%ymm14,%ymm14 - vpand %ymm15,%ymm2,%ymm2 - vpsrlq $29,%ymm3,%ymm13 - vpermq $0x93,%ymm11,%ymm11 - vpand %ymm15,%ymm3,%ymm3 - vpermq $0x93,%ymm12,%ymm12 - - vpblendd $3,%ymm9,%ymm14,%ymm10 - vpermq $0x93,%ymm13,%ymm13 - vpblendd $3,%ymm14,%ymm11,%ymm14 - vpaddq %ymm10,%ymm0,%ymm0 - vpblendd $3,%ymm11,%ymm12,%ymm11 - vpaddq %ymm14,%ymm1,%ymm1 - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpaddq %ymm11,%ymm2,%ymm2 - vpblendd $3,%ymm13,%ymm9,%ymm13 - vpaddq %ymm12,%ymm3,%ymm3 - vpaddq %ymm13,%ymm4,%ymm4 - - vpsrlq $29,%ymm0,%ymm14 - vpand %ymm15,%ymm0,%ymm0 - vpsrlq $29,%ymm1,%ymm11 - vpand %ymm15,%ymm1,%ymm1 - vpsrlq 
$29,%ymm2,%ymm12 - vpermq $0x93,%ymm14,%ymm14 - vpand %ymm15,%ymm2,%ymm2 - vpsrlq $29,%ymm3,%ymm13 - vpermq $0x93,%ymm11,%ymm11 - vpand %ymm15,%ymm3,%ymm3 - vpermq $0x93,%ymm12,%ymm12 - - vpblendd $3,%ymm9,%ymm14,%ymm10 - vpermq $0x93,%ymm13,%ymm13 - vpblendd $3,%ymm14,%ymm11,%ymm14 - vpaddq %ymm10,%ymm0,%ymm0 - vpblendd $3,%ymm11,%ymm12,%ymm11 - vpaddq %ymm14,%ymm1,%ymm1 - vmovdqu %ymm0,0-128(%rdi) - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpaddq %ymm11,%ymm2,%ymm2 - vmovdqu %ymm1,32-128(%rdi) - vpblendd $3,%ymm13,%ymm9,%ymm13 - vpaddq %ymm12,%ymm3,%ymm3 - vmovdqu %ymm2,64-128(%rdi) - vpaddq %ymm13,%ymm4,%ymm4 - vmovdqu %ymm3,96-128(%rdi) - vpsrlq $29,%ymm4,%ymm14 - vpand %ymm15,%ymm4,%ymm4 - vpsrlq $29,%ymm5,%ymm11 - vpand %ymm15,%ymm5,%ymm5 - vpsrlq $29,%ymm6,%ymm12 - vpermq $0x93,%ymm14,%ymm14 - vpand %ymm15,%ymm6,%ymm6 - vpsrlq $29,%ymm7,%ymm13 - vpermq $0x93,%ymm11,%ymm11 - vpand %ymm15,%ymm7,%ymm7 - vpsrlq $29,%ymm8,%ymm0 - vpermq $0x93,%ymm12,%ymm12 - vpand %ymm15,%ymm8,%ymm8 - vpermq $0x93,%ymm13,%ymm13 - - vpblendd $3,%ymm9,%ymm14,%ymm10 - vpermq $0x93,%ymm0,%ymm0 - vpblendd $3,%ymm14,%ymm11,%ymm14 - vpaddq %ymm10,%ymm4,%ymm4 - vpblendd $3,%ymm11,%ymm12,%ymm11 - vpaddq %ymm14,%ymm5,%ymm5 - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpaddq %ymm11,%ymm6,%ymm6 - vpblendd $3,%ymm13,%ymm0,%ymm13 - vpaddq %ymm12,%ymm7,%ymm7 - vpaddq %ymm13,%ymm8,%ymm8 - - vpsrlq $29,%ymm4,%ymm14 - vpand %ymm15,%ymm4,%ymm4 - vpsrlq $29,%ymm5,%ymm11 - vpand %ymm15,%ymm5,%ymm5 - vpsrlq $29,%ymm6,%ymm12 - vpermq $0x93,%ymm14,%ymm14 - vpand %ymm15,%ymm6,%ymm6 - vpsrlq $29,%ymm7,%ymm13 - vpermq $0x93,%ymm11,%ymm11 - vpand %ymm15,%ymm7,%ymm7 - vpsrlq $29,%ymm8,%ymm0 - vpermq $0x93,%ymm12,%ymm12 - vpand %ymm15,%ymm8,%ymm8 - vpermq $0x93,%ymm13,%ymm13 - - vpblendd $3,%ymm9,%ymm14,%ymm10 - vpermq $0x93,%ymm0,%ymm0 - vpblendd $3,%ymm14,%ymm11,%ymm14 - vpaddq %ymm10,%ymm4,%ymm4 - vpblendd $3,%ymm11,%ymm12,%ymm11 - vpaddq %ymm14,%ymm5,%ymm5 - vmovdqu %ymm4,128-128(%rdi) - vpblendd 
$3,%ymm12,%ymm13,%ymm12 - vpaddq %ymm11,%ymm6,%ymm6 - vmovdqu %ymm5,160-128(%rdi) - vpblendd $3,%ymm13,%ymm0,%ymm13 - vpaddq %ymm12,%ymm7,%ymm7 - vmovdqu %ymm6,192-128(%rdi) - vpaddq %ymm13,%ymm8,%ymm8 - vmovdqu %ymm7,224-128(%rdi) - vmovdqu %ymm8,256-128(%rdi) - - movq %rdi,%rsi - decl %r8d - jne L$OOP_GRANDE_SQR_1024 - - vzeroall - movq %rbp,%rax - - movq -48(%rax),%r15 - - movq -40(%rax),%r14 - - movq -32(%rax),%r13 - - movq -24(%rax),%r12 - - movq -16(%rax),%rbp - - movq -8(%rax),%rbx - - leaq (%rax),%rsp - -L$sqr_1024_epilogue: - .byte 0xf3,0xc3 - - -.globl _rsaz_1024_mul_avx2 -.private_extern _rsaz_1024_mul_avx2 - -.p2align 6 -_rsaz_1024_mul_avx2: - - leaq (%rsp),%rax - - pushq %rbx - - pushq %rbp - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - movq %rax,%rbp - - vzeroall - movq %rdx,%r13 - subq $64,%rsp - - - - - - -.byte 0x67,0x67 - movq %rsi,%r15 - andq $4095,%r15 - addq $320,%r15 - shrq $12,%r15 - movq %rsi,%r15 - cmovnzq %r13,%rsi - cmovnzq %r15,%r13 - - movq %rcx,%r15 - subq $-128,%rsi - subq $-128,%rcx - subq $-128,%rdi - - andq $4095,%r15 - addq $320,%r15 -.byte 0x67,0x67 - shrq $12,%r15 - jz L$mul_1024_no_n_copy - - - - - - subq $320,%rsp - vmovdqu 0-128(%rcx),%ymm0 - andq $-512,%rsp - vmovdqu 32-128(%rcx),%ymm1 - vmovdqu 64-128(%rcx),%ymm2 - vmovdqu 96-128(%rcx),%ymm3 - vmovdqu 128-128(%rcx),%ymm4 - vmovdqu 160-128(%rcx),%ymm5 - vmovdqu 192-128(%rcx),%ymm6 - vmovdqu 224-128(%rcx),%ymm7 - vmovdqu 256-128(%rcx),%ymm8 - leaq 64+128(%rsp),%rcx - vmovdqu %ymm0,0-128(%rcx) - vpxor %ymm0,%ymm0,%ymm0 - vmovdqu %ymm1,32-128(%rcx) - vpxor %ymm1,%ymm1,%ymm1 - vmovdqu %ymm2,64-128(%rcx) - vpxor %ymm2,%ymm2,%ymm2 - vmovdqu %ymm3,96-128(%rcx) - vpxor %ymm3,%ymm3,%ymm3 - vmovdqu %ymm4,128-128(%rcx) - vpxor %ymm4,%ymm4,%ymm4 - vmovdqu %ymm5,160-128(%rcx) - vpxor %ymm5,%ymm5,%ymm5 - vmovdqu %ymm6,192-128(%rcx) - vpxor %ymm6,%ymm6,%ymm6 - vmovdqu %ymm7,224-128(%rcx) - vpxor %ymm7,%ymm7,%ymm7 - vmovdqu %ymm8,256-128(%rcx) - vmovdqa %ymm0,%ymm8 - 
vmovdqu %ymm9,288-128(%rcx) -L$mul_1024_no_n_copy: - andq $-64,%rsp - - movq (%r13),%rbx - vpbroadcastq (%r13),%ymm10 - vmovdqu %ymm0,(%rsp) - xorq %r9,%r9 -.byte 0x67 - xorq %r10,%r10 - xorq %r11,%r11 - xorq %r12,%r12 - - vmovdqu L$and_mask(%rip),%ymm15 - movl $9,%r14d - vmovdqu %ymm9,288-128(%rdi) - jmp L$oop_mul_1024 - -.p2align 5 -L$oop_mul_1024: - vpsrlq $29,%ymm3,%ymm9 - movq %rbx,%rax - imulq -128(%rsi),%rax - addq %r9,%rax - movq %rbx,%r10 - imulq 8-128(%rsi),%r10 - addq 8(%rsp),%r10 - - movq %rax,%r9 - imull %r8d,%eax - andl $0x1fffffff,%eax - - movq %rbx,%r11 - imulq 16-128(%rsi),%r11 - addq 16(%rsp),%r11 - - movq %rbx,%r12 - imulq 24-128(%rsi),%r12 - addq 24(%rsp),%r12 - vpmuludq 32-128(%rsi),%ymm10,%ymm0 - vmovd %eax,%xmm11 - vpaddq %ymm0,%ymm1,%ymm1 - vpmuludq 64-128(%rsi),%ymm10,%ymm12 - vpbroadcastq %xmm11,%ymm11 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq 96-128(%rsi),%ymm10,%ymm13 - vpand %ymm15,%ymm3,%ymm3 - vpaddq %ymm13,%ymm3,%ymm3 - vpmuludq 128-128(%rsi),%ymm10,%ymm0 - vpaddq %ymm0,%ymm4,%ymm4 - vpmuludq 160-128(%rsi),%ymm10,%ymm12 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq 192-128(%rsi),%ymm10,%ymm13 - vpaddq %ymm13,%ymm6,%ymm6 - vpmuludq 224-128(%rsi),%ymm10,%ymm0 - vpermq $0x93,%ymm9,%ymm9 - vpaddq %ymm0,%ymm7,%ymm7 - vpmuludq 256-128(%rsi),%ymm10,%ymm12 - vpbroadcastq 8(%r13),%ymm10 - vpaddq %ymm12,%ymm8,%ymm8 - - movq %rax,%rdx - imulq -128(%rcx),%rax - addq %rax,%r9 - movq %rdx,%rax - imulq 8-128(%rcx),%rax - addq %rax,%r10 - movq %rdx,%rax - imulq 16-128(%rcx),%rax - addq %rax,%r11 - shrq $29,%r9 - imulq 24-128(%rcx),%rdx - addq %rdx,%r12 - addq %r9,%r10 - - vpmuludq 32-128(%rcx),%ymm11,%ymm13 - vmovq %xmm10,%rbx - vpaddq %ymm13,%ymm1,%ymm1 - vpmuludq 64-128(%rcx),%ymm11,%ymm0 - vpaddq %ymm0,%ymm2,%ymm2 - vpmuludq 96-128(%rcx),%ymm11,%ymm12 - vpaddq %ymm12,%ymm3,%ymm3 - vpmuludq 128-128(%rcx),%ymm11,%ymm13 - vpaddq %ymm13,%ymm4,%ymm4 - vpmuludq 160-128(%rcx),%ymm11,%ymm0 - vpaddq %ymm0,%ymm5,%ymm5 - vpmuludq 192-128(%rcx),%ymm11,%ymm12 - vpaddq 
%ymm12,%ymm6,%ymm6 - vpmuludq 224-128(%rcx),%ymm11,%ymm13 - vpblendd $3,%ymm14,%ymm9,%ymm12 - vpaddq %ymm13,%ymm7,%ymm7 - vpmuludq 256-128(%rcx),%ymm11,%ymm0 - vpaddq %ymm12,%ymm3,%ymm3 - vpaddq %ymm0,%ymm8,%ymm8 - - movq %rbx,%rax - imulq -128(%rsi),%rax - addq %rax,%r10 - vmovdqu -8+32-128(%rsi),%ymm12 - movq %rbx,%rax - imulq 8-128(%rsi),%rax - addq %rax,%r11 - vmovdqu -8+64-128(%rsi),%ymm13 - - movq %r10,%rax - vpblendd $0xfc,%ymm14,%ymm9,%ymm9 - imull %r8d,%eax - vpaddq %ymm9,%ymm4,%ymm4 - andl $0x1fffffff,%eax - - imulq 16-128(%rsi),%rbx - addq %rbx,%r12 - vpmuludq %ymm10,%ymm12,%ymm12 - vmovd %eax,%xmm11 - vmovdqu -8+96-128(%rsi),%ymm0 - vpaddq %ymm12,%ymm1,%ymm1 - vpmuludq %ymm10,%ymm13,%ymm13 - vpbroadcastq %xmm11,%ymm11 - vmovdqu -8+128-128(%rsi),%ymm12 - vpaddq %ymm13,%ymm2,%ymm2 - vpmuludq %ymm10,%ymm0,%ymm0 - vmovdqu -8+160-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm3,%ymm3 - vpmuludq %ymm10,%ymm12,%ymm12 - vmovdqu -8+192-128(%rsi),%ymm0 - vpaddq %ymm12,%ymm4,%ymm4 - vpmuludq %ymm10,%ymm13,%ymm13 - vmovdqu -8+224-128(%rsi),%ymm12 - vpaddq %ymm13,%ymm5,%ymm5 - vpmuludq %ymm10,%ymm0,%ymm0 - vmovdqu -8+256-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm6,%ymm6 - vpmuludq %ymm10,%ymm12,%ymm12 - vmovdqu -8+288-128(%rsi),%ymm9 - vpaddq %ymm12,%ymm7,%ymm7 - vpmuludq %ymm10,%ymm13,%ymm13 - vpaddq %ymm13,%ymm8,%ymm8 - vpmuludq %ymm10,%ymm9,%ymm9 - vpbroadcastq 16(%r13),%ymm10 - - movq %rax,%rdx - imulq -128(%rcx),%rax - addq %rax,%r10 - vmovdqu -8+32-128(%rcx),%ymm0 - movq %rdx,%rax - imulq 8-128(%rcx),%rax - addq %rax,%r11 - vmovdqu -8+64-128(%rcx),%ymm12 - shrq $29,%r10 - imulq 16-128(%rcx),%rdx - addq %rdx,%r12 - addq %r10,%r11 - - vpmuludq %ymm11,%ymm0,%ymm0 - vmovq %xmm10,%rbx - vmovdqu -8+96-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm1,%ymm1 - vpmuludq %ymm11,%ymm12,%ymm12 - vmovdqu -8+128-128(%rcx),%ymm0 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovdqu -8+160-128(%rcx),%ymm12 - vpaddq %ymm13,%ymm3,%ymm3 - vpmuludq %ymm11,%ymm0,%ymm0 - vmovdqu 
-8+192-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm4,%ymm4 - vpmuludq %ymm11,%ymm12,%ymm12 - vmovdqu -8+224-128(%rcx),%ymm0 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovdqu -8+256-128(%rcx),%ymm12 - vpaddq %ymm13,%ymm6,%ymm6 - vpmuludq %ymm11,%ymm0,%ymm0 - vmovdqu -8+288-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm7,%ymm7 - vpmuludq %ymm11,%ymm12,%ymm12 - vpaddq %ymm12,%ymm8,%ymm8 - vpmuludq %ymm11,%ymm13,%ymm13 - vpaddq %ymm13,%ymm9,%ymm9 - - vmovdqu -16+32-128(%rsi),%ymm0 - movq %rbx,%rax - imulq -128(%rsi),%rax - addq %r11,%rax - - vmovdqu -16+64-128(%rsi),%ymm12 - movq %rax,%r11 - imull %r8d,%eax - andl $0x1fffffff,%eax - - imulq 8-128(%rsi),%rbx - addq %rbx,%r12 - vpmuludq %ymm10,%ymm0,%ymm0 - vmovd %eax,%xmm11 - vmovdqu -16+96-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm1,%ymm1 - vpmuludq %ymm10,%ymm12,%ymm12 - vpbroadcastq %xmm11,%ymm11 - vmovdqu -16+128-128(%rsi),%ymm0 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq %ymm10,%ymm13,%ymm13 - vmovdqu -16+160-128(%rsi),%ymm12 - vpaddq %ymm13,%ymm3,%ymm3 - vpmuludq %ymm10,%ymm0,%ymm0 - vmovdqu -16+192-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm4,%ymm4 - vpmuludq %ymm10,%ymm12,%ymm12 - vmovdqu -16+224-128(%rsi),%ymm0 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq %ymm10,%ymm13,%ymm13 - vmovdqu -16+256-128(%rsi),%ymm12 - vpaddq %ymm13,%ymm6,%ymm6 - vpmuludq %ymm10,%ymm0,%ymm0 - vmovdqu -16+288-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm7,%ymm7 - vpmuludq %ymm10,%ymm12,%ymm12 - vpaddq %ymm12,%ymm8,%ymm8 - vpmuludq %ymm10,%ymm13,%ymm13 - vpbroadcastq 24(%r13),%ymm10 - vpaddq %ymm13,%ymm9,%ymm9 - - vmovdqu -16+32-128(%rcx),%ymm0 - movq %rax,%rdx - imulq -128(%rcx),%rax - addq %rax,%r11 - vmovdqu -16+64-128(%rcx),%ymm12 - imulq 8-128(%rcx),%rdx - addq %rdx,%r12 - shrq $29,%r11 - - vpmuludq %ymm11,%ymm0,%ymm0 - vmovq %xmm10,%rbx - vmovdqu -16+96-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm1,%ymm1 - vpmuludq %ymm11,%ymm12,%ymm12 - vmovdqu -16+128-128(%rcx),%ymm0 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovdqu -16+160-128(%rcx),%ymm12 - 
vpaddq %ymm13,%ymm3,%ymm3 - vpmuludq %ymm11,%ymm0,%ymm0 - vmovdqu -16+192-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm4,%ymm4 - vpmuludq %ymm11,%ymm12,%ymm12 - vmovdqu -16+224-128(%rcx),%ymm0 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovdqu -16+256-128(%rcx),%ymm12 - vpaddq %ymm13,%ymm6,%ymm6 - vpmuludq %ymm11,%ymm0,%ymm0 - vmovdqu -16+288-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm7,%ymm7 - vpmuludq %ymm11,%ymm12,%ymm12 - vmovdqu -24+32-128(%rsi),%ymm0 - vpaddq %ymm12,%ymm8,%ymm8 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovdqu -24+64-128(%rsi),%ymm12 - vpaddq %ymm13,%ymm9,%ymm9 - - addq %r11,%r12 - imulq -128(%rsi),%rbx - addq %rbx,%r12 - - movq %r12,%rax - imull %r8d,%eax - andl $0x1fffffff,%eax - - vpmuludq %ymm10,%ymm0,%ymm0 - vmovd %eax,%xmm11 - vmovdqu -24+96-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm1,%ymm1 - vpmuludq %ymm10,%ymm12,%ymm12 - vpbroadcastq %xmm11,%ymm11 - vmovdqu -24+128-128(%rsi),%ymm0 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq %ymm10,%ymm13,%ymm13 - vmovdqu -24+160-128(%rsi),%ymm12 - vpaddq %ymm13,%ymm3,%ymm3 - vpmuludq %ymm10,%ymm0,%ymm0 - vmovdqu -24+192-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm4,%ymm4 - vpmuludq %ymm10,%ymm12,%ymm12 - vmovdqu -24+224-128(%rsi),%ymm0 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq %ymm10,%ymm13,%ymm13 - vmovdqu -24+256-128(%rsi),%ymm12 - vpaddq %ymm13,%ymm6,%ymm6 - vpmuludq %ymm10,%ymm0,%ymm0 - vmovdqu -24+288-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm7,%ymm7 - vpmuludq %ymm10,%ymm12,%ymm12 - vpaddq %ymm12,%ymm8,%ymm8 - vpmuludq %ymm10,%ymm13,%ymm13 - vpbroadcastq 32(%r13),%ymm10 - vpaddq %ymm13,%ymm9,%ymm9 - addq $32,%r13 - - vmovdqu -24+32-128(%rcx),%ymm0 - imulq -128(%rcx),%rax - addq %rax,%r12 - shrq $29,%r12 - - vmovdqu -24+64-128(%rcx),%ymm12 - vpmuludq %ymm11,%ymm0,%ymm0 - vmovq %xmm10,%rbx - vmovdqu -24+96-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm1,%ymm0 - vpmuludq %ymm11,%ymm12,%ymm12 - vmovdqu %ymm0,(%rsp) - vpaddq %ymm12,%ymm2,%ymm1 - vmovdqu -24+128-128(%rcx),%ymm0 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovdqu 
-24+160-128(%rcx),%ymm12 - vpaddq %ymm13,%ymm3,%ymm2 - vpmuludq %ymm11,%ymm0,%ymm0 - vmovdqu -24+192-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm4,%ymm3 - vpmuludq %ymm11,%ymm12,%ymm12 - vmovdqu -24+224-128(%rcx),%ymm0 - vpaddq %ymm12,%ymm5,%ymm4 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovdqu -24+256-128(%rcx),%ymm12 - vpaddq %ymm13,%ymm6,%ymm5 - vpmuludq %ymm11,%ymm0,%ymm0 - vmovdqu -24+288-128(%rcx),%ymm13 - movq %r12,%r9 - vpaddq %ymm0,%ymm7,%ymm6 - vpmuludq %ymm11,%ymm12,%ymm12 - addq (%rsp),%r9 - vpaddq %ymm12,%ymm8,%ymm7 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovq %r12,%xmm12 - vpaddq %ymm13,%ymm9,%ymm8 - - decl %r14d - jnz L$oop_mul_1024 - vpaddq (%rsp),%ymm12,%ymm0 - - vpsrlq $29,%ymm0,%ymm12 - vpand %ymm15,%ymm0,%ymm0 - vpsrlq $29,%ymm1,%ymm13 - vpand %ymm15,%ymm1,%ymm1 - vpsrlq $29,%ymm2,%ymm10 - vpermq $0x93,%ymm12,%ymm12 - vpand %ymm15,%ymm2,%ymm2 - vpsrlq $29,%ymm3,%ymm11 - vpermq $0x93,%ymm13,%ymm13 - vpand %ymm15,%ymm3,%ymm3 - - vpblendd $3,%ymm14,%ymm12,%ymm9 - vpermq $0x93,%ymm10,%ymm10 - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpermq $0x93,%ymm11,%ymm11 - vpaddq %ymm9,%ymm0,%ymm0 - vpblendd $3,%ymm13,%ymm10,%ymm13 - vpaddq %ymm12,%ymm1,%ymm1 - vpblendd $3,%ymm10,%ymm11,%ymm10 - vpaddq %ymm13,%ymm2,%ymm2 - vpblendd $3,%ymm11,%ymm14,%ymm11 - vpaddq %ymm10,%ymm3,%ymm3 - vpaddq %ymm11,%ymm4,%ymm4 - - vpsrlq $29,%ymm0,%ymm12 - vpand %ymm15,%ymm0,%ymm0 - vpsrlq $29,%ymm1,%ymm13 - vpand %ymm15,%ymm1,%ymm1 - vpsrlq $29,%ymm2,%ymm10 - vpermq $0x93,%ymm12,%ymm12 - vpand %ymm15,%ymm2,%ymm2 - vpsrlq $29,%ymm3,%ymm11 - vpermq $0x93,%ymm13,%ymm13 - vpand %ymm15,%ymm3,%ymm3 - vpermq $0x93,%ymm10,%ymm10 - - vpblendd $3,%ymm14,%ymm12,%ymm9 - vpermq $0x93,%ymm11,%ymm11 - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpaddq %ymm9,%ymm0,%ymm0 - vpblendd $3,%ymm13,%ymm10,%ymm13 - vpaddq %ymm12,%ymm1,%ymm1 - vpblendd $3,%ymm10,%ymm11,%ymm10 - vpaddq %ymm13,%ymm2,%ymm2 - vpblendd $3,%ymm11,%ymm14,%ymm11 - vpaddq %ymm10,%ymm3,%ymm3 - vpaddq %ymm11,%ymm4,%ymm4 - - vmovdqu %ymm0,0-128(%rdi) - vmovdqu 
%ymm1,32-128(%rdi) - vmovdqu %ymm2,64-128(%rdi) - vmovdqu %ymm3,96-128(%rdi) - vpsrlq $29,%ymm4,%ymm12 - vpand %ymm15,%ymm4,%ymm4 - vpsrlq $29,%ymm5,%ymm13 - vpand %ymm15,%ymm5,%ymm5 - vpsrlq $29,%ymm6,%ymm10 - vpermq $0x93,%ymm12,%ymm12 - vpand %ymm15,%ymm6,%ymm6 - vpsrlq $29,%ymm7,%ymm11 - vpermq $0x93,%ymm13,%ymm13 - vpand %ymm15,%ymm7,%ymm7 - vpsrlq $29,%ymm8,%ymm0 - vpermq $0x93,%ymm10,%ymm10 - vpand %ymm15,%ymm8,%ymm8 - vpermq $0x93,%ymm11,%ymm11 - - vpblendd $3,%ymm14,%ymm12,%ymm9 - vpermq $0x93,%ymm0,%ymm0 - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpaddq %ymm9,%ymm4,%ymm4 - vpblendd $3,%ymm13,%ymm10,%ymm13 - vpaddq %ymm12,%ymm5,%ymm5 - vpblendd $3,%ymm10,%ymm11,%ymm10 - vpaddq %ymm13,%ymm6,%ymm6 - vpblendd $3,%ymm11,%ymm0,%ymm11 - vpaddq %ymm10,%ymm7,%ymm7 - vpaddq %ymm11,%ymm8,%ymm8 - - vpsrlq $29,%ymm4,%ymm12 - vpand %ymm15,%ymm4,%ymm4 - vpsrlq $29,%ymm5,%ymm13 - vpand %ymm15,%ymm5,%ymm5 - vpsrlq $29,%ymm6,%ymm10 - vpermq $0x93,%ymm12,%ymm12 - vpand %ymm15,%ymm6,%ymm6 - vpsrlq $29,%ymm7,%ymm11 - vpermq $0x93,%ymm13,%ymm13 - vpand %ymm15,%ymm7,%ymm7 - vpsrlq $29,%ymm8,%ymm0 - vpermq $0x93,%ymm10,%ymm10 - vpand %ymm15,%ymm8,%ymm8 - vpermq $0x93,%ymm11,%ymm11 - - vpblendd $3,%ymm14,%ymm12,%ymm9 - vpermq $0x93,%ymm0,%ymm0 - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpaddq %ymm9,%ymm4,%ymm4 - vpblendd $3,%ymm13,%ymm10,%ymm13 - vpaddq %ymm12,%ymm5,%ymm5 - vpblendd $3,%ymm10,%ymm11,%ymm10 - vpaddq %ymm13,%ymm6,%ymm6 - vpblendd $3,%ymm11,%ymm0,%ymm11 - vpaddq %ymm10,%ymm7,%ymm7 - vpaddq %ymm11,%ymm8,%ymm8 - - vmovdqu %ymm4,128-128(%rdi) - vmovdqu %ymm5,160-128(%rdi) - vmovdqu %ymm6,192-128(%rdi) - vmovdqu %ymm7,224-128(%rdi) - vmovdqu %ymm8,256-128(%rdi) - vzeroupper - - movq %rbp,%rax - - movq -48(%rax),%r15 - - movq -40(%rax),%r14 - - movq -32(%rax),%r13 - - movq -24(%rax),%r12 - - movq -16(%rax),%rbp - - movq -8(%rax),%rbx - - leaq (%rax),%rsp - -L$mul_1024_epilogue: - .byte 0xf3,0xc3 - - -.globl _rsaz_1024_red2norm_avx2 -.private_extern _rsaz_1024_red2norm_avx2 - 
-.p2align 5 -_rsaz_1024_red2norm_avx2: - - subq $-128,%rsi - xorq %rax,%rax - movq -128(%rsi),%r8 - movq -120(%rsi),%r9 - movq -112(%rsi),%r10 - shlq $0,%r8 - shlq $29,%r9 - movq %r10,%r11 - shlq $58,%r10 - shrq $6,%r11 - addq %r8,%rax - addq %r9,%rax - addq %r10,%rax - adcq $0,%r11 - movq %rax,0(%rdi) - movq %r11,%rax - movq -104(%rsi),%r8 - movq -96(%rsi),%r9 - shlq $23,%r8 - movq %r9,%r10 - shlq $52,%r9 - shrq $12,%r10 - addq %r8,%rax - addq %r9,%rax - adcq $0,%r10 - movq %rax,8(%rdi) - movq %r10,%rax - movq -88(%rsi),%r11 - movq -80(%rsi),%r8 - shlq $17,%r11 - movq %r8,%r9 - shlq $46,%r8 - shrq $18,%r9 - addq %r11,%rax - addq %r8,%rax - adcq $0,%r9 - movq %rax,16(%rdi) - movq %r9,%rax - movq -72(%rsi),%r10 - movq -64(%rsi),%r11 - shlq $11,%r10 - movq %r11,%r8 - shlq $40,%r11 - shrq $24,%r8 - addq %r10,%rax - addq %r11,%rax - adcq $0,%r8 - movq %rax,24(%rdi) - movq %r8,%rax - movq -56(%rsi),%r9 - movq -48(%rsi),%r10 - movq -40(%rsi),%r11 - shlq $5,%r9 - shlq $34,%r10 - movq %r11,%r8 - shlq $63,%r11 - shrq $1,%r8 - addq %r9,%rax - addq %r10,%rax - addq %r11,%rax - adcq $0,%r8 - movq %rax,32(%rdi) - movq %r8,%rax - movq -32(%rsi),%r9 - movq -24(%rsi),%r10 - shlq $28,%r9 - movq %r10,%r11 - shlq $57,%r10 - shrq $7,%r11 - addq %r9,%rax - addq %r10,%rax - adcq $0,%r11 - movq %rax,40(%rdi) - movq %r11,%rax - movq -16(%rsi),%r8 - movq -8(%rsi),%r9 - shlq $22,%r8 - movq %r9,%r10 - shlq $51,%r9 - shrq $13,%r10 - addq %r8,%rax - addq %r9,%rax - adcq $0,%r10 - movq %rax,48(%rdi) - movq %r10,%rax - movq 0(%rsi),%r11 - movq 8(%rsi),%r8 - shlq $16,%r11 - movq %r8,%r9 - shlq $45,%r8 - shrq $19,%r9 - addq %r11,%rax - addq %r8,%rax - adcq $0,%r9 - movq %rax,56(%rdi) - movq %r9,%rax - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - shlq $10,%r10 - movq %r11,%r8 - shlq $39,%r11 - shrq $25,%r8 - addq %r10,%rax - addq %r11,%rax - adcq $0,%r8 - movq %rax,64(%rdi) - movq %r8,%rax - movq 32(%rsi),%r9 - movq 40(%rsi),%r10 - movq 48(%rsi),%r11 - shlq $4,%r9 - shlq $33,%r10 - movq %r11,%r8 - 
shlq $62,%r11 - shrq $2,%r8 - addq %r9,%rax - addq %r10,%rax - addq %r11,%rax - adcq $0,%r8 - movq %rax,72(%rdi) - movq %r8,%rax - movq 56(%rsi),%r9 - movq 64(%rsi),%r10 - shlq $27,%r9 - movq %r10,%r11 - shlq $56,%r10 - shrq $8,%r11 - addq %r9,%rax - addq %r10,%rax - adcq $0,%r11 - movq %rax,80(%rdi) - movq %r11,%rax - movq 72(%rsi),%r8 - movq 80(%rsi),%r9 - shlq $21,%r8 - movq %r9,%r10 - shlq $50,%r9 - shrq $14,%r10 - addq %r8,%rax - addq %r9,%rax - adcq $0,%r10 - movq %rax,88(%rdi) - movq %r10,%rax - movq 88(%rsi),%r11 - movq 96(%rsi),%r8 - shlq $15,%r11 - movq %r8,%r9 - shlq $44,%r8 - shrq $20,%r9 - addq %r11,%rax - addq %r8,%rax - adcq $0,%r9 - movq %rax,96(%rdi) - movq %r9,%rax - movq 104(%rsi),%r10 - movq 112(%rsi),%r11 - shlq $9,%r10 - movq %r11,%r8 - shlq $38,%r11 - shrq $26,%r8 - addq %r10,%rax - addq %r11,%rax - adcq $0,%r8 - movq %rax,104(%rdi) - movq %r8,%rax - movq 120(%rsi),%r9 - movq 128(%rsi),%r10 - movq 136(%rsi),%r11 - shlq $3,%r9 - shlq $32,%r10 - movq %r11,%r8 - shlq $61,%r11 - shrq $3,%r8 - addq %r9,%rax - addq %r10,%rax - addq %r11,%rax - adcq $0,%r8 - movq %rax,112(%rdi) - movq %r8,%rax - movq 144(%rsi),%r9 - movq 152(%rsi),%r10 - shlq $26,%r9 - movq %r10,%r11 - shlq $55,%r10 - shrq $9,%r11 - addq %r9,%rax - addq %r10,%rax - adcq $0,%r11 - movq %rax,120(%rdi) - movq %r11,%rax - .byte 0xf3,0xc3 - - - -.globl _rsaz_1024_norm2red_avx2 -.private_extern _rsaz_1024_norm2red_avx2 - -.p2align 5 -_rsaz_1024_norm2red_avx2: - - subq $-128,%rdi - movq (%rsi),%r8 - movl $0x1fffffff,%eax - movq 8(%rsi),%r9 - movq %r8,%r11 - shrq $0,%r11 - andq %rax,%r11 - movq %r11,-128(%rdi) - movq %r8,%r10 - shrq $29,%r10 - andq %rax,%r10 - movq %r10,-120(%rdi) - shrdq $58,%r9,%r8 - andq %rax,%r8 - movq %r8,-112(%rdi) - movq 16(%rsi),%r10 - movq %r9,%r8 - shrq $23,%r8 - andq %rax,%r8 - movq %r8,-104(%rdi) - shrdq $52,%r10,%r9 - andq %rax,%r9 - movq %r9,-96(%rdi) - movq 24(%rsi),%r11 - movq %r10,%r9 - shrq $17,%r9 - andq %rax,%r9 - movq %r9,-88(%rdi) - shrdq $46,%r11,%r10 
- andq %rax,%r10 - movq %r10,-80(%rdi) - movq 32(%rsi),%r8 - movq %r11,%r10 - shrq $11,%r10 - andq %rax,%r10 - movq %r10,-72(%rdi) - shrdq $40,%r8,%r11 - andq %rax,%r11 - movq %r11,-64(%rdi) - movq 40(%rsi),%r9 - movq %r8,%r11 - shrq $5,%r11 - andq %rax,%r11 - movq %r11,-56(%rdi) - movq %r8,%r10 - shrq $34,%r10 - andq %rax,%r10 - movq %r10,-48(%rdi) - shrdq $63,%r9,%r8 - andq %rax,%r8 - movq %r8,-40(%rdi) - movq 48(%rsi),%r10 - movq %r9,%r8 - shrq $28,%r8 - andq %rax,%r8 - movq %r8,-32(%rdi) - shrdq $57,%r10,%r9 - andq %rax,%r9 - movq %r9,-24(%rdi) - movq 56(%rsi),%r11 - movq %r10,%r9 - shrq $22,%r9 - andq %rax,%r9 - movq %r9,-16(%rdi) - shrdq $51,%r11,%r10 - andq %rax,%r10 - movq %r10,-8(%rdi) - movq 64(%rsi),%r8 - movq %r11,%r10 - shrq $16,%r10 - andq %rax,%r10 - movq %r10,0(%rdi) - shrdq $45,%r8,%r11 - andq %rax,%r11 - movq %r11,8(%rdi) - movq 72(%rsi),%r9 - movq %r8,%r11 - shrq $10,%r11 - andq %rax,%r11 - movq %r11,16(%rdi) - shrdq $39,%r9,%r8 - andq %rax,%r8 - movq %r8,24(%rdi) - movq 80(%rsi),%r10 - movq %r9,%r8 - shrq $4,%r8 - andq %rax,%r8 - movq %r8,32(%rdi) - movq %r9,%r11 - shrq $33,%r11 - andq %rax,%r11 - movq %r11,40(%rdi) - shrdq $62,%r10,%r9 - andq %rax,%r9 - movq %r9,48(%rdi) - movq 88(%rsi),%r11 - movq %r10,%r9 - shrq $27,%r9 - andq %rax,%r9 - movq %r9,56(%rdi) - shrdq $56,%r11,%r10 - andq %rax,%r10 - movq %r10,64(%rdi) - movq 96(%rsi),%r8 - movq %r11,%r10 - shrq $21,%r10 - andq %rax,%r10 - movq %r10,72(%rdi) - shrdq $50,%r8,%r11 - andq %rax,%r11 - movq %r11,80(%rdi) - movq 104(%rsi),%r9 - movq %r8,%r11 - shrq $15,%r11 - andq %rax,%r11 - movq %r11,88(%rdi) - shrdq $44,%r9,%r8 - andq %rax,%r8 - movq %r8,96(%rdi) - movq 112(%rsi),%r10 - movq %r9,%r8 - shrq $9,%r8 - andq %rax,%r8 - movq %r8,104(%rdi) - shrdq $38,%r10,%r9 - andq %rax,%r9 - movq %r9,112(%rdi) - movq 120(%rsi),%r11 - movq %r10,%r9 - shrq $3,%r9 - andq %rax,%r9 - movq %r9,120(%rdi) - movq %r10,%r8 - shrq $32,%r8 - andq %rax,%r8 - movq %r8,128(%rdi) - shrdq $61,%r11,%r10 - andq %rax,%r10 - 
movq %r10,136(%rdi) - xorq %r8,%r8 - movq %r11,%r10 - shrq $26,%r10 - andq %rax,%r10 - movq %r10,144(%rdi) - shrdq $55,%r8,%r11 - andq %rax,%r11 - movq %r11,152(%rdi) - movq %r8,160(%rdi) - movq %r8,168(%rdi) - movq %r8,176(%rdi) - movq %r8,184(%rdi) - .byte 0xf3,0xc3 - - -.globl _rsaz_1024_scatter5_avx2 -.private_extern _rsaz_1024_scatter5_avx2 - -.p2align 5 -_rsaz_1024_scatter5_avx2: - - vzeroupper - vmovdqu L$scatter_permd(%rip),%ymm5 - shll $4,%edx - leaq (%rdi,%rdx,1),%rdi - movl $9,%eax - jmp L$oop_scatter_1024 - -.p2align 5 -L$oop_scatter_1024: - vmovdqu (%rsi),%ymm0 - leaq 32(%rsi),%rsi - vpermd %ymm0,%ymm5,%ymm0 - vmovdqu %xmm0,(%rdi) - leaq 512(%rdi),%rdi - decl %eax - jnz L$oop_scatter_1024 - - vzeroupper - .byte 0xf3,0xc3 - - - -.globl _rsaz_1024_gather5_avx2 -.private_extern _rsaz_1024_gather5_avx2 - -.p2align 5 -_rsaz_1024_gather5_avx2: - - vzeroupper - movq %rsp,%r11 - - leaq -256(%rsp),%rsp - andq $-32,%rsp - leaq L$inc(%rip),%r10 - leaq -128(%rsp),%rax - - vmovd %edx,%xmm4 - vmovdqa (%r10),%ymm0 - vmovdqa 32(%r10),%ymm1 - vmovdqa 64(%r10),%ymm5 - vpbroadcastd %xmm4,%ymm4 - - vpaddd %ymm5,%ymm0,%ymm2 - vpcmpeqd %ymm4,%ymm0,%ymm0 - vpaddd %ymm5,%ymm1,%ymm3 - vpcmpeqd %ymm4,%ymm1,%ymm1 - vmovdqa %ymm0,0+128(%rax) - vpaddd %ymm5,%ymm2,%ymm0 - vpcmpeqd %ymm4,%ymm2,%ymm2 - vmovdqa %ymm1,32+128(%rax) - vpaddd %ymm5,%ymm3,%ymm1 - vpcmpeqd %ymm4,%ymm3,%ymm3 - vmovdqa %ymm2,64+128(%rax) - vpaddd %ymm5,%ymm0,%ymm2 - vpcmpeqd %ymm4,%ymm0,%ymm0 - vmovdqa %ymm3,96+128(%rax) - vpaddd %ymm5,%ymm1,%ymm3 - vpcmpeqd %ymm4,%ymm1,%ymm1 - vmovdqa %ymm0,128+128(%rax) - vpaddd %ymm5,%ymm2,%ymm8 - vpcmpeqd %ymm4,%ymm2,%ymm2 - vmovdqa %ymm1,160+128(%rax) - vpaddd %ymm5,%ymm3,%ymm9 - vpcmpeqd %ymm4,%ymm3,%ymm3 - vmovdqa %ymm2,192+128(%rax) - vpaddd %ymm5,%ymm8,%ymm10 - vpcmpeqd %ymm4,%ymm8,%ymm8 - vmovdqa %ymm3,224+128(%rax) - vpaddd %ymm5,%ymm9,%ymm11 - vpcmpeqd %ymm4,%ymm9,%ymm9 - vpaddd %ymm5,%ymm10,%ymm12 - vpcmpeqd %ymm4,%ymm10,%ymm10 - vpaddd %ymm5,%ymm11,%ymm13 - 
vpcmpeqd %ymm4,%ymm11,%ymm11 - vpaddd %ymm5,%ymm12,%ymm14 - vpcmpeqd %ymm4,%ymm12,%ymm12 - vpaddd %ymm5,%ymm13,%ymm15 - vpcmpeqd %ymm4,%ymm13,%ymm13 - vpcmpeqd %ymm4,%ymm14,%ymm14 - vpcmpeqd %ymm4,%ymm15,%ymm15 - - vmovdqa -32(%r10),%ymm7 - leaq 128(%rsi),%rsi - movl $9,%edx - -L$oop_gather_1024: - vmovdqa 0-128(%rsi),%ymm0 - vmovdqa 32-128(%rsi),%ymm1 - vmovdqa 64-128(%rsi),%ymm2 - vmovdqa 96-128(%rsi),%ymm3 - vpand 0+128(%rax),%ymm0,%ymm0 - vpand 32+128(%rax),%ymm1,%ymm1 - vpand 64+128(%rax),%ymm2,%ymm2 - vpor %ymm0,%ymm1,%ymm4 - vpand 96+128(%rax),%ymm3,%ymm3 - vmovdqa 128-128(%rsi),%ymm0 - vmovdqa 160-128(%rsi),%ymm1 - vpor %ymm2,%ymm3,%ymm5 - vmovdqa 192-128(%rsi),%ymm2 - vmovdqa 224-128(%rsi),%ymm3 - vpand 128+128(%rax),%ymm0,%ymm0 - vpand 160+128(%rax),%ymm1,%ymm1 - vpand 192+128(%rax),%ymm2,%ymm2 - vpor %ymm0,%ymm4,%ymm4 - vpand 224+128(%rax),%ymm3,%ymm3 - vpand 256-128(%rsi),%ymm8,%ymm0 - vpor %ymm1,%ymm5,%ymm5 - vpand 288-128(%rsi),%ymm9,%ymm1 - vpor %ymm2,%ymm4,%ymm4 - vpand 320-128(%rsi),%ymm10,%ymm2 - vpor %ymm3,%ymm5,%ymm5 - vpand 352-128(%rsi),%ymm11,%ymm3 - vpor %ymm0,%ymm4,%ymm4 - vpand 384-128(%rsi),%ymm12,%ymm0 - vpor %ymm1,%ymm5,%ymm5 - vpand 416-128(%rsi),%ymm13,%ymm1 - vpor %ymm2,%ymm4,%ymm4 - vpand 448-128(%rsi),%ymm14,%ymm2 - vpor %ymm3,%ymm5,%ymm5 - vpand 480-128(%rsi),%ymm15,%ymm3 - leaq 512(%rsi),%rsi - vpor %ymm0,%ymm4,%ymm4 - vpor %ymm1,%ymm5,%ymm5 - vpor %ymm2,%ymm4,%ymm4 - vpor %ymm3,%ymm5,%ymm5 - - vpor %ymm5,%ymm4,%ymm4 - vextracti128 $1,%ymm4,%xmm5 - vpor %xmm4,%xmm5,%xmm5 - vpermd %ymm5,%ymm7,%ymm5 - vmovdqu %ymm5,(%rdi) - leaq 32(%rdi),%rdi - decl %edx - jnz L$oop_gather_1024 - - vpxor %ymm0,%ymm0,%ymm0 - vmovdqu %ymm0,(%rdi) - vzeroupper - leaq (%r11),%rsp - - .byte 0xf3,0xc3 - -L$SEH_end_rsaz_1024_gather5: - -.p2align 6 -L$and_mask: -.quad 0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff -L$scatter_permd: -.long 0,2,4,6,7,7,7,7 -L$gather_permd: -.long 0,7,1,7,2,7,3,7 -L$inc: -.long 0,0,0,0, 1,1,1,1 -.long 2,2,2,2, 3,3,3,3 -.long 
4,4,4,4, 4,4,4,4 -.p2align 6 -#endif diff --git a/third_party/boringssl/apple-x86_64/crypto/fipsmodule/sha1-x86_64.S b/third_party/boringssl/apple-x86_64/crypto/fipsmodule/sha1-x86_64.S deleted file mode 100644 index d50851ed..00000000 --- a/third_party/boringssl/apple-x86_64/crypto/fipsmodule/sha1-x86_64.S +++ /dev/null @@ -1,5466 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text - - -.globl _sha1_block_data_order -.private_extern _sha1_block_data_order - -.p2align 4 -_sha1_block_data_order: - - leaq _OPENSSL_ia32cap_P(%rip),%r10 - movl 0(%r10),%r9d - movl 4(%r10),%r8d - movl 8(%r10),%r10d - testl $512,%r8d - jz L$ialu - testl $536870912,%r10d - jnz _shaext_shortcut - andl $296,%r10d - cmpl $296,%r10d - je _avx2_shortcut - andl $268435456,%r8d - andl $1073741824,%r9d - orl %r9d,%r8d - cmpl $1342177280,%r8d - je _avx_shortcut - jmp _ssse3_shortcut - -.p2align 4 -L$ialu: - movq %rsp,%rax - - pushq %rbx - - pushq %rbp - - pushq %r12 - - pushq %r13 - - pushq %r14 - - movq %rdi,%r8 - subq $72,%rsp - movq %rsi,%r9 - andq $-64,%rsp - movq %rdx,%r10 - movq %rax,64(%rsp) - -L$prologue: - - movl 0(%r8),%esi - movl 4(%r8),%edi - movl 8(%r8),%r11d - movl 12(%r8),%r12d - movl 16(%r8),%r13d - jmp L$loop - -.p2align 4 -L$loop: - movl 0(%r9),%edx - bswapl %edx - movl 4(%r9),%ebp - movl %r12d,%eax - movl %edx,0(%rsp) - movl %esi,%ecx - bswapl %ebp - xorl %r11d,%eax - roll $5,%ecx - andl %edi,%eax - leal 1518500249(%rdx,%r13,1),%r13d - addl %ecx,%r13d - xorl %r12d,%eax - roll $30,%edi - addl %eax,%r13d - movl 8(%r9),%r14d - movl %r11d,%eax - movl %ebp,4(%rsp) - movl %r13d,%ecx - bswapl %r14d - xorl %edi,%eax - roll $5,%ecx - andl %esi,%eax - leal 
1518500249(%rbp,%r12,1),%r12d - addl %ecx,%r12d - xorl %r11d,%eax - roll $30,%esi - addl %eax,%r12d - movl 12(%r9),%edx - movl %edi,%eax - movl %r14d,8(%rsp) - movl %r12d,%ecx - bswapl %edx - xorl %esi,%eax - roll $5,%ecx - andl %r13d,%eax - leal 1518500249(%r14,%r11,1),%r11d - addl %ecx,%r11d - xorl %edi,%eax - roll $30,%r13d - addl %eax,%r11d - movl 16(%r9),%ebp - movl %esi,%eax - movl %edx,12(%rsp) - movl %r11d,%ecx - bswapl %ebp - xorl %r13d,%eax - roll $5,%ecx - andl %r12d,%eax - leal 1518500249(%rdx,%rdi,1),%edi - addl %ecx,%edi - xorl %esi,%eax - roll $30,%r12d - addl %eax,%edi - movl 20(%r9),%r14d - movl %r13d,%eax - movl %ebp,16(%rsp) - movl %edi,%ecx - bswapl %r14d - xorl %r12d,%eax - roll $5,%ecx - andl %r11d,%eax - leal 1518500249(%rbp,%rsi,1),%esi - addl %ecx,%esi - xorl %r13d,%eax - roll $30,%r11d - addl %eax,%esi - movl 24(%r9),%edx - movl %r12d,%eax - movl %r14d,20(%rsp) - movl %esi,%ecx - bswapl %edx - xorl %r11d,%eax - roll $5,%ecx - andl %edi,%eax - leal 1518500249(%r14,%r13,1),%r13d - addl %ecx,%r13d - xorl %r12d,%eax - roll $30,%edi - addl %eax,%r13d - movl 28(%r9),%ebp - movl %r11d,%eax - movl %edx,24(%rsp) - movl %r13d,%ecx - bswapl %ebp - xorl %edi,%eax - roll $5,%ecx - andl %esi,%eax - leal 1518500249(%rdx,%r12,1),%r12d - addl %ecx,%r12d - xorl %r11d,%eax - roll $30,%esi - addl %eax,%r12d - movl 32(%r9),%r14d - movl %edi,%eax - movl %ebp,28(%rsp) - movl %r12d,%ecx - bswapl %r14d - xorl %esi,%eax - roll $5,%ecx - andl %r13d,%eax - leal 1518500249(%rbp,%r11,1),%r11d - addl %ecx,%r11d - xorl %edi,%eax - roll $30,%r13d - addl %eax,%r11d - movl 36(%r9),%edx - movl %esi,%eax - movl %r14d,32(%rsp) - movl %r11d,%ecx - bswapl %edx - xorl %r13d,%eax - roll $5,%ecx - andl %r12d,%eax - leal 1518500249(%r14,%rdi,1),%edi - addl %ecx,%edi - xorl %esi,%eax - roll $30,%r12d - addl %eax,%edi - movl 40(%r9),%ebp - movl %r13d,%eax - movl %edx,36(%rsp) - movl %edi,%ecx - bswapl %ebp - xorl %r12d,%eax - roll $5,%ecx - andl %r11d,%eax - leal 
1518500249(%rdx,%rsi,1),%esi - addl %ecx,%esi - xorl %r13d,%eax - roll $30,%r11d - addl %eax,%esi - movl 44(%r9),%r14d - movl %r12d,%eax - movl %ebp,40(%rsp) - movl %esi,%ecx - bswapl %r14d - xorl %r11d,%eax - roll $5,%ecx - andl %edi,%eax - leal 1518500249(%rbp,%r13,1),%r13d - addl %ecx,%r13d - xorl %r12d,%eax - roll $30,%edi - addl %eax,%r13d - movl 48(%r9),%edx - movl %r11d,%eax - movl %r14d,44(%rsp) - movl %r13d,%ecx - bswapl %edx - xorl %edi,%eax - roll $5,%ecx - andl %esi,%eax - leal 1518500249(%r14,%r12,1),%r12d - addl %ecx,%r12d - xorl %r11d,%eax - roll $30,%esi - addl %eax,%r12d - movl 52(%r9),%ebp - movl %edi,%eax - movl %edx,48(%rsp) - movl %r12d,%ecx - bswapl %ebp - xorl %esi,%eax - roll $5,%ecx - andl %r13d,%eax - leal 1518500249(%rdx,%r11,1),%r11d - addl %ecx,%r11d - xorl %edi,%eax - roll $30,%r13d - addl %eax,%r11d - movl 56(%r9),%r14d - movl %esi,%eax - movl %ebp,52(%rsp) - movl %r11d,%ecx - bswapl %r14d - xorl %r13d,%eax - roll $5,%ecx - andl %r12d,%eax - leal 1518500249(%rbp,%rdi,1),%edi - addl %ecx,%edi - xorl %esi,%eax - roll $30,%r12d - addl %eax,%edi - movl 60(%r9),%edx - movl %r13d,%eax - movl %r14d,56(%rsp) - movl %edi,%ecx - bswapl %edx - xorl %r12d,%eax - roll $5,%ecx - andl %r11d,%eax - leal 1518500249(%r14,%rsi,1),%esi - addl %ecx,%esi - xorl %r13d,%eax - roll $30,%r11d - addl %eax,%esi - xorl 0(%rsp),%ebp - movl %r12d,%eax - movl %edx,60(%rsp) - movl %esi,%ecx - xorl 8(%rsp),%ebp - xorl %r11d,%eax - roll $5,%ecx - xorl 32(%rsp),%ebp - andl %edi,%eax - leal 1518500249(%rdx,%r13,1),%r13d - roll $30,%edi - xorl %r12d,%eax - addl %ecx,%r13d - roll $1,%ebp - addl %eax,%r13d - xorl 4(%rsp),%r14d - movl %r11d,%eax - movl %ebp,0(%rsp) - movl %r13d,%ecx - xorl 12(%rsp),%r14d - xorl %edi,%eax - roll $5,%ecx - xorl 36(%rsp),%r14d - andl %esi,%eax - leal 1518500249(%rbp,%r12,1),%r12d - roll $30,%esi - xorl %r11d,%eax - addl %ecx,%r12d - roll $1,%r14d - addl %eax,%r12d - xorl 8(%rsp),%edx - movl %edi,%eax - movl %r14d,4(%rsp) - movl %r12d,%ecx - 
xorl 16(%rsp),%edx - xorl %esi,%eax - roll $5,%ecx - xorl 40(%rsp),%edx - andl %r13d,%eax - leal 1518500249(%r14,%r11,1),%r11d - roll $30,%r13d - xorl %edi,%eax - addl %ecx,%r11d - roll $1,%edx - addl %eax,%r11d - xorl 12(%rsp),%ebp - movl %esi,%eax - movl %edx,8(%rsp) - movl %r11d,%ecx - xorl 20(%rsp),%ebp - xorl %r13d,%eax - roll $5,%ecx - xorl 44(%rsp),%ebp - andl %r12d,%eax - leal 1518500249(%rdx,%rdi,1),%edi - roll $30,%r12d - xorl %esi,%eax - addl %ecx,%edi - roll $1,%ebp - addl %eax,%edi - xorl 16(%rsp),%r14d - movl %r13d,%eax - movl %ebp,12(%rsp) - movl %edi,%ecx - xorl 24(%rsp),%r14d - xorl %r12d,%eax - roll $5,%ecx - xorl 48(%rsp),%r14d - andl %r11d,%eax - leal 1518500249(%rbp,%rsi,1),%esi - roll $30,%r11d - xorl %r13d,%eax - addl %ecx,%esi - roll $1,%r14d - addl %eax,%esi - xorl 20(%rsp),%edx - movl %edi,%eax - movl %r14d,16(%rsp) - movl %esi,%ecx - xorl 28(%rsp),%edx - xorl %r12d,%eax - roll $5,%ecx - xorl 52(%rsp),%edx - leal 1859775393(%r14,%r13,1),%r13d - xorl %r11d,%eax - addl %ecx,%r13d - roll $30,%edi - addl %eax,%r13d - roll $1,%edx - xorl 24(%rsp),%ebp - movl %esi,%eax - movl %edx,20(%rsp) - movl %r13d,%ecx - xorl 32(%rsp),%ebp - xorl %r11d,%eax - roll $5,%ecx - xorl 56(%rsp),%ebp - leal 1859775393(%rdx,%r12,1),%r12d - xorl %edi,%eax - addl %ecx,%r12d - roll $30,%esi - addl %eax,%r12d - roll $1,%ebp - xorl 28(%rsp),%r14d - movl %r13d,%eax - movl %ebp,24(%rsp) - movl %r12d,%ecx - xorl 36(%rsp),%r14d - xorl %edi,%eax - roll $5,%ecx - xorl 60(%rsp),%r14d - leal 1859775393(%rbp,%r11,1),%r11d - xorl %esi,%eax - addl %ecx,%r11d - roll $30,%r13d - addl %eax,%r11d - roll $1,%r14d - xorl 32(%rsp),%edx - movl %r12d,%eax - movl %r14d,28(%rsp) - movl %r11d,%ecx - xorl 40(%rsp),%edx - xorl %esi,%eax - roll $5,%ecx - xorl 0(%rsp),%edx - leal 1859775393(%r14,%rdi,1),%edi - xorl %r13d,%eax - addl %ecx,%edi - roll $30,%r12d - addl %eax,%edi - roll $1,%edx - xorl 36(%rsp),%ebp - movl %r11d,%eax - movl %edx,32(%rsp) - movl %edi,%ecx - xorl 44(%rsp),%ebp - xorl 
%r13d,%eax - roll $5,%ecx - xorl 4(%rsp),%ebp - leal 1859775393(%rdx,%rsi,1),%esi - xorl %r12d,%eax - addl %ecx,%esi - roll $30,%r11d - addl %eax,%esi - roll $1,%ebp - xorl 40(%rsp),%r14d - movl %edi,%eax - movl %ebp,36(%rsp) - movl %esi,%ecx - xorl 48(%rsp),%r14d - xorl %r12d,%eax - roll $5,%ecx - xorl 8(%rsp),%r14d - leal 1859775393(%rbp,%r13,1),%r13d - xorl %r11d,%eax - addl %ecx,%r13d - roll $30,%edi - addl %eax,%r13d - roll $1,%r14d - xorl 44(%rsp),%edx - movl %esi,%eax - movl %r14d,40(%rsp) - movl %r13d,%ecx - xorl 52(%rsp),%edx - xorl %r11d,%eax - roll $5,%ecx - xorl 12(%rsp),%edx - leal 1859775393(%r14,%r12,1),%r12d - xorl %edi,%eax - addl %ecx,%r12d - roll $30,%esi - addl %eax,%r12d - roll $1,%edx - xorl 48(%rsp),%ebp - movl %r13d,%eax - movl %edx,44(%rsp) - movl %r12d,%ecx - xorl 56(%rsp),%ebp - xorl %edi,%eax - roll $5,%ecx - xorl 16(%rsp),%ebp - leal 1859775393(%rdx,%r11,1),%r11d - xorl %esi,%eax - addl %ecx,%r11d - roll $30,%r13d - addl %eax,%r11d - roll $1,%ebp - xorl 52(%rsp),%r14d - movl %r12d,%eax - movl %ebp,48(%rsp) - movl %r11d,%ecx - xorl 60(%rsp),%r14d - xorl %esi,%eax - roll $5,%ecx - xorl 20(%rsp),%r14d - leal 1859775393(%rbp,%rdi,1),%edi - xorl %r13d,%eax - addl %ecx,%edi - roll $30,%r12d - addl %eax,%edi - roll $1,%r14d - xorl 56(%rsp),%edx - movl %r11d,%eax - movl %r14d,52(%rsp) - movl %edi,%ecx - xorl 0(%rsp),%edx - xorl %r13d,%eax - roll $5,%ecx - xorl 24(%rsp),%edx - leal 1859775393(%r14,%rsi,1),%esi - xorl %r12d,%eax - addl %ecx,%esi - roll $30,%r11d - addl %eax,%esi - roll $1,%edx - xorl 60(%rsp),%ebp - movl %edi,%eax - movl %edx,56(%rsp) - movl %esi,%ecx - xorl 4(%rsp),%ebp - xorl %r12d,%eax - roll $5,%ecx - xorl 28(%rsp),%ebp - leal 1859775393(%rdx,%r13,1),%r13d - xorl %r11d,%eax - addl %ecx,%r13d - roll $30,%edi - addl %eax,%r13d - roll $1,%ebp - xorl 0(%rsp),%r14d - movl %esi,%eax - movl %ebp,60(%rsp) - movl %r13d,%ecx - xorl 8(%rsp),%r14d - xorl %r11d,%eax - roll $5,%ecx - xorl 32(%rsp),%r14d - leal 1859775393(%rbp,%r12,1),%r12d 
- xorl %edi,%eax - addl %ecx,%r12d - roll $30,%esi - addl %eax,%r12d - roll $1,%r14d - xorl 4(%rsp),%edx - movl %r13d,%eax - movl %r14d,0(%rsp) - movl %r12d,%ecx - xorl 12(%rsp),%edx - xorl %edi,%eax - roll $5,%ecx - xorl 36(%rsp),%edx - leal 1859775393(%r14,%r11,1),%r11d - xorl %esi,%eax - addl %ecx,%r11d - roll $30,%r13d - addl %eax,%r11d - roll $1,%edx - xorl 8(%rsp),%ebp - movl %r12d,%eax - movl %edx,4(%rsp) - movl %r11d,%ecx - xorl 16(%rsp),%ebp - xorl %esi,%eax - roll $5,%ecx - xorl 40(%rsp),%ebp - leal 1859775393(%rdx,%rdi,1),%edi - xorl %r13d,%eax - addl %ecx,%edi - roll $30,%r12d - addl %eax,%edi - roll $1,%ebp - xorl 12(%rsp),%r14d - movl %r11d,%eax - movl %ebp,8(%rsp) - movl %edi,%ecx - xorl 20(%rsp),%r14d - xorl %r13d,%eax - roll $5,%ecx - xorl 44(%rsp),%r14d - leal 1859775393(%rbp,%rsi,1),%esi - xorl %r12d,%eax - addl %ecx,%esi - roll $30,%r11d - addl %eax,%esi - roll $1,%r14d - xorl 16(%rsp),%edx - movl %edi,%eax - movl %r14d,12(%rsp) - movl %esi,%ecx - xorl 24(%rsp),%edx - xorl %r12d,%eax - roll $5,%ecx - xorl 48(%rsp),%edx - leal 1859775393(%r14,%r13,1),%r13d - xorl %r11d,%eax - addl %ecx,%r13d - roll $30,%edi - addl %eax,%r13d - roll $1,%edx - xorl 20(%rsp),%ebp - movl %esi,%eax - movl %edx,16(%rsp) - movl %r13d,%ecx - xorl 28(%rsp),%ebp - xorl %r11d,%eax - roll $5,%ecx - xorl 52(%rsp),%ebp - leal 1859775393(%rdx,%r12,1),%r12d - xorl %edi,%eax - addl %ecx,%r12d - roll $30,%esi - addl %eax,%r12d - roll $1,%ebp - xorl 24(%rsp),%r14d - movl %r13d,%eax - movl %ebp,20(%rsp) - movl %r12d,%ecx - xorl 32(%rsp),%r14d - xorl %edi,%eax - roll $5,%ecx - xorl 56(%rsp),%r14d - leal 1859775393(%rbp,%r11,1),%r11d - xorl %esi,%eax - addl %ecx,%r11d - roll $30,%r13d - addl %eax,%r11d - roll $1,%r14d - xorl 28(%rsp),%edx - movl %r12d,%eax - movl %r14d,24(%rsp) - movl %r11d,%ecx - xorl 36(%rsp),%edx - xorl %esi,%eax - roll $5,%ecx - xorl 60(%rsp),%edx - leal 1859775393(%r14,%rdi,1),%edi - xorl %r13d,%eax - addl %ecx,%edi - roll $30,%r12d - addl %eax,%edi - roll 
$1,%edx - xorl 32(%rsp),%ebp - movl %r11d,%eax - movl %edx,28(%rsp) - movl %edi,%ecx - xorl 40(%rsp),%ebp - xorl %r13d,%eax - roll $5,%ecx - xorl 0(%rsp),%ebp - leal 1859775393(%rdx,%rsi,1),%esi - xorl %r12d,%eax - addl %ecx,%esi - roll $30,%r11d - addl %eax,%esi - roll $1,%ebp - xorl 36(%rsp),%r14d - movl %r12d,%eax - movl %ebp,32(%rsp) - movl %r12d,%ebx - xorl 44(%rsp),%r14d - andl %r11d,%eax - movl %esi,%ecx - xorl 4(%rsp),%r14d - leal -1894007588(%rbp,%r13,1),%r13d - xorl %r11d,%ebx - roll $5,%ecx - addl %eax,%r13d - roll $1,%r14d - andl %edi,%ebx - addl %ecx,%r13d - roll $30,%edi - addl %ebx,%r13d - xorl 40(%rsp),%edx - movl %r11d,%eax - movl %r14d,36(%rsp) - movl %r11d,%ebx - xorl 48(%rsp),%edx - andl %edi,%eax - movl %r13d,%ecx - xorl 8(%rsp),%edx - leal -1894007588(%r14,%r12,1),%r12d - xorl %edi,%ebx - roll $5,%ecx - addl %eax,%r12d - roll $1,%edx - andl %esi,%ebx - addl %ecx,%r12d - roll $30,%esi - addl %ebx,%r12d - xorl 44(%rsp),%ebp - movl %edi,%eax - movl %edx,40(%rsp) - movl %edi,%ebx - xorl 52(%rsp),%ebp - andl %esi,%eax - movl %r12d,%ecx - xorl 12(%rsp),%ebp - leal -1894007588(%rdx,%r11,1),%r11d - xorl %esi,%ebx - roll $5,%ecx - addl %eax,%r11d - roll $1,%ebp - andl %r13d,%ebx - addl %ecx,%r11d - roll $30,%r13d - addl %ebx,%r11d - xorl 48(%rsp),%r14d - movl %esi,%eax - movl %ebp,44(%rsp) - movl %esi,%ebx - xorl 56(%rsp),%r14d - andl %r13d,%eax - movl %r11d,%ecx - xorl 16(%rsp),%r14d - leal -1894007588(%rbp,%rdi,1),%edi - xorl %r13d,%ebx - roll $5,%ecx - addl %eax,%edi - roll $1,%r14d - andl %r12d,%ebx - addl %ecx,%edi - roll $30,%r12d - addl %ebx,%edi - xorl 52(%rsp),%edx - movl %r13d,%eax - movl %r14d,48(%rsp) - movl %r13d,%ebx - xorl 60(%rsp),%edx - andl %r12d,%eax - movl %edi,%ecx - xorl 20(%rsp),%edx - leal -1894007588(%r14,%rsi,1),%esi - xorl %r12d,%ebx - roll $5,%ecx - addl %eax,%esi - roll $1,%edx - andl %r11d,%ebx - addl %ecx,%esi - roll $30,%r11d - addl %ebx,%esi - xorl 56(%rsp),%ebp - movl %r12d,%eax - movl %edx,52(%rsp) - movl %r12d,%ebx - 
xorl 0(%rsp),%ebp - andl %r11d,%eax - movl %esi,%ecx - xorl 24(%rsp),%ebp - leal -1894007588(%rdx,%r13,1),%r13d - xorl %r11d,%ebx - roll $5,%ecx - addl %eax,%r13d - roll $1,%ebp - andl %edi,%ebx - addl %ecx,%r13d - roll $30,%edi - addl %ebx,%r13d - xorl 60(%rsp),%r14d - movl %r11d,%eax - movl %ebp,56(%rsp) - movl %r11d,%ebx - xorl 4(%rsp),%r14d - andl %edi,%eax - movl %r13d,%ecx - xorl 28(%rsp),%r14d - leal -1894007588(%rbp,%r12,1),%r12d - xorl %edi,%ebx - roll $5,%ecx - addl %eax,%r12d - roll $1,%r14d - andl %esi,%ebx - addl %ecx,%r12d - roll $30,%esi - addl %ebx,%r12d - xorl 0(%rsp),%edx - movl %edi,%eax - movl %r14d,60(%rsp) - movl %edi,%ebx - xorl 8(%rsp),%edx - andl %esi,%eax - movl %r12d,%ecx - xorl 32(%rsp),%edx - leal -1894007588(%r14,%r11,1),%r11d - xorl %esi,%ebx - roll $5,%ecx - addl %eax,%r11d - roll $1,%edx - andl %r13d,%ebx - addl %ecx,%r11d - roll $30,%r13d - addl %ebx,%r11d - xorl 4(%rsp),%ebp - movl %esi,%eax - movl %edx,0(%rsp) - movl %esi,%ebx - xorl 12(%rsp),%ebp - andl %r13d,%eax - movl %r11d,%ecx - xorl 36(%rsp),%ebp - leal -1894007588(%rdx,%rdi,1),%edi - xorl %r13d,%ebx - roll $5,%ecx - addl %eax,%edi - roll $1,%ebp - andl %r12d,%ebx - addl %ecx,%edi - roll $30,%r12d - addl %ebx,%edi - xorl 8(%rsp),%r14d - movl %r13d,%eax - movl %ebp,4(%rsp) - movl %r13d,%ebx - xorl 16(%rsp),%r14d - andl %r12d,%eax - movl %edi,%ecx - xorl 40(%rsp),%r14d - leal -1894007588(%rbp,%rsi,1),%esi - xorl %r12d,%ebx - roll $5,%ecx - addl %eax,%esi - roll $1,%r14d - andl %r11d,%ebx - addl %ecx,%esi - roll $30,%r11d - addl %ebx,%esi - xorl 12(%rsp),%edx - movl %r12d,%eax - movl %r14d,8(%rsp) - movl %r12d,%ebx - xorl 20(%rsp),%edx - andl %r11d,%eax - movl %esi,%ecx - xorl 44(%rsp),%edx - leal -1894007588(%r14,%r13,1),%r13d - xorl %r11d,%ebx - roll $5,%ecx - addl %eax,%r13d - roll $1,%edx - andl %edi,%ebx - addl %ecx,%r13d - roll $30,%edi - addl %ebx,%r13d - xorl 16(%rsp),%ebp - movl %r11d,%eax - movl %edx,12(%rsp) - movl %r11d,%ebx - xorl 24(%rsp),%ebp - andl %edi,%eax - 
movl %r13d,%ecx - xorl 48(%rsp),%ebp - leal -1894007588(%rdx,%r12,1),%r12d - xorl %edi,%ebx - roll $5,%ecx - addl %eax,%r12d - roll $1,%ebp - andl %esi,%ebx - addl %ecx,%r12d - roll $30,%esi - addl %ebx,%r12d - xorl 20(%rsp),%r14d - movl %edi,%eax - movl %ebp,16(%rsp) - movl %edi,%ebx - xorl 28(%rsp),%r14d - andl %esi,%eax - movl %r12d,%ecx - xorl 52(%rsp),%r14d - leal -1894007588(%rbp,%r11,1),%r11d - xorl %esi,%ebx - roll $5,%ecx - addl %eax,%r11d - roll $1,%r14d - andl %r13d,%ebx - addl %ecx,%r11d - roll $30,%r13d - addl %ebx,%r11d - xorl 24(%rsp),%edx - movl %esi,%eax - movl %r14d,20(%rsp) - movl %esi,%ebx - xorl 32(%rsp),%edx - andl %r13d,%eax - movl %r11d,%ecx - xorl 56(%rsp),%edx - leal -1894007588(%r14,%rdi,1),%edi - xorl %r13d,%ebx - roll $5,%ecx - addl %eax,%edi - roll $1,%edx - andl %r12d,%ebx - addl %ecx,%edi - roll $30,%r12d - addl %ebx,%edi - xorl 28(%rsp),%ebp - movl %r13d,%eax - movl %edx,24(%rsp) - movl %r13d,%ebx - xorl 36(%rsp),%ebp - andl %r12d,%eax - movl %edi,%ecx - xorl 60(%rsp),%ebp - leal -1894007588(%rdx,%rsi,1),%esi - xorl %r12d,%ebx - roll $5,%ecx - addl %eax,%esi - roll $1,%ebp - andl %r11d,%ebx - addl %ecx,%esi - roll $30,%r11d - addl %ebx,%esi - xorl 32(%rsp),%r14d - movl %r12d,%eax - movl %ebp,28(%rsp) - movl %r12d,%ebx - xorl 40(%rsp),%r14d - andl %r11d,%eax - movl %esi,%ecx - xorl 0(%rsp),%r14d - leal -1894007588(%rbp,%r13,1),%r13d - xorl %r11d,%ebx - roll $5,%ecx - addl %eax,%r13d - roll $1,%r14d - andl %edi,%ebx - addl %ecx,%r13d - roll $30,%edi - addl %ebx,%r13d - xorl 36(%rsp),%edx - movl %r11d,%eax - movl %r14d,32(%rsp) - movl %r11d,%ebx - xorl 44(%rsp),%edx - andl %edi,%eax - movl %r13d,%ecx - xorl 4(%rsp),%edx - leal -1894007588(%r14,%r12,1),%r12d - xorl %edi,%ebx - roll $5,%ecx - addl %eax,%r12d - roll $1,%edx - andl %esi,%ebx - addl %ecx,%r12d - roll $30,%esi - addl %ebx,%r12d - xorl 40(%rsp),%ebp - movl %edi,%eax - movl %edx,36(%rsp) - movl %edi,%ebx - xorl 48(%rsp),%ebp - andl %esi,%eax - movl %r12d,%ecx - xorl 
8(%rsp),%ebp - leal -1894007588(%rdx,%r11,1),%r11d - xorl %esi,%ebx - roll $5,%ecx - addl %eax,%r11d - roll $1,%ebp - andl %r13d,%ebx - addl %ecx,%r11d - roll $30,%r13d - addl %ebx,%r11d - xorl 44(%rsp),%r14d - movl %esi,%eax - movl %ebp,40(%rsp) - movl %esi,%ebx - xorl 52(%rsp),%r14d - andl %r13d,%eax - movl %r11d,%ecx - xorl 12(%rsp),%r14d - leal -1894007588(%rbp,%rdi,1),%edi - xorl %r13d,%ebx - roll $5,%ecx - addl %eax,%edi - roll $1,%r14d - andl %r12d,%ebx - addl %ecx,%edi - roll $30,%r12d - addl %ebx,%edi - xorl 48(%rsp),%edx - movl %r13d,%eax - movl %r14d,44(%rsp) - movl %r13d,%ebx - xorl 56(%rsp),%edx - andl %r12d,%eax - movl %edi,%ecx - xorl 16(%rsp),%edx - leal -1894007588(%r14,%rsi,1),%esi - xorl %r12d,%ebx - roll $5,%ecx - addl %eax,%esi - roll $1,%edx - andl %r11d,%ebx - addl %ecx,%esi - roll $30,%r11d - addl %ebx,%esi - xorl 52(%rsp),%ebp - movl %edi,%eax - movl %edx,48(%rsp) - movl %esi,%ecx - xorl 60(%rsp),%ebp - xorl %r12d,%eax - roll $5,%ecx - xorl 20(%rsp),%ebp - leal -899497514(%rdx,%r13,1),%r13d - xorl %r11d,%eax - addl %ecx,%r13d - roll $30,%edi - addl %eax,%r13d - roll $1,%ebp - xorl 56(%rsp),%r14d - movl %esi,%eax - movl %ebp,52(%rsp) - movl %r13d,%ecx - xorl 0(%rsp),%r14d - xorl %r11d,%eax - roll $5,%ecx - xorl 24(%rsp),%r14d - leal -899497514(%rbp,%r12,1),%r12d - xorl %edi,%eax - addl %ecx,%r12d - roll $30,%esi - addl %eax,%r12d - roll $1,%r14d - xorl 60(%rsp),%edx - movl %r13d,%eax - movl %r14d,56(%rsp) - movl %r12d,%ecx - xorl 4(%rsp),%edx - xorl %edi,%eax - roll $5,%ecx - xorl 28(%rsp),%edx - leal -899497514(%r14,%r11,1),%r11d - xorl %esi,%eax - addl %ecx,%r11d - roll $30,%r13d - addl %eax,%r11d - roll $1,%edx - xorl 0(%rsp),%ebp - movl %r12d,%eax - movl %edx,60(%rsp) - movl %r11d,%ecx - xorl 8(%rsp),%ebp - xorl %esi,%eax - roll $5,%ecx - xorl 32(%rsp),%ebp - leal -899497514(%rdx,%rdi,1),%edi - xorl %r13d,%eax - addl %ecx,%edi - roll $30,%r12d - addl %eax,%edi - roll $1,%ebp - xorl 4(%rsp),%r14d - movl %r11d,%eax - movl %ebp,0(%rsp) - 
movl %edi,%ecx - xorl 12(%rsp),%r14d - xorl %r13d,%eax - roll $5,%ecx - xorl 36(%rsp),%r14d - leal -899497514(%rbp,%rsi,1),%esi - xorl %r12d,%eax - addl %ecx,%esi - roll $30,%r11d - addl %eax,%esi - roll $1,%r14d - xorl 8(%rsp),%edx - movl %edi,%eax - movl %r14d,4(%rsp) - movl %esi,%ecx - xorl 16(%rsp),%edx - xorl %r12d,%eax - roll $5,%ecx - xorl 40(%rsp),%edx - leal -899497514(%r14,%r13,1),%r13d - xorl %r11d,%eax - addl %ecx,%r13d - roll $30,%edi - addl %eax,%r13d - roll $1,%edx - xorl 12(%rsp),%ebp - movl %esi,%eax - movl %edx,8(%rsp) - movl %r13d,%ecx - xorl 20(%rsp),%ebp - xorl %r11d,%eax - roll $5,%ecx - xorl 44(%rsp),%ebp - leal -899497514(%rdx,%r12,1),%r12d - xorl %edi,%eax - addl %ecx,%r12d - roll $30,%esi - addl %eax,%r12d - roll $1,%ebp - xorl 16(%rsp),%r14d - movl %r13d,%eax - movl %ebp,12(%rsp) - movl %r12d,%ecx - xorl 24(%rsp),%r14d - xorl %edi,%eax - roll $5,%ecx - xorl 48(%rsp),%r14d - leal -899497514(%rbp,%r11,1),%r11d - xorl %esi,%eax - addl %ecx,%r11d - roll $30,%r13d - addl %eax,%r11d - roll $1,%r14d - xorl 20(%rsp),%edx - movl %r12d,%eax - movl %r14d,16(%rsp) - movl %r11d,%ecx - xorl 28(%rsp),%edx - xorl %esi,%eax - roll $5,%ecx - xorl 52(%rsp),%edx - leal -899497514(%r14,%rdi,1),%edi - xorl %r13d,%eax - addl %ecx,%edi - roll $30,%r12d - addl %eax,%edi - roll $1,%edx - xorl 24(%rsp),%ebp - movl %r11d,%eax - movl %edx,20(%rsp) - movl %edi,%ecx - xorl 32(%rsp),%ebp - xorl %r13d,%eax - roll $5,%ecx - xorl 56(%rsp),%ebp - leal -899497514(%rdx,%rsi,1),%esi - xorl %r12d,%eax - addl %ecx,%esi - roll $30,%r11d - addl %eax,%esi - roll $1,%ebp - xorl 28(%rsp),%r14d - movl %edi,%eax - movl %ebp,24(%rsp) - movl %esi,%ecx - xorl 36(%rsp),%r14d - xorl %r12d,%eax - roll $5,%ecx - xorl 60(%rsp),%r14d - leal -899497514(%rbp,%r13,1),%r13d - xorl %r11d,%eax - addl %ecx,%r13d - roll $30,%edi - addl %eax,%r13d - roll $1,%r14d - xorl 32(%rsp),%edx - movl %esi,%eax - movl %r14d,28(%rsp) - movl %r13d,%ecx - xorl 40(%rsp),%edx - xorl %r11d,%eax - roll $5,%ecx - xorl 
0(%rsp),%edx - leal -899497514(%r14,%r12,1),%r12d - xorl %edi,%eax - addl %ecx,%r12d - roll $30,%esi - addl %eax,%r12d - roll $1,%edx - xorl 36(%rsp),%ebp - movl %r13d,%eax - - movl %r12d,%ecx - xorl 44(%rsp),%ebp - xorl %edi,%eax - roll $5,%ecx - xorl 4(%rsp),%ebp - leal -899497514(%rdx,%r11,1),%r11d - xorl %esi,%eax - addl %ecx,%r11d - roll $30,%r13d - addl %eax,%r11d - roll $1,%ebp - xorl 40(%rsp),%r14d - movl %r12d,%eax - - movl %r11d,%ecx - xorl 48(%rsp),%r14d - xorl %esi,%eax - roll $5,%ecx - xorl 8(%rsp),%r14d - leal -899497514(%rbp,%rdi,1),%edi - xorl %r13d,%eax - addl %ecx,%edi - roll $30,%r12d - addl %eax,%edi - roll $1,%r14d - xorl 44(%rsp),%edx - movl %r11d,%eax - - movl %edi,%ecx - xorl 52(%rsp),%edx - xorl %r13d,%eax - roll $5,%ecx - xorl 12(%rsp),%edx - leal -899497514(%r14,%rsi,1),%esi - xorl %r12d,%eax - addl %ecx,%esi - roll $30,%r11d - addl %eax,%esi - roll $1,%edx - xorl 48(%rsp),%ebp - movl %edi,%eax - - movl %esi,%ecx - xorl 56(%rsp),%ebp - xorl %r12d,%eax - roll $5,%ecx - xorl 16(%rsp),%ebp - leal -899497514(%rdx,%r13,1),%r13d - xorl %r11d,%eax - addl %ecx,%r13d - roll $30,%edi - addl %eax,%r13d - roll $1,%ebp - xorl 52(%rsp),%r14d - movl %esi,%eax - - movl %r13d,%ecx - xorl 60(%rsp),%r14d - xorl %r11d,%eax - roll $5,%ecx - xorl 20(%rsp),%r14d - leal -899497514(%rbp,%r12,1),%r12d - xorl %edi,%eax - addl %ecx,%r12d - roll $30,%esi - addl %eax,%r12d - roll $1,%r14d - xorl 56(%rsp),%edx - movl %r13d,%eax - - movl %r12d,%ecx - xorl 0(%rsp),%edx - xorl %edi,%eax - roll $5,%ecx - xorl 24(%rsp),%edx - leal -899497514(%r14,%r11,1),%r11d - xorl %esi,%eax - addl %ecx,%r11d - roll $30,%r13d - addl %eax,%r11d - roll $1,%edx - xorl 60(%rsp),%ebp - movl %r12d,%eax - - movl %r11d,%ecx - xorl 4(%rsp),%ebp - xorl %esi,%eax - roll $5,%ecx - xorl 28(%rsp),%ebp - leal -899497514(%rdx,%rdi,1),%edi - xorl %r13d,%eax - addl %ecx,%edi - roll $30,%r12d - addl %eax,%edi - roll $1,%ebp - movl %r11d,%eax - movl %edi,%ecx - xorl %r13d,%eax - leal 
-899497514(%rbp,%rsi,1),%esi - roll $5,%ecx - xorl %r12d,%eax - addl %ecx,%esi - roll $30,%r11d - addl %eax,%esi - addl 0(%r8),%esi - addl 4(%r8),%edi - addl 8(%r8),%r11d - addl 12(%r8),%r12d - addl 16(%r8),%r13d - movl %esi,0(%r8) - movl %edi,4(%r8) - movl %r11d,8(%r8) - movl %r12d,12(%r8) - movl %r13d,16(%r8) - - subq $1,%r10 - leaq 64(%r9),%r9 - jnz L$loop - - movq 64(%rsp),%rsi - - movq -40(%rsi),%r14 - - movq -32(%rsi),%r13 - - movq -24(%rsi),%r12 - - movq -16(%rsi),%rbp - - movq -8(%rsi),%rbx - - leaq (%rsi),%rsp - -L$epilogue: - .byte 0xf3,0xc3 - - - -.p2align 5 -sha1_block_data_order_shaext: -_shaext_shortcut: - - movdqu (%rdi),%xmm0 - movd 16(%rdi),%xmm1 - movdqa K_XX_XX+160(%rip),%xmm3 - - movdqu (%rsi),%xmm4 - pshufd $27,%xmm0,%xmm0 - movdqu 16(%rsi),%xmm5 - pshufd $27,%xmm1,%xmm1 - movdqu 32(%rsi),%xmm6 -.byte 102,15,56,0,227 - movdqu 48(%rsi),%xmm7 -.byte 102,15,56,0,235 -.byte 102,15,56,0,243 - movdqa %xmm1,%xmm9 -.byte 102,15,56,0,251 - jmp L$oop_shaext - -.p2align 4 -L$oop_shaext: - decq %rdx - leaq 64(%rsi),%r8 - paddd %xmm4,%xmm1 - cmovneq %r8,%rsi - movdqa %xmm0,%xmm8 -.byte 15,56,201,229 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,0 -.byte 15,56,200,213 - pxor %xmm6,%xmm4 -.byte 15,56,201,238 -.byte 15,56,202,231 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,0 -.byte 15,56,200,206 - pxor %xmm7,%xmm5 -.byte 15,56,202,236 -.byte 15,56,201,247 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,0 -.byte 15,56,200,215 - pxor %xmm4,%xmm6 -.byte 15,56,201,252 -.byte 15,56,202,245 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,0 -.byte 15,56,200,204 - pxor %xmm5,%xmm7 -.byte 15,56,202,254 -.byte 15,56,201,229 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,0 -.byte 15,56,200,213 - pxor %xmm6,%xmm4 -.byte 15,56,201,238 -.byte 15,56,202,231 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,1 -.byte 15,56,200,206 - pxor %xmm7,%xmm5 -.byte 15,56,202,236 -.byte 15,56,201,247 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,1 -.byte 15,56,200,215 - pxor %xmm4,%xmm6 -.byte 15,56,201,252 -.byte 
15,56,202,245 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,1 -.byte 15,56,200,204 - pxor %xmm5,%xmm7 -.byte 15,56,202,254 -.byte 15,56,201,229 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,1 -.byte 15,56,200,213 - pxor %xmm6,%xmm4 -.byte 15,56,201,238 -.byte 15,56,202,231 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,1 -.byte 15,56,200,206 - pxor %xmm7,%xmm5 -.byte 15,56,202,236 -.byte 15,56,201,247 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,2 -.byte 15,56,200,215 - pxor %xmm4,%xmm6 -.byte 15,56,201,252 -.byte 15,56,202,245 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,2 -.byte 15,56,200,204 - pxor %xmm5,%xmm7 -.byte 15,56,202,254 -.byte 15,56,201,229 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,2 -.byte 15,56,200,213 - pxor %xmm6,%xmm4 -.byte 15,56,201,238 -.byte 15,56,202,231 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,2 -.byte 15,56,200,206 - pxor %xmm7,%xmm5 -.byte 15,56,202,236 -.byte 15,56,201,247 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,2 -.byte 15,56,200,215 - pxor %xmm4,%xmm6 -.byte 15,56,201,252 -.byte 15,56,202,245 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,3 -.byte 15,56,200,204 - pxor %xmm5,%xmm7 -.byte 15,56,202,254 - movdqu (%rsi),%xmm4 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,3 -.byte 15,56,200,213 - movdqu 16(%rsi),%xmm5 -.byte 102,15,56,0,227 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,3 -.byte 15,56,200,206 - movdqu 32(%rsi),%xmm6 -.byte 102,15,56,0,235 - - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,3 -.byte 15,56,200,215 - movdqu 48(%rsi),%xmm7 -.byte 102,15,56,0,243 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,3 -.byte 65,15,56,200,201 -.byte 102,15,56,0,251 - - paddd %xmm8,%xmm0 - movdqa %xmm1,%xmm9 - - jnz L$oop_shaext - - pshufd $27,%xmm0,%xmm0 - pshufd $27,%xmm1,%xmm1 - movdqu %xmm0,(%rdi) - movd %xmm1,16(%rdi) - .byte 0xf3,0xc3 - - - -.p2align 4 -sha1_block_data_order_ssse3: -_ssse3_shortcut: - - movq %rsp,%r11 - - pushq %rbx - - pushq %rbp - - pushq %r12 - - pushq %r13 - - pushq %r14 - - leaq -64(%rsp),%rsp - andq $-64,%rsp - movq %rdi,%r8 - movq 
%rsi,%r9 - movq %rdx,%r10 - - shlq $6,%r10 - addq %r9,%r10 - leaq K_XX_XX+64(%rip),%r14 - - movl 0(%r8),%eax - movl 4(%r8),%ebx - movl 8(%r8),%ecx - movl 12(%r8),%edx - movl %ebx,%esi - movl 16(%r8),%ebp - movl %ecx,%edi - xorl %edx,%edi - andl %edi,%esi - - movdqa 64(%r14),%xmm6 - movdqa -64(%r14),%xmm9 - movdqu 0(%r9),%xmm0 - movdqu 16(%r9),%xmm1 - movdqu 32(%r9),%xmm2 - movdqu 48(%r9),%xmm3 -.byte 102,15,56,0,198 -.byte 102,15,56,0,206 -.byte 102,15,56,0,214 - addq $64,%r9 - paddd %xmm9,%xmm0 -.byte 102,15,56,0,222 - paddd %xmm9,%xmm1 - paddd %xmm9,%xmm2 - movdqa %xmm0,0(%rsp) - psubd %xmm9,%xmm0 - movdqa %xmm1,16(%rsp) - psubd %xmm9,%xmm1 - movdqa %xmm2,32(%rsp) - psubd %xmm9,%xmm2 - jmp L$oop_ssse3 -.p2align 4 -L$oop_ssse3: - rorl $2,%ebx - pshufd $238,%xmm0,%xmm4 - xorl %edx,%esi - movdqa %xmm3,%xmm8 - paddd %xmm3,%xmm9 - movl %eax,%edi - addl 0(%rsp),%ebp - punpcklqdq %xmm1,%xmm4 - xorl %ecx,%ebx - roll $5,%eax - addl %esi,%ebp - psrldq $4,%xmm8 - andl %ebx,%edi - xorl %ecx,%ebx - pxor %xmm0,%xmm4 - addl %eax,%ebp - rorl $7,%eax - pxor %xmm2,%xmm8 - xorl %ecx,%edi - movl %ebp,%esi - addl 4(%rsp),%edx - pxor %xmm8,%xmm4 - xorl %ebx,%eax - roll $5,%ebp - movdqa %xmm9,48(%rsp) - addl %edi,%edx - andl %eax,%esi - movdqa %xmm4,%xmm10 - xorl %ebx,%eax - addl %ebp,%edx - rorl $7,%ebp - movdqa %xmm4,%xmm8 - xorl %ebx,%esi - pslldq $12,%xmm10 - paddd %xmm4,%xmm4 - movl %edx,%edi - addl 8(%rsp),%ecx - psrld $31,%xmm8 - xorl %eax,%ebp - roll $5,%edx - addl %esi,%ecx - movdqa %xmm10,%xmm9 - andl %ebp,%edi - xorl %eax,%ebp - psrld $30,%xmm10 - addl %edx,%ecx - rorl $7,%edx - por %xmm8,%xmm4 - xorl %eax,%edi - movl %ecx,%esi - addl 12(%rsp),%ebx - pslld $2,%xmm9 - pxor %xmm10,%xmm4 - xorl %ebp,%edx - movdqa -64(%r14),%xmm10 - roll $5,%ecx - addl %edi,%ebx - andl %edx,%esi - pxor %xmm9,%xmm4 - xorl %ebp,%edx - addl %ecx,%ebx - rorl $7,%ecx - pshufd $238,%xmm1,%xmm5 - xorl %ebp,%esi - movdqa %xmm4,%xmm9 - paddd %xmm4,%xmm10 - movl %ebx,%edi - addl 16(%rsp),%eax - punpcklqdq 
%xmm2,%xmm5 - xorl %edx,%ecx - roll $5,%ebx - addl %esi,%eax - psrldq $4,%xmm9 - andl %ecx,%edi - xorl %edx,%ecx - pxor %xmm1,%xmm5 - addl %ebx,%eax - rorl $7,%ebx - pxor %xmm3,%xmm9 - xorl %edx,%edi - movl %eax,%esi - addl 20(%rsp),%ebp - pxor %xmm9,%xmm5 - xorl %ecx,%ebx - roll $5,%eax - movdqa %xmm10,0(%rsp) - addl %edi,%ebp - andl %ebx,%esi - movdqa %xmm5,%xmm8 - xorl %ecx,%ebx - addl %eax,%ebp - rorl $7,%eax - movdqa %xmm5,%xmm9 - xorl %ecx,%esi - pslldq $12,%xmm8 - paddd %xmm5,%xmm5 - movl %ebp,%edi - addl 24(%rsp),%edx - psrld $31,%xmm9 - xorl %ebx,%eax - roll $5,%ebp - addl %esi,%edx - movdqa %xmm8,%xmm10 - andl %eax,%edi - xorl %ebx,%eax - psrld $30,%xmm8 - addl %ebp,%edx - rorl $7,%ebp - por %xmm9,%xmm5 - xorl %ebx,%edi - movl %edx,%esi - addl 28(%rsp),%ecx - pslld $2,%xmm10 - pxor %xmm8,%xmm5 - xorl %eax,%ebp - movdqa -32(%r14),%xmm8 - roll $5,%edx - addl %edi,%ecx - andl %ebp,%esi - pxor %xmm10,%xmm5 - xorl %eax,%ebp - addl %edx,%ecx - rorl $7,%edx - pshufd $238,%xmm2,%xmm6 - xorl %eax,%esi - movdqa %xmm5,%xmm10 - paddd %xmm5,%xmm8 - movl %ecx,%edi - addl 32(%rsp),%ebx - punpcklqdq %xmm3,%xmm6 - xorl %ebp,%edx - roll $5,%ecx - addl %esi,%ebx - psrldq $4,%xmm10 - andl %edx,%edi - xorl %ebp,%edx - pxor %xmm2,%xmm6 - addl %ecx,%ebx - rorl $7,%ecx - pxor %xmm4,%xmm10 - xorl %ebp,%edi - movl %ebx,%esi - addl 36(%rsp),%eax - pxor %xmm10,%xmm6 - xorl %edx,%ecx - roll $5,%ebx - movdqa %xmm8,16(%rsp) - addl %edi,%eax - andl %ecx,%esi - movdqa %xmm6,%xmm9 - xorl %edx,%ecx - addl %ebx,%eax - rorl $7,%ebx - movdqa %xmm6,%xmm10 - xorl %edx,%esi - pslldq $12,%xmm9 - paddd %xmm6,%xmm6 - movl %eax,%edi - addl 40(%rsp),%ebp - psrld $31,%xmm10 - xorl %ecx,%ebx - roll $5,%eax - addl %esi,%ebp - movdqa %xmm9,%xmm8 - andl %ebx,%edi - xorl %ecx,%ebx - psrld $30,%xmm9 - addl %eax,%ebp - rorl $7,%eax - por %xmm10,%xmm6 - xorl %ecx,%edi - movl %ebp,%esi - addl 44(%rsp),%edx - pslld $2,%xmm8 - pxor %xmm9,%xmm6 - xorl %ebx,%eax - movdqa -32(%r14),%xmm9 - roll $5,%ebp - addl 
%edi,%edx - andl %eax,%esi - pxor %xmm8,%xmm6 - xorl %ebx,%eax - addl %ebp,%edx - rorl $7,%ebp - pshufd $238,%xmm3,%xmm7 - xorl %ebx,%esi - movdqa %xmm6,%xmm8 - paddd %xmm6,%xmm9 - movl %edx,%edi - addl 48(%rsp),%ecx - punpcklqdq %xmm4,%xmm7 - xorl %eax,%ebp - roll $5,%edx - addl %esi,%ecx - psrldq $4,%xmm8 - andl %ebp,%edi - xorl %eax,%ebp - pxor %xmm3,%xmm7 - addl %edx,%ecx - rorl $7,%edx - pxor %xmm5,%xmm8 - xorl %eax,%edi - movl %ecx,%esi - addl 52(%rsp),%ebx - pxor %xmm8,%xmm7 - xorl %ebp,%edx - roll $5,%ecx - movdqa %xmm9,32(%rsp) - addl %edi,%ebx - andl %edx,%esi - movdqa %xmm7,%xmm10 - xorl %ebp,%edx - addl %ecx,%ebx - rorl $7,%ecx - movdqa %xmm7,%xmm8 - xorl %ebp,%esi - pslldq $12,%xmm10 - paddd %xmm7,%xmm7 - movl %ebx,%edi - addl 56(%rsp),%eax - psrld $31,%xmm8 - xorl %edx,%ecx - roll $5,%ebx - addl %esi,%eax - movdqa %xmm10,%xmm9 - andl %ecx,%edi - xorl %edx,%ecx - psrld $30,%xmm10 - addl %ebx,%eax - rorl $7,%ebx - por %xmm8,%xmm7 - xorl %edx,%edi - movl %eax,%esi - addl 60(%rsp),%ebp - pslld $2,%xmm9 - pxor %xmm10,%xmm7 - xorl %ecx,%ebx - movdqa -32(%r14),%xmm10 - roll $5,%eax - addl %edi,%ebp - andl %ebx,%esi - pxor %xmm9,%xmm7 - pshufd $238,%xmm6,%xmm9 - xorl %ecx,%ebx - addl %eax,%ebp - rorl $7,%eax - pxor %xmm4,%xmm0 - xorl %ecx,%esi - movl %ebp,%edi - addl 0(%rsp),%edx - punpcklqdq %xmm7,%xmm9 - xorl %ebx,%eax - roll $5,%ebp - pxor %xmm1,%xmm0 - addl %esi,%edx - andl %eax,%edi - movdqa %xmm10,%xmm8 - xorl %ebx,%eax - paddd %xmm7,%xmm10 - addl %ebp,%edx - pxor %xmm9,%xmm0 - rorl $7,%ebp - xorl %ebx,%edi - movl %edx,%esi - addl 4(%rsp),%ecx - movdqa %xmm0,%xmm9 - xorl %eax,%ebp - roll $5,%edx - movdqa %xmm10,48(%rsp) - addl %edi,%ecx - andl %ebp,%esi - xorl %eax,%ebp - pslld $2,%xmm0 - addl %edx,%ecx - rorl $7,%edx - psrld $30,%xmm9 - xorl %eax,%esi - movl %ecx,%edi - addl 8(%rsp),%ebx - por %xmm9,%xmm0 - xorl %ebp,%edx - roll $5,%ecx - pshufd $238,%xmm7,%xmm10 - addl %esi,%ebx - andl %edx,%edi - xorl %ebp,%edx - addl %ecx,%ebx - addl 12(%rsp),%eax - 
xorl %ebp,%edi - movl %ebx,%esi - roll $5,%ebx - addl %edi,%eax - xorl %edx,%esi - rorl $7,%ecx - addl %ebx,%eax - pxor %xmm5,%xmm1 - addl 16(%rsp),%ebp - xorl %ecx,%esi - punpcklqdq %xmm0,%xmm10 - movl %eax,%edi - roll $5,%eax - pxor %xmm2,%xmm1 - addl %esi,%ebp - xorl %ecx,%edi - movdqa %xmm8,%xmm9 - rorl $7,%ebx - paddd %xmm0,%xmm8 - addl %eax,%ebp - pxor %xmm10,%xmm1 - addl 20(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - roll $5,%ebp - movdqa %xmm1,%xmm10 - addl %edi,%edx - xorl %ebx,%esi - movdqa %xmm8,0(%rsp) - rorl $7,%eax - addl %ebp,%edx - addl 24(%rsp),%ecx - pslld $2,%xmm1 - xorl %eax,%esi - movl %edx,%edi - psrld $30,%xmm10 - roll $5,%edx - addl %esi,%ecx - xorl %eax,%edi - rorl $7,%ebp - por %xmm10,%xmm1 - addl %edx,%ecx - addl 28(%rsp),%ebx - pshufd $238,%xmm0,%xmm8 - xorl %ebp,%edi - movl %ecx,%esi - roll $5,%ecx - addl %edi,%ebx - xorl %ebp,%esi - rorl $7,%edx - addl %ecx,%ebx - pxor %xmm6,%xmm2 - addl 32(%rsp),%eax - xorl %edx,%esi - punpcklqdq %xmm1,%xmm8 - movl %ebx,%edi - roll $5,%ebx - pxor %xmm3,%xmm2 - addl %esi,%eax - xorl %edx,%edi - movdqa 0(%r14),%xmm10 - rorl $7,%ecx - paddd %xmm1,%xmm9 - addl %ebx,%eax - pxor %xmm8,%xmm2 - addl 36(%rsp),%ebp - xorl %ecx,%edi - movl %eax,%esi - roll $5,%eax - movdqa %xmm2,%xmm8 - addl %edi,%ebp - xorl %ecx,%esi - movdqa %xmm9,16(%rsp) - rorl $7,%ebx - addl %eax,%ebp - addl 40(%rsp),%edx - pslld $2,%xmm2 - xorl %ebx,%esi - movl %ebp,%edi - psrld $30,%xmm8 - roll $5,%ebp - addl %esi,%edx - xorl %ebx,%edi - rorl $7,%eax - por %xmm8,%xmm2 - addl %ebp,%edx - addl 44(%rsp),%ecx - pshufd $238,%xmm1,%xmm9 - xorl %eax,%edi - movl %edx,%esi - roll $5,%edx - addl %edi,%ecx - xorl %eax,%esi - rorl $7,%ebp - addl %edx,%ecx - pxor %xmm7,%xmm3 - addl 48(%rsp),%ebx - xorl %ebp,%esi - punpcklqdq %xmm2,%xmm9 - movl %ecx,%edi - roll $5,%ecx - pxor %xmm4,%xmm3 - addl %esi,%ebx - xorl %ebp,%edi - movdqa %xmm10,%xmm8 - rorl $7,%edx - paddd %xmm2,%xmm10 - addl %ecx,%ebx - pxor %xmm9,%xmm3 - addl 52(%rsp),%eax - xorl 
%edx,%edi - movl %ebx,%esi - roll $5,%ebx - movdqa %xmm3,%xmm9 - addl %edi,%eax - xorl %edx,%esi - movdqa %xmm10,32(%rsp) - rorl $7,%ecx - addl %ebx,%eax - addl 56(%rsp),%ebp - pslld $2,%xmm3 - xorl %ecx,%esi - movl %eax,%edi - psrld $30,%xmm9 - roll $5,%eax - addl %esi,%ebp - xorl %ecx,%edi - rorl $7,%ebx - por %xmm9,%xmm3 - addl %eax,%ebp - addl 60(%rsp),%edx - pshufd $238,%xmm2,%xmm10 - xorl %ebx,%edi - movl %ebp,%esi - roll $5,%ebp - addl %edi,%edx - xorl %ebx,%esi - rorl $7,%eax - addl %ebp,%edx - pxor %xmm0,%xmm4 - addl 0(%rsp),%ecx - xorl %eax,%esi - punpcklqdq %xmm3,%xmm10 - movl %edx,%edi - roll $5,%edx - pxor %xmm5,%xmm4 - addl %esi,%ecx - xorl %eax,%edi - movdqa %xmm8,%xmm9 - rorl $7,%ebp - paddd %xmm3,%xmm8 - addl %edx,%ecx - pxor %xmm10,%xmm4 - addl 4(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - roll $5,%ecx - movdqa %xmm4,%xmm10 - addl %edi,%ebx - xorl %ebp,%esi - movdqa %xmm8,48(%rsp) - rorl $7,%edx - addl %ecx,%ebx - addl 8(%rsp),%eax - pslld $2,%xmm4 - xorl %edx,%esi - movl %ebx,%edi - psrld $30,%xmm10 - roll $5,%ebx - addl %esi,%eax - xorl %edx,%edi - rorl $7,%ecx - por %xmm10,%xmm4 - addl %ebx,%eax - addl 12(%rsp),%ebp - pshufd $238,%xmm3,%xmm8 - xorl %ecx,%edi - movl %eax,%esi - roll $5,%eax - addl %edi,%ebp - xorl %ecx,%esi - rorl $7,%ebx - addl %eax,%ebp - pxor %xmm1,%xmm5 - addl 16(%rsp),%edx - xorl %ebx,%esi - punpcklqdq %xmm4,%xmm8 - movl %ebp,%edi - roll $5,%ebp - pxor %xmm6,%xmm5 - addl %esi,%edx - xorl %ebx,%edi - movdqa %xmm9,%xmm10 - rorl $7,%eax - paddd %xmm4,%xmm9 - addl %ebp,%edx - pxor %xmm8,%xmm5 - addl 20(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - roll $5,%edx - movdqa %xmm5,%xmm8 - addl %edi,%ecx - xorl %eax,%esi - movdqa %xmm9,0(%rsp) - rorl $7,%ebp - addl %edx,%ecx - addl 24(%rsp),%ebx - pslld $2,%xmm5 - xorl %ebp,%esi - movl %ecx,%edi - psrld $30,%xmm8 - roll $5,%ecx - addl %esi,%ebx - xorl %ebp,%edi - rorl $7,%edx - por %xmm8,%xmm5 - addl %ecx,%ebx - addl 28(%rsp),%eax - pshufd $238,%xmm4,%xmm9 - rorl $7,%ecx - movl 
%ebx,%esi - xorl %edx,%edi - roll $5,%ebx - addl %edi,%eax - xorl %ecx,%esi - xorl %edx,%ecx - addl %ebx,%eax - pxor %xmm2,%xmm6 - addl 32(%rsp),%ebp - andl %ecx,%esi - xorl %edx,%ecx - rorl $7,%ebx - punpcklqdq %xmm5,%xmm9 - movl %eax,%edi - xorl %ecx,%esi - pxor %xmm7,%xmm6 - roll $5,%eax - addl %esi,%ebp - movdqa %xmm10,%xmm8 - xorl %ebx,%edi - paddd %xmm5,%xmm10 - xorl %ecx,%ebx - pxor %xmm9,%xmm6 - addl %eax,%ebp - addl 36(%rsp),%edx - andl %ebx,%edi - xorl %ecx,%ebx - rorl $7,%eax - movdqa %xmm6,%xmm9 - movl %ebp,%esi - xorl %ebx,%edi - movdqa %xmm10,16(%rsp) - roll $5,%ebp - addl %edi,%edx - xorl %eax,%esi - pslld $2,%xmm6 - xorl %ebx,%eax - addl %ebp,%edx - psrld $30,%xmm9 - addl 40(%rsp),%ecx - andl %eax,%esi - xorl %ebx,%eax - por %xmm9,%xmm6 - rorl $7,%ebp - movl %edx,%edi - xorl %eax,%esi - roll $5,%edx - pshufd $238,%xmm5,%xmm10 - addl %esi,%ecx - xorl %ebp,%edi - xorl %eax,%ebp - addl %edx,%ecx - addl 44(%rsp),%ebx - andl %ebp,%edi - xorl %eax,%ebp - rorl $7,%edx - movl %ecx,%esi - xorl %ebp,%edi - roll $5,%ecx - addl %edi,%ebx - xorl %edx,%esi - xorl %ebp,%edx - addl %ecx,%ebx - pxor %xmm3,%xmm7 - addl 48(%rsp),%eax - andl %edx,%esi - xorl %ebp,%edx - rorl $7,%ecx - punpcklqdq %xmm6,%xmm10 - movl %ebx,%edi - xorl %edx,%esi - pxor %xmm0,%xmm7 - roll $5,%ebx - addl %esi,%eax - movdqa 32(%r14),%xmm9 - xorl %ecx,%edi - paddd %xmm6,%xmm8 - xorl %edx,%ecx - pxor %xmm10,%xmm7 - addl %ebx,%eax - addl 52(%rsp),%ebp - andl %ecx,%edi - xorl %edx,%ecx - rorl $7,%ebx - movdqa %xmm7,%xmm10 - movl %eax,%esi - xorl %ecx,%edi - movdqa %xmm8,32(%rsp) - roll $5,%eax - addl %edi,%ebp - xorl %ebx,%esi - pslld $2,%xmm7 - xorl %ecx,%ebx - addl %eax,%ebp - psrld $30,%xmm10 - addl 56(%rsp),%edx - andl %ebx,%esi - xorl %ecx,%ebx - por %xmm10,%xmm7 - rorl $7,%eax - movl %ebp,%edi - xorl %ebx,%esi - roll $5,%ebp - pshufd $238,%xmm6,%xmm8 - addl %esi,%edx - xorl %eax,%edi - xorl %ebx,%eax - addl %ebp,%edx - addl 60(%rsp),%ecx - andl %eax,%edi - xorl %ebx,%eax - rorl $7,%ebp - 
movl %edx,%esi - xorl %eax,%edi - roll $5,%edx - addl %edi,%ecx - xorl %ebp,%esi - xorl %eax,%ebp - addl %edx,%ecx - pxor %xmm4,%xmm0 - addl 0(%rsp),%ebx - andl %ebp,%esi - xorl %eax,%ebp - rorl $7,%edx - punpcklqdq %xmm7,%xmm8 - movl %ecx,%edi - xorl %ebp,%esi - pxor %xmm1,%xmm0 - roll $5,%ecx - addl %esi,%ebx - movdqa %xmm9,%xmm10 - xorl %edx,%edi - paddd %xmm7,%xmm9 - xorl %ebp,%edx - pxor %xmm8,%xmm0 - addl %ecx,%ebx - addl 4(%rsp),%eax - andl %edx,%edi - xorl %ebp,%edx - rorl $7,%ecx - movdqa %xmm0,%xmm8 - movl %ebx,%esi - xorl %edx,%edi - movdqa %xmm9,48(%rsp) - roll $5,%ebx - addl %edi,%eax - xorl %ecx,%esi - pslld $2,%xmm0 - xorl %edx,%ecx - addl %ebx,%eax - psrld $30,%xmm8 - addl 8(%rsp),%ebp - andl %ecx,%esi - xorl %edx,%ecx - por %xmm8,%xmm0 - rorl $7,%ebx - movl %eax,%edi - xorl %ecx,%esi - roll $5,%eax - pshufd $238,%xmm7,%xmm9 - addl %esi,%ebp - xorl %ebx,%edi - xorl %ecx,%ebx - addl %eax,%ebp - addl 12(%rsp),%edx - andl %ebx,%edi - xorl %ecx,%ebx - rorl $7,%eax - movl %ebp,%esi - xorl %ebx,%edi - roll $5,%ebp - addl %edi,%edx - xorl %eax,%esi - xorl %ebx,%eax - addl %ebp,%edx - pxor %xmm5,%xmm1 - addl 16(%rsp),%ecx - andl %eax,%esi - xorl %ebx,%eax - rorl $7,%ebp - punpcklqdq %xmm0,%xmm9 - movl %edx,%edi - xorl %eax,%esi - pxor %xmm2,%xmm1 - roll $5,%edx - addl %esi,%ecx - movdqa %xmm10,%xmm8 - xorl %ebp,%edi - paddd %xmm0,%xmm10 - xorl %eax,%ebp - pxor %xmm9,%xmm1 - addl %edx,%ecx - addl 20(%rsp),%ebx - andl %ebp,%edi - xorl %eax,%ebp - rorl $7,%edx - movdqa %xmm1,%xmm9 - movl %ecx,%esi - xorl %ebp,%edi - movdqa %xmm10,0(%rsp) - roll $5,%ecx - addl %edi,%ebx - xorl %edx,%esi - pslld $2,%xmm1 - xorl %ebp,%edx - addl %ecx,%ebx - psrld $30,%xmm9 - addl 24(%rsp),%eax - andl %edx,%esi - xorl %ebp,%edx - por %xmm9,%xmm1 - rorl $7,%ecx - movl %ebx,%edi - xorl %edx,%esi - roll $5,%ebx - pshufd $238,%xmm0,%xmm10 - addl %esi,%eax - xorl %ecx,%edi - xorl %edx,%ecx - addl %ebx,%eax - addl 28(%rsp),%ebp - andl %ecx,%edi - xorl %edx,%ecx - rorl $7,%ebx - movl 
%eax,%esi - xorl %ecx,%edi - roll $5,%eax - addl %edi,%ebp - xorl %ebx,%esi - xorl %ecx,%ebx - addl %eax,%ebp - pxor %xmm6,%xmm2 - addl 32(%rsp),%edx - andl %ebx,%esi - xorl %ecx,%ebx - rorl $7,%eax - punpcklqdq %xmm1,%xmm10 - movl %ebp,%edi - xorl %ebx,%esi - pxor %xmm3,%xmm2 - roll $5,%ebp - addl %esi,%edx - movdqa %xmm8,%xmm9 - xorl %eax,%edi - paddd %xmm1,%xmm8 - xorl %ebx,%eax - pxor %xmm10,%xmm2 - addl %ebp,%edx - addl 36(%rsp),%ecx - andl %eax,%edi - xorl %ebx,%eax - rorl $7,%ebp - movdqa %xmm2,%xmm10 - movl %edx,%esi - xorl %eax,%edi - movdqa %xmm8,16(%rsp) - roll $5,%edx - addl %edi,%ecx - xorl %ebp,%esi - pslld $2,%xmm2 - xorl %eax,%ebp - addl %edx,%ecx - psrld $30,%xmm10 - addl 40(%rsp),%ebx - andl %ebp,%esi - xorl %eax,%ebp - por %xmm10,%xmm2 - rorl $7,%edx - movl %ecx,%edi - xorl %ebp,%esi - roll $5,%ecx - pshufd $238,%xmm1,%xmm8 - addl %esi,%ebx - xorl %edx,%edi - xorl %ebp,%edx - addl %ecx,%ebx - addl 44(%rsp),%eax - andl %edx,%edi - xorl %ebp,%edx - rorl $7,%ecx - movl %ebx,%esi - xorl %edx,%edi - roll $5,%ebx - addl %edi,%eax - xorl %edx,%esi - addl %ebx,%eax - pxor %xmm7,%xmm3 - addl 48(%rsp),%ebp - xorl %ecx,%esi - punpcklqdq %xmm2,%xmm8 - movl %eax,%edi - roll $5,%eax - pxor %xmm4,%xmm3 - addl %esi,%ebp - xorl %ecx,%edi - movdqa %xmm9,%xmm10 - rorl $7,%ebx - paddd %xmm2,%xmm9 - addl %eax,%ebp - pxor %xmm8,%xmm3 - addl 52(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - roll $5,%ebp - movdqa %xmm3,%xmm8 - addl %edi,%edx - xorl %ebx,%esi - movdqa %xmm9,32(%rsp) - rorl $7,%eax - addl %ebp,%edx - addl 56(%rsp),%ecx - pslld $2,%xmm3 - xorl %eax,%esi - movl %edx,%edi - psrld $30,%xmm8 - roll $5,%edx - addl %esi,%ecx - xorl %eax,%edi - rorl $7,%ebp - por %xmm8,%xmm3 - addl %edx,%ecx - addl 60(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - roll $5,%ecx - addl %edi,%ebx - xorl %ebp,%esi - rorl $7,%edx - addl %ecx,%ebx - addl 0(%rsp),%eax - xorl %edx,%esi - movl %ebx,%edi - roll $5,%ebx - paddd %xmm3,%xmm10 - addl %esi,%eax - xorl %edx,%edi - movdqa 
%xmm10,48(%rsp) - rorl $7,%ecx - addl %ebx,%eax - addl 4(%rsp),%ebp - xorl %ecx,%edi - movl %eax,%esi - roll $5,%eax - addl %edi,%ebp - xorl %ecx,%esi - rorl $7,%ebx - addl %eax,%ebp - addl 8(%rsp),%edx - xorl %ebx,%esi - movl %ebp,%edi - roll $5,%ebp - addl %esi,%edx - xorl %ebx,%edi - rorl $7,%eax - addl %ebp,%edx - addl 12(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - roll $5,%edx - addl %edi,%ecx - xorl %eax,%esi - rorl $7,%ebp - addl %edx,%ecx - cmpq %r10,%r9 - je L$done_ssse3 - movdqa 64(%r14),%xmm6 - movdqa -64(%r14),%xmm9 - movdqu 0(%r9),%xmm0 - movdqu 16(%r9),%xmm1 - movdqu 32(%r9),%xmm2 - movdqu 48(%r9),%xmm3 -.byte 102,15,56,0,198 - addq $64,%r9 - addl 16(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi -.byte 102,15,56,0,206 - roll $5,%ecx - addl %esi,%ebx - xorl %ebp,%edi - rorl $7,%edx - paddd %xmm9,%xmm0 - addl %ecx,%ebx - addl 20(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - movdqa %xmm0,0(%rsp) - roll $5,%ebx - addl %edi,%eax - xorl %edx,%esi - rorl $7,%ecx - psubd %xmm9,%xmm0 - addl %ebx,%eax - addl 24(%rsp),%ebp - xorl %ecx,%esi - movl %eax,%edi - roll $5,%eax - addl %esi,%ebp - xorl %ecx,%edi - rorl $7,%ebx - addl %eax,%ebp - addl 28(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - roll $5,%ebp - addl %edi,%edx - xorl %ebx,%esi - rorl $7,%eax - addl %ebp,%edx - addl 32(%rsp),%ecx - xorl %eax,%esi - movl %edx,%edi -.byte 102,15,56,0,214 - roll $5,%edx - addl %esi,%ecx - xorl %eax,%edi - rorl $7,%ebp - paddd %xmm9,%xmm1 - addl %edx,%ecx - addl 36(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - movdqa %xmm1,16(%rsp) - roll $5,%ecx - addl %edi,%ebx - xorl %ebp,%esi - rorl $7,%edx - psubd %xmm9,%xmm1 - addl %ecx,%ebx - addl 40(%rsp),%eax - xorl %edx,%esi - movl %ebx,%edi - roll $5,%ebx - addl %esi,%eax - xorl %edx,%edi - rorl $7,%ecx - addl %ebx,%eax - addl 44(%rsp),%ebp - xorl %ecx,%edi - movl %eax,%esi - roll $5,%eax - addl %edi,%ebp - xorl %ecx,%esi - rorl $7,%ebx - addl %eax,%ebp - addl 48(%rsp),%edx - xorl %ebx,%esi - movl %ebp,%edi -.byte 
102,15,56,0,222 - roll $5,%ebp - addl %esi,%edx - xorl %ebx,%edi - rorl $7,%eax - paddd %xmm9,%xmm2 - addl %ebp,%edx - addl 52(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - movdqa %xmm2,32(%rsp) - roll $5,%edx - addl %edi,%ecx - xorl %eax,%esi - rorl $7,%ebp - psubd %xmm9,%xmm2 - addl %edx,%ecx - addl 56(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - roll $5,%ecx - addl %esi,%ebx - xorl %ebp,%edi - rorl $7,%edx - addl %ecx,%ebx - addl 60(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - roll $5,%ebx - addl %edi,%eax - rorl $7,%ecx - addl %ebx,%eax - addl 0(%r8),%eax - addl 4(%r8),%esi - addl 8(%r8),%ecx - addl 12(%r8),%edx - movl %eax,0(%r8) - addl 16(%r8),%ebp - movl %esi,4(%r8) - movl %esi,%ebx - movl %ecx,8(%r8) - movl %ecx,%edi - movl %edx,12(%r8) - xorl %edx,%edi - movl %ebp,16(%r8) - andl %edi,%esi - jmp L$oop_ssse3 - -.p2align 4 -L$done_ssse3: - addl 16(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - roll $5,%ecx - addl %esi,%ebx - xorl %ebp,%edi - rorl $7,%edx - addl %ecx,%ebx - addl 20(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - roll $5,%ebx - addl %edi,%eax - xorl %edx,%esi - rorl $7,%ecx - addl %ebx,%eax - addl 24(%rsp),%ebp - xorl %ecx,%esi - movl %eax,%edi - roll $5,%eax - addl %esi,%ebp - xorl %ecx,%edi - rorl $7,%ebx - addl %eax,%ebp - addl 28(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - roll $5,%ebp - addl %edi,%edx - xorl %ebx,%esi - rorl $7,%eax - addl %ebp,%edx - addl 32(%rsp),%ecx - xorl %eax,%esi - movl %edx,%edi - roll $5,%edx - addl %esi,%ecx - xorl %eax,%edi - rorl $7,%ebp - addl %edx,%ecx - addl 36(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - roll $5,%ecx - addl %edi,%ebx - xorl %ebp,%esi - rorl $7,%edx - addl %ecx,%ebx - addl 40(%rsp),%eax - xorl %edx,%esi - movl %ebx,%edi - roll $5,%ebx - addl %esi,%eax - xorl %edx,%edi - rorl $7,%ecx - addl %ebx,%eax - addl 44(%rsp),%ebp - xorl %ecx,%edi - movl %eax,%esi - roll $5,%eax - addl %edi,%ebp - xorl %ecx,%esi - rorl $7,%ebx - addl %eax,%ebp - addl 48(%rsp),%edx - xorl %ebx,%esi - movl 
%ebp,%edi - roll $5,%ebp - addl %esi,%edx - xorl %ebx,%edi - rorl $7,%eax - addl %ebp,%edx - addl 52(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - roll $5,%edx - addl %edi,%ecx - xorl %eax,%esi - rorl $7,%ebp - addl %edx,%ecx - addl 56(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - roll $5,%ecx - addl %esi,%ebx - xorl %ebp,%edi - rorl $7,%edx - addl %ecx,%ebx - addl 60(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - roll $5,%ebx - addl %edi,%eax - rorl $7,%ecx - addl %ebx,%eax - addl 0(%r8),%eax - addl 4(%r8),%esi - addl 8(%r8),%ecx - movl %eax,0(%r8) - addl 12(%r8),%edx - movl %esi,4(%r8) - addl 16(%r8),%ebp - movl %ecx,8(%r8) - movl %edx,12(%r8) - movl %ebp,16(%r8) - movq -40(%r11),%r14 - - movq -32(%r11),%r13 - - movq -24(%r11),%r12 - - movq -16(%r11),%rbp - - movq -8(%r11),%rbx - - leaq (%r11),%rsp - -L$epilogue_ssse3: - .byte 0xf3,0xc3 - - - -.p2align 4 -sha1_block_data_order_avx: -_avx_shortcut: - - movq %rsp,%r11 - - pushq %rbx - - pushq %rbp - - pushq %r12 - - pushq %r13 - - pushq %r14 - - leaq -64(%rsp),%rsp - vzeroupper - andq $-64,%rsp - movq %rdi,%r8 - movq %rsi,%r9 - movq %rdx,%r10 - - shlq $6,%r10 - addq %r9,%r10 - leaq K_XX_XX+64(%rip),%r14 - - movl 0(%r8),%eax - movl 4(%r8),%ebx - movl 8(%r8),%ecx - movl 12(%r8),%edx - movl %ebx,%esi - movl 16(%r8),%ebp - movl %ecx,%edi - xorl %edx,%edi - andl %edi,%esi - - vmovdqa 64(%r14),%xmm6 - vmovdqa -64(%r14),%xmm11 - vmovdqu 0(%r9),%xmm0 - vmovdqu 16(%r9),%xmm1 - vmovdqu 32(%r9),%xmm2 - vmovdqu 48(%r9),%xmm3 - vpshufb %xmm6,%xmm0,%xmm0 - addq $64,%r9 - vpshufb %xmm6,%xmm1,%xmm1 - vpshufb %xmm6,%xmm2,%xmm2 - vpshufb %xmm6,%xmm3,%xmm3 - vpaddd %xmm11,%xmm0,%xmm4 - vpaddd %xmm11,%xmm1,%xmm5 - vpaddd %xmm11,%xmm2,%xmm6 - vmovdqa %xmm4,0(%rsp) - vmovdqa %xmm5,16(%rsp) - vmovdqa %xmm6,32(%rsp) - jmp L$oop_avx -.p2align 4 -L$oop_avx: - shrdl $2,%ebx,%ebx - xorl %edx,%esi - vpalignr $8,%xmm0,%xmm1,%xmm4 - movl %eax,%edi - addl 0(%rsp),%ebp - vpaddd %xmm3,%xmm11,%xmm9 - xorl %ecx,%ebx - shldl $5,%eax,%eax - vpsrldq 
$4,%xmm3,%xmm8 - addl %esi,%ebp - andl %ebx,%edi - vpxor %xmm0,%xmm4,%xmm4 - xorl %ecx,%ebx - addl %eax,%ebp - vpxor %xmm2,%xmm8,%xmm8 - shrdl $7,%eax,%eax - xorl %ecx,%edi - movl %ebp,%esi - addl 4(%rsp),%edx - vpxor %xmm8,%xmm4,%xmm4 - xorl %ebx,%eax - shldl $5,%ebp,%ebp - vmovdqa %xmm9,48(%rsp) - addl %edi,%edx - andl %eax,%esi - vpsrld $31,%xmm4,%xmm8 - xorl %ebx,%eax - addl %ebp,%edx - shrdl $7,%ebp,%ebp - xorl %ebx,%esi - vpslldq $12,%xmm4,%xmm10 - vpaddd %xmm4,%xmm4,%xmm4 - movl %edx,%edi - addl 8(%rsp),%ecx - xorl %eax,%ebp - shldl $5,%edx,%edx - vpsrld $30,%xmm10,%xmm9 - vpor %xmm8,%xmm4,%xmm4 - addl %esi,%ecx - andl %ebp,%edi - xorl %eax,%ebp - addl %edx,%ecx - vpslld $2,%xmm10,%xmm10 - vpxor %xmm9,%xmm4,%xmm4 - shrdl $7,%edx,%edx - xorl %eax,%edi - movl %ecx,%esi - addl 12(%rsp),%ebx - vpxor %xmm10,%xmm4,%xmm4 - xorl %ebp,%edx - shldl $5,%ecx,%ecx - addl %edi,%ebx - andl %edx,%esi - xorl %ebp,%edx - addl %ecx,%ebx - shrdl $7,%ecx,%ecx - xorl %ebp,%esi - vpalignr $8,%xmm1,%xmm2,%xmm5 - movl %ebx,%edi - addl 16(%rsp),%eax - vpaddd %xmm4,%xmm11,%xmm9 - xorl %edx,%ecx - shldl $5,%ebx,%ebx - vpsrldq $4,%xmm4,%xmm8 - addl %esi,%eax - andl %ecx,%edi - vpxor %xmm1,%xmm5,%xmm5 - xorl %edx,%ecx - addl %ebx,%eax - vpxor %xmm3,%xmm8,%xmm8 - shrdl $7,%ebx,%ebx - xorl %edx,%edi - movl %eax,%esi - addl 20(%rsp),%ebp - vpxor %xmm8,%xmm5,%xmm5 - xorl %ecx,%ebx - shldl $5,%eax,%eax - vmovdqa %xmm9,0(%rsp) - addl %edi,%ebp - andl %ebx,%esi - vpsrld $31,%xmm5,%xmm8 - xorl %ecx,%ebx - addl %eax,%ebp - shrdl $7,%eax,%eax - xorl %ecx,%esi - vpslldq $12,%xmm5,%xmm10 - vpaddd %xmm5,%xmm5,%xmm5 - movl %ebp,%edi - addl 24(%rsp),%edx - xorl %ebx,%eax - shldl $5,%ebp,%ebp - vpsrld $30,%xmm10,%xmm9 - vpor %xmm8,%xmm5,%xmm5 - addl %esi,%edx - andl %eax,%edi - xorl %ebx,%eax - addl %ebp,%edx - vpslld $2,%xmm10,%xmm10 - vpxor %xmm9,%xmm5,%xmm5 - shrdl $7,%ebp,%ebp - xorl %ebx,%edi - movl %edx,%esi - addl 28(%rsp),%ecx - vpxor %xmm10,%xmm5,%xmm5 - xorl %eax,%ebp - shldl $5,%edx,%edx - 
vmovdqa -32(%r14),%xmm11 - addl %edi,%ecx - andl %ebp,%esi - xorl %eax,%ebp - addl %edx,%ecx - shrdl $7,%edx,%edx - xorl %eax,%esi - vpalignr $8,%xmm2,%xmm3,%xmm6 - movl %ecx,%edi - addl 32(%rsp),%ebx - vpaddd %xmm5,%xmm11,%xmm9 - xorl %ebp,%edx - shldl $5,%ecx,%ecx - vpsrldq $4,%xmm5,%xmm8 - addl %esi,%ebx - andl %edx,%edi - vpxor %xmm2,%xmm6,%xmm6 - xorl %ebp,%edx - addl %ecx,%ebx - vpxor %xmm4,%xmm8,%xmm8 - shrdl $7,%ecx,%ecx - xorl %ebp,%edi - movl %ebx,%esi - addl 36(%rsp),%eax - vpxor %xmm8,%xmm6,%xmm6 - xorl %edx,%ecx - shldl $5,%ebx,%ebx - vmovdqa %xmm9,16(%rsp) - addl %edi,%eax - andl %ecx,%esi - vpsrld $31,%xmm6,%xmm8 - xorl %edx,%ecx - addl %ebx,%eax - shrdl $7,%ebx,%ebx - xorl %edx,%esi - vpslldq $12,%xmm6,%xmm10 - vpaddd %xmm6,%xmm6,%xmm6 - movl %eax,%edi - addl 40(%rsp),%ebp - xorl %ecx,%ebx - shldl $5,%eax,%eax - vpsrld $30,%xmm10,%xmm9 - vpor %xmm8,%xmm6,%xmm6 - addl %esi,%ebp - andl %ebx,%edi - xorl %ecx,%ebx - addl %eax,%ebp - vpslld $2,%xmm10,%xmm10 - vpxor %xmm9,%xmm6,%xmm6 - shrdl $7,%eax,%eax - xorl %ecx,%edi - movl %ebp,%esi - addl 44(%rsp),%edx - vpxor %xmm10,%xmm6,%xmm6 - xorl %ebx,%eax - shldl $5,%ebp,%ebp - addl %edi,%edx - andl %eax,%esi - xorl %ebx,%eax - addl %ebp,%edx - shrdl $7,%ebp,%ebp - xorl %ebx,%esi - vpalignr $8,%xmm3,%xmm4,%xmm7 - movl %edx,%edi - addl 48(%rsp),%ecx - vpaddd %xmm6,%xmm11,%xmm9 - xorl %eax,%ebp - shldl $5,%edx,%edx - vpsrldq $4,%xmm6,%xmm8 - addl %esi,%ecx - andl %ebp,%edi - vpxor %xmm3,%xmm7,%xmm7 - xorl %eax,%ebp - addl %edx,%ecx - vpxor %xmm5,%xmm8,%xmm8 - shrdl $7,%edx,%edx - xorl %eax,%edi - movl %ecx,%esi - addl 52(%rsp),%ebx - vpxor %xmm8,%xmm7,%xmm7 - xorl %ebp,%edx - shldl $5,%ecx,%ecx - vmovdqa %xmm9,32(%rsp) - addl %edi,%ebx - andl %edx,%esi - vpsrld $31,%xmm7,%xmm8 - xorl %ebp,%edx - addl %ecx,%ebx - shrdl $7,%ecx,%ecx - xorl %ebp,%esi - vpslldq $12,%xmm7,%xmm10 - vpaddd %xmm7,%xmm7,%xmm7 - movl %ebx,%edi - addl 56(%rsp),%eax - xorl %edx,%ecx - shldl $5,%ebx,%ebx - vpsrld $30,%xmm10,%xmm9 - vpor 
%xmm8,%xmm7,%xmm7 - addl %esi,%eax - andl %ecx,%edi - xorl %edx,%ecx - addl %ebx,%eax - vpslld $2,%xmm10,%xmm10 - vpxor %xmm9,%xmm7,%xmm7 - shrdl $7,%ebx,%ebx - xorl %edx,%edi - movl %eax,%esi - addl 60(%rsp),%ebp - vpxor %xmm10,%xmm7,%xmm7 - xorl %ecx,%ebx - shldl $5,%eax,%eax - addl %edi,%ebp - andl %ebx,%esi - xorl %ecx,%ebx - addl %eax,%ebp - vpalignr $8,%xmm6,%xmm7,%xmm8 - vpxor %xmm4,%xmm0,%xmm0 - shrdl $7,%eax,%eax - xorl %ecx,%esi - movl %ebp,%edi - addl 0(%rsp),%edx - vpxor %xmm1,%xmm0,%xmm0 - xorl %ebx,%eax - shldl $5,%ebp,%ebp - vpaddd %xmm7,%xmm11,%xmm9 - addl %esi,%edx - andl %eax,%edi - vpxor %xmm8,%xmm0,%xmm0 - xorl %ebx,%eax - addl %ebp,%edx - shrdl $7,%ebp,%ebp - xorl %ebx,%edi - vpsrld $30,%xmm0,%xmm8 - vmovdqa %xmm9,48(%rsp) - movl %edx,%esi - addl 4(%rsp),%ecx - xorl %eax,%ebp - shldl $5,%edx,%edx - vpslld $2,%xmm0,%xmm0 - addl %edi,%ecx - andl %ebp,%esi - xorl %eax,%ebp - addl %edx,%ecx - shrdl $7,%edx,%edx - xorl %eax,%esi - movl %ecx,%edi - addl 8(%rsp),%ebx - vpor %xmm8,%xmm0,%xmm0 - xorl %ebp,%edx - shldl $5,%ecx,%ecx - addl %esi,%ebx - andl %edx,%edi - xorl %ebp,%edx - addl %ecx,%ebx - addl 12(%rsp),%eax - xorl %ebp,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpalignr $8,%xmm7,%xmm0,%xmm8 - vpxor %xmm5,%xmm1,%xmm1 - addl 16(%rsp),%ebp - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - vpxor %xmm2,%xmm1,%xmm1 - addl %esi,%ebp - xorl %ecx,%edi - vpaddd %xmm0,%xmm11,%xmm9 - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpxor %xmm8,%xmm1,%xmm1 - addl 20(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - vpsrld $30,%xmm1,%xmm8 - vmovdqa %xmm9,0(%rsp) - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - vpslld $2,%xmm1,%xmm1 - addl 24(%rsp),%ecx - xorl %eax,%esi - movl %edx,%edi - shldl $5,%edx,%edx - addl %esi,%ecx - xorl %eax,%edi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpor %xmm8,%xmm1,%xmm1 - addl 28(%rsp),%ebx - xorl %ebp,%edi - movl 
%ecx,%esi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpalignr $8,%xmm0,%xmm1,%xmm8 - vpxor %xmm6,%xmm2,%xmm2 - addl 32(%rsp),%eax - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - vpxor %xmm3,%xmm2,%xmm2 - addl %esi,%eax - xorl %edx,%edi - vpaddd %xmm1,%xmm11,%xmm9 - vmovdqa 0(%r14),%xmm11 - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpxor %xmm8,%xmm2,%xmm2 - addl 36(%rsp),%ebp - xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - vpsrld $30,%xmm2,%xmm8 - vmovdqa %xmm9,16(%rsp) - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpslld $2,%xmm2,%xmm2 - addl 40(%rsp),%edx - xorl %ebx,%esi - movl %ebp,%edi - shldl $5,%ebp,%ebp - addl %esi,%edx - xorl %ebx,%edi - shrdl $7,%eax,%eax - addl %ebp,%edx - vpor %xmm8,%xmm2,%xmm2 - addl 44(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - shldl $5,%edx,%edx - addl %edi,%ecx - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpalignr $8,%xmm1,%xmm2,%xmm8 - vpxor %xmm7,%xmm3,%xmm3 - addl 48(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - vpxor %xmm4,%xmm3,%xmm3 - addl %esi,%ebx - xorl %ebp,%edi - vpaddd %xmm2,%xmm11,%xmm9 - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpxor %xmm8,%xmm3,%xmm3 - addl 52(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - vpsrld $30,%xmm3,%xmm8 - vmovdqa %xmm9,32(%rsp) - addl %edi,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpslld $2,%xmm3,%xmm3 - addl 56(%rsp),%ebp - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - addl %esi,%ebp - xorl %ecx,%edi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpor %xmm8,%xmm3,%xmm3 - addl 60(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - vpalignr $8,%xmm2,%xmm3,%xmm8 - vpxor %xmm0,%xmm4,%xmm4 - addl 0(%rsp),%ecx - xorl %eax,%esi - movl %edx,%edi - shldl $5,%edx,%edx - vpxor %xmm5,%xmm4,%xmm4 - addl %esi,%ecx - xorl %eax,%edi - vpaddd 
%xmm3,%xmm11,%xmm9 - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpxor %xmm8,%xmm4,%xmm4 - addl 4(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - vpsrld $30,%xmm4,%xmm8 - vmovdqa %xmm9,48(%rsp) - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpslld $2,%xmm4,%xmm4 - addl 8(%rsp),%eax - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %edx,%edi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpor %xmm8,%xmm4,%xmm4 - addl 12(%rsp),%ebp - xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpalignr $8,%xmm3,%xmm4,%xmm8 - vpxor %xmm1,%xmm5,%xmm5 - addl 16(%rsp),%edx - xorl %ebx,%esi - movl %ebp,%edi - shldl $5,%ebp,%ebp - vpxor %xmm6,%xmm5,%xmm5 - addl %esi,%edx - xorl %ebx,%edi - vpaddd %xmm4,%xmm11,%xmm9 - shrdl $7,%eax,%eax - addl %ebp,%edx - vpxor %xmm8,%xmm5,%xmm5 - addl 20(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - shldl $5,%edx,%edx - vpsrld $30,%xmm5,%xmm8 - vmovdqa %xmm9,0(%rsp) - addl %edi,%ecx - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpslld $2,%xmm5,%xmm5 - addl 24(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpor %xmm8,%xmm5,%xmm5 - addl 28(%rsp),%eax - shrdl $7,%ecx,%ecx - movl %ebx,%esi - xorl %edx,%edi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %ecx,%esi - xorl %edx,%ecx - addl %ebx,%eax - vpalignr $8,%xmm4,%xmm5,%xmm8 - vpxor %xmm2,%xmm6,%xmm6 - addl 32(%rsp),%ebp - andl %ecx,%esi - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - vpxor %xmm7,%xmm6,%xmm6 - movl %eax,%edi - xorl %ecx,%esi - vpaddd %xmm5,%xmm11,%xmm9 - shldl $5,%eax,%eax - addl %esi,%ebp - vpxor %xmm8,%xmm6,%xmm6 - xorl %ebx,%edi - xorl %ecx,%ebx - addl %eax,%ebp - addl 36(%rsp),%edx - vpsrld $30,%xmm6,%xmm8 - vmovdqa %xmm9,16(%rsp) - andl %ebx,%edi - xorl %ecx,%ebx - shrdl $7,%eax,%eax - movl %ebp,%esi - vpslld $2,%xmm6,%xmm6 - xorl %ebx,%edi - shldl 
$5,%ebp,%ebp - addl %edi,%edx - xorl %eax,%esi - xorl %ebx,%eax - addl %ebp,%edx - addl 40(%rsp),%ecx - andl %eax,%esi - vpor %xmm8,%xmm6,%xmm6 - xorl %ebx,%eax - shrdl $7,%ebp,%ebp - movl %edx,%edi - xorl %eax,%esi - shldl $5,%edx,%edx - addl %esi,%ecx - xorl %ebp,%edi - xorl %eax,%ebp - addl %edx,%ecx - addl 44(%rsp),%ebx - andl %ebp,%edi - xorl %eax,%ebp - shrdl $7,%edx,%edx - movl %ecx,%esi - xorl %ebp,%edi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %edx,%esi - xorl %ebp,%edx - addl %ecx,%ebx - vpalignr $8,%xmm5,%xmm6,%xmm8 - vpxor %xmm3,%xmm7,%xmm7 - addl 48(%rsp),%eax - andl %edx,%esi - xorl %ebp,%edx - shrdl $7,%ecx,%ecx - vpxor %xmm0,%xmm7,%xmm7 - movl %ebx,%edi - xorl %edx,%esi - vpaddd %xmm6,%xmm11,%xmm9 - vmovdqa 32(%r14),%xmm11 - shldl $5,%ebx,%ebx - addl %esi,%eax - vpxor %xmm8,%xmm7,%xmm7 - xorl %ecx,%edi - xorl %edx,%ecx - addl %ebx,%eax - addl 52(%rsp),%ebp - vpsrld $30,%xmm7,%xmm8 - vmovdqa %xmm9,32(%rsp) - andl %ecx,%edi - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - movl %eax,%esi - vpslld $2,%xmm7,%xmm7 - xorl %ecx,%edi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ebx,%esi - xorl %ecx,%ebx - addl %eax,%ebp - addl 56(%rsp),%edx - andl %ebx,%esi - vpor %xmm8,%xmm7,%xmm7 - xorl %ecx,%ebx - shrdl $7,%eax,%eax - movl %ebp,%edi - xorl %ebx,%esi - shldl $5,%ebp,%ebp - addl %esi,%edx - xorl %eax,%edi - xorl %ebx,%eax - addl %ebp,%edx - addl 60(%rsp),%ecx - andl %eax,%edi - xorl %ebx,%eax - shrdl $7,%ebp,%ebp - movl %edx,%esi - xorl %eax,%edi - shldl $5,%edx,%edx - addl %edi,%ecx - xorl %ebp,%esi - xorl %eax,%ebp - addl %edx,%ecx - vpalignr $8,%xmm6,%xmm7,%xmm8 - vpxor %xmm4,%xmm0,%xmm0 - addl 0(%rsp),%ebx - andl %ebp,%esi - xorl %eax,%ebp - shrdl $7,%edx,%edx - vpxor %xmm1,%xmm0,%xmm0 - movl %ecx,%edi - xorl %ebp,%esi - vpaddd %xmm7,%xmm11,%xmm9 - shldl $5,%ecx,%ecx - addl %esi,%ebx - vpxor %xmm8,%xmm0,%xmm0 - xorl %edx,%edi - xorl %ebp,%edx - addl %ecx,%ebx - addl 4(%rsp),%eax - vpsrld $30,%xmm0,%xmm8 - vmovdqa %xmm9,48(%rsp) - andl %edx,%edi - xorl 
%ebp,%edx - shrdl $7,%ecx,%ecx - movl %ebx,%esi - vpslld $2,%xmm0,%xmm0 - xorl %edx,%edi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %ecx,%esi - xorl %edx,%ecx - addl %ebx,%eax - addl 8(%rsp),%ebp - andl %ecx,%esi - vpor %xmm8,%xmm0,%xmm0 - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - movl %eax,%edi - xorl %ecx,%esi - shldl $5,%eax,%eax - addl %esi,%ebp - xorl %ebx,%edi - xorl %ecx,%ebx - addl %eax,%ebp - addl 12(%rsp),%edx - andl %ebx,%edi - xorl %ecx,%ebx - shrdl $7,%eax,%eax - movl %ebp,%esi - xorl %ebx,%edi - shldl $5,%ebp,%ebp - addl %edi,%edx - xorl %eax,%esi - xorl %ebx,%eax - addl %ebp,%edx - vpalignr $8,%xmm7,%xmm0,%xmm8 - vpxor %xmm5,%xmm1,%xmm1 - addl 16(%rsp),%ecx - andl %eax,%esi - xorl %ebx,%eax - shrdl $7,%ebp,%ebp - vpxor %xmm2,%xmm1,%xmm1 - movl %edx,%edi - xorl %eax,%esi - vpaddd %xmm0,%xmm11,%xmm9 - shldl $5,%edx,%edx - addl %esi,%ecx - vpxor %xmm8,%xmm1,%xmm1 - xorl %ebp,%edi - xorl %eax,%ebp - addl %edx,%ecx - addl 20(%rsp),%ebx - vpsrld $30,%xmm1,%xmm8 - vmovdqa %xmm9,0(%rsp) - andl %ebp,%edi - xorl %eax,%ebp - shrdl $7,%edx,%edx - movl %ecx,%esi - vpslld $2,%xmm1,%xmm1 - xorl %ebp,%edi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %edx,%esi - xorl %ebp,%edx - addl %ecx,%ebx - addl 24(%rsp),%eax - andl %edx,%esi - vpor %xmm8,%xmm1,%xmm1 - xorl %ebp,%edx - shrdl $7,%ecx,%ecx - movl %ebx,%edi - xorl %edx,%esi - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %ecx,%edi - xorl %edx,%ecx - addl %ebx,%eax - addl 28(%rsp),%ebp - andl %ecx,%edi - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - movl %eax,%esi - xorl %ecx,%edi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ebx,%esi - xorl %ecx,%ebx - addl %eax,%ebp - vpalignr $8,%xmm0,%xmm1,%xmm8 - vpxor %xmm6,%xmm2,%xmm2 - addl 32(%rsp),%edx - andl %ebx,%esi - xorl %ecx,%ebx - shrdl $7,%eax,%eax - vpxor %xmm3,%xmm2,%xmm2 - movl %ebp,%edi - xorl %ebx,%esi - vpaddd %xmm1,%xmm11,%xmm9 - shldl $5,%ebp,%ebp - addl %esi,%edx - vpxor %xmm8,%xmm2,%xmm2 - xorl %eax,%edi - xorl %ebx,%eax - addl %ebp,%edx - addl 36(%rsp),%ecx - vpsrld 
$30,%xmm2,%xmm8 - vmovdqa %xmm9,16(%rsp) - andl %eax,%edi - xorl %ebx,%eax - shrdl $7,%ebp,%ebp - movl %edx,%esi - vpslld $2,%xmm2,%xmm2 - xorl %eax,%edi - shldl $5,%edx,%edx - addl %edi,%ecx - xorl %ebp,%esi - xorl %eax,%ebp - addl %edx,%ecx - addl 40(%rsp),%ebx - andl %ebp,%esi - vpor %xmm8,%xmm2,%xmm2 - xorl %eax,%ebp - shrdl $7,%edx,%edx - movl %ecx,%edi - xorl %ebp,%esi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %edx,%edi - xorl %ebp,%edx - addl %ecx,%ebx - addl 44(%rsp),%eax - andl %edx,%edi - xorl %ebp,%edx - shrdl $7,%ecx,%ecx - movl %ebx,%esi - xorl %edx,%edi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %edx,%esi - addl %ebx,%eax - vpalignr $8,%xmm1,%xmm2,%xmm8 - vpxor %xmm7,%xmm3,%xmm3 - addl 48(%rsp),%ebp - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - vpxor %xmm4,%xmm3,%xmm3 - addl %esi,%ebp - xorl %ecx,%edi - vpaddd %xmm2,%xmm11,%xmm9 - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpxor %xmm8,%xmm3,%xmm3 - addl 52(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - vpsrld $30,%xmm3,%xmm8 - vmovdqa %xmm9,32(%rsp) - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - vpslld $2,%xmm3,%xmm3 - addl 56(%rsp),%ecx - xorl %eax,%esi - movl %edx,%edi - shldl $5,%edx,%edx - addl %esi,%ecx - xorl %eax,%edi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpor %xmm8,%xmm3,%xmm3 - addl 60(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 0(%rsp),%eax - vpaddd %xmm3,%xmm11,%xmm9 - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - addl %esi,%eax - vmovdqa %xmm9,48(%rsp) - xorl %edx,%edi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 4(%rsp),%ebp - xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 8(%rsp),%edx - xorl %ebx,%esi - movl %ebp,%edi - shldl $5,%ebp,%ebp - addl %esi,%edx - xorl %ebx,%edi - shrdl $7,%eax,%eax - addl %ebp,%edx - addl 12(%rsp),%ecx - xorl 
%eax,%edi - movl %edx,%esi - shldl $5,%edx,%edx - addl %edi,%ecx - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - cmpq %r10,%r9 - je L$done_avx - vmovdqa 64(%r14),%xmm6 - vmovdqa -64(%r14),%xmm11 - vmovdqu 0(%r9),%xmm0 - vmovdqu 16(%r9),%xmm1 - vmovdqu 32(%r9),%xmm2 - vmovdqu 48(%r9),%xmm3 - vpshufb %xmm6,%xmm0,%xmm0 - addq $64,%r9 - addl 16(%rsp),%ebx - xorl %ebp,%esi - vpshufb %xmm6,%xmm1,%xmm1 - movl %ecx,%edi - shldl $5,%ecx,%ecx - vpaddd %xmm11,%xmm0,%xmm4 - addl %esi,%ebx - xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vmovdqa %xmm4,0(%rsp) - addl 20(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 24(%rsp),%ebp - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - addl %esi,%ebp - xorl %ecx,%edi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 28(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - addl 32(%rsp),%ecx - xorl %eax,%esi - vpshufb %xmm6,%xmm2,%xmm2 - movl %edx,%edi - shldl $5,%edx,%edx - vpaddd %xmm11,%xmm1,%xmm5 - addl %esi,%ecx - xorl %eax,%edi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vmovdqa %xmm5,16(%rsp) - addl 36(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 40(%rsp),%eax - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %edx,%edi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 44(%rsp),%ebp - xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 48(%rsp),%edx - xorl %ebx,%esi - vpshufb %xmm6,%xmm3,%xmm3 - movl %ebp,%edi - shldl $5,%ebp,%ebp - vpaddd %xmm11,%xmm2,%xmm6 - addl %esi,%edx - xorl %ebx,%edi - shrdl $7,%eax,%eax - addl %ebp,%edx - vmovdqa %xmm6,32(%rsp) - addl 52(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - shldl $5,%edx,%edx - addl 
%edi,%ecx - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - addl 56(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 60(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 0(%r8),%eax - addl 4(%r8),%esi - addl 8(%r8),%ecx - addl 12(%r8),%edx - movl %eax,0(%r8) - addl 16(%r8),%ebp - movl %esi,4(%r8) - movl %esi,%ebx - movl %ecx,8(%r8) - movl %ecx,%edi - movl %edx,12(%r8) - xorl %edx,%edi - movl %ebp,16(%r8) - andl %edi,%esi - jmp L$oop_avx - -.p2align 4 -L$done_avx: - addl 16(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 20(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 24(%rsp),%ebp - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - addl %esi,%ebp - xorl %ecx,%edi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 28(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - addl 32(%rsp),%ecx - xorl %eax,%esi - movl %edx,%edi - shldl $5,%edx,%edx - addl %esi,%ecx - xorl %eax,%edi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - addl 36(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 40(%rsp),%eax - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %edx,%edi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 44(%rsp),%ebp - xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 48(%rsp),%edx - xorl %ebx,%esi - movl %ebp,%edi - shldl $5,%ebp,%ebp - addl %esi,%edx - xorl %ebx,%edi - shrdl $7,%eax,%eax - addl %ebp,%edx - addl 
52(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - shldl $5,%edx,%edx - addl %edi,%ecx - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - addl 56(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 60(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vzeroupper - - addl 0(%r8),%eax - addl 4(%r8),%esi - addl 8(%r8),%ecx - movl %eax,0(%r8) - addl 12(%r8),%edx - movl %esi,4(%r8) - addl 16(%r8),%ebp - movl %ecx,8(%r8) - movl %edx,12(%r8) - movl %ebp,16(%r8) - movq -40(%r11),%r14 - - movq -32(%r11),%r13 - - movq -24(%r11),%r12 - - movq -16(%r11),%rbp - - movq -8(%r11),%rbx - - leaq (%r11),%rsp - -L$epilogue_avx: - .byte 0xf3,0xc3 - - - -.p2align 4 -sha1_block_data_order_avx2: -_avx2_shortcut: - - movq %rsp,%r11 - - pushq %rbx - - pushq %rbp - - pushq %r12 - - pushq %r13 - - pushq %r14 - - vzeroupper - movq %rdi,%r8 - movq %rsi,%r9 - movq %rdx,%r10 - - leaq -640(%rsp),%rsp - shlq $6,%r10 - leaq 64(%r9),%r13 - andq $-128,%rsp - addq %r9,%r10 - leaq K_XX_XX+64(%rip),%r14 - - movl 0(%r8),%eax - cmpq %r10,%r13 - cmovaeq %r9,%r13 - movl 4(%r8),%ebp - movl 8(%r8),%ecx - movl 12(%r8),%edx - movl 16(%r8),%esi - vmovdqu 64(%r14),%ymm6 - - vmovdqu (%r9),%xmm0 - vmovdqu 16(%r9),%xmm1 - vmovdqu 32(%r9),%xmm2 - vmovdqu 48(%r9),%xmm3 - leaq 64(%r9),%r9 - vinserti128 $1,(%r13),%ymm0,%ymm0 - vinserti128 $1,16(%r13),%ymm1,%ymm1 - vpshufb %ymm6,%ymm0,%ymm0 - vinserti128 $1,32(%r13),%ymm2,%ymm2 - vpshufb %ymm6,%ymm1,%ymm1 - vinserti128 $1,48(%r13),%ymm3,%ymm3 - vpshufb %ymm6,%ymm2,%ymm2 - vmovdqu -64(%r14),%ymm11 - vpshufb %ymm6,%ymm3,%ymm3 - - vpaddd %ymm11,%ymm0,%ymm4 - vpaddd %ymm11,%ymm1,%ymm5 - vmovdqu %ymm4,0(%rsp) - vpaddd %ymm11,%ymm2,%ymm6 - vmovdqu %ymm5,32(%rsp) - vpaddd %ymm11,%ymm3,%ymm7 - vmovdqu %ymm6,64(%rsp) - vmovdqu %ymm7,96(%rsp) - vpalignr $8,%ymm0,%ymm1,%ymm4 - vpsrldq $4,%ymm3,%ymm8 - vpxor 
%ymm0,%ymm4,%ymm4 - vpxor %ymm2,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $31,%ymm4,%ymm8 - vpslldq $12,%ymm4,%ymm10 - vpaddd %ymm4,%ymm4,%ymm4 - vpsrld $30,%ymm10,%ymm9 - vpor %ymm8,%ymm4,%ymm4 - vpslld $2,%ymm10,%ymm10 - vpxor %ymm9,%ymm4,%ymm4 - vpxor %ymm10,%ymm4,%ymm4 - vpaddd %ymm11,%ymm4,%ymm9 - vmovdqu %ymm9,128(%rsp) - vpalignr $8,%ymm1,%ymm2,%ymm5 - vpsrldq $4,%ymm4,%ymm8 - vpxor %ymm1,%ymm5,%ymm5 - vpxor %ymm3,%ymm8,%ymm8 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $31,%ymm5,%ymm8 - vmovdqu -32(%r14),%ymm11 - vpslldq $12,%ymm5,%ymm10 - vpaddd %ymm5,%ymm5,%ymm5 - vpsrld $30,%ymm10,%ymm9 - vpor %ymm8,%ymm5,%ymm5 - vpslld $2,%ymm10,%ymm10 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm10,%ymm5,%ymm5 - vpaddd %ymm11,%ymm5,%ymm9 - vmovdqu %ymm9,160(%rsp) - vpalignr $8,%ymm2,%ymm3,%ymm6 - vpsrldq $4,%ymm5,%ymm8 - vpxor %ymm2,%ymm6,%ymm6 - vpxor %ymm4,%ymm8,%ymm8 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $31,%ymm6,%ymm8 - vpslldq $12,%ymm6,%ymm10 - vpaddd %ymm6,%ymm6,%ymm6 - vpsrld $30,%ymm10,%ymm9 - vpor %ymm8,%ymm6,%ymm6 - vpslld $2,%ymm10,%ymm10 - vpxor %ymm9,%ymm6,%ymm6 - vpxor %ymm10,%ymm6,%ymm6 - vpaddd %ymm11,%ymm6,%ymm9 - vmovdqu %ymm9,192(%rsp) - vpalignr $8,%ymm3,%ymm4,%ymm7 - vpsrldq $4,%ymm6,%ymm8 - vpxor %ymm3,%ymm7,%ymm7 - vpxor %ymm5,%ymm8,%ymm8 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $31,%ymm7,%ymm8 - vpslldq $12,%ymm7,%ymm10 - vpaddd %ymm7,%ymm7,%ymm7 - vpsrld $30,%ymm10,%ymm9 - vpor %ymm8,%ymm7,%ymm7 - vpslld $2,%ymm10,%ymm10 - vpxor %ymm9,%ymm7,%ymm7 - vpxor %ymm10,%ymm7,%ymm7 - vpaddd %ymm11,%ymm7,%ymm9 - vmovdqu %ymm9,224(%rsp) - leaq 128(%rsp),%r13 - jmp L$oop_avx2 -.p2align 5 -L$oop_avx2: - rorxl $2,%ebp,%ebx - andnl %edx,%ebp,%edi - andl %ecx,%ebp - xorl %edi,%ebp - jmp L$align32_1 -.p2align 5 -L$align32_1: - vpalignr $8,%ymm6,%ymm7,%ymm8 - vpxor %ymm4,%ymm0,%ymm0 - addl -128(%r13),%esi - andnl %ecx,%eax,%edi - vpxor %ymm1,%ymm0,%ymm0 - addl %ebp,%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - vpxor %ymm8,%ymm0,%ymm0 - andl %ebx,%eax - addl %r12d,%esi - xorl 
%edi,%eax - vpsrld $30,%ymm0,%ymm8 - vpslld $2,%ymm0,%ymm0 - addl -124(%r13),%edx - andnl %ebx,%esi,%edi - addl %eax,%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - andl %ebp,%esi - vpor %ymm8,%ymm0,%ymm0 - addl %r12d,%edx - xorl %edi,%esi - addl -120(%r13),%ecx - andnl %ebp,%edx,%edi - vpaddd %ymm11,%ymm0,%ymm9 - addl %esi,%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - andl %eax,%edx - vmovdqu %ymm9,256(%rsp) - addl %r12d,%ecx - xorl %edi,%edx - addl -116(%r13),%ebx - andnl %eax,%ecx,%edi - addl %edx,%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - andl %esi,%ecx - addl %r12d,%ebx - xorl %edi,%ecx - addl -96(%r13),%ebp - andnl %esi,%ebx,%edi - addl %ecx,%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - andl %edx,%ebx - addl %r12d,%ebp - xorl %edi,%ebx - vpalignr $8,%ymm7,%ymm0,%ymm8 - vpxor %ymm5,%ymm1,%ymm1 - addl -92(%r13),%eax - andnl %edx,%ebp,%edi - vpxor %ymm2,%ymm1,%ymm1 - addl %ebx,%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - vpxor %ymm8,%ymm1,%ymm1 - andl %ecx,%ebp - addl %r12d,%eax - xorl %edi,%ebp - vpsrld $30,%ymm1,%ymm8 - vpslld $2,%ymm1,%ymm1 - addl -88(%r13),%esi - andnl %ecx,%eax,%edi - addl %ebp,%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - andl %ebx,%eax - vpor %ymm8,%ymm1,%ymm1 - addl %r12d,%esi - xorl %edi,%eax - addl -84(%r13),%edx - andnl %ebx,%esi,%edi - vpaddd %ymm11,%ymm1,%ymm9 - addl %eax,%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - andl %ebp,%esi - vmovdqu %ymm9,288(%rsp) - addl %r12d,%edx - xorl %edi,%esi - addl -64(%r13),%ecx - andnl %ebp,%edx,%edi - addl %esi,%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - andl %eax,%edx - addl %r12d,%ecx - xorl %edi,%edx - addl -60(%r13),%ebx - andnl %eax,%ecx,%edi - addl %edx,%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - andl %esi,%ecx - addl %r12d,%ebx - xorl %edi,%ecx - vpalignr $8,%ymm0,%ymm1,%ymm8 - vpxor %ymm6,%ymm2,%ymm2 - addl -56(%r13),%ebp - andnl %esi,%ebx,%edi - vpxor %ymm3,%ymm2,%ymm2 - vmovdqu 0(%r14),%ymm11 - addl %ecx,%ebp - rorxl $27,%ebx,%r12d - 
rorxl $2,%ebx,%ecx - vpxor %ymm8,%ymm2,%ymm2 - andl %edx,%ebx - addl %r12d,%ebp - xorl %edi,%ebx - vpsrld $30,%ymm2,%ymm8 - vpslld $2,%ymm2,%ymm2 - addl -52(%r13),%eax - andnl %edx,%ebp,%edi - addl %ebx,%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - andl %ecx,%ebp - vpor %ymm8,%ymm2,%ymm2 - addl %r12d,%eax - xorl %edi,%ebp - addl -32(%r13),%esi - andnl %ecx,%eax,%edi - vpaddd %ymm11,%ymm2,%ymm9 - addl %ebp,%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - andl %ebx,%eax - vmovdqu %ymm9,320(%rsp) - addl %r12d,%esi - xorl %edi,%eax - addl -28(%r13),%edx - andnl %ebx,%esi,%edi - addl %eax,%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - andl %ebp,%esi - addl %r12d,%edx - xorl %edi,%esi - addl -24(%r13),%ecx - andnl %ebp,%edx,%edi - addl %esi,%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - andl %eax,%edx - addl %r12d,%ecx - xorl %edi,%edx - vpalignr $8,%ymm1,%ymm2,%ymm8 - vpxor %ymm7,%ymm3,%ymm3 - addl -20(%r13),%ebx - andnl %eax,%ecx,%edi - vpxor %ymm4,%ymm3,%ymm3 - addl %edx,%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - vpxor %ymm8,%ymm3,%ymm3 - andl %esi,%ecx - addl %r12d,%ebx - xorl %edi,%ecx - vpsrld $30,%ymm3,%ymm8 - vpslld $2,%ymm3,%ymm3 - addl 0(%r13),%ebp - andnl %esi,%ebx,%edi - addl %ecx,%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - andl %edx,%ebx - vpor %ymm8,%ymm3,%ymm3 - addl %r12d,%ebp - xorl %edi,%ebx - addl 4(%r13),%eax - andnl %edx,%ebp,%edi - vpaddd %ymm11,%ymm3,%ymm9 - addl %ebx,%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - andl %ecx,%ebp - vmovdqu %ymm9,352(%rsp) - addl %r12d,%eax - xorl %edi,%ebp - addl 8(%r13),%esi - andnl %ecx,%eax,%edi - addl %ebp,%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - andl %ebx,%eax - addl %r12d,%esi - xorl %edi,%eax - addl 12(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - vpalignr $8,%ymm2,%ymm3,%ymm8 - vpxor %ymm0,%ymm4,%ymm4 - addl 32(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - vpxor %ymm5,%ymm4,%ymm4 - 
rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - vpxor %ymm8,%ymm4,%ymm4 - addl %r12d,%ecx - xorl %ebp,%edx - addl 36(%r13),%ebx - vpsrld $30,%ymm4,%ymm8 - vpslld $2,%ymm4,%ymm4 - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - vpor %ymm8,%ymm4,%ymm4 - addl 40(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - vpaddd %ymm11,%ymm4,%ymm9 - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - addl 44(%r13),%eax - vmovdqu %ymm9,384(%rsp) - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - addl 64(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - vpalignr $8,%ymm3,%ymm4,%ymm8 - vpxor %ymm1,%ymm5,%ymm5 - addl 68(%r13),%edx - leal (%rdx,%rax,1),%edx - vpxor %ymm6,%ymm5,%ymm5 - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - vpxor %ymm8,%ymm5,%ymm5 - addl %r12d,%edx - xorl %ebx,%esi - addl 72(%r13),%ecx - vpsrld $30,%ymm5,%ymm8 - vpslld $2,%ymm5,%ymm5 - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - vpor %ymm8,%ymm5,%ymm5 - addl 76(%r13),%ebx - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - vpaddd %ymm11,%ymm5,%ymm9 - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - addl 96(%r13),%ebp - vmovdqu %ymm9,416(%rsp) - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - addl 100(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - vpalignr $8,%ymm4,%ymm5,%ymm8 - vpxor %ymm2,%ymm6,%ymm6 - addl 104(%r13),%esi - leal (%rsi,%rbp,1),%esi - vpxor %ymm7,%ymm6,%ymm6 - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - vpxor %ymm8,%ymm6,%ymm6 - addl 
%r12d,%esi - xorl %ecx,%eax - addl 108(%r13),%edx - leaq 256(%r13),%r13 - vpsrld $30,%ymm6,%ymm8 - vpslld $2,%ymm6,%ymm6 - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - vpor %ymm8,%ymm6,%ymm6 - addl -128(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - vpaddd %ymm11,%ymm6,%ymm9 - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl -124(%r13),%ebx - vmovdqu %ymm9,448(%rsp) - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - addl -120(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - vpalignr $8,%ymm5,%ymm6,%ymm8 - vpxor %ymm3,%ymm7,%ymm7 - addl -116(%r13),%eax - leal (%rax,%rbx,1),%eax - vpxor %ymm0,%ymm7,%ymm7 - vmovdqu 32(%r14),%ymm11 - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - vpxor %ymm8,%ymm7,%ymm7 - addl %r12d,%eax - xorl %edx,%ebp - addl -96(%r13),%esi - vpsrld $30,%ymm7,%ymm8 - vpslld $2,%ymm7,%ymm7 - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - vpor %ymm8,%ymm7,%ymm7 - addl -92(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - vpaddd %ymm11,%ymm7,%ymm9 - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - addl -88(%r13),%ecx - vmovdqu %ymm9,480(%rsp) - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl -84(%r13),%ebx - movl %esi,%edi - xorl %eax,%edi - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - andl %edi,%ecx - jmp L$align32_2 -.p2align 5 -L$align32_2: - vpalignr $8,%ymm6,%ymm7,%ymm8 - vpxor %ymm4,%ymm0,%ymm0 - addl -64(%r13),%ebp - xorl %esi,%ecx - vpxor %ymm1,%ymm0,%ymm0 - movl %edx,%edi - xorl %esi,%edi - leal 
(%rcx,%rbp,1),%ebp - vpxor %ymm8,%ymm0,%ymm0 - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - vpsrld $30,%ymm0,%ymm8 - vpslld $2,%ymm0,%ymm0 - addl %r12d,%ebp - andl %edi,%ebx - addl -60(%r13),%eax - xorl %edx,%ebx - movl %ecx,%edi - xorl %edx,%edi - vpor %ymm8,%ymm0,%ymm0 - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - vpaddd %ymm11,%ymm0,%ymm9 - addl %r12d,%eax - andl %edi,%ebp - addl -56(%r13),%esi - xorl %ecx,%ebp - vmovdqu %ymm9,512(%rsp) - movl %ebx,%edi - xorl %ecx,%edi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - andl %edi,%eax - addl -52(%r13),%edx - xorl %ebx,%eax - movl %ebp,%edi - xorl %ebx,%edi - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - andl %edi,%esi - addl -32(%r13),%ecx - xorl %ebp,%esi - movl %eax,%edi - xorl %ebp,%edi - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - andl %edi,%edx - vpalignr $8,%ymm7,%ymm0,%ymm8 - vpxor %ymm5,%ymm1,%ymm1 - addl -28(%r13),%ebx - xorl %eax,%edx - vpxor %ymm2,%ymm1,%ymm1 - movl %esi,%edi - xorl %eax,%edi - leal (%rbx,%rdx,1),%ebx - vpxor %ymm8,%ymm1,%ymm1 - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - vpsrld $30,%ymm1,%ymm8 - vpslld $2,%ymm1,%ymm1 - addl %r12d,%ebx - andl %edi,%ecx - addl -24(%r13),%ebp - xorl %esi,%ecx - movl %edx,%edi - xorl %esi,%edi - vpor %ymm8,%ymm1,%ymm1 - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - vpaddd %ymm11,%ymm1,%ymm9 - addl %r12d,%ebp - andl %edi,%ebx - addl -20(%r13),%eax - xorl %edx,%ebx - vmovdqu %ymm9,544(%rsp) - movl %ecx,%edi - xorl %edx,%edi - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - andl %edi,%ebp - addl 0(%r13),%esi - xorl %ecx,%ebp - movl %ebx,%edi - xorl %ecx,%edi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl 
$2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - andl %edi,%eax - addl 4(%r13),%edx - xorl %ebx,%eax - movl %ebp,%edi - xorl %ebx,%edi - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - andl %edi,%esi - vpalignr $8,%ymm0,%ymm1,%ymm8 - vpxor %ymm6,%ymm2,%ymm2 - addl 8(%r13),%ecx - xorl %ebp,%esi - vpxor %ymm3,%ymm2,%ymm2 - movl %eax,%edi - xorl %ebp,%edi - leal (%rcx,%rsi,1),%ecx - vpxor %ymm8,%ymm2,%ymm2 - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - vpsrld $30,%ymm2,%ymm8 - vpslld $2,%ymm2,%ymm2 - addl %r12d,%ecx - andl %edi,%edx - addl 12(%r13),%ebx - xorl %eax,%edx - movl %esi,%edi - xorl %eax,%edi - vpor %ymm8,%ymm2,%ymm2 - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - vpaddd %ymm11,%ymm2,%ymm9 - addl %r12d,%ebx - andl %edi,%ecx - addl 32(%r13),%ebp - xorl %esi,%ecx - vmovdqu %ymm9,576(%rsp) - movl %edx,%edi - xorl %esi,%edi - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - andl %edi,%ebx - addl 36(%r13),%eax - xorl %edx,%ebx - movl %ecx,%edi - xorl %edx,%edi - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - andl %edi,%ebp - addl 40(%r13),%esi - xorl %ecx,%ebp - movl %ebx,%edi - xorl %ecx,%edi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - andl %edi,%eax - vpalignr $8,%ymm1,%ymm2,%ymm8 - vpxor %ymm7,%ymm3,%ymm3 - addl 44(%r13),%edx - xorl %ebx,%eax - vpxor %ymm4,%ymm3,%ymm3 - movl %ebp,%edi - xorl %ebx,%edi - leal (%rdx,%rax,1),%edx - vpxor %ymm8,%ymm3,%ymm3 - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - vpsrld $30,%ymm3,%ymm8 - vpslld $2,%ymm3,%ymm3 - addl %r12d,%edx - andl %edi,%esi - addl 64(%r13),%ecx - xorl %ebp,%esi - movl %eax,%edi - xorl %ebp,%edi - vpor %ymm8,%ymm3,%ymm3 - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl 
%eax,%edx - vpaddd %ymm11,%ymm3,%ymm9 - addl %r12d,%ecx - andl %edi,%edx - addl 68(%r13),%ebx - xorl %eax,%edx - vmovdqu %ymm9,608(%rsp) - movl %esi,%edi - xorl %eax,%edi - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - andl %edi,%ecx - addl 72(%r13),%ebp - xorl %esi,%ecx - movl %edx,%edi - xorl %esi,%edi - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - andl %edi,%ebx - addl 76(%r13),%eax - xorl %edx,%ebx - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - addl 96(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - addl 100(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - addl 104(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl 108(%r13),%ebx - leaq 256(%r13),%r13 - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - addl -128(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - addl -124(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - addl -120(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - addl -116(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - addl -96(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl -92(%r13),%ebx - leal 
(%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - addl -88(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - addl -84(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - addl -64(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - addl -60(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - addl -56(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl -52(%r13),%ebx - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - addl -32(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - addl -28(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - addl -24(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - addl -20(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - addl %r12d,%edx - leaq 128(%r9),%r13 - leaq 128(%r9),%rdi - cmpq %r10,%r13 - cmovaeq %r9,%r13 - - - addl 0(%r8),%edx - addl 4(%r8),%esi - addl 8(%r8),%ebp - movl %edx,0(%r8) - addl 12(%r8),%ebx - movl %esi,4(%r8) - movl %edx,%eax - addl 16(%r8),%ecx - movl %ebp,%r12d - movl %ebp,8(%r8) - movl %ebx,%edx - - movl %ebx,12(%r8) - movl %esi,%ebp - movl %ecx,16(%r8) - - movl %ecx,%esi - movl %r12d,%ecx - - - cmpq %r10,%r9 - je L$done_avx2 - vmovdqu 64(%r14),%ymm6 - cmpq %r10,%rdi - ja L$ast_avx2 - - vmovdqu 
-64(%rdi),%xmm0 - vmovdqu -48(%rdi),%xmm1 - vmovdqu -32(%rdi),%xmm2 - vmovdqu -16(%rdi),%xmm3 - vinserti128 $1,0(%r13),%ymm0,%ymm0 - vinserti128 $1,16(%r13),%ymm1,%ymm1 - vinserti128 $1,32(%r13),%ymm2,%ymm2 - vinserti128 $1,48(%r13),%ymm3,%ymm3 - jmp L$ast_avx2 - -.p2align 5 -L$ast_avx2: - leaq 128+16(%rsp),%r13 - rorxl $2,%ebp,%ebx - andnl %edx,%ebp,%edi - andl %ecx,%ebp - xorl %edi,%ebp - subq $-128,%r9 - addl -128(%r13),%esi - andnl %ecx,%eax,%edi - addl %ebp,%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - andl %ebx,%eax - addl %r12d,%esi - xorl %edi,%eax - addl -124(%r13),%edx - andnl %ebx,%esi,%edi - addl %eax,%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - andl %ebp,%esi - addl %r12d,%edx - xorl %edi,%esi - addl -120(%r13),%ecx - andnl %ebp,%edx,%edi - addl %esi,%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - andl %eax,%edx - addl %r12d,%ecx - xorl %edi,%edx - addl -116(%r13),%ebx - andnl %eax,%ecx,%edi - addl %edx,%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - andl %esi,%ecx - addl %r12d,%ebx - xorl %edi,%ecx - addl -96(%r13),%ebp - andnl %esi,%ebx,%edi - addl %ecx,%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - andl %edx,%ebx - addl %r12d,%ebp - xorl %edi,%ebx - addl -92(%r13),%eax - andnl %edx,%ebp,%edi - addl %ebx,%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - andl %ecx,%ebp - addl %r12d,%eax - xorl %edi,%ebp - addl -88(%r13),%esi - andnl %ecx,%eax,%edi - addl %ebp,%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - andl %ebx,%eax - addl %r12d,%esi - xorl %edi,%eax - addl -84(%r13),%edx - andnl %ebx,%esi,%edi - addl %eax,%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - andl %ebp,%esi - addl %r12d,%edx - xorl %edi,%esi - addl -64(%r13),%ecx - andnl %ebp,%edx,%edi - addl %esi,%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - andl %eax,%edx - addl %r12d,%ecx - xorl %edi,%edx - addl -60(%r13),%ebx - andnl %eax,%ecx,%edi - addl %edx,%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - andl %esi,%ecx - addl %r12d,%ebx - xorl %edi,%ecx - addl 
-56(%r13),%ebp - andnl %esi,%ebx,%edi - addl %ecx,%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - andl %edx,%ebx - addl %r12d,%ebp - xorl %edi,%ebx - addl -52(%r13),%eax - andnl %edx,%ebp,%edi - addl %ebx,%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - andl %ecx,%ebp - addl %r12d,%eax - xorl %edi,%ebp - addl -32(%r13),%esi - andnl %ecx,%eax,%edi - addl %ebp,%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - andl %ebx,%eax - addl %r12d,%esi - xorl %edi,%eax - addl -28(%r13),%edx - andnl %ebx,%esi,%edi - addl %eax,%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - andl %ebp,%esi - addl %r12d,%edx - xorl %edi,%esi - addl -24(%r13),%ecx - andnl %ebp,%edx,%edi - addl %esi,%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - andl %eax,%edx - addl %r12d,%ecx - xorl %edi,%edx - addl -20(%r13),%ebx - andnl %eax,%ecx,%edi - addl %edx,%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - andl %esi,%ecx - addl %r12d,%ebx - xorl %edi,%ecx - addl 0(%r13),%ebp - andnl %esi,%ebx,%edi - addl %ecx,%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - andl %edx,%ebx - addl %r12d,%ebp - xorl %edi,%ebx - addl 4(%r13),%eax - andnl %edx,%ebp,%edi - addl %ebx,%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - andl %ecx,%ebp - addl %r12d,%eax - xorl %edi,%ebp - addl 8(%r13),%esi - andnl %ecx,%eax,%edi - addl %ebp,%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - andl %ebx,%eax - addl %r12d,%esi - xorl %edi,%eax - addl 12(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - addl 32(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl 36(%r13),%ebx - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - addl 40(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - addl 
44(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - addl 64(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - vmovdqu -64(%r14),%ymm11 - vpshufb %ymm6,%ymm0,%ymm0 - addl 68(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - addl 72(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl 76(%r13),%ebx - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - addl 96(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - addl 100(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - vpshufb %ymm6,%ymm1,%ymm1 - vpaddd %ymm11,%ymm0,%ymm8 - addl 104(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - addl 108(%r13),%edx - leaq 256(%r13),%r13 - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - addl -128(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl -124(%r13),%ebx - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - addl -120(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - vmovdqu %ymm8,0(%rsp) - vpshufb %ymm6,%ymm2,%ymm2 - vpaddd %ymm11,%ymm1,%ymm9 - addl -116(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl 
$27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - addl -96(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - addl -92(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - addl -88(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl -84(%r13),%ebx - movl %esi,%edi - xorl %eax,%edi - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - andl %edi,%ecx - vmovdqu %ymm9,32(%rsp) - vpshufb %ymm6,%ymm3,%ymm3 - vpaddd %ymm11,%ymm2,%ymm6 - addl -64(%r13),%ebp - xorl %esi,%ecx - movl %edx,%edi - xorl %esi,%edi - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - andl %edi,%ebx - addl -60(%r13),%eax - xorl %edx,%ebx - movl %ecx,%edi - xorl %edx,%edi - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - andl %edi,%ebp - addl -56(%r13),%esi - xorl %ecx,%ebp - movl %ebx,%edi - xorl %ecx,%edi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - andl %edi,%eax - addl -52(%r13),%edx - xorl %ebx,%eax - movl %ebp,%edi - xorl %ebx,%edi - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - andl %edi,%esi - addl -32(%r13),%ecx - xorl %ebp,%esi - movl %eax,%edi - xorl %ebp,%edi - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - andl %edi,%edx - jmp L$align32_3 -.p2align 5 -L$align32_3: - vmovdqu %ymm6,64(%rsp) - vpaddd %ymm11,%ymm3,%ymm7 - addl -28(%r13),%ebx - xorl %eax,%edx - movl %esi,%edi - xorl %eax,%edi - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - 
xorl %esi,%ecx - addl %r12d,%ebx - andl %edi,%ecx - addl -24(%r13),%ebp - xorl %esi,%ecx - movl %edx,%edi - xorl %esi,%edi - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - andl %edi,%ebx - addl -20(%r13),%eax - xorl %edx,%ebx - movl %ecx,%edi - xorl %edx,%edi - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - andl %edi,%ebp - addl 0(%r13),%esi - xorl %ecx,%ebp - movl %ebx,%edi - xorl %ecx,%edi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - andl %edi,%eax - addl 4(%r13),%edx - xorl %ebx,%eax - movl %ebp,%edi - xorl %ebx,%edi - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - andl %edi,%esi - vmovdqu %ymm7,96(%rsp) - addl 8(%r13),%ecx - xorl %ebp,%esi - movl %eax,%edi - xorl %ebp,%edi - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - andl %edi,%edx - addl 12(%r13),%ebx - xorl %eax,%edx - movl %esi,%edi - xorl %eax,%edi - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - andl %edi,%ecx - addl 32(%r13),%ebp - xorl %esi,%ecx - movl %edx,%edi - xorl %esi,%edi - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - andl %edi,%ebx - addl 36(%r13),%eax - xorl %edx,%ebx - movl %ecx,%edi - xorl %edx,%edi - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - andl %edi,%ebp - addl 40(%r13),%esi - xorl %ecx,%ebp - movl %ebx,%edi - xorl %ecx,%edi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - andl %edi,%eax - vpalignr $8,%ymm0,%ymm1,%ymm4 - addl 44(%r13),%edx - xorl %ebx,%eax - movl %ebp,%edi - xorl %ebx,%edi - vpsrldq $4,%ymm3,%ymm8 - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - 
rorxl $2,%esi,%eax - vpxor %ymm0,%ymm4,%ymm4 - vpxor %ymm2,%ymm8,%ymm8 - xorl %ebp,%esi - addl %r12d,%edx - vpxor %ymm8,%ymm4,%ymm4 - andl %edi,%esi - addl 64(%r13),%ecx - xorl %ebp,%esi - movl %eax,%edi - vpsrld $31,%ymm4,%ymm8 - xorl %ebp,%edi - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - vpslldq $12,%ymm4,%ymm10 - vpaddd %ymm4,%ymm4,%ymm4 - rorxl $2,%edx,%esi - xorl %eax,%edx - vpsrld $30,%ymm10,%ymm9 - vpor %ymm8,%ymm4,%ymm4 - addl %r12d,%ecx - andl %edi,%edx - vpslld $2,%ymm10,%ymm10 - vpxor %ymm9,%ymm4,%ymm4 - addl 68(%r13),%ebx - xorl %eax,%edx - vpxor %ymm10,%ymm4,%ymm4 - movl %esi,%edi - xorl %eax,%edi - leal (%rbx,%rdx,1),%ebx - vpaddd %ymm11,%ymm4,%ymm9 - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - vmovdqu %ymm9,128(%rsp) - addl %r12d,%ebx - andl %edi,%ecx - addl 72(%r13),%ebp - xorl %esi,%ecx - movl %edx,%edi - xorl %esi,%edi - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - andl %edi,%ebx - addl 76(%r13),%eax - xorl %edx,%ebx - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - vpalignr $8,%ymm1,%ymm2,%ymm5 - addl 96(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - vpsrldq $4,%ymm4,%ymm8 - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - vpxor %ymm1,%ymm5,%ymm5 - vpxor %ymm3,%ymm8,%ymm8 - addl 100(%r13),%edx - leal (%rdx,%rax,1),%edx - vpxor %ymm8,%ymm5,%ymm5 - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - vpsrld $31,%ymm5,%ymm8 - vmovdqu -32(%r14),%ymm11 - xorl %ebx,%esi - addl 104(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - vpslldq $12,%ymm5,%ymm10 - vpaddd %ymm5,%ymm5,%ymm5 - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - vpsrld $30,%ymm10,%ymm9 - vpor %ymm8,%ymm5,%ymm5 - xorl %eax,%edx - addl %r12d,%ecx - vpslld $2,%ymm10,%ymm10 - vpxor %ymm9,%ymm5,%ymm5 - xorl %ebp,%edx - addl 108(%r13),%ebx - leaq 256(%r13),%r13 - vpxor %ymm10,%ymm5,%ymm5 
- leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - vpaddd %ymm11,%ymm5,%ymm9 - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - vmovdqu %ymm9,160(%rsp) - addl -128(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - vpalignr $8,%ymm2,%ymm3,%ymm6 - addl -124(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - vpsrldq $4,%ymm5,%ymm8 - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - vpxor %ymm2,%ymm6,%ymm6 - vpxor %ymm4,%ymm8,%ymm8 - addl -120(%r13),%esi - leal (%rsi,%rbp,1),%esi - vpxor %ymm8,%ymm6,%ymm6 - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - vpsrld $31,%ymm6,%ymm8 - xorl %ecx,%eax - addl -116(%r13),%edx - leal (%rdx,%rax,1),%edx - vpslldq $12,%ymm6,%ymm10 - vpaddd %ymm6,%ymm6,%ymm6 - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - vpsrld $30,%ymm10,%ymm9 - vpor %ymm8,%ymm6,%ymm6 - xorl %ebp,%esi - addl %r12d,%edx - vpslld $2,%ymm10,%ymm10 - vpxor %ymm9,%ymm6,%ymm6 - xorl %ebx,%esi - addl -96(%r13),%ecx - vpxor %ymm10,%ymm6,%ymm6 - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - vpaddd %ymm11,%ymm6,%ymm9 - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - vmovdqu %ymm9,192(%rsp) - addl -92(%r13),%ebx - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - vpalignr $8,%ymm3,%ymm4,%ymm7 - addl -88(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - vpsrldq $4,%ymm6,%ymm8 - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - vpxor %ymm3,%ymm7,%ymm7 - vpxor %ymm5,%ymm8,%ymm8 - addl -84(%r13),%eax - leal (%rax,%rbx,1),%eax - vpxor %ymm8,%ymm7,%ymm7 - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - vpsrld $31,%ymm7,%ymm8 - xorl %edx,%ebp - addl -64(%r13),%esi - leal (%rsi,%rbp,1),%esi - vpslldq $12,%ymm7,%ymm10 - vpaddd %ymm7,%ymm7,%ymm7 - rorxl 
$27,%eax,%r12d - rorxl $2,%eax,%ebp - vpsrld $30,%ymm10,%ymm9 - vpor %ymm8,%ymm7,%ymm7 - xorl %ebx,%eax - addl %r12d,%esi - vpslld $2,%ymm10,%ymm10 - vpxor %ymm9,%ymm7,%ymm7 - xorl %ecx,%eax - addl -60(%r13),%edx - vpxor %ymm10,%ymm7,%ymm7 - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - vpaddd %ymm11,%ymm7,%ymm9 - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - vmovdqu %ymm9,224(%rsp) - addl -56(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl -52(%r13),%ebx - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - addl -32(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - addl -28(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - addl -24(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - addl -20(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - addl %r12d,%edx - leaq 128(%rsp),%r13 - - - addl 0(%r8),%edx - addl 4(%r8),%esi - addl 8(%r8),%ebp - movl %edx,0(%r8) - addl 12(%r8),%ebx - movl %esi,4(%r8) - movl %edx,%eax - addl 16(%r8),%ecx - movl %ebp,%r12d - movl %ebp,8(%r8) - movl %ebx,%edx - - movl %ebx,12(%r8) - movl %esi,%ebp - movl %ecx,16(%r8) - - movl %ecx,%esi - movl %r12d,%ecx - - - cmpq %r10,%r9 - jbe L$oop_avx2 - -L$done_avx2: - vzeroupper - movq -40(%r11),%r14 - - movq -32(%r11),%r13 - - movq -24(%r11),%r12 - - movq -16(%r11),%rbp - - movq -8(%r11),%rbx - - leaq (%r11),%rsp - -L$epilogue_avx2: - .byte 0xf3,0xc3 - - -.p2align 6 -K_XX_XX: -.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 -.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 -.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 -.long 
0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 -.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc -.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc -.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 -.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 -.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f -.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f -.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 -.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.p2align 6 -#endif diff --git a/third_party/boringssl/apple-x86_64/crypto/fipsmodule/sha256-x86_64.S b/third_party/boringssl/apple-x86_64/crypto/fipsmodule/sha256-x86_64.S deleted file mode 100644 index 00dc01c9..00000000 --- a/third_party/boringssl/apple-x86_64/crypto/fipsmodule/sha256-x86_64.S +++ /dev/null @@ -1,4182 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text - - -.globl _sha256_block_data_order -.private_extern _sha256_block_data_order - -.p2align 4 -_sha256_block_data_order: - - leaq _OPENSSL_ia32cap_P(%rip),%r11 - movl 0(%r11),%r9d - movl 4(%r11),%r10d - movl 8(%r11),%r11d - testl $536870912,%r11d - jnz L$shaext_shortcut - andl $1073741824,%r9d - andl $268435968,%r10d - orl %r9d,%r10d - cmpl $1342177792,%r10d - je L$avx_shortcut - testl $512,%r10d - jnz L$ssse3_shortcut - movq %rsp,%rax - - pushq %rbx - - pushq %rbp - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - shlq $4,%rdx - subq $64+32,%rsp - leaq (%rsi,%rdx,4),%rdx - andq $-64,%rsp - movq %rdi,64+0(%rsp) - movq %rsi,64+8(%rsp) - movq %rdx,64+16(%rsp) - movq %rax,88(%rsp) - -L$prologue: - - movl 0(%rdi),%eax - movl 4(%rdi),%ebx - movl 8(%rdi),%ecx - movl 12(%rdi),%edx - movl 16(%rdi),%r8d - movl 20(%rdi),%r9d - movl 24(%rdi),%r10d - movl 28(%rdi),%r11d - jmp L$loop - -.p2align 4 -L$loop: - movl %ebx,%edi - leaq K256(%rip),%rbp - xorl %ecx,%edi - movl 0(%rsi),%r12d - movl %r8d,%r13d - movl %eax,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r9d,%r15d - - xorl %r8d,%r13d - rorl $9,%r14d - xorl %r10d,%r15d - - movl %r12d,0(%rsp) - xorl %eax,%r14d - andl %r8d,%r15d - - rorl $5,%r13d - addl %r11d,%r12d - xorl %r10d,%r15d - - rorl $11,%r14d - xorl %r8d,%r13d - addl %r15d,%r12d - - movl %eax,%r15d - addl (%rbp),%r12d - xorl %eax,%r14d - - xorl %ebx,%r15d - rorl $6,%r13d - movl %ebx,%r11d - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%r11d - addl %r12d,%edx - addl %r12d,%r11d - - leaq 4(%rbp),%rbp - addl %r14d,%r11d - movl 4(%rsi),%r12d - movl %edx,%r13d - movl %r11d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r8d,%edi - - xorl %edx,%r13d - rorl $9,%r14d - xorl %r9d,%edi - - movl 
%r12d,4(%rsp) - xorl %r11d,%r14d - andl %edx,%edi - - rorl $5,%r13d - addl %r10d,%r12d - xorl %r9d,%edi - - rorl $11,%r14d - xorl %edx,%r13d - addl %edi,%r12d - - movl %r11d,%edi - addl (%rbp),%r12d - xorl %r11d,%r14d - - xorl %eax,%edi - rorl $6,%r13d - movl %eax,%r10d - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%r10d - addl %r12d,%ecx - addl %r12d,%r10d - - leaq 4(%rbp),%rbp - addl %r14d,%r10d - movl 8(%rsi),%r12d - movl %ecx,%r13d - movl %r10d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %edx,%r15d - - xorl %ecx,%r13d - rorl $9,%r14d - xorl %r8d,%r15d - - movl %r12d,8(%rsp) - xorl %r10d,%r14d - andl %ecx,%r15d - - rorl $5,%r13d - addl %r9d,%r12d - xorl %r8d,%r15d - - rorl $11,%r14d - xorl %ecx,%r13d - addl %r15d,%r12d - - movl %r10d,%r15d - addl (%rbp),%r12d - xorl %r10d,%r14d - - xorl %r11d,%r15d - rorl $6,%r13d - movl %r11d,%r9d - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%r9d - addl %r12d,%ebx - addl %r12d,%r9d - - leaq 4(%rbp),%rbp - addl %r14d,%r9d - movl 12(%rsi),%r12d - movl %ebx,%r13d - movl %r9d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %ecx,%edi - - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%edi - - movl %r12d,12(%rsp) - xorl %r9d,%r14d - andl %ebx,%edi - - rorl $5,%r13d - addl %r8d,%r12d - xorl %edx,%edi - - rorl $11,%r14d - xorl %ebx,%r13d - addl %edi,%r12d - - movl %r9d,%edi - addl (%rbp),%r12d - xorl %r9d,%r14d - - xorl %r10d,%edi - rorl $6,%r13d - movl %r10d,%r8d - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%r8d - addl %r12d,%eax - addl %r12d,%r8d - - leaq 20(%rbp),%rbp - addl %r14d,%r8d - movl 16(%rsi),%r12d - movl %eax,%r13d - movl %r8d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %ebx,%r15d - - xorl %eax,%r13d - rorl $9,%r14d - xorl %ecx,%r15d - - movl %r12d,16(%rsp) - xorl %r8d,%r14d - andl %eax,%r15d - - rorl $5,%r13d - addl %edx,%r12d - xorl %ecx,%r15d - - rorl $11,%r14d - xorl %eax,%r13d - addl %r15d,%r12d - - movl %r8d,%r15d - addl (%rbp),%r12d - xorl 
%r8d,%r14d - - xorl %r9d,%r15d - rorl $6,%r13d - movl %r9d,%edx - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%edx - addl %r12d,%r11d - addl %r12d,%edx - - leaq 4(%rbp),%rbp - addl %r14d,%edx - movl 20(%rsi),%r12d - movl %r11d,%r13d - movl %edx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %eax,%edi - - xorl %r11d,%r13d - rorl $9,%r14d - xorl %ebx,%edi - - movl %r12d,20(%rsp) - xorl %edx,%r14d - andl %r11d,%edi - - rorl $5,%r13d - addl %ecx,%r12d - xorl %ebx,%edi - - rorl $11,%r14d - xorl %r11d,%r13d - addl %edi,%r12d - - movl %edx,%edi - addl (%rbp),%r12d - xorl %edx,%r14d - - xorl %r8d,%edi - rorl $6,%r13d - movl %r8d,%ecx - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%ecx - addl %r12d,%r10d - addl %r12d,%ecx - - leaq 4(%rbp),%rbp - addl %r14d,%ecx - movl 24(%rsi),%r12d - movl %r10d,%r13d - movl %ecx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r11d,%r15d - - xorl %r10d,%r13d - rorl $9,%r14d - xorl %eax,%r15d - - movl %r12d,24(%rsp) - xorl %ecx,%r14d - andl %r10d,%r15d - - rorl $5,%r13d - addl %ebx,%r12d - xorl %eax,%r15d - - rorl $11,%r14d - xorl %r10d,%r13d - addl %r15d,%r12d - - movl %ecx,%r15d - addl (%rbp),%r12d - xorl %ecx,%r14d - - xorl %edx,%r15d - rorl $6,%r13d - movl %edx,%ebx - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%ebx - addl %r12d,%r9d - addl %r12d,%ebx - - leaq 4(%rbp),%rbp - addl %r14d,%ebx - movl 28(%rsi),%r12d - movl %r9d,%r13d - movl %ebx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r10d,%edi - - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%edi - - movl %r12d,28(%rsp) - xorl %ebx,%r14d - andl %r9d,%edi - - rorl $5,%r13d - addl %eax,%r12d - xorl %r11d,%edi - - rorl $11,%r14d - xorl %r9d,%r13d - addl %edi,%r12d - - movl %ebx,%edi - addl (%rbp),%r12d - xorl %ebx,%r14d - - xorl %ecx,%edi - rorl $6,%r13d - movl %ecx,%eax - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%eax - addl %r12d,%r8d - addl %r12d,%eax - - leaq 20(%rbp),%rbp - addl %r14d,%eax - 
movl 32(%rsi),%r12d - movl %r8d,%r13d - movl %eax,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r9d,%r15d - - xorl %r8d,%r13d - rorl $9,%r14d - xorl %r10d,%r15d - - movl %r12d,32(%rsp) - xorl %eax,%r14d - andl %r8d,%r15d - - rorl $5,%r13d - addl %r11d,%r12d - xorl %r10d,%r15d - - rorl $11,%r14d - xorl %r8d,%r13d - addl %r15d,%r12d - - movl %eax,%r15d - addl (%rbp),%r12d - xorl %eax,%r14d - - xorl %ebx,%r15d - rorl $6,%r13d - movl %ebx,%r11d - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%r11d - addl %r12d,%edx - addl %r12d,%r11d - - leaq 4(%rbp),%rbp - addl %r14d,%r11d - movl 36(%rsi),%r12d - movl %edx,%r13d - movl %r11d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r8d,%edi - - xorl %edx,%r13d - rorl $9,%r14d - xorl %r9d,%edi - - movl %r12d,36(%rsp) - xorl %r11d,%r14d - andl %edx,%edi - - rorl $5,%r13d - addl %r10d,%r12d - xorl %r9d,%edi - - rorl $11,%r14d - xorl %edx,%r13d - addl %edi,%r12d - - movl %r11d,%edi - addl (%rbp),%r12d - xorl %r11d,%r14d - - xorl %eax,%edi - rorl $6,%r13d - movl %eax,%r10d - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%r10d - addl %r12d,%ecx - addl %r12d,%r10d - - leaq 4(%rbp),%rbp - addl %r14d,%r10d - movl 40(%rsi),%r12d - movl %ecx,%r13d - movl %r10d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %edx,%r15d - - xorl %ecx,%r13d - rorl $9,%r14d - xorl %r8d,%r15d - - movl %r12d,40(%rsp) - xorl %r10d,%r14d - andl %ecx,%r15d - - rorl $5,%r13d - addl %r9d,%r12d - xorl %r8d,%r15d - - rorl $11,%r14d - xorl %ecx,%r13d - addl %r15d,%r12d - - movl %r10d,%r15d - addl (%rbp),%r12d - xorl %r10d,%r14d - - xorl %r11d,%r15d - rorl $6,%r13d - movl %r11d,%r9d - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%r9d - addl %r12d,%ebx - addl %r12d,%r9d - - leaq 4(%rbp),%rbp - addl %r14d,%r9d - movl 44(%rsi),%r12d - movl %ebx,%r13d - movl %r9d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %ecx,%edi - - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%edi - - movl %r12d,44(%rsp) - xorl %r9d,%r14d - 
andl %ebx,%edi - - rorl $5,%r13d - addl %r8d,%r12d - xorl %edx,%edi - - rorl $11,%r14d - xorl %ebx,%r13d - addl %edi,%r12d - - movl %r9d,%edi - addl (%rbp),%r12d - xorl %r9d,%r14d - - xorl %r10d,%edi - rorl $6,%r13d - movl %r10d,%r8d - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%r8d - addl %r12d,%eax - addl %r12d,%r8d - - leaq 20(%rbp),%rbp - addl %r14d,%r8d - movl 48(%rsi),%r12d - movl %eax,%r13d - movl %r8d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %ebx,%r15d - - xorl %eax,%r13d - rorl $9,%r14d - xorl %ecx,%r15d - - movl %r12d,48(%rsp) - xorl %r8d,%r14d - andl %eax,%r15d - - rorl $5,%r13d - addl %edx,%r12d - xorl %ecx,%r15d - - rorl $11,%r14d - xorl %eax,%r13d - addl %r15d,%r12d - - movl %r8d,%r15d - addl (%rbp),%r12d - xorl %r8d,%r14d - - xorl %r9d,%r15d - rorl $6,%r13d - movl %r9d,%edx - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%edx - addl %r12d,%r11d - addl %r12d,%edx - - leaq 4(%rbp),%rbp - addl %r14d,%edx - movl 52(%rsi),%r12d - movl %r11d,%r13d - movl %edx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %eax,%edi - - xorl %r11d,%r13d - rorl $9,%r14d - xorl %ebx,%edi - - movl %r12d,52(%rsp) - xorl %edx,%r14d - andl %r11d,%edi - - rorl $5,%r13d - addl %ecx,%r12d - xorl %ebx,%edi - - rorl $11,%r14d - xorl %r11d,%r13d - addl %edi,%r12d - - movl %edx,%edi - addl (%rbp),%r12d - xorl %edx,%r14d - - xorl %r8d,%edi - rorl $6,%r13d - movl %r8d,%ecx - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%ecx - addl %r12d,%r10d - addl %r12d,%ecx - - leaq 4(%rbp),%rbp - addl %r14d,%ecx - movl 56(%rsi),%r12d - movl %r10d,%r13d - movl %ecx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r11d,%r15d - - xorl %r10d,%r13d - rorl $9,%r14d - xorl %eax,%r15d - - movl %r12d,56(%rsp) - xorl %ecx,%r14d - andl %r10d,%r15d - - rorl $5,%r13d - addl %ebx,%r12d - xorl %eax,%r15d - - rorl $11,%r14d - xorl %r10d,%r13d - addl %r15d,%r12d - - movl %ecx,%r15d - addl (%rbp),%r12d - xorl %ecx,%r14d - - xorl %edx,%r15d - rorl 
$6,%r13d - movl %edx,%ebx - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%ebx - addl %r12d,%r9d - addl %r12d,%ebx - - leaq 4(%rbp),%rbp - addl %r14d,%ebx - movl 60(%rsi),%r12d - movl %r9d,%r13d - movl %ebx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r10d,%edi - - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%edi - - movl %r12d,60(%rsp) - xorl %ebx,%r14d - andl %r9d,%edi - - rorl $5,%r13d - addl %eax,%r12d - xorl %r11d,%edi - - rorl $11,%r14d - xorl %r9d,%r13d - addl %edi,%r12d - - movl %ebx,%edi - addl (%rbp),%r12d - xorl %ebx,%r14d - - xorl %ecx,%edi - rorl $6,%r13d - movl %ecx,%eax - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%eax - addl %r12d,%r8d - addl %r12d,%eax - - leaq 20(%rbp),%rbp - jmp L$rounds_16_xx -.p2align 4 -L$rounds_16_xx: - movl 4(%rsp),%r13d - movl 56(%rsp),%r15d - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%eax - movl %r15d,%r14d - rorl $2,%r15d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%r15d - shrl $10,%r14d - - rorl $17,%r15d - xorl %r13d,%r12d - xorl %r14d,%r15d - addl 36(%rsp),%r12d - - addl 0(%rsp),%r12d - movl %r8d,%r13d - addl %r15d,%r12d - movl %eax,%r14d - rorl $14,%r13d - movl %r9d,%r15d - - xorl %r8d,%r13d - rorl $9,%r14d - xorl %r10d,%r15d - - movl %r12d,0(%rsp) - xorl %eax,%r14d - andl %r8d,%r15d - - rorl $5,%r13d - addl %r11d,%r12d - xorl %r10d,%r15d - - rorl $11,%r14d - xorl %r8d,%r13d - addl %r15d,%r12d - - movl %eax,%r15d - addl (%rbp),%r12d - xorl %eax,%r14d - - xorl %ebx,%r15d - rorl $6,%r13d - movl %ebx,%r11d - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%r11d - addl %r12d,%edx - addl %r12d,%r11d - - leaq 4(%rbp),%rbp - movl 8(%rsp),%r13d - movl 60(%rsp),%edi - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%r11d - movl %edi,%r14d - rorl $2,%edi - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%edi - shrl $10,%r14d - - rorl $17,%edi - xorl %r13d,%r12d - xorl %r14d,%edi - addl 40(%rsp),%r12d - - addl 
4(%rsp),%r12d - movl %edx,%r13d - addl %edi,%r12d - movl %r11d,%r14d - rorl $14,%r13d - movl %r8d,%edi - - xorl %edx,%r13d - rorl $9,%r14d - xorl %r9d,%edi - - movl %r12d,4(%rsp) - xorl %r11d,%r14d - andl %edx,%edi - - rorl $5,%r13d - addl %r10d,%r12d - xorl %r9d,%edi - - rorl $11,%r14d - xorl %edx,%r13d - addl %edi,%r12d - - movl %r11d,%edi - addl (%rbp),%r12d - xorl %r11d,%r14d - - xorl %eax,%edi - rorl $6,%r13d - movl %eax,%r10d - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%r10d - addl %r12d,%ecx - addl %r12d,%r10d - - leaq 4(%rbp),%rbp - movl 12(%rsp),%r13d - movl 0(%rsp),%r15d - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%r10d - movl %r15d,%r14d - rorl $2,%r15d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%r15d - shrl $10,%r14d - - rorl $17,%r15d - xorl %r13d,%r12d - xorl %r14d,%r15d - addl 44(%rsp),%r12d - - addl 8(%rsp),%r12d - movl %ecx,%r13d - addl %r15d,%r12d - movl %r10d,%r14d - rorl $14,%r13d - movl %edx,%r15d - - xorl %ecx,%r13d - rorl $9,%r14d - xorl %r8d,%r15d - - movl %r12d,8(%rsp) - xorl %r10d,%r14d - andl %ecx,%r15d - - rorl $5,%r13d - addl %r9d,%r12d - xorl %r8d,%r15d - - rorl $11,%r14d - xorl %ecx,%r13d - addl %r15d,%r12d - - movl %r10d,%r15d - addl (%rbp),%r12d - xorl %r10d,%r14d - - xorl %r11d,%r15d - rorl $6,%r13d - movl %r11d,%r9d - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%r9d - addl %r12d,%ebx - addl %r12d,%r9d - - leaq 4(%rbp),%rbp - movl 16(%rsp),%r13d - movl 4(%rsp),%edi - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%r9d - movl %edi,%r14d - rorl $2,%edi - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%edi - shrl $10,%r14d - - rorl $17,%edi - xorl %r13d,%r12d - xorl %r14d,%edi - addl 48(%rsp),%r12d - - addl 12(%rsp),%r12d - movl %ebx,%r13d - addl %edi,%r12d - movl %r9d,%r14d - rorl $14,%r13d - movl %ecx,%edi - - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%edi - - movl %r12d,12(%rsp) - xorl %r9d,%r14d - andl %ebx,%edi - - rorl $5,%r13d - 
addl %r8d,%r12d - xorl %edx,%edi - - rorl $11,%r14d - xorl %ebx,%r13d - addl %edi,%r12d - - movl %r9d,%edi - addl (%rbp),%r12d - xorl %r9d,%r14d - - xorl %r10d,%edi - rorl $6,%r13d - movl %r10d,%r8d - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%r8d - addl %r12d,%eax - addl %r12d,%r8d - - leaq 20(%rbp),%rbp - movl 20(%rsp),%r13d - movl 8(%rsp),%r15d - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%r8d - movl %r15d,%r14d - rorl $2,%r15d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%r15d - shrl $10,%r14d - - rorl $17,%r15d - xorl %r13d,%r12d - xorl %r14d,%r15d - addl 52(%rsp),%r12d - - addl 16(%rsp),%r12d - movl %eax,%r13d - addl %r15d,%r12d - movl %r8d,%r14d - rorl $14,%r13d - movl %ebx,%r15d - - xorl %eax,%r13d - rorl $9,%r14d - xorl %ecx,%r15d - - movl %r12d,16(%rsp) - xorl %r8d,%r14d - andl %eax,%r15d - - rorl $5,%r13d - addl %edx,%r12d - xorl %ecx,%r15d - - rorl $11,%r14d - xorl %eax,%r13d - addl %r15d,%r12d - - movl %r8d,%r15d - addl (%rbp),%r12d - xorl %r8d,%r14d - - xorl %r9d,%r15d - rorl $6,%r13d - movl %r9d,%edx - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%edx - addl %r12d,%r11d - addl %r12d,%edx - - leaq 4(%rbp),%rbp - movl 24(%rsp),%r13d - movl 12(%rsp),%edi - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%edx - movl %edi,%r14d - rorl $2,%edi - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%edi - shrl $10,%r14d - - rorl $17,%edi - xorl %r13d,%r12d - xorl %r14d,%edi - addl 56(%rsp),%r12d - - addl 20(%rsp),%r12d - movl %r11d,%r13d - addl %edi,%r12d - movl %edx,%r14d - rorl $14,%r13d - movl %eax,%edi - - xorl %r11d,%r13d - rorl $9,%r14d - xorl %ebx,%edi - - movl %r12d,20(%rsp) - xorl %edx,%r14d - andl %r11d,%edi - - rorl $5,%r13d - addl %ecx,%r12d - xorl %ebx,%edi - - rorl $11,%r14d - xorl %r11d,%r13d - addl %edi,%r12d - - movl %edx,%edi - addl (%rbp),%r12d - xorl %edx,%r14d - - xorl %r8d,%edi - rorl $6,%r13d - movl %r8d,%ecx - - andl %edi,%r15d - rorl $2,%r14d - 
addl %r13d,%r12d - - xorl %r15d,%ecx - addl %r12d,%r10d - addl %r12d,%ecx - - leaq 4(%rbp),%rbp - movl 28(%rsp),%r13d - movl 16(%rsp),%r15d - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%ecx - movl %r15d,%r14d - rorl $2,%r15d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%r15d - shrl $10,%r14d - - rorl $17,%r15d - xorl %r13d,%r12d - xorl %r14d,%r15d - addl 60(%rsp),%r12d - - addl 24(%rsp),%r12d - movl %r10d,%r13d - addl %r15d,%r12d - movl %ecx,%r14d - rorl $14,%r13d - movl %r11d,%r15d - - xorl %r10d,%r13d - rorl $9,%r14d - xorl %eax,%r15d - - movl %r12d,24(%rsp) - xorl %ecx,%r14d - andl %r10d,%r15d - - rorl $5,%r13d - addl %ebx,%r12d - xorl %eax,%r15d - - rorl $11,%r14d - xorl %r10d,%r13d - addl %r15d,%r12d - - movl %ecx,%r15d - addl (%rbp),%r12d - xorl %ecx,%r14d - - xorl %edx,%r15d - rorl $6,%r13d - movl %edx,%ebx - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%ebx - addl %r12d,%r9d - addl %r12d,%ebx - - leaq 4(%rbp),%rbp - movl 32(%rsp),%r13d - movl 20(%rsp),%edi - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%ebx - movl %edi,%r14d - rorl $2,%edi - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%edi - shrl $10,%r14d - - rorl $17,%edi - xorl %r13d,%r12d - xorl %r14d,%edi - addl 0(%rsp),%r12d - - addl 28(%rsp),%r12d - movl %r9d,%r13d - addl %edi,%r12d - movl %ebx,%r14d - rorl $14,%r13d - movl %r10d,%edi - - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%edi - - movl %r12d,28(%rsp) - xorl %ebx,%r14d - andl %r9d,%edi - - rorl $5,%r13d - addl %eax,%r12d - xorl %r11d,%edi - - rorl $11,%r14d - xorl %r9d,%r13d - addl %edi,%r12d - - movl %ebx,%edi - addl (%rbp),%r12d - xorl %ebx,%r14d - - xorl %ecx,%edi - rorl $6,%r13d - movl %ecx,%eax - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%eax - addl %r12d,%r8d - addl %r12d,%eax - - leaq 20(%rbp),%rbp - movl 36(%rsp),%r13d - movl 24(%rsp),%r15d - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%eax - movl %r15d,%r14d - rorl $2,%r15d - - 
xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%r15d - shrl $10,%r14d - - rorl $17,%r15d - xorl %r13d,%r12d - xorl %r14d,%r15d - addl 4(%rsp),%r12d - - addl 32(%rsp),%r12d - movl %r8d,%r13d - addl %r15d,%r12d - movl %eax,%r14d - rorl $14,%r13d - movl %r9d,%r15d - - xorl %r8d,%r13d - rorl $9,%r14d - xorl %r10d,%r15d - - movl %r12d,32(%rsp) - xorl %eax,%r14d - andl %r8d,%r15d - - rorl $5,%r13d - addl %r11d,%r12d - xorl %r10d,%r15d - - rorl $11,%r14d - xorl %r8d,%r13d - addl %r15d,%r12d - - movl %eax,%r15d - addl (%rbp),%r12d - xorl %eax,%r14d - - xorl %ebx,%r15d - rorl $6,%r13d - movl %ebx,%r11d - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%r11d - addl %r12d,%edx - addl %r12d,%r11d - - leaq 4(%rbp),%rbp - movl 40(%rsp),%r13d - movl 28(%rsp),%edi - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%r11d - movl %edi,%r14d - rorl $2,%edi - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%edi - shrl $10,%r14d - - rorl $17,%edi - xorl %r13d,%r12d - xorl %r14d,%edi - addl 8(%rsp),%r12d - - addl 36(%rsp),%r12d - movl %edx,%r13d - addl %edi,%r12d - movl %r11d,%r14d - rorl $14,%r13d - movl %r8d,%edi - - xorl %edx,%r13d - rorl $9,%r14d - xorl %r9d,%edi - - movl %r12d,36(%rsp) - xorl %r11d,%r14d - andl %edx,%edi - - rorl $5,%r13d - addl %r10d,%r12d - xorl %r9d,%edi - - rorl $11,%r14d - xorl %edx,%r13d - addl %edi,%r12d - - movl %r11d,%edi - addl (%rbp),%r12d - xorl %r11d,%r14d - - xorl %eax,%edi - rorl $6,%r13d - movl %eax,%r10d - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%r10d - addl %r12d,%ecx - addl %r12d,%r10d - - leaq 4(%rbp),%rbp - movl 44(%rsp),%r13d - movl 32(%rsp),%r15d - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%r10d - movl %r15d,%r14d - rorl $2,%r15d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%r15d - shrl $10,%r14d - - rorl $17,%r15d - xorl %r13d,%r12d - xorl %r14d,%r15d - addl 12(%rsp),%r12d - - addl 40(%rsp),%r12d - movl %ecx,%r13d - addl %r15d,%r12d - 
movl %r10d,%r14d - rorl $14,%r13d - movl %edx,%r15d - - xorl %ecx,%r13d - rorl $9,%r14d - xorl %r8d,%r15d - - movl %r12d,40(%rsp) - xorl %r10d,%r14d - andl %ecx,%r15d - - rorl $5,%r13d - addl %r9d,%r12d - xorl %r8d,%r15d - - rorl $11,%r14d - xorl %ecx,%r13d - addl %r15d,%r12d - - movl %r10d,%r15d - addl (%rbp),%r12d - xorl %r10d,%r14d - - xorl %r11d,%r15d - rorl $6,%r13d - movl %r11d,%r9d - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%r9d - addl %r12d,%ebx - addl %r12d,%r9d - - leaq 4(%rbp),%rbp - movl 48(%rsp),%r13d - movl 36(%rsp),%edi - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%r9d - movl %edi,%r14d - rorl $2,%edi - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%edi - shrl $10,%r14d - - rorl $17,%edi - xorl %r13d,%r12d - xorl %r14d,%edi - addl 16(%rsp),%r12d - - addl 44(%rsp),%r12d - movl %ebx,%r13d - addl %edi,%r12d - movl %r9d,%r14d - rorl $14,%r13d - movl %ecx,%edi - - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%edi - - movl %r12d,44(%rsp) - xorl %r9d,%r14d - andl %ebx,%edi - - rorl $5,%r13d - addl %r8d,%r12d - xorl %edx,%edi - - rorl $11,%r14d - xorl %ebx,%r13d - addl %edi,%r12d - - movl %r9d,%edi - addl (%rbp),%r12d - xorl %r9d,%r14d - - xorl %r10d,%edi - rorl $6,%r13d - movl %r10d,%r8d - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%r8d - addl %r12d,%eax - addl %r12d,%r8d - - leaq 20(%rbp),%rbp - movl 52(%rsp),%r13d - movl 40(%rsp),%r15d - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%r8d - movl %r15d,%r14d - rorl $2,%r15d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%r15d - shrl $10,%r14d - - rorl $17,%r15d - xorl %r13d,%r12d - xorl %r14d,%r15d - addl 20(%rsp),%r12d - - addl 48(%rsp),%r12d - movl %eax,%r13d - addl %r15d,%r12d - movl %r8d,%r14d - rorl $14,%r13d - movl %ebx,%r15d - - xorl %eax,%r13d - rorl $9,%r14d - xorl %ecx,%r15d - - movl %r12d,48(%rsp) - xorl %r8d,%r14d - andl %eax,%r15d - - rorl $5,%r13d - addl %edx,%r12d - xorl %ecx,%r15d - - rorl 
$11,%r14d - xorl %eax,%r13d - addl %r15d,%r12d - - movl %r8d,%r15d - addl (%rbp),%r12d - xorl %r8d,%r14d - - xorl %r9d,%r15d - rorl $6,%r13d - movl %r9d,%edx - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%edx - addl %r12d,%r11d - addl %r12d,%edx - - leaq 4(%rbp),%rbp - movl 56(%rsp),%r13d - movl 44(%rsp),%edi - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%edx - movl %edi,%r14d - rorl $2,%edi - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%edi - shrl $10,%r14d - - rorl $17,%edi - xorl %r13d,%r12d - xorl %r14d,%edi - addl 24(%rsp),%r12d - - addl 52(%rsp),%r12d - movl %r11d,%r13d - addl %edi,%r12d - movl %edx,%r14d - rorl $14,%r13d - movl %eax,%edi - - xorl %r11d,%r13d - rorl $9,%r14d - xorl %ebx,%edi - - movl %r12d,52(%rsp) - xorl %edx,%r14d - andl %r11d,%edi - - rorl $5,%r13d - addl %ecx,%r12d - xorl %ebx,%edi - - rorl $11,%r14d - xorl %r11d,%r13d - addl %edi,%r12d - - movl %edx,%edi - addl (%rbp),%r12d - xorl %edx,%r14d - - xorl %r8d,%edi - rorl $6,%r13d - movl %r8d,%ecx - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%ecx - addl %r12d,%r10d - addl %r12d,%ecx - - leaq 4(%rbp),%rbp - movl 60(%rsp),%r13d - movl 48(%rsp),%r15d - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%ecx - movl %r15d,%r14d - rorl $2,%r15d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%r15d - shrl $10,%r14d - - rorl $17,%r15d - xorl %r13d,%r12d - xorl %r14d,%r15d - addl 28(%rsp),%r12d - - addl 56(%rsp),%r12d - movl %r10d,%r13d - addl %r15d,%r12d - movl %ecx,%r14d - rorl $14,%r13d - movl %r11d,%r15d - - xorl %r10d,%r13d - rorl $9,%r14d - xorl %eax,%r15d - - movl %r12d,56(%rsp) - xorl %ecx,%r14d - andl %r10d,%r15d - - rorl $5,%r13d - addl %ebx,%r12d - xorl %eax,%r15d - - rorl $11,%r14d - xorl %r10d,%r13d - addl %r15d,%r12d - - movl %ecx,%r15d - addl (%rbp),%r12d - xorl %ecx,%r14d - - xorl %edx,%r15d - rorl $6,%r13d - movl %edx,%ebx - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%ebx - 
addl %r12d,%r9d - addl %r12d,%ebx - - leaq 4(%rbp),%rbp - movl 0(%rsp),%r13d - movl 52(%rsp),%edi - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%ebx - movl %edi,%r14d - rorl $2,%edi - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%edi - shrl $10,%r14d - - rorl $17,%edi - xorl %r13d,%r12d - xorl %r14d,%edi - addl 32(%rsp),%r12d - - addl 60(%rsp),%r12d - movl %r9d,%r13d - addl %edi,%r12d - movl %ebx,%r14d - rorl $14,%r13d - movl %r10d,%edi - - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%edi - - movl %r12d,60(%rsp) - xorl %ebx,%r14d - andl %r9d,%edi - - rorl $5,%r13d - addl %eax,%r12d - xorl %r11d,%edi - - rorl $11,%r14d - xorl %r9d,%r13d - addl %edi,%r12d - - movl %ebx,%edi - addl (%rbp),%r12d - xorl %ebx,%r14d - - xorl %ecx,%edi - rorl $6,%r13d - movl %ecx,%eax - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%eax - addl %r12d,%r8d - addl %r12d,%eax - - leaq 20(%rbp),%rbp - cmpb $0,3(%rbp) - jnz L$rounds_16_xx - - movq 64+0(%rsp),%rdi - addl %r14d,%eax - leaq 64(%rsi),%rsi - - addl 0(%rdi),%eax - addl 4(%rdi),%ebx - addl 8(%rdi),%ecx - addl 12(%rdi),%edx - addl 16(%rdi),%r8d - addl 20(%rdi),%r9d - addl 24(%rdi),%r10d - addl 28(%rdi),%r11d - - cmpq 64+16(%rsp),%rsi - - movl %eax,0(%rdi) - movl %ebx,4(%rdi) - movl %ecx,8(%rdi) - movl %edx,12(%rdi) - movl %r8d,16(%rdi) - movl %r9d,20(%rdi) - movl %r10d,24(%rdi) - movl %r11d,28(%rdi) - jb L$loop - - movq 88(%rsp),%rsi - - movq -48(%rsi),%r15 - - movq -40(%rsi),%r14 - - movq -32(%rsi),%r13 - - movq -24(%rsi),%r12 - - movq -16(%rsi),%rbp - - movq -8(%rsi),%rbx - - leaq (%rsi),%rsp - -L$epilogue: - .byte 0xf3,0xc3 - - -.p2align 6 - -K256: -.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 -.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 -.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 -.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 -.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 -.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 -.long 
0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 -.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 -.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc -.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc -.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da -.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da -.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 -.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 -.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 -.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 -.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 -.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 -.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 -.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 -.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 -.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 -.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 -.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 -.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 -.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 -.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 -.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 -.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 -.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 -.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 -.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - -.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f -.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f -.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff -.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff -.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 -.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 -.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 - -.p2align 6 -sha256_block_data_order_shaext: - -L$shaext_shortcut: - leaq K256+128(%rip),%rcx - movdqu (%rdi),%xmm1 - movdqu 
16(%rdi),%xmm2 - movdqa 512-128(%rcx),%xmm7 - - pshufd $0x1b,%xmm1,%xmm0 - pshufd $0xb1,%xmm1,%xmm1 - pshufd $0x1b,%xmm2,%xmm2 - movdqa %xmm7,%xmm8 -.byte 102,15,58,15,202,8 - punpcklqdq %xmm0,%xmm2 - jmp L$oop_shaext - -.p2align 4 -L$oop_shaext: - movdqu (%rsi),%xmm3 - movdqu 16(%rsi),%xmm4 - movdqu 32(%rsi),%xmm5 -.byte 102,15,56,0,223 - movdqu 48(%rsi),%xmm6 - - movdqa 0-128(%rcx),%xmm0 - paddd %xmm3,%xmm0 -.byte 102,15,56,0,231 - movdqa %xmm2,%xmm10 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - nop - movdqa %xmm1,%xmm9 -.byte 15,56,203,202 - - movdqa 32-128(%rcx),%xmm0 - paddd %xmm4,%xmm0 -.byte 102,15,56,0,239 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - leaq 64(%rsi),%rsi -.byte 15,56,204,220 -.byte 15,56,203,202 - - movdqa 64-128(%rcx),%xmm0 - paddd %xmm5,%xmm0 -.byte 102,15,56,0,247 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm6,%xmm7 -.byte 102,15,58,15,253,4 - nop - paddd %xmm7,%xmm3 -.byte 15,56,204,229 -.byte 15,56,203,202 - - movdqa 96-128(%rcx),%xmm0 - paddd %xmm6,%xmm0 -.byte 15,56,205,222 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm3,%xmm7 -.byte 102,15,58,15,254,4 - nop - paddd %xmm7,%xmm4 -.byte 15,56,204,238 -.byte 15,56,203,202 - movdqa 128-128(%rcx),%xmm0 - paddd %xmm3,%xmm0 -.byte 15,56,205,227 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm4,%xmm7 -.byte 102,15,58,15,251,4 - nop - paddd %xmm7,%xmm5 -.byte 15,56,204,243 -.byte 15,56,203,202 - movdqa 160-128(%rcx),%xmm0 - paddd %xmm4,%xmm0 -.byte 15,56,205,236 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm5,%xmm7 -.byte 102,15,58,15,252,4 - nop - paddd %xmm7,%xmm6 -.byte 15,56,204,220 -.byte 15,56,203,202 - movdqa 192-128(%rcx),%xmm0 - paddd %xmm5,%xmm0 -.byte 15,56,205,245 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm6,%xmm7 -.byte 102,15,58,15,253,4 - nop - paddd %xmm7,%xmm3 -.byte 15,56,204,229 -.byte 15,56,203,202 - movdqa 224-128(%rcx),%xmm0 - paddd %xmm6,%xmm0 -.byte 15,56,205,222 -.byte 15,56,203,209 - 
pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm3,%xmm7 -.byte 102,15,58,15,254,4 - nop - paddd %xmm7,%xmm4 -.byte 15,56,204,238 -.byte 15,56,203,202 - movdqa 256-128(%rcx),%xmm0 - paddd %xmm3,%xmm0 -.byte 15,56,205,227 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm4,%xmm7 -.byte 102,15,58,15,251,4 - nop - paddd %xmm7,%xmm5 -.byte 15,56,204,243 -.byte 15,56,203,202 - movdqa 288-128(%rcx),%xmm0 - paddd %xmm4,%xmm0 -.byte 15,56,205,236 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm5,%xmm7 -.byte 102,15,58,15,252,4 - nop - paddd %xmm7,%xmm6 -.byte 15,56,204,220 -.byte 15,56,203,202 - movdqa 320-128(%rcx),%xmm0 - paddd %xmm5,%xmm0 -.byte 15,56,205,245 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm6,%xmm7 -.byte 102,15,58,15,253,4 - nop - paddd %xmm7,%xmm3 -.byte 15,56,204,229 -.byte 15,56,203,202 - movdqa 352-128(%rcx),%xmm0 - paddd %xmm6,%xmm0 -.byte 15,56,205,222 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm3,%xmm7 -.byte 102,15,58,15,254,4 - nop - paddd %xmm7,%xmm4 -.byte 15,56,204,238 -.byte 15,56,203,202 - movdqa 384-128(%rcx),%xmm0 - paddd %xmm3,%xmm0 -.byte 15,56,205,227 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm4,%xmm7 -.byte 102,15,58,15,251,4 - nop - paddd %xmm7,%xmm5 -.byte 15,56,204,243 -.byte 15,56,203,202 - movdqa 416-128(%rcx),%xmm0 - paddd %xmm4,%xmm0 -.byte 15,56,205,236 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm5,%xmm7 -.byte 102,15,58,15,252,4 -.byte 15,56,203,202 - paddd %xmm7,%xmm6 - - movdqa 448-128(%rcx),%xmm0 - paddd %xmm5,%xmm0 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 -.byte 15,56,205,245 - movdqa %xmm8,%xmm7 -.byte 15,56,203,202 - - movdqa 480-128(%rcx),%xmm0 - paddd %xmm6,%xmm0 - nop -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - decq %rdx - nop -.byte 15,56,203,202 - - paddd %xmm10,%xmm2 - paddd %xmm9,%xmm1 - jnz L$oop_shaext - - pshufd $0xb1,%xmm2,%xmm2 - pshufd $0x1b,%xmm1,%xmm7 - pshufd $0xb1,%xmm1,%xmm1 - punpckhqdq %xmm2,%xmm1 -.byte 
102,15,58,15,215,8 - - movdqu %xmm1,(%rdi) - movdqu %xmm2,16(%rdi) - .byte 0xf3,0xc3 - - - -.p2align 6 -sha256_block_data_order_ssse3: - -L$ssse3_shortcut: - movq %rsp,%rax - - pushq %rbx - - pushq %rbp - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - shlq $4,%rdx - subq $96,%rsp - leaq (%rsi,%rdx,4),%rdx - andq $-64,%rsp - movq %rdi,64+0(%rsp) - movq %rsi,64+8(%rsp) - movq %rdx,64+16(%rsp) - movq %rax,88(%rsp) - -L$prologue_ssse3: - - movl 0(%rdi),%eax - movl 4(%rdi),%ebx - movl 8(%rdi),%ecx - movl 12(%rdi),%edx - movl 16(%rdi),%r8d - movl 20(%rdi),%r9d - movl 24(%rdi),%r10d - movl 28(%rdi),%r11d - - - jmp L$loop_ssse3 -.p2align 4 -L$loop_ssse3: - movdqa K256+512(%rip),%xmm7 - movdqu 0(%rsi),%xmm0 - movdqu 16(%rsi),%xmm1 - movdqu 32(%rsi),%xmm2 -.byte 102,15,56,0,199 - movdqu 48(%rsi),%xmm3 - leaq K256(%rip),%rbp -.byte 102,15,56,0,207 - movdqa 0(%rbp),%xmm4 - movdqa 32(%rbp),%xmm5 -.byte 102,15,56,0,215 - paddd %xmm0,%xmm4 - movdqa 64(%rbp),%xmm6 -.byte 102,15,56,0,223 - movdqa 96(%rbp),%xmm7 - paddd %xmm1,%xmm5 - paddd %xmm2,%xmm6 - paddd %xmm3,%xmm7 - movdqa %xmm4,0(%rsp) - movl %eax,%r14d - movdqa %xmm5,16(%rsp) - movl %ebx,%edi - movdqa %xmm6,32(%rsp) - xorl %ecx,%edi - movdqa %xmm7,48(%rsp) - movl %r8d,%r13d - jmp L$ssse3_00_47 - -.p2align 4 -L$ssse3_00_47: - subq $-128,%rbp - rorl $14,%r13d - movdqa %xmm1,%xmm4 - movl %r14d,%eax - movl %r9d,%r12d - movdqa %xmm3,%xmm7 - rorl $9,%r14d - xorl %r8d,%r13d - xorl %r10d,%r12d - rorl $5,%r13d - xorl %eax,%r14d -.byte 102,15,58,15,224,4 - andl %r8d,%r12d - xorl %r8d,%r13d -.byte 102,15,58,15,250,4 - addl 0(%rsp),%r11d - movl %eax,%r15d - xorl %r10d,%r12d - rorl $11,%r14d - movdqa %xmm4,%xmm5 - xorl %ebx,%r15d - addl %r12d,%r11d - movdqa %xmm4,%xmm6 - rorl $6,%r13d - andl %r15d,%edi - psrld $3,%xmm4 - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - paddd %xmm7,%xmm0 - rorl $2,%r14d - addl %r11d,%edx - psrld $7,%xmm6 - addl %edi,%r11d - movl %edx,%r13d - pshufd $250,%xmm3,%xmm7 - addl %r11d,%r14d 
- rorl $14,%r13d - pslld $14,%xmm5 - movl %r14d,%r11d - movl %r8d,%r12d - pxor %xmm6,%xmm4 - rorl $9,%r14d - xorl %edx,%r13d - xorl %r9d,%r12d - rorl $5,%r13d - psrld $11,%xmm6 - xorl %r11d,%r14d - pxor %xmm5,%xmm4 - andl %edx,%r12d - xorl %edx,%r13d - pslld $11,%xmm5 - addl 4(%rsp),%r10d - movl %r11d,%edi - pxor %xmm6,%xmm4 - xorl %r9d,%r12d - rorl $11,%r14d - movdqa %xmm7,%xmm6 - xorl %eax,%edi - addl %r12d,%r10d - pxor %xmm5,%xmm4 - rorl $6,%r13d - andl %edi,%r15d - xorl %r11d,%r14d - psrld $10,%xmm7 - addl %r13d,%r10d - xorl %eax,%r15d - paddd %xmm4,%xmm0 - rorl $2,%r14d - addl %r10d,%ecx - psrlq $17,%xmm6 - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - pxor %xmm6,%xmm7 - rorl $14,%r13d - movl %r14d,%r10d - movl %edx,%r12d - rorl $9,%r14d - psrlq $2,%xmm6 - xorl %ecx,%r13d - xorl %r8d,%r12d - pxor %xmm6,%xmm7 - rorl $5,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - pshufd $128,%xmm7,%xmm7 - xorl %ecx,%r13d - addl 8(%rsp),%r9d - movl %r10d,%r15d - psrldq $8,%xmm7 - xorl %r8d,%r12d - rorl $11,%r14d - xorl %r11d,%r15d - addl %r12d,%r9d - rorl $6,%r13d - paddd %xmm7,%xmm0 - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - pshufd $80,%xmm0,%xmm7 - xorl %r11d,%edi - rorl $2,%r14d - addl %r9d,%ebx - movdqa %xmm7,%xmm6 - addl %edi,%r9d - movl %ebx,%r13d - psrld $10,%xmm7 - addl %r9d,%r14d - rorl $14,%r13d - psrlq $17,%xmm6 - movl %r14d,%r9d - movl %ecx,%r12d - pxor %xmm6,%xmm7 - rorl $9,%r14d - xorl %ebx,%r13d - xorl %edx,%r12d - rorl $5,%r13d - xorl %r9d,%r14d - psrlq $2,%xmm6 - andl %ebx,%r12d - xorl %ebx,%r13d - addl 12(%rsp),%r8d - pxor %xmm6,%xmm7 - movl %r9d,%edi - xorl %edx,%r12d - rorl $11,%r14d - pshufd $8,%xmm7,%xmm7 - xorl %r10d,%edi - addl %r12d,%r8d - movdqa 0(%rbp),%xmm6 - rorl $6,%r13d - andl %edi,%r15d - pslldq $8,%xmm7 - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - paddd %xmm7,%xmm0 - rorl $2,%r14d - addl %r8d,%eax - addl %r15d,%r8d - paddd %xmm0,%xmm6 - movl %eax,%r13d - addl %r8d,%r14d - movdqa %xmm6,0(%rsp) - rorl $14,%r13d - 
movdqa %xmm2,%xmm4 - movl %r14d,%r8d - movl %ebx,%r12d - movdqa %xmm0,%xmm7 - rorl $9,%r14d - xorl %eax,%r13d - xorl %ecx,%r12d - rorl $5,%r13d - xorl %r8d,%r14d -.byte 102,15,58,15,225,4 - andl %eax,%r12d - xorl %eax,%r13d -.byte 102,15,58,15,251,4 - addl 16(%rsp),%edx - movl %r8d,%r15d - xorl %ecx,%r12d - rorl $11,%r14d - movdqa %xmm4,%xmm5 - xorl %r9d,%r15d - addl %r12d,%edx - movdqa %xmm4,%xmm6 - rorl $6,%r13d - andl %r15d,%edi - psrld $3,%xmm4 - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - paddd %xmm7,%xmm1 - rorl $2,%r14d - addl %edx,%r11d - psrld $7,%xmm6 - addl %edi,%edx - movl %r11d,%r13d - pshufd $250,%xmm0,%xmm7 - addl %edx,%r14d - rorl $14,%r13d - pslld $14,%xmm5 - movl %r14d,%edx - movl %eax,%r12d - pxor %xmm6,%xmm4 - rorl $9,%r14d - xorl %r11d,%r13d - xorl %ebx,%r12d - rorl $5,%r13d - psrld $11,%xmm6 - xorl %edx,%r14d - pxor %xmm5,%xmm4 - andl %r11d,%r12d - xorl %r11d,%r13d - pslld $11,%xmm5 - addl 20(%rsp),%ecx - movl %edx,%edi - pxor %xmm6,%xmm4 - xorl %ebx,%r12d - rorl $11,%r14d - movdqa %xmm7,%xmm6 - xorl %r8d,%edi - addl %r12d,%ecx - pxor %xmm5,%xmm4 - rorl $6,%r13d - andl %edi,%r15d - xorl %edx,%r14d - psrld $10,%xmm7 - addl %r13d,%ecx - xorl %r8d,%r15d - paddd %xmm4,%xmm1 - rorl $2,%r14d - addl %ecx,%r10d - psrlq $17,%xmm6 - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - pxor %xmm6,%xmm7 - rorl $14,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - rorl $9,%r14d - psrlq $2,%xmm6 - xorl %r10d,%r13d - xorl %eax,%r12d - pxor %xmm6,%xmm7 - rorl $5,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - pshufd $128,%xmm7,%xmm7 - xorl %r10d,%r13d - addl 24(%rsp),%ebx - movl %ecx,%r15d - psrldq $8,%xmm7 - xorl %eax,%r12d - rorl $11,%r14d - xorl %edx,%r15d - addl %r12d,%ebx - rorl $6,%r13d - paddd %xmm7,%xmm1 - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - pshufd $80,%xmm1,%xmm7 - xorl %edx,%edi - rorl $2,%r14d - addl %ebx,%r9d - movdqa %xmm7,%xmm6 - addl %edi,%ebx - movl %r9d,%r13d - psrld $10,%xmm7 - addl %ebx,%r14d - rorl $14,%r13d - psrlq 
$17,%xmm6 - movl %r14d,%ebx - movl %r10d,%r12d - pxor %xmm6,%xmm7 - rorl $9,%r14d - xorl %r9d,%r13d - xorl %r11d,%r12d - rorl $5,%r13d - xorl %ebx,%r14d - psrlq $2,%xmm6 - andl %r9d,%r12d - xorl %r9d,%r13d - addl 28(%rsp),%eax - pxor %xmm6,%xmm7 - movl %ebx,%edi - xorl %r11d,%r12d - rorl $11,%r14d - pshufd $8,%xmm7,%xmm7 - xorl %ecx,%edi - addl %r12d,%eax - movdqa 32(%rbp),%xmm6 - rorl $6,%r13d - andl %edi,%r15d - pslldq $8,%xmm7 - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - paddd %xmm7,%xmm1 - rorl $2,%r14d - addl %eax,%r8d - addl %r15d,%eax - paddd %xmm1,%xmm6 - movl %r8d,%r13d - addl %eax,%r14d - movdqa %xmm6,16(%rsp) - rorl $14,%r13d - movdqa %xmm3,%xmm4 - movl %r14d,%eax - movl %r9d,%r12d - movdqa %xmm1,%xmm7 - rorl $9,%r14d - xorl %r8d,%r13d - xorl %r10d,%r12d - rorl $5,%r13d - xorl %eax,%r14d -.byte 102,15,58,15,226,4 - andl %r8d,%r12d - xorl %r8d,%r13d -.byte 102,15,58,15,248,4 - addl 32(%rsp),%r11d - movl %eax,%r15d - xorl %r10d,%r12d - rorl $11,%r14d - movdqa %xmm4,%xmm5 - xorl %ebx,%r15d - addl %r12d,%r11d - movdqa %xmm4,%xmm6 - rorl $6,%r13d - andl %r15d,%edi - psrld $3,%xmm4 - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - paddd %xmm7,%xmm2 - rorl $2,%r14d - addl %r11d,%edx - psrld $7,%xmm6 - addl %edi,%r11d - movl %edx,%r13d - pshufd $250,%xmm1,%xmm7 - addl %r11d,%r14d - rorl $14,%r13d - pslld $14,%xmm5 - movl %r14d,%r11d - movl %r8d,%r12d - pxor %xmm6,%xmm4 - rorl $9,%r14d - xorl %edx,%r13d - xorl %r9d,%r12d - rorl $5,%r13d - psrld $11,%xmm6 - xorl %r11d,%r14d - pxor %xmm5,%xmm4 - andl %edx,%r12d - xorl %edx,%r13d - pslld $11,%xmm5 - addl 36(%rsp),%r10d - movl %r11d,%edi - pxor %xmm6,%xmm4 - xorl %r9d,%r12d - rorl $11,%r14d - movdqa %xmm7,%xmm6 - xorl %eax,%edi - addl %r12d,%r10d - pxor %xmm5,%xmm4 - rorl $6,%r13d - andl %edi,%r15d - xorl %r11d,%r14d - psrld $10,%xmm7 - addl %r13d,%r10d - xorl %eax,%r15d - paddd %xmm4,%xmm2 - rorl $2,%r14d - addl %r10d,%ecx - psrlq $17,%xmm6 - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - 
pxor %xmm6,%xmm7 - rorl $14,%r13d - movl %r14d,%r10d - movl %edx,%r12d - rorl $9,%r14d - psrlq $2,%xmm6 - xorl %ecx,%r13d - xorl %r8d,%r12d - pxor %xmm6,%xmm7 - rorl $5,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - pshufd $128,%xmm7,%xmm7 - xorl %ecx,%r13d - addl 40(%rsp),%r9d - movl %r10d,%r15d - psrldq $8,%xmm7 - xorl %r8d,%r12d - rorl $11,%r14d - xorl %r11d,%r15d - addl %r12d,%r9d - rorl $6,%r13d - paddd %xmm7,%xmm2 - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - pshufd $80,%xmm2,%xmm7 - xorl %r11d,%edi - rorl $2,%r14d - addl %r9d,%ebx - movdqa %xmm7,%xmm6 - addl %edi,%r9d - movl %ebx,%r13d - psrld $10,%xmm7 - addl %r9d,%r14d - rorl $14,%r13d - psrlq $17,%xmm6 - movl %r14d,%r9d - movl %ecx,%r12d - pxor %xmm6,%xmm7 - rorl $9,%r14d - xorl %ebx,%r13d - xorl %edx,%r12d - rorl $5,%r13d - xorl %r9d,%r14d - psrlq $2,%xmm6 - andl %ebx,%r12d - xorl %ebx,%r13d - addl 44(%rsp),%r8d - pxor %xmm6,%xmm7 - movl %r9d,%edi - xorl %edx,%r12d - rorl $11,%r14d - pshufd $8,%xmm7,%xmm7 - xorl %r10d,%edi - addl %r12d,%r8d - movdqa 64(%rbp),%xmm6 - rorl $6,%r13d - andl %edi,%r15d - pslldq $8,%xmm7 - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - paddd %xmm7,%xmm2 - rorl $2,%r14d - addl %r8d,%eax - addl %r15d,%r8d - paddd %xmm2,%xmm6 - movl %eax,%r13d - addl %r8d,%r14d - movdqa %xmm6,32(%rsp) - rorl $14,%r13d - movdqa %xmm0,%xmm4 - movl %r14d,%r8d - movl %ebx,%r12d - movdqa %xmm2,%xmm7 - rorl $9,%r14d - xorl %eax,%r13d - xorl %ecx,%r12d - rorl $5,%r13d - xorl %r8d,%r14d -.byte 102,15,58,15,227,4 - andl %eax,%r12d - xorl %eax,%r13d -.byte 102,15,58,15,249,4 - addl 48(%rsp),%edx - movl %r8d,%r15d - xorl %ecx,%r12d - rorl $11,%r14d - movdqa %xmm4,%xmm5 - xorl %r9d,%r15d - addl %r12d,%edx - movdqa %xmm4,%xmm6 - rorl $6,%r13d - andl %r15d,%edi - psrld $3,%xmm4 - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - paddd %xmm7,%xmm3 - rorl $2,%r14d - addl %edx,%r11d - psrld $7,%xmm6 - addl %edi,%edx - movl %r11d,%r13d - pshufd $250,%xmm2,%xmm7 - addl %edx,%r14d - rorl 
$14,%r13d - pslld $14,%xmm5 - movl %r14d,%edx - movl %eax,%r12d - pxor %xmm6,%xmm4 - rorl $9,%r14d - xorl %r11d,%r13d - xorl %ebx,%r12d - rorl $5,%r13d - psrld $11,%xmm6 - xorl %edx,%r14d - pxor %xmm5,%xmm4 - andl %r11d,%r12d - xorl %r11d,%r13d - pslld $11,%xmm5 - addl 52(%rsp),%ecx - movl %edx,%edi - pxor %xmm6,%xmm4 - xorl %ebx,%r12d - rorl $11,%r14d - movdqa %xmm7,%xmm6 - xorl %r8d,%edi - addl %r12d,%ecx - pxor %xmm5,%xmm4 - rorl $6,%r13d - andl %edi,%r15d - xorl %edx,%r14d - psrld $10,%xmm7 - addl %r13d,%ecx - xorl %r8d,%r15d - paddd %xmm4,%xmm3 - rorl $2,%r14d - addl %ecx,%r10d - psrlq $17,%xmm6 - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - pxor %xmm6,%xmm7 - rorl $14,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - rorl $9,%r14d - psrlq $2,%xmm6 - xorl %r10d,%r13d - xorl %eax,%r12d - pxor %xmm6,%xmm7 - rorl $5,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - pshufd $128,%xmm7,%xmm7 - xorl %r10d,%r13d - addl 56(%rsp),%ebx - movl %ecx,%r15d - psrldq $8,%xmm7 - xorl %eax,%r12d - rorl $11,%r14d - xorl %edx,%r15d - addl %r12d,%ebx - rorl $6,%r13d - paddd %xmm7,%xmm3 - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - pshufd $80,%xmm3,%xmm7 - xorl %edx,%edi - rorl $2,%r14d - addl %ebx,%r9d - movdqa %xmm7,%xmm6 - addl %edi,%ebx - movl %r9d,%r13d - psrld $10,%xmm7 - addl %ebx,%r14d - rorl $14,%r13d - psrlq $17,%xmm6 - movl %r14d,%ebx - movl %r10d,%r12d - pxor %xmm6,%xmm7 - rorl $9,%r14d - xorl %r9d,%r13d - xorl %r11d,%r12d - rorl $5,%r13d - xorl %ebx,%r14d - psrlq $2,%xmm6 - andl %r9d,%r12d - xorl %r9d,%r13d - addl 60(%rsp),%eax - pxor %xmm6,%xmm7 - movl %ebx,%edi - xorl %r11d,%r12d - rorl $11,%r14d - pshufd $8,%xmm7,%xmm7 - xorl %ecx,%edi - addl %r12d,%eax - movdqa 96(%rbp),%xmm6 - rorl $6,%r13d - andl %edi,%r15d - pslldq $8,%xmm7 - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - paddd %xmm7,%xmm3 - rorl $2,%r14d - addl %eax,%r8d - addl %r15d,%eax - paddd %xmm3,%xmm6 - movl %r8d,%r13d - addl %eax,%r14d - movdqa %xmm6,48(%rsp) - cmpb $0,131(%rbp) - jne 
L$ssse3_00_47 - rorl $14,%r13d - movl %r14d,%eax - movl %r9d,%r12d - rorl $9,%r14d - xorl %r8d,%r13d - xorl %r10d,%r12d - rorl $5,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - xorl %r8d,%r13d - addl 0(%rsp),%r11d - movl %eax,%r15d - xorl %r10d,%r12d - rorl $11,%r14d - xorl %ebx,%r15d - addl %r12d,%r11d - rorl $6,%r13d - andl %r15d,%edi - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - rorl $2,%r14d - addl %r11d,%edx - addl %edi,%r11d - movl %edx,%r13d - addl %r11d,%r14d - rorl $14,%r13d - movl %r14d,%r11d - movl %r8d,%r12d - rorl $9,%r14d - xorl %edx,%r13d - xorl %r9d,%r12d - rorl $5,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - xorl %edx,%r13d - addl 4(%rsp),%r10d - movl %r11d,%edi - xorl %r9d,%r12d - rorl $11,%r14d - xorl %eax,%edi - addl %r12d,%r10d - rorl $6,%r13d - andl %edi,%r15d - xorl %r11d,%r14d - addl %r13d,%r10d - xorl %eax,%r15d - rorl $2,%r14d - addl %r10d,%ecx - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - rorl $14,%r13d - movl %r14d,%r10d - movl %edx,%r12d - rorl $9,%r14d - xorl %ecx,%r13d - xorl %r8d,%r12d - rorl $5,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - xorl %ecx,%r13d - addl 8(%rsp),%r9d - movl %r10d,%r15d - xorl %r8d,%r12d - rorl $11,%r14d - xorl %r11d,%r15d - addl %r12d,%r9d - rorl $6,%r13d - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%edi - rorl $2,%r14d - addl %r9d,%ebx - addl %edi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - rorl $14,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - rorl $9,%r14d - xorl %ebx,%r13d - xorl %edx,%r12d - rorl $5,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - xorl %ebx,%r13d - addl 12(%rsp),%r8d - movl %r9d,%edi - xorl %edx,%r12d - rorl $11,%r14d - xorl %r10d,%edi - addl %r12d,%r8d - rorl $6,%r13d - andl %edi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - rorl $2,%r14d - addl %r8d,%eax - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - rorl $14,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - rorl $9,%r14d - xorl %eax,%r13d - xorl %ecx,%r12d - rorl $5,%r13d - xorl 
%r8d,%r14d - andl %eax,%r12d - xorl %eax,%r13d - addl 16(%rsp),%edx - movl %r8d,%r15d - xorl %ecx,%r12d - rorl $11,%r14d - xorl %r9d,%r15d - addl %r12d,%edx - rorl $6,%r13d - andl %r15d,%edi - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - rorl $2,%r14d - addl %edx,%r11d - addl %edi,%edx - movl %r11d,%r13d - addl %edx,%r14d - rorl $14,%r13d - movl %r14d,%edx - movl %eax,%r12d - rorl $9,%r14d - xorl %r11d,%r13d - xorl %ebx,%r12d - rorl $5,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - xorl %r11d,%r13d - addl 20(%rsp),%ecx - movl %edx,%edi - xorl %ebx,%r12d - rorl $11,%r14d - xorl %r8d,%edi - addl %r12d,%ecx - rorl $6,%r13d - andl %edi,%r15d - xorl %edx,%r14d - addl %r13d,%ecx - xorl %r8d,%r15d - rorl $2,%r14d - addl %ecx,%r10d - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - rorl $14,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - rorl $9,%r14d - xorl %r10d,%r13d - xorl %eax,%r12d - rorl $5,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - xorl %r10d,%r13d - addl 24(%rsp),%ebx - movl %ecx,%r15d - xorl %eax,%r12d - rorl $11,%r14d - xorl %edx,%r15d - addl %r12d,%ebx - rorl $6,%r13d - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%edi - rorl $2,%r14d - addl %ebx,%r9d - addl %edi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - rorl $14,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - rorl $9,%r14d - xorl %r9d,%r13d - xorl %r11d,%r12d - rorl $5,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - xorl %r9d,%r13d - addl 28(%rsp),%eax - movl %ebx,%edi - xorl %r11d,%r12d - rorl $11,%r14d - xorl %ecx,%edi - addl %r12d,%eax - rorl $6,%r13d - andl %edi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - rorl $2,%r14d - addl %eax,%r8d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - rorl $14,%r13d - movl %r14d,%eax - movl %r9d,%r12d - rorl $9,%r14d - xorl %r8d,%r13d - xorl %r10d,%r12d - rorl $5,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - xorl %r8d,%r13d - addl 32(%rsp),%r11d - movl %eax,%r15d - xorl %r10d,%r12d - rorl $11,%r14d - xorl %ebx,%r15d - addl 
%r12d,%r11d - rorl $6,%r13d - andl %r15d,%edi - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - rorl $2,%r14d - addl %r11d,%edx - addl %edi,%r11d - movl %edx,%r13d - addl %r11d,%r14d - rorl $14,%r13d - movl %r14d,%r11d - movl %r8d,%r12d - rorl $9,%r14d - xorl %edx,%r13d - xorl %r9d,%r12d - rorl $5,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - xorl %edx,%r13d - addl 36(%rsp),%r10d - movl %r11d,%edi - xorl %r9d,%r12d - rorl $11,%r14d - xorl %eax,%edi - addl %r12d,%r10d - rorl $6,%r13d - andl %edi,%r15d - xorl %r11d,%r14d - addl %r13d,%r10d - xorl %eax,%r15d - rorl $2,%r14d - addl %r10d,%ecx - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - rorl $14,%r13d - movl %r14d,%r10d - movl %edx,%r12d - rorl $9,%r14d - xorl %ecx,%r13d - xorl %r8d,%r12d - rorl $5,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - xorl %ecx,%r13d - addl 40(%rsp),%r9d - movl %r10d,%r15d - xorl %r8d,%r12d - rorl $11,%r14d - xorl %r11d,%r15d - addl %r12d,%r9d - rorl $6,%r13d - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%edi - rorl $2,%r14d - addl %r9d,%ebx - addl %edi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - rorl $14,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - rorl $9,%r14d - xorl %ebx,%r13d - xorl %edx,%r12d - rorl $5,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - xorl %ebx,%r13d - addl 44(%rsp),%r8d - movl %r9d,%edi - xorl %edx,%r12d - rorl $11,%r14d - xorl %r10d,%edi - addl %r12d,%r8d - rorl $6,%r13d - andl %edi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - rorl $2,%r14d - addl %r8d,%eax - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - rorl $14,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - rorl $9,%r14d - xorl %eax,%r13d - xorl %ecx,%r12d - rorl $5,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - xorl %eax,%r13d - addl 48(%rsp),%edx - movl %r8d,%r15d - xorl %ecx,%r12d - rorl $11,%r14d - xorl %r9d,%r15d - addl %r12d,%edx - rorl $6,%r13d - andl %r15d,%edi - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - rorl $2,%r14d - addl %edx,%r11d - addl 
%edi,%edx - movl %r11d,%r13d - addl %edx,%r14d - rorl $14,%r13d - movl %r14d,%edx - movl %eax,%r12d - rorl $9,%r14d - xorl %r11d,%r13d - xorl %ebx,%r12d - rorl $5,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - xorl %r11d,%r13d - addl 52(%rsp),%ecx - movl %edx,%edi - xorl %ebx,%r12d - rorl $11,%r14d - xorl %r8d,%edi - addl %r12d,%ecx - rorl $6,%r13d - andl %edi,%r15d - xorl %edx,%r14d - addl %r13d,%ecx - xorl %r8d,%r15d - rorl $2,%r14d - addl %ecx,%r10d - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - rorl $14,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - rorl $9,%r14d - xorl %r10d,%r13d - xorl %eax,%r12d - rorl $5,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - xorl %r10d,%r13d - addl 56(%rsp),%ebx - movl %ecx,%r15d - xorl %eax,%r12d - rorl $11,%r14d - xorl %edx,%r15d - addl %r12d,%ebx - rorl $6,%r13d - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%edi - rorl $2,%r14d - addl %ebx,%r9d - addl %edi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - rorl $14,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - rorl $9,%r14d - xorl %r9d,%r13d - xorl %r11d,%r12d - rorl $5,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - xorl %r9d,%r13d - addl 60(%rsp),%eax - movl %ebx,%edi - xorl %r11d,%r12d - rorl $11,%r14d - xorl %ecx,%edi - addl %r12d,%eax - rorl $6,%r13d - andl %edi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - rorl $2,%r14d - addl %eax,%r8d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - movq 64+0(%rsp),%rdi - movl %r14d,%eax - - addl 0(%rdi),%eax - leaq 64(%rsi),%rsi - addl 4(%rdi),%ebx - addl 8(%rdi),%ecx - addl 12(%rdi),%edx - addl 16(%rdi),%r8d - addl 20(%rdi),%r9d - addl 24(%rdi),%r10d - addl 28(%rdi),%r11d - - cmpq 64+16(%rsp),%rsi - - movl %eax,0(%rdi) - movl %ebx,4(%rdi) - movl %ecx,8(%rdi) - movl %edx,12(%rdi) - movl %r8d,16(%rdi) - movl %r9d,20(%rdi) - movl %r10d,24(%rdi) - movl %r11d,28(%rdi) - jb L$loop_ssse3 - - movq 88(%rsp),%rsi - - movq -48(%rsi),%r15 - - movq -40(%rsi),%r14 - - movq -32(%rsi),%r13 - - movq -24(%rsi),%r12 - - 
movq -16(%rsi),%rbp - - movq -8(%rsi),%rbx - - leaq (%rsi),%rsp - -L$epilogue_ssse3: - .byte 0xf3,0xc3 - - - -.p2align 6 -sha256_block_data_order_avx: - -L$avx_shortcut: - movq %rsp,%rax - - pushq %rbx - - pushq %rbp - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - shlq $4,%rdx - subq $96,%rsp - leaq (%rsi,%rdx,4),%rdx - andq $-64,%rsp - movq %rdi,64+0(%rsp) - movq %rsi,64+8(%rsp) - movq %rdx,64+16(%rsp) - movq %rax,88(%rsp) - -L$prologue_avx: - - vzeroupper - movl 0(%rdi),%eax - movl 4(%rdi),%ebx - movl 8(%rdi),%ecx - movl 12(%rdi),%edx - movl 16(%rdi),%r8d - movl 20(%rdi),%r9d - movl 24(%rdi),%r10d - movl 28(%rdi),%r11d - vmovdqa K256+512+32(%rip),%xmm8 - vmovdqa K256+512+64(%rip),%xmm9 - jmp L$loop_avx -.p2align 4 -L$loop_avx: - vmovdqa K256+512(%rip),%xmm7 - vmovdqu 0(%rsi),%xmm0 - vmovdqu 16(%rsi),%xmm1 - vmovdqu 32(%rsi),%xmm2 - vmovdqu 48(%rsi),%xmm3 - vpshufb %xmm7,%xmm0,%xmm0 - leaq K256(%rip),%rbp - vpshufb %xmm7,%xmm1,%xmm1 - vpshufb %xmm7,%xmm2,%xmm2 - vpaddd 0(%rbp),%xmm0,%xmm4 - vpshufb %xmm7,%xmm3,%xmm3 - vpaddd 32(%rbp),%xmm1,%xmm5 - vpaddd 64(%rbp),%xmm2,%xmm6 - vpaddd 96(%rbp),%xmm3,%xmm7 - vmovdqa %xmm4,0(%rsp) - movl %eax,%r14d - vmovdqa %xmm5,16(%rsp) - movl %ebx,%edi - vmovdqa %xmm6,32(%rsp) - xorl %ecx,%edi - vmovdqa %xmm7,48(%rsp) - movl %r8d,%r13d - jmp L$avx_00_47 - -.p2align 4 -L$avx_00_47: - subq $-128,%rbp - vpalignr $4,%xmm0,%xmm1,%xmm4 - shrdl $14,%r13d,%r13d - movl %r14d,%eax - movl %r9d,%r12d - vpalignr $4,%xmm2,%xmm3,%xmm7 - shrdl $9,%r14d,%r14d - xorl %r8d,%r13d - xorl %r10d,%r12d - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%r13d,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - vpaddd %xmm7,%xmm0,%xmm0 - xorl %r8d,%r13d - addl 0(%rsp),%r11d - movl %eax,%r15d - vpsrld $3,%xmm4,%xmm7 - xorl %r10d,%r12d - shrdl $11,%r14d,%r14d - xorl %ebx,%r15d - vpslld $14,%xmm4,%xmm5 - addl %r12d,%r11d - shrdl $6,%r13d,%r13d - andl %r15d,%edi - vpxor %xmm6,%xmm7,%xmm4 - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - vpshufd $250,%xmm3,%xmm7 
- shrdl $2,%r14d,%r14d - addl %r11d,%edx - addl %edi,%r11d - vpsrld $11,%xmm6,%xmm6 - movl %edx,%r13d - addl %r11d,%r14d - shrdl $14,%r13d,%r13d - vpxor %xmm5,%xmm4,%xmm4 - movl %r14d,%r11d - movl %r8d,%r12d - shrdl $9,%r14d,%r14d - vpslld $11,%xmm5,%xmm5 - xorl %edx,%r13d - xorl %r9d,%r12d - shrdl $5,%r13d,%r13d - vpxor %xmm6,%xmm4,%xmm4 - xorl %r11d,%r14d - andl %edx,%r12d - xorl %edx,%r13d - vpsrld $10,%xmm7,%xmm6 - addl 4(%rsp),%r10d - movl %r11d,%edi - xorl %r9d,%r12d - vpxor %xmm5,%xmm4,%xmm4 - shrdl $11,%r14d,%r14d - xorl %eax,%edi - addl %r12d,%r10d - vpsrlq $17,%xmm7,%xmm7 - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %r11d,%r14d - vpaddd %xmm4,%xmm0,%xmm0 - addl %r13d,%r10d - xorl %eax,%r15d - shrdl $2,%r14d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - addl %r10d,%ecx - addl %r15d,%r10d - movl %ecx,%r13d - vpsrlq $2,%xmm7,%xmm7 - addl %r10d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r10d - vpxor %xmm7,%xmm6,%xmm6 - movl %edx,%r12d - shrdl $9,%r14d,%r14d - xorl %ecx,%r13d - vpshufb %xmm8,%xmm6,%xmm6 - xorl %r8d,%r12d - shrdl $5,%r13d,%r13d - xorl %r10d,%r14d - vpaddd %xmm6,%xmm0,%xmm0 - andl %ecx,%r12d - xorl %ecx,%r13d - addl 8(%rsp),%r9d - vpshufd $80,%xmm0,%xmm7 - movl %r10d,%r15d - xorl %r8d,%r12d - shrdl $11,%r14d,%r14d - vpsrld $10,%xmm7,%xmm6 - xorl %r11d,%r15d - addl %r12d,%r9d - shrdl $6,%r13d,%r13d - vpsrlq $17,%xmm7,%xmm7 - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - vpxor %xmm7,%xmm6,%xmm6 - xorl %r11d,%edi - shrdl $2,%r14d,%r14d - addl %r9d,%ebx - vpsrlq $2,%xmm7,%xmm7 - addl %edi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - shrdl $14,%r13d,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - vpshufb %xmm9,%xmm6,%xmm6 - shrdl $9,%r14d,%r14d - xorl %ebx,%r13d - xorl %edx,%r12d - vpaddd %xmm6,%xmm0,%xmm0 - shrdl $5,%r13d,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - vpaddd 0(%rbp),%xmm0,%xmm6 - xorl %ebx,%r13d - addl 12(%rsp),%r8d - movl %r9d,%edi - xorl %edx,%r12d - shrdl $11,%r14d,%r14d - xorl %r10d,%edi - addl %r12d,%r8d - 
shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - shrdl $2,%r14d,%r14d - addl %r8d,%eax - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - vmovdqa %xmm6,0(%rsp) - vpalignr $4,%xmm1,%xmm2,%xmm4 - shrdl $14,%r13d,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - vpalignr $4,%xmm3,%xmm0,%xmm7 - shrdl $9,%r14d,%r14d - xorl %eax,%r13d - xorl %ecx,%r12d - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%r13d,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - vpaddd %xmm7,%xmm1,%xmm1 - xorl %eax,%r13d - addl 16(%rsp),%edx - movl %r8d,%r15d - vpsrld $3,%xmm4,%xmm7 - xorl %ecx,%r12d - shrdl $11,%r14d,%r14d - xorl %r9d,%r15d - vpslld $14,%xmm4,%xmm5 - addl %r12d,%edx - shrdl $6,%r13d,%r13d - andl %r15d,%edi - vpxor %xmm6,%xmm7,%xmm4 - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - vpshufd $250,%xmm0,%xmm7 - shrdl $2,%r14d,%r14d - addl %edx,%r11d - addl %edi,%edx - vpsrld $11,%xmm6,%xmm6 - movl %r11d,%r13d - addl %edx,%r14d - shrdl $14,%r13d,%r13d - vpxor %xmm5,%xmm4,%xmm4 - movl %r14d,%edx - movl %eax,%r12d - shrdl $9,%r14d,%r14d - vpslld $11,%xmm5,%xmm5 - xorl %r11d,%r13d - xorl %ebx,%r12d - shrdl $5,%r13d,%r13d - vpxor %xmm6,%xmm4,%xmm4 - xorl %edx,%r14d - andl %r11d,%r12d - xorl %r11d,%r13d - vpsrld $10,%xmm7,%xmm6 - addl 20(%rsp),%ecx - movl %edx,%edi - xorl %ebx,%r12d - vpxor %xmm5,%xmm4,%xmm4 - shrdl $11,%r14d,%r14d - xorl %r8d,%edi - addl %r12d,%ecx - vpsrlq $17,%xmm7,%xmm7 - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %edx,%r14d - vpaddd %xmm4,%xmm1,%xmm1 - addl %r13d,%ecx - xorl %r8d,%r15d - shrdl $2,%r14d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - addl %ecx,%r10d - addl %r15d,%ecx - movl %r10d,%r13d - vpsrlq $2,%xmm7,%xmm7 - addl %ecx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ecx - vpxor %xmm7,%xmm6,%xmm6 - movl %r11d,%r12d - shrdl $9,%r14d,%r14d - xorl %r10d,%r13d - vpshufb %xmm8,%xmm6,%xmm6 - xorl %eax,%r12d - shrdl $5,%r13d,%r13d - xorl %ecx,%r14d - vpaddd %xmm6,%xmm1,%xmm1 - andl %r10d,%r12d - xorl %r10d,%r13d - addl 24(%rsp),%ebx - 
vpshufd $80,%xmm1,%xmm7 - movl %ecx,%r15d - xorl %eax,%r12d - shrdl $11,%r14d,%r14d - vpsrld $10,%xmm7,%xmm6 - xorl %edx,%r15d - addl %r12d,%ebx - shrdl $6,%r13d,%r13d - vpsrlq $17,%xmm7,%xmm7 - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - vpxor %xmm7,%xmm6,%xmm6 - xorl %edx,%edi - shrdl $2,%r14d,%r14d - addl %ebx,%r9d - vpsrlq $2,%xmm7,%xmm7 - addl %edi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - vpxor %xmm7,%xmm6,%xmm6 - shrdl $14,%r13d,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - vpshufb %xmm9,%xmm6,%xmm6 - shrdl $9,%r14d,%r14d - xorl %r9d,%r13d - xorl %r11d,%r12d - vpaddd %xmm6,%xmm1,%xmm1 - shrdl $5,%r13d,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - vpaddd 32(%rbp),%xmm1,%xmm6 - xorl %r9d,%r13d - addl 28(%rsp),%eax - movl %ebx,%edi - xorl %r11d,%r12d - shrdl $11,%r14d,%r14d - xorl %ecx,%edi - addl %r12d,%eax - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - shrdl $2,%r14d,%r14d - addl %eax,%r8d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - vmovdqa %xmm6,16(%rsp) - vpalignr $4,%xmm2,%xmm3,%xmm4 - shrdl $14,%r13d,%r13d - movl %r14d,%eax - movl %r9d,%r12d - vpalignr $4,%xmm0,%xmm1,%xmm7 - shrdl $9,%r14d,%r14d - xorl %r8d,%r13d - xorl %r10d,%r12d - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%r13d,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - vpaddd %xmm7,%xmm2,%xmm2 - xorl %r8d,%r13d - addl 32(%rsp),%r11d - movl %eax,%r15d - vpsrld $3,%xmm4,%xmm7 - xorl %r10d,%r12d - shrdl $11,%r14d,%r14d - xorl %ebx,%r15d - vpslld $14,%xmm4,%xmm5 - addl %r12d,%r11d - shrdl $6,%r13d,%r13d - andl %r15d,%edi - vpxor %xmm6,%xmm7,%xmm4 - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - vpshufd $250,%xmm1,%xmm7 - shrdl $2,%r14d,%r14d - addl %r11d,%edx - addl %edi,%r11d - vpsrld $11,%xmm6,%xmm6 - movl %edx,%r13d - addl %r11d,%r14d - shrdl $14,%r13d,%r13d - vpxor %xmm5,%xmm4,%xmm4 - movl %r14d,%r11d - movl %r8d,%r12d - shrdl $9,%r14d,%r14d - vpslld $11,%xmm5,%xmm5 - xorl %edx,%r13d - xorl %r9d,%r12d - shrdl $5,%r13d,%r13d - vpxor 
%xmm6,%xmm4,%xmm4 - xorl %r11d,%r14d - andl %edx,%r12d - xorl %edx,%r13d - vpsrld $10,%xmm7,%xmm6 - addl 36(%rsp),%r10d - movl %r11d,%edi - xorl %r9d,%r12d - vpxor %xmm5,%xmm4,%xmm4 - shrdl $11,%r14d,%r14d - xorl %eax,%edi - addl %r12d,%r10d - vpsrlq $17,%xmm7,%xmm7 - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %r11d,%r14d - vpaddd %xmm4,%xmm2,%xmm2 - addl %r13d,%r10d - xorl %eax,%r15d - shrdl $2,%r14d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - addl %r10d,%ecx - addl %r15d,%r10d - movl %ecx,%r13d - vpsrlq $2,%xmm7,%xmm7 - addl %r10d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r10d - vpxor %xmm7,%xmm6,%xmm6 - movl %edx,%r12d - shrdl $9,%r14d,%r14d - xorl %ecx,%r13d - vpshufb %xmm8,%xmm6,%xmm6 - xorl %r8d,%r12d - shrdl $5,%r13d,%r13d - xorl %r10d,%r14d - vpaddd %xmm6,%xmm2,%xmm2 - andl %ecx,%r12d - xorl %ecx,%r13d - addl 40(%rsp),%r9d - vpshufd $80,%xmm2,%xmm7 - movl %r10d,%r15d - xorl %r8d,%r12d - shrdl $11,%r14d,%r14d - vpsrld $10,%xmm7,%xmm6 - xorl %r11d,%r15d - addl %r12d,%r9d - shrdl $6,%r13d,%r13d - vpsrlq $17,%xmm7,%xmm7 - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - vpxor %xmm7,%xmm6,%xmm6 - xorl %r11d,%edi - shrdl $2,%r14d,%r14d - addl %r9d,%ebx - vpsrlq $2,%xmm7,%xmm7 - addl %edi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - shrdl $14,%r13d,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - vpshufb %xmm9,%xmm6,%xmm6 - shrdl $9,%r14d,%r14d - xorl %ebx,%r13d - xorl %edx,%r12d - vpaddd %xmm6,%xmm2,%xmm2 - shrdl $5,%r13d,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - vpaddd 64(%rbp),%xmm2,%xmm6 - xorl %ebx,%r13d - addl 44(%rsp),%r8d - movl %r9d,%edi - xorl %edx,%r12d - shrdl $11,%r14d,%r14d - xorl %r10d,%edi - addl %r12d,%r8d - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - shrdl $2,%r14d,%r14d - addl %r8d,%eax - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - vmovdqa %xmm6,32(%rsp) - vpalignr $4,%xmm3,%xmm0,%xmm4 - shrdl $14,%r13d,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - vpalignr 
$4,%xmm1,%xmm2,%xmm7 - shrdl $9,%r14d,%r14d - xorl %eax,%r13d - xorl %ecx,%r12d - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%r13d,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - vpaddd %xmm7,%xmm3,%xmm3 - xorl %eax,%r13d - addl 48(%rsp),%edx - movl %r8d,%r15d - vpsrld $3,%xmm4,%xmm7 - xorl %ecx,%r12d - shrdl $11,%r14d,%r14d - xorl %r9d,%r15d - vpslld $14,%xmm4,%xmm5 - addl %r12d,%edx - shrdl $6,%r13d,%r13d - andl %r15d,%edi - vpxor %xmm6,%xmm7,%xmm4 - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - vpshufd $250,%xmm2,%xmm7 - shrdl $2,%r14d,%r14d - addl %edx,%r11d - addl %edi,%edx - vpsrld $11,%xmm6,%xmm6 - movl %r11d,%r13d - addl %edx,%r14d - shrdl $14,%r13d,%r13d - vpxor %xmm5,%xmm4,%xmm4 - movl %r14d,%edx - movl %eax,%r12d - shrdl $9,%r14d,%r14d - vpslld $11,%xmm5,%xmm5 - xorl %r11d,%r13d - xorl %ebx,%r12d - shrdl $5,%r13d,%r13d - vpxor %xmm6,%xmm4,%xmm4 - xorl %edx,%r14d - andl %r11d,%r12d - xorl %r11d,%r13d - vpsrld $10,%xmm7,%xmm6 - addl 52(%rsp),%ecx - movl %edx,%edi - xorl %ebx,%r12d - vpxor %xmm5,%xmm4,%xmm4 - shrdl $11,%r14d,%r14d - xorl %r8d,%edi - addl %r12d,%ecx - vpsrlq $17,%xmm7,%xmm7 - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %edx,%r14d - vpaddd %xmm4,%xmm3,%xmm3 - addl %r13d,%ecx - xorl %r8d,%r15d - shrdl $2,%r14d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - addl %ecx,%r10d - addl %r15d,%ecx - movl %r10d,%r13d - vpsrlq $2,%xmm7,%xmm7 - addl %ecx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ecx - vpxor %xmm7,%xmm6,%xmm6 - movl %r11d,%r12d - shrdl $9,%r14d,%r14d - xorl %r10d,%r13d - vpshufb %xmm8,%xmm6,%xmm6 - xorl %eax,%r12d - shrdl $5,%r13d,%r13d - xorl %ecx,%r14d - vpaddd %xmm6,%xmm3,%xmm3 - andl %r10d,%r12d - xorl %r10d,%r13d - addl 56(%rsp),%ebx - vpshufd $80,%xmm3,%xmm7 - movl %ecx,%r15d - xorl %eax,%r12d - shrdl $11,%r14d,%r14d - vpsrld $10,%xmm7,%xmm6 - xorl %edx,%r15d - addl %r12d,%ebx - shrdl $6,%r13d,%r13d - vpsrlq $17,%xmm7,%xmm7 - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - vpxor %xmm7,%xmm6,%xmm6 - xorl %edx,%edi - shrdl $2,%r14d,%r14d - addl 
%ebx,%r9d - vpsrlq $2,%xmm7,%xmm7 - addl %edi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - vpxor %xmm7,%xmm6,%xmm6 - shrdl $14,%r13d,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - vpshufb %xmm9,%xmm6,%xmm6 - shrdl $9,%r14d,%r14d - xorl %r9d,%r13d - xorl %r11d,%r12d - vpaddd %xmm6,%xmm3,%xmm3 - shrdl $5,%r13d,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - vpaddd 96(%rbp),%xmm3,%xmm6 - xorl %r9d,%r13d - addl 60(%rsp),%eax - movl %ebx,%edi - xorl %r11d,%r12d - shrdl $11,%r14d,%r14d - xorl %ecx,%edi - addl %r12d,%eax - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - shrdl $2,%r14d,%r14d - addl %eax,%r8d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - vmovdqa %xmm6,48(%rsp) - cmpb $0,131(%rbp) - jne L$avx_00_47 - shrdl $14,%r13d,%r13d - movl %r14d,%eax - movl %r9d,%r12d - shrdl $9,%r14d,%r14d - xorl %r8d,%r13d - xorl %r10d,%r12d - shrdl $5,%r13d,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - xorl %r8d,%r13d - addl 0(%rsp),%r11d - movl %eax,%r15d - xorl %r10d,%r12d - shrdl $11,%r14d,%r14d - xorl %ebx,%r15d - addl %r12d,%r11d - shrdl $6,%r13d,%r13d - andl %r15d,%edi - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - shrdl $2,%r14d,%r14d - addl %r11d,%edx - addl %edi,%r11d - movl %edx,%r13d - addl %r11d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r11d - movl %r8d,%r12d - shrdl $9,%r14d,%r14d - xorl %edx,%r13d - xorl %r9d,%r12d - shrdl $5,%r13d,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - xorl %edx,%r13d - addl 4(%rsp),%r10d - movl %r11d,%edi - xorl %r9d,%r12d - shrdl $11,%r14d,%r14d - xorl %eax,%edi - addl %r12d,%r10d - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %r11d,%r14d - addl %r13d,%r10d - xorl %eax,%r15d - shrdl $2,%r14d,%r14d - addl %r10d,%ecx - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r10d - movl %edx,%r12d - shrdl $9,%r14d,%r14d - xorl %ecx,%r13d - xorl %r8d,%r12d - shrdl $5,%r13d,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - xorl %ecx,%r13d - addl 8(%rsp),%r9d - 
movl %r10d,%r15d - xorl %r8d,%r12d - shrdl $11,%r14d,%r14d - xorl %r11d,%r15d - addl %r12d,%r9d - shrdl $6,%r13d,%r13d - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%edi - shrdl $2,%r14d,%r14d - addl %r9d,%ebx - addl %edi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - shrdl $9,%r14d,%r14d - xorl %ebx,%r13d - xorl %edx,%r12d - shrdl $5,%r13d,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - xorl %ebx,%r13d - addl 12(%rsp),%r8d - movl %r9d,%edi - xorl %edx,%r12d - shrdl $11,%r14d,%r14d - xorl %r10d,%edi - addl %r12d,%r8d - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - shrdl $2,%r14d,%r14d - addl %r8d,%eax - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - shrdl $9,%r14d,%r14d - xorl %eax,%r13d - xorl %ecx,%r12d - shrdl $5,%r13d,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - xorl %eax,%r13d - addl 16(%rsp),%edx - movl %r8d,%r15d - xorl %ecx,%r12d - shrdl $11,%r14d,%r14d - xorl %r9d,%r15d - addl %r12d,%edx - shrdl $6,%r13d,%r13d - andl %r15d,%edi - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - shrdl $2,%r14d,%r14d - addl %edx,%r11d - addl %edi,%edx - movl %r11d,%r13d - addl %edx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%edx - movl %eax,%r12d - shrdl $9,%r14d,%r14d - xorl %r11d,%r13d - xorl %ebx,%r12d - shrdl $5,%r13d,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - xorl %r11d,%r13d - addl 20(%rsp),%ecx - movl %edx,%edi - xorl %ebx,%r12d - shrdl $11,%r14d,%r14d - xorl %r8d,%edi - addl %r12d,%ecx - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %edx,%r14d - addl %r13d,%ecx - xorl %r8d,%r15d - shrdl $2,%r14d,%r14d - addl %ecx,%r10d - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - shrdl $9,%r14d,%r14d - xorl %r10d,%r13d - xorl %eax,%r12d - shrdl $5,%r13d,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - xorl %r10d,%r13d - 
addl 24(%rsp),%ebx - movl %ecx,%r15d - xorl %eax,%r12d - shrdl $11,%r14d,%r14d - xorl %edx,%r15d - addl %r12d,%ebx - shrdl $6,%r13d,%r13d - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%edi - shrdl $2,%r14d,%r14d - addl %ebx,%r9d - addl %edi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - shrdl $9,%r14d,%r14d - xorl %r9d,%r13d - xorl %r11d,%r12d - shrdl $5,%r13d,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - xorl %r9d,%r13d - addl 28(%rsp),%eax - movl %ebx,%edi - xorl %r11d,%r12d - shrdl $11,%r14d,%r14d - xorl %ecx,%edi - addl %r12d,%eax - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - shrdl $2,%r14d,%r14d - addl %eax,%r8d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%eax - movl %r9d,%r12d - shrdl $9,%r14d,%r14d - xorl %r8d,%r13d - xorl %r10d,%r12d - shrdl $5,%r13d,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - xorl %r8d,%r13d - addl 32(%rsp),%r11d - movl %eax,%r15d - xorl %r10d,%r12d - shrdl $11,%r14d,%r14d - xorl %ebx,%r15d - addl %r12d,%r11d - shrdl $6,%r13d,%r13d - andl %r15d,%edi - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - shrdl $2,%r14d,%r14d - addl %r11d,%edx - addl %edi,%r11d - movl %edx,%r13d - addl %r11d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r11d - movl %r8d,%r12d - shrdl $9,%r14d,%r14d - xorl %edx,%r13d - xorl %r9d,%r12d - shrdl $5,%r13d,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - xorl %edx,%r13d - addl 36(%rsp),%r10d - movl %r11d,%edi - xorl %r9d,%r12d - shrdl $11,%r14d,%r14d - xorl %eax,%edi - addl %r12d,%r10d - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %r11d,%r14d - addl %r13d,%r10d - xorl %eax,%r15d - shrdl $2,%r14d,%r14d - addl %r10d,%ecx - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r10d - movl %edx,%r12d - shrdl $9,%r14d,%r14d - xorl %ecx,%r13d - xorl %r8d,%r12d - shrdl $5,%r13d,%r13d - xorl %r10d,%r14d - andl 
%ecx,%r12d - xorl %ecx,%r13d - addl 40(%rsp),%r9d - movl %r10d,%r15d - xorl %r8d,%r12d - shrdl $11,%r14d,%r14d - xorl %r11d,%r15d - addl %r12d,%r9d - shrdl $6,%r13d,%r13d - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%edi - shrdl $2,%r14d,%r14d - addl %r9d,%ebx - addl %edi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - shrdl $9,%r14d,%r14d - xorl %ebx,%r13d - xorl %edx,%r12d - shrdl $5,%r13d,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - xorl %ebx,%r13d - addl 44(%rsp),%r8d - movl %r9d,%edi - xorl %edx,%r12d - shrdl $11,%r14d,%r14d - xorl %r10d,%edi - addl %r12d,%r8d - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - shrdl $2,%r14d,%r14d - addl %r8d,%eax - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - shrdl $9,%r14d,%r14d - xorl %eax,%r13d - xorl %ecx,%r12d - shrdl $5,%r13d,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - xorl %eax,%r13d - addl 48(%rsp),%edx - movl %r8d,%r15d - xorl %ecx,%r12d - shrdl $11,%r14d,%r14d - xorl %r9d,%r15d - addl %r12d,%edx - shrdl $6,%r13d,%r13d - andl %r15d,%edi - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - shrdl $2,%r14d,%r14d - addl %edx,%r11d - addl %edi,%edx - movl %r11d,%r13d - addl %edx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%edx - movl %eax,%r12d - shrdl $9,%r14d,%r14d - xorl %r11d,%r13d - xorl %ebx,%r12d - shrdl $5,%r13d,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - xorl %r11d,%r13d - addl 52(%rsp),%ecx - movl %edx,%edi - xorl %ebx,%r12d - shrdl $11,%r14d,%r14d - xorl %r8d,%edi - addl %r12d,%ecx - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %edx,%r14d - addl %r13d,%ecx - xorl %r8d,%r15d - shrdl $2,%r14d,%r14d - addl %ecx,%r10d - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - shrdl $9,%r14d,%r14d - xorl %r10d,%r13d - xorl %eax,%r12d - shrdl $5,%r13d,%r13d - xorl 
%ecx,%r14d - andl %r10d,%r12d - xorl %r10d,%r13d - addl 56(%rsp),%ebx - movl %ecx,%r15d - xorl %eax,%r12d - shrdl $11,%r14d,%r14d - xorl %edx,%r15d - addl %r12d,%ebx - shrdl $6,%r13d,%r13d - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%edi - shrdl $2,%r14d,%r14d - addl %ebx,%r9d - addl %edi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - shrdl $9,%r14d,%r14d - xorl %r9d,%r13d - xorl %r11d,%r12d - shrdl $5,%r13d,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - xorl %r9d,%r13d - addl 60(%rsp),%eax - movl %ebx,%edi - xorl %r11d,%r12d - shrdl $11,%r14d,%r14d - xorl %ecx,%edi - addl %r12d,%eax - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - shrdl $2,%r14d,%r14d - addl %eax,%r8d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - movq 64+0(%rsp),%rdi - movl %r14d,%eax - - addl 0(%rdi),%eax - leaq 64(%rsi),%rsi - addl 4(%rdi),%ebx - addl 8(%rdi),%ecx - addl 12(%rdi),%edx - addl 16(%rdi),%r8d - addl 20(%rdi),%r9d - addl 24(%rdi),%r10d - addl 28(%rdi),%r11d - - cmpq 64+16(%rsp),%rsi - - movl %eax,0(%rdi) - movl %ebx,4(%rdi) - movl %ecx,8(%rdi) - movl %edx,12(%rdi) - movl %r8d,16(%rdi) - movl %r9d,20(%rdi) - movl %r10d,24(%rdi) - movl %r11d,28(%rdi) - jb L$loop_avx - - movq 88(%rsp),%rsi - - vzeroupper - movq -48(%rsi),%r15 - - movq -40(%rsi),%r14 - - movq -32(%rsi),%r13 - - movq -24(%rsi),%r12 - - movq -16(%rsi),%rbp - - movq -8(%rsi),%rbx - - leaq (%rsi),%rsp - -L$epilogue_avx: - .byte 0xf3,0xc3 - - -#endif diff --git a/third_party/boringssl/apple-x86_64/crypto/fipsmodule/sha512-x86_64.S b/third_party/boringssl/apple-x86_64/crypto/fipsmodule/sha512-x86_64.S deleted file mode 100644 index 5732f439..00000000 --- a/third_party/boringssl/apple-x86_64/crypto/fipsmodule/sha512-x86_64.S +++ /dev/null @@ -1,2990 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text - - -.globl _sha512_block_data_order -.private_extern _sha512_block_data_order - -.p2align 4 -_sha512_block_data_order: - - leaq _OPENSSL_ia32cap_P(%rip),%r11 - movl 0(%r11),%r9d - movl 4(%r11),%r10d - movl 8(%r11),%r11d - andl $1073741824,%r9d - andl $268435968,%r10d - orl %r9d,%r10d - cmpl $1342177792,%r10d - je L$avx_shortcut - movq %rsp,%rax - - pushq %rbx - - pushq %rbp - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - shlq $4,%rdx - subq $128+32,%rsp - leaq (%rsi,%rdx,8),%rdx - andq $-64,%rsp - movq %rdi,128+0(%rsp) - movq %rsi,128+8(%rsp) - movq %rdx,128+16(%rsp) - movq %rax,152(%rsp) - -L$prologue: - - movq 0(%rdi),%rax - movq 8(%rdi),%rbx - movq 16(%rdi),%rcx - movq 24(%rdi),%rdx - movq 32(%rdi),%r8 - movq 40(%rdi),%r9 - movq 48(%rdi),%r10 - movq 56(%rdi),%r11 - jmp L$loop - -.p2align 4 -L$loop: - movq %rbx,%rdi - leaq K512(%rip),%rbp - xorq %rcx,%rdi - movq 0(%rsi),%r12 - movq %r8,%r13 - movq %rax,%r14 - bswapq %r12 - rorq $23,%r13 - movq %r9,%r15 - - xorq %r8,%r13 - rorq $5,%r14 - xorq %r10,%r15 - - movq %r12,0(%rsp) - xorq %rax,%r14 - andq %r8,%r15 - - rorq $4,%r13 - addq %r11,%r12 - xorq %r10,%r15 - - rorq $6,%r14 - xorq %r8,%r13 - addq %r15,%r12 - - movq %rax,%r15 - addq (%rbp),%r12 - xorq %rax,%r14 - - xorq %rbx,%r15 - rorq $14,%r13 - movq %rbx,%r11 - - andq %r15,%rdi - rorq $28,%r14 - addq %r13,%r12 - - xorq %rdi,%r11 - addq %r12,%rdx - addq %r12,%r11 - - leaq 8(%rbp),%rbp - addq %r14,%r11 - movq 8(%rsi),%r12 - movq %rdx,%r13 - movq %r11,%r14 - bswapq %r12 - rorq $23,%r13 - movq %r8,%rdi - - xorq %rdx,%r13 - rorq $5,%r14 - xorq %r9,%rdi - - movq %r12,8(%rsp) - xorq %r11,%r14 - andq %rdx,%rdi - - rorq $4,%r13 - addq %r10,%r12 - xorq %r9,%rdi - - rorq $6,%r14 - xorq %rdx,%r13 - addq %rdi,%r12 - - 
movq %r11,%rdi - addq (%rbp),%r12 - xorq %r11,%r14 - - xorq %rax,%rdi - rorq $14,%r13 - movq %rax,%r10 - - andq %rdi,%r15 - rorq $28,%r14 - addq %r13,%r12 - - xorq %r15,%r10 - addq %r12,%rcx - addq %r12,%r10 - - leaq 24(%rbp),%rbp - addq %r14,%r10 - movq 16(%rsi),%r12 - movq %rcx,%r13 - movq %r10,%r14 - bswapq %r12 - rorq $23,%r13 - movq %rdx,%r15 - - xorq %rcx,%r13 - rorq $5,%r14 - xorq %r8,%r15 - - movq %r12,16(%rsp) - xorq %r10,%r14 - andq %rcx,%r15 - - rorq $4,%r13 - addq %r9,%r12 - xorq %r8,%r15 - - rorq $6,%r14 - xorq %rcx,%r13 - addq %r15,%r12 - - movq %r10,%r15 - addq (%rbp),%r12 - xorq %r10,%r14 - - xorq %r11,%r15 - rorq $14,%r13 - movq %r11,%r9 - - andq %r15,%rdi - rorq $28,%r14 - addq %r13,%r12 - - xorq %rdi,%r9 - addq %r12,%rbx - addq %r12,%r9 - - leaq 8(%rbp),%rbp - addq %r14,%r9 - movq 24(%rsi),%r12 - movq %rbx,%r13 - movq %r9,%r14 - bswapq %r12 - rorq $23,%r13 - movq %rcx,%rdi - - xorq %rbx,%r13 - rorq $5,%r14 - xorq %rdx,%rdi - - movq %r12,24(%rsp) - xorq %r9,%r14 - andq %rbx,%rdi - - rorq $4,%r13 - addq %r8,%r12 - xorq %rdx,%rdi - - rorq $6,%r14 - xorq %rbx,%r13 - addq %rdi,%r12 - - movq %r9,%rdi - addq (%rbp),%r12 - xorq %r9,%r14 - - xorq %r10,%rdi - rorq $14,%r13 - movq %r10,%r8 - - andq %rdi,%r15 - rorq $28,%r14 - addq %r13,%r12 - - xorq %r15,%r8 - addq %r12,%rax - addq %r12,%r8 - - leaq 24(%rbp),%rbp - addq %r14,%r8 - movq 32(%rsi),%r12 - movq %rax,%r13 - movq %r8,%r14 - bswapq %r12 - rorq $23,%r13 - movq %rbx,%r15 - - xorq %rax,%r13 - rorq $5,%r14 - xorq %rcx,%r15 - - movq %r12,32(%rsp) - xorq %r8,%r14 - andq %rax,%r15 - - rorq $4,%r13 - addq %rdx,%r12 - xorq %rcx,%r15 - - rorq $6,%r14 - xorq %rax,%r13 - addq %r15,%r12 - - movq %r8,%r15 - addq (%rbp),%r12 - xorq %r8,%r14 - - xorq %r9,%r15 - rorq $14,%r13 - movq %r9,%rdx - - andq %r15,%rdi - rorq $28,%r14 - addq %r13,%r12 - - xorq %rdi,%rdx - addq %r12,%r11 - addq %r12,%rdx - - leaq 8(%rbp),%rbp - addq %r14,%rdx - movq 40(%rsi),%r12 - movq %r11,%r13 - movq %rdx,%r14 - bswapq %r12 - rorq 
$23,%r13 - movq %rax,%rdi - - xorq %r11,%r13 - rorq $5,%r14 - xorq %rbx,%rdi - - movq %r12,40(%rsp) - xorq %rdx,%r14 - andq %r11,%rdi - - rorq $4,%r13 - addq %rcx,%r12 - xorq %rbx,%rdi - - rorq $6,%r14 - xorq %r11,%r13 - addq %rdi,%r12 - - movq %rdx,%rdi - addq (%rbp),%r12 - xorq %rdx,%r14 - - xorq %r8,%rdi - rorq $14,%r13 - movq %r8,%rcx - - andq %rdi,%r15 - rorq $28,%r14 - addq %r13,%r12 - - xorq %r15,%rcx - addq %r12,%r10 - addq %r12,%rcx - - leaq 24(%rbp),%rbp - addq %r14,%rcx - movq 48(%rsi),%r12 - movq %r10,%r13 - movq %rcx,%r14 - bswapq %r12 - rorq $23,%r13 - movq %r11,%r15 - - xorq %r10,%r13 - rorq $5,%r14 - xorq %rax,%r15 - - movq %r12,48(%rsp) - xorq %rcx,%r14 - andq %r10,%r15 - - rorq $4,%r13 - addq %rbx,%r12 - xorq %rax,%r15 - - rorq $6,%r14 - xorq %r10,%r13 - addq %r15,%r12 - - movq %rcx,%r15 - addq (%rbp),%r12 - xorq %rcx,%r14 - - xorq %rdx,%r15 - rorq $14,%r13 - movq %rdx,%rbx - - andq %r15,%rdi - rorq $28,%r14 - addq %r13,%r12 - - xorq %rdi,%rbx - addq %r12,%r9 - addq %r12,%rbx - - leaq 8(%rbp),%rbp - addq %r14,%rbx - movq 56(%rsi),%r12 - movq %r9,%r13 - movq %rbx,%r14 - bswapq %r12 - rorq $23,%r13 - movq %r10,%rdi - - xorq %r9,%r13 - rorq $5,%r14 - xorq %r11,%rdi - - movq %r12,56(%rsp) - xorq %rbx,%r14 - andq %r9,%rdi - - rorq $4,%r13 - addq %rax,%r12 - xorq %r11,%rdi - - rorq $6,%r14 - xorq %r9,%r13 - addq %rdi,%r12 - - movq %rbx,%rdi - addq (%rbp),%r12 - xorq %rbx,%r14 - - xorq %rcx,%rdi - rorq $14,%r13 - movq %rcx,%rax - - andq %rdi,%r15 - rorq $28,%r14 - addq %r13,%r12 - - xorq %r15,%rax - addq %r12,%r8 - addq %r12,%rax - - leaq 24(%rbp),%rbp - addq %r14,%rax - movq 64(%rsi),%r12 - movq %r8,%r13 - movq %rax,%r14 - bswapq %r12 - rorq $23,%r13 - movq %r9,%r15 - - xorq %r8,%r13 - rorq $5,%r14 - xorq %r10,%r15 - - movq %r12,64(%rsp) - xorq %rax,%r14 - andq %r8,%r15 - - rorq $4,%r13 - addq %r11,%r12 - xorq %r10,%r15 - - rorq $6,%r14 - xorq %r8,%r13 - addq %r15,%r12 - - movq %rax,%r15 - addq (%rbp),%r12 - xorq %rax,%r14 - - xorq %rbx,%r15 - rorq 
$14,%r13 - movq %rbx,%r11 - - andq %r15,%rdi - rorq $28,%r14 - addq %r13,%r12 - - xorq %rdi,%r11 - addq %r12,%rdx - addq %r12,%r11 - - leaq 8(%rbp),%rbp - addq %r14,%r11 - movq 72(%rsi),%r12 - movq %rdx,%r13 - movq %r11,%r14 - bswapq %r12 - rorq $23,%r13 - movq %r8,%rdi - - xorq %rdx,%r13 - rorq $5,%r14 - xorq %r9,%rdi - - movq %r12,72(%rsp) - xorq %r11,%r14 - andq %rdx,%rdi - - rorq $4,%r13 - addq %r10,%r12 - xorq %r9,%rdi - - rorq $6,%r14 - xorq %rdx,%r13 - addq %rdi,%r12 - - movq %r11,%rdi - addq (%rbp),%r12 - xorq %r11,%r14 - - xorq %rax,%rdi - rorq $14,%r13 - movq %rax,%r10 - - andq %rdi,%r15 - rorq $28,%r14 - addq %r13,%r12 - - xorq %r15,%r10 - addq %r12,%rcx - addq %r12,%r10 - - leaq 24(%rbp),%rbp - addq %r14,%r10 - movq 80(%rsi),%r12 - movq %rcx,%r13 - movq %r10,%r14 - bswapq %r12 - rorq $23,%r13 - movq %rdx,%r15 - - xorq %rcx,%r13 - rorq $5,%r14 - xorq %r8,%r15 - - movq %r12,80(%rsp) - xorq %r10,%r14 - andq %rcx,%r15 - - rorq $4,%r13 - addq %r9,%r12 - xorq %r8,%r15 - - rorq $6,%r14 - xorq %rcx,%r13 - addq %r15,%r12 - - movq %r10,%r15 - addq (%rbp),%r12 - xorq %r10,%r14 - - xorq %r11,%r15 - rorq $14,%r13 - movq %r11,%r9 - - andq %r15,%rdi - rorq $28,%r14 - addq %r13,%r12 - - xorq %rdi,%r9 - addq %r12,%rbx - addq %r12,%r9 - - leaq 8(%rbp),%rbp - addq %r14,%r9 - movq 88(%rsi),%r12 - movq %rbx,%r13 - movq %r9,%r14 - bswapq %r12 - rorq $23,%r13 - movq %rcx,%rdi - - xorq %rbx,%r13 - rorq $5,%r14 - xorq %rdx,%rdi - - movq %r12,88(%rsp) - xorq %r9,%r14 - andq %rbx,%rdi - - rorq $4,%r13 - addq %r8,%r12 - xorq %rdx,%rdi - - rorq $6,%r14 - xorq %rbx,%r13 - addq %rdi,%r12 - - movq %r9,%rdi - addq (%rbp),%r12 - xorq %r9,%r14 - - xorq %r10,%rdi - rorq $14,%r13 - movq %r10,%r8 - - andq %rdi,%r15 - rorq $28,%r14 - addq %r13,%r12 - - xorq %r15,%r8 - addq %r12,%rax - addq %r12,%r8 - - leaq 24(%rbp),%rbp - addq %r14,%r8 - movq 96(%rsi),%r12 - movq %rax,%r13 - movq %r8,%r14 - bswapq %r12 - rorq $23,%r13 - movq %rbx,%r15 - - xorq %rax,%r13 - rorq $5,%r14 - xorq %rcx,%r15 - - 
movq %r12,96(%rsp) - xorq %r8,%r14 - andq %rax,%r15 - - rorq $4,%r13 - addq %rdx,%r12 - xorq %rcx,%r15 - - rorq $6,%r14 - xorq %rax,%r13 - addq %r15,%r12 - - movq %r8,%r15 - addq (%rbp),%r12 - xorq %r8,%r14 - - xorq %r9,%r15 - rorq $14,%r13 - movq %r9,%rdx - - andq %r15,%rdi - rorq $28,%r14 - addq %r13,%r12 - - xorq %rdi,%rdx - addq %r12,%r11 - addq %r12,%rdx - - leaq 8(%rbp),%rbp - addq %r14,%rdx - movq 104(%rsi),%r12 - movq %r11,%r13 - movq %rdx,%r14 - bswapq %r12 - rorq $23,%r13 - movq %rax,%rdi - - xorq %r11,%r13 - rorq $5,%r14 - xorq %rbx,%rdi - - movq %r12,104(%rsp) - xorq %rdx,%r14 - andq %r11,%rdi - - rorq $4,%r13 - addq %rcx,%r12 - xorq %rbx,%rdi - - rorq $6,%r14 - xorq %r11,%r13 - addq %rdi,%r12 - - movq %rdx,%rdi - addq (%rbp),%r12 - xorq %rdx,%r14 - - xorq %r8,%rdi - rorq $14,%r13 - movq %r8,%rcx - - andq %rdi,%r15 - rorq $28,%r14 - addq %r13,%r12 - - xorq %r15,%rcx - addq %r12,%r10 - addq %r12,%rcx - - leaq 24(%rbp),%rbp - addq %r14,%rcx - movq 112(%rsi),%r12 - movq %r10,%r13 - movq %rcx,%r14 - bswapq %r12 - rorq $23,%r13 - movq %r11,%r15 - - xorq %r10,%r13 - rorq $5,%r14 - xorq %rax,%r15 - - movq %r12,112(%rsp) - xorq %rcx,%r14 - andq %r10,%r15 - - rorq $4,%r13 - addq %rbx,%r12 - xorq %rax,%r15 - - rorq $6,%r14 - xorq %r10,%r13 - addq %r15,%r12 - - movq %rcx,%r15 - addq (%rbp),%r12 - xorq %rcx,%r14 - - xorq %rdx,%r15 - rorq $14,%r13 - movq %rdx,%rbx - - andq %r15,%rdi - rorq $28,%r14 - addq %r13,%r12 - - xorq %rdi,%rbx - addq %r12,%r9 - addq %r12,%rbx - - leaq 8(%rbp),%rbp - addq %r14,%rbx - movq 120(%rsi),%r12 - movq %r9,%r13 - movq %rbx,%r14 - bswapq %r12 - rorq $23,%r13 - movq %r10,%rdi - - xorq %r9,%r13 - rorq $5,%r14 - xorq %r11,%rdi - - movq %r12,120(%rsp) - xorq %rbx,%r14 - andq %r9,%rdi - - rorq $4,%r13 - addq %rax,%r12 - xorq %r11,%rdi - - rorq $6,%r14 - xorq %r9,%r13 - addq %rdi,%r12 - - movq %rbx,%rdi - addq (%rbp),%r12 - xorq %rbx,%r14 - - xorq %rcx,%rdi - rorq $14,%r13 - movq %rcx,%rax - - andq %rdi,%r15 - rorq $28,%r14 - addq %r13,%r12 - 
- xorq %r15,%rax - addq %r12,%r8 - addq %r12,%rax - - leaq 24(%rbp),%rbp - jmp L$rounds_16_xx -.p2align 4 -L$rounds_16_xx: - movq 8(%rsp),%r13 - movq 112(%rsp),%r15 - - movq %r13,%r12 - rorq $7,%r13 - addq %r14,%rax - movq %r15,%r14 - rorq $42,%r15 - - xorq %r12,%r13 - shrq $7,%r12 - rorq $1,%r13 - xorq %r14,%r15 - shrq $6,%r14 - - rorq $19,%r15 - xorq %r13,%r12 - xorq %r14,%r15 - addq 72(%rsp),%r12 - - addq 0(%rsp),%r12 - movq %r8,%r13 - addq %r15,%r12 - movq %rax,%r14 - rorq $23,%r13 - movq %r9,%r15 - - xorq %r8,%r13 - rorq $5,%r14 - xorq %r10,%r15 - - movq %r12,0(%rsp) - xorq %rax,%r14 - andq %r8,%r15 - - rorq $4,%r13 - addq %r11,%r12 - xorq %r10,%r15 - - rorq $6,%r14 - xorq %r8,%r13 - addq %r15,%r12 - - movq %rax,%r15 - addq (%rbp),%r12 - xorq %rax,%r14 - - xorq %rbx,%r15 - rorq $14,%r13 - movq %rbx,%r11 - - andq %r15,%rdi - rorq $28,%r14 - addq %r13,%r12 - - xorq %rdi,%r11 - addq %r12,%rdx - addq %r12,%r11 - - leaq 8(%rbp),%rbp - movq 16(%rsp),%r13 - movq 120(%rsp),%rdi - - movq %r13,%r12 - rorq $7,%r13 - addq %r14,%r11 - movq %rdi,%r14 - rorq $42,%rdi - - xorq %r12,%r13 - shrq $7,%r12 - rorq $1,%r13 - xorq %r14,%rdi - shrq $6,%r14 - - rorq $19,%rdi - xorq %r13,%r12 - xorq %r14,%rdi - addq 80(%rsp),%r12 - - addq 8(%rsp),%r12 - movq %rdx,%r13 - addq %rdi,%r12 - movq %r11,%r14 - rorq $23,%r13 - movq %r8,%rdi - - xorq %rdx,%r13 - rorq $5,%r14 - xorq %r9,%rdi - - movq %r12,8(%rsp) - xorq %r11,%r14 - andq %rdx,%rdi - - rorq $4,%r13 - addq %r10,%r12 - xorq %r9,%rdi - - rorq $6,%r14 - xorq %rdx,%r13 - addq %rdi,%r12 - - movq %r11,%rdi - addq (%rbp),%r12 - xorq %r11,%r14 - - xorq %rax,%rdi - rorq $14,%r13 - movq %rax,%r10 - - andq %rdi,%r15 - rorq $28,%r14 - addq %r13,%r12 - - xorq %r15,%r10 - addq %r12,%rcx - addq %r12,%r10 - - leaq 24(%rbp),%rbp - movq 24(%rsp),%r13 - movq 0(%rsp),%r15 - - movq %r13,%r12 - rorq $7,%r13 - addq %r14,%r10 - movq %r15,%r14 - rorq $42,%r15 - - xorq %r12,%r13 - shrq $7,%r12 - rorq $1,%r13 - xorq %r14,%r15 - shrq $6,%r14 - - rorq $19,%r15 
- xorq %r13,%r12 - xorq %r14,%r15 - addq 88(%rsp),%r12 - - addq 16(%rsp),%r12 - movq %rcx,%r13 - addq %r15,%r12 - movq %r10,%r14 - rorq $23,%r13 - movq %rdx,%r15 - - xorq %rcx,%r13 - rorq $5,%r14 - xorq %r8,%r15 - - movq %r12,16(%rsp) - xorq %r10,%r14 - andq %rcx,%r15 - - rorq $4,%r13 - addq %r9,%r12 - xorq %r8,%r15 - - rorq $6,%r14 - xorq %rcx,%r13 - addq %r15,%r12 - - movq %r10,%r15 - addq (%rbp),%r12 - xorq %r10,%r14 - - xorq %r11,%r15 - rorq $14,%r13 - movq %r11,%r9 - - andq %r15,%rdi - rorq $28,%r14 - addq %r13,%r12 - - xorq %rdi,%r9 - addq %r12,%rbx - addq %r12,%r9 - - leaq 8(%rbp),%rbp - movq 32(%rsp),%r13 - movq 8(%rsp),%rdi - - movq %r13,%r12 - rorq $7,%r13 - addq %r14,%r9 - movq %rdi,%r14 - rorq $42,%rdi - - xorq %r12,%r13 - shrq $7,%r12 - rorq $1,%r13 - xorq %r14,%rdi - shrq $6,%r14 - - rorq $19,%rdi - xorq %r13,%r12 - xorq %r14,%rdi - addq 96(%rsp),%r12 - - addq 24(%rsp),%r12 - movq %rbx,%r13 - addq %rdi,%r12 - movq %r9,%r14 - rorq $23,%r13 - movq %rcx,%rdi - - xorq %rbx,%r13 - rorq $5,%r14 - xorq %rdx,%rdi - - movq %r12,24(%rsp) - xorq %r9,%r14 - andq %rbx,%rdi - - rorq $4,%r13 - addq %r8,%r12 - xorq %rdx,%rdi - - rorq $6,%r14 - xorq %rbx,%r13 - addq %rdi,%r12 - - movq %r9,%rdi - addq (%rbp),%r12 - xorq %r9,%r14 - - xorq %r10,%rdi - rorq $14,%r13 - movq %r10,%r8 - - andq %rdi,%r15 - rorq $28,%r14 - addq %r13,%r12 - - xorq %r15,%r8 - addq %r12,%rax - addq %r12,%r8 - - leaq 24(%rbp),%rbp - movq 40(%rsp),%r13 - movq 16(%rsp),%r15 - - movq %r13,%r12 - rorq $7,%r13 - addq %r14,%r8 - movq %r15,%r14 - rorq $42,%r15 - - xorq %r12,%r13 - shrq $7,%r12 - rorq $1,%r13 - xorq %r14,%r15 - shrq $6,%r14 - - rorq $19,%r15 - xorq %r13,%r12 - xorq %r14,%r15 - addq 104(%rsp),%r12 - - addq 32(%rsp),%r12 - movq %rax,%r13 - addq %r15,%r12 - movq %r8,%r14 - rorq $23,%r13 - movq %rbx,%r15 - - xorq %rax,%r13 - rorq $5,%r14 - xorq %rcx,%r15 - - movq %r12,32(%rsp) - xorq %r8,%r14 - andq %rax,%r15 - - rorq $4,%r13 - addq %rdx,%r12 - xorq %rcx,%r15 - - rorq $6,%r14 - xorq %rax,%r13 
- addq %r15,%r12 - - movq %r8,%r15 - addq (%rbp),%r12 - xorq %r8,%r14 - - xorq %r9,%r15 - rorq $14,%r13 - movq %r9,%rdx - - andq %r15,%rdi - rorq $28,%r14 - addq %r13,%r12 - - xorq %rdi,%rdx - addq %r12,%r11 - addq %r12,%rdx - - leaq 8(%rbp),%rbp - movq 48(%rsp),%r13 - movq 24(%rsp),%rdi - - movq %r13,%r12 - rorq $7,%r13 - addq %r14,%rdx - movq %rdi,%r14 - rorq $42,%rdi - - xorq %r12,%r13 - shrq $7,%r12 - rorq $1,%r13 - xorq %r14,%rdi - shrq $6,%r14 - - rorq $19,%rdi - xorq %r13,%r12 - xorq %r14,%rdi - addq 112(%rsp),%r12 - - addq 40(%rsp),%r12 - movq %r11,%r13 - addq %rdi,%r12 - movq %rdx,%r14 - rorq $23,%r13 - movq %rax,%rdi - - xorq %r11,%r13 - rorq $5,%r14 - xorq %rbx,%rdi - - movq %r12,40(%rsp) - xorq %rdx,%r14 - andq %r11,%rdi - - rorq $4,%r13 - addq %rcx,%r12 - xorq %rbx,%rdi - - rorq $6,%r14 - xorq %r11,%r13 - addq %rdi,%r12 - - movq %rdx,%rdi - addq (%rbp),%r12 - xorq %rdx,%r14 - - xorq %r8,%rdi - rorq $14,%r13 - movq %r8,%rcx - - andq %rdi,%r15 - rorq $28,%r14 - addq %r13,%r12 - - xorq %r15,%rcx - addq %r12,%r10 - addq %r12,%rcx - - leaq 24(%rbp),%rbp - movq 56(%rsp),%r13 - movq 32(%rsp),%r15 - - movq %r13,%r12 - rorq $7,%r13 - addq %r14,%rcx - movq %r15,%r14 - rorq $42,%r15 - - xorq %r12,%r13 - shrq $7,%r12 - rorq $1,%r13 - xorq %r14,%r15 - shrq $6,%r14 - - rorq $19,%r15 - xorq %r13,%r12 - xorq %r14,%r15 - addq 120(%rsp),%r12 - - addq 48(%rsp),%r12 - movq %r10,%r13 - addq %r15,%r12 - movq %rcx,%r14 - rorq $23,%r13 - movq %r11,%r15 - - xorq %r10,%r13 - rorq $5,%r14 - xorq %rax,%r15 - - movq %r12,48(%rsp) - xorq %rcx,%r14 - andq %r10,%r15 - - rorq $4,%r13 - addq %rbx,%r12 - xorq %rax,%r15 - - rorq $6,%r14 - xorq %r10,%r13 - addq %r15,%r12 - - movq %rcx,%r15 - addq (%rbp),%r12 - xorq %rcx,%r14 - - xorq %rdx,%r15 - rorq $14,%r13 - movq %rdx,%rbx - - andq %r15,%rdi - rorq $28,%r14 - addq %r13,%r12 - - xorq %rdi,%rbx - addq %r12,%r9 - addq %r12,%rbx - - leaq 8(%rbp),%rbp - movq 64(%rsp),%r13 - movq 40(%rsp),%rdi - - movq %r13,%r12 - rorq $7,%r13 - addq 
%r14,%rbx - movq %rdi,%r14 - rorq $42,%rdi - - xorq %r12,%r13 - shrq $7,%r12 - rorq $1,%r13 - xorq %r14,%rdi - shrq $6,%r14 - - rorq $19,%rdi - xorq %r13,%r12 - xorq %r14,%rdi - addq 0(%rsp),%r12 - - addq 56(%rsp),%r12 - movq %r9,%r13 - addq %rdi,%r12 - movq %rbx,%r14 - rorq $23,%r13 - movq %r10,%rdi - - xorq %r9,%r13 - rorq $5,%r14 - xorq %r11,%rdi - - movq %r12,56(%rsp) - xorq %rbx,%r14 - andq %r9,%rdi - - rorq $4,%r13 - addq %rax,%r12 - xorq %r11,%rdi - - rorq $6,%r14 - xorq %r9,%r13 - addq %rdi,%r12 - - movq %rbx,%rdi - addq (%rbp),%r12 - xorq %rbx,%r14 - - xorq %rcx,%rdi - rorq $14,%r13 - movq %rcx,%rax - - andq %rdi,%r15 - rorq $28,%r14 - addq %r13,%r12 - - xorq %r15,%rax - addq %r12,%r8 - addq %r12,%rax - - leaq 24(%rbp),%rbp - movq 72(%rsp),%r13 - movq 48(%rsp),%r15 - - movq %r13,%r12 - rorq $7,%r13 - addq %r14,%rax - movq %r15,%r14 - rorq $42,%r15 - - xorq %r12,%r13 - shrq $7,%r12 - rorq $1,%r13 - xorq %r14,%r15 - shrq $6,%r14 - - rorq $19,%r15 - xorq %r13,%r12 - xorq %r14,%r15 - addq 8(%rsp),%r12 - - addq 64(%rsp),%r12 - movq %r8,%r13 - addq %r15,%r12 - movq %rax,%r14 - rorq $23,%r13 - movq %r9,%r15 - - xorq %r8,%r13 - rorq $5,%r14 - xorq %r10,%r15 - - movq %r12,64(%rsp) - xorq %rax,%r14 - andq %r8,%r15 - - rorq $4,%r13 - addq %r11,%r12 - xorq %r10,%r15 - - rorq $6,%r14 - xorq %r8,%r13 - addq %r15,%r12 - - movq %rax,%r15 - addq (%rbp),%r12 - xorq %rax,%r14 - - xorq %rbx,%r15 - rorq $14,%r13 - movq %rbx,%r11 - - andq %r15,%rdi - rorq $28,%r14 - addq %r13,%r12 - - xorq %rdi,%r11 - addq %r12,%rdx - addq %r12,%r11 - - leaq 8(%rbp),%rbp - movq 80(%rsp),%r13 - movq 56(%rsp),%rdi - - movq %r13,%r12 - rorq $7,%r13 - addq %r14,%r11 - movq %rdi,%r14 - rorq $42,%rdi - - xorq %r12,%r13 - shrq $7,%r12 - rorq $1,%r13 - xorq %r14,%rdi - shrq $6,%r14 - - rorq $19,%rdi - xorq %r13,%r12 - xorq %r14,%rdi - addq 16(%rsp),%r12 - - addq 72(%rsp),%r12 - movq %rdx,%r13 - addq %rdi,%r12 - movq %r11,%r14 - rorq $23,%r13 - movq %r8,%rdi - - xorq %rdx,%r13 - rorq $5,%r14 - xorq 
%r9,%rdi - - movq %r12,72(%rsp) - xorq %r11,%r14 - andq %rdx,%rdi - - rorq $4,%r13 - addq %r10,%r12 - xorq %r9,%rdi - - rorq $6,%r14 - xorq %rdx,%r13 - addq %rdi,%r12 - - movq %r11,%rdi - addq (%rbp),%r12 - xorq %r11,%r14 - - xorq %rax,%rdi - rorq $14,%r13 - movq %rax,%r10 - - andq %rdi,%r15 - rorq $28,%r14 - addq %r13,%r12 - - xorq %r15,%r10 - addq %r12,%rcx - addq %r12,%r10 - - leaq 24(%rbp),%rbp - movq 88(%rsp),%r13 - movq 64(%rsp),%r15 - - movq %r13,%r12 - rorq $7,%r13 - addq %r14,%r10 - movq %r15,%r14 - rorq $42,%r15 - - xorq %r12,%r13 - shrq $7,%r12 - rorq $1,%r13 - xorq %r14,%r15 - shrq $6,%r14 - - rorq $19,%r15 - xorq %r13,%r12 - xorq %r14,%r15 - addq 24(%rsp),%r12 - - addq 80(%rsp),%r12 - movq %rcx,%r13 - addq %r15,%r12 - movq %r10,%r14 - rorq $23,%r13 - movq %rdx,%r15 - - xorq %rcx,%r13 - rorq $5,%r14 - xorq %r8,%r15 - - movq %r12,80(%rsp) - xorq %r10,%r14 - andq %rcx,%r15 - - rorq $4,%r13 - addq %r9,%r12 - xorq %r8,%r15 - - rorq $6,%r14 - xorq %rcx,%r13 - addq %r15,%r12 - - movq %r10,%r15 - addq (%rbp),%r12 - xorq %r10,%r14 - - xorq %r11,%r15 - rorq $14,%r13 - movq %r11,%r9 - - andq %r15,%rdi - rorq $28,%r14 - addq %r13,%r12 - - xorq %rdi,%r9 - addq %r12,%rbx - addq %r12,%r9 - - leaq 8(%rbp),%rbp - movq 96(%rsp),%r13 - movq 72(%rsp),%rdi - - movq %r13,%r12 - rorq $7,%r13 - addq %r14,%r9 - movq %rdi,%r14 - rorq $42,%rdi - - xorq %r12,%r13 - shrq $7,%r12 - rorq $1,%r13 - xorq %r14,%rdi - shrq $6,%r14 - - rorq $19,%rdi - xorq %r13,%r12 - xorq %r14,%rdi - addq 32(%rsp),%r12 - - addq 88(%rsp),%r12 - movq %rbx,%r13 - addq %rdi,%r12 - movq %r9,%r14 - rorq $23,%r13 - movq %rcx,%rdi - - xorq %rbx,%r13 - rorq $5,%r14 - xorq %rdx,%rdi - - movq %r12,88(%rsp) - xorq %r9,%r14 - andq %rbx,%rdi - - rorq $4,%r13 - addq %r8,%r12 - xorq %rdx,%rdi - - rorq $6,%r14 - xorq %rbx,%r13 - addq %rdi,%r12 - - movq %r9,%rdi - addq (%rbp),%r12 - xorq %r9,%r14 - - xorq %r10,%rdi - rorq $14,%r13 - movq %r10,%r8 - - andq %rdi,%r15 - rorq $28,%r14 - addq %r13,%r12 - - xorq %r15,%r8 - 
addq %r12,%rax - addq %r12,%r8 - - leaq 24(%rbp),%rbp - movq 104(%rsp),%r13 - movq 80(%rsp),%r15 - - movq %r13,%r12 - rorq $7,%r13 - addq %r14,%r8 - movq %r15,%r14 - rorq $42,%r15 - - xorq %r12,%r13 - shrq $7,%r12 - rorq $1,%r13 - xorq %r14,%r15 - shrq $6,%r14 - - rorq $19,%r15 - xorq %r13,%r12 - xorq %r14,%r15 - addq 40(%rsp),%r12 - - addq 96(%rsp),%r12 - movq %rax,%r13 - addq %r15,%r12 - movq %r8,%r14 - rorq $23,%r13 - movq %rbx,%r15 - - xorq %rax,%r13 - rorq $5,%r14 - xorq %rcx,%r15 - - movq %r12,96(%rsp) - xorq %r8,%r14 - andq %rax,%r15 - - rorq $4,%r13 - addq %rdx,%r12 - xorq %rcx,%r15 - - rorq $6,%r14 - xorq %rax,%r13 - addq %r15,%r12 - - movq %r8,%r15 - addq (%rbp),%r12 - xorq %r8,%r14 - - xorq %r9,%r15 - rorq $14,%r13 - movq %r9,%rdx - - andq %r15,%rdi - rorq $28,%r14 - addq %r13,%r12 - - xorq %rdi,%rdx - addq %r12,%r11 - addq %r12,%rdx - - leaq 8(%rbp),%rbp - movq 112(%rsp),%r13 - movq 88(%rsp),%rdi - - movq %r13,%r12 - rorq $7,%r13 - addq %r14,%rdx - movq %rdi,%r14 - rorq $42,%rdi - - xorq %r12,%r13 - shrq $7,%r12 - rorq $1,%r13 - xorq %r14,%rdi - shrq $6,%r14 - - rorq $19,%rdi - xorq %r13,%r12 - xorq %r14,%rdi - addq 48(%rsp),%r12 - - addq 104(%rsp),%r12 - movq %r11,%r13 - addq %rdi,%r12 - movq %rdx,%r14 - rorq $23,%r13 - movq %rax,%rdi - - xorq %r11,%r13 - rorq $5,%r14 - xorq %rbx,%rdi - - movq %r12,104(%rsp) - xorq %rdx,%r14 - andq %r11,%rdi - - rorq $4,%r13 - addq %rcx,%r12 - xorq %rbx,%rdi - - rorq $6,%r14 - xorq %r11,%r13 - addq %rdi,%r12 - - movq %rdx,%rdi - addq (%rbp),%r12 - xorq %rdx,%r14 - - xorq %r8,%rdi - rorq $14,%r13 - movq %r8,%rcx - - andq %rdi,%r15 - rorq $28,%r14 - addq %r13,%r12 - - xorq %r15,%rcx - addq %r12,%r10 - addq %r12,%rcx - - leaq 24(%rbp),%rbp - movq 120(%rsp),%r13 - movq 96(%rsp),%r15 - - movq %r13,%r12 - rorq $7,%r13 - addq %r14,%rcx - movq %r15,%r14 - rorq $42,%r15 - - xorq %r12,%r13 - shrq $7,%r12 - rorq $1,%r13 - xorq %r14,%r15 - shrq $6,%r14 - - rorq $19,%r15 - xorq %r13,%r12 - xorq %r14,%r15 - addq 56(%rsp),%r12 - - 
addq 112(%rsp),%r12 - movq %r10,%r13 - addq %r15,%r12 - movq %rcx,%r14 - rorq $23,%r13 - movq %r11,%r15 - - xorq %r10,%r13 - rorq $5,%r14 - xorq %rax,%r15 - - movq %r12,112(%rsp) - xorq %rcx,%r14 - andq %r10,%r15 - - rorq $4,%r13 - addq %rbx,%r12 - xorq %rax,%r15 - - rorq $6,%r14 - xorq %r10,%r13 - addq %r15,%r12 - - movq %rcx,%r15 - addq (%rbp),%r12 - xorq %rcx,%r14 - - xorq %rdx,%r15 - rorq $14,%r13 - movq %rdx,%rbx - - andq %r15,%rdi - rorq $28,%r14 - addq %r13,%r12 - - xorq %rdi,%rbx - addq %r12,%r9 - addq %r12,%rbx - - leaq 8(%rbp),%rbp - movq 0(%rsp),%r13 - movq 104(%rsp),%rdi - - movq %r13,%r12 - rorq $7,%r13 - addq %r14,%rbx - movq %rdi,%r14 - rorq $42,%rdi - - xorq %r12,%r13 - shrq $7,%r12 - rorq $1,%r13 - xorq %r14,%rdi - shrq $6,%r14 - - rorq $19,%rdi - xorq %r13,%r12 - xorq %r14,%rdi - addq 64(%rsp),%r12 - - addq 120(%rsp),%r12 - movq %r9,%r13 - addq %rdi,%r12 - movq %rbx,%r14 - rorq $23,%r13 - movq %r10,%rdi - - xorq %r9,%r13 - rorq $5,%r14 - xorq %r11,%rdi - - movq %r12,120(%rsp) - xorq %rbx,%r14 - andq %r9,%rdi - - rorq $4,%r13 - addq %rax,%r12 - xorq %r11,%rdi - - rorq $6,%r14 - xorq %r9,%r13 - addq %rdi,%r12 - - movq %rbx,%rdi - addq (%rbp),%r12 - xorq %rbx,%r14 - - xorq %rcx,%rdi - rorq $14,%r13 - movq %rcx,%rax - - andq %rdi,%r15 - rorq $28,%r14 - addq %r13,%r12 - - xorq %r15,%rax - addq %r12,%r8 - addq %r12,%rax - - leaq 24(%rbp),%rbp - cmpb $0,7(%rbp) - jnz L$rounds_16_xx - - movq 128+0(%rsp),%rdi - addq %r14,%rax - leaq 128(%rsi),%rsi - - addq 0(%rdi),%rax - addq 8(%rdi),%rbx - addq 16(%rdi),%rcx - addq 24(%rdi),%rdx - addq 32(%rdi),%r8 - addq 40(%rdi),%r9 - addq 48(%rdi),%r10 - addq 56(%rdi),%r11 - - cmpq 128+16(%rsp),%rsi - - movq %rax,0(%rdi) - movq %rbx,8(%rdi) - movq %rcx,16(%rdi) - movq %rdx,24(%rdi) - movq %r8,32(%rdi) - movq %r9,40(%rdi) - movq %r10,48(%rdi) - movq %r11,56(%rdi) - jb L$loop - - movq 152(%rsp),%rsi - - movq -48(%rsi),%r15 - - movq -40(%rsi),%r14 - - movq -32(%rsi),%r13 - - movq -24(%rsi),%r12 - - movq -16(%rsi),%rbp - - 
movq -8(%rsi),%rbx - - leaq (%rsi),%rsp - -L$epilogue: - .byte 0xf3,0xc3 - - -.p2align 6 - -K512: -.quad 0x428a2f98d728ae22,0x7137449123ef65cd -.quad 0x428a2f98d728ae22,0x7137449123ef65cd -.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc -.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc -.quad 0x3956c25bf348b538,0x59f111f1b605d019 -.quad 0x3956c25bf348b538,0x59f111f1b605d019 -.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 -.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 -.quad 0xd807aa98a3030242,0x12835b0145706fbe -.quad 0xd807aa98a3030242,0x12835b0145706fbe -.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 -.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 -.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 -.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 -.quad 0x9bdc06a725c71235,0xc19bf174cf692694 -.quad 0x9bdc06a725c71235,0xc19bf174cf692694 -.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 -.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 -.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 -.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 -.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 -.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 -.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 -.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 -.quad 0x983e5152ee66dfab,0xa831c66d2db43210 -.quad 0x983e5152ee66dfab,0xa831c66d2db43210 -.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 -.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 -.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 -.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 -.quad 0x06ca6351e003826f,0x142929670a0e6e70 -.quad 0x06ca6351e003826f,0x142929670a0e6e70 -.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 -.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 -.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df -.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df -.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 -.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 -.quad 0x81c2c92e47edaee6,0x92722c851482353b -.quad 0x81c2c92e47edaee6,0x92722c851482353b -.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 -.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 -.quad 
0xc24b8b70d0f89791,0xc76c51a30654be30 -.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 -.quad 0xd192e819d6ef5218,0xd69906245565a910 -.quad 0xd192e819d6ef5218,0xd69906245565a910 -.quad 0xf40e35855771202a,0x106aa07032bbd1b8 -.quad 0xf40e35855771202a,0x106aa07032bbd1b8 -.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 -.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 -.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 -.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 -.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb -.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb -.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 -.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 -.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 -.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 -.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec -.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec -.quad 0x90befffa23631e28,0xa4506cebde82bde9 -.quad 0x90befffa23631e28,0xa4506cebde82bde9 -.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b -.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b -.quad 0xca273eceea26619c,0xd186b8c721c0c207 -.quad 0xca273eceea26619c,0xd186b8c721c0c207 -.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 -.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 -.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 -.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 -.quad 0x113f9804bef90dae,0x1b710b35131c471b -.quad 0x113f9804bef90dae,0x1b710b35131c471b -.quad 0x28db77f523047d84,0x32caab7b40c72493 -.quad 0x28db77f523047d84,0x32caab7b40c72493 -.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c -.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c -.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a -.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a -.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 -.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 - -.quad 0x0001020304050607,0x08090a0b0c0d0e0f -.quad 0x0001020304050607,0x08090a0b0c0d0e0f -.byte 
83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 - -.p2align 6 -sha512_block_data_order_avx: - -L$avx_shortcut: - movq %rsp,%rax - - pushq %rbx - - pushq %rbp - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - shlq $4,%rdx - subq $160,%rsp - leaq (%rsi,%rdx,8),%rdx - andq $-64,%rsp - movq %rdi,128+0(%rsp) - movq %rsi,128+8(%rsp) - movq %rdx,128+16(%rsp) - movq %rax,152(%rsp) - -L$prologue_avx: - - vzeroupper - movq 0(%rdi),%rax - movq 8(%rdi),%rbx - movq 16(%rdi),%rcx - movq 24(%rdi),%rdx - movq 32(%rdi),%r8 - movq 40(%rdi),%r9 - movq 48(%rdi),%r10 - movq 56(%rdi),%r11 - jmp L$loop_avx -.p2align 4 -L$loop_avx: - vmovdqa K512+1280(%rip),%xmm11 - vmovdqu 0(%rsi),%xmm0 - leaq K512+128(%rip),%rbp - vmovdqu 16(%rsi),%xmm1 - vmovdqu 32(%rsi),%xmm2 - vpshufb %xmm11,%xmm0,%xmm0 - vmovdqu 48(%rsi),%xmm3 - vpshufb %xmm11,%xmm1,%xmm1 - vmovdqu 64(%rsi),%xmm4 - vpshufb %xmm11,%xmm2,%xmm2 - vmovdqu 80(%rsi),%xmm5 - vpshufb %xmm11,%xmm3,%xmm3 - vmovdqu 96(%rsi),%xmm6 - vpshufb %xmm11,%xmm4,%xmm4 - vmovdqu 112(%rsi),%xmm7 - vpshufb %xmm11,%xmm5,%xmm5 - vpaddq -128(%rbp),%xmm0,%xmm8 - vpshufb %xmm11,%xmm6,%xmm6 - vpaddq -96(%rbp),%xmm1,%xmm9 - vpshufb %xmm11,%xmm7,%xmm7 - vpaddq -64(%rbp),%xmm2,%xmm10 - vpaddq -32(%rbp),%xmm3,%xmm11 - vmovdqa %xmm8,0(%rsp) - vpaddq 0(%rbp),%xmm4,%xmm8 - vmovdqa %xmm9,16(%rsp) - vpaddq 32(%rbp),%xmm5,%xmm9 - vmovdqa %xmm10,32(%rsp) - vpaddq 64(%rbp),%xmm6,%xmm10 - vmovdqa %xmm11,48(%rsp) - vpaddq 96(%rbp),%xmm7,%xmm11 - vmovdqa %xmm8,64(%rsp) - movq %rax,%r14 - vmovdqa %xmm9,80(%rsp) - movq %rbx,%rdi - vmovdqa %xmm10,96(%rsp) - xorq %rcx,%rdi - vmovdqa %xmm11,112(%rsp) - movq %r8,%r13 - jmp L$avx_00_47 - -.p2align 4 -L$avx_00_47: - addq $256,%rbp - vpalignr $8,%xmm0,%xmm1,%xmm8 - shrdq $23,%r13,%r13 - movq %r14,%rax - vpalignr $8,%xmm4,%xmm5,%xmm11 - 
movq %r9,%r12 - shrdq $5,%r14,%r14 - vpsrlq $1,%xmm8,%xmm10 - xorq %r8,%r13 - xorq %r10,%r12 - vpaddq %xmm11,%xmm0,%xmm0 - shrdq $4,%r13,%r13 - xorq %rax,%r14 - vpsrlq $7,%xmm8,%xmm11 - andq %r8,%r12 - xorq %r8,%r13 - vpsllq $56,%xmm8,%xmm9 - addq 0(%rsp),%r11 - movq %rax,%r15 - vpxor %xmm10,%xmm11,%xmm8 - xorq %r10,%r12 - shrdq $6,%r14,%r14 - vpsrlq $7,%xmm10,%xmm10 - xorq %rbx,%r15 - addq %r12,%r11 - vpxor %xmm9,%xmm8,%xmm8 - shrdq $14,%r13,%r13 - andq %r15,%rdi - vpsllq $7,%xmm9,%xmm9 - xorq %rax,%r14 - addq %r13,%r11 - vpxor %xmm10,%xmm8,%xmm8 - xorq %rbx,%rdi - shrdq $28,%r14,%r14 - vpsrlq $6,%xmm7,%xmm11 - addq %r11,%rdx - addq %rdi,%r11 - vpxor %xmm9,%xmm8,%xmm8 - movq %rdx,%r13 - addq %r11,%r14 - vpsllq $3,%xmm7,%xmm10 - shrdq $23,%r13,%r13 - movq %r14,%r11 - vpaddq %xmm8,%xmm0,%xmm0 - movq %r8,%r12 - shrdq $5,%r14,%r14 - vpsrlq $19,%xmm7,%xmm9 - xorq %rdx,%r13 - xorq %r9,%r12 - vpxor %xmm10,%xmm11,%xmm11 - shrdq $4,%r13,%r13 - xorq %r11,%r14 - vpsllq $42,%xmm10,%xmm10 - andq %rdx,%r12 - xorq %rdx,%r13 - vpxor %xmm9,%xmm11,%xmm11 - addq 8(%rsp),%r10 - movq %r11,%rdi - vpsrlq $42,%xmm9,%xmm9 - xorq %r9,%r12 - shrdq $6,%r14,%r14 - vpxor %xmm10,%xmm11,%xmm11 - xorq %rax,%rdi - addq %r12,%r10 - vpxor %xmm9,%xmm11,%xmm11 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - vpaddq %xmm11,%xmm0,%xmm0 - xorq %r11,%r14 - addq %r13,%r10 - vpaddq -128(%rbp),%xmm0,%xmm10 - xorq %rax,%r15 - shrdq $28,%r14,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - vmovdqa %xmm10,0(%rsp) - vpalignr $8,%xmm1,%xmm2,%xmm8 - shrdq $23,%r13,%r13 - movq %r14,%r10 - vpalignr $8,%xmm5,%xmm6,%xmm11 - movq %rdx,%r12 - shrdq $5,%r14,%r14 - vpsrlq $1,%xmm8,%xmm10 - xorq %rcx,%r13 - xorq %r8,%r12 - vpaddq %xmm11,%xmm1,%xmm1 - shrdq $4,%r13,%r13 - xorq %r10,%r14 - vpsrlq $7,%xmm8,%xmm11 - andq %rcx,%r12 - xorq %rcx,%r13 - vpsllq $56,%xmm8,%xmm9 - addq 16(%rsp),%r9 - movq %r10,%r15 - vpxor %xmm10,%xmm11,%xmm8 - xorq %r8,%r12 - shrdq $6,%r14,%r14 - vpsrlq $7,%xmm10,%xmm10 - xorq 
%r11,%r15 - addq %r12,%r9 - vpxor %xmm9,%xmm8,%xmm8 - shrdq $14,%r13,%r13 - andq %r15,%rdi - vpsllq $7,%xmm9,%xmm9 - xorq %r10,%r14 - addq %r13,%r9 - vpxor %xmm10,%xmm8,%xmm8 - xorq %r11,%rdi - shrdq $28,%r14,%r14 - vpsrlq $6,%xmm0,%xmm11 - addq %r9,%rbx - addq %rdi,%r9 - vpxor %xmm9,%xmm8,%xmm8 - movq %rbx,%r13 - addq %r9,%r14 - vpsllq $3,%xmm0,%xmm10 - shrdq $23,%r13,%r13 - movq %r14,%r9 - vpaddq %xmm8,%xmm1,%xmm1 - movq %rcx,%r12 - shrdq $5,%r14,%r14 - vpsrlq $19,%xmm0,%xmm9 - xorq %rbx,%r13 - xorq %rdx,%r12 - vpxor %xmm10,%xmm11,%xmm11 - shrdq $4,%r13,%r13 - xorq %r9,%r14 - vpsllq $42,%xmm10,%xmm10 - andq %rbx,%r12 - xorq %rbx,%r13 - vpxor %xmm9,%xmm11,%xmm11 - addq 24(%rsp),%r8 - movq %r9,%rdi - vpsrlq $42,%xmm9,%xmm9 - xorq %rdx,%r12 - shrdq $6,%r14,%r14 - vpxor %xmm10,%xmm11,%xmm11 - xorq %r10,%rdi - addq %r12,%r8 - vpxor %xmm9,%xmm11,%xmm11 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - vpaddq %xmm11,%xmm1,%xmm1 - xorq %r9,%r14 - addq %r13,%r8 - vpaddq -96(%rbp),%xmm1,%xmm10 - xorq %r10,%r15 - shrdq $28,%r14,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - vmovdqa %xmm10,16(%rsp) - vpalignr $8,%xmm2,%xmm3,%xmm8 - shrdq $23,%r13,%r13 - movq %r14,%r8 - vpalignr $8,%xmm6,%xmm7,%xmm11 - movq %rbx,%r12 - shrdq $5,%r14,%r14 - vpsrlq $1,%xmm8,%xmm10 - xorq %rax,%r13 - xorq %rcx,%r12 - vpaddq %xmm11,%xmm2,%xmm2 - shrdq $4,%r13,%r13 - xorq %r8,%r14 - vpsrlq $7,%xmm8,%xmm11 - andq %rax,%r12 - xorq %rax,%r13 - vpsllq $56,%xmm8,%xmm9 - addq 32(%rsp),%rdx - movq %r8,%r15 - vpxor %xmm10,%xmm11,%xmm8 - xorq %rcx,%r12 - shrdq $6,%r14,%r14 - vpsrlq $7,%xmm10,%xmm10 - xorq %r9,%r15 - addq %r12,%rdx - vpxor %xmm9,%xmm8,%xmm8 - shrdq $14,%r13,%r13 - andq %r15,%rdi - vpsllq $7,%xmm9,%xmm9 - xorq %r8,%r14 - addq %r13,%rdx - vpxor %xmm10,%xmm8,%xmm8 - xorq %r9,%rdi - shrdq $28,%r14,%r14 - vpsrlq $6,%xmm1,%xmm11 - addq %rdx,%r11 - addq %rdi,%rdx - vpxor %xmm9,%xmm8,%xmm8 - movq %r11,%r13 - addq %rdx,%r14 - vpsllq $3,%xmm1,%xmm10 - shrdq $23,%r13,%r13 - movq 
%r14,%rdx - vpaddq %xmm8,%xmm2,%xmm2 - movq %rax,%r12 - shrdq $5,%r14,%r14 - vpsrlq $19,%xmm1,%xmm9 - xorq %r11,%r13 - xorq %rbx,%r12 - vpxor %xmm10,%xmm11,%xmm11 - shrdq $4,%r13,%r13 - xorq %rdx,%r14 - vpsllq $42,%xmm10,%xmm10 - andq %r11,%r12 - xorq %r11,%r13 - vpxor %xmm9,%xmm11,%xmm11 - addq 40(%rsp),%rcx - movq %rdx,%rdi - vpsrlq $42,%xmm9,%xmm9 - xorq %rbx,%r12 - shrdq $6,%r14,%r14 - vpxor %xmm10,%xmm11,%xmm11 - xorq %r8,%rdi - addq %r12,%rcx - vpxor %xmm9,%xmm11,%xmm11 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - vpaddq %xmm11,%xmm2,%xmm2 - xorq %rdx,%r14 - addq %r13,%rcx - vpaddq -64(%rbp),%xmm2,%xmm10 - xorq %r8,%r15 - shrdq $28,%r14,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - vmovdqa %xmm10,32(%rsp) - vpalignr $8,%xmm3,%xmm4,%xmm8 - shrdq $23,%r13,%r13 - movq %r14,%rcx - vpalignr $8,%xmm7,%xmm0,%xmm11 - movq %r11,%r12 - shrdq $5,%r14,%r14 - vpsrlq $1,%xmm8,%xmm10 - xorq %r10,%r13 - xorq %rax,%r12 - vpaddq %xmm11,%xmm3,%xmm3 - shrdq $4,%r13,%r13 - xorq %rcx,%r14 - vpsrlq $7,%xmm8,%xmm11 - andq %r10,%r12 - xorq %r10,%r13 - vpsllq $56,%xmm8,%xmm9 - addq 48(%rsp),%rbx - movq %rcx,%r15 - vpxor %xmm10,%xmm11,%xmm8 - xorq %rax,%r12 - shrdq $6,%r14,%r14 - vpsrlq $7,%xmm10,%xmm10 - xorq %rdx,%r15 - addq %r12,%rbx - vpxor %xmm9,%xmm8,%xmm8 - shrdq $14,%r13,%r13 - andq %r15,%rdi - vpsllq $7,%xmm9,%xmm9 - xorq %rcx,%r14 - addq %r13,%rbx - vpxor %xmm10,%xmm8,%xmm8 - xorq %rdx,%rdi - shrdq $28,%r14,%r14 - vpsrlq $6,%xmm2,%xmm11 - addq %rbx,%r9 - addq %rdi,%rbx - vpxor %xmm9,%xmm8,%xmm8 - movq %r9,%r13 - addq %rbx,%r14 - vpsllq $3,%xmm2,%xmm10 - shrdq $23,%r13,%r13 - movq %r14,%rbx - vpaddq %xmm8,%xmm3,%xmm3 - movq %r10,%r12 - shrdq $5,%r14,%r14 - vpsrlq $19,%xmm2,%xmm9 - xorq %r9,%r13 - xorq %r11,%r12 - vpxor %xmm10,%xmm11,%xmm11 - shrdq $4,%r13,%r13 - xorq %rbx,%r14 - vpsllq $42,%xmm10,%xmm10 - andq %r9,%r12 - xorq %r9,%r13 - vpxor %xmm9,%xmm11,%xmm11 - addq 56(%rsp),%rax - movq %rbx,%rdi - vpsrlq $42,%xmm9,%xmm9 - xorq %r11,%r12 - shrdq 
$6,%r14,%r14 - vpxor %xmm10,%xmm11,%xmm11 - xorq %rcx,%rdi - addq %r12,%rax - vpxor %xmm9,%xmm11,%xmm11 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - vpaddq %xmm11,%xmm3,%xmm3 - xorq %rbx,%r14 - addq %r13,%rax - vpaddq -32(%rbp),%xmm3,%xmm10 - xorq %rcx,%r15 - shrdq $28,%r14,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - vmovdqa %xmm10,48(%rsp) - vpalignr $8,%xmm4,%xmm5,%xmm8 - shrdq $23,%r13,%r13 - movq %r14,%rax - vpalignr $8,%xmm0,%xmm1,%xmm11 - movq %r9,%r12 - shrdq $5,%r14,%r14 - vpsrlq $1,%xmm8,%xmm10 - xorq %r8,%r13 - xorq %r10,%r12 - vpaddq %xmm11,%xmm4,%xmm4 - shrdq $4,%r13,%r13 - xorq %rax,%r14 - vpsrlq $7,%xmm8,%xmm11 - andq %r8,%r12 - xorq %r8,%r13 - vpsllq $56,%xmm8,%xmm9 - addq 64(%rsp),%r11 - movq %rax,%r15 - vpxor %xmm10,%xmm11,%xmm8 - xorq %r10,%r12 - shrdq $6,%r14,%r14 - vpsrlq $7,%xmm10,%xmm10 - xorq %rbx,%r15 - addq %r12,%r11 - vpxor %xmm9,%xmm8,%xmm8 - shrdq $14,%r13,%r13 - andq %r15,%rdi - vpsllq $7,%xmm9,%xmm9 - xorq %rax,%r14 - addq %r13,%r11 - vpxor %xmm10,%xmm8,%xmm8 - xorq %rbx,%rdi - shrdq $28,%r14,%r14 - vpsrlq $6,%xmm3,%xmm11 - addq %r11,%rdx - addq %rdi,%r11 - vpxor %xmm9,%xmm8,%xmm8 - movq %rdx,%r13 - addq %r11,%r14 - vpsllq $3,%xmm3,%xmm10 - shrdq $23,%r13,%r13 - movq %r14,%r11 - vpaddq %xmm8,%xmm4,%xmm4 - movq %r8,%r12 - shrdq $5,%r14,%r14 - vpsrlq $19,%xmm3,%xmm9 - xorq %rdx,%r13 - xorq %r9,%r12 - vpxor %xmm10,%xmm11,%xmm11 - shrdq $4,%r13,%r13 - xorq %r11,%r14 - vpsllq $42,%xmm10,%xmm10 - andq %rdx,%r12 - xorq %rdx,%r13 - vpxor %xmm9,%xmm11,%xmm11 - addq 72(%rsp),%r10 - movq %r11,%rdi - vpsrlq $42,%xmm9,%xmm9 - xorq %r9,%r12 - shrdq $6,%r14,%r14 - vpxor %xmm10,%xmm11,%xmm11 - xorq %rax,%rdi - addq %r12,%r10 - vpxor %xmm9,%xmm11,%xmm11 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - vpaddq %xmm11,%xmm4,%xmm4 - xorq %r11,%r14 - addq %r13,%r10 - vpaddq 0(%rbp),%xmm4,%xmm10 - xorq %rax,%r15 - shrdq $28,%r14,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - vmovdqa %xmm10,64(%rsp) - vpalignr 
$8,%xmm5,%xmm6,%xmm8 - shrdq $23,%r13,%r13 - movq %r14,%r10 - vpalignr $8,%xmm1,%xmm2,%xmm11 - movq %rdx,%r12 - shrdq $5,%r14,%r14 - vpsrlq $1,%xmm8,%xmm10 - xorq %rcx,%r13 - xorq %r8,%r12 - vpaddq %xmm11,%xmm5,%xmm5 - shrdq $4,%r13,%r13 - xorq %r10,%r14 - vpsrlq $7,%xmm8,%xmm11 - andq %rcx,%r12 - xorq %rcx,%r13 - vpsllq $56,%xmm8,%xmm9 - addq 80(%rsp),%r9 - movq %r10,%r15 - vpxor %xmm10,%xmm11,%xmm8 - xorq %r8,%r12 - shrdq $6,%r14,%r14 - vpsrlq $7,%xmm10,%xmm10 - xorq %r11,%r15 - addq %r12,%r9 - vpxor %xmm9,%xmm8,%xmm8 - shrdq $14,%r13,%r13 - andq %r15,%rdi - vpsllq $7,%xmm9,%xmm9 - xorq %r10,%r14 - addq %r13,%r9 - vpxor %xmm10,%xmm8,%xmm8 - xorq %r11,%rdi - shrdq $28,%r14,%r14 - vpsrlq $6,%xmm4,%xmm11 - addq %r9,%rbx - addq %rdi,%r9 - vpxor %xmm9,%xmm8,%xmm8 - movq %rbx,%r13 - addq %r9,%r14 - vpsllq $3,%xmm4,%xmm10 - shrdq $23,%r13,%r13 - movq %r14,%r9 - vpaddq %xmm8,%xmm5,%xmm5 - movq %rcx,%r12 - shrdq $5,%r14,%r14 - vpsrlq $19,%xmm4,%xmm9 - xorq %rbx,%r13 - xorq %rdx,%r12 - vpxor %xmm10,%xmm11,%xmm11 - shrdq $4,%r13,%r13 - xorq %r9,%r14 - vpsllq $42,%xmm10,%xmm10 - andq %rbx,%r12 - xorq %rbx,%r13 - vpxor %xmm9,%xmm11,%xmm11 - addq 88(%rsp),%r8 - movq %r9,%rdi - vpsrlq $42,%xmm9,%xmm9 - xorq %rdx,%r12 - shrdq $6,%r14,%r14 - vpxor %xmm10,%xmm11,%xmm11 - xorq %r10,%rdi - addq %r12,%r8 - vpxor %xmm9,%xmm11,%xmm11 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - vpaddq %xmm11,%xmm5,%xmm5 - xorq %r9,%r14 - addq %r13,%r8 - vpaddq 32(%rbp),%xmm5,%xmm10 - xorq %r10,%r15 - shrdq $28,%r14,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - vmovdqa %xmm10,80(%rsp) - vpalignr $8,%xmm6,%xmm7,%xmm8 - shrdq $23,%r13,%r13 - movq %r14,%r8 - vpalignr $8,%xmm2,%xmm3,%xmm11 - movq %rbx,%r12 - shrdq $5,%r14,%r14 - vpsrlq $1,%xmm8,%xmm10 - xorq %rax,%r13 - xorq %rcx,%r12 - vpaddq %xmm11,%xmm6,%xmm6 - shrdq $4,%r13,%r13 - xorq %r8,%r14 - vpsrlq $7,%xmm8,%xmm11 - andq %rax,%r12 - xorq %rax,%r13 - vpsllq $56,%xmm8,%xmm9 - addq 96(%rsp),%rdx - movq %r8,%r15 - vpxor 
%xmm10,%xmm11,%xmm8 - xorq %rcx,%r12 - shrdq $6,%r14,%r14 - vpsrlq $7,%xmm10,%xmm10 - xorq %r9,%r15 - addq %r12,%rdx - vpxor %xmm9,%xmm8,%xmm8 - shrdq $14,%r13,%r13 - andq %r15,%rdi - vpsllq $7,%xmm9,%xmm9 - xorq %r8,%r14 - addq %r13,%rdx - vpxor %xmm10,%xmm8,%xmm8 - xorq %r9,%rdi - shrdq $28,%r14,%r14 - vpsrlq $6,%xmm5,%xmm11 - addq %rdx,%r11 - addq %rdi,%rdx - vpxor %xmm9,%xmm8,%xmm8 - movq %r11,%r13 - addq %rdx,%r14 - vpsllq $3,%xmm5,%xmm10 - shrdq $23,%r13,%r13 - movq %r14,%rdx - vpaddq %xmm8,%xmm6,%xmm6 - movq %rax,%r12 - shrdq $5,%r14,%r14 - vpsrlq $19,%xmm5,%xmm9 - xorq %r11,%r13 - xorq %rbx,%r12 - vpxor %xmm10,%xmm11,%xmm11 - shrdq $4,%r13,%r13 - xorq %rdx,%r14 - vpsllq $42,%xmm10,%xmm10 - andq %r11,%r12 - xorq %r11,%r13 - vpxor %xmm9,%xmm11,%xmm11 - addq 104(%rsp),%rcx - movq %rdx,%rdi - vpsrlq $42,%xmm9,%xmm9 - xorq %rbx,%r12 - shrdq $6,%r14,%r14 - vpxor %xmm10,%xmm11,%xmm11 - xorq %r8,%rdi - addq %r12,%rcx - vpxor %xmm9,%xmm11,%xmm11 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - vpaddq %xmm11,%xmm6,%xmm6 - xorq %rdx,%r14 - addq %r13,%rcx - vpaddq 64(%rbp),%xmm6,%xmm10 - xorq %r8,%r15 - shrdq $28,%r14,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - vmovdqa %xmm10,96(%rsp) - vpalignr $8,%xmm7,%xmm0,%xmm8 - shrdq $23,%r13,%r13 - movq %r14,%rcx - vpalignr $8,%xmm3,%xmm4,%xmm11 - movq %r11,%r12 - shrdq $5,%r14,%r14 - vpsrlq $1,%xmm8,%xmm10 - xorq %r10,%r13 - xorq %rax,%r12 - vpaddq %xmm11,%xmm7,%xmm7 - shrdq $4,%r13,%r13 - xorq %rcx,%r14 - vpsrlq $7,%xmm8,%xmm11 - andq %r10,%r12 - xorq %r10,%r13 - vpsllq $56,%xmm8,%xmm9 - addq 112(%rsp),%rbx - movq %rcx,%r15 - vpxor %xmm10,%xmm11,%xmm8 - xorq %rax,%r12 - shrdq $6,%r14,%r14 - vpsrlq $7,%xmm10,%xmm10 - xorq %rdx,%r15 - addq %r12,%rbx - vpxor %xmm9,%xmm8,%xmm8 - shrdq $14,%r13,%r13 - andq %r15,%rdi - vpsllq $7,%xmm9,%xmm9 - xorq %rcx,%r14 - addq %r13,%rbx - vpxor %xmm10,%xmm8,%xmm8 - xorq %rdx,%rdi - shrdq $28,%r14,%r14 - vpsrlq $6,%xmm6,%xmm11 - addq %rbx,%r9 - addq %rdi,%rbx - vpxor 
%xmm9,%xmm8,%xmm8 - movq %r9,%r13 - addq %rbx,%r14 - vpsllq $3,%xmm6,%xmm10 - shrdq $23,%r13,%r13 - movq %r14,%rbx - vpaddq %xmm8,%xmm7,%xmm7 - movq %r10,%r12 - shrdq $5,%r14,%r14 - vpsrlq $19,%xmm6,%xmm9 - xorq %r9,%r13 - xorq %r11,%r12 - vpxor %xmm10,%xmm11,%xmm11 - shrdq $4,%r13,%r13 - xorq %rbx,%r14 - vpsllq $42,%xmm10,%xmm10 - andq %r9,%r12 - xorq %r9,%r13 - vpxor %xmm9,%xmm11,%xmm11 - addq 120(%rsp),%rax - movq %rbx,%rdi - vpsrlq $42,%xmm9,%xmm9 - xorq %r11,%r12 - shrdq $6,%r14,%r14 - vpxor %xmm10,%xmm11,%xmm11 - xorq %rcx,%rdi - addq %r12,%rax - vpxor %xmm9,%xmm11,%xmm11 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - vpaddq %xmm11,%xmm7,%xmm7 - xorq %rbx,%r14 - addq %r13,%rax - vpaddq 96(%rbp),%xmm7,%xmm10 - xorq %rcx,%r15 - shrdq $28,%r14,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - vmovdqa %xmm10,112(%rsp) - cmpb $0,135(%rbp) - jne L$avx_00_47 - shrdq $23,%r13,%r13 - movq %r14,%rax - movq %r9,%r12 - shrdq $5,%r14,%r14 - xorq %r8,%r13 - xorq %r10,%r12 - shrdq $4,%r13,%r13 - xorq %rax,%r14 - andq %r8,%r12 - xorq %r8,%r13 - addq 0(%rsp),%r11 - movq %rax,%r15 - xorq %r10,%r12 - shrdq $6,%r14,%r14 - xorq %rbx,%r15 - addq %r12,%r11 - shrdq $14,%r13,%r13 - andq %r15,%rdi - xorq %rax,%r14 - addq %r13,%r11 - xorq %rbx,%rdi - shrdq $28,%r14,%r14 - addq %r11,%rdx - addq %rdi,%r11 - movq %rdx,%r13 - addq %r11,%r14 - shrdq $23,%r13,%r13 - movq %r14,%r11 - movq %r8,%r12 - shrdq $5,%r14,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - shrdq $4,%r13,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - addq 8(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - shrdq $6,%r14,%r14 - xorq %rax,%rdi - addq %r12,%r10 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - shrdq $28,%r14,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - shrdq $23,%r13,%r13 - movq %r14,%r10 - movq %rdx,%r12 - shrdq $5,%r14,%r14 - xorq %rcx,%r13 - xorq %r8,%r12 - shrdq $4,%r13,%r13 - xorq %r10,%r14 - andq %rcx,%r12 - xorq 
%rcx,%r13 - addq 16(%rsp),%r9 - movq %r10,%r15 - xorq %r8,%r12 - shrdq $6,%r14,%r14 - xorq %r11,%r15 - addq %r12,%r9 - shrdq $14,%r13,%r13 - andq %r15,%rdi - xorq %r10,%r14 - addq %r13,%r9 - xorq %r11,%rdi - shrdq $28,%r14,%r14 - addq %r9,%rbx - addq %rdi,%r9 - movq %rbx,%r13 - addq %r9,%r14 - shrdq $23,%r13,%r13 - movq %r14,%r9 - movq %rcx,%r12 - shrdq $5,%r14,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - shrdq $4,%r13,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - addq 24(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - shrdq $6,%r14,%r14 - xorq %r10,%rdi - addq %r12,%r8 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - shrdq $28,%r14,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - shrdq $23,%r13,%r13 - movq %r14,%r8 - movq %rbx,%r12 - shrdq $5,%r14,%r14 - xorq %rax,%r13 - xorq %rcx,%r12 - shrdq $4,%r13,%r13 - xorq %r8,%r14 - andq %rax,%r12 - xorq %rax,%r13 - addq 32(%rsp),%rdx - movq %r8,%r15 - xorq %rcx,%r12 - shrdq $6,%r14,%r14 - xorq %r9,%r15 - addq %r12,%rdx - shrdq $14,%r13,%r13 - andq %r15,%rdi - xorq %r8,%r14 - addq %r13,%rdx - xorq %r9,%rdi - shrdq $28,%r14,%r14 - addq %rdx,%r11 - addq %rdi,%rdx - movq %r11,%r13 - addq %rdx,%r14 - shrdq $23,%r13,%r13 - movq %r14,%rdx - movq %rax,%r12 - shrdq $5,%r14,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - shrdq $4,%r13,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - addq 40(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - shrdq $6,%r14,%r14 - xorq %r8,%rdi - addq %r12,%rcx - shrdq $14,%r13,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - shrdq $28,%r14,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - shrdq $23,%r13,%r13 - movq %r14,%rcx - movq %r11,%r12 - shrdq $5,%r14,%r14 - xorq %r10,%r13 - xorq %rax,%r12 - shrdq $4,%r13,%r13 - xorq %rcx,%r14 - andq %r10,%r12 - xorq %r10,%r13 - addq 48(%rsp),%rbx - movq %rcx,%r15 - xorq %rax,%r12 - shrdq $6,%r14,%r14 - xorq %rdx,%r15 - addq %r12,%rbx - shrdq 
$14,%r13,%r13 - andq %r15,%rdi - xorq %rcx,%r14 - addq %r13,%rbx - xorq %rdx,%rdi - shrdq $28,%r14,%r14 - addq %rbx,%r9 - addq %rdi,%rbx - movq %r9,%r13 - addq %rbx,%r14 - shrdq $23,%r13,%r13 - movq %r14,%rbx - movq %r10,%r12 - shrdq $5,%r14,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - shrdq $4,%r13,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - addq 56(%rsp),%rax - movq %rbx,%rdi - xorq %r11,%r12 - shrdq $6,%r14,%r14 - xorq %rcx,%rdi - addq %r12,%rax - shrdq $14,%r13,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - shrdq $28,%r14,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - shrdq $23,%r13,%r13 - movq %r14,%rax - movq %r9,%r12 - shrdq $5,%r14,%r14 - xorq %r8,%r13 - xorq %r10,%r12 - shrdq $4,%r13,%r13 - xorq %rax,%r14 - andq %r8,%r12 - xorq %r8,%r13 - addq 64(%rsp),%r11 - movq %rax,%r15 - xorq %r10,%r12 - shrdq $6,%r14,%r14 - xorq %rbx,%r15 - addq %r12,%r11 - shrdq $14,%r13,%r13 - andq %r15,%rdi - xorq %rax,%r14 - addq %r13,%r11 - xorq %rbx,%rdi - shrdq $28,%r14,%r14 - addq %r11,%rdx - addq %rdi,%r11 - movq %rdx,%r13 - addq %r11,%r14 - shrdq $23,%r13,%r13 - movq %r14,%r11 - movq %r8,%r12 - shrdq $5,%r14,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - shrdq $4,%r13,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - addq 72(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - shrdq $6,%r14,%r14 - xorq %rax,%rdi - addq %r12,%r10 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - shrdq $28,%r14,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - shrdq $23,%r13,%r13 - movq %r14,%r10 - movq %rdx,%r12 - shrdq $5,%r14,%r14 - xorq %rcx,%r13 - xorq %r8,%r12 - shrdq $4,%r13,%r13 - xorq %r10,%r14 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 80(%rsp),%r9 - movq %r10,%r15 - xorq %r8,%r12 - shrdq $6,%r14,%r14 - xorq %r11,%r15 - addq %r12,%r9 - shrdq $14,%r13,%r13 - andq %r15,%rdi - xorq %r10,%r14 - addq %r13,%r9 - xorq %r11,%rdi - shrdq $28,%r14,%r14 - addq %r9,%rbx - addq 
%rdi,%r9 - movq %rbx,%r13 - addq %r9,%r14 - shrdq $23,%r13,%r13 - movq %r14,%r9 - movq %rcx,%r12 - shrdq $5,%r14,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - shrdq $4,%r13,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - addq 88(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - shrdq $6,%r14,%r14 - xorq %r10,%rdi - addq %r12,%r8 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - shrdq $28,%r14,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - shrdq $23,%r13,%r13 - movq %r14,%r8 - movq %rbx,%r12 - shrdq $5,%r14,%r14 - xorq %rax,%r13 - xorq %rcx,%r12 - shrdq $4,%r13,%r13 - xorq %r8,%r14 - andq %rax,%r12 - xorq %rax,%r13 - addq 96(%rsp),%rdx - movq %r8,%r15 - xorq %rcx,%r12 - shrdq $6,%r14,%r14 - xorq %r9,%r15 - addq %r12,%rdx - shrdq $14,%r13,%r13 - andq %r15,%rdi - xorq %r8,%r14 - addq %r13,%rdx - xorq %r9,%rdi - shrdq $28,%r14,%r14 - addq %rdx,%r11 - addq %rdi,%rdx - movq %r11,%r13 - addq %rdx,%r14 - shrdq $23,%r13,%r13 - movq %r14,%rdx - movq %rax,%r12 - shrdq $5,%r14,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - shrdq $4,%r13,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - addq 104(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - shrdq $6,%r14,%r14 - xorq %r8,%rdi - addq %r12,%rcx - shrdq $14,%r13,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - shrdq $28,%r14,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - shrdq $23,%r13,%r13 - movq %r14,%rcx - movq %r11,%r12 - shrdq $5,%r14,%r14 - xorq %r10,%r13 - xorq %rax,%r12 - shrdq $4,%r13,%r13 - xorq %rcx,%r14 - andq %r10,%r12 - xorq %r10,%r13 - addq 112(%rsp),%rbx - movq %rcx,%r15 - xorq %rax,%r12 - shrdq $6,%r14,%r14 - xorq %rdx,%r15 - addq %r12,%rbx - shrdq $14,%r13,%r13 - andq %r15,%rdi - xorq %rcx,%r14 - addq %r13,%rbx - xorq %rdx,%rdi - shrdq $28,%r14,%r14 - addq %rbx,%r9 - addq %rdi,%rbx - movq %r9,%r13 - addq %rbx,%r14 - shrdq $23,%r13,%r13 - movq %r14,%rbx - movq %r10,%r12 - shrdq $5,%r14,%r14 - xorq 
%r9,%r13 - xorq %r11,%r12 - shrdq $4,%r13,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - addq 120(%rsp),%rax - movq %rbx,%rdi - xorq %r11,%r12 - shrdq $6,%r14,%r14 - xorq %rcx,%rdi - addq %r12,%rax - shrdq $14,%r13,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - shrdq $28,%r14,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - movq 128+0(%rsp),%rdi - movq %r14,%rax - - addq 0(%rdi),%rax - leaq 128(%rsi),%rsi - addq 8(%rdi),%rbx - addq 16(%rdi),%rcx - addq 24(%rdi),%rdx - addq 32(%rdi),%r8 - addq 40(%rdi),%r9 - addq 48(%rdi),%r10 - addq 56(%rdi),%r11 - - cmpq 128+16(%rsp),%rsi - - movq %rax,0(%rdi) - movq %rbx,8(%rdi) - movq %rcx,16(%rdi) - movq %rdx,24(%rdi) - movq %r8,32(%rdi) - movq %r9,40(%rdi) - movq %r10,48(%rdi) - movq %r11,56(%rdi) - jb L$loop_avx - - movq 152(%rsp),%rsi - - vzeroupper - movq -48(%rsi),%r15 - - movq -40(%rsi),%r14 - - movq -32(%rsi),%r13 - - movq -24(%rsi),%r12 - - movq -16(%rsi),%rbp - - movq -8(%rsi),%rbx - - leaq (%rsi),%rsp - -L$epilogue_avx: - .byte 0xf3,0xc3 - - -#endif diff --git a/third_party/boringssl/apple-x86_64/crypto/fipsmodule/vpaes-x86_64.S b/third_party/boringssl/apple-x86_64/crypto/fipsmodule/vpaes-x86_64.S deleted file mode 100644 index 31cf3290..00000000 --- a/third_party/boringssl/apple-x86_64/crypto/fipsmodule/vpaes-x86_64.S +++ /dev/null @@ -1,1130 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text - - - - - - - - - - - - - - - - - -.p2align 4 -_vpaes_encrypt_core: - - movq %rdx,%r9 - movq $16,%r11 - movl 240(%rdx),%eax - movdqa %xmm9,%xmm1 - movdqa L$k_ipt(%rip),%xmm2 - pandn %xmm0,%xmm1 - movdqu (%r9),%xmm5 - psrld $4,%xmm1 - pand %xmm9,%xmm0 -.byte 102,15,56,0,208 - movdqa L$k_ipt+16(%rip),%xmm0 -.byte 102,15,56,0,193 - pxor %xmm5,%xmm2 - addq $16,%r9 - pxor %xmm2,%xmm0 - leaq L$k_mc_backward(%rip),%r10 - jmp L$enc_entry - -.p2align 4 -L$enc_loop: - - movdqa %xmm13,%xmm4 - movdqa %xmm12,%xmm0 -.byte 102,15,56,0,226 -.byte 102,15,56,0,195 - pxor %xmm5,%xmm4 - movdqa %xmm15,%xmm5 - pxor %xmm4,%xmm0 - movdqa -64(%r11,%r10,1),%xmm1 -.byte 102,15,56,0,234 - movdqa (%r11,%r10,1),%xmm4 - movdqa %xmm14,%xmm2 -.byte 102,15,56,0,211 - movdqa %xmm0,%xmm3 - pxor %xmm5,%xmm2 -.byte 102,15,56,0,193 - addq $16,%r9 - pxor %xmm2,%xmm0 -.byte 102,15,56,0,220 - addq $16,%r11 - pxor %xmm0,%xmm3 -.byte 102,15,56,0,193 - andq $0x30,%r11 - subq $1,%rax - pxor %xmm3,%xmm0 - -L$enc_entry: - - movdqa %xmm9,%xmm1 - movdqa %xmm11,%xmm5 - pandn %xmm0,%xmm1 - psrld $4,%xmm1 - pand %xmm9,%xmm0 -.byte 102,15,56,0,232 - movdqa %xmm10,%xmm3 - pxor %xmm1,%xmm0 -.byte 102,15,56,0,217 - movdqa %xmm10,%xmm4 - pxor %xmm5,%xmm3 -.byte 102,15,56,0,224 - movdqa %xmm10,%xmm2 - pxor %xmm5,%xmm4 -.byte 102,15,56,0,211 - movdqa %xmm10,%xmm3 - pxor %xmm0,%xmm2 -.byte 102,15,56,0,220 - movdqu (%r9),%xmm5 - pxor %xmm1,%xmm3 - jnz L$enc_loop - - - movdqa -96(%r10),%xmm4 - movdqa -80(%r10),%xmm0 -.byte 102,15,56,0,226 - pxor %xmm5,%xmm4 -.byte 102,15,56,0,195 - movdqa 64(%r11,%r10,1),%xmm1 - pxor %xmm4,%xmm0 -.byte 102,15,56,0,193 - .byte 0xf3,0xc3 - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.p2align 4 -_vpaes_encrypt_core_2x: - - movq 
%rdx,%r9 - movq $16,%r11 - movl 240(%rdx),%eax - movdqa %xmm9,%xmm1 - movdqa %xmm9,%xmm7 - movdqa L$k_ipt(%rip),%xmm2 - movdqa %xmm2,%xmm8 - pandn %xmm0,%xmm1 - pandn %xmm6,%xmm7 - movdqu (%r9),%xmm5 - - psrld $4,%xmm1 - psrld $4,%xmm7 - pand %xmm9,%xmm0 - pand %xmm9,%xmm6 -.byte 102,15,56,0,208 -.byte 102,68,15,56,0,198 - movdqa L$k_ipt+16(%rip),%xmm0 - movdqa %xmm0,%xmm6 -.byte 102,15,56,0,193 -.byte 102,15,56,0,247 - pxor %xmm5,%xmm2 - pxor %xmm5,%xmm8 - addq $16,%r9 - pxor %xmm2,%xmm0 - pxor %xmm8,%xmm6 - leaq L$k_mc_backward(%rip),%r10 - jmp L$enc2x_entry - -.p2align 4 -L$enc2x_loop: - - movdqa L$k_sb1(%rip),%xmm4 - movdqa L$k_sb1+16(%rip),%xmm0 - movdqa %xmm4,%xmm12 - movdqa %xmm0,%xmm6 -.byte 102,15,56,0,226 -.byte 102,69,15,56,0,224 -.byte 102,15,56,0,195 -.byte 102,65,15,56,0,243 - pxor %xmm5,%xmm4 - pxor %xmm5,%xmm12 - movdqa L$k_sb2(%rip),%xmm5 - movdqa %xmm5,%xmm13 - pxor %xmm4,%xmm0 - pxor %xmm12,%xmm6 - movdqa -64(%r11,%r10,1),%xmm1 - -.byte 102,15,56,0,234 -.byte 102,69,15,56,0,232 - movdqa (%r11,%r10,1),%xmm4 - - movdqa L$k_sb2+16(%rip),%xmm2 - movdqa %xmm2,%xmm8 -.byte 102,15,56,0,211 -.byte 102,69,15,56,0,195 - movdqa %xmm0,%xmm3 - movdqa %xmm6,%xmm11 - pxor %xmm5,%xmm2 - pxor %xmm13,%xmm8 -.byte 102,15,56,0,193 -.byte 102,15,56,0,241 - addq $16,%r9 - pxor %xmm2,%xmm0 - pxor %xmm8,%xmm6 -.byte 102,15,56,0,220 -.byte 102,68,15,56,0,220 - addq $16,%r11 - pxor %xmm0,%xmm3 - pxor %xmm6,%xmm11 -.byte 102,15,56,0,193 -.byte 102,15,56,0,241 - andq $0x30,%r11 - subq $1,%rax - pxor %xmm3,%xmm0 - pxor %xmm11,%xmm6 - -L$enc2x_entry: - - movdqa %xmm9,%xmm1 - movdqa %xmm9,%xmm7 - movdqa L$k_inv+16(%rip),%xmm5 - movdqa %xmm5,%xmm13 - pandn %xmm0,%xmm1 - pandn %xmm6,%xmm7 - psrld $4,%xmm1 - psrld $4,%xmm7 - pand %xmm9,%xmm0 - pand %xmm9,%xmm6 -.byte 102,15,56,0,232 -.byte 102,68,15,56,0,238 - movdqa %xmm10,%xmm3 - movdqa %xmm10,%xmm11 - pxor %xmm1,%xmm0 - pxor %xmm7,%xmm6 -.byte 102,15,56,0,217 -.byte 102,68,15,56,0,223 - movdqa %xmm10,%xmm4 - movdqa 
%xmm10,%xmm12 - pxor %xmm5,%xmm3 - pxor %xmm13,%xmm11 -.byte 102,15,56,0,224 -.byte 102,68,15,56,0,230 - movdqa %xmm10,%xmm2 - movdqa %xmm10,%xmm8 - pxor %xmm5,%xmm4 - pxor %xmm13,%xmm12 -.byte 102,15,56,0,211 -.byte 102,69,15,56,0,195 - movdqa %xmm10,%xmm3 - movdqa %xmm10,%xmm11 - pxor %xmm0,%xmm2 - pxor %xmm6,%xmm8 -.byte 102,15,56,0,220 -.byte 102,69,15,56,0,220 - movdqu (%r9),%xmm5 - - pxor %xmm1,%xmm3 - pxor %xmm7,%xmm11 - jnz L$enc2x_loop - - - movdqa -96(%r10),%xmm4 - movdqa -80(%r10),%xmm0 - movdqa %xmm4,%xmm12 - movdqa %xmm0,%xmm6 -.byte 102,15,56,0,226 -.byte 102,69,15,56,0,224 - pxor %xmm5,%xmm4 - pxor %xmm5,%xmm12 -.byte 102,15,56,0,195 -.byte 102,65,15,56,0,243 - movdqa 64(%r11,%r10,1),%xmm1 - - pxor %xmm4,%xmm0 - pxor %xmm12,%xmm6 -.byte 102,15,56,0,193 -.byte 102,15,56,0,241 - .byte 0xf3,0xc3 - - - - - - - - - -.p2align 4 -_vpaes_decrypt_core: - - movq %rdx,%r9 - movl 240(%rdx),%eax - movdqa %xmm9,%xmm1 - movdqa L$k_dipt(%rip),%xmm2 - pandn %xmm0,%xmm1 - movq %rax,%r11 - psrld $4,%xmm1 - movdqu (%r9),%xmm5 - shlq $4,%r11 - pand %xmm9,%xmm0 -.byte 102,15,56,0,208 - movdqa L$k_dipt+16(%rip),%xmm0 - xorq $0x30,%r11 - leaq L$k_dsbd(%rip),%r10 -.byte 102,15,56,0,193 - andq $0x30,%r11 - pxor %xmm5,%xmm2 - movdqa L$k_mc_forward+48(%rip),%xmm5 - pxor %xmm2,%xmm0 - addq $16,%r9 - addq %r10,%r11 - jmp L$dec_entry - -.p2align 4 -L$dec_loop: - - - - movdqa -32(%r10),%xmm4 - movdqa -16(%r10),%xmm1 -.byte 102,15,56,0,226 -.byte 102,15,56,0,203 - pxor %xmm4,%xmm0 - movdqa 0(%r10),%xmm4 - pxor %xmm1,%xmm0 - movdqa 16(%r10),%xmm1 - -.byte 102,15,56,0,226 -.byte 102,15,56,0,197 -.byte 102,15,56,0,203 - pxor %xmm4,%xmm0 - movdqa 32(%r10),%xmm4 - pxor %xmm1,%xmm0 - movdqa 48(%r10),%xmm1 - -.byte 102,15,56,0,226 -.byte 102,15,56,0,197 -.byte 102,15,56,0,203 - pxor %xmm4,%xmm0 - movdqa 64(%r10),%xmm4 - pxor %xmm1,%xmm0 - movdqa 80(%r10),%xmm1 - -.byte 102,15,56,0,226 -.byte 102,15,56,0,197 -.byte 102,15,56,0,203 - pxor %xmm4,%xmm0 - addq $16,%r9 -.byte 102,15,58,15,237,12 
- pxor %xmm1,%xmm0 - subq $1,%rax - -L$dec_entry: - - movdqa %xmm9,%xmm1 - pandn %xmm0,%xmm1 - movdqa %xmm11,%xmm2 - psrld $4,%xmm1 - pand %xmm9,%xmm0 -.byte 102,15,56,0,208 - movdqa %xmm10,%xmm3 - pxor %xmm1,%xmm0 -.byte 102,15,56,0,217 - movdqa %xmm10,%xmm4 - pxor %xmm2,%xmm3 -.byte 102,15,56,0,224 - pxor %xmm2,%xmm4 - movdqa %xmm10,%xmm2 -.byte 102,15,56,0,211 - movdqa %xmm10,%xmm3 - pxor %xmm0,%xmm2 -.byte 102,15,56,0,220 - movdqu (%r9),%xmm0 - pxor %xmm1,%xmm3 - jnz L$dec_loop - - - movdqa 96(%r10),%xmm4 -.byte 102,15,56,0,226 - pxor %xmm0,%xmm4 - movdqa 112(%r10),%xmm0 - movdqa -352(%r11),%xmm2 -.byte 102,15,56,0,195 - pxor %xmm4,%xmm0 -.byte 102,15,56,0,194 - .byte 0xf3,0xc3 - - - - - - - - - -.p2align 4 -_vpaes_schedule_core: - - - - - - - call _vpaes_preheat - movdqa L$k_rcon(%rip),%xmm8 - movdqu (%rdi),%xmm0 - - - movdqa %xmm0,%xmm3 - leaq L$k_ipt(%rip),%r11 - call _vpaes_schedule_transform - movdqa %xmm0,%xmm7 - - leaq L$k_sr(%rip),%r10 - testq %rcx,%rcx - jnz L$schedule_am_decrypting - - - movdqu %xmm0,(%rdx) - jmp L$schedule_go - -L$schedule_am_decrypting: - - movdqa (%r8,%r10,1),%xmm1 -.byte 102,15,56,0,217 - movdqu %xmm3,(%rdx) - xorq $0x30,%r8 - -L$schedule_go: - cmpl $192,%esi - ja L$schedule_256 - je L$schedule_192 - - - - - - - - - - -L$schedule_128: - movl $10,%esi - -L$oop_schedule_128: - call _vpaes_schedule_round - decq %rsi - jz L$schedule_mangle_last - call _vpaes_schedule_mangle - jmp L$oop_schedule_128 - - - - - - - - - - - - - - - - -.p2align 4 -L$schedule_192: - movdqu 8(%rdi),%xmm0 - call _vpaes_schedule_transform - movdqa %xmm0,%xmm6 - pxor %xmm4,%xmm4 - movhlps %xmm4,%xmm6 - movl $4,%esi - -L$oop_schedule_192: - call _vpaes_schedule_round -.byte 102,15,58,15,198,8 - call _vpaes_schedule_mangle - call _vpaes_schedule_192_smear - call _vpaes_schedule_mangle - call _vpaes_schedule_round - decq %rsi - jz L$schedule_mangle_last - call _vpaes_schedule_mangle - call _vpaes_schedule_192_smear - jmp L$oop_schedule_192 - - - - - - - - - - - 
-.p2align 4 -L$schedule_256: - movdqu 16(%rdi),%xmm0 - call _vpaes_schedule_transform - movl $7,%esi - -L$oop_schedule_256: - call _vpaes_schedule_mangle - movdqa %xmm0,%xmm6 - - - call _vpaes_schedule_round - decq %rsi - jz L$schedule_mangle_last - call _vpaes_schedule_mangle - - - pshufd $0xFF,%xmm0,%xmm0 - movdqa %xmm7,%xmm5 - movdqa %xmm6,%xmm7 - call _vpaes_schedule_low_round - movdqa %xmm5,%xmm7 - - jmp L$oop_schedule_256 - - - - - - - - - - - - -.p2align 4 -L$schedule_mangle_last: - - leaq L$k_deskew(%rip),%r11 - testq %rcx,%rcx - jnz L$schedule_mangle_last_dec - - - movdqa (%r8,%r10,1),%xmm1 -.byte 102,15,56,0,193 - leaq L$k_opt(%rip),%r11 - addq $32,%rdx - -L$schedule_mangle_last_dec: - addq $-16,%rdx - pxor L$k_s63(%rip),%xmm0 - call _vpaes_schedule_transform - movdqu %xmm0,(%rdx) - - - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - .byte 0xf3,0xc3 - - - - - - - - - - - - - - - - - - -.p2align 4 -_vpaes_schedule_192_smear: - - pshufd $0x80,%xmm6,%xmm1 - pshufd $0xFE,%xmm7,%xmm0 - pxor %xmm1,%xmm6 - pxor %xmm1,%xmm1 - pxor %xmm0,%xmm6 - movdqa %xmm6,%xmm0 - movhlps %xmm1,%xmm6 - .byte 0xf3,0xc3 - - - - - - - - - - - - - - - - - - - - - - -.p2align 4 -_vpaes_schedule_round: - - - pxor %xmm1,%xmm1 -.byte 102,65,15,58,15,200,15 -.byte 102,69,15,58,15,192,15 - pxor %xmm1,%xmm7 - - - pshufd $0xFF,%xmm0,%xmm0 -.byte 102,15,58,15,192,1 - - - - -_vpaes_schedule_low_round: - - movdqa %xmm7,%xmm1 - pslldq $4,%xmm7 - pxor %xmm1,%xmm7 - movdqa %xmm7,%xmm1 - pslldq $8,%xmm7 - pxor %xmm1,%xmm7 - pxor L$k_s63(%rip),%xmm7 - - - movdqa %xmm9,%xmm1 - pandn %xmm0,%xmm1 - psrld $4,%xmm1 - pand %xmm9,%xmm0 - movdqa %xmm11,%xmm2 -.byte 102,15,56,0,208 - pxor %xmm1,%xmm0 - movdqa %xmm10,%xmm3 -.byte 102,15,56,0,217 - pxor %xmm2,%xmm3 - movdqa %xmm10,%xmm4 -.byte 102,15,56,0,224 - pxor %xmm2,%xmm4 - movdqa %xmm10,%xmm2 -.byte 102,15,56,0,211 - pxor %xmm0,%xmm2 - movdqa 
%xmm10,%xmm3 -.byte 102,15,56,0,220 - pxor %xmm1,%xmm3 - movdqa %xmm13,%xmm4 -.byte 102,15,56,0,226 - movdqa %xmm12,%xmm0 -.byte 102,15,56,0,195 - pxor %xmm4,%xmm0 - - - pxor %xmm7,%xmm0 - movdqa %xmm0,%xmm7 - .byte 0xf3,0xc3 - - - - - - - - - - - - - -.p2align 4 -_vpaes_schedule_transform: - - movdqa %xmm9,%xmm1 - pandn %xmm0,%xmm1 - psrld $4,%xmm1 - pand %xmm9,%xmm0 - movdqa (%r11),%xmm2 -.byte 102,15,56,0,208 - movdqa 16(%r11),%xmm0 -.byte 102,15,56,0,193 - pxor %xmm2,%xmm0 - .byte 0xf3,0xc3 - - - - - - - - - - - - - - - - - - - - - - - - - - - -.p2align 4 -_vpaes_schedule_mangle: - - movdqa %xmm0,%xmm4 - movdqa L$k_mc_forward(%rip),%xmm5 - testq %rcx,%rcx - jnz L$schedule_mangle_dec - - - addq $16,%rdx - pxor L$k_s63(%rip),%xmm4 -.byte 102,15,56,0,229 - movdqa %xmm4,%xmm3 -.byte 102,15,56,0,229 - pxor %xmm4,%xmm3 -.byte 102,15,56,0,229 - pxor %xmm4,%xmm3 - - jmp L$schedule_mangle_both -.p2align 4 -L$schedule_mangle_dec: - - leaq L$k_dksd(%rip),%r11 - movdqa %xmm9,%xmm1 - pandn %xmm4,%xmm1 - psrld $4,%xmm1 - pand %xmm9,%xmm4 - - movdqa 0(%r11),%xmm2 -.byte 102,15,56,0,212 - movdqa 16(%r11),%xmm3 -.byte 102,15,56,0,217 - pxor %xmm2,%xmm3 -.byte 102,15,56,0,221 - - movdqa 32(%r11),%xmm2 -.byte 102,15,56,0,212 - pxor %xmm3,%xmm2 - movdqa 48(%r11),%xmm3 -.byte 102,15,56,0,217 - pxor %xmm2,%xmm3 -.byte 102,15,56,0,221 - - movdqa 64(%r11),%xmm2 -.byte 102,15,56,0,212 - pxor %xmm3,%xmm2 - movdqa 80(%r11),%xmm3 -.byte 102,15,56,0,217 - pxor %xmm2,%xmm3 -.byte 102,15,56,0,221 - - movdqa 96(%r11),%xmm2 -.byte 102,15,56,0,212 - pxor %xmm3,%xmm2 - movdqa 112(%r11),%xmm3 -.byte 102,15,56,0,217 - pxor %xmm2,%xmm3 - - addq $-16,%rdx - -L$schedule_mangle_both: - movdqa (%r8,%r10,1),%xmm1 -.byte 102,15,56,0,217 - addq $-16,%r8 - andq $0x30,%r8 - movdqu %xmm3,(%rdx) - .byte 0xf3,0xc3 - - - - - - -.globl _vpaes_set_encrypt_key -.private_extern _vpaes_set_encrypt_key - -.p2align 4 -_vpaes_set_encrypt_key: - -#ifdef BORINGSSL_DISPATCH_TEST - - movb $1,_BORINGSSL_function_hit+5(%rip) 
-#endif - - movl %esi,%eax - shrl $5,%eax - addl $5,%eax - movl %eax,240(%rdx) - - movl $0,%ecx - movl $0x30,%r8d - call _vpaes_schedule_core - xorl %eax,%eax - .byte 0xf3,0xc3 - - - -.globl _vpaes_set_decrypt_key -.private_extern _vpaes_set_decrypt_key - -.p2align 4 -_vpaes_set_decrypt_key: - - movl %esi,%eax - shrl $5,%eax - addl $5,%eax - movl %eax,240(%rdx) - shll $4,%eax - leaq 16(%rdx,%rax,1),%rdx - - movl $1,%ecx - movl %esi,%r8d - shrl $1,%r8d - andl $32,%r8d - xorl $32,%r8d - call _vpaes_schedule_core - xorl %eax,%eax - .byte 0xf3,0xc3 - - - -.globl _vpaes_encrypt -.private_extern _vpaes_encrypt - -.p2align 4 -_vpaes_encrypt: - -#ifdef BORINGSSL_DISPATCH_TEST - - movb $1,_BORINGSSL_function_hit+4(%rip) -#endif - movdqu (%rdi),%xmm0 - call _vpaes_preheat - call _vpaes_encrypt_core - movdqu %xmm0,(%rsi) - .byte 0xf3,0xc3 - - - -.globl _vpaes_decrypt -.private_extern _vpaes_decrypt - -.p2align 4 -_vpaes_decrypt: - - movdqu (%rdi),%xmm0 - call _vpaes_preheat - call _vpaes_decrypt_core - movdqu %xmm0,(%rsi) - .byte 0xf3,0xc3 - - -.globl _vpaes_cbc_encrypt -.private_extern _vpaes_cbc_encrypt - -.p2align 4 -_vpaes_cbc_encrypt: - - xchgq %rcx,%rdx - subq $16,%rcx - jc L$cbc_abort - movdqu (%r8),%xmm6 - subq %rdi,%rsi - call _vpaes_preheat - cmpl $0,%r9d - je L$cbc_dec_loop - jmp L$cbc_enc_loop -.p2align 4 -L$cbc_enc_loop: - movdqu (%rdi),%xmm0 - pxor %xmm6,%xmm0 - call _vpaes_encrypt_core - movdqa %xmm0,%xmm6 - movdqu %xmm0,(%rsi,%rdi,1) - leaq 16(%rdi),%rdi - subq $16,%rcx - jnc L$cbc_enc_loop - jmp L$cbc_done -.p2align 4 -L$cbc_dec_loop: - movdqu (%rdi),%xmm0 - movdqa %xmm0,%xmm7 - call _vpaes_decrypt_core - pxor %xmm6,%xmm0 - movdqa %xmm7,%xmm6 - movdqu %xmm0,(%rsi,%rdi,1) - leaq 16(%rdi),%rdi - subq $16,%rcx - jnc L$cbc_dec_loop -L$cbc_done: - movdqu %xmm6,(%r8) -L$cbc_abort: - .byte 0xf3,0xc3 - - -.globl _vpaes_ctr32_encrypt_blocks -.private_extern _vpaes_ctr32_encrypt_blocks - -.p2align 4 -_vpaes_ctr32_encrypt_blocks: - - - xchgq %rcx,%rdx - testq %rcx,%rcx 
- jz L$ctr32_abort - movdqu (%r8),%xmm0 - movdqa L$ctr_add_one(%rip),%xmm8 - subq %rdi,%rsi - call _vpaes_preheat - movdqa %xmm0,%xmm6 - pshufb L$rev_ctr(%rip),%xmm6 - - testq $1,%rcx - jz L$ctr32_prep_loop - - - - movdqu (%rdi),%xmm7 - call _vpaes_encrypt_core - pxor %xmm7,%xmm0 - paddd %xmm8,%xmm6 - movdqu %xmm0,(%rsi,%rdi,1) - subq $1,%rcx - leaq 16(%rdi),%rdi - jz L$ctr32_done - -L$ctr32_prep_loop: - - - movdqa %xmm6,%xmm14 - movdqa %xmm6,%xmm15 - paddd %xmm8,%xmm15 - -L$ctr32_loop: - movdqa L$rev_ctr(%rip),%xmm1 - movdqa %xmm14,%xmm0 - movdqa %xmm15,%xmm6 -.byte 102,15,56,0,193 -.byte 102,15,56,0,241 - call _vpaes_encrypt_core_2x - movdqu (%rdi),%xmm1 - movdqu 16(%rdi),%xmm2 - movdqa L$ctr_add_two(%rip),%xmm3 - pxor %xmm1,%xmm0 - pxor %xmm2,%xmm6 - paddd %xmm3,%xmm14 - paddd %xmm3,%xmm15 - movdqu %xmm0,(%rsi,%rdi,1) - movdqu %xmm6,16(%rsi,%rdi,1) - subq $2,%rcx - leaq 32(%rdi),%rdi - jnz L$ctr32_loop - -L$ctr32_done: -L$ctr32_abort: - .byte 0xf3,0xc3 - - - - - - - - - -.p2align 4 -_vpaes_preheat: - - leaq L$k_s0F(%rip),%r10 - movdqa -32(%r10),%xmm10 - movdqa -16(%r10),%xmm11 - movdqa 0(%r10),%xmm9 - movdqa 48(%r10),%xmm13 - movdqa 64(%r10),%xmm12 - movdqa 80(%r10),%xmm15 - movdqa 96(%r10),%xmm14 - .byte 0xf3,0xc3 - - - - - - - - -.p2align 6 -_vpaes_consts: -L$k_inv: -.quad 0x0E05060F0D080180, 0x040703090A0B0C02 -.quad 0x01040A060F0B0780, 0x030D0E0C02050809 - -L$k_s0F: -.quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F - -L$k_ipt: -.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 -.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 - -L$k_sb1: -.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 -.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF -L$k_sb2: -.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD -.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A -L$k_sbo: -.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 -.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA - -L$k_mc_forward: -.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 -.quad 0x080B0A0904070605, 0x000302010C0F0E0D -.quad 0x0C0F0E0D080B0A09, 
0x0407060500030201 -.quad 0x000302010C0F0E0D, 0x080B0A0904070605 - -L$k_mc_backward: -.quad 0x0605040702010003, 0x0E0D0C0F0A09080B -.quad 0x020100030E0D0C0F, 0x0A09080B06050407 -.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 -.quad 0x0A09080B06050407, 0x020100030E0D0C0F - -L$k_sr: -.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 -.quad 0x030E09040F0A0500, 0x0B06010C07020D08 -.quad 0x0F060D040B020900, 0x070E050C030A0108 -.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 - -L$k_rcon: -.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 - -L$k_s63: -.quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B - -L$k_opt: -.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 -.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 - -L$k_deskew: -.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A -.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 - - - - - -L$k_dksd: -.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 -.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E -L$k_dksb: -.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 -.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 -L$k_dkse: -.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 -.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 -L$k_dks9: -.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC -.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE - - - - - -L$k_dipt: -.quad 0x0F505B040B545F00, 0x154A411E114E451A -.quad 0x86E383E660056500, 0x12771772F491F194 - -L$k_dsb9: -.quad 0x851C03539A86D600, 0xCAD51F504F994CC9 -.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 -L$k_dsbd: -.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 -.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 -L$k_dsbb: -.quad 0xD022649296B44200, 0x602646F6B0F2D404 -.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B -L$k_dsbe: -.quad 0x46F2929626D4D000, 0x2242600464B4F6B0 -.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 -L$k_dsbo: -.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D -.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C - - -L$rev_ctr: -.quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 - - -L$ctr_add_one: -.quad 0x0000000000000000, 0x0000000100000000 -L$ctr_add_two: -.quad 
0x0000000000000000, 0x0000000200000000 - -.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 -.p2align 6 - -#endif diff --git a/third_party/boringssl/apple-x86_64/crypto/fipsmodule/x86_64-mont.S b/third_party/boringssl/apple-x86_64/crypto/fipsmodule/x86_64-mont.S deleted file mode 100644 index d354b2d4..00000000 --- a/third_party/boringssl/apple-x86_64/crypto/fipsmodule/x86_64-mont.S +++ /dev/null @@ -1,1256 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text - - - -.globl _bn_mul_mont -.private_extern _bn_mul_mont - -.p2align 4 -_bn_mul_mont: - - movl %r9d,%r9d - movq %rsp,%rax - - testl $3,%r9d - jnz L$mul_enter - cmpl $8,%r9d - jb L$mul_enter - leaq _OPENSSL_ia32cap_P(%rip),%r11 - movl 8(%r11),%r11d - cmpq %rsi,%rdx - jne L$mul4x_enter - testl $7,%r9d - jz L$sqr8x_enter - jmp L$mul4x_enter - -.p2align 4 -L$mul_enter: - pushq %rbx - - pushq %rbp - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - - negq %r9 - movq %rsp,%r11 - leaq -16(%rsp,%r9,8),%r10 - negq %r9 - andq $-1024,%r10 - - - - - - - - - - subq %r10,%r11 - andq $-4096,%r11 - leaq (%r10,%r11,1),%rsp - movq (%rsp),%r11 - cmpq %r10,%rsp - ja L$mul_page_walk - jmp L$mul_page_walk_done - -.p2align 4 -L$mul_page_walk: - leaq -4096(%rsp),%rsp - movq (%rsp),%r11 - cmpq %r10,%rsp - ja L$mul_page_walk -L$mul_page_walk_done: - - movq %rax,8(%rsp,%r9,8) - -L$mul_body: - movq %rdx,%r12 - movq (%r8),%r8 - movq (%r12),%rbx - movq (%rsi),%rax - - xorq %r14,%r14 - xorq 
%r15,%r15 - - movq %r8,%rbp - mulq %rbx - movq %rax,%r10 - movq (%rcx),%rax - - imulq %r10,%rbp - movq %rdx,%r11 - - mulq %rbp - addq %rax,%r10 - movq 8(%rsi),%rax - adcq $0,%rdx - movq %rdx,%r13 - - leaq 1(%r15),%r15 - jmp L$1st_enter - -.p2align 4 -L$1st: - addq %rax,%r13 - movq (%rsi,%r15,8),%rax - adcq $0,%rdx - addq %r11,%r13 - movq %r10,%r11 - adcq $0,%rdx - movq %r13,-16(%rsp,%r15,8) - movq %rdx,%r13 - -L$1st_enter: - mulq %rbx - addq %rax,%r11 - movq (%rcx,%r15,8),%rax - adcq $0,%rdx - leaq 1(%r15),%r15 - movq %rdx,%r10 - - mulq %rbp - cmpq %r9,%r15 - jne L$1st - - addq %rax,%r13 - movq (%rsi),%rax - adcq $0,%rdx - addq %r11,%r13 - adcq $0,%rdx - movq %r13,-16(%rsp,%r15,8) - movq %rdx,%r13 - movq %r10,%r11 - - xorq %rdx,%rdx - addq %r11,%r13 - adcq $0,%rdx - movq %r13,-8(%rsp,%r9,8) - movq %rdx,(%rsp,%r9,8) - - leaq 1(%r14),%r14 - jmp L$outer -.p2align 4 -L$outer: - movq (%r12,%r14,8),%rbx - xorq %r15,%r15 - movq %r8,%rbp - movq (%rsp),%r10 - mulq %rbx - addq %rax,%r10 - movq (%rcx),%rax - adcq $0,%rdx - - imulq %r10,%rbp - movq %rdx,%r11 - - mulq %rbp - addq %rax,%r10 - movq 8(%rsi),%rax - adcq $0,%rdx - movq 8(%rsp),%r10 - movq %rdx,%r13 - - leaq 1(%r15),%r15 - jmp L$inner_enter - -.p2align 4 -L$inner: - addq %rax,%r13 - movq (%rsi,%r15,8),%rax - adcq $0,%rdx - addq %r10,%r13 - movq (%rsp,%r15,8),%r10 - adcq $0,%rdx - movq %r13,-16(%rsp,%r15,8) - movq %rdx,%r13 - -L$inner_enter: - mulq %rbx - addq %rax,%r11 - movq (%rcx,%r15,8),%rax - adcq $0,%rdx - addq %r11,%r10 - movq %rdx,%r11 - adcq $0,%r11 - leaq 1(%r15),%r15 - - mulq %rbp - cmpq %r9,%r15 - jne L$inner - - addq %rax,%r13 - movq (%rsi),%rax - adcq $0,%rdx - addq %r10,%r13 - movq (%rsp,%r15,8),%r10 - adcq $0,%rdx - movq %r13,-16(%rsp,%r15,8) - movq %rdx,%r13 - - xorq %rdx,%rdx - addq %r11,%r13 - adcq $0,%rdx - addq %r10,%r13 - adcq $0,%rdx - movq %r13,-8(%rsp,%r9,8) - movq %rdx,(%rsp,%r9,8) - - leaq 1(%r14),%r14 - cmpq %r9,%r14 - jb L$outer - - xorq %r14,%r14 - movq (%rsp),%rax - movq %r9,%r15 - 
-.p2align 4 -L$sub: sbbq (%rcx,%r14,8),%rax - movq %rax,(%rdi,%r14,8) - movq 8(%rsp,%r14,8),%rax - leaq 1(%r14),%r14 - decq %r15 - jnz L$sub - - sbbq $0,%rax - movq $-1,%rbx - xorq %rax,%rbx - xorq %r14,%r14 - movq %r9,%r15 - -L$copy: - movq (%rdi,%r14,8),%rcx - movq (%rsp,%r14,8),%rdx - andq %rbx,%rcx - andq %rax,%rdx - movq %r9,(%rsp,%r14,8) - orq %rcx,%rdx - movq %rdx,(%rdi,%r14,8) - leaq 1(%r14),%r14 - subq $1,%r15 - jnz L$copy - - movq 8(%rsp,%r9,8),%rsi - - movq $1,%rax - movq -48(%rsi),%r15 - - movq -40(%rsi),%r14 - - movq -32(%rsi),%r13 - - movq -24(%rsi),%r12 - - movq -16(%rsi),%rbp - - movq -8(%rsi),%rbx - - leaq (%rsi),%rsp - -L$mul_epilogue: - .byte 0xf3,0xc3 - - - -.p2align 4 -bn_mul4x_mont: - - movl %r9d,%r9d - movq %rsp,%rax - -L$mul4x_enter: - andl $0x80100,%r11d - cmpl $0x80100,%r11d - je L$mulx4x_enter - pushq %rbx - - pushq %rbp - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - - negq %r9 - movq %rsp,%r11 - leaq -32(%rsp,%r9,8),%r10 - negq %r9 - andq $-1024,%r10 - - subq %r10,%r11 - andq $-4096,%r11 - leaq (%r10,%r11,1),%rsp - movq (%rsp),%r11 - cmpq %r10,%rsp - ja L$mul4x_page_walk - jmp L$mul4x_page_walk_done - -L$mul4x_page_walk: - leaq -4096(%rsp),%rsp - movq (%rsp),%r11 - cmpq %r10,%rsp - ja L$mul4x_page_walk -L$mul4x_page_walk_done: - - movq %rax,8(%rsp,%r9,8) - -L$mul4x_body: - movq %rdi,16(%rsp,%r9,8) - movq %rdx,%r12 - movq (%r8),%r8 - movq (%r12),%rbx - movq (%rsi),%rax - - xorq %r14,%r14 - xorq %r15,%r15 - - movq %r8,%rbp - mulq %rbx - movq %rax,%r10 - movq (%rcx),%rax - - imulq %r10,%rbp - movq %rdx,%r11 - - mulq %rbp - addq %rax,%r10 - movq 8(%rsi),%rax - adcq $0,%rdx - movq %rdx,%rdi - - mulq %rbx - addq %rax,%r11 - movq 8(%rcx),%rax - adcq $0,%rdx - movq %rdx,%r10 - - mulq %rbp - addq %rax,%rdi - movq 16(%rsi),%rax - adcq $0,%rdx - addq %r11,%rdi - leaq 4(%r15),%r15 - adcq $0,%rdx - movq %rdi,(%rsp) - movq %rdx,%r13 - jmp L$1st4x -.p2align 4 -L$1st4x: - mulq %rbx - addq %rax,%r10 - movq -16(%rcx,%r15,8),%rax - adcq 
$0,%rdx - movq %rdx,%r11 - - mulq %rbp - addq %rax,%r13 - movq -8(%rsi,%r15,8),%rax - adcq $0,%rdx - addq %r10,%r13 - adcq $0,%rdx - movq %r13,-24(%rsp,%r15,8) - movq %rdx,%rdi - - mulq %rbx - addq %rax,%r11 - movq -8(%rcx,%r15,8),%rax - adcq $0,%rdx - movq %rdx,%r10 - - mulq %rbp - addq %rax,%rdi - movq (%rsi,%r15,8),%rax - adcq $0,%rdx - addq %r11,%rdi - adcq $0,%rdx - movq %rdi,-16(%rsp,%r15,8) - movq %rdx,%r13 - - mulq %rbx - addq %rax,%r10 - movq (%rcx,%r15,8),%rax - adcq $0,%rdx - movq %rdx,%r11 - - mulq %rbp - addq %rax,%r13 - movq 8(%rsi,%r15,8),%rax - adcq $0,%rdx - addq %r10,%r13 - adcq $0,%rdx - movq %r13,-8(%rsp,%r15,8) - movq %rdx,%rdi - - mulq %rbx - addq %rax,%r11 - movq 8(%rcx,%r15,8),%rax - adcq $0,%rdx - leaq 4(%r15),%r15 - movq %rdx,%r10 - - mulq %rbp - addq %rax,%rdi - movq -16(%rsi,%r15,8),%rax - adcq $0,%rdx - addq %r11,%rdi - adcq $0,%rdx - movq %rdi,-32(%rsp,%r15,8) - movq %rdx,%r13 - cmpq %r9,%r15 - jb L$1st4x - - mulq %rbx - addq %rax,%r10 - movq -16(%rcx,%r15,8),%rax - adcq $0,%rdx - movq %rdx,%r11 - - mulq %rbp - addq %rax,%r13 - movq -8(%rsi,%r15,8),%rax - adcq $0,%rdx - addq %r10,%r13 - adcq $0,%rdx - movq %r13,-24(%rsp,%r15,8) - movq %rdx,%rdi - - mulq %rbx - addq %rax,%r11 - movq -8(%rcx,%r15,8),%rax - adcq $0,%rdx - movq %rdx,%r10 - - mulq %rbp - addq %rax,%rdi - movq (%rsi),%rax - adcq $0,%rdx - addq %r11,%rdi - adcq $0,%rdx - movq %rdi,-16(%rsp,%r15,8) - movq %rdx,%r13 - - xorq %rdi,%rdi - addq %r10,%r13 - adcq $0,%rdi - movq %r13,-8(%rsp,%r15,8) - movq %rdi,(%rsp,%r15,8) - - leaq 1(%r14),%r14 -.p2align 2 -L$outer4x: - movq (%r12,%r14,8),%rbx - xorq %r15,%r15 - movq (%rsp),%r10 - movq %r8,%rbp - mulq %rbx - addq %rax,%r10 - movq (%rcx),%rax - adcq $0,%rdx - - imulq %r10,%rbp - movq %rdx,%r11 - - mulq %rbp - addq %rax,%r10 - movq 8(%rsi),%rax - adcq $0,%rdx - movq %rdx,%rdi - - mulq %rbx - addq %rax,%r11 - movq 8(%rcx),%rax - adcq $0,%rdx - addq 8(%rsp),%r11 - adcq $0,%rdx - movq %rdx,%r10 - - mulq %rbp - addq %rax,%rdi - movq 
16(%rsi),%rax - adcq $0,%rdx - addq %r11,%rdi - leaq 4(%r15),%r15 - adcq $0,%rdx - movq %rdi,(%rsp) - movq %rdx,%r13 - jmp L$inner4x -.p2align 4 -L$inner4x: - mulq %rbx - addq %rax,%r10 - movq -16(%rcx,%r15,8),%rax - adcq $0,%rdx - addq -16(%rsp,%r15,8),%r10 - adcq $0,%rdx - movq %rdx,%r11 - - mulq %rbp - addq %rax,%r13 - movq -8(%rsi,%r15,8),%rax - adcq $0,%rdx - addq %r10,%r13 - adcq $0,%rdx - movq %r13,-24(%rsp,%r15,8) - movq %rdx,%rdi - - mulq %rbx - addq %rax,%r11 - movq -8(%rcx,%r15,8),%rax - adcq $0,%rdx - addq -8(%rsp,%r15,8),%r11 - adcq $0,%rdx - movq %rdx,%r10 - - mulq %rbp - addq %rax,%rdi - movq (%rsi,%r15,8),%rax - adcq $0,%rdx - addq %r11,%rdi - adcq $0,%rdx - movq %rdi,-16(%rsp,%r15,8) - movq %rdx,%r13 - - mulq %rbx - addq %rax,%r10 - movq (%rcx,%r15,8),%rax - adcq $0,%rdx - addq (%rsp,%r15,8),%r10 - adcq $0,%rdx - movq %rdx,%r11 - - mulq %rbp - addq %rax,%r13 - movq 8(%rsi,%r15,8),%rax - adcq $0,%rdx - addq %r10,%r13 - adcq $0,%rdx - movq %r13,-8(%rsp,%r15,8) - movq %rdx,%rdi - - mulq %rbx - addq %rax,%r11 - movq 8(%rcx,%r15,8),%rax - adcq $0,%rdx - addq 8(%rsp,%r15,8),%r11 - adcq $0,%rdx - leaq 4(%r15),%r15 - movq %rdx,%r10 - - mulq %rbp - addq %rax,%rdi - movq -16(%rsi,%r15,8),%rax - adcq $0,%rdx - addq %r11,%rdi - adcq $0,%rdx - movq %rdi,-32(%rsp,%r15,8) - movq %rdx,%r13 - cmpq %r9,%r15 - jb L$inner4x - - mulq %rbx - addq %rax,%r10 - movq -16(%rcx,%r15,8),%rax - adcq $0,%rdx - addq -16(%rsp,%r15,8),%r10 - adcq $0,%rdx - movq %rdx,%r11 - - mulq %rbp - addq %rax,%r13 - movq -8(%rsi,%r15,8),%rax - adcq $0,%rdx - addq %r10,%r13 - adcq $0,%rdx - movq %r13,-24(%rsp,%r15,8) - movq %rdx,%rdi - - mulq %rbx - addq %rax,%r11 - movq -8(%rcx,%r15,8),%rax - adcq $0,%rdx - addq -8(%rsp,%r15,8),%r11 - adcq $0,%rdx - leaq 1(%r14),%r14 - movq %rdx,%r10 - - mulq %rbp - addq %rax,%rdi - movq (%rsi),%rax - adcq $0,%rdx - addq %r11,%rdi - adcq $0,%rdx - movq %rdi,-16(%rsp,%r15,8) - movq %rdx,%r13 - - xorq %rdi,%rdi - addq %r10,%r13 - adcq $0,%rdi - addq 
(%rsp,%r9,8),%r13 - adcq $0,%rdi - movq %r13,-8(%rsp,%r15,8) - movq %rdi,(%rsp,%r15,8) - - cmpq %r9,%r14 - jb L$outer4x - movq 16(%rsp,%r9,8),%rdi - leaq -4(%r9),%r15 - movq 0(%rsp),%rax - movq 8(%rsp),%rdx - shrq $2,%r15 - leaq (%rsp),%rsi - xorq %r14,%r14 - - subq 0(%rcx),%rax - movq 16(%rsi),%rbx - movq 24(%rsi),%rbp - sbbq 8(%rcx),%rdx - -L$sub4x: - movq %rax,0(%rdi,%r14,8) - movq %rdx,8(%rdi,%r14,8) - sbbq 16(%rcx,%r14,8),%rbx - movq 32(%rsi,%r14,8),%rax - movq 40(%rsi,%r14,8),%rdx - sbbq 24(%rcx,%r14,8),%rbp - movq %rbx,16(%rdi,%r14,8) - movq %rbp,24(%rdi,%r14,8) - sbbq 32(%rcx,%r14,8),%rax - movq 48(%rsi,%r14,8),%rbx - movq 56(%rsi,%r14,8),%rbp - sbbq 40(%rcx,%r14,8),%rdx - leaq 4(%r14),%r14 - decq %r15 - jnz L$sub4x - - movq %rax,0(%rdi,%r14,8) - movq 32(%rsi,%r14,8),%rax - sbbq 16(%rcx,%r14,8),%rbx - movq %rdx,8(%rdi,%r14,8) - sbbq 24(%rcx,%r14,8),%rbp - movq %rbx,16(%rdi,%r14,8) - - sbbq $0,%rax - movq %rbp,24(%rdi,%r14,8) - pxor %xmm0,%xmm0 -.byte 102,72,15,110,224 - pcmpeqd %xmm5,%xmm5 - pshufd $0,%xmm4,%xmm4 - movq %r9,%r15 - pxor %xmm4,%xmm5 - shrq $2,%r15 - xorl %eax,%eax - - jmp L$copy4x -.p2align 4 -L$copy4x: - movdqa (%rsp,%rax,1),%xmm1 - movdqu (%rdi,%rax,1),%xmm2 - pand %xmm4,%xmm1 - pand %xmm5,%xmm2 - movdqa 16(%rsp,%rax,1),%xmm3 - movdqa %xmm0,(%rsp,%rax,1) - por %xmm2,%xmm1 - movdqu 16(%rdi,%rax,1),%xmm2 - movdqu %xmm1,(%rdi,%rax,1) - pand %xmm4,%xmm3 - pand %xmm5,%xmm2 - movdqa %xmm0,16(%rsp,%rax,1) - por %xmm2,%xmm3 - movdqu %xmm3,16(%rdi,%rax,1) - leaq 32(%rax),%rax - decq %r15 - jnz L$copy4x - movq 8(%rsp,%r9,8),%rsi - - movq $1,%rax - movq -48(%rsi),%r15 - - movq -40(%rsi),%r14 - - movq -32(%rsi),%r13 - - movq -24(%rsi),%r12 - - movq -16(%rsi),%rbp - - movq -8(%rsi),%rbx - - leaq (%rsi),%rsp - -L$mul4x_epilogue: - .byte 0xf3,0xc3 - - - - - - -.p2align 5 -bn_sqr8x_mont: - - movq %rsp,%rax - -L$sqr8x_enter: - pushq %rbx - - pushq %rbp - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - -L$sqr8x_prologue: - - movl %r9d,%r10d - 
shll $3,%r9d - shlq $3+2,%r10 - negq %r9 - - - - - - - leaq -64(%rsp,%r9,2),%r11 - movq %rsp,%rbp - movq (%r8),%r8 - subq %rsi,%r11 - andq $4095,%r11 - cmpq %r11,%r10 - jb L$sqr8x_sp_alt - subq %r11,%rbp - leaq -64(%rbp,%r9,2),%rbp - jmp L$sqr8x_sp_done - -.p2align 5 -L$sqr8x_sp_alt: - leaq 4096-64(,%r9,2),%r10 - leaq -64(%rbp,%r9,2),%rbp - subq %r10,%r11 - movq $0,%r10 - cmovcq %r10,%r11 - subq %r11,%rbp -L$sqr8x_sp_done: - andq $-64,%rbp - movq %rsp,%r11 - subq %rbp,%r11 - andq $-4096,%r11 - leaq (%r11,%rbp,1),%rsp - movq (%rsp),%r10 - cmpq %rbp,%rsp - ja L$sqr8x_page_walk - jmp L$sqr8x_page_walk_done - -.p2align 4 -L$sqr8x_page_walk: - leaq -4096(%rsp),%rsp - movq (%rsp),%r10 - cmpq %rbp,%rsp - ja L$sqr8x_page_walk -L$sqr8x_page_walk_done: - - movq %r9,%r10 - negq %r9 - - movq %r8,32(%rsp) - movq %rax,40(%rsp) - -L$sqr8x_body: - -.byte 102,72,15,110,209 - pxor %xmm0,%xmm0 -.byte 102,72,15,110,207 -.byte 102,73,15,110,218 - leaq _OPENSSL_ia32cap_P(%rip),%rax - movl 8(%rax),%eax - andl $0x80100,%eax - cmpl $0x80100,%eax - jne L$sqr8x_nox - - call _bn_sqrx8x_internal - - - - - leaq (%r8,%rcx,1),%rbx - movq %rcx,%r9 - movq %rcx,%rdx -.byte 102,72,15,126,207 - sarq $3+2,%rcx - jmp L$sqr8x_sub - -.p2align 5 -L$sqr8x_nox: - call _bn_sqr8x_internal - - - - - leaq (%rdi,%r9,1),%rbx - movq %r9,%rcx - movq %r9,%rdx -.byte 102,72,15,126,207 - sarq $3+2,%rcx - jmp L$sqr8x_sub - -.p2align 5 -L$sqr8x_sub: - movq 0(%rbx),%r12 - movq 8(%rbx),%r13 - movq 16(%rbx),%r14 - movq 24(%rbx),%r15 - leaq 32(%rbx),%rbx - sbbq 0(%rbp),%r12 - sbbq 8(%rbp),%r13 - sbbq 16(%rbp),%r14 - sbbq 24(%rbp),%r15 - leaq 32(%rbp),%rbp - movq %r12,0(%rdi) - movq %r13,8(%rdi) - movq %r14,16(%rdi) - movq %r15,24(%rdi) - leaq 32(%rdi),%rdi - incq %rcx - jnz L$sqr8x_sub - - sbbq $0,%rax - leaq (%rbx,%r9,1),%rbx - leaq (%rdi,%r9,1),%rdi - -.byte 102,72,15,110,200 - pxor %xmm0,%xmm0 - pshufd $0,%xmm1,%xmm1 - movq 40(%rsp),%rsi - - jmp L$sqr8x_cond_copy - -.p2align 5 -L$sqr8x_cond_copy: - movdqa 0(%rbx),%xmm2 - 
movdqa 16(%rbx),%xmm3 - leaq 32(%rbx),%rbx - movdqu 0(%rdi),%xmm4 - movdqu 16(%rdi),%xmm5 - leaq 32(%rdi),%rdi - movdqa %xmm0,-32(%rbx) - movdqa %xmm0,-16(%rbx) - movdqa %xmm0,-32(%rbx,%rdx,1) - movdqa %xmm0,-16(%rbx,%rdx,1) - pcmpeqd %xmm1,%xmm0 - pand %xmm1,%xmm2 - pand %xmm1,%xmm3 - pand %xmm0,%xmm4 - pand %xmm0,%xmm5 - pxor %xmm0,%xmm0 - por %xmm2,%xmm4 - por %xmm3,%xmm5 - movdqu %xmm4,-32(%rdi) - movdqu %xmm5,-16(%rdi) - addq $32,%r9 - jnz L$sqr8x_cond_copy - - movq $1,%rax - movq -48(%rsi),%r15 - - movq -40(%rsi),%r14 - - movq -32(%rsi),%r13 - - movq -24(%rsi),%r12 - - movq -16(%rsi),%rbp - - movq -8(%rsi),%rbx - - leaq (%rsi),%rsp - -L$sqr8x_epilogue: - .byte 0xf3,0xc3 - - - -.p2align 5 -bn_mulx4x_mont: - - movq %rsp,%rax - -L$mulx4x_enter: - pushq %rbx - - pushq %rbp - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - -L$mulx4x_prologue: - - shll $3,%r9d - xorq %r10,%r10 - subq %r9,%r10 - movq (%r8),%r8 - leaq -72(%rsp,%r10,1),%rbp - andq $-128,%rbp - movq %rsp,%r11 - subq %rbp,%r11 - andq $-4096,%r11 - leaq (%r11,%rbp,1),%rsp - movq (%rsp),%r10 - cmpq %rbp,%rsp - ja L$mulx4x_page_walk - jmp L$mulx4x_page_walk_done - -.p2align 4 -L$mulx4x_page_walk: - leaq -4096(%rsp),%rsp - movq (%rsp),%r10 - cmpq %rbp,%rsp - ja L$mulx4x_page_walk -L$mulx4x_page_walk_done: - - leaq (%rdx,%r9,1),%r10 - - - - - - - - - - - - - movq %r9,0(%rsp) - shrq $5,%r9 - movq %r10,16(%rsp) - subq $1,%r9 - movq %r8,24(%rsp) - movq %rdi,32(%rsp) - movq %rax,40(%rsp) - - movq %r9,48(%rsp) - jmp L$mulx4x_body - -.p2align 5 -L$mulx4x_body: - leaq 8(%rdx),%rdi - movq (%rdx),%rdx - leaq 64+32(%rsp),%rbx - movq %rdx,%r9 - - mulxq 0(%rsi),%r8,%rax - mulxq 8(%rsi),%r11,%r14 - addq %rax,%r11 - movq %rdi,8(%rsp) - mulxq 16(%rsi),%r12,%r13 - adcq %r14,%r12 - adcq $0,%r13 - - movq %r8,%rdi - imulq 24(%rsp),%r8 - xorq %rbp,%rbp - - mulxq 24(%rsi),%rax,%r14 - movq %r8,%rdx - leaq 32(%rsi),%rsi - adcxq %rax,%r13 - adcxq %rbp,%r14 - - mulxq 0(%rcx),%rax,%r10 - adcxq %rax,%rdi - adoxq %r11,%r10 
- mulxq 8(%rcx),%rax,%r11 - adcxq %rax,%r10 - adoxq %r12,%r11 -.byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 - movq 48(%rsp),%rdi - movq %r10,-32(%rbx) - adcxq %rax,%r11 - adoxq %r13,%r12 - mulxq 24(%rcx),%rax,%r15 - movq %r9,%rdx - movq %r11,-24(%rbx) - adcxq %rax,%r12 - adoxq %rbp,%r15 - leaq 32(%rcx),%rcx - movq %r12,-16(%rbx) - - jmp L$mulx4x_1st - -.p2align 5 -L$mulx4x_1st: - adcxq %rbp,%r15 - mulxq 0(%rsi),%r10,%rax - adcxq %r14,%r10 - mulxq 8(%rsi),%r11,%r14 - adcxq %rax,%r11 - mulxq 16(%rsi),%r12,%rax - adcxq %r14,%r12 - mulxq 24(%rsi),%r13,%r14 -.byte 0x67,0x67 - movq %r8,%rdx - adcxq %rax,%r13 - adcxq %rbp,%r14 - leaq 32(%rsi),%rsi - leaq 32(%rbx),%rbx - - adoxq %r15,%r10 - mulxq 0(%rcx),%rax,%r15 - adcxq %rax,%r10 - adoxq %r15,%r11 - mulxq 8(%rcx),%rax,%r15 - adcxq %rax,%r11 - adoxq %r15,%r12 - mulxq 16(%rcx),%rax,%r15 - movq %r10,-40(%rbx) - adcxq %rax,%r12 - movq %r11,-32(%rbx) - adoxq %r15,%r13 - mulxq 24(%rcx),%rax,%r15 - movq %r9,%rdx - movq %r12,-24(%rbx) - adcxq %rax,%r13 - adoxq %rbp,%r15 - leaq 32(%rcx),%rcx - movq %r13,-16(%rbx) - - decq %rdi - jnz L$mulx4x_1st - - movq 0(%rsp),%rax - movq 8(%rsp),%rdi - adcq %rbp,%r15 - addq %r15,%r14 - sbbq %r15,%r15 - movq %r14,-8(%rbx) - jmp L$mulx4x_outer - -.p2align 5 -L$mulx4x_outer: - movq (%rdi),%rdx - leaq 8(%rdi),%rdi - subq %rax,%rsi - movq %r15,(%rbx) - leaq 64+32(%rsp),%rbx - subq %rax,%rcx - - mulxq 0(%rsi),%r8,%r11 - xorl %ebp,%ebp - movq %rdx,%r9 - mulxq 8(%rsi),%r14,%r12 - adoxq -32(%rbx),%r8 - adcxq %r14,%r11 - mulxq 16(%rsi),%r15,%r13 - adoxq -24(%rbx),%r11 - adcxq %r15,%r12 - adoxq -16(%rbx),%r12 - adcxq %rbp,%r13 - adoxq %rbp,%r13 - - movq %rdi,8(%rsp) - movq %r8,%r15 - imulq 24(%rsp),%r8 - xorl %ebp,%ebp - - mulxq 24(%rsi),%rax,%r14 - movq %r8,%rdx - adcxq %rax,%r13 - adoxq -8(%rbx),%r13 - adcxq %rbp,%r14 - leaq 32(%rsi),%rsi - adoxq %rbp,%r14 - - mulxq 0(%rcx),%rax,%r10 - adcxq %rax,%r15 - adoxq %r11,%r10 - mulxq 8(%rcx),%rax,%r11 - adcxq %rax,%r10 - adoxq %r12,%r11 - mulxq 
16(%rcx),%rax,%r12 - movq %r10,-32(%rbx) - adcxq %rax,%r11 - adoxq %r13,%r12 - mulxq 24(%rcx),%rax,%r15 - movq %r9,%rdx - movq %r11,-24(%rbx) - leaq 32(%rcx),%rcx - adcxq %rax,%r12 - adoxq %rbp,%r15 - movq 48(%rsp),%rdi - movq %r12,-16(%rbx) - - jmp L$mulx4x_inner - -.p2align 5 -L$mulx4x_inner: - mulxq 0(%rsi),%r10,%rax - adcxq %rbp,%r15 - adoxq %r14,%r10 - mulxq 8(%rsi),%r11,%r14 - adcxq 0(%rbx),%r10 - adoxq %rax,%r11 - mulxq 16(%rsi),%r12,%rax - adcxq 8(%rbx),%r11 - adoxq %r14,%r12 - mulxq 24(%rsi),%r13,%r14 - movq %r8,%rdx - adcxq 16(%rbx),%r12 - adoxq %rax,%r13 - adcxq 24(%rbx),%r13 - adoxq %rbp,%r14 - leaq 32(%rsi),%rsi - leaq 32(%rbx),%rbx - adcxq %rbp,%r14 - - adoxq %r15,%r10 - mulxq 0(%rcx),%rax,%r15 - adcxq %rax,%r10 - adoxq %r15,%r11 - mulxq 8(%rcx),%rax,%r15 - adcxq %rax,%r11 - adoxq %r15,%r12 - mulxq 16(%rcx),%rax,%r15 - movq %r10,-40(%rbx) - adcxq %rax,%r12 - adoxq %r15,%r13 - mulxq 24(%rcx),%rax,%r15 - movq %r9,%rdx - movq %r11,-32(%rbx) - movq %r12,-24(%rbx) - adcxq %rax,%r13 - adoxq %rbp,%r15 - leaq 32(%rcx),%rcx - movq %r13,-16(%rbx) - - decq %rdi - jnz L$mulx4x_inner - - movq 0(%rsp),%rax - movq 8(%rsp),%rdi - adcq %rbp,%r15 - subq 0(%rbx),%rbp - adcq %r15,%r14 - sbbq %r15,%r15 - movq %r14,-8(%rbx) - - cmpq 16(%rsp),%rdi - jne L$mulx4x_outer - - leaq 64(%rsp),%rbx - subq %rax,%rcx - negq %r15 - movq %rax,%rdx - shrq $3+2,%rax - movq 32(%rsp),%rdi - jmp L$mulx4x_sub - -.p2align 5 -L$mulx4x_sub: - movq 0(%rbx),%r11 - movq 8(%rbx),%r12 - movq 16(%rbx),%r13 - movq 24(%rbx),%r14 - leaq 32(%rbx),%rbx - sbbq 0(%rcx),%r11 - sbbq 8(%rcx),%r12 - sbbq 16(%rcx),%r13 - sbbq 24(%rcx),%r14 - leaq 32(%rcx),%rcx - movq %r11,0(%rdi) - movq %r12,8(%rdi) - movq %r13,16(%rdi) - movq %r14,24(%rdi) - leaq 32(%rdi),%rdi - decq %rax - jnz L$mulx4x_sub - - sbbq $0,%r15 - leaq 64(%rsp),%rbx - subq %rdx,%rdi - -.byte 102,73,15,110,207 - pxor %xmm0,%xmm0 - pshufd $0,%xmm1,%xmm1 - movq 40(%rsp),%rsi - - jmp L$mulx4x_cond_copy - -.p2align 5 -L$mulx4x_cond_copy: - movdqa 
0(%rbx),%xmm2 - movdqa 16(%rbx),%xmm3 - leaq 32(%rbx),%rbx - movdqu 0(%rdi),%xmm4 - movdqu 16(%rdi),%xmm5 - leaq 32(%rdi),%rdi - movdqa %xmm0,-32(%rbx) - movdqa %xmm0,-16(%rbx) - pcmpeqd %xmm1,%xmm0 - pand %xmm1,%xmm2 - pand %xmm1,%xmm3 - pand %xmm0,%xmm4 - pand %xmm0,%xmm5 - pxor %xmm0,%xmm0 - por %xmm2,%xmm4 - por %xmm3,%xmm5 - movdqu %xmm4,-32(%rdi) - movdqu %xmm5,-16(%rdi) - subq $32,%rdx - jnz L$mulx4x_cond_copy - - movq %rdx,(%rbx) - - movq $1,%rax - movq -48(%rsi),%r15 - - movq -40(%rsi),%r14 - - movq -32(%rsi),%r13 - - movq -24(%rsi),%r12 - - movq -16(%rsi),%rbp - - movq -8(%rsi),%rbx - - leaq (%rsi),%rsp - -L$mulx4x_epilogue: - .byte 0xf3,0xc3 - - -.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.p2align 4 -#endif diff --git a/third_party/boringssl/apple-x86_64/crypto/fipsmodule/x86_64-mont5.S b/third_party/boringssl/apple-x86_64/crypto/fipsmodule/x86_64-mont5.S deleted file mode 100644 index 3a1768f5..00000000 --- a/third_party/boringssl/apple-x86_64/crypto/fipsmodule/x86_64-mont5.S +++ /dev/null @@ -1,3607 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text - - - -.globl _bn_mul_mont_gather5 -.private_extern _bn_mul_mont_gather5 - -.p2align 6 -_bn_mul_mont_gather5: - - movl %r9d,%r9d - movq %rsp,%rax - - testl $7,%r9d - jnz L$mul_enter - leaq _OPENSSL_ia32cap_P(%rip),%r11 - movl 8(%r11),%r11d - jmp L$mul4x_enter - -.p2align 4 -L$mul_enter: - movd 8(%rsp),%xmm5 - pushq %rbx - - pushq %rbp - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - - - negq %r9 - movq %rsp,%r11 - leaq -280(%rsp,%r9,8),%r10 - negq %r9 - andq $-1024,%r10 - - - - - - - - - - subq %r10,%r11 - andq $-4096,%r11 - leaq (%r10,%r11,1),%rsp - movq (%rsp),%r11 - cmpq %r10,%rsp - ja L$mul_page_walk - jmp L$mul_page_walk_done - -L$mul_page_walk: - leaq -4096(%rsp),%rsp - movq (%rsp),%r11 - cmpq %r10,%rsp - ja L$mul_page_walk -L$mul_page_walk_done: - - leaq L$inc(%rip),%r10 - movq %rax,8(%rsp,%r9,8) - -L$mul_body: - - leaq 128(%rdx),%r12 - movdqa 0(%r10),%xmm0 - movdqa 16(%r10),%xmm1 - leaq 24-112(%rsp,%r9,8),%r10 - andq $-16,%r10 - - pshufd $0,%xmm5,%xmm5 - movdqa %xmm1,%xmm4 - movdqa %xmm1,%xmm2 - paddd %xmm0,%xmm1 - pcmpeqd %xmm5,%xmm0 -.byte 0x67 - movdqa %xmm4,%xmm3 - paddd %xmm1,%xmm2 - pcmpeqd %xmm5,%xmm1 - movdqa %xmm0,112(%r10) - movdqa %xmm4,%xmm0 - - paddd %xmm2,%xmm3 - pcmpeqd %xmm5,%xmm2 - movdqa %xmm1,128(%r10) - movdqa %xmm4,%xmm1 - - paddd %xmm3,%xmm0 - pcmpeqd %xmm5,%xmm3 - movdqa %xmm2,144(%r10) - movdqa %xmm4,%xmm2 - - paddd %xmm0,%xmm1 - pcmpeqd %xmm5,%xmm0 - movdqa %xmm3,160(%r10) - movdqa %xmm4,%xmm3 - paddd %xmm1,%xmm2 - pcmpeqd %xmm5,%xmm1 - movdqa %xmm0,176(%r10) - movdqa %xmm4,%xmm0 - - paddd %xmm2,%xmm3 - pcmpeqd %xmm5,%xmm2 - movdqa %xmm1,192(%r10) - movdqa %xmm4,%xmm1 - - paddd %xmm3,%xmm0 - pcmpeqd %xmm5,%xmm3 - movdqa %xmm2,208(%r10) - movdqa %xmm4,%xmm2 - - paddd %xmm0,%xmm1 
- pcmpeqd %xmm5,%xmm0 - movdqa %xmm3,224(%r10) - movdqa %xmm4,%xmm3 - paddd %xmm1,%xmm2 - pcmpeqd %xmm5,%xmm1 - movdqa %xmm0,240(%r10) - movdqa %xmm4,%xmm0 - - paddd %xmm2,%xmm3 - pcmpeqd %xmm5,%xmm2 - movdqa %xmm1,256(%r10) - movdqa %xmm4,%xmm1 - - paddd %xmm3,%xmm0 - pcmpeqd %xmm5,%xmm3 - movdqa %xmm2,272(%r10) - movdqa %xmm4,%xmm2 - - paddd %xmm0,%xmm1 - pcmpeqd %xmm5,%xmm0 - movdqa %xmm3,288(%r10) - movdqa %xmm4,%xmm3 - paddd %xmm1,%xmm2 - pcmpeqd %xmm5,%xmm1 - movdqa %xmm0,304(%r10) - - paddd %xmm2,%xmm3 -.byte 0x67 - pcmpeqd %xmm5,%xmm2 - movdqa %xmm1,320(%r10) - - pcmpeqd %xmm5,%xmm3 - movdqa %xmm2,336(%r10) - pand 64(%r12),%xmm0 - - pand 80(%r12),%xmm1 - pand 96(%r12),%xmm2 - movdqa %xmm3,352(%r10) - pand 112(%r12),%xmm3 - por %xmm2,%xmm0 - por %xmm3,%xmm1 - movdqa -128(%r12),%xmm4 - movdqa -112(%r12),%xmm5 - movdqa -96(%r12),%xmm2 - pand 112(%r10),%xmm4 - movdqa -80(%r12),%xmm3 - pand 128(%r10),%xmm5 - por %xmm4,%xmm0 - pand 144(%r10),%xmm2 - por %xmm5,%xmm1 - pand 160(%r10),%xmm3 - por %xmm2,%xmm0 - por %xmm3,%xmm1 - movdqa -64(%r12),%xmm4 - movdqa -48(%r12),%xmm5 - movdqa -32(%r12),%xmm2 - pand 176(%r10),%xmm4 - movdqa -16(%r12),%xmm3 - pand 192(%r10),%xmm5 - por %xmm4,%xmm0 - pand 208(%r10),%xmm2 - por %xmm5,%xmm1 - pand 224(%r10),%xmm3 - por %xmm2,%xmm0 - por %xmm3,%xmm1 - movdqa 0(%r12),%xmm4 - movdqa 16(%r12),%xmm5 - movdqa 32(%r12),%xmm2 - pand 240(%r10),%xmm4 - movdqa 48(%r12),%xmm3 - pand 256(%r10),%xmm5 - por %xmm4,%xmm0 - pand 272(%r10),%xmm2 - por %xmm5,%xmm1 - pand 288(%r10),%xmm3 - por %xmm2,%xmm0 - por %xmm3,%xmm1 - por %xmm1,%xmm0 - pshufd $0x4e,%xmm0,%xmm1 - por %xmm1,%xmm0 - leaq 256(%r12),%r12 -.byte 102,72,15,126,195 - - movq (%r8),%r8 - movq (%rsi),%rax - - xorq %r14,%r14 - xorq %r15,%r15 - - movq %r8,%rbp - mulq %rbx - movq %rax,%r10 - movq (%rcx),%rax - - imulq %r10,%rbp - movq %rdx,%r11 - - mulq %rbp - addq %rax,%r10 - movq 8(%rsi),%rax - adcq $0,%rdx - movq %rdx,%r13 - - leaq 1(%r15),%r15 - jmp L$1st_enter - -.p2align 4 -L$1st: - 
addq %rax,%r13 - movq (%rsi,%r15,8),%rax - adcq $0,%rdx - addq %r11,%r13 - movq %r10,%r11 - adcq $0,%rdx - movq %r13,-16(%rsp,%r15,8) - movq %rdx,%r13 - -L$1st_enter: - mulq %rbx - addq %rax,%r11 - movq (%rcx,%r15,8),%rax - adcq $0,%rdx - leaq 1(%r15),%r15 - movq %rdx,%r10 - - mulq %rbp - cmpq %r9,%r15 - jne L$1st - - - addq %rax,%r13 - adcq $0,%rdx - addq %r11,%r13 - adcq $0,%rdx - movq %r13,-16(%rsp,%r9,8) - movq %rdx,%r13 - movq %r10,%r11 - - xorq %rdx,%rdx - addq %r11,%r13 - adcq $0,%rdx - movq %r13,-8(%rsp,%r9,8) - movq %rdx,(%rsp,%r9,8) - - leaq 1(%r14),%r14 - jmp L$outer -.p2align 4 -L$outer: - leaq 24+128(%rsp,%r9,8),%rdx - andq $-16,%rdx - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - movdqa -128(%r12),%xmm0 - movdqa -112(%r12),%xmm1 - movdqa -96(%r12),%xmm2 - movdqa -80(%r12),%xmm3 - pand -128(%rdx),%xmm0 - pand -112(%rdx),%xmm1 - por %xmm0,%xmm4 - pand -96(%rdx),%xmm2 - por %xmm1,%xmm5 - pand -80(%rdx),%xmm3 - por %xmm2,%xmm4 - por %xmm3,%xmm5 - movdqa -64(%r12),%xmm0 - movdqa -48(%r12),%xmm1 - movdqa -32(%r12),%xmm2 - movdqa -16(%r12),%xmm3 - pand -64(%rdx),%xmm0 - pand -48(%rdx),%xmm1 - por %xmm0,%xmm4 - pand -32(%rdx),%xmm2 - por %xmm1,%xmm5 - pand -16(%rdx),%xmm3 - por %xmm2,%xmm4 - por %xmm3,%xmm5 - movdqa 0(%r12),%xmm0 - movdqa 16(%r12),%xmm1 - movdqa 32(%r12),%xmm2 - movdqa 48(%r12),%xmm3 - pand 0(%rdx),%xmm0 - pand 16(%rdx),%xmm1 - por %xmm0,%xmm4 - pand 32(%rdx),%xmm2 - por %xmm1,%xmm5 - pand 48(%rdx),%xmm3 - por %xmm2,%xmm4 - por %xmm3,%xmm5 - movdqa 64(%r12),%xmm0 - movdqa 80(%r12),%xmm1 - movdqa 96(%r12),%xmm2 - movdqa 112(%r12),%xmm3 - pand 64(%rdx),%xmm0 - pand 80(%rdx),%xmm1 - por %xmm0,%xmm4 - pand 96(%rdx),%xmm2 - por %xmm1,%xmm5 - pand 112(%rdx),%xmm3 - por %xmm2,%xmm4 - por %xmm3,%xmm5 - por %xmm5,%xmm4 - pshufd $0x4e,%xmm4,%xmm0 - por %xmm4,%xmm0 - leaq 256(%r12),%r12 - - movq (%rsi),%rax -.byte 102,72,15,126,195 - - xorq %r15,%r15 - movq %r8,%rbp - movq (%rsp),%r10 - - mulq %rbx - addq %rax,%r10 - movq (%rcx),%rax - adcq $0,%rdx - - imulq 
%r10,%rbp - movq %rdx,%r11 - - mulq %rbp - addq %rax,%r10 - movq 8(%rsi),%rax - adcq $0,%rdx - movq 8(%rsp),%r10 - movq %rdx,%r13 - - leaq 1(%r15),%r15 - jmp L$inner_enter - -.p2align 4 -L$inner: - addq %rax,%r13 - movq (%rsi,%r15,8),%rax - adcq $0,%rdx - addq %r10,%r13 - movq (%rsp,%r15,8),%r10 - adcq $0,%rdx - movq %r13,-16(%rsp,%r15,8) - movq %rdx,%r13 - -L$inner_enter: - mulq %rbx - addq %rax,%r11 - movq (%rcx,%r15,8),%rax - adcq $0,%rdx - addq %r11,%r10 - movq %rdx,%r11 - adcq $0,%r11 - leaq 1(%r15),%r15 - - mulq %rbp - cmpq %r9,%r15 - jne L$inner - - addq %rax,%r13 - adcq $0,%rdx - addq %r10,%r13 - movq (%rsp,%r9,8),%r10 - adcq $0,%rdx - movq %r13,-16(%rsp,%r9,8) - movq %rdx,%r13 - - xorq %rdx,%rdx - addq %r11,%r13 - adcq $0,%rdx - addq %r10,%r13 - adcq $0,%rdx - movq %r13,-8(%rsp,%r9,8) - movq %rdx,(%rsp,%r9,8) - - leaq 1(%r14),%r14 - cmpq %r9,%r14 - jb L$outer - - xorq %r14,%r14 - movq (%rsp),%rax - leaq (%rsp),%rsi - movq %r9,%r15 - jmp L$sub -.p2align 4 -L$sub: sbbq (%rcx,%r14,8),%rax - movq %rax,(%rdi,%r14,8) - movq 8(%rsi,%r14,8),%rax - leaq 1(%r14),%r14 - decq %r15 - jnz L$sub - - sbbq $0,%rax - movq $-1,%rbx - xorq %rax,%rbx - xorq %r14,%r14 - movq %r9,%r15 - -L$copy: - movq (%rdi,%r14,8),%rcx - movq (%rsp,%r14,8),%rdx - andq %rbx,%rcx - andq %rax,%rdx - movq %r14,(%rsp,%r14,8) - orq %rcx,%rdx - movq %rdx,(%rdi,%r14,8) - leaq 1(%r14),%r14 - subq $1,%r15 - jnz L$copy - - movq 8(%rsp,%r9,8),%rsi - - movq $1,%rax - - movq -48(%rsi),%r15 - - movq -40(%rsi),%r14 - - movq -32(%rsi),%r13 - - movq -24(%rsi),%r12 - - movq -16(%rsi),%rbp - - movq -8(%rsi),%rbx - - leaq (%rsi),%rsp - -L$mul_epilogue: - .byte 0xf3,0xc3 - - - -.p2align 5 -bn_mul4x_mont_gather5: - -.byte 0x67 - movq %rsp,%rax - -L$mul4x_enter: - andl $0x80108,%r11d - cmpl $0x80108,%r11d - je L$mulx4x_enter - pushq %rbx - - pushq %rbp - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - -L$mul4x_prologue: - -.byte 0x67 - shll $3,%r9d - leaq (%r9,%r9,2),%r10 - negq %r9 - - - - - - - - - - - 
leaq -320(%rsp,%r9,2),%r11 - movq %rsp,%rbp - subq %rdi,%r11 - andq $4095,%r11 - cmpq %r11,%r10 - jb L$mul4xsp_alt - subq %r11,%rbp - leaq -320(%rbp,%r9,2),%rbp - jmp L$mul4xsp_done - -.p2align 5 -L$mul4xsp_alt: - leaq 4096-320(,%r9,2),%r10 - leaq -320(%rbp,%r9,2),%rbp - subq %r10,%r11 - movq $0,%r10 - cmovcq %r10,%r11 - subq %r11,%rbp -L$mul4xsp_done: - andq $-64,%rbp - movq %rsp,%r11 - subq %rbp,%r11 - andq $-4096,%r11 - leaq (%r11,%rbp,1),%rsp - movq (%rsp),%r10 - cmpq %rbp,%rsp - ja L$mul4x_page_walk - jmp L$mul4x_page_walk_done - -L$mul4x_page_walk: - leaq -4096(%rsp),%rsp - movq (%rsp),%r10 - cmpq %rbp,%rsp - ja L$mul4x_page_walk -L$mul4x_page_walk_done: - - negq %r9 - - movq %rax,40(%rsp) - -L$mul4x_body: - - call mul4x_internal - - movq 40(%rsp),%rsi - - movq $1,%rax - - movq -48(%rsi),%r15 - - movq -40(%rsi),%r14 - - movq -32(%rsi),%r13 - - movq -24(%rsi),%r12 - - movq -16(%rsi),%rbp - - movq -8(%rsi),%rbx - - leaq (%rsi),%rsp - -L$mul4x_epilogue: - .byte 0xf3,0xc3 - - - - -.p2align 5 -mul4x_internal: - - shlq $5,%r9 - movd 8(%rax),%xmm5 - leaq L$inc(%rip),%rax - leaq 128(%rdx,%r9,1),%r13 - shrq $5,%r9 - movdqa 0(%rax),%xmm0 - movdqa 16(%rax),%xmm1 - leaq 88-112(%rsp,%r9,1),%r10 - leaq 128(%rdx),%r12 - - pshufd $0,%xmm5,%xmm5 - movdqa %xmm1,%xmm4 -.byte 0x67,0x67 - movdqa %xmm1,%xmm2 - paddd %xmm0,%xmm1 - pcmpeqd %xmm5,%xmm0 -.byte 0x67 - movdqa %xmm4,%xmm3 - paddd %xmm1,%xmm2 - pcmpeqd %xmm5,%xmm1 - movdqa %xmm0,112(%r10) - movdqa %xmm4,%xmm0 - - paddd %xmm2,%xmm3 - pcmpeqd %xmm5,%xmm2 - movdqa %xmm1,128(%r10) - movdqa %xmm4,%xmm1 - - paddd %xmm3,%xmm0 - pcmpeqd %xmm5,%xmm3 - movdqa %xmm2,144(%r10) - movdqa %xmm4,%xmm2 - - paddd %xmm0,%xmm1 - pcmpeqd %xmm5,%xmm0 - movdqa %xmm3,160(%r10) - movdqa %xmm4,%xmm3 - paddd %xmm1,%xmm2 - pcmpeqd %xmm5,%xmm1 - movdqa %xmm0,176(%r10) - movdqa %xmm4,%xmm0 - - paddd %xmm2,%xmm3 - pcmpeqd %xmm5,%xmm2 - movdqa %xmm1,192(%r10) - movdqa %xmm4,%xmm1 - - paddd %xmm3,%xmm0 - pcmpeqd %xmm5,%xmm3 - movdqa %xmm2,208(%r10) - 
movdqa %xmm4,%xmm2 - - paddd %xmm0,%xmm1 - pcmpeqd %xmm5,%xmm0 - movdqa %xmm3,224(%r10) - movdqa %xmm4,%xmm3 - paddd %xmm1,%xmm2 - pcmpeqd %xmm5,%xmm1 - movdqa %xmm0,240(%r10) - movdqa %xmm4,%xmm0 - - paddd %xmm2,%xmm3 - pcmpeqd %xmm5,%xmm2 - movdqa %xmm1,256(%r10) - movdqa %xmm4,%xmm1 - - paddd %xmm3,%xmm0 - pcmpeqd %xmm5,%xmm3 - movdqa %xmm2,272(%r10) - movdqa %xmm4,%xmm2 - - paddd %xmm0,%xmm1 - pcmpeqd %xmm5,%xmm0 - movdqa %xmm3,288(%r10) - movdqa %xmm4,%xmm3 - paddd %xmm1,%xmm2 - pcmpeqd %xmm5,%xmm1 - movdqa %xmm0,304(%r10) - - paddd %xmm2,%xmm3 -.byte 0x67 - pcmpeqd %xmm5,%xmm2 - movdqa %xmm1,320(%r10) - - pcmpeqd %xmm5,%xmm3 - movdqa %xmm2,336(%r10) - pand 64(%r12),%xmm0 - - pand 80(%r12),%xmm1 - pand 96(%r12),%xmm2 - movdqa %xmm3,352(%r10) - pand 112(%r12),%xmm3 - por %xmm2,%xmm0 - por %xmm3,%xmm1 - movdqa -128(%r12),%xmm4 - movdqa -112(%r12),%xmm5 - movdqa -96(%r12),%xmm2 - pand 112(%r10),%xmm4 - movdqa -80(%r12),%xmm3 - pand 128(%r10),%xmm5 - por %xmm4,%xmm0 - pand 144(%r10),%xmm2 - por %xmm5,%xmm1 - pand 160(%r10),%xmm3 - por %xmm2,%xmm0 - por %xmm3,%xmm1 - movdqa -64(%r12),%xmm4 - movdqa -48(%r12),%xmm5 - movdqa -32(%r12),%xmm2 - pand 176(%r10),%xmm4 - movdqa -16(%r12),%xmm3 - pand 192(%r10),%xmm5 - por %xmm4,%xmm0 - pand 208(%r10),%xmm2 - por %xmm5,%xmm1 - pand 224(%r10),%xmm3 - por %xmm2,%xmm0 - por %xmm3,%xmm1 - movdqa 0(%r12),%xmm4 - movdqa 16(%r12),%xmm5 - movdqa 32(%r12),%xmm2 - pand 240(%r10),%xmm4 - movdqa 48(%r12),%xmm3 - pand 256(%r10),%xmm5 - por %xmm4,%xmm0 - pand 272(%r10),%xmm2 - por %xmm5,%xmm1 - pand 288(%r10),%xmm3 - por %xmm2,%xmm0 - por %xmm3,%xmm1 - por %xmm1,%xmm0 - pshufd $0x4e,%xmm0,%xmm1 - por %xmm1,%xmm0 - leaq 256(%r12),%r12 -.byte 102,72,15,126,195 - - movq %r13,16+8(%rsp) - movq %rdi,56+8(%rsp) - - movq (%r8),%r8 - movq (%rsi),%rax - leaq (%rsi,%r9,1),%rsi - negq %r9 - - movq %r8,%rbp - mulq %rbx - movq %rax,%r10 - movq (%rcx),%rax - - imulq %r10,%rbp - leaq 64+8(%rsp),%r14 - movq %rdx,%r11 - - mulq %rbp - addq %rax,%r10 - 
movq 8(%rsi,%r9,1),%rax - adcq $0,%rdx - movq %rdx,%rdi - - mulq %rbx - addq %rax,%r11 - movq 8(%rcx),%rax - adcq $0,%rdx - movq %rdx,%r10 - - mulq %rbp - addq %rax,%rdi - movq 16(%rsi,%r9,1),%rax - adcq $0,%rdx - addq %r11,%rdi - leaq 32(%r9),%r15 - leaq 32(%rcx),%rcx - adcq $0,%rdx - movq %rdi,(%r14) - movq %rdx,%r13 - jmp L$1st4x - -.p2align 5 -L$1st4x: - mulq %rbx - addq %rax,%r10 - movq -16(%rcx),%rax - leaq 32(%r14),%r14 - adcq $0,%rdx - movq %rdx,%r11 - - mulq %rbp - addq %rax,%r13 - movq -8(%rsi,%r15,1),%rax - adcq $0,%rdx - addq %r10,%r13 - adcq $0,%rdx - movq %r13,-24(%r14) - movq %rdx,%rdi - - mulq %rbx - addq %rax,%r11 - movq -8(%rcx),%rax - adcq $0,%rdx - movq %rdx,%r10 - - mulq %rbp - addq %rax,%rdi - movq (%rsi,%r15,1),%rax - adcq $0,%rdx - addq %r11,%rdi - adcq $0,%rdx - movq %rdi,-16(%r14) - movq %rdx,%r13 - - mulq %rbx - addq %rax,%r10 - movq 0(%rcx),%rax - adcq $0,%rdx - movq %rdx,%r11 - - mulq %rbp - addq %rax,%r13 - movq 8(%rsi,%r15,1),%rax - adcq $0,%rdx - addq %r10,%r13 - adcq $0,%rdx - movq %r13,-8(%r14) - movq %rdx,%rdi - - mulq %rbx - addq %rax,%r11 - movq 8(%rcx),%rax - adcq $0,%rdx - movq %rdx,%r10 - - mulq %rbp - addq %rax,%rdi - movq 16(%rsi,%r15,1),%rax - adcq $0,%rdx - addq %r11,%rdi - leaq 32(%rcx),%rcx - adcq $0,%rdx - movq %rdi,(%r14) - movq %rdx,%r13 - - addq $32,%r15 - jnz L$1st4x - - mulq %rbx - addq %rax,%r10 - movq -16(%rcx),%rax - leaq 32(%r14),%r14 - adcq $0,%rdx - movq %rdx,%r11 - - mulq %rbp - addq %rax,%r13 - movq -8(%rsi),%rax - adcq $0,%rdx - addq %r10,%r13 - adcq $0,%rdx - movq %r13,-24(%r14) - movq %rdx,%rdi - - mulq %rbx - addq %rax,%r11 - movq -8(%rcx),%rax - adcq $0,%rdx - movq %rdx,%r10 - - mulq %rbp - addq %rax,%rdi - movq (%rsi,%r9,1),%rax - adcq $0,%rdx - addq %r11,%rdi - adcq $0,%rdx - movq %rdi,-16(%r14) - movq %rdx,%r13 - - leaq (%rcx,%r9,1),%rcx - - xorq %rdi,%rdi - addq %r10,%r13 - adcq $0,%rdi - movq %r13,-8(%r14) - - jmp L$outer4x - -.p2align 5 -L$outer4x: - leaq 16+128(%r14),%rdx - pxor %xmm4,%xmm4 - 
pxor %xmm5,%xmm5 - movdqa -128(%r12),%xmm0 - movdqa -112(%r12),%xmm1 - movdqa -96(%r12),%xmm2 - movdqa -80(%r12),%xmm3 - pand -128(%rdx),%xmm0 - pand -112(%rdx),%xmm1 - por %xmm0,%xmm4 - pand -96(%rdx),%xmm2 - por %xmm1,%xmm5 - pand -80(%rdx),%xmm3 - por %xmm2,%xmm4 - por %xmm3,%xmm5 - movdqa -64(%r12),%xmm0 - movdqa -48(%r12),%xmm1 - movdqa -32(%r12),%xmm2 - movdqa -16(%r12),%xmm3 - pand -64(%rdx),%xmm0 - pand -48(%rdx),%xmm1 - por %xmm0,%xmm4 - pand -32(%rdx),%xmm2 - por %xmm1,%xmm5 - pand -16(%rdx),%xmm3 - por %xmm2,%xmm4 - por %xmm3,%xmm5 - movdqa 0(%r12),%xmm0 - movdqa 16(%r12),%xmm1 - movdqa 32(%r12),%xmm2 - movdqa 48(%r12),%xmm3 - pand 0(%rdx),%xmm0 - pand 16(%rdx),%xmm1 - por %xmm0,%xmm4 - pand 32(%rdx),%xmm2 - por %xmm1,%xmm5 - pand 48(%rdx),%xmm3 - por %xmm2,%xmm4 - por %xmm3,%xmm5 - movdqa 64(%r12),%xmm0 - movdqa 80(%r12),%xmm1 - movdqa 96(%r12),%xmm2 - movdqa 112(%r12),%xmm3 - pand 64(%rdx),%xmm0 - pand 80(%rdx),%xmm1 - por %xmm0,%xmm4 - pand 96(%rdx),%xmm2 - por %xmm1,%xmm5 - pand 112(%rdx),%xmm3 - por %xmm2,%xmm4 - por %xmm3,%xmm5 - por %xmm5,%xmm4 - pshufd $0x4e,%xmm4,%xmm0 - por %xmm4,%xmm0 - leaq 256(%r12),%r12 -.byte 102,72,15,126,195 - - movq (%r14,%r9,1),%r10 - movq %r8,%rbp - mulq %rbx - addq %rax,%r10 - movq (%rcx),%rax - adcq $0,%rdx - - imulq %r10,%rbp - movq %rdx,%r11 - movq %rdi,(%r14) - - leaq (%r14,%r9,1),%r14 - - mulq %rbp - addq %rax,%r10 - movq 8(%rsi,%r9,1),%rax - adcq $0,%rdx - movq %rdx,%rdi - - mulq %rbx - addq %rax,%r11 - movq 8(%rcx),%rax - adcq $0,%rdx - addq 8(%r14),%r11 - adcq $0,%rdx - movq %rdx,%r10 - - mulq %rbp - addq %rax,%rdi - movq 16(%rsi,%r9,1),%rax - adcq $0,%rdx - addq %r11,%rdi - leaq 32(%r9),%r15 - leaq 32(%rcx),%rcx - adcq $0,%rdx - movq %rdx,%r13 - jmp L$inner4x - -.p2align 5 -L$inner4x: - mulq %rbx - addq %rax,%r10 - movq -16(%rcx),%rax - adcq $0,%rdx - addq 16(%r14),%r10 - leaq 32(%r14),%r14 - adcq $0,%rdx - movq %rdx,%r11 - - mulq %rbp - addq %rax,%r13 - movq -8(%rsi,%r15,1),%rax - adcq $0,%rdx - addq 
%r10,%r13 - adcq $0,%rdx - movq %rdi,-32(%r14) - movq %rdx,%rdi - - mulq %rbx - addq %rax,%r11 - movq -8(%rcx),%rax - adcq $0,%rdx - addq -8(%r14),%r11 - adcq $0,%rdx - movq %rdx,%r10 - - mulq %rbp - addq %rax,%rdi - movq (%rsi,%r15,1),%rax - adcq $0,%rdx - addq %r11,%rdi - adcq $0,%rdx - movq %r13,-24(%r14) - movq %rdx,%r13 - - mulq %rbx - addq %rax,%r10 - movq 0(%rcx),%rax - adcq $0,%rdx - addq (%r14),%r10 - adcq $0,%rdx - movq %rdx,%r11 - - mulq %rbp - addq %rax,%r13 - movq 8(%rsi,%r15,1),%rax - adcq $0,%rdx - addq %r10,%r13 - adcq $0,%rdx - movq %rdi,-16(%r14) - movq %rdx,%rdi - - mulq %rbx - addq %rax,%r11 - movq 8(%rcx),%rax - adcq $0,%rdx - addq 8(%r14),%r11 - adcq $0,%rdx - movq %rdx,%r10 - - mulq %rbp - addq %rax,%rdi - movq 16(%rsi,%r15,1),%rax - adcq $0,%rdx - addq %r11,%rdi - leaq 32(%rcx),%rcx - adcq $0,%rdx - movq %r13,-8(%r14) - movq %rdx,%r13 - - addq $32,%r15 - jnz L$inner4x - - mulq %rbx - addq %rax,%r10 - movq -16(%rcx),%rax - adcq $0,%rdx - addq 16(%r14),%r10 - leaq 32(%r14),%r14 - adcq $0,%rdx - movq %rdx,%r11 - - mulq %rbp - addq %rax,%r13 - movq -8(%rsi),%rax - adcq $0,%rdx - addq %r10,%r13 - adcq $0,%rdx - movq %rdi,-32(%r14) - movq %rdx,%rdi - - mulq %rbx - addq %rax,%r11 - movq %rbp,%rax - movq -8(%rcx),%rbp - adcq $0,%rdx - addq -8(%r14),%r11 - adcq $0,%rdx - movq %rdx,%r10 - - mulq %rbp - addq %rax,%rdi - movq (%rsi,%r9,1),%rax - adcq $0,%rdx - addq %r11,%rdi - adcq $0,%rdx - movq %r13,-24(%r14) - movq %rdx,%r13 - - movq %rdi,-16(%r14) - leaq (%rcx,%r9,1),%rcx - - xorq %rdi,%rdi - addq %r10,%r13 - adcq $0,%rdi - addq (%r14),%r13 - adcq $0,%rdi - movq %r13,-8(%r14) - - cmpq 16+8(%rsp),%r12 - jb L$outer4x - xorq %rax,%rax - subq %r13,%rbp - adcq %r15,%r15 - orq %r15,%rdi - subq %rdi,%rax - leaq (%r14,%r9,1),%rbx - movq (%rcx),%r12 - leaq (%rcx),%rbp - movq %r9,%rcx - sarq $3+2,%rcx - movq 56+8(%rsp),%rdi - decq %r12 - xorq %r10,%r10 - movq 8(%rbp),%r13 - movq 16(%rbp),%r14 - movq 24(%rbp),%r15 - jmp L$sqr4x_sub_entry - - -.globl _bn_power5 
-.private_extern _bn_power5 - -.p2align 5 -_bn_power5: - - movq %rsp,%rax - - leaq _OPENSSL_ia32cap_P(%rip),%r11 - movl 8(%r11),%r11d - andl $0x80108,%r11d - cmpl $0x80108,%r11d - je L$powerx5_enter - pushq %rbx - - pushq %rbp - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - -L$power5_prologue: - - shll $3,%r9d - leal (%r9,%r9,2),%r10d - negq %r9 - movq (%r8),%r8 - - - - - - - - - leaq -320(%rsp,%r9,2),%r11 - movq %rsp,%rbp - subq %rdi,%r11 - andq $4095,%r11 - cmpq %r11,%r10 - jb L$pwr_sp_alt - subq %r11,%rbp - leaq -320(%rbp,%r9,2),%rbp - jmp L$pwr_sp_done - -.p2align 5 -L$pwr_sp_alt: - leaq 4096-320(,%r9,2),%r10 - leaq -320(%rbp,%r9,2),%rbp - subq %r10,%r11 - movq $0,%r10 - cmovcq %r10,%r11 - subq %r11,%rbp -L$pwr_sp_done: - andq $-64,%rbp - movq %rsp,%r11 - subq %rbp,%r11 - andq $-4096,%r11 - leaq (%r11,%rbp,1),%rsp - movq (%rsp),%r10 - cmpq %rbp,%rsp - ja L$pwr_page_walk - jmp L$pwr_page_walk_done - -L$pwr_page_walk: - leaq -4096(%rsp),%rsp - movq (%rsp),%r10 - cmpq %rbp,%rsp - ja L$pwr_page_walk -L$pwr_page_walk_done: - - movq %r9,%r10 - negq %r9 - - - - - - - - - - - movq %r8,32(%rsp) - movq %rax,40(%rsp) - -L$power5_body: -.byte 102,72,15,110,207 -.byte 102,72,15,110,209 -.byte 102,73,15,110,218 -.byte 102,72,15,110,226 - - call __bn_sqr8x_internal - call __bn_post4x_internal - call __bn_sqr8x_internal - call __bn_post4x_internal - call __bn_sqr8x_internal - call __bn_post4x_internal - call __bn_sqr8x_internal - call __bn_post4x_internal - call __bn_sqr8x_internal - call __bn_post4x_internal - -.byte 102,72,15,126,209 -.byte 102,72,15,126,226 - movq %rsi,%rdi - movq 40(%rsp),%rax - leaq 32(%rsp),%r8 - - call mul4x_internal - - movq 40(%rsp),%rsi - - movq $1,%rax - movq -48(%rsi),%r15 - - movq -40(%rsi),%r14 - - movq -32(%rsi),%r13 - - movq -24(%rsi),%r12 - - movq -16(%rsi),%rbp - - movq -8(%rsi),%rbx - - leaq (%rsi),%rsp - -L$power5_epilogue: - .byte 0xf3,0xc3 - - - -.globl _bn_sqr8x_internal -.private_extern _bn_sqr8x_internal -.private_extern 
_bn_sqr8x_internal - -.p2align 5 -_bn_sqr8x_internal: -__bn_sqr8x_internal: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - leaq 32(%r10),%rbp - leaq (%rsi,%r9,1),%rsi - - movq %r9,%rcx - - - movq -32(%rsi,%rbp,1),%r14 - leaq 48+8(%rsp,%r9,2),%rdi - movq -24(%rsi,%rbp,1),%rax - leaq -32(%rdi,%rbp,1),%rdi - movq -16(%rsi,%rbp,1),%rbx - movq %rax,%r15 - - mulq %r14 - movq %rax,%r10 - movq %rbx,%rax - movq %rdx,%r11 - movq %r10,-24(%rdi,%rbp,1) - - mulq %r14 - addq %rax,%r11 - movq %rbx,%rax - adcq $0,%rdx - movq %r11,-16(%rdi,%rbp,1) - movq %rdx,%r10 - - - movq -8(%rsi,%rbp,1),%rbx - mulq %r15 - movq %rax,%r12 - movq %rbx,%rax - movq %rdx,%r13 - - leaq (%rbp),%rcx - mulq %r14 - addq %rax,%r10 - movq %rbx,%rax - movq %rdx,%r11 - adcq $0,%r11 - addq %r12,%r10 - adcq $0,%r11 - movq %r10,-8(%rdi,%rcx,1) - jmp L$sqr4x_1st - -.p2align 5 -L$sqr4x_1st: - movq (%rsi,%rcx,1),%rbx - mulq %r15 - addq %rax,%r13 - movq %rbx,%rax - movq %rdx,%r12 - adcq $0,%r12 - - mulq %r14 - addq %rax,%r11 - movq %rbx,%rax - movq 8(%rsi,%rcx,1),%rbx - movq %rdx,%r10 - adcq $0,%r10 - addq %r13,%r11 - adcq $0,%r10 - - - mulq %r15 - addq %rax,%r12 - movq %rbx,%rax - movq %r11,(%rdi,%rcx,1) - movq %rdx,%r13 - adcq $0,%r13 - - mulq %r14 - addq %rax,%r10 - movq %rbx,%rax - movq 16(%rsi,%rcx,1),%rbx - movq %rdx,%r11 - adcq $0,%r11 - addq %r12,%r10 - adcq $0,%r11 - - mulq %r15 - addq %rax,%r13 - movq %rbx,%rax - movq %r10,8(%rdi,%rcx,1) - movq %rdx,%r12 - adcq $0,%r12 - - mulq %r14 - addq %rax,%r11 - movq %rbx,%rax - movq 24(%rsi,%rcx,1),%rbx - movq %rdx,%r10 - adcq $0,%r10 - addq %r13,%r11 - adcq $0,%r10 - - - mulq %r15 - addq %rax,%r12 - movq %rbx,%rax - movq %r11,16(%rdi,%rcx,1) - movq %rdx,%r13 - adcq $0,%r13 - leaq 32(%rcx),%rcx - - mulq %r14 - addq %rax,%r10 - movq %rbx,%rax - movq %rdx,%r11 - adcq $0,%r11 - addq %r12,%r10 - adcq $0,%r11 - movq %r10,-8(%rdi,%rcx,1) - - cmpq $0,%rcx - jne 
L$sqr4x_1st - - mulq %r15 - addq %rax,%r13 - leaq 16(%rbp),%rbp - adcq $0,%rdx - addq %r11,%r13 - adcq $0,%rdx - - movq %r13,(%rdi) - movq %rdx,%r12 - movq %rdx,8(%rdi) - jmp L$sqr4x_outer - -.p2align 5 -L$sqr4x_outer: - movq -32(%rsi,%rbp,1),%r14 - leaq 48+8(%rsp,%r9,2),%rdi - movq -24(%rsi,%rbp,1),%rax - leaq -32(%rdi,%rbp,1),%rdi - movq -16(%rsi,%rbp,1),%rbx - movq %rax,%r15 - - mulq %r14 - movq -24(%rdi,%rbp,1),%r10 - addq %rax,%r10 - movq %rbx,%rax - adcq $0,%rdx - movq %r10,-24(%rdi,%rbp,1) - movq %rdx,%r11 - - mulq %r14 - addq %rax,%r11 - movq %rbx,%rax - adcq $0,%rdx - addq -16(%rdi,%rbp,1),%r11 - movq %rdx,%r10 - adcq $0,%r10 - movq %r11,-16(%rdi,%rbp,1) - - xorq %r12,%r12 - - movq -8(%rsi,%rbp,1),%rbx - mulq %r15 - addq %rax,%r12 - movq %rbx,%rax - adcq $0,%rdx - addq -8(%rdi,%rbp,1),%r12 - movq %rdx,%r13 - adcq $0,%r13 - - mulq %r14 - addq %rax,%r10 - movq %rbx,%rax - adcq $0,%rdx - addq %r12,%r10 - movq %rdx,%r11 - adcq $0,%r11 - movq %r10,-8(%rdi,%rbp,1) - - leaq (%rbp),%rcx - jmp L$sqr4x_inner - -.p2align 5 -L$sqr4x_inner: - movq (%rsi,%rcx,1),%rbx - mulq %r15 - addq %rax,%r13 - movq %rbx,%rax - movq %rdx,%r12 - adcq $0,%r12 - addq (%rdi,%rcx,1),%r13 - adcq $0,%r12 - -.byte 0x67 - mulq %r14 - addq %rax,%r11 - movq %rbx,%rax - movq 8(%rsi,%rcx,1),%rbx - movq %rdx,%r10 - adcq $0,%r10 - addq %r13,%r11 - adcq $0,%r10 - - mulq %r15 - addq %rax,%r12 - movq %r11,(%rdi,%rcx,1) - movq %rbx,%rax - movq %rdx,%r13 - adcq $0,%r13 - addq 8(%rdi,%rcx,1),%r12 - leaq 16(%rcx),%rcx - adcq $0,%r13 - - mulq %r14 - addq %rax,%r10 - movq %rbx,%rax - adcq $0,%rdx - addq %r12,%r10 - movq %rdx,%r11 - adcq $0,%r11 - movq %r10,-8(%rdi,%rcx,1) - - cmpq $0,%rcx - jne L$sqr4x_inner - -.byte 0x67 - mulq %r15 - addq %rax,%r13 - adcq $0,%rdx - addq %r11,%r13 - adcq $0,%rdx - - movq %r13,(%rdi) - movq %rdx,%r12 - movq %rdx,8(%rdi) - - addq $16,%rbp - jnz L$sqr4x_outer - - - movq -32(%rsi),%r14 - leaq 48+8(%rsp,%r9,2),%rdi - movq -24(%rsi),%rax - leaq -32(%rdi,%rbp,1),%rdi - movq 
-16(%rsi),%rbx - movq %rax,%r15 - - mulq %r14 - addq %rax,%r10 - movq %rbx,%rax - movq %rdx,%r11 - adcq $0,%r11 - - mulq %r14 - addq %rax,%r11 - movq %rbx,%rax - movq %r10,-24(%rdi) - movq %rdx,%r10 - adcq $0,%r10 - addq %r13,%r11 - movq -8(%rsi),%rbx - adcq $0,%r10 - - mulq %r15 - addq %rax,%r12 - movq %rbx,%rax - movq %r11,-16(%rdi) - movq %rdx,%r13 - adcq $0,%r13 - - mulq %r14 - addq %rax,%r10 - movq %rbx,%rax - movq %rdx,%r11 - adcq $0,%r11 - addq %r12,%r10 - adcq $0,%r11 - movq %r10,-8(%rdi) - - mulq %r15 - addq %rax,%r13 - movq -16(%rsi),%rax - adcq $0,%rdx - addq %r11,%r13 - adcq $0,%rdx - - movq %r13,(%rdi) - movq %rdx,%r12 - movq %rdx,8(%rdi) - - mulq %rbx - addq $16,%rbp - xorq %r14,%r14 - subq %r9,%rbp - xorq %r15,%r15 - - addq %r12,%rax - adcq $0,%rdx - movq %rax,8(%rdi) - movq %rdx,16(%rdi) - movq %r15,24(%rdi) - - movq -16(%rsi,%rbp,1),%rax - leaq 48+8(%rsp),%rdi - xorq %r10,%r10 - movq 8(%rdi),%r11 - - leaq (%r14,%r10,2),%r12 - shrq $63,%r10 - leaq (%rcx,%r11,2),%r13 - shrq $63,%r11 - orq %r10,%r13 - movq 16(%rdi),%r10 - movq %r11,%r14 - mulq %rax - negq %r15 - movq 24(%rdi),%r11 - adcq %rax,%r12 - movq -8(%rsi,%rbp,1),%rax - movq %r12,(%rdi) - adcq %rdx,%r13 - - leaq (%r14,%r10,2),%rbx - movq %r13,8(%rdi) - sbbq %r15,%r15 - shrq $63,%r10 - leaq (%rcx,%r11,2),%r8 - shrq $63,%r11 - orq %r10,%r8 - movq 32(%rdi),%r10 - movq %r11,%r14 - mulq %rax - negq %r15 - movq 40(%rdi),%r11 - adcq %rax,%rbx - movq 0(%rsi,%rbp,1),%rax - movq %rbx,16(%rdi) - adcq %rdx,%r8 - leaq 16(%rbp),%rbp - movq %r8,24(%rdi) - sbbq %r15,%r15 - leaq 64(%rdi),%rdi - jmp L$sqr4x_shift_n_add - -.p2align 5 -L$sqr4x_shift_n_add: - leaq (%r14,%r10,2),%r12 - shrq $63,%r10 - leaq (%rcx,%r11,2),%r13 - shrq $63,%r11 - orq %r10,%r13 - movq -16(%rdi),%r10 - movq %r11,%r14 - mulq %rax - negq %r15 - movq -8(%rdi),%r11 - adcq %rax,%r12 - movq -8(%rsi,%rbp,1),%rax - movq %r12,-32(%rdi) - adcq %rdx,%r13 - - leaq (%r14,%r10,2),%rbx - movq %r13,-24(%rdi) - sbbq %r15,%r15 - shrq $63,%r10 - leaq 
(%rcx,%r11,2),%r8 - shrq $63,%r11 - orq %r10,%r8 - movq 0(%rdi),%r10 - movq %r11,%r14 - mulq %rax - negq %r15 - movq 8(%rdi),%r11 - adcq %rax,%rbx - movq 0(%rsi,%rbp,1),%rax - movq %rbx,-16(%rdi) - adcq %rdx,%r8 - - leaq (%r14,%r10,2),%r12 - movq %r8,-8(%rdi) - sbbq %r15,%r15 - shrq $63,%r10 - leaq (%rcx,%r11,2),%r13 - shrq $63,%r11 - orq %r10,%r13 - movq 16(%rdi),%r10 - movq %r11,%r14 - mulq %rax - negq %r15 - movq 24(%rdi),%r11 - adcq %rax,%r12 - movq 8(%rsi,%rbp,1),%rax - movq %r12,0(%rdi) - adcq %rdx,%r13 - - leaq (%r14,%r10,2),%rbx - movq %r13,8(%rdi) - sbbq %r15,%r15 - shrq $63,%r10 - leaq (%rcx,%r11,2),%r8 - shrq $63,%r11 - orq %r10,%r8 - movq 32(%rdi),%r10 - movq %r11,%r14 - mulq %rax - negq %r15 - movq 40(%rdi),%r11 - adcq %rax,%rbx - movq 16(%rsi,%rbp,1),%rax - movq %rbx,16(%rdi) - adcq %rdx,%r8 - movq %r8,24(%rdi) - sbbq %r15,%r15 - leaq 64(%rdi),%rdi - addq $32,%rbp - jnz L$sqr4x_shift_n_add - - leaq (%r14,%r10,2),%r12 -.byte 0x67 - shrq $63,%r10 - leaq (%rcx,%r11,2),%r13 - shrq $63,%r11 - orq %r10,%r13 - movq -16(%rdi),%r10 - movq %r11,%r14 - mulq %rax - negq %r15 - movq -8(%rdi),%r11 - adcq %rax,%r12 - movq -8(%rsi),%rax - movq %r12,-32(%rdi) - adcq %rdx,%r13 - - leaq (%r14,%r10,2),%rbx - movq %r13,-24(%rdi) - sbbq %r15,%r15 - shrq $63,%r10 - leaq (%rcx,%r11,2),%r8 - shrq $63,%r11 - orq %r10,%r8 - mulq %rax - negq %r15 - adcq %rax,%rbx - adcq %rdx,%r8 - movq %rbx,-16(%rdi) - movq %r8,-8(%rdi) -.byte 102,72,15,126,213 -__bn_sqr8x_reduction: - xorq %rax,%rax - leaq (%r9,%rbp,1),%rcx - leaq 48+8(%rsp,%r9,2),%rdx - movq %rcx,0+8(%rsp) - leaq 48+8(%rsp,%r9,1),%rdi - movq %rdx,8+8(%rsp) - negq %r9 - jmp L$8x_reduction_loop - -.p2align 5 -L$8x_reduction_loop: - leaq (%rdi,%r9,1),%rdi -.byte 0x66 - movq 0(%rdi),%rbx - movq 8(%rdi),%r9 - movq 16(%rdi),%r10 - movq 24(%rdi),%r11 - movq 32(%rdi),%r12 - movq 40(%rdi),%r13 - movq 48(%rdi),%r14 - movq 56(%rdi),%r15 - movq %rax,(%rdx) - leaq 64(%rdi),%rdi - -.byte 0x67 - movq %rbx,%r8 - imulq 32+8(%rsp),%rbx - movq 
0(%rbp),%rax - movl $8,%ecx - jmp L$8x_reduce - -.p2align 5 -L$8x_reduce: - mulq %rbx - movq 8(%rbp),%rax - negq %r8 - movq %rdx,%r8 - adcq $0,%r8 - - mulq %rbx - addq %rax,%r9 - movq 16(%rbp),%rax - adcq $0,%rdx - addq %r9,%r8 - movq %rbx,48-8+8(%rsp,%rcx,8) - movq %rdx,%r9 - adcq $0,%r9 - - mulq %rbx - addq %rax,%r10 - movq 24(%rbp),%rax - adcq $0,%rdx - addq %r10,%r9 - movq 32+8(%rsp),%rsi - movq %rdx,%r10 - adcq $0,%r10 - - mulq %rbx - addq %rax,%r11 - movq 32(%rbp),%rax - adcq $0,%rdx - imulq %r8,%rsi - addq %r11,%r10 - movq %rdx,%r11 - adcq $0,%r11 - - mulq %rbx - addq %rax,%r12 - movq 40(%rbp),%rax - adcq $0,%rdx - addq %r12,%r11 - movq %rdx,%r12 - adcq $0,%r12 - - mulq %rbx - addq %rax,%r13 - movq 48(%rbp),%rax - adcq $0,%rdx - addq %r13,%r12 - movq %rdx,%r13 - adcq $0,%r13 - - mulq %rbx - addq %rax,%r14 - movq 56(%rbp),%rax - adcq $0,%rdx - addq %r14,%r13 - movq %rdx,%r14 - adcq $0,%r14 - - mulq %rbx - movq %rsi,%rbx - addq %rax,%r15 - movq 0(%rbp),%rax - adcq $0,%rdx - addq %r15,%r14 - movq %rdx,%r15 - adcq $0,%r15 - - decl %ecx - jnz L$8x_reduce - - leaq 64(%rbp),%rbp - xorq %rax,%rax - movq 8+8(%rsp),%rdx - cmpq 0+8(%rsp),%rbp - jae L$8x_no_tail - -.byte 0x66 - addq 0(%rdi),%r8 - adcq 8(%rdi),%r9 - adcq 16(%rdi),%r10 - adcq 24(%rdi),%r11 - adcq 32(%rdi),%r12 - adcq 40(%rdi),%r13 - adcq 48(%rdi),%r14 - adcq 56(%rdi),%r15 - sbbq %rsi,%rsi - - movq 48+56+8(%rsp),%rbx - movl $8,%ecx - movq 0(%rbp),%rax - jmp L$8x_tail - -.p2align 5 -L$8x_tail: - mulq %rbx - addq %rax,%r8 - movq 8(%rbp),%rax - movq %r8,(%rdi) - movq %rdx,%r8 - adcq $0,%r8 - - mulq %rbx - addq %rax,%r9 - movq 16(%rbp),%rax - adcq $0,%rdx - addq %r9,%r8 - leaq 8(%rdi),%rdi - movq %rdx,%r9 - adcq $0,%r9 - - mulq %rbx - addq %rax,%r10 - movq 24(%rbp),%rax - adcq $0,%rdx - addq %r10,%r9 - movq %rdx,%r10 - adcq $0,%r10 - - mulq %rbx - addq %rax,%r11 - movq 32(%rbp),%rax - adcq $0,%rdx - addq %r11,%r10 - movq %rdx,%r11 - adcq $0,%r11 - - mulq %rbx - addq %rax,%r12 - movq 40(%rbp),%rax - adcq 
$0,%rdx - addq %r12,%r11 - movq %rdx,%r12 - adcq $0,%r12 - - mulq %rbx - addq %rax,%r13 - movq 48(%rbp),%rax - adcq $0,%rdx - addq %r13,%r12 - movq %rdx,%r13 - adcq $0,%r13 - - mulq %rbx - addq %rax,%r14 - movq 56(%rbp),%rax - adcq $0,%rdx - addq %r14,%r13 - movq %rdx,%r14 - adcq $0,%r14 - - mulq %rbx - movq 48-16+8(%rsp,%rcx,8),%rbx - addq %rax,%r15 - adcq $0,%rdx - addq %r15,%r14 - movq 0(%rbp),%rax - movq %rdx,%r15 - adcq $0,%r15 - - decl %ecx - jnz L$8x_tail - - leaq 64(%rbp),%rbp - movq 8+8(%rsp),%rdx - cmpq 0+8(%rsp),%rbp - jae L$8x_tail_done - - movq 48+56+8(%rsp),%rbx - negq %rsi - movq 0(%rbp),%rax - adcq 0(%rdi),%r8 - adcq 8(%rdi),%r9 - adcq 16(%rdi),%r10 - adcq 24(%rdi),%r11 - adcq 32(%rdi),%r12 - adcq 40(%rdi),%r13 - adcq 48(%rdi),%r14 - adcq 56(%rdi),%r15 - sbbq %rsi,%rsi - - movl $8,%ecx - jmp L$8x_tail - -.p2align 5 -L$8x_tail_done: - xorq %rax,%rax - addq (%rdx),%r8 - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - adcq $0,%r14 - adcq $0,%r15 - adcq $0,%rax - - negq %rsi -L$8x_no_tail: - adcq 0(%rdi),%r8 - adcq 8(%rdi),%r9 - adcq 16(%rdi),%r10 - adcq 24(%rdi),%r11 - adcq 32(%rdi),%r12 - adcq 40(%rdi),%r13 - adcq 48(%rdi),%r14 - adcq 56(%rdi),%r15 - adcq $0,%rax - movq -8(%rbp),%rcx - xorq %rsi,%rsi - -.byte 102,72,15,126,213 - - movq %r8,0(%rdi) - movq %r9,8(%rdi) -.byte 102,73,15,126,217 - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - movq %r14,48(%rdi) - movq %r15,56(%rdi) - leaq 64(%rdi),%rdi - - cmpq %rdx,%rdi - jb L$8x_reduction_loop - .byte 0xf3,0xc3 - - - -.p2align 5 -__bn_post4x_internal: - - movq 0(%rbp),%r12 - leaq (%rdi,%r9,1),%rbx - movq %r9,%rcx -.byte 102,72,15,126,207 - negq %rax -.byte 102,72,15,126,206 - sarq $3+2,%rcx - decq %r12 - xorq %r10,%r10 - movq 8(%rbp),%r13 - movq 16(%rbp),%r14 - movq 24(%rbp),%r15 - jmp L$sqr4x_sub_entry - -.p2align 4 -L$sqr4x_sub: - movq 0(%rbp),%r12 - movq 8(%rbp),%r13 - movq 16(%rbp),%r14 - movq 24(%rbp),%r15 -L$sqr4x_sub_entry: - leaq 
32(%rbp),%rbp - notq %r12 - notq %r13 - notq %r14 - notq %r15 - andq %rax,%r12 - andq %rax,%r13 - andq %rax,%r14 - andq %rax,%r15 - - negq %r10 - adcq 0(%rbx),%r12 - adcq 8(%rbx),%r13 - adcq 16(%rbx),%r14 - adcq 24(%rbx),%r15 - movq %r12,0(%rdi) - leaq 32(%rbx),%rbx - movq %r13,8(%rdi) - sbbq %r10,%r10 - movq %r14,16(%rdi) - movq %r15,24(%rdi) - leaq 32(%rdi),%rdi - - incq %rcx - jnz L$sqr4x_sub - - movq %r9,%r10 - negq %r9 - .byte 0xf3,0xc3 - - - -.p2align 5 -bn_mulx4x_mont_gather5: - - movq %rsp,%rax - -L$mulx4x_enter: - pushq %rbx - - pushq %rbp - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - -L$mulx4x_prologue: - - shll $3,%r9d - leaq (%r9,%r9,2),%r10 - negq %r9 - movq (%r8),%r8 - - - - - - - - - - - leaq -320(%rsp,%r9,2),%r11 - movq %rsp,%rbp - subq %rdi,%r11 - andq $4095,%r11 - cmpq %r11,%r10 - jb L$mulx4xsp_alt - subq %r11,%rbp - leaq -320(%rbp,%r9,2),%rbp - jmp L$mulx4xsp_done - -L$mulx4xsp_alt: - leaq 4096-320(,%r9,2),%r10 - leaq -320(%rbp,%r9,2),%rbp - subq %r10,%r11 - movq $0,%r10 - cmovcq %r10,%r11 - subq %r11,%rbp -L$mulx4xsp_done: - andq $-64,%rbp - movq %rsp,%r11 - subq %rbp,%r11 - andq $-4096,%r11 - leaq (%r11,%rbp,1),%rsp - movq (%rsp),%r10 - cmpq %rbp,%rsp - ja L$mulx4x_page_walk - jmp L$mulx4x_page_walk_done - -L$mulx4x_page_walk: - leaq -4096(%rsp),%rsp - movq (%rsp),%r10 - cmpq %rbp,%rsp - ja L$mulx4x_page_walk -L$mulx4x_page_walk_done: - - - - - - - - - - - - - - movq %r8,32(%rsp) - movq %rax,40(%rsp) - -L$mulx4x_body: - call mulx4x_internal - - movq 40(%rsp),%rsi - - movq $1,%rax - - movq -48(%rsi),%r15 - - movq -40(%rsi),%r14 - - movq -32(%rsi),%r13 - - movq -24(%rsi),%r12 - - movq -16(%rsi),%rbp - - movq -8(%rsi),%rbx - - leaq (%rsi),%rsp - -L$mulx4x_epilogue: - .byte 0xf3,0xc3 - - - - -.p2align 5 -mulx4x_internal: - - movq %r9,8(%rsp) - movq %r9,%r10 - negq %r9 - shlq $5,%r9 - negq %r10 - leaq 128(%rdx,%r9,1),%r13 - shrq $5+5,%r9 - movd 8(%rax),%xmm5 - subq $1,%r9 - leaq L$inc(%rip),%rax - movq %r13,16+8(%rsp) - movq 
%r9,24+8(%rsp) - movq %rdi,56+8(%rsp) - movdqa 0(%rax),%xmm0 - movdqa 16(%rax),%xmm1 - leaq 88-112(%rsp,%r10,1),%r10 - leaq 128(%rdx),%rdi - - pshufd $0,%xmm5,%xmm5 - movdqa %xmm1,%xmm4 -.byte 0x67 - movdqa %xmm1,%xmm2 -.byte 0x67 - paddd %xmm0,%xmm1 - pcmpeqd %xmm5,%xmm0 - movdqa %xmm4,%xmm3 - paddd %xmm1,%xmm2 - pcmpeqd %xmm5,%xmm1 - movdqa %xmm0,112(%r10) - movdqa %xmm4,%xmm0 - - paddd %xmm2,%xmm3 - pcmpeqd %xmm5,%xmm2 - movdqa %xmm1,128(%r10) - movdqa %xmm4,%xmm1 - - paddd %xmm3,%xmm0 - pcmpeqd %xmm5,%xmm3 - movdqa %xmm2,144(%r10) - movdqa %xmm4,%xmm2 - - paddd %xmm0,%xmm1 - pcmpeqd %xmm5,%xmm0 - movdqa %xmm3,160(%r10) - movdqa %xmm4,%xmm3 - paddd %xmm1,%xmm2 - pcmpeqd %xmm5,%xmm1 - movdqa %xmm0,176(%r10) - movdqa %xmm4,%xmm0 - - paddd %xmm2,%xmm3 - pcmpeqd %xmm5,%xmm2 - movdqa %xmm1,192(%r10) - movdqa %xmm4,%xmm1 - - paddd %xmm3,%xmm0 - pcmpeqd %xmm5,%xmm3 - movdqa %xmm2,208(%r10) - movdqa %xmm4,%xmm2 - - paddd %xmm0,%xmm1 - pcmpeqd %xmm5,%xmm0 - movdqa %xmm3,224(%r10) - movdqa %xmm4,%xmm3 - paddd %xmm1,%xmm2 - pcmpeqd %xmm5,%xmm1 - movdqa %xmm0,240(%r10) - movdqa %xmm4,%xmm0 - - paddd %xmm2,%xmm3 - pcmpeqd %xmm5,%xmm2 - movdqa %xmm1,256(%r10) - movdqa %xmm4,%xmm1 - - paddd %xmm3,%xmm0 - pcmpeqd %xmm5,%xmm3 - movdqa %xmm2,272(%r10) - movdqa %xmm4,%xmm2 - - paddd %xmm0,%xmm1 - pcmpeqd %xmm5,%xmm0 - movdqa %xmm3,288(%r10) - movdqa %xmm4,%xmm3 -.byte 0x67 - paddd %xmm1,%xmm2 - pcmpeqd %xmm5,%xmm1 - movdqa %xmm0,304(%r10) - - paddd %xmm2,%xmm3 - pcmpeqd %xmm5,%xmm2 - movdqa %xmm1,320(%r10) - - pcmpeqd %xmm5,%xmm3 - movdqa %xmm2,336(%r10) - - pand 64(%rdi),%xmm0 - pand 80(%rdi),%xmm1 - pand 96(%rdi),%xmm2 - movdqa %xmm3,352(%r10) - pand 112(%rdi),%xmm3 - por %xmm2,%xmm0 - por %xmm3,%xmm1 - movdqa -128(%rdi),%xmm4 - movdqa -112(%rdi),%xmm5 - movdqa -96(%rdi),%xmm2 - pand 112(%r10),%xmm4 - movdqa -80(%rdi),%xmm3 - pand 128(%r10),%xmm5 - por %xmm4,%xmm0 - pand 144(%r10),%xmm2 - por %xmm5,%xmm1 - pand 160(%r10),%xmm3 - por %xmm2,%xmm0 - por %xmm3,%xmm1 - movdqa 
-64(%rdi),%xmm4 - movdqa -48(%rdi),%xmm5 - movdqa -32(%rdi),%xmm2 - pand 176(%r10),%xmm4 - movdqa -16(%rdi),%xmm3 - pand 192(%r10),%xmm5 - por %xmm4,%xmm0 - pand 208(%r10),%xmm2 - por %xmm5,%xmm1 - pand 224(%r10),%xmm3 - por %xmm2,%xmm0 - por %xmm3,%xmm1 - movdqa 0(%rdi),%xmm4 - movdqa 16(%rdi),%xmm5 - movdqa 32(%rdi),%xmm2 - pand 240(%r10),%xmm4 - movdqa 48(%rdi),%xmm3 - pand 256(%r10),%xmm5 - por %xmm4,%xmm0 - pand 272(%r10),%xmm2 - por %xmm5,%xmm1 - pand 288(%r10),%xmm3 - por %xmm2,%xmm0 - por %xmm3,%xmm1 - pxor %xmm1,%xmm0 - pshufd $0x4e,%xmm0,%xmm1 - por %xmm1,%xmm0 - leaq 256(%rdi),%rdi -.byte 102,72,15,126,194 - leaq 64+32+8(%rsp),%rbx - - movq %rdx,%r9 - mulxq 0(%rsi),%r8,%rax - mulxq 8(%rsi),%r11,%r12 - addq %rax,%r11 - mulxq 16(%rsi),%rax,%r13 - adcq %rax,%r12 - adcq $0,%r13 - mulxq 24(%rsi),%rax,%r14 - - movq %r8,%r15 - imulq 32+8(%rsp),%r8 - xorq %rbp,%rbp - movq %r8,%rdx - - movq %rdi,8+8(%rsp) - - leaq 32(%rsi),%rsi - adcxq %rax,%r13 - adcxq %rbp,%r14 - - mulxq 0(%rcx),%rax,%r10 - adcxq %rax,%r15 - adoxq %r11,%r10 - mulxq 8(%rcx),%rax,%r11 - adcxq %rax,%r10 - adoxq %r12,%r11 - mulxq 16(%rcx),%rax,%r12 - movq 24+8(%rsp),%rdi - movq %r10,-32(%rbx) - adcxq %rax,%r11 - adoxq %r13,%r12 - mulxq 24(%rcx),%rax,%r15 - movq %r9,%rdx - movq %r11,-24(%rbx) - adcxq %rax,%r12 - adoxq %rbp,%r15 - leaq 32(%rcx),%rcx - movq %r12,-16(%rbx) - jmp L$mulx4x_1st - -.p2align 5 -L$mulx4x_1st: - adcxq %rbp,%r15 - mulxq 0(%rsi),%r10,%rax - adcxq %r14,%r10 - mulxq 8(%rsi),%r11,%r14 - adcxq %rax,%r11 - mulxq 16(%rsi),%r12,%rax - adcxq %r14,%r12 - mulxq 24(%rsi),%r13,%r14 -.byte 0x67,0x67 - movq %r8,%rdx - adcxq %rax,%r13 - adcxq %rbp,%r14 - leaq 32(%rsi),%rsi - leaq 32(%rbx),%rbx - - adoxq %r15,%r10 - mulxq 0(%rcx),%rax,%r15 - adcxq %rax,%r10 - adoxq %r15,%r11 - mulxq 8(%rcx),%rax,%r15 - adcxq %rax,%r11 - adoxq %r15,%r12 - mulxq 16(%rcx),%rax,%r15 - movq %r10,-40(%rbx) - adcxq %rax,%r12 - movq %r11,-32(%rbx) - adoxq %r15,%r13 - mulxq 24(%rcx),%rax,%r15 - movq %r9,%rdx - movq 
%r12,-24(%rbx) - adcxq %rax,%r13 - adoxq %rbp,%r15 - leaq 32(%rcx),%rcx - movq %r13,-16(%rbx) - - decq %rdi - jnz L$mulx4x_1st - - movq 8(%rsp),%rax - adcq %rbp,%r15 - leaq (%rsi,%rax,1),%rsi - addq %r15,%r14 - movq 8+8(%rsp),%rdi - adcq %rbp,%rbp - movq %r14,-8(%rbx) - jmp L$mulx4x_outer - -.p2align 5 -L$mulx4x_outer: - leaq 16-256(%rbx),%r10 - pxor %xmm4,%xmm4 -.byte 0x67,0x67 - pxor %xmm5,%xmm5 - movdqa -128(%rdi),%xmm0 - movdqa -112(%rdi),%xmm1 - movdqa -96(%rdi),%xmm2 - pand 256(%r10),%xmm0 - movdqa -80(%rdi),%xmm3 - pand 272(%r10),%xmm1 - por %xmm0,%xmm4 - pand 288(%r10),%xmm2 - por %xmm1,%xmm5 - pand 304(%r10),%xmm3 - por %xmm2,%xmm4 - por %xmm3,%xmm5 - movdqa -64(%rdi),%xmm0 - movdqa -48(%rdi),%xmm1 - movdqa -32(%rdi),%xmm2 - pand 320(%r10),%xmm0 - movdqa -16(%rdi),%xmm3 - pand 336(%r10),%xmm1 - por %xmm0,%xmm4 - pand 352(%r10),%xmm2 - por %xmm1,%xmm5 - pand 368(%r10),%xmm3 - por %xmm2,%xmm4 - por %xmm3,%xmm5 - movdqa 0(%rdi),%xmm0 - movdqa 16(%rdi),%xmm1 - movdqa 32(%rdi),%xmm2 - pand 384(%r10),%xmm0 - movdqa 48(%rdi),%xmm3 - pand 400(%r10),%xmm1 - por %xmm0,%xmm4 - pand 416(%r10),%xmm2 - por %xmm1,%xmm5 - pand 432(%r10),%xmm3 - por %xmm2,%xmm4 - por %xmm3,%xmm5 - movdqa 64(%rdi),%xmm0 - movdqa 80(%rdi),%xmm1 - movdqa 96(%rdi),%xmm2 - pand 448(%r10),%xmm0 - movdqa 112(%rdi),%xmm3 - pand 464(%r10),%xmm1 - por %xmm0,%xmm4 - pand 480(%r10),%xmm2 - por %xmm1,%xmm5 - pand 496(%r10),%xmm3 - por %xmm2,%xmm4 - por %xmm3,%xmm5 - por %xmm5,%xmm4 - pshufd $0x4e,%xmm4,%xmm0 - por %xmm4,%xmm0 - leaq 256(%rdi),%rdi -.byte 102,72,15,126,194 - - movq %rbp,(%rbx) - leaq 32(%rbx,%rax,1),%rbx - mulxq 0(%rsi),%r8,%r11 - xorq %rbp,%rbp - movq %rdx,%r9 - mulxq 8(%rsi),%r14,%r12 - adoxq -32(%rbx),%r8 - adcxq %r14,%r11 - mulxq 16(%rsi),%r15,%r13 - adoxq -24(%rbx),%r11 - adcxq %r15,%r12 - mulxq 24(%rsi),%rdx,%r14 - adoxq -16(%rbx),%r12 - adcxq %rdx,%r13 - leaq (%rcx,%rax,1),%rcx - leaq 32(%rsi),%rsi - adoxq -8(%rbx),%r13 - adcxq %rbp,%r14 - adoxq %rbp,%r14 - - movq %r8,%r15 - 
imulq 32+8(%rsp),%r8 - - movq %r8,%rdx - xorq %rbp,%rbp - movq %rdi,8+8(%rsp) - - mulxq 0(%rcx),%rax,%r10 - adcxq %rax,%r15 - adoxq %r11,%r10 - mulxq 8(%rcx),%rax,%r11 - adcxq %rax,%r10 - adoxq %r12,%r11 - mulxq 16(%rcx),%rax,%r12 - adcxq %rax,%r11 - adoxq %r13,%r12 - mulxq 24(%rcx),%rax,%r15 - movq %r9,%rdx - movq 24+8(%rsp),%rdi - movq %r10,-32(%rbx) - adcxq %rax,%r12 - movq %r11,-24(%rbx) - adoxq %rbp,%r15 - movq %r12,-16(%rbx) - leaq 32(%rcx),%rcx - jmp L$mulx4x_inner - -.p2align 5 -L$mulx4x_inner: - mulxq 0(%rsi),%r10,%rax - adcxq %rbp,%r15 - adoxq %r14,%r10 - mulxq 8(%rsi),%r11,%r14 - adcxq 0(%rbx),%r10 - adoxq %rax,%r11 - mulxq 16(%rsi),%r12,%rax - adcxq 8(%rbx),%r11 - adoxq %r14,%r12 - mulxq 24(%rsi),%r13,%r14 - movq %r8,%rdx - adcxq 16(%rbx),%r12 - adoxq %rax,%r13 - adcxq 24(%rbx),%r13 - adoxq %rbp,%r14 - leaq 32(%rsi),%rsi - leaq 32(%rbx),%rbx - adcxq %rbp,%r14 - - adoxq %r15,%r10 - mulxq 0(%rcx),%rax,%r15 - adcxq %rax,%r10 - adoxq %r15,%r11 - mulxq 8(%rcx),%rax,%r15 - adcxq %rax,%r11 - adoxq %r15,%r12 - mulxq 16(%rcx),%rax,%r15 - movq %r10,-40(%rbx) - adcxq %rax,%r12 - adoxq %r15,%r13 - movq %r11,-32(%rbx) - mulxq 24(%rcx),%rax,%r15 - movq %r9,%rdx - leaq 32(%rcx),%rcx - movq %r12,-24(%rbx) - adcxq %rax,%r13 - adoxq %rbp,%r15 - movq %r13,-16(%rbx) - - decq %rdi - jnz L$mulx4x_inner - - movq 0+8(%rsp),%rax - adcq %rbp,%r15 - subq 0(%rbx),%rdi - movq 8+8(%rsp),%rdi - movq 16+8(%rsp),%r10 - adcq %r15,%r14 - leaq (%rsi,%rax,1),%rsi - adcq %rbp,%rbp - movq %r14,-8(%rbx) - - cmpq %r10,%rdi - jb L$mulx4x_outer - - movq -8(%rcx),%r10 - movq %rbp,%r8 - movq (%rcx,%rax,1),%r12 - leaq (%rcx,%rax,1),%rbp - movq %rax,%rcx - leaq (%rbx,%rax,1),%rdi - xorl %eax,%eax - xorq %r15,%r15 - subq %r14,%r10 - adcq %r15,%r15 - orq %r15,%r8 - sarq $3+2,%rcx - subq %r8,%rax - movq 56+8(%rsp),%rdx - decq %r12 - movq 8(%rbp),%r13 - xorq %r8,%r8 - movq 16(%rbp),%r14 - movq 24(%rbp),%r15 - jmp L$sqrx4x_sub_entry - - - -.p2align 5 -bn_powerx5: - - movq %rsp,%rax - -L$powerx5_enter: - 
pushq %rbx - - pushq %rbp - - pushq %r12 - - pushq %r13 - - pushq %r14 - - pushq %r15 - -L$powerx5_prologue: - - shll $3,%r9d - leaq (%r9,%r9,2),%r10 - negq %r9 - movq (%r8),%r8 - - - - - - - - - leaq -320(%rsp,%r9,2),%r11 - movq %rsp,%rbp - subq %rdi,%r11 - andq $4095,%r11 - cmpq %r11,%r10 - jb L$pwrx_sp_alt - subq %r11,%rbp - leaq -320(%rbp,%r9,2),%rbp - jmp L$pwrx_sp_done - -.p2align 5 -L$pwrx_sp_alt: - leaq 4096-320(,%r9,2),%r10 - leaq -320(%rbp,%r9,2),%rbp - subq %r10,%r11 - movq $0,%r10 - cmovcq %r10,%r11 - subq %r11,%rbp -L$pwrx_sp_done: - andq $-64,%rbp - movq %rsp,%r11 - subq %rbp,%r11 - andq $-4096,%r11 - leaq (%r11,%rbp,1),%rsp - movq (%rsp),%r10 - cmpq %rbp,%rsp - ja L$pwrx_page_walk - jmp L$pwrx_page_walk_done - -L$pwrx_page_walk: - leaq -4096(%rsp),%rsp - movq (%rsp),%r10 - cmpq %rbp,%rsp - ja L$pwrx_page_walk -L$pwrx_page_walk_done: - - movq %r9,%r10 - negq %r9 - - - - - - - - - - - - - pxor %xmm0,%xmm0 -.byte 102,72,15,110,207 -.byte 102,72,15,110,209 -.byte 102,73,15,110,218 -.byte 102,72,15,110,226 - movq %r8,32(%rsp) - movq %rax,40(%rsp) - -L$powerx5_body: - - call __bn_sqrx8x_internal - call __bn_postx4x_internal - call __bn_sqrx8x_internal - call __bn_postx4x_internal - call __bn_sqrx8x_internal - call __bn_postx4x_internal - call __bn_sqrx8x_internal - call __bn_postx4x_internal - call __bn_sqrx8x_internal - call __bn_postx4x_internal - - movq %r10,%r9 - movq %rsi,%rdi -.byte 102,72,15,126,209 -.byte 102,72,15,126,226 - movq 40(%rsp),%rax - - call mulx4x_internal - - movq 40(%rsp),%rsi - - movq $1,%rax - - movq -48(%rsi),%r15 - - movq -40(%rsi),%r14 - - movq -32(%rsi),%r13 - - movq -24(%rsi),%r12 - - movq -16(%rsi),%rbp - - movq -8(%rsi),%rbx - - leaq (%rsi),%rsp - -L$powerx5_epilogue: - .byte 0xf3,0xc3 - - - -.globl _bn_sqrx8x_internal -.private_extern _bn_sqrx8x_internal -.private_extern _bn_sqrx8x_internal - -.p2align 5 -_bn_sqrx8x_internal: -__bn_sqrx8x_internal: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - leaq 48+8(%rsp),%rdi - leaq (%rsi,%r9,1),%rbp - movq %r9,0+8(%rsp) - movq %rbp,8+8(%rsp) - jmp L$sqr8x_zero_start - -.p2align 5 -.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 -L$sqrx8x_zero: -.byte 0x3e - movdqa %xmm0,0(%rdi) - movdqa %xmm0,16(%rdi) - movdqa %xmm0,32(%rdi) - movdqa %xmm0,48(%rdi) -L$sqr8x_zero_start: - movdqa %xmm0,64(%rdi) - movdqa %xmm0,80(%rdi) - movdqa %xmm0,96(%rdi) - movdqa %xmm0,112(%rdi) - leaq 128(%rdi),%rdi - subq $64,%r9 - jnz L$sqrx8x_zero - - movq 0(%rsi),%rdx - - xorq %r10,%r10 - xorq %r11,%r11 - xorq %r12,%r12 - xorq %r13,%r13 - xorq %r14,%r14 - xorq %r15,%r15 - leaq 48+8(%rsp),%rdi - xorq %rbp,%rbp - jmp L$sqrx8x_outer_loop - -.p2align 5 -L$sqrx8x_outer_loop: - mulxq 8(%rsi),%r8,%rax - adcxq %r9,%r8 - adoxq %rax,%r10 - mulxq 16(%rsi),%r9,%rax - adcxq %r10,%r9 - adoxq %rax,%r11 -.byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 - adcxq %r11,%r10 - adoxq %rax,%r12 -.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 - adcxq %r12,%r11 - adoxq %rax,%r13 - mulxq 40(%rsi),%r12,%rax - adcxq %r13,%r12 - adoxq %rax,%r14 - mulxq 48(%rsi),%r13,%rax - adcxq %r14,%r13 - adoxq %r15,%rax - mulxq 56(%rsi),%r14,%r15 - movq 8(%rsi),%rdx - adcxq %rax,%r14 - adoxq %rbp,%r15 - adcq 64(%rdi),%r15 - movq %r8,8(%rdi) - movq %r9,16(%rdi) - sbbq %rcx,%rcx - xorq %rbp,%rbp - - - mulxq 16(%rsi),%r8,%rbx - mulxq 24(%rsi),%r9,%rax - adcxq %r10,%r8 - adoxq %rbx,%r9 - mulxq 32(%rsi),%r10,%rbx - adcxq %r11,%r9 - adoxq %rax,%r10 -.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 - adcxq %r12,%r10 - adoxq %rbx,%r11 -.byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 - adcxq %r13,%r11 - adoxq %r14,%r12 -.byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 - movq 16(%rsi),%rdx - adcxq %rax,%r12 - adoxq %rbx,%r13 - adcxq %r15,%r13 - adoxq %rbp,%r14 - adcxq %rbp,%r14 - - movq %r8,24(%rdi) - movq %r9,32(%rdi) - - mulxq 24(%rsi),%r8,%rbx - mulxq 32(%rsi),%r9,%rax - adcxq %r10,%r8 - adoxq %rbx,%r9 - mulxq 40(%rsi),%r10,%rbx - adcxq %r11,%r9 - adoxq 
%rax,%r10 -.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 - adcxq %r12,%r10 - adoxq %r13,%r11 -.byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 -.byte 0x3e - movq 24(%rsi),%rdx - adcxq %rbx,%r11 - adoxq %rax,%r12 - adcxq %r14,%r12 - movq %r8,40(%rdi) - movq %r9,48(%rdi) - mulxq 32(%rsi),%r8,%rax - adoxq %rbp,%r13 - adcxq %rbp,%r13 - - mulxq 40(%rsi),%r9,%rbx - adcxq %r10,%r8 - adoxq %rax,%r9 - mulxq 48(%rsi),%r10,%rax - adcxq %r11,%r9 - adoxq %r12,%r10 - mulxq 56(%rsi),%r11,%r12 - movq 32(%rsi),%rdx - movq 40(%rsi),%r14 - adcxq %rbx,%r10 - adoxq %rax,%r11 - movq 48(%rsi),%r15 - adcxq %r13,%r11 - adoxq %rbp,%r12 - adcxq %rbp,%r12 - - movq %r8,56(%rdi) - movq %r9,64(%rdi) - - mulxq %r14,%r9,%rax - movq 56(%rsi),%r8 - adcxq %r10,%r9 - mulxq %r15,%r10,%rbx - adoxq %rax,%r10 - adcxq %r11,%r10 - mulxq %r8,%r11,%rax - movq %r14,%rdx - adoxq %rbx,%r11 - adcxq %r12,%r11 - - adcxq %rbp,%rax - - mulxq %r15,%r14,%rbx - mulxq %r8,%r12,%r13 - movq %r15,%rdx - leaq 64(%rsi),%rsi - adcxq %r14,%r11 - adoxq %rbx,%r12 - adcxq %rax,%r12 - adoxq %rbp,%r13 - -.byte 0x67,0x67 - mulxq %r8,%r8,%r14 - adcxq %r8,%r13 - adcxq %rbp,%r14 - - cmpq 8+8(%rsp),%rsi - je L$sqrx8x_outer_break - - negq %rcx - movq $-8,%rcx - movq %rbp,%r15 - movq 64(%rdi),%r8 - adcxq 72(%rdi),%r9 - adcxq 80(%rdi),%r10 - adcxq 88(%rdi),%r11 - adcq 96(%rdi),%r12 - adcq 104(%rdi),%r13 - adcq 112(%rdi),%r14 - adcq 120(%rdi),%r15 - leaq (%rsi),%rbp - leaq 128(%rdi),%rdi - sbbq %rax,%rax - - movq -64(%rsi),%rdx - movq %rax,16+8(%rsp) - movq %rdi,24+8(%rsp) - - - xorl %eax,%eax - jmp L$sqrx8x_loop - -.p2align 5 -L$sqrx8x_loop: - movq %r8,%rbx - mulxq 0(%rbp),%rax,%r8 - adcxq %rax,%rbx - adoxq %r9,%r8 - - mulxq 8(%rbp),%rax,%r9 - adcxq %rax,%r8 - adoxq %r10,%r9 - - mulxq 16(%rbp),%rax,%r10 - adcxq %rax,%r9 - adoxq %r11,%r10 - - mulxq 24(%rbp),%rax,%r11 - adcxq %rax,%r10 - adoxq %r12,%r11 - -.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 - adcxq %rax,%r11 - adoxq %r13,%r12 - - mulxq 40(%rbp),%rax,%r13 - adcxq 
%rax,%r12 - adoxq %r14,%r13 - - mulxq 48(%rbp),%rax,%r14 - movq %rbx,(%rdi,%rcx,8) - movl $0,%ebx - adcxq %rax,%r13 - adoxq %r15,%r14 - -.byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 - movq 8(%rsi,%rcx,8),%rdx - adcxq %rax,%r14 - adoxq %rbx,%r15 - adcxq %rbx,%r15 - -.byte 0x67 - incq %rcx - jnz L$sqrx8x_loop - - leaq 64(%rbp),%rbp - movq $-8,%rcx - cmpq 8+8(%rsp),%rbp - je L$sqrx8x_break - - subq 16+8(%rsp),%rbx -.byte 0x66 - movq -64(%rsi),%rdx - adcxq 0(%rdi),%r8 - adcxq 8(%rdi),%r9 - adcq 16(%rdi),%r10 - adcq 24(%rdi),%r11 - adcq 32(%rdi),%r12 - adcq 40(%rdi),%r13 - adcq 48(%rdi),%r14 - adcq 56(%rdi),%r15 - leaq 64(%rdi),%rdi -.byte 0x67 - sbbq %rax,%rax - xorl %ebx,%ebx - movq %rax,16+8(%rsp) - jmp L$sqrx8x_loop - -.p2align 5 -L$sqrx8x_break: - xorq %rbp,%rbp - subq 16+8(%rsp),%rbx - adcxq %rbp,%r8 - movq 24+8(%rsp),%rcx - adcxq %rbp,%r9 - movq 0(%rsi),%rdx - adcq $0,%r10 - movq %r8,0(%rdi) - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - adcq $0,%r14 - adcq $0,%r15 - cmpq %rcx,%rdi - je L$sqrx8x_outer_loop - - movq %r9,8(%rdi) - movq 8(%rcx),%r9 - movq %r10,16(%rdi) - movq 16(%rcx),%r10 - movq %r11,24(%rdi) - movq 24(%rcx),%r11 - movq %r12,32(%rdi) - movq 32(%rcx),%r12 - movq %r13,40(%rdi) - movq 40(%rcx),%r13 - movq %r14,48(%rdi) - movq 48(%rcx),%r14 - movq %r15,56(%rdi) - movq 56(%rcx),%r15 - movq %rcx,%rdi - jmp L$sqrx8x_outer_loop - -.p2align 5 -L$sqrx8x_outer_break: - movq %r9,72(%rdi) -.byte 102,72,15,126,217 - movq %r10,80(%rdi) - movq %r11,88(%rdi) - movq %r12,96(%rdi) - movq %r13,104(%rdi) - movq %r14,112(%rdi) - leaq 48+8(%rsp),%rdi - movq (%rsi,%rcx,1),%rdx - - movq 8(%rdi),%r11 - xorq %r10,%r10 - movq 0+8(%rsp),%r9 - adoxq %r11,%r11 - movq 16(%rdi),%r12 - movq 24(%rdi),%r13 - - -.p2align 5 -L$sqrx4x_shift_n_add: - mulxq %rdx,%rax,%rbx - adoxq %r12,%r12 - adcxq %r10,%rax -.byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 -.byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 - adoxq %r13,%r13 - adcxq %r11,%rbx - movq 40(%rdi),%r11 - movq %rax,0(%rdi) - movq 
%rbx,8(%rdi) - - mulxq %rdx,%rax,%rbx - adoxq %r10,%r10 - adcxq %r12,%rax - movq 16(%rsi,%rcx,1),%rdx - movq 48(%rdi),%r12 - adoxq %r11,%r11 - adcxq %r13,%rbx - movq 56(%rdi),%r13 - movq %rax,16(%rdi) - movq %rbx,24(%rdi) - - mulxq %rdx,%rax,%rbx - adoxq %r12,%r12 - adcxq %r10,%rax - movq 24(%rsi,%rcx,1),%rdx - leaq 32(%rcx),%rcx - movq 64(%rdi),%r10 - adoxq %r13,%r13 - adcxq %r11,%rbx - movq 72(%rdi),%r11 - movq %rax,32(%rdi) - movq %rbx,40(%rdi) - - mulxq %rdx,%rax,%rbx - adoxq %r10,%r10 - adcxq %r12,%rax - jrcxz L$sqrx4x_shift_n_add_break -.byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 - adoxq %r11,%r11 - adcxq %r13,%rbx - movq 80(%rdi),%r12 - movq 88(%rdi),%r13 - movq %rax,48(%rdi) - movq %rbx,56(%rdi) - leaq 64(%rdi),%rdi - nop - jmp L$sqrx4x_shift_n_add - -.p2align 5 -L$sqrx4x_shift_n_add_break: - adcxq %r13,%rbx - movq %rax,48(%rdi) - movq %rbx,56(%rdi) - leaq 64(%rdi),%rdi -.byte 102,72,15,126,213 -__bn_sqrx8x_reduction: - xorl %eax,%eax - movq 32+8(%rsp),%rbx - movq 48+8(%rsp),%rdx - leaq -64(%rbp,%r9,1),%rcx - - movq %rcx,0+8(%rsp) - movq %rdi,8+8(%rsp) - - leaq 48+8(%rsp),%rdi - jmp L$sqrx8x_reduction_loop - -.p2align 5 -L$sqrx8x_reduction_loop: - movq 8(%rdi),%r9 - movq 16(%rdi),%r10 - movq 24(%rdi),%r11 - movq 32(%rdi),%r12 - movq %rdx,%r8 - imulq %rbx,%rdx - movq 40(%rdi),%r13 - movq 48(%rdi),%r14 - movq 56(%rdi),%r15 - movq %rax,24+8(%rsp) - - leaq 64(%rdi),%rdi - xorq %rsi,%rsi - movq $-8,%rcx - jmp L$sqrx8x_reduce - -.p2align 5 -L$sqrx8x_reduce: - movq %r8,%rbx - mulxq 0(%rbp),%rax,%r8 - adcxq %rbx,%rax - adoxq %r9,%r8 - - mulxq 8(%rbp),%rbx,%r9 - adcxq %rbx,%r8 - adoxq %r10,%r9 - - mulxq 16(%rbp),%rbx,%r10 - adcxq %rbx,%r9 - adoxq %r11,%r10 - - mulxq 24(%rbp),%rbx,%r11 - adcxq %rbx,%r10 - adoxq %r12,%r11 - -.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 - movq %rdx,%rax - movq %r8,%rdx - adcxq %rbx,%r11 - adoxq %r13,%r12 - - mulxq 32+8(%rsp),%rbx,%rdx - movq %rax,%rdx - movq %rax,64+48+8(%rsp,%rcx,8) - - mulxq 40(%rbp),%rax,%r13 - adcxq 
%rax,%r12 - adoxq %r14,%r13 - - mulxq 48(%rbp),%rax,%r14 - adcxq %rax,%r13 - adoxq %r15,%r14 - - mulxq 56(%rbp),%rax,%r15 - movq %rbx,%rdx - adcxq %rax,%r14 - adoxq %rsi,%r15 - adcxq %rsi,%r15 - -.byte 0x67,0x67,0x67 - incq %rcx - jnz L$sqrx8x_reduce - - movq %rsi,%rax - cmpq 0+8(%rsp),%rbp - jae L$sqrx8x_no_tail - - movq 48+8(%rsp),%rdx - addq 0(%rdi),%r8 - leaq 64(%rbp),%rbp - movq $-8,%rcx - adcxq 8(%rdi),%r9 - adcxq 16(%rdi),%r10 - adcq 24(%rdi),%r11 - adcq 32(%rdi),%r12 - adcq 40(%rdi),%r13 - adcq 48(%rdi),%r14 - adcq 56(%rdi),%r15 - leaq 64(%rdi),%rdi - sbbq %rax,%rax - - xorq %rsi,%rsi - movq %rax,16+8(%rsp) - jmp L$sqrx8x_tail - -.p2align 5 -L$sqrx8x_tail: - movq %r8,%rbx - mulxq 0(%rbp),%rax,%r8 - adcxq %rax,%rbx - adoxq %r9,%r8 - - mulxq 8(%rbp),%rax,%r9 - adcxq %rax,%r8 - adoxq %r10,%r9 - - mulxq 16(%rbp),%rax,%r10 - adcxq %rax,%r9 - adoxq %r11,%r10 - - mulxq 24(%rbp),%rax,%r11 - adcxq %rax,%r10 - adoxq %r12,%r11 - -.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 - adcxq %rax,%r11 - adoxq %r13,%r12 - - mulxq 40(%rbp),%rax,%r13 - adcxq %rax,%r12 - adoxq %r14,%r13 - - mulxq 48(%rbp),%rax,%r14 - adcxq %rax,%r13 - adoxq %r15,%r14 - - mulxq 56(%rbp),%rax,%r15 - movq 72+48+8(%rsp,%rcx,8),%rdx - adcxq %rax,%r14 - adoxq %rsi,%r15 - movq %rbx,(%rdi,%rcx,8) - movq %r8,%rbx - adcxq %rsi,%r15 - - incq %rcx - jnz L$sqrx8x_tail - - cmpq 0+8(%rsp),%rbp - jae L$sqrx8x_tail_done - - subq 16+8(%rsp),%rsi - movq 48+8(%rsp),%rdx - leaq 64(%rbp),%rbp - adcq 0(%rdi),%r8 - adcq 8(%rdi),%r9 - adcq 16(%rdi),%r10 - adcq 24(%rdi),%r11 - adcq 32(%rdi),%r12 - adcq 40(%rdi),%r13 - adcq 48(%rdi),%r14 - adcq 56(%rdi),%r15 - leaq 64(%rdi),%rdi - sbbq %rax,%rax - subq $8,%rcx - - xorq %rsi,%rsi - movq %rax,16+8(%rsp) - jmp L$sqrx8x_tail - -.p2align 5 -L$sqrx8x_tail_done: - xorq %rax,%rax - addq 24+8(%rsp),%r8 - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - adcq $0,%r14 - adcq $0,%r15 - adcq $0,%rax - - subq 16+8(%rsp),%rsi -L$sqrx8x_no_tail: - adcq 
0(%rdi),%r8 -.byte 102,72,15,126,217 - adcq 8(%rdi),%r9 - movq 56(%rbp),%rsi -.byte 102,72,15,126,213 - adcq 16(%rdi),%r10 - adcq 24(%rdi),%r11 - adcq 32(%rdi),%r12 - adcq 40(%rdi),%r13 - adcq 48(%rdi),%r14 - adcq 56(%rdi),%r15 - adcq $0,%rax - - movq 32+8(%rsp),%rbx - movq 64(%rdi,%rcx,1),%rdx - - movq %r8,0(%rdi) - leaq 64(%rdi),%r8 - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - movq %r14,48(%rdi) - movq %r15,56(%rdi) - - leaq 64(%rdi,%rcx,1),%rdi - cmpq 8+8(%rsp),%r8 - jb L$sqrx8x_reduction_loop - .byte 0xf3,0xc3 - - -.p2align 5 - -__bn_postx4x_internal: - - movq 0(%rbp),%r12 - movq %rcx,%r10 - movq %rcx,%r9 - negq %rax - sarq $3+2,%rcx - -.byte 102,72,15,126,202 -.byte 102,72,15,126,206 - decq %r12 - movq 8(%rbp),%r13 - xorq %r8,%r8 - movq 16(%rbp),%r14 - movq 24(%rbp),%r15 - jmp L$sqrx4x_sub_entry - -.p2align 4 -L$sqrx4x_sub: - movq 0(%rbp),%r12 - movq 8(%rbp),%r13 - movq 16(%rbp),%r14 - movq 24(%rbp),%r15 -L$sqrx4x_sub_entry: - andnq %rax,%r12,%r12 - leaq 32(%rbp),%rbp - andnq %rax,%r13,%r13 - andnq %rax,%r14,%r14 - andnq %rax,%r15,%r15 - - negq %r8 - adcq 0(%rdi),%r12 - adcq 8(%rdi),%r13 - adcq 16(%rdi),%r14 - adcq 24(%rdi),%r15 - movq %r12,0(%rdx) - leaq 32(%rdi),%rdi - movq %r13,8(%rdx) - sbbq %r8,%r8 - movq %r14,16(%rdx) - movq %r15,24(%rdx) - leaq 32(%rdx),%rdx - - incq %rcx - jnz L$sqrx4x_sub - - negq %r9 - - .byte 0xf3,0xc3 - - -.globl _bn_scatter5 -.private_extern _bn_scatter5 - -.p2align 4 -_bn_scatter5: - - cmpl $0,%esi - jz L$scatter_epilogue - leaq (%rdx,%rcx,8),%rdx -L$scatter: - movq (%rdi),%rax - leaq 8(%rdi),%rdi - movq %rax,(%rdx) - leaq 256(%rdx),%rdx - subl $1,%esi - jnz L$scatter -L$scatter_epilogue: - .byte 0xf3,0xc3 - - - -.globl _bn_gather5 -.private_extern _bn_gather5 - -.p2align 5 -_bn_gather5: - -L$SEH_begin_bn_gather5: - -.byte 0x4c,0x8d,0x14,0x24 - -.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 - leaq L$inc(%rip),%rax - andq $-16,%rsp - - movd %ecx,%xmm5 - movdqa 0(%rax),%xmm0 - 
movdqa 16(%rax),%xmm1 - leaq 128(%rdx),%r11 - leaq 128(%rsp),%rax - - pshufd $0,%xmm5,%xmm5 - movdqa %xmm1,%xmm4 - movdqa %xmm1,%xmm2 - paddd %xmm0,%xmm1 - pcmpeqd %xmm5,%xmm0 - movdqa %xmm4,%xmm3 - - paddd %xmm1,%xmm2 - pcmpeqd %xmm5,%xmm1 - movdqa %xmm0,-128(%rax) - movdqa %xmm4,%xmm0 - - paddd %xmm2,%xmm3 - pcmpeqd %xmm5,%xmm2 - movdqa %xmm1,-112(%rax) - movdqa %xmm4,%xmm1 - - paddd %xmm3,%xmm0 - pcmpeqd %xmm5,%xmm3 - movdqa %xmm2,-96(%rax) - movdqa %xmm4,%xmm2 - paddd %xmm0,%xmm1 - pcmpeqd %xmm5,%xmm0 - movdqa %xmm3,-80(%rax) - movdqa %xmm4,%xmm3 - - paddd %xmm1,%xmm2 - pcmpeqd %xmm5,%xmm1 - movdqa %xmm0,-64(%rax) - movdqa %xmm4,%xmm0 - - paddd %xmm2,%xmm3 - pcmpeqd %xmm5,%xmm2 - movdqa %xmm1,-48(%rax) - movdqa %xmm4,%xmm1 - - paddd %xmm3,%xmm0 - pcmpeqd %xmm5,%xmm3 - movdqa %xmm2,-32(%rax) - movdqa %xmm4,%xmm2 - paddd %xmm0,%xmm1 - pcmpeqd %xmm5,%xmm0 - movdqa %xmm3,-16(%rax) - movdqa %xmm4,%xmm3 - - paddd %xmm1,%xmm2 - pcmpeqd %xmm5,%xmm1 - movdqa %xmm0,0(%rax) - movdqa %xmm4,%xmm0 - - paddd %xmm2,%xmm3 - pcmpeqd %xmm5,%xmm2 - movdqa %xmm1,16(%rax) - movdqa %xmm4,%xmm1 - - paddd %xmm3,%xmm0 - pcmpeqd %xmm5,%xmm3 - movdqa %xmm2,32(%rax) - movdqa %xmm4,%xmm2 - paddd %xmm0,%xmm1 - pcmpeqd %xmm5,%xmm0 - movdqa %xmm3,48(%rax) - movdqa %xmm4,%xmm3 - - paddd %xmm1,%xmm2 - pcmpeqd %xmm5,%xmm1 - movdqa %xmm0,64(%rax) - movdqa %xmm4,%xmm0 - - paddd %xmm2,%xmm3 - pcmpeqd %xmm5,%xmm2 - movdqa %xmm1,80(%rax) - movdqa %xmm4,%xmm1 - - paddd %xmm3,%xmm0 - pcmpeqd %xmm5,%xmm3 - movdqa %xmm2,96(%rax) - movdqa %xmm4,%xmm2 - movdqa %xmm3,112(%rax) - jmp L$gather - -.p2align 5 -L$gather: - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - movdqa -128(%r11),%xmm0 - movdqa -112(%r11),%xmm1 - movdqa -96(%r11),%xmm2 - pand -128(%rax),%xmm0 - movdqa -80(%r11),%xmm3 - pand -112(%rax),%xmm1 - por %xmm0,%xmm4 - pand -96(%rax),%xmm2 - por %xmm1,%xmm5 - pand -80(%rax),%xmm3 - por %xmm2,%xmm4 - por %xmm3,%xmm5 - movdqa -64(%r11),%xmm0 - movdqa -48(%r11),%xmm1 - movdqa -32(%r11),%xmm2 - pand 
-64(%rax),%xmm0 - movdqa -16(%r11),%xmm3 - pand -48(%rax),%xmm1 - por %xmm0,%xmm4 - pand -32(%rax),%xmm2 - por %xmm1,%xmm5 - pand -16(%rax),%xmm3 - por %xmm2,%xmm4 - por %xmm3,%xmm5 - movdqa 0(%r11),%xmm0 - movdqa 16(%r11),%xmm1 - movdqa 32(%r11),%xmm2 - pand 0(%rax),%xmm0 - movdqa 48(%r11),%xmm3 - pand 16(%rax),%xmm1 - por %xmm0,%xmm4 - pand 32(%rax),%xmm2 - por %xmm1,%xmm5 - pand 48(%rax),%xmm3 - por %xmm2,%xmm4 - por %xmm3,%xmm5 - movdqa 64(%r11),%xmm0 - movdqa 80(%r11),%xmm1 - movdqa 96(%r11),%xmm2 - pand 64(%rax),%xmm0 - movdqa 112(%r11),%xmm3 - pand 80(%rax),%xmm1 - por %xmm0,%xmm4 - pand 96(%rax),%xmm2 - por %xmm1,%xmm5 - pand 112(%rax),%xmm3 - por %xmm2,%xmm4 - por %xmm3,%xmm5 - por %xmm5,%xmm4 - leaq 256(%r11),%r11 - pshufd $0x4e,%xmm4,%xmm0 - por %xmm4,%xmm0 - movq %xmm0,(%rdi) - leaq 8(%rdi),%rdi - subl $1,%esi - jnz L$gather - - leaq (%r10),%rsp - - .byte 0xf3,0xc3 -L$SEH_end_bn_gather5: - - -.p2align 6 -L$inc: -.long 0,0, 1,1 -.long 2,2, 2,2 -.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -#endif diff --git a/third_party/boringssl/apple-x86_64/crypto/test/trampoline-x86_64.S b/third_party/boringssl/apple-x86_64/crypto/test/trampoline-x86_64.S deleted file mode 100644 index 5f20aa78..00000000 --- a/third_party/boringssl/apple-x86_64/crypto/test/trampoline-x86_64.S +++ /dev/null @@ -1,513 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text - - - - - - - - - -.globl _abi_test_trampoline -.private_extern _abi_test_trampoline -.p2align 4 -_abi_test_trampoline: -L$abi_test_trampoline_seh_begin: - - - - - - - - - - - subq $120,%rsp - -L$abi_test_trampoline_seh_prolog_alloc: - movq %r8,48(%rsp) - movq %rbx,64(%rsp) - -L$abi_test_trampoline_seh_prolog_rbx: - movq %rbp,72(%rsp) - -L$abi_test_trampoline_seh_prolog_rbp: - movq %r12,80(%rsp) - -L$abi_test_trampoline_seh_prolog_r12: - movq %r13,88(%rsp) - -L$abi_test_trampoline_seh_prolog_r13: - movq %r14,96(%rsp) - -L$abi_test_trampoline_seh_prolog_r14: - movq %r15,104(%rsp) - -L$abi_test_trampoline_seh_prolog_r15: -L$abi_test_trampoline_seh_prolog_end: - movq 0(%rsi),%rbx - movq 8(%rsi),%rbp - movq 16(%rsi),%r12 - movq 24(%rsi),%r13 - movq 32(%rsi),%r14 - movq 40(%rsi),%r15 - - movq %rdi,32(%rsp) - movq %rsi,40(%rsp) - - - - - movq %rdx,%r10 - movq %rcx,%r11 - decq %r11 - js L$args_done - movq (%r10),%rdi - addq $8,%r10 - decq %r11 - js L$args_done - movq (%r10),%rsi - addq $8,%r10 - decq %r11 - js L$args_done - movq (%r10),%rdx - addq $8,%r10 - decq %r11 - js L$args_done - movq (%r10),%rcx - addq $8,%r10 - decq %r11 - js L$args_done - movq (%r10),%r8 - addq $8,%r10 - decq %r11 - js L$args_done - movq (%r10),%r9 - addq $8,%r10 - leaq 0(%rsp),%rax -L$args_loop: - decq %r11 - js L$args_done - - - - - - - movq %r11,56(%rsp) - movq (%r10),%r11 - movq %r11,(%rax) - movq 56(%rsp),%r11 - - addq $8,%r10 - addq $8,%rax - jmp L$args_loop - -L$args_done: - movq 32(%rsp),%rax - movq 48(%rsp),%r10 - testq %r10,%r10 - jz L$no_unwind - - - pushfq - orq $0x100,0(%rsp) - popfq - - - - nop -.globl _abi_test_unwind_start -.private_extern _abi_test_unwind_start -_abi_test_unwind_start: - - call *%rax -.globl _abi_test_unwind_return 
-.private_extern _abi_test_unwind_return -_abi_test_unwind_return: - - - - - pushfq - andq $-0x101,0(%rsp) - popfq -.globl _abi_test_unwind_stop -.private_extern _abi_test_unwind_stop -_abi_test_unwind_stop: - - jmp L$call_done - -L$no_unwind: - call *%rax - -L$call_done: - - movq 40(%rsp),%rsi - movq %rbx,0(%rsi) - movq %rbp,8(%rsi) - movq %r12,16(%rsi) - movq %r13,24(%rsi) - movq %r14,32(%rsi) - movq %r15,40(%rsi) - movq 64(%rsp),%rbx - - movq 72(%rsp),%rbp - - movq 80(%rsp),%r12 - - movq 88(%rsp),%r13 - - movq 96(%rsp),%r14 - - movq 104(%rsp),%r15 - - addq $120,%rsp - - - - .byte 0xf3,0xc3 - -L$abi_test_trampoline_seh_end: - - -.globl _abi_test_clobber_rax -.private_extern _abi_test_clobber_rax -.p2align 4 -_abi_test_clobber_rax: - xorq %rax,%rax - .byte 0xf3,0xc3 - - -.globl _abi_test_clobber_rbx -.private_extern _abi_test_clobber_rbx -.p2align 4 -_abi_test_clobber_rbx: - xorq %rbx,%rbx - .byte 0xf3,0xc3 - - -.globl _abi_test_clobber_rcx -.private_extern _abi_test_clobber_rcx -.p2align 4 -_abi_test_clobber_rcx: - xorq %rcx,%rcx - .byte 0xf3,0xc3 - - -.globl _abi_test_clobber_rdx -.private_extern _abi_test_clobber_rdx -.p2align 4 -_abi_test_clobber_rdx: - xorq %rdx,%rdx - .byte 0xf3,0xc3 - - -.globl _abi_test_clobber_rdi -.private_extern _abi_test_clobber_rdi -.p2align 4 -_abi_test_clobber_rdi: - xorq %rdi,%rdi - .byte 0xf3,0xc3 - - -.globl _abi_test_clobber_rsi -.private_extern _abi_test_clobber_rsi -.p2align 4 -_abi_test_clobber_rsi: - xorq %rsi,%rsi - .byte 0xf3,0xc3 - - -.globl _abi_test_clobber_rbp -.private_extern _abi_test_clobber_rbp -.p2align 4 -_abi_test_clobber_rbp: - xorq %rbp,%rbp - .byte 0xf3,0xc3 - - -.globl _abi_test_clobber_r8 -.private_extern _abi_test_clobber_r8 -.p2align 4 -_abi_test_clobber_r8: - xorq %r8,%r8 - .byte 0xf3,0xc3 - - -.globl _abi_test_clobber_r9 -.private_extern _abi_test_clobber_r9 -.p2align 4 -_abi_test_clobber_r9: - xorq %r9,%r9 - .byte 0xf3,0xc3 - - -.globl _abi_test_clobber_r10 -.private_extern _abi_test_clobber_r10 
-.p2align 4 -_abi_test_clobber_r10: - xorq %r10,%r10 - .byte 0xf3,0xc3 - - -.globl _abi_test_clobber_r11 -.private_extern _abi_test_clobber_r11 -.p2align 4 -_abi_test_clobber_r11: - xorq %r11,%r11 - .byte 0xf3,0xc3 - - -.globl _abi_test_clobber_r12 -.private_extern _abi_test_clobber_r12 -.p2align 4 -_abi_test_clobber_r12: - xorq %r12,%r12 - .byte 0xf3,0xc3 - - -.globl _abi_test_clobber_r13 -.private_extern _abi_test_clobber_r13 -.p2align 4 -_abi_test_clobber_r13: - xorq %r13,%r13 - .byte 0xf3,0xc3 - - -.globl _abi_test_clobber_r14 -.private_extern _abi_test_clobber_r14 -.p2align 4 -_abi_test_clobber_r14: - xorq %r14,%r14 - .byte 0xf3,0xc3 - - -.globl _abi_test_clobber_r15 -.private_extern _abi_test_clobber_r15 -.p2align 4 -_abi_test_clobber_r15: - xorq %r15,%r15 - .byte 0xf3,0xc3 - - -.globl _abi_test_clobber_xmm0 -.private_extern _abi_test_clobber_xmm0 -.p2align 4 -_abi_test_clobber_xmm0: - pxor %xmm0,%xmm0 - .byte 0xf3,0xc3 - - -.globl _abi_test_clobber_xmm1 -.private_extern _abi_test_clobber_xmm1 -.p2align 4 -_abi_test_clobber_xmm1: - pxor %xmm1,%xmm1 - .byte 0xf3,0xc3 - - -.globl _abi_test_clobber_xmm2 -.private_extern _abi_test_clobber_xmm2 -.p2align 4 -_abi_test_clobber_xmm2: - pxor %xmm2,%xmm2 - .byte 0xf3,0xc3 - - -.globl _abi_test_clobber_xmm3 -.private_extern _abi_test_clobber_xmm3 -.p2align 4 -_abi_test_clobber_xmm3: - pxor %xmm3,%xmm3 - .byte 0xf3,0xc3 - - -.globl _abi_test_clobber_xmm4 -.private_extern _abi_test_clobber_xmm4 -.p2align 4 -_abi_test_clobber_xmm4: - pxor %xmm4,%xmm4 - .byte 0xf3,0xc3 - - -.globl _abi_test_clobber_xmm5 -.private_extern _abi_test_clobber_xmm5 -.p2align 4 -_abi_test_clobber_xmm5: - pxor %xmm5,%xmm5 - .byte 0xf3,0xc3 - - -.globl _abi_test_clobber_xmm6 -.private_extern _abi_test_clobber_xmm6 -.p2align 4 -_abi_test_clobber_xmm6: - pxor %xmm6,%xmm6 - .byte 0xf3,0xc3 - - -.globl _abi_test_clobber_xmm7 -.private_extern _abi_test_clobber_xmm7 -.p2align 4 -_abi_test_clobber_xmm7: - pxor %xmm7,%xmm7 - .byte 0xf3,0xc3 - - -.globl 
_abi_test_clobber_xmm8 -.private_extern _abi_test_clobber_xmm8 -.p2align 4 -_abi_test_clobber_xmm8: - pxor %xmm8,%xmm8 - .byte 0xf3,0xc3 - - -.globl _abi_test_clobber_xmm9 -.private_extern _abi_test_clobber_xmm9 -.p2align 4 -_abi_test_clobber_xmm9: - pxor %xmm9,%xmm9 - .byte 0xf3,0xc3 - - -.globl _abi_test_clobber_xmm10 -.private_extern _abi_test_clobber_xmm10 -.p2align 4 -_abi_test_clobber_xmm10: - pxor %xmm10,%xmm10 - .byte 0xf3,0xc3 - - -.globl _abi_test_clobber_xmm11 -.private_extern _abi_test_clobber_xmm11 -.p2align 4 -_abi_test_clobber_xmm11: - pxor %xmm11,%xmm11 - .byte 0xf3,0xc3 - - -.globl _abi_test_clobber_xmm12 -.private_extern _abi_test_clobber_xmm12 -.p2align 4 -_abi_test_clobber_xmm12: - pxor %xmm12,%xmm12 - .byte 0xf3,0xc3 - - -.globl _abi_test_clobber_xmm13 -.private_extern _abi_test_clobber_xmm13 -.p2align 4 -_abi_test_clobber_xmm13: - pxor %xmm13,%xmm13 - .byte 0xf3,0xc3 - - -.globl _abi_test_clobber_xmm14 -.private_extern _abi_test_clobber_xmm14 -.p2align 4 -_abi_test_clobber_xmm14: - pxor %xmm14,%xmm14 - .byte 0xf3,0xc3 - - -.globl _abi_test_clobber_xmm15 -.private_extern _abi_test_clobber_xmm15 -.p2align 4 -_abi_test_clobber_xmm15: - pxor %xmm15,%xmm15 - .byte 0xf3,0xc3 - - - - - -.globl _abi_test_bad_unwind_wrong_register -.private_extern _abi_test_bad_unwind_wrong_register -.p2align 4 -_abi_test_bad_unwind_wrong_register: - -L$abi_test_bad_unwind_wrong_register_seh_begin: - pushq %r12 - -L$abi_test_bad_unwind_wrong_register_seh_push_r13: - - - - nop - popq %r12 - - .byte 0xf3,0xc3 -L$abi_test_bad_unwind_wrong_register_seh_end: - - - - - - - -.globl _abi_test_bad_unwind_temporary -.private_extern _abi_test_bad_unwind_temporary -.p2align 4 -_abi_test_bad_unwind_temporary: - -L$abi_test_bad_unwind_temporary_seh_begin: - pushq %r12 - -L$abi_test_bad_unwind_temporary_seh_push_r12: - - movq %r12,%rax - incq %rax - movq %rax,(%rsp) - - - - movq %r12,(%rsp) - - - popq %r12 - - .byte 0xf3,0xc3 -L$abi_test_bad_unwind_temporary_seh_end: - - - - - - - 
-.globl _abi_test_get_and_clear_direction_flag -.private_extern _abi_test_get_and_clear_direction_flag -_abi_test_get_and_clear_direction_flag: - pushfq - popq %rax - andq $0x400,%rax - shrq $10,%rax - cld - .byte 0xf3,0xc3 - - - - - -.globl _abi_test_set_direction_flag -.private_extern _abi_test_set_direction_flag -_abi_test_set_direction_flag: - std - .byte 0xf3,0xc3 - -#endif diff --git a/third_party/boringssl/src/crypto/curve25519/asm/x25519-asm-arm.S b/third_party/boringssl/crypto/curve25519/asm/x25519-asm-arm.S similarity index 97% rename from third_party/boringssl/src/crypto/curve25519/asm/x25519-asm-arm.S rename to third_party/boringssl/crypto/curve25519/asm/x25519-asm-arm.S index 41bc0c6e..5ce26cbe 100644 --- a/third_party/boringssl/src/crypto/curve25519/asm/x25519-asm-arm.S +++ b/third_party/boringssl/crypto/curve25519/asm/x25519-asm-arm.S @@ -1,33 +1,25 @@ -/* Copyright (c) 2015, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ +// Copyright 2015 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. /* This file is taken from crypto_scalarmult/curve25519/neon2/scalarmult.s in * SUPERCOP 20141124 (http://bench.cr.yp.to/supercop.html). That code is public - * domain licensed but the standard ISC license is included above to keep + * domain licensed but the standard Apache 2.0 license is included above to keep * licensing simple. */ -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif +#include -#if !defined(OPENSSL_NO_ASM) && defined(__arm__) && !defined(__APPLE__) - -#if defined(BORINGSSL_PREFIX) -#include -#endif +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__) .fpu neon .text @@ -2129,8 +2121,4 @@ mov sp,r12 vpop {q4,q5,q6,q7} bx lr -#endif /* !OPENSSL_NO_ASM && __arm__ && !__APPLE__ */ - -#if defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif +#endif /* !OPENSSL_NO_ASM && OPENSSL_ARM && __ELF__ */ diff --git a/third_party/boringssl/src/crypto/hrss/asm/poly_rq_mul.S b/third_party/boringssl/crypto/hrss/asm/poly_rq_mul.S similarity index 99% rename from third_party/boringssl/src/crypto/hrss/asm/poly_rq_mul.S rename to third_party/boringssl/crypto/hrss/asm/poly_rq_mul.S index c37d7d0b..abbc4e3f 100644 --- a/third_party/boringssl/src/crypto/hrss/asm/poly_rq_mul.S +++ b/third_party/boringssl/crypto/hrss/asm/poly_rq_mul.S @@ -1,22 +1,20 @@ // Copyright (c) 2017, the HRSS authors. 
// -// Permission to use, copy, modify, and/or distribute this software for any -// purpose with or without fee is hereby granted, provided that the above -// copyright notice and this permission notice appear in all copies. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at // -// THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES -// WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF -// MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY -// SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES -// WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION -// OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN -// CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. -#if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_SMALL) && defined(__linux__) +#include -#if defined(BORINGSSL_PREFIX) -#include -#endif +#if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_SMALL) && defined(OPENSSL_LINUX) && defined(OPENSSL_X86_64) // This is the polynomial multiplication function from [HRSS], provided by kind // permission of the authors. 
@@ -303,6 +301,7 @@ mask_mod8192: .att_syntax prefix poly_Rq_mul: .cfi_startproc +_CET_ENDBR push %rbp .cfi_adjust_cfa_offset 8 .cfi_offset rbp, -16 @@ -8476,6 +8475,7 @@ vmovdqu 1320(%rdi), %ymm11 vpaddw 2752(%r8), %ymm11, %ymm11 vpand mask_mod8192(%rip), %ymm11, %ymm11 vmovdqu %ymm11, 1320(%rdi) +vzeroupper pop %r12 .cfi_restore r12 pop %rbp @@ -8487,7 +8487,3 @@ ret .size poly_Rq_mul,.-poly_Rq_mul #endif - -#if defined(__ELF__) -.section .note.GNU-stack,"",@progbits -#endif diff --git a/third_party/boringssl/src/crypto/poly1305/poly1305_arm_asm.S b/third_party/boringssl/crypto/poly1305/poly1305_arm_asm.S similarity index 99% rename from third_party/boringssl/src/crypto/poly1305/poly1305_arm_asm.S rename to third_party/boringssl/crypto/poly1305/poly1305_arm_asm.S index 80a4b31f..619984e2 100644 --- a/third_party/boringssl/src/crypto/poly1305/poly1305_arm_asm.S +++ b/third_party/boringssl/crypto/poly1305/poly1305_arm_asm.S @@ -1,14 +1,6 @@ -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif +#include -#if defined(__arm__) && !defined(OPENSSL_NO_ASM) && !defined(__APPLE__) - -#if defined(BORINGSSL_PREFIX) -#include -#endif +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__) # This implementation was taken from the public domain, neon2 version in # SUPERCOP by D. J. Bernstein and Peter Schwabe. @@ -2022,8 +2014,4 @@ vst1.8 d4,[r0,: 64] add sp,sp,#0 bx lr -#endif /* __arm__ && !OPENSSL_NO_ASM && !__APPLE__ */ - -#if defined(__ELF__) -.section .note.GNU-stack,"",%progbits -#endif +#endif /* !OPENSSL_NO_ASM && OPENSSL_ARM && __ELF__ */ diff --git a/third_party/boringssl/err_data.c b/third_party/boringssl/err_data.c deleted file mode 100644 index 7991be6f..00000000 --- a/third_party/boringssl/err_data.c +++ /dev/null @@ -1,1505 +0,0 @@ -/* Copyright (c) 2015, Google Inc. 
- * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - - /* This file was generated by err_data_generate.go. */ - -#include -#include - -#include - -static_assert(ERR_LIB_NONE == 1, "library value changed"); -static_assert(ERR_LIB_SYS == 2, "library value changed"); -static_assert(ERR_LIB_BN == 3, "library value changed"); -static_assert(ERR_LIB_RSA == 4, "library value changed"); -static_assert(ERR_LIB_DH == 5, "library value changed"); -static_assert(ERR_LIB_EVP == 6, "library value changed"); -static_assert(ERR_LIB_BUF == 7, "library value changed"); -static_assert(ERR_LIB_OBJ == 8, "library value changed"); -static_assert(ERR_LIB_PEM == 9, "library value changed"); -static_assert(ERR_LIB_DSA == 10, "library value changed"); -static_assert(ERR_LIB_X509 == 11, "library value changed"); -static_assert(ERR_LIB_ASN1 == 12, "library value changed"); -static_assert(ERR_LIB_CONF == 13, "library value changed"); -static_assert(ERR_LIB_CRYPTO == 14, "library value changed"); -static_assert(ERR_LIB_EC == 15, "library value changed"); -static_assert(ERR_LIB_SSL == 16, "library value changed"); -static_assert(ERR_LIB_BIO == 17, "library value changed"); -static_assert(ERR_LIB_PKCS7 == 18, "library value changed"); -static_assert(ERR_LIB_PKCS8 == 19, "library value changed"); 
-static_assert(ERR_LIB_X509V3 == 20, "library value changed"); -static_assert(ERR_LIB_RAND == 21, "library value changed"); -static_assert(ERR_LIB_ENGINE == 22, "library value changed"); -static_assert(ERR_LIB_OCSP == 23, "library value changed"); -static_assert(ERR_LIB_UI == 24, "library value changed"); -static_assert(ERR_LIB_COMP == 25, "library value changed"); -static_assert(ERR_LIB_ECDSA == 26, "library value changed"); -static_assert(ERR_LIB_ECDH == 27, "library value changed"); -static_assert(ERR_LIB_HMAC == 28, "library value changed"); -static_assert(ERR_LIB_DIGEST == 29, "library value changed"); -static_assert(ERR_LIB_CIPHER == 30, "library value changed"); -static_assert(ERR_LIB_HKDF == 31, "library value changed"); -static_assert(ERR_LIB_TRUST_TOKEN == 32, "library value changed"); -static_assert(ERR_LIB_USER == 33, "library value changed"); -static_assert(ERR_NUM_LIBS == 34, "number of libraries changed"); - -const uint32_t kOpenSSLReasonValues[] = { - 0xc320885, - 0xc32889f, - 0xc3308ae, - 0xc3388be, - 0xc3408cd, - 0xc3488e6, - 0xc3508f2, - 0xc35890f, - 0xc36092f, - 0xc36893d, - 0xc37094d, - 0xc37895a, - 0xc38096a, - 0xc388975, - 0xc39098b, - 0xc39899a, - 0xc3a09ae, - 0xc3a8892, - 0xc3b00f7, - 0xc3b8921, - 0x10320892, - 0x10329620, - 0x1033162c, - 0x10339645, - 0x10341658, - 0x10348f72, - 0x10350cab, - 0x1035966b, - 0x10361695, - 0x103696a8, - 0x103716c7, - 0x103796e0, - 0x103816f5, - 0x10389713, - 0x10391722, - 0x1039973e, - 0x103a1759, - 0x103a9768, - 0x103b1784, - 0x103b979f, - 0x103c17c5, - 0x103c80f7, - 0x103d17d6, - 0x103d97ea, - 0x103e1809, - 0x103e9818, - 0x103f182f, - 0x103f9842, - 0x10400c6f, - 0x10409855, - 0x10411873, - 0x10419886, - 0x104218a0, - 0x104298b0, - 0x104318c4, - 0x104398da, - 0x104418f2, - 0x10449907, - 0x1045191b, - 0x1045992d, - 0x10460635, - 0x1046899a, - 0x10471942, - 0x10479959, - 0x1048196e, - 0x1048997c, - 0x10490ebe, - 0x104997b6, - 0x104a1680, - 0x14320c52, - 0x14328c60, - 0x14330c6f, - 0x14338c81, - 0x143400b9, - 
0x143480f7, - 0x18320090, - 0x18328fc8, - 0x183300b9, - 0x18338fde, - 0x18340ff2, - 0x183480f7, - 0x18351011, - 0x18359029, - 0x1836103e, - 0x18369052, - 0x1837108a, - 0x183790a0, - 0x183810b4, - 0x183890c4, - 0x18390ac0, - 0x183990d4, - 0x183a10fa, - 0x183a9120, - 0x183b0cca, - 0x183b916f, - 0x183c1181, - 0x183c918c, - 0x183d119c, - 0x183d91ad, - 0x183e11be, - 0x183e91d0, - 0x183f11f9, - 0x183f9212, - 0x1840122a, - 0x1840870d, - 0x18411143, - 0x1841910e, - 0x1842112d, - 0x18428cb7, - 0x184310e9, - 0x18439155, - 0x18441007, - 0x18449076, - 0x20321264, - 0x20329251, - 0x24321270, - 0x243289e0, - 0x24331282, - 0x2433928f, - 0x2434129c, - 0x243492ae, - 0x243512bd, - 0x243592da, - 0x243612e7, - 0x243692f5, - 0x24371303, - 0x24379311, - 0x2438131a, - 0x24389327, - 0x2439133a, - 0x28320c9f, - 0x28328cca, - 0x28330c6f, - 0x28338cdd, - 0x28340cab, - 0x283480b9, - 0x283500f7, - 0x28358cb7, - 0x2c3232bf, - 0x2c329351, - 0x2c3332cd, - 0x2c33b2df, - 0x2c3432f3, - 0x2c34b305, - 0x2c353320, - 0x2c35b332, - 0x2c363362, - 0x2c36833a, - 0x2c37336f, - 0x2c37b39b, - 0x2c3833c0, - 0x2c38b3d7, - 0x2c3933f5, - 0x2c39b405, - 0x2c3a3417, - 0x2c3ab42b, - 0x2c3b343c, - 0x2c3bb45b, - 0x2c3c1363, - 0x2c3c9379, - 0x2c3d34a0, - 0x2c3d9392, - 0x2c3e34ca, - 0x2c3eb4d8, - 0x2c3f34f0, - 0x2c3fb508, - 0x2c403532, - 0x2c409264, - 0x2c413543, - 0x2c41b556, - 0x2c42122a, - 0x2c42b567, - 0x2c43076d, - 0x2c43b44d, - 0x2c4433ae, - 0x2c44b515, - 0x2c453345, - 0x2c45b381, - 0x2c4633e5, - 0x2c46b46f, - 0x2c473484, - 0x2c47b4bd, - 0x30320000, - 0x30328015, - 0x3033001f, - 0x30338038, - 0x30340057, - 0x30348071, - 0x30350078, - 0x30358090, - 0x303600a1, - 0x303680b9, - 0x303700c6, - 0x303780d5, - 0x303800f7, - 0x30388104, - 0x30390117, - 0x30398132, - 0x303a0147, - 0x303a815b, - 0x303b016f, - 0x303b8180, - 0x303c0199, - 0x303c81b6, - 0x303d01c4, - 0x303d81d8, - 0x303e01e8, - 0x303e8201, - 0x303f0211, - 0x303f8224, - 0x30400233, - 0x3040823f, - 0x30410254, - 0x30418264, - 0x3042027b, - 0x30428288, - 0x3043029b, 
- 0x304382aa, - 0x304402bf, - 0x304482e0, - 0x304502f3, - 0x30458306, - 0x3046031f, - 0x3046833a, - 0x30470372, - 0x30478384, - 0x304803a2, - 0x304883b3, - 0x304903c2, - 0x304983da, - 0x304a03ec, - 0x304a8400, - 0x304b0418, - 0x304b842b, - 0x304c0436, - 0x304c8447, - 0x304d0453, - 0x304d8469, - 0x304e0477, - 0x304e848d, - 0x304f049f, - 0x304f84b1, - 0x305004d4, - 0x305084e7, - 0x305104f8, - 0x30518508, - 0x30520520, - 0x30528535, - 0x3053054d, - 0x30538561, - 0x30540579, - 0x30548592, - 0x305505ab, - 0x305585c8, - 0x305605d3, - 0x305685eb, - 0x305705fb, - 0x3057860c, - 0x3058061f, - 0x30588635, - 0x3059063e, - 0x30598653, - 0x305a0666, - 0x305a8675, - 0x305b0695, - 0x305b86a4, - 0x305c06c5, - 0x305c86e1, - 0x305d06ed, - 0x305d870d, - 0x305e0729, - 0x305e874d, - 0x305f0763, - 0x305f876d, - 0x306004c4, - 0x3060804a, - 0x30610357, - 0x3061873a, - 0x30620392, - 0x34320bb0, - 0x34328bc4, - 0x34330be1, - 0x34338bf4, - 0x34340c03, - 0x34348c3c, - 0x34350c20, - 0x3c320090, - 0x3c328d07, - 0x3c330d20, - 0x3c338d3b, - 0x3c340d58, - 0x3c348d82, - 0x3c350d9d, - 0x3c358dc3, - 0x3c360ddc, - 0x3c368df4, - 0x3c370e05, - 0x3c378e13, - 0x3c380e20, - 0x3c388e34, - 0x3c390cca, - 0x3c398e57, - 0x3c3a0e6b, - 0x3c3a895a, - 0x3c3b0e7b, - 0x3c3b8e96, - 0x3c3c0ea8, - 0x3c3c8edb, - 0x3c3d0ee5, - 0x3c3d8ef9, - 0x3c3e0f07, - 0x3c3e8f2c, - 0x3c3f0cf3, - 0x3c3f8f15, - 0x3c4000b9, - 0x3c4080f7, - 0x3c410d73, - 0x3c418db2, - 0x3c420ebe, - 0x3c428e48, - 0x40321a0e, - 0x40329a24, - 0x40331a52, - 0x40339a5c, - 0x40341a73, - 0x40349a91, - 0x40351aa1, - 0x40359ab3, - 0x40361ac0, - 0x40369acc, - 0x40371ae1, - 0x40379af3, - 0x40381afe, - 0x40389b10, - 0x40390f72, - 0x40399b20, - 0x403a1b33, - 0x403a9b54, - 0x403b1b65, - 0x403b9b75, - 0x403c0071, - 0x403c8090, - 0x403d1bd6, - 0x403d9bec, - 0x403e1bfb, - 0x403e9c33, - 0x403f1c4d, - 0x403f9c75, - 0x40401c8a, - 0x40409c9e, - 0x40411cd9, - 0x40419cf4, - 0x40421d0d, - 0x40429d20, - 0x40431d34, - 0x40439d62, - 0x40441d79, - 0x404480b9, - 0x40451d8e, - 
0x40459da0, - 0x40461dc4, - 0x40469de4, - 0x40471df2, - 0x40479e19, - 0x40481e8a, - 0x40489f44, - 0x40491f5b, - 0x40499f75, - 0x404a1f8c, - 0x404a9faa, - 0x404b1fc2, - 0x404b9fef, - 0x404c2005, - 0x404ca017, - 0x404d2038, - 0x404da071, - 0x404e2085, - 0x404ea092, - 0x404f212c, - 0x404fa1a2, - 0x40502211, - 0x4050a225, - 0x40512258, - 0x40522268, - 0x4052a28c, - 0x405322a4, - 0x4053a2b7, - 0x405422cc, - 0x4054a2ef, - 0x4055231a, - 0x4055a357, - 0x4056237c, - 0x4056a395, - 0x405723ad, - 0x4057a3c0, - 0x405823d5, - 0x4058a3fc, - 0x4059242b, - 0x4059a458, - 0x405a246c, - 0x405aa47c, - 0x405b2494, - 0x405ba4a5, - 0x405c24b8, - 0x405ca4f7, - 0x405d2504, - 0x405da529, - 0x405e2567, - 0x405e8afe, - 0x405f2588, - 0x405fa595, - 0x406025a3, - 0x4060a5c5, - 0x40612626, - 0x4061a65e, - 0x40622675, - 0x4062a686, - 0x406326d3, - 0x4063a6e8, - 0x406426ff, - 0x4064a72b, - 0x40652746, - 0x4065a75d, - 0x40662775, - 0x4066a79f, - 0x406727ca, - 0x4067a80f, - 0x40682857, - 0x4068a878, - 0x406928aa, - 0x4069a8d8, - 0x406a28f9, - 0x406aa919, - 0x406b2aa1, - 0x406baac4, - 0x406c2ada, - 0x406cade4, - 0x406d2e13, - 0x406dae3b, - 0x406e2e69, - 0x406eaeb6, - 0x406f2f0f, - 0x406faf47, - 0x40702f5a, - 0x4070af77, - 0x4071084d, - 0x4071af89, - 0x40722f9c, - 0x4072afd2, - 0x40732fea, - 0x4073957b, - 0x40742ffe, - 0x4074b018, - 0x40753029, - 0x4075b03d, - 0x4076304b, - 0x40769327, - 0x40773070, - 0x4077b0b0, - 0x407830cb, - 0x4078b104, - 0x4079311b, - 0x4079b131, - 0x407a315d, - 0x407ab170, - 0x407b3185, - 0x407bb197, - 0x407c31c8, - 0x407cb1d1, - 0x407d2893, - 0x407da1ca, - 0x407e30e0, - 0x407ea40c, - 0x407f1e06, - 0x407f9fd9, - 0x4080213c, - 0x40809e2e, - 0x4081227a, - 0x4081a0e0, - 0x40822e54, - 0x40829b81, - 0x408323e7, - 0x4083a710, - 0x40841e42, - 0x4084a444, - 0x408524c9, - 0x4085a5ed, - 0x40862549, - 0x4086a1e4, - 0x40872e9a, - 0x4087a63b, - 0x40881bbf, - 0x4088a822, - 0x40891c0e, - 0x40899b9b, - 0x408a2b12, - 0x408a9993, - 0x408b31ac, - 0x408baf24, - 0x408c24d9, - 0x408c99cb, - 0x408d1f2a, 
- 0x408d9e74, - 0x408e205a, - 0x408ea337, - 0x408f2836, - 0x408fa609, - 0x409027eb, - 0x4090a51b, - 0x40912afa, - 0x409199f1, - 0x40921c5b, - 0x4092aed5, - 0x40932fb5, - 0x4093a1f5, - 0x40941e56, - 0x4094ab2b, - 0x40952697, - 0x4095b13d, - 0x40962e81, - 0x4096a155, - 0x40972240, - 0x4097a0a9, - 0x40981cbb, - 0x4098a6ab, - 0x40992ef1, - 0x4099a364, - 0x409a22fd, - 0x409a99af, - 0x409b1eb0, - 0x409b9edb, - 0x409c3092, - 0x409c9f03, - 0x409d2111, - 0x409da0f6, - 0x409e1d4c, - 0x409ea18a, - 0x409f2172, - 0x409f9ea3, - 0x40a021b2, - 0x40a0a0c3, - 0x41f429cc, - 0x41f92a5e, - 0x41fe2951, - 0x41feac07, - 0x41ff2d35, - 0x420329e5, - 0x42082a07, - 0x4208aa43, - 0x42092935, - 0x4209aa7d, - 0x420a298c, - 0x420aa96c, - 0x420b29ac, - 0x420baa25, - 0x420c2d51, - 0x420cab3b, - 0x420d2bee, - 0x420dac25, - 0x42122c58, - 0x42172d18, - 0x4217ac9a, - 0x421c2cbc, - 0x421f2c77, - 0x42212dc9, - 0x42262cfb, - 0x422b2da7, - 0x422babc9, - 0x422c2d89, - 0x422cab7c, - 0x422d2b55, - 0x422dad68, - 0x422e2ba8, - 0x42302cd7, - 0x4230ac3f, - 0x44320778, - 0x44328787, - 0x44330793, - 0x443387a1, - 0x443407b4, - 0x443487c5, - 0x443507cc, - 0x443587d6, - 0x443607e9, - 0x443687ff, - 0x44370811, - 0x4437881e, - 0x4438082d, - 0x44388835, - 0x4439084d, - 0x4439885b, - 0x443a086e, - 0x48321351, - 0x48329363, - 0x48331379, - 0x48339392, - 0x4c3213cf, - 0x4c3293df, - 0x4c3313f2, - 0x4c339412, - 0x4c3400b9, - 0x4c3480f7, - 0x4c35141e, - 0x4c35942c, - 0x4c361448, - 0x4c36946e, - 0x4c37147d, - 0x4c37948b, - 0x4c3814a0, - 0x4c3894ac, - 0x4c3914cc, - 0x4c3994f6, - 0x4c3a150f, - 0x4c3a9528, - 0x4c3b0635, - 0x4c3b9541, - 0x4c3c1553, - 0x4c3c9562, - 0x4c3d157b, - 0x4c3d8c92, - 0x4c3e15e8, - 0x4c3e958a, - 0x4c3f160a, - 0x4c3f9327, - 0x4c4015a0, - 0x4c4093bb, - 0x4c4115d8, - 0x4c41945b, - 0x4c4215c4, - 0x4c4293a3, - 0x50323579, - 0x5032b588, - 0x50333593, - 0x5033b5a3, - 0x503435bc, - 0x5034b5d6, - 0x503535e4, - 0x5035b5fa, - 0x5036360c, - 0x5036b622, - 0x5037363b, - 0x5037b64e, - 0x50383666, - 0x5038b677, - 
0x5039368c, - 0x5039b6a0, - 0x503a36c0, - 0x503ab6d6, - 0x503b36ee, - 0x503bb700, - 0x503c371c, - 0x503cb733, - 0x503d374c, - 0x503db762, - 0x503e376f, - 0x503eb785, - 0x503f3797, - 0x503f83b3, - 0x504037aa, - 0x5040b7ba, - 0x504137d4, - 0x5041b7e3, - 0x504237fd, - 0x5042b81a, - 0x5043382a, - 0x5043b83a, - 0x50443857, - 0x50448469, - 0x5045386b, - 0x5045b889, - 0x5046389c, - 0x5046b8b2, - 0x504738c4, - 0x5047b8d9, - 0x504838ff, - 0x5048b90d, - 0x50493920, - 0x5049b935, - 0x504a394b, - 0x504ab95b, - 0x504b397b, - 0x504bb98e, - 0x504c39b1, - 0x504cb9df, - 0x504d3a0c, - 0x504dba29, - 0x504e3a44, - 0x504eba60, - 0x504f3a72, - 0x504fba89, - 0x50503a98, - 0x50508729, - 0x50513aab, - 0x5051b849, - 0x505239f1, - 0x58320fb0, - 0x68320f72, - 0x68328cca, - 0x68330cdd, - 0x68338f80, - 0x68340f90, - 0x683480f7, - 0x6c320f38, - 0x6c328c81, - 0x6c330f43, - 0x6c338f5c, - 0x74320a66, - 0x743280b9, - 0x74330c92, - 0x783209cb, - 0x783289e0, - 0x783309ec, - 0x78338090, - 0x783409fb, - 0x78348a10, - 0x78350a2f, - 0x78358a51, - 0x78360a66, - 0x78368a7c, - 0x78370a8c, - 0x78378aad, - 0x78380ac0, - 0x78388ad2, - 0x78390adf, - 0x78398afe, - 0x783a0b13, - 0x783a8b21, - 0x783b0b2b, - 0x783b8b3f, - 0x783c0b56, - 0x783c8b6b, - 0x783d0b82, - 0x783d8b97, - 0x783e0aed, - 0x783e8a9f, - 0x7c321240, - 0x8032146e, - 0x80328090, - 0x8033328e, - 0x803380b9, - 0x8034329d, - 0x8034b205, - 0x80353223, - 0x8035b2b1, - 0x80363265, - 0x8036b214, - 0x80373257, - 0x8037b1f2, - 0x80383278, - 0x8038b234, - 0x80393249, -}; - -const size_t kOpenSSLReasonValuesLen = sizeof(kOpenSSLReasonValues) / sizeof(kOpenSSLReasonValues[0]); - -const char kOpenSSLReasonStringData[] = - "ASN1_LENGTH_MISMATCH\0" - "AUX_ERROR\0" - "BAD_GET_ASN1_OBJECT_CALL\0" - "BAD_OBJECT_HEADER\0" - "BAD_TEMPLATE\0" - "BMPSTRING_IS_WRONG_LENGTH\0" - "BN_LIB\0" - "BOOLEAN_IS_WRONG_LENGTH\0" - "BUFFER_TOO_SMALL\0" - "CONTEXT_NOT_INITIALISED\0" - "DECODE_ERROR\0" - "DEPTH_EXCEEDED\0" - "DIGEST_AND_KEY_TYPE_NOT_SUPPORTED\0" - "ENCODE_ERROR\0" - 
"ERROR_GETTING_TIME\0" - "EXPECTING_AN_ASN1_SEQUENCE\0" - "EXPECTING_AN_INTEGER\0" - "EXPECTING_AN_OBJECT\0" - "EXPECTING_A_BOOLEAN\0" - "EXPECTING_A_TIME\0" - "EXPLICIT_LENGTH_MISMATCH\0" - "EXPLICIT_TAG_NOT_CONSTRUCTED\0" - "FIELD_MISSING\0" - "FIRST_NUM_TOO_LARGE\0" - "HEADER_TOO_LONG\0" - "ILLEGAL_BITSTRING_FORMAT\0" - "ILLEGAL_BOOLEAN\0" - "ILLEGAL_CHARACTERS\0" - "ILLEGAL_FORMAT\0" - "ILLEGAL_HEX\0" - "ILLEGAL_IMPLICIT_TAG\0" - "ILLEGAL_INTEGER\0" - "ILLEGAL_NESTED_TAGGING\0" - "ILLEGAL_NULL\0" - "ILLEGAL_NULL_VALUE\0" - "ILLEGAL_OBJECT\0" - "ILLEGAL_OPTIONAL_ANY\0" - "ILLEGAL_OPTIONS_ON_ITEM_TEMPLATE\0" - "ILLEGAL_TAGGED_ANY\0" - "ILLEGAL_TIME_VALUE\0" - "INTEGER_NOT_ASCII_FORMAT\0" - "INTEGER_TOO_LARGE_FOR_LONG\0" - "INVALID_BIT_STRING_BITS_LEFT\0" - "INVALID_BIT_STRING_PADDING\0" - "INVALID_BMPSTRING\0" - "INVALID_DIGIT\0" - "INVALID_INTEGER\0" - "INVALID_MODIFIER\0" - "INVALID_NUMBER\0" - "INVALID_OBJECT_ENCODING\0" - "INVALID_SEPARATOR\0" - "INVALID_TIME_FORMAT\0" - "INVALID_UNIVERSALSTRING\0" - "INVALID_UTF8STRING\0" - "LIST_ERROR\0" - "MISSING_ASN1_EOS\0" - "MISSING_EOC\0" - "MISSING_SECOND_NUMBER\0" - "MISSING_VALUE\0" - "MSTRING_NOT_UNIVERSAL\0" - "MSTRING_WRONG_TAG\0" - "NESTED_ASN1_ERROR\0" - "NESTED_ASN1_STRING\0" - "NESTED_TOO_DEEP\0" - "NON_HEX_CHARACTERS\0" - "NOT_ASCII_FORMAT\0" - "NOT_ENOUGH_DATA\0" - "NO_MATCHING_CHOICE_TYPE\0" - "NULL_IS_WRONG_LENGTH\0" - "OBJECT_NOT_ASCII_FORMAT\0" - "ODD_NUMBER_OF_CHARS\0" - "SECOND_NUMBER_TOO_LARGE\0" - "SEQUENCE_LENGTH_MISMATCH\0" - "SEQUENCE_NOT_CONSTRUCTED\0" - "SEQUENCE_OR_SET_NEEDS_CONFIG\0" - "SHORT_LINE\0" - "STREAMING_NOT_SUPPORTED\0" - "STRING_TOO_LONG\0" - "STRING_TOO_SHORT\0" - "TAG_VALUE_TOO_HIGH\0" - "TIME_NOT_ASCII_FORMAT\0" - "TOO_LONG\0" - "TYPE_NOT_CONSTRUCTED\0" - "TYPE_NOT_PRIMITIVE\0" - "UNEXPECTED_EOC\0" - "UNIVERSALSTRING_IS_WRONG_LENGTH\0" - "UNKNOWN_FORMAT\0" - "UNKNOWN_MESSAGE_DIGEST_ALGORITHM\0" - "UNKNOWN_SIGNATURE_ALGORITHM\0" - "UNKNOWN_TAG\0" - 
"UNSUPPORTED_ANY_DEFINED_BY_TYPE\0" - "UNSUPPORTED_PUBLIC_KEY_TYPE\0" - "UNSUPPORTED_TYPE\0" - "WRONG_INTEGER_TYPE\0" - "WRONG_PUBLIC_KEY_TYPE\0" - "WRONG_TAG\0" - "WRONG_TYPE\0" - "BAD_FOPEN_MODE\0" - "BROKEN_PIPE\0" - "CONNECT_ERROR\0" - "ERROR_SETTING_NBIO\0" - "INVALID_ARGUMENT\0" - "IN_USE\0" - "KEEPALIVE\0" - "NBIO_CONNECT_ERROR\0" - "NO_HOSTNAME_SPECIFIED\0" - "NO_PORT_SPECIFIED\0" - "NO_SUCH_FILE\0" - "NULL_PARAMETER\0" - "SYS_LIB\0" - "UNABLE_TO_CREATE_SOCKET\0" - "UNINITIALIZED\0" - "UNSUPPORTED_METHOD\0" - "WRITE_TO_READ_ONLY_BIO\0" - "ARG2_LT_ARG3\0" - "BAD_ENCODING\0" - "BAD_RECIPROCAL\0" - "BIGNUM_TOO_LONG\0" - "BITS_TOO_SMALL\0" - "CALLED_WITH_EVEN_MODULUS\0" - "DIV_BY_ZERO\0" - "EXPAND_ON_STATIC_BIGNUM_DATA\0" - "INPUT_NOT_REDUCED\0" - "INVALID_INPUT\0" - "INVALID_RANGE\0" - "NEGATIVE_NUMBER\0" - "NOT_A_SQUARE\0" - "NOT_INITIALIZED\0" - "NO_INVERSE\0" - "PRIVATE_KEY_TOO_LARGE\0" - "P_IS_NOT_PRIME\0" - "TOO_MANY_ITERATIONS\0" - "TOO_MANY_TEMPORARY_VARIABLES\0" - "AES_KEY_SETUP_FAILED\0" - "BAD_DECRYPT\0" - "BAD_KEY_LENGTH\0" - "CTRL_NOT_IMPLEMENTED\0" - "CTRL_OPERATION_NOT_IMPLEMENTED\0" - "DATA_NOT_MULTIPLE_OF_BLOCK_LENGTH\0" - "INITIALIZATION_ERROR\0" - "INPUT_NOT_INITIALIZED\0" - "INVALID_AD_SIZE\0" - "INVALID_KEY_LENGTH\0" - "INVALID_NONCE\0" - "INVALID_NONCE_SIZE\0" - "INVALID_OPERATION\0" - "IV_TOO_LARGE\0" - "NO_CIPHER_SET\0" - "NO_DIRECTION_SET\0" - "OUTPUT_ALIASES_INPUT\0" - "TAG_TOO_LARGE\0" - "TOO_LARGE\0" - "UNSUPPORTED_AD_SIZE\0" - "UNSUPPORTED_INPUT_SIZE\0" - "UNSUPPORTED_KEY_SIZE\0" - "UNSUPPORTED_NONCE_SIZE\0" - "UNSUPPORTED_TAG_SIZE\0" - "WRONG_FINAL_BLOCK_LENGTH\0" - "LIST_CANNOT_BE_NULL\0" - "MISSING_CLOSE_SQUARE_BRACKET\0" - "MISSING_EQUAL_SIGN\0" - "NO_CLOSE_BRACE\0" - "UNABLE_TO_CREATE_NEW_SECTION\0" - "VARIABLE_EXPANSION_TOO_LONG\0" - "VARIABLE_HAS_NO_VALUE\0" - "BAD_GENERATOR\0" - "INVALID_PUBKEY\0" - "MODULUS_TOO_LARGE\0" - "NO_PRIVATE_VALUE\0" - "UNKNOWN_HASH\0" - "BAD_Q_VALUE\0" - "BAD_VERSION\0" - "INVALID_PARAMETERS\0" - 
"MISSING_PARAMETERS\0" - "NEED_NEW_SETUP_VALUES\0" - "BIGNUM_OUT_OF_RANGE\0" - "COORDINATES_OUT_OF_RANGE\0" - "D2I_ECPKPARAMETERS_FAILURE\0" - "EC_GROUP_NEW_BY_NAME_FAILURE\0" - "GROUP2PKPARAMETERS_FAILURE\0" - "GROUP_MISMATCH\0" - "I2D_ECPKPARAMETERS_FAILURE\0" - "INCOMPATIBLE_OBJECTS\0" - "INVALID_COFACTOR\0" - "INVALID_COMPRESSED_POINT\0" - "INVALID_COMPRESSION_BIT\0" - "INVALID_ENCODING\0" - "INVALID_FIELD\0" - "INVALID_FORM\0" - "INVALID_GROUP_ORDER\0" - "INVALID_PRIVATE_KEY\0" - "INVALID_SCALAR\0" - "MISSING_PRIVATE_KEY\0" - "NON_NAMED_CURVE\0" - "PKPARAMETERS2GROUP_FAILURE\0" - "POINT_AT_INFINITY\0" - "POINT_IS_NOT_ON_CURVE\0" - "PUBLIC_KEY_VALIDATION_FAILED\0" - "SLOT_FULL\0" - "UNDEFINED_GENERATOR\0" - "UNKNOWN_GROUP\0" - "UNKNOWN_ORDER\0" - "WRONG_CURVE_PARAMETERS\0" - "WRONG_ORDER\0" - "KDF_FAILED\0" - "POINT_ARITHMETIC_FAILURE\0" - "UNKNOWN_DIGEST_LENGTH\0" - "BAD_SIGNATURE\0" - "NOT_IMPLEMENTED\0" - "RANDOM_NUMBER_GENERATION_FAILED\0" - "OPERATION_NOT_SUPPORTED\0" - "COMMAND_NOT_SUPPORTED\0" - "DIFFERENT_KEY_TYPES\0" - "DIFFERENT_PARAMETERS\0" - "EMPTY_PSK\0" - "EXPECTING_AN_EC_KEY_KEY\0" - "EXPECTING_AN_RSA_KEY\0" - "EXPECTING_A_DSA_KEY\0" - "ILLEGAL_OR_UNSUPPORTED_PADDING_MODE\0" - "INVALID_BUFFER_SIZE\0" - "INVALID_DIGEST_LENGTH\0" - "INVALID_DIGEST_TYPE\0" - "INVALID_KEYBITS\0" - "INVALID_MGF1_MD\0" - "INVALID_PADDING_MODE\0" - "INVALID_PEER_KEY\0" - "INVALID_PSS_SALTLEN\0" - "INVALID_SIGNATURE\0" - "KEYS_NOT_SET\0" - "MEMORY_LIMIT_EXCEEDED\0" - "NOT_A_PRIVATE_KEY\0" - "NOT_XOF_OR_INVALID_LENGTH\0" - "NO_DEFAULT_DIGEST\0" - "NO_KEY_SET\0" - "NO_MDC2_SUPPORT\0" - "NO_NID_FOR_CURVE\0" - "NO_OPERATION_SET\0" - "NO_PARAMETERS_SET\0" - "OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE\0" - "OPERATON_NOT_INITIALIZED\0" - "UNKNOWN_PUBLIC_KEY_TYPE\0" - "UNSUPPORTED_ALGORITHM\0" - "OUTPUT_TOO_LARGE\0" - "INVALID_OID_STRING\0" - "UNKNOWN_NID\0" - "BAD_BASE64_DECODE\0" - "BAD_END_LINE\0" - "BAD_IV_CHARS\0" - "BAD_PASSWORD_READ\0" - "CIPHER_IS_NULL\0" - 
"ERROR_CONVERTING_PRIVATE_KEY\0" - "NOT_DEK_INFO\0" - "NOT_ENCRYPTED\0" - "NOT_PROC_TYPE\0" - "NO_START_LINE\0" - "READ_KEY\0" - "SHORT_HEADER\0" - "UNSUPPORTED_CIPHER\0" - "UNSUPPORTED_ENCRYPTION\0" - "BAD_PKCS7_VERSION\0" - "NOT_PKCS7_SIGNED_DATA\0" - "NO_CERTIFICATES_INCLUDED\0" - "NO_CRLS_INCLUDED\0" - "AMBIGUOUS_FRIENDLY_NAME\0" - "BAD_ITERATION_COUNT\0" - "BAD_PKCS12_DATA\0" - "BAD_PKCS12_VERSION\0" - "CIPHER_HAS_NO_OBJECT_IDENTIFIER\0" - "CRYPT_ERROR\0" - "ENCRYPT_ERROR\0" - "ERROR_SETTING_CIPHER_PARAMS\0" - "INCORRECT_PASSWORD\0" - "INVALID_CHARACTERS\0" - "KEYGEN_FAILURE\0" - "KEY_GEN_ERROR\0" - "METHOD_NOT_SUPPORTED\0" - "MISSING_MAC\0" - "MULTIPLE_PRIVATE_KEYS_IN_PKCS12\0" - "PKCS12_PUBLIC_KEY_INTEGRITY_NOT_SUPPORTED\0" - "PKCS12_TOO_DEEPLY_NESTED\0" - "PRIVATE_KEY_DECODE_ERROR\0" - "PRIVATE_KEY_ENCODE_ERROR\0" - "UNKNOWN_ALGORITHM\0" - "UNKNOWN_CIPHER\0" - "UNKNOWN_CIPHER_ALGORITHM\0" - "UNKNOWN_DIGEST\0" - "UNSUPPORTED_KEYLENGTH\0" - "UNSUPPORTED_KEY_DERIVATION_FUNCTION\0" - "UNSUPPORTED_OPTIONS\0" - "UNSUPPORTED_PRF\0" - "UNSUPPORTED_PRIVATE_KEY_ALGORITHM\0" - "UNSUPPORTED_SALT_TYPE\0" - "BAD_E_VALUE\0" - "BAD_FIXED_HEADER_DECRYPT\0" - "BAD_PAD_BYTE_COUNT\0" - "BAD_RSA_PARAMETERS\0" - "BLOCK_TYPE_IS_NOT_01\0" - "BLOCK_TYPE_IS_NOT_02\0" - "BN_NOT_INITIALIZED\0" - "CANNOT_RECOVER_MULTI_PRIME_KEY\0" - "CRT_PARAMS_ALREADY_GIVEN\0" - "CRT_VALUES_INCORRECT\0" - "DATA_LEN_NOT_EQUAL_TO_MOD_LEN\0" - "DATA_TOO_LARGE\0" - "DATA_TOO_LARGE_FOR_KEY_SIZE\0" - "DATA_TOO_LARGE_FOR_MODULUS\0" - "DATA_TOO_SMALL\0" - "DATA_TOO_SMALL_FOR_KEY_SIZE\0" - "DIGEST_TOO_BIG_FOR_RSA_KEY\0" - "D_E_NOT_CONGRUENT_TO_1\0" - "D_OUT_OF_RANGE\0" - "EMPTY_PUBLIC_KEY\0" - "FIRST_OCTET_INVALID\0" - "INCONSISTENT_SET_OF_CRT_VALUES\0" - "INTERNAL_ERROR\0" - "INVALID_MESSAGE_LENGTH\0" - "KEY_SIZE_TOO_SMALL\0" - "LAST_OCTET_INVALID\0" - "MUST_HAVE_AT_LEAST_TWO_PRIMES\0" - "NO_PUBLIC_EXPONENT\0" - "NULL_BEFORE_BLOCK_MISSING\0" - "N_NOT_EQUAL_P_Q\0" - "OAEP_DECODING_ERROR\0" - 
"ONLY_ONE_OF_P_Q_GIVEN\0" - "OUTPUT_BUFFER_TOO_SMALL\0" - "PADDING_CHECK_FAILED\0" - "PKCS_DECODING_ERROR\0" - "SLEN_CHECK_FAILED\0" - "SLEN_RECOVERY_FAILED\0" - "UNKNOWN_ALGORITHM_TYPE\0" - "UNKNOWN_PADDING_TYPE\0" - "VALUE_MISSING\0" - "WRONG_SIGNATURE_LENGTH\0" - "ALPN_MISMATCH_ON_EARLY_DATA\0" - "ALPS_MISMATCH_ON_EARLY_DATA\0" - "APPLICATION_DATA_INSTEAD_OF_HANDSHAKE\0" - "APPLICATION_DATA_ON_SHUTDOWN\0" - "APP_DATA_IN_HANDSHAKE\0" - "ATTEMPT_TO_REUSE_SESSION_IN_DIFFERENT_CONTEXT\0" - "BAD_ALERT\0" - "BAD_CHANGE_CIPHER_SPEC\0" - "BAD_DATA_RETURNED_BY_CALLBACK\0" - "BAD_DH_P_LENGTH\0" - "BAD_DIGEST_LENGTH\0" - "BAD_ECC_CERT\0" - "BAD_ECPOINT\0" - "BAD_HANDSHAKE_RECORD\0" - "BAD_HELLO_REQUEST\0" - "BAD_LENGTH\0" - "BAD_PACKET_LENGTH\0" - "BAD_RSA_ENCRYPT\0" - "BAD_SRTP_MKI_VALUE\0" - "BAD_SRTP_PROTECTION_PROFILE_LIST\0" - "BAD_SSL_FILETYPE\0" - "BAD_WRITE_RETRY\0" - "BIO_NOT_SET\0" - "BLOCK_CIPHER_PAD_IS_WRONG\0" - "CANNOT_HAVE_BOTH_PRIVKEY_AND_METHOD\0" - "CANNOT_PARSE_LEAF_CERT\0" - "CA_DN_LENGTH_MISMATCH\0" - "CA_DN_TOO_LONG\0" - "CCS_RECEIVED_EARLY\0" - "CERTIFICATE_AND_PRIVATE_KEY_MISMATCH\0" - "CERTIFICATE_VERIFY_FAILED\0" - "CERT_CB_ERROR\0" - "CERT_DECOMPRESSION_FAILED\0" - "CERT_LENGTH_MISMATCH\0" - "CHANNEL_ID_NOT_P256\0" - "CHANNEL_ID_SIGNATURE_INVALID\0" - "CIPHER_MISMATCH_ON_EARLY_DATA\0" - "CIPHER_OR_HASH_UNAVAILABLE\0" - "CLIENTHELLO_PARSE_FAILED\0" - "CLIENTHELLO_TLSEXT\0" - "CONNECTION_REJECTED\0" - "CONNECTION_TYPE_NOT_SET\0" - "COULD_NOT_PARSE_HINTS\0" - "CUSTOM_EXTENSION_ERROR\0" - "DATA_LENGTH_TOO_LONG\0" - "DECRYPTION_FAILED\0" - "DECRYPTION_FAILED_OR_BAD_RECORD_MAC\0" - "DH_PUBLIC_VALUE_LENGTH_IS_WRONG\0" - "DH_P_TOO_LONG\0" - "DIGEST_CHECK_FAILED\0" - "DOWNGRADE_DETECTED\0" - "DTLS_MESSAGE_TOO_BIG\0" - "DUPLICATE_EXTENSION\0" - "DUPLICATE_KEY_SHARE\0" - "DUPLICATE_SIGNATURE_ALGORITHM\0" - "EARLY_DATA_NOT_IN_USE\0" - "ECC_CERT_NOT_FOR_SIGNING\0" - "ECH_REJECTED\0" - "ECH_SERVER_CONFIG_AND_PRIVATE_KEY_MISMATCH\0" - 
"ECH_SERVER_CONFIG_UNSUPPORTED_EXTENSION\0" - "ECH_SERVER_WOULD_HAVE_NO_RETRY_CONFIGS\0" - "EMPTY_HELLO_RETRY_REQUEST\0" - "EMS_STATE_INCONSISTENT\0" - "ENCRYPTED_LENGTH_TOO_LONG\0" - "ERROR_ADDING_EXTENSION\0" - "ERROR_IN_RECEIVED_CIPHER_LIST\0" - "ERROR_PARSING_EXTENSION\0" - "EXCESSIVE_MESSAGE_SIZE\0" - "EXCESS_HANDSHAKE_DATA\0" - "EXTRA_DATA_IN_MESSAGE\0" - "FRAGMENT_MISMATCH\0" - "GOT_NEXT_PROTO_WITHOUT_EXTENSION\0" - "HANDSHAKE_FAILURE_ON_CLIENT_HELLO\0" - "HANDSHAKE_NOT_COMPLETE\0" - "HTTPS_PROXY_REQUEST\0" - "HTTP_REQUEST\0" - "INAPPROPRIATE_FALLBACK\0" - "INCONSISTENT_CLIENT_HELLO\0" - "INCONSISTENT_ECH_NEGOTIATION\0" - "INVALID_ALPN_PROTOCOL\0" - "INVALID_ALPN_PROTOCOL_LIST\0" - "INVALID_CLIENT_HELLO_INNER\0" - "INVALID_COMMAND\0" - "INVALID_COMPRESSION_LIST\0" - "INVALID_DELEGATED_CREDENTIAL\0" - "INVALID_ECH_CONFIG_LIST\0" - "INVALID_ECH_PUBLIC_NAME\0" - "INVALID_MESSAGE\0" - "INVALID_OUTER_EXTENSION\0" - "INVALID_OUTER_RECORD_TYPE\0" - "INVALID_SCT_LIST\0" - "INVALID_SIGNATURE_ALGORITHM\0" - "INVALID_SSL_SESSION\0" - "INVALID_TICKET_KEYS_LENGTH\0" - "KEY_USAGE_BIT_INCORRECT\0" - "LENGTH_MISMATCH\0" - "MISSING_EXTENSION\0" - "MISSING_KEY_SHARE\0" - "MISSING_RSA_CERTIFICATE\0" - "MISSING_TMP_DH_KEY\0" - "MISSING_TMP_ECDH_KEY\0" - "MIXED_SPECIAL_OPERATOR_WITH_GROUPS\0" - "MTU_TOO_SMALL\0" - "NEGOTIATED_ALPS_WITHOUT_ALPN\0" - "NEGOTIATED_BOTH_NPN_AND_ALPN\0" - "NEGOTIATED_TB_WITHOUT_EMS_OR_RI\0" - "NESTED_GROUP\0" - "NO_APPLICATION_PROTOCOL\0" - "NO_CERTIFICATES_RETURNED\0" - "NO_CERTIFICATE_ASSIGNED\0" - "NO_CERTIFICATE_SET\0" - "NO_CIPHERS_AVAILABLE\0" - "NO_CIPHERS_PASSED\0" - "NO_CIPHERS_SPECIFIED\0" - "NO_CIPHER_MATCH\0" - "NO_COMMON_SIGNATURE_ALGORITHMS\0" - "NO_COMPRESSION_SPECIFIED\0" - "NO_GROUPS_SPECIFIED\0" - "NO_METHOD_SPECIFIED\0" - "NO_P256_SUPPORT\0" - "NO_PRIVATE_KEY_ASSIGNED\0" - "NO_RENEGOTIATION\0" - "NO_REQUIRED_DIGEST\0" - "NO_SHARED_CIPHER\0" - "NO_SHARED_GROUP\0" - "NO_SUPPORTED_VERSIONS_ENABLED\0" - "NULL_SSL_CTX\0" - 
"NULL_SSL_METHOD_PASSED\0" - "OCSP_CB_ERROR\0" - "OLD_SESSION_CIPHER_NOT_RETURNED\0" - "OLD_SESSION_PRF_HASH_MISMATCH\0" - "OLD_SESSION_VERSION_NOT_RETURNED\0" - "PARSE_TLSEXT\0" - "PATH_TOO_LONG\0" - "PEER_DID_NOT_RETURN_A_CERTIFICATE\0" - "PEER_ERROR_UNSUPPORTED_CERTIFICATE_TYPE\0" - "PRE_SHARED_KEY_MUST_BE_LAST\0" - "PRIVATE_KEY_OPERATION_FAILED\0" - "PROTOCOL_IS_SHUTDOWN\0" - "PSK_IDENTITY_BINDER_COUNT_MISMATCH\0" - "PSK_IDENTITY_NOT_FOUND\0" - "PSK_NO_CLIENT_CB\0" - "PSK_NO_SERVER_CB\0" - "QUIC_INTERNAL_ERROR\0" - "QUIC_TRANSPORT_PARAMETERS_MISCONFIGURED\0" - "READ_TIMEOUT_EXPIRED\0" - "RECORD_LENGTH_MISMATCH\0" - "RECORD_TOO_LARGE\0" - "RENEGOTIATION_EMS_MISMATCH\0" - "RENEGOTIATION_ENCODING_ERR\0" - "RENEGOTIATION_MISMATCH\0" - "REQUIRED_CIPHER_MISSING\0" - "RESUMED_EMS_SESSION_WITHOUT_EMS_EXTENSION\0" - "RESUMED_NON_EMS_SESSION_WITH_EMS_EXTENSION\0" - "SCSV_RECEIVED_WHEN_RENEGOTIATING\0" - "SECOND_SERVERHELLO_VERSION_MISMATCH\0" - "SERVERHELLO_TLSEXT\0" - "SERVER_CERT_CHANGED\0" - "SERVER_ECHOED_INVALID_SESSION_ID\0" - "SESSION_ID_CONTEXT_UNINITIALIZED\0" - "SESSION_MAY_NOT_BE_CREATED\0" - "SHUTDOWN_WHILE_IN_INIT\0" - "SIGNATURE_ALGORITHMS_EXTENSION_SENT_BY_SERVER\0" - "SRTP_COULD_NOT_ALLOCATE_PROFILES\0" - "SRTP_UNKNOWN_PROTECTION_PROFILE\0" - "SSL3_EXT_INVALID_SERVERNAME\0" - "SSLV3_ALERT_BAD_CERTIFICATE\0" - "SSLV3_ALERT_BAD_RECORD_MAC\0" - "SSLV3_ALERT_CERTIFICATE_EXPIRED\0" - "SSLV3_ALERT_CERTIFICATE_REVOKED\0" - "SSLV3_ALERT_CERTIFICATE_UNKNOWN\0" - "SSLV3_ALERT_CLOSE_NOTIFY\0" - "SSLV3_ALERT_DECOMPRESSION_FAILURE\0" - "SSLV3_ALERT_HANDSHAKE_FAILURE\0" - "SSLV3_ALERT_ILLEGAL_PARAMETER\0" - "SSLV3_ALERT_NO_CERTIFICATE\0" - "SSLV3_ALERT_UNEXPECTED_MESSAGE\0" - "SSLV3_ALERT_UNSUPPORTED_CERTIFICATE\0" - "SSL_CTX_HAS_NO_DEFAULT_SSL_VERSION\0" - "SSL_HANDSHAKE_FAILURE\0" - "SSL_SESSION_ID_CONTEXT_TOO_LONG\0" - "SSL_SESSION_ID_TOO_LONG\0" - "TICKET_ENCRYPTION_FAILED\0" - "TLS13_DOWNGRADE\0" - "TLSV1_ALERT_ACCESS_DENIED\0" - 
"TLSV1_ALERT_BAD_CERTIFICATE_HASH_VALUE\0" - "TLSV1_ALERT_BAD_CERTIFICATE_STATUS_RESPONSE\0" - "TLSV1_ALERT_CERTIFICATE_REQUIRED\0" - "TLSV1_ALERT_CERTIFICATE_UNOBTAINABLE\0" - "TLSV1_ALERT_DECODE_ERROR\0" - "TLSV1_ALERT_DECRYPTION_FAILED\0" - "TLSV1_ALERT_DECRYPT_ERROR\0" - "TLSV1_ALERT_ECH_REQUIRED\0" - "TLSV1_ALERT_EXPORT_RESTRICTION\0" - "TLSV1_ALERT_INAPPROPRIATE_FALLBACK\0" - "TLSV1_ALERT_INSUFFICIENT_SECURITY\0" - "TLSV1_ALERT_INTERNAL_ERROR\0" - "TLSV1_ALERT_NO_APPLICATION_PROTOCOL\0" - "TLSV1_ALERT_NO_RENEGOTIATION\0" - "TLSV1_ALERT_PROTOCOL_VERSION\0" - "TLSV1_ALERT_RECORD_OVERFLOW\0" - "TLSV1_ALERT_UNKNOWN_CA\0" - "TLSV1_ALERT_UNKNOWN_PSK_IDENTITY\0" - "TLSV1_ALERT_UNRECOGNIZED_NAME\0" - "TLSV1_ALERT_UNSUPPORTED_EXTENSION\0" - "TLSV1_ALERT_USER_CANCELLED\0" - "TLS_PEER_DID_NOT_RESPOND_WITH_CERTIFICATE_LIST\0" - "TLS_RSA_ENCRYPTED_VALUE_LENGTH_IS_WRONG\0" - "TOO_MANY_EMPTY_FRAGMENTS\0" - "TOO_MANY_KEY_UPDATES\0" - "TOO_MANY_WARNING_ALERTS\0" - "TOO_MUCH_READ_EARLY_DATA\0" - "TOO_MUCH_SKIPPED_EARLY_DATA\0" - "UNABLE_TO_FIND_ECDH_PARAMETERS\0" - "UNCOMPRESSED_CERT_TOO_LARGE\0" - "UNEXPECTED_COMPATIBILITY_MODE\0" - "UNEXPECTED_EXTENSION\0" - "UNEXPECTED_EXTENSION_ON_EARLY_DATA\0" - "UNEXPECTED_MESSAGE\0" - "UNEXPECTED_OPERATOR_IN_GROUP\0" - "UNEXPECTED_RECORD\0" - "UNKNOWN_ALERT_TYPE\0" - "UNKNOWN_CERTIFICATE_TYPE\0" - "UNKNOWN_CERT_COMPRESSION_ALG\0" - "UNKNOWN_CIPHER_RETURNED\0" - "UNKNOWN_CIPHER_TYPE\0" - "UNKNOWN_KEY_EXCHANGE_TYPE\0" - "UNKNOWN_PROTOCOL\0" - "UNKNOWN_SSL_VERSION\0" - "UNKNOWN_STATE\0" - "UNSAFE_LEGACY_RENEGOTIATION_DISABLED\0" - "UNSUPPORTED_COMPRESSION_ALGORITHM\0" - "UNSUPPORTED_ECH_SERVER_CONFIG\0" - "UNSUPPORTED_ELLIPTIC_CURVE\0" - "UNSUPPORTED_PROTOCOL\0" - "UNSUPPORTED_PROTOCOL_FOR_CUSTOM_KEY\0" - "WRONG_CERTIFICATE_TYPE\0" - "WRONG_CIPHER_RETURNED\0" - "WRONG_CURVE\0" - "WRONG_ENCRYPTION_LEVEL_RECEIVED\0" - "WRONG_MESSAGE_TYPE\0" - "WRONG_SIGNATURE_TYPE\0" - "WRONG_SSL_VERSION\0" - "WRONG_VERSION_NUMBER\0" - 
"WRONG_VERSION_ON_EARLY_DATA\0" - "X509_LIB\0" - "X509_VERIFICATION_SETUP_PROBLEMS\0" - "BAD_VALIDITY_CHECK\0" - "DECODE_FAILURE\0" - "INVALID_KEY_ID\0" - "INVALID_METADATA\0" - "INVALID_METADATA_KEY\0" - "INVALID_PROOF\0" - "INVALID_TOKEN\0" - "NO_KEYS_CONFIGURED\0" - "NO_SRR_KEY_CONFIGURED\0" - "OVER_BATCHSIZE\0" - "SRR_SIGNATURE_ERROR\0" - "TOO_MANY_KEYS\0" - "AKID_MISMATCH\0" - "BAD_X509_FILETYPE\0" - "BASE64_DECODE_ERROR\0" - "CANT_CHECK_DH_KEY\0" - "CERT_ALREADY_IN_HASH_TABLE\0" - "CRL_ALREADY_DELTA\0" - "CRL_VERIFY_FAILURE\0" - "DELTA_CRL_WITHOUT_CRL_NUMBER\0" - "IDP_MISMATCH\0" - "INVALID_DIRECTORY\0" - "INVALID_FIELD_FOR_VERSION\0" - "INVALID_FIELD_NAME\0" - "INVALID_PARAMETER\0" - "INVALID_PSS_PARAMETERS\0" - "INVALID_TRUST\0" - "INVALID_VERSION\0" - "ISSUER_MISMATCH\0" - "KEY_TYPE_MISMATCH\0" - "KEY_VALUES_MISMATCH\0" - "LOADING_CERT_DIR\0" - "LOADING_DEFAULTS\0" - "NAME_TOO_LONG\0" - "NEWER_CRL_NOT_NEWER\0" - "NO_CERTIFICATE_FOUND\0" - "NO_CERTIFICATE_OR_CRL_FOUND\0" - "NO_CERT_SET_FOR_US_TO_VERIFY\0" - "NO_CRL_FOUND\0" - "NO_CRL_NUMBER\0" - "PUBLIC_KEY_DECODE_ERROR\0" - "PUBLIC_KEY_ENCODE_ERROR\0" - "SHOULD_RETRY\0" - "SIGNATURE_ALGORITHM_MISMATCH\0" - "UNKNOWN_KEY_TYPE\0" - "UNKNOWN_PURPOSE_ID\0" - "UNKNOWN_TRUST_ID\0" - "WRONG_LOOKUP_TYPE\0" - "BAD_IP_ADDRESS\0" - "BAD_OBJECT\0" - "BN_DEC2BN_ERROR\0" - "BN_TO_ASN1_INTEGER_ERROR\0" - "CANNOT_FIND_FREE_FUNCTION\0" - "DIRNAME_ERROR\0" - "DISTPOINT_ALREADY_SET\0" - "DUPLICATE_ZONE_ID\0" - "ERROR_CONVERTING_ZONE\0" - "ERROR_CREATING_EXTENSION\0" - "ERROR_IN_EXTENSION\0" - "EXPECTED_A_SECTION_NAME\0" - "EXTENSION_EXISTS\0" - "EXTENSION_NAME_ERROR\0" - "EXTENSION_NOT_FOUND\0" - "EXTENSION_SETTING_NOT_SUPPORTED\0" - "EXTENSION_VALUE_ERROR\0" - "ILLEGAL_EMPTY_EXTENSION\0" - "ILLEGAL_HEX_DIGIT\0" - "INCORRECT_POLICY_SYNTAX_TAG\0" - "INVALID_BOOLEAN_STRING\0" - "INVALID_EXTENSION_STRING\0" - "INVALID_MULTIPLE_RDNS\0" - "INVALID_NAME\0" - "INVALID_NULL_ARGUMENT\0" - "INVALID_NULL_NAME\0" - "INVALID_NULL_VALUE\0" 
- "INVALID_NUMBERS\0" - "INVALID_OBJECT_IDENTIFIER\0" - "INVALID_OPTION\0" - "INVALID_POLICY_IDENTIFIER\0" - "INVALID_PROXY_POLICY_SETTING\0" - "INVALID_PURPOSE\0" - "INVALID_SECTION\0" - "INVALID_SYNTAX\0" - "INVALID_VALUE\0" - "ISSUER_DECODE_ERROR\0" - "NEED_ORGANIZATION_AND_NUMBERS\0" - "NO_CONFIG_DATABASE\0" - "NO_ISSUER_CERTIFICATE\0" - "NO_ISSUER_DETAILS\0" - "NO_POLICY_IDENTIFIER\0" - "NO_PROXY_CERT_POLICY_LANGUAGE_DEFINED\0" - "NO_PUBLIC_KEY\0" - "NO_SUBJECT_DETAILS\0" - "ODD_NUMBER_OF_DIGITS\0" - "OPERATION_NOT_DEFINED\0" - "OTHERNAME_ERROR\0" - "POLICY_LANGUAGE_ALREADY_DEFINED\0" - "POLICY_PATH_LENGTH\0" - "POLICY_PATH_LENGTH_ALREADY_DEFINED\0" - "POLICY_WHEN_PROXY_LANGUAGE_REQUIRES_NO_POLICY\0" - "SECTION_NOT_FOUND\0" - "TRAILING_DATA_IN_EXTENSION\0" - "UNABLE_TO_GET_ISSUER_DETAILS\0" - "UNABLE_TO_GET_ISSUER_KEYID\0" - "UNKNOWN_BIT_STRING_ARGUMENT\0" - "UNKNOWN_EXTENSION\0" - "UNKNOWN_EXTENSION_NAME\0" - "UNKNOWN_OPTION\0" - "UNSUPPORTED_OPTION\0" - "USER_TOO_LONG\0" - ""; - diff --git a/third_party/boringssl/gen/bcm/aes-gcm-avx2-x86_64-apple.S b/third_party/boringssl/gen/bcm/aes-gcm-avx2-x86_64-apple.S new file mode 100644 index 00000000..eb2ac3bd --- /dev/null +++ b/third_party/boringssl/gen/bcm/aes-gcm-avx2-x86_64-apple.S @@ -0,0 +1,1320 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.section __DATA,__const +.p2align 4 + + +L$bswap_mask: +.quad 0x08090a0b0c0d0e0f, 0x0001020304050607 + + + + + + + + +L$gfpoly: +.quad 1, 0xc200000000000000 + + +L$gfpoly_and_internal_carrybit: +.quad 1, 0xc200000000000001 + +.p2align 5 + +L$ctr_pattern: +.quad 0, 0 +.quad 1, 0 +L$inc_2blocks: +.quad 2, 0 +.quad 2, 0 + +.text +.globl _gcm_init_vpclmulqdq_avx2 +.private_extern _gcm_init_vpclmulqdq_avx2 + +.p2align 5 +_gcm_init_vpclmulqdq_avx2: + + +_CET_ENDBR + + + + + + vpshufd $0x4e,(%rsi),%xmm3 + + + + + + vpshufd $0xd3,%xmm3,%xmm0 + vpsrad $31,%xmm0,%xmm0 + vpaddq %xmm3,%xmm3,%xmm3 + vpand L$gfpoly_and_internal_carrybit(%rip),%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 + + vbroadcasti128 L$gfpoly(%rip),%ymm6 + + + vpclmulqdq $0x00,%xmm3,%xmm3,%xmm0 + vpclmulqdq $0x11,%xmm3,%xmm3,%xmm5 + vpclmulqdq $0x01,%xmm0,%xmm6,%xmm1 + vpshufd $0x4e,%xmm0,%xmm0 + vpxor %xmm0,%xmm1,%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm6,%xmm0 + vpshufd $0x4e,%xmm1,%xmm1 + vpxor %xmm1,%xmm5,%xmm5 + vpxor %xmm0,%xmm5,%xmm5 + + + + vinserti128 $1,%xmm3,%ymm5,%ymm3 + vinserti128 $1,%xmm5,%ymm5,%ymm5 + + + vpclmulqdq $0x00,%ymm5,%ymm3,%ymm0 + vpclmulqdq $0x01,%ymm5,%ymm3,%ymm1 + vpclmulqdq $0x10,%ymm5,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x01,%ymm0,%ymm6,%ymm2 + vpshufd $0x4e,%ymm0,%ymm0 + vpxor %ymm0,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x11,%ymm5,%ymm3,%ymm4 + vpclmulqdq $0x01,%ymm1,%ymm6,%ymm0 + vpshufd $0x4e,%ymm1,%ymm1 + vpxor %ymm1,%ymm4,%ymm4 + vpxor %ymm0,%ymm4,%ymm4 + + + + vmovdqu %ymm3,96(%rdi) + vmovdqu %ymm4,64(%rdi) + + + + vpunpcklqdq %ymm3,%ymm4,%ymm0 + vpunpckhqdq %ymm3,%ymm4,%ymm1 + vpxor %ymm1,%ymm0,%ymm0 + vmovdqu %ymm0,128+32(%rdi) + + + vpclmulqdq $0x00,%ymm5,%ymm4,%ymm0 + vpclmulqdq $0x01,%ymm5,%ymm4,%ymm1 + vpclmulqdq $0x10,%ymm5,%ymm4,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x01,%ymm0,%ymm6,%ymm2 + vpshufd $0x4e,%ymm0,%ymm0 + vpxor %ymm0,%ymm1,%ymm1 + 
vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x11,%ymm5,%ymm4,%ymm3 + vpclmulqdq $0x01,%ymm1,%ymm6,%ymm0 + vpshufd $0x4e,%ymm1,%ymm1 + vpxor %ymm1,%ymm3,%ymm3 + vpxor %ymm0,%ymm3,%ymm3 + + vpclmulqdq $0x00,%ymm5,%ymm3,%ymm0 + vpclmulqdq $0x01,%ymm5,%ymm3,%ymm1 + vpclmulqdq $0x10,%ymm5,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x01,%ymm0,%ymm6,%ymm2 + vpshufd $0x4e,%ymm0,%ymm0 + vpxor %ymm0,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x11,%ymm5,%ymm3,%ymm4 + vpclmulqdq $0x01,%ymm1,%ymm6,%ymm0 + vpshufd $0x4e,%ymm1,%ymm1 + vpxor %ymm1,%ymm4,%ymm4 + vpxor %ymm0,%ymm4,%ymm4 + + vmovdqu %ymm3,32(%rdi) + vmovdqu %ymm4,0(%rdi) + + + + vpunpcklqdq %ymm3,%ymm4,%ymm0 + vpunpckhqdq %ymm3,%ymm4,%ymm1 + vpxor %ymm1,%ymm0,%ymm0 + vmovdqu %ymm0,128(%rdi) + + vzeroupper + ret + + + +.globl _gcm_gmult_vpclmulqdq_avx2 +.private_extern _gcm_gmult_vpclmulqdq_avx2 + +.p2align 5 +_gcm_gmult_vpclmulqdq_avx2: + + +_CET_ENDBR + + + + vmovdqu (%rdi),%xmm0 + vmovdqu L$bswap_mask(%rip),%xmm1 + vmovdqu 128-16(%rsi),%xmm2 + vmovdqu L$gfpoly(%rip),%xmm3 + vpshufb %xmm1,%xmm0,%xmm0 + + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm4 + vpclmulqdq $0x01,%xmm2,%xmm0,%xmm5 + vpclmulqdq $0x10,%xmm2,%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x01,%xmm4,%xmm3,%xmm6 + vpshufd $0x4e,%xmm4,%xmm4 + vpxor %xmm4,%xmm5,%xmm5 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm0 + vpclmulqdq $0x01,%xmm5,%xmm3,%xmm4 + vpshufd $0x4e,%xmm5,%xmm5 + vpxor %xmm5,%xmm0,%xmm0 + vpxor %xmm4,%xmm0,%xmm0 + + + vpshufb %xmm1,%xmm0,%xmm0 + vmovdqu %xmm0,(%rdi) + + + ret + + + +.globl _gcm_ghash_vpclmulqdq_avx2 +.private_extern _gcm_ghash_vpclmulqdq_avx2 + +.p2align 5 +_gcm_ghash_vpclmulqdq_avx2: + + +_CET_ENDBR + + + + + + + vmovdqu L$bswap_mask(%rip),%xmm6 + vmovdqu L$gfpoly(%rip),%xmm7 + + + vmovdqu (%rdi),%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + + + cmpq $32,%rcx + jb L$ghash_lastblock + + + + vinserti128 $1,%xmm6,%ymm6,%ymm6 + vinserti128 $1,%xmm7,%ymm7,%ymm7 + + cmpq $127,%rcx + jbe L$ghash_loop_1x + + + 
vmovdqu 128(%rsi),%ymm8 + vmovdqu 128+32(%rsi),%ymm9 +L$ghash_loop_4x: + + vmovdqu 0(%rdx),%ymm1 + vpshufb %ymm6,%ymm1,%ymm1 + vmovdqu 0(%rsi),%ymm2 + vpxor %ymm5,%ymm1,%ymm1 + vpclmulqdq $0x00,%ymm2,%ymm1,%ymm3 + vpclmulqdq $0x11,%ymm2,%ymm1,%ymm5 + vpunpckhqdq %ymm1,%ymm1,%ymm0 + vpxor %ymm1,%ymm0,%ymm0 + vpclmulqdq $0x00,%ymm8,%ymm0,%ymm4 + + vmovdqu 32(%rdx),%ymm1 + vpshufb %ymm6,%ymm1,%ymm1 + vmovdqu 32(%rsi),%ymm2 + vpclmulqdq $0x00,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpclmulqdq $0x11,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm5,%ymm5 + vpunpckhqdq %ymm1,%ymm1,%ymm0 + vpxor %ymm1,%ymm0,%ymm0 + vpclmulqdq $0x10,%ymm8,%ymm0,%ymm0 + vpxor %ymm0,%ymm4,%ymm4 + + vmovdqu 64(%rdx),%ymm1 + vpshufb %ymm6,%ymm1,%ymm1 + vmovdqu 64(%rsi),%ymm2 + vpclmulqdq $0x00,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpclmulqdq $0x11,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm5,%ymm5 + vpunpckhqdq %ymm1,%ymm1,%ymm0 + vpxor %ymm1,%ymm0,%ymm0 + vpclmulqdq $0x00,%ymm9,%ymm0,%ymm0 + vpxor %ymm0,%ymm4,%ymm4 + + + vmovdqu 96(%rdx),%ymm1 + vpshufb %ymm6,%ymm1,%ymm1 + vmovdqu 96(%rsi),%ymm2 + vpclmulqdq $0x00,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpclmulqdq $0x11,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm5,%ymm5 + vpunpckhqdq %ymm1,%ymm1,%ymm0 + vpxor %ymm1,%ymm0,%ymm0 + vpclmulqdq $0x10,%ymm9,%ymm0,%ymm0 + vpxor %ymm0,%ymm4,%ymm4 + + vpxor %ymm3,%ymm4,%ymm4 + vpxor %ymm5,%ymm4,%ymm4 + + + vbroadcasti128 L$gfpoly(%rip),%ymm2 + vpclmulqdq $0x01,%ymm3,%ymm2,%ymm0 + vpshufd $0x4e,%ymm3,%ymm3 + vpxor %ymm3,%ymm4,%ymm4 + vpxor %ymm0,%ymm4,%ymm4 + + vpclmulqdq $0x01,%ymm4,%ymm2,%ymm0 + vpshufd $0x4e,%ymm4,%ymm4 + vpxor %ymm4,%ymm5,%ymm5 + vpxor %ymm0,%ymm5,%ymm5 + vextracti128 $1,%ymm5,%xmm0 + vpxor %xmm0,%xmm5,%xmm5 + + subq $-128,%rdx + addq $-128,%rcx + cmpq $127,%rcx + ja L$ghash_loop_4x + + + cmpq $32,%rcx + jb L$ghash_loop_1x_done +L$ghash_loop_1x: + vmovdqu (%rdx),%ymm0 + vpshufb %ymm6,%ymm0,%ymm0 + vpxor %ymm0,%ymm5,%ymm5 + vmovdqu 128-32(%rsi),%ymm0 + vpclmulqdq $0x00,%ymm0,%ymm5,%ymm1 + 
vpclmulqdq $0x01,%ymm0,%ymm5,%ymm2 + vpclmulqdq $0x10,%ymm0,%ymm5,%ymm3 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x01,%ymm1,%ymm7,%ymm3 + vpshufd $0x4e,%ymm1,%ymm1 + vpxor %ymm1,%ymm2,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x11,%ymm0,%ymm5,%ymm5 + vpclmulqdq $0x01,%ymm2,%ymm7,%ymm1 + vpshufd $0x4e,%ymm2,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpxor %ymm1,%ymm5,%ymm5 + + vextracti128 $1,%ymm5,%xmm0 + vpxor %xmm0,%xmm5,%xmm5 + addq $32,%rdx + subq $32,%rcx + cmpq $32,%rcx + jae L$ghash_loop_1x +L$ghash_loop_1x_done: + + +L$ghash_lastblock: + testq %rcx,%rcx + jz L$ghash_done + vmovdqu (%rdx),%xmm0 + vpshufb %xmm6,%xmm0,%xmm0 + vpxor %xmm0,%xmm5,%xmm5 + vmovdqu 128-16(%rsi),%xmm0 + vpclmulqdq $0x00,%xmm0,%xmm5,%xmm1 + vpclmulqdq $0x01,%xmm0,%xmm5,%xmm2 + vpclmulqdq $0x10,%xmm0,%xmm5,%xmm3 + vpxor %xmm3,%xmm2,%xmm2 + vpclmulqdq $0x01,%xmm1,%xmm7,%xmm3 + vpshufd $0x4e,%xmm1,%xmm1 + vpxor %xmm1,%xmm2,%xmm2 + vpxor %xmm3,%xmm2,%xmm2 + vpclmulqdq $0x11,%xmm0,%xmm5,%xmm5 + vpclmulqdq $0x01,%xmm2,%xmm7,%xmm1 + vpshufd $0x4e,%xmm2,%xmm2 + vpxor %xmm2,%xmm5,%xmm5 + vpxor %xmm1,%xmm5,%xmm5 + + +L$ghash_done: + + vpshufb %xmm6,%xmm5,%xmm5 + vmovdqu %xmm5,(%rdi) + + vzeroupper + ret + + + +.globl _aes_gcm_enc_update_vaes_avx2 +.private_extern _aes_gcm_enc_update_vaes_avx2 + +.p2align 5 +_aes_gcm_enc_update_vaes_avx2: + + +_CET_ENDBR + pushq %r12 + + + movq 16(%rsp),%r12 +#ifdef BORINGSSL_DISPATCH_TEST + + movb $1,_BORINGSSL_function_hit+6(%rip) +#endif + vbroadcasti128 L$bswap_mask(%rip),%ymm0 + + + + vmovdqu (%r12),%xmm1 + vpshufb %xmm0,%xmm1,%xmm1 + vbroadcasti128 (%r8),%ymm11 + vpshufb %ymm0,%ymm11,%ymm11 + + + + movl 240(%rcx),%r10d + leal -20(,%r10,4),%r10d + + + + + leaq 96(%rcx,%r10,4),%r11 + vbroadcasti128 (%rcx),%ymm9 + vbroadcasti128 (%r11),%ymm10 + + + vpaddd L$ctr_pattern(%rip),%ymm11,%ymm11 + + + + cmpq $127,%rdx + jbe L$crypt_loop_4x_done__func1 + + vmovdqu 128(%r9),%ymm7 + vmovdqu 128+32(%r9),%ymm8 + + + + vmovdqu L$inc_2blocks(%rip),%ymm2 + vpshufb 
%ymm0,%ymm11,%ymm12 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm14 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm15 + vpaddd %ymm2,%ymm11,%ymm11 + + + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + vpxor %ymm9,%ymm14,%ymm14 + vpxor %ymm9,%ymm15,%ymm15 + + leaq 16(%rcx),%rax +L$vaesenc_loop_first_4_vecs__func1: + vbroadcasti128 (%rax),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + addq $16,%rax + cmpq %rax,%r11 + jne L$vaesenc_loop_first_4_vecs__func1 + vpxor 0(%rdi),%ymm10,%ymm2 + vpxor 32(%rdi),%ymm10,%ymm3 + vpxor 64(%rdi),%ymm10,%ymm5 + vpxor 96(%rdi),%ymm10,%ymm6 + vaesenclast %ymm2,%ymm12,%ymm12 + vaesenclast %ymm3,%ymm13,%ymm13 + vaesenclast %ymm5,%ymm14,%ymm14 + vaesenclast %ymm6,%ymm15,%ymm15 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + vmovdqu %ymm14,64(%rsi) + vmovdqu %ymm15,96(%rsi) + + subq $-128,%rdi + addq $-128,%rdx + cmpq $127,%rdx + jbe L$ghash_last_ciphertext_4x__func1 +.p2align 4 +L$crypt_loop_4x__func1: + + + + + vmovdqu L$inc_2blocks(%rip),%ymm2 + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm14 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm15 + vpaddd %ymm2,%ymm11,%ymm11 + + + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + vpxor %ymm9,%ymm14,%ymm14 + vpxor %ymm9,%ymm15,%ymm15 + + cmpl $24,%r10d + jl L$aes128__func1 + je L$aes192__func1 + + vbroadcasti128 -208(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vbroadcasti128 -192(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + +L$aes192__func1: + vbroadcasti128 -176(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc 
%ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vbroadcasti128 -160(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + +L$aes128__func1: + prefetcht0 512(%rdi) + prefetcht0 512+64(%rdi) + + vmovdqu 0(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 0(%r9),%ymm4 + vpxor %ymm1,%ymm3,%ymm3 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm7,%ymm2,%ymm6 + + vbroadcasti128 -144(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vbroadcasti128 -128(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vmovdqu 32(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 32(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm7,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -112(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vmovdqu 64(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 64(%r9),%ymm4 + + vbroadcasti128 -96(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -80(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm8,%ymm2,%ymm2 + vpxor 
%ymm2,%ymm6,%ymm6 + + + vmovdqu 96(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + + vbroadcasti128 -64(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vmovdqu 96(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -48(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm1,%ymm6,%ymm6 + + + vbroadcasti128 L$gfpoly(%rip),%ymm4 + vpclmulqdq $0x01,%ymm5,%ymm4,%ymm2 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -32(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vpclmulqdq $0x01,%ymm6,%ymm4,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -16(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vextracti128 $1,%ymm1,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + + + subq $-128,%rsi + vpxor 0(%rdi),%ymm10,%ymm2 + vpxor 32(%rdi),%ymm10,%ymm3 + vpxor 64(%rdi),%ymm10,%ymm5 + vpxor 96(%rdi),%ymm10,%ymm6 + vaesenclast %ymm2,%ymm12,%ymm12 + vaesenclast %ymm3,%ymm13,%ymm13 + vaesenclast %ymm5,%ymm14,%ymm14 + vaesenclast %ymm6,%ymm15,%ymm15 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + vmovdqu %ymm14,64(%rsi) + vmovdqu %ymm15,96(%rsi) + + subq $-128,%rdi + + addq $-128,%rdx + cmpq $127,%rdx + ja L$crypt_loop_4x__func1 +L$ghash_last_ciphertext_4x__func1: + + vmovdqu 0(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 0(%r9),%ymm4 + vpxor %ymm1,%ymm3,%ymm3 + vpclmulqdq 
$0x00,%ymm4,%ymm3,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm7,%ymm2,%ymm6 + + vmovdqu 32(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 32(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm7,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vmovdqu 64(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 64(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + + vmovdqu 96(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 96(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm1,%ymm6,%ymm6 + + + vbroadcasti128 L$gfpoly(%rip),%ymm4 + vpclmulqdq $0x01,%ymm5,%ymm4,%ymm2 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm2,%ymm6,%ymm6 + + vpclmulqdq $0x01,%ymm6,%ymm4,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + vextracti128 $1,%ymm1,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + + subq $-128,%rsi +L$crypt_loop_4x_done__func1: + + testq %rdx,%rdx + jz L$done__func1 + + + + + + leaq 128(%r9),%r8 + subq %rdx,%r8 + + + vpxor %xmm5,%xmm5,%xmm5 + vpxor %xmm6,%xmm6,%xmm6 + vpxor %xmm7,%xmm7,%xmm7 + + cmpq $64,%rdx + jb L$lessthan64bytes__func1 + + + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd L$inc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd L$inc_2blocks(%rip),%ymm11,%ymm11 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + 
leaq 16(%rcx),%rax +L$vaesenc_loop_tail_1__func1: + vbroadcasti128 (%rax),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + addq $16,%rax + cmpq %rax,%r11 + jne L$vaesenc_loop_tail_1__func1 + vaesenclast %ymm10,%ymm12,%ymm12 + vaesenclast %ymm10,%ymm13,%ymm13 + + + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%ymm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %ymm3,%ymm13,%ymm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + + + vpshufb %ymm0,%ymm12,%ymm12 + vpshufb %ymm0,%ymm13,%ymm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%ymm3 + vpclmulqdq $0x00,%ymm2,%ymm12,%ymm5 + vpclmulqdq $0x01,%ymm2,%ymm12,%ymm6 + vpclmulqdq $0x10,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm2,%ymm12,%ymm7 + vpclmulqdq $0x00,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + + addq $64,%r8 + addq $64,%rdi + addq $64,%rsi + subq $64,%rdx + jz L$reduce__func1 + + vpxor %xmm1,%xmm1,%xmm1 + + +L$lessthan64bytes__func1: + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd L$inc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +L$vaesenc_loop_tail_2__func1: + vbroadcasti128 (%rax),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + addq $16,%rax + cmpq %rax,%r11 + jne L$vaesenc_loop_tail_2__func1 + vaesenclast %ymm10,%ymm12,%ymm12 + vaesenclast %ymm10,%ymm13,%ymm13 + + + + + cmpq $32,%rdx + jb L$xor_one_block__func1 + je L$xor_two_blocks__func1 + +L$xor_three_blocks__func1: + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%xmm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %xmm3,%xmm13,%xmm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %xmm13,32(%rsi) + + vpshufb %ymm0,%ymm12,%ymm12 + vpshufb %xmm0,%xmm13,%xmm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%xmm3 + 
vpclmulqdq $0x00,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm7,%ymm7 + jmp L$ghash_mul_one_vec_unreduced__func1 + +L$xor_two_blocks__func1: + vmovdqu (%rdi),%ymm2 + vpxor %ymm2,%ymm12,%ymm12 + vmovdqu %ymm12,(%rsi) + vpshufb %ymm0,%ymm12,%ymm12 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + jmp L$ghash_mul_one_vec_unreduced__func1 + +L$xor_one_block__func1: + vmovdqu (%rdi),%xmm2 + vpxor %xmm2,%xmm12,%xmm12 + vmovdqu %xmm12,(%rsi) + vpshufb %xmm0,%xmm12,%xmm12 + vpxor %xmm1,%xmm12,%xmm12 + vmovdqu (%r8),%xmm2 + +L$ghash_mul_one_vec_unreduced__func1: + vpclmulqdq $0x00,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + +L$reduce__func1: + + vbroadcasti128 L$gfpoly(%rip),%ymm2 + vpclmulqdq $0x01,%ymm5,%ymm2,%ymm3 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpclmulqdq $0x01,%ymm6,%ymm2,%ymm3 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm7,%ymm7 + vpxor %ymm3,%ymm7,%ymm7 + vextracti128 $1,%ymm7,%xmm1 + vpxor %xmm7,%xmm1,%xmm1 + +L$done__func1: + + vpshufb %xmm0,%xmm1,%xmm1 + vmovdqu %xmm1,(%r12) + + vzeroupper + popq %r12 + + ret + + + +.globl _aes_gcm_dec_update_vaes_avx2 +.private_extern _aes_gcm_dec_update_vaes_avx2 + +.p2align 5 +_aes_gcm_dec_update_vaes_avx2: + + +_CET_ENDBR + pushq %r12 + + + movq 16(%rsp),%r12 + vbroadcasti128 L$bswap_mask(%rip),%ymm0 + + + + vmovdqu (%r12),%xmm1 + vpshufb %xmm0,%xmm1,%xmm1 + vbroadcasti128 (%r8),%ymm11 + vpshufb %ymm0,%ymm11,%ymm11 + + + + movl 240(%rcx),%r10d + leal -20(,%r10,4),%r10d + + + + + leaq 96(%rcx,%r10,4),%r11 + vbroadcasti128 (%rcx),%ymm9 + vbroadcasti128 (%r11),%ymm10 + + + vpaddd 
L$ctr_pattern(%rip),%ymm11,%ymm11 + + + + cmpq $127,%rdx + jbe L$crypt_loop_4x_done__func2 + + vmovdqu 128(%r9),%ymm7 + vmovdqu 128+32(%r9),%ymm8 +.p2align 4 +L$crypt_loop_4x__func2: + + + + + vmovdqu L$inc_2blocks(%rip),%ymm2 + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm14 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm15 + vpaddd %ymm2,%ymm11,%ymm11 + + + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + vpxor %ymm9,%ymm14,%ymm14 + vpxor %ymm9,%ymm15,%ymm15 + + cmpl $24,%r10d + jl L$aes128__func2 + je L$aes192__func2 + + vbroadcasti128 -208(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vbroadcasti128 -192(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + +L$aes192__func2: + vbroadcasti128 -176(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vbroadcasti128 -160(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + +L$aes128__func2: + prefetcht0 512(%rdi) + prefetcht0 512+64(%rdi) + + vmovdqu 0(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 0(%r9),%ymm4 + vpxor %ymm1,%ymm3,%ymm3 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm7,%ymm2,%ymm6 + + vbroadcasti128 -144(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vbroadcasti128 -128(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vmovdqu 32(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + 
vmovdqu 32(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm7,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -112(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vmovdqu 64(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 64(%r9),%ymm4 + + vbroadcasti128 -96(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -80(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + + vmovdqu 96(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + + vbroadcasti128 -64(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vmovdqu 96(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -48(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm1,%ymm6,%ymm6 + + + vbroadcasti128 L$gfpoly(%rip),%ymm4 + vpclmulqdq $0x01,%ymm5,%ymm4,%ymm2 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -32(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc 
%ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vpclmulqdq $0x01,%ymm6,%ymm4,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -16(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vextracti128 $1,%ymm1,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + + + + vpxor 0(%rdi),%ymm10,%ymm2 + vpxor 32(%rdi),%ymm10,%ymm3 + vpxor 64(%rdi),%ymm10,%ymm5 + vpxor 96(%rdi),%ymm10,%ymm6 + vaesenclast %ymm2,%ymm12,%ymm12 + vaesenclast %ymm3,%ymm13,%ymm13 + vaesenclast %ymm5,%ymm14,%ymm14 + vaesenclast %ymm6,%ymm15,%ymm15 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + vmovdqu %ymm14,64(%rsi) + vmovdqu %ymm15,96(%rsi) + + subq $-128,%rdi + subq $-128,%rsi + addq $-128,%rdx + cmpq $127,%rdx + ja L$crypt_loop_4x__func2 +L$crypt_loop_4x_done__func2: + + testq %rdx,%rdx + jz L$done__func2 + + + + + + leaq 128(%r9),%r8 + subq %rdx,%r8 + + + vpxor %xmm5,%xmm5,%xmm5 + vpxor %xmm6,%xmm6,%xmm6 + vpxor %xmm7,%xmm7,%xmm7 + + cmpq $64,%rdx + jb L$lessthan64bytes__func2 + + + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd L$inc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd L$inc_2blocks(%rip),%ymm11,%ymm11 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +L$vaesenc_loop_tail_1__func2: + vbroadcasti128 (%rax),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + addq $16,%rax + cmpq %rax,%r11 + jne L$vaesenc_loop_tail_1__func2 + vaesenclast %ymm10,%ymm12,%ymm12 + vaesenclast %ymm10,%ymm13,%ymm13 + + + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%ymm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %ymm3,%ymm13,%ymm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + + + vpshufb %ymm0,%ymm2,%ymm12 + vpshufb %ymm0,%ymm3,%ymm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%ymm3 + vpclmulqdq $0x00,%ymm2,%ymm12,%ymm5 + vpclmulqdq $0x01,%ymm2,%ymm12,%ymm6 + vpclmulqdq 
$0x10,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm2,%ymm12,%ymm7 + vpclmulqdq $0x00,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + + addq $64,%r8 + addq $64,%rdi + addq $64,%rsi + subq $64,%rdx + jz L$reduce__func2 + + vpxor %xmm1,%xmm1,%xmm1 + + +L$lessthan64bytes__func2: + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd L$inc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +L$vaesenc_loop_tail_2__func2: + vbroadcasti128 (%rax),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + addq $16,%rax + cmpq %rax,%r11 + jne L$vaesenc_loop_tail_2__func2 + vaesenclast %ymm10,%ymm12,%ymm12 + vaesenclast %ymm10,%ymm13,%ymm13 + + + + + cmpq $32,%rdx + jb L$xor_one_block__func2 + je L$xor_two_blocks__func2 + +L$xor_three_blocks__func2: + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%xmm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %xmm3,%xmm13,%xmm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %xmm13,32(%rsi) + + vpshufb %ymm0,%ymm2,%ymm12 + vpshufb %xmm0,%xmm3,%xmm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%xmm3 + vpclmulqdq $0x00,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm7,%ymm7 + jmp L$ghash_mul_one_vec_unreduced__func2 + +L$xor_two_blocks__func2: + vmovdqu (%rdi),%ymm2 + vpxor %ymm2,%ymm12,%ymm12 + vmovdqu %ymm12,(%rsi) + vpshufb %ymm0,%ymm2,%ymm12 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + jmp L$ghash_mul_one_vec_unreduced__func2 + +L$xor_one_block__func2: + vmovdqu (%rdi),%xmm2 + vpxor %xmm2,%xmm12,%xmm12 + vmovdqu %xmm12,(%rsi) + vpshufb %xmm0,%xmm2,%xmm12 + vpxor 
%xmm1,%xmm12,%xmm12 + vmovdqu (%r8),%xmm2 + +L$ghash_mul_one_vec_unreduced__func2: + vpclmulqdq $0x00,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + +L$reduce__func2: + + vbroadcasti128 L$gfpoly(%rip),%ymm2 + vpclmulqdq $0x01,%ymm5,%ymm2,%ymm3 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpclmulqdq $0x01,%ymm6,%ymm2,%ymm3 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm7,%ymm7 + vpxor %ymm3,%ymm7,%ymm7 + vextracti128 $1,%ymm7,%xmm1 + vpxor %xmm7,%xmm1,%xmm1 + +L$done__func2: + + vpshufb %xmm0,%xmm1,%xmm1 + vmovdqu %xmm1,(%r12) + + vzeroupper + popq %r12 + + ret + + + +#endif diff --git a/third_party/boringssl/gen/bcm/aes-gcm-avx2-x86_64-linux.S b/third_party/boringssl/gen/bcm/aes-gcm-avx2-x86_64-linux.S new file mode 100644 index 00000000..018397a1 --- /dev/null +++ b/third_party/boringssl/gen/bcm/aes-gcm-avx2-x86_64-linux.S @@ -0,0 +1,1325 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.section .rodata +.align 16 + + +.Lbswap_mask: +.quad 0x08090a0b0c0d0e0f, 0x0001020304050607 + + + + + + + + +.Lgfpoly: +.quad 1, 0xc200000000000000 + + +.Lgfpoly_and_internal_carrybit: +.quad 1, 0xc200000000000001 + +.align 32 + +.Lctr_pattern: +.quad 0, 0 +.quad 1, 0 +.Linc_2blocks: +.quad 2, 0 +.quad 2, 0 + +.text +.globl gcm_init_vpclmulqdq_avx2 +.hidden gcm_init_vpclmulqdq_avx2 +.type gcm_init_vpclmulqdq_avx2,@function +.align 32 +gcm_init_vpclmulqdq_avx2: +.cfi_startproc + +_CET_ENDBR + + + + + + vpshufd $0x4e,(%rsi),%xmm3 + + + + + + vpshufd $0xd3,%xmm3,%xmm0 + vpsrad $31,%xmm0,%xmm0 + vpaddq %xmm3,%xmm3,%xmm3 + vpand .Lgfpoly_and_internal_carrybit(%rip),%xmm0,%xmm0 + vpxor %xmm0,%xmm3,%xmm3 + + vbroadcasti128 .Lgfpoly(%rip),%ymm6 + + + vpclmulqdq $0x00,%xmm3,%xmm3,%xmm0 + vpclmulqdq $0x11,%xmm3,%xmm3,%xmm5 + vpclmulqdq $0x01,%xmm0,%xmm6,%xmm1 + vpshufd $0x4e,%xmm0,%xmm0 + vpxor %xmm0,%xmm1,%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm6,%xmm0 + vpshufd $0x4e,%xmm1,%xmm1 + vpxor %xmm1,%xmm5,%xmm5 + vpxor %xmm0,%xmm5,%xmm5 + + + + vinserti128 $1,%xmm3,%ymm5,%ymm3 + vinserti128 $1,%xmm5,%ymm5,%ymm5 + + + vpclmulqdq $0x00,%ymm5,%ymm3,%ymm0 + vpclmulqdq $0x01,%ymm5,%ymm3,%ymm1 + vpclmulqdq $0x10,%ymm5,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x01,%ymm0,%ymm6,%ymm2 + vpshufd $0x4e,%ymm0,%ymm0 + vpxor %ymm0,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x11,%ymm5,%ymm3,%ymm4 + vpclmulqdq $0x01,%ymm1,%ymm6,%ymm0 + vpshufd $0x4e,%ymm1,%ymm1 + vpxor %ymm1,%ymm4,%ymm4 + vpxor %ymm0,%ymm4,%ymm4 + + + + vmovdqu %ymm3,96(%rdi) + vmovdqu %ymm4,64(%rdi) + + + + vpunpcklqdq %ymm3,%ymm4,%ymm0 + vpunpckhqdq %ymm3,%ymm4,%ymm1 + vpxor %ymm1,%ymm0,%ymm0 + vmovdqu %ymm0,128+32(%rdi) + + + vpclmulqdq $0x00,%ymm5,%ymm4,%ymm0 + vpclmulqdq $0x01,%ymm5,%ymm4,%ymm1 + vpclmulqdq $0x10,%ymm5,%ymm4,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x01,%ymm0,%ymm6,%ymm2 + vpshufd 
$0x4e,%ymm0,%ymm0 + vpxor %ymm0,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x11,%ymm5,%ymm4,%ymm3 + vpclmulqdq $0x01,%ymm1,%ymm6,%ymm0 + vpshufd $0x4e,%ymm1,%ymm1 + vpxor %ymm1,%ymm3,%ymm3 + vpxor %ymm0,%ymm3,%ymm3 + + vpclmulqdq $0x00,%ymm5,%ymm3,%ymm0 + vpclmulqdq $0x01,%ymm5,%ymm3,%ymm1 + vpclmulqdq $0x10,%ymm5,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x01,%ymm0,%ymm6,%ymm2 + vpshufd $0x4e,%ymm0,%ymm0 + vpxor %ymm0,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x11,%ymm5,%ymm3,%ymm4 + vpclmulqdq $0x01,%ymm1,%ymm6,%ymm0 + vpshufd $0x4e,%ymm1,%ymm1 + vpxor %ymm1,%ymm4,%ymm4 + vpxor %ymm0,%ymm4,%ymm4 + + vmovdqu %ymm3,32(%rdi) + vmovdqu %ymm4,0(%rdi) + + + + vpunpcklqdq %ymm3,%ymm4,%ymm0 + vpunpckhqdq %ymm3,%ymm4,%ymm1 + vpxor %ymm1,%ymm0,%ymm0 + vmovdqu %ymm0,128(%rdi) + + vzeroupper + ret + +.cfi_endproc +.size gcm_init_vpclmulqdq_avx2, . - gcm_init_vpclmulqdq_avx2 +.globl gcm_gmult_vpclmulqdq_avx2 +.hidden gcm_gmult_vpclmulqdq_avx2 +.type gcm_gmult_vpclmulqdq_avx2,@function +.align 32 +gcm_gmult_vpclmulqdq_avx2: +.cfi_startproc + +_CET_ENDBR + + + + vmovdqu (%rdi),%xmm0 + vmovdqu .Lbswap_mask(%rip),%xmm1 + vmovdqu 128-16(%rsi),%xmm2 + vmovdqu .Lgfpoly(%rip),%xmm3 + vpshufb %xmm1,%xmm0,%xmm0 + + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm4 + vpclmulqdq $0x01,%xmm2,%xmm0,%xmm5 + vpclmulqdq $0x10,%xmm2,%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x01,%xmm4,%xmm3,%xmm6 + vpshufd $0x4e,%xmm4,%xmm4 + vpxor %xmm4,%xmm5,%xmm5 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm0 + vpclmulqdq $0x01,%xmm5,%xmm3,%xmm4 + vpshufd $0x4e,%xmm5,%xmm5 + vpxor %xmm5,%xmm0,%xmm0 + vpxor %xmm4,%xmm0,%xmm0 + + + vpshufb %xmm1,%xmm0,%xmm0 + vmovdqu %xmm0,(%rdi) + + + ret + +.cfi_endproc +.size gcm_gmult_vpclmulqdq_avx2, . 
- gcm_gmult_vpclmulqdq_avx2 +.globl gcm_ghash_vpclmulqdq_avx2 +.hidden gcm_ghash_vpclmulqdq_avx2 +.type gcm_ghash_vpclmulqdq_avx2,@function +.align 32 +gcm_ghash_vpclmulqdq_avx2: +.cfi_startproc + +_CET_ENDBR + + + + + + + vmovdqu .Lbswap_mask(%rip),%xmm6 + vmovdqu .Lgfpoly(%rip),%xmm7 + + + vmovdqu (%rdi),%xmm5 + vpshufb %xmm6,%xmm5,%xmm5 + + + cmpq $32,%rcx + jb .Lghash_lastblock + + + + vinserti128 $1,%xmm6,%ymm6,%ymm6 + vinserti128 $1,%xmm7,%ymm7,%ymm7 + + cmpq $127,%rcx + jbe .Lghash_loop_1x + + + vmovdqu 128(%rsi),%ymm8 + vmovdqu 128+32(%rsi),%ymm9 +.Lghash_loop_4x: + + vmovdqu 0(%rdx),%ymm1 + vpshufb %ymm6,%ymm1,%ymm1 + vmovdqu 0(%rsi),%ymm2 + vpxor %ymm5,%ymm1,%ymm1 + vpclmulqdq $0x00,%ymm2,%ymm1,%ymm3 + vpclmulqdq $0x11,%ymm2,%ymm1,%ymm5 + vpunpckhqdq %ymm1,%ymm1,%ymm0 + vpxor %ymm1,%ymm0,%ymm0 + vpclmulqdq $0x00,%ymm8,%ymm0,%ymm4 + + vmovdqu 32(%rdx),%ymm1 + vpshufb %ymm6,%ymm1,%ymm1 + vmovdqu 32(%rsi),%ymm2 + vpclmulqdq $0x00,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpclmulqdq $0x11,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm5,%ymm5 + vpunpckhqdq %ymm1,%ymm1,%ymm0 + vpxor %ymm1,%ymm0,%ymm0 + vpclmulqdq $0x10,%ymm8,%ymm0,%ymm0 + vpxor %ymm0,%ymm4,%ymm4 + + vmovdqu 64(%rdx),%ymm1 + vpshufb %ymm6,%ymm1,%ymm1 + vmovdqu 64(%rsi),%ymm2 + vpclmulqdq $0x00,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpclmulqdq $0x11,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm5,%ymm5 + vpunpckhqdq %ymm1,%ymm1,%ymm0 + vpxor %ymm1,%ymm0,%ymm0 + vpclmulqdq $0x00,%ymm9,%ymm0,%ymm0 + vpxor %ymm0,%ymm4,%ymm4 + + + vmovdqu 96(%rdx),%ymm1 + vpshufb %ymm6,%ymm1,%ymm1 + vmovdqu 96(%rsi),%ymm2 + vpclmulqdq $0x00,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm3,%ymm3 + vpclmulqdq $0x11,%ymm2,%ymm1,%ymm0 + vpxor %ymm0,%ymm5,%ymm5 + vpunpckhqdq %ymm1,%ymm1,%ymm0 + vpxor %ymm1,%ymm0,%ymm0 + vpclmulqdq $0x10,%ymm9,%ymm0,%ymm0 + vpxor %ymm0,%ymm4,%ymm4 + + vpxor %ymm3,%ymm4,%ymm4 + vpxor %ymm5,%ymm4,%ymm4 + + + vbroadcasti128 .Lgfpoly(%rip),%ymm2 + vpclmulqdq $0x01,%ymm3,%ymm2,%ymm0 + vpshufd $0x4e,%ymm3,%ymm3 + 
vpxor %ymm3,%ymm4,%ymm4 + vpxor %ymm0,%ymm4,%ymm4 + + vpclmulqdq $0x01,%ymm4,%ymm2,%ymm0 + vpshufd $0x4e,%ymm4,%ymm4 + vpxor %ymm4,%ymm5,%ymm5 + vpxor %ymm0,%ymm5,%ymm5 + vextracti128 $1,%ymm5,%xmm0 + vpxor %xmm0,%xmm5,%xmm5 + + subq $-128,%rdx + addq $-128,%rcx + cmpq $127,%rcx + ja .Lghash_loop_4x + + + cmpq $32,%rcx + jb .Lghash_loop_1x_done +.Lghash_loop_1x: + vmovdqu (%rdx),%ymm0 + vpshufb %ymm6,%ymm0,%ymm0 + vpxor %ymm0,%ymm5,%ymm5 + vmovdqu 128-32(%rsi),%ymm0 + vpclmulqdq $0x00,%ymm0,%ymm5,%ymm1 + vpclmulqdq $0x01,%ymm0,%ymm5,%ymm2 + vpclmulqdq $0x10,%ymm0,%ymm5,%ymm3 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x01,%ymm1,%ymm7,%ymm3 + vpshufd $0x4e,%ymm1,%ymm1 + vpxor %ymm1,%ymm2,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x11,%ymm0,%ymm5,%ymm5 + vpclmulqdq $0x01,%ymm2,%ymm7,%ymm1 + vpshufd $0x4e,%ymm2,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpxor %ymm1,%ymm5,%ymm5 + + vextracti128 $1,%ymm5,%xmm0 + vpxor %xmm0,%xmm5,%xmm5 + addq $32,%rdx + subq $32,%rcx + cmpq $32,%rcx + jae .Lghash_loop_1x +.Lghash_loop_1x_done: + + +.Lghash_lastblock: + testq %rcx,%rcx + jz .Lghash_done + vmovdqu (%rdx),%xmm0 + vpshufb %xmm6,%xmm0,%xmm0 + vpxor %xmm0,%xmm5,%xmm5 + vmovdqu 128-16(%rsi),%xmm0 + vpclmulqdq $0x00,%xmm0,%xmm5,%xmm1 + vpclmulqdq $0x01,%xmm0,%xmm5,%xmm2 + vpclmulqdq $0x10,%xmm0,%xmm5,%xmm3 + vpxor %xmm3,%xmm2,%xmm2 + vpclmulqdq $0x01,%xmm1,%xmm7,%xmm3 + vpshufd $0x4e,%xmm1,%xmm1 + vpxor %xmm1,%xmm2,%xmm2 + vpxor %xmm3,%xmm2,%xmm2 + vpclmulqdq $0x11,%xmm0,%xmm5,%xmm5 + vpclmulqdq $0x01,%xmm2,%xmm7,%xmm1 + vpshufd $0x4e,%xmm2,%xmm2 + vpxor %xmm2,%xmm5,%xmm5 + vpxor %xmm1,%xmm5,%xmm5 + + +.Lghash_done: + + vpshufb %xmm6,%xmm5,%xmm5 + vmovdqu %xmm5,(%rdi) + + vzeroupper + ret + +.cfi_endproc +.size gcm_ghash_vpclmulqdq_avx2, . 
- gcm_ghash_vpclmulqdq_avx2 +.globl aes_gcm_enc_update_vaes_avx2 +.hidden aes_gcm_enc_update_vaes_avx2 +.type aes_gcm_enc_update_vaes_avx2,@function +.align 32 +aes_gcm_enc_update_vaes_avx2: +.cfi_startproc + +_CET_ENDBR + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-16 + + movq 16(%rsp),%r12 +#ifdef BORINGSSL_DISPATCH_TEST +.extern BORINGSSL_function_hit +.hidden BORINGSSL_function_hit + movb $1,BORINGSSL_function_hit+6(%rip) +#endif + vbroadcasti128 .Lbswap_mask(%rip),%ymm0 + + + + vmovdqu (%r12),%xmm1 + vpshufb %xmm0,%xmm1,%xmm1 + vbroadcasti128 (%r8),%ymm11 + vpshufb %ymm0,%ymm11,%ymm11 + + + + movl 240(%rcx),%r10d + leal -20(,%r10,4),%r10d + + + + + leaq 96(%rcx,%r10,4),%r11 + vbroadcasti128 (%rcx),%ymm9 + vbroadcasti128 (%r11),%ymm10 + + + vpaddd .Lctr_pattern(%rip),%ymm11,%ymm11 + + + + cmpq $127,%rdx + jbe .Lcrypt_loop_4x_done__func1 + + vmovdqu 128(%r9),%ymm7 + vmovdqu 128+32(%r9),%ymm8 + + + + vmovdqu .Linc_2blocks(%rip),%ymm2 + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm14 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm15 + vpaddd %ymm2,%ymm11,%ymm11 + + + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + vpxor %ymm9,%ymm14,%ymm14 + vpxor %ymm9,%ymm15,%ymm15 + + leaq 16(%rcx),%rax +.Lvaesenc_loop_first_4_vecs__func1: + vbroadcasti128 (%rax),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + addq $16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_first_4_vecs__func1 + vpxor 0(%rdi),%ymm10,%ymm2 + vpxor 32(%rdi),%ymm10,%ymm3 + vpxor 64(%rdi),%ymm10,%ymm5 + vpxor 96(%rdi),%ymm10,%ymm6 + vaesenclast %ymm2,%ymm12,%ymm12 + vaesenclast %ymm3,%ymm13,%ymm13 + vaesenclast %ymm5,%ymm14,%ymm14 + vaesenclast %ymm6,%ymm15,%ymm15 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + vmovdqu %ymm14,64(%rsi) + vmovdqu %ymm15,96(%rsi) + + subq $-128,%rdi + addq $-128,%rdx + 
cmpq $127,%rdx + jbe .Lghash_last_ciphertext_4x__func1 +.align 16 +.Lcrypt_loop_4x__func1: + + + + + vmovdqu .Linc_2blocks(%rip),%ymm2 + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm14 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm15 + vpaddd %ymm2,%ymm11,%ymm11 + + + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + vpxor %ymm9,%ymm14,%ymm14 + vpxor %ymm9,%ymm15,%ymm15 + + cmpl $24,%r10d + jl .Laes128__func1 + je .Laes192__func1 + + vbroadcasti128 -208(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vbroadcasti128 -192(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + +.Laes192__func1: + vbroadcasti128 -176(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vbroadcasti128 -160(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + +.Laes128__func1: + prefetcht0 512(%rdi) + prefetcht0 512+64(%rdi) + + vmovdqu 0(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 0(%r9),%ymm4 + vpxor %ymm1,%ymm3,%ymm3 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm7,%ymm2,%ymm6 + + vbroadcasti128 -144(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vbroadcasti128 -128(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vmovdqu 32(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 32(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + 
vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm7,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -112(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vmovdqu 64(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 64(%r9),%ymm4 + + vbroadcasti128 -96(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -80(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + + vmovdqu 96(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + + vbroadcasti128 -64(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vmovdqu 96(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -48(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm1,%ymm6,%ymm6 + + + vbroadcasti128 .Lgfpoly(%rip),%ymm4 + vpclmulqdq $0x01,%ymm5,%ymm4,%ymm2 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -32(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + 
vpclmulqdq $0x01,%ymm6,%ymm4,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -16(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vextracti128 $1,%ymm1,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + + + subq $-128,%rsi + vpxor 0(%rdi),%ymm10,%ymm2 + vpxor 32(%rdi),%ymm10,%ymm3 + vpxor 64(%rdi),%ymm10,%ymm5 + vpxor 96(%rdi),%ymm10,%ymm6 + vaesenclast %ymm2,%ymm12,%ymm12 + vaesenclast %ymm3,%ymm13,%ymm13 + vaesenclast %ymm5,%ymm14,%ymm14 + vaesenclast %ymm6,%ymm15,%ymm15 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + vmovdqu %ymm14,64(%rsi) + vmovdqu %ymm15,96(%rsi) + + subq $-128,%rdi + + addq $-128,%rdx + cmpq $127,%rdx + ja .Lcrypt_loop_4x__func1 +.Lghash_last_ciphertext_4x__func1: + + vmovdqu 0(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 0(%r9),%ymm4 + vpxor %ymm1,%ymm3,%ymm3 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm7,%ymm2,%ymm6 + + vmovdqu 32(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 32(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm7,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vmovdqu 64(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 64(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + + vmovdqu 96(%rsi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 96(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor 
%ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm1,%ymm6,%ymm6 + + + vbroadcasti128 .Lgfpoly(%rip),%ymm4 + vpclmulqdq $0x01,%ymm5,%ymm4,%ymm2 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm2,%ymm6,%ymm6 + + vpclmulqdq $0x01,%ymm6,%ymm4,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + vextracti128 $1,%ymm1,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + + subq $-128,%rsi +.Lcrypt_loop_4x_done__func1: + + testq %rdx,%rdx + jz .Ldone__func1 + + + + + + leaq 128(%r9),%r8 + subq %rdx,%r8 + + + vpxor %xmm5,%xmm5,%xmm5 + vpxor %xmm6,%xmm6,%xmm6 + vpxor %xmm7,%xmm7,%xmm7 + + cmpq $64,%rdx + jb .Llessthan64bytes__func1 + + + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +.Lvaesenc_loop_tail_1__func1: + vbroadcasti128 (%rax),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + addq $16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_tail_1__func1 + vaesenclast %ymm10,%ymm12,%ymm12 + vaesenclast %ymm10,%ymm13,%ymm13 + + + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%ymm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %ymm3,%ymm13,%ymm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + + + vpshufb %ymm0,%ymm12,%ymm12 + vpshufb %ymm0,%ymm13,%ymm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%ymm3 + vpclmulqdq $0x00,%ymm2,%ymm12,%ymm5 + vpclmulqdq $0x01,%ymm2,%ymm12,%ymm6 + vpclmulqdq $0x10,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm2,%ymm12,%ymm7 + vpclmulqdq $0x00,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + + addq $64,%r8 + addq $64,%rdi + addq 
$64,%rsi + subq $64,%rdx + jz .Lreduce__func1 + + vpxor %xmm1,%xmm1,%xmm1 + + +.Llessthan64bytes__func1: + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +.Lvaesenc_loop_tail_2__func1: + vbroadcasti128 (%rax),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + addq $16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_tail_2__func1 + vaesenclast %ymm10,%ymm12,%ymm12 + vaesenclast %ymm10,%ymm13,%ymm13 + + + + + cmpq $32,%rdx + jb .Lxor_one_block__func1 + je .Lxor_two_blocks__func1 + +.Lxor_three_blocks__func1: + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%xmm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %xmm3,%xmm13,%xmm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %xmm13,32(%rsi) + + vpshufb %ymm0,%ymm12,%ymm12 + vpshufb %xmm0,%xmm13,%xmm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%xmm3 + vpclmulqdq $0x00,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm7,%ymm7 + jmp .Lghash_mul_one_vec_unreduced__func1 + +.Lxor_two_blocks__func1: + vmovdqu (%rdi),%ymm2 + vpxor %ymm2,%ymm12,%ymm12 + vmovdqu %ymm12,(%rsi) + vpshufb %ymm0,%ymm12,%ymm12 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + jmp .Lghash_mul_one_vec_unreduced__func1 + +.Lxor_one_block__func1: + vmovdqu (%rdi),%xmm2 + vpxor %xmm2,%xmm12,%xmm12 + vmovdqu %xmm12,(%rsi) + vpshufb %xmm0,%xmm12,%xmm12 + vpxor %xmm1,%xmm12,%xmm12 + vmovdqu (%r8),%xmm2 + +.Lghash_mul_one_vec_unreduced__func1: + vpclmulqdq $0x00,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + +.Lreduce__func1: + + vbroadcasti128 
.Lgfpoly(%rip),%ymm2 + vpclmulqdq $0x01,%ymm5,%ymm2,%ymm3 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpclmulqdq $0x01,%ymm6,%ymm2,%ymm3 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm7,%ymm7 + vpxor %ymm3,%ymm7,%ymm7 + vextracti128 $1,%ymm7,%xmm1 + vpxor %xmm7,%xmm1,%xmm1 + +.Ldone__func1: + + vpshufb %xmm0,%xmm1,%xmm1 + vmovdqu %xmm1,(%r12) + + vzeroupper + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + ret + +.cfi_endproc +.size aes_gcm_enc_update_vaes_avx2, . - aes_gcm_enc_update_vaes_avx2 +.globl aes_gcm_dec_update_vaes_avx2 +.hidden aes_gcm_dec_update_vaes_avx2 +.type aes_gcm_dec_update_vaes_avx2,@function +.align 32 +aes_gcm_dec_update_vaes_avx2: +.cfi_startproc + +_CET_ENDBR + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-16 + + movq 16(%rsp),%r12 + vbroadcasti128 .Lbswap_mask(%rip),%ymm0 + + + + vmovdqu (%r12),%xmm1 + vpshufb %xmm0,%xmm1,%xmm1 + vbroadcasti128 (%r8),%ymm11 + vpshufb %ymm0,%ymm11,%ymm11 + + + + movl 240(%rcx),%r10d + leal -20(,%r10,4),%r10d + + + + + leaq 96(%rcx,%r10,4),%r11 + vbroadcasti128 (%rcx),%ymm9 + vbroadcasti128 (%r11),%ymm10 + + + vpaddd .Lctr_pattern(%rip),%ymm11,%ymm11 + + + + cmpq $127,%rdx + jbe .Lcrypt_loop_4x_done__func2 + + vmovdqu 128(%r9),%ymm7 + vmovdqu 128+32(%r9),%ymm8 +.align 16 +.Lcrypt_loop_4x__func2: + + + + + vmovdqu .Linc_2blocks(%rip),%ymm2 + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm14 + vpaddd %ymm2,%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm15 + vpaddd %ymm2,%ymm11,%ymm11 + + + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + vpxor %ymm9,%ymm14,%ymm14 + vpxor %ymm9,%ymm15,%ymm15 + + cmpl $24,%r10d + jl .Laes128__func2 + je .Laes192__func2 + + vbroadcasti128 -208(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vbroadcasti128 -192(%r11),%ymm2 + vaesenc 
%ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + +.Laes192__func2: + vbroadcasti128 -176(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vbroadcasti128 -160(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + +.Laes128__func2: + prefetcht0 512(%rdi) + prefetcht0 512+64(%rdi) + + vmovdqu 0(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 0(%r9),%ymm4 + vpxor %ymm1,%ymm3,%ymm3 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm7,%ymm2,%ymm6 + + vbroadcasti128 -144(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vbroadcasti128 -128(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vmovdqu 32(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 32(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm7,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -112(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vmovdqu 64(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + vmovdqu 64(%r9),%ymm4 + + vbroadcasti128 -96(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -80(%r11),%ymm2 + vaesenc 
%ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x00,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + + vmovdqu 96(%rdi),%ymm3 + vpshufb %ymm0,%ymm3,%ymm3 + + vbroadcasti128 -64(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vmovdqu 96(%r9),%ymm4 + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm5,%ymm5 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm2 + vpxor %ymm2,%ymm1,%ymm1 + vpunpckhqdq %ymm3,%ymm3,%ymm2 + vpxor %ymm3,%ymm2,%ymm2 + vpclmulqdq $0x10,%ymm8,%ymm2,%ymm2 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -48(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm1,%ymm6,%ymm6 + + + vbroadcasti128 .Lgfpoly(%rip),%ymm4 + vpclmulqdq $0x01,%ymm5,%ymm4,%ymm2 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm2,%ymm6,%ymm6 + + vbroadcasti128 -32(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + + vpclmulqdq $0x01,%ymm6,%ymm4,%ymm2 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm1,%ymm1 + vpxor %ymm2,%ymm1,%ymm1 + + vbroadcasti128 -16(%r11),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + vaesenc %ymm2,%ymm14,%ymm14 + vaesenc %ymm2,%ymm15,%ymm15 + + vextracti128 $1,%ymm1,%xmm2 + vpxor %xmm2,%xmm1,%xmm1 + + + + vpxor 0(%rdi),%ymm10,%ymm2 + vpxor 32(%rdi),%ymm10,%ymm3 + vpxor 64(%rdi),%ymm10,%ymm5 + vpxor 96(%rdi),%ymm10,%ymm6 + vaesenclast %ymm2,%ymm12,%ymm12 + vaesenclast %ymm3,%ymm13,%ymm13 + vaesenclast %ymm5,%ymm14,%ymm14 + vaesenclast %ymm6,%ymm15,%ymm15 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + vmovdqu %ymm14,64(%rsi) + vmovdqu %ymm15,96(%rsi) + + subq $-128,%rdi + subq $-128,%rsi + addq $-128,%rdx + 
cmpq $127,%rdx + ja .Lcrypt_loop_4x__func2 +.Lcrypt_loop_4x_done__func2: + + testq %rdx,%rdx + jz .Ldone__func2 + + + + + + leaq 128(%r9),%r8 + subq %rdx,%r8 + + + vpxor %xmm5,%xmm5,%xmm5 + vpxor %xmm6,%xmm6,%xmm6 + vpxor %xmm7,%xmm7,%xmm7 + + cmpq $64,%rdx + jb .Llessthan64bytes__func2 + + + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +.Lvaesenc_loop_tail_1__func2: + vbroadcasti128 (%rax),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + addq $16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_tail_1__func2 + vaesenclast %ymm10,%ymm12,%ymm12 + vaesenclast %ymm10,%ymm13,%ymm13 + + + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%ymm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %ymm3,%ymm13,%ymm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %ymm13,32(%rsi) + + + vpshufb %ymm0,%ymm2,%ymm12 + vpshufb %ymm0,%ymm3,%ymm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%ymm3 + vpclmulqdq $0x00,%ymm2,%ymm12,%ymm5 + vpclmulqdq $0x01,%ymm2,%ymm12,%ymm6 + vpclmulqdq $0x10,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm2,%ymm12,%ymm7 + vpclmulqdq $0x00,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm3,%ymm13,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + + addq $64,%r8 + addq $64,%rdi + addq $64,%rsi + subq $64,%rdx + jz .Lreduce__func2 + + vpxor %xmm1,%xmm1,%xmm1 + + +.Llessthan64bytes__func2: + vpshufb %ymm0,%ymm11,%ymm12 + vpaddd .Linc_2blocks(%rip),%ymm11,%ymm11 + vpshufb %ymm0,%ymm11,%ymm13 + vpxor %ymm9,%ymm12,%ymm12 + vpxor %ymm9,%ymm13,%ymm13 + leaq 16(%rcx),%rax +.Lvaesenc_loop_tail_2__func2: + vbroadcasti128 (%rax),%ymm2 + vaesenc %ymm2,%ymm12,%ymm12 + vaesenc %ymm2,%ymm13,%ymm13 + addq $16,%rax + cmpq %rax,%r11 + jne 
.Lvaesenc_loop_tail_2__func2 + vaesenclast %ymm10,%ymm12,%ymm12 + vaesenclast %ymm10,%ymm13,%ymm13 + + + + + cmpq $32,%rdx + jb .Lxor_one_block__func2 + je .Lxor_two_blocks__func2 + +.Lxor_three_blocks__func2: + vmovdqu 0(%rdi),%ymm2 + vmovdqu 32(%rdi),%xmm3 + vpxor %ymm2,%ymm12,%ymm12 + vpxor %xmm3,%xmm13,%xmm13 + vmovdqu %ymm12,0(%rsi) + vmovdqu %xmm13,32(%rsi) + + vpshufb %ymm0,%ymm2,%ymm12 + vpshufb %xmm0,%xmm3,%xmm13 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + vmovdqu 32(%r8),%xmm3 + vpclmulqdq $0x00,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%xmm3,%xmm13,%xmm4 + vpxor %ymm4,%ymm7,%ymm7 + jmp .Lghash_mul_one_vec_unreduced__func2 + +.Lxor_two_blocks__func2: + vmovdqu (%rdi),%ymm2 + vpxor %ymm2,%ymm12,%ymm12 + vmovdqu %ymm12,(%rsi) + vpshufb %ymm0,%ymm2,%ymm12 + vpxor %ymm1,%ymm12,%ymm12 + vmovdqu (%r8),%ymm2 + jmp .Lghash_mul_one_vec_unreduced__func2 + +.Lxor_one_block__func2: + vmovdqu (%rdi),%xmm2 + vpxor %xmm2,%xmm12,%xmm12 + vmovdqu %xmm12,(%rsi) + vpshufb %xmm0,%xmm2,%xmm12 + vpxor %xmm1,%xmm12,%xmm12 + vmovdqu (%r8),%xmm2 + +.Lghash_mul_one_vec_unreduced__func2: + vpclmulqdq $0x00,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm5,%ymm5 + vpclmulqdq $0x01,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x10,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm6,%ymm6 + vpclmulqdq $0x11,%ymm2,%ymm12,%ymm4 + vpxor %ymm4,%ymm7,%ymm7 + +.Lreduce__func2: + + vbroadcasti128 .Lgfpoly(%rip),%ymm2 + vpclmulqdq $0x01,%ymm5,%ymm2,%ymm3 + vpshufd $0x4e,%ymm5,%ymm5 + vpxor %ymm5,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpclmulqdq $0x01,%ymm6,%ymm2,%ymm3 + vpshufd $0x4e,%ymm6,%ymm6 + vpxor %ymm6,%ymm7,%ymm7 + vpxor %ymm3,%ymm7,%ymm7 + vextracti128 $1,%ymm7,%xmm1 + vpxor %xmm7,%xmm1,%xmm1 + +.Ldone__func2: + + vpshufb %xmm0,%xmm1,%xmm1 + vmovdqu %xmm1,(%r12) + + vzeroupper + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + 
ret + +.cfi_endproc +.size aes_gcm_dec_update_vaes_avx2, . - aes_gcm_dec_update_vaes_avx2 +#endif diff --git a/third_party/boringssl/gen/bcm/aes-gcm-avx2-x86_64-win.asm b/third_party/boringssl/gen/bcm/aes-gcm-avx2-x86_64-win.asm new file mode 100644 index 00000000..41104322 --- /dev/null +++ b/third_party/boringssl/gen/bcm/aes-gcm-avx2-x86_64-win.asm @@ -0,0 +1,1599 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_internal_x86_64_win_asm.inc" +%endif +section .rdata rdata align=8 +ALIGN 16 + + +$L$bswap_mask: + DQ 0x08090a0b0c0d0e0f,0x0001020304050607 + + + + + + + + +$L$gfpoly: + DQ 1,0xc200000000000000 + + +$L$gfpoly_and_internal_carrybit: + DQ 1,0xc200000000000001 + +ALIGN 32 + +$L$ctr_pattern: + DQ 0,0 + DQ 1,0 +$L$inc_2blocks: + DQ 2,0 + DQ 2,0 + +section .text code align=64 + +global gcm_init_vpclmulqdq_avx2 + +ALIGN 32 +gcm_init_vpclmulqdq_avx2: + +$L$SEH_begin_gcm_init_vpclmulqdq_avx2_1: +_CET_ENDBR + sub rsp,24 +$L$SEH_prologue_gcm_init_vpclmulqdq_avx2_2: + vmovdqa XMMWORD[rsp],xmm6 +$L$SEH_prologue_gcm_init_vpclmulqdq_avx2_3: + +$L$SEH_endprologue_gcm_init_vpclmulqdq_avx2_4: + + + + vpshufd xmm3,XMMWORD[rdx],0x4e + + + + + + vpshufd xmm0,xmm3,0xd3 + vpsrad xmm0,xmm0,31 + vpaddq xmm3,xmm3,xmm3 + vpand xmm0,xmm0,XMMWORD[$L$gfpoly_and_internal_carrybit] + vpxor xmm3,xmm3,xmm0 + + vbroadcasti128 ymm6,XMMWORD[$L$gfpoly] + + + vpclmulqdq xmm0,xmm3,xmm3,0x00 + vpclmulqdq xmm5,xmm3,xmm3,0x11 + vpclmulqdq xmm1,xmm6,xmm0,0x01 + vpshufd xmm0,xmm0,0x4e + vpxor xmm1,xmm1,xmm0 + vpclmulqdq xmm0,xmm6,xmm1,0x01 + vpshufd xmm1,xmm1,0x4e + vpxor xmm5,xmm5,xmm1 + vpxor xmm5,xmm5,xmm0 + + + + vinserti128 ymm3,ymm5,xmm3,1 + vinserti128 ymm5,ymm5,xmm5,1 + + + vpclmulqdq ymm0,ymm3,ymm5,0x00 + vpclmulqdq ymm1,ymm3,ymm5,0x01 + 
vpclmulqdq ymm2,ymm3,ymm5,0x10 + vpxor ymm1,ymm1,ymm2 + vpclmulqdq ymm2,ymm6,ymm0,0x01 + vpshufd ymm0,ymm0,0x4e + vpxor ymm1,ymm1,ymm0 + vpxor ymm1,ymm1,ymm2 + vpclmulqdq ymm4,ymm3,ymm5,0x11 + vpclmulqdq ymm0,ymm6,ymm1,0x01 + vpshufd ymm1,ymm1,0x4e + vpxor ymm4,ymm4,ymm1 + vpxor ymm4,ymm4,ymm0 + + + + vmovdqu YMMWORD[96+rcx],ymm3 + vmovdqu YMMWORD[64+rcx],ymm4 + + + + vpunpcklqdq ymm0,ymm4,ymm3 + vpunpckhqdq ymm1,ymm4,ymm3 + vpxor ymm0,ymm0,ymm1 + vmovdqu YMMWORD[(128+32)+rcx],ymm0 + + + vpclmulqdq ymm0,ymm4,ymm5,0x00 + vpclmulqdq ymm1,ymm4,ymm5,0x01 + vpclmulqdq ymm2,ymm4,ymm5,0x10 + vpxor ymm1,ymm1,ymm2 + vpclmulqdq ymm2,ymm6,ymm0,0x01 + vpshufd ymm0,ymm0,0x4e + vpxor ymm1,ymm1,ymm0 + vpxor ymm1,ymm1,ymm2 + vpclmulqdq ymm3,ymm4,ymm5,0x11 + vpclmulqdq ymm0,ymm6,ymm1,0x01 + vpshufd ymm1,ymm1,0x4e + vpxor ymm3,ymm3,ymm1 + vpxor ymm3,ymm3,ymm0 + + vpclmulqdq ymm0,ymm3,ymm5,0x00 + vpclmulqdq ymm1,ymm3,ymm5,0x01 + vpclmulqdq ymm2,ymm3,ymm5,0x10 + vpxor ymm1,ymm1,ymm2 + vpclmulqdq ymm2,ymm6,ymm0,0x01 + vpshufd ymm0,ymm0,0x4e + vpxor ymm1,ymm1,ymm0 + vpxor ymm1,ymm1,ymm2 + vpclmulqdq ymm4,ymm3,ymm5,0x11 + vpclmulqdq ymm0,ymm6,ymm1,0x01 + vpshufd ymm1,ymm1,0x4e + vpxor ymm4,ymm4,ymm1 + vpxor ymm4,ymm4,ymm0 + + vmovdqu YMMWORD[32+rcx],ymm3 + vmovdqu YMMWORD[rcx],ymm4 + + + + vpunpcklqdq ymm0,ymm4,ymm3 + vpunpckhqdq ymm1,ymm4,ymm3 + vpxor ymm0,ymm0,ymm1 + vmovdqu YMMWORD[128+rcx],ymm0 + + vzeroupper + vmovdqa xmm6,XMMWORD[rsp] + add rsp,24 + ret +$L$SEH_end_gcm_init_vpclmulqdq_avx2_5: + + +global gcm_gmult_vpclmulqdq_avx2 + +ALIGN 32 +gcm_gmult_vpclmulqdq_avx2: + +$L$SEH_begin_gcm_gmult_vpclmulqdq_avx2_1: +_CET_ENDBR + sub rsp,24 +$L$SEH_prologue_gcm_gmult_vpclmulqdq_avx2_2: + vmovdqa XMMWORD[rsp],xmm6 +$L$SEH_prologue_gcm_gmult_vpclmulqdq_avx2_3: + +$L$SEH_endprologue_gcm_gmult_vpclmulqdq_avx2_4: + + vmovdqu xmm0,XMMWORD[rcx] + vmovdqu xmm1,XMMWORD[$L$bswap_mask] + vmovdqu xmm2,XMMWORD[((128-16))+rdx] + vmovdqu xmm3,XMMWORD[$L$gfpoly] + vpshufb xmm0,xmm0,xmm1 + + 
vpclmulqdq xmm4,xmm0,xmm2,0x00 + vpclmulqdq xmm5,xmm0,xmm2,0x01 + vpclmulqdq xmm6,xmm0,xmm2,0x10 + vpxor xmm5,xmm5,xmm6 + vpclmulqdq xmm6,xmm3,xmm4,0x01 + vpshufd xmm4,xmm4,0x4e + vpxor xmm5,xmm5,xmm4 + vpxor xmm5,xmm5,xmm6 + vpclmulqdq xmm0,xmm0,xmm2,0x11 + vpclmulqdq xmm4,xmm3,xmm5,0x01 + vpshufd xmm5,xmm5,0x4e + vpxor xmm0,xmm0,xmm5 + vpxor xmm0,xmm0,xmm4 + + + vpshufb xmm0,xmm0,xmm1 + vmovdqu XMMWORD[rcx],xmm0 + + + vmovdqa xmm6,XMMWORD[rsp] + add rsp,24 + ret +$L$SEH_end_gcm_gmult_vpclmulqdq_avx2_5: + + +global gcm_ghash_vpclmulqdq_avx2 + +ALIGN 32 +gcm_ghash_vpclmulqdq_avx2: + +$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1: +_CET_ENDBR + sub rsp,72 +$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_2: + vmovdqa XMMWORD[rsp],xmm6 +$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_3: + vmovdqa XMMWORD[16+rsp],xmm7 +$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_4: + vmovdqa XMMWORD[32+rsp],xmm8 +$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_5: + vmovdqa XMMWORD[48+rsp],xmm9 +$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_6: + +$L$SEH_endprologue_gcm_ghash_vpclmulqdq_avx2_7: + + + + + vmovdqu xmm6,XMMWORD[$L$bswap_mask] + vmovdqu xmm7,XMMWORD[$L$gfpoly] + + + vmovdqu xmm5,XMMWORD[rcx] + vpshufb xmm5,xmm5,xmm6 + + + cmp r9,32 + jb NEAR $L$ghash_lastblock + + + + vinserti128 ymm6,ymm6,xmm6,1 + vinserti128 ymm7,ymm7,xmm7,1 + + cmp r9,127 + jbe NEAR $L$ghash_loop_1x + + + vmovdqu ymm8,YMMWORD[128+rdx] + vmovdqu ymm9,YMMWORD[((128+32))+rdx] +$L$ghash_loop_4x: + + vmovdqu ymm1,YMMWORD[r8] + vpshufb ymm1,ymm1,ymm6 + vmovdqu ymm2,YMMWORD[rdx] + vpxor ymm1,ymm1,ymm5 + vpclmulqdq ymm3,ymm1,ymm2,0x00 + vpclmulqdq ymm5,ymm1,ymm2,0x11 + vpunpckhqdq ymm0,ymm1,ymm1 + vpxor ymm0,ymm0,ymm1 + vpclmulqdq ymm4,ymm0,ymm8,0x00 + + vmovdqu ymm1,YMMWORD[32+r8] + vpshufb ymm1,ymm1,ymm6 + vmovdqu ymm2,YMMWORD[32+rdx] + vpclmulqdq ymm0,ymm1,ymm2,0x00 + vpxor ymm3,ymm3,ymm0 + vpclmulqdq ymm0,ymm1,ymm2,0x11 + vpxor ymm5,ymm5,ymm0 + vpunpckhqdq ymm0,ymm1,ymm1 + vpxor ymm0,ymm0,ymm1 + vpclmulqdq ymm0,ymm0,ymm8,0x10 + vpxor 
ymm4,ymm4,ymm0 + + vmovdqu ymm1,YMMWORD[64+r8] + vpshufb ymm1,ymm1,ymm6 + vmovdqu ymm2,YMMWORD[64+rdx] + vpclmulqdq ymm0,ymm1,ymm2,0x00 + vpxor ymm3,ymm3,ymm0 + vpclmulqdq ymm0,ymm1,ymm2,0x11 + vpxor ymm5,ymm5,ymm0 + vpunpckhqdq ymm0,ymm1,ymm1 + vpxor ymm0,ymm0,ymm1 + vpclmulqdq ymm0,ymm0,ymm9,0x00 + vpxor ymm4,ymm4,ymm0 + + + vmovdqu ymm1,YMMWORD[96+r8] + vpshufb ymm1,ymm1,ymm6 + vmovdqu ymm2,YMMWORD[96+rdx] + vpclmulqdq ymm0,ymm1,ymm2,0x00 + vpxor ymm3,ymm3,ymm0 + vpclmulqdq ymm0,ymm1,ymm2,0x11 + vpxor ymm5,ymm5,ymm0 + vpunpckhqdq ymm0,ymm1,ymm1 + vpxor ymm0,ymm0,ymm1 + vpclmulqdq ymm0,ymm0,ymm9,0x10 + vpxor ymm4,ymm4,ymm0 + + vpxor ymm4,ymm4,ymm3 + vpxor ymm4,ymm4,ymm5 + + + vbroadcasti128 ymm2,XMMWORD[$L$gfpoly] + vpclmulqdq ymm0,ymm2,ymm3,0x01 + vpshufd ymm3,ymm3,0x4e + vpxor ymm4,ymm4,ymm3 + vpxor ymm4,ymm4,ymm0 + + vpclmulqdq ymm0,ymm2,ymm4,0x01 + vpshufd ymm4,ymm4,0x4e + vpxor ymm5,ymm5,ymm4 + vpxor ymm5,ymm5,ymm0 + vextracti128 xmm0,ymm5,1 + vpxor xmm5,xmm5,xmm0 + + sub r8,-128 + add r9,-128 + cmp r9,127 + ja NEAR $L$ghash_loop_4x + + + cmp r9,32 + jb NEAR $L$ghash_loop_1x_done +$L$ghash_loop_1x: + vmovdqu ymm0,YMMWORD[r8] + vpshufb ymm0,ymm0,ymm6 + vpxor ymm5,ymm5,ymm0 + vmovdqu ymm0,YMMWORD[((128-32))+rdx] + vpclmulqdq ymm1,ymm5,ymm0,0x00 + vpclmulqdq ymm2,ymm5,ymm0,0x01 + vpclmulqdq ymm3,ymm5,ymm0,0x10 + vpxor ymm2,ymm2,ymm3 + vpclmulqdq ymm3,ymm7,ymm1,0x01 + vpshufd ymm1,ymm1,0x4e + vpxor ymm2,ymm2,ymm1 + vpxor ymm2,ymm2,ymm3 + vpclmulqdq ymm5,ymm5,ymm0,0x11 + vpclmulqdq ymm1,ymm7,ymm2,0x01 + vpshufd ymm2,ymm2,0x4e + vpxor ymm5,ymm5,ymm2 + vpxor ymm5,ymm5,ymm1 + + vextracti128 xmm0,ymm5,1 + vpxor xmm5,xmm5,xmm0 + add r8,32 + sub r9,32 + cmp r9,32 + jae NEAR $L$ghash_loop_1x +$L$ghash_loop_1x_done: + + +$L$ghash_lastblock: + test r9,r9 + jz NEAR $L$ghash_done + vmovdqu xmm0,XMMWORD[r8] + vpshufb xmm0,xmm0,xmm6 + vpxor xmm5,xmm5,xmm0 + vmovdqu xmm0,XMMWORD[((128-16))+rdx] + vpclmulqdq xmm1,xmm5,xmm0,0x00 + vpclmulqdq xmm2,xmm5,xmm0,0x01 + vpclmulqdq 
xmm3,xmm5,xmm0,0x10 + vpxor xmm2,xmm2,xmm3 + vpclmulqdq xmm3,xmm7,xmm1,0x01 + vpshufd xmm1,xmm1,0x4e + vpxor xmm2,xmm2,xmm1 + vpxor xmm2,xmm2,xmm3 + vpclmulqdq xmm5,xmm5,xmm0,0x11 + vpclmulqdq xmm1,xmm7,xmm2,0x01 + vpshufd xmm2,xmm2,0x4e + vpxor xmm5,xmm5,xmm2 + vpxor xmm5,xmm5,xmm1 + + +$L$ghash_done: + + vpshufb xmm5,xmm5,xmm6 + vmovdqu XMMWORD[rcx],xmm5 + + vzeroupper + vmovdqa xmm6,XMMWORD[rsp] + vmovdqa xmm7,XMMWORD[16+rsp] + vmovdqa xmm8,XMMWORD[32+rsp] + vmovdqa xmm9,XMMWORD[48+rsp] + add rsp,72 + ret +$L$SEH_end_gcm_ghash_vpclmulqdq_avx2_8: + + +global aes_gcm_enc_update_vaes_avx2 + +ALIGN 32 +aes_gcm_enc_update_vaes_avx2: + +$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1: +_CET_ENDBR + push rsi +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_2: + push rdi +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_3: + push r12 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_4: + + mov rsi,QWORD[64+rsp] + mov rdi,QWORD[72+rsp] + mov r12,QWORD[80+rsp] + sub rsp,160 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_5: + vmovdqa XMMWORD[rsp],xmm6 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_6: + vmovdqa XMMWORD[16+rsp],xmm7 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_7: + vmovdqa XMMWORD[32+rsp],xmm8 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_8: + vmovdqa XMMWORD[48+rsp],xmm9 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_9: + vmovdqa XMMWORD[64+rsp],xmm10 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_10: + vmovdqa XMMWORD[80+rsp],xmm11 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_11: + vmovdqa XMMWORD[96+rsp],xmm12 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_12: + vmovdqa XMMWORD[112+rsp],xmm13 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_13: + vmovdqa XMMWORD[128+rsp],xmm14 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_14: + vmovdqa XMMWORD[144+rsp],xmm15 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_15: + +$L$SEH_endprologue_aes_gcm_enc_update_vaes_avx2_16: +%ifdef BORINGSSL_DISPATCH_TEST +EXTERN BORINGSSL_function_hit + mov BYTE[((BORINGSSL_function_hit+6))],1 +%endif + 
vbroadcasti128 ymm0,XMMWORD[$L$bswap_mask] + + + + vmovdqu xmm1,XMMWORD[r12] + vpshufb xmm1,xmm1,xmm0 + vbroadcasti128 ymm11,XMMWORD[rsi] + vpshufb ymm11,ymm11,ymm0 + + + + mov r10d,DWORD[240+r9] + lea r10d,[((-20))+r10*4] + + + + + lea r11,[96+r10*4+r9] + vbroadcasti128 ymm9,XMMWORD[r9] + vbroadcasti128 ymm10,XMMWORD[r11] + + + vpaddd ymm11,ymm11,YMMWORD[$L$ctr_pattern] + + + + cmp r8,127 + jbe NEAR $L$crypt_loop_4x_done__func1 + + vmovdqu ymm7,YMMWORD[128+rdi] + vmovdqu ymm8,YMMWORD[((128+32))+rdi] + + + + vmovdqu ymm2,YMMWORD[$L$inc_2blocks] + vpshufb ymm12,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + vpshufb ymm13,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + vpshufb ymm14,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + vpshufb ymm15,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + + + vpxor ymm12,ymm12,ymm9 + vpxor ymm13,ymm13,ymm9 + vpxor ymm14,ymm14,ymm9 + vpxor ymm15,ymm15,ymm9 + + lea rax,[16+r9] +$L$vaesenc_loop_first_4_vecs__func1: + vbroadcasti128 ymm2,XMMWORD[rax] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + add rax,16 + cmp r11,rax + jne NEAR $L$vaesenc_loop_first_4_vecs__func1 + vpxor ymm2,ymm10,YMMWORD[rcx] + vpxor ymm3,ymm10,YMMWORD[32+rcx] + vpxor ymm5,ymm10,YMMWORD[64+rcx] + vpxor ymm6,ymm10,YMMWORD[96+rcx] + vaesenclast ymm12,ymm12,ymm2 + vaesenclast ymm13,ymm13,ymm3 + vaesenclast ymm14,ymm14,ymm5 + vaesenclast ymm15,ymm15,ymm6 + vmovdqu YMMWORD[rdx],ymm12 + vmovdqu YMMWORD[32+rdx],ymm13 + vmovdqu YMMWORD[64+rdx],ymm14 + vmovdqu YMMWORD[96+rdx],ymm15 + + sub rcx,-128 + add r8,-128 + cmp r8,127 + jbe NEAR $L$ghash_last_ciphertext_4x__func1 +ALIGN 16 +$L$crypt_loop_4x__func1: + + + + + vmovdqu ymm2,YMMWORD[$L$inc_2blocks] + vpshufb ymm12,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + vpshufb ymm13,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + vpshufb ymm14,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + vpshufb ymm15,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + + + vpxor ymm12,ymm12,ymm9 + vpxor ymm13,ymm13,ymm9 + vpxor ymm14,ymm14,ymm9 + 
vpxor ymm15,ymm15,ymm9 + + cmp r10d,24 + jl NEAR $L$aes128__func1 + je NEAR $L$aes192__func1 + + vbroadcasti128 ymm2,XMMWORD[((-208))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-192))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + +$L$aes192__func1: + vbroadcasti128 ymm2,XMMWORD[((-176))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-160))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + +$L$aes128__func1: + prefetcht0 [512+rcx] + prefetcht0 [((512+64))+rcx] + + vmovdqu ymm3,YMMWORD[rdx] + vpshufb ymm3,ymm3,ymm0 + vmovdqu ymm4,YMMWORD[rdi] + vpxor ymm3,ymm3,ymm1 + vpclmulqdq ymm5,ymm3,ymm4,0x00 + vpclmulqdq ymm1,ymm3,ymm4,0x11 + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + vpclmulqdq ymm6,ymm2,ymm7,0x00 + + vbroadcasti128 ymm2,XMMWORD[((-144))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + + vbroadcasti128 ymm2,XMMWORD[((-128))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + + vmovdqu ymm3,YMMWORD[32+rdx] + vpshufb ymm3,ymm3,ymm0 + vmovdqu ymm4,YMMWORD[32+rdi] + vpclmulqdq ymm2,ymm3,ymm4,0x00 + vpxor ymm5,ymm5,ymm2 + vpclmulqdq ymm2,ymm3,ymm4,0x11 + vpxor ymm1,ymm1,ymm2 + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + vpclmulqdq ymm2,ymm2,ymm7,0x10 + vpxor ymm6,ymm6,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-112))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + + vmovdqu ymm3,YMMWORD[64+rdx] + vpshufb ymm3,ymm3,ymm0 + vmovdqu ymm4,YMMWORD[64+rdi] + + vbroadcasti128 ymm2,XMMWORD[((-96))+r11] + vaesenc ymm12,ymm12,ymm2 + 
vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + vpclmulqdq ymm2,ymm3,ymm4,0x00 + vpxor ymm5,ymm5,ymm2 + vpclmulqdq ymm2,ymm3,ymm4,0x11 + vpxor ymm1,ymm1,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-80))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + vpclmulqdq ymm2,ymm2,ymm8,0x00 + vpxor ymm6,ymm6,ymm2 + + + vmovdqu ymm3,YMMWORD[96+rdx] + vpshufb ymm3,ymm3,ymm0 + + vbroadcasti128 ymm2,XMMWORD[((-64))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + vmovdqu ymm4,YMMWORD[96+rdi] + vpclmulqdq ymm2,ymm3,ymm4,0x00 + vpxor ymm5,ymm5,ymm2 + vpclmulqdq ymm2,ymm3,ymm4,0x11 + vpxor ymm1,ymm1,ymm2 + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + vpclmulqdq ymm2,ymm2,ymm8,0x10 + vpxor ymm6,ymm6,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-48))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + + vpxor ymm6,ymm6,ymm5 + vpxor ymm6,ymm6,ymm1 + + + vbroadcasti128 ymm4,XMMWORD[$L$gfpoly] + vpclmulqdq ymm2,ymm4,ymm5,0x01 + vpshufd ymm5,ymm5,0x4e + vpxor ymm6,ymm6,ymm5 + vpxor ymm6,ymm6,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-32))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + + vpclmulqdq ymm2,ymm4,ymm6,0x01 + vpshufd ymm6,ymm6,0x4e + vpxor ymm1,ymm1,ymm6 + vpxor ymm1,ymm1,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-16))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + vextracti128 xmm2,ymm1,1 + vpxor xmm1,xmm1,xmm2 + + + sub rdx,-128 + vpxor ymm2,ymm10,YMMWORD[rcx] + vpxor ymm3,ymm10,YMMWORD[32+rcx] + vpxor ymm5,ymm10,YMMWORD[64+rcx] + vpxor ymm6,ymm10,YMMWORD[96+rcx] + vaesenclast ymm12,ymm12,ymm2 + vaesenclast ymm13,ymm13,ymm3 + vaesenclast ymm14,ymm14,ymm5 + 
vaesenclast ymm15,ymm15,ymm6 + vmovdqu YMMWORD[rdx],ymm12 + vmovdqu YMMWORD[32+rdx],ymm13 + vmovdqu YMMWORD[64+rdx],ymm14 + vmovdqu YMMWORD[96+rdx],ymm15 + + sub rcx,-128 + + add r8,-128 + cmp r8,127 + ja NEAR $L$crypt_loop_4x__func1 +$L$ghash_last_ciphertext_4x__func1: + + vmovdqu ymm3,YMMWORD[rdx] + vpshufb ymm3,ymm3,ymm0 + vmovdqu ymm4,YMMWORD[rdi] + vpxor ymm3,ymm3,ymm1 + vpclmulqdq ymm5,ymm3,ymm4,0x00 + vpclmulqdq ymm1,ymm3,ymm4,0x11 + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + vpclmulqdq ymm6,ymm2,ymm7,0x00 + + vmovdqu ymm3,YMMWORD[32+rdx] + vpshufb ymm3,ymm3,ymm0 + vmovdqu ymm4,YMMWORD[32+rdi] + vpclmulqdq ymm2,ymm3,ymm4,0x00 + vpxor ymm5,ymm5,ymm2 + vpclmulqdq ymm2,ymm3,ymm4,0x11 + vpxor ymm1,ymm1,ymm2 + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + vpclmulqdq ymm2,ymm2,ymm7,0x10 + vpxor ymm6,ymm6,ymm2 + + vmovdqu ymm3,YMMWORD[64+rdx] + vpshufb ymm3,ymm3,ymm0 + vmovdqu ymm4,YMMWORD[64+rdi] + vpclmulqdq ymm2,ymm3,ymm4,0x00 + vpxor ymm5,ymm5,ymm2 + vpclmulqdq ymm2,ymm3,ymm4,0x11 + vpxor ymm1,ymm1,ymm2 + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + vpclmulqdq ymm2,ymm2,ymm8,0x00 + vpxor ymm6,ymm6,ymm2 + + + vmovdqu ymm3,YMMWORD[96+rdx] + vpshufb ymm3,ymm3,ymm0 + vmovdqu ymm4,YMMWORD[96+rdi] + vpclmulqdq ymm2,ymm3,ymm4,0x00 + vpxor ymm5,ymm5,ymm2 + vpclmulqdq ymm2,ymm3,ymm4,0x11 + vpxor ymm1,ymm1,ymm2 + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + vpclmulqdq ymm2,ymm2,ymm8,0x10 + vpxor ymm6,ymm6,ymm2 + + vpxor ymm6,ymm6,ymm5 + vpxor ymm6,ymm6,ymm1 + + + vbroadcasti128 ymm4,XMMWORD[$L$gfpoly] + vpclmulqdq ymm2,ymm4,ymm5,0x01 + vpshufd ymm5,ymm5,0x4e + vpxor ymm6,ymm6,ymm5 + vpxor ymm6,ymm6,ymm2 + + vpclmulqdq ymm2,ymm4,ymm6,0x01 + vpshufd ymm6,ymm6,0x4e + vpxor ymm1,ymm1,ymm6 + vpxor ymm1,ymm1,ymm2 + vextracti128 xmm2,ymm1,1 + vpxor xmm1,xmm1,xmm2 + + sub rdx,-128 +$L$crypt_loop_4x_done__func1: + + test r8,r8 + jz NEAR $L$done__func1 + + + + + + lea rsi,[128+rdi] + sub rsi,r8 + + + vpxor xmm5,xmm5,xmm5 + vpxor xmm6,xmm6,xmm6 + vpxor 
xmm7,xmm7,xmm7 + + cmp r8,64 + jb NEAR $L$lessthan64bytes__func1 + + + vpshufb ymm12,ymm11,ymm0 + vpaddd ymm11,ymm11,YMMWORD[$L$inc_2blocks] + vpshufb ymm13,ymm11,ymm0 + vpaddd ymm11,ymm11,YMMWORD[$L$inc_2blocks] + vpxor ymm12,ymm12,ymm9 + vpxor ymm13,ymm13,ymm9 + lea rax,[16+r9] +$L$vaesenc_loop_tail_1__func1: + vbroadcasti128 ymm2,XMMWORD[rax] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + add rax,16 + cmp r11,rax + jne NEAR $L$vaesenc_loop_tail_1__func1 + vaesenclast ymm12,ymm12,ymm10 + vaesenclast ymm13,ymm13,ymm10 + + + vmovdqu ymm2,YMMWORD[rcx] + vmovdqu ymm3,YMMWORD[32+rcx] + vpxor ymm12,ymm12,ymm2 + vpxor ymm13,ymm13,ymm3 + vmovdqu YMMWORD[rdx],ymm12 + vmovdqu YMMWORD[32+rdx],ymm13 + + + vpshufb ymm12,ymm12,ymm0 + vpshufb ymm13,ymm13,ymm0 + vpxor ymm12,ymm12,ymm1 + vmovdqu ymm2,YMMWORD[rsi] + vmovdqu ymm3,YMMWORD[32+rsi] + vpclmulqdq ymm5,ymm12,ymm2,0x00 + vpclmulqdq ymm6,ymm12,ymm2,0x01 + vpclmulqdq ymm4,ymm12,ymm2,0x10 + vpxor ymm6,ymm6,ymm4 + vpclmulqdq ymm7,ymm12,ymm2,0x11 + vpclmulqdq ymm4,ymm13,ymm3,0x00 + vpxor ymm5,ymm5,ymm4 + vpclmulqdq ymm4,ymm13,ymm3,0x01 + vpxor ymm6,ymm6,ymm4 + vpclmulqdq ymm4,ymm13,ymm3,0x10 + vpxor ymm6,ymm6,ymm4 + vpclmulqdq ymm4,ymm13,ymm3,0x11 + vpxor ymm7,ymm7,ymm4 + + add rsi,64 + add rcx,64 + add rdx,64 + sub r8,64 + jz NEAR $L$reduce__func1 + + vpxor xmm1,xmm1,xmm1 + + +$L$lessthan64bytes__func1: + vpshufb ymm12,ymm11,ymm0 + vpaddd ymm11,ymm11,YMMWORD[$L$inc_2blocks] + vpshufb ymm13,ymm11,ymm0 + vpxor ymm12,ymm12,ymm9 + vpxor ymm13,ymm13,ymm9 + lea rax,[16+r9] +$L$vaesenc_loop_tail_2__func1: + vbroadcasti128 ymm2,XMMWORD[rax] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + add rax,16 + cmp r11,rax + jne NEAR $L$vaesenc_loop_tail_2__func1 + vaesenclast ymm12,ymm12,ymm10 + vaesenclast ymm13,ymm13,ymm10 + + + + + cmp r8,32 + jb NEAR $L$xor_one_block__func1 + je NEAR $L$xor_two_blocks__func1 + +$L$xor_three_blocks__func1: + vmovdqu ymm2,YMMWORD[rcx] + vmovdqu xmm3,XMMWORD[32+rcx] + vpxor ymm12,ymm12,ymm2 + 
vpxor xmm13,xmm13,xmm3 + vmovdqu YMMWORD[rdx],ymm12 + vmovdqu XMMWORD[32+rdx],xmm13 + + vpshufb ymm12,ymm12,ymm0 + vpshufb xmm13,xmm13,xmm0 + vpxor ymm12,ymm12,ymm1 + vmovdqu ymm2,YMMWORD[rsi] + vmovdqu xmm3,XMMWORD[32+rsi] + vpclmulqdq xmm4,xmm13,xmm3,0x00 + vpxor ymm5,ymm5,ymm4 + vpclmulqdq xmm4,xmm13,xmm3,0x01 + vpxor ymm6,ymm6,ymm4 + vpclmulqdq xmm4,xmm13,xmm3,0x10 + vpxor ymm6,ymm6,ymm4 + vpclmulqdq xmm4,xmm13,xmm3,0x11 + vpxor ymm7,ymm7,ymm4 + jmp NEAR $L$ghash_mul_one_vec_unreduced__func1 + +$L$xor_two_blocks__func1: + vmovdqu ymm2,YMMWORD[rcx] + vpxor ymm12,ymm12,ymm2 + vmovdqu YMMWORD[rdx],ymm12 + vpshufb ymm12,ymm12,ymm0 + vpxor ymm12,ymm12,ymm1 + vmovdqu ymm2,YMMWORD[rsi] + jmp NEAR $L$ghash_mul_one_vec_unreduced__func1 + +$L$xor_one_block__func1: + vmovdqu xmm2,XMMWORD[rcx] + vpxor xmm12,xmm12,xmm2 + vmovdqu XMMWORD[rdx],xmm12 + vpshufb xmm12,xmm12,xmm0 + vpxor xmm12,xmm12,xmm1 + vmovdqu xmm2,XMMWORD[rsi] + +$L$ghash_mul_one_vec_unreduced__func1: + vpclmulqdq ymm4,ymm12,ymm2,0x00 + vpxor ymm5,ymm5,ymm4 + vpclmulqdq ymm4,ymm12,ymm2,0x01 + vpxor ymm6,ymm6,ymm4 + vpclmulqdq ymm4,ymm12,ymm2,0x10 + vpxor ymm6,ymm6,ymm4 + vpclmulqdq ymm4,ymm12,ymm2,0x11 + vpxor ymm7,ymm7,ymm4 + +$L$reduce__func1: + + vbroadcasti128 ymm2,XMMWORD[$L$gfpoly] + vpclmulqdq ymm3,ymm2,ymm5,0x01 + vpshufd ymm5,ymm5,0x4e + vpxor ymm6,ymm6,ymm5 + vpxor ymm6,ymm6,ymm3 + vpclmulqdq ymm3,ymm2,ymm6,0x01 + vpshufd ymm6,ymm6,0x4e + vpxor ymm7,ymm7,ymm6 + vpxor ymm7,ymm7,ymm3 + vextracti128 xmm1,ymm7,1 + vpxor xmm1,xmm1,xmm7 + +$L$done__func1: + + vpshufb xmm1,xmm1,xmm0 + vmovdqu XMMWORD[r12],xmm1 + + vzeroupper + vmovdqa xmm6,XMMWORD[rsp] + vmovdqa xmm7,XMMWORD[16+rsp] + vmovdqa xmm8,XMMWORD[32+rsp] + vmovdqa xmm9,XMMWORD[48+rsp] + vmovdqa xmm10,XMMWORD[64+rsp] + vmovdqa xmm11,XMMWORD[80+rsp] + vmovdqa xmm12,XMMWORD[96+rsp] + vmovdqa xmm13,XMMWORD[112+rsp] + vmovdqa xmm14,XMMWORD[128+rsp] + vmovdqa xmm15,XMMWORD[144+rsp] + add rsp,160 + pop r12 + pop rdi + pop rsi + ret 
+$L$SEH_end_aes_gcm_enc_update_vaes_avx2_17: + + +global aes_gcm_dec_update_vaes_avx2 + +ALIGN 32 +aes_gcm_dec_update_vaes_avx2: + +$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1: +_CET_ENDBR + push rsi +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_2: + push rdi +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_3: + push r12 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_4: + + mov rsi,QWORD[64+rsp] + mov rdi,QWORD[72+rsp] + mov r12,QWORD[80+rsp] + sub rsp,160 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_5: + vmovdqa XMMWORD[rsp],xmm6 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_6: + vmovdqa XMMWORD[16+rsp],xmm7 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_7: + vmovdqa XMMWORD[32+rsp],xmm8 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_8: + vmovdqa XMMWORD[48+rsp],xmm9 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_9: + vmovdqa XMMWORD[64+rsp],xmm10 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_10: + vmovdqa XMMWORD[80+rsp],xmm11 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_11: + vmovdqa XMMWORD[96+rsp],xmm12 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_12: + vmovdqa XMMWORD[112+rsp],xmm13 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_13: + vmovdqa XMMWORD[128+rsp],xmm14 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_14: + vmovdqa XMMWORD[144+rsp],xmm15 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_15: + +$L$SEH_endprologue_aes_gcm_dec_update_vaes_avx2_16: + vbroadcasti128 ymm0,XMMWORD[$L$bswap_mask] + + + + vmovdqu xmm1,XMMWORD[r12] + vpshufb xmm1,xmm1,xmm0 + vbroadcasti128 ymm11,XMMWORD[rsi] + vpshufb ymm11,ymm11,ymm0 + + + + mov r10d,DWORD[240+r9] + lea r10d,[((-20))+r10*4] + + + + + lea r11,[96+r10*4+r9] + vbroadcasti128 ymm9,XMMWORD[r9] + vbroadcasti128 ymm10,XMMWORD[r11] + + + vpaddd ymm11,ymm11,YMMWORD[$L$ctr_pattern] + + + + cmp r8,127 + jbe NEAR $L$crypt_loop_4x_done__func2 + + vmovdqu ymm7,YMMWORD[128+rdi] + vmovdqu ymm8,YMMWORD[((128+32))+rdi] +ALIGN 16 +$L$crypt_loop_4x__func2: + + + + + vmovdqu ymm2,YMMWORD[$L$inc_2blocks] + vpshufb ymm12,ymm11,ymm0 + vpaddd 
ymm11,ymm11,ymm2 + vpshufb ymm13,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + vpshufb ymm14,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + vpshufb ymm15,ymm11,ymm0 + vpaddd ymm11,ymm11,ymm2 + + + vpxor ymm12,ymm12,ymm9 + vpxor ymm13,ymm13,ymm9 + vpxor ymm14,ymm14,ymm9 + vpxor ymm15,ymm15,ymm9 + + cmp r10d,24 + jl NEAR $L$aes128__func2 + je NEAR $L$aes192__func2 + + vbroadcasti128 ymm2,XMMWORD[((-208))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-192))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + +$L$aes192__func2: + vbroadcasti128 ymm2,XMMWORD[((-176))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-160))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + +$L$aes128__func2: + prefetcht0 [512+rcx] + prefetcht0 [((512+64))+rcx] + + vmovdqu ymm3,YMMWORD[rcx] + vpshufb ymm3,ymm3,ymm0 + vmovdqu ymm4,YMMWORD[rdi] + vpxor ymm3,ymm3,ymm1 + vpclmulqdq ymm5,ymm3,ymm4,0x00 + vpclmulqdq ymm1,ymm3,ymm4,0x11 + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + vpclmulqdq ymm6,ymm2,ymm7,0x00 + + vbroadcasti128 ymm2,XMMWORD[((-144))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + + vbroadcasti128 ymm2,XMMWORD[((-128))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + + vmovdqu ymm3,YMMWORD[32+rcx] + vpshufb ymm3,ymm3,ymm0 + vmovdqu ymm4,YMMWORD[32+rdi] + vpclmulqdq ymm2,ymm3,ymm4,0x00 + vpxor ymm5,ymm5,ymm2 + vpclmulqdq ymm2,ymm3,ymm4,0x11 + vpxor ymm1,ymm1,ymm2 + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + vpclmulqdq ymm2,ymm2,ymm7,0x10 + vpxor ymm6,ymm6,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-112))+r11] + vaesenc 
ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + + vmovdqu ymm3,YMMWORD[64+rcx] + vpshufb ymm3,ymm3,ymm0 + vmovdqu ymm4,YMMWORD[64+rdi] + + vbroadcasti128 ymm2,XMMWORD[((-96))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + vpclmulqdq ymm2,ymm3,ymm4,0x00 + vpxor ymm5,ymm5,ymm2 + vpclmulqdq ymm2,ymm3,ymm4,0x11 + vpxor ymm1,ymm1,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-80))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + vpclmulqdq ymm2,ymm2,ymm8,0x00 + vpxor ymm6,ymm6,ymm2 + + + vmovdqu ymm3,YMMWORD[96+rcx] + vpshufb ymm3,ymm3,ymm0 + + vbroadcasti128 ymm2,XMMWORD[((-64))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + vmovdqu ymm4,YMMWORD[96+rdi] + vpclmulqdq ymm2,ymm3,ymm4,0x00 + vpxor ymm5,ymm5,ymm2 + vpclmulqdq ymm2,ymm3,ymm4,0x11 + vpxor ymm1,ymm1,ymm2 + vpunpckhqdq ymm2,ymm3,ymm3 + vpxor ymm2,ymm2,ymm3 + vpclmulqdq ymm2,ymm2,ymm8,0x10 + vpxor ymm6,ymm6,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-48))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + + vpxor ymm6,ymm6,ymm5 + vpxor ymm6,ymm6,ymm1 + + + vbroadcasti128 ymm4,XMMWORD[$L$gfpoly] + vpclmulqdq ymm2,ymm4,ymm5,0x01 + vpshufd ymm5,ymm5,0x4e + vpxor ymm6,ymm6,ymm5 + vpxor ymm6,ymm6,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-32))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + + vpclmulqdq ymm2,ymm4,ymm6,0x01 + vpshufd ymm6,ymm6,0x4e + vpxor ymm1,ymm1,ymm6 + vpxor ymm1,ymm1,ymm2 + + vbroadcasti128 ymm2,XMMWORD[((-16))+r11] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + vaesenc ymm14,ymm14,ymm2 + vaesenc ymm15,ymm15,ymm2 + + vextracti128 xmm2,ymm1,1 + vpxor 
xmm1,xmm1,xmm2 + + + + vpxor ymm2,ymm10,YMMWORD[rcx] + vpxor ymm3,ymm10,YMMWORD[32+rcx] + vpxor ymm5,ymm10,YMMWORD[64+rcx] + vpxor ymm6,ymm10,YMMWORD[96+rcx] + vaesenclast ymm12,ymm12,ymm2 + vaesenclast ymm13,ymm13,ymm3 + vaesenclast ymm14,ymm14,ymm5 + vaesenclast ymm15,ymm15,ymm6 + vmovdqu YMMWORD[rdx],ymm12 + vmovdqu YMMWORD[32+rdx],ymm13 + vmovdqu YMMWORD[64+rdx],ymm14 + vmovdqu YMMWORD[96+rdx],ymm15 + + sub rcx,-128 + sub rdx,-128 + add r8,-128 + cmp r8,127 + ja NEAR $L$crypt_loop_4x__func2 +$L$crypt_loop_4x_done__func2: + + test r8,r8 + jz NEAR $L$done__func2 + + + + + + lea rsi,[128+rdi] + sub rsi,r8 + + + vpxor xmm5,xmm5,xmm5 + vpxor xmm6,xmm6,xmm6 + vpxor xmm7,xmm7,xmm7 + + cmp r8,64 + jb NEAR $L$lessthan64bytes__func2 + + + vpshufb ymm12,ymm11,ymm0 + vpaddd ymm11,ymm11,YMMWORD[$L$inc_2blocks] + vpshufb ymm13,ymm11,ymm0 + vpaddd ymm11,ymm11,YMMWORD[$L$inc_2blocks] + vpxor ymm12,ymm12,ymm9 + vpxor ymm13,ymm13,ymm9 + lea rax,[16+r9] +$L$vaesenc_loop_tail_1__func2: + vbroadcasti128 ymm2,XMMWORD[rax] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + add rax,16 + cmp r11,rax + jne NEAR $L$vaesenc_loop_tail_1__func2 + vaesenclast ymm12,ymm12,ymm10 + vaesenclast ymm13,ymm13,ymm10 + + + vmovdqu ymm2,YMMWORD[rcx] + vmovdqu ymm3,YMMWORD[32+rcx] + vpxor ymm12,ymm12,ymm2 + vpxor ymm13,ymm13,ymm3 + vmovdqu YMMWORD[rdx],ymm12 + vmovdqu YMMWORD[32+rdx],ymm13 + + + vpshufb ymm12,ymm2,ymm0 + vpshufb ymm13,ymm3,ymm0 + vpxor ymm12,ymm12,ymm1 + vmovdqu ymm2,YMMWORD[rsi] + vmovdqu ymm3,YMMWORD[32+rsi] + vpclmulqdq ymm5,ymm12,ymm2,0x00 + vpclmulqdq ymm6,ymm12,ymm2,0x01 + vpclmulqdq ymm4,ymm12,ymm2,0x10 + vpxor ymm6,ymm6,ymm4 + vpclmulqdq ymm7,ymm12,ymm2,0x11 + vpclmulqdq ymm4,ymm13,ymm3,0x00 + vpxor ymm5,ymm5,ymm4 + vpclmulqdq ymm4,ymm13,ymm3,0x01 + vpxor ymm6,ymm6,ymm4 + vpclmulqdq ymm4,ymm13,ymm3,0x10 + vpxor ymm6,ymm6,ymm4 + vpclmulqdq ymm4,ymm13,ymm3,0x11 + vpxor ymm7,ymm7,ymm4 + + add rsi,64 + add rcx,64 + add rdx,64 + sub r8,64 + jz NEAR $L$reduce__func2 + + vpxor 
xmm1,xmm1,xmm1 + + +$L$lessthan64bytes__func2: + vpshufb ymm12,ymm11,ymm0 + vpaddd ymm11,ymm11,YMMWORD[$L$inc_2blocks] + vpshufb ymm13,ymm11,ymm0 + vpxor ymm12,ymm12,ymm9 + vpxor ymm13,ymm13,ymm9 + lea rax,[16+r9] +$L$vaesenc_loop_tail_2__func2: + vbroadcasti128 ymm2,XMMWORD[rax] + vaesenc ymm12,ymm12,ymm2 + vaesenc ymm13,ymm13,ymm2 + add rax,16 + cmp r11,rax + jne NEAR $L$vaesenc_loop_tail_2__func2 + vaesenclast ymm12,ymm12,ymm10 + vaesenclast ymm13,ymm13,ymm10 + + + + + cmp r8,32 + jb NEAR $L$xor_one_block__func2 + je NEAR $L$xor_two_blocks__func2 + +$L$xor_three_blocks__func2: + vmovdqu ymm2,YMMWORD[rcx] + vmovdqu xmm3,XMMWORD[32+rcx] + vpxor ymm12,ymm12,ymm2 + vpxor xmm13,xmm13,xmm3 + vmovdqu YMMWORD[rdx],ymm12 + vmovdqu XMMWORD[32+rdx],xmm13 + + vpshufb ymm12,ymm2,ymm0 + vpshufb xmm13,xmm3,xmm0 + vpxor ymm12,ymm12,ymm1 + vmovdqu ymm2,YMMWORD[rsi] + vmovdqu xmm3,XMMWORD[32+rsi] + vpclmulqdq xmm4,xmm13,xmm3,0x00 + vpxor ymm5,ymm5,ymm4 + vpclmulqdq xmm4,xmm13,xmm3,0x01 + vpxor ymm6,ymm6,ymm4 + vpclmulqdq xmm4,xmm13,xmm3,0x10 + vpxor ymm6,ymm6,ymm4 + vpclmulqdq xmm4,xmm13,xmm3,0x11 + vpxor ymm7,ymm7,ymm4 + jmp NEAR $L$ghash_mul_one_vec_unreduced__func2 + +$L$xor_two_blocks__func2: + vmovdqu ymm2,YMMWORD[rcx] + vpxor ymm12,ymm12,ymm2 + vmovdqu YMMWORD[rdx],ymm12 + vpshufb ymm12,ymm2,ymm0 + vpxor ymm12,ymm12,ymm1 + vmovdqu ymm2,YMMWORD[rsi] + jmp NEAR $L$ghash_mul_one_vec_unreduced__func2 + +$L$xor_one_block__func2: + vmovdqu xmm2,XMMWORD[rcx] + vpxor xmm12,xmm12,xmm2 + vmovdqu XMMWORD[rdx],xmm12 + vpshufb xmm12,xmm2,xmm0 + vpxor xmm12,xmm12,xmm1 + vmovdqu xmm2,XMMWORD[rsi] + +$L$ghash_mul_one_vec_unreduced__func2: + vpclmulqdq ymm4,ymm12,ymm2,0x00 + vpxor ymm5,ymm5,ymm4 + vpclmulqdq ymm4,ymm12,ymm2,0x01 + vpxor ymm6,ymm6,ymm4 + vpclmulqdq ymm4,ymm12,ymm2,0x10 + vpxor ymm6,ymm6,ymm4 + vpclmulqdq ymm4,ymm12,ymm2,0x11 + vpxor ymm7,ymm7,ymm4 + +$L$reduce__func2: + + vbroadcasti128 ymm2,XMMWORD[$L$gfpoly] + vpclmulqdq ymm3,ymm2,ymm5,0x01 + vpshufd ymm5,ymm5,0x4e + vpxor 
ymm6,ymm6,ymm5 + vpxor ymm6,ymm6,ymm3 + vpclmulqdq ymm3,ymm2,ymm6,0x01 + vpshufd ymm6,ymm6,0x4e + vpxor ymm7,ymm7,ymm6 + vpxor ymm7,ymm7,ymm3 + vextracti128 xmm1,ymm7,1 + vpxor xmm1,xmm1,xmm7 + +$L$done__func2: + + vpshufb xmm1,xmm1,xmm0 + vmovdqu XMMWORD[r12],xmm1 + + vzeroupper + vmovdqa xmm6,XMMWORD[rsp] + vmovdqa xmm7,XMMWORD[16+rsp] + vmovdqa xmm8,XMMWORD[32+rsp] + vmovdqa xmm9,XMMWORD[48+rsp] + vmovdqa xmm10,XMMWORD[64+rsp] + vmovdqa xmm11,XMMWORD[80+rsp] + vmovdqa xmm12,XMMWORD[96+rsp] + vmovdqa xmm13,XMMWORD[112+rsp] + vmovdqa xmm14,XMMWORD[128+rsp] + vmovdqa xmm15,XMMWORD[144+rsp] + add rsp,160 + pop r12 + pop rdi + pop rsi + ret +$L$SEH_end_aes_gcm_dec_update_vaes_avx2_17: + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_gcm_init_vpclmulqdq_avx2_1 wrt ..imagebase + DD $L$SEH_end_gcm_init_vpclmulqdq_avx2_5 wrt ..imagebase + DD $L$SEH_info_gcm_init_vpclmulqdq_avx2_0 wrt ..imagebase + + DD $L$SEH_begin_gcm_gmult_vpclmulqdq_avx2_1 wrt ..imagebase + DD $L$SEH_end_gcm_gmult_vpclmulqdq_avx2_5 wrt ..imagebase + DD $L$SEH_info_gcm_gmult_vpclmulqdq_avx2_0 wrt ..imagebase + + DD $L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1 wrt ..imagebase + DD $L$SEH_end_gcm_ghash_vpclmulqdq_avx2_8 wrt ..imagebase + DD $L$SEH_info_gcm_ghash_vpclmulqdq_avx2_0 wrt ..imagebase + + DD $L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 wrt ..imagebase + DD $L$SEH_end_aes_gcm_enc_update_vaes_avx2_17 wrt ..imagebase + DD $L$SEH_info_aes_gcm_enc_update_vaes_avx2_0 wrt ..imagebase + + DD $L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 wrt ..imagebase + DD $L$SEH_end_aes_gcm_dec_update_vaes_avx2_17 wrt ..imagebase + DD $L$SEH_info_aes_gcm_dec_update_vaes_avx2_0 wrt ..imagebase + + +section .xdata rdata align=8 +ALIGN 4 +$L$SEH_info_gcm_init_vpclmulqdq_avx2_0: + DB 1 + DB $L$SEH_endprologue_gcm_init_vpclmulqdq_avx2_4-$L$SEH_begin_gcm_init_vpclmulqdq_avx2_1 + DB 3 + DB 0 + DB $L$SEH_prologue_gcm_init_vpclmulqdq_avx2_3-$L$SEH_begin_gcm_init_vpclmulqdq_avx2_1 + DB 104 + DW 0 + DB 
$L$SEH_prologue_gcm_init_vpclmulqdq_avx2_2-$L$SEH_begin_gcm_init_vpclmulqdq_avx2_1 + DB 34 + + DW 0 +$L$SEH_info_gcm_gmult_vpclmulqdq_avx2_0: + DB 1 + DB $L$SEH_endprologue_gcm_gmult_vpclmulqdq_avx2_4-$L$SEH_begin_gcm_gmult_vpclmulqdq_avx2_1 + DB 3 + DB 0 + DB $L$SEH_prologue_gcm_gmult_vpclmulqdq_avx2_3-$L$SEH_begin_gcm_gmult_vpclmulqdq_avx2_1 + DB 104 + DW 0 + DB $L$SEH_prologue_gcm_gmult_vpclmulqdq_avx2_2-$L$SEH_begin_gcm_gmult_vpclmulqdq_avx2_1 + DB 34 + + DW 0 +$L$SEH_info_gcm_ghash_vpclmulqdq_avx2_0: + DB 1 + DB $L$SEH_endprologue_gcm_ghash_vpclmulqdq_avx2_7-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1 + DB 9 + DB 0 + DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_6-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1 + DB 152 + DW 3 + DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_5-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1 + DB 136 + DW 2 + DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_4-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1 + DB 120 + DW 1 + DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_3-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1 + DB 104 + DW 0 + DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx2_2-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx2_1 + DB 130 + + DW 0 +$L$SEH_info_aes_gcm_enc_update_vaes_avx2_0: + DB 1 + DB $L$SEH_endprologue_aes_gcm_enc_update_vaes_avx2_16-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 25 + DB 0 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_15-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 248 + DW 9 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_14-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 232 + DW 8 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_13-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 216 + DW 7 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_12-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 200 + DW 6 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_11-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 184 + DW 5 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_10-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 168 + DW 
4 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_9-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 152 + DW 3 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_8-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 136 + DW 2 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_7-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 120 + DW 1 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_6-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 104 + DW 0 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_5-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 1 + DW 20 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_4-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 192 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_3-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 112 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx2_2-$L$SEH_begin_aes_gcm_enc_update_vaes_avx2_1 + DB 96 + + DW 0 +$L$SEH_info_aes_gcm_dec_update_vaes_avx2_0: + DB 1 + DB $L$SEH_endprologue_aes_gcm_dec_update_vaes_avx2_16-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 25 + DB 0 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_15-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 248 + DW 9 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_14-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 232 + DW 8 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_13-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 216 + DW 7 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_12-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 200 + DW 6 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_11-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 184 + DW 5 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_10-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 168 + DW 4 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_9-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 152 + DW 3 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_8-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 136 + DW 2 + DB 
$L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_7-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 120 + DW 1 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_6-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 104 + DW 0 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_5-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 1 + DW 20 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_4-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 192 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_3-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 112 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx2_2-$L$SEH_begin_aes_gcm_dec_update_vaes_avx2_1 + DB 96 + + DW 0 +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/third_party/boringssl/gen/bcm/aes-gcm-avx512-x86_64-apple.S b/third_party/boringssl/gen/bcm/aes-gcm-avx512-x86_64-apple.S new file mode 100644 index 00000000..874ec0a4 --- /dev/null +++ b/third_party/boringssl/gen/bcm/aes-gcm-avx512-x86_64-apple.S @@ -0,0 +1,1246 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.section __DATA,__const +.p2align 6 + + +L$bswap_mask: +.quad 0x08090a0b0c0d0e0f, 0x0001020304050607 + + + + + + + + +L$gfpoly: +.quad 1, 0xc200000000000000 + + +L$gfpoly_and_internal_carrybit: +.quad 1, 0xc200000000000001 + + +L$ctr_pattern: +.quad 0, 0 +.quad 1, 0 +.quad 2, 0 +.quad 3, 0 + + +L$inc_4blocks: +.quad 4, 0 + +.text +.globl _gcm_init_vpclmulqdq_avx512 +.private_extern _gcm_init_vpclmulqdq_avx512 + +.p2align 5 +_gcm_init_vpclmulqdq_avx512: + + +_CET_ENDBR + + leaq 256-64(%rdi),%r8 + + + + vpshufd $0x4e,(%rsi),%xmm3 + + + + + + + + + + + + + + + + + vpshufd $0xd3,%xmm3,%xmm0 + vpsrad $31,%xmm0,%xmm0 + vpaddq %xmm3,%xmm3,%xmm3 + + vpternlogd $0x78,L$gfpoly_and_internal_carrybit(%rip),%xmm0,%xmm3 + + + vbroadcasti32x4 L$gfpoly(%rip),%zmm5 + + + + + + + + + vpclmulqdq $0x00,%xmm3,%xmm3,%xmm0 + vpclmulqdq $0x11,%xmm3,%xmm3,%xmm4 + vpclmulqdq $0x01,%xmm0,%xmm5,%xmm1 + vpshufd $0x4e,%xmm0,%xmm0 + vpxor %xmm0,%xmm1,%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm5,%xmm0 + vpshufd $0x4e,%xmm1,%xmm1 + vpternlogd $0x96,%xmm0,%xmm1,%xmm4 + + + + vinserti128 $1,%xmm3,%ymm4,%ymm3 + vinserti128 $1,%xmm4,%ymm4,%ymm4 + + + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm0 + vpclmulqdq $0x01,%ymm4,%ymm3,%ymm1 + vpclmulqdq $0x10,%ymm4,%ymm3,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x01,%ymm0,%ymm5,%ymm2 + vpshufd $0x4e,%ymm0,%ymm0 + vpternlogd $0x96,%ymm2,%ymm0,%ymm1 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm4 + vpclmulqdq $0x01,%ymm1,%ymm5,%ymm0 + vpshufd $0x4e,%ymm1,%ymm1 + vpternlogd $0x96,%ymm0,%ymm1,%ymm4 + + vinserti64x4 $1,%ymm3,%zmm4,%zmm3 + vshufi64x2 $0,%zmm4,%zmm4,%zmm4 + + + vmovdqu8 %zmm3,(%r8) + + + + + movl $3,%eax +L$precompute_next: + subq $64,%r8 + vpclmulqdq $0x00,%zmm4,%zmm3,%zmm0 + vpclmulqdq $0x01,%zmm4,%zmm3,%zmm1 + vpclmulqdq $0x10,%zmm4,%zmm3,%zmm2 + vpxord %zmm2,%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm0,%zmm5,%zmm2 + vpshufd $0x4e,%zmm0,%zmm0 + vpternlogd $0x96,%zmm2,%zmm0,%zmm1 + 
vpclmulqdq $0x11,%zmm4,%zmm3,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm5,%zmm0 + vpshufd $0x4e,%zmm1,%zmm1 + vpternlogd $0x96,%zmm0,%zmm1,%zmm3 + + vmovdqu8 %zmm3,(%r8) + decl %eax + jnz L$precompute_next + + vzeroupper + ret + + + +.globl _gcm_gmult_vpclmulqdq_avx512 +.private_extern _gcm_gmult_vpclmulqdq_avx512 + +.p2align 5 +_gcm_gmult_vpclmulqdq_avx512: + + +_CET_ENDBR + + + + vmovdqu (%rdi),%xmm0 + vmovdqu L$bswap_mask(%rip),%xmm1 + vmovdqu 256-16(%rsi),%xmm2 + vmovdqu L$gfpoly(%rip),%xmm3 + vpshufb %xmm1,%xmm0,%xmm0 + + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm4 + vpclmulqdq $0x01,%xmm2,%xmm0,%xmm5 + vpclmulqdq $0x10,%xmm2,%xmm0,%xmm6 + vpxord %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x01,%xmm4,%xmm3,%xmm6 + vpshufd $0x4e,%xmm4,%xmm4 + vpternlogd $0x96,%xmm6,%xmm4,%xmm5 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm0 + vpclmulqdq $0x01,%xmm5,%xmm3,%xmm4 + vpshufd $0x4e,%xmm5,%xmm5 + vpternlogd $0x96,%xmm4,%xmm5,%xmm0 + + + vpshufb %xmm1,%xmm0,%xmm0 + vmovdqu %xmm0,(%rdi) + + + ret + + + +.globl _gcm_ghash_vpclmulqdq_avx512 +.private_extern _gcm_ghash_vpclmulqdq_avx512 + +.p2align 5 +_gcm_ghash_vpclmulqdq_avx512: + + +_CET_ENDBR + + + + + + + vmovdqu L$bswap_mask(%rip),%xmm4 + vmovdqu L$gfpoly(%rip),%xmm10 + + + vmovdqu (%rdi),%xmm5 + vpshufb %xmm4,%xmm5,%xmm5 + + + cmpq $64,%rcx + jb L$aad_blockbyblock + + + + vshufi64x2 $0,%zmm4,%zmm4,%zmm4 + vshufi64x2 $0,%zmm10,%zmm10,%zmm10 + + + vmovdqu8 256-64(%rsi),%zmm9 + + cmpq $256,%rcx + jb L$aad_loop_1x + + + vmovdqu8 256-256(%rsi),%zmm6 + vmovdqu8 256-192(%rsi),%zmm7 + vmovdqu8 256-128(%rsi),%zmm8 + + +L$aad_loop_4x: + vmovdqu8 0(%rdx),%zmm0 + vmovdqu8 64(%rdx),%zmm1 + vmovdqu8 128(%rdx),%zmm2 + vmovdqu8 192(%rdx),%zmm3 + vpshufb %zmm4,%zmm0,%zmm0 + vpxord %zmm5,%zmm0,%zmm0 + vpshufb %zmm4,%zmm1,%zmm1 + vpshufb %zmm4,%zmm2,%zmm2 + vpshufb %zmm4,%zmm3,%zmm3 + vpclmulqdq $0x00,%zmm6,%zmm0,%zmm5 + vpclmulqdq $0x00,%zmm7,%zmm1,%zmm11 + vpclmulqdq $0x00,%zmm8,%zmm2,%zmm12 + vpxord %zmm11,%zmm5,%zmm5 + vpclmulqdq $0x00,%zmm9,%zmm3,%zmm13 + vpternlogd 
$0x96,%zmm13,%zmm12,%zmm5 + vpclmulqdq $0x01,%zmm6,%zmm0,%zmm11 + vpclmulqdq $0x01,%zmm7,%zmm1,%zmm12 + vpclmulqdq $0x01,%zmm8,%zmm2,%zmm13 + vpternlogd $0x96,%zmm13,%zmm12,%zmm11 + vpclmulqdq $0x01,%zmm9,%zmm3,%zmm12 + vpclmulqdq $0x10,%zmm6,%zmm0,%zmm13 + vpternlogd $0x96,%zmm13,%zmm12,%zmm11 + vpclmulqdq $0x10,%zmm7,%zmm1,%zmm12 + vpclmulqdq $0x10,%zmm8,%zmm2,%zmm13 + vpternlogd $0x96,%zmm13,%zmm12,%zmm11 + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm13 + vpclmulqdq $0x10,%zmm9,%zmm3,%zmm12 + vpxord %zmm12,%zmm11,%zmm11 + vpshufd $0x4e,%zmm5,%zmm5 + vpclmulqdq $0x11,%zmm6,%zmm0,%zmm0 + vpclmulqdq $0x11,%zmm7,%zmm1,%zmm1 + vpclmulqdq $0x11,%zmm8,%zmm2,%zmm2 + vpternlogd $0x96,%zmm13,%zmm5,%zmm11 + vpclmulqdq $0x11,%zmm9,%zmm3,%zmm3 + vpternlogd $0x96,%zmm2,%zmm1,%zmm0 + vpclmulqdq $0x01,%zmm11,%zmm10,%zmm12 + vpxord %zmm3,%zmm0,%zmm5 + vpshufd $0x4e,%zmm11,%zmm11 + vpternlogd $0x96,%zmm12,%zmm11,%zmm5 + vextracti32x4 $1,%zmm5,%xmm0 + vextracti32x4 $2,%zmm5,%xmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vpxord %xmm0,%xmm5,%xmm5 + vpternlogd $0x96,%xmm1,%xmm2,%xmm5 + + addq $256,%rdx + subq $256,%rcx + cmpq $256,%rcx + jae L$aad_loop_4x + + + cmpq $64,%rcx + jb L$aad_large_done +L$aad_loop_1x: + vmovdqu8 (%rdx),%zmm0 + vpshufb %zmm4,%zmm0,%zmm0 + vpxord %zmm0,%zmm5,%zmm5 + vpclmulqdq $0x00,%zmm9,%zmm5,%zmm0 + vpclmulqdq $0x01,%zmm9,%zmm5,%zmm1 + vpclmulqdq $0x10,%zmm9,%zmm5,%zmm2 + vpxord %zmm2,%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm0,%zmm10,%zmm2 + vpshufd $0x4e,%zmm0,%zmm0 + vpternlogd $0x96,%zmm2,%zmm0,%zmm1 + vpclmulqdq $0x11,%zmm9,%zmm5,%zmm5 + vpclmulqdq $0x01,%zmm1,%zmm10,%zmm0 + vpshufd $0x4e,%zmm1,%zmm1 + vpternlogd $0x96,%zmm0,%zmm1,%zmm5 + + vextracti32x4 $1,%zmm5,%xmm0 + vextracti32x4 $2,%zmm5,%xmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vpxord %xmm0,%xmm5,%xmm5 + vpternlogd $0x96,%xmm1,%xmm2,%xmm5 + + addq $64,%rdx + subq $64,%rcx + cmpq $64,%rcx + jae L$aad_loop_1x + +L$aad_large_done: + + +L$aad_blockbyblock: + testq %rcx,%rcx + jz L$aad_done + vmovdqu 256-16(%rsi),%xmm9 
+L$aad_loop_blockbyblock: + vmovdqu (%rdx),%xmm0 + vpshufb %xmm4,%xmm0,%xmm0 + vpxor %xmm0,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm9,%xmm5,%xmm0 + vpclmulqdq $0x01,%xmm9,%xmm5,%xmm1 + vpclmulqdq $0x10,%xmm9,%xmm5,%xmm2 + vpxord %xmm2,%xmm1,%xmm1 + vpclmulqdq $0x01,%xmm0,%xmm10,%xmm2 + vpshufd $0x4e,%xmm0,%xmm0 + vpternlogd $0x96,%xmm2,%xmm0,%xmm1 + vpclmulqdq $0x11,%xmm9,%xmm5,%xmm5 + vpclmulqdq $0x01,%xmm1,%xmm10,%xmm0 + vpshufd $0x4e,%xmm1,%xmm1 + vpternlogd $0x96,%xmm0,%xmm1,%xmm5 + + addq $16,%rdx + subq $16,%rcx + jnz L$aad_loop_blockbyblock + +L$aad_done: + + vpshufb %xmm4,%xmm5,%xmm5 + vmovdqu %xmm5,(%rdi) + + vzeroupper + ret + + + +.globl _aes_gcm_enc_update_vaes_avx512 +.private_extern _aes_gcm_enc_update_vaes_avx512 + +.p2align 5 +_aes_gcm_enc_update_vaes_avx512: + + +_CET_ENDBR + pushq %r12 + + + movq 16(%rsp),%r12 +#ifdef BORINGSSL_DISPATCH_TEST + + movb $1,_BORINGSSL_function_hit+7(%rip) +#endif + + vbroadcasti32x4 L$bswap_mask(%rip),%zmm8 + vbroadcasti32x4 L$gfpoly(%rip),%zmm31 + + + + vmovdqu (%r12),%xmm10 + vpshufb %xmm8,%xmm10,%xmm10 + vbroadcasti32x4 (%r8),%zmm12 + vpshufb %zmm8,%zmm12,%zmm12 + + + + movl 240(%rcx),%r10d + leal -20(,%r10,4),%r10d + + + + + leaq 96(%rcx,%r10,4),%r11 + vbroadcasti32x4 (%rcx),%zmm13 + vbroadcasti32x4 (%r11),%zmm14 + + + vpaddd L$ctr_pattern(%rip),%zmm12,%zmm12 + + + vbroadcasti32x4 L$inc_4blocks(%rip),%zmm11 + + + + cmpq $256,%rdx + jb L$crypt_loop_4x_done__func1 + + + vmovdqu8 256-256(%r9),%zmm27 + vmovdqu8 256-192(%r9),%zmm28 + vmovdqu8 256-128(%r9),%zmm29 + vmovdqu8 256-64(%r9),%zmm30 + + + + + vpshufb %zmm8,%zmm12,%zmm0 + vpaddd %zmm11,%zmm12,%zmm12 + vpshufb %zmm8,%zmm12,%zmm1 + vpaddd %zmm11,%zmm12,%zmm12 + vpshufb %zmm8,%zmm12,%zmm2 + vpaddd %zmm11,%zmm12,%zmm12 + vpshufb %zmm8,%zmm12,%zmm3 + vpaddd %zmm11,%zmm12,%zmm12 + + + vpxord %zmm13,%zmm0,%zmm0 + vpxord %zmm13,%zmm1,%zmm1 + vpxord %zmm13,%zmm2,%zmm2 + vpxord %zmm13,%zmm3,%zmm3 + + leaq 16(%rcx),%rax +L$vaesenc_loop_first_4_vecs__func1: + vbroadcasti32x4 
(%rax),%zmm9 + vaesenc %zmm9,%zmm0,%zmm0 + vaesenc %zmm9,%zmm1,%zmm1 + vaesenc %zmm9,%zmm2,%zmm2 + vaesenc %zmm9,%zmm3,%zmm3 + + addq $16,%rax + cmpq %rax,%r11 + jne L$vaesenc_loop_first_4_vecs__func1 + vpxord 0(%rdi),%zmm14,%zmm4 + vpxord 64(%rdi),%zmm14,%zmm5 + vpxord 128(%rdi),%zmm14,%zmm6 + vpxord 192(%rdi),%zmm14,%zmm7 + vaesenclast %zmm4,%zmm0,%zmm4 + vaesenclast %zmm5,%zmm1,%zmm5 + vaesenclast %zmm6,%zmm2,%zmm6 + vaesenclast %zmm7,%zmm3,%zmm7 + vmovdqu8 %zmm4,0(%rsi) + vmovdqu8 %zmm5,64(%rsi) + vmovdqu8 %zmm6,128(%rsi) + vmovdqu8 %zmm7,192(%rsi) + + addq $256,%rdi + addq $256,%rsi + subq $256,%rdx + cmpq $256,%rdx + jb L$ghash_last_ciphertext_4x__func1 + + vbroadcasti32x4 -144(%r11),%zmm15 + vbroadcasti32x4 -128(%r11),%zmm16 + vbroadcasti32x4 -112(%r11),%zmm17 + vbroadcasti32x4 -96(%r11),%zmm18 + vbroadcasti32x4 -80(%r11),%zmm19 + vbroadcasti32x4 -64(%r11),%zmm20 + vbroadcasti32x4 -48(%r11),%zmm21 + vbroadcasti32x4 -32(%r11),%zmm22 + vbroadcasti32x4 -16(%r11),%zmm23 + +L$crypt_loop_4x__func1: + + + + vpshufb %zmm8,%zmm12,%zmm0 + vpaddd %zmm11,%zmm12,%zmm12 + vpshufb %zmm8,%zmm12,%zmm1 + vpaddd %zmm11,%zmm12,%zmm12 + vpshufb %zmm8,%zmm12,%zmm2 + vpaddd %zmm11,%zmm12,%zmm12 + vpshufb %zmm8,%zmm12,%zmm3 + vpaddd %zmm11,%zmm12,%zmm12 + + + vpxord %zmm13,%zmm0,%zmm0 + vpxord %zmm13,%zmm1,%zmm1 + vpxord %zmm13,%zmm2,%zmm2 + vpxord %zmm13,%zmm3,%zmm3 + + cmpl $24,%r10d + jl L$aes128__func1 + je L$aes192__func1 + + vbroadcasti32x4 -208(%r11),%zmm9 + vaesenc %zmm9,%zmm0,%zmm0 + vaesenc %zmm9,%zmm1,%zmm1 + vaesenc %zmm9,%zmm2,%zmm2 + vaesenc %zmm9,%zmm3,%zmm3 + + vbroadcasti32x4 -192(%r11),%zmm9 + vaesenc %zmm9,%zmm0,%zmm0 + vaesenc %zmm9,%zmm1,%zmm1 + vaesenc %zmm9,%zmm2,%zmm2 + vaesenc %zmm9,%zmm3,%zmm3 + +L$aes192__func1: + vbroadcasti32x4 -176(%r11),%zmm9 + vaesenc %zmm9,%zmm0,%zmm0 + vaesenc %zmm9,%zmm1,%zmm1 + vaesenc %zmm9,%zmm2,%zmm2 + vaesenc %zmm9,%zmm3,%zmm3 + + vbroadcasti32x4 -160(%r11),%zmm9 + vaesenc %zmm9,%zmm0,%zmm0 + vaesenc %zmm9,%zmm1,%zmm1 + 
vaesenc %zmm9,%zmm2,%zmm2 + vaesenc %zmm9,%zmm3,%zmm3 + +L$aes128__func1: + + + + + prefetcht0 512+0(%rdi) + prefetcht0 512+64(%rdi) + prefetcht0 512+128(%rdi) + prefetcht0 512+192(%rdi) + + + + + vpshufb %zmm8,%zmm4,%zmm4 + vpxord %zmm10,%zmm4,%zmm4 + vpshufb %zmm8,%zmm5,%zmm5 + vpshufb %zmm8,%zmm6,%zmm6 + + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm1,%zmm1 + vaesenc %zmm15,%zmm2,%zmm2 + vaesenc %zmm15,%zmm3,%zmm3 + + vpshufb %zmm8,%zmm7,%zmm7 + vpclmulqdq $0x00,%zmm27,%zmm4,%zmm10 + vpclmulqdq $0x00,%zmm28,%zmm5,%zmm24 + vpclmulqdq $0x00,%zmm29,%zmm6,%zmm25 + + vaesenc %zmm16,%zmm0,%zmm0 + vaesenc %zmm16,%zmm1,%zmm1 + vaesenc %zmm16,%zmm2,%zmm2 + vaesenc %zmm16,%zmm3,%zmm3 + + vpxord %zmm24,%zmm10,%zmm10 + vpclmulqdq $0x00,%zmm30,%zmm7,%zmm26 + vpternlogd $0x96,%zmm26,%zmm25,%zmm10 + vpclmulqdq $0x01,%zmm27,%zmm4,%zmm24 + + vaesenc %zmm17,%zmm0,%zmm0 + vaesenc %zmm17,%zmm1,%zmm1 + vaesenc %zmm17,%zmm2,%zmm2 + vaesenc %zmm17,%zmm3,%zmm3 + + vpclmulqdq $0x01,%zmm28,%zmm5,%zmm25 + vpclmulqdq $0x01,%zmm29,%zmm6,%zmm26 + vpternlogd $0x96,%zmm26,%zmm25,%zmm24 + vpclmulqdq $0x01,%zmm30,%zmm7,%zmm25 + + vaesenc %zmm18,%zmm0,%zmm0 + vaesenc %zmm18,%zmm1,%zmm1 + vaesenc %zmm18,%zmm2,%zmm2 + vaesenc %zmm18,%zmm3,%zmm3 + + vpclmulqdq $0x10,%zmm27,%zmm4,%zmm26 + vpternlogd $0x96,%zmm26,%zmm25,%zmm24 + vpclmulqdq $0x10,%zmm28,%zmm5,%zmm25 + vpclmulqdq $0x10,%zmm29,%zmm6,%zmm26 + + vaesenc %zmm19,%zmm0,%zmm0 + vaesenc %zmm19,%zmm1,%zmm1 + vaesenc %zmm19,%zmm2,%zmm2 + vaesenc %zmm19,%zmm3,%zmm3 + + vpternlogd $0x96,%zmm26,%zmm25,%zmm24 + vpclmulqdq $0x01,%zmm10,%zmm31,%zmm26 + vpclmulqdq $0x10,%zmm30,%zmm7,%zmm25 + vpxord %zmm25,%zmm24,%zmm24 + + vaesenc %zmm20,%zmm0,%zmm0 + vaesenc %zmm20,%zmm1,%zmm1 + vaesenc %zmm20,%zmm2,%zmm2 + vaesenc %zmm20,%zmm3,%zmm3 + + vpshufd $0x4e,%zmm10,%zmm10 + vpclmulqdq $0x11,%zmm27,%zmm4,%zmm4 + vpclmulqdq $0x11,%zmm28,%zmm5,%zmm5 + vpclmulqdq $0x11,%zmm29,%zmm6,%zmm6 + + vaesenc %zmm21,%zmm0,%zmm0 + vaesenc %zmm21,%zmm1,%zmm1 + vaesenc 
%zmm21,%zmm2,%zmm2 + vaesenc %zmm21,%zmm3,%zmm3 + + vpternlogd $0x96,%zmm26,%zmm10,%zmm24 + vpclmulqdq $0x11,%zmm30,%zmm7,%zmm7 + vpternlogd $0x96,%zmm6,%zmm5,%zmm4 + vpclmulqdq $0x01,%zmm24,%zmm31,%zmm25 + + vaesenc %zmm22,%zmm0,%zmm0 + vaesenc %zmm22,%zmm1,%zmm1 + vaesenc %zmm22,%zmm2,%zmm2 + vaesenc %zmm22,%zmm3,%zmm3 + + vpxord %zmm7,%zmm4,%zmm10 + vpshufd $0x4e,%zmm24,%zmm24 + vpternlogd $0x96,%zmm25,%zmm24,%zmm10 + + vaesenc %zmm23,%zmm0,%zmm0 + vaesenc %zmm23,%zmm1,%zmm1 + vaesenc %zmm23,%zmm2,%zmm2 + vaesenc %zmm23,%zmm3,%zmm3 + + + vextracti32x4 $1,%zmm10,%xmm4 + vextracti32x4 $2,%zmm10,%xmm5 + vextracti32x4 $3,%zmm10,%xmm6 + vpxord %xmm4,%xmm10,%xmm10 + vpternlogd $0x96,%xmm5,%xmm6,%xmm10 + + vpxord 0(%rdi),%zmm14,%zmm4 + vpxord 64(%rdi),%zmm14,%zmm5 + vpxord 128(%rdi),%zmm14,%zmm6 + vpxord 192(%rdi),%zmm14,%zmm7 + vaesenclast %zmm4,%zmm0,%zmm4 + vaesenclast %zmm5,%zmm1,%zmm5 + vaesenclast %zmm6,%zmm2,%zmm6 + vaesenclast %zmm7,%zmm3,%zmm7 + vmovdqu8 %zmm4,0(%rsi) + vmovdqu8 %zmm5,64(%rsi) + vmovdqu8 %zmm6,128(%rsi) + vmovdqu8 %zmm7,192(%rsi) + + addq $256,%rdi + addq $256,%rsi + subq $256,%rdx + cmpq $256,%rdx + jae L$crypt_loop_4x__func1 +L$ghash_last_ciphertext_4x__func1: + vpshufb %zmm8,%zmm4,%zmm4 + vpxord %zmm10,%zmm4,%zmm4 + vpshufb %zmm8,%zmm5,%zmm5 + vpshufb %zmm8,%zmm6,%zmm6 + vpshufb %zmm8,%zmm7,%zmm7 + vpclmulqdq $0x00,%zmm27,%zmm4,%zmm10 + vpclmulqdq $0x00,%zmm28,%zmm5,%zmm24 + vpclmulqdq $0x00,%zmm29,%zmm6,%zmm25 + vpxord %zmm24,%zmm10,%zmm10 + vpclmulqdq $0x00,%zmm30,%zmm7,%zmm26 + vpternlogd $0x96,%zmm26,%zmm25,%zmm10 + vpclmulqdq $0x01,%zmm27,%zmm4,%zmm24 + vpclmulqdq $0x01,%zmm28,%zmm5,%zmm25 + vpclmulqdq $0x01,%zmm29,%zmm6,%zmm26 + vpternlogd $0x96,%zmm26,%zmm25,%zmm24 + vpclmulqdq $0x01,%zmm30,%zmm7,%zmm25 + vpclmulqdq $0x10,%zmm27,%zmm4,%zmm26 + vpternlogd $0x96,%zmm26,%zmm25,%zmm24 + vpclmulqdq $0x10,%zmm28,%zmm5,%zmm25 + vpclmulqdq $0x10,%zmm29,%zmm6,%zmm26 + vpternlogd $0x96,%zmm26,%zmm25,%zmm24 + vpclmulqdq 
$0x01,%zmm10,%zmm31,%zmm26 + vpclmulqdq $0x10,%zmm30,%zmm7,%zmm25 + vpxord %zmm25,%zmm24,%zmm24 + vpshufd $0x4e,%zmm10,%zmm10 + vpclmulqdq $0x11,%zmm27,%zmm4,%zmm4 + vpclmulqdq $0x11,%zmm28,%zmm5,%zmm5 + vpclmulqdq $0x11,%zmm29,%zmm6,%zmm6 + vpternlogd $0x96,%zmm26,%zmm10,%zmm24 + vpclmulqdq $0x11,%zmm30,%zmm7,%zmm7 + vpternlogd $0x96,%zmm6,%zmm5,%zmm4 + vpclmulqdq $0x01,%zmm24,%zmm31,%zmm25 + vpxord %zmm7,%zmm4,%zmm10 + vpshufd $0x4e,%zmm24,%zmm24 + vpternlogd $0x96,%zmm25,%zmm24,%zmm10 + vextracti32x4 $1,%zmm10,%xmm4 + vextracti32x4 $2,%zmm10,%xmm5 + vextracti32x4 $3,%zmm10,%xmm6 + vpxord %xmm4,%xmm10,%xmm10 + vpternlogd $0x96,%xmm5,%xmm6,%xmm10 + +L$crypt_loop_4x_done__func1: + + testq %rdx,%rdx + jz L$done__func1 + + + + + + + + + + + + + + + + + + + + + movq %rdx,%rax + negq %rax + andq $-16,%rax + leaq 256(%r9,%rax,1),%r8 + vpxor %xmm4,%xmm4,%xmm4 + vpxor %xmm5,%xmm5,%xmm5 + vpxor %xmm6,%xmm6,%xmm6 + + cmpq $64,%rdx + jb L$partial_vec__func1 + +L$crypt_loop_1x__func1: + + + + vpshufb %zmm8,%zmm12,%zmm0 + vpaddd %zmm11,%zmm12,%zmm12 + vpxord %zmm13,%zmm0,%zmm0 + leaq 16(%rcx),%rax +L$vaesenc_loop_tail_full_vec__func1: + vbroadcasti32x4 (%rax),%zmm9 + vaesenc %zmm9,%zmm0,%zmm0 + addq $16,%rax + cmpq %rax,%r11 + jne L$vaesenc_loop_tail_full_vec__func1 + vaesenclast %zmm14,%zmm0,%zmm0 + + + vmovdqu8 (%rdi),%zmm1 + vpxord %zmm1,%zmm0,%zmm0 + vmovdqu8 %zmm0,(%rsi) + + + vmovdqu8 (%r8),%zmm30 + vpshufb %zmm8,%zmm0,%zmm0 + vpxord %zmm10,%zmm0,%zmm0 + vpclmulqdq $0x00,%zmm30,%zmm0,%zmm7 + vpclmulqdq $0x01,%zmm30,%zmm0,%zmm1 + vpclmulqdq $0x10,%zmm30,%zmm0,%zmm2 + vpclmulqdq $0x11,%zmm30,%zmm0,%zmm3 + vpxord %zmm7,%zmm4,%zmm4 + vpternlogd $0x96,%zmm2,%zmm1,%zmm5 + vpxord %zmm3,%zmm6,%zmm6 + + vpxor %xmm10,%xmm10,%xmm10 + + addq $64,%r8 + addq $64,%rdi + addq $64,%rsi + subq $64,%rdx + cmpq $64,%rdx + jae L$crypt_loop_1x__func1 + + testq %rdx,%rdx + jz L$reduce__func1 + +L$partial_vec__func1: + + + + + movq $-1,%rax + bzhiq %rdx,%rax,%rax + kmovq %rax,%k1 + addq 
$15,%rdx + andq $-16,%rdx + movq $-1,%rax + bzhiq %rdx,%rax,%rax + kmovq %rax,%k2 + + + + vpshufb %zmm8,%zmm12,%zmm0 + vpxord %zmm13,%zmm0,%zmm0 + leaq 16(%rcx),%rax +L$vaesenc_loop_tail_partialvec__func1: + vbroadcasti32x4 (%rax),%zmm9 + vaesenc %zmm9,%zmm0,%zmm0 + addq $16,%rax + cmpq %rax,%r11 + jne L$vaesenc_loop_tail_partialvec__func1 + vaesenclast %zmm14,%zmm0,%zmm0 + + + vmovdqu8 (%rdi),%zmm1{%k1}{z} + vpxord %zmm1,%zmm0,%zmm0 + vmovdqu8 %zmm0,(%rsi){%k1} + + + + + + + + + + + + + + vmovdqu8 (%r8),%zmm30{%k2}{z} + vmovdqu8 %zmm0,%zmm1{%k1}{z} + vpshufb %zmm8,%zmm1,%zmm0 + vpxord %zmm10,%zmm0,%zmm0 + vpclmulqdq $0x00,%zmm30,%zmm0,%zmm7 + vpclmulqdq $0x01,%zmm30,%zmm0,%zmm1 + vpclmulqdq $0x10,%zmm30,%zmm0,%zmm2 + vpclmulqdq $0x11,%zmm30,%zmm0,%zmm3 + vpxord %zmm7,%zmm4,%zmm4 + vpternlogd $0x96,%zmm2,%zmm1,%zmm5 + vpxord %zmm3,%zmm6,%zmm6 + + +L$reduce__func1: + + vpclmulqdq $0x01,%zmm4,%zmm31,%zmm0 + vpshufd $0x4e,%zmm4,%zmm4 + vpternlogd $0x96,%zmm0,%zmm4,%zmm5 + vpclmulqdq $0x01,%zmm5,%zmm31,%zmm0 + vpshufd $0x4e,%zmm5,%zmm5 + vpternlogd $0x96,%zmm0,%zmm5,%zmm6 + + vextracti32x4 $1,%zmm6,%xmm0 + vextracti32x4 $2,%zmm6,%xmm1 + vextracti32x4 $3,%zmm6,%xmm2 + vpxord %xmm0,%xmm6,%xmm10 + vpternlogd $0x96,%xmm1,%xmm2,%xmm10 + + +L$done__func1: + + vpshufb %xmm8,%xmm10,%xmm10 + vmovdqu %xmm10,(%r12) + + vzeroupper + popq %r12 + + ret + + + +.globl _aes_gcm_dec_update_vaes_avx512 +.private_extern _aes_gcm_dec_update_vaes_avx512 + +.p2align 5 +_aes_gcm_dec_update_vaes_avx512: + + +_CET_ENDBR + pushq %r12 + + + movq 16(%rsp),%r12 + + vbroadcasti32x4 L$bswap_mask(%rip),%zmm8 + vbroadcasti32x4 L$gfpoly(%rip),%zmm31 + + + + vmovdqu (%r12),%xmm10 + vpshufb %xmm8,%xmm10,%xmm10 + vbroadcasti32x4 (%r8),%zmm12 + vpshufb %zmm8,%zmm12,%zmm12 + + + + movl 240(%rcx),%r10d + leal -20(,%r10,4),%r10d + + + + + leaq 96(%rcx,%r10,4),%r11 + vbroadcasti32x4 (%rcx),%zmm13 + vbroadcasti32x4 (%r11),%zmm14 + + + vpaddd L$ctr_pattern(%rip),%zmm12,%zmm12 + + + vbroadcasti32x4 
L$inc_4blocks(%rip),%zmm11 + + + + cmpq $256,%rdx + jb L$crypt_loop_4x_done__func2 + + + vmovdqu8 256-256(%r9),%zmm27 + vmovdqu8 256-192(%r9),%zmm28 + vmovdqu8 256-128(%r9),%zmm29 + vmovdqu8 256-64(%r9),%zmm30 + + vbroadcasti32x4 -144(%r11),%zmm15 + vbroadcasti32x4 -128(%r11),%zmm16 + vbroadcasti32x4 -112(%r11),%zmm17 + vbroadcasti32x4 -96(%r11),%zmm18 + vbroadcasti32x4 -80(%r11),%zmm19 + vbroadcasti32x4 -64(%r11),%zmm20 + vbroadcasti32x4 -48(%r11),%zmm21 + vbroadcasti32x4 -32(%r11),%zmm22 + vbroadcasti32x4 -16(%r11),%zmm23 + +L$crypt_loop_4x__func2: + vmovdqu8 0(%rdi),%zmm4 + vmovdqu8 64(%rdi),%zmm5 + vmovdqu8 128(%rdi),%zmm6 + vmovdqu8 192(%rdi),%zmm7 + + + + vpshufb %zmm8,%zmm12,%zmm0 + vpaddd %zmm11,%zmm12,%zmm12 + vpshufb %zmm8,%zmm12,%zmm1 + vpaddd %zmm11,%zmm12,%zmm12 + vpshufb %zmm8,%zmm12,%zmm2 + vpaddd %zmm11,%zmm12,%zmm12 + vpshufb %zmm8,%zmm12,%zmm3 + vpaddd %zmm11,%zmm12,%zmm12 + + + vpxord %zmm13,%zmm0,%zmm0 + vpxord %zmm13,%zmm1,%zmm1 + vpxord %zmm13,%zmm2,%zmm2 + vpxord %zmm13,%zmm3,%zmm3 + + cmpl $24,%r10d + jl L$aes128__func2 + je L$aes192__func2 + + vbroadcasti32x4 -208(%r11),%zmm9 + vaesenc %zmm9,%zmm0,%zmm0 + vaesenc %zmm9,%zmm1,%zmm1 + vaesenc %zmm9,%zmm2,%zmm2 + vaesenc %zmm9,%zmm3,%zmm3 + + vbroadcasti32x4 -192(%r11),%zmm9 + vaesenc %zmm9,%zmm0,%zmm0 + vaesenc %zmm9,%zmm1,%zmm1 + vaesenc %zmm9,%zmm2,%zmm2 + vaesenc %zmm9,%zmm3,%zmm3 + +L$aes192__func2: + vbroadcasti32x4 -176(%r11),%zmm9 + vaesenc %zmm9,%zmm0,%zmm0 + vaesenc %zmm9,%zmm1,%zmm1 + vaesenc %zmm9,%zmm2,%zmm2 + vaesenc %zmm9,%zmm3,%zmm3 + + vbroadcasti32x4 -160(%r11),%zmm9 + vaesenc %zmm9,%zmm0,%zmm0 + vaesenc %zmm9,%zmm1,%zmm1 + vaesenc %zmm9,%zmm2,%zmm2 + vaesenc %zmm9,%zmm3,%zmm3 + +L$aes128__func2: + + + + + prefetcht0 512+0(%rdi) + prefetcht0 512+64(%rdi) + prefetcht0 512+128(%rdi) + prefetcht0 512+192(%rdi) + + + + + vpshufb %zmm8,%zmm4,%zmm4 + vpxord %zmm10,%zmm4,%zmm4 + vpshufb %zmm8,%zmm5,%zmm5 + vpshufb %zmm8,%zmm6,%zmm6 + + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc 
%zmm15,%zmm1,%zmm1 + vaesenc %zmm15,%zmm2,%zmm2 + vaesenc %zmm15,%zmm3,%zmm3 + + vpshufb %zmm8,%zmm7,%zmm7 + vpclmulqdq $0x00,%zmm27,%zmm4,%zmm10 + vpclmulqdq $0x00,%zmm28,%zmm5,%zmm24 + vpclmulqdq $0x00,%zmm29,%zmm6,%zmm25 + + vaesenc %zmm16,%zmm0,%zmm0 + vaesenc %zmm16,%zmm1,%zmm1 + vaesenc %zmm16,%zmm2,%zmm2 + vaesenc %zmm16,%zmm3,%zmm3 + + vpxord %zmm24,%zmm10,%zmm10 + vpclmulqdq $0x00,%zmm30,%zmm7,%zmm26 + vpternlogd $0x96,%zmm26,%zmm25,%zmm10 + vpclmulqdq $0x01,%zmm27,%zmm4,%zmm24 + + vaesenc %zmm17,%zmm0,%zmm0 + vaesenc %zmm17,%zmm1,%zmm1 + vaesenc %zmm17,%zmm2,%zmm2 + vaesenc %zmm17,%zmm3,%zmm3 + + vpclmulqdq $0x01,%zmm28,%zmm5,%zmm25 + vpclmulqdq $0x01,%zmm29,%zmm6,%zmm26 + vpternlogd $0x96,%zmm26,%zmm25,%zmm24 + vpclmulqdq $0x01,%zmm30,%zmm7,%zmm25 + + vaesenc %zmm18,%zmm0,%zmm0 + vaesenc %zmm18,%zmm1,%zmm1 + vaesenc %zmm18,%zmm2,%zmm2 + vaesenc %zmm18,%zmm3,%zmm3 + + vpclmulqdq $0x10,%zmm27,%zmm4,%zmm26 + vpternlogd $0x96,%zmm26,%zmm25,%zmm24 + vpclmulqdq $0x10,%zmm28,%zmm5,%zmm25 + vpclmulqdq $0x10,%zmm29,%zmm6,%zmm26 + + vaesenc %zmm19,%zmm0,%zmm0 + vaesenc %zmm19,%zmm1,%zmm1 + vaesenc %zmm19,%zmm2,%zmm2 + vaesenc %zmm19,%zmm3,%zmm3 + + vpternlogd $0x96,%zmm26,%zmm25,%zmm24 + vpclmulqdq $0x01,%zmm10,%zmm31,%zmm26 + vpclmulqdq $0x10,%zmm30,%zmm7,%zmm25 + vpxord %zmm25,%zmm24,%zmm24 + + vaesenc %zmm20,%zmm0,%zmm0 + vaesenc %zmm20,%zmm1,%zmm1 + vaesenc %zmm20,%zmm2,%zmm2 + vaesenc %zmm20,%zmm3,%zmm3 + + vpshufd $0x4e,%zmm10,%zmm10 + vpclmulqdq $0x11,%zmm27,%zmm4,%zmm4 + vpclmulqdq $0x11,%zmm28,%zmm5,%zmm5 + vpclmulqdq $0x11,%zmm29,%zmm6,%zmm6 + + vaesenc %zmm21,%zmm0,%zmm0 + vaesenc %zmm21,%zmm1,%zmm1 + vaesenc %zmm21,%zmm2,%zmm2 + vaesenc %zmm21,%zmm3,%zmm3 + + vpternlogd $0x96,%zmm26,%zmm10,%zmm24 + vpclmulqdq $0x11,%zmm30,%zmm7,%zmm7 + vpternlogd $0x96,%zmm6,%zmm5,%zmm4 + vpclmulqdq $0x01,%zmm24,%zmm31,%zmm25 + + vaesenc %zmm22,%zmm0,%zmm0 + vaesenc %zmm22,%zmm1,%zmm1 + vaesenc %zmm22,%zmm2,%zmm2 + vaesenc %zmm22,%zmm3,%zmm3 + + vpxord 
%zmm7,%zmm4,%zmm10 + vpshufd $0x4e,%zmm24,%zmm24 + vpternlogd $0x96,%zmm25,%zmm24,%zmm10 + + vaesenc %zmm23,%zmm0,%zmm0 + vaesenc %zmm23,%zmm1,%zmm1 + vaesenc %zmm23,%zmm2,%zmm2 + vaesenc %zmm23,%zmm3,%zmm3 + + + vextracti32x4 $1,%zmm10,%xmm4 + vextracti32x4 $2,%zmm10,%xmm5 + vextracti32x4 $3,%zmm10,%xmm6 + vpxord %xmm4,%xmm10,%xmm10 + vpternlogd $0x96,%xmm5,%xmm6,%xmm10 + + vpxord 0(%rdi),%zmm14,%zmm4 + vpxord 64(%rdi),%zmm14,%zmm5 + vpxord 128(%rdi),%zmm14,%zmm6 + vpxord 192(%rdi),%zmm14,%zmm7 + vaesenclast %zmm4,%zmm0,%zmm4 + vaesenclast %zmm5,%zmm1,%zmm5 + vaesenclast %zmm6,%zmm2,%zmm6 + vaesenclast %zmm7,%zmm3,%zmm7 + vmovdqu8 %zmm4,0(%rsi) + vmovdqu8 %zmm5,64(%rsi) + vmovdqu8 %zmm6,128(%rsi) + vmovdqu8 %zmm7,192(%rsi) + + addq $256,%rdi + addq $256,%rsi + subq $256,%rdx + cmpq $256,%rdx + jae L$crypt_loop_4x__func2 +L$crypt_loop_4x_done__func2: + + testq %rdx,%rdx + jz L$done__func2 + + + + + + + + + + + + + + + + + + + + + movq %rdx,%rax + negq %rax + andq $-16,%rax + leaq 256(%r9,%rax,1),%r8 + vpxor %xmm4,%xmm4,%xmm4 + vpxor %xmm5,%xmm5,%xmm5 + vpxor %xmm6,%xmm6,%xmm6 + + cmpq $64,%rdx + jb L$partial_vec__func2 + +L$crypt_loop_1x__func2: + + + + vpshufb %zmm8,%zmm12,%zmm0 + vpaddd %zmm11,%zmm12,%zmm12 + vpxord %zmm13,%zmm0,%zmm0 + leaq 16(%rcx),%rax +L$vaesenc_loop_tail_full_vec__func2: + vbroadcasti32x4 (%rax),%zmm9 + vaesenc %zmm9,%zmm0,%zmm0 + addq $16,%rax + cmpq %rax,%r11 + jne L$vaesenc_loop_tail_full_vec__func2 + vaesenclast %zmm14,%zmm0,%zmm0 + + + vmovdqu8 (%rdi),%zmm1 + vpxord %zmm1,%zmm0,%zmm0 + vmovdqu8 %zmm0,(%rsi) + + + vmovdqu8 (%r8),%zmm30 + vpshufb %zmm8,%zmm1,%zmm0 + vpxord %zmm10,%zmm0,%zmm0 + vpclmulqdq $0x00,%zmm30,%zmm0,%zmm7 + vpclmulqdq $0x01,%zmm30,%zmm0,%zmm1 + vpclmulqdq $0x10,%zmm30,%zmm0,%zmm2 + vpclmulqdq $0x11,%zmm30,%zmm0,%zmm3 + vpxord %zmm7,%zmm4,%zmm4 + vpternlogd $0x96,%zmm2,%zmm1,%zmm5 + vpxord %zmm3,%zmm6,%zmm6 + + vpxor %xmm10,%xmm10,%xmm10 + + addq $64,%r8 + addq $64,%rdi + addq $64,%rsi + subq $64,%rdx + cmpq 
$64,%rdx + jae L$crypt_loop_1x__func2 + + testq %rdx,%rdx + jz L$reduce__func2 + +L$partial_vec__func2: + + + + + movq $-1,%rax + bzhiq %rdx,%rax,%rax + kmovq %rax,%k1 + addq $15,%rdx + andq $-16,%rdx + movq $-1,%rax + bzhiq %rdx,%rax,%rax + kmovq %rax,%k2 + + + + vpshufb %zmm8,%zmm12,%zmm0 + vpxord %zmm13,%zmm0,%zmm0 + leaq 16(%rcx),%rax +L$vaesenc_loop_tail_partialvec__func2: + vbroadcasti32x4 (%rax),%zmm9 + vaesenc %zmm9,%zmm0,%zmm0 + addq $16,%rax + cmpq %rax,%r11 + jne L$vaesenc_loop_tail_partialvec__func2 + vaesenclast %zmm14,%zmm0,%zmm0 + + + vmovdqu8 (%rdi),%zmm1{%k1}{z} + vpxord %zmm1,%zmm0,%zmm0 + vmovdqu8 %zmm0,(%rsi){%k1} + + + + + + + + + + + + + + vmovdqu8 (%r8),%zmm30{%k2}{z} + + vpshufb %zmm8,%zmm1,%zmm0 + vpxord %zmm10,%zmm0,%zmm0 + vpclmulqdq $0x00,%zmm30,%zmm0,%zmm7 + vpclmulqdq $0x01,%zmm30,%zmm0,%zmm1 + vpclmulqdq $0x10,%zmm30,%zmm0,%zmm2 + vpclmulqdq $0x11,%zmm30,%zmm0,%zmm3 + vpxord %zmm7,%zmm4,%zmm4 + vpternlogd $0x96,%zmm2,%zmm1,%zmm5 + vpxord %zmm3,%zmm6,%zmm6 + + +L$reduce__func2: + + vpclmulqdq $0x01,%zmm4,%zmm31,%zmm0 + vpshufd $0x4e,%zmm4,%zmm4 + vpternlogd $0x96,%zmm0,%zmm4,%zmm5 + vpclmulqdq $0x01,%zmm5,%zmm31,%zmm0 + vpshufd $0x4e,%zmm5,%zmm5 + vpternlogd $0x96,%zmm0,%zmm5,%zmm6 + + vextracti32x4 $1,%zmm6,%xmm0 + vextracti32x4 $2,%zmm6,%xmm1 + vextracti32x4 $3,%zmm6,%xmm2 + vpxord %xmm0,%xmm6,%xmm10 + vpternlogd $0x96,%xmm1,%xmm2,%xmm10 + + +L$done__func2: + + vpshufb %xmm8,%xmm10,%xmm10 + vmovdqu %xmm10,(%r12) + + vzeroupper + popq %r12 + + ret + + + +#endif diff --git a/third_party/boringssl/gen/bcm/aes-gcm-avx512-x86_64-linux.S b/third_party/boringssl/gen/bcm/aes-gcm-avx512-x86_64-linux.S new file mode 100644 index 00000000..bec5e883 --- /dev/null +++ b/third_party/boringssl/gen/bcm/aes-gcm-avx512-x86_64-linux.S @@ -0,0 +1,1251 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.section .rodata +.align 64 + + +.Lbswap_mask: +.quad 0x08090a0b0c0d0e0f, 0x0001020304050607 + + + + + + + + +.Lgfpoly: +.quad 1, 0xc200000000000000 + + +.Lgfpoly_and_internal_carrybit: +.quad 1, 0xc200000000000001 + + +.Lctr_pattern: +.quad 0, 0 +.quad 1, 0 +.quad 2, 0 +.quad 3, 0 + + +.Linc_4blocks: +.quad 4, 0 + +.text +.globl gcm_init_vpclmulqdq_avx512 +.hidden gcm_init_vpclmulqdq_avx512 +.type gcm_init_vpclmulqdq_avx512,@function +.align 32 +gcm_init_vpclmulqdq_avx512: +.cfi_startproc + +_CET_ENDBR + + leaq 256-64(%rdi),%r8 + + + + vpshufd $0x4e,(%rsi),%xmm3 + + + + + + + + + + + + + + + + + vpshufd $0xd3,%xmm3,%xmm0 + vpsrad $31,%xmm0,%xmm0 + vpaddq %xmm3,%xmm3,%xmm3 + + vpternlogd $0x78,.Lgfpoly_and_internal_carrybit(%rip),%xmm0,%xmm3 + + + vbroadcasti32x4 .Lgfpoly(%rip),%zmm5 + + + + + + + + + vpclmulqdq $0x00,%xmm3,%xmm3,%xmm0 + vpclmulqdq $0x11,%xmm3,%xmm3,%xmm4 + vpclmulqdq $0x01,%xmm0,%xmm5,%xmm1 + vpshufd $0x4e,%xmm0,%xmm0 + vpxor %xmm0,%xmm1,%xmm1 + vpclmulqdq $0x01,%xmm1,%xmm5,%xmm0 + vpshufd $0x4e,%xmm1,%xmm1 + vpternlogd $0x96,%xmm0,%xmm1,%xmm4 + + + + vinserti128 $1,%xmm3,%ymm4,%ymm3 + vinserti128 $1,%xmm4,%ymm4,%ymm4 + + + vpclmulqdq $0x00,%ymm4,%ymm3,%ymm0 + vpclmulqdq $0x01,%ymm4,%ymm3,%ymm1 + vpclmulqdq $0x10,%ymm4,%ymm3,%ymm2 + vpxord %ymm2,%ymm1,%ymm1 + vpclmulqdq $0x01,%ymm0,%ymm5,%ymm2 + vpshufd $0x4e,%ymm0,%ymm0 + vpternlogd $0x96,%ymm2,%ymm0,%ymm1 + vpclmulqdq $0x11,%ymm4,%ymm3,%ymm4 + vpclmulqdq $0x01,%ymm1,%ymm5,%ymm0 + vpshufd $0x4e,%ymm1,%ymm1 + vpternlogd $0x96,%ymm0,%ymm1,%ymm4 + + vinserti64x4 $1,%ymm3,%zmm4,%zmm3 + vshufi64x2 $0,%zmm4,%zmm4,%zmm4 + + + vmovdqu8 %zmm3,(%r8) + + + + + movl $3,%eax +.Lprecompute_next: + subq $64,%r8 + vpclmulqdq $0x00,%zmm4,%zmm3,%zmm0 + vpclmulqdq $0x01,%zmm4,%zmm3,%zmm1 + vpclmulqdq $0x10,%zmm4,%zmm3,%zmm2 + vpxord %zmm2,%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm0,%zmm5,%zmm2 + vpshufd $0x4e,%zmm0,%zmm0 + 
vpternlogd $0x96,%zmm2,%zmm0,%zmm1 + vpclmulqdq $0x11,%zmm4,%zmm3,%zmm3 + vpclmulqdq $0x01,%zmm1,%zmm5,%zmm0 + vpshufd $0x4e,%zmm1,%zmm1 + vpternlogd $0x96,%zmm0,%zmm1,%zmm3 + + vmovdqu8 %zmm3,(%r8) + decl %eax + jnz .Lprecompute_next + + vzeroupper + ret + +.cfi_endproc +.size gcm_init_vpclmulqdq_avx512, . - gcm_init_vpclmulqdq_avx512 +.globl gcm_gmult_vpclmulqdq_avx512 +.hidden gcm_gmult_vpclmulqdq_avx512 +.type gcm_gmult_vpclmulqdq_avx512,@function +.align 32 +gcm_gmult_vpclmulqdq_avx512: +.cfi_startproc + +_CET_ENDBR + + + + vmovdqu (%rdi),%xmm0 + vmovdqu .Lbswap_mask(%rip),%xmm1 + vmovdqu 256-16(%rsi),%xmm2 + vmovdqu .Lgfpoly(%rip),%xmm3 + vpshufb %xmm1,%xmm0,%xmm0 + + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm4 + vpclmulqdq $0x01,%xmm2,%xmm0,%xmm5 + vpclmulqdq $0x10,%xmm2,%xmm0,%xmm6 + vpxord %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x01,%xmm4,%xmm3,%xmm6 + vpshufd $0x4e,%xmm4,%xmm4 + vpternlogd $0x96,%xmm6,%xmm4,%xmm5 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm0 + vpclmulqdq $0x01,%xmm5,%xmm3,%xmm4 + vpshufd $0x4e,%xmm5,%xmm5 + vpternlogd $0x96,%xmm4,%xmm5,%xmm0 + + + vpshufb %xmm1,%xmm0,%xmm0 + vmovdqu %xmm0,(%rdi) + + + ret + +.cfi_endproc +.size gcm_gmult_vpclmulqdq_avx512, . 
- gcm_gmult_vpclmulqdq_avx512 +.globl gcm_ghash_vpclmulqdq_avx512 +.hidden gcm_ghash_vpclmulqdq_avx512 +.type gcm_ghash_vpclmulqdq_avx512,@function +.align 32 +gcm_ghash_vpclmulqdq_avx512: +.cfi_startproc + +_CET_ENDBR + + + + + + + vmovdqu .Lbswap_mask(%rip),%xmm4 + vmovdqu .Lgfpoly(%rip),%xmm10 + + + vmovdqu (%rdi),%xmm5 + vpshufb %xmm4,%xmm5,%xmm5 + + + cmpq $64,%rcx + jb .Laad_blockbyblock + + + + vshufi64x2 $0,%zmm4,%zmm4,%zmm4 + vshufi64x2 $0,%zmm10,%zmm10,%zmm10 + + + vmovdqu8 256-64(%rsi),%zmm9 + + cmpq $256,%rcx + jb .Laad_loop_1x + + + vmovdqu8 256-256(%rsi),%zmm6 + vmovdqu8 256-192(%rsi),%zmm7 + vmovdqu8 256-128(%rsi),%zmm8 + + +.Laad_loop_4x: + vmovdqu8 0(%rdx),%zmm0 + vmovdqu8 64(%rdx),%zmm1 + vmovdqu8 128(%rdx),%zmm2 + vmovdqu8 192(%rdx),%zmm3 + vpshufb %zmm4,%zmm0,%zmm0 + vpxord %zmm5,%zmm0,%zmm0 + vpshufb %zmm4,%zmm1,%zmm1 + vpshufb %zmm4,%zmm2,%zmm2 + vpshufb %zmm4,%zmm3,%zmm3 + vpclmulqdq $0x00,%zmm6,%zmm0,%zmm5 + vpclmulqdq $0x00,%zmm7,%zmm1,%zmm11 + vpclmulqdq $0x00,%zmm8,%zmm2,%zmm12 + vpxord %zmm11,%zmm5,%zmm5 + vpclmulqdq $0x00,%zmm9,%zmm3,%zmm13 + vpternlogd $0x96,%zmm13,%zmm12,%zmm5 + vpclmulqdq $0x01,%zmm6,%zmm0,%zmm11 + vpclmulqdq $0x01,%zmm7,%zmm1,%zmm12 + vpclmulqdq $0x01,%zmm8,%zmm2,%zmm13 + vpternlogd $0x96,%zmm13,%zmm12,%zmm11 + vpclmulqdq $0x01,%zmm9,%zmm3,%zmm12 + vpclmulqdq $0x10,%zmm6,%zmm0,%zmm13 + vpternlogd $0x96,%zmm13,%zmm12,%zmm11 + vpclmulqdq $0x10,%zmm7,%zmm1,%zmm12 + vpclmulqdq $0x10,%zmm8,%zmm2,%zmm13 + vpternlogd $0x96,%zmm13,%zmm12,%zmm11 + vpclmulqdq $0x01,%zmm5,%zmm10,%zmm13 + vpclmulqdq $0x10,%zmm9,%zmm3,%zmm12 + vpxord %zmm12,%zmm11,%zmm11 + vpshufd $0x4e,%zmm5,%zmm5 + vpclmulqdq $0x11,%zmm6,%zmm0,%zmm0 + vpclmulqdq $0x11,%zmm7,%zmm1,%zmm1 + vpclmulqdq $0x11,%zmm8,%zmm2,%zmm2 + vpternlogd $0x96,%zmm13,%zmm5,%zmm11 + vpclmulqdq $0x11,%zmm9,%zmm3,%zmm3 + vpternlogd $0x96,%zmm2,%zmm1,%zmm0 + vpclmulqdq $0x01,%zmm11,%zmm10,%zmm12 + vpxord %zmm3,%zmm0,%zmm5 + vpshufd $0x4e,%zmm11,%zmm11 + vpternlogd 
$0x96,%zmm12,%zmm11,%zmm5 + vextracti32x4 $1,%zmm5,%xmm0 + vextracti32x4 $2,%zmm5,%xmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vpxord %xmm0,%xmm5,%xmm5 + vpternlogd $0x96,%xmm1,%xmm2,%xmm5 + + addq $256,%rdx + subq $256,%rcx + cmpq $256,%rcx + jae .Laad_loop_4x + + + cmpq $64,%rcx + jb .Laad_large_done +.Laad_loop_1x: + vmovdqu8 (%rdx),%zmm0 + vpshufb %zmm4,%zmm0,%zmm0 + vpxord %zmm0,%zmm5,%zmm5 + vpclmulqdq $0x00,%zmm9,%zmm5,%zmm0 + vpclmulqdq $0x01,%zmm9,%zmm5,%zmm1 + vpclmulqdq $0x10,%zmm9,%zmm5,%zmm2 + vpxord %zmm2,%zmm1,%zmm1 + vpclmulqdq $0x01,%zmm0,%zmm10,%zmm2 + vpshufd $0x4e,%zmm0,%zmm0 + vpternlogd $0x96,%zmm2,%zmm0,%zmm1 + vpclmulqdq $0x11,%zmm9,%zmm5,%zmm5 + vpclmulqdq $0x01,%zmm1,%zmm10,%zmm0 + vpshufd $0x4e,%zmm1,%zmm1 + vpternlogd $0x96,%zmm0,%zmm1,%zmm5 + + vextracti32x4 $1,%zmm5,%xmm0 + vextracti32x4 $2,%zmm5,%xmm1 + vextracti32x4 $3,%zmm5,%xmm2 + vpxord %xmm0,%xmm5,%xmm5 + vpternlogd $0x96,%xmm1,%xmm2,%xmm5 + + addq $64,%rdx + subq $64,%rcx + cmpq $64,%rcx + jae .Laad_loop_1x + +.Laad_large_done: + + +.Laad_blockbyblock: + testq %rcx,%rcx + jz .Laad_done + vmovdqu 256-16(%rsi),%xmm9 +.Laad_loop_blockbyblock: + vmovdqu (%rdx),%xmm0 + vpshufb %xmm4,%xmm0,%xmm0 + vpxor %xmm0,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm9,%xmm5,%xmm0 + vpclmulqdq $0x01,%xmm9,%xmm5,%xmm1 + vpclmulqdq $0x10,%xmm9,%xmm5,%xmm2 + vpxord %xmm2,%xmm1,%xmm1 + vpclmulqdq $0x01,%xmm0,%xmm10,%xmm2 + vpshufd $0x4e,%xmm0,%xmm0 + vpternlogd $0x96,%xmm2,%xmm0,%xmm1 + vpclmulqdq $0x11,%xmm9,%xmm5,%xmm5 + vpclmulqdq $0x01,%xmm1,%xmm10,%xmm0 + vpshufd $0x4e,%xmm1,%xmm1 + vpternlogd $0x96,%xmm0,%xmm1,%xmm5 + + addq $16,%rdx + subq $16,%rcx + jnz .Laad_loop_blockbyblock + +.Laad_done: + + vpshufb %xmm4,%xmm5,%xmm5 + vmovdqu %xmm5,(%rdi) + + vzeroupper + ret + +.cfi_endproc +.size gcm_ghash_vpclmulqdq_avx512, . 
- gcm_ghash_vpclmulqdq_avx512 +.globl aes_gcm_enc_update_vaes_avx512 +.hidden aes_gcm_enc_update_vaes_avx512 +.type aes_gcm_enc_update_vaes_avx512,@function +.align 32 +aes_gcm_enc_update_vaes_avx512: +.cfi_startproc + +_CET_ENDBR + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-16 + + movq 16(%rsp),%r12 +#ifdef BORINGSSL_DISPATCH_TEST +.extern BORINGSSL_function_hit +.hidden BORINGSSL_function_hit + movb $1,BORINGSSL_function_hit+7(%rip) +#endif + + vbroadcasti32x4 .Lbswap_mask(%rip),%zmm8 + vbroadcasti32x4 .Lgfpoly(%rip),%zmm31 + + + + vmovdqu (%r12),%xmm10 + vpshufb %xmm8,%xmm10,%xmm10 + vbroadcasti32x4 (%r8),%zmm12 + vpshufb %zmm8,%zmm12,%zmm12 + + + + movl 240(%rcx),%r10d + leal -20(,%r10,4),%r10d + + + + + leaq 96(%rcx,%r10,4),%r11 + vbroadcasti32x4 (%rcx),%zmm13 + vbroadcasti32x4 (%r11),%zmm14 + + + vpaddd .Lctr_pattern(%rip),%zmm12,%zmm12 + + + vbroadcasti32x4 .Linc_4blocks(%rip),%zmm11 + + + + cmpq $256,%rdx + jb .Lcrypt_loop_4x_done__func1 + + + vmovdqu8 256-256(%r9),%zmm27 + vmovdqu8 256-192(%r9),%zmm28 + vmovdqu8 256-128(%r9),%zmm29 + vmovdqu8 256-64(%r9),%zmm30 + + + + + vpshufb %zmm8,%zmm12,%zmm0 + vpaddd %zmm11,%zmm12,%zmm12 + vpshufb %zmm8,%zmm12,%zmm1 + vpaddd %zmm11,%zmm12,%zmm12 + vpshufb %zmm8,%zmm12,%zmm2 + vpaddd %zmm11,%zmm12,%zmm12 + vpshufb %zmm8,%zmm12,%zmm3 + vpaddd %zmm11,%zmm12,%zmm12 + + + vpxord %zmm13,%zmm0,%zmm0 + vpxord %zmm13,%zmm1,%zmm1 + vpxord %zmm13,%zmm2,%zmm2 + vpxord %zmm13,%zmm3,%zmm3 + + leaq 16(%rcx),%rax +.Lvaesenc_loop_first_4_vecs__func1: + vbroadcasti32x4 (%rax),%zmm9 + vaesenc %zmm9,%zmm0,%zmm0 + vaesenc %zmm9,%zmm1,%zmm1 + vaesenc %zmm9,%zmm2,%zmm2 + vaesenc %zmm9,%zmm3,%zmm3 + + addq $16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_first_4_vecs__func1 + vpxord 0(%rdi),%zmm14,%zmm4 + vpxord 64(%rdi),%zmm14,%zmm5 + vpxord 128(%rdi),%zmm14,%zmm6 + vpxord 192(%rdi),%zmm14,%zmm7 + vaesenclast %zmm4,%zmm0,%zmm4 + vaesenclast %zmm5,%zmm1,%zmm5 + vaesenclast %zmm6,%zmm2,%zmm6 + vaesenclast %zmm7,%zmm3,%zmm7 + 
vmovdqu8 %zmm4,0(%rsi) + vmovdqu8 %zmm5,64(%rsi) + vmovdqu8 %zmm6,128(%rsi) + vmovdqu8 %zmm7,192(%rsi) + + addq $256,%rdi + addq $256,%rsi + subq $256,%rdx + cmpq $256,%rdx + jb .Lghash_last_ciphertext_4x__func1 + + vbroadcasti32x4 -144(%r11),%zmm15 + vbroadcasti32x4 -128(%r11),%zmm16 + vbroadcasti32x4 -112(%r11),%zmm17 + vbroadcasti32x4 -96(%r11),%zmm18 + vbroadcasti32x4 -80(%r11),%zmm19 + vbroadcasti32x4 -64(%r11),%zmm20 + vbroadcasti32x4 -48(%r11),%zmm21 + vbroadcasti32x4 -32(%r11),%zmm22 + vbroadcasti32x4 -16(%r11),%zmm23 + +.Lcrypt_loop_4x__func1: + + + + vpshufb %zmm8,%zmm12,%zmm0 + vpaddd %zmm11,%zmm12,%zmm12 + vpshufb %zmm8,%zmm12,%zmm1 + vpaddd %zmm11,%zmm12,%zmm12 + vpshufb %zmm8,%zmm12,%zmm2 + vpaddd %zmm11,%zmm12,%zmm12 + vpshufb %zmm8,%zmm12,%zmm3 + vpaddd %zmm11,%zmm12,%zmm12 + + + vpxord %zmm13,%zmm0,%zmm0 + vpxord %zmm13,%zmm1,%zmm1 + vpxord %zmm13,%zmm2,%zmm2 + vpxord %zmm13,%zmm3,%zmm3 + + cmpl $24,%r10d + jl .Laes128__func1 + je .Laes192__func1 + + vbroadcasti32x4 -208(%r11),%zmm9 + vaesenc %zmm9,%zmm0,%zmm0 + vaesenc %zmm9,%zmm1,%zmm1 + vaesenc %zmm9,%zmm2,%zmm2 + vaesenc %zmm9,%zmm3,%zmm3 + + vbroadcasti32x4 -192(%r11),%zmm9 + vaesenc %zmm9,%zmm0,%zmm0 + vaesenc %zmm9,%zmm1,%zmm1 + vaesenc %zmm9,%zmm2,%zmm2 + vaesenc %zmm9,%zmm3,%zmm3 + +.Laes192__func1: + vbroadcasti32x4 -176(%r11),%zmm9 + vaesenc %zmm9,%zmm0,%zmm0 + vaesenc %zmm9,%zmm1,%zmm1 + vaesenc %zmm9,%zmm2,%zmm2 + vaesenc %zmm9,%zmm3,%zmm3 + + vbroadcasti32x4 -160(%r11),%zmm9 + vaesenc %zmm9,%zmm0,%zmm0 + vaesenc %zmm9,%zmm1,%zmm1 + vaesenc %zmm9,%zmm2,%zmm2 + vaesenc %zmm9,%zmm3,%zmm3 + +.Laes128__func1: + + + + + prefetcht0 512+0(%rdi) + prefetcht0 512+64(%rdi) + prefetcht0 512+128(%rdi) + prefetcht0 512+192(%rdi) + + + + + vpshufb %zmm8,%zmm4,%zmm4 + vpxord %zmm10,%zmm4,%zmm4 + vpshufb %zmm8,%zmm5,%zmm5 + vpshufb %zmm8,%zmm6,%zmm6 + + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm1,%zmm1 + vaesenc %zmm15,%zmm2,%zmm2 + vaesenc %zmm15,%zmm3,%zmm3 + + vpshufb %zmm8,%zmm7,%zmm7 + 
vpclmulqdq $0x00,%zmm27,%zmm4,%zmm10 + vpclmulqdq $0x00,%zmm28,%zmm5,%zmm24 + vpclmulqdq $0x00,%zmm29,%zmm6,%zmm25 + + vaesenc %zmm16,%zmm0,%zmm0 + vaesenc %zmm16,%zmm1,%zmm1 + vaesenc %zmm16,%zmm2,%zmm2 + vaesenc %zmm16,%zmm3,%zmm3 + + vpxord %zmm24,%zmm10,%zmm10 + vpclmulqdq $0x00,%zmm30,%zmm7,%zmm26 + vpternlogd $0x96,%zmm26,%zmm25,%zmm10 + vpclmulqdq $0x01,%zmm27,%zmm4,%zmm24 + + vaesenc %zmm17,%zmm0,%zmm0 + vaesenc %zmm17,%zmm1,%zmm1 + vaesenc %zmm17,%zmm2,%zmm2 + vaesenc %zmm17,%zmm3,%zmm3 + + vpclmulqdq $0x01,%zmm28,%zmm5,%zmm25 + vpclmulqdq $0x01,%zmm29,%zmm6,%zmm26 + vpternlogd $0x96,%zmm26,%zmm25,%zmm24 + vpclmulqdq $0x01,%zmm30,%zmm7,%zmm25 + + vaesenc %zmm18,%zmm0,%zmm0 + vaesenc %zmm18,%zmm1,%zmm1 + vaesenc %zmm18,%zmm2,%zmm2 + vaesenc %zmm18,%zmm3,%zmm3 + + vpclmulqdq $0x10,%zmm27,%zmm4,%zmm26 + vpternlogd $0x96,%zmm26,%zmm25,%zmm24 + vpclmulqdq $0x10,%zmm28,%zmm5,%zmm25 + vpclmulqdq $0x10,%zmm29,%zmm6,%zmm26 + + vaesenc %zmm19,%zmm0,%zmm0 + vaesenc %zmm19,%zmm1,%zmm1 + vaesenc %zmm19,%zmm2,%zmm2 + vaesenc %zmm19,%zmm3,%zmm3 + + vpternlogd $0x96,%zmm26,%zmm25,%zmm24 + vpclmulqdq $0x01,%zmm10,%zmm31,%zmm26 + vpclmulqdq $0x10,%zmm30,%zmm7,%zmm25 + vpxord %zmm25,%zmm24,%zmm24 + + vaesenc %zmm20,%zmm0,%zmm0 + vaesenc %zmm20,%zmm1,%zmm1 + vaesenc %zmm20,%zmm2,%zmm2 + vaesenc %zmm20,%zmm3,%zmm3 + + vpshufd $0x4e,%zmm10,%zmm10 + vpclmulqdq $0x11,%zmm27,%zmm4,%zmm4 + vpclmulqdq $0x11,%zmm28,%zmm5,%zmm5 + vpclmulqdq $0x11,%zmm29,%zmm6,%zmm6 + + vaesenc %zmm21,%zmm0,%zmm0 + vaesenc %zmm21,%zmm1,%zmm1 + vaesenc %zmm21,%zmm2,%zmm2 + vaesenc %zmm21,%zmm3,%zmm3 + + vpternlogd $0x96,%zmm26,%zmm10,%zmm24 + vpclmulqdq $0x11,%zmm30,%zmm7,%zmm7 + vpternlogd $0x96,%zmm6,%zmm5,%zmm4 + vpclmulqdq $0x01,%zmm24,%zmm31,%zmm25 + + vaesenc %zmm22,%zmm0,%zmm0 + vaesenc %zmm22,%zmm1,%zmm1 + vaesenc %zmm22,%zmm2,%zmm2 + vaesenc %zmm22,%zmm3,%zmm3 + + vpxord %zmm7,%zmm4,%zmm10 + vpshufd $0x4e,%zmm24,%zmm24 + vpternlogd $0x96,%zmm25,%zmm24,%zmm10 + + vaesenc %zmm23,%zmm0,%zmm0 + 
vaesenc %zmm23,%zmm1,%zmm1 + vaesenc %zmm23,%zmm2,%zmm2 + vaesenc %zmm23,%zmm3,%zmm3 + + + vextracti32x4 $1,%zmm10,%xmm4 + vextracti32x4 $2,%zmm10,%xmm5 + vextracti32x4 $3,%zmm10,%xmm6 + vpxord %xmm4,%xmm10,%xmm10 + vpternlogd $0x96,%xmm5,%xmm6,%xmm10 + + vpxord 0(%rdi),%zmm14,%zmm4 + vpxord 64(%rdi),%zmm14,%zmm5 + vpxord 128(%rdi),%zmm14,%zmm6 + vpxord 192(%rdi),%zmm14,%zmm7 + vaesenclast %zmm4,%zmm0,%zmm4 + vaesenclast %zmm5,%zmm1,%zmm5 + vaesenclast %zmm6,%zmm2,%zmm6 + vaesenclast %zmm7,%zmm3,%zmm7 + vmovdqu8 %zmm4,0(%rsi) + vmovdqu8 %zmm5,64(%rsi) + vmovdqu8 %zmm6,128(%rsi) + vmovdqu8 %zmm7,192(%rsi) + + addq $256,%rdi + addq $256,%rsi + subq $256,%rdx + cmpq $256,%rdx + jae .Lcrypt_loop_4x__func1 +.Lghash_last_ciphertext_4x__func1: + vpshufb %zmm8,%zmm4,%zmm4 + vpxord %zmm10,%zmm4,%zmm4 + vpshufb %zmm8,%zmm5,%zmm5 + vpshufb %zmm8,%zmm6,%zmm6 + vpshufb %zmm8,%zmm7,%zmm7 + vpclmulqdq $0x00,%zmm27,%zmm4,%zmm10 + vpclmulqdq $0x00,%zmm28,%zmm5,%zmm24 + vpclmulqdq $0x00,%zmm29,%zmm6,%zmm25 + vpxord %zmm24,%zmm10,%zmm10 + vpclmulqdq $0x00,%zmm30,%zmm7,%zmm26 + vpternlogd $0x96,%zmm26,%zmm25,%zmm10 + vpclmulqdq $0x01,%zmm27,%zmm4,%zmm24 + vpclmulqdq $0x01,%zmm28,%zmm5,%zmm25 + vpclmulqdq $0x01,%zmm29,%zmm6,%zmm26 + vpternlogd $0x96,%zmm26,%zmm25,%zmm24 + vpclmulqdq $0x01,%zmm30,%zmm7,%zmm25 + vpclmulqdq $0x10,%zmm27,%zmm4,%zmm26 + vpternlogd $0x96,%zmm26,%zmm25,%zmm24 + vpclmulqdq $0x10,%zmm28,%zmm5,%zmm25 + vpclmulqdq $0x10,%zmm29,%zmm6,%zmm26 + vpternlogd $0x96,%zmm26,%zmm25,%zmm24 + vpclmulqdq $0x01,%zmm10,%zmm31,%zmm26 + vpclmulqdq $0x10,%zmm30,%zmm7,%zmm25 + vpxord %zmm25,%zmm24,%zmm24 + vpshufd $0x4e,%zmm10,%zmm10 + vpclmulqdq $0x11,%zmm27,%zmm4,%zmm4 + vpclmulqdq $0x11,%zmm28,%zmm5,%zmm5 + vpclmulqdq $0x11,%zmm29,%zmm6,%zmm6 + vpternlogd $0x96,%zmm26,%zmm10,%zmm24 + vpclmulqdq $0x11,%zmm30,%zmm7,%zmm7 + vpternlogd $0x96,%zmm6,%zmm5,%zmm4 + vpclmulqdq $0x01,%zmm24,%zmm31,%zmm25 + vpxord %zmm7,%zmm4,%zmm10 + vpshufd $0x4e,%zmm24,%zmm24 + vpternlogd 
$0x96,%zmm25,%zmm24,%zmm10 + vextracti32x4 $1,%zmm10,%xmm4 + vextracti32x4 $2,%zmm10,%xmm5 + vextracti32x4 $3,%zmm10,%xmm6 + vpxord %xmm4,%xmm10,%xmm10 + vpternlogd $0x96,%xmm5,%xmm6,%xmm10 + +.Lcrypt_loop_4x_done__func1: + + testq %rdx,%rdx + jz .Ldone__func1 + + + + + + + + + + + + + + + + + + + + + movq %rdx,%rax + negq %rax + andq $-16,%rax + leaq 256(%r9,%rax,1),%r8 + vpxor %xmm4,%xmm4,%xmm4 + vpxor %xmm5,%xmm5,%xmm5 + vpxor %xmm6,%xmm6,%xmm6 + + cmpq $64,%rdx + jb .Lpartial_vec__func1 + +.Lcrypt_loop_1x__func1: + + + + vpshufb %zmm8,%zmm12,%zmm0 + vpaddd %zmm11,%zmm12,%zmm12 + vpxord %zmm13,%zmm0,%zmm0 + leaq 16(%rcx),%rax +.Lvaesenc_loop_tail_full_vec__func1: + vbroadcasti32x4 (%rax),%zmm9 + vaesenc %zmm9,%zmm0,%zmm0 + addq $16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_tail_full_vec__func1 + vaesenclast %zmm14,%zmm0,%zmm0 + + + vmovdqu8 (%rdi),%zmm1 + vpxord %zmm1,%zmm0,%zmm0 + vmovdqu8 %zmm0,(%rsi) + + + vmovdqu8 (%r8),%zmm30 + vpshufb %zmm8,%zmm0,%zmm0 + vpxord %zmm10,%zmm0,%zmm0 + vpclmulqdq $0x00,%zmm30,%zmm0,%zmm7 + vpclmulqdq $0x01,%zmm30,%zmm0,%zmm1 + vpclmulqdq $0x10,%zmm30,%zmm0,%zmm2 + vpclmulqdq $0x11,%zmm30,%zmm0,%zmm3 + vpxord %zmm7,%zmm4,%zmm4 + vpternlogd $0x96,%zmm2,%zmm1,%zmm5 + vpxord %zmm3,%zmm6,%zmm6 + + vpxor %xmm10,%xmm10,%xmm10 + + addq $64,%r8 + addq $64,%rdi + addq $64,%rsi + subq $64,%rdx + cmpq $64,%rdx + jae .Lcrypt_loop_1x__func1 + + testq %rdx,%rdx + jz .Lreduce__func1 + +.Lpartial_vec__func1: + + + + + movq $-1,%rax + bzhiq %rdx,%rax,%rax + kmovq %rax,%k1 + addq $15,%rdx + andq $-16,%rdx + movq $-1,%rax + bzhiq %rdx,%rax,%rax + kmovq %rax,%k2 + + + + vpshufb %zmm8,%zmm12,%zmm0 + vpxord %zmm13,%zmm0,%zmm0 + leaq 16(%rcx),%rax +.Lvaesenc_loop_tail_partialvec__func1: + vbroadcasti32x4 (%rax),%zmm9 + vaesenc %zmm9,%zmm0,%zmm0 + addq $16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_tail_partialvec__func1 + vaesenclast %zmm14,%zmm0,%zmm0 + + + vmovdqu8 (%rdi),%zmm1{%k1}{z} + vpxord %zmm1,%zmm0,%zmm0 + vmovdqu8 %zmm0,(%rsi){%k1} + + + 
+ + + + + + + + + + + vmovdqu8 (%r8),%zmm30{%k2}{z} + vmovdqu8 %zmm0,%zmm1{%k1}{z} + vpshufb %zmm8,%zmm1,%zmm0 + vpxord %zmm10,%zmm0,%zmm0 + vpclmulqdq $0x00,%zmm30,%zmm0,%zmm7 + vpclmulqdq $0x01,%zmm30,%zmm0,%zmm1 + vpclmulqdq $0x10,%zmm30,%zmm0,%zmm2 + vpclmulqdq $0x11,%zmm30,%zmm0,%zmm3 + vpxord %zmm7,%zmm4,%zmm4 + vpternlogd $0x96,%zmm2,%zmm1,%zmm5 + vpxord %zmm3,%zmm6,%zmm6 + + +.Lreduce__func1: + + vpclmulqdq $0x01,%zmm4,%zmm31,%zmm0 + vpshufd $0x4e,%zmm4,%zmm4 + vpternlogd $0x96,%zmm0,%zmm4,%zmm5 + vpclmulqdq $0x01,%zmm5,%zmm31,%zmm0 + vpshufd $0x4e,%zmm5,%zmm5 + vpternlogd $0x96,%zmm0,%zmm5,%zmm6 + + vextracti32x4 $1,%zmm6,%xmm0 + vextracti32x4 $2,%zmm6,%xmm1 + vextracti32x4 $3,%zmm6,%xmm2 + vpxord %xmm0,%xmm6,%xmm10 + vpternlogd $0x96,%xmm1,%xmm2,%xmm10 + + +.Ldone__func1: + + vpshufb %xmm8,%xmm10,%xmm10 + vmovdqu %xmm10,(%r12) + + vzeroupper + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + ret + +.cfi_endproc +.size aes_gcm_enc_update_vaes_avx512, . - aes_gcm_enc_update_vaes_avx512 +.globl aes_gcm_dec_update_vaes_avx512 +.hidden aes_gcm_dec_update_vaes_avx512 +.type aes_gcm_dec_update_vaes_avx512,@function +.align 32 +aes_gcm_dec_update_vaes_avx512: +.cfi_startproc + +_CET_ENDBR + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-16 + + movq 16(%rsp),%r12 + + vbroadcasti32x4 .Lbswap_mask(%rip),%zmm8 + vbroadcasti32x4 .Lgfpoly(%rip),%zmm31 + + + + vmovdqu (%r12),%xmm10 + vpshufb %xmm8,%xmm10,%xmm10 + vbroadcasti32x4 (%r8),%zmm12 + vpshufb %zmm8,%zmm12,%zmm12 + + + + movl 240(%rcx),%r10d + leal -20(,%r10,4),%r10d + + + + + leaq 96(%rcx,%r10,4),%r11 + vbroadcasti32x4 (%rcx),%zmm13 + vbroadcasti32x4 (%r11),%zmm14 + + + vpaddd .Lctr_pattern(%rip),%zmm12,%zmm12 + + + vbroadcasti32x4 .Linc_4blocks(%rip),%zmm11 + + + + cmpq $256,%rdx + jb .Lcrypt_loop_4x_done__func2 + + + vmovdqu8 256-256(%r9),%zmm27 + vmovdqu8 256-192(%r9),%zmm28 + vmovdqu8 256-128(%r9),%zmm29 + vmovdqu8 256-64(%r9),%zmm30 + + vbroadcasti32x4 -144(%r11),%zmm15 + vbroadcasti32x4 
-128(%r11),%zmm16 + vbroadcasti32x4 -112(%r11),%zmm17 + vbroadcasti32x4 -96(%r11),%zmm18 + vbroadcasti32x4 -80(%r11),%zmm19 + vbroadcasti32x4 -64(%r11),%zmm20 + vbroadcasti32x4 -48(%r11),%zmm21 + vbroadcasti32x4 -32(%r11),%zmm22 + vbroadcasti32x4 -16(%r11),%zmm23 + +.Lcrypt_loop_4x__func2: + vmovdqu8 0(%rdi),%zmm4 + vmovdqu8 64(%rdi),%zmm5 + vmovdqu8 128(%rdi),%zmm6 + vmovdqu8 192(%rdi),%zmm7 + + + + vpshufb %zmm8,%zmm12,%zmm0 + vpaddd %zmm11,%zmm12,%zmm12 + vpshufb %zmm8,%zmm12,%zmm1 + vpaddd %zmm11,%zmm12,%zmm12 + vpshufb %zmm8,%zmm12,%zmm2 + vpaddd %zmm11,%zmm12,%zmm12 + vpshufb %zmm8,%zmm12,%zmm3 + vpaddd %zmm11,%zmm12,%zmm12 + + + vpxord %zmm13,%zmm0,%zmm0 + vpxord %zmm13,%zmm1,%zmm1 + vpxord %zmm13,%zmm2,%zmm2 + vpxord %zmm13,%zmm3,%zmm3 + + cmpl $24,%r10d + jl .Laes128__func2 + je .Laes192__func2 + + vbroadcasti32x4 -208(%r11),%zmm9 + vaesenc %zmm9,%zmm0,%zmm0 + vaesenc %zmm9,%zmm1,%zmm1 + vaesenc %zmm9,%zmm2,%zmm2 + vaesenc %zmm9,%zmm3,%zmm3 + + vbroadcasti32x4 -192(%r11),%zmm9 + vaesenc %zmm9,%zmm0,%zmm0 + vaesenc %zmm9,%zmm1,%zmm1 + vaesenc %zmm9,%zmm2,%zmm2 + vaesenc %zmm9,%zmm3,%zmm3 + +.Laes192__func2: + vbroadcasti32x4 -176(%r11),%zmm9 + vaesenc %zmm9,%zmm0,%zmm0 + vaesenc %zmm9,%zmm1,%zmm1 + vaesenc %zmm9,%zmm2,%zmm2 + vaesenc %zmm9,%zmm3,%zmm3 + + vbroadcasti32x4 -160(%r11),%zmm9 + vaesenc %zmm9,%zmm0,%zmm0 + vaesenc %zmm9,%zmm1,%zmm1 + vaesenc %zmm9,%zmm2,%zmm2 + vaesenc %zmm9,%zmm3,%zmm3 + +.Laes128__func2: + + + + + prefetcht0 512+0(%rdi) + prefetcht0 512+64(%rdi) + prefetcht0 512+128(%rdi) + prefetcht0 512+192(%rdi) + + + + + vpshufb %zmm8,%zmm4,%zmm4 + vpxord %zmm10,%zmm4,%zmm4 + vpshufb %zmm8,%zmm5,%zmm5 + vpshufb %zmm8,%zmm6,%zmm6 + + vaesenc %zmm15,%zmm0,%zmm0 + vaesenc %zmm15,%zmm1,%zmm1 + vaesenc %zmm15,%zmm2,%zmm2 + vaesenc %zmm15,%zmm3,%zmm3 + + vpshufb %zmm8,%zmm7,%zmm7 + vpclmulqdq $0x00,%zmm27,%zmm4,%zmm10 + vpclmulqdq $0x00,%zmm28,%zmm5,%zmm24 + vpclmulqdq $0x00,%zmm29,%zmm6,%zmm25 + + vaesenc %zmm16,%zmm0,%zmm0 + vaesenc 
%zmm16,%zmm1,%zmm1 + vaesenc %zmm16,%zmm2,%zmm2 + vaesenc %zmm16,%zmm3,%zmm3 + + vpxord %zmm24,%zmm10,%zmm10 + vpclmulqdq $0x00,%zmm30,%zmm7,%zmm26 + vpternlogd $0x96,%zmm26,%zmm25,%zmm10 + vpclmulqdq $0x01,%zmm27,%zmm4,%zmm24 + + vaesenc %zmm17,%zmm0,%zmm0 + vaesenc %zmm17,%zmm1,%zmm1 + vaesenc %zmm17,%zmm2,%zmm2 + vaesenc %zmm17,%zmm3,%zmm3 + + vpclmulqdq $0x01,%zmm28,%zmm5,%zmm25 + vpclmulqdq $0x01,%zmm29,%zmm6,%zmm26 + vpternlogd $0x96,%zmm26,%zmm25,%zmm24 + vpclmulqdq $0x01,%zmm30,%zmm7,%zmm25 + + vaesenc %zmm18,%zmm0,%zmm0 + vaesenc %zmm18,%zmm1,%zmm1 + vaesenc %zmm18,%zmm2,%zmm2 + vaesenc %zmm18,%zmm3,%zmm3 + + vpclmulqdq $0x10,%zmm27,%zmm4,%zmm26 + vpternlogd $0x96,%zmm26,%zmm25,%zmm24 + vpclmulqdq $0x10,%zmm28,%zmm5,%zmm25 + vpclmulqdq $0x10,%zmm29,%zmm6,%zmm26 + + vaesenc %zmm19,%zmm0,%zmm0 + vaesenc %zmm19,%zmm1,%zmm1 + vaesenc %zmm19,%zmm2,%zmm2 + vaesenc %zmm19,%zmm3,%zmm3 + + vpternlogd $0x96,%zmm26,%zmm25,%zmm24 + vpclmulqdq $0x01,%zmm10,%zmm31,%zmm26 + vpclmulqdq $0x10,%zmm30,%zmm7,%zmm25 + vpxord %zmm25,%zmm24,%zmm24 + + vaesenc %zmm20,%zmm0,%zmm0 + vaesenc %zmm20,%zmm1,%zmm1 + vaesenc %zmm20,%zmm2,%zmm2 + vaesenc %zmm20,%zmm3,%zmm3 + + vpshufd $0x4e,%zmm10,%zmm10 + vpclmulqdq $0x11,%zmm27,%zmm4,%zmm4 + vpclmulqdq $0x11,%zmm28,%zmm5,%zmm5 + vpclmulqdq $0x11,%zmm29,%zmm6,%zmm6 + + vaesenc %zmm21,%zmm0,%zmm0 + vaesenc %zmm21,%zmm1,%zmm1 + vaesenc %zmm21,%zmm2,%zmm2 + vaesenc %zmm21,%zmm3,%zmm3 + + vpternlogd $0x96,%zmm26,%zmm10,%zmm24 + vpclmulqdq $0x11,%zmm30,%zmm7,%zmm7 + vpternlogd $0x96,%zmm6,%zmm5,%zmm4 + vpclmulqdq $0x01,%zmm24,%zmm31,%zmm25 + + vaesenc %zmm22,%zmm0,%zmm0 + vaesenc %zmm22,%zmm1,%zmm1 + vaesenc %zmm22,%zmm2,%zmm2 + vaesenc %zmm22,%zmm3,%zmm3 + + vpxord %zmm7,%zmm4,%zmm10 + vpshufd $0x4e,%zmm24,%zmm24 + vpternlogd $0x96,%zmm25,%zmm24,%zmm10 + + vaesenc %zmm23,%zmm0,%zmm0 + vaesenc %zmm23,%zmm1,%zmm1 + vaesenc %zmm23,%zmm2,%zmm2 + vaesenc %zmm23,%zmm3,%zmm3 + + + vextracti32x4 $1,%zmm10,%xmm4 + vextracti32x4 $2,%zmm10,%xmm5 + 
vextracti32x4 $3,%zmm10,%xmm6 + vpxord %xmm4,%xmm10,%xmm10 + vpternlogd $0x96,%xmm5,%xmm6,%xmm10 + + vpxord 0(%rdi),%zmm14,%zmm4 + vpxord 64(%rdi),%zmm14,%zmm5 + vpxord 128(%rdi),%zmm14,%zmm6 + vpxord 192(%rdi),%zmm14,%zmm7 + vaesenclast %zmm4,%zmm0,%zmm4 + vaesenclast %zmm5,%zmm1,%zmm5 + vaesenclast %zmm6,%zmm2,%zmm6 + vaesenclast %zmm7,%zmm3,%zmm7 + vmovdqu8 %zmm4,0(%rsi) + vmovdqu8 %zmm5,64(%rsi) + vmovdqu8 %zmm6,128(%rsi) + vmovdqu8 %zmm7,192(%rsi) + + addq $256,%rdi + addq $256,%rsi + subq $256,%rdx + cmpq $256,%rdx + jae .Lcrypt_loop_4x__func2 +.Lcrypt_loop_4x_done__func2: + + testq %rdx,%rdx + jz .Ldone__func2 + + + + + + + + + + + + + + + + + + + + + movq %rdx,%rax + negq %rax + andq $-16,%rax + leaq 256(%r9,%rax,1),%r8 + vpxor %xmm4,%xmm4,%xmm4 + vpxor %xmm5,%xmm5,%xmm5 + vpxor %xmm6,%xmm6,%xmm6 + + cmpq $64,%rdx + jb .Lpartial_vec__func2 + +.Lcrypt_loop_1x__func2: + + + + vpshufb %zmm8,%zmm12,%zmm0 + vpaddd %zmm11,%zmm12,%zmm12 + vpxord %zmm13,%zmm0,%zmm0 + leaq 16(%rcx),%rax +.Lvaesenc_loop_tail_full_vec__func2: + vbroadcasti32x4 (%rax),%zmm9 + vaesenc %zmm9,%zmm0,%zmm0 + addq $16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_tail_full_vec__func2 + vaesenclast %zmm14,%zmm0,%zmm0 + + + vmovdqu8 (%rdi),%zmm1 + vpxord %zmm1,%zmm0,%zmm0 + vmovdqu8 %zmm0,(%rsi) + + + vmovdqu8 (%r8),%zmm30 + vpshufb %zmm8,%zmm1,%zmm0 + vpxord %zmm10,%zmm0,%zmm0 + vpclmulqdq $0x00,%zmm30,%zmm0,%zmm7 + vpclmulqdq $0x01,%zmm30,%zmm0,%zmm1 + vpclmulqdq $0x10,%zmm30,%zmm0,%zmm2 + vpclmulqdq $0x11,%zmm30,%zmm0,%zmm3 + vpxord %zmm7,%zmm4,%zmm4 + vpternlogd $0x96,%zmm2,%zmm1,%zmm5 + vpxord %zmm3,%zmm6,%zmm6 + + vpxor %xmm10,%xmm10,%xmm10 + + addq $64,%r8 + addq $64,%rdi + addq $64,%rsi + subq $64,%rdx + cmpq $64,%rdx + jae .Lcrypt_loop_1x__func2 + + testq %rdx,%rdx + jz .Lreduce__func2 + +.Lpartial_vec__func2: + + + + + movq $-1,%rax + bzhiq %rdx,%rax,%rax + kmovq %rax,%k1 + addq $15,%rdx + andq $-16,%rdx + movq $-1,%rax + bzhiq %rdx,%rax,%rax + kmovq %rax,%k2 + + + + vpshufb 
%zmm8,%zmm12,%zmm0 + vpxord %zmm13,%zmm0,%zmm0 + leaq 16(%rcx),%rax +.Lvaesenc_loop_tail_partialvec__func2: + vbroadcasti32x4 (%rax),%zmm9 + vaesenc %zmm9,%zmm0,%zmm0 + addq $16,%rax + cmpq %rax,%r11 + jne .Lvaesenc_loop_tail_partialvec__func2 + vaesenclast %zmm14,%zmm0,%zmm0 + + + vmovdqu8 (%rdi),%zmm1{%k1}{z} + vpxord %zmm1,%zmm0,%zmm0 + vmovdqu8 %zmm0,(%rsi){%k1} + + + + + + + + + + + + + + vmovdqu8 (%r8),%zmm30{%k2}{z} + + vpshufb %zmm8,%zmm1,%zmm0 + vpxord %zmm10,%zmm0,%zmm0 + vpclmulqdq $0x00,%zmm30,%zmm0,%zmm7 + vpclmulqdq $0x01,%zmm30,%zmm0,%zmm1 + vpclmulqdq $0x10,%zmm30,%zmm0,%zmm2 + vpclmulqdq $0x11,%zmm30,%zmm0,%zmm3 + vpxord %zmm7,%zmm4,%zmm4 + vpternlogd $0x96,%zmm2,%zmm1,%zmm5 + vpxord %zmm3,%zmm6,%zmm6 + + +.Lreduce__func2: + + vpclmulqdq $0x01,%zmm4,%zmm31,%zmm0 + vpshufd $0x4e,%zmm4,%zmm4 + vpternlogd $0x96,%zmm0,%zmm4,%zmm5 + vpclmulqdq $0x01,%zmm5,%zmm31,%zmm0 + vpshufd $0x4e,%zmm5,%zmm5 + vpternlogd $0x96,%zmm0,%zmm5,%zmm6 + + vextracti32x4 $1,%zmm6,%xmm0 + vextracti32x4 $2,%zmm6,%xmm1 + vextracti32x4 $3,%zmm6,%xmm2 + vpxord %xmm0,%xmm6,%xmm10 + vpternlogd $0x96,%xmm1,%xmm2,%xmm10 + + +.Ldone__func2: + + vpshufb %xmm8,%xmm10,%xmm10 + vmovdqu %xmm10,(%r12) + + vzeroupper + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + ret + +.cfi_endproc +.size aes_gcm_dec_update_vaes_avx512, . - aes_gcm_dec_update_vaes_avx512 +#endif diff --git a/third_party/boringssl/gen/bcm/aes-gcm-avx512-x86_64-win.asm b/third_party/boringssl/gen/bcm/aes-gcm-avx512-x86_64-win.asm new file mode 100644 index 00000000..34f4a4f6 --- /dev/null +++ b/third_party/boringssl/gen/bcm/aes-gcm-avx512-x86_64-win.asm @@ -0,0 +1,1527 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_internal_x86_64_win_asm.inc" +%endif +section .rdata rdata align=8 +ALIGN 64 + + +$L$bswap_mask: + DQ 0x08090a0b0c0d0e0f,0x0001020304050607 + + + + + + + + +$L$gfpoly: + DQ 1,0xc200000000000000 + + +$L$gfpoly_and_internal_carrybit: + DQ 1,0xc200000000000001 + + +$L$ctr_pattern: + DQ 0,0 + DQ 1,0 + DQ 2,0 + DQ 3,0 + + +$L$inc_4blocks: + DQ 4,0 + +section .text code align=64 + +global gcm_init_vpclmulqdq_avx512 + +ALIGN 32 +gcm_init_vpclmulqdq_avx512: + + +_CET_ENDBR + + lea r8,[((256-64))+rcx] + + + + vpshufd xmm3,XMMWORD[rdx],0x4e + + + + + + + + + + + + + + + + + vpshufd xmm0,xmm3,0xd3 + vpsrad xmm0,xmm0,31 + vpaddq xmm3,xmm3,xmm3 + + vpternlogd xmm3,xmm0,XMMWORD[$L$gfpoly_and_internal_carrybit],0x78 + + + vbroadcasti32x4 zmm5,ZMMWORD[$L$gfpoly] + + + + + + + + + vpclmulqdq xmm0,xmm3,xmm3,0x00 + vpclmulqdq xmm4,xmm3,xmm3,0x11 + vpclmulqdq xmm1,xmm5,xmm0,0x01 + vpshufd xmm0,xmm0,0x4e + vpxor xmm1,xmm1,xmm0 + vpclmulqdq xmm0,xmm5,xmm1,0x01 + vpshufd xmm1,xmm1,0x4e + vpternlogd xmm4,xmm1,xmm0,0x96 + + + + vinserti128 ymm3,ymm4,xmm3,1 + vinserti128 ymm4,ymm4,xmm4,1 + + + vpclmulqdq ymm0,ymm3,ymm4,0x00 + vpclmulqdq ymm1,ymm3,ymm4,0x01 + vpclmulqdq ymm2,ymm3,ymm4,0x10 + vpxord ymm1,ymm1,ymm2 + vpclmulqdq ymm2,ymm5,ymm0,0x01 + vpshufd ymm0,ymm0,0x4e + vpternlogd ymm1,ymm0,ymm2,0x96 + vpclmulqdq ymm4,ymm3,ymm4,0x11 + vpclmulqdq ymm0,ymm5,ymm1,0x01 + vpshufd ymm1,ymm1,0x4e + vpternlogd ymm4,ymm1,ymm0,0x96 + + vinserti64x4 zmm3,zmm4,ymm3,1 + vshufi64x2 zmm4,zmm4,zmm4,0 + + + vmovdqu8 ZMMWORD[r8],zmm3 + + + + + mov eax,3 +$L$precompute_next: + sub r8,64 + vpclmulqdq zmm0,zmm3,zmm4,0x00 + vpclmulqdq zmm1,zmm3,zmm4,0x01 + vpclmulqdq zmm2,zmm3,zmm4,0x10 + vpxord zmm1,zmm1,zmm2 + vpclmulqdq zmm2,zmm5,zmm0,0x01 + vpshufd zmm0,zmm0,0x4e + vpternlogd zmm1,zmm0,zmm2,0x96 + vpclmulqdq zmm3,zmm3,zmm4,0x11 
+ vpclmulqdq zmm0,zmm5,zmm1,0x01 + vpshufd zmm1,zmm1,0x4e + vpternlogd zmm3,zmm1,zmm0,0x96 + + vmovdqu8 ZMMWORD[r8],zmm3 + dec eax + jnz NEAR $L$precompute_next + + vzeroupper + ret + + + +global gcm_gmult_vpclmulqdq_avx512 + +ALIGN 32 +gcm_gmult_vpclmulqdq_avx512: + +$L$SEH_begin_gcm_gmult_vpclmulqdq_avx512_1: +_CET_ENDBR + sub rsp,24 +$L$SEH_prologue_gcm_gmult_vpclmulqdq_avx512_2: + vmovdqa XMMWORD[rsp],xmm6 +$L$SEH_prologue_gcm_gmult_vpclmulqdq_avx512_3: + +$L$SEH_endprologue_gcm_gmult_vpclmulqdq_avx512_4: + + vmovdqu xmm0,XMMWORD[rcx] + vmovdqu xmm1,XMMWORD[$L$bswap_mask] + vmovdqu xmm2,XMMWORD[((256-16))+rdx] + vmovdqu xmm3,XMMWORD[$L$gfpoly] + vpshufb xmm0,xmm0,xmm1 + + vpclmulqdq xmm4,xmm0,xmm2,0x00 + vpclmulqdq xmm5,xmm0,xmm2,0x01 + vpclmulqdq xmm6,xmm0,xmm2,0x10 + vpxord xmm5,xmm5,xmm6 + vpclmulqdq xmm6,xmm3,xmm4,0x01 + vpshufd xmm4,xmm4,0x4e + vpternlogd xmm5,xmm4,xmm6,0x96 + vpclmulqdq xmm0,xmm0,xmm2,0x11 + vpclmulqdq xmm4,xmm3,xmm5,0x01 + vpshufd xmm5,xmm5,0x4e + vpternlogd xmm0,xmm5,xmm4,0x96 + + + vpshufb xmm0,xmm0,xmm1 + vmovdqu XMMWORD[rcx],xmm0 + + + vmovdqa xmm6,XMMWORD[rsp] + add rsp,24 + ret +$L$SEH_end_gcm_gmult_vpclmulqdq_avx512_5: + + +global gcm_ghash_vpclmulqdq_avx512 + +ALIGN 32 +gcm_ghash_vpclmulqdq_avx512: + +$L$SEH_begin_gcm_ghash_vpclmulqdq_avx512_1: +_CET_ENDBR + sub rsp,136 +$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx512_2: + vmovdqa XMMWORD[rsp],xmm6 +$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx512_3: + vmovdqa XMMWORD[16+rsp],xmm7 +$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx512_4: + vmovdqa XMMWORD[32+rsp],xmm8 +$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx512_5: + vmovdqa XMMWORD[48+rsp],xmm9 +$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx512_6: + vmovdqa XMMWORD[64+rsp],xmm10 +$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx512_7: + vmovdqa XMMWORD[80+rsp],xmm11 +$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx512_8: + vmovdqa XMMWORD[96+rsp],xmm12 +$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx512_9: + vmovdqa XMMWORD[112+rsp],xmm13 
+$L$SEH_prologue_gcm_ghash_vpclmulqdq_avx512_10: + +$L$SEH_endprologue_gcm_ghash_vpclmulqdq_avx512_11: + + + + + vmovdqu xmm4,XMMWORD[$L$bswap_mask] + vmovdqu xmm10,XMMWORD[$L$gfpoly] + + + vmovdqu xmm5,XMMWORD[rcx] + vpshufb xmm5,xmm5,xmm4 + + + cmp r9,64 + jb NEAR $L$aad_blockbyblock + + + + vshufi64x2 zmm4,zmm4,zmm4,0 + vshufi64x2 zmm10,zmm10,zmm10,0 + + + vmovdqu8 zmm9,ZMMWORD[((256-64))+rdx] + + cmp r9,256 + jb NEAR $L$aad_loop_1x + + + vmovdqu8 zmm6,ZMMWORD[((256-256))+rdx] + vmovdqu8 zmm7,ZMMWORD[((256-192))+rdx] + vmovdqu8 zmm8,ZMMWORD[((256-128))+rdx] + + +$L$aad_loop_4x: + vmovdqu8 zmm0,ZMMWORD[r8] + vmovdqu8 zmm1,ZMMWORD[64+r8] + vmovdqu8 zmm2,ZMMWORD[128+r8] + vmovdqu8 zmm3,ZMMWORD[192+r8] + vpshufb zmm0,zmm0,zmm4 + vpxord zmm0,zmm0,zmm5 + vpshufb zmm1,zmm1,zmm4 + vpshufb zmm2,zmm2,zmm4 + vpshufb zmm3,zmm3,zmm4 + vpclmulqdq zmm5,zmm0,zmm6,0x00 + vpclmulqdq zmm11,zmm1,zmm7,0x00 + vpclmulqdq zmm12,zmm2,zmm8,0x00 + vpxord zmm5,zmm5,zmm11 + vpclmulqdq zmm13,zmm3,zmm9,0x00 + vpternlogd zmm5,zmm12,zmm13,0x96 + vpclmulqdq zmm11,zmm0,zmm6,0x01 + vpclmulqdq zmm12,zmm1,zmm7,0x01 + vpclmulqdq zmm13,zmm2,zmm8,0x01 + vpternlogd zmm11,zmm12,zmm13,0x96 + vpclmulqdq zmm12,zmm3,zmm9,0x01 + vpclmulqdq zmm13,zmm0,zmm6,0x10 + vpternlogd zmm11,zmm12,zmm13,0x96 + vpclmulqdq zmm12,zmm1,zmm7,0x10 + vpclmulqdq zmm13,zmm2,zmm8,0x10 + vpternlogd zmm11,zmm12,zmm13,0x96 + vpclmulqdq zmm13,zmm10,zmm5,0x01 + vpclmulqdq zmm12,zmm3,zmm9,0x10 + vpxord zmm11,zmm11,zmm12 + vpshufd zmm5,zmm5,0x4e + vpclmulqdq zmm0,zmm0,zmm6,0x11 + vpclmulqdq zmm1,zmm1,zmm7,0x11 + vpclmulqdq zmm2,zmm2,zmm8,0x11 + vpternlogd zmm11,zmm5,zmm13,0x96 + vpclmulqdq zmm3,zmm3,zmm9,0x11 + vpternlogd zmm0,zmm1,zmm2,0x96 + vpclmulqdq zmm12,zmm10,zmm11,0x01 + vpxord zmm5,zmm0,zmm3 + vpshufd zmm11,zmm11,0x4e + vpternlogd zmm5,zmm11,zmm12,0x96 + vextracti32x4 xmm0,zmm5,1 + vextracti32x4 xmm1,zmm5,2 + vextracti32x4 xmm2,zmm5,3 + vpxord xmm5,xmm5,xmm0 + vpternlogd xmm5,xmm2,xmm1,0x96 + + add r8,256 + sub r9,256 + cmp 
r9,256 + jae NEAR $L$aad_loop_4x + + + cmp r9,64 + jb NEAR $L$aad_large_done +$L$aad_loop_1x: + vmovdqu8 zmm0,ZMMWORD[r8] + vpshufb zmm0,zmm0,zmm4 + vpxord zmm5,zmm5,zmm0 + vpclmulqdq zmm0,zmm5,zmm9,0x00 + vpclmulqdq zmm1,zmm5,zmm9,0x01 + vpclmulqdq zmm2,zmm5,zmm9,0x10 + vpxord zmm1,zmm1,zmm2 + vpclmulqdq zmm2,zmm10,zmm0,0x01 + vpshufd zmm0,zmm0,0x4e + vpternlogd zmm1,zmm0,zmm2,0x96 + vpclmulqdq zmm5,zmm5,zmm9,0x11 + vpclmulqdq zmm0,zmm10,zmm1,0x01 + vpshufd zmm1,zmm1,0x4e + vpternlogd zmm5,zmm1,zmm0,0x96 + + vextracti32x4 xmm0,zmm5,1 + vextracti32x4 xmm1,zmm5,2 + vextracti32x4 xmm2,zmm5,3 + vpxord xmm5,xmm5,xmm0 + vpternlogd xmm5,xmm2,xmm1,0x96 + + add r8,64 + sub r9,64 + cmp r9,64 + jae NEAR $L$aad_loop_1x + +$L$aad_large_done: + + +$L$aad_blockbyblock: + test r9,r9 + jz NEAR $L$aad_done + vmovdqu xmm9,XMMWORD[((256-16))+rdx] +$L$aad_loop_blockbyblock: + vmovdqu xmm0,XMMWORD[r8] + vpshufb xmm0,xmm0,xmm4 + vpxor xmm5,xmm5,xmm0 + vpclmulqdq xmm0,xmm5,xmm9,0x00 + vpclmulqdq xmm1,xmm5,xmm9,0x01 + vpclmulqdq xmm2,xmm5,xmm9,0x10 + vpxord xmm1,xmm1,xmm2 + vpclmulqdq xmm2,xmm10,xmm0,0x01 + vpshufd xmm0,xmm0,0x4e + vpternlogd xmm1,xmm0,xmm2,0x96 + vpclmulqdq xmm5,xmm5,xmm9,0x11 + vpclmulqdq xmm0,xmm10,xmm1,0x01 + vpshufd xmm1,xmm1,0x4e + vpternlogd xmm5,xmm1,xmm0,0x96 + + add r8,16 + sub r9,16 + jnz NEAR $L$aad_loop_blockbyblock + +$L$aad_done: + + vpshufb xmm5,xmm5,xmm4 + vmovdqu XMMWORD[rcx],xmm5 + + vzeroupper + vmovdqa xmm6,XMMWORD[rsp] + vmovdqa xmm7,XMMWORD[16+rsp] + vmovdqa xmm8,XMMWORD[32+rsp] + vmovdqa xmm9,XMMWORD[48+rsp] + vmovdqa xmm10,XMMWORD[64+rsp] + vmovdqa xmm11,XMMWORD[80+rsp] + vmovdqa xmm12,XMMWORD[96+rsp] + vmovdqa xmm13,XMMWORD[112+rsp] + add rsp,136 + ret +$L$SEH_end_gcm_ghash_vpclmulqdq_avx512_12: + + +global aes_gcm_enc_update_vaes_avx512 + +ALIGN 32 +aes_gcm_enc_update_vaes_avx512: + +$L$SEH_begin_aes_gcm_enc_update_vaes_avx512_1: +_CET_ENDBR + push rsi +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx512_2: + push rdi 
+$L$SEH_prologue_aes_gcm_enc_update_vaes_avx512_3: + push r12 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx512_4: + + mov rsi,QWORD[64+rsp] + mov rdi,QWORD[72+rsp] + mov r12,QWORD[80+rsp] + sub rsp,160 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx512_5: + vmovdqa XMMWORD[rsp],xmm6 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx512_6: + vmovdqa XMMWORD[16+rsp],xmm7 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx512_7: + vmovdqa XMMWORD[32+rsp],xmm8 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx512_8: + vmovdqa XMMWORD[48+rsp],xmm9 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx512_9: + vmovdqa XMMWORD[64+rsp],xmm10 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx512_10: + vmovdqa XMMWORD[80+rsp],xmm11 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx512_11: + vmovdqa XMMWORD[96+rsp],xmm12 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx512_12: + vmovdqa XMMWORD[112+rsp],xmm13 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx512_13: + vmovdqa XMMWORD[128+rsp],xmm14 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx512_14: + vmovdqa XMMWORD[144+rsp],xmm15 +$L$SEH_prologue_aes_gcm_enc_update_vaes_avx512_15: + +$L$SEH_endprologue_aes_gcm_enc_update_vaes_avx512_16: +%ifdef BORINGSSL_DISPATCH_TEST +EXTERN BORINGSSL_function_hit + mov BYTE[((BORINGSSL_function_hit+7))],1 +%endif + + vbroadcasti32x4 zmm8,ZMMWORD[$L$bswap_mask] + vbroadcasti32x4 zmm31,ZMMWORD[$L$gfpoly] + + + + vmovdqu xmm10,XMMWORD[r12] + vpshufb xmm10,xmm10,xmm8 + vbroadcasti32x4 zmm12,ZMMWORD[rsi] + vpshufb zmm12,zmm12,zmm8 + + + + mov r10d,DWORD[240+r9] + lea r10d,[((-20))+r10*4] + + + + + lea r11,[96+r10*4+r9] + vbroadcasti32x4 zmm13,ZMMWORD[r9] + vbroadcasti32x4 zmm14,ZMMWORD[r11] + + + vpaddd zmm12,zmm12,ZMMWORD[$L$ctr_pattern] + + + vbroadcasti32x4 zmm11,ZMMWORD[$L$inc_4blocks] + + + + cmp r8,256 + jb NEAR $L$crypt_loop_4x_done__func1 + + + vmovdqu8 zmm27,ZMMWORD[((256-256))+rdi] + vmovdqu8 zmm28,ZMMWORD[((256-192))+rdi] + vmovdqu8 zmm29,ZMMWORD[((256-128))+rdi] + vmovdqu8 zmm30,ZMMWORD[((256-64))+rdi] + + + + + vpshufb zmm0,zmm12,zmm8 
+ vpaddd zmm12,zmm12,zmm11 + vpshufb zmm1,zmm12,zmm8 + vpaddd zmm12,zmm12,zmm11 + vpshufb zmm2,zmm12,zmm8 + vpaddd zmm12,zmm12,zmm11 + vpshufb zmm3,zmm12,zmm8 + vpaddd zmm12,zmm12,zmm11 + + + vpxord zmm0,zmm0,zmm13 + vpxord zmm1,zmm1,zmm13 + vpxord zmm2,zmm2,zmm13 + vpxord zmm3,zmm3,zmm13 + + lea rax,[16+r9] +$L$vaesenc_loop_first_4_vecs__func1: + vbroadcasti32x4 zmm9,ZMMWORD[rax] + vaesenc zmm0,zmm0,zmm9 + vaesenc zmm1,zmm1,zmm9 + vaesenc zmm2,zmm2,zmm9 + vaesenc zmm3,zmm3,zmm9 + + add rax,16 + cmp r11,rax + jne NEAR $L$vaesenc_loop_first_4_vecs__func1 + vpxord zmm4,zmm14,ZMMWORD[rcx] + vpxord zmm5,zmm14,ZMMWORD[64+rcx] + vpxord zmm6,zmm14,ZMMWORD[128+rcx] + vpxord zmm7,zmm14,ZMMWORD[192+rcx] + vaesenclast zmm4,zmm0,zmm4 + vaesenclast zmm5,zmm1,zmm5 + vaesenclast zmm6,zmm2,zmm6 + vaesenclast zmm7,zmm3,zmm7 + vmovdqu8 ZMMWORD[rdx],zmm4 + vmovdqu8 ZMMWORD[64+rdx],zmm5 + vmovdqu8 ZMMWORD[128+rdx],zmm6 + vmovdqu8 ZMMWORD[192+rdx],zmm7 + + add rcx,256 + add rdx,256 + sub r8,256 + cmp r8,256 + jb NEAR $L$ghash_last_ciphertext_4x__func1 + + vbroadcasti32x4 zmm15,ZMMWORD[((-144))+r11] + vbroadcasti32x4 zmm16,ZMMWORD[((-128))+r11] + vbroadcasti32x4 zmm17,ZMMWORD[((-112))+r11] + vbroadcasti32x4 zmm18,ZMMWORD[((-96))+r11] + vbroadcasti32x4 zmm19,ZMMWORD[((-80))+r11] + vbroadcasti32x4 zmm20,ZMMWORD[((-64))+r11] + vbroadcasti32x4 zmm21,ZMMWORD[((-48))+r11] + vbroadcasti32x4 zmm22,ZMMWORD[((-32))+r11] + vbroadcasti32x4 zmm23,ZMMWORD[((-16))+r11] + +$L$crypt_loop_4x__func1: + + + + vpshufb zmm0,zmm12,zmm8 + vpaddd zmm12,zmm12,zmm11 + vpshufb zmm1,zmm12,zmm8 + vpaddd zmm12,zmm12,zmm11 + vpshufb zmm2,zmm12,zmm8 + vpaddd zmm12,zmm12,zmm11 + vpshufb zmm3,zmm12,zmm8 + vpaddd zmm12,zmm12,zmm11 + + + vpxord zmm0,zmm0,zmm13 + vpxord zmm1,zmm1,zmm13 + vpxord zmm2,zmm2,zmm13 + vpxord zmm3,zmm3,zmm13 + + cmp r10d,24 + jl NEAR $L$aes128__func1 + je NEAR $L$aes192__func1 + + vbroadcasti32x4 zmm9,ZMMWORD[((-208))+r11] + vaesenc zmm0,zmm0,zmm9 + vaesenc zmm1,zmm1,zmm9 + vaesenc zmm2,zmm2,zmm9 
+ vaesenc zmm3,zmm3,zmm9 + + vbroadcasti32x4 zmm9,ZMMWORD[((-192))+r11] + vaesenc zmm0,zmm0,zmm9 + vaesenc zmm1,zmm1,zmm9 + vaesenc zmm2,zmm2,zmm9 + vaesenc zmm3,zmm3,zmm9 + +$L$aes192__func1: + vbroadcasti32x4 zmm9,ZMMWORD[((-176))+r11] + vaesenc zmm0,zmm0,zmm9 + vaesenc zmm1,zmm1,zmm9 + vaesenc zmm2,zmm2,zmm9 + vaesenc zmm3,zmm3,zmm9 + + vbroadcasti32x4 zmm9,ZMMWORD[((-160))+r11] + vaesenc zmm0,zmm0,zmm9 + vaesenc zmm1,zmm1,zmm9 + vaesenc zmm2,zmm2,zmm9 + vaesenc zmm3,zmm3,zmm9 + +$L$aes128__func1: + + + + + prefetcht0 [((512+0))+rcx] + prefetcht0 [((512+64))+rcx] + prefetcht0 [((512+128))+rcx] + prefetcht0 [((512+192))+rcx] + + + + + vpshufb zmm4,zmm4,zmm8 + vpxord zmm4,zmm4,zmm10 + vpshufb zmm5,zmm5,zmm8 + vpshufb zmm6,zmm6,zmm8 + + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm1,zmm1,zmm15 + vaesenc zmm2,zmm2,zmm15 + vaesenc zmm3,zmm3,zmm15 + + vpshufb zmm7,zmm7,zmm8 + vpclmulqdq zmm10,zmm4,zmm27,0x00 + vpclmulqdq zmm24,zmm5,zmm28,0x00 + vpclmulqdq zmm25,zmm6,zmm29,0x00 + + vaesenc zmm0,zmm0,zmm16 + vaesenc zmm1,zmm1,zmm16 + vaesenc zmm2,zmm2,zmm16 + vaesenc zmm3,zmm3,zmm16 + + vpxord zmm10,zmm10,zmm24 + vpclmulqdq zmm26,zmm7,zmm30,0x00 + vpternlogd zmm10,zmm25,zmm26,0x96 + vpclmulqdq zmm24,zmm4,zmm27,0x01 + + vaesenc zmm0,zmm0,zmm17 + vaesenc zmm1,zmm1,zmm17 + vaesenc zmm2,zmm2,zmm17 + vaesenc zmm3,zmm3,zmm17 + + vpclmulqdq zmm25,zmm5,zmm28,0x01 + vpclmulqdq zmm26,zmm6,zmm29,0x01 + vpternlogd zmm24,zmm25,zmm26,0x96 + vpclmulqdq zmm25,zmm7,zmm30,0x01 + + vaesenc zmm0,zmm0,zmm18 + vaesenc zmm1,zmm1,zmm18 + vaesenc zmm2,zmm2,zmm18 + vaesenc zmm3,zmm3,zmm18 + + vpclmulqdq zmm26,zmm4,zmm27,0x10 + vpternlogd zmm24,zmm25,zmm26,0x96 + vpclmulqdq zmm25,zmm5,zmm28,0x10 + vpclmulqdq zmm26,zmm6,zmm29,0x10 + + vaesenc zmm0,zmm0,zmm19 + vaesenc zmm1,zmm1,zmm19 + vaesenc zmm2,zmm2,zmm19 + vaesenc zmm3,zmm3,zmm19 + + vpternlogd zmm24,zmm25,zmm26,0x96 + vpclmulqdq zmm26,zmm31,zmm10,0x01 + vpclmulqdq zmm25,zmm7,zmm30,0x10 + vpxord zmm24,zmm24,zmm25 + + vaesenc zmm0,zmm0,zmm20 + vaesenc 
zmm1,zmm1,zmm20 + vaesenc zmm2,zmm2,zmm20 + vaesenc zmm3,zmm3,zmm20 + + vpshufd zmm10,zmm10,0x4e + vpclmulqdq zmm4,zmm4,zmm27,0x11 + vpclmulqdq zmm5,zmm5,zmm28,0x11 + vpclmulqdq zmm6,zmm6,zmm29,0x11 + + vaesenc zmm0,zmm0,zmm21 + vaesenc zmm1,zmm1,zmm21 + vaesenc zmm2,zmm2,zmm21 + vaesenc zmm3,zmm3,zmm21 + + vpternlogd zmm24,zmm10,zmm26,0x96 + vpclmulqdq zmm7,zmm7,zmm30,0x11 + vpternlogd zmm4,zmm5,zmm6,0x96 + vpclmulqdq zmm25,zmm31,zmm24,0x01 + + vaesenc zmm0,zmm0,zmm22 + vaesenc zmm1,zmm1,zmm22 + vaesenc zmm2,zmm2,zmm22 + vaesenc zmm3,zmm3,zmm22 + + vpxord zmm10,zmm4,zmm7 + vpshufd zmm24,zmm24,0x4e + vpternlogd zmm10,zmm24,zmm25,0x96 + + vaesenc zmm0,zmm0,zmm23 + vaesenc zmm1,zmm1,zmm23 + vaesenc zmm2,zmm2,zmm23 + vaesenc zmm3,zmm3,zmm23 + + + vextracti32x4 xmm4,zmm10,1 + vextracti32x4 xmm5,zmm10,2 + vextracti32x4 xmm6,zmm10,3 + vpxord xmm10,xmm10,xmm4 + vpternlogd xmm10,xmm6,xmm5,0x96 + + vpxord zmm4,zmm14,ZMMWORD[rcx] + vpxord zmm5,zmm14,ZMMWORD[64+rcx] + vpxord zmm6,zmm14,ZMMWORD[128+rcx] + vpxord zmm7,zmm14,ZMMWORD[192+rcx] + vaesenclast zmm4,zmm0,zmm4 + vaesenclast zmm5,zmm1,zmm5 + vaesenclast zmm6,zmm2,zmm6 + vaesenclast zmm7,zmm3,zmm7 + vmovdqu8 ZMMWORD[rdx],zmm4 + vmovdqu8 ZMMWORD[64+rdx],zmm5 + vmovdqu8 ZMMWORD[128+rdx],zmm6 + vmovdqu8 ZMMWORD[192+rdx],zmm7 + + add rcx,256 + add rdx,256 + sub r8,256 + cmp r8,256 + jae NEAR $L$crypt_loop_4x__func1 +$L$ghash_last_ciphertext_4x__func1: + vpshufb zmm4,zmm4,zmm8 + vpxord zmm4,zmm4,zmm10 + vpshufb zmm5,zmm5,zmm8 + vpshufb zmm6,zmm6,zmm8 + vpshufb zmm7,zmm7,zmm8 + vpclmulqdq zmm10,zmm4,zmm27,0x00 + vpclmulqdq zmm24,zmm5,zmm28,0x00 + vpclmulqdq zmm25,zmm6,zmm29,0x00 + vpxord zmm10,zmm10,zmm24 + vpclmulqdq zmm26,zmm7,zmm30,0x00 + vpternlogd zmm10,zmm25,zmm26,0x96 + vpclmulqdq zmm24,zmm4,zmm27,0x01 + vpclmulqdq zmm25,zmm5,zmm28,0x01 + vpclmulqdq zmm26,zmm6,zmm29,0x01 + vpternlogd zmm24,zmm25,zmm26,0x96 + vpclmulqdq zmm25,zmm7,zmm30,0x01 + vpclmulqdq zmm26,zmm4,zmm27,0x10 + vpternlogd zmm24,zmm25,zmm26,0x96 + 
vpclmulqdq zmm25,zmm5,zmm28,0x10 + vpclmulqdq zmm26,zmm6,zmm29,0x10 + vpternlogd zmm24,zmm25,zmm26,0x96 + vpclmulqdq zmm26,zmm31,zmm10,0x01 + vpclmulqdq zmm25,zmm7,zmm30,0x10 + vpxord zmm24,zmm24,zmm25 + vpshufd zmm10,zmm10,0x4e + vpclmulqdq zmm4,zmm4,zmm27,0x11 + vpclmulqdq zmm5,zmm5,zmm28,0x11 + vpclmulqdq zmm6,zmm6,zmm29,0x11 + vpternlogd zmm24,zmm10,zmm26,0x96 + vpclmulqdq zmm7,zmm7,zmm30,0x11 + vpternlogd zmm4,zmm5,zmm6,0x96 + vpclmulqdq zmm25,zmm31,zmm24,0x01 + vpxord zmm10,zmm4,zmm7 + vpshufd zmm24,zmm24,0x4e + vpternlogd zmm10,zmm24,zmm25,0x96 + vextracti32x4 xmm4,zmm10,1 + vextracti32x4 xmm5,zmm10,2 + vextracti32x4 xmm6,zmm10,3 + vpxord xmm10,xmm10,xmm4 + vpternlogd xmm10,xmm6,xmm5,0x96 + +$L$crypt_loop_4x_done__func1: + + test r8,r8 + jz NEAR $L$done__func1 + + + + + + + + + + + + + + + + + + + + + mov rax,r8 + neg rax + and rax,-16 + lea rsi,[256+rax*1+rdi] + vpxor xmm4,xmm4,xmm4 + vpxor xmm5,xmm5,xmm5 + vpxor xmm6,xmm6,xmm6 + + cmp r8,64 + jb NEAR $L$partial_vec__func1 + +$L$crypt_loop_1x__func1: + + + + vpshufb zmm0,zmm12,zmm8 + vpaddd zmm12,zmm12,zmm11 + vpxord zmm0,zmm0,zmm13 + lea rax,[16+r9] +$L$vaesenc_loop_tail_full_vec__func1: + vbroadcasti32x4 zmm9,ZMMWORD[rax] + vaesenc zmm0,zmm0,zmm9 + add rax,16 + cmp r11,rax + jne NEAR $L$vaesenc_loop_tail_full_vec__func1 + vaesenclast zmm0,zmm0,zmm14 + + + vmovdqu8 zmm1,ZMMWORD[rcx] + vpxord zmm0,zmm0,zmm1 + vmovdqu8 ZMMWORD[rdx],zmm0 + + + vmovdqu8 zmm30,ZMMWORD[rsi] + vpshufb zmm0,zmm0,zmm8 + vpxord zmm0,zmm0,zmm10 + vpclmulqdq zmm7,zmm0,zmm30,0x00 + vpclmulqdq zmm1,zmm0,zmm30,0x01 + vpclmulqdq zmm2,zmm0,zmm30,0x10 + vpclmulqdq zmm3,zmm0,zmm30,0x11 + vpxord zmm4,zmm4,zmm7 + vpternlogd zmm5,zmm1,zmm2,0x96 + vpxord zmm6,zmm6,zmm3 + + vpxor xmm10,xmm10,xmm10 + + add rsi,64 + add rcx,64 + add rdx,64 + sub r8,64 + cmp r8,64 + jae NEAR $L$crypt_loop_1x__func1 + + test r8,r8 + jz NEAR $L$reduce__func1 + +$L$partial_vec__func1: + + + + + mov rax,-1 + bzhi rax,rax,r8 + kmovq k1,rax + add r8,15 + and r8,-16 + mov 
rax,-1 + bzhi rax,rax,r8 + kmovq k2,rax + + + + vpshufb zmm0,zmm12,zmm8 + vpxord zmm0,zmm0,zmm13 + lea rax,[16+r9] +$L$vaesenc_loop_tail_partialvec__func1: + vbroadcasti32x4 zmm9,ZMMWORD[rax] + vaesenc zmm0,zmm0,zmm9 + add rax,16 + cmp r11,rax + jne NEAR $L$vaesenc_loop_tail_partialvec__func1 + vaesenclast zmm0,zmm0,zmm14 + + + vmovdqu8 zmm1{k1}{z},[rcx] + vpxord zmm0,zmm0,zmm1 + vmovdqu8 ZMMWORD[rdx]{k1},zmm0 + + + + + + + + + + + + + + vmovdqu8 zmm30{k2}{z},[rsi] + vmovdqu8 zmm1{k1}{z},zmm0 + vpshufb zmm0,zmm1,zmm8 + vpxord zmm0,zmm0,zmm10 + vpclmulqdq zmm7,zmm0,zmm30,0x00 + vpclmulqdq zmm1,zmm0,zmm30,0x01 + vpclmulqdq zmm2,zmm0,zmm30,0x10 + vpclmulqdq zmm3,zmm0,zmm30,0x11 + vpxord zmm4,zmm4,zmm7 + vpternlogd zmm5,zmm1,zmm2,0x96 + vpxord zmm6,zmm6,zmm3 + + +$L$reduce__func1: + + vpclmulqdq zmm0,zmm31,zmm4,0x01 + vpshufd zmm4,zmm4,0x4e + vpternlogd zmm5,zmm4,zmm0,0x96 + vpclmulqdq zmm0,zmm31,zmm5,0x01 + vpshufd zmm5,zmm5,0x4e + vpternlogd zmm6,zmm5,zmm0,0x96 + + vextracti32x4 xmm0,zmm6,1 + vextracti32x4 xmm1,zmm6,2 + vextracti32x4 xmm2,zmm6,3 + vpxord xmm10,xmm6,xmm0 + vpternlogd xmm10,xmm2,xmm1,0x96 + + +$L$done__func1: + + vpshufb xmm10,xmm10,xmm8 + vmovdqu XMMWORD[r12],xmm10 + + vzeroupper + vmovdqa xmm6,XMMWORD[rsp] + vmovdqa xmm7,XMMWORD[16+rsp] + vmovdqa xmm8,XMMWORD[32+rsp] + vmovdqa xmm9,XMMWORD[48+rsp] + vmovdqa xmm10,XMMWORD[64+rsp] + vmovdqa xmm11,XMMWORD[80+rsp] + vmovdqa xmm12,XMMWORD[96+rsp] + vmovdqa xmm13,XMMWORD[112+rsp] + vmovdqa xmm14,XMMWORD[128+rsp] + vmovdqa xmm15,XMMWORD[144+rsp] + add rsp,160 + pop r12 + pop rdi + pop rsi + ret +$L$SEH_end_aes_gcm_enc_update_vaes_avx512_17: + + +global aes_gcm_dec_update_vaes_avx512 + +ALIGN 32 +aes_gcm_dec_update_vaes_avx512: + +$L$SEH_begin_aes_gcm_dec_update_vaes_avx512_1: +_CET_ENDBR + push rsi +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx512_2: + push rdi +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx512_3: + push r12 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx512_4: + + mov rsi,QWORD[64+rsp] + mov 
rdi,QWORD[72+rsp] + mov r12,QWORD[80+rsp] + sub rsp,160 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx512_5: + vmovdqa XMMWORD[rsp],xmm6 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx512_6: + vmovdqa XMMWORD[16+rsp],xmm7 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx512_7: + vmovdqa XMMWORD[32+rsp],xmm8 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx512_8: + vmovdqa XMMWORD[48+rsp],xmm9 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx512_9: + vmovdqa XMMWORD[64+rsp],xmm10 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx512_10: + vmovdqa XMMWORD[80+rsp],xmm11 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx512_11: + vmovdqa XMMWORD[96+rsp],xmm12 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx512_12: + vmovdqa XMMWORD[112+rsp],xmm13 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx512_13: + vmovdqa XMMWORD[128+rsp],xmm14 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx512_14: + vmovdqa XMMWORD[144+rsp],xmm15 +$L$SEH_prologue_aes_gcm_dec_update_vaes_avx512_15: + +$L$SEH_endprologue_aes_gcm_dec_update_vaes_avx512_16: + + vbroadcasti32x4 zmm8,ZMMWORD[$L$bswap_mask] + vbroadcasti32x4 zmm31,ZMMWORD[$L$gfpoly] + + + + vmovdqu xmm10,XMMWORD[r12] + vpshufb xmm10,xmm10,xmm8 + vbroadcasti32x4 zmm12,ZMMWORD[rsi] + vpshufb zmm12,zmm12,zmm8 + + + + mov r10d,DWORD[240+r9] + lea r10d,[((-20))+r10*4] + + + + + lea r11,[96+r10*4+r9] + vbroadcasti32x4 zmm13,ZMMWORD[r9] + vbroadcasti32x4 zmm14,ZMMWORD[r11] + + + vpaddd zmm12,zmm12,ZMMWORD[$L$ctr_pattern] + + + vbroadcasti32x4 zmm11,ZMMWORD[$L$inc_4blocks] + + + + cmp r8,256 + jb NEAR $L$crypt_loop_4x_done__func2 + + + vmovdqu8 zmm27,ZMMWORD[((256-256))+rdi] + vmovdqu8 zmm28,ZMMWORD[((256-192))+rdi] + vmovdqu8 zmm29,ZMMWORD[((256-128))+rdi] + vmovdqu8 zmm30,ZMMWORD[((256-64))+rdi] + + vbroadcasti32x4 zmm15,ZMMWORD[((-144))+r11] + vbroadcasti32x4 zmm16,ZMMWORD[((-128))+r11] + vbroadcasti32x4 zmm17,ZMMWORD[((-112))+r11] + vbroadcasti32x4 zmm18,ZMMWORD[((-96))+r11] + vbroadcasti32x4 zmm19,ZMMWORD[((-80))+r11] + vbroadcasti32x4 zmm20,ZMMWORD[((-64))+r11] + vbroadcasti32x4 
zmm21,ZMMWORD[((-48))+r11] + vbroadcasti32x4 zmm22,ZMMWORD[((-32))+r11] + vbroadcasti32x4 zmm23,ZMMWORD[((-16))+r11] + +$L$crypt_loop_4x__func2: + vmovdqu8 zmm4,ZMMWORD[rcx] + vmovdqu8 zmm5,ZMMWORD[64+rcx] + vmovdqu8 zmm6,ZMMWORD[128+rcx] + vmovdqu8 zmm7,ZMMWORD[192+rcx] + + + + vpshufb zmm0,zmm12,zmm8 + vpaddd zmm12,zmm12,zmm11 + vpshufb zmm1,zmm12,zmm8 + vpaddd zmm12,zmm12,zmm11 + vpshufb zmm2,zmm12,zmm8 + vpaddd zmm12,zmm12,zmm11 + vpshufb zmm3,zmm12,zmm8 + vpaddd zmm12,zmm12,zmm11 + + + vpxord zmm0,zmm0,zmm13 + vpxord zmm1,zmm1,zmm13 + vpxord zmm2,zmm2,zmm13 + vpxord zmm3,zmm3,zmm13 + + cmp r10d,24 + jl NEAR $L$aes128__func2 + je NEAR $L$aes192__func2 + + vbroadcasti32x4 zmm9,ZMMWORD[((-208))+r11] + vaesenc zmm0,zmm0,zmm9 + vaesenc zmm1,zmm1,zmm9 + vaesenc zmm2,zmm2,zmm9 + vaesenc zmm3,zmm3,zmm9 + + vbroadcasti32x4 zmm9,ZMMWORD[((-192))+r11] + vaesenc zmm0,zmm0,zmm9 + vaesenc zmm1,zmm1,zmm9 + vaesenc zmm2,zmm2,zmm9 + vaesenc zmm3,zmm3,zmm9 + +$L$aes192__func2: + vbroadcasti32x4 zmm9,ZMMWORD[((-176))+r11] + vaesenc zmm0,zmm0,zmm9 + vaesenc zmm1,zmm1,zmm9 + vaesenc zmm2,zmm2,zmm9 + vaesenc zmm3,zmm3,zmm9 + + vbroadcasti32x4 zmm9,ZMMWORD[((-160))+r11] + vaesenc zmm0,zmm0,zmm9 + vaesenc zmm1,zmm1,zmm9 + vaesenc zmm2,zmm2,zmm9 + vaesenc zmm3,zmm3,zmm9 + +$L$aes128__func2: + + + + + prefetcht0 [((512+0))+rcx] + prefetcht0 [((512+64))+rcx] + prefetcht0 [((512+128))+rcx] + prefetcht0 [((512+192))+rcx] + + + + + vpshufb zmm4,zmm4,zmm8 + vpxord zmm4,zmm4,zmm10 + vpshufb zmm5,zmm5,zmm8 + vpshufb zmm6,zmm6,zmm8 + + vaesenc zmm0,zmm0,zmm15 + vaesenc zmm1,zmm1,zmm15 + vaesenc zmm2,zmm2,zmm15 + vaesenc zmm3,zmm3,zmm15 + + vpshufb zmm7,zmm7,zmm8 + vpclmulqdq zmm10,zmm4,zmm27,0x00 + vpclmulqdq zmm24,zmm5,zmm28,0x00 + vpclmulqdq zmm25,zmm6,zmm29,0x00 + + vaesenc zmm0,zmm0,zmm16 + vaesenc zmm1,zmm1,zmm16 + vaesenc zmm2,zmm2,zmm16 + vaesenc zmm3,zmm3,zmm16 + + vpxord zmm10,zmm10,zmm24 + vpclmulqdq zmm26,zmm7,zmm30,0x00 + vpternlogd zmm10,zmm25,zmm26,0x96 + vpclmulqdq 
zmm24,zmm4,zmm27,0x01 + + vaesenc zmm0,zmm0,zmm17 + vaesenc zmm1,zmm1,zmm17 + vaesenc zmm2,zmm2,zmm17 + vaesenc zmm3,zmm3,zmm17 + + vpclmulqdq zmm25,zmm5,zmm28,0x01 + vpclmulqdq zmm26,zmm6,zmm29,0x01 + vpternlogd zmm24,zmm25,zmm26,0x96 + vpclmulqdq zmm25,zmm7,zmm30,0x01 + + vaesenc zmm0,zmm0,zmm18 + vaesenc zmm1,zmm1,zmm18 + vaesenc zmm2,zmm2,zmm18 + vaesenc zmm3,zmm3,zmm18 + + vpclmulqdq zmm26,zmm4,zmm27,0x10 + vpternlogd zmm24,zmm25,zmm26,0x96 + vpclmulqdq zmm25,zmm5,zmm28,0x10 + vpclmulqdq zmm26,zmm6,zmm29,0x10 + + vaesenc zmm0,zmm0,zmm19 + vaesenc zmm1,zmm1,zmm19 + vaesenc zmm2,zmm2,zmm19 + vaesenc zmm3,zmm3,zmm19 + + vpternlogd zmm24,zmm25,zmm26,0x96 + vpclmulqdq zmm26,zmm31,zmm10,0x01 + vpclmulqdq zmm25,zmm7,zmm30,0x10 + vpxord zmm24,zmm24,zmm25 + + vaesenc zmm0,zmm0,zmm20 + vaesenc zmm1,zmm1,zmm20 + vaesenc zmm2,zmm2,zmm20 + vaesenc zmm3,zmm3,zmm20 + + vpshufd zmm10,zmm10,0x4e + vpclmulqdq zmm4,zmm4,zmm27,0x11 + vpclmulqdq zmm5,zmm5,zmm28,0x11 + vpclmulqdq zmm6,zmm6,zmm29,0x11 + + vaesenc zmm0,zmm0,zmm21 + vaesenc zmm1,zmm1,zmm21 + vaesenc zmm2,zmm2,zmm21 + vaesenc zmm3,zmm3,zmm21 + + vpternlogd zmm24,zmm10,zmm26,0x96 + vpclmulqdq zmm7,zmm7,zmm30,0x11 + vpternlogd zmm4,zmm5,zmm6,0x96 + vpclmulqdq zmm25,zmm31,zmm24,0x01 + + vaesenc zmm0,zmm0,zmm22 + vaesenc zmm1,zmm1,zmm22 + vaesenc zmm2,zmm2,zmm22 + vaesenc zmm3,zmm3,zmm22 + + vpxord zmm10,zmm4,zmm7 + vpshufd zmm24,zmm24,0x4e + vpternlogd zmm10,zmm24,zmm25,0x96 + + vaesenc zmm0,zmm0,zmm23 + vaesenc zmm1,zmm1,zmm23 + vaesenc zmm2,zmm2,zmm23 + vaesenc zmm3,zmm3,zmm23 + + + vextracti32x4 xmm4,zmm10,1 + vextracti32x4 xmm5,zmm10,2 + vextracti32x4 xmm6,zmm10,3 + vpxord xmm10,xmm10,xmm4 + vpternlogd xmm10,xmm6,xmm5,0x96 + + vpxord zmm4,zmm14,ZMMWORD[rcx] + vpxord zmm5,zmm14,ZMMWORD[64+rcx] + vpxord zmm6,zmm14,ZMMWORD[128+rcx] + vpxord zmm7,zmm14,ZMMWORD[192+rcx] + vaesenclast zmm4,zmm0,zmm4 + vaesenclast zmm5,zmm1,zmm5 + vaesenclast zmm6,zmm2,zmm6 + vaesenclast zmm7,zmm3,zmm7 + vmovdqu8 ZMMWORD[rdx],zmm4 + vmovdqu8 
ZMMWORD[64+rdx],zmm5 + vmovdqu8 ZMMWORD[128+rdx],zmm6 + vmovdqu8 ZMMWORD[192+rdx],zmm7 + + add rcx,256 + add rdx,256 + sub r8,256 + cmp r8,256 + jae NEAR $L$crypt_loop_4x__func2 +$L$crypt_loop_4x_done__func2: + + test r8,r8 + jz NEAR $L$done__func2 + + + + + + + + + + + + + + + + + + + + + mov rax,r8 + neg rax + and rax,-16 + lea rsi,[256+rax*1+rdi] + vpxor xmm4,xmm4,xmm4 + vpxor xmm5,xmm5,xmm5 + vpxor xmm6,xmm6,xmm6 + + cmp r8,64 + jb NEAR $L$partial_vec__func2 + +$L$crypt_loop_1x__func2: + + + + vpshufb zmm0,zmm12,zmm8 + vpaddd zmm12,zmm12,zmm11 + vpxord zmm0,zmm0,zmm13 + lea rax,[16+r9] +$L$vaesenc_loop_tail_full_vec__func2: + vbroadcasti32x4 zmm9,ZMMWORD[rax] + vaesenc zmm0,zmm0,zmm9 + add rax,16 + cmp r11,rax + jne NEAR $L$vaesenc_loop_tail_full_vec__func2 + vaesenclast zmm0,zmm0,zmm14 + + + vmovdqu8 zmm1,ZMMWORD[rcx] + vpxord zmm0,zmm0,zmm1 + vmovdqu8 ZMMWORD[rdx],zmm0 + + + vmovdqu8 zmm30,ZMMWORD[rsi] + vpshufb zmm0,zmm1,zmm8 + vpxord zmm0,zmm0,zmm10 + vpclmulqdq zmm7,zmm0,zmm30,0x00 + vpclmulqdq zmm1,zmm0,zmm30,0x01 + vpclmulqdq zmm2,zmm0,zmm30,0x10 + vpclmulqdq zmm3,zmm0,zmm30,0x11 + vpxord zmm4,zmm4,zmm7 + vpternlogd zmm5,zmm1,zmm2,0x96 + vpxord zmm6,zmm6,zmm3 + + vpxor xmm10,xmm10,xmm10 + + add rsi,64 + add rcx,64 + add rdx,64 + sub r8,64 + cmp r8,64 + jae NEAR $L$crypt_loop_1x__func2 + + test r8,r8 + jz NEAR $L$reduce__func2 + +$L$partial_vec__func2: + + + + + mov rax,-1 + bzhi rax,rax,r8 + kmovq k1,rax + add r8,15 + and r8,-16 + mov rax,-1 + bzhi rax,rax,r8 + kmovq k2,rax + + + + vpshufb zmm0,zmm12,zmm8 + vpxord zmm0,zmm0,zmm13 + lea rax,[16+r9] +$L$vaesenc_loop_tail_partialvec__func2: + vbroadcasti32x4 zmm9,ZMMWORD[rax] + vaesenc zmm0,zmm0,zmm9 + add rax,16 + cmp r11,rax + jne NEAR $L$vaesenc_loop_tail_partialvec__func2 + vaesenclast zmm0,zmm0,zmm14 + + + vmovdqu8 zmm1{k1}{z},[rcx] + vpxord zmm0,zmm0,zmm1 + vmovdqu8 ZMMWORD[rdx]{k1},zmm0 + + + + + + + + + + + + + + vmovdqu8 zmm30{k2}{z},[rsi] + + vpshufb zmm0,zmm1,zmm8 + vpxord zmm0,zmm0,zmm10 + 
vpclmulqdq zmm7,zmm0,zmm30,0x00 + vpclmulqdq zmm1,zmm0,zmm30,0x01 + vpclmulqdq zmm2,zmm0,zmm30,0x10 + vpclmulqdq zmm3,zmm0,zmm30,0x11 + vpxord zmm4,zmm4,zmm7 + vpternlogd zmm5,zmm1,zmm2,0x96 + vpxord zmm6,zmm6,zmm3 + + +$L$reduce__func2: + + vpclmulqdq zmm0,zmm31,zmm4,0x01 + vpshufd zmm4,zmm4,0x4e + vpternlogd zmm5,zmm4,zmm0,0x96 + vpclmulqdq zmm0,zmm31,zmm5,0x01 + vpshufd zmm5,zmm5,0x4e + vpternlogd zmm6,zmm5,zmm0,0x96 + + vextracti32x4 xmm0,zmm6,1 + vextracti32x4 xmm1,zmm6,2 + vextracti32x4 xmm2,zmm6,3 + vpxord xmm10,xmm6,xmm0 + vpternlogd xmm10,xmm2,xmm1,0x96 + + +$L$done__func2: + + vpshufb xmm10,xmm10,xmm8 + vmovdqu XMMWORD[r12],xmm10 + + vzeroupper + vmovdqa xmm6,XMMWORD[rsp] + vmovdqa xmm7,XMMWORD[16+rsp] + vmovdqa xmm8,XMMWORD[32+rsp] + vmovdqa xmm9,XMMWORD[48+rsp] + vmovdqa xmm10,XMMWORD[64+rsp] + vmovdqa xmm11,XMMWORD[80+rsp] + vmovdqa xmm12,XMMWORD[96+rsp] + vmovdqa xmm13,XMMWORD[112+rsp] + vmovdqa xmm14,XMMWORD[128+rsp] + vmovdqa xmm15,XMMWORD[144+rsp] + add rsp,160 + pop r12 + pop rdi + pop rsi + ret +$L$SEH_end_aes_gcm_dec_update_vaes_avx512_17: + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_gcm_gmult_vpclmulqdq_avx512_1 wrt ..imagebase + DD $L$SEH_end_gcm_gmult_vpclmulqdq_avx512_5 wrt ..imagebase + DD $L$SEH_info_gcm_gmult_vpclmulqdq_avx512_0 wrt ..imagebase + + DD $L$SEH_begin_gcm_ghash_vpclmulqdq_avx512_1 wrt ..imagebase + DD $L$SEH_end_gcm_ghash_vpclmulqdq_avx512_12 wrt ..imagebase + DD $L$SEH_info_gcm_ghash_vpclmulqdq_avx512_0 wrt ..imagebase + + DD $L$SEH_begin_aes_gcm_enc_update_vaes_avx512_1 wrt ..imagebase + DD $L$SEH_end_aes_gcm_enc_update_vaes_avx512_17 wrt ..imagebase + DD $L$SEH_info_aes_gcm_enc_update_vaes_avx512_0 wrt ..imagebase + + DD $L$SEH_begin_aes_gcm_dec_update_vaes_avx512_1 wrt ..imagebase + DD $L$SEH_end_aes_gcm_dec_update_vaes_avx512_17 wrt ..imagebase + DD $L$SEH_info_aes_gcm_dec_update_vaes_avx512_0 wrt ..imagebase + + +section .xdata rdata align=8 +ALIGN 4 +$L$SEH_info_gcm_gmult_vpclmulqdq_avx512_0: + DB 1 + 
DB $L$SEH_endprologue_gcm_gmult_vpclmulqdq_avx512_4-$L$SEH_begin_gcm_gmult_vpclmulqdq_avx512_1 + DB 3 + DB 0 + DB $L$SEH_prologue_gcm_gmult_vpclmulqdq_avx512_3-$L$SEH_begin_gcm_gmult_vpclmulqdq_avx512_1 + DB 104 + DW 0 + DB $L$SEH_prologue_gcm_gmult_vpclmulqdq_avx512_2-$L$SEH_begin_gcm_gmult_vpclmulqdq_avx512_1 + DB 34 + + DW 0 +$L$SEH_info_gcm_ghash_vpclmulqdq_avx512_0: + DB 1 + DB $L$SEH_endprologue_gcm_ghash_vpclmulqdq_avx512_11-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx512_1 + DB 18 + DB 0 + DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx512_10-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx512_1 + DB 216 + DW 7 + DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx512_9-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx512_1 + DB 200 + DW 6 + DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx512_8-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx512_1 + DB 184 + DW 5 + DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx512_7-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx512_1 + DB 168 + DW 4 + DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx512_6-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx512_1 + DB 152 + DW 3 + DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx512_5-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx512_1 + DB 136 + DW 2 + DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx512_4-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx512_1 + DB 120 + DW 1 + DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx512_3-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx512_1 + DB 104 + DW 0 + DB $L$SEH_prologue_gcm_ghash_vpclmulqdq_avx512_2-$L$SEH_begin_gcm_ghash_vpclmulqdq_avx512_1 + DB 1 + DW 17 + +$L$SEH_info_aes_gcm_enc_update_vaes_avx512_0: + DB 1 + DB $L$SEH_endprologue_aes_gcm_enc_update_vaes_avx512_16-$L$SEH_begin_aes_gcm_enc_update_vaes_avx512_1 + DB 25 + DB 0 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx512_15-$L$SEH_begin_aes_gcm_enc_update_vaes_avx512_1 + DB 248 + DW 9 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx512_14-$L$SEH_begin_aes_gcm_enc_update_vaes_avx512_1 + DB 232 + DW 8 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx512_13-$L$SEH_begin_aes_gcm_enc_update_vaes_avx512_1 + DB 
216 + DW 7 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx512_12-$L$SEH_begin_aes_gcm_enc_update_vaes_avx512_1 + DB 200 + DW 6 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx512_11-$L$SEH_begin_aes_gcm_enc_update_vaes_avx512_1 + DB 184 + DW 5 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx512_10-$L$SEH_begin_aes_gcm_enc_update_vaes_avx512_1 + DB 168 + DW 4 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx512_9-$L$SEH_begin_aes_gcm_enc_update_vaes_avx512_1 + DB 152 + DW 3 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx512_8-$L$SEH_begin_aes_gcm_enc_update_vaes_avx512_1 + DB 136 + DW 2 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx512_7-$L$SEH_begin_aes_gcm_enc_update_vaes_avx512_1 + DB 120 + DW 1 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx512_6-$L$SEH_begin_aes_gcm_enc_update_vaes_avx512_1 + DB 104 + DW 0 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx512_5-$L$SEH_begin_aes_gcm_enc_update_vaes_avx512_1 + DB 1 + DW 20 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx512_4-$L$SEH_begin_aes_gcm_enc_update_vaes_avx512_1 + DB 192 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx512_3-$L$SEH_begin_aes_gcm_enc_update_vaes_avx512_1 + DB 112 + DB $L$SEH_prologue_aes_gcm_enc_update_vaes_avx512_2-$L$SEH_begin_aes_gcm_enc_update_vaes_avx512_1 + DB 96 + + DW 0 +$L$SEH_info_aes_gcm_dec_update_vaes_avx512_0: + DB 1 + DB $L$SEH_endprologue_aes_gcm_dec_update_vaes_avx512_16-$L$SEH_begin_aes_gcm_dec_update_vaes_avx512_1 + DB 25 + DB 0 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx512_15-$L$SEH_begin_aes_gcm_dec_update_vaes_avx512_1 + DB 248 + DW 9 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx512_14-$L$SEH_begin_aes_gcm_dec_update_vaes_avx512_1 + DB 232 + DW 8 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx512_13-$L$SEH_begin_aes_gcm_dec_update_vaes_avx512_1 + DB 216 + DW 7 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx512_12-$L$SEH_begin_aes_gcm_dec_update_vaes_avx512_1 + DB 200 + DW 6 + DB 
$L$SEH_prologue_aes_gcm_dec_update_vaes_avx512_11-$L$SEH_begin_aes_gcm_dec_update_vaes_avx512_1 + DB 184 + DW 5 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx512_10-$L$SEH_begin_aes_gcm_dec_update_vaes_avx512_1 + DB 168 + DW 4 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx512_9-$L$SEH_begin_aes_gcm_dec_update_vaes_avx512_1 + DB 152 + DW 3 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx512_8-$L$SEH_begin_aes_gcm_dec_update_vaes_avx512_1 + DB 136 + DW 2 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx512_7-$L$SEH_begin_aes_gcm_dec_update_vaes_avx512_1 + DB 120 + DW 1 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx512_6-$L$SEH_begin_aes_gcm_dec_update_vaes_avx512_1 + DB 104 + DW 0 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx512_5-$L$SEH_begin_aes_gcm_dec_update_vaes_avx512_1 + DB 1 + DW 20 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx512_4-$L$SEH_begin_aes_gcm_dec_update_vaes_avx512_1 + DB 192 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx512_3-$L$SEH_begin_aes_gcm_dec_update_vaes_avx512_1 + DB 112 + DB $L$SEH_prologue_aes_gcm_dec_update_vaes_avx512_2-$L$SEH_begin_aes_gcm_dec_update_vaes_avx512_1 + DB 96 + + DW 0 +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/third_party/boringssl/gen/bcm/aesni-gcm-x86_64-apple.S b/third_party/boringssl/gen/bcm/aesni-gcm-x86_64-apple.S new file mode 100644 index 00000000..e1247bc8 --- /dev/null +++ b/third_party/boringssl/gen/bcm/aesni-gcm-x86_64-apple.S @@ -0,0 +1,868 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.text + + +.p2align 5 +_aesni_ctr32_ghash_6x: + + vmovdqu 32(%r11),%xmm2 + subq $6,%rdx + vpxor %xmm4,%xmm4,%xmm4 + vmovdqu 0-128(%rcx),%xmm15 + vpaddb %xmm2,%xmm1,%xmm10 + vpaddb %xmm2,%xmm10,%xmm11 + vpaddb %xmm2,%xmm11,%xmm12 + vpaddb %xmm2,%xmm12,%xmm13 + vpaddb %xmm2,%xmm13,%xmm14 + vpxor %xmm15,%xmm1,%xmm9 + vmovdqu %xmm4,16+8(%rsp) + jmp L$oop6x + +.p2align 5 +L$oop6x: + addl $100663296,%ebx + jc L$handle_ctr32 + vmovdqu 0-32(%r9),%xmm3 + vpaddb %xmm2,%xmm14,%xmm1 + vpxor %xmm15,%xmm10,%xmm10 + vpxor %xmm15,%xmm11,%xmm11 + +L$resume_ctr32: + vmovdqu %xmm1,(%r8) + vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5 + vpxor %xmm15,%xmm12,%xmm12 + vmovups 16-128(%rcx),%xmm2 + vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6 + + + + + + + + + + + + + + + + + + xorq %r12,%r12 + cmpq %r14,%r15 + + vaesenc %xmm2,%xmm9,%xmm9 + vmovdqu 48+8(%rsp),%xmm0 + vpxor %xmm15,%xmm13,%xmm13 + vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1 + vaesenc %xmm2,%xmm10,%xmm10 + vpxor %xmm15,%xmm14,%xmm14 + setnc %r12b + vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 + vaesenc %xmm2,%xmm11,%xmm11 + vmovdqu 16-32(%r9),%xmm3 + negq %r12 + vaesenc %xmm2,%xmm12,%xmm12 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5 + vpxor %xmm4,%xmm8,%xmm8 + vaesenc %xmm2,%xmm13,%xmm13 + vpxor %xmm5,%xmm1,%xmm4 + andq $0x60,%r12 + vmovups 32-128(%rcx),%xmm15 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1 + vaesenc %xmm2,%xmm14,%xmm14 + + vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2 + leaq (%r14,%r12,1),%r14 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor 16+8(%rsp),%xmm8,%xmm8 + vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3 + vmovdqu 64+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 88(%r14),%r13 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 80(%r14),%r12 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,32+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,40+8(%rsp) + vmovdqu 48-32(%r9),%xmm5 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 48-128(%rcx),%xmm15 + vpxor %xmm1,%xmm6,%xmm6 + vpclmulqdq 
$0x00,%xmm5,%xmm0,%xmm1 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm2,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2 + vaesenc %xmm15,%xmm10,%xmm10 + vpxor %xmm3,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3 + vaesenc %xmm15,%xmm11,%xmm11 + vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5 + vmovdqu 80+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vpxor %xmm1,%xmm4,%xmm4 + vmovdqu 64-32(%r9),%xmm1 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 64-128(%rcx),%xmm15 + vpxor %xmm2,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm3,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 72(%r14),%r13 + vpxor %xmm5,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 64(%r14),%r12 + vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1 + vmovdqu 96+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,48+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,56+8(%rsp) + vpxor %xmm2,%xmm4,%xmm4 + vmovdqu 96-32(%r9),%xmm2 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 80-128(%rcx),%xmm15 + vpxor %xmm3,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 56(%r14),%r13 + vpxor %xmm1,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1 + vpxor 112+8(%rsp),%xmm8,%xmm8 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 48(%r14),%r12 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,64+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,72+8(%rsp) + vpxor %xmm3,%xmm4,%xmm4 + vmovdqu 112-32(%r9),%xmm3 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 96-128(%rcx),%xmm15 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm1,%xmm6,%xmm6 + vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 40(%r14),%r13 + vpxor %xmm2,%xmm7,%xmm7 + vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2 + vaesenc 
%xmm15,%xmm11,%xmm11 + movbeq 32(%r14),%r12 + vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,80+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,88+8(%rsp) + vpxor %xmm5,%xmm6,%xmm6 + vaesenc %xmm15,%xmm14,%xmm14 + vpxor %xmm1,%xmm6,%xmm6 + + vmovups 112-128(%rcx),%xmm15 + vpslldq $8,%xmm6,%xmm5 + vpxor %xmm2,%xmm4,%xmm4 + vmovdqu 16(%r11),%xmm3 + + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm8,%xmm7,%xmm7 + vaesenc %xmm15,%xmm10,%xmm10 + vpxor %xmm5,%xmm4,%xmm4 + movbeq 24(%r14),%r13 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 16(%r14),%r12 + vpalignr $8,%xmm4,%xmm4,%xmm0 + vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 + movq %r13,96+8(%rsp) + vaesenc %xmm15,%xmm12,%xmm12 + movq %r12,104+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + vmovups 128-128(%rcx),%xmm1 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vmovups 144-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm10,%xmm10 + vpsrldq $8,%xmm6,%xmm6 + vaesenc %xmm1,%xmm11,%xmm11 + vpxor %xmm6,%xmm7,%xmm7 + vaesenc %xmm1,%xmm12,%xmm12 + vpxor %xmm0,%xmm4,%xmm4 + movbeq 8(%r14),%r13 + vaesenc %xmm1,%xmm13,%xmm13 + movbeq 0(%r14),%r12 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 160-128(%rcx),%xmm1 + cmpl $11,%r10d + jb L$enc_tail + + vaesenc %xmm15,%xmm9,%xmm9 + vaesenc %xmm15,%xmm10,%xmm10 + vaesenc %xmm15,%xmm11,%xmm11 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + vmovups 176-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 192-128(%rcx),%xmm1 + je L$enc_tail + + vaesenc %xmm15,%xmm9,%xmm9 + vaesenc %xmm15,%xmm10,%xmm10 + vaesenc %xmm15,%xmm11,%xmm11 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + vmovups 
208-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 224-128(%rcx),%xmm1 + jmp L$enc_tail + +.p2align 5 +L$handle_ctr32: + vmovdqu (%r11),%xmm0 + vpshufb %xmm0,%xmm1,%xmm6 + vmovdqu 48(%r11),%xmm5 + vpaddd 64(%r11),%xmm6,%xmm10 + vpaddd %xmm5,%xmm6,%xmm11 + vmovdqu 0-32(%r9),%xmm3 + vpaddd %xmm5,%xmm10,%xmm12 + vpshufb %xmm0,%xmm10,%xmm10 + vpaddd %xmm5,%xmm11,%xmm13 + vpshufb %xmm0,%xmm11,%xmm11 + vpxor %xmm15,%xmm10,%xmm10 + vpaddd %xmm5,%xmm12,%xmm14 + vpshufb %xmm0,%xmm12,%xmm12 + vpxor %xmm15,%xmm11,%xmm11 + vpaddd %xmm5,%xmm13,%xmm1 + vpshufb %xmm0,%xmm13,%xmm13 + vpshufb %xmm0,%xmm14,%xmm14 + vpshufb %xmm0,%xmm1,%xmm1 + jmp L$resume_ctr32 + +.p2align 5 +L$enc_tail: + vaesenc %xmm15,%xmm9,%xmm9 + vmovdqu %xmm7,16+8(%rsp) + vpalignr $8,%xmm4,%xmm4,%xmm8 + vaesenc %xmm15,%xmm10,%xmm10 + vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 + vpxor 0(%rdi),%xmm1,%xmm2 + vaesenc %xmm15,%xmm11,%xmm11 + vpxor 16(%rdi),%xmm1,%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + vpxor 32(%rdi),%xmm1,%xmm5 + vaesenc %xmm15,%xmm13,%xmm13 + vpxor 48(%rdi),%xmm1,%xmm6 + vaesenc %xmm15,%xmm14,%xmm14 + vpxor 64(%rdi),%xmm1,%xmm7 + vpxor 80(%rdi),%xmm1,%xmm3 + vmovdqu (%r8),%xmm1 + + vaesenclast %xmm2,%xmm9,%xmm9 + vmovdqu 32(%r11),%xmm2 + vaesenclast %xmm0,%xmm10,%xmm10 + vpaddb %xmm2,%xmm1,%xmm0 + movq %r13,112+8(%rsp) + leaq 96(%rdi),%rdi + + prefetcht0 512(%rdi) + prefetcht0 576(%rdi) + vaesenclast %xmm5,%xmm11,%xmm11 + vpaddb %xmm2,%xmm0,%xmm5 + movq %r12,120+8(%rsp) + leaq 96(%rsi),%rsi + vmovdqu 0-128(%rcx),%xmm15 + vaesenclast %xmm6,%xmm12,%xmm12 + vpaddb %xmm2,%xmm5,%xmm6 + vaesenclast %xmm7,%xmm13,%xmm13 + vpaddb %xmm2,%xmm6,%xmm7 + vaesenclast %xmm3,%xmm14,%xmm14 + vpaddb %xmm2,%xmm7,%xmm3 + + addq $0x60,%rax + subq $0x6,%rdx + jc L$6x_done + + vmovups %xmm9,-96(%rsi) + vpxor %xmm15,%xmm1,%xmm9 + vmovups %xmm10,-80(%rsi) + vmovdqa %xmm0,%xmm10 + vmovups %xmm11,-64(%rsi) + vmovdqa %xmm5,%xmm11 + vmovups %xmm12,-48(%rsi) + vmovdqa %xmm6,%xmm12 + vmovups %xmm13,-32(%rsi) + vmovdqa 
%xmm7,%xmm13 + vmovups %xmm14,-16(%rsi) + vmovdqa %xmm3,%xmm14 + vmovdqu 32+8(%rsp),%xmm7 + jmp L$oop6x + +L$6x_done: + vpxor 16+8(%rsp),%xmm8,%xmm8 + vpxor %xmm4,%xmm8,%xmm8 + + ret + + +.globl _aesni_gcm_decrypt +.private_extern _aesni_gcm_decrypt + +.p2align 5 +_aesni_gcm_decrypt: + + +_CET_ENDBR + xorq %rax,%rax + + + + cmpq $0x60,%rdx + jb L$gcm_dec_abort + + pushq %rbp + + + movq %rsp,%rbp + + pushq %rbx + + + pushq %r12 + + + pushq %r13 + + + pushq %r14 + + + pushq %r15 + + + vzeroupper + + movq 16(%rbp),%r12 + vmovdqu (%r8),%xmm1 + addq $-128,%rsp + movl 12(%r8),%ebx + leaq L$bswap_mask(%rip),%r11 + leaq -128(%rcx),%r14 + movq $0xf80,%r15 + vmovdqu (%r12),%xmm8 + andq $-128,%rsp + vmovdqu (%r11),%xmm0 + leaq 128(%rcx),%rcx + leaq 32(%r9),%r9 + movl 240-128(%rcx),%r10d + vpshufb %xmm0,%xmm8,%xmm8 + + andq %r15,%r14 + andq %rsp,%r15 + subq %r14,%r15 + jc L$dec_no_key_aliasing + cmpq $768,%r15 + jnc L$dec_no_key_aliasing + subq %r15,%rsp +L$dec_no_key_aliasing: + + vmovdqu 80(%rdi),%xmm7 + movq %rdi,%r14 + vmovdqu 64(%rdi),%xmm4 + + + + + + + + leaq -192(%rdi,%rdx,1),%r15 + + vmovdqu 48(%rdi),%xmm5 + shrq $4,%rdx + xorq %rax,%rax + vmovdqu 32(%rdi),%xmm6 + vpshufb %xmm0,%xmm7,%xmm7 + vmovdqu 16(%rdi),%xmm2 + vpshufb %xmm0,%xmm4,%xmm4 + vmovdqu (%rdi),%xmm3 + vpshufb %xmm0,%xmm5,%xmm5 + vmovdqu %xmm4,48(%rsp) + vpshufb %xmm0,%xmm6,%xmm6 + vmovdqu %xmm5,64(%rsp) + vpshufb %xmm0,%xmm2,%xmm2 + vmovdqu %xmm6,80(%rsp) + vpshufb %xmm0,%xmm3,%xmm3 + vmovdqu %xmm2,96(%rsp) + vmovdqu %xmm3,112(%rsp) + + call _aesni_ctr32_ghash_6x + + movq 16(%rbp),%r12 + vmovups %xmm9,-96(%rsi) + vmovups %xmm10,-80(%rsi) + vmovups %xmm11,-64(%rsi) + vmovups %xmm12,-48(%rsi) + vmovups %xmm13,-32(%rsi) + vmovups %xmm14,-16(%rsi) + + vpshufb (%r11),%xmm8,%xmm8 + vmovdqu %xmm8,(%r12) + + vzeroupper + leaq -40(%rbp),%rsp + + popq %r15 + + popq %r14 + + popq %r13 + + popq %r12 + + popq %rbx + + popq %rbp + +L$gcm_dec_abort: + ret + + + + +.p2align 5 +_aesni_ctr32_6x: + + vmovdqu 
0-128(%rcx),%xmm4 + vmovdqu 32(%r11),%xmm2 + leaq -1(%r10),%r13 + vmovups 16-128(%rcx),%xmm15 + leaq 32-128(%rcx),%r12 + vpxor %xmm4,%xmm1,%xmm9 + addl $100663296,%ebx + jc L$handle_ctr32_2 + vpaddb %xmm2,%xmm1,%xmm10 + vpaddb %xmm2,%xmm10,%xmm11 + vpxor %xmm4,%xmm10,%xmm10 + vpaddb %xmm2,%xmm11,%xmm12 + vpxor %xmm4,%xmm11,%xmm11 + vpaddb %xmm2,%xmm12,%xmm13 + vpxor %xmm4,%xmm12,%xmm12 + vpaddb %xmm2,%xmm13,%xmm14 + vpxor %xmm4,%xmm13,%xmm13 + vpaddb %xmm2,%xmm14,%xmm1 + vpxor %xmm4,%xmm14,%xmm14 + jmp L$oop_ctr32 + +.p2align 4 +L$oop_ctr32: + vaesenc %xmm15,%xmm9,%xmm9 + vaesenc %xmm15,%xmm10,%xmm10 + vaesenc %xmm15,%xmm11,%xmm11 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vaesenc %xmm15,%xmm14,%xmm14 + vmovups (%r12),%xmm15 + leaq 16(%r12),%r12 + decl %r13d + jnz L$oop_ctr32 + + vmovdqu (%r12),%xmm3 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor 0(%rdi),%xmm3,%xmm4 + vaesenc %xmm15,%xmm10,%xmm10 + vpxor 16(%rdi),%xmm3,%xmm5 + vaesenc %xmm15,%xmm11,%xmm11 + vpxor 32(%rdi),%xmm3,%xmm6 + vaesenc %xmm15,%xmm12,%xmm12 + vpxor 48(%rdi),%xmm3,%xmm8 + vaesenc %xmm15,%xmm13,%xmm13 + vpxor 64(%rdi),%xmm3,%xmm2 + vaesenc %xmm15,%xmm14,%xmm14 + vpxor 80(%rdi),%xmm3,%xmm3 + leaq 96(%rdi),%rdi + + vaesenclast %xmm4,%xmm9,%xmm9 + vaesenclast %xmm5,%xmm10,%xmm10 + vaesenclast %xmm6,%xmm11,%xmm11 + vaesenclast %xmm8,%xmm12,%xmm12 + vaesenclast %xmm2,%xmm13,%xmm13 + vaesenclast %xmm3,%xmm14,%xmm14 + vmovups %xmm9,0(%rsi) + vmovups %xmm10,16(%rsi) + vmovups %xmm11,32(%rsi) + vmovups %xmm12,48(%rsi) + vmovups %xmm13,64(%rsi) + vmovups %xmm14,80(%rsi) + leaq 96(%rsi),%rsi + + ret +.p2align 5 +L$handle_ctr32_2: + vpshufb %xmm0,%xmm1,%xmm6 + vmovdqu 48(%r11),%xmm5 + vpaddd 64(%r11),%xmm6,%xmm10 + vpaddd %xmm5,%xmm6,%xmm11 + vpaddd %xmm5,%xmm10,%xmm12 + vpshufb %xmm0,%xmm10,%xmm10 + vpaddd %xmm5,%xmm11,%xmm13 + vpshufb %xmm0,%xmm11,%xmm11 + vpxor %xmm4,%xmm10,%xmm10 + vpaddd %xmm5,%xmm12,%xmm14 + vpshufb %xmm0,%xmm12,%xmm12 + vpxor %xmm4,%xmm11,%xmm11 + vpaddd 
%xmm5,%xmm13,%xmm1 + vpshufb %xmm0,%xmm13,%xmm13 + vpxor %xmm4,%xmm12,%xmm12 + vpshufb %xmm0,%xmm14,%xmm14 + vpxor %xmm4,%xmm13,%xmm13 + vpshufb %xmm0,%xmm1,%xmm1 + vpxor %xmm4,%xmm14,%xmm14 + jmp L$oop_ctr32 + + + +.globl _aesni_gcm_encrypt +.private_extern _aesni_gcm_encrypt + +.p2align 5 +_aesni_gcm_encrypt: + + +_CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST + + movb $1,_BORINGSSL_function_hit+2(%rip) +#endif + xorq %rax,%rax + + + + + cmpq $288,%rdx + jb L$gcm_enc_abort + + pushq %rbp + + + movq %rsp,%rbp + + pushq %rbx + + + pushq %r12 + + + pushq %r13 + + + pushq %r14 + + + pushq %r15 + + + vzeroupper + + vmovdqu (%r8),%xmm1 + addq $-128,%rsp + movl 12(%r8),%ebx + leaq L$bswap_mask(%rip),%r11 + leaq -128(%rcx),%r14 + movq $0xf80,%r15 + leaq 128(%rcx),%rcx + vmovdqu (%r11),%xmm0 + andq $-128,%rsp + movl 240-128(%rcx),%r10d + + andq %r15,%r14 + andq %rsp,%r15 + subq %r14,%r15 + jc L$enc_no_key_aliasing + cmpq $768,%r15 + jnc L$enc_no_key_aliasing + subq %r15,%rsp +L$enc_no_key_aliasing: + + movq %rsi,%r14 + + + + + + + + + leaq -192(%rsi,%rdx,1),%r15 + + shrq $4,%rdx + + call _aesni_ctr32_6x + vpshufb %xmm0,%xmm9,%xmm8 + vpshufb %xmm0,%xmm10,%xmm2 + vmovdqu %xmm8,112(%rsp) + vpshufb %xmm0,%xmm11,%xmm4 + vmovdqu %xmm2,96(%rsp) + vpshufb %xmm0,%xmm12,%xmm5 + vmovdqu %xmm4,80(%rsp) + vpshufb %xmm0,%xmm13,%xmm6 + vmovdqu %xmm5,64(%rsp) + vpshufb %xmm0,%xmm14,%xmm7 + vmovdqu %xmm6,48(%rsp) + + call _aesni_ctr32_6x + + movq 16(%rbp),%r12 + leaq 32(%r9),%r9 + vmovdqu (%r12),%xmm8 + subq $12,%rdx + movq $192,%rax + vpshufb %xmm0,%xmm8,%xmm8 + + call _aesni_ctr32_ghash_6x + vmovdqu 32(%rsp),%xmm7 + vmovdqu (%r11),%xmm0 + vmovdqu 0-32(%r9),%xmm3 + vpunpckhqdq %xmm7,%xmm7,%xmm1 + vmovdqu 32-32(%r9),%xmm15 + vmovups %xmm9,-96(%rsi) + vpshufb %xmm0,%xmm9,%xmm9 + vpxor %xmm7,%xmm1,%xmm1 + vmovups %xmm10,-80(%rsi) + vpshufb %xmm0,%xmm10,%xmm10 + vmovups %xmm11,-64(%rsi) + vpshufb %xmm0,%xmm11,%xmm11 + vmovups %xmm12,-48(%rsi) + vpshufb %xmm0,%xmm12,%xmm12 + vmovups 
%xmm13,-32(%rsi) + vpshufb %xmm0,%xmm13,%xmm13 + vmovups %xmm14,-16(%rsi) + vpshufb %xmm0,%xmm14,%xmm14 + vmovdqu %xmm9,16(%rsp) + vmovdqu 48(%rsp),%xmm6 + vmovdqu 16-32(%r9),%xmm0 + vpunpckhqdq %xmm6,%xmm6,%xmm2 + vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5 + vpxor %xmm6,%xmm2,%xmm2 + vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 + vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 + + vmovdqu 64(%rsp),%xmm9 + vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4 + vmovdqu 48-32(%r9),%xmm3 + vpxor %xmm5,%xmm4,%xmm4 + vpunpckhqdq %xmm9,%xmm9,%xmm5 + vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6 + vpxor %xmm9,%xmm5,%xmm5 + vpxor %xmm7,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 + vmovdqu 80-32(%r9),%xmm15 + vpxor %xmm1,%xmm2,%xmm2 + + vmovdqu 80(%rsp),%xmm1 + vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7 + vmovdqu 64-32(%r9),%xmm0 + vpxor %xmm4,%xmm7,%xmm7 + vpunpckhqdq %xmm1,%xmm1,%xmm4 + vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9 + vpxor %xmm1,%xmm4,%xmm4 + vpxor %xmm6,%xmm9,%xmm9 + vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu 96(%rsp),%xmm2 + vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6 + vmovdqu 96-32(%r9),%xmm3 + vpxor %xmm7,%xmm6,%xmm6 + vpunpckhqdq %xmm2,%xmm2,%xmm7 + vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpxor %xmm9,%xmm1,%xmm1 + vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4 + vmovdqu 128-32(%r9),%xmm15 + vpxor %xmm5,%xmm4,%xmm4 + + vpxor 112(%rsp),%xmm8,%xmm8 + vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5 + vmovdqu 112-32(%r9),%xmm0 + vpunpckhqdq %xmm8,%xmm8,%xmm9 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2 + vpxor %xmm8,%xmm9,%xmm9 + vpxor %xmm1,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7 + vpxor %xmm4,%xmm7,%xmm4 + + vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6 + vmovdqu 0-32(%r9),%xmm3 + vpunpckhqdq %xmm14,%xmm14,%xmm1 + vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8 + vpxor %xmm14,%xmm1,%xmm1 + vpxor %xmm5,%xmm6,%xmm5 + vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9 + vmovdqu 32-32(%r9),%xmm15 + vpxor %xmm2,%xmm8,%xmm7 + vpxor %xmm4,%xmm9,%xmm6 + + vmovdqu 16-32(%r9),%xmm0 + vpxor %xmm5,%xmm7,%xmm9 + vpclmulqdq 
$0x00,%xmm3,%xmm14,%xmm4 + vpxor %xmm9,%xmm6,%xmm6 + vpunpckhqdq %xmm13,%xmm13,%xmm2 + vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14 + vpxor %xmm13,%xmm2,%xmm2 + vpslldq $8,%xmm6,%xmm9 + vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 + vpxor %xmm9,%xmm5,%xmm8 + vpsrldq $8,%xmm6,%xmm6 + vpxor %xmm6,%xmm7,%xmm7 + + vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5 + vmovdqu 48-32(%r9),%xmm3 + vpxor %xmm4,%xmm5,%xmm5 + vpunpckhqdq %xmm12,%xmm12,%xmm9 + vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13 + vpxor %xmm12,%xmm9,%xmm9 + vpxor %xmm14,%xmm13,%xmm13 + vpalignr $8,%xmm8,%xmm8,%xmm14 + vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 + vmovdqu 80-32(%r9),%xmm15 + vpxor %xmm1,%xmm2,%xmm2 + + vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4 + vmovdqu 64-32(%r9),%xmm0 + vpxor %xmm5,%xmm4,%xmm4 + vpunpckhqdq %xmm11,%xmm11,%xmm1 + vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12 + vpxor %xmm11,%xmm1,%xmm1 + vpxor %xmm13,%xmm12,%xmm12 + vxorps 16(%rsp),%xmm7,%xmm7 + vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9 + vpxor %xmm2,%xmm9,%xmm9 + + vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 + vxorps %xmm14,%xmm8,%xmm8 + + vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5 + vmovdqu 96-32(%r9),%xmm3 + vpxor %xmm4,%xmm5,%xmm5 + vpunpckhqdq %xmm10,%xmm10,%xmm2 + vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11 + vpxor %xmm10,%xmm2,%xmm2 + vpalignr $8,%xmm8,%xmm8,%xmm14 + vpxor %xmm12,%xmm11,%xmm11 + vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1 + vmovdqu 128-32(%r9),%xmm15 + vpxor %xmm9,%xmm1,%xmm1 + + vxorps %xmm7,%xmm14,%xmm14 + vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 + vxorps %xmm14,%xmm8,%xmm8 + + vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4 + vmovdqu 112-32(%r9),%xmm0 + vpxor %xmm5,%xmm4,%xmm4 + vpunpckhqdq %xmm8,%xmm8,%xmm9 + vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10 + vpxor %xmm8,%xmm9,%xmm9 + vpxor %xmm11,%xmm10,%xmm10 + vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2 + vpxor %xmm1,%xmm2,%xmm2 + + vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5 + vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7 + vpxor %xmm4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6 + vpxor %xmm10,%xmm7,%xmm7 + vpxor %xmm2,%xmm6,%xmm6 + + vpxor %xmm5,%xmm7,%xmm4 + vpxor 
%xmm4,%xmm6,%xmm6 + vpslldq $8,%xmm6,%xmm1 + vmovdqu 16(%r11),%xmm3 + vpsrldq $8,%xmm6,%xmm6 + vpxor %xmm1,%xmm5,%xmm8 + vpxor %xmm6,%xmm7,%xmm7 + + vpalignr $8,%xmm8,%xmm8,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 + vpxor %xmm2,%xmm8,%xmm8 + + vpalignr $8,%xmm8,%xmm8,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 + vpxor %xmm7,%xmm2,%xmm2 + vpxor %xmm2,%xmm8,%xmm8 + movq 16(%rbp),%r12 + vpshufb (%r11),%xmm8,%xmm8 + vmovdqu %xmm8,(%r12) + + vzeroupper + leaq -40(%rbp),%rsp + + popq %r15 + + popq %r14 + + popq %r13 + + popq %r12 + + popq %rbx + + popq %rbp + +L$gcm_enc_abort: + ret + + + +.section __DATA,__const +.p2align 6 +L$bswap_mask: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +L$poly: +.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +L$one_msb: +.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +L$two_lsb: +.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +L$one_lsb: +.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.p2align 6 +.text +#endif diff --git a/third_party/boringssl/gen/bcm/aesni-gcm-x86_64-linux.S b/third_party/boringssl/gen/bcm/aesni-gcm-x86_64-linux.S new file mode 100644 index 00000000..774a8d12 --- /dev/null +++ b/third_party/boringssl/gen/bcm/aesni-gcm-x86_64-linux.S @@ -0,0 +1,883 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.text + +.type _aesni_ctr32_ghash_6x,@function +.align 32 +_aesni_ctr32_ghash_6x: +.cfi_startproc + vmovdqu 32(%r11),%xmm2 + subq $6,%rdx + vpxor %xmm4,%xmm4,%xmm4 + vmovdqu 0-128(%rcx),%xmm15 + vpaddb %xmm2,%xmm1,%xmm10 + vpaddb %xmm2,%xmm10,%xmm11 + vpaddb %xmm2,%xmm11,%xmm12 + vpaddb %xmm2,%xmm12,%xmm13 + vpaddb %xmm2,%xmm13,%xmm14 + vpxor %xmm15,%xmm1,%xmm9 + vmovdqu %xmm4,16+8(%rsp) + jmp .Loop6x + +.align 32 +.Loop6x: + addl $100663296,%ebx + jc .Lhandle_ctr32 + vmovdqu 0-32(%r9),%xmm3 + vpaddb %xmm2,%xmm14,%xmm1 + vpxor %xmm15,%xmm10,%xmm10 + vpxor %xmm15,%xmm11,%xmm11 + +.Lresume_ctr32: + vmovdqu %xmm1,(%r8) + vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5 + vpxor %xmm15,%xmm12,%xmm12 + vmovups 16-128(%rcx),%xmm2 + vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6 + + + + + + + + + + + + + + + + + + xorq %r12,%r12 + cmpq %r14,%r15 + + vaesenc %xmm2,%xmm9,%xmm9 + vmovdqu 48+8(%rsp),%xmm0 + vpxor %xmm15,%xmm13,%xmm13 + vpclmulqdq $0x00,%xmm3,%xmm7,%xmm1 + vaesenc %xmm2,%xmm10,%xmm10 + vpxor %xmm15,%xmm14,%xmm14 + setnc %r12b + vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 + vaesenc %xmm2,%xmm11,%xmm11 + vmovdqu 16-32(%r9),%xmm3 + negq %r12 + vaesenc %xmm2,%xmm12,%xmm12 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5 + vpxor %xmm4,%xmm8,%xmm8 + vaesenc %xmm2,%xmm13,%xmm13 + vpxor %xmm5,%xmm1,%xmm4 + andq $0x60,%r12 + vmovups 32-128(%rcx),%xmm15 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1 + vaesenc %xmm2,%xmm14,%xmm14 + + vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2 + leaq (%r14,%r12,1),%r14 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor 16+8(%rsp),%xmm8,%xmm8 + vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3 + vmovdqu 64+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 88(%r14),%r13 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 80(%r14),%r12 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,32+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,40+8(%rsp) + vmovdqu 48-32(%r9),%xmm5 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 
48-128(%rcx),%xmm15 + vpxor %xmm1,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm2,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2 + vaesenc %xmm15,%xmm10,%xmm10 + vpxor %xmm3,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3 + vaesenc %xmm15,%xmm11,%xmm11 + vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5 + vmovdqu 80+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vpxor %xmm1,%xmm4,%xmm4 + vmovdqu 64-32(%r9),%xmm1 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 64-128(%rcx),%xmm15 + vpxor %xmm2,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm3,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 72(%r14),%r13 + vpxor %xmm5,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 64(%r14),%r12 + vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1 + vmovdqu 96+8(%rsp),%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,48+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,56+8(%rsp) + vpxor %xmm2,%xmm4,%xmm4 + vmovdqu 96-32(%r9),%xmm2 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 80-128(%rcx),%xmm15 + vpxor %xmm3,%xmm6,%xmm6 + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 56(%r14),%r13 + vpxor %xmm1,%xmm7,%xmm7 + vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1 + vpxor 112+8(%rsp),%xmm8,%xmm8 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 48(%r14),%r12 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,64+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,72+8(%rsp) + vpxor %xmm3,%xmm4,%xmm4 + vmovdqu 112-32(%r9),%xmm3 + vaesenc %xmm15,%xmm14,%xmm14 + + vmovups 96-128(%rcx),%xmm15 + vpxor %xmm5,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm1,%xmm6,%xmm6 + vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1 + vaesenc %xmm15,%xmm10,%xmm10 + movbeq 40(%r14),%r13 + vpxor 
%xmm2,%xmm7,%xmm7 + vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 32(%r14),%r12 + vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8 + vaesenc %xmm15,%xmm12,%xmm12 + movq %r13,80+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + movq %r12,88+8(%rsp) + vpxor %xmm5,%xmm6,%xmm6 + vaesenc %xmm15,%xmm14,%xmm14 + vpxor %xmm1,%xmm6,%xmm6 + + vmovups 112-128(%rcx),%xmm15 + vpslldq $8,%xmm6,%xmm5 + vpxor %xmm2,%xmm4,%xmm4 + vmovdqu 16(%r11),%xmm3 + + vaesenc %xmm15,%xmm9,%xmm9 + vpxor %xmm8,%xmm7,%xmm7 + vaesenc %xmm15,%xmm10,%xmm10 + vpxor %xmm5,%xmm4,%xmm4 + movbeq 24(%r14),%r13 + vaesenc %xmm15,%xmm11,%xmm11 + movbeq 16(%r14),%r12 + vpalignr $8,%xmm4,%xmm4,%xmm0 + vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 + movq %r13,96+8(%rsp) + vaesenc %xmm15,%xmm12,%xmm12 + movq %r12,104+8(%rsp) + vaesenc %xmm15,%xmm13,%xmm13 + vmovups 128-128(%rcx),%xmm1 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vmovups 144-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm10,%xmm10 + vpsrldq $8,%xmm6,%xmm6 + vaesenc %xmm1,%xmm11,%xmm11 + vpxor %xmm6,%xmm7,%xmm7 + vaesenc %xmm1,%xmm12,%xmm12 + vpxor %xmm0,%xmm4,%xmm4 + movbeq 8(%r14),%r13 + vaesenc %xmm1,%xmm13,%xmm13 + movbeq 0(%r14),%r12 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 160-128(%rcx),%xmm1 + cmpl $11,%r10d + jb .Lenc_tail + + vaesenc %xmm15,%xmm9,%xmm9 + vaesenc %xmm15,%xmm10,%xmm10 + vaesenc %xmm15,%xmm11,%xmm11 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + vmovups 176-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 192-128(%rcx),%xmm1 + je .Lenc_tail + + vaesenc %xmm15,%xmm9,%xmm9 + vaesenc %xmm15,%xmm10,%xmm10 + vaesenc %xmm15,%xmm11,%xmm11 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vaesenc %xmm15,%xmm14,%xmm14 + + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc 
%xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + vmovups 208-128(%rcx),%xmm15 + vaesenc %xmm1,%xmm14,%xmm14 + vmovups 224-128(%rcx),%xmm1 + jmp .Lenc_tail + +.align 32 +.Lhandle_ctr32: + vmovdqu (%r11),%xmm0 + vpshufb %xmm0,%xmm1,%xmm6 + vmovdqu 48(%r11),%xmm5 + vpaddd 64(%r11),%xmm6,%xmm10 + vpaddd %xmm5,%xmm6,%xmm11 + vmovdqu 0-32(%r9),%xmm3 + vpaddd %xmm5,%xmm10,%xmm12 + vpshufb %xmm0,%xmm10,%xmm10 + vpaddd %xmm5,%xmm11,%xmm13 + vpshufb %xmm0,%xmm11,%xmm11 + vpxor %xmm15,%xmm10,%xmm10 + vpaddd %xmm5,%xmm12,%xmm14 + vpshufb %xmm0,%xmm12,%xmm12 + vpxor %xmm15,%xmm11,%xmm11 + vpaddd %xmm5,%xmm13,%xmm1 + vpshufb %xmm0,%xmm13,%xmm13 + vpshufb %xmm0,%xmm14,%xmm14 + vpshufb %xmm0,%xmm1,%xmm1 + jmp .Lresume_ctr32 + +.align 32 +.Lenc_tail: + vaesenc %xmm15,%xmm9,%xmm9 + vmovdqu %xmm7,16+8(%rsp) + vpalignr $8,%xmm4,%xmm4,%xmm8 + vaesenc %xmm15,%xmm10,%xmm10 + vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 + vpxor 0(%rdi),%xmm1,%xmm2 + vaesenc %xmm15,%xmm11,%xmm11 + vpxor 16(%rdi),%xmm1,%xmm0 + vaesenc %xmm15,%xmm12,%xmm12 + vpxor 32(%rdi),%xmm1,%xmm5 + vaesenc %xmm15,%xmm13,%xmm13 + vpxor 48(%rdi),%xmm1,%xmm6 + vaesenc %xmm15,%xmm14,%xmm14 + vpxor 64(%rdi),%xmm1,%xmm7 + vpxor 80(%rdi),%xmm1,%xmm3 + vmovdqu (%r8),%xmm1 + + vaesenclast %xmm2,%xmm9,%xmm9 + vmovdqu 32(%r11),%xmm2 + vaesenclast %xmm0,%xmm10,%xmm10 + vpaddb %xmm2,%xmm1,%xmm0 + movq %r13,112+8(%rsp) + leaq 96(%rdi),%rdi + + prefetcht0 512(%rdi) + prefetcht0 576(%rdi) + vaesenclast %xmm5,%xmm11,%xmm11 + vpaddb %xmm2,%xmm0,%xmm5 + movq %r12,120+8(%rsp) + leaq 96(%rsi),%rsi + vmovdqu 0-128(%rcx),%xmm15 + vaesenclast %xmm6,%xmm12,%xmm12 + vpaddb %xmm2,%xmm5,%xmm6 + vaesenclast %xmm7,%xmm13,%xmm13 + vpaddb %xmm2,%xmm6,%xmm7 + vaesenclast %xmm3,%xmm14,%xmm14 + vpaddb %xmm2,%xmm7,%xmm3 + + addq $0x60,%rax + subq $0x6,%rdx + jc .L6x_done + + vmovups %xmm9,-96(%rsi) + vpxor %xmm15,%xmm1,%xmm9 + vmovups %xmm10,-80(%rsi) + vmovdqa %xmm0,%xmm10 + vmovups %xmm11,-64(%rsi) + vmovdqa %xmm5,%xmm11 + vmovups %xmm12,-48(%rsi) + vmovdqa 
%xmm6,%xmm12 + vmovups %xmm13,-32(%rsi) + vmovdqa %xmm7,%xmm13 + vmovups %xmm14,-16(%rsi) + vmovdqa %xmm3,%xmm14 + vmovdqu 32+8(%rsp),%xmm7 + jmp .Loop6x + +.L6x_done: + vpxor 16+8(%rsp),%xmm8,%xmm8 + vpxor %xmm4,%xmm8,%xmm8 + + ret +.cfi_endproc +.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x +.globl aesni_gcm_decrypt +.hidden aesni_gcm_decrypt +.type aesni_gcm_decrypt,@function +.align 32 +aesni_gcm_decrypt: +.cfi_startproc + +_CET_ENDBR + xorq %rax,%rax + + + + cmpq $0x60,%rdx + jb .Lgcm_dec_abort + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + + movq %rsp,%rbp +.cfi_def_cfa_register %rbp + pushq %rbx +.cfi_offset %rbx,-24 + + pushq %r12 +.cfi_offset %r12,-32 + + pushq %r13 +.cfi_offset %r13,-40 + + pushq %r14 +.cfi_offset %r14,-48 + + pushq %r15 +.cfi_offset %r15,-56 + + vzeroupper + + movq 16(%rbp),%r12 + vmovdqu (%r8),%xmm1 + addq $-128,%rsp + movl 12(%r8),%ebx + leaq .Lbswap_mask(%rip),%r11 + leaq -128(%rcx),%r14 + movq $0xf80,%r15 + vmovdqu (%r12),%xmm8 + andq $-128,%rsp + vmovdqu (%r11),%xmm0 + leaq 128(%rcx),%rcx + leaq 32(%r9),%r9 + movl 240-128(%rcx),%r10d + vpshufb %xmm0,%xmm8,%xmm8 + + andq %r15,%r14 + andq %rsp,%r15 + subq %r14,%r15 + jc .Ldec_no_key_aliasing + cmpq $768,%r15 + jnc .Ldec_no_key_aliasing + subq %r15,%rsp +.Ldec_no_key_aliasing: + + vmovdqu 80(%rdi),%xmm7 + movq %rdi,%r14 + vmovdqu 64(%rdi),%xmm4 + + + + + + + + leaq -192(%rdi,%rdx,1),%r15 + + vmovdqu 48(%rdi),%xmm5 + shrq $4,%rdx + xorq %rax,%rax + vmovdqu 32(%rdi),%xmm6 + vpshufb %xmm0,%xmm7,%xmm7 + vmovdqu 16(%rdi),%xmm2 + vpshufb %xmm0,%xmm4,%xmm4 + vmovdqu (%rdi),%xmm3 + vpshufb %xmm0,%xmm5,%xmm5 + vmovdqu %xmm4,48(%rsp) + vpshufb %xmm0,%xmm6,%xmm6 + vmovdqu %xmm5,64(%rsp) + vpshufb %xmm0,%xmm2,%xmm2 + vmovdqu %xmm6,80(%rsp) + vpshufb %xmm0,%xmm3,%xmm3 + vmovdqu %xmm2,96(%rsp) + vmovdqu %xmm3,112(%rsp) + + call _aesni_ctr32_ghash_6x + + movq 16(%rbp),%r12 + vmovups %xmm9,-96(%rsi) + vmovups %xmm10,-80(%rsi) + vmovups %xmm11,-64(%rsi) + vmovups 
%xmm12,-48(%rsi) + vmovups %xmm13,-32(%rsi) + vmovups %xmm14,-16(%rsi) + + vpshufb (%r11),%xmm8,%xmm8 + vmovdqu %xmm8,(%r12) + + vzeroupper + leaq -40(%rbp),%rsp +.cfi_def_cfa %rsp, 0x38 + popq %r15 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r15 + popq %r14 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r14 + popq %r13 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r13 + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + popq %rbx +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbx + popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbp +.Lgcm_dec_abort: + ret + +.cfi_endproc +.size aesni_gcm_decrypt,.-aesni_gcm_decrypt +.type _aesni_ctr32_6x,@function +.align 32 +_aesni_ctr32_6x: +.cfi_startproc + vmovdqu 0-128(%rcx),%xmm4 + vmovdqu 32(%r11),%xmm2 + leaq -1(%r10),%r13 + vmovups 16-128(%rcx),%xmm15 + leaq 32-128(%rcx),%r12 + vpxor %xmm4,%xmm1,%xmm9 + addl $100663296,%ebx + jc .Lhandle_ctr32_2 + vpaddb %xmm2,%xmm1,%xmm10 + vpaddb %xmm2,%xmm10,%xmm11 + vpxor %xmm4,%xmm10,%xmm10 + vpaddb %xmm2,%xmm11,%xmm12 + vpxor %xmm4,%xmm11,%xmm11 + vpaddb %xmm2,%xmm12,%xmm13 + vpxor %xmm4,%xmm12,%xmm12 + vpaddb %xmm2,%xmm13,%xmm14 + vpxor %xmm4,%xmm13,%xmm13 + vpaddb %xmm2,%xmm14,%xmm1 + vpxor %xmm4,%xmm14,%xmm14 + jmp .Loop_ctr32 + +.align 16 +.Loop_ctr32: + vaesenc %xmm15,%xmm9,%xmm9 + vaesenc %xmm15,%xmm10,%xmm10 + vaesenc %xmm15,%xmm11,%xmm11 + vaesenc %xmm15,%xmm12,%xmm12 + vaesenc %xmm15,%xmm13,%xmm13 + vaesenc %xmm15,%xmm14,%xmm14 + vmovups (%r12),%xmm15 + leaq 16(%r12),%r12 + decl %r13d + jnz .Loop_ctr32 + + vmovdqu (%r12),%xmm3 + vaesenc %xmm15,%xmm9,%xmm9 + vpxor 0(%rdi),%xmm3,%xmm4 + vaesenc %xmm15,%xmm10,%xmm10 + vpxor 16(%rdi),%xmm3,%xmm5 + vaesenc %xmm15,%xmm11,%xmm11 + vpxor 32(%rdi),%xmm3,%xmm6 + vaesenc %xmm15,%xmm12,%xmm12 + vpxor 48(%rdi),%xmm3,%xmm8 + vaesenc %xmm15,%xmm13,%xmm13 + vpxor 64(%rdi),%xmm3,%xmm2 + vaesenc %xmm15,%xmm14,%xmm14 + vpxor 80(%rdi),%xmm3,%xmm3 + leaq 96(%rdi),%rdi + + vaesenclast %xmm4,%xmm9,%xmm9 + vaesenclast %xmm5,%xmm10,%xmm10 + vaesenclast 
%xmm6,%xmm11,%xmm11 + vaesenclast %xmm8,%xmm12,%xmm12 + vaesenclast %xmm2,%xmm13,%xmm13 + vaesenclast %xmm3,%xmm14,%xmm14 + vmovups %xmm9,0(%rsi) + vmovups %xmm10,16(%rsi) + vmovups %xmm11,32(%rsi) + vmovups %xmm12,48(%rsi) + vmovups %xmm13,64(%rsi) + vmovups %xmm14,80(%rsi) + leaq 96(%rsi),%rsi + + ret +.align 32 +.Lhandle_ctr32_2: + vpshufb %xmm0,%xmm1,%xmm6 + vmovdqu 48(%r11),%xmm5 + vpaddd 64(%r11),%xmm6,%xmm10 + vpaddd %xmm5,%xmm6,%xmm11 + vpaddd %xmm5,%xmm10,%xmm12 + vpshufb %xmm0,%xmm10,%xmm10 + vpaddd %xmm5,%xmm11,%xmm13 + vpshufb %xmm0,%xmm11,%xmm11 + vpxor %xmm4,%xmm10,%xmm10 + vpaddd %xmm5,%xmm12,%xmm14 + vpshufb %xmm0,%xmm12,%xmm12 + vpxor %xmm4,%xmm11,%xmm11 + vpaddd %xmm5,%xmm13,%xmm1 + vpshufb %xmm0,%xmm13,%xmm13 + vpxor %xmm4,%xmm12,%xmm12 + vpshufb %xmm0,%xmm14,%xmm14 + vpxor %xmm4,%xmm13,%xmm13 + vpshufb %xmm0,%xmm1,%xmm1 + vpxor %xmm4,%xmm14,%xmm14 + jmp .Loop_ctr32 +.cfi_endproc +.size _aesni_ctr32_6x,.-_aesni_ctr32_6x + +.globl aesni_gcm_encrypt +.hidden aesni_gcm_encrypt +.type aesni_gcm_encrypt,@function +.align 32 +aesni_gcm_encrypt: +.cfi_startproc + +_CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST +.extern BORINGSSL_function_hit +.hidden BORINGSSL_function_hit + movb $1,BORINGSSL_function_hit+2(%rip) +#endif + xorq %rax,%rax + + + + + cmpq $288,%rdx + jb .Lgcm_enc_abort + + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + + movq %rsp,%rbp +.cfi_def_cfa_register %rbp + pushq %rbx +.cfi_offset %rbx,-24 + + pushq %r12 +.cfi_offset %r12,-32 + + pushq %r13 +.cfi_offset %r13,-40 + + pushq %r14 +.cfi_offset %r14,-48 + + pushq %r15 +.cfi_offset %r15,-56 + + vzeroupper + + vmovdqu (%r8),%xmm1 + addq $-128,%rsp + movl 12(%r8),%ebx + leaq .Lbswap_mask(%rip),%r11 + leaq -128(%rcx),%r14 + movq $0xf80,%r15 + leaq 128(%rcx),%rcx + vmovdqu (%r11),%xmm0 + andq $-128,%rsp + movl 240-128(%rcx),%r10d + + andq %r15,%r14 + andq %rsp,%r15 + subq %r14,%r15 + jc .Lenc_no_key_aliasing + cmpq $768,%r15 + jnc .Lenc_no_key_aliasing + subq %r15,%rsp 
+.Lenc_no_key_aliasing: + + movq %rsi,%r14 + + + + + + + + + leaq -192(%rsi,%rdx,1),%r15 + + shrq $4,%rdx + + call _aesni_ctr32_6x + vpshufb %xmm0,%xmm9,%xmm8 + vpshufb %xmm0,%xmm10,%xmm2 + vmovdqu %xmm8,112(%rsp) + vpshufb %xmm0,%xmm11,%xmm4 + vmovdqu %xmm2,96(%rsp) + vpshufb %xmm0,%xmm12,%xmm5 + vmovdqu %xmm4,80(%rsp) + vpshufb %xmm0,%xmm13,%xmm6 + vmovdqu %xmm5,64(%rsp) + vpshufb %xmm0,%xmm14,%xmm7 + vmovdqu %xmm6,48(%rsp) + + call _aesni_ctr32_6x + + movq 16(%rbp),%r12 + leaq 32(%r9),%r9 + vmovdqu (%r12),%xmm8 + subq $12,%rdx + movq $192,%rax + vpshufb %xmm0,%xmm8,%xmm8 + + call _aesni_ctr32_ghash_6x + vmovdqu 32(%rsp),%xmm7 + vmovdqu (%r11),%xmm0 + vmovdqu 0-32(%r9),%xmm3 + vpunpckhqdq %xmm7,%xmm7,%xmm1 + vmovdqu 32-32(%r9),%xmm15 + vmovups %xmm9,-96(%rsi) + vpshufb %xmm0,%xmm9,%xmm9 + vpxor %xmm7,%xmm1,%xmm1 + vmovups %xmm10,-80(%rsi) + vpshufb %xmm0,%xmm10,%xmm10 + vmovups %xmm11,-64(%rsi) + vpshufb %xmm0,%xmm11,%xmm11 + vmovups %xmm12,-48(%rsi) + vpshufb %xmm0,%xmm12,%xmm12 + vmovups %xmm13,-32(%rsi) + vpshufb %xmm0,%xmm13,%xmm13 + vmovups %xmm14,-16(%rsi) + vpshufb %xmm0,%xmm14,%xmm14 + vmovdqu %xmm9,16(%rsp) + vmovdqu 48(%rsp),%xmm6 + vmovdqu 16-32(%r9),%xmm0 + vpunpckhqdq %xmm6,%xmm6,%xmm2 + vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5 + vpxor %xmm6,%xmm2,%xmm2 + vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 + vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 + + vmovdqu 64(%rsp),%xmm9 + vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4 + vmovdqu 48-32(%r9),%xmm3 + vpxor %xmm5,%xmm4,%xmm4 + vpunpckhqdq %xmm9,%xmm9,%xmm5 + vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6 + vpxor %xmm9,%xmm5,%xmm5 + vpxor %xmm7,%xmm6,%xmm6 + vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 + vmovdqu 80-32(%r9),%xmm15 + vpxor %xmm1,%xmm2,%xmm2 + + vmovdqu 80(%rsp),%xmm1 + vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7 + vmovdqu 64-32(%r9),%xmm0 + vpxor %xmm4,%xmm7,%xmm7 + vpunpckhqdq %xmm1,%xmm1,%xmm4 + vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9 + vpxor %xmm1,%xmm4,%xmm4 + vpxor %xmm6,%xmm9,%xmm9 + vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5 + vpxor %xmm2,%xmm5,%xmm5 + + 
vmovdqu 96(%rsp),%xmm2 + vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6 + vmovdqu 96-32(%r9),%xmm3 + vpxor %xmm7,%xmm6,%xmm6 + vpunpckhqdq %xmm2,%xmm2,%xmm7 + vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1 + vpxor %xmm2,%xmm7,%xmm7 + vpxor %xmm9,%xmm1,%xmm1 + vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4 + vmovdqu 128-32(%r9),%xmm15 + vpxor %xmm5,%xmm4,%xmm4 + + vpxor 112(%rsp),%xmm8,%xmm8 + vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5 + vmovdqu 112-32(%r9),%xmm0 + vpunpckhqdq %xmm8,%xmm8,%xmm9 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2 + vpxor %xmm8,%xmm9,%xmm9 + vpxor %xmm1,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7 + vpxor %xmm4,%xmm7,%xmm4 + + vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6 + vmovdqu 0-32(%r9),%xmm3 + vpunpckhqdq %xmm14,%xmm14,%xmm1 + vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8 + vpxor %xmm14,%xmm1,%xmm1 + vpxor %xmm5,%xmm6,%xmm5 + vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9 + vmovdqu 32-32(%r9),%xmm15 + vpxor %xmm2,%xmm8,%xmm7 + vpxor %xmm4,%xmm9,%xmm6 + + vmovdqu 16-32(%r9),%xmm0 + vpxor %xmm5,%xmm7,%xmm9 + vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4 + vpxor %xmm9,%xmm6,%xmm6 + vpunpckhqdq %xmm13,%xmm13,%xmm2 + vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14 + vpxor %xmm13,%xmm2,%xmm2 + vpslldq $8,%xmm6,%xmm9 + vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 + vpxor %xmm9,%xmm5,%xmm8 + vpsrldq $8,%xmm6,%xmm6 + vpxor %xmm6,%xmm7,%xmm7 + + vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5 + vmovdqu 48-32(%r9),%xmm3 + vpxor %xmm4,%xmm5,%xmm5 + vpunpckhqdq %xmm12,%xmm12,%xmm9 + vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13 + vpxor %xmm12,%xmm9,%xmm9 + vpxor %xmm14,%xmm13,%xmm13 + vpalignr $8,%xmm8,%xmm8,%xmm14 + vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 + vmovdqu 80-32(%r9),%xmm15 + vpxor %xmm1,%xmm2,%xmm2 + + vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4 + vmovdqu 64-32(%r9),%xmm0 + vpxor %xmm5,%xmm4,%xmm4 + vpunpckhqdq %xmm11,%xmm11,%xmm1 + vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12 + vpxor %xmm11,%xmm1,%xmm1 + vpxor %xmm13,%xmm12,%xmm12 + vxorps 16(%rsp),%xmm7,%xmm7 + vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9 + vpxor %xmm2,%xmm9,%xmm9 + + vpclmulqdq 
$0x10,16(%r11),%xmm8,%xmm8 + vxorps %xmm14,%xmm8,%xmm8 + + vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5 + vmovdqu 96-32(%r9),%xmm3 + vpxor %xmm4,%xmm5,%xmm5 + vpunpckhqdq %xmm10,%xmm10,%xmm2 + vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11 + vpxor %xmm10,%xmm2,%xmm2 + vpalignr $8,%xmm8,%xmm8,%xmm14 + vpxor %xmm12,%xmm11,%xmm11 + vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1 + vmovdqu 128-32(%r9),%xmm15 + vpxor %xmm9,%xmm1,%xmm1 + + vxorps %xmm7,%xmm14,%xmm14 + vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 + vxorps %xmm14,%xmm8,%xmm8 + + vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4 + vmovdqu 112-32(%r9),%xmm0 + vpxor %xmm5,%xmm4,%xmm4 + vpunpckhqdq %xmm8,%xmm8,%xmm9 + vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10 + vpxor %xmm8,%xmm9,%xmm9 + vpxor %xmm11,%xmm10,%xmm10 + vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2 + vpxor %xmm1,%xmm2,%xmm2 + + vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5 + vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7 + vpxor %xmm4,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6 + vpxor %xmm10,%xmm7,%xmm7 + vpxor %xmm2,%xmm6,%xmm6 + + vpxor %xmm5,%xmm7,%xmm4 + vpxor %xmm4,%xmm6,%xmm6 + vpslldq $8,%xmm6,%xmm1 + vmovdqu 16(%r11),%xmm3 + vpsrldq $8,%xmm6,%xmm6 + vpxor %xmm1,%xmm5,%xmm8 + vpxor %xmm6,%xmm7,%xmm7 + + vpalignr $8,%xmm8,%xmm8,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 + vpxor %xmm2,%xmm8,%xmm8 + + vpalignr $8,%xmm8,%xmm8,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 + vpxor %xmm7,%xmm2,%xmm2 + vpxor %xmm2,%xmm8,%xmm8 + movq 16(%rbp),%r12 + vpshufb (%r11),%xmm8,%xmm8 + vmovdqu %xmm8,(%r12) + + vzeroupper + leaq -40(%rbp),%rsp +.cfi_def_cfa %rsp, 0x38 + popq %r15 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r15 + popq %r14 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r14 + popq %r13 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r13 + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + popq %rbx +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbx + popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbp +.Lgcm_enc_abort: + ret + +.cfi_endproc +.size aesni_gcm_encrypt,.-aesni_gcm_encrypt +.section .rodata +.align 64 +.Lbswap_mask: +.byte 
15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.Lpoly: +.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +.Lone_msb: +.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +.Ltwo_lsb: +.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +.Lone_lsb: +.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 64 +.text +#endif diff --git a/third_party/boringssl/gen/bcm/aesni-gcm-x86_64-win.asm b/third_party/boringssl/gen/bcm/aesni-gcm-x86_64-win.asm new file mode 100644 index 00000000..3f7241ad --- /dev/null +++ b/third_party/boringssl/gen/bcm/aesni-gcm-x86_64-win.asm @@ -0,0 +1,1106 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_internal_x86_64_win_asm.inc" +%endif +section .text code align=64 + + + +ALIGN 32 +_aesni_ctr32_ghash_6x: + + vmovdqu xmm2,XMMWORD[32+r11] + sub r8,6 + vpxor xmm4,xmm4,xmm4 + vmovdqu xmm15,XMMWORD[((0-128))+r9] + vpaddb xmm10,xmm1,xmm2 + vpaddb xmm11,xmm10,xmm2 + vpaddb xmm12,xmm11,xmm2 + vpaddb xmm13,xmm12,xmm2 + vpaddb xmm14,xmm13,xmm2 + vpxor xmm9,xmm1,xmm15 + vmovdqu XMMWORD[(16+8)+rsp],xmm4 + jmp NEAR $L$oop6x + +ALIGN 32 +$L$oop6x: + add ebx,100663296 + jc NEAR $L$handle_ctr32 + vmovdqu xmm3,XMMWORD[((0-32))+rsi] + vpaddb xmm1,xmm14,xmm2 + vpxor xmm10,xmm10,xmm15 + vpxor xmm11,xmm11,xmm15 + +$L$resume_ctr32: + vmovdqu XMMWORD[rdi],xmm1 + vpclmulqdq xmm5,xmm7,xmm3,0x10 + vpxor xmm12,xmm12,xmm15 + vmovups xmm2,XMMWORD[((16-128))+r9] + vpclmulqdq xmm6,xmm7,xmm3,0x01 + + + + + + + + + + + + + + + + + + xor r12,r12 + cmp r15,r14 + + vaesenc xmm9,xmm9,xmm2 + vmovdqu xmm0,XMMWORD[((48+8))+rsp] + vpxor xmm13,xmm13,xmm15 + vpclmulqdq 
xmm1,xmm7,xmm3,0x00 + vaesenc xmm10,xmm10,xmm2 + vpxor xmm14,xmm14,xmm15 + setnc r12b + vpclmulqdq xmm7,xmm7,xmm3,0x11 + vaesenc xmm11,xmm11,xmm2 + vmovdqu xmm3,XMMWORD[((16-32))+rsi] + neg r12 + vaesenc xmm12,xmm12,xmm2 + vpxor xmm6,xmm6,xmm5 + vpclmulqdq xmm5,xmm0,xmm3,0x00 + vpxor xmm8,xmm8,xmm4 + vaesenc xmm13,xmm13,xmm2 + vpxor xmm4,xmm1,xmm5 + and r12,0x60 + vmovups xmm15,XMMWORD[((32-128))+r9] + vpclmulqdq xmm1,xmm0,xmm3,0x10 + vaesenc xmm14,xmm14,xmm2 + + vpclmulqdq xmm2,xmm0,xmm3,0x01 + lea r14,[r12*1+r14] + vaesenc xmm9,xmm9,xmm15 + vpxor xmm8,xmm8,XMMWORD[((16+8))+rsp] + vpclmulqdq xmm3,xmm0,xmm3,0x11 + vmovdqu xmm0,XMMWORD[((64+8))+rsp] + vaesenc xmm10,xmm10,xmm15 + movbe r13,QWORD[88+r14] + vaesenc xmm11,xmm11,xmm15 + movbe r12,QWORD[80+r14] + vaesenc xmm12,xmm12,xmm15 + mov QWORD[((32+8))+rsp],r13 + vaesenc xmm13,xmm13,xmm15 + mov QWORD[((40+8))+rsp],r12 + vmovdqu xmm5,XMMWORD[((48-32))+rsi] + vaesenc xmm14,xmm14,xmm15 + + vmovups xmm15,XMMWORD[((48-128))+r9] + vpxor xmm6,xmm6,xmm1 + vpclmulqdq xmm1,xmm0,xmm5,0x00 + vaesenc xmm9,xmm9,xmm15 + vpxor xmm6,xmm6,xmm2 + vpclmulqdq xmm2,xmm0,xmm5,0x10 + vaesenc xmm10,xmm10,xmm15 + vpxor xmm7,xmm7,xmm3 + vpclmulqdq xmm3,xmm0,xmm5,0x01 + vaesenc xmm11,xmm11,xmm15 + vpclmulqdq xmm5,xmm0,xmm5,0x11 + vmovdqu xmm0,XMMWORD[((80+8))+rsp] + vaesenc xmm12,xmm12,xmm15 + vaesenc xmm13,xmm13,xmm15 + vpxor xmm4,xmm4,xmm1 + vmovdqu xmm1,XMMWORD[((64-32))+rsi] + vaesenc xmm14,xmm14,xmm15 + + vmovups xmm15,XMMWORD[((64-128))+r9] + vpxor xmm6,xmm6,xmm2 + vpclmulqdq xmm2,xmm0,xmm1,0x00 + vaesenc xmm9,xmm9,xmm15 + vpxor xmm6,xmm6,xmm3 + vpclmulqdq xmm3,xmm0,xmm1,0x10 + vaesenc xmm10,xmm10,xmm15 + movbe r13,QWORD[72+r14] + vpxor xmm7,xmm7,xmm5 + vpclmulqdq xmm5,xmm0,xmm1,0x01 + vaesenc xmm11,xmm11,xmm15 + movbe r12,QWORD[64+r14] + vpclmulqdq xmm1,xmm0,xmm1,0x11 + vmovdqu xmm0,XMMWORD[((96+8))+rsp] + vaesenc xmm12,xmm12,xmm15 + mov QWORD[((48+8))+rsp],r13 + vaesenc xmm13,xmm13,xmm15 + mov QWORD[((56+8))+rsp],r12 + vpxor 
xmm4,xmm4,xmm2 + vmovdqu xmm2,XMMWORD[((96-32))+rsi] + vaesenc xmm14,xmm14,xmm15 + + vmovups xmm15,XMMWORD[((80-128))+r9] + vpxor xmm6,xmm6,xmm3 + vpclmulqdq xmm3,xmm0,xmm2,0x00 + vaesenc xmm9,xmm9,xmm15 + vpxor xmm6,xmm6,xmm5 + vpclmulqdq xmm5,xmm0,xmm2,0x10 + vaesenc xmm10,xmm10,xmm15 + movbe r13,QWORD[56+r14] + vpxor xmm7,xmm7,xmm1 + vpclmulqdq xmm1,xmm0,xmm2,0x01 + vpxor xmm8,xmm8,XMMWORD[((112+8))+rsp] + vaesenc xmm11,xmm11,xmm15 + movbe r12,QWORD[48+r14] + vpclmulqdq xmm2,xmm0,xmm2,0x11 + vaesenc xmm12,xmm12,xmm15 + mov QWORD[((64+8))+rsp],r13 + vaesenc xmm13,xmm13,xmm15 + mov QWORD[((72+8))+rsp],r12 + vpxor xmm4,xmm4,xmm3 + vmovdqu xmm3,XMMWORD[((112-32))+rsi] + vaesenc xmm14,xmm14,xmm15 + + vmovups xmm15,XMMWORD[((96-128))+r9] + vpxor xmm6,xmm6,xmm5 + vpclmulqdq xmm5,xmm8,xmm3,0x10 + vaesenc xmm9,xmm9,xmm15 + vpxor xmm6,xmm6,xmm1 + vpclmulqdq xmm1,xmm8,xmm3,0x01 + vaesenc xmm10,xmm10,xmm15 + movbe r13,QWORD[40+r14] + vpxor xmm7,xmm7,xmm2 + vpclmulqdq xmm2,xmm8,xmm3,0x00 + vaesenc xmm11,xmm11,xmm15 + movbe r12,QWORD[32+r14] + vpclmulqdq xmm8,xmm8,xmm3,0x11 + vaesenc xmm12,xmm12,xmm15 + mov QWORD[((80+8))+rsp],r13 + vaesenc xmm13,xmm13,xmm15 + mov QWORD[((88+8))+rsp],r12 + vpxor xmm6,xmm6,xmm5 + vaesenc xmm14,xmm14,xmm15 + vpxor xmm6,xmm6,xmm1 + + vmovups xmm15,XMMWORD[((112-128))+r9] + vpslldq xmm5,xmm6,8 + vpxor xmm4,xmm4,xmm2 + vmovdqu xmm3,XMMWORD[16+r11] + + vaesenc xmm9,xmm9,xmm15 + vpxor xmm7,xmm7,xmm8 + vaesenc xmm10,xmm10,xmm15 + vpxor xmm4,xmm4,xmm5 + movbe r13,QWORD[24+r14] + vaesenc xmm11,xmm11,xmm15 + movbe r12,QWORD[16+r14] + vpalignr xmm0,xmm4,xmm4,8 + vpclmulqdq xmm4,xmm4,xmm3,0x10 + mov QWORD[((96+8))+rsp],r13 + vaesenc xmm12,xmm12,xmm15 + mov QWORD[((104+8))+rsp],r12 + vaesenc xmm13,xmm13,xmm15 + vmovups xmm1,XMMWORD[((128-128))+r9] + vaesenc xmm14,xmm14,xmm15 + + vaesenc xmm9,xmm9,xmm1 + vmovups xmm15,XMMWORD[((144-128))+r9] + vaesenc xmm10,xmm10,xmm1 + vpsrldq xmm6,xmm6,8 + vaesenc xmm11,xmm11,xmm1 + vpxor xmm7,xmm7,xmm6 + vaesenc 
xmm12,xmm12,xmm1 + vpxor xmm4,xmm4,xmm0 + movbe r13,QWORD[8+r14] + vaesenc xmm13,xmm13,xmm1 + movbe r12,QWORD[r14] + vaesenc xmm14,xmm14,xmm1 + vmovups xmm1,XMMWORD[((160-128))+r9] + cmp r10d,11 + jb NEAR $L$enc_tail + + vaesenc xmm9,xmm9,xmm15 + vaesenc xmm10,xmm10,xmm15 + vaesenc xmm11,xmm11,xmm15 + vaesenc xmm12,xmm12,xmm15 + vaesenc xmm13,xmm13,xmm15 + vaesenc xmm14,xmm14,xmm15 + + vaesenc xmm9,xmm9,xmm1 + vaesenc xmm10,xmm10,xmm1 + vaesenc xmm11,xmm11,xmm1 + vaesenc xmm12,xmm12,xmm1 + vaesenc xmm13,xmm13,xmm1 + vmovups xmm15,XMMWORD[((176-128))+r9] + vaesenc xmm14,xmm14,xmm1 + vmovups xmm1,XMMWORD[((192-128))+r9] + je NEAR $L$enc_tail + + vaesenc xmm9,xmm9,xmm15 + vaesenc xmm10,xmm10,xmm15 + vaesenc xmm11,xmm11,xmm15 + vaesenc xmm12,xmm12,xmm15 + vaesenc xmm13,xmm13,xmm15 + vaesenc xmm14,xmm14,xmm15 + + vaesenc xmm9,xmm9,xmm1 + vaesenc xmm10,xmm10,xmm1 + vaesenc xmm11,xmm11,xmm1 + vaesenc xmm12,xmm12,xmm1 + vaesenc xmm13,xmm13,xmm1 + vmovups xmm15,XMMWORD[((208-128))+r9] + vaesenc xmm14,xmm14,xmm1 + vmovups xmm1,XMMWORD[((224-128))+r9] + jmp NEAR $L$enc_tail + +ALIGN 32 +$L$handle_ctr32: + vmovdqu xmm0,XMMWORD[r11] + vpshufb xmm6,xmm1,xmm0 + vmovdqu xmm5,XMMWORD[48+r11] + vpaddd xmm10,xmm6,XMMWORD[64+r11] + vpaddd xmm11,xmm6,xmm5 + vmovdqu xmm3,XMMWORD[((0-32))+rsi] + vpaddd xmm12,xmm10,xmm5 + vpshufb xmm10,xmm10,xmm0 + vpaddd xmm13,xmm11,xmm5 + vpshufb xmm11,xmm11,xmm0 + vpxor xmm10,xmm10,xmm15 + vpaddd xmm14,xmm12,xmm5 + vpshufb xmm12,xmm12,xmm0 + vpxor xmm11,xmm11,xmm15 + vpaddd xmm1,xmm13,xmm5 + vpshufb xmm13,xmm13,xmm0 + vpshufb xmm14,xmm14,xmm0 + vpshufb xmm1,xmm1,xmm0 + jmp NEAR $L$resume_ctr32 + +ALIGN 32 +$L$enc_tail: + vaesenc xmm9,xmm9,xmm15 + vmovdqu XMMWORD[(16+8)+rsp],xmm7 + vpalignr xmm8,xmm4,xmm4,8 + vaesenc xmm10,xmm10,xmm15 + vpclmulqdq xmm4,xmm4,xmm3,0x10 + vpxor xmm2,xmm1,XMMWORD[rcx] + vaesenc xmm11,xmm11,xmm15 + vpxor xmm0,xmm1,XMMWORD[16+rcx] + vaesenc xmm12,xmm12,xmm15 + vpxor xmm5,xmm1,XMMWORD[32+rcx] + vaesenc xmm13,xmm13,xmm15 + 
vpxor xmm6,xmm1,XMMWORD[48+rcx] + vaesenc xmm14,xmm14,xmm15 + vpxor xmm7,xmm1,XMMWORD[64+rcx] + vpxor xmm3,xmm1,XMMWORD[80+rcx] + vmovdqu xmm1,XMMWORD[rdi] + + vaesenclast xmm9,xmm9,xmm2 + vmovdqu xmm2,XMMWORD[32+r11] + vaesenclast xmm10,xmm10,xmm0 + vpaddb xmm0,xmm1,xmm2 + mov QWORD[((112+8))+rsp],r13 + lea rcx,[96+rcx] + + prefetcht0 [512+rcx] + prefetcht0 [576+rcx] + vaesenclast xmm11,xmm11,xmm5 + vpaddb xmm5,xmm0,xmm2 + mov QWORD[((120+8))+rsp],r12 + lea rdx,[96+rdx] + vmovdqu xmm15,XMMWORD[((0-128))+r9] + vaesenclast xmm12,xmm12,xmm6 + vpaddb xmm6,xmm5,xmm2 + vaesenclast xmm13,xmm13,xmm7 + vpaddb xmm7,xmm6,xmm2 + vaesenclast xmm14,xmm14,xmm3 + vpaddb xmm3,xmm7,xmm2 + + add rax,0x60 + sub r8,0x6 + jc NEAR $L$6x_done + + vmovups XMMWORD[(-96)+rdx],xmm9 + vpxor xmm9,xmm1,xmm15 + vmovups XMMWORD[(-80)+rdx],xmm10 + vmovdqa xmm10,xmm0 + vmovups XMMWORD[(-64)+rdx],xmm11 + vmovdqa xmm11,xmm5 + vmovups XMMWORD[(-48)+rdx],xmm12 + vmovdqa xmm12,xmm6 + vmovups XMMWORD[(-32)+rdx],xmm13 + vmovdqa xmm13,xmm7 + vmovups XMMWORD[(-16)+rdx],xmm14 + vmovdqa xmm14,xmm3 + vmovdqu xmm7,XMMWORD[((32+8))+rsp] + jmp NEAR $L$oop6x + +$L$6x_done: + vpxor xmm8,xmm8,XMMWORD[((16+8))+rsp] + vpxor xmm8,xmm8,xmm4 + + ret + + +global aesni_gcm_decrypt + +ALIGN 32 +aesni_gcm_decrypt: + +$L$SEH_begin_aesni_gcm_decrypt_1: +_CET_ENDBR + xor rax,rax + + + + cmp r8,0x60 + jb NEAR $L$gcm_dec_abort + + push rbp + +$L$SEH_prologue_aesni_gcm_decrypt_2: + mov rbp,rsp + + push rbx + +$L$SEH_prologue_aesni_gcm_decrypt_3: + push r12 + +$L$SEH_prologue_aesni_gcm_decrypt_4: + push r13 + +$L$SEH_prologue_aesni_gcm_decrypt_5: + push r14 + +$L$SEH_prologue_aesni_gcm_decrypt_6: + push r15 + +$L$SEH_prologue_aesni_gcm_decrypt_7: + lea rsp,[((-168))+rsp] +$L$SEH_prologue_aesni_gcm_decrypt_8: +$L$SEH_prologue_aesni_gcm_decrypt_9: + + + + mov QWORD[16+rbp],rdi +$L$SEH_prologue_aesni_gcm_decrypt_10: + mov QWORD[24+rbp],rsi +$L$SEH_prologue_aesni_gcm_decrypt_11: + mov rdi,QWORD[48+rbp] + mov rsi,QWORD[56+rbp] + + 
movaps XMMWORD[(-208)+rbp],xmm6 +$L$SEH_prologue_aesni_gcm_decrypt_12: + movaps XMMWORD[(-192)+rbp],xmm7 +$L$SEH_prologue_aesni_gcm_decrypt_13: + movaps XMMWORD[(-176)+rbp],xmm8 +$L$SEH_prologue_aesni_gcm_decrypt_14: + movaps XMMWORD[(-160)+rbp],xmm9 +$L$SEH_prologue_aesni_gcm_decrypt_15: + movaps XMMWORD[(-144)+rbp],xmm10 +$L$SEH_prologue_aesni_gcm_decrypt_16: + movaps XMMWORD[(-128)+rbp],xmm11 +$L$SEH_prologue_aesni_gcm_decrypt_17: + movaps XMMWORD[(-112)+rbp],xmm12 +$L$SEH_prologue_aesni_gcm_decrypt_18: + movaps XMMWORD[(-96)+rbp],xmm13 +$L$SEH_prologue_aesni_gcm_decrypt_19: + movaps XMMWORD[(-80)+rbp],xmm14 +$L$SEH_prologue_aesni_gcm_decrypt_20: + movaps XMMWORD[(-64)+rbp],xmm15 +$L$SEH_prologue_aesni_gcm_decrypt_21: +$L$SEH_endprologue_aesni_gcm_decrypt_22: + vzeroupper + + mov r12,QWORD[64+rbp] + vmovdqu xmm1,XMMWORD[rdi] + add rsp,-128 + mov ebx,DWORD[12+rdi] + lea r11,[$L$bswap_mask] + lea r14,[((-128))+r9] + mov r15,0xf80 + vmovdqu xmm8,XMMWORD[r12] + and rsp,-128 + vmovdqu xmm0,XMMWORD[r11] + lea r9,[128+r9] + lea rsi,[32+rsi] + mov r10d,DWORD[((240-128))+r9] + vpshufb xmm8,xmm8,xmm0 + + and r14,r15 + and r15,rsp + sub r15,r14 + jc NEAR $L$dec_no_key_aliasing + cmp r15,768 + jnc NEAR $L$dec_no_key_aliasing + sub rsp,r15 +$L$dec_no_key_aliasing: + + vmovdqu xmm7,XMMWORD[80+rcx] + mov r14,rcx + vmovdqu xmm4,XMMWORD[64+rcx] + + + + + + + + lea r15,[((-192))+r8*1+rcx] + + vmovdqu xmm5,XMMWORD[48+rcx] + shr r8,4 + xor rax,rax + vmovdqu xmm6,XMMWORD[32+rcx] + vpshufb xmm7,xmm7,xmm0 + vmovdqu xmm2,XMMWORD[16+rcx] + vpshufb xmm4,xmm4,xmm0 + vmovdqu xmm3,XMMWORD[rcx] + vpshufb xmm5,xmm5,xmm0 + vmovdqu XMMWORD[48+rsp],xmm4 + vpshufb xmm6,xmm6,xmm0 + vmovdqu XMMWORD[64+rsp],xmm5 + vpshufb xmm2,xmm2,xmm0 + vmovdqu XMMWORD[80+rsp],xmm6 + vpshufb xmm3,xmm3,xmm0 + vmovdqu XMMWORD[96+rsp],xmm2 + vmovdqu XMMWORD[112+rsp],xmm3 + + call _aesni_ctr32_ghash_6x + + mov r12,QWORD[64+rbp] + vmovups XMMWORD[(-96)+rdx],xmm9 + vmovups XMMWORD[(-80)+rdx],xmm10 + vmovups 
XMMWORD[(-64)+rdx],xmm11 + vmovups XMMWORD[(-48)+rdx],xmm12 + vmovups XMMWORD[(-32)+rdx],xmm13 + vmovups XMMWORD[(-16)+rdx],xmm14 + + vpshufb xmm8,xmm8,XMMWORD[r11] + vmovdqu XMMWORD[r12],xmm8 + + vzeroupper + movaps xmm6,XMMWORD[((-208))+rbp] + movaps xmm7,XMMWORD[((-192))+rbp] + movaps xmm8,XMMWORD[((-176))+rbp] + movaps xmm9,XMMWORD[((-160))+rbp] + movaps xmm10,XMMWORD[((-144))+rbp] + movaps xmm11,XMMWORD[((-128))+rbp] + movaps xmm12,XMMWORD[((-112))+rbp] + movaps xmm13,XMMWORD[((-96))+rbp] + movaps xmm14,XMMWORD[((-80))+rbp] + movaps xmm15,XMMWORD[((-64))+rbp] + mov rdi,QWORD[16+rbp] + mov rsi,QWORD[24+rbp] + lea rsp,[((-40))+rbp] + + pop r15 + + pop r14 + + pop r13 + + pop r12 + + pop rbx + + pop rbp + +$L$gcm_dec_abort: + ret +$L$SEH_end_aesni_gcm_decrypt_23: + + + +ALIGN 32 +_aesni_ctr32_6x: + + vmovdqu xmm4,XMMWORD[((0-128))+r9] + vmovdqu xmm2,XMMWORD[32+r11] + lea r13,[((-1))+r10] + vmovups xmm15,XMMWORD[((16-128))+r9] + lea r12,[((32-128))+r9] + vpxor xmm9,xmm1,xmm4 + add ebx,100663296 + jc NEAR $L$handle_ctr32_2 + vpaddb xmm10,xmm1,xmm2 + vpaddb xmm11,xmm10,xmm2 + vpxor xmm10,xmm10,xmm4 + vpaddb xmm12,xmm11,xmm2 + vpxor xmm11,xmm11,xmm4 + vpaddb xmm13,xmm12,xmm2 + vpxor xmm12,xmm12,xmm4 + vpaddb xmm14,xmm13,xmm2 + vpxor xmm13,xmm13,xmm4 + vpaddb xmm1,xmm14,xmm2 + vpxor xmm14,xmm14,xmm4 + jmp NEAR $L$oop_ctr32 + +ALIGN 16 +$L$oop_ctr32: + vaesenc xmm9,xmm9,xmm15 + vaesenc xmm10,xmm10,xmm15 + vaesenc xmm11,xmm11,xmm15 + vaesenc xmm12,xmm12,xmm15 + vaesenc xmm13,xmm13,xmm15 + vaesenc xmm14,xmm14,xmm15 + vmovups xmm15,XMMWORD[r12] + lea r12,[16+r12] + dec r13d + jnz NEAR $L$oop_ctr32 + + vmovdqu xmm3,XMMWORD[r12] + vaesenc xmm9,xmm9,xmm15 + vpxor xmm4,xmm3,XMMWORD[rcx] + vaesenc xmm10,xmm10,xmm15 + vpxor xmm5,xmm3,XMMWORD[16+rcx] + vaesenc xmm11,xmm11,xmm15 + vpxor xmm6,xmm3,XMMWORD[32+rcx] + vaesenc xmm12,xmm12,xmm15 + vpxor xmm8,xmm3,XMMWORD[48+rcx] + vaesenc xmm13,xmm13,xmm15 + vpxor xmm2,xmm3,XMMWORD[64+rcx] + vaesenc xmm14,xmm14,xmm15 + vpxor 
xmm3,xmm3,XMMWORD[80+rcx] + lea rcx,[96+rcx] + + vaesenclast xmm9,xmm9,xmm4 + vaesenclast xmm10,xmm10,xmm5 + vaesenclast xmm11,xmm11,xmm6 + vaesenclast xmm12,xmm12,xmm8 + vaesenclast xmm13,xmm13,xmm2 + vaesenclast xmm14,xmm14,xmm3 + vmovups XMMWORD[rdx],xmm9 + vmovups XMMWORD[16+rdx],xmm10 + vmovups XMMWORD[32+rdx],xmm11 + vmovups XMMWORD[48+rdx],xmm12 + vmovups XMMWORD[64+rdx],xmm13 + vmovups XMMWORD[80+rdx],xmm14 + lea rdx,[96+rdx] + + ret +ALIGN 32 +$L$handle_ctr32_2: + vpshufb xmm6,xmm1,xmm0 + vmovdqu xmm5,XMMWORD[48+r11] + vpaddd xmm10,xmm6,XMMWORD[64+r11] + vpaddd xmm11,xmm6,xmm5 + vpaddd xmm12,xmm10,xmm5 + vpshufb xmm10,xmm10,xmm0 + vpaddd xmm13,xmm11,xmm5 + vpshufb xmm11,xmm11,xmm0 + vpxor xmm10,xmm10,xmm4 + vpaddd xmm14,xmm12,xmm5 + vpshufb xmm12,xmm12,xmm0 + vpxor xmm11,xmm11,xmm4 + vpaddd xmm1,xmm13,xmm5 + vpshufb xmm13,xmm13,xmm0 + vpxor xmm12,xmm12,xmm4 + vpshufb xmm14,xmm14,xmm0 + vpxor xmm13,xmm13,xmm4 + vpshufb xmm1,xmm1,xmm0 + vpxor xmm14,xmm14,xmm4 + jmp NEAR $L$oop_ctr32 + + + +global aesni_gcm_encrypt + +ALIGN 32 +aesni_gcm_encrypt: + +$L$SEH_begin_aesni_gcm_encrypt_1: +_CET_ENDBR +%ifdef BORINGSSL_DISPATCH_TEST +EXTERN BORINGSSL_function_hit + mov BYTE[((BORINGSSL_function_hit+2))],1 +%endif + xor rax,rax + + + + + cmp r8,0x60*3 + jb NEAR $L$gcm_enc_abort + + push rbp + +$L$SEH_prologue_aesni_gcm_encrypt_2: + mov rbp,rsp + + push rbx + +$L$SEH_prologue_aesni_gcm_encrypt_3: + push r12 + +$L$SEH_prologue_aesni_gcm_encrypt_4: + push r13 + +$L$SEH_prologue_aesni_gcm_encrypt_5: + push r14 + +$L$SEH_prologue_aesni_gcm_encrypt_6: + push r15 + +$L$SEH_prologue_aesni_gcm_encrypt_7: + lea rsp,[((-168))+rsp] +$L$SEH_prologue_aesni_gcm_encrypt_8: +$L$SEH_prologue_aesni_gcm_encrypt_9: + + + + mov QWORD[16+rbp],rdi +$L$SEH_prologue_aesni_gcm_encrypt_10: + mov QWORD[24+rbp],rsi +$L$SEH_prologue_aesni_gcm_encrypt_11: + mov rdi,QWORD[48+rbp] + mov rsi,QWORD[56+rbp] + + movaps XMMWORD[(-208)+rbp],xmm6 +$L$SEH_prologue_aesni_gcm_encrypt_12: + movaps 
XMMWORD[(-192)+rbp],xmm7 +$L$SEH_prologue_aesni_gcm_encrypt_13: + movaps XMMWORD[(-176)+rbp],xmm8 +$L$SEH_prologue_aesni_gcm_encrypt_14: + movaps XMMWORD[(-160)+rbp],xmm9 +$L$SEH_prologue_aesni_gcm_encrypt_15: + movaps XMMWORD[(-144)+rbp],xmm10 +$L$SEH_prologue_aesni_gcm_encrypt_16: + movaps XMMWORD[(-128)+rbp],xmm11 +$L$SEH_prologue_aesni_gcm_encrypt_17: + movaps XMMWORD[(-112)+rbp],xmm12 +$L$SEH_prologue_aesni_gcm_encrypt_18: + movaps XMMWORD[(-96)+rbp],xmm13 +$L$SEH_prologue_aesni_gcm_encrypt_19: + movaps XMMWORD[(-80)+rbp],xmm14 +$L$SEH_prologue_aesni_gcm_encrypt_20: + movaps XMMWORD[(-64)+rbp],xmm15 +$L$SEH_prologue_aesni_gcm_encrypt_21: +$L$SEH_endprologue_aesni_gcm_encrypt_22: + vzeroupper + + vmovdqu xmm1,XMMWORD[rdi] + add rsp,-128 + mov ebx,DWORD[12+rdi] + lea r11,[$L$bswap_mask] + lea r14,[((-128))+r9] + mov r15,0xf80 + lea r9,[128+r9] + vmovdqu xmm0,XMMWORD[r11] + and rsp,-128 + mov r10d,DWORD[((240-128))+r9] + + and r14,r15 + and r15,rsp + sub r15,r14 + jc NEAR $L$enc_no_key_aliasing + cmp r15,768 + jnc NEAR $L$enc_no_key_aliasing + sub rsp,r15 +$L$enc_no_key_aliasing: + + mov r14,rdx + + + + + + + + + lea r15,[((-192))+r8*1+rdx] + + shr r8,4 + + call _aesni_ctr32_6x + vpshufb xmm8,xmm9,xmm0 + vpshufb xmm2,xmm10,xmm0 + vmovdqu XMMWORD[112+rsp],xmm8 + vpshufb xmm4,xmm11,xmm0 + vmovdqu XMMWORD[96+rsp],xmm2 + vpshufb xmm5,xmm12,xmm0 + vmovdqu XMMWORD[80+rsp],xmm4 + vpshufb xmm6,xmm13,xmm0 + vmovdqu XMMWORD[64+rsp],xmm5 + vpshufb xmm7,xmm14,xmm0 + vmovdqu XMMWORD[48+rsp],xmm6 + + call _aesni_ctr32_6x + + mov r12,QWORD[64+rbp] + lea rsi,[32+rsi] + vmovdqu xmm8,XMMWORD[r12] + sub r8,12 + mov rax,0x60*2 + vpshufb xmm8,xmm8,xmm0 + + call _aesni_ctr32_ghash_6x + vmovdqu xmm7,XMMWORD[32+rsp] + vmovdqu xmm0,XMMWORD[r11] + vmovdqu xmm3,XMMWORD[((0-32))+rsi] + vpunpckhqdq xmm1,xmm7,xmm7 + vmovdqu xmm15,XMMWORD[((32-32))+rsi] + vmovups XMMWORD[(-96)+rdx],xmm9 + vpshufb xmm9,xmm9,xmm0 + vpxor xmm1,xmm1,xmm7 + vmovups XMMWORD[(-80)+rdx],xmm10 + vpshufb 
xmm10,xmm10,xmm0 + vmovups XMMWORD[(-64)+rdx],xmm11 + vpshufb xmm11,xmm11,xmm0 + vmovups XMMWORD[(-48)+rdx],xmm12 + vpshufb xmm12,xmm12,xmm0 + vmovups XMMWORD[(-32)+rdx],xmm13 + vpshufb xmm13,xmm13,xmm0 + vmovups XMMWORD[(-16)+rdx],xmm14 + vpshufb xmm14,xmm14,xmm0 + vmovdqu XMMWORD[16+rsp],xmm9 + vmovdqu xmm6,XMMWORD[48+rsp] + vmovdqu xmm0,XMMWORD[((16-32))+rsi] + vpunpckhqdq xmm2,xmm6,xmm6 + vpclmulqdq xmm5,xmm7,xmm3,0x00 + vpxor xmm2,xmm2,xmm6 + vpclmulqdq xmm7,xmm7,xmm3,0x11 + vpclmulqdq xmm1,xmm1,xmm15,0x00 + + vmovdqu xmm9,XMMWORD[64+rsp] + vpclmulqdq xmm4,xmm6,xmm0,0x00 + vmovdqu xmm3,XMMWORD[((48-32))+rsi] + vpxor xmm4,xmm4,xmm5 + vpunpckhqdq xmm5,xmm9,xmm9 + vpclmulqdq xmm6,xmm6,xmm0,0x11 + vpxor xmm5,xmm5,xmm9 + vpxor xmm6,xmm6,xmm7 + vpclmulqdq xmm2,xmm2,xmm15,0x10 + vmovdqu xmm15,XMMWORD[((80-32))+rsi] + vpxor xmm2,xmm2,xmm1 + + vmovdqu xmm1,XMMWORD[80+rsp] + vpclmulqdq xmm7,xmm9,xmm3,0x00 + vmovdqu xmm0,XMMWORD[((64-32))+rsi] + vpxor xmm7,xmm7,xmm4 + vpunpckhqdq xmm4,xmm1,xmm1 + vpclmulqdq xmm9,xmm9,xmm3,0x11 + vpxor xmm4,xmm4,xmm1 + vpxor xmm9,xmm9,xmm6 + vpclmulqdq xmm5,xmm5,xmm15,0x00 + vpxor xmm5,xmm5,xmm2 + + vmovdqu xmm2,XMMWORD[96+rsp] + vpclmulqdq xmm6,xmm1,xmm0,0x00 + vmovdqu xmm3,XMMWORD[((96-32))+rsi] + vpxor xmm6,xmm6,xmm7 + vpunpckhqdq xmm7,xmm2,xmm2 + vpclmulqdq xmm1,xmm1,xmm0,0x11 + vpxor xmm7,xmm7,xmm2 + vpxor xmm1,xmm1,xmm9 + vpclmulqdq xmm4,xmm4,xmm15,0x10 + vmovdqu xmm15,XMMWORD[((128-32))+rsi] + vpxor xmm4,xmm4,xmm5 + + vpxor xmm8,xmm8,XMMWORD[112+rsp] + vpclmulqdq xmm5,xmm2,xmm3,0x00 + vmovdqu xmm0,XMMWORD[((112-32))+rsi] + vpunpckhqdq xmm9,xmm8,xmm8 + vpxor xmm5,xmm5,xmm6 + vpclmulqdq xmm2,xmm2,xmm3,0x11 + vpxor xmm9,xmm9,xmm8 + vpxor xmm2,xmm2,xmm1 + vpclmulqdq xmm7,xmm7,xmm15,0x00 + vpxor xmm4,xmm7,xmm4 + + vpclmulqdq xmm6,xmm8,xmm0,0x00 + vmovdqu xmm3,XMMWORD[((0-32))+rsi] + vpunpckhqdq xmm1,xmm14,xmm14 + vpclmulqdq xmm8,xmm8,xmm0,0x11 + vpxor xmm1,xmm1,xmm14 + vpxor xmm5,xmm6,xmm5 + vpclmulqdq xmm9,xmm9,xmm15,0x10 + vmovdqu 
xmm15,XMMWORD[((32-32))+rsi] + vpxor xmm7,xmm8,xmm2 + vpxor xmm6,xmm9,xmm4 + + vmovdqu xmm0,XMMWORD[((16-32))+rsi] + vpxor xmm9,xmm7,xmm5 + vpclmulqdq xmm4,xmm14,xmm3,0x00 + vpxor xmm6,xmm6,xmm9 + vpunpckhqdq xmm2,xmm13,xmm13 + vpclmulqdq xmm14,xmm14,xmm3,0x11 + vpxor xmm2,xmm2,xmm13 + vpslldq xmm9,xmm6,8 + vpclmulqdq xmm1,xmm1,xmm15,0x00 + vpxor xmm8,xmm5,xmm9 + vpsrldq xmm6,xmm6,8 + vpxor xmm7,xmm7,xmm6 + + vpclmulqdq xmm5,xmm13,xmm0,0x00 + vmovdqu xmm3,XMMWORD[((48-32))+rsi] + vpxor xmm5,xmm5,xmm4 + vpunpckhqdq xmm9,xmm12,xmm12 + vpclmulqdq xmm13,xmm13,xmm0,0x11 + vpxor xmm9,xmm9,xmm12 + vpxor xmm13,xmm13,xmm14 + vpalignr xmm14,xmm8,xmm8,8 + vpclmulqdq xmm2,xmm2,xmm15,0x10 + vmovdqu xmm15,XMMWORD[((80-32))+rsi] + vpxor xmm2,xmm2,xmm1 + + vpclmulqdq xmm4,xmm12,xmm3,0x00 + vmovdqu xmm0,XMMWORD[((64-32))+rsi] + vpxor xmm4,xmm4,xmm5 + vpunpckhqdq xmm1,xmm11,xmm11 + vpclmulqdq xmm12,xmm12,xmm3,0x11 + vpxor xmm1,xmm1,xmm11 + vpxor xmm12,xmm12,xmm13 + vxorps xmm7,xmm7,XMMWORD[16+rsp] + vpclmulqdq xmm9,xmm9,xmm15,0x00 + vpxor xmm9,xmm9,xmm2 + + vpclmulqdq xmm8,xmm8,XMMWORD[16+r11],0x10 + vxorps xmm8,xmm8,xmm14 + + vpclmulqdq xmm5,xmm11,xmm0,0x00 + vmovdqu xmm3,XMMWORD[((96-32))+rsi] + vpxor xmm5,xmm5,xmm4 + vpunpckhqdq xmm2,xmm10,xmm10 + vpclmulqdq xmm11,xmm11,xmm0,0x11 + vpxor xmm2,xmm2,xmm10 + vpalignr xmm14,xmm8,xmm8,8 + vpxor xmm11,xmm11,xmm12 + vpclmulqdq xmm1,xmm1,xmm15,0x10 + vmovdqu xmm15,XMMWORD[((128-32))+rsi] + vpxor xmm1,xmm1,xmm9 + + vxorps xmm14,xmm14,xmm7 + vpclmulqdq xmm8,xmm8,XMMWORD[16+r11],0x10 + vxorps xmm8,xmm8,xmm14 + + vpclmulqdq xmm4,xmm10,xmm3,0x00 + vmovdqu xmm0,XMMWORD[((112-32))+rsi] + vpxor xmm4,xmm4,xmm5 + vpunpckhqdq xmm9,xmm8,xmm8 + vpclmulqdq xmm10,xmm10,xmm3,0x11 + vpxor xmm9,xmm9,xmm8 + vpxor xmm10,xmm10,xmm11 + vpclmulqdq xmm2,xmm2,xmm15,0x00 + vpxor xmm2,xmm2,xmm1 + + vpclmulqdq xmm5,xmm8,xmm0,0x00 + vpclmulqdq xmm7,xmm8,xmm0,0x11 + vpxor xmm5,xmm5,xmm4 + vpclmulqdq xmm6,xmm9,xmm15,0x10 + vpxor xmm7,xmm7,xmm10 + vpxor xmm6,xmm6,xmm2 
+ + vpxor xmm4,xmm7,xmm5 + vpxor xmm6,xmm6,xmm4 + vpslldq xmm1,xmm6,8 + vmovdqu xmm3,XMMWORD[16+r11] + vpsrldq xmm6,xmm6,8 + vpxor xmm8,xmm5,xmm1 + vpxor xmm7,xmm7,xmm6 + + vpalignr xmm2,xmm8,xmm8,8 + vpclmulqdq xmm8,xmm8,xmm3,0x10 + vpxor xmm8,xmm8,xmm2 + + vpalignr xmm2,xmm8,xmm8,8 + vpclmulqdq xmm8,xmm8,xmm3,0x10 + vpxor xmm2,xmm2,xmm7 + vpxor xmm8,xmm8,xmm2 + mov r12,QWORD[64+rbp] + vpshufb xmm8,xmm8,XMMWORD[r11] + vmovdqu XMMWORD[r12],xmm8 + + vzeroupper + movaps xmm6,XMMWORD[((-208))+rbp] + movaps xmm7,XMMWORD[((-192))+rbp] + movaps xmm8,XMMWORD[((-176))+rbp] + movaps xmm9,XMMWORD[((-160))+rbp] + movaps xmm10,XMMWORD[((-144))+rbp] + movaps xmm11,XMMWORD[((-128))+rbp] + movaps xmm12,XMMWORD[((-112))+rbp] + movaps xmm13,XMMWORD[((-96))+rbp] + movaps xmm14,XMMWORD[((-80))+rbp] + movaps xmm15,XMMWORD[((-64))+rbp] + mov rdi,QWORD[16+rbp] + mov rsi,QWORD[24+rbp] + lea rsp,[((-40))+rbp] + + pop r15 + + pop r14 + + pop r13 + + pop r12 + + pop rbx + + pop rbp + +$L$gcm_enc_abort: + ret +$L$SEH_end_aesni_gcm_encrypt_23: + + +section .rdata rdata align=8 +ALIGN 64 +$L$bswap_mask: + DB 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +$L$poly: + DB 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +$L$one_msb: + DB 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +$L$two_lsb: + DB 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 +$L$one_lsb: + DB 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 + DB 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108 + DB 101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82 + DB 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112 + DB 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +ALIGN 64 +section .text + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_aesni_gcm_decrypt_1 wrt ..imagebase + DD $L$SEH_end_aesni_gcm_decrypt_23 wrt ..imagebase + DD $L$SEH_info_aesni_gcm_decrypt_0 wrt ..imagebase + + DD $L$SEH_begin_aesni_gcm_encrypt_1 wrt ..imagebase + DD $L$SEH_end_aesni_gcm_encrypt_23 wrt ..imagebase + DD $L$SEH_info_aesni_gcm_encrypt_0 wrt ..imagebase + + +section .xdata rdata align=8 +ALIGN 
4 +$L$SEH_info_aesni_gcm_decrypt_0: + DB 1 + DB $L$SEH_endprologue_aesni_gcm_decrypt_22-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 33 + DB 213 + DB $L$SEH_prologue_aesni_gcm_decrypt_21-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 248 + DW 9 + DB $L$SEH_prologue_aesni_gcm_decrypt_20-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 232 + DW 8 + DB $L$SEH_prologue_aesni_gcm_decrypt_19-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 216 + DW 7 + DB $L$SEH_prologue_aesni_gcm_decrypt_18-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 200 + DW 6 + DB $L$SEH_prologue_aesni_gcm_decrypt_17-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 184 + DW 5 + DB $L$SEH_prologue_aesni_gcm_decrypt_16-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 168 + DW 4 + DB $L$SEH_prologue_aesni_gcm_decrypt_15-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 152 + DW 3 + DB $L$SEH_prologue_aesni_gcm_decrypt_14-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 136 + DW 2 + DB $L$SEH_prologue_aesni_gcm_decrypt_13-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 120 + DW 1 + DB $L$SEH_prologue_aesni_gcm_decrypt_12-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 104 + DW 0 + DB $L$SEH_prologue_aesni_gcm_decrypt_11-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 100 + DW 29 + DB $L$SEH_prologue_aesni_gcm_decrypt_10-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 116 + DW 28 + DB $L$SEH_prologue_aesni_gcm_decrypt_9-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 3 + DB $L$SEH_prologue_aesni_gcm_decrypt_8-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 1 + DW 21 + DB $L$SEH_prologue_aesni_gcm_decrypt_7-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 240 + DB $L$SEH_prologue_aesni_gcm_decrypt_6-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 224 + DB $L$SEH_prologue_aesni_gcm_decrypt_5-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 208 + DB $L$SEH_prologue_aesni_gcm_decrypt_4-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 192 + DB $L$SEH_prologue_aesni_gcm_decrypt_3-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 48 + DB $L$SEH_prologue_aesni_gcm_decrypt_2-$L$SEH_begin_aesni_gcm_decrypt_1 + DB 80 + + DW 0 +$L$SEH_info_aesni_gcm_encrypt_0: + DB 1 + DB 
$L$SEH_endprologue_aesni_gcm_encrypt_22-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 33 + DB 213 + DB $L$SEH_prologue_aesni_gcm_encrypt_21-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 248 + DW 9 + DB $L$SEH_prologue_aesni_gcm_encrypt_20-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 232 + DW 8 + DB $L$SEH_prologue_aesni_gcm_encrypt_19-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 216 + DW 7 + DB $L$SEH_prologue_aesni_gcm_encrypt_18-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 200 + DW 6 + DB $L$SEH_prologue_aesni_gcm_encrypt_17-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 184 + DW 5 + DB $L$SEH_prologue_aesni_gcm_encrypt_16-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 168 + DW 4 + DB $L$SEH_prologue_aesni_gcm_encrypt_15-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 152 + DW 3 + DB $L$SEH_prologue_aesni_gcm_encrypt_14-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 136 + DW 2 + DB $L$SEH_prologue_aesni_gcm_encrypt_13-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 120 + DW 1 + DB $L$SEH_prologue_aesni_gcm_encrypt_12-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 104 + DW 0 + DB $L$SEH_prologue_aesni_gcm_encrypt_11-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 100 + DW 29 + DB $L$SEH_prologue_aesni_gcm_encrypt_10-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 116 + DW 28 + DB $L$SEH_prologue_aesni_gcm_encrypt_9-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 3 + DB $L$SEH_prologue_aesni_gcm_encrypt_8-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 1 + DW 21 + DB $L$SEH_prologue_aesni_gcm_encrypt_7-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 240 + DB $L$SEH_prologue_aesni_gcm_encrypt_6-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 224 + DB $L$SEH_prologue_aesni_gcm_encrypt_5-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 208 + DB $L$SEH_prologue_aesni_gcm_encrypt_4-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 192 + DB $L$SEH_prologue_aesni_gcm_encrypt_3-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 48 + DB $L$SEH_prologue_aesni_gcm_encrypt_2-$L$SEH_begin_aesni_gcm_encrypt_1 + DB 80 + + DW 0 +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git 
a/third_party/boringssl/gen/bcm/aesni-x86-apple.S b/third_party/boringssl/gen/bcm/aesni-x86-apple.S new file mode 100644 index 00000000..7454d23d --- /dev/null +++ b/third_party/boringssl/gen/bcm/aesni-x86-apple.S @@ -0,0 +1,2490 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +.text +#ifdef BORINGSSL_DISPATCH_TEST +#endif +.globl _aes_hw_encrypt +.private_extern _aes_hw_encrypt +.align 4 +_aes_hw_encrypt: +L_aes_hw_encrypt_begin: +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call L000pic_for_function_hit +L000pic_for_function_hit: + popl %ebx + leal _BORINGSSL_function_hit+1-L000pic_for_function_hit(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif + movl 4(%esp),%eax + movl 12(%esp),%edx + movups (%eax),%xmm2 + movl 240(%edx),%ecx + movl 8(%esp),%eax + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L001enc1_loop_1: + aesenc %xmm1,%xmm2 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L001enc1_loop_1 + aesenclast %xmm1,%xmm2 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + movups %xmm2,(%eax) + pxor %xmm2,%xmm2 + ret +.globl _aes_hw_decrypt +.private_extern _aes_hw_decrypt +.align 4 +_aes_hw_decrypt: +L_aes_hw_decrypt_begin: + movl 4(%esp),%eax + movl 12(%esp),%edx + movups (%eax),%xmm2 + movl 240(%edx),%ecx + movl 8(%esp),%eax + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L002dec1_loop_2: + aesdec %xmm1,%xmm2 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L002dec1_loop_2 + aesdeclast %xmm1,%xmm2 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + movups %xmm2,(%eax) + pxor %xmm2,%xmm2 + ret +.private_extern __aesni_encrypt2 +.align 4 +__aesni_encrypt2: + movups (%edx),%xmm0 + shll $4,%ecx + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + movups 32(%edx),%xmm0 + leal 
32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +L003enc2_loop: + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx + aesenc %xmm0,%xmm2 + aesenc %xmm0,%xmm3 + movups -16(%edx,%ecx,1),%xmm0 + jnz L003enc2_loop + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + aesenclast %xmm0,%xmm2 + aesenclast %xmm0,%xmm3 + ret +.private_extern __aesni_decrypt2 +.align 4 +__aesni_decrypt2: + movups (%edx),%xmm0 + shll $4,%ecx + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +L004dec2_loop: + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx + aesdec %xmm0,%xmm2 + aesdec %xmm0,%xmm3 + movups -16(%edx,%ecx,1),%xmm0 + jnz L004dec2_loop + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + aesdeclast %xmm0,%xmm2 + aesdeclast %xmm0,%xmm3 + ret +.private_extern __aesni_encrypt3 +.align 4 +__aesni_encrypt3: + movups (%edx),%xmm0 + shll $4,%ecx + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +L005enc3_loop: + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + aesenc %xmm1,%xmm4 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx + aesenc %xmm0,%xmm2 + aesenc %xmm0,%xmm3 + aesenc %xmm0,%xmm4 + movups -16(%edx,%ecx,1),%xmm0 + jnz L005enc3_loop + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + aesenc %xmm1,%xmm4 + aesenclast %xmm0,%xmm2 + aesenclast %xmm0,%xmm3 + aesenclast %xmm0,%xmm4 + ret +.private_extern __aesni_decrypt3 +.align 4 +__aesni_decrypt3: + movups (%edx),%xmm0 + shll $4,%ecx + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +L006dec3_loop: + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + aesdec %xmm1,%xmm4 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx + aesdec %xmm0,%xmm2 + aesdec %xmm0,%xmm3 + aesdec %xmm0,%xmm4 + movups 
-16(%edx,%ecx,1),%xmm0 + jnz L006dec3_loop + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + aesdec %xmm1,%xmm4 + aesdeclast %xmm0,%xmm2 + aesdeclast %xmm0,%xmm3 + aesdeclast %xmm0,%xmm4 + ret +.private_extern __aesni_encrypt4 +.align 4 +__aesni_encrypt4: + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + shll $4,%ecx + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 + pxor %xmm0,%xmm5 + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx +.byte 15,31,64,0 + addl $16,%ecx +L007enc4_loop: + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + aesenc %xmm1,%xmm4 + aesenc %xmm1,%xmm5 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx + aesenc %xmm0,%xmm2 + aesenc %xmm0,%xmm3 + aesenc %xmm0,%xmm4 + aesenc %xmm0,%xmm5 + movups -16(%edx,%ecx,1),%xmm0 + jnz L007enc4_loop + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + aesenc %xmm1,%xmm4 + aesenc %xmm1,%xmm5 + aesenclast %xmm0,%xmm2 + aesenclast %xmm0,%xmm3 + aesenclast %xmm0,%xmm4 + aesenclast %xmm0,%xmm5 + ret +.private_extern __aesni_decrypt4 +.align 4 +__aesni_decrypt4: + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + shll $4,%ecx + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 + pxor %xmm0,%xmm5 + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx +.byte 15,31,64,0 + addl $16,%ecx +L008dec4_loop: + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + aesdec %xmm1,%xmm4 + aesdec %xmm1,%xmm5 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx + aesdec %xmm0,%xmm2 + aesdec %xmm0,%xmm3 + aesdec %xmm0,%xmm4 + aesdec %xmm0,%xmm5 + movups -16(%edx,%ecx,1),%xmm0 + jnz L008dec4_loop + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + aesdec %xmm1,%xmm4 + aesdec %xmm1,%xmm5 + aesdeclast %xmm0,%xmm2 + aesdeclast %xmm0,%xmm3 + aesdeclast %xmm0,%xmm4 + aesdeclast %xmm0,%xmm5 + ret +.private_extern __aesni_encrypt6 +.align 4 +__aesni_encrypt6: + movups (%edx),%xmm0 + shll $4,%ecx + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 + aesenc %xmm1,%xmm2 + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 + aesenc %xmm1,%xmm3 + leal 
32(%edx,%ecx,1),%edx + negl %ecx + aesenc %xmm1,%xmm4 + pxor %xmm0,%xmm7 + movups (%edx,%ecx,1),%xmm0 + addl $16,%ecx + jmp L009_aesni_encrypt6_inner +.align 4,0x90 +L010enc6_loop: + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + aesenc %xmm1,%xmm4 +L009_aesni_encrypt6_inner: + aesenc %xmm1,%xmm5 + aesenc %xmm1,%xmm6 + aesenc %xmm1,%xmm7 +L_aesni_encrypt6_enter: + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx + aesenc %xmm0,%xmm2 + aesenc %xmm0,%xmm3 + aesenc %xmm0,%xmm4 + aesenc %xmm0,%xmm5 + aesenc %xmm0,%xmm6 + aesenc %xmm0,%xmm7 + movups -16(%edx,%ecx,1),%xmm0 + jnz L010enc6_loop + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + aesenc %xmm1,%xmm4 + aesenc %xmm1,%xmm5 + aesenc %xmm1,%xmm6 + aesenc %xmm1,%xmm7 + aesenclast %xmm0,%xmm2 + aesenclast %xmm0,%xmm3 + aesenclast %xmm0,%xmm4 + aesenclast %xmm0,%xmm5 + aesenclast %xmm0,%xmm6 + aesenclast %xmm0,%xmm7 + ret +.private_extern __aesni_decrypt6 +.align 4 +__aesni_decrypt6: + movups (%edx),%xmm0 + shll $4,%ecx + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 + aesdec %xmm1,%xmm2 + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 + aesdec %xmm1,%xmm3 + leal 32(%edx,%ecx,1),%edx + negl %ecx + aesdec %xmm1,%xmm4 + pxor %xmm0,%xmm7 + movups (%edx,%ecx,1),%xmm0 + addl $16,%ecx + jmp L011_aesni_decrypt6_inner +.align 4,0x90 +L012dec6_loop: + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + aesdec %xmm1,%xmm4 +L011_aesni_decrypt6_inner: + aesdec %xmm1,%xmm5 + aesdec %xmm1,%xmm6 + aesdec %xmm1,%xmm7 +L_aesni_decrypt6_enter: + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx + aesdec %xmm0,%xmm2 + aesdec %xmm0,%xmm3 + aesdec %xmm0,%xmm4 + aesdec %xmm0,%xmm5 + aesdec %xmm0,%xmm6 + aesdec %xmm0,%xmm7 + movups -16(%edx,%ecx,1),%xmm0 + jnz L012dec6_loop + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + aesdec %xmm1,%xmm4 + aesdec %xmm1,%xmm5 + aesdec %xmm1,%xmm6 + aesdec %xmm1,%xmm7 + aesdeclast %xmm0,%xmm2 + aesdeclast %xmm0,%xmm3 + aesdeclast %xmm0,%xmm4 + aesdeclast %xmm0,%xmm5 + aesdeclast %xmm0,%xmm6 + aesdeclast %xmm0,%xmm7 + ret 
+.globl _aes_hw_ecb_encrypt +.private_extern _aes_hw_ecb_encrypt +.align 4 +_aes_hw_ecb_encrypt: +L_aes_hw_ecb_encrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl 32(%esp),%edx + movl 36(%esp),%ebx + andl $-16,%eax + jz L013ecb_ret + movl 240(%edx),%ecx + testl %ebx,%ebx + jz L014ecb_decrypt + movl %edx,%ebp + movl %ecx,%ebx + cmpl $96,%eax + jb L015ecb_enc_tail + movdqu (%esi),%xmm2 + movdqu 16(%esi),%xmm3 + movdqu 32(%esi),%xmm4 + movdqu 48(%esi),%xmm5 + movdqu 64(%esi),%xmm6 + movdqu 80(%esi),%xmm7 + leal 96(%esi),%esi + subl $96,%eax + jmp L016ecb_enc_loop6_enter +.align 4,0x90 +L017ecb_enc_loop6: + movups %xmm2,(%edi) + movdqu (%esi),%xmm2 + movups %xmm3,16(%edi) + movdqu 16(%esi),%xmm3 + movups %xmm4,32(%edi) + movdqu 32(%esi),%xmm4 + movups %xmm5,48(%edi) + movdqu 48(%esi),%xmm5 + movups %xmm6,64(%edi) + movdqu 64(%esi),%xmm6 + movups %xmm7,80(%edi) + leal 96(%edi),%edi + movdqu 80(%esi),%xmm7 + leal 96(%esi),%esi +L016ecb_enc_loop6_enter: + call __aesni_encrypt6 + movl %ebp,%edx + movl %ebx,%ecx + subl $96,%eax + jnc L017ecb_enc_loop6 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) + movups %xmm6,64(%edi) + movups %xmm7,80(%edi) + leal 96(%edi),%edi + addl $96,%eax + jz L013ecb_ret +L015ecb_enc_tail: + movups (%esi),%xmm2 + cmpl $32,%eax + jb L018ecb_enc_one + movups 16(%esi),%xmm3 + je L019ecb_enc_two + movups 32(%esi),%xmm4 + cmpl $64,%eax + jb L020ecb_enc_three + movups 48(%esi),%xmm5 + je L021ecb_enc_four + movups 64(%esi),%xmm6 + xorps %xmm7,%xmm7 + call __aesni_encrypt6 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) + movups %xmm6,64(%edi) + jmp L013ecb_ret +.align 4,0x90 +L018ecb_enc_one: + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L022enc1_loop_3: + aesenc %xmm1,%xmm2 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz 
L022enc1_loop_3 + aesenclast %xmm1,%xmm2 + movups %xmm2,(%edi) + jmp L013ecb_ret +.align 4,0x90 +L019ecb_enc_two: + call __aesni_encrypt2 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + jmp L013ecb_ret +.align 4,0x90 +L020ecb_enc_three: + call __aesni_encrypt3 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + jmp L013ecb_ret +.align 4,0x90 +L021ecb_enc_four: + call __aesni_encrypt4 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) + jmp L013ecb_ret +.align 4,0x90 +L014ecb_decrypt: + movl %edx,%ebp + movl %ecx,%ebx + cmpl $96,%eax + jb L023ecb_dec_tail + movdqu (%esi),%xmm2 + movdqu 16(%esi),%xmm3 + movdqu 32(%esi),%xmm4 + movdqu 48(%esi),%xmm5 + movdqu 64(%esi),%xmm6 + movdqu 80(%esi),%xmm7 + leal 96(%esi),%esi + subl $96,%eax + jmp L024ecb_dec_loop6_enter +.align 4,0x90 +L025ecb_dec_loop6: + movups %xmm2,(%edi) + movdqu (%esi),%xmm2 + movups %xmm3,16(%edi) + movdqu 16(%esi),%xmm3 + movups %xmm4,32(%edi) + movdqu 32(%esi),%xmm4 + movups %xmm5,48(%edi) + movdqu 48(%esi),%xmm5 + movups %xmm6,64(%edi) + movdqu 64(%esi),%xmm6 + movups %xmm7,80(%edi) + leal 96(%edi),%edi + movdqu 80(%esi),%xmm7 + leal 96(%esi),%esi +L024ecb_dec_loop6_enter: + call __aesni_decrypt6 + movl %ebp,%edx + movl %ebx,%ecx + subl $96,%eax + jnc L025ecb_dec_loop6 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) + movups %xmm6,64(%edi) + movups %xmm7,80(%edi) + leal 96(%edi),%edi + addl $96,%eax + jz L013ecb_ret +L023ecb_dec_tail: + movups (%esi),%xmm2 + cmpl $32,%eax + jb L026ecb_dec_one + movups 16(%esi),%xmm3 + je L027ecb_dec_two + movups 32(%esi),%xmm4 + cmpl $64,%eax + jb L028ecb_dec_three + movups 48(%esi),%xmm5 + je L029ecb_dec_four + movups 64(%esi),%xmm6 + xorps %xmm7,%xmm7 + call __aesni_decrypt6 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) + movups %xmm6,64(%edi) + jmp L013ecb_ret +.align 4,0x90 +L026ecb_dec_one: + movups 
(%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L030dec1_loop_4: + aesdec %xmm1,%xmm2 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L030dec1_loop_4 + aesdeclast %xmm1,%xmm2 + movups %xmm2,(%edi) + jmp L013ecb_ret +.align 4,0x90 +L027ecb_dec_two: + call __aesni_decrypt2 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + jmp L013ecb_ret +.align 4,0x90 +L028ecb_dec_three: + call __aesni_decrypt3 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + jmp L013ecb_ret +.align 4,0x90 +L029ecb_dec_four: + call __aesni_decrypt4 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) +L013ecb_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _aes_hw_ccm64_encrypt_blocks +.private_extern _aes_hw_ccm64_encrypt_blocks +.align 4 +_aes_hw_ccm64_encrypt_blocks: +L_aes_hw_ccm64_encrypt_blocks_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl 32(%esp),%edx + movl 36(%esp),%ebx + movl 40(%esp),%ecx + movl %esp,%ebp + subl $60,%esp + andl $-16,%esp + movl %ebp,48(%esp) + movdqu (%ebx),%xmm7 + movdqu (%ecx),%xmm3 + movl 240(%edx),%ecx + movl $202182159,(%esp) + movl $134810123,4(%esp) + movl $67438087,8(%esp) + movl $66051,12(%esp) + movl $1,%ebx + xorl %ebp,%ebp + movl %ebx,16(%esp) + movl %ebp,20(%esp) + movl %ebp,24(%esp) + movl %ebp,28(%esp) + shll $4,%ecx + movl $16,%ebx + leal (%edx),%ebp + movdqa (%esp),%xmm5 + movdqa %xmm7,%xmm2 + leal 32(%edx,%ecx,1),%edx + subl %ecx,%ebx + pshufb %xmm5,%xmm7 +L031ccm64_enc_outer: + movups (%ebp),%xmm0 + movl %ebx,%ecx + movups (%esi),%xmm6 + xorps %xmm0,%xmm2 + movups 16(%ebp),%xmm1 + xorps %xmm6,%xmm0 + xorps %xmm0,%xmm3 + movups 32(%ebp),%xmm0 +L032ccm64_enc2_loop: + aesenc %xmm1,%xmm2 + aesenc 
%xmm1,%xmm3 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx + aesenc %xmm0,%xmm2 + aesenc %xmm0,%xmm3 + movups -16(%edx,%ecx,1),%xmm0 + jnz L032ccm64_enc2_loop + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + paddq 16(%esp),%xmm7 + decl %eax + aesenclast %xmm0,%xmm2 + aesenclast %xmm0,%xmm3 + leal 16(%esi),%esi + xorps %xmm2,%xmm6 + movdqa %xmm7,%xmm2 + movups %xmm6,(%edi) + pshufb %xmm5,%xmm2 + leal 16(%edi),%edi + jnz L031ccm64_enc_outer + movl 48(%esp),%esp + movl 40(%esp),%edi + movups %xmm3,(%edi) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _aes_hw_ccm64_decrypt_blocks +.private_extern _aes_hw_ccm64_decrypt_blocks +.align 4 +_aes_hw_ccm64_decrypt_blocks: +L_aes_hw_ccm64_decrypt_blocks_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl 32(%esp),%edx + movl 36(%esp),%ebx + movl 40(%esp),%ecx + movl %esp,%ebp + subl $60,%esp + andl $-16,%esp + movl %ebp,48(%esp) + movdqu (%ebx),%xmm7 + movdqu (%ecx),%xmm3 + movl 240(%edx),%ecx + movl $202182159,(%esp) + movl $134810123,4(%esp) + movl $67438087,8(%esp) + movl $66051,12(%esp) + movl $1,%ebx + xorl %ebp,%ebp + movl %ebx,16(%esp) + movl %ebp,20(%esp) + movl %ebp,24(%esp) + movl %ebp,28(%esp) + movdqa (%esp),%xmm5 + movdqa %xmm7,%xmm2 + movl %edx,%ebp + movl %ecx,%ebx + pshufb %xmm5,%xmm7 + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L033enc1_loop_5: + aesenc %xmm1,%xmm2 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L033enc1_loop_5 + aesenclast %xmm1,%xmm2 + shll $4,%ebx + movl $16,%ecx + movups (%esi),%xmm6 + paddq 16(%esp),%xmm7 + leal 16(%esi),%esi + subl %ebx,%ecx + leal 32(%ebp,%ebx,1),%edx + movl %ecx,%ebx + jmp L034ccm64_dec_outer +.align 4,0x90 +L034ccm64_dec_outer: + xorps %xmm2,%xmm6 + movdqa %xmm7,%xmm2 + movups 
%xmm6,(%edi) + leal 16(%edi),%edi + pshufb %xmm5,%xmm2 + subl $1,%eax + jz L035ccm64_dec_break + movups (%ebp),%xmm0 + movl %ebx,%ecx + movups 16(%ebp),%xmm1 + xorps %xmm0,%xmm6 + xorps %xmm0,%xmm2 + xorps %xmm6,%xmm3 + movups 32(%ebp),%xmm0 +L036ccm64_dec2_loop: + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx + aesenc %xmm0,%xmm2 + aesenc %xmm0,%xmm3 + movups -16(%edx,%ecx,1),%xmm0 + jnz L036ccm64_dec2_loop + movups (%esi),%xmm6 + paddq 16(%esp),%xmm7 + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + aesenclast %xmm0,%xmm2 + aesenclast %xmm0,%xmm3 + leal 16(%esi),%esi + jmp L034ccm64_dec_outer +.align 4,0x90 +L035ccm64_dec_break: + movl 240(%ebp),%ecx + movl %ebp,%edx + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm6 + leal 32(%edx),%edx + xorps %xmm6,%xmm3 +L037enc1_loop_6: + aesenc %xmm1,%xmm3 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L037enc1_loop_6 + aesenclast %xmm1,%xmm3 + movl 48(%esp),%esp + movl 40(%esp),%edi + movups %xmm3,(%edi) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _aes_hw_ctr32_encrypt_blocks +.private_extern _aes_hw_ctr32_encrypt_blocks +.align 4 +_aes_hw_ctr32_encrypt_blocks: +L_aes_hw_ctr32_encrypt_blocks_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call L038pic_for_function_hit +L038pic_for_function_hit: + popl %ebx + leal _BORINGSSL_function_hit+0-L038pic_for_function_hit(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl 32(%esp),%edx + movl 36(%esp),%ebx + movl %esp,%ebp + subl $88,%esp + andl $-16,%esp + movl %ebp,80(%esp) + cmpl $1,%eax + je L039ctr32_one_shortcut + movdqu (%ebx),%xmm7 + movl $202182159,(%esp) + movl $134810123,4(%esp) + 
movl $67438087,8(%esp) + movl $66051,12(%esp) + movl $6,%ecx + xorl %ebp,%ebp + movl %ecx,16(%esp) + movl %ecx,20(%esp) + movl %ecx,24(%esp) + movl %ebp,28(%esp) + pextrd $3,%xmm7,%ebx + pinsrd $3,%ebp,%xmm7 + movl 240(%edx),%ecx + bswap %ebx + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + movdqa (%esp),%xmm2 + pinsrd $0,%ebx,%xmm0 + leal 3(%ebx),%ebp + pinsrd $0,%ebp,%xmm1 + incl %ebx + pinsrd $1,%ebx,%xmm0 + incl %ebp + pinsrd $1,%ebp,%xmm1 + incl %ebx + pinsrd $2,%ebx,%xmm0 + incl %ebp + pinsrd $2,%ebp,%xmm1 + movdqa %xmm0,48(%esp) + pshufb %xmm2,%xmm0 + movdqu (%edx),%xmm6 + movdqa %xmm1,64(%esp) + pshufb %xmm2,%xmm1 + pshufd $192,%xmm0,%xmm2 + pshufd $128,%xmm0,%xmm3 + cmpl $6,%eax + jb L040ctr32_tail + pxor %xmm6,%xmm7 + shll $4,%ecx + movl $16,%ebx + movdqa %xmm7,32(%esp) + movl %edx,%ebp + subl %ecx,%ebx + leal 32(%edx,%ecx,1),%edx + subl $6,%eax + jmp L041ctr32_loop6 +.align 4,0x90 +L041ctr32_loop6: + pshufd $64,%xmm0,%xmm4 + movdqa 32(%esp),%xmm0 + pshufd $192,%xmm1,%xmm5 + pxor %xmm0,%xmm2 + pshufd $128,%xmm1,%xmm6 + pxor %xmm0,%xmm3 + pshufd $64,%xmm1,%xmm7 + movups 16(%ebp),%xmm1 + pxor %xmm0,%xmm4 + pxor %xmm0,%xmm5 + aesenc %xmm1,%xmm2 + pxor %xmm0,%xmm6 + pxor %xmm0,%xmm7 + aesenc %xmm1,%xmm3 + movups 32(%ebp),%xmm0 + movl %ebx,%ecx + aesenc %xmm1,%xmm4 + aesenc %xmm1,%xmm5 + aesenc %xmm1,%xmm6 + aesenc %xmm1,%xmm7 + call L_aesni_encrypt6_enter + movups (%esi),%xmm1 + movups 16(%esi),%xmm0 + xorps %xmm1,%xmm2 + movups 32(%esi),%xmm1 + xorps %xmm0,%xmm3 + movups %xmm2,(%edi) + movdqa 16(%esp),%xmm0 + xorps %xmm1,%xmm4 + movdqa 64(%esp),%xmm1 + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + paddd %xmm0,%xmm1 + paddd 48(%esp),%xmm0 + movdqa (%esp),%xmm2 + movups 48(%esi),%xmm3 + movups 64(%esi),%xmm4 + xorps %xmm3,%xmm5 + movups 80(%esi),%xmm3 + leal 96(%esi),%esi + movdqa %xmm0,48(%esp) + pshufb %xmm2,%xmm0 + xorps %xmm4,%xmm6 + movups %xmm5,48(%edi) + xorps %xmm3,%xmm7 + movdqa %xmm1,64(%esp) + pshufb %xmm2,%xmm1 + movups %xmm6,64(%edi) + pshufd 
$192,%xmm0,%xmm2 + movups %xmm7,80(%edi) + leal 96(%edi),%edi + pshufd $128,%xmm0,%xmm3 + subl $6,%eax + jnc L041ctr32_loop6 + addl $6,%eax + jz L042ctr32_ret + movdqu (%ebp),%xmm7 + movl %ebp,%edx + pxor 32(%esp),%xmm7 + movl 240(%ebp),%ecx +L040ctr32_tail: + por %xmm7,%xmm2 + cmpl $2,%eax + jb L043ctr32_one + pshufd $64,%xmm0,%xmm4 + por %xmm7,%xmm3 + je L044ctr32_two + pshufd $192,%xmm1,%xmm5 + por %xmm7,%xmm4 + cmpl $4,%eax + jb L045ctr32_three + pshufd $128,%xmm1,%xmm6 + por %xmm7,%xmm5 + je L046ctr32_four + por %xmm7,%xmm6 + call __aesni_encrypt6 + movups (%esi),%xmm1 + movups 16(%esi),%xmm0 + xorps %xmm1,%xmm2 + movups 32(%esi),%xmm1 + xorps %xmm0,%xmm3 + movups 48(%esi),%xmm0 + xorps %xmm1,%xmm4 + movups 64(%esi),%xmm1 + xorps %xmm0,%xmm5 + movups %xmm2,(%edi) + xorps %xmm1,%xmm6 + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) + movups %xmm6,64(%edi) + jmp L042ctr32_ret +.align 4,0x90 +L039ctr32_one_shortcut: + movups (%ebx),%xmm2 + movl 240(%edx),%ecx +L043ctr32_one: + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L047enc1_loop_7: + aesenc %xmm1,%xmm2 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L047enc1_loop_7 + aesenclast %xmm1,%xmm2 + movups (%esi),%xmm6 + xorps %xmm2,%xmm6 + movups %xmm6,(%edi) + jmp L042ctr32_ret +.align 4,0x90 +L044ctr32_two: + call __aesni_encrypt2 + movups (%esi),%xmm5 + movups 16(%esi),%xmm6 + xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + jmp L042ctr32_ret +.align 4,0x90 +L045ctr32_three: + call __aesni_encrypt3 + movups (%esi),%xmm5 + movups 16(%esi),%xmm6 + xorps %xmm5,%xmm2 + movups 32(%esi),%xmm7 + xorps %xmm6,%xmm3 + movups %xmm2,(%edi) + xorps %xmm7,%xmm4 + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + jmp L042ctr32_ret +.align 4,0x90 +L046ctr32_four: + call __aesni_encrypt4 + movups (%esi),%xmm6 + movups 16(%esi),%xmm7 + movups 32(%esi),%xmm1 + xorps %xmm6,%xmm2 + movups 48(%esi),%xmm0 + xorps 
%xmm7,%xmm3 + movups %xmm2,(%edi) + xorps %xmm1,%xmm4 + movups %xmm3,16(%edi) + xorps %xmm0,%xmm5 + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) +L042ctr32_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + movdqa %xmm0,32(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm0,48(%esp) + pxor %xmm6,%xmm6 + movdqa %xmm0,64(%esp) + pxor %xmm7,%xmm7 + movl 80(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _aes_hw_xts_encrypt +.private_extern _aes_hw_xts_encrypt +.align 4 +_aes_hw_xts_encrypt: +L_aes_hw_xts_encrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 36(%esp),%edx + movl 40(%esp),%esi + movl 240(%edx),%ecx + movups (%esi),%xmm2 + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L048enc1_loop_8: + aesenc %xmm1,%xmm2 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L048enc1_loop_8 + aesenclast %xmm1,%xmm2 + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl 32(%esp),%edx + movl %esp,%ebp + subl $120,%esp + movl 240(%edx),%ecx + andl $-16,%esp + movl $135,96(%esp) + movl $0,100(%esp) + movl $1,104(%esp) + movl $0,108(%esp) + movl %eax,112(%esp) + movl %ebp,116(%esp) + movdqa %xmm2,%xmm1 + pxor %xmm0,%xmm0 + movdqa 96(%esp),%xmm3 + pcmpgtd %xmm1,%xmm0 + andl $-16,%eax + movl %edx,%ebp + movl %ecx,%ebx + subl $96,%eax + jc L049xts_enc_short + shll $4,%ecx + movl $16,%ebx + subl %ecx,%ebx + leal 32(%edx,%ecx,1),%edx + jmp L050xts_enc_loop6 +.align 4,0x90 +L050xts_enc_loop6: + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,(%esp) + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,16(%esp) + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,32(%esp) + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + pshufd 
$19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,48(%esp) + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + pshufd $19,%xmm0,%xmm7 + movdqa %xmm1,64(%esp) + paddq %xmm1,%xmm1 + movups (%ebp),%xmm0 + pand %xmm3,%xmm7 + movups (%esi),%xmm2 + pxor %xmm1,%xmm7 + movl %ebx,%ecx + movdqu 16(%esi),%xmm3 + xorps %xmm0,%xmm2 + movdqu 32(%esi),%xmm4 + pxor %xmm0,%xmm3 + movdqu 48(%esi),%xmm5 + pxor %xmm0,%xmm4 + movdqu 64(%esi),%xmm6 + pxor %xmm0,%xmm5 + movdqu 80(%esi),%xmm1 + pxor %xmm0,%xmm6 + leal 96(%esi),%esi + pxor (%esp),%xmm2 + movdqa %xmm7,80(%esp) + pxor %xmm1,%xmm7 + movups 16(%ebp),%xmm1 + pxor 16(%esp),%xmm3 + pxor 32(%esp),%xmm4 + aesenc %xmm1,%xmm2 + pxor 48(%esp),%xmm5 + pxor 64(%esp),%xmm6 + aesenc %xmm1,%xmm3 + pxor %xmm0,%xmm7 + movups 32(%ebp),%xmm0 + aesenc %xmm1,%xmm4 + aesenc %xmm1,%xmm5 + aesenc %xmm1,%xmm6 + aesenc %xmm1,%xmm7 + call L_aesni_encrypt6_enter + movdqa 80(%esp),%xmm1 + pxor %xmm0,%xmm0 + xorps (%esp),%xmm2 + pcmpgtd %xmm1,%xmm0 + xorps 16(%esp),%xmm3 + movups %xmm2,(%edi) + xorps 32(%esp),%xmm4 + movups %xmm3,16(%edi) + xorps 48(%esp),%xmm5 + movups %xmm4,32(%edi) + xorps 64(%esp),%xmm6 + movups %xmm5,48(%edi) + xorps %xmm1,%xmm7 + movups %xmm6,64(%edi) + pshufd $19,%xmm0,%xmm2 + movups %xmm7,80(%edi) + leal 96(%edi),%edi + movdqa 96(%esp),%xmm3 + pxor %xmm0,%xmm0 + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + subl $96,%eax + jnc L050xts_enc_loop6 + movl 240(%ebp),%ecx + movl %ebp,%edx + movl %ecx,%ebx +L049xts_enc_short: + addl $96,%eax + jz L051xts_enc_done6x + movdqa %xmm1,%xmm5 + cmpl $32,%eax + jb L052xts_enc_one + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + je L053xts_enc_two + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,%xmm6 + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + cmpl $64,%eax + jb L054xts_enc_three + pshufd $19,%xmm0,%xmm2 + 
pxor %xmm0,%xmm0 + movdqa %xmm1,%xmm7 + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + movdqa %xmm5,(%esp) + movdqa %xmm6,16(%esp) + je L055xts_enc_four + movdqa %xmm7,32(%esp) + pshufd $19,%xmm0,%xmm7 + movdqa %xmm1,48(%esp) + paddq %xmm1,%xmm1 + pand %xmm3,%xmm7 + pxor %xmm1,%xmm7 + movdqu (%esi),%xmm2 + movdqu 16(%esi),%xmm3 + movdqu 32(%esi),%xmm4 + pxor (%esp),%xmm2 + movdqu 48(%esi),%xmm5 + pxor 16(%esp),%xmm3 + movdqu 64(%esi),%xmm6 + pxor 32(%esp),%xmm4 + leal 80(%esi),%esi + pxor 48(%esp),%xmm5 + movdqa %xmm7,64(%esp) + pxor %xmm7,%xmm6 + call __aesni_encrypt6 + movaps 64(%esp),%xmm1 + xorps (%esp),%xmm2 + xorps 16(%esp),%xmm3 + xorps 32(%esp),%xmm4 + movups %xmm2,(%edi) + xorps 48(%esp),%xmm5 + movups %xmm3,16(%edi) + xorps %xmm1,%xmm6 + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) + movups %xmm6,64(%edi) + leal 80(%edi),%edi + jmp L056xts_enc_done +.align 4,0x90 +L052xts_enc_one: + movups (%esi),%xmm2 + leal 16(%esi),%esi + xorps %xmm5,%xmm2 + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L057enc1_loop_9: + aesenc %xmm1,%xmm2 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L057enc1_loop_9 + aesenclast %xmm1,%xmm2 + xorps %xmm5,%xmm2 + movups %xmm2,(%edi) + leal 16(%edi),%edi + movdqa %xmm5,%xmm1 + jmp L056xts_enc_done +.align 4,0x90 +L053xts_enc_two: + movaps %xmm1,%xmm6 + movups (%esi),%xmm2 + movups 16(%esi),%xmm3 + leal 32(%esi),%esi + xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + call __aesni_encrypt2 + xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + leal 32(%edi),%edi + movdqa %xmm6,%xmm1 + jmp L056xts_enc_done +.align 4,0x90 +L054xts_enc_three: + movaps %xmm1,%xmm7 + movups (%esi),%xmm2 + movups 16(%esi),%xmm3 + movups 32(%esi),%xmm4 + leal 48(%esi),%esi + xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + xorps %xmm7,%xmm4 + call __aesni_encrypt3 + xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + xorps %xmm7,%xmm4 + movups %xmm2,(%edi) + movups 
%xmm3,16(%edi) + movups %xmm4,32(%edi) + leal 48(%edi),%edi + movdqa %xmm7,%xmm1 + jmp L056xts_enc_done +.align 4,0x90 +L055xts_enc_four: + movaps %xmm1,%xmm6 + movups (%esi),%xmm2 + movups 16(%esi),%xmm3 + movups 32(%esi),%xmm4 + xorps (%esp),%xmm2 + movups 48(%esi),%xmm5 + leal 64(%esi),%esi + xorps 16(%esp),%xmm3 + xorps %xmm7,%xmm4 + xorps %xmm6,%xmm5 + call __aesni_encrypt4 + xorps (%esp),%xmm2 + xorps 16(%esp),%xmm3 + xorps %xmm7,%xmm4 + movups %xmm2,(%edi) + xorps %xmm6,%xmm5 + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) + leal 64(%edi),%edi + movdqa %xmm6,%xmm1 + jmp L056xts_enc_done +.align 4,0x90 +L051xts_enc_done6x: + movl 112(%esp),%eax + andl $15,%eax + jz L058xts_enc_ret + movdqa %xmm1,%xmm5 + movl %eax,112(%esp) + jmp L059xts_enc_steal +.align 4,0x90 +L056xts_enc_done: + movl 112(%esp),%eax + pxor %xmm0,%xmm0 + andl $15,%eax + jz L058xts_enc_ret + pcmpgtd %xmm1,%xmm0 + movl %eax,112(%esp) + pshufd $19,%xmm0,%xmm5 + paddq %xmm1,%xmm1 + pand 96(%esp),%xmm5 + pxor %xmm1,%xmm5 +L059xts_enc_steal: + movzbl (%esi),%ecx + movzbl -16(%edi),%edx + leal 1(%esi),%esi + movb %cl,-16(%edi) + movb %dl,(%edi) + leal 1(%edi),%edi + subl $1,%eax + jnz L059xts_enc_steal + subl 112(%esp),%edi + movl %ebp,%edx + movl %ebx,%ecx + movups -16(%edi),%xmm2 + xorps %xmm5,%xmm2 + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L060enc1_loop_10: + aesenc %xmm1,%xmm2 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L060enc1_loop_10 + aesenclast %xmm1,%xmm2 + xorps %xmm5,%xmm2 + movups %xmm2,-16(%edi) +L058xts_enc_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + movdqa %xmm0,(%esp) + pxor %xmm3,%xmm3 + movdqa %xmm0,16(%esp) + pxor %xmm4,%xmm4 + movdqa %xmm0,32(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm0,48(%esp) + pxor %xmm6,%xmm6 + movdqa %xmm0,64(%esp) + pxor %xmm7,%xmm7 + movdqa %xmm0,80(%esp) + movl 116(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl 
_aes_hw_xts_decrypt +.private_extern _aes_hw_xts_decrypt +.align 4 +_aes_hw_xts_decrypt: +L_aes_hw_xts_decrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 36(%esp),%edx + movl 40(%esp),%esi + movl 240(%edx),%ecx + movups (%esi),%xmm2 + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L061enc1_loop_11: + aesenc %xmm1,%xmm2 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L061enc1_loop_11 + aesenclast %xmm1,%xmm2 + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl 32(%esp),%edx + movl %esp,%ebp + subl $120,%esp + andl $-16,%esp + xorl %ebx,%ebx + testl $15,%eax + setnz %bl + shll $4,%ebx + subl %ebx,%eax + movl $135,96(%esp) + movl $0,100(%esp) + movl $1,104(%esp) + movl $0,108(%esp) + movl %eax,112(%esp) + movl %ebp,116(%esp) + movl 240(%edx),%ecx + movl %edx,%ebp + movl %ecx,%ebx + movdqa %xmm2,%xmm1 + pxor %xmm0,%xmm0 + movdqa 96(%esp),%xmm3 + pcmpgtd %xmm1,%xmm0 + andl $-16,%eax + subl $96,%eax + jc L062xts_dec_short + shll $4,%ecx + movl $16,%ebx + subl %ecx,%ebx + leal 32(%edx,%ecx,1),%edx + jmp L063xts_dec_loop6 +.align 4,0x90 +L063xts_dec_loop6: + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,(%esp) + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,16(%esp) + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,32(%esp) + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,48(%esp) + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + pshufd $19,%xmm0,%xmm7 + movdqa %xmm1,64(%esp) + paddq %xmm1,%xmm1 + movups (%ebp),%xmm0 + pand %xmm3,%xmm7 + movups (%esi),%xmm2 + pxor %xmm1,%xmm7 + movl %ebx,%ecx + movdqu 16(%esi),%xmm3 + xorps %xmm0,%xmm2 + movdqu 32(%esi),%xmm4 + pxor 
%xmm0,%xmm3 + movdqu 48(%esi),%xmm5 + pxor %xmm0,%xmm4 + movdqu 64(%esi),%xmm6 + pxor %xmm0,%xmm5 + movdqu 80(%esi),%xmm1 + pxor %xmm0,%xmm6 + leal 96(%esi),%esi + pxor (%esp),%xmm2 + movdqa %xmm7,80(%esp) + pxor %xmm1,%xmm7 + movups 16(%ebp),%xmm1 + pxor 16(%esp),%xmm3 + pxor 32(%esp),%xmm4 + aesdec %xmm1,%xmm2 + pxor 48(%esp),%xmm5 + pxor 64(%esp),%xmm6 + aesdec %xmm1,%xmm3 + pxor %xmm0,%xmm7 + movups 32(%ebp),%xmm0 + aesdec %xmm1,%xmm4 + aesdec %xmm1,%xmm5 + aesdec %xmm1,%xmm6 + aesdec %xmm1,%xmm7 + call L_aesni_decrypt6_enter + movdqa 80(%esp),%xmm1 + pxor %xmm0,%xmm0 + xorps (%esp),%xmm2 + pcmpgtd %xmm1,%xmm0 + xorps 16(%esp),%xmm3 + movups %xmm2,(%edi) + xorps 32(%esp),%xmm4 + movups %xmm3,16(%edi) + xorps 48(%esp),%xmm5 + movups %xmm4,32(%edi) + xorps 64(%esp),%xmm6 + movups %xmm5,48(%edi) + xorps %xmm1,%xmm7 + movups %xmm6,64(%edi) + pshufd $19,%xmm0,%xmm2 + movups %xmm7,80(%edi) + leal 96(%edi),%edi + movdqa 96(%esp),%xmm3 + pxor %xmm0,%xmm0 + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + subl $96,%eax + jnc L063xts_dec_loop6 + movl 240(%ebp),%ecx + movl %ebp,%edx + movl %ecx,%ebx +L062xts_dec_short: + addl $96,%eax + jz L064xts_dec_done6x + movdqa %xmm1,%xmm5 + cmpl $32,%eax + jb L065xts_dec_one + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + je L066xts_dec_two + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,%xmm6 + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + cmpl $64,%eax + jb L067xts_dec_three + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,%xmm7 + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + movdqa %xmm5,(%esp) + movdqa %xmm6,16(%esp) + je L068xts_dec_four + movdqa %xmm7,32(%esp) + pshufd $19,%xmm0,%xmm7 + movdqa %xmm1,48(%esp) + paddq %xmm1,%xmm1 + pand %xmm3,%xmm7 + pxor %xmm1,%xmm7 + movdqu (%esi),%xmm2 + movdqu 16(%esi),%xmm3 + movdqu 
32(%esi),%xmm4 + pxor (%esp),%xmm2 + movdqu 48(%esi),%xmm5 + pxor 16(%esp),%xmm3 + movdqu 64(%esi),%xmm6 + pxor 32(%esp),%xmm4 + leal 80(%esi),%esi + pxor 48(%esp),%xmm5 + movdqa %xmm7,64(%esp) + pxor %xmm7,%xmm6 + call __aesni_decrypt6 + movaps 64(%esp),%xmm1 + xorps (%esp),%xmm2 + xorps 16(%esp),%xmm3 + xorps 32(%esp),%xmm4 + movups %xmm2,(%edi) + xorps 48(%esp),%xmm5 + movups %xmm3,16(%edi) + xorps %xmm1,%xmm6 + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) + movups %xmm6,64(%edi) + leal 80(%edi),%edi + jmp L069xts_dec_done +.align 4,0x90 +L065xts_dec_one: + movups (%esi),%xmm2 + leal 16(%esi),%esi + xorps %xmm5,%xmm2 + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L070dec1_loop_12: + aesdec %xmm1,%xmm2 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L070dec1_loop_12 + aesdeclast %xmm1,%xmm2 + xorps %xmm5,%xmm2 + movups %xmm2,(%edi) + leal 16(%edi),%edi + movdqa %xmm5,%xmm1 + jmp L069xts_dec_done +.align 4,0x90 +L066xts_dec_two: + movaps %xmm1,%xmm6 + movups (%esi),%xmm2 + movups 16(%esi),%xmm3 + leal 32(%esi),%esi + xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + call __aesni_decrypt2 + xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + leal 32(%edi),%edi + movdqa %xmm6,%xmm1 + jmp L069xts_dec_done +.align 4,0x90 +L067xts_dec_three: + movaps %xmm1,%xmm7 + movups (%esi),%xmm2 + movups 16(%esi),%xmm3 + movups 32(%esi),%xmm4 + leal 48(%esi),%esi + xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + xorps %xmm7,%xmm4 + call __aesni_decrypt3 + xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + xorps %xmm7,%xmm4 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + leal 48(%edi),%edi + movdqa %xmm7,%xmm1 + jmp L069xts_dec_done +.align 4,0x90 +L068xts_dec_four: + movaps %xmm1,%xmm6 + movups (%esi),%xmm2 + movups 16(%esi),%xmm3 + movups 32(%esi),%xmm4 + xorps (%esp),%xmm2 + movups 48(%esi),%xmm5 + leal 64(%esi),%esi + xorps 16(%esp),%xmm3 + xorps %xmm7,%xmm4 + xorps %xmm6,%xmm5 + call 
__aesni_decrypt4 + xorps (%esp),%xmm2 + xorps 16(%esp),%xmm3 + xorps %xmm7,%xmm4 + movups %xmm2,(%edi) + xorps %xmm6,%xmm5 + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) + leal 64(%edi),%edi + movdqa %xmm6,%xmm1 + jmp L069xts_dec_done +.align 4,0x90 +L064xts_dec_done6x: + movl 112(%esp),%eax + andl $15,%eax + jz L071xts_dec_ret + movl %eax,112(%esp) + jmp L072xts_dec_only_one_more +.align 4,0x90 +L069xts_dec_done: + movl 112(%esp),%eax + pxor %xmm0,%xmm0 + andl $15,%eax + jz L071xts_dec_ret + pcmpgtd %xmm1,%xmm0 + movl %eax,112(%esp) + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa 96(%esp),%xmm3 + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 +L072xts_dec_only_one_more: + pshufd $19,%xmm0,%xmm5 + movdqa %xmm1,%xmm6 + paddq %xmm1,%xmm1 + pand %xmm3,%xmm5 + pxor %xmm1,%xmm5 + movl %ebp,%edx + movl %ebx,%ecx + movups (%esi),%xmm2 + xorps %xmm5,%xmm2 + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L073dec1_loop_13: + aesdec %xmm1,%xmm2 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L073dec1_loop_13 + aesdeclast %xmm1,%xmm2 + xorps %xmm5,%xmm2 + movups %xmm2,(%edi) +L074xts_dec_steal: + movzbl 16(%esi),%ecx + movzbl (%edi),%edx + leal 1(%esi),%esi + movb %cl,(%edi) + movb %dl,16(%edi) + leal 1(%edi),%edi + subl $1,%eax + jnz L074xts_dec_steal + subl 112(%esp),%edi + movl %ebp,%edx + movl %ebx,%ecx + movups (%edi),%xmm2 + xorps %xmm6,%xmm2 + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L075dec1_loop_14: + aesdec %xmm1,%xmm2 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L075dec1_loop_14 + aesdeclast %xmm1,%xmm2 + xorps %xmm6,%xmm2 + movups %xmm2,(%edi) +L071xts_dec_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + movdqa %xmm0,(%esp) + pxor %xmm3,%xmm3 + movdqa %xmm0,16(%esp) + pxor %xmm4,%xmm4 + movdqa %xmm0,32(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm0,48(%esp) + pxor %xmm6,%xmm6 + movdqa 
%xmm0,64(%esp) + pxor %xmm7,%xmm7 + movdqa %xmm0,80(%esp) + movl 116(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _aes_hw_cbc_encrypt +.private_extern _aes_hw_cbc_encrypt +.align 4 +_aes_hw_cbc_encrypt: +L_aes_hw_cbc_encrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl %esp,%ebx + movl 24(%esp),%edi + subl $24,%ebx + movl 28(%esp),%eax + andl $-16,%ebx + movl 32(%esp),%edx + movl 36(%esp),%ebp + testl %eax,%eax + jz L076cbc_abort + cmpl $0,40(%esp) + xchgl %esp,%ebx + movups (%ebp),%xmm7 + movl 240(%edx),%ecx + movl %edx,%ebp + movl %ebx,16(%esp) + movl %ecx,%ebx + je L077cbc_decrypt + movaps %xmm7,%xmm2 + cmpl $16,%eax + jb L078cbc_enc_tail + subl $16,%eax + jmp L079cbc_enc_loop +.align 4,0x90 +L079cbc_enc_loop: + movups (%esi),%xmm7 + leal 16(%esi),%esi + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm7 + leal 32(%edx),%edx + xorps %xmm7,%xmm2 +L080enc1_loop_15: + aesenc %xmm1,%xmm2 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L080enc1_loop_15 + aesenclast %xmm1,%xmm2 + movl %ebx,%ecx + movl %ebp,%edx + movups %xmm2,(%edi) + leal 16(%edi),%edi + subl $16,%eax + jnc L079cbc_enc_loop + addl $16,%eax + jnz L078cbc_enc_tail + movaps %xmm2,%xmm7 + pxor %xmm2,%xmm2 + jmp L081cbc_ret +L078cbc_enc_tail: + movl %eax,%ecx +.long 2767451785 + movl $16,%ecx + subl %eax,%ecx + xorl %eax,%eax +.long 2868115081 + leal -16(%edi),%edi + movl %ebx,%ecx + movl %edi,%esi + movl %ebp,%edx + jmp L079cbc_enc_loop +.align 4,0x90 +L077cbc_decrypt: + cmpl $80,%eax + jbe L082cbc_dec_tail + movaps %xmm7,(%esp) + subl $80,%eax + jmp L083cbc_dec_loop6_enter +.align 4,0x90 +L084cbc_dec_loop6: + movaps %xmm0,(%esp) + movups %xmm7,(%edi) + leal 16(%edi),%edi +L083cbc_dec_loop6_enter: + movdqu (%esi),%xmm2 + movdqu 16(%esi),%xmm3 + movdqu 32(%esi),%xmm4 + movdqu 48(%esi),%xmm5 + movdqu 64(%esi),%xmm6 + movdqu 80(%esi),%xmm7 + call __aesni_decrypt6 + movups (%esi),%xmm1 + movups 16(%esi),%xmm0 + 
xorps (%esp),%xmm2 + xorps %xmm1,%xmm3 + movups 32(%esi),%xmm1 + xorps %xmm0,%xmm4 + movups 48(%esi),%xmm0 + xorps %xmm1,%xmm5 + movups 64(%esi),%xmm1 + xorps %xmm0,%xmm6 + movups 80(%esi),%xmm0 + xorps %xmm1,%xmm7 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + leal 96(%esi),%esi + movups %xmm4,32(%edi) + movl %ebx,%ecx + movups %xmm5,48(%edi) + movl %ebp,%edx + movups %xmm6,64(%edi) + leal 80(%edi),%edi + subl $96,%eax + ja L084cbc_dec_loop6 + movaps %xmm7,%xmm2 + movaps %xmm0,%xmm7 + addl $80,%eax + jle L085cbc_dec_clear_tail_collected + movups %xmm2,(%edi) + leal 16(%edi),%edi +L082cbc_dec_tail: + movups (%esi),%xmm2 + movaps %xmm2,%xmm6 + cmpl $16,%eax + jbe L086cbc_dec_one + movups 16(%esi),%xmm3 + movaps %xmm3,%xmm5 + cmpl $32,%eax + jbe L087cbc_dec_two + movups 32(%esi),%xmm4 + cmpl $48,%eax + jbe L088cbc_dec_three + movups 48(%esi),%xmm5 + cmpl $64,%eax + jbe L089cbc_dec_four + movups 64(%esi),%xmm6 + movaps %xmm7,(%esp) + movups (%esi),%xmm2 + xorps %xmm7,%xmm7 + call __aesni_decrypt6 + movups (%esi),%xmm1 + movups 16(%esi),%xmm0 + xorps (%esp),%xmm2 + xorps %xmm1,%xmm3 + movups 32(%esi),%xmm1 + xorps %xmm0,%xmm4 + movups 48(%esi),%xmm0 + xorps %xmm1,%xmm5 + movups 64(%esi),%xmm7 + xorps %xmm0,%xmm6 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + pxor %xmm3,%xmm3 + movups %xmm4,32(%edi) + pxor %xmm4,%xmm4 + movups %xmm5,48(%edi) + pxor %xmm5,%xmm5 + leal 64(%edi),%edi + movaps %xmm6,%xmm2 + pxor %xmm6,%xmm6 + subl $80,%eax + jmp L090cbc_dec_tail_collected +.align 4,0x90 +L086cbc_dec_one: + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +L091dec1_loop_16: + aesdec %xmm1,%xmm2 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz L091dec1_loop_16 + aesdeclast %xmm1,%xmm2 + xorps %xmm7,%xmm2 + movaps %xmm6,%xmm7 + subl $16,%eax + jmp L090cbc_dec_tail_collected +.align 4,0x90 +L087cbc_dec_two: + call __aesni_decrypt2 + xorps %xmm7,%xmm2 + xorps %xmm6,%xmm3 + movups %xmm2,(%edi) + movaps %xmm3,%xmm2 + pxor 
%xmm3,%xmm3 + leal 16(%edi),%edi + movaps %xmm5,%xmm7 + subl $32,%eax + jmp L090cbc_dec_tail_collected +.align 4,0x90 +L088cbc_dec_three: + call __aesni_decrypt3 + xorps %xmm7,%xmm2 + xorps %xmm6,%xmm3 + xorps %xmm5,%xmm4 + movups %xmm2,(%edi) + movaps %xmm4,%xmm2 + pxor %xmm4,%xmm4 + movups %xmm3,16(%edi) + pxor %xmm3,%xmm3 + leal 32(%edi),%edi + movups 32(%esi),%xmm7 + subl $48,%eax + jmp L090cbc_dec_tail_collected +.align 4,0x90 +L089cbc_dec_four: + call __aesni_decrypt4 + movups 16(%esi),%xmm1 + movups 32(%esi),%xmm0 + xorps %xmm7,%xmm2 + movups 48(%esi),%xmm7 + xorps %xmm6,%xmm3 + movups %xmm2,(%edi) + xorps %xmm1,%xmm4 + movups %xmm3,16(%edi) + pxor %xmm3,%xmm3 + xorps %xmm0,%xmm5 + movups %xmm4,32(%edi) + pxor %xmm4,%xmm4 + leal 48(%edi),%edi + movaps %xmm5,%xmm2 + pxor %xmm5,%xmm5 + subl $64,%eax + jmp L090cbc_dec_tail_collected +.align 4,0x90 +L085cbc_dec_clear_tail_collected: + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 +L090cbc_dec_tail_collected: + andl $15,%eax + jnz L092cbc_dec_tail_partial + movups %xmm2,(%edi) + pxor %xmm0,%xmm0 + jmp L081cbc_ret +.align 4,0x90 +L092cbc_dec_tail_partial: + movaps %xmm2,(%esp) + pxor %xmm0,%xmm0 + movl $16,%ecx + movl %esp,%esi + subl %eax,%ecx +.long 2767451785 + movdqa %xmm2,(%esp) +L081cbc_ret: + movl 16(%esp),%esp + movl 36(%esp),%ebp + pxor %xmm2,%xmm2 + pxor %xmm1,%xmm1 + movups %xmm7,(%ebp) + pxor %xmm7,%xmm7 +L076cbc_abort: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _aes_hw_set_encrypt_key_base +.private_extern _aes_hw_set_encrypt_key_base +.align 4 +_aes_hw_set_encrypt_key_base: +L_aes_hw_set_encrypt_key_base_begin: +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call L093pic_for_function_hit +L093pic_for_function_hit: + popl %ebx + leal _BORINGSSL_function_hit+3-L093pic_for_function_hit(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif + movl 4(%esp),%eax + movl 8(%esp),%ecx + movl 12(%esp),%edx + pushl %ebx + call 
L094pic +L094pic: + popl %ebx + leal Lkey_const-L094pic(%ebx),%ebx + movups (%eax),%xmm0 + xorps %xmm4,%xmm4 + leal 16(%edx),%edx + cmpl $256,%ecx + je L09514rounds + cmpl $192,%ecx + je L09612rounds + cmpl $128,%ecx + jne L097bad_keybits +.align 4,0x90 +L09810rounds: + movl $9,%ecx + movups %xmm0,-16(%edx) + aeskeygenassist $1,%xmm0,%xmm1 + call L099key_128_cold + aeskeygenassist $2,%xmm0,%xmm1 + call L100key_128 + aeskeygenassist $4,%xmm0,%xmm1 + call L100key_128 + aeskeygenassist $8,%xmm0,%xmm1 + call L100key_128 + aeskeygenassist $16,%xmm0,%xmm1 + call L100key_128 + aeskeygenassist $32,%xmm0,%xmm1 + call L100key_128 + aeskeygenassist $64,%xmm0,%xmm1 + call L100key_128 + aeskeygenassist $128,%xmm0,%xmm1 + call L100key_128 + aeskeygenassist $27,%xmm0,%xmm1 + call L100key_128 + aeskeygenassist $54,%xmm0,%xmm1 + call L100key_128 + movups %xmm0,(%edx) + movl %ecx,80(%edx) + jmp L101good_key +.align 4,0x90 +L100key_128: + movups %xmm0,(%edx) + leal 16(%edx),%edx +L099key_128_cold: + shufps $16,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $255,%xmm1,%xmm1 + xorps %xmm1,%xmm0 + ret +.align 4,0x90 +L09612rounds: + movq 16(%eax),%xmm2 + movl $11,%ecx + movups %xmm0,-16(%edx) + aeskeygenassist $1,%xmm2,%xmm1 + call L102key_192a_cold + aeskeygenassist $2,%xmm2,%xmm1 + call L103key_192b + aeskeygenassist $4,%xmm2,%xmm1 + call L104key_192a + aeskeygenassist $8,%xmm2,%xmm1 + call L103key_192b + aeskeygenassist $16,%xmm2,%xmm1 + call L104key_192a + aeskeygenassist $32,%xmm2,%xmm1 + call L103key_192b + aeskeygenassist $64,%xmm2,%xmm1 + call L104key_192a + aeskeygenassist $128,%xmm2,%xmm1 + call L103key_192b + movups %xmm0,(%edx) + movl %ecx,48(%edx) + jmp L101good_key +.align 4,0x90 +L104key_192a: + movups %xmm0,(%edx) + leal 16(%edx),%edx +.align 4,0x90 +L102key_192a_cold: + movaps %xmm2,%xmm5 +L105key_192b_warm: + shufps $16,%xmm0,%xmm4 + movdqa %xmm2,%xmm3 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + pslldq $4,%xmm3 + xorps 
%xmm4,%xmm0 + pshufd $85,%xmm1,%xmm1 + pxor %xmm3,%xmm2 + pxor %xmm1,%xmm0 + pshufd $255,%xmm0,%xmm3 + pxor %xmm3,%xmm2 + ret +.align 4,0x90 +L103key_192b: + movaps %xmm0,%xmm3 + shufps $68,%xmm0,%xmm5 + movups %xmm5,(%edx) + shufps $78,%xmm2,%xmm3 + movups %xmm3,16(%edx) + leal 32(%edx),%edx + jmp L105key_192b_warm +.align 4,0x90 +L09514rounds: + movups 16(%eax),%xmm2 + leal 16(%edx),%edx + movl $13,%ecx + movups %xmm0,-32(%edx) + movups %xmm2,-16(%edx) + aeskeygenassist $1,%xmm2,%xmm1 + call L106key_256a_cold + aeskeygenassist $1,%xmm0,%xmm1 + call L107key_256b + aeskeygenassist $2,%xmm2,%xmm1 + call L108key_256a + aeskeygenassist $2,%xmm0,%xmm1 + call L107key_256b + aeskeygenassist $4,%xmm2,%xmm1 + call L108key_256a + aeskeygenassist $4,%xmm0,%xmm1 + call L107key_256b + aeskeygenassist $8,%xmm2,%xmm1 + call L108key_256a + aeskeygenassist $8,%xmm0,%xmm1 + call L107key_256b + aeskeygenassist $16,%xmm2,%xmm1 + call L108key_256a + aeskeygenassist $16,%xmm0,%xmm1 + call L107key_256b + aeskeygenassist $32,%xmm2,%xmm1 + call L108key_256a + aeskeygenassist $32,%xmm0,%xmm1 + call L107key_256b + aeskeygenassist $64,%xmm2,%xmm1 + call L108key_256a + movups %xmm0,(%edx) + movl %ecx,16(%edx) + xorl %eax,%eax + jmp L101good_key +.align 4,0x90 +L108key_256a: + movups %xmm2,(%edx) + leal 16(%edx),%edx +L106key_256a_cold: + shufps $16,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $255,%xmm1,%xmm1 + xorps %xmm1,%xmm0 + ret +.align 4,0x90 +L107key_256b: + movups %xmm0,(%edx) + leal 16(%edx),%edx + shufps $16,%xmm2,%xmm4 + xorps %xmm4,%xmm2 + shufps $140,%xmm2,%xmm4 + xorps %xmm4,%xmm2 + shufps $170,%xmm1,%xmm1 + xorps %xmm1,%xmm2 + ret +L101good_key: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + xorl %eax,%eax + popl %ebx + ret +.align 2,0x90 +L097bad_keybits: + pxor %xmm0,%xmm0 + movl $-2,%eax + popl %ebx + ret +.globl _aes_hw_set_encrypt_key_alt +.private_extern 
_aes_hw_set_encrypt_key_alt +.align 4 +_aes_hw_set_encrypt_key_alt: +L_aes_hw_set_encrypt_key_alt_begin: +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call L109pic_for_function_hit +L109pic_for_function_hit: + popl %ebx + leal _BORINGSSL_function_hit+3-L109pic_for_function_hit(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif + movl 4(%esp),%eax + movl 8(%esp),%ecx + movl 12(%esp),%edx + pushl %ebx + call L110pic +L110pic: + popl %ebx + leal Lkey_const-L110pic(%ebx),%ebx + movups (%eax),%xmm0 + xorps %xmm4,%xmm4 + leal 16(%edx),%edx + cmpl $256,%ecx + je L11114rounds_alt + cmpl $192,%ecx + je L11212rounds_alt + cmpl $128,%ecx + jne L113bad_keybits +.align 4,0x90 +L11410rounds_alt: + movdqa (%ebx),%xmm5 + movl $8,%ecx + movdqa 32(%ebx),%xmm4 + movdqa %xmm0,%xmm2 + movdqu %xmm0,-16(%edx) +L115loop_key128: + pshufb %xmm5,%xmm0 + aesenclast %xmm4,%xmm0 + pslld $1,%xmm4 + leal 16(%edx),%edx + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,-16(%edx) + movdqa %xmm0,%xmm2 + decl %ecx + jnz L115loop_key128 + movdqa 48(%ebx),%xmm4 + pshufb %xmm5,%xmm0 + aesenclast %xmm4,%xmm0 + pslld $1,%xmm4 + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,(%edx) + movdqa %xmm0,%xmm2 + pshufb %xmm5,%xmm0 + aesenclast %xmm4,%xmm0 + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,16(%edx) + movl $9,%ecx + movl %ecx,96(%edx) + jmp L116good_key +.align 4,0x90 +L11212rounds_alt: + movq 16(%eax),%xmm2 + movdqa 16(%ebx),%xmm5 + movdqa 32(%ebx),%xmm4 + movl $8,%ecx + movdqu %xmm0,-16(%edx) +L117loop_key192: + movq %xmm2,(%edx) + movdqa %xmm2,%xmm1 + pshufb %xmm5,%xmm2 + aesenclast %xmm4,%xmm2 + pslld $1,%xmm4 + leal 
24(%edx),%edx + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + pshufd $255,%xmm0,%xmm3 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pxor %xmm2,%xmm0 + pxor %xmm3,%xmm2 + movdqu %xmm0,-16(%edx) + decl %ecx + jnz L117loop_key192 + movl $11,%ecx + movl %ecx,32(%edx) + jmp L116good_key +.align 4,0x90 +L11114rounds_alt: + movups 16(%eax),%xmm2 + leal 16(%edx),%edx + movdqa (%ebx),%xmm5 + movdqa 32(%ebx),%xmm4 + movl $7,%ecx + movdqu %xmm0,-32(%edx) + movdqa %xmm2,%xmm1 + movdqu %xmm2,-16(%edx) +L118loop_key256: + pshufb %xmm5,%xmm2 + aesenclast %xmm4,%xmm2 + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + pslld $1,%xmm4 + pxor %xmm2,%xmm0 + movdqu %xmm0,(%edx) + decl %ecx + jz L119done_key256 + pshufd $255,%xmm0,%xmm2 + pxor %xmm3,%xmm3 + aesenclast %xmm3,%xmm2 + movdqa %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm3,%xmm1 + pxor %xmm1,%xmm2 + movdqu %xmm2,16(%edx) + leal 32(%edx),%edx + movdqa %xmm2,%xmm1 + jmp L118loop_key256 +L119done_key256: + movl $13,%ecx + movl %ecx,16(%edx) +L116good_key: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + xorl %eax,%eax + popl %ebx + ret +.align 2,0x90 +L113bad_keybits: + pxor %xmm0,%xmm0 + movl $-2,%eax + popl %ebx + ret +.globl _aes_hw_encrypt_key_to_decrypt_key +.private_extern _aes_hw_encrypt_key_to_decrypt_key +.align 4 +_aes_hw_encrypt_key_to_decrypt_key: +L_aes_hw_encrypt_key_to_decrypt_key_begin: + movl 4(%esp),%edx + movl 240(%edx),%ecx + shll $4,%ecx + leal 16(%edx,%ecx,1),%eax + movups (%edx),%xmm0 + movups (%eax),%xmm1 + movups %xmm0,(%eax) + movups %xmm1,(%edx) + leal 16(%edx),%edx + leal -16(%eax),%eax +L120dec_key_inverse: + movups (%edx),%xmm0 + movups (%eax),%xmm1 + aesimc %xmm0,%xmm0 + aesimc 
%xmm1,%xmm1 + leal 16(%edx),%edx + leal -16(%eax),%eax + movups %xmm0,16(%eax) + movups %xmm1,-16(%edx) + cmpl %edx,%eax + ja L120dec_key_inverse + movups (%edx),%xmm0 + aesimc %xmm0,%xmm0 + movups %xmm0,(%edx) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + ret +.align 6,0x90 +Lkey_const: +.long 202313229,202313229,202313229,202313229 +.long 67569157,67569157,67569157,67569157 +.long 1,1,1,1 +.long 27,27,27,27 +.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69 +.byte 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83 +.byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 +.byte 115,108,46,111,114,103,62,0 +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) diff --git a/third_party/boringssl/gen/bcm/aesni-x86-linux.S b/third_party/boringssl/gen/bcm/aesni-x86-linux.S new file mode 100644 index 00000000..b6772c1e --- /dev/null +++ b/third_party/boringssl/gen/bcm/aesni-x86-linux.S @@ -0,0 +1,2530 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) +.text +#ifdef BORINGSSL_DISPATCH_TEST +#endif +.globl aes_hw_encrypt +.hidden aes_hw_encrypt +.type aes_hw_encrypt,@function +.align 16 +aes_hw_encrypt: +.L_aes_hw_encrypt_begin: +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call .L000pic_for_function_hit +.L000pic_for_function_hit: + popl %ebx + leal BORINGSSL_function_hit+1-.L000pic_for_function_hit(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif + movl 4(%esp),%eax + movl 12(%esp),%edx + movups (%eax),%xmm2 + movl 240(%edx),%ecx + movl 8(%esp),%eax + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +.L001enc1_loop_1: + aesenc %xmm1,%xmm2 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz .L001enc1_loop_1 + aesenclast %xmm1,%xmm2 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + movups %xmm2,(%eax) + pxor %xmm2,%xmm2 + ret +.size aes_hw_encrypt,.-.L_aes_hw_encrypt_begin +.globl aes_hw_decrypt +.hidden aes_hw_decrypt +.type aes_hw_decrypt,@function +.align 16 +aes_hw_decrypt: +.L_aes_hw_decrypt_begin: + movl 4(%esp),%eax + movl 12(%esp),%edx + movups (%eax),%xmm2 + movl 240(%edx),%ecx + movl 8(%esp),%eax + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +.L002dec1_loop_2: + aesdec %xmm1,%xmm2 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz .L002dec1_loop_2 + aesdeclast %xmm1,%xmm2 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + movups %xmm2,(%eax) + pxor %xmm2,%xmm2 + ret +.size aes_hw_decrypt,.-.L_aes_hw_decrypt_begin +.hidden _aesni_encrypt2 +.type _aesni_encrypt2,@function +.align 16 +_aesni_encrypt2: + movups (%edx),%xmm0 + shll $4,%ecx + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +.L003enc2_loop: + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx + aesenc %xmm0,%xmm2 + 
aesenc %xmm0,%xmm3 + movups -16(%edx,%ecx,1),%xmm0 + jnz .L003enc2_loop + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + aesenclast %xmm0,%xmm2 + aesenclast %xmm0,%xmm3 + ret +.size _aesni_encrypt2,.-_aesni_encrypt2 +.hidden _aesni_decrypt2 +.type _aesni_decrypt2,@function +.align 16 +_aesni_decrypt2: + movups (%edx),%xmm0 + shll $4,%ecx + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +.L004dec2_loop: + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx + aesdec %xmm0,%xmm2 + aesdec %xmm0,%xmm3 + movups -16(%edx,%ecx,1),%xmm0 + jnz .L004dec2_loop + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + aesdeclast %xmm0,%xmm2 + aesdeclast %xmm0,%xmm3 + ret +.size _aesni_decrypt2,.-_aesni_decrypt2 +.hidden _aesni_encrypt3 +.type _aesni_encrypt3,@function +.align 16 +_aesni_encrypt3: + movups (%edx),%xmm0 + shll $4,%ecx + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +.L005enc3_loop: + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + aesenc %xmm1,%xmm4 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx + aesenc %xmm0,%xmm2 + aesenc %xmm0,%xmm3 + aesenc %xmm0,%xmm4 + movups -16(%edx,%ecx,1),%xmm0 + jnz .L005enc3_loop + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + aesenc %xmm1,%xmm4 + aesenclast %xmm0,%xmm2 + aesenclast %xmm0,%xmm3 + aesenclast %xmm0,%xmm4 + ret +.size _aesni_encrypt3,.-_aesni_encrypt3 +.hidden _aesni_decrypt3 +.type _aesni_decrypt3,@function +.align 16 +_aesni_decrypt3: + movups (%edx),%xmm0 + shll $4,%ecx + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx + addl $16,%ecx +.L006dec3_loop: + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + aesdec %xmm1,%xmm4 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx + aesdec %xmm0,%xmm2 + aesdec %xmm0,%xmm3 + aesdec 
%xmm0,%xmm4 + movups -16(%edx,%ecx,1),%xmm0 + jnz .L006dec3_loop + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + aesdec %xmm1,%xmm4 + aesdeclast %xmm0,%xmm2 + aesdeclast %xmm0,%xmm3 + aesdeclast %xmm0,%xmm4 + ret +.size _aesni_decrypt3,.-_aesni_decrypt3 +.hidden _aesni_encrypt4 +.type _aesni_encrypt4,@function +.align 16 +_aesni_encrypt4: + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + shll $4,%ecx + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 + pxor %xmm0,%xmm5 + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx +.byte 15,31,64,0 + addl $16,%ecx +.L007enc4_loop: + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + aesenc %xmm1,%xmm4 + aesenc %xmm1,%xmm5 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx + aesenc %xmm0,%xmm2 + aesenc %xmm0,%xmm3 + aesenc %xmm0,%xmm4 + aesenc %xmm0,%xmm5 + movups -16(%edx,%ecx,1),%xmm0 + jnz .L007enc4_loop + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + aesenc %xmm1,%xmm4 + aesenc %xmm1,%xmm5 + aesenclast %xmm0,%xmm2 + aesenclast %xmm0,%xmm3 + aesenclast %xmm0,%xmm4 + aesenclast %xmm0,%xmm5 + ret +.size _aesni_encrypt4,.-_aesni_encrypt4 +.hidden _aesni_decrypt4 +.type _aesni_decrypt4,@function +.align 16 +_aesni_decrypt4: + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + shll $4,%ecx + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 + pxor %xmm0,%xmm5 + movups 32(%edx),%xmm0 + leal 32(%edx,%ecx,1),%edx + negl %ecx +.byte 15,31,64,0 + addl $16,%ecx +.L008dec4_loop: + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + aesdec %xmm1,%xmm4 + aesdec %xmm1,%xmm5 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx + aesdec %xmm0,%xmm2 + aesdec %xmm0,%xmm3 + aesdec %xmm0,%xmm4 + aesdec %xmm0,%xmm5 + movups -16(%edx,%ecx,1),%xmm0 + jnz .L008dec4_loop + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + aesdec %xmm1,%xmm4 + aesdec %xmm1,%xmm5 + aesdeclast %xmm0,%xmm2 + aesdeclast %xmm0,%xmm3 + aesdeclast %xmm0,%xmm4 + aesdeclast %xmm0,%xmm5 + ret +.size _aesni_decrypt4,.-_aesni_decrypt4 +.hidden _aesni_encrypt6 +.type _aesni_encrypt6,@function +.align 16 
+_aesni_encrypt6: + movups (%edx),%xmm0 + shll $4,%ecx + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 + aesenc %xmm1,%xmm2 + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 + aesenc %xmm1,%xmm3 + leal 32(%edx,%ecx,1),%edx + negl %ecx + aesenc %xmm1,%xmm4 + pxor %xmm0,%xmm7 + movups (%edx,%ecx,1),%xmm0 + addl $16,%ecx + jmp .L009_aesni_encrypt6_inner +.align 16 +.L010enc6_loop: + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + aesenc %xmm1,%xmm4 +.L009_aesni_encrypt6_inner: + aesenc %xmm1,%xmm5 + aesenc %xmm1,%xmm6 + aesenc %xmm1,%xmm7 +.L_aesni_encrypt6_enter: + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx + aesenc %xmm0,%xmm2 + aesenc %xmm0,%xmm3 + aesenc %xmm0,%xmm4 + aesenc %xmm0,%xmm5 + aesenc %xmm0,%xmm6 + aesenc %xmm0,%xmm7 + movups -16(%edx,%ecx,1),%xmm0 + jnz .L010enc6_loop + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + aesenc %xmm1,%xmm4 + aesenc %xmm1,%xmm5 + aesenc %xmm1,%xmm6 + aesenc %xmm1,%xmm7 + aesenclast %xmm0,%xmm2 + aesenclast %xmm0,%xmm3 + aesenclast %xmm0,%xmm4 + aesenclast %xmm0,%xmm5 + aesenclast %xmm0,%xmm6 + aesenclast %xmm0,%xmm7 + ret +.size _aesni_encrypt6,.-_aesni_encrypt6 +.hidden _aesni_decrypt6 +.type _aesni_decrypt6,@function +.align 16 +_aesni_decrypt6: + movups (%edx),%xmm0 + shll $4,%ecx + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 + aesdec %xmm1,%xmm2 + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 + aesdec %xmm1,%xmm3 + leal 32(%edx,%ecx,1),%edx + negl %ecx + aesdec %xmm1,%xmm4 + pxor %xmm0,%xmm7 + movups (%edx,%ecx,1),%xmm0 + addl $16,%ecx + jmp .L011_aesni_decrypt6_inner +.align 16 +.L012dec6_loop: + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + aesdec %xmm1,%xmm4 +.L011_aesni_decrypt6_inner: + aesdec %xmm1,%xmm5 + aesdec %xmm1,%xmm6 + aesdec %xmm1,%xmm7 +.L_aesni_decrypt6_enter: + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx + aesdec %xmm0,%xmm2 + aesdec %xmm0,%xmm3 + aesdec %xmm0,%xmm4 + aesdec %xmm0,%xmm5 + aesdec %xmm0,%xmm6 + aesdec %xmm0,%xmm7 + movups -16(%edx,%ecx,1),%xmm0 + jnz 
.L012dec6_loop + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + aesdec %xmm1,%xmm4 + aesdec %xmm1,%xmm5 + aesdec %xmm1,%xmm6 + aesdec %xmm1,%xmm7 + aesdeclast %xmm0,%xmm2 + aesdeclast %xmm0,%xmm3 + aesdeclast %xmm0,%xmm4 + aesdeclast %xmm0,%xmm5 + aesdeclast %xmm0,%xmm6 + aesdeclast %xmm0,%xmm7 + ret +.size _aesni_decrypt6,.-_aesni_decrypt6 +.globl aes_hw_ecb_encrypt +.hidden aes_hw_ecb_encrypt +.type aes_hw_ecb_encrypt,@function +.align 16 +aes_hw_ecb_encrypt: +.L_aes_hw_ecb_encrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl 32(%esp),%edx + movl 36(%esp),%ebx + andl $-16,%eax + jz .L013ecb_ret + movl 240(%edx),%ecx + testl %ebx,%ebx + jz .L014ecb_decrypt + movl %edx,%ebp + movl %ecx,%ebx + cmpl $96,%eax + jb .L015ecb_enc_tail + movdqu (%esi),%xmm2 + movdqu 16(%esi),%xmm3 + movdqu 32(%esi),%xmm4 + movdqu 48(%esi),%xmm5 + movdqu 64(%esi),%xmm6 + movdqu 80(%esi),%xmm7 + leal 96(%esi),%esi + subl $96,%eax + jmp .L016ecb_enc_loop6_enter +.align 16 +.L017ecb_enc_loop6: + movups %xmm2,(%edi) + movdqu (%esi),%xmm2 + movups %xmm3,16(%edi) + movdqu 16(%esi),%xmm3 + movups %xmm4,32(%edi) + movdqu 32(%esi),%xmm4 + movups %xmm5,48(%edi) + movdqu 48(%esi),%xmm5 + movups %xmm6,64(%edi) + movdqu 64(%esi),%xmm6 + movups %xmm7,80(%edi) + leal 96(%edi),%edi + movdqu 80(%esi),%xmm7 + leal 96(%esi),%esi +.L016ecb_enc_loop6_enter: + call _aesni_encrypt6 + movl %ebp,%edx + movl %ebx,%ecx + subl $96,%eax + jnc .L017ecb_enc_loop6 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) + movups %xmm6,64(%edi) + movups %xmm7,80(%edi) + leal 96(%edi),%edi + addl $96,%eax + jz .L013ecb_ret +.L015ecb_enc_tail: + movups (%esi),%xmm2 + cmpl $32,%eax + jb .L018ecb_enc_one + movups 16(%esi),%xmm3 + je .L019ecb_enc_two + movups 32(%esi),%xmm4 + cmpl $64,%eax + jb .L020ecb_enc_three + movups 48(%esi),%xmm5 + je .L021ecb_enc_four + movups 64(%esi),%xmm6 + xorps %xmm7,%xmm7 + call 
_aesni_encrypt6 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) + movups %xmm6,64(%edi) + jmp .L013ecb_ret +.align 16 +.L018ecb_enc_one: + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +.L022enc1_loop_3: + aesenc %xmm1,%xmm2 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz .L022enc1_loop_3 + aesenclast %xmm1,%xmm2 + movups %xmm2,(%edi) + jmp .L013ecb_ret +.align 16 +.L019ecb_enc_two: + call _aesni_encrypt2 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + jmp .L013ecb_ret +.align 16 +.L020ecb_enc_three: + call _aesni_encrypt3 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + jmp .L013ecb_ret +.align 16 +.L021ecb_enc_four: + call _aesni_encrypt4 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) + jmp .L013ecb_ret +.align 16 +.L014ecb_decrypt: + movl %edx,%ebp + movl %ecx,%ebx + cmpl $96,%eax + jb .L023ecb_dec_tail + movdqu (%esi),%xmm2 + movdqu 16(%esi),%xmm3 + movdqu 32(%esi),%xmm4 + movdqu 48(%esi),%xmm5 + movdqu 64(%esi),%xmm6 + movdqu 80(%esi),%xmm7 + leal 96(%esi),%esi + subl $96,%eax + jmp .L024ecb_dec_loop6_enter +.align 16 +.L025ecb_dec_loop6: + movups %xmm2,(%edi) + movdqu (%esi),%xmm2 + movups %xmm3,16(%edi) + movdqu 16(%esi),%xmm3 + movups %xmm4,32(%edi) + movdqu 32(%esi),%xmm4 + movups %xmm5,48(%edi) + movdqu 48(%esi),%xmm5 + movups %xmm6,64(%edi) + movdqu 64(%esi),%xmm6 + movups %xmm7,80(%edi) + leal 96(%edi),%edi + movdqu 80(%esi),%xmm7 + leal 96(%esi),%esi +.L024ecb_dec_loop6_enter: + call _aesni_decrypt6 + movl %ebp,%edx + movl %ebx,%ecx + subl $96,%eax + jnc .L025ecb_dec_loop6 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) + movups %xmm6,64(%edi) + movups %xmm7,80(%edi) + leal 96(%edi),%edi + addl $96,%eax + jz .L013ecb_ret +.L023ecb_dec_tail: + movups (%esi),%xmm2 + cmpl $32,%eax + jb .L026ecb_dec_one + movups 16(%esi),%xmm3 + je 
.L027ecb_dec_two + movups 32(%esi),%xmm4 + cmpl $64,%eax + jb .L028ecb_dec_three + movups 48(%esi),%xmm5 + je .L029ecb_dec_four + movups 64(%esi),%xmm6 + xorps %xmm7,%xmm7 + call _aesni_decrypt6 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) + movups %xmm6,64(%edi) + jmp .L013ecb_ret +.align 16 +.L026ecb_dec_one: + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +.L030dec1_loop_4: + aesdec %xmm1,%xmm2 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz .L030dec1_loop_4 + aesdeclast %xmm1,%xmm2 + movups %xmm2,(%edi) + jmp .L013ecb_ret +.align 16 +.L027ecb_dec_two: + call _aesni_decrypt2 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + jmp .L013ecb_ret +.align 16 +.L028ecb_dec_three: + call _aesni_decrypt3 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + jmp .L013ecb_ret +.align 16 +.L029ecb_dec_four: + call _aesni_decrypt4 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) +.L013ecb_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size aes_hw_ecb_encrypt,.-.L_aes_hw_ecb_encrypt_begin +.globl aes_hw_ccm64_encrypt_blocks +.hidden aes_hw_ccm64_encrypt_blocks +.type aes_hw_ccm64_encrypt_blocks,@function +.align 16 +aes_hw_ccm64_encrypt_blocks: +.L_aes_hw_ccm64_encrypt_blocks_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl 32(%esp),%edx + movl 36(%esp),%ebx + movl 40(%esp),%ecx + movl %esp,%ebp + subl $60,%esp + andl $-16,%esp + movl %ebp,48(%esp) + movdqu (%ebx),%xmm7 + movdqu (%ecx),%xmm3 + movl 240(%edx),%ecx + movl $202182159,(%esp) + movl $134810123,4(%esp) + movl $67438087,8(%esp) + movl $66051,12(%esp) + movl $1,%ebx + xorl %ebp,%ebp + movl %ebx,16(%esp) + movl 
%ebp,20(%esp) + movl %ebp,24(%esp) + movl %ebp,28(%esp) + shll $4,%ecx + movl $16,%ebx + leal (%edx),%ebp + movdqa (%esp),%xmm5 + movdqa %xmm7,%xmm2 + leal 32(%edx,%ecx,1),%edx + subl %ecx,%ebx + pshufb %xmm5,%xmm7 +.L031ccm64_enc_outer: + movups (%ebp),%xmm0 + movl %ebx,%ecx + movups (%esi),%xmm6 + xorps %xmm0,%xmm2 + movups 16(%ebp),%xmm1 + xorps %xmm6,%xmm0 + xorps %xmm0,%xmm3 + movups 32(%ebp),%xmm0 +.L032ccm64_enc2_loop: + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx + aesenc %xmm0,%xmm2 + aesenc %xmm0,%xmm3 + movups -16(%edx,%ecx,1),%xmm0 + jnz .L032ccm64_enc2_loop + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + paddq 16(%esp),%xmm7 + decl %eax + aesenclast %xmm0,%xmm2 + aesenclast %xmm0,%xmm3 + leal 16(%esi),%esi + xorps %xmm2,%xmm6 + movdqa %xmm7,%xmm2 + movups %xmm6,(%edi) + pshufb %xmm5,%xmm2 + leal 16(%edi),%edi + jnz .L031ccm64_enc_outer + movl 48(%esp),%esp + movl 40(%esp),%edi + movups %xmm3,(%edi) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size aes_hw_ccm64_encrypt_blocks,.-.L_aes_hw_ccm64_encrypt_blocks_begin +.globl aes_hw_ccm64_decrypt_blocks +.hidden aes_hw_ccm64_decrypt_blocks +.type aes_hw_ccm64_decrypt_blocks,@function +.align 16 +aes_hw_ccm64_decrypt_blocks: +.L_aes_hw_ccm64_decrypt_blocks_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl 32(%esp),%edx + movl 36(%esp),%ebx + movl 40(%esp),%ecx + movl %esp,%ebp + subl $60,%esp + andl $-16,%esp + movl %ebp,48(%esp) + movdqu (%ebx),%xmm7 + movdqu (%ecx),%xmm3 + movl 240(%edx),%ecx + movl $202182159,(%esp) + movl $134810123,4(%esp) + movl $67438087,8(%esp) + movl $66051,12(%esp) + movl $1,%ebx + xorl %ebp,%ebp + movl %ebx,16(%esp) + movl %ebp,20(%esp) + movl %ebp,24(%esp) + movl %ebp,28(%esp) + movdqa (%esp),%xmm5 + 
movdqa %xmm7,%xmm2 + movl %edx,%ebp + movl %ecx,%ebx + pshufb %xmm5,%xmm7 + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +.L033enc1_loop_5: + aesenc %xmm1,%xmm2 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz .L033enc1_loop_5 + aesenclast %xmm1,%xmm2 + shll $4,%ebx + movl $16,%ecx + movups (%esi),%xmm6 + paddq 16(%esp),%xmm7 + leal 16(%esi),%esi + subl %ebx,%ecx + leal 32(%ebp,%ebx,1),%edx + movl %ecx,%ebx + jmp .L034ccm64_dec_outer +.align 16 +.L034ccm64_dec_outer: + xorps %xmm2,%xmm6 + movdqa %xmm7,%xmm2 + movups %xmm6,(%edi) + leal 16(%edi),%edi + pshufb %xmm5,%xmm2 + subl $1,%eax + jz .L035ccm64_dec_break + movups (%ebp),%xmm0 + movl %ebx,%ecx + movups 16(%ebp),%xmm1 + xorps %xmm0,%xmm6 + xorps %xmm0,%xmm2 + xorps %xmm6,%xmm3 + movups 32(%ebp),%xmm0 +.L036ccm64_dec2_loop: + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + movups (%edx,%ecx,1),%xmm1 + addl $32,%ecx + aesenc %xmm0,%xmm2 + aesenc %xmm0,%xmm3 + movups -16(%edx,%ecx,1),%xmm0 + jnz .L036ccm64_dec2_loop + movups (%esi),%xmm6 + paddq 16(%esp),%xmm7 + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + aesenclast %xmm0,%xmm2 + aesenclast %xmm0,%xmm3 + leal 16(%esi),%esi + jmp .L034ccm64_dec_outer +.align 16 +.L035ccm64_dec_break: + movl 240(%ebp),%ecx + movl %ebp,%edx + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm6 + leal 32(%edx),%edx + xorps %xmm6,%xmm3 +.L037enc1_loop_6: + aesenc %xmm1,%xmm3 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz .L037enc1_loop_6 + aesenclast %xmm1,%xmm3 + movl 48(%esp),%esp + movl 40(%esp),%edi + movups %xmm3,(%edi) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size aes_hw_ccm64_decrypt_blocks,.-.L_aes_hw_ccm64_decrypt_blocks_begin +.globl aes_hw_ctr32_encrypt_blocks +.hidden aes_hw_ctr32_encrypt_blocks +.type aes_hw_ctr32_encrypt_blocks,@function 
+.align 16 +aes_hw_ctr32_encrypt_blocks: +.L_aes_hw_ctr32_encrypt_blocks_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call .L038pic_for_function_hit +.L038pic_for_function_hit: + popl %ebx + leal BORINGSSL_function_hit+0-.L038pic_for_function_hit(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl 32(%esp),%edx + movl 36(%esp),%ebx + movl %esp,%ebp + subl $88,%esp + andl $-16,%esp + movl %ebp,80(%esp) + cmpl $1,%eax + je .L039ctr32_one_shortcut + movdqu (%ebx),%xmm7 + movl $202182159,(%esp) + movl $134810123,4(%esp) + movl $67438087,8(%esp) + movl $66051,12(%esp) + movl $6,%ecx + xorl %ebp,%ebp + movl %ecx,16(%esp) + movl %ecx,20(%esp) + movl %ecx,24(%esp) + movl %ebp,28(%esp) + pextrd $3,%xmm7,%ebx + pinsrd $3,%ebp,%xmm7 + movl 240(%edx),%ecx + bswap %ebx + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + movdqa (%esp),%xmm2 + pinsrd $0,%ebx,%xmm0 + leal 3(%ebx),%ebp + pinsrd $0,%ebp,%xmm1 + incl %ebx + pinsrd $1,%ebx,%xmm0 + incl %ebp + pinsrd $1,%ebp,%xmm1 + incl %ebx + pinsrd $2,%ebx,%xmm0 + incl %ebp + pinsrd $2,%ebp,%xmm1 + movdqa %xmm0,48(%esp) + pshufb %xmm2,%xmm0 + movdqu (%edx),%xmm6 + movdqa %xmm1,64(%esp) + pshufb %xmm2,%xmm1 + pshufd $192,%xmm0,%xmm2 + pshufd $128,%xmm0,%xmm3 + cmpl $6,%eax + jb .L040ctr32_tail + pxor %xmm6,%xmm7 + shll $4,%ecx + movl $16,%ebx + movdqa %xmm7,32(%esp) + movl %edx,%ebp + subl %ecx,%ebx + leal 32(%edx,%ecx,1),%edx + subl $6,%eax + jmp .L041ctr32_loop6 +.align 16 +.L041ctr32_loop6: + pshufd $64,%xmm0,%xmm4 + movdqa 32(%esp),%xmm0 + pshufd $192,%xmm1,%xmm5 + pxor %xmm0,%xmm2 + pshufd $128,%xmm1,%xmm6 + pxor %xmm0,%xmm3 + pshufd $64,%xmm1,%xmm7 + movups 16(%ebp),%xmm1 + pxor %xmm0,%xmm4 + pxor %xmm0,%xmm5 + aesenc %xmm1,%xmm2 + pxor %xmm0,%xmm6 + pxor %xmm0,%xmm7 + aesenc %xmm1,%xmm3 + movups 32(%ebp),%xmm0 + movl %ebx,%ecx + aesenc %xmm1,%xmm4 + aesenc %xmm1,%xmm5 + 
aesenc %xmm1,%xmm6 + aesenc %xmm1,%xmm7 + call .L_aesni_encrypt6_enter + movups (%esi),%xmm1 + movups 16(%esi),%xmm0 + xorps %xmm1,%xmm2 + movups 32(%esi),%xmm1 + xorps %xmm0,%xmm3 + movups %xmm2,(%edi) + movdqa 16(%esp),%xmm0 + xorps %xmm1,%xmm4 + movdqa 64(%esp),%xmm1 + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + paddd %xmm0,%xmm1 + paddd 48(%esp),%xmm0 + movdqa (%esp),%xmm2 + movups 48(%esi),%xmm3 + movups 64(%esi),%xmm4 + xorps %xmm3,%xmm5 + movups 80(%esi),%xmm3 + leal 96(%esi),%esi + movdqa %xmm0,48(%esp) + pshufb %xmm2,%xmm0 + xorps %xmm4,%xmm6 + movups %xmm5,48(%edi) + xorps %xmm3,%xmm7 + movdqa %xmm1,64(%esp) + pshufb %xmm2,%xmm1 + movups %xmm6,64(%edi) + pshufd $192,%xmm0,%xmm2 + movups %xmm7,80(%edi) + leal 96(%edi),%edi + pshufd $128,%xmm0,%xmm3 + subl $6,%eax + jnc .L041ctr32_loop6 + addl $6,%eax + jz .L042ctr32_ret + movdqu (%ebp),%xmm7 + movl %ebp,%edx + pxor 32(%esp),%xmm7 + movl 240(%ebp),%ecx +.L040ctr32_tail: + por %xmm7,%xmm2 + cmpl $2,%eax + jb .L043ctr32_one + pshufd $64,%xmm0,%xmm4 + por %xmm7,%xmm3 + je .L044ctr32_two + pshufd $192,%xmm1,%xmm5 + por %xmm7,%xmm4 + cmpl $4,%eax + jb .L045ctr32_three + pshufd $128,%xmm1,%xmm6 + por %xmm7,%xmm5 + je .L046ctr32_four + por %xmm7,%xmm6 + call _aesni_encrypt6 + movups (%esi),%xmm1 + movups 16(%esi),%xmm0 + xorps %xmm1,%xmm2 + movups 32(%esi),%xmm1 + xorps %xmm0,%xmm3 + movups 48(%esi),%xmm0 + xorps %xmm1,%xmm4 + movups 64(%esi),%xmm1 + xorps %xmm0,%xmm5 + movups %xmm2,(%edi) + xorps %xmm1,%xmm6 + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) + movups %xmm6,64(%edi) + jmp .L042ctr32_ret +.align 16 +.L039ctr32_one_shortcut: + movups (%ebx),%xmm2 + movl 240(%edx),%ecx +.L043ctr32_one: + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +.L047enc1_loop_7: + aesenc %xmm1,%xmm2 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz .L047enc1_loop_7 + aesenclast %xmm1,%xmm2 + movups (%esi),%xmm6 + xorps %xmm2,%xmm6 + movups %xmm6,(%edi) + 
jmp .L042ctr32_ret +.align 16 +.L044ctr32_two: + call _aesni_encrypt2 + movups (%esi),%xmm5 + movups 16(%esi),%xmm6 + xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + jmp .L042ctr32_ret +.align 16 +.L045ctr32_three: + call _aesni_encrypt3 + movups (%esi),%xmm5 + movups 16(%esi),%xmm6 + xorps %xmm5,%xmm2 + movups 32(%esi),%xmm7 + xorps %xmm6,%xmm3 + movups %xmm2,(%edi) + xorps %xmm7,%xmm4 + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + jmp .L042ctr32_ret +.align 16 +.L046ctr32_four: + call _aesni_encrypt4 + movups (%esi),%xmm6 + movups 16(%esi),%xmm7 + movups 32(%esi),%xmm1 + xorps %xmm6,%xmm2 + movups 48(%esi),%xmm0 + xorps %xmm7,%xmm3 + movups %xmm2,(%edi) + xorps %xmm1,%xmm4 + movups %xmm3,16(%edi) + xorps %xmm0,%xmm5 + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) +.L042ctr32_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + movdqa %xmm0,32(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm0,48(%esp) + pxor %xmm6,%xmm6 + movdqa %xmm0,64(%esp) + pxor %xmm7,%xmm7 + movl 80(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size aes_hw_ctr32_encrypt_blocks,.-.L_aes_hw_ctr32_encrypt_blocks_begin +.globl aes_hw_xts_encrypt +.hidden aes_hw_xts_encrypt +.type aes_hw_xts_encrypt,@function +.align 16 +aes_hw_xts_encrypt: +.L_aes_hw_xts_encrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 36(%esp),%edx + movl 40(%esp),%esi + movl 240(%edx),%ecx + movups (%esi),%xmm2 + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +.L048enc1_loop_8: + aesenc %xmm1,%xmm2 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz .L048enc1_loop_8 + aesenclast %xmm1,%xmm2 + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl 32(%esp),%edx + movl %esp,%ebp + subl $120,%esp + movl 240(%edx),%ecx + andl $-16,%esp + movl $135,96(%esp) + movl $0,100(%esp) + movl $1,104(%esp) + movl $0,108(%esp) + movl %eax,112(%esp) + movl 
%ebp,116(%esp) + movdqa %xmm2,%xmm1 + pxor %xmm0,%xmm0 + movdqa 96(%esp),%xmm3 + pcmpgtd %xmm1,%xmm0 + andl $-16,%eax + movl %edx,%ebp + movl %ecx,%ebx + subl $96,%eax + jc .L049xts_enc_short + shll $4,%ecx + movl $16,%ebx + subl %ecx,%ebx + leal 32(%edx,%ecx,1),%edx + jmp .L050xts_enc_loop6 +.align 16 +.L050xts_enc_loop6: + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,(%esp) + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,16(%esp) + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,32(%esp) + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,48(%esp) + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + pshufd $19,%xmm0,%xmm7 + movdqa %xmm1,64(%esp) + paddq %xmm1,%xmm1 + movups (%ebp),%xmm0 + pand %xmm3,%xmm7 + movups (%esi),%xmm2 + pxor %xmm1,%xmm7 + movl %ebx,%ecx + movdqu 16(%esi),%xmm3 + xorps %xmm0,%xmm2 + movdqu 32(%esi),%xmm4 + pxor %xmm0,%xmm3 + movdqu 48(%esi),%xmm5 + pxor %xmm0,%xmm4 + movdqu 64(%esi),%xmm6 + pxor %xmm0,%xmm5 + movdqu 80(%esi),%xmm1 + pxor %xmm0,%xmm6 + leal 96(%esi),%esi + pxor (%esp),%xmm2 + movdqa %xmm7,80(%esp) + pxor %xmm1,%xmm7 + movups 16(%ebp),%xmm1 + pxor 16(%esp),%xmm3 + pxor 32(%esp),%xmm4 + aesenc %xmm1,%xmm2 + pxor 48(%esp),%xmm5 + pxor 64(%esp),%xmm6 + aesenc %xmm1,%xmm3 + pxor %xmm0,%xmm7 + movups 32(%ebp),%xmm0 + aesenc %xmm1,%xmm4 + aesenc %xmm1,%xmm5 + aesenc %xmm1,%xmm6 + aesenc %xmm1,%xmm7 + call .L_aesni_encrypt6_enter + movdqa 80(%esp),%xmm1 + pxor %xmm0,%xmm0 + xorps (%esp),%xmm2 + pcmpgtd %xmm1,%xmm0 + xorps 16(%esp),%xmm3 + movups %xmm2,(%edi) + xorps 32(%esp),%xmm4 + movups %xmm3,16(%edi) + xorps 48(%esp),%xmm5 + movups %xmm4,32(%edi) + xorps 64(%esp),%xmm6 + movups %xmm5,48(%edi) + xorps %xmm1,%xmm7 + movups 
%xmm6,64(%edi) + pshufd $19,%xmm0,%xmm2 + movups %xmm7,80(%edi) + leal 96(%edi),%edi + movdqa 96(%esp),%xmm3 + pxor %xmm0,%xmm0 + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + subl $96,%eax + jnc .L050xts_enc_loop6 + movl 240(%ebp),%ecx + movl %ebp,%edx + movl %ecx,%ebx +.L049xts_enc_short: + addl $96,%eax + jz .L051xts_enc_done6x + movdqa %xmm1,%xmm5 + cmpl $32,%eax + jb .L052xts_enc_one + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + je .L053xts_enc_two + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,%xmm6 + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + cmpl $64,%eax + jb .L054xts_enc_three + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,%xmm7 + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + movdqa %xmm5,(%esp) + movdqa %xmm6,16(%esp) + je .L055xts_enc_four + movdqa %xmm7,32(%esp) + pshufd $19,%xmm0,%xmm7 + movdqa %xmm1,48(%esp) + paddq %xmm1,%xmm1 + pand %xmm3,%xmm7 + pxor %xmm1,%xmm7 + movdqu (%esi),%xmm2 + movdqu 16(%esi),%xmm3 + movdqu 32(%esi),%xmm4 + pxor (%esp),%xmm2 + movdqu 48(%esi),%xmm5 + pxor 16(%esp),%xmm3 + movdqu 64(%esi),%xmm6 + pxor 32(%esp),%xmm4 + leal 80(%esi),%esi + pxor 48(%esp),%xmm5 + movdqa %xmm7,64(%esp) + pxor %xmm7,%xmm6 + call _aesni_encrypt6 + movaps 64(%esp),%xmm1 + xorps (%esp),%xmm2 + xorps 16(%esp),%xmm3 + xorps 32(%esp),%xmm4 + movups %xmm2,(%edi) + xorps 48(%esp),%xmm5 + movups %xmm3,16(%edi) + xorps %xmm1,%xmm6 + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) + movups %xmm6,64(%edi) + leal 80(%edi),%edi + jmp .L056xts_enc_done +.align 16 +.L052xts_enc_one: + movups (%esi),%xmm2 + leal 16(%esi),%esi + xorps %xmm5,%xmm2 + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +.L057enc1_loop_9: + aesenc %xmm1,%xmm2 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz .L057enc1_loop_9 + 
aesenclast %xmm1,%xmm2 + xorps %xmm5,%xmm2 + movups %xmm2,(%edi) + leal 16(%edi),%edi + movdqa %xmm5,%xmm1 + jmp .L056xts_enc_done +.align 16 +.L053xts_enc_two: + movaps %xmm1,%xmm6 + movups (%esi),%xmm2 + movups 16(%esi),%xmm3 + leal 32(%esi),%esi + xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + call _aesni_encrypt2 + xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + leal 32(%edi),%edi + movdqa %xmm6,%xmm1 + jmp .L056xts_enc_done +.align 16 +.L054xts_enc_three: + movaps %xmm1,%xmm7 + movups (%esi),%xmm2 + movups 16(%esi),%xmm3 + movups 32(%esi),%xmm4 + leal 48(%esi),%esi + xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + xorps %xmm7,%xmm4 + call _aesni_encrypt3 + xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + xorps %xmm7,%xmm4 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + leal 48(%edi),%edi + movdqa %xmm7,%xmm1 + jmp .L056xts_enc_done +.align 16 +.L055xts_enc_four: + movaps %xmm1,%xmm6 + movups (%esi),%xmm2 + movups 16(%esi),%xmm3 + movups 32(%esi),%xmm4 + xorps (%esp),%xmm2 + movups 48(%esi),%xmm5 + leal 64(%esi),%esi + xorps 16(%esp),%xmm3 + xorps %xmm7,%xmm4 + xorps %xmm6,%xmm5 + call _aesni_encrypt4 + xorps (%esp),%xmm2 + xorps 16(%esp),%xmm3 + xorps %xmm7,%xmm4 + movups %xmm2,(%edi) + xorps %xmm6,%xmm5 + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) + leal 64(%edi),%edi + movdqa %xmm6,%xmm1 + jmp .L056xts_enc_done +.align 16 +.L051xts_enc_done6x: + movl 112(%esp),%eax + andl $15,%eax + jz .L058xts_enc_ret + movdqa %xmm1,%xmm5 + movl %eax,112(%esp) + jmp .L059xts_enc_steal +.align 16 +.L056xts_enc_done: + movl 112(%esp),%eax + pxor %xmm0,%xmm0 + andl $15,%eax + jz .L058xts_enc_ret + pcmpgtd %xmm1,%xmm0 + movl %eax,112(%esp) + pshufd $19,%xmm0,%xmm5 + paddq %xmm1,%xmm1 + pand 96(%esp),%xmm5 + pxor %xmm1,%xmm5 +.L059xts_enc_steal: + movzbl (%esi),%ecx + movzbl -16(%edi),%edx + leal 1(%esi),%esi + movb %cl,-16(%edi) + movb %dl,(%edi) + leal 1(%edi),%edi + subl $1,%eax + jnz .L059xts_enc_steal + subl 
112(%esp),%edi + movl %ebp,%edx + movl %ebx,%ecx + movups -16(%edi),%xmm2 + xorps %xmm5,%xmm2 + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +.L060enc1_loop_10: + aesenc %xmm1,%xmm2 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz .L060enc1_loop_10 + aesenclast %xmm1,%xmm2 + xorps %xmm5,%xmm2 + movups %xmm2,-16(%edi) +.L058xts_enc_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + movdqa %xmm0,(%esp) + pxor %xmm3,%xmm3 + movdqa %xmm0,16(%esp) + pxor %xmm4,%xmm4 + movdqa %xmm0,32(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm0,48(%esp) + pxor %xmm6,%xmm6 + movdqa %xmm0,64(%esp) + pxor %xmm7,%xmm7 + movdqa %xmm0,80(%esp) + movl 116(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size aes_hw_xts_encrypt,.-.L_aes_hw_xts_encrypt_begin +.globl aes_hw_xts_decrypt +.hidden aes_hw_xts_decrypt +.type aes_hw_xts_decrypt,@function +.align 16 +aes_hw_xts_decrypt: +.L_aes_hw_xts_decrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 36(%esp),%edx + movl 40(%esp),%esi + movl 240(%edx),%ecx + movups (%esi),%xmm2 + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +.L061enc1_loop_11: + aesenc %xmm1,%xmm2 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz .L061enc1_loop_11 + aesenclast %xmm1,%xmm2 + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl 32(%esp),%edx + movl %esp,%ebp + subl $120,%esp + andl $-16,%esp + xorl %ebx,%ebx + testl $15,%eax + setnz %bl + shll $4,%ebx + subl %ebx,%eax + movl $135,96(%esp) + movl $0,100(%esp) + movl $1,104(%esp) + movl $0,108(%esp) + movl %eax,112(%esp) + movl %ebp,116(%esp) + movl 240(%edx),%ecx + movl %edx,%ebp + movl %ecx,%ebx + movdqa %xmm2,%xmm1 + pxor %xmm0,%xmm0 + movdqa 96(%esp),%xmm3 + pcmpgtd %xmm1,%xmm0 + andl $-16,%eax + subl $96,%eax + jc .L062xts_dec_short + shll $4,%ecx + movl $16,%ebx + subl %ecx,%ebx + leal 32(%edx,%ecx,1),%edx + jmp .L063xts_dec_loop6 +.align 16 
+.L063xts_dec_loop6: + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,(%esp) + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,16(%esp) + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,32(%esp) + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,48(%esp) + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + pshufd $19,%xmm0,%xmm7 + movdqa %xmm1,64(%esp) + paddq %xmm1,%xmm1 + movups (%ebp),%xmm0 + pand %xmm3,%xmm7 + movups (%esi),%xmm2 + pxor %xmm1,%xmm7 + movl %ebx,%ecx + movdqu 16(%esi),%xmm3 + xorps %xmm0,%xmm2 + movdqu 32(%esi),%xmm4 + pxor %xmm0,%xmm3 + movdqu 48(%esi),%xmm5 + pxor %xmm0,%xmm4 + movdqu 64(%esi),%xmm6 + pxor %xmm0,%xmm5 + movdqu 80(%esi),%xmm1 + pxor %xmm0,%xmm6 + leal 96(%esi),%esi + pxor (%esp),%xmm2 + movdqa %xmm7,80(%esp) + pxor %xmm1,%xmm7 + movups 16(%ebp),%xmm1 + pxor 16(%esp),%xmm3 + pxor 32(%esp),%xmm4 + aesdec %xmm1,%xmm2 + pxor 48(%esp),%xmm5 + pxor 64(%esp),%xmm6 + aesdec %xmm1,%xmm3 + pxor %xmm0,%xmm7 + movups 32(%ebp),%xmm0 + aesdec %xmm1,%xmm4 + aesdec %xmm1,%xmm5 + aesdec %xmm1,%xmm6 + aesdec %xmm1,%xmm7 + call .L_aesni_decrypt6_enter + movdqa 80(%esp),%xmm1 + pxor %xmm0,%xmm0 + xorps (%esp),%xmm2 + pcmpgtd %xmm1,%xmm0 + xorps 16(%esp),%xmm3 + movups %xmm2,(%edi) + xorps 32(%esp),%xmm4 + movups %xmm3,16(%edi) + xorps 48(%esp),%xmm5 + movups %xmm4,32(%edi) + xorps 64(%esp),%xmm6 + movups %xmm5,48(%edi) + xorps %xmm1,%xmm7 + movups %xmm6,64(%edi) + pshufd $19,%xmm0,%xmm2 + movups %xmm7,80(%edi) + leal 96(%edi),%edi + movdqa 96(%esp),%xmm3 + pxor %xmm0,%xmm0 + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + subl $96,%eax + jnc .L063xts_dec_loop6 + movl 240(%ebp),%ecx + movl %ebp,%edx + movl %ecx,%ebx 
+.L062xts_dec_short: + addl $96,%eax + jz .L064xts_dec_done6x + movdqa %xmm1,%xmm5 + cmpl $32,%eax + jb .L065xts_dec_one + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + je .L066xts_dec_two + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,%xmm6 + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + cmpl $64,%eax + jb .L067xts_dec_three + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa %xmm1,%xmm7 + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 + movdqa %xmm5,(%esp) + movdqa %xmm6,16(%esp) + je .L068xts_dec_four + movdqa %xmm7,32(%esp) + pshufd $19,%xmm0,%xmm7 + movdqa %xmm1,48(%esp) + paddq %xmm1,%xmm1 + pand %xmm3,%xmm7 + pxor %xmm1,%xmm7 + movdqu (%esi),%xmm2 + movdqu 16(%esi),%xmm3 + movdqu 32(%esi),%xmm4 + pxor (%esp),%xmm2 + movdqu 48(%esi),%xmm5 + pxor 16(%esp),%xmm3 + movdqu 64(%esi),%xmm6 + pxor 32(%esp),%xmm4 + leal 80(%esi),%esi + pxor 48(%esp),%xmm5 + movdqa %xmm7,64(%esp) + pxor %xmm7,%xmm6 + call _aesni_decrypt6 + movaps 64(%esp),%xmm1 + xorps (%esp),%xmm2 + xorps 16(%esp),%xmm3 + xorps 32(%esp),%xmm4 + movups %xmm2,(%edi) + xorps 48(%esp),%xmm5 + movups %xmm3,16(%edi) + xorps %xmm1,%xmm6 + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) + movups %xmm6,64(%edi) + leal 80(%edi),%edi + jmp .L069xts_dec_done +.align 16 +.L065xts_dec_one: + movups (%esi),%xmm2 + leal 16(%esi),%esi + xorps %xmm5,%xmm2 + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +.L070dec1_loop_12: + aesdec %xmm1,%xmm2 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz .L070dec1_loop_12 + aesdeclast %xmm1,%xmm2 + xorps %xmm5,%xmm2 + movups %xmm2,(%edi) + leal 16(%edi),%edi + movdqa %xmm5,%xmm1 + jmp .L069xts_dec_done +.align 16 +.L066xts_dec_two: + movaps %xmm1,%xmm6 + movups (%esi),%xmm2 + movups 16(%esi),%xmm3 + leal 32(%esi),%esi + xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + call _aesni_decrypt2 
+ xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + leal 32(%edi),%edi + movdqa %xmm6,%xmm1 + jmp .L069xts_dec_done +.align 16 +.L067xts_dec_three: + movaps %xmm1,%xmm7 + movups (%esi),%xmm2 + movups 16(%esi),%xmm3 + movups 32(%esi),%xmm4 + leal 48(%esi),%esi + xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + xorps %xmm7,%xmm4 + call _aesni_decrypt3 + xorps %xmm5,%xmm2 + xorps %xmm6,%xmm3 + xorps %xmm7,%xmm4 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + leal 48(%edi),%edi + movdqa %xmm7,%xmm1 + jmp .L069xts_dec_done +.align 16 +.L068xts_dec_four: + movaps %xmm1,%xmm6 + movups (%esi),%xmm2 + movups 16(%esi),%xmm3 + movups 32(%esi),%xmm4 + xorps (%esp),%xmm2 + movups 48(%esi),%xmm5 + leal 64(%esi),%esi + xorps 16(%esp),%xmm3 + xorps %xmm7,%xmm4 + xorps %xmm6,%xmm5 + call _aesni_decrypt4 + xorps (%esp),%xmm2 + xorps 16(%esp),%xmm3 + xorps %xmm7,%xmm4 + movups %xmm2,(%edi) + xorps %xmm6,%xmm5 + movups %xmm3,16(%edi) + movups %xmm4,32(%edi) + movups %xmm5,48(%edi) + leal 64(%edi),%edi + movdqa %xmm6,%xmm1 + jmp .L069xts_dec_done +.align 16 +.L064xts_dec_done6x: + movl 112(%esp),%eax + andl $15,%eax + jz .L071xts_dec_ret + movl %eax,112(%esp) + jmp .L072xts_dec_only_one_more +.align 16 +.L069xts_dec_done: + movl 112(%esp),%eax + pxor %xmm0,%xmm0 + andl $15,%eax + jz .L071xts_dec_ret + pcmpgtd %xmm1,%xmm0 + movl %eax,112(%esp) + pshufd $19,%xmm0,%xmm2 + pxor %xmm0,%xmm0 + movdqa 96(%esp),%xmm3 + paddq %xmm1,%xmm1 + pand %xmm3,%xmm2 + pcmpgtd %xmm1,%xmm0 + pxor %xmm2,%xmm1 +.L072xts_dec_only_one_more: + pshufd $19,%xmm0,%xmm5 + movdqa %xmm1,%xmm6 + paddq %xmm1,%xmm1 + pand %xmm3,%xmm5 + pxor %xmm1,%xmm5 + movl %ebp,%edx + movl %ebx,%ecx + movups (%esi),%xmm2 + xorps %xmm5,%xmm2 + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +.L073dec1_loop_13: + aesdec %xmm1,%xmm2 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz .L073dec1_loop_13 + aesdeclast %xmm1,%xmm2 + xorps 
%xmm5,%xmm2 + movups %xmm2,(%edi) +.L074xts_dec_steal: + movzbl 16(%esi),%ecx + movzbl (%edi),%edx + leal 1(%esi),%esi + movb %cl,(%edi) + movb %dl,16(%edi) + leal 1(%edi),%edi + subl $1,%eax + jnz .L074xts_dec_steal + subl 112(%esp),%edi + movl %ebp,%edx + movl %ebx,%ecx + movups (%edi),%xmm2 + xorps %xmm6,%xmm2 + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +.L075dec1_loop_14: + aesdec %xmm1,%xmm2 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz .L075dec1_loop_14 + aesdeclast %xmm1,%xmm2 + xorps %xmm6,%xmm2 + movups %xmm2,(%edi) +.L071xts_dec_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + movdqa %xmm0,(%esp) + pxor %xmm3,%xmm3 + movdqa %xmm0,16(%esp) + pxor %xmm4,%xmm4 + movdqa %xmm0,32(%esp) + pxor %xmm5,%xmm5 + movdqa %xmm0,48(%esp) + pxor %xmm6,%xmm6 + movdqa %xmm0,64(%esp) + pxor %xmm7,%xmm7 + movdqa %xmm0,80(%esp) + movl 116(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size aes_hw_xts_decrypt,.-.L_aes_hw_xts_decrypt_begin +.globl aes_hw_cbc_encrypt +.hidden aes_hw_cbc_encrypt +.type aes_hw_cbc_encrypt,@function +.align 16 +aes_hw_cbc_encrypt: +.L_aes_hw_cbc_encrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl %esp,%ebx + movl 24(%esp),%edi + subl $24,%ebx + movl 28(%esp),%eax + andl $-16,%ebx + movl 32(%esp),%edx + movl 36(%esp),%ebp + testl %eax,%eax + jz .L076cbc_abort + cmpl $0,40(%esp) + xchgl %esp,%ebx + movups (%ebp),%xmm7 + movl 240(%edx),%ecx + movl %edx,%ebp + movl %ebx,16(%esp) + movl %ecx,%ebx + je .L077cbc_decrypt + movaps %xmm7,%xmm2 + cmpl $16,%eax + jb .L078cbc_enc_tail + subl $16,%eax + jmp .L079cbc_enc_loop +.align 16 +.L079cbc_enc_loop: + movups (%esi),%xmm7 + leal 16(%esi),%esi + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + xorps %xmm0,%xmm7 + leal 32(%edx),%edx + xorps %xmm7,%xmm2 +.L080enc1_loop_15: + aesenc %xmm1,%xmm2 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz .L080enc1_loop_15 + 
aesenclast %xmm1,%xmm2 + movl %ebx,%ecx + movl %ebp,%edx + movups %xmm2,(%edi) + leal 16(%edi),%edi + subl $16,%eax + jnc .L079cbc_enc_loop + addl $16,%eax + jnz .L078cbc_enc_tail + movaps %xmm2,%xmm7 + pxor %xmm2,%xmm2 + jmp .L081cbc_ret +.L078cbc_enc_tail: + movl %eax,%ecx +.long 2767451785 + movl $16,%ecx + subl %eax,%ecx + xorl %eax,%eax +.long 2868115081 + leal -16(%edi),%edi + movl %ebx,%ecx + movl %edi,%esi + movl %ebp,%edx + jmp .L079cbc_enc_loop +.align 16 +.L077cbc_decrypt: + cmpl $80,%eax + jbe .L082cbc_dec_tail + movaps %xmm7,(%esp) + subl $80,%eax + jmp .L083cbc_dec_loop6_enter +.align 16 +.L084cbc_dec_loop6: + movaps %xmm0,(%esp) + movups %xmm7,(%edi) + leal 16(%edi),%edi +.L083cbc_dec_loop6_enter: + movdqu (%esi),%xmm2 + movdqu 16(%esi),%xmm3 + movdqu 32(%esi),%xmm4 + movdqu 48(%esi),%xmm5 + movdqu 64(%esi),%xmm6 + movdqu 80(%esi),%xmm7 + call _aesni_decrypt6 + movups (%esi),%xmm1 + movups 16(%esi),%xmm0 + xorps (%esp),%xmm2 + xorps %xmm1,%xmm3 + movups 32(%esi),%xmm1 + xorps %xmm0,%xmm4 + movups 48(%esi),%xmm0 + xorps %xmm1,%xmm5 + movups 64(%esi),%xmm1 + xorps %xmm0,%xmm6 + movups 80(%esi),%xmm0 + xorps %xmm1,%xmm7 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + leal 96(%esi),%esi + movups %xmm4,32(%edi) + movl %ebx,%ecx + movups %xmm5,48(%edi) + movl %ebp,%edx + movups %xmm6,64(%edi) + leal 80(%edi),%edi + subl $96,%eax + ja .L084cbc_dec_loop6 + movaps %xmm7,%xmm2 + movaps %xmm0,%xmm7 + addl $80,%eax + jle .L085cbc_dec_clear_tail_collected + movups %xmm2,(%edi) + leal 16(%edi),%edi +.L082cbc_dec_tail: + movups (%esi),%xmm2 + movaps %xmm2,%xmm6 + cmpl $16,%eax + jbe .L086cbc_dec_one + movups 16(%esi),%xmm3 + movaps %xmm3,%xmm5 + cmpl $32,%eax + jbe .L087cbc_dec_two + movups 32(%esi),%xmm4 + cmpl $48,%eax + jbe .L088cbc_dec_three + movups 48(%esi),%xmm5 + cmpl $64,%eax + jbe .L089cbc_dec_four + movups 64(%esi),%xmm6 + movaps %xmm7,(%esp) + movups (%esi),%xmm2 + xorps %xmm7,%xmm7 + call _aesni_decrypt6 + movups (%esi),%xmm1 + movups 16(%esi),%xmm0 + 
xorps (%esp),%xmm2 + xorps %xmm1,%xmm3 + movups 32(%esi),%xmm1 + xorps %xmm0,%xmm4 + movups 48(%esi),%xmm0 + xorps %xmm1,%xmm5 + movups 64(%esi),%xmm7 + xorps %xmm0,%xmm6 + movups %xmm2,(%edi) + movups %xmm3,16(%edi) + pxor %xmm3,%xmm3 + movups %xmm4,32(%edi) + pxor %xmm4,%xmm4 + movups %xmm5,48(%edi) + pxor %xmm5,%xmm5 + leal 64(%edi),%edi + movaps %xmm6,%xmm2 + pxor %xmm6,%xmm6 + subl $80,%eax + jmp .L090cbc_dec_tail_collected +.align 16 +.L086cbc_dec_one: + movups (%edx),%xmm0 + movups 16(%edx),%xmm1 + leal 32(%edx),%edx + xorps %xmm0,%xmm2 +.L091dec1_loop_16: + aesdec %xmm1,%xmm2 + decl %ecx + movups (%edx),%xmm1 + leal 16(%edx),%edx + jnz .L091dec1_loop_16 + aesdeclast %xmm1,%xmm2 + xorps %xmm7,%xmm2 + movaps %xmm6,%xmm7 + subl $16,%eax + jmp .L090cbc_dec_tail_collected +.align 16 +.L087cbc_dec_two: + call _aesni_decrypt2 + xorps %xmm7,%xmm2 + xorps %xmm6,%xmm3 + movups %xmm2,(%edi) + movaps %xmm3,%xmm2 + pxor %xmm3,%xmm3 + leal 16(%edi),%edi + movaps %xmm5,%xmm7 + subl $32,%eax + jmp .L090cbc_dec_tail_collected +.align 16 +.L088cbc_dec_three: + call _aesni_decrypt3 + xorps %xmm7,%xmm2 + xorps %xmm6,%xmm3 + xorps %xmm5,%xmm4 + movups %xmm2,(%edi) + movaps %xmm4,%xmm2 + pxor %xmm4,%xmm4 + movups %xmm3,16(%edi) + pxor %xmm3,%xmm3 + leal 32(%edi),%edi + movups 32(%esi),%xmm7 + subl $48,%eax + jmp .L090cbc_dec_tail_collected +.align 16 +.L089cbc_dec_four: + call _aesni_decrypt4 + movups 16(%esi),%xmm1 + movups 32(%esi),%xmm0 + xorps %xmm7,%xmm2 + movups 48(%esi),%xmm7 + xorps %xmm6,%xmm3 + movups %xmm2,(%edi) + xorps %xmm1,%xmm4 + movups %xmm3,16(%edi) + pxor %xmm3,%xmm3 + xorps %xmm0,%xmm5 + movups %xmm4,32(%edi) + pxor %xmm4,%xmm4 + leal 48(%edi),%edi + movaps %xmm5,%xmm2 + pxor %xmm5,%xmm5 + subl $64,%eax + jmp .L090cbc_dec_tail_collected +.align 16 +.L085cbc_dec_clear_tail_collected: + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 +.L090cbc_dec_tail_collected: + andl $15,%eax + jnz .L092cbc_dec_tail_partial + movups %xmm2,(%edi) + 
pxor %xmm0,%xmm0 + jmp .L081cbc_ret +.align 16 +.L092cbc_dec_tail_partial: + movaps %xmm2,(%esp) + pxor %xmm0,%xmm0 + movl $16,%ecx + movl %esp,%esi + subl %eax,%ecx +.long 2767451785 + movdqa %xmm2,(%esp) +.L081cbc_ret: + movl 16(%esp),%esp + movl 36(%esp),%ebp + pxor %xmm2,%xmm2 + pxor %xmm1,%xmm1 + movups %xmm7,(%ebp) + pxor %xmm7,%xmm7 +.L076cbc_abort: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size aes_hw_cbc_encrypt,.-.L_aes_hw_cbc_encrypt_begin +.globl aes_hw_set_encrypt_key_base +.hidden aes_hw_set_encrypt_key_base +.type aes_hw_set_encrypt_key_base,@function +.align 16 +aes_hw_set_encrypt_key_base: +.L_aes_hw_set_encrypt_key_base_begin: +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call .L093pic_for_function_hit +.L093pic_for_function_hit: + popl %ebx + leal BORINGSSL_function_hit+3-.L093pic_for_function_hit(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif + movl 4(%esp),%eax + movl 8(%esp),%ecx + movl 12(%esp),%edx + pushl %ebx + call .L094pic +.L094pic: + popl %ebx + leal .Lkey_const-.L094pic(%ebx),%ebx + movups (%eax),%xmm0 + xorps %xmm4,%xmm4 + leal 16(%edx),%edx + cmpl $256,%ecx + je .L09514rounds + cmpl $192,%ecx + je .L09612rounds + cmpl $128,%ecx + jne .L097bad_keybits +.align 16 +.L09810rounds: + movl $9,%ecx + movups %xmm0,-16(%edx) + aeskeygenassist $1,%xmm0,%xmm1 + call .L099key_128_cold + aeskeygenassist $2,%xmm0,%xmm1 + call .L100key_128 + aeskeygenassist $4,%xmm0,%xmm1 + call .L100key_128 + aeskeygenassist $8,%xmm0,%xmm1 + call .L100key_128 + aeskeygenassist $16,%xmm0,%xmm1 + call .L100key_128 + aeskeygenassist $32,%xmm0,%xmm1 + call .L100key_128 + aeskeygenassist $64,%xmm0,%xmm1 + call .L100key_128 + aeskeygenassist $128,%xmm0,%xmm1 + call .L100key_128 + aeskeygenassist $27,%xmm0,%xmm1 + call .L100key_128 + aeskeygenassist $54,%xmm0,%xmm1 + call .L100key_128 + movups %xmm0,(%edx) + movl %ecx,80(%edx) + jmp .L101good_key +.align 16 +.L100key_128: + movups %xmm0,(%edx) + leal 16(%edx),%edx 
+.L099key_128_cold: + shufps $16,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $255,%xmm1,%xmm1 + xorps %xmm1,%xmm0 + ret +.align 16 +.L09612rounds: + movq 16(%eax),%xmm2 + movl $11,%ecx + movups %xmm0,-16(%edx) + aeskeygenassist $1,%xmm2,%xmm1 + call .L102key_192a_cold + aeskeygenassist $2,%xmm2,%xmm1 + call .L103key_192b + aeskeygenassist $4,%xmm2,%xmm1 + call .L104key_192a + aeskeygenassist $8,%xmm2,%xmm1 + call .L103key_192b + aeskeygenassist $16,%xmm2,%xmm1 + call .L104key_192a + aeskeygenassist $32,%xmm2,%xmm1 + call .L103key_192b + aeskeygenassist $64,%xmm2,%xmm1 + call .L104key_192a + aeskeygenassist $128,%xmm2,%xmm1 + call .L103key_192b + movups %xmm0,(%edx) + movl %ecx,48(%edx) + jmp .L101good_key +.align 16 +.L104key_192a: + movups %xmm0,(%edx) + leal 16(%edx),%edx +.align 16 +.L102key_192a_cold: + movaps %xmm2,%xmm5 +.L105key_192b_warm: + shufps $16,%xmm0,%xmm4 + movdqa %xmm2,%xmm3 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + pslldq $4,%xmm3 + xorps %xmm4,%xmm0 + pshufd $85,%xmm1,%xmm1 + pxor %xmm3,%xmm2 + pxor %xmm1,%xmm0 + pshufd $255,%xmm0,%xmm3 + pxor %xmm3,%xmm2 + ret +.align 16 +.L103key_192b: + movaps %xmm0,%xmm3 + shufps $68,%xmm0,%xmm5 + movups %xmm5,(%edx) + shufps $78,%xmm2,%xmm3 + movups %xmm3,16(%edx) + leal 32(%edx),%edx + jmp .L105key_192b_warm +.align 16 +.L09514rounds: + movups 16(%eax),%xmm2 + leal 16(%edx),%edx + movl $13,%ecx + movups %xmm0,-32(%edx) + movups %xmm2,-16(%edx) + aeskeygenassist $1,%xmm2,%xmm1 + call .L106key_256a_cold + aeskeygenassist $1,%xmm0,%xmm1 + call .L107key_256b + aeskeygenassist $2,%xmm2,%xmm1 + call .L108key_256a + aeskeygenassist $2,%xmm0,%xmm1 + call .L107key_256b + aeskeygenassist $4,%xmm2,%xmm1 + call .L108key_256a + aeskeygenassist $4,%xmm0,%xmm1 + call .L107key_256b + aeskeygenassist $8,%xmm2,%xmm1 + call .L108key_256a + aeskeygenassist $8,%xmm0,%xmm1 + call .L107key_256b + aeskeygenassist $16,%xmm2,%xmm1 + call .L108key_256a + aeskeygenassist $16,%xmm0,%xmm1 
+ call .L107key_256b + aeskeygenassist $32,%xmm2,%xmm1 + call .L108key_256a + aeskeygenassist $32,%xmm0,%xmm1 + call .L107key_256b + aeskeygenassist $64,%xmm2,%xmm1 + call .L108key_256a + movups %xmm0,(%edx) + movl %ecx,16(%edx) + xorl %eax,%eax + jmp .L101good_key +.align 16 +.L108key_256a: + movups %xmm2,(%edx) + leal 16(%edx),%edx +.L106key_256a_cold: + shufps $16,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $255,%xmm1,%xmm1 + xorps %xmm1,%xmm0 + ret +.align 16 +.L107key_256b: + movups %xmm0,(%edx) + leal 16(%edx),%edx + shufps $16,%xmm2,%xmm4 + xorps %xmm4,%xmm2 + shufps $140,%xmm2,%xmm4 + xorps %xmm4,%xmm2 + shufps $170,%xmm1,%xmm1 + xorps %xmm1,%xmm2 + ret +.L101good_key: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + xorl %eax,%eax + popl %ebx + ret +.align 4 +.L097bad_keybits: + pxor %xmm0,%xmm0 + movl $-2,%eax + popl %ebx + ret +.size aes_hw_set_encrypt_key_base,.-.L_aes_hw_set_encrypt_key_base_begin +.globl aes_hw_set_encrypt_key_alt +.hidden aes_hw_set_encrypt_key_alt +.type aes_hw_set_encrypt_key_alt,@function +.align 16 +aes_hw_set_encrypt_key_alt: +.L_aes_hw_set_encrypt_key_alt_begin: +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call .L109pic_for_function_hit +.L109pic_for_function_hit: + popl %ebx + leal BORINGSSL_function_hit+3-.L109pic_for_function_hit(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif + movl 4(%esp),%eax + movl 8(%esp),%ecx + movl 12(%esp),%edx + pushl %ebx + call .L110pic +.L110pic: + popl %ebx + leal .Lkey_const-.L110pic(%ebx),%ebx + movups (%eax),%xmm0 + xorps %xmm4,%xmm4 + leal 16(%edx),%edx + cmpl $256,%ecx + je .L11114rounds_alt + cmpl $192,%ecx + je .L11212rounds_alt + cmpl $128,%ecx + jne .L113bad_keybits +.align 16 +.L11410rounds_alt: + movdqa (%ebx),%xmm5 + movl $8,%ecx + movdqa 32(%ebx),%xmm4 + movdqa %xmm0,%xmm2 + movdqu %xmm0,-16(%edx) +.L115loop_key128: + pshufb 
%xmm5,%xmm0 + aesenclast %xmm4,%xmm0 + pslld $1,%xmm4 + leal 16(%edx),%edx + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,-16(%edx) + movdqa %xmm0,%xmm2 + decl %ecx + jnz .L115loop_key128 + movdqa 48(%ebx),%xmm4 + pshufb %xmm5,%xmm0 + aesenclast %xmm4,%xmm0 + pslld $1,%xmm4 + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,(%edx) + movdqa %xmm0,%xmm2 + pshufb %xmm5,%xmm0 + aesenclast %xmm4,%xmm0 + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + pxor %xmm2,%xmm0 + movdqu %xmm0,16(%edx) + movl $9,%ecx + movl %ecx,96(%edx) + jmp .L116good_key +.align 16 +.L11212rounds_alt: + movq 16(%eax),%xmm2 + movdqa 16(%ebx),%xmm5 + movdqa 32(%ebx),%xmm4 + movl $8,%ecx + movdqu %xmm0,-16(%edx) +.L117loop_key192: + movq %xmm2,(%edx) + movdqa %xmm2,%xmm1 + pshufb %xmm5,%xmm2 + aesenclast %xmm4,%xmm2 + pslld $1,%xmm4 + leal 24(%edx),%edx + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + pshufd $255,%xmm0,%xmm3 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pxor %xmm2,%xmm0 + pxor %xmm3,%xmm2 + movdqu %xmm0,-16(%edx) + decl %ecx + jnz .L117loop_key192 + movl $11,%ecx + movl %ecx,32(%edx) + jmp .L116good_key +.align 16 +.L11114rounds_alt: + movups 16(%eax),%xmm2 + leal 16(%edx),%edx + movdqa (%ebx),%xmm5 + movdqa 32(%ebx),%xmm4 + movl $7,%ecx + movdqu %xmm0,-32(%edx) + movdqa %xmm2,%xmm1 + movdqu %xmm2,-16(%edx) +.L118loop_key256: + pshufb %xmm5,%xmm2 + aesenclast %xmm4,%xmm2 + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + pslld $1,%xmm4 + pxor %xmm2,%xmm0 + movdqu %xmm0,(%edx) + decl %ecx + jz 
.L119done_key256 + pshufd $255,%xmm0,%xmm2 + pxor %xmm3,%xmm3 + aesenclast %xmm3,%xmm2 + movdqa %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm3,%xmm1 + pxor %xmm1,%xmm2 + movdqu %xmm2,16(%edx) + leal 32(%edx),%edx + movdqa %xmm2,%xmm1 + jmp .L118loop_key256 +.L119done_key256: + movl $13,%ecx + movl %ecx,16(%edx) +.L116good_key: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + xorl %eax,%eax + popl %ebx + ret +.align 4 +.L113bad_keybits: + pxor %xmm0,%xmm0 + movl $-2,%eax + popl %ebx + ret +.size aes_hw_set_encrypt_key_alt,.-.L_aes_hw_set_encrypt_key_alt_begin +.globl aes_hw_encrypt_key_to_decrypt_key +.hidden aes_hw_encrypt_key_to_decrypt_key +.type aes_hw_encrypt_key_to_decrypt_key,@function +.align 16 +aes_hw_encrypt_key_to_decrypt_key: +.L_aes_hw_encrypt_key_to_decrypt_key_begin: + movl 4(%esp),%edx + movl 240(%edx),%ecx + shll $4,%ecx + leal 16(%edx,%ecx,1),%eax + movups (%edx),%xmm0 + movups (%eax),%xmm1 + movups %xmm0,(%eax) + movups %xmm1,(%edx) + leal 16(%edx),%edx + leal -16(%eax),%eax +.L120dec_key_inverse: + movups (%edx),%xmm0 + movups (%eax),%xmm1 + aesimc %xmm0,%xmm0 + aesimc %xmm1,%xmm1 + leal 16(%edx),%edx + leal -16(%eax),%eax + movups %xmm0,16(%eax) + movups %xmm1,-16(%edx) + cmpl %edx,%eax + ja .L120dec_key_inverse + movups (%edx),%xmm0 + aesimc %xmm0,%xmm0 + movups %xmm0,(%edx) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + ret +.size aes_hw_encrypt_key_to_decrypt_key,.-.L_aes_hw_encrypt_key_to_decrypt_key_begin +.align 64 +.Lkey_const: +.long 202313229,202313229,202313229,202313229 +.long 67569157,67569157,67569157,67569157 +.long 1,1,1,1 +.long 27,27,27,27 +.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69 +.byte 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83 +.byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 +.byte 115,108,46,111,114,103,62,0 +#endif // !defined(OPENSSL_NO_ASM) && 
defined(OPENSSL_X86) && defined(__ELF__) diff --git a/third_party/boringssl/gen/bcm/aesni-x86-win.asm b/third_party/boringssl/gen/bcm/aesni-x86-win.asm new file mode 100644 index 00000000..660b7728 --- /dev/null +++ b/third_party/boringssl/gen/bcm/aesni-x86-win.asm @@ -0,0 +1,2482 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_internal_x86_win_asm.inc" +%endif +%ifidn __OUTPUT_FORMAT__, win32 +%ifidn __OUTPUT_FORMAT__,obj +section code use32 class=code align=64 +%elifidn __OUTPUT_FORMAT__,win32 +$@feat.00 equ 1 +section .text code align=64 +%else +section .text code +%endif +%ifdef BORINGSSL_DISPATCH_TEST +extern _BORINGSSL_function_hit +%endif +global _aes_hw_encrypt +align 16 +_aes_hw_encrypt: +L$_aes_hw_encrypt_begin: +%ifdef BORINGSSL_DISPATCH_TEST + push ebx + push edx + call L$000pic_for_function_hit +L$000pic_for_function_hit: + pop ebx + lea ebx,[(_BORINGSSL_function_hit+1-L$000pic_for_function_hit)+ebx] + mov edx,1 + mov BYTE [ebx],dl + pop edx + pop ebx +%endif + mov eax,DWORD [4+esp] + mov edx,DWORD [12+esp] + movups xmm2,[eax] + mov ecx,DWORD [240+edx] + mov eax,DWORD [8+esp] + movups xmm0,[edx] + movups xmm1,[16+edx] + lea edx,[32+edx] + xorps xmm2,xmm0 +L$001enc1_loop_1: + aesenc xmm2,xmm1 + dec ecx + movups xmm1,[edx] + lea edx,[16+edx] + jnz NEAR L$001enc1_loop_1 + aesenclast xmm2,xmm1 + pxor xmm0,xmm0 + pxor xmm1,xmm1 + movups [eax],xmm2 + pxor xmm2,xmm2 + ret +global _aes_hw_decrypt +align 16 +_aes_hw_decrypt: +L$_aes_hw_decrypt_begin: + mov eax,DWORD [4+esp] + mov edx,DWORD [12+esp] + movups xmm2,[eax] + mov ecx,DWORD [240+edx] + mov eax,DWORD [8+esp] + movups xmm0,[edx] + movups xmm1,[16+edx] + lea edx,[32+edx] + xorps xmm2,xmm0 +L$002dec1_loop_2: + aesdec xmm2,xmm1 + dec ecx + movups xmm1,[edx] + lea edx,[16+edx] + jnz NEAR L$002dec1_loop_2 + aesdeclast xmm2,xmm1 + pxor xmm0,xmm0 + pxor xmm1,xmm1 + movups 
[eax],xmm2 + pxor xmm2,xmm2 + ret +align 16 +__aesni_encrypt2: + movups xmm0,[edx] + shl ecx,4 + movups xmm1,[16+edx] + xorps xmm2,xmm0 + pxor xmm3,xmm0 + movups xmm0,[32+edx] + lea edx,[32+ecx*1+edx] + neg ecx + add ecx,16 +L$003enc2_loop: + aesenc xmm2,xmm1 + aesenc xmm3,xmm1 + movups xmm1,[ecx*1+edx] + add ecx,32 + aesenc xmm2,xmm0 + aesenc xmm3,xmm0 + movups xmm0,[ecx*1+edx-16] + jnz NEAR L$003enc2_loop + aesenc xmm2,xmm1 + aesenc xmm3,xmm1 + aesenclast xmm2,xmm0 + aesenclast xmm3,xmm0 + ret +align 16 +__aesni_decrypt2: + movups xmm0,[edx] + shl ecx,4 + movups xmm1,[16+edx] + xorps xmm2,xmm0 + pxor xmm3,xmm0 + movups xmm0,[32+edx] + lea edx,[32+ecx*1+edx] + neg ecx + add ecx,16 +L$004dec2_loop: + aesdec xmm2,xmm1 + aesdec xmm3,xmm1 + movups xmm1,[ecx*1+edx] + add ecx,32 + aesdec xmm2,xmm0 + aesdec xmm3,xmm0 + movups xmm0,[ecx*1+edx-16] + jnz NEAR L$004dec2_loop + aesdec xmm2,xmm1 + aesdec xmm3,xmm1 + aesdeclast xmm2,xmm0 + aesdeclast xmm3,xmm0 + ret +align 16 +__aesni_encrypt3: + movups xmm0,[edx] + shl ecx,4 + movups xmm1,[16+edx] + xorps xmm2,xmm0 + pxor xmm3,xmm0 + pxor xmm4,xmm0 + movups xmm0,[32+edx] + lea edx,[32+ecx*1+edx] + neg ecx + add ecx,16 +L$005enc3_loop: + aesenc xmm2,xmm1 + aesenc xmm3,xmm1 + aesenc xmm4,xmm1 + movups xmm1,[ecx*1+edx] + add ecx,32 + aesenc xmm2,xmm0 + aesenc xmm3,xmm0 + aesenc xmm4,xmm0 + movups xmm0,[ecx*1+edx-16] + jnz NEAR L$005enc3_loop + aesenc xmm2,xmm1 + aesenc xmm3,xmm1 + aesenc xmm4,xmm1 + aesenclast xmm2,xmm0 + aesenclast xmm3,xmm0 + aesenclast xmm4,xmm0 + ret +align 16 +__aesni_decrypt3: + movups xmm0,[edx] + shl ecx,4 + movups xmm1,[16+edx] + xorps xmm2,xmm0 + pxor xmm3,xmm0 + pxor xmm4,xmm0 + movups xmm0,[32+edx] + lea edx,[32+ecx*1+edx] + neg ecx + add ecx,16 +L$006dec3_loop: + aesdec xmm2,xmm1 + aesdec xmm3,xmm1 + aesdec xmm4,xmm1 + movups xmm1,[ecx*1+edx] + add ecx,32 + aesdec xmm2,xmm0 + aesdec xmm3,xmm0 + aesdec xmm4,xmm0 + movups xmm0,[ecx*1+edx-16] + jnz NEAR L$006dec3_loop + aesdec xmm2,xmm1 + aesdec 
xmm3,xmm1 + aesdec xmm4,xmm1 + aesdeclast xmm2,xmm0 + aesdeclast xmm3,xmm0 + aesdeclast xmm4,xmm0 + ret +align 16 +__aesni_encrypt4: + movups xmm0,[edx] + movups xmm1,[16+edx] + shl ecx,4 + xorps xmm2,xmm0 + pxor xmm3,xmm0 + pxor xmm4,xmm0 + pxor xmm5,xmm0 + movups xmm0,[32+edx] + lea edx,[32+ecx*1+edx] + neg ecx +db 15,31,64,0 + add ecx,16 +L$007enc4_loop: + aesenc xmm2,xmm1 + aesenc xmm3,xmm1 + aesenc xmm4,xmm1 + aesenc xmm5,xmm1 + movups xmm1,[ecx*1+edx] + add ecx,32 + aesenc xmm2,xmm0 + aesenc xmm3,xmm0 + aesenc xmm4,xmm0 + aesenc xmm5,xmm0 + movups xmm0,[ecx*1+edx-16] + jnz NEAR L$007enc4_loop + aesenc xmm2,xmm1 + aesenc xmm3,xmm1 + aesenc xmm4,xmm1 + aesenc xmm5,xmm1 + aesenclast xmm2,xmm0 + aesenclast xmm3,xmm0 + aesenclast xmm4,xmm0 + aesenclast xmm5,xmm0 + ret +align 16 +__aesni_decrypt4: + movups xmm0,[edx] + movups xmm1,[16+edx] + shl ecx,4 + xorps xmm2,xmm0 + pxor xmm3,xmm0 + pxor xmm4,xmm0 + pxor xmm5,xmm0 + movups xmm0,[32+edx] + lea edx,[32+ecx*1+edx] + neg ecx +db 15,31,64,0 + add ecx,16 +L$008dec4_loop: + aesdec xmm2,xmm1 + aesdec xmm3,xmm1 + aesdec xmm4,xmm1 + aesdec xmm5,xmm1 + movups xmm1,[ecx*1+edx] + add ecx,32 + aesdec xmm2,xmm0 + aesdec xmm3,xmm0 + aesdec xmm4,xmm0 + aesdec xmm5,xmm0 + movups xmm0,[ecx*1+edx-16] + jnz NEAR L$008dec4_loop + aesdec xmm2,xmm1 + aesdec xmm3,xmm1 + aesdec xmm4,xmm1 + aesdec xmm5,xmm1 + aesdeclast xmm2,xmm0 + aesdeclast xmm3,xmm0 + aesdeclast xmm4,xmm0 + aesdeclast xmm5,xmm0 + ret +align 16 +__aesni_encrypt6: + movups xmm0,[edx] + shl ecx,4 + movups xmm1,[16+edx] + xorps xmm2,xmm0 + pxor xmm3,xmm0 + pxor xmm4,xmm0 + aesenc xmm2,xmm1 + pxor xmm5,xmm0 + pxor xmm6,xmm0 + aesenc xmm3,xmm1 + lea edx,[32+ecx*1+edx] + neg ecx + aesenc xmm4,xmm1 + pxor xmm7,xmm0 + movups xmm0,[ecx*1+edx] + add ecx,16 + jmp NEAR L$009_aesni_encrypt6_inner +align 16 +L$010enc6_loop: + aesenc xmm2,xmm1 + aesenc xmm3,xmm1 + aesenc xmm4,xmm1 +L$009_aesni_encrypt6_inner: + aesenc xmm5,xmm1 + aesenc xmm6,xmm1 + aesenc xmm7,xmm1 
+L$_aesni_encrypt6_enter: + movups xmm1,[ecx*1+edx] + add ecx,32 + aesenc xmm2,xmm0 + aesenc xmm3,xmm0 + aesenc xmm4,xmm0 + aesenc xmm5,xmm0 + aesenc xmm6,xmm0 + aesenc xmm7,xmm0 + movups xmm0,[ecx*1+edx-16] + jnz NEAR L$010enc6_loop + aesenc xmm2,xmm1 + aesenc xmm3,xmm1 + aesenc xmm4,xmm1 + aesenc xmm5,xmm1 + aesenc xmm6,xmm1 + aesenc xmm7,xmm1 + aesenclast xmm2,xmm0 + aesenclast xmm3,xmm0 + aesenclast xmm4,xmm0 + aesenclast xmm5,xmm0 + aesenclast xmm6,xmm0 + aesenclast xmm7,xmm0 + ret +align 16 +__aesni_decrypt6: + movups xmm0,[edx] + shl ecx,4 + movups xmm1,[16+edx] + xorps xmm2,xmm0 + pxor xmm3,xmm0 + pxor xmm4,xmm0 + aesdec xmm2,xmm1 + pxor xmm5,xmm0 + pxor xmm6,xmm0 + aesdec xmm3,xmm1 + lea edx,[32+ecx*1+edx] + neg ecx + aesdec xmm4,xmm1 + pxor xmm7,xmm0 + movups xmm0,[ecx*1+edx] + add ecx,16 + jmp NEAR L$011_aesni_decrypt6_inner +align 16 +L$012dec6_loop: + aesdec xmm2,xmm1 + aesdec xmm3,xmm1 + aesdec xmm4,xmm1 +L$011_aesni_decrypt6_inner: + aesdec xmm5,xmm1 + aesdec xmm6,xmm1 + aesdec xmm7,xmm1 +L$_aesni_decrypt6_enter: + movups xmm1,[ecx*1+edx] + add ecx,32 + aesdec xmm2,xmm0 + aesdec xmm3,xmm0 + aesdec xmm4,xmm0 + aesdec xmm5,xmm0 + aesdec xmm6,xmm0 + aesdec xmm7,xmm0 + movups xmm0,[ecx*1+edx-16] + jnz NEAR L$012dec6_loop + aesdec xmm2,xmm1 + aesdec xmm3,xmm1 + aesdec xmm4,xmm1 + aesdec xmm5,xmm1 + aesdec xmm6,xmm1 + aesdec xmm7,xmm1 + aesdeclast xmm2,xmm0 + aesdeclast xmm3,xmm0 + aesdeclast xmm4,xmm0 + aesdeclast xmm5,xmm0 + aesdeclast xmm6,xmm0 + aesdeclast xmm7,xmm0 + ret +global _aes_hw_ecb_encrypt +align 16 +_aes_hw_ecb_encrypt: +L$_aes_hw_ecb_encrypt_begin: + push ebp + push ebx + push esi + push edi + mov esi,DWORD [20+esp] + mov edi,DWORD [24+esp] + mov eax,DWORD [28+esp] + mov edx,DWORD [32+esp] + mov ebx,DWORD [36+esp] + and eax,-16 + jz NEAR L$013ecb_ret + mov ecx,DWORD [240+edx] + test ebx,ebx + jz NEAR L$014ecb_decrypt + mov ebp,edx + mov ebx,ecx + cmp eax,96 + jb NEAR L$015ecb_enc_tail + movdqu xmm2,[esi] + movdqu xmm3,[16+esi] + movdqu 
xmm4,[32+esi] + movdqu xmm5,[48+esi] + movdqu xmm6,[64+esi] + movdqu xmm7,[80+esi] + lea esi,[96+esi] + sub eax,96 + jmp NEAR L$016ecb_enc_loop6_enter +align 16 +L$017ecb_enc_loop6: + movups [edi],xmm2 + movdqu xmm2,[esi] + movups [16+edi],xmm3 + movdqu xmm3,[16+esi] + movups [32+edi],xmm4 + movdqu xmm4,[32+esi] + movups [48+edi],xmm5 + movdqu xmm5,[48+esi] + movups [64+edi],xmm6 + movdqu xmm6,[64+esi] + movups [80+edi],xmm7 + lea edi,[96+edi] + movdqu xmm7,[80+esi] + lea esi,[96+esi] +L$016ecb_enc_loop6_enter: + call __aesni_encrypt6 + mov edx,ebp + mov ecx,ebx + sub eax,96 + jnc NEAR L$017ecb_enc_loop6 + movups [edi],xmm2 + movups [16+edi],xmm3 + movups [32+edi],xmm4 + movups [48+edi],xmm5 + movups [64+edi],xmm6 + movups [80+edi],xmm7 + lea edi,[96+edi] + add eax,96 + jz NEAR L$013ecb_ret +L$015ecb_enc_tail: + movups xmm2,[esi] + cmp eax,32 + jb NEAR L$018ecb_enc_one + movups xmm3,[16+esi] + je NEAR L$019ecb_enc_two + movups xmm4,[32+esi] + cmp eax,64 + jb NEAR L$020ecb_enc_three + movups xmm5,[48+esi] + je NEAR L$021ecb_enc_four + movups xmm6,[64+esi] + xorps xmm7,xmm7 + call __aesni_encrypt6 + movups [edi],xmm2 + movups [16+edi],xmm3 + movups [32+edi],xmm4 + movups [48+edi],xmm5 + movups [64+edi],xmm6 + jmp NEAR L$013ecb_ret +align 16 +L$018ecb_enc_one: + movups xmm0,[edx] + movups xmm1,[16+edx] + lea edx,[32+edx] + xorps xmm2,xmm0 +L$022enc1_loop_3: + aesenc xmm2,xmm1 + dec ecx + movups xmm1,[edx] + lea edx,[16+edx] + jnz NEAR L$022enc1_loop_3 + aesenclast xmm2,xmm1 + movups [edi],xmm2 + jmp NEAR L$013ecb_ret +align 16 +L$019ecb_enc_two: + call __aesni_encrypt2 + movups [edi],xmm2 + movups [16+edi],xmm3 + jmp NEAR L$013ecb_ret +align 16 +L$020ecb_enc_three: + call __aesni_encrypt3 + movups [edi],xmm2 + movups [16+edi],xmm3 + movups [32+edi],xmm4 + jmp NEAR L$013ecb_ret +align 16 +L$021ecb_enc_four: + call __aesni_encrypt4 + movups [edi],xmm2 + movups [16+edi],xmm3 + movups [32+edi],xmm4 + movups [48+edi],xmm5 + jmp NEAR L$013ecb_ret +align 16 
+L$014ecb_decrypt: + mov ebp,edx + mov ebx,ecx + cmp eax,96 + jb NEAR L$023ecb_dec_tail + movdqu xmm2,[esi] + movdqu xmm3,[16+esi] + movdqu xmm4,[32+esi] + movdqu xmm5,[48+esi] + movdqu xmm6,[64+esi] + movdqu xmm7,[80+esi] + lea esi,[96+esi] + sub eax,96 + jmp NEAR L$024ecb_dec_loop6_enter +align 16 +L$025ecb_dec_loop6: + movups [edi],xmm2 + movdqu xmm2,[esi] + movups [16+edi],xmm3 + movdqu xmm3,[16+esi] + movups [32+edi],xmm4 + movdqu xmm4,[32+esi] + movups [48+edi],xmm5 + movdqu xmm5,[48+esi] + movups [64+edi],xmm6 + movdqu xmm6,[64+esi] + movups [80+edi],xmm7 + lea edi,[96+edi] + movdqu xmm7,[80+esi] + lea esi,[96+esi] +L$024ecb_dec_loop6_enter: + call __aesni_decrypt6 + mov edx,ebp + mov ecx,ebx + sub eax,96 + jnc NEAR L$025ecb_dec_loop6 + movups [edi],xmm2 + movups [16+edi],xmm3 + movups [32+edi],xmm4 + movups [48+edi],xmm5 + movups [64+edi],xmm6 + movups [80+edi],xmm7 + lea edi,[96+edi] + add eax,96 + jz NEAR L$013ecb_ret +L$023ecb_dec_tail: + movups xmm2,[esi] + cmp eax,32 + jb NEAR L$026ecb_dec_one + movups xmm3,[16+esi] + je NEAR L$027ecb_dec_two + movups xmm4,[32+esi] + cmp eax,64 + jb NEAR L$028ecb_dec_three + movups xmm5,[48+esi] + je NEAR L$029ecb_dec_four + movups xmm6,[64+esi] + xorps xmm7,xmm7 + call __aesni_decrypt6 + movups [edi],xmm2 + movups [16+edi],xmm3 + movups [32+edi],xmm4 + movups [48+edi],xmm5 + movups [64+edi],xmm6 + jmp NEAR L$013ecb_ret +align 16 +L$026ecb_dec_one: + movups xmm0,[edx] + movups xmm1,[16+edx] + lea edx,[32+edx] + xorps xmm2,xmm0 +L$030dec1_loop_4: + aesdec xmm2,xmm1 + dec ecx + movups xmm1,[edx] + lea edx,[16+edx] + jnz NEAR L$030dec1_loop_4 + aesdeclast xmm2,xmm1 + movups [edi],xmm2 + jmp NEAR L$013ecb_ret +align 16 +L$027ecb_dec_two: + call __aesni_decrypt2 + movups [edi],xmm2 + movups [16+edi],xmm3 + jmp NEAR L$013ecb_ret +align 16 +L$028ecb_dec_three: + call __aesni_decrypt3 + movups [edi],xmm2 + movups [16+edi],xmm3 + movups [32+edi],xmm4 + jmp NEAR L$013ecb_ret +align 16 +L$029ecb_dec_four: + call __aesni_decrypt4 
+ movups [edi],xmm2 + movups [16+edi],xmm3 + movups [32+edi],xmm4 + movups [48+edi],xmm5 +L$013ecb_ret: + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + pxor xmm6,xmm6 + pxor xmm7,xmm7 + pop edi + pop esi + pop ebx + pop ebp + ret +global _aes_hw_ccm64_encrypt_blocks +align 16 +_aes_hw_ccm64_encrypt_blocks: +L$_aes_hw_ccm64_encrypt_blocks_begin: + push ebp + push ebx + push esi + push edi + mov esi,DWORD [20+esp] + mov edi,DWORD [24+esp] + mov eax,DWORD [28+esp] + mov edx,DWORD [32+esp] + mov ebx,DWORD [36+esp] + mov ecx,DWORD [40+esp] + mov ebp,esp + sub esp,60 + and esp,-16 + mov DWORD [48+esp],ebp + movdqu xmm7,[ebx] + movdqu xmm3,[ecx] + mov ecx,DWORD [240+edx] + mov DWORD [esp],202182159 + mov DWORD [4+esp],134810123 + mov DWORD [8+esp],67438087 + mov DWORD [12+esp],66051 + mov ebx,1 + xor ebp,ebp + mov DWORD [16+esp],ebx + mov DWORD [20+esp],ebp + mov DWORD [24+esp],ebp + mov DWORD [28+esp],ebp + shl ecx,4 + mov ebx,16 + lea ebp,[edx] + movdqa xmm5,[esp] + movdqa xmm2,xmm7 + lea edx,[32+ecx*1+edx] + sub ebx,ecx + pshufb xmm7,xmm5 +L$031ccm64_enc_outer: + movups xmm0,[ebp] + mov ecx,ebx + movups xmm6,[esi] + xorps xmm2,xmm0 + movups xmm1,[16+ebp] + xorps xmm0,xmm6 + xorps xmm3,xmm0 + movups xmm0,[32+ebp] +L$032ccm64_enc2_loop: + aesenc xmm2,xmm1 + aesenc xmm3,xmm1 + movups xmm1,[ecx*1+edx] + add ecx,32 + aesenc xmm2,xmm0 + aesenc xmm3,xmm0 + movups xmm0,[ecx*1+edx-16] + jnz NEAR L$032ccm64_enc2_loop + aesenc xmm2,xmm1 + aesenc xmm3,xmm1 + paddq xmm7,[16+esp] + dec eax + aesenclast xmm2,xmm0 + aesenclast xmm3,xmm0 + lea esi,[16+esi] + xorps xmm6,xmm2 + movdqa xmm2,xmm7 + movups [edi],xmm6 + pshufb xmm2,xmm5 + lea edi,[16+edi] + jnz NEAR L$031ccm64_enc_outer + mov esp,DWORD [48+esp] + mov edi,DWORD [40+esp] + movups [edi],xmm3 + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + pxor xmm6,xmm6 + pxor xmm7,xmm7 + pop edi + pop esi + pop ebx + pop ebp + ret 
+global _aes_hw_ccm64_decrypt_blocks +align 16 +_aes_hw_ccm64_decrypt_blocks: +L$_aes_hw_ccm64_decrypt_blocks_begin: + push ebp + push ebx + push esi + push edi + mov esi,DWORD [20+esp] + mov edi,DWORD [24+esp] + mov eax,DWORD [28+esp] + mov edx,DWORD [32+esp] + mov ebx,DWORD [36+esp] + mov ecx,DWORD [40+esp] + mov ebp,esp + sub esp,60 + and esp,-16 + mov DWORD [48+esp],ebp + movdqu xmm7,[ebx] + movdqu xmm3,[ecx] + mov ecx,DWORD [240+edx] + mov DWORD [esp],202182159 + mov DWORD [4+esp],134810123 + mov DWORD [8+esp],67438087 + mov DWORD [12+esp],66051 + mov ebx,1 + xor ebp,ebp + mov DWORD [16+esp],ebx + mov DWORD [20+esp],ebp + mov DWORD [24+esp],ebp + mov DWORD [28+esp],ebp + movdqa xmm5,[esp] + movdqa xmm2,xmm7 + mov ebp,edx + mov ebx,ecx + pshufb xmm7,xmm5 + movups xmm0,[edx] + movups xmm1,[16+edx] + lea edx,[32+edx] + xorps xmm2,xmm0 +L$033enc1_loop_5: + aesenc xmm2,xmm1 + dec ecx + movups xmm1,[edx] + lea edx,[16+edx] + jnz NEAR L$033enc1_loop_5 + aesenclast xmm2,xmm1 + shl ebx,4 + mov ecx,16 + movups xmm6,[esi] + paddq xmm7,[16+esp] + lea esi,[16+esi] + sub ecx,ebx + lea edx,[32+ebx*1+ebp] + mov ebx,ecx + jmp NEAR L$034ccm64_dec_outer +align 16 +L$034ccm64_dec_outer: + xorps xmm6,xmm2 + movdqa xmm2,xmm7 + movups [edi],xmm6 + lea edi,[16+edi] + pshufb xmm2,xmm5 + sub eax,1 + jz NEAR L$035ccm64_dec_break + movups xmm0,[ebp] + mov ecx,ebx + movups xmm1,[16+ebp] + xorps xmm6,xmm0 + xorps xmm2,xmm0 + xorps xmm3,xmm6 + movups xmm0,[32+ebp] +L$036ccm64_dec2_loop: + aesenc xmm2,xmm1 + aesenc xmm3,xmm1 + movups xmm1,[ecx*1+edx] + add ecx,32 + aesenc xmm2,xmm0 + aesenc xmm3,xmm0 + movups xmm0,[ecx*1+edx-16] + jnz NEAR L$036ccm64_dec2_loop + movups xmm6,[esi] + paddq xmm7,[16+esp] + aesenc xmm2,xmm1 + aesenc xmm3,xmm1 + aesenclast xmm2,xmm0 + aesenclast xmm3,xmm0 + lea esi,[16+esi] + jmp NEAR L$034ccm64_dec_outer +align 16 +L$035ccm64_dec_break: + mov ecx,DWORD [240+ebp] + mov edx,ebp + movups xmm0,[edx] + movups xmm1,[16+edx] + xorps xmm6,xmm0 + lea edx,[32+edx] + xorps 
xmm3,xmm6 +L$037enc1_loop_6: + aesenc xmm3,xmm1 + dec ecx + movups xmm1,[edx] + lea edx,[16+edx] + jnz NEAR L$037enc1_loop_6 + aesenclast xmm3,xmm1 + mov esp,DWORD [48+esp] + mov edi,DWORD [40+esp] + movups [edi],xmm3 + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + pxor xmm6,xmm6 + pxor xmm7,xmm7 + pop edi + pop esi + pop ebx + pop ebp + ret +global _aes_hw_ctr32_encrypt_blocks +align 16 +_aes_hw_ctr32_encrypt_blocks: +L$_aes_hw_ctr32_encrypt_blocks_begin: + push ebp + push ebx + push esi + push edi +%ifdef BORINGSSL_DISPATCH_TEST + push ebx + push edx + call L$038pic_for_function_hit +L$038pic_for_function_hit: + pop ebx + lea ebx,[(_BORINGSSL_function_hit+0-L$038pic_for_function_hit)+ebx] + mov edx,1 + mov BYTE [ebx],dl + pop edx + pop ebx +%endif + mov esi,DWORD [20+esp] + mov edi,DWORD [24+esp] + mov eax,DWORD [28+esp] + mov edx,DWORD [32+esp] + mov ebx,DWORD [36+esp] + mov ebp,esp + sub esp,88 + and esp,-16 + mov DWORD [80+esp],ebp + cmp eax,1 + je NEAR L$039ctr32_one_shortcut + movdqu xmm7,[ebx] + mov DWORD [esp],202182159 + mov DWORD [4+esp],134810123 + mov DWORD [8+esp],67438087 + mov DWORD [12+esp],66051 + mov ecx,6 + xor ebp,ebp + mov DWORD [16+esp],ecx + mov DWORD [20+esp],ecx + mov DWORD [24+esp],ecx + mov DWORD [28+esp],ebp + pextrd ebx,xmm7,3 + pinsrd xmm7,ebp,3 + mov ecx,DWORD [240+edx] + bswap ebx + pxor xmm0,xmm0 + pxor xmm1,xmm1 + movdqa xmm2,[esp] + pinsrd xmm0,ebx,0 + lea ebp,[3+ebx] + pinsrd xmm1,ebp,0 + inc ebx + pinsrd xmm0,ebx,1 + inc ebp + pinsrd xmm1,ebp,1 + inc ebx + pinsrd xmm0,ebx,2 + inc ebp + pinsrd xmm1,ebp,2 + movdqa [48+esp],xmm0 + pshufb xmm0,xmm2 + movdqu xmm6,[edx] + movdqa [64+esp],xmm1 + pshufb xmm1,xmm2 + pshufd xmm2,xmm0,192 + pshufd xmm3,xmm0,128 + cmp eax,6 + jb NEAR L$040ctr32_tail + pxor xmm7,xmm6 + shl ecx,4 + mov ebx,16 + movdqa [32+esp],xmm7 + mov ebp,edx + sub ebx,ecx + lea edx,[32+ecx*1+edx] + sub eax,6 + jmp NEAR L$041ctr32_loop6 +align 16 +L$041ctr32_loop6: + 
pshufd xmm4,xmm0,64 + movdqa xmm0,[32+esp] + pshufd xmm5,xmm1,192 + pxor xmm2,xmm0 + pshufd xmm6,xmm1,128 + pxor xmm3,xmm0 + pshufd xmm7,xmm1,64 + movups xmm1,[16+ebp] + pxor xmm4,xmm0 + pxor xmm5,xmm0 + aesenc xmm2,xmm1 + pxor xmm6,xmm0 + pxor xmm7,xmm0 + aesenc xmm3,xmm1 + movups xmm0,[32+ebp] + mov ecx,ebx + aesenc xmm4,xmm1 + aesenc xmm5,xmm1 + aesenc xmm6,xmm1 + aesenc xmm7,xmm1 + call L$_aesni_encrypt6_enter + movups xmm1,[esi] + movups xmm0,[16+esi] + xorps xmm2,xmm1 + movups xmm1,[32+esi] + xorps xmm3,xmm0 + movups [edi],xmm2 + movdqa xmm0,[16+esp] + xorps xmm4,xmm1 + movdqa xmm1,[64+esp] + movups [16+edi],xmm3 + movups [32+edi],xmm4 + paddd xmm1,xmm0 + paddd xmm0,[48+esp] + movdqa xmm2,[esp] + movups xmm3,[48+esi] + movups xmm4,[64+esi] + xorps xmm5,xmm3 + movups xmm3,[80+esi] + lea esi,[96+esi] + movdqa [48+esp],xmm0 + pshufb xmm0,xmm2 + xorps xmm6,xmm4 + movups [48+edi],xmm5 + xorps xmm7,xmm3 + movdqa [64+esp],xmm1 + pshufb xmm1,xmm2 + movups [64+edi],xmm6 + pshufd xmm2,xmm0,192 + movups [80+edi],xmm7 + lea edi,[96+edi] + pshufd xmm3,xmm0,128 + sub eax,6 + jnc NEAR L$041ctr32_loop6 + add eax,6 + jz NEAR L$042ctr32_ret + movdqu xmm7,[ebp] + mov edx,ebp + pxor xmm7,[32+esp] + mov ecx,DWORD [240+ebp] +L$040ctr32_tail: + por xmm2,xmm7 + cmp eax,2 + jb NEAR L$043ctr32_one + pshufd xmm4,xmm0,64 + por xmm3,xmm7 + je NEAR L$044ctr32_two + pshufd xmm5,xmm1,192 + por xmm4,xmm7 + cmp eax,4 + jb NEAR L$045ctr32_three + pshufd xmm6,xmm1,128 + por xmm5,xmm7 + je NEAR L$046ctr32_four + por xmm6,xmm7 + call __aesni_encrypt6 + movups xmm1,[esi] + movups xmm0,[16+esi] + xorps xmm2,xmm1 + movups xmm1,[32+esi] + xorps xmm3,xmm0 + movups xmm0,[48+esi] + xorps xmm4,xmm1 + movups xmm1,[64+esi] + xorps xmm5,xmm0 + movups [edi],xmm2 + xorps xmm6,xmm1 + movups [16+edi],xmm3 + movups [32+edi],xmm4 + movups [48+edi],xmm5 + movups [64+edi],xmm6 + jmp NEAR L$042ctr32_ret +align 16 +L$039ctr32_one_shortcut: + movups xmm2,[ebx] + mov ecx,DWORD [240+edx] +L$043ctr32_one: + movups 
xmm0,[edx] + movups xmm1,[16+edx] + lea edx,[32+edx] + xorps xmm2,xmm0 +L$047enc1_loop_7: + aesenc xmm2,xmm1 + dec ecx + movups xmm1,[edx] + lea edx,[16+edx] + jnz NEAR L$047enc1_loop_7 + aesenclast xmm2,xmm1 + movups xmm6,[esi] + xorps xmm6,xmm2 + movups [edi],xmm6 + jmp NEAR L$042ctr32_ret +align 16 +L$044ctr32_two: + call __aesni_encrypt2 + movups xmm5,[esi] + movups xmm6,[16+esi] + xorps xmm2,xmm5 + xorps xmm3,xmm6 + movups [edi],xmm2 + movups [16+edi],xmm3 + jmp NEAR L$042ctr32_ret +align 16 +L$045ctr32_three: + call __aesni_encrypt3 + movups xmm5,[esi] + movups xmm6,[16+esi] + xorps xmm2,xmm5 + movups xmm7,[32+esi] + xorps xmm3,xmm6 + movups [edi],xmm2 + xorps xmm4,xmm7 + movups [16+edi],xmm3 + movups [32+edi],xmm4 + jmp NEAR L$042ctr32_ret +align 16 +L$046ctr32_four: + call __aesni_encrypt4 + movups xmm6,[esi] + movups xmm7,[16+esi] + movups xmm1,[32+esi] + xorps xmm2,xmm6 + movups xmm0,[48+esi] + xorps xmm3,xmm7 + movups [edi],xmm2 + xorps xmm4,xmm1 + movups [16+edi],xmm3 + xorps xmm5,xmm0 + movups [32+edi],xmm4 + movups [48+edi],xmm5 +L$042ctr32_ret: + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + movdqa [32+esp],xmm0 + pxor xmm5,xmm5 + movdqa [48+esp],xmm0 + pxor xmm6,xmm6 + movdqa [64+esp],xmm0 + pxor xmm7,xmm7 + mov esp,DWORD [80+esp] + pop edi + pop esi + pop ebx + pop ebp + ret +global _aes_hw_xts_encrypt +align 16 +_aes_hw_xts_encrypt: +L$_aes_hw_xts_encrypt_begin: + push ebp + push ebx + push esi + push edi + mov edx,DWORD [36+esp] + mov esi,DWORD [40+esp] + mov ecx,DWORD [240+edx] + movups xmm2,[esi] + movups xmm0,[edx] + movups xmm1,[16+edx] + lea edx,[32+edx] + xorps xmm2,xmm0 +L$048enc1_loop_8: + aesenc xmm2,xmm1 + dec ecx + movups xmm1,[edx] + lea edx,[16+edx] + jnz NEAR L$048enc1_loop_8 + aesenclast xmm2,xmm1 + mov esi,DWORD [20+esp] + mov edi,DWORD [24+esp] + mov eax,DWORD [28+esp] + mov edx,DWORD [32+esp] + mov ebp,esp + sub esp,120 + mov ecx,DWORD [240+edx] + and esp,-16 + mov DWORD [96+esp],135 + mov 
DWORD [100+esp],0 + mov DWORD [104+esp],1 + mov DWORD [108+esp],0 + mov DWORD [112+esp],eax + mov DWORD [116+esp],ebp + movdqa xmm1,xmm2 + pxor xmm0,xmm0 + movdqa xmm3,[96+esp] + pcmpgtd xmm0,xmm1 + and eax,-16 + mov ebp,edx + mov ebx,ecx + sub eax,96 + jc NEAR L$049xts_enc_short + shl ecx,4 + mov ebx,16 + sub ebx,ecx + lea edx,[32+ecx*1+edx] + jmp NEAR L$050xts_enc_loop6 +align 16 +L$050xts_enc_loop6: + pshufd xmm2,xmm0,19 + pxor xmm0,xmm0 + movdqa [esp],xmm1 + paddq xmm1,xmm1 + pand xmm2,xmm3 + pcmpgtd xmm0,xmm1 + pxor xmm1,xmm2 + pshufd xmm2,xmm0,19 + pxor xmm0,xmm0 + movdqa [16+esp],xmm1 + paddq xmm1,xmm1 + pand xmm2,xmm3 + pcmpgtd xmm0,xmm1 + pxor xmm1,xmm2 + pshufd xmm2,xmm0,19 + pxor xmm0,xmm0 + movdqa [32+esp],xmm1 + paddq xmm1,xmm1 + pand xmm2,xmm3 + pcmpgtd xmm0,xmm1 + pxor xmm1,xmm2 + pshufd xmm2,xmm0,19 + pxor xmm0,xmm0 + movdqa [48+esp],xmm1 + paddq xmm1,xmm1 + pand xmm2,xmm3 + pcmpgtd xmm0,xmm1 + pxor xmm1,xmm2 + pshufd xmm7,xmm0,19 + movdqa [64+esp],xmm1 + paddq xmm1,xmm1 + movups xmm0,[ebp] + pand xmm7,xmm3 + movups xmm2,[esi] + pxor xmm7,xmm1 + mov ecx,ebx + movdqu xmm3,[16+esi] + xorps xmm2,xmm0 + movdqu xmm4,[32+esi] + pxor xmm3,xmm0 + movdqu xmm5,[48+esi] + pxor xmm4,xmm0 + movdqu xmm6,[64+esi] + pxor xmm5,xmm0 + movdqu xmm1,[80+esi] + pxor xmm6,xmm0 + lea esi,[96+esi] + pxor xmm2,[esp] + movdqa [80+esp],xmm7 + pxor xmm7,xmm1 + movups xmm1,[16+ebp] + pxor xmm3,[16+esp] + pxor xmm4,[32+esp] + aesenc xmm2,xmm1 + pxor xmm5,[48+esp] + pxor xmm6,[64+esp] + aesenc xmm3,xmm1 + pxor xmm7,xmm0 + movups xmm0,[32+ebp] + aesenc xmm4,xmm1 + aesenc xmm5,xmm1 + aesenc xmm6,xmm1 + aesenc xmm7,xmm1 + call L$_aesni_encrypt6_enter + movdqa xmm1,[80+esp] + pxor xmm0,xmm0 + xorps xmm2,[esp] + pcmpgtd xmm0,xmm1 + xorps xmm3,[16+esp] + movups [edi],xmm2 + xorps xmm4,[32+esp] + movups [16+edi],xmm3 + xorps xmm5,[48+esp] + movups [32+edi],xmm4 + xorps xmm6,[64+esp] + movups [48+edi],xmm5 + xorps xmm7,xmm1 + movups [64+edi],xmm6 + pshufd xmm2,xmm0,19 + movups 
[80+edi],xmm7 + lea edi,[96+edi] + movdqa xmm3,[96+esp] + pxor xmm0,xmm0 + paddq xmm1,xmm1 + pand xmm2,xmm3 + pcmpgtd xmm0,xmm1 + pxor xmm1,xmm2 + sub eax,96 + jnc NEAR L$050xts_enc_loop6 + mov ecx,DWORD [240+ebp] + mov edx,ebp + mov ebx,ecx +L$049xts_enc_short: + add eax,96 + jz NEAR L$051xts_enc_done6x + movdqa xmm5,xmm1 + cmp eax,32 + jb NEAR L$052xts_enc_one + pshufd xmm2,xmm0,19 + pxor xmm0,xmm0 + paddq xmm1,xmm1 + pand xmm2,xmm3 + pcmpgtd xmm0,xmm1 + pxor xmm1,xmm2 + je NEAR L$053xts_enc_two + pshufd xmm2,xmm0,19 + pxor xmm0,xmm0 + movdqa xmm6,xmm1 + paddq xmm1,xmm1 + pand xmm2,xmm3 + pcmpgtd xmm0,xmm1 + pxor xmm1,xmm2 + cmp eax,64 + jb NEAR L$054xts_enc_three + pshufd xmm2,xmm0,19 + pxor xmm0,xmm0 + movdqa xmm7,xmm1 + paddq xmm1,xmm1 + pand xmm2,xmm3 + pcmpgtd xmm0,xmm1 + pxor xmm1,xmm2 + movdqa [esp],xmm5 + movdqa [16+esp],xmm6 + je NEAR L$055xts_enc_four + movdqa [32+esp],xmm7 + pshufd xmm7,xmm0,19 + movdqa [48+esp],xmm1 + paddq xmm1,xmm1 + pand xmm7,xmm3 + pxor xmm7,xmm1 + movdqu xmm2,[esi] + movdqu xmm3,[16+esi] + movdqu xmm4,[32+esi] + pxor xmm2,[esp] + movdqu xmm5,[48+esi] + pxor xmm3,[16+esp] + movdqu xmm6,[64+esi] + pxor xmm4,[32+esp] + lea esi,[80+esi] + pxor xmm5,[48+esp] + movdqa [64+esp],xmm7 + pxor xmm6,xmm7 + call __aesni_encrypt6 + movaps xmm1,[64+esp] + xorps xmm2,[esp] + xorps xmm3,[16+esp] + xorps xmm4,[32+esp] + movups [edi],xmm2 + xorps xmm5,[48+esp] + movups [16+edi],xmm3 + xorps xmm6,xmm1 + movups [32+edi],xmm4 + movups [48+edi],xmm5 + movups [64+edi],xmm6 + lea edi,[80+edi] + jmp NEAR L$056xts_enc_done +align 16 +L$052xts_enc_one: + movups xmm2,[esi] + lea esi,[16+esi] + xorps xmm2,xmm5 + movups xmm0,[edx] + movups xmm1,[16+edx] + lea edx,[32+edx] + xorps xmm2,xmm0 +L$057enc1_loop_9: + aesenc xmm2,xmm1 + dec ecx + movups xmm1,[edx] + lea edx,[16+edx] + jnz NEAR L$057enc1_loop_9 + aesenclast xmm2,xmm1 + xorps xmm2,xmm5 + movups [edi],xmm2 + lea edi,[16+edi] + movdqa xmm1,xmm5 + jmp NEAR L$056xts_enc_done +align 16 +L$053xts_enc_two: + 
movaps xmm6,xmm1 + movups xmm2,[esi] + movups xmm3,[16+esi] + lea esi,[32+esi] + xorps xmm2,xmm5 + xorps xmm3,xmm6 + call __aesni_encrypt2 + xorps xmm2,xmm5 + xorps xmm3,xmm6 + movups [edi],xmm2 + movups [16+edi],xmm3 + lea edi,[32+edi] + movdqa xmm1,xmm6 + jmp NEAR L$056xts_enc_done +align 16 +L$054xts_enc_three: + movaps xmm7,xmm1 + movups xmm2,[esi] + movups xmm3,[16+esi] + movups xmm4,[32+esi] + lea esi,[48+esi] + xorps xmm2,xmm5 + xorps xmm3,xmm6 + xorps xmm4,xmm7 + call __aesni_encrypt3 + xorps xmm2,xmm5 + xorps xmm3,xmm6 + xorps xmm4,xmm7 + movups [edi],xmm2 + movups [16+edi],xmm3 + movups [32+edi],xmm4 + lea edi,[48+edi] + movdqa xmm1,xmm7 + jmp NEAR L$056xts_enc_done +align 16 +L$055xts_enc_four: + movaps xmm6,xmm1 + movups xmm2,[esi] + movups xmm3,[16+esi] + movups xmm4,[32+esi] + xorps xmm2,[esp] + movups xmm5,[48+esi] + lea esi,[64+esi] + xorps xmm3,[16+esp] + xorps xmm4,xmm7 + xorps xmm5,xmm6 + call __aesni_encrypt4 + xorps xmm2,[esp] + xorps xmm3,[16+esp] + xorps xmm4,xmm7 + movups [edi],xmm2 + xorps xmm5,xmm6 + movups [16+edi],xmm3 + movups [32+edi],xmm4 + movups [48+edi],xmm5 + lea edi,[64+edi] + movdqa xmm1,xmm6 + jmp NEAR L$056xts_enc_done +align 16 +L$051xts_enc_done6x: + mov eax,DWORD [112+esp] + and eax,15 + jz NEAR L$058xts_enc_ret + movdqa xmm5,xmm1 + mov DWORD [112+esp],eax + jmp NEAR L$059xts_enc_steal +align 16 +L$056xts_enc_done: + mov eax,DWORD [112+esp] + pxor xmm0,xmm0 + and eax,15 + jz NEAR L$058xts_enc_ret + pcmpgtd xmm0,xmm1 + mov DWORD [112+esp],eax + pshufd xmm5,xmm0,19 + paddq xmm1,xmm1 + pand xmm5,[96+esp] + pxor xmm5,xmm1 +L$059xts_enc_steal: + movzx ecx,BYTE [esi] + movzx edx,BYTE [edi-16] + lea esi,[1+esi] + mov BYTE [edi-16],cl + mov BYTE [edi],dl + lea edi,[1+edi] + sub eax,1 + jnz NEAR L$059xts_enc_steal + sub edi,DWORD [112+esp] + mov edx,ebp + mov ecx,ebx + movups xmm2,[edi-16] + xorps xmm2,xmm5 + movups xmm0,[edx] + movups xmm1,[16+edx] + lea edx,[32+edx] + xorps xmm2,xmm0 +L$060enc1_loop_10: + aesenc xmm2,xmm1 + dec 
ecx + movups xmm1,[edx] + lea edx,[16+edx] + jnz NEAR L$060enc1_loop_10 + aesenclast xmm2,xmm1 + xorps xmm2,xmm5 + movups [edi-16],xmm2 +L$058xts_enc_ret: + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + movdqa [esp],xmm0 + pxor xmm3,xmm3 + movdqa [16+esp],xmm0 + pxor xmm4,xmm4 + movdqa [32+esp],xmm0 + pxor xmm5,xmm5 + movdqa [48+esp],xmm0 + pxor xmm6,xmm6 + movdqa [64+esp],xmm0 + pxor xmm7,xmm7 + movdqa [80+esp],xmm0 + mov esp,DWORD [116+esp] + pop edi + pop esi + pop ebx + pop ebp + ret +global _aes_hw_xts_decrypt +align 16 +_aes_hw_xts_decrypt: +L$_aes_hw_xts_decrypt_begin: + push ebp + push ebx + push esi + push edi + mov edx,DWORD [36+esp] + mov esi,DWORD [40+esp] + mov ecx,DWORD [240+edx] + movups xmm2,[esi] + movups xmm0,[edx] + movups xmm1,[16+edx] + lea edx,[32+edx] + xorps xmm2,xmm0 +L$061enc1_loop_11: + aesenc xmm2,xmm1 + dec ecx + movups xmm1,[edx] + lea edx,[16+edx] + jnz NEAR L$061enc1_loop_11 + aesenclast xmm2,xmm1 + mov esi,DWORD [20+esp] + mov edi,DWORD [24+esp] + mov eax,DWORD [28+esp] + mov edx,DWORD [32+esp] + mov ebp,esp + sub esp,120 + and esp,-16 + xor ebx,ebx + test eax,15 + setnz bl + shl ebx,4 + sub eax,ebx + mov DWORD [96+esp],135 + mov DWORD [100+esp],0 + mov DWORD [104+esp],1 + mov DWORD [108+esp],0 + mov DWORD [112+esp],eax + mov DWORD [116+esp],ebp + mov ecx,DWORD [240+edx] + mov ebp,edx + mov ebx,ecx + movdqa xmm1,xmm2 + pxor xmm0,xmm0 + movdqa xmm3,[96+esp] + pcmpgtd xmm0,xmm1 + and eax,-16 + sub eax,96 + jc NEAR L$062xts_dec_short + shl ecx,4 + mov ebx,16 + sub ebx,ecx + lea edx,[32+ecx*1+edx] + jmp NEAR L$063xts_dec_loop6 +align 16 +L$063xts_dec_loop6: + pshufd xmm2,xmm0,19 + pxor xmm0,xmm0 + movdqa [esp],xmm1 + paddq xmm1,xmm1 + pand xmm2,xmm3 + pcmpgtd xmm0,xmm1 + pxor xmm1,xmm2 + pshufd xmm2,xmm0,19 + pxor xmm0,xmm0 + movdqa [16+esp],xmm1 + paddq xmm1,xmm1 + pand xmm2,xmm3 + pcmpgtd xmm0,xmm1 + pxor xmm1,xmm2 + pshufd xmm2,xmm0,19 + pxor xmm0,xmm0 + movdqa [32+esp],xmm1 + paddq xmm1,xmm1 + pand xmm2,xmm3 + pcmpgtd xmm0,xmm1 
+ pxor xmm1,xmm2 + pshufd xmm2,xmm0,19 + pxor xmm0,xmm0 + movdqa [48+esp],xmm1 + paddq xmm1,xmm1 + pand xmm2,xmm3 + pcmpgtd xmm0,xmm1 + pxor xmm1,xmm2 + pshufd xmm7,xmm0,19 + movdqa [64+esp],xmm1 + paddq xmm1,xmm1 + movups xmm0,[ebp] + pand xmm7,xmm3 + movups xmm2,[esi] + pxor xmm7,xmm1 + mov ecx,ebx + movdqu xmm3,[16+esi] + xorps xmm2,xmm0 + movdqu xmm4,[32+esi] + pxor xmm3,xmm0 + movdqu xmm5,[48+esi] + pxor xmm4,xmm0 + movdqu xmm6,[64+esi] + pxor xmm5,xmm0 + movdqu xmm1,[80+esi] + pxor xmm6,xmm0 + lea esi,[96+esi] + pxor xmm2,[esp] + movdqa [80+esp],xmm7 + pxor xmm7,xmm1 + movups xmm1,[16+ebp] + pxor xmm3,[16+esp] + pxor xmm4,[32+esp] + aesdec xmm2,xmm1 + pxor xmm5,[48+esp] + pxor xmm6,[64+esp] + aesdec xmm3,xmm1 + pxor xmm7,xmm0 + movups xmm0,[32+ebp] + aesdec xmm4,xmm1 + aesdec xmm5,xmm1 + aesdec xmm6,xmm1 + aesdec xmm7,xmm1 + call L$_aesni_decrypt6_enter + movdqa xmm1,[80+esp] + pxor xmm0,xmm0 + xorps xmm2,[esp] + pcmpgtd xmm0,xmm1 + xorps xmm3,[16+esp] + movups [edi],xmm2 + xorps xmm4,[32+esp] + movups [16+edi],xmm3 + xorps xmm5,[48+esp] + movups [32+edi],xmm4 + xorps xmm6,[64+esp] + movups [48+edi],xmm5 + xorps xmm7,xmm1 + movups [64+edi],xmm6 + pshufd xmm2,xmm0,19 + movups [80+edi],xmm7 + lea edi,[96+edi] + movdqa xmm3,[96+esp] + pxor xmm0,xmm0 + paddq xmm1,xmm1 + pand xmm2,xmm3 + pcmpgtd xmm0,xmm1 + pxor xmm1,xmm2 + sub eax,96 + jnc NEAR L$063xts_dec_loop6 + mov ecx,DWORD [240+ebp] + mov edx,ebp + mov ebx,ecx +L$062xts_dec_short: + add eax,96 + jz NEAR L$064xts_dec_done6x + movdqa xmm5,xmm1 + cmp eax,32 + jb NEAR L$065xts_dec_one + pshufd xmm2,xmm0,19 + pxor xmm0,xmm0 + paddq xmm1,xmm1 + pand xmm2,xmm3 + pcmpgtd xmm0,xmm1 + pxor xmm1,xmm2 + je NEAR L$066xts_dec_two + pshufd xmm2,xmm0,19 + pxor xmm0,xmm0 + movdqa xmm6,xmm1 + paddq xmm1,xmm1 + pand xmm2,xmm3 + pcmpgtd xmm0,xmm1 + pxor xmm1,xmm2 + cmp eax,64 + jb NEAR L$067xts_dec_three + pshufd xmm2,xmm0,19 + pxor xmm0,xmm0 + movdqa xmm7,xmm1 + paddq xmm1,xmm1 + pand xmm2,xmm3 + pcmpgtd xmm0,xmm1 + pxor 
xmm1,xmm2 + movdqa [esp],xmm5 + movdqa [16+esp],xmm6 + je NEAR L$068xts_dec_four + movdqa [32+esp],xmm7 + pshufd xmm7,xmm0,19 + movdqa [48+esp],xmm1 + paddq xmm1,xmm1 + pand xmm7,xmm3 + pxor xmm7,xmm1 + movdqu xmm2,[esi] + movdqu xmm3,[16+esi] + movdqu xmm4,[32+esi] + pxor xmm2,[esp] + movdqu xmm5,[48+esi] + pxor xmm3,[16+esp] + movdqu xmm6,[64+esi] + pxor xmm4,[32+esp] + lea esi,[80+esi] + pxor xmm5,[48+esp] + movdqa [64+esp],xmm7 + pxor xmm6,xmm7 + call __aesni_decrypt6 + movaps xmm1,[64+esp] + xorps xmm2,[esp] + xorps xmm3,[16+esp] + xorps xmm4,[32+esp] + movups [edi],xmm2 + xorps xmm5,[48+esp] + movups [16+edi],xmm3 + xorps xmm6,xmm1 + movups [32+edi],xmm4 + movups [48+edi],xmm5 + movups [64+edi],xmm6 + lea edi,[80+edi] + jmp NEAR L$069xts_dec_done +align 16 +L$065xts_dec_one: + movups xmm2,[esi] + lea esi,[16+esi] + xorps xmm2,xmm5 + movups xmm0,[edx] + movups xmm1,[16+edx] + lea edx,[32+edx] + xorps xmm2,xmm0 +L$070dec1_loop_12: + aesdec xmm2,xmm1 + dec ecx + movups xmm1,[edx] + lea edx,[16+edx] + jnz NEAR L$070dec1_loop_12 + aesdeclast xmm2,xmm1 + xorps xmm2,xmm5 + movups [edi],xmm2 + lea edi,[16+edi] + movdqa xmm1,xmm5 + jmp NEAR L$069xts_dec_done +align 16 +L$066xts_dec_two: + movaps xmm6,xmm1 + movups xmm2,[esi] + movups xmm3,[16+esi] + lea esi,[32+esi] + xorps xmm2,xmm5 + xorps xmm3,xmm6 + call __aesni_decrypt2 + xorps xmm2,xmm5 + xorps xmm3,xmm6 + movups [edi],xmm2 + movups [16+edi],xmm3 + lea edi,[32+edi] + movdqa xmm1,xmm6 + jmp NEAR L$069xts_dec_done +align 16 +L$067xts_dec_three: + movaps xmm7,xmm1 + movups xmm2,[esi] + movups xmm3,[16+esi] + movups xmm4,[32+esi] + lea esi,[48+esi] + xorps xmm2,xmm5 + xorps xmm3,xmm6 + xorps xmm4,xmm7 + call __aesni_decrypt3 + xorps xmm2,xmm5 + xorps xmm3,xmm6 + xorps xmm4,xmm7 + movups [edi],xmm2 + movups [16+edi],xmm3 + movups [32+edi],xmm4 + lea edi,[48+edi] + movdqa xmm1,xmm7 + jmp NEAR L$069xts_dec_done +align 16 +L$068xts_dec_four: + movaps xmm6,xmm1 + movups xmm2,[esi] + movups xmm3,[16+esi] + movups 
xmm4,[32+esi] + xorps xmm2,[esp] + movups xmm5,[48+esi] + lea esi,[64+esi] + xorps xmm3,[16+esp] + xorps xmm4,xmm7 + xorps xmm5,xmm6 + call __aesni_decrypt4 + xorps xmm2,[esp] + xorps xmm3,[16+esp] + xorps xmm4,xmm7 + movups [edi],xmm2 + xorps xmm5,xmm6 + movups [16+edi],xmm3 + movups [32+edi],xmm4 + movups [48+edi],xmm5 + lea edi,[64+edi] + movdqa xmm1,xmm6 + jmp NEAR L$069xts_dec_done +align 16 +L$064xts_dec_done6x: + mov eax,DWORD [112+esp] + and eax,15 + jz NEAR L$071xts_dec_ret + mov DWORD [112+esp],eax + jmp NEAR L$072xts_dec_only_one_more +align 16 +L$069xts_dec_done: + mov eax,DWORD [112+esp] + pxor xmm0,xmm0 + and eax,15 + jz NEAR L$071xts_dec_ret + pcmpgtd xmm0,xmm1 + mov DWORD [112+esp],eax + pshufd xmm2,xmm0,19 + pxor xmm0,xmm0 + movdqa xmm3,[96+esp] + paddq xmm1,xmm1 + pand xmm2,xmm3 + pcmpgtd xmm0,xmm1 + pxor xmm1,xmm2 +L$072xts_dec_only_one_more: + pshufd xmm5,xmm0,19 + movdqa xmm6,xmm1 + paddq xmm1,xmm1 + pand xmm5,xmm3 + pxor xmm5,xmm1 + mov edx,ebp + mov ecx,ebx + movups xmm2,[esi] + xorps xmm2,xmm5 + movups xmm0,[edx] + movups xmm1,[16+edx] + lea edx,[32+edx] + xorps xmm2,xmm0 +L$073dec1_loop_13: + aesdec xmm2,xmm1 + dec ecx + movups xmm1,[edx] + lea edx,[16+edx] + jnz NEAR L$073dec1_loop_13 + aesdeclast xmm2,xmm1 + xorps xmm2,xmm5 + movups [edi],xmm2 +L$074xts_dec_steal: + movzx ecx,BYTE [16+esi] + movzx edx,BYTE [edi] + lea esi,[1+esi] + mov BYTE [edi],cl + mov BYTE [16+edi],dl + lea edi,[1+edi] + sub eax,1 + jnz NEAR L$074xts_dec_steal + sub edi,DWORD [112+esp] + mov edx,ebp + mov ecx,ebx + movups xmm2,[edi] + xorps xmm2,xmm6 + movups xmm0,[edx] + movups xmm1,[16+edx] + lea edx,[32+edx] + xorps xmm2,xmm0 +L$075dec1_loop_14: + aesdec xmm2,xmm1 + dec ecx + movups xmm1,[edx] + lea edx,[16+edx] + jnz NEAR L$075dec1_loop_14 + aesdeclast xmm2,xmm1 + xorps xmm2,xmm6 + movups [edi],xmm2 +L$071xts_dec_ret: + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + movdqa [esp],xmm0 + pxor xmm3,xmm3 + movdqa [16+esp],xmm0 + pxor xmm4,xmm4 + movdqa 
[32+esp],xmm0 + pxor xmm5,xmm5 + movdqa [48+esp],xmm0 + pxor xmm6,xmm6 + movdqa [64+esp],xmm0 + pxor xmm7,xmm7 + movdqa [80+esp],xmm0 + mov esp,DWORD [116+esp] + pop edi + pop esi + pop ebx + pop ebp + ret +global _aes_hw_cbc_encrypt +align 16 +_aes_hw_cbc_encrypt: +L$_aes_hw_cbc_encrypt_begin: + push ebp + push ebx + push esi + push edi + mov esi,DWORD [20+esp] + mov ebx,esp + mov edi,DWORD [24+esp] + sub ebx,24 + mov eax,DWORD [28+esp] + and ebx,-16 + mov edx,DWORD [32+esp] + mov ebp,DWORD [36+esp] + test eax,eax + jz NEAR L$076cbc_abort + cmp DWORD [40+esp],0 + xchg ebx,esp + movups xmm7,[ebp] + mov ecx,DWORD [240+edx] + mov ebp,edx + mov DWORD [16+esp],ebx + mov ebx,ecx + je NEAR L$077cbc_decrypt + movaps xmm2,xmm7 + cmp eax,16 + jb NEAR L$078cbc_enc_tail + sub eax,16 + jmp NEAR L$079cbc_enc_loop +align 16 +L$079cbc_enc_loop: + movups xmm7,[esi] + lea esi,[16+esi] + movups xmm0,[edx] + movups xmm1,[16+edx] + xorps xmm7,xmm0 + lea edx,[32+edx] + xorps xmm2,xmm7 +L$080enc1_loop_15: + aesenc xmm2,xmm1 + dec ecx + movups xmm1,[edx] + lea edx,[16+edx] + jnz NEAR L$080enc1_loop_15 + aesenclast xmm2,xmm1 + mov ecx,ebx + mov edx,ebp + movups [edi],xmm2 + lea edi,[16+edi] + sub eax,16 + jnc NEAR L$079cbc_enc_loop + add eax,16 + jnz NEAR L$078cbc_enc_tail + movaps xmm7,xmm2 + pxor xmm2,xmm2 + jmp NEAR L$081cbc_ret +L$078cbc_enc_tail: + mov ecx,eax +dd 2767451785 + mov ecx,16 + sub ecx,eax + xor eax,eax +dd 2868115081 + lea edi,[edi-16] + mov ecx,ebx + mov esi,edi + mov edx,ebp + jmp NEAR L$079cbc_enc_loop +align 16 +L$077cbc_decrypt: + cmp eax,80 + jbe NEAR L$082cbc_dec_tail + movaps [esp],xmm7 + sub eax,80 + jmp NEAR L$083cbc_dec_loop6_enter +align 16 +L$084cbc_dec_loop6: + movaps [esp],xmm0 + movups [edi],xmm7 + lea edi,[16+edi] +L$083cbc_dec_loop6_enter: + movdqu xmm2,[esi] + movdqu xmm3,[16+esi] + movdqu xmm4,[32+esi] + movdqu xmm5,[48+esi] + movdqu xmm6,[64+esi] + movdqu xmm7,[80+esi] + call __aesni_decrypt6 + movups xmm1,[esi] + movups xmm0,[16+esi] + xorps 
xmm2,[esp] + xorps xmm3,xmm1 + movups xmm1,[32+esi] + xorps xmm4,xmm0 + movups xmm0,[48+esi] + xorps xmm5,xmm1 + movups xmm1,[64+esi] + xorps xmm6,xmm0 + movups xmm0,[80+esi] + xorps xmm7,xmm1 + movups [edi],xmm2 + movups [16+edi],xmm3 + lea esi,[96+esi] + movups [32+edi],xmm4 + mov ecx,ebx + movups [48+edi],xmm5 + mov edx,ebp + movups [64+edi],xmm6 + lea edi,[80+edi] + sub eax,96 + ja NEAR L$084cbc_dec_loop6 + movaps xmm2,xmm7 + movaps xmm7,xmm0 + add eax,80 + jle NEAR L$085cbc_dec_clear_tail_collected + movups [edi],xmm2 + lea edi,[16+edi] +L$082cbc_dec_tail: + movups xmm2,[esi] + movaps xmm6,xmm2 + cmp eax,16 + jbe NEAR L$086cbc_dec_one + movups xmm3,[16+esi] + movaps xmm5,xmm3 + cmp eax,32 + jbe NEAR L$087cbc_dec_two + movups xmm4,[32+esi] + cmp eax,48 + jbe NEAR L$088cbc_dec_three + movups xmm5,[48+esi] + cmp eax,64 + jbe NEAR L$089cbc_dec_four + movups xmm6,[64+esi] + movaps [esp],xmm7 + movups xmm2,[esi] + xorps xmm7,xmm7 + call __aesni_decrypt6 + movups xmm1,[esi] + movups xmm0,[16+esi] + xorps xmm2,[esp] + xorps xmm3,xmm1 + movups xmm1,[32+esi] + xorps xmm4,xmm0 + movups xmm0,[48+esi] + xorps xmm5,xmm1 + movups xmm7,[64+esi] + xorps xmm6,xmm0 + movups [edi],xmm2 + movups [16+edi],xmm3 + pxor xmm3,xmm3 + movups [32+edi],xmm4 + pxor xmm4,xmm4 + movups [48+edi],xmm5 + pxor xmm5,xmm5 + lea edi,[64+edi] + movaps xmm2,xmm6 + pxor xmm6,xmm6 + sub eax,80 + jmp NEAR L$090cbc_dec_tail_collected +align 16 +L$086cbc_dec_one: + movups xmm0,[edx] + movups xmm1,[16+edx] + lea edx,[32+edx] + xorps xmm2,xmm0 +L$091dec1_loop_16: + aesdec xmm2,xmm1 + dec ecx + movups xmm1,[edx] + lea edx,[16+edx] + jnz NEAR L$091dec1_loop_16 + aesdeclast xmm2,xmm1 + xorps xmm2,xmm7 + movaps xmm7,xmm6 + sub eax,16 + jmp NEAR L$090cbc_dec_tail_collected +align 16 +L$087cbc_dec_two: + call __aesni_decrypt2 + xorps xmm2,xmm7 + xorps xmm3,xmm6 + movups [edi],xmm2 + movaps xmm2,xmm3 + pxor xmm3,xmm3 + lea edi,[16+edi] + movaps xmm7,xmm5 + sub eax,32 + jmp NEAR L$090cbc_dec_tail_collected +align 16 
+L$088cbc_dec_three: + call __aesni_decrypt3 + xorps xmm2,xmm7 + xorps xmm3,xmm6 + xorps xmm4,xmm5 + movups [edi],xmm2 + movaps xmm2,xmm4 + pxor xmm4,xmm4 + movups [16+edi],xmm3 + pxor xmm3,xmm3 + lea edi,[32+edi] + movups xmm7,[32+esi] + sub eax,48 + jmp NEAR L$090cbc_dec_tail_collected +align 16 +L$089cbc_dec_four: + call __aesni_decrypt4 + movups xmm1,[16+esi] + movups xmm0,[32+esi] + xorps xmm2,xmm7 + movups xmm7,[48+esi] + xorps xmm3,xmm6 + movups [edi],xmm2 + xorps xmm4,xmm1 + movups [16+edi],xmm3 + pxor xmm3,xmm3 + xorps xmm5,xmm0 + movups [32+edi],xmm4 + pxor xmm4,xmm4 + lea edi,[48+edi] + movaps xmm2,xmm5 + pxor xmm5,xmm5 + sub eax,64 + jmp NEAR L$090cbc_dec_tail_collected +align 16 +L$085cbc_dec_clear_tail_collected: + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + pxor xmm6,xmm6 +L$090cbc_dec_tail_collected: + and eax,15 + jnz NEAR L$092cbc_dec_tail_partial + movups [edi],xmm2 + pxor xmm0,xmm0 + jmp NEAR L$081cbc_ret +align 16 +L$092cbc_dec_tail_partial: + movaps [esp],xmm2 + pxor xmm0,xmm0 + mov ecx,16 + mov esi,esp + sub ecx,eax +dd 2767451785 + movdqa [esp],xmm2 +L$081cbc_ret: + mov esp,DWORD [16+esp] + mov ebp,DWORD [36+esp] + pxor xmm2,xmm2 + pxor xmm1,xmm1 + movups [ebp],xmm7 + pxor xmm7,xmm7 +L$076cbc_abort: + pop edi + pop esi + pop ebx + pop ebp + ret +global _aes_hw_set_encrypt_key_base +align 16 +_aes_hw_set_encrypt_key_base: +L$_aes_hw_set_encrypt_key_base_begin: +%ifdef BORINGSSL_DISPATCH_TEST + push ebx + push edx + call L$093pic_for_function_hit +L$093pic_for_function_hit: + pop ebx + lea ebx,[(_BORINGSSL_function_hit+3-L$093pic_for_function_hit)+ebx] + mov edx,1 + mov BYTE [ebx],dl + pop edx + pop ebx +%endif + mov eax,DWORD [4+esp] + mov ecx,DWORD [8+esp] + mov edx,DWORD [12+esp] + push ebx + call L$094pic +L$094pic: + pop ebx + lea ebx,[(L$key_const-L$094pic)+ebx] + movups xmm0,[eax] + xorps xmm4,xmm4 + lea edx,[16+edx] + cmp ecx,256 + je NEAR L$09514rounds + cmp ecx,192 + je NEAR L$09612rounds + cmp ecx,128 + jne NEAR 
L$097bad_keybits +align 16 +L$09810rounds: + mov ecx,9 + movups [edx-16],xmm0 + aeskeygenassist xmm1,xmm0,1 + call L$099key_128_cold + aeskeygenassist xmm1,xmm0,2 + call L$100key_128 + aeskeygenassist xmm1,xmm0,4 + call L$100key_128 + aeskeygenassist xmm1,xmm0,8 + call L$100key_128 + aeskeygenassist xmm1,xmm0,16 + call L$100key_128 + aeskeygenassist xmm1,xmm0,32 + call L$100key_128 + aeskeygenassist xmm1,xmm0,64 + call L$100key_128 + aeskeygenassist xmm1,xmm0,128 + call L$100key_128 + aeskeygenassist xmm1,xmm0,27 + call L$100key_128 + aeskeygenassist xmm1,xmm0,54 + call L$100key_128 + movups [edx],xmm0 + mov DWORD [80+edx],ecx + jmp NEAR L$101good_key +align 16 +L$100key_128: + movups [edx],xmm0 + lea edx,[16+edx] +L$099key_128_cold: + shufps xmm4,xmm0,16 + xorps xmm0,xmm4 + shufps xmm4,xmm0,140 + xorps xmm0,xmm4 + shufps xmm1,xmm1,255 + xorps xmm0,xmm1 + ret +align 16 +L$09612rounds: + movq xmm2,[16+eax] + mov ecx,11 + movups [edx-16],xmm0 + aeskeygenassist xmm1,xmm2,1 + call L$102key_192a_cold + aeskeygenassist xmm1,xmm2,2 + call L$103key_192b + aeskeygenassist xmm1,xmm2,4 + call L$104key_192a + aeskeygenassist xmm1,xmm2,8 + call L$103key_192b + aeskeygenassist xmm1,xmm2,16 + call L$104key_192a + aeskeygenassist xmm1,xmm2,32 + call L$103key_192b + aeskeygenassist xmm1,xmm2,64 + call L$104key_192a + aeskeygenassist xmm1,xmm2,128 + call L$103key_192b + movups [edx],xmm0 + mov DWORD [48+edx],ecx + jmp NEAR L$101good_key +align 16 +L$104key_192a: + movups [edx],xmm0 + lea edx,[16+edx] +align 16 +L$102key_192a_cold: + movaps xmm5,xmm2 +L$105key_192b_warm: + shufps xmm4,xmm0,16 + movdqa xmm3,xmm2 + xorps xmm0,xmm4 + shufps xmm4,xmm0,140 + pslldq xmm3,4 + xorps xmm0,xmm4 + pshufd xmm1,xmm1,85 + pxor xmm2,xmm3 + pxor xmm0,xmm1 + pshufd xmm3,xmm0,255 + pxor xmm2,xmm3 + ret +align 16 +L$103key_192b: + movaps xmm3,xmm0 + shufps xmm5,xmm0,68 + movups [edx],xmm5 + shufps xmm3,xmm2,78 + movups [16+edx],xmm3 + lea edx,[32+edx] + jmp NEAR L$105key_192b_warm +align 16 
+L$09514rounds: + movups xmm2,[16+eax] + lea edx,[16+edx] + mov ecx,13 + movups [edx-32],xmm0 + movups [edx-16],xmm2 + aeskeygenassist xmm1,xmm2,1 + call L$106key_256a_cold + aeskeygenassist xmm1,xmm0,1 + call L$107key_256b + aeskeygenassist xmm1,xmm2,2 + call L$108key_256a + aeskeygenassist xmm1,xmm0,2 + call L$107key_256b + aeskeygenassist xmm1,xmm2,4 + call L$108key_256a + aeskeygenassist xmm1,xmm0,4 + call L$107key_256b + aeskeygenassist xmm1,xmm2,8 + call L$108key_256a + aeskeygenassist xmm1,xmm0,8 + call L$107key_256b + aeskeygenassist xmm1,xmm2,16 + call L$108key_256a + aeskeygenassist xmm1,xmm0,16 + call L$107key_256b + aeskeygenassist xmm1,xmm2,32 + call L$108key_256a + aeskeygenassist xmm1,xmm0,32 + call L$107key_256b + aeskeygenassist xmm1,xmm2,64 + call L$108key_256a + movups [edx],xmm0 + mov DWORD [16+edx],ecx + xor eax,eax + jmp NEAR L$101good_key +align 16 +L$108key_256a: + movups [edx],xmm2 + lea edx,[16+edx] +L$106key_256a_cold: + shufps xmm4,xmm0,16 + xorps xmm0,xmm4 + shufps xmm4,xmm0,140 + xorps xmm0,xmm4 + shufps xmm1,xmm1,255 + xorps xmm0,xmm1 + ret +align 16 +L$107key_256b: + movups [edx],xmm0 + lea edx,[16+edx] + shufps xmm4,xmm2,16 + xorps xmm2,xmm4 + shufps xmm4,xmm2,140 + xorps xmm2,xmm4 + shufps xmm1,xmm1,170 + xorps xmm2,xmm1 + ret +L$101good_key: + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + xor eax,eax + pop ebx + ret +align 4 +L$097bad_keybits: + pxor xmm0,xmm0 + mov eax,-2 + pop ebx + ret +global _aes_hw_set_encrypt_key_alt +align 16 +_aes_hw_set_encrypt_key_alt: +L$_aes_hw_set_encrypt_key_alt_begin: +%ifdef BORINGSSL_DISPATCH_TEST + push ebx + push edx + call L$109pic_for_function_hit +L$109pic_for_function_hit: + pop ebx + lea ebx,[(_BORINGSSL_function_hit+3-L$109pic_for_function_hit)+ebx] + mov edx,1 + mov BYTE [ebx],dl + pop edx + pop ebx +%endif + mov eax,DWORD [4+esp] + mov ecx,DWORD [8+esp] + mov edx,DWORD [12+esp] + push ebx + call L$110pic +L$110pic: + pop ebx + lea 
ebx,[(L$key_const-L$110pic)+ebx] + movups xmm0,[eax] + xorps xmm4,xmm4 + lea edx,[16+edx] + cmp ecx,256 + je NEAR L$11114rounds_alt + cmp ecx,192 + je NEAR L$11212rounds_alt + cmp ecx,128 + jne NEAR L$113bad_keybits +align 16 +L$11410rounds_alt: + movdqa xmm5,[ebx] + mov ecx,8 + movdqa xmm4,[32+ebx] + movdqa xmm2,xmm0 + movdqu [edx-16],xmm0 +L$115loop_key128: + pshufb xmm0,xmm5 + aesenclast xmm0,xmm4 + pslld xmm4,1 + lea edx,[16+edx] + movdqa xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm2,xmm3 + pxor xmm0,xmm2 + movdqu [edx-16],xmm0 + movdqa xmm2,xmm0 + dec ecx + jnz NEAR L$115loop_key128 + movdqa xmm4,[48+ebx] + pshufb xmm0,xmm5 + aesenclast xmm0,xmm4 + pslld xmm4,1 + movdqa xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm2,xmm3 + pxor xmm0,xmm2 + movdqu [edx],xmm0 + movdqa xmm2,xmm0 + pshufb xmm0,xmm5 + aesenclast xmm0,xmm4 + movdqa xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm2,xmm3 + pxor xmm0,xmm2 + movdqu [16+edx],xmm0 + mov ecx,9 + mov DWORD [96+edx],ecx + jmp NEAR L$116good_key +align 16 +L$11212rounds_alt: + movq xmm2,[16+eax] + movdqa xmm5,[16+ebx] + movdqa xmm4,[32+ebx] + mov ecx,8 + movdqu [edx-16],xmm0 +L$117loop_key192: + movq [edx],xmm2 + movdqa xmm1,xmm2 + pshufb xmm2,xmm5 + aesenclast xmm2,xmm4 + pslld xmm4,1 + lea edx,[24+edx] + movdqa xmm3,xmm0 + pslldq xmm0,4 + pxor xmm3,xmm0 + pslldq xmm0,4 + pxor xmm3,xmm0 + pslldq xmm0,4 + pxor xmm0,xmm3 + pshufd xmm3,xmm0,255 + pxor xmm3,xmm1 + pslldq xmm1,4 + pxor xmm3,xmm1 + pxor xmm0,xmm2 + pxor xmm2,xmm3 + movdqu [edx-16],xmm0 + dec ecx + jnz NEAR L$117loop_key192 + mov ecx,11 + mov DWORD [32+edx],ecx + jmp NEAR L$116good_key +align 16 +L$11114rounds_alt: + movups xmm2,[16+eax] + lea edx,[16+edx] + movdqa xmm5,[ebx] + movdqa xmm4,[32+ebx] + mov ecx,7 + movdqu [edx-32],xmm0 + movdqa xmm1,xmm2 + movdqu [edx-16],xmm2 +L$118loop_key256: + 
pshufb xmm2,xmm5 + aesenclast xmm2,xmm4 + movdqa xmm3,xmm0 + pslldq xmm0,4 + pxor xmm3,xmm0 + pslldq xmm0,4 + pxor xmm3,xmm0 + pslldq xmm0,4 + pxor xmm0,xmm3 + pslld xmm4,1 + pxor xmm0,xmm2 + movdqu [edx],xmm0 + dec ecx + jz NEAR L$119done_key256 + pshufd xmm2,xmm0,255 + pxor xmm3,xmm3 + aesenclast xmm2,xmm3 + movdqa xmm3,xmm1 + pslldq xmm1,4 + pxor xmm3,xmm1 + pslldq xmm1,4 + pxor xmm3,xmm1 + pslldq xmm1,4 + pxor xmm1,xmm3 + pxor xmm2,xmm1 + movdqu [16+edx],xmm2 + lea edx,[32+edx] + movdqa xmm1,xmm2 + jmp NEAR L$118loop_key256 +L$119done_key256: + mov ecx,13 + mov DWORD [16+edx],ecx +L$116good_key: + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + xor eax,eax + pop ebx + ret +align 4 +L$113bad_keybits: + pxor xmm0,xmm0 + mov eax,-2 + pop ebx + ret +global _aes_hw_encrypt_key_to_decrypt_key +align 16 +_aes_hw_encrypt_key_to_decrypt_key: +L$_aes_hw_encrypt_key_to_decrypt_key_begin: + mov edx,DWORD [4+esp] + mov ecx,DWORD [240+edx] + shl ecx,4 + lea eax,[16+ecx*1+edx] + movups xmm0,[edx] + movups xmm1,[eax] + movups [eax],xmm0 + movups [edx],xmm1 + lea edx,[16+edx] + lea eax,[eax-16] +L$120dec_key_inverse: + movups xmm0,[edx] + movups xmm1,[eax] + aesimc xmm0,xmm0 + aesimc xmm1,xmm1 + lea edx,[16+edx] + lea eax,[eax-16] + movups [16+eax],xmm0 + movups [edx-16],xmm1 + cmp eax,edx + ja NEAR L$120dec_key_inverse + movups xmm0,[edx] + aesimc xmm0,xmm0 + movups [edx],xmm0 + pxor xmm0,xmm0 + pxor xmm1,xmm1 + ret +align 64 +L$key_const: +dd 202313229,202313229,202313229,202313229 +dd 67569157,67569157,67569157,67569157 +dd 1,1,1,1 +dd 27,27,27,27 +db 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69 +db 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83 +db 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 +db 115,108,46,111,114,103,62,0 +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/third_party/boringssl/gen/bcm/aesni-x86_64-apple.S 
b/third_party/boringssl/gen/bcm/aesni-x86_64-apple.S new file mode 100644 index 00000000..958cc5a2 --- /dev/null +++ b/third_party/boringssl/gen/bcm/aesni-x86_64-apple.S @@ -0,0 +1,2394 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.text +.globl _aes_hw_encrypt +.private_extern _aes_hw_encrypt + +.p2align 4 +_aes_hw_encrypt: + +_CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST + + movb $1,_BORINGSSL_function_hit+1(%rip) +#endif + movups (%rdi),%xmm2 + movl 240(%rdx),%eax + movups (%rdx),%xmm0 + movups 16(%rdx),%xmm1 + leaq 32(%rdx),%rdx + xorps %xmm0,%xmm2 +L$oop_enc1_1: + aesenc %xmm1,%xmm2 + decl %eax + movups (%rdx),%xmm1 + leaq 16(%rdx),%rdx + jnz L$oop_enc1_1 + aesenclast %xmm1,%xmm2 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 + ret + + + +.globl _aes_hw_decrypt +.private_extern _aes_hw_decrypt + +.p2align 4 +_aes_hw_decrypt: + +_CET_ENDBR + movups (%rdi),%xmm2 + movl 240(%rdx),%eax + movups (%rdx),%xmm0 + movups 16(%rdx),%xmm1 + leaq 32(%rdx),%rdx + xorps %xmm0,%xmm2 +L$oop_dec1_2: + aesdec %xmm1,%xmm2 + decl %eax + movups (%rdx),%xmm1 + leaq 16(%rdx),%rdx + jnz L$oop_dec1_2 + aesdeclast %xmm1,%xmm2 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 + ret + + + +.p2align 4 +_aesni_encrypt2: + + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax + addq $16,%rax + +L$enc_loop2: + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + aesenc %xmm0,%xmm2 + aesenc %xmm0,%xmm3 + movups -16(%rcx,%rax,1),%xmm0 + jnz L$enc_loop2 + + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + aesenclast %xmm0,%xmm2 + aesenclast %xmm0,%xmm3 + ret + + + +.p2align 4 +_aesni_decrypt2: + + movups (%rcx),%xmm0 + shll $4,%eax + 
movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax + addq $16,%rax + +L$dec_loop2: + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + aesdec %xmm0,%xmm2 + aesdec %xmm0,%xmm3 + movups -16(%rcx,%rax,1),%xmm0 + jnz L$dec_loop2 + + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + aesdeclast %xmm0,%xmm2 + aesdeclast %xmm0,%xmm3 + ret + + + +.p2align 4 +_aesni_encrypt3: + + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + xorps %xmm0,%xmm4 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax + addq $16,%rax + +L$enc_loop3: + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + aesenc %xmm1,%xmm4 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + aesenc %xmm0,%xmm2 + aesenc %xmm0,%xmm3 + aesenc %xmm0,%xmm4 + movups -16(%rcx,%rax,1),%xmm0 + jnz L$enc_loop3 + + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + aesenc %xmm1,%xmm4 + aesenclast %xmm0,%xmm2 + aesenclast %xmm0,%xmm3 + aesenclast %xmm0,%xmm4 + ret + + + +.p2align 4 +_aesni_decrypt3: + + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + xorps %xmm0,%xmm4 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax + addq $16,%rax + +L$dec_loop3: + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + aesdec %xmm1,%xmm4 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + aesdec %xmm0,%xmm2 + aesdec %xmm0,%xmm3 + aesdec %xmm0,%xmm4 + movups -16(%rcx,%rax,1),%xmm0 + jnz L$dec_loop3 + + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + aesdec %xmm1,%xmm4 + aesdeclast %xmm0,%xmm2 + aesdeclast %xmm0,%xmm3 + aesdeclast %xmm0,%xmm4 + ret + + + +.p2align 4 +_aesni_encrypt4: + + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + xorps %xmm0,%xmm4 + xorps %xmm0,%xmm5 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax +.byte 0x0f,0x1f,0x00 + addq $16,%rax + +L$enc_loop4: + aesenc %xmm1,%xmm2 
+ aesenc %xmm1,%xmm3 + aesenc %xmm1,%xmm4 + aesenc %xmm1,%xmm5 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + aesenc %xmm0,%xmm2 + aesenc %xmm0,%xmm3 + aesenc %xmm0,%xmm4 + aesenc %xmm0,%xmm5 + movups -16(%rcx,%rax,1),%xmm0 + jnz L$enc_loop4 + + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + aesenc %xmm1,%xmm4 + aesenc %xmm1,%xmm5 + aesenclast %xmm0,%xmm2 + aesenclast %xmm0,%xmm3 + aesenclast %xmm0,%xmm4 + aesenclast %xmm0,%xmm5 + ret + + + +.p2align 4 +_aesni_decrypt4: + + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + xorps %xmm0,%xmm4 + xorps %xmm0,%xmm5 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax +.byte 0x0f,0x1f,0x00 + addq $16,%rax + +L$dec_loop4: + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + aesdec %xmm1,%xmm4 + aesdec %xmm1,%xmm5 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + aesdec %xmm0,%xmm2 + aesdec %xmm0,%xmm3 + aesdec %xmm0,%xmm4 + aesdec %xmm0,%xmm5 + movups -16(%rcx,%rax,1),%xmm0 + jnz L$dec_loop4 + + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + aesdec %xmm1,%xmm4 + aesdec %xmm1,%xmm5 + aesdeclast %xmm0,%xmm2 + aesdeclast %xmm0,%xmm3 + aesdeclast %xmm0,%xmm4 + aesdeclast %xmm0,%xmm5 + ret + + + +.p2align 4 +_aesni_encrypt6: + + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 + aesenc %xmm1,%xmm2 + leaq 32(%rcx,%rax,1),%rcx + negq %rax + aesenc %xmm1,%xmm3 + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 + aesenc %xmm1,%xmm4 + pxor %xmm0,%xmm7 + movups (%rcx,%rax,1),%xmm0 + addq $16,%rax + jmp L$enc_loop6_enter +.p2align 4 +L$enc_loop6: + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + aesenc %xmm1,%xmm4 +L$enc_loop6_enter: + aesenc %xmm1,%xmm5 + aesenc %xmm1,%xmm6 + aesenc %xmm1,%xmm7 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + aesenc %xmm0,%xmm2 + aesenc %xmm0,%xmm3 + aesenc %xmm0,%xmm4 + aesenc %xmm0,%xmm5 + aesenc %xmm0,%xmm6 + aesenc %xmm0,%xmm7 + movups -16(%rcx,%rax,1),%xmm0 + jnz L$enc_loop6 + + aesenc %xmm1,%xmm2 + aesenc 
%xmm1,%xmm3 + aesenc %xmm1,%xmm4 + aesenc %xmm1,%xmm5 + aesenc %xmm1,%xmm6 + aesenc %xmm1,%xmm7 + aesenclast %xmm0,%xmm2 + aesenclast %xmm0,%xmm3 + aesenclast %xmm0,%xmm4 + aesenclast %xmm0,%xmm5 + aesenclast %xmm0,%xmm6 + aesenclast %xmm0,%xmm7 + ret + + + +.p2align 4 +_aesni_decrypt6: + + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 + aesdec %xmm1,%xmm2 + leaq 32(%rcx,%rax,1),%rcx + negq %rax + aesdec %xmm1,%xmm3 + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 + aesdec %xmm1,%xmm4 + pxor %xmm0,%xmm7 + movups (%rcx,%rax,1),%xmm0 + addq $16,%rax + jmp L$dec_loop6_enter +.p2align 4 +L$dec_loop6: + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + aesdec %xmm1,%xmm4 +L$dec_loop6_enter: + aesdec %xmm1,%xmm5 + aesdec %xmm1,%xmm6 + aesdec %xmm1,%xmm7 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + aesdec %xmm0,%xmm2 + aesdec %xmm0,%xmm3 + aesdec %xmm0,%xmm4 + aesdec %xmm0,%xmm5 + aesdec %xmm0,%xmm6 + aesdec %xmm0,%xmm7 + movups -16(%rcx,%rax,1),%xmm0 + jnz L$dec_loop6 + + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + aesdec %xmm1,%xmm4 + aesdec %xmm1,%xmm5 + aesdec %xmm1,%xmm6 + aesdec %xmm1,%xmm7 + aesdeclast %xmm0,%xmm2 + aesdeclast %xmm0,%xmm3 + aesdeclast %xmm0,%xmm4 + aesdeclast %xmm0,%xmm5 + aesdeclast %xmm0,%xmm6 + aesdeclast %xmm0,%xmm7 + ret + + + +.p2align 4 +_aesni_encrypt8: + + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + pxor %xmm0,%xmm4 + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 + leaq 32(%rcx,%rax,1),%rcx + negq %rax + aesenc %xmm1,%xmm2 + pxor %xmm0,%xmm7 + pxor %xmm0,%xmm8 + aesenc %xmm1,%xmm3 + pxor %xmm0,%xmm9 + movups (%rcx,%rax,1),%xmm0 + addq $16,%rax + jmp L$enc_loop8_inner +.p2align 4 +L$enc_loop8: + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 +L$enc_loop8_inner: + aesenc %xmm1,%xmm4 + aesenc %xmm1,%xmm5 + aesenc %xmm1,%xmm6 + aesenc %xmm1,%xmm7 + aesenc %xmm1,%xmm8 + aesenc %xmm1,%xmm9 +L$enc_loop8_enter: + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + 
aesenc %xmm0,%xmm2 + aesenc %xmm0,%xmm3 + aesenc %xmm0,%xmm4 + aesenc %xmm0,%xmm5 + aesenc %xmm0,%xmm6 + aesenc %xmm0,%xmm7 + aesenc %xmm0,%xmm8 + aesenc %xmm0,%xmm9 + movups -16(%rcx,%rax,1),%xmm0 + jnz L$enc_loop8 + + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + aesenc %xmm1,%xmm4 + aesenc %xmm1,%xmm5 + aesenc %xmm1,%xmm6 + aesenc %xmm1,%xmm7 + aesenc %xmm1,%xmm8 + aesenc %xmm1,%xmm9 + aesenclast %xmm0,%xmm2 + aesenclast %xmm0,%xmm3 + aesenclast %xmm0,%xmm4 + aesenclast %xmm0,%xmm5 + aesenclast %xmm0,%xmm6 + aesenclast %xmm0,%xmm7 + aesenclast %xmm0,%xmm8 + aesenclast %xmm0,%xmm9 + ret + + + +.p2align 4 +_aesni_decrypt8: + + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + pxor %xmm0,%xmm4 + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 + leaq 32(%rcx,%rax,1),%rcx + negq %rax + aesdec %xmm1,%xmm2 + pxor %xmm0,%xmm7 + pxor %xmm0,%xmm8 + aesdec %xmm1,%xmm3 + pxor %xmm0,%xmm9 + movups (%rcx,%rax,1),%xmm0 + addq $16,%rax + jmp L$dec_loop8_inner +.p2align 4 +L$dec_loop8: + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 +L$dec_loop8_inner: + aesdec %xmm1,%xmm4 + aesdec %xmm1,%xmm5 + aesdec %xmm1,%xmm6 + aesdec %xmm1,%xmm7 + aesdec %xmm1,%xmm8 + aesdec %xmm1,%xmm9 +L$dec_loop8_enter: + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + aesdec %xmm0,%xmm2 + aesdec %xmm0,%xmm3 + aesdec %xmm0,%xmm4 + aesdec %xmm0,%xmm5 + aesdec %xmm0,%xmm6 + aesdec %xmm0,%xmm7 + aesdec %xmm0,%xmm8 + aesdec %xmm0,%xmm9 + movups -16(%rcx,%rax,1),%xmm0 + jnz L$dec_loop8 + + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + aesdec %xmm1,%xmm4 + aesdec %xmm1,%xmm5 + aesdec %xmm1,%xmm6 + aesdec %xmm1,%xmm7 + aesdec %xmm1,%xmm8 + aesdec %xmm1,%xmm9 + aesdeclast %xmm0,%xmm2 + aesdeclast %xmm0,%xmm3 + aesdeclast %xmm0,%xmm4 + aesdeclast %xmm0,%xmm5 + aesdeclast %xmm0,%xmm6 + aesdeclast %xmm0,%xmm7 + aesdeclast %xmm0,%xmm8 + aesdeclast %xmm0,%xmm9 + ret + + +.globl _aes_hw_ecb_encrypt +.private_extern _aes_hw_ecb_encrypt + +.p2align 4 +_aes_hw_ecb_encrypt: + +_CET_ENDBR + andq 
$-16,%rdx + jz L$ecb_ret + + movl 240(%rcx),%eax + movups (%rcx),%xmm0 + movq %rcx,%r11 + movl %eax,%r10d + testl %r8d,%r8d + jz L$ecb_decrypt + + cmpq $0x80,%rdx + jb L$ecb_enc_tail + + movdqu (%rdi),%xmm2 + movdqu 16(%rdi),%xmm3 + movdqu 32(%rdi),%xmm4 + movdqu 48(%rdi),%xmm5 + movdqu 64(%rdi),%xmm6 + movdqu 80(%rdi),%xmm7 + movdqu 96(%rdi),%xmm8 + movdqu 112(%rdi),%xmm9 + leaq 128(%rdi),%rdi + subq $0x80,%rdx + jmp L$ecb_enc_loop8_enter +.p2align 4 +L$ecb_enc_loop8: + movups %xmm2,(%rsi) + movq %r11,%rcx + movdqu (%rdi),%xmm2 + movl %r10d,%eax + movups %xmm3,16(%rsi) + movdqu 16(%rdi),%xmm3 + movups %xmm4,32(%rsi) + movdqu 32(%rdi),%xmm4 + movups %xmm5,48(%rsi) + movdqu 48(%rdi),%xmm5 + movups %xmm6,64(%rsi) + movdqu 64(%rdi),%xmm6 + movups %xmm7,80(%rsi) + movdqu 80(%rdi),%xmm7 + movups %xmm8,96(%rsi) + movdqu 96(%rdi),%xmm8 + movups %xmm9,112(%rsi) + leaq 128(%rsi),%rsi + movdqu 112(%rdi),%xmm9 + leaq 128(%rdi),%rdi +L$ecb_enc_loop8_enter: + + call _aesni_encrypt8 + + subq $0x80,%rdx + jnc L$ecb_enc_loop8 + + movups %xmm2,(%rsi) + movq %r11,%rcx + movups %xmm3,16(%rsi) + movl %r10d,%eax + movups %xmm4,32(%rsi) + movups %xmm5,48(%rsi) + movups %xmm6,64(%rsi) + movups %xmm7,80(%rsi) + movups %xmm8,96(%rsi) + movups %xmm9,112(%rsi) + leaq 128(%rsi),%rsi + addq $0x80,%rdx + jz L$ecb_ret + +L$ecb_enc_tail: + movups (%rdi),%xmm2 + cmpq $0x20,%rdx + jb L$ecb_enc_one + movups 16(%rdi),%xmm3 + je L$ecb_enc_two + movups 32(%rdi),%xmm4 + cmpq $0x40,%rdx + jb L$ecb_enc_three + movups 48(%rdi),%xmm5 + je L$ecb_enc_four + movups 64(%rdi),%xmm6 + cmpq $0x60,%rdx + jb L$ecb_enc_five + movups 80(%rdi),%xmm7 + je L$ecb_enc_six + movdqu 96(%rdi),%xmm8 + xorps %xmm9,%xmm9 + call _aesni_encrypt8 + movups %xmm2,(%rsi) + movups %xmm3,16(%rsi) + movups %xmm4,32(%rsi) + movups %xmm5,48(%rsi) + movups %xmm6,64(%rsi) + movups %xmm7,80(%rsi) + movups %xmm8,96(%rsi) + jmp L$ecb_ret +.p2align 4 +L$ecb_enc_one: + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + xorps 
%xmm0,%xmm2 +L$oop_enc1_3: + aesenc %xmm1,%xmm2 + decl %eax + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz L$oop_enc1_3 + aesenclast %xmm1,%xmm2 + movups %xmm2,(%rsi) + jmp L$ecb_ret +.p2align 4 +L$ecb_enc_two: + call _aesni_encrypt2 + movups %xmm2,(%rsi) + movups %xmm3,16(%rsi) + jmp L$ecb_ret +.p2align 4 +L$ecb_enc_three: + call _aesni_encrypt3 + movups %xmm2,(%rsi) + movups %xmm3,16(%rsi) + movups %xmm4,32(%rsi) + jmp L$ecb_ret +.p2align 4 +L$ecb_enc_four: + call _aesni_encrypt4 + movups %xmm2,(%rsi) + movups %xmm3,16(%rsi) + movups %xmm4,32(%rsi) + movups %xmm5,48(%rsi) + jmp L$ecb_ret +.p2align 4 +L$ecb_enc_five: + xorps %xmm7,%xmm7 + call _aesni_encrypt6 + movups %xmm2,(%rsi) + movups %xmm3,16(%rsi) + movups %xmm4,32(%rsi) + movups %xmm5,48(%rsi) + movups %xmm6,64(%rsi) + jmp L$ecb_ret +.p2align 4 +L$ecb_enc_six: + call _aesni_encrypt6 + movups %xmm2,(%rsi) + movups %xmm3,16(%rsi) + movups %xmm4,32(%rsi) + movups %xmm5,48(%rsi) + movups %xmm6,64(%rsi) + movups %xmm7,80(%rsi) + jmp L$ecb_ret + +.p2align 4 +L$ecb_decrypt: + cmpq $0x80,%rdx + jb L$ecb_dec_tail + + movdqu (%rdi),%xmm2 + movdqu 16(%rdi),%xmm3 + movdqu 32(%rdi),%xmm4 + movdqu 48(%rdi),%xmm5 + movdqu 64(%rdi),%xmm6 + movdqu 80(%rdi),%xmm7 + movdqu 96(%rdi),%xmm8 + movdqu 112(%rdi),%xmm9 + leaq 128(%rdi),%rdi + subq $0x80,%rdx + jmp L$ecb_dec_loop8_enter +.p2align 4 +L$ecb_dec_loop8: + movups %xmm2,(%rsi) + movq %r11,%rcx + movdqu (%rdi),%xmm2 + movl %r10d,%eax + movups %xmm3,16(%rsi) + movdqu 16(%rdi),%xmm3 + movups %xmm4,32(%rsi) + movdqu 32(%rdi),%xmm4 + movups %xmm5,48(%rsi) + movdqu 48(%rdi),%xmm5 + movups %xmm6,64(%rsi) + movdqu 64(%rdi),%xmm6 + movups %xmm7,80(%rsi) + movdqu 80(%rdi),%xmm7 + movups %xmm8,96(%rsi) + movdqu 96(%rdi),%xmm8 + movups %xmm9,112(%rsi) + leaq 128(%rsi),%rsi + movdqu 112(%rdi),%xmm9 + leaq 128(%rdi),%rdi +L$ecb_dec_loop8_enter: + + call _aesni_decrypt8 + + movups (%r11),%xmm0 + subq $0x80,%rdx + jnc L$ecb_dec_loop8 + + movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 + movq 
%r11,%rcx + movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 + movl %r10d,%eax + movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 + movups %xmm5,48(%rsi) + pxor %xmm5,%xmm5 + movups %xmm6,64(%rsi) + pxor %xmm6,%xmm6 + movups %xmm7,80(%rsi) + pxor %xmm7,%xmm7 + movups %xmm8,96(%rsi) + pxor %xmm8,%xmm8 + movups %xmm9,112(%rsi) + pxor %xmm9,%xmm9 + leaq 128(%rsi),%rsi + addq $0x80,%rdx + jz L$ecb_ret + +L$ecb_dec_tail: + movups (%rdi),%xmm2 + cmpq $0x20,%rdx + jb L$ecb_dec_one + movups 16(%rdi),%xmm3 + je L$ecb_dec_two + movups 32(%rdi),%xmm4 + cmpq $0x40,%rdx + jb L$ecb_dec_three + movups 48(%rdi),%xmm5 + je L$ecb_dec_four + movups 64(%rdi),%xmm6 + cmpq $0x60,%rdx + jb L$ecb_dec_five + movups 80(%rdi),%xmm7 + je L$ecb_dec_six + movups 96(%rdi),%xmm8 + movups (%rcx),%xmm0 + xorps %xmm9,%xmm9 + call _aesni_decrypt8 + movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 + movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 + movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 + movups %xmm5,48(%rsi) + pxor %xmm5,%xmm5 + movups %xmm6,64(%rsi) + pxor %xmm6,%xmm6 + movups %xmm7,80(%rsi) + pxor %xmm7,%xmm7 + movups %xmm8,96(%rsi) + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 + jmp L$ecb_ret +.p2align 4 +L$ecb_dec_one: + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + xorps %xmm0,%xmm2 +L$oop_dec1_4: + aesdec %xmm1,%xmm2 + decl %eax + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz L$oop_dec1_4 + aesdeclast %xmm1,%xmm2 + movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 + jmp L$ecb_ret +.p2align 4 +L$ecb_dec_two: + call _aesni_decrypt2 + movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 + movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 + jmp L$ecb_ret +.p2align 4 +L$ecb_dec_three: + call _aesni_decrypt3 + movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 + movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 + movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 + jmp L$ecb_ret +.p2align 4 +L$ecb_dec_four: + call _aesni_decrypt4 + movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 + movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 + movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 + movups %xmm5,48(%rsi) + 
pxor %xmm5,%xmm5 + jmp L$ecb_ret +.p2align 4 +L$ecb_dec_five: + xorps %xmm7,%xmm7 + call _aesni_decrypt6 + movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 + movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 + movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 + movups %xmm5,48(%rsi) + pxor %xmm5,%xmm5 + movups %xmm6,64(%rsi) + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + jmp L$ecb_ret +.p2align 4 +L$ecb_dec_six: + call _aesni_decrypt6 + movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 + movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 + movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 + movups %xmm5,48(%rsi) + pxor %xmm5,%xmm5 + movups %xmm6,64(%rsi) + pxor %xmm6,%xmm6 + movups %xmm7,80(%rsi) + pxor %xmm7,%xmm7 + +L$ecb_ret: + xorps %xmm0,%xmm0 + pxor %xmm1,%xmm1 + ret + + +.globl _aes_hw_ctr32_encrypt_blocks +.private_extern _aes_hw_ctr32_encrypt_blocks + +.p2align 4 +_aes_hw_ctr32_encrypt_blocks: + +_CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST + movb $1,_BORINGSSL_function_hit(%rip) +#endif + cmpq $1,%rdx + jne L$ctr32_bulk + + + + movups (%r8),%xmm2 + movups (%rdi),%xmm3 + movl 240(%rcx),%edx + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + xorps %xmm0,%xmm2 +L$oop_enc1_5: + aesenc %xmm1,%xmm2 + decl %edx + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz L$oop_enc1_5 + aesenclast %xmm1,%xmm2 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + xorps %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movups %xmm2,(%rsi) + xorps %xmm2,%xmm2 + jmp L$ctr32_epilogue + +.p2align 4 +L$ctr32_bulk: + leaq (%rsp),%r11 + + pushq %rbp + + subq $128,%rsp + andq $-16,%rsp + + + + + movdqu (%r8),%xmm2 + movdqu (%rcx),%xmm0 + movl 12(%r8),%r8d + pxor %xmm0,%xmm2 + movl 12(%rcx),%ebp + movdqa %xmm2,0(%rsp) + bswapl %r8d + movdqa %xmm2,%xmm3 + movdqa %xmm2,%xmm4 + movdqa %xmm2,%xmm5 + movdqa %xmm2,64(%rsp) + movdqa %xmm2,80(%rsp) + movdqa %xmm2,96(%rsp) + movq %rdx,%r10 + movdqa %xmm2,112(%rsp) + + leaq 1(%r8),%rax + leaq 2(%r8),%rdx + bswapl %eax + bswapl %edx + xorl %ebp,%eax + xorl %ebp,%edx + pinsrd $3,%eax,%xmm3 + leaq 3(%r8),%rax + movdqa 
%xmm3,16(%rsp) + pinsrd $3,%edx,%xmm4 + bswapl %eax + movq %r10,%rdx + leaq 4(%r8),%r10 + movdqa %xmm4,32(%rsp) + xorl %ebp,%eax + bswapl %r10d + pinsrd $3,%eax,%xmm5 + xorl %ebp,%r10d + movdqa %xmm5,48(%rsp) + leaq 5(%r8),%r9 + movl %r10d,64+12(%rsp) + bswapl %r9d + leaq 6(%r8),%r10 + movl 240(%rcx),%eax + xorl %ebp,%r9d + bswapl %r10d + movl %r9d,80+12(%rsp) + xorl %ebp,%r10d + leaq 7(%r8),%r9 + movl %r10d,96+12(%rsp) + bswapl %r9d + xorl %ebp,%r9d + movl %r9d,112+12(%rsp) + + movups 16(%rcx),%xmm1 + + movdqa 64(%rsp),%xmm6 + movdqa 80(%rsp),%xmm7 + + cmpq $8,%rdx + jb L$ctr32_tail + + leaq 128(%rcx),%rcx + subq $8,%rdx + jmp L$ctr32_loop8 + +.p2align 5 +L$ctr32_loop8: + addl $8,%r8d + movdqa 96(%rsp),%xmm8 + aesenc %xmm1,%xmm2 + movl %r8d,%r9d + movdqa 112(%rsp),%xmm9 + aesenc %xmm1,%xmm3 + bswapl %r9d + movups 32-128(%rcx),%xmm0 + aesenc %xmm1,%xmm4 + xorl %ebp,%r9d + nop + aesenc %xmm1,%xmm5 + movl %r9d,0+12(%rsp) + leaq 1(%r8),%r9 + aesenc %xmm1,%xmm6 + aesenc %xmm1,%xmm7 + aesenc %xmm1,%xmm8 + aesenc %xmm1,%xmm9 + movups 48-128(%rcx),%xmm1 + bswapl %r9d + aesenc %xmm0,%xmm2 + aesenc %xmm0,%xmm3 + xorl %ebp,%r9d +.byte 0x66,0x90 + aesenc %xmm0,%xmm4 + aesenc %xmm0,%xmm5 + movl %r9d,16+12(%rsp) + leaq 2(%r8),%r9 + aesenc %xmm0,%xmm6 + aesenc %xmm0,%xmm7 + aesenc %xmm0,%xmm8 + aesenc %xmm0,%xmm9 + movups 64-128(%rcx),%xmm0 + bswapl %r9d + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + xorl %ebp,%r9d +.byte 0x66,0x90 + aesenc %xmm1,%xmm4 + aesenc %xmm1,%xmm5 + movl %r9d,32+12(%rsp) + leaq 3(%r8),%r9 + aesenc %xmm1,%xmm6 + aesenc %xmm1,%xmm7 + aesenc %xmm1,%xmm8 + aesenc %xmm1,%xmm9 + movups 80-128(%rcx),%xmm1 + bswapl %r9d + aesenc %xmm0,%xmm2 + aesenc %xmm0,%xmm3 + xorl %ebp,%r9d +.byte 0x66,0x90 + aesenc %xmm0,%xmm4 + aesenc %xmm0,%xmm5 + movl %r9d,48+12(%rsp) + leaq 4(%r8),%r9 + aesenc %xmm0,%xmm6 + aesenc %xmm0,%xmm7 + aesenc %xmm0,%xmm8 + aesenc %xmm0,%xmm9 + movups 96-128(%rcx),%xmm0 + bswapl %r9d + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + xorl %ebp,%r9d 
+.byte 0x66,0x90 + aesenc %xmm1,%xmm4 + aesenc %xmm1,%xmm5 + movl %r9d,64+12(%rsp) + leaq 5(%r8),%r9 + aesenc %xmm1,%xmm6 + aesenc %xmm1,%xmm7 + aesenc %xmm1,%xmm8 + aesenc %xmm1,%xmm9 + movups 112-128(%rcx),%xmm1 + bswapl %r9d + aesenc %xmm0,%xmm2 + aesenc %xmm0,%xmm3 + xorl %ebp,%r9d +.byte 0x66,0x90 + aesenc %xmm0,%xmm4 + aesenc %xmm0,%xmm5 + movl %r9d,80+12(%rsp) + leaq 6(%r8),%r9 + aesenc %xmm0,%xmm6 + aesenc %xmm0,%xmm7 + aesenc %xmm0,%xmm8 + aesenc %xmm0,%xmm9 + movups 128-128(%rcx),%xmm0 + bswapl %r9d + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + xorl %ebp,%r9d +.byte 0x66,0x90 + aesenc %xmm1,%xmm4 + aesenc %xmm1,%xmm5 + movl %r9d,96+12(%rsp) + leaq 7(%r8),%r9 + aesenc %xmm1,%xmm6 + aesenc %xmm1,%xmm7 + aesenc %xmm1,%xmm8 + aesenc %xmm1,%xmm9 + movups 144-128(%rcx),%xmm1 + bswapl %r9d + aesenc %xmm0,%xmm2 + aesenc %xmm0,%xmm3 + aesenc %xmm0,%xmm4 + xorl %ebp,%r9d + movdqu 0(%rdi),%xmm10 + aesenc %xmm0,%xmm5 + movl %r9d,112+12(%rsp) + cmpl $11,%eax + aesenc %xmm0,%xmm6 + aesenc %xmm0,%xmm7 + aesenc %xmm0,%xmm8 + aesenc %xmm0,%xmm9 + movups 160-128(%rcx),%xmm0 + + jb L$ctr32_enc_done + + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + aesenc %xmm1,%xmm4 + aesenc %xmm1,%xmm5 + aesenc %xmm1,%xmm6 + aesenc %xmm1,%xmm7 + aesenc %xmm1,%xmm8 + aesenc %xmm1,%xmm9 + movups 176-128(%rcx),%xmm1 + + aesenc %xmm0,%xmm2 + aesenc %xmm0,%xmm3 + aesenc %xmm0,%xmm4 + aesenc %xmm0,%xmm5 + aesenc %xmm0,%xmm6 + aesenc %xmm0,%xmm7 + aesenc %xmm0,%xmm8 + aesenc %xmm0,%xmm9 + movups 192-128(%rcx),%xmm0 + je L$ctr32_enc_done + + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + aesenc %xmm1,%xmm4 + aesenc %xmm1,%xmm5 + aesenc %xmm1,%xmm6 + aesenc %xmm1,%xmm7 + aesenc %xmm1,%xmm8 + aesenc %xmm1,%xmm9 + movups 208-128(%rcx),%xmm1 + + aesenc %xmm0,%xmm2 + aesenc %xmm0,%xmm3 + aesenc %xmm0,%xmm4 + aesenc %xmm0,%xmm5 + aesenc %xmm0,%xmm6 + aesenc %xmm0,%xmm7 + aesenc %xmm0,%xmm8 + aesenc %xmm0,%xmm9 + movups 224-128(%rcx),%xmm0 + jmp L$ctr32_enc_done + +.p2align 4 +L$ctr32_enc_done: + movdqu 
16(%rdi),%xmm11 + pxor %xmm0,%xmm10 + movdqu 32(%rdi),%xmm12 + pxor %xmm0,%xmm11 + movdqu 48(%rdi),%xmm13 + pxor %xmm0,%xmm12 + movdqu 64(%rdi),%xmm14 + pxor %xmm0,%xmm13 + movdqu 80(%rdi),%xmm15 + pxor %xmm0,%xmm14 + prefetcht0 448(%rdi) + prefetcht0 512(%rdi) + pxor %xmm0,%xmm15 + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + aesenc %xmm1,%xmm4 + aesenc %xmm1,%xmm5 + aesenc %xmm1,%xmm6 + aesenc %xmm1,%xmm7 + aesenc %xmm1,%xmm8 + aesenc %xmm1,%xmm9 + movdqu 96(%rdi),%xmm1 + leaq 128(%rdi),%rdi + + aesenclast %xmm10,%xmm2 + pxor %xmm0,%xmm1 + movdqu 112-128(%rdi),%xmm10 + aesenclast %xmm11,%xmm3 + pxor %xmm0,%xmm10 + movdqa 0(%rsp),%xmm11 + aesenclast %xmm12,%xmm4 + aesenclast %xmm13,%xmm5 + movdqa 16(%rsp),%xmm12 + movdqa 32(%rsp),%xmm13 + aesenclast %xmm14,%xmm6 + aesenclast %xmm15,%xmm7 + movdqa 48(%rsp),%xmm14 + movdqa 64(%rsp),%xmm15 + aesenclast %xmm1,%xmm8 + movdqa 80(%rsp),%xmm0 + movups 16-128(%rcx),%xmm1 + aesenclast %xmm10,%xmm9 + + movups %xmm2,(%rsi) + movdqa %xmm11,%xmm2 + movups %xmm3,16(%rsi) + movdqa %xmm12,%xmm3 + movups %xmm4,32(%rsi) + movdqa %xmm13,%xmm4 + movups %xmm5,48(%rsi) + movdqa %xmm14,%xmm5 + movups %xmm6,64(%rsi) + movdqa %xmm15,%xmm6 + movups %xmm7,80(%rsi) + movdqa %xmm0,%xmm7 + movups %xmm8,96(%rsi) + movups %xmm9,112(%rsi) + leaq 128(%rsi),%rsi + + subq $8,%rdx + jnc L$ctr32_loop8 + + addq $8,%rdx + jz L$ctr32_done + leaq -128(%rcx),%rcx + +L$ctr32_tail: + + + leaq 16(%rcx),%rcx + cmpq $4,%rdx + jb L$ctr32_loop3 + je L$ctr32_loop4 + + + shll $4,%eax + movdqa 96(%rsp),%xmm8 + pxor %xmm9,%xmm9 + + movups 16(%rcx),%xmm0 + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + leaq 32-16(%rcx,%rax,1),%rcx + negq %rax + aesenc %xmm1,%xmm4 + addq $16,%rax + movups (%rdi),%xmm10 + aesenc %xmm1,%xmm5 + aesenc %xmm1,%xmm6 + movups 16(%rdi),%xmm11 + movups 32(%rdi),%xmm12 + aesenc %xmm1,%xmm7 + aesenc %xmm1,%xmm8 + + call L$enc_loop8_enter + + movdqu 48(%rdi),%xmm13 + pxor %xmm10,%xmm2 + movdqu 64(%rdi),%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + 
pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm10,%xmm6 + movdqu %xmm5,48(%rsi) + movdqu %xmm6,64(%rsi) + cmpq $6,%rdx + jb L$ctr32_done + + movups 80(%rdi),%xmm11 + xorps %xmm11,%xmm7 + movups %xmm7,80(%rsi) + je L$ctr32_done + + movups 96(%rdi),%xmm12 + xorps %xmm12,%xmm8 + movups %xmm8,96(%rsi) + jmp L$ctr32_done + +.p2align 5 +L$ctr32_loop4: + aesenc %xmm1,%xmm2 + leaq 16(%rcx),%rcx + decl %eax + aesenc %xmm1,%xmm3 + aesenc %xmm1,%xmm4 + aesenc %xmm1,%xmm5 + movups (%rcx),%xmm1 + jnz L$ctr32_loop4 + aesenclast %xmm1,%xmm2 + aesenclast %xmm1,%xmm3 + movups (%rdi),%xmm10 + movups 16(%rdi),%xmm11 + aesenclast %xmm1,%xmm4 + aesenclast %xmm1,%xmm5 + movups 32(%rdi),%xmm12 + movups 48(%rdi),%xmm13 + + xorps %xmm10,%xmm2 + movups %xmm2,(%rsi) + xorps %xmm11,%xmm3 + movups %xmm3,16(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm4,32(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm5,48(%rsi) + jmp L$ctr32_done + +.p2align 5 +L$ctr32_loop3: + aesenc %xmm1,%xmm2 + leaq 16(%rcx),%rcx + decl %eax + aesenc %xmm1,%xmm3 + aesenc %xmm1,%xmm4 + movups (%rcx),%xmm1 + jnz L$ctr32_loop3 + aesenclast %xmm1,%xmm2 + aesenclast %xmm1,%xmm3 + aesenclast %xmm1,%xmm4 + + movups (%rdi),%xmm10 + xorps %xmm10,%xmm2 + movups %xmm2,(%rsi) + cmpq $2,%rdx + jb L$ctr32_done + + movups 16(%rdi),%xmm11 + xorps %xmm11,%xmm3 + movups %xmm3,16(%rsi) + je L$ctr32_done + + movups 32(%rdi),%xmm12 + xorps %xmm12,%xmm4 + movups %xmm4,32(%rsi) + +L$ctr32_done: + xorps %xmm0,%xmm0 + xorl %ebp,%ebp + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + movaps %xmm0,0(%rsp) + pxor %xmm8,%xmm8 + movaps %xmm0,16(%rsp) + pxor %xmm9,%xmm9 + movaps %xmm0,32(%rsp) + pxor %xmm10,%xmm10 + movaps %xmm0,48(%rsp) + pxor %xmm11,%xmm11 + movaps %xmm0,64(%rsp) + pxor %xmm12,%xmm12 + movaps %xmm0,80(%rsp) + pxor %xmm13,%xmm13 + movaps %xmm0,96(%rsp) + pxor %xmm14,%xmm14 + movaps %xmm0,112(%rsp) + pxor 
%xmm15,%xmm15 + movq -8(%r11),%rbp + + leaq (%r11),%rsp + +L$ctr32_epilogue: + ret + + +.globl _aes_hw_cbc_encrypt +.private_extern _aes_hw_cbc_encrypt + +.p2align 4 +_aes_hw_cbc_encrypt: + +_CET_ENDBR + testq %rdx,%rdx + jz L$cbc_ret + + movl 240(%rcx),%r10d + movq %rcx,%r11 + testl %r9d,%r9d + jz L$cbc_decrypt + + movups (%r8),%xmm2 + movl %r10d,%eax + cmpq $16,%rdx + jb L$cbc_enc_tail + subq $16,%rdx + jmp L$cbc_enc_loop +.p2align 4 +L$cbc_enc_loop: + movups (%rdi),%xmm3 + leaq 16(%rdi),%rdi + + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm3 + leaq 32(%rcx),%rcx + xorps %xmm3,%xmm2 +L$oop_enc1_6: + aesenc %xmm1,%xmm2 + decl %eax + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz L$oop_enc1_6 + aesenclast %xmm1,%xmm2 + movl %r10d,%eax + movq %r11,%rcx + movups %xmm2,0(%rsi) + leaq 16(%rsi),%rsi + subq $16,%rdx + jnc L$cbc_enc_loop + addq $16,%rdx + jnz L$cbc_enc_tail + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + movups %xmm2,(%r8) + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + jmp L$cbc_ret + +L$cbc_enc_tail: + movq %rdx,%rcx + xchgq %rdi,%rsi +.long 0x9066A4F3 + movl $16,%ecx + subq %rdx,%rcx + xorl %eax,%eax +.long 0x9066AAF3 + leaq -16(%rdi),%rdi + movl %r10d,%eax + movq %rdi,%rsi + movq %r11,%rcx + xorq %rdx,%rdx + jmp L$cbc_enc_loop + +.p2align 4 +L$cbc_decrypt: + cmpq $16,%rdx + jne L$cbc_decrypt_bulk + + + + movdqu (%rdi),%xmm2 + movdqu (%r8),%xmm3 + movdqa %xmm2,%xmm4 + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + xorps %xmm0,%xmm2 +L$oop_dec1_7: + aesdec %xmm1,%xmm2 + decl %r10d + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz L$oop_dec1_7 + aesdeclast %xmm1,%xmm2 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + movdqu %xmm4,(%r8) + xorps %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 + jmp L$cbc_ret +.p2align 4 +L$cbc_decrypt_bulk: + leaq (%rsp),%r11 + + pushq %rbp + + subq $16,%rsp + andq $-16,%rsp + movq %rcx,%rbp + movups (%r8),%xmm10 + movl %r10d,%eax + cmpq $0x50,%rdx + jbe L$cbc_dec_tail + + movups 
(%rcx),%xmm0 + movdqu 0(%rdi),%xmm2 + movdqu 16(%rdi),%xmm3 + movdqa %xmm2,%xmm11 + movdqu 32(%rdi),%xmm4 + movdqa %xmm3,%xmm12 + movdqu 48(%rdi),%xmm5 + movdqa %xmm4,%xmm13 + movdqu 64(%rdi),%xmm6 + movdqa %xmm5,%xmm14 + movdqu 80(%rdi),%xmm7 + movdqa %xmm6,%xmm15 + cmpq $0x70,%rdx + jbe L$cbc_dec_six_or_seven + + subq $0x70,%rdx + leaq 112(%rcx),%rcx + jmp L$cbc_dec_loop8_enter +.p2align 4 +L$cbc_dec_loop8: + movups %xmm9,(%rsi) + leaq 16(%rsi),%rsi +L$cbc_dec_loop8_enter: + movdqu 96(%rdi),%xmm8 + pxor %xmm0,%xmm2 + movdqu 112(%rdi),%xmm9 + pxor %xmm0,%xmm3 + movups 16-112(%rcx),%xmm1 + pxor %xmm0,%xmm4 + movq $-1,%rbp + cmpq $0x70,%rdx + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 + pxor %xmm0,%xmm7 + pxor %xmm0,%xmm8 + + aesdec %xmm1,%xmm2 + pxor %xmm0,%xmm9 + movups 32-112(%rcx),%xmm0 + aesdec %xmm1,%xmm3 + aesdec %xmm1,%xmm4 + aesdec %xmm1,%xmm5 + aesdec %xmm1,%xmm6 + aesdec %xmm1,%xmm7 + aesdec %xmm1,%xmm8 + adcq $0,%rbp + andq $128,%rbp + aesdec %xmm1,%xmm9 + addq %rdi,%rbp + movups 48-112(%rcx),%xmm1 + aesdec %xmm0,%xmm2 + aesdec %xmm0,%xmm3 + aesdec %xmm0,%xmm4 + aesdec %xmm0,%xmm5 + aesdec %xmm0,%xmm6 + aesdec %xmm0,%xmm7 + aesdec %xmm0,%xmm8 + aesdec %xmm0,%xmm9 + movups 64-112(%rcx),%xmm0 + nop + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + aesdec %xmm1,%xmm4 + aesdec %xmm1,%xmm5 + aesdec %xmm1,%xmm6 + aesdec %xmm1,%xmm7 + aesdec %xmm1,%xmm8 + aesdec %xmm1,%xmm9 + movups 80-112(%rcx),%xmm1 + nop + aesdec %xmm0,%xmm2 + aesdec %xmm0,%xmm3 + aesdec %xmm0,%xmm4 + aesdec %xmm0,%xmm5 + aesdec %xmm0,%xmm6 + aesdec %xmm0,%xmm7 + aesdec %xmm0,%xmm8 + aesdec %xmm0,%xmm9 + movups 96-112(%rcx),%xmm0 + nop + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + aesdec %xmm1,%xmm4 + aesdec %xmm1,%xmm5 + aesdec %xmm1,%xmm6 + aesdec %xmm1,%xmm7 + aesdec %xmm1,%xmm8 + aesdec %xmm1,%xmm9 + movups 112-112(%rcx),%xmm1 + nop + aesdec %xmm0,%xmm2 + aesdec %xmm0,%xmm3 + aesdec %xmm0,%xmm4 + aesdec %xmm0,%xmm5 + aesdec %xmm0,%xmm6 + aesdec %xmm0,%xmm7 + aesdec %xmm0,%xmm8 + aesdec %xmm0,%xmm9 + 
movups 128-112(%rcx),%xmm0 + nop + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + aesdec %xmm1,%xmm4 + aesdec %xmm1,%xmm5 + aesdec %xmm1,%xmm6 + aesdec %xmm1,%xmm7 + aesdec %xmm1,%xmm8 + aesdec %xmm1,%xmm9 + movups 144-112(%rcx),%xmm1 + cmpl $11,%eax + aesdec %xmm0,%xmm2 + aesdec %xmm0,%xmm3 + aesdec %xmm0,%xmm4 + aesdec %xmm0,%xmm5 + aesdec %xmm0,%xmm6 + aesdec %xmm0,%xmm7 + aesdec %xmm0,%xmm8 + aesdec %xmm0,%xmm9 + movups 160-112(%rcx),%xmm0 + jb L$cbc_dec_done + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + aesdec %xmm1,%xmm4 + aesdec %xmm1,%xmm5 + aesdec %xmm1,%xmm6 + aesdec %xmm1,%xmm7 + aesdec %xmm1,%xmm8 + aesdec %xmm1,%xmm9 + movups 176-112(%rcx),%xmm1 + nop + aesdec %xmm0,%xmm2 + aesdec %xmm0,%xmm3 + aesdec %xmm0,%xmm4 + aesdec %xmm0,%xmm5 + aesdec %xmm0,%xmm6 + aesdec %xmm0,%xmm7 + aesdec %xmm0,%xmm8 + aesdec %xmm0,%xmm9 + movups 192-112(%rcx),%xmm0 + je L$cbc_dec_done + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + aesdec %xmm1,%xmm4 + aesdec %xmm1,%xmm5 + aesdec %xmm1,%xmm6 + aesdec %xmm1,%xmm7 + aesdec %xmm1,%xmm8 + aesdec %xmm1,%xmm9 + movups 208-112(%rcx),%xmm1 + nop + aesdec %xmm0,%xmm2 + aesdec %xmm0,%xmm3 + aesdec %xmm0,%xmm4 + aesdec %xmm0,%xmm5 + aesdec %xmm0,%xmm6 + aesdec %xmm0,%xmm7 + aesdec %xmm0,%xmm8 + aesdec %xmm0,%xmm9 + movups 224-112(%rcx),%xmm0 + jmp L$cbc_dec_done +.p2align 4 +L$cbc_dec_done: + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + pxor %xmm0,%xmm10 + pxor %xmm0,%xmm11 + aesdec %xmm1,%xmm4 + aesdec %xmm1,%xmm5 + pxor %xmm0,%xmm12 + pxor %xmm0,%xmm13 + aesdec %xmm1,%xmm6 + aesdec %xmm1,%xmm7 + pxor %xmm0,%xmm14 + pxor %xmm0,%xmm15 + aesdec %xmm1,%xmm8 + aesdec %xmm1,%xmm9 + movdqu 80(%rdi),%xmm1 + + aesdeclast %xmm10,%xmm2 + movdqu 96(%rdi),%xmm10 + pxor %xmm0,%xmm1 + aesdeclast %xmm11,%xmm3 + pxor %xmm0,%xmm10 + movdqu 112(%rdi),%xmm0 + aesdeclast %xmm12,%xmm4 + leaq 128(%rdi),%rdi + movdqu 0(%rbp),%xmm11 + aesdeclast %xmm13,%xmm5 + aesdeclast %xmm14,%xmm6 + movdqu 16(%rbp),%xmm12 + movdqu 32(%rbp),%xmm13 + aesdeclast %xmm15,%xmm7 + 
aesdeclast %xmm1,%xmm8 + movdqu 48(%rbp),%xmm14 + movdqu 64(%rbp),%xmm15 + aesdeclast %xmm10,%xmm9 + movdqa %xmm0,%xmm10 + movdqu 80(%rbp),%xmm1 + movups -112(%rcx),%xmm0 + + movups %xmm2,(%rsi) + movdqa %xmm11,%xmm2 + movups %xmm3,16(%rsi) + movdqa %xmm12,%xmm3 + movups %xmm4,32(%rsi) + movdqa %xmm13,%xmm4 + movups %xmm5,48(%rsi) + movdqa %xmm14,%xmm5 + movups %xmm6,64(%rsi) + movdqa %xmm15,%xmm6 + movups %xmm7,80(%rsi) + movdqa %xmm1,%xmm7 + movups %xmm8,96(%rsi) + leaq 112(%rsi),%rsi + + subq $0x80,%rdx + ja L$cbc_dec_loop8 + + movaps %xmm9,%xmm2 + leaq -112(%rcx),%rcx + addq $0x70,%rdx + jle L$cbc_dec_clear_tail_collected + movups %xmm9,(%rsi) + leaq 16(%rsi),%rsi + cmpq $0x50,%rdx + jbe L$cbc_dec_tail + + movaps %xmm11,%xmm2 +L$cbc_dec_six_or_seven: + cmpq $0x60,%rdx + ja L$cbc_dec_seven + + movaps %xmm7,%xmm8 + call _aesni_decrypt6 + pxor %xmm10,%xmm2 + movaps %xmm8,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm3,%xmm3 + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm4,%xmm4 + pxor %xmm14,%xmm6 + movdqu %xmm5,48(%rsi) + pxor %xmm5,%xmm5 + pxor %xmm15,%xmm7 + movdqu %xmm6,64(%rsi) + pxor %xmm6,%xmm6 + leaq 80(%rsi),%rsi + movdqa %xmm7,%xmm2 + pxor %xmm7,%xmm7 + jmp L$cbc_dec_tail_collected + +.p2align 4 +L$cbc_dec_seven: + movups 96(%rdi),%xmm8 + xorps %xmm9,%xmm9 + call _aesni_decrypt8 + movups 80(%rdi),%xmm9 + pxor %xmm10,%xmm2 + movups 96(%rdi),%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm3,%xmm3 + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm4,%xmm4 + pxor %xmm14,%xmm6 + movdqu %xmm5,48(%rsi) + pxor %xmm5,%xmm5 + pxor %xmm15,%xmm7 + movdqu %xmm6,64(%rsi) + pxor %xmm6,%xmm6 + pxor %xmm9,%xmm8 + movdqu %xmm7,80(%rsi) + pxor %xmm7,%xmm7 + leaq 96(%rsi),%rsi + movdqa %xmm8,%xmm2 + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 + jmp L$cbc_dec_tail_collected + +L$cbc_dec_tail: + movups (%rdi),%xmm2 + subq $0x10,%rdx + jbe L$cbc_dec_one + + 
movups 16(%rdi),%xmm3 + movaps %xmm2,%xmm11 + subq $0x10,%rdx + jbe L$cbc_dec_two + + movups 32(%rdi),%xmm4 + movaps %xmm3,%xmm12 + subq $0x10,%rdx + jbe L$cbc_dec_three + + movups 48(%rdi),%xmm5 + movaps %xmm4,%xmm13 + subq $0x10,%rdx + jbe L$cbc_dec_four + + movups 64(%rdi),%xmm6 + movaps %xmm5,%xmm14 + movaps %xmm6,%xmm15 + xorps %xmm7,%xmm7 + call _aesni_decrypt6 + pxor %xmm10,%xmm2 + movaps %xmm15,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm3,%xmm3 + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm4,%xmm4 + pxor %xmm14,%xmm6 + movdqu %xmm5,48(%rsi) + pxor %xmm5,%xmm5 + leaq 64(%rsi),%rsi + movdqa %xmm6,%xmm2 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + subq $0x10,%rdx + jmp L$cbc_dec_tail_collected + +.p2align 4 +L$cbc_dec_one: + movaps %xmm2,%xmm11 + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + xorps %xmm0,%xmm2 +L$oop_dec1_8: + aesdec %xmm1,%xmm2 + decl %eax + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz L$oop_dec1_8 + aesdeclast %xmm1,%xmm2 + xorps %xmm10,%xmm2 + movaps %xmm11,%xmm10 + jmp L$cbc_dec_tail_collected +.p2align 4 +L$cbc_dec_two: + movaps %xmm3,%xmm12 + call _aesni_decrypt2 + pxor %xmm10,%xmm2 + movaps %xmm12,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + movdqa %xmm3,%xmm2 + pxor %xmm3,%xmm3 + leaq 16(%rsi),%rsi + jmp L$cbc_dec_tail_collected +.p2align 4 +L$cbc_dec_three: + movaps %xmm4,%xmm13 + call _aesni_decrypt3 + pxor %xmm10,%xmm2 + movaps %xmm13,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm3,%xmm3 + movdqa %xmm4,%xmm2 + pxor %xmm4,%xmm4 + leaq 32(%rsi),%rsi + jmp L$cbc_dec_tail_collected +.p2align 4 +L$cbc_dec_four: + movaps %xmm5,%xmm14 + call _aesni_decrypt4 + pxor %xmm10,%xmm2 + movaps %xmm14,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm3,%xmm3 + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm4,%xmm4 + movdqa %xmm5,%xmm2 + pxor 
%xmm5,%xmm5 + leaq 48(%rsi),%rsi + jmp L$cbc_dec_tail_collected + +.p2align 4 +L$cbc_dec_clear_tail_collected: + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 +L$cbc_dec_tail_collected: + movups %xmm10,(%r8) + andq $15,%rdx + jnz L$cbc_dec_tail_partial + movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 + jmp L$cbc_dec_ret +.p2align 4 +L$cbc_dec_tail_partial: + movaps %xmm2,(%rsp) + pxor %xmm2,%xmm2 + movq $16,%rcx + movq %rsi,%rdi + subq %rdx,%rcx + leaq (%rsp),%rsi +.long 0x9066A4F3 + movdqa %xmm2,(%rsp) + +L$cbc_dec_ret: + xorps %xmm0,%xmm0 + pxor %xmm1,%xmm1 + movq -8(%r11),%rbp + + leaq (%r11),%rsp + +L$cbc_ret: + ret + + +.globl _aes_hw_encrypt_key_to_decrypt_key +.private_extern _aes_hw_encrypt_key_to_decrypt_key + +.p2align 4 +_aes_hw_encrypt_key_to_decrypt_key: + +_CET_ENDBR + + movl 240(%rdi),%esi + shll $4,%esi + + leaq 16(%rdi,%rsi,1),%rdx + + movups (%rdi),%xmm0 + movups (%rdx),%xmm1 + movups %xmm0,(%rdx) + movups %xmm1,(%rdi) + leaq 16(%rdi),%rdi + leaq -16(%rdx),%rdx + +L$dec_key_inverse: + movups (%rdi),%xmm0 + movups (%rdx),%xmm1 + aesimc %xmm0,%xmm0 + aesimc %xmm1,%xmm1 + leaq 16(%rdi),%rdi + leaq -16(%rdx),%rdx + movups %xmm0,16(%rdx) + movups %xmm1,-16(%rdi) + cmpq %rdi,%rdx + ja L$dec_key_inverse + + movups (%rdi),%xmm0 + aesimc %xmm0,%xmm0 + pxor %xmm1,%xmm1 + movups %xmm0,(%rdx) + pxor %xmm0,%xmm0 + ret + + +.globl _aes_hw_set_encrypt_key_base +.private_extern _aes_hw_set_encrypt_key_base + +.p2align 4 +_aes_hw_set_encrypt_key_base: + + +_CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST + movb $1,_BORINGSSL_function_hit+3(%rip) +#endif + subq $8,%rsp + + + + movups (%rdi),%xmm0 + xorps %xmm4,%xmm4 + leaq 16(%rdx),%rax + cmpl $256,%esi + je L$14rounds + cmpl $192,%esi + je L$12rounds + cmpl $128,%esi + jne L$bad_keybits + +L$10rounds: + movl $9,%esi + + movups %xmm0,(%rdx) + aeskeygenassist $0x1,%xmm0,%xmm1 + call L$key_expansion_128_cold + aeskeygenassist $0x2,%xmm0,%xmm1 + 
call L$key_expansion_128 + aeskeygenassist $0x4,%xmm0,%xmm1 + call L$key_expansion_128 + aeskeygenassist $0x8,%xmm0,%xmm1 + call L$key_expansion_128 + aeskeygenassist $0x10,%xmm0,%xmm1 + call L$key_expansion_128 + aeskeygenassist $0x20,%xmm0,%xmm1 + call L$key_expansion_128 + aeskeygenassist $0x40,%xmm0,%xmm1 + call L$key_expansion_128 + aeskeygenassist $0x80,%xmm0,%xmm1 + call L$key_expansion_128 + aeskeygenassist $0x1b,%xmm0,%xmm1 + call L$key_expansion_128 + aeskeygenassist $0x36,%xmm0,%xmm1 + call L$key_expansion_128 + movups %xmm0,(%rax) + movl %esi,80(%rax) + xorl %eax,%eax + jmp L$enc_key_ret + +.p2align 4 +L$12rounds: + movq 16(%rdi),%xmm2 + movl $11,%esi + + movups %xmm0,(%rdx) + aeskeygenassist $0x1,%xmm2,%xmm1 + call L$key_expansion_192a_cold + aeskeygenassist $0x2,%xmm2,%xmm1 + call L$key_expansion_192b + aeskeygenassist $0x4,%xmm2,%xmm1 + call L$key_expansion_192a + aeskeygenassist $0x8,%xmm2,%xmm1 + call L$key_expansion_192b + aeskeygenassist $0x10,%xmm2,%xmm1 + call L$key_expansion_192a + aeskeygenassist $0x20,%xmm2,%xmm1 + call L$key_expansion_192b + aeskeygenassist $0x40,%xmm2,%xmm1 + call L$key_expansion_192a + aeskeygenassist $0x80,%xmm2,%xmm1 + call L$key_expansion_192b + movups %xmm0,(%rax) + movl %esi,48(%rax) + xorq %rax,%rax + jmp L$enc_key_ret + +.p2align 4 +L$14rounds: + movups 16(%rdi),%xmm2 + movl $13,%esi + leaq 16(%rax),%rax + + movups %xmm0,(%rdx) + movups %xmm2,16(%rdx) + aeskeygenassist $0x1,%xmm2,%xmm1 + call L$key_expansion_256a_cold + aeskeygenassist $0x1,%xmm0,%xmm1 + call L$key_expansion_256b + aeskeygenassist $0x2,%xmm2,%xmm1 + call L$key_expansion_256a + aeskeygenassist $0x2,%xmm0,%xmm1 + call L$key_expansion_256b + aeskeygenassist $0x4,%xmm2,%xmm1 + call L$key_expansion_256a + aeskeygenassist $0x4,%xmm0,%xmm1 + call L$key_expansion_256b + aeskeygenassist $0x8,%xmm2,%xmm1 + call L$key_expansion_256a + aeskeygenassist $0x8,%xmm0,%xmm1 + call L$key_expansion_256b + aeskeygenassist $0x10,%xmm2,%xmm1 + call L$key_expansion_256a + 
aeskeygenassist $0x10,%xmm0,%xmm1 + call L$key_expansion_256b + aeskeygenassist $0x20,%xmm2,%xmm1 + call L$key_expansion_256a + aeskeygenassist $0x20,%xmm0,%xmm1 + call L$key_expansion_256b + aeskeygenassist $0x40,%xmm2,%xmm1 + call L$key_expansion_256a + movups %xmm0,(%rax) + movl %esi,16(%rax) + xorq %rax,%rax + jmp L$enc_key_ret + +.p2align 4 +L$bad_keybits: + movq $-2,%rax +L$enc_key_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + addq $8,%rsp + + ret + + + +.p2align 4 +L$key_expansion_128: + + movups %xmm0,(%rax) + leaq 16(%rax),%rax +L$key_expansion_128_cold: + shufps $16,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $255,%xmm1,%xmm1 + xorps %xmm1,%xmm0 + ret + + +.p2align 4 +L$key_expansion_192a: + + movups %xmm0,(%rax) + leaq 16(%rax),%rax +L$key_expansion_192a_cold: + movaps %xmm2,%xmm5 +L$key_expansion_192b_warm: + shufps $16,%xmm0,%xmm4 + movdqa %xmm2,%xmm3 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + pslldq $4,%xmm3 + xorps %xmm4,%xmm0 + pshufd $85,%xmm1,%xmm1 + pxor %xmm3,%xmm2 + pxor %xmm1,%xmm0 + pshufd $255,%xmm0,%xmm3 + pxor %xmm3,%xmm2 + ret + + +.p2align 4 +L$key_expansion_192b: + + movaps %xmm0,%xmm3 + shufps $68,%xmm0,%xmm5 + movups %xmm5,(%rax) + shufps $78,%xmm2,%xmm3 + movups %xmm3,16(%rax) + leaq 32(%rax),%rax + jmp L$key_expansion_192b_warm + + +.p2align 4 +L$key_expansion_256a: + + movups %xmm2,(%rax) + leaq 16(%rax),%rax +L$key_expansion_256a_cold: + shufps $16,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $255,%xmm1,%xmm1 + xorps %xmm1,%xmm0 + ret + + +.p2align 4 +L$key_expansion_256b: + + movups %xmm0,(%rax) + leaq 16(%rax),%rax + + shufps $16,%xmm2,%xmm4 + xorps %xmm4,%xmm2 + shufps $140,%xmm2,%xmm4 + xorps %xmm4,%xmm2 + shufps $170,%xmm1,%xmm1 + xorps %xmm1,%xmm2 + ret + + + +.globl _aes_hw_set_encrypt_key_alt +.private_extern _aes_hw_set_encrypt_key_alt + +.p2align 4 
+_aes_hw_set_encrypt_key_alt: + + +_CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST + movb $1,_BORINGSSL_function_hit+3(%rip) +#endif + subq $8,%rsp + + + + movups (%rdi),%xmm0 + xorps %xmm4,%xmm4 + leaq 16(%rdx),%rax + cmpl $256,%esi + je L$14rounds_alt + cmpl $192,%esi + je L$12rounds_alt + cmpl $128,%esi + jne L$bad_keybits_alt + + movl $9,%esi + movdqa L$key_rotate(%rip),%xmm5 + movl $8,%r10d + movdqa L$key_rcon1(%rip),%xmm4 + movdqa %xmm0,%xmm2 + movdqu %xmm0,(%rdx) + jmp L$oop_key128 + +.p2align 4 +L$oop_key128: + pshufb %xmm5,%xmm0 + aesenclast %xmm4,%xmm0 + pslld $1,%xmm4 + leaq 16(%rax),%rax + + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,-16(%rax) + movdqa %xmm0,%xmm2 + + decl %r10d + jnz L$oop_key128 + + movdqa L$key_rcon1b(%rip),%xmm4 + + pshufb %xmm5,%xmm0 + aesenclast %xmm4,%xmm0 + pslld $1,%xmm4 + + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,(%rax) + + movdqa %xmm0,%xmm2 + pshufb %xmm5,%xmm0 + aesenclast %xmm4,%xmm0 + + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,16(%rax) + + movl %esi,96(%rax) + xorl %eax,%eax + jmp L$enc_key_ret_alt + +.p2align 4 +L$12rounds_alt: + movq 16(%rdi),%xmm2 + movl $11,%esi + movdqa L$key_rotate192(%rip),%xmm5 + movdqa L$key_rcon1(%rip),%xmm4 + movl $8,%r10d + movdqu %xmm0,(%rdx) + jmp L$oop_key192 + +.p2align 4 +L$oop_key192: + movq %xmm2,0(%rax) + movdqa %xmm2,%xmm1 + pshufb %xmm5,%xmm2 + aesenclast %xmm4,%xmm2 + pslld $1,%xmm4 + leaq 24(%rax),%rax + + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + + pshufd $0xff,%xmm0,%xmm3 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + 
+ pxor %xmm2,%xmm0 + pxor %xmm3,%xmm2 + movdqu %xmm0,-16(%rax) + + decl %r10d + jnz L$oop_key192 + + movl %esi,32(%rax) + xorl %eax,%eax + jmp L$enc_key_ret_alt + +.p2align 4 +L$14rounds_alt: + movups 16(%rdi),%xmm2 + movl $13,%esi + leaq 16(%rax),%rax + movdqa L$key_rotate(%rip),%xmm5 + movdqa L$key_rcon1(%rip),%xmm4 + movl $7,%r10d + movdqu %xmm0,0(%rdx) + movdqa %xmm2,%xmm1 + movdqu %xmm2,16(%rdx) + jmp L$oop_key256 + +.p2align 4 +L$oop_key256: + pshufb %xmm5,%xmm2 + aesenclast %xmm4,%xmm2 + + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + pslld $1,%xmm4 + + pxor %xmm2,%xmm0 + movdqu %xmm0,(%rax) + + decl %r10d + jz L$done_key256 + + pshufd $0xff,%xmm0,%xmm2 + pxor %xmm3,%xmm3 + aesenclast %xmm3,%xmm2 + + movdqa %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm3,%xmm1 + + pxor %xmm1,%xmm2 + movdqu %xmm2,16(%rax) + leaq 32(%rax),%rax + movdqa %xmm2,%xmm1 + + jmp L$oop_key256 + +L$done_key256: + movl %esi,16(%rax) + xorl %eax,%eax + jmp L$enc_key_ret_alt + +.p2align 4 +L$bad_keybits_alt: + movq $-2,%rax +L$enc_key_ret_alt: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + addq $8,%rsp + + ret + + + +.section __DATA,__const +.p2align 6 +L$bswap_mask: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +L$increment32: +.long 6,6,6,0 +L$increment64: +.long 1,0,0,0 +L$xts_magic: +.long 0x87,0,1,0 +L$increment1: +.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +L$key_rotate: +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d +L$key_rotate192: +.long 0x04070605,0x04070605,0x04070605,0x04070605 +L$key_rcon1: +.long 1,1,1,1 +L$key_rcon1b: +.long 0x1b,0x1b,0x1b,0x1b + +.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.p2align 6 
+.text +#endif diff --git a/third_party/boringssl/gen/bcm/aesni-x86_64-linux.S b/third_party/boringssl/gen/bcm/aesni-x86_64-linux.S new file mode 100644 index 00000000..4bce582c --- /dev/null +++ b/third_party/boringssl/gen/bcm/aesni-x86_64-linux.S @@ -0,0 +1,2395 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.text +.globl aes_hw_encrypt +.hidden aes_hw_encrypt +.type aes_hw_encrypt,@function +.align 16 +aes_hw_encrypt: +.cfi_startproc +_CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST +.extern BORINGSSL_function_hit +.hidden BORINGSSL_function_hit + movb $1,BORINGSSL_function_hit+1(%rip) +#endif + movups (%rdi),%xmm2 + movl 240(%rdx),%eax + movups (%rdx),%xmm0 + movups 16(%rdx),%xmm1 + leaq 32(%rdx),%rdx + xorps %xmm0,%xmm2 +.Loop_enc1_1: + aesenc %xmm1,%xmm2 + decl %eax + movups (%rdx),%xmm1 + leaq 16(%rdx),%rdx + jnz .Loop_enc1_1 + aesenclast %xmm1,%xmm2 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 + ret +.cfi_endproc +.size aes_hw_encrypt,.-aes_hw_encrypt + +.globl aes_hw_decrypt +.hidden aes_hw_decrypt +.type aes_hw_decrypt,@function +.align 16 +aes_hw_decrypt: +.cfi_startproc +_CET_ENDBR + movups (%rdi),%xmm2 + movl 240(%rdx),%eax + movups (%rdx),%xmm0 + movups 16(%rdx),%xmm1 + leaq 32(%rdx),%rdx + xorps %xmm0,%xmm2 +.Loop_dec1_2: + aesdec %xmm1,%xmm2 + decl %eax + movups (%rdx),%xmm1 + leaq 16(%rdx),%rdx + jnz .Loop_dec1_2 + aesdeclast %xmm1,%xmm2 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 + ret +.cfi_endproc +.size aes_hw_decrypt, .-aes_hw_decrypt +.type _aesni_encrypt2,@function +.align 16 +_aesni_encrypt2: +.cfi_startproc + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax + addq $16,%rax + +.Lenc_loop2: + aesenc 
%xmm1,%xmm2 + aesenc %xmm1,%xmm3 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + aesenc %xmm0,%xmm2 + aesenc %xmm0,%xmm3 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Lenc_loop2 + + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + aesenclast %xmm0,%xmm2 + aesenclast %xmm0,%xmm3 + ret +.cfi_endproc +.size _aesni_encrypt2,.-_aesni_encrypt2 +.type _aesni_decrypt2,@function +.align 16 +_aesni_decrypt2: +.cfi_startproc + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax + addq $16,%rax + +.Ldec_loop2: + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + aesdec %xmm0,%xmm2 + aesdec %xmm0,%xmm3 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Ldec_loop2 + + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + aesdeclast %xmm0,%xmm2 + aesdeclast %xmm0,%xmm3 + ret +.cfi_endproc +.size _aesni_decrypt2,.-_aesni_decrypt2 +.type _aesni_encrypt3,@function +.align 16 +_aesni_encrypt3: +.cfi_startproc + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + xorps %xmm0,%xmm4 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax + addq $16,%rax + +.Lenc_loop3: + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + aesenc %xmm1,%xmm4 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + aesenc %xmm0,%xmm2 + aesenc %xmm0,%xmm3 + aesenc %xmm0,%xmm4 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Lenc_loop3 + + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + aesenc %xmm1,%xmm4 + aesenclast %xmm0,%xmm2 + aesenclast %xmm0,%xmm3 + aesenclast %xmm0,%xmm4 + ret +.cfi_endproc +.size _aesni_encrypt3,.-_aesni_encrypt3 +.type _aesni_decrypt3,@function +.align 16 +_aesni_decrypt3: +.cfi_startproc + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + xorps %xmm0,%xmm4 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax + addq $16,%rax + +.Ldec_loop3: + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + aesdec 
%xmm1,%xmm4 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + aesdec %xmm0,%xmm2 + aesdec %xmm0,%xmm3 + aesdec %xmm0,%xmm4 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Ldec_loop3 + + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + aesdec %xmm1,%xmm4 + aesdeclast %xmm0,%xmm2 + aesdeclast %xmm0,%xmm3 + aesdeclast %xmm0,%xmm4 + ret +.cfi_endproc +.size _aesni_decrypt3,.-_aesni_decrypt3 +.type _aesni_encrypt4,@function +.align 16 +_aesni_encrypt4: +.cfi_startproc + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + xorps %xmm0,%xmm4 + xorps %xmm0,%xmm5 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax +.byte 0x0f,0x1f,0x00 + addq $16,%rax + +.Lenc_loop4: + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + aesenc %xmm1,%xmm4 + aesenc %xmm1,%xmm5 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + aesenc %xmm0,%xmm2 + aesenc %xmm0,%xmm3 + aesenc %xmm0,%xmm4 + aesenc %xmm0,%xmm5 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Lenc_loop4 + + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + aesenc %xmm1,%xmm4 + aesenc %xmm1,%xmm5 + aesenclast %xmm0,%xmm2 + aesenclast %xmm0,%xmm3 + aesenclast %xmm0,%xmm4 + aesenclast %xmm0,%xmm5 + ret +.cfi_endproc +.size _aesni_encrypt4,.-_aesni_encrypt4 +.type _aesni_decrypt4,@function +.align 16 +_aesni_decrypt4: +.cfi_startproc + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + xorps %xmm0,%xmm4 + xorps %xmm0,%xmm5 + movups 32(%rcx),%xmm0 + leaq 32(%rcx,%rax,1),%rcx + negq %rax +.byte 0x0f,0x1f,0x00 + addq $16,%rax + +.Ldec_loop4: + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + aesdec %xmm1,%xmm4 + aesdec %xmm1,%xmm5 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + aesdec %xmm0,%xmm2 + aesdec %xmm0,%xmm3 + aesdec %xmm0,%xmm4 + aesdec %xmm0,%xmm5 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Ldec_loop4 + + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + aesdec %xmm1,%xmm4 + aesdec %xmm1,%xmm5 + aesdeclast %xmm0,%xmm2 + aesdeclast %xmm0,%xmm3 + aesdeclast %xmm0,%xmm4 + aesdeclast 
%xmm0,%xmm5 + ret +.cfi_endproc +.size _aesni_decrypt4,.-_aesni_decrypt4 +.type _aesni_encrypt6,@function +.align 16 +_aesni_encrypt6: +.cfi_startproc + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 + aesenc %xmm1,%xmm2 + leaq 32(%rcx,%rax,1),%rcx + negq %rax + aesenc %xmm1,%xmm3 + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 + aesenc %xmm1,%xmm4 + pxor %xmm0,%xmm7 + movups (%rcx,%rax,1),%xmm0 + addq $16,%rax + jmp .Lenc_loop6_enter +.align 16 +.Lenc_loop6: + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + aesenc %xmm1,%xmm4 +.Lenc_loop6_enter: + aesenc %xmm1,%xmm5 + aesenc %xmm1,%xmm6 + aesenc %xmm1,%xmm7 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + aesenc %xmm0,%xmm2 + aesenc %xmm0,%xmm3 + aesenc %xmm0,%xmm4 + aesenc %xmm0,%xmm5 + aesenc %xmm0,%xmm6 + aesenc %xmm0,%xmm7 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Lenc_loop6 + + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + aesenc %xmm1,%xmm4 + aesenc %xmm1,%xmm5 + aesenc %xmm1,%xmm6 + aesenc %xmm1,%xmm7 + aesenclast %xmm0,%xmm2 + aesenclast %xmm0,%xmm3 + aesenclast %xmm0,%xmm4 + aesenclast %xmm0,%xmm5 + aesenclast %xmm0,%xmm6 + aesenclast %xmm0,%xmm7 + ret +.cfi_endproc +.size _aesni_encrypt6,.-_aesni_encrypt6 +.type _aesni_decrypt6,@function +.align 16 +_aesni_decrypt6: +.cfi_startproc + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + pxor %xmm0,%xmm3 + pxor %xmm0,%xmm4 + aesdec %xmm1,%xmm2 + leaq 32(%rcx,%rax,1),%rcx + negq %rax + aesdec %xmm1,%xmm3 + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 + aesdec %xmm1,%xmm4 + pxor %xmm0,%xmm7 + movups (%rcx,%rax,1),%xmm0 + addq $16,%rax + jmp .Ldec_loop6_enter +.align 16 +.Ldec_loop6: + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + aesdec %xmm1,%xmm4 +.Ldec_loop6_enter: + aesdec %xmm1,%xmm5 + aesdec %xmm1,%xmm6 + aesdec %xmm1,%xmm7 + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + aesdec %xmm0,%xmm2 + aesdec %xmm0,%xmm3 + aesdec %xmm0,%xmm4 + aesdec %xmm0,%xmm5 + aesdec %xmm0,%xmm6 + aesdec %xmm0,%xmm7 + 
movups -16(%rcx,%rax,1),%xmm0 + jnz .Ldec_loop6 + + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + aesdec %xmm1,%xmm4 + aesdec %xmm1,%xmm5 + aesdec %xmm1,%xmm6 + aesdec %xmm1,%xmm7 + aesdeclast %xmm0,%xmm2 + aesdeclast %xmm0,%xmm3 + aesdeclast %xmm0,%xmm4 + aesdeclast %xmm0,%xmm5 + aesdeclast %xmm0,%xmm6 + aesdeclast %xmm0,%xmm7 + ret +.cfi_endproc +.size _aesni_decrypt6,.-_aesni_decrypt6 +.type _aesni_encrypt8,@function +.align 16 +_aesni_encrypt8: +.cfi_startproc + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + pxor %xmm0,%xmm4 + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 + leaq 32(%rcx,%rax,1),%rcx + negq %rax + aesenc %xmm1,%xmm2 + pxor %xmm0,%xmm7 + pxor %xmm0,%xmm8 + aesenc %xmm1,%xmm3 + pxor %xmm0,%xmm9 + movups (%rcx,%rax,1),%xmm0 + addq $16,%rax + jmp .Lenc_loop8_inner +.align 16 +.Lenc_loop8: + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 +.Lenc_loop8_inner: + aesenc %xmm1,%xmm4 + aesenc %xmm1,%xmm5 + aesenc %xmm1,%xmm6 + aesenc %xmm1,%xmm7 + aesenc %xmm1,%xmm8 + aesenc %xmm1,%xmm9 +.Lenc_loop8_enter: + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + aesenc %xmm0,%xmm2 + aesenc %xmm0,%xmm3 + aesenc %xmm0,%xmm4 + aesenc %xmm0,%xmm5 + aesenc %xmm0,%xmm6 + aesenc %xmm0,%xmm7 + aesenc %xmm0,%xmm8 + aesenc %xmm0,%xmm9 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Lenc_loop8 + + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + aesenc %xmm1,%xmm4 + aesenc %xmm1,%xmm5 + aesenc %xmm1,%xmm6 + aesenc %xmm1,%xmm7 + aesenc %xmm1,%xmm8 + aesenc %xmm1,%xmm9 + aesenclast %xmm0,%xmm2 + aesenclast %xmm0,%xmm3 + aesenclast %xmm0,%xmm4 + aesenclast %xmm0,%xmm5 + aesenclast %xmm0,%xmm6 + aesenclast %xmm0,%xmm7 + aesenclast %xmm0,%xmm8 + aesenclast %xmm0,%xmm9 + ret +.cfi_endproc +.size _aesni_encrypt8,.-_aesni_encrypt8 +.type _aesni_decrypt8,@function +.align 16 +_aesni_decrypt8: +.cfi_startproc + movups (%rcx),%xmm0 + shll $4,%eax + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm2 + xorps %xmm0,%xmm3 + pxor %xmm0,%xmm4 + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 + 
leaq 32(%rcx,%rax,1),%rcx + negq %rax + aesdec %xmm1,%xmm2 + pxor %xmm0,%xmm7 + pxor %xmm0,%xmm8 + aesdec %xmm1,%xmm3 + pxor %xmm0,%xmm9 + movups (%rcx,%rax,1),%xmm0 + addq $16,%rax + jmp .Ldec_loop8_inner +.align 16 +.Ldec_loop8: + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 +.Ldec_loop8_inner: + aesdec %xmm1,%xmm4 + aesdec %xmm1,%xmm5 + aesdec %xmm1,%xmm6 + aesdec %xmm1,%xmm7 + aesdec %xmm1,%xmm8 + aesdec %xmm1,%xmm9 +.Ldec_loop8_enter: + movups (%rcx,%rax,1),%xmm1 + addq $32,%rax + aesdec %xmm0,%xmm2 + aesdec %xmm0,%xmm3 + aesdec %xmm0,%xmm4 + aesdec %xmm0,%xmm5 + aesdec %xmm0,%xmm6 + aesdec %xmm0,%xmm7 + aesdec %xmm0,%xmm8 + aesdec %xmm0,%xmm9 + movups -16(%rcx,%rax,1),%xmm0 + jnz .Ldec_loop8 + + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + aesdec %xmm1,%xmm4 + aesdec %xmm1,%xmm5 + aesdec %xmm1,%xmm6 + aesdec %xmm1,%xmm7 + aesdec %xmm1,%xmm8 + aesdec %xmm1,%xmm9 + aesdeclast %xmm0,%xmm2 + aesdeclast %xmm0,%xmm3 + aesdeclast %xmm0,%xmm4 + aesdeclast %xmm0,%xmm5 + aesdeclast %xmm0,%xmm6 + aesdeclast %xmm0,%xmm7 + aesdeclast %xmm0,%xmm8 + aesdeclast %xmm0,%xmm9 + ret +.cfi_endproc +.size _aesni_decrypt8,.-_aesni_decrypt8 +.globl aes_hw_ecb_encrypt +.hidden aes_hw_ecb_encrypt +.type aes_hw_ecb_encrypt,@function +.align 16 +aes_hw_ecb_encrypt: +.cfi_startproc +_CET_ENDBR + andq $-16,%rdx + jz .Lecb_ret + + movl 240(%rcx),%eax + movups (%rcx),%xmm0 + movq %rcx,%r11 + movl %eax,%r10d + testl %r8d,%r8d + jz .Lecb_decrypt + + cmpq $0x80,%rdx + jb .Lecb_enc_tail + + movdqu (%rdi),%xmm2 + movdqu 16(%rdi),%xmm3 + movdqu 32(%rdi),%xmm4 + movdqu 48(%rdi),%xmm5 + movdqu 64(%rdi),%xmm6 + movdqu 80(%rdi),%xmm7 + movdqu 96(%rdi),%xmm8 + movdqu 112(%rdi),%xmm9 + leaq 128(%rdi),%rdi + subq $0x80,%rdx + jmp .Lecb_enc_loop8_enter +.align 16 +.Lecb_enc_loop8: + movups %xmm2,(%rsi) + movq %r11,%rcx + movdqu (%rdi),%xmm2 + movl %r10d,%eax + movups %xmm3,16(%rsi) + movdqu 16(%rdi),%xmm3 + movups %xmm4,32(%rsi) + movdqu 32(%rdi),%xmm4 + movups %xmm5,48(%rsi) + movdqu 48(%rdi),%xmm5 + movups 
%xmm6,64(%rsi) + movdqu 64(%rdi),%xmm6 + movups %xmm7,80(%rsi) + movdqu 80(%rdi),%xmm7 + movups %xmm8,96(%rsi) + movdqu 96(%rdi),%xmm8 + movups %xmm9,112(%rsi) + leaq 128(%rsi),%rsi + movdqu 112(%rdi),%xmm9 + leaq 128(%rdi),%rdi +.Lecb_enc_loop8_enter: + + call _aesni_encrypt8 + + subq $0x80,%rdx + jnc .Lecb_enc_loop8 + + movups %xmm2,(%rsi) + movq %r11,%rcx + movups %xmm3,16(%rsi) + movl %r10d,%eax + movups %xmm4,32(%rsi) + movups %xmm5,48(%rsi) + movups %xmm6,64(%rsi) + movups %xmm7,80(%rsi) + movups %xmm8,96(%rsi) + movups %xmm9,112(%rsi) + leaq 128(%rsi),%rsi + addq $0x80,%rdx + jz .Lecb_ret + +.Lecb_enc_tail: + movups (%rdi),%xmm2 + cmpq $0x20,%rdx + jb .Lecb_enc_one + movups 16(%rdi),%xmm3 + je .Lecb_enc_two + movups 32(%rdi),%xmm4 + cmpq $0x40,%rdx + jb .Lecb_enc_three + movups 48(%rdi),%xmm5 + je .Lecb_enc_four + movups 64(%rdi),%xmm6 + cmpq $0x60,%rdx + jb .Lecb_enc_five + movups 80(%rdi),%xmm7 + je .Lecb_enc_six + movdqu 96(%rdi),%xmm8 + xorps %xmm9,%xmm9 + call _aesni_encrypt8 + movups %xmm2,(%rsi) + movups %xmm3,16(%rsi) + movups %xmm4,32(%rsi) + movups %xmm5,48(%rsi) + movups %xmm6,64(%rsi) + movups %xmm7,80(%rsi) + movups %xmm8,96(%rsi) + jmp .Lecb_ret +.align 16 +.Lecb_enc_one: + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + xorps %xmm0,%xmm2 +.Loop_enc1_3: + aesenc %xmm1,%xmm2 + decl %eax + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz .Loop_enc1_3 + aesenclast %xmm1,%xmm2 + movups %xmm2,(%rsi) + jmp .Lecb_ret +.align 16 +.Lecb_enc_two: + call _aesni_encrypt2 + movups %xmm2,(%rsi) + movups %xmm3,16(%rsi) + jmp .Lecb_ret +.align 16 +.Lecb_enc_three: + call _aesni_encrypt3 + movups %xmm2,(%rsi) + movups %xmm3,16(%rsi) + movups %xmm4,32(%rsi) + jmp .Lecb_ret +.align 16 +.Lecb_enc_four: + call _aesni_encrypt4 + movups %xmm2,(%rsi) + movups %xmm3,16(%rsi) + movups %xmm4,32(%rsi) + movups %xmm5,48(%rsi) + jmp .Lecb_ret +.align 16 +.Lecb_enc_five: + xorps %xmm7,%xmm7 + call _aesni_encrypt6 + movups %xmm2,(%rsi) + movups %xmm3,16(%rsi) 
+ movups %xmm4,32(%rsi) + movups %xmm5,48(%rsi) + movups %xmm6,64(%rsi) + jmp .Lecb_ret +.align 16 +.Lecb_enc_six: + call _aesni_encrypt6 + movups %xmm2,(%rsi) + movups %xmm3,16(%rsi) + movups %xmm4,32(%rsi) + movups %xmm5,48(%rsi) + movups %xmm6,64(%rsi) + movups %xmm7,80(%rsi) + jmp .Lecb_ret + +.align 16 +.Lecb_decrypt: + cmpq $0x80,%rdx + jb .Lecb_dec_tail + + movdqu (%rdi),%xmm2 + movdqu 16(%rdi),%xmm3 + movdqu 32(%rdi),%xmm4 + movdqu 48(%rdi),%xmm5 + movdqu 64(%rdi),%xmm6 + movdqu 80(%rdi),%xmm7 + movdqu 96(%rdi),%xmm8 + movdqu 112(%rdi),%xmm9 + leaq 128(%rdi),%rdi + subq $0x80,%rdx + jmp .Lecb_dec_loop8_enter +.align 16 +.Lecb_dec_loop8: + movups %xmm2,(%rsi) + movq %r11,%rcx + movdqu (%rdi),%xmm2 + movl %r10d,%eax + movups %xmm3,16(%rsi) + movdqu 16(%rdi),%xmm3 + movups %xmm4,32(%rsi) + movdqu 32(%rdi),%xmm4 + movups %xmm5,48(%rsi) + movdqu 48(%rdi),%xmm5 + movups %xmm6,64(%rsi) + movdqu 64(%rdi),%xmm6 + movups %xmm7,80(%rsi) + movdqu 80(%rdi),%xmm7 + movups %xmm8,96(%rsi) + movdqu 96(%rdi),%xmm8 + movups %xmm9,112(%rsi) + leaq 128(%rsi),%rsi + movdqu 112(%rdi),%xmm9 + leaq 128(%rdi),%rdi +.Lecb_dec_loop8_enter: + + call _aesni_decrypt8 + + movups (%r11),%xmm0 + subq $0x80,%rdx + jnc .Lecb_dec_loop8 + + movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 + movq %r11,%rcx + movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 + movl %r10d,%eax + movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 + movups %xmm5,48(%rsi) + pxor %xmm5,%xmm5 + movups %xmm6,64(%rsi) + pxor %xmm6,%xmm6 + movups %xmm7,80(%rsi) + pxor %xmm7,%xmm7 + movups %xmm8,96(%rsi) + pxor %xmm8,%xmm8 + movups %xmm9,112(%rsi) + pxor %xmm9,%xmm9 + leaq 128(%rsi),%rsi + addq $0x80,%rdx + jz .Lecb_ret + +.Lecb_dec_tail: + movups (%rdi),%xmm2 + cmpq $0x20,%rdx + jb .Lecb_dec_one + movups 16(%rdi),%xmm3 + je .Lecb_dec_two + movups 32(%rdi),%xmm4 + cmpq $0x40,%rdx + jb .Lecb_dec_three + movups 48(%rdi),%xmm5 + je .Lecb_dec_four + movups 64(%rdi),%xmm6 + cmpq $0x60,%rdx + jb .Lecb_dec_five + movups 80(%rdi),%xmm7 + je .Lecb_dec_six + 
movups 96(%rdi),%xmm8 + movups (%rcx),%xmm0 + xorps %xmm9,%xmm9 + call _aesni_decrypt8 + movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 + movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 + movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 + movups %xmm5,48(%rsi) + pxor %xmm5,%xmm5 + movups %xmm6,64(%rsi) + pxor %xmm6,%xmm6 + movups %xmm7,80(%rsi) + pxor %xmm7,%xmm7 + movups %xmm8,96(%rsi) + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 + jmp .Lecb_ret +.align 16 +.Lecb_dec_one: + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + xorps %xmm0,%xmm2 +.Loop_dec1_4: + aesdec %xmm1,%xmm2 + decl %eax + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz .Loop_dec1_4 + aesdeclast %xmm1,%xmm2 + movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 + jmp .Lecb_ret +.align 16 +.Lecb_dec_two: + call _aesni_decrypt2 + movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 + movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 + jmp .Lecb_ret +.align 16 +.Lecb_dec_three: + call _aesni_decrypt3 + movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 + movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 + movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 + jmp .Lecb_ret +.align 16 +.Lecb_dec_four: + call _aesni_decrypt4 + movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 + movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 + movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 + movups %xmm5,48(%rsi) + pxor %xmm5,%xmm5 + jmp .Lecb_ret +.align 16 +.Lecb_dec_five: + xorps %xmm7,%xmm7 + call _aesni_decrypt6 + movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 + movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 + movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 + movups %xmm5,48(%rsi) + pxor %xmm5,%xmm5 + movups %xmm6,64(%rsi) + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + jmp .Lecb_ret +.align 16 +.Lecb_dec_six: + call _aesni_decrypt6 + movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 + movups %xmm3,16(%rsi) + pxor %xmm3,%xmm3 + movups %xmm4,32(%rsi) + pxor %xmm4,%xmm4 + movups %xmm5,48(%rsi) + pxor %xmm5,%xmm5 + movups %xmm6,64(%rsi) + pxor %xmm6,%xmm6 + movups %xmm7,80(%rsi) + pxor %xmm7,%xmm7 + +.Lecb_ret: + xorps %xmm0,%xmm0 + pxor %xmm1,%xmm1 + ret 
+.cfi_endproc +.size aes_hw_ecb_encrypt,.-aes_hw_ecb_encrypt +.globl aes_hw_ctr32_encrypt_blocks +.hidden aes_hw_ctr32_encrypt_blocks +.type aes_hw_ctr32_encrypt_blocks,@function +.align 16 +aes_hw_ctr32_encrypt_blocks: +.cfi_startproc +_CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST + movb $1,BORINGSSL_function_hit(%rip) +#endif + cmpq $1,%rdx + jne .Lctr32_bulk + + + + movups (%r8),%xmm2 + movups (%rdi),%xmm3 + movl 240(%rcx),%edx + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + xorps %xmm0,%xmm2 +.Loop_enc1_5: + aesenc %xmm1,%xmm2 + decl %edx + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz .Loop_enc1_5 + aesenclast %xmm1,%xmm2 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + xorps %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movups %xmm2,(%rsi) + xorps %xmm2,%xmm2 + jmp .Lctr32_epilogue + +.align 16 +.Lctr32_bulk: + leaq (%rsp),%r11 +.cfi_def_cfa_register %r11 + pushq %rbp +.cfi_offset %rbp,-16 + subq $128,%rsp + andq $-16,%rsp + + + + + movdqu (%r8),%xmm2 + movdqu (%rcx),%xmm0 + movl 12(%r8),%r8d + pxor %xmm0,%xmm2 + movl 12(%rcx),%ebp + movdqa %xmm2,0(%rsp) + bswapl %r8d + movdqa %xmm2,%xmm3 + movdqa %xmm2,%xmm4 + movdqa %xmm2,%xmm5 + movdqa %xmm2,64(%rsp) + movdqa %xmm2,80(%rsp) + movdqa %xmm2,96(%rsp) + movq %rdx,%r10 + movdqa %xmm2,112(%rsp) + + leaq 1(%r8),%rax + leaq 2(%r8),%rdx + bswapl %eax + bswapl %edx + xorl %ebp,%eax + xorl %ebp,%edx + pinsrd $3,%eax,%xmm3 + leaq 3(%r8),%rax + movdqa %xmm3,16(%rsp) + pinsrd $3,%edx,%xmm4 + bswapl %eax + movq %r10,%rdx + leaq 4(%r8),%r10 + movdqa %xmm4,32(%rsp) + xorl %ebp,%eax + bswapl %r10d + pinsrd $3,%eax,%xmm5 + xorl %ebp,%r10d + movdqa %xmm5,48(%rsp) + leaq 5(%r8),%r9 + movl %r10d,64+12(%rsp) + bswapl %r9d + leaq 6(%r8),%r10 + movl 240(%rcx),%eax + xorl %ebp,%r9d + bswapl %r10d + movl %r9d,80+12(%rsp) + xorl %ebp,%r10d + leaq 7(%r8),%r9 + movl %r10d,96+12(%rsp) + bswapl %r9d + xorl %ebp,%r9d + movl %r9d,112+12(%rsp) + + movups 16(%rcx),%xmm1 + + movdqa 64(%rsp),%xmm6 + movdqa 80(%rsp),%xmm7 + + cmpq $8,%rdx + jb 
.Lctr32_tail + + leaq 128(%rcx),%rcx + subq $8,%rdx + jmp .Lctr32_loop8 + +.align 32 +.Lctr32_loop8: + addl $8,%r8d + movdqa 96(%rsp),%xmm8 + aesenc %xmm1,%xmm2 + movl %r8d,%r9d + movdqa 112(%rsp),%xmm9 + aesenc %xmm1,%xmm3 + bswapl %r9d + movups 32-128(%rcx),%xmm0 + aesenc %xmm1,%xmm4 + xorl %ebp,%r9d + nop + aesenc %xmm1,%xmm5 + movl %r9d,0+12(%rsp) + leaq 1(%r8),%r9 + aesenc %xmm1,%xmm6 + aesenc %xmm1,%xmm7 + aesenc %xmm1,%xmm8 + aesenc %xmm1,%xmm9 + movups 48-128(%rcx),%xmm1 + bswapl %r9d + aesenc %xmm0,%xmm2 + aesenc %xmm0,%xmm3 + xorl %ebp,%r9d +.byte 0x66,0x90 + aesenc %xmm0,%xmm4 + aesenc %xmm0,%xmm5 + movl %r9d,16+12(%rsp) + leaq 2(%r8),%r9 + aesenc %xmm0,%xmm6 + aesenc %xmm0,%xmm7 + aesenc %xmm0,%xmm8 + aesenc %xmm0,%xmm9 + movups 64-128(%rcx),%xmm0 + bswapl %r9d + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + xorl %ebp,%r9d +.byte 0x66,0x90 + aesenc %xmm1,%xmm4 + aesenc %xmm1,%xmm5 + movl %r9d,32+12(%rsp) + leaq 3(%r8),%r9 + aesenc %xmm1,%xmm6 + aesenc %xmm1,%xmm7 + aesenc %xmm1,%xmm8 + aesenc %xmm1,%xmm9 + movups 80-128(%rcx),%xmm1 + bswapl %r9d + aesenc %xmm0,%xmm2 + aesenc %xmm0,%xmm3 + xorl %ebp,%r9d +.byte 0x66,0x90 + aesenc %xmm0,%xmm4 + aesenc %xmm0,%xmm5 + movl %r9d,48+12(%rsp) + leaq 4(%r8),%r9 + aesenc %xmm0,%xmm6 + aesenc %xmm0,%xmm7 + aesenc %xmm0,%xmm8 + aesenc %xmm0,%xmm9 + movups 96-128(%rcx),%xmm0 + bswapl %r9d + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + xorl %ebp,%r9d +.byte 0x66,0x90 + aesenc %xmm1,%xmm4 + aesenc %xmm1,%xmm5 + movl %r9d,64+12(%rsp) + leaq 5(%r8),%r9 + aesenc %xmm1,%xmm6 + aesenc %xmm1,%xmm7 + aesenc %xmm1,%xmm8 + aesenc %xmm1,%xmm9 + movups 112-128(%rcx),%xmm1 + bswapl %r9d + aesenc %xmm0,%xmm2 + aesenc %xmm0,%xmm3 + xorl %ebp,%r9d +.byte 0x66,0x90 + aesenc %xmm0,%xmm4 + aesenc %xmm0,%xmm5 + movl %r9d,80+12(%rsp) + leaq 6(%r8),%r9 + aesenc %xmm0,%xmm6 + aesenc %xmm0,%xmm7 + aesenc %xmm0,%xmm8 + aesenc %xmm0,%xmm9 + movups 128-128(%rcx),%xmm0 + bswapl %r9d + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + xorl %ebp,%r9d +.byte 
0x66,0x90 + aesenc %xmm1,%xmm4 + aesenc %xmm1,%xmm5 + movl %r9d,96+12(%rsp) + leaq 7(%r8),%r9 + aesenc %xmm1,%xmm6 + aesenc %xmm1,%xmm7 + aesenc %xmm1,%xmm8 + aesenc %xmm1,%xmm9 + movups 144-128(%rcx),%xmm1 + bswapl %r9d + aesenc %xmm0,%xmm2 + aesenc %xmm0,%xmm3 + aesenc %xmm0,%xmm4 + xorl %ebp,%r9d + movdqu 0(%rdi),%xmm10 + aesenc %xmm0,%xmm5 + movl %r9d,112+12(%rsp) + cmpl $11,%eax + aesenc %xmm0,%xmm6 + aesenc %xmm0,%xmm7 + aesenc %xmm0,%xmm8 + aesenc %xmm0,%xmm9 + movups 160-128(%rcx),%xmm0 + + jb .Lctr32_enc_done + + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + aesenc %xmm1,%xmm4 + aesenc %xmm1,%xmm5 + aesenc %xmm1,%xmm6 + aesenc %xmm1,%xmm7 + aesenc %xmm1,%xmm8 + aesenc %xmm1,%xmm9 + movups 176-128(%rcx),%xmm1 + + aesenc %xmm0,%xmm2 + aesenc %xmm0,%xmm3 + aesenc %xmm0,%xmm4 + aesenc %xmm0,%xmm5 + aesenc %xmm0,%xmm6 + aesenc %xmm0,%xmm7 + aesenc %xmm0,%xmm8 + aesenc %xmm0,%xmm9 + movups 192-128(%rcx),%xmm0 + je .Lctr32_enc_done + + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + aesenc %xmm1,%xmm4 + aesenc %xmm1,%xmm5 + aesenc %xmm1,%xmm6 + aesenc %xmm1,%xmm7 + aesenc %xmm1,%xmm8 + aesenc %xmm1,%xmm9 + movups 208-128(%rcx),%xmm1 + + aesenc %xmm0,%xmm2 + aesenc %xmm0,%xmm3 + aesenc %xmm0,%xmm4 + aesenc %xmm0,%xmm5 + aesenc %xmm0,%xmm6 + aesenc %xmm0,%xmm7 + aesenc %xmm0,%xmm8 + aesenc %xmm0,%xmm9 + movups 224-128(%rcx),%xmm0 + jmp .Lctr32_enc_done + +.align 16 +.Lctr32_enc_done: + movdqu 16(%rdi),%xmm11 + pxor %xmm0,%xmm10 + movdqu 32(%rdi),%xmm12 + pxor %xmm0,%xmm11 + movdqu 48(%rdi),%xmm13 + pxor %xmm0,%xmm12 + movdqu 64(%rdi),%xmm14 + pxor %xmm0,%xmm13 + movdqu 80(%rdi),%xmm15 + pxor %xmm0,%xmm14 + prefetcht0 448(%rdi) + prefetcht0 512(%rdi) + pxor %xmm0,%xmm15 + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + aesenc %xmm1,%xmm4 + aesenc %xmm1,%xmm5 + aesenc %xmm1,%xmm6 + aesenc %xmm1,%xmm7 + aesenc %xmm1,%xmm8 + aesenc %xmm1,%xmm9 + movdqu 96(%rdi),%xmm1 + leaq 128(%rdi),%rdi + + aesenclast %xmm10,%xmm2 + pxor %xmm0,%xmm1 + movdqu 112-128(%rdi),%xmm10 + aesenclast 
%xmm11,%xmm3 + pxor %xmm0,%xmm10 + movdqa 0(%rsp),%xmm11 + aesenclast %xmm12,%xmm4 + aesenclast %xmm13,%xmm5 + movdqa 16(%rsp),%xmm12 + movdqa 32(%rsp),%xmm13 + aesenclast %xmm14,%xmm6 + aesenclast %xmm15,%xmm7 + movdqa 48(%rsp),%xmm14 + movdqa 64(%rsp),%xmm15 + aesenclast %xmm1,%xmm8 + movdqa 80(%rsp),%xmm0 + movups 16-128(%rcx),%xmm1 + aesenclast %xmm10,%xmm9 + + movups %xmm2,(%rsi) + movdqa %xmm11,%xmm2 + movups %xmm3,16(%rsi) + movdqa %xmm12,%xmm3 + movups %xmm4,32(%rsi) + movdqa %xmm13,%xmm4 + movups %xmm5,48(%rsi) + movdqa %xmm14,%xmm5 + movups %xmm6,64(%rsi) + movdqa %xmm15,%xmm6 + movups %xmm7,80(%rsi) + movdqa %xmm0,%xmm7 + movups %xmm8,96(%rsi) + movups %xmm9,112(%rsi) + leaq 128(%rsi),%rsi + + subq $8,%rdx + jnc .Lctr32_loop8 + + addq $8,%rdx + jz .Lctr32_done + leaq -128(%rcx),%rcx + +.Lctr32_tail: + + + leaq 16(%rcx),%rcx + cmpq $4,%rdx + jb .Lctr32_loop3 + je .Lctr32_loop4 + + + shll $4,%eax + movdqa 96(%rsp),%xmm8 + pxor %xmm9,%xmm9 + + movups 16(%rcx),%xmm0 + aesenc %xmm1,%xmm2 + aesenc %xmm1,%xmm3 + leaq 32-16(%rcx,%rax,1),%rcx + negq %rax + aesenc %xmm1,%xmm4 + addq $16,%rax + movups (%rdi),%xmm10 + aesenc %xmm1,%xmm5 + aesenc %xmm1,%xmm6 + movups 16(%rdi),%xmm11 + movups 32(%rdi),%xmm12 + aesenc %xmm1,%xmm7 + aesenc %xmm1,%xmm8 + + call .Lenc_loop8_enter + + movdqu 48(%rdi),%xmm13 + pxor %xmm10,%xmm2 + movdqu 64(%rdi),%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm10,%xmm6 + movdqu %xmm5,48(%rsi) + movdqu %xmm6,64(%rsi) + cmpq $6,%rdx + jb .Lctr32_done + + movups 80(%rdi),%xmm11 + xorps %xmm11,%xmm7 + movups %xmm7,80(%rsi) + je .Lctr32_done + + movups 96(%rdi),%xmm12 + xorps %xmm12,%xmm8 + movups %xmm8,96(%rsi) + jmp .Lctr32_done + +.align 32 +.Lctr32_loop4: + aesenc %xmm1,%xmm2 + leaq 16(%rcx),%rcx + decl %eax + aesenc %xmm1,%xmm3 + aesenc %xmm1,%xmm4 + aesenc %xmm1,%xmm5 + movups (%rcx),%xmm1 + jnz .Lctr32_loop4 + aesenclast %xmm1,%xmm2 + 
aesenclast %xmm1,%xmm3 + movups (%rdi),%xmm10 + movups 16(%rdi),%xmm11 + aesenclast %xmm1,%xmm4 + aesenclast %xmm1,%xmm5 + movups 32(%rdi),%xmm12 + movups 48(%rdi),%xmm13 + + xorps %xmm10,%xmm2 + movups %xmm2,(%rsi) + xorps %xmm11,%xmm3 + movups %xmm3,16(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm4,32(%rsi) + pxor %xmm13,%xmm5 + movdqu %xmm5,48(%rsi) + jmp .Lctr32_done + +.align 32 +.Lctr32_loop3: + aesenc %xmm1,%xmm2 + leaq 16(%rcx),%rcx + decl %eax + aesenc %xmm1,%xmm3 + aesenc %xmm1,%xmm4 + movups (%rcx),%xmm1 + jnz .Lctr32_loop3 + aesenclast %xmm1,%xmm2 + aesenclast %xmm1,%xmm3 + aesenclast %xmm1,%xmm4 + + movups (%rdi),%xmm10 + xorps %xmm10,%xmm2 + movups %xmm2,(%rsi) + cmpq $2,%rdx + jb .Lctr32_done + + movups 16(%rdi),%xmm11 + xorps %xmm11,%xmm3 + movups %xmm3,16(%rsi) + je .Lctr32_done + + movups 32(%rdi),%xmm12 + xorps %xmm12,%xmm4 + movups %xmm4,32(%rsi) + +.Lctr32_done: + xorps %xmm0,%xmm0 + xorl %ebp,%ebp + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + movaps %xmm0,0(%rsp) + pxor %xmm8,%xmm8 + movaps %xmm0,16(%rsp) + pxor %xmm9,%xmm9 + movaps %xmm0,32(%rsp) + pxor %xmm10,%xmm10 + movaps %xmm0,48(%rsp) + pxor %xmm11,%xmm11 + movaps %xmm0,64(%rsp) + pxor %xmm12,%xmm12 + movaps %xmm0,80(%rsp) + pxor %xmm13,%xmm13 + movaps %xmm0,96(%rsp) + pxor %xmm14,%xmm14 + movaps %xmm0,112(%rsp) + pxor %xmm15,%xmm15 + movq -8(%r11),%rbp +.cfi_restore %rbp + leaq (%r11),%rsp +.cfi_def_cfa_register %rsp +.Lctr32_epilogue: + ret +.cfi_endproc +.size aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks +.globl aes_hw_cbc_encrypt +.hidden aes_hw_cbc_encrypt +.type aes_hw_cbc_encrypt,@function +.align 16 +aes_hw_cbc_encrypt: +.cfi_startproc +_CET_ENDBR + testq %rdx,%rdx + jz .Lcbc_ret + + movl 240(%rcx),%r10d + movq %rcx,%r11 + testl %r9d,%r9d + jz .Lcbc_decrypt + + movups (%r8),%xmm2 + movl %r10d,%eax + cmpq $16,%rdx + jb .Lcbc_enc_tail + subq $16,%rdx + jmp .Lcbc_enc_loop +.align 16 
+.Lcbc_enc_loop: + movups (%rdi),%xmm3 + leaq 16(%rdi),%rdi + + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + xorps %xmm0,%xmm3 + leaq 32(%rcx),%rcx + xorps %xmm3,%xmm2 +.Loop_enc1_6: + aesenc %xmm1,%xmm2 + decl %eax + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz .Loop_enc1_6 + aesenclast %xmm1,%xmm2 + movl %r10d,%eax + movq %r11,%rcx + movups %xmm2,0(%rsi) + leaq 16(%rsi),%rsi + subq $16,%rdx + jnc .Lcbc_enc_loop + addq $16,%rdx + jnz .Lcbc_enc_tail + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + movups %xmm2,(%r8) + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + jmp .Lcbc_ret + +.Lcbc_enc_tail: + movq %rdx,%rcx + xchgq %rdi,%rsi +.long 0x9066A4F3 + movl $16,%ecx + subq %rdx,%rcx + xorl %eax,%eax +.long 0x9066AAF3 + leaq -16(%rdi),%rdi + movl %r10d,%eax + movq %rdi,%rsi + movq %r11,%rcx + xorq %rdx,%rdx + jmp .Lcbc_enc_loop + +.align 16 +.Lcbc_decrypt: + cmpq $16,%rdx + jne .Lcbc_decrypt_bulk + + + + movdqu (%rdi),%xmm2 + movdqu (%r8),%xmm3 + movdqa %xmm2,%xmm4 + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + xorps %xmm0,%xmm2 +.Loop_dec1_7: + aesdec %xmm1,%xmm2 + decl %r10d + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz .Loop_dec1_7 + aesdeclast %xmm1,%xmm2 + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + movdqu %xmm4,(%r8) + xorps %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 + jmp .Lcbc_ret +.align 16 +.Lcbc_decrypt_bulk: + leaq (%rsp),%r11 +.cfi_def_cfa_register %r11 + pushq %rbp +.cfi_offset %rbp,-16 + subq $16,%rsp + andq $-16,%rsp + movq %rcx,%rbp + movups (%r8),%xmm10 + movl %r10d,%eax + cmpq $0x50,%rdx + jbe .Lcbc_dec_tail + + movups (%rcx),%xmm0 + movdqu 0(%rdi),%xmm2 + movdqu 16(%rdi),%xmm3 + movdqa %xmm2,%xmm11 + movdqu 32(%rdi),%xmm4 + movdqa %xmm3,%xmm12 + movdqu 48(%rdi),%xmm5 + movdqa %xmm4,%xmm13 + movdqu 64(%rdi),%xmm6 + movdqa %xmm5,%xmm14 + movdqu 80(%rdi),%xmm7 + movdqa %xmm6,%xmm15 + cmpq $0x70,%rdx + jbe .Lcbc_dec_six_or_seven + + subq $0x70,%rdx + leaq 112(%rcx),%rcx + jmp .Lcbc_dec_loop8_enter +.align 16 
+.Lcbc_dec_loop8: + movups %xmm9,(%rsi) + leaq 16(%rsi),%rsi +.Lcbc_dec_loop8_enter: + movdqu 96(%rdi),%xmm8 + pxor %xmm0,%xmm2 + movdqu 112(%rdi),%xmm9 + pxor %xmm0,%xmm3 + movups 16-112(%rcx),%xmm1 + pxor %xmm0,%xmm4 + movq $-1,%rbp + cmpq $0x70,%rdx + pxor %xmm0,%xmm5 + pxor %xmm0,%xmm6 + pxor %xmm0,%xmm7 + pxor %xmm0,%xmm8 + + aesdec %xmm1,%xmm2 + pxor %xmm0,%xmm9 + movups 32-112(%rcx),%xmm0 + aesdec %xmm1,%xmm3 + aesdec %xmm1,%xmm4 + aesdec %xmm1,%xmm5 + aesdec %xmm1,%xmm6 + aesdec %xmm1,%xmm7 + aesdec %xmm1,%xmm8 + adcq $0,%rbp + andq $128,%rbp + aesdec %xmm1,%xmm9 + addq %rdi,%rbp + movups 48-112(%rcx),%xmm1 + aesdec %xmm0,%xmm2 + aesdec %xmm0,%xmm3 + aesdec %xmm0,%xmm4 + aesdec %xmm0,%xmm5 + aesdec %xmm0,%xmm6 + aesdec %xmm0,%xmm7 + aesdec %xmm0,%xmm8 + aesdec %xmm0,%xmm9 + movups 64-112(%rcx),%xmm0 + nop + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + aesdec %xmm1,%xmm4 + aesdec %xmm1,%xmm5 + aesdec %xmm1,%xmm6 + aesdec %xmm1,%xmm7 + aesdec %xmm1,%xmm8 + aesdec %xmm1,%xmm9 + movups 80-112(%rcx),%xmm1 + nop + aesdec %xmm0,%xmm2 + aesdec %xmm0,%xmm3 + aesdec %xmm0,%xmm4 + aesdec %xmm0,%xmm5 + aesdec %xmm0,%xmm6 + aesdec %xmm0,%xmm7 + aesdec %xmm0,%xmm8 + aesdec %xmm0,%xmm9 + movups 96-112(%rcx),%xmm0 + nop + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + aesdec %xmm1,%xmm4 + aesdec %xmm1,%xmm5 + aesdec %xmm1,%xmm6 + aesdec %xmm1,%xmm7 + aesdec %xmm1,%xmm8 + aesdec %xmm1,%xmm9 + movups 112-112(%rcx),%xmm1 + nop + aesdec %xmm0,%xmm2 + aesdec %xmm0,%xmm3 + aesdec %xmm0,%xmm4 + aesdec %xmm0,%xmm5 + aesdec %xmm0,%xmm6 + aesdec %xmm0,%xmm7 + aesdec %xmm0,%xmm8 + aesdec %xmm0,%xmm9 + movups 128-112(%rcx),%xmm0 + nop + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + aesdec %xmm1,%xmm4 + aesdec %xmm1,%xmm5 + aesdec %xmm1,%xmm6 + aesdec %xmm1,%xmm7 + aesdec %xmm1,%xmm8 + aesdec %xmm1,%xmm9 + movups 144-112(%rcx),%xmm1 + cmpl $11,%eax + aesdec %xmm0,%xmm2 + aesdec %xmm0,%xmm3 + aesdec %xmm0,%xmm4 + aesdec %xmm0,%xmm5 + aesdec %xmm0,%xmm6 + aesdec %xmm0,%xmm7 + aesdec %xmm0,%xmm8 + 
aesdec %xmm0,%xmm9 + movups 160-112(%rcx),%xmm0 + jb .Lcbc_dec_done + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + aesdec %xmm1,%xmm4 + aesdec %xmm1,%xmm5 + aesdec %xmm1,%xmm6 + aesdec %xmm1,%xmm7 + aesdec %xmm1,%xmm8 + aesdec %xmm1,%xmm9 + movups 176-112(%rcx),%xmm1 + nop + aesdec %xmm0,%xmm2 + aesdec %xmm0,%xmm3 + aesdec %xmm0,%xmm4 + aesdec %xmm0,%xmm5 + aesdec %xmm0,%xmm6 + aesdec %xmm0,%xmm7 + aesdec %xmm0,%xmm8 + aesdec %xmm0,%xmm9 + movups 192-112(%rcx),%xmm0 + je .Lcbc_dec_done + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + aesdec %xmm1,%xmm4 + aesdec %xmm1,%xmm5 + aesdec %xmm1,%xmm6 + aesdec %xmm1,%xmm7 + aesdec %xmm1,%xmm8 + aesdec %xmm1,%xmm9 + movups 208-112(%rcx),%xmm1 + nop + aesdec %xmm0,%xmm2 + aesdec %xmm0,%xmm3 + aesdec %xmm0,%xmm4 + aesdec %xmm0,%xmm5 + aesdec %xmm0,%xmm6 + aesdec %xmm0,%xmm7 + aesdec %xmm0,%xmm8 + aesdec %xmm0,%xmm9 + movups 224-112(%rcx),%xmm0 + jmp .Lcbc_dec_done +.align 16 +.Lcbc_dec_done: + aesdec %xmm1,%xmm2 + aesdec %xmm1,%xmm3 + pxor %xmm0,%xmm10 + pxor %xmm0,%xmm11 + aesdec %xmm1,%xmm4 + aesdec %xmm1,%xmm5 + pxor %xmm0,%xmm12 + pxor %xmm0,%xmm13 + aesdec %xmm1,%xmm6 + aesdec %xmm1,%xmm7 + pxor %xmm0,%xmm14 + pxor %xmm0,%xmm15 + aesdec %xmm1,%xmm8 + aesdec %xmm1,%xmm9 + movdqu 80(%rdi),%xmm1 + + aesdeclast %xmm10,%xmm2 + movdqu 96(%rdi),%xmm10 + pxor %xmm0,%xmm1 + aesdeclast %xmm11,%xmm3 + pxor %xmm0,%xmm10 + movdqu 112(%rdi),%xmm0 + aesdeclast %xmm12,%xmm4 + leaq 128(%rdi),%rdi + movdqu 0(%rbp),%xmm11 + aesdeclast %xmm13,%xmm5 + aesdeclast %xmm14,%xmm6 + movdqu 16(%rbp),%xmm12 + movdqu 32(%rbp),%xmm13 + aesdeclast %xmm15,%xmm7 + aesdeclast %xmm1,%xmm8 + movdqu 48(%rbp),%xmm14 + movdqu 64(%rbp),%xmm15 + aesdeclast %xmm10,%xmm9 + movdqa %xmm0,%xmm10 + movdqu 80(%rbp),%xmm1 + movups -112(%rcx),%xmm0 + + movups %xmm2,(%rsi) + movdqa %xmm11,%xmm2 + movups %xmm3,16(%rsi) + movdqa %xmm12,%xmm3 + movups %xmm4,32(%rsi) + movdqa %xmm13,%xmm4 + movups %xmm5,48(%rsi) + movdqa %xmm14,%xmm5 + movups %xmm6,64(%rsi) + movdqa %xmm15,%xmm6 + 
movups %xmm7,80(%rsi) + movdqa %xmm1,%xmm7 + movups %xmm8,96(%rsi) + leaq 112(%rsi),%rsi + + subq $0x80,%rdx + ja .Lcbc_dec_loop8 + + movaps %xmm9,%xmm2 + leaq -112(%rcx),%rcx + addq $0x70,%rdx + jle .Lcbc_dec_clear_tail_collected + movups %xmm9,(%rsi) + leaq 16(%rsi),%rsi + cmpq $0x50,%rdx + jbe .Lcbc_dec_tail + + movaps %xmm11,%xmm2 +.Lcbc_dec_six_or_seven: + cmpq $0x60,%rdx + ja .Lcbc_dec_seven + + movaps %xmm7,%xmm8 + call _aesni_decrypt6 + pxor %xmm10,%xmm2 + movaps %xmm8,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm3,%xmm3 + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm4,%xmm4 + pxor %xmm14,%xmm6 + movdqu %xmm5,48(%rsi) + pxor %xmm5,%xmm5 + pxor %xmm15,%xmm7 + movdqu %xmm6,64(%rsi) + pxor %xmm6,%xmm6 + leaq 80(%rsi),%rsi + movdqa %xmm7,%xmm2 + pxor %xmm7,%xmm7 + jmp .Lcbc_dec_tail_collected + +.align 16 +.Lcbc_dec_seven: + movups 96(%rdi),%xmm8 + xorps %xmm9,%xmm9 + call _aesni_decrypt8 + movups 80(%rdi),%xmm9 + pxor %xmm10,%xmm2 + movups 96(%rdi),%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm3,%xmm3 + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm4,%xmm4 + pxor %xmm14,%xmm6 + movdqu %xmm5,48(%rsi) + pxor %xmm5,%xmm5 + pxor %xmm15,%xmm7 + movdqu %xmm6,64(%rsi) + pxor %xmm6,%xmm6 + pxor %xmm9,%xmm8 + movdqu %xmm7,80(%rsi) + pxor %xmm7,%xmm7 + leaq 96(%rsi),%rsi + movdqa %xmm8,%xmm2 + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 + jmp .Lcbc_dec_tail_collected + +.Lcbc_dec_tail: + movups (%rdi),%xmm2 + subq $0x10,%rdx + jbe .Lcbc_dec_one + + movups 16(%rdi),%xmm3 + movaps %xmm2,%xmm11 + subq $0x10,%rdx + jbe .Lcbc_dec_two + + movups 32(%rdi),%xmm4 + movaps %xmm3,%xmm12 + subq $0x10,%rdx + jbe .Lcbc_dec_three + + movups 48(%rdi),%xmm5 + movaps %xmm4,%xmm13 + subq $0x10,%rdx + jbe .Lcbc_dec_four + + movups 64(%rdi),%xmm6 + movaps %xmm5,%xmm14 + movaps %xmm6,%xmm15 + xorps %xmm7,%xmm7 + call _aesni_decrypt6 + pxor %xmm10,%xmm2 + movaps 
%xmm15,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm3,%xmm3 + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm4,%xmm4 + pxor %xmm14,%xmm6 + movdqu %xmm5,48(%rsi) + pxor %xmm5,%xmm5 + leaq 64(%rsi),%rsi + movdqa %xmm6,%xmm2 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + subq $0x10,%rdx + jmp .Lcbc_dec_tail_collected + +.align 16 +.Lcbc_dec_one: + movaps %xmm2,%xmm11 + movups (%rcx),%xmm0 + movups 16(%rcx),%xmm1 + leaq 32(%rcx),%rcx + xorps %xmm0,%xmm2 +.Loop_dec1_8: + aesdec %xmm1,%xmm2 + decl %eax + movups (%rcx),%xmm1 + leaq 16(%rcx),%rcx + jnz .Loop_dec1_8 + aesdeclast %xmm1,%xmm2 + xorps %xmm10,%xmm2 + movaps %xmm11,%xmm10 + jmp .Lcbc_dec_tail_collected +.align 16 +.Lcbc_dec_two: + movaps %xmm3,%xmm12 + call _aesni_decrypt2 + pxor %xmm10,%xmm2 + movaps %xmm12,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + movdqa %xmm3,%xmm2 + pxor %xmm3,%xmm3 + leaq 16(%rsi),%rsi + jmp .Lcbc_dec_tail_collected +.align 16 +.Lcbc_dec_three: + movaps %xmm4,%xmm13 + call _aesni_decrypt3 + pxor %xmm10,%xmm2 + movaps %xmm13,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm3,%xmm3 + movdqa %xmm4,%xmm2 + pxor %xmm4,%xmm4 + leaq 32(%rsi),%rsi + jmp .Lcbc_dec_tail_collected +.align 16 +.Lcbc_dec_four: + movaps %xmm5,%xmm14 + call _aesni_decrypt4 + pxor %xmm10,%xmm2 + movaps %xmm14,%xmm10 + pxor %xmm11,%xmm3 + movdqu %xmm2,(%rsi) + pxor %xmm12,%xmm4 + movdqu %xmm3,16(%rsi) + pxor %xmm3,%xmm3 + pxor %xmm13,%xmm5 + movdqu %xmm4,32(%rsi) + pxor %xmm4,%xmm4 + movdqa %xmm5,%xmm2 + pxor %xmm5,%xmm5 + leaq 48(%rsi),%rsi + jmp .Lcbc_dec_tail_collected + +.align 16 +.Lcbc_dec_clear_tail_collected: + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + pxor %xmm8,%xmm8 + pxor %xmm9,%xmm9 +.Lcbc_dec_tail_collected: + movups %xmm10,(%r8) + andq $15,%rdx + jnz .Lcbc_dec_tail_partial + movups %xmm2,(%rsi) + pxor %xmm2,%xmm2 + jmp .Lcbc_dec_ret 
+.align 16 +.Lcbc_dec_tail_partial: + movaps %xmm2,(%rsp) + pxor %xmm2,%xmm2 + movq $16,%rcx + movq %rsi,%rdi + subq %rdx,%rcx + leaq (%rsp),%rsi +.long 0x9066A4F3 + movdqa %xmm2,(%rsp) + +.Lcbc_dec_ret: + xorps %xmm0,%xmm0 + pxor %xmm1,%xmm1 + movq -8(%r11),%rbp +.cfi_restore %rbp + leaq (%r11),%rsp +.cfi_def_cfa_register %rsp +.Lcbc_ret: + ret +.cfi_endproc +.size aes_hw_cbc_encrypt,.-aes_hw_cbc_encrypt +.globl aes_hw_encrypt_key_to_decrypt_key +.hidden aes_hw_encrypt_key_to_decrypt_key +.type aes_hw_encrypt_key_to_decrypt_key,@function +.align 16 +aes_hw_encrypt_key_to_decrypt_key: +.cfi_startproc +_CET_ENDBR + + movl 240(%rdi),%esi + shll $4,%esi + + leaq 16(%rdi,%rsi,1),%rdx + + movups (%rdi),%xmm0 + movups (%rdx),%xmm1 + movups %xmm0,(%rdx) + movups %xmm1,(%rdi) + leaq 16(%rdi),%rdi + leaq -16(%rdx),%rdx + +.Ldec_key_inverse: + movups (%rdi),%xmm0 + movups (%rdx),%xmm1 + aesimc %xmm0,%xmm0 + aesimc %xmm1,%xmm1 + leaq 16(%rdi),%rdi + leaq -16(%rdx),%rdx + movups %xmm0,16(%rdx) + movups %xmm1,-16(%rdi) + cmpq %rdi,%rdx + ja .Ldec_key_inverse + + movups (%rdi),%xmm0 + aesimc %xmm0,%xmm0 + pxor %xmm1,%xmm1 + movups %xmm0,(%rdx) + pxor %xmm0,%xmm0 + ret +.cfi_endproc +.size aes_hw_encrypt_key_to_decrypt_key,.-aes_hw_encrypt_key_to_decrypt_key +.globl aes_hw_set_encrypt_key_base +.hidden aes_hw_set_encrypt_key_base +.type aes_hw_set_encrypt_key_base,@function +.align 16 +aes_hw_set_encrypt_key_base: +.cfi_startproc + +_CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST + movb $1,BORINGSSL_function_hit+3(%rip) +#endif + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movups (%rdi),%xmm0 + xorps %xmm4,%xmm4 + leaq 16(%rdx),%rax + cmpl $256,%esi + je .L14rounds + cmpl $192,%esi + je .L12rounds + cmpl $128,%esi + jne .Lbad_keybits + +.L10rounds: + movl $9,%esi + + movups %xmm0,(%rdx) + aeskeygenassist $0x1,%xmm0,%xmm1 + call .Lkey_expansion_128_cold + aeskeygenassist $0x2,%xmm0,%xmm1 + call .Lkey_expansion_128 + aeskeygenassist $0x4,%xmm0,%xmm1 + call .Lkey_expansion_128 + 
aeskeygenassist $0x8,%xmm0,%xmm1 + call .Lkey_expansion_128 + aeskeygenassist $0x10,%xmm0,%xmm1 + call .Lkey_expansion_128 + aeskeygenassist $0x20,%xmm0,%xmm1 + call .Lkey_expansion_128 + aeskeygenassist $0x40,%xmm0,%xmm1 + call .Lkey_expansion_128 + aeskeygenassist $0x80,%xmm0,%xmm1 + call .Lkey_expansion_128 + aeskeygenassist $0x1b,%xmm0,%xmm1 + call .Lkey_expansion_128 + aeskeygenassist $0x36,%xmm0,%xmm1 + call .Lkey_expansion_128 + movups %xmm0,(%rax) + movl %esi,80(%rax) + xorl %eax,%eax + jmp .Lenc_key_ret + +.align 16 +.L12rounds: + movq 16(%rdi),%xmm2 + movl $11,%esi + + movups %xmm0,(%rdx) + aeskeygenassist $0x1,%xmm2,%xmm1 + call .Lkey_expansion_192a_cold + aeskeygenassist $0x2,%xmm2,%xmm1 + call .Lkey_expansion_192b + aeskeygenassist $0x4,%xmm2,%xmm1 + call .Lkey_expansion_192a + aeskeygenassist $0x8,%xmm2,%xmm1 + call .Lkey_expansion_192b + aeskeygenassist $0x10,%xmm2,%xmm1 + call .Lkey_expansion_192a + aeskeygenassist $0x20,%xmm2,%xmm1 + call .Lkey_expansion_192b + aeskeygenassist $0x40,%xmm2,%xmm1 + call .Lkey_expansion_192a + aeskeygenassist $0x80,%xmm2,%xmm1 + call .Lkey_expansion_192b + movups %xmm0,(%rax) + movl %esi,48(%rax) + xorq %rax,%rax + jmp .Lenc_key_ret + +.align 16 +.L14rounds: + movups 16(%rdi),%xmm2 + movl $13,%esi + leaq 16(%rax),%rax + + movups %xmm0,(%rdx) + movups %xmm2,16(%rdx) + aeskeygenassist $0x1,%xmm2,%xmm1 + call .Lkey_expansion_256a_cold + aeskeygenassist $0x1,%xmm0,%xmm1 + call .Lkey_expansion_256b + aeskeygenassist $0x2,%xmm2,%xmm1 + call .Lkey_expansion_256a + aeskeygenassist $0x2,%xmm0,%xmm1 + call .Lkey_expansion_256b + aeskeygenassist $0x4,%xmm2,%xmm1 + call .Lkey_expansion_256a + aeskeygenassist $0x4,%xmm0,%xmm1 + call .Lkey_expansion_256b + aeskeygenassist $0x8,%xmm2,%xmm1 + call .Lkey_expansion_256a + aeskeygenassist $0x8,%xmm0,%xmm1 + call .Lkey_expansion_256b + aeskeygenassist $0x10,%xmm2,%xmm1 + call .Lkey_expansion_256a + aeskeygenassist $0x10,%xmm0,%xmm1 + call .Lkey_expansion_256b + aeskeygenassist 
$0x20,%xmm2,%xmm1 + call .Lkey_expansion_256a + aeskeygenassist $0x20,%xmm0,%xmm1 + call .Lkey_expansion_256b + aeskeygenassist $0x40,%xmm2,%xmm1 + call .Lkey_expansion_256a + movups %xmm0,(%rax) + movl %esi,16(%rax) + xorq %rax,%rax + jmp .Lenc_key_ret + +.align 16 +.Lbad_keybits: + movq $-2,%rax +.Lenc_key_ret: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + addq $8,%rsp +.cfi_adjust_cfa_offset -8 + ret +.cfi_endproc + + +.align 16 +.Lkey_expansion_128: +.cfi_startproc + movups %xmm0,(%rax) + leaq 16(%rax),%rax +.Lkey_expansion_128_cold: + shufps $16,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $255,%xmm1,%xmm1 + xorps %xmm1,%xmm0 + ret +.cfi_endproc + +.align 16 +.Lkey_expansion_192a: +.cfi_startproc + movups %xmm0,(%rax) + leaq 16(%rax),%rax +.Lkey_expansion_192a_cold: + movaps %xmm2,%xmm5 +.Lkey_expansion_192b_warm: + shufps $16,%xmm0,%xmm4 + movdqa %xmm2,%xmm3 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + pslldq $4,%xmm3 + xorps %xmm4,%xmm0 + pshufd $85,%xmm1,%xmm1 + pxor %xmm3,%xmm2 + pxor %xmm1,%xmm0 + pshufd $255,%xmm0,%xmm3 + pxor %xmm3,%xmm2 + ret +.cfi_endproc + +.align 16 +.Lkey_expansion_192b: +.cfi_startproc + movaps %xmm0,%xmm3 + shufps $68,%xmm0,%xmm5 + movups %xmm5,(%rax) + shufps $78,%xmm2,%xmm3 + movups %xmm3,16(%rax) + leaq 32(%rax),%rax + jmp .Lkey_expansion_192b_warm +.cfi_endproc + +.align 16 +.Lkey_expansion_256a: +.cfi_startproc + movups %xmm2,(%rax) + leaq 16(%rax),%rax +.Lkey_expansion_256a_cold: + shufps $16,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $140,%xmm0,%xmm4 + xorps %xmm4,%xmm0 + shufps $255,%xmm1,%xmm1 + xorps %xmm1,%xmm0 + ret +.cfi_endproc + +.align 16 +.Lkey_expansion_256b: +.cfi_startproc + movups %xmm0,(%rax) + leaq 16(%rax),%rax + + shufps $16,%xmm2,%xmm4 + xorps %xmm4,%xmm2 + shufps $140,%xmm2,%xmm4 + xorps %xmm4,%xmm2 + shufps $170,%xmm1,%xmm1 + xorps %xmm1,%xmm2 + ret +.cfi_endproc +.size 
aes_hw_set_encrypt_key_base,.-aes_hw_set_encrypt_key_base + +.globl aes_hw_set_encrypt_key_alt +.hidden aes_hw_set_encrypt_key_alt +.type aes_hw_set_encrypt_key_alt,@function +.align 16 +aes_hw_set_encrypt_key_alt: +.cfi_startproc + +_CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST + movb $1,BORINGSSL_function_hit+3(%rip) +#endif + subq $8,%rsp +.cfi_adjust_cfa_offset 8 + + + movups (%rdi),%xmm0 + xorps %xmm4,%xmm4 + leaq 16(%rdx),%rax + cmpl $256,%esi + je .L14rounds_alt + cmpl $192,%esi + je .L12rounds_alt + cmpl $128,%esi + jne .Lbad_keybits_alt + + movl $9,%esi + movdqa .Lkey_rotate(%rip),%xmm5 + movl $8,%r10d + movdqa .Lkey_rcon1(%rip),%xmm4 + movdqa %xmm0,%xmm2 + movdqu %xmm0,(%rdx) + jmp .Loop_key128 + +.align 16 +.Loop_key128: + pshufb %xmm5,%xmm0 + aesenclast %xmm4,%xmm0 + pslld $1,%xmm4 + leaq 16(%rax),%rax + + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,-16(%rax) + movdqa %xmm0,%xmm2 + + decl %r10d + jnz .Loop_key128 + + movdqa .Lkey_rcon1b(%rip),%xmm4 + + pshufb %xmm5,%xmm0 + aesenclast %xmm4,%xmm0 + pslld $1,%xmm4 + + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,(%rax) + + movdqa %xmm0,%xmm2 + pshufb %xmm5,%xmm0 + aesenclast %xmm4,%xmm0 + + movdqa %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm2,%xmm3 + pslldq $4,%xmm2 + pxor %xmm3,%xmm2 + + pxor %xmm2,%xmm0 + movdqu %xmm0,16(%rax) + + movl %esi,96(%rax) + xorl %eax,%eax + jmp .Lenc_key_ret_alt + +.align 16 +.L12rounds_alt: + movq 16(%rdi),%xmm2 + movl $11,%esi + movdqa .Lkey_rotate192(%rip),%xmm5 + movdqa .Lkey_rcon1(%rip),%xmm4 + movl $8,%r10d + movdqu %xmm0,(%rdx) + jmp .Loop_key192 + +.align 16 +.Loop_key192: + movq %xmm2,0(%rax) + movdqa %xmm2,%xmm1 + pshufb %xmm5,%xmm2 + aesenclast %xmm4,%xmm2 + pslld $1,%xmm4 + leaq 24(%rax),%rax + 
+ movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + + pshufd $0xff,%xmm0,%xmm3 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + + pxor %xmm2,%xmm0 + pxor %xmm3,%xmm2 + movdqu %xmm0,-16(%rax) + + decl %r10d + jnz .Loop_key192 + + movl %esi,32(%rax) + xorl %eax,%eax + jmp .Lenc_key_ret_alt + +.align 16 +.L14rounds_alt: + movups 16(%rdi),%xmm2 + movl $13,%esi + leaq 16(%rax),%rax + movdqa .Lkey_rotate(%rip),%xmm5 + movdqa .Lkey_rcon1(%rip),%xmm4 + movl $7,%r10d + movdqu %xmm0,0(%rdx) + movdqa %xmm2,%xmm1 + movdqu %xmm2,16(%rdx) + jmp .Loop_key256 + +.align 16 +.Loop_key256: + pshufb %xmm5,%xmm2 + aesenclast %xmm4,%xmm2 + + movdqa %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm0,%xmm3 + pslldq $4,%xmm0 + pxor %xmm3,%xmm0 + pslld $1,%xmm4 + + pxor %xmm2,%xmm0 + movdqu %xmm0,(%rax) + + decl %r10d + jz .Ldone_key256 + + pshufd $0xff,%xmm0,%xmm2 + pxor %xmm3,%xmm3 + aesenclast %xmm3,%xmm2 + + movdqa %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm1,%xmm3 + pslldq $4,%xmm1 + pxor %xmm3,%xmm1 + + pxor %xmm1,%xmm2 + movdqu %xmm2,16(%rax) + leaq 32(%rax),%rax + movdqa %xmm2,%xmm1 + + jmp .Loop_key256 + +.Ldone_key256: + movl %esi,16(%rax) + xorl %eax,%eax + jmp .Lenc_key_ret_alt + +.align 16 +.Lbad_keybits_alt: + movq $-2,%rax +.Lenc_key_ret_alt: + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + addq $8,%rsp +.cfi_adjust_cfa_offset -8 + ret +.cfi_endproc + +.size aes_hw_set_encrypt_key_alt,.-aes_hw_set_encrypt_key_alt +.section .rodata +.align 64 +.Lbswap_mask: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.Lincrement32: +.long 6,6,6,0 +.Lincrement64: +.long 1,0,0,0 +.Lxts_magic: +.long 0x87,0,1,0 +.Lincrement1: +.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +.Lkey_rotate: +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d +.Lkey_rotate192: +.long 
0x04070605,0x04070605,0x04070605,0x04070605 +.Lkey_rcon1: +.long 1,1,1,1 +.Lkey_rcon1b: +.long 0x1b,0x1b,0x1b,0x1b + +.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 64 +.text +#endif diff --git a/third_party/boringssl/gen/bcm/aesni-x86_64-win.asm b/third_party/boringssl/gen/bcm/aesni-x86_64-win.asm new file mode 100644 index 00000000..e7a8613a --- /dev/null +++ b/third_party/boringssl/gen/bcm/aesni-x86_64-win.asm @@ -0,0 +1,2730 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_internal_x86_64_win_asm.inc" +%endif +section .text code align=64 + +global aes_hw_encrypt + +ALIGN 16 +aes_hw_encrypt: + +_CET_ENDBR +%ifdef BORINGSSL_DISPATCH_TEST +EXTERN BORINGSSL_function_hit + mov BYTE[((BORINGSSL_function_hit+1))],1 +%endif + movups xmm2,XMMWORD[rcx] + mov eax,DWORD[240+r8] + movups xmm0,XMMWORD[r8] + movups xmm1,XMMWORD[16+r8] + lea r8,[32+r8] + xorps xmm2,xmm0 +$L$oop_enc1_1: + aesenc xmm2,xmm1 + dec eax + movups xmm1,XMMWORD[r8] + lea r8,[16+r8] + jnz NEAR $L$oop_enc1_1 + aesenclast xmm2,xmm1 + pxor xmm0,xmm0 + pxor xmm1,xmm1 + movups XMMWORD[rdx],xmm2 + pxor xmm2,xmm2 + ret + + + +global aes_hw_decrypt + +ALIGN 16 +aes_hw_decrypt: + +_CET_ENDBR + movups xmm2,XMMWORD[rcx] + mov eax,DWORD[240+r8] + movups xmm0,XMMWORD[r8] + movups xmm1,XMMWORD[16+r8] + lea r8,[32+r8] + xorps xmm2,xmm0 +$L$oop_dec1_2: + aesdec xmm2,xmm1 + dec eax + movups xmm1,XMMWORD[r8] + lea r8,[16+r8] + jnz NEAR $L$oop_dec1_2 + aesdeclast xmm2,xmm1 + pxor xmm0,xmm0 + pxor xmm1,xmm1 + movups XMMWORD[rdx],xmm2 + pxor xmm2,xmm2 + ret + + + +ALIGN 16 +_aesni_encrypt2: + + movups xmm0,XMMWORD[rcx] + shl 
eax,4 + movups xmm1,XMMWORD[16+rcx] + xorps xmm2,xmm0 + xorps xmm3,xmm0 + movups xmm0,XMMWORD[32+rcx] + lea rcx,[32+rax*1+rcx] + neg rax + add rax,16 + +$L$enc_loop2: + aesenc xmm2,xmm1 + aesenc xmm3,xmm1 + movups xmm1,XMMWORD[rax*1+rcx] + add rax,32 + aesenc xmm2,xmm0 + aesenc xmm3,xmm0 + movups xmm0,XMMWORD[((-16))+rax*1+rcx] + jnz NEAR $L$enc_loop2 + + aesenc xmm2,xmm1 + aesenc xmm3,xmm1 + aesenclast xmm2,xmm0 + aesenclast xmm3,xmm0 + ret + + + +ALIGN 16 +_aesni_decrypt2: + + movups xmm0,XMMWORD[rcx] + shl eax,4 + movups xmm1,XMMWORD[16+rcx] + xorps xmm2,xmm0 + xorps xmm3,xmm0 + movups xmm0,XMMWORD[32+rcx] + lea rcx,[32+rax*1+rcx] + neg rax + add rax,16 + +$L$dec_loop2: + aesdec xmm2,xmm1 + aesdec xmm3,xmm1 + movups xmm1,XMMWORD[rax*1+rcx] + add rax,32 + aesdec xmm2,xmm0 + aesdec xmm3,xmm0 + movups xmm0,XMMWORD[((-16))+rax*1+rcx] + jnz NEAR $L$dec_loop2 + + aesdec xmm2,xmm1 + aesdec xmm3,xmm1 + aesdeclast xmm2,xmm0 + aesdeclast xmm3,xmm0 + ret + + + +ALIGN 16 +_aesni_encrypt3: + + movups xmm0,XMMWORD[rcx] + shl eax,4 + movups xmm1,XMMWORD[16+rcx] + xorps xmm2,xmm0 + xorps xmm3,xmm0 + xorps xmm4,xmm0 + movups xmm0,XMMWORD[32+rcx] + lea rcx,[32+rax*1+rcx] + neg rax + add rax,16 + +$L$enc_loop3: + aesenc xmm2,xmm1 + aesenc xmm3,xmm1 + aesenc xmm4,xmm1 + movups xmm1,XMMWORD[rax*1+rcx] + add rax,32 + aesenc xmm2,xmm0 + aesenc xmm3,xmm0 + aesenc xmm4,xmm0 + movups xmm0,XMMWORD[((-16))+rax*1+rcx] + jnz NEAR $L$enc_loop3 + + aesenc xmm2,xmm1 + aesenc xmm3,xmm1 + aesenc xmm4,xmm1 + aesenclast xmm2,xmm0 + aesenclast xmm3,xmm0 + aesenclast xmm4,xmm0 + ret + + + +ALIGN 16 +_aesni_decrypt3: + + movups xmm0,XMMWORD[rcx] + shl eax,4 + movups xmm1,XMMWORD[16+rcx] + xorps xmm2,xmm0 + xorps xmm3,xmm0 + xorps xmm4,xmm0 + movups xmm0,XMMWORD[32+rcx] + lea rcx,[32+rax*1+rcx] + neg rax + add rax,16 + +$L$dec_loop3: + aesdec xmm2,xmm1 + aesdec xmm3,xmm1 + aesdec xmm4,xmm1 + movups xmm1,XMMWORD[rax*1+rcx] + add rax,32 + aesdec xmm2,xmm0 + aesdec xmm3,xmm0 + aesdec xmm4,xmm0 + movups 
xmm0,XMMWORD[((-16))+rax*1+rcx] + jnz NEAR $L$dec_loop3 + + aesdec xmm2,xmm1 + aesdec xmm3,xmm1 + aesdec xmm4,xmm1 + aesdeclast xmm2,xmm0 + aesdeclast xmm3,xmm0 + aesdeclast xmm4,xmm0 + ret + + + +ALIGN 16 +_aesni_encrypt4: + + movups xmm0,XMMWORD[rcx] + shl eax,4 + movups xmm1,XMMWORD[16+rcx] + xorps xmm2,xmm0 + xorps xmm3,xmm0 + xorps xmm4,xmm0 + xorps xmm5,xmm0 + movups xmm0,XMMWORD[32+rcx] + lea rcx,[32+rax*1+rcx] + neg rax + DB 0x0f,0x1f,0x00 + add rax,16 + +$L$enc_loop4: + aesenc xmm2,xmm1 + aesenc xmm3,xmm1 + aesenc xmm4,xmm1 + aesenc xmm5,xmm1 + movups xmm1,XMMWORD[rax*1+rcx] + add rax,32 + aesenc xmm2,xmm0 + aesenc xmm3,xmm0 + aesenc xmm4,xmm0 + aesenc xmm5,xmm0 + movups xmm0,XMMWORD[((-16))+rax*1+rcx] + jnz NEAR $L$enc_loop4 + + aesenc xmm2,xmm1 + aesenc xmm3,xmm1 + aesenc xmm4,xmm1 + aesenc xmm5,xmm1 + aesenclast xmm2,xmm0 + aesenclast xmm3,xmm0 + aesenclast xmm4,xmm0 + aesenclast xmm5,xmm0 + ret + + + +ALIGN 16 +_aesni_decrypt4: + + movups xmm0,XMMWORD[rcx] + shl eax,4 + movups xmm1,XMMWORD[16+rcx] + xorps xmm2,xmm0 + xorps xmm3,xmm0 + xorps xmm4,xmm0 + xorps xmm5,xmm0 + movups xmm0,XMMWORD[32+rcx] + lea rcx,[32+rax*1+rcx] + neg rax + DB 0x0f,0x1f,0x00 + add rax,16 + +$L$dec_loop4: + aesdec xmm2,xmm1 + aesdec xmm3,xmm1 + aesdec xmm4,xmm1 + aesdec xmm5,xmm1 + movups xmm1,XMMWORD[rax*1+rcx] + add rax,32 + aesdec xmm2,xmm0 + aesdec xmm3,xmm0 + aesdec xmm4,xmm0 + aesdec xmm5,xmm0 + movups xmm0,XMMWORD[((-16))+rax*1+rcx] + jnz NEAR $L$dec_loop4 + + aesdec xmm2,xmm1 + aesdec xmm3,xmm1 + aesdec xmm4,xmm1 + aesdec xmm5,xmm1 + aesdeclast xmm2,xmm0 + aesdeclast xmm3,xmm0 + aesdeclast xmm4,xmm0 + aesdeclast xmm5,xmm0 + ret + + + +ALIGN 16 +_aesni_encrypt6: + + movups xmm0,XMMWORD[rcx] + shl eax,4 + movups xmm1,XMMWORD[16+rcx] + xorps xmm2,xmm0 + pxor xmm3,xmm0 + pxor xmm4,xmm0 + aesenc xmm2,xmm1 + lea rcx,[32+rax*1+rcx] + neg rax + aesenc xmm3,xmm1 + pxor xmm5,xmm0 + pxor xmm6,xmm0 + aesenc xmm4,xmm1 + pxor xmm7,xmm0 + movups xmm0,XMMWORD[rax*1+rcx] + add rax,16 + 
jmp NEAR $L$enc_loop6_enter +ALIGN 16 +$L$enc_loop6: + aesenc xmm2,xmm1 + aesenc xmm3,xmm1 + aesenc xmm4,xmm1 +$L$enc_loop6_enter: + aesenc xmm5,xmm1 + aesenc xmm6,xmm1 + aesenc xmm7,xmm1 + movups xmm1,XMMWORD[rax*1+rcx] + add rax,32 + aesenc xmm2,xmm0 + aesenc xmm3,xmm0 + aesenc xmm4,xmm0 + aesenc xmm5,xmm0 + aesenc xmm6,xmm0 + aesenc xmm7,xmm0 + movups xmm0,XMMWORD[((-16))+rax*1+rcx] + jnz NEAR $L$enc_loop6 + + aesenc xmm2,xmm1 + aesenc xmm3,xmm1 + aesenc xmm4,xmm1 + aesenc xmm5,xmm1 + aesenc xmm6,xmm1 + aesenc xmm7,xmm1 + aesenclast xmm2,xmm0 + aesenclast xmm3,xmm0 + aesenclast xmm4,xmm0 + aesenclast xmm5,xmm0 + aesenclast xmm6,xmm0 + aesenclast xmm7,xmm0 + ret + + + +ALIGN 16 +_aesni_decrypt6: + + movups xmm0,XMMWORD[rcx] + shl eax,4 + movups xmm1,XMMWORD[16+rcx] + xorps xmm2,xmm0 + pxor xmm3,xmm0 + pxor xmm4,xmm0 + aesdec xmm2,xmm1 + lea rcx,[32+rax*1+rcx] + neg rax + aesdec xmm3,xmm1 + pxor xmm5,xmm0 + pxor xmm6,xmm0 + aesdec xmm4,xmm1 + pxor xmm7,xmm0 + movups xmm0,XMMWORD[rax*1+rcx] + add rax,16 + jmp NEAR $L$dec_loop6_enter +ALIGN 16 +$L$dec_loop6: + aesdec xmm2,xmm1 + aesdec xmm3,xmm1 + aesdec xmm4,xmm1 +$L$dec_loop6_enter: + aesdec xmm5,xmm1 + aesdec xmm6,xmm1 + aesdec xmm7,xmm1 + movups xmm1,XMMWORD[rax*1+rcx] + add rax,32 + aesdec xmm2,xmm0 + aesdec xmm3,xmm0 + aesdec xmm4,xmm0 + aesdec xmm5,xmm0 + aesdec xmm6,xmm0 + aesdec xmm7,xmm0 + movups xmm0,XMMWORD[((-16))+rax*1+rcx] + jnz NEAR $L$dec_loop6 + + aesdec xmm2,xmm1 + aesdec xmm3,xmm1 + aesdec xmm4,xmm1 + aesdec xmm5,xmm1 + aesdec xmm6,xmm1 + aesdec xmm7,xmm1 + aesdeclast xmm2,xmm0 + aesdeclast xmm3,xmm0 + aesdeclast xmm4,xmm0 + aesdeclast xmm5,xmm0 + aesdeclast xmm6,xmm0 + aesdeclast xmm7,xmm0 + ret + + + +ALIGN 16 +_aesni_encrypt8: + + movups xmm0,XMMWORD[rcx] + shl eax,4 + movups xmm1,XMMWORD[16+rcx] + xorps xmm2,xmm0 + xorps xmm3,xmm0 + pxor xmm4,xmm0 + pxor xmm5,xmm0 + pxor xmm6,xmm0 + lea rcx,[32+rax*1+rcx] + neg rax + aesenc xmm2,xmm1 + pxor xmm7,xmm0 + pxor xmm8,xmm0 + aesenc xmm3,xmm1 + pxor 
xmm9,xmm0 + movups xmm0,XMMWORD[rax*1+rcx] + add rax,16 + jmp NEAR $L$enc_loop8_inner +ALIGN 16 +$L$enc_loop8: + aesenc xmm2,xmm1 + aesenc xmm3,xmm1 +$L$enc_loop8_inner: + aesenc xmm4,xmm1 + aesenc xmm5,xmm1 + aesenc xmm6,xmm1 + aesenc xmm7,xmm1 + aesenc xmm8,xmm1 + aesenc xmm9,xmm1 +$L$enc_loop8_enter: + movups xmm1,XMMWORD[rax*1+rcx] + add rax,32 + aesenc xmm2,xmm0 + aesenc xmm3,xmm0 + aesenc xmm4,xmm0 + aesenc xmm5,xmm0 + aesenc xmm6,xmm0 + aesenc xmm7,xmm0 + aesenc xmm8,xmm0 + aesenc xmm9,xmm0 + movups xmm0,XMMWORD[((-16))+rax*1+rcx] + jnz NEAR $L$enc_loop8 + + aesenc xmm2,xmm1 + aesenc xmm3,xmm1 + aesenc xmm4,xmm1 + aesenc xmm5,xmm1 + aesenc xmm6,xmm1 + aesenc xmm7,xmm1 + aesenc xmm8,xmm1 + aesenc xmm9,xmm1 + aesenclast xmm2,xmm0 + aesenclast xmm3,xmm0 + aesenclast xmm4,xmm0 + aesenclast xmm5,xmm0 + aesenclast xmm6,xmm0 + aesenclast xmm7,xmm0 + aesenclast xmm8,xmm0 + aesenclast xmm9,xmm0 + ret + + + +ALIGN 16 +_aesni_decrypt8: + + movups xmm0,XMMWORD[rcx] + shl eax,4 + movups xmm1,XMMWORD[16+rcx] + xorps xmm2,xmm0 + xorps xmm3,xmm0 + pxor xmm4,xmm0 + pxor xmm5,xmm0 + pxor xmm6,xmm0 + lea rcx,[32+rax*1+rcx] + neg rax + aesdec xmm2,xmm1 + pxor xmm7,xmm0 + pxor xmm8,xmm0 + aesdec xmm3,xmm1 + pxor xmm9,xmm0 + movups xmm0,XMMWORD[rax*1+rcx] + add rax,16 + jmp NEAR $L$dec_loop8_inner +ALIGN 16 +$L$dec_loop8: + aesdec xmm2,xmm1 + aesdec xmm3,xmm1 +$L$dec_loop8_inner: + aesdec xmm4,xmm1 + aesdec xmm5,xmm1 + aesdec xmm6,xmm1 + aesdec xmm7,xmm1 + aesdec xmm8,xmm1 + aesdec xmm9,xmm1 +$L$dec_loop8_enter: + movups xmm1,XMMWORD[rax*1+rcx] + add rax,32 + aesdec xmm2,xmm0 + aesdec xmm3,xmm0 + aesdec xmm4,xmm0 + aesdec xmm5,xmm0 + aesdec xmm6,xmm0 + aesdec xmm7,xmm0 + aesdec xmm8,xmm0 + aesdec xmm9,xmm0 + movups xmm0,XMMWORD[((-16))+rax*1+rcx] + jnz NEAR $L$dec_loop8 + + aesdec xmm2,xmm1 + aesdec xmm3,xmm1 + aesdec xmm4,xmm1 + aesdec xmm5,xmm1 + aesdec xmm6,xmm1 + aesdec xmm7,xmm1 + aesdec xmm8,xmm1 + aesdec xmm9,xmm1 + aesdeclast xmm2,xmm0 + aesdeclast xmm3,xmm0 + aesdeclast 
xmm4,xmm0 + aesdeclast xmm5,xmm0 + aesdeclast xmm6,xmm0 + aesdeclast xmm7,xmm0 + aesdeclast xmm8,xmm0 + aesdeclast xmm9,xmm0 + ret + + +global aes_hw_ecb_encrypt + +ALIGN 16 +aes_hw_ecb_encrypt: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aes_hw_ecb_encrypt: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + +_CET_ENDBR + lea rsp,[((-88))+rsp] + movaps XMMWORD[rsp],xmm6 + movaps XMMWORD[16+rsp],xmm7 + movaps XMMWORD[32+rsp],xmm8 + movaps XMMWORD[48+rsp],xmm9 +$L$ecb_enc_body: + and rdx,-16 + jz NEAR $L$ecb_ret + + mov eax,DWORD[240+rcx] + movups xmm0,XMMWORD[rcx] + mov r11,rcx + mov r10d,eax + test r8d,r8d + jz NEAR $L$ecb_decrypt + + cmp rdx,0x80 + jb NEAR $L$ecb_enc_tail + + movdqu xmm2,XMMWORD[rdi] + movdqu xmm3,XMMWORD[16+rdi] + movdqu xmm4,XMMWORD[32+rdi] + movdqu xmm5,XMMWORD[48+rdi] + movdqu xmm6,XMMWORD[64+rdi] + movdqu xmm7,XMMWORD[80+rdi] + movdqu xmm8,XMMWORD[96+rdi] + movdqu xmm9,XMMWORD[112+rdi] + lea rdi,[128+rdi] + sub rdx,0x80 + jmp NEAR $L$ecb_enc_loop8_enter +ALIGN 16 +$L$ecb_enc_loop8: + movups XMMWORD[rsi],xmm2 + mov rcx,r11 + movdqu xmm2,XMMWORD[rdi] + mov eax,r10d + movups XMMWORD[16+rsi],xmm3 + movdqu xmm3,XMMWORD[16+rdi] + movups XMMWORD[32+rsi],xmm4 + movdqu xmm4,XMMWORD[32+rdi] + movups XMMWORD[48+rsi],xmm5 + movdqu xmm5,XMMWORD[48+rdi] + movups XMMWORD[64+rsi],xmm6 + movdqu xmm6,XMMWORD[64+rdi] + movups XMMWORD[80+rsi],xmm7 + movdqu xmm7,XMMWORD[80+rdi] + movups XMMWORD[96+rsi],xmm8 + movdqu xmm8,XMMWORD[96+rdi] + movups XMMWORD[112+rsi],xmm9 + lea rsi,[128+rsi] + movdqu xmm9,XMMWORD[112+rdi] + lea rdi,[128+rdi] +$L$ecb_enc_loop8_enter: + + call _aesni_encrypt8 + + sub rdx,0x80 + jnc NEAR $L$ecb_enc_loop8 + + movups XMMWORD[rsi],xmm2 + mov rcx,r11 + movups XMMWORD[16+rsi],xmm3 + mov eax,r10d + movups XMMWORD[32+rsi],xmm4 + movups XMMWORD[48+rsi],xmm5 + movups XMMWORD[64+rsi],xmm6 + movups XMMWORD[80+rsi],xmm7 + movups XMMWORD[96+rsi],xmm8 + movups 
XMMWORD[112+rsi],xmm9 + lea rsi,[128+rsi] + add rdx,0x80 + jz NEAR $L$ecb_ret + +$L$ecb_enc_tail: + movups xmm2,XMMWORD[rdi] + cmp rdx,0x20 + jb NEAR $L$ecb_enc_one + movups xmm3,XMMWORD[16+rdi] + je NEAR $L$ecb_enc_two + movups xmm4,XMMWORD[32+rdi] + cmp rdx,0x40 + jb NEAR $L$ecb_enc_three + movups xmm5,XMMWORD[48+rdi] + je NEAR $L$ecb_enc_four + movups xmm6,XMMWORD[64+rdi] + cmp rdx,0x60 + jb NEAR $L$ecb_enc_five + movups xmm7,XMMWORD[80+rdi] + je NEAR $L$ecb_enc_six + movdqu xmm8,XMMWORD[96+rdi] + xorps xmm9,xmm9 + call _aesni_encrypt8 + movups XMMWORD[rsi],xmm2 + movups XMMWORD[16+rsi],xmm3 + movups XMMWORD[32+rsi],xmm4 + movups XMMWORD[48+rsi],xmm5 + movups XMMWORD[64+rsi],xmm6 + movups XMMWORD[80+rsi],xmm7 + movups XMMWORD[96+rsi],xmm8 + jmp NEAR $L$ecb_ret +ALIGN 16 +$L$ecb_enc_one: + movups xmm0,XMMWORD[rcx] + movups xmm1,XMMWORD[16+rcx] + lea rcx,[32+rcx] + xorps xmm2,xmm0 +$L$oop_enc1_3: + aesenc xmm2,xmm1 + dec eax + movups xmm1,XMMWORD[rcx] + lea rcx,[16+rcx] + jnz NEAR $L$oop_enc1_3 + aesenclast xmm2,xmm1 + movups XMMWORD[rsi],xmm2 + jmp NEAR $L$ecb_ret +ALIGN 16 +$L$ecb_enc_two: + call _aesni_encrypt2 + movups XMMWORD[rsi],xmm2 + movups XMMWORD[16+rsi],xmm3 + jmp NEAR $L$ecb_ret +ALIGN 16 +$L$ecb_enc_three: + call _aesni_encrypt3 + movups XMMWORD[rsi],xmm2 + movups XMMWORD[16+rsi],xmm3 + movups XMMWORD[32+rsi],xmm4 + jmp NEAR $L$ecb_ret +ALIGN 16 +$L$ecb_enc_four: + call _aesni_encrypt4 + movups XMMWORD[rsi],xmm2 + movups XMMWORD[16+rsi],xmm3 + movups XMMWORD[32+rsi],xmm4 + movups XMMWORD[48+rsi],xmm5 + jmp NEAR $L$ecb_ret +ALIGN 16 +$L$ecb_enc_five: + xorps xmm7,xmm7 + call _aesni_encrypt6 + movups XMMWORD[rsi],xmm2 + movups XMMWORD[16+rsi],xmm3 + movups XMMWORD[32+rsi],xmm4 + movups XMMWORD[48+rsi],xmm5 + movups XMMWORD[64+rsi],xmm6 + jmp NEAR $L$ecb_ret +ALIGN 16 +$L$ecb_enc_six: + call _aesni_encrypt6 + movups XMMWORD[rsi],xmm2 + movups XMMWORD[16+rsi],xmm3 + movups XMMWORD[32+rsi],xmm4 + movups XMMWORD[48+rsi],xmm5 + movups XMMWORD[64+rsi],xmm6 + 
movups XMMWORD[80+rsi],xmm7 + jmp NEAR $L$ecb_ret + +ALIGN 16 +$L$ecb_decrypt: + cmp rdx,0x80 + jb NEAR $L$ecb_dec_tail + + movdqu xmm2,XMMWORD[rdi] + movdqu xmm3,XMMWORD[16+rdi] + movdqu xmm4,XMMWORD[32+rdi] + movdqu xmm5,XMMWORD[48+rdi] + movdqu xmm6,XMMWORD[64+rdi] + movdqu xmm7,XMMWORD[80+rdi] + movdqu xmm8,XMMWORD[96+rdi] + movdqu xmm9,XMMWORD[112+rdi] + lea rdi,[128+rdi] + sub rdx,0x80 + jmp NEAR $L$ecb_dec_loop8_enter +ALIGN 16 +$L$ecb_dec_loop8: + movups XMMWORD[rsi],xmm2 + mov rcx,r11 + movdqu xmm2,XMMWORD[rdi] + mov eax,r10d + movups XMMWORD[16+rsi],xmm3 + movdqu xmm3,XMMWORD[16+rdi] + movups XMMWORD[32+rsi],xmm4 + movdqu xmm4,XMMWORD[32+rdi] + movups XMMWORD[48+rsi],xmm5 + movdqu xmm5,XMMWORD[48+rdi] + movups XMMWORD[64+rsi],xmm6 + movdqu xmm6,XMMWORD[64+rdi] + movups XMMWORD[80+rsi],xmm7 + movdqu xmm7,XMMWORD[80+rdi] + movups XMMWORD[96+rsi],xmm8 + movdqu xmm8,XMMWORD[96+rdi] + movups XMMWORD[112+rsi],xmm9 + lea rsi,[128+rsi] + movdqu xmm9,XMMWORD[112+rdi] + lea rdi,[128+rdi] +$L$ecb_dec_loop8_enter: + + call _aesni_decrypt8 + + movups xmm0,XMMWORD[r11] + sub rdx,0x80 + jnc NEAR $L$ecb_dec_loop8 + + movups XMMWORD[rsi],xmm2 + pxor xmm2,xmm2 + mov rcx,r11 + movups XMMWORD[16+rsi],xmm3 + pxor xmm3,xmm3 + mov eax,r10d + movups XMMWORD[32+rsi],xmm4 + pxor xmm4,xmm4 + movups XMMWORD[48+rsi],xmm5 + pxor xmm5,xmm5 + movups XMMWORD[64+rsi],xmm6 + pxor xmm6,xmm6 + movups XMMWORD[80+rsi],xmm7 + pxor xmm7,xmm7 + movups XMMWORD[96+rsi],xmm8 + pxor xmm8,xmm8 + movups XMMWORD[112+rsi],xmm9 + pxor xmm9,xmm9 + lea rsi,[128+rsi] + add rdx,0x80 + jz NEAR $L$ecb_ret + +$L$ecb_dec_tail: + movups xmm2,XMMWORD[rdi] + cmp rdx,0x20 + jb NEAR $L$ecb_dec_one + movups xmm3,XMMWORD[16+rdi] + je NEAR $L$ecb_dec_two + movups xmm4,XMMWORD[32+rdi] + cmp rdx,0x40 + jb NEAR $L$ecb_dec_three + movups xmm5,XMMWORD[48+rdi] + je NEAR $L$ecb_dec_four + movups xmm6,XMMWORD[64+rdi] + cmp rdx,0x60 + jb NEAR $L$ecb_dec_five + movups xmm7,XMMWORD[80+rdi] + je NEAR $L$ecb_dec_six + movups 
xmm8,XMMWORD[96+rdi] + movups xmm0,XMMWORD[rcx] + xorps xmm9,xmm9 + call _aesni_decrypt8 + movups XMMWORD[rsi],xmm2 + pxor xmm2,xmm2 + movups XMMWORD[16+rsi],xmm3 + pxor xmm3,xmm3 + movups XMMWORD[32+rsi],xmm4 + pxor xmm4,xmm4 + movups XMMWORD[48+rsi],xmm5 + pxor xmm5,xmm5 + movups XMMWORD[64+rsi],xmm6 + pxor xmm6,xmm6 + movups XMMWORD[80+rsi],xmm7 + pxor xmm7,xmm7 + movups XMMWORD[96+rsi],xmm8 + pxor xmm8,xmm8 + pxor xmm9,xmm9 + jmp NEAR $L$ecb_ret +ALIGN 16 +$L$ecb_dec_one: + movups xmm0,XMMWORD[rcx] + movups xmm1,XMMWORD[16+rcx] + lea rcx,[32+rcx] + xorps xmm2,xmm0 +$L$oop_dec1_4: + aesdec xmm2,xmm1 + dec eax + movups xmm1,XMMWORD[rcx] + lea rcx,[16+rcx] + jnz NEAR $L$oop_dec1_4 + aesdeclast xmm2,xmm1 + movups XMMWORD[rsi],xmm2 + pxor xmm2,xmm2 + jmp NEAR $L$ecb_ret +ALIGN 16 +$L$ecb_dec_two: + call _aesni_decrypt2 + movups XMMWORD[rsi],xmm2 + pxor xmm2,xmm2 + movups XMMWORD[16+rsi],xmm3 + pxor xmm3,xmm3 + jmp NEAR $L$ecb_ret +ALIGN 16 +$L$ecb_dec_three: + call _aesni_decrypt3 + movups XMMWORD[rsi],xmm2 + pxor xmm2,xmm2 + movups XMMWORD[16+rsi],xmm3 + pxor xmm3,xmm3 + movups XMMWORD[32+rsi],xmm4 + pxor xmm4,xmm4 + jmp NEAR $L$ecb_ret +ALIGN 16 +$L$ecb_dec_four: + call _aesni_decrypt4 + movups XMMWORD[rsi],xmm2 + pxor xmm2,xmm2 + movups XMMWORD[16+rsi],xmm3 + pxor xmm3,xmm3 + movups XMMWORD[32+rsi],xmm4 + pxor xmm4,xmm4 + movups XMMWORD[48+rsi],xmm5 + pxor xmm5,xmm5 + jmp NEAR $L$ecb_ret +ALIGN 16 +$L$ecb_dec_five: + xorps xmm7,xmm7 + call _aesni_decrypt6 + movups XMMWORD[rsi],xmm2 + pxor xmm2,xmm2 + movups XMMWORD[16+rsi],xmm3 + pxor xmm3,xmm3 + movups XMMWORD[32+rsi],xmm4 + pxor xmm4,xmm4 + movups XMMWORD[48+rsi],xmm5 + pxor xmm5,xmm5 + movups XMMWORD[64+rsi],xmm6 + pxor xmm6,xmm6 + pxor xmm7,xmm7 + jmp NEAR $L$ecb_ret +ALIGN 16 +$L$ecb_dec_six: + call _aesni_decrypt6 + movups XMMWORD[rsi],xmm2 + pxor xmm2,xmm2 + movups XMMWORD[16+rsi],xmm3 + pxor xmm3,xmm3 + movups XMMWORD[32+rsi],xmm4 + pxor xmm4,xmm4 + movups XMMWORD[48+rsi],xmm5 + pxor xmm5,xmm5 + movups 
XMMWORD[64+rsi],xmm6 + pxor xmm6,xmm6 + movups XMMWORD[80+rsi],xmm7 + pxor xmm7,xmm7 + +$L$ecb_ret: + xorps xmm0,xmm0 + pxor xmm1,xmm1 + movaps xmm6,XMMWORD[rsp] + movaps XMMWORD[rsp],xmm0 + movaps xmm7,XMMWORD[16+rsp] + movaps XMMWORD[16+rsp],xmm0 + movaps xmm8,XMMWORD[32+rsp] + movaps XMMWORD[32+rsp],xmm0 + movaps xmm9,XMMWORD[48+rsp] + movaps XMMWORD[48+rsp],xmm0 + lea rsp,[88+rsp] +$L$ecb_enc_ret: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_aes_hw_ecb_encrypt: +global aes_hw_ctr32_encrypt_blocks + +ALIGN 16 +aes_hw_ctr32_encrypt_blocks: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aes_hw_ctr32_encrypt_blocks: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + +_CET_ENDBR +%ifdef BORINGSSL_DISPATCH_TEST + mov BYTE[BORINGSSL_function_hit],1 +%endif + cmp rdx,1 + jne NEAR $L$ctr32_bulk + + + + movups xmm2,XMMWORD[r8] + movups xmm3,XMMWORD[rdi] + mov edx,DWORD[240+rcx] + movups xmm0,XMMWORD[rcx] + movups xmm1,XMMWORD[16+rcx] + lea rcx,[32+rcx] + xorps xmm2,xmm0 +$L$oop_enc1_5: + aesenc xmm2,xmm1 + dec edx + movups xmm1,XMMWORD[rcx] + lea rcx,[16+rcx] + jnz NEAR $L$oop_enc1_5 + aesenclast xmm2,xmm1 + pxor xmm0,xmm0 + pxor xmm1,xmm1 + xorps xmm2,xmm3 + pxor xmm3,xmm3 + movups XMMWORD[rsi],xmm2 + xorps xmm2,xmm2 + jmp NEAR $L$ctr32_epilogue + +ALIGN 16 +$L$ctr32_bulk: + lea r11,[rsp] + + push rbp + + sub rsp,288 + and rsp,-16 + movaps XMMWORD[(-168)+r11],xmm6 + movaps XMMWORD[(-152)+r11],xmm7 + movaps XMMWORD[(-136)+r11],xmm8 + movaps XMMWORD[(-120)+r11],xmm9 + movaps XMMWORD[(-104)+r11],xmm10 + movaps XMMWORD[(-88)+r11],xmm11 + movaps XMMWORD[(-72)+r11],xmm12 + movaps XMMWORD[(-56)+r11],xmm13 + movaps XMMWORD[(-40)+r11],xmm14 + movaps XMMWORD[(-24)+r11],xmm15 +$L$ctr32_body: + + + + + movdqu xmm2,XMMWORD[r8] + movdqu xmm0,XMMWORD[rcx] + mov r8d,DWORD[12+r8] + pxor xmm2,xmm0 + mov ebp,DWORD[12+rcx] + movdqa XMMWORD[rsp],xmm2 + bswap r8d + movdqa 
xmm3,xmm2 + movdqa xmm4,xmm2 + movdqa xmm5,xmm2 + movdqa XMMWORD[64+rsp],xmm2 + movdqa XMMWORD[80+rsp],xmm2 + movdqa XMMWORD[96+rsp],xmm2 + mov r10,rdx + movdqa XMMWORD[112+rsp],xmm2 + + lea rax,[1+r8] + lea rdx,[2+r8] + bswap eax + bswap edx + xor eax,ebp + xor edx,ebp + pinsrd xmm3,eax,3 + lea rax,[3+r8] + movdqa XMMWORD[16+rsp],xmm3 + pinsrd xmm4,edx,3 + bswap eax + mov rdx,r10 + lea r10,[4+r8] + movdqa XMMWORD[32+rsp],xmm4 + xor eax,ebp + bswap r10d + pinsrd xmm5,eax,3 + xor r10d,ebp + movdqa XMMWORD[48+rsp],xmm5 + lea r9,[5+r8] + mov DWORD[((64+12))+rsp],r10d + bswap r9d + lea r10,[6+r8] + mov eax,DWORD[240+rcx] + xor r9d,ebp + bswap r10d + mov DWORD[((80+12))+rsp],r9d + xor r10d,ebp + lea r9,[7+r8] + mov DWORD[((96+12))+rsp],r10d + bswap r9d + xor r9d,ebp + mov DWORD[((112+12))+rsp],r9d + + movups xmm1,XMMWORD[16+rcx] + + movdqa xmm6,XMMWORD[64+rsp] + movdqa xmm7,XMMWORD[80+rsp] + + cmp rdx,8 + jb NEAR $L$ctr32_tail + + lea rcx,[128+rcx] + sub rdx,8 + jmp NEAR $L$ctr32_loop8 + +ALIGN 32 +$L$ctr32_loop8: + add r8d,8 + movdqa xmm8,XMMWORD[96+rsp] + aesenc xmm2,xmm1 + mov r9d,r8d + movdqa xmm9,XMMWORD[112+rsp] + aesenc xmm3,xmm1 + bswap r9d + movups xmm0,XMMWORD[((32-128))+rcx] + aesenc xmm4,xmm1 + xor r9d,ebp + nop + aesenc xmm5,xmm1 + mov DWORD[((0+12))+rsp],r9d + lea r9,[1+r8] + aesenc xmm6,xmm1 + aesenc xmm7,xmm1 + aesenc xmm8,xmm1 + aesenc xmm9,xmm1 + movups xmm1,XMMWORD[((48-128))+rcx] + bswap r9d + aesenc xmm2,xmm0 + aesenc xmm3,xmm0 + xor r9d,ebp + DB 0x66,0x90 + aesenc xmm4,xmm0 + aesenc xmm5,xmm0 + mov DWORD[((16+12))+rsp],r9d + lea r9,[2+r8] + aesenc xmm6,xmm0 + aesenc xmm7,xmm0 + aesenc xmm8,xmm0 + aesenc xmm9,xmm0 + movups xmm0,XMMWORD[((64-128))+rcx] + bswap r9d + aesenc xmm2,xmm1 + aesenc xmm3,xmm1 + xor r9d,ebp + DB 0x66,0x90 + aesenc xmm4,xmm1 + aesenc xmm5,xmm1 + mov DWORD[((32+12))+rsp],r9d + lea r9,[3+r8] + aesenc xmm6,xmm1 + aesenc xmm7,xmm1 + aesenc xmm8,xmm1 + aesenc xmm9,xmm1 + movups xmm1,XMMWORD[((80-128))+rcx] + bswap r9d + aesenc 
xmm2,xmm0 + aesenc xmm3,xmm0 + xor r9d,ebp + DB 0x66,0x90 + aesenc xmm4,xmm0 + aesenc xmm5,xmm0 + mov DWORD[((48+12))+rsp],r9d + lea r9,[4+r8] + aesenc xmm6,xmm0 + aesenc xmm7,xmm0 + aesenc xmm8,xmm0 + aesenc xmm9,xmm0 + movups xmm0,XMMWORD[((96-128))+rcx] + bswap r9d + aesenc xmm2,xmm1 + aesenc xmm3,xmm1 + xor r9d,ebp + DB 0x66,0x90 + aesenc xmm4,xmm1 + aesenc xmm5,xmm1 + mov DWORD[((64+12))+rsp],r9d + lea r9,[5+r8] + aesenc xmm6,xmm1 + aesenc xmm7,xmm1 + aesenc xmm8,xmm1 + aesenc xmm9,xmm1 + movups xmm1,XMMWORD[((112-128))+rcx] + bswap r9d + aesenc xmm2,xmm0 + aesenc xmm3,xmm0 + xor r9d,ebp + DB 0x66,0x90 + aesenc xmm4,xmm0 + aesenc xmm5,xmm0 + mov DWORD[((80+12))+rsp],r9d + lea r9,[6+r8] + aesenc xmm6,xmm0 + aesenc xmm7,xmm0 + aesenc xmm8,xmm0 + aesenc xmm9,xmm0 + movups xmm0,XMMWORD[((128-128))+rcx] + bswap r9d + aesenc xmm2,xmm1 + aesenc xmm3,xmm1 + xor r9d,ebp + DB 0x66,0x90 + aesenc xmm4,xmm1 + aesenc xmm5,xmm1 + mov DWORD[((96+12))+rsp],r9d + lea r9,[7+r8] + aesenc xmm6,xmm1 + aesenc xmm7,xmm1 + aesenc xmm8,xmm1 + aesenc xmm9,xmm1 + movups xmm1,XMMWORD[((144-128))+rcx] + bswap r9d + aesenc xmm2,xmm0 + aesenc xmm3,xmm0 + aesenc xmm4,xmm0 + xor r9d,ebp + movdqu xmm10,XMMWORD[rdi] + aesenc xmm5,xmm0 + mov DWORD[((112+12))+rsp],r9d + cmp eax,11 + aesenc xmm6,xmm0 + aesenc xmm7,xmm0 + aesenc xmm8,xmm0 + aesenc xmm9,xmm0 + movups xmm0,XMMWORD[((160-128))+rcx] + + jb NEAR $L$ctr32_enc_done + + aesenc xmm2,xmm1 + aesenc xmm3,xmm1 + aesenc xmm4,xmm1 + aesenc xmm5,xmm1 + aesenc xmm6,xmm1 + aesenc xmm7,xmm1 + aesenc xmm8,xmm1 + aesenc xmm9,xmm1 + movups xmm1,XMMWORD[((176-128))+rcx] + + aesenc xmm2,xmm0 + aesenc xmm3,xmm0 + aesenc xmm4,xmm0 + aesenc xmm5,xmm0 + aesenc xmm6,xmm0 + aesenc xmm7,xmm0 + aesenc xmm8,xmm0 + aesenc xmm9,xmm0 + movups xmm0,XMMWORD[((192-128))+rcx] + je NEAR $L$ctr32_enc_done + + aesenc xmm2,xmm1 + aesenc xmm3,xmm1 + aesenc xmm4,xmm1 + aesenc xmm5,xmm1 + aesenc xmm6,xmm1 + aesenc xmm7,xmm1 + aesenc xmm8,xmm1 + aesenc xmm9,xmm1 + movups 
xmm1,XMMWORD[((208-128))+rcx] + + aesenc xmm2,xmm0 + aesenc xmm3,xmm0 + aesenc xmm4,xmm0 + aesenc xmm5,xmm0 + aesenc xmm6,xmm0 + aesenc xmm7,xmm0 + aesenc xmm8,xmm0 + aesenc xmm9,xmm0 + movups xmm0,XMMWORD[((224-128))+rcx] + jmp NEAR $L$ctr32_enc_done + +ALIGN 16 +$L$ctr32_enc_done: + movdqu xmm11,XMMWORD[16+rdi] + pxor xmm10,xmm0 + movdqu xmm12,XMMWORD[32+rdi] + pxor xmm11,xmm0 + movdqu xmm13,XMMWORD[48+rdi] + pxor xmm12,xmm0 + movdqu xmm14,XMMWORD[64+rdi] + pxor xmm13,xmm0 + movdqu xmm15,XMMWORD[80+rdi] + pxor xmm14,xmm0 + prefetcht0 [448+rdi] + prefetcht0 [512+rdi] + pxor xmm15,xmm0 + aesenc xmm2,xmm1 + aesenc xmm3,xmm1 + aesenc xmm4,xmm1 + aesenc xmm5,xmm1 + aesenc xmm6,xmm1 + aesenc xmm7,xmm1 + aesenc xmm8,xmm1 + aesenc xmm9,xmm1 + movdqu xmm1,XMMWORD[96+rdi] + lea rdi,[128+rdi] + + aesenclast xmm2,xmm10 + pxor xmm1,xmm0 + movdqu xmm10,XMMWORD[((112-128))+rdi] + aesenclast xmm3,xmm11 + pxor xmm10,xmm0 + movdqa xmm11,XMMWORD[rsp] + aesenclast xmm4,xmm12 + aesenclast xmm5,xmm13 + movdqa xmm12,XMMWORD[16+rsp] + movdqa xmm13,XMMWORD[32+rsp] + aesenclast xmm6,xmm14 + aesenclast xmm7,xmm15 + movdqa xmm14,XMMWORD[48+rsp] + movdqa xmm15,XMMWORD[64+rsp] + aesenclast xmm8,xmm1 + movdqa xmm0,XMMWORD[80+rsp] + movups xmm1,XMMWORD[((16-128))+rcx] + aesenclast xmm9,xmm10 + + movups XMMWORD[rsi],xmm2 + movdqa xmm2,xmm11 + movups XMMWORD[16+rsi],xmm3 + movdqa xmm3,xmm12 + movups XMMWORD[32+rsi],xmm4 + movdqa xmm4,xmm13 + movups XMMWORD[48+rsi],xmm5 + movdqa xmm5,xmm14 + movups XMMWORD[64+rsi],xmm6 + movdqa xmm6,xmm15 + movups XMMWORD[80+rsi],xmm7 + movdqa xmm7,xmm0 + movups XMMWORD[96+rsi],xmm8 + movups XMMWORD[112+rsi],xmm9 + lea rsi,[128+rsi] + + sub rdx,8 + jnc NEAR $L$ctr32_loop8 + + add rdx,8 + jz NEAR $L$ctr32_done + lea rcx,[((-128))+rcx] + +$L$ctr32_tail: + + + lea rcx,[16+rcx] + cmp rdx,4 + jb NEAR $L$ctr32_loop3 + je NEAR $L$ctr32_loop4 + + + shl eax,4 + movdqa xmm8,XMMWORD[96+rsp] + pxor xmm9,xmm9 + + movups xmm0,XMMWORD[16+rcx] + aesenc xmm2,xmm1 + aesenc 
xmm3,xmm1 + lea rcx,[((32-16))+rax*1+rcx] + neg rax + aesenc xmm4,xmm1 + add rax,16 + movups xmm10,XMMWORD[rdi] + aesenc xmm5,xmm1 + aesenc xmm6,xmm1 + movups xmm11,XMMWORD[16+rdi] + movups xmm12,XMMWORD[32+rdi] + aesenc xmm7,xmm1 + aesenc xmm8,xmm1 + + call $L$enc_loop8_enter + + movdqu xmm13,XMMWORD[48+rdi] + pxor xmm2,xmm10 + movdqu xmm10,XMMWORD[64+rdi] + pxor xmm3,xmm11 + movdqu XMMWORD[rsi],xmm2 + pxor xmm4,xmm12 + movdqu XMMWORD[16+rsi],xmm3 + pxor xmm5,xmm13 + movdqu XMMWORD[32+rsi],xmm4 + pxor xmm6,xmm10 + movdqu XMMWORD[48+rsi],xmm5 + movdqu XMMWORD[64+rsi],xmm6 + cmp rdx,6 + jb NEAR $L$ctr32_done + + movups xmm11,XMMWORD[80+rdi] + xorps xmm7,xmm11 + movups XMMWORD[80+rsi],xmm7 + je NEAR $L$ctr32_done + + movups xmm12,XMMWORD[96+rdi] + xorps xmm8,xmm12 + movups XMMWORD[96+rsi],xmm8 + jmp NEAR $L$ctr32_done + +ALIGN 32 +$L$ctr32_loop4: + aesenc xmm2,xmm1 + lea rcx,[16+rcx] + dec eax + aesenc xmm3,xmm1 + aesenc xmm4,xmm1 + aesenc xmm5,xmm1 + movups xmm1,XMMWORD[rcx] + jnz NEAR $L$ctr32_loop4 + aesenclast xmm2,xmm1 + aesenclast xmm3,xmm1 + movups xmm10,XMMWORD[rdi] + movups xmm11,XMMWORD[16+rdi] + aesenclast xmm4,xmm1 + aesenclast xmm5,xmm1 + movups xmm12,XMMWORD[32+rdi] + movups xmm13,XMMWORD[48+rdi] + + xorps xmm2,xmm10 + movups XMMWORD[rsi],xmm2 + xorps xmm3,xmm11 + movups XMMWORD[16+rsi],xmm3 + pxor xmm4,xmm12 + movdqu XMMWORD[32+rsi],xmm4 + pxor xmm5,xmm13 + movdqu XMMWORD[48+rsi],xmm5 + jmp NEAR $L$ctr32_done + +ALIGN 32 +$L$ctr32_loop3: + aesenc xmm2,xmm1 + lea rcx,[16+rcx] + dec eax + aesenc xmm3,xmm1 + aesenc xmm4,xmm1 + movups xmm1,XMMWORD[rcx] + jnz NEAR $L$ctr32_loop3 + aesenclast xmm2,xmm1 + aesenclast xmm3,xmm1 + aesenclast xmm4,xmm1 + + movups xmm10,XMMWORD[rdi] + xorps xmm2,xmm10 + movups XMMWORD[rsi],xmm2 + cmp rdx,2 + jb NEAR $L$ctr32_done + + movups xmm11,XMMWORD[16+rdi] + xorps xmm3,xmm11 + movups XMMWORD[16+rsi],xmm3 + je NEAR $L$ctr32_done + + movups xmm12,XMMWORD[32+rdi] + xorps xmm4,xmm12 + movups XMMWORD[32+rsi],xmm4 + 
+$L$ctr32_done: + xorps xmm0,xmm0 + xor ebp,ebp + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + movaps xmm6,XMMWORD[((-168))+r11] + movaps XMMWORD[(-168)+r11],xmm0 + movaps xmm7,XMMWORD[((-152))+r11] + movaps XMMWORD[(-152)+r11],xmm0 + movaps xmm8,XMMWORD[((-136))+r11] + movaps XMMWORD[(-136)+r11],xmm0 + movaps xmm9,XMMWORD[((-120))+r11] + movaps XMMWORD[(-120)+r11],xmm0 + movaps xmm10,XMMWORD[((-104))+r11] + movaps XMMWORD[(-104)+r11],xmm0 + movaps xmm11,XMMWORD[((-88))+r11] + movaps XMMWORD[(-88)+r11],xmm0 + movaps xmm12,XMMWORD[((-72))+r11] + movaps XMMWORD[(-72)+r11],xmm0 + movaps xmm13,XMMWORD[((-56))+r11] + movaps XMMWORD[(-56)+r11],xmm0 + movaps xmm14,XMMWORD[((-40))+r11] + movaps XMMWORD[(-40)+r11],xmm0 + movaps xmm15,XMMWORD[((-24))+r11] + movaps XMMWORD[(-24)+r11],xmm0 + movaps XMMWORD[rsp],xmm0 + movaps XMMWORD[16+rsp],xmm0 + movaps XMMWORD[32+rsp],xmm0 + movaps XMMWORD[48+rsp],xmm0 + movaps XMMWORD[64+rsp],xmm0 + movaps XMMWORD[80+rsp],xmm0 + movaps XMMWORD[96+rsp],xmm0 + movaps XMMWORD[112+rsp],xmm0 + mov rbp,QWORD[((-8))+r11] + + lea rsp,[r11] + +$L$ctr32_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_aes_hw_ctr32_encrypt_blocks: +global aes_hw_cbc_encrypt + +ALIGN 16 +aes_hw_cbc_encrypt: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aes_hw_cbc_encrypt: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +_CET_ENDBR + test rdx,rdx + jz NEAR $L$cbc_ret + + mov r10d,DWORD[240+rcx] + mov r11,rcx + test r9d,r9d + jz NEAR $L$cbc_decrypt + + movups xmm2,XMMWORD[r8] + mov eax,r10d + cmp rdx,16 + jb NEAR $L$cbc_enc_tail + sub rdx,16 + jmp NEAR $L$cbc_enc_loop +ALIGN 16 +$L$cbc_enc_loop: + movups xmm3,XMMWORD[rdi] + lea rdi,[16+rdi] + + movups xmm0,XMMWORD[rcx] + movups xmm1,XMMWORD[16+rcx] + xorps xmm3,xmm0 + lea rcx,[32+rcx] + xorps xmm2,xmm3 +$L$oop_enc1_6: + aesenc xmm2,xmm1 
+ dec eax + movups xmm1,XMMWORD[rcx] + lea rcx,[16+rcx] + jnz NEAR $L$oop_enc1_6 + aesenclast xmm2,xmm1 + mov eax,r10d + mov rcx,r11 + movups XMMWORD[rsi],xmm2 + lea rsi,[16+rsi] + sub rdx,16 + jnc NEAR $L$cbc_enc_loop + add rdx,16 + jnz NEAR $L$cbc_enc_tail + pxor xmm0,xmm0 + pxor xmm1,xmm1 + movups XMMWORD[r8],xmm2 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + jmp NEAR $L$cbc_ret + +$L$cbc_enc_tail: + mov rcx,rdx + xchg rsi,rdi + DD 0x9066A4F3 + mov ecx,16 + sub rcx,rdx + xor eax,eax + DD 0x9066AAF3 + lea rdi,[((-16))+rdi] + mov eax,r10d + mov rsi,rdi + mov rcx,r11 + xor rdx,rdx + jmp NEAR $L$cbc_enc_loop + +ALIGN 16 +$L$cbc_decrypt: + cmp rdx,16 + jne NEAR $L$cbc_decrypt_bulk + + + + movdqu xmm2,XMMWORD[rdi] + movdqu xmm3,XMMWORD[r8] + movdqa xmm4,xmm2 + movups xmm0,XMMWORD[rcx] + movups xmm1,XMMWORD[16+rcx] + lea rcx,[32+rcx] + xorps xmm2,xmm0 +$L$oop_dec1_7: + aesdec xmm2,xmm1 + dec r10d + movups xmm1,XMMWORD[rcx] + lea rcx,[16+rcx] + jnz NEAR $L$oop_dec1_7 + aesdeclast xmm2,xmm1 + pxor xmm0,xmm0 + pxor xmm1,xmm1 + movdqu XMMWORD[r8],xmm4 + xorps xmm2,xmm3 + pxor xmm3,xmm3 + movups XMMWORD[rsi],xmm2 + pxor xmm2,xmm2 + jmp NEAR $L$cbc_ret +ALIGN 16 +$L$cbc_decrypt_bulk: + lea r11,[rsp] + + push rbp + + sub rsp,176 + and rsp,-16 + movaps XMMWORD[16+rsp],xmm6 + movaps XMMWORD[32+rsp],xmm7 + movaps XMMWORD[48+rsp],xmm8 + movaps XMMWORD[64+rsp],xmm9 + movaps XMMWORD[80+rsp],xmm10 + movaps XMMWORD[96+rsp],xmm11 + movaps XMMWORD[112+rsp],xmm12 + movaps XMMWORD[128+rsp],xmm13 + movaps XMMWORD[144+rsp],xmm14 + movaps XMMWORD[160+rsp],xmm15 +$L$cbc_decrypt_body: + mov rbp,rcx + movups xmm10,XMMWORD[r8] + mov eax,r10d + cmp rdx,0x50 + jbe NEAR $L$cbc_dec_tail + + movups xmm0,XMMWORD[rcx] + movdqu xmm2,XMMWORD[rdi] + movdqu xmm3,XMMWORD[16+rdi] + movdqa xmm11,xmm2 + movdqu xmm4,XMMWORD[32+rdi] + movdqa xmm12,xmm3 + movdqu xmm5,XMMWORD[48+rdi] + movdqa xmm13,xmm4 + movdqu xmm6,XMMWORD[64+rdi] + movdqa xmm14,xmm5 + movdqu xmm7,XMMWORD[80+rdi] + movdqa xmm15,xmm6 + cmp rdx,0x70 + jbe 
NEAR $L$cbc_dec_six_or_seven + + sub rdx,0x70 + lea rcx,[112+rcx] + jmp NEAR $L$cbc_dec_loop8_enter +ALIGN 16 +$L$cbc_dec_loop8: + movups XMMWORD[rsi],xmm9 + lea rsi,[16+rsi] +$L$cbc_dec_loop8_enter: + movdqu xmm8,XMMWORD[96+rdi] + pxor xmm2,xmm0 + movdqu xmm9,XMMWORD[112+rdi] + pxor xmm3,xmm0 + movups xmm1,XMMWORD[((16-112))+rcx] + pxor xmm4,xmm0 + mov rbp,-1 + cmp rdx,0x70 + pxor xmm5,xmm0 + pxor xmm6,xmm0 + pxor xmm7,xmm0 + pxor xmm8,xmm0 + + aesdec xmm2,xmm1 + pxor xmm9,xmm0 + movups xmm0,XMMWORD[((32-112))+rcx] + aesdec xmm3,xmm1 + aesdec xmm4,xmm1 + aesdec xmm5,xmm1 + aesdec xmm6,xmm1 + aesdec xmm7,xmm1 + aesdec xmm8,xmm1 + adc rbp,0 + and rbp,128 + aesdec xmm9,xmm1 + add rbp,rdi + movups xmm1,XMMWORD[((48-112))+rcx] + aesdec xmm2,xmm0 + aesdec xmm3,xmm0 + aesdec xmm4,xmm0 + aesdec xmm5,xmm0 + aesdec xmm6,xmm0 + aesdec xmm7,xmm0 + aesdec xmm8,xmm0 + aesdec xmm9,xmm0 + movups xmm0,XMMWORD[((64-112))+rcx] + nop + aesdec xmm2,xmm1 + aesdec xmm3,xmm1 + aesdec xmm4,xmm1 + aesdec xmm5,xmm1 + aesdec xmm6,xmm1 + aesdec xmm7,xmm1 + aesdec xmm8,xmm1 + aesdec xmm9,xmm1 + movups xmm1,XMMWORD[((80-112))+rcx] + nop + aesdec xmm2,xmm0 + aesdec xmm3,xmm0 + aesdec xmm4,xmm0 + aesdec xmm5,xmm0 + aesdec xmm6,xmm0 + aesdec xmm7,xmm0 + aesdec xmm8,xmm0 + aesdec xmm9,xmm0 + movups xmm0,XMMWORD[((96-112))+rcx] + nop + aesdec xmm2,xmm1 + aesdec xmm3,xmm1 + aesdec xmm4,xmm1 + aesdec xmm5,xmm1 + aesdec xmm6,xmm1 + aesdec xmm7,xmm1 + aesdec xmm8,xmm1 + aesdec xmm9,xmm1 + movups xmm1,XMMWORD[((112-112))+rcx] + nop + aesdec xmm2,xmm0 + aesdec xmm3,xmm0 + aesdec xmm4,xmm0 + aesdec xmm5,xmm0 + aesdec xmm6,xmm0 + aesdec xmm7,xmm0 + aesdec xmm8,xmm0 + aesdec xmm9,xmm0 + movups xmm0,XMMWORD[((128-112))+rcx] + nop + aesdec xmm2,xmm1 + aesdec xmm3,xmm1 + aesdec xmm4,xmm1 + aesdec xmm5,xmm1 + aesdec xmm6,xmm1 + aesdec xmm7,xmm1 + aesdec xmm8,xmm1 + aesdec xmm9,xmm1 + movups xmm1,XMMWORD[((144-112))+rcx] + cmp eax,11 + aesdec xmm2,xmm0 + aesdec xmm3,xmm0 + aesdec xmm4,xmm0 + aesdec xmm5,xmm0 + 
aesdec xmm6,xmm0 + aesdec xmm7,xmm0 + aesdec xmm8,xmm0 + aesdec xmm9,xmm0 + movups xmm0,XMMWORD[((160-112))+rcx] + jb NEAR $L$cbc_dec_done + aesdec xmm2,xmm1 + aesdec xmm3,xmm1 + aesdec xmm4,xmm1 + aesdec xmm5,xmm1 + aesdec xmm6,xmm1 + aesdec xmm7,xmm1 + aesdec xmm8,xmm1 + aesdec xmm9,xmm1 + movups xmm1,XMMWORD[((176-112))+rcx] + nop + aesdec xmm2,xmm0 + aesdec xmm3,xmm0 + aesdec xmm4,xmm0 + aesdec xmm5,xmm0 + aesdec xmm6,xmm0 + aesdec xmm7,xmm0 + aesdec xmm8,xmm0 + aesdec xmm9,xmm0 + movups xmm0,XMMWORD[((192-112))+rcx] + je NEAR $L$cbc_dec_done + aesdec xmm2,xmm1 + aesdec xmm3,xmm1 + aesdec xmm4,xmm1 + aesdec xmm5,xmm1 + aesdec xmm6,xmm1 + aesdec xmm7,xmm1 + aesdec xmm8,xmm1 + aesdec xmm9,xmm1 + movups xmm1,XMMWORD[((208-112))+rcx] + nop + aesdec xmm2,xmm0 + aesdec xmm3,xmm0 + aesdec xmm4,xmm0 + aesdec xmm5,xmm0 + aesdec xmm6,xmm0 + aesdec xmm7,xmm0 + aesdec xmm8,xmm0 + aesdec xmm9,xmm0 + movups xmm0,XMMWORD[((224-112))+rcx] + jmp NEAR $L$cbc_dec_done +ALIGN 16 +$L$cbc_dec_done: + aesdec xmm2,xmm1 + aesdec xmm3,xmm1 + pxor xmm10,xmm0 + pxor xmm11,xmm0 + aesdec xmm4,xmm1 + aesdec xmm5,xmm1 + pxor xmm12,xmm0 + pxor xmm13,xmm0 + aesdec xmm6,xmm1 + aesdec xmm7,xmm1 + pxor xmm14,xmm0 + pxor xmm15,xmm0 + aesdec xmm8,xmm1 + aesdec xmm9,xmm1 + movdqu xmm1,XMMWORD[80+rdi] + + aesdeclast xmm2,xmm10 + movdqu xmm10,XMMWORD[96+rdi] + pxor xmm1,xmm0 + aesdeclast xmm3,xmm11 + pxor xmm10,xmm0 + movdqu xmm0,XMMWORD[112+rdi] + aesdeclast xmm4,xmm12 + lea rdi,[128+rdi] + movdqu xmm11,XMMWORD[rbp] + aesdeclast xmm5,xmm13 + aesdeclast xmm6,xmm14 + movdqu xmm12,XMMWORD[16+rbp] + movdqu xmm13,XMMWORD[32+rbp] + aesdeclast xmm7,xmm15 + aesdeclast xmm8,xmm1 + movdqu xmm14,XMMWORD[48+rbp] + movdqu xmm15,XMMWORD[64+rbp] + aesdeclast xmm9,xmm10 + movdqa xmm10,xmm0 + movdqu xmm1,XMMWORD[80+rbp] + movups xmm0,XMMWORD[((-112))+rcx] + + movups XMMWORD[rsi],xmm2 + movdqa xmm2,xmm11 + movups XMMWORD[16+rsi],xmm3 + movdqa xmm3,xmm12 + movups XMMWORD[32+rsi],xmm4 + movdqa xmm4,xmm13 + movups 
XMMWORD[48+rsi],xmm5 + movdqa xmm5,xmm14 + movups XMMWORD[64+rsi],xmm6 + movdqa xmm6,xmm15 + movups XMMWORD[80+rsi],xmm7 + movdqa xmm7,xmm1 + movups XMMWORD[96+rsi],xmm8 + lea rsi,[112+rsi] + + sub rdx,0x80 + ja NEAR $L$cbc_dec_loop8 + + movaps xmm2,xmm9 + lea rcx,[((-112))+rcx] + add rdx,0x70 + jle NEAR $L$cbc_dec_clear_tail_collected + movups XMMWORD[rsi],xmm9 + lea rsi,[16+rsi] + cmp rdx,0x50 + jbe NEAR $L$cbc_dec_tail + + movaps xmm2,xmm11 +$L$cbc_dec_six_or_seven: + cmp rdx,0x60 + ja NEAR $L$cbc_dec_seven + + movaps xmm8,xmm7 + call _aesni_decrypt6 + pxor xmm2,xmm10 + movaps xmm10,xmm8 + pxor xmm3,xmm11 + movdqu XMMWORD[rsi],xmm2 + pxor xmm4,xmm12 + movdqu XMMWORD[16+rsi],xmm3 + pxor xmm3,xmm3 + pxor xmm5,xmm13 + movdqu XMMWORD[32+rsi],xmm4 + pxor xmm4,xmm4 + pxor xmm6,xmm14 + movdqu XMMWORD[48+rsi],xmm5 + pxor xmm5,xmm5 + pxor xmm7,xmm15 + movdqu XMMWORD[64+rsi],xmm6 + pxor xmm6,xmm6 + lea rsi,[80+rsi] + movdqa xmm2,xmm7 + pxor xmm7,xmm7 + jmp NEAR $L$cbc_dec_tail_collected + +ALIGN 16 +$L$cbc_dec_seven: + movups xmm8,XMMWORD[96+rdi] + xorps xmm9,xmm9 + call _aesni_decrypt8 + movups xmm9,XMMWORD[80+rdi] + pxor xmm2,xmm10 + movups xmm10,XMMWORD[96+rdi] + pxor xmm3,xmm11 + movdqu XMMWORD[rsi],xmm2 + pxor xmm4,xmm12 + movdqu XMMWORD[16+rsi],xmm3 + pxor xmm3,xmm3 + pxor xmm5,xmm13 + movdqu XMMWORD[32+rsi],xmm4 + pxor xmm4,xmm4 + pxor xmm6,xmm14 + movdqu XMMWORD[48+rsi],xmm5 + pxor xmm5,xmm5 + pxor xmm7,xmm15 + movdqu XMMWORD[64+rsi],xmm6 + pxor xmm6,xmm6 + pxor xmm8,xmm9 + movdqu XMMWORD[80+rsi],xmm7 + pxor xmm7,xmm7 + lea rsi,[96+rsi] + movdqa xmm2,xmm8 + pxor xmm8,xmm8 + pxor xmm9,xmm9 + jmp NEAR $L$cbc_dec_tail_collected + +$L$cbc_dec_tail: + movups xmm2,XMMWORD[rdi] + sub rdx,0x10 + jbe NEAR $L$cbc_dec_one + + movups xmm3,XMMWORD[16+rdi] + movaps xmm11,xmm2 + sub rdx,0x10 + jbe NEAR $L$cbc_dec_two + + movups xmm4,XMMWORD[32+rdi] + movaps xmm12,xmm3 + sub rdx,0x10 + jbe NEAR $L$cbc_dec_three + + movups xmm5,XMMWORD[48+rdi] + movaps xmm13,xmm4 + sub rdx,0x10 + 
jbe NEAR $L$cbc_dec_four + + movups xmm6,XMMWORD[64+rdi] + movaps xmm14,xmm5 + movaps xmm15,xmm6 + xorps xmm7,xmm7 + call _aesni_decrypt6 + pxor xmm2,xmm10 + movaps xmm10,xmm15 + pxor xmm3,xmm11 + movdqu XMMWORD[rsi],xmm2 + pxor xmm4,xmm12 + movdqu XMMWORD[16+rsi],xmm3 + pxor xmm3,xmm3 + pxor xmm5,xmm13 + movdqu XMMWORD[32+rsi],xmm4 + pxor xmm4,xmm4 + pxor xmm6,xmm14 + movdqu XMMWORD[48+rsi],xmm5 + pxor xmm5,xmm5 + lea rsi,[64+rsi] + movdqa xmm2,xmm6 + pxor xmm6,xmm6 + pxor xmm7,xmm7 + sub rdx,0x10 + jmp NEAR $L$cbc_dec_tail_collected + +ALIGN 16 +$L$cbc_dec_one: + movaps xmm11,xmm2 + movups xmm0,XMMWORD[rcx] + movups xmm1,XMMWORD[16+rcx] + lea rcx,[32+rcx] + xorps xmm2,xmm0 +$L$oop_dec1_8: + aesdec xmm2,xmm1 + dec eax + movups xmm1,XMMWORD[rcx] + lea rcx,[16+rcx] + jnz NEAR $L$oop_dec1_8 + aesdeclast xmm2,xmm1 + xorps xmm2,xmm10 + movaps xmm10,xmm11 + jmp NEAR $L$cbc_dec_tail_collected +ALIGN 16 +$L$cbc_dec_two: + movaps xmm12,xmm3 + call _aesni_decrypt2 + pxor xmm2,xmm10 + movaps xmm10,xmm12 + pxor xmm3,xmm11 + movdqu XMMWORD[rsi],xmm2 + movdqa xmm2,xmm3 + pxor xmm3,xmm3 + lea rsi,[16+rsi] + jmp NEAR $L$cbc_dec_tail_collected +ALIGN 16 +$L$cbc_dec_three: + movaps xmm13,xmm4 + call _aesni_decrypt3 + pxor xmm2,xmm10 + movaps xmm10,xmm13 + pxor xmm3,xmm11 + movdqu XMMWORD[rsi],xmm2 + pxor xmm4,xmm12 + movdqu XMMWORD[16+rsi],xmm3 + pxor xmm3,xmm3 + movdqa xmm2,xmm4 + pxor xmm4,xmm4 + lea rsi,[32+rsi] + jmp NEAR $L$cbc_dec_tail_collected +ALIGN 16 +$L$cbc_dec_four: + movaps xmm14,xmm5 + call _aesni_decrypt4 + pxor xmm2,xmm10 + movaps xmm10,xmm14 + pxor xmm3,xmm11 + movdqu XMMWORD[rsi],xmm2 + pxor xmm4,xmm12 + movdqu XMMWORD[16+rsi],xmm3 + pxor xmm3,xmm3 + pxor xmm5,xmm13 + movdqu XMMWORD[32+rsi],xmm4 + pxor xmm4,xmm4 + movdqa xmm2,xmm5 + pxor xmm5,xmm5 + lea rsi,[48+rsi] + jmp NEAR $L$cbc_dec_tail_collected + +ALIGN 16 +$L$cbc_dec_clear_tail_collected: + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 +$L$cbc_dec_tail_collected: + movups XMMWORD[r8],xmm10 + and 
rdx,15 + jnz NEAR $L$cbc_dec_tail_partial + movups XMMWORD[rsi],xmm2 + pxor xmm2,xmm2 + jmp NEAR $L$cbc_dec_ret +ALIGN 16 +$L$cbc_dec_tail_partial: + movaps XMMWORD[rsp],xmm2 + pxor xmm2,xmm2 + mov rcx,16 + mov rdi,rsi + sub rcx,rdx + lea rsi,[rsp] + DD 0x9066A4F3 + movdqa XMMWORD[rsp],xmm2 + +$L$cbc_dec_ret: + xorps xmm0,xmm0 + pxor xmm1,xmm1 + movaps xmm6,XMMWORD[16+rsp] + movaps XMMWORD[16+rsp],xmm0 + movaps xmm7,XMMWORD[32+rsp] + movaps XMMWORD[32+rsp],xmm0 + movaps xmm8,XMMWORD[48+rsp] + movaps XMMWORD[48+rsp],xmm0 + movaps xmm9,XMMWORD[64+rsp] + movaps XMMWORD[64+rsp],xmm0 + movaps xmm10,XMMWORD[80+rsp] + movaps XMMWORD[80+rsp],xmm0 + movaps xmm11,XMMWORD[96+rsp] + movaps XMMWORD[96+rsp],xmm0 + movaps xmm12,XMMWORD[112+rsp] + movaps XMMWORD[112+rsp],xmm0 + movaps xmm13,XMMWORD[128+rsp] + movaps XMMWORD[128+rsp],xmm0 + movaps xmm14,XMMWORD[144+rsp] + movaps XMMWORD[144+rsp],xmm0 + movaps xmm15,XMMWORD[160+rsp] + movaps XMMWORD[160+rsp],xmm0 + mov rbp,QWORD[((-8))+r11] + + lea rsp,[r11] + +$L$cbc_ret: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_aes_hw_cbc_encrypt: +global aes_hw_encrypt_key_to_decrypt_key + +ALIGN 16 +aes_hw_encrypt_key_to_decrypt_key: + +_CET_ENDBR + + mov edx,DWORD[240+rcx] + shl edx,4 + + lea r8,[16+rdx*1+rcx] + + movups xmm0,XMMWORD[rcx] + movups xmm1,XMMWORD[r8] + movups XMMWORD[r8],xmm0 + movups XMMWORD[rcx],xmm1 + lea rcx,[16+rcx] + lea r8,[((-16))+r8] + +$L$dec_key_inverse: + movups xmm0,XMMWORD[rcx] + movups xmm1,XMMWORD[r8] + aesimc xmm0,xmm0 + aesimc xmm1,xmm1 + lea rcx,[16+rcx] + lea r8,[((-16))+r8] + movups XMMWORD[16+r8],xmm0 + movups XMMWORD[(-16)+rcx],xmm1 + cmp r8,rcx + ja NEAR $L$dec_key_inverse + + movups xmm0,XMMWORD[rcx] + aesimc xmm0,xmm0 + pxor xmm1,xmm1 + movups XMMWORD[r8],xmm0 + pxor xmm0,xmm0 + ret + + +global aes_hw_set_encrypt_key_base + +ALIGN 16 +aes_hw_set_encrypt_key_base: + +$L$SEH_begin_aes_hw_set_encrypt_key_base_1: +_CET_ENDBR +%ifdef BORINGSSL_DISPATCH_TEST + mov 
BYTE[((BORINGSSL_function_hit+3))],1 +%endif + sub rsp,8 + +$L$SEH_prologue_aes_hw_set_encrypt_key_base_2: +$L$SEH_endprologue_aes_hw_set_encrypt_key_base_3: + movups xmm0,XMMWORD[rcx] + xorps xmm4,xmm4 + lea rax,[16+r8] + cmp edx,256 + je NEAR $L$14rounds + cmp edx,192 + je NEAR $L$12rounds + cmp edx,128 + jne NEAR $L$bad_keybits + +$L$10rounds: + mov edx,9 + + movups XMMWORD[r8],xmm0 + aeskeygenassist xmm1,xmm0,0x1 + call $L$key_expansion_128_cold + aeskeygenassist xmm1,xmm0,0x2 + call $L$key_expansion_128 + aeskeygenassist xmm1,xmm0,0x4 + call $L$key_expansion_128 + aeskeygenassist xmm1,xmm0,0x8 + call $L$key_expansion_128 + aeskeygenassist xmm1,xmm0,0x10 + call $L$key_expansion_128 + aeskeygenassist xmm1,xmm0,0x20 + call $L$key_expansion_128 + aeskeygenassist xmm1,xmm0,0x40 + call $L$key_expansion_128 + aeskeygenassist xmm1,xmm0,0x80 + call $L$key_expansion_128 + aeskeygenassist xmm1,xmm0,0x1b + call $L$key_expansion_128 + aeskeygenassist xmm1,xmm0,0x36 + call $L$key_expansion_128 + movups XMMWORD[rax],xmm0 + mov DWORD[80+rax],edx + xor eax,eax + jmp NEAR $L$enc_key_ret + +ALIGN 16 +$L$12rounds: + movq xmm2,QWORD[16+rcx] + mov edx,11 + + movups XMMWORD[r8],xmm0 + aeskeygenassist xmm1,xmm2,0x1 + call $L$key_expansion_192a_cold + aeskeygenassist xmm1,xmm2,0x2 + call $L$key_expansion_192b + aeskeygenassist xmm1,xmm2,0x4 + call $L$key_expansion_192a + aeskeygenassist xmm1,xmm2,0x8 + call $L$key_expansion_192b + aeskeygenassist xmm1,xmm2,0x10 + call $L$key_expansion_192a + aeskeygenassist xmm1,xmm2,0x20 + call $L$key_expansion_192b + aeskeygenassist xmm1,xmm2,0x40 + call $L$key_expansion_192a + aeskeygenassist xmm1,xmm2,0x80 + call $L$key_expansion_192b + movups XMMWORD[rax],xmm0 + mov DWORD[48+rax],edx + xor rax,rax + jmp NEAR $L$enc_key_ret + +ALIGN 16 +$L$14rounds: + movups xmm2,XMMWORD[16+rcx] + mov edx,13 + lea rax,[16+rax] + + movups XMMWORD[r8],xmm0 + movups XMMWORD[16+r8],xmm2 + aeskeygenassist xmm1,xmm2,0x1 + call $L$key_expansion_256a_cold + 
aeskeygenassist xmm1,xmm0,0x1 + call $L$key_expansion_256b + aeskeygenassist xmm1,xmm2,0x2 + call $L$key_expansion_256a + aeskeygenassist xmm1,xmm0,0x2 + call $L$key_expansion_256b + aeskeygenassist xmm1,xmm2,0x4 + call $L$key_expansion_256a + aeskeygenassist xmm1,xmm0,0x4 + call $L$key_expansion_256b + aeskeygenassist xmm1,xmm2,0x8 + call $L$key_expansion_256a + aeskeygenassist xmm1,xmm0,0x8 + call $L$key_expansion_256b + aeskeygenassist xmm1,xmm2,0x10 + call $L$key_expansion_256a + aeskeygenassist xmm1,xmm0,0x10 + call $L$key_expansion_256b + aeskeygenassist xmm1,xmm2,0x20 + call $L$key_expansion_256a + aeskeygenassist xmm1,xmm0,0x20 + call $L$key_expansion_256b + aeskeygenassist xmm1,xmm2,0x40 + call $L$key_expansion_256a + movups XMMWORD[rax],xmm0 + mov DWORD[16+rax],edx + xor rax,rax + jmp NEAR $L$enc_key_ret + +ALIGN 16 +$L$bad_keybits: + mov rax,-2 +$L$enc_key_ret: + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + add rsp,8 + + ret + +$L$SEH_end_aes_hw_set_encrypt_key_base_4: + +ALIGN 16 +$L$key_expansion_128: + + movups XMMWORD[rax],xmm0 + lea rax,[16+rax] +$L$key_expansion_128_cold: + shufps xmm4,xmm0,16 + xorps xmm0,xmm4 + shufps xmm4,xmm0,140 + xorps xmm0,xmm4 + shufps xmm1,xmm1,255 + xorps xmm0,xmm1 + ret + + +ALIGN 16 +$L$key_expansion_192a: + + movups XMMWORD[rax],xmm0 + lea rax,[16+rax] +$L$key_expansion_192a_cold: + movaps xmm5,xmm2 +$L$key_expansion_192b_warm: + shufps xmm4,xmm0,16 + movdqa xmm3,xmm2 + xorps xmm0,xmm4 + shufps xmm4,xmm0,140 + pslldq xmm3,4 + xorps xmm0,xmm4 + pshufd xmm1,xmm1,85 + pxor xmm2,xmm3 + pxor xmm0,xmm1 + pshufd xmm3,xmm0,255 + pxor xmm2,xmm3 + ret + + +ALIGN 16 +$L$key_expansion_192b: + + movaps xmm3,xmm0 + shufps xmm5,xmm0,68 + movups XMMWORD[rax],xmm5 + shufps xmm3,xmm2,78 + movups XMMWORD[16+rax],xmm3 + lea rax,[32+rax] + jmp NEAR $L$key_expansion_192b_warm + + +ALIGN 16 +$L$key_expansion_256a: + + movups XMMWORD[rax],xmm2 + lea rax,[16+rax] 
+$L$key_expansion_256a_cold: + shufps xmm4,xmm0,16 + xorps xmm0,xmm4 + shufps xmm4,xmm0,140 + xorps xmm0,xmm4 + shufps xmm1,xmm1,255 + xorps xmm0,xmm1 + ret + + +ALIGN 16 +$L$key_expansion_256b: + + movups XMMWORD[rax],xmm0 + lea rax,[16+rax] + + shufps xmm4,xmm2,16 + xorps xmm2,xmm4 + shufps xmm4,xmm2,140 + xorps xmm2,xmm4 + shufps xmm1,xmm1,170 + xorps xmm2,xmm1 + ret + + + +global aes_hw_set_encrypt_key_alt + +ALIGN 16 +aes_hw_set_encrypt_key_alt: + +$L$SEH_begin_aes_hw_set_encrypt_key_alt_1: +_CET_ENDBR +%ifdef BORINGSSL_DISPATCH_TEST + mov BYTE[((BORINGSSL_function_hit+3))],1 +%endif + sub rsp,8 + +$L$SEH_prologue_aes_hw_set_encrypt_key_alt_2: +$L$SEH_endprologue_aes_hw_set_encrypt_key_alt_3: + movups xmm0,XMMWORD[rcx] + xorps xmm4,xmm4 + lea rax,[16+r8] + cmp edx,256 + je NEAR $L$14rounds_alt + cmp edx,192 + je NEAR $L$12rounds_alt + cmp edx,128 + jne NEAR $L$bad_keybits_alt + + mov edx,9 + movdqa xmm5,XMMWORD[$L$key_rotate] + mov r10d,8 + movdqa xmm4,XMMWORD[$L$key_rcon1] + movdqa xmm2,xmm0 + movdqu XMMWORD[r8],xmm0 + jmp NEAR $L$oop_key128 + +ALIGN 16 +$L$oop_key128: + pshufb xmm0,xmm5 + aesenclast xmm0,xmm4 + pslld xmm4,1 + lea rax,[16+rax] + + movdqa xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm2,xmm3 + + pxor xmm0,xmm2 + movdqu XMMWORD[(-16)+rax],xmm0 + movdqa xmm2,xmm0 + + dec r10d + jnz NEAR $L$oop_key128 + + movdqa xmm4,XMMWORD[$L$key_rcon1b] + + pshufb xmm0,xmm5 + aesenclast xmm0,xmm4 + pslld xmm4,1 + + movdqa xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm2,xmm3 + + pxor xmm0,xmm2 + movdqu XMMWORD[rax],xmm0 + + movdqa xmm2,xmm0 + pshufb xmm0,xmm5 + aesenclast xmm0,xmm4 + + movdqa xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm3,xmm2 + pslldq xmm2,4 + pxor xmm2,xmm3 + + pxor xmm0,xmm2 + movdqu XMMWORD[16+rax],xmm0 + + mov DWORD[96+rax],edx + xor eax,eax + jmp NEAR $L$enc_key_ret_alt + +ALIGN 16 +$L$12rounds_alt: + movq 
xmm2,QWORD[16+rcx] + mov edx,11 + movdqa xmm5,XMMWORD[$L$key_rotate192] + movdqa xmm4,XMMWORD[$L$key_rcon1] + mov r10d,8 + movdqu XMMWORD[r8],xmm0 + jmp NEAR $L$oop_key192 + +ALIGN 16 +$L$oop_key192: + movq QWORD[rax],xmm2 + movdqa xmm1,xmm2 + pshufb xmm2,xmm5 + aesenclast xmm2,xmm4 + pslld xmm4,1 + lea rax,[24+rax] + + movdqa xmm3,xmm0 + pslldq xmm0,4 + pxor xmm3,xmm0 + pslldq xmm0,4 + pxor xmm3,xmm0 + pslldq xmm0,4 + pxor xmm0,xmm3 + + pshufd xmm3,xmm0,0xff + pxor xmm3,xmm1 + pslldq xmm1,4 + pxor xmm3,xmm1 + + pxor xmm0,xmm2 + pxor xmm2,xmm3 + movdqu XMMWORD[(-16)+rax],xmm0 + + dec r10d + jnz NEAR $L$oop_key192 + + mov DWORD[32+rax],edx + xor eax,eax + jmp NEAR $L$enc_key_ret_alt + +ALIGN 16 +$L$14rounds_alt: + movups xmm2,XMMWORD[16+rcx] + mov edx,13 + lea rax,[16+rax] + movdqa xmm5,XMMWORD[$L$key_rotate] + movdqa xmm4,XMMWORD[$L$key_rcon1] + mov r10d,7 + movdqu XMMWORD[r8],xmm0 + movdqa xmm1,xmm2 + movdqu XMMWORD[16+r8],xmm2 + jmp NEAR $L$oop_key256 + +ALIGN 16 +$L$oop_key256: + pshufb xmm2,xmm5 + aesenclast xmm2,xmm4 + + movdqa xmm3,xmm0 + pslldq xmm0,4 + pxor xmm3,xmm0 + pslldq xmm0,4 + pxor xmm3,xmm0 + pslldq xmm0,4 + pxor xmm0,xmm3 + pslld xmm4,1 + + pxor xmm0,xmm2 + movdqu XMMWORD[rax],xmm0 + + dec r10d + jz NEAR $L$done_key256 + + pshufd xmm2,xmm0,0xff + pxor xmm3,xmm3 + aesenclast xmm2,xmm3 + + movdqa xmm3,xmm1 + pslldq xmm1,4 + pxor xmm3,xmm1 + pslldq xmm1,4 + pxor xmm3,xmm1 + pslldq xmm1,4 + pxor xmm1,xmm3 + + pxor xmm2,xmm1 + movdqu XMMWORD[16+rax],xmm2 + lea rax,[32+rax] + movdqa xmm1,xmm2 + + jmp NEAR $L$oop_key256 + +$L$done_key256: + mov DWORD[16+rax],edx + xor eax,eax + jmp NEAR $L$enc_key_ret_alt + +ALIGN 16 +$L$bad_keybits_alt: + mov rax,-2 +$L$enc_key_ret_alt: + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + add rsp,8 + + ret + +$L$SEH_end_aes_hw_set_encrypt_key_alt_4: + +section .rdata rdata align=8 +ALIGN 64 +$L$bswap_mask: + DB 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +$L$increment32: + DD 
6,6,6,0 +$L$increment64: + DD 1,0,0,0 +$L$xts_magic: + DD 0x87,0,1,0 +$L$increment1: + DB 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 +$L$key_rotate: + DD 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d +$L$key_rotate192: + DD 0x04070605,0x04070605,0x04070605,0x04070605 +$L$key_rcon1: + DD 1,1,1,1 +$L$key_rcon1b: + DD 0x1b,0x1b,0x1b,0x1b + + DB 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69 + DB 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83 + DB 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 + DB 115,108,46,111,114,103,62,0 +ALIGN 64 +section .text + +EXTERN __imp_RtlVirtualUnwind + +ALIGN 16 +ecb_ccm64_se_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + lea rsi,[rax] + lea rdi,[512+r8] + mov ecx,8 + DD 0xa548f3fc + lea rax,[88+rax] + + jmp NEAR $L$common_seh_tail + + + +ALIGN 16 +ctr_xts_se_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + mov rax,QWORD[208+r8] + + lea rsi,[((-168))+rax] + lea rdi,[512+r8] + mov ecx,20 + DD 0xa548f3fc + + mov rbp,QWORD[((-8))+rax] + mov QWORD[160+r8],rbp + jmp NEAR $L$common_seh_tail + + + +ALIGN 16 +cbc_se_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[152+r8] + mov rbx,QWORD[248+r8] + + lea 
r10,[$L$cbc_decrypt_bulk] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[120+r8] + + lea r10,[$L$cbc_decrypt_body] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[152+r8] + + lea r10,[$L$cbc_ret] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + lea rsi,[16+rax] + lea rdi,[512+r8] + mov ecx,20 + DD 0xa548f3fc + + mov rax,QWORD[208+r8] + + mov rbp,QWORD[((-8))+rax] + mov QWORD[160+r8],rbp + +$L$common_seh_tail: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + ret + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_aes_hw_ecb_encrypt wrt ..imagebase + DD $L$SEH_end_aes_hw_ecb_encrypt wrt ..imagebase + DD $L$SEH_info_ecb wrt ..imagebase + + DD $L$SEH_begin_aes_hw_ctr32_encrypt_blocks wrt ..imagebase + DD $L$SEH_end_aes_hw_ctr32_encrypt_blocks wrt ..imagebase + DD $L$SEH_info_ctr32 wrt ..imagebase + DD $L$SEH_begin_aes_hw_cbc_encrypt wrt ..imagebase + DD $L$SEH_end_aes_hw_cbc_encrypt wrt ..imagebase + DD $L$SEH_info_cbc wrt ..imagebase +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_ecb: + DB 9,0,0,0 + DD ecb_ccm64_se_handler wrt ..imagebase + DD $L$ecb_enc_body wrt ..imagebase,$L$ecb_enc_ret wrt ..imagebase +$L$SEH_info_ctr32: + DB 9,0,0,0 + DD ctr_xts_se_handler wrt ..imagebase + DD $L$ctr32_body wrt ..imagebase,$L$ctr32_epilogue wrt ..imagebase +$L$SEH_info_cbc: + DB 9,0,0,0 + DD cbc_se_handler wrt ..imagebase +section .pdata +ALIGN 4 + DD $L$SEH_begin_aes_hw_set_encrypt_key_base_1 wrt 
..imagebase + DD $L$SEH_end_aes_hw_set_encrypt_key_base_4 wrt ..imagebase + DD $L$SEH_info_aes_hw_set_encrypt_key_base_0 wrt ..imagebase + + DD $L$SEH_begin_aes_hw_set_encrypt_key_alt_1 wrt ..imagebase + DD $L$SEH_end_aes_hw_set_encrypt_key_alt_4 wrt ..imagebase + DD $L$SEH_info_aes_hw_set_encrypt_key_alt_0 wrt ..imagebase + + +section .xdata +ALIGN 4 +$L$SEH_info_aes_hw_set_encrypt_key_base_0: + DB 1 + DB $L$SEH_endprologue_aes_hw_set_encrypt_key_base_3-$L$SEH_begin_aes_hw_set_encrypt_key_base_1 + DB 1 + DB 0 + DB $L$SEH_prologue_aes_hw_set_encrypt_key_base_2-$L$SEH_begin_aes_hw_set_encrypt_key_base_1 + DB 2 + + DW 0 +$L$SEH_info_aes_hw_set_encrypt_key_alt_0: + DB 1 + DB $L$SEH_endprologue_aes_hw_set_encrypt_key_alt_3-$L$SEH_begin_aes_hw_set_encrypt_key_alt_1 + DB 1 + DB 0 + DB $L$SEH_prologue_aes_hw_set_encrypt_key_alt_2-$L$SEH_begin_aes_hw_set_encrypt_key_alt_1 + DB 2 + + DW 0 +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/third_party/boringssl/gen/bcm/aesv8-armv7-linux.S b/third_party/boringssl/gen/bcm/aesv8-armv7-linux.S new file mode 100644 index 00000000..2b3929a1 --- /dev/null +++ b/third_party/boringssl/gen/bcm/aesv8-armv7-linux.S @@ -0,0 +1,782 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__) +#if __ARM_MAX_ARCH__>=7 +.text +.arch armv7-a @ don't confuse not-so-latest binutils with argv8 :-) +.fpu neon +.code 32 +#undef __thumb2__ +.align 5 +.Lrcon: +.long 0x01,0x01,0x01,0x01 +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d @ rotate-n-splat +.long 0x1b,0x1b,0x1b,0x1b + +.text + +.globl aes_hw_set_encrypt_key +.hidden aes_hw_set_encrypt_key +.type aes_hw_set_encrypt_key,%function +.align 5 +aes_hw_set_encrypt_key: +.Lenc_key: + mov r3,#-2 + cmp r1,#128 + blt .Lenc_key_abort + cmp r1,#256 + bgt .Lenc_key_abort + tst r1,#0x3f + bne .Lenc_key_abort + + adr r3,.Lrcon + cmp r1,#192 + + veor q0,q0,q0 + vld1.8 {q3},[r0]! + mov r1,#8 @ reuse r1 + vld1.32 {q1,q2},[r3]! + + blt .Loop128 + beq .L192 + b .L256 + +.align 4 +.Loop128: + vtbl.8 d20,{q3},d4 + vtbl.8 d21,{q3},d5 + vext.8 q9,q0,q3,#12 + vst1.32 {q3},[r2]! +.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 + subs r1,r1,#1 + + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q10,q10,q1 + veor q3,q3,q9 + vshl.u8 q1,q1,#1 + veor q3,q3,q10 + bne .Loop128 + + vld1.32 {q1},[r3] + + vtbl.8 d20,{q3},d4 + vtbl.8 d21,{q3},d5 + vext.8 q9,q0,q3,#12 + vst1.32 {q3},[r2]! +.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 + + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q10,q10,q1 + veor q3,q3,q9 + vshl.u8 q1,q1,#1 + veor q3,q3,q10 + + vtbl.8 d20,{q3},d4 + vtbl.8 d21,{q3},d5 + vext.8 q9,q0,q3,#12 + vst1.32 {q3},[r2]! +.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 + + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q10,q10,q1 + veor q3,q3,q9 + veor q3,q3,q10 + vst1.32 {q3},[r2] + add r2,r2,#0x50 + + mov r12,#10 + b .Ldone + +.align 4 +.L192: + vld1.8 {d16},[r0]! + vmov.i8 q10,#8 @ borrow q10 + vst1.32 {q3},[r2]! + vsub.i8 q2,q2,q10 @ adjust the mask + +.Loop192: + vtbl.8 d20,{q8},d4 + vtbl.8 d21,{q8},d5 + vext.8 q9,q0,q3,#12 + vst1.32 {d16},[r2]! 
+.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 + subs r1,r1,#1 + + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q3,q3,q9 + + vdup.32 q9,d7[1] + veor q9,q9,q8 + veor q10,q10,q1 + vext.8 q8,q0,q8,#12 + vshl.u8 q1,q1,#1 + veor q8,q8,q9 + veor q3,q3,q10 + veor q8,q8,q10 + vst1.32 {q3},[r2]! + bne .Loop192 + + mov r12,#12 + add r2,r2,#0x20 + b .Ldone + +.align 4 +.L256: + vld1.8 {q8},[r0] + mov r1,#7 + mov r12,#14 + vst1.32 {q3},[r2]! + +.Loop256: + vtbl.8 d20,{q8},d4 + vtbl.8 d21,{q8},d5 + vext.8 q9,q0,q3,#12 + vst1.32 {q8},[r2]! +.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 + subs r1,r1,#1 + + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q3,q3,q9 + vext.8 q9,q0,q9,#12 + veor q10,q10,q1 + veor q3,q3,q9 + vshl.u8 q1,q1,#1 + veor q3,q3,q10 + vst1.32 {q3},[r2]! + beq .Ldone + + vdup.32 q10,d7[1] + vext.8 q9,q0,q8,#12 +.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 + + veor q8,q8,q9 + vext.8 q9,q0,q9,#12 + veor q8,q8,q9 + vext.8 q9,q0,q9,#12 + veor q8,q8,q9 + + veor q8,q8,q10 + b .Loop256 + +.Ldone: + str r12,[r2] + mov r3,#0 + +.Lenc_key_abort: + mov r0,r3 @ return value + + bx lr +.size aes_hw_set_encrypt_key,.-aes_hw_set_encrypt_key + +.globl aes_hw_set_decrypt_key +.hidden aes_hw_set_decrypt_key +.type aes_hw_set_decrypt_key,%function +.align 5 +aes_hw_set_decrypt_key: + stmdb sp!,{r4,lr} + bl .Lenc_key + + cmp r0,#0 + bne .Ldec_key_abort + + sub r2,r2,#240 @ restore original r2 + mov r4,#-16 + add r0,r2,r12,lsl#4 @ end of key schedule + + vld1.32 {q0},[r2] + vld1.32 {q1},[r0] + vst1.32 {q0},[r0],r4 + vst1.32 {q1},[r2]! + +.Loop_imc: + vld1.32 {q0},[r2] + vld1.32 {q1},[r0] +.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 + vst1.32 {q0},[r0],r4 + vst1.32 {q1},[r2]! 
+ cmp r0,r2 + bhi .Loop_imc + + vld1.32 {q0},[r2] +.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 + vst1.32 {q0},[r0] + + eor r0,r0,r0 @ return value +.Ldec_key_abort: + ldmia sp!,{r4,pc} +.size aes_hw_set_decrypt_key,.-aes_hw_set_decrypt_key +.globl aes_hw_encrypt +.hidden aes_hw_encrypt +.type aes_hw_encrypt,%function +.align 5 +aes_hw_encrypt: + AARCH64_VALID_CALL_TARGET + ldr r3,[r2,#240] + vld1.32 {q0},[r2]! + vld1.8 {q2},[r0] + sub r3,r3,#2 + vld1.32 {q1},[r2]! + +.Loop_enc: +.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0 +.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 + vld1.32 {q0},[r2]! + subs r3,r3,#2 +.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1 +.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 + vld1.32 {q1},[r2]! + bgt .Loop_enc + +.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0 +.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 + vld1.32 {q0},[r2] +.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1 + veor q2,q2,q0 + + vst1.8 {q2},[r1] + bx lr +.size aes_hw_encrypt,.-aes_hw_encrypt +.globl aes_hw_decrypt +.hidden aes_hw_decrypt +.type aes_hw_decrypt,%function +.align 5 +aes_hw_decrypt: + AARCH64_VALID_CALL_TARGET + ldr r3,[r2,#240] + vld1.32 {q0},[r2]! + vld1.8 {q2},[r0] + sub r3,r3,#2 + vld1.32 {q1},[r2]! + +.Loop_dec: +.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0 +.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 + vld1.32 {q0},[r2]! + subs r3,r3,#2 +.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1 +.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 + vld1.32 {q1},[r2]! 
+ bgt .Loop_dec + +.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0 +.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 + vld1.32 {q0},[r2] +.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1 + veor q2,q2,q0 + + vst1.8 {q2},[r1] + bx lr +.size aes_hw_decrypt,.-aes_hw_decrypt +.globl aes_hw_cbc_encrypt +.hidden aes_hw_cbc_encrypt +.type aes_hw_cbc_encrypt,%function +.align 5 +aes_hw_cbc_encrypt: + mov ip,sp + stmdb sp!,{r4,r5,r6,r7,r8,lr} + vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so + ldmia ip,{r4,r5} @ load remaining args + subs r2,r2,#16 + mov r8,#16 + blo .Lcbc_abort + moveq r8,#0 + + cmp r5,#0 @ en- or decrypting? + ldr r5,[r3,#240] + and r2,r2,#-16 + vld1.8 {q6},[r4] + vld1.8 {q0},[r0],r8 + + vld1.32 {q8,q9},[r3] @ load key schedule... + sub r5,r5,#6 + add r7,r3,r5,lsl#4 @ pointer to last 7 round keys + sub r5,r5,#2 + vld1.32 {q10,q11},[r7]! + vld1.32 {q12,q13},[r7]! + vld1.32 {q14,q15},[r7]! + vld1.32 {q7},[r7] + + add r7,r3,#32 + mov r6,r5 + beq .Lcbc_dec + + cmp r5,#2 + veor q0,q0,q6 + veor q5,q8,q7 + beq .Lcbc_enc128 + + vld1.32 {q2,q3},[r7] + add r7,r3,#16 + add r6,r3,#16*4 + add r12,r3,#16*5 +.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + add r14,r3,#16*6 + add r3,r3,#16*7 + b .Lenter_cbc_enc + +.align 4 +.Loop_cbc_enc: +.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vst1.8 {q6},[r1]! 
+.Lenter_cbc_enc: +.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vld1.32 {q8},[r6] + cmp r5,#4 +.byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vld1.32 {q9},[r12] + beq .Lcbc_enc192 + +.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vld1.32 {q8},[r14] +.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vld1.32 {q9},[r3] + nop + +.Lcbc_enc192: +.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + subs r2,r2,#16 +.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + moveq r8,#0 +.byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vld1.8 {q8},[r0],r8 +.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + veor q8,q8,q5 +.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vld1.32 {q9},[r7] @ re-pre-load rndkey[1] +.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 + veor q6,q0,q7 + bhs .Loop_cbc_enc + + vst1.8 {q6},[r1]! + b .Lcbc_done + +.align 5 +.Lcbc_enc128: + vld1.32 {q2,q3},[r7] +.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + b .Lenter_cbc_enc128 +.Loop_cbc_enc128: +.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vst1.8 {q6},[r1]! 
+.Lenter_cbc_enc128: +.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + subs r2,r2,#16 +.byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + moveq r8,#0 +.byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + vld1.8 {q8},[r0],r8 +.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 + veor q8,q8,q5 +.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 + veor q6,q0,q7 + bhs .Loop_cbc_enc128 + + vst1.8 {q6},[r1]! + b .Lcbc_done +.align 5 +.Lcbc_dec: + vld1.8 {q10},[r0]! + subs r2,r2,#32 @ bias + add r6,r5,#2 + vorr q3,q0,q0 + vorr q1,q0,q0 + vorr q11,q10,q10 + blo .Lcbc_dec_tail + + vorr q1,q10,q10 + vld1.8 {q10},[r0]! + vorr q2,q0,q0 + vorr q3,q1,q1 + vorr q11,q10,q10 + +.Loop3x_cbc_dec: +.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8 +.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 +.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 +.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 +.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + vld1.32 {q8},[r7]! + subs r6,r6,#2 +.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9 +.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 +.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 +.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 +.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + vld1.32 {q9},[r7]! 
+ bgt .Loop3x_cbc_dec + +.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8 +.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 +.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 +.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 +.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + veor q4,q6,q7 + subs r2,r2,#0x30 + veor q5,q2,q7 + movlo r6,r2 @ r6, r6, is zero at this point +.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9 +.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 +.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 +.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 +.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + veor q9,q3,q7 + add r0,r0,r6 @ r0 is adjusted in such way that + @ at exit from the loop q1-q10 + @ are loaded with last "words" + vorr q6,q11,q11 + mov r7,r3 +.byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12 +.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 +.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 +.byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12 +.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + vld1.8 {q2},[r0]! +.byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13 +.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 +.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 +.byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13 +.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + vld1.8 {q3},[r0]! +.byte 0x6c,0x03,0xb0,0xf3 @ aesd q0,q14 +.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 +.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 +.byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14 +.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + vld1.8 {q11},[r0]! +.byte 0x6e,0x03,0xb0,0xf3 @ aesd q0,q15 +.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15 +.byte 0x6e,0x43,0xf0,0xf3 @ aesd q10,q15 + vld1.32 {q8},[r7]! @ re-pre-load rndkey[0] + add r6,r5,#2 + veor q4,q4,q0 + veor q5,q5,q1 + veor q10,q10,q9 + vld1.32 {q9},[r7]! @ re-pre-load rndkey[1] + vst1.8 {q4},[r1]! + vorr q0,q2,q2 + vst1.8 {q5},[r1]! + vorr q1,q3,q3 + vst1.8 {q10},[r1]! 
+ vorr q10,q11,q11 + bhs .Loop3x_cbc_dec + + cmn r2,#0x30 + beq .Lcbc_done + nop + +.Lcbc_dec_tail: +.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 +.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 +.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + vld1.32 {q8},[r7]! + subs r6,r6,#2 +.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 +.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 +.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + vld1.32 {q9},[r7]! + bgt .Lcbc_dec_tail + +.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 +.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 +.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 +.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 +.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 +.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 +.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 +.byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12 +.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + cmn r2,#0x20 +.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 +.byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13 +.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + veor q5,q6,q7 +.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14 +.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 +.byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14 +.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 + veor q9,q3,q7 +.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15 +.byte 0x6e,0x43,0xf0,0xf3 @ aesd q10,q15 + beq .Lcbc_dec_one + veor q5,q5,q1 + veor q9,q9,q10 + vorr q6,q11,q11 + vst1.8 {q5},[r1]! + vst1.8 {q9},[r1]! + b .Lcbc_done + +.Lcbc_dec_one: + veor q5,q5,q10 + vorr q6,q11,q11 + vst1.8 {q5},[r1]! 
+ +.Lcbc_done: + vst1.8 {q6},[r4] +.Lcbc_abort: + vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} + ldmia sp!,{r4,r5,r6,r7,r8,pc} +.size aes_hw_cbc_encrypt,.-aes_hw_cbc_encrypt +.globl aes_hw_ctr32_encrypt_blocks +.hidden aes_hw_ctr32_encrypt_blocks +.type aes_hw_ctr32_encrypt_blocks,%function +.align 5 +aes_hw_ctr32_encrypt_blocks: + mov ip,sp + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,lr} + vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so + ldr r4, [ip] @ load remaining arg + ldr r5,[r3,#240] + + ldr r8, [r4, #12] + vld1.32 {q0},[r4] + + vld1.32 {q8,q9},[r3] @ load key schedule... + sub r5,r5,#4 + mov r12,#16 + cmp r2,#2 + add r7,r3,r5,lsl#4 @ pointer to last 5 round keys + sub r5,r5,#2 + vld1.32 {q12,q13},[r7]! + vld1.32 {q14,q15},[r7]! + vld1.32 {q7},[r7] + add r7,r3,#32 + mov r6,r5 + movlo r12,#0 + + @ ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are + @ affected by silicon errata #1742098 [0] and #1655431 [1], + @ respectively, where the second instruction of an aese/aesmc + @ instruction pair may execute twice if an interrupt is taken right + @ after the first instruction consumes an input register of which a + @ single 32-bit lane has been updated the last time it was modified. + @ + @ This function uses a counter in one 32-bit lane. The + @ could write to q1 and q10 directly, but that trips this bugs. + @ We write to q6 and copy to the final register as a workaround. 
+ @ + @ [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice + @ [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice +#ifndef __ARMEB__ + rev r8, r8 +#endif + add r10, r8, #1 + vorr q6,q0,q0 + rev r10, r10 + vmov.32 d13[1],r10 + add r8, r8, #2 + vorr q1,q6,q6 + bls .Lctr32_tail + rev r12, r8 + vmov.32 d13[1],r12 + sub r2,r2,#3 @ bias + vorr q10,q6,q6 + b .Loop3x_ctr32 + +.align 4 +.Loop3x_ctr32: +.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 +.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 +.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8 +.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10 + vld1.32 {q8},[r7]! + subs r6,r6,#2 +.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 +.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 +.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9 +.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10 + vld1.32 {q9},[r7]! + bgt .Loop3x_ctr32 + +.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 +.byte 0x80,0x83,0xb0,0xf3 @ aesmc q4,q0 +.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 +.byte 0x82,0xa3,0xb0,0xf3 @ aesmc q5,q1 + vld1.8 {q2},[r0]! + add r9,r8,#1 +.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8 +.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10 + vld1.8 {q3},[r0]! + rev r9,r9 +.byte 0x22,0x83,0xb0,0xf3 @ aese q4,q9 +.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 +.byte 0x22,0xa3,0xb0,0xf3 @ aese q5,q9 +.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 + vld1.8 {q11},[r0]! 
+ mov r7,r3 +.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9 +.byte 0xa4,0x23,0xf0,0xf3 @ aesmc q9,q10 +.byte 0x28,0x83,0xb0,0xf3 @ aese q4,q12 +.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 +.byte 0x28,0xa3,0xb0,0xf3 @ aese q5,q12 +.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 + veor q2,q2,q7 + add r10,r8,#2 +.byte 0x28,0x23,0xf0,0xf3 @ aese q9,q12 +.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9 + veor q3,q3,q7 + add r8,r8,#3 +.byte 0x2a,0x83,0xb0,0xf3 @ aese q4,q13 +.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 +.byte 0x2a,0xa3,0xb0,0xf3 @ aese q5,q13 +.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 + @ Note the logic to update q0, q1, and q1 is written to work + @ around a bug in ARM Cortex-A57 and Cortex-A72 cores running in + @ 32-bit mode. See the comment above. + veor q11,q11,q7 + vmov.32 d13[1], r9 +.byte 0x2a,0x23,0xf0,0xf3 @ aese q9,q13 +.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9 + vorr q0,q6,q6 + rev r10,r10 +.byte 0x2c,0x83,0xb0,0xf3 @ aese q4,q14 +.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 + vmov.32 d13[1], r10 + rev r12,r8 +.byte 0x2c,0xa3,0xb0,0xf3 @ aese q5,q14 +.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 + vorr q1,q6,q6 + vmov.32 d13[1], r12 +.byte 0x2c,0x23,0xf0,0xf3 @ aese q9,q14 +.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9 + vorr q10,q6,q6 + subs r2,r2,#3 +.byte 0x2e,0x83,0xb0,0xf3 @ aese q4,q15 +.byte 0x2e,0xa3,0xb0,0xf3 @ aese q5,q15 +.byte 0x2e,0x23,0xf0,0xf3 @ aese q9,q15 + + veor q2,q2,q4 + vld1.32 {q8},[r7]! @ re-pre-load rndkey[0] + vst1.8 {q2},[r1]! + veor q3,q3,q5 + mov r6,r5 + vst1.8 {q3},[r1]! + veor q11,q11,q9 + vld1.32 {q9},[r7]! @ re-pre-load rndkey[1] + vst1.8 {q11},[r1]! + bhs .Loop3x_ctr32 + + adds r2,r2,#3 + beq .Lctr32_done + cmp r2,#1 + mov r12,#16 + moveq r12,#0 + +.Lctr32_tail: +.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 +.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + vld1.32 {q8},[r7]! 
+ subs r6,r6,#2 +.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 +.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + vld1.32 {q9},[r7]! + bgt .Lctr32_tail + +.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 +.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 +.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 +.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + vld1.8 {q2},[r0],r12 +.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x28,0x23,0xb0,0xf3 @ aese q1,q12 +.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + vld1.8 {q3},[r0] +.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x2a,0x23,0xb0,0xf3 @ aese q1,q13 +.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + veor q2,q2,q7 +.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 +.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 +.byte 0x2c,0x23,0xb0,0xf3 @ aese q1,q14 +.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 + veor q3,q3,q7 +.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 +.byte 0x2e,0x23,0xb0,0xf3 @ aese q1,q15 + + cmp r2,#1 + veor q2,q2,q0 + veor q3,q3,q1 + vst1.8 {q2},[r1]! + beq .Lctr32_done + vst1.8 {q3},[r1] + +.Lctr32_done: + vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,pc} +.size aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) diff --git a/third_party/boringssl/gen/bcm/aesv8-armv8-apple.S b/third_party/boringssl/gen/bcm/aesv8-armv8-apple.S new file mode 100644 index 00000000..e34778be --- /dev/null +++ b/third_party/boringssl/gen/bcm/aesv8-armv8-apple.S @@ -0,0 +1,785 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) +#if __ARM_MAX_ARCH__>=7 +.text +.arch_extension crypto + +.section __TEXT,__const +.align 5 +Lrcon: +.long 0x01,0x01,0x01,0x01 +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat +.long 0x1b,0x1b,0x1b,0x1b + +.text + +.globl _aes_hw_set_encrypt_key +.private_extern _aes_hw_set_encrypt_key + +.align 5 +_aes_hw_set_encrypt_key: +Lenc_key: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + mov x3,#-2 + cmp w1,#128 + b.lt Lenc_key_abort + cmp w1,#256 + b.gt Lenc_key_abort + tst w1,#0x3f + b.ne Lenc_key_abort + + adrp x3,Lrcon@PAGE + add x3,x3,Lrcon@PAGEOFF + cmp w1,#192 + + eor v0.16b,v0.16b,v0.16b + ld1 {v3.16b},[x0],#16 + mov w1,#8 // reuse w1 + ld1 {v1.4s,v2.4s},[x3],#32 + + b.lt Loop128 + b.eq L192 + b L256 + +.align 4 +Loop128: + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + b.ne Loop128 + + ld1 {v1.4s},[x3] + + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + eor v3.16b,v3.16b,v6.16b + st1 {v3.4s},[x2] + add x2,x2,#0x50 
+ + mov w12,#10 + b Ldone + +.align 4 +L192: + ld1 {v4.8b},[x0],#8 + movi v6.16b,#8 // borrow v6.16b + st1 {v3.4s},[x2],#16 + sub v2.16b,v2.16b,v6.16b // adjust the mask + +Loop192: + tbl v6.16b,{v4.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v4.8b},[x2],#8 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + + dup v5.4s,v3.s[3] + eor v5.16b,v5.16b,v4.16b + eor v6.16b,v6.16b,v1.16b + ext v4.16b,v0.16b,v4.16b,#12 + shl v1.16b,v1.16b,#1 + eor v4.16b,v4.16b,v5.16b + eor v3.16b,v3.16b,v6.16b + eor v4.16b,v4.16b,v6.16b + st1 {v3.4s},[x2],#16 + b.ne Loop192 + + mov w12,#12 + add x2,x2,#0x20 + b Ldone + +.align 4 +L256: + ld1 {v4.16b},[x0] + mov w1,#7 + mov w12,#14 + st1 {v3.4s},[x2],#16 + +Loop256: + tbl v6.16b,{v4.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v4.4s},[x2],#16 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + st1 {v3.4s},[x2],#16 + b.eq Ldone + + dup v6.4s,v3.s[3] // just splat + ext v5.16b,v0.16b,v4.16b,#12 + aese v6.16b,v0.16b + + eor v4.16b,v4.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v4.16b,v4.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v4.16b,v4.16b,v5.16b + + eor v4.16b,v4.16b,v6.16b + b Loop256 + +Ldone: + str w12,[x2] + mov x3,#0 + +Lenc_key_abort: + mov x0,x3 // return value + ldr x29,[sp],#16 + ret + + +.globl _aes_hw_set_decrypt_key +.private_extern _aes_hw_set_decrypt_key + +.align 5 +_aes_hw_set_decrypt_key: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + bl Lenc_key + + cmp x0,#0 + b.ne Ldec_key_abort + + sub x2,x2,#240 // restore original x2 + mov x4,#-16 + add x0,x2,x12,lsl#4 // end of key schedule + + ld1 {v0.4s},[x2] + ld1 {v1.4s},[x0] + st1 {v0.4s},[x0],x4 + st1 {v1.4s},[x2],#16 + +Loop_imc: + ld1 {v0.4s},[x2] + ld1 {v1.4s},[x0] + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + st1 {v0.4s},[x0],x4 + st1 {v1.4s},[x2],#16 + cmp x0,x2 + b.hi Loop_imc + + ld1 {v0.4s},[x2] + aesimc v0.16b,v0.16b + st1 {v0.4s},[x0] + + eor x0,x0,x0 // return value +Ldec_key_abort: + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl _aes_hw_encrypt +.private_extern _aes_hw_encrypt + +.align 5 +_aes_hw_encrypt: + AARCH64_VALID_CALL_TARGET + ldr w3,[x2,#240] + ld1 {v0.4s},[x2],#16 + ld1 {v2.16b},[x0] + sub w3,w3,#2 + ld1 {v1.4s},[x2],#16 + +Loop_enc: + aese v2.16b,v0.16b + aesmc v2.16b,v2.16b + ld1 {v0.4s},[x2],#16 + subs w3,w3,#2 + aese v2.16b,v1.16b + aesmc v2.16b,v2.16b + ld1 {v1.4s},[x2],#16 + b.gt Loop_enc + + aese v2.16b,v0.16b + aesmc v2.16b,v2.16b + ld1 {v0.4s},[x2] + aese v2.16b,v1.16b + eor v2.16b,v2.16b,v0.16b + + st1 {v2.16b},[x1] + ret + +.globl _aes_hw_decrypt +.private_extern _aes_hw_decrypt + +.align 5 +_aes_hw_decrypt: + AARCH64_VALID_CALL_TARGET + ldr w3,[x2,#240] + ld1 {v0.4s},[x2],#16 + ld1 {v2.16b},[x0] + sub w3,w3,#2 + ld1 {v1.4s},[x2],#16 + +Loop_dec: + aesd v2.16b,v0.16b + aesimc v2.16b,v2.16b + ld1 {v0.4s},[x2],#16 + subs w3,w3,#2 + aesd v2.16b,v1.16b + aesimc v2.16b,v2.16b + ld1 {v1.4s},[x2],#16 + b.gt Loop_dec + + aesd v2.16b,v0.16b + aesimc v2.16b,v2.16b + ld1 {v0.4s},[x2] + aesd v2.16b,v1.16b + eor v2.16b,v2.16b,v0.16b + + st1 {v2.16b},[x1] + ret + +.globl _aes_hw_cbc_encrypt +.private_extern _aes_hw_cbc_encrypt + +.align 5 +_aes_hw_cbc_encrypt: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + subs x2,x2,#16 + mov x8,#16 + b.lo Lcbc_abort + csel x8,xzr,x8,eq + + cmp w5,#0 // en- or decrypting? + ldr w5,[x3,#240] + and x2,x2,#-16 + ld1 {v6.16b},[x4] + ld1 {v0.16b},[x0],x8 + + ld1 {v16.4s,v17.4s},[x3] // load key schedule... + sub w5,w5,#6 + add x7,x3,x5,lsl#4 // pointer to last 7 round keys + sub w5,w5,#2 + ld1 {v18.4s,v19.4s},[x7],#32 + ld1 {v20.4s,v21.4s},[x7],#32 + ld1 {v22.4s,v23.4s},[x7],#32 + ld1 {v7.4s},[x7] + + add x7,x3,#32 + mov w6,w5 + b.eq Lcbc_dec + + cmp w5,#2 + eor v0.16b,v0.16b,v6.16b + eor v5.16b,v16.16b,v7.16b + b.eq Lcbc_enc128 + + ld1 {v2.4s,v3.4s},[x7] + add x7,x3,#16 + add x6,x3,#16*4 + add x12,x3,#16*5 + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + add x14,x3,#16*6 + add x3,x3,#16*7 + b Lenter_cbc_enc + +.align 4 +Loop_cbc_enc: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + st1 {v6.16b},[x1],#16 +Lenter_cbc_enc: + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v0.16b,v2.16b + aesmc v0.16b,v0.16b + ld1 {v16.4s},[x6] + cmp w5,#4 + aese v0.16b,v3.16b + aesmc v0.16b,v0.16b + ld1 {v17.4s},[x12] + b.eq Lcbc_enc192 + + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + ld1 {v16.4s},[x14] + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + ld1 {v17.4s},[x3] + nop + +Lcbc_enc192: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + subs x2,x2,#16 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + csel x8,xzr,x8,eq + aese v0.16b,v18.16b + aesmc v0.16b,v0.16b + aese v0.16b,v19.16b + aesmc v0.16b,v0.16b + ld1 {v16.16b},[x0],x8 + aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + eor v16.16b,v16.16b,v5.16b + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + ld1 {v17.4s},[x7] // re-pre-load rndkey[1] + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + aese v0.16b,v23.16b + eor v6.16b,v0.16b,v7.16b + b.hs Loop_cbc_enc + + st1 {v6.16b},[x1],#16 + b Lcbc_done + +.align 5 +Lcbc_enc128: + ld1 {v2.4s,v3.4s},[x7] + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + b Lenter_cbc_enc128 +Loop_cbc_enc128: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + st1 {v6.16b},[x1],#16 
+Lenter_cbc_enc128: + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + subs x2,x2,#16 + aese v0.16b,v2.16b + aesmc v0.16b,v0.16b + csel x8,xzr,x8,eq + aese v0.16b,v3.16b + aesmc v0.16b,v0.16b + aese v0.16b,v18.16b + aesmc v0.16b,v0.16b + aese v0.16b,v19.16b + aesmc v0.16b,v0.16b + ld1 {v16.16b},[x0],x8 + aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + eor v16.16b,v16.16b,v5.16b + aese v0.16b,v23.16b + eor v6.16b,v0.16b,v7.16b + b.hs Loop_cbc_enc128 + + st1 {v6.16b},[x1],#16 + b Lcbc_done +.align 5 +Lcbc_dec: + ld1 {v18.16b},[x0],#16 + subs x2,x2,#32 // bias + add w6,w5,#2 + orr v3.16b,v0.16b,v0.16b + orr v1.16b,v0.16b,v0.16b + orr v19.16b,v18.16b,v18.16b + b.lo Lcbc_dec_tail + + orr v1.16b,v18.16b,v18.16b + ld1 {v18.16b},[x0],#16 + orr v2.16b,v0.16b,v0.16b + orr v3.16b,v1.16b,v1.16b + orr v19.16b,v18.16b,v18.16b + +Loop3x_cbc_dec: + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b + aesimc v18.16b,v18.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aesd v0.16b,v17.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v17.16b + aesimc v18.16b,v18.16b + ld1 {v17.4s},[x7],#16 + b.gt Loop3x_cbc_dec + + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b + aesimc v18.16b,v18.16b + eor v4.16b,v6.16b,v7.16b + subs x2,x2,#0x30 + eor v5.16b,v2.16b,v7.16b + csel x6,x2,x6,lo // x6, w6, is zero at this point + aesd v0.16b,v17.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v17.16b + aesimc v18.16b,v18.16b + eor v17.16b,v3.16b,v7.16b + add x0,x0,x6 // x0 is adjusted in such way that + // at exit from the loop v1.16b-v18.16b + // are loaded with last "words" + orr v6.16b,v19.16b,v19.16b + mov x7,x3 + aesd v0.16b,v20.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v20.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v20.16b + 
aesimc v18.16b,v18.16b + ld1 {v2.16b},[x0],#16 + aesd v0.16b,v21.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v21.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v21.16b + aesimc v18.16b,v18.16b + ld1 {v3.16b},[x0],#16 + aesd v0.16b,v22.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v22.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v22.16b + aesimc v18.16b,v18.16b + ld1 {v19.16b},[x0],#16 + aesd v0.16b,v23.16b + aesd v1.16b,v23.16b + aesd v18.16b,v23.16b + ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + add w6,w5,#2 + eor v4.16b,v4.16b,v0.16b + eor v5.16b,v5.16b,v1.16b + eor v18.16b,v18.16b,v17.16b + ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] + st1 {v4.16b},[x1],#16 + orr v0.16b,v2.16b,v2.16b + st1 {v5.16b},[x1],#16 + orr v1.16b,v3.16b,v3.16b + st1 {v18.16b},[x1],#16 + orr v18.16b,v19.16b,v19.16b + b.hs Loop3x_cbc_dec + + cmn x2,#0x30 + b.eq Lcbc_done + nop + +Lcbc_dec_tail: + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b + aesimc v18.16b,v18.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v17.16b + aesimc v18.16b,v18.16b + ld1 {v17.4s},[x7],#16 + b.gt Lcbc_dec_tail + + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b + aesimc v18.16b,v18.16b + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v17.16b + aesimc v18.16b,v18.16b + aesd v1.16b,v20.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v20.16b + aesimc v18.16b,v18.16b + cmn x2,#0x20 + aesd v1.16b,v21.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v21.16b + aesimc v18.16b,v18.16b + eor v5.16b,v6.16b,v7.16b + aesd v1.16b,v22.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v22.16b + aesimc v18.16b,v18.16b + eor v17.16b,v3.16b,v7.16b + aesd v1.16b,v23.16b + aesd v18.16b,v23.16b + b.eq Lcbc_dec_one + eor v5.16b,v5.16b,v1.16b + eor v17.16b,v17.16b,v18.16b + orr v6.16b,v19.16b,v19.16b + st1 {v5.16b},[x1],#16 + st1 {v17.16b},[x1],#16 + b Lcbc_done + +Lcbc_dec_one: + eor v5.16b,v5.16b,v18.16b + orr v6.16b,v19.16b,v19.16b + st1 {v5.16b},[x1],#16 + 
+Lcbc_done: + st1 {v6.16b},[x4] +Lcbc_abort: + ldr x29,[sp],#16 + ret + +.globl _aes_hw_ctr32_encrypt_blocks +.private_extern _aes_hw_ctr32_encrypt_blocks + +.align 5 +_aes_hw_ctr32_encrypt_blocks: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + ldr w5,[x3,#240] + + ldr w8, [x4, #12] + ld1 {v0.4s},[x4] + + ld1 {v16.4s,v17.4s},[x3] // load key schedule... + sub w5,w5,#4 + mov x12,#16 + cmp x2,#2 + add x7,x3,x5,lsl#4 // pointer to last 5 round keys + sub w5,w5,#2 + ld1 {v20.4s,v21.4s},[x7],#32 + ld1 {v22.4s,v23.4s},[x7],#32 + ld1 {v7.4s},[x7] + add x7,x3,#32 + mov w6,w5 + csel x12,xzr,x12,lo + + // ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are + // affected by silicon errata #1742098 [0] and #1655431 [1], + // respectively, where the second instruction of an aese/aesmc + // instruction pair may execute twice if an interrupt is taken right + // after the first instruction consumes an input register of which a + // single 32-bit lane has been updated the last time it was modified. + // + // This function uses a counter in one 32-bit lane. The vmov lines + // could write to v1.16b and v18.16b directly, but that trips this bugs. + // We write to v6.16b and copy to the final register as a workaround. 
+ // + // [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice + // [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice +#ifndef __AARCH64EB__ + rev w8, w8 +#endif + add w10, w8, #1 + orr v6.16b,v0.16b,v0.16b + rev w10, w10 + mov v6.s[3],w10 + add w8, w8, #2 + orr v1.16b,v6.16b,v6.16b + b.ls Lctr32_tail + rev w12, w8 + mov v6.s[3],w12 + sub x2,x2,#3 // bias + orr v18.16b,v6.16b,v6.16b + b Loop3x_ctr32 + +.align 4 +Loop3x_ctr32: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v18.16b,v16.16b + aesmc v18.16b,v18.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + aese v18.16b,v17.16b + aesmc v18.16b,v18.16b + ld1 {v17.4s},[x7],#16 + b.gt Loop3x_ctr32 + + aese v0.16b,v16.16b + aesmc v4.16b,v0.16b + aese v1.16b,v16.16b + aesmc v5.16b,v1.16b + ld1 {v2.16b},[x0],#16 + add w9,w8,#1 + aese v18.16b,v16.16b + aesmc v18.16b,v18.16b + ld1 {v3.16b},[x0],#16 + rev w9,w9 + aese v4.16b,v17.16b + aesmc v4.16b,v4.16b + aese v5.16b,v17.16b + aesmc v5.16b,v5.16b + ld1 {v19.16b},[x0],#16 + mov x7,x3 + aese v18.16b,v17.16b + aesmc v17.16b,v18.16b + aese v4.16b,v20.16b + aesmc v4.16b,v4.16b + aese v5.16b,v20.16b + aesmc v5.16b,v5.16b + eor v2.16b,v2.16b,v7.16b + add w10,w8,#2 + aese v17.16b,v20.16b + aesmc v17.16b,v17.16b + eor v3.16b,v3.16b,v7.16b + add w8,w8,#3 + aese v4.16b,v21.16b + aesmc v4.16b,v4.16b + aese v5.16b,v21.16b + aesmc v5.16b,v5.16b + // Note the logic to update v0.16b, v1.16b, and v1.16b is written to work + // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in + // 32-bit mode. See the comment above. 
+ eor v19.16b,v19.16b,v7.16b + mov v6.s[3], w9 + aese v17.16b,v21.16b + aesmc v17.16b,v17.16b + orr v0.16b,v6.16b,v6.16b + rev w10,w10 + aese v4.16b,v22.16b + aesmc v4.16b,v4.16b + mov v6.s[3], w10 + rev w12,w8 + aese v5.16b,v22.16b + aesmc v5.16b,v5.16b + orr v1.16b,v6.16b,v6.16b + mov v6.s[3], w12 + aese v17.16b,v22.16b + aesmc v17.16b,v17.16b + orr v18.16b,v6.16b,v6.16b + subs x2,x2,#3 + aese v4.16b,v23.16b + aese v5.16b,v23.16b + aese v17.16b,v23.16b + + eor v2.16b,v2.16b,v4.16b + ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + st1 {v2.16b},[x1],#16 + eor v3.16b,v3.16b,v5.16b + mov w6,w5 + st1 {v3.16b},[x1],#16 + eor v19.16b,v19.16b,v17.16b + ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] + st1 {v19.16b},[x1],#16 + b.hs Loop3x_ctr32 + + adds x2,x2,#3 + b.eq Lctr32_done + cmp x2,#1 + mov x12,#16 + csel x12,xzr,x12,eq + +Lctr32_tail: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + ld1 {v17.4s},[x7],#16 + b.gt Lctr32_tail + + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + ld1 {v2.16b},[x0],x12 + aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + aese v1.16b,v20.16b + aesmc v1.16b,v1.16b + ld1 {v3.16b},[x0] + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + aese v1.16b,v21.16b + aesmc v1.16b,v1.16b + eor v2.16b,v2.16b,v7.16b + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + aese v1.16b,v22.16b + aesmc v1.16b,v1.16b + eor v3.16b,v3.16b,v7.16b + aese v0.16b,v23.16b + aese v1.16b,v23.16b + + cmp x2,#1 + eor v2.16b,v2.16b,v0.16b + eor v3.16b,v3.16b,v1.16b + st1 {v2.16b},[x1],#16 + b.eq Lctr32_done + st1 {v3.16b},[x1] + +Lctr32_done: + ldr x29,[sp],#16 + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) diff --git 
a/third_party/boringssl/gen/bcm/aesv8-armv8-linux.S b/third_party/boringssl/gen/bcm/aesv8-armv8-linux.S new file mode 100644 index 00000000..cbf5c0a6 --- /dev/null +++ b/third_party/boringssl/gen/bcm/aesv8-armv8-linux.S @@ -0,0 +1,784 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) +#if __ARM_MAX_ARCH__>=7 +.text +.arch armv8-a+crypto +.section .rodata +.align 5 +.Lrcon: +.long 0x01,0x01,0x01,0x01 +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat +.long 0x1b,0x1b,0x1b,0x1b + +.text + +.globl aes_hw_set_encrypt_key +.hidden aes_hw_set_encrypt_key +.type aes_hw_set_encrypt_key,%function +.align 5 +aes_hw_set_encrypt_key: +.Lenc_key: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + mov x3,#-2 + cmp w1,#128 + b.lt .Lenc_key_abort + cmp w1,#256 + b.gt .Lenc_key_abort + tst w1,#0x3f + b.ne .Lenc_key_abort + + adrp x3,.Lrcon + add x3,x3,:lo12:.Lrcon + cmp w1,#192 + + eor v0.16b,v0.16b,v0.16b + ld1 {v3.16b},[x0],#16 + mov w1,#8 // reuse w1 + ld1 {v1.4s,v2.4s},[x3],#32 + + b.lt .Loop128 + b.eq .L192 + b .L256 + +.align 4 +.Loop128: + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + b.ne .Loop128 + + ld1 {v1.4s},[x3] + + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl 
v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + eor v3.16b,v3.16b,v6.16b + st1 {v3.4s},[x2] + add x2,x2,#0x50 + + mov w12,#10 + b .Ldone + +.align 4 +.L192: + ld1 {v4.8b},[x0],#8 + movi v6.16b,#8 // borrow v6.16b + st1 {v3.4s},[x2],#16 + sub v2.16b,v2.16b,v6.16b // adjust the mask + +.Loop192: + tbl v6.16b,{v4.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v4.8b},[x2],#8 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + + dup v5.4s,v3.s[3] + eor v5.16b,v5.16b,v4.16b + eor v6.16b,v6.16b,v1.16b + ext v4.16b,v0.16b,v4.16b,#12 + shl v1.16b,v1.16b,#1 + eor v4.16b,v4.16b,v5.16b + eor v3.16b,v3.16b,v6.16b + eor v4.16b,v4.16b,v6.16b + st1 {v3.4s},[x2],#16 + b.ne .Loop192 + + mov w12,#12 + add x2,x2,#0x20 + b .Ldone + +.align 4 +.L256: + ld1 {v4.16b},[x0] + mov w1,#7 + mov w12,#14 + st1 {v3.4s},[x2],#16 + +.Loop256: + tbl v6.16b,{v4.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v4.4s},[x2],#16 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + st1 {v3.4s},[x2],#16 + b.eq .Ldone + + dup v6.4s,v3.s[3] // just splat + ext v5.16b,v0.16b,v4.16b,#12 + aese v6.16b,v0.16b + + eor v4.16b,v4.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v4.16b,v4.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v4.16b,v4.16b,v5.16b + + eor v4.16b,v4.16b,v6.16b + b .Loop256 + +.Ldone: + str w12,[x2] + mov x3,#0 + +.Lenc_key_abort: + mov x0,x3 // return value + ldr x29,[sp],#16 + ret 
+.size aes_hw_set_encrypt_key,.-aes_hw_set_encrypt_key + +.globl aes_hw_set_decrypt_key +.hidden aes_hw_set_decrypt_key +.type aes_hw_set_decrypt_key,%function +.align 5 +aes_hw_set_decrypt_key: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + bl .Lenc_key + + cmp x0,#0 + b.ne .Ldec_key_abort + + sub x2,x2,#240 // restore original x2 + mov x4,#-16 + add x0,x2,x12,lsl#4 // end of key schedule + + ld1 {v0.4s},[x2] + ld1 {v1.4s},[x0] + st1 {v0.4s},[x0],x4 + st1 {v1.4s},[x2],#16 + +.Loop_imc: + ld1 {v0.4s},[x2] + ld1 {v1.4s},[x0] + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + st1 {v0.4s},[x0],x4 + st1 {v1.4s},[x2],#16 + cmp x0,x2 + b.hi .Loop_imc + + ld1 {v0.4s},[x2] + aesimc v0.16b,v0.16b + st1 {v0.4s},[x0] + + eor x0,x0,x0 // return value +.Ldec_key_abort: + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size aes_hw_set_decrypt_key,.-aes_hw_set_decrypt_key +.globl aes_hw_encrypt +.hidden aes_hw_encrypt +.type aes_hw_encrypt,%function +.align 5 +aes_hw_encrypt: + AARCH64_VALID_CALL_TARGET + ldr w3,[x2,#240] + ld1 {v0.4s},[x2],#16 + ld1 {v2.16b},[x0] + sub w3,w3,#2 + ld1 {v1.4s},[x2],#16 + +.Loop_enc: + aese v2.16b,v0.16b + aesmc v2.16b,v2.16b + ld1 {v0.4s},[x2],#16 + subs w3,w3,#2 + aese v2.16b,v1.16b + aesmc v2.16b,v2.16b + ld1 {v1.4s},[x2],#16 + b.gt .Loop_enc + + aese v2.16b,v0.16b + aesmc v2.16b,v2.16b + ld1 {v0.4s},[x2] + aese v2.16b,v1.16b + eor v2.16b,v2.16b,v0.16b + + st1 {v2.16b},[x1] + ret +.size aes_hw_encrypt,.-aes_hw_encrypt +.globl aes_hw_decrypt +.hidden aes_hw_decrypt +.type aes_hw_decrypt,%function +.align 5 +aes_hw_decrypt: + AARCH64_VALID_CALL_TARGET + ldr w3,[x2,#240] + ld1 {v0.4s},[x2],#16 + ld1 {v2.16b},[x0] + sub w3,w3,#2 + ld1 {v1.4s},[x2],#16 + +.Loop_dec: + aesd v2.16b,v0.16b + aesimc v2.16b,v2.16b + ld1 {v0.4s},[x2],#16 + subs w3,w3,#2 + aesd v2.16b,v1.16b + aesimc v2.16b,v2.16b + ld1 {v1.4s},[x2],#16 + b.gt .Loop_dec + + aesd v2.16b,v0.16b + aesimc v2.16b,v2.16b + ld1 {v0.4s},[x2] + aesd v2.16b,v1.16b 
+ eor v2.16b,v2.16b,v0.16b + + st1 {v2.16b},[x1] + ret +.size aes_hw_decrypt,.-aes_hw_decrypt +.globl aes_hw_cbc_encrypt +.hidden aes_hw_cbc_encrypt +.type aes_hw_cbc_encrypt,%function +.align 5 +aes_hw_cbc_encrypt: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + subs x2,x2,#16 + mov x8,#16 + b.lo .Lcbc_abort + csel x8,xzr,x8,eq + + cmp w5,#0 // en- or decrypting? + ldr w5,[x3,#240] + and x2,x2,#-16 + ld1 {v6.16b},[x4] + ld1 {v0.16b},[x0],x8 + + ld1 {v16.4s,v17.4s},[x3] // load key schedule... + sub w5,w5,#6 + add x7,x3,x5,lsl#4 // pointer to last 7 round keys + sub w5,w5,#2 + ld1 {v18.4s,v19.4s},[x7],#32 + ld1 {v20.4s,v21.4s},[x7],#32 + ld1 {v22.4s,v23.4s},[x7],#32 + ld1 {v7.4s},[x7] + + add x7,x3,#32 + mov w6,w5 + b.eq .Lcbc_dec + + cmp w5,#2 + eor v0.16b,v0.16b,v6.16b + eor v5.16b,v16.16b,v7.16b + b.eq .Lcbc_enc128 + + ld1 {v2.4s,v3.4s},[x7] + add x7,x3,#16 + add x6,x3,#16*4 + add x12,x3,#16*5 + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + add x14,x3,#16*6 + add x3,x3,#16*7 + b .Lenter_cbc_enc + +.align 4 +.Loop_cbc_enc: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + st1 {v6.16b},[x1],#16 +.Lenter_cbc_enc: + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v0.16b,v2.16b + aesmc v0.16b,v0.16b + ld1 {v16.4s},[x6] + cmp w5,#4 + aese v0.16b,v3.16b + aesmc v0.16b,v0.16b + ld1 {v17.4s},[x12] + b.eq .Lcbc_enc192 + + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + ld1 {v16.4s},[x14] + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + ld1 {v17.4s},[x3] + nop + +.Lcbc_enc192: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + subs x2,x2,#16 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + csel x8,xzr,x8,eq + aese v0.16b,v18.16b + aesmc v0.16b,v0.16b + aese v0.16b,v19.16b + aesmc v0.16b,v0.16b + ld1 {v16.16b},[x0],x8 + aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + eor v16.16b,v16.16b,v5.16b + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + ld1 {v17.4s},[x7] // re-pre-load rndkey[1] + aese 
v0.16b,v22.16b + aesmc v0.16b,v0.16b + aese v0.16b,v23.16b + eor v6.16b,v0.16b,v7.16b + b.hs .Loop_cbc_enc + + st1 {v6.16b},[x1],#16 + b .Lcbc_done + +.align 5 +.Lcbc_enc128: + ld1 {v2.4s,v3.4s},[x7] + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + b .Lenter_cbc_enc128 +.Loop_cbc_enc128: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + st1 {v6.16b},[x1],#16 +.Lenter_cbc_enc128: + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + subs x2,x2,#16 + aese v0.16b,v2.16b + aesmc v0.16b,v0.16b + csel x8,xzr,x8,eq + aese v0.16b,v3.16b + aesmc v0.16b,v0.16b + aese v0.16b,v18.16b + aesmc v0.16b,v0.16b + aese v0.16b,v19.16b + aesmc v0.16b,v0.16b + ld1 {v16.16b},[x0],x8 + aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + eor v16.16b,v16.16b,v5.16b + aese v0.16b,v23.16b + eor v6.16b,v0.16b,v7.16b + b.hs .Loop_cbc_enc128 + + st1 {v6.16b},[x1],#16 + b .Lcbc_done +.align 5 +.Lcbc_dec: + ld1 {v18.16b},[x0],#16 + subs x2,x2,#32 // bias + add w6,w5,#2 + orr v3.16b,v0.16b,v0.16b + orr v1.16b,v0.16b,v0.16b + orr v19.16b,v18.16b,v18.16b + b.lo .Lcbc_dec_tail + + orr v1.16b,v18.16b,v18.16b + ld1 {v18.16b},[x0],#16 + orr v2.16b,v0.16b,v0.16b + orr v3.16b,v1.16b,v1.16b + orr v19.16b,v18.16b,v18.16b + +.Loop3x_cbc_dec: + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b + aesimc v18.16b,v18.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aesd v0.16b,v17.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v17.16b + aesimc v18.16b,v18.16b + ld1 {v17.4s},[x7],#16 + b.gt .Loop3x_cbc_dec + + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b + aesimc v18.16b,v18.16b + eor v4.16b,v6.16b,v7.16b + subs x2,x2,#0x30 + eor v5.16b,v2.16b,v7.16b + csel x6,x2,x6,lo // x6, w6, is zero at this point + aesd v0.16b,v17.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b 
+ aesd v18.16b,v17.16b + aesimc v18.16b,v18.16b + eor v17.16b,v3.16b,v7.16b + add x0,x0,x6 // x0 is adjusted in such way that + // at exit from the loop v1.16b-v18.16b + // are loaded with last "words" + orr v6.16b,v19.16b,v19.16b + mov x7,x3 + aesd v0.16b,v20.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v20.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v20.16b + aesimc v18.16b,v18.16b + ld1 {v2.16b},[x0],#16 + aesd v0.16b,v21.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v21.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v21.16b + aesimc v18.16b,v18.16b + ld1 {v3.16b},[x0],#16 + aesd v0.16b,v22.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v22.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v22.16b + aesimc v18.16b,v18.16b + ld1 {v19.16b},[x0],#16 + aesd v0.16b,v23.16b + aesd v1.16b,v23.16b + aesd v18.16b,v23.16b + ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + add w6,w5,#2 + eor v4.16b,v4.16b,v0.16b + eor v5.16b,v5.16b,v1.16b + eor v18.16b,v18.16b,v17.16b + ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] + st1 {v4.16b},[x1],#16 + orr v0.16b,v2.16b,v2.16b + st1 {v5.16b},[x1],#16 + orr v1.16b,v3.16b,v3.16b + st1 {v18.16b},[x1],#16 + orr v18.16b,v19.16b,v19.16b + b.hs .Loop3x_cbc_dec + + cmn x2,#0x30 + b.eq .Lcbc_done + nop + +.Lcbc_dec_tail: + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b + aesimc v18.16b,v18.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v17.16b + aesimc v18.16b,v18.16b + ld1 {v17.4s},[x7],#16 + b.gt .Lcbc_dec_tail + + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b + aesimc v18.16b,v18.16b + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v17.16b + aesimc v18.16b,v18.16b + aesd v1.16b,v20.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v20.16b + aesimc v18.16b,v18.16b + cmn x2,#0x20 + aesd v1.16b,v21.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v21.16b + aesimc v18.16b,v18.16b + eor v5.16b,v6.16b,v7.16b + aesd v1.16b,v22.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v22.16b + aesimc 
v18.16b,v18.16b + eor v17.16b,v3.16b,v7.16b + aesd v1.16b,v23.16b + aesd v18.16b,v23.16b + b.eq .Lcbc_dec_one + eor v5.16b,v5.16b,v1.16b + eor v17.16b,v17.16b,v18.16b + orr v6.16b,v19.16b,v19.16b + st1 {v5.16b},[x1],#16 + st1 {v17.16b},[x1],#16 + b .Lcbc_done + +.Lcbc_dec_one: + eor v5.16b,v5.16b,v18.16b + orr v6.16b,v19.16b,v19.16b + st1 {v5.16b},[x1],#16 + +.Lcbc_done: + st1 {v6.16b},[x4] +.Lcbc_abort: + ldr x29,[sp],#16 + ret +.size aes_hw_cbc_encrypt,.-aes_hw_cbc_encrypt +.globl aes_hw_ctr32_encrypt_blocks +.hidden aes_hw_ctr32_encrypt_blocks +.type aes_hw_ctr32_encrypt_blocks,%function +.align 5 +aes_hw_ctr32_encrypt_blocks: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + ldr w5,[x3,#240] + + ldr w8, [x4, #12] + ld1 {v0.4s},[x4] + + ld1 {v16.4s,v17.4s},[x3] // load key schedule... + sub w5,w5,#4 + mov x12,#16 + cmp x2,#2 + add x7,x3,x5,lsl#4 // pointer to last 5 round keys + sub w5,w5,#2 + ld1 {v20.4s,v21.4s},[x7],#32 + ld1 {v22.4s,v23.4s},[x7],#32 + ld1 {v7.4s},[x7] + add x7,x3,#32 + mov w6,w5 + csel x12,xzr,x12,lo + + // ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are + // affected by silicon errata #1742098 [0] and #1655431 [1], + // respectively, where the second instruction of an aese/aesmc + // instruction pair may execute twice if an interrupt is taken right + // after the first instruction consumes an input register of which a + // single 32-bit lane has been updated the last time it was modified. + // + // This function uses a counter in one 32-bit lane. The vmov lines + // could write to v1.16b and v18.16b directly, but that trips this bugs. + // We write to v6.16b and copy to the final register as a workaround. 
+ // + // [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice + // [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice +#ifndef __AARCH64EB__ + rev w8, w8 +#endif + add w10, w8, #1 + orr v6.16b,v0.16b,v0.16b + rev w10, w10 + mov v6.s[3],w10 + add w8, w8, #2 + orr v1.16b,v6.16b,v6.16b + b.ls .Lctr32_tail + rev w12, w8 + mov v6.s[3],w12 + sub x2,x2,#3 // bias + orr v18.16b,v6.16b,v6.16b + b .Loop3x_ctr32 + +.align 4 +.Loop3x_ctr32: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v18.16b,v16.16b + aesmc v18.16b,v18.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + aese v18.16b,v17.16b + aesmc v18.16b,v18.16b + ld1 {v17.4s},[x7],#16 + b.gt .Loop3x_ctr32 + + aese v0.16b,v16.16b + aesmc v4.16b,v0.16b + aese v1.16b,v16.16b + aesmc v5.16b,v1.16b + ld1 {v2.16b},[x0],#16 + add w9,w8,#1 + aese v18.16b,v16.16b + aesmc v18.16b,v18.16b + ld1 {v3.16b},[x0],#16 + rev w9,w9 + aese v4.16b,v17.16b + aesmc v4.16b,v4.16b + aese v5.16b,v17.16b + aesmc v5.16b,v5.16b + ld1 {v19.16b},[x0],#16 + mov x7,x3 + aese v18.16b,v17.16b + aesmc v17.16b,v18.16b + aese v4.16b,v20.16b + aesmc v4.16b,v4.16b + aese v5.16b,v20.16b + aesmc v5.16b,v5.16b + eor v2.16b,v2.16b,v7.16b + add w10,w8,#2 + aese v17.16b,v20.16b + aesmc v17.16b,v17.16b + eor v3.16b,v3.16b,v7.16b + add w8,w8,#3 + aese v4.16b,v21.16b + aesmc v4.16b,v4.16b + aese v5.16b,v21.16b + aesmc v5.16b,v5.16b + // Note the logic to update v0.16b, v1.16b, and v1.16b is written to work + // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in + // 32-bit mode. See the comment above. 
+ eor v19.16b,v19.16b,v7.16b + mov v6.s[3], w9 + aese v17.16b,v21.16b + aesmc v17.16b,v17.16b + orr v0.16b,v6.16b,v6.16b + rev w10,w10 + aese v4.16b,v22.16b + aesmc v4.16b,v4.16b + mov v6.s[3], w10 + rev w12,w8 + aese v5.16b,v22.16b + aesmc v5.16b,v5.16b + orr v1.16b,v6.16b,v6.16b + mov v6.s[3], w12 + aese v17.16b,v22.16b + aesmc v17.16b,v17.16b + orr v18.16b,v6.16b,v6.16b + subs x2,x2,#3 + aese v4.16b,v23.16b + aese v5.16b,v23.16b + aese v17.16b,v23.16b + + eor v2.16b,v2.16b,v4.16b + ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + st1 {v2.16b},[x1],#16 + eor v3.16b,v3.16b,v5.16b + mov w6,w5 + st1 {v3.16b},[x1],#16 + eor v19.16b,v19.16b,v17.16b + ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] + st1 {v19.16b},[x1],#16 + b.hs .Loop3x_ctr32 + + adds x2,x2,#3 + b.eq .Lctr32_done + cmp x2,#1 + mov x12,#16 + csel x12,xzr,x12,eq + +.Lctr32_tail: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + ld1 {v17.4s},[x7],#16 + b.gt .Lctr32_tail + + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + ld1 {v2.16b},[x0],x12 + aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + aese v1.16b,v20.16b + aesmc v1.16b,v1.16b + ld1 {v3.16b},[x0] + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + aese v1.16b,v21.16b + aesmc v1.16b,v1.16b + eor v2.16b,v2.16b,v7.16b + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + aese v1.16b,v22.16b + aesmc v1.16b,v1.16b + eor v3.16b,v3.16b,v7.16b + aese v0.16b,v23.16b + aese v1.16b,v23.16b + + cmp x2,#1 + eor v2.16b,v2.16b,v0.16b + eor v3.16b,v3.16b,v1.16b + st1 {v2.16b},[x1],#16 + b.eq .Lctr32_done + st1 {v3.16b},[x1] + +.Lctr32_done: + ldr x29,[sp],#16 + ret +.size aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) 
&& defined(__ELF__) diff --git a/third_party/boringssl/gen/bcm/aesv8-armv8-win.S b/third_party/boringssl/gen/bcm/aesv8-armv8-win.S new file mode 100644 index 00000000..54bd1c51 --- /dev/null +++ b/third_party/boringssl/gen/bcm/aesv8-armv8-win.S @@ -0,0 +1,796 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +#if __ARM_MAX_ARCH__>=7 +.text +.arch armv8-a+crypto +.section .rodata +.align 5 +Lrcon: +.long 0x01,0x01,0x01,0x01 +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat +.long 0x1b,0x1b,0x1b,0x1b + +.text + +.globl aes_hw_set_encrypt_key + +.def aes_hw_set_encrypt_key + .type 32 +.endef +.align 5 +aes_hw_set_encrypt_key: +Lenc_key: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + mov x3,#-2 + cmp w1,#128 + b.lt Lenc_key_abort + cmp w1,#256 + b.gt Lenc_key_abort + tst w1,#0x3f + b.ne Lenc_key_abort + + adrp x3,Lrcon + add x3,x3,:lo12:Lrcon + cmp w1,#192 + + eor v0.16b,v0.16b,v0.16b + ld1 {v3.16b},[x0],#16 + mov w1,#8 // reuse w1 + ld1 {v1.4s,v2.4s},[x3],#32 + + b.lt Loop128 + b.eq L192 + b L256 + +.align 4 +Loop128: + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + b.ne Loop128 + + ld1 {v1.4s},[x3] + + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + 
eor v3.16b,v3.16b,v6.16b + + tbl v6.16b,{v3.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v3.4s},[x2],#16 + aese v6.16b,v0.16b + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + eor v3.16b,v3.16b,v6.16b + st1 {v3.4s},[x2] + add x2,x2,#0x50 + + mov w12,#10 + b Ldone + +.align 4 +L192: + ld1 {v4.8b},[x0],#8 + movi v6.16b,#8 // borrow v6.16b + st1 {v3.4s},[x2],#16 + sub v2.16b,v2.16b,v6.16b // adjust the mask + +Loop192: + tbl v6.16b,{v4.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v4.8b},[x2],#8 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + + dup v5.4s,v3.s[3] + eor v5.16b,v5.16b,v4.16b + eor v6.16b,v6.16b,v1.16b + ext v4.16b,v0.16b,v4.16b,#12 + shl v1.16b,v1.16b,#1 + eor v4.16b,v4.16b,v5.16b + eor v3.16b,v3.16b,v6.16b + eor v4.16b,v4.16b,v6.16b + st1 {v3.4s},[x2],#16 + b.ne Loop192 + + mov w12,#12 + add x2,x2,#0x20 + b Ldone + +.align 4 +L256: + ld1 {v4.16b},[x0] + mov w1,#7 + mov w12,#14 + st1 {v3.4s},[x2],#16 + +Loop256: + tbl v6.16b,{v4.16b},v2.16b + ext v5.16b,v0.16b,v3.16b,#12 + st1 {v4.4s},[x2],#16 + aese v6.16b,v0.16b + subs w1,w1,#1 + + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v3.16b,v3.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v6.16b,v6.16b,v1.16b + eor v3.16b,v3.16b,v5.16b + shl v1.16b,v1.16b,#1 + eor v3.16b,v3.16b,v6.16b + st1 {v3.4s},[x2],#16 + b.eq Ldone + + dup v6.4s,v3.s[3] // just splat + ext v5.16b,v0.16b,v4.16b,#12 + aese v6.16b,v0.16b + + eor v4.16b,v4.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v4.16b,v4.16b,v5.16b + ext v5.16b,v0.16b,v5.16b,#12 + eor v4.16b,v4.16b,v5.16b + + eor v4.16b,v4.16b,v6.16b + b Loop256 + +Ldone: + str w12,[x2] + mov x3,#0 + +Lenc_key_abort: + mov x0,x3 // return value + ldr x29,[sp],#16 + ret + + +.globl 
aes_hw_set_decrypt_key + +.def aes_hw_set_decrypt_key + .type 32 +.endef +.align 5 +aes_hw_set_decrypt_key: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + bl Lenc_key + + cmp x0,#0 + b.ne Ldec_key_abort + + sub x2,x2,#240 // restore original x2 + mov x4,#-16 + add x0,x2,x12,lsl#4 // end of key schedule + + ld1 {v0.4s},[x2] + ld1 {v1.4s},[x0] + st1 {v0.4s},[x0],x4 + st1 {v1.4s},[x2],#16 + +Loop_imc: + ld1 {v0.4s},[x2] + ld1 {v1.4s},[x0] + aesimc v0.16b,v0.16b + aesimc v1.16b,v1.16b + st1 {v0.4s},[x0],x4 + st1 {v1.4s},[x2],#16 + cmp x0,x2 + b.hi Loop_imc + + ld1 {v0.4s},[x2] + aesimc v0.16b,v0.16b + st1 {v0.4s},[x0] + + eor x0,x0,x0 // return value +Ldec_key_abort: + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl aes_hw_encrypt + +.def aes_hw_encrypt + .type 32 +.endef +.align 5 +aes_hw_encrypt: + AARCH64_VALID_CALL_TARGET + ldr w3,[x2,#240] + ld1 {v0.4s},[x2],#16 + ld1 {v2.16b},[x0] + sub w3,w3,#2 + ld1 {v1.4s},[x2],#16 + +Loop_enc: + aese v2.16b,v0.16b + aesmc v2.16b,v2.16b + ld1 {v0.4s},[x2],#16 + subs w3,w3,#2 + aese v2.16b,v1.16b + aesmc v2.16b,v2.16b + ld1 {v1.4s},[x2],#16 + b.gt Loop_enc + + aese v2.16b,v0.16b + aesmc v2.16b,v2.16b + ld1 {v0.4s},[x2] + aese v2.16b,v1.16b + eor v2.16b,v2.16b,v0.16b + + st1 {v2.16b},[x1] + ret + +.globl aes_hw_decrypt + +.def aes_hw_decrypt + .type 32 +.endef +.align 5 +aes_hw_decrypt: + AARCH64_VALID_CALL_TARGET + ldr w3,[x2,#240] + ld1 {v0.4s},[x2],#16 + ld1 {v2.16b},[x0] + sub w3,w3,#2 + ld1 {v1.4s},[x2],#16 + +Loop_dec: + aesd v2.16b,v0.16b + aesimc v2.16b,v2.16b + ld1 {v0.4s},[x2],#16 + subs w3,w3,#2 + aesd v2.16b,v1.16b + aesimc v2.16b,v2.16b + ld1 {v1.4s},[x2],#16 + b.gt Loop_dec + + aesd v2.16b,v0.16b + aesimc v2.16b,v2.16b + ld1 {v0.4s},[x2] + aesd v2.16b,v1.16b + eor v2.16b,v2.16b,v0.16b + + st1 {v2.16b},[x1] + ret + +.globl aes_hw_cbc_encrypt + +.def aes_hw_cbc_encrypt + .type 32 +.endef +.align 5 +aes_hw_cbc_encrypt: + // Armv8.3-A PAuth: even though x30 is pushed to 
stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + subs x2,x2,#16 + mov x8,#16 + b.lo Lcbc_abort + csel x8,xzr,x8,eq + + cmp w5,#0 // en- or decrypting? + ldr w5,[x3,#240] + and x2,x2,#-16 + ld1 {v6.16b},[x4] + ld1 {v0.16b},[x0],x8 + + ld1 {v16.4s,v17.4s},[x3] // load key schedule... + sub w5,w5,#6 + add x7,x3,x5,lsl#4 // pointer to last 7 round keys + sub w5,w5,#2 + ld1 {v18.4s,v19.4s},[x7],#32 + ld1 {v20.4s,v21.4s},[x7],#32 + ld1 {v22.4s,v23.4s},[x7],#32 + ld1 {v7.4s},[x7] + + add x7,x3,#32 + mov w6,w5 + b.eq Lcbc_dec + + cmp w5,#2 + eor v0.16b,v0.16b,v6.16b + eor v5.16b,v16.16b,v7.16b + b.eq Lcbc_enc128 + + ld1 {v2.4s,v3.4s},[x7] + add x7,x3,#16 + add x6,x3,#16*4 + add x12,x3,#16*5 + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + add x14,x3,#16*6 + add x3,x3,#16*7 + b Lenter_cbc_enc + +.align 4 +Loop_cbc_enc: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + st1 {v6.16b},[x1],#16 +Lenter_cbc_enc: + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v0.16b,v2.16b + aesmc v0.16b,v0.16b + ld1 {v16.4s},[x6] + cmp w5,#4 + aese v0.16b,v3.16b + aesmc v0.16b,v0.16b + ld1 {v17.4s},[x12] + b.eq Lcbc_enc192 + + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + ld1 {v16.4s},[x14] + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + ld1 {v17.4s},[x3] + nop + +Lcbc_enc192: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + subs x2,x2,#16 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + csel x8,xzr,x8,eq + aese v0.16b,v18.16b + aesmc v0.16b,v0.16b + aese v0.16b,v19.16b + aesmc v0.16b,v0.16b + ld1 {v16.16b},[x0],x8 + aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + eor v16.16b,v16.16b,v5.16b + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + ld1 {v17.4s},[x7] // re-pre-load rndkey[1] + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + aese v0.16b,v23.16b + eor v6.16b,v0.16b,v7.16b + b.hs Loop_cbc_enc + + st1 {v6.16b},[x1],#16 + b Lcbc_done + +.align 5 +Lcbc_enc128: + ld1 {v2.4s,v3.4s},[x7] + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + b Lenter_cbc_enc128 +Loop_cbc_enc128: + 
aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + st1 {v6.16b},[x1],#16 +Lenter_cbc_enc128: + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + subs x2,x2,#16 + aese v0.16b,v2.16b + aesmc v0.16b,v0.16b + csel x8,xzr,x8,eq + aese v0.16b,v3.16b + aesmc v0.16b,v0.16b + aese v0.16b,v18.16b + aesmc v0.16b,v0.16b + aese v0.16b,v19.16b + aesmc v0.16b,v0.16b + ld1 {v16.16b},[x0],x8 + aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + eor v16.16b,v16.16b,v5.16b + aese v0.16b,v23.16b + eor v6.16b,v0.16b,v7.16b + b.hs Loop_cbc_enc128 + + st1 {v6.16b},[x1],#16 + b Lcbc_done +.align 5 +Lcbc_dec: + ld1 {v18.16b},[x0],#16 + subs x2,x2,#32 // bias + add w6,w5,#2 + orr v3.16b,v0.16b,v0.16b + orr v1.16b,v0.16b,v0.16b + orr v19.16b,v18.16b,v18.16b + b.lo Lcbc_dec_tail + + orr v1.16b,v18.16b,v18.16b + ld1 {v18.16b},[x0],#16 + orr v2.16b,v0.16b,v0.16b + orr v3.16b,v1.16b,v1.16b + orr v19.16b,v18.16b,v18.16b + +Loop3x_cbc_dec: + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b + aesimc v18.16b,v18.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aesd v0.16b,v17.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v17.16b + aesimc v18.16b,v18.16b + ld1 {v17.4s},[x7],#16 + b.gt Loop3x_cbc_dec + + aesd v0.16b,v16.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b + aesimc v18.16b,v18.16b + eor v4.16b,v6.16b,v7.16b + subs x2,x2,#0x30 + eor v5.16b,v2.16b,v7.16b + csel x6,x2,x6,lo // x6, w6, is zero at this point + aesd v0.16b,v17.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v17.16b + aesimc v18.16b,v18.16b + eor v17.16b,v3.16b,v7.16b + add x0,x0,x6 // x0 is adjusted in such way that + // at exit from the loop v1.16b-v18.16b + // are loaded with last "words" + orr v6.16b,v19.16b,v19.16b + mov x7,x3 + aesd v0.16b,v20.16b + aesimc v0.16b,v0.16b + aesd 
v1.16b,v20.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v20.16b + aesimc v18.16b,v18.16b + ld1 {v2.16b},[x0],#16 + aesd v0.16b,v21.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v21.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v21.16b + aesimc v18.16b,v18.16b + ld1 {v3.16b},[x0],#16 + aesd v0.16b,v22.16b + aesimc v0.16b,v0.16b + aesd v1.16b,v22.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v22.16b + aesimc v18.16b,v18.16b + ld1 {v19.16b},[x0],#16 + aesd v0.16b,v23.16b + aesd v1.16b,v23.16b + aesd v18.16b,v23.16b + ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + add w6,w5,#2 + eor v4.16b,v4.16b,v0.16b + eor v5.16b,v5.16b,v1.16b + eor v18.16b,v18.16b,v17.16b + ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] + st1 {v4.16b},[x1],#16 + orr v0.16b,v2.16b,v2.16b + st1 {v5.16b},[x1],#16 + orr v1.16b,v3.16b,v3.16b + st1 {v18.16b},[x1],#16 + orr v18.16b,v19.16b,v19.16b + b.hs Loop3x_cbc_dec + + cmn x2,#0x30 + b.eq Lcbc_done + nop + +Lcbc_dec_tail: + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b + aesimc v18.16b,v18.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v17.16b + aesimc v18.16b,v18.16b + ld1 {v17.4s},[x7],#16 + b.gt Lcbc_dec_tail + + aesd v1.16b,v16.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v16.16b + aesimc v18.16b,v18.16b + aesd v1.16b,v17.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v17.16b + aesimc v18.16b,v18.16b + aesd v1.16b,v20.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v20.16b + aesimc v18.16b,v18.16b + cmn x2,#0x20 + aesd v1.16b,v21.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v21.16b + aesimc v18.16b,v18.16b + eor v5.16b,v6.16b,v7.16b + aesd v1.16b,v22.16b + aesimc v1.16b,v1.16b + aesd v18.16b,v22.16b + aesimc v18.16b,v18.16b + eor v17.16b,v3.16b,v7.16b + aesd v1.16b,v23.16b + aesd v18.16b,v23.16b + b.eq Lcbc_dec_one + eor v5.16b,v5.16b,v1.16b + eor v17.16b,v17.16b,v18.16b + orr v6.16b,v19.16b,v19.16b + st1 {v5.16b},[x1],#16 + st1 {v17.16b},[x1],#16 + b Lcbc_done + +Lcbc_dec_one: + eor 
v5.16b,v5.16b,v18.16b + orr v6.16b,v19.16b,v19.16b + st1 {v5.16b},[x1],#16 + +Lcbc_done: + st1 {v6.16b},[x4] +Lcbc_abort: + ldr x29,[sp],#16 + ret + +.globl aes_hw_ctr32_encrypt_blocks + +.def aes_hw_ctr32_encrypt_blocks + .type 32 +.endef +.align 5 +aes_hw_ctr32_encrypt_blocks: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + ldr w5,[x3,#240] + + ldr w8, [x4, #12] + ld1 {v0.4s},[x4] + + ld1 {v16.4s,v17.4s},[x3] // load key schedule... + sub w5,w5,#4 + mov x12,#16 + cmp x2,#2 + add x7,x3,x5,lsl#4 // pointer to last 5 round keys + sub w5,w5,#2 + ld1 {v20.4s,v21.4s},[x7],#32 + ld1 {v22.4s,v23.4s},[x7],#32 + ld1 {v7.4s},[x7] + add x7,x3,#32 + mov w6,w5 + csel x12,xzr,x12,lo + + // ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are + // affected by silicon errata #1742098 [0] and #1655431 [1], + // respectively, where the second instruction of an aese/aesmc + // instruction pair may execute twice if an interrupt is taken right + // after the first instruction consumes an input register of which a + // single 32-bit lane has been updated the last time it was modified. + // + // This function uses a counter in one 32-bit lane. The vmov lines + // could write to v1.16b and v18.16b directly, but that trips this bugs. + // We write to v6.16b and copy to the final register as a workaround. 
+ // + // [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice + // [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice +#ifndef __AARCH64EB__ + rev w8, w8 +#endif + add w10, w8, #1 + orr v6.16b,v0.16b,v0.16b + rev w10, w10 + mov v6.s[3],w10 + add w8, w8, #2 + orr v1.16b,v6.16b,v6.16b + b.ls Lctr32_tail + rev w12, w8 + mov v6.s[3],w12 + sub x2,x2,#3 // bias + orr v18.16b,v6.16b,v6.16b + b Loop3x_ctr32 + +.align 4 +Loop3x_ctr32: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v18.16b,v16.16b + aesmc v18.16b,v18.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + aese v18.16b,v17.16b + aesmc v18.16b,v18.16b + ld1 {v17.4s},[x7],#16 + b.gt Loop3x_ctr32 + + aese v0.16b,v16.16b + aesmc v4.16b,v0.16b + aese v1.16b,v16.16b + aesmc v5.16b,v1.16b + ld1 {v2.16b},[x0],#16 + add w9,w8,#1 + aese v18.16b,v16.16b + aesmc v18.16b,v18.16b + ld1 {v3.16b},[x0],#16 + rev w9,w9 + aese v4.16b,v17.16b + aesmc v4.16b,v4.16b + aese v5.16b,v17.16b + aesmc v5.16b,v5.16b + ld1 {v19.16b},[x0],#16 + mov x7,x3 + aese v18.16b,v17.16b + aesmc v17.16b,v18.16b + aese v4.16b,v20.16b + aesmc v4.16b,v4.16b + aese v5.16b,v20.16b + aesmc v5.16b,v5.16b + eor v2.16b,v2.16b,v7.16b + add w10,w8,#2 + aese v17.16b,v20.16b + aesmc v17.16b,v17.16b + eor v3.16b,v3.16b,v7.16b + add w8,w8,#3 + aese v4.16b,v21.16b + aesmc v4.16b,v4.16b + aese v5.16b,v21.16b + aesmc v5.16b,v5.16b + // Note the logic to update v0.16b, v1.16b, and v1.16b is written to work + // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in + // 32-bit mode. See the comment above. 
+ eor v19.16b,v19.16b,v7.16b + mov v6.s[3], w9 + aese v17.16b,v21.16b + aesmc v17.16b,v17.16b + orr v0.16b,v6.16b,v6.16b + rev w10,w10 + aese v4.16b,v22.16b + aesmc v4.16b,v4.16b + mov v6.s[3], w10 + rev w12,w8 + aese v5.16b,v22.16b + aesmc v5.16b,v5.16b + orr v1.16b,v6.16b,v6.16b + mov v6.s[3], w12 + aese v17.16b,v22.16b + aesmc v17.16b,v17.16b + orr v18.16b,v6.16b,v6.16b + subs x2,x2,#3 + aese v4.16b,v23.16b + aese v5.16b,v23.16b + aese v17.16b,v23.16b + + eor v2.16b,v2.16b,v4.16b + ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] + st1 {v2.16b},[x1],#16 + eor v3.16b,v3.16b,v5.16b + mov w6,w5 + st1 {v3.16b},[x1],#16 + eor v19.16b,v19.16b,v17.16b + ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] + st1 {v19.16b},[x1],#16 + b.hs Loop3x_ctr32 + + adds x2,x2,#3 + b.eq Lctr32_done + cmp x2,#1 + mov x12,#16 + csel x12,xzr,x12,eq + +Lctr32_tail: + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + ld1 {v16.4s},[x7],#16 + subs w6,w6,#2 + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + ld1 {v17.4s},[x7],#16 + b.gt Lctr32_tail + + aese v0.16b,v16.16b + aesmc v0.16b,v0.16b + aese v1.16b,v16.16b + aesmc v1.16b,v1.16b + aese v0.16b,v17.16b + aesmc v0.16b,v0.16b + aese v1.16b,v17.16b + aesmc v1.16b,v1.16b + ld1 {v2.16b},[x0],x12 + aese v0.16b,v20.16b + aesmc v0.16b,v0.16b + aese v1.16b,v20.16b + aesmc v1.16b,v1.16b + ld1 {v3.16b},[x0] + aese v0.16b,v21.16b + aesmc v0.16b,v0.16b + aese v1.16b,v21.16b + aesmc v1.16b,v1.16b + eor v2.16b,v2.16b,v7.16b + aese v0.16b,v22.16b + aesmc v0.16b,v0.16b + aese v1.16b,v22.16b + aesmc v1.16b,v1.16b + eor v3.16b,v3.16b,v7.16b + aese v0.16b,v23.16b + aese v1.16b,v23.16b + + cmp x2,#1 + eor v2.16b,v2.16b,v0.16b + eor v3.16b,v3.16b,v1.16b + st1 {v2.16b},[x1],#16 + b.eq Lctr32_done + st1 {v3.16b},[x1] + +Lctr32_done: + ldr x29,[sp],#16 + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) diff --git 
a/third_party/boringssl/gen/bcm/aesv8-gcm-armv8-apple.S b/third_party/boringssl/gen/bcm/aesv8-gcm-armv8-apple.S new file mode 100644 index 00000000..6a76daab --- /dev/null +++ b/third_party/boringssl/gen/bcm/aesv8-gcm-armv8-apple.S @@ -0,0 +1,2909 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) +#if __ARM_MAX_ARCH__ >= 8 +.arch_extension crypto +.arch_extension sha3 + +.text +.globl _aes_gcm_enc_kernel +.private_extern _aes_gcm_enc_kernel + +.align 4 +_aes_gcm_enc_kernel: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-224]! + mov x29, sp + ld1 { v0.16b}, [x4] // Load initial counter block + stp x19, x20, [sp, #16] + mov v1.16b, v0.16b // Initialize ctr1-3 from ctr0 + mov v2.16b, v0.16b + mov v3.16b, v0.16b + mov x16, x4 // Pointer to counter block in memory + mov x8, x5 // Pointer to AES key schedule context + stp x21, x22, [sp, #32] + // [sp, #48] is unused but allocated to align the stack layout with aes_gcm_dec_kernel + stp d8, d9, [sp, #64] // Save Neon registers + stp d10, d11, [sp, #80] + stp d12, d13, [sp, #96] + stp d14, d15, [sp, #112] + ldr w17, [x8, #240] // Load number of AES rounds + add x7, x8, x17, lsl #4 // Calculate pointer to the last round key + ldp x13, x14, [x7] // load round N key (for final XOR) + ldr q31, [x7, #-16] // load round N-1 key + add x4, x0, x1, lsr #3 // Calculate end of input + lsr x5, x1, #3 // Total byte length + mov x15, x5 + ldr w12, [x16, #12] // Load counter's low 32 bits + sub x5, x5, #1 // byte_len - 1 + ldr q18, [x8, #0] // load rk0 + and x5, x5, #0xffffffffffffffc0 // Align main loop end to a multiple of 64 bytes + add x5, x5, x0 + rev w12, w12 // Reverse for big-endian increment + uxtw x10, w12 // Zero extend reversed w12 into x10 for final counter update + // Pre-compute this value instead of using two instructions to reconstruct it every iteration + 
mov x21, #0xc200000000000000 // GHASH reduction constant + str x21, [sp, #128] + // We maintain four copies of ctr values on the stack. Each loop iteration we + // store the updated ctr value to the last four bytes (e.g., 160 + 12). + // We then load the four values. This avoids a singificant number of + // expensive GPR->NEON and NEON->NEON moves. To avoid LDST forwarding we + // calculate and store the values one iteration ahead so they have time to + // drain before we load them. + str q0, [sp, #160] // Store base counter for block 0-3 + str q0, [sp, #176] + str q0, [sp, #192] + str q0, [sp, #208] + // Since we need the values right away don't go through the stack this first + // time. Manually insert the incremented big-endian counter values. + rev w20, w12 + mov v0.s[3], w20 // ctr0 + 0 + add w20, w12, #1 + rev w20, w20 + mov v1.s[3], w20 // ctr0 + 1 + add w20, w12, #2 + rev w20, w20 + mov v2.s[3], w20 // ctr0 + 2 + add w20, w12, #3 + rev w20, w20 + mov v3.s[3], w20 // ctr0 + 3 + // Calculate the ctr values for the *next* (not current) group of four + // blocks. Store the incremented parts to the stack. 
+ add w20, w12, #4 + rev w20, w20 + str w20, [sp, #172] // ctr0 + 4 for next iter + add w20, w12, #5 + rev w20, w20 + str w20, [sp, #188] // ctr0 + 5 for next iter + add w20, w12, #6 + rev w20, w20 + str w20, [sp, #204] // ctr0 + 6 for next iter + add w20, w12, #7 + rev w20, w20 + str w20, [sp, #220] // ctr0 + 7 for next iter + add w12, w12, #8 // Advance counter past these two sets + // --- Start AES for first 4 blocks --- + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + ldp q19, q20, [x8, #16] // load rk1, rk2 + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + ldp q21, q22, [x8, #48] // load rk3, rk4 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + ldp q23, q24, [x8, #80] // load rk5, rk6 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + ldp q13, q14, [x6, #32] // load H2, H3 (GHASH keys) + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + ldp q25, q26, [x8, #112] // load rk7, rk8 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + ext v14.16b, v14.16b, v14.16b, #8 // Byte swap H3 for GHASH + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + ext v13.16b, v13.16b, v13.16b, #8 // Byte swap H2 for GHASH + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + ldr q15, [x6, #80] // load H4 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + ext v15.16b, v15.16b, v15.16b, #8 // Byte swap H4 for GHASH + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + ld1 { v11.16b}, [x3] // Load initial GHASH accumulator (T) + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + ext v11.16b, v11.16b, v11.16b, #8 // Byte swap T for GHASH + aese 
v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + rev64 v11.16b, v11.16b // Correct byte order within 64-bit lanes + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + trn2 v17.2d, v14.2d, v15.2d // Karatsuba key: H4_low | H3_low + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + ldr q12, [x6] // load H1 + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + ext v12.16b, v12.16b, v12.16b, #8 // Byte swap H1 for GHASH + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 2 - round 4 + trn1 v9.2d, v14.2d, v15.2d // Karatsuba key: H4_high | H3_high + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + trn2 v16.2d, v12.2d, v13.2d // Karatsuba key: H2_low | H1_low + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + ldr q30, [x7] // Preload round N key for final EOR + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 - round 5 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 0 - round 6 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + cmp x17, #12 // setup flags for 
AES-128/192/256 check + b.lt Lenc_finish_first_blocks // branch if AES-128 + ldp q27, q28, [x8, #144] // load rk9, rk10 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + b.eq Lenc_finish_first_blocks // branch if AES-192 + ldp q27, q28, [x8, #176] // load rk11, rk12 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 12 +Lenc_finish_first_blocks: + cmp x0, x5 // check if we have <= 4 blocks to process in the tail + eor v17.16b, v17.16b, v9.16b // Karatsuba key: H3^H4 + aese v0.16b, v31.16b // AES block 0 - round N-1 + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v2.16b, v31.16b // AES block 2 - round N-1 + aese v3.16b, v31.16b // AES block 3 - round N-1 + trn1 v8.2d, v12.2d, v13.2d // Karatsuba key: H2_high | H1_high + eor v16.16b, v16.16b, v8.16b // Karatsuba key: H1^H2 + b.ge Lenc_tail // handle tail if no more full 4-block sets + ldp q6, q7, [x0, #32] // AES blocks 2,3 load plaintext + ldp q4, q5, [x0], #64 // AES blocks 0,1 
load plaintext + // Compute and store first 4 ciphertext blocks + eor v4.16b, v4.16b, v30.16b + eor v4.16b, v4.16b, v0.16b // AES block 0 - result = PT ^ AES(ctr0) + eor v5.16b, v5.16b, v30.16b + eor v5.16b, v5.16b, v1.16b // AES block 1 - result = PT ^ AES(ctr1) + eor v6.16b, v6.16b, v30.16b + eor v6.16b, v6.16b, v2.16b // AES block 2 - result = PT ^ AES(ctr2) + eor v7.16b, v7.16b, v30.16b + eor v7.16b, v7.16b, v3.16b // AES block 3 - result = PT ^ AES(ctr3) + st1 { v4.16b, v5.16b, v6.16b, v7.16b}, [x2], #64 // AES blocks 0-3 - store result + // Load counter values for the second iteration from the stack + ldp q0, q1, [sp, #160] + ldp q2, q3, [sp, #192] + // Prepare and store counter values for the third iteration + rev w20, w12 + str w20, [sp, #172] // ctr + 8 + add w20, w12, #1 + rev w20, w20 + str w20, [sp, #188] // ctr + 9 + add w20, w12, #2 + rev w20, w20 + str w20, [sp, #204] // ctr + 10 + add w20, w12, #3 + rev w20, w20 + str w20, [sp, #220] // ctr + 11 + add w12, w12, #4 // Advance counter base + cmp x0, x5 // check if we have <= 4 blocks remaining + b.ge Lenc_prepretail // go to prepretail if < 2 full loops left +Lenc_main_loop: // main loop start (processes 4 blocks per iteration) + // --- AES Pipeline for blocks 4k+4 to 4k+7 --- + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + ldr d8, [sp, #128] // Load GHASH reduction constant + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + // --- GHASH Pipeline (interleaved with AES) for blocks 4k to 4k+3 --- + rev64 v4.16b, 
v4.16b // GHASH block 4k - Byte swap CT + rev64 v5.16b, v5.16b // GHASH block 4k+1 - Byte swap CT + rev64 v6.16b, v6.16b // GHASH block 4k+2 - Byte swap CT + rev64 v7.16b, v7.16b // GHASH block 4k+3 - Byte swap CT + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + ext v11.16b, v11.16b, v11.16b, #8 // GHASH - prepare acc for XOR + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v4.16b, v4.16b, v11.16b // GHASH block 4k - Y_i = CT_i ^ Y_{i-1} + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + pmull2 v9.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + mov d10, v17.d[1] // GHASH block 4k - mid Karatsuba key + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + mov d20, v4.d[1] // GHASH block 4k - mid + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + eor v20.8b, v20.8b, v4.8b // GHASH block 4k - mid + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + mov d21, v5.d[1] // GHASH block 4k+1 - mid + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + eor v21.8b, v21.8b, v5.8b // GHASH block 4k+1 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + pmull v10.1q, v20.1d, v10.1d // GHASH block 4k - mid + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + pmull v21.1q, v21.1d, v17.1d // GHASH block 4k+1 - mid + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + eor v10.16b, v10.16b, v21.16b // GHASH block 4k+1 - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + ext v22.16b, v22.16b, v6.16b, #8 // GHASH block 4k+2 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + eor v22.16b, 
v22.16b, v6.16b // GHASH block 4k+2 - mid + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + pmull2 v22.1q, v22.2d, v16.2d // GHASH block 4k+2 - mid + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + mov d23, v7.d[1] // GHASH block 4k+3 - mid + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + eor v23.8b, v23.8b, v7.8b // GHASH block 4k+3 - mid + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + pmull v23.1q, v23.1d, v16.1d // GHASH block 4k+3 - mid + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + eor v10.16b, v10.16b, v22.16b + eor v10.16b, v10.16b, v23.16b // GHASH block 4k+2/3 - mid + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + pmull2 v22.1q, v4.2d, v15.2d // GHASH block 4k - high + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + pmull2 v21.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + eor v22.16b, v22.16b, v21.16b // GHASH block 4k+3 - high + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + pmull2 v23.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + pmull v21.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + pmull v20.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + eor v9.16b, v9.16b, v22.16b + eor v9.16b, v9.16b, v23.16b // GHASH block 4k/1/2/3 - high + pmull v22.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + ldp q6, q7, [x0, #32] + ldp q4, q5, [x0], #64 + eor v20.16b, v20.16b, v21.16b // GHASH block 4k+1 - low + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + eor v11.16b, v11.16b, v22.16b + eor v11.16b, v11.16b, v20.16b // GHASH block 4k/1/2/3 - low + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + eor v10.16b, v10.16b, v9.16b + eor v10.16b, v10.16b, v11.16b // MODULO 
- karatsuba tidy up + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + pmull v20.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + cmp x17, #12 // setup flags for AES-128/192/256 check + b.lt Lenc_main_loop_continue // branch if AES-128 + ldp q27, q28, [x8, #144] // load rk9, rk10 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + b.eq Lenc_main_loop_continue // branch if AES-192 + ldp q27, q28, [x8, #176] // load rk11, rk12 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 +Lenc_main_loop_continue: + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v20.16b + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v20.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + eor 
v11.16b, v9.16b, v11.16b + eor v11.16b, v11.16b, v20.16b // MODULO - fold into low + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + eor v4.16b, v4.16b, v30.16b + eor v4.16b, v4.16b, v0.16b // AES block 4k+4 - result + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + eor v5.16b, v5.16b, v30.16b + eor v5.16b, v5.16b, v1.16b // AES block 4k+5 - result + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + eor v6.16b, v6.16b, v30.16b + eor v6.16b, v6.16b, v2.16b // AES block 4k+6 - result + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + eor v7.16b, v7.16b, v30.16b + eor v7.16b, v7.16b, v3.16b // AES block 4k+7 - result + ldp q0, q1, [sp, #160] + ldp q2, q3, [sp, #192] + // We used these registers as temporaries above so reload the RKs. + ldp q20, q21, [x8, #32] // load rk2, rk3 + ldp q22, q23, [x8, #64] // load rk4, rk5 + st1 { v4.16b, v5.16b, v6.16b, v7.16b}, [x2], #64 // AES blocks 4k+4-7 - store result + rev w20, w12 + str w20, [sp, #172] + add w20, w12, #1 + rev w20, w20 + str w20, [sp, #188] + add w20, w12, #2 + rev w20, w20 + str w20, [sp, #204] + add w20, w12, #3 + rev w20, w20 + str w20, [sp, #220] + add w12, w12, #4 + cmp x0, x5 // LOOP CONTROL + b.lt Lenc_main_loop +Lenc_prepretail: // PREPRETAIL + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + rev64 v6.16b, v6.16b // GHASH block 2 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + rev64 v4.16b, v4.16b // GHASH block 0 + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev64 v5.16b, v5.16b // GHASH block 1 + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + mov d10, v17.d[1] // GHASH block 0 - mid Karatsuba key + 
aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + pmull v11.1q, v4.1d, v15.1d // GHASH block 0 - low + mov d8, v4.d[1] // GHASH block 0 - mid + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 0 - high + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + eor v8.8b, v8.8b, v4.8b // GHASH block 0 - mid + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + pmull v10.1q, v8.1d, v10.1d // GHASH block 0 - mid + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 1 - high + pmull v8.1q, v5.1d, v14.1d // GHASH block 1 - low + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + eor v9.16b, v9.16b, v4.16b // GHASH block 1 - high + mov d4, v5.d[1] // GHASH block 1 - mid + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + eor v11.16b, v11.16b, v8.16b // GHASH block 1 - low + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + eor v4.8b, v4.8b, v5.8b // GHASH block 1 - mid + mov d8, v6.d[1] // GHASH block 2 - mid + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + rev64 v7.16b, v7.16b // GHASH block 3 + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + pmull v4.1q, v4.1d, v17.1d // GHASH block 1 - mid + eor v8.8b, v8.8b, v6.8b // GHASH block 2 - mid + pmull v5.1q, v6.1d, v13.1d // GHASH block 2 - low + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 - round 5 + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 2 - round 4 + eor v10.16b, v10.16b, v4.16b // GHASH block 1 - mid + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 2 - high + eor v11.16b, v11.16b, v5.16b // GHASH block 2 - low + ins v8.d[1], v8.d[0] // GHASH block 2 - mid + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + eor 
v9.16b, v9.16b, v4.16b // GHASH block 2 - high + mov d4, v7.d[1] // GHASH block 3 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 2 - mid + eor v4.8b, v4.8b, v7.8b // GHASH block 3 - mid + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 3 - high + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + pmull v4.1q, v4.1d, v16.1d // GHASH block 3 - mid + eor v10.16b, v10.16b, v8.16b // GHASH block 2 - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 0 - round 6 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + eor v9.16b, v9.16b, v5.16b // GHASH block 3 - high + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + ldr d8, [sp, #128] + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + eor v10.16b, v10.16b, v4.16b // GHASH block 3 - mid + pmull v6.1q, v7.1d, v12.1d // GHASH block 3 - low + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + cmp x17, #12 // setup flags for AES-128/192/256 check + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + eor v11.16b, v11.16b, v6.16b // GHASH block 3 - low + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + eor v10.16b, v10.16b, v9.16b // karatsuba tidy up + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + pmull v4.1q, v9.1d, v8.1d + ext v9.16b, v9.16b, v9.16b, #8 + eor v10.16b, v10.16b, v11.16b + b.lt Lenc_finish_prepretail // branch if AES-128 + ldp q27, q28, [x8, #144] // load rk9, rk10 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES 
block 0 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + b.eq Lenc_finish_prepretail // branch if AES-192 + ldp q27, q28, [x8, #176] // load rk11, rk12 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 12 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 +Lenc_finish_prepretail: + aese v0.16b, v31.16b // AES block 0 - round N-1 + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v2.16b, v31.16b // AES block 2 - round N-1 + aese v3.16b, v31.16b // AES block 3 - round N-1 + eor v10.16b, v10.16b, v4.16b + eor v10.16b, v10.16b, v9.16b + pmull v4.1q, v10.1d, v8.1d + ext v10.16b, v10.16b, v10.16b, #8 + eor v11.16b, v11.16b, v4.16b + eor v11.16b, v11.16b, v10.16b +Lenc_tail: // TAIL: Process remaining 0 to 3 blocks + ext v8.16b, v11.16b, v11.16b, #8 // Save current GHASH state for partial tag feed-in + sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process + ldp x6, x7, [x0], #16 // AES block 0 - load plaintext + eor x6, x6, x13 // AES block 0 - round N low + eor x7, x7, x14 // AES block 0 - round N high + cmp x5, #48 + fmov d4, x6 
// AES block 0 - mov low + fmov v4.d[1], x7 // AES block 0 - mov high + eor v5.16b, v4.16b, v0.16b // AES block 0 - result + b.gt Lenc_blocks_more_than_3 + cmp x5, #32 + mov v3.16b, v2.16b + movi v11.8b, #0 + movi v9.8b, #0 + mov v2.16b, v1.16b + movi v10.8b, #0 + b.gt Lenc_blocks_more_than_2 + mov v3.16b, v1.16b + cmp x5, #16 + b.gt Lenc_blocks_more_than_1 + b Lenc_blocks_less_than_1 +Lenc_blocks_more_than_3: // blocks left > 3 + st1 { v5.16b}, [x2], #16 // AES final-2 block - store result + ldp x6, x7, [x0], #16 // AES final-2 block - load input low & high + rev64 v4.16b, v5.16b // GHASH final-3 block + eor x6, x6, x13 // AES final-2 block - round N low + eor v4.16b, v4.16b, v8.16b // feed in partial tag + eor x7, x7, x14 // AES final-2 block - round N high + mov d22, v4.d[1] // GHASH final-3 block - mid + fmov d5, x6 // AES final-2 block - mov low + fmov v5.d[1], x7 // AES final-2 block - mov high + eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid + movi v8.8b, #0 // suppress further partial tag feed in + mov d10, v17.d[1] // GHASH final-3 block - mid + pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low + pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high + pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid + eor v5.16b, v5.16b, v1.16b // AES final-2 block - result +Lenc_blocks_more_than_2: // blocks left > 2 + st1 { v5.16b}, [x2], #16 // AES final-2 block - store result + ldp x6, x7, [x0], #16 // AES final-1 block - load input low & high + rev64 v4.16b, v5.16b // GHASH final-2 block + eor x6, x6, x13 // AES final-1 block - round N low + eor v4.16b, v4.16b, v8.16b // feed in partial tag + fmov d5, x6 // AES final-1 block - mov low + eor x7, x7, x14 // AES final-1 block - round N high + fmov v5.d[1], x7 // AES final-1 block - mov high + movi v8.8b, #0 // suppress further partial tag feed in + pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high + mov d22, v4.d[1] // GHASH final-2 block - mid + pmull v21.1q, v4.1d, v14.1d // GHASH 
final-2 block - low + eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid + eor v5.16b, v5.16b, v2.16b // AES final-1 block - result + eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high + pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid + eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low + eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid +Lenc_blocks_more_than_1: // blocks left > 1 + st1 { v5.16b}, [x2], #16 // AES final-1 block - store result + rev64 v4.16b, v5.16b // GHASH final-1 block: Byte Swap CT + ldp x6, x7, [x0], #16 // AES final block - load plaintext + eor v4.16b, v4.16b, v8.16b // Feed in partial tag + movi v8.8b, #0 // Clear for next block + eor x6, x6, x13 // AES final block - round N low + mov d22, v4.d[1] // GHASH final-1 block - mid + pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high + eor x7, x7, x14 // AES final block - round N high + eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high + ins v22.d[1], v22.d[0] // GHASH final-1 block - mid + fmov d5, x6 // AES final block - mov low + fmov v5.d[1], x7 // AES final block - mov high + pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid + pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low + eor v5.16b, v5.16b, v3.16b // AES final block - result + eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid + eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low +Lenc_blocks_less_than_1: // Last partial block handling + add x10, x10, x1, lsr #7 // Calculate the updated counter based on the number of 16B chunks we processed + rev w10, w10 + str w10, [x16, #12] // store the updated counter + and x1, x1, #127 // bit_length %= 128 + mvn x13, xzr // Mask for low 64 bits + sub x1, x1, #128 // + neg x1, x1 // Valid bits in the last block (1-128) + ldr q18, [x2] // Load destination for merging + mvn x14, xzr // Mask for high 64 bits + and x1, x1, #127 // bit_length %= 128 + lsr x14, 
x14, x1 // rkN_h is mask for top 64b of last block + cmp x1, #64 + csel x6, x13, x14, lt + csel x7, x14, xzr, lt + fmov d0, x6 // ctr0d is mask for last block + fmov v0.d[1], x7 + and v5.16b, v5.16b, v0.16b // Mask out unused bits of the last CT block + rev64 v4.16b, v5.16b // GHASH final block - byte swap + eor v4.16b, v4.16b, v8.16b // Feed in partial tag + bif v5.16b, v18.16b, v0.16b // Bitwise Insert: merge with existing data at output_ptr + pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high + mov d8, v4.d[1] // GHASH final block - mid + pmull v21.1q, v4.1d, v12.1d // GHASH final block - low + eor v9.16b, v9.16b, v20.16b // GHASH final block - high + eor v8.8b, v8.8b, v4.8b // GHASH final block - mid + pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid + eor v11.16b, v11.16b, v21.16b // GHASH final block - low + eor v10.16b, v10.16b, v8.16b // GHASH final block - mid + eor v4.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + fmov d8, x21 + eor v10.16b, v10.16b, v4.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + st1 { v5.16b}, [x2] // store all 16B + eor v11.16b, v11.16b, v9.16b + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 // Byte swap GHASH result + rev64 v11.16b, v11.16b // Final Tag + mov x0, x15 + st1 { v11.16b }, [x3] // Store final tag + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp d8, d9, [sp, #64] + ldp d10, d11, [sp, #80] + ldp d12, d13, [sp, #96] + ldp d14, d15, [sp, #112] + ldp x29, x30, [sp], #224 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl _aes_gcm_dec_kernel +.private_extern _aes_gcm_dec_kernel + +.align 4 +_aes_gcm_dec_kernel: + 
AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-224]! + mov x29, sp + stp x19, x20, [sp, #16] + ld1 { v0.16b}, [x4] + mov v1.16b, v0.16b + mov v2.16b, v0.16b + mov v3.16b, v0.16b + mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + stp d8, d9, [sp, #64] + stp d10, d11, [sp, #80] + stp d12, d13, [sp, #96] + stp d14, d15, [sp, #112] + ldr w17, [x8, #240] // Load number of AES rounds + add x19, x8, x17, lsl #4 // borrow input_l1 for last key + ldp x13, x14, [x19] // load round N keys + ldr q31, [x19, #-16] // load round N-1 keys + add x4, x0, x1, lsr #3 // end_input_ptr + lsr x5, x1, #3 // byte_len + mov x15, x5 + ldr w9, [x16, #12] // Load scalar 32-bit counter (CTR) + sub x5, x5, #1 // byte_len - 1 + ldr q18, [x8, #0] // load rk0 + and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + add x5, x5, x0 + rev w9, w9 // Reverse it once for big-endian incrementing + uxtw x10, w9 // Zero extend reversed w9 into x10 + str q0, [sp, #160] + str q0, [sp, #176] + str q0, [sp, #192] + str q0, [sp, #208] + rev w20, w9 + mov v0.s[3], w20 + add w20, w9, #1 + rev w20, w20 + mov v1.s[3], w20 + add w20, w9, #2 + rev w20, w20 + mov v2.s[3], w20 + add w20, w9, #3 + rev w20, w20 + mov v3.s[3], w20 + add w20, w9, #4 + rev w20, w20 + str w20, [sp, #172] + add w20, w9, #5 + rev w20, w20 + str w20, [sp, #188] + add w20, w9, #6 + rev w20, w20 + str w20, [sp, #204] + add w20, w9, #7 + rev w20, w20 + str w20, [sp, #220] + add w9, w9, #8 + // Pre-compute this value instead of using two instructions for moving and then shifting in the main loop + mov x21, #0xc200000000000000 + str x21, [sp, #128] + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + ldp q19, q20, [x8, #16] // load rk1, rk2 + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + ldp q21, q22, [x8, #48] // load rk3, rk4 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + ldp 
q23, q24, [x8, #80] // load rk5, rk6 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + ldp q13, q14, [x6, #32] // load h2, h3 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + ldp q25, q26, [x8, #112] // load rk7, rk8 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + ext v14.16b, v14.16b, v14.16b, #8 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + ext v13.16b, v13.16b, v13.16b, #8 + ldr q15, [x6, #80] // load h4 + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + ext v15.16b, v15.16b, v15.16b, #8 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + ld1 { v11.16b}, [x3] + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + ext v11.16b, v11.16b, v11.16b, #8 + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + rev64 v11.16b, v11.16b + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + trn2 v17.2d, v14.2d, v15.2d // h4l | h3l + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + ldr q12, [x6] // load h1 + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + ext v12.16b, v12.16b, v12.16b, #8 + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 2 - round 4 + trn1 v9.2d, v14.2d, v15.2d // h4h | h3h + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + trn2 v16.2d, v12.2d, v13.2d // h2l | h1l + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 - round 5 + aese v2.16b, v23.16b + aesmc v2.16b, 
v2.16b // AES block 2 - round 5 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 0 - round 6 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + cmp x17, #12 // setup flags for AES-128/192/256 check + b.lt Ldec_finish_first_blocks // branch if AES-128 + ldp q27, q28, [x8, #144] // load rk9, rk10 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + b.eq Ldec_finish_first_blocks // branch if AES-192 + ldp q27, q28, [x8, #176] // load rk11, rk12 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + 
aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 12 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 +Ldec_finish_first_blocks: + ldr q27, [x19] // load rkN + cmp x0, x5 // check if we have <= 4 blocks + eor v17.16b, v17.16b, v9.16b // h4k | h3k + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v2.16b, v31.16b // AES block 2 - round N-1 + aese v3.16b, v31.16b // AES block 3 - round N-1 + aese v0.16b, v31.16b // AES block 0 - round N-1 + trn1 v8.2d, v12.2d, v13.2d // h2h | h1h + eor v16.16b, v16.16b, v8.16b // h2k | h1k + b.ge Ldec_tail // handle tail + // Setup for AES blocks 0-3 is done purely on NEON side instead of mixing NEON and scalar instructions. + // This is because the final result of the AES block needs to be EORd with the final round key + // value (v30). This avoids several fmovs. + ldp q6, q7, [x0, #32] // AES blocks 2,3 load ciphertext + ldp q4, q5, [x0], #64 // AES blocks 0,1 load ciphertext + eor v0.16b, v4.16b, v0.16b + eor v0.16b, v0.16b, v27.16b // AES block 0 - result + eor v1.16b, v5.16b, v1.16b + eor v1.16b, v1.16b, v27.16b // AES block 1 - result + eor v2.16b, v6.16b, v2.16b + eor v2.16b, v2.16b, v27.16b // AES block 2 - result + eor v3.16b, v7.16b, v3.16b + eor v3.16b, v3.16b, v27.16b // AES block 3 - result + st1 { v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64 // AES blocks 0-3 - store result + ldr q0, [sp, #160] + ldr q1, [sp, #176] + ldr q2, [sp, #192] + ldr q3, [sp, #208] + rev w20, w9 + str w20, [sp, #172] + add w20, w9, #1 + rev w20, w20 + str w20, [sp, #188] + add w20, w9, #2 + rev w20, w20 + str w20, [sp, #204] + add w20, w9, #3 + rev w20, w20 + str w20, [sp, #220] + add w9, w9, #4 + cmp x0, x5 // check if we have <= 4 blocks + b.ge Ldec_prepretail // do prepretail +Ldec_main_loop: // main loop start + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b 
// AES block 4k+4 - round 0 + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + rev64 v4.16b, v4.16b // GHASH block 4k + rev64 v5.16b, v5.16b // GHASH block 4k+1 + rev64 v6.16b, v6.16b // GHASH block 4k+2 + rev64 v7.16b, v7.16b // GHASH block 4k+3 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v4.16b, v4.16b, v11.16b // PRE 1 + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + mov d10, v17.d[1] // GHASH block 4k - mid + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + mov d8, v4.d[1] // GHASH block 4k - mid + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + mov d4, v5.d[1] // GHASH 
block 4k+1 - mid + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + eor v11.16b, v11.16b, v8.16b + eor v11.16b, v11.16b, v5.16b // GHASH block 4k+1 - low & GHASH block 4k+2 - low + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + pmull2 v28.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + mov d6, v7.d[1] // GHASH block 4k+3 - mid + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + pmull v27.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor v10.16b, v10.16b, v4.16b + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+1 - mid & GHASH block 4k+2 - mid + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid + eor v9.16b, v9.16b, v28.16b + eor v9.16b, v9.16b, 
v5.16b // GHASH block 4k+2 - high & GHASH block 4k+3 - high + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid + ldr d8, [sp, #128] + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + eor v10.16b, v10.16b, v6.16b + eor v10.16b, v10.16b, v7.16b // GHASH block 4k+3 - mid & MODULO - fold into mid + eor v11.16b, v11.16b, v27.16b // GHASH block 4k+3 - low + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v6.16b + eor v10.16b, v10.16b, v9.16b // MODULO - karatsuba tidy up & MODULO - fold into mid + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + eor v11.16b, v11.16b, v8.16b + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + cmp w17, #12 // setup flags for AES-128/192/256 check + b.lt Ldec_main_loop_continue // branch if AES-128 + ldp q27, q28, [x8, #144] // load rk9, rk10 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + b.eq Ldec_main_loop_continue // branch if AES-192 + ldp q27, q28, [x8, #176] // load rk11, rk12 + aese v0.16b, v27.16b 
+ aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 +Ldec_main_loop_continue: + ldr q27, [x19] // load rkN + ldp q6, q7, [x0, #32] // AES blocks 2,3 load ciphertext + ldp q4, q5, [x0], #64 // AES blocks 0,1 load ciphertext + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + eor v0.16b, v4.16b, v0.16b + eor v0.16b, v0.16b, v27.16b // AES block 4k+4 - result + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + eor v1.16b, v5.16b, v1.16b + eor v1.16b, v1.16b, v27.16b // AES block 4k+5 - result + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + eor v2.16b, v6.16b, v2.16b + eor v2.16b, v2.16b, v27.16b // AES block 4k+6 - result + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + eor v3.16b, v7.16b, v3.16b + eor v3.16b, v3.16b, v27.16b // AES block 4k+7 - result + st1 { v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64 // AES blocks 4k+4-7 - store result + ldr q0, [sp, #160] + ldr q1, [sp, #176] + ldr q2, [sp, #192] + ldr q3, [sp, #208] + rev w20, w9 + str w20, [sp, #172] + add w20, w9, #1 + rev w20, w20 + str w20, [sp, #188] + add w20, w9, #2 + rev w20, w20 + str w20, [sp, #204] + add w20, w9, #3 + rev w20, w20 + str w20, [sp, #220] + add w9, w9, #4 + cmp x0, x5 + b.lt Ldec_main_loop +Ldec_prepretail: // PREPRETAIL + rev64 v4.16b, v4.16b // GHASH block 0 + rev64 v5.16b, v5.16b // GHASH block 1 + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + aese v1.16b, v18.16b + aesmc v1.16b, 
v1.16b // AES block 1 - round 0 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev64 v6.16b, v6.16b // GHASH block 2 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + pmull v11.1q, v4.1d, v15.1d // GHASH block 0 - low + mov d8, v4.d[1] // GHASH block 0 - mid + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 0 - high + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + mov d10, v17.d[1] // GHASH block 0 - mid + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + eor v8.8b, v8.8b, v4.8b // GHASH block 0 - mid + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 1 - high + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + rev64 v7.16b, v7.16b // GHASH block 3 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + pmull v10.1q, v8.1d, v10.1d // GHASH block 0 - mid + eor v9.16b, v9.16b, v4.16b // GHASH block 1 - high + pmull v8.1q, v5.1d, v14.1d // GHASH block 1 - low + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + mov d4, v5.d[1] // GHASH block 1 - mid + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + eor v11.16b, v11.16b, v8.16b // GHASH block 1 - low + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + mov d8, v6.d[1] // GHASH block 2 - mid + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + eor v4.8b, v4.8b, v5.8b // GHASH block 1 - mid + pmull v5.1q, v6.1d, v13.1d // GHASH block 2 - low + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + eor v8.8b, v8.8b, v6.8b // GHASH block 2 - mid + pmull v4.1q, v4.1d, v17.1d // GHASH block 1 - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + eor v11.16b, v11.16b, v5.16b // GHASH block 2 - low + aese 
v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 3 - high + eor v10.16b, v10.16b, v4.16b // GHASH block 1 - mid + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 2 - high + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 - round 5 + ins v8.d[1], v8.d[0] // GHASH block 2 - mid + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + eor v9.16b, v9.16b, v4.16b + eor v9.16b, v9.16b, v5.16b // GHASH block 2 - high & GHASH block 3 - high + pmull v4.1q, v7.1d, v12.1d // GHASH block 3 - low + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 2 - round 4 + mov d6, v7.d[1] // GHASH block 3 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 2 - mid + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + eor v6.8b, v6.8b, v7.8b // GHASH block 3 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + eor v10.16b, v10.16b, v8.16b // GHASH block 2 - mid + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 0 - round 6 + movi v8.8b, #0xc2 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + eor v11.16b, v11.16b, v4.16b // GHASH block 3 - low + pmull v6.1q, v6.1d, v16.1d // GHASH block 3 - mid + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + eor v10.16b, v10.16b, v6.16b // GHASH block 3 - mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy 
up + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + shl d8, d8, #56 // mod_constant + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + cmp w17, #12 // setup flags for AES-128/192/256 check + b.lt Ldec_finish_prepretail // branch if AES-128 + ldp q27, q28, [x8, #144] // load rk9, rk10 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + b.eq Ldec_finish_prepretail // branch if AES-192 + ldp q27, q28, [x8, #176] // load rk11, rk12 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 12 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 +Ldec_finish_prepretail: + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b 
align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + eor v11.16b, v11.16b, v8.16b + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v0.16b, v31.16b // AES block 0 - round N-1 + aese v3.16b, v31.16b // AES block 3 - round N-1 + aese v2.16b, v31.16b // AES block 2 - round N-1 +Ldec_tail: // TAIL + sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process + ld1 { v5.16b}, [x0], #16 // AES block 0 - load ciphertext + eor v0.16b, v5.16b, v0.16b // AES block 0 - result + mov x6, v0.d[0] // AES block 0 - mov low + mov x7, v0.d[1] // AES block 0 - mov high + ext v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag + eor x6, x6, x13 // AES block 0 - round N low + eor x7, x7, x14 // AES block 0 - round N high + cmp x5, #48 + b.gt Ldec_blocks_more_than_3 + mov v3.16b, v2.16b + movi v10.8b, #0 + movi v11.8b, #0 + movi v9.8b, #0 + mov v2.16b, v1.16b + cmp x5, #32 + b.gt Ldec_blocks_more_than_2 + mov v3.16b, v1.16b + cmp x5, #16 + b.gt Ldec_blocks_more_than_1 + b Ldec_blocks_less_than_1 +Ldec_blocks_more_than_3: // blocks left > 3 + rev64 v4.16b, v5.16b // GHASH final-3 block + ld1 { v5.16b}, [x0], #16 // AES final-2 block - load ciphertext + stp x6, x7, [x2], #16 // AES final-3 block - store result + mov d10, v17.d[1] // GHASH final-3 block - mid + eor v4.16b, v4.16b, v8.16b // feed in partial tag + eor v0.16b, v5.16b, v1.16b // AES final-2 block - result + mov d22, v4.d[1] // GHASH final-3 block - mid + mov x6, v0.d[0] // AES final-2 block - mov low + mov x7, v0.d[1] // AES final-2 block - mov high + eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid + movi v8.8b, #0 // suppress further partial tag feed in + pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high + pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid + eor x6, x6, x13 // AES final-2 block - round N low + pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low + eor x7, x7, x14 // AES 
final-2 block - round N high +Ldec_blocks_more_than_2: // blocks left > 2 + rev64 v4.16b, v5.16b // GHASH final-2 block + ld1 { v5.16b}, [x0], #16 // AES final-1 block - load ciphertext + eor v4.16b, v4.16b, v8.16b // feed in partial tag + stp x6, x7, [x2], #16 // AES final-2 block - store result + eor v0.16b, v5.16b, v2.16b // AES final-1 block - result + mov d22, v4.d[1] // GHASH final-2 block - mid + pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low + pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high + eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid + mov x6, v0.d[0] // AES final-1 block - mov low + mov x7, v0.d[1] // AES final-1 block - mov high + eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low + movi v8.8b, #0 // suppress further partial tag feed in + pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high + eor x6, x6, x13 // AES final-1 block - round N low + eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid + eor x7, x7, x14 // AES final-1 block - round N high +Ldec_blocks_more_than_1: // blocks left > 1 + stp x6, x7, [x2], #16 // AES final-1 block - store result + rev64 v4.16b, v5.16b // GHASH final-1 block + ld1 { v5.16b}, [x0], #16 // AES final block - load ciphertext + eor v4.16b, v4.16b, v8.16b // feed in partial tag + movi v8.8b, #0 // suppress further partial tag feed in + mov d22, v4.d[1] // GHASH final-1 block - mid + eor v0.16b, v5.16b, v3.16b // AES final block - result + pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high + eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid + pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low + mov x6, v0.d[0] // AES final block - mov low + ins v22.d[1], v22.d[0] // GHASH final-1 block - mid + mov x7, v0.d[1] // AES final block - mov high + pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid + eor x6, x6, x13 // AES final block - round N low + eor v11.16b, v11.16b, v21.16b // GHASH 
final-1 block - low + eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high + eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid + eor x7, x7, x14 // AES final block - round N high +Ldec_blocks_less_than_1: // blocks left <= 1 + add x10, x10, x1, lsr #7 // Calculate the updated counter based on the number of 16B chunks we processed + rev w10, w10 + str w10, [x16, #12] // store the updated counter + and x1, x1, #127 // bit_length %= 128 + mvn x14, xzr // rkN_h = 0xffffffffffffffff + sub x1, x1, #128 // bit_length -= 128 + mvn x13, xzr // rkN_l = 0xffffffffffffffff + ldp x4, x5, [x2] // load existing bytes we need to not overwrite + neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128]) + and x1, x1, #127 // bit_length %= 128 + lsr x14, x14, x1 // rkN_h is mask for top 64b of last block + cmp x1, #64 + csel x9, x13, x14, lt + csel x10, x14, xzr, lt + fmov d0, x9 // ctr0b is mask for last block + and x6, x6, x9 + mov v0.d[1], x10 + bic x4, x4, x9 // mask out low existing bytes + bic x5, x5, x10 // mask out high existing bytes + orr x6, x6, x4 + and x7, x7, x10 + orr x7, x7, x5 + and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits + rev64 v4.16b, v5.16b // GHASH final block + eor v4.16b, v4.16b, v8.16b // feed in partial tag + pmull v21.1q, v4.1d, v12.1d // GHASH final block - low + mov d8, v4.d[1] // GHASH final block - mid + eor v8.8b, v8.8b, v4.8b // GHASH final block - mid + pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high + pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final block - high + eor v11.16b, v11.16b, v21.16b // GHASH final block - low + eor v10.16b, v10.16b, v8.16b // GHASH final block - mid + ldr d8, [sp, #128] + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other 
top alignment + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + stp x6, x7, [x2] + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b // Final Tag + mov x0, x15 + st1 { v11.16b }, [x3] // Store final tag + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp d8, d9, [sp, #64] + ldp d10, d11, [sp, #80] + ldp d12, d13, [sp, #96] + ldp d14, d15, [sp, #112] + ldp x29, x30, [sp], #224 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +.globl _aes_gcm_enc_kernel_eor3 +.private_extern _aes_gcm_enc_kernel_eor3 + +.align 4 +_aes_gcm_enc_kernel_eor3: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-224]! + mov x29, sp + ld1 { v0.16b}, [x4] // Load initial counter block + stp x19, x20, [sp, #16] + mov v1.16b, v0.16b // Initialize ctr1-3 from ctr0 + mov v2.16b, v0.16b + mov v3.16b, v0.16b + mov x16, x4 // Pointer to counter block in memory + mov x8, x5 // Pointer to AES key schedule context + stp x21, x22, [sp, #32] + // [sp, #48] is unused but allocated to align the stack layout with aes_gcm_dec_kernel_eor3 + stp d8, d9, [sp, #64] // Save Neon registers + stp d10, d11, [sp, #80] + stp d12, d13, [sp, #96] + stp d14, d15, [sp, #112] + ldr w17, [x8, #240] // Load number of AES rounds + add x7, x8, x17, lsl #4 // Calculate pointer to the last round key + ldp x13, x14, [x7] // load round N key (for final XOR) + ldr q31, [x7, #-16] // load round N-1 key + add x4, x0, x1, lsr #3 // Calculate end of input + lsr x5, x1, #3 // Total byte length + mov x15, x5 + ldr w12, [x16, #12] // Load counter's low 32 bits + sub x5, x5, #1 // byte_len - 1 + ldr q18, [x8, #0] // load rk0 + and x5, x5, #0xffffffffffffffc0 // Align main loop end to a multiple of 
64 bytes + add x5, x5, x0 + rev w12, w12 // Reverse for big-endian increment + uxtw x10, w12 // Zero extend reversed w12 into x10 for final counter update + // Pre-compute this value instead of using two instructions to reconstruct it every iteration + mov x21, #0xc200000000000000 // GHASH reduction constant + str x21, [sp, #128] + // We maintain four copies of ctr values on the stack. Each loop iteration we + // store the updated ctr value to the last four bytes (e.g., 160 + 12). + // We then load the four values. This avoids a significant number of + // expensive GPR->NEON and NEON->NEON moves. To avoid LDST forwarding we + // calculate and store the values one iteration ahead so they have time to + // drain before we load them. + str q0, [sp, #160] // Store base counter for block 0-3 + str q0, [sp, #176] + str q0, [sp, #192] + str q0, [sp, #208] + // Since we need the values right away don't go through the stack this first + // time. Manually insert the incremented big-endian counter values. + rev w20, w12 + mov v0.s[3], w20 // ctr0 + 0 + add w20, w12, #1 + rev w20, w20 + mov v1.s[3], w20 // ctr0 + 1 + add w20, w12, #2 + rev w20, w20 + mov v2.s[3], w20 // ctr0 + 2 + add w20, w12, #3 + rev w20, w20 + mov v3.s[3], w20 // ctr0 + 3 + // Calculate the ctr values for the *next* (not current) group of four + // blocks. Store the incremented parts to the stack.
+ add w20, w12, #4 + rev w20, w20 + str w20, [sp, #172] // ctr0 + 4 for next iter + add w20, w12, #5 + rev w20, w20 + str w20, [sp, #188] // ctr0 + 5 for next iter + add w20, w12, #6 + rev w20, w20 + str w20, [sp, #204] // ctr0 + 6 for next iter + add w20, w12, #7 + rev w20, w20 + str w20, [sp, #220] // ctr0 + 7 for next iter + add w12, w12, #8 // Advance counter past these two sets + // --- Start AES for first 4 blocks --- + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + ldp q19, q20, [x8, #16] // load rk1, rk2 + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + ldp q21, q22, [x8, #48] // load rk3, rk4 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + ldp q23, q24, [x8, #80] // load rk5, rk6 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + ldp q13, q14, [x6, #32] // load H2, H3 (GHASH keys) + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + ldp q25, q26, [x8, #112] // load rk7, rk8 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + ext v14.16b, v14.16b, v14.16b, #8 // Byte swap H3 for GHASH + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + ext v13.16b, v13.16b, v13.16b, #8 // Byte swap H2 for GHASH + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + ldr q15, [x6, #80] // load H4 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + ext v15.16b, v15.16b, v15.16b, #8 // Byte swap H4 for GHASH + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + ld1 { v11.16b}, [x3] // Load initial GHASH accumulator (T) + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + ext v11.16b, v11.16b, v11.16b, #8 // Byte swap T for GHASH + aese 
v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + rev64 v11.16b, v11.16b // Correct byte order within 64-bit lanes + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + trn2 v17.2d, v14.2d, v15.2d // Karatsuba key: H4_low | H3_low + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + ldr q12, [x6] // load H1 + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + ext v12.16b, v12.16b, v12.16b, #8 // Byte swap H1 for GHASH + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 2 - round 4 + trn1 v9.2d, v14.2d, v15.2d // Karatsuba key: H4_high | H3_high + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + trn2 v16.2d, v12.2d, v13.2d // Karatsuba key: H2_low | H1_low + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + ldr q30, [x7] // Preload round N key for final EOR + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 - round 5 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 0 - round 6 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + cmp x17, #12 // setup flags for 
AES-128/192/256 check + b.lt Lenc_finish_first_blocks_eor3 // branch if AES-128 + ldp q27, q28, [x8, #144] // load rk9, rk10 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + b.eq Lenc_finish_first_blocks_eor3 // branch if AES-192 + ldp q27, q28, [x8, #176] // load rk11, rk12 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 12 +Lenc_finish_first_blocks_eor3: + cmp x0, x5 // check if we have <= 4 blocks to process in the tail + eor v17.16b, v17.16b, v9.16b // Karatsuba key: H3^H4 + aese v0.16b, v31.16b // AES block 0 - round N-1 + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v2.16b, v31.16b // AES block 2 - round N-1 + aese v3.16b, v31.16b // AES block 3 - round N-1 + trn1 v8.2d, v12.2d, v13.2d // Karatsuba key: H2_high | H1_high + eor v16.16b, v16.16b, v8.16b // Karatsuba key: H1^H2 + b.ge Lenc_tail_eor3 // handle tail if no more full 4-block sets + ldp q6, q7, [x0, #32] // AES blocks 2,3 load plaintext + ldp q4, q5, [x0], #64 
// AES blocks 0,1 load plaintext + // Compute and store first 4 ciphertext blocks + eor3 v4.16b, v4.16b, v30.16b, v0.16b // AES block 0 - result = PT ^ AES(ctr0) + eor3 v5.16b, v5.16b, v30.16b, v1.16b // AES block 1 - result = PT ^ AES(ctr1) + eor3 v6.16b, v6.16b, v30.16b, v2.16b // AES block 2 - result = PT ^ AES(ctr2) + eor3 v7.16b, v7.16b, v30.16b, v3.16b // AES block 3 - result = PT ^ AES(ctr3) + st1 { v4.16b, v5.16b, v6.16b, v7.16b}, [x2], #64 // AES blocks 0-3 - store result + // Load counter values for the second iteration from the stack + ldp q0, q1, [sp, #160] + ldp q2, q3, [sp, #192] + // Prepare and store counter values for the third iteration + rev w20, w12 + str w20, [sp, #172] // ctr + 8 + add w20, w12, #1 + rev w20, w20 + str w20, [sp, #188] // ctr + 9 + add w20, w12, #2 + rev w20, w20 + str w20, [sp, #204] // ctr + 10 + add w20, w12, #3 + rev w20, w20 + str w20, [sp, #220] // ctr + 11 + add w12, w12, #4 // Advance counter base + cmp x0, x5 // check if we have <= 4 blocks remaining + b.ge Lenc_prepretail_eor3 // go to prepretail if < 2 full loops left +Lenc_main_loop_eor3: // main loop start (processes 4 blocks per iteration) + // --- AES Pipeline for blocks 4k+4 to 4k+7 --- + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + ldr d8, [sp, #128] // Load GHASH reduction constant + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + // --- GHASH Pipeline (interleaved with AES) for blocks 4k to 4k+3 --- + rev64 v4.16b, v4.16b // GHASH block 4k - Byte swap CT + rev64 v5.16b, 
v5.16b // GHASH block 4k+1 - Byte swap CT + rev64 v6.16b, v6.16b // GHASH block 4k+2 - Byte swap CT + rev64 v7.16b, v7.16b // GHASH block 4k+3 - Byte swap CT + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + ext v11.16b, v11.16b, v11.16b, #8 // GHASH - prepare acc for XOR + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v4.16b, v4.16b, v11.16b // GHASH block 4k - Y_i = CT_i ^ Y_{i-1} + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + pmull2 v9.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + mov d10, v17.d[1] // GHASH block 4k - mid Karatsuba key + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + mov d20, v4.d[1] // GHASH block 4k - mid + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + eor v20.8b, v20.8b, v4.8b // GHASH block 4k - mid + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + mov d21, v5.d[1] // GHASH block 4k+1 - mid + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + eor v21.8b, v21.8b, v5.8b // GHASH block 4k+1 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + pmull v10.1q, v20.1d, v10.1d // GHASH block 4k - mid + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + pmull v21.1q, v21.1d, v17.1d // GHASH block 4k+1 - mid + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + eor v10.16b, v10.16b, v21.16b // GHASH block 4k+1 - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + ext v22.16b, v22.16b, v6.16b, #8 // GHASH block 4k+2 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + eor v22.16b, v22.16b, v6.16b // GHASH block 4k+2 - mid + aese v2.16b, 
v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + pmull2 v22.1q, v22.2d, v16.2d // GHASH block 4k+2 - mid + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + mov d23, v7.d[1] // GHASH block 4k+3 - mid + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + eor v23.8b, v23.8b, v7.8b // GHASH block 4k+3 - mid + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + pmull v23.1q, v23.1d, v16.1d // GHASH block 4k+3 - mid + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + eor3 v10.16b, v10.16b, v22.16b, v23.16b // GHASH block 4k+2/3 - mid + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + pmull2 v22.1q, v4.2d, v15.2d // GHASH block 4k - high + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + pmull2 v21.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + eor v22.16b, v22.16b, v21.16b // GHASH block 4k+3 - high + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + pmull2 v23.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + pmull v21.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + pmull v20.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + eor3 v9.16b, v9.16b, v22.16b, v23.16b // GHASH block 4k/1/2/3 - high + pmull v22.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + ldp q6, q7, [x0, #32] + ldp q4, q5, [x0], #64 + eor v20.16b, v20.16b, v21.16b // GHASH block 4k+1 - low + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + eor3 v11.16b, v11.16b, v22.16b, v20.16b // GHASH block 4k/1/2/3 - low + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + eor3 v10.16b, v10.16b, v9.16b, v11.16b // MODULO - karatsuba tidy up + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + pmull v20.1q, v9.1d, v8.1d // MODULO - top 64b 
align with mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + cmp x17, #12 // setup flags for AES-128/192/256 check + b.lt Lenc_main_loop_continue_eor3 // branch if AES-128 + ldp q27, q28, [x8, #144] // load rk9, rk10 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + b.eq Lenc_main_loop_continue_eor3 // branch if AES-192 + ldp q27, q28, [x8, #176] // load rk11, rk12 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 +Lenc_main_loop_continue_eor3: + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor3 v10.16b, v10.16b, v20.16b, v9.16b // MODULO - fold into mid + pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v20.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + eor3 v11.16b, v9.16b, v11.16b, v20.16b // MODULO - fold into low + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + eor3 v4.16b, v4.16b, v30.16b, v0.16b 
// AES block 4k+4 - result + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + eor3 v5.16b, v5.16b, v30.16b, v1.16b // AES block 4k+5 - result + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + eor3 v6.16b, v6.16b, v30.16b, v2.16b // AES block 4k+6 - result + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + eor3 v7.16b, v7.16b, v30.16b, v3.16b // AES block 4k+7 - result + ldp q0, q1, [sp, #160] + ldp q2, q3, [sp, #192] + // We used these registers as temporaries above so reload the RKs. + ldp q20, q21, [x8, #32] // load rk2, rk3 + ldp q22, q23, [x8, #64] // load rk4, rk5 + st1 { v4.16b, v5.16b, v6.16b, v7.16b}, [x2], #64 // AES blocks 4k+4-7 - store result + rev w20, w12 + str w20, [sp, #172] + add w20, w12, #1 + rev w20, w20 + str w20, [sp, #188] + add w20, w12, #2 + rev w20, w20 + str w20, [sp, #204] + add w20, w12, #3 + rev w20, w20 + str w20, [sp, #220] + add w12, w12, #4 + cmp x0, x5 // LOOP_eor3 CONTROL + b.lt Lenc_main_loop_eor3 +Lenc_prepretail_eor3: // PREPRETAIL + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + rev64 v6.16b, v6.16b // GHASH block 2 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + rev64 v4.16b, v4.16b // GHASH block 0 + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev64 v5.16b, v5.16b // GHASH block 1 + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + mov d10, v17.d[1] // GHASH block 0 - mid Karatsuba key + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + pmull v11.1q, v4.1d, v15.1d // GHASH block 0 - low + mov d8, v4.d[1] // GHASH block 0 - mid + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 0 - high + aese v2.16b, v21.16b + 
aesmc v2.16b, v2.16b // AES block 2 - round 3 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + eor v8.8b, v8.8b, v4.8b // GHASH block 0 - mid + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + pmull v10.1q, v8.1d, v10.1d // GHASH block 0 - mid + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 1 - high + pmull v8.1q, v5.1d, v14.1d // GHASH block 1 - low + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + eor v9.16b, v9.16b, v4.16b // GHASH block 1 - high + mov d4, v5.d[1] // GHASH block 1 - mid + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + eor v11.16b, v11.16b, v8.16b // GHASH block 1 - low + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + eor v4.8b, v4.8b, v5.8b // GHASH block 1 - mid + mov d8, v6.d[1] // GHASH block 2 - mid + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + rev64 v7.16b, v7.16b // GHASH block 3 + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + pmull v4.1q, v4.1d, v17.1d // GHASH block 1 - mid + eor v8.8b, v8.8b, v6.8b // GHASH block 2 - mid + pmull v5.1q, v6.1d, v13.1d // GHASH block 2 - low + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 - round 5 + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 2 - round 4 + eor v10.16b, v10.16b, v4.16b // GHASH block 1 - mid + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 2 - high + eor v11.16b, v11.16b, v5.16b // GHASH block 2 - low + ins v8.d[1], v8.d[0] // GHASH block 2 - mid + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + eor v9.16b, v9.16b, v4.16b // GHASH block 2 - high + mov d4, v7.d[1] // GHASH block 3 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 2 - mid + eor v4.8b, v4.8b, v7.8b // GHASH 
block 3 - mid + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 3 - high + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + pmull v4.1q, v4.1d, v16.1d // GHASH block 3 - mid + eor v10.16b, v10.16b, v8.16b // GHASH block 2 - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 0 - round 6 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + eor v9.16b, v9.16b, v5.16b // GHASH block 3 - high + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + ldr d8, [sp, #128] + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + eor v10.16b, v10.16b, v4.16b // GHASH block 3 - mid + pmull v6.1q, v7.1d, v12.1d // GHASH block 3 - low + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + cmp x17, #12 // setup flags for AES-128/192/256 check + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + eor v11.16b, v11.16b, v6.16b // GHASH block 3 - low + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + eor v10.16b, v10.16b, v9.16b // karatsuba tidy up + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + pmull v4.1q, v9.1d, v8.1d + ext v9.16b, v9.16b, v9.16b, #8 + eor v10.16b, v10.16b, v11.16b + b.lt Lenc_finish_prepretail_eor3 // branch if AES-128 + ldp q27, q28, [x8, #144] // load rk9, rk10 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese 
v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + b.eq Lenc_finish_prepretail_eor3 // branch if AES-192 + ldp q27, q28, [x8, #176] // load rk11, rk12 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 12 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 +Lenc_finish_prepretail_eor3: + aese v0.16b, v31.16b // AES block 0 - round N-1 + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v2.16b, v31.16b // AES block 2 - round N-1 + aese v3.16b, v31.16b // AES block 3 - round N-1 + eor3 v10.16b, v10.16b, v4.16b, v9.16b + pmull v4.1q, v10.1d, v8.1d + ext v10.16b, v10.16b, v10.16b, #8 + eor3 v11.16b, v11.16b, v4.16b, v10.16b +Lenc_tail_eor3: // TAIL: Process remaining 0 to 3 blocks + ext v8.16b, v11.16b, v11.16b, #8 // Save current GHASH state for partial tag feed-in + sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process + ldp x6, x7, [x0], #16 // AES block 0 - load plaintext + eor x6, x6, x13 // AES block 0 - round N low + eor x7, x7, x14 // AES block 0 - round N high + cmp x5, #48 + fmov d4, x6 // AES block 0 - mov low + fmov v4.d[1], x7 // AES block 0 - mov high + eor v5.16b, v4.16b, v0.16b // AES block 0 - result + b.gt Lenc_blocks_more_than_3_eor3 + cmp x5, #32 + mov v3.16b, v2.16b + movi v11.8b, #0 + movi v9.8b, #0 + mov v2.16b, v1.16b + movi v10.8b, 
#0 + b.gt Lenc_blocks_more_than_2_eor3 + mov v3.16b, v1.16b + cmp x5, #16 + b.gt Lenc_blocks_more_than_1_eor3 + b Lenc_blocks_less_than_1_eor3 +Lenc_blocks_more_than_3_eor3: // blocks left > 3 + st1 { v5.16b}, [x2], #16 // AES final-2 block - store result + ldp x6, x7, [x0], #16 // AES final-2 block - load input low & high + rev64 v4.16b, v5.16b // GHASH final-3 block + eor x6, x6, x13 // AES final-2 block - round N low + eor v4.16b, v4.16b, v8.16b // feed in partial tag + eor x7, x7, x14 // AES final-2 block - round N high + mov d22, v4.d[1] // GHASH final-3 block - mid + fmov d5, x6 // AES final-2 block - mov low + fmov v5.d[1], x7 // AES final-2 block - mov high + eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid + movi v8.8b, #0 // suppress further partial tag feed in + mov d10, v17.d[1] // GHASH final-3 block - mid + pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low + pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high + pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid + eor v5.16b, v5.16b, v1.16b // AES final-2 block - result +Lenc_blocks_more_than_2_eor3: // blocks left > 2 + st1 { v5.16b}, [x2], #16 // AES final-2 block - store result + ldp x6, x7, [x0], #16 // AES final-1 block - load input low & high + rev64 v4.16b, v5.16b // GHASH final-2 block + eor x6, x6, x13 // AES final-1 block - round N low + eor v4.16b, v4.16b, v8.16b // feed in partial tag + fmov d5, x6 // AES final-1 block - mov low + eor x7, x7, x14 // AES final-1 block - round N high + fmov v5.d[1], x7 // AES final-1 block - mov high + movi v8.8b, #0 // suppress further partial tag feed in + pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high + mov d22, v4.d[1] // GHASH final-2 block - mid + pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low + eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid + eor v5.16b, v5.16b, v2.16b // AES final-1 block - result + eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high + pmull v22.1q, v22.1d, v17.1d // GHASH 
final-2 block - mid + eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low + eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid +Lenc_blocks_more_than_1_eor3: // blocks left > 1 + st1 { v5.16b}, [x2], #16 // AES final-1 block - store result + rev64 v4.16b, v5.16b // GHASH final-1 block: Byte Swap CT + ldp x6, x7, [x0], #16 // AES final block - load plaintext + eor v4.16b, v4.16b, v8.16b // Feed in partial tag + movi v8.8b, #0 // Clear for next block + eor x6, x6, x13 // AES final block - round N low + mov d22, v4.d[1] // GHASH final-1 block - mid + pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high + eor x7, x7, x14 // AES final block - round N high + eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high + ins v22.d[1], v22.d[0] // GHASH final-1 block - mid + fmov d5, x6 // AES final block - mov low + fmov v5.d[1], x7 // AES final block - mov high + pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid + pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low + eor v5.16b, v5.16b, v3.16b // AES final block - result + eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid + eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low +Lenc_blocks_less_than_1_eor3: // Last partial block handling + add x10, x10, x1, lsr #7 // Calculate the updated counter based on the number of 16B chunks we processed + rev w10, w10 + str w10, [x16, #12] // store the updated counter + and x1, x1, #127 // bit_length %= 128 + mvn x13, xzr // Mask for low 64 bits + sub x1, x1, #128 // + neg x1, x1 // Valid bits in the last block (1-128) + ldr q18, [x2] // Load destination for merging + mvn x14, xzr // Mask for high 64 bits + and x1, x1, #127 // bit_length %= 128 + lsr x14, x14, x1 // rkN_h is mask for top 64b of last block + cmp x1, #64 + csel x6, x13, x14, lt + csel x7, x14, xzr, lt + fmov d0, x6 // ctr0d is mask for last block + fmov v0.d[1], x7 + and v5.16b, v5.16b, v0.16b // Mask out unused 
bits of the last CT block + rev64 v4.16b, v5.16b // GHASH final block - byte swap + eor v4.16b, v4.16b, v8.16b // Feed in partial tag + bif v5.16b, v18.16b, v0.16b // Bitwise Insert: merge with existing data at output_ptr + pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high + mov d8, v4.d[1] // GHASH final block - mid + pmull v21.1q, v4.1d, v12.1d // GHASH final block - low + eor v9.16b, v9.16b, v20.16b // GHASH final block - high + eor v8.8b, v8.8b, v4.8b // GHASH final block - mid + pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid + eor v11.16b, v11.16b, v21.16b // GHASH final block - low + eor v10.16b, v10.16b, v8.16b // GHASH final block - mid + eor v4.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + fmov d8, x21 + eor v10.16b, v10.16b, v4.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor3 v10.16b, v10.16b, v7.16b, v9.16b // MODULO - fold into mid + pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + st1 { v5.16b}, [x2] // store all 16B + eor3 v11.16b, v11.16b, v9.16b, v10.16b // MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 // Byte swap GHASH result + rev64 v11.16b, v11.16b // Final Tag + mov x0, x15 + st1 { v11.16b }, [x3] // Store final tag + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp d8, d9, [sp, #64] + ldp d10, d11, [sp, #80] + ldp d12, d13, [sp, #96] + ldp d14, d15, [sp, #112] + ldp x29, x30, [sp], #224 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl _aes_gcm_dec_kernel_eor3 +.private_extern _aes_gcm_dec_kernel_eor3 + +.align 4 +_aes_gcm_dec_kernel_eor3: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-224]! 
+ mov x29, sp + stp x19, x20, [sp, #16] + ld1 { v0.16b}, [x4] + mov v1.16b, v0.16b + mov v2.16b, v0.16b + mov v3.16b, v0.16b + mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + stp d8, d9, [sp, #64] + stp d10, d11, [sp, #80] + stp d12, d13, [sp, #96] + stp d14, d15, [sp, #112] + ldr w17, [x8, #240] // Load number of AES rounds + add x19, x8, x17, lsl #4 // borrow input_l1 for last key + ldp x13, x14, [x19] // load round N keys + ldr q31, [x19, #-16] // load round N-1 keys + add x4, x0, x1, lsr #3 // end_input_ptr + lsr x5, x1, #3 // byte_len + mov x15, x5 + ldr w9, [x16, #12] // Load scalar 32-bit counter (CTR) + sub x5, x5, #1 // byte_len - 1 + ldr q18, [x8, #0] // load rk0 + and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + add x5, x5, x0 + rev w9, w9 // Reverse it once for big-endian incrementing + uxtw x10, w9 // Zero extend reversed w9 into x10 + str q0, [sp, #160] + str q0, [sp, #176] + str q0, [sp, #192] + str q0, [sp, #208] + rev w20, w9 + mov v0.s[3], w20 + add w20, w9, #1 + rev w20, w20 + mov v1.s[3], w20 + add w20, w9, #2 + rev w20, w20 + mov v2.s[3], w20 + add w20, w9, #3 + rev w20, w20 + mov v3.s[3], w20 + add w20, w9, #4 + rev w20, w20 + str w20, [sp, #172] + add w20, w9, #5 + rev w20, w20 + str w20, [sp, #188] + add w20, w9, #6 + rev w20, w20 + str w20, [sp, #204] + add w20, w9, #7 + rev w20, w20 + str w20, [sp, #220] + add w9, w9, #8 + // Pre-compute this value instead of using two instructions for moving and then shifting in the main loop + mov x21, #0xc200000000000000 + str x21, [sp, #128] + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + ldp q19, q20, [x8, #16] // load rk1, rk2 + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + ldp q21, q22, [x8, #48] // load rk3, rk4 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + ldp q23, q24, [x8, #80] // load rk5, rk6 + aese v3.16b, 
v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + ldp q13, q14, [x6, #32] // load h2, h3 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + ldp q25, q26, [x8, #112] // load rk7, rk8 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + ext v14.16b, v14.16b, v14.16b, #8 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + ext v13.16b, v13.16b, v13.16b, #8 + ldr q15, [x6, #80] // load h4 + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + ext v15.16b, v15.16b, v15.16b, #8 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + ld1 { v11.16b}, [x3] + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + ext v11.16b, v11.16b, v11.16b, #8 + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + rev64 v11.16b, v11.16b + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + trn2 v17.2d, v14.2d, v15.2d // h4l | h3l + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + ldr q12, [x6] // load h1 + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + ext v12.16b, v12.16b, v12.16b, #8 + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 2 - round 4 + trn1 v9.2d, v14.2d, v15.2d // h4h | h3h + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + trn2 v16.2d, v12.2d, v13.2d // h2l | h1l + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 - round 5 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + aese v0.16b, v24.16b 
+ aesmc v0.16b, v0.16b // AES block 0 - round 6 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + cmp x17, #12 // setup flags for AES-128/192/256 check + b.lt Ldec_finish_first_blocks_eor3 // branch if AES-128 + ldp q27, q28, [x8, #144] // load rk9, rk10 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + b.eq Ldec_finish_first_blocks_eor3 // branch if AES-192 + ldp q27, q28, [x8, #176] // load rk11, rk12 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b 
// AES block 0 - round 12 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 +Ldec_finish_first_blocks_eor3: + ldr q27, [x19] // load rkN + cmp x0, x5 // check if we have <= 4 blocks + eor v17.16b, v17.16b, v9.16b // h4k | h3k + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v2.16b, v31.16b // AES block 2 - round N-1 + aese v3.16b, v31.16b // AES block 3 - round N-1 + aese v0.16b, v31.16b // AES block 0 - round N-1 + trn1 v8.2d, v12.2d, v13.2d // h2h | h1h + eor v16.16b, v16.16b, v8.16b // h2k | h1k + b.ge Ldec_tail_eor3 // handle tail + // Setup for AES blocks 0-3 is done purely on NEON side instead of mixing NEON and scalar instructions. + // This is because the final result of the AES block needs to be EORd with the final round key + // value (v30). This avoids several fmovs. + ldp q6, q7, [x0, #32] // AES blocks 2,3 load ciphertext + ldp q4, q5, [x0], #64 // AES blocks 0,1 load ciphertext + eor3 v0.16b, v4.16b, v0.16b, v27.16b // AES block 0 - result + eor3 v1.16b, v5.16b, v1.16b, v27.16b // AES block 1 - result + eor3 v2.16b, v6.16b, v2.16b, v27.16b // AES block 2 - result + eor3 v3.16b, v7.16b, v3.16b, v27.16b // AES block 3 - result + st1 { v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64 // AES blocks 0-3 - store result + ldr q0, [sp, #160] + ldr q1, [sp, #176] + ldr q2, [sp, #192] + ldr q3, [sp, #208] + rev w20, w9 + str w20, [sp, #172] + add w20, w9, #1 + rev w20, w20 + str w20, [sp, #188] + add w20, w9, #2 + rev w20, w20 + str w20, [sp, #204] + add w20, w9, #3 + rev w20, w20 + str w20, [sp, #220] + add w9, w9, #4 + cmp x0, x5 // check if we have <= 4 blocks + b.ge Ldec_prepretail_eor3 // do prepretail +Ldec_main_loop_eor3: // main loop start + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + 
aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + rev64 v4.16b, v4.16b // GHASH block 4k + rev64 v5.16b, v5.16b // GHASH block 4k+1 + rev64 v6.16b, v6.16b // GHASH block 4k+2 + rev64 v7.16b, v7.16b // GHASH block 4k+3 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v4.16b, v4.16b, v11.16b // PRE 1 + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + mov d10, v17.d[1] // GHASH block 4k - mid + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + mov d8, v4.d[1] // GHASH block 4k - mid + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + pmull 
v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + eor3 v11.16b, v11.16b, v8.16b, v5.16b // GHASH block 4k+1 - low & GHASH block 4k+2 - low + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + pmull2 v28.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + mov d6, v7.d[1] // GHASH block 4k+3 - mid + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + pmull v27.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor3 v10.16b, v10.16b, v4.16b, v8.16b // GHASH block 4k+1 - mid & GHASH block 4k+2 - mid + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid + eor3 v9.16b, v9.16b, v28.16b, v5.16b // GHASH block 4k+2 - high & GHASH block 4k+3 - high + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + aese v1.16b, v26.16b + aesmc 
v1.16b, v1.16b // AES block 4k+5 - round 8 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid + ldr d8, [sp, #128] + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + eor3 v10.16b, v10.16b, v6.16b, v7.16b // GHASH block 4k+3 - mid & MODULO - fold into mid + eor v11.16b, v11.16b, v27.16b // GHASH block 4k+3 - low + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor3 v10.16b, v10.16b, v6.16b, v9.16b // MODULO - karatsuba tidy up & MODULO - fold into mid + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + eor3 v11.16b, v11.16b, v8.16b, v10.16b // MODULO - fold into low + cmp w17, #12 // setup flags for AES-128/192/256 check + b.lt Ldec_main_loop_continue_eor3 // branch if AES-128 + ldp q27, q28, [x8, #144] // load rk9, rk10 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + b.eq Ldec_main_loop_continue_eor3 // branch if AES-192 + ldp q27, q28, [x8, #176] // load rk11, rk12 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v3.16b, 
v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 +Ldec_main_loop_continue_eor3: + ldr q27, [x19] // load rkN + ldp q6, q7, [x0, #32] // AES blocks 2,3 load ciphertext + ldp q4, q5, [x0], #64 // AES blocks 0,1 load ciphertext + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + eor3 v0.16b, v4.16b, v0.16b, v27.16b // AES block 4k+4 - result + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + eor3 v1.16b, v5.16b, v1.16b, v27.16b // AES block 4k+5 - result + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + eor3 v2.16b, v6.16b, v2.16b, v27.16b // AES block 4k+6 - result + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + eor3 v3.16b, v7.16b, v3.16b, v27.16b // AES block 4k+7 - result + st1 { v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64 // AES blocks 4k+4-7 - store result + ldr q0, [sp, #160] + ldr q1, [sp, #176] + ldr q2, [sp, #192] + ldr q3, [sp, #208] + rev w20, w9 + str w20, [sp, #172] + add w20, w9, #1 + rev w20, w20 + str w20, [sp, #188] + add w20, w9, #2 + rev w20, w20 + str w20, [sp, #204] + add w20, w9, #3 + rev w20, w20 + str w20, [sp, #220] + add w9, w9, #4 + cmp x0, x5 + b.lt Ldec_main_loop_eor3 +Ldec_prepretail_eor3: // PREPRETAIL + rev64 v4.16b, v4.16b // GHASH block 0 + rev64 v5.16b, v5.16b // GHASH block 1 + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev64 v6.16b, v6.16b // GHASH block 2 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + pmull v11.1q, v4.1d, v15.1d // GHASH block 0 - low + mov d8, v4.d[1] // GHASH block 0 - mid + pmull2 
v9.1q, v4.2d, v15.2d // GHASH block 0 - high + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + mov d10, v17.d[1] // GHASH block 0 - mid + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + eor v8.8b, v8.8b, v4.8b // GHASH block 0 - mid + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 1 - high + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + rev64 v7.16b, v7.16b // GHASH block 3 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + pmull v10.1q, v8.1d, v10.1d // GHASH block 0 - mid + eor v9.16b, v9.16b, v4.16b // GHASH block 1 - high + pmull v8.1q, v5.1d, v14.1d // GHASH block 1 - low + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + mov d4, v5.d[1] // GHASH block 1 - mid + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + eor v11.16b, v11.16b, v8.16b // GHASH block 1 - low + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + mov d8, v6.d[1] // GHASH block 2 - mid + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + eor v4.8b, v4.8b, v5.8b // GHASH block 1 - mid + pmull v5.1q, v6.1d, v13.1d // GHASH block 2 - low + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + eor v8.8b, v8.8b, v6.8b // GHASH block 2 - mid + pmull v4.1q, v4.1d, v17.1d // GHASH block 1 - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + eor v11.16b, v11.16b, v5.16b // GHASH block 2 - low + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 3 - high + eor v10.16b, v10.16b, v4.16b // GHASH block 1 - mid + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 2 - high + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 - 
round 5 + ins v8.d[1], v8.d[0] // GHASH block 2 - mid + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + eor3 v9.16b, v9.16b, v4.16b, v5.16b // GHASH block 2 - high & GHASH block 3 - high + pmull v4.1q, v7.1d, v12.1d // GHASH block 3 - low + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 2 - round 4 + mov d6, v7.d[1] // GHASH block 3 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 2 - mid + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + eor v6.8b, v6.8b, v7.8b // GHASH block 3 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + eor v10.16b, v10.16b, v8.16b // GHASH block 2 - mid + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 0 - round 6 + movi v8.8b, #0xc2 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + eor v11.16b, v11.16b, v4.16b // GHASH block 3 - low + pmull v6.1q, v6.1d, v16.1d // GHASH block 3 - mid + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + eor v10.16b, v10.16b, v6.16b // GHASH block 3 - mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + shl d8, d8, #56 // mod_constant + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + cmp w17, #12 // setup flags for AES-128/192/256 check + 
b.lt Ldec_finish_prepretail_eor3 // branch if AES-128 + ldp q27, q28, [x8, #144] // load rk9, rk10 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + b.eq Ldec_finish_prepretail_eor3 // branch if AES-192 + ldp q27, q28, [x8, #176] // load rk11, rk12 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 12 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 +Ldec_finish_prepretail_eor3: + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor3 v10.16b, v10.16b, v7.16b, v9.16b // MODULO - fold into mid + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + eor3 v11.16b, v11.16b, v8.16b, v10.16b // MODULO - fold into low + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v0.16b, v31.16b // AES block 0 - round N-1 + aese v3.16b, v31.16b // AES block 3 - round N-1 + aese v2.16b, 
v31.16b // AES block 2 - round N-1 +Ldec_tail_eor3: // TAIL + sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process + ld1 { v5.16b}, [x0], #16 // AES block 0 - load ciphertext + eor v0.16b, v5.16b, v0.16b // AES block 0 - result + mov x6, v0.d[0] // AES block 0 - mov low + mov x7, v0.d[1] // AES block 0 - mov high + ext v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag + eor x6, x6, x13 // AES block 0 - round N low + eor x7, x7, x14 // AES block 0 - round N high + cmp x5, #48 + b.gt Ldec_blocks_more_than_3_eor3 + mov v3.16b, v2.16b + movi v10.8b, #0 + movi v11.8b, #0 + movi v9.8b, #0 + mov v2.16b, v1.16b + cmp x5, #32 + b.gt Ldec_blocks_more_than_2_eor3 + mov v3.16b, v1.16b + cmp x5, #16 + b.gt Ldec_blocks_more_than_1_eor3 + b Ldec_blocks_less_than_1_eor3 +Ldec_blocks_more_than_3_eor3: // blocks left > 3 + rev64 v4.16b, v5.16b // GHASH final-3 block + ld1 { v5.16b}, [x0], #16 // AES final-2 block - load ciphertext + stp x6, x7, [x2], #16 // AES final-3 block - store result + mov d10, v17.d[1] // GHASH final-3 block - mid + eor v4.16b, v4.16b, v8.16b // feed in partial tag + eor v0.16b, v5.16b, v1.16b // AES final-2 block - result + mov d22, v4.d[1] // GHASH final-3 block - mid + mov x6, v0.d[0] // AES final-2 block - mov low + mov x7, v0.d[1] // AES final-2 block - mov high + eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid + movi v8.8b, #0 // suppress further partial tag feed in + pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high + pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid + eor x6, x6, x13 // AES final-2 block - round N low + pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low + eor x7, x7, x14 // AES final-2 block - round N high +Ldec_blocks_more_than_2_eor3: // blocks left > 2 + rev64 v4.16b, v5.16b // GHASH final-2 block + ld1 { v5.16b}, [x0], #16 // AES final-1 block - load ciphertext + eor v4.16b, v4.16b, v8.16b // feed in partial tag + stp x6, x7, [x2], #16 // AES final-2 block - store result + 
eor v0.16b, v5.16b, v2.16b // AES final-1 block - result + mov d22, v4.d[1] // GHASH final-2 block - mid + pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low + pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high + eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid + mov x6, v0.d[0] // AES final-1 block - mov low + mov x7, v0.d[1] // AES final-1 block - mov high + eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low + movi v8.8b, #0 // suppress further partial tag feed in + pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high + eor x6, x6, x13 // AES final-1 block - round N low + eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid + eor x7, x7, x14 // AES final-1 block - round N high +Ldec_blocks_more_than_1_eor3: // blocks left > 1 + stp x6, x7, [x2], #16 // AES final-1 block - store result + rev64 v4.16b, v5.16b // GHASH final-1 block + ld1 { v5.16b}, [x0], #16 // AES final block - load ciphertext + eor v4.16b, v4.16b, v8.16b // feed in partial tag + movi v8.8b, #0 // suppress further partial tag feed in + mov d22, v4.d[1] // GHASH final-1 block - mid + eor v0.16b, v5.16b, v3.16b // AES final block - result + pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high + eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid + pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low + mov x6, v0.d[0] // AES final block - mov low + ins v22.d[1], v22.d[0] // GHASH final-1 block - mid + mov x7, v0.d[1] // AES final block - mov high + pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid + eor x6, x6, x13 // AES final block - round N low + eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low + eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high + eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid + eor x7, x7, x14 // AES final block - round N high +Ldec_blocks_less_than_1_eor3: // blocks left <= 1 + add x10, x10, x1, lsr #7 // Calculate the updated 
counter based on the number of 16B chunks we processed + rev w10, w10 + str w10, [x16, #12] // store the updated counter + and x1, x1, #127 // bit_length %= 128 + mvn x14, xzr // rkN_h = 0xffffffffffffffff + sub x1, x1, #128 // bit_length -= 128 + mvn x13, xzr // rkN_l = 0xffffffffffffffff + ldp x4, x5, [x2] // load existing bytes we need to not overwrite + neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128]) + and x1, x1, #127 // bit_length %= 128 + lsr x14, x14, x1 // rkN_h is mask for top 64b of last block + cmp x1, #64 + csel x9, x13, x14, lt + csel x10, x14, xzr, lt + fmov d0, x9 // ctr0b is mask for last block + and x6, x6, x9 + mov v0.d[1], x10 + bic x4, x4, x9 // mask out low existing bytes + bic x5, x5, x10 // mask out high existing bytes + orr x6, x6, x4 + and x7, x7, x10 + orr x7, x7, x5 + and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits + rev64 v4.16b, v5.16b // GHASH final block + eor v4.16b, v4.16b, v8.16b // feed in partial tag + pmull v21.1q, v4.1d, v12.1d // GHASH final block - low + mov d8, v4.d[1] // GHASH final block - mid + eor v8.8b, v8.8b, v4.8b // GHASH final block - mid + pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high + pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final block - high + eor v11.16b, v11.16b, v21.16b // GHASH final block - low + eor v10.16b, v10.16b, v8.16b // GHASH final block - mid + ldr d8, [sp, #128] + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + eor v11.16b, v11.16b, v8.16b 
// MODULO - fold into low + stp x6, x7, [x2] + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b // Final Tag + mov x0, x15 + st1 { v11.16b }, [x3] // Store final tag + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp d8, d9, [sp, #64] + ldp d10, d11, [sp, #80] + ldp d12, d13, [sp, #96] + ldp d14, d15, [sp, #112] + ldp x29, x30, [sp], #224 + AARCH64_VALIDATE_LINK_REGISTER + ret + +#endif // __ARM_MAX_ARCH__ >= 8 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) diff --git a/third_party/boringssl/gen/bcm/aesv8-gcm-armv8-linux.S b/third_party/boringssl/gen/bcm/aesv8-gcm-armv8-linux.S new file mode 100644 index 00000000..9d99ec31 --- /dev/null +++ b/third_party/boringssl/gen/bcm/aesv8-gcm-armv8-linux.S @@ -0,0 +1,2907 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) +#if __ARM_MAX_ARCH__ >= 8 +.arch armv8.2-a+crypto+sha3 +.text +.globl aes_gcm_enc_kernel +.hidden aes_gcm_enc_kernel +.type aes_gcm_enc_kernel,%function +.align 4 +aes_gcm_enc_kernel: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-224]! 
+ mov x29, sp + ld1 { v0.16b}, [x4] // .Load initial counter block + stp x19, x20, [sp, #16] + mov v1.16b, v0.16b // Initialize ctr1-3 from ctr0 + mov v2.16b, v0.16b + mov v3.16b, v0.16b + mov x16, x4 // Pointer to counter block in memory + mov x8, x5 // Pointer to AES key schedule context + stp x21, x22, [sp, #32] + // [sp, #48] is unused but allocated to align the stack layout with aes_gcm_dec_kernel + stp d8, d9, [sp, #64] // Save Neon registers + stp d10, d11, [sp, #80] + stp d12, d13, [sp, #96] + stp d14, d15, [sp, #112] + ldr w17, [x8, #240] // .Load number of AES rounds + add x7, x8, x17, lsl #4 // Calculate pointer to the last round key + ldp x13, x14, [x7] // load round N key (for final XOR) + ldr q31, [x7, #-16] // load round N-1 key + add x4, x0, x1, lsr #3 // Calculate end of input + lsr x5, x1, #3 // Total byte length + mov x15, x5 + ldr w12, [x16, #12] // .Load counter's low 32 bits + sub x5, x5, #1 // byte_len - 1 + ldr q18, [x8, #0] // load rk0 + and x5, x5, #0xffffffffffffffc0 // Align main loop end to a multiple of 64 bytes + add x5, x5, x0 + rev w12, w12 // Reverse for big-endian increment + uxtw x10, w12 // Zero extend reversed w12 into x10 for final counter update + // Pre-compute this value instead of using two instructions to reconstruct it every iteration + mov x21, #0xc200000000000000 // GHASH reduction constant + str x21, [sp, #128] + // We maintain four copies of ctr values on the stack. Each loop iteration we + // store the updated ctr value to the last four bytes (e.g., 160 + 12). + // We then load the four values. This avoids a singificant number of + // expensive GPR->NEON and NEON->NEON moves. To avoid LDST forwarding we + // calculate and store the values one iteration ahead so they have time to + // drain before we load them. + str q0, [sp, #160] // Store base counter for block 0-3 + str q0, [sp, #176] + str q0, [sp, #192] + str q0, [sp, #208] + // Since we need the values right away don't go through the stack this first + // time. 
Manually insert the incremented big-endian counter values. + rev w20, w12 + mov v0.s[3], w20 // ctr0 + 0 + add w20, w12, #1 + rev w20, w20 + mov v1.s[3], w20 // ctr0 + 1 + add w20, w12, #2 + rev w20, w20 + mov v2.s[3], w20 // ctr0 + 2 + add w20, w12, #3 + rev w20, w20 + mov v3.s[3], w20 // ctr0 + 3 + // Calculate the ctr values for the *next* (not current) group of four + // blocks. Store the incremented parts to the stack. + add w20, w12, #4 + rev w20, w20 + str w20, [sp, #172] // ctr0 + 4 for next iter + add w20, w12, #5 + rev w20, w20 + str w20, [sp, #188] // ctr0 + 5 for next iter + add w20, w12, #6 + rev w20, w20 + str w20, [sp, #204] // ctr0 + 6 for next iter + add w20, w12, #7 + rev w20, w20 + str w20, [sp, #220] // ctr0 + 7 for next iter + add w12, w12, #8 // Advance counter past these two sets + // --- Start AES for first 4 blocks --- + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + ldp q19, q20, [x8, #16] // load rk1, rk2 + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + ldp q21, q22, [x8, #48] // load rk3, rk4 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + ldp q23, q24, [x8, #80] // load rk5, rk6 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + ldp q13, q14, [x6, #32] // load H2, H3 (GHASH keys) + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + ldp q25, q26, [x8, #112] // load rk7, rk8 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + ext v14.16b, v14.16b, v14.16b, #8 // Byte swap H3 for GHASH + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + ext v13.16b, v13.16b, v13.16b, #8 // Byte swap H2 for GHASH + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + ldr q15, 
[x6, #80] // load H4 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + ext v15.16b, v15.16b, v15.16b, #8 // Byte swap H4 for GHASH + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + ld1 { v11.16b}, [x3] // .Load initial GHASH accumulator (T) + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + ext v11.16b, v11.16b, v11.16b, #8 // Byte swap T for GHASH + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + rev64 v11.16b, v11.16b // Correct byte order within 64-bit lanes + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + trn2 v17.2d, v14.2d, v15.2d // Karatsuba key: H4_low | H3_low + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + ldr q12, [x6] // load H1 + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + ext v12.16b, v12.16b, v12.16b, #8 // Byte swap H1 for GHASH + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 2 - round 4 + trn1 v9.2d, v14.2d, v15.2d // Karatsuba key: H4_high | H3_high + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + trn2 v16.2d, v12.2d, v13.2d // Karatsuba key: H2_low | H1_low + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + ldr q30, [x7] // Preload round N key for final EOR + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 - round 5 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 0 - round 6 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + aese v2.16b, v25.16b + aesmc v2.16b, 
v2.16b // AES block 2 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + cmp x17, #12 // setup flags for AES-128/192/256 check + b.lt .Lenc_finish_first_blocks // branch if AES-128 + ldp q27, q28, [x8, #144] // load rk9, rk10 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + b.eq .Lenc_finish_first_blocks // branch if AES-192 + ldp q27, q28, [x8, #176] // load rk11, rk12 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 12 +.Lenc_finish_first_blocks: + cmp x0, x5 // check if we have <= 4 blocks to process in the tail + eor v17.16b, v17.16b, v9.16b // Karatsuba key: H3^H4 + aese v0.16b, v31.16b // AES block 0 - round N-1 + aese 
v1.16b, v31.16b // AES block 1 - round N-1 + aese v2.16b, v31.16b // AES block 2 - round N-1 + aese v3.16b, v31.16b // AES block 3 - round N-1 + trn1 v8.2d, v12.2d, v13.2d // Karatsuba key: H2_high | H1_high + eor v16.16b, v16.16b, v8.16b // Karatsuba key: H1^H2 + b.ge .Lenc_tail // handle tail if no more full 4-block sets + ldp q6, q7, [x0, #32] // AES blocks 2,3 load plaintext + ldp q4, q5, [x0], #64 // AES blocks 0,1 load plaintext + // Compute and store first 4 ciphertext blocks + eor v4.16b, v4.16b, v30.16b + eor v4.16b, v4.16b, v0.16b // AES block 0 - result = PT ^ AES(ctr0) + eor v5.16b, v5.16b, v30.16b + eor v5.16b, v5.16b, v1.16b // AES block 1 - result = PT ^ AES(ctr1) + eor v6.16b, v6.16b, v30.16b + eor v6.16b, v6.16b, v2.16b // AES block 2 - result = PT ^ AES(ctr2) + eor v7.16b, v7.16b, v30.16b + eor v7.16b, v7.16b, v3.16b // AES block 3 - result = PT ^ AES(ctr3) + st1 { v4.16b, v5.16b, v6.16b, v7.16b}, [x2], #64 // AES blocks 0-3 - store result + // Load counter values for the second iteration from the stack + ldp q0, q1, [sp, #160] + ldp q2, q3, [sp, #192] + // Prepare and store counter values for the third iteration + rev w20, w12 + str w20, [sp, #172] // ctr + 8 + add w20, w12, #1 + rev w20, w20 + str w20, [sp, #188] // ctr + 9 + add w20, w12, #2 + rev w20, w20 + str w20, [sp, #204] // ctr + 10 + add w20, w12, #3 + rev w20, w20 + str w20, [sp, #220] // ctr + 11 + add w12, w12, #4 // Advance counter base + cmp x0, x5 // check if we have <= 4 blocks remaining + b.ge .Lenc_prepretail // go to prepretail if < 2 full loops left +.Lenc_main_loop: // main loop start (processes 4 blocks per iteration) + // --- AES Pipeline for blocks 4k+4 to 4k+7 --- + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + ldr d8, [sp, 
#128] // .Load GHASH reduction constant + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + // --- GHASH Pipeline (interleaved with AES) for blocks 4k to 4k+3 --- + rev64 v4.16b, v4.16b // GHASH block 4k - Byte swap CT + rev64 v5.16b, v5.16b // GHASH block 4k+1 - Byte swap CT + rev64 v6.16b, v6.16b // GHASH block 4k+2 - Byte swap CT + rev64 v7.16b, v7.16b // GHASH block 4k+3 - Byte swap CT + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + ext v11.16b, v11.16b, v11.16b, #8 // GHASH - prepare acc for XOR + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v4.16b, v4.16b, v11.16b // GHASH block 4k - Y_i = CT_i ^ Y_{i-1} + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + pmull2 v9.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + mov d10, v17.d[1] // GHASH block 4k - mid Karatsuba key + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + mov d20, v4.d[1] // GHASH block 4k - mid + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + eor v20.8b, v20.8b, v4.8b // GHASH block 4k - mid + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + mov d21, v5.d[1] // GHASH block 4k+1 - mid + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + eor v21.8b, v21.8b, v5.8b // GHASH block 4k+1 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + pmull v10.1q, v20.1d, v10.1d // GHASH block 4k - mid + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 
- round 4 + pmull v21.1q, v21.1d, v17.1d // GHASH block 4k+1 - mid + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + eor v10.16b, v10.16b, v21.16b // GHASH block 4k+1 - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + ext v22.16b, v22.16b, v6.16b, #8 // GHASH block 4k+2 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + eor v22.16b, v22.16b, v6.16b // GHASH block 4k+2 - mid + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + pmull2 v22.1q, v22.2d, v16.2d // GHASH block 4k+2 - mid + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + mov d23, v7.d[1] // GHASH block 4k+3 - mid + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + eor v23.8b, v23.8b, v7.8b // GHASH block 4k+3 - mid + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + pmull v23.1q, v23.1d, v16.1d // GHASH block 4k+3 - mid + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + eor v10.16b, v10.16b, v22.16b + eor v10.16b, v10.16b, v23.16b // GHASH block 4k+2/3 - mid + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + pmull2 v22.1q, v4.2d, v15.2d // GHASH block 4k - high + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + pmull2 v21.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + eor v22.16b, v22.16b, v21.16b // GHASH block 4k+3 - high + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + pmull2 v23.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + pmull v21.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + pmull v20.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + eor v9.16b, v9.16b, v22.16b + eor v9.16b, v9.16b, v23.16b // GHASH block 4k/1/2/3 - high + pmull v22.1q, v7.1d, v12.1d // GHASH block 4k+3 - 
low + ldp q6, q7, [x0, #32] + ldp q4, q5, [x0], #64 + eor v20.16b, v20.16b, v21.16b // GHASH block 4k+1 - low + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + eor v11.16b, v11.16b, v22.16b + eor v11.16b, v11.16b, v20.16b // GHASH block 4k/1/2/3 - low + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + eor v10.16b, v10.16b, v9.16b + eor v10.16b, v10.16b, v11.16b // MODULO - karatsuba tidy up + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + pmull v20.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + cmp x17, #12 // setup flags for AES-128/192/256 check + b.lt .Lenc_main_loop_continue // branch if AES-128 + ldp q27, q28, [x8, #144] // load rk9, rk10 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + b.eq .Lenc_main_loop_continue // branch if AES-192 + ldp q27, q28, [x8, #176] // load rk11, rk12 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v2.16b, v28.16b + aesmc v2.16b, 
v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 +.Lenc_main_loop_continue: + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v20.16b + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v20.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + eor v11.16b, v9.16b, v11.16b + eor v11.16b, v11.16b, v20.16b // MODULO - fold into low + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + eor v4.16b, v4.16b, v30.16b + eor v4.16b, v4.16b, v0.16b // AES block 4k+4 - result + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + eor v5.16b, v5.16b, v30.16b + eor v5.16b, v5.16b, v1.16b // AES block 4k+5 - result + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + eor v6.16b, v6.16b, v30.16b + eor v6.16b, v6.16b, v2.16b // AES block 4k+6 - result + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + eor v7.16b, v7.16b, v30.16b + eor v7.16b, v7.16b, v3.16b // AES block 4k+7 - result + ldp q0, q1, [sp, #160] + ldp q2, q3, [sp, #192] + // We used these registers as temporaries above so reload the RKs. 
+ ldp q20, q21, [x8, #32] // load rk2, rk3 + ldp q22, q23, [x8, #64] // load rk4, rk5 + st1 { v4.16b, v5.16b, v6.16b, v7.16b}, [x2], #64 // AES blocks 4k+4-7 - store result + rev w20, w12 + str w20, [sp, #172] + add w20, w12, #1 + rev w20, w20 + str w20, [sp, #188] + add w20, w12, #2 + rev w20, w20 + str w20, [sp, #204] + add w20, w12, #3 + rev w20, w20 + str w20, [sp, #220] + add w12, w12, #4 + cmp x0, x5 // .LOOP CONTROL + b.lt .Lenc_main_loop +.Lenc_prepretail: // PREPRETAIL + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + rev64 v6.16b, v6.16b // GHASH block 2 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + rev64 v4.16b, v4.16b // GHASH block 0 + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev64 v5.16b, v5.16b // GHASH block 1 + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + mov d10, v17.d[1] // GHASH block 0 - mid Karatsuba key + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + pmull v11.1q, v4.1d, v15.1d // GHASH block 0 - low + mov d8, v4.d[1] // GHASH block 0 - mid + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 0 - high + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + eor v8.8b, v8.8b, v4.8b // GHASH block 0 - mid + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + pmull v10.1q, v8.1d, v10.1d // GHASH block 0 - mid + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 1 - high + pmull v8.1q, v5.1d, v14.1d 
// GHASH block 1 - low + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + eor v9.16b, v9.16b, v4.16b // GHASH block 1 - high + mov d4, v5.d[1] // GHASH block 1 - mid + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + eor v11.16b, v11.16b, v8.16b // GHASH block 1 - low + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + eor v4.8b, v4.8b, v5.8b // GHASH block 1 - mid + mov d8, v6.d[1] // GHASH block 2 - mid + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + rev64 v7.16b, v7.16b // GHASH block 3 + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + pmull v4.1q, v4.1d, v17.1d // GHASH block 1 - mid + eor v8.8b, v8.8b, v6.8b // GHASH block 2 - mid + pmull v5.1q, v6.1d, v13.1d // GHASH block 2 - low + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 - round 5 + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 2 - round 4 + eor v10.16b, v10.16b, v4.16b // GHASH block 1 - mid + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 2 - high + eor v11.16b, v11.16b, v5.16b // GHASH block 2 - low + ins v8.d[1], v8.d[0] // GHASH block 2 - mid + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + eor v9.16b, v9.16b, v4.16b // GHASH block 2 - high + mov d4, v7.d[1] // GHASH block 3 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 2 - mid + eor v4.8b, v4.8b, v7.8b // GHASH block 3 - mid + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 3 - high + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + pmull v4.1q, v4.1d, v16.1d // GHASH block 3 - mid + eor v10.16b, v10.16b, v8.16b // GHASH block 2 - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 
0 - round 6 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + eor v9.16b, v9.16b, v5.16b // GHASH block 3 - high + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + ldr d8, [sp, #128] + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + eor v10.16b, v10.16b, v4.16b // GHASH block 3 - mid + pmull v6.1q, v7.1d, v12.1d // GHASH block 3 - low + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + cmp x17, #12 // setup flags for AES-128/192/256 check + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + eor v11.16b, v11.16b, v6.16b // GHASH block 3 - low + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + eor v10.16b, v10.16b, v9.16b // karatsuba tidy up + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + pmull v4.1q, v9.1d, v8.1d + ext v9.16b, v9.16b, v9.16b, #8 + eor v10.16b, v10.16b, v11.16b + b.lt .Lenc_finish_prepretail // branch if AES-128 + ldp q27, q28, [x8, #144] // load rk9, rk10 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + b.eq .Lenc_finish_prepretail // branch if AES-192 + ldp q27, q28, [x8, #176] // load rk11, rk12 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + 
aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 12 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 +.Lenc_finish_prepretail: + aese v0.16b, v31.16b // AES block 0 - round N-1 + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v2.16b, v31.16b // AES block 2 - round N-1 + aese v3.16b, v31.16b // AES block 3 - round N-1 + eor v10.16b, v10.16b, v4.16b + eor v10.16b, v10.16b, v9.16b + pmull v4.1q, v10.1d, v8.1d + ext v10.16b, v10.16b, v10.16b, #8 + eor v11.16b, v11.16b, v4.16b + eor v11.16b, v11.16b, v10.16b +.Lenc_tail: // TAIL: Process remaining 0 to 3 blocks + ext v8.16b, v11.16b, v11.16b, #8 // Save current GHASH state for partial tag feed-in + sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process + ldp x6, x7, [x0], #16 // AES block 0 - load plaintext + eor x6, x6, x13 // AES block 0 - round N low + eor x7, x7, x14 // AES block 0 - round N high + cmp x5, #48 + fmov d4, x6 // AES block 0 - mov low + fmov v4.d[1], x7 // AES block 0 - mov high + eor v5.16b, v4.16b, v0.16b // AES block 0 - result + b.gt .Lenc_blocks_more_than_3 + cmp x5, #32 + mov v3.16b, v2.16b + movi v11.8b, #0 + movi v9.8b, #0 + mov v2.16b, v1.16b + movi v10.8b, #0 + b.gt .Lenc_blocks_more_than_2 + mov v3.16b, v1.16b + cmp x5, #16 + b.gt .Lenc_blocks_more_than_1 + b .Lenc_blocks_less_than_1 +.Lenc_blocks_more_than_3: // blocks left > 3 + st1 { v5.16b}, [x2], #16 // AES final-2 block - store result + ldp x6, x7, [x0], #16 // AES final-2 block - load input low & high + rev64 v4.16b, v5.16b // GHASH final-3 block + eor x6, x6, x13 // AES final-2 block - round N low + eor v4.16b, v4.16b, v8.16b // feed in partial tag + eor x7, x7, x14 // AES final-2 block 
- round N high + mov d22, v4.d[1] // GHASH final-3 block - mid + fmov d5, x6 // AES final-2 block - mov low + fmov v5.d[1], x7 // AES final-2 block - mov high + eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid + movi v8.8b, #0 // suppress further partial tag feed in + mov d10, v17.d[1] // GHASH final-3 block - mid + pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low + pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high + pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid + eor v5.16b, v5.16b, v1.16b // AES final-2 block - result +.Lenc_blocks_more_than_2: // blocks left > 2 + st1 { v5.16b}, [x2], #16 // AES final-2 block - store result + ldp x6, x7, [x0], #16 // AES final-1 block - load input low & high + rev64 v4.16b, v5.16b // GHASH final-2 block + eor x6, x6, x13 // AES final-1 block - round N low + eor v4.16b, v4.16b, v8.16b // feed in partial tag + fmov d5, x6 // AES final-1 block - mov low + eor x7, x7, x14 // AES final-1 block - round N high + fmov v5.d[1], x7 // AES final-1 block - mov high + movi v8.8b, #0 // suppress further partial tag feed in + pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high + mov d22, v4.d[1] // GHASH final-2 block - mid + pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low + eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid + eor v5.16b, v5.16b, v2.16b // AES final-1 block - result + eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high + pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid + eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low + eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid +.Lenc_blocks_more_than_1: // blocks left > 1 + st1 { v5.16b}, [x2], #16 // AES final-1 block - store result + rev64 v4.16b, v5.16b // GHASH final-1 block: Byte Swap CT + ldp x6, x7, [x0], #16 // AES final block - load plaintext + eor v4.16b, v4.16b, v8.16b // Feed in partial tag + movi v8.8b, #0 // Clear for next block + eor x6, x6, x13 // AES final block - round N low + 
mov d22, v4.d[1] // GHASH final-1 block - mid + pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high + eor x7, x7, x14 // AES final block - round N high + eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high + ins v22.d[1], v22.d[0] // GHASH final-1 block - mid + fmov d5, x6 // AES final block - mov low + fmov v5.d[1], x7 // AES final block - mov high + pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid + pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low + eor v5.16b, v5.16b, v3.16b // AES final block - result + eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid + eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low +.Lenc_blocks_less_than_1: // .Last partial block handling + add x10, x10, x1, lsr #7 // Calculate the updated counter based on the number of 16B chunks we processed + rev w10, w10 + str w10, [x16, #12] // store the updated counter + and x1, x1, #127 // bit_length %= 128 + mvn x13, xzr // Mask for low 64 bits + sub x1, x1, #128 // + neg x1, x1 // Valid bits in the last block (1-128) + ldr q18, [x2] // .Load destination for merging + mvn x14, xzr // Mask for high 64 bits + and x1, x1, #127 // bit_length %= 128 + lsr x14, x14, x1 // rkN_h is mask for top 64b of last block + cmp x1, #64 + csel x6, x13, x14, lt + csel x7, x14, xzr, lt + fmov d0, x6 // ctr0d is mask for last block + fmov v0.d[1], x7 + and v5.16b, v5.16b, v0.16b // Mask out unused bits of the last CT block + rev64 v4.16b, v5.16b // GHASH final block - byte swap + eor v4.16b, v4.16b, v8.16b // Feed in partial tag + bif v5.16b, v18.16b, v0.16b // Bitwise Insert: merge with existing data at output_ptr + pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high + mov d8, v4.d[1] // GHASH final block - mid + pmull v21.1q, v4.1d, v12.1d // GHASH final block - low + eor v9.16b, v9.16b, v20.16b // GHASH final block - high + eor v8.8b, v8.8b, v4.8b // GHASH final block - mid + pmull v8.1q, v8.1d, v16.1d // 
GHASH final block - mid + eor v11.16b, v11.16b, v21.16b // GHASH final block - low + eor v10.16b, v10.16b, v8.16b // GHASH final block - mid + eor v4.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + fmov d8, x21 + eor v10.16b, v10.16b, v4.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + st1 { v5.16b}, [x2] // store all 16B + eor v11.16b, v11.16b, v9.16b + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 // Byte swap GHASH result + rev64 v11.16b, v11.16b // Final Tag + mov x0, x15 + st1 { v11.16b }, [x3] // Store final tag + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp d8, d9, [sp, #64] + ldp d10, d11, [sp, #80] + ldp d12, d13, [sp, #96] + ldp d14, d15, [sp, #112] + ldp x29, x30, [sp], #224 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size aes_gcm_enc_kernel,.-aes_gcm_enc_kernel +.globl aes_gcm_dec_kernel +.hidden aes_gcm_dec_kernel +.type aes_gcm_dec_kernel,%function +.align 4 +aes_gcm_dec_kernel: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-224]! 
+ mov x29, sp + stp x19, x20, [sp, #16] + ld1 { v0.16b}, [x4] + mov v1.16b, v0.16b + mov v2.16b, v0.16b + mov v3.16b, v0.16b + mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + stp d8, d9, [sp, #64] + stp d10, d11, [sp, #80] + stp d12, d13, [sp, #96] + stp d14, d15, [sp, #112] + ldr w17, [x8, #240] // .Load number of AES rounds + add x19, x8, x17, lsl #4 // borrow input_l1 for last key + ldp x13, x14, [x19] // load round N keys + ldr q31, [x19, #-16] // load round N-1 keys + add x4, x0, x1, lsr #3 // end_input_ptr + lsr x5, x1, #3 // byte_len + mov x15, x5 + ldr w9, [x16, #12] // .Load scalar 32-bit counter (CTR) + sub x5, x5, #1 // byte_len - 1 + ldr q18, [x8, #0] // load rk0 + and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + add x5, x5, x0 + rev w9, w9 // Reverse it once for big-endian incrementing + uxtw x10, w9 // Zero extend reversed w9 into x10 + str q0, [sp, #160] + str q0, [sp, #176] + str q0, [sp, #192] + str q0, [sp, #208] + rev w20, w9 + mov v0.s[3], w20 + add w20, w9, #1 + rev w20, w20 + mov v1.s[3], w20 + add w20, w9, #2 + rev w20, w20 + mov v2.s[3], w20 + add w20, w9, #3 + rev w20, w20 + mov v3.s[3], w20 + add w20, w9, #4 + rev w20, w20 + str w20, [sp, #172] + add w20, w9, #5 + rev w20, w20 + str w20, [sp, #188] + add w20, w9, #6 + rev w20, w20 + str w20, [sp, #204] + add w20, w9, #7 + rev w20, w20 + str w20, [sp, #220] + add w9, w9, #8 + // Pre-compute this value instead of using two instructions for moving and then shifting in the main loop + mov x21, #0xc200000000000000 + str x21, [sp, #128] + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + ldp q19, q20, [x8, #16] // load rk1, rk2 + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + ldp q21, q22, [x8, #48] // load rk3, rk4 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + ldp q23, q24, [x8, #80] // load rk5, rk6 + aese v3.16b, 
v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + ldp q13, q14, [x6, #32] // load h2, h3 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + ldp q25, q26, [x8, #112] // load rk7, rk8 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + ext v14.16b, v14.16b, v14.16b, #8 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + ext v13.16b, v13.16b, v13.16b, #8 + ldr q15, [x6, #80] // load h4 + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + ext v15.16b, v15.16b, v15.16b, #8 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + ld1 { v11.16b}, [x3] + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + ext v11.16b, v11.16b, v11.16b, #8 + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + rev64 v11.16b, v11.16b + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + trn2 v17.2d, v14.2d, v15.2d // h4l | h3l + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + ldr q12, [x6] // load h1 + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + ext v12.16b, v12.16b, v12.16b, #8 + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 2 - round 4 + trn1 v9.2d, v14.2d, v15.2d // h4h | h3h + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + trn2 v16.2d, v12.2d, v13.2d // h2l | h1l + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 - round 5 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + aese v0.16b, v24.16b 
+ aesmc v0.16b, v0.16b // AES block 0 - round 6 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + cmp x17, #12 // setup flags for AES-128/192/256 check + b.lt .Ldec_finish_first_blocks // branch if AES-128 + ldp q27, q28, [x8, #144] // load rk9, rk10 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + b.eq .Ldec_finish_first_blocks // branch if AES-192 + ldp q27, q28, [x8, #176] // load rk11, rk12 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES 
block 0 - round 12 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 +.Ldec_finish_first_blocks: + ldr q27, [x19] // load rkN + cmp x0, x5 // check if we have <= 4 blocks + eor v17.16b, v17.16b, v9.16b // h4k | h3k + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v2.16b, v31.16b // AES block 2 - round N-1 + aese v3.16b, v31.16b // AES block 3 - round N-1 + aese v0.16b, v31.16b // AES block 0 - round N-1 + trn1 v8.2d, v12.2d, v13.2d // h2h | h1h + eor v16.16b, v16.16b, v8.16b // h2k | h1k + b.ge .Ldec_tail // handle tail + // Setup for AES blocks 0-3 is done purely on NEON side instead of mixing NEON and scalar instructions. + // This is because the final result of the AES block needs to be EORd with the final round key + // value (v30). This avoids several fmovs. + ldp q6, q7, [x0, #32] // AES blocks 2,3 load ciphertext + ldp q4, q5, [x0], #64 // AES blocks 0,1 load ciphertext + eor v0.16b, v4.16b, v0.16b + eor v0.16b, v0.16b, v27.16b // AES block 0 - result + eor v1.16b, v5.16b, v1.16b + eor v1.16b, v1.16b, v27.16b // AES block 1 - result + eor v2.16b, v6.16b, v2.16b + eor v2.16b, v2.16b, v27.16b // AES block 2 - result + eor v3.16b, v7.16b, v3.16b + eor v3.16b, v3.16b, v27.16b // AES block 3 - result + st1 { v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64 // AES blocks 0-3 - store result + ldr q0, [sp, #160] + ldr q1, [sp, #176] + ldr q2, [sp, #192] + ldr q3, [sp, #208] + rev w20, w9 + str w20, [sp, #172] + add w20, w9, #1 + rev w20, w20 + str w20, [sp, #188] + add w20, w9, #2 + rev w20, w20 + str w20, [sp, #204] + add w20, w9, #3 + rev w20, w20 + str w20, [sp, #220] + add w9, w9, #4 + cmp x0, x5 // check if we have <= 4 blocks + b.ge .Ldec_prepretail // do prepretail +.Ldec_main_loop: // main loop start + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + aese v1.16b, 
v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + rev64 v4.16b, v4.16b // GHASH block 4k + rev64 v5.16b, v5.16b // GHASH block 4k+1 + rev64 v6.16b, v6.16b // GHASH block 4k+2 + rev64 v7.16b, v7.16b // GHASH block 4k+3 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v4.16b, v4.16b, v11.16b // PRE 1 + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + mov d10, v17.d[1] // GHASH block 4k - mid + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + mov d8, v4.d[1] // GHASH block 4k - mid + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v3.16b, v22.16b + 
aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + eor v11.16b, v11.16b, v8.16b + eor v11.16b, v11.16b, v5.16b // GHASH block 4k+1 - low & GHASH block 4k+2 - low + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + pmull2 v28.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + mov d6, v7.d[1] // GHASH block 4k+3 - mid + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + pmull v27.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor v10.16b, v10.16b, v4.16b + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+1 - mid & GHASH block 4k+2 - mid + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid + eor v9.16b, v9.16b, v28.16b + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+2 - high & GHASH 
block 4k+3 - high + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid + ldr d8, [sp, #128] + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + eor v10.16b, v10.16b, v6.16b + eor v10.16b, v10.16b, v7.16b // GHASH block 4k+3 - mid & MODULO - fold into mid + eor v11.16b, v11.16b, v27.16b // GHASH block 4k+3 - low + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v6.16b + eor v10.16b, v10.16b, v9.16b // MODULO - karatsuba tidy up & MODULO - fold into mid + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + eor v11.16b, v11.16b, v8.16b + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + cmp w17, #12 // setup flags for AES-128/192/256 check + b.lt .Ldec_main_loop_continue // branch if AES-128 + ldp q27, q28, [x8, #144] // load rk9, rk10 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + b.eq .Ldec_main_loop_continue // branch if AES-192 + ldp q27, q28, [x8, #176] // load rk11, rk12 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 
- round 11 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 +.Ldec_main_loop_continue: + ldr q27, [x19] // load rkN + ldp q6, q7, [x0, #32] // AES blocks 2,3 load ciphertext + ldp q4, q5, [x0], #64 // AES blocks 0,1 load ciphertext + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + eor v0.16b, v4.16b, v0.16b + eor v0.16b, v0.16b, v27.16b // AES block 4k+4 - result + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + eor v1.16b, v5.16b, v1.16b + eor v1.16b, v1.16b, v27.16b // AES block 4k+5 - result + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + eor v2.16b, v6.16b, v2.16b + eor v2.16b, v2.16b, v27.16b // AES block 4k+6 - result + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + eor v3.16b, v7.16b, v3.16b + eor v3.16b, v3.16b, v27.16b // AES block 4k+7 - result + st1 { v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64 // AES blocks 4k+4-7 - store result + ldr q0, [sp, #160] + ldr q1, [sp, #176] + ldr q2, [sp, #192] + ldr q3, [sp, #208] + rev w20, w9 + str w20, [sp, #172] + add w20, w9, #1 + rev w20, w20 + str w20, [sp, #188] + add w20, w9, #2 + rev w20, w20 + str w20, [sp, #204] + add w20, w9, #3 + rev w20, w20 + str w20, [sp, #220] + add w9, w9, #4 + cmp x0, x5 + b.lt .Ldec_main_loop +.Ldec_prepretail: // PREPRETAIL + rev64 v4.16b, v4.16b // GHASH block 0 + rev64 v5.16b, v5.16b // GHASH block 1 + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + eor 
v4.16b, v4.16b, v11.16b // PRE 1 + rev64 v6.16b, v6.16b // GHASH block 2 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + pmull v11.1q, v4.1d, v15.1d // GHASH block 0 - low + mov d8, v4.d[1] // GHASH block 0 - mid + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 0 - high + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + mov d10, v17.d[1] // GHASH block 0 - mid + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + eor v8.8b, v8.8b, v4.8b // GHASH block 0 - mid + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 1 - high + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + rev64 v7.16b, v7.16b // GHASH block 3 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + pmull v10.1q, v8.1d, v10.1d // GHASH block 0 - mid + eor v9.16b, v9.16b, v4.16b // GHASH block 1 - high + pmull v8.1q, v5.1d, v14.1d // GHASH block 1 - low + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + mov d4, v5.d[1] // GHASH block 1 - mid + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + eor v11.16b, v11.16b, v8.16b // GHASH block 1 - low + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + mov d8, v6.d[1] // GHASH block 2 - mid + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + eor v4.8b, v4.8b, v5.8b // GHASH block 1 - mid + pmull v5.1q, v6.1d, v13.1d // GHASH block 2 - low + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + eor v8.8b, v8.8b, v6.8b // GHASH block 2 - mid + pmull v4.1q, v4.1d, v17.1d // GHASH block 1 - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + eor v11.16b, v11.16b, v5.16b // GHASH block 2 - low + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // 
AES block 3 - round 4 + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 3 - high + eor v10.16b, v10.16b, v4.16b // GHASH block 1 - mid + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 2 - high + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 - round 5 + ins v8.d[1], v8.d[0] // GHASH block 2 - mid + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + eor v9.16b, v9.16b, v4.16b + eor v9.16b, v9.16b, v5.16b // GHASH block 2 - high & GHASH block 3 - high + pmull v4.1q, v7.1d, v12.1d // GHASH block 3 - low + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 2 - round 4 + mov d6, v7.d[1] // GHASH block 3 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 2 - mid + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + eor v6.8b, v6.8b, v7.8b // GHASH block 3 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + eor v10.16b, v10.16b, v8.16b // GHASH block 2 - mid + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 0 - round 6 + movi v8.8b, #0xc2 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + eor v11.16b, v11.16b, v4.16b // GHASH block 3 - low + pmull v6.1q, v6.1d, v16.1d // GHASH block 3 - mid + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + eor v10.16b, v10.16b, v6.16b // GHASH block 3 - mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + aese v1.16b, v26.16b + aesmc v1.16b, 
v1.16b // AES block 1 - round 8 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + shl d8, d8, #56 // mod_constant + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + cmp w17, #12 // setup flags for AES-128/192/256 check + b.lt .Ldec_finish_prepretail // branch if AES-128 + ldp q27, q28, [x8, #144] // load rk9, rk10 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + b.eq .Ldec_finish_prepretail // branch if AES-192 + ldp q27, q28, [x8, #176] // load rk11, rk12 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 12 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 +.Ldec_finish_prepretail: + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, 
v10.16b, #8 // MODULO - other mid alignment + eor v11.16b, v11.16b, v8.16b + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v0.16b, v31.16b // AES block 0 - round N-1 + aese v3.16b, v31.16b // AES block 3 - round N-1 + aese v2.16b, v31.16b // AES block 2 - round N-1 +.Ldec_tail: // TAIL + sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process + ld1 { v5.16b}, [x0], #16 // AES block 0 - load ciphertext + eor v0.16b, v5.16b, v0.16b // AES block 0 - result + mov x6, v0.d[0] // AES block 0 - mov low + mov x7, v0.d[1] // AES block 0 - mov high + ext v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag + eor x6, x6, x13 // AES block 0 - round N low + eor x7, x7, x14 // AES block 0 - round N high + cmp x5, #48 + b.gt .Ldec_blocks_more_than_3 + mov v3.16b, v2.16b + movi v10.8b, #0 + movi v11.8b, #0 + movi v9.8b, #0 + mov v2.16b, v1.16b + cmp x5, #32 + b.gt .Ldec_blocks_more_than_2 + mov v3.16b, v1.16b + cmp x5, #16 + b.gt .Ldec_blocks_more_than_1 + b .Ldec_blocks_less_than_1 +.Ldec_blocks_more_than_3: // blocks left > 3 + rev64 v4.16b, v5.16b // GHASH final-3 block + ld1 { v5.16b}, [x0], #16 // AES final-2 block - load ciphertext + stp x6, x7, [x2], #16 // AES final-3 block - store result + mov d10, v17.d[1] // GHASH final-3 block - mid + eor v4.16b, v4.16b, v8.16b // feed in partial tag + eor v0.16b, v5.16b, v1.16b // AES final-2 block - result + mov d22, v4.d[1] // GHASH final-3 block - mid + mov x6, v0.d[0] // AES final-2 block - mov low + mov x7, v0.d[1] // AES final-2 block - mov high + eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid + movi v8.8b, #0 // suppress further partial tag feed in + pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high + pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid + eor x6, x6, x13 // AES final-2 block - round N low + pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low + eor x7, x7, x14 // AES final-2 block - round N high 
+.Ldec_blocks_more_than_2: // blocks left > 2 + rev64 v4.16b, v5.16b // GHASH final-2 block + ld1 { v5.16b}, [x0], #16 // AES final-1 block - load ciphertext + eor v4.16b, v4.16b, v8.16b // feed in partial tag + stp x6, x7, [x2], #16 // AES final-2 block - store result + eor v0.16b, v5.16b, v2.16b // AES final-1 block - result + mov d22, v4.d[1] // GHASH final-2 block - mid + pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low + pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high + eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid + mov x6, v0.d[0] // AES final-1 block - mov low + mov x7, v0.d[1] // AES final-1 block - mov high + eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low + movi v8.8b, #0 // suppress further partial tag feed in + pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high + eor x6, x6, x13 // AES final-1 block - round N low + eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid + eor x7, x7, x14 // AES final-1 block - round N high +.Ldec_blocks_more_than_1: // blocks left > 1 + stp x6, x7, [x2], #16 // AES final-1 block - store result + rev64 v4.16b, v5.16b // GHASH final-1 block + ld1 { v5.16b}, [x0], #16 // AES final block - load ciphertext + eor v4.16b, v4.16b, v8.16b // feed in partial tag + movi v8.8b, #0 // suppress further partial tag feed in + mov d22, v4.d[1] // GHASH final-1 block - mid + eor v0.16b, v5.16b, v3.16b // AES final block - result + pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high + eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid + pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low + mov x6, v0.d[0] // AES final block - mov low + ins v22.d[1], v22.d[0] // GHASH final-1 block - mid + mov x7, v0.d[1] // AES final block - mov high + pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid + eor x6, x6, x13 // AES final block - round N low + eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low + eor 
v9.16b, v9.16b, v20.16b // GHASH final-1 block - high + eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid + eor x7, x7, x14 // AES final block - round N high +.Ldec_blocks_less_than_1: // blocks left <= 1 + add x10, x10, x1, lsr #7 // Calculate the updated counter based on the number of 16B chunks we processed + rev w10, w10 + str w10, [x16, #12] // store the updated counter + and x1, x1, #127 // bit_length %= 128 + mvn x14, xzr // rkN_h = 0xffffffffffffffff + sub x1, x1, #128 // bit_length -= 128 + mvn x13, xzr // rkN_l = 0xffffffffffffffff + ldp x4, x5, [x2] // load existing bytes we need to not overwrite + neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128]) + and x1, x1, #127 // bit_length %= 128 + lsr x14, x14, x1 // rkN_h is mask for top 64b of last block + cmp x1, #64 + csel x9, x13, x14, lt + csel x10, x14, xzr, lt + fmov d0, x9 // ctr0b is mask for last block + and x6, x6, x9 + mov v0.d[1], x10 + bic x4, x4, x9 // mask out low existing bytes + bic x5, x5, x10 // mask out high existing bytes + orr x6, x6, x4 + and x7, x7, x10 + orr x7, x7, x5 + and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits + rev64 v4.16b, v5.16b // GHASH final block + eor v4.16b, v4.16b, v8.16b // feed in partial tag + pmull v21.1q, v4.1d, v12.1d // GHASH final block - low + mov d8, v4.d[1] // GHASH final block - mid + eor v8.8b, v8.8b, v4.8b // GHASH final block - mid + pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high + pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final block - high + eor v11.16b, v11.16b, v21.16b // GHASH final block - low + eor v10.16b, v10.16b, v8.16b // GHASH final block - mid + ldr d8, [sp, #128] + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor 
v10.16b, v10.16b, v7.16b // MODULO - fold into mid + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + stp x6, x7, [x2] + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b // Final Tag + mov x0, x15 + st1 { v11.16b }, [x3] // Store final tag + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp d8, d9, [sp, #64] + ldp d10, d11, [sp, #80] + ldp d12, d13, [sp, #96] + ldp d14, d15, [sp, #112] + ldp x29, x30, [sp], #224 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size aes_gcm_dec_kernel,.-aes_gcm_dec_kernel + +.globl aes_gcm_enc_kernel_eor3 +.hidden aes_gcm_enc_kernel_eor3 +.type aes_gcm_enc_kernel_eor3,%function +.align 4 +aes_gcm_enc_kernel_eor3: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-224]! + mov x29, sp + ld1 { v0.16b}, [x4] // .Load initial counter block + stp x19, x20, [sp, #16] + mov v1.16b, v0.16b // Initialize ctr1-3 from ctr0 + mov v2.16b, v0.16b + mov v3.16b, v0.16b + mov x16, x4 // Pointer to counter block in memory + mov x8, x5 // Pointer to AES key schedule context + stp x21, x22, [sp, #32] + // [sp, #48] is unused but allocated to align the stack layout with aes_gcm_dec_kernel_eor3 + stp d8, d9, [sp, #64] // Save Neon registers + stp d10, d11, [sp, #80] + stp d12, d13, [sp, #96] + stp d14, d15, [sp, #112] + ldr w17, [x8, #240] // .Load number of AES rounds + add x7, x8, x17, lsl #4 // Calculate pointer to the last round key + ldp x13, x14, [x7] // load round N key (for final XOR) + ldr q31, [x7, #-16] // load round N-1 key + add x4, x0, x1, lsr #3 // Calculate end of input + lsr x5, x1, #3 // Total byte length + mov x15, x5 + ldr w12, [x16, #12] // .Load counter's low 32 bits + sub x5, x5, #1 // byte_len - 1 + ldr q18, [x8, #0] // load rk0 + and x5, x5, 
#0xffffffffffffffc0 // Align main loop end to a multiple of 64 bytes + add x5, x5, x0 + rev w12, w12 // Reverse for big-endian increment + uxtw x10, w12 // Zero extend reversed w12 into x10 for final counter update + // Pre-compute this value instead of using two instructions to reconstruct it every iteration + mov x21, #0xc200000000000000 // GHASH reduction constant + str x21, [sp, #128] + // We maintain four copies of ctr values on the stack. Each loop iteration we + // store the updated ctr value to the last four bytes (e.g., 160 + 12). + // We then load the four values. This avoids a significant number of + // expensive GPR->NEON and NEON->NEON moves. To avoid LDST forwarding we + // calculate and store the values one iteration ahead so they have time to + // drain before we load them. + str q0, [sp, #160] // Store base counter for block 0-3 + str q0, [sp, #176] + str q0, [sp, #192] + str q0, [sp, #208] + // Since we need the values right away don't go through the stack this first + // time. Manually insert the incremented big-endian counter values. + rev w20, w12 + mov v0.s[3], w20 // ctr0 + 0 + add w20, w12, #1 + rev w20, w20 + mov v1.s[3], w20 // ctr0 + 1 + add w20, w12, #2 + rev w20, w20 + mov v2.s[3], w20 // ctr0 + 2 + add w20, w12, #3 + rev w20, w20 + mov v3.s[3], w20 // ctr0 + 3 + // Calculate the ctr values for the *next* (not current) group of four + // blocks. Store the incremented parts to the stack. 
+ add w20, w12, #4 + rev w20, w20 + str w20, [sp, #172] // ctr0 + 4 for next iter + add w20, w12, #5 + rev w20, w20 + str w20, [sp, #188] // ctr0 + 5 for next iter + add w20, w12, #6 + rev w20, w20 + str w20, [sp, #204] // ctr0 + 6 for next iter + add w20, w12, #7 + rev w20, w20 + str w20, [sp, #220] // ctr0 + 7 for next iter + add w12, w12, #8 // Advance counter past these two sets + // --- Start AES for first 4 blocks --- + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + ldp q19, q20, [x8, #16] // load rk1, rk2 + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + ldp q21, q22, [x8, #48] // load rk3, rk4 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + ldp q23, q24, [x8, #80] // load rk5, rk6 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + ldp q13, q14, [x6, #32] // load H2, H3 (GHASH keys) + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + ldp q25, q26, [x8, #112] // load rk7, rk8 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + ext v14.16b, v14.16b, v14.16b, #8 // Byte swap H3 for GHASH + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + ext v13.16b, v13.16b, v13.16b, #8 // Byte swap H2 for GHASH + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + ldr q15, [x6, #80] // load H4 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + ext v15.16b, v15.16b, v15.16b, #8 // Byte swap H4 for GHASH + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + ld1 { v11.16b}, [x3] // .Load initial GHASH accumulator (T) + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + ext v11.16b, v11.16b, v11.16b, #8 // Byte swap T for GHASH + aese 
v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + rev64 v11.16b, v11.16b // Correct byte order within 64-bit lanes + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + trn2 v17.2d, v14.2d, v15.2d // Karatsuba key: H4_low | H3_low + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + ldr q12, [x6] // load H1 + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + ext v12.16b, v12.16b, v12.16b, #8 // Byte swap H1 for GHASH + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 2 - round 4 + trn1 v9.2d, v14.2d, v15.2d // Karatsuba key: H4_high | H3_high + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + trn2 v16.2d, v12.2d, v13.2d // Karatsuba key: H2_low | H1_low + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + ldr q30, [x7] // Preload round N key for final EOR + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 - round 5 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 0 - round 6 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + cmp x17, #12 // setup flags for 
AES-128/192/256 check + b.lt .Lenc_finish_first_blocks_eor3 // branch if AES-128 + ldp q27, q28, [x8, #144] // load rk9, rk10 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + b.eq .Lenc_finish_first_blocks_eor3 // branch if AES-192 + ldp q27, q28, [x8, #176] // load rk11, rk12 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 12 +.Lenc_finish_first_blocks_eor3: + cmp x0, x5 // check if we have <= 4 blocks to process in the tail + eor v17.16b, v17.16b, v9.16b // Karatsuba key: H3^H4 + aese v0.16b, v31.16b // AES block 0 - round N-1 + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v2.16b, v31.16b // AES block 2 - round N-1 + aese v3.16b, v31.16b // AES block 3 - round N-1 + trn1 v8.2d, v12.2d, v13.2d // Karatsuba key: H2_high | H1_high + eor v16.16b, v16.16b, v8.16b // Karatsuba key: H1^H2 + b.ge .Lenc_tail_eor3 // handle tail if no more full 4-block sets + ldp q6, q7, [x0, #32] // AES blocks 2,3 load plaintext + ldp q4, q5, [x0], 
#64 // AES blocks 0,1 load plaintext + // Compute and store first 4 ciphertext blocks + eor3 v4.16b, v4.16b, v30.16b, v0.16b // AES block 0 - result = PT ^ AES(ctr0) + eor3 v5.16b, v5.16b, v30.16b, v1.16b // AES block 1 - result = PT ^ AES(ctr1) + eor3 v6.16b, v6.16b, v30.16b, v2.16b // AES block 2 - result = PT ^ AES(ctr2) + eor3 v7.16b, v7.16b, v30.16b, v3.16b // AES block 3 - result = PT ^ AES(ctr3) + st1 { v4.16b, v5.16b, v6.16b, v7.16b}, [x2], #64 // AES blocks 0-3 - store result + // Load counter values for the second iteration from the stack + ldp q0, q1, [sp, #160] + ldp q2, q3, [sp, #192] + // Prepare and store counter values for the third iteration + rev w20, w12 + str w20, [sp, #172] // ctr + 8 + add w20, w12, #1 + rev w20, w20 + str w20, [sp, #188] // ctr + 9 + add w20, w12, #2 + rev w20, w20 + str w20, [sp, #204] // ctr + 10 + add w20, w12, #3 + rev w20, w20 + str w20, [sp, #220] // ctr + 11 + add w12, w12, #4 // Advance counter base + cmp x0, x5 // check if we have <= 4 blocks remaining + b.ge .Lenc_prepretail_eor3 // go to prepretail if < 2 full loops left +.Lenc_main_loop_eor3: // main loop start (processes 4 blocks per iteration) + // --- AES Pipeline for blocks 4k+4 to 4k+7 --- + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + ldr d8, [sp, #128] // .Load GHASH reduction constant + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + // --- GHASH Pipeline (interleaved with AES) for blocks 4k to 4k+3 --- + rev64 v4.16b, v4.16b // GHASH block 4k - Byte swap CT + rev64 
v5.16b, v5.16b // GHASH block 4k+1 - Byte swap CT + rev64 v6.16b, v6.16b // GHASH block 4k+2 - Byte swap CT + rev64 v7.16b, v7.16b // GHASH block 4k+3 - Byte swap CT + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + ext v11.16b, v11.16b, v11.16b, #8 // GHASH - prepare acc for XOR + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v4.16b, v4.16b, v11.16b // GHASH block 4k - Y_i = CT_i ^ Y_{i-1} + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + pmull2 v9.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + mov d10, v17.d[1] // GHASH block 4k - mid Karatsuba key + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + mov d20, v4.d[1] // GHASH block 4k - mid + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + eor v20.8b, v20.8b, v4.8b // GHASH block 4k - mid + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + mov d21, v5.d[1] // GHASH block 4k+1 - mid + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + eor v21.8b, v21.8b, v5.8b // GHASH block 4k+1 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + pmull v10.1q, v20.1d, v10.1d // GHASH block 4k - mid + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + pmull v21.1q, v21.1d, v17.1d // GHASH block 4k+1 - mid + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + eor v10.16b, v10.16b, v21.16b // GHASH block 4k+1 - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + ext v22.16b, v22.16b, v6.16b, #8 // GHASH block 4k+2 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + eor v22.16b, v22.16b, v6.16b // GHASH block 4k+2 - mid + aese 
v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + pmull2 v22.1q, v22.2d, v16.2d // GHASH block 4k+2 - mid + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + mov d23, v7.d[1] // GHASH block 4k+3 - mid + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + eor v23.8b, v23.8b, v7.8b // GHASH block 4k+3 - mid + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + pmull v23.1q, v23.1d, v16.1d // GHASH block 4k+3 - mid + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + eor3 v10.16b, v10.16b, v22.16b, v23.16b // GHASH block 4k+2/3 - mid + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + pmull2 v22.1q, v4.2d, v15.2d // GHASH block 4k - high + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + pmull2 v21.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + eor v22.16b, v22.16b, v21.16b // GHASH block 4k+3 - high + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + pmull2 v23.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + pmull v21.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + pmull v20.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + eor3 v9.16b, v9.16b, v22.16b, v23.16b // GHASH block 4k/1/2/3 - high + pmull v22.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + ldp q6, q7, [x0, #32] + ldp q4, q5, [x0], #64 + eor v20.16b, v20.16b, v21.16b // GHASH block 4k+1 - low + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + eor3 v11.16b, v11.16b, v22.16b, v20.16b // GHASH block 4k/1/2/3 - low + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + eor3 v10.16b, v10.16b, v9.16b, v11.16b // MODULO - karatsuba tidy up + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + pmull v20.1q, v9.1d, v8.1d // MODULO - 
top 64b align with mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + cmp x17, #12 // setup flags for AES-128/192/256 check + b.lt .Lenc_main_loop_continue_eor3 // branch if AES-128 + ldp q27, q28, [x8, #144] // load rk9, rk10 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + b.eq .Lenc_main_loop_continue_eor3 // branch if AES-192 + ldp q27, q28, [x8, #176] // load rk11, rk12 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 +.Lenc_main_loop_continue_eor3: + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor3 v10.16b, v10.16b, v20.16b, v9.16b // MODULO - fold into mid + pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v20.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + eor3 v11.16b, v9.16b, v11.16b, v20.16b // MODULO - fold into low + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + eor3 v4.16b, v4.16b, 
v30.16b, v0.16b // AES block 4k+4 - result + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + eor3 v5.16b, v5.16b, v30.16b, v1.16b // AES block 4k+5 - result + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + eor3 v6.16b, v6.16b, v30.16b, v2.16b // AES block 4k+6 - result + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + eor3 v7.16b, v7.16b, v30.16b, v3.16b // AES block 4k+7 - result + ldp q0, q1, [sp, #160] + ldp q2, q3, [sp, #192] + // We used these registers as temporaries above so reload the RKs. + ldp q20, q21, [x8, #32] // load rk2, rk3 + ldp q22, q23, [x8, #64] // load rk4, rk5 + st1 { v4.16b, v5.16b, v6.16b, v7.16b}, [x2], #64 // AES blocks 4k+4-7 - store result + rev w20, w12 + str w20, [sp, #172] + add w20, w12, #1 + rev w20, w20 + str w20, [sp, #188] + add w20, w12, #2 + rev w20, w20 + str w20, [sp, #204] + add w20, w12, #3 + rev w20, w20 + str w20, [sp, #220] + add w12, w12, #4 + cmp x0, x5 // .LOOP_eor3 CONTROL + b.lt .Lenc_main_loop_eor3 +.Lenc_prepretail_eor3: // PREPRETAIL + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + rev64 v6.16b, v6.16b // GHASH block 2 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + rev64 v4.16b, v4.16b // GHASH block 0 + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev64 v5.16b, v5.16b // GHASH block 1 + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + mov d10, v17.d[1] // GHASH block 0 - mid Karatsuba key + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + pmull v11.1q, v4.1d, v15.1d // GHASH block 0 - low + mov d8, v4.d[1] // GHASH block 0 - mid + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 0 - high + 
aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + eor v8.8b, v8.8b, v4.8b // GHASH block 0 - mid + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + pmull v10.1q, v8.1d, v10.1d // GHASH block 0 - mid + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 1 - high + pmull v8.1q, v5.1d, v14.1d // GHASH block 1 - low + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + eor v9.16b, v9.16b, v4.16b // GHASH block 1 - high + mov d4, v5.d[1] // GHASH block 1 - mid + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + eor v11.16b, v11.16b, v8.16b // GHASH block 1 - low + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + eor v4.8b, v4.8b, v5.8b // GHASH block 1 - mid + mov d8, v6.d[1] // GHASH block 2 - mid + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + rev64 v7.16b, v7.16b // GHASH block 3 + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + pmull v4.1q, v4.1d, v17.1d // GHASH block 1 - mid + eor v8.8b, v8.8b, v6.8b // GHASH block 2 - mid + pmull v5.1q, v6.1d, v13.1d // GHASH block 2 - low + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 - round 5 + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 2 - round 4 + eor v10.16b, v10.16b, v4.16b // GHASH block 1 - mid + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 2 - high + eor v11.16b, v11.16b, v5.16b // GHASH block 2 - low + ins v8.d[1], v8.d[0] // GHASH block 2 - mid + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + eor v9.16b, v9.16b, v4.16b // GHASH block 2 - high + mov d4, v7.d[1] // GHASH block 3 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 2 - mid + eor v4.8b, 
v4.8b, v7.8b // GHASH block 3 - mid + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 3 - high + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + pmull v4.1q, v4.1d, v16.1d // GHASH block 3 - mid + eor v10.16b, v10.16b, v8.16b // GHASH block 2 - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 0 - round 6 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + eor v9.16b, v9.16b, v5.16b // GHASH block 3 - high + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + ldr d8, [sp, #128] + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + eor v10.16b, v10.16b, v4.16b // GHASH block 3 - mid + pmull v6.1q, v7.1d, v12.1d // GHASH block 3 - low + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + cmp x17, #12 // setup flags for AES-128/192/256 check + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + eor v11.16b, v11.16b, v6.16b // GHASH block 3 - low + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + eor v10.16b, v10.16b, v9.16b // karatsuba tidy up + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + pmull v4.1q, v9.1d, v8.1d + ext v9.16b, v9.16b, v9.16b, #8 + eor v10.16b, v10.16b, v11.16b + b.lt .Lenc_finish_prepretail_eor3 // branch if AES-128 + ldp q27, q28, [x8, #144] // load rk9, rk10 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 
3 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + b.eq .Lenc_finish_prepretail_eor3 // branch if AES-192 + ldp q27, q28, [x8, #176] // load rk11, rk12 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 12 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 +.Lenc_finish_prepretail_eor3: + aese v0.16b, v31.16b // AES block 0 - round N-1 + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v2.16b, v31.16b // AES block 2 - round N-1 + aese v3.16b, v31.16b // AES block 3 - round N-1 + eor3 v10.16b, v10.16b, v4.16b, v9.16b + pmull v4.1q, v10.1d, v8.1d + ext v10.16b, v10.16b, v10.16b, #8 + eor3 v11.16b, v11.16b, v4.16b, v10.16b +.Lenc_tail_eor3: // TAIL: Process remaining 0 to 3 blocks + ext v8.16b, v11.16b, v11.16b, #8 // Save current GHASH state for partial tag feed-in + sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process + ldp x6, x7, [x0], #16 // AES block 0 - load plaintext + eor x6, x6, x13 // AES block 0 - round N low + eor x7, x7, x14 // AES block 0 - round N high + cmp x5, #48 + fmov d4, x6 // AES block 0 - mov low + fmov v4.d[1], x7 // AES block 0 - mov high + eor v5.16b, v4.16b, v0.16b // AES block 0 - result + b.gt .Lenc_blocks_more_than_3_eor3 + cmp x5, #32 + mov v3.16b, v2.16b + movi v11.8b, #0 + movi v9.8b, #0 + mov v2.16b, 
v1.16b + movi v10.8b, #0 + b.gt .Lenc_blocks_more_than_2_eor3 + mov v3.16b, v1.16b + cmp x5, #16 + b.gt .Lenc_blocks_more_than_1_eor3 + b .Lenc_blocks_less_than_1_eor3 +.Lenc_blocks_more_than_3_eor3: // blocks left > 3 + st1 { v5.16b}, [x2], #16 // AES final-2 block - store result + ldp x6, x7, [x0], #16 // AES final-2 block - load input low & high + rev64 v4.16b, v5.16b // GHASH final-3 block + eor x6, x6, x13 // AES final-2 block - round N low + eor v4.16b, v4.16b, v8.16b // feed in partial tag + eor x7, x7, x14 // AES final-2 block - round N high + mov d22, v4.d[1] // GHASH final-3 block - mid + fmov d5, x6 // AES final-2 block - mov low + fmov v5.d[1], x7 // AES final-2 block - mov high + eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid + movi v8.8b, #0 // suppress further partial tag feed in + mov d10, v17.d[1] // GHASH final-3 block - mid + pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low + pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high + pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid + eor v5.16b, v5.16b, v1.16b // AES final-2 block - result +.Lenc_blocks_more_than_2_eor3: // blocks left > 2 + st1 { v5.16b}, [x2], #16 // AES final-2 block - store result + ldp x6, x7, [x0], #16 // AES final-1 block - load input low & high + rev64 v4.16b, v5.16b // GHASH final-2 block + eor x6, x6, x13 // AES final-1 block - round N low + eor v4.16b, v4.16b, v8.16b // feed in partial tag + fmov d5, x6 // AES final-1 block - mov low + eor x7, x7, x14 // AES final-1 block - round N high + fmov v5.d[1], x7 // AES final-1 block - mov high + movi v8.8b, #0 // suppress further partial tag feed in + pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high + mov d22, v4.d[1] // GHASH final-2 block - mid + pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low + eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid + eor v5.16b, v5.16b, v2.16b // AES final-1 block - result + eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high + pmull 
v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid + eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low + eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid +.Lenc_blocks_more_than_1_eor3: // blocks left > 1 + st1 { v5.16b}, [x2], #16 // AES final-1 block - store result + rev64 v4.16b, v5.16b // GHASH final-1 block: Byte Swap CT + ldp x6, x7, [x0], #16 // AES final block - load plaintext + eor v4.16b, v4.16b, v8.16b // Feed in partial tag + movi v8.8b, #0 // Clear for next block + eor x6, x6, x13 // AES final block - round N low + mov d22, v4.d[1] // GHASH final-1 block - mid + pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high + eor x7, x7, x14 // AES final block - round N high + eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high + ins v22.d[1], v22.d[0] // GHASH final-1 block - mid + fmov d5, x6 // AES final block - mov low + fmov v5.d[1], x7 // AES final block - mov high + pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid + pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low + eor v5.16b, v5.16b, v3.16b // AES final block - result + eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid + eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low +.Lenc_blocks_less_than_1_eor3: // .Last partial block handling + add x10, x10, x1, lsr #7 // Calculate the updated counter based on the number of 16B chunks we processed + rev w10, w10 + str w10, [x16, #12] // store the updated counter + and x1, x1, #127 // bit_length %= 128 + mvn x13, xzr // Mask for low 64 bits + sub x1, x1, #128 // + neg x1, x1 // Valid bits in the last block (1-128) + ldr q18, [x2] // .Load destination for merging + mvn x14, xzr // Mask for high 64 bits + and x1, x1, #127 // bit_length %= 128 + lsr x14, x14, x1 // rkN_h is mask for top 64b of last block + cmp x1, #64 + csel x6, x13, x14, lt + csel x7, x14, xzr, lt + fmov d0, x6 // ctr0d is mask for last block + fmov v0.d[1], x7 + and v5.16b, 
v5.16b, v0.16b // Mask out unused bits of the last CT block + rev64 v4.16b, v5.16b // GHASH final block - byte swap + eor v4.16b, v4.16b, v8.16b // Feed in partial tag + bif v5.16b, v18.16b, v0.16b // Bitwise Insert: merge with existing data at output_ptr + pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high + mov d8, v4.d[1] // GHASH final block - mid + pmull v21.1q, v4.1d, v12.1d // GHASH final block - low + eor v9.16b, v9.16b, v20.16b // GHASH final block - high + eor v8.8b, v8.8b, v4.8b // GHASH final block - mid + pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid + eor v11.16b, v11.16b, v21.16b // GHASH final block - low + eor v10.16b, v10.16b, v8.16b // GHASH final block - mid + eor v4.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + fmov d8, x21 + eor v10.16b, v10.16b, v4.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor3 v10.16b, v10.16b, v7.16b, v9.16b // MODULO - fold into mid + pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + st1 { v5.16b}, [x2] // store all 16B + eor3 v11.16b, v11.16b, v9.16b, v10.16b // MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 // Byte swap GHASH result + rev64 v11.16b, v11.16b // Final Tag + mov x0, x15 + st1 { v11.16b }, [x3] // Store final tag + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp d8, d9, [sp, #64] + ldp d10, d11, [sp, #80] + ldp d12, d13, [sp, #96] + ldp d14, d15, [sp, #112] + ldp x29, x30, [sp], #224 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size aes_gcm_enc_kernel_eor3,.-aes_gcm_enc_kernel_eor3 +.globl aes_gcm_dec_kernel_eor3 +.hidden aes_gcm_dec_kernel_eor3 +.type aes_gcm_dec_kernel_eor3,%function +.align 4 +aes_gcm_dec_kernel_eor3: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-224]! 
+ mov x29, sp + stp x19, x20, [sp, #16] + ld1 { v0.16b}, [x4] + mov v1.16b, v0.16b + mov v2.16b, v0.16b + mov v3.16b, v0.16b + mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + stp d8, d9, [sp, #64] + stp d10, d11, [sp, #80] + stp d12, d13, [sp, #96] + stp d14, d15, [sp, #112] + ldr w17, [x8, #240] // .Load number of AES rounds + add x19, x8, x17, lsl #4 // borrow input_l1 for last key + ldp x13, x14, [x19] // load round N keys + ldr q31, [x19, #-16] // load round N-1 keys + add x4, x0, x1, lsr #3 // end_input_ptr + lsr x5, x1, #3 // byte_len + mov x15, x5 + ldr w9, [x16, #12] // .Load scalar 32-bit counter (CTR) + sub x5, x5, #1 // byte_len - 1 + ldr q18, [x8, #0] // load rk0 + and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + add x5, x5, x0 + rev w9, w9 // Reverse it once for big-endian incrementing + uxtw x10, w9 // Zero extend reversed w9 into x10 + str q0, [sp, #160] + str q0, [sp, #176] + str q0, [sp, #192] + str q0, [sp, #208] + rev w20, w9 + mov v0.s[3], w20 + add w20, w9, #1 + rev w20, w20 + mov v1.s[3], w20 + add w20, w9, #2 + rev w20, w20 + mov v2.s[3], w20 + add w20, w9, #3 + rev w20, w20 + mov v3.s[3], w20 + add w20, w9, #4 + rev w20, w20 + str w20, [sp, #172] + add w20, w9, #5 + rev w20, w20 + str w20, [sp, #188] + add w20, w9, #6 + rev w20, w20 + str w20, [sp, #204] + add w20, w9, #7 + rev w20, w20 + str w20, [sp, #220] + add w9, w9, #8 + // Pre-compute this value instead of using two instructions for moving and then shifting in the main loop + mov x21, #0xc200000000000000 + str x21, [sp, #128] + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + ldp q19, q20, [x8, #16] // load rk1, rk2 + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + ldp q21, q22, [x8, #48] // load rk3, rk4 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + ldp q23, q24, [x8, #80] // load rk5, rk6 + aese v3.16b, 
v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + ldp q13, q14, [x6, #32] // load h2, h3 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + ldp q25, q26, [x8, #112] // load rk7, rk8 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + ext v14.16b, v14.16b, v14.16b, #8 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + ext v13.16b, v13.16b, v13.16b, #8 + ldr q15, [x6, #80] // load h4 + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + ext v15.16b, v15.16b, v15.16b, #8 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + ld1 { v11.16b}, [x3] + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + ext v11.16b, v11.16b, v11.16b, #8 + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + rev64 v11.16b, v11.16b + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + trn2 v17.2d, v14.2d, v15.2d // h4l | h3l + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + ldr q12, [x6] // load h1 + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + ext v12.16b, v12.16b, v12.16b, #8 + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 2 - round 4 + trn1 v9.2d, v14.2d, v15.2d // h4h | h3h + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + trn2 v16.2d, v12.2d, v13.2d // h2l | h1l + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 - round 5 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + aese v0.16b, v24.16b 
+ aesmc v0.16b, v0.16b // AES block 0 - round 6 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + cmp x17, #12 // setup flags for AES-128/192/256 check + b.lt .Ldec_finish_first_blocks_eor3 // branch if AES-128 + ldp q27, q28, [x8, #144] // load rk9, rk10 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + b.eq .Ldec_finish_first_blocks_eor3 // branch if AES-192 + ldp q27, q28, [x8, #176] // load rk11, rk12 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b 
// AES block 0 - round 12 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 +.Ldec_finish_first_blocks_eor3: + ldr q27, [x19] // load rkN + cmp x0, x5 // check if we have <= 4 blocks + eor v17.16b, v17.16b, v9.16b // h4k | h3k + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v2.16b, v31.16b // AES block 2 - round N-1 + aese v3.16b, v31.16b // AES block 3 - round N-1 + aese v0.16b, v31.16b // AES block 0 - round N-1 + trn1 v8.2d, v12.2d, v13.2d // h2h | h1h + eor v16.16b, v16.16b, v8.16b // h2k | h1k + b.ge .Ldec_tail_eor3 // handle tail + // Setup for AES blocks 0-3 is done purely on NEON side instead of mixing NEON and scalar instructions. + // This is because the final result of the AES block needs to be EORd with the final round key + // value (v30). This avoids several fmovs. + ldp q6, q7, [x0, #32] // AES blocks 2,3 load ciphertext + ldp q4, q5, [x0], #64 // AES blocks 0,1 load ciphertext + eor3 v0.16b, v4.16b, v0.16b, v27.16b // AES block 0 - result + eor3 v1.16b, v5.16b, v1.16b, v27.16b // AES block 1 - result + eor3 v2.16b, v6.16b, v2.16b, v27.16b // AES block 2 - result + eor3 v3.16b, v7.16b, v3.16b, v27.16b // AES block 3 - result + st1 { v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64 // AES blocks 0-3 - store result + ldr q0, [sp, #160] + ldr q1, [sp, #176] + ldr q2, [sp, #192] + ldr q3, [sp, #208] + rev w20, w9 + str w20, [sp, #172] + add w20, w9, #1 + rev w20, w20 + str w20, [sp, #188] + add w20, w9, #2 + rev w20, w20 + str w20, [sp, #204] + add w20, w9, #3 + rev w20, w20 + str w20, [sp, #220] + add w9, w9, #4 + cmp x0, x5 // check if we have <= 4 blocks + b.ge .Ldec_prepretail_eor3 // do prepretail +.Ldec_main_loop_eor3: // main loop start + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 
+ aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + rev64 v4.16b, v4.16b // GHASH block 4k + rev64 v5.16b, v5.16b // GHASH block 4k+1 + rev64 v6.16b, v6.16b // GHASH block 4k+2 + rev64 v7.16b, v7.16b // GHASH block 4k+3 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v4.16b, v4.16b, v11.16b // PRE 1 + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + mov d10, v17.d[1] // GHASH block 4k - mid + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + mov d8, v4.d[1] // GHASH block 4k - mid + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + pmull 
v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + eor3 v11.16b, v11.16b, v8.16b, v5.16b // GHASH block 4k+1 - low & GHASH block 4k+2 - low + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + pmull2 v28.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + mov d6, v7.d[1] // GHASH block 4k+3 - mid + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + pmull v27.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor3 v10.16b, v10.16b, v4.16b, v8.16b // GHASH block 4k+1 - mid & GHASH block 4k+2 - mid + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid + eor3 v9.16b, v9.16b, v28.16b, v5.16b // GHASH block 4k+2 - high & GHASH block 4k+3 - high + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + aese v1.16b, v26.16b + aesmc 
v1.16b, v1.16b // AES block 4k+5 - round 8 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid + ldr d8, [sp, #128] + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + eor3 v10.16b, v10.16b, v6.16b, v7.16b // GHASH block 4k+3 - mid & MODULO - fold into mid + eor v11.16b, v11.16b, v27.16b // GHASH block 4k+3 - low + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor3 v10.16b, v10.16b, v6.16b, v9.16b // MODULO - karatsuba tidy up & MODULO - fold into mid + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + eor3 v11.16b, v11.16b, v8.16b, v10.16b // MODULO - fold into low + cmp w17, #12 // setup flags for AES-128/192/256 check + b.lt .Ldec_main_loop_continue_eor3 // branch if AES-128 + ldp q27, q28, [x8, #144] // load rk9, rk10 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + b.eq .Ldec_main_loop_continue_eor3 // branch if AES-192 + ldp q27, q28, [x8, #176] // load rk11, rk12 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v3.16b, 
v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 +.Ldec_main_loop_continue_eor3: + ldr q27, [x19] // load rkN + ldp q6, q7, [x0, #32] // AES blocks 2,3 load ciphertext + ldp q4, q5, [x0], #64 // AES blocks 0,1 load ciphertext + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + eor3 v0.16b, v4.16b, v0.16b, v27.16b // AES block 4k+4 - result + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + eor3 v1.16b, v5.16b, v1.16b, v27.16b // AES block 4k+5 - result + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + eor3 v2.16b, v6.16b, v2.16b, v27.16b // AES block 4k+6 - result + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + eor3 v3.16b, v7.16b, v3.16b, v27.16b // AES block 4k+7 - result + st1 { v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64 // AES blocks 4k+4-7 - store result + ldr q0, [sp, #160] + ldr q1, [sp, #176] + ldr q2, [sp, #192] + ldr q3, [sp, #208] + rev w20, w9 + str w20, [sp, #172] + add w20, w9, #1 + rev w20, w20 + str w20, [sp, #188] + add w20, w9, #2 + rev w20, w20 + str w20, [sp, #204] + add w20, w9, #3 + rev w20, w20 + str w20, [sp, #220] + add w9, w9, #4 + cmp x0, x5 + b.lt .Ldec_main_loop_eor3 +.Ldec_prepretail_eor3: // PREPRETAIL + rev64 v4.16b, v4.16b // GHASH block 0 + rev64 v5.16b, v5.16b // GHASH block 1 + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev64 v6.16b, v6.16b // GHASH block 2 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + pmull v11.1q, v4.1d, v15.1d // GHASH block 0 - low + mov d8, v4.d[1] // GHASH block 0 - mid + 
pmull2 v9.1q, v4.2d, v15.2d // GHASH block 0 - high + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + mov d10, v17.d[1] // GHASH block 0 - mid + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + eor v8.8b, v8.8b, v4.8b // GHASH block 0 - mid + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 1 - high + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + rev64 v7.16b, v7.16b // GHASH block 3 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + pmull v10.1q, v8.1d, v10.1d // GHASH block 0 - mid + eor v9.16b, v9.16b, v4.16b // GHASH block 1 - high + pmull v8.1q, v5.1d, v14.1d // GHASH block 1 - low + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + mov d4, v5.d[1] // GHASH block 1 - mid + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + eor v11.16b, v11.16b, v8.16b // GHASH block 1 - low + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + mov d8, v6.d[1] // GHASH block 2 - mid + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + eor v4.8b, v4.8b, v5.8b // GHASH block 1 - mid + pmull v5.1q, v6.1d, v13.1d // GHASH block 2 - low + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + eor v8.8b, v8.8b, v6.8b // GHASH block 2 - mid + pmull v4.1q, v4.1d, v17.1d // GHASH block 1 - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + eor v11.16b, v11.16b, v5.16b // GHASH block 2 - low + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 3 - high + eor v10.16b, v10.16b, v4.16b // GHASH block 1 - mid + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 2 - high + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES 
block 3 - round 5 + ins v8.d[1], v8.d[0] // GHASH block 2 - mid + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + eor3 v9.16b, v9.16b, v4.16b, v5.16b // GHASH block 2 - high & GHASH block 3 - high + pmull v4.1q, v7.1d, v12.1d // GHASH block 3 - low + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 2 - round 4 + mov d6, v7.d[1] // GHASH block 3 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 2 - mid + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + eor v6.8b, v6.8b, v7.8b // GHASH block 3 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + eor v10.16b, v10.16b, v8.16b // GHASH block 2 - mid + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 0 - round 6 + movi v8.8b, #0xc2 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + eor v11.16b, v11.16b, v4.16b // GHASH block 3 - low + pmull v6.1q, v6.1d, v16.1d // GHASH block 3 - mid + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + eor v10.16b, v10.16b, v6.16b // GHASH block 3 - mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + shl d8, d8, #56 // mod_constant + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + cmp w17, #12 // setup flags for AES-128/192/256 
check + b.lt .Ldec_finish_prepretail_eor3 // branch if AES-128 + ldp q27, q28, [x8, #144] // load rk9, rk10 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + b.eq .Ldec_finish_prepretail_eor3 // branch if AES-192 + ldp q27, q28, [x8, #176] // load rk11, rk12 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 12 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 +.Ldec_finish_prepretail_eor3: + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor3 v10.16b, v10.16b, v7.16b, v9.16b // MODULO - fold into mid + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + eor3 v11.16b, v11.16b, v8.16b, v10.16b // MODULO - fold into low + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v0.16b, v31.16b // AES block 0 - round N-1 + aese v3.16b, v31.16b // AES block 3 - round N-1 + 
aese v2.16b, v31.16b // AES block 2 - round N-1 +.Ldec_tail_eor3: // TAIL + sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process + ld1 { v5.16b}, [x0], #16 // AES block 0 - load ciphertext + eor v0.16b, v5.16b, v0.16b // AES block 0 - result + mov x6, v0.d[0] // AES block 0 - mov low + mov x7, v0.d[1] // AES block 0 - mov high + ext v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag + eor x6, x6, x13 // AES block 0 - round N low + eor x7, x7, x14 // AES block 0 - round N high + cmp x5, #48 + b.gt .Ldec_blocks_more_than_3_eor3 + mov v3.16b, v2.16b + movi v10.8b, #0 + movi v11.8b, #0 + movi v9.8b, #0 + mov v2.16b, v1.16b + cmp x5, #32 + b.gt .Ldec_blocks_more_than_2_eor3 + mov v3.16b, v1.16b + cmp x5, #16 + b.gt .Ldec_blocks_more_than_1_eor3 + b .Ldec_blocks_less_than_1_eor3 +.Ldec_blocks_more_than_3_eor3: // blocks left > 3 + rev64 v4.16b, v5.16b // GHASH final-3 block + ld1 { v5.16b}, [x0], #16 // AES final-2 block - load ciphertext + stp x6, x7, [x2], #16 // AES final-3 block - store result + mov d10, v17.d[1] // GHASH final-3 block - mid + eor v4.16b, v4.16b, v8.16b // feed in partial tag + eor v0.16b, v5.16b, v1.16b // AES final-2 block - result + mov d22, v4.d[1] // GHASH final-3 block - mid + mov x6, v0.d[0] // AES final-2 block - mov low + mov x7, v0.d[1] // AES final-2 block - mov high + eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid + movi v8.8b, #0 // suppress further partial tag feed in + pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high + pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid + eor x6, x6, x13 // AES final-2 block - round N low + pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low + eor x7, x7, x14 // AES final-2 block - round N high +.Ldec_blocks_more_than_2_eor3: // blocks left > 2 + rev64 v4.16b, v5.16b // GHASH final-2 block + ld1 { v5.16b}, [x0], #16 // AES final-1 block - load ciphertext + eor v4.16b, v4.16b, v8.16b // feed in partial tag + stp x6, x7, [x2], #16 // AES final-2 block 
- store result + eor v0.16b, v5.16b, v2.16b // AES final-1 block - result + mov d22, v4.d[1] // GHASH final-2 block - mid + pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low + pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high + eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid + mov x6, v0.d[0] // AES final-1 block - mov low + mov x7, v0.d[1] // AES final-1 block - mov high + eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low + movi v8.8b, #0 // suppress further partial tag feed in + pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high + eor x6, x6, x13 // AES final-1 block - round N low + eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid + eor x7, x7, x14 // AES final-1 block - round N high +.Ldec_blocks_more_than_1_eor3: // blocks left > 1 + stp x6, x7, [x2], #16 // AES final-1 block - store result + rev64 v4.16b, v5.16b // GHASH final-1 block + ld1 { v5.16b}, [x0], #16 // AES final block - load ciphertext + eor v4.16b, v4.16b, v8.16b // feed in partial tag + movi v8.8b, #0 // suppress further partial tag feed in + mov d22, v4.d[1] // GHASH final-1 block - mid + eor v0.16b, v5.16b, v3.16b // AES final block - result + pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high + eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid + pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low + mov x6, v0.d[0] // AES final block - mov low + ins v22.d[1], v22.d[0] // GHASH final-1 block - mid + mov x7, v0.d[1] // AES final block - mov high + pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid + eor x6, x6, x13 // AES final block - round N low + eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low + eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high + eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid + eor x7, x7, x14 // AES final block - round N high +.Ldec_blocks_less_than_1_eor3: // blocks left <= 1 + add x10, x10, x1, lsr #7 // 
Calculate the updated counter based on the number of 16B chunks we processed + rev w10, w10 + str w10, [x16, #12] // store the updated counter + and x1, x1, #127 // bit_length %= 128 + mvn x14, xzr // rkN_h = 0xffffffffffffffff + sub x1, x1, #128 // bit_length -= 128 + mvn x13, xzr // rkN_l = 0xffffffffffffffff + ldp x4, x5, [x2] // load existing bytes we need to not overwrite + neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128]) + and x1, x1, #127 // bit_length %= 128 + lsr x14, x14, x1 // rkN_h is mask for top 64b of last block + cmp x1, #64 + csel x9, x13, x14, lt + csel x10, x14, xzr, lt + fmov d0, x9 // ctr0b is mask for last block + and x6, x6, x9 + mov v0.d[1], x10 + bic x4, x4, x9 // mask out low existing bytes + bic x5, x5, x10 // mask out high existing bytes + orr x6, x6, x4 + and x7, x7, x10 + orr x7, x7, x5 + and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits + rev64 v4.16b, v5.16b // GHASH final block + eor v4.16b, v4.16b, v8.16b // feed in partial tag + pmull v21.1q, v4.1d, v12.1d // GHASH final block - low + mov d8, v4.d[1] // GHASH final block - mid + eor v8.8b, v8.8b, v4.8b // GHASH final block - mid + pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high + pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final block - high + eor v11.16b, v11.16b, v21.16b // GHASH final block - low + eor v10.16b, v10.16b, v8.16b // GHASH final block - mid + ldr d8, [sp, #128] + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + eor 
v11.16b, v11.16b, v8.16b // MODULO - fold into low + stp x6, x7, [x2] + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b // Final Tag + mov x0, x15 + st1 { v11.16b }, [x3] // Store final tag + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp d8, d9, [sp, #64] + ldp d10, d11, [sp, #80] + ldp d12, d13, [sp, #96] + ldp d14, d15, [sp, #112] + ldp x29, x30, [sp], #224 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size aes_gcm_dec_kernel_eor3,.-aes_gcm_dec_kernel_eor3 +#endif // __ARM_MAX_ARCH__ >= 8 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) diff --git a/third_party/boringssl/gen/bcm/aesv8-gcm-armv8-win.S b/third_party/boringssl/gen/bcm/aesv8-gcm-armv8-win.S new file mode 100644 index 00000000..a5a2b42b --- /dev/null +++ b/third_party/boringssl/gen/bcm/aesv8-gcm-armv8-win.S @@ -0,0 +1,2915 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +#if __ARM_MAX_ARCH__ >= 8 +.arch armv8.2-a+crypto+sha3 +.text +.globl aes_gcm_enc_kernel + +.def aes_gcm_enc_kernel + .type 32 +.endef +.align 4 +aes_gcm_enc_kernel: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-224]! 
+ mov x29, sp + ld1 { v0.16b}, [x4] // Load initial counter block + stp x19, x20, [sp, #16] + mov v1.16b, v0.16b // Initialize ctr1-3 from ctr0 + mov v2.16b, v0.16b + mov v3.16b, v0.16b + mov x16, x4 // Pointer to counter block in memory + mov x8, x5 // Pointer to AES key schedule context + stp x21, x22, [sp, #32] + // [sp, #48] is unused but allocated to align the stack layout with aes_gcm_dec_kernel + stp d8, d9, [sp, #64] // Save Neon registers + stp d10, d11, [sp, #80] + stp d12, d13, [sp, #96] + stp d14, d15, [sp, #112] + ldr w17, [x8, #240] // Load number of AES rounds + add x7, x8, x17, lsl #4 // Calculate pointer to the last round key + ldp x13, x14, [x7] // load round N key (for final XOR) + ldr q31, [x7, #-16] // load round N-1 key + add x4, x0, x1, lsr #3 // Calculate end of input + lsr x5, x1, #3 // Total byte length + mov x15, x5 + ldr w12, [x16, #12] // Load counter's low 32 bits + sub x5, x5, #1 // byte_len - 1 + ldr q18, [x8, #0] // load rk0 + and x5, x5, #0xffffffffffffffc0 // Align main loop end to a multiple of 64 bytes + add x5, x5, x0 + rev w12, w12 // Reverse for big-endian increment + uxtw x10, w12 // Zero extend reversed w12 into x10 for final counter update + // Pre-compute this value instead of using two instructions to reconstruct it every iteration + mov x21, #0xc200000000000000 // GHASH reduction constant + str x21, [sp, #128] + // We maintain four copies of ctr values on the stack. Each loop iteration we + // store the updated ctr value to the last four bytes (e.g., 160 + 12). + // We then load the four values. This avoids a significant number of + // expensive GPR->NEON and NEON->NEON moves. To avoid LDST forwarding we + // calculate and store the values one iteration ahead so they have time to + // drain before we load them. + str q0, [sp, #160] // Store base counter for block 0-3 + str q0, [sp, #176] + str q0, [sp, #192] + str q0, [sp, #208] + // Since we need the values right away don't go through the stack this first + // time.
Manually insert the incremented big-endian counter values. + rev w20, w12 + mov v0.s[3], w20 // ctr0 + 0 + add w20, w12, #1 + rev w20, w20 + mov v1.s[3], w20 // ctr0 + 1 + add w20, w12, #2 + rev w20, w20 + mov v2.s[3], w20 // ctr0 + 2 + add w20, w12, #3 + rev w20, w20 + mov v3.s[3], w20 // ctr0 + 3 + // Calculate the ctr values for the *next* (not current) group of four + // blocks. Store the incremented parts to the stack. + add w20, w12, #4 + rev w20, w20 + str w20, [sp, #172] // ctr0 + 4 for next iter + add w20, w12, #5 + rev w20, w20 + str w20, [sp, #188] // ctr0 + 5 for next iter + add w20, w12, #6 + rev w20, w20 + str w20, [sp, #204] // ctr0 + 6 for next iter + add w20, w12, #7 + rev w20, w20 + str w20, [sp, #220] // ctr0 + 7 for next iter + add w12, w12, #8 // Advance counter past these two sets + // --- Start AES for first 4 blocks --- + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + ldp q19, q20, [x8, #16] // load rk1, rk2 + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + ldp q21, q22, [x8, #48] // load rk3, rk4 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + ldp q23, q24, [x8, #80] // load rk5, rk6 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + ldp q13, q14, [x6, #32] // load H2, H3 (GHASH keys) + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + ldp q25, q26, [x8, #112] // load rk7, rk8 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + ext v14.16b, v14.16b, v14.16b, #8 // Byte swap H3 for GHASH + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + ext v13.16b, v13.16b, v13.16b, #8 // Byte swap H2 for GHASH + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + ldr q15, 
[x6, #80] // load H4 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + ext v15.16b, v15.16b, v15.16b, #8 // Byte swap H4 for GHASH + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + ld1 { v11.16b}, [x3] // Load initial GHASH accumulator (T) + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + ext v11.16b, v11.16b, v11.16b, #8 // Byte swap T for GHASH + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + rev64 v11.16b, v11.16b // Correct byte order within 64-bit lanes + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + trn2 v17.2d, v14.2d, v15.2d // Karatsuba key: H4_low | H3_low + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + ldr q12, [x6] // load H1 + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + ext v12.16b, v12.16b, v12.16b, #8 // Byte swap H1 for GHASH + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 2 - round 4 + trn1 v9.2d, v14.2d, v15.2d // Karatsuba key: H4_high | H3_high + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + trn2 v16.2d, v12.2d, v13.2d // Karatsuba key: H2_low | H1_low + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + ldr q30, [x7] // Preload round N key for final EOR + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 - round 5 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 0 - round 6 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + aese v2.16b, v25.16b + aesmc v2.16b, 
v2.16b // AES block 2 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + cmp x17, #12 // setup flags for AES-128/192/256 check + b.lt Lenc_finish_first_blocks // branch if AES-128 + ldp q27, q28, [x8, #144] // load rk9, rk10 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + b.eq Lenc_finish_first_blocks // branch if AES-192 + ldp q27, q28, [x8, #176] // load rk11, rk12 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 12 +Lenc_finish_first_blocks: + cmp x0, x5 // check if we have <= 4 blocks to process in the tail + eor v17.16b, v17.16b, v9.16b // Karatsuba key: H3^H4 + aese v0.16b, v31.16b // AES block 0 - round N-1 + aese 
v1.16b, v31.16b // AES block 1 - round N-1 + aese v2.16b, v31.16b // AES block 2 - round N-1 + aese v3.16b, v31.16b // AES block 3 - round N-1 + trn1 v8.2d, v12.2d, v13.2d // Karatsuba key: H2_high | H1_high + eor v16.16b, v16.16b, v8.16b // Karatsuba key: H1^H2 + b.ge Lenc_tail // handle tail if no more full 4-block sets + ldp q6, q7, [x0, #32] // AES blocks 2,3 load plaintext + ldp q4, q5, [x0], #64 // AES blocks 0,1 load plaintext + // Compute and store first 4 ciphertext blocks + eor v4.16b, v4.16b, v30.16b + eor v4.16b, v4.16b, v0.16b // AES block 0 - result = PT ^ AES(ctr0) + eor v5.16b, v5.16b, v30.16b + eor v5.16b, v5.16b, v1.16b // AES block 1 - result = PT ^ AES(ctr1) + eor v6.16b, v6.16b, v30.16b + eor v6.16b, v6.16b, v2.16b // AES block 2 - result = PT ^ AES(ctr2) + eor v7.16b, v7.16b, v30.16b + eor v7.16b, v7.16b, v3.16b // AES block 3 - result = PT ^ AES(ctr3) + st1 { v4.16b, v5.16b, v6.16b, v7.16b}, [x2], #64 // AES blocks 0-3 - store result + // Load counter values for the second iteration from the stack + ldp q0, q1, [sp, #160] + ldp q2, q3, [sp, #192] + // Prepare and store counter values for the third iteration + rev w20, w12 + str w20, [sp, #172] // ctr + 8 + add w20, w12, #1 + rev w20, w20 + str w20, [sp, #188] // ctr + 9 + add w20, w12, #2 + rev w20, w20 + str w20, [sp, #204] // ctr + 10 + add w20, w12, #3 + rev w20, w20 + str w20, [sp, #220] // ctr + 11 + add w12, w12, #4 // Advance counter base + cmp x0, x5 // check if we have <= 4 blocks remaining + b.ge Lenc_prepretail // go to prepretail if < 2 full loops left +Lenc_main_loop: // main loop start (processes 4 blocks per iteration) + // --- AES Pipeline for blocks 4k+4 to 4k+7 --- + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + ldr d8, [sp, 
#128] // Load GHASH reduction constant + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + // --- GHASH Pipeline (interleaved with AES) for blocks 4k to 4k+3 --- + rev64 v4.16b, v4.16b // GHASH block 4k - Byte swap CT + rev64 v5.16b, v5.16b // GHASH block 4k+1 - Byte swap CT + rev64 v6.16b, v6.16b // GHASH block 4k+2 - Byte swap CT + rev64 v7.16b, v7.16b // GHASH block 4k+3 - Byte swap CT + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + ext v11.16b, v11.16b, v11.16b, #8 // GHASH - prepare acc for XOR + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v4.16b, v4.16b, v11.16b // GHASH block 4k - Y_i = CT_i ^ Y_{i-1} + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + pmull2 v9.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + mov d10, v17.d[1] // GHASH block 4k - mid Karatsuba key + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + mov d20, v4.d[1] // GHASH block 4k - mid + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + eor v20.8b, v20.8b, v4.8b // GHASH block 4k - mid + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + mov d21, v5.d[1] // GHASH block 4k+1 - mid + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + eor v21.8b, v21.8b, v5.8b // GHASH block 4k+1 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + pmull v10.1q, v20.1d, v10.1d // GHASH block 4k - mid + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - 
round 4 + pmull v21.1q, v21.1d, v17.1d // GHASH block 4k+1 - mid + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + eor v10.16b, v10.16b, v21.16b // GHASH block 4k+1 - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + ext v22.16b, v22.16b, v6.16b, #8 // GHASH block 4k+2 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + eor v22.16b, v22.16b, v6.16b // GHASH block 4k+2 - mid + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + pmull2 v22.1q, v22.2d, v16.2d // GHASH block 4k+2 - mid + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + mov d23, v7.d[1] // GHASH block 4k+3 - mid + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + eor v23.8b, v23.8b, v7.8b // GHASH block 4k+3 - mid + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + pmull v23.1q, v23.1d, v16.1d // GHASH block 4k+3 - mid + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + eor v10.16b, v10.16b, v22.16b + eor v10.16b, v10.16b, v23.16b // GHASH block 4k+2/3 - mid + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + pmull2 v22.1q, v4.2d, v15.2d // GHASH block 4k - high + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + pmull2 v21.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + eor v22.16b, v22.16b, v21.16b // GHASH block 4k+3 - high + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + pmull2 v23.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + pmull v21.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + pmull v20.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + eor v9.16b, v9.16b, v22.16b + eor v9.16b, v9.16b, v23.16b // GHASH block 4k/1/2/3 - high + pmull v22.1q, v7.1d, v12.1d // GHASH block 4k+3 - low 
+ ldp q6, q7, [x0, #32] + ldp q4, q5, [x0], #64 + eor v20.16b, v20.16b, v21.16b // GHASH block 4k+1 - low + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + eor v11.16b, v11.16b, v22.16b + eor v11.16b, v11.16b, v20.16b // GHASH block 4k/1/2/3 - low + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + eor v10.16b, v10.16b, v9.16b + eor v10.16b, v10.16b, v11.16b // MODULO - karatsuba tidy up + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + pmull v20.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + cmp x17, #12 // setup flags for AES-128/192/256 check + b.lt Lenc_main_loop_continue // branch if AES-128 + ldp q27, q28, [x8, #144] // load rk9, rk10 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + b.eq Lenc_main_loop_continue // branch if AES-192 + ldp q27, q28, [x8, #176] // load rk11, rk12 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // 
AES block 4k+6 - round 12 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 +Lenc_main_loop_continue: + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v20.16b + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v20.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + eor v11.16b, v9.16b, v11.16b + eor v11.16b, v11.16b, v20.16b // MODULO - fold into low + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + eor v4.16b, v4.16b, v30.16b + eor v4.16b, v4.16b, v0.16b // AES block 4k+4 - result + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + eor v5.16b, v5.16b, v30.16b + eor v5.16b, v5.16b, v1.16b // AES block 4k+5 - result + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + eor v6.16b, v6.16b, v30.16b + eor v6.16b, v6.16b, v2.16b // AES block 4k+6 - result + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + eor v7.16b, v7.16b, v30.16b + eor v7.16b, v7.16b, v3.16b // AES block 4k+7 - result + ldp q0, q1, [sp, #160] + ldp q2, q3, [sp, #192] + // We used these registers as temporaries above so reload the RKs. 
+ ldp q20, q21, [x8, #32] // load rk2, rk3 + ldp q22, q23, [x8, #64] // load rk4, rk5 + st1 { v4.16b, v5.16b, v6.16b, v7.16b}, [x2], #64 // AES blocks 4k+4-7 - store result + rev w20, w12 + str w20, [sp, #172] + add w20, w12, #1 + rev w20, w20 + str w20, [sp, #188] + add w20, w12, #2 + rev w20, w20 + str w20, [sp, #204] + add w20, w12, #3 + rev w20, w20 + str w20, [sp, #220] + add w12, w12, #4 + cmp x0, x5 // LOOP CONTROL + b.lt Lenc_main_loop +Lenc_prepretail: // PREPRETAIL + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + rev64 v6.16b, v6.16b // GHASH block 2 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + rev64 v4.16b, v4.16b // GHASH block 0 + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev64 v5.16b, v5.16b // GHASH block 1 + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + mov d10, v17.d[1] // GHASH block 0 - mid Karatsuba key + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + pmull v11.1q, v4.1d, v15.1d // GHASH block 0 - low + mov d8, v4.d[1] // GHASH block 0 - mid + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 0 - high + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + eor v8.8b, v8.8b, v4.8b // GHASH block 0 - mid + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + pmull v10.1q, v8.1d, v10.1d // GHASH block 0 - mid + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 1 - high + pmull v8.1q, v5.1d, v14.1d // 
GHASH block 1 - low + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + eor v9.16b, v9.16b, v4.16b // GHASH block 1 - high + mov d4, v5.d[1] // GHASH block 1 - mid + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + eor v11.16b, v11.16b, v8.16b // GHASH block 1 - low + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + eor v4.8b, v4.8b, v5.8b // GHASH block 1 - mid + mov d8, v6.d[1] // GHASH block 2 - mid + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + rev64 v7.16b, v7.16b // GHASH block 3 + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + pmull v4.1q, v4.1d, v17.1d // GHASH block 1 - mid + eor v8.8b, v8.8b, v6.8b // GHASH block 2 - mid + pmull v5.1q, v6.1d, v13.1d // GHASH block 2 - low + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 - round 5 + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 2 - round 4 + eor v10.16b, v10.16b, v4.16b // GHASH block 1 - mid + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 2 - high + eor v11.16b, v11.16b, v5.16b // GHASH block 2 - low + ins v8.d[1], v8.d[0] // GHASH block 2 - mid + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + eor v9.16b, v9.16b, v4.16b // GHASH block 2 - high + mov d4, v7.d[1] // GHASH block 3 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 2 - mid + eor v4.8b, v4.8b, v7.8b // GHASH block 3 - mid + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 3 - high + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + pmull v4.1q, v4.1d, v16.1d // GHASH block 3 - mid + eor v10.16b, v10.16b, v8.16b // GHASH block 2 - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 0 
- round 6 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + eor v9.16b, v9.16b, v5.16b // GHASH block 3 - high + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + ldr d8, [sp, #128] + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + eor v10.16b, v10.16b, v4.16b // GHASH block 3 - mid + pmull v6.1q, v7.1d, v12.1d // GHASH block 3 - low + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + cmp x17, #12 // setup flags for AES-128/192/256 check + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + eor v11.16b, v11.16b, v6.16b // GHASH block 3 - low + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + eor v10.16b, v10.16b, v9.16b // karatsuba tidy up + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + pmull v4.1q, v9.1d, v8.1d + ext v9.16b, v9.16b, v9.16b, #8 + eor v10.16b, v10.16b, v11.16b + b.lt Lenc_finish_prepretail // branch if AES-128 + ldp q27, q28, [x8, #144] // load rk9, rk10 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + b.eq Lenc_finish_prepretail // branch if AES-192 + ldp q27, q28, [x8, #176] // load rk11, rk12 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese 
v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 12 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 +Lenc_finish_prepretail: + aese v0.16b, v31.16b // AES block 0 - round N-1 + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v2.16b, v31.16b // AES block 2 - round N-1 + aese v3.16b, v31.16b // AES block 3 - round N-1 + eor v10.16b, v10.16b, v4.16b + eor v10.16b, v10.16b, v9.16b + pmull v4.1q, v10.1d, v8.1d + ext v10.16b, v10.16b, v10.16b, #8 + eor v11.16b, v11.16b, v4.16b + eor v11.16b, v11.16b, v10.16b +Lenc_tail: // TAIL: Process remaining 0 to 3 blocks + ext v8.16b, v11.16b, v11.16b, #8 // Save current GHASH state for partial tag feed-in + sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process + ldp x6, x7, [x0], #16 // AES block 0 - load plaintext + eor x6, x6, x13 // AES block 0 - round N low + eor x7, x7, x14 // AES block 0 - round N high + cmp x5, #48 + fmov d4, x6 // AES block 0 - mov low + fmov v4.d[1], x7 // AES block 0 - mov high + eor v5.16b, v4.16b, v0.16b // AES block 0 - result + b.gt Lenc_blocks_more_than_3 + cmp x5, #32 + mov v3.16b, v2.16b + movi v11.8b, #0 + movi v9.8b, #0 + mov v2.16b, v1.16b + movi v10.8b, #0 + b.gt Lenc_blocks_more_than_2 + mov v3.16b, v1.16b + cmp x5, #16 + b.gt Lenc_blocks_more_than_1 + b Lenc_blocks_less_than_1 +Lenc_blocks_more_than_3: // blocks left > 3 + st1 { v5.16b}, [x2], #16 // AES final-2 block - store result + ldp x6, x7, [x0], #16 // AES final-2 block - load input low & high + rev64 v4.16b, v5.16b // GHASH final-3 block + eor x6, x6, x13 // AES final-2 block - round N low + eor v4.16b, v4.16b, v8.16b // feed in partial tag + eor x7, x7, x14 // AES final-2 block - round N 
high + mov d22, v4.d[1] // GHASH final-3 block - mid + fmov d5, x6 // AES final-2 block - mov low + fmov v5.d[1], x7 // AES final-2 block - mov high + eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid + movi v8.8b, #0 // suppress further partial tag feed in + mov d10, v17.d[1] // GHASH final-3 block - mid + pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low + pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high + pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid + eor v5.16b, v5.16b, v1.16b // AES final-2 block - result +Lenc_blocks_more_than_2: // blocks left > 2 + st1 { v5.16b}, [x2], #16 // AES final-2 block - store result + ldp x6, x7, [x0], #16 // AES final-1 block - load input low & high + rev64 v4.16b, v5.16b // GHASH final-2 block + eor x6, x6, x13 // AES final-1 block - round N low + eor v4.16b, v4.16b, v8.16b // feed in partial tag + fmov d5, x6 // AES final-1 block - mov low + eor x7, x7, x14 // AES final-1 block - round N high + fmov v5.d[1], x7 // AES final-1 block - mov high + movi v8.8b, #0 // suppress further partial tag feed in + pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high + mov d22, v4.d[1] // GHASH final-2 block - mid + pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low + eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid + eor v5.16b, v5.16b, v2.16b // AES final-1 block - result + eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high + pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid + eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low + eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid +Lenc_blocks_more_than_1: // blocks left > 1 + st1 { v5.16b}, [x2], #16 // AES final-1 block - store result + rev64 v4.16b, v5.16b // GHASH final-1 block: Byte Swap CT + ldp x6, x7, [x0], #16 // AES final block - load plaintext + eor v4.16b, v4.16b, v8.16b // Feed in partial tag + movi v8.8b, #0 // Clear for next block + eor x6, x6, x13 // AES final block - round N low + mov d22, 
v4.d[1] // GHASH final-1 block - mid + pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high + eor x7, x7, x14 // AES final block - round N high + eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high + ins v22.d[1], v22.d[0] // GHASH final-1 block - mid + fmov d5, x6 // AES final block - mov low + fmov v5.d[1], x7 // AES final block - mov high + pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid + pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low + eor v5.16b, v5.16b, v3.16b // AES final block - result + eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid + eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low +Lenc_blocks_less_than_1: // Last partial block handling + add x10, x10, x1, lsr #7 // Calculate the updated counter based on the number of 16B chunks we processed + rev w10, w10 + str w10, [x16, #12] // store the updated counter + and x1, x1, #127 // bit_length %= 128 + mvn x13, xzr // Mask for low 64 bits + sub x1, x1, #128 // + neg x1, x1 // Valid bits in the last block (1-128) + ldr q18, [x2] // Load destination for merging + mvn x14, xzr // Mask for high 64 bits + and x1, x1, #127 // bit_length %= 128 + lsr x14, x14, x1 // rkN_h is mask for top 64b of last block + cmp x1, #64 + csel x6, x13, x14, lt + csel x7, x14, xzr, lt + fmov d0, x6 // ctr0d is mask for last block + fmov v0.d[1], x7 + and v5.16b, v5.16b, v0.16b // Mask out unused bits of the last CT block + rev64 v4.16b, v5.16b // GHASH final block - byte swap + eor v4.16b, v4.16b, v8.16b // Feed in partial tag + bif v5.16b, v18.16b, v0.16b // Bitwise Insert: merge with existing data at output_ptr + pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high + mov d8, v4.d[1] // GHASH final block - mid + pmull v21.1q, v4.1d, v12.1d // GHASH final block - low + eor v9.16b, v9.16b, v20.16b // GHASH final block - high + eor v8.8b, v8.8b, v4.8b // GHASH final block - mid + pmull v8.1q, v8.1d, v16.1d // GHASH final 
block - mid + eor v11.16b, v11.16b, v21.16b // GHASH final block - low + eor v10.16b, v10.16b, v8.16b // GHASH final block - mid + eor v4.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + fmov d8, x21 + eor v10.16b, v10.16b, v4.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + st1 { v5.16b}, [x2] // store all 16B + eor v11.16b, v11.16b, v9.16b + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 // Byte swap GHASH result + rev64 v11.16b, v11.16b // Final Tag + mov x0, x15 + st1 { v11.16b }, [x3] // Store final tag + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp d8, d9, [sp, #64] + ldp d10, d11, [sp, #80] + ldp d12, d13, [sp, #96] + ldp d14, d15, [sp, #112] + ldp x29, x30, [sp], #224 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl aes_gcm_dec_kernel + +.def aes_gcm_dec_kernel + .type 32 +.endef +.align 4 +aes_gcm_dec_kernel: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-224]! 
+ mov x29, sp + stp x19, x20, [sp, #16] + ld1 { v0.16b}, [x4] + mov v1.16b, v0.16b + mov v2.16b, v0.16b + mov v3.16b, v0.16b + mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + stp d8, d9, [sp, #64] + stp d10, d11, [sp, #80] + stp d12, d13, [sp, #96] + stp d14, d15, [sp, #112] + ldr w17, [x8, #240] // Load number of AES rounds + add x19, x8, x17, lsl #4 // borrow input_l1 for last key + ldp x13, x14, [x19] // load round N keys + ldr q31, [x19, #-16] // load round N-1 keys + add x4, x0, x1, lsr #3 // end_input_ptr + lsr x5, x1, #3 // byte_len + mov x15, x5 + ldr w9, [x16, #12] // Load scalar 32-bit counter (CTR) + sub x5, x5, #1 // byte_len - 1 + ldr q18, [x8, #0] // load rk0 + and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + add x5, x5, x0 + rev w9, w9 // Reverse it once for big-endian incrementing + uxtw x10, w9 // Zero extend reversed w9 into x10 + str q0, [sp, #160] + str q0, [sp, #176] + str q0, [sp, #192] + str q0, [sp, #208] + rev w20, w9 + mov v0.s[3], w20 + add w20, w9, #1 + rev w20, w20 + mov v1.s[3], w20 + add w20, w9, #2 + rev w20, w20 + mov v2.s[3], w20 + add w20, w9, #3 + rev w20, w20 + mov v3.s[3], w20 + add w20, w9, #4 + rev w20, w20 + str w20, [sp, #172] + add w20, w9, #5 + rev w20, w20 + str w20, [sp, #188] + add w20, w9, #6 + rev w20, w20 + str w20, [sp, #204] + add w20, w9, #7 + rev w20, w20 + str w20, [sp, #220] + add w9, w9, #8 + // Pre-compute this value instead of using two instructions for moving and then shifting in the main loop + mov x21, #0xc200000000000000 + str x21, [sp, #128] + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + ldp q19, q20, [x8, #16] // load rk1, rk2 + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + ldp q21, q22, [x8, #48] // load rk3, rk4 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + ldp q23, q24, [x8, #80] // load rk5, rk6 + aese v3.16b, 
v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + ldp q13, q14, [x6, #32] // load h2, h3 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + ldp q25, q26, [x8, #112] // load rk7, rk8 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + ext v14.16b, v14.16b, v14.16b, #8 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + ext v13.16b, v13.16b, v13.16b, #8 + ldr q15, [x6, #80] // load h4 + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + ext v15.16b, v15.16b, v15.16b, #8 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + ld1 { v11.16b}, [x3] + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + ext v11.16b, v11.16b, v11.16b, #8 + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + rev64 v11.16b, v11.16b + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + trn2 v17.2d, v14.2d, v15.2d // h4l | h3l + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + ldr q12, [x6] // load h1 + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + ext v12.16b, v12.16b, v12.16b, #8 + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 2 - round 4 + trn1 v9.2d, v14.2d, v15.2d // h4h | h3h + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + trn2 v16.2d, v12.2d, v13.2d // h2l | h1l + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 - round 5 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + aese v0.16b, v24.16b 
+ aesmc v0.16b, v0.16b // AES block 0 - round 6 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + cmp x17, #12 // setup flags for AES-128/192/256 check + b.lt Ldec_finish_first_blocks // branch if AES-128 + ldp q27, q28, [x8, #144] // load rk9, rk10 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + b.eq Ldec_finish_first_blocks // branch if AES-192 + ldp q27, q28, [x8, #176] // load rk11, rk12 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES 
block 0 - round 12 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 +Ldec_finish_first_blocks: + ldr q27, [x19] // load rkN + cmp x0, x5 // check if we have <= 4 blocks + eor v17.16b, v17.16b, v9.16b // h4k | h3k + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v2.16b, v31.16b // AES block 2 - round N-1 + aese v3.16b, v31.16b // AES block 3 - round N-1 + aese v0.16b, v31.16b // AES block 0 - round N-1 + trn1 v8.2d, v12.2d, v13.2d // h2h | h1h + eor v16.16b, v16.16b, v8.16b // h2k | h1k + b.ge Ldec_tail // handle tail + // Setup for AES blocks 0-3 is done purely on NEON side instead of mixing NEON and scalar instructions. + // This is because the final result of the AES block needs to be EORd with the final round key + // value (v30). This avoids several fmovs. + ldp q6, q7, [x0, #32] // AES blocks 2,3 load ciphertext + ldp q4, q5, [x0], #64 // AES blocks 0,1 load ciphertext + eor v0.16b, v4.16b, v0.16b + eor v0.16b, v0.16b, v27.16b // AES block 0 - result + eor v1.16b, v5.16b, v1.16b + eor v1.16b, v1.16b, v27.16b // AES block 1 - result + eor v2.16b, v6.16b, v2.16b + eor v2.16b, v2.16b, v27.16b // AES block 2 - result + eor v3.16b, v7.16b, v3.16b + eor v3.16b, v3.16b, v27.16b // AES block 3 - result + st1 { v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64 // AES blocks 0-3 - store result + ldr q0, [sp, #160] + ldr q1, [sp, #176] + ldr q2, [sp, #192] + ldr q3, [sp, #208] + rev w20, w9 + str w20, [sp, #172] + add w20, w9, #1 + rev w20, w20 + str w20, [sp, #188] + add w20, w9, #2 + rev w20, w20 + str w20, [sp, #204] + add w20, w9, #3 + rev w20, w20 + str w20, [sp, #220] + add w9, w9, #4 + cmp x0, x5 // check if we have <= 4 blocks + b.ge Ldec_prepretail // do prepretail +Ldec_main_loop: // main loop start + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + aese v1.16b, v18.16b + 
aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + rev64 v4.16b, v4.16b // GHASH block 4k + rev64 v5.16b, v5.16b // GHASH block 4k+1 + rev64 v6.16b, v6.16b // GHASH block 4k+2 + rev64 v7.16b, v7.16b // GHASH block 4k+3 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v4.16b, v4.16b, v11.16b // PRE 1 + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + mov d10, v17.d[1] // GHASH block 4k - mid + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + mov d8, v4.d[1] // GHASH block 4k - mid + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v3.16b, v22.16b + aesmc 
v3.16b, v3.16b // AES block 4k+7 - round 4 + pmull v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + eor v11.16b, v11.16b, v8.16b + eor v11.16b, v11.16b, v5.16b // GHASH block 4k+1 - low & GHASH block 4k+2 - low + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + pmull2 v28.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + mov d6, v7.d[1] // GHASH block 4k+3 - mid + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + pmull v27.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor v10.16b, v10.16b, v4.16b + eor v10.16b, v10.16b, v8.16b // GHASH block 4k+1 - mid & GHASH block 4k+2 - mid + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid + eor v9.16b, v9.16b, v28.16b + eor v9.16b, v9.16b, v5.16b // GHASH block 4k+2 - high & GHASH block 
4k+3 - high + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid + ldr d8, [sp, #128] + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + eor v10.16b, v10.16b, v6.16b + eor v10.16b, v10.16b, v7.16b // GHASH block 4k+3 - mid & MODULO - fold into mid + eor v11.16b, v11.16b, v27.16b // GHASH block 4k+3 - low + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v6.16b + eor v10.16b, v10.16b, v9.16b // MODULO - karatsuba tidy up & MODULO - fold into mid + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + eor v11.16b, v11.16b, v8.16b + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + cmp w17, #12 // setup flags for AES-128/192/256 check + b.lt Ldec_main_loop_continue // branch if AES-128 + ldp q27, q28, [x8, #144] // load rk9, rk10 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + b.eq Ldec_main_loop_continue // branch if AES-192 + ldp q27, q28, [x8, #176] // load rk11, rk12 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 
11 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 +Ldec_main_loop_continue: + ldr q27, [x19] // load rkN + ldp q6, q7, [x0, #32] // AES blocks 2,3 load ciphertext + ldp q4, q5, [x0], #64 // AES blocks 0,1 load ciphertext + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + eor v0.16b, v4.16b, v0.16b + eor v0.16b, v0.16b, v27.16b // AES block 4k+4 - result + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + eor v1.16b, v5.16b, v1.16b + eor v1.16b, v1.16b, v27.16b // AES block 4k+5 - result + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + eor v2.16b, v6.16b, v2.16b + eor v2.16b, v2.16b, v27.16b // AES block 4k+6 - result + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + eor v3.16b, v7.16b, v3.16b + eor v3.16b, v3.16b, v27.16b // AES block 4k+7 - result + st1 { v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64 // AES blocks 4k+4-7 - store result + ldr q0, [sp, #160] + ldr q1, [sp, #176] + ldr q2, [sp, #192] + ldr q3, [sp, #208] + rev w20, w9 + str w20, [sp, #172] + add w20, w9, #1 + rev w20, w20 + str w20, [sp, #188] + add w20, w9, #2 + rev w20, w20 + str w20, [sp, #204] + add w20, w9, #3 + rev w20, w20 + str w20, [sp, #220] + add w9, w9, #4 + cmp x0, x5 + b.lt Ldec_main_loop +Ldec_prepretail: // PREPRETAIL + rev64 v4.16b, v4.16b // GHASH block 0 + rev64 v5.16b, v5.16b // GHASH block 1 + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + eor v4.16b, v4.16b, 
v11.16b // PRE 1 + rev64 v6.16b, v6.16b // GHASH block 2 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + pmull v11.1q, v4.1d, v15.1d // GHASH block 0 - low + mov d8, v4.d[1] // GHASH block 0 - mid + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 0 - high + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + mov d10, v17.d[1] // GHASH block 0 - mid + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + eor v8.8b, v8.8b, v4.8b // GHASH block 0 - mid + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 1 - high + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + rev64 v7.16b, v7.16b // GHASH block 3 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + pmull v10.1q, v8.1d, v10.1d // GHASH block 0 - mid + eor v9.16b, v9.16b, v4.16b // GHASH block 1 - high + pmull v8.1q, v5.1d, v14.1d // GHASH block 1 - low + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + mov d4, v5.d[1] // GHASH block 1 - mid + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + eor v11.16b, v11.16b, v8.16b // GHASH block 1 - low + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + mov d8, v6.d[1] // GHASH block 2 - mid + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + eor v4.8b, v4.8b, v5.8b // GHASH block 1 - mid + pmull v5.1q, v6.1d, v13.1d // GHASH block 2 - low + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + eor v8.8b, v8.8b, v6.8b // GHASH block 2 - mid + pmull v4.1q, v4.1d, v17.1d // GHASH block 1 - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + eor v11.16b, v11.16b, v5.16b // GHASH block 2 - low + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - 
round 4 + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 3 - high + eor v10.16b, v10.16b, v4.16b // GHASH block 1 - mid + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 2 - high + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 - round 5 + ins v8.d[1], v8.d[0] // GHASH block 2 - mid + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + eor v9.16b, v9.16b, v4.16b + eor v9.16b, v9.16b, v5.16b // GHASH block 2 - high & GHASH block 3 - high + pmull v4.1q, v7.1d, v12.1d // GHASH block 3 - low + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 2 - round 4 + mov d6, v7.d[1] // GHASH block 3 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 2 - mid + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + eor v6.8b, v6.8b, v7.8b // GHASH block 3 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + eor v10.16b, v10.16b, v8.16b // GHASH block 2 - mid + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 0 - round 6 + movi v8.8b, #0xc2 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + eor v11.16b, v11.16b, v4.16b // GHASH block 3 - low + pmull v6.1q, v6.1d, v16.1d // GHASH block 3 - mid + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + eor v10.16b, v10.16b, v6.16b // GHASH block 3 - mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES 
block 1 - round 8 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + shl d8, d8, #56 // mod_constant + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + cmp w17, #12 // setup flags for AES-128/192/256 check + b.lt Ldec_finish_prepretail // branch if AES-128 + ldp q27, q28, [x8, #144] // load rk9, rk10 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + b.eq Ldec_finish_prepretail // branch if AES-192 + ldp q27, q28, [x8, #176] // load rk11, rk12 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 12 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 +Ldec_finish_prepretail: + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // 
MODULO - other mid alignment + eor v11.16b, v11.16b, v8.16b + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v0.16b, v31.16b // AES block 0 - round N-1 + aese v3.16b, v31.16b // AES block 3 - round N-1 + aese v2.16b, v31.16b // AES block 2 - round N-1 +Ldec_tail: // TAIL + sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process + ld1 { v5.16b}, [x0], #16 // AES block 0 - load ciphertext + eor v0.16b, v5.16b, v0.16b // AES block 0 - result + mov x6, v0.d[0] // AES block 0 - mov low + mov x7, v0.d[1] // AES block 0 - mov high + ext v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag + eor x6, x6, x13 // AES block 0 - round N low + eor x7, x7, x14 // AES block 0 - round N high + cmp x5, #48 + b.gt Ldec_blocks_more_than_3 + mov v3.16b, v2.16b + movi v10.8b, #0 + movi v11.8b, #0 + movi v9.8b, #0 + mov v2.16b, v1.16b + cmp x5, #32 + b.gt Ldec_blocks_more_than_2 + mov v3.16b, v1.16b + cmp x5, #16 + b.gt Ldec_blocks_more_than_1 + b Ldec_blocks_less_than_1 +Ldec_blocks_more_than_3: // blocks left > 3 + rev64 v4.16b, v5.16b // GHASH final-3 block + ld1 { v5.16b}, [x0], #16 // AES final-2 block - load ciphertext + stp x6, x7, [x2], #16 // AES final-3 block - store result + mov d10, v17.d[1] // GHASH final-3 block - mid + eor v4.16b, v4.16b, v8.16b // feed in partial tag + eor v0.16b, v5.16b, v1.16b // AES final-2 block - result + mov d22, v4.d[1] // GHASH final-3 block - mid + mov x6, v0.d[0] // AES final-2 block - mov low + mov x7, v0.d[1] // AES final-2 block - mov high + eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid + movi v8.8b, #0 // suppress further partial tag feed in + pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high + pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid + eor x6, x6, x13 // AES final-2 block - round N low + pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low + eor x7, x7, x14 // AES final-2 block - round N high +Ldec_blocks_more_than_2: 
// blocks left > 2 + rev64 v4.16b, v5.16b // GHASH final-2 block + ld1 { v5.16b}, [x0], #16 // AES final-1 block - load ciphertext + eor v4.16b, v4.16b, v8.16b // feed in partial tag + stp x6, x7, [x2], #16 // AES final-2 block - store result + eor v0.16b, v5.16b, v2.16b // AES final-1 block - result + mov d22, v4.d[1] // GHASH final-2 block - mid + pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low + pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high + eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid + mov x6, v0.d[0] // AES final-1 block - mov low + mov x7, v0.d[1] // AES final-1 block - mov high + eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low + movi v8.8b, #0 // suppress further partial tag feed in + pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high + eor x6, x6, x13 // AES final-1 block - round N low + eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid + eor x7, x7, x14 // AES final-1 block - round N high +Ldec_blocks_more_than_1: // blocks left > 1 + stp x6, x7, [x2], #16 // AES final-1 block - store result + rev64 v4.16b, v5.16b // GHASH final-1 block + ld1 { v5.16b}, [x0], #16 // AES final block - load ciphertext + eor v4.16b, v4.16b, v8.16b // feed in partial tag + movi v8.8b, #0 // suppress further partial tag feed in + mov d22, v4.d[1] // GHASH final-1 block - mid + eor v0.16b, v5.16b, v3.16b // AES final block - result + pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high + eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid + pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low + mov x6, v0.d[0] // AES final block - mov low + ins v22.d[1], v22.d[0] // GHASH final-1 block - mid + mov x7, v0.d[1] // AES final block - mov high + pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid + eor x6, x6, x13 // AES final block - round N low + eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low + eor v9.16b, v9.16b, v20.16b // GHASH 
final-1 block - high + eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid + eor x7, x7, x14 // AES final block - round N high +Ldec_blocks_less_than_1: // blocks left <= 1 + add x10, x10, x1, lsr #7 // Calculate the updated counter based on the number of 16B chunks we processed + rev w10, w10 + str w10, [x16, #12] // store the updated counter + and x1, x1, #127 // bit_length %= 128 + mvn x14, xzr // rkN_h = 0xffffffffffffffff + sub x1, x1, #128 // bit_length -= 128 + mvn x13, xzr // rkN_l = 0xffffffffffffffff + ldp x4, x5, [x2] // load existing bytes we need to not overwrite + neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128]) + and x1, x1, #127 // bit_length %= 128 + lsr x14, x14, x1 // rkN_h is mask for top 64b of last block + cmp x1, #64 + csel x9, x13, x14, lt + csel x10, x14, xzr, lt + fmov d0, x9 // ctr0b is mask for last block + and x6, x6, x9 + mov v0.d[1], x10 + bic x4, x4, x9 // mask out low existing bytes + bic x5, x5, x10 // mask out high existing bytes + orr x6, x6, x4 + and x7, x7, x10 + orr x7, x7, x5 + and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits + rev64 v4.16b, v5.16b // GHASH final block + eor v4.16b, v4.16b, v8.16b // feed in partial tag + pmull v21.1q, v4.1d, v12.1d // GHASH final block - low + mov d8, v4.d[1] // GHASH final block - mid + eor v8.8b, v8.8b, v4.8b // GHASH final block - mid + pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high + pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final block - high + eor v11.16b, v11.16b, v21.16b // GHASH final block - low + eor v10.16b, v10.16b, v8.16b // GHASH final block - mid + ldr d8, [sp, #128] + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b // MODULO - 
fold into mid + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + eor v11.16b, v11.16b, v8.16b // MODULO - fold into low + stp x6, x7, [x2] + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b // Final Tag + mov x0, x15 + st1 { v11.16b }, [x3] // Store final tag + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp d8, d9, [sp, #64] + ldp d10, d11, [sp, #80] + ldp d12, d13, [sp, #96] + ldp d14, d15, [sp, #112] + ldp x29, x30, [sp], #224 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +.globl aes_gcm_enc_kernel_eor3 + +.def aes_gcm_enc_kernel_eor3 + .type 32 +.endef +.align 4 +aes_gcm_enc_kernel_eor3: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-224]! + mov x29, sp + ld1 { v0.16b}, [x4] // Load initial counter block + stp x19, x20, [sp, #16] + mov v1.16b, v0.16b // Initialize ctr1-3 from ctr0 + mov v2.16b, v0.16b + mov v3.16b, v0.16b + mov x16, x4 // Pointer to counter block in memory + mov x8, x5 // Pointer to AES key schedule context + stp x21, x22, [sp, #32] + // [sp, #48] is unused but allocated to align the stack layout with aes_gcm_dec_kernel_eor3 + stp d8, d9, [sp, #64] // Save Neon registers + stp d10, d11, [sp, #80] + stp d12, d13, [sp, #96] + stp d14, d15, [sp, #112] + ldr w17, [x8, #240] // Load number of AES rounds + add x7, x8, x17, lsl #4 // Calculate pointer to the last round key + ldp x13, x14, [x7] // load round N key (for final XOR) + ldr q31, [x7, #-16] // load round N-1 key + add x4, x0, x1, lsr #3 // Calculate end of input + lsr x5, x1, #3 // Total byte length + mov x15, x5 + ldr w12, [x16, #12] // Load counter's low 32 bits + sub x5, x5, #1 // byte_len - 1 + ldr q18, [x8, #0] // load rk0 + and x5, x5, #0xffffffffffffffc0 // Align main loop end to a multiple of 64 bytes + add x5, x5, x0 + rev w12, w12 // Reverse 
for big-endian increment + uxtw x10, w12 // Zero extend reversed w12 into x10 for final counter update + // Pre-compute this value instead of using two instructions to reconstruct it every iteration + mov x21, #0xc200000000000000 // GHASH reduction constant + str x21, [sp, #128] + // We maintain four copies of ctr values on the stack. Each loop iteration we + // store the updated ctr value to the last four bytes (e.g., 160 + 12). + // We then load the four values. This avoids a significant number of + // expensive GPR->NEON and NEON->NEON moves. To avoid LDST forwarding we + // calculate and store the values one iteration ahead so they have time to + // drain before we load them. + str q0, [sp, #160] // Store base counter for block 0-3 + str q0, [sp, #176] + str q0, [sp, #192] + str q0, [sp, #208] + // Since we need the values right away don't go through the stack this first + // time. Manually insert the incremented big-endian counter values. + rev w20, w12 + mov v0.s[3], w20 // ctr0 + 0 + add w20, w12, #1 + rev w20, w20 + mov v1.s[3], w20 // ctr0 + 1 + add w20, w12, #2 + rev w20, w20 + mov v2.s[3], w20 // ctr0 + 2 + add w20, w12, #3 + rev w20, w20 + mov v3.s[3], w20 // ctr0 + 3 + // Calculate the ctr values for the *next* (not current) group of four + // blocks. Store the incremented parts to the stack.
+ add w20, w12, #4 + rev w20, w20 + str w20, [sp, #172] // ctr0 + 4 for next iter + add w20, w12, #5 + rev w20, w20 + str w20, [sp, #188] // ctr0 + 5 for next iter + add w20, w12, #6 + rev w20, w20 + str w20, [sp, #204] // ctr0 + 6 for next iter + add w20, w12, #7 + rev w20, w20 + str w20, [sp, #220] // ctr0 + 7 for next iter + add w12, w12, #8 // Advance counter past these two sets + // --- Start AES for first 4 blocks --- + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + ldp q19, q20, [x8, #16] // load rk1, rk2 + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + ldp q21, q22, [x8, #48] // load rk3, rk4 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + ldp q23, q24, [x8, #80] // load rk5, rk6 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + ldp q13, q14, [x6, #32] // load H2, H3 (GHASH keys) + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + ldp q25, q26, [x8, #112] // load rk7, rk8 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + ext v14.16b, v14.16b, v14.16b, #8 // Byte swap H3 for GHASH + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + ext v13.16b, v13.16b, v13.16b, #8 // Byte swap H2 for GHASH + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + ldr q15, [x6, #80] // load H4 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + ext v15.16b, v15.16b, v15.16b, #8 // Byte swap H4 for GHASH + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + ld1 { v11.16b}, [x3] // Load initial GHASH accumulator (T) + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + ext v11.16b, v11.16b, v11.16b, #8 // Byte swap T for GHASH + aese 
v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + rev64 v11.16b, v11.16b // Correct byte order within 64-bit lanes + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + trn2 v17.2d, v14.2d, v15.2d // Karatsuba key: H4_low | H3_low + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + ldr q12, [x6] // load H1 + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + ext v12.16b, v12.16b, v12.16b, #8 // Byte swap H1 for GHASH + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 2 - round 4 + trn1 v9.2d, v14.2d, v15.2d // Karatsuba key: H4_high | H3_high + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + trn2 v16.2d, v12.2d, v13.2d // Karatsuba key: H2_low | H1_low + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + ldr q30, [x7] // Preload round N key for final EOR + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 - round 5 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 0 - round 6 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + cmp x17, #12 // setup flags for 
AES-128/192/256 check + b.lt Lenc_finish_first_blocks_eor3 // branch if AES-128 + ldp q27, q28, [x8, #144] // load rk9, rk10 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + b.eq Lenc_finish_first_blocks_eor3 // branch if AES-192 + ldp q27, q28, [x8, #176] // load rk11, rk12 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 12 +Lenc_finish_first_blocks_eor3: + cmp x0, x5 // check if we have <= 4 blocks to process in the tail + eor v17.16b, v17.16b, v9.16b // Karatsuba key: H3^H4 + aese v0.16b, v31.16b // AES block 0 - round N-1 + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v2.16b, v31.16b // AES block 2 - round N-1 + aese v3.16b, v31.16b // AES block 3 - round N-1 + trn1 v8.2d, v12.2d, v13.2d // Karatsuba key: H2_high | H1_high + eor v16.16b, v16.16b, v8.16b // Karatsuba key: H1^H2 + b.ge Lenc_tail_eor3 // handle tail if no more full 4-block sets + ldp q6, q7, [x0, #32] // AES blocks 2,3 load plaintext + ldp q4, q5, [x0], #64 
// AES blocks 0,1 load plaintext + // Compute and store first 4 ciphertext blocks + eor3 v4.16b, v4.16b, v30.16b, v0.16b // AES block 0 - result = PT ^ AES(ctr0) + eor3 v5.16b, v5.16b, v30.16b, v1.16b // AES block 1 - result = PT ^ AES(ctr1) + eor3 v6.16b, v6.16b, v30.16b, v2.16b // AES block 2 - result = PT ^ AES(ctr2) + eor3 v7.16b, v7.16b, v30.16b, v3.16b // AES block 3 - result = PT ^ AES(ctr3) + st1 { v4.16b, v5.16b, v6.16b, v7.16b}, [x2], #64 // AES blocks 0-3 - store result + // Load counter values for the second iteration from the stack + ldp q0, q1, [sp, #160] + ldp q2, q3, [sp, #192] + // Prepare and store counter values for the third iteration + rev w20, w12 + str w20, [sp, #172] // ctr + 8 + add w20, w12, #1 + rev w20, w20 + str w20, [sp, #188] // ctr + 9 + add w20, w12, #2 + rev w20, w20 + str w20, [sp, #204] // ctr + 10 + add w20, w12, #3 + rev w20, w20 + str w20, [sp, #220] // ctr + 11 + add w12, w12, #4 // Advance counter base + cmp x0, x5 // check if we have <= 4 blocks remaining + b.ge Lenc_prepretail_eor3 // go to prepretail if < 2 full loops left +Lenc_main_loop_eor3: // main loop start (processes 4 blocks per iteration) + // --- AES Pipeline for blocks 4k+4 to 4k+7 --- + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + ldr d8, [sp, #128] // Load GHASH reduction constant + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + // --- GHASH Pipeline (interleaved with AES) for blocks 4k to 4k+3 --- + rev64 v4.16b, v4.16b // GHASH block 4k - Byte swap CT + rev64 v5.16b, 
v5.16b // GHASH block 4k+1 - Byte swap CT + rev64 v6.16b, v6.16b // GHASH block 4k+2 - Byte swap CT + rev64 v7.16b, v7.16b // GHASH block 4k+3 - Byte swap CT + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + ext v11.16b, v11.16b, v11.16b, #8 // GHASH - prepare acc for XOR + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v4.16b, v4.16b, v11.16b // GHASH block 4k - Y_i = CT_i ^ Y_{i-1} + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + pmull2 v9.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + mov d10, v17.d[1] // GHASH block 4k - mid Karatsuba key + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + mov d20, v4.d[1] // GHASH block 4k - mid + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + eor v20.8b, v20.8b, v4.8b // GHASH block 4k - mid + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + mov d21, v5.d[1] // GHASH block 4k+1 - mid + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + eor v21.8b, v21.8b, v5.8b // GHASH block 4k+1 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + pmull v10.1q, v20.1d, v10.1d // GHASH block 4k - mid + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + pmull v21.1q, v21.1d, v17.1d // GHASH block 4k+1 - mid + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + eor v10.16b, v10.16b, v21.16b // GHASH block 4k+1 - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + ext v22.16b, v22.16b, v6.16b, #8 // GHASH block 4k+2 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + eor v22.16b, v22.16b, v6.16b // GHASH block 4k+2 - mid + aese v2.16b, 
v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + pmull2 v22.1q, v22.2d, v16.2d // GHASH block 4k+2 - mid + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + mov d23, v7.d[1] // GHASH block 4k+3 - mid + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + eor v23.8b, v23.8b, v7.8b // GHASH block 4k+3 - mid + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + pmull v23.1q, v23.1d, v16.1d // GHASH block 4k+3 - mid + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + eor3 v10.16b, v10.16b, v22.16b, v23.16b // GHASH block 4k+2/3 - mid + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + pmull2 v22.1q, v4.2d, v15.2d // GHASH block 4k - high + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + pmull2 v21.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + eor v22.16b, v22.16b, v21.16b // GHASH block 4k+3 - high + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + pmull2 v23.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + pmull v21.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + pmull v20.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + eor3 v9.16b, v9.16b, v22.16b, v23.16b // GHASH block 4k/1/2/3 - high + pmull v22.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + ldp q6, q7, [x0, #32] + ldp q4, q5, [x0], #64 + eor v20.16b, v20.16b, v21.16b // GHASH block 4k+1 - low + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + eor3 v11.16b, v11.16b, v22.16b, v20.16b // GHASH block 4k/1/2/3 - low + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 8 + eor3 v10.16b, v10.16b, v9.16b, v11.16b // MODULO - karatsuba tidy up + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + pmull v20.1q, v9.1d, v8.1d // MODULO - top 64b 
align with mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + cmp x17, #12 // setup flags for AES-128/192/256 check + b.lt Lenc_main_loop_continue_eor3 // branch if AES-128 + ldp q27, q28, [x8, #144] // load rk9, rk10 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + b.eq Lenc_main_loop_continue_eor3 // branch if AES-192 + ldp q27, q28, [x8, #176] // load rk11, rk12 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 +Lenc_main_loop_continue_eor3: + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor3 v10.16b, v10.16b, v20.16b, v9.16b // MODULO - fold into mid + pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v20.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + eor3 v11.16b, v9.16b, v11.16b, v20.16b // MODULO - fold into low + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + eor3 v4.16b, v4.16b, v30.16b, v0.16b 
// AES block 4k+4 - result + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + eor3 v5.16b, v5.16b, v30.16b, v1.16b // AES block 4k+5 - result + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + eor3 v6.16b, v6.16b, v30.16b, v2.16b // AES block 4k+6 - result + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + eor3 v7.16b, v7.16b, v30.16b, v3.16b // AES block 4k+7 - result + ldp q0, q1, [sp, #160] + ldp q2, q3, [sp, #192] + // We used these registers as temporaries above so reload the RKs. + ldp q20, q21, [x8, #32] // load rk2, rk3 + ldp q22, q23, [x8, #64] // load rk4, rk5 + st1 { v4.16b, v5.16b, v6.16b, v7.16b}, [x2], #64 // AES blocks 4k+4-7 - store result + rev w20, w12 + str w20, [sp, #172] + add w20, w12, #1 + rev w20, w20 + str w20, [sp, #188] + add w20, w12, #2 + rev w20, w20 + str w20, [sp, #204] + add w20, w12, #3 + rev w20, w20 + str w20, [sp, #220] + add w12, w12, #4 + cmp x0, x5 // LOOP_eor3 CONTROL + b.lt Lenc_main_loop_eor3 +Lenc_prepretail_eor3: // PREPRETAIL + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + rev64 v6.16b, v6.16b // GHASH block 2 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + rev64 v4.16b, v4.16b // GHASH block 0 + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev64 v5.16b, v5.16b // GHASH block 1 + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + mov d10, v17.d[1] // GHASH block 0 - mid Karatsuba key + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + pmull v11.1q, v4.1d, v15.1d // GHASH block 0 - low + mov d8, v4.d[1] // GHASH block 0 - mid + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 0 - high + aese v2.16b, v21.16b + 
aesmc v2.16b, v2.16b // AES block 2 - round 3 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + eor v8.8b, v8.8b, v4.8b // GHASH block 0 - mid + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + pmull v10.1q, v8.1d, v10.1d // GHASH block 0 - mid + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 1 - high + pmull v8.1q, v5.1d, v14.1d // GHASH block 1 - low + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + eor v9.16b, v9.16b, v4.16b // GHASH block 1 - high + mov d4, v5.d[1] // GHASH block 1 - mid + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + eor v11.16b, v11.16b, v8.16b // GHASH block 1 - low + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + eor v4.8b, v4.8b, v5.8b // GHASH block 1 - mid + mov d8, v6.d[1] // GHASH block 2 - mid + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + rev64 v7.16b, v7.16b // GHASH block 3 + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + pmull v4.1q, v4.1d, v17.1d // GHASH block 1 - mid + eor v8.8b, v8.8b, v6.8b // GHASH block 2 - mid + pmull v5.1q, v6.1d, v13.1d // GHASH block 2 - low + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 - round 5 + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 2 - round 4 + eor v10.16b, v10.16b, v4.16b // GHASH block 1 - mid + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 2 - high + eor v11.16b, v11.16b, v5.16b // GHASH block 2 - low + ins v8.d[1], v8.d[0] // GHASH block 2 - mid + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + eor v9.16b, v9.16b, v4.16b // GHASH block 2 - high + mov d4, v7.d[1] // GHASH block 3 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 2 - mid + eor v4.8b, v4.8b, v7.8b // GHASH 
block 3 - mid + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 3 - high + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + pmull v4.1q, v4.1d, v16.1d // GHASH block 3 - mid + eor v10.16b, v10.16b, v8.16b // GHASH block 2 - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 0 - round 6 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + eor v9.16b, v9.16b, v5.16b // GHASH block 3 - high + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + ldr d8, [sp, #128] + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + eor v10.16b, v10.16b, v4.16b // GHASH block 3 - mid + pmull v6.1q, v7.1d, v12.1d // GHASH block 3 - low + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + cmp x17, #12 // setup flags for AES-128/192/256 check + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + eor v11.16b, v11.16b, v6.16b // GHASH block 3 - low + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + eor v10.16b, v10.16b, v9.16b // karatsuba tidy up + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + pmull v4.1q, v9.1d, v8.1d + ext v9.16b, v9.16b, v9.16b, #8 + eor v10.16b, v10.16b, v11.16b + b.lt Lenc_finish_prepretail_eor3 // branch if AES-128 + ldp q27, q28, [x8, #144] // load rk9, rk10 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese 
v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + b.eq Lenc_finish_prepretail_eor3 // branch if AES-192 + ldp q27, q28, [x8, #176] // load rk11, rk12 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 12 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 +Lenc_finish_prepretail_eor3: + aese v0.16b, v31.16b // AES block 0 - round N-1 + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v2.16b, v31.16b // AES block 2 - round N-1 + aese v3.16b, v31.16b // AES block 3 - round N-1 + eor3 v10.16b, v10.16b, v4.16b, v9.16b + pmull v4.1q, v10.1d, v8.1d + ext v10.16b, v10.16b, v10.16b, #8 + eor3 v11.16b, v11.16b, v4.16b, v10.16b +Lenc_tail_eor3: // TAIL: Process the remaining 1 to 4 blocks + ext v8.16b, v11.16b, v11.16b, #8 // Save current GHASH state for partial tag feed-in + sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process + ldp x6, x7, [x0], #16 // AES block 0 - load plaintext + eor x6, x6, x13 // AES block 0 - round N low + eor x7, x7, x14 // AES block 0 - round N high + cmp x5, #48 + fmov d4, x6 // AES block 0 - mov low + fmov v4.d[1], x7 // AES block 0 - mov high + eor v5.16b, v4.16b, v0.16b // AES block 0 - result + b.gt Lenc_blocks_more_than_3_eor3 + cmp x5, #32 + mov v3.16b, v2.16b + movi v11.8b, #0 + movi v9.8b, #0 + mov v2.16b, v1.16b + movi v10.8b,
#0 + b.gt Lenc_blocks_more_than_2_eor3 + mov v3.16b, v1.16b + cmp x5, #16 + b.gt Lenc_blocks_more_than_1_eor3 + b Lenc_blocks_less_than_1_eor3 +Lenc_blocks_more_than_3_eor3: // blocks left > 3 + st1 { v5.16b}, [x2], #16 // AES final-2 block - store result + ldp x6, x7, [x0], #16 // AES final-2 block - load input low & high + rev64 v4.16b, v5.16b // GHASH final-3 block + eor x6, x6, x13 // AES final-2 block - round N low + eor v4.16b, v4.16b, v8.16b // feed in partial tag + eor x7, x7, x14 // AES final-2 block - round N high + mov d22, v4.d[1] // GHASH final-3 block - mid + fmov d5, x6 // AES final-2 block - mov low + fmov v5.d[1], x7 // AES final-2 block - mov high + eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid + movi v8.8b, #0 // suppress further partial tag feed in + mov d10, v17.d[1] // GHASH final-3 block - mid + pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low + pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high + pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid + eor v5.16b, v5.16b, v1.16b // AES final-2 block - result +Lenc_blocks_more_than_2_eor3: // blocks left > 2 + st1 { v5.16b}, [x2], #16 // AES final-2 block - store result + ldp x6, x7, [x0], #16 // AES final-1 block - load input low & high + rev64 v4.16b, v5.16b // GHASH final-2 block + eor x6, x6, x13 // AES final-1 block - round N low + eor v4.16b, v4.16b, v8.16b // feed in partial tag + fmov d5, x6 // AES final-1 block - mov low + eor x7, x7, x14 // AES final-1 block - round N high + fmov v5.d[1], x7 // AES final-1 block - mov high + movi v8.8b, #0 // suppress further partial tag feed in + pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high + mov d22, v4.d[1] // GHASH final-2 block - mid + pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low + eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid + eor v5.16b, v5.16b, v2.16b // AES final-1 block - result + eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high + pmull v22.1q, v22.1d, v17.1d // GHASH 
final-2 block - mid + eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low + eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid +Lenc_blocks_more_than_1_eor3: // blocks left > 1 + st1 { v5.16b}, [x2], #16 // AES final-1 block - store result + rev64 v4.16b, v5.16b // GHASH final-1 block: Byte Swap CT + ldp x6, x7, [x0], #16 // AES final block - load plaintext + eor v4.16b, v4.16b, v8.16b // Feed in partial tag + movi v8.8b, #0 // Clear for next block + eor x6, x6, x13 // AES final block - round N low + mov d22, v4.d[1] // GHASH final-1 block - mid + pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high + eor x7, x7, x14 // AES final block - round N high + eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high + ins v22.d[1], v22.d[0] // GHASH final-1 block - mid + fmov d5, x6 // AES final block - mov low + fmov v5.d[1], x7 // AES final block - mov high + pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid + pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low + eor v5.16b, v5.16b, v3.16b // AES final block - result + eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid + eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low +Lenc_blocks_less_than_1_eor3: // Last partial block handling + add x10, x10, x1, lsr #7 // Calculate the updated counter based on the number of 16B chunks we processed + rev w10, w10 + str w10, [x16, #12] // store the updated counter + and x1, x1, #127 // bit_length %= 128 + mvn x13, xzr // Mask for low 64 bits + sub x1, x1, #128 // + neg x1, x1 // Valid bits in the last block (1-128) + ldr q18, [x2] // Load destination for merging + mvn x14, xzr // Mask for high 64 bits + and x1, x1, #127 // bit_length %= 128 + lsr x14, x14, x1 // rkN_h is mask for top 64b of last block + cmp x1, #64 + csel x6, x13, x14, lt + csel x7, x14, xzr, lt + fmov d0, x6 // ctr0d is mask for last block + fmov v0.d[1], x7 + and v5.16b, v5.16b, v0.16b // Mask out unused 
bits of the last CT block + rev64 v4.16b, v5.16b // GHASH final block - byte swap + eor v4.16b, v4.16b, v8.16b // Feed in partial tag + bif v5.16b, v18.16b, v0.16b // Bitwise Insert: merge with existing data at output_ptr + pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high + mov d8, v4.d[1] // GHASH final block - mid + pmull v21.1q, v4.1d, v12.1d // GHASH final block - low + eor v9.16b, v9.16b, v20.16b // GHASH final block - high + eor v8.8b, v8.8b, v4.8b // GHASH final block - mid + pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid + eor v11.16b, v11.16b, v21.16b // GHASH final block - low + eor v10.16b, v10.16b, v8.16b // GHASH final block - mid + eor v4.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + fmov d8, x21 + eor v10.16b, v10.16b, v4.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor3 v10.16b, v10.16b, v7.16b, v9.16b // MODULO - fold into mid + pmull v9.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + st1 { v5.16b}, [x2] // store all 16B + eor3 v11.16b, v11.16b, v9.16b, v10.16b // MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 // Byte swap GHASH result + rev64 v11.16b, v11.16b // Final Tag + mov x0, x15 + st1 { v11.16b }, [x3] // Store final tag + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp d8, d9, [sp, #64] + ldp d10, d11, [sp, #80] + ldp d12, d13, [sp, #96] + ldp d14, d15, [sp, #112] + ldp x29, x30, [sp], #224 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl aes_gcm_dec_kernel_eor3 + +.def aes_gcm_dec_kernel_eor3 + .type 32 +.endef +.align 4 +aes_gcm_dec_kernel_eor3: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp, #-224]! 
+ mov x29, sp + stp x19, x20, [sp, #16] + ld1 { v0.16b}, [x4] + mov v1.16b, v0.16b + mov v2.16b, v0.16b + mov v3.16b, v0.16b + mov x16, x4 + mov x8, x5 + stp x21, x22, [sp, #32] + stp x23, x24, [sp, #48] + stp d8, d9, [sp, #64] + stp d10, d11, [sp, #80] + stp d12, d13, [sp, #96] + stp d14, d15, [sp, #112] + ldr w17, [x8, #240] // Load number of AES rounds + add x19, x8, x17, lsl #4 // borrow input_l1 for last key + ldp x13, x14, [x19] // load round N keys + ldr q31, [x19, #-16] // load round N-1 keys + add x4, x0, x1, lsr #3 // end_input_ptr + lsr x5, x1, #3 // byte_len + mov x15, x5 + ldr w9, [x16, #12] // Load scalar 32-bit counter (CTR) + sub x5, x5, #1 // byte_len - 1 + ldr q18, [x8, #0] // load rk0 + and x5, x5, #0xffffffffffffffc0 // number of bytes to be processed in main loop (at least 1 byte must be handled by tail) + add x5, x5, x0 + rev w9, w9 // Reverse it once for big-endian incrementing + uxtw x10, w9 // Zero extend reversed w9 into x10 + str q0, [sp, #160] + str q0, [sp, #176] + str q0, [sp, #192] + str q0, [sp, #208] + rev w20, w9 + mov v0.s[3], w20 + add w20, w9, #1 + rev w20, w20 + mov v1.s[3], w20 + add w20, w9, #2 + rev w20, w20 + mov v2.s[3], w20 + add w20, w9, #3 + rev w20, w20 + mov v3.s[3], w20 + add w20, w9, #4 + rev w20, w20 + str w20, [sp, #172] + add w20, w9, #5 + rev w20, w20 + str w20, [sp, #188] + add w20, w9, #6 + rev w20, w20 + str w20, [sp, #204] + add w20, w9, #7 + rev w20, w20 + str w20, [sp, #220] + add w9, w9, #8 + // Pre-compute this value instead of using two instructions for moving and then shifting in the main loop + mov x21, #0xc200000000000000 + str x21, [sp, #128] + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + ldp q19, q20, [x8, #16] // load rk1, rk2 + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + ldp q21, q22, [x8, #48] // load rk3, rk4 + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + ldp q23, q24, [x8, #80] // load rk5, rk6 + aese v3.16b, 
v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + ldp q13, q14, [x6, #32] // load h2, h3 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + ldp q25, q26, [x8, #112] // load rk7, rk8 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + ext v14.16b, v14.16b, v14.16b, #8 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + ext v13.16b, v13.16b, v13.16b, #8 + ldr q15, [x6, #80] // load h4 + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + ext v15.16b, v15.16b, v15.16b, #8 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + ld1 { v11.16b}, [x3] + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + ext v11.16b, v11.16b, v11.16b, #8 + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + rev64 v11.16b, v11.16b + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + trn2 v17.2d, v14.2d, v15.2d // h4l | h3l + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + ldr q12, [x6] // load h1 + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + ext v12.16b, v12.16b, v12.16b, #8 + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 2 - round 4 + trn1 v9.2d, v14.2d, v15.2d // h4h | h3h + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + trn2 v16.2d, v12.2d, v13.2d // h2l | h1l + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 - round 5 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + aese v0.16b, v24.16b 
+ aesmc v0.16b, v0.16b // AES block 0 - round 6 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + cmp x17, #12 // setup flags for AES-128/192/256 check + b.lt Ldec_finish_first_blocks_eor3 // branch if AES-128 + ldp q27, q28, [x8, #144] // load rk9, rk10 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + b.eq Ldec_finish_first_blocks_eor3 // branch if AES-192 + ldp q27, q28, [x8, #176] // load rk11, rk12 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b 
// AES block 0 - round 12 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 +Ldec_finish_first_blocks_eor3: + ldr q27, [x19] // load rkN + cmp x0, x5 // check if we have <= 4 blocks + eor v17.16b, v17.16b, v9.16b // h4k | h3k + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v2.16b, v31.16b // AES block 2 - round N-1 + aese v3.16b, v31.16b // AES block 3 - round N-1 + aese v0.16b, v31.16b // AES block 0 - round N-1 + trn1 v8.2d, v12.2d, v13.2d // h2h | h1h + eor v16.16b, v16.16b, v8.16b // h2k | h1k + b.ge Ldec_tail_eor3 // handle tail + // Setup for AES blocks 0-3 is done purely on NEON side instead of mixing NEON and scalar instructions. + // This is because the final result of the AES block needs to be EORd with the final round key + // value (v30). This avoids several fmovs. + ldp q6, q7, [x0, #32] // AES blocks 2,3 load ciphertext + ldp q4, q5, [x0], #64 // AES blocks 0,1 load ciphertext + eor3 v0.16b, v4.16b, v0.16b, v27.16b // AES block 0 - result + eor3 v1.16b, v5.16b, v1.16b, v27.16b // AES block 1 - result + eor3 v2.16b, v6.16b, v2.16b, v27.16b // AES block 2 - result + eor3 v3.16b, v7.16b, v3.16b, v27.16b // AES block 3 - result + st1 { v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64 // AES blocks 0-3 - store result + ldr q0, [sp, #160] + ldr q1, [sp, #176] + ldr q2, [sp, #192] + ldr q3, [sp, #208] + rev w20, w9 + str w20, [sp, #172] + add w20, w9, #1 + rev w20, w20 + str w20, [sp, #188] + add w20, w9, #2 + rev w20, w20 + str w20, [sp, #204] + add w20, w9, #3 + rev w20, w20 + str w20, [sp, #220] + add w9, w9, #4 + cmp x0, x5 // check if we have <= 4 blocks + b.ge Ldec_prepretail_eor3 // do prepretail +Ldec_main_loop_eor3: // main loop start + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 0 + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 0 + 
aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 0 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 0 + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 1 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 1 + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 1 + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 1 + rev64 v4.16b, v4.16b // GHASH block 4k + rev64 v5.16b, v5.16b // GHASH block 4k+1 + rev64 v6.16b, v6.16b // GHASH block 4k+2 + rev64 v7.16b, v7.16b // GHASH block 4k+3 + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 2 + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 2 + eor v4.16b, v4.16b, v11.16b // PRE 1 + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 2 + pmull v11.1q, v4.1d, v15.1d // GHASH block 4k - low + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 2 + pmull2 v9.1q, v4.2d, v15.2d // GHASH block 4k - high + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 3 + mov d10, v17.d[1] // GHASH block 4k - mid + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 3 + mov d8, v4.d[1] // GHASH block 4k - mid + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 3 + eor v8.8b, v8.8b, v4.8b // GHASH block 4k - mid + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 3 + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 4 + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 4k+1 - high + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 4 + eor v9.16b, v9.16b, v4.16b // GHASH block 4k+1 - high + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 4 + mov d4, v5.d[1] // GHASH block 4k+1 - mid + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 4 + pmull 
v10.1q, v8.1d, v10.1d // GHASH block 4k - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 5 + pmull v8.1q, v5.1d, v14.1d // GHASH block 4k+1 - low + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 5 + eor v4.8b, v4.8b, v5.8b // GHASH block 4k+1 - mid + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 5 + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 5 + pmull v5.1q, v6.1d, v13.1d // GHASH block 4k+2 - low + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 6 + eor3 v11.16b, v11.16b, v8.16b, v5.16b // GHASH block 4k+1 - low & GHASH block 4k+2 - low + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 6 + mov d8, v6.d[1] // GHASH block 4k+2 - mid + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 6 + eor v8.8b, v8.8b, v6.8b // GHASH block 4k+2 - mid + pmull v4.1q, v4.1d, v17.1d // GHASH block 4k+1 - mid + ins v8.d[1], v8.d[0] // GHASH block 4k+2 - mid + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 6 + pmull2 v28.1q, v6.2d, v13.2d // GHASH block 4k+2 - high + mov d6, v7.d[1] // GHASH block 4k+3 - mid + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 7 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 4k+2 - mid + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 7 + pmull v27.1q, v7.1d, v12.1d // GHASH block 4k+3 - low + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 7 + eor3 v10.16b, v10.16b, v4.16b, v8.16b // GHASH block 4k+1 - mid & GHASH block 4k+2 - mid + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 7 + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 4k+3 - high + eor v6.8b, v6.8b, v7.8b // GHASH block 4k+3 - mid + eor3 v9.16b, v9.16b, v28.16b, v5.16b // GHASH block 4k+2 - high & GHASH block 4k+3 - high + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 8 + aese v1.16b, v26.16b + aesmc 
v1.16b, v1.16b // AES block 4k+5 - round 8 + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 8 + pmull v6.1q, v6.1d, v16.1d // GHASH block 4k+3 - mid + ldr d8, [sp, #128] + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 8 + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + eor3 v10.16b, v10.16b, v6.16b, v7.16b // GHASH block 4k+3 - mid & MODULO - fold into mid + eor v11.16b, v11.16b, v27.16b // GHASH block 4k+3 - low + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor3 v10.16b, v10.16b, v6.16b, v9.16b // MODULO - karatsuba tidy up & MODULO - fold into mid + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + eor3 v11.16b, v11.16b, v8.16b, v10.16b // MODULO - fold into low + cmp w17, #12 // setup flags for AES-128/192/256 check + b.lt Ldec_main_loop_continue_eor3 // branch if AES-128 + ldp q27, q28, [x8, #144] // load rk9, rk10 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 10 + b.eq Ldec_main_loop_continue_eor3 // branch if AES-192 + ldp q27, q28, [x8, #176] // load rk11, rk12 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 11 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 11 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 11 + aese v3.16b, 
v27.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 11 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 4k+4 - round 12 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 4k+5 - round 12 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 4k+6 - round 12 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 4k+7 - round 12 +Ldec_main_loop_continue_eor3: + ldr q27, [x19] // load rkN + ldp q6, q7, [x0, #32] // AES blocks 2,3 load ciphertext + ldp q4, q5, [x0], #64 // AES blocks 0,1 load ciphertext + aese v0.16b, v31.16b // AES block 4k+4 - round N-1 + eor3 v0.16b, v4.16b, v0.16b, v27.16b // AES block 4k+4 - result + aese v1.16b, v31.16b // AES block 4k+5 - round N-1 + eor3 v1.16b, v5.16b, v1.16b, v27.16b // AES block 4k+5 - result + aese v2.16b, v31.16b // AES block 4k+6 - round N-1 + eor3 v2.16b, v6.16b, v2.16b, v27.16b // AES block 4k+6 - result + aese v3.16b, v31.16b // AES block 4k+7 - round N-1 + eor3 v3.16b, v7.16b, v3.16b, v27.16b // AES block 4k+7 - result + st1 { v0.16b, v1.16b, v2.16b, v3.16b}, [x2], #64 // AES blocks 4k+4-7 - store result + ldr q0, [sp, #160] + ldr q1, [sp, #176] + ldr q2, [sp, #192] + ldr q3, [sp, #208] + rev w20, w9 + str w20, [sp, #172] + add w20, w9, #1 + rev w20, w20 + str w20, [sp, #188] + add w20, w9, #2 + rev w20, w20 + str w20, [sp, #204] + add w20, w9, #3 + rev w20, w20 + str w20, [sp, #220] + add w9, w9, #4 + cmp x0, x5 + b.lt Ldec_main_loop_eor3 +Ldec_prepretail_eor3: // PREPRETAIL + rev64 v4.16b, v4.16b // GHASH block 0 + rev64 v5.16b, v5.16b // GHASH block 1 + ext v11.16b, v11.16b, v11.16b, #8 // PRE 0 + aese v0.16b, v18.16b + aesmc v0.16b, v0.16b // AES block 0 - round 0 + aese v1.16b, v18.16b + aesmc v1.16b, v1.16b // AES block 1 - round 0 + eor v4.16b, v4.16b, v11.16b // PRE 1 + rev64 v6.16b, v6.16b // GHASH block 2 + aese v1.16b, v19.16b + aesmc v1.16b, v1.16b // AES block 1 - round 1 + pmull v11.1q, v4.1d, v15.1d // GHASH block 0 - low + mov d8, v4.d[1] // GHASH block 0 - mid + pmull2 
v9.1q, v4.2d, v15.2d // GHASH block 0 - high + aese v2.16b, v18.16b + aesmc v2.16b, v2.16b // AES block 2 - round 0 + mov d10, v17.d[1] // GHASH block 0 - mid + aese v0.16b, v19.16b + aesmc v0.16b, v0.16b // AES block 0 - round 1 + eor v8.8b, v8.8b, v4.8b // GHASH block 0 - mid + pmull2 v4.1q, v5.2d, v14.2d // GHASH block 1 - high + aese v2.16b, v19.16b + aesmc v2.16b, v2.16b // AES block 2 - round 1 + rev64 v7.16b, v7.16b // GHASH block 3 + aese v3.16b, v18.16b + aesmc v3.16b, v3.16b // AES block 3 - round 0 + pmull v10.1q, v8.1d, v10.1d // GHASH block 0 - mid + eor v9.16b, v9.16b, v4.16b // GHASH block 1 - high + pmull v8.1q, v5.1d, v14.1d // GHASH block 1 - low + aese v3.16b, v19.16b + aesmc v3.16b, v3.16b // AES block 3 - round 1 + mov d4, v5.d[1] // GHASH block 1 - mid + aese v0.16b, v20.16b + aesmc v0.16b, v0.16b // AES block 0 - round 2 + aese v1.16b, v20.16b + aesmc v1.16b, v1.16b // AES block 1 - round 2 + eor v11.16b, v11.16b, v8.16b // GHASH block 1 - low + aese v2.16b, v20.16b + aesmc v2.16b, v2.16b // AES block 2 - round 2 + aese v0.16b, v21.16b + aesmc v0.16b, v0.16b // AES block 0 - round 3 + mov d8, v6.d[1] // GHASH block 2 - mid + aese v3.16b, v20.16b + aesmc v3.16b, v3.16b // AES block 3 - round 2 + eor v4.8b, v4.8b, v5.8b // GHASH block 1 - mid + pmull v5.1q, v6.1d, v13.1d // GHASH block 2 - low + aese v0.16b, v22.16b + aesmc v0.16b, v0.16b // AES block 0 - round 4 + aese v3.16b, v21.16b + aesmc v3.16b, v3.16b // AES block 3 - round 3 + eor v8.8b, v8.8b, v6.8b // GHASH block 2 - mid + pmull v4.1q, v4.1d, v17.1d // GHASH block 1 - mid + aese v0.16b, v23.16b + aesmc v0.16b, v0.16b // AES block 0 - round 5 + eor v11.16b, v11.16b, v5.16b // GHASH block 2 - low + aese v3.16b, v22.16b + aesmc v3.16b, v3.16b // AES block 3 - round 4 + pmull2 v5.1q, v7.2d, v12.2d // GHASH block 3 - high + eor v10.16b, v10.16b, v4.16b // GHASH block 1 - mid + pmull2 v4.1q, v6.2d, v13.2d // GHASH block 2 - high + aese v3.16b, v23.16b + aesmc v3.16b, v3.16b // AES block 3 - 
round 5 + ins v8.d[1], v8.d[0] // GHASH block 2 - mid + aese v2.16b, v21.16b + aesmc v2.16b, v2.16b // AES block 2 - round 3 + aese v1.16b, v21.16b + aesmc v1.16b, v1.16b // AES block 1 - round 3 + eor3 v9.16b, v9.16b, v4.16b, v5.16b // GHASH block 2 - high & GHASH block 3 - high + pmull v4.1q, v7.1d, v12.1d // GHASH block 3 - low + aese v2.16b, v22.16b + aesmc v2.16b, v2.16b // AES block 2 - round 4 + mov d6, v7.d[1] // GHASH block 3 - mid + aese v1.16b, v22.16b + aesmc v1.16b, v1.16b // AES block 1 - round 4 + pmull2 v8.1q, v8.2d, v16.2d // GHASH block 2 - mid + aese v2.16b, v23.16b + aesmc v2.16b, v2.16b // AES block 2 - round 5 + eor v6.8b, v6.8b, v7.8b // GHASH block 3 - mid + aese v1.16b, v23.16b + aesmc v1.16b, v1.16b // AES block 1 - round 5 + aese v3.16b, v24.16b + aesmc v3.16b, v3.16b // AES block 3 - round 6 + eor v10.16b, v10.16b, v8.16b // GHASH block 2 - mid + aese v2.16b, v24.16b + aesmc v2.16b, v2.16b // AES block 2 - round 6 + aese v0.16b, v24.16b + aesmc v0.16b, v0.16b // AES block 0 - round 6 + movi v8.8b, #0xc2 + aese v1.16b, v24.16b + aesmc v1.16b, v1.16b // AES block 1 - round 6 + eor v11.16b, v11.16b, v4.16b // GHASH block 3 - low + pmull v6.1q, v6.1d, v16.1d // GHASH block 3 - mid + aese v3.16b, v25.16b + aesmc v3.16b, v3.16b // AES block 3 - round 7 + aese v1.16b, v25.16b + aesmc v1.16b, v1.16b // AES block 1 - round 7 + aese v0.16b, v25.16b + aesmc v0.16b, v0.16b // AES block 0 - round 7 + eor v10.16b, v10.16b, v6.16b // GHASH block 3 - mid + aese v3.16b, v26.16b + aesmc v3.16b, v3.16b // AES block 3 - round 8 + aese v2.16b, v25.16b + aesmc v2.16b, v2.16b // AES block 2 - round 7 + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + aese v1.16b, v26.16b + aesmc v1.16b, v1.16b // AES block 1 - round 8 + aese v0.16b, v26.16b + aesmc v0.16b, v0.16b // AES block 0 - round 8 + shl d8, d8, #56 // mod_constant + aese v2.16b, v26.16b + aesmc v2.16b, v2.16b // AES block 2 - round 8 + cmp w17, #12 // setup flags for AES-128/192/256 check + 
b.lt Ldec_finish_prepretail_eor3 // branch if AES-128 + ldp q27, q28, [x8, #144] // load rk9, rk10 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 9 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 9 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 9 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 9 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 10 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 10 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 10 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 10 + b.eq Ldec_finish_prepretail_eor3 // branch if AES-192 + ldp q27, q28, [x8, #176] // load rk11, rk12 + aese v0.16b, v27.16b + aesmc v0.16b, v0.16b // AES block 0 - round 11 + aese v1.16b, v27.16b + aesmc v1.16b, v1.16b // AES block 1 - round 11 + aese v2.16b, v27.16b + aesmc v2.16b, v2.16b // AES block 2 - round 11 + aese v3.16b, v27.16b + aesmc v3.16b, v3.16b // AES block 3 - round 11 + aese v0.16b, v28.16b + aesmc v0.16b, v0.16b // AES block 0 - round 12 + aese v1.16b, v28.16b + aesmc v1.16b, v1.16b // AES block 1 - round 12 + aese v2.16b, v28.16b + aesmc v2.16b, v2.16b // AES block 2 - round 12 + aese v3.16b, v28.16b + aesmc v3.16b, v3.16b // AES block 3 - round 12 +Ldec_finish_prepretail_eor3: + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor3 v10.16b, v10.16b, v7.16b, v9.16b // MODULO - fold into mid + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + eor3 v11.16b, v11.16b, v8.16b, v10.16b // MODULO - fold into low + aese v1.16b, v31.16b // AES block 1 - round N-1 + aese v0.16b, v31.16b // AES block 0 - round N-1 + aese v3.16b, v31.16b // AES block 3 - round N-1 + aese v2.16b, 
v31.16b // AES block 2 - round N-1 +Ldec_tail_eor3: // TAIL + sub x5, x4, x0 // main_end_input_ptr is number of bytes left to process + ld1 { v5.16b}, [x0], #16 // AES block 0 - load ciphertext + eor v0.16b, v5.16b, v0.16b // AES block 0 - result + mov x6, v0.d[0] // AES block 0 - mov low + mov x7, v0.d[1] // AES block 0 - mov high + ext v8.16b, v11.16b, v11.16b, #8 // prepare final partial tag + eor x6, x6, x13 // AES block 0 - round N low + eor x7, x7, x14 // AES block 0 - round N high + cmp x5, #48 + b.gt Ldec_blocks_more_than_3_eor3 + mov v3.16b, v2.16b + movi v10.8b, #0 + movi v11.8b, #0 + movi v9.8b, #0 + mov v2.16b, v1.16b + cmp x5, #32 + b.gt Ldec_blocks_more_than_2_eor3 + mov v3.16b, v1.16b + cmp x5, #16 + b.gt Ldec_blocks_more_than_1_eor3 + b Ldec_blocks_less_than_1_eor3 +Ldec_blocks_more_than_3_eor3: // blocks left > 3 + rev64 v4.16b, v5.16b // GHASH final-3 block + ld1 { v5.16b}, [x0], #16 // AES final-2 block - load ciphertext + stp x6, x7, [x2], #16 // AES final-3 block - store result + mov d10, v17.d[1] // GHASH final-3 block - mid + eor v4.16b, v4.16b, v8.16b // feed in partial tag + eor v0.16b, v5.16b, v1.16b // AES final-2 block - result + mov d22, v4.d[1] // GHASH final-3 block - mid + mov x6, v0.d[0] // AES final-2 block - mov low + mov x7, v0.d[1] // AES final-2 block - mov high + eor v22.8b, v22.8b, v4.8b // GHASH final-3 block - mid + movi v8.8b, #0 // suppress further partial tag feed in + pmull2 v9.1q, v4.2d, v15.2d // GHASH final-3 block - high + pmull v10.1q, v22.1d, v10.1d // GHASH final-3 block - mid + eor x6, x6, x13 // AES final-2 block - round N low + pmull v11.1q, v4.1d, v15.1d // GHASH final-3 block - low + eor x7, x7, x14 // AES final-2 block - round N high +Ldec_blocks_more_than_2_eor3: // blocks left > 2 + rev64 v4.16b, v5.16b // GHASH final-2 block + ld1 { v5.16b}, [x0], #16 // AES final-1 block - load ciphertext + eor v4.16b, v4.16b, v8.16b // feed in partial tag + stp x6, x7, [x2], #16 // AES final-2 block - store result + 
eor v0.16b, v5.16b, v2.16b // AES final-1 block - result + mov d22, v4.d[1] // GHASH final-2 block - mid + pmull v21.1q, v4.1d, v14.1d // GHASH final-2 block - low + pmull2 v20.1q, v4.2d, v14.2d // GHASH final-2 block - high + eor v22.8b, v22.8b, v4.8b // GHASH final-2 block - mid + mov x6, v0.d[0] // AES final-1 block - mov low + mov x7, v0.d[1] // AES final-1 block - mov high + eor v11.16b, v11.16b, v21.16b // GHASH final-2 block - low + movi v8.8b, #0 // suppress further partial tag feed in + pmull v22.1q, v22.1d, v17.1d // GHASH final-2 block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final-2 block - high + eor x6, x6, x13 // AES final-1 block - round N low + eor v10.16b, v10.16b, v22.16b // GHASH final-2 block - mid + eor x7, x7, x14 // AES final-1 block - round N high +Ldec_blocks_more_than_1_eor3: // blocks left > 1 + stp x6, x7, [x2], #16 // AES final-1 block - store result + rev64 v4.16b, v5.16b // GHASH final-1 block + ld1 { v5.16b}, [x0], #16 // AES final block - load ciphertext + eor v4.16b, v4.16b, v8.16b // feed in partial tag + movi v8.8b, #0 // suppress further partial tag feed in + mov d22, v4.d[1] // GHASH final-1 block - mid + eor v0.16b, v5.16b, v3.16b // AES final block - result + pmull2 v20.1q, v4.2d, v13.2d // GHASH final-1 block - high + eor v22.8b, v22.8b, v4.8b // GHASH final-1 block - mid + pmull v21.1q, v4.1d, v13.1d // GHASH final-1 block - low + mov x6, v0.d[0] // AES final block - mov low + ins v22.d[1], v22.d[0] // GHASH final-1 block - mid + mov x7, v0.d[1] // AES final block - mov high + pmull2 v22.1q, v22.2d, v16.2d // GHASH final-1 block - mid + eor x6, x6, x13 // AES final block - round N low + eor v11.16b, v11.16b, v21.16b // GHASH final-1 block - low + eor v9.16b, v9.16b, v20.16b // GHASH final-1 block - high + eor v10.16b, v10.16b, v22.16b // GHASH final-1 block - mid + eor x7, x7, x14 // AES final block - round N high +Ldec_blocks_less_than_1_eor3: // blocks left <= 1 + add x10, x10, x1, lsr #7 // Calculate the updated 
counter based on the number of 16B chunks we processed + rev w10, w10 + str w10, [x16, #12] // store the updated counter + and x1, x1, #127 // bit_length %= 128 + mvn x14, xzr // rkN_h = 0xffffffffffffffff + sub x1, x1, #128 // bit_length -= 128 + mvn x13, xzr // rkN_l = 0xffffffffffffffff + ldp x4, x5, [x2] // load existing bytes we need to not overwrite + neg x1, x1 // bit_length = 128 - #bits in input (in range [1,128]) + and x1, x1, #127 // bit_length %= 128 + lsr x14, x14, x1 // rkN_h is mask for top 64b of last block + cmp x1, #64 + csel x9, x13, x14, lt + csel x10, x14, xzr, lt + fmov d0, x9 // ctr0b is mask for last block + and x6, x6, x9 + mov v0.d[1], x10 + bic x4, x4, x9 // mask out low existing bytes + bic x5, x5, x10 // mask out high existing bytes + orr x6, x6, x4 + and x7, x7, x10 + orr x7, x7, x5 + and v5.16b, v5.16b, v0.16b // possibly partial last block has zeroes in highest bits + rev64 v4.16b, v5.16b // GHASH final block + eor v4.16b, v4.16b, v8.16b // feed in partial tag + pmull v21.1q, v4.1d, v12.1d // GHASH final block - low + mov d8, v4.d[1] // GHASH final block - mid + eor v8.8b, v8.8b, v4.8b // GHASH final block - mid + pmull2 v20.1q, v4.2d, v12.2d // GHASH final block - high + pmull v8.1q, v8.1d, v16.1d // GHASH final block - mid + eor v9.16b, v9.16b, v20.16b // GHASH final block - high + eor v11.16b, v11.16b, v21.16b // GHASH final block - low + eor v10.16b, v10.16b, v8.16b // GHASH final block - mid + ldr d8, [sp, #128] + eor v6.16b, v11.16b, v9.16b // MODULO - karatsuba tidy up + eor v10.16b, v10.16b, v6.16b // MODULO - karatsuba tidy up + pmull v7.1q, v9.1d, v8.1d // MODULO - top 64b align with mid + ext v9.16b, v9.16b, v9.16b, #8 // MODULO - other top alignment + eor v10.16b, v10.16b, v7.16b // MODULO - fold into mid + eor v10.16b, v10.16b, v9.16b // MODULO - fold into mid + pmull v8.1q, v10.1d, v8.1d // MODULO - mid 64b align with low + ext v10.16b, v10.16b, v10.16b, #8 // MODULO - other mid alignment + eor v11.16b, v11.16b, v8.16b 
// MODULO - fold into low + stp x6, x7, [x2] + eor v11.16b, v11.16b, v10.16b // MODULO - fold into low + ext v11.16b, v11.16b, v11.16b, #8 + rev64 v11.16b, v11.16b // Final Tag + mov x0, x15 + st1 { v11.16b }, [x3] // Store final tag + ldp x19, x20, [sp, #16] + ldp x21, x22, [sp, #32] + ldp x23, x24, [sp, #48] + ldp d8, d9, [sp, #64] + ldp d10, d11, [sp, #80] + ldp d12, d13, [sp, #96] + ldp d14, d15, [sp, #112] + ldp x29, x30, [sp], #224 + AARCH64_VALIDATE_LINK_REGISTER + ret + +#endif // __ARM_MAX_ARCH__ >= 8 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) diff --git a/third_party/boringssl/gen/bcm/armv4-mont-linux.S b/third_party/boringssl/gen/bcm/armv4-mont-linux.S new file mode 100644 index 00000000..73e42b53 --- /dev/null +++ b/third_party/boringssl/gen/bcm/armv4-mont-linux.S @@ -0,0 +1,932 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__) +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. +.arch armv7-a + +.text +#if defined(__thumb2__) +.syntax unified +.thumb +#else +.code 32 +#endif + +.globl bn_mul_mont_nohw +.hidden bn_mul_mont_nohw +.type bn_mul_mont_nohw,%function + +.align 5 +bn_mul_mont_nohw: + ldr ip,[sp,#4] @ load num + stmdb sp!,{r0,r2} @ sp points at argument block + @ No return value. 
Instead, the caller must ensure num >= 2 + mov r0,ip @ load num + @ No return value + + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} @ save 10 registers + + mov r0,r0,lsl#2 @ rescale r0 for byte count + sub sp,sp,r0 @ alloca(4*num) + sub sp,sp,#4 @ +extra dword + sub r0,r0,#4 @ "num=num-1" + add r4,r2,r0 @ &bp[num-1] + + add r0,sp,r0 @ r0 to point at &tp[num-1] + ldr r8,[r0,#14*4] @ &n0 + ldr r2,[r2] @ bp[0] + ldr r5,[r1],#4 @ ap[0],ap++ + ldr r6,[r3],#4 @ np[0],np++ + ldr r8,[r8] @ *n0 + str r4,[r0,#15*4] @ save &bp[num] + + umull r10,r11,r5,r2 @ ap[0]*bp[0] + str r8,[r0,#14*4] @ save n0 value + mul r8,r10,r8 @ "tp[0]"*n0 + mov r12,#0 + umlal r10,r12,r6,r8 @ np[0]*n0+"t[0]" + mov r4,sp + +.L1st: + ldr r5,[r1],#4 @ ap[j],ap++ + mov r10,r11 + ldr r6,[r3],#4 @ np[j],np++ + mov r11,#0 + umlal r10,r11,r5,r2 @ ap[j]*bp[0] + mov r14,#0 + umlal r12,r14,r6,r8 @ np[j]*n0 + adds r12,r12,r10 + str r12,[r4],#4 @ tp[j-1]=,tp++ + adc r12,r14,#0 + cmp r4,r0 + bne .L1st + + adds r12,r12,r11 + ldr r4,[r0,#13*4] @ restore bp + mov r14,#0 + ldr r8,[r0,#14*4] @ restore n0 + adc r14,r14,#0 + str r12,[r0] @ tp[num-1]= + mov r7,sp + str r14,[r0,#4] @ tp[num]= + +.Louter: + sub r7,r0,r7 @ "original" r0-1 value + sub r1,r1,r7 @ "rewind" ap to &ap[1] + ldr r2,[r4,#4]! 
@ *(++bp) + sub r3,r3,r7 @ "rewind" np to &np[1] + ldr r5,[r1,#-4] @ ap[0] + ldr r10,[sp] @ tp[0] + ldr r6,[r3,#-4] @ np[0] + ldr r7,[sp,#4] @ tp[1] + + mov r11,#0 + umlal r10,r11,r5,r2 @ ap[0]*bp[i]+tp[0] + str r4,[r0,#13*4] @ save bp + mul r8,r10,r8 + mov r12,#0 + umlal r10,r12,r6,r8 @ np[0]*n0+"tp[0]" + mov r4,sp + +.Linner: + ldr r5,[r1],#4 @ ap[j],ap++ + adds r10,r11,r7 @ +=tp[j] + ldr r6,[r3],#4 @ np[j],np++ + mov r11,#0 + umlal r10,r11,r5,r2 @ ap[j]*bp[i] + mov r14,#0 + umlal r12,r14,r6,r8 @ np[j]*n0 + adc r11,r11,#0 + ldr r7,[r4,#8] @ tp[j+1] + adds r12,r12,r10 + str r12,[r4],#4 @ tp[j-1]=,tp++ + adc r12,r14,#0 + cmp r4,r0 + bne .Linner + + adds r12,r12,r11 + mov r14,#0 + ldr r4,[r0,#13*4] @ restore bp + adc r14,r14,#0 + ldr r8,[r0,#14*4] @ restore n0 + adds r12,r12,r7 + ldr r7,[r0,#15*4] @ restore &bp[num] + adc r14,r14,#0 + str r12,[r0] @ tp[num-1]= + str r14,[r0,#4] @ tp[num]= + + cmp r4,r7 +#ifdef __thumb2__ + itt ne +#endif + movne r7,sp + bne .Louter + + ldr r2,[r0,#12*4] @ pull rp + mov r5,sp + add r0,r0,#4 @ r0 to point at &tp[num] + sub r5,r0,r5 @ "original" num value + mov r4,sp @ "rewind" r4 + mov r1,r4 @ "borrow" r1 + sub r3,r3,r5 @ "rewind" r3 to &np[0] + + subs r7,r7,r7 @ "clear" carry flag +.Lsub: ldr r7,[r4],#4 + ldr r6,[r3],#4 + sbcs r7,r7,r6 @ tp[j]-np[j] + str r7,[r2],#4 @ rp[j]= + teq r4,r0 @ preserve carry + bne .Lsub + sbcs r14,r14,#0 @ upmost carry + mov r4,sp @ "rewind" r4 + sub r2,r2,r5 @ "rewind" r2 + +.Lcopy: ldr r7,[r4] @ conditional copy + ldr r5,[r2] + str sp,[r4],#4 @ zap tp +#ifdef __thumb2__ + it cc +#endif + movcc r5,r7 + str r5,[r2],#4 + teq r4,r0 @ preserve carry + bne .Lcopy + + mov sp,r0 + add sp,sp,#4 @ skip over tp[num+1] + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} @ restore registers + add sp,sp,#2*4 @ skip over {r0,r2} + @ No return value +#if __ARM_ARCH>=5 + bx lr @ bx lr +#else + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet +.word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +.size 
bn_mul_mont_nohw,.-bn_mul_mont_nohw +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.globl bn_mul8x_mont_neon +.hidden bn_mul8x_mont_neon +.type bn_mul8x_mont_neon,%function +.align 5 +bn_mul8x_mont_neon: + mov ip,sp + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11} + vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so + ldmia ip,{r4,r5} @ load rest of parameter block + mov ip,sp + + cmp r5,#8 + bhi .LNEON_8n + + @ special case for r5==8, everything is in register bank... + + vld1.32 {d28[0]}, [r2,:32]! + veor d8,d8,d8 + sub r7,sp,r5,lsl#4 + vld1.32 {d0,d1,d2,d3}, [r1]! @ can't specify :32 :-( + and r7,r7,#-64 + vld1.32 {d30[0]}, [r4,:32] + mov sp,r7 @ alloca + vzip.16 d28,d8 + + vmull.u32 q6,d28,d0[0] + vmull.u32 q7,d28,d0[1] + vmull.u32 q8,d28,d1[0] + vshl.i64 d29,d13,#16 + vmull.u32 q9,d28,d1[1] + + vadd.u64 d29,d29,d12 + veor d8,d8,d8 + vmul.u32 d29,d29,d30 + + vmull.u32 q10,d28,d2[0] + vld1.32 {d4,d5,d6,d7}, [r3]! + vmull.u32 q11,d28,d2[1] + vmull.u32 q12,d28,d3[0] + vzip.16 d29,d8 + vmull.u32 q13,d28,d3[1] + + vmlal.u32 q6,d29,d4[0] + sub r9,r5,#1 + vmlal.u32 q7,d29,d4[1] + vmlal.u32 q8,d29,d5[0] + vmlal.u32 q9,d29,d5[1] + + vmlal.u32 q10,d29,d6[0] + vmov q5,q6 + vmlal.u32 q11,d29,d6[1] + vmov q6,q7 + vmlal.u32 q12,d29,d7[0] + vmov q7,q8 + vmlal.u32 q13,d29,d7[1] + vmov q8,q9 + vmov q9,q10 + vshr.u64 d10,d10,#16 + vmov q10,q11 + vmov q11,q12 + vadd.u64 d10,d10,d11 + vmov q12,q13 + veor q13,q13 + vshr.u64 d10,d10,#16 + + b .LNEON_outer8 + +.align 4 +.LNEON_outer8: + vld1.32 {d28[0]}, [r2,:32]! 
+ veor d8,d8,d8 + vzip.16 d28,d8 + vadd.u64 d12,d12,d10 + + vmlal.u32 q6,d28,d0[0] + vmlal.u32 q7,d28,d0[1] + vmlal.u32 q8,d28,d1[0] + vshl.i64 d29,d13,#16 + vmlal.u32 q9,d28,d1[1] + + vadd.u64 d29,d29,d12 + veor d8,d8,d8 + subs r9,r9,#1 + vmul.u32 d29,d29,d30 + + vmlal.u32 q10,d28,d2[0] + vmlal.u32 q11,d28,d2[1] + vmlal.u32 q12,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q13,d28,d3[1] + + vmlal.u32 q6,d29,d4[0] + vmlal.u32 q7,d29,d4[1] + vmlal.u32 q8,d29,d5[0] + vmlal.u32 q9,d29,d5[1] + + vmlal.u32 q10,d29,d6[0] + vmov q5,q6 + vmlal.u32 q11,d29,d6[1] + vmov q6,q7 + vmlal.u32 q12,d29,d7[0] + vmov q7,q8 + vmlal.u32 q13,d29,d7[1] + vmov q8,q9 + vmov q9,q10 + vshr.u64 d10,d10,#16 + vmov q10,q11 + vmov q11,q12 + vadd.u64 d10,d10,d11 + vmov q12,q13 + veor q13,q13 + vshr.u64 d10,d10,#16 + + bne .LNEON_outer8 + + vadd.u64 d12,d12,d10 + mov r7,sp + vshr.u64 d10,d12,#16 + mov r8,r5 + vadd.u64 d13,d13,d10 + add r6,sp,#96 + vshr.u64 d10,d13,#16 + vzip.16 d12,d13 + + b .LNEON_tail_entry + +.align 4 +.LNEON_8n: + veor q6,q6,q6 + sub r7,sp,#128 + veor q7,q7,q7 + sub r7,r7,r5,lsl#4 + veor q8,q8,q8 + and r7,r7,#-64 + veor q9,q9,q9 + mov sp,r7 @ alloca + veor q10,q10,q10 + add r7,r7,#256 + veor q11,q11,q11 + sub r8,r5,#8 + veor q12,q12,q12 + veor q13,q13,q13 + +.LNEON_8n_init: + vst1.64 {q6,q7},[r7,:256]! + subs r8,r8,#8 + vst1.64 {q8,q9},[r7,:256]! + vst1.64 {q10,q11},[r7,:256]! + vst1.64 {q12,q13},[r7,:256]! + bne .LNEON_8n_init + + add r6,sp,#256 + vld1.32 {d0,d1,d2,d3},[r1]! + add r10,sp,#8 + vld1.32 {d30[0]},[r4,:32] + mov r9,r5 + b .LNEON_8n_outer + +.align 4 +.LNEON_8n_outer: + vld1.32 {d28[0]},[r2,:32]! @ *b++ + veor d8,d8,d8 + vzip.16 d28,d8 + add r7,sp,#128 + vld1.32 {d4,d5,d6,d7},[r3]! 
+ + vmlal.u32 q6,d28,d0[0] + vmlal.u32 q7,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q8,d28,d1[0] + vshl.i64 d29,d13,#16 + vmlal.u32 q9,d28,d1[1] + vadd.u64 d29,d29,d12 + vmlal.u32 q10,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q11,d28,d2[1] + vst1.32 {d28},[sp,:64] @ put aside smashed b[8*i+0] + vmlal.u32 q12,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q13,d28,d3[1] + vld1.32 {d28[0]},[r2,:32]! @ *b++ + vmlal.u32 q6,d29,d4[0] + veor d10,d10,d10 + vmlal.u32 q7,d29,d4[1] + vzip.16 d28,d10 + vmlal.u32 q8,d29,d5[0] + vshr.u64 d12,d12,#16 + vmlal.u32 q9,d29,d5[1] + vmlal.u32 q10,d29,d6[0] + vadd.u64 d12,d12,d13 + vmlal.u32 q11,d29,d6[1] + vshr.u64 d12,d12,#16 + vmlal.u32 q12,d29,d7[0] + vmlal.u32 q13,d29,d7[1] + vadd.u64 d14,d14,d12 + vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+0] + vmlal.u32 q7,d28,d0[0] + vld1.64 {q6},[r6,:128]! + vmlal.u32 q8,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q9,d28,d1[0] + vshl.i64 d29,d15,#16 + vmlal.u32 q10,d28,d1[1] + vadd.u64 d29,d29,d14 + vmlal.u32 q11,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q12,d28,d2[1] + vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+1] + vmlal.u32 q13,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q6,d28,d3[1] + vld1.32 {d28[0]},[r2,:32]! @ *b++ + vmlal.u32 q7,d29,d4[0] + veor d10,d10,d10 + vmlal.u32 q8,d29,d4[1] + vzip.16 d28,d10 + vmlal.u32 q9,d29,d5[0] + vshr.u64 d14,d14,#16 + vmlal.u32 q10,d29,d5[1] + vmlal.u32 q11,d29,d6[0] + vadd.u64 d14,d14,d15 + vmlal.u32 q12,d29,d6[1] + vshr.u64 d14,d14,#16 + vmlal.u32 q13,d29,d7[0] + vmlal.u32 q6,d29,d7[1] + vadd.u64 d16,d16,d14 + vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+1] + vmlal.u32 q8,d28,d0[0] + vld1.64 {q7},[r6,:128]! + vmlal.u32 q9,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q10,d28,d1[0] + vshl.i64 d29,d17,#16 + vmlal.u32 q11,d28,d1[1] + vadd.u64 d29,d29,d16 + vmlal.u32 q12,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q13,d28,d2[1] + vst1.32 {d28},[r10,:64]! 
@ put aside smashed b[8*i+2] + vmlal.u32 q6,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q7,d28,d3[1] + vld1.32 {d28[0]},[r2,:32]! @ *b++ + vmlal.u32 q8,d29,d4[0] + veor d10,d10,d10 + vmlal.u32 q9,d29,d4[1] + vzip.16 d28,d10 + vmlal.u32 q10,d29,d5[0] + vshr.u64 d16,d16,#16 + vmlal.u32 q11,d29,d5[1] + vmlal.u32 q12,d29,d6[0] + vadd.u64 d16,d16,d17 + vmlal.u32 q13,d29,d6[1] + vshr.u64 d16,d16,#16 + vmlal.u32 q6,d29,d7[0] + vmlal.u32 q7,d29,d7[1] + vadd.u64 d18,d18,d16 + vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+2] + vmlal.u32 q9,d28,d0[0] + vld1.64 {q8},[r6,:128]! + vmlal.u32 q10,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q11,d28,d1[0] + vshl.i64 d29,d19,#16 + vmlal.u32 q12,d28,d1[1] + vadd.u64 d29,d29,d18 + vmlal.u32 q13,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q6,d28,d2[1] + vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+3] + vmlal.u32 q7,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q8,d28,d3[1] + vld1.32 {d28[0]},[r2,:32]! @ *b++ + vmlal.u32 q9,d29,d4[0] + veor d10,d10,d10 + vmlal.u32 q10,d29,d4[1] + vzip.16 d28,d10 + vmlal.u32 q11,d29,d5[0] + vshr.u64 d18,d18,#16 + vmlal.u32 q12,d29,d5[1] + vmlal.u32 q13,d29,d6[0] + vadd.u64 d18,d18,d19 + vmlal.u32 q6,d29,d6[1] + vshr.u64 d18,d18,#16 + vmlal.u32 q7,d29,d7[0] + vmlal.u32 q8,d29,d7[1] + vadd.u64 d20,d20,d18 + vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+3] + vmlal.u32 q10,d28,d0[0] + vld1.64 {q9},[r6,:128]! + vmlal.u32 q11,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q12,d28,d1[0] + vshl.i64 d29,d21,#16 + vmlal.u32 q13,d28,d1[1] + vadd.u64 d29,d29,d20 + vmlal.u32 q6,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q7,d28,d2[1] + vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+4] + vmlal.u32 q8,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q9,d28,d3[1] + vld1.32 {d28[0]},[r2,:32]! 
@ *b++ + vmlal.u32 q10,d29,d4[0] + veor d10,d10,d10 + vmlal.u32 q11,d29,d4[1] + vzip.16 d28,d10 + vmlal.u32 q12,d29,d5[0] + vshr.u64 d20,d20,#16 + vmlal.u32 q13,d29,d5[1] + vmlal.u32 q6,d29,d6[0] + vadd.u64 d20,d20,d21 + vmlal.u32 q7,d29,d6[1] + vshr.u64 d20,d20,#16 + vmlal.u32 q8,d29,d7[0] + vmlal.u32 q9,d29,d7[1] + vadd.u64 d22,d22,d20 + vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+4] + vmlal.u32 q11,d28,d0[0] + vld1.64 {q10},[r6,:128]! + vmlal.u32 q12,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q13,d28,d1[0] + vshl.i64 d29,d23,#16 + vmlal.u32 q6,d28,d1[1] + vadd.u64 d29,d29,d22 + vmlal.u32 q7,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q8,d28,d2[1] + vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+5] + vmlal.u32 q9,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q10,d28,d3[1] + vld1.32 {d28[0]},[r2,:32]! @ *b++ + vmlal.u32 q11,d29,d4[0] + veor d10,d10,d10 + vmlal.u32 q12,d29,d4[1] + vzip.16 d28,d10 + vmlal.u32 q13,d29,d5[0] + vshr.u64 d22,d22,#16 + vmlal.u32 q6,d29,d5[1] + vmlal.u32 q7,d29,d6[0] + vadd.u64 d22,d22,d23 + vmlal.u32 q8,d29,d6[1] + vshr.u64 d22,d22,#16 + vmlal.u32 q9,d29,d7[0] + vmlal.u32 q10,d29,d7[1] + vadd.u64 d24,d24,d22 + vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+5] + vmlal.u32 q12,d28,d0[0] + vld1.64 {q11},[r6,:128]! + vmlal.u32 q13,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q6,d28,d1[0] + vshl.i64 d29,d25,#16 + vmlal.u32 q7,d28,d1[1] + vadd.u64 d29,d29,d24 + vmlal.u32 q8,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q9,d28,d2[1] + vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+6] + vmlal.u32 q10,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q11,d28,d3[1] + vld1.32 {d28[0]},[r2,:32]! @ *b++ + vmlal.u32 q12,d29,d4[0] + veor d10,d10,d10 + vmlal.u32 q13,d29,d4[1] + vzip.16 d28,d10 + vmlal.u32 q6,d29,d5[0] + vshr.u64 d24,d24,#16 + vmlal.u32 q7,d29,d5[1] + vmlal.u32 q8,d29,d6[0] + vadd.u64 d24,d24,d25 + vmlal.u32 q9,d29,d6[1] + vshr.u64 d24,d24,#16 + vmlal.u32 q10,d29,d7[0] + vmlal.u32 q11,d29,d7[1] + vadd.u64 d26,d26,d24 + vst1.32 {d29},[r10,:64]! 
@ put aside smashed m[8*i+6] + vmlal.u32 q13,d28,d0[0] + vld1.64 {q12},[r6,:128]! + vmlal.u32 q6,d28,d0[1] + veor d8,d8,d8 + vmlal.u32 q7,d28,d1[0] + vshl.i64 d29,d27,#16 + vmlal.u32 q8,d28,d1[1] + vadd.u64 d29,d29,d26 + vmlal.u32 q9,d28,d2[0] + vmul.u32 d29,d29,d30 + vmlal.u32 q10,d28,d2[1] + vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+7] + vmlal.u32 q11,d28,d3[0] + vzip.16 d29,d8 + vmlal.u32 q12,d28,d3[1] + vld1.32 {d28},[sp,:64] @ pull smashed b[8*i+0] + vmlal.u32 q13,d29,d4[0] + vld1.32 {d0,d1,d2,d3},[r1]! + vmlal.u32 q6,d29,d4[1] + vmlal.u32 q7,d29,d5[0] + vshr.u64 d26,d26,#16 + vmlal.u32 q8,d29,d5[1] + vmlal.u32 q9,d29,d6[0] + vadd.u64 d26,d26,d27 + vmlal.u32 q10,d29,d6[1] + vshr.u64 d26,d26,#16 + vmlal.u32 q11,d29,d7[0] + vmlal.u32 q12,d29,d7[1] + vadd.u64 d12,d12,d26 + vst1.32 {d29},[r10,:64] @ put aside smashed m[8*i+7] + add r10,sp,#8 @ rewind + sub r8,r5,#8 + b .LNEON_8n_inner + +.align 4 +.LNEON_8n_inner: + subs r8,r8,#8 + vmlal.u32 q6,d28,d0[0] + vld1.64 {q13},[r6,:128] + vmlal.u32 q7,d28,d0[1] + vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+0] + vmlal.u32 q8,d28,d1[0] + vld1.32 {d4,d5,d6,d7},[r3]! + vmlal.u32 q9,d28,d1[1] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q10,d28,d2[0] + vmlal.u32 q11,d28,d2[1] + vmlal.u32 q12,d28,d3[0] + vmlal.u32 q13,d28,d3[1] + vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+1] + vmlal.u32 q6,d29,d4[0] + vmlal.u32 q7,d29,d4[1] + vmlal.u32 q8,d29,d5[0] + vmlal.u32 q9,d29,d5[1] + vmlal.u32 q10,d29,d6[0] + vmlal.u32 q11,d29,d6[1] + vmlal.u32 q12,d29,d7[0] + vmlal.u32 q13,d29,d7[1] + vst1.64 {q6},[r7,:128]! + vmlal.u32 q7,d28,d0[0] + vld1.64 {q6},[r6,:128] + vmlal.u32 q8,d28,d0[1] + vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+1] + vmlal.u32 q9,d28,d1[0] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q10,d28,d1[1] + vmlal.u32 q11,d28,d2[0] + vmlal.u32 q12,d28,d2[1] + vmlal.u32 q13,d28,d3[0] + vmlal.u32 q6,d28,d3[1] + vld1.32 {d28},[r10,:64]! 
@ pull smashed b[8*i+2] + vmlal.u32 q7,d29,d4[0] + vmlal.u32 q8,d29,d4[1] + vmlal.u32 q9,d29,d5[0] + vmlal.u32 q10,d29,d5[1] + vmlal.u32 q11,d29,d6[0] + vmlal.u32 q12,d29,d6[1] + vmlal.u32 q13,d29,d7[0] + vmlal.u32 q6,d29,d7[1] + vst1.64 {q7},[r7,:128]! + vmlal.u32 q8,d28,d0[0] + vld1.64 {q7},[r6,:128] + vmlal.u32 q9,d28,d0[1] + vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+2] + vmlal.u32 q10,d28,d1[0] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q11,d28,d1[1] + vmlal.u32 q12,d28,d2[0] + vmlal.u32 q13,d28,d2[1] + vmlal.u32 q6,d28,d3[0] + vmlal.u32 q7,d28,d3[1] + vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+3] + vmlal.u32 q8,d29,d4[0] + vmlal.u32 q9,d29,d4[1] + vmlal.u32 q10,d29,d5[0] + vmlal.u32 q11,d29,d5[1] + vmlal.u32 q12,d29,d6[0] + vmlal.u32 q13,d29,d6[1] + vmlal.u32 q6,d29,d7[0] + vmlal.u32 q7,d29,d7[1] + vst1.64 {q8},[r7,:128]! + vmlal.u32 q9,d28,d0[0] + vld1.64 {q8},[r6,:128] + vmlal.u32 q10,d28,d0[1] + vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+3] + vmlal.u32 q11,d28,d1[0] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q12,d28,d1[1] + vmlal.u32 q13,d28,d2[0] + vmlal.u32 q6,d28,d2[1] + vmlal.u32 q7,d28,d3[0] + vmlal.u32 q8,d28,d3[1] + vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+4] + vmlal.u32 q9,d29,d4[0] + vmlal.u32 q10,d29,d4[1] + vmlal.u32 q11,d29,d5[0] + vmlal.u32 q12,d29,d5[1] + vmlal.u32 q13,d29,d6[0] + vmlal.u32 q6,d29,d6[1] + vmlal.u32 q7,d29,d7[0] + vmlal.u32 q8,d29,d7[1] + vst1.64 {q9},[r7,:128]! + vmlal.u32 q10,d28,d0[0] + vld1.64 {q9},[r6,:128] + vmlal.u32 q11,d28,d0[1] + vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+4] + vmlal.u32 q12,d28,d1[0] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q13,d28,d1[1] + vmlal.u32 q6,d28,d2[0] + vmlal.u32 q7,d28,d2[1] + vmlal.u32 q8,d28,d3[0] + vmlal.u32 q9,d28,d3[1] + vld1.32 {d28},[r10,:64]! 
@ pull smashed b[8*i+5] + vmlal.u32 q10,d29,d4[0] + vmlal.u32 q11,d29,d4[1] + vmlal.u32 q12,d29,d5[0] + vmlal.u32 q13,d29,d5[1] + vmlal.u32 q6,d29,d6[0] + vmlal.u32 q7,d29,d6[1] + vmlal.u32 q8,d29,d7[0] + vmlal.u32 q9,d29,d7[1] + vst1.64 {q10},[r7,:128]! + vmlal.u32 q11,d28,d0[0] + vld1.64 {q10},[r6,:128] + vmlal.u32 q12,d28,d0[1] + vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+5] + vmlal.u32 q13,d28,d1[0] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q6,d28,d1[1] + vmlal.u32 q7,d28,d2[0] + vmlal.u32 q8,d28,d2[1] + vmlal.u32 q9,d28,d3[0] + vmlal.u32 q10,d28,d3[1] + vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+6] + vmlal.u32 q11,d29,d4[0] + vmlal.u32 q12,d29,d4[1] + vmlal.u32 q13,d29,d5[0] + vmlal.u32 q6,d29,d5[1] + vmlal.u32 q7,d29,d6[0] + vmlal.u32 q8,d29,d6[1] + vmlal.u32 q9,d29,d7[0] + vmlal.u32 q10,d29,d7[1] + vst1.64 {q11},[r7,:128]! + vmlal.u32 q12,d28,d0[0] + vld1.64 {q11},[r6,:128] + vmlal.u32 q13,d28,d0[1] + vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+6] + vmlal.u32 q6,d28,d1[0] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q7,d28,d1[1] + vmlal.u32 q8,d28,d2[0] + vmlal.u32 q9,d28,d2[1] + vmlal.u32 q10,d28,d3[0] + vmlal.u32 q11,d28,d3[1] + vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+7] + vmlal.u32 q12,d29,d4[0] + vmlal.u32 q13,d29,d4[1] + vmlal.u32 q6,d29,d5[0] + vmlal.u32 q7,d29,d5[1] + vmlal.u32 q8,d29,d6[0] + vmlal.u32 q9,d29,d6[1] + vmlal.u32 q10,d29,d7[0] + vmlal.u32 q11,d29,d7[1] + vst1.64 {q12},[r7,:128]! + vmlal.u32 q13,d28,d0[0] + vld1.64 {q12},[r6,:128] + vmlal.u32 q6,d28,d0[1] + vld1.32 {d29},[r10,:64]! 
@ pull smashed m[8*i+7] + vmlal.u32 q7,d28,d1[0] + it ne + addne r6,r6,#16 @ don't advance in last iteration + vmlal.u32 q8,d28,d1[1] + vmlal.u32 q9,d28,d2[0] + vmlal.u32 q10,d28,d2[1] + vmlal.u32 q11,d28,d3[0] + vmlal.u32 q12,d28,d3[1] + it eq + subeq r1,r1,r5,lsl#2 @ rewind + vmlal.u32 q13,d29,d4[0] + vld1.32 {d28},[sp,:64] @ pull smashed b[8*i+0] + vmlal.u32 q6,d29,d4[1] + vld1.32 {d0,d1,d2,d3},[r1]! + vmlal.u32 q7,d29,d5[0] + add r10,sp,#8 @ rewind + vmlal.u32 q8,d29,d5[1] + vmlal.u32 q9,d29,d6[0] + vmlal.u32 q10,d29,d6[1] + vmlal.u32 q11,d29,d7[0] + vst1.64 {q13},[r7,:128]! + vmlal.u32 q12,d29,d7[1] + + bne .LNEON_8n_inner + add r6,sp,#128 + vst1.64 {q6,q7},[r7,:256]! + veor q2,q2,q2 @ d4-d5 + vst1.64 {q8,q9},[r7,:256]! + veor q3,q3,q3 @ d6-d7 + vst1.64 {q10,q11},[r7,:256]! + vst1.64 {q12},[r7,:128] + + subs r9,r9,#8 + vld1.64 {q6,q7},[r6,:256]! + vld1.64 {q8,q9},[r6,:256]! + vld1.64 {q10,q11},[r6,:256]! + vld1.64 {q12,q13},[r6,:256]! + + itt ne + subne r3,r3,r5,lsl#2 @ rewind + bne .LNEON_8n_outer + + add r7,sp,#128 + vst1.64 {q2,q3}, [sp,:256]! @ start wiping stack frame + vshr.u64 d10,d12,#16 + vst1.64 {q2,q3},[sp,:256]! + vadd.u64 d13,d13,d10 + vst1.64 {q2,q3}, [sp,:256]! + vshr.u64 d10,d13,#16 + vst1.64 {q2,q3}, [sp,:256]! + vzip.16 d12,d13 + + mov r8,r5 + b .LNEON_tail_entry + +.align 4 +.LNEON_tail: + vadd.u64 d12,d12,d10 + vshr.u64 d10,d12,#16 + vld1.64 {q8,q9}, [r6, :256]! + vadd.u64 d13,d13,d10 + vld1.64 {q10,q11}, [r6, :256]! + vshr.u64 d10,d13,#16 + vld1.64 {q12,q13}, [r6, :256]! + vzip.16 d12,d13 + +.LNEON_tail_entry: + vadd.u64 d14,d14,d10 + vst1.32 {d12[0]}, [r7, :32]! + vshr.u64 d10,d14,#16 + vadd.u64 d15,d15,d10 + vshr.u64 d10,d15,#16 + vzip.16 d14,d15 + vadd.u64 d16,d16,d10 + vst1.32 {d14[0]}, [r7, :32]! + vshr.u64 d10,d16,#16 + vadd.u64 d17,d17,d10 + vshr.u64 d10,d17,#16 + vzip.16 d16,d17 + vadd.u64 d18,d18,d10 + vst1.32 {d16[0]}, [r7, :32]! 
+ vshr.u64 d10,d18,#16 + vadd.u64 d19,d19,d10 + vshr.u64 d10,d19,#16 + vzip.16 d18,d19 + vadd.u64 d20,d20,d10 + vst1.32 {d18[0]}, [r7, :32]! + vshr.u64 d10,d20,#16 + vadd.u64 d21,d21,d10 + vshr.u64 d10,d21,#16 + vzip.16 d20,d21 + vadd.u64 d22,d22,d10 + vst1.32 {d20[0]}, [r7, :32]! + vshr.u64 d10,d22,#16 + vadd.u64 d23,d23,d10 + vshr.u64 d10,d23,#16 + vzip.16 d22,d23 + vadd.u64 d24,d24,d10 + vst1.32 {d22[0]}, [r7, :32]! + vshr.u64 d10,d24,#16 + vadd.u64 d25,d25,d10 + vshr.u64 d10,d25,#16 + vzip.16 d24,d25 + vadd.u64 d26,d26,d10 + vst1.32 {d24[0]}, [r7, :32]! + vshr.u64 d10,d26,#16 + vadd.u64 d27,d27,d10 + vshr.u64 d10,d27,#16 + vzip.16 d26,d27 + vld1.64 {q6,q7}, [r6, :256]! + subs r8,r8,#8 + vst1.32 {d26[0]}, [r7, :32]! + bne .LNEON_tail + + vst1.32 {d10[0]}, [r7, :32] @ top-most bit + sub r3,r3,r5,lsl#2 @ rewind r3 + subs r1,sp,#0 @ clear carry flag + add r2,sp,r5,lsl#2 + +.LNEON_sub: + ldmia r1!, {r4,r5,r6,r7} + ldmia r3!, {r8,r9,r10,r11} + sbcs r8, r4,r8 + sbcs r9, r5,r9 + sbcs r10,r6,r10 + sbcs r11,r7,r11 + teq r1,r2 @ preserves carry + stmia r0!, {r8,r9,r10,r11} + bne .LNEON_sub + + ldr r10, [r1] @ load top-most bit + mov r11,sp + veor q0,q0,q0 + sub r11,r2,r11 @ this is num*4 + veor q1,q1,q1 + mov r1,sp + sub r0,r0,r11 @ rewind r0 + mov r3,r2 @ second 3/4th of frame + sbcs r10,r10,#0 @ result is carry flag + +.LNEON_copy_n_zap: + ldmia r1!, {r4,r5,r6,r7} + ldmia r0, {r8,r9,r10,r11} + it cc + movcc r8, r4 + vst1.64 {q0,q1}, [r3,:256]! @ wipe + itt cc + movcc r9, r5 + movcc r10,r6 + vst1.64 {q0,q1}, [r3,:256]! @ wipe + it cc + movcc r11,r7 + ldmia r1, {r4,r5,r6,r7} + stmia r0!, {r8,r9,r10,r11} + sub r1,r1,#16 + ldmia r0, {r8,r9,r10,r11} + it cc + movcc r8, r4 + vst1.64 {q0,q1}, [r1,:256]! @ wipe + itt cc + movcc r9, r5 + movcc r10,r6 + vst1.64 {q0,q1}, [r3,:256]! 
@ wipe + it cc + movcc r11,r7 + teq r1,r2 @ preserves carry + stmia r0!, {r8,r9,r10,r11} + bne .LNEON_copy_n_zap + + mov sp,ip + vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11} + @ No return value + bx lr @ bx lr +.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon +#endif +.byte 77,111,110,116,103,111,109,101,114,121,32,109,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) diff --git a/third_party/boringssl/gen/bcm/armv8-mont-apple.S b/third_party/boringssl/gen/bcm/armv8-mont-apple.S new file mode 100644 index 00000000..5e9a8e91 --- /dev/null +++ b/third_party/boringssl/gen/bcm/armv8-mont-apple.S @@ -0,0 +1,1442 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) +.text + +.globl _bn_mul_mont_words +.private_extern _bn_mul_mont_words + +.align 5 +_bn_mul_mont_words: + AARCH64_SIGN_LINK_REGISTER + tst x5,#7 + b.eq __bn_sqr8x_mont + tst x5,#3 + b.eq __bn_mul4x_mont +Lmul_mont: + stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldr x9,[x2],#8 // bp[0] + sub x22,sp,x5,lsl#3 + ldp x7,x8,[x1],#16 // ap[0..1] + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + and x22,x22,#-16 // ABI says so + ldp x13,x14,[x3],#16 // np[0..1] + + mul x6,x7,x9 // ap[0]*bp[0] + sub x21,x5,#16 // j=num-2 + umulh x7,x7,x9 + mul x10,x8,x9 // ap[1]*bp[0] + umulh x11,x8,x9 + + mul x15,x6,x4 // "tp[0]"*n0 + // This can allocate at most 8 * BN_MONTGOMERY_MAX_WORDS on the stack, + // or 2 KiB. This fits well within a page, so it is not necessary to + // fault pages in the correct order. 
+ mov sp,x22 // alloca + + // (*) mul x12,x13,x15 // np[0]*m1 + umulh x13,x13,x15 + mul x16,x14,x15 // np[1]*m1 + // (*) adds x12,x12,x6 // discarded + // (*) As for removal of first multiplication and addition + // instructions. The outcome of first addition is + // guaranteed to be zero, which leaves two computationally + // significant outcomes: it either carries or not. Then + // question is when does it carry? Is there alternative + // way to deduce it? If you follow operations, you can + // observe that condition for carry is quite simple: + // x6 being non-zero. So that carry can be calculated + // by adding -1 to x6. That's what next instruction does. + subs xzr,x6,#1 // (*) + umulh x17,x14,x15 + adc x13,x13,xzr + cbz x21,L1st_skip + +L1st: + ldr x8,[x1],#8 + adds x6,x10,x7 + sub x21,x21,#8 // j-- + adc x7,x11,xzr + + ldr x14,[x3],#8 + adds x12,x16,x13 + mul x10,x8,x9 // ap[j]*bp[0] + adc x13,x17,xzr + umulh x11,x8,x9 + + adds x12,x12,x6 + mul x16,x14,x15 // np[j]*m1 + adc x13,x13,xzr + umulh x17,x14,x15 + str x12,[x22],#8 // tp[j-1] + cbnz x21,L1st + +L1st_skip: + adds x6,x10,x7 + sub x1,x1,x5 // rewind x1 + adc x7,x11,xzr + + adds x12,x16,x13 + sub x3,x3,x5 // rewind x3 + adc x13,x17,xzr + + adds x12,x12,x6 + sub x20,x5,#8 // i=num-1 + adcs x13,x13,x7 + + adc x19,xzr,xzr // upmost overflow bit + stp x12,x13,[x22] + +Louter: + ldr x9,[x2],#8 // bp[i] + ldp x7,x8,[x1],#16 + ldr x23,[sp] // tp[0] + add x22,sp,#8 + + mul x6,x7,x9 // ap[0]*bp[i] + sub x21,x5,#16 // j=num-2 + umulh x7,x7,x9 + ldp x13,x14,[x3],#16 + mul x10,x8,x9 // ap[1]*bp[i] + adds x6,x6,x23 + umulh x11,x8,x9 + adc x7,x7,xzr + + mul x15,x6,x4 + sub x20,x20,#8 // i-- + + // (*) mul x12,x13,x15 // np[0]*m1 + umulh x13,x13,x15 + mul x16,x14,x15 // np[1]*m1 + // (*) adds x12,x12,x6 + subs xzr,x6,#1 // (*) + umulh x17,x14,x15 + cbz x21,Linner_skip + +Linner: + ldr x8,[x1],#8 + adc x13,x13,xzr + ldr x23,[x22],#8 // tp[j] + adds x6,x10,x7 + sub x21,x21,#8 // j-- + adc x7,x11,xzr + + adds x12,x16,x13 
+ ldr x14,[x3],#8 + adc x13,x17,xzr + + mul x10,x8,x9 // ap[j]*bp[i] + adds x6,x6,x23 + umulh x11,x8,x9 + adc x7,x7,xzr + + mul x16,x14,x15 // np[j]*m1 + adds x12,x12,x6 + umulh x17,x14,x15 + str x12,[x22,#-16] // tp[j-1] + cbnz x21,Linner + +Linner_skip: + ldr x23,[x22],#8 // tp[j] + adc x13,x13,xzr + adds x6,x10,x7 + sub x1,x1,x5 // rewind x1 + adc x7,x11,xzr + + adds x12,x16,x13 + sub x3,x3,x5 // rewind x3 + adcs x13,x17,x19 + adc x19,xzr,xzr + + adds x6,x6,x23 + adc x7,x7,xzr + + adds x12,x12,x6 + adcs x13,x13,x7 + adc x19,x19,xzr // upmost overflow bit + stp x12,x13,[x22,#-16] + + cbnz x20,Louter + + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. + ldr x23,[sp] // tp[0] + add x22,sp,#8 + ldr x14,[x3],#8 // np[0] + subs x21,x5,#8 // j=num-1 and clear borrow + mov x1,x0 +Lsub: + sbcs x8,x23,x14 // tp[j]-np[j] + ldr x23,[x22],#8 + sub x21,x21,#8 // j-- + ldr x14,[x3],#8 + str x8,[x1],#8 // rp[j]=tp[j]-np[j] + cbnz x21,Lsub + + sbcs x8,x23,x14 + sbcs x19,x19,xzr // did it borrow? + str x8,[x1],#8 // rp[num-1] + + ldr x23,[sp] // tp[0] + add x22,sp,#8 + ldr x8,[x0],#8 // rp[0] + sub x5,x5,#8 // num-- + nop +Lcond_copy: + sub x5,x5,#8 // num-- + csel x14,x23,x8,lo // did it borrow? + ldr x23,[x22],#8 + ldr x8,[x0],#8 + str xzr,[x22,#-16] // wipe tp + str x14,[x0,#-16] + cbnz x5,Lcond_copy + + csel x14,x23,x8,lo + str xzr,[x22,#-8] // wipe tp + str x14,[x0,#-8] + + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + // No return value + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +.align 5 +__bn_sqr8x_mont: + // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to + // only from bn_mul_mont_words which has already signed the return address. + cmp x1,x2 + b.ne __bn_mul4x_mont +Lsqr8x_mont: + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x0,x3,[sp,#96] // offload rp and np + + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + ldp x10,x11,[x1,#8*4] + ldp x12,x13,[x1,#8*6] + + // This can allocate at most 16 * BN_MONTGOMERY_MAX_WORDS on the stack, + // or 4 KiB. The fixed allocation above pushes to just above a page. On + // Windows, we must ensure new pages are first accessed in order. See + // https://learn.microsoft.com/en-us/cpp/build/arm64-windows-abi-conventions?view=msvc-170#stack + // + // The order is correct, but precariously so: the code above access as + // low as [sp,#16]. This leaves a jump of 16 + 4096 = 4112 bytes. If + // [sp,#16] were at page boundary, those 4112 bytes would span two + // pages. If [x2] were the next access, we would skip a guard page. + // + // Fortunately, the first access is [x2,#8*8], at .Lsqr8x_zero_start. + // We jump at most 4112 - 64 = 4048 bytes, less than a page. If any of + // this changes, we must insert a no-op access or call __chkstk. 
+ sub x2,sp,x5,lsl#4 + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + mov sp,x2 // alloca + sub x27,x5,#8*8 + b Lsqr8x_zero_start + +Lsqr8x_zero: + sub x27,x27,#8*8 + stp xzr,xzr,[x2,#8*0] + stp xzr,xzr,[x2,#8*2] + stp xzr,xzr,[x2,#8*4] + stp xzr,xzr,[x2,#8*6] +Lsqr8x_zero_start: + stp xzr,xzr,[x2,#8*8] + stp xzr,xzr,[x2,#8*10] + stp xzr,xzr,[x2,#8*12] + stp xzr,xzr,[x2,#8*14] + add x2,x2,#8*16 + cbnz x27,Lsqr8x_zero + + add x3,x1,x5 + add x1,x1,#8*8 + mov x19,xzr + mov x20,xzr + mov x21,xzr + mov x22,xzr + mov x23,xzr + mov x24,xzr + mov x25,xzr + mov x26,xzr + mov x2,sp + str x4,[x29,#112] // offload n0 + + // Multiply everything but a[i]*a[i] +.align 4 +Lsqr8x_outer_loop: + // a[1]a[0] (i) + // a[2]a[0] + // a[3]a[0] + // a[4]a[0] + // a[5]a[0] + // a[6]a[0] + // a[7]a[0] + // a[2]a[1] (ii) + // a[3]a[1] + // a[4]a[1] + // a[5]a[1] + // a[6]a[1] + // a[7]a[1] + // a[3]a[2] (iii) + // a[4]a[2] + // a[5]a[2] + // a[6]a[2] + // a[7]a[2] + // a[4]a[3] (iv) + // a[5]a[3] + // a[6]a[3] + // a[7]a[3] + // a[5]a[4] (v) + // a[6]a[4] + // a[7]a[4] + // a[6]a[5] (vi) + // a[7]a[5] + // a[7]a[6] (vii) + + mul x14,x7,x6 // lo(a[1..7]*a[0]) (i) + mul x15,x8,x6 + mul x16,x9,x6 + mul x17,x10,x6 + adds x20,x20,x14 // t[1]+lo(a[1]*a[0]) + mul x14,x11,x6 + adcs x21,x21,x15 + mul x15,x12,x6 + adcs x22,x22,x16 + mul x16,x13,x6 + adcs x23,x23,x17 + umulh x17,x7,x6 // hi(a[1..7]*a[0]) + adcs x24,x24,x14 + umulh x14,x8,x6 + adcs x25,x25,x15 + umulh x15,x9,x6 + adcs x26,x26,x16 + umulh x16,x10,x6 + stp x19,x20,[x2],#8*2 // t[0..1] + adc x19,xzr,xzr // t[8] + adds x21,x21,x17 // t[2]+lo(a[1]*a[0]) + umulh x17,x11,x6 + adcs x22,x22,x14 + umulh x14,x12,x6 + adcs x23,x23,x15 + umulh x15,x13,x6 + adcs x24,x24,x16 + mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii) + adcs x25,x25,x17 + mul x17,x9,x7 + adcs x26,x26,x14 + mul x14,x10,x7 + adc x19,x19,x15 + + mul x15,x11,x7 + adds x22,x22,x16 + mul x16,x12,x7 + adcs x23,x23,x17 + mul x17,x13,x7 + adcs x24,x24,x14 + umulh x14,x8,x7 // hi(a[2..7]*a[1]) + adcs 
x25,x25,x15 + umulh x15,x9,x7 + adcs x26,x26,x16 + umulh x16,x10,x7 + adcs x19,x19,x17 + umulh x17,x11,x7 + stp x21,x22,[x2],#8*2 // t[2..3] + adc x20,xzr,xzr // t[9] + adds x23,x23,x14 + umulh x14,x12,x7 + adcs x24,x24,x15 + umulh x15,x13,x7 + adcs x25,x25,x16 + mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii) + adcs x26,x26,x17 + mul x17,x10,x8 + adcs x19,x19,x14 + mul x14,x11,x8 + adc x20,x20,x15 + + mul x15,x12,x8 + adds x24,x24,x16 + mul x16,x13,x8 + adcs x25,x25,x17 + umulh x17,x9,x8 // hi(a[3..7]*a[2]) + adcs x26,x26,x14 + umulh x14,x10,x8 + adcs x19,x19,x15 + umulh x15,x11,x8 + adcs x20,x20,x16 + umulh x16,x12,x8 + stp x23,x24,[x2],#8*2 // t[4..5] + adc x21,xzr,xzr // t[10] + adds x25,x25,x17 + umulh x17,x13,x8 + adcs x26,x26,x14 + mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv) + adcs x19,x19,x15 + mul x15,x11,x9 + adcs x20,x20,x16 + mul x16,x12,x9 + adc x21,x21,x17 + + mul x17,x13,x9 + adds x26,x26,x14 + umulh x14,x10,x9 // hi(a[4..7]*a[3]) + adcs x19,x19,x15 + umulh x15,x11,x9 + adcs x20,x20,x16 + umulh x16,x12,x9 + adcs x21,x21,x17 + umulh x17,x13,x9 + stp x25,x26,[x2],#8*2 // t[6..7] + adc x22,xzr,xzr // t[11] + adds x19,x19,x14 + mul x14,x11,x10 // lo(a[5..7]*a[4]) (v) + adcs x20,x20,x15 + mul x15,x12,x10 + adcs x21,x21,x16 + mul x16,x13,x10 + adc x22,x22,x17 + + umulh x17,x11,x10 // hi(a[5..7]*a[4]) + adds x20,x20,x14 + umulh x14,x12,x10 + adcs x21,x21,x15 + umulh x15,x13,x10 + adcs x22,x22,x16 + mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi) + adc x23,xzr,xzr // t[12] + adds x21,x21,x17 + mul x17,x13,x11 + adcs x22,x22,x14 + umulh x14,x12,x11 // hi(a[6..7]*a[5]) + adc x23,x23,x15 + + umulh x15,x13,x11 + adds x22,x22,x16 + mul x16,x13,x12 // lo(a[7]*a[6]) (vii) + adcs x23,x23,x17 + umulh x17,x13,x12 // hi(a[7]*a[6]) + adc x24,xzr,xzr // t[13] + adds x23,x23,x14 + sub x27,x3,x1 // done yet? 
+ adc x24,x24,x15 + + adds x24,x24,x16 + sub x14,x3,x5 // rewound ap + adc x25,xzr,xzr // t[14] + add x25,x25,x17 + + cbz x27,Lsqr8x_outer_break + + mov x4,x6 + ldp x6,x7,[x2,#8*0] + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + adds x19,x19,x6 + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x0,x1 + adcs x26,xzr,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved below + mov x27,#-8*8 + + // a[8]a[0] + // a[9]a[0] + // a[a]a[0] + // a[b]a[0] + // a[c]a[0] + // a[d]a[0] + // a[e]a[0] + // a[f]a[0] + // a[8]a[1] + // a[f]a[1]........................ + // a[8]a[2] + // a[f]a[2]........................ + // a[8]a[3] + // a[f]a[3]........................ + // a[8]a[4] + // a[f]a[4]........................ + // a[8]a[5] + // a[f]a[5]........................ + // a[8]a[6] + // a[f]a[6]........................ + // a[8]a[7] + // a[f]a[7]........................ +Lsqr8x_mul: + mul x14,x6,x4 + adc x28,xzr,xzr // carry bit, modulo-scheduled + mul x15,x7,x4 + add x27,x27,#8 + mul x16,x8,x4 + mul x17,x9,x4 + adds x19,x19,x14 + mul x14,x10,x4 + adcs x20,x20,x15 + mul x15,x11,x4 + adcs x21,x21,x16 + mul x16,x12,x4 + adcs x22,x22,x17 + mul x17,x13,x4 + adcs x23,x23,x14 + umulh x14,x6,x4 + adcs x24,x24,x15 + umulh x15,x7,x4 + adcs x25,x25,x16 + umulh x16,x8,x4 + adcs x26,x26,x17 + umulh x17,x9,x4 + adc x28,x28,xzr + str x19,[x2],#8 + adds x19,x20,x14 + umulh x14,x10,x4 + adcs x20,x21,x15 + umulh x15,x11,x4 + adcs x21,x22,x16 + umulh x16,x12,x4 + adcs x22,x23,x17 + umulh x17,x13,x4 + ldr x4,[x0,x27] + adcs x23,x24,x14 + adcs x24,x25,x15 + adcs x25,x26,x16 + adcs x26,x28,x17 + //adc x28,xzr,xzr // moved above + cbnz x27,Lsqr8x_mul + // note that carry flag is guaranteed + // to be zero at this point + cmp x1,x3 // done yet? 
+ b.eq Lsqr8x_break + + ldp x6,x7,[x2,#8*0] + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + adds x19,x19,x6 + ldr x4,[x0,#-8*8] + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x27,#-8*8 + adcs x26,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved above + b Lsqr8x_mul + +.align 4 +Lsqr8x_break: + ldp x6,x7,[x0,#8*0] + add x1,x0,#8*8 + ldp x8,x9,[x0,#8*2] + sub x14,x3,x1 // is it last iteration? + ldp x10,x11,[x0,#8*4] + sub x15,x2,x14 + ldp x12,x13,[x0,#8*6] + cbz x14,Lsqr8x_outer_loop + + stp x19,x20,[x2,#8*0] + ldp x19,x20,[x15,#8*0] + stp x21,x22,[x2,#8*2] + ldp x21,x22,[x15,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[x15,#8*4] + stp x25,x26,[x2,#8*6] + mov x2,x15 + ldp x25,x26,[x15,#8*6] + b Lsqr8x_outer_loop + +.align 4 +Lsqr8x_outer_break: + // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0] + ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0] + ldp x15,x16,[sp,#8*1] + ldp x11,x13,[x14,#8*2] + add x1,x14,#8*4 + ldp x17,x14,[sp,#8*3] + + stp x19,x20,[x2,#8*0] + mul x19,x7,x7 + stp x21,x22,[x2,#8*2] + umulh x7,x7,x7 + stp x23,x24,[x2,#8*4] + mul x8,x9,x9 + stp x25,x26,[x2,#8*6] + mov x2,sp + umulh x9,x9,x9 + adds x20,x7,x15,lsl#1 + extr x15,x16,x15,#63 + sub x27,x5,#8*4 + +Lsqr4x_shift_n_add: + adcs x21,x8,x15 + extr x16,x17,x16,#63 + sub x27,x27,#8*4 + adcs x22,x9,x16 + ldp x15,x16,[x2,#8*5] + mul x10,x11,x11 + ldp x7,x9,[x1],#8*2 + umulh x11,x11,x11 + mul x12,x13,x13 + umulh x13,x13,x13 + extr x17,x14,x17,#63 + stp x19,x20,[x2,#8*0] + adcs x23,x10,x17 + extr x14,x15,x14,#63 + stp x21,x22,[x2,#8*2] + adcs x24,x11,x14 + ldp x17,x14,[x2,#8*7] + extr x15,x16,x15,#63 + adcs x25,x12,x15 + extr x16,x17,x16,#63 + adcs x26,x13,x16 + ldp x15,x16,[x2,#8*9] + mul x6,x7,x7 + ldp x11,x13,[x1],#8*2 + umulh x7,x7,x7 + mul x8,x9,x9 + umulh x9,x9,x9 + stp x23,x24,[x2,#8*4] + 
extr x17,x14,x17,#63 + stp x25,x26,[x2,#8*6] + add x2,x2,#8*8 + adcs x19,x6,x17 + extr x14,x15,x14,#63 + adcs x20,x7,x14 + ldp x17,x14,[x2,#8*3] + extr x15,x16,x15,#63 + cbnz x27,Lsqr4x_shift_n_add + ldp x1,x4,[x29,#104] // pull np and n0 + + adcs x21,x8,x15 + extr x16,x17,x16,#63 + adcs x22,x9,x16 + ldp x15,x16,[x2,#8*5] + mul x10,x11,x11 + umulh x11,x11,x11 + stp x19,x20,[x2,#8*0] + mul x12,x13,x13 + umulh x13,x13,x13 + stp x21,x22,[x2,#8*2] + extr x17,x14,x17,#63 + adcs x23,x10,x17 + extr x14,x15,x14,#63 + ldp x19,x20,[sp,#8*0] + adcs x24,x11,x14 + extr x15,x16,x15,#63 + ldp x6,x7,[x1,#8*0] + adcs x25,x12,x15 + extr x16,xzr,x16,#63 + ldp x8,x9,[x1,#8*2] + adc x26,x13,x16 + ldp x10,x11,[x1,#8*4] + + // Reduce by 512 bits per iteration + mul x28,x4,x19 // t[0]*n0 + ldp x12,x13,[x1,#8*6] + add x3,x1,x5 + ldp x21,x22,[sp,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[sp,#8*4] + stp x25,x26,[x2,#8*6] + ldp x25,x26,[sp,#8*6] + add x1,x1,#8*8 + mov x30,xzr // initial top-most carry + mov x2,sp + mov x27,#8 + +Lsqr8x_reduction: + // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0) + mul x15,x7,x28 + sub x27,x27,#1 + mul x16,x8,x28 + str x28,[x2],#8 // put aside t[0]*n0 for tail processing + mul x17,x9,x28 + // (*) adds xzr,x19,x14 + subs xzr,x19,#1 // (*) + mul x14,x10,x28 + adcs x19,x20,x15 + mul x15,x11,x28 + adcs x20,x21,x16 + mul x16,x12,x28 + adcs x21,x22,x17 + mul x17,x13,x28 + adcs x22,x23,x14 + umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0) + adcs x23,x24,x15 + umulh x15,x7,x28 + adcs x24,x25,x16 + umulh x16,x8,x28 + adcs x25,x26,x17 + umulh x17,x9,x28 + adc x26,xzr,xzr + adds x19,x19,x14 + umulh x14,x10,x28 + adcs x20,x20,x15 + umulh x15,x11,x28 + adcs x21,x21,x16 + umulh x16,x12,x28 + adcs x22,x22,x17 + umulh x17,x13,x28 + mul x28,x4,x19 // next t[0]*n0 + adcs x23,x23,x14 + adcs x24,x24,x15 + adcs x25,x25,x16 + adc x26,x26,x17 + cbnz x27,Lsqr8x_reduction + + ldp x14,x15,[x2,#8*0] + ldp x16,x17,[x2,#8*2] + mov x0,x2 + sub x27,x3,x1 // done yet? 
+ adds x19,x19,x14 + adcs x20,x20,x15 + ldp x14,x15,[x2,#8*4] + adcs x21,x21,x16 + adcs x22,x22,x17 + ldp x16,x17,[x2,#8*6] + adcs x23,x23,x14 + adcs x24,x24,x15 + adcs x25,x25,x16 + adcs x26,x26,x17 + //adc x28,xzr,xzr // moved below + cbz x27,Lsqr8x8_post_condition + + ldr x4,[x2,#-8*8] + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + ldp x10,x11,[x1,#8*4] + mov x27,#-8*8 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + +Lsqr8x_tail: + mul x14,x6,x4 + adc x28,xzr,xzr // carry bit, modulo-scheduled + mul x15,x7,x4 + add x27,x27,#8 + mul x16,x8,x4 + mul x17,x9,x4 + adds x19,x19,x14 + mul x14,x10,x4 + adcs x20,x20,x15 + mul x15,x11,x4 + adcs x21,x21,x16 + mul x16,x12,x4 + adcs x22,x22,x17 + mul x17,x13,x4 + adcs x23,x23,x14 + umulh x14,x6,x4 + adcs x24,x24,x15 + umulh x15,x7,x4 + adcs x25,x25,x16 + umulh x16,x8,x4 + adcs x26,x26,x17 + umulh x17,x9,x4 + adc x28,x28,xzr + str x19,[x2],#8 + adds x19,x20,x14 + umulh x14,x10,x4 + adcs x20,x21,x15 + umulh x15,x11,x4 + adcs x21,x22,x16 + umulh x16,x12,x4 + adcs x22,x23,x17 + umulh x17,x13,x4 + ldr x4,[x0,x27] + adcs x23,x24,x14 + adcs x24,x25,x15 + adcs x25,x26,x16 + adcs x26,x28,x17 + //adc x28,xzr,xzr // moved above + cbnz x27,Lsqr8x_tail + // note that carry flag is guaranteed + // to be zero at this point + ldp x6,x7,[x2,#8*0] + sub x27,x3,x1 // done yet? 
+ sub x16,x3,x5 // rewound np + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + cbz x27,Lsqr8x_tail_break + + ldr x4,[x0,#-8*8] + adds x19,x19,x6 + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x27,#-8*8 + adcs x26,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved above + b Lsqr8x_tail + +.align 4 +Lsqr8x_tail_break: + ldr x4,[x29,#112] // pull n0 + add x27,x2,#8*8 // end of current t[num] window + + subs xzr,x30,#1 // "move" top-most carry to carry bit + adcs x14,x19,x6 + adcs x15,x20,x7 + ldp x19,x20,[x0,#8*0] + adcs x21,x21,x8 + ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0] + adcs x22,x22,x9 + ldp x8,x9,[x16,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x16,#8*4] + adcs x25,x25,x12 + adcs x26,x26,x13 + ldp x12,x13,[x16,#8*6] + add x1,x16,#8*8 + adc x30,xzr,xzr // top-most carry + mul x28,x4,x19 + stp x14,x15,[x2,#8*0] + stp x21,x22,[x2,#8*2] + ldp x21,x22,[x0,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[x0,#8*4] + cmp x27,x29 // did we hit the bottom? + stp x25,x26,[x2,#8*6] + mov x2,x0 // slide the window + ldp x25,x26,[x0,#8*6] + mov x27,#8 + b.ne Lsqr8x_reduction + + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. 
+ ldr x0,[x29,#96] // pull rp + add x2,x2,#8*8 + subs x14,x19,x6 + sbcs x15,x20,x7 + sub x27,x5,#8*8 + mov x3,x0 // x0 copy + +Lsqr8x_sub: + sbcs x16,x21,x8 + ldp x6,x7,[x1,#8*0] + sbcs x17,x22,x9 + stp x14,x15,[x0,#8*0] + sbcs x14,x23,x10 + ldp x8,x9,[x1,#8*2] + sbcs x15,x24,x11 + stp x16,x17,[x0,#8*2] + sbcs x16,x25,x12 + ldp x10,x11,[x1,#8*4] + sbcs x17,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + ldp x19,x20,[x2,#8*0] + sub x27,x27,#8*8 + ldp x21,x22,[x2,#8*2] + ldp x23,x24,[x2,#8*4] + ldp x25,x26,[x2,#8*6] + add x2,x2,#8*8 + stp x14,x15,[x0,#8*4] + sbcs x14,x19,x6 + stp x16,x17,[x0,#8*6] + add x0,x0,#8*8 + sbcs x15,x20,x7 + cbnz x27,Lsqr8x_sub + + sbcs x16,x21,x8 + mov x2,sp + add x1,sp,x5 + ldp x6,x7,[x3,#8*0] + sbcs x17,x22,x9 + stp x14,x15,[x0,#8*0] + sbcs x14,x23,x10 + ldp x8,x9,[x3,#8*2] + sbcs x15,x24,x11 + stp x16,x17,[x0,#8*2] + sbcs x16,x25,x12 + ldp x19,x20,[x1,#8*0] + sbcs x17,x26,x13 + ldp x21,x22,[x1,#8*2] + sbcs xzr,x30,xzr // did it borrow? + ldr x30,[x29,#8] // pull return address + stp x14,x15,[x0,#8*4] + stp x16,x17,[x0,#8*6] + + sub x27,x5,#8*4 +Lsqr4x_cond_copy: + sub x27,x27,#8*4 + csel x14,x19,x6,lo + stp xzr,xzr,[x2,#8*0] + csel x15,x20,x7,lo + ldp x6,x7,[x3,#8*4] + ldp x19,x20,[x1,#8*4] + csel x16,x21,x8,lo + stp xzr,xzr,[x2,#8*2] + add x2,x2,#8*4 + csel x17,x22,x9,lo + ldp x8,x9,[x3,#8*6] + ldp x21,x22,[x1,#8*6] + add x1,x1,#8*4 + stp x14,x15,[x3,#8*0] + stp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + stp xzr,xzr,[x1,#8*0] + stp xzr,xzr,[x1,#8*2] + cbnz x27,Lsqr4x_cond_copy + + csel x14,x19,x6,lo + stp xzr,xzr,[x2,#8*0] + csel x15,x20,x7,lo + stp xzr,xzr,[x2,#8*2] + csel x16,x21,x8,lo + csel x17,x22,x9,lo + stp x14,x15,[x3,#8*0] + stp x16,x17,[x3,#8*2] + + b Lsqr8x_done + +.align 4 +Lsqr8x8_post_condition: + adc x28,xzr,xzr + ldr x30,[x29,#8] // pull return address + // x19-7,x28 hold result, x6-7 hold modulus + subs x6,x19,x6 + ldr x1,[x29,#96] // pull rp + sbcs x7,x20,x7 + stp xzr,xzr,[sp,#8*0] + sbcs x8,x21,x8 + stp xzr,xzr,[sp,#8*2] 
+ sbcs x9,x22,x9 + stp xzr,xzr,[sp,#8*4] + sbcs x10,x23,x10 + stp xzr,xzr,[sp,#8*6] + sbcs x11,x24,x11 + stp xzr,xzr,[sp,#8*8] + sbcs x12,x25,x12 + stp xzr,xzr,[sp,#8*10] + sbcs x13,x26,x13 + stp xzr,xzr,[sp,#8*12] + sbcs x28,x28,xzr // did it borrow? + stp xzr,xzr,[sp,#8*14] + + // x6-7 hold result-modulus + csel x6,x19,x6,lo + csel x7,x20,x7,lo + csel x8,x21,x8,lo + csel x9,x22,x9,lo + stp x6,x7,[x1,#8*0] + csel x10,x23,x10,lo + csel x11,x24,x11,lo + stp x8,x9,[x1,#8*2] + csel x12,x25,x12,lo + csel x13,x26,x13,lo + stp x10,x11,[x1,#8*4] + stp x12,x13,[x1,#8*6] + +Lsqr8x_done: + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + // No return value + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + // x30 is popped earlier + AARCH64_VALIDATE_LINK_REGISTER + ret + + +.align 5 +__bn_mul4x_mont: + // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to + // only from bn_mul_mont_words or __bn_mul8x_mont which have already signed the + // return address. Register use below: x0=rp, x1=a, x2=b, x3=n, x4=&n0, x5=num in 64-bit words; operands are consumed four words at a time. NOTE(review): the 8x routine that branches here elsewhere in this file is named __bn_sqr8x_mont, so __bn_mul8x_mont looks like a stale name - confirm. + stp x29,x30,[sp,#-128]! // 128-byte frame: x29/x30 here, x19-x28 saved below + add x29,sp,#0 // x29 = frame pointer + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + // This can allocate at most 8 * BN_MONTGOMERY_MAX_WORDS on the stack, + // or 2 KiB. This fits well within a page, so it is not necessary to + // fault pages in the correct order.
+ sub x26,sp,x5,lsl#3 + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + sub sp,x26,#8*4 // alloca: 8*num bytes of scratch and four extra words + + add x10,x2,x5 // x10 = &b[num] (x5 is num*8 bytes from here on) + add x27,x1,x5 // x27 = &a[num], used by the done-yet checks + stp x0,x10,[x29,#96] // offload rp and &b[num] + + ldr x24,[x2,#8*0] // b[0] + ldp x6,x7,[x1,#8*0] // a[0..3] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + mov x19,xzr // x19-x22 = running t[0..3], start at zero + mov x20,xzr + mov x21,xzr + mov x22,xzr + ldp x14,x15,[x3,#8*0] // n[0..3] + ldp x16,x17,[x3,#8*2] + adds x3,x3,#8*4 // clear carry bit + mov x0,xzr // x0 = carry deferred into the next iteration + mov x28,#0 // x28 = byte offset of the current b word within a group of four + mov x26,sp // x26 = store cursor into the stack scratch + +Loop_mul4x_1st_reduction: + mul x10,x6,x24 // lo(a[0..3]*b[0]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 // offset wraps to 0 after four b words, which ends this loop + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[0..3]*b[0]) + adcs x20,x20,x11 + mul x25,x19,x4 // t[0]*n0 + adcs x21,x21,x12 + umulh x11,x7,x24 + adcs x22,x22,x13 + umulh x12,x8,x24 + adc x23,xzr,xzr + umulh x13,x9,x24 + ldr x24,[x2,x28] // next b[i] (or b[0]) + adds x20,x20,x10 + // (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0) + str x25,[x26],#8 // put aside t[0]*n0 for tail processing + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + // (*) adds xzr,x19,x10 + subs xzr,x19,#1 // (*) carry is set exactly when x19 is non-zero, standing in for the dropped low-word add + umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0) + adcs x19,x20,x11 + umulh x11,x15,x25 + adcs x20,x21,x12 + umulh x12,x16,x25 + adcs x21,x22,x13 + umulh x13,x17,x25 + adcs x22,x23,x0 + adc x0,xzr,xzr + adds x19,x19,x10 + sub x10,x27,x1 // bytes of a[] not yet read (zero when num is 4) + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + cbnz x28,Loop_mul4x_1st_reduction + + cbz x10,Lmul4x4_post_condition // nothing left of a[]: finish the four-word case directly + + ldp x6,x7,[x1,#8*0] // a[4..7] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + ldr x25,[sp] // first t[0]*n0 value put aside by the loop above + ldp x14,x15,[x3,#8*0] // n[4..7] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + +Loop_mul4x_1st_tail: + mul x10,x6,x24 // lo(a[4..7]*b[i]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 //
hi(a[4..7]*b[i]) + adcs x20,x20,x11 + umulh x11,x7,x24 + adcs x21,x21,x12 + umulh x12,x8,x24 + adcs x22,x22,x13 + umulh x13,x9,x24 + adc x23,xzr,xzr + ldr x24,[x2,x28] // next b[i] (or b[0]) + adds x20,x20,x10 + mul x10,x14,x25 // lo(n[4..7]*t[0]*n0) + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + adds x19,x19,x10 + umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0) + adcs x20,x20,x11 + umulh x11,x15,x25 + adcs x21,x21,x12 + umulh x12,x16,x25 + adcs x22,x22,x13 + adcs x23,x23,x0 + umulh x13,x17,x25 + adc x0,xzr,xzr + ldr x25,[sp,x28] // next t[0]*n0 + str x19,[x26],#8 // result!!! + adds x19,x20,x10 + sub x10,x27,x1 // done yet? + adcs x20,x21,x11 + adcs x21,x22,x12 + adcs x22,x23,x13 + //adc x0,x0,xzr + cbnz x28,Loop_mul4x_1st_tail + + sub x11,x27,x5 // rewound x1: x11 = &a[0] + cbz x10,Lmul4x_proceed // all of a[] consumed for this group of b words + + ldp x6,x7,[x1,#8*0] // next four words of a + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + ldp x14,x15,[x3,#8*0] // next four words of n + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + b Loop_mul4x_1st_tail + +.align 5 +Lmul4x_proceed: + ldr x24,[x2,#8*4]! // *++b (b advances by four words) + adc x30,x0,xzr // x30 = top-most carry; the return address is reloaded from the frame before ret + ldp x6,x7,[x11,#8*0] // a[0..3] + sub x3,x3,x5 // rewind np + ldp x8,x9,[x11,#8*2] + add x1,x11,#8*4 // x1 = &a[4] + + stp x19,x20,[x26,#8*0] // result!!! + ldp x19,x20,[sp,#8*4] // t[0..3] + stp x21,x22,[x26,#8*2] // result!!!
+ ldp x21,x22,[sp,#8*6] + + ldp x14,x15,[x3,#8*0] // n[0..3] + mov x26,sp // scratch cursor back to the start + ldp x16,x17,[x3,#8*2] + adds x3,x3,#8*4 // clear carry bit + mov x0,xzr + +.align 4 +Loop_mul4x_reduction: + mul x10,x6,x24 // lo(a[0..3]*b[4]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[0..3]*b[4]) + adcs x20,x20,x11 + mul x25,x19,x4 // t[0]*n0 + adcs x21,x21,x12 + umulh x11,x7,x24 + adcs x22,x22,x13 + umulh x12,x8,x24 + adc x23,xzr,xzr + umulh x13,x9,x24 + ldr x24,[x2,x28] // next b[i] + adds x20,x20,x10 + // (*) mul x10,x14,x25 + str x25,[x26],#8 // put aside t[0]*n0 for tail processing + adcs x21,x21,x11 + mul x11,x15,x25 // lo(n[0..3]*t[0]*n0) + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + // (*) adds xzr,x19,x10 + subs xzr,x19,#1 // (*) + umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0) + adcs x19,x20,x11 + umulh x11,x15,x25 + adcs x20,x21,x12 + umulh x12,x16,x25 + adcs x21,x22,x13 + umulh x13,x17,x25 + adcs x22,x23,x0 + adc x0,xzr,xzr + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + cbnz x28,Loop_mul4x_reduction + + adc x0,x0,xzr + ldp x10,x11,[x26,#8*4] // t[4..7] + ldp x12,x13,[x26,#8*6] + ldp x6,x7,[x1,#8*0] // a[4..7] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + + ldr x25,[sp] // t[0]*n0 + ldp x14,x15,[x3,#8*0] // n[4..7] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + +.align 4 +Loop_mul4x_tail: + mul x10,x6,x24 // lo(a[4..7]*b[4]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[4..7]*b[4]) + adcs x20,x20,x11 + umulh x11,x7,x24 + adcs x21,x21,x12 + umulh x12,x8,x24 + adcs x22,x22,x13 + umulh x13,x9,x24 + adc x23,xzr,xzr + ldr x24,[x2,x28] // next b[i] + adds x20,x20,x10 +
mul x10,x14,x25 // lo(n[4..7]*t[0]*n0) + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + adds x19,x19,x10 + umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0) + adcs x20,x20,x11 + umulh x11,x15,x25 + adcs x21,x21,x12 + umulh x12,x16,x25 + adcs x22,x22,x13 + umulh x13,x17,x25 + adcs x23,x23,x0 + ldr x25,[sp,x28] // next t[0]*n0 + adc x0,xzr,xzr + str x19,[x26],#8 // result!!! + adds x19,x20,x10 + sub x10,x27,x1 // done yet? + adcs x20,x21,x11 + adcs x21,x22,x12 + adcs x22,x23,x13 + //adc x0,x0,xzr + cbnz x28,Loop_mul4x_tail + + sub x11,x3,x5 // rewound np? + adc x0,x0,xzr + cbz x10,Loop_mul4x_break + + ldp x10,x11,[x26,#8*4] // next four t words + ldp x12,x13,[x26,#8*6] + ldp x6,x7,[x1,#8*0] // next four words of a + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + ldp x14,x15,[x3,#8*0] // next four words of n + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + b Loop_mul4x_tail + +.align 4 +Loop_mul4x_break: + ldp x12,x13,[x29,#96] // pull rp and &b[num] + adds x19,x19,x30 // add the top-most carry kept in x30 + add x2,x2,#8*4 // bp++ + adcs x20,x20,xzr + sub x1,x1,x5 // rewind ap + adcs x21,x21,xzr + stp x19,x20,[x26,#8*0] // result!!! + adcs x22,x22,xzr + ldp x19,x20,[sp,#8*4] // t[0..3] + adc x30,x0,xzr // new top-most carry + stp x21,x22,[x26,#8*2] // result!!! + cmp x2,x13 // done yet? + ldp x21,x22,[sp,#8*6] + ldp x14,x15,[x11,#8*0] // n[0..3] + ldp x16,x17,[x11,#8*2] + add x3,x11,#8*4 + b.eq Lmul4x_post + + ldr x24,[x2] // first b word of the next group of four + ldp x6,x7,[x1,#8*0] // a[0..3] + ldp x8,x9,[x1,#8*2] + adds x1,x1,#8*4 // clear carry bit + mov x0,xzr + mov x26,sp + b Loop_mul4x_reduction + +.align 4 +Lmul4x_post: + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value.
+ mov x0,x12 + mov x27,x12 // x0 copy + subs x10,x19,x14 + add x26,sp,#8*8 + sbcs x11,x20,x15 + sub x28,x5,#8*4 + +Lmul4x_sub: + sbcs x12,x21,x16 // running t minus n, four words per pass, stored to rp through x0 + ldp x14,x15,[x3,#8*0] + sub x28,x28,#8*4 + ldp x19,x20,[x26,#8*0] + sbcs x13,x22,x17 + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + ldp x21,x22,[x26,#8*2] + add x26,x26,#8*4 + stp x10,x11,[x0,#8*0] + sbcs x10,x19,x14 + stp x12,x13,[x0,#8*2] + add x0,x0,#8*4 + sbcs x11,x20,x15 + cbnz x28,Lmul4x_sub + + sbcs x12,x21,x16 + mov x26,sp + add x1,sp,#8*4 + ldp x6,x7,[x27,#8*0] + sbcs x13,x22,x17 + stp x10,x11,[x0,#8*0] + ldp x8,x9,[x27,#8*2] + stp x12,x13,[x0,#8*2] + ldp x19,x20,[x1,#8*0] + ldp x21,x22,[x1,#8*2] + sbcs xzr,x30,xzr // did it borrow? + ldr x30,[x29,#8] // pull return address + + sub x28,x5,#8*4 +Lmul4x_cond_copy: + sub x28,x28,#8*4 + csel x10,x19,x6,lo // lo means the subtraction borrowed: keep the word from the stack scratch, otherwise the difference already in rp + stp xzr,xzr,[x26,#8*0] // zero the stack scratch while copying + csel x11,x20,x7,lo + ldp x6,x7,[x27,#8*4] + ldp x19,x20,[x1,#8*4] + csel x12,x21,x8,lo + stp xzr,xzr,[x26,#8*2] + add x26,x26,#8*4 + csel x13,x22,x9,lo + ldp x8,x9,[x27,#8*6] + ldp x21,x22,[x1,#8*6] + add x1,x1,#8*4 + stp x10,x11,[x27,#8*0] + stp x12,x13,[x27,#8*2] + add x27,x27,#8*4 + cbnz x28,Lmul4x_cond_copy + + csel x10,x19,x6,lo + stp xzr,xzr,[x26,#8*0] + csel x11,x20,x7,lo + stp xzr,xzr,[x26,#8*2] + csel x12,x21,x8,lo + stp xzr,xzr,[x26,#8*3] + csel x13,x22,x9,lo + stp xzr,xzr,[x26,#8*4] + stp x10,x11,[x27,#8*0] + stp x12,x13,[x27,#8*2] + + b Lmul4x_done + +.align 4 +Lmul4x4_post_condition: + adc x0,x0,xzr + ldr x1,[x29,#96] // pull rp + // x19-x22,x0 hold result, x14-x17 hold modulus + subs x6,x19,x14 + ldr x30,[x29,#8] // pull return address + sbcs x7,x20,x15 + stp xzr,xzr,[sp,#8*0] + sbcs x8,x21,x16 + stp xzr,xzr,[sp,#8*2] + sbcs x9,x22,x17 + stp xzr,xzr,[sp,#8*4] + sbcs xzr,x0,xzr // did it borrow?
+ stp xzr,xzr,[sp,#8*6] + + // x6-x9 hold result-modulus + csel x6,x19,x6,lo + csel x7,x20,x7,lo + csel x8,x21,x8,lo + csel x9,x22,x9,lo + stp x6,x7,[x1,#8*0] + stp x8,x9,[x1,#8*2] + +Lmul4x_done: + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + // No return value + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 // restore x29 and release the 128-byte frame + // x30 is popped earlier + AARCH64_VALIDATE_LINK_REGISTER + ret + +.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 // NUL-terminated ASCII banner: "Montgomery Multiplication for ARMv8, CRYPTOGAMS by" followed by an e-mail address +.align 2 +.align 4 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) diff --git a/third_party/boringssl/gen/bcm/armv8-mont-linux.S b/third_party/boringssl/gen/bcm/armv8-mont-linux.S new file mode 100644 index 00000000..de02424e --- /dev/null +++ b/third_party/boringssl/gen/bcm/armv8-mont-linux.S @@ -0,0 +1,1442 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) +.text + +.globl bn_mul_mont_words +.hidden bn_mul_mont_words +.type bn_mul_mont_words,%function +.align 5 +bn_mul_mont_words: // Montgomery multiplication entry point. Register use below: x0=rp, x1=ap, x2=bp, x3=np, x4=&n0, x5=num in 64-bit words + AARCH64_SIGN_LINK_REGISTER + tst x5,#7 // num a multiple of 8: try the 8x squaring path (it branches on to the 4x path when x1 != x2) + b.eq __bn_sqr8x_mont + tst x5,#3 // num a multiple of 4: use the 4x path + b.eq __bn_mul4x_mont +.Lmul_mont: + stp x29,x30,[sp,#-64]! // generic path, one word of ap per inner step; 64-byte frame for x29/x30 and x19-x24
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldr x9,[x2],#8 // bp[0] + sub x22,sp,x5,lsl#3 // x22 = sp minus 8*num: base of the tp[] scratch + ldp x7,x8,[x1],#16 // ap[0..1] + lsl x5,x5,#3 // x5 = num in bytes from here on + ldr x4,[x4] // *n0 + and x22,x22,#-16 // ABI says so: round the new sp down to a 16-byte boundary + ldp x13,x14,[x3],#16 // np[0..1] + + mul x6,x7,x9 // ap[0]*bp[0] + sub x21,x5,#16 // j=num-2 + umulh x7,x7,x9 + mul x10,x8,x9 // ap[1]*bp[0] + umulh x11,x8,x9 + + mul x15,x6,x4 // "tp[0]"*n0 + // This can allocate at most 8 * BN_MONTGOMERY_MAX_WORDS on the stack, + // or 2 KiB. This fits well within a page, so it is not necessary to + // fault pages in the correct order. + mov sp,x22 // alloca + + // (*) mul x12,x13,x15 // np[0]*m1 + umulh x13,x13,x15 + mul x16,x14,x15 // np[1]*m1 + // (*) adds x12,x12,x6 // discarded + // (*) As for removal of first multiplication and addition + // instructions. The outcome of first addition is + // guaranteed to be zero, which leaves two computationally + // significant outcomes: it either carries or not. Then + // question is when does it carry? Is there alternative + // way to deduce it? If you follow operations, you can + // observe that condition for carry is quite simple: + // x6 being non-zero. So that carry can be calculated + // by adding -1 to x6. That's what next instruction does.
+ subs xzr,x6,#1 // (*) + umulh x17,x14,x15 + adc x13,x13,xzr // absorb the carry produced by the (*) step + cbz x21,.L1st_skip + +.L1st: + ldr x8,[x1],#8 + adds x6,x10,x7 + sub x21,x21,#8 // j-- + adc x7,x11,xzr + + ldr x14,[x3],#8 + adds x12,x16,x13 + mul x10,x8,x9 // ap[j]*bp[0] + adc x13,x17,xzr + umulh x11,x8,x9 + + adds x12,x12,x6 + mul x16,x14,x15 // np[j]*m1 + adc x13,x13,xzr + umulh x17,x14,x15 + str x12,[x22],#8 // tp[j-1] + cbnz x21,.L1st + +.L1st_skip: + adds x6,x10,x7 + sub x1,x1,x5 // rewind x1 + adc x7,x11,xzr + + adds x12,x16,x13 + sub x3,x3,x5 // rewind x3 + adc x13,x17,xzr + + adds x12,x12,x6 + sub x20,x5,#8 // i=num-1 + adcs x13,x13,x7 + + adc x19,xzr,xzr // upmost overflow bit + stp x12,x13,[x22] // the two topmost tp words of the first pass + +.Louter: + ldr x9,[x2],#8 // bp[i] + ldp x7,x8,[x1],#16 // ap[0..1] + ldr x23,[sp] // tp[0] + add x22,sp,#8 + + mul x6,x7,x9 // ap[0]*bp[i] + sub x21,x5,#16 // j=num-2 + umulh x7,x7,x9 + ldp x13,x14,[x3],#16 // np[0..1] + mul x10,x8,x9 // ap[1]*bp[i] + adds x6,x6,x23 + umulh x11,x8,x9 + adc x7,x7,xzr + + mul x15,x6,x4 // m1 for this row: low word of (ap[0]*bp[i] plus tp[0]) times n0 + sub x20,x20,#8 // i-- + + // (*) mul x12,x13,x15 // np[0]*m1 + umulh x13,x13,x15 + mul x16,x14,x15 // np[1]*m1 + // (*) adds x12,x12,x6 + subs xzr,x6,#1 // (*) + umulh x17,x14,x15 + cbz x21,.Linner_skip + +.Linner: + ldr x8,[x1],#8 + adc x13,x13,xzr + ldr x23,[x22],#8 // tp[j] + adds x6,x10,x7 + sub x21,x21,#8 // j-- + adc x7,x11,xzr + + adds x12,x16,x13 + ldr x14,[x3],#8 + adc x13,x17,xzr + + mul x10,x8,x9 // ap[j]*bp[i] + adds x6,x6,x23 + umulh x11,x8,x9 + adc x7,x7,xzr + + mul x16,x14,x15 // np[j]*m1 + adds x12,x12,x6 + umulh x17,x14,x15 + str x12,[x22,#-16] // tp[j-1] + cbnz x21,.Linner + +.Linner_skip: + ldr x23,[x22],#8 // tp[j] + adc x13,x13,xzr + adds x6,x10,x7 + sub x1,x1,x5 // rewind x1 + adc x7,x11,xzr + + adds x12,x16,x13 + sub x3,x3,x5 // rewind x3 + adcs x13,x17,x19 + adc x19,xzr,xzr + + adds x6,x6,x23 + adc x7,x7,xzr + + adds x12,x12,x6 + adcs x13,x13,x7 + adc x19,x19,xzr // upmost overflow bit + stp x12,x13,[x22,#-16] + + cbnz x20,.Louter // loop while bp words remain + + // Final step.
We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. + ldr x23,[sp] // tp[0] + add x22,sp,#8 + ldr x14,[x3],#8 // np[0] + subs x21,x5,#8 // j=num-1 and clear borrow + mov x1,x0 // x1 walks rp for the stores below; x0 keeps pointing at rp[0] +.Lsub: + sbcs x8,x23,x14 // tp[j]-np[j] + ldr x23,[x22],#8 + sub x21,x21,#8 // j-- + ldr x14,[x3],#8 + str x8,[x1],#8 // rp[j]=tp[j]-np[j] + cbnz x21,.Lsub + + sbcs x8,x23,x14 // tp[num-1]-np[num-1] + sbcs x19,x19,xzr // did it borrow? + str x8,[x1],#8 // rp[num-1] + + ldr x23,[sp] // tp[0] + add x22,sp,#8 + ldr x8,[x0],#8 // rp[0] + sub x5,x5,#8 // num-- + nop +.Lcond_copy: + sub x5,x5,#8 // num-- + csel x14,x23,x8,lo // did it borrow? if so keep tp[j], otherwise the difference already in rp[j] + ldr x23,[x22],#8 + ldr x8,[x0],#8 + str xzr,[x22,#-16] // wipe tp + str x14,[x0,#-16] // final rp word + cbnz x5,.Lcond_copy + + csel x14,x23,x8,lo + str xzr,[x22,#-8] // wipe tp + str x14,[x0,#-8] + + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + // No return value + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 // restore x29 and release the 64-byte frame + AARCH64_VALIDATE_LINK_REGISTER + ret +.size bn_mul_mont_words,.-bn_mul_mont_words +.type __bn_sqr8x_mont,%function +.align 5 +__bn_sqr8x_mont: + // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to + // only from bn_mul_mont_words which has already signed the return address. + cmp x1,x2 + b.ne __bn_mul4x_mont +.Lsqr8x_mont: + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x0,x3,[sp,#96] // offload rp and np + + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + ldp x10,x11,[x1,#8*4] + ldp x12,x13,[x1,#8*6] + + // This can allocate at most 16 * BN_MONTGOMERY_MAX_WORDS on the stack, + // or 4 KiB. The fixed allocation above pushes to just above a page. On + // Windows, we must ensure new pages are first accessed in order.
See + // https://learn.microsoft.com/en-us/cpp/build/arm64-windows-abi-conventions?view=msvc-170#stack + // + // The order is correct, but precariously so: the code above access as + // low as [sp,#16]. This leaves a jump of 16 + 4096 = 4112 bytes. If + // [sp,#16] were at page boundary, those 4112 bytes would span two + // pages. If [x2] were the next access, we would skip a guard page. + // + // Fortunately, the first access is [x2,#8*8], at .Lsqr8x_zero_start. + // We jump at most 4112 - 64 = 4048 bytes, less than a page. If any of + // this changes, we must insert a no-op access or call __chkstk. + sub x2,sp,x5,lsl#4 + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + mov sp,x2 // alloca + sub x27,x5,#8*8 + b .Lsqr8x_zero_start + +.Lsqr8x_zero: + sub x27,x27,#8*8 + stp xzr,xzr,[x2,#8*0] + stp xzr,xzr,[x2,#8*2] + stp xzr,xzr,[x2,#8*4] + stp xzr,xzr,[x2,#8*6] +.Lsqr8x_zero_start: + stp xzr,xzr,[x2,#8*8] + stp xzr,xzr,[x2,#8*10] + stp xzr,xzr,[x2,#8*12] + stp xzr,xzr,[x2,#8*14] + add x2,x2,#8*16 + cbnz x27,.Lsqr8x_zero + + add x3,x1,x5 + add x1,x1,#8*8 + mov x19,xzr + mov x20,xzr + mov x21,xzr + mov x22,xzr + mov x23,xzr + mov x24,xzr + mov x25,xzr + mov x26,xzr + mov x2,sp + str x4,[x29,#112] // offload n0 + + // Multiply everything but a[i]*a[i] +.align 4 +.Lsqr8x_outer_loop: + // a[1]a[0] (i) + // a[2]a[0] + // a[3]a[0] + // a[4]a[0] + // a[5]a[0] + // a[6]a[0] + // a[7]a[0] + // a[2]a[1] (ii) + // a[3]a[1] + // a[4]a[1] + // a[5]a[1] + // a[6]a[1] + // a[7]a[1] + // a[3]a[2] (iii) + // a[4]a[2] + // a[5]a[2] + // a[6]a[2] + // a[7]a[2] + // a[4]a[3] (iv) + // a[5]a[3] + // a[6]a[3] + // a[7]a[3] + // a[5]a[4] (v) + // a[6]a[4] + // a[7]a[4] + // a[6]a[5] (vi) + // a[7]a[5] + // a[7]a[6] (vii) + + mul x14,x7,x6 // lo(a[1..7]*a[0]) (i) + mul x15,x8,x6 + mul x16,x9,x6 + mul x17,x10,x6 + adds x20,x20,x14 // t[1]+lo(a[1]*a[0]) + mul x14,x11,x6 + adcs x21,x21,x15 + mul x15,x12,x6 + adcs x22,x22,x16 + mul x16,x13,x6 + adcs x23,x23,x17 + umulh x17,x7,x6 // hi(a[1..7]*a[0]) + adcs 
x24,x24,x14 + umulh x14,x8,x6 + adcs x25,x25,x15 + umulh x15,x9,x6 + adcs x26,x26,x16 + umulh x16,x10,x6 + stp x19,x20,[x2],#8*2 // t[0..1] + adc x19,xzr,xzr // t[8] + adds x21,x21,x17 // t[2]+lo(a[1]*a[0]) + umulh x17,x11,x6 + adcs x22,x22,x14 + umulh x14,x12,x6 + adcs x23,x23,x15 + umulh x15,x13,x6 + adcs x24,x24,x16 + mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii) + adcs x25,x25,x17 + mul x17,x9,x7 + adcs x26,x26,x14 + mul x14,x10,x7 + adc x19,x19,x15 + + mul x15,x11,x7 + adds x22,x22,x16 + mul x16,x12,x7 + adcs x23,x23,x17 + mul x17,x13,x7 + adcs x24,x24,x14 + umulh x14,x8,x7 // hi(a[2..7]*a[1]) + adcs x25,x25,x15 + umulh x15,x9,x7 + adcs x26,x26,x16 + umulh x16,x10,x7 + adcs x19,x19,x17 + umulh x17,x11,x7 + stp x21,x22,[x2],#8*2 // t[2..3] + adc x20,xzr,xzr // t[9] + adds x23,x23,x14 + umulh x14,x12,x7 + adcs x24,x24,x15 + umulh x15,x13,x7 + adcs x25,x25,x16 + mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii) + adcs x26,x26,x17 + mul x17,x10,x8 + adcs x19,x19,x14 + mul x14,x11,x8 + adc x20,x20,x15 + + mul x15,x12,x8 + adds x24,x24,x16 + mul x16,x13,x8 + adcs x25,x25,x17 + umulh x17,x9,x8 // hi(a[3..7]*a[2]) + adcs x26,x26,x14 + umulh x14,x10,x8 + adcs x19,x19,x15 + umulh x15,x11,x8 + adcs x20,x20,x16 + umulh x16,x12,x8 + stp x23,x24,[x2],#8*2 // t[4..5] + adc x21,xzr,xzr // t[10] + adds x25,x25,x17 + umulh x17,x13,x8 + adcs x26,x26,x14 + mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv) + adcs x19,x19,x15 + mul x15,x11,x9 + adcs x20,x20,x16 + mul x16,x12,x9 + adc x21,x21,x17 + + mul x17,x13,x9 + adds x26,x26,x14 + umulh x14,x10,x9 // hi(a[4..7]*a[3]) + adcs x19,x19,x15 + umulh x15,x11,x9 + adcs x20,x20,x16 + umulh x16,x12,x9 + adcs x21,x21,x17 + umulh x17,x13,x9 + stp x25,x26,[x2],#8*2 // t[6..7] + adc x22,xzr,xzr // t[11] + adds x19,x19,x14 + mul x14,x11,x10 // lo(a[5..7]*a[4]) (v) + adcs x20,x20,x15 + mul x15,x12,x10 + adcs x21,x21,x16 + mul x16,x13,x10 + adc x22,x22,x17 + + umulh x17,x11,x10 // hi(a[5..7]*a[4]) + adds x20,x20,x14 + umulh x14,x12,x10 + adcs x21,x21,x15 + umulh 
x15,x13,x10 + adcs x22,x22,x16 + mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi) + adc x23,xzr,xzr // t[12] + adds x21,x21,x17 + mul x17,x13,x11 + adcs x22,x22,x14 + umulh x14,x12,x11 // hi(a[6..7]*a[5]) + adc x23,x23,x15 + + umulh x15,x13,x11 + adds x22,x22,x16 + mul x16,x13,x12 // lo(a[7]*a[6]) (vii) + adcs x23,x23,x17 + umulh x17,x13,x12 // hi(a[7]*a[6]) + adc x24,xzr,xzr // t[13] + adds x23,x23,x14 + sub x27,x3,x1 // done yet? + adc x24,x24,x15 + + adds x24,x24,x16 + sub x14,x3,x5 // rewound ap + adc x25,xzr,xzr // t[14] + add x25,x25,x17 + + cbz x27,.Lsqr8x_outer_break + + mov x4,x6 + ldp x6,x7,[x2,#8*0] + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + adds x19,x19,x6 + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x0,x1 + adcs x26,xzr,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved below + mov x27,#-8*8 + + // a[8]a[0] + // a[9]a[0] + // a[a]a[0] + // a[b]a[0] + // a[c]a[0] + // a[d]a[0] + // a[e]a[0] + // a[f]a[0] + // a[8]a[1] + // a[f]a[1]........................ + // a[8]a[2] + // a[f]a[2]........................ + // a[8]a[3] + // a[f]a[3]........................ + // a[8]a[4] + // a[f]a[4]........................ + // a[8]a[5] + // a[f]a[5]........................ + // a[8]a[6] + // a[f]a[6]........................ + // a[8]a[7] + // a[f]a[7]........................ 
+.Lsqr8x_mul: + mul x14,x6,x4 + adc x28,xzr,xzr // carry bit, modulo-scheduled + mul x15,x7,x4 + add x27,x27,#8 + mul x16,x8,x4 + mul x17,x9,x4 + adds x19,x19,x14 + mul x14,x10,x4 + adcs x20,x20,x15 + mul x15,x11,x4 + adcs x21,x21,x16 + mul x16,x12,x4 + adcs x22,x22,x17 + mul x17,x13,x4 + adcs x23,x23,x14 + umulh x14,x6,x4 + adcs x24,x24,x15 + umulh x15,x7,x4 + adcs x25,x25,x16 + umulh x16,x8,x4 + adcs x26,x26,x17 + umulh x17,x9,x4 + adc x28,x28,xzr + str x19,[x2],#8 + adds x19,x20,x14 + umulh x14,x10,x4 + adcs x20,x21,x15 + umulh x15,x11,x4 + adcs x21,x22,x16 + umulh x16,x12,x4 + adcs x22,x23,x17 + umulh x17,x13,x4 + ldr x4,[x0,x27] + adcs x23,x24,x14 + adcs x24,x25,x15 + adcs x25,x26,x16 + adcs x26,x28,x17 + //adc x28,xzr,xzr // moved above + cbnz x27,.Lsqr8x_mul + // note that carry flag is guaranteed + // to be zero at this point + cmp x1,x3 // done yet? + b.eq .Lsqr8x_break + + ldp x6,x7,[x2,#8*0] + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + adds x19,x19,x6 + ldr x4,[x0,#-8*8] + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x27,#-8*8 + adcs x26,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved above + b .Lsqr8x_mul + +.align 4 +.Lsqr8x_break: + ldp x6,x7,[x0,#8*0] + add x1,x0,#8*8 + ldp x8,x9,[x0,#8*2] + sub x14,x3,x1 // is it last iteration? 
+ ldp x10,x11,[x0,#8*4] + sub x15,x2,x14 + ldp x12,x13,[x0,#8*6] + cbz x14,.Lsqr8x_outer_loop + + stp x19,x20,[x2,#8*0] + ldp x19,x20,[x15,#8*0] + stp x21,x22,[x2,#8*2] + ldp x21,x22,[x15,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[x15,#8*4] + stp x25,x26,[x2,#8*6] + mov x2,x15 + ldp x25,x26,[x15,#8*6] + b .Lsqr8x_outer_loop + +.align 4 +.Lsqr8x_outer_break: + // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0] + ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0] + ldp x15,x16,[sp,#8*1] + ldp x11,x13,[x14,#8*2] + add x1,x14,#8*4 + ldp x17,x14,[sp,#8*3] + + stp x19,x20,[x2,#8*0] + mul x19,x7,x7 + stp x21,x22,[x2,#8*2] + umulh x7,x7,x7 + stp x23,x24,[x2,#8*4] + mul x8,x9,x9 + stp x25,x26,[x2,#8*6] + mov x2,sp + umulh x9,x9,x9 + adds x20,x7,x15,lsl#1 + extr x15,x16,x15,#63 + sub x27,x5,#8*4 + +.Lsqr4x_shift_n_add: + adcs x21,x8,x15 + extr x16,x17,x16,#63 + sub x27,x27,#8*4 + adcs x22,x9,x16 + ldp x15,x16,[x2,#8*5] + mul x10,x11,x11 + ldp x7,x9,[x1],#8*2 + umulh x11,x11,x11 + mul x12,x13,x13 + umulh x13,x13,x13 + extr x17,x14,x17,#63 + stp x19,x20,[x2,#8*0] + adcs x23,x10,x17 + extr x14,x15,x14,#63 + stp x21,x22,[x2,#8*2] + adcs x24,x11,x14 + ldp x17,x14,[x2,#8*7] + extr x15,x16,x15,#63 + adcs x25,x12,x15 + extr x16,x17,x16,#63 + adcs x26,x13,x16 + ldp x15,x16,[x2,#8*9] + mul x6,x7,x7 + ldp x11,x13,[x1],#8*2 + umulh x7,x7,x7 + mul x8,x9,x9 + umulh x9,x9,x9 + stp x23,x24,[x2,#8*4] + extr x17,x14,x17,#63 + stp x25,x26,[x2,#8*6] + add x2,x2,#8*8 + adcs x19,x6,x17 + extr x14,x15,x14,#63 + adcs x20,x7,x14 + ldp x17,x14,[x2,#8*3] + extr x15,x16,x15,#63 + cbnz x27,.Lsqr4x_shift_n_add + ldp x1,x4,[x29,#104] // pull np and n0 + + adcs x21,x8,x15 + extr x16,x17,x16,#63 + adcs x22,x9,x16 + ldp x15,x16,[x2,#8*5] + mul x10,x11,x11 + umulh x11,x11,x11 + stp x19,x20,[x2,#8*0] + mul x12,x13,x13 + umulh x13,x13,x13 + stp x21,x22,[x2,#8*2] + extr x17,x14,x17,#63 + adcs x23,x10,x17 + extr x14,x15,x14,#63 + ldp x19,x20,[sp,#8*0] + adcs x24,x11,x14 + extr x15,x16,x15,#63 + 
ldp x6,x7,[x1,#8*0] + adcs x25,x12,x15 + extr x16,xzr,x16,#63 + ldp x8,x9,[x1,#8*2] + adc x26,x13,x16 + ldp x10,x11,[x1,#8*4] + + // Reduce by 512 bits per iteration + mul x28,x4,x19 // t[0]*n0 + ldp x12,x13,[x1,#8*6] + add x3,x1,x5 + ldp x21,x22,[sp,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[sp,#8*4] + stp x25,x26,[x2,#8*6] + ldp x25,x26,[sp,#8*6] + add x1,x1,#8*8 + mov x30,xzr // initial top-most carry + mov x2,sp + mov x27,#8 + +.Lsqr8x_reduction: + // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0) + mul x15,x7,x28 + sub x27,x27,#1 + mul x16,x8,x28 + str x28,[x2],#8 // put aside t[0]*n0 for tail processing + mul x17,x9,x28 + // (*) adds xzr,x19,x14 + subs xzr,x19,#1 // (*) + mul x14,x10,x28 + adcs x19,x20,x15 + mul x15,x11,x28 + adcs x20,x21,x16 + mul x16,x12,x28 + adcs x21,x22,x17 + mul x17,x13,x28 + adcs x22,x23,x14 + umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0) + adcs x23,x24,x15 + umulh x15,x7,x28 + adcs x24,x25,x16 + umulh x16,x8,x28 + adcs x25,x26,x17 + umulh x17,x9,x28 + adc x26,xzr,xzr + adds x19,x19,x14 + umulh x14,x10,x28 + adcs x20,x20,x15 + umulh x15,x11,x28 + adcs x21,x21,x16 + umulh x16,x12,x28 + adcs x22,x22,x17 + umulh x17,x13,x28 + mul x28,x4,x19 // next t[0]*n0 + adcs x23,x23,x14 + adcs x24,x24,x15 + adcs x25,x25,x16 + adc x26,x26,x17 + cbnz x27,.Lsqr8x_reduction + + ldp x14,x15,[x2,#8*0] + ldp x16,x17,[x2,#8*2] + mov x0,x2 + sub x27,x3,x1 // done yet? 
+ adds x19,x19,x14 + adcs x20,x20,x15 + ldp x14,x15,[x2,#8*4] + adcs x21,x21,x16 + adcs x22,x22,x17 + ldp x16,x17,[x2,#8*6] + adcs x23,x23,x14 + adcs x24,x24,x15 + adcs x25,x25,x16 + adcs x26,x26,x17 + //adc x28,xzr,xzr // moved below + cbz x27,.Lsqr8x8_post_condition + + ldr x4,[x2,#-8*8] + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + ldp x10,x11,[x1,#8*4] + mov x27,#-8*8 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + +.Lsqr8x_tail: + mul x14,x6,x4 + adc x28,xzr,xzr // carry bit, modulo-scheduled + mul x15,x7,x4 + add x27,x27,#8 + mul x16,x8,x4 + mul x17,x9,x4 + adds x19,x19,x14 + mul x14,x10,x4 + adcs x20,x20,x15 + mul x15,x11,x4 + adcs x21,x21,x16 + mul x16,x12,x4 + adcs x22,x22,x17 + mul x17,x13,x4 + adcs x23,x23,x14 + umulh x14,x6,x4 + adcs x24,x24,x15 + umulh x15,x7,x4 + adcs x25,x25,x16 + umulh x16,x8,x4 + adcs x26,x26,x17 + umulh x17,x9,x4 + adc x28,x28,xzr + str x19,[x2],#8 + adds x19,x20,x14 + umulh x14,x10,x4 + adcs x20,x21,x15 + umulh x15,x11,x4 + adcs x21,x22,x16 + umulh x16,x12,x4 + adcs x22,x23,x17 + umulh x17,x13,x4 + ldr x4,[x0,x27] + adcs x23,x24,x14 + adcs x24,x25,x15 + adcs x25,x26,x16 + adcs x26,x28,x17 + //adc x28,xzr,xzr // moved above + cbnz x27,.Lsqr8x_tail + // note that carry flag is guaranteed + // to be zero at this point + ldp x6,x7,[x2,#8*0] + sub x27,x3,x1 // done yet? 
+ sub x16,x3,x5 // rewound np + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + cbz x27,.Lsqr8x_tail_break + + ldr x4,[x0,#-8*8] + adds x19,x19,x6 + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x27,#-8*8 + adcs x26,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved above + b .Lsqr8x_tail + +.align 4 +.Lsqr8x_tail_break: + ldr x4,[x29,#112] // pull n0 + add x27,x2,#8*8 // end of current t[num] window + + subs xzr,x30,#1 // "move" top-most carry to carry bit + adcs x14,x19,x6 + adcs x15,x20,x7 + ldp x19,x20,[x0,#8*0] + adcs x21,x21,x8 + ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0] + adcs x22,x22,x9 + ldp x8,x9,[x16,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x16,#8*4] + adcs x25,x25,x12 + adcs x26,x26,x13 + ldp x12,x13,[x16,#8*6] + add x1,x16,#8*8 + adc x30,xzr,xzr // top-most carry + mul x28,x4,x19 + stp x14,x15,[x2,#8*0] + stp x21,x22,[x2,#8*2] + ldp x21,x22,[x0,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[x0,#8*4] + cmp x27,x29 // did we hit the bottom? + stp x25,x26,[x2,#8*6] + mov x2,x0 // slide the window + ldp x25,x26,[x0,#8*6] + mov x27,#8 + b.ne .Lsqr8x_reduction + + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. 
+ ldr x0,[x29,#96] // pull rp + add x2,x2,#8*8 + subs x14,x19,x6 + sbcs x15,x20,x7 + sub x27,x5,#8*8 + mov x3,x0 // x0 copy + +.Lsqr8x_sub: + sbcs x16,x21,x8 + ldp x6,x7,[x1,#8*0] + sbcs x17,x22,x9 + stp x14,x15,[x0,#8*0] + sbcs x14,x23,x10 + ldp x8,x9,[x1,#8*2] + sbcs x15,x24,x11 + stp x16,x17,[x0,#8*2] + sbcs x16,x25,x12 + ldp x10,x11,[x1,#8*4] + sbcs x17,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + ldp x19,x20,[x2,#8*0] + sub x27,x27,#8*8 + ldp x21,x22,[x2,#8*2] + ldp x23,x24,[x2,#8*4] + ldp x25,x26,[x2,#8*6] + add x2,x2,#8*8 + stp x14,x15,[x0,#8*4] + sbcs x14,x19,x6 + stp x16,x17,[x0,#8*6] + add x0,x0,#8*8 + sbcs x15,x20,x7 + cbnz x27,.Lsqr8x_sub + + sbcs x16,x21,x8 + mov x2,sp + add x1,sp,x5 + ldp x6,x7,[x3,#8*0] + sbcs x17,x22,x9 + stp x14,x15,[x0,#8*0] + sbcs x14,x23,x10 + ldp x8,x9,[x3,#8*2] + sbcs x15,x24,x11 + stp x16,x17,[x0,#8*2] + sbcs x16,x25,x12 + ldp x19,x20,[x1,#8*0] + sbcs x17,x26,x13 + ldp x21,x22,[x1,#8*2] + sbcs xzr,x30,xzr // did it borrow? + ldr x30,[x29,#8] // pull return address + stp x14,x15,[x0,#8*4] + stp x16,x17,[x0,#8*6] + + sub x27,x5,#8*4 +.Lsqr4x_cond_copy: + sub x27,x27,#8*4 + csel x14,x19,x6,lo + stp xzr,xzr,[x2,#8*0] + csel x15,x20,x7,lo + ldp x6,x7,[x3,#8*4] + ldp x19,x20,[x1,#8*4] + csel x16,x21,x8,lo + stp xzr,xzr,[x2,#8*2] + add x2,x2,#8*4 + csel x17,x22,x9,lo + ldp x8,x9,[x3,#8*6] + ldp x21,x22,[x1,#8*6] + add x1,x1,#8*4 + stp x14,x15,[x3,#8*0] + stp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + stp xzr,xzr,[x1,#8*0] + stp xzr,xzr,[x1,#8*2] + cbnz x27,.Lsqr4x_cond_copy + + csel x14,x19,x6,lo + stp xzr,xzr,[x2,#8*0] + csel x15,x20,x7,lo + stp xzr,xzr,[x2,#8*2] + csel x16,x21,x8,lo + csel x17,x22,x9,lo + stp x14,x15,[x3,#8*0] + stp x16,x17,[x3,#8*2] + + b .Lsqr8x_done + +.align 4 +.Lsqr8x8_post_condition: + adc x28,xzr,xzr + ldr x30,[x29,#8] // pull return address + // x19-7,x28 hold result, x6-7 hold modulus + subs x6,x19,x6 + ldr x1,[x29,#96] // pull rp + sbcs x7,x20,x7 + stp xzr,xzr,[sp,#8*0] + sbcs x8,x21,x8 + stp 
xzr,xzr,[sp,#8*2] + sbcs x9,x22,x9 + stp xzr,xzr,[sp,#8*4] + sbcs x10,x23,x10 + stp xzr,xzr,[sp,#8*6] + sbcs x11,x24,x11 + stp xzr,xzr,[sp,#8*8] + sbcs x12,x25,x12 + stp xzr,xzr,[sp,#8*10] + sbcs x13,x26,x13 + stp xzr,xzr,[sp,#8*12] + sbcs x28,x28,xzr // did it borrow? + stp xzr,xzr,[sp,#8*14] + + // x6-7 hold result-modulus + csel x6,x19,x6,lo + csel x7,x20,x7,lo + csel x8,x21,x8,lo + csel x9,x22,x9,lo + stp x6,x7,[x1,#8*0] + csel x10,x23,x10,lo + csel x11,x24,x11,lo + stp x8,x9,[x1,#8*2] + csel x12,x25,x12,lo + csel x13,x26,x13,lo + stp x10,x11,[x1,#8*4] + stp x12,x13,[x1,#8*6] + +.Lsqr8x_done: + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + // No return value + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + // x30 is popped earlier + AARCH64_VALIDATE_LINK_REGISTER + ret +.size __bn_sqr8x_mont,.-__bn_sqr8x_mont +.type __bn_mul4x_mont,%function +.align 5 +__bn_mul4x_mont: + // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to + // only from bn_mul_mont_words or __bn_mul8x_mont which have already signed the + // return address. + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + // This can allocate at most 8 * BN_MONTGOMERY_MAX_WORDS on the stack, + // or 2 KiB. This fits well within a page, so it is not necessary to + // fault pages in the correct order. 
+ sub x26,sp,x5,lsl#3 + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + sub sp,x26,#8*4 // alloca + + add x10,x2,x5 + add x27,x1,x5 + stp x0,x10,[x29,#96] // offload rp and &b[num] + + ldr x24,[x2,#8*0] // b[0] + ldp x6,x7,[x1,#8*0] // a[0..3] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + mov x19,xzr + mov x20,xzr + mov x21,xzr + mov x22,xzr + ldp x14,x15,[x3,#8*0] // n[0..3] + ldp x16,x17,[x3,#8*2] + adds x3,x3,#8*4 // clear carry bit + mov x0,xzr + mov x28,#0 + mov x26,sp + +.Loop_mul4x_1st_reduction: + mul x10,x6,x24 // lo(a[0..3]*b[0]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[0..3]*b[0]) + adcs x20,x20,x11 + mul x25,x19,x4 // t[0]*n0 + adcs x21,x21,x12 + umulh x11,x7,x24 + adcs x22,x22,x13 + umulh x12,x8,x24 + adc x23,xzr,xzr + umulh x13,x9,x24 + ldr x24,[x2,x28] // next b[i] (or b[0]) + adds x20,x20,x10 + // (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0) + str x25,[x26],#8 // put aside t[0]*n0 for tail processing + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + // (*) adds xzr,x19,x10 + subs xzr,x19,#1 // (*) + umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0) + adcs x19,x20,x11 + umulh x11,x15,x25 + adcs x20,x21,x12 + umulh x12,x16,x25 + adcs x21,x22,x13 + umulh x13,x17,x25 + adcs x22,x23,x0 + adc x0,xzr,xzr + adds x19,x19,x10 + sub x10,x27,x1 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + cbnz x28,.Loop_mul4x_1st_reduction + + cbz x10,.Lmul4x4_post_condition + + ldp x6,x7,[x1,#8*0] // a[4..7] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + ldr x25,[sp] // a[0]*n0 + ldp x14,x15,[x3,#8*0] // n[4..7] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + +.Loop_mul4x_1st_tail: + mul x10,x6,x24 // lo(a[4..7]*b[i]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // 
hi(a[4..7]*b[i]) + adcs x20,x20,x11 + umulh x11,x7,x24 + adcs x21,x21,x12 + umulh x12,x8,x24 + adcs x22,x22,x13 + umulh x13,x9,x24 + adc x23,xzr,xzr + ldr x24,[x2,x28] // next b[i] (or b[0]) + adds x20,x20,x10 + mul x10,x14,x25 // lo(n[4..7]*a[0]*n0) + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + adds x19,x19,x10 + umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0) + adcs x20,x20,x11 + umulh x11,x15,x25 + adcs x21,x21,x12 + umulh x12,x16,x25 + adcs x22,x22,x13 + adcs x23,x23,x0 + umulh x13,x17,x25 + adc x0,xzr,xzr + ldr x25,[sp,x28] // next t[0]*n0 + str x19,[x26],#8 // result!!! + adds x19,x20,x10 + sub x10,x27,x1 // done yet? + adcs x20,x21,x11 + adcs x21,x22,x12 + adcs x22,x23,x13 + //adc x0,x0,xzr + cbnz x28,.Loop_mul4x_1st_tail + + sub x11,x27,x5 // rewound x1 + cbz x10,.Lmul4x_proceed + + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + ldp x14,x15,[x3,#8*0] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + b .Loop_mul4x_1st_tail + +.align 5 +.Lmul4x_proceed: + ldr x24,[x2,#8*4]! // *++b + adc x30,x0,xzr + ldp x6,x7,[x11,#8*0] // a[0..3] + sub x3,x3,x5 // rewind np + ldp x8,x9,[x11,#8*2] + add x1,x11,#8*4 + + stp x19,x20,[x26,#8*0] // result!!! + ldp x19,x20,[sp,#8*4] // t[0..3] + stp x21,x22,[x26,#8*2] // result!!! 
+ ldp x21,x22,[sp,#8*6] + + ldp x14,x15,[x3,#8*0] // n[0..3] + mov x26,sp + ldp x16,x17,[x3,#8*2] + adds x3,x3,#8*4 // clear carry bit + mov x0,xzr + +.align 4 +.Loop_mul4x_reduction: + mul x10,x6,x24 // lo(a[0..3]*b[4]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[0..3]*b[4]) + adcs x20,x20,x11 + mul x25,x19,x4 // t[0]*n0 + adcs x21,x21,x12 + umulh x11,x7,x24 + adcs x22,x22,x13 + umulh x12,x8,x24 + adc x23,xzr,xzr + umulh x13,x9,x24 + ldr x24,[x2,x28] // next b[i] + adds x20,x20,x10 + // (*) mul x10,x14,x25 + str x25,[x26],#8 // put aside t[0]*n0 for tail processing + adcs x21,x21,x11 + mul x11,x15,x25 // lo(n[0..3]*t[0]*n0 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + // (*) adds xzr,x19,x10 + subs xzr,x19,#1 // (*) + umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0 + adcs x19,x20,x11 + umulh x11,x15,x25 + adcs x20,x21,x12 + umulh x12,x16,x25 + adcs x21,x22,x13 + umulh x13,x17,x25 + adcs x22,x23,x0 + adc x0,xzr,xzr + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + cbnz x28,.Loop_mul4x_reduction + + adc x0,x0,xzr + ldp x10,x11,[x26,#8*4] // t[4..7] + ldp x12,x13,[x26,#8*6] + ldp x6,x7,[x1,#8*0] // a[4..7] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + + ldr x25,[sp] // t[0]*n0 + ldp x14,x15,[x3,#8*0] // n[4..7] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + +.align 4 +.Loop_mul4x_tail: + mul x10,x6,x24 // lo(a[4..7]*b[4]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[4..7]*b[4]) + adcs x20,x20,x11 + umulh x11,x7,x24 + adcs x21,x21,x12 + umulh x12,x8,x24 + adcs x22,x22,x13 + umulh x13,x9,x24 + adc x23,xzr,xzr + ldr x24,[x2,x28] // next b[i] + adds x20,x20,x10 
+ mul x10,x14,x25 // lo(n[4..7]*t[0]*n0) + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + adds x19,x19,x10 + umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0) + adcs x20,x20,x11 + umulh x11,x15,x25 + adcs x21,x21,x12 + umulh x12,x16,x25 + adcs x22,x22,x13 + umulh x13,x17,x25 + adcs x23,x23,x0 + ldr x25,[sp,x28] // next a[0]*n0 + adc x0,xzr,xzr + str x19,[x26],#8 // result!!! + adds x19,x20,x10 + sub x10,x27,x1 // done yet? + adcs x20,x21,x11 + adcs x21,x22,x12 + adcs x22,x23,x13 + //adc x0,x0,xzr + cbnz x28,.Loop_mul4x_tail + + sub x11,x3,x5 // rewound np? + adc x0,x0,xzr + cbz x10,.Loop_mul4x_break + + ldp x10,x11,[x26,#8*4] + ldp x12,x13,[x26,#8*6] + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + ldp x14,x15,[x3,#8*0] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + b .Loop_mul4x_tail + +.align 4 +.Loop_mul4x_break: + ldp x12,x13,[x29,#96] // pull rp and &b[num] + adds x19,x19,x30 + add x2,x2,#8*4 // bp++ + adcs x20,x20,xzr + sub x1,x1,x5 // rewind ap + adcs x21,x21,xzr + stp x19,x20,[x26,#8*0] // result!!! + adcs x22,x22,xzr + ldp x19,x20,[sp,#8*4] // t[0..3] + adc x30,x0,xzr + stp x21,x22,[x26,#8*2] // result!!! + cmp x2,x13 // done yet? + ldp x21,x22,[sp,#8*6] + ldp x14,x15,[x11,#8*0] // n[0..3] + ldp x16,x17,[x11,#8*2] + add x3,x11,#8*4 + b.eq .Lmul4x_post + + ldr x24,[x2] + ldp x6,x7,[x1,#8*0] // a[0..3] + ldp x8,x9,[x1,#8*2] + adds x1,x1,#8*4 // clear carry bit + mov x0,xzr + mov x26,sp + b .Loop_mul4x_reduction + +.align 4 +.Lmul4x_post: + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. 
+ mov x0,x12 + mov x27,x12 // x0 copy + subs x10,x19,x14 + add x26,sp,#8*8 + sbcs x11,x20,x15 + sub x28,x5,#8*4 + +.Lmul4x_sub: + sbcs x12,x21,x16 + ldp x14,x15,[x3,#8*0] + sub x28,x28,#8*4 + ldp x19,x20,[x26,#8*0] + sbcs x13,x22,x17 + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + ldp x21,x22,[x26,#8*2] + add x26,x26,#8*4 + stp x10,x11,[x0,#8*0] + sbcs x10,x19,x14 + stp x12,x13,[x0,#8*2] + add x0,x0,#8*4 + sbcs x11,x20,x15 + cbnz x28,.Lmul4x_sub + + sbcs x12,x21,x16 + mov x26,sp + add x1,sp,#8*4 + ldp x6,x7,[x27,#8*0] + sbcs x13,x22,x17 + stp x10,x11,[x0,#8*0] + ldp x8,x9,[x27,#8*2] + stp x12,x13,[x0,#8*2] + ldp x19,x20,[x1,#8*0] + ldp x21,x22,[x1,#8*2] + sbcs xzr,x30,xzr // did it borrow? + ldr x30,[x29,#8] // pull return address + + sub x28,x5,#8*4 +.Lmul4x_cond_copy: + sub x28,x28,#8*4 + csel x10,x19,x6,lo + stp xzr,xzr,[x26,#8*0] + csel x11,x20,x7,lo + ldp x6,x7,[x27,#8*4] + ldp x19,x20,[x1,#8*4] + csel x12,x21,x8,lo + stp xzr,xzr,[x26,#8*2] + add x26,x26,#8*4 + csel x13,x22,x9,lo + ldp x8,x9,[x27,#8*6] + ldp x21,x22,[x1,#8*6] + add x1,x1,#8*4 + stp x10,x11,[x27,#8*0] + stp x12,x13,[x27,#8*2] + add x27,x27,#8*4 + cbnz x28,.Lmul4x_cond_copy + + csel x10,x19,x6,lo + stp xzr,xzr,[x26,#8*0] + csel x11,x20,x7,lo + stp xzr,xzr,[x26,#8*2] + csel x12,x21,x8,lo + stp xzr,xzr,[x26,#8*3] + csel x13,x22,x9,lo + stp xzr,xzr,[x26,#8*4] + stp x10,x11,[x27,#8*0] + stp x12,x13,[x27,#8*2] + + b .Lmul4x_done + +.align 4 +.Lmul4x4_post_condition: + adc x0,x0,xzr + ldr x1,[x29,#96] // pull rp + // x19-3,x0 hold result, x14-7 hold modulus + subs x6,x19,x14 + ldr x30,[x29,#8] // pull return address + sbcs x7,x20,x15 + stp xzr,xzr,[sp,#8*0] + sbcs x8,x21,x16 + stp xzr,xzr,[sp,#8*2] + sbcs x9,x22,x17 + stp xzr,xzr,[sp,#8*4] + sbcs xzr,x0,xzr // did it borrow? 
+ stp xzr,xzr,[sp,#8*6] + + // x6-3 hold result-modulus + csel x6,x19,x6,lo + csel x7,x20,x7,lo + csel x8,x21,x8,lo + csel x9,x22,x9,lo + stp x6,x7,[x1,#8*0] + stp x8,x9,[x1,#8*2] + +.Lmul4x_done: + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + // No return value + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + // x30 is popped earlier + AARCH64_VALIDATE_LINK_REGISTER + ret +.size __bn_mul4x_mont,.-__bn_mul4x_mont +.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 4 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) diff --git a/third_party/boringssl/gen/bcm/armv8-mont-win.S b/third_party/boringssl/gen/bcm/armv8-mont-win.S new file mode 100644 index 00000000..adfdba44 --- /dev/null +++ b/third_party/boringssl/gen/bcm/armv8-mont-win.S @@ -0,0 +1,1448 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +.text + +.globl bn_mul_mont_words + +.def bn_mul_mont_words + .type 32 +.endef +.align 5 +bn_mul_mont_words: + AARCH64_SIGN_LINK_REGISTER + tst x5,#7 + b.eq __bn_sqr8x_mont + tst x5,#3 + b.eq __bn_mul4x_mont +Lmul_mont: + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + ldr x9,[x2],#8 // bp[0] + sub x22,sp,x5,lsl#3 + ldp x7,x8,[x1],#16 // ap[0..1] + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + and x22,x22,#-16 // ABI says so + ldp x13,x14,[x3],#16 // np[0..1] + + mul x6,x7,x9 // ap[0]*bp[0] + sub x21,x5,#16 // j=num-2 + umulh x7,x7,x9 + mul x10,x8,x9 // ap[1]*bp[0] + umulh x11,x8,x9 + + mul x15,x6,x4 // "tp[0]"*n0 + // This can allocate at most 8 * BN_MONTGOMERY_MAX_WORDS on the stack, + // or 2 KiB. This fits well within a page, so it is not necessary to + // fault pages in the correct order. + mov sp,x22 // alloca + + // (*) mul x12,x13,x15 // np[0]*m1 + umulh x13,x13,x15 + mul x16,x14,x15 // np[1]*m1 + // (*) adds x12,x12,x6 // discarded + // (*) As for removal of first multiplication and addition + // instructions. The outcome of first addition is + // guaranteed to be zero, which leaves two computationally + // significant outcomes: it either carries or not. Then + // question is when does it carry? Is there alternative + // way to deduce it? If you follow operations, you can + // observe that condition for carry is quite simple: + // x6 being non-zero. So that carry can be calculated + // by adding -1 to x6. That's what next instruction does. 
+ subs xzr,x6,#1 // (*) + umulh x17,x14,x15 + adc x13,x13,xzr + cbz x21,L1st_skip + +L1st: + ldr x8,[x1],#8 + adds x6,x10,x7 + sub x21,x21,#8 // j-- + adc x7,x11,xzr + + ldr x14,[x3],#8 + adds x12,x16,x13 + mul x10,x8,x9 // ap[j]*bp[0] + adc x13,x17,xzr + umulh x11,x8,x9 + + adds x12,x12,x6 + mul x16,x14,x15 // np[j]*m1 + adc x13,x13,xzr + umulh x17,x14,x15 + str x12,[x22],#8 // tp[j-1] + cbnz x21,L1st + +L1st_skip: + adds x6,x10,x7 + sub x1,x1,x5 // rewind x1 + adc x7,x11,xzr + + adds x12,x16,x13 + sub x3,x3,x5 // rewind x3 + adc x13,x17,xzr + + adds x12,x12,x6 + sub x20,x5,#8 // i=num-1 + adcs x13,x13,x7 + + adc x19,xzr,xzr // upmost overflow bit + stp x12,x13,[x22] + +Louter: + ldr x9,[x2],#8 // bp[i] + ldp x7,x8,[x1],#16 + ldr x23,[sp] // tp[0] + add x22,sp,#8 + + mul x6,x7,x9 // ap[0]*bp[i] + sub x21,x5,#16 // j=num-2 + umulh x7,x7,x9 + ldp x13,x14,[x3],#16 + mul x10,x8,x9 // ap[1]*bp[i] + adds x6,x6,x23 + umulh x11,x8,x9 + adc x7,x7,xzr + + mul x15,x6,x4 + sub x20,x20,#8 // i-- + + // (*) mul x12,x13,x15 // np[0]*m1 + umulh x13,x13,x15 + mul x16,x14,x15 // np[1]*m1 + // (*) adds x12,x12,x6 + subs xzr,x6,#1 // (*) + umulh x17,x14,x15 + cbz x21,Linner_skip + +Linner: + ldr x8,[x1],#8 + adc x13,x13,xzr + ldr x23,[x22],#8 // tp[j] + adds x6,x10,x7 + sub x21,x21,#8 // j-- + adc x7,x11,xzr + + adds x12,x16,x13 + ldr x14,[x3],#8 + adc x13,x17,xzr + + mul x10,x8,x9 // ap[j]*bp[i] + adds x6,x6,x23 + umulh x11,x8,x9 + adc x7,x7,xzr + + mul x16,x14,x15 // np[j]*m1 + adds x12,x12,x6 + umulh x17,x14,x15 + str x12,[x22,#-16] // tp[j-1] + cbnz x21,Linner + +Linner_skip: + ldr x23,[x22],#8 // tp[j] + adc x13,x13,xzr + adds x6,x10,x7 + sub x1,x1,x5 // rewind x1 + adc x7,x11,xzr + + adds x12,x16,x13 + sub x3,x3,x5 // rewind x3 + adcs x13,x17,x19 + adc x19,xzr,xzr + + adds x6,x6,x23 + adc x7,x7,xzr + + adds x12,x12,x6 + adcs x13,x13,x7 + adc x19,x19,xzr // upmost overflow bit + stp x12,x13,[x22,#-16] + + cbnz x20,Louter + + // Final step. 
We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. + ldr x23,[sp] // tp[0] + add x22,sp,#8 + ldr x14,[x3],#8 // np[0] + subs x21,x5,#8 // j=num-1 and clear borrow + mov x1,x0 +Lsub: + sbcs x8,x23,x14 // tp[j]-np[j] + ldr x23,[x22],#8 + sub x21,x21,#8 // j-- + ldr x14,[x3],#8 + str x8,[x1],#8 // rp[j]=tp[j]-np[j] + cbnz x21,Lsub + + sbcs x8,x23,x14 + sbcs x19,x19,xzr // did it borrow? + str x8,[x1],#8 // rp[num-1] + + ldr x23,[sp] // tp[0] + add x22,sp,#8 + ldr x8,[x0],#8 // rp[0] + sub x5,x5,#8 // num-- + nop +Lcond_copy: + sub x5,x5,#8 // num-- + csel x14,x23,x8,lo // did it borrow? + ldr x23,[x22],#8 + ldr x8,[x0],#8 + str xzr,[x22,#-16] // wipe tp + str x14,[x0,#-16] + cbnz x5,Lcond_copy + + csel x14,x23,x8,lo + str xzr,[x22,#-8] // wipe tp + str x14,[x0,#-8] + + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + // No return value + ldp x23,x24,[x29,#48] + ldr x29,[sp],#64 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.def __bn_sqr8x_mont + .type 32 +.endef +.align 5 +__bn_sqr8x_mont: + // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to + // only from bn_mul_mont_words which has already signed the return address. + cmp x1,x2 + b.ne __bn_mul4x_mont +Lsqr8x_mont: + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x0,x3,[sp,#96] // offload rp and np + + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + ldp x10,x11,[x1,#8*4] + ldp x12,x13,[x1,#8*6] + + // This can allocate at most 16 * BN_MONTGOMERY_MAX_WORDS on the stack, + // or 4 KiB. The fixed allocation above pushes to just above a page. On + // Windows, we must ensure new pages are first accessed in order. 
See + // https://learn.microsoft.com/en-us/cpp/build/arm64-windows-abi-conventions?view=msvc-170#stack + // + // The order is correct, but precariously so: the code above access as + // low as [sp,#16]. This leaves a jump of 16 + 4096 = 4112 bytes. If + // [sp,#16] were at page boundary, those 4112 bytes would span two + // pages. If [x2] were the next access, we would skip a guard page. + // + // Fortunately, the first access is [x2,#8*8], at .Lsqr8x_zero_start. + // We jump at most 4112 - 64 = 4048 bytes, less than a page. If any of + // this changes, we must insert a no-op access or call __chkstk. + sub x2,sp,x5,lsl#4 + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + mov sp,x2 // alloca + sub x27,x5,#8*8 + b Lsqr8x_zero_start + +Lsqr8x_zero: + sub x27,x27,#8*8 + stp xzr,xzr,[x2,#8*0] + stp xzr,xzr,[x2,#8*2] + stp xzr,xzr,[x2,#8*4] + stp xzr,xzr,[x2,#8*6] +Lsqr8x_zero_start: + stp xzr,xzr,[x2,#8*8] + stp xzr,xzr,[x2,#8*10] + stp xzr,xzr,[x2,#8*12] + stp xzr,xzr,[x2,#8*14] + add x2,x2,#8*16 + cbnz x27,Lsqr8x_zero + + add x3,x1,x5 + add x1,x1,#8*8 + mov x19,xzr + mov x20,xzr + mov x21,xzr + mov x22,xzr + mov x23,xzr + mov x24,xzr + mov x25,xzr + mov x26,xzr + mov x2,sp + str x4,[x29,#112] // offload n0 + + // Multiply everything but a[i]*a[i] +.align 4 +Lsqr8x_outer_loop: + // a[1]a[0] (i) + // a[2]a[0] + // a[3]a[0] + // a[4]a[0] + // a[5]a[0] + // a[6]a[0] + // a[7]a[0] + // a[2]a[1] (ii) + // a[3]a[1] + // a[4]a[1] + // a[5]a[1] + // a[6]a[1] + // a[7]a[1] + // a[3]a[2] (iii) + // a[4]a[2] + // a[5]a[2] + // a[6]a[2] + // a[7]a[2] + // a[4]a[3] (iv) + // a[5]a[3] + // a[6]a[3] + // a[7]a[3] + // a[5]a[4] (v) + // a[6]a[4] + // a[7]a[4] + // a[6]a[5] (vi) + // a[7]a[5] + // a[7]a[6] (vii) + + mul x14,x7,x6 // lo(a[1..7]*a[0]) (i) + mul x15,x8,x6 + mul x16,x9,x6 + mul x17,x10,x6 + adds x20,x20,x14 // t[1]+lo(a[1]*a[0]) + mul x14,x11,x6 + adcs x21,x21,x15 + mul x15,x12,x6 + adcs x22,x22,x16 + mul x16,x13,x6 + adcs x23,x23,x17 + umulh x17,x7,x6 // hi(a[1..7]*a[0]) + adcs 
x24,x24,x14 + umulh x14,x8,x6 + adcs x25,x25,x15 + umulh x15,x9,x6 + adcs x26,x26,x16 + umulh x16,x10,x6 + stp x19,x20,[x2],#8*2 // t[0..1] + adc x19,xzr,xzr // t[8] + adds x21,x21,x17 // t[2]+lo(a[1]*a[0]) + umulh x17,x11,x6 + adcs x22,x22,x14 + umulh x14,x12,x6 + adcs x23,x23,x15 + umulh x15,x13,x6 + adcs x24,x24,x16 + mul x16,x8,x7 // lo(a[2..7]*a[1]) (ii) + adcs x25,x25,x17 + mul x17,x9,x7 + adcs x26,x26,x14 + mul x14,x10,x7 + adc x19,x19,x15 + + mul x15,x11,x7 + adds x22,x22,x16 + mul x16,x12,x7 + adcs x23,x23,x17 + mul x17,x13,x7 + adcs x24,x24,x14 + umulh x14,x8,x7 // hi(a[2..7]*a[1]) + adcs x25,x25,x15 + umulh x15,x9,x7 + adcs x26,x26,x16 + umulh x16,x10,x7 + adcs x19,x19,x17 + umulh x17,x11,x7 + stp x21,x22,[x2],#8*2 // t[2..3] + adc x20,xzr,xzr // t[9] + adds x23,x23,x14 + umulh x14,x12,x7 + adcs x24,x24,x15 + umulh x15,x13,x7 + adcs x25,x25,x16 + mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii) + adcs x26,x26,x17 + mul x17,x10,x8 + adcs x19,x19,x14 + mul x14,x11,x8 + adc x20,x20,x15 + + mul x15,x12,x8 + adds x24,x24,x16 + mul x16,x13,x8 + adcs x25,x25,x17 + umulh x17,x9,x8 // hi(a[3..7]*a[2]) + adcs x26,x26,x14 + umulh x14,x10,x8 + adcs x19,x19,x15 + umulh x15,x11,x8 + adcs x20,x20,x16 + umulh x16,x12,x8 + stp x23,x24,[x2],#8*2 // t[4..5] + adc x21,xzr,xzr // t[10] + adds x25,x25,x17 + umulh x17,x13,x8 + adcs x26,x26,x14 + mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv) + adcs x19,x19,x15 + mul x15,x11,x9 + adcs x20,x20,x16 + mul x16,x12,x9 + adc x21,x21,x17 + + mul x17,x13,x9 + adds x26,x26,x14 + umulh x14,x10,x9 // hi(a[4..7]*a[3]) + adcs x19,x19,x15 + umulh x15,x11,x9 + adcs x20,x20,x16 + umulh x16,x12,x9 + adcs x21,x21,x17 + umulh x17,x13,x9 + stp x25,x26,[x2],#8*2 // t[6..7] + adc x22,xzr,xzr // t[11] + adds x19,x19,x14 + mul x14,x11,x10 // lo(a[5..7]*a[4]) (v) + adcs x20,x20,x15 + mul x15,x12,x10 + adcs x21,x21,x16 + mul x16,x13,x10 + adc x22,x22,x17 + + umulh x17,x11,x10 // hi(a[5..7]*a[4]) + adds x20,x20,x14 + umulh x14,x12,x10 + adcs x21,x21,x15 + umulh 
x15,x13,x10 + adcs x22,x22,x16 + mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi) + adc x23,xzr,xzr // t[12] + adds x21,x21,x17 + mul x17,x13,x11 + adcs x22,x22,x14 + umulh x14,x12,x11 // hi(a[6..7]*a[5]) + adc x23,x23,x15 + + umulh x15,x13,x11 + adds x22,x22,x16 + mul x16,x13,x12 // lo(a[7]*a[6]) (vii) + adcs x23,x23,x17 + umulh x17,x13,x12 // hi(a[7]*a[6]) + adc x24,xzr,xzr // t[13] + adds x23,x23,x14 + sub x27,x3,x1 // done yet? + adc x24,x24,x15 + + adds x24,x24,x16 + sub x14,x3,x5 // rewound ap + adc x25,xzr,xzr // t[14] + add x25,x25,x17 + + cbz x27,Lsqr8x_outer_break + + mov x4,x6 + ldp x6,x7,[x2,#8*0] + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + adds x19,x19,x6 + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x0,x1 + adcs x26,xzr,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved below + mov x27,#-8*8 + + // a[8]a[0] + // a[9]a[0] + // a[a]a[0] + // a[b]a[0] + // a[c]a[0] + // a[d]a[0] + // a[e]a[0] + // a[f]a[0] + // a[8]a[1] + // a[f]a[1]........................ + // a[8]a[2] + // a[f]a[2]........................ + // a[8]a[3] + // a[f]a[3]........................ + // a[8]a[4] + // a[f]a[4]........................ + // a[8]a[5] + // a[f]a[5]........................ + // a[8]a[6] + // a[f]a[6]........................ + // a[8]a[7] + // a[f]a[7]........................ 
+Lsqr8x_mul: + mul x14,x6,x4 + adc x28,xzr,xzr // carry bit, modulo-scheduled + mul x15,x7,x4 + add x27,x27,#8 + mul x16,x8,x4 + mul x17,x9,x4 + adds x19,x19,x14 + mul x14,x10,x4 + adcs x20,x20,x15 + mul x15,x11,x4 + adcs x21,x21,x16 + mul x16,x12,x4 + adcs x22,x22,x17 + mul x17,x13,x4 + adcs x23,x23,x14 + umulh x14,x6,x4 + adcs x24,x24,x15 + umulh x15,x7,x4 + adcs x25,x25,x16 + umulh x16,x8,x4 + adcs x26,x26,x17 + umulh x17,x9,x4 + adc x28,x28,xzr + str x19,[x2],#8 + adds x19,x20,x14 + umulh x14,x10,x4 + adcs x20,x21,x15 + umulh x15,x11,x4 + adcs x21,x22,x16 + umulh x16,x12,x4 + adcs x22,x23,x17 + umulh x17,x13,x4 + ldr x4,[x0,x27] + adcs x23,x24,x14 + adcs x24,x25,x15 + adcs x25,x26,x16 + adcs x26,x28,x17 + //adc x28,xzr,xzr // moved above + cbnz x27,Lsqr8x_mul + // note that carry flag is guaranteed + // to be zero at this point + cmp x1,x3 // done yet? + b.eq Lsqr8x_break + + ldp x6,x7,[x2,#8*0] + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + adds x19,x19,x6 + ldr x4,[x0,#-8*8] + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x27,#-8*8 + adcs x26,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved above + b Lsqr8x_mul + +.align 4 +Lsqr8x_break: + ldp x6,x7,[x0,#8*0] + add x1,x0,#8*8 + ldp x8,x9,[x0,#8*2] + sub x14,x3,x1 // is it last iteration? 
+ ldp x10,x11,[x0,#8*4] + sub x15,x2,x14 + ldp x12,x13,[x0,#8*6] + cbz x14,Lsqr8x_outer_loop + + stp x19,x20,[x2,#8*0] + ldp x19,x20,[x15,#8*0] + stp x21,x22,[x2,#8*2] + ldp x21,x22,[x15,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[x15,#8*4] + stp x25,x26,[x2,#8*6] + mov x2,x15 + ldp x25,x26,[x15,#8*6] + b Lsqr8x_outer_loop + +.align 4 +Lsqr8x_outer_break: + // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0] + ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0] + ldp x15,x16,[sp,#8*1] + ldp x11,x13,[x14,#8*2] + add x1,x14,#8*4 + ldp x17,x14,[sp,#8*3] + + stp x19,x20,[x2,#8*0] + mul x19,x7,x7 + stp x21,x22,[x2,#8*2] + umulh x7,x7,x7 + stp x23,x24,[x2,#8*4] + mul x8,x9,x9 + stp x25,x26,[x2,#8*6] + mov x2,sp + umulh x9,x9,x9 + adds x20,x7,x15,lsl#1 + extr x15,x16,x15,#63 + sub x27,x5,#8*4 + +Lsqr4x_shift_n_add: + adcs x21,x8,x15 + extr x16,x17,x16,#63 + sub x27,x27,#8*4 + adcs x22,x9,x16 + ldp x15,x16,[x2,#8*5] + mul x10,x11,x11 + ldp x7,x9,[x1],#8*2 + umulh x11,x11,x11 + mul x12,x13,x13 + umulh x13,x13,x13 + extr x17,x14,x17,#63 + stp x19,x20,[x2,#8*0] + adcs x23,x10,x17 + extr x14,x15,x14,#63 + stp x21,x22,[x2,#8*2] + adcs x24,x11,x14 + ldp x17,x14,[x2,#8*7] + extr x15,x16,x15,#63 + adcs x25,x12,x15 + extr x16,x17,x16,#63 + adcs x26,x13,x16 + ldp x15,x16,[x2,#8*9] + mul x6,x7,x7 + ldp x11,x13,[x1],#8*2 + umulh x7,x7,x7 + mul x8,x9,x9 + umulh x9,x9,x9 + stp x23,x24,[x2,#8*4] + extr x17,x14,x17,#63 + stp x25,x26,[x2,#8*6] + add x2,x2,#8*8 + adcs x19,x6,x17 + extr x14,x15,x14,#63 + adcs x20,x7,x14 + ldp x17,x14,[x2,#8*3] + extr x15,x16,x15,#63 + cbnz x27,Lsqr4x_shift_n_add + ldp x1,x4,[x29,#104] // pull np and n0 + + adcs x21,x8,x15 + extr x16,x17,x16,#63 + adcs x22,x9,x16 + ldp x15,x16,[x2,#8*5] + mul x10,x11,x11 + umulh x11,x11,x11 + stp x19,x20,[x2,#8*0] + mul x12,x13,x13 + umulh x13,x13,x13 + stp x21,x22,[x2,#8*2] + extr x17,x14,x17,#63 + adcs x23,x10,x17 + extr x14,x15,x14,#63 + ldp x19,x20,[sp,#8*0] + adcs x24,x11,x14 + extr x15,x16,x15,#63 + ldp 
x6,x7,[x1,#8*0] + adcs x25,x12,x15 + extr x16,xzr,x16,#63 + ldp x8,x9,[x1,#8*2] + adc x26,x13,x16 + ldp x10,x11,[x1,#8*4] + + // Reduce by 512 bits per iteration + mul x28,x4,x19 // t[0]*n0 + ldp x12,x13,[x1,#8*6] + add x3,x1,x5 + ldp x21,x22,[sp,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[sp,#8*4] + stp x25,x26,[x2,#8*6] + ldp x25,x26,[sp,#8*6] + add x1,x1,#8*8 + mov x30,xzr // initial top-most carry + mov x2,sp + mov x27,#8 + +Lsqr8x_reduction: + // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0) + mul x15,x7,x28 + sub x27,x27,#1 + mul x16,x8,x28 + str x28,[x2],#8 // put aside t[0]*n0 for tail processing + mul x17,x9,x28 + // (*) adds xzr,x19,x14 + subs xzr,x19,#1 // (*) + mul x14,x10,x28 + adcs x19,x20,x15 + mul x15,x11,x28 + adcs x20,x21,x16 + mul x16,x12,x28 + adcs x21,x22,x17 + mul x17,x13,x28 + adcs x22,x23,x14 + umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0) + adcs x23,x24,x15 + umulh x15,x7,x28 + adcs x24,x25,x16 + umulh x16,x8,x28 + adcs x25,x26,x17 + umulh x17,x9,x28 + adc x26,xzr,xzr + adds x19,x19,x14 + umulh x14,x10,x28 + adcs x20,x20,x15 + umulh x15,x11,x28 + adcs x21,x21,x16 + umulh x16,x12,x28 + adcs x22,x22,x17 + umulh x17,x13,x28 + mul x28,x4,x19 // next t[0]*n0 + adcs x23,x23,x14 + adcs x24,x24,x15 + adcs x25,x25,x16 + adc x26,x26,x17 + cbnz x27,Lsqr8x_reduction + + ldp x14,x15,[x2,#8*0] + ldp x16,x17,[x2,#8*2] + mov x0,x2 + sub x27,x3,x1 // done yet? 
+ adds x19,x19,x14 + adcs x20,x20,x15 + ldp x14,x15,[x2,#8*4] + adcs x21,x21,x16 + adcs x22,x22,x17 + ldp x16,x17,[x2,#8*6] + adcs x23,x23,x14 + adcs x24,x24,x15 + adcs x25,x25,x16 + adcs x26,x26,x17 + //adc x28,xzr,xzr // moved below + cbz x27,Lsqr8x8_post_condition + + ldr x4,[x2,#-8*8] + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + ldp x10,x11,[x1,#8*4] + mov x27,#-8*8 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + +Lsqr8x_tail: + mul x14,x6,x4 + adc x28,xzr,xzr // carry bit, modulo-scheduled + mul x15,x7,x4 + add x27,x27,#8 + mul x16,x8,x4 + mul x17,x9,x4 + adds x19,x19,x14 + mul x14,x10,x4 + adcs x20,x20,x15 + mul x15,x11,x4 + adcs x21,x21,x16 + mul x16,x12,x4 + adcs x22,x22,x17 + mul x17,x13,x4 + adcs x23,x23,x14 + umulh x14,x6,x4 + adcs x24,x24,x15 + umulh x15,x7,x4 + adcs x25,x25,x16 + umulh x16,x8,x4 + adcs x26,x26,x17 + umulh x17,x9,x4 + adc x28,x28,xzr + str x19,[x2],#8 + adds x19,x20,x14 + umulh x14,x10,x4 + adcs x20,x21,x15 + umulh x15,x11,x4 + adcs x21,x22,x16 + umulh x16,x12,x4 + adcs x22,x23,x17 + umulh x17,x13,x4 + ldr x4,[x0,x27] + adcs x23,x24,x14 + adcs x24,x25,x15 + adcs x25,x26,x16 + adcs x26,x28,x17 + //adc x28,xzr,xzr // moved above + cbnz x27,Lsqr8x_tail + // note that carry flag is guaranteed + // to be zero at this point + ldp x6,x7,[x2,#8*0] + sub x27,x3,x1 // done yet? 
+ sub x16,x3,x5 // rewound np + ldp x8,x9,[x2,#8*2] + ldp x10,x11,[x2,#8*4] + ldp x12,x13,[x2,#8*6] + cbz x27,Lsqr8x_tail_break + + ldr x4,[x0,#-8*8] + adds x19,x19,x6 + adcs x20,x20,x7 + ldp x6,x7,[x1,#8*0] + adcs x21,x21,x8 + adcs x22,x22,x9 + ldp x8,x9,[x1,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x1,#8*4] + adcs x25,x25,x12 + mov x27,#-8*8 + adcs x26,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + //adc x28,xzr,xzr // moved above + b Lsqr8x_tail + +.align 4 +Lsqr8x_tail_break: + ldr x4,[x29,#112] // pull n0 + add x27,x2,#8*8 // end of current t[num] window + + subs xzr,x30,#1 // "move" top-most carry to carry bit + adcs x14,x19,x6 + adcs x15,x20,x7 + ldp x19,x20,[x0,#8*0] + adcs x21,x21,x8 + ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0] + adcs x22,x22,x9 + ldp x8,x9,[x16,#8*2] + adcs x23,x23,x10 + adcs x24,x24,x11 + ldp x10,x11,[x16,#8*4] + adcs x25,x25,x12 + adcs x26,x26,x13 + ldp x12,x13,[x16,#8*6] + add x1,x16,#8*8 + adc x30,xzr,xzr // top-most carry + mul x28,x4,x19 + stp x14,x15,[x2,#8*0] + stp x21,x22,[x2,#8*2] + ldp x21,x22,[x0,#8*2] + stp x23,x24,[x2,#8*4] + ldp x23,x24,[x0,#8*4] + cmp x27,x29 // did we hit the bottom? + stp x25,x26,[x2,#8*6] + mov x2,x0 // slide the window + ldp x25,x26,[x0,#8*6] + mov x27,#8 + b.ne Lsqr8x_reduction + + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. 
+ ldr x0,[x29,#96] // pull rp + add x2,x2,#8*8 + subs x14,x19,x6 + sbcs x15,x20,x7 + sub x27,x5,#8*8 + mov x3,x0 // x0 copy + +Lsqr8x_sub: + sbcs x16,x21,x8 + ldp x6,x7,[x1,#8*0] + sbcs x17,x22,x9 + stp x14,x15,[x0,#8*0] + sbcs x14,x23,x10 + ldp x8,x9,[x1,#8*2] + sbcs x15,x24,x11 + stp x16,x17,[x0,#8*2] + sbcs x16,x25,x12 + ldp x10,x11,[x1,#8*4] + sbcs x17,x26,x13 + ldp x12,x13,[x1,#8*6] + add x1,x1,#8*8 + ldp x19,x20,[x2,#8*0] + sub x27,x27,#8*8 + ldp x21,x22,[x2,#8*2] + ldp x23,x24,[x2,#8*4] + ldp x25,x26,[x2,#8*6] + add x2,x2,#8*8 + stp x14,x15,[x0,#8*4] + sbcs x14,x19,x6 + stp x16,x17,[x0,#8*6] + add x0,x0,#8*8 + sbcs x15,x20,x7 + cbnz x27,Lsqr8x_sub + + sbcs x16,x21,x8 + mov x2,sp + add x1,sp,x5 + ldp x6,x7,[x3,#8*0] + sbcs x17,x22,x9 + stp x14,x15,[x0,#8*0] + sbcs x14,x23,x10 + ldp x8,x9,[x3,#8*2] + sbcs x15,x24,x11 + stp x16,x17,[x0,#8*2] + sbcs x16,x25,x12 + ldp x19,x20,[x1,#8*0] + sbcs x17,x26,x13 + ldp x21,x22,[x1,#8*2] + sbcs xzr,x30,xzr // did it borrow? + ldr x30,[x29,#8] // pull return address + stp x14,x15,[x0,#8*4] + stp x16,x17,[x0,#8*6] + + sub x27,x5,#8*4 +Lsqr4x_cond_copy: + sub x27,x27,#8*4 + csel x14,x19,x6,lo + stp xzr,xzr,[x2,#8*0] + csel x15,x20,x7,lo + ldp x6,x7,[x3,#8*4] + ldp x19,x20,[x1,#8*4] + csel x16,x21,x8,lo + stp xzr,xzr,[x2,#8*2] + add x2,x2,#8*4 + csel x17,x22,x9,lo + ldp x8,x9,[x3,#8*6] + ldp x21,x22,[x1,#8*6] + add x1,x1,#8*4 + stp x14,x15,[x3,#8*0] + stp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + stp xzr,xzr,[x1,#8*0] + stp xzr,xzr,[x1,#8*2] + cbnz x27,Lsqr4x_cond_copy + + csel x14,x19,x6,lo + stp xzr,xzr,[x2,#8*0] + csel x15,x20,x7,lo + stp xzr,xzr,[x2,#8*2] + csel x16,x21,x8,lo + csel x17,x22,x9,lo + stp x14,x15,[x3,#8*0] + stp x16,x17,[x3,#8*2] + + b Lsqr8x_done + +.align 4 +Lsqr8x8_post_condition: + adc x28,xzr,xzr + ldr x30,[x29,#8] // pull return address + // x19-7,x28 hold result, x6-7 hold modulus + subs x6,x19,x6 + ldr x1,[x29,#96] // pull rp + sbcs x7,x20,x7 + stp xzr,xzr,[sp,#8*0] + sbcs x8,x21,x8 + stp xzr,xzr,[sp,#8*2] 
+ sbcs x9,x22,x9 + stp xzr,xzr,[sp,#8*4] + sbcs x10,x23,x10 + stp xzr,xzr,[sp,#8*6] + sbcs x11,x24,x11 + stp xzr,xzr,[sp,#8*8] + sbcs x12,x25,x12 + stp xzr,xzr,[sp,#8*10] + sbcs x13,x26,x13 + stp xzr,xzr,[sp,#8*12] + sbcs x28,x28,xzr // did it borrow? + stp xzr,xzr,[sp,#8*14] + + // x6-7 hold result-modulus + csel x6,x19,x6,lo + csel x7,x20,x7,lo + csel x8,x21,x8,lo + csel x9,x22,x9,lo + stp x6,x7,[x1,#8*0] + csel x10,x23,x10,lo + csel x11,x24,x11,lo + stp x8,x9,[x1,#8*2] + csel x12,x25,x12,lo + csel x13,x26,x13,lo + stp x10,x11,[x1,#8*4] + stp x12,x13,[x1,#8*6] + +Lsqr8x_done: + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + // No return value + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + // x30 is popped earlier + AARCH64_VALIDATE_LINK_REGISTER + ret + +.def __bn_mul4x_mont + .type 32 +.endef +.align 5 +__bn_mul4x_mont: + // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to + // only from bn_mul_mont_words or __bn_mul8x_mont which have already signed the + // return address. + stp x29,x30,[sp,#-128]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + // This can allocate at most 8 * BN_MONTGOMERY_MAX_WORDS on the stack, + // or 2 KiB. This fits well within a page, so it is not necessary to + // fault pages in the correct order. 
+ sub x26,sp,x5,lsl#3 + lsl x5,x5,#3 + ldr x4,[x4] // *n0 + sub sp,x26,#8*4 // alloca + + add x10,x2,x5 + add x27,x1,x5 + stp x0,x10,[x29,#96] // offload rp and &b[num] + + ldr x24,[x2,#8*0] // b[0] + ldp x6,x7,[x1,#8*0] // a[0..3] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + mov x19,xzr + mov x20,xzr + mov x21,xzr + mov x22,xzr + ldp x14,x15,[x3,#8*0] // n[0..3] + ldp x16,x17,[x3,#8*2] + adds x3,x3,#8*4 // clear carry bit + mov x0,xzr + mov x28,#0 + mov x26,sp + +Loop_mul4x_1st_reduction: + mul x10,x6,x24 // lo(a[0..3]*b[0]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[0..3]*b[0]) + adcs x20,x20,x11 + mul x25,x19,x4 // t[0]*n0 + adcs x21,x21,x12 + umulh x11,x7,x24 + adcs x22,x22,x13 + umulh x12,x8,x24 + adc x23,xzr,xzr + umulh x13,x9,x24 + ldr x24,[x2,x28] // next b[i] (or b[0]) + adds x20,x20,x10 + // (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0) + str x25,[x26],#8 // put aside t[0]*n0 for tail processing + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + // (*) adds xzr,x19,x10 + subs xzr,x19,#1 // (*) + umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0) + adcs x19,x20,x11 + umulh x11,x15,x25 + adcs x20,x21,x12 + umulh x12,x16,x25 + adcs x21,x22,x13 + umulh x13,x17,x25 + adcs x22,x23,x0 + adc x0,xzr,xzr + adds x19,x19,x10 + sub x10,x27,x1 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + cbnz x28,Loop_mul4x_1st_reduction + + cbz x10,Lmul4x4_post_condition + + ldp x6,x7,[x1,#8*0] // a[4..7] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + ldr x25,[sp] // a[0]*n0 + ldp x14,x15,[x3,#8*0] // n[4..7] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + +Loop_mul4x_1st_tail: + mul x10,x6,x24 // lo(a[4..7]*b[i]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // 
hi(a[4..7]*b[i]) + adcs x20,x20,x11 + umulh x11,x7,x24 + adcs x21,x21,x12 + umulh x12,x8,x24 + adcs x22,x22,x13 + umulh x13,x9,x24 + adc x23,xzr,xzr + ldr x24,[x2,x28] // next b[i] (or b[0]) + adds x20,x20,x10 + mul x10,x14,x25 // lo(n[4..7]*a[0]*n0) + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + adds x19,x19,x10 + umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0) + adcs x20,x20,x11 + umulh x11,x15,x25 + adcs x21,x21,x12 + umulh x12,x16,x25 + adcs x22,x22,x13 + adcs x23,x23,x0 + umulh x13,x17,x25 + adc x0,xzr,xzr + ldr x25,[sp,x28] // next t[0]*n0 + str x19,[x26],#8 // result!!! + adds x19,x20,x10 + sub x10,x27,x1 // done yet? + adcs x20,x21,x11 + adcs x21,x22,x12 + adcs x22,x23,x13 + //adc x0,x0,xzr + cbnz x28,Loop_mul4x_1st_tail + + sub x11,x27,x5 // rewound x1 + cbz x10,Lmul4x_proceed + + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + ldp x14,x15,[x3,#8*0] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + b Loop_mul4x_1st_tail + +.align 5 +Lmul4x_proceed: + ldr x24,[x2,#8*4]! // *++b + adc x30,x0,xzr + ldp x6,x7,[x11,#8*0] // a[0..3] + sub x3,x3,x5 // rewind np + ldp x8,x9,[x11,#8*2] + add x1,x11,#8*4 + + stp x19,x20,[x26,#8*0] // result!!! + ldp x19,x20,[sp,#8*4] // t[0..3] + stp x21,x22,[x26,#8*2] // result!!! 
+ ldp x21,x22,[sp,#8*6] + + ldp x14,x15,[x3,#8*0] // n[0..3] + mov x26,sp + ldp x16,x17,[x3,#8*2] + adds x3,x3,#8*4 // clear carry bit + mov x0,xzr + +.align 4 +Loop_mul4x_reduction: + mul x10,x6,x24 // lo(a[0..3]*b[4]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[0..3]*b[4]) + adcs x20,x20,x11 + mul x25,x19,x4 // t[0]*n0 + adcs x21,x21,x12 + umulh x11,x7,x24 + adcs x22,x22,x13 + umulh x12,x8,x24 + adc x23,xzr,xzr + umulh x13,x9,x24 + ldr x24,[x2,x28] // next b[i] + adds x20,x20,x10 + // (*) mul x10,x14,x25 + str x25,[x26],#8 // put aside t[0]*n0 for tail processing + adcs x21,x21,x11 + mul x11,x15,x25 // lo(n[0..3]*t[0]*n0 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + // (*) adds xzr,x19,x10 + subs xzr,x19,#1 // (*) + umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0 + adcs x19,x20,x11 + umulh x11,x15,x25 + adcs x20,x21,x12 + umulh x12,x16,x25 + adcs x21,x22,x13 + umulh x13,x17,x25 + adcs x22,x23,x0 + adc x0,xzr,xzr + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + cbnz x28,Loop_mul4x_reduction + + adc x0,x0,xzr + ldp x10,x11,[x26,#8*4] // t[4..7] + ldp x12,x13,[x26,#8*6] + ldp x6,x7,[x1,#8*0] // a[4..7] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + + ldr x25,[sp] // t[0]*n0 + ldp x14,x15,[x3,#8*0] // n[4..7] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + +.align 4 +Loop_mul4x_tail: + mul x10,x6,x24 // lo(a[4..7]*b[4]) + adc x0,x0,xzr // modulo-scheduled + mul x11,x7,x24 + add x28,x28,#8 + mul x12,x8,x24 + and x28,x28,#31 + mul x13,x9,x24 + adds x19,x19,x10 + umulh x10,x6,x24 // hi(a[4..7]*b[4]) + adcs x20,x20,x11 + umulh x11,x7,x24 + adcs x21,x21,x12 + umulh x12,x8,x24 + adcs x22,x22,x13 + umulh x13,x9,x24 + adc x23,xzr,xzr + ldr x24,[x2,x28] // next b[i] + adds x20,x20,x10 + 
mul x10,x14,x25 // lo(n[4..7]*t[0]*n0) + adcs x21,x21,x11 + mul x11,x15,x25 + adcs x22,x22,x12 + mul x12,x16,x25 + adc x23,x23,x13 // can't overflow + mul x13,x17,x25 + adds x19,x19,x10 + umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0) + adcs x20,x20,x11 + umulh x11,x15,x25 + adcs x21,x21,x12 + umulh x12,x16,x25 + adcs x22,x22,x13 + umulh x13,x17,x25 + adcs x23,x23,x0 + ldr x25,[sp,x28] // next a[0]*n0 + adc x0,xzr,xzr + str x19,[x26],#8 // result!!! + adds x19,x20,x10 + sub x10,x27,x1 // done yet? + adcs x20,x21,x11 + adcs x21,x22,x12 + adcs x22,x23,x13 + //adc x0,x0,xzr + cbnz x28,Loop_mul4x_tail + + sub x11,x3,x5 // rewound np? + adc x0,x0,xzr + cbz x10,Loop_mul4x_break + + ldp x10,x11,[x26,#8*4] + ldp x12,x13,[x26,#8*6] + ldp x6,x7,[x1,#8*0] + ldp x8,x9,[x1,#8*2] + add x1,x1,#8*4 + adds x19,x19,x10 + adcs x20,x20,x11 + adcs x21,x21,x12 + adcs x22,x22,x13 + //adc x0,x0,xzr + ldp x14,x15,[x3,#8*0] + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + b Loop_mul4x_tail + +.align 4 +Loop_mul4x_break: + ldp x12,x13,[x29,#96] // pull rp and &b[num] + adds x19,x19,x30 + add x2,x2,#8*4 // bp++ + adcs x20,x20,xzr + sub x1,x1,x5 // rewind ap + adcs x21,x21,xzr + stp x19,x20,[x26,#8*0] // result!!! + adcs x22,x22,xzr + ldp x19,x20,[sp,#8*4] // t[0..3] + adc x30,x0,xzr + stp x21,x22,[x26,#8*2] // result!!! + cmp x2,x13 // done yet? + ldp x21,x22,[sp,#8*6] + ldp x14,x15,[x11,#8*0] // n[0..3] + ldp x16,x17,[x11,#8*2] + add x3,x11,#8*4 + b.eq Lmul4x_post + + ldr x24,[x2] + ldp x6,x7,[x1,#8*0] // a[0..3] + ldp x8,x9,[x1,#8*2] + adds x1,x1,#8*4 // clear carry bit + mov x0,xzr + mov x26,sp + b Loop_mul4x_reduction + +.align 4 +Lmul4x_post: + // Final step. We see if result is larger than modulus, and + // if it is, subtract the modulus. But comparison implies + // subtraction. So we subtract modulus, see if it borrowed, + // and conditionally copy original value. 
+ mov x0,x12 + mov x27,x12 // x0 copy + subs x10,x19,x14 + add x26,sp,#8*8 + sbcs x11,x20,x15 + sub x28,x5,#8*4 + +Lmul4x_sub: + sbcs x12,x21,x16 + ldp x14,x15,[x3,#8*0] + sub x28,x28,#8*4 + ldp x19,x20,[x26,#8*0] + sbcs x13,x22,x17 + ldp x16,x17,[x3,#8*2] + add x3,x3,#8*4 + ldp x21,x22,[x26,#8*2] + add x26,x26,#8*4 + stp x10,x11,[x0,#8*0] + sbcs x10,x19,x14 + stp x12,x13,[x0,#8*2] + add x0,x0,#8*4 + sbcs x11,x20,x15 + cbnz x28,Lmul4x_sub + + sbcs x12,x21,x16 + mov x26,sp + add x1,sp,#8*4 + ldp x6,x7,[x27,#8*0] + sbcs x13,x22,x17 + stp x10,x11,[x0,#8*0] + ldp x8,x9,[x27,#8*2] + stp x12,x13,[x0,#8*2] + ldp x19,x20,[x1,#8*0] + ldp x21,x22,[x1,#8*2] + sbcs xzr,x30,xzr // did it borrow? + ldr x30,[x29,#8] // pull return address + + sub x28,x5,#8*4 +Lmul4x_cond_copy: + sub x28,x28,#8*4 + csel x10,x19,x6,lo + stp xzr,xzr,[x26,#8*0] + csel x11,x20,x7,lo + ldp x6,x7,[x27,#8*4] + ldp x19,x20,[x1,#8*4] + csel x12,x21,x8,lo + stp xzr,xzr,[x26,#8*2] + add x26,x26,#8*4 + csel x13,x22,x9,lo + ldp x8,x9,[x27,#8*6] + ldp x21,x22,[x1,#8*6] + add x1,x1,#8*4 + stp x10,x11,[x27,#8*0] + stp x12,x13,[x27,#8*2] + add x27,x27,#8*4 + cbnz x28,Lmul4x_cond_copy + + csel x10,x19,x6,lo + stp xzr,xzr,[x26,#8*0] + csel x11,x20,x7,lo + stp xzr,xzr,[x26,#8*2] + csel x12,x21,x8,lo + stp xzr,xzr,[x26,#8*3] + csel x13,x22,x9,lo + stp xzr,xzr,[x26,#8*4] + stp x10,x11,[x27,#8*0] + stp x12,x13,[x27,#8*2] + + b Lmul4x_done + +.align 4 +Lmul4x4_post_condition: + adc x0,x0,xzr + ldr x1,[x29,#96] // pull rp + // x19-3,x0 hold result, x14-7 hold modulus + subs x6,x19,x14 + ldr x30,[x29,#8] // pull return address + sbcs x7,x20,x15 + stp xzr,xzr,[sp,#8*0] + sbcs x8,x21,x16 + stp xzr,xzr,[sp,#8*2] + sbcs x9,x22,x17 + stp xzr,xzr,[sp,#8*4] + sbcs xzr,x0,xzr // did it borrow? 
+ stp xzr,xzr,[sp,#8*6] + + // x6-3 hold result-modulus + csel x6,x19,x6,lo + csel x7,x20,x7,lo + csel x8,x21,x8,lo + csel x9,x22,x9,lo + stp x6,x7,[x1,#8*0] + stp x8,x9,[x1,#8*2] + +Lmul4x_done: + ldp x19,x20,[x29,#16] + mov sp,x29 + ldp x21,x22,[x29,#32] + // No return value + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldr x29,[sp],#128 + // x30 is popped earlier + AARCH64_VALIDATE_LINK_REGISTER + ret + +.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 4 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) diff --git a/third_party/boringssl/gen/bcm/bn-586-apple.S b/third_party/boringssl/gen/bcm/bn-586-apple.S new file mode 100644 index 00000000..e96d0a4d --- /dev/null +++ b/third_party/boringssl/gen/bcm/bn-586-apple.S @@ -0,0 +1,530 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +.text +.globl _bn_mul_add_words +.private_extern _bn_mul_add_words +.align 4 +_bn_mul_add_words: +L_bn_mul_add_words_begin: + movl 4(%esp),%eax + movl 8(%esp),%edx + movl 12(%esp),%ecx + movd 16(%esp),%mm0 + pxor %mm1,%mm1 + jmp L000maw_sse2_entry +.align 4,0x90 +L001maw_sse2_unrolled: + movd (%eax),%mm3 + paddq %mm3,%mm1 + movd (%edx),%mm2 + pmuludq %mm0,%mm2 + movd 4(%edx),%mm4 + pmuludq %mm0,%mm4 + movd 8(%edx),%mm6 + pmuludq %mm0,%mm6 + movd 12(%edx),%mm7 + pmuludq %mm0,%mm7 + paddq %mm2,%mm1 + movd 4(%eax),%mm3 + paddq %mm4,%mm3 + movd 8(%eax),%mm5 + paddq %mm6,%mm5 + movd 12(%eax),%mm4 + paddq %mm4,%mm7 + movd %mm1,(%eax) + movd 16(%edx),%mm2 + pmuludq %mm0,%mm2 + psrlq $32,%mm1 + movd 20(%edx),%mm4 + pmuludq %mm0,%mm4 + paddq %mm3,%mm1 + movd 24(%edx),%mm6 + pmuludq %mm0,%mm6 + movd %mm1,4(%eax) + psrlq $32,%mm1 + movd 28(%edx),%mm3 + addl $32,%edx + pmuludq %mm0,%mm3 + paddq %mm5,%mm1 + movd 16(%eax),%mm5 + paddq %mm5,%mm2 + movd %mm1,8(%eax) + psrlq $32,%mm1 + paddq %mm7,%mm1 + movd 20(%eax),%mm5 + paddq %mm5,%mm4 + movd %mm1,12(%eax) + psrlq $32,%mm1 + paddq %mm2,%mm1 + movd 24(%eax),%mm5 + paddq %mm5,%mm6 + movd %mm1,16(%eax) + psrlq $32,%mm1 + paddq %mm4,%mm1 + movd 28(%eax),%mm5 + paddq %mm5,%mm3 + movd %mm1,20(%eax) + psrlq $32,%mm1 + paddq %mm6,%mm1 + movd %mm1,24(%eax) + psrlq $32,%mm1 + paddq %mm3,%mm1 + movd %mm1,28(%eax) + leal 32(%eax),%eax + psrlq $32,%mm1 + subl $8,%ecx + jz L002maw_sse2_exit +L000maw_sse2_entry: + testl $4294967288,%ecx + jnz L001maw_sse2_unrolled +.align 2,0x90 +L003maw_sse2_loop: + movd (%edx),%mm2 + movd (%eax),%mm3 + pmuludq %mm0,%mm2 + leal 4(%edx),%edx + paddq %mm3,%mm1 + paddq %mm2,%mm1 + movd %mm1,(%eax) + subl $1,%ecx + psrlq $32,%mm1 + leal 4(%eax),%eax + jnz L003maw_sse2_loop +L002maw_sse2_exit: + movd %mm1,%eax + emms + ret + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _bn_mul_words +.private_extern 
_bn_mul_words +.align 4 +_bn_mul_words: +L_bn_mul_words_begin: + movl 4(%esp),%eax + movl 8(%esp),%edx + movl 12(%esp),%ecx + movd 16(%esp),%mm0 + pxor %mm1,%mm1 +.align 4,0x90 +L004mw_sse2_loop: + movd (%edx),%mm2 + pmuludq %mm0,%mm2 + leal 4(%edx),%edx + paddq %mm2,%mm1 + movd %mm1,(%eax) + subl $1,%ecx + psrlq $32,%mm1 + leal 4(%eax),%eax + jnz L004mw_sse2_loop + movd %mm1,%eax + emms + ret + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _bn_sqr_add_words +.private_extern _bn_sqr_add_words +.align 4 +_bn_sqr_add_words: +L_bn_sqr_add_words_begin: + movl 4(%esp),%eax + movl 8(%esp),%edx + movl 12(%esp),%ecx + pxor %mm1,%mm1 +.align 4,0x90 +L005sqr_sse2_loop: + movd (%edx),%mm0 + movd (%eax),%mm2 + movd 4(%eax),%mm3 + pmuludq %mm0,%mm0 + leal 4(%edx),%edx + paddq %mm0,%mm1 + paddq %mm2,%mm1 + movd %mm1,(%eax) + psrlq $32,%mm1 + paddq %mm3,%mm1 + movd %mm1,4(%eax) + psrlq $32,%mm1 + subl $1,%ecx + leal 8(%eax),%eax + jnz L005sqr_sse2_loop + emms + ret + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _bn_add_words +.private_extern _bn_add_words +.align 4 +_bn_add_words: +L_bn_add_words_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + + movl 20(%esp),%ebx + movl 24(%esp),%esi + movl 28(%esp),%edi + movl 32(%esp),%ebp + xorl %eax,%eax + andl $4294967288,%ebp + jz L006aw_finish +L007aw_loop: + # Round 0 + movl (%esi),%ecx + movl (%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + movl %ecx,(%ebx) + # Round 1 + movl 4(%esi),%ecx + movl 4(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + movl %ecx,4(%ebx) + # Round 2 + movl 8(%esi),%ecx + movl 8(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + movl %ecx,8(%ebx) + # Round 3 + movl 12(%esi),%ecx + movl 12(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + movl %ecx,12(%ebx) + # Round 4 + movl 16(%esi),%ecx + movl 
16(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + movl %ecx,16(%ebx) + # Round 5 + movl 20(%esi),%ecx + movl 20(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + movl %ecx,20(%ebx) + # Round 6 + movl 24(%esi),%ecx + movl 24(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + movl %ecx,24(%ebx) + # Round 7 + movl 28(%esi),%ecx + movl 28(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + movl %ecx,28(%ebx) + + addl $32,%esi + addl $32,%edi + addl $32,%ebx + subl $8,%ebp + jnz L007aw_loop +L006aw_finish: + movl 32(%esp),%ebp + andl $7,%ebp + jz L008aw_end + # Tail Round 0 + movl (%esi),%ecx + movl (%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,(%ebx) + jz L008aw_end + # Tail Round 1 + movl 4(%esi),%ecx + movl 4(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,4(%ebx) + jz L008aw_end + # Tail Round 2 + movl 8(%esi),%ecx + movl 8(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,8(%ebx) + jz L008aw_end + # Tail Round 3 + movl 12(%esi),%ecx + movl 12(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,12(%ebx) + jz L008aw_end + # Tail Round 4 + movl 16(%esi),%ecx + movl 16(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,16(%ebx) + jz L008aw_end + # Tail Round 5 + movl 20(%esi),%ecx + movl 20(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,20(%ebx) + jz L008aw_end + # Tail Round 6 + movl 24(%esi),%ecx + movl 24(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + 
movl %ecx,24(%ebx) +L008aw_end: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _bn_sub_words +.private_extern _bn_sub_words +.align 4 +_bn_sub_words: +L_bn_sub_words_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + + movl 20(%esp),%ebx + movl 24(%esp),%esi + movl 28(%esp),%edi + movl 32(%esp),%ebp + xorl %eax,%eax + andl $4294967288,%ebp + jz L009aw_finish +L010aw_loop: + # Round 0 + movl (%esi),%ecx + movl (%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + movl %ecx,(%ebx) + # Round 1 + movl 4(%esi),%ecx + movl 4(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + movl %ecx,4(%ebx) + # Round 2 + movl 8(%esi),%ecx + movl 8(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + movl %ecx,8(%ebx) + # Round 3 + movl 12(%esi),%ecx + movl 12(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + movl %ecx,12(%ebx) + # Round 4 + movl 16(%esi),%ecx + movl 16(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + movl %ecx,16(%ebx) + # Round 5 + movl 20(%esi),%ecx + movl 20(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + movl %ecx,20(%ebx) + # Round 6 + movl 24(%esi),%ecx + movl 24(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + movl %ecx,24(%ebx) + # Round 7 + movl 28(%esi),%ecx + movl 28(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + movl %ecx,28(%ebx) + + addl $32,%esi + addl $32,%edi + addl $32,%ebx + subl $8,%ebp + jnz L010aw_loop +L009aw_finish: + movl 32(%esp),%ebp + andl $7,%ebp + jz L011aw_end + # Tail Round 0 + movl (%esi),%ecx + movl (%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,(%ebx) + jz L011aw_end + # Tail Round 1 + movl 
4(%esi),%ecx + movl 4(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,4(%ebx) + jz L011aw_end + # Tail Round 2 + movl 8(%esi),%ecx + movl 8(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,8(%ebx) + jz L011aw_end + # Tail Round 3 + movl 12(%esi),%ecx + movl 12(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,12(%ebx) + jz L011aw_end + # Tail Round 4 + movl 16(%esi),%ecx + movl 16(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,16(%ebx) + jz L011aw_end + # Tail Round 5 + movl 20(%esi),%ecx + movl 20(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,20(%ebx) + jz L011aw_end + # Tail Round 6 + movl 24(%esi),%ecx + movl 24(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + movl %ecx,24(%ebx) +L011aw_end: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) diff --git a/third_party/boringssl/gen/bcm/bn-586-linux.S b/third_party/boringssl/gen/bcm/bn-586-linux.S new file mode 100644 index 00000000..8e2bdb0e --- /dev/null +++ b/third_party/boringssl/gen/bcm/bn-586-linux.S @@ -0,0 +1,540 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) +.text +.globl bn_mul_add_words +.hidden bn_mul_add_words +.type bn_mul_add_words,@function +.align 16 +bn_mul_add_words: +.L_bn_mul_add_words_begin: + movl 4(%esp),%eax + movl 8(%esp),%edx + movl 12(%esp),%ecx + movd 16(%esp),%mm0 + pxor %mm1,%mm1 + jmp .L000maw_sse2_entry +.align 16 +.L001maw_sse2_unrolled: + movd (%eax),%mm3 + paddq %mm3,%mm1 + movd (%edx),%mm2 + pmuludq %mm0,%mm2 + movd 4(%edx),%mm4 + pmuludq %mm0,%mm4 + movd 8(%edx),%mm6 + pmuludq %mm0,%mm6 + movd 12(%edx),%mm7 + pmuludq %mm0,%mm7 + paddq %mm2,%mm1 + movd 4(%eax),%mm3 + paddq %mm4,%mm3 + movd 8(%eax),%mm5 + paddq %mm6,%mm5 + movd 12(%eax),%mm4 + paddq %mm4,%mm7 + movd %mm1,(%eax) + movd 16(%edx),%mm2 + pmuludq %mm0,%mm2 + psrlq $32,%mm1 + movd 20(%edx),%mm4 + pmuludq %mm0,%mm4 + paddq %mm3,%mm1 + movd 24(%edx),%mm6 + pmuludq %mm0,%mm6 + movd %mm1,4(%eax) + psrlq $32,%mm1 + movd 28(%edx),%mm3 + addl $32,%edx + pmuludq %mm0,%mm3 + paddq %mm5,%mm1 + movd 16(%eax),%mm5 + paddq %mm5,%mm2 + movd %mm1,8(%eax) + psrlq $32,%mm1 + paddq %mm7,%mm1 + movd 20(%eax),%mm5 + paddq %mm5,%mm4 + movd %mm1,12(%eax) + psrlq $32,%mm1 + paddq %mm2,%mm1 + movd 24(%eax),%mm5 + paddq %mm5,%mm6 + movd %mm1,16(%eax) + psrlq $32,%mm1 + paddq %mm4,%mm1 + movd 28(%eax),%mm5 + paddq %mm5,%mm3 + movd %mm1,20(%eax) + psrlq $32,%mm1 + paddq %mm6,%mm1 + movd %mm1,24(%eax) + psrlq $32,%mm1 + paddq %mm3,%mm1 + movd %mm1,28(%eax) + leal 32(%eax),%eax + psrlq $32,%mm1 + subl $8,%ecx + jz .L002maw_sse2_exit +.L000maw_sse2_entry: + testl $4294967288,%ecx + jnz .L001maw_sse2_unrolled +.align 4 +.L003maw_sse2_loop: + movd (%edx),%mm2 + movd (%eax),%mm3 + pmuludq %mm0,%mm2 + leal 4(%edx),%edx + paddq %mm3,%mm1 + paddq %mm2,%mm1 + movd %mm1,(%eax) + subl $1,%ecx + psrlq $32,%mm1 + leal 4(%eax),%eax + jnz .L003maw_sse2_loop +.L002maw_sse2_exit: + movd %mm1,%eax + emms + ret + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size 
bn_mul_add_words,.-.L_bn_mul_add_words_begin +.globl bn_mul_words +.hidden bn_mul_words +.type bn_mul_words,@function +.align 16 +bn_mul_words: +.L_bn_mul_words_begin: + movl 4(%esp),%eax + movl 8(%esp),%edx + movl 12(%esp),%ecx + movd 16(%esp),%mm0 + pxor %mm1,%mm1 +.align 16 +.L004mw_sse2_loop: + movd (%edx),%mm2 + pmuludq %mm0,%mm2 + leal 4(%edx),%edx + paddq %mm2,%mm1 + movd %mm1,(%eax) + subl $1,%ecx + psrlq $32,%mm1 + leal 4(%eax),%eax + jnz .L004mw_sse2_loop + movd %mm1,%eax + emms + ret + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size bn_mul_words,.-.L_bn_mul_words_begin +.globl bn_sqr_add_words +.hidden bn_sqr_add_words +.type bn_sqr_add_words,@function +.align 16 +bn_sqr_add_words: +.L_bn_sqr_add_words_begin: + movl 4(%esp),%eax + movl 8(%esp),%edx + movl 12(%esp),%ecx + pxor %mm1,%mm1 +.align 16 +.L005sqr_sse2_loop: + movd (%edx),%mm0 + movd (%eax),%mm2 + movd 4(%eax),%mm3 + pmuludq %mm0,%mm0 + leal 4(%edx),%edx + paddq %mm0,%mm1 + paddq %mm2,%mm1 + movd %mm1,(%eax) + psrlq $32,%mm1 + paddq %mm3,%mm1 + movd %mm1,4(%eax) + psrlq $32,%mm1 + subl $1,%ecx + leal 8(%eax),%eax + jnz .L005sqr_sse2_loop + emms + ret + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size bn_sqr_add_words,.-.L_bn_sqr_add_words_begin +.globl bn_add_words +.hidden bn_add_words +.type bn_add_words,@function +.align 16 +bn_add_words: +.L_bn_add_words_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + + movl 20(%esp),%ebx + movl 24(%esp),%esi + movl 28(%esp),%edi + movl 32(%esp),%ebp + xorl %eax,%eax + andl $4294967288,%ebp + jz .L006aw_finish +.L007aw_loop: + + movl (%esi),%ecx + movl (%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + movl %ecx,(%ebx) + + movl 4(%esi),%ecx + movl 4(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + movl %ecx,4(%ebx) + + movl 8(%esi),%ecx + movl 8(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + 
movl %ecx,8(%ebx) + + movl 12(%esi),%ecx + movl 12(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + movl %ecx,12(%ebx) + + movl 16(%esi),%ecx + movl 16(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + movl %ecx,16(%ebx) + + movl 20(%esi),%ecx + movl 20(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + movl %ecx,20(%ebx) + + movl 24(%esi),%ecx + movl 24(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + movl %ecx,24(%ebx) + + movl 28(%esi),%ecx + movl 28(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + movl %ecx,28(%ebx) + + addl $32,%esi + addl $32,%edi + addl $32,%ebx + subl $8,%ebp + jnz .L007aw_loop +.L006aw_finish: + movl 32(%esp),%ebp + andl $7,%ebp + jz .L008aw_end + + movl (%esi),%ecx + movl (%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,(%ebx) + jz .L008aw_end + + movl 4(%esi),%ecx + movl 4(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,4(%ebx) + jz .L008aw_end + + movl 8(%esi),%ecx + movl 8(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,8(%ebx) + jz .L008aw_end + + movl 12(%esi),%ecx + movl 12(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,12(%ebx) + jz .L008aw_end + + movl 16(%esi),%ecx + movl 16(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,16(%ebx) + jz .L008aw_end + + movl 20(%esi),%ecx + movl 20(%edi),%edx + addl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,20(%ebx) + jz .L008aw_end + + movl 24(%esi),%ecx + movl 24(%edi),%edx + addl 
%eax,%ecx + movl $0,%eax + adcl %eax,%eax + addl %edx,%ecx + adcl $0,%eax + movl %ecx,24(%ebx) +.L008aw_end: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size bn_add_words,.-.L_bn_add_words_begin +.globl bn_sub_words +.hidden bn_sub_words +.type bn_sub_words,@function +.align 16 +bn_sub_words: +.L_bn_sub_words_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + + movl 20(%esp),%ebx + movl 24(%esp),%esi + movl 28(%esp),%edi + movl 32(%esp),%ebp + xorl %eax,%eax + andl $4294967288,%ebp + jz .L009aw_finish +.L010aw_loop: + + movl (%esi),%ecx + movl (%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + movl %ecx,(%ebx) + + movl 4(%esi),%ecx + movl 4(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + movl %ecx,4(%ebx) + + movl 8(%esi),%ecx + movl 8(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + movl %ecx,8(%ebx) + + movl 12(%esi),%ecx + movl 12(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + movl %ecx,12(%ebx) + + movl 16(%esi),%ecx + movl 16(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + movl %ecx,16(%ebx) + + movl 20(%esi),%ecx + movl 20(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + movl %ecx,20(%ebx) + + movl 24(%esi),%ecx + movl 24(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + movl %ecx,24(%ebx) + + movl 28(%esi),%ecx + movl 28(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + movl %ecx,28(%ebx) + + addl $32,%esi + addl $32,%edi + addl $32,%ebx + subl $8,%ebp + jnz .L010aw_loop +.L009aw_finish: + movl 32(%esp),%ebp + andl $7,%ebp + jz .L011aw_end + + movl (%esi),%ecx + movl (%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + decl %ebp + movl 
%ecx,(%ebx) + jz .L011aw_end + + movl 4(%esi),%ecx + movl 4(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,4(%ebx) + jz .L011aw_end + + movl 8(%esi),%ecx + movl 8(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,8(%ebx) + jz .L011aw_end + + movl 12(%esi),%ecx + movl 12(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,12(%ebx) + jz .L011aw_end + + movl 16(%esi),%ecx + movl 16(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,16(%ebx) + jz .L011aw_end + + movl 20(%esi),%ecx + movl 20(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + decl %ebp + movl %ecx,20(%ebx) + jz .L011aw_end + + movl 24(%esi),%ecx + movl 24(%edi),%edx + subl %eax,%ecx + movl $0,%eax + adcl %eax,%eax + subl %edx,%ecx + adcl $0,%eax + movl %ecx,24(%ebx) +.L011aw_end: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size bn_sub_words,.-.L_bn_sub_words_begin +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) diff --git a/third_party/boringssl/gen/bcm/bn-586-win.asm b/third_party/boringssl/gen/bcm/bn-586-win.asm new file mode 100644 index 00000000..03740cdd --- /dev/null +++ b/third_party/boringssl/gen/bcm/bn-586-win.asm @@ -0,0 +1,536 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_internal_x86_win_asm.inc" +%endif +%ifidn __OUTPUT_FORMAT__, win32 +%ifidn __OUTPUT_FORMAT__,obj +section code use32 class=code align=64 +%elifidn __OUTPUT_FORMAT__,win32 +$@feat.00 equ 1 +section .text code align=64 +%else +section .text code +%endif +global _bn_mul_add_words +align 16 +_bn_mul_add_words: +L$_bn_mul_add_words_begin: + mov eax,DWORD [4+esp] + mov edx,DWORD [8+esp] + mov ecx,DWORD [12+esp] + movd mm0,DWORD [16+esp] + pxor mm1,mm1 + jmp NEAR L$000maw_sse2_entry +align 16 +L$001maw_sse2_unrolled: + movd mm3,DWORD [eax] + paddq mm1,mm3 + movd mm2,DWORD [edx] + pmuludq mm2,mm0 + movd mm4,DWORD [4+edx] + pmuludq mm4,mm0 + movd mm6,DWORD [8+edx] + pmuludq mm6,mm0 + movd mm7,DWORD [12+edx] + pmuludq mm7,mm0 + paddq mm1,mm2 + movd mm3,DWORD [4+eax] + paddq mm3,mm4 + movd mm5,DWORD [8+eax] + paddq mm5,mm6 + movd mm4,DWORD [12+eax] + paddq mm7,mm4 + movd DWORD [eax],mm1 + movd mm2,DWORD [16+edx] + pmuludq mm2,mm0 + psrlq mm1,32 + movd mm4,DWORD [20+edx] + pmuludq mm4,mm0 + paddq mm1,mm3 + movd mm6,DWORD [24+edx] + pmuludq mm6,mm0 + movd DWORD [4+eax],mm1 + psrlq mm1,32 + movd mm3,DWORD [28+edx] + add edx,32 + pmuludq mm3,mm0 + paddq mm1,mm5 + movd mm5,DWORD [16+eax] + paddq mm2,mm5 + movd DWORD [8+eax],mm1 + psrlq mm1,32 + paddq mm1,mm7 + movd mm5,DWORD [20+eax] + paddq mm4,mm5 + movd DWORD [12+eax],mm1 + psrlq mm1,32 + paddq mm1,mm2 + movd mm5,DWORD [24+eax] + paddq mm6,mm5 + movd DWORD [16+eax],mm1 + psrlq mm1,32 + paddq mm1,mm4 + movd mm5,DWORD [28+eax] + paddq mm3,mm5 + movd DWORD [20+eax],mm1 + psrlq mm1,32 + paddq mm1,mm6 + movd DWORD [24+eax],mm1 + psrlq mm1,32 + paddq mm1,mm3 + movd DWORD [28+eax],mm1 + lea eax,[32+eax] + psrlq mm1,32 + sub ecx,8 + jz NEAR L$002maw_sse2_exit +L$000maw_sse2_entry: + test ecx,4294967288 + jnz NEAR L$001maw_sse2_unrolled +align 4 +L$003maw_sse2_loop: + movd mm2,DWORD [edx] + movd mm3,DWORD [eax] + pmuludq mm2,mm0 + lea edx,[4+edx] + paddq mm1,mm3 + paddq 
mm1,mm2 + movd DWORD [eax],mm1 + sub ecx,1 + psrlq mm1,32 + lea eax,[4+eax] + jnz NEAR L$003maw_sse2_loop +L$002maw_sse2_exit: + movd eax,mm1 + emms + ret + pop edi + pop esi + pop ebx + pop ebp + ret +global _bn_mul_words +align 16 +_bn_mul_words: +L$_bn_mul_words_begin: + mov eax,DWORD [4+esp] + mov edx,DWORD [8+esp] + mov ecx,DWORD [12+esp] + movd mm0,DWORD [16+esp] + pxor mm1,mm1 +align 16 +L$004mw_sse2_loop: + movd mm2,DWORD [edx] + pmuludq mm2,mm0 + lea edx,[4+edx] + paddq mm1,mm2 + movd DWORD [eax],mm1 + sub ecx,1 + psrlq mm1,32 + lea eax,[4+eax] + jnz NEAR L$004mw_sse2_loop + movd eax,mm1 + emms + ret + pop edi + pop esi + pop ebx + pop ebp + ret +global _bn_sqr_add_words +align 16 +_bn_sqr_add_words: +L$_bn_sqr_add_words_begin: + mov eax,DWORD [4+esp] + mov edx,DWORD [8+esp] + mov ecx,DWORD [12+esp] + pxor mm1,mm1 +align 16 +L$005sqr_sse2_loop: + movd mm0,DWORD [edx] + movd mm2,DWORD [eax] + movd mm3,DWORD [4+eax] + pmuludq mm0,mm0 + lea edx,[4+edx] + paddq mm1,mm0 + paddq mm1,mm2 + movd DWORD [eax],mm1 + psrlq mm1,32 + paddq mm1,mm3 + movd DWORD [4+eax],mm1 + psrlq mm1,32 + sub ecx,1 + lea eax,[8+eax] + jnz NEAR L$005sqr_sse2_loop + emms + ret + pop edi + pop esi + pop ebx + pop ebp + ret +global _bn_add_words +align 16 +_bn_add_words: +L$_bn_add_words_begin: + push ebp + push ebx + push esi + push edi + ; + mov ebx,DWORD [20+esp] + mov esi,DWORD [24+esp] + mov edi,DWORD [28+esp] + mov ebp,DWORD [32+esp] + xor eax,eax + and ebp,4294967288 + jz NEAR L$006aw_finish +L$007aw_loop: + ; Round 0 + mov ecx,DWORD [esi] + mov edx,DWORD [edi] + add ecx,eax + mov eax,0 + adc eax,eax + add ecx,edx + adc eax,0 + mov DWORD [ebx],ecx + ; Round 1 + mov ecx,DWORD [4+esi] + mov edx,DWORD [4+edi] + add ecx,eax + mov eax,0 + adc eax,eax + add ecx,edx + adc eax,0 + mov DWORD [4+ebx],ecx + ; Round 2 + mov ecx,DWORD [8+esi] + mov edx,DWORD [8+edi] + add ecx,eax + mov eax,0 + adc eax,eax + add ecx,edx + adc eax,0 + mov DWORD [8+ebx],ecx + ; Round 3 + mov ecx,DWORD [12+esi] + mov 
edx,DWORD [12+edi] + add ecx,eax + mov eax,0 + adc eax,eax + add ecx,edx + adc eax,0 + mov DWORD [12+ebx],ecx + ; Round 4 + mov ecx,DWORD [16+esi] + mov edx,DWORD [16+edi] + add ecx,eax + mov eax,0 + adc eax,eax + add ecx,edx + adc eax,0 + mov DWORD [16+ebx],ecx + ; Round 5 + mov ecx,DWORD [20+esi] + mov edx,DWORD [20+edi] + add ecx,eax + mov eax,0 + adc eax,eax + add ecx,edx + adc eax,0 + mov DWORD [20+ebx],ecx + ; Round 6 + mov ecx,DWORD [24+esi] + mov edx,DWORD [24+edi] + add ecx,eax + mov eax,0 + adc eax,eax + add ecx,edx + adc eax,0 + mov DWORD [24+ebx],ecx + ; Round 7 + mov ecx,DWORD [28+esi] + mov edx,DWORD [28+edi] + add ecx,eax + mov eax,0 + adc eax,eax + add ecx,edx + adc eax,0 + mov DWORD [28+ebx],ecx + ; + add esi,32 + add edi,32 + add ebx,32 + sub ebp,8 + jnz NEAR L$007aw_loop +L$006aw_finish: + mov ebp,DWORD [32+esp] + and ebp,7 + jz NEAR L$008aw_end + ; Tail Round 0 + mov ecx,DWORD [esi] + mov edx,DWORD [edi] + add ecx,eax + mov eax,0 + adc eax,eax + add ecx,edx + adc eax,0 + dec ebp + mov DWORD [ebx],ecx + jz NEAR L$008aw_end + ; Tail Round 1 + mov ecx,DWORD [4+esi] + mov edx,DWORD [4+edi] + add ecx,eax + mov eax,0 + adc eax,eax + add ecx,edx + adc eax,0 + dec ebp + mov DWORD [4+ebx],ecx + jz NEAR L$008aw_end + ; Tail Round 2 + mov ecx,DWORD [8+esi] + mov edx,DWORD [8+edi] + add ecx,eax + mov eax,0 + adc eax,eax + add ecx,edx + adc eax,0 + dec ebp + mov DWORD [8+ebx],ecx + jz NEAR L$008aw_end + ; Tail Round 3 + mov ecx,DWORD [12+esi] + mov edx,DWORD [12+edi] + add ecx,eax + mov eax,0 + adc eax,eax + add ecx,edx + adc eax,0 + dec ebp + mov DWORD [12+ebx],ecx + jz NEAR L$008aw_end + ; Tail Round 4 + mov ecx,DWORD [16+esi] + mov edx,DWORD [16+edi] + add ecx,eax + mov eax,0 + adc eax,eax + add ecx,edx + adc eax,0 + dec ebp + mov DWORD [16+ebx],ecx + jz NEAR L$008aw_end + ; Tail Round 5 + mov ecx,DWORD [20+esi] + mov edx,DWORD [20+edi] + add ecx,eax + mov eax,0 + adc eax,eax + add ecx,edx + adc eax,0 + dec ebp + mov DWORD [20+ebx],ecx + jz NEAR 
L$008aw_end + ; Tail Round 6 + mov ecx,DWORD [24+esi] + mov edx,DWORD [24+edi] + add ecx,eax + mov eax,0 + adc eax,eax + add ecx,edx + adc eax,0 + mov DWORD [24+ebx],ecx +L$008aw_end: + pop edi + pop esi + pop ebx + pop ebp + ret +global _bn_sub_words +align 16 +_bn_sub_words: +L$_bn_sub_words_begin: + push ebp + push ebx + push esi + push edi + ; + mov ebx,DWORD [20+esp] + mov esi,DWORD [24+esp] + mov edi,DWORD [28+esp] + mov ebp,DWORD [32+esp] + xor eax,eax + and ebp,4294967288 + jz NEAR L$009aw_finish +L$010aw_loop: + ; Round 0 + mov ecx,DWORD [esi] + mov edx,DWORD [edi] + sub ecx,eax + mov eax,0 + adc eax,eax + sub ecx,edx + adc eax,0 + mov DWORD [ebx],ecx + ; Round 1 + mov ecx,DWORD [4+esi] + mov edx,DWORD [4+edi] + sub ecx,eax + mov eax,0 + adc eax,eax + sub ecx,edx + adc eax,0 + mov DWORD [4+ebx],ecx + ; Round 2 + mov ecx,DWORD [8+esi] + mov edx,DWORD [8+edi] + sub ecx,eax + mov eax,0 + adc eax,eax + sub ecx,edx + adc eax,0 + mov DWORD [8+ebx],ecx + ; Round 3 + mov ecx,DWORD [12+esi] + mov edx,DWORD [12+edi] + sub ecx,eax + mov eax,0 + adc eax,eax + sub ecx,edx + adc eax,0 + mov DWORD [12+ebx],ecx + ; Round 4 + mov ecx,DWORD [16+esi] + mov edx,DWORD [16+edi] + sub ecx,eax + mov eax,0 + adc eax,eax + sub ecx,edx + adc eax,0 + mov DWORD [16+ebx],ecx + ; Round 5 + mov ecx,DWORD [20+esi] + mov edx,DWORD [20+edi] + sub ecx,eax + mov eax,0 + adc eax,eax + sub ecx,edx + adc eax,0 + mov DWORD [20+ebx],ecx + ; Round 6 + mov ecx,DWORD [24+esi] + mov edx,DWORD [24+edi] + sub ecx,eax + mov eax,0 + adc eax,eax + sub ecx,edx + adc eax,0 + mov DWORD [24+ebx],ecx + ; Round 7 + mov ecx,DWORD [28+esi] + mov edx,DWORD [28+edi] + sub ecx,eax + mov eax,0 + adc eax,eax + sub ecx,edx + adc eax,0 + mov DWORD [28+ebx],ecx + ; + add esi,32 + add edi,32 + add ebx,32 + sub ebp,8 + jnz NEAR L$010aw_loop +L$009aw_finish: + mov ebp,DWORD [32+esp] + and ebp,7 + jz NEAR L$011aw_end + ; Tail Round 0 + mov ecx,DWORD [esi] + mov edx,DWORD [edi] + sub ecx,eax + mov eax,0 + adc eax,eax + sub 
ecx,edx + adc eax,0 + dec ebp + mov DWORD [ebx],ecx + jz NEAR L$011aw_end + ; Tail Round 1 + mov ecx,DWORD [4+esi] + mov edx,DWORD [4+edi] + sub ecx,eax + mov eax,0 + adc eax,eax + sub ecx,edx + adc eax,0 + dec ebp + mov DWORD [4+ebx],ecx + jz NEAR L$011aw_end + ; Tail Round 2 + mov ecx,DWORD [8+esi] + mov edx,DWORD [8+edi] + sub ecx,eax + mov eax,0 + adc eax,eax + sub ecx,edx + adc eax,0 + dec ebp + mov DWORD [8+ebx],ecx + jz NEAR L$011aw_end + ; Tail Round 3 + mov ecx,DWORD [12+esi] + mov edx,DWORD [12+edi] + sub ecx,eax + mov eax,0 + adc eax,eax + sub ecx,edx + adc eax,0 + dec ebp + mov DWORD [12+ebx],ecx + jz NEAR L$011aw_end + ; Tail Round 4 + mov ecx,DWORD [16+esi] + mov edx,DWORD [16+edi] + sub ecx,eax + mov eax,0 + adc eax,eax + sub ecx,edx + adc eax,0 + dec ebp + mov DWORD [16+ebx],ecx + jz NEAR L$011aw_end + ; Tail Round 5 + mov ecx,DWORD [20+esi] + mov edx,DWORD [20+edi] + sub ecx,eax + mov eax,0 + adc eax,eax + sub ecx,edx + adc eax,0 + dec ebp + mov DWORD [20+ebx],ecx + jz NEAR L$011aw_end + ; Tail Round 6 + mov ecx,DWORD [24+esi] + mov edx,DWORD [24+edi] + sub ecx,eax + mov eax,0 + adc eax,eax + sub ecx,edx + adc eax,0 + mov DWORD [24+ebx],ecx +L$011aw_end: + pop edi + pop esi + pop ebx + pop ebp + ret +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/third_party/boringssl/gen/bcm/bn-armv8-apple.S b/third_party/boringssl/gen/bcm/bn-armv8-apple.S new file mode 100644 index 00000000..d37c48e8 --- /dev/null +++ b/third_party/boringssl/gen/bcm/bn-armv8-apple.S @@ -0,0 +1,87 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) +.text + +// BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, +// size_t num); + +.globl _bn_add_words +.private_extern _bn_add_words +.align 4 +_bn_add_words: + AARCH64_VALID_CALL_TARGET + # Clear the carry flag. + cmn xzr, xzr + + # aarch64 can load two registers at a time, so we do two loop iterations at + # at a time. Split x3 = 2 * x8 + x3. This allows loop + # operations to use CBNZ without clobbering the carry flag. + lsr x8, x3, #1 + and x3, x3, #1 + + cbz x8, Ladd_tail +Ladd_loop: + ldp x4, x5, [x1], #16 + ldp x6, x7, [x2], #16 + sub x8, x8, #1 + adcs x4, x4, x6 + adcs x5, x5, x7 + stp x4, x5, [x0], #16 + cbnz x8, Ladd_loop + +Ladd_tail: + cbz x3, Ladd_exit + ldr x4, [x1], #8 + ldr x6, [x2], #8 + adcs x4, x4, x6 + str x4, [x0], #8 + +Ladd_exit: + cset x0, cs + ret + + +// BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, +// size_t num); + +.globl _bn_sub_words +.private_extern _bn_sub_words +.align 4 +_bn_sub_words: + AARCH64_VALID_CALL_TARGET + # Set the carry flag. Arm's borrow bit is flipped from the carry flag, + # so we want C = 1 here. + cmp xzr, xzr + + # aarch64 can load two registers at a time, so we do two loop iterations at + # at a time. Split x3 = 2 * x8 + x3. This allows loop + # operations to use CBNZ without clobbering the carry flag. 
+ lsr x8, x3, #1 + and x3, x3, #1 + + cbz x8, Lsub_tail +Lsub_loop: + ldp x4, x5, [x1], #16 + ldp x6, x7, [x2], #16 + sub x8, x8, #1 + sbcs x4, x4, x6 + sbcs x5, x5, x7 + stp x4, x5, [x0], #16 + cbnz x8, Lsub_loop + +Lsub_tail: + cbz x3, Lsub_exit + ldr x4, [x1], #8 + ldr x6, [x2], #8 + sbcs x4, x4, x6 + str x4, [x0], #8 + +Lsub_exit: + cset x0, cc + ret + +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) diff --git a/third_party/boringssl/gen/bcm/bn-armv8-linux.S b/third_party/boringssl/gen/bcm/bn-armv8-linux.S new file mode 100644 index 00000000..50be088d --- /dev/null +++ b/third_party/boringssl/gen/bcm/bn-armv8-linux.S @@ -0,0 +1,87 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) +.text + +// BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, +// size_t num); +.type bn_add_words, %function +.globl bn_add_words +.hidden bn_add_words +.align 4 +bn_add_words: + AARCH64_VALID_CALL_TARGET + # Clear the carry flag. + cmn xzr, xzr + + # aarch64 can load two registers at a time, so we do two loop iterations at + # at a time. Split x3 = 2 * x8 + x3. This allows loop + # operations to use CBNZ without clobbering the carry flag. 
+ lsr x8, x3, #1 + and x3, x3, #1 + + cbz x8, .Ladd_tail +.Ladd_loop: + ldp x4, x5, [x1], #16 + ldp x6, x7, [x2], #16 + sub x8, x8, #1 + adcs x4, x4, x6 + adcs x5, x5, x7 + stp x4, x5, [x0], #16 + cbnz x8, .Ladd_loop + +.Ladd_tail: + cbz x3, .Ladd_exit + ldr x4, [x1], #8 + ldr x6, [x2], #8 + adcs x4, x4, x6 + str x4, [x0], #8 + +.Ladd_exit: + cset x0, cs + ret +.size bn_add_words,.-bn_add_words + +// BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, +// size_t num); +.type bn_sub_words, %function +.globl bn_sub_words +.hidden bn_sub_words +.align 4 +bn_sub_words: + AARCH64_VALID_CALL_TARGET + # Set the carry flag. Arm's borrow bit is flipped from the carry flag, + # so we want C = 1 here. + cmp xzr, xzr + + # aarch64 can load two registers at a time, so we do two loop iterations at + # at a time. Split x3 = 2 * x8 + x3. This allows loop + # operations to use CBNZ without clobbering the carry flag. + lsr x8, x3, #1 + and x3, x3, #1 + + cbz x8, .Lsub_tail +.Lsub_loop: + ldp x4, x5, [x1], #16 + ldp x6, x7, [x2], #16 + sub x8, x8, #1 + sbcs x4, x4, x6 + sbcs x5, x5, x7 + stp x4, x5, [x0], #16 + cbnz x8, .Lsub_loop + +.Lsub_tail: + cbz x3, .Lsub_exit + ldr x4, [x1], #8 + ldr x6, [x2], #8 + sbcs x4, x4, x6 + str x4, [x0], #8 + +.Lsub_exit: + cset x0, cc + ret +.size bn_sub_words,.-bn_sub_words +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) diff --git a/third_party/boringssl/gen/bcm/bn-armv8-win.S b/third_party/boringssl/gen/bcm/bn-armv8-win.S new file mode 100644 index 00000000..6131bd94 --- /dev/null +++ b/third_party/boringssl/gen/bcm/bn-armv8-win.S @@ -0,0 +1,87 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +.text + +// BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, +// size_t num); + +.globl bn_add_words + +.align 4 +bn_add_words: + AARCH64_VALID_CALL_TARGET + # Clear the carry flag. + cmn xzr, xzr + + # aarch64 can load two registers at a time, so we do two loop iterations at + # at a time. Split x3 = 2 * x8 + x3. This allows loop + # operations to use CBNZ without clobbering the carry flag. + lsr x8, x3, #1 + and x3, x3, #1 + + cbz x8, Ladd_tail +Ladd_loop: + ldp x4, x5, [x1], #16 + ldp x6, x7, [x2], #16 + sub x8, x8, #1 + adcs x4, x4, x6 + adcs x5, x5, x7 + stp x4, x5, [x0], #16 + cbnz x8, Ladd_loop + +Ladd_tail: + cbz x3, Ladd_exit + ldr x4, [x1], #8 + ldr x6, [x2], #8 + adcs x4, x4, x6 + str x4, [x0], #8 + +Ladd_exit: + cset x0, cs + ret + + +// BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, +// size_t num); + +.globl bn_sub_words + +.align 4 +bn_sub_words: + AARCH64_VALID_CALL_TARGET + # Set the carry flag. Arm's borrow bit is flipped from the carry flag, + # so we want C = 1 here. + cmp xzr, xzr + + # aarch64 can load two registers at a time, so we do two loop iterations at + # at a time. Split x3 = 2 * x8 + x3. This allows loop + # operations to use CBNZ without clobbering the carry flag. 
+ lsr x8, x3, #1 + and x3, x3, #1 + + cbz x8, Lsub_tail +Lsub_loop: + ldp x4, x5, [x1], #16 + ldp x6, x7, [x2], #16 + sub x8, x8, #1 + sbcs x4, x4, x6 + sbcs x5, x5, x7 + stp x4, x5, [x0], #16 + cbnz x8, Lsub_loop + +Lsub_tail: + cbz x3, Lsub_exit + ldr x4, [x1], #8 + ldr x6, [x2], #8 + sbcs x4, x4, x6 + str x4, [x0], #8 + +Lsub_exit: + cset x0, cc + ret + +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) diff --git a/third_party/boringssl/gen/bcm/bsaes-armv7-linux.S b/third_party/boringssl/gen/bcm/bsaes-armv7-linux.S new file mode 100644 index 00000000..01177475 --- /dev/null +++ b/third_party/boringssl/gen/bcm/bsaes-armv7-linux.S @@ -0,0 +1,1520 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__) +@ Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved. +@ +@ Licensed under the Apache License, Version 2.0 (the "License"); +@ you may not use this file except in compliance with the License. +@ You may obtain a copy of the License at +@ +@ https://www.apache.org/licenses/LICENSE-2.0 +@ +@ Unless required by applicable law or agreed to in writing, software +@ distributed under the License is distributed on an "AS IS" BASIS, +@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ See the License for the specific language governing permissions and +@ limitations under the License. + + +@ ==================================================================== +@ Written by Andy Polyakov for the OpenSSL +@ project. +@ +@ Specific modes and adaptation for Linux kernel by Ard Biesheuvel +@ of Linaro. +@ ==================================================================== + +@ Bit-sliced AES for ARM NEON +@ +@ February 2012. +@ +@ This implementation is direct adaptation of bsaes-x86_64 module for +@ ARM NEON. 
Except that this module is endian-neutral [in sense that +@ it can be compiled for either endianness] by courtesy of vld1.8's +@ neutrality. Initial version doesn't implement interface to OpenSSL, +@ only low-level primitives and unsupported entry points, just enough +@ to collect performance results, which for Cortex-A8 core are: +@ +@ encrypt 19.5 cycles per byte processed with 128-bit key +@ decrypt 22.1 cycles per byte processed with 128-bit key +@ key conv. 440 cycles per 128-bit key/0.18 of 8x block +@ +@ Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7, +@ which is [much] worse than anticipated (for further details see +@ http://www.openssl.org/~appro/Snapdragon-S4.html). +@ +@ Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code +@ manages in 20.0 cycles]. +@ +@ When comparing to x86_64 results keep in mind that NEON unit is +@ [mostly] single-issue and thus can't [fully] benefit from +@ instruction-level parallelism. And when comparing to aes-armv4 +@ results keep in mind key schedule conversion overhead (see +@ bsaes-x86_64.pl for further details)... +@ +@ + +@ April-August 2013 +@ Add CBC, CTR and XTS subroutines and adapt for kernel use; courtesy of Ard. + +#ifndef __KERNEL__ +# define VFP_ABI_PUSH vstmdb sp!,{d8-d15} +# define VFP_ABI_POP vldmia sp!,{d8-d15} +# define VFP_ABI_FRAME 0x40 +#else +# define VFP_ABI_PUSH +# define VFP_ABI_POP +# define VFP_ABI_FRAME 0 +# define BSAES_ASM_EXTENDED_KEY +# define XTS_CHAIN_TWEAK +# define __ARM_MAX_ARCH__ 7 +#endif + +#ifdef __thumb__ +# define adrl adr +#endif + +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.text +.syntax unified @ ARMv7-capable assembler is expected to handle this +#if defined(__thumb2__) && !defined(__APPLE__) +.thumb +#else +.code 32 +# undef __thumb2__ +#endif + +.type _bsaes_decrypt8,%function +.align 4 +_bsaes_decrypt8: + adr r6,. 
+ vldmia r4!, {q9} @ round 0 key +#if defined(__thumb2__) || defined(__APPLE__) + adr r6,.LM0ISR +#else + add r6,r6,#.LM0ISR-_bsaes_decrypt8 +#endif + + vldmia r6!, {q8} @ .LM0ISR + veor q10, q0, q9 @ xor with round0 key + veor q11, q1, q9 + vtbl.8 d0, {q10}, d16 + vtbl.8 d1, {q10}, d17 + veor q12, q2, q9 + vtbl.8 d2, {q11}, d16 + vtbl.8 d3, {q11}, d17 + veor q13, q3, q9 + vtbl.8 d4, {q12}, d16 + vtbl.8 d5, {q12}, d17 + veor q14, q4, q9 + vtbl.8 d6, {q13}, d16 + vtbl.8 d7, {q13}, d17 + veor q15, q5, q9 + vtbl.8 d8, {q14}, d16 + vtbl.8 d9, {q14}, d17 + veor q10, q6, q9 + vtbl.8 d10, {q15}, d16 + vtbl.8 d11, {q15}, d17 + veor q11, q7, q9 + vtbl.8 d12, {q10}, d16 + vtbl.8 d13, {q10}, d17 + vtbl.8 d14, {q11}, d16 + vtbl.8 d15, {q11}, d17 + vmov.i8 q8,#0x55 @ compose .LBS0 + vmov.i8 q9,#0x33 @ compose .LBS1 + vshr.u64 q10, q6, #1 + vshr.u64 q11, q4, #1 + veor q10, q10, q7 + veor q11, q11, q5 + vand q10, q10, q8 + vand q11, q11, q8 + veor q7, q7, q10 + vshl.u64 q10, q10, #1 + veor q5, q5, q11 + vshl.u64 q11, q11, #1 + veor q6, q6, q10 + veor q4, q4, q11 + vshr.u64 q10, q2, #1 + vshr.u64 q11, q0, #1 + veor q10, q10, q3 + veor q11, q11, q1 + vand q10, q10, q8 + vand q11, q11, q8 + veor q3, q3, q10 + vshl.u64 q10, q10, #1 + veor q1, q1, q11 + vshl.u64 q11, q11, #1 + veor q2, q2, q10 + veor q0, q0, q11 + vmov.i8 q8,#0x0f @ compose .LBS2 + vshr.u64 q10, q5, #2 + vshr.u64 q11, q4, #2 + veor q10, q10, q7 + veor q11, q11, q6 + vand q10, q10, q9 + vand q11, q11, q9 + veor q7, q7, q10 + vshl.u64 q10, q10, #2 + veor q6, q6, q11 + vshl.u64 q11, q11, #2 + veor q5, q5, q10 + veor q4, q4, q11 + vshr.u64 q10, q1, #2 + vshr.u64 q11, q0, #2 + veor q10, q10, q3 + veor q11, q11, q2 + vand q10, q10, q9 + vand q11, q11, q9 + veor q3, q3, q10 + vshl.u64 q10, q10, #2 + veor q2, q2, q11 + vshl.u64 q11, q11, #2 + veor q1, q1, q10 + veor q0, q0, q11 + vshr.u64 q10, q3, #4 + vshr.u64 q11, q2, #4 + veor q10, q10, q7 + veor q11, q11, q6 + vand q10, q10, q8 + vand q11, q11, q8 + veor q7, q7, q10 + 
vshl.u64 q10, q10, #4 + veor q6, q6, q11 + vshl.u64 q11, q11, #4 + veor q3, q3, q10 + veor q2, q2, q11 + vshr.u64 q10, q1, #4 + vshr.u64 q11, q0, #4 + veor q10, q10, q5 + veor q11, q11, q4 + vand q10, q10, q8 + vand q11, q11, q8 + veor q5, q5, q10 + vshl.u64 q10, q10, #4 + veor q4, q4, q11 + vshl.u64 q11, q11, #4 + veor q1, q1, q10 + veor q0, q0, q11 + sub r5,r5,#1 + b .Ldec_sbox +.align 4 +.Ldec_loop: + vldmia r4!, {q8,q9,q10,q11} + veor q8, q8, q0 + veor q9, q9, q1 + vtbl.8 d0, {q8}, d24 + vtbl.8 d1, {q8}, d25 + vldmia r4!, {q8} + veor q10, q10, q2 + vtbl.8 d2, {q9}, d24 + vtbl.8 d3, {q9}, d25 + vldmia r4!, {q9} + veor q11, q11, q3 + vtbl.8 d4, {q10}, d24 + vtbl.8 d5, {q10}, d25 + vldmia r4!, {q10} + vtbl.8 d6, {q11}, d24 + vtbl.8 d7, {q11}, d25 + vldmia r4!, {q11} + veor q8, q8, q4 + veor q9, q9, q5 + vtbl.8 d8, {q8}, d24 + vtbl.8 d9, {q8}, d25 + veor q10, q10, q6 + vtbl.8 d10, {q9}, d24 + vtbl.8 d11, {q9}, d25 + veor q11, q11, q7 + vtbl.8 d12, {q10}, d24 + vtbl.8 d13, {q10}, d25 + vtbl.8 d14, {q11}, d24 + vtbl.8 d15, {q11}, d25 +.Ldec_sbox: + veor q1, q1, q4 + veor q3, q3, q4 + + veor q4, q4, q7 + veor q1, q1, q6 + veor q2, q2, q7 + veor q6, q6, q4 + + veor q0, q0, q1 + veor q2, q2, q5 + veor q7, q7, q6 + veor q3, q3, q0 + veor q5, q5, q0 + veor q1, q1, q3 + veor q11, q3, q0 + veor q10, q7, q4 + veor q9, q1, q6 + veor q13, q4, q0 + vmov q8, q10 + veor q12, q5, q2 + + vorr q10, q10, q9 + veor q15, q11, q8 + vand q14, q11, q12 + vorr q11, q11, q12 + veor q12, q12, q9 + vand q8, q8, q9 + veor q9, q6, q2 + vand q15, q15, q12 + vand q13, q13, q9 + veor q9, q3, q7 + veor q12, q1, q5 + veor q11, q11, q13 + veor q10, q10, q13 + vand q13, q9, q12 + vorr q9, q9, q12 + veor q11, q11, q15 + veor q8, q8, q13 + veor q10, q10, q14 + veor q9, q9, q15 + veor q8, q8, q14 + vand q12, q4, q6 + veor q9, q9, q14 + vand q13, q0, q2 + vand q14, q7, q1 + vorr q15, q3, q5 + veor q11, q11, q12 + veor q9, q9, q14 + veor q8, q8, q15 + veor q10, q10, q13 + + @ Inv_GF16 0, 1, 2, 3, s0, s1, 
s2, s3 + + @ new smaller inversion + + vand q14, q11, q9 + vmov q12, q8 + + veor q13, q10, q14 + veor q15, q8, q14 + veor q14, q8, q14 @ q14=q15 + + vbsl q13, q9, q8 + vbsl q15, q11, q10 + veor q11, q11, q10 + + vbsl q12, q13, q14 + vbsl q8, q14, q13 + + vand q14, q12, q15 + veor q9, q9, q8 + + veor q14, q14, q11 + veor q12, q5, q2 + veor q8, q1, q6 + veor q10, q15, q14 + vand q10, q10, q5 + veor q5, q5, q1 + vand q11, q1, q15 + vand q5, q5, q14 + veor q1, q11, q10 + veor q5, q5, q11 + veor q15, q15, q13 + veor q14, q14, q9 + veor q11, q15, q14 + veor q10, q13, q9 + vand q11, q11, q12 + vand q10, q10, q2 + veor q12, q12, q8 + veor q2, q2, q6 + vand q8, q8, q15 + vand q6, q6, q13 + vand q12, q12, q14 + vand q2, q2, q9 + veor q8, q8, q12 + veor q2, q2, q6 + veor q12, q12, q11 + veor q6, q6, q10 + veor q5, q5, q12 + veor q2, q2, q12 + veor q1, q1, q8 + veor q6, q6, q8 + + veor q12, q3, q0 + veor q8, q7, q4 + veor q11, q15, q14 + veor q10, q13, q9 + vand q11, q11, q12 + vand q10, q10, q0 + veor q12, q12, q8 + veor q0, q0, q4 + vand q8, q8, q15 + vand q4, q4, q13 + vand q12, q12, q14 + vand q0, q0, q9 + veor q8, q8, q12 + veor q0, q0, q4 + veor q12, q12, q11 + veor q4, q4, q10 + veor q15, q15, q13 + veor q14, q14, q9 + veor q10, q15, q14 + vand q10, q10, q3 + veor q3, q3, q7 + vand q11, q7, q15 + vand q3, q3, q14 + veor q7, q11, q10 + veor q3, q3, q11 + veor q3, q3, q12 + veor q0, q0, q12 + veor q7, q7, q8 + veor q4, q4, q8 + veor q1, q1, q7 + veor q6, q6, q5 + + veor q4, q4, q1 + veor q2, q2, q7 + veor q5, q5, q7 + veor q4, q4, q2 + veor q7, q7, q0 + veor q4, q4, q5 + veor q3, q3, q6 + veor q6, q6, q1 + veor q3, q3, q4 + + veor q4, q4, q0 + veor q7, q7, q3 + subs r5,r5,#1 + bcc .Ldec_done + @ multiplication by 0x05-0x00-0x04-0x00 + vext.8 q8, q0, q0, #8 + vext.8 q14, q3, q3, #8 + vext.8 q15, q5, q5, #8 + veor q8, q8, q0 + vext.8 q9, q1, q1, #8 + veor q14, q14, q3 + vext.8 q10, q6, q6, #8 + veor q15, q15, q5 + vext.8 q11, q4, q4, #8 + veor q9, q9, q1 + vext.8 q12, q2, 
q2, #8 + veor q10, q10, q6 + vext.8 q13, q7, q7, #8 + veor q11, q11, q4 + veor q12, q12, q2 + veor q13, q13, q7 + + veor q0, q0, q14 + veor q1, q1, q14 + veor q6, q6, q8 + veor q2, q2, q10 + veor q4, q4, q9 + veor q1, q1, q15 + veor q6, q6, q15 + veor q2, q2, q14 + veor q7, q7, q11 + veor q4, q4, q14 + veor q3, q3, q12 + veor q2, q2, q15 + veor q7, q7, q15 + veor q5, q5, q13 + vext.8 q8, q0, q0, #12 @ x0 <<< 32 + vext.8 q9, q1, q1, #12 + veor q0, q0, q8 @ x0 ^ (x0 <<< 32) + vext.8 q10, q6, q6, #12 + veor q1, q1, q9 + vext.8 q11, q4, q4, #12 + veor q6, q6, q10 + vext.8 q12, q2, q2, #12 + veor q4, q4, q11 + vext.8 q13, q7, q7, #12 + veor q2, q2, q12 + vext.8 q14, q3, q3, #12 + veor q7, q7, q13 + vext.8 q15, q5, q5, #12 + veor q3, q3, q14 + + veor q9, q9, q0 + veor q5, q5, q15 + vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64) + veor q10, q10, q1 + veor q8, q8, q5 + veor q9, q9, q5 + vext.8 q1, q1, q1, #8 + veor q13, q13, q2 + veor q0, q0, q8 + veor q14, q14, q7 + veor q1, q1, q9 + vext.8 q8, q2, q2, #8 + veor q12, q12, q4 + vext.8 q9, q7, q7, #8 + veor q15, q15, q3 + vext.8 q2, q4, q4, #8 + veor q11, q11, q6 + vext.8 q7, q5, q5, #8 + veor q12, q12, q5 + vext.8 q4, q3, q3, #8 + veor q11, q11, q5 + vext.8 q3, q6, q6, #8 + veor q5, q9, q13 + veor q11, q11, q2 + veor q7, q7, q15 + veor q6, q4, q14 + veor q4, q8, q12 + veor q2, q3, q10 + vmov q3, q11 + @ vmov q5, q9 + vldmia r6, {q12} @ .LISR + ite eq @ Thumb2 thing, sanity check in ARM + addeq r6,r6,#0x10 + bne .Ldec_loop + vldmia r6, {q12} @ .LISRM0 + b .Ldec_loop +.align 4 +.Ldec_done: + vmov.i8 q8,#0x55 @ compose .LBS0 + vmov.i8 q9,#0x33 @ compose .LBS1 + vshr.u64 q10, q3, #1 + vshr.u64 q11, q2, #1 + veor q10, q10, q5 + veor q11, q11, q7 + vand q10, q10, q8 + vand q11, q11, q8 + veor q5, q5, q10 + vshl.u64 q10, q10, #1 + veor q7, q7, q11 + vshl.u64 q11, q11, #1 + veor q3, q3, q10 + veor q2, q2, q11 + vshr.u64 q10, q6, #1 + vshr.u64 q11, q0, #1 + veor q10, q10, q4 + veor q11, q11, q1 + vand q10, q10, q8 + vand q11, 
q11, q8 + veor q4, q4, q10 + vshl.u64 q10, q10, #1 + veor q1, q1, q11 + vshl.u64 q11, q11, #1 + veor q6, q6, q10 + veor q0, q0, q11 + vmov.i8 q8,#0x0f @ compose .LBS2 + vshr.u64 q10, q7, #2 + vshr.u64 q11, q2, #2 + veor q10, q10, q5 + veor q11, q11, q3 + vand q10, q10, q9 + vand q11, q11, q9 + veor q5, q5, q10 + vshl.u64 q10, q10, #2 + veor q3, q3, q11 + vshl.u64 q11, q11, #2 + veor q7, q7, q10 + veor q2, q2, q11 + vshr.u64 q10, q1, #2 + vshr.u64 q11, q0, #2 + veor q10, q10, q4 + veor q11, q11, q6 + vand q10, q10, q9 + vand q11, q11, q9 + veor q4, q4, q10 + vshl.u64 q10, q10, #2 + veor q6, q6, q11 + vshl.u64 q11, q11, #2 + veor q1, q1, q10 + veor q0, q0, q11 + vshr.u64 q10, q4, #4 + vshr.u64 q11, q6, #4 + veor q10, q10, q5 + veor q11, q11, q3 + vand q10, q10, q8 + vand q11, q11, q8 + veor q5, q5, q10 + vshl.u64 q10, q10, #4 + veor q3, q3, q11 + vshl.u64 q11, q11, #4 + veor q4, q4, q10 + veor q6, q6, q11 + vshr.u64 q10, q1, #4 + vshr.u64 q11, q0, #4 + veor q10, q10, q7 + veor q11, q11, q2 + vand q10, q10, q8 + vand q11, q11, q8 + veor q7, q7, q10 + vshl.u64 q10, q10, #4 + veor q2, q2, q11 + vshl.u64 q11, q11, #4 + veor q1, q1, q10 + veor q0, q0, q11 + vldmia r4, {q8} @ last round key + veor q6, q6, q8 + veor q4, q4, q8 + veor q2, q2, q8 + veor q7, q7, q8 + veor q3, q3, q8 + veor q5, q5, q8 + veor q0, q0, q8 + veor q1, q1, q8 + bx lr +.size _bsaes_decrypt8,.-_bsaes_decrypt8 + +.type _bsaes_const,%object +.align 6 +_bsaes_const: +.LM0ISR:@ InvShiftRows constants +.quad 0x0a0e0206070b0f03, 0x0004080c0d010509 +.LISR: +.quad 0x0504070602010003, 0x0f0e0d0c080b0a09 +.LISRM0: +.quad 0x01040b0e0205080f, 0x0306090c00070a0d +.LM0SR:@ ShiftRows constants +.quad 0x0a0e02060f03070b, 0x0004080c05090d01 +.LSR: +.quad 0x0504070600030201, 0x0f0e0d0c0a09080b +.LSRM0: +.quad 0x0304090e00050a0f, 0x01060b0c0207080d +.LM0: +.quad 0x02060a0e03070b0f, 0x0004080c0105090d +.LREVM0SR: +.quad 0x090d01050c000408, 0x03070b0f060a0e02 +.byte 
66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102,111,114,32,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 6 +.size _bsaes_const,.-_bsaes_const + +.type _bsaes_encrypt8,%function +.align 4 +_bsaes_encrypt8: + adr r6,. + vldmia r4!, {q9} @ round 0 key +#if defined(__thumb2__) || defined(__APPLE__) + adr r6,.LM0SR +#else + sub r6,r6,#_bsaes_encrypt8-.LM0SR +#endif + + vldmia r6!, {q8} @ .LM0SR +_bsaes_encrypt8_alt: + veor q10, q0, q9 @ xor with round0 key + veor q11, q1, q9 + vtbl.8 d0, {q10}, d16 + vtbl.8 d1, {q10}, d17 + veor q12, q2, q9 + vtbl.8 d2, {q11}, d16 + vtbl.8 d3, {q11}, d17 + veor q13, q3, q9 + vtbl.8 d4, {q12}, d16 + vtbl.8 d5, {q12}, d17 + veor q14, q4, q9 + vtbl.8 d6, {q13}, d16 + vtbl.8 d7, {q13}, d17 + veor q15, q5, q9 + vtbl.8 d8, {q14}, d16 + vtbl.8 d9, {q14}, d17 + veor q10, q6, q9 + vtbl.8 d10, {q15}, d16 + vtbl.8 d11, {q15}, d17 + veor q11, q7, q9 + vtbl.8 d12, {q10}, d16 + vtbl.8 d13, {q10}, d17 + vtbl.8 d14, {q11}, d16 + vtbl.8 d15, {q11}, d17 +_bsaes_encrypt8_bitslice: + vmov.i8 q8,#0x55 @ compose .LBS0 + vmov.i8 q9,#0x33 @ compose .LBS1 + vshr.u64 q10, q6, #1 + vshr.u64 q11, q4, #1 + veor q10, q10, q7 + veor q11, q11, q5 + vand q10, q10, q8 + vand q11, q11, q8 + veor q7, q7, q10 + vshl.u64 q10, q10, #1 + veor q5, q5, q11 + vshl.u64 q11, q11, #1 + veor q6, q6, q10 + veor q4, q4, q11 + vshr.u64 q10, q2, #1 + vshr.u64 q11, q0, #1 + veor q10, q10, q3 + veor q11, q11, q1 + vand q10, q10, q8 + vand q11, q11, q8 + veor q3, q3, q10 + vshl.u64 q10, q10, #1 + veor q1, q1, q11 + vshl.u64 q11, q11, #1 + veor q2, q2, q10 + veor q0, q0, q11 + vmov.i8 q8,#0x0f @ compose .LBS2 + vshr.u64 q10, q5, #2 + vshr.u64 q11, q4, #2 + veor q10, q10, q7 + veor q11, q11, q6 + vand q10, q10, q9 + vand q11, q11, q9 + veor q7, q7, q10 + vshl.u64 q10, q10, #2 + veor q6, q6, q11 + vshl.u64 q11, q11, #2 + veor q5, q5, q10 + veor q4, q4, q11 + vshr.u64 q10, q1, #2 + 
vshr.u64 q11, q0, #2 + veor q10, q10, q3 + veor q11, q11, q2 + vand q10, q10, q9 + vand q11, q11, q9 + veor q3, q3, q10 + vshl.u64 q10, q10, #2 + veor q2, q2, q11 + vshl.u64 q11, q11, #2 + veor q1, q1, q10 + veor q0, q0, q11 + vshr.u64 q10, q3, #4 + vshr.u64 q11, q2, #4 + veor q10, q10, q7 + veor q11, q11, q6 + vand q10, q10, q8 + vand q11, q11, q8 + veor q7, q7, q10 + vshl.u64 q10, q10, #4 + veor q6, q6, q11 + vshl.u64 q11, q11, #4 + veor q3, q3, q10 + veor q2, q2, q11 + vshr.u64 q10, q1, #4 + vshr.u64 q11, q0, #4 + veor q10, q10, q5 + veor q11, q11, q4 + vand q10, q10, q8 + vand q11, q11, q8 + veor q5, q5, q10 + vshl.u64 q10, q10, #4 + veor q4, q4, q11 + vshl.u64 q11, q11, #4 + veor q1, q1, q10 + veor q0, q0, q11 + sub r5,r5,#1 + b .Lenc_sbox +.align 4 +.Lenc_loop: + vldmia r4!, {q8,q9,q10,q11} + veor q8, q8, q0 + veor q9, q9, q1 + vtbl.8 d0, {q8}, d24 + vtbl.8 d1, {q8}, d25 + vldmia r4!, {q8} + veor q10, q10, q2 + vtbl.8 d2, {q9}, d24 + vtbl.8 d3, {q9}, d25 + vldmia r4!, {q9} + veor q11, q11, q3 + vtbl.8 d4, {q10}, d24 + vtbl.8 d5, {q10}, d25 + vldmia r4!, {q10} + vtbl.8 d6, {q11}, d24 + vtbl.8 d7, {q11}, d25 + vldmia r4!, {q11} + veor q8, q8, q4 + veor q9, q9, q5 + vtbl.8 d8, {q8}, d24 + vtbl.8 d9, {q8}, d25 + veor q10, q10, q6 + vtbl.8 d10, {q9}, d24 + vtbl.8 d11, {q9}, d25 + veor q11, q11, q7 + vtbl.8 d12, {q10}, d24 + vtbl.8 d13, {q10}, d25 + vtbl.8 d14, {q11}, d24 + vtbl.8 d15, {q11}, d25 +.Lenc_sbox: + veor q2, q2, q1 + veor q5, q5, q6 + veor q3, q3, q0 + veor q6, q6, q2 + veor q5, q5, q0 + + veor q6, q6, q3 + veor q3, q3, q7 + veor q7, q7, q5 + veor q3, q3, q4 + veor q4, q4, q5 + + veor q2, q2, q7 + veor q3, q3, q1 + veor q1, q1, q5 + veor q11, q7, q4 + veor q10, q1, q2 + veor q9, q5, q3 + veor q13, q2, q4 + vmov q8, q10 + veor q12, q6, q0 + + vorr q10, q10, q9 + veor q15, q11, q8 + vand q14, q11, q12 + vorr q11, q11, q12 + veor q12, q12, q9 + vand q8, q8, q9 + veor q9, q3, q0 + vand q15, q15, q12 + vand q13, q13, q9 + veor q9, q7, q1 + veor q12, q5, q6 + 
veor q11, q11, q13 + veor q10, q10, q13 + vand q13, q9, q12 + vorr q9, q9, q12 + veor q11, q11, q15 + veor q8, q8, q13 + veor q10, q10, q14 + veor q9, q9, q15 + veor q8, q8, q14 + vand q12, q2, q3 + veor q9, q9, q14 + vand q13, q4, q0 + vand q14, q1, q5 + vorr q15, q7, q6 + veor q11, q11, q12 + veor q9, q9, q14 + veor q8, q8, q15 + veor q10, q10, q13 + + @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3 + + @ new smaller inversion + + vand q14, q11, q9 + vmov q12, q8 + + veor q13, q10, q14 + veor q15, q8, q14 + veor q14, q8, q14 @ q14=q15 + + vbsl q13, q9, q8 + vbsl q15, q11, q10 + veor q11, q11, q10 + + vbsl q12, q13, q14 + vbsl q8, q14, q13 + + vand q14, q12, q15 + veor q9, q9, q8 + + veor q14, q14, q11 + veor q12, q6, q0 + veor q8, q5, q3 + veor q10, q15, q14 + vand q10, q10, q6 + veor q6, q6, q5 + vand q11, q5, q15 + vand q6, q6, q14 + veor q5, q11, q10 + veor q6, q6, q11 + veor q15, q15, q13 + veor q14, q14, q9 + veor q11, q15, q14 + veor q10, q13, q9 + vand q11, q11, q12 + vand q10, q10, q0 + veor q12, q12, q8 + veor q0, q0, q3 + vand q8, q8, q15 + vand q3, q3, q13 + vand q12, q12, q14 + vand q0, q0, q9 + veor q8, q8, q12 + veor q0, q0, q3 + veor q12, q12, q11 + veor q3, q3, q10 + veor q6, q6, q12 + veor q0, q0, q12 + veor q5, q5, q8 + veor q3, q3, q8 + + veor q12, q7, q4 + veor q8, q1, q2 + veor q11, q15, q14 + veor q10, q13, q9 + vand q11, q11, q12 + vand q10, q10, q4 + veor q12, q12, q8 + veor q4, q4, q2 + vand q8, q8, q15 + vand q2, q2, q13 + vand q12, q12, q14 + vand q4, q4, q9 + veor q8, q8, q12 + veor q4, q4, q2 + veor q12, q12, q11 + veor q2, q2, q10 + veor q15, q15, q13 + veor q14, q14, q9 + veor q10, q15, q14 + vand q10, q10, q7 + veor q7, q7, q1 + vand q11, q1, q15 + vand q7, q7, q14 + veor q1, q11, q10 + veor q7, q7, q11 + veor q7, q7, q12 + veor q4, q4, q12 + veor q1, q1, q8 + veor q2, q2, q8 + veor q7, q7, q0 + veor q1, q1, q6 + veor q6, q6, q0 + veor q4, q4, q7 + veor q0, q0, q1 + + veor q1, q1, q5 + veor q5, q5, q2 + veor q2, q2, q3 + veor q3, q3, q5 + 
veor q4, q4, q5 + + veor q6, q6, q3 + subs r5,r5,#1 + bcc .Lenc_done + vext.8 q8, q0, q0, #12 @ x0 <<< 32 + vext.8 q9, q1, q1, #12 + veor q0, q0, q8 @ x0 ^ (x0 <<< 32) + vext.8 q10, q4, q4, #12 + veor q1, q1, q9 + vext.8 q11, q6, q6, #12 + veor q4, q4, q10 + vext.8 q12, q3, q3, #12 + veor q6, q6, q11 + vext.8 q13, q7, q7, #12 + veor q3, q3, q12 + vext.8 q14, q2, q2, #12 + veor q7, q7, q13 + vext.8 q15, q5, q5, #12 + veor q2, q2, q14 + + veor q9, q9, q0 + veor q5, q5, q15 + vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64) + veor q10, q10, q1 + veor q8, q8, q5 + veor q9, q9, q5 + vext.8 q1, q1, q1, #8 + veor q13, q13, q3 + veor q0, q0, q8 + veor q14, q14, q7 + veor q1, q1, q9 + vext.8 q8, q3, q3, #8 + veor q12, q12, q6 + vext.8 q9, q7, q7, #8 + veor q15, q15, q2 + vext.8 q3, q6, q6, #8 + veor q11, q11, q4 + vext.8 q7, q5, q5, #8 + veor q12, q12, q5 + vext.8 q6, q2, q2, #8 + veor q11, q11, q5 + vext.8 q2, q4, q4, #8 + veor q5, q9, q13 + veor q4, q8, q12 + veor q3, q3, q11 + veor q7, q7, q15 + veor q6, q6, q14 + @ vmov q4, q8 + veor q2, q2, q10 + @ vmov q5, q9 + vldmia r6, {q12} @ .LSR + ite eq @ Thumb2 thing, samity check in ARM + addeq r6,r6,#0x10 + bne .Lenc_loop + vldmia r6, {q12} @ .LSRM0 + b .Lenc_loop +.align 4 +.Lenc_done: + vmov.i8 q8,#0x55 @ compose .LBS0 + vmov.i8 q9,#0x33 @ compose .LBS1 + vshr.u64 q10, q2, #1 + vshr.u64 q11, q3, #1 + veor q10, q10, q5 + veor q11, q11, q7 + vand q10, q10, q8 + vand q11, q11, q8 + veor q5, q5, q10 + vshl.u64 q10, q10, #1 + veor q7, q7, q11 + vshl.u64 q11, q11, #1 + veor q2, q2, q10 + veor q3, q3, q11 + vshr.u64 q10, q4, #1 + vshr.u64 q11, q0, #1 + veor q10, q10, q6 + veor q11, q11, q1 + vand q10, q10, q8 + vand q11, q11, q8 + veor q6, q6, q10 + vshl.u64 q10, q10, #1 + veor q1, q1, q11 + vshl.u64 q11, q11, #1 + veor q4, q4, q10 + veor q0, q0, q11 + vmov.i8 q8,#0x0f @ compose .LBS2 + vshr.u64 q10, q7, #2 + vshr.u64 q11, q3, #2 + veor q10, q10, q5 + veor q11, q11, q2 + vand q10, q10, q9 + vand q11, q11, q9 + veor q5, q5, q10 
+ vshl.u64 q10, q10, #2 + veor q2, q2, q11 + vshl.u64 q11, q11, #2 + veor q7, q7, q10 + veor q3, q3, q11 + vshr.u64 q10, q1, #2 + vshr.u64 q11, q0, #2 + veor q10, q10, q6 + veor q11, q11, q4 + vand q10, q10, q9 + vand q11, q11, q9 + veor q6, q6, q10 + vshl.u64 q10, q10, #2 + veor q4, q4, q11 + vshl.u64 q11, q11, #2 + veor q1, q1, q10 + veor q0, q0, q11 + vshr.u64 q10, q6, #4 + vshr.u64 q11, q4, #4 + veor q10, q10, q5 + veor q11, q11, q2 + vand q10, q10, q8 + vand q11, q11, q8 + veor q5, q5, q10 + vshl.u64 q10, q10, #4 + veor q2, q2, q11 + vshl.u64 q11, q11, #4 + veor q6, q6, q10 + veor q4, q4, q11 + vshr.u64 q10, q1, #4 + vshr.u64 q11, q0, #4 + veor q10, q10, q7 + veor q11, q11, q3 + vand q10, q10, q8 + vand q11, q11, q8 + veor q7, q7, q10 + vshl.u64 q10, q10, #4 + veor q3, q3, q11 + vshl.u64 q11, q11, #4 + veor q1, q1, q10 + veor q0, q0, q11 + vldmia r4, {q8} @ last round key + veor q4, q4, q8 + veor q6, q6, q8 + veor q3, q3, q8 + veor q7, q7, q8 + veor q2, q2, q8 + veor q5, q5, q8 + veor q0, q0, q8 + veor q1, q1, q8 + bx lr +.size _bsaes_encrypt8,.-_bsaes_encrypt8 +.type _bsaes_key_convert,%function +.align 4 +_bsaes_key_convert: + adr r6,. + vld1.8 {q7}, [r4]! @ load round 0 key +#if defined(__thumb2__) || defined(__APPLE__) + adr r6,.LM0 +#else + sub r6,r6,#_bsaes_key_convert-.LM0 +#endif + vld1.8 {q15}, [r4]! @ load round 1 key + + vmov.i8 q8, #0x01 @ bit masks + vmov.i8 q9, #0x02 + vmov.i8 q10, #0x04 + vmov.i8 q11, #0x08 + vmov.i8 q12, #0x10 + vmov.i8 q13, #0x20 + vldmia r6, {q14} @ .LM0 + +#ifdef __ARMEL__ + vrev32.8 q7, q7 + vrev32.8 q15, q15 +#endif + sub r5,r5,#1 + vstmia r12!, {q7} @ save round 0 key + b .Lkey_loop + +.align 4 +.Lkey_loop: + vtbl.8 d14,{q15},d28 + vtbl.8 d15,{q15},d29 + vmov.i8 q6, #0x40 + vmov.i8 q15, #0x80 + + vtst.8 q0, q7, q8 + vtst.8 q1, q7, q9 + vtst.8 q2, q7, q10 + vtst.8 q3, q7, q11 + vtst.8 q4, q7, q12 + vtst.8 q5, q7, q13 + vtst.8 q6, q7, q6 + vtst.8 q7, q7, q15 + vld1.8 {q15}, [r4]! 
@ load next round key + vmvn q0, q0 @ "pnot" + vmvn q1, q1 + vmvn q5, q5 + vmvn q6, q6 +#ifdef __ARMEL__ + vrev32.8 q15, q15 +#endif + subs r5,r5,#1 + vstmia r12!,{q0,q1,q2,q3,q4,q5,q6,q7} @ write bit-sliced round key + bne .Lkey_loop + + vmov.i8 q7,#0x63 @ compose .L63 + @ don't save last round key + bx lr +.size _bsaes_key_convert,.-_bsaes_key_convert +.globl bsaes_cbc_encrypt +.hidden bsaes_cbc_encrypt +.type bsaes_cbc_encrypt,%function +.align 5 +bsaes_cbc_encrypt: + @ In OpenSSL, this function had a fallback to aes_nohw_cbc_encrypt for + @ short inputs. We patch this out, using bsaes for all input sizes. + + @ it is up to the caller to make sure we are called with enc == 0 + + mov ip, sp + stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr} + VFP_ABI_PUSH + ldr r8, [ip] @ IV is 1st arg on the stack + mov r2, r2, lsr#4 @ len in 16 byte blocks + sub sp, #0x10 @ scratch space to carry over the IV + mov r9, sp @ save sp + + ldr r10, [r3, #240] @ get # of rounds +#ifndef BSAES_ASM_EXTENDED_KEY + @ allocate the key schedule on the stack + sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key + add r12, #96 @ sifze of bit-slices key schedule + + @ populate the key schedule + mov r4, r3 @ pass key + mov r5, r10 @ pass # of rounds + mov sp, r12 @ sp is sp + bl _bsaes_key_convert + vldmia sp, {q6} + vstmia r12, {q15} @ save last round key + veor q7, q7, q6 @ fix up round 0 key + vstmia sp, {q7} +#else + ldr r12, [r3, #244] + eors r12, #1 + beq 0f + + @ populate the key schedule + str r12, [r3, #244] + mov r4, r3 @ pass key + mov r5, r10 @ pass # of rounds + add r12, r3, #248 @ pass key schedule + bl _bsaes_key_convert + add r4, r3, #248 + vldmia r4, {q6} + vstmia r12, {q15} @ save last round key + veor q7, q7, q6 @ fix up round 0 key + vstmia r4, {q7} + +.align 2 + +#endif + + vld1.8 {q15}, [r8] @ load IV + b .Lcbc_dec_loop + +.align 4 +.Lcbc_dec_loop: + subs r2, r2, #0x8 + bmi .Lcbc_dec_loop_finish + + vld1.8 {q0,q1}, [r0]! @ load input + vld1.8 {q2,q3}, [r0]! 
+#ifndef BSAES_ASM_EXTENDED_KEY + mov r4, sp @ pass the key +#else + add r4, r3, #248 +#endif + vld1.8 {q4,q5}, [r0]! + mov r5, r10 + vld1.8 {q6,q7}, [r0] + sub r0, r0, #0x60 + vstmia r9, {q15} @ put aside IV + + bl _bsaes_decrypt8 + + vldmia r9, {q14} @ reload IV + vld1.8 {q8,q9}, [r0]! @ reload input + veor q0, q0, q14 @ ^= IV + vld1.8 {q10,q11}, [r0]! + veor q1, q1, q8 + veor q6, q6, q9 + vld1.8 {q12,q13}, [r0]! + veor q4, q4, q10 + veor q2, q2, q11 + vld1.8 {q14,q15}, [r0]! + veor q7, q7, q12 + vst1.8 {q0,q1}, [r1]! @ write output + veor q3, q3, q13 + vst1.8 {q6}, [r1]! + veor q5, q5, q14 + vst1.8 {q4}, [r1]! + vst1.8 {q2}, [r1]! + vst1.8 {q7}, [r1]! + vst1.8 {q3}, [r1]! + vst1.8 {q5}, [r1]! + + b .Lcbc_dec_loop + +.Lcbc_dec_loop_finish: + adds r2, r2, #8 + beq .Lcbc_dec_done + + @ Set up most parameters for the _bsaes_decrypt8 call. +#ifndef BSAES_ASM_EXTENDED_KEY + mov r4, sp @ pass the key +#else + add r4, r3, #248 +#endif + mov r5, r10 + vstmia r9, {q15} @ put aside IV + + vld1.8 {q0}, [r0]! @ load input + cmp r2, #2 + blo .Lcbc_dec_one + vld1.8 {q1}, [r0]! + beq .Lcbc_dec_two + vld1.8 {q2}, [r0]! + cmp r2, #4 + blo .Lcbc_dec_three + vld1.8 {q3}, [r0]! + beq .Lcbc_dec_four + vld1.8 {q4}, [r0]! + cmp r2, #6 + blo .Lcbc_dec_five + vld1.8 {q5}, [r0]! + beq .Lcbc_dec_six + vld1.8 {q6}, [r0]! + sub r0, r0, #0x70 + + bl _bsaes_decrypt8 + + vldmia r9, {q14} @ reload IV + vld1.8 {q8,q9}, [r0]! @ reload input + veor q0, q0, q14 @ ^= IV + vld1.8 {q10,q11}, [r0]! + veor q1, q1, q8 + veor q6, q6, q9 + vld1.8 {q12,q13}, [r0]! + veor q4, q4, q10 + veor q2, q2, q11 + vld1.8 {q15}, [r0]! + veor q7, q7, q12 + vst1.8 {q0,q1}, [r1]! @ write output + veor q3, q3, q13 + vst1.8 {q6}, [r1]! + vst1.8 {q4}, [r1]! + vst1.8 {q2}, [r1]! + vst1.8 {q7}, [r1]! + vst1.8 {q3}, [r1]! + b .Lcbc_dec_done +.align 4 +.Lcbc_dec_six: + sub r0, r0, #0x60 + bl _bsaes_decrypt8 + vldmia r9,{q14} @ reload IV + vld1.8 {q8,q9}, [r0]! @ reload input + veor q0, q0, q14 @ ^= IV + vld1.8 {q10,q11}, [r0]! 
+ veor q1, q1, q8 + veor q6, q6, q9 + vld1.8 {q12}, [r0]! + veor q4, q4, q10 + veor q2, q2, q11 + vld1.8 {q15}, [r0]! + veor q7, q7, q12 + vst1.8 {q0,q1}, [r1]! @ write output + vst1.8 {q6}, [r1]! + vst1.8 {q4}, [r1]! + vst1.8 {q2}, [r1]! + vst1.8 {q7}, [r1]! + b .Lcbc_dec_done +.align 4 +.Lcbc_dec_five: + sub r0, r0, #0x50 + bl _bsaes_decrypt8 + vldmia r9, {q14} @ reload IV + vld1.8 {q8,q9}, [r0]! @ reload input + veor q0, q0, q14 @ ^= IV + vld1.8 {q10,q11}, [r0]! + veor q1, q1, q8 + veor q6, q6, q9 + vld1.8 {q15}, [r0]! + veor q4, q4, q10 + vst1.8 {q0,q1}, [r1]! @ write output + veor q2, q2, q11 + vst1.8 {q6}, [r1]! + vst1.8 {q4}, [r1]! + vst1.8 {q2}, [r1]! + b .Lcbc_dec_done +.align 4 +.Lcbc_dec_four: + sub r0, r0, #0x40 + bl _bsaes_decrypt8 + vldmia r9, {q14} @ reload IV + vld1.8 {q8,q9}, [r0]! @ reload input + veor q0, q0, q14 @ ^= IV + vld1.8 {q10}, [r0]! + veor q1, q1, q8 + veor q6, q6, q9 + vld1.8 {q15}, [r0]! + veor q4, q4, q10 + vst1.8 {q0,q1}, [r1]! @ write output + vst1.8 {q6}, [r1]! + vst1.8 {q4}, [r1]! + b .Lcbc_dec_done +.align 4 +.Lcbc_dec_three: + sub r0, r0, #0x30 + bl _bsaes_decrypt8 + vldmia r9, {q14} @ reload IV + vld1.8 {q8,q9}, [r0]! @ reload input + veor q0, q0, q14 @ ^= IV + vld1.8 {q15}, [r0]! + veor q1, q1, q8 + veor q6, q6, q9 + vst1.8 {q0,q1}, [r1]! @ write output + vst1.8 {q6}, [r1]! + b .Lcbc_dec_done +.align 4 +.Lcbc_dec_two: + sub r0, r0, #0x20 + bl _bsaes_decrypt8 + vldmia r9, {q14} @ reload IV + vld1.8 {q8}, [r0]! @ reload input + veor q0, q0, q14 @ ^= IV + vld1.8 {q15}, [r0]! @ reload input + veor q1, q1, q8 + vst1.8 {q0,q1}, [r1]! @ write output + b .Lcbc_dec_done +.align 4 +.Lcbc_dec_one: + sub r0, r0, #0x10 + bl _bsaes_decrypt8 + vldmia r9, {q14} @ reload IV + vld1.8 {q15}, [r0]! @ reload input + veor q0, q0, q14 @ ^= IV + vst1.8 {q0}, [r1]! 
@ write output + +.Lcbc_dec_done: +#ifndef BSAES_ASM_EXTENDED_KEY + vmov.i32 q0, #0 + vmov.i32 q1, #0 +.Lcbc_dec_bzero:@ wipe key schedule [if any] + vstmia sp!, {q0,q1} + cmp sp, r9 + bne .Lcbc_dec_bzero +#endif + + mov sp, r9 + add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb + vst1.8 {q15}, [r8] @ return IV + VFP_ABI_POP + ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} +.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt +.globl bsaes_ctr32_encrypt_blocks +.hidden bsaes_ctr32_encrypt_blocks +.type bsaes_ctr32_encrypt_blocks,%function +.align 5 +bsaes_ctr32_encrypt_blocks: + @ In OpenSSL, short inputs fall back to aes_nohw_* here. We patch this + @ out to retain a constant-time implementation. + mov ip, sp + stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr} + VFP_ABI_PUSH + ldr r8, [ip] @ ctr is 1st arg on the stack + sub sp, sp, #0x10 @ scratch space to carry over the ctr + mov r9, sp @ save sp + + ldr r10, [r3, #240] @ get # of rounds +#ifndef BSAES_ASM_EXTENDED_KEY + @ allocate the key schedule on the stack + sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key + add r12, #96 @ size of bit-sliced key schedule + + @ populate the key schedule + mov r4, r3 @ pass key + mov r5, r10 @ pass # of rounds + mov sp, r12 @ sp is sp + bl _bsaes_key_convert + veor q7,q7,q15 @ fix up last round key + vstmia r12, {q7} @ save last round key + + vld1.8 {q0}, [r8] @ load counter +#ifdef __APPLE__ + mov r8, #:lower16:(.LREVM0SR-.LM0) + add r8, r6, r8 +#else + add r8, r6, #.LREVM0SR-.LM0 @ borrow r8 +#endif + vldmia sp, {q4} @ load round0 key +#else + ldr r12, [r3, #244] + eors r12, #1 + beq 0f + + @ populate the key schedule + str r12, [r3, #244] + mov r4, r3 @ pass key + mov r5, r10 @ pass # of rounds + add r12, r3, #248 @ pass key schedule + bl _bsaes_key_convert + veor q7,q7,q15 @ fix up last round key + vstmia r12, {q7} @ save last round key + +.align 2 + add r12, r3, #248 + vld1.8 {q0}, [r8] @ load counter + adrl r8, .LREVM0SR @ borrow r8 + vldmia r12, {q4} @ load round0 key + sub sp, #0x10 @ 
place for adjusted round0 key +#endif + + vmov.i32 q8,#1 @ compose 1<<96 + veor q9,q9,q9 + vrev32.8 q0,q0 + vext.8 q8,q9,q8,#4 + vrev32.8 q4,q4 + vadd.u32 q9,q8,q8 @ compose 2<<96 + vstmia sp, {q4} @ save adjusted round0 key + b .Lctr_enc_loop + +.align 4 +.Lctr_enc_loop: + vadd.u32 q10, q8, q9 @ compose 3<<96 + vadd.u32 q1, q0, q8 @ +1 + vadd.u32 q2, q0, q9 @ +2 + vadd.u32 q3, q0, q10 @ +3 + vadd.u32 q4, q1, q10 + vadd.u32 q5, q2, q10 + vadd.u32 q6, q3, q10 + vadd.u32 q7, q4, q10 + vadd.u32 q10, q5, q10 @ next counter + + @ Borrow prologue from _bsaes_encrypt8 to use the opportunity + @ to flip byte order in 32-bit counter + + vldmia sp, {q9} @ load round0 key +#ifndef BSAES_ASM_EXTENDED_KEY + add r4, sp, #0x10 @ pass next round key +#else + add r4, r3, #264 +#endif + vldmia r8, {q8} @ .LREVM0SR + mov r5, r10 @ pass rounds + vstmia r9, {q10} @ save next counter +#ifdef __APPLE__ + mov r6, #:lower16:(.LREVM0SR-.LSR) + sub r6, r8, r6 +#else + sub r6, r8, #.LREVM0SR-.LSR @ pass constants +#endif + + bl _bsaes_encrypt8_alt + + subs r2, r2, #8 + blo .Lctr_enc_loop_done + + vld1.8 {q8,q9}, [r0]! @ load input + vld1.8 {q10,q11}, [r0]! + veor q0, q8 + veor q1, q9 + vld1.8 {q12,q13}, [r0]! + veor q4, q10 + veor q6, q11 + vld1.8 {q14,q15}, [r0]! + veor q3, q12 + vst1.8 {q0,q1}, [r1]! @ write output + veor q7, q13 + veor q2, q14 + vst1.8 {q4}, [r1]! + veor q5, q15 + vst1.8 {q6}, [r1]! + vmov.i32 q8, #1 @ compose 1<<96 + vst1.8 {q3}, [r1]! + veor q9, q9, q9 + vst1.8 {q7}, [r1]! + vext.8 q8, q9, q8, #4 + vst1.8 {q2}, [r1]! + vadd.u32 q9,q8,q8 @ compose 2<<96 + vst1.8 {q5}, [r1]! + vldmia r9, {q0} @ load counter + + bne .Lctr_enc_loop + b .Lctr_enc_done + +.align 4 +.Lctr_enc_loop_done: + add r2, r2, #8 + vld1.8 {q8}, [r0]! @ load input + veor q0, q8 + vst1.8 {q0}, [r1]! @ write output + cmp r2, #2 + blo .Lctr_enc_done + vld1.8 {q9}, [r0]! + veor q1, q9 + vst1.8 {q1}, [r1]! + beq .Lctr_enc_done + vld1.8 {q10}, [r0]! + veor q4, q10 + vst1.8 {q4}, [r1]! 
+ cmp r2, #4 + blo .Lctr_enc_done + vld1.8 {q11}, [r0]! + veor q6, q11 + vst1.8 {q6}, [r1]! + beq .Lctr_enc_done + vld1.8 {q12}, [r0]! + veor q3, q12 + vst1.8 {q3}, [r1]! + cmp r2, #6 + blo .Lctr_enc_done + vld1.8 {q13}, [r0]! + veor q7, q13 + vst1.8 {q7}, [r1]! + beq .Lctr_enc_done + vld1.8 {q14}, [r0] + veor q2, q14 + vst1.8 {q2}, [r1]! + +.Lctr_enc_done: + vmov.i32 q0, #0 + vmov.i32 q1, #0 +#ifndef BSAES_ASM_EXTENDED_KEY +.Lctr_enc_bzero:@ wipe key schedule [if any] + vstmia sp!, {q0,q1} + cmp sp, r9 + bne .Lctr_enc_bzero +#else + vstmia sp, {q0,q1} +#endif + + mov sp, r9 + add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb + VFP_ABI_POP + ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} @ return + + @ OpenSSL contains aes_nohw_* fallback code here. We patch this + @ out to retain a constant-time implementation. +.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) diff --git a/third_party/boringssl/gen/bcm/co-586-apple.S b/third_party/boringssl/gen/bcm/co-586-apple.S new file mode 100644 index 00000000..ab985eec --- /dev/null +++ b/third_party/boringssl/gen/bcm/co-586-apple.S @@ -0,0 +1,1256 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +.text +.globl _bn_mul_comba8 +.private_extern _bn_mul_comba8 +.align 4 +_bn_mul_comba8: +L_bn_mul_comba8_begin: + pushl %esi + movl 12(%esp),%esi + pushl %edi + movl 20(%esp),%edi + pushl %ebp + pushl %ebx + xorl %ebx,%ebx + movl (%esi),%eax + xorl %ecx,%ecx + movl (%edi),%edx + # ################## Calculate word 0 + xorl %ebp,%ebp + # mul a[0]*b[0] + mull %edx + addl %eax,%ebx + movl 20(%esp),%eax + adcl %edx,%ecx + movl (%edi),%edx + adcl $0,%ebp + movl %ebx,(%eax) + movl 4(%esi),%eax + # saved r[0] + # ################## Calculate word 1 + xorl %ebx,%ebx + # mul a[1]*b[0] + mull %edx + addl %eax,%ecx + movl (%esi),%eax + adcl %edx,%ebp + movl 4(%edi),%edx + adcl $0,%ebx + # mul a[0]*b[1] + mull %edx + addl %eax,%ecx + movl 20(%esp),%eax + adcl %edx,%ebp + movl (%edi),%edx + adcl $0,%ebx + movl %ecx,4(%eax) + movl 8(%esi),%eax + # saved r[1] + # ################## Calculate word 2 + xorl %ecx,%ecx + # mul a[2]*b[0] + mull %edx + addl %eax,%ebp + movl 4(%esi),%eax + adcl %edx,%ebx + movl 4(%edi),%edx + adcl $0,%ecx + # mul a[1]*b[1] + mull %edx + addl %eax,%ebp + movl (%esi),%eax + adcl %edx,%ebx + movl 8(%edi),%edx + adcl $0,%ecx + # mul a[0]*b[2] + mull %edx + addl %eax,%ebp + movl 20(%esp),%eax + adcl %edx,%ebx + movl (%edi),%edx + adcl $0,%ecx + movl %ebp,8(%eax) + movl 12(%esi),%eax + # saved r[2] + # ################## Calculate word 3 + xorl %ebp,%ebp + # mul a[3]*b[0] + mull %edx + addl %eax,%ebx + movl 8(%esi),%eax + adcl %edx,%ecx + movl 4(%edi),%edx + adcl $0,%ebp + # mul a[2]*b[1] + mull %edx + addl %eax,%ebx + movl 4(%esi),%eax + adcl %edx,%ecx + movl 8(%edi),%edx + adcl $0,%ebp + # mul a[1]*b[2] + mull %edx + addl %eax,%ebx + movl (%esi),%eax + adcl %edx,%ecx + movl 12(%edi),%edx + adcl $0,%ebp + # mul a[0]*b[3] + mull %edx + addl %eax,%ebx + movl 20(%esp),%eax + adcl %edx,%ecx + movl (%edi),%edx + adcl $0,%ebp + movl %ebx,12(%eax) + movl 16(%esi),%eax + # 
saved r[3] + # ################## Calculate word 4 + xorl %ebx,%ebx + # mul a[4]*b[0] + mull %edx + addl %eax,%ecx + movl 12(%esi),%eax + adcl %edx,%ebp + movl 4(%edi),%edx + adcl $0,%ebx + # mul a[3]*b[1] + mull %edx + addl %eax,%ecx + movl 8(%esi),%eax + adcl %edx,%ebp + movl 8(%edi),%edx + adcl $0,%ebx + # mul a[2]*b[2] + mull %edx + addl %eax,%ecx + movl 4(%esi),%eax + adcl %edx,%ebp + movl 12(%edi),%edx + adcl $0,%ebx + # mul a[1]*b[3] + mull %edx + addl %eax,%ecx + movl (%esi),%eax + adcl %edx,%ebp + movl 16(%edi),%edx + adcl $0,%ebx + # mul a[0]*b[4] + mull %edx + addl %eax,%ecx + movl 20(%esp),%eax + adcl %edx,%ebp + movl (%edi),%edx + adcl $0,%ebx + movl %ecx,16(%eax) + movl 20(%esi),%eax + # saved r[4] + # ################## Calculate word 5 + xorl %ecx,%ecx + # mul a[5]*b[0] + mull %edx + addl %eax,%ebp + movl 16(%esi),%eax + adcl %edx,%ebx + movl 4(%edi),%edx + adcl $0,%ecx + # mul a[4]*b[1] + mull %edx + addl %eax,%ebp + movl 12(%esi),%eax + adcl %edx,%ebx + movl 8(%edi),%edx + adcl $0,%ecx + # mul a[3]*b[2] + mull %edx + addl %eax,%ebp + movl 8(%esi),%eax + adcl %edx,%ebx + movl 12(%edi),%edx + adcl $0,%ecx + # mul a[2]*b[3] + mull %edx + addl %eax,%ebp + movl 4(%esi),%eax + adcl %edx,%ebx + movl 16(%edi),%edx + adcl $0,%ecx + # mul a[1]*b[4] + mull %edx + addl %eax,%ebp + movl (%esi),%eax + adcl %edx,%ebx + movl 20(%edi),%edx + adcl $0,%ecx + # mul a[0]*b[5] + mull %edx + addl %eax,%ebp + movl 20(%esp),%eax + adcl %edx,%ebx + movl (%edi),%edx + adcl $0,%ecx + movl %ebp,20(%eax) + movl 24(%esi),%eax + # saved r[5] + # ################## Calculate word 6 + xorl %ebp,%ebp + # mul a[6]*b[0] + mull %edx + addl %eax,%ebx + movl 20(%esi),%eax + adcl %edx,%ecx + movl 4(%edi),%edx + adcl $0,%ebp + # mul a[5]*b[1] + mull %edx + addl %eax,%ebx + movl 16(%esi),%eax + adcl %edx,%ecx + movl 8(%edi),%edx + adcl $0,%ebp + # mul a[4]*b[2] + mull %edx + addl %eax,%ebx + movl 12(%esi),%eax + adcl %edx,%ecx + movl 12(%edi),%edx + adcl $0,%ebp + # mul a[3]*b[3] + mull 
%edx + addl %eax,%ebx + movl 8(%esi),%eax + adcl %edx,%ecx + movl 16(%edi),%edx + adcl $0,%ebp + # mul a[2]*b[4] + mull %edx + addl %eax,%ebx + movl 4(%esi),%eax + adcl %edx,%ecx + movl 20(%edi),%edx + adcl $0,%ebp + # mul a[1]*b[5] + mull %edx + addl %eax,%ebx + movl (%esi),%eax + adcl %edx,%ecx + movl 24(%edi),%edx + adcl $0,%ebp + # mul a[0]*b[6] + mull %edx + addl %eax,%ebx + movl 20(%esp),%eax + adcl %edx,%ecx + movl (%edi),%edx + adcl $0,%ebp + movl %ebx,24(%eax) + movl 28(%esi),%eax + # saved r[6] + # ################## Calculate word 7 + xorl %ebx,%ebx + # mul a[7]*b[0] + mull %edx + addl %eax,%ecx + movl 24(%esi),%eax + adcl %edx,%ebp + movl 4(%edi),%edx + adcl $0,%ebx + # mul a[6]*b[1] + mull %edx + addl %eax,%ecx + movl 20(%esi),%eax + adcl %edx,%ebp + movl 8(%edi),%edx + adcl $0,%ebx + # mul a[5]*b[2] + mull %edx + addl %eax,%ecx + movl 16(%esi),%eax + adcl %edx,%ebp + movl 12(%edi),%edx + adcl $0,%ebx + # mul a[4]*b[3] + mull %edx + addl %eax,%ecx + movl 12(%esi),%eax + adcl %edx,%ebp + movl 16(%edi),%edx + adcl $0,%ebx + # mul a[3]*b[4] + mull %edx + addl %eax,%ecx + movl 8(%esi),%eax + adcl %edx,%ebp + movl 20(%edi),%edx + adcl $0,%ebx + # mul a[2]*b[5] + mull %edx + addl %eax,%ecx + movl 4(%esi),%eax + adcl %edx,%ebp + movl 24(%edi),%edx + adcl $0,%ebx + # mul a[1]*b[6] + mull %edx + addl %eax,%ecx + movl (%esi),%eax + adcl %edx,%ebp + movl 28(%edi),%edx + adcl $0,%ebx + # mul a[0]*b[7] + mull %edx + addl %eax,%ecx + movl 20(%esp),%eax + adcl %edx,%ebp + movl 4(%edi),%edx + adcl $0,%ebx + movl %ecx,28(%eax) + movl 28(%esi),%eax + # saved r[7] + # ################## Calculate word 8 + xorl %ecx,%ecx + # mul a[7]*b[1] + mull %edx + addl %eax,%ebp + movl 24(%esi),%eax + adcl %edx,%ebx + movl 8(%edi),%edx + adcl $0,%ecx + # mul a[6]*b[2] + mull %edx + addl %eax,%ebp + movl 20(%esi),%eax + adcl %edx,%ebx + movl 12(%edi),%edx + adcl $0,%ecx + # mul a[5]*b[3] + mull %edx + addl %eax,%ebp + movl 16(%esi),%eax + adcl %edx,%ebx + movl 16(%edi),%edx + adcl 
$0,%ecx + # mul a[4]*b[4] + mull %edx + addl %eax,%ebp + movl 12(%esi),%eax + adcl %edx,%ebx + movl 20(%edi),%edx + adcl $0,%ecx + # mul a[3]*b[5] + mull %edx + addl %eax,%ebp + movl 8(%esi),%eax + adcl %edx,%ebx + movl 24(%edi),%edx + adcl $0,%ecx + # mul a[2]*b[6] + mull %edx + addl %eax,%ebp + movl 4(%esi),%eax + adcl %edx,%ebx + movl 28(%edi),%edx + adcl $0,%ecx + # mul a[1]*b[7] + mull %edx + addl %eax,%ebp + movl 20(%esp),%eax + adcl %edx,%ebx + movl 8(%edi),%edx + adcl $0,%ecx + movl %ebp,32(%eax) + movl 28(%esi),%eax + # saved r[8] + # ################## Calculate word 9 + xorl %ebp,%ebp + # mul a[7]*b[2] + mull %edx + addl %eax,%ebx + movl 24(%esi),%eax + adcl %edx,%ecx + movl 12(%edi),%edx + adcl $0,%ebp + # mul a[6]*b[3] + mull %edx + addl %eax,%ebx + movl 20(%esi),%eax + adcl %edx,%ecx + movl 16(%edi),%edx + adcl $0,%ebp + # mul a[5]*b[4] + mull %edx + addl %eax,%ebx + movl 16(%esi),%eax + adcl %edx,%ecx + movl 20(%edi),%edx + adcl $0,%ebp + # mul a[4]*b[5] + mull %edx + addl %eax,%ebx + movl 12(%esi),%eax + adcl %edx,%ecx + movl 24(%edi),%edx + adcl $0,%ebp + # mul a[3]*b[6] + mull %edx + addl %eax,%ebx + movl 8(%esi),%eax + adcl %edx,%ecx + movl 28(%edi),%edx + adcl $0,%ebp + # mul a[2]*b[7] + mull %edx + addl %eax,%ebx + movl 20(%esp),%eax + adcl %edx,%ecx + movl 12(%edi),%edx + adcl $0,%ebp + movl %ebx,36(%eax) + movl 28(%esi),%eax + # saved r[9] + # ################## Calculate word 10 + xorl %ebx,%ebx + # mul a[7]*b[3] + mull %edx + addl %eax,%ecx + movl 24(%esi),%eax + adcl %edx,%ebp + movl 16(%edi),%edx + adcl $0,%ebx + # mul a[6]*b[4] + mull %edx + addl %eax,%ecx + movl 20(%esi),%eax + adcl %edx,%ebp + movl 20(%edi),%edx + adcl $0,%ebx + # mul a[5]*b[5] + mull %edx + addl %eax,%ecx + movl 16(%esi),%eax + adcl %edx,%ebp + movl 24(%edi),%edx + adcl $0,%ebx + # mul a[4]*b[6] + mull %edx + addl %eax,%ecx + movl 12(%esi),%eax + adcl %edx,%ebp + movl 28(%edi),%edx + adcl $0,%ebx + # mul a[3]*b[7] + mull %edx + addl %eax,%ecx + movl 20(%esp),%eax + 
adcl %edx,%ebp + movl 16(%edi),%edx + adcl $0,%ebx + movl %ecx,40(%eax) + movl 28(%esi),%eax + # saved r[10] + # ################## Calculate word 11 + xorl %ecx,%ecx + # mul a[7]*b[4] + mull %edx + addl %eax,%ebp + movl 24(%esi),%eax + adcl %edx,%ebx + movl 20(%edi),%edx + adcl $0,%ecx + # mul a[6]*b[5] + mull %edx + addl %eax,%ebp + movl 20(%esi),%eax + adcl %edx,%ebx + movl 24(%edi),%edx + adcl $0,%ecx + # mul a[5]*b[6] + mull %edx + addl %eax,%ebp + movl 16(%esi),%eax + adcl %edx,%ebx + movl 28(%edi),%edx + adcl $0,%ecx + # mul a[4]*b[7] + mull %edx + addl %eax,%ebp + movl 20(%esp),%eax + adcl %edx,%ebx + movl 20(%edi),%edx + adcl $0,%ecx + movl %ebp,44(%eax) + movl 28(%esi),%eax + # saved r[11] + # ################## Calculate word 12 + xorl %ebp,%ebp + # mul a[7]*b[5] + mull %edx + addl %eax,%ebx + movl 24(%esi),%eax + adcl %edx,%ecx + movl 24(%edi),%edx + adcl $0,%ebp + # mul a[6]*b[6] + mull %edx + addl %eax,%ebx + movl 20(%esi),%eax + adcl %edx,%ecx + movl 28(%edi),%edx + adcl $0,%ebp + # mul a[5]*b[7] + mull %edx + addl %eax,%ebx + movl 20(%esp),%eax + adcl %edx,%ecx + movl 24(%edi),%edx + adcl $0,%ebp + movl %ebx,48(%eax) + movl 28(%esi),%eax + # saved r[12] + # ################## Calculate word 13 + xorl %ebx,%ebx + # mul a[7]*b[6] + mull %edx + addl %eax,%ecx + movl 24(%esi),%eax + adcl %edx,%ebp + movl 28(%edi),%edx + adcl $0,%ebx + # mul a[6]*b[7] + mull %edx + addl %eax,%ecx + movl 20(%esp),%eax + adcl %edx,%ebp + movl 28(%edi),%edx + adcl $0,%ebx + movl %ecx,52(%eax) + movl 28(%esi),%eax + # saved r[13] + # ################## Calculate word 14 + xorl %ecx,%ecx + # mul a[7]*b[7] + mull %edx + addl %eax,%ebp + movl 20(%esp),%eax + adcl %edx,%ebx + adcl $0,%ecx + movl %ebp,56(%eax) + # saved r[14] + # save r[15] + movl %ebx,60(%eax) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.globl _bn_mul_comba4 +.private_extern _bn_mul_comba4 +.align 4 +_bn_mul_comba4: +L_bn_mul_comba4_begin: + pushl %esi + movl 12(%esp),%esi + pushl %edi + movl 
20(%esp),%edi + pushl %ebp + pushl %ebx + xorl %ebx,%ebx + movl (%esi),%eax + xorl %ecx,%ecx + movl (%edi),%edx + # ################## Calculate word 0 + xorl %ebp,%ebp + # mul a[0]*b[0] + mull %edx + addl %eax,%ebx + movl 20(%esp),%eax + adcl %edx,%ecx + movl (%edi),%edx + adcl $0,%ebp + movl %ebx,(%eax) + movl 4(%esi),%eax + # saved r[0] + # ################## Calculate word 1 + xorl %ebx,%ebx + # mul a[1]*b[0] + mull %edx + addl %eax,%ecx + movl (%esi),%eax + adcl %edx,%ebp + movl 4(%edi),%edx + adcl $0,%ebx + # mul a[0]*b[1] + mull %edx + addl %eax,%ecx + movl 20(%esp),%eax + adcl %edx,%ebp + movl (%edi),%edx + adcl $0,%ebx + movl %ecx,4(%eax) + movl 8(%esi),%eax + # saved r[1] + # ################## Calculate word 2 + xorl %ecx,%ecx + # mul a[2]*b[0] + mull %edx + addl %eax,%ebp + movl 4(%esi),%eax + adcl %edx,%ebx + movl 4(%edi),%edx + adcl $0,%ecx + # mul a[1]*b[1] + mull %edx + addl %eax,%ebp + movl (%esi),%eax + adcl %edx,%ebx + movl 8(%edi),%edx + adcl $0,%ecx + # mul a[0]*b[2] + mull %edx + addl %eax,%ebp + movl 20(%esp),%eax + adcl %edx,%ebx + movl (%edi),%edx + adcl $0,%ecx + movl %ebp,8(%eax) + movl 12(%esi),%eax + # saved r[2] + # ################## Calculate word 3 + xorl %ebp,%ebp + # mul a[3]*b[0] + mull %edx + addl %eax,%ebx + movl 8(%esi),%eax + adcl %edx,%ecx + movl 4(%edi),%edx + adcl $0,%ebp + # mul a[2]*b[1] + mull %edx + addl %eax,%ebx + movl 4(%esi),%eax + adcl %edx,%ecx + movl 8(%edi),%edx + adcl $0,%ebp + # mul a[1]*b[2] + mull %edx + addl %eax,%ebx + movl (%esi),%eax + adcl %edx,%ecx + movl 12(%edi),%edx + adcl $0,%ebp + # mul a[0]*b[3] + mull %edx + addl %eax,%ebx + movl 20(%esp),%eax + adcl %edx,%ecx + movl 4(%edi),%edx + adcl $0,%ebp + movl %ebx,12(%eax) + movl 12(%esi),%eax + # saved r[3] + # ################## Calculate word 4 + xorl %ebx,%ebx + # mul a[3]*b[1] + mull %edx + addl %eax,%ecx + movl 8(%esi),%eax + adcl %edx,%ebp + movl 8(%edi),%edx + adcl $0,%ebx + # mul a[2]*b[2] + mull %edx + addl %eax,%ecx + movl 4(%esi),%eax + 
adcl %edx,%ebp + movl 12(%edi),%edx + adcl $0,%ebx + # mul a[1]*b[3] + mull %edx + addl %eax,%ecx + movl 20(%esp),%eax + adcl %edx,%ebp + movl 8(%edi),%edx + adcl $0,%ebx + movl %ecx,16(%eax) + movl 12(%esi),%eax + # saved r[4] + # ################## Calculate word 5 + xorl %ecx,%ecx + # mul a[3]*b[2] + mull %edx + addl %eax,%ebp + movl 8(%esi),%eax + adcl %edx,%ebx + movl 12(%edi),%edx + adcl $0,%ecx + # mul a[2]*b[3] + mull %edx + addl %eax,%ebp + movl 20(%esp),%eax + adcl %edx,%ebx + movl 12(%edi),%edx + adcl $0,%ecx + movl %ebp,20(%eax) + movl 12(%esi),%eax + # saved r[5] + # ################## Calculate word 6 + xorl %ebp,%ebp + # mul a[3]*b[3] + mull %edx + addl %eax,%ebx + movl 20(%esp),%eax + adcl %edx,%ecx + adcl $0,%ebp + movl %ebx,24(%eax) + # saved r[6] + # save r[7] + movl %ecx,28(%eax) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.globl _bn_sqr_comba8 +.private_extern _bn_sqr_comba8 +.align 4 +_bn_sqr_comba8: +L_bn_sqr_comba8_begin: + pushl %esi + pushl %edi + pushl %ebp + pushl %ebx + movl 20(%esp),%edi + movl 24(%esp),%esi + xorl %ebx,%ebx + xorl %ecx,%ecx + movl (%esi),%eax + # ############### Calculate word 0 + xorl %ebp,%ebp + # sqr a[0]*a[0] + mull %eax + addl %eax,%ebx + adcl %edx,%ecx + movl (%esi),%edx + adcl $0,%ebp + movl %ebx,(%edi) + movl 4(%esi),%eax + # saved r[0] + # ############### Calculate word 1 + xorl %ebx,%ebx + # sqr a[1]*a[0] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 8(%esi),%eax + adcl $0,%ebx + movl %ecx,4(%edi) + movl (%esi),%edx + # saved r[1] + # ############### Calculate word 2 + xorl %ecx,%ecx + # sqr a[2]*a[0] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 4(%esi),%eax + adcl $0,%ecx + # sqr a[1]*a[1] + mull %eax + addl %eax,%ebp + adcl %edx,%ebx + movl (%esi),%edx + adcl $0,%ecx + movl %ebp,8(%edi) + movl 12(%esi),%eax + # saved r[2] + # ############### Calculate word 3 + xorl %ebp,%ebp + # sqr 
a[3]*a[0] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 8(%esi),%eax + adcl $0,%ebp + movl 4(%esi),%edx + # sqr a[2]*a[1] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 16(%esi),%eax + adcl $0,%ebp + movl %ebx,12(%edi) + movl (%esi),%edx + # saved r[3] + # ############### Calculate word 4 + xorl %ebx,%ebx + # sqr a[4]*a[0] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 12(%esi),%eax + adcl $0,%ebx + movl 4(%esi),%edx + # sqr a[3]*a[1] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 8(%esi),%eax + adcl $0,%ebx + # sqr a[2]*a[2] + mull %eax + addl %eax,%ecx + adcl %edx,%ebp + movl (%esi),%edx + adcl $0,%ebx + movl %ecx,16(%edi) + movl 20(%esi),%eax + # saved r[4] + # ############### Calculate word 5 + xorl %ecx,%ecx + # sqr a[5]*a[0] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 16(%esi),%eax + adcl $0,%ecx + movl 4(%esi),%edx + # sqr a[4]*a[1] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 12(%esi),%eax + adcl $0,%ecx + movl 8(%esi),%edx + # sqr a[3]*a[2] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 24(%esi),%eax + adcl $0,%ecx + movl %ebp,20(%edi) + movl (%esi),%edx + # saved r[5] + # ############### Calculate word 6 + xorl %ebp,%ebp + # sqr a[6]*a[0] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 20(%esi),%eax + adcl $0,%ebp + movl 4(%esi),%edx + # sqr a[5]*a[1] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 16(%esi),%eax + adcl $0,%ebp + movl 8(%esi),%edx + # sqr a[4]*a[2] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 
12(%esi),%eax + adcl $0,%ebp + # sqr a[3]*a[3] + mull %eax + addl %eax,%ebx + adcl %edx,%ecx + movl (%esi),%edx + adcl $0,%ebp + movl %ebx,24(%edi) + movl 28(%esi),%eax + # saved r[6] + # ############### Calculate word 7 + xorl %ebx,%ebx + # sqr a[7]*a[0] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 24(%esi),%eax + adcl $0,%ebx + movl 4(%esi),%edx + # sqr a[6]*a[1] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 20(%esi),%eax + adcl $0,%ebx + movl 8(%esi),%edx + # sqr a[5]*a[2] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 16(%esi),%eax + adcl $0,%ebx + movl 12(%esi),%edx + # sqr a[4]*a[3] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 28(%esi),%eax + adcl $0,%ebx + movl %ecx,28(%edi) + movl 4(%esi),%edx + # saved r[7] + # ############### Calculate word 8 + xorl %ecx,%ecx + # sqr a[7]*a[1] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 24(%esi),%eax + adcl $0,%ecx + movl 8(%esi),%edx + # sqr a[6]*a[2] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 20(%esi),%eax + adcl $0,%ecx + movl 12(%esi),%edx + # sqr a[5]*a[3] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 16(%esi),%eax + adcl $0,%ecx + # sqr a[4]*a[4] + mull %eax + addl %eax,%ebp + adcl %edx,%ebx + movl 8(%esi),%edx + adcl $0,%ecx + movl %ebp,32(%edi) + movl 28(%esi),%eax + # saved r[8] + # ############### Calculate word 9 + xorl %ebp,%ebp + # sqr a[7]*a[2] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 24(%esi),%eax + adcl $0,%ebp + movl 12(%esi),%edx + # sqr a[6]*a[3] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 
20(%esi),%eax + adcl $0,%ebp + movl 16(%esi),%edx + # sqr a[5]*a[4] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 28(%esi),%eax + adcl $0,%ebp + movl %ebx,36(%edi) + movl 12(%esi),%edx + # saved r[9] + # ############### Calculate word 10 + xorl %ebx,%ebx + # sqr a[7]*a[3] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 24(%esi),%eax + adcl $0,%ebx + movl 16(%esi),%edx + # sqr a[6]*a[4] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 20(%esi),%eax + adcl $0,%ebx + # sqr a[5]*a[5] + mull %eax + addl %eax,%ecx + adcl %edx,%ebp + movl 16(%esi),%edx + adcl $0,%ebx + movl %ecx,40(%edi) + movl 28(%esi),%eax + # saved r[10] + # ############### Calculate word 11 + xorl %ecx,%ecx + # sqr a[7]*a[4] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 24(%esi),%eax + adcl $0,%ecx + movl 20(%esi),%edx + # sqr a[6]*a[5] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 28(%esi),%eax + adcl $0,%ecx + movl %ebp,44(%edi) + movl 20(%esi),%edx + # saved r[11] + # ############### Calculate word 12 + xorl %ebp,%ebp + # sqr a[7]*a[5] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 24(%esi),%eax + adcl $0,%ebp + # sqr a[6]*a[6] + mull %eax + addl %eax,%ebx + adcl %edx,%ecx + movl 24(%esi),%edx + adcl $0,%ebp + movl %ebx,48(%edi) + movl 28(%esi),%eax + # saved r[12] + # ############### Calculate word 13 + xorl %ebx,%ebx + # sqr a[7]*a[6] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 28(%esi),%eax + adcl $0,%ebx + movl %ecx,52(%edi) + # saved r[13] + # ############### Calculate word 14 + xorl %ecx,%ecx + # sqr a[7]*a[7] + mull %eax + addl %eax,%ebp + adcl %edx,%ebx + adcl $0,%ecx + movl %ebp,56(%edi) + # saved r[14] + 
movl %ebx,60(%edi) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.globl _bn_sqr_comba4 +.private_extern _bn_sqr_comba4 +.align 4 +_bn_sqr_comba4: +L_bn_sqr_comba4_begin: + pushl %esi + pushl %edi + pushl %ebp + pushl %ebx + movl 20(%esp),%edi + movl 24(%esp),%esi + xorl %ebx,%ebx + xorl %ecx,%ecx + movl (%esi),%eax + # ############### Calculate word 0 + xorl %ebp,%ebp + # sqr a[0]*a[0] + mull %eax + addl %eax,%ebx + adcl %edx,%ecx + movl (%esi),%edx + adcl $0,%ebp + movl %ebx,(%edi) + movl 4(%esi),%eax + # saved r[0] + # ############### Calculate word 1 + xorl %ebx,%ebx + # sqr a[1]*a[0] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 8(%esi),%eax + adcl $0,%ebx + movl %ecx,4(%edi) + movl (%esi),%edx + # saved r[1] + # ############### Calculate word 2 + xorl %ecx,%ecx + # sqr a[2]*a[0] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 4(%esi),%eax + adcl $0,%ecx + # sqr a[1]*a[1] + mull %eax + addl %eax,%ebp + adcl %edx,%ebx + movl (%esi),%edx + adcl $0,%ecx + movl %ebp,8(%edi) + movl 12(%esi),%eax + # saved r[2] + # ############### Calculate word 3 + xorl %ebp,%ebp + # sqr a[3]*a[0] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 8(%esi),%eax + adcl $0,%ebp + movl 4(%esi),%edx + # sqr a[2]*a[1] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 12(%esi),%eax + adcl $0,%ebp + movl %ebx,12(%edi) + movl 4(%esi),%edx + # saved r[3] + # ############### Calculate word 4 + xorl %ebx,%ebx + # sqr a[3]*a[1] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 8(%esi),%eax + adcl $0,%ebx + # sqr a[2]*a[2] + mull %eax + addl %eax,%ecx + adcl %edx,%ebp + movl 8(%esi),%edx + adcl $0,%ebx + movl %ecx,16(%edi) + movl 12(%esi),%eax + # saved r[4] + # ############### Calculate word 5 + xorl %ecx,%ecx + # sqr 
a[3]*a[2] + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 12(%esi),%eax + adcl $0,%ecx + movl %ebp,20(%edi) + # saved r[5] + # ############### Calculate word 6 + xorl %ebp,%ebp + # sqr a[3]*a[3] + mull %eax + addl %eax,%ebx + adcl %edx,%ecx + adcl $0,%ebp + movl %ebx,24(%edi) + # saved r[6] + movl %ecx,28(%edi) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) diff --git a/third_party/boringssl/gen/bcm/co-586-linux.S b/third_party/boringssl/gen/bcm/co-586-linux.S new file mode 100644 index 00000000..b4812e31 --- /dev/null +++ b/third_party/boringssl/gen/bcm/co-586-linux.S @@ -0,0 +1,1264 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) +.text +.globl bn_mul_comba8 +.hidden bn_mul_comba8 +.type bn_mul_comba8,@function +.align 16 +bn_mul_comba8: +.L_bn_mul_comba8_begin: + pushl %esi + movl 12(%esp),%esi + pushl %edi + movl 20(%esp),%edi + pushl %ebp + pushl %ebx + xorl %ebx,%ebx + movl (%esi),%eax + xorl %ecx,%ecx + movl (%edi),%edx + + xorl %ebp,%ebp + + mull %edx + addl %eax,%ebx + movl 20(%esp),%eax + adcl %edx,%ecx + movl (%edi),%edx + adcl $0,%ebp + movl %ebx,(%eax) + movl 4(%esi),%eax + + + xorl %ebx,%ebx + + mull %edx + addl %eax,%ecx + movl (%esi),%eax + adcl %edx,%ebp + movl 4(%edi),%edx + adcl $0,%ebx + + mull %edx + addl %eax,%ecx + movl 20(%esp),%eax + adcl %edx,%ebp + movl (%edi),%edx + adcl $0,%ebx + movl %ecx,4(%eax) + movl 8(%esi),%eax + + + xorl %ecx,%ecx + + mull %edx + addl %eax,%ebp + movl 4(%esi),%eax + adcl %edx,%ebx + movl 4(%edi),%edx + adcl $0,%ecx + + mull %edx + addl %eax,%ebp + movl (%esi),%eax + adcl %edx,%ebx + movl 8(%edi),%edx + adcl $0,%ecx + + mull %edx + addl %eax,%ebp + movl 20(%esp),%eax + adcl %edx,%ebx + movl (%edi),%edx + adcl $0,%ecx 
+ movl %ebp,8(%eax) + movl 12(%esi),%eax + + + xorl %ebp,%ebp + + mull %edx + addl %eax,%ebx + movl 8(%esi),%eax + adcl %edx,%ecx + movl 4(%edi),%edx + adcl $0,%ebp + + mull %edx + addl %eax,%ebx + movl 4(%esi),%eax + adcl %edx,%ecx + movl 8(%edi),%edx + adcl $0,%ebp + + mull %edx + addl %eax,%ebx + movl (%esi),%eax + adcl %edx,%ecx + movl 12(%edi),%edx + adcl $0,%ebp + + mull %edx + addl %eax,%ebx + movl 20(%esp),%eax + adcl %edx,%ecx + movl (%edi),%edx + adcl $0,%ebp + movl %ebx,12(%eax) + movl 16(%esi),%eax + + + xorl %ebx,%ebx + + mull %edx + addl %eax,%ecx + movl 12(%esi),%eax + adcl %edx,%ebp + movl 4(%edi),%edx + adcl $0,%ebx + + mull %edx + addl %eax,%ecx + movl 8(%esi),%eax + adcl %edx,%ebp + movl 8(%edi),%edx + adcl $0,%ebx + + mull %edx + addl %eax,%ecx + movl 4(%esi),%eax + adcl %edx,%ebp + movl 12(%edi),%edx + adcl $0,%ebx + + mull %edx + addl %eax,%ecx + movl (%esi),%eax + adcl %edx,%ebp + movl 16(%edi),%edx + adcl $0,%ebx + + mull %edx + addl %eax,%ecx + movl 20(%esp),%eax + adcl %edx,%ebp + movl (%edi),%edx + adcl $0,%ebx + movl %ecx,16(%eax) + movl 20(%esi),%eax + + + xorl %ecx,%ecx + + mull %edx + addl %eax,%ebp + movl 16(%esi),%eax + adcl %edx,%ebx + movl 4(%edi),%edx + adcl $0,%ecx + + mull %edx + addl %eax,%ebp + movl 12(%esi),%eax + adcl %edx,%ebx + movl 8(%edi),%edx + adcl $0,%ecx + + mull %edx + addl %eax,%ebp + movl 8(%esi),%eax + adcl %edx,%ebx + movl 12(%edi),%edx + adcl $0,%ecx + + mull %edx + addl %eax,%ebp + movl 4(%esi),%eax + adcl %edx,%ebx + movl 16(%edi),%edx + adcl $0,%ecx + + mull %edx + addl %eax,%ebp + movl (%esi),%eax + adcl %edx,%ebx + movl 20(%edi),%edx + adcl $0,%ecx + + mull %edx + addl %eax,%ebp + movl 20(%esp),%eax + adcl %edx,%ebx + movl (%edi),%edx + adcl $0,%ecx + movl %ebp,20(%eax) + movl 24(%esi),%eax + + + xorl %ebp,%ebp + + mull %edx + addl %eax,%ebx + movl 20(%esi),%eax + adcl %edx,%ecx + movl 4(%edi),%edx + adcl $0,%ebp + + mull %edx + addl %eax,%ebx + movl 16(%esi),%eax + adcl %edx,%ecx + movl 8(%edi),%edx + 
adcl $0,%ebp + + mull %edx + addl %eax,%ebx + movl 12(%esi),%eax + adcl %edx,%ecx + movl 12(%edi),%edx + adcl $0,%ebp + + mull %edx + addl %eax,%ebx + movl 8(%esi),%eax + adcl %edx,%ecx + movl 16(%edi),%edx + adcl $0,%ebp + + mull %edx + addl %eax,%ebx + movl 4(%esi),%eax + adcl %edx,%ecx + movl 20(%edi),%edx + adcl $0,%ebp + + mull %edx + addl %eax,%ebx + movl (%esi),%eax + adcl %edx,%ecx + movl 24(%edi),%edx + adcl $0,%ebp + + mull %edx + addl %eax,%ebx + movl 20(%esp),%eax + adcl %edx,%ecx + movl (%edi),%edx + adcl $0,%ebp + movl %ebx,24(%eax) + movl 28(%esi),%eax + + + xorl %ebx,%ebx + + mull %edx + addl %eax,%ecx + movl 24(%esi),%eax + adcl %edx,%ebp + movl 4(%edi),%edx + adcl $0,%ebx + + mull %edx + addl %eax,%ecx + movl 20(%esi),%eax + adcl %edx,%ebp + movl 8(%edi),%edx + adcl $0,%ebx + + mull %edx + addl %eax,%ecx + movl 16(%esi),%eax + adcl %edx,%ebp + movl 12(%edi),%edx + adcl $0,%ebx + + mull %edx + addl %eax,%ecx + movl 12(%esi),%eax + adcl %edx,%ebp + movl 16(%edi),%edx + adcl $0,%ebx + + mull %edx + addl %eax,%ecx + movl 8(%esi),%eax + adcl %edx,%ebp + movl 20(%edi),%edx + adcl $0,%ebx + + mull %edx + addl %eax,%ecx + movl 4(%esi),%eax + adcl %edx,%ebp + movl 24(%edi),%edx + adcl $0,%ebx + + mull %edx + addl %eax,%ecx + movl (%esi),%eax + adcl %edx,%ebp + movl 28(%edi),%edx + adcl $0,%ebx + + mull %edx + addl %eax,%ecx + movl 20(%esp),%eax + adcl %edx,%ebp + movl 4(%edi),%edx + adcl $0,%ebx + movl %ecx,28(%eax) + movl 28(%esi),%eax + + + xorl %ecx,%ecx + + mull %edx + addl %eax,%ebp + movl 24(%esi),%eax + adcl %edx,%ebx + movl 8(%edi),%edx + adcl $0,%ecx + + mull %edx + addl %eax,%ebp + movl 20(%esi),%eax + adcl %edx,%ebx + movl 12(%edi),%edx + adcl $0,%ecx + + mull %edx + addl %eax,%ebp + movl 16(%esi),%eax + adcl %edx,%ebx + movl 16(%edi),%edx + adcl $0,%ecx + + mull %edx + addl %eax,%ebp + movl 12(%esi),%eax + adcl %edx,%ebx + movl 20(%edi),%edx + adcl $0,%ecx + + mull %edx + addl %eax,%ebp + movl 8(%esi),%eax + adcl %edx,%ebx + movl 24(%edi),%edx 
+ adcl $0,%ecx + + mull %edx + addl %eax,%ebp + movl 4(%esi),%eax + adcl %edx,%ebx + movl 28(%edi),%edx + adcl $0,%ecx + + mull %edx + addl %eax,%ebp + movl 20(%esp),%eax + adcl %edx,%ebx + movl 8(%edi),%edx + adcl $0,%ecx + movl %ebp,32(%eax) + movl 28(%esi),%eax + + + xorl %ebp,%ebp + + mull %edx + addl %eax,%ebx + movl 24(%esi),%eax + adcl %edx,%ecx + movl 12(%edi),%edx + adcl $0,%ebp + + mull %edx + addl %eax,%ebx + movl 20(%esi),%eax + adcl %edx,%ecx + movl 16(%edi),%edx + adcl $0,%ebp + + mull %edx + addl %eax,%ebx + movl 16(%esi),%eax + adcl %edx,%ecx + movl 20(%edi),%edx + adcl $0,%ebp + + mull %edx + addl %eax,%ebx + movl 12(%esi),%eax + adcl %edx,%ecx + movl 24(%edi),%edx + adcl $0,%ebp + + mull %edx + addl %eax,%ebx + movl 8(%esi),%eax + adcl %edx,%ecx + movl 28(%edi),%edx + adcl $0,%ebp + + mull %edx + addl %eax,%ebx + movl 20(%esp),%eax + adcl %edx,%ecx + movl 12(%edi),%edx + adcl $0,%ebp + movl %ebx,36(%eax) + movl 28(%esi),%eax + + + xorl %ebx,%ebx + + mull %edx + addl %eax,%ecx + movl 24(%esi),%eax + adcl %edx,%ebp + movl 16(%edi),%edx + adcl $0,%ebx + + mull %edx + addl %eax,%ecx + movl 20(%esi),%eax + adcl %edx,%ebp + movl 20(%edi),%edx + adcl $0,%ebx + + mull %edx + addl %eax,%ecx + movl 16(%esi),%eax + adcl %edx,%ebp + movl 24(%edi),%edx + adcl $0,%ebx + + mull %edx + addl %eax,%ecx + movl 12(%esi),%eax + adcl %edx,%ebp + movl 28(%edi),%edx + adcl $0,%ebx + + mull %edx + addl %eax,%ecx + movl 20(%esp),%eax + adcl %edx,%ebp + movl 16(%edi),%edx + adcl $0,%ebx + movl %ecx,40(%eax) + movl 28(%esi),%eax + + + xorl %ecx,%ecx + + mull %edx + addl %eax,%ebp + movl 24(%esi),%eax + adcl %edx,%ebx + movl 20(%edi),%edx + adcl $0,%ecx + + mull %edx + addl %eax,%ebp + movl 20(%esi),%eax + adcl %edx,%ebx + movl 24(%edi),%edx + adcl $0,%ecx + + mull %edx + addl %eax,%ebp + movl 16(%esi),%eax + adcl %edx,%ebx + movl 28(%edi),%edx + adcl $0,%ecx + + mull %edx + addl %eax,%ebp + movl 20(%esp),%eax + adcl %edx,%ebx + movl 20(%edi),%edx + adcl $0,%ecx + movl 
%ebp,44(%eax) + movl 28(%esi),%eax + + + xorl %ebp,%ebp + + mull %edx + addl %eax,%ebx + movl 24(%esi),%eax + adcl %edx,%ecx + movl 24(%edi),%edx + adcl $0,%ebp + + mull %edx + addl %eax,%ebx + movl 20(%esi),%eax + adcl %edx,%ecx + movl 28(%edi),%edx + adcl $0,%ebp + + mull %edx + addl %eax,%ebx + movl 20(%esp),%eax + adcl %edx,%ecx + movl 24(%edi),%edx + adcl $0,%ebp + movl %ebx,48(%eax) + movl 28(%esi),%eax + + + xorl %ebx,%ebx + + mull %edx + addl %eax,%ecx + movl 24(%esi),%eax + adcl %edx,%ebp + movl 28(%edi),%edx + adcl $0,%ebx + + mull %edx + addl %eax,%ecx + movl 20(%esp),%eax + adcl %edx,%ebp + movl 28(%edi),%edx + adcl $0,%ebx + movl %ecx,52(%eax) + movl 28(%esi),%eax + + + xorl %ecx,%ecx + + mull %edx + addl %eax,%ebp + movl 20(%esp),%eax + adcl %edx,%ebx + adcl $0,%ecx + movl %ebp,56(%eax) + + + movl %ebx,60(%eax) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.size bn_mul_comba8,.-.L_bn_mul_comba8_begin +.globl bn_mul_comba4 +.hidden bn_mul_comba4 +.type bn_mul_comba4,@function +.align 16 +bn_mul_comba4: +.L_bn_mul_comba4_begin: + pushl %esi + movl 12(%esp),%esi + pushl %edi + movl 20(%esp),%edi + pushl %ebp + pushl %ebx + xorl %ebx,%ebx + movl (%esi),%eax + xorl %ecx,%ecx + movl (%edi),%edx + + xorl %ebp,%ebp + + mull %edx + addl %eax,%ebx + movl 20(%esp),%eax + adcl %edx,%ecx + movl (%edi),%edx + adcl $0,%ebp + movl %ebx,(%eax) + movl 4(%esi),%eax + + + xorl %ebx,%ebx + + mull %edx + addl %eax,%ecx + movl (%esi),%eax + adcl %edx,%ebp + movl 4(%edi),%edx + adcl $0,%ebx + + mull %edx + addl %eax,%ecx + movl 20(%esp),%eax + adcl %edx,%ebp + movl (%edi),%edx + adcl $0,%ebx + movl %ecx,4(%eax) + movl 8(%esi),%eax + + + xorl %ecx,%ecx + + mull %edx + addl %eax,%ebp + movl 4(%esi),%eax + adcl %edx,%ebx + movl 4(%edi),%edx + adcl $0,%ecx + + mull %edx + addl %eax,%ebp + movl (%esi),%eax + adcl %edx,%ebx + movl 8(%edi),%edx + adcl $0,%ecx + + mull %edx + addl %eax,%ebp + movl 20(%esp),%eax + adcl %edx,%ebx + movl (%edi),%edx + adcl $0,%ecx + movl 
%ebp,8(%eax) + movl 12(%esi),%eax + + + xorl %ebp,%ebp + + mull %edx + addl %eax,%ebx + movl 8(%esi),%eax + adcl %edx,%ecx + movl 4(%edi),%edx + adcl $0,%ebp + + mull %edx + addl %eax,%ebx + movl 4(%esi),%eax + adcl %edx,%ecx + movl 8(%edi),%edx + adcl $0,%ebp + + mull %edx + addl %eax,%ebx + movl (%esi),%eax + adcl %edx,%ecx + movl 12(%edi),%edx + adcl $0,%ebp + + mull %edx + addl %eax,%ebx + movl 20(%esp),%eax + adcl %edx,%ecx + movl 4(%edi),%edx + adcl $0,%ebp + movl %ebx,12(%eax) + movl 12(%esi),%eax + + + xorl %ebx,%ebx + + mull %edx + addl %eax,%ecx + movl 8(%esi),%eax + adcl %edx,%ebp + movl 8(%edi),%edx + adcl $0,%ebx + + mull %edx + addl %eax,%ecx + movl 4(%esi),%eax + adcl %edx,%ebp + movl 12(%edi),%edx + adcl $0,%ebx + + mull %edx + addl %eax,%ecx + movl 20(%esp),%eax + adcl %edx,%ebp + movl 8(%edi),%edx + adcl $0,%ebx + movl %ecx,16(%eax) + movl 12(%esi),%eax + + + xorl %ecx,%ecx + + mull %edx + addl %eax,%ebp + movl 8(%esi),%eax + adcl %edx,%ebx + movl 12(%edi),%edx + adcl $0,%ecx + + mull %edx + addl %eax,%ebp + movl 20(%esp),%eax + adcl %edx,%ebx + movl 12(%edi),%edx + adcl $0,%ecx + movl %ebp,20(%eax) + movl 12(%esi),%eax + + + xorl %ebp,%ebp + + mull %edx + addl %eax,%ebx + movl 20(%esp),%eax + adcl %edx,%ecx + adcl $0,%ebp + movl %ebx,24(%eax) + + + movl %ecx,28(%eax) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.size bn_mul_comba4,.-.L_bn_mul_comba4_begin +.globl bn_sqr_comba8 +.hidden bn_sqr_comba8 +.type bn_sqr_comba8,@function +.align 16 +bn_sqr_comba8: +.L_bn_sqr_comba8_begin: + pushl %esi + pushl %edi + pushl %ebp + pushl %ebx + movl 20(%esp),%edi + movl 24(%esp),%esi + xorl %ebx,%ebx + xorl %ecx,%ecx + movl (%esi),%eax + + xorl %ebp,%ebp + + mull %eax + addl %eax,%ebx + adcl %edx,%ecx + movl (%esi),%edx + adcl $0,%ebp + movl %ebx,(%edi) + movl 4(%esi),%eax + + + xorl %ebx,%ebx + + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 8(%esi),%eax + adcl $0,%ebx + movl %ecx,4(%edi) + 
movl (%esi),%edx + + + xorl %ecx,%ecx + + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 4(%esi),%eax + adcl $0,%ecx + + mull %eax + addl %eax,%ebp + adcl %edx,%ebx + movl (%esi),%edx + adcl $0,%ecx + movl %ebp,8(%edi) + movl 12(%esi),%eax + + + xorl %ebp,%ebp + + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 8(%esi),%eax + adcl $0,%ebp + movl 4(%esi),%edx + + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 16(%esi),%eax + adcl $0,%ebp + movl %ebx,12(%edi) + movl (%esi),%edx + + + xorl %ebx,%ebx + + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 12(%esi),%eax + adcl $0,%ebx + movl 4(%esi),%edx + + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 8(%esi),%eax + adcl $0,%ebx + + mull %eax + addl %eax,%ecx + adcl %edx,%ebp + movl (%esi),%edx + adcl $0,%ebx + movl %ecx,16(%edi) + movl 20(%esi),%eax + + + xorl %ecx,%ecx + + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 16(%esi),%eax + adcl $0,%ecx + movl 4(%esi),%edx + + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 12(%esi),%eax + adcl $0,%ecx + movl 8(%esi),%edx + + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 24(%esi),%eax + adcl $0,%ecx + movl %ebp,20(%edi) + movl (%esi),%edx + + + xorl %ebp,%ebp + + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 20(%esi),%eax + adcl $0,%ebp + movl 4(%esi),%edx + + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 16(%esi),%eax + adcl $0,%ebp + movl 8(%esi),%edx + + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 
12(%esi),%eax + adcl $0,%ebp + + mull %eax + addl %eax,%ebx + adcl %edx,%ecx + movl (%esi),%edx + adcl $0,%ebp + movl %ebx,24(%edi) + movl 28(%esi),%eax + + + xorl %ebx,%ebx + + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 24(%esi),%eax + adcl $0,%ebx + movl 4(%esi),%edx + + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 20(%esi),%eax + adcl $0,%ebx + movl 8(%esi),%edx + + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 16(%esi),%eax + adcl $0,%ebx + movl 12(%esi),%edx + + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 28(%esi),%eax + adcl $0,%ebx + movl %ecx,28(%edi) + movl 4(%esi),%edx + + + xorl %ecx,%ecx + + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 24(%esi),%eax + adcl $0,%ecx + movl 8(%esi),%edx + + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 20(%esi),%eax + adcl $0,%ecx + movl 12(%esi),%edx + + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 16(%esi),%eax + adcl $0,%ecx + + mull %eax + addl %eax,%ebp + adcl %edx,%ebx + movl 8(%esi),%edx + adcl $0,%ecx + movl %ebp,32(%edi) + movl 28(%esi),%eax + + + xorl %ebp,%ebp + + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 24(%esi),%eax + adcl $0,%ebp + movl 12(%esi),%edx + + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 20(%esi),%eax + adcl $0,%ebp + movl 16(%esi),%edx + + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 28(%esi),%eax + adcl $0,%ebp + movl %ebx,36(%edi) + movl 12(%esi),%edx + + + xorl %ebx,%ebx + + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx 
+ adcl %edx,%ebp + movl 24(%esi),%eax + adcl $0,%ebx + movl 16(%esi),%edx + + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 20(%esi),%eax + adcl $0,%ebx + + mull %eax + addl %eax,%ecx + adcl %edx,%ebp + movl 16(%esi),%edx + adcl $0,%ebx + movl %ecx,40(%edi) + movl 28(%esi),%eax + + + xorl %ecx,%ecx + + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 24(%esi),%eax + adcl $0,%ecx + movl 20(%esi),%edx + + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 28(%esi),%eax + adcl $0,%ecx + movl %ebp,44(%edi) + movl 20(%esi),%edx + + + xorl %ebp,%ebp + + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 24(%esi),%eax + adcl $0,%ebp + + mull %eax + addl %eax,%ebx + adcl %edx,%ecx + movl 24(%esi),%edx + adcl $0,%ebp + movl %ebx,48(%edi) + movl 28(%esi),%eax + + + xorl %ebx,%ebx + + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 28(%esi),%eax + adcl $0,%ebx + movl %ecx,52(%edi) + + + xorl %ecx,%ecx + + mull %eax + addl %eax,%ebp + adcl %edx,%ebx + adcl $0,%ecx + movl %ebp,56(%edi) + + movl %ebx,60(%edi) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.size bn_sqr_comba8,.-.L_bn_sqr_comba8_begin +.globl bn_sqr_comba4 +.hidden bn_sqr_comba4 +.type bn_sqr_comba4,@function +.align 16 +bn_sqr_comba4: +.L_bn_sqr_comba4_begin: + pushl %esi + pushl %edi + pushl %ebp + pushl %ebx + movl 20(%esp),%edi + movl 24(%esp),%esi + xorl %ebx,%ebx + xorl %ecx,%ecx + movl (%esi),%eax + + xorl %ebp,%ebp + + mull %eax + addl %eax,%ebx + adcl %edx,%ecx + movl (%esi),%edx + adcl $0,%ebp + movl %ebx,(%edi) + movl 4(%esi),%eax + + + xorl %ebx,%ebx + + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 8(%esi),%eax + adcl $0,%ebx + movl %ecx,4(%edi) + movl (%esi),%edx + + + xorl %ecx,%ecx + + 
mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 4(%esi),%eax + adcl $0,%ecx + + mull %eax + addl %eax,%ebp + adcl %edx,%ebx + movl (%esi),%edx + adcl $0,%ecx + movl %ebp,8(%edi) + movl 12(%esi),%eax + + + xorl %ebp,%ebp + + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 8(%esi),%eax + adcl $0,%ebp + movl 4(%esi),%edx + + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebp + addl %eax,%ebx + adcl %edx,%ecx + movl 12(%esi),%eax + adcl $0,%ebp + movl %ebx,12(%edi) + movl 4(%esi),%edx + + + xorl %ebx,%ebx + + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ebx + addl %eax,%ecx + adcl %edx,%ebp + movl 8(%esi),%eax + adcl $0,%ebx + + mull %eax + addl %eax,%ecx + adcl %edx,%ebp + movl 8(%esi),%edx + adcl $0,%ebx + movl %ecx,16(%edi) + movl 12(%esi),%eax + + + xorl %ecx,%ecx + + mull %edx + addl %eax,%eax + adcl %edx,%edx + adcl $0,%ecx + addl %eax,%ebp + adcl %edx,%ebx + movl 12(%esi),%eax + adcl $0,%ecx + movl %ebp,20(%edi) + + + xorl %ebp,%ebp + + mull %eax + addl %eax,%ebx + adcl %edx,%ecx + adcl $0,%ebp + movl %ebx,24(%edi) + + movl %ecx,28(%edi) + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.size bn_sqr_comba4,.-.L_bn_sqr_comba4_begin +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) diff --git a/third_party/boringssl/gen/bcm/co-586-win.asm b/third_party/boringssl/gen/bcm/co-586-win.asm new file mode 100644 index 00000000..c3e09d27 --- /dev/null +++ b/third_party/boringssl/gen/bcm/co-586-win.asm @@ -0,0 +1,1263 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_internal_x86_win_asm.inc" +%endif +%ifidn __OUTPUT_FORMAT__, win32 +%ifidn __OUTPUT_FORMAT__,obj +section code use32 class=code align=64 +%elifidn __OUTPUT_FORMAT__,win32 +$@feat.00 equ 1 +section .text code align=64 +%else +section .text code +%endif +global _bn_mul_comba8 +align 16 +_bn_mul_comba8: +L$_bn_mul_comba8_begin: + push esi + mov esi,DWORD [12+esp] + push edi + mov edi,DWORD [20+esp] + push ebp + push ebx + xor ebx,ebx + mov eax,DWORD [esi] + xor ecx,ecx + mov edx,DWORD [edi] + ; ################## Calculate word 0 + xor ebp,ebp + ; mul a[0]*b[0] + mul edx + add ebx,eax + mov eax,DWORD [20+esp] + adc ecx,edx + mov edx,DWORD [edi] + adc ebp,0 + mov DWORD [eax],ebx + mov eax,DWORD [4+esi] + ; saved r[0] + ; ################## Calculate word 1 + xor ebx,ebx + ; mul a[1]*b[0] + mul edx + add ecx,eax + mov eax,DWORD [esi] + adc ebp,edx + mov edx,DWORD [4+edi] + adc ebx,0 + ; mul a[0]*b[1] + mul edx + add ecx,eax + mov eax,DWORD [20+esp] + adc ebp,edx + mov edx,DWORD [edi] + adc ebx,0 + mov DWORD [4+eax],ecx + mov eax,DWORD [8+esi] + ; saved r[1] + ; ################## Calculate word 2 + xor ecx,ecx + ; mul a[2]*b[0] + mul edx + add ebp,eax + mov eax,DWORD [4+esi] + adc ebx,edx + mov edx,DWORD [4+edi] + adc ecx,0 + ; mul a[1]*b[1] + mul edx + add ebp,eax + mov eax,DWORD [esi] + adc ebx,edx + mov edx,DWORD [8+edi] + adc ecx,0 + ; mul a[0]*b[2] + mul edx + add ebp,eax + mov eax,DWORD [20+esp] + adc ebx,edx + mov edx,DWORD [edi] + adc ecx,0 + mov DWORD [8+eax],ebp + mov eax,DWORD [12+esi] + ; saved r[2] + ; ################## Calculate word 3 + xor ebp,ebp + ; mul a[3]*b[0] + mul edx + add ebx,eax + mov eax,DWORD [8+esi] + adc ecx,edx + mov edx,DWORD [4+edi] + adc ebp,0 + ; mul a[2]*b[1] + mul edx + add ebx,eax + mov eax,DWORD [4+esi] + adc ecx,edx + mov edx,DWORD [8+edi] + adc ebp,0 + ; mul a[1]*b[2] + mul edx + add ebx,eax + mov eax,DWORD [esi] + adc ecx,edx + mov edx,DWORD [12+edi] + adc ebp,0 + ; 
mul a[0]*b[3] + mul edx + add ebx,eax + mov eax,DWORD [20+esp] + adc ecx,edx + mov edx,DWORD [edi] + adc ebp,0 + mov DWORD [12+eax],ebx + mov eax,DWORD [16+esi] + ; saved r[3] + ; ################## Calculate word 4 + xor ebx,ebx + ; mul a[4]*b[0] + mul edx + add ecx,eax + mov eax,DWORD [12+esi] + adc ebp,edx + mov edx,DWORD [4+edi] + adc ebx,0 + ; mul a[3]*b[1] + mul edx + add ecx,eax + mov eax,DWORD [8+esi] + adc ebp,edx + mov edx,DWORD [8+edi] + adc ebx,0 + ; mul a[2]*b[2] + mul edx + add ecx,eax + mov eax,DWORD [4+esi] + adc ebp,edx + mov edx,DWORD [12+edi] + adc ebx,0 + ; mul a[1]*b[3] + mul edx + add ecx,eax + mov eax,DWORD [esi] + adc ebp,edx + mov edx,DWORD [16+edi] + adc ebx,0 + ; mul a[0]*b[4] + mul edx + add ecx,eax + mov eax,DWORD [20+esp] + adc ebp,edx + mov edx,DWORD [edi] + adc ebx,0 + mov DWORD [16+eax],ecx + mov eax,DWORD [20+esi] + ; saved r[4] + ; ################## Calculate word 5 + xor ecx,ecx + ; mul a[5]*b[0] + mul edx + add ebp,eax + mov eax,DWORD [16+esi] + adc ebx,edx + mov edx,DWORD [4+edi] + adc ecx,0 + ; mul a[4]*b[1] + mul edx + add ebp,eax + mov eax,DWORD [12+esi] + adc ebx,edx + mov edx,DWORD [8+edi] + adc ecx,0 + ; mul a[3]*b[2] + mul edx + add ebp,eax + mov eax,DWORD [8+esi] + adc ebx,edx + mov edx,DWORD [12+edi] + adc ecx,0 + ; mul a[2]*b[3] + mul edx + add ebp,eax + mov eax,DWORD [4+esi] + adc ebx,edx + mov edx,DWORD [16+edi] + adc ecx,0 + ; mul a[1]*b[4] + mul edx + add ebp,eax + mov eax,DWORD [esi] + adc ebx,edx + mov edx,DWORD [20+edi] + adc ecx,0 + ; mul a[0]*b[5] + mul edx + add ebp,eax + mov eax,DWORD [20+esp] + adc ebx,edx + mov edx,DWORD [edi] + adc ecx,0 + mov DWORD [20+eax],ebp + mov eax,DWORD [24+esi] + ; saved r[5] + ; ################## Calculate word 6 + xor ebp,ebp + ; mul a[6]*b[0] + mul edx + add ebx,eax + mov eax,DWORD [20+esi] + adc ecx,edx + mov edx,DWORD [4+edi] + adc ebp,0 + ; mul a[5]*b[1] + mul edx + add ebx,eax + mov eax,DWORD [16+esi] + adc ecx,edx + mov edx,DWORD [8+edi] + adc ebp,0 + ; mul a[4]*b[2] + 
mul edx + add ebx,eax + mov eax,DWORD [12+esi] + adc ecx,edx + mov edx,DWORD [12+edi] + adc ebp,0 + ; mul a[3]*b[3] + mul edx + add ebx,eax + mov eax,DWORD [8+esi] + adc ecx,edx + mov edx,DWORD [16+edi] + adc ebp,0 + ; mul a[2]*b[4] + mul edx + add ebx,eax + mov eax,DWORD [4+esi] + adc ecx,edx + mov edx,DWORD [20+edi] + adc ebp,0 + ; mul a[1]*b[5] + mul edx + add ebx,eax + mov eax,DWORD [esi] + adc ecx,edx + mov edx,DWORD [24+edi] + adc ebp,0 + ; mul a[0]*b[6] + mul edx + add ebx,eax + mov eax,DWORD [20+esp] + adc ecx,edx + mov edx,DWORD [edi] + adc ebp,0 + mov DWORD [24+eax],ebx + mov eax,DWORD [28+esi] + ; saved r[6] + ; ################## Calculate word 7 + xor ebx,ebx + ; mul a[7]*b[0] + mul edx + add ecx,eax + mov eax,DWORD [24+esi] + adc ebp,edx + mov edx,DWORD [4+edi] + adc ebx,0 + ; mul a[6]*b[1] + mul edx + add ecx,eax + mov eax,DWORD [20+esi] + adc ebp,edx + mov edx,DWORD [8+edi] + adc ebx,0 + ; mul a[5]*b[2] + mul edx + add ecx,eax + mov eax,DWORD [16+esi] + adc ebp,edx + mov edx,DWORD [12+edi] + adc ebx,0 + ; mul a[4]*b[3] + mul edx + add ecx,eax + mov eax,DWORD [12+esi] + adc ebp,edx + mov edx,DWORD [16+edi] + adc ebx,0 + ; mul a[3]*b[4] + mul edx + add ecx,eax + mov eax,DWORD [8+esi] + adc ebp,edx + mov edx,DWORD [20+edi] + adc ebx,0 + ; mul a[2]*b[5] + mul edx + add ecx,eax + mov eax,DWORD [4+esi] + adc ebp,edx + mov edx,DWORD [24+edi] + adc ebx,0 + ; mul a[1]*b[6] + mul edx + add ecx,eax + mov eax,DWORD [esi] + adc ebp,edx + mov edx,DWORD [28+edi] + adc ebx,0 + ; mul a[0]*b[7] + mul edx + add ecx,eax + mov eax,DWORD [20+esp] + adc ebp,edx + mov edx,DWORD [4+edi] + adc ebx,0 + mov DWORD [28+eax],ecx + mov eax,DWORD [28+esi] + ; saved r[7] + ; ################## Calculate word 8 + xor ecx,ecx + ; mul a[7]*b[1] + mul edx + add ebp,eax + mov eax,DWORD [24+esi] + adc ebx,edx + mov edx,DWORD [8+edi] + adc ecx,0 + ; mul a[6]*b[2] + mul edx + add ebp,eax + mov eax,DWORD [20+esi] + adc ebx,edx + mov edx,DWORD [12+edi] + adc ecx,0 + ; mul a[5]*b[3] + mul edx 
+ add ebp,eax + mov eax,DWORD [16+esi] + adc ebx,edx + mov edx,DWORD [16+edi] + adc ecx,0 + ; mul a[4]*b[4] + mul edx + add ebp,eax + mov eax,DWORD [12+esi] + adc ebx,edx + mov edx,DWORD [20+edi] + adc ecx,0 + ; mul a[3]*b[5] + mul edx + add ebp,eax + mov eax,DWORD [8+esi] + adc ebx,edx + mov edx,DWORD [24+edi] + adc ecx,0 + ; mul a[2]*b[6] + mul edx + add ebp,eax + mov eax,DWORD [4+esi] + adc ebx,edx + mov edx,DWORD [28+edi] + adc ecx,0 + ; mul a[1]*b[7] + mul edx + add ebp,eax + mov eax,DWORD [20+esp] + adc ebx,edx + mov edx,DWORD [8+edi] + adc ecx,0 + mov DWORD [32+eax],ebp + mov eax,DWORD [28+esi] + ; saved r[8] + ; ################## Calculate word 9 + xor ebp,ebp + ; mul a[7]*b[2] + mul edx + add ebx,eax + mov eax,DWORD [24+esi] + adc ecx,edx + mov edx,DWORD [12+edi] + adc ebp,0 + ; mul a[6]*b[3] + mul edx + add ebx,eax + mov eax,DWORD [20+esi] + adc ecx,edx + mov edx,DWORD [16+edi] + adc ebp,0 + ; mul a[5]*b[4] + mul edx + add ebx,eax + mov eax,DWORD [16+esi] + adc ecx,edx + mov edx,DWORD [20+edi] + adc ebp,0 + ; mul a[4]*b[5] + mul edx + add ebx,eax + mov eax,DWORD [12+esi] + adc ecx,edx + mov edx,DWORD [24+edi] + adc ebp,0 + ; mul a[3]*b[6] + mul edx + add ebx,eax + mov eax,DWORD [8+esi] + adc ecx,edx + mov edx,DWORD [28+edi] + adc ebp,0 + ; mul a[2]*b[7] + mul edx + add ebx,eax + mov eax,DWORD [20+esp] + adc ecx,edx + mov edx,DWORD [12+edi] + adc ebp,0 + mov DWORD [36+eax],ebx + mov eax,DWORD [28+esi] + ; saved r[9] + ; ################## Calculate word 10 + xor ebx,ebx + ; mul a[7]*b[3] + mul edx + add ecx,eax + mov eax,DWORD [24+esi] + adc ebp,edx + mov edx,DWORD [16+edi] + adc ebx,0 + ; mul a[6]*b[4] + mul edx + add ecx,eax + mov eax,DWORD [20+esi] + adc ebp,edx + mov edx,DWORD [20+edi] + adc ebx,0 + ; mul a[5]*b[5] + mul edx + add ecx,eax + mov eax,DWORD [16+esi] + adc ebp,edx + mov edx,DWORD [24+edi] + adc ebx,0 + ; mul a[4]*b[6] + mul edx + add ecx,eax + mov eax,DWORD [12+esi] + adc ebp,edx + mov edx,DWORD [28+edi] + adc ebx,0 + ; mul a[3]*b[7] + 
mul edx + add ecx,eax + mov eax,DWORD [20+esp] + adc ebp,edx + mov edx,DWORD [16+edi] + adc ebx,0 + mov DWORD [40+eax],ecx + mov eax,DWORD [28+esi] + ; saved r[10] + ; ################## Calculate word 11 + xor ecx,ecx + ; mul a[7]*b[4] + mul edx + add ebp,eax + mov eax,DWORD [24+esi] + adc ebx,edx + mov edx,DWORD [20+edi] + adc ecx,0 + ; mul a[6]*b[5] + mul edx + add ebp,eax + mov eax,DWORD [20+esi] + adc ebx,edx + mov edx,DWORD [24+edi] + adc ecx,0 + ; mul a[5]*b[6] + mul edx + add ebp,eax + mov eax,DWORD [16+esi] + adc ebx,edx + mov edx,DWORD [28+edi] + adc ecx,0 + ; mul a[4]*b[7] + mul edx + add ebp,eax + mov eax,DWORD [20+esp] + adc ebx,edx + mov edx,DWORD [20+edi] + adc ecx,0 + mov DWORD [44+eax],ebp + mov eax,DWORD [28+esi] + ; saved r[11] + ; ################## Calculate word 12 + xor ebp,ebp + ; mul a[7]*b[5] + mul edx + add ebx,eax + mov eax,DWORD [24+esi] + adc ecx,edx + mov edx,DWORD [24+edi] + adc ebp,0 + ; mul a[6]*b[6] + mul edx + add ebx,eax + mov eax,DWORD [20+esi] + adc ecx,edx + mov edx,DWORD [28+edi] + adc ebp,0 + ; mul a[5]*b[7] + mul edx + add ebx,eax + mov eax,DWORD [20+esp] + adc ecx,edx + mov edx,DWORD [24+edi] + adc ebp,0 + mov DWORD [48+eax],ebx + mov eax,DWORD [28+esi] + ; saved r[12] + ; ################## Calculate word 13 + xor ebx,ebx + ; mul a[7]*b[6] + mul edx + add ecx,eax + mov eax,DWORD [24+esi] + adc ebp,edx + mov edx,DWORD [28+edi] + adc ebx,0 + ; mul a[6]*b[7] + mul edx + add ecx,eax + mov eax,DWORD [20+esp] + adc ebp,edx + mov edx,DWORD [28+edi] + adc ebx,0 + mov DWORD [52+eax],ecx + mov eax,DWORD [28+esi] + ; saved r[13] + ; ################## Calculate word 14 + xor ecx,ecx + ; mul a[7]*b[7] + mul edx + add ebp,eax + mov eax,DWORD [20+esp] + adc ebx,edx + adc ecx,0 + mov DWORD [56+eax],ebp + ; saved r[14] + ; save r[15] + mov DWORD [60+eax],ebx + pop ebx + pop ebp + pop edi + pop esi + ret +global _bn_mul_comba4 +align 16 +_bn_mul_comba4: +L$_bn_mul_comba4_begin: + push esi + mov esi,DWORD [12+esp] + push edi + mov 
edi,DWORD [20+esp] + push ebp + push ebx + xor ebx,ebx + mov eax,DWORD [esi] + xor ecx,ecx + mov edx,DWORD [edi] + ; ################## Calculate word 0 + xor ebp,ebp + ; mul a[0]*b[0] + mul edx + add ebx,eax + mov eax,DWORD [20+esp] + adc ecx,edx + mov edx,DWORD [edi] + adc ebp,0 + mov DWORD [eax],ebx + mov eax,DWORD [4+esi] + ; saved r[0] + ; ################## Calculate word 1 + xor ebx,ebx + ; mul a[1]*b[0] + mul edx + add ecx,eax + mov eax,DWORD [esi] + adc ebp,edx + mov edx,DWORD [4+edi] + adc ebx,0 + ; mul a[0]*b[1] + mul edx + add ecx,eax + mov eax,DWORD [20+esp] + adc ebp,edx + mov edx,DWORD [edi] + adc ebx,0 + mov DWORD [4+eax],ecx + mov eax,DWORD [8+esi] + ; saved r[1] + ; ################## Calculate word 2 + xor ecx,ecx + ; mul a[2]*b[0] + mul edx + add ebp,eax + mov eax,DWORD [4+esi] + adc ebx,edx + mov edx,DWORD [4+edi] + adc ecx,0 + ; mul a[1]*b[1] + mul edx + add ebp,eax + mov eax,DWORD [esi] + adc ebx,edx + mov edx,DWORD [8+edi] + adc ecx,0 + ; mul a[0]*b[2] + mul edx + add ebp,eax + mov eax,DWORD [20+esp] + adc ebx,edx + mov edx,DWORD [edi] + adc ecx,0 + mov DWORD [8+eax],ebp + mov eax,DWORD [12+esi] + ; saved r[2] + ; ################## Calculate word 3 + xor ebp,ebp + ; mul a[3]*b[0] + mul edx + add ebx,eax + mov eax,DWORD [8+esi] + adc ecx,edx + mov edx,DWORD [4+edi] + adc ebp,0 + ; mul a[2]*b[1] + mul edx + add ebx,eax + mov eax,DWORD [4+esi] + adc ecx,edx + mov edx,DWORD [8+edi] + adc ebp,0 + ; mul a[1]*b[2] + mul edx + add ebx,eax + mov eax,DWORD [esi] + adc ecx,edx + mov edx,DWORD [12+edi] + adc ebp,0 + ; mul a[0]*b[3] + mul edx + add ebx,eax + mov eax,DWORD [20+esp] + adc ecx,edx + mov edx,DWORD [4+edi] + adc ebp,0 + mov DWORD [12+eax],ebx + mov eax,DWORD [12+esi] + ; saved r[3] + ; ################## Calculate word 4 + xor ebx,ebx + ; mul a[3]*b[1] + mul edx + add ecx,eax + mov eax,DWORD [8+esi] + adc ebp,edx + mov edx,DWORD [8+edi] + adc ebx,0 + ; mul a[2]*b[2] + mul edx + add ecx,eax + mov eax,DWORD [4+esi] + adc ebp,edx + mov 
edx,DWORD [12+edi] + adc ebx,0 + ; mul a[1]*b[3] + mul edx + add ecx,eax + mov eax,DWORD [20+esp] + adc ebp,edx + mov edx,DWORD [8+edi] + adc ebx,0 + mov DWORD [16+eax],ecx + mov eax,DWORD [12+esi] + ; saved r[4] + ; ################## Calculate word 5 + xor ecx,ecx + ; mul a[3]*b[2] + mul edx + add ebp,eax + mov eax,DWORD [8+esi] + adc ebx,edx + mov edx,DWORD [12+edi] + adc ecx,0 + ; mul a[2]*b[3] + mul edx + add ebp,eax + mov eax,DWORD [20+esp] + adc ebx,edx + mov edx,DWORD [12+edi] + adc ecx,0 + mov DWORD [20+eax],ebp + mov eax,DWORD [12+esi] + ; saved r[5] + ; ################## Calculate word 6 + xor ebp,ebp + ; mul a[3]*b[3] + mul edx + add ebx,eax + mov eax,DWORD [20+esp] + adc ecx,edx + adc ebp,0 + mov DWORD [24+eax],ebx + ; saved r[6] + ; save r[7] + mov DWORD [28+eax],ecx + pop ebx + pop ebp + pop edi + pop esi + ret +global _bn_sqr_comba8 +align 16 +_bn_sqr_comba8: +L$_bn_sqr_comba8_begin: + push esi + push edi + push ebp + push ebx + mov edi,DWORD [20+esp] + mov esi,DWORD [24+esp] + xor ebx,ebx + xor ecx,ecx + mov eax,DWORD [esi] + ; ############### Calculate word 0 + xor ebp,ebp + ; sqr a[0]*a[0] + mul eax + add ebx,eax + adc ecx,edx + mov edx,DWORD [esi] + adc ebp,0 + mov DWORD [edi],ebx + mov eax,DWORD [4+esi] + ; saved r[0] + ; ############### Calculate word 1 + xor ebx,ebx + ; sqr a[1]*a[0] + mul edx + add eax,eax + adc edx,edx + adc ebx,0 + add ecx,eax + adc ebp,edx + mov eax,DWORD [8+esi] + adc ebx,0 + mov DWORD [4+edi],ecx + mov edx,DWORD [esi] + ; saved r[1] + ; ############### Calculate word 2 + xor ecx,ecx + ; sqr a[2]*a[0] + mul edx + add eax,eax + adc edx,edx + adc ecx,0 + add ebp,eax + adc ebx,edx + mov eax,DWORD [4+esi] + adc ecx,0 + ; sqr a[1]*a[1] + mul eax + add ebp,eax + adc ebx,edx + mov edx,DWORD [esi] + adc ecx,0 + mov DWORD [8+edi],ebp + mov eax,DWORD [12+esi] + ; saved r[2] + ; ############### Calculate word 3 + xor ebp,ebp + ; sqr a[3]*a[0] + mul edx + add eax,eax + adc edx,edx + adc ebp,0 + add ebx,eax + adc ecx,edx + mov 
eax,DWORD [8+esi] + adc ebp,0 + mov edx,DWORD [4+esi] + ; sqr a[2]*a[1] + mul edx + add eax,eax + adc edx,edx + adc ebp,0 + add ebx,eax + adc ecx,edx + mov eax,DWORD [16+esi] + adc ebp,0 + mov DWORD [12+edi],ebx + mov edx,DWORD [esi] + ; saved r[3] + ; ############### Calculate word 4 + xor ebx,ebx + ; sqr a[4]*a[0] + mul edx + add eax,eax + adc edx,edx + adc ebx,0 + add ecx,eax + adc ebp,edx + mov eax,DWORD [12+esi] + adc ebx,0 + mov edx,DWORD [4+esi] + ; sqr a[3]*a[1] + mul edx + add eax,eax + adc edx,edx + adc ebx,0 + add ecx,eax + adc ebp,edx + mov eax,DWORD [8+esi] + adc ebx,0 + ; sqr a[2]*a[2] + mul eax + add ecx,eax + adc ebp,edx + mov edx,DWORD [esi] + adc ebx,0 + mov DWORD [16+edi],ecx + mov eax,DWORD [20+esi] + ; saved r[4] + ; ############### Calculate word 5 + xor ecx,ecx + ; sqr a[5]*a[0] + mul edx + add eax,eax + adc edx,edx + adc ecx,0 + add ebp,eax + adc ebx,edx + mov eax,DWORD [16+esi] + adc ecx,0 + mov edx,DWORD [4+esi] + ; sqr a[4]*a[1] + mul edx + add eax,eax + adc edx,edx + adc ecx,0 + add ebp,eax + adc ebx,edx + mov eax,DWORD [12+esi] + adc ecx,0 + mov edx,DWORD [8+esi] + ; sqr a[3]*a[2] + mul edx + add eax,eax + adc edx,edx + adc ecx,0 + add ebp,eax + adc ebx,edx + mov eax,DWORD [24+esi] + adc ecx,0 + mov DWORD [20+edi],ebp + mov edx,DWORD [esi] + ; saved r[5] + ; ############### Calculate word 6 + xor ebp,ebp + ; sqr a[6]*a[0] + mul edx + add eax,eax + adc edx,edx + adc ebp,0 + add ebx,eax + adc ecx,edx + mov eax,DWORD [20+esi] + adc ebp,0 + mov edx,DWORD [4+esi] + ; sqr a[5]*a[1] + mul edx + add eax,eax + adc edx,edx + adc ebp,0 + add ebx,eax + adc ecx,edx + mov eax,DWORD [16+esi] + adc ebp,0 + mov edx,DWORD [8+esi] + ; sqr a[4]*a[2] + mul edx + add eax,eax + adc edx,edx + adc ebp,0 + add ebx,eax + adc ecx,edx + mov eax,DWORD [12+esi] + adc ebp,0 + ; sqr a[3]*a[3] + mul eax + add ebx,eax + adc ecx,edx + mov edx,DWORD [esi] + adc ebp,0 + mov DWORD [24+edi],ebx + mov eax,DWORD [28+esi] + ; saved r[6] + ; ############### Calculate word 7 + xor 
ebx,ebx + ; sqr a[7]*a[0] + mul edx + add eax,eax + adc edx,edx + adc ebx,0 + add ecx,eax + adc ebp,edx + mov eax,DWORD [24+esi] + adc ebx,0 + mov edx,DWORD [4+esi] + ; sqr a[6]*a[1] + mul edx + add eax,eax + adc edx,edx + adc ebx,0 + add ecx,eax + adc ebp,edx + mov eax,DWORD [20+esi] + adc ebx,0 + mov edx,DWORD [8+esi] + ; sqr a[5]*a[2] + mul edx + add eax,eax + adc edx,edx + adc ebx,0 + add ecx,eax + adc ebp,edx + mov eax,DWORD [16+esi] + adc ebx,0 + mov edx,DWORD [12+esi] + ; sqr a[4]*a[3] + mul edx + add eax,eax + adc edx,edx + adc ebx,0 + add ecx,eax + adc ebp,edx + mov eax,DWORD [28+esi] + adc ebx,0 + mov DWORD [28+edi],ecx + mov edx,DWORD [4+esi] + ; saved r[7] + ; ############### Calculate word 8 + xor ecx,ecx + ; sqr a[7]*a[1] + mul edx + add eax,eax + adc edx,edx + adc ecx,0 + add ebp,eax + adc ebx,edx + mov eax,DWORD [24+esi] + adc ecx,0 + mov edx,DWORD [8+esi] + ; sqr a[6]*a[2] + mul edx + add eax,eax + adc edx,edx + adc ecx,0 + add ebp,eax + adc ebx,edx + mov eax,DWORD [20+esi] + adc ecx,0 + mov edx,DWORD [12+esi] + ; sqr a[5]*a[3] + mul edx + add eax,eax + adc edx,edx + adc ecx,0 + add ebp,eax + adc ebx,edx + mov eax,DWORD [16+esi] + adc ecx,0 + ; sqr a[4]*a[4] + mul eax + add ebp,eax + adc ebx,edx + mov edx,DWORD [8+esi] + adc ecx,0 + mov DWORD [32+edi],ebp + mov eax,DWORD [28+esi] + ; saved r[8] + ; ############### Calculate word 9 + xor ebp,ebp + ; sqr a[7]*a[2] + mul edx + add eax,eax + adc edx,edx + adc ebp,0 + add ebx,eax + adc ecx,edx + mov eax,DWORD [24+esi] + adc ebp,0 + mov edx,DWORD [12+esi] + ; sqr a[6]*a[3] + mul edx + add eax,eax + adc edx,edx + adc ebp,0 + add ebx,eax + adc ecx,edx + mov eax,DWORD [20+esi] + adc ebp,0 + mov edx,DWORD [16+esi] + ; sqr a[5]*a[4] + mul edx + add eax,eax + adc edx,edx + adc ebp,0 + add ebx,eax + adc ecx,edx + mov eax,DWORD [28+esi] + adc ebp,0 + mov DWORD [36+edi],ebx + mov edx,DWORD [12+esi] + ; saved r[9] + ; ############### Calculate word 10 + xor ebx,ebx + ; sqr a[7]*a[3] + mul edx + add eax,eax + adc 
edx,edx + adc ebx,0 + add ecx,eax + adc ebp,edx + mov eax,DWORD [24+esi] + adc ebx,0 + mov edx,DWORD [16+esi] + ; sqr a[6]*a[4] + mul edx + add eax,eax + adc edx,edx + adc ebx,0 + add ecx,eax + adc ebp,edx + mov eax,DWORD [20+esi] + adc ebx,0 + ; sqr a[5]*a[5] + mul eax + add ecx,eax + adc ebp,edx + mov edx,DWORD [16+esi] + adc ebx,0 + mov DWORD [40+edi],ecx + mov eax,DWORD [28+esi] + ; saved r[10] + ; ############### Calculate word 11 + xor ecx,ecx + ; sqr a[7]*a[4] + mul edx + add eax,eax + adc edx,edx + adc ecx,0 + add ebp,eax + adc ebx,edx + mov eax,DWORD [24+esi] + adc ecx,0 + mov edx,DWORD [20+esi] + ; sqr a[6]*a[5] + mul edx + add eax,eax + adc edx,edx + adc ecx,0 + add ebp,eax + adc ebx,edx + mov eax,DWORD [28+esi] + adc ecx,0 + mov DWORD [44+edi],ebp + mov edx,DWORD [20+esi] + ; saved r[11] + ; ############### Calculate word 12 + xor ebp,ebp + ; sqr a[7]*a[5] + mul edx + add eax,eax + adc edx,edx + adc ebp,0 + add ebx,eax + adc ecx,edx + mov eax,DWORD [24+esi] + adc ebp,0 + ; sqr a[6]*a[6] + mul eax + add ebx,eax + adc ecx,edx + mov edx,DWORD [24+esi] + adc ebp,0 + mov DWORD [48+edi],ebx + mov eax,DWORD [28+esi] + ; saved r[12] + ; ############### Calculate word 13 + xor ebx,ebx + ; sqr a[7]*a[6] + mul edx + add eax,eax + adc edx,edx + adc ebx,0 + add ecx,eax + adc ebp,edx + mov eax,DWORD [28+esi] + adc ebx,0 + mov DWORD [52+edi],ecx + ; saved r[13] + ; ############### Calculate word 14 + xor ecx,ecx + ; sqr a[7]*a[7] + mul eax + add ebp,eax + adc ebx,edx + adc ecx,0 + mov DWORD [56+edi],ebp + ; saved r[14] + mov DWORD [60+edi],ebx + pop ebx + pop ebp + pop edi + pop esi + ret +global _bn_sqr_comba4 +align 16 +_bn_sqr_comba4: +L$_bn_sqr_comba4_begin: + push esi + push edi + push ebp + push ebx + mov edi,DWORD [20+esp] + mov esi,DWORD [24+esp] + xor ebx,ebx + xor ecx,ecx + mov eax,DWORD [esi] + ; ############### Calculate word 0 + xor ebp,ebp + ; sqr a[0]*a[0] + mul eax + add ebx,eax + adc ecx,edx + mov edx,DWORD [esi] + adc ebp,0 + mov DWORD [edi],ebx + 
mov eax,DWORD [4+esi] + ; saved r[0] + ; ############### Calculate word 1 + xor ebx,ebx + ; sqr a[1]*a[0] + mul edx + add eax,eax + adc edx,edx + adc ebx,0 + add ecx,eax + adc ebp,edx + mov eax,DWORD [8+esi] + adc ebx,0 + mov DWORD [4+edi],ecx + mov edx,DWORD [esi] + ; saved r[1] + ; ############### Calculate word 2 + xor ecx,ecx + ; sqr a[2]*a[0] + mul edx + add eax,eax + adc edx,edx + adc ecx,0 + add ebp,eax + adc ebx,edx + mov eax,DWORD [4+esi] + adc ecx,0 + ; sqr a[1]*a[1] + mul eax + add ebp,eax + adc ebx,edx + mov edx,DWORD [esi] + adc ecx,0 + mov DWORD [8+edi],ebp + mov eax,DWORD [12+esi] + ; saved r[2] + ; ############### Calculate word 3 + xor ebp,ebp + ; sqr a[3]*a[0] + mul edx + add eax,eax + adc edx,edx + adc ebp,0 + add ebx,eax + adc ecx,edx + mov eax,DWORD [8+esi] + adc ebp,0 + mov edx,DWORD [4+esi] + ; sqr a[2]*a[1] + mul edx + add eax,eax + adc edx,edx + adc ebp,0 + add ebx,eax + adc ecx,edx + mov eax,DWORD [12+esi] + adc ebp,0 + mov DWORD [12+edi],ebx + mov edx,DWORD [4+esi] + ; saved r[3] + ; ############### Calculate word 4 + xor ebx,ebx + ; sqr a[3]*a[1] + mul edx + add eax,eax + adc edx,edx + adc ebx,0 + add ecx,eax + adc ebp,edx + mov eax,DWORD [8+esi] + adc ebx,0 + ; sqr a[2]*a[2] + mul eax + add ecx,eax + adc ebp,edx + mov edx,DWORD [8+esi] + adc ebx,0 + mov DWORD [16+edi],ecx + mov eax,DWORD [12+esi] + ; saved r[4] + ; ############### Calculate word 5 + xor ecx,ecx + ; sqr a[3]*a[2] + mul edx + add eax,eax + adc edx,edx + adc ecx,0 + add ebp,eax + adc ebx,edx + mov eax,DWORD [12+esi] + adc ecx,0 + mov DWORD [20+edi],ebp + ; saved r[5] + ; ############### Calculate word 6 + xor ebp,ebp + ; sqr a[3]*a[3] + mul eax + add ebx,eax + adc ecx,edx + adc ebp,0 + mov DWORD [24+edi],ebx + ; saved r[6] + mov DWORD [28+edi],ecx + pop ebx + pop ebp + pop edi + pop esi + ret +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/third_party/boringssl/gen/bcm/ghash-armv4-linux.S 
b/third_party/boringssl/gen/bcm/ghash-armv4-linux.S new file mode 100644 index 00000000..397340cf --- /dev/null +++ b/third_party/boringssl/gen/bcm/ghash-armv4-linux.S @@ -0,0 +1,242 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__) +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 PMULL +@ instructions are in aesv8-armx.pl.) +.arch armv7-a + +.text +#if defined(__thumb2__) || defined(__clang__) +.syntax unified +#define ldrplb ldrbpl +#define ldrneb ldrbne +#endif +#if defined(__thumb2__) +.thumb +#else +.code 32 +#endif +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.globl gcm_init_neon +.hidden gcm_init_neon +.type gcm_init_neon,%function +.align 4 +gcm_init_neon: + vld1.64 d7,[r1]! @ load H + vmov.i8 q8,#0xe1 + vld1.64 d6,[r1] + vshl.i64 d17,#57 + vshr.u64 d16,#63 @ t0=0xc2....01 + vdup.8 q9,d7[7] + vshr.u64 d26,d6,#63 + vshr.s8 q9,#7 @ broadcast carry bit + vshl.i64 q3,q3,#1 + vand q8,q8,q9 + vorr d7,d26 @ H<<<=1 + veor q3,q3,q8 @ twisted H + vstmia r0,{q3} + + bx lr @ bx lr +.size gcm_init_neon,.-gcm_init_neon + +.globl gcm_gmult_neon +.hidden gcm_gmult_neon +.type gcm_gmult_neon,%function +.align 4 +gcm_gmult_neon: + vld1.64 d7,[r0]! @ load Xi + vld1.64 d6,[r0]! + vmov.i64 d29,#0x0000ffffffffffff + vldmia r1,{d26,d27} @ load twisted H + vmov.i64 d30,#0x00000000ffffffff +#ifdef __ARMEL__ + vrev64.8 q3,q3 +#endif + vmov.i64 d31,#0x000000000000ffff + veor d28,d26,d27 @ Karatsuba pre-processing + mov r3,#16 + b .Lgmult_neon +.size gcm_gmult_neon,.-gcm_gmult_neon + +.globl gcm_ghash_neon +.hidden gcm_ghash_neon +.type gcm_ghash_neon,%function +.align 4 +gcm_ghash_neon: + vld1.64 d1,[r0]! @ load Xi + vld1.64 d0,[r0]! 
+ vmov.i64 d29,#0x0000ffffffffffff + vldmia r1,{d26,d27} @ load twisted H + vmov.i64 d30,#0x00000000ffffffff +#ifdef __ARMEL__ + vrev64.8 q0,q0 +#endif + vmov.i64 d31,#0x000000000000ffff + veor d28,d26,d27 @ Karatsuba pre-processing + +.Loop_neon: + vld1.64 d7,[r2]! @ load inp + vld1.64 d6,[r2]! +#ifdef __ARMEL__ + vrev64.8 q3,q3 +#endif + veor q3,q0 @ inp^=Xi +.Lgmult_neon: + vext.8 d16, d26, d26, #1 @ A1 + vmull.p8 q8, d16, d6 @ F = A1*B + vext.8 d0, d6, d6, #1 @ B1 + vmull.p8 q0, d26, d0 @ E = A*B1 + vext.8 d18, d26, d26, #2 @ A2 + vmull.p8 q9, d18, d6 @ H = A2*B + vext.8 d22, d6, d6, #2 @ B2 + vmull.p8 q11, d26, d22 @ G = A*B2 + vext.8 d20, d26, d26, #3 @ A3 + veor q8, q8, q0 @ L = E + F + vmull.p8 q10, d20, d6 @ J = A3*B + vext.8 d0, d6, d6, #3 @ B3 + veor q9, q9, q11 @ M = G + H + vmull.p8 q0, d26, d0 @ I = A*B3 + veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8 + vand d17, d17, d29 + vext.8 d22, d6, d6, #4 @ B4 + veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16 + vand d19, d19, d30 + vmull.p8 q11, d26, d22 @ K = A*B4 + veor q10, q10, q0 @ N = I + J + veor d16, d16, d17 + veor d18, d18, d19 + veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24 + vand d21, d21, d31 + vext.8 q8, q8, q8, #15 + veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32 + vmov.i64 d23, #0 + vext.8 q9, q9, q9, #14 + veor d20, d20, d21 + vmull.p8 q0, d26, d6 @ D = A*B + vext.8 q11, q11, q11, #12 + vext.8 q10, q10, q10, #13 + veor q8, q8, q9 + veor q10, q10, q11 + veor q0, q0, q8 + veor q0, q0, q10 + veor d6,d6,d7 @ Karatsuba pre-processing + vext.8 d16, d28, d28, #1 @ A1 + vmull.p8 q8, d16, d6 @ F = A1*B + vext.8 d2, d6, d6, #1 @ B1 + vmull.p8 q1, d28, d2 @ E = A*B1 + vext.8 d18, d28, d28, #2 @ A2 + vmull.p8 q9, d18, d6 @ H = A2*B + vext.8 d22, d6, d6, #2 @ B2 + vmull.p8 q11, d28, d22 @ G = A*B2 + vext.8 d20, d28, d28, #3 @ A3 + veor q8, q8, q1 @ L = E + F + vmull.p8 q10, d20, d6 @ J = A3*B + vext.8 d2, d6, d6, #3 @ B3 + veor q9, q9, q11 @ M = G + H + vmull.p8 q1, d28, d2 @ I = A*B3 + veor d16, d16, d17 @ t0 
= (L) (P0 + P1) << 8 + vand d17, d17, d29 + vext.8 d22, d6, d6, #4 @ B4 + veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16 + vand d19, d19, d30 + vmull.p8 q11, d28, d22 @ K = A*B4 + veor q10, q10, q1 @ N = I + J + veor d16, d16, d17 + veor d18, d18, d19 + veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24 + vand d21, d21, d31 + vext.8 q8, q8, q8, #15 + veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32 + vmov.i64 d23, #0 + vext.8 q9, q9, q9, #14 + veor d20, d20, d21 + vmull.p8 q1, d28, d6 @ D = A*B + vext.8 q11, q11, q11, #12 + vext.8 q10, q10, q10, #13 + veor q8, q8, q9 + veor q10, q10, q11 + veor q1, q1, q8 + veor q1, q1, q10 + vext.8 d16, d27, d27, #1 @ A1 + vmull.p8 q8, d16, d7 @ F = A1*B + vext.8 d4, d7, d7, #1 @ B1 + vmull.p8 q2, d27, d4 @ E = A*B1 + vext.8 d18, d27, d27, #2 @ A2 + vmull.p8 q9, d18, d7 @ H = A2*B + vext.8 d22, d7, d7, #2 @ B2 + vmull.p8 q11, d27, d22 @ G = A*B2 + vext.8 d20, d27, d27, #3 @ A3 + veor q8, q8, q2 @ L = E + F + vmull.p8 q10, d20, d7 @ J = A3*B + vext.8 d4, d7, d7, #3 @ B3 + veor q9, q9, q11 @ M = G + H + vmull.p8 q2, d27, d4 @ I = A*B3 + veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8 + vand d17, d17, d29 + vext.8 d22, d7, d7, #4 @ B4 + veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16 + vand d19, d19, d30 + vmull.p8 q11, d27, d22 @ K = A*B4 + veor q10, q10, q2 @ N = I + J + veor d16, d16, d17 + veor d18, d18, d19 + veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24 + vand d21, d21, d31 + vext.8 q8, q8, q8, #15 + veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32 + vmov.i64 d23, #0 + vext.8 q9, q9, q9, #14 + veor d20, d20, d21 + vmull.p8 q2, d27, d7 @ D = A*B + vext.8 q11, q11, q11, #12 + vext.8 q10, q10, q10, #13 + veor q8, q8, q9 + veor q10, q10, q11 + veor q2, q2, q8 + veor q2, q2, q10 + veor q1,q1,q0 @ Karatsuba post-processing + veor q1,q1,q2 + veor d1,d1,d2 + veor d4,d4,d3 @ Xh|Xl - 256-bit result + + @ equivalent of reduction_avx from ghash-x86_64.pl + vshl.i64 q9,q0,#57 @ 1st phase + vshl.i64 q10,q0,#62 + veor q10,q10,q9 @ + vshl.i64 q9,q0,#63 + veor 
q10, q10, q9 @ + veor d1,d1,d20 @ + veor d4,d4,d21 + + vshr.u64 q10,q0,#1 @ 2nd phase + veor q2,q2,q0 + veor q0,q0,q10 @ + vshr.u64 q10,q10,#6 + vshr.u64 q0,q0,#1 @ + veor q0,q0,q2 @ + veor q0,q0,q10 @ + + subs r3,#16 + bne .Loop_neon + +#ifdef __ARMEL__ + vrev64.8 q0,q0 +#endif + sub r0,#16 + vst1.64 d1,[r0]! @ write out Xi + vst1.64 d0,[r0] + + bx lr @ bx lr +.size gcm_ghash_neon,.-gcm_ghash_neon +#endif +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) diff --git a/third_party/boringssl/gen/bcm/ghash-neon-armv8-apple.S b/third_party/boringssl/gen/bcm/ghash-neon-armv8-apple.S new file mode 100644 index 00000000..15b822c0 --- /dev/null +++ b/third_party/boringssl/gen/bcm/ghash-neon-armv8-apple.S @@ -0,0 +1,333 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) +.text + +.globl _gcm_init_neon +.private_extern _gcm_init_neon + +.align 4 +_gcm_init_neon: + AARCH64_VALID_CALL_TARGET + // This function is adapted from gcm_init_v8. xC2 is t3. 
+ ld1 {v17.2d}, [x1] // load H + movi v19.16b, #0xe1 + shl v19.2d, v19.2d, #57 // 0xc2.0 + ext v3.16b, v17.16b, v17.16b, #8 + ushr v18.2d, v19.2d, #63 + dup v17.4s, v17.s[1] + ext v16.16b, v18.16b, v19.16b, #8 // t0=0xc2....01 + ushr v18.2d, v3.2d, #63 + sshr v17.4s, v17.4s, #31 // broadcast carry bit + and v18.16b, v18.16b, v16.16b + shl v3.2d, v3.2d, #1 + ext v18.16b, v18.16b, v18.16b, #8 + and v16.16b, v16.16b, v17.16b + orr v3.16b, v3.16b, v18.16b // H<<<=1 + eor v5.16b, v3.16b, v16.16b // twisted H + st1 {v5.2d}, [x0] // store Htable[0] + ret + + +.globl _gcm_gmult_neon +.private_extern _gcm_gmult_neon + +.align 4 +_gcm_gmult_neon: + AARCH64_VALID_CALL_TARGET + ld1 {v3.16b}, [x0] // load Xi + ld1 {v5.1d}, [x1], #8 // load twisted H + ld1 {v6.1d}, [x1] + adrp x9, Lmasks@PAGE // load constants + add x9, x9, Lmasks@PAGEOFF + ld1 {v24.2d, v25.2d}, [x9] + rev64 v3.16b, v3.16b // byteswap Xi + ext v3.16b, v3.16b, v3.16b, #8 + eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing + + mov x3, #16 + b Lgmult_neon + + +.globl _gcm_ghash_neon +.private_extern _gcm_ghash_neon + +.align 4 +_gcm_ghash_neon: + AARCH64_VALID_CALL_TARGET + ld1 {v0.16b}, [x0] // load Xi + ld1 {v5.1d}, [x1], #8 // load twisted H + ld1 {v6.1d}, [x1] + adrp x9, Lmasks@PAGE // load constants + add x9, x9, Lmasks@PAGEOFF + ld1 {v24.2d, v25.2d}, [x9] + rev64 v0.16b, v0.16b // byteswap Xi + ext v0.16b, v0.16b, v0.16b, #8 + eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing + +Loop_neon: + ld1 {v3.16b}, [x2], #16 // load inp + rev64 v3.16b, v3.16b // byteswap inp + ext v3.16b, v3.16b, v3.16b, #8 + eor v3.16b, v3.16b, v0.16b // inp ^= Xi + +Lgmult_neon: + // Split the input into v3 and v4. (The upper halves are unused, + // so it is okay to leave them alone.) 
+ ins v4.d[0], v3.d[1] + ext v16.8b, v5.8b, v5.8b, #1 // A1 + pmull v16.8h, v16.8b, v3.8b // F = A1*B + ext v0.8b, v3.8b, v3.8b, #1 // B1 + pmull v0.8h, v5.8b, v0.8b // E = A*B1 + ext v17.8b, v5.8b, v5.8b, #2 // A2 + pmull v17.8h, v17.8b, v3.8b // H = A2*B + ext v19.8b, v3.8b, v3.8b, #2 // B2 + pmull v19.8h, v5.8b, v19.8b // G = A*B2 + ext v18.8b, v5.8b, v5.8b, #3 // A3 + eor v16.16b, v16.16b, v0.16b // L = E + F + pmull v18.8h, v18.8b, v3.8b // J = A3*B + ext v0.8b, v3.8b, v3.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v0.8h, v5.8b, v0.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v3.8b, v3.8b, #4 // B4 + eor v18.16b, v18.16b, v0.16b // N = I + J + pmull v19.8h, v5.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. 
+ zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v0.8h, v5.8b, v3.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v0.16b, v0.16b, v16.16b + eor v0.16b, v0.16b, v18.16b + eor v3.8b, v3.8b, v4.8b // Karatsuba pre-processing + ext v16.8b, v7.8b, v7.8b, #1 // A1 + pmull v16.8h, v16.8b, v3.8b // F = A1*B + ext v1.8b, v3.8b, v3.8b, #1 // B1 + pmull v1.8h, v7.8b, v1.8b // E = A*B1 + ext v17.8b, v7.8b, v7.8b, #2 // A2 + pmull v17.8h, v17.8b, v3.8b // H = A2*B + ext v19.8b, v3.8b, v3.8b, #2 // B2 + pmull v19.8h, v7.8b, v19.8b // G = A*B2 + ext v18.8b, v7.8b, v7.8b, #3 // A3 + eor v16.16b, v16.16b, v1.16b // L = E + F + pmull v18.8h, v18.8b, v3.8b // J = A3*B + ext v1.8b, v3.8b, v3.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v1.8h, v7.8b, v1.8b // I = A*B3 + + // Here we diverge from the 32-bit version. 
It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v3.8b, v3.8b, #4 // B4 + eor v18.16b, v18.16b, v1.16b // N = I + J + pmull v19.8h, v7.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. + zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v1.8h, v7.8b, v3.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v1.16b, v1.16b, v16.16b + eor v1.16b, v1.16b, v18.16b + ext v16.8b, v6.8b, v6.8b, #1 // A1 + pmull v16.8h, v16.8b, v4.8b // F = A1*B + ext v2.8b, v4.8b, v4.8b, #1 // B1 + pmull v2.8h, v6.8b, v2.8b // E = A*B1 + ext v17.8b, v6.8b, v6.8b, #2 // A2 + pmull 
v17.8h, v17.8b, v4.8b // H = A2*B + ext v19.8b, v4.8b, v4.8b, #2 // B2 + pmull v19.8h, v6.8b, v19.8b // G = A*B2 + ext v18.8b, v6.8b, v6.8b, #3 // A3 + eor v16.16b, v16.16b, v2.16b // L = E + F + pmull v18.8h, v18.8b, v4.8b // J = A3*B + ext v2.8b, v4.8b, v4.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v2.8h, v6.8b, v2.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v4.8b, v4.8b, #4 // B4 + eor v18.16b, v18.16b, v2.16b // N = I + J + pmull v19.8h, v6.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. 
+ zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v2.8h, v6.8b, v4.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v2.16b, v2.16b, v16.16b + eor v2.16b, v2.16b, v18.16b + ext v16.16b, v0.16b, v2.16b, #8 + eor v1.16b, v1.16b, v0.16b // Karatsuba post-processing + eor v1.16b, v1.16b, v2.16b + eor v1.16b, v1.16b, v16.16b // Xm overlaps Xh.lo and Xl.hi + ins v0.d[1], v1.d[0] // Xh|Xl - 256-bit result + // This is a no-op due to the ins instruction below. + // ins v2.d[0], v1.d[1] + + // equivalent of reduction_avx from ghash-x86_64.pl + shl v17.2d, v0.2d, #57 // 1st phase + shl v18.2d, v0.2d, #62 + eor v18.16b, v18.16b, v17.16b // + shl v17.2d, v0.2d, #63 + eor v18.16b, v18.16b, v17.16b // + // Note Xm contains {Xl.d[1], Xh.d[0]}. 
+ eor v18.16b, v18.16b, v1.16b + ins v0.d[1], v18.d[0] // Xl.d[1] ^= t2.d[0] + ins v2.d[0], v18.d[1] // Xh.d[0] ^= t2.d[1] + + ushr v18.2d, v0.2d, #1 // 2nd phase + eor v2.16b, v2.16b,v0.16b + eor v0.16b, v0.16b,v18.16b // + ushr v18.2d, v18.2d, #6 + ushr v0.2d, v0.2d, #1 // + eor v0.16b, v0.16b, v2.16b // + eor v0.16b, v0.16b, v18.16b // + + subs x3, x3, #16 + bne Loop_neon + + rev64 v0.16b, v0.16b // byteswap Xi and write + ext v0.16b, v0.16b, v0.16b, #8 + st1 {v0.16b}, [x0] + + ret + + +.section __TEXT,__const +.align 4 +Lmasks: +.quad 0x0000ffffffffffff // k48 +.quad 0x00000000ffffffff // k32 +.quad 0x000000000000ffff // k16 +.quad 0x0000000000000000 // k0 +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) diff --git a/third_party/boringssl/gen/bcm/ghash-neon-armv8-linux.S b/third_party/boringssl/gen/bcm/ghash-neon-armv8-linux.S new file mode 100644 index 00000000..ee57d512 --- /dev/null +++ b/third_party/boringssl/gen/bcm/ghash-neon-armv8-linux.S @@ -0,0 +1,333 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) +.text + +.globl gcm_init_neon +.hidden gcm_init_neon +.type gcm_init_neon,%function +.align 4 +gcm_init_neon: + AARCH64_VALID_CALL_TARGET + // This function is adapted from gcm_init_v8. xC2 is t3. 
+ ld1 {v17.2d}, [x1] // load H + movi v19.16b, #0xe1 + shl v19.2d, v19.2d, #57 // 0xc2.0 + ext v3.16b, v17.16b, v17.16b, #8 + ushr v18.2d, v19.2d, #63 + dup v17.4s, v17.s[1] + ext v16.16b, v18.16b, v19.16b, #8 // t0=0xc2....01 + ushr v18.2d, v3.2d, #63 + sshr v17.4s, v17.4s, #31 // broadcast carry bit + and v18.16b, v18.16b, v16.16b + shl v3.2d, v3.2d, #1 + ext v18.16b, v18.16b, v18.16b, #8 + and v16.16b, v16.16b, v17.16b + orr v3.16b, v3.16b, v18.16b // H<<<=1 + eor v5.16b, v3.16b, v16.16b // twisted H + st1 {v5.2d}, [x0] // store Htable[0] + ret +.size gcm_init_neon,.-gcm_init_neon + +.globl gcm_gmult_neon +.hidden gcm_gmult_neon +.type gcm_gmult_neon,%function +.align 4 +gcm_gmult_neon: + AARCH64_VALID_CALL_TARGET + ld1 {v3.16b}, [x0] // load Xi + ld1 {v5.1d}, [x1], #8 // load twisted H + ld1 {v6.1d}, [x1] + adrp x9, .Lmasks // load constants + add x9, x9, :lo12:.Lmasks + ld1 {v24.2d, v25.2d}, [x9] + rev64 v3.16b, v3.16b // byteswap Xi + ext v3.16b, v3.16b, v3.16b, #8 + eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing + + mov x3, #16 + b .Lgmult_neon +.size gcm_gmult_neon,.-gcm_gmult_neon + +.globl gcm_ghash_neon +.hidden gcm_ghash_neon +.type gcm_ghash_neon,%function +.align 4 +gcm_ghash_neon: + AARCH64_VALID_CALL_TARGET + ld1 {v0.16b}, [x0] // load Xi + ld1 {v5.1d}, [x1], #8 // load twisted H + ld1 {v6.1d}, [x1] + adrp x9, .Lmasks // load constants + add x9, x9, :lo12:.Lmasks + ld1 {v24.2d, v25.2d}, [x9] + rev64 v0.16b, v0.16b // byteswap Xi + ext v0.16b, v0.16b, v0.16b, #8 + eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing + +.Loop_neon: + ld1 {v3.16b}, [x2], #16 // load inp + rev64 v3.16b, v3.16b // byteswap inp + ext v3.16b, v3.16b, v3.16b, #8 + eor v3.16b, v3.16b, v0.16b // inp ^= Xi + +.Lgmult_neon: + // Split the input into v3 and v4. (The upper halves are unused, + // so it is okay to leave them alone.) 
+ ins v4.d[0], v3.d[1] + ext v16.8b, v5.8b, v5.8b, #1 // A1 + pmull v16.8h, v16.8b, v3.8b // F = A1*B + ext v0.8b, v3.8b, v3.8b, #1 // B1 + pmull v0.8h, v5.8b, v0.8b // E = A*B1 + ext v17.8b, v5.8b, v5.8b, #2 // A2 + pmull v17.8h, v17.8b, v3.8b // H = A2*B + ext v19.8b, v3.8b, v3.8b, #2 // B2 + pmull v19.8h, v5.8b, v19.8b // G = A*B2 + ext v18.8b, v5.8b, v5.8b, #3 // A3 + eor v16.16b, v16.16b, v0.16b // L = E + F + pmull v18.8h, v18.8b, v3.8b // J = A3*B + ext v0.8b, v3.8b, v3.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v0.8h, v5.8b, v0.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v3.8b, v3.8b, #4 // B4 + eor v18.16b, v18.16b, v0.16b // N = I + J + pmull v19.8h, v5.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. 
+ zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v0.8h, v5.8b, v3.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v0.16b, v0.16b, v16.16b + eor v0.16b, v0.16b, v18.16b + eor v3.8b, v3.8b, v4.8b // Karatsuba pre-processing + ext v16.8b, v7.8b, v7.8b, #1 // A1 + pmull v16.8h, v16.8b, v3.8b // F = A1*B + ext v1.8b, v3.8b, v3.8b, #1 // B1 + pmull v1.8h, v7.8b, v1.8b // E = A*B1 + ext v17.8b, v7.8b, v7.8b, #2 // A2 + pmull v17.8h, v17.8b, v3.8b // H = A2*B + ext v19.8b, v3.8b, v3.8b, #2 // B2 + pmull v19.8h, v7.8b, v19.8b // G = A*B2 + ext v18.8b, v7.8b, v7.8b, #3 // A3 + eor v16.16b, v16.16b, v1.16b // L = E + F + pmull v18.8h, v18.8b, v3.8b // J = A3*B + ext v1.8b, v3.8b, v3.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v1.8h, v7.8b, v1.8b // I = A*B3 + + // Here we diverge from the 32-bit version. 
It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v3.8b, v3.8b, #4 // B4 + eor v18.16b, v18.16b, v1.16b // N = I + J + pmull v19.8h, v7.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. + zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v1.8h, v7.8b, v3.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v1.16b, v1.16b, v16.16b + eor v1.16b, v1.16b, v18.16b + ext v16.8b, v6.8b, v6.8b, #1 // A1 + pmull v16.8h, v16.8b, v4.8b // F = A1*B + ext v2.8b, v4.8b, v4.8b, #1 // B1 + pmull v2.8h, v6.8b, v2.8b // E = A*B1 + ext v17.8b, v6.8b, v6.8b, #2 // A2 + pmull 
v17.8h, v17.8b, v4.8b // H = A2*B + ext v19.8b, v4.8b, v4.8b, #2 // B2 + pmull v19.8h, v6.8b, v19.8b // G = A*B2 + ext v18.8b, v6.8b, v6.8b, #3 // A3 + eor v16.16b, v16.16b, v2.16b // L = E + F + pmull v18.8h, v18.8b, v4.8b // J = A3*B + ext v2.8b, v4.8b, v4.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v2.8h, v6.8b, v2.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v4.8b, v4.8b, #4 // B4 + eor v18.16b, v18.16b, v2.16b // N = I + J + pmull v19.8h, v6.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. 
+ zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v2.8h, v6.8b, v4.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v2.16b, v2.16b, v16.16b + eor v2.16b, v2.16b, v18.16b + ext v16.16b, v0.16b, v2.16b, #8 + eor v1.16b, v1.16b, v0.16b // Karatsuba post-processing + eor v1.16b, v1.16b, v2.16b + eor v1.16b, v1.16b, v16.16b // Xm overlaps Xh.lo and Xl.hi + ins v0.d[1], v1.d[0] // Xh|Xl - 256-bit result + // This is a no-op due to the ins instruction below. + // ins v2.d[0], v1.d[1] + + // equivalent of reduction_avx from ghash-x86_64.pl + shl v17.2d, v0.2d, #57 // 1st phase + shl v18.2d, v0.2d, #62 + eor v18.16b, v18.16b, v17.16b // + shl v17.2d, v0.2d, #63 + eor v18.16b, v18.16b, v17.16b // + // Note Xm contains {Xl.d[1], Xh.d[0]}. 
+ eor v18.16b, v18.16b, v1.16b + ins v0.d[1], v18.d[0] // Xl.d[1] ^= t2.d[0] + ins v2.d[0], v18.d[1] // Xh.d[0] ^= t2.d[1] + + ushr v18.2d, v0.2d, #1 // 2nd phase + eor v2.16b, v2.16b,v0.16b + eor v0.16b, v0.16b,v18.16b // + ushr v18.2d, v18.2d, #6 + ushr v0.2d, v0.2d, #1 // + eor v0.16b, v0.16b, v2.16b // + eor v0.16b, v0.16b, v18.16b // + + subs x3, x3, #16 + bne .Loop_neon + + rev64 v0.16b, v0.16b // byteswap Xi and write + ext v0.16b, v0.16b, v0.16b, #8 + st1 {v0.16b}, [x0] + + ret +.size gcm_ghash_neon,.-gcm_ghash_neon + +.section .rodata +.align 4 +.Lmasks: +.quad 0x0000ffffffffffff // k48 +.quad 0x00000000ffffffff // k32 +.quad 0x000000000000ffff // k16 +.quad 0x0000000000000000 // k0 +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) diff --git a/third_party/boringssl/gen/bcm/ghash-neon-armv8-win.S b/third_party/boringssl/gen/bcm/ghash-neon-armv8-win.S new file mode 100644 index 00000000..91814da0 --- /dev/null +++ b/third_party/boringssl/gen/bcm/ghash-neon-armv8-win.S @@ -0,0 +1,339 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +.text + +.globl gcm_init_neon + +.def gcm_init_neon + .type 32 +.endef +.align 4 +gcm_init_neon: + AARCH64_VALID_CALL_TARGET + // This function is adapted from gcm_init_v8. xC2 is t3. 
+ ld1 {v17.2d}, [x1] // load H + movi v19.16b, #0xe1 + shl v19.2d, v19.2d, #57 // 0xc2.0 + ext v3.16b, v17.16b, v17.16b, #8 + ushr v18.2d, v19.2d, #63 + dup v17.4s, v17.s[1] + ext v16.16b, v18.16b, v19.16b, #8 // t0=0xc2....01 + ushr v18.2d, v3.2d, #63 + sshr v17.4s, v17.4s, #31 // broadcast carry bit + and v18.16b, v18.16b, v16.16b + shl v3.2d, v3.2d, #1 + ext v18.16b, v18.16b, v18.16b, #8 + and v16.16b, v16.16b, v17.16b + orr v3.16b, v3.16b, v18.16b // H<<<=1 + eor v5.16b, v3.16b, v16.16b // twisted H + st1 {v5.2d}, [x0] // store Htable[0] + ret + + +.globl gcm_gmult_neon + +.def gcm_gmult_neon + .type 32 +.endef +.align 4 +gcm_gmult_neon: + AARCH64_VALID_CALL_TARGET + ld1 {v3.16b}, [x0] // load Xi + ld1 {v5.1d}, [x1], #8 // load twisted H + ld1 {v6.1d}, [x1] + adrp x9, Lmasks // load constants + add x9, x9, :lo12:Lmasks + ld1 {v24.2d, v25.2d}, [x9] + rev64 v3.16b, v3.16b // byteswap Xi + ext v3.16b, v3.16b, v3.16b, #8 + eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing + + mov x3, #16 + b Lgmult_neon + + +.globl gcm_ghash_neon + +.def gcm_ghash_neon + .type 32 +.endef +.align 4 +gcm_ghash_neon: + AARCH64_VALID_CALL_TARGET + ld1 {v0.16b}, [x0] // load Xi + ld1 {v5.1d}, [x1], #8 // load twisted H + ld1 {v6.1d}, [x1] + adrp x9, Lmasks // load constants + add x9, x9, :lo12:Lmasks + ld1 {v24.2d, v25.2d}, [x9] + rev64 v0.16b, v0.16b // byteswap Xi + ext v0.16b, v0.16b, v0.16b, #8 + eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing + +Loop_neon: + ld1 {v3.16b}, [x2], #16 // load inp + rev64 v3.16b, v3.16b // byteswap inp + ext v3.16b, v3.16b, v3.16b, #8 + eor v3.16b, v3.16b, v0.16b // inp ^= Xi + +Lgmult_neon: + // Split the input into v3 and v4. (The upper halves are unused, + // so it is okay to leave them alone.) 
+ ins v4.d[0], v3.d[1] + ext v16.8b, v5.8b, v5.8b, #1 // A1 + pmull v16.8h, v16.8b, v3.8b // F = A1*B + ext v0.8b, v3.8b, v3.8b, #1 // B1 + pmull v0.8h, v5.8b, v0.8b // E = A*B1 + ext v17.8b, v5.8b, v5.8b, #2 // A2 + pmull v17.8h, v17.8b, v3.8b // H = A2*B + ext v19.8b, v3.8b, v3.8b, #2 // B2 + pmull v19.8h, v5.8b, v19.8b // G = A*B2 + ext v18.8b, v5.8b, v5.8b, #3 // A3 + eor v16.16b, v16.16b, v0.16b // L = E + F + pmull v18.8h, v18.8b, v3.8b // J = A3*B + ext v0.8b, v3.8b, v3.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v0.8h, v5.8b, v0.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v3.8b, v3.8b, #4 // B4 + eor v18.16b, v18.16b, v0.16b // N = I + J + pmull v19.8h, v5.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. 
+ zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v0.8h, v5.8b, v3.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v0.16b, v0.16b, v16.16b + eor v0.16b, v0.16b, v18.16b + eor v3.8b, v3.8b, v4.8b // Karatsuba pre-processing + ext v16.8b, v7.8b, v7.8b, #1 // A1 + pmull v16.8h, v16.8b, v3.8b // F = A1*B + ext v1.8b, v3.8b, v3.8b, #1 // B1 + pmull v1.8h, v7.8b, v1.8b // E = A*B1 + ext v17.8b, v7.8b, v7.8b, #2 // A2 + pmull v17.8h, v17.8b, v3.8b // H = A2*B + ext v19.8b, v3.8b, v3.8b, #2 // B2 + pmull v19.8h, v7.8b, v19.8b // G = A*B2 + ext v18.8b, v7.8b, v7.8b, #3 // A3 + eor v16.16b, v16.16b, v1.16b // L = E + F + pmull v18.8h, v18.8b, v3.8b // J = A3*B + ext v1.8b, v3.8b, v3.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v1.8h, v7.8b, v1.8b // I = A*B3 + + // Here we diverge from the 32-bit version. 
It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v3.8b, v3.8b, #4 // B4 + eor v18.16b, v18.16b, v1.16b // N = I + J + pmull v19.8h, v7.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. + zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v1.8h, v7.8b, v3.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v1.16b, v1.16b, v16.16b + eor v1.16b, v1.16b, v18.16b + ext v16.8b, v6.8b, v6.8b, #1 // A1 + pmull v16.8h, v16.8b, v4.8b // F = A1*B + ext v2.8b, v4.8b, v4.8b, #1 // B1 + pmull v2.8h, v6.8b, v2.8b // E = A*B1 + ext v17.8b, v6.8b, v6.8b, #2 // A2 + pmull 
v17.8h, v17.8b, v4.8b // H = A2*B + ext v19.8b, v4.8b, v4.8b, #2 // B2 + pmull v19.8h, v6.8b, v19.8b // G = A*B2 + ext v18.8b, v6.8b, v6.8b, #3 // A3 + eor v16.16b, v16.16b, v2.16b // L = E + F + pmull v18.8h, v18.8b, v4.8b // J = A3*B + ext v2.8b, v4.8b, v4.8b, #3 // B3 + eor v17.16b, v17.16b, v19.16b // M = G + H + pmull v2.8h, v6.8b, v2.8b // I = A*B3 + + // Here we diverge from the 32-bit version. It computes the following + // (instructions reordered for clarity): + // + // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) + // vand $t0#hi, $t0#hi, $k48 + // veor $t0#lo, $t0#lo, $t0#hi + // + // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) + // vand $t1#hi, $t1#hi, $k32 + // veor $t1#lo, $t1#lo, $t1#hi + // + // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) + // vand $t2#hi, $t2#hi, $k16 + // veor $t2#lo, $t2#lo, $t2#hi + // + // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) + // vmov.i64 $t3#hi, #0 + // + // $kN is a mask with the bottom N bits set. AArch64 cannot compute on + // upper halves of SIMD registers, so we must split each half into + // separate registers. To compensate, we pair computations up and + // parallelize. + + ext v19.8b, v4.8b, v4.8b, #4 // B4 + eor v18.16b, v18.16b, v2.16b // N = I + J + pmull v19.8h, v6.8b, v19.8b // K = A*B4 + + // This can probably be scheduled more efficiently. For now, we just + // pair up independent instructions. 
+ zip1 v20.2d, v16.2d, v17.2d + zip1 v22.2d, v18.2d, v19.2d + zip2 v21.2d, v16.2d, v17.2d + zip2 v23.2d, v18.2d, v19.2d + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + and v21.16b, v21.16b, v24.16b + and v23.16b, v23.16b, v25.16b + eor v20.16b, v20.16b, v21.16b + eor v22.16b, v22.16b, v23.16b + zip1 v16.2d, v20.2d, v21.2d + zip1 v18.2d, v22.2d, v23.2d + zip2 v17.2d, v20.2d, v21.2d + zip2 v19.2d, v22.2d, v23.2d + + ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 + ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 + pmull v2.8h, v6.8b, v4.8b // D = A*B + ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 + ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 + eor v16.16b, v16.16b, v17.16b + eor v18.16b, v18.16b, v19.16b + eor v2.16b, v2.16b, v16.16b + eor v2.16b, v2.16b, v18.16b + ext v16.16b, v0.16b, v2.16b, #8 + eor v1.16b, v1.16b, v0.16b // Karatsuba post-processing + eor v1.16b, v1.16b, v2.16b + eor v1.16b, v1.16b, v16.16b // Xm overlaps Xh.lo and Xl.hi + ins v0.d[1], v1.d[0] // Xh|Xl - 256-bit result + // This is a no-op due to the ins instruction below. + // ins v2.d[0], v1.d[1] + + // equivalent of reduction_avx from ghash-x86_64.pl + shl v17.2d, v0.2d, #57 // 1st phase + shl v18.2d, v0.2d, #62 + eor v18.16b, v18.16b, v17.16b // + shl v17.2d, v0.2d, #63 + eor v18.16b, v18.16b, v17.16b // + // Note Xm contains {Xl.d[1], Xh.d[0]}. 
+ eor v18.16b, v18.16b, v1.16b + ins v0.d[1], v18.d[0] // Xl.d[1] ^= t2.d[0] + ins v2.d[0], v18.d[1] // Xh.d[0] ^= t2.d[1] + + ushr v18.2d, v0.2d, #1 // 2nd phase + eor v2.16b, v2.16b,v0.16b + eor v0.16b, v0.16b,v18.16b // + ushr v18.2d, v18.2d, #6 + ushr v0.2d, v0.2d, #1 // + eor v0.16b, v0.16b, v2.16b // + eor v0.16b, v0.16b, v18.16b // + + subs x3, x3, #16 + bne Loop_neon + + rev64 v0.16b, v0.16b // byteswap Xi and write + ext v0.16b, v0.16b, v0.16b, #8 + st1 {v0.16b}, [x0] + + ret + + +.section .rodata +.align 4 +Lmasks: +.quad 0x0000ffffffffffff // k48 +.quad 0x00000000ffffffff // k32 +.quad 0x000000000000ffff // k16 +.quad 0x0000000000000000 // k0 +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) diff --git a/third_party/boringssl/gen/bcm/ghash-ssse3-x86-apple.S b/third_party/boringssl/gen/bcm/ghash-ssse3-x86-apple.S new file mode 100644 index 00000000..b58a4fa8 --- /dev/null +++ b/third_party/boringssl/gen/bcm/ghash-ssse3-x86-apple.S @@ -0,0 +1,288 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +.text +.globl _gcm_gmult_ssse3 +.private_extern _gcm_gmult_ssse3 +.align 4 +_gcm_gmult_ssse3: +L_gcm_gmult_ssse3_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%edi + movl 24(%esp),%esi + movdqu (%edi),%xmm0 + call L000pic_point +L000pic_point: + popl %eax + movdqa Lreverse_bytes-L000pic_point(%eax),%xmm7 + movdqa Llow4_mask-L000pic_point(%eax),%xmm2 + pshufb %xmm7,%xmm0 + movdqa %xmm2,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm2,%xmm0 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + movl $5,%eax +L001loop_row_1: + movdqu (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 + palignr $1,%xmm3,%xmm6 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 + pshufb %xmm0,%xmm4 + pshufb %xmm1,%xmm5 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz L001loop_row_1 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movl $5,%eax +L002loop_row_2: + movdqu (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 + palignr $1,%xmm3,%xmm6 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 + pshufb %xmm0,%xmm4 + pshufb %xmm1,%xmm5 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz L002loop_row_2 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movl $6,%eax +L003loop_row_3: + movdqu (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 + palignr $1,%xmm3,%xmm6 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 + pshufb %xmm0,%xmm4 + 
pshufb %xmm1,%xmm5 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz L003loop_row_3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + pshufb %xmm7,%xmm2 + movdqu %xmm2,(%edi) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _gcm_ghash_ssse3 +.private_extern _gcm_ghash_ssse3 +.align 4 +_gcm_ghash_ssse3: +L_gcm_ghash_ssse3_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%edi + movl 24(%esp),%esi + movl 28(%esp),%edx + movl 32(%esp),%ecx + movdqu (%edi),%xmm0 + call L004pic_point +L004pic_point: + popl %ebx + movdqa Lreverse_bytes-L004pic_point(%ebx),%xmm7 + andl $-16,%ecx + pshufb %xmm7,%xmm0 + pxor %xmm3,%xmm3 +L005loop_ghash: + movdqa Llow4_mask-L004pic_point(%ebx),%xmm2 + movdqu (%edx),%xmm1 + pshufb %xmm7,%xmm1 + pxor %xmm1,%xmm0 + movdqa %xmm2,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm2,%xmm0 + pxor %xmm2,%xmm2 + movl $5,%eax +L006loop_row_4: + movdqu (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 + palignr $1,%xmm3,%xmm6 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 + pshufb %xmm0,%xmm4 + pshufb %xmm1,%xmm5 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz L006loop_row_4 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movl $5,%eax +L007loop_row_5: + movdqu (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 + palignr $1,%xmm3,%xmm6 + movdqa 
%xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 + pshufb %xmm0,%xmm4 + pshufb %xmm1,%xmm5 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz L007loop_row_5 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movl $6,%eax +L008loop_row_6: + movdqu (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 + palignr $1,%xmm3,%xmm6 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 + pshufb %xmm0,%xmm4 + pshufb %xmm1,%xmm5 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz L008loop_row_6 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movdqa %xmm2,%xmm0 + leal -256(%esi),%esi + leal 16(%edx),%edx + subl $16,%ecx + jnz L005loop_ghash + pshufb %xmm7,%xmm0 + movdqu %xmm0,(%edi) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 4,0x90 +Lreverse_bytes: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.align 4,0x90 +Llow4_mask: +.long 252645135,252645135,252645135,252645135 +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) diff --git a/third_party/boringssl/gen/bcm/ghash-ssse3-x86-linux.S b/third_party/boringssl/gen/bcm/ghash-ssse3-x86-linux.S new file mode 100644 index 00000000..7e8d7bb7 --- /dev/null +++ b/third_party/boringssl/gen/bcm/ghash-ssse3-x86-linux.S @@ -0,0 +1,292 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. 
Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) +.text +.globl gcm_gmult_ssse3 +.hidden gcm_gmult_ssse3 +.type gcm_gmult_ssse3,@function +.align 16 +gcm_gmult_ssse3: +.L_gcm_gmult_ssse3_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%edi + movl 24(%esp),%esi + movdqu (%edi),%xmm0 + call .L000pic_point +.L000pic_point: + popl %eax + movdqa .Lreverse_bytes-.L000pic_point(%eax),%xmm7 + movdqa .Llow4_mask-.L000pic_point(%eax),%xmm2 + pshufb %xmm7,%xmm0 + movdqa %xmm2,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm2,%xmm0 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + movl $5,%eax +.L001loop_row_1: + movdqu (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 + palignr $1,%xmm3,%xmm6 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 + pshufb %xmm0,%xmm4 + pshufb %xmm1,%xmm5 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz .L001loop_row_1 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movl $5,%eax +.L002loop_row_2: + movdqu (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 + palignr $1,%xmm3,%xmm6 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 + pshufb %xmm0,%xmm4 + pshufb %xmm1,%xmm5 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz .L002loop_row_2 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movl $6,%eax +.L003loop_row_3: + movdqu (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 + palignr $1,%xmm3,%xmm6 + movdqa %xmm6,%xmm3 + 
psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 + pshufb %xmm0,%xmm4 + pshufb %xmm1,%xmm5 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz .L003loop_row_3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + pshufb %xmm7,%xmm2 + movdqu %xmm2,(%edi) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size gcm_gmult_ssse3,.-.L_gcm_gmult_ssse3_begin +.globl gcm_ghash_ssse3 +.hidden gcm_ghash_ssse3 +.type gcm_ghash_ssse3,@function +.align 16 +gcm_ghash_ssse3: +.L_gcm_ghash_ssse3_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%edi + movl 24(%esp),%esi + movl 28(%esp),%edx + movl 32(%esp),%ecx + movdqu (%edi),%xmm0 + call .L004pic_point +.L004pic_point: + popl %ebx + movdqa .Lreverse_bytes-.L004pic_point(%ebx),%xmm7 + andl $-16,%ecx + pshufb %xmm7,%xmm0 + pxor %xmm3,%xmm3 +.L005loop_ghash: + movdqa .Llow4_mask-.L004pic_point(%ebx),%xmm2 + movdqu (%edx),%xmm1 + pshufb %xmm7,%xmm1 + pxor %xmm1,%xmm0 + movdqa %xmm2,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm2,%xmm0 + pxor %xmm2,%xmm2 + movl $5,%eax +.L006loop_row_4: + movdqu (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 + palignr $1,%xmm3,%xmm6 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 + pshufb %xmm0,%xmm4 + pshufb %xmm1,%xmm5 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz .L006loop_row_4 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor 
%xmm3,%xmm3 + movl $5,%eax +.L007loop_row_5: + movdqu (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 + palignr $1,%xmm3,%xmm6 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 + pshufb %xmm0,%xmm4 + pshufb %xmm1,%xmm5 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz .L007loop_row_5 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movl $6,%eax +.L008loop_row_6: + movdqu (%esi),%xmm4 + leal 16(%esi),%esi + movdqa %xmm2,%xmm6 + palignr $1,%xmm3,%xmm6 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + movdqa %xmm4,%xmm5 + pshufb %xmm0,%xmm4 + pshufb %xmm1,%xmm5 + pxor %xmm5,%xmm2 + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + subl $1,%eax + jnz .L008loop_row_6 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movdqa %xmm2,%xmm0 + leal -256(%esi),%esi + leal 16(%edx),%edx + subl $16,%ecx + jnz .L005loop_ghash + pshufb %xmm7,%xmm0 + movdqu %xmm0,(%edi) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size gcm_ghash_ssse3,.-.L_gcm_ghash_ssse3_begin +.align 16 +.Lreverse_bytes: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.align 16 +.Llow4_mask: +.long 252645135,252645135,252645135,252645135 +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) diff --git a/third_party/boringssl/gen/bcm/ghash-ssse3-x86-win.asm b/third_party/boringssl/gen/bcm/ghash-ssse3-x86-win.asm new file mode 100644 index 00000000..387b4a1f --- /dev/null +++ 
b/third_party/boringssl/gen/bcm/ghash-ssse3-x86-win.asm @@ -0,0 +1,297 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_internal_x86_win_asm.inc" +%endif +%ifidn __OUTPUT_FORMAT__, win32 +%ifidn __OUTPUT_FORMAT__,obj +section code use32 class=code align=64 +%elifidn __OUTPUT_FORMAT__,win32 +$@feat.00 equ 1 +section .text code align=64 +%else +section .text code +%endif +global _gcm_gmult_ssse3 +align 16 +_gcm_gmult_ssse3: +L$_gcm_gmult_ssse3_begin: + push ebp + push ebx + push esi + push edi + mov edi,DWORD [20+esp] + mov esi,DWORD [24+esp] + movdqu xmm0,[edi] + call L$000pic_point +L$000pic_point: + pop eax + movdqa xmm7,[(L$reverse_bytes-L$000pic_point)+eax] + movdqa xmm2,[(L$low4_mask-L$000pic_point)+eax] + pshufb xmm0,xmm7 + movdqa xmm1,xmm2 + pandn xmm1,xmm0 + psrld xmm1,4 + pand xmm0,xmm2 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + mov eax,5 +L$001loop_row_1: + movdqu xmm4,[esi] + lea esi,[16+esi] + movdqa xmm6,xmm2 + palignr xmm6,xmm3,1 + movdqa xmm3,xmm6 + psrldq xmm2,1 + movdqa xmm5,xmm4 + pshufb xmm4,xmm0 + pshufb xmm5,xmm1 + pxor xmm2,xmm5 + movdqa xmm5,xmm4 + psllq xmm5,60 + movdqa xmm6,xmm5 + pslldq xmm6,8 + pxor xmm3,xmm6 + psrldq xmm5,8 + pxor xmm2,xmm5 + psrlq xmm4,4 + pxor xmm2,xmm4 + sub eax,1 + jnz NEAR L$001loop_row_1 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,5 + pxor xmm2,xmm3 + pxor xmm3,xmm3 + mov eax,5 +L$002loop_row_2: + movdqu xmm4,[esi] + lea esi,[16+esi] + movdqa xmm6,xmm2 + palignr xmm6,xmm3,1 + movdqa xmm3,xmm6 + psrldq xmm2,1 + movdqa xmm5,xmm4 + pshufb xmm4,xmm0 + pshufb xmm5,xmm1 + pxor xmm2,xmm5 + movdqa xmm5,xmm4 + psllq xmm5,60 + movdqa xmm6,xmm5 + pslldq xmm6,8 + pxor xmm3,xmm6 + psrldq xmm5,8 + pxor xmm2,xmm5 + psrlq xmm4,4 + pxor xmm2,xmm4 + sub eax,1 + jnz NEAR L$002loop_row_2 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor 
xmm2,xmm3 + psrlq xmm3,5 + pxor xmm2,xmm3 + pxor xmm3,xmm3 + mov eax,6 +L$003loop_row_3: + movdqu xmm4,[esi] + lea esi,[16+esi] + movdqa xmm6,xmm2 + palignr xmm6,xmm3,1 + movdqa xmm3,xmm6 + psrldq xmm2,1 + movdqa xmm5,xmm4 + pshufb xmm4,xmm0 + pshufb xmm5,xmm1 + pxor xmm2,xmm5 + movdqa xmm5,xmm4 + psllq xmm5,60 + movdqa xmm6,xmm5 + pslldq xmm6,8 + pxor xmm3,xmm6 + psrldq xmm5,8 + pxor xmm2,xmm5 + psrlq xmm4,4 + pxor xmm2,xmm4 + sub eax,1 + jnz NEAR L$003loop_row_3 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,5 + pxor xmm2,xmm3 + pxor xmm3,xmm3 + pshufb xmm2,xmm7 + movdqu [edi],xmm2 + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + pxor xmm6,xmm6 + pop edi + pop esi + pop ebx + pop ebp + ret +global _gcm_ghash_ssse3 +align 16 +_gcm_ghash_ssse3: +L$_gcm_ghash_ssse3_begin: + push ebp + push ebx + push esi + push edi + mov edi,DWORD [20+esp] + mov esi,DWORD [24+esp] + mov edx,DWORD [28+esp] + mov ecx,DWORD [32+esp] + movdqu xmm0,[edi] + call L$004pic_point +L$004pic_point: + pop ebx + movdqa xmm7,[(L$reverse_bytes-L$004pic_point)+ebx] + and ecx,-16 + pshufb xmm0,xmm7 + pxor xmm3,xmm3 +L$005loop_ghash: + movdqa xmm2,[(L$low4_mask-L$004pic_point)+ebx] + movdqu xmm1,[edx] + pshufb xmm1,xmm7 + pxor xmm0,xmm1 + movdqa xmm1,xmm2 + pandn xmm1,xmm0 + psrld xmm1,4 + pand xmm0,xmm2 + pxor xmm2,xmm2 + mov eax,5 +L$006loop_row_4: + movdqu xmm4,[esi] + lea esi,[16+esi] + movdqa xmm6,xmm2 + palignr xmm6,xmm3,1 + movdqa xmm3,xmm6 + psrldq xmm2,1 + movdqa xmm5,xmm4 + pshufb xmm4,xmm0 + pshufb xmm5,xmm1 + pxor xmm2,xmm5 + movdqa xmm5,xmm4 + psllq xmm5,60 + movdqa xmm6,xmm5 + pslldq xmm6,8 + pxor xmm3,xmm6 + psrldq xmm5,8 + pxor xmm2,xmm5 + psrlq xmm4,4 + pxor xmm2,xmm4 + sub eax,1 + jnz NEAR L$006loop_row_4 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,5 + pxor xmm2,xmm3 + pxor xmm3,xmm3 + mov eax,5 +L$007loop_row_5: + movdqu xmm4,[esi] 
+ lea esi,[16+esi] + movdqa xmm6,xmm2 + palignr xmm6,xmm3,1 + movdqa xmm3,xmm6 + psrldq xmm2,1 + movdqa xmm5,xmm4 + pshufb xmm4,xmm0 + pshufb xmm5,xmm1 + pxor xmm2,xmm5 + movdqa xmm5,xmm4 + psllq xmm5,60 + movdqa xmm6,xmm5 + pslldq xmm6,8 + pxor xmm3,xmm6 + psrldq xmm5,8 + pxor xmm2,xmm5 + psrlq xmm4,4 + pxor xmm2,xmm4 + sub eax,1 + jnz NEAR L$007loop_row_5 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,5 + pxor xmm2,xmm3 + pxor xmm3,xmm3 + mov eax,6 +L$008loop_row_6: + movdqu xmm4,[esi] + lea esi,[16+esi] + movdqa xmm6,xmm2 + palignr xmm6,xmm3,1 + movdqa xmm3,xmm6 + psrldq xmm2,1 + movdqa xmm5,xmm4 + pshufb xmm4,xmm0 + pshufb xmm5,xmm1 + pxor xmm2,xmm5 + movdqa xmm5,xmm4 + psllq xmm5,60 + movdqa xmm6,xmm5 + pslldq xmm6,8 + pxor xmm3,xmm6 + psrldq xmm5,8 + pxor xmm2,xmm5 + psrlq xmm4,4 + pxor xmm2,xmm4 + sub eax,1 + jnz NEAR L$008loop_row_6 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,5 + pxor xmm2,xmm3 + pxor xmm3,xmm3 + movdqa xmm0,xmm2 + lea esi,[esi-256] + lea edx,[16+edx] + sub ecx,16 + jnz NEAR L$005loop_ghash + pshufb xmm0,xmm7 + movdqu [edi],xmm0 + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + pxor xmm6,xmm6 + pop edi + pop esi + pop ebx + pop ebp + ret +align 16 +L$reverse_bytes: +db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +align 16 +L$low4_mask: +dd 252645135,252645135,252645135,252645135 +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/third_party/boringssl/gen/bcm/ghash-ssse3-x86_64-apple.S b/third_party/boringssl/gen/bcm/ghash-ssse3-x86_64-apple.S new file mode 100644 index 00000000..53af23f8 --- /dev/null +++ b/third_party/boringssl/gen/bcm/ghash-ssse3-x86_64-apple.S @@ -0,0 +1,423 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.text + + + + + + +.globl _gcm_gmult_ssse3 +.private_extern _gcm_gmult_ssse3 +.p2align 4 +_gcm_gmult_ssse3: + + +_CET_ENDBR + movdqu (%rdi),%xmm0 + movdqa L$reverse_bytes(%rip),%xmm10 + movdqa L$low4_mask(%rip),%xmm2 + + + pshufb %xmm10,%xmm0 + + + movdqa %xmm2,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm2,%xmm0 + + + + + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + movq $5,%rax +L$oop_row_1: + movdqu (%rsi),%xmm4 + leaq 16(%rsi),%rsi + + + movdqa %xmm2,%xmm6 + palignr $1,%xmm3,%xmm6 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + + + + + movdqa %xmm4,%xmm5 + pshufb %xmm0,%xmm4 + pshufb %xmm1,%xmm5 + + + pxor %xmm5,%xmm2 + + + + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + + + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + + subq $1,%rax + jnz L$oop_row_1 + + + + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movq $5,%rax +L$oop_row_2: + movdqu (%rsi),%xmm4 + leaq 16(%rsi),%rsi + + + movdqa %xmm2,%xmm6 + palignr $1,%xmm3,%xmm6 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + + + + + movdqa %xmm4,%xmm5 + pshufb %xmm0,%xmm4 + pshufb %xmm1,%xmm5 + + + pxor %xmm5,%xmm2 + + + + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + + + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + + subq $1,%rax + jnz L$oop_row_2 + + + + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movq $6,%rax +L$oop_row_3: + movdqu (%rsi),%xmm4 + leaq 16(%rsi),%rsi + + + movdqa %xmm2,%xmm6 + palignr $1,%xmm3,%xmm6 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + + + + + movdqa %xmm4,%xmm5 + pshufb %xmm0,%xmm4 + pshufb %xmm1,%xmm5 + + + pxor %xmm5,%xmm2 + + + + movdqa %xmm4,%xmm5 + psllq 
$60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + + + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + + subq $1,%rax + jnz L$oop_row_3 + + + + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + + pshufb %xmm10,%xmm2 + movdqu %xmm2,(%rdi) + + + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + ret + + + + + + + + + +.globl _gcm_ghash_ssse3 +.private_extern _gcm_ghash_ssse3 +.p2align 4 +_gcm_ghash_ssse3: + + +_CET_ENDBR + movdqu (%rdi),%xmm0 + movdqa L$reverse_bytes(%rip),%xmm10 + movdqa L$low4_mask(%rip),%xmm11 + + + andq $-16,%rcx + + + + pshufb %xmm10,%xmm0 + + + pxor %xmm3,%xmm3 +L$oop_ghash: + + movdqu (%rdx),%xmm1 + pshufb %xmm10,%xmm1 + pxor %xmm1,%xmm0 + + + movdqa %xmm11,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm11,%xmm0 + + + + + pxor %xmm2,%xmm2 + + movq $5,%rax +L$oop_row_4: + movdqu (%rsi),%xmm4 + leaq 16(%rsi),%rsi + + + movdqa %xmm2,%xmm6 + palignr $1,%xmm3,%xmm6 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + + + + + movdqa %xmm4,%xmm5 + pshufb %xmm0,%xmm4 + pshufb %xmm1,%xmm5 + + + pxor %xmm5,%xmm2 + + + + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + + + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + + subq $1,%rax + jnz L$oop_row_4 + + + + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movq $5,%rax +L$oop_row_5: + movdqu (%rsi),%xmm4 + leaq 16(%rsi),%rsi + + + movdqa %xmm2,%xmm6 + palignr $1,%xmm3,%xmm6 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + + + + + movdqa %xmm4,%xmm5 + pshufb %xmm0,%xmm4 + pshufb %xmm1,%xmm5 + + + pxor %xmm5,%xmm2 + + + + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + + + psrldq 
$8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + + subq $1,%rax + jnz L$oop_row_5 + + + + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movq $6,%rax +L$oop_row_6: + movdqu (%rsi),%xmm4 + leaq 16(%rsi),%rsi + + + movdqa %xmm2,%xmm6 + palignr $1,%xmm3,%xmm6 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + + + + + movdqa %xmm4,%xmm5 + pshufb %xmm0,%xmm4 + pshufb %xmm1,%xmm5 + + + pxor %xmm5,%xmm2 + + + + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + + + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + + subq $1,%rax + jnz L$oop_row_6 + + + + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movdqa %xmm2,%xmm0 + + + leaq -256(%rsi),%rsi + + + leaq 16(%rdx),%rdx + subq $16,%rcx + jnz L$oop_ghash + + + pshufb %xmm10,%xmm0 + movdqu %xmm0,(%rdi) + + + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + ret + + + + +.section __DATA,__const +.p2align 4 + + +L$reverse_bytes: +.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + +L$low4_mask: +.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f +.text +#endif diff --git a/third_party/boringssl/gen/bcm/ghash-ssse3-x86_64-linux.S b/third_party/boringssl/gen/bcm/ghash-ssse3-x86_64-linux.S new file mode 100644 index 00000000..edce38d0 --- /dev/null +++ b/third_party/boringssl/gen/bcm/ghash-ssse3-x86_64-linux.S @@ -0,0 +1,423 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.text + + + + + +.type gcm_gmult_ssse3, @function +.globl gcm_gmult_ssse3 +.hidden gcm_gmult_ssse3 +.align 16 +gcm_gmult_ssse3: +.cfi_startproc + +_CET_ENDBR + movdqu (%rdi),%xmm0 + movdqa .Lreverse_bytes(%rip),%xmm10 + movdqa .Llow4_mask(%rip),%xmm2 + + + pshufb %xmm10,%xmm0 + + + movdqa %xmm2,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm2,%xmm0 + + + + + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + movq $5,%rax +.Loop_row_1: + movdqu (%rsi),%xmm4 + leaq 16(%rsi),%rsi + + + movdqa %xmm2,%xmm6 + palignr $1,%xmm3,%xmm6 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + + + + + movdqa %xmm4,%xmm5 + pshufb %xmm0,%xmm4 + pshufb %xmm1,%xmm5 + + + pxor %xmm5,%xmm2 + + + + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + + + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + + subq $1,%rax + jnz .Loop_row_1 + + + + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movq $5,%rax +.Loop_row_2: + movdqu (%rsi),%xmm4 + leaq 16(%rsi),%rsi + + + movdqa %xmm2,%xmm6 + palignr $1,%xmm3,%xmm6 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + + + + + movdqa %xmm4,%xmm5 + pshufb %xmm0,%xmm4 + pshufb %xmm1,%xmm5 + + + pxor %xmm5,%xmm2 + + + + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + + + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + + subq $1,%rax + jnz .Loop_row_2 + + + + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movq $6,%rax +.Loop_row_3: + movdqu (%rsi),%xmm4 + leaq 16(%rsi),%rsi + + + movdqa %xmm2,%xmm6 + palignr $1,%xmm3,%xmm6 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + + + + + movdqa %xmm4,%xmm5 + pshufb %xmm0,%xmm4 + pshufb %xmm1,%xmm5 + + + pxor %xmm5,%xmm2 + + 
+ + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + + + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + + subq $1,%rax + jnz .Loop_row_3 + + + + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + + pshufb %xmm10,%xmm2 + movdqu %xmm2,(%rdi) + + + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + ret +.cfi_endproc + +.size gcm_gmult_ssse3,.-gcm_gmult_ssse3 + + + + + +.type gcm_ghash_ssse3, @function +.globl gcm_ghash_ssse3 +.hidden gcm_ghash_ssse3 +.align 16 +gcm_ghash_ssse3: +.cfi_startproc + +_CET_ENDBR + movdqu (%rdi),%xmm0 + movdqa .Lreverse_bytes(%rip),%xmm10 + movdqa .Llow4_mask(%rip),%xmm11 + + + andq $-16,%rcx + + + + pshufb %xmm10,%xmm0 + + + pxor %xmm3,%xmm3 +.Loop_ghash: + + movdqu (%rdx),%xmm1 + pshufb %xmm10,%xmm1 + pxor %xmm1,%xmm0 + + + movdqa %xmm11,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm11,%xmm0 + + + + + pxor %xmm2,%xmm2 + + movq $5,%rax +.Loop_row_4: + movdqu (%rsi),%xmm4 + leaq 16(%rsi),%rsi + + + movdqa %xmm2,%xmm6 + palignr $1,%xmm3,%xmm6 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + + + + + movdqa %xmm4,%xmm5 + pshufb %xmm0,%xmm4 + pshufb %xmm1,%xmm5 + + + pxor %xmm5,%xmm2 + + + + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + + + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + + subq $1,%rax + jnz .Loop_row_4 + + + + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movq $5,%rax +.Loop_row_5: + movdqu (%rsi),%xmm4 + leaq 16(%rsi),%rsi + + + movdqa %xmm2,%xmm6 + palignr $1,%xmm3,%xmm6 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + + + + + movdqa %xmm4,%xmm5 + pshufb %xmm0,%xmm4 + pshufb %xmm1,%xmm5 + + + pxor %xmm5,%xmm2 + + 
+ + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + + + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + + subq $1,%rax + jnz .Loop_row_5 + + + + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movq $6,%rax +.Loop_row_6: + movdqu (%rsi),%xmm4 + leaq 16(%rsi),%rsi + + + movdqa %xmm2,%xmm6 + palignr $1,%xmm3,%xmm6 + movdqa %xmm6,%xmm3 + psrldq $1,%xmm2 + + + + + movdqa %xmm4,%xmm5 + pshufb %xmm0,%xmm4 + pshufb %xmm1,%xmm5 + + + pxor %xmm5,%xmm2 + + + + movdqa %xmm4,%xmm5 + psllq $60,%xmm5 + movdqa %xmm5,%xmm6 + pslldq $8,%xmm6 + pxor %xmm6,%xmm3 + + + psrldq $8,%xmm5 + pxor %xmm5,%xmm2 + psrlq $4,%xmm4 + pxor %xmm4,%xmm2 + + subq $1,%rax + jnz .Loop_row_6 + + + + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $1,%xmm3 + pxor %xmm3,%xmm2 + psrlq $5,%xmm3 + pxor %xmm3,%xmm2 + pxor %xmm3,%xmm3 + movdqa %xmm2,%xmm0 + + + leaq -256(%rsi),%rsi + + + leaq 16(%rdx),%rdx + subq $16,%rcx + jnz .Loop_ghash + + + pshufb %xmm10,%xmm0 + movdqu %xmm0,(%rdi) + + + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + ret +.cfi_endproc + +.size gcm_ghash_ssse3,.-gcm_ghash_ssse3 + +.section .rodata +.align 16 + + +.Lreverse_bytes: +.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 + +.Llow4_mask: +.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f +.text +#endif diff --git a/third_party/boringssl/gen/bcm/ghash-ssse3-x86_64-win.asm b/third_party/boringssl/gen/bcm/ghash-ssse3-x86_64-win.asm new file mode 100644 index 00000000..5f78da86 --- /dev/null +++ b/third_party/boringssl/gen/bcm/ghash-ssse3-x86_64-win.asm @@ -0,0 +1,502 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_internal_x86_64_win_asm.inc" +%endif +section .text code align=64 + + + + + + + +global gcm_gmult_ssse3 +ALIGN 16 +gcm_gmult_ssse3: + +$L$SEH_begin_gcm_gmult_ssse3_1: +_CET_ENDBR + sub rsp,40 +$L$SEH_prologue_gcm_gmult_ssse3_2: + movdqa XMMWORD[rsp],xmm6 +$L$SEH_prologue_gcm_gmult_ssse3_3: + movdqa XMMWORD[16+rsp],xmm10 +$L$SEH_prologue_gcm_gmult_ssse3_4: +$L$SEH_endprologue_gcm_gmult_ssse3_5: + movdqu xmm0,XMMWORD[rcx] + movdqa xmm10,XMMWORD[$L$reverse_bytes] + movdqa xmm2,XMMWORD[$L$low4_mask] + + + pshufb xmm0,xmm10 + + + movdqa xmm1,xmm2 + pandn xmm1,xmm0 + psrld xmm1,4 + pand xmm0,xmm2 + + + + + pxor xmm2,xmm2 + pxor xmm3,xmm3 + mov rax,5 +$L$oop_row_1: + movdqu xmm4,XMMWORD[rdx] + lea rdx,[16+rdx] + + + movdqa xmm6,xmm2 + palignr xmm6,xmm3,1 + movdqa xmm3,xmm6 + psrldq xmm2,1 + + + + + movdqa xmm5,xmm4 + pshufb xmm4,xmm0 + pshufb xmm5,xmm1 + + + pxor xmm2,xmm5 + + + + movdqa xmm5,xmm4 + psllq xmm5,60 + movdqa xmm6,xmm5 + pslldq xmm6,8 + pxor xmm3,xmm6 + + + psrldq xmm5,8 + pxor xmm2,xmm5 + psrlq xmm4,4 + pxor xmm2,xmm4 + + sub rax,1 + jnz NEAR $L$oop_row_1 + + + + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,5 + pxor xmm2,xmm3 + pxor xmm3,xmm3 + mov rax,5 +$L$oop_row_2: + movdqu xmm4,XMMWORD[rdx] + lea rdx,[16+rdx] + + + movdqa xmm6,xmm2 + palignr xmm6,xmm3,1 + movdqa xmm3,xmm6 + psrldq xmm2,1 + + + + + movdqa xmm5,xmm4 + pshufb xmm4,xmm0 + pshufb xmm5,xmm1 + + + pxor xmm2,xmm5 + + + + movdqa xmm5,xmm4 + psllq xmm5,60 + movdqa xmm6,xmm5 + pslldq xmm6,8 + pxor xmm3,xmm6 + + + psrldq xmm5,8 + pxor xmm2,xmm5 + psrlq xmm4,4 + pxor xmm2,xmm4 + + sub rax,1 + jnz NEAR $L$oop_row_2 + + + + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,5 + pxor xmm2,xmm3 + pxor xmm3,xmm3 + mov rax,6 +$L$oop_row_3: 
+ movdqu xmm4,XMMWORD[rdx] + lea rdx,[16+rdx] + + + movdqa xmm6,xmm2 + palignr xmm6,xmm3,1 + movdqa xmm3,xmm6 + psrldq xmm2,1 + + + + + movdqa xmm5,xmm4 + pshufb xmm4,xmm0 + pshufb xmm5,xmm1 + + + pxor xmm2,xmm5 + + + + movdqa xmm5,xmm4 + psllq xmm5,60 + movdqa xmm6,xmm5 + pslldq xmm6,8 + pxor xmm3,xmm6 + + + psrldq xmm5,8 + pxor xmm2,xmm5 + psrlq xmm4,4 + pxor xmm2,xmm4 + + sub rax,1 + jnz NEAR $L$oop_row_3 + + + + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,5 + pxor xmm2,xmm3 + pxor xmm3,xmm3 + + pshufb xmm2,xmm10 + movdqu XMMWORD[rcx],xmm2 + + + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + pxor xmm6,xmm6 + movdqa xmm6,XMMWORD[rsp] + movdqa xmm10,XMMWORD[16+rsp] + add rsp,40 + ret + +$L$SEH_end_gcm_gmult_ssse3_6: + + + + + + + +global gcm_ghash_ssse3 +ALIGN 16 +gcm_ghash_ssse3: + +$L$SEH_begin_gcm_ghash_ssse3_1: +_CET_ENDBR + sub rsp,56 +$L$SEH_prologue_gcm_ghash_ssse3_2: + movdqa XMMWORD[rsp],xmm6 +$L$SEH_prologue_gcm_ghash_ssse3_3: + movdqa XMMWORD[16+rsp],xmm10 +$L$SEH_prologue_gcm_ghash_ssse3_4: + movdqa XMMWORD[32+rsp],xmm11 +$L$SEH_prologue_gcm_ghash_ssse3_5: +$L$SEH_endprologue_gcm_ghash_ssse3_6: + movdqu xmm0,XMMWORD[rcx] + movdqa xmm10,XMMWORD[$L$reverse_bytes] + movdqa xmm11,XMMWORD[$L$low4_mask] + + + and r9,-16 + + + + pshufb xmm0,xmm10 + + + pxor xmm3,xmm3 +$L$oop_ghash: + + movdqu xmm1,XMMWORD[r8] + pshufb xmm1,xmm10 + pxor xmm0,xmm1 + + + movdqa xmm1,xmm11 + pandn xmm1,xmm0 + psrld xmm1,4 + pand xmm0,xmm11 + + + + + pxor xmm2,xmm2 + + mov rax,5 +$L$oop_row_4: + movdqu xmm4,XMMWORD[rdx] + lea rdx,[16+rdx] + + + movdqa xmm6,xmm2 + palignr xmm6,xmm3,1 + movdqa xmm3,xmm6 + psrldq xmm2,1 + + + + + movdqa xmm5,xmm4 + pshufb xmm4,xmm0 + pshufb xmm5,xmm1 + + + pxor xmm2,xmm5 + + + + movdqa xmm5,xmm4 + psllq xmm5,60 + movdqa xmm6,xmm5 + pslldq xmm6,8 + pxor xmm3,xmm6 + + + psrldq xmm5,8 + pxor xmm2,xmm5 + psrlq xmm4,4 + pxor xmm2,xmm4 + + sub rax,1 + jnz 
NEAR $L$oop_row_4 + + + + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,5 + pxor xmm2,xmm3 + pxor xmm3,xmm3 + mov rax,5 +$L$oop_row_5: + movdqu xmm4,XMMWORD[rdx] + lea rdx,[16+rdx] + + + movdqa xmm6,xmm2 + palignr xmm6,xmm3,1 + movdqa xmm3,xmm6 + psrldq xmm2,1 + + + + + movdqa xmm5,xmm4 + pshufb xmm4,xmm0 + pshufb xmm5,xmm1 + + + pxor xmm2,xmm5 + + + + movdqa xmm5,xmm4 + psllq xmm5,60 + movdqa xmm6,xmm5 + pslldq xmm6,8 + pxor xmm3,xmm6 + + + psrldq xmm5,8 + pxor xmm2,xmm5 + psrlq xmm4,4 + pxor xmm2,xmm4 + + sub rax,1 + jnz NEAR $L$oop_row_5 + + + + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,5 + pxor xmm2,xmm3 + pxor xmm3,xmm3 + mov rax,6 +$L$oop_row_6: + movdqu xmm4,XMMWORD[rdx] + lea rdx,[16+rdx] + + + movdqa xmm6,xmm2 + palignr xmm6,xmm3,1 + movdqa xmm3,xmm6 + psrldq xmm2,1 + + + + + movdqa xmm5,xmm4 + pshufb xmm4,xmm0 + pshufb xmm5,xmm1 + + + pxor xmm2,xmm5 + + + + movdqa xmm5,xmm4 + psllq xmm5,60 + movdqa xmm6,xmm5 + pslldq xmm6,8 + pxor xmm3,xmm6 + + + psrldq xmm5,8 + pxor xmm2,xmm5 + psrlq xmm4,4 + pxor xmm2,xmm4 + + sub rax,1 + jnz NEAR $L$oop_row_6 + + + + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,1 + pxor xmm2,xmm3 + psrlq xmm3,5 + pxor xmm2,xmm3 + pxor xmm3,xmm3 + movdqa xmm0,xmm2 + + + lea rdx,[((-256))+rdx] + + + lea r8,[16+r8] + sub r9,16 + jnz NEAR $L$oop_ghash + + + pshufb xmm0,xmm10 + movdqu XMMWORD[rcx],xmm0 + + + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + pxor xmm6,xmm6 + movdqa xmm6,XMMWORD[rsp] + movdqa xmm10,XMMWORD[16+rsp] + movdqa xmm11,XMMWORD[32+rsp] + add rsp,56 + ret + +$L$SEH_end_gcm_ghash_ssse3_7: + + +section .rdata rdata align=8 +ALIGN 16 + + +$L$reverse_bytes: + DB 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 + +$L$low4_mask: + DQ 0x0f0f0f0f0f0f0f0f,0x0f0f0f0f0f0f0f0f +section .text + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_gcm_gmult_ssse3_1 wrt 
..imagebase + DD $L$SEH_end_gcm_gmult_ssse3_6 wrt ..imagebase + DD $L$SEH_info_gcm_gmult_ssse3_0 wrt ..imagebase + + DD $L$SEH_begin_gcm_ghash_ssse3_1 wrt ..imagebase + DD $L$SEH_end_gcm_ghash_ssse3_7 wrt ..imagebase + DD $L$SEH_info_gcm_ghash_ssse3_0 wrt ..imagebase + + +section .xdata rdata align=8 +ALIGN 4 +$L$SEH_info_gcm_gmult_ssse3_0: + DB 1 + DB $L$SEH_endprologue_gcm_gmult_ssse3_5-$L$SEH_begin_gcm_gmult_ssse3_1 + DB 5 + DB 0 + DB $L$SEH_prologue_gcm_gmult_ssse3_4-$L$SEH_begin_gcm_gmult_ssse3_1 + DB 168 + DW 1 + DB $L$SEH_prologue_gcm_gmult_ssse3_3-$L$SEH_begin_gcm_gmult_ssse3_1 + DB 104 + DW 0 + DB $L$SEH_prologue_gcm_gmult_ssse3_2-$L$SEH_begin_gcm_gmult_ssse3_1 + DB 66 + + DW 0 +$L$SEH_info_gcm_ghash_ssse3_0: + DB 1 + DB $L$SEH_endprologue_gcm_ghash_ssse3_6-$L$SEH_begin_gcm_ghash_ssse3_1 + DB 7 + DB 0 + DB $L$SEH_prologue_gcm_ghash_ssse3_5-$L$SEH_begin_gcm_ghash_ssse3_1 + DB 184 + DW 2 + DB $L$SEH_prologue_gcm_ghash_ssse3_4-$L$SEH_begin_gcm_ghash_ssse3_1 + DB 168 + DW 1 + DB $L$SEH_prologue_gcm_ghash_ssse3_3-$L$SEH_begin_gcm_ghash_ssse3_1 + DB 104 + DW 0 + DB $L$SEH_prologue_gcm_ghash_ssse3_2-$L$SEH_begin_gcm_ghash_ssse3_1 + DB 98 + + DW 0 +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/third_party/boringssl/gen/bcm/ghash-x86-apple.S b/third_party/boringssl/gen/bcm/ghash-x86-apple.S new file mode 100644 index 00000000..62214872 --- /dev/null +++ b/third_party/boringssl/gen/bcm/ghash-x86-apple.S @@ -0,0 +1,322 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +.text +.globl _gcm_init_clmul +.private_extern _gcm_init_clmul +.align 4 +_gcm_init_clmul: +L_gcm_init_clmul_begin: + movl 4(%esp),%edx + movl 8(%esp),%eax + call L000pic +L000pic: + popl %ecx + leal Lbswap-L000pic(%ecx),%ecx + movdqu (%eax),%xmm2 + pshufd $78,%xmm2,%xmm2 + pshufd $255,%xmm2,%xmm4 + movdqa %xmm2,%xmm3 + psllq $1,%xmm2 + pxor %xmm5,%xmm5 + psrlq $63,%xmm3 + pcmpgtd %xmm4,%xmm5 + pslldq $8,%xmm3 + por %xmm3,%xmm2 + pand 16(%ecx),%xmm5 + pxor %xmm5,%xmm2 + movdqa %xmm2,%xmm0 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pshufd $78,%xmm2,%xmm4 + pxor %xmm0,%xmm3 + pxor %xmm2,%xmm4 + pclmulqdq $0,%xmm2,%xmm0 + pclmulqdq $17,%xmm2,%xmm1 + pclmulqdq $0,%xmm4,%xmm3 + xorps %xmm0,%xmm3 + xorps %xmm1,%xmm3 + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + pshufd $78,%xmm2,%xmm3 + pshufd $78,%xmm0,%xmm4 + pxor %xmm2,%xmm3 + movdqu %xmm2,(%edx) + pxor %xmm0,%xmm4 + movdqu %xmm0,16(%edx) + palignr $8,%xmm3,%xmm4 + movdqu %xmm4,32(%edx) + ret +.globl _gcm_gmult_clmul +.private_extern _gcm_gmult_clmul +.align 4 +_gcm_gmult_clmul: +L_gcm_gmult_clmul_begin: + movl 4(%esp),%eax + movl 8(%esp),%edx + call L001pic +L001pic: + popl %ecx + leal Lbswap-L001pic(%ecx),%ecx + movdqu (%eax),%xmm0 + movdqa (%ecx),%xmm5 + movups (%edx),%xmm2 + pshufb %xmm5,%xmm0 + movups 32(%edx),%xmm4 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 + pclmulqdq $0,%xmm2,%xmm0 + pclmulqdq $17,%xmm2,%xmm1 + pclmulqdq $0,%xmm4,%xmm3 + xorps %xmm0,%xmm3 + xorps %xmm1,%xmm3 + 
movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + pshufb %xmm5,%xmm0 + movdqu %xmm0,(%eax) + ret +.globl _gcm_ghash_clmul +.private_extern _gcm_ghash_clmul +.align 4 +_gcm_ghash_clmul: +L_gcm_ghash_clmul_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%eax + movl 24(%esp),%edx + movl 28(%esp),%esi + movl 32(%esp),%ebx + call L002pic +L002pic: + popl %ecx + leal Lbswap-L002pic(%ecx),%ecx + movdqu (%eax),%xmm0 + movdqa (%ecx),%xmm5 + movdqu (%edx),%xmm2 + pshufb %xmm5,%xmm0 + subl $16,%ebx + jz L003odd_tail + movdqu (%esi),%xmm3 + movdqu 16(%esi),%xmm6 + pshufb %xmm5,%xmm3 + pshufb %xmm5,%xmm6 + movdqu 32(%edx),%xmm5 + pxor %xmm3,%xmm0 + pshufd $78,%xmm6,%xmm3 + movdqa %xmm6,%xmm7 + pxor %xmm6,%xmm3 + leal 32(%esi),%esi + pclmulqdq $0,%xmm2,%xmm6 + pclmulqdq $17,%xmm2,%xmm7 + pclmulqdq $0,%xmm5,%xmm3 + movups 16(%edx),%xmm2 + nop + subl $32,%ebx + jbe L004even_tail + jmp L005mod_loop +.align 5,0x90 +L005mod_loop: + pshufd $78,%xmm0,%xmm4 + movdqa %xmm0,%xmm1 + pxor %xmm0,%xmm4 + nop + pclmulqdq $0,%xmm2,%xmm0 + pclmulqdq $17,%xmm2,%xmm1 + pclmulqdq $16,%xmm5,%xmm4 + movups (%edx),%xmm2 + xorps %xmm6,%xmm0 + movdqa (%ecx),%xmm5 + xorps %xmm7,%xmm1 + movdqu (%esi),%xmm7 + pxor %xmm0,%xmm3 + movdqu 16(%esi),%xmm6 + pxor %xmm1,%xmm3 + pshufb %xmm5,%xmm7 + pxor %xmm3,%xmm4 + movdqa %xmm4,%xmm3 + psrldq $8,%xmm4 + pslldq $8,%xmm3 + pxor %xmm4,%xmm1 + pxor %xmm3,%xmm0 + pshufb %xmm5,%xmm6 + pxor %xmm7,%xmm1 + movdqa %xmm6,%xmm7 + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor 
%xmm3,%xmm0 + pclmulqdq $0,%xmm2,%xmm6 + movups 32(%edx),%xmm5 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + pshufd $78,%xmm7,%xmm3 + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm7,%xmm3 + pxor %xmm4,%xmm1 + pclmulqdq $17,%xmm2,%xmm7 + movups 16(%edx),%xmm2 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + pclmulqdq $0,%xmm5,%xmm3 + leal 32(%esi),%esi + subl $32,%ebx + ja L005mod_loop +L004even_tail: + pshufd $78,%xmm0,%xmm4 + movdqa %xmm0,%xmm1 + pxor %xmm0,%xmm4 + pclmulqdq $0,%xmm2,%xmm0 + pclmulqdq $17,%xmm2,%xmm1 + pclmulqdq $16,%xmm5,%xmm4 + movdqa (%ecx),%xmm5 + xorps %xmm6,%xmm0 + xorps %xmm7,%xmm1 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + pxor %xmm3,%xmm4 + movdqa %xmm4,%xmm3 + psrldq $8,%xmm4 + pslldq $8,%xmm3 + pxor %xmm4,%xmm1 + pxor %xmm3,%xmm0 + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + testl %ebx,%ebx + jnz L006done + movups (%edx),%xmm2 +L003odd_tail: + movdqu (%esi),%xmm3 + pshufb %xmm5,%xmm3 + pxor %xmm3,%xmm0 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pshufd $78,%xmm2,%xmm4 + pxor %xmm0,%xmm3 + pxor %xmm2,%xmm4 + pclmulqdq $0,%xmm2,%xmm0 + pclmulqdq $17,%xmm2,%xmm1 + pclmulqdq $0,%xmm4,%xmm3 + xorps %xmm0,%xmm3 + xorps %xmm1,%xmm3 + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + 
pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 +L006done: + pshufb %xmm5,%xmm0 + movdqu %xmm0,(%eax) + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 6,0x90 +Lbswap: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,194 +.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,44,32,67 +.byte 82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112 +.byte 112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62 +.byte 0 +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) diff --git a/third_party/boringssl/gen/bcm/ghash-x86-linux.S b/third_party/boringssl/gen/bcm/ghash-x86-linux.S new file mode 100644 index 00000000..960eeff8 --- /dev/null +++ b/third_party/boringssl/gen/bcm/ghash-x86-linux.S @@ -0,0 +1,328 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) +.text +.globl gcm_init_clmul +.hidden gcm_init_clmul +.type gcm_init_clmul,@function +.align 16 +gcm_init_clmul: +.L_gcm_init_clmul_begin: + movl 4(%esp),%edx + movl 8(%esp),%eax + call .L000pic +.L000pic: + popl %ecx + leal .Lbswap-.L000pic(%ecx),%ecx + movdqu (%eax),%xmm2 + pshufd $78,%xmm2,%xmm2 + pshufd $255,%xmm2,%xmm4 + movdqa %xmm2,%xmm3 + psllq $1,%xmm2 + pxor %xmm5,%xmm5 + psrlq $63,%xmm3 + pcmpgtd %xmm4,%xmm5 + pslldq $8,%xmm3 + por %xmm3,%xmm2 + pand 16(%ecx),%xmm5 + pxor %xmm5,%xmm2 + movdqa %xmm2,%xmm0 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pshufd $78,%xmm2,%xmm4 + pxor %xmm0,%xmm3 + pxor %xmm2,%xmm4 + pclmulqdq $0,%xmm2,%xmm0 + pclmulqdq $17,%xmm2,%xmm1 + pclmulqdq $0,%xmm4,%xmm3 + xorps %xmm0,%xmm3 + xorps %xmm1,%xmm3 + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq 
$57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + pshufd $78,%xmm2,%xmm3 + pshufd $78,%xmm0,%xmm4 + pxor %xmm2,%xmm3 + movdqu %xmm2,(%edx) + pxor %xmm0,%xmm4 + movdqu %xmm0,16(%edx) + palignr $8,%xmm3,%xmm4 + movdqu %xmm4,32(%edx) + ret +.size gcm_init_clmul,.-.L_gcm_init_clmul_begin +.globl gcm_gmult_clmul +.hidden gcm_gmult_clmul +.type gcm_gmult_clmul,@function +.align 16 +gcm_gmult_clmul: +.L_gcm_gmult_clmul_begin: + movl 4(%esp),%eax + movl 8(%esp),%edx + call .L001pic +.L001pic: + popl %ecx + leal .Lbswap-.L001pic(%ecx),%ecx + movdqu (%eax),%xmm0 + movdqa (%ecx),%xmm5 + movups (%edx),%xmm2 + pshufb %xmm5,%xmm0 + movups 32(%edx),%xmm4 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 + pclmulqdq $0,%xmm2,%xmm0 + pclmulqdq $17,%xmm2,%xmm1 + pclmulqdq $0,%xmm4,%xmm3 + xorps %xmm0,%xmm3 + xorps %xmm1,%xmm3 + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + pshufb %xmm5,%xmm0 + movdqu %xmm0,(%eax) + ret +.size gcm_gmult_clmul,.-.L_gcm_gmult_clmul_begin +.globl gcm_ghash_clmul +.hidden gcm_ghash_clmul +.type gcm_ghash_clmul,@function +.align 16 +gcm_ghash_clmul: +.L_gcm_ghash_clmul_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%eax + movl 24(%esp),%edx + movl 28(%esp),%esi + movl 32(%esp),%ebx + call .L002pic +.L002pic: + popl %ecx + leal .Lbswap-.L002pic(%ecx),%ecx + movdqu (%eax),%xmm0 + movdqa 
(%ecx),%xmm5 + movdqu (%edx),%xmm2 + pshufb %xmm5,%xmm0 + subl $16,%ebx + jz .L003odd_tail + movdqu (%esi),%xmm3 + movdqu 16(%esi),%xmm6 + pshufb %xmm5,%xmm3 + pshufb %xmm5,%xmm6 + movdqu 32(%edx),%xmm5 + pxor %xmm3,%xmm0 + pshufd $78,%xmm6,%xmm3 + movdqa %xmm6,%xmm7 + pxor %xmm6,%xmm3 + leal 32(%esi),%esi + pclmulqdq $0,%xmm2,%xmm6 + pclmulqdq $17,%xmm2,%xmm7 + pclmulqdq $0,%xmm5,%xmm3 + movups 16(%edx),%xmm2 + nop + subl $32,%ebx + jbe .L004even_tail + jmp .L005mod_loop +.align 32 +.L005mod_loop: + pshufd $78,%xmm0,%xmm4 + movdqa %xmm0,%xmm1 + pxor %xmm0,%xmm4 + nop + pclmulqdq $0,%xmm2,%xmm0 + pclmulqdq $17,%xmm2,%xmm1 + pclmulqdq $16,%xmm5,%xmm4 + movups (%edx),%xmm2 + xorps %xmm6,%xmm0 + movdqa (%ecx),%xmm5 + xorps %xmm7,%xmm1 + movdqu (%esi),%xmm7 + pxor %xmm0,%xmm3 + movdqu 16(%esi),%xmm6 + pxor %xmm1,%xmm3 + pshufb %xmm5,%xmm7 + pxor %xmm3,%xmm4 + movdqa %xmm4,%xmm3 + psrldq $8,%xmm4 + pslldq $8,%xmm3 + pxor %xmm4,%xmm1 + pxor %xmm3,%xmm0 + pshufb %xmm5,%xmm6 + pxor %xmm7,%xmm1 + movdqa %xmm6,%xmm7 + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + pclmulqdq $0,%xmm2,%xmm6 + movups 32(%edx),%xmm5 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + pshufd $78,%xmm7,%xmm3 + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm7,%xmm3 + pxor %xmm4,%xmm1 + pclmulqdq $17,%xmm2,%xmm7 + movups 16(%edx),%xmm2 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + pclmulqdq $0,%xmm5,%xmm3 + leal 32(%esi),%esi + subl $32,%ebx + ja .L005mod_loop +.L004even_tail: + pshufd $78,%xmm0,%xmm4 + movdqa %xmm0,%xmm1 + pxor %xmm0,%xmm4 + pclmulqdq $0,%xmm2,%xmm0 + pclmulqdq $17,%xmm2,%xmm1 + pclmulqdq $16,%xmm5,%xmm4 + movdqa (%ecx),%xmm5 + xorps %xmm6,%xmm0 + xorps %xmm7,%xmm1 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + pxor %xmm3,%xmm4 + movdqa %xmm4,%xmm3 + psrldq $8,%xmm4 + pslldq $8,%xmm3 + pxor %xmm4,%xmm1 + pxor 
%xmm3,%xmm0 + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + testl %ebx,%ebx + jnz .L006done + movups (%edx),%xmm2 +.L003odd_tail: + movdqu (%esi),%xmm3 + pshufb %xmm5,%xmm3 + pxor %xmm3,%xmm0 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pshufd $78,%xmm2,%xmm4 + pxor %xmm0,%xmm3 + pxor %xmm2,%xmm4 + pclmulqdq $0,%xmm2,%xmm0 + pclmulqdq $17,%xmm2,%xmm1 + pclmulqdq $0,%xmm4,%xmm3 + xorps %xmm0,%xmm3 + xorps %xmm1,%xmm3 + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 +.L006done: + pshufb %xmm5,%xmm0 + movdqu %xmm0,(%eax) + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size gcm_ghash_clmul,.-.L_gcm_ghash_clmul_begin +.align 64 +.Lbswap: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,194 +.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,44,32,67 +.byte 82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112 +.byte 112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62 +.byte 0 +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) diff --git a/third_party/boringssl/gen/bcm/ghash-x86-win.asm b/third_party/boringssl/gen/bcm/ghash-x86-win.asm new file mode 100644 index 00000000..64e8332d --- /dev/null +++ b/third_party/boringssl/gen/bcm/ghash-x86-win.asm @@ -0,0 
+1,330 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_internal_x86_win_asm.inc" +%endif +%ifidn __OUTPUT_FORMAT__, win32 +%ifidn __OUTPUT_FORMAT__,obj +section code use32 class=code align=64 +%elifidn __OUTPUT_FORMAT__,win32 +$@feat.00 equ 1 +section .text code align=64 +%else +section .text code +%endif +global _gcm_init_clmul +align 16 +_gcm_init_clmul: +L$_gcm_init_clmul_begin: + mov edx,DWORD [4+esp] + mov eax,DWORD [8+esp] + call L$000pic +L$000pic: + pop ecx + lea ecx,[(L$bswap-L$000pic)+ecx] + movdqu xmm2,[eax] + pshufd xmm2,xmm2,78 + pshufd xmm4,xmm2,255 + movdqa xmm3,xmm2 + psllq xmm2,1 + pxor xmm5,xmm5 + psrlq xmm3,63 + pcmpgtd xmm5,xmm4 + pslldq xmm3,8 + por xmm2,xmm3 + pand xmm5,[16+ecx] + pxor xmm2,xmm5 + movdqa xmm0,xmm2 + movdqa xmm1,xmm0 + pshufd xmm3,xmm0,78 + pshufd xmm4,xmm2,78 + pxor xmm3,xmm0 + pxor xmm4,xmm2 + pclmulqdq xmm0,xmm2,0 + pclmulqdq xmm1,xmm2,17 + pclmulqdq xmm3,xmm4,0 + xorps xmm3,xmm0 + xorps xmm3,xmm1 + movdqa xmm4,xmm3 + psrldq xmm3,8 + pslldq xmm4,8 + pxor xmm1,xmm3 + pxor xmm0,xmm4 + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + pxor xmm0,xmm3 + psllq xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor xmm1,xmm4 + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 + pshufd xmm3,xmm2,78 + pshufd xmm4,xmm0,78 + pxor xmm3,xmm2 + movdqu [edx],xmm2 + pxor xmm4,xmm0 + movdqu [16+edx],xmm0 + palignr xmm4,xmm3,8 + movdqu [32+edx],xmm4 + ret +global _gcm_gmult_clmul +align 16 +_gcm_gmult_clmul: +L$_gcm_gmult_clmul_begin: + mov eax,DWORD [4+esp] + mov edx,DWORD [8+esp] + call L$001pic +L$001pic: + pop ecx + lea ecx,[(L$bswap-L$001pic)+ecx] + movdqu xmm0,[eax] + movdqa xmm5,[ecx] + movups xmm2,[edx] + pshufb xmm0,xmm5 + movups xmm4,[32+edx] + movdqa xmm1,xmm0 
+ pshufd xmm3,xmm0,78 + pxor xmm3,xmm0 + pclmulqdq xmm0,xmm2,0 + pclmulqdq xmm1,xmm2,17 + pclmulqdq xmm3,xmm4,0 + xorps xmm3,xmm0 + xorps xmm3,xmm1 + movdqa xmm4,xmm3 + psrldq xmm3,8 + pslldq xmm4,8 + pxor xmm1,xmm3 + pxor xmm0,xmm4 + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + pxor xmm0,xmm3 + psllq xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor xmm1,xmm4 + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 + pshufb xmm0,xmm5 + movdqu [eax],xmm0 + ret +global _gcm_ghash_clmul +align 16 +_gcm_ghash_clmul: +L$_gcm_ghash_clmul_begin: + push ebp + push ebx + push esi + push edi + mov eax,DWORD [20+esp] + mov edx,DWORD [24+esp] + mov esi,DWORD [28+esp] + mov ebx,DWORD [32+esp] + call L$002pic +L$002pic: + pop ecx + lea ecx,[(L$bswap-L$002pic)+ecx] + movdqu xmm0,[eax] + movdqa xmm5,[ecx] + movdqu xmm2,[edx] + pshufb xmm0,xmm5 + sub ebx,16 + jz NEAR L$003odd_tail + movdqu xmm3,[esi] + movdqu xmm6,[16+esi] + pshufb xmm3,xmm5 + pshufb xmm6,xmm5 + movdqu xmm5,[32+edx] + pxor xmm0,xmm3 + pshufd xmm3,xmm6,78 + movdqa xmm7,xmm6 + pxor xmm3,xmm6 + lea esi,[32+esi] + pclmulqdq xmm6,xmm2,0 + pclmulqdq xmm7,xmm2,17 + pclmulqdq xmm3,xmm5,0 + movups xmm2,[16+edx] + nop + sub ebx,32 + jbe NEAR L$004even_tail + jmp NEAR L$005mod_loop +align 32 +L$005mod_loop: + pshufd xmm4,xmm0,78 + movdqa xmm1,xmm0 + pxor xmm4,xmm0 + nop + pclmulqdq xmm0,xmm2,0 + pclmulqdq xmm1,xmm2,17 + pclmulqdq xmm4,xmm5,16 + movups xmm2,[edx] + xorps xmm0,xmm6 + movdqa xmm5,[ecx] + xorps xmm1,xmm7 + movdqu xmm7,[esi] + pxor xmm3,xmm0 + movdqu xmm6,[16+esi] + pxor xmm3,xmm1 + pshufb xmm7,xmm5 + pxor xmm4,xmm3 + movdqa xmm3,xmm4 + psrldq xmm4,8 + pslldq xmm3,8 + pxor xmm1,xmm4 + pxor xmm0,xmm3 + pshufb xmm6,xmm5 + pxor xmm1,xmm7 + movdqa xmm7,xmm6 + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + pxor xmm0,xmm3 + pclmulqdq 
xmm6,xmm2,0 + movups xmm5,[32+edx] + psllq xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + pshufd xmm3,xmm7,78 + movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor xmm3,xmm7 + pxor xmm1,xmm4 + pclmulqdq xmm7,xmm2,17 + movups xmm2,[16+edx] + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 + pclmulqdq xmm3,xmm5,0 + lea esi,[32+esi] + sub ebx,32 + ja NEAR L$005mod_loop +L$004even_tail: + pshufd xmm4,xmm0,78 + movdqa xmm1,xmm0 + pxor xmm4,xmm0 + pclmulqdq xmm0,xmm2,0 + pclmulqdq xmm1,xmm2,17 + pclmulqdq xmm4,xmm5,16 + movdqa xmm5,[ecx] + xorps xmm0,xmm6 + xorps xmm1,xmm7 + pxor xmm3,xmm0 + pxor xmm3,xmm1 + pxor xmm4,xmm3 + movdqa xmm3,xmm4 + psrldq xmm4,8 + pslldq xmm3,8 + pxor xmm1,xmm4 + pxor xmm0,xmm3 + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + pxor xmm0,xmm3 + psllq xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor xmm1,xmm4 + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 + test ebx,ebx + jnz NEAR L$006done + movups xmm2,[edx] +L$003odd_tail: + movdqu xmm3,[esi] + pshufb xmm3,xmm5 + pxor xmm0,xmm3 + movdqa xmm1,xmm0 + pshufd xmm3,xmm0,78 + pshufd xmm4,xmm2,78 + pxor xmm3,xmm0 + pxor xmm4,xmm2 + pclmulqdq xmm0,xmm2,0 + pclmulqdq xmm1,xmm2,17 + pclmulqdq xmm3,xmm4,0 + xorps xmm3,xmm0 + xorps xmm3,xmm1 + movdqa xmm4,xmm3 + psrldq xmm3,8 + pslldq xmm4,8 + pxor xmm1,xmm3 + pxor xmm0,xmm4 + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + pxor xmm0,xmm3 + psllq xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor xmm1,xmm4 + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 +L$006done: + pshufb xmm0,xmm5 + movdqu [eax],xmm0 + pop edi + pop esi + pop ebx + pop ebp + ret +align 64 +L$bswap: +db 
15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +db 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,194 +db 71,72,65,83,72,32,102,111,114,32,120,56,54,44,32,67 +db 82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112 +db 112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62 +db 0 +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/third_party/boringssl/gen/bcm/ghash-x86_64-apple.S b/third_party/boringssl/gen/bcm/ghash-x86_64-apple.S new file mode 100644 index 00000000..0cf60d1b --- /dev/null +++ b/third_party/boringssl/gen/bcm/ghash-x86_64-apple.S @@ -0,0 +1,1127 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.text +.globl _gcm_init_clmul +.private_extern _gcm_init_clmul + +.p2align 4 +_gcm_init_clmul: + + +_CET_ENDBR +L$_init_clmul: + movdqu (%rsi),%xmm2 + pshufd $78,%xmm2,%xmm2 + + + pshufd $255,%xmm2,%xmm4 + movdqa %xmm2,%xmm3 + psllq $1,%xmm2 + pxor %xmm5,%xmm5 + psrlq $63,%xmm3 + pcmpgtd %xmm4,%xmm5 + pslldq $8,%xmm3 + por %xmm3,%xmm2 + + + pand L$0x1c2_polynomial(%rip),%xmm5 + pxor %xmm5,%xmm2 + + + pshufd $78,%xmm2,%xmm6 + movdqa %xmm2,%xmm0 + pxor %xmm2,%xmm6 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 + pclmulqdq $0x00,%xmm2,%xmm0 + pclmulqdq $0x11,%xmm2,%xmm1 + pclmulqdq $0x00,%xmm6,%xmm3 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + pshufd $78,%xmm2,%xmm3 + pshufd $78,%xmm0,%xmm4 + pxor 
%xmm2,%xmm3 + movdqu %xmm2,0(%rdi) + pxor %xmm0,%xmm4 + movdqu %xmm0,16(%rdi) + palignr $8,%xmm3,%xmm4 + movdqu %xmm4,32(%rdi) + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 + pclmulqdq $0x00,%xmm2,%xmm0 + pclmulqdq $0x11,%xmm2,%xmm1 + pclmulqdq $0x00,%xmm6,%xmm3 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + movdqa %xmm0,%xmm5 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 + pclmulqdq $0x00,%xmm2,%xmm0 + pclmulqdq $0x11,%xmm2,%xmm1 + pclmulqdq $0x00,%xmm6,%xmm3 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + pshufd $78,%xmm5,%xmm3 + pshufd $78,%xmm0,%xmm4 + pxor %xmm5,%xmm3 + movdqu %xmm5,48(%rdi) + pxor %xmm0,%xmm4 + movdqu %xmm0,64(%rdi) + palignr $8,%xmm3,%xmm4 + movdqu %xmm4,80(%rdi) + ret + + + +.globl _gcm_gmult_clmul +.private_extern _gcm_gmult_clmul + +.p2align 4 +_gcm_gmult_clmul: + +_CET_ENDBR +L$_gmult_clmul: + movdqu (%rdi),%xmm0 + movdqa L$bswap_mask(%rip),%xmm5 + movdqu (%rsi),%xmm2 + movdqu 32(%rsi),%xmm4 + pshufb %xmm5,%xmm0 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 + 
pclmulqdq $0x00,%xmm2,%xmm0 + pclmulqdq $0x11,%xmm2,%xmm1 + pclmulqdq $0x00,%xmm4,%xmm3 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + pshufb %xmm5,%xmm0 + movdqu %xmm0,(%rdi) + ret + + +.globl _gcm_ghash_clmul +.private_extern _gcm_ghash_clmul + +.p2align 5 +_gcm_ghash_clmul: + + +_CET_ENDBR +L$_ghash_clmul: + movdqa L$bswap_mask(%rip),%xmm10 + + movdqu (%rdi),%xmm0 + movdqu (%rsi),%xmm2 + movdqu 32(%rsi),%xmm7 + pshufb %xmm10,%xmm0 + + subq $0x10,%rcx + jz L$odd_tail + + movdqu 16(%rsi),%xmm6 + cmpq $0x30,%rcx + jb L$skip4x + + subq $0x30,%rcx + movq $0xA040608020C0E000,%rax + movdqu 48(%rsi),%xmm14 + movdqu 64(%rsi),%xmm15 + + + + + movdqu 48(%rdx),%xmm3 + movdqu 32(%rdx),%xmm11 + pshufb %xmm10,%xmm3 + pshufb %xmm10,%xmm11 + movdqa %xmm3,%xmm5 + pshufd $78,%xmm3,%xmm4 + pxor %xmm3,%xmm4 + pclmulqdq $0x00,%xmm2,%xmm3 + pclmulqdq $0x11,%xmm2,%xmm5 + pclmulqdq $0x00,%xmm7,%xmm4 + + movdqa %xmm11,%xmm13 + pshufd $78,%xmm11,%xmm12 + pxor %xmm11,%xmm12 + pclmulqdq $0x00,%xmm6,%xmm11 + pclmulqdq $0x11,%xmm6,%xmm13 + pclmulqdq $0x10,%xmm7,%xmm12 + xorps %xmm11,%xmm3 + xorps %xmm13,%xmm5 + movups 80(%rsi),%xmm7 + xorps %xmm12,%xmm4 + + movdqu 16(%rdx),%xmm11 + movdqu 0(%rdx),%xmm8 + pshufb %xmm10,%xmm11 + pshufb %xmm10,%xmm8 + movdqa %xmm11,%xmm13 + pshufd $78,%xmm11,%xmm12 + pxor %xmm8,%xmm0 + pxor %xmm11,%xmm12 + pclmulqdq $0x00,%xmm14,%xmm11 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm8 + pxor %xmm0,%xmm8 + pclmulqdq $0x11,%xmm14,%xmm13 + pclmulqdq $0x00,%xmm7,%xmm12 + xorps %xmm11,%xmm3 + xorps 
%xmm13,%xmm5 + + leaq 64(%rdx),%rdx + subq $0x40,%rcx + jc L$tail4x + + jmp L$mod4_loop +.p2align 5 +L$mod4_loop: + pclmulqdq $0x00,%xmm15,%xmm0 + xorps %xmm12,%xmm4 + movdqu 48(%rdx),%xmm11 + pshufb %xmm10,%xmm11 + pclmulqdq $0x11,%xmm15,%xmm1 + xorps %xmm3,%xmm0 + movdqu 32(%rdx),%xmm3 + movdqa %xmm11,%xmm13 + pclmulqdq $0x10,%xmm7,%xmm8 + pshufd $78,%xmm11,%xmm12 + xorps %xmm5,%xmm1 + pxor %xmm11,%xmm12 + pshufb %xmm10,%xmm3 + movups 32(%rsi),%xmm7 + xorps %xmm4,%xmm8 + pclmulqdq $0x00,%xmm2,%xmm11 + pshufd $78,%xmm3,%xmm4 + + pxor %xmm0,%xmm8 + movdqa %xmm3,%xmm5 + pxor %xmm1,%xmm8 + pxor %xmm3,%xmm4 + movdqa %xmm8,%xmm9 + pclmulqdq $0x11,%xmm2,%xmm13 + pslldq $8,%xmm8 + psrldq $8,%xmm9 + pxor %xmm8,%xmm0 + movdqa L$7_mask(%rip),%xmm8 + pxor %xmm9,%xmm1 + movq %rax,%xmm9 + + pand %xmm0,%xmm8 + pshufb %xmm8,%xmm9 + pxor %xmm0,%xmm9 + pclmulqdq $0x00,%xmm7,%xmm12 + psllq $57,%xmm9 + movdqa %xmm9,%xmm8 + pslldq $8,%xmm9 + pclmulqdq $0x00,%xmm6,%xmm3 + psrldq $8,%xmm8 + pxor %xmm9,%xmm0 + pxor %xmm8,%xmm1 + movdqu 0(%rdx),%xmm8 + + movdqa %xmm0,%xmm9 + psrlq $1,%xmm0 + pclmulqdq $0x11,%xmm6,%xmm5 + xorps %xmm11,%xmm3 + movdqu 16(%rdx),%xmm11 + pshufb %xmm10,%xmm11 + pclmulqdq $0x10,%xmm7,%xmm4 + xorps %xmm13,%xmm5 + movups 80(%rsi),%xmm7 + pshufb %xmm10,%xmm8 + pxor %xmm9,%xmm1 + pxor %xmm0,%xmm9 + psrlq $5,%xmm0 + + movdqa %xmm11,%xmm13 + pxor %xmm12,%xmm4 + pshufd $78,%xmm11,%xmm12 + pxor %xmm9,%xmm0 + pxor %xmm8,%xmm1 + pxor %xmm11,%xmm12 + pclmulqdq $0x00,%xmm14,%xmm11 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + movdqa %xmm0,%xmm1 + pclmulqdq $0x11,%xmm14,%xmm13 + xorps %xmm11,%xmm3 + pshufd $78,%xmm0,%xmm8 + pxor %xmm0,%xmm8 + + pclmulqdq $0x00,%xmm7,%xmm12 + xorps %xmm13,%xmm5 + + leaq 64(%rdx),%rdx + subq $0x40,%rcx + jnc L$mod4_loop + +L$tail4x: + pclmulqdq $0x00,%xmm15,%xmm0 + pclmulqdq $0x11,%xmm15,%xmm1 + pclmulqdq $0x10,%xmm7,%xmm8 + xorps %xmm12,%xmm4 + xorps %xmm3,%xmm0 + xorps %xmm5,%xmm1 + pxor %xmm0,%xmm1 + pxor %xmm4,%xmm8 + + pxor %xmm1,%xmm8 + pxor 
%xmm0,%xmm1 + + movdqa %xmm8,%xmm9 + psrldq $8,%xmm8 + pslldq $8,%xmm9 + pxor %xmm8,%xmm1 + pxor %xmm9,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + addq $0x40,%rcx + jz L$done + movdqu 32(%rsi),%xmm7 + subq $0x10,%rcx + jz L$odd_tail +L$skip4x: + + + + + + movdqu (%rdx),%xmm8 + movdqu 16(%rdx),%xmm3 + pshufb %xmm10,%xmm8 + pshufb %xmm10,%xmm3 + pxor %xmm8,%xmm0 + + movdqa %xmm3,%xmm5 + pshufd $78,%xmm3,%xmm4 + pxor %xmm3,%xmm4 + pclmulqdq $0x00,%xmm2,%xmm3 + pclmulqdq $0x11,%xmm2,%xmm5 + pclmulqdq $0x00,%xmm7,%xmm4 + + leaq 32(%rdx),%rdx + nop + subq $0x20,%rcx + jbe L$even_tail + nop + jmp L$mod_loop + +.p2align 5 +L$mod_loop: + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm8 + pshufd $78,%xmm0,%xmm4 + pxor %xmm0,%xmm4 + + pclmulqdq $0x00,%xmm6,%xmm0 + pclmulqdq $0x11,%xmm6,%xmm1 + pclmulqdq $0x10,%xmm7,%xmm4 + + pxor %xmm3,%xmm0 + pxor %xmm5,%xmm1 + movdqu (%rdx),%xmm9 + pxor %xmm0,%xmm8 + pshufb %xmm10,%xmm9 + movdqu 16(%rdx),%xmm3 + + pxor %xmm1,%xmm8 + pxor %xmm9,%xmm1 + pxor %xmm8,%xmm4 + pshufb %xmm10,%xmm3 + movdqa %xmm4,%xmm8 + psrldq $8,%xmm8 + pslldq $8,%xmm4 + pxor %xmm8,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm3,%xmm5 + + movdqa %xmm0,%xmm9 + movdqa %xmm0,%xmm8 + psllq $5,%xmm0 + pxor %xmm0,%xmm8 + pclmulqdq $0x00,%xmm2,%xmm3 + psllq $1,%xmm0 + pxor %xmm8,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm8 + pslldq $8,%xmm0 + psrldq $8,%xmm8 + pxor %xmm9,%xmm0 + pshufd $78,%xmm5,%xmm4 + pxor %xmm8,%xmm1 + pxor %xmm5,%xmm4 + + movdqa %xmm0,%xmm9 + psrlq $1,%xmm0 + pclmulqdq $0x11,%xmm2,%xmm5 + pxor %xmm9,%xmm1 + pxor %xmm0,%xmm9 + psrlq $5,%xmm0 + pxor %xmm9,%xmm0 + leaq 32(%rdx),%rdx + psrlq $1,%xmm0 + pclmulqdq $0x00,%xmm7,%xmm4 
+ pxor %xmm1,%xmm0 + + subq $0x20,%rcx + ja L$mod_loop + +L$even_tail: + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm8 + pshufd $78,%xmm0,%xmm4 + pxor %xmm0,%xmm4 + + pclmulqdq $0x00,%xmm6,%xmm0 + pclmulqdq $0x11,%xmm6,%xmm1 + pclmulqdq $0x10,%xmm7,%xmm4 + + pxor %xmm3,%xmm0 + pxor %xmm5,%xmm1 + pxor %xmm0,%xmm8 + pxor %xmm1,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm8 + psrldq $8,%xmm8 + pslldq $8,%xmm4 + pxor %xmm8,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + testq %rcx,%rcx + jnz L$done + +L$odd_tail: + movdqu (%rdx),%xmm8 + pshufb %xmm10,%xmm8 + pxor %xmm8,%xmm0 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 + pclmulqdq $0x00,%xmm2,%xmm0 + pclmulqdq $0x11,%xmm2,%xmm1 + pclmulqdq $0x00,%xmm7,%xmm3 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 +L$done: + pshufb %xmm10,%xmm0 + movdqu %xmm0,(%rdi) + ret + + + +.globl _gcm_init_avx +.private_extern _gcm_init_avx + +.p2align 5 +_gcm_init_avx: + + +_CET_ENDBR + vzeroupper + + vmovdqu (%rsi),%xmm2 + vpshufd $78,%xmm2,%xmm2 + + + vpshufd $255,%xmm2,%xmm4 + vpsrlq $63,%xmm2,%xmm3 + vpsllq $1,%xmm2,%xmm2 + vpxor %xmm5,%xmm5,%xmm5 + vpcmpgtd %xmm4,%xmm5,%xmm5 + vpslldq $8,%xmm3,%xmm3 + vpor 
%xmm3,%xmm2,%xmm2 + + + vpand L$0x1c2_polynomial(%rip),%xmm5,%xmm5 + vpxor %xmm5,%xmm2,%xmm2 + + vpunpckhqdq %xmm2,%xmm2,%xmm6 + vmovdqa %xmm2,%xmm0 + vpxor %xmm2,%xmm6,%xmm6 + movq $4,%r10 + jmp L$init_start_avx +.p2align 5 +L$init_loop_avx: + vpalignr $8,%xmm3,%xmm4,%xmm5 + vmovdqu %xmm5,-16(%rdi) + vpunpckhqdq %xmm0,%xmm0,%xmm3 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 + vpxor %xmm0,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + + vpslldq $8,%xmm3,%xmm4 + vpsrldq $8,%xmm3,%xmm3 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm3,%xmm1,%xmm1 + vpsllq $57,%xmm0,%xmm3 + vpsllq $62,%xmm0,%xmm4 + vpxor %xmm3,%xmm4,%xmm4 + vpsllq $63,%xmm0,%xmm3 + vpxor %xmm3,%xmm4,%xmm4 + vpslldq $8,%xmm4,%xmm3 + vpsrldq $8,%xmm4,%xmm4 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm4,%xmm1,%xmm1 + + vpsrlq $1,%xmm0,%xmm4 + vpxor %xmm0,%xmm1,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $5,%xmm4,%xmm4 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $1,%xmm0,%xmm0 + vpxor %xmm1,%xmm0,%xmm0 +L$init_start_avx: + vmovdqa %xmm0,%xmm5 + vpunpckhqdq %xmm0,%xmm0,%xmm3 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 + vpxor %xmm0,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + + vpslldq $8,%xmm3,%xmm4 + vpsrldq $8,%xmm3,%xmm3 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm3,%xmm1,%xmm1 + vpsllq $57,%xmm0,%xmm3 + vpsllq $62,%xmm0,%xmm4 + vpxor %xmm3,%xmm4,%xmm4 + vpsllq $63,%xmm0,%xmm3 + vpxor %xmm3,%xmm4,%xmm4 + vpslldq $8,%xmm4,%xmm3 + vpsrldq $8,%xmm4,%xmm4 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm4,%xmm1,%xmm1 + + vpsrlq $1,%xmm0,%xmm4 + vpxor %xmm0,%xmm1,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $5,%xmm4,%xmm4 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $1,%xmm0,%xmm0 + vpxor %xmm1,%xmm0,%xmm0 + vpshufd $78,%xmm5,%xmm3 + vpshufd $78,%xmm0,%xmm4 + vpxor %xmm5,%xmm3,%xmm3 + vmovdqu %xmm5,0(%rdi) + vpxor %xmm0,%xmm4,%xmm4 + vmovdqu %xmm0,16(%rdi) + leaq 48(%rdi),%rdi + subq $1,%r10 + 
jnz L$init_loop_avx + + vpalignr $8,%xmm4,%xmm3,%xmm5 + vmovdqu %xmm5,-16(%rdi) + + vzeroupper + ret + + + +.globl _gcm_gmult_avx +.private_extern _gcm_gmult_avx + +.p2align 5 +_gcm_gmult_avx: + +_CET_ENDBR + jmp L$_gmult_clmul + + +.globl _gcm_ghash_avx +.private_extern _gcm_ghash_avx + +.p2align 5 +_gcm_ghash_avx: + + +_CET_ENDBR + vzeroupper + + vmovdqu (%rdi),%xmm10 + leaq L$0x1c2_polynomial(%rip),%r10 + leaq 64(%rsi),%rsi + vmovdqu L$bswap_mask(%rip),%xmm13 + vpshufb %xmm13,%xmm10,%xmm10 + cmpq $0x80,%rcx + jb L$short_avx + subq $0x80,%rcx + + vmovdqu 112(%rdx),%xmm14 + vmovdqu 0-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm14 + vmovdqu 32-64(%rsi),%xmm7 + + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vmovdqu 96(%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm14,%xmm9,%xmm9 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 16-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vmovdqu 80(%rdx),%xmm14 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + + vpshufb %xmm13,%xmm14,%xmm14 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 48-64(%rsi),%xmm6 + vpxor %xmm14,%xmm9,%xmm9 + vmovdqu 64(%rdx),%xmm15 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 80-64(%rsi),%xmm7 + + vpshufb %xmm13,%xmm15,%xmm15 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm1,%xmm4,%xmm4 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 64-64(%rsi),%xmm6 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + + vmovdqu 48(%rdx),%xmm14 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpxor %xmm4,%xmm1,%xmm1 + vpshufb %xmm13,%xmm14,%xmm14 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 96-64(%rsi),%xmm6 + vpxor %xmm5,%xmm2,%xmm2 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 128-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + + vmovdqu 
32(%rdx),%xmm15 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm1,%xmm4,%xmm4 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 112-64(%rsi),%xmm6 + vpxor %xmm2,%xmm5,%xmm5 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + + vmovdqu 16(%rdx),%xmm14 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpxor %xmm4,%xmm1,%xmm1 + vpshufb %xmm13,%xmm14,%xmm14 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 144-64(%rsi),%xmm6 + vpxor %xmm5,%xmm2,%xmm2 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 176-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + + vmovdqu (%rdx),%xmm15 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm1,%xmm4,%xmm4 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 160-64(%rsi),%xmm6 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 + + leaq 128(%rdx),%rdx + cmpq $0x80,%rcx + jb L$tail_avx + + vpxor %xmm10,%xmm15,%xmm15 + subq $0x80,%rcx + jmp L$oop8x_avx + +.p2align 5 +L$oop8x_avx: + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vmovdqu 112(%rdx),%xmm14 + vpxor %xmm0,%xmm3,%xmm3 + vpxor %xmm15,%xmm8,%xmm8 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10 + vpshufb %xmm13,%xmm14,%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11 + vmovdqu 0-64(%rsi),%xmm6 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12 + vmovdqu 32-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + + vmovdqu 96(%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm3,%xmm10,%xmm10 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vxorps %xmm4,%xmm11,%xmm11 + vmovdqu 16-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm5,%xmm12,%xmm12 + vxorps %xmm15,%xmm8,%xmm8 + + vmovdqu 80(%rdx),%xmm14 + vpxor %xmm10,%xmm12,%xmm12 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + 
vpxor %xmm11,%xmm12,%xmm12 + vpslldq $8,%xmm12,%xmm9 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vpsrldq $8,%xmm12,%xmm12 + vpxor %xmm9,%xmm10,%xmm10 + vmovdqu 48-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm14 + vxorps %xmm12,%xmm11,%xmm11 + vpxor %xmm1,%xmm4,%xmm4 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 80-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu 64(%rdx),%xmm15 + vpalignr $8,%xmm10,%xmm10,%xmm12 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpshufb %xmm13,%xmm15,%xmm15 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 64-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vxorps %xmm15,%xmm8,%xmm8 + vpxor %xmm5,%xmm2,%xmm2 + + vmovdqu 48(%rdx),%xmm14 + vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpshufb %xmm13,%xmm14,%xmm14 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 96-64(%rsi),%xmm6 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 128-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu 32(%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpshufb %xmm13,%xmm15,%xmm15 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 112-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + vpxor %xmm5,%xmm2,%xmm2 + vxorps %xmm12,%xmm10,%xmm10 + + vmovdqu 16(%rdx),%xmm14 + vpalignr $8,%xmm10,%xmm10,%xmm12 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpshufb %xmm13,%xmm14,%xmm14 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 144-64(%rsi),%xmm6 + vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 + vxorps %xmm11,%xmm12,%xmm12 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 
176-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu (%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 160-64(%rsi),%xmm6 + vpxor %xmm12,%xmm15,%xmm15 + vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 + vpxor %xmm10,%xmm15,%xmm15 + + leaq 128(%rdx),%rdx + subq $0x80,%rcx + jnc L$oop8x_avx + + addq $0x80,%rcx + jmp L$tail_no_xor_avx + +.p2align 5 +L$short_avx: + vmovdqu -16(%rdx,%rcx,1),%xmm14 + leaq (%rdx,%rcx,1),%rdx + vmovdqu 0-64(%rsi),%xmm6 + vmovdqu 32-64(%rsi),%xmm7 + vpshufb %xmm13,%xmm14,%xmm15 + + vmovdqa %xmm0,%xmm3 + vmovdqa %xmm1,%xmm4 + vmovdqa %xmm2,%xmm5 + subq $0x10,%rcx + jz L$tail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -32(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 16-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vpsrldq $8,%xmm7,%xmm7 + subq $0x10,%rcx + jz L$tail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -48(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 48-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vmovdqu 80-64(%rsi),%xmm7 + subq $0x10,%rcx + jz L$tail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -64(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 64-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vpsrldq $8,%xmm7,%xmm7 + subq $0x10,%rcx + jz L$tail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq 
$0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -80(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 96-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vmovdqu 128-64(%rsi),%xmm7 + subq $0x10,%rcx + jz L$tail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -96(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 112-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vpsrldq $8,%xmm7,%xmm7 + subq $0x10,%rcx + jz L$tail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -112(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 144-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vmovq 184-64(%rsi),%xmm7 + subq $0x10,%rcx + jmp L$tail_avx + +.p2align 5 +L$tail_avx: + vpxor %xmm10,%xmm15,%xmm15 +L$tail_no_xor_avx: + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + + vmovdqu (%r10),%xmm12 + + vpxor %xmm0,%xmm3,%xmm10 + vpxor %xmm1,%xmm4,%xmm11 + vpxor %xmm2,%xmm5,%xmm5 + + vpxor %xmm10,%xmm5,%xmm5 + vpxor %xmm11,%xmm5,%xmm5 + vpslldq $8,%xmm5,%xmm9 + vpsrldq $8,%xmm5,%xmm5 + vpxor %xmm9,%xmm10,%xmm10 + vpxor %xmm5,%xmm11,%xmm11 + + vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 + vpalignr $8,%xmm10,%xmm10,%xmm10 + vpxor %xmm9,%xmm10,%xmm10 + + vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 + vpalignr $8,%xmm10,%xmm10,%xmm10 + vpxor %xmm11,%xmm10,%xmm10 + vpxor %xmm9,%xmm10,%xmm10 + + cmpq $0,%rcx + jne 
L$short_avx + + vpshufb %xmm13,%xmm10,%xmm10 + vmovdqu %xmm10,(%rdi) + vzeroupper + ret + + + +.section __DATA,__const +.p2align 6 +L$bswap_mask: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +L$0x1c2_polynomial: +.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +L$7_mask: +.long 7,0,7,0 +.p2align 6 + +.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.p2align 6 +.text +#endif diff --git a/third_party/boringssl/gen/bcm/ghash-x86_64-linux.S b/third_party/boringssl/gen/bcm/ghash-x86_64-linux.S new file mode 100644 index 00000000..f1ffcb82 --- /dev/null +++ b/third_party/boringssl/gen/bcm/ghash-x86_64-linux.S @@ -0,0 +1,1127 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.text +.globl gcm_init_clmul +.hidden gcm_init_clmul +.type gcm_init_clmul,@function +.align 16 +gcm_init_clmul: +.cfi_startproc + +_CET_ENDBR +.L_init_clmul: + movdqu (%rsi),%xmm2 + pshufd $78,%xmm2,%xmm2 + + + pshufd $255,%xmm2,%xmm4 + movdqa %xmm2,%xmm3 + psllq $1,%xmm2 + pxor %xmm5,%xmm5 + psrlq $63,%xmm3 + pcmpgtd %xmm4,%xmm5 + pslldq $8,%xmm3 + por %xmm3,%xmm2 + + + pand .L0x1c2_polynomial(%rip),%xmm5 + pxor %xmm5,%xmm2 + + + pshufd $78,%xmm2,%xmm6 + movdqa %xmm2,%xmm0 + pxor %xmm2,%xmm6 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 + pclmulqdq $0x00,%xmm2,%xmm0 + pclmulqdq $0x11,%xmm2,%xmm1 + pclmulqdq $0x00,%xmm6,%xmm3 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa 
%xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + pshufd $78,%xmm2,%xmm3 + pshufd $78,%xmm0,%xmm4 + pxor %xmm2,%xmm3 + movdqu %xmm2,0(%rdi) + pxor %xmm0,%xmm4 + movdqu %xmm0,16(%rdi) + palignr $8,%xmm3,%xmm4 + movdqu %xmm4,32(%rdi) + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 + pclmulqdq $0x00,%xmm2,%xmm0 + pclmulqdq $0x11,%xmm2,%xmm1 + pclmulqdq $0x00,%xmm6,%xmm3 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + movdqa %xmm0,%xmm5 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 + pclmulqdq $0x00,%xmm2,%xmm0 + pclmulqdq $0x11,%xmm2,%xmm1 + pclmulqdq $0x00,%xmm6,%xmm3 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + pshufd $78,%xmm5,%xmm3 + pshufd $78,%xmm0,%xmm4 + pxor %xmm5,%xmm3 + movdqu %xmm5,48(%rdi) + pxor %xmm0,%xmm4 + movdqu %xmm0,64(%rdi) + palignr $8,%xmm3,%xmm4 + movdqu %xmm4,80(%rdi) + ret +.cfi_endproc + +.size gcm_init_clmul,.-gcm_init_clmul +.globl gcm_gmult_clmul +.hidden gcm_gmult_clmul +.type 
gcm_gmult_clmul,@function +.align 16 +gcm_gmult_clmul: +.cfi_startproc +_CET_ENDBR +.L_gmult_clmul: + movdqu (%rdi),%xmm0 + movdqa .Lbswap_mask(%rip),%xmm5 + movdqu (%rsi),%xmm2 + movdqu 32(%rsi),%xmm4 + pshufb %xmm5,%xmm0 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 + pclmulqdq $0x00,%xmm2,%xmm0 + pclmulqdq $0x11,%xmm2,%xmm1 + pclmulqdq $0x00,%xmm4,%xmm3 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + pshufb %xmm5,%xmm0 + movdqu %xmm0,(%rdi) + ret +.cfi_endproc +.size gcm_gmult_clmul,.-gcm_gmult_clmul +.globl gcm_ghash_clmul +.hidden gcm_ghash_clmul +.type gcm_ghash_clmul,@function +.align 32 +gcm_ghash_clmul: +.cfi_startproc + +_CET_ENDBR +.L_ghash_clmul: + movdqa .Lbswap_mask(%rip),%xmm10 + + movdqu (%rdi),%xmm0 + movdqu (%rsi),%xmm2 + movdqu 32(%rsi),%xmm7 + pshufb %xmm10,%xmm0 + + subq $0x10,%rcx + jz .Lodd_tail + + movdqu 16(%rsi),%xmm6 + cmpq $0x30,%rcx + jb .Lskip4x + + subq $0x30,%rcx + movq $0xA040608020C0E000,%rax + movdqu 48(%rsi),%xmm14 + movdqu 64(%rsi),%xmm15 + + + + + movdqu 48(%rdx),%xmm3 + movdqu 32(%rdx),%xmm11 + pshufb %xmm10,%xmm3 + pshufb %xmm10,%xmm11 + movdqa %xmm3,%xmm5 + pshufd $78,%xmm3,%xmm4 + pxor %xmm3,%xmm4 + pclmulqdq $0x00,%xmm2,%xmm3 + pclmulqdq $0x11,%xmm2,%xmm5 + pclmulqdq $0x00,%xmm7,%xmm4 + + movdqa %xmm11,%xmm13 + pshufd $78,%xmm11,%xmm12 + pxor %xmm11,%xmm12 + pclmulqdq $0x00,%xmm6,%xmm11 + pclmulqdq $0x11,%xmm6,%xmm13 + pclmulqdq $0x10,%xmm7,%xmm12 + xorps %xmm11,%xmm3 + xorps %xmm13,%xmm5 + movups 80(%rsi),%xmm7 + xorps %xmm12,%xmm4 + + movdqu 
16(%rdx),%xmm11 + movdqu 0(%rdx),%xmm8 + pshufb %xmm10,%xmm11 + pshufb %xmm10,%xmm8 + movdqa %xmm11,%xmm13 + pshufd $78,%xmm11,%xmm12 + pxor %xmm8,%xmm0 + pxor %xmm11,%xmm12 + pclmulqdq $0x00,%xmm14,%xmm11 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm8 + pxor %xmm0,%xmm8 + pclmulqdq $0x11,%xmm14,%xmm13 + pclmulqdq $0x00,%xmm7,%xmm12 + xorps %xmm11,%xmm3 + xorps %xmm13,%xmm5 + + leaq 64(%rdx),%rdx + subq $0x40,%rcx + jc .Ltail4x + + jmp .Lmod4_loop +.align 32 +.Lmod4_loop: + pclmulqdq $0x00,%xmm15,%xmm0 + xorps %xmm12,%xmm4 + movdqu 48(%rdx),%xmm11 + pshufb %xmm10,%xmm11 + pclmulqdq $0x11,%xmm15,%xmm1 + xorps %xmm3,%xmm0 + movdqu 32(%rdx),%xmm3 + movdqa %xmm11,%xmm13 + pclmulqdq $0x10,%xmm7,%xmm8 + pshufd $78,%xmm11,%xmm12 + xorps %xmm5,%xmm1 + pxor %xmm11,%xmm12 + pshufb %xmm10,%xmm3 + movups 32(%rsi),%xmm7 + xorps %xmm4,%xmm8 + pclmulqdq $0x00,%xmm2,%xmm11 + pshufd $78,%xmm3,%xmm4 + + pxor %xmm0,%xmm8 + movdqa %xmm3,%xmm5 + pxor %xmm1,%xmm8 + pxor %xmm3,%xmm4 + movdqa %xmm8,%xmm9 + pclmulqdq $0x11,%xmm2,%xmm13 + pslldq $8,%xmm8 + psrldq $8,%xmm9 + pxor %xmm8,%xmm0 + movdqa .L7_mask(%rip),%xmm8 + pxor %xmm9,%xmm1 + movq %rax,%xmm9 + + pand %xmm0,%xmm8 + pshufb %xmm8,%xmm9 + pxor %xmm0,%xmm9 + pclmulqdq $0x00,%xmm7,%xmm12 + psllq $57,%xmm9 + movdqa %xmm9,%xmm8 + pslldq $8,%xmm9 + pclmulqdq $0x00,%xmm6,%xmm3 + psrldq $8,%xmm8 + pxor %xmm9,%xmm0 + pxor %xmm8,%xmm1 + movdqu 0(%rdx),%xmm8 + + movdqa %xmm0,%xmm9 + psrlq $1,%xmm0 + pclmulqdq $0x11,%xmm6,%xmm5 + xorps %xmm11,%xmm3 + movdqu 16(%rdx),%xmm11 + pshufb %xmm10,%xmm11 + pclmulqdq $0x10,%xmm7,%xmm4 + xorps %xmm13,%xmm5 + movups 80(%rsi),%xmm7 + pshufb %xmm10,%xmm8 + pxor %xmm9,%xmm1 + pxor %xmm0,%xmm9 + psrlq $5,%xmm0 + + movdqa %xmm11,%xmm13 + pxor %xmm12,%xmm4 + pshufd $78,%xmm11,%xmm12 + pxor %xmm9,%xmm0 + pxor %xmm8,%xmm1 + pxor %xmm11,%xmm12 + pclmulqdq $0x00,%xmm14,%xmm11 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + movdqa %xmm0,%xmm1 + pclmulqdq $0x11,%xmm14,%xmm13 + xorps %xmm11,%xmm3 + pshufd $78,%xmm0,%xmm8 + 
pxor %xmm0,%xmm8 + + pclmulqdq $0x00,%xmm7,%xmm12 + xorps %xmm13,%xmm5 + + leaq 64(%rdx),%rdx + subq $0x40,%rcx + jnc .Lmod4_loop + +.Ltail4x: + pclmulqdq $0x00,%xmm15,%xmm0 + pclmulqdq $0x11,%xmm15,%xmm1 + pclmulqdq $0x10,%xmm7,%xmm8 + xorps %xmm12,%xmm4 + xorps %xmm3,%xmm0 + xorps %xmm5,%xmm1 + pxor %xmm0,%xmm1 + pxor %xmm4,%xmm8 + + pxor %xmm1,%xmm8 + pxor %xmm0,%xmm1 + + movdqa %xmm8,%xmm9 + psrldq $8,%xmm8 + pslldq $8,%xmm9 + pxor %xmm8,%xmm1 + pxor %xmm9,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + addq $0x40,%rcx + jz .Ldone + movdqu 32(%rsi),%xmm7 + subq $0x10,%rcx + jz .Lodd_tail +.Lskip4x: + + + + + + movdqu (%rdx),%xmm8 + movdqu 16(%rdx),%xmm3 + pshufb %xmm10,%xmm8 + pshufb %xmm10,%xmm3 + pxor %xmm8,%xmm0 + + movdqa %xmm3,%xmm5 + pshufd $78,%xmm3,%xmm4 + pxor %xmm3,%xmm4 + pclmulqdq $0x00,%xmm2,%xmm3 + pclmulqdq $0x11,%xmm2,%xmm5 + pclmulqdq $0x00,%xmm7,%xmm4 + + leaq 32(%rdx),%rdx + nop + subq $0x20,%rcx + jbe .Leven_tail + nop + jmp .Lmod_loop + +.align 32 +.Lmod_loop: + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm8 + pshufd $78,%xmm0,%xmm4 + pxor %xmm0,%xmm4 + + pclmulqdq $0x00,%xmm6,%xmm0 + pclmulqdq $0x11,%xmm6,%xmm1 + pclmulqdq $0x10,%xmm7,%xmm4 + + pxor %xmm3,%xmm0 + pxor %xmm5,%xmm1 + movdqu (%rdx),%xmm9 + pxor %xmm0,%xmm8 + pshufb %xmm10,%xmm9 + movdqu 16(%rdx),%xmm3 + + pxor %xmm1,%xmm8 + pxor %xmm9,%xmm1 + pxor %xmm8,%xmm4 + pshufb %xmm10,%xmm3 + movdqa %xmm4,%xmm8 + psrldq $8,%xmm8 + pslldq $8,%xmm4 + pxor %xmm8,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm3,%xmm5 + + movdqa %xmm0,%xmm9 + movdqa %xmm0,%xmm8 + psllq $5,%xmm0 + pxor %xmm0,%xmm8 + pclmulqdq $0x00,%xmm2,%xmm3 + psllq $1,%xmm0 + pxor %xmm8,%xmm0 + psllq 
$57,%xmm0 + movdqa %xmm0,%xmm8 + pslldq $8,%xmm0 + psrldq $8,%xmm8 + pxor %xmm9,%xmm0 + pshufd $78,%xmm5,%xmm4 + pxor %xmm8,%xmm1 + pxor %xmm5,%xmm4 + + movdqa %xmm0,%xmm9 + psrlq $1,%xmm0 + pclmulqdq $0x11,%xmm2,%xmm5 + pxor %xmm9,%xmm1 + pxor %xmm0,%xmm9 + psrlq $5,%xmm0 + pxor %xmm9,%xmm0 + leaq 32(%rdx),%rdx + psrlq $1,%xmm0 + pclmulqdq $0x00,%xmm7,%xmm4 + pxor %xmm1,%xmm0 + + subq $0x20,%rcx + ja .Lmod_loop + +.Leven_tail: + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm8 + pshufd $78,%xmm0,%xmm4 + pxor %xmm0,%xmm4 + + pclmulqdq $0x00,%xmm6,%xmm0 + pclmulqdq $0x11,%xmm6,%xmm1 + pclmulqdq $0x10,%xmm7,%xmm4 + + pxor %xmm3,%xmm0 + pxor %xmm5,%xmm1 + pxor %xmm0,%xmm8 + pxor %xmm1,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm8 + psrldq $8,%xmm8 + pslldq $8,%xmm4 + pxor %xmm8,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 + testq %rcx,%rcx + jnz .Ldone + +.Lodd_tail: + movdqu (%rdx),%xmm8 + pshufb %xmm10,%xmm8 + pxor %xmm8,%xmm0 + movdqa %xmm0,%xmm1 + pshufd $78,%xmm0,%xmm3 + pxor %xmm0,%xmm3 + pclmulqdq $0x00,%xmm2,%xmm0 + pclmulqdq $0x11,%xmm2,%xmm1 + pclmulqdq $0x00,%xmm7,%xmm3 + pxor %xmm0,%xmm3 + pxor %xmm1,%xmm3 + + movdqa %xmm3,%xmm4 + psrldq $8,%xmm3 + pslldq $8,%xmm4 + pxor %xmm3,%xmm1 + pxor %xmm4,%xmm0 + + movdqa %xmm0,%xmm4 + movdqa %xmm0,%xmm3 + psllq $5,%xmm0 + pxor %xmm0,%xmm3 + psllq $1,%xmm0 + pxor %xmm3,%xmm0 + psllq $57,%xmm0 + movdqa %xmm0,%xmm3 + pslldq $8,%xmm0 + psrldq $8,%xmm3 + pxor %xmm4,%xmm0 + pxor %xmm3,%xmm1 + + + movdqa %xmm0,%xmm4 + psrlq $1,%xmm0 + pxor %xmm4,%xmm1 + pxor %xmm0,%xmm4 + psrlq $5,%xmm0 + pxor %xmm4,%xmm0 + psrlq $1,%xmm0 + pxor %xmm1,%xmm0 +.Ldone: + pshufb %xmm10,%xmm0 + 
movdqu %xmm0,(%rdi) + ret +.cfi_endproc + +.size gcm_ghash_clmul,.-gcm_ghash_clmul +.globl gcm_init_avx +.hidden gcm_init_avx +.type gcm_init_avx,@function +.align 32 +gcm_init_avx: +.cfi_startproc + +_CET_ENDBR + vzeroupper + + vmovdqu (%rsi),%xmm2 + vpshufd $78,%xmm2,%xmm2 + + + vpshufd $255,%xmm2,%xmm4 + vpsrlq $63,%xmm2,%xmm3 + vpsllq $1,%xmm2,%xmm2 + vpxor %xmm5,%xmm5,%xmm5 + vpcmpgtd %xmm4,%xmm5,%xmm5 + vpslldq $8,%xmm3,%xmm3 + vpor %xmm3,%xmm2,%xmm2 + + + vpand .L0x1c2_polynomial(%rip),%xmm5,%xmm5 + vpxor %xmm5,%xmm2,%xmm2 + + vpunpckhqdq %xmm2,%xmm2,%xmm6 + vmovdqa %xmm2,%xmm0 + vpxor %xmm2,%xmm6,%xmm6 + movq $4,%r10 + jmp .Linit_start_avx +.align 32 +.Linit_loop_avx: + vpalignr $8,%xmm3,%xmm4,%xmm5 + vmovdqu %xmm5,-16(%rdi) + vpunpckhqdq %xmm0,%xmm0,%xmm3 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 + vpxor %xmm0,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + + vpslldq $8,%xmm3,%xmm4 + vpsrldq $8,%xmm3,%xmm3 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm3,%xmm1,%xmm1 + vpsllq $57,%xmm0,%xmm3 + vpsllq $62,%xmm0,%xmm4 + vpxor %xmm3,%xmm4,%xmm4 + vpsllq $63,%xmm0,%xmm3 + vpxor %xmm3,%xmm4,%xmm4 + vpslldq $8,%xmm4,%xmm3 + vpsrldq $8,%xmm4,%xmm4 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm4,%xmm1,%xmm1 + + vpsrlq $1,%xmm0,%xmm4 + vpxor %xmm0,%xmm1,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $5,%xmm4,%xmm4 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $1,%xmm0,%xmm0 + vpxor %xmm1,%xmm0,%xmm0 +.Linit_start_avx: + vmovdqa %xmm0,%xmm5 + vpunpckhqdq %xmm0,%xmm0,%xmm3 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 + vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 + vpxor %xmm0,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + + vpslldq $8,%xmm3,%xmm4 + vpsrldq $8,%xmm3,%xmm3 + vpxor %xmm4,%xmm0,%xmm0 + vpxor %xmm3,%xmm1,%xmm1 + vpsllq $57,%xmm0,%xmm3 + vpsllq $62,%xmm0,%xmm4 + vpxor %xmm3,%xmm4,%xmm4 + vpsllq $63,%xmm0,%xmm3 + vpxor %xmm3,%xmm4,%xmm4 + vpslldq $8,%xmm4,%xmm3 + 
vpsrldq $8,%xmm4,%xmm4 + vpxor %xmm3,%xmm0,%xmm0 + vpxor %xmm4,%xmm1,%xmm1 + + vpsrlq $1,%xmm0,%xmm4 + vpxor %xmm0,%xmm1,%xmm1 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $5,%xmm4,%xmm4 + vpxor %xmm4,%xmm0,%xmm0 + vpsrlq $1,%xmm0,%xmm0 + vpxor %xmm1,%xmm0,%xmm0 + vpshufd $78,%xmm5,%xmm3 + vpshufd $78,%xmm0,%xmm4 + vpxor %xmm5,%xmm3,%xmm3 + vmovdqu %xmm5,0(%rdi) + vpxor %xmm0,%xmm4,%xmm4 + vmovdqu %xmm0,16(%rdi) + leaq 48(%rdi),%rdi + subq $1,%r10 + jnz .Linit_loop_avx + + vpalignr $8,%xmm4,%xmm3,%xmm5 + vmovdqu %xmm5,-16(%rdi) + + vzeroupper + ret + +.cfi_endproc +.size gcm_init_avx,.-gcm_init_avx +.globl gcm_gmult_avx +.hidden gcm_gmult_avx +.type gcm_gmult_avx,@function +.align 32 +gcm_gmult_avx: +.cfi_startproc +_CET_ENDBR + jmp .L_gmult_clmul +.cfi_endproc +.size gcm_gmult_avx,.-gcm_gmult_avx +.globl gcm_ghash_avx +.hidden gcm_ghash_avx +.type gcm_ghash_avx,@function +.align 32 +gcm_ghash_avx: +.cfi_startproc + +_CET_ENDBR + vzeroupper + + vmovdqu (%rdi),%xmm10 + leaq .L0x1c2_polynomial(%rip),%r10 + leaq 64(%rsi),%rsi + vmovdqu .Lbswap_mask(%rip),%xmm13 + vpshufb %xmm13,%xmm10,%xmm10 + cmpq $0x80,%rcx + jb .Lshort_avx + subq $0x80,%rcx + + vmovdqu 112(%rdx),%xmm14 + vmovdqu 0-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm14 + vmovdqu 32-64(%rsi),%xmm7 + + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vmovdqu 96(%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm14,%xmm9,%xmm9 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 16-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vmovdqu 80(%rdx),%xmm14 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + + vpshufb %xmm13,%xmm14,%xmm14 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 48-64(%rsi),%xmm6 + vpxor %xmm14,%xmm9,%xmm9 + vmovdqu 64(%rdx),%xmm15 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 80-64(%rsi),%xmm7 + + vpshufb %xmm13,%xmm15,%xmm15 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + 
vpxor %xmm1,%xmm4,%xmm4 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 64-64(%rsi),%xmm6 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + + vmovdqu 48(%rdx),%xmm14 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpxor %xmm4,%xmm1,%xmm1 + vpshufb %xmm13,%xmm14,%xmm14 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 96-64(%rsi),%xmm6 + vpxor %xmm5,%xmm2,%xmm2 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 128-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + + vmovdqu 32(%rdx),%xmm15 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm1,%xmm4,%xmm4 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 112-64(%rsi),%xmm6 + vpxor %xmm2,%xmm5,%xmm5 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + + vmovdqu 16(%rdx),%xmm14 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpxor %xmm4,%xmm1,%xmm1 + vpshufb %xmm13,%xmm14,%xmm14 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 144-64(%rsi),%xmm6 + vpxor %xmm5,%xmm2,%xmm2 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 176-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + + vmovdqu (%rdx),%xmm15 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm1,%xmm4,%xmm4 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 160-64(%rsi),%xmm6 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 + + leaq 128(%rdx),%rdx + cmpq $0x80,%rcx + jb .Ltail_avx + + vpxor %xmm10,%xmm15,%xmm15 + subq $0x80,%rcx + jmp .Loop8x_avx + +.align 32 +.Loop8x_avx: + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vmovdqu 112(%rdx),%xmm14 + vpxor %xmm0,%xmm3,%xmm3 + vpxor %xmm15,%xmm8,%xmm8 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10 + vpshufb %xmm13,%xmm14,%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11 + vmovdqu 0-64(%rsi),%xmm6 + 
vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12 + vmovdqu 32-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + + vmovdqu 96(%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpxor %xmm3,%xmm10,%xmm10 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vxorps %xmm4,%xmm11,%xmm11 + vmovdqu 16-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm5,%xmm12,%xmm12 + vxorps %xmm15,%xmm8,%xmm8 + + vmovdqu 80(%rdx),%xmm14 + vpxor %xmm10,%xmm12,%xmm12 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpxor %xmm11,%xmm12,%xmm12 + vpslldq $8,%xmm12,%xmm9 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vpsrldq $8,%xmm12,%xmm12 + vpxor %xmm9,%xmm10,%xmm10 + vmovdqu 48-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm14 + vxorps %xmm12,%xmm11,%xmm11 + vpxor %xmm1,%xmm4,%xmm4 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 80-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu 64(%rdx),%xmm15 + vpalignr $8,%xmm10,%xmm10,%xmm12 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpshufb %xmm13,%xmm15,%xmm15 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 64-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vxorps %xmm15,%xmm8,%xmm8 + vpxor %xmm5,%xmm2,%xmm2 + + vmovdqu 48(%rdx),%xmm14 + vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpshufb %xmm13,%xmm14,%xmm14 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 96-64(%rsi),%xmm6 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 128-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu 32(%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpshufb %xmm13,%xmm15,%xmm15 + vpxor %xmm3,%xmm0,%xmm0 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 
112-64(%rsi),%xmm6 + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 + vpxor %xmm15,%xmm8,%xmm8 + vpxor %xmm5,%xmm2,%xmm2 + vxorps %xmm12,%xmm10,%xmm10 + + vmovdqu 16(%rdx),%xmm14 + vpalignr $8,%xmm10,%xmm10,%xmm12 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 + vpshufb %xmm13,%xmm14,%xmm14 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 + vmovdqu 144-64(%rsi),%xmm6 + vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 + vxorps %xmm11,%xmm12,%xmm12 + vpunpckhqdq %xmm14,%xmm14,%xmm9 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 + vmovdqu 176-64(%rsi),%xmm7 + vpxor %xmm14,%xmm9,%xmm9 + vpxor %xmm2,%xmm5,%xmm5 + + vmovdqu (%rdx),%xmm15 + vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 + vpshufb %xmm13,%xmm15,%xmm15 + vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 + vmovdqu 160-64(%rsi),%xmm6 + vpxor %xmm12,%xmm15,%xmm15 + vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 + vpxor %xmm10,%xmm15,%xmm15 + + leaq 128(%rdx),%rdx + subq $0x80,%rcx + jnc .Loop8x_avx + + addq $0x80,%rcx + jmp .Ltail_no_xor_avx + +.align 32 +.Lshort_avx: + vmovdqu -16(%rdx,%rcx,1),%xmm14 + leaq (%rdx,%rcx,1),%rdx + vmovdqu 0-64(%rsi),%xmm6 + vmovdqu 32-64(%rsi),%xmm7 + vpshufb %xmm13,%xmm14,%xmm15 + + vmovdqa %xmm0,%xmm3 + vmovdqa %xmm1,%xmm4 + vmovdqa %xmm2,%xmm5 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -32(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 16-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vpsrldq $8,%xmm7,%xmm7 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -48(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 48-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor 
%xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vmovdqu 80-64(%rsi),%xmm7 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -64(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 64-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vpsrldq $8,%xmm7,%xmm7 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -80(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 96-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vmovdqu 128-64(%rsi),%xmm7 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -96(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 112-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vpsrldq $8,%xmm7,%xmm7 + subq $0x10,%rcx + jz .Ltail_avx + + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vmovdqu -112(%rdx),%xmm14 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vmovdqu 144-64(%rsi),%xmm6 + vpshufb %xmm13,%xmm14,%xmm15 + vpxor %xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + vmovq 184-64(%rsi),%xmm7 + subq $0x10,%rcx + jmp .Ltail_avx + +.align 32 +.Ltail_avx: + vpxor %xmm10,%xmm15,%xmm15 +.Ltail_no_xor_avx: + vpunpckhqdq %xmm15,%xmm15,%xmm8 + vpxor %xmm0,%xmm3,%xmm3 + vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 + vpxor %xmm15,%xmm8,%xmm8 + vpxor %xmm1,%xmm4,%xmm4 + vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 + vpxor 
%xmm2,%xmm5,%xmm5 + vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 + + vmovdqu (%r10),%xmm12 + + vpxor %xmm0,%xmm3,%xmm10 + vpxor %xmm1,%xmm4,%xmm11 + vpxor %xmm2,%xmm5,%xmm5 + + vpxor %xmm10,%xmm5,%xmm5 + vpxor %xmm11,%xmm5,%xmm5 + vpslldq $8,%xmm5,%xmm9 + vpsrldq $8,%xmm5,%xmm5 + vpxor %xmm9,%xmm10,%xmm10 + vpxor %xmm5,%xmm11,%xmm11 + + vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 + vpalignr $8,%xmm10,%xmm10,%xmm10 + vpxor %xmm9,%xmm10,%xmm10 + + vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 + vpalignr $8,%xmm10,%xmm10,%xmm10 + vpxor %xmm11,%xmm10,%xmm10 + vpxor %xmm9,%xmm10,%xmm10 + + cmpq $0,%rcx + jne .Lshort_avx + + vpshufb %xmm13,%xmm10,%xmm10 + vmovdqu %xmm10,(%rdi) + vzeroupper + ret +.cfi_endproc + +.size gcm_ghash_avx,.-gcm_ghash_avx +.section .rodata +.align 64 +.Lbswap_mask: +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.L0x1c2_polynomial: +.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +.L7_mask: +.long 7,0,7,0 +.align 64 + +.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 64 +.text +#endif diff --git a/third_party/boringssl/gen/bcm/ghash-x86_64-win.asm b/third_party/boringssl/gen/bcm/ghash-x86_64-win.asm new file mode 100644 index 00000000..dd732483 --- /dev/null +++ b/third_party/boringssl/gen/bcm/ghash-x86_64-win.asm @@ -0,0 +1,1342 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_internal_x86_64_win_asm.inc" +%endif +section .text code align=64 + +global gcm_init_clmul + +ALIGN 16 +gcm_init_clmul: + +$L$SEH_begin_gcm_init_clmul_1: +_CET_ENDBR +$L$_init_clmul: + sub rsp,0x18 +$L$SEH_prologue_gcm_init_clmul_2: + movaps XMMWORD[rsp],xmm6 +$L$SEH_prologue_gcm_init_clmul_3: +$L$SEH_endprologue_gcm_init_clmul_4: + movdqu xmm2,XMMWORD[rdx] + pshufd xmm2,xmm2,78 + + + pshufd xmm4,xmm2,255 + movdqa xmm3,xmm2 + psllq xmm2,1 + pxor xmm5,xmm5 + psrlq xmm3,63 + pcmpgtd xmm5,xmm4 + pslldq xmm3,8 + por xmm2,xmm3 + + + pand xmm5,XMMWORD[$L$0x1c2_polynomial] + pxor xmm2,xmm5 + + + pshufd xmm6,xmm2,78 + movdqa xmm0,xmm2 + pxor xmm6,xmm2 + movdqa xmm1,xmm0 + pshufd xmm3,xmm0,78 + pxor xmm3,xmm0 + pclmulqdq xmm0,xmm2,0x00 + pclmulqdq xmm1,xmm2,0x11 + pclmulqdq xmm3,xmm6,0x00 + pxor xmm3,xmm0 + pxor xmm3,xmm1 + + movdqa xmm4,xmm3 + psrldq xmm3,8 + pslldq xmm4,8 + pxor xmm1,xmm3 + pxor xmm0,xmm4 + + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + pxor xmm0,xmm3 + psllq xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + + + movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor xmm1,xmm4 + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 + pshufd xmm3,xmm2,78 + pshufd xmm4,xmm0,78 + pxor xmm3,xmm2 + movdqu XMMWORD[rcx],xmm2 + pxor xmm4,xmm0 + movdqu XMMWORD[16+rcx],xmm0 + palignr xmm4,xmm3,8 + movdqu XMMWORD[32+rcx],xmm4 + movdqa xmm1,xmm0 + pshufd xmm3,xmm0,78 + pxor xmm3,xmm0 + pclmulqdq xmm0,xmm2,0x00 + pclmulqdq xmm1,xmm2,0x11 + pclmulqdq xmm3,xmm6,0x00 + pxor xmm3,xmm0 + pxor xmm3,xmm1 + + movdqa xmm4,xmm3 + psrldq xmm3,8 + pslldq xmm4,8 + pxor xmm1,xmm3 + pxor xmm0,xmm4 + + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + pxor xmm0,xmm3 + psllq 
xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + + + movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor xmm1,xmm4 + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 + movdqa xmm5,xmm0 + movdqa xmm1,xmm0 + pshufd xmm3,xmm0,78 + pxor xmm3,xmm0 + pclmulqdq xmm0,xmm2,0x00 + pclmulqdq xmm1,xmm2,0x11 + pclmulqdq xmm3,xmm6,0x00 + pxor xmm3,xmm0 + pxor xmm3,xmm1 + + movdqa xmm4,xmm3 + psrldq xmm3,8 + pslldq xmm4,8 + pxor xmm1,xmm3 + pxor xmm0,xmm4 + + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + pxor xmm0,xmm3 + psllq xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + + + movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor xmm1,xmm4 + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 + pshufd xmm3,xmm5,78 + pshufd xmm4,xmm0,78 + pxor xmm3,xmm5 + movdqu XMMWORD[48+rcx],xmm5 + pxor xmm4,xmm0 + movdqu XMMWORD[64+rcx],xmm0 + palignr xmm4,xmm3,8 + movdqu XMMWORD[80+rcx],xmm4 + movaps xmm6,XMMWORD[rsp] + lea rsp,[24+rsp] + ret + +$L$SEH_end_gcm_init_clmul_5: + +global gcm_gmult_clmul + +ALIGN 16 +gcm_gmult_clmul: + +_CET_ENDBR +$L$_gmult_clmul: + movdqu xmm0,XMMWORD[rcx] + movdqa xmm5,XMMWORD[$L$bswap_mask] + movdqu xmm2,XMMWORD[rdx] + movdqu xmm4,XMMWORD[32+rdx] + pshufb xmm0,xmm5 + movdqa xmm1,xmm0 + pshufd xmm3,xmm0,78 + pxor xmm3,xmm0 + pclmulqdq xmm0,xmm2,0x00 + pclmulqdq xmm1,xmm2,0x11 + pclmulqdq xmm3,xmm4,0x00 + pxor xmm3,xmm0 + pxor xmm3,xmm1 + + movdqa xmm4,xmm3 + psrldq xmm3,8 + pslldq xmm4,8 + pxor xmm1,xmm3 + pxor xmm0,xmm4 + + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + pxor xmm0,xmm3 + psllq xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + + + movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor xmm1,xmm4 + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 + pshufb xmm0,xmm5 + movdqu XMMWORD[rcx],xmm0 + 
ret + + +global gcm_ghash_clmul + +ALIGN 32 +gcm_ghash_clmul: + +$L$SEH_begin_gcm_ghash_clmul_1: +_CET_ENDBR +$L$_ghash_clmul: + lea rax,[((-136))+rsp] + lea rsp,[((-32))+rax] +$L$SEH_prologue_gcm_ghash_clmul_2: + movaps XMMWORD[(-32)+rax],xmm6 +$L$SEH_prologue_gcm_ghash_clmul_3: + movaps XMMWORD[(-16)+rax],xmm7 +$L$SEH_prologue_gcm_ghash_clmul_4: + movaps XMMWORD[rax],xmm8 +$L$SEH_prologue_gcm_ghash_clmul_5: + movaps XMMWORD[16+rax],xmm9 +$L$SEH_prologue_gcm_ghash_clmul_6: + movaps XMMWORD[32+rax],xmm10 +$L$SEH_prologue_gcm_ghash_clmul_7: + movaps XMMWORD[48+rax],xmm11 +$L$SEH_prologue_gcm_ghash_clmul_8: + movaps XMMWORD[64+rax],xmm12 +$L$SEH_prologue_gcm_ghash_clmul_9: + movaps XMMWORD[80+rax],xmm13 +$L$SEH_prologue_gcm_ghash_clmul_10: + movaps XMMWORD[96+rax],xmm14 +$L$SEH_prologue_gcm_ghash_clmul_11: + movaps XMMWORD[112+rax],xmm15 +$L$SEH_prologue_gcm_ghash_clmul_12: +$L$SEH_endprologue_gcm_ghash_clmul_13: + movdqa xmm10,XMMWORD[$L$bswap_mask] + + movdqu xmm0,XMMWORD[rcx] + movdqu xmm2,XMMWORD[rdx] + movdqu xmm7,XMMWORD[32+rdx] + pshufb xmm0,xmm10 + + sub r9,0x10 + jz NEAR $L$odd_tail + + movdqu xmm6,XMMWORD[16+rdx] + cmp r9,0x30 + jb NEAR $L$skip4x + + sub r9,0x30 + mov rax,0xA040608020C0E000 + movdqu xmm14,XMMWORD[48+rdx] + movdqu xmm15,XMMWORD[64+rdx] + + + + + movdqu xmm3,XMMWORD[48+r8] + movdqu xmm11,XMMWORD[32+r8] + pshufb xmm3,xmm10 + pshufb xmm11,xmm10 + movdqa xmm5,xmm3 + pshufd xmm4,xmm3,78 + pxor xmm4,xmm3 + pclmulqdq xmm3,xmm2,0x00 + pclmulqdq xmm5,xmm2,0x11 + pclmulqdq xmm4,xmm7,0x00 + + movdqa xmm13,xmm11 + pshufd xmm12,xmm11,78 + pxor xmm12,xmm11 + pclmulqdq xmm11,xmm6,0x00 + pclmulqdq xmm13,xmm6,0x11 + pclmulqdq xmm12,xmm7,0x10 + xorps xmm3,xmm11 + xorps xmm5,xmm13 + movups xmm7,XMMWORD[80+rdx] + xorps xmm4,xmm12 + + movdqu xmm11,XMMWORD[16+r8] + movdqu xmm8,XMMWORD[r8] + pshufb xmm11,xmm10 + pshufb xmm8,xmm10 + movdqa xmm13,xmm11 + pshufd xmm12,xmm11,78 + pxor xmm0,xmm8 + pxor xmm12,xmm11 + pclmulqdq xmm11,xmm14,0x00 + movdqa xmm1,xmm0 + 
pshufd xmm8,xmm0,78 + pxor xmm8,xmm0 + pclmulqdq xmm13,xmm14,0x11 + pclmulqdq xmm12,xmm7,0x00 + xorps xmm3,xmm11 + xorps xmm5,xmm13 + + lea r8,[64+r8] + sub r9,0x40 + jc NEAR $L$tail4x + + jmp NEAR $L$mod4_loop +ALIGN 32 +$L$mod4_loop: + pclmulqdq xmm0,xmm15,0x00 + xorps xmm4,xmm12 + movdqu xmm11,XMMWORD[48+r8] + pshufb xmm11,xmm10 + pclmulqdq xmm1,xmm15,0x11 + xorps xmm0,xmm3 + movdqu xmm3,XMMWORD[32+r8] + movdqa xmm13,xmm11 + pclmulqdq xmm8,xmm7,0x10 + pshufd xmm12,xmm11,78 + xorps xmm1,xmm5 + pxor xmm12,xmm11 + pshufb xmm3,xmm10 + movups xmm7,XMMWORD[32+rdx] + xorps xmm8,xmm4 + pclmulqdq xmm11,xmm2,0x00 + pshufd xmm4,xmm3,78 + + pxor xmm8,xmm0 + movdqa xmm5,xmm3 + pxor xmm8,xmm1 + pxor xmm4,xmm3 + movdqa xmm9,xmm8 + pclmulqdq xmm13,xmm2,0x11 + pslldq xmm8,8 + psrldq xmm9,8 + pxor xmm0,xmm8 + movdqa xmm8,XMMWORD[$L$7_mask] + pxor xmm1,xmm9 + movq xmm9,rax + + pand xmm8,xmm0 + pshufb xmm9,xmm8 + pxor xmm9,xmm0 + pclmulqdq xmm12,xmm7,0x00 + psllq xmm9,57 + movdqa xmm8,xmm9 + pslldq xmm9,8 + pclmulqdq xmm3,xmm6,0x00 + psrldq xmm8,8 + pxor xmm0,xmm9 + pxor xmm1,xmm8 + movdqu xmm8,XMMWORD[r8] + + movdqa xmm9,xmm0 + psrlq xmm0,1 + pclmulqdq xmm5,xmm6,0x11 + xorps xmm3,xmm11 + movdqu xmm11,XMMWORD[16+r8] + pshufb xmm11,xmm10 + pclmulqdq xmm4,xmm7,0x10 + xorps xmm5,xmm13 + movups xmm7,XMMWORD[80+rdx] + pshufb xmm8,xmm10 + pxor xmm1,xmm9 + pxor xmm9,xmm0 + psrlq xmm0,5 + + movdqa xmm13,xmm11 + pxor xmm4,xmm12 + pshufd xmm12,xmm11,78 + pxor xmm0,xmm9 + pxor xmm1,xmm8 + pxor xmm12,xmm11 + pclmulqdq xmm11,xmm14,0x00 + psrlq xmm0,1 + pxor xmm0,xmm1 + movdqa xmm1,xmm0 + pclmulqdq xmm13,xmm14,0x11 + xorps xmm3,xmm11 + pshufd xmm8,xmm0,78 + pxor xmm8,xmm0 + + pclmulqdq xmm12,xmm7,0x00 + xorps xmm5,xmm13 + + lea r8,[64+r8] + sub r9,0x40 + jnc NEAR $L$mod4_loop + +$L$tail4x: + pclmulqdq xmm0,xmm15,0x00 + pclmulqdq xmm1,xmm15,0x11 + pclmulqdq xmm8,xmm7,0x10 + xorps xmm4,xmm12 + xorps xmm0,xmm3 + xorps xmm1,xmm5 + pxor xmm1,xmm0 + pxor xmm8,xmm4 + + pxor xmm8,xmm1 + pxor xmm1,xmm0 + 
+ movdqa xmm9,xmm8 + psrldq xmm8,8 + pslldq xmm9,8 + pxor xmm1,xmm8 + pxor xmm0,xmm9 + + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + pxor xmm0,xmm3 + psllq xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + + + movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor xmm1,xmm4 + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 + add r9,0x40 + jz NEAR $L$done + movdqu xmm7,XMMWORD[32+rdx] + sub r9,0x10 + jz NEAR $L$odd_tail +$L$skip4x: + + + + + + movdqu xmm8,XMMWORD[r8] + movdqu xmm3,XMMWORD[16+r8] + pshufb xmm8,xmm10 + pshufb xmm3,xmm10 + pxor xmm0,xmm8 + + movdqa xmm5,xmm3 + pshufd xmm4,xmm3,78 + pxor xmm4,xmm3 + pclmulqdq xmm3,xmm2,0x00 + pclmulqdq xmm5,xmm2,0x11 + pclmulqdq xmm4,xmm7,0x00 + + lea r8,[32+r8] + nop + sub r9,0x20 + jbe NEAR $L$even_tail + nop + jmp NEAR $L$mod_loop + +ALIGN 32 +$L$mod_loop: + movdqa xmm1,xmm0 + movdqa xmm8,xmm4 + pshufd xmm4,xmm0,78 + pxor xmm4,xmm0 + + pclmulqdq xmm0,xmm6,0x00 + pclmulqdq xmm1,xmm6,0x11 + pclmulqdq xmm4,xmm7,0x10 + + pxor xmm0,xmm3 + pxor xmm1,xmm5 + movdqu xmm9,XMMWORD[r8] + pxor xmm8,xmm0 + pshufb xmm9,xmm10 + movdqu xmm3,XMMWORD[16+r8] + + pxor xmm8,xmm1 + pxor xmm1,xmm9 + pxor xmm4,xmm8 + pshufb xmm3,xmm10 + movdqa xmm8,xmm4 + psrldq xmm8,8 + pslldq xmm4,8 + pxor xmm1,xmm8 + pxor xmm0,xmm4 + + movdqa xmm5,xmm3 + + movdqa xmm9,xmm0 + movdqa xmm8,xmm0 + psllq xmm0,5 + pxor xmm8,xmm0 + pclmulqdq xmm3,xmm2,0x00 + psllq xmm0,1 + pxor xmm0,xmm8 + psllq xmm0,57 + movdqa xmm8,xmm0 + pslldq xmm0,8 + psrldq xmm8,8 + pxor xmm0,xmm9 + pshufd xmm4,xmm5,78 + pxor xmm1,xmm8 + pxor xmm4,xmm5 + + movdqa xmm9,xmm0 + psrlq xmm0,1 + pclmulqdq xmm5,xmm2,0x11 + pxor xmm1,xmm9 + pxor xmm9,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm9 + lea r8,[32+r8] + psrlq xmm0,1 + pclmulqdq xmm4,xmm7,0x00 + pxor xmm0,xmm1 + + sub r9,0x20 + ja NEAR $L$mod_loop + +$L$even_tail: + movdqa xmm1,xmm0 + movdqa xmm8,xmm4 + pshufd xmm4,xmm0,78 + pxor xmm4,xmm0 + + 
pclmulqdq xmm0,xmm6,0x00 + pclmulqdq xmm1,xmm6,0x11 + pclmulqdq xmm4,xmm7,0x10 + + pxor xmm0,xmm3 + pxor xmm1,xmm5 + pxor xmm8,xmm0 + pxor xmm8,xmm1 + pxor xmm4,xmm8 + movdqa xmm8,xmm4 + psrldq xmm8,8 + pslldq xmm4,8 + pxor xmm1,xmm8 + pxor xmm0,xmm4 + + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + pxor xmm0,xmm3 + psllq xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + + + movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor xmm1,xmm4 + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 + test r9,r9 + jnz NEAR $L$done + +$L$odd_tail: + movdqu xmm8,XMMWORD[r8] + pshufb xmm8,xmm10 + pxor xmm0,xmm8 + movdqa xmm1,xmm0 + pshufd xmm3,xmm0,78 + pxor xmm3,xmm0 + pclmulqdq xmm0,xmm2,0x00 + pclmulqdq xmm1,xmm2,0x11 + pclmulqdq xmm3,xmm7,0x00 + pxor xmm3,xmm0 + pxor xmm3,xmm1 + + movdqa xmm4,xmm3 + psrldq xmm3,8 + pslldq xmm4,8 + pxor xmm1,xmm3 + pxor xmm0,xmm4 + + movdqa xmm4,xmm0 + movdqa xmm3,xmm0 + psllq xmm0,5 + pxor xmm3,xmm0 + psllq xmm0,1 + pxor xmm0,xmm3 + psllq xmm0,57 + movdqa xmm3,xmm0 + pslldq xmm0,8 + psrldq xmm3,8 + pxor xmm0,xmm4 + pxor xmm1,xmm3 + + + movdqa xmm4,xmm0 + psrlq xmm0,1 + pxor xmm1,xmm4 + pxor xmm4,xmm0 + psrlq xmm0,5 + pxor xmm0,xmm4 + psrlq xmm0,1 + pxor xmm0,xmm1 +$L$done: + pshufb xmm0,xmm10 + movdqu XMMWORD[rcx],xmm0 + movaps xmm6,XMMWORD[rsp] + movaps xmm7,XMMWORD[16+rsp] + movaps xmm8,XMMWORD[32+rsp] + movaps xmm9,XMMWORD[48+rsp] + movaps xmm10,XMMWORD[64+rsp] + movaps xmm11,XMMWORD[80+rsp] + movaps xmm12,XMMWORD[96+rsp] + movaps xmm13,XMMWORD[112+rsp] + movaps xmm14,XMMWORD[128+rsp] + movaps xmm15,XMMWORD[144+rsp] + lea rsp,[168+rsp] + ret + +$L$SEH_end_gcm_ghash_clmul_14: + +global gcm_init_avx + +ALIGN 32 +gcm_init_avx: + +$L$SEH_begin_gcm_init_avx_1: +_CET_ENDBR + sub rsp,0x18 +$L$SEH_prologue_gcm_init_avx_2: + movaps XMMWORD[rsp],xmm6 +$L$SEH_prologue_gcm_init_avx_3: +$L$SEH_endprologue_gcm_init_avx_4: + vzeroupper + + vmovdqu 
xmm2,XMMWORD[rdx] + vpshufd xmm2,xmm2,78 + + + vpshufd xmm4,xmm2,255 + vpsrlq xmm3,xmm2,63 + vpsllq xmm2,xmm2,1 + vpxor xmm5,xmm5,xmm5 + vpcmpgtd xmm5,xmm5,xmm4 + vpslldq xmm3,xmm3,8 + vpor xmm2,xmm2,xmm3 + + + vpand xmm5,xmm5,XMMWORD[$L$0x1c2_polynomial] + vpxor xmm2,xmm2,xmm5 + + vpunpckhqdq xmm6,xmm2,xmm2 + vmovdqa xmm0,xmm2 + vpxor xmm6,xmm6,xmm2 + mov r10,4 + jmp NEAR $L$init_start_avx +ALIGN 32 +$L$init_loop_avx: + vpalignr xmm5,xmm4,xmm3,8 + vmovdqu XMMWORD[(-16)+rcx],xmm5 + vpunpckhqdq xmm3,xmm0,xmm0 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm1,xmm0,xmm2,0x11 + vpclmulqdq xmm0,xmm0,xmm2,0x00 + vpclmulqdq xmm3,xmm3,xmm6,0x00 + vpxor xmm4,xmm1,xmm0 + vpxor xmm3,xmm3,xmm4 + + vpslldq xmm4,xmm3,8 + vpsrldq xmm3,xmm3,8 + vpxor xmm0,xmm0,xmm4 + vpxor xmm1,xmm1,xmm3 + vpsllq xmm3,xmm0,57 + vpsllq xmm4,xmm0,62 + vpxor xmm4,xmm4,xmm3 + vpsllq xmm3,xmm0,63 + vpxor xmm4,xmm4,xmm3 + vpslldq xmm3,xmm4,8 + vpsrldq xmm4,xmm4,8 + vpxor xmm0,xmm0,xmm3 + vpxor xmm1,xmm1,xmm4 + + vpsrlq xmm4,xmm0,1 + vpxor xmm1,xmm1,xmm0 + vpxor xmm0,xmm0,xmm4 + vpsrlq xmm4,xmm4,5 + vpxor xmm0,xmm0,xmm4 + vpsrlq xmm0,xmm0,1 + vpxor xmm0,xmm0,xmm1 +$L$init_start_avx: + vmovdqa xmm5,xmm0 + vpunpckhqdq xmm3,xmm0,xmm0 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm1,xmm0,xmm2,0x11 + vpclmulqdq xmm0,xmm0,xmm2,0x00 + vpclmulqdq xmm3,xmm3,xmm6,0x00 + vpxor xmm4,xmm1,xmm0 + vpxor xmm3,xmm3,xmm4 + + vpslldq xmm4,xmm3,8 + vpsrldq xmm3,xmm3,8 + vpxor xmm0,xmm0,xmm4 + vpxor xmm1,xmm1,xmm3 + vpsllq xmm3,xmm0,57 + vpsllq xmm4,xmm0,62 + vpxor xmm4,xmm4,xmm3 + vpsllq xmm3,xmm0,63 + vpxor xmm4,xmm4,xmm3 + vpslldq xmm3,xmm4,8 + vpsrldq xmm4,xmm4,8 + vpxor xmm0,xmm0,xmm3 + vpxor xmm1,xmm1,xmm4 + + vpsrlq xmm4,xmm0,1 + vpxor xmm1,xmm1,xmm0 + vpxor xmm0,xmm0,xmm4 + vpsrlq xmm4,xmm4,5 + vpxor xmm0,xmm0,xmm4 + vpsrlq xmm0,xmm0,1 + vpxor xmm0,xmm0,xmm1 + vpshufd xmm3,xmm5,78 + vpshufd xmm4,xmm0,78 + vpxor xmm3,xmm3,xmm5 + vmovdqu XMMWORD[rcx],xmm5 + vpxor xmm4,xmm4,xmm0 + vmovdqu XMMWORD[16+rcx],xmm0 + lea rcx,[48+rcx] + sub 
r10,1 + jnz NEAR $L$init_loop_avx + + vpalignr xmm5,xmm3,xmm4,8 + vmovdqu XMMWORD[(-16)+rcx],xmm5 + + vzeroupper + movaps xmm6,XMMWORD[rsp] + lea rsp,[24+rsp] + ret +$L$SEH_end_gcm_init_avx_5: + + +global gcm_gmult_avx + +ALIGN 32 +gcm_gmult_avx: + +_CET_ENDBR + jmp NEAR $L$_gmult_clmul + + +global gcm_ghash_avx + +ALIGN 32 +gcm_ghash_avx: + +$L$SEH_begin_gcm_ghash_avx_1: +_CET_ENDBR + lea rax,[((-136))+rsp] + lea rsp,[((-32))+rax] +$L$SEH_prologue_gcm_ghash_avx_2: + movaps XMMWORD[(-32)+rax],xmm6 +$L$SEH_prologue_gcm_ghash_avx_3: + movaps XMMWORD[(-16)+rax],xmm7 +$L$SEH_prologue_gcm_ghash_avx_4: + movaps XMMWORD[rax],xmm8 +$L$SEH_prologue_gcm_ghash_avx_5: + movaps XMMWORD[16+rax],xmm9 +$L$SEH_prologue_gcm_ghash_avx_6: + movaps XMMWORD[32+rax],xmm10 +$L$SEH_prologue_gcm_ghash_avx_7: + movaps XMMWORD[48+rax],xmm11 +$L$SEH_prologue_gcm_ghash_avx_8: + movaps XMMWORD[64+rax],xmm12 +$L$SEH_prologue_gcm_ghash_avx_9: + movaps XMMWORD[80+rax],xmm13 +$L$SEH_prologue_gcm_ghash_avx_10: + movaps XMMWORD[96+rax],xmm14 +$L$SEH_prologue_gcm_ghash_avx_11: + movaps XMMWORD[112+rax],xmm15 +$L$SEH_prologue_gcm_ghash_avx_12: +$L$SEH_endprologue_gcm_ghash_avx_13: + vzeroupper + + vmovdqu xmm10,XMMWORD[rcx] + lea r10,[$L$0x1c2_polynomial] + lea rdx,[64+rdx] + vmovdqu xmm13,XMMWORD[$L$bswap_mask] + vpshufb xmm10,xmm10,xmm13 + cmp r9,0x80 + jb NEAR $L$short_avx + sub r9,0x80 + + vmovdqu xmm14,XMMWORD[112+r8] + vmovdqu xmm6,XMMWORD[((0-64))+rdx] + vpshufb xmm14,xmm14,xmm13 + vmovdqu xmm7,XMMWORD[((32-64))+rdx] + + vpunpckhqdq xmm9,xmm14,xmm14 + vmovdqu xmm15,XMMWORD[96+r8] + vpclmulqdq xmm0,xmm14,xmm6,0x00 + vpxor xmm9,xmm9,xmm14 + vpshufb xmm15,xmm15,xmm13 + vpclmulqdq xmm1,xmm14,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((16-64))+rdx] + vpunpckhqdq xmm8,xmm15,xmm15 + vmovdqu xmm14,XMMWORD[80+r8] + vpclmulqdq xmm2,xmm9,xmm7,0x00 + vpxor xmm8,xmm8,xmm15 + + vpshufb xmm14,xmm14,xmm13 + vpclmulqdq xmm3,xmm15,xmm6,0x00 + vpunpckhqdq xmm9,xmm14,xmm14 + vpclmulqdq xmm4,xmm15,xmm6,0x11 + vmovdqu 
xmm6,XMMWORD[((48-64))+rdx] + vpxor xmm9,xmm9,xmm14 + vmovdqu xmm15,XMMWORD[64+r8] + vpclmulqdq xmm5,xmm8,xmm7,0x10 + vmovdqu xmm7,XMMWORD[((80-64))+rdx] + + vpshufb xmm15,xmm15,xmm13 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm14,xmm6,0x00 + vpxor xmm4,xmm4,xmm1 + vpunpckhqdq xmm8,xmm15,xmm15 + vpclmulqdq xmm1,xmm14,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((64-64))+rdx] + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm9,xmm7,0x00 + vpxor xmm8,xmm8,xmm15 + + vmovdqu xmm14,XMMWORD[48+r8] + vpxor xmm0,xmm0,xmm3 + vpclmulqdq xmm3,xmm15,xmm6,0x00 + vpxor xmm1,xmm1,xmm4 + vpshufb xmm14,xmm14,xmm13 + vpclmulqdq xmm4,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((96-64))+rdx] + vpxor xmm2,xmm2,xmm5 + vpunpckhqdq xmm9,xmm14,xmm14 + vpclmulqdq xmm5,xmm8,xmm7,0x10 + vmovdqu xmm7,XMMWORD[((128-64))+rdx] + vpxor xmm9,xmm9,xmm14 + + vmovdqu xmm15,XMMWORD[32+r8] + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm14,xmm6,0x00 + vpxor xmm4,xmm4,xmm1 + vpshufb xmm15,xmm15,xmm13 + vpclmulqdq xmm1,xmm14,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((112-64))+rdx] + vpxor xmm5,xmm5,xmm2 + vpunpckhqdq xmm8,xmm15,xmm15 + vpclmulqdq xmm2,xmm9,xmm7,0x00 + vpxor xmm8,xmm8,xmm15 + + vmovdqu xmm14,XMMWORD[16+r8] + vpxor xmm0,xmm0,xmm3 + vpclmulqdq xmm3,xmm15,xmm6,0x00 + vpxor xmm1,xmm1,xmm4 + vpshufb xmm14,xmm14,xmm13 + vpclmulqdq xmm4,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((144-64))+rdx] + vpxor xmm2,xmm2,xmm5 + vpunpckhqdq xmm9,xmm14,xmm14 + vpclmulqdq xmm5,xmm8,xmm7,0x10 + vmovdqu xmm7,XMMWORD[((176-64))+rdx] + vpxor xmm9,xmm9,xmm14 + + vmovdqu xmm15,XMMWORD[r8] + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm14,xmm6,0x00 + vpxor xmm4,xmm4,xmm1 + vpshufb xmm15,xmm15,xmm13 + vpclmulqdq xmm1,xmm14,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((160-64))+rdx] + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm9,xmm7,0x10 + + lea r8,[128+r8] + cmp r9,0x80 + jb NEAR $L$tail_avx + + vpxor xmm15,xmm15,xmm10 + sub r9,0x80 + jmp NEAR $L$oop8x_avx + +ALIGN 32 +$L$oop8x_avx: + vpunpckhqdq xmm8,xmm15,xmm15 + vmovdqu xmm14,XMMWORD[112+r8] + vpxor 
xmm3,xmm3,xmm0 + vpxor xmm8,xmm8,xmm15 + vpclmulqdq xmm10,xmm15,xmm6,0x00 + vpshufb xmm14,xmm14,xmm13 + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm11,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((0-64))+rdx] + vpunpckhqdq xmm9,xmm14,xmm14 + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm12,xmm8,xmm7,0x00 + vmovdqu xmm7,XMMWORD[((32-64))+rdx] + vpxor xmm9,xmm9,xmm14 + + vmovdqu xmm15,XMMWORD[96+r8] + vpclmulqdq xmm0,xmm14,xmm6,0x00 + vpxor xmm10,xmm10,xmm3 + vpshufb xmm15,xmm15,xmm13 + vpclmulqdq xmm1,xmm14,xmm6,0x11 + vxorps xmm11,xmm11,xmm4 + vmovdqu xmm6,XMMWORD[((16-64))+rdx] + vpunpckhqdq xmm8,xmm15,xmm15 + vpclmulqdq xmm2,xmm9,xmm7,0x00 + vpxor xmm12,xmm12,xmm5 + vxorps xmm8,xmm8,xmm15 + + vmovdqu xmm14,XMMWORD[80+r8] + vpxor xmm12,xmm12,xmm10 + vpclmulqdq xmm3,xmm15,xmm6,0x00 + vpxor xmm12,xmm12,xmm11 + vpslldq xmm9,xmm12,8 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm4,xmm15,xmm6,0x11 + vpsrldq xmm12,xmm12,8 + vpxor xmm10,xmm10,xmm9 + vmovdqu xmm6,XMMWORD[((48-64))+rdx] + vpshufb xmm14,xmm14,xmm13 + vxorps xmm11,xmm11,xmm12 + vpxor xmm4,xmm4,xmm1 + vpunpckhqdq xmm9,xmm14,xmm14 + vpclmulqdq xmm5,xmm8,xmm7,0x10 + vmovdqu xmm7,XMMWORD[((80-64))+rdx] + vpxor xmm9,xmm9,xmm14 + vpxor xmm5,xmm5,xmm2 + + vmovdqu xmm15,XMMWORD[64+r8] + vpalignr xmm12,xmm10,xmm10,8 + vpclmulqdq xmm0,xmm14,xmm6,0x00 + vpshufb xmm15,xmm15,xmm13 + vpxor xmm0,xmm0,xmm3 + vpclmulqdq xmm1,xmm14,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((64-64))+rdx] + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm2,xmm9,xmm7,0x00 + vxorps xmm8,xmm8,xmm15 + vpxor xmm2,xmm2,xmm5 + + vmovdqu xmm14,XMMWORD[48+r8] + vpclmulqdq xmm10,xmm10,XMMWORD[r10],0x10 + vpclmulqdq xmm3,xmm15,xmm6,0x00 + vpshufb xmm14,xmm14,xmm13 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm4,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((96-64))+rdx] + vpunpckhqdq xmm9,xmm14,xmm14 + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm5,xmm8,xmm7,0x10 + vmovdqu xmm7,XMMWORD[((128-64))+rdx] + vpxor xmm9,xmm9,xmm14 + vpxor xmm5,xmm5,xmm2 + + vmovdqu xmm15,XMMWORD[32+r8] + vpclmulqdq 
xmm0,xmm14,xmm6,0x00 + vpshufb xmm15,xmm15,xmm13 + vpxor xmm0,xmm0,xmm3 + vpclmulqdq xmm1,xmm14,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((112-64))+rdx] + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm2,xmm9,xmm7,0x00 + vpxor xmm8,xmm8,xmm15 + vpxor xmm2,xmm2,xmm5 + vxorps xmm10,xmm10,xmm12 + + vmovdqu xmm14,XMMWORD[16+r8] + vpalignr xmm12,xmm10,xmm10,8 + vpclmulqdq xmm3,xmm15,xmm6,0x00 + vpshufb xmm14,xmm14,xmm13 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm4,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((144-64))+rdx] + vpclmulqdq xmm10,xmm10,XMMWORD[r10],0x10 + vxorps xmm12,xmm12,xmm11 + vpunpckhqdq xmm9,xmm14,xmm14 + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm5,xmm8,xmm7,0x10 + vmovdqu xmm7,XMMWORD[((176-64))+rdx] + vpxor xmm9,xmm9,xmm14 + vpxor xmm5,xmm5,xmm2 + + vmovdqu xmm15,XMMWORD[r8] + vpclmulqdq xmm0,xmm14,xmm6,0x00 + vpshufb xmm15,xmm15,xmm13 + vpclmulqdq xmm1,xmm14,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((160-64))+rdx] + vpxor xmm15,xmm15,xmm12 + vpclmulqdq xmm2,xmm9,xmm7,0x10 + vpxor xmm15,xmm15,xmm10 + + lea r8,[128+r8] + sub r9,0x80 + jnc NEAR $L$oop8x_avx + + add r9,0x80 + jmp NEAR $L$tail_no_xor_avx + +ALIGN 32 +$L$short_avx: + vmovdqu xmm14,XMMWORD[((-16))+r9*1+r8] + lea r8,[r9*1+r8] + vmovdqu xmm6,XMMWORD[((0-64))+rdx] + vmovdqu xmm7,XMMWORD[((32-64))+rdx] + vpshufb xmm15,xmm14,xmm13 + + vmovdqa xmm3,xmm0 + vmovdqa xmm4,xmm1 + vmovdqa xmm5,xmm2 + sub r9,0x10 + jz NEAR $L$tail_avx + + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm15,xmm6,0x00 + vpxor xmm8,xmm8,xmm15 + vmovdqu xmm14,XMMWORD[((-32))+r8] + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm1,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((16-64))+rdx] + vpshufb xmm15,xmm14,xmm13 + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm8,xmm7,0x00 + vpsrldq xmm7,xmm7,8 + sub r9,0x10 + jz NEAR $L$tail_avx + + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm15,xmm6,0x00 + vpxor xmm8,xmm8,xmm15 + vmovdqu xmm14,XMMWORD[((-48))+r8] + vpxor xmm4,xmm4,xmm1 + vpclmulqdq 
xmm1,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((48-64))+rdx] + vpshufb xmm15,xmm14,xmm13 + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm8,xmm7,0x00 + vmovdqu xmm7,XMMWORD[((80-64))+rdx] + sub r9,0x10 + jz NEAR $L$tail_avx + + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm15,xmm6,0x00 + vpxor xmm8,xmm8,xmm15 + vmovdqu xmm14,XMMWORD[((-64))+r8] + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm1,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((64-64))+rdx] + vpshufb xmm15,xmm14,xmm13 + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm8,xmm7,0x00 + vpsrldq xmm7,xmm7,8 + sub r9,0x10 + jz NEAR $L$tail_avx + + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm15,xmm6,0x00 + vpxor xmm8,xmm8,xmm15 + vmovdqu xmm14,XMMWORD[((-80))+r8] + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm1,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((96-64))+rdx] + vpshufb xmm15,xmm14,xmm13 + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm8,xmm7,0x00 + vmovdqu xmm7,XMMWORD[((128-64))+rdx] + sub r9,0x10 + jz NEAR $L$tail_avx + + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm15,xmm6,0x00 + vpxor xmm8,xmm8,xmm15 + vmovdqu xmm14,XMMWORD[((-96))+r8] + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm1,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((112-64))+rdx] + vpshufb xmm15,xmm14,xmm13 + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm8,xmm7,0x00 + vpsrldq xmm7,xmm7,8 + sub r9,0x10 + jz NEAR $L$tail_avx + + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm15,xmm6,0x00 + vpxor xmm8,xmm8,xmm15 + vmovdqu xmm14,XMMWORD[((-112))+r8] + vpxor xmm4,xmm4,xmm1 + vpclmulqdq xmm1,xmm15,xmm6,0x11 + vmovdqu xmm6,XMMWORD[((144-64))+rdx] + vpshufb xmm15,xmm14,xmm13 + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm8,xmm7,0x00 + vmovq xmm7,QWORD[((184-64))+rdx] + sub r9,0x10 + jmp NEAR $L$tail_avx + +ALIGN 32 +$L$tail_avx: + vpxor xmm15,xmm15,xmm10 +$L$tail_no_xor_avx: + vpunpckhqdq xmm8,xmm15,xmm15 + vpxor xmm3,xmm3,xmm0 + vpclmulqdq xmm0,xmm15,xmm6,0x00 + vpxor xmm8,xmm8,xmm15 + vpxor 
xmm4,xmm4,xmm1 + vpclmulqdq xmm1,xmm15,xmm6,0x11 + vpxor xmm5,xmm5,xmm2 + vpclmulqdq xmm2,xmm8,xmm7,0x00 + + vmovdqu xmm12,XMMWORD[r10] + + vpxor xmm10,xmm3,xmm0 + vpxor xmm11,xmm4,xmm1 + vpxor xmm5,xmm5,xmm2 + + vpxor xmm5,xmm5,xmm10 + vpxor xmm5,xmm5,xmm11 + vpslldq xmm9,xmm5,8 + vpsrldq xmm5,xmm5,8 + vpxor xmm10,xmm10,xmm9 + vpxor xmm11,xmm11,xmm5 + + vpclmulqdq xmm9,xmm10,xmm12,0x10 + vpalignr xmm10,xmm10,xmm10,8 + vpxor xmm10,xmm10,xmm9 + + vpclmulqdq xmm9,xmm10,xmm12,0x10 + vpalignr xmm10,xmm10,xmm10,8 + vpxor xmm10,xmm10,xmm11 + vpxor xmm10,xmm10,xmm9 + + cmp r9,0 + jne NEAR $L$short_avx + + vpshufb xmm10,xmm10,xmm13 + vmovdqu XMMWORD[rcx],xmm10 + vzeroupper + movaps xmm6,XMMWORD[rsp] + movaps xmm7,XMMWORD[16+rsp] + movaps xmm8,XMMWORD[32+rsp] + movaps xmm9,XMMWORD[48+rsp] + movaps xmm10,XMMWORD[64+rsp] + movaps xmm11,XMMWORD[80+rsp] + movaps xmm12,XMMWORD[96+rsp] + movaps xmm13,XMMWORD[112+rsp] + movaps xmm14,XMMWORD[128+rsp] + movaps xmm15,XMMWORD[144+rsp] + lea rsp,[168+rsp] + ret + +$L$SEH_end_gcm_ghash_avx_14: + +section .rdata rdata align=8 +ALIGN 64 +$L$bswap_mask: + DB 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +$L$0x1c2_polynomial: + DB 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 +$L$7_mask: + DD 7,0,7,0 +ALIGN 64 + + DB 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52 + DB 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32 + DB 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 + DB 114,103,62,0 +ALIGN 64 +section .text + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_gcm_init_clmul_1 wrt ..imagebase + DD $L$SEH_end_gcm_init_clmul_5 wrt ..imagebase + DD $L$SEH_info_gcm_init_clmul_0 wrt ..imagebase + + DD $L$SEH_begin_gcm_ghash_clmul_1 wrt ..imagebase + DD $L$SEH_end_gcm_ghash_clmul_14 wrt ..imagebase + DD $L$SEH_info_gcm_ghash_clmul_0 wrt ..imagebase + + DD $L$SEH_begin_gcm_init_avx_1 wrt ..imagebase + DD $L$SEH_end_gcm_init_avx_5 wrt ..imagebase + DD $L$SEH_info_gcm_init_avx_0 wrt ..imagebase + + DD $L$SEH_begin_gcm_ghash_avx_1 wrt 
..imagebase + DD $L$SEH_end_gcm_ghash_avx_14 wrt ..imagebase + DD $L$SEH_info_gcm_ghash_avx_0 wrt ..imagebase + + +section .xdata rdata align=8 +ALIGN 4 +$L$SEH_info_gcm_init_clmul_0: + DB 1 + DB $L$SEH_endprologue_gcm_init_clmul_4-$L$SEH_begin_gcm_init_clmul_1 + DB 3 + DB 0 + DB $L$SEH_prologue_gcm_init_clmul_3-$L$SEH_begin_gcm_init_clmul_1 + DB 104 + DW 0 + DB $L$SEH_prologue_gcm_init_clmul_2-$L$SEH_begin_gcm_init_clmul_1 + DB 34 + + DW 0 +$L$SEH_info_gcm_ghash_clmul_0: + DB 1 + DB $L$SEH_endprologue_gcm_ghash_clmul_13-$L$SEH_begin_gcm_ghash_clmul_1 + DB 22 + DB 0 + DB $L$SEH_prologue_gcm_ghash_clmul_12-$L$SEH_begin_gcm_ghash_clmul_1 + DB 248 + DW 9 + DB $L$SEH_prologue_gcm_ghash_clmul_11-$L$SEH_begin_gcm_ghash_clmul_1 + DB 232 + DW 8 + DB $L$SEH_prologue_gcm_ghash_clmul_10-$L$SEH_begin_gcm_ghash_clmul_1 + DB 216 + DW 7 + DB $L$SEH_prologue_gcm_ghash_clmul_9-$L$SEH_begin_gcm_ghash_clmul_1 + DB 200 + DW 6 + DB $L$SEH_prologue_gcm_ghash_clmul_8-$L$SEH_begin_gcm_ghash_clmul_1 + DB 184 + DW 5 + DB $L$SEH_prologue_gcm_ghash_clmul_7-$L$SEH_begin_gcm_ghash_clmul_1 + DB 168 + DW 4 + DB $L$SEH_prologue_gcm_ghash_clmul_6-$L$SEH_begin_gcm_ghash_clmul_1 + DB 152 + DW 3 + DB $L$SEH_prologue_gcm_ghash_clmul_5-$L$SEH_begin_gcm_ghash_clmul_1 + DB 136 + DW 2 + DB $L$SEH_prologue_gcm_ghash_clmul_4-$L$SEH_begin_gcm_ghash_clmul_1 + DB 120 + DW 1 + DB $L$SEH_prologue_gcm_ghash_clmul_3-$L$SEH_begin_gcm_ghash_clmul_1 + DB 104 + DW 0 + DB $L$SEH_prologue_gcm_ghash_clmul_2-$L$SEH_begin_gcm_ghash_clmul_1 + DB 1 + DW 21 + +$L$SEH_info_gcm_init_avx_0: + DB 1 + DB $L$SEH_endprologue_gcm_init_avx_4-$L$SEH_begin_gcm_init_avx_1 + DB 3 + DB 0 + DB $L$SEH_prologue_gcm_init_avx_3-$L$SEH_begin_gcm_init_avx_1 + DB 104 + DW 0 + DB $L$SEH_prologue_gcm_init_avx_2-$L$SEH_begin_gcm_init_avx_1 + DB 34 + + DW 0 +$L$SEH_info_gcm_ghash_avx_0: + DB 1 + DB $L$SEH_endprologue_gcm_ghash_avx_13-$L$SEH_begin_gcm_ghash_avx_1 + DB 22 + DB 0 + DB $L$SEH_prologue_gcm_ghash_avx_12-$L$SEH_begin_gcm_ghash_avx_1 + DB 248 
+ DW 9 + DB $L$SEH_prologue_gcm_ghash_avx_11-$L$SEH_begin_gcm_ghash_avx_1 + DB 232 + DW 8 + DB $L$SEH_prologue_gcm_ghash_avx_10-$L$SEH_begin_gcm_ghash_avx_1 + DB 216 + DW 7 + DB $L$SEH_prologue_gcm_ghash_avx_9-$L$SEH_begin_gcm_ghash_avx_1 + DB 200 + DW 6 + DB $L$SEH_prologue_gcm_ghash_avx_8-$L$SEH_begin_gcm_ghash_avx_1 + DB 184 + DW 5 + DB $L$SEH_prologue_gcm_ghash_avx_7-$L$SEH_begin_gcm_ghash_avx_1 + DB 168 + DW 4 + DB $L$SEH_prologue_gcm_ghash_avx_6-$L$SEH_begin_gcm_ghash_avx_1 + DB 152 + DW 3 + DB $L$SEH_prologue_gcm_ghash_avx_5-$L$SEH_begin_gcm_ghash_avx_1 + DB 136 + DW 2 + DB $L$SEH_prologue_gcm_ghash_avx_4-$L$SEH_begin_gcm_ghash_avx_1 + DB 120 + DW 1 + DB $L$SEH_prologue_gcm_ghash_avx_3-$L$SEH_begin_gcm_ghash_avx_1 + DB 104 + DW 0 + DB $L$SEH_prologue_gcm_ghash_avx_2-$L$SEH_begin_gcm_ghash_avx_1 + DB 1 + DW 21 +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/third_party/boringssl/gen/bcm/ghashv8-armv7-linux.S b/third_party/boringssl/gen/bcm/ghashv8-armv7-linux.S new file mode 100644 index 00000000..6c4b2a9e --- /dev/null +++ b/third_party/boringssl/gen/bcm/ghashv8-armv7-linux.S @@ -0,0 +1,244 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__) +#if __ARM_MAX_ARCH__>=7 +.text +.fpu neon +.code 32 +#undef __thumb2__ +.globl gcm_init_v8 +.hidden gcm_init_v8 +.type gcm_init_v8,%function +.align 4 +gcm_init_v8: + AARCH64_VALID_CALL_TARGET + vld1.64 {q9},[r1] @ load input H + vmov.i8 q11,#0xe1 + vshl.i64 q11,q11,#57 @ 0xc2.0 + vext.8 q3,q9,q9,#8 + vshr.u64 q10,q11,#63 + vdup.32 q9,d18[1] + vext.8 q8,q10,q11,#8 @ t0=0xc2....01 + vshr.u64 q10,q3,#63 + vshr.s32 q9,q9,#31 @ broadcast carry bit + vand q10,q10,q8 + vshl.i64 q3,q3,#1 + vext.8 q10,q10,q10,#8 + vand q8,q8,q9 + vorr q3,q3,q10 @ H<<<=1 + veor q12,q3,q8 @ twisted H + vst1.64 {q12},[r0]! 
@ store Htable[0] + + @ calculate H^2 + vext.8 q8,q12,q12,#8 @ Karatsuba pre-processing +.byte 0xa8,0x0e,0xa8,0xf2 @ pmull q0,q12,q12 + veor q8,q8,q12 +.byte 0xa9,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q12 +.byte 0xa0,0x2e,0xa0,0xf2 @ pmull q1,q8,q8 + + vext.8 q9,q0,q2,#8 @ Karatsuba post-processing + veor q10,q0,q2 + veor q1,q1,q9 + veor q1,q1,q10 +.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase + + vmov d4,d3 @ Xh|Xm - 256-bit result + vmov d3,d0 @ Xm is rotated Xl + veor q0,q1,q10 + + vext.8 q10,q0,q0,#8 @ 2nd phase +.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 + veor q10,q10,q2 + veor q14,q0,q10 + + vext.8 q9,q14,q14,#8 @ Karatsuba pre-processing + veor q9,q9,q14 + vext.8 q13,q8,q9,#8 @ pack Karatsuba pre-processed + vst1.64 {q13,q14},[r0]! @ store Htable[1..2] + bx lr +.size gcm_init_v8,.-gcm_init_v8 +.globl gcm_gmult_v8 +.hidden gcm_gmult_v8 +.type gcm_gmult_v8,%function +.align 4 +gcm_gmult_v8: + AARCH64_VALID_CALL_TARGET + vld1.64 {q9},[r0] @ load Xi + vmov.i8 q11,#0xe1 + vld1.64 {q12,q13},[r1] @ load twisted H, ... 
+ vshl.u64 q11,q11,#57 +#ifndef __ARMEB__ + vrev64.8 q9,q9 +#endif + vext.8 q3,q9,q9,#8 + +.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo + veor q9,q9,q3 @ Karatsuba pre-processing +.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi +.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) + + vext.8 q9,q0,q2,#8 @ Karatsuba post-processing + veor q10,q0,q2 + veor q1,q1,q9 + veor q1,q1,q10 +.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction + + vmov d4,d3 @ Xh|Xm - 256-bit result + vmov d3,d0 @ Xm is rotated Xl + veor q0,q1,q10 + + vext.8 q10,q0,q0,#8 @ 2nd phase of reduction +.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 + veor q10,q10,q2 + veor q0,q0,q10 + +#ifndef __ARMEB__ + vrev64.8 q0,q0 +#endif + vext.8 q0,q0,q0,#8 + vst1.64 {q0},[r0] @ write out Xi + + bx lr +.size gcm_gmult_v8,.-gcm_gmult_v8 +.globl gcm_ghash_v8 +.hidden gcm_ghash_v8 +.type gcm_ghash_v8,%function +.align 4 +gcm_ghash_v8: + AARCH64_VALID_CALL_TARGET + vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ 32-bit ABI says so + vld1.64 {q0},[r0] @ load [rotated] Xi + @ "[rotated]" means that + @ loaded value would have + @ to be rotated in order to + @ make it appear as in + @ algorithm specification + subs r3,r3,#32 @ see if r3 is 32 or larger + mov r12,#16 @ r12 is used as post- + @ increment for input pointer; + @ as loop is modulo-scheduled + @ r12 is zeroed just in time + @ to preclude overstepping + @ inp[len], which means that + @ last block[s] are actually + @ loaded twice, but last + @ copy is not processed + vld1.64 {q12,q13},[r1]! @ load twisted H, ..., H^2 + vmov.i8 q11,#0xe1 + vld1.64 {q14},[r1] + moveq r12,#0 @ is it time to zero r12? + vext.8 q0,q0,q0,#8 @ rotate Xi + vld1.64 {q8},[r2]! 
@ load [rotated] I[0] + vshl.u64 q11,q11,#57 @ compose 0xc2.0 constant +#ifndef __ARMEB__ + vrev64.8 q8,q8 + vrev64.8 q0,q0 +#endif + vext.8 q3,q8,q8,#8 @ rotate I[0] + blo .Lodd_tail_v8 @ r3 was less than 32 + vld1.64 {q9},[r2],r12 @ load [rotated] I[1] +#ifndef __ARMEB__ + vrev64.8 q9,q9 +#endif + vext.8 q7,q9,q9,#8 + veor q3,q3,q0 @ I[i]^=Xi +.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1 + veor q9,q9,q7 @ Karatsuba pre-processing +.byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7 + b .Loop_mod2x_v8 + +.align 4 +.Loop_mod2x_v8: + vext.8 q10,q3,q3,#8 + subs r3,r3,#32 @ is there more data? +.byte 0x86,0x0e,0xac,0xf2 @ pmull q0,q14,q3 @ H^2.lo·Xi.lo + movlo r12,#0 @ is it time to zero r12? + +.byte 0xa2,0xae,0xaa,0xf2 @ pmull q5,q13,q9 + veor q10,q10,q3 @ Karatsuba pre-processing +.byte 0x87,0x4e,0xad,0xf2 @ pmull2 q2,q14,q3 @ H^2.hi·Xi.hi + veor q0,q0,q4 @ accumulate +.byte 0xa5,0x2e,0xab,0xf2 @ pmull2 q1,q13,q10 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi) + vld1.64 {q8},[r2],r12 @ load [rotated] I[i+2] + + veor q2,q2,q6 + moveq r12,#0 @ is it time to zero r12? 
+ veor q1,q1,q5 + + vext.8 q9,q0,q2,#8 @ Karatsuba post-processing + veor q10,q0,q2 + veor q1,q1,q9 + vld1.64 {q9},[r2],r12 @ load [rotated] I[i+3] +#ifndef __ARMEB__ + vrev64.8 q8,q8 +#endif + veor q1,q1,q10 +.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction + +#ifndef __ARMEB__ + vrev64.8 q9,q9 +#endif + vmov d4,d3 @ Xh|Xm - 256-bit result + vmov d3,d0 @ Xm is rotated Xl + vext.8 q7,q9,q9,#8 + vext.8 q3,q8,q8,#8 + veor q0,q1,q10 +.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1 + veor q3,q3,q2 @ accumulate q3 early + + vext.8 q10,q0,q0,#8 @ 2nd phase of reduction +.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 + veor q3,q3,q10 + veor q9,q9,q7 @ Karatsuba pre-processing + veor q3,q3,q0 +.byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7 + bhs .Loop_mod2x_v8 @ there was at least 32 more bytes + + veor q2,q2,q10 + vext.8 q3,q8,q8,#8 @ re-construct q3 + adds r3,r3,#32 @ re-construct r3 + veor q0,q0,q2 @ re-construct q0 + beq .Ldone_v8 @ is r3 zero? +.Lodd_tail_v8: + vext.8 q10,q0,q0,#8 + veor q3,q3,q0 @ inp^=Xi + veor q9,q8,q10 @ q9 is rotated inp^Xi + +.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo + veor q9,q9,q3 @ Karatsuba pre-processing +.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi +.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) + + vext.8 q9,q0,q2,#8 @ Karatsuba post-processing + veor q10,q0,q2 + veor q1,q1,q9 + veor q1,q1,q10 +.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction + + vmov d4,d3 @ Xh|Xm - 256-bit result + vmov d3,d0 @ Xm is rotated Xl + veor q0,q1,q10 + + vext.8 q10,q0,q0,#8 @ 2nd phase of reduction +.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 + veor q10,q10,q2 + veor q0,q0,q10 + +.Ldone_v8: +#ifndef __ARMEB__ + vrev64.8 q0,q0 +#endif + vext.8 q0,q0,q0,#8 + vst1.64 {q0},[r0] @ write out Xi + + vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ 32-bit ABI says so + bx lr +.size gcm_ghash_v8,.-gcm_ghash_v8 +.byte 
71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) diff --git a/third_party/boringssl/gen/bcm/ghashv8-armv8-apple.S b/third_party/boringssl/gen/bcm/ghashv8-armv8-apple.S new file mode 100644 index 00000000..ee458be4 --- /dev/null +++ b/third_party/boringssl/gen/bcm/ghashv8-armv8-apple.S @@ -0,0 +1,564 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) +#if __ARM_MAX_ARCH__>=7 +.text +.arch_extension crypto + +.globl _gcm_init_v8 +.private_extern _gcm_init_v8 + +.align 4 +_gcm_init_v8: + AARCH64_VALID_CALL_TARGET + ld1 {v17.2d},[x1] //load input H + movi v19.16b,#0xe1 + shl v19.2d,v19.2d,#57 //0xc2.0 + ext v3.16b,v17.16b,v17.16b,#8 + ushr v18.2d,v19.2d,#63 + dup v17.4s,v17.s[1] + ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01 + ushr v18.2d,v3.2d,#63 + sshr v17.4s,v17.4s,#31 //broadcast carry bit + and v18.16b,v18.16b,v16.16b + shl v3.2d,v3.2d,#1 + ext v18.16b,v18.16b,v18.16b,#8 + and v16.16b,v16.16b,v17.16b + orr v3.16b,v3.16b,v18.16b //H<<<=1 + eor v20.16b,v3.16b,v16.16b //twisted H + st1 {v20.2d},[x0],#16 //store Htable[0] + + //calculate H^2 + ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing + pmull v0.1q,v20.1d,v20.1d + eor v16.16b,v16.16b,v20.16b + pmull2 v2.1q,v20.2d,v20.2d + pmull v1.1q,v16.1d,v16.1d + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase + + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor 
v22.16b,v0.16b,v18.16b + + ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing + eor v17.16b,v17.16b,v22.16b + ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed + st1 {v21.2d,v22.2d},[x0],#32 //store Htable[1..2] + //calculate H^3 and H^4 + pmull v0.1q,v20.1d, v22.1d + pmull v5.1q,v22.1d,v22.1d + pmull2 v2.1q,v20.2d, v22.2d + pmull2 v7.1q,v22.2d,v22.2d + pmull v1.1q,v16.1d,v17.1d + pmull v6.1q,v17.1d,v17.1d + + ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + ext v17.16b,v5.16b,v7.16b,#8 + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v16.16b + eor v4.16b,v5.16b,v7.16b + eor v6.16b,v6.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase + eor v6.16b,v6.16b,v4.16b + pmull v4.1q,v5.1d,v19.1d + + ins v2.d[0],v1.d[1] + ins v7.d[0],v6.d[1] + ins v1.d[1],v0.d[0] + ins v6.d[1],v5.d[0] + eor v0.16b,v1.16b,v18.16b + eor v5.16b,v6.16b,v4.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase + ext v4.16b,v5.16b,v5.16b,#8 + pmull v0.1q,v0.1d,v19.1d + pmull v5.1q,v5.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v4.16b,v4.16b,v7.16b + eor v20.16b, v0.16b,v18.16b //H^3 + eor v22.16b,v5.16b,v4.16b //H^4 + + ext v16.16b,v20.16b, v20.16b,#8 //Karatsuba pre-processing + ext v17.16b,v22.16b,v22.16b,#8 + eor v16.16b,v16.16b,v20.16b + eor v17.16b,v17.16b,v22.16b + ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed + st1 {v20.2d,v21.2d,v22.2d},[x0] //store Htable[3..5] + ret + +.globl _gcm_gmult_v8 +.private_extern _gcm_gmult_v8 + +.align 4 +_gcm_gmult_v8: + AARCH64_VALID_CALL_TARGET + ld1 {v17.2d},[x0] //load Xi + movi v19.16b,#0xe1 + ld1 {v20.2d,v21.2d},[x1] //load twisted H, ... 
+ shl v19.2d,v19.2d,#57 +#ifndef __AARCH64EB__ + rev64 v17.16b,v17.16b +#endif + ext v3.16b,v17.16b,v17.16b,#8 + + pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo + eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing + pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi + pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b + +#ifndef __AARCH64EB__ + rev64 v0.16b,v0.16b +#endif + ext v0.16b,v0.16b,v0.16b,#8 + st1 {v0.2d},[x0] //write out Xi + + ret + +.globl _gcm_ghash_v8 +.private_extern _gcm_ghash_v8 + +.align 4 +_gcm_ghash_v8: + AARCH64_VALID_CALL_TARGET + cmp x3,#64 + b.hs Lgcm_ghash_v8_4x + ld1 {v0.2d},[x0] //load [rotated] Xi + //"[rotated]" means that + //loaded value would have + //to be rotated in order to + //make it appear as in + //algorithm specification + subs x3,x3,#32 //see if x3 is 32 or larger + mov x12,#16 //x12 is used as post- + //increment for input pointer; + //as loop is modulo-scheduled + //x12 is zeroed just in time + //to preclude overstepping + //inp[len], which means that + //last block[s] are actually + //loaded twice, but last + //copy is not processed + ld1 {v20.2d,v21.2d},[x1],#32 //load twisted H, ..., H^2 + movi v19.16b,#0xe1 + ld1 {v22.2d},[x1] + csel x12,xzr,x12,eq //is it time to zero x12? 
+ ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi + ld1 {v16.2d},[x2],#16 //load [rotated] I[0] + shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant +#ifndef __AARCH64EB__ + rev64 v16.16b,v16.16b + rev64 v0.16b,v0.16b +#endif + ext v3.16b,v16.16b,v16.16b,#8 //rotate I[0] + b.lo Lodd_tail_v8 //x3 was less than 32 + ld1 {v17.2d},[x2],x12 //load [rotated] I[1] +#ifndef __AARCH64EB__ + rev64 v17.16b,v17.16b +#endif + ext v7.16b,v17.16b,v17.16b,#8 + eor v3.16b,v3.16b,v0.16b //I[i]^=Xi + pmull v4.1q,v20.1d,v7.1d //H·Ii+1 + eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing + pmull2 v6.1q,v20.2d,v7.2d + b Loop_mod2x_v8 + +.align 4 +Loop_mod2x_v8: + ext v18.16b,v3.16b,v3.16b,#8 + subs x3,x3,#32 //is there more data? + pmull v0.1q,v22.1d,v3.1d //H^2.lo·Xi.lo + csel x12,xzr,x12,lo //is it time to zero x12? + + pmull v5.1q,v21.1d,v17.1d + eor v18.16b,v18.16b,v3.16b //Karatsuba pre-processing + pmull2 v2.1q,v22.2d,v3.2d //H^2.hi·Xi.hi + eor v0.16b,v0.16b,v4.16b //accumulate + pmull2 v1.1q,v21.2d,v18.2d //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi) + ld1 {v16.2d},[x2],x12 //load [rotated] I[i+2] + + eor v2.16b,v2.16b,v6.16b + csel x12,xzr,x12,eq //is it time to zero x12? 
+ eor v1.16b,v1.16b,v5.16b + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + ld1 {v17.2d},[x2],x12 //load [rotated] I[i+3] +#ifndef __AARCH64EB__ + rev64 v16.16b,v16.16b +#endif + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + +#ifndef __AARCH64EB__ + rev64 v17.16b,v17.16b +#endif + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + ext v7.16b,v17.16b,v17.16b,#8 + ext v3.16b,v16.16b,v16.16b,#8 + eor v0.16b,v1.16b,v18.16b + pmull v4.1q,v20.1d,v7.1d //H·Ii+1 + eor v3.16b,v3.16b,v2.16b //accumulate v3.16b early + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v3.16b,v3.16b,v18.16b + eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing + eor v3.16b,v3.16b,v0.16b + pmull2 v6.1q,v20.2d,v7.2d + b.hs Loop_mod2x_v8 //there was at least 32 more bytes + + eor v2.16b,v2.16b,v18.16b + ext v3.16b,v16.16b,v16.16b,#8 //re-construct v3.16b + adds x3,x3,#32 //re-construct x3 + eor v0.16b,v0.16b,v2.16b //re-construct v0.16b + b.eq Ldone_v8 //is x3 zero? 
+Lodd_tail_v8: + ext v18.16b,v0.16b,v0.16b,#8 + eor v3.16b,v3.16b,v0.16b //inp^=Xi + eor v17.16b,v16.16b,v18.16b //v17.16b is rotated inp^Xi + + pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo + eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing + pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi + pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b + +Ldone_v8: +#ifndef __AARCH64EB__ + rev64 v0.16b,v0.16b +#endif + ext v0.16b,v0.16b,v0.16b,#8 + st1 {v0.2d},[x0] //write out Xi + + ret + + +.align 4 +gcm_ghash_v8_4x: +Lgcm_ghash_v8_4x: + ld1 {v0.2d},[x0] //load [rotated] Xi + ld1 {v20.2d,v21.2d,v22.2d},[x1],#48 //load twisted H, ..., H^2 + movi v19.16b,#0xe1 + ld1 {v26.2d,v27.2d,v28.2d},[x1] //load twisted H^3, ..., H^4 + shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant + + ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64 +#ifndef __AARCH64EB__ + rev64 v0.16b,v0.16b + rev64 v5.16b,v5.16b + rev64 v6.16b,v6.16b + rev64 v7.16b,v7.16b + rev64 v4.16b,v4.16b +#endif + ext v25.16b,v7.16b,v7.16b,#8 + ext v24.16b,v6.16b,v6.16b,#8 + ext v23.16b,v5.16b,v5.16b,#8 + + pmull v29.1q,v20.1d,v25.1d //H·Ii+3 + eor v7.16b,v7.16b,v25.16b + pmull2 v31.1q,v20.2d,v25.2d + pmull v30.1q,v21.1d,v7.1d + + pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2 + eor v6.16b,v6.16b,v24.16b + pmull2 v24.1q,v22.2d,v24.2d + pmull2 v6.1q,v21.2d,v6.2d + + eor v29.16b,v29.16b,v16.16b + eor v31.16b,v31.16b,v24.16b + eor v30.16b,v30.16b,v6.16b + + pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1 + eor v5.16b,v5.16b,v23.16b + pmull2 v23.1q,v26.2d,v23.2d + pmull v5.1q,v27.1d,v5.1d + + eor v29.16b,v29.16b,v7.16b + eor v31.16b,v31.16b,v23.16b + eor 
v30.16b,v30.16b,v5.16b + + subs x3,x3,#128 + b.lo Ltail4x + + b Loop4x + +.align 4 +Loop4x: + eor v16.16b,v4.16b,v0.16b + ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64 + ext v3.16b,v16.16b,v16.16b,#8 +#ifndef __AARCH64EB__ + rev64 v5.16b,v5.16b + rev64 v6.16b,v6.16b + rev64 v7.16b,v7.16b + rev64 v4.16b,v4.16b +#endif + + pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii) + eor v16.16b,v16.16b,v3.16b + pmull2 v2.1q,v28.2d,v3.2d + ext v25.16b,v7.16b,v7.16b,#8 + pmull2 v1.1q,v27.2d,v16.2d + + eor v0.16b,v0.16b,v29.16b + eor v2.16b,v2.16b,v31.16b + ext v24.16b,v6.16b,v6.16b,#8 + eor v1.16b,v1.16b,v30.16b + ext v23.16b,v5.16b,v5.16b,#8 + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + pmull v29.1q,v20.1d,v25.1d //H·Ii+3 + eor v7.16b,v7.16b,v25.16b + eor v1.16b,v1.16b,v17.16b + pmull2 v31.1q,v20.2d,v25.2d + eor v1.16b,v1.16b,v18.16b + pmull v30.1q,v21.1d,v7.1d + + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2 + eor v6.16b,v6.16b,v24.16b + pmull2 v24.1q,v22.2d,v24.2d + eor v0.16b,v1.16b,v18.16b + pmull2 v6.1q,v21.2d,v6.2d + + eor v29.16b,v29.16b,v16.16b + eor v31.16b,v31.16b,v24.16b + eor v30.16b,v30.16b,v6.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1 + eor v5.16b,v5.16b,v23.16b + eor v18.16b,v18.16b,v2.16b + pmull2 v23.1q,v26.2d,v23.2d + pmull v5.1q,v27.1d,v5.1d + + eor v0.16b,v0.16b,v18.16b + eor v29.16b,v29.16b,v7.16b + eor v31.16b,v31.16b,v23.16b + ext v0.16b,v0.16b,v0.16b,#8 + eor v30.16b,v30.16b,v5.16b + + subs x3,x3,#64 + b.hs Loop4x + +Ltail4x: + eor v16.16b,v4.16b,v0.16b + ext v3.16b,v16.16b,v16.16b,#8 + + pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii) + eor v16.16b,v16.16b,v3.16b + pmull2 v2.1q,v28.2d,v3.2d + pmull2 v1.1q,v27.2d,v16.2d + + eor v0.16b,v0.16b,v29.16b + eor v2.16b,v2.16b,v31.16b + eor v1.16b,v1.16b,v30.16b + + adds x3,x3,#64 + b.eq Ldone4x + + cmp x3,#32 + b.lo Lone 
+ b.eq Ltwo +Lthree: + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + ld1 {v4.2d,v5.2d,v6.2d},[x2] + eor v1.16b,v1.16b,v18.16b +#ifndef __AARCH64EB__ + rev64 v5.16b,v5.16b + rev64 v6.16b,v6.16b + rev64 v4.16b,v4.16b +#endif + + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + ext v24.16b,v6.16b,v6.16b,#8 + ext v23.16b,v5.16b,v5.16b,#8 + eor v0.16b,v1.16b,v18.16b + + pmull v29.1q,v20.1d,v24.1d //H·Ii+2 + eor v6.16b,v6.16b,v24.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + pmull2 v31.1q,v20.2d,v24.2d + pmull v30.1q,v21.1d,v6.1d + eor v0.16b,v0.16b,v18.16b + pmull v7.1q,v22.1d,v23.1d //H^2·Ii+1 + eor v5.16b,v5.16b,v23.16b + ext v0.16b,v0.16b,v0.16b,#8 + + pmull2 v23.1q,v22.2d,v23.2d + eor v16.16b,v4.16b,v0.16b + pmull2 v5.1q,v21.2d,v5.2d + ext v3.16b,v16.16b,v16.16b,#8 + + eor v29.16b,v29.16b,v7.16b + eor v31.16b,v31.16b,v23.16b + eor v30.16b,v30.16b,v5.16b + + pmull v0.1q,v26.1d,v3.1d //H^3·(Xi+Ii) + eor v16.16b,v16.16b,v3.16b + pmull2 v2.1q,v26.2d,v3.2d + pmull v1.1q,v27.1d,v16.1d + + eor v0.16b,v0.16b,v29.16b + eor v2.16b,v2.16b,v31.16b + eor v1.16b,v1.16b,v30.16b + b Ldone4x + +.align 4 +Ltwo: + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + ld1 {v4.2d,v5.2d},[x2] + eor v1.16b,v1.16b,v18.16b +#ifndef __AARCH64EB__ + rev64 v5.16b,v5.16b + rev64 v4.16b,v4.16b +#endif + + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + ext v23.16b,v5.16b,v5.16b,#8 + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b + ext v0.16b,v0.16b,v0.16b,#8 + + pmull v29.1q,v20.1d,v23.1d //H·Ii+1 + eor v5.16b,v5.16b,v23.16b + + eor v16.16b,v4.16b,v0.16b + ext 
v3.16b,v16.16b,v16.16b,#8 + + pmull2 v31.1q,v20.2d,v23.2d + pmull v30.1q,v21.1d,v5.1d + + pmull v0.1q,v22.1d,v3.1d //H^2·(Xi+Ii) + eor v16.16b,v16.16b,v3.16b + pmull2 v2.1q,v22.2d,v3.2d + pmull2 v1.1q,v21.2d,v16.2d + + eor v0.16b,v0.16b,v29.16b + eor v2.16b,v2.16b,v31.16b + eor v1.16b,v1.16b,v30.16b + b Ldone4x + +.align 4 +Lone: + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + ld1 {v4.2d},[x2] + eor v1.16b,v1.16b,v18.16b +#ifndef __AARCH64EB__ + rev64 v4.16b,v4.16b +#endif + + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b + ext v0.16b,v0.16b,v0.16b,#8 + + eor v16.16b,v4.16b,v0.16b + ext v3.16b,v16.16b,v16.16b,#8 + + pmull v0.1q,v20.1d,v3.1d + eor v16.16b,v16.16b,v3.16b + pmull2 v2.1q,v20.2d,v3.2d + pmull v1.1q,v21.1d,v16.1d + +Ldone4x: + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b + ext v0.16b,v0.16b,v0.16b,#8 + +#ifndef __AARCH64EB__ + rev64 v0.16b,v0.16b +#endif + st1 {v0.2d},[x0] //write out Xi + + ret + +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) diff --git a/third_party/boringssl/gen/bcm/ghashv8-armv8-linux.S b/third_party/boringssl/gen/bcm/ghashv8-armv8-linux.S new file mode 100644 
index 00000000..bb881761 --- /dev/null +++ b/third_party/boringssl/gen/bcm/ghashv8-armv8-linux.S @@ -0,0 +1,563 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) +#if __ARM_MAX_ARCH__>=7 +.text +.arch armv8-a+crypto +.globl gcm_init_v8 +.hidden gcm_init_v8 +.type gcm_init_v8,%function +.align 4 +gcm_init_v8: + AARCH64_VALID_CALL_TARGET + ld1 {v17.2d},[x1] //load input H + movi v19.16b,#0xe1 + shl v19.2d,v19.2d,#57 //0xc2.0 + ext v3.16b,v17.16b,v17.16b,#8 + ushr v18.2d,v19.2d,#63 + dup v17.4s,v17.s[1] + ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01 + ushr v18.2d,v3.2d,#63 + sshr v17.4s,v17.4s,#31 //broadcast carry bit + and v18.16b,v18.16b,v16.16b + shl v3.2d,v3.2d,#1 + ext v18.16b,v18.16b,v18.16b,#8 + and v16.16b,v16.16b,v17.16b + orr v3.16b,v3.16b,v18.16b //H<<<=1 + eor v20.16b,v3.16b,v16.16b //twisted H + st1 {v20.2d},[x0],#16 //store Htable[0] + + //calculate H^2 + ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing + pmull v0.1q,v20.1d,v20.1d + eor v16.16b,v16.16b,v20.16b + pmull2 v2.1q,v20.2d,v20.2d + pmull v1.1q,v16.1d,v16.1d + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase + + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v22.16b,v0.16b,v18.16b + + ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing + eor v17.16b,v17.16b,v22.16b + ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed + st1 {v21.2d,v22.2d},[x0],#32 //store Htable[1..2] + //calculate H^3 and H^4 + pmull v0.1q,v20.1d, v22.1d + pmull v5.1q,v22.1d,v22.1d + pmull2 v2.1q,v20.2d, v22.2d + pmull2 v7.1q,v22.2d,v22.2d + pmull v1.1q,v16.1d,v17.1d + pmull 
v6.1q,v17.1d,v17.1d + + ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + ext v17.16b,v5.16b,v7.16b,#8 + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v16.16b + eor v4.16b,v5.16b,v7.16b + eor v6.16b,v6.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase + eor v6.16b,v6.16b,v4.16b + pmull v4.1q,v5.1d,v19.1d + + ins v2.d[0],v1.d[1] + ins v7.d[0],v6.d[1] + ins v1.d[1],v0.d[0] + ins v6.d[1],v5.d[0] + eor v0.16b,v1.16b,v18.16b + eor v5.16b,v6.16b,v4.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase + ext v4.16b,v5.16b,v5.16b,#8 + pmull v0.1q,v0.1d,v19.1d + pmull v5.1q,v5.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v4.16b,v4.16b,v7.16b + eor v20.16b, v0.16b,v18.16b //H^3 + eor v22.16b,v5.16b,v4.16b //H^4 + + ext v16.16b,v20.16b, v20.16b,#8 //Karatsuba pre-processing + ext v17.16b,v22.16b,v22.16b,#8 + eor v16.16b,v16.16b,v20.16b + eor v17.16b,v17.16b,v22.16b + ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed + st1 {v20.2d,v21.2d,v22.2d},[x0] //store Htable[3..5] + ret +.size gcm_init_v8,.-gcm_init_v8 +.globl gcm_gmult_v8 +.hidden gcm_gmult_v8 +.type gcm_gmult_v8,%function +.align 4 +gcm_gmult_v8: + AARCH64_VALID_CALL_TARGET + ld1 {v17.2d},[x0] //load Xi + movi v19.16b,#0xe1 + ld1 {v20.2d,v21.2d},[x1] //load twisted H, ... 
+ shl v19.2d,v19.2d,#57 +#ifndef __AARCH64EB__ + rev64 v17.16b,v17.16b +#endif + ext v3.16b,v17.16b,v17.16b,#8 + + pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo + eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing + pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi + pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b + +#ifndef __AARCH64EB__ + rev64 v0.16b,v0.16b +#endif + ext v0.16b,v0.16b,v0.16b,#8 + st1 {v0.2d},[x0] //write out Xi + + ret +.size gcm_gmult_v8,.-gcm_gmult_v8 +.globl gcm_ghash_v8 +.hidden gcm_ghash_v8 +.type gcm_ghash_v8,%function +.align 4 +gcm_ghash_v8: + AARCH64_VALID_CALL_TARGET + cmp x3,#64 + b.hs .Lgcm_ghash_v8_4x + ld1 {v0.2d},[x0] //load [rotated] Xi + //"[rotated]" means that + //loaded value would have + //to be rotated in order to + //make it appear as in + //algorithm specification + subs x3,x3,#32 //see if x3 is 32 or larger + mov x12,#16 //x12 is used as post- + //increment for input pointer; + //as loop is modulo-scheduled + //x12 is zeroed just in time + //to preclude overstepping + //inp[len], which means that + //last block[s] are actually + //loaded twice, but last + //copy is not processed + ld1 {v20.2d,v21.2d},[x1],#32 //load twisted H, ..., H^2 + movi v19.16b,#0xe1 + ld1 {v22.2d},[x1] + csel x12,xzr,x12,eq //is it time to zero x12? 
+ ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi + ld1 {v16.2d},[x2],#16 //load [rotated] I[0] + shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant +#ifndef __AARCH64EB__ + rev64 v16.16b,v16.16b + rev64 v0.16b,v0.16b +#endif + ext v3.16b,v16.16b,v16.16b,#8 //rotate I[0] + b.lo .Lodd_tail_v8 //x3 was less than 32 + ld1 {v17.2d},[x2],x12 //load [rotated] I[1] +#ifndef __AARCH64EB__ + rev64 v17.16b,v17.16b +#endif + ext v7.16b,v17.16b,v17.16b,#8 + eor v3.16b,v3.16b,v0.16b //I[i]^=Xi + pmull v4.1q,v20.1d,v7.1d //H·Ii+1 + eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing + pmull2 v6.1q,v20.2d,v7.2d + b .Loop_mod2x_v8 + +.align 4 +.Loop_mod2x_v8: + ext v18.16b,v3.16b,v3.16b,#8 + subs x3,x3,#32 //is there more data? + pmull v0.1q,v22.1d,v3.1d //H^2.lo·Xi.lo + csel x12,xzr,x12,lo //is it time to zero x12? + + pmull v5.1q,v21.1d,v17.1d + eor v18.16b,v18.16b,v3.16b //Karatsuba pre-processing + pmull2 v2.1q,v22.2d,v3.2d //H^2.hi·Xi.hi + eor v0.16b,v0.16b,v4.16b //accumulate + pmull2 v1.1q,v21.2d,v18.2d //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi) + ld1 {v16.2d},[x2],x12 //load [rotated] I[i+2] + + eor v2.16b,v2.16b,v6.16b + csel x12,xzr,x12,eq //is it time to zero x12? 
+ eor v1.16b,v1.16b,v5.16b + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + ld1 {v17.2d},[x2],x12 //load [rotated] I[i+3] +#ifndef __AARCH64EB__ + rev64 v16.16b,v16.16b +#endif + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + +#ifndef __AARCH64EB__ + rev64 v17.16b,v17.16b +#endif + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + ext v7.16b,v17.16b,v17.16b,#8 + ext v3.16b,v16.16b,v16.16b,#8 + eor v0.16b,v1.16b,v18.16b + pmull v4.1q,v20.1d,v7.1d //H·Ii+1 + eor v3.16b,v3.16b,v2.16b //accumulate v3.16b early + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v3.16b,v3.16b,v18.16b + eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing + eor v3.16b,v3.16b,v0.16b + pmull2 v6.1q,v20.2d,v7.2d + b.hs .Loop_mod2x_v8 //there was at least 32 more bytes + + eor v2.16b,v2.16b,v18.16b + ext v3.16b,v16.16b,v16.16b,#8 //re-construct v3.16b + adds x3,x3,#32 //re-construct x3 + eor v0.16b,v0.16b,v2.16b //re-construct v0.16b + b.eq .Ldone_v8 //is x3 zero? 
+.Lodd_tail_v8: + ext v18.16b,v0.16b,v0.16b,#8 + eor v3.16b,v3.16b,v0.16b //inp^=Xi + eor v17.16b,v16.16b,v18.16b //v17.16b is rotated inp^Xi + + pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo + eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing + pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi + pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b + +.Ldone_v8: +#ifndef __AARCH64EB__ + rev64 v0.16b,v0.16b +#endif + ext v0.16b,v0.16b,v0.16b,#8 + st1 {v0.2d},[x0] //write out Xi + + ret +.size gcm_ghash_v8,.-gcm_ghash_v8 +.type gcm_ghash_v8_4x,%function +.align 4 +gcm_ghash_v8_4x: +.Lgcm_ghash_v8_4x: + ld1 {v0.2d},[x0] //load [rotated] Xi + ld1 {v20.2d,v21.2d,v22.2d},[x1],#48 //load twisted H, ..., H^2 + movi v19.16b,#0xe1 + ld1 {v26.2d,v27.2d,v28.2d},[x1] //load twisted H^3, ..., H^4 + shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant + + ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64 +#ifndef __AARCH64EB__ + rev64 v0.16b,v0.16b + rev64 v5.16b,v5.16b + rev64 v6.16b,v6.16b + rev64 v7.16b,v7.16b + rev64 v4.16b,v4.16b +#endif + ext v25.16b,v7.16b,v7.16b,#8 + ext v24.16b,v6.16b,v6.16b,#8 + ext v23.16b,v5.16b,v5.16b,#8 + + pmull v29.1q,v20.1d,v25.1d //H·Ii+3 + eor v7.16b,v7.16b,v25.16b + pmull2 v31.1q,v20.2d,v25.2d + pmull v30.1q,v21.1d,v7.1d + + pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2 + eor v6.16b,v6.16b,v24.16b + pmull2 v24.1q,v22.2d,v24.2d + pmull2 v6.1q,v21.2d,v6.2d + + eor v29.16b,v29.16b,v16.16b + eor v31.16b,v31.16b,v24.16b + eor v30.16b,v30.16b,v6.16b + + pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1 + eor v5.16b,v5.16b,v23.16b + pmull2 v23.1q,v26.2d,v23.2d + pmull v5.1q,v27.1d,v5.1d + + eor 
v29.16b,v29.16b,v7.16b + eor v31.16b,v31.16b,v23.16b + eor v30.16b,v30.16b,v5.16b + + subs x3,x3,#128 + b.lo .Ltail4x + + b .Loop4x + +.align 4 +.Loop4x: + eor v16.16b,v4.16b,v0.16b + ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64 + ext v3.16b,v16.16b,v16.16b,#8 +#ifndef __AARCH64EB__ + rev64 v5.16b,v5.16b + rev64 v6.16b,v6.16b + rev64 v7.16b,v7.16b + rev64 v4.16b,v4.16b +#endif + + pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii) + eor v16.16b,v16.16b,v3.16b + pmull2 v2.1q,v28.2d,v3.2d + ext v25.16b,v7.16b,v7.16b,#8 + pmull2 v1.1q,v27.2d,v16.2d + + eor v0.16b,v0.16b,v29.16b + eor v2.16b,v2.16b,v31.16b + ext v24.16b,v6.16b,v6.16b,#8 + eor v1.16b,v1.16b,v30.16b + ext v23.16b,v5.16b,v5.16b,#8 + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + pmull v29.1q,v20.1d,v25.1d //H·Ii+3 + eor v7.16b,v7.16b,v25.16b + eor v1.16b,v1.16b,v17.16b + pmull2 v31.1q,v20.2d,v25.2d + eor v1.16b,v1.16b,v18.16b + pmull v30.1q,v21.1d,v7.1d + + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2 + eor v6.16b,v6.16b,v24.16b + pmull2 v24.1q,v22.2d,v24.2d + eor v0.16b,v1.16b,v18.16b + pmull2 v6.1q,v21.2d,v6.2d + + eor v29.16b,v29.16b,v16.16b + eor v31.16b,v31.16b,v24.16b + eor v30.16b,v30.16b,v6.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1 + eor v5.16b,v5.16b,v23.16b + eor v18.16b,v18.16b,v2.16b + pmull2 v23.1q,v26.2d,v23.2d + pmull v5.1q,v27.1d,v5.1d + + eor v0.16b,v0.16b,v18.16b + eor v29.16b,v29.16b,v7.16b + eor v31.16b,v31.16b,v23.16b + ext v0.16b,v0.16b,v0.16b,#8 + eor v30.16b,v30.16b,v5.16b + + subs x3,x3,#64 + b.hs .Loop4x + +.Ltail4x: + eor v16.16b,v4.16b,v0.16b + ext v3.16b,v16.16b,v16.16b,#8 + + pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii) + eor v16.16b,v16.16b,v3.16b + pmull2 v2.1q,v28.2d,v3.2d + pmull2 v1.1q,v27.2d,v16.2d + + eor v0.16b,v0.16b,v29.16b + eor v2.16b,v2.16b,v31.16b + eor 
v1.16b,v1.16b,v30.16b + + adds x3,x3,#64 + b.eq .Ldone4x + + cmp x3,#32 + b.lo .Lone + b.eq .Ltwo +.Lthree: + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + ld1 {v4.2d,v5.2d,v6.2d},[x2] + eor v1.16b,v1.16b,v18.16b +#ifndef __AARCH64EB__ + rev64 v5.16b,v5.16b + rev64 v6.16b,v6.16b + rev64 v4.16b,v4.16b +#endif + + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + ext v24.16b,v6.16b,v6.16b,#8 + ext v23.16b,v5.16b,v5.16b,#8 + eor v0.16b,v1.16b,v18.16b + + pmull v29.1q,v20.1d,v24.1d //H·Ii+2 + eor v6.16b,v6.16b,v24.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + pmull2 v31.1q,v20.2d,v24.2d + pmull v30.1q,v21.1d,v6.1d + eor v0.16b,v0.16b,v18.16b + pmull v7.1q,v22.1d,v23.1d //H^2·Ii+1 + eor v5.16b,v5.16b,v23.16b + ext v0.16b,v0.16b,v0.16b,#8 + + pmull2 v23.1q,v22.2d,v23.2d + eor v16.16b,v4.16b,v0.16b + pmull2 v5.1q,v21.2d,v5.2d + ext v3.16b,v16.16b,v16.16b,#8 + + eor v29.16b,v29.16b,v7.16b + eor v31.16b,v31.16b,v23.16b + eor v30.16b,v30.16b,v5.16b + + pmull v0.1q,v26.1d,v3.1d //H^3·(Xi+Ii) + eor v16.16b,v16.16b,v3.16b + pmull2 v2.1q,v26.2d,v3.2d + pmull v1.1q,v27.1d,v16.1d + + eor v0.16b,v0.16b,v29.16b + eor v2.16b,v2.16b,v31.16b + eor v1.16b,v1.16b,v30.16b + b .Ldone4x + +.align 4 +.Ltwo: + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + ld1 {v4.2d,v5.2d},[x2] + eor v1.16b,v1.16b,v18.16b +#ifndef __AARCH64EB__ + rev64 v5.16b,v5.16b + rev64 v4.16b,v4.16b +#endif + + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + ext v23.16b,v5.16b,v5.16b,#8 + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b + ext v0.16b,v0.16b,v0.16b,#8 + + pmull v29.1q,v20.1d,v23.1d 
//H·Ii+1 + eor v5.16b,v5.16b,v23.16b + + eor v16.16b,v4.16b,v0.16b + ext v3.16b,v16.16b,v16.16b,#8 + + pmull2 v31.1q,v20.2d,v23.2d + pmull v30.1q,v21.1d,v5.1d + + pmull v0.1q,v22.1d,v3.1d //H^2·(Xi+Ii) + eor v16.16b,v16.16b,v3.16b + pmull2 v2.1q,v22.2d,v3.2d + pmull2 v1.1q,v21.2d,v16.2d + + eor v0.16b,v0.16b,v29.16b + eor v2.16b,v2.16b,v31.16b + eor v1.16b,v1.16b,v30.16b + b .Ldone4x + +.align 4 +.Lone: + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + ld1 {v4.2d},[x2] + eor v1.16b,v1.16b,v18.16b +#ifndef __AARCH64EB__ + rev64 v4.16b,v4.16b +#endif + + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b + ext v0.16b,v0.16b,v0.16b,#8 + + eor v16.16b,v4.16b,v0.16b + ext v3.16b,v16.16b,v16.16b,#8 + + pmull v0.1q,v20.1d,v3.1d + eor v16.16b,v16.16b,v3.16b + pmull2 v2.1q,v20.2d,v3.2d + pmull v1.1q,v21.1d,v16.1d + +.Ldone4x: + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b + ext v0.16b,v0.16b,v0.16b,#8 + +#ifndef __AARCH64EB__ + rev64 v0.16b,v0.16b +#endif + st1 {v0.2d},[x0] //write out Xi + + ret +.size gcm_ghash_v8_4x,.-gcm_ghash_v8_4x +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) diff --git 
a/third_party/boringssl/gen/bcm/ghashv8-armv8-win.S b/third_party/boringssl/gen/bcm/ghashv8-armv8-win.S new file mode 100644 index 00000000..d4ad2b88 --- /dev/null +++ b/third_party/boringssl/gen/bcm/ghashv8-armv8-win.S @@ -0,0 +1,571 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +#if __ARM_MAX_ARCH__>=7 +.text +.arch armv8-a+crypto +.globl gcm_init_v8 + +.def gcm_init_v8 + .type 32 +.endef +.align 4 +gcm_init_v8: + AARCH64_VALID_CALL_TARGET + ld1 {v17.2d},[x1] //load input H + movi v19.16b,#0xe1 + shl v19.2d,v19.2d,#57 //0xc2.0 + ext v3.16b,v17.16b,v17.16b,#8 + ushr v18.2d,v19.2d,#63 + dup v17.4s,v17.s[1] + ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01 + ushr v18.2d,v3.2d,#63 + sshr v17.4s,v17.4s,#31 //broadcast carry bit + and v18.16b,v18.16b,v16.16b + shl v3.2d,v3.2d,#1 + ext v18.16b,v18.16b,v18.16b,#8 + and v16.16b,v16.16b,v17.16b + orr v3.16b,v3.16b,v18.16b //H<<<=1 + eor v20.16b,v3.16b,v16.16b //twisted H + st1 {v20.2d},[x0],#16 //store Htable[0] + + //calculate H^2 + ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing + pmull v0.1q,v20.1d,v20.1d + eor v16.16b,v16.16b,v20.16b + pmull2 v2.1q,v20.2d,v20.2d + pmull v1.1q,v16.1d,v16.1d + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase + + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v22.16b,v0.16b,v18.16b + + ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing + eor v17.16b,v17.16b,v22.16b + ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed + st1 {v21.2d,v22.2d},[x0],#32 //store Htable[1..2] + //calculate H^3 and H^4 + pmull v0.1q,v20.1d, v22.1d + pmull 
v5.1q,v22.1d,v22.1d + pmull2 v2.1q,v20.2d, v22.2d + pmull2 v7.1q,v22.2d,v22.2d + pmull v1.1q,v16.1d,v17.1d + pmull v6.1q,v17.1d,v17.1d + + ext v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + ext v17.16b,v5.16b,v7.16b,#8 + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v16.16b + eor v4.16b,v5.16b,v7.16b + eor v6.16b,v6.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase + eor v6.16b,v6.16b,v4.16b + pmull v4.1q,v5.1d,v19.1d + + ins v2.d[0],v1.d[1] + ins v7.d[0],v6.d[1] + ins v1.d[1],v0.d[0] + ins v6.d[1],v5.d[0] + eor v0.16b,v1.16b,v18.16b + eor v5.16b,v6.16b,v4.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase + ext v4.16b,v5.16b,v5.16b,#8 + pmull v0.1q,v0.1d,v19.1d + pmull v5.1q,v5.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v4.16b,v4.16b,v7.16b + eor v20.16b, v0.16b,v18.16b //H^3 + eor v22.16b,v5.16b,v4.16b //H^4 + + ext v16.16b,v20.16b, v20.16b,#8 //Karatsuba pre-processing + ext v17.16b,v22.16b,v22.16b,#8 + eor v16.16b,v16.16b,v20.16b + eor v17.16b,v17.16b,v22.16b + ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed + st1 {v20.2d,v21.2d,v22.2d},[x0] //store Htable[3..5] + ret + +.globl gcm_gmult_v8 + +.def gcm_gmult_v8 + .type 32 +.endef +.align 4 +gcm_gmult_v8: + AARCH64_VALID_CALL_TARGET + ld1 {v17.2d},[x0] //load Xi + movi v19.16b,#0xe1 + ld1 {v20.2d,v21.2d},[x1] //load twisted H, ... 
+ shl v19.2d,v19.2d,#57 +#ifndef __AARCH64EB__ + rev64 v17.16b,v17.16b +#endif + ext v3.16b,v17.16b,v17.16b,#8 + + pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo + eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing + pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi + pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b + +#ifndef __AARCH64EB__ + rev64 v0.16b,v0.16b +#endif + ext v0.16b,v0.16b,v0.16b,#8 + st1 {v0.2d},[x0] //write out Xi + + ret + +.globl gcm_ghash_v8 + +.def gcm_ghash_v8 + .type 32 +.endef +.align 4 +gcm_ghash_v8: + AARCH64_VALID_CALL_TARGET + cmp x3,#64 + b.hs Lgcm_ghash_v8_4x + ld1 {v0.2d},[x0] //load [rotated] Xi + //"[rotated]" means that + //loaded value would have + //to be rotated in order to + //make it appear as in + //algorithm specification + subs x3,x3,#32 //see if x3 is 32 or larger + mov x12,#16 //x12 is used as post- + //increment for input pointer; + //as loop is modulo-scheduled + //x12 is zeroed just in time + //to preclude overstepping + //inp[len], which means that + //last block[s] are actually + //loaded twice, but last + //copy is not processed + ld1 {v20.2d,v21.2d},[x1],#32 //load twisted H, ..., H^2 + movi v19.16b,#0xe1 + ld1 {v22.2d},[x1] + csel x12,xzr,x12,eq //is it time to zero x12? 
+ ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi + ld1 {v16.2d},[x2],#16 //load [rotated] I[0] + shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant +#ifndef __AARCH64EB__ + rev64 v16.16b,v16.16b + rev64 v0.16b,v0.16b +#endif + ext v3.16b,v16.16b,v16.16b,#8 //rotate I[0] + b.lo Lodd_tail_v8 //x3 was less than 32 + ld1 {v17.2d},[x2],x12 //load [rotated] I[1] +#ifndef __AARCH64EB__ + rev64 v17.16b,v17.16b +#endif + ext v7.16b,v17.16b,v17.16b,#8 + eor v3.16b,v3.16b,v0.16b //I[i]^=Xi + pmull v4.1q,v20.1d,v7.1d //H·Ii+1 + eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing + pmull2 v6.1q,v20.2d,v7.2d + b Loop_mod2x_v8 + +.align 4 +Loop_mod2x_v8: + ext v18.16b,v3.16b,v3.16b,#8 + subs x3,x3,#32 //is there more data? + pmull v0.1q,v22.1d,v3.1d //H^2.lo·Xi.lo + csel x12,xzr,x12,lo //is it time to zero x12? + + pmull v5.1q,v21.1d,v17.1d + eor v18.16b,v18.16b,v3.16b //Karatsuba pre-processing + pmull2 v2.1q,v22.2d,v3.2d //H^2.hi·Xi.hi + eor v0.16b,v0.16b,v4.16b //accumulate + pmull2 v1.1q,v21.2d,v18.2d //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi) + ld1 {v16.2d},[x2],x12 //load [rotated] I[i+2] + + eor v2.16b,v2.16b,v6.16b + csel x12,xzr,x12,eq //is it time to zero x12? 
+ eor v1.16b,v1.16b,v5.16b + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + ld1 {v17.2d},[x2],x12 //load [rotated] I[i+3] +#ifndef __AARCH64EB__ + rev64 v16.16b,v16.16b +#endif + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + +#ifndef __AARCH64EB__ + rev64 v17.16b,v17.16b +#endif + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + ext v7.16b,v17.16b,v17.16b,#8 + ext v3.16b,v16.16b,v16.16b,#8 + eor v0.16b,v1.16b,v18.16b + pmull v4.1q,v20.1d,v7.1d //H·Ii+1 + eor v3.16b,v3.16b,v2.16b //accumulate v3.16b early + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v3.16b,v3.16b,v18.16b + eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing + eor v3.16b,v3.16b,v0.16b + pmull2 v6.1q,v20.2d,v7.2d + b.hs Loop_mod2x_v8 //there was at least 32 more bytes + + eor v2.16b,v2.16b,v18.16b + ext v3.16b,v16.16b,v16.16b,#8 //re-construct v3.16b + adds x3,x3,#32 //re-construct x3 + eor v0.16b,v0.16b,v2.16b //re-construct v0.16b + b.eq Ldone_v8 //is x3 zero? 
+Lodd_tail_v8: + ext v18.16b,v0.16b,v0.16b,#8 + eor v3.16b,v3.16b,v0.16b //inp^=Xi + eor v17.16b,v16.16b,v18.16b //v17.16b is rotated inp^Xi + + pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo + eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing + pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi + pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b + +Ldone_v8: +#ifndef __AARCH64EB__ + rev64 v0.16b,v0.16b +#endif + ext v0.16b,v0.16b,v0.16b,#8 + st1 {v0.2d},[x0] //write out Xi + + ret + +.def gcm_ghash_v8_4x + .type 32 +.endef +.align 4 +gcm_ghash_v8_4x: +Lgcm_ghash_v8_4x: + ld1 {v0.2d},[x0] //load [rotated] Xi + ld1 {v20.2d,v21.2d,v22.2d},[x1],#48 //load twisted H, ..., H^2 + movi v19.16b,#0xe1 + ld1 {v26.2d,v27.2d,v28.2d},[x1] //load twisted H^3, ..., H^4 + shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant + + ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64 +#ifndef __AARCH64EB__ + rev64 v0.16b,v0.16b + rev64 v5.16b,v5.16b + rev64 v6.16b,v6.16b + rev64 v7.16b,v7.16b + rev64 v4.16b,v4.16b +#endif + ext v25.16b,v7.16b,v7.16b,#8 + ext v24.16b,v6.16b,v6.16b,#8 + ext v23.16b,v5.16b,v5.16b,#8 + + pmull v29.1q,v20.1d,v25.1d //H·Ii+3 + eor v7.16b,v7.16b,v25.16b + pmull2 v31.1q,v20.2d,v25.2d + pmull v30.1q,v21.1d,v7.1d + + pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2 + eor v6.16b,v6.16b,v24.16b + pmull2 v24.1q,v22.2d,v24.2d + pmull2 v6.1q,v21.2d,v6.2d + + eor v29.16b,v29.16b,v16.16b + eor v31.16b,v31.16b,v24.16b + eor v30.16b,v30.16b,v6.16b + + pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1 + eor v5.16b,v5.16b,v23.16b + pmull2 v23.1q,v26.2d,v23.2d + pmull v5.1q,v27.1d,v5.1d + + eor v29.16b,v29.16b,v7.16b + 
eor v31.16b,v31.16b,v23.16b + eor v30.16b,v30.16b,v5.16b + + subs x3,x3,#128 + b.lo Ltail4x + + b Loop4x + +.align 4 +Loop4x: + eor v16.16b,v4.16b,v0.16b + ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64 + ext v3.16b,v16.16b,v16.16b,#8 +#ifndef __AARCH64EB__ + rev64 v5.16b,v5.16b + rev64 v6.16b,v6.16b + rev64 v7.16b,v7.16b + rev64 v4.16b,v4.16b +#endif + + pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii) + eor v16.16b,v16.16b,v3.16b + pmull2 v2.1q,v28.2d,v3.2d + ext v25.16b,v7.16b,v7.16b,#8 + pmull2 v1.1q,v27.2d,v16.2d + + eor v0.16b,v0.16b,v29.16b + eor v2.16b,v2.16b,v31.16b + ext v24.16b,v6.16b,v6.16b,#8 + eor v1.16b,v1.16b,v30.16b + ext v23.16b,v5.16b,v5.16b,#8 + + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + pmull v29.1q,v20.1d,v25.1d //H·Ii+3 + eor v7.16b,v7.16b,v25.16b + eor v1.16b,v1.16b,v17.16b + pmull2 v31.1q,v20.2d,v25.2d + eor v1.16b,v1.16b,v18.16b + pmull v30.1q,v21.1d,v7.1d + + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2 + eor v6.16b,v6.16b,v24.16b + pmull2 v24.1q,v22.2d,v24.2d + eor v0.16b,v1.16b,v18.16b + pmull2 v6.1q,v21.2d,v6.2d + + eor v29.16b,v29.16b,v16.16b + eor v31.16b,v31.16b,v24.16b + eor v30.16b,v30.16b,v6.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1 + eor v5.16b,v5.16b,v23.16b + eor v18.16b,v18.16b,v2.16b + pmull2 v23.1q,v26.2d,v23.2d + pmull v5.1q,v27.1d,v5.1d + + eor v0.16b,v0.16b,v18.16b + eor v29.16b,v29.16b,v7.16b + eor v31.16b,v31.16b,v23.16b + ext v0.16b,v0.16b,v0.16b,#8 + eor v30.16b,v30.16b,v5.16b + + subs x3,x3,#64 + b.hs Loop4x + +Ltail4x: + eor v16.16b,v4.16b,v0.16b + ext v3.16b,v16.16b,v16.16b,#8 + + pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii) + eor v16.16b,v16.16b,v3.16b + pmull2 v2.1q,v28.2d,v3.2d + pmull2 v1.1q,v27.2d,v16.2d + + eor v0.16b,v0.16b,v29.16b + eor v2.16b,v2.16b,v31.16b + eor v1.16b,v1.16b,v30.16b + + adds x3,x3,#64 + b.eq 
Ldone4x + + cmp x3,#32 + b.lo Lone + b.eq Ltwo +Lthree: + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + ld1 {v4.2d,v5.2d,v6.2d},[x2] + eor v1.16b,v1.16b,v18.16b +#ifndef __AARCH64EB__ + rev64 v5.16b,v5.16b + rev64 v6.16b,v6.16b + rev64 v4.16b,v4.16b +#endif + + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + ext v24.16b,v6.16b,v6.16b,#8 + ext v23.16b,v5.16b,v5.16b,#8 + eor v0.16b,v1.16b,v18.16b + + pmull v29.1q,v20.1d,v24.1d //H·Ii+2 + eor v6.16b,v6.16b,v24.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + pmull2 v31.1q,v20.2d,v24.2d + pmull v30.1q,v21.1d,v6.1d + eor v0.16b,v0.16b,v18.16b + pmull v7.1q,v22.1d,v23.1d //H^2·Ii+1 + eor v5.16b,v5.16b,v23.16b + ext v0.16b,v0.16b,v0.16b,#8 + + pmull2 v23.1q,v22.2d,v23.2d + eor v16.16b,v4.16b,v0.16b + pmull2 v5.1q,v21.2d,v5.2d + ext v3.16b,v16.16b,v16.16b,#8 + + eor v29.16b,v29.16b,v7.16b + eor v31.16b,v31.16b,v23.16b + eor v30.16b,v30.16b,v5.16b + + pmull v0.1q,v26.1d,v3.1d //H^3·(Xi+Ii) + eor v16.16b,v16.16b,v3.16b + pmull2 v2.1q,v26.2d,v3.2d + pmull v1.1q,v27.1d,v16.1d + + eor v0.16b,v0.16b,v29.16b + eor v2.16b,v2.16b,v31.16b + eor v1.16b,v1.16b,v30.16b + b Ldone4x + +.align 4 +Ltwo: + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + ld1 {v4.2d,v5.2d},[x2] + eor v1.16b,v1.16b,v18.16b +#ifndef __AARCH64EB__ + rev64 v5.16b,v5.16b + rev64 v4.16b,v4.16b +#endif + + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + ext v23.16b,v5.16b,v5.16b,#8 + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b + ext v0.16b,v0.16b,v0.16b,#8 + + pmull v29.1q,v20.1d,v23.1d //H·Ii+1 + eor v5.16b,v5.16b,v23.16b + + eor 
v16.16b,v4.16b,v0.16b + ext v3.16b,v16.16b,v16.16b,#8 + + pmull2 v31.1q,v20.2d,v23.2d + pmull v30.1q,v21.1d,v5.1d + + pmull v0.1q,v22.1d,v3.1d //H^2·(Xi+Ii) + eor v16.16b,v16.16b,v3.16b + pmull2 v2.1q,v22.2d,v3.2d + pmull2 v1.1q,v21.2d,v16.2d + + eor v0.16b,v0.16b,v29.16b + eor v2.16b,v2.16b,v31.16b + eor v1.16b,v1.16b,v30.16b + b Ldone4x + +.align 4 +Lone: + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + ld1 {v4.2d},[x2] + eor v1.16b,v1.16b,v18.16b +#ifndef __AARCH64EB__ + rev64 v4.16b,v4.16b +#endif + + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b + ext v0.16b,v0.16b,v0.16b,#8 + + eor v16.16b,v4.16b,v0.16b + ext v3.16b,v16.16b,v16.16b,#8 + + pmull v0.1q,v20.1d,v3.1d + eor v16.16b,v16.16b,v3.16b + pmull2 v2.1q,v20.2d,v3.2d + pmull v1.1q,v21.1d,v16.1d + +Ldone4x: + ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing + eor v18.16b,v0.16b,v2.16b + eor v1.16b,v1.16b,v17.16b + eor v1.16b,v1.16b,v18.16b + + pmull v18.1q,v0.1d,v19.1d //1st phase of reduction + ins v2.d[0],v1.d[1] + ins v1.d[1],v0.d[0] + eor v0.16b,v1.16b,v18.16b + + ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction + pmull v0.1q,v0.1d,v19.1d + eor v18.16b,v18.16b,v2.16b + eor v0.16b,v0.16b,v18.16b + ext v0.16b,v0.16b,v0.16b,#8 + +#ifndef __AARCH64EB__ + rev64 v0.16b,v0.16b +#endif + st1 {v0.2d},[x0] //write out Xi + + ret + +.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) diff --git a/third_party/boringssl/gen/bcm/p256-armv8-asm-apple.S 
b/third_party/boringssl/gen/bcm/p256-armv8-asm-apple.S new file mode 100644 index 00000000..283240fb --- /dev/null +++ b/third_party/boringssl/gen/bcm/p256-armv8-asm-apple.S @@ -0,0 +1,1724 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) +.section __TEXT,__const +.align 5 +Lpoly: +.quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001 +LRR: // 2^512 mod P precomputed for NIST P256 polynomial +.quad 0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd +Lone_mont: +.quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe +Lone: +.quad 1,0,0,0 +Lord: +.quad 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000 +LordK: +.quad 0xccd1c8aaee00bc4f +.byte 69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.text + +// void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4], +// const BN_ULONG x2[4]); +.globl _ecp_nistz256_mul_mont +.private_extern _ecp_nistz256_mul_mont + +.align 4 +_ecp_nistz256_mul_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-32]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + + ldr x3,[x2] // bp[0] + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + adrp x13,Lpoly@PAGE + add x13,x13,Lpoly@PAGEOFF + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + bl __ecp_nistz256_mul_mont + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl _ecp_nistz256_sqr_mont +.private_extern _ecp_nistz256_sqr_mont + +.align 4 +_ecp_nistz256_sqr_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-32]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + adrp x13,Lpoly@PAGE + add x13,x13,Lpoly@PAGEOFF + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + bl __ecp_nistz256_sqr_mont + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// void ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl _ecp_nistz256_div_by_2 +.private_extern _ecp_nistz256_div_by_2 + +.align 4 +_ecp_nistz256_div_by_2: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ldp x14,x15,[x1] + ldp x16,x17,[x1,#16] + adrp x13,Lpoly@PAGE + add x13,x13,Lpoly@PAGEOFF + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + bl __ecp_nistz256_div_by_2 + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// void ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl _ecp_nistz256_mul_by_2 +.private_extern _ecp_nistz256_mul_by_2 + +.align 4 +_ecp_nistz256_mul_by_2: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ldp x14,x15,[x1] + ldp x16,x17,[x1,#16] + adrp x13,Lpoly@PAGE + add x13,x13,Lpoly@PAGEOFF + ldr x12,[x13,#8] + ldr x13,[x13,#24] + mov x8,x14 + mov x9,x15 + mov x10,x16 + mov x11,x17 + + bl __ecp_nistz256_add_to // ret = a+a // 2*a + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// void ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl _ecp_nistz256_mul_by_3 +.private_extern _ecp_nistz256_mul_by_3 + +.align 4 +_ecp_nistz256_mul_by_3: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + ldp x14,x15,[x1] + ldp x16,x17,[x1,#16] + adrp x13,Lpoly@PAGE + add x13,x13,Lpoly@PAGEOFF + ldr x12,[x13,#8] + ldr x13,[x13,#24] + mov x8,x14 + mov x9,x15 + mov x10,x16 + mov x11,x17 + mov x4,x14 + mov x5,x15 + mov x6,x16 + mov x7,x17 + + bl __ecp_nistz256_add_to // ret = a+a // 2*a + + mov x8,x4 + mov x9,x5 + mov x10,x6 + mov x11,x7 + + bl __ecp_nistz256_add_to // ret += a // 2*a+a=3*a + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// void ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4], +// const BN_ULONG x2[4]); +.globl _ecp_nistz256_sub +.private_extern _ecp_nistz256_sub + +.align 4 +_ecp_nistz256_sub: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ldp x14,x15,[x1] + ldp x16,x17,[x1,#16] + adrp x13,Lpoly@PAGE + add x13,x13,Lpoly@PAGEOFF + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + bl __ecp_nistz256_sub_from + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl _ecp_nistz256_neg +.private_extern _ecp_nistz256_neg + +.align 4 +_ecp_nistz256_neg: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + mov x2,x1 + mov x14,xzr // a = 0 + mov x15,xzr + mov x16,xzr + mov x17,xzr + adrp x13,Lpoly@PAGE + add x13,x13,Lpoly@PAGEOFF + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + bl __ecp_nistz256_sub_from + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded +// to x4-x7 and b[0] - to x3 + +.align 4 +__ecp_nistz256_mul_mont: + mul x14,x4,x3 // a[0]*b[0] + umulh x8,x4,x3 + + mul x15,x5,x3 // a[1]*b[0] + umulh x9,x5,x3 + + mul x16,x6,x3 // a[2]*b[0] + umulh x10,x6,x3 + + mul x17,x7,x3 // a[3]*b[0] + umulh x11,x7,x3 + ldr x3,[x2,#8] // b[1] + + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adc x19,xzr,x11 + mov x20,xzr + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + mul x8,x4,x3 // lo(a[0]*b[i]) + adcs x15,x16,x9 + mul x9,x5,x3 // lo(a[1]*b[i]) + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + mul x10,x6,x3 // lo(a[2]*b[i]) + adcs x17,x19,x11 + mul x11,x7,x3 // lo(a[3]*b[i]) + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts of multiplication + umulh x8,x4,x3 // hi(a[0]*b[i]) + adcs x15,x15,x9 + umulh x9,x5,x3 // hi(a[1]*b[i]) + adcs x16,x16,x10 + umulh x10,x6,x3 // hi(a[2]*b[i]) + adcs x17,x17,x11 + umulh x11,x7,x3 // hi(a[3]*b[i]) + adc x19,x19,xzr + ldr x3,[x2,#8*(1+1)] // b[1+1] + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + mul x8,x4,x3 // lo(a[0]*b[i]) + adcs x15,x16,x9 + mul x9,x5,x3 // lo(a[1]*b[i]) + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + mul x10,x6,x3 // lo(a[2]*b[i]) + adcs x17,x19,x11 + mul x11,x7,x3 // lo(a[3]*b[i]) + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts of multiplication 
+ umulh x8,x4,x3 // hi(a[0]*b[i]) + adcs x15,x15,x9 + umulh x9,x5,x3 // hi(a[1]*b[i]) + adcs x16,x16,x10 + umulh x10,x6,x3 // hi(a[2]*b[i]) + adcs x17,x17,x11 + umulh x11,x7,x3 // hi(a[3]*b[i]) + adc x19,x19,xzr + ldr x3,[x2,#8*(2+1)] // b[2+1] + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + mul x8,x4,x3 // lo(a[0]*b[i]) + adcs x15,x16,x9 + mul x9,x5,x3 // lo(a[1]*b[i]) + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + mul x10,x6,x3 // lo(a[2]*b[i]) + adcs x17,x19,x11 + mul x11,x7,x3 // lo(a[3]*b[i]) + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts of multiplication + umulh x8,x4,x3 // hi(a[0]*b[i]) + adcs x15,x15,x9 + umulh x9,x5,x3 // hi(a[1]*b[i]) + adcs x16,x16,x10 + umulh x10,x6,x3 // hi(a[2]*b[i]) + adcs x17,x17,x11 + umulh x11,x7,x3 // hi(a[3]*b[i]) + adc x19,x19,xzr + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + // last reduction + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + adcs x17,x19,x11 + adc x19,x20,xzr + + adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus + sbcs x9,x15,x12 + sbcs x10,x16,xzr + sbcs x11,x17,x13 + sbcs xzr,x19,xzr // did it borrow? + + csel x14,x14,x8,lo // ret = borrow ? 
ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ret + + +// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded +// to x4-x7 + +.align 4 +__ecp_nistz256_sqr_mont: + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is , i.e. follow + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul x15,x5,x4 // a[1]*a[0] + umulh x9,x5,x4 + mul x16,x6,x4 // a[2]*a[0] + umulh x10,x6,x4 + mul x17,x7,x4 // a[3]*a[0] + umulh x19,x7,x4 + + adds x16,x16,x9 // accumulate high parts of multiplication + mul x8,x6,x5 // a[2]*a[1] + umulh x9,x6,x5 + adcs x17,x17,x10 + mul x10,x7,x5 // a[3]*a[1] + umulh x11,x7,x5 + adc x19,x19,xzr // can't overflow + + mul x20,x7,x6 // a[3]*a[2] + umulh x1,x7,x6 + + adds x9,x9,x10 // accumulate high parts of multiplication + mul x14,x4,x4 // a[0]*a[0] + adc x10,x11,xzr // can't overflow + + adds x17,x17,x8 // accumulate low parts of multiplication + umulh x4,x4,x4 + adcs x19,x19,x9 + mul x9,x5,x5 // a[1]*a[1] + adcs x20,x20,x10 + umulh x5,x5,x5 + adc x1,x1,xzr // can't overflow + + adds x15,x15,x15 // acc[1-6]*=2 + mul x10,x6,x6 // a[2]*a[2] + adcs x16,x16,x16 + umulh x6,x6,x6 + adcs x17,x17,x17 + mul x11,x7,x7 // a[3]*a[3] + adcs x19,x19,x19 + umulh x7,x7,x7 + adcs x20,x20,x20 + adcs x1,x1,x1 + adc x2,xzr,xzr + + adds x15,x15,x4 // +a[i]*a[i] + adcs x16,x16,x9 + adcs x17,x17,x5 + adcs x19,x19,x10 + adcs x20,x20,x6 + lsl x8,x14,#32 + adcs x1,x1,x11 + lsr x9,x14,#32 + adc x2,x2,x7 + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + lsl x8,x14,#32 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + 
lsr x9,x14,#32 + adc x17,x11,xzr // can't overflow + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + lsl x8,x14,#32 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + lsr x9,x14,#32 + adc x17,x11,xzr // can't overflow + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + lsl x8,x14,#32 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + lsr x9,x14,#32 + adc x17,x11,xzr // can't overflow + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + adc x17,x11,xzr // can't overflow + + adds x14,x14,x19 // accumulate upper half + adcs x15,x15,x20 + adcs x16,x16,x1 + adcs x17,x17,x2 + adc x19,xzr,xzr + + adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus + sbcs x9,x15,x12 + sbcs x10,x16,xzr + sbcs x11,x17,x13 + sbcs xzr,x19,xzr // did it borrow? + + csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ret + + +// Note that __ecp_nistz256_add_to expects both input vectors pre-loaded to +// x4-x7 and x8-x11. This is done because it's used in multiple +// contexts, e.g. in multiplication by 2 and 3... + +.align 4 +__ecp_nistz256_add_to: + adds x14,x14,x8 // ret = a+b + adcs x15,x15,x9 + adcs x16,x16,x10 + adcs x17,x17,x11 + adc x1,xzr,xzr // zap x1 + + adds x8,x14,#1 // subs x8,x4,#-1 // tmp = ret-modulus + sbcs x9,x15,x12 + sbcs x10,x16,xzr + sbcs x11,x17,x13 + sbcs xzr,x1,xzr // did subtraction borrow? + + csel x14,x14,x8,lo // ret = borrow ? 
ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ret + + + +.align 4 +__ecp_nistz256_sub_from: + ldp x8,x9,[x2] + ldp x10,x11,[x2,#16] + subs x14,x14,x8 // ret = a-b + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbcs x17,x17,x11 + sbc x1,xzr,xzr // zap x1 + + subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus + adcs x9,x15,x12 + adcs x10,x16,xzr + adc x11,x17,x13 + cmp x1,xzr // did subtraction borrow? + + csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret + csel x15,x15,x9,eq + csel x16,x16,x10,eq + stp x14,x15,[x0] + csel x17,x17,x11,eq + stp x16,x17,[x0,#16] + + ret + + + +.align 4 +__ecp_nistz256_sub_morf: + ldp x8,x9,[x2] + ldp x10,x11,[x2,#16] + subs x14,x8,x14 // ret = b-a + sbcs x15,x9,x15 + sbcs x16,x10,x16 + sbcs x17,x11,x17 + sbc x1,xzr,xzr // zap x1 + + subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus + adcs x9,x15,x12 + adcs x10,x16,xzr + adc x11,x17,x13 + cmp x1,xzr // did subtraction borrow? + + csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret + csel x15,x15,x9,eq + csel x16,x16,x10,eq + stp x14,x15,[x0] + csel x17,x17,x11,eq + stp x16,x17,[x0,#16] + + ret + + + +.align 4 +__ecp_nistz256_div_by_2: + subs x8,x14,#1 // adds x8,x4,#-1 // tmp = a+modulus + adcs x9,x15,x12 + adcs x10,x16,xzr + adcs x11,x17,x13 + adc x1,xzr,xzr // zap x1 + tst x14,#1 // is a even? + + csel x14,x14,x8,eq // ret = even ? a : a+modulus + csel x15,x15,x9,eq + csel x16,x16,x10,eq + csel x17,x17,x11,eq + csel x1,xzr,x1,eq + + lsr x14,x14,#1 // ret >>= 1 + orr x14,x14,x15,lsl#63 + lsr x15,x15,#1 + orr x15,x15,x16,lsl#63 + lsr x16,x16,#1 + orr x16,x16,x17,lsl#63 + lsr x17,x17,#1 + stp x14,x15,[x0] + orr x17,x17,x1,lsl#63 + stp x16,x17,[x0,#16] + + ret + +.globl _ecp_nistz256_point_double +.private_extern _ecp_nistz256_point_double + +.align 5 +_ecp_nistz256_point_double: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + sub sp,sp,#32*4 + +Ldouble_shortcut: + ldp x14,x15,[x1,#32] + mov x21,x0 + ldp x16,x17,[x1,#48] + mov x22,x1 + adrp x13,Lpoly@PAGE + add x13,x13,Lpoly@PAGEOFF + ldr x12,[x13,#8] + mov x8,x14 + ldr x13,[x13,#24] + mov x9,x15 + ldp x4,x5,[x22,#64] // forward load for p256_sqr_mont + mov x10,x16 + mov x11,x17 + ldp x6,x7,[x22,#64+16] + add x0,sp,#0 + bl __ecp_nistz256_add_to // p256_mul_by_2(S, in_y); + + add x0,sp,#64 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z); + + ldp x8,x9,[x22] + ldp x10,x11,[x22,#16] + mov x4,x14 // put Zsqr aside for p256_sub + mov x5,x15 + mov x6,x16 + mov x7,x17 + add x0,sp,#32 + bl __ecp_nistz256_add_to // p256_add(M, Zsqr, in_x); + + add x2,x22,#0 + mov x14,x4 // restore Zsqr + mov x15,x5 + ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont + mov x16,x6 + mov x17,x7 + ldp x6,x7,[sp,#0+16] + add x0,sp,#64 + bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr); + + add x0,sp,#0 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S); + + ldr x3,[x22,#32] + ldp x4,x5,[x22,#64] + ldp x6,x7,[x22,#64+16] + add x2,x22,#32 + add x0,sp,#96 + bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y); + + mov x8,x14 + mov x9,x15 + ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont + mov x10,x16 + mov x11,x17 + ldp x6,x7,[sp,#0+16] + add x0,x21,#64 + bl __ecp_nistz256_add_to // p256_mul_by_2(res_z, tmp0); + + add x0,sp,#96 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S); + + ldr x3,[sp,#64] // forward load for p256_mul_mont + ldp x4,x5,[sp,#32] + ldp x6,x7,[sp,#32+16] + add x0,x21,#32 + bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0); + + add x2,sp,#64 + add x0,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr); + + mov x8,x14 // duplicate M + mov x9,x15 + mov x10,x16 + mov x11,x17 + mov x4,x14 // put M aside + mov x5,x15 + mov x6,x16 + mov x7,x17 + add x0,sp,#32 + bl __ecp_nistz256_add_to + mov x8,x4 // restore M + mov x9,x5 + ldr x3,[x22] 
// forward load for p256_mul_mont + mov x10,x6 + ldp x4,x5,[sp,#0] + mov x11,x7 + ldp x6,x7,[sp,#0+16] + bl __ecp_nistz256_add_to // p256_mul_by_3(M, M); + + add x2,x22,#0 + add x0,sp,#0 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x); + + mov x8,x14 + mov x9,x15 + ldp x4,x5,[sp,#32] // forward load for p256_sqr_mont + mov x10,x16 + mov x11,x17 + ldp x6,x7,[sp,#32+16] + add x0,sp,#96 + bl __ecp_nistz256_add_to // p256_mul_by_2(tmp0, S); + + add x0,x21,#0 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M); + + add x2,sp,#96 + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0); + + add x2,sp,#0 + add x0,sp,#0 + bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x); + + ldr x3,[sp,#32] + mov x4,x14 // copy S + mov x5,x15 + mov x6,x16 + mov x7,x17 + add x2,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M); + + add x2,x21,#32 + add x0,x21,#32 + bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y); + + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl _ecp_nistz256_point_add +.private_extern _ecp_nistz256_point_add + +.align 5 +_ecp_nistz256_point_add: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#32*12 + + ldp x4,x5,[x2,#64] // in2_z + ldp x6,x7,[x2,#64+16] + mov x21,x0 + mov x22,x1 + mov x23,x2 + adrp x13,Lpoly@PAGE + add x13,x13,Lpoly@PAGEOFF + ldr x12,[x13,#8] + ldr x13,[x13,#24] + orr x8,x4,x5 + orr x10,x6,x7 + orr x25,x8,x10 + cmp x25,#0 + csetm x25,ne // ~in2infty + add x0,sp,#192 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z); + + ldp x4,x5,[x22,#64] // in1_z + ldp x6,x7,[x22,#64+16] + orr x8,x4,x5 + orr x10,x6,x7 + orr x24,x8,x10 + cmp x24,#0 + csetm x24,ne // ~in1infty + add x0,sp,#128 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); + + ldr x3,[x23,#64] + ldp x4,x5,[sp,#192] + ldp x6,x7,[sp,#192+16] + add x2,x23,#64 + add x0,sp,#320 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, Z2sqr, in2_z); + + ldr x3,[x22,#64] + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x2,x22,#64 + add x0,sp,#352 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); + + ldr x3,[x22,#32] + ldp x4,x5,[sp,#320] + ldp x6,x7,[sp,#320+16] + add x2,x22,#32 + add x0,sp,#320 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, S1, in1_y); + + ldr x3,[x23,#32] + ldp x4,x5,[sp,#352] + ldp x6,x7,[sp,#352+16] + add x2,x23,#32 + add x0,sp,#352 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); + + add x2,sp,#320 + ldr x3,[sp,#192] // forward load for p256_mul_mont + ldp x4,x5,[x22] + ldp x6,x7,[x22,#16] + add x0,sp,#160 + bl __ecp_nistz256_sub_from // p256_sub(R, S2, S1); + + orr x14,x14,x15 // see if result is zero + orr x16,x16,x17 + orr x26,x14,x16 // ~is_equal(S1,S2) + + add x2,sp,#192 + add x0,sp,#256 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U1, in1_x, Z2sqr); + + ldr x3,[sp,#128] + ldp x4,x5,[x23] + ldp x6,x7,[x23,#16] + add x2,sp,#128 + add x0,sp,#288 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in2_x, Z1sqr); + + add x2,sp,#256 + ldp x4,x5,[sp,#160] // forward load for 
p256_sqr_mont + ldp x6,x7,[sp,#160+16] + add x0,sp,#96 + bl __ecp_nistz256_sub_from // p256_sub(H, U2, U1); + + orr x14,x14,x15 // see if result is zero + orr x16,x16,x17 + orr x14,x14,x16 // ~is_equal(U1,U2) + + mvn x27,x24 // -1/0 -> 0/-1 + mvn x28,x25 // -1/0 -> 0/-1 + orr x14,x14,x27 + orr x14,x14,x28 + orr x14,x14,x26 + cbnz x14,Ladd_proceed // if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2)) + +Ladd_double: + mov x1,x22 + mov x0,x21 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + add sp,sp,#256 // #256 is from #32*(12-4). difference in stack frames + b Ldouble_shortcut + +.align 4 +Ladd_proceed: + add x0,sp,#192 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); + + ldr x3,[x22,#64] + ldp x4,x5,[sp,#96] + ldp x6,x7,[sp,#96+16] + add x2,x22,#64 + add x0,sp,#64 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); + + ldp x4,x5,[sp,#96] + ldp x6,x7,[sp,#96+16] + add x0,sp,#128 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); + + ldr x3,[x23,#64] + ldp x4,x5,[sp,#64] + ldp x6,x7,[sp,#64+16] + add x2,x23,#64 + add x0,sp,#64 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, res_z, in2_z); + + ldr x3,[sp,#96] + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x2,sp,#96 + add x0,sp,#224 + bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); + + ldr x3,[sp,#128] + ldp x4,x5,[sp,#256] + ldp x6,x7,[sp,#256+16] + add x2,sp,#128 + add x0,sp,#288 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, U1, Hsqr); + + mov x8,x14 + mov x9,x15 + mov x10,x16 + mov x11,x17 + add x0,sp,#128 + bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2); + + add x2,sp,#192 + add x0,sp,#0 + bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); + + add x2,sp,#224 + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); + + add x2,sp,#288 + ldr x3,[sp,#224] // forward load for p256_mul_mont + ldp x4,x5,[sp,#320] + ldp x6,x7,[sp,#320+16] + add x0,sp,#32 + bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); + + 
add x2,sp,#224 + add x0,sp,#352 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S1, Hcub); + + ldr x3,[sp,#160] + ldp x4,x5,[sp,#32] + ldp x6,x7,[sp,#32+16] + add x2,sp,#160 + add x0,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); + + add x2,sp,#352 + bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); + + ldp x4,x5,[sp,#0] // res + ldp x6,x7,[sp,#0+16] + ldp x8,x9,[x23] // in2 + ldp x10,x11,[x23,#16] + ldp x14,x15,[x22,#0] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#0+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+0+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + ldp x6,x7,[sp,#0+0+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#0+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#0+48] + stp x14,x15,[x21,#0] + stp x16,x17,[x21,#0+16] + ldp x14,x15,[x22,#32] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#32+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+32+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + ldp x6,x7,[sp,#0+32+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#32+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#32+48] + stp x14,x15,[x21,#32] + stp x16,x17,[x21,#32+16] + ldp x14,x15,[x22,#64] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#64+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? 
+ csel x14,x8,x14,ne + csel x15,x9,x15,ne + csel x16,x10,x16,ne + csel x17,x11,x17,ne + stp x14,x15,[x21,#64] + stp x16,x17,[x21,#64+16] + +Ladd_done: + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl _ecp_nistz256_point_add_affine +.private_extern _ecp_nistz256_point_add_affine + +.align 5 +_ecp_nistz256_point_add_affine: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-80]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + sub sp,sp,#32*10 + + mov x21,x0 + mov x22,x1 + mov x23,x2 + adrp x13,Lpoly@PAGE + add x13,x13,Lpoly@PAGEOFF + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + ldp x4,x5,[x1,#64] // in1_z + ldp x6,x7,[x1,#64+16] + orr x8,x4,x5 + orr x10,x6,x7 + orr x24,x8,x10 + cmp x24,#0 + csetm x24,ne // ~in1infty + + ldp x14,x15,[x2] // in2_x + ldp x16,x17,[x2,#16] + ldp x8,x9,[x2,#32] // in2_y + ldp x10,x11,[x2,#48] + orr x14,x14,x15 + orr x16,x16,x17 + orr x8,x8,x9 + orr x10,x10,x11 + orr x14,x14,x16 + orr x8,x8,x10 + orr x25,x14,x8 + cmp x25,#0 + csetm x25,ne // ~in2infty + + add x0,sp,#128 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); + + mov x4,x14 + mov x5,x15 + mov x6,x16 + mov x7,x17 + ldr x3,[x23] + add x2,x23,#0 + add x0,sp,#96 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, Z1sqr, in2_x); + + add x2,x22,#0 + ldr x3,[x22,#64] // forward load for p256_mul_mont + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x0,sp,#160 + bl __ecp_nistz256_sub_from // p256_sub(H, U2, in1_x); + + add x2,x22,#64 + add x0,sp,#128 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); + + ldr x3,[x22,#64] + ldp x4,x5,[sp,#160] + ldp x6,x7,[sp,#160+16] + add x2,x22,#64 + add x0,sp,#64 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); + + ldr x3,[x23,#32] + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add 
x2,x23,#32 + add x0,sp,#128 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); + + add x2,x22,#32 + ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont + ldp x6,x7,[sp,#160+16] + add x0,sp,#192 + bl __ecp_nistz256_sub_from // p256_sub(R, S2, in1_y); + + add x0,sp,#224 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); + + ldp x4,x5,[sp,#192] + ldp x6,x7,[sp,#192+16] + add x0,sp,#288 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); + + ldr x3,[sp,#160] + ldp x4,x5,[sp,#224] + ldp x6,x7,[sp,#224+16] + add x2,sp,#160 + add x0,sp,#256 + bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); + + ldr x3,[x22] + ldp x4,x5,[sp,#224] + ldp x6,x7,[sp,#224+16] + add x2,x22,#0 + add x0,sp,#96 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in1_x, Hsqr); + + mov x8,x14 + mov x9,x15 + mov x10,x16 + mov x11,x17 + add x0,sp,#224 + bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2); + + add x2,sp,#288 + add x0,sp,#0 + bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); + + add x2,sp,#256 + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); + + add x2,sp,#96 + ldr x3,[x22,#32] // forward load for p256_mul_mont + ldp x4,x5,[sp,#256] + ldp x6,x7,[sp,#256+16] + add x0,sp,#32 + bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); + + add x2,x22,#32 + add x0,sp,#128 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, in1_y, Hcub); + + ldr x3,[sp,#192] + ldp x4,x5,[sp,#32] + ldp x6,x7,[sp,#32+16] + add x2,sp,#192 + add x0,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); + + add x2,sp,#128 + bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); + + ldp x4,x5,[sp,#0] // res + ldp x6,x7,[sp,#0+16] + ldp x8,x9,[x23] // in2 + ldp x10,x11,[x23,#16] + ldp x14,x15,[x22,#0] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#0+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+0+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? 
+ ldp x6,x7,[sp,#0+0+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#0+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#0+48] + stp x14,x15,[x21,#0] + stp x16,x17,[x21,#0+16] + adrp x23,Lone_mont@PAGE-64 + add x23,x23,Lone_mont@PAGEOFF-64 + ldp x14,x15,[x22,#32] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#32+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+32+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + ldp x6,x7,[sp,#0+32+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#32+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#32+48] + stp x14,x15,[x21,#32] + stp x16,x17,[x21,#32+16] + ldp x14,x15,[x22,#64] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#64+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + csel x14,x8,x14,ne + csel x15,x9,x15,ne + csel x16,x10,x16,ne + csel x17,x11,x17,ne + stp x14,x15,[x21,#64] + stp x16,x17,[x21,#64+16] + + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x29,x30,[sp],#80 + AARCH64_VALIDATE_LINK_REGISTER + ret + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4], +// uint64_t b[4]); +.globl _ecp_nistz256_ord_mul_mont +.private_extern _ecp_nistz256_ord_mul_mont + +.align 4 +_ecp_nistz256_ord_mul_mont: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + adrp x23,Lord@PAGE + add x23,x23,Lord@PAGEOFF + ldr x3,[x2] // bp[0] + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + + ldp x12,x13,[x23,#0] + ldp x21,x22,[x23,#16] + ldr x23,[x23,#32] + + mul x14,x4,x3 // a[0]*b[0] + umulh x8,x4,x3 + + mul x15,x5,x3 // a[1]*b[0] + umulh x9,x5,x3 + + mul x16,x6,x3 // a[2]*b[0] + umulh x10,x6,x3 + + mul x17,x7,x3 // a[3]*b[0] + umulh x19,x7,x3 + + mul x24,x14,x23 + + adds x15,x15,x8 // accumulate high parts of multiplication + adcs x16,x16,x9 + adcs x17,x17,x10 + adc x19,x19,xzr + mov x20,xzr + ldr x3,[x2,#8*1] // b[i] + + lsl x8,x24,#32 + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + mul x8,x4,x3 + adc x11,x11,xzr + mul x9,x5,x3 + + adds x14,x15,x10 + mul x10,x6,x3 + adcs x15,x16,x11 + mul x11,x7,x3 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts + umulh x8,x4,x3 + adcs x15,x15,x9 + umulh x9,x5,x3 + adcs x16,x16,x10 + umulh x10,x6,x3 + adcs x17,x17,x11 + umulh x11,x7,x3 + adc x19,x19,xzr + mul x24,x14,x23 + adds x15,x15,x8 // accumulate high parts + adcs x16,x16,x9 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + ldr x3,[x2,#8*2] // b[i] + + lsl x8,x24,#32 + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + mul x8,x4,x3 + adc x11,x11,xzr + mul x9,x5,x3 + + adds x14,x15,x10 + mul x10,x6,x3 + adcs x15,x16,x11 + mul x11,x7,x3 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts + umulh x8,x4,x3 + adcs x15,x15,x9 + umulh x9,x5,x3 + adcs x16,x16,x10 + umulh x10,x6,x3 + adcs x17,x17,x11 + umulh x11,x7,x3 + adc x19,x19,xzr + mul x24,x14,x23 + adds x15,x15,x8 // accumulate high 
parts + adcs x16,x16,x9 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + ldr x3,[x2,#8*3] // b[i] + + lsl x8,x24,#32 + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + mul x8,x4,x3 + adc x11,x11,xzr + mul x9,x5,x3 + + adds x14,x15,x10 + mul x10,x6,x3 + adcs x15,x16,x11 + mul x11,x7,x3 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts + umulh x8,x4,x3 + adcs x15,x15,x9 + umulh x9,x5,x3 + adcs x16,x16,x10 + umulh x10,x6,x3 + adcs x17,x17,x11 + umulh x11,x7,x3 + adc x19,x19,xzr + mul x24,x14,x23 + adds x15,x15,x8 // accumulate high parts + adcs x16,x16,x9 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + lsl x8,x24,#32 // last reduction + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + adc x11,x11,xzr + + adds x14,x15,x10 + adcs x15,x16,x11 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + subs x8,x14,x12 // ret -= modulus + sbcs x9,x15,x13 + sbcs x10,x16,x21 + sbcs x11,x17,x22 + sbcs xzr,x19,xzr + + csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldr x29,[sp],#64 + ret + + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4], +// uint64_t rep); +.globl _ecp_nistz256_ord_sqr_mont +.private_extern _ecp_nistz256_ord_sqr_mont + +.align 4 +_ecp_nistz256_ord_sqr_mont: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + adrp x23,Lord@PAGE + add x23,x23,Lord@PAGEOFF + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + + ldp x12,x13,[x23,#0] + ldp x21,x22,[x23,#16] + ldr x23,[x23,#32] + b Loop_ord_sqr + +.align 4 +Loop_ord_sqr: + sub x2,x2,#1 + //////////////////////////////////////////////////////////////// + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is , i.e. follow + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul x15,x5,x4 // a[1]*a[0] + umulh x9,x5,x4 + mul x16,x6,x4 // a[2]*a[0] + umulh x10,x6,x4 + mul x17,x7,x4 // a[3]*a[0] + umulh x19,x7,x4 + + adds x16,x16,x9 // accumulate high parts of multiplication + mul x8,x6,x5 // a[2]*a[1] + umulh x9,x6,x5 + adcs x17,x17,x10 + mul x10,x7,x5 // a[3]*a[1] + umulh x11,x7,x5 + adc x19,x19,xzr // can't overflow + + mul x20,x7,x6 // a[3]*a[2] + umulh x1,x7,x6 + + adds x9,x9,x10 // accumulate high parts of multiplication + mul x14,x4,x4 // a[0]*a[0] + adc x10,x11,xzr // can't overflow + + adds x17,x17,x8 // accumulate low parts of multiplication + umulh x4,x4,x4 + adcs x19,x19,x9 + mul x9,x5,x5 // a[1]*a[1] + adcs x20,x20,x10 + umulh x5,x5,x5 + adc x1,x1,xzr // can't overflow + + adds x15,x15,x15 // acc[1-6]*=2 + mul x10,x6,x6 // a[2]*a[2] + adcs x16,x16,x16 + umulh x6,x6,x6 + adcs x17,x17,x17 + mul x11,x7,x7 // a[3]*a[3] + adcs x19,x19,x19 + umulh x7,x7,x7 + adcs x20,x20,x20 + adcs x1,x1,x1 + adc x3,xzr,xzr + + adds x15,x15,x4 // +a[i]*a[i] + mul x24,x14,x23 + adcs x16,x16,x9 + adcs x17,x17,x5 + adcs x19,x19,x10 + adcs x20,x20,x6 + adcs x1,x1,x11 + adc x3,x3,x7 + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs 
x10,x10,x9 + adc x11,x11,xzr + + adds x14,x15,x10 + adcs x15,x16,x11 + adcs x16,x17,x24 + adc x17,xzr,x24 // can't overflow + mul x11,x14,x23 + lsl x8,x24,#32 + subs x15,x15,x24 + lsr x9,x24,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + subs xzr,x14,#1 + umulh x9,x12,x11 + mul x10,x13,x11 + umulh x24,x13,x11 + + adcs x10,x10,x9 + adc x24,x24,xzr + + adds x14,x15,x10 + adcs x15,x16,x24 + adcs x16,x17,x11 + adc x17,xzr,x11 // can't overflow + mul x24,x14,x23 + lsl x8,x11,#32 + subs x15,x15,x11 + lsr x9,x11,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + adc x11,x11,xzr + + adds x14,x15,x10 + adcs x15,x16,x11 + adcs x16,x17,x24 + adc x17,xzr,x24 // can't overflow + mul x11,x14,x23 + lsl x8,x24,#32 + subs x15,x15,x24 + lsr x9,x24,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + subs xzr,x14,#1 + umulh x9,x12,x11 + mul x10,x13,x11 + umulh x24,x13,x11 + + adcs x10,x10,x9 + adc x24,x24,xzr + + adds x14,x15,x10 + adcs x15,x16,x24 + adcs x16,x17,x11 + adc x17,xzr,x11 // can't overflow + lsl x8,x11,#32 + subs x15,x15,x11 + lsr x9,x11,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + adds x14,x14,x19 // accumulate upper half + adcs x15,x15,x20 + adcs x16,x16,x1 + adcs x17,x17,x3 + adc x19,xzr,xzr + + subs x8,x14,x12 // ret -= modulus + sbcs x9,x15,x13 + sbcs x10,x16,x21 + sbcs x11,x17,x22 + sbcs xzr,x19,xzr + + csel x4,x14,x8,lo // ret = borrow ? 
ret : ret-modulus + csel x5,x15,x9,lo + csel x6,x16,x10,lo + csel x7,x17,x11,lo + + cbnz x2,Loop_ord_sqr + + stp x4,x5,[x0] + stp x6,x7,[x0,#16] + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldr x29,[sp],#64 + ret + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index); +.globl _ecp_nistz256_select_w5 +.private_extern _ecp_nistz256_select_w5 + +.align 4 +_ecp_nistz256_select_w5: + AARCH64_VALID_CALL_TARGET + + // x10 := x0 + // w9 := 0; loop counter and incremented internal index + mov x10, x0 + mov w9, #0 + + // [v16-v21] := 0 + movi v16.16b, #0 + movi v17.16b, #0 + movi v18.16b, #0 + movi v19.16b, #0 + movi v20.16b, #0 + movi v21.16b, #0 + +Lselect_w5_loop: + // Loop 16 times. + + // Increment index (loop counter); tested at the end of the loop + add w9, w9, #1 + + // [v22-v27] := Load a (3*256-bit = 6*128-bit) table entry starting at x1 + // and advance x1 to point to the next entry + ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64 + + // x11 := (w9 == w2)? All 1s : All 0s + cmp w9, w2 + csetm x11, eq + + // continue loading ... + ld1 {v26.2d, v27.2d}, [x1],#32 + + // duplicate mask_64 into Mask (all 0s or all 1s) + dup v3.2d, x11 + + // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19] + // i.e., values in output registers will remain the same if w9 != w2 + bit v16.16b, v22.16b, v3.16b + bit v17.16b, v23.16b, v3.16b + + bit v18.16b, v24.16b, v3.16b + bit v19.16b, v25.16b, v3.16b + + bit v20.16b, v26.16b, v3.16b + bit v21.16b, v27.16b, v3.16b + + // If bit #4 is not 0 (i.e. 
idx_ctr < 16) loop back + tbz w9, #4, Lselect_w5_loop + + // Write [v16-v21] to memory at the output pointer + st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x10],#64 + st1 {v20.2d, v21.2d}, [x10] + + ret + + + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index); +.globl _ecp_nistz256_select_w7 +.private_extern _ecp_nistz256_select_w7 + +.align 4 +_ecp_nistz256_select_w7: + AARCH64_VALID_CALL_TARGET + + // w9 := 0; loop counter and incremented internal index + mov w9, #0 + + // [v16-v21] := 0 + movi v16.16b, #0 + movi v17.16b, #0 + movi v18.16b, #0 + movi v19.16b, #0 + +Lselect_w7_loop: + // Loop 64 times. + + // Increment index (loop counter); tested at the end of the loop + add w9, w9, #1 + + // [v22-v25] := Load a (2*256-bit = 4*128-bit) table entry starting at x1 + // and advance x1 to point to the next entry + ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64 + + // x11 := (w9 == w2)? All 1s : All 0s + cmp w9, w2 + csetm x11, eq + + // duplicate mask_64 into Mask (all 0s or all 1s) + dup v3.2d, x11 + + // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19] + // i.e., values in output registers will remain the same if w9 != w2 + bit v16.16b, v22.16b, v3.16b + bit v17.16b, v23.16b, v3.16b + + bit v18.16b, v24.16b, v3.16b + bit v19.16b, v25.16b, v3.16b + + // If bit #6 is not 0 (i.e. idx_ctr < 64) loop back + tbz w9, #6, Lselect_w7_loop + + // Write [v16-v19] to memory at the output pointer + st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0] + + ret + +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) diff --git a/third_party/boringssl/gen/bcm/p256-armv8-asm-linux.S b/third_party/boringssl/gen/bcm/p256-armv8-asm-linux.S new file mode 100644 index 00000000..1d63f024 --- /dev/null +++ b/third_party/boringssl/gen/bcm/p256-armv8-asm-linux.S @@ -0,0 +1,1724 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. 
Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) +.section .rodata +.align 5 +.Lpoly: +.quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001 +.LRR: // 2^512 mod P precomputed for NIST P256 polynomial +.quad 0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd +.Lone_mont: +.quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe +.Lone: +.quad 1,0,0,0 +.Lord: +.quad 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000 +.LordK: +.quad 0xccd1c8aaee00bc4f +.byte 69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.text + +// void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4], +// const BN_ULONG x2[4]); +.globl ecp_nistz256_mul_mont +.hidden ecp_nistz256_mul_mont +.type ecp_nistz256_mul_mont,%function +.align 4 +ecp_nistz256_mul_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-32]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + + ldr x3,[x2] // bp[0] + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + adrp x13,.Lpoly + add x13,x13,:lo12:.Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + bl __ecp_nistz256_mul_mont + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont + +// void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_sqr_mont +.hidden ecp_nistz256_sqr_mont +.type ecp_nistz256_sqr_mont,%function +.align 4 +ecp_nistz256_sqr_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-32]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + adrp x13,.Lpoly + add x13,x13,:lo12:.Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + bl __ecp_nistz256_sqr_mont + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont + +// void ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_div_by_2 +.hidden ecp_nistz256_div_by_2 +.type ecp_nistz256_div_by_2,%function +.align 4 +ecp_nistz256_div_by_2: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ldp x14,x15,[x1] + ldp x16,x17,[x1,#16] + adrp x13,.Lpoly + add x13,x13,:lo12:.Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + bl __ecp_nistz256_div_by_2 + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2 + +// void ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_mul_by_2 +.hidden ecp_nistz256_mul_by_2 +.type ecp_nistz256_mul_by_2,%function +.align 4 +ecp_nistz256_mul_by_2: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ldp x14,x15,[x1] + ldp x16,x17,[x1,#16] + adrp x13,.Lpoly + add x13,x13,:lo12:.Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + mov x8,x14 + mov x9,x15 + mov x10,x16 + mov x11,x17 + + bl __ecp_nistz256_add_to // ret = a+a // 2*a + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 + +// void ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_mul_by_3 +.hidden ecp_nistz256_mul_by_3 +.type ecp_nistz256_mul_by_3,%function +.align 4 +ecp_nistz256_mul_by_3: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + ldp x14,x15,[x1] + ldp x16,x17,[x1,#16] + adrp x13,.Lpoly + add x13,x13,:lo12:.Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + mov x8,x14 + mov x9,x15 + mov x10,x16 + mov x11,x17 + mov x4,x14 + mov x5,x15 + mov x6,x16 + mov x7,x17 + + bl __ecp_nistz256_add_to // ret = a+a // 2*a + + mov x8,x4 + mov x9,x5 + mov x10,x6 + mov x11,x7 + + bl __ecp_nistz256_add_to // ret += a // 2*a+a=3*a + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3 + +// void ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4], +// const BN_ULONG x2[4]); +.globl ecp_nistz256_sub +.hidden ecp_nistz256_sub +.type ecp_nistz256_sub,%function +.align 4 +ecp_nistz256_sub: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ldp x14,x15,[x1] + ldp x16,x17,[x1,#16] + adrp x13,.Lpoly + add x13,x13,:lo12:.Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + bl __ecp_nistz256_sub_from + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_sub,.-ecp_nistz256_sub + +// void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_neg +.hidden ecp_nistz256_neg +.type ecp_nistz256_neg,%function +.align 4 +ecp_nistz256_neg: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + mov x2,x1 + mov x14,xzr // a = 0 + mov x15,xzr + mov x16,xzr + mov x17,xzr + adrp x13,.Lpoly + add x13,x13,:lo12:.Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + bl __ecp_nistz256_sub_from + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_neg,.-ecp_nistz256_neg + +// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded +// to x4-x7 and b[0] - to x3 +.type __ecp_nistz256_mul_mont,%function +.align 4 +__ecp_nistz256_mul_mont: + mul x14,x4,x3 // a[0]*b[0] + umulh x8,x4,x3 + + mul x15,x5,x3 // a[1]*b[0] + umulh x9,x5,x3 + + mul x16,x6,x3 // a[2]*b[0] + umulh x10,x6,x3 + + mul x17,x7,x3 // a[3]*b[0] + umulh x11,x7,x3 + ldr x3,[x2,#8] // b[1] + + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adc x19,xzr,x11 + mov x20,xzr + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + mul x8,x4,x3 // lo(a[0]*b[i]) + adcs x15,x16,x9 + mul x9,x5,x3 // lo(a[1]*b[i]) + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + mul x10,x6,x3 // lo(a[2]*b[i]) + adcs x17,x19,x11 + mul x11,x7,x3 // lo(a[3]*b[i]) + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts of multiplication + umulh x8,x4,x3 // hi(a[0]*b[i]) + adcs x15,x15,x9 + umulh x9,x5,x3 // hi(a[1]*b[i]) + adcs x16,x16,x10 + umulh x10,x6,x3 // hi(a[2]*b[i]) + adcs x17,x17,x11 + umulh x11,x7,x3 // hi(a[3]*b[i]) + adc x19,x19,xzr + ldr x3,[x2,#8*(1+1)] // b[1+1] + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + mul x8,x4,x3 // lo(a[0]*b[i]) + adcs x15,x16,x9 + mul x9,x5,x3 // lo(a[1]*b[i]) + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + mul x10,x6,x3 // lo(a[2]*b[i]) + adcs x17,x19,x11 + mul x11,x7,x3 // lo(a[3]*b[i]) + 
adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts of multiplication + umulh x8,x4,x3 // hi(a[0]*b[i]) + adcs x15,x15,x9 + umulh x9,x5,x3 // hi(a[1]*b[i]) + adcs x16,x16,x10 + umulh x10,x6,x3 // hi(a[2]*b[i]) + adcs x17,x17,x11 + umulh x11,x7,x3 // hi(a[3]*b[i]) + adc x19,x19,xzr + ldr x3,[x2,#8*(2+1)] // b[2+1] + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + mul x8,x4,x3 // lo(a[0]*b[i]) + adcs x15,x16,x9 + mul x9,x5,x3 // lo(a[1]*b[i]) + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + mul x10,x6,x3 // lo(a[2]*b[i]) + adcs x17,x19,x11 + mul x11,x7,x3 // lo(a[3]*b[i]) + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts of multiplication + umulh x8,x4,x3 // hi(a[0]*b[i]) + adcs x15,x15,x9 + umulh x9,x5,x3 // hi(a[1]*b[i]) + adcs x16,x16,x10 + umulh x10,x6,x3 // hi(a[2]*b[i]) + adcs x17,x17,x11 + umulh x11,x7,x3 // hi(a[3]*b[i]) + adc x19,x19,xzr + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + // last reduction + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + adcs x17,x19,x11 + adc x19,x20,xzr + + adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus + sbcs x9,x15,x12 + sbcs x10,x16,xzr + sbcs x11,x17,x13 + sbcs xzr,x19,xzr // did it borrow? + + csel x14,x14,x8,lo // ret = borrow ? 
ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ret +.size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont + +// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded +// to x4-x7 +.type __ecp_nistz256_sqr_mont,%function +.align 4 +__ecp_nistz256_sqr_mont: + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is , i.e. follow + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul x15,x5,x4 // a[1]*a[0] + umulh x9,x5,x4 + mul x16,x6,x4 // a[2]*a[0] + umulh x10,x6,x4 + mul x17,x7,x4 // a[3]*a[0] + umulh x19,x7,x4 + + adds x16,x16,x9 // accumulate high parts of multiplication + mul x8,x6,x5 // a[2]*a[1] + umulh x9,x6,x5 + adcs x17,x17,x10 + mul x10,x7,x5 // a[3]*a[1] + umulh x11,x7,x5 + adc x19,x19,xzr // can't overflow + + mul x20,x7,x6 // a[3]*a[2] + umulh x1,x7,x6 + + adds x9,x9,x10 // accumulate high parts of multiplication + mul x14,x4,x4 // a[0]*a[0] + adc x10,x11,xzr // can't overflow + + adds x17,x17,x8 // accumulate low parts of multiplication + umulh x4,x4,x4 + adcs x19,x19,x9 + mul x9,x5,x5 // a[1]*a[1] + adcs x20,x20,x10 + umulh x5,x5,x5 + adc x1,x1,xzr // can't overflow + + adds x15,x15,x15 // acc[1-6]*=2 + mul x10,x6,x6 // a[2]*a[2] + adcs x16,x16,x16 + umulh x6,x6,x6 + adcs x17,x17,x17 + mul x11,x7,x7 // a[3]*a[3] + adcs x19,x19,x19 + umulh x7,x7,x7 + adcs x20,x20,x20 + adcs x1,x1,x1 + adc x2,xzr,xzr + + adds x15,x15,x4 // +a[i]*a[i] + adcs x16,x16,x9 + adcs x17,x17,x5 + adcs x19,x19,x10 + adcs x20,x20,x6 + lsl x8,x14,#32 + adcs x1,x1,x11 + lsr x9,x14,#32 + adc x2,x2,x7 + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 
and omit acc[0] + adcs x15,x16,x9 + lsl x8,x14,#32 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + lsr x9,x14,#32 + adc x17,x11,xzr // can't overflow + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + lsl x8,x14,#32 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + lsr x9,x14,#32 + adc x17,x11,xzr // can't overflow + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + lsl x8,x14,#32 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + lsr x9,x14,#32 + adc x17,x11,xzr // can't overflow + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + adc x17,x11,xzr // can't overflow + + adds x14,x14,x19 // accumulate upper half + adcs x15,x15,x20 + adcs x16,x16,x1 + adcs x17,x17,x2 + adc x19,xzr,xzr + + adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus + sbcs x9,x15,x12 + sbcs x10,x16,xzr + sbcs x11,x17,x13 + sbcs xzr,x19,xzr // did it borrow? + + csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ret +.size __ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont + +// Note that __ecp_nistz256_add_to expects both input vectors pre-loaded to +// x4-x7 and x8-x11. This is done because it's used in multiple +// contexts, e.g. in multiplication by 2 and 3... +.type __ecp_nistz256_add_to,%function +.align 4 +__ecp_nistz256_add_to: + adds x14,x14,x8 // ret = a+b + adcs x15,x15,x9 + adcs x16,x16,x10 + adcs x17,x17,x11 + adc x1,xzr,xzr // zap x1 + + adds x8,x14,#1 // subs x8,x4,#-1 // tmp = ret-modulus + sbcs x9,x15,x12 + sbcs x10,x16,xzr + sbcs x11,x17,x13 + sbcs xzr,x1,xzr // did subtraction borrow? + + csel x14,x14,x8,lo // ret = borrow ? 
ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ret +.size __ecp_nistz256_add_to,.-__ecp_nistz256_add_to + +.type __ecp_nistz256_sub_from,%function +.align 4 +__ecp_nistz256_sub_from: + ldp x8,x9,[x2] + ldp x10,x11,[x2,#16] + subs x14,x14,x8 // ret = a-b + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbcs x17,x17,x11 + sbc x1,xzr,xzr // zap x1 + + subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus + adcs x9,x15,x12 + adcs x10,x16,xzr + adc x11,x17,x13 + cmp x1,xzr // did subtraction borrow? + + csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret + csel x15,x15,x9,eq + csel x16,x16,x10,eq + stp x14,x15,[x0] + csel x17,x17,x11,eq + stp x16,x17,[x0,#16] + + ret +.size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from + +.type __ecp_nistz256_sub_morf,%function +.align 4 +__ecp_nistz256_sub_morf: + ldp x8,x9,[x2] + ldp x10,x11,[x2,#16] + subs x14,x8,x14 // ret = b-a + sbcs x15,x9,x15 + sbcs x16,x10,x16 + sbcs x17,x11,x17 + sbc x1,xzr,xzr // zap x1 + + subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus + adcs x9,x15,x12 + adcs x10,x16,xzr + adc x11,x17,x13 + cmp x1,xzr // did subtraction borrow? + + csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret + csel x15,x15,x9,eq + csel x16,x16,x10,eq + stp x14,x15,[x0] + csel x17,x17,x11,eq + stp x16,x17,[x0,#16] + + ret +.size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf + +.type __ecp_nistz256_div_by_2,%function +.align 4 +__ecp_nistz256_div_by_2: + subs x8,x14,#1 // adds x8,x4,#-1 // tmp = a+modulus + adcs x9,x15,x12 + adcs x10,x16,xzr + adcs x11,x17,x13 + adc x1,xzr,xzr // zap x1 + tst x14,#1 // is a even? + + csel x14,x14,x8,eq // ret = even ? 
a : a+modulus + csel x15,x15,x9,eq + csel x16,x16,x10,eq + csel x17,x17,x11,eq + csel x1,xzr,x1,eq + + lsr x14,x14,#1 // ret >>= 1 + orr x14,x14,x15,lsl#63 + lsr x15,x15,#1 + orr x15,x15,x16,lsl#63 + lsr x16,x16,#1 + orr x16,x16,x17,lsl#63 + lsr x17,x17,#1 + stp x14,x15,[x0] + orr x17,x17,x1,lsl#63 + stp x16,x17,[x0,#16] + + ret +.size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2 +.globl ecp_nistz256_point_double +.hidden ecp_nistz256_point_double +.type ecp_nistz256_point_double,%function +.align 5 +ecp_nistz256_point_double: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + sub sp,sp,#32*4 + +.Ldouble_shortcut: + ldp x14,x15,[x1,#32] + mov x21,x0 + ldp x16,x17,[x1,#48] + mov x22,x1 + adrp x13,.Lpoly + add x13,x13,:lo12:.Lpoly + ldr x12,[x13,#8] + mov x8,x14 + ldr x13,[x13,#24] + mov x9,x15 + ldp x4,x5,[x22,#64] // forward load for p256_sqr_mont + mov x10,x16 + mov x11,x17 + ldp x6,x7,[x22,#64+16] + add x0,sp,#0 + bl __ecp_nistz256_add_to // p256_mul_by_2(S, in_y); + + add x0,sp,#64 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z); + + ldp x8,x9,[x22] + ldp x10,x11,[x22,#16] + mov x4,x14 // put Zsqr aside for p256_sub + mov x5,x15 + mov x6,x16 + mov x7,x17 + add x0,sp,#32 + bl __ecp_nistz256_add_to // p256_add(M, Zsqr, in_x); + + add x2,x22,#0 + mov x14,x4 // restore Zsqr + mov x15,x5 + ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont + mov x16,x6 + mov x17,x7 + ldp x6,x7,[sp,#0+16] + add x0,sp,#64 + bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr); + + add x0,sp,#0 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S); + + ldr x3,[x22,#32] + ldp x4,x5,[x22,#64] + ldp x6,x7,[x22,#64+16] + add x2,x22,#32 + add x0,sp,#96 + bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y); + + mov x8,x14 + mov x9,x15 + ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont + mov x10,x16 + mov x11,x17 + ldp x6,x7,[sp,#0+16] + add x0,x21,#64 + bl __ecp_nistz256_add_to // 
p256_mul_by_2(res_z, tmp0); + + add x0,sp,#96 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S); + + ldr x3,[sp,#64] // forward load for p256_mul_mont + ldp x4,x5,[sp,#32] + ldp x6,x7,[sp,#32+16] + add x0,x21,#32 + bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0); + + add x2,sp,#64 + add x0,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr); + + mov x8,x14 // duplicate M + mov x9,x15 + mov x10,x16 + mov x11,x17 + mov x4,x14 // put M aside + mov x5,x15 + mov x6,x16 + mov x7,x17 + add x0,sp,#32 + bl __ecp_nistz256_add_to + mov x8,x4 // restore M + mov x9,x5 + ldr x3,[x22] // forward load for p256_mul_mont + mov x10,x6 + ldp x4,x5,[sp,#0] + mov x11,x7 + ldp x6,x7,[sp,#0+16] + bl __ecp_nistz256_add_to // p256_mul_by_3(M, M); + + add x2,x22,#0 + add x0,sp,#0 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x); + + mov x8,x14 + mov x9,x15 + ldp x4,x5,[sp,#32] // forward load for p256_sqr_mont + mov x10,x16 + mov x11,x17 + ldp x6,x7,[sp,#32+16] + add x0,sp,#96 + bl __ecp_nistz256_add_to // p256_mul_by_2(tmp0, S); + + add x0,x21,#0 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M); + + add x2,sp,#96 + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0); + + add x2,sp,#0 + add x0,sp,#0 + bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x); + + ldr x3,[sp,#32] + mov x4,x14 // copy S + mov x5,x15 + mov x6,x16 + mov x7,x17 + add x2,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M); + + add x2,x21,#32 + add x0,x21,#32 + bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y); + + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_point_double,.-ecp_nistz256_point_double +.globl ecp_nistz256_point_add +.hidden ecp_nistz256_point_add +.type ecp_nistz256_point_add,%function +.align 5 +ecp_nistz256_point_add: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#32*12 + + ldp x4,x5,[x2,#64] // in2_z + ldp x6,x7,[x2,#64+16] + mov x21,x0 + mov x22,x1 + mov x23,x2 + adrp x13,.Lpoly + add x13,x13,:lo12:.Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + orr x8,x4,x5 + orr x10,x6,x7 + orr x25,x8,x10 + cmp x25,#0 + csetm x25,ne // ~in2infty + add x0,sp,#192 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z); + + ldp x4,x5,[x22,#64] // in1_z + ldp x6,x7,[x22,#64+16] + orr x8,x4,x5 + orr x10,x6,x7 + orr x24,x8,x10 + cmp x24,#0 + csetm x24,ne // ~in1infty + add x0,sp,#128 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); + + ldr x3,[x23,#64] + ldp x4,x5,[sp,#192] + ldp x6,x7,[sp,#192+16] + add x2,x23,#64 + add x0,sp,#320 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, Z2sqr, in2_z); + + ldr x3,[x22,#64] + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x2,x22,#64 + add x0,sp,#352 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); + + ldr x3,[x22,#32] + ldp x4,x5,[sp,#320] + ldp x6,x7,[sp,#320+16] + add x2,x22,#32 + add x0,sp,#320 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, S1, in1_y); + + ldr x3,[x23,#32] + ldp x4,x5,[sp,#352] + ldp x6,x7,[sp,#352+16] + add x2,x23,#32 + add x0,sp,#352 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); + + add x2,sp,#320 + ldr x3,[sp,#192] // forward load for p256_mul_mont + ldp x4,x5,[x22] + ldp x6,x7,[x22,#16] + add x0,sp,#160 + bl __ecp_nistz256_sub_from // p256_sub(R, S2, S1); + + orr x14,x14,x15 // see if result is zero + orr x16,x16,x17 + orr x26,x14,x16 // ~is_equal(S1,S2) + + add x2,sp,#192 + add x0,sp,#256 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U1, in1_x, Z2sqr); + + ldr x3,[sp,#128] + ldp x4,x5,[x23] + ldp x6,x7,[x23,#16] + add x2,sp,#128 + add x0,sp,#288 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in2_x, Z1sqr); + + add x2,sp,#256 + ldp x4,x5,[sp,#160] // forward load for 
p256_sqr_mont + ldp x6,x7,[sp,#160+16] + add x0,sp,#96 + bl __ecp_nistz256_sub_from // p256_sub(H, U2, U1); + + orr x14,x14,x15 // see if result is zero + orr x16,x16,x17 + orr x14,x14,x16 // ~is_equal(U1,U2) + + mvn x27,x24 // -1/0 -> 0/-1 + mvn x28,x25 // -1/0 -> 0/-1 + orr x14,x14,x27 + orr x14,x14,x28 + orr x14,x14,x26 + cbnz x14,.Ladd_proceed // if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2)) + +.Ladd_double: + mov x1,x22 + mov x0,x21 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + add sp,sp,#256 // #256 is from #32*(12-4). difference in stack frames + b .Ldouble_shortcut + +.align 4 +.Ladd_proceed: + add x0,sp,#192 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); + + ldr x3,[x22,#64] + ldp x4,x5,[sp,#96] + ldp x6,x7,[sp,#96+16] + add x2,x22,#64 + add x0,sp,#64 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); + + ldp x4,x5,[sp,#96] + ldp x6,x7,[sp,#96+16] + add x0,sp,#128 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); + + ldr x3,[x23,#64] + ldp x4,x5,[sp,#64] + ldp x6,x7,[sp,#64+16] + add x2,x23,#64 + add x0,sp,#64 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, res_z, in2_z); + + ldr x3,[sp,#96] + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x2,sp,#96 + add x0,sp,#224 + bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); + + ldr x3,[sp,#128] + ldp x4,x5,[sp,#256] + ldp x6,x7,[sp,#256+16] + add x2,sp,#128 + add x0,sp,#288 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, U1, Hsqr); + + mov x8,x14 + mov x9,x15 + mov x10,x16 + mov x11,x17 + add x0,sp,#128 + bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2); + + add x2,sp,#192 + add x0,sp,#0 + bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); + + add x2,sp,#224 + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); + + add x2,sp,#288 + ldr x3,[sp,#224] // forward load for p256_mul_mont + ldp x4,x5,[sp,#320] + ldp x6,x7,[sp,#320+16] + add x0,sp,#32 + bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); 
+ + add x2,sp,#224 + add x0,sp,#352 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S1, Hcub); + + ldr x3,[sp,#160] + ldp x4,x5,[sp,#32] + ldp x6,x7,[sp,#32+16] + add x2,sp,#160 + add x0,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); + + add x2,sp,#352 + bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); + + ldp x4,x5,[sp,#0] // res + ldp x6,x7,[sp,#0+16] + ldp x8,x9,[x23] // in2 + ldp x10,x11,[x23,#16] + ldp x14,x15,[x22,#0] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#0+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+0+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + ldp x6,x7,[sp,#0+0+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#0+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#0+48] + stp x14,x15,[x21,#0] + stp x16,x17,[x21,#0+16] + ldp x14,x15,[x22,#32] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#32+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+32+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + ldp x6,x7,[sp,#0+32+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#32+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#32+48] + stp x14,x15,[x21,#32] + stp x16,x17,[x21,#32+16] + ldp x14,x15,[x22,#64] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#64+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? 
+ csel x14,x8,x14,ne + csel x15,x9,x15,ne + csel x16,x10,x16,ne + csel x17,x11,x17,ne + stp x14,x15,[x21,#64] + stp x16,x17,[x21,#64+16] + +.Ladd_done: + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_point_add,.-ecp_nistz256_point_add +.globl ecp_nistz256_point_add_affine +.hidden ecp_nistz256_point_add_affine +.type ecp_nistz256_point_add_affine,%function +.align 5 +ecp_nistz256_point_add_affine: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-80]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + sub sp,sp,#32*10 + + mov x21,x0 + mov x22,x1 + mov x23,x2 + adrp x13,.Lpoly + add x13,x13,:lo12:.Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + ldp x4,x5,[x1,#64] // in1_z + ldp x6,x7,[x1,#64+16] + orr x8,x4,x5 + orr x10,x6,x7 + orr x24,x8,x10 + cmp x24,#0 + csetm x24,ne // ~in1infty + + ldp x14,x15,[x2] // in2_x + ldp x16,x17,[x2,#16] + ldp x8,x9,[x2,#32] // in2_y + ldp x10,x11,[x2,#48] + orr x14,x14,x15 + orr x16,x16,x17 + orr x8,x8,x9 + orr x10,x10,x11 + orr x14,x14,x16 + orr x8,x8,x10 + orr x25,x14,x8 + cmp x25,#0 + csetm x25,ne // ~in2infty + + add x0,sp,#128 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); + + mov x4,x14 + mov x5,x15 + mov x6,x16 + mov x7,x17 + ldr x3,[x23] + add x2,x23,#0 + add x0,sp,#96 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, Z1sqr, in2_x); + + add x2,x22,#0 + ldr x3,[x22,#64] // forward load for p256_mul_mont + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x0,sp,#160 + bl __ecp_nistz256_sub_from // p256_sub(H, U2, in1_x); + + add x2,x22,#64 + add x0,sp,#128 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); + + ldr x3,[x22,#64] + ldp x4,x5,[sp,#160] + ldp x6,x7,[sp,#160+16] + add x2,x22,#64 + add x0,sp,#64 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, 
H, in1_z); + + ldr x3,[x23,#32] + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x2,x23,#32 + add x0,sp,#128 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); + + add x2,x22,#32 + ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont + ldp x6,x7,[sp,#160+16] + add x0,sp,#192 + bl __ecp_nistz256_sub_from // p256_sub(R, S2, in1_y); + + add x0,sp,#224 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); + + ldp x4,x5,[sp,#192] + ldp x6,x7,[sp,#192+16] + add x0,sp,#288 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); + + ldr x3,[sp,#160] + ldp x4,x5,[sp,#224] + ldp x6,x7,[sp,#224+16] + add x2,sp,#160 + add x0,sp,#256 + bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); + + ldr x3,[x22] + ldp x4,x5,[sp,#224] + ldp x6,x7,[sp,#224+16] + add x2,x22,#0 + add x0,sp,#96 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in1_x, Hsqr); + + mov x8,x14 + mov x9,x15 + mov x10,x16 + mov x11,x17 + add x0,sp,#224 + bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2); + + add x2,sp,#288 + add x0,sp,#0 + bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); + + add x2,sp,#256 + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); + + add x2,sp,#96 + ldr x3,[x22,#32] // forward load for p256_mul_mont + ldp x4,x5,[sp,#256] + ldp x6,x7,[sp,#256+16] + add x0,sp,#32 + bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); + + add x2,x22,#32 + add x0,sp,#128 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, in1_y, Hcub); + + ldr x3,[sp,#192] + ldp x4,x5,[sp,#32] + ldp x6,x7,[sp,#32+16] + add x2,sp,#192 + add x0,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); + + add x2,sp,#128 + bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); + + ldp x4,x5,[sp,#0] // res + ldp x6,x7,[sp,#0+16] + ldp x8,x9,[x23] // in2 + ldp x10,x11,[x23,#16] + ldp x14,x15,[x22,#0] // in1 + cmp x24,#0 // ~, remember? 
+ ldp x16,x17,[x22,#0+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+0+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + ldp x6,x7,[sp,#0+0+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#0+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#0+48] + stp x14,x15,[x21,#0] + stp x16,x17,[x21,#0+16] + adrp x23,.Lone_mont-64 + add x23,x23,:lo12:.Lone_mont-64 + ldp x14,x15,[x22,#32] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#32+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+32+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + ldp x6,x7,[sp,#0+32+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#32+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#32+48] + stp x14,x15,[x21,#32] + stp x16,x17,[x21,#32+16] + ldp x14,x15,[x22,#64] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#64+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + csel x14,x8,x14,ne + csel x15,x9,x15,ne + csel x16,x10,x16,ne + csel x17,x11,x17,ne + stp x14,x15,[x21,#64] + stp x16,x17,[x21,#64+16] + + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x29,x30,[sp],#80 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4], +// uint64_t b[4]); +.globl ecp_nistz256_ord_mul_mont +.hidden ecp_nistz256_ord_mul_mont +.type ecp_nistz256_ord_mul_mont,%function +.align 4 +ecp_nistz256_ord_mul_mont: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + adrp x23,.Lord + add x23,x23,:lo12:.Lord + ldr x3,[x2] // bp[0] + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + + ldp x12,x13,[x23,#0] + ldp x21,x22,[x23,#16] + ldr x23,[x23,#32] + + mul x14,x4,x3 // a[0]*b[0] + umulh x8,x4,x3 + + mul x15,x5,x3 // a[1]*b[0] + umulh x9,x5,x3 + + mul x16,x6,x3 // a[2]*b[0] + umulh x10,x6,x3 + + mul x17,x7,x3 // a[3]*b[0] + umulh x19,x7,x3 + + mul x24,x14,x23 + + adds x15,x15,x8 // accumulate high parts of multiplication + adcs x16,x16,x9 + adcs x17,x17,x10 + adc x19,x19,xzr + mov x20,xzr + ldr x3,[x2,#8*1] // b[i] + + lsl x8,x24,#32 + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + mul x8,x4,x3 + adc x11,x11,xzr + mul x9,x5,x3 + + adds x14,x15,x10 + mul x10,x6,x3 + adcs x15,x16,x11 + mul x11,x7,x3 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts + umulh x8,x4,x3 + adcs x15,x15,x9 + umulh x9,x5,x3 + adcs x16,x16,x10 + umulh x10,x6,x3 + adcs x17,x17,x11 + umulh x11,x7,x3 + adc x19,x19,xzr + mul x24,x14,x23 + adds x15,x15,x8 // accumulate high parts + adcs x16,x16,x9 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + ldr x3,[x2,#8*2] // b[i] + + lsl x8,x24,#32 + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + mul x8,x4,x3 + adc x11,x11,xzr + mul x9,x5,x3 + + adds x14,x15,x10 + mul x10,x6,x3 + adcs x15,x16,x11 + mul x11,x7,x3 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts + umulh x8,x4,x3 + adcs x15,x15,x9 + umulh x9,x5,x3 + adcs x16,x16,x10 + umulh x10,x6,x3 + adcs x17,x17,x11 + umulh x11,x7,x3 + adc x19,x19,xzr + mul x24,x14,x23 + adds x15,x15,x8 // accumulate high parts + 
adcs x16,x16,x9 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + ldr x3,[x2,#8*3] // b[i] + + lsl x8,x24,#32 + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + mul x8,x4,x3 + adc x11,x11,xzr + mul x9,x5,x3 + + adds x14,x15,x10 + mul x10,x6,x3 + adcs x15,x16,x11 + mul x11,x7,x3 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts + umulh x8,x4,x3 + adcs x15,x15,x9 + umulh x9,x5,x3 + adcs x16,x16,x10 + umulh x10,x6,x3 + adcs x17,x17,x11 + umulh x11,x7,x3 + adc x19,x19,xzr + mul x24,x14,x23 + adds x15,x15,x8 // accumulate high parts + adcs x16,x16,x9 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + lsl x8,x24,#32 // last reduction + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + adc x11,x11,xzr + + adds x14,x15,x10 + adcs x15,x16,x11 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + subs x8,x14,x12 // ret -= modulus + sbcs x9,x15,x13 + sbcs x10,x16,x21 + sbcs x11,x17,x22 + sbcs xzr,x19,xzr + + csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldr x29,[sp],#64 + ret +.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4], +// uint64_t rep); +.globl ecp_nistz256_ord_sqr_mont +.hidden ecp_nistz256_ord_sqr_mont +.type ecp_nistz256_ord_sqr_mont,%function +.align 4 +ecp_nistz256_ord_sqr_mont: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. 
+ stp x29,x30,[sp,#-64]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + adrp x23,.Lord + add x23,x23,:lo12:.Lord + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + + ldp x12,x13,[x23,#0] + ldp x21,x22,[x23,#16] + ldr x23,[x23,#32] + b .Loop_ord_sqr + +.align 4 +.Loop_ord_sqr: + sub x2,x2,#1 + //////////////////////////////////////////////////////////////// + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is , i.e. follow + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul x15,x5,x4 // a[1]*a[0] + umulh x9,x5,x4 + mul x16,x6,x4 // a[2]*a[0] + umulh x10,x6,x4 + mul x17,x7,x4 // a[3]*a[0] + umulh x19,x7,x4 + + adds x16,x16,x9 // accumulate high parts of multiplication + mul x8,x6,x5 // a[2]*a[1] + umulh x9,x6,x5 + adcs x17,x17,x10 + mul x10,x7,x5 // a[3]*a[1] + umulh x11,x7,x5 + adc x19,x19,xzr // can't overflow + + mul x20,x7,x6 // a[3]*a[2] + umulh x1,x7,x6 + + adds x9,x9,x10 // accumulate high parts of multiplication + mul x14,x4,x4 // a[0]*a[0] + adc x10,x11,xzr // can't overflow + + adds x17,x17,x8 // accumulate low parts of multiplication + umulh x4,x4,x4 + adcs x19,x19,x9 + mul x9,x5,x5 // a[1]*a[1] + adcs x20,x20,x10 + umulh x5,x5,x5 + adc x1,x1,xzr // can't overflow + + adds x15,x15,x15 // acc[1-6]*=2 + mul x10,x6,x6 // a[2]*a[2] + adcs x16,x16,x16 + umulh x6,x6,x6 + adcs x17,x17,x17 + mul x11,x7,x7 // a[3]*a[3] + adcs x19,x19,x19 + umulh x7,x7,x7 + adcs x20,x20,x20 + adcs x1,x1,x1 + adc x3,xzr,xzr + + adds x15,x15,x4 // +a[i]*a[i] + mul x24,x14,x23 + adcs x16,x16,x9 + adcs x17,x17,x5 + adcs x19,x19,x10 + adcs x20,x20,x6 + adcs x1,x1,x11 + adc x3,x3,x7 + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh 
x11,x13,x24 + + adcs x10,x10,x9 + adc x11,x11,xzr + + adds x14,x15,x10 + adcs x15,x16,x11 + adcs x16,x17,x24 + adc x17,xzr,x24 // can't overflow + mul x11,x14,x23 + lsl x8,x24,#32 + subs x15,x15,x24 + lsr x9,x24,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + subs xzr,x14,#1 + umulh x9,x12,x11 + mul x10,x13,x11 + umulh x24,x13,x11 + + adcs x10,x10,x9 + adc x24,x24,xzr + + adds x14,x15,x10 + adcs x15,x16,x24 + adcs x16,x17,x11 + adc x17,xzr,x11 // can't overflow + mul x24,x14,x23 + lsl x8,x11,#32 + subs x15,x15,x11 + lsr x9,x11,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + adc x11,x11,xzr + + adds x14,x15,x10 + adcs x15,x16,x11 + adcs x16,x17,x24 + adc x17,xzr,x24 // can't overflow + mul x11,x14,x23 + lsl x8,x24,#32 + subs x15,x15,x24 + lsr x9,x24,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + subs xzr,x14,#1 + umulh x9,x12,x11 + mul x10,x13,x11 + umulh x24,x13,x11 + + adcs x10,x10,x9 + adc x24,x24,xzr + + adds x14,x15,x10 + adcs x15,x16,x24 + adcs x16,x17,x11 + adc x17,xzr,x11 // can't overflow + lsl x8,x11,#32 + subs x15,x15,x11 + lsr x9,x11,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + adds x14,x14,x19 // accumulate upper half + adcs x15,x15,x20 + adcs x16,x16,x1 + adcs x17,x17,x3 + adc x19,xzr,xzr + + subs x8,x14,x12 // ret -= modulus + sbcs x9,x15,x13 + sbcs x10,x16,x21 + sbcs x11,x17,x22 + sbcs xzr,x19,xzr + + csel x4,x14,x8,lo // ret = borrow ? 
ret : ret-modulus + csel x5,x15,x9,lo + csel x6,x16,x10,lo + csel x7,x17,x11,lo + + cbnz x2,.Loop_ord_sqr + + stp x4,x5,[x0] + stp x6,x7,[x0,#16] + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldr x29,[sp],#64 + ret +.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index); +.globl ecp_nistz256_select_w5 +.hidden ecp_nistz256_select_w5 +.type ecp_nistz256_select_w5,%function +.align 4 +ecp_nistz256_select_w5: + AARCH64_VALID_CALL_TARGET + + // x10 := x0 + // w9 := 0; loop counter and incremented internal index + mov x10, x0 + mov w9, #0 + + // [v16-v21] := 0 + movi v16.16b, #0 + movi v17.16b, #0 + movi v18.16b, #0 + movi v19.16b, #0 + movi v20.16b, #0 + movi v21.16b, #0 + +.Lselect_w5_loop: + // Loop 16 times. + + // Increment index (loop counter); tested at the end of the loop + add w9, w9, #1 + + // [v22-v27] := Load a (3*256-bit = 6*128-bit) table entry starting at x1 + // and advance x1 to point to the next entry + ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64 + + // x11 := (w9 == w2)? All 1s : All 0s + cmp w9, w2 + csetm x11, eq + + // continue loading ... + ld1 {v26.2d, v27.2d}, [x1],#32 + + // duplicate mask_64 into Mask (all 0s or all 1s) + dup v3.2d, x11 + + // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19] + // i.e., values in output registers will remain the same if w9 != w2 + bit v16.16b, v22.16b, v3.16b + bit v17.16b, v23.16b, v3.16b + + bit v18.16b, v24.16b, v3.16b + bit v19.16b, v25.16b, v3.16b + + bit v20.16b, v26.16b, v3.16b + bit v21.16b, v27.16b, v3.16b + + // If bit #4 is not 0 (i.e. 
idx_ctr < 16) loop back + tbz w9, #4, .Lselect_w5_loop + + // Write [v16-v21] to memory at the output pointer + st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x10],#64 + st1 {v20.2d, v21.2d}, [x10] + + ret +.size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5 + + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index); +.globl ecp_nistz256_select_w7 +.hidden ecp_nistz256_select_w7 +.type ecp_nistz256_select_w7,%function +.align 4 +ecp_nistz256_select_w7: + AARCH64_VALID_CALL_TARGET + + // w9 := 0; loop counter and incremented internal index + mov w9, #0 + + // [v16-v21] := 0 + movi v16.16b, #0 + movi v17.16b, #0 + movi v18.16b, #0 + movi v19.16b, #0 + +.Lselect_w7_loop: + // Loop 64 times. + + // Increment index (loop counter); tested at the end of the loop + add w9, w9, #1 + + // [v22-v25] := Load a (2*256-bit = 4*128-bit) table entry starting at x1 + // and advance x1 to point to the next entry + ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64 + + // x11 := (w9 == w2)? All 1s : All 0s + cmp w9, w2 + csetm x11, eq + + // duplicate mask_64 into Mask (all 0s or all 1s) + dup v3.2d, x11 + + // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19] + // i.e., values in output registers will remain the same if w9 != w2 + bit v16.16b, v22.16b, v3.16b + bit v17.16b, v23.16b, v3.16b + + bit v18.16b, v24.16b, v3.16b + bit v19.16b, v25.16b, v3.16b + + // If bit #6 is not 0 (i.e. 
idx_ctr < 64) loop back + tbz w9, #6, .Lselect_w7_loop + + // Write [v16-v19] to memory at the output pointer + st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0] + + ret +.size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) diff --git a/third_party/boringssl/gen/bcm/p256-armv8-asm-win.S b/third_party/boringssl/gen/bcm/p256-armv8-asm-win.S new file mode 100644 index 00000000..f15b8ab0 --- /dev/null +++ b/third_party/boringssl/gen/bcm/p256-armv8-asm-win.S @@ -0,0 +1,1764 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +.section .rodata +.align 5 +Lpoly: +.quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001 +LRR: // 2^512 mod P precomputed for NIST P256 polynomial +.quad 0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd +Lone_mont: +.quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe +Lone: +.quad 1,0,0,0 +Lord: +.quad 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000 +LordK: +.quad 0xccd1c8aaee00bc4f +.byte 69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.text + +// void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4], +// const BN_ULONG x2[4]); +.globl ecp_nistz256_mul_mont + +.def ecp_nistz256_mul_mont + .type 32 +.endef +.align 4 +ecp_nistz256_mul_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-32]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + + ldr x3,[x2] // bp[0] + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + adrp x13,Lpoly + add x13,x13,:lo12:Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + bl __ecp_nistz256_mul_mont + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_sqr_mont + +.def ecp_nistz256_sqr_mont + .type 32 +.endef +.align 4 +ecp_nistz256_sqr_mont: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-32]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + adrp x13,Lpoly + add x13,x13,:lo12:Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + bl __ecp_nistz256_sqr_mont + + ldp x19,x20,[sp,#16] + ldp x29,x30,[sp],#32 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// void ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_div_by_2 + +.def ecp_nistz256_div_by_2 + .type 32 +.endef +.align 4 +ecp_nistz256_div_by_2: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ldp x14,x15,[x1] + ldp x16,x17,[x1,#16] + adrp x13,Lpoly + add x13,x13,:lo12:Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + bl __ecp_nistz256_div_by_2 + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// void ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_mul_by_2 + +.def ecp_nistz256_mul_by_2 + .type 32 +.endef +.align 4 +ecp_nistz256_mul_by_2: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + ldp x14,x15,[x1] + ldp x16,x17,[x1,#16] + adrp x13,Lpoly + add x13,x13,:lo12:Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + mov x8,x14 + mov x9,x15 + mov x10,x16 + mov x11,x17 + + bl __ecp_nistz256_add_to // ret = a+a // 2*a + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// void ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_mul_by_3 + +.def ecp_nistz256_mul_by_3 + .type 32 +.endef +.align 4 +ecp_nistz256_mul_by_3: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ldp x14,x15,[x1] + ldp x16,x17,[x1,#16] + adrp x13,Lpoly + add x13,x13,:lo12:Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + mov x8,x14 + mov x9,x15 + mov x10,x16 + mov x11,x17 + mov x4,x14 + mov x5,x15 + mov x6,x16 + mov x7,x17 + + bl __ecp_nistz256_add_to // ret = a+a // 2*a + + mov x8,x4 + mov x9,x5 + mov x10,x6 + mov x11,x7 + + bl __ecp_nistz256_add_to // ret += a // 2*a+a=3*a + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// void ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4], +// const BN_ULONG x2[4]); +.globl ecp_nistz256_sub + +.def ecp_nistz256_sub + .type 32 +.endef +.align 4 +ecp_nistz256_sub: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ldp x14,x15,[x1] + ldp x16,x17,[x1,#16] + adrp x13,Lpoly + add x13,x13,:lo12:Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + bl __ecp_nistz256_sub_from + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]); +.globl ecp_nistz256_neg + +.def ecp_nistz256_neg + .type 32 +.endef +.align 4 +ecp_nistz256_neg: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + mov x2,x1 + mov x14,xzr // a = 0 + mov x15,xzr + mov x16,xzr + mov x17,xzr + adrp x13,Lpoly + add x13,x13,:lo12:Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + bl __ecp_nistz256_sub_from + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded +// to x4-x7 and b[0] - to x3 +.def __ecp_nistz256_mul_mont + .type 32 +.endef +.align 4 +__ecp_nistz256_mul_mont: + mul x14,x4,x3 // a[0]*b[0] + umulh x8,x4,x3 + + mul x15,x5,x3 // a[1]*b[0] + umulh x9,x5,x3 + + mul x16,x6,x3 // a[2]*b[0] + umulh x10,x6,x3 + + mul x17,x7,x3 // a[3]*b[0] + umulh x11,x7,x3 + ldr x3,[x2,#8] // b[1] + + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adc x19,xzr,x11 + mov x20,xzr + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + mul x8,x4,x3 // lo(a[0]*b[i]) + adcs x15,x16,x9 + mul x9,x5,x3 // lo(a[1]*b[i]) + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + mul x10,x6,x3 // lo(a[2]*b[i]) + adcs x17,x19,x11 + mul x11,x7,x3 // lo(a[3]*b[i]) + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts of multiplication + umulh x8,x4,x3 // hi(a[0]*b[i]) + adcs x15,x15,x9 + umulh x9,x5,x3 // hi(a[1]*b[i]) + adcs x16,x16,x10 + umulh x10,x6,x3 // hi(a[2]*b[i]) + adcs x17,x17,x11 + umulh x11,x7,x3 // hi(a[3]*b[i]) + adc x19,x19,xzr + ldr x3,[x2,#8*(1+1)] // b[1+1] + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + mul x8,x4,x3 // lo(a[0]*b[i]) + adcs x15,x16,x9 + mul x9,x5,x3 // lo(a[1]*b[i]) + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + mul x10,x6,x3 // lo(a[2]*b[i]) + adcs x17,x19,x11 + mul x11,x7,x3 // lo(a[3]*b[i]) + adc x19,x20,xzr + + adds x14,x14,x8 
// accumulate low parts of multiplication + umulh x8,x4,x3 // hi(a[0]*b[i]) + adcs x15,x15,x9 + umulh x9,x5,x3 // hi(a[1]*b[i]) + adcs x16,x16,x10 + umulh x10,x6,x3 // hi(a[2]*b[i]) + adcs x17,x17,x11 + umulh x11,x7,x3 // hi(a[3]*b[i]) + adc x19,x19,xzr + ldr x3,[x2,#8*(2+1)] // b[2+1] + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + mul x8,x4,x3 // lo(a[0]*b[i]) + adcs x15,x16,x9 + mul x9,x5,x3 // lo(a[1]*b[i]) + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + mul x10,x6,x3 // lo(a[2]*b[i]) + adcs x17,x19,x11 + mul x11,x7,x3 // lo(a[3]*b[i]) + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts of multiplication + umulh x8,x4,x3 // hi(a[0]*b[i]) + adcs x15,x15,x9 + umulh x9,x5,x3 // hi(a[1]*b[i]) + adcs x16,x16,x10 + umulh x10,x6,x3 // hi(a[2]*b[i]) + adcs x17,x17,x11 + umulh x11,x7,x3 // hi(a[3]*b[i]) + adc x19,x19,xzr + adds x15,x15,x8 // accumulate high parts of multiplication + lsl x8,x14,#32 + adcs x16,x16,x9 + lsr x9,x14,#32 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + // last reduction + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + adcs x17,x19,x11 + adc x19,x20,xzr + + adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus + sbcs x9,x15,x12 + sbcs x10,x16,xzr + sbcs x11,x17,x13 + sbcs xzr,x19,xzr // did it borrow? + + csel x14,x14,x8,lo // ret = borrow ? 
ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ret + + +// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded +// to x4-x7 +.def __ecp_nistz256_sqr_mont + .type 32 +.endef +.align 4 +__ecp_nistz256_sqr_mont: + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is , i.e. follow + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul x15,x5,x4 // a[1]*a[0] + umulh x9,x5,x4 + mul x16,x6,x4 // a[2]*a[0] + umulh x10,x6,x4 + mul x17,x7,x4 // a[3]*a[0] + umulh x19,x7,x4 + + adds x16,x16,x9 // accumulate high parts of multiplication + mul x8,x6,x5 // a[2]*a[1] + umulh x9,x6,x5 + adcs x17,x17,x10 + mul x10,x7,x5 // a[3]*a[1] + umulh x11,x7,x5 + adc x19,x19,xzr // can't overflow + + mul x20,x7,x6 // a[3]*a[2] + umulh x1,x7,x6 + + adds x9,x9,x10 // accumulate high parts of multiplication + mul x14,x4,x4 // a[0]*a[0] + adc x10,x11,xzr // can't overflow + + adds x17,x17,x8 // accumulate low parts of multiplication + umulh x4,x4,x4 + adcs x19,x19,x9 + mul x9,x5,x5 // a[1]*a[1] + adcs x20,x20,x10 + umulh x5,x5,x5 + adc x1,x1,xzr // can't overflow + + adds x15,x15,x15 // acc[1-6]*=2 + mul x10,x6,x6 // a[2]*a[2] + adcs x16,x16,x16 + umulh x6,x6,x6 + adcs x17,x17,x17 + mul x11,x7,x7 // a[3]*a[3] + adcs x19,x19,x19 + umulh x7,x7,x7 + adcs x20,x20,x20 + adcs x1,x1,x1 + adc x2,xzr,xzr + + adds x15,x15,x4 // +a[i]*a[i] + adcs x16,x16,x9 + adcs x17,x17,x5 + adcs x19,x19,x10 + adcs x20,x20,x6 + lsl x8,x14,#32 + adcs x1,x1,x11 + lsr x9,x14,#32 + adc x2,x2,x7 + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + lsl 
x8,x14,#32 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + lsr x9,x14,#32 + adc x17,x11,xzr // can't overflow + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + lsl x8,x14,#32 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + lsr x9,x14,#32 + adc x17,x11,xzr // can't overflow + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + lsl x8,x14,#32 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + lsr x9,x14,#32 + adc x17,x11,xzr // can't overflow + subs x10,x14,x8 // "*0xffff0001" + sbc x11,x14,x9 + adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] + adcs x15,x16,x9 + adcs x16,x17,x10 // +=acc[0]*0xffff0001 + adc x17,x11,xzr // can't overflow + + adds x14,x14,x19 // accumulate upper half + adcs x15,x15,x20 + adcs x16,x16,x1 + adcs x17,x17,x2 + adc x19,xzr,xzr + + adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus + sbcs x9,x15,x12 + sbcs x10,x16,xzr + sbcs x11,x17,x13 + sbcs xzr,x19,xzr // did it borrow? + + csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ret + + +// Note that __ecp_nistz256_add_to expects both input vectors pre-loaded to +// x4-x7 and x8-x11. This is done because it's used in multiple +// contexts, e.g. in multiplication by 2 and 3... +.def __ecp_nistz256_add_to + .type 32 +.endef +.align 4 +__ecp_nistz256_add_to: + adds x14,x14,x8 // ret = a+b + adcs x15,x15,x9 + adcs x16,x16,x10 + adcs x17,x17,x11 + adc x1,xzr,xzr // zap x1 + + adds x8,x14,#1 // subs x8,x4,#-1 // tmp = ret-modulus + sbcs x9,x15,x12 + sbcs x10,x16,xzr + sbcs x11,x17,x13 + sbcs xzr,x1,xzr // did subtraction borrow? + + csel x14,x14,x8,lo // ret = borrow ? 
ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ret + + +.def __ecp_nistz256_sub_from + .type 32 +.endef +.align 4 +__ecp_nistz256_sub_from: + ldp x8,x9,[x2] + ldp x10,x11,[x2,#16] + subs x14,x14,x8 // ret = a-b + sbcs x15,x15,x9 + sbcs x16,x16,x10 + sbcs x17,x17,x11 + sbc x1,xzr,xzr // zap x1 + + subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus + adcs x9,x15,x12 + adcs x10,x16,xzr + adc x11,x17,x13 + cmp x1,xzr // did subtraction borrow? + + csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret + csel x15,x15,x9,eq + csel x16,x16,x10,eq + stp x14,x15,[x0] + csel x17,x17,x11,eq + stp x16,x17,[x0,#16] + + ret + + +.def __ecp_nistz256_sub_morf + .type 32 +.endef +.align 4 +__ecp_nistz256_sub_morf: + ldp x8,x9,[x2] + ldp x10,x11,[x2,#16] + subs x14,x8,x14 // ret = b-a + sbcs x15,x9,x15 + sbcs x16,x10,x16 + sbcs x17,x11,x17 + sbc x1,xzr,xzr // zap x1 + + subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus + adcs x9,x15,x12 + adcs x10,x16,xzr + adc x11,x17,x13 + cmp x1,xzr // did subtraction borrow? + + csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret + csel x15,x15,x9,eq + csel x16,x16,x10,eq + stp x14,x15,[x0] + csel x17,x17,x11,eq + stp x16,x17,[x0,#16] + + ret + + +.def __ecp_nistz256_div_by_2 + .type 32 +.endef +.align 4 +__ecp_nistz256_div_by_2: + subs x8,x14,#1 // adds x8,x4,#-1 // tmp = a+modulus + adcs x9,x15,x12 + adcs x10,x16,xzr + adcs x11,x17,x13 + adc x1,xzr,xzr // zap x1 + tst x14,#1 // is a even? + + csel x14,x14,x8,eq // ret = even ? 
a : a+modulus + csel x15,x15,x9,eq + csel x16,x16,x10,eq + csel x17,x17,x11,eq + csel x1,xzr,x1,eq + + lsr x14,x14,#1 // ret >>= 1 + orr x14,x14,x15,lsl#63 + lsr x15,x15,#1 + orr x15,x15,x16,lsl#63 + lsr x16,x16,#1 + orr x16,x16,x17,lsl#63 + lsr x17,x17,#1 + stp x14,x15,[x0] + orr x17,x17,x1,lsl#63 + stp x16,x17,[x0,#16] + + ret + +.globl ecp_nistz256_point_double + +.def ecp_nistz256_point_double + .type 32 +.endef +.align 5 +ecp_nistz256_point_double: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + sub sp,sp,#32*4 + +Ldouble_shortcut: + ldp x14,x15,[x1,#32] + mov x21,x0 + ldp x16,x17,[x1,#48] + mov x22,x1 + adrp x13,Lpoly + add x13,x13,:lo12:Lpoly + ldr x12,[x13,#8] + mov x8,x14 + ldr x13,[x13,#24] + mov x9,x15 + ldp x4,x5,[x22,#64] // forward load for p256_sqr_mont + mov x10,x16 + mov x11,x17 + ldp x6,x7,[x22,#64+16] + add x0,sp,#0 + bl __ecp_nistz256_add_to // p256_mul_by_2(S, in_y); + + add x0,sp,#64 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z); + + ldp x8,x9,[x22] + ldp x10,x11,[x22,#16] + mov x4,x14 // put Zsqr aside for p256_sub + mov x5,x15 + mov x6,x16 + mov x7,x17 + add x0,sp,#32 + bl __ecp_nistz256_add_to // p256_add(M, Zsqr, in_x); + + add x2,x22,#0 + mov x14,x4 // restore Zsqr + mov x15,x5 + ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont + mov x16,x6 + mov x17,x7 + ldp x6,x7,[sp,#0+16] + add x0,sp,#64 + bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr); + + add x0,sp,#0 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S); + + ldr x3,[x22,#32] + ldp x4,x5,[x22,#64] + ldp x6,x7,[x22,#64+16] + add x2,x22,#32 + add x0,sp,#96 + bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y); + + mov x8,x14 + mov x9,x15 + ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont + mov x10,x16 + mov x11,x17 + ldp x6,x7,[sp,#0+16] + add x0,x21,#64 + bl __ecp_nistz256_add_to // p256_mul_by_2(res_z, tmp0); + + add x0,sp,#96 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, 
S); + + ldr x3,[sp,#64] // forward load for p256_mul_mont + ldp x4,x5,[sp,#32] + ldp x6,x7,[sp,#32+16] + add x0,x21,#32 + bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0); + + add x2,sp,#64 + add x0,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr); + + mov x8,x14 // duplicate M + mov x9,x15 + mov x10,x16 + mov x11,x17 + mov x4,x14 // put M aside + mov x5,x15 + mov x6,x16 + mov x7,x17 + add x0,sp,#32 + bl __ecp_nistz256_add_to + mov x8,x4 // restore M + mov x9,x5 + ldr x3,[x22] // forward load for p256_mul_mont + mov x10,x6 + ldp x4,x5,[sp,#0] + mov x11,x7 + ldp x6,x7,[sp,#0+16] + bl __ecp_nistz256_add_to // p256_mul_by_3(M, M); + + add x2,x22,#0 + add x0,sp,#0 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x); + + mov x8,x14 + mov x9,x15 + ldp x4,x5,[sp,#32] // forward load for p256_sqr_mont + mov x10,x16 + mov x11,x17 + ldp x6,x7,[sp,#32+16] + add x0,sp,#96 + bl __ecp_nistz256_add_to // p256_mul_by_2(tmp0, S); + + add x0,x21,#0 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M); + + add x2,sp,#96 + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0); + + add x2,sp,#0 + add x0,sp,#0 + bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x); + + ldr x3,[sp,#32] + mov x4,x14 // copy S + mov x5,x15 + mov x6,x16 + mov x7,x17 + add x2,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M); + + add x2,x21,#32 + add x0,x21,#32 + bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y); + + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl ecp_nistz256_point_add + +.def ecp_nistz256_point_add + .type 32 +.endef +.align 5 +ecp_nistz256_point_add: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#32*12 + + ldp x4,x5,[x2,#64] // in2_z + ldp x6,x7,[x2,#64+16] + mov x21,x0 + mov x22,x1 + mov x23,x2 + adrp x13,Lpoly + add x13,x13,:lo12:Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + orr x8,x4,x5 + orr x10,x6,x7 + orr x25,x8,x10 + cmp x25,#0 + csetm x25,ne // ~in2infty + add x0,sp,#192 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z); + + ldp x4,x5,[x22,#64] // in1_z + ldp x6,x7,[x22,#64+16] + orr x8,x4,x5 + orr x10,x6,x7 + orr x24,x8,x10 + cmp x24,#0 + csetm x24,ne // ~in1infty + add x0,sp,#128 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); + + ldr x3,[x23,#64] + ldp x4,x5,[sp,#192] + ldp x6,x7,[sp,#192+16] + add x2,x23,#64 + add x0,sp,#320 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, Z2sqr, in2_z); + + ldr x3,[x22,#64] + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x2,x22,#64 + add x0,sp,#352 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); + + ldr x3,[x22,#32] + ldp x4,x5,[sp,#320] + ldp x6,x7,[sp,#320+16] + add x2,x22,#32 + add x0,sp,#320 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, S1, in1_y); + + ldr x3,[x23,#32] + ldp x4,x5,[sp,#352] + ldp x6,x7,[sp,#352+16] + add x2,x23,#32 + add x0,sp,#352 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); + + add x2,sp,#320 + ldr x3,[sp,#192] // forward load for p256_mul_mont + ldp x4,x5,[x22] + ldp x6,x7,[x22,#16] + add x0,sp,#160 + bl __ecp_nistz256_sub_from // p256_sub(R, S2, S1); + + orr x14,x14,x15 // see if result is zero + orr x16,x16,x17 + orr x26,x14,x16 // ~is_equal(S1,S2) + + add x2,sp,#192 + add x0,sp,#256 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U1, in1_x, Z2sqr); + + ldr x3,[sp,#128] + ldp x4,x5,[x23] + ldp x6,x7,[x23,#16] + add x2,sp,#128 + add x0,sp,#288 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in2_x, Z1sqr); + + add x2,sp,#256 + ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont 
+ ldp x6,x7,[sp,#160+16] + add x0,sp,#96 + bl __ecp_nistz256_sub_from // p256_sub(H, U2, U1); + + orr x14,x14,x15 // see if result is zero + orr x16,x16,x17 + orr x14,x14,x16 // ~is_equal(U1,U2) + + mvn x27,x24 // -1/0 -> 0/-1 + mvn x28,x25 // -1/0 -> 0/-1 + orr x14,x14,x27 + orr x14,x14,x28 + orr x14,x14,x26 + cbnz x14,Ladd_proceed // if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2)) + +Ladd_double: + mov x1,x22 + mov x0,x21 + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + add sp,sp,#256 // #256 is from #32*(12-4). difference in stack frames + b Ldouble_shortcut + +.align 4 +Ladd_proceed: + add x0,sp,#192 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); + + ldr x3,[x22,#64] + ldp x4,x5,[sp,#96] + ldp x6,x7,[sp,#96+16] + add x2,x22,#64 + add x0,sp,#64 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); + + ldp x4,x5,[sp,#96] + ldp x6,x7,[sp,#96+16] + add x0,sp,#128 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); + + ldr x3,[x23,#64] + ldp x4,x5,[sp,#64] + ldp x6,x7,[sp,#64+16] + add x2,x23,#64 + add x0,sp,#64 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, res_z, in2_z); + + ldr x3,[sp,#96] + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x2,sp,#96 + add x0,sp,#224 + bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); + + ldr x3,[sp,#128] + ldp x4,x5,[sp,#256] + ldp x6,x7,[sp,#256+16] + add x2,sp,#128 + add x0,sp,#288 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, U1, Hsqr); + + mov x8,x14 + mov x9,x15 + mov x10,x16 + mov x11,x17 + add x0,sp,#128 + bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2); + + add x2,sp,#192 + add x0,sp,#0 + bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); + + add x2,sp,#224 + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); + + add x2,sp,#288 + ldr x3,[sp,#224] // forward load for p256_mul_mont + ldp x4,x5,[sp,#320] + ldp x6,x7,[sp,#320+16] + add x0,sp,#32 + bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); + + add x2,sp,#224 
+ add x0,sp,#352 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S1, Hcub); + + ldr x3,[sp,#160] + ldp x4,x5,[sp,#32] + ldp x6,x7,[sp,#32+16] + add x2,sp,#160 + add x0,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); + + add x2,sp,#352 + bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); + + ldp x4,x5,[sp,#0] // res + ldp x6,x7,[sp,#0+16] + ldp x8,x9,[x23] // in2 + ldp x10,x11,[x23,#16] + ldp x14,x15,[x22,#0] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#0+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+0+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + ldp x6,x7,[sp,#0+0+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#0+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#0+48] + stp x14,x15,[x21,#0] + stp x16,x17,[x21,#0+16] + ldp x14,x15,[x22,#32] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#32+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+32+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + ldp x6,x7,[sp,#0+32+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#32+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#32+48] + stp x14,x15,[x21,#32] + stp x16,x17,[x21,#32+16] + ldp x14,x15,[x22,#64] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#64+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? 
+ csel x14,x8,x14,ne + csel x15,x9,x15,ne + csel x16,x10,x16,ne + csel x17,x11,x17,ne + stp x14,x15,[x21,#64] + stp x16,x17,[x21,#64+16] + +Ladd_done: + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl ecp_nistz256_point_add_affine + +.def ecp_nistz256_point_add_affine + .type 32 +.endef +.align 5 +ecp_nistz256_point_add_affine: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-80]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + sub sp,sp,#32*10 + + mov x21,x0 + mov x22,x1 + mov x23,x2 + adrp x13,Lpoly + add x13,x13,:lo12:Lpoly + ldr x12,[x13,#8] + ldr x13,[x13,#24] + + ldp x4,x5,[x1,#64] // in1_z + ldp x6,x7,[x1,#64+16] + orr x8,x4,x5 + orr x10,x6,x7 + orr x24,x8,x10 + cmp x24,#0 + csetm x24,ne // ~in1infty + + ldp x14,x15,[x2] // in2_x + ldp x16,x17,[x2,#16] + ldp x8,x9,[x2,#32] // in2_y + ldp x10,x11,[x2,#48] + orr x14,x14,x15 + orr x16,x16,x17 + orr x8,x8,x9 + orr x10,x10,x11 + orr x14,x14,x16 + orr x8,x8,x10 + orr x25,x14,x8 + cmp x25,#0 + csetm x25,ne // ~in2infty + + add x0,sp,#128 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); + + mov x4,x14 + mov x5,x15 + mov x6,x16 + mov x7,x17 + ldr x3,[x23] + add x2,x23,#0 + add x0,sp,#96 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, Z1sqr, in2_x); + + add x2,x22,#0 + ldr x3,[x22,#64] // forward load for p256_mul_mont + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add x0,sp,#160 + bl __ecp_nistz256_sub_from // p256_sub(H, U2, in1_x); + + add x2,x22,#64 + add x0,sp,#128 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); + + ldr x3,[x22,#64] + ldp x4,x5,[sp,#160] + ldp x6,x7,[sp,#160+16] + add x2,x22,#64 + add x0,sp,#64 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); + + ldr x3,[x23,#32] + ldp x4,x5,[sp,#128] + ldp x6,x7,[sp,#128+16] + add 
x2,x23,#32 + add x0,sp,#128 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); + + add x2,x22,#32 + ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont + ldp x6,x7,[sp,#160+16] + add x0,sp,#192 + bl __ecp_nistz256_sub_from // p256_sub(R, S2, in1_y); + + add x0,sp,#224 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); + + ldp x4,x5,[sp,#192] + ldp x6,x7,[sp,#192+16] + add x0,sp,#288 + bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); + + ldr x3,[sp,#160] + ldp x4,x5,[sp,#224] + ldp x6,x7,[sp,#224+16] + add x2,sp,#160 + add x0,sp,#256 + bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); + + ldr x3,[x22] + ldp x4,x5,[sp,#224] + ldp x6,x7,[sp,#224+16] + add x2,x22,#0 + add x0,sp,#96 + bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in1_x, Hsqr); + + mov x8,x14 + mov x9,x15 + mov x10,x16 + mov x11,x17 + add x0,sp,#224 + bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2); + + add x2,sp,#288 + add x0,sp,#0 + bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); + + add x2,sp,#256 + bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); + + add x2,sp,#96 + ldr x3,[x22,#32] // forward load for p256_mul_mont + ldp x4,x5,[sp,#256] + ldp x6,x7,[sp,#256+16] + add x0,sp,#32 + bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); + + add x2,x22,#32 + add x0,sp,#128 + bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, in1_y, Hcub); + + ldr x3,[sp,#192] + ldp x4,x5,[sp,#32] + ldp x6,x7,[sp,#32+16] + add x2,sp,#192 + add x0,sp,#32 + bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); + + add x2,sp,#128 + bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); + + ldp x4,x5,[sp,#0] // res + ldp x6,x7,[sp,#0+16] + ldp x8,x9,[x23] // in2 + ldp x10,x11,[x23,#16] + ldp x14,x15,[x22,#0] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#0+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+0+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? 
+ ldp x6,x7,[sp,#0+0+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#0+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#0+48] + stp x14,x15,[x21,#0] + stp x16,x17,[x21,#0+16] + adrp x23,Lone_mont-64 + add x23,x23,:lo12:Lone_mont-64 + ldp x14,x15,[x22,#32] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#32+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + ldp x4,x5,[sp,#0+32+32] // res + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + ldp x6,x7,[sp,#0+32+48] + csel x14,x8,x14,ne + csel x15,x9,x15,ne + ldp x8,x9,[x23,#32+32] // in2 + csel x16,x10,x16,ne + csel x17,x11,x17,ne + ldp x10,x11,[x23,#32+48] + stp x14,x15,[x21,#32] + stp x16,x17,[x21,#32+16] + ldp x14,x15,[x22,#64] // in1 + cmp x24,#0 // ~, remember? + ldp x16,x17,[x22,#64+16] + csel x8,x4,x8,ne + csel x9,x5,x9,ne + csel x10,x6,x10,ne + csel x11,x7,x11,ne + cmp x25,#0 // ~, remember? + csel x14,x8,x14,ne + csel x15,x9,x15,ne + csel x16,x10,x16,ne + csel x17,x11,x17,ne + stp x14,x15,[x21,#64] + stp x16,x17,[x21,#64+16] + + add sp,x29,#0 // destroy frame + ldp x19,x20,[x29,#16] + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x29,x30,[sp],#80 + AARCH64_VALIDATE_LINK_REGISTER + ret + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4], +// uint64_t b[4]); +.globl ecp_nistz256_ord_mul_mont + +.def ecp_nistz256_ord_mul_mont + .type 32 +.endef +.align 4 +ecp_nistz256_ord_mul_mont: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + adrp x23,Lord + add x23,x23,:lo12:Lord + ldr x3,[x2] // bp[0] + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + + ldp x12,x13,[x23,#0] + ldp x21,x22,[x23,#16] + ldr x23,[x23,#32] + + mul x14,x4,x3 // a[0]*b[0] + umulh x8,x4,x3 + + mul x15,x5,x3 // a[1]*b[0] + umulh x9,x5,x3 + + mul x16,x6,x3 // a[2]*b[0] + umulh x10,x6,x3 + + mul x17,x7,x3 // a[3]*b[0] + umulh x19,x7,x3 + + mul x24,x14,x23 + + adds x15,x15,x8 // accumulate high parts of multiplication + adcs x16,x16,x9 + adcs x17,x17,x10 + adc x19,x19,xzr + mov x20,xzr + ldr x3,[x2,#8*1] // b[i] + + lsl x8,x24,#32 + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + mul x8,x4,x3 + adc x11,x11,xzr + mul x9,x5,x3 + + adds x14,x15,x10 + mul x10,x6,x3 + adcs x15,x16,x11 + mul x11,x7,x3 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts + umulh x8,x4,x3 + adcs x15,x15,x9 + umulh x9,x5,x3 + adcs x16,x16,x10 + umulh x10,x6,x3 + adcs x17,x17,x11 + umulh x11,x7,x3 + adc x19,x19,xzr + mul x24,x14,x23 + adds x15,x15,x8 // accumulate high parts + adcs x16,x16,x9 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + ldr x3,[x2,#8*2] // b[i] + + lsl x8,x24,#32 + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + mul x8,x4,x3 + adc x11,x11,xzr + mul x9,x5,x3 + + adds x14,x15,x10 + mul x10,x6,x3 + adcs x15,x16,x11 + mul x11,x7,x3 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts + umulh x8,x4,x3 + adcs x15,x15,x9 + umulh x9,x5,x3 + adcs x16,x16,x10 + umulh x10,x6,x3 + adcs x17,x17,x11 + umulh x11,x7,x3 + adc x19,x19,xzr + mul x24,x14,x23 + adds x15,x15,x8 // accumulate high parts + 
adcs x16,x16,x9 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + ldr x3,[x2,#8*3] // b[i] + + lsl x8,x24,#32 + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + mul x8,x4,x3 + adc x11,x11,xzr + mul x9,x5,x3 + + adds x14,x15,x10 + mul x10,x6,x3 + adcs x15,x16,x11 + mul x11,x7,x3 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + adds x14,x14,x8 // accumulate low parts + umulh x8,x4,x3 + adcs x15,x15,x9 + umulh x9,x5,x3 + adcs x16,x16,x10 + umulh x10,x6,x3 + adcs x17,x17,x11 + umulh x11,x7,x3 + adc x19,x19,xzr + mul x24,x14,x23 + adds x15,x15,x8 // accumulate high parts + adcs x16,x16,x9 + adcs x17,x17,x10 + adcs x19,x19,x11 + adc x20,xzr,xzr + lsl x8,x24,#32 // last reduction + subs x16,x16,x24 + lsr x9,x24,#32 + sbcs x17,x17,x8 + sbcs x19,x19,x9 + sbc x20,x20,xzr + + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + adc x11,x11,xzr + + adds x14,x15,x10 + adcs x15,x16,x11 + adcs x16,x17,x24 + adcs x17,x19,x24 + adc x19,x20,xzr + + subs x8,x14,x12 // ret -= modulus + sbcs x9,x15,x13 + sbcs x10,x16,x21 + sbcs x11,x17,x22 + sbcs xzr,x19,xzr + + csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus + csel x15,x15,x9,lo + csel x16,x16,x10,lo + stp x14,x15,[x0] + csel x17,x17,x11,lo + stp x16,x17,[x0,#16] + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldr x29,[sp],#64 + ret + + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4], +// uint64_t rep); +.globl ecp_nistz256_ord_sqr_mont + +.def ecp_nistz256_ord_sqr_mont + .type 32 +.endef +.align 4 +ecp_nistz256_ord_sqr_mont: + AARCH64_VALID_CALL_TARGET + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + stp x29,x30,[sp,#-64]! 
+ add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + + adrp x23,Lord + add x23,x23,:lo12:Lord + ldp x4,x5,[x1] + ldp x6,x7,[x1,#16] + + ldp x12,x13,[x23,#0] + ldp x21,x22,[x23,#16] + ldr x23,[x23,#32] + b Loop_ord_sqr + +.align 4 +Loop_ord_sqr: + sub x2,x2,#1 + //////////////////////////////////////////////////////////////// + // | | | | | |a1*a0| | + // | | | | |a2*a0| | | + // | |a3*a2|a3*a0| | | | + // | | | |a2*a1| | | | + // | | |a3*a1| | | | | + // *| | | | | | | | 2| + // +|a3*a3|a2*a2|a1*a1|a0*a0| + // |--+--+--+--+--+--+--+--| + // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is , i.e. follow + // + // "can't overflow" below mark carrying into high part of + // multiplication result, which can't overflow, because it + // can never be all ones. + + mul x15,x5,x4 // a[1]*a[0] + umulh x9,x5,x4 + mul x16,x6,x4 // a[2]*a[0] + umulh x10,x6,x4 + mul x17,x7,x4 // a[3]*a[0] + umulh x19,x7,x4 + + adds x16,x16,x9 // accumulate high parts of multiplication + mul x8,x6,x5 // a[2]*a[1] + umulh x9,x6,x5 + adcs x17,x17,x10 + mul x10,x7,x5 // a[3]*a[1] + umulh x11,x7,x5 + adc x19,x19,xzr // can't overflow + + mul x20,x7,x6 // a[3]*a[2] + umulh x1,x7,x6 + + adds x9,x9,x10 // accumulate high parts of multiplication + mul x14,x4,x4 // a[0]*a[0] + adc x10,x11,xzr // can't overflow + + adds x17,x17,x8 // accumulate low parts of multiplication + umulh x4,x4,x4 + adcs x19,x19,x9 + mul x9,x5,x5 // a[1]*a[1] + adcs x20,x20,x10 + umulh x5,x5,x5 + adc x1,x1,xzr // can't overflow + + adds x15,x15,x15 // acc[1-6]*=2 + mul x10,x6,x6 // a[2]*a[2] + adcs x16,x16,x16 + umulh x6,x6,x6 + adcs x17,x17,x17 + mul x11,x7,x7 // a[3]*a[3] + adcs x19,x19,x19 + umulh x7,x7,x7 + adcs x20,x20,x20 + adcs x1,x1,x1 + adc x3,xzr,xzr + + adds x15,x15,x4 // +a[i]*a[i] + mul x24,x14,x23 + adcs x16,x16,x9 + adcs x17,x17,x5 + adcs x19,x19,x10 + adcs x20,x20,x6 + adcs x1,x1,x11 + adc x3,x3,x7 + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 
+ adc x11,x11,xzr + + adds x14,x15,x10 + adcs x15,x16,x11 + adcs x16,x17,x24 + adc x17,xzr,x24 // can't overflow + mul x11,x14,x23 + lsl x8,x24,#32 + subs x15,x15,x24 + lsr x9,x24,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + subs xzr,x14,#1 + umulh x9,x12,x11 + mul x10,x13,x11 + umulh x24,x13,x11 + + adcs x10,x10,x9 + adc x24,x24,xzr + + adds x14,x15,x10 + adcs x15,x16,x24 + adcs x16,x17,x11 + adc x17,xzr,x11 // can't overflow + mul x24,x14,x23 + lsl x8,x11,#32 + subs x15,x15,x11 + lsr x9,x11,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + subs xzr,x14,#1 + umulh x9,x12,x24 + mul x10,x13,x24 + umulh x11,x13,x24 + + adcs x10,x10,x9 + adc x11,x11,xzr + + adds x14,x15,x10 + adcs x15,x16,x11 + adcs x16,x17,x24 + adc x17,xzr,x24 // can't overflow + mul x11,x14,x23 + lsl x8,x24,#32 + subs x15,x15,x24 + lsr x9,x24,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + subs xzr,x14,#1 + umulh x9,x12,x11 + mul x10,x13,x11 + umulh x24,x13,x11 + + adcs x10,x10,x9 + adc x24,x24,xzr + + adds x14,x15,x10 + adcs x15,x16,x24 + adcs x16,x17,x11 + adc x17,xzr,x11 // can't overflow + lsl x8,x11,#32 + subs x15,x15,x11 + lsr x9,x11,#32 + sbcs x16,x16,x8 + sbc x17,x17,x9 // can't borrow + adds x14,x14,x19 // accumulate upper half + adcs x15,x15,x20 + adcs x16,x16,x1 + adcs x17,x17,x3 + adc x19,xzr,xzr + + subs x8,x14,x12 // ret -= modulus + sbcs x9,x15,x13 + sbcs x10,x16,x21 + sbcs x11,x17,x22 + sbcs xzr,x19,xzr + + csel x4,x14,x8,lo // ret = borrow ? 
ret : ret-modulus + csel x5,x15,x9,lo + csel x6,x16,x10,lo + csel x7,x17,x11,lo + + cbnz x2,Loop_ord_sqr + + stp x4,x5,[x0] + stp x6,x7,[x0,#16] + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldr x29,[sp],#64 + ret + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index); +.globl ecp_nistz256_select_w5 + +.def ecp_nistz256_select_w5 + .type 32 +.endef +.align 4 +ecp_nistz256_select_w5: + AARCH64_VALID_CALL_TARGET + + // x10 := x0 + // w9 := 0; loop counter and incremented internal index + mov x10, x0 + mov w9, #0 + + // [v16-v21] := 0 + movi v16.16b, #0 + movi v17.16b, #0 + movi v18.16b, #0 + movi v19.16b, #0 + movi v20.16b, #0 + movi v21.16b, #0 + +Lselect_w5_loop: + // Loop 16 times. + + // Increment index (loop counter); tested at the end of the loop + add w9, w9, #1 + + // [v22-v27] := Load a (3*256-bit = 6*128-bit) table entry starting at x1 + // and advance x1 to point to the next entry + ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64 + + // x11 := (w9 == w2)? All 1s : All 0s + cmp w9, w2 + csetm x11, eq + + // continue loading ... + ld1 {v26.2d, v27.2d}, [x1],#32 + + // duplicate mask_64 into Mask (all 0s or all 1s) + dup v3.2d, x11 + + // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19] + // i.e., values in output registers will remain the same if w9 != w2 + bit v16.16b, v22.16b, v3.16b + bit v17.16b, v23.16b, v3.16b + + bit v18.16b, v24.16b, v3.16b + bit v19.16b, v25.16b, v3.16b + + bit v20.16b, v26.16b, v3.16b + bit v21.16b, v27.16b, v3.16b + + // If bit #4 is not 0 (i.e. 
idx_ctr < 16) loop back + tbz w9, #4, Lselect_w5_loop + + // Write [v16-v21] to memory at the output pointer + st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x10],#64 + st1 {v20.2d, v21.2d}, [x10] + + ret + + + +//////////////////////////////////////////////////////////////////////// +// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index); +.globl ecp_nistz256_select_w7 + +.def ecp_nistz256_select_w7 + .type 32 +.endef +.align 4 +ecp_nistz256_select_w7: + AARCH64_VALID_CALL_TARGET + + // w9 := 0; loop counter and incremented internal index + mov w9, #0 + + // [v16-v21] := 0 + movi v16.16b, #0 + movi v17.16b, #0 + movi v18.16b, #0 + movi v19.16b, #0 + +Lselect_w7_loop: + // Loop 64 times. + + // Increment index (loop counter); tested at the end of the loop + add w9, w9, #1 + + // [v22-v25] := Load a (2*256-bit = 4*128-bit) table entry starting at x1 + // and advance x1 to point to the next entry + ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64 + + // x11 := (w9 == w2)? All 1s : All 0s + cmp w9, w2 + csetm x11, eq + + // duplicate mask_64 into Mask (all 0s or all 1s) + dup v3.2d, x11 + + // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19] + // i.e., values in output registers will remain the same if w9 != w2 + bit v16.16b, v22.16b, v3.16b + bit v17.16b, v23.16b, v3.16b + + bit v18.16b, v24.16b, v3.16b + bit v19.16b, v25.16b, v3.16b + + // If bit #6 is not 0 (i.e. idx_ctr < 64) loop back + tbz w9, #6, Lselect_w7_loop + + // Write [v16-v19] to memory at the output pointer + st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0] + + ret + +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) diff --git a/third_party/boringssl/gen/bcm/p256-x86_64-asm-apple.S b/third_party/boringssl/gen/bcm/p256-x86_64-asm-apple.S new file mode 100644 index 00000000..80ffa014 --- /dev/null +++ b/third_party/boringssl/gen/bcm/p256-x86_64-asm-apple.S @@ -0,0 +1,4513 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. 
Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.text + + +.section __DATA,__const +.p2align 6 +L$poly: +.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001 + +L$One: +.long 1,1,1,1,1,1,1,1 +L$Two: +.long 2,2,2,2,2,2,2,2 +L$Three: +.long 3,3,3,3,3,3,3,3 +L$ONE_mont: +.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe + + +L$ord: +.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000 +L$ordK: +.quad 0xccd1c8aaee00bc4f +.text + + + +.globl _ecp_nistz256_neg +.private_extern _ecp_nistz256_neg + +.p2align 5 +_ecp_nistz256_neg: + +_CET_ENDBR + pushq %r12 + + pushq %r13 + +L$neg_body: + + xorq %r8,%r8 + xorq %r9,%r9 + xorq %r10,%r10 + xorq %r11,%r11 + xorq %r13,%r13 + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq %r8,%rax + sbbq 24(%rsi),%r11 + leaq L$poly(%rip),%rsi + movq %r9,%rdx + sbbq $0,%r13 + + addq 0(%rsi),%r8 + movq %r10,%rcx + adcq 8(%rsi),%r9 + adcq 16(%rsi),%r10 + movq %r11,%r12 + adcq 24(%rsi),%r11 + testq %r13,%r13 + + cmovzq %rax,%r8 + cmovzq %rdx,%r9 + movq %r8,0(%rdi) + cmovzq %rcx,%r10 + movq %r9,8(%rdi) + cmovzq %r12,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 0(%rsp),%r13 + + movq 8(%rsp),%r12 + + leaq 16(%rsp),%rsp + +L$neg_epilogue: + ret + + + + + + + + +.globl _ecp_nistz256_ord_mul_mont_nohw +.private_extern _ecp_nistz256_ord_mul_mont_nohw + +.p2align 5 +_ecp_nistz256_ord_mul_mont_nohw: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$ord_mul_body: + + movq 0(%rdx),%rax + movq %rdx,%rbx + leaq L$ord(%rip),%r14 + movq L$ordK(%rip),%r15 + + + movq %rax,%rcx + mulq 0(%rsi) + movq %rax,%r8 + movq %rcx,%rax + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r9 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r10 + movq %rcx,%rax + adcq $0,%rdx + + movq %r8,%r13 + imulq 
%r15,%r8 + + movq %rdx,%r11 + mulq 24(%rsi) + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r12 + + + mulq 0(%r14) + movq %r8,%rbp + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%rcx + + subq %r8,%r10 + sbbq $0,%r8 + + mulq 8(%r14) + addq %rcx,%r9 + adcq $0,%rdx + addq %rax,%r9 + movq %rbp,%rax + adcq %rdx,%r10 + movq %rbp,%rdx + adcq $0,%r8 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r11 + movq 8(%rbx),%rax + sbbq %rdx,%rbp + + addq %r8,%r11 + adcq %rbp,%r12 + adcq $0,%r13 + + + movq %rax,%rcx + mulq 0(%rsi) + addq %rax,%r9 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rbp,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rbp,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rcx,%rax + adcq $0,%rdx + + movq %r9,%rcx + imulq %r15,%r9 + + movq %rdx,%rbp + mulq 24(%rsi) + addq %rbp,%r12 + adcq $0,%rdx + xorq %r8,%r8 + addq %rax,%r12 + movq %r9,%rax + adcq %rdx,%r13 + adcq $0,%r8 + + + mulq 0(%r14) + movq %r9,%rbp + addq %rax,%rcx + movq %r9,%rax + adcq %rdx,%rcx + + subq %r9,%r11 + sbbq $0,%r9 + + mulq 8(%r14) + addq %rcx,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %rbp,%rax + adcq %rdx,%r11 + movq %rbp,%rdx + adcq $0,%r9 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r12 + movq 16(%rbx),%rax + sbbq %rdx,%rbp + + addq %r9,%r12 + adcq %rbp,%r13 + adcq $0,%r8 + + + movq %rax,%rcx + mulq 0(%rsi) + addq %rax,%r10 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rbp,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rbp,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rcx,%rax + adcq $0,%rdx + + movq %r10,%rcx + imulq %r15,%r10 + + movq %rdx,%rbp + mulq 24(%rsi) + addq %rbp,%r13 + adcq $0,%rdx + xorq %r9,%r9 + addq %rax,%r13 + movq %r10,%rax + adcq %rdx,%r8 + adcq $0,%r9 + + + mulq 0(%r14) + movq %r10,%rbp + addq %rax,%rcx + movq %r10,%rax + adcq %rdx,%rcx + + subq %r10,%r12 + 
sbbq $0,%r10 + + mulq 8(%r14) + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rbp,%rax + adcq %rdx,%r12 + movq %rbp,%rdx + adcq $0,%r10 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r13 + movq 24(%rbx),%rax + sbbq %rdx,%rbp + + addq %r10,%r13 + adcq %rbp,%r8 + adcq $0,%r9 + + + movq %rax,%rcx + mulq 0(%rsi) + addq %rax,%r11 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rbp,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rbp,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %rcx,%rax + adcq $0,%rdx + + movq %r11,%rcx + imulq %r15,%r11 + + movq %rdx,%rbp + mulq 24(%rsi) + addq %rbp,%r8 + adcq $0,%rdx + xorq %r10,%r10 + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + adcq $0,%r10 + + + mulq 0(%r14) + movq %r11,%rbp + addq %rax,%rcx + movq %r11,%rax + adcq %rdx,%rcx + + subq %r11,%r13 + sbbq $0,%r11 + + mulq 8(%r14) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rbp,%rax + adcq %rdx,%r13 + movq %rbp,%rdx + adcq $0,%r11 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r8 + sbbq %rdx,%rbp + + addq %r11,%r8 + adcq %rbp,%r9 + adcq $0,%r10 + + + movq %r12,%rsi + subq 0(%r14),%r12 + movq %r13,%r11 + sbbq 8(%r14),%r13 + movq %r8,%rcx + sbbq 16(%r14),%r8 + movq %r9,%rbp + sbbq 24(%r14),%r9 + sbbq $0,%r10 + + cmovcq %rsi,%r12 + cmovcq %r11,%r13 + cmovcq %rcx,%r8 + cmovcq %rbp,%r9 + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$ord_mul_epilogue: + ret + + + + + + + + + +.globl _ecp_nistz256_ord_sqr_mont_nohw +.private_extern _ecp_nistz256_ord_sqr_mont_nohw + +.p2align 5 +_ecp_nistz256_ord_sqr_mont_nohw: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$ord_sqr_body: + + movq 0(%rsi),%r8 + movq 8(%rsi),%rax + movq 
16(%rsi),%r14 + movq 24(%rsi),%r15 + leaq L$ord(%rip),%rsi + movq %rdx,%rbx + jmp L$oop_ord_sqr + +.p2align 5 +L$oop_ord_sqr: + + movq %rax,%rbp + mulq %r8 + movq %rax,%r9 + movq %rbp,%xmm1 + movq %r14,%rax + movq %rdx,%r10 + + mulq %r8 + addq %rax,%r10 + movq %r15,%rax + movq %r14,%xmm2 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r8 + addq %rax,%r11 + movq %r15,%rax + movq %r15,%xmm3 + adcq $0,%rdx + movq %rdx,%r12 + + + mulq %r14 + movq %rax,%r13 + movq %r14,%rax + movq %rdx,%r14 + + + mulq %rbp + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq %rbp + addq %rax,%r12 + adcq $0,%rdx + + addq %r15,%r12 + adcq %rdx,%r13 + adcq $0,%r14 + + + xorq %r15,%r15 + movq %r8,%rax + addq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %r12,%r12 + adcq %r13,%r13 + adcq %r14,%r14 + adcq $0,%r15 + + + mulq %rax + movq %rax,%r8 + movq %xmm1,%rax + movq %rdx,%rbp + + mulq %rax + addq %rbp,%r9 + adcq %rax,%r10 + movq %xmm2,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq %rax + addq %rbp,%r11 + adcq %rax,%r12 + movq %xmm3,%rax + adcq $0,%rdx + movq %rdx,%rbp + + movq %r8,%rcx + imulq 32(%rsi),%r8 + + mulq %rax + addq %rbp,%r13 + adcq %rax,%r14 + movq 0(%rsi),%rax + adcq %rdx,%r15 + + + mulq %r8 + movq %r8,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r8,%r10 + sbbq $0,%rbp + + mulq %r8 + addq %rcx,%r9 + adcq $0,%rdx + addq %rax,%r9 + movq %r8,%rax + adcq %rdx,%r10 + movq %r8,%rdx + adcq $0,%rbp + + movq %r9,%rcx + imulq 32(%rsi),%r9 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r11 + movq 0(%rsi),%rax + sbbq %rdx,%r8 + + addq %rbp,%r11 + adcq $0,%r8 + + + mulq %r9 + movq %r9,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r9,%r11 + sbbq $0,%rbp + + mulq %r9 + addq %rcx,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %r9,%rax + adcq %rdx,%r11 + movq %r9,%rdx + adcq $0,%rbp + + movq %r10,%rcx + imulq 32(%rsi),%r10 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r8 + movq 0(%rsi),%rax + sbbq %rdx,%r9 + + addq %rbp,%r8 + adcq 
$0,%r9 + + + mulq %r10 + movq %r10,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r10,%r8 + sbbq $0,%rbp + + mulq %r10 + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r10,%rax + adcq %rdx,%r8 + movq %r10,%rdx + adcq $0,%rbp + + movq %r11,%rcx + imulq 32(%rsi),%r11 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r9 + movq 0(%rsi),%rax + sbbq %rdx,%r10 + + addq %rbp,%r9 + adcq $0,%r10 + + + mulq %r11 + movq %r11,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r11,%r9 + sbbq $0,%rbp + + mulq %r11 + addq %rcx,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + movq %r11,%rdx + adcq $0,%rbp + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r10 + sbbq %rdx,%r11 + + addq %rbp,%r10 + adcq $0,%r11 + + + xorq %rdx,%rdx + addq %r12,%r8 + adcq %r13,%r9 + movq %r8,%r12 + adcq %r14,%r10 + adcq %r15,%r11 + movq %r9,%rax + adcq $0,%rdx + + + subq 0(%rsi),%r8 + movq %r10,%r14 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq %r11,%r15 + sbbq 24(%rsi),%r11 + sbbq $0,%rdx + + cmovcq %r12,%r8 + cmovncq %r9,%rax + cmovncq %r10,%r14 + cmovncq %r11,%r15 + + decq %rbx + jnz L$oop_ord_sqr + + movq %r8,0(%rdi) + movq %rax,8(%rdi) + pxor %xmm1,%xmm1 + movq %r14,16(%rdi) + pxor %xmm2,%xmm2 + movq %r15,24(%rdi) + pxor %xmm3,%xmm3 + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$ord_sqr_epilogue: + ret + + + +.globl _ecp_nistz256_ord_mul_mont_adx +.private_extern _ecp_nistz256_ord_mul_mont_adx + +.p2align 5 +_ecp_nistz256_ord_mul_mont_adx: + +L$ecp_nistz256_ord_mul_mont_adx: +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$ord_mulx_body: + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r9 + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + leaq -128(%rsi),%rsi + leaq L$ord-128(%rip),%r14 + movq L$ordK(%rip),%r15 + + + mulxq %r9,%r8,%r9 + mulxq 
%r10,%rcx,%r10 + mulxq %r11,%rbp,%r11 + addq %rcx,%r9 + mulxq %r12,%rcx,%r12 + movq %r8,%rdx + mulxq %r15,%rdx,%rax + adcq %rbp,%r10 + adcq %rcx,%r11 + adcq $0,%r12 + + + xorq %r13,%r13 + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 24+128(%r14),%rcx,%rbp + movq 8(%rbx),%rdx + adcxq %rcx,%r11 + adoxq %rbp,%r12 + adcxq %r8,%r12 + adoxq %r8,%r13 + adcq $0,%r13 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r9,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + adcxq %r8,%r13 + adoxq %r8,%r8 + adcq $0,%r8 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%r14),%rcx,%rbp + movq 16(%rbx),%rdx + adcxq %rcx,%r12 + adoxq %rbp,%r13 + adcxq %r9,%r13 + adoxq %r9,%r8 + adcq $0,%r8 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r10,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + adcxq %r9,%r8 + adoxq %r9,%r9 + adcq $0,%r9 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%r14),%rcx,%rbp + movq 24(%rbx),%rdx + adcxq %rcx,%r13 + adoxq %rbp,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcq $0,%r9 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq 
%rbp,%r12 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r11,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r8 + adoxq %rbp,%r9 + + adcxq %r10,%r9 + adoxq %r10,%r10 + adcq $0,%r10 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + mulxq 24+128(%r14),%rcx,%rbp + leaq 128(%r14),%r14 + movq %r12,%rbx + adcxq %rcx,%r8 + adoxq %rbp,%r9 + movq %r13,%rdx + adcxq %r11,%r9 + adoxq %r11,%r10 + adcq $0,%r10 + + + + movq %r8,%rcx + subq 0(%r14),%r12 + sbbq 8(%r14),%r13 + sbbq 16(%r14),%r8 + movq %r9,%rbp + sbbq 24(%r14),%r9 + sbbq $0,%r10 + + cmovcq %rbx,%r12 + cmovcq %rdx,%r13 + cmovcq %rcx,%r8 + cmovcq %rbp,%r9 + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$ord_mulx_epilogue: + ret + + + +.globl _ecp_nistz256_ord_sqr_mont_adx +.private_extern _ecp_nistz256_ord_sqr_mont_adx + +.p2align 5 +_ecp_nistz256_ord_sqr_mont_adx: + +_CET_ENDBR +L$ecp_nistz256_ord_sqr_mont_adx: + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$ord_sqrx_body: + + movq %rdx,%rbx + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + leaq L$ord(%rip),%rsi + jmp L$oop_ord_sqrx + +.p2align 5 +L$oop_ord_sqrx: + mulxq %r14,%r9,%r10 + mulxq %r15,%rcx,%r11 + movq %rdx,%rax + movq %r14,%xmm1 + mulxq %r8,%rbp,%r12 + movq %r14,%rdx + addq %rcx,%r10 + movq %r15,%xmm2 + adcq %rbp,%r11 + adcq $0,%r12 + xorq %r13,%r13 + + mulxq %r15,%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq %r8,%rcx,%rbp + movq %r15,%rdx + adcxq %rcx,%r12 + adoxq %rbp,%r13 + adcq $0,%r13 + + 
mulxq %r8,%rcx,%r14 + movq %rax,%rdx + movq %r8,%xmm3 + xorq %r15,%r15 + adcxq %r9,%r9 + adoxq %rcx,%r13 + adcxq %r10,%r10 + adoxq %r15,%r14 + + + mulxq %rdx,%r8,%rbp + movq %xmm1,%rdx + adcxq %r11,%r11 + adoxq %rbp,%r9 + adcxq %r12,%r12 + mulxq %rdx,%rcx,%rax + movq %xmm2,%rdx + adcxq %r13,%r13 + adoxq %rcx,%r10 + adcxq %r14,%r14 + mulxq %rdx,%rcx,%rbp +.byte 0x67 + movq %xmm3,%rdx + adoxq %rax,%r11 + adcxq %r15,%r15 + adoxq %rcx,%r12 + adoxq %rbp,%r13 + mulxq %rdx,%rcx,%rax + adoxq %rcx,%r14 + adoxq %rax,%r15 + + + movq %r8,%rdx + mulxq 32(%rsi),%rdx,%rcx + + xorq %rax,%rax + mulxq 0(%rsi),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + mulxq 8(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + mulxq 16(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + mulxq 24(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r8 + adcxq %rax,%r8 + + + movq %r9,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adoxq %rcx,%r9 + adcxq %rbp,%r10 + mulxq 8(%rsi),%rcx,%rbp + adoxq %rcx,%r10 + adcxq %rbp,%r11 + mulxq 16(%rsi),%rcx,%rbp + adoxq %rcx,%r11 + adcxq %rbp,%r8 + mulxq 24(%rsi),%rcx,%rbp + adoxq %rcx,%r8 + adcxq %rbp,%r9 + adoxq %rax,%r9 + + + movq %r10,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + mulxq 8(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r8 + mulxq 16(%rsi),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + mulxq 24(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + adcxq %rax,%r10 + + + movq %r11,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adoxq %rcx,%r11 + adcxq %rbp,%r8 + mulxq 8(%rsi),%rcx,%rbp + adoxq %rcx,%r8 + adcxq %rbp,%r9 + mulxq 16(%rsi),%rcx,%rbp + adoxq %rcx,%r9 + adcxq %rbp,%r10 + mulxq 24(%rsi),%rcx,%rbp + adoxq %rcx,%r10 + adcxq %rbp,%r11 + adoxq %rax,%r11 + + + addq %r8,%r12 + adcq %r13,%r9 + movq %r12,%rdx + adcq %r14,%r10 + adcq %r15,%r11 + movq %r9,%r14 + adcq $0,%rax + + + subq 0(%rsi),%r12 + movq %r10,%r15 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq 
%r11,%r8 + sbbq 24(%rsi),%r11 + sbbq $0,%rax + + cmovncq %r12,%rdx + cmovncq %r9,%r14 + cmovncq %r10,%r15 + cmovncq %r11,%r8 + + decq %rbx + jnz L$oop_ord_sqrx + + movq %rdx,0(%rdi) + movq %r14,8(%rdi) + pxor %xmm1,%xmm1 + movq %r15,16(%rdi) + pxor %xmm2,%xmm2 + movq %r8,24(%rdi) + pxor %xmm3,%xmm3 + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$ord_sqrx_epilogue: + ret + + + + + + + + +.globl _ecp_nistz256_mul_mont_nohw +.private_extern _ecp_nistz256_mul_mont_nohw + +.p2align 5 +_ecp_nistz256_mul_mont_nohw: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$mul_body: + movq %rdx,%rbx + movq 0(%rdx),%rax + movq 0(%rsi),%r9 + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + + call __ecp_nistz256_mul_montq + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$mul_epilogue: + ret + + + + +.p2align 5 +__ecp_nistz256_mul_montq: + + + + movq %rax,%rbp + mulq %r9 + movq L$poly+8(%rip),%r14 + movq %rax,%r8 + movq %rbp,%rax + movq %rdx,%r9 + + mulq %r10 + movq L$poly+24(%rip),%r15 + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %r11 + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r12 + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + xorq %r13,%r13 + movq %rdx,%r12 + + + + + + + + + + + movq %r8,%rbp + shlq $32,%r8 + mulq %r15 + shrq $32,%rbp + addq %r8,%r9 + adcq %rbp,%r10 + adcq %rax,%r11 + movq 8(%rbx),%rax + adcq %rdx,%r12 + adcq $0,%r13 + xorq %r8,%r8 + + + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rcx,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 16(%rsi) + addq %rcx,%r11 + adcq $0,%rdx + addq 
%rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 24(%rsi) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq %rdx,%r13 + adcq $0,%r8 + + + + movq %r9,%rbp + shlq $32,%r9 + mulq %r15 + shrq $32,%rbp + addq %r9,%r10 + adcq %rbp,%r11 + adcq %rax,%r12 + movq 16(%rbx),%rax + adcq %rdx,%r13 + adcq $0,%r8 + xorq %r9,%r9 + + + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 16(%rsi) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 24(%rsi) + addq %rcx,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq %rdx,%r8 + adcq $0,%r9 + + + + movq %r10,%rbp + shlq $32,%r10 + mulq %r15 + shrq $32,%rbp + addq %r10,%r11 + adcq %rbp,%r12 + adcq %rax,%r13 + movq 24(%rbx),%rax + adcq %rdx,%r8 + adcq $0,%r9 + xorq %r10,%r10 + + + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 16(%rsi) + addq %rcx,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 24(%rsi) + addq %rcx,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + adcq $0,%r10 + + + + movq %r11,%rbp + shlq $32,%r11 + mulq %r15 + shrq $32,%rbp + addq %r11,%r12 + adcq %rbp,%r13 + movq %r12,%rcx + adcq %rax,%r8 + adcq %rdx,%r9 + movq %r13,%rbp + adcq $0,%r10 + + + + subq $-1,%r12 + movq %r8,%rbx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%rdx + sbbq %r15,%r9 + sbbq $0,%r10 + + cmovcq %rcx,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rbx,%r8 + movq %r13,8(%rdi) + cmovcq %rdx,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret + + + + + + + + + + +.globl _ecp_nistz256_sqr_mont_nohw +.private_extern 
_ecp_nistz256_sqr_mont_nohw + +.p2align 5 +_ecp_nistz256_sqr_mont_nohw: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$sqr_body: + movq 0(%rsi),%rax + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + + call __ecp_nistz256_sqr_montq + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$sqr_epilogue: + ret + + + + +.p2align 5 +__ecp_nistz256_sqr_montq: + + movq %rax,%r13 + mulq %r14 + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + + mulq %r13 + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r13 + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r12 + + + mulq %r14 + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq %r14 + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r12 + movq %rdx,%r13 + adcq $0,%r13 + + + mulq %r15 + xorq %r15,%r15 + addq %rax,%r13 + movq 0(%rsi),%rax + movq %rdx,%r14 + adcq $0,%r14 + + addq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %r12,%r12 + adcq %r13,%r13 + adcq %r14,%r14 + adcq $0,%r15 + + mulq %rax + movq %rax,%r8 + movq 8(%rsi),%rax + movq %rdx,%rcx + + mulq %rax + addq %rcx,%r9 + adcq %rax,%r10 + movq 16(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq %rax + addq %rcx,%r11 + adcq %rax,%r12 + movq 24(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq %rax + addq %rcx,%r13 + adcq %rax,%r14 + movq %r8,%rax + adcq %rdx,%r15 + + movq L$poly+8(%rip),%rsi + movq L$poly+24(%rip),%rbp + + + + + movq %r8,%rcx + shlq $32,%r8 + mulq %rbp + shrq $32,%rcx + addq %r8,%r9 + adcq %rcx,%r10 + adcq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + + + + movq %r9,%rcx + shlq $32,%r9 + movq %rdx,%r8 + mulq %rbp + shrq $32,%rcx + addq %r9,%r10 + adcq %rcx,%r11 + adcq %rax,%r8 + movq %r10,%rax + adcq $0,%rdx + + + + movq %r10,%rcx + shlq $32,%r10 + movq %rdx,%r9 + mulq %rbp + shrq $32,%rcx + addq 
%r10,%r11 + adcq %rcx,%r8 + adcq %rax,%r9 + movq %r11,%rax + adcq $0,%rdx + + + + movq %r11,%rcx + shlq $32,%r11 + movq %rdx,%r10 + mulq %rbp + shrq $32,%rcx + addq %r11,%r8 + adcq %rcx,%r9 + adcq %rax,%r10 + adcq $0,%rdx + xorq %r11,%r11 + + + + addq %r8,%r12 + adcq %r9,%r13 + movq %r12,%r8 + adcq %r10,%r14 + adcq %rdx,%r15 + movq %r13,%r9 + adcq $0,%r11 + + subq $-1,%r12 + movq %r14,%r10 + sbbq %rsi,%r13 + sbbq $0,%r14 + movq %r15,%rcx + sbbq %rbp,%r15 + sbbq $0,%r11 + + cmovcq %r8,%r12 + cmovcq %r9,%r13 + movq %r12,0(%rdi) + cmovcq %r10,%r14 + movq %r13,8(%rdi) + cmovcq %rcx,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + + ret + + +.globl _ecp_nistz256_mul_mont_adx +.private_extern _ecp_nistz256_mul_mont_adx + +.p2align 5 +_ecp_nistz256_mul_mont_adx: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$mulx_body: + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r9 + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + leaq -128(%rsi),%rsi + + call __ecp_nistz256_mul_montx + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$mulx_epilogue: + ret + + + + +.p2align 5 +__ecp_nistz256_mul_montx: + + + + mulxq %r9,%r8,%r9 + mulxq %r10,%rcx,%r10 + movq $32,%r14 + xorq %r13,%r13 + mulxq %r11,%rbp,%r11 + movq L$poly+24(%rip),%r15 + adcq %rcx,%r9 + mulxq %r12,%rcx,%r12 + movq %r8,%rdx + adcq %rbp,%r10 + shlxq %r14,%r8,%rbp + adcq %rcx,%r11 + shrxq %r14,%r8,%rcx + adcq $0,%r12 + + + + addq %rbp,%r9 + adcq %rcx,%r10 + + mulxq %r15,%rcx,%rbp + movq 8(%rbx),%rdx + adcq %rcx,%r11 + adcq %rbp,%r12 + adcq $0,%r13 + xorq %r8,%r8 + + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r9,%rdx + adcxq %rcx,%r12 + 
shlxq %r14,%r9,%rcx + adoxq %rbp,%r13 + shrxq %r14,%r9,%rbp + + adcxq %r8,%r13 + adoxq %r8,%r8 + adcq $0,%r8 + + + + addq %rcx,%r10 + adcq %rbp,%r11 + + mulxq %r15,%rcx,%rbp + movq 16(%rbx),%rdx + adcq %rcx,%r12 + adcq %rbp,%r13 + adcq $0,%r8 + xorq %r9,%r9 + + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r10,%rdx + adcxq %rcx,%r13 + shlxq %r14,%r10,%rcx + adoxq %rbp,%r8 + shrxq %r14,%r10,%rbp + + adcxq %r9,%r8 + adoxq %r9,%r9 + adcq $0,%r9 + + + + addq %rcx,%r11 + adcq %rbp,%r12 + + mulxq %r15,%rcx,%rbp + movq 24(%rbx),%rdx + adcq %rcx,%r13 + adcq %rbp,%r8 + adcq $0,%r9 + xorq %r10,%r10 + + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r11,%rdx + adcxq %rcx,%r8 + shlxq %r14,%r11,%rcx + adoxq %rbp,%r9 + shrxq %r14,%r11,%rbp + + adcxq %r10,%r9 + adoxq %r10,%r10 + adcq $0,%r10 + + + + addq %rcx,%r12 + adcq %rbp,%r13 + + mulxq %r15,%rcx,%rbp + movq %r12,%rbx + movq L$poly+8(%rip),%r14 + adcq %rcx,%r8 + movq %r13,%rdx + adcq %rbp,%r9 + adcq $0,%r10 + + + + xorl %eax,%eax + movq %r8,%rcx + sbbq $-1,%r12 + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%rbp + sbbq %r15,%r9 + sbbq $0,%r10 + + cmovcq %rbx,%r12 + cmovcq %rdx,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %rbp,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret + + + +.globl _ecp_nistz256_sqr_mont_adx +.private_extern _ecp_nistz256_sqr_mont_adx + +.p2align 5 +_ecp_nistz256_sqr_mont_adx: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$sqrx_body: + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + leaq 
-128(%rsi),%rsi + + call __ecp_nistz256_sqr_montx + + movq 0(%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r13 + + movq 24(%rsp),%r12 + + movq 32(%rsp),%rbx + + movq 40(%rsp),%rbp + + leaq 48(%rsp),%rsp + +L$sqrx_epilogue: + ret + + + + +.p2align 5 +__ecp_nistz256_sqr_montx: + + mulxq %r14,%r9,%r10 + mulxq %r15,%rcx,%r11 + xorl %eax,%eax + adcq %rcx,%r10 + mulxq %r8,%rbp,%r12 + movq %r14,%rdx + adcq %rbp,%r11 + adcq $0,%r12 + xorq %r13,%r13 + + + mulxq %r15,%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq %r8,%rcx,%rbp + movq %r15,%rdx + adcxq %rcx,%r12 + adoxq %rbp,%r13 + adcq $0,%r13 + + + mulxq %r8,%rcx,%r14 + movq 0+128(%rsi),%rdx + xorq %r15,%r15 + adcxq %r9,%r9 + adoxq %rcx,%r13 + adcxq %r10,%r10 + adoxq %r15,%r14 + + mulxq %rdx,%r8,%rbp + movq 8+128(%rsi),%rdx + adcxq %r11,%r11 + adoxq %rbp,%r9 + adcxq %r12,%r12 + mulxq %rdx,%rcx,%rax + movq 16+128(%rsi),%rdx + adcxq %r13,%r13 + adoxq %rcx,%r10 + adcxq %r14,%r14 +.byte 0x67 + mulxq %rdx,%rcx,%rbp + movq 24+128(%rsi),%rdx + adoxq %rax,%r11 + adcxq %r15,%r15 + adoxq %rcx,%r12 + movq $32,%rsi + adoxq %rbp,%r13 +.byte 0x67,0x67 + mulxq %rdx,%rcx,%rax + movq L$poly+24(%rip),%rdx + adoxq %rcx,%r14 + shlxq %rsi,%r8,%rcx + adoxq %rax,%r15 + shrxq %rsi,%r8,%rax + movq %rdx,%rbp + + + addq %rcx,%r9 + adcq %rax,%r10 + + mulxq %r8,%rcx,%r8 + adcq %rcx,%r11 + shlxq %rsi,%r9,%rcx + adcq $0,%r8 + shrxq %rsi,%r9,%rax + + + addq %rcx,%r10 + adcq %rax,%r11 + + mulxq %r9,%rcx,%r9 + adcq %rcx,%r8 + shlxq %rsi,%r10,%rcx + adcq $0,%r9 + shrxq %rsi,%r10,%rax + + + addq %rcx,%r11 + adcq %rax,%r8 + + mulxq %r10,%rcx,%r10 + adcq %rcx,%r9 + shlxq %rsi,%r11,%rcx + adcq $0,%r10 + shrxq %rsi,%r11,%rax + + + addq %rcx,%r8 + adcq %rax,%r9 + + mulxq %r11,%rcx,%r11 + adcq %rcx,%r10 + adcq $0,%r11 + + xorq %rdx,%rdx + addq %r8,%r12 + movq L$poly+8(%rip),%rsi + adcq %r9,%r13 + movq %r12,%r8 + adcq %r10,%r14 + adcq %r11,%r15 + movq %r13,%r9 + adcq $0,%rdx + + subq $-1,%r12 + movq %r14,%r10 + sbbq %rsi,%r13 + sbbq $0,%r14 + movq 
%r15,%r11 + sbbq %rbp,%r15 + sbbq $0,%rdx + + cmovcq %r8,%r12 + cmovcq %r9,%r13 + movq %r12,0(%rdi) + cmovcq %r10,%r14 + movq %r13,8(%rdi) + cmovcq %r11,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + + ret + + + + +.globl _ecp_nistz256_select_w5_nohw +.private_extern _ecp_nistz256_select_w5_nohw + +.p2align 5 +_ecp_nistz256_select_w5_nohw: + +_CET_ENDBR + movdqa L$One(%rip),%xmm0 + movd %edx,%xmm1 + + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + + movdqa %xmm0,%xmm8 + pshufd $0,%xmm1,%xmm1 + + movq $16,%rax +L$select_loop_sse_w5: + + movdqa %xmm8,%xmm15 + paddd %xmm0,%xmm8 + pcmpeqd %xmm1,%xmm15 + + movdqa 0(%rsi),%xmm9 + movdqa 16(%rsi),%xmm10 + movdqa 32(%rsi),%xmm11 + movdqa 48(%rsi),%xmm12 + movdqa 64(%rsi),%xmm13 + movdqa 80(%rsi),%xmm14 + leaq 96(%rsi),%rsi + + pand %xmm15,%xmm9 + pand %xmm15,%xmm10 + por %xmm9,%xmm2 + pand %xmm15,%xmm11 + por %xmm10,%xmm3 + pand %xmm15,%xmm12 + por %xmm11,%xmm4 + pand %xmm15,%xmm13 + por %xmm12,%xmm5 + pand %xmm15,%xmm14 + por %xmm13,%xmm6 + por %xmm14,%xmm7 + + decq %rax + jnz L$select_loop_sse_w5 + + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + movdqu %xmm4,32(%rdi) + movdqu %xmm5,48(%rdi) + movdqu %xmm6,64(%rdi) + movdqu %xmm7,80(%rdi) + ret + +L$SEH_end_ecp_nistz256_select_w5_nohw: + + + + +.globl _ecp_nistz256_select_w7_nohw +.private_extern _ecp_nistz256_select_w7_nohw + +.p2align 5 +_ecp_nistz256_select_w7_nohw: + +_CET_ENDBR + movdqa L$One(%rip),%xmm8 + movd %edx,%xmm1 + + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + + movdqa %xmm8,%xmm0 + pshufd $0,%xmm1,%xmm1 + movq $64,%rax + +L$select_loop_sse_w7: + movdqa %xmm8,%xmm15 + paddd %xmm0,%xmm8 + movdqa 0(%rsi),%xmm9 + movdqa 16(%rsi),%xmm10 + pcmpeqd %xmm1,%xmm15 + movdqa 32(%rsi),%xmm11 + movdqa 48(%rsi),%xmm12 + leaq 64(%rsi),%rsi + + pand %xmm15,%xmm9 + pand %xmm15,%xmm10 + por %xmm9,%xmm2 + pand %xmm15,%xmm11 + por %xmm10,%xmm3 + pand %xmm15,%xmm12 + por 
%xmm11,%xmm4 + prefetcht0 255(%rsi) + por %xmm12,%xmm5 + + decq %rax + jnz L$select_loop_sse_w7 + + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + movdqu %xmm4,32(%rdi) + movdqu %xmm5,48(%rdi) + ret + +L$SEH_end_ecp_nistz256_select_w7_nohw: + + + +.globl _ecp_nistz256_select_w5_avx2 +.private_extern _ecp_nistz256_select_w5_avx2 + +.p2align 5 +_ecp_nistz256_select_w5_avx2: + +_CET_ENDBR + vzeroupper + vmovdqa L$Two(%rip),%ymm0 + + vpxor %ymm2,%ymm2,%ymm2 + vpxor %ymm3,%ymm3,%ymm3 + vpxor %ymm4,%ymm4,%ymm4 + + vmovdqa L$One(%rip),%ymm5 + vmovdqa L$Two(%rip),%ymm10 + + vmovd %edx,%xmm1 + vpermd %ymm1,%ymm2,%ymm1 + + movq $8,%rax +L$select_loop_avx2_w5: + + vmovdqa 0(%rsi),%ymm6 + vmovdqa 32(%rsi),%ymm7 + vmovdqa 64(%rsi),%ymm8 + + vmovdqa 96(%rsi),%ymm11 + vmovdqa 128(%rsi),%ymm12 + vmovdqa 160(%rsi),%ymm13 + + vpcmpeqd %ymm1,%ymm5,%ymm9 + vpcmpeqd %ymm1,%ymm10,%ymm14 + + vpaddd %ymm0,%ymm5,%ymm5 + vpaddd %ymm0,%ymm10,%ymm10 + leaq 192(%rsi),%rsi + + vpand %ymm9,%ymm6,%ymm6 + vpand %ymm9,%ymm7,%ymm7 + vpand %ymm9,%ymm8,%ymm8 + vpand %ymm14,%ymm11,%ymm11 + vpand %ymm14,%ymm12,%ymm12 + vpand %ymm14,%ymm13,%ymm13 + + vpxor %ymm6,%ymm2,%ymm2 + vpxor %ymm7,%ymm3,%ymm3 + vpxor %ymm8,%ymm4,%ymm4 + vpxor %ymm11,%ymm2,%ymm2 + vpxor %ymm12,%ymm3,%ymm3 + vpxor %ymm13,%ymm4,%ymm4 + + decq %rax + jnz L$select_loop_avx2_w5 + + vmovdqu %ymm2,0(%rdi) + vmovdqu %ymm3,32(%rdi) + vmovdqu %ymm4,64(%rdi) + vzeroupper + ret + +L$SEH_end_ecp_nistz256_select_w5_avx2: + + + + +.globl _ecp_nistz256_select_w7_avx2 +.private_extern _ecp_nistz256_select_w7_avx2 + +.p2align 5 +_ecp_nistz256_select_w7_avx2: + +_CET_ENDBR + vzeroupper + vmovdqa L$Three(%rip),%ymm0 + + vpxor %ymm2,%ymm2,%ymm2 + vpxor %ymm3,%ymm3,%ymm3 + + vmovdqa L$One(%rip),%ymm4 + vmovdqa L$Two(%rip),%ymm8 + vmovdqa L$Three(%rip),%ymm12 + + vmovd %edx,%xmm1 + vpermd %ymm1,%ymm2,%ymm1 + + + movq $21,%rax +L$select_loop_avx2_w7: + + vmovdqa 0(%rsi),%ymm5 + vmovdqa 32(%rsi),%ymm6 + + vmovdqa 64(%rsi),%ymm9 + vmovdqa 96(%rsi),%ymm10 + 
+ vmovdqa 128(%rsi),%ymm13 + vmovdqa 160(%rsi),%ymm14 + + vpcmpeqd %ymm1,%ymm4,%ymm7 + vpcmpeqd %ymm1,%ymm8,%ymm11 + vpcmpeqd %ymm1,%ymm12,%ymm15 + + vpaddd %ymm0,%ymm4,%ymm4 + vpaddd %ymm0,%ymm8,%ymm8 + vpaddd %ymm0,%ymm12,%ymm12 + leaq 192(%rsi),%rsi + + vpand %ymm7,%ymm5,%ymm5 + vpand %ymm7,%ymm6,%ymm6 + vpand %ymm11,%ymm9,%ymm9 + vpand %ymm11,%ymm10,%ymm10 + vpand %ymm15,%ymm13,%ymm13 + vpand %ymm15,%ymm14,%ymm14 + + vpxor %ymm5,%ymm2,%ymm2 + vpxor %ymm6,%ymm3,%ymm3 + vpxor %ymm9,%ymm2,%ymm2 + vpxor %ymm10,%ymm3,%ymm3 + vpxor %ymm13,%ymm2,%ymm2 + vpxor %ymm14,%ymm3,%ymm3 + + decq %rax + jnz L$select_loop_avx2_w7 + + + vmovdqa 0(%rsi),%ymm5 + vmovdqa 32(%rsi),%ymm6 + + vpcmpeqd %ymm1,%ymm4,%ymm7 + + vpand %ymm7,%ymm5,%ymm5 + vpand %ymm7,%ymm6,%ymm6 + + vpxor %ymm5,%ymm2,%ymm2 + vpxor %ymm6,%ymm3,%ymm3 + + vmovdqu %ymm2,0(%rdi) + vmovdqu %ymm3,32(%rdi) + vzeroupper + ret + +L$SEH_end_ecp_nistz256_select_w7_avx2: + + +.p2align 5 +__ecp_nistz256_add_toq: + + xorq %r11,%r11 + addq 0(%rbx),%r12 + adcq 8(%rbx),%r13 + movq %r12,%rax + adcq 16(%rbx),%r8 + adcq 24(%rbx),%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret + + + + +.p2align 5 +__ecp_nistz256_sub_fromq: + + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r13 + movq %r12,%rax + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + movq %r13,%rbp + sbbq %r11,%r11 + + addq $-1,%r12 + movq %r8,%rcx + adcq %r14,%r13 + adcq $0,%r8 + movq %r9,%r10 + adcq %r15,%r9 + testq %r11,%r11 + + cmovzq %rax,%r12 + cmovzq %rbp,%r13 + movq %r12,0(%rdi) + cmovzq %rcx,%r8 + movq %r13,8(%rdi) + cmovzq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret + + + + +.p2align 5 +__ecp_nistz256_subq: + + subq %r12,%rax + sbbq %r13,%rbp + movq %rax,%r12 + sbbq %r8,%rcx + sbbq %r9,%r10 + movq %rbp,%r13 + 
sbbq %r11,%r11 + + addq $-1,%rax + movq %rcx,%r8 + adcq %r14,%rbp + adcq $0,%rcx + movq %r10,%r9 + adcq %r15,%r10 + testq %r11,%r11 + + cmovnzq %rax,%r12 + cmovnzq %rbp,%r13 + cmovnzq %rcx,%r8 + cmovnzq %r10,%r9 + + ret + + + + +.p2align 5 +__ecp_nistz256_mul_by_2q: + + xorq %r11,%r11 + addq %r12,%r12 + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret + + +.globl _ecp_nistz256_point_double_nohw +.private_extern _ecp_nistz256_point_double_nohw + +.p2align 5 +_ecp_nistz256_point_double_nohw: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $160+8,%rsp + +L$point_doubleq_body: + +L$point_double_shortcutq: + movdqu 0(%rsi),%xmm0 + movq %rsi,%rbx + movdqu 16(%rsi),%xmm1 + movq 32+0(%rsi),%r12 + movq 32+8(%rsi),%r13 + movq 32+16(%rsi),%r8 + movq 32+24(%rsi),%r9 + movq L$poly+8(%rip),%r14 + movq L$poly+24(%rip),%r15 + movdqa %xmm0,96(%rsp) + movdqa %xmm1,96+16(%rsp) + leaq 32(%rdi),%r10 + leaq 64(%rdi),%r11 + movq %rdi,%xmm0 + movq %r10,%xmm1 + movq %r11,%xmm2 + + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_by_2q + + movq 64+0(%rsi),%rax + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + leaq 64-0(%rsi),%rsi + leaq 64(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 0+0(%rsp),%rax + movq 8+0(%rsp),%r14 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 0(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 32(%rbx),%rax + movq 64+0(%rbx),%r9 + movq 64+8(%rbx),%r10 + movq 64+16(%rbx),%r11 + movq 64+24(%rbx),%r12 + leaq 64-0(%rbx),%rsi + leaq 32(%rbx),%rbx + movq %xmm2,%rdi + call __ecp_nistz256_mul_montq + call __ecp_nistz256_mul_by_2q + + movq 
96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_toq + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 0+0(%rsp),%rax + movq 8+0(%rsp),%r14 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + movq %xmm1,%rdi + call __ecp_nistz256_sqr_montq + xorq %r9,%r9 + movq %r12,%rax + addq $-1,%r12 + movq %r13,%r10 + adcq %rsi,%r13 + movq %r14,%rcx + adcq $0,%r14 + movq %r15,%r8 + adcq %rbp,%r15 + adcq $0,%r9 + xorq %rsi,%rsi + testq $1,%rax + + cmovzq %rax,%r12 + cmovzq %r10,%r13 + cmovzq %rcx,%r14 + cmovzq %r8,%r15 + cmovzq %rsi,%r9 + + movq %r13,%rax + shrq $1,%r12 + shlq $63,%rax + movq %r14,%r10 + shrq $1,%r13 + orq %rax,%r12 + shlq $63,%r10 + movq %r15,%rcx + shrq $1,%r14 + orq %r10,%r13 + shlq $63,%rcx + movq %r12,0(%rdi) + shrq $1,%r15 + movq %r13,8(%rdi) + shlq $63,%r9 + orq %rcx,%r14 + orq %r9,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + movq 64(%rsp),%rax + leaq 64(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2q + + leaq 32(%rsp),%rbx + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_toq + + movq 96(%rsp),%rax + leaq 96(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2q + + movq 0+32(%rsp),%rax + movq 8+32(%rsp),%r14 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r15 + movq 24+32(%rsp),%r8 + movq %xmm0,%rdi + call __ecp_nistz256_sqr_montq + + leaq 128(%rsp),%rbx + movq %r14,%r8 + movq %r15,%r9 + movq %rsi,%r14 + movq %rbp,%r15 + call __ecp_nistz256_sub_fromq + + 
movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 0(%rsp),%rdi + call __ecp_nistz256_subq + + movq 32(%rsp),%rax + leaq 32(%rsp),%rbx + movq %r12,%r14 + xorl %ecx,%ecx + movq %r12,0+0(%rsp) + movq %r13,%r10 + movq %r13,0+8(%rsp) + cmovzq %r8,%r11 + movq %r8,0+16(%rsp) + leaq 0-0(%rsp),%rsi + cmovzq %r9,%r12 + movq %r9,0+24(%rsp) + movq %r14,%r9 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq %xmm1,%rbx + movq %xmm1,%rdi + call __ecp_nistz256_sub_fromq + + leaq 160+56(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbx + + movq -8(%rsi),%rbp + + leaq (%rsi),%rsp + +L$point_doubleq_epilogue: + ret + + +.globl _ecp_nistz256_point_add_nohw +.private_extern _ecp_nistz256_point_add_nohw + +.p2align 5 +_ecp_nistz256_point_add_nohw: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $576+8,%rsp + +L$point_addq_body: + + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq %rsi,%rbx + movq %rdx,%rsi + movdqa %xmm0,384(%rsp) + movdqa %xmm1,384+16(%rsp) + movdqa %xmm2,416(%rsp) + movdqa %xmm3,416+16(%rsp) + movdqa %xmm4,448(%rsp) + movdqa %xmm5,448+16(%rsp) + por %xmm4,%xmm5 + + movdqu 0(%rsi),%xmm0 + pshufd $0xb1,%xmm5,%xmm3 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rsi),%xmm3 + movq 64+0(%rsi),%rax + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,480(%rsp) + pshufd $0x1e,%xmm5,%xmm4 + movdqa %xmm1,480+16(%rsp) + movdqu 64(%rsi),%xmm0 + movdqu 80(%rsi),%xmm1 + movdqa %xmm2,512(%rsp) + movdqa %xmm3,512+16(%rsp) + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm0,%xmm1 + movq %rdi,%xmm0 + + leaq 64-0(%rsi),%rsi + movq %rax,544+0(%rsp) + movq %r14,544+8(%rsp) + movq %r15,544+16(%rsp) + movq %r8,544+24(%rsp) + leaq 
96(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + pcmpeqd %xmm4,%xmm5 + pshufd $0xb1,%xmm1,%xmm4 + por %xmm1,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $0x1e,%xmm4,%xmm3 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + movq 64+0(%rbx),%rax + movq 64+8(%rbx),%r14 + movq 64+16(%rbx),%r15 + movq 64+24(%rbx),%r8 + movq %rbx,%xmm1 + + leaq 64-0(%rbx),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 544(%rsp),%rax + leaq 544(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq 0+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 448(%rsp),%rax + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 416(%rsp),%rax + leaq 416(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq 0+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 512(%rsp),%rax + leaq 512(%rsp),%rbx + movq 0+256(%rsp),%r9 + movq 8+256(%rsp),%r10 + leaq 0+256(%rsp),%rsi + movq 16+256(%rsp),%r11 + movq 24+256(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 224(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + orq %r13,%r12 + movdqa %xmm4,%xmm2 + orq %r8,%r12 + orq %r9,%r12 + por %xmm5,%xmm2 + movq %r12,%xmm3 + + movq 384(%rsp),%rax + leaq 384(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq 0+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 480(%rsp),%rax + leaq 480(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 160(%rsp),%rbx + leaq 0(%rsp),%rdi + call 
__ecp_nistz256_sub_fromq + + orq %r13,%r12 + orq %r8,%r12 + orq %r9,%r12 + + movq %xmm2,%r8 + movq %xmm3,%r9 + orq %r8,%r12 +.byte 0x3e + jnz L$add_proceedq + + + + testq %r9,%r9 + jz L$add_doubleq + + + + + + + movq %xmm0,%rdi + pxor %xmm0,%xmm0 + movdqu %xmm0,0(%rdi) + movdqu %xmm0,16(%rdi) + movdqu %xmm0,32(%rdi) + movdqu %xmm0,48(%rdi) + movdqu %xmm0,64(%rdi) + movdqu %xmm0,80(%rdi) + jmp L$add_doneq + +.p2align 5 +L$add_doubleq: + movq %xmm1,%rsi + movq %xmm0,%rdi + addq $416,%rsp + + jmp L$point_double_shortcutq + + +.p2align 5 +L$add_proceedq: + movq 0+64(%rsp),%rax + movq 8+64(%rsp),%r14 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 448(%rsp),%rax + leaq 448(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 0+0(%rsp),%rax + movq 8+0(%rsp),%r14 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 544(%rsp),%rax + leaq 544(%rsp),%rbx + movq 0+352(%rsp),%r9 + movq 8+352(%rsp),%r10 + leaq 0+352(%rsp),%rsi + movq 16+352(%rsp),%r11 + movq 24+352(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 0(%rsp),%rax + leaq 0(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 160(%rsp),%rax + leaq 160(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montq + + + + + xorq %r11,%r11 + addq %r12,%r12 + leaq 96(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq 
%r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + movq 0(%rsi),%rax + cmovcq %rbp,%r13 + movq 8(%rsi),%rbp + cmovcq %rcx,%r8 + movq 16(%rsi),%rcx + cmovcq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subq + + leaq 128(%rsp),%rbx + leaq 288(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 192+0(%rsp),%rax + movq 192+8(%rsp),%rbp + movq 192+16(%rsp),%rcx + movq 192+24(%rsp),%r10 + leaq 320(%rsp),%rdi + + call __ecp_nistz256_subq + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 128(%rsp),%rax + leaq 128(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq 0+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 320(%rsp),%rax + leaq 320(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 320(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 256(%rsp),%rbx + leaq 320(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq %xmm0,%rdi + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 352(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 352+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 544(%rsp),%xmm2 + pand 544+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 480(%rsp),%xmm2 + pand 480+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + 
movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 320(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 320+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 512(%rsp),%xmm2 + pand 512+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + +L$add_doneq: + leaq 576+56(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbx + + movq -8(%rsi),%rbp + + leaq (%rsi),%rsp + +L$point_addq_epilogue: + ret + + +.globl _ecp_nistz256_point_add_affine_nohw +.private_extern _ecp_nistz256_point_add_affine_nohw + +.p2align 5 +_ecp_nistz256_point_add_affine_nohw: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $480+8,%rsp + +L$add_affineq_body: + + movdqu 0(%rsi),%xmm0 + movq %rdx,%rbx + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq 64+0(%rsi),%rax + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,320(%rsp) + movdqa %xmm1,320+16(%rsp) + movdqa %xmm2,352(%rsp) + movdqa %xmm3,352+16(%rsp) + movdqa %xmm4,384(%rsp) + movdqa %xmm5,384+16(%rsp) + por %xmm4,%xmm5 + + movdqu 0(%rbx),%xmm0 + pshufd $0xb1,%xmm5,%xmm3 + movdqu 16(%rbx),%xmm1 + movdqu 32(%rbx),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rbx),%xmm3 + movdqa %xmm0,416(%rsp) + pshufd $0x1e,%xmm5,%xmm4 + movdqa %xmm1,416+16(%rsp) + por %xmm0,%xmm1 + movq %rdi,%xmm0 + movdqa %xmm2,448(%rsp) + movdqa %xmm3,448+16(%rsp) + por %xmm2,%xmm3 + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm1,%xmm3 + + leaq 64-0(%rsi),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + pcmpeqd %xmm4,%xmm5 + pshufd $0xb1,%xmm3,%xmm4 + movq 
0(%rbx),%rax + + movq %r12,%r9 + por %xmm3,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $0x1e,%xmm4,%xmm3 + movq %r13,%r10 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + movq %r14,%r11 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + + leaq 32-0(%rsp),%rsi + movq %r15,%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 320(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 384(%rsp),%rax + leaq 384(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 384(%rsp),%rax + leaq 384(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 288(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 448(%rsp),%rax + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 352(%rsp),%rbx + leaq 96(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 0+64(%rsp),%rax + movq 8+64(%rsp),%r14 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 128(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 0+96(%rsp),%rax + movq 8+96(%rsp),%r14 + leaq 0+96(%rsp),%rsi + movq 16+96(%rsp),%r15 + movq 24+96(%rsp),%r8 + leaq 192(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 128(%rsp),%rax + leaq 128(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 320(%rsp),%rax + leaq 320(%rsp),%rbx + movq 0+128(%rsp),%r9 + movq 8+128(%rsp),%r10 + leaq 0+128(%rsp),%rsi + movq 16+128(%rsp),%r11 + movq 24+128(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + + + + + xorq %r11,%r11 + addq %r12,%r12 + leaq 192(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq 
%r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + movq 0(%rsi),%rax + cmovcq %rbp,%r13 + movq 8(%rsi),%rbp + cmovcq %rcx,%r8 + movq 16(%rsi),%rcx + cmovcq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subq + + leaq 160(%rsp),%rbx + leaq 224(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 64(%rsp),%rdi + + call __ecp_nistz256_subq + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 352(%rsp),%rax + leaq 352(%rsp),%rbx + movq 0+160(%rsp),%r9 + movq 8+160(%rsp),%r10 + leaq 0+160(%rsp),%rsi + movq 16+160(%rsp),%r11 + movq 24+160(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 96(%rsp),%rax + leaq 96(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 64(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 32(%rsp),%rbx + leaq 256(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq %xmm0,%rdi + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand L$ONE_mont(%rip),%xmm2 + pand L$ONE_mont+16(%rip),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 224(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 224+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn 
%xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 320(%rsp),%xmm2 + pand 320+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 256(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 256+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 352(%rsp),%xmm2 + pand 352+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + + leaq 480+56(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbx + + movq -8(%rsi),%rbp + + leaq (%rsi),%rsp + +L$add_affineq_epilogue: + ret + + + +.p2align 5 +__ecp_nistz256_add_tox: + + xorq %r11,%r11 + adcq 0(%rbx),%r12 + adcq 8(%rbx),%r13 + movq %r12,%rax + adcq 16(%rbx),%r8 + adcq 24(%rbx),%r9 + movq %r13,%rbp + adcq $0,%r11 + + xorq %r10,%r10 + sbbq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret + + + + +.p2align 5 +__ecp_nistz256_sub_fromx: + + xorq %r11,%r11 + sbbq 0(%rbx),%r12 + sbbq 8(%rbx),%r13 + movq %r12,%rax + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + movq %r13,%rbp + sbbq $0,%r11 + + xorq %r10,%r10 + adcq $-1,%r12 + movq %r8,%rcx + adcq %r14,%r13 + adcq $0,%r8 + movq %r9,%r10 + adcq %r15,%r9 + + btq $0,%r11 + cmovncq %rax,%r12 + cmovncq %rbp,%r13 + movq %r12,0(%rdi) + cmovncq %rcx,%r8 + movq %r13,8(%rdi) + cmovncq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret + + + + +.p2align 5 +__ecp_nistz256_subx: + + xorq %r11,%r11 + sbbq %r12,%rax + sbbq %r13,%rbp + movq %rax,%r12 + sbbq %r8,%rcx + sbbq %r9,%r10 + movq 
%rbp,%r13 + sbbq $0,%r11 + + xorq %r9,%r9 + adcq $-1,%rax + movq %rcx,%r8 + adcq %r14,%rbp + adcq $0,%rcx + movq %r10,%r9 + adcq %r15,%r10 + + btq $0,%r11 + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + cmovcq %rcx,%r8 + cmovcq %r10,%r9 + + ret + + + + +.p2align 5 +__ecp_nistz256_mul_by_2x: + + xorq %r11,%r11 + adcq %r12,%r12 + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + xorq %r10,%r10 + sbbq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret + + +.globl _ecp_nistz256_point_double_adx +.private_extern _ecp_nistz256_point_double_adx + +.p2align 5 +_ecp_nistz256_point_double_adx: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $160+8,%rsp + +L$point_doublex_body: + +L$point_double_shortcutx: + movdqu 0(%rsi),%xmm0 + movq %rsi,%rbx + movdqu 16(%rsi),%xmm1 + movq 32+0(%rsi),%r12 + movq 32+8(%rsi),%r13 + movq 32+16(%rsi),%r8 + movq 32+24(%rsi),%r9 + movq L$poly+8(%rip),%r14 + movq L$poly+24(%rip),%r15 + movdqa %xmm0,96(%rsp) + movdqa %xmm1,96+16(%rsp) + leaq 32(%rdi),%r10 + leaq 64(%rdi),%r11 + movq %rdi,%xmm0 + movq %r10,%xmm1 + movq %r11,%xmm2 + + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_by_2x + + movq 64+0(%rsi),%rdx + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + leaq 64-128(%rsi),%rsi + leaq 64(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 0+0(%rsp),%rdx + movq 8+0(%rsp),%r14 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 0(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 32(%rbx),%rdx + movq 64+0(%rbx),%r9 + movq 64+8(%rbx),%r10 + movq 64+16(%rbx),%r11 + movq 64+24(%rbx),%r12 + leaq 64-128(%rbx),%rsi + leaq 32(%rbx),%rbx + movq %xmm2,%rdi + call __ecp_nistz256_mul_montx + call 
__ecp_nistz256_mul_by_2x + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_tox + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 0+0(%rsp),%rdx + movq 8+0(%rsp),%r14 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + movq %xmm1,%rdi + call __ecp_nistz256_sqr_montx + xorq %r9,%r9 + movq %r12,%rax + addq $-1,%r12 + movq %r13,%r10 + adcq %rsi,%r13 + movq %r14,%rcx + adcq $0,%r14 + movq %r15,%r8 + adcq %rbp,%r15 + adcq $0,%r9 + xorq %rsi,%rsi + testq $1,%rax + + cmovzq %rax,%r12 + cmovzq %r10,%r13 + cmovzq %rcx,%r14 + cmovzq %r8,%r15 + cmovzq %rsi,%r9 + + movq %r13,%rax + shrq $1,%r12 + shlq $63,%rax + movq %r14,%r10 + shrq $1,%r13 + orq %rax,%r12 + shlq $63,%r10 + movq %r15,%rcx + shrq $1,%r14 + orq %r10,%r13 + shlq $63,%rcx + movq %r12,0(%rdi) + shrq $1,%r15 + movq %r13,8(%rdi) + shlq $63,%r9 + orq %rcx,%r14 + orq %r9,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + movq 64(%rsp),%rdx + leaq 64(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2x + + leaq 32(%rsp),%rbx + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_tox + + movq 96(%rsp),%rdx + leaq 96(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2x + + movq 0+32(%rsp),%rdx + movq 8+32(%rsp),%r14 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r15 + movq 24+32(%rsp),%r8 + movq %xmm0,%rdi + call __ecp_nistz256_sqr_montx + + leaq 128(%rsp),%rbx + movq %r14,%r8 + movq %r15,%r9 + movq %rsi,%r14 + movq 
%rbp,%r15 + call __ecp_nistz256_sub_fromx + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 0(%rsp),%rdi + call __ecp_nistz256_subx + + movq 32(%rsp),%rdx + leaq 32(%rsp),%rbx + movq %r12,%r14 + xorl %ecx,%ecx + movq %r12,0+0(%rsp) + movq %r13,%r10 + movq %r13,0+8(%rsp) + cmovzq %r8,%r11 + movq %r8,0+16(%rsp) + leaq 0-128(%rsp),%rsi + cmovzq %r9,%r12 + movq %r9,0+24(%rsp) + movq %r14,%r9 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq %xmm1,%rbx + movq %xmm1,%rdi + call __ecp_nistz256_sub_fromx + + leaq 160+56(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbx + + movq -8(%rsi),%rbp + + leaq (%rsi),%rsp + +L$point_doublex_epilogue: + ret + + +.globl _ecp_nistz256_point_add_adx +.private_extern _ecp_nistz256_point_add_adx + +.p2align 5 +_ecp_nistz256_point_add_adx: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $576+8,%rsp + +L$point_addx_body: + + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq %rsi,%rbx + movq %rdx,%rsi + movdqa %xmm0,384(%rsp) + movdqa %xmm1,384+16(%rsp) + movdqa %xmm2,416(%rsp) + movdqa %xmm3,416+16(%rsp) + movdqa %xmm4,448(%rsp) + movdqa %xmm5,448+16(%rsp) + por %xmm4,%xmm5 + + movdqu 0(%rsi),%xmm0 + pshufd $0xb1,%xmm5,%xmm3 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rsi),%xmm3 + movq 64+0(%rsi),%rdx + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,480(%rsp) + pshufd $0x1e,%xmm5,%xmm4 + movdqa %xmm1,480+16(%rsp) + movdqu 64(%rsi),%xmm0 + movdqu 80(%rsi),%xmm1 + movdqa %xmm2,512(%rsp) + movdqa %xmm3,512+16(%rsp) + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm0,%xmm1 + movq %rdi,%xmm0 + + leaq 64-128(%rsi),%rsi + movq %rdx,544+0(%rsp) + movq %r14,544+8(%rsp) + movq 
%r15,544+16(%rsp) + movq %r8,544+24(%rsp) + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + pcmpeqd %xmm4,%xmm5 + pshufd $0xb1,%xmm1,%xmm4 + por %xmm1,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $0x1e,%xmm4,%xmm3 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + movq 64+0(%rbx),%rdx + movq 64+8(%rbx),%r14 + movq 64+16(%rbx),%r15 + movq 64+24(%rbx),%r8 + movq %rbx,%xmm1 + + leaq 64-128(%rbx),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 544(%rsp),%rdx + leaq 544(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq -128+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 448(%rsp),%rdx + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 416(%rsp),%rdx + leaq 416(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq -128+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 512(%rsp),%rdx + leaq 512(%rsp),%rbx + movq 0+256(%rsp),%r9 + movq 8+256(%rsp),%r10 + leaq -128+256(%rsp),%rsi + movq 16+256(%rsp),%r11 + movq 24+256(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 224(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + orq %r13,%r12 + movdqa %xmm4,%xmm2 + orq %r8,%r12 + orq %r9,%r12 + por %xmm5,%xmm2 + movq %r12,%xmm3 + + movq 384(%rsp),%rdx + leaq 384(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq -128+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 480(%rsp),%rdx + leaq 480(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montx + + 
leaq 160(%rsp),%rbx + leaq 0(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + orq %r13,%r12 + orq %r8,%r12 + orq %r9,%r12 + + movq %xmm2,%r8 + movq %xmm3,%r9 + orq %r8,%r12 +.byte 0x3e + jnz L$add_proceedx + + + + testq %r9,%r9 + jz L$add_doublex + + + + + + + movq %xmm0,%rdi + pxor %xmm0,%xmm0 + movdqu %xmm0,0(%rdi) + movdqu %xmm0,16(%rdi) + movdqu %xmm0,32(%rdi) + movdqu %xmm0,48(%rdi) + movdqu %xmm0,64(%rdi) + movdqu %xmm0,80(%rdi) + jmp L$add_donex + +.p2align 5 +L$add_doublex: + movq %xmm1,%rsi + movq %xmm0,%rdi + addq $416,%rsp + + jmp L$point_double_shortcutx + + +.p2align 5 +L$add_proceedx: + movq 0+64(%rsp),%rdx + movq 8+64(%rsp),%r14 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 448(%rsp),%rdx + leaq 448(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 0+0(%rsp),%rdx + movq 8+0(%rsp),%r14 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 544(%rsp),%rdx + leaq 544(%rsp),%rbx + movq 0+352(%rsp),%r9 + movq 8+352(%rsp),%r10 + leaq -128+352(%rsp),%rsi + movq 16+352(%rsp),%r11 + movq 24+352(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 0(%rsp),%rdx + leaq 0(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 160(%rsp),%rdx + leaq 160(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montx + + + + + xorq %r11,%r11 + addq %r12,%r12 + leaq 96(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + 
movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + movq 0(%rsi),%rax + cmovcq %rbp,%r13 + movq 8(%rsi),%rbp + cmovcq %rcx,%r8 + movq 16(%rsi),%rcx + cmovcq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subx + + leaq 128(%rsp),%rbx + leaq 288(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 192+0(%rsp),%rax + movq 192+8(%rsp),%rbp + movq 192+16(%rsp),%rcx + movq 192+24(%rsp),%r10 + leaq 320(%rsp),%rdi + + call __ecp_nistz256_subx + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 128(%rsp),%rdx + leaq 128(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq -128+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 320(%rsp),%rdx + leaq 320(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 320(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 256(%rsp),%rbx + leaq 320(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq %xmm0,%rdi + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 352(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 352+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 544(%rsp),%xmm2 + pand 544+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 480(%rsp),%xmm2 + pand 480+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 
384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 320(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 320+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 512(%rsp),%xmm2 + pand 512+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + +L$add_donex: + leaq 576+56(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbx + + movq -8(%rsi),%rbp + + leaq (%rsi),%rsp + +L$point_addx_epilogue: + ret + + +.globl _ecp_nistz256_point_add_affine_adx +.private_extern _ecp_nistz256_point_add_affine_adx + +.p2align 5 +_ecp_nistz256_point_add_affine_adx: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $480+8,%rsp + +L$add_affinex_body: + + movdqu 0(%rsi),%xmm0 + movq %rdx,%rbx + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq 64+0(%rsi),%rdx + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,320(%rsp) + movdqa %xmm1,320+16(%rsp) + movdqa %xmm2,352(%rsp) + movdqa %xmm3,352+16(%rsp) + movdqa %xmm4,384(%rsp) + movdqa %xmm5,384+16(%rsp) + por %xmm4,%xmm5 + + movdqu 0(%rbx),%xmm0 + pshufd $0xb1,%xmm5,%xmm3 + movdqu 16(%rbx),%xmm1 + movdqu 32(%rbx),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rbx),%xmm3 + movdqa %xmm0,416(%rsp) + pshufd $0x1e,%xmm5,%xmm4 + movdqa %xmm1,416+16(%rsp) + por %xmm0,%xmm1 + movq %rdi,%xmm0 + movdqa %xmm2,448(%rsp) + movdqa %xmm3,448+16(%rsp) + por %xmm2,%xmm3 + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm1,%xmm3 + + leaq 64-128(%rsi),%rsi + leaq 32(%rsp),%rdi + call 
__ecp_nistz256_sqr_montx + + pcmpeqd %xmm4,%xmm5 + pshufd $0xb1,%xmm3,%xmm4 + movq 0(%rbx),%rdx + + movq %r12,%r9 + por %xmm3,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $0x1e,%xmm4,%xmm3 + movq %r13,%r10 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + movq %r14,%r11 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + + leaq 32-128(%rsp),%rsi + movq %r15,%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 320(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 384(%rsp),%rdx + leaq 384(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 384(%rsp),%rdx + leaq 384(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 288(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 448(%rsp),%rdx + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 352(%rsp),%rbx + leaq 96(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 0+64(%rsp),%rdx + movq 8+64(%rsp),%r14 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 128(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 0+96(%rsp),%rdx + movq 8+96(%rsp),%r14 + leaq -128+96(%rsp),%rsi + movq 16+96(%rsp),%r15 + movq 24+96(%rsp),%r8 + leaq 192(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 128(%rsp),%rdx + leaq 128(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 320(%rsp),%rdx + leaq 320(%rsp),%rbx + movq 0+128(%rsp),%r9 + movq 8+128(%rsp),%r10 + leaq -128+128(%rsp),%rsi + movq 16+128(%rsp),%r11 + movq 24+128(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + + 
+ + + xorq %r11,%r11 + addq %r12,%r12 + leaq 192(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + movq 0(%rsi),%rax + cmovcq %rbp,%r13 + movq 8(%rsi),%rbp + cmovcq %rcx,%r8 + movq 16(%rsi),%rcx + cmovcq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subx + + leaq 160(%rsp),%rbx + leaq 224(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 64(%rsp),%rdi + + call __ecp_nistz256_subx + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 352(%rsp),%rdx + leaq 352(%rsp),%rbx + movq 0+160(%rsp),%r9 + movq 8+160(%rsp),%r10 + leaq -128+160(%rsp),%rsi + movq 16+160(%rsp),%r11 + movq 24+160(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 96(%rsp),%rdx + leaq 96(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 64(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 32(%rsp),%rbx + leaq 256(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq %xmm0,%rdi + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand L$ONE_mont(%rip),%xmm2 + pand L$ONE_mont+16(%rip),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 224(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 224+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por 
%xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 320(%rsp),%xmm2 + pand 320+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 256(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 256+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 352(%rsp),%xmm2 + pand 352+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + + leaq 480+56(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbx + + movq -8(%rsi),%rbp + + leaq (%rsi),%rsp + +L$add_affinex_epilogue: + ret + + +#endif diff --git a/third_party/boringssl/gen/bcm/p256-x86_64-asm-linux.S b/third_party/boringssl/gen/bcm/p256-x86_64-asm-linux.S new file mode 100644 index 00000000..c54d5772 --- /dev/null +++ b/third_party/boringssl/gen/bcm/p256-x86_64-asm-linux.S @@ -0,0 +1,4599 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.text + + +.section .rodata +.align 64 +.Lpoly: +.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001 + +.LOne: +.long 1,1,1,1,1,1,1,1 +.LTwo: +.long 2,2,2,2,2,2,2,2 +.LThree: +.long 3,3,3,3,3,3,3,3 +.LONE_mont: +.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe + + +.Lord: +.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000 +.LordK: +.quad 0xccd1c8aaee00bc4f +.text + + + +.globl ecp_nistz256_neg +.hidden ecp_nistz256_neg +.type ecp_nistz256_neg,@function +.align 32 +ecp_nistz256_neg: +.cfi_startproc +_CET_ENDBR + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-16 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-24 +.Lneg_body: + + xorq %r8,%r8 + xorq %r9,%r9 + xorq %r10,%r10 + xorq %r11,%r11 + xorq %r13,%r13 + + subq 0(%rsi),%r8 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq %r8,%rax + sbbq 24(%rsi),%r11 + leaq .Lpoly(%rip),%rsi + movq %r9,%rdx + sbbq $0,%r13 + + addq 0(%rsi),%r8 + movq %r10,%rcx + adcq 8(%rsi),%r9 + adcq 16(%rsi),%r10 + movq %r11,%r12 + adcq 24(%rsi),%r11 + testq %r13,%r13 + + cmovzq %rax,%r8 + cmovzq %rdx,%r9 + movq %r8,0(%rdi) + cmovzq %rcx,%r10 + movq %r9,8(%rdi) + cmovzq %r12,%r11 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + movq 0(%rsp),%r13 +.cfi_restore %r13 + movq 8(%rsp),%r12 +.cfi_restore %r12 + leaq 16(%rsp),%rsp +.cfi_adjust_cfa_offset -16 +.Lneg_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_neg,.-ecp_nistz256_neg + + + + + + +.globl ecp_nistz256_ord_mul_mont_nohw +.hidden ecp_nistz256_ord_mul_mont_nohw +.type ecp_nistz256_ord_mul_mont_nohw,@function +.align 32 +ecp_nistz256_ord_mul_mont_nohw: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 
+.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lord_mul_body: + + movq 0(%rdx),%rax + movq %rdx,%rbx + leaq .Lord(%rip),%r14 + movq .LordK(%rip),%r15 + + + movq %rax,%rcx + mulq 0(%rsi) + movq %rax,%r8 + movq %rcx,%rax + movq %rdx,%r9 + + mulq 8(%rsi) + addq %rax,%r9 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq 16(%rsi) + addq %rax,%r10 + movq %rcx,%rax + adcq $0,%rdx + + movq %r8,%r13 + imulq %r15,%r8 + + movq %rdx,%r11 + mulq 24(%rsi) + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r12 + + + mulq 0(%r14) + movq %r8,%rbp + addq %rax,%r13 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%rcx + + subq %r8,%r10 + sbbq $0,%r8 + + mulq 8(%r14) + addq %rcx,%r9 + adcq $0,%rdx + addq %rax,%r9 + movq %rbp,%rax + adcq %rdx,%r10 + movq %rbp,%rdx + adcq $0,%r8 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r11 + movq 8(%rbx),%rax + sbbq %rdx,%rbp + + addq %r8,%r11 + adcq %rbp,%r12 + adcq $0,%r13 + + + movq %rax,%rcx + mulq 0(%rsi) + addq %rax,%r9 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rbp,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rbp,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rcx,%rax + adcq $0,%rdx + + movq %r9,%rcx + imulq %r15,%r9 + + movq %rdx,%rbp + mulq 24(%rsi) + addq %rbp,%r12 + adcq $0,%rdx + xorq %r8,%r8 + addq %rax,%r12 + movq %r9,%rax + adcq %rdx,%r13 + adcq $0,%r8 + + + mulq 0(%r14) + movq %r9,%rbp + addq %rax,%rcx + movq %r9,%rax + adcq %rdx,%rcx + + subq %r9,%r11 + sbbq $0,%r9 + + mulq 8(%r14) + addq %rcx,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %rbp,%rax + adcq %rdx,%r11 + movq %rbp,%rdx + adcq $0,%r9 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r12 + movq 16(%rbx),%rax + sbbq %rdx,%rbp + + addq %r9,%r12 + adcq %rbp,%r13 + adcq $0,%r8 + + + movq %rax,%rcx + mulq 0(%rsi) + addq %rax,%r10 + movq %rcx,%rax + 
adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rbp,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rbp,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rcx,%rax + adcq $0,%rdx + + movq %r10,%rcx + imulq %r15,%r10 + + movq %rdx,%rbp + mulq 24(%rsi) + addq %rbp,%r13 + adcq $0,%rdx + xorq %r9,%r9 + addq %rax,%r13 + movq %r10,%rax + adcq %rdx,%r8 + adcq $0,%r9 + + + mulq 0(%r14) + movq %r10,%rbp + addq %rax,%rcx + movq %r10,%rax + adcq %rdx,%rcx + + subq %r10,%r12 + sbbq $0,%r10 + + mulq 8(%r14) + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rbp,%rax + adcq %rdx,%r12 + movq %rbp,%rdx + adcq $0,%r10 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r13 + movq 24(%rbx),%rax + sbbq %rdx,%rbp + + addq %r10,%r13 + adcq %rbp,%r8 + adcq $0,%r9 + + + movq %rax,%rcx + mulq 0(%rsi) + addq %rax,%r11 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 8(%rsi) + addq %rbp,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rcx,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq 16(%rsi) + addq %rbp,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %rcx,%rax + adcq $0,%rdx + + movq %r11,%rcx + imulq %r15,%r11 + + movq %rdx,%rbp + mulq 24(%rsi) + addq %rbp,%r8 + adcq $0,%rdx + xorq %r10,%r10 + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + adcq $0,%r10 + + + mulq 0(%r14) + movq %r11,%rbp + addq %rax,%rcx + movq %r11,%rax + adcq %rdx,%rcx + + subq %r11,%r13 + sbbq $0,%r11 + + mulq 8(%r14) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rbp,%rax + adcq %rdx,%r13 + movq %rbp,%rdx + adcq $0,%r11 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r8 + sbbq %rdx,%rbp + + addq %r11,%r8 + adcq %rbp,%r9 + adcq $0,%r10 + + + movq %r12,%rsi + subq 0(%r14),%r12 + movq %r13,%r11 + sbbq 8(%r14),%r13 + movq %r8,%rcx + sbbq 16(%r14),%r8 + movq %r9,%rbp + sbbq 24(%r14),%r9 + sbbq $0,%r10 + + cmovcq %rsi,%r12 + cmovcq %r11,%r13 + cmovcq %rcx,%r8 + cmovcq %rbp,%r9 + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq 
%r9,24(%rdi) + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_mul_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_ord_mul_mont_nohw,.-ecp_nistz256_ord_mul_mont_nohw + + + + + + + +.globl ecp_nistz256_ord_sqr_mont_nohw +.hidden ecp_nistz256_ord_sqr_mont_nohw +.type ecp_nistz256_ord_sqr_mont_nohw,@function +.align 32 +ecp_nistz256_ord_sqr_mont_nohw: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lord_sqr_body: + + movq 0(%rsi),%r8 + movq 8(%rsi),%rax + movq 16(%rsi),%r14 + movq 24(%rsi),%r15 + leaq .Lord(%rip),%rsi + movq %rdx,%rbx + jmp .Loop_ord_sqr + +.align 32 +.Loop_ord_sqr: + + movq %rax,%rbp + mulq %r8 + movq %rax,%r9 + movq %rbp,%xmm1 + movq %r14,%rax + movq %rdx,%r10 + + mulq %r8 + addq %rax,%r10 + movq %r15,%rax + movq %r14,%xmm2 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r8 + addq %rax,%r11 + movq %r15,%rax + movq %r15,%xmm3 + adcq $0,%rdx + movq %rdx,%r12 + + + mulq %r14 + movq %rax,%r13 + movq %r14,%rax + movq %rdx,%r14 + + + mulq %rbp + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r15 + + mulq %rbp + addq %rax,%r12 + adcq $0,%rdx + + addq %r15,%r12 + adcq %rdx,%r13 + adcq $0,%r14 + + + xorq %r15,%r15 + movq %r8,%rax + addq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %r12,%r12 + adcq %r13,%r13 + adcq %r14,%r14 + adcq $0,%r15 + + + mulq %rax + movq %rax,%r8 + movq %xmm1,%rax + movq %rdx,%rbp + + mulq %rax + addq %rbp,%r9 + adcq %rax,%r10 + movq 
%xmm2,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq %rax + addq %rbp,%r11 + adcq %rax,%r12 + movq %xmm3,%rax + adcq $0,%rdx + movq %rdx,%rbp + + movq %r8,%rcx + imulq 32(%rsi),%r8 + + mulq %rax + addq %rbp,%r13 + adcq %rax,%r14 + movq 0(%rsi),%rax + adcq %rdx,%r15 + + + mulq %r8 + movq %r8,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r8,%r10 + sbbq $0,%rbp + + mulq %r8 + addq %rcx,%r9 + adcq $0,%rdx + addq %rax,%r9 + movq %r8,%rax + adcq %rdx,%r10 + movq %r8,%rdx + adcq $0,%rbp + + movq %r9,%rcx + imulq 32(%rsi),%r9 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r11 + movq 0(%rsi),%rax + sbbq %rdx,%r8 + + addq %rbp,%r11 + adcq $0,%r8 + + + mulq %r9 + movq %r9,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r9,%r11 + sbbq $0,%rbp + + mulq %r9 + addq %rcx,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %r9,%rax + adcq %rdx,%r11 + movq %r9,%rdx + adcq $0,%rbp + + movq %r10,%rcx + imulq 32(%rsi),%r10 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r8 + movq 0(%rsi),%rax + sbbq %rdx,%r9 + + addq %rbp,%r8 + adcq $0,%r9 + + + mulq %r10 + movq %r10,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r10,%r8 + sbbq $0,%rbp + + mulq %r10 + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %r10,%rax + adcq %rdx,%r8 + movq %r10,%rdx + adcq $0,%rbp + + movq %r11,%rcx + imulq 32(%rsi),%r11 + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r9 + movq 0(%rsi),%rax + sbbq %rdx,%r10 + + addq %rbp,%r9 + adcq $0,%r10 + + + mulq %r11 + movq %r11,%rbp + addq %rax,%rcx + movq 8(%rsi),%rax + adcq %rdx,%rcx + + subq %r11,%r9 + sbbq $0,%rbp + + mulq %r11 + addq %rcx,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + movq %r11,%rdx + adcq $0,%rbp + + shlq $32,%rax + shrq $32,%rdx + subq %rax,%r10 + sbbq %rdx,%r11 + + addq %rbp,%r10 + adcq $0,%r11 + + + xorq %rdx,%rdx + addq %r12,%r8 + adcq %r13,%r9 + movq %r8,%r12 + adcq %r14,%r10 + adcq %r15,%r11 + movq %r9,%rax + adcq $0,%rdx + + + subq 0(%rsi),%r8 + movq %r10,%r14 
+ sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq %r11,%r15 + sbbq 24(%rsi),%r11 + sbbq $0,%rdx + + cmovcq %r12,%r8 + cmovncq %r9,%rax + cmovncq %r10,%r14 + cmovncq %r11,%r15 + + decq %rbx + jnz .Loop_ord_sqr + + movq %r8,0(%rdi) + movq %rax,8(%rdi) + pxor %xmm1,%xmm1 + movq %r14,16(%rdi) + pxor %xmm2,%xmm2 + movq %r15,24(%rdi) + pxor %xmm3,%xmm3 + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_sqr_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_ord_sqr_mont_nohw,.-ecp_nistz256_ord_sqr_mont_nohw + +.globl ecp_nistz256_ord_mul_mont_adx +.hidden ecp_nistz256_ord_mul_mont_adx +.type ecp_nistz256_ord_mul_mont_adx,@function +.align 32 +ecp_nistz256_ord_mul_mont_adx: +.cfi_startproc +.Lecp_nistz256_ord_mul_mont_adx: +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lord_mulx_body: + + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r9 + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + leaq -128(%rsi),%rsi + leaq .Lord-128(%rip),%r14 + movq .LordK(%rip),%r15 + + + mulxq %r9,%r8,%r9 + mulxq %r10,%rcx,%r10 + mulxq %r11,%rbp,%r11 + addq %rcx,%r9 + mulxq %r12,%rcx,%r12 + movq %r8,%rdx + mulxq %r15,%rdx,%rax + adcq %rbp,%r10 + adcq %rcx,%r11 + adcq $0,%r12 + + + xorq %r13,%r13 + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 
24+128(%r14),%rcx,%rbp + movq 8(%rbx),%rdx + adcxq %rcx,%r11 + adoxq %rbp,%r12 + adcxq %r8,%r12 + adoxq %r8,%r13 + adcq $0,%r13 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r9,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + adcxq %r8,%r13 + adoxq %r8,%r8 + adcq $0,%r8 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%r14),%rcx,%rbp + movq 16(%rbx),%rdx + adcxq %rcx,%r12 + adoxq %rbp,%r13 + adcxq %r9,%r13 + adoxq %r9,%r8 + adcq $0,%r8 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r10,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + adcxq %r9,%r8 + adoxq %r9,%r9 + adcq $0,%r9 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 8+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%r14),%rcx,%rbp + movq 24(%rbx),%rdx + adcxq %rcx,%r13 + adoxq %rbp,%r8 + adcxq %r10,%r8 + adoxq %r10,%r9 + adcq $0,%r9 + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r11,%rdx + mulxq %r15,%rdx,%rax + adcxq %rcx,%r8 + adoxq %rbp,%r9 + + adcxq %r10,%r9 + adoxq %r10,%r10 + adcq $0,%r10 + + + mulxq 0+128(%r14),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 
8+128(%r14),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%r14),%rcx,%rbp + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + mulxq 24+128(%r14),%rcx,%rbp + leaq 128(%r14),%r14 + movq %r12,%rbx + adcxq %rcx,%r8 + adoxq %rbp,%r9 + movq %r13,%rdx + adcxq %r11,%r9 + adoxq %r11,%r10 + adcq $0,%r10 + + + + movq %r8,%rcx + subq 0(%r14),%r12 + sbbq 8(%r14),%r13 + sbbq 16(%r14),%r8 + movq %r9,%rbp + sbbq 24(%r14),%r9 + sbbq $0,%r10 + + cmovcq %rbx,%r12 + cmovcq %rdx,%r13 + cmovcq %rcx,%r8 + cmovcq %rbp,%r9 + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_mulx_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_ord_mul_mont_adx,.-ecp_nistz256_ord_mul_mont_adx + +.globl ecp_nistz256_ord_sqr_mont_adx +.hidden ecp_nistz256_ord_sqr_mont_adx +.type ecp_nistz256_ord_sqr_mont_adx,@function +.align 32 +ecp_nistz256_ord_sqr_mont_adx: +.cfi_startproc +_CET_ENDBR +.Lecp_nistz256_ord_sqr_mont_adx: + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lord_sqrx_body: + + movq %rdx,%rbx + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + leaq .Lord(%rip),%rsi + jmp .Loop_ord_sqrx + +.align 32 +.Loop_ord_sqrx: + mulxq %r14,%r9,%r10 + mulxq %r15,%rcx,%r11 + movq %rdx,%rax + movq %r14,%xmm1 + mulxq %r8,%rbp,%r12 + movq %r14,%rdx + addq %rcx,%r10 + movq %r15,%xmm2 + adcq %rbp,%r11 + adcq $0,%r12 + xorq %r13,%r13 + + mulxq 
%r15,%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq %r8,%rcx,%rbp + movq %r15,%rdx + adcxq %rcx,%r12 + adoxq %rbp,%r13 + adcq $0,%r13 + + mulxq %r8,%rcx,%r14 + movq %rax,%rdx + movq %r8,%xmm3 + xorq %r15,%r15 + adcxq %r9,%r9 + adoxq %rcx,%r13 + adcxq %r10,%r10 + adoxq %r15,%r14 + + + mulxq %rdx,%r8,%rbp + movq %xmm1,%rdx + adcxq %r11,%r11 + adoxq %rbp,%r9 + adcxq %r12,%r12 + mulxq %rdx,%rcx,%rax + movq %xmm2,%rdx + adcxq %r13,%r13 + adoxq %rcx,%r10 + adcxq %r14,%r14 + mulxq %rdx,%rcx,%rbp +.byte 0x67 + movq %xmm3,%rdx + adoxq %rax,%r11 + adcxq %r15,%r15 + adoxq %rcx,%r12 + adoxq %rbp,%r13 + mulxq %rdx,%rcx,%rax + adoxq %rcx,%r14 + adoxq %rax,%r15 + + + movq %r8,%rdx + mulxq 32(%rsi),%rdx,%rcx + + xorq %rax,%rax + mulxq 0(%rsi),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + mulxq 8(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + mulxq 16(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + mulxq 24(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r8 + adcxq %rax,%r8 + + + movq %r9,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adoxq %rcx,%r9 + adcxq %rbp,%r10 + mulxq 8(%rsi),%rcx,%rbp + adoxq %rcx,%r10 + adcxq %rbp,%r11 + mulxq 16(%rsi),%rcx,%rbp + adoxq %rcx,%r11 + adcxq %rbp,%r8 + mulxq 24(%rsi),%rcx,%rbp + adoxq %rcx,%r8 + adcxq %rbp,%r9 + adoxq %rax,%r9 + + + movq %r10,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + mulxq 8(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r8 + mulxq 16(%rsi),%rcx,%rbp + adcxq %rcx,%r8 + adoxq %rbp,%r9 + mulxq 24(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + adcxq %rax,%r10 + + + movq %r11,%rdx + mulxq 32(%rsi),%rdx,%rcx + + mulxq 0(%rsi),%rcx,%rbp + adoxq %rcx,%r11 + adcxq %rbp,%r8 + mulxq 8(%rsi),%rcx,%rbp + adoxq %rcx,%r8 + adcxq %rbp,%r9 + mulxq 16(%rsi),%rcx,%rbp + adoxq %rcx,%r9 + adcxq %rbp,%r10 + mulxq 24(%rsi),%rcx,%rbp + adoxq %rcx,%r10 + adcxq %rbp,%r11 + adoxq %rax,%r11 + + + addq %r8,%r12 + adcq %r13,%r9 + movq %r12,%rdx + adcq 
%r14,%r10 + adcq %r15,%r11 + movq %r9,%r14 + adcq $0,%rax + + + subq 0(%rsi),%r12 + movq %r10,%r15 + sbbq 8(%rsi),%r9 + sbbq 16(%rsi),%r10 + movq %r11,%r8 + sbbq 24(%rsi),%r11 + sbbq $0,%rax + + cmovncq %r12,%rdx + cmovncq %r9,%r14 + cmovncq %r10,%r15 + cmovncq %r11,%r8 + + decq %rbx + jnz .Loop_ord_sqrx + + movq %rdx,0(%rdi) + movq %r14,8(%rdi) + pxor %xmm1,%xmm1 + movq %r15,16(%rdi) + pxor %xmm2,%xmm2 + movq %r8,24(%rdi) + pxor %xmm3,%xmm3 + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lord_sqrx_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_ord_sqr_mont_adx,.-ecp_nistz256_ord_sqr_mont_adx + + + + + + +.globl ecp_nistz256_mul_mont_nohw +.hidden ecp_nistz256_mul_mont_nohw +.type ecp_nistz256_mul_mont_nohw,@function +.align 32 +ecp_nistz256_mul_mont_nohw: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lmul_body: + movq %rdx,%rbx + movq 0(%rdx),%rax + movq 0(%rsi),%r9 + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + + call __ecp_nistz256_mul_montq + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lmul_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_mul_mont_nohw,.-ecp_nistz256_mul_mont_nohw + +.type 
__ecp_nistz256_mul_montq,@function +.align 32 +__ecp_nistz256_mul_montq: +.cfi_startproc + + + movq %rax,%rbp + mulq %r9 + movq .Lpoly+8(%rip),%r14 + movq %rax,%r8 + movq %rbp,%rax + movq %rdx,%r9 + + mulq %r10 + movq .Lpoly+24(%rip),%r15 + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %r11 + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r12 + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + xorq %r13,%r13 + movq %rdx,%r12 + + + + + + + + + + + movq %r8,%rbp + shlq $32,%r8 + mulq %r15 + shrq $32,%rbp + addq %r8,%r9 + adcq %rbp,%r10 + adcq %rax,%r11 + movq 8(%rbx),%rax + adcq %rdx,%r12 + adcq $0,%r13 + xorq %r8,%r8 + + + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r9 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rcx,%r10 + adcq $0,%rdx + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 16(%rsi) + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 24(%rsi) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %r9,%rax + adcq %rdx,%r13 + adcq $0,%r8 + + + + movq %r9,%rbp + shlq $32,%r9 + mulq %r15 + shrq $32,%rbp + addq %r9,%r10 + adcq %rbp,%r11 + adcq %rax,%r12 + movq 16(%rbx),%rax + adcq %rdx,%r13 + adcq $0,%r8 + xorq %r9,%r9 + + + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r10 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rcx,%r11 + adcq $0,%rdx + addq %rax,%r11 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 16(%rsi) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 24(%rsi) + addq %rcx,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %r10,%rax + adcq %rdx,%r8 + adcq $0,%r9 + + + + movq %r10,%rbp + shlq $32,%r10 + mulq %r15 + shrq $32,%rbp + addq %r10,%r11 + adcq %rbp,%r12 + adcq %rax,%r13 + movq 24(%rbx),%rax + adcq %rdx,%r8 + adcq $0,%r9 + xorq %r10,%r10 + + + + movq %rax,%rbp + mulq 0(%rsi) + addq %rax,%r11 + 
movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 8(%rsi) + addq %rcx,%r12 + adcq $0,%rdx + addq %rax,%r12 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 16(%rsi) + addq %rcx,%r13 + adcq $0,%rdx + addq %rax,%r13 + movq %rbp,%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq 24(%rsi) + addq %rcx,%r8 + adcq $0,%rdx + addq %rax,%r8 + movq %r11,%rax + adcq %rdx,%r9 + adcq $0,%r10 + + + + movq %r11,%rbp + shlq $32,%r11 + mulq %r15 + shrq $32,%rbp + addq %r11,%r12 + adcq %rbp,%r13 + movq %r12,%rcx + adcq %rax,%r8 + adcq %rdx,%r9 + movq %r13,%rbp + adcq $0,%r10 + + + + subq $-1,%r12 + movq %r8,%rbx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%rdx + sbbq %r15,%r9 + sbbq $0,%r10 + + cmovcq %rcx,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rbx,%r8 + movq %r13,8(%rdi) + cmovcq %rdx,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret +.cfi_endproc +.size __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq + + + + + + + + +.globl ecp_nistz256_sqr_mont_nohw +.hidden ecp_nistz256_sqr_mont_nohw +.type ecp_nistz256_sqr_mont_nohw,@function +.align 32 +ecp_nistz256_sqr_mont_nohw: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lsqr_body: + movq 0(%rsi),%rax + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + + call __ecp_nistz256_sqr_montq + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lsqr_epilogue: + ret +.cfi_endproc +.size 
ecp_nistz256_sqr_mont_nohw,.-ecp_nistz256_sqr_mont_nohw + +.type __ecp_nistz256_sqr_montq,@function +.align 32 +__ecp_nistz256_sqr_montq: +.cfi_startproc + movq %rax,%r13 + mulq %r14 + movq %rax,%r9 + movq %r15,%rax + movq %rdx,%r10 + + mulq %r13 + addq %rax,%r10 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %r13 + addq %rax,%r11 + movq %r15,%rax + adcq $0,%rdx + movq %rdx,%r12 + + + mulq %r14 + addq %rax,%r11 + movq %r8,%rax + adcq $0,%rdx + movq %rdx,%rbp + + mulq %r14 + addq %rax,%r12 + movq %r8,%rax + adcq $0,%rdx + addq %rbp,%r12 + movq %rdx,%r13 + adcq $0,%r13 + + + mulq %r15 + xorq %r15,%r15 + addq %rax,%r13 + movq 0(%rsi),%rax + movq %rdx,%r14 + adcq $0,%r14 + + addq %r9,%r9 + adcq %r10,%r10 + adcq %r11,%r11 + adcq %r12,%r12 + adcq %r13,%r13 + adcq %r14,%r14 + adcq $0,%r15 + + mulq %rax + movq %rax,%r8 + movq 8(%rsi),%rax + movq %rdx,%rcx + + mulq %rax + addq %rcx,%r9 + adcq %rax,%r10 + movq 16(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq %rax + addq %rcx,%r11 + adcq %rax,%r12 + movq 24(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rcx + + mulq %rax + addq %rcx,%r13 + adcq %rax,%r14 + movq %r8,%rax + adcq %rdx,%r15 + + movq .Lpoly+8(%rip),%rsi + movq .Lpoly+24(%rip),%rbp + + + + + movq %r8,%rcx + shlq $32,%r8 + mulq %rbp + shrq $32,%rcx + addq %r8,%r9 + adcq %rcx,%r10 + adcq %rax,%r11 + movq %r9,%rax + adcq $0,%rdx + + + + movq %r9,%rcx + shlq $32,%r9 + movq %rdx,%r8 + mulq %rbp + shrq $32,%rcx + addq %r9,%r10 + adcq %rcx,%r11 + adcq %rax,%r8 + movq %r10,%rax + adcq $0,%rdx + + + + movq %r10,%rcx + shlq $32,%r10 + movq %rdx,%r9 + mulq %rbp + shrq $32,%rcx + addq %r10,%r11 + adcq %rcx,%r8 + adcq %rax,%r9 + movq %r11,%rax + adcq $0,%rdx + + + + movq %r11,%rcx + shlq $32,%r11 + movq %rdx,%r10 + mulq %rbp + shrq $32,%rcx + addq %r11,%r8 + adcq %rcx,%r9 + adcq %rax,%r10 + adcq $0,%rdx + xorq %r11,%r11 + + + + addq %r8,%r12 + adcq %r9,%r13 + movq %r12,%r8 + adcq %r10,%r14 + adcq %rdx,%r15 + movq %r13,%r9 + adcq $0,%r11 + + subq $-1,%r12 + movq 
%r14,%r10 + sbbq %rsi,%r13 + sbbq $0,%r14 + movq %r15,%rcx + sbbq %rbp,%r15 + sbbq $0,%r11 + + cmovcq %r8,%r12 + cmovcq %r9,%r13 + movq %r12,0(%rdi) + cmovcq %r10,%r14 + movq %r13,8(%rdi) + cmovcq %rcx,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + + ret +.cfi_endproc +.size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq +.globl ecp_nistz256_mul_mont_adx +.hidden ecp_nistz256_mul_mont_adx +.type ecp_nistz256_mul_mont_adx,@function +.align 32 +ecp_nistz256_mul_mont_adx: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lmulx_body: + movq %rdx,%rbx + movq 0(%rdx),%rdx + movq 0(%rsi),%r9 + movq 8(%rsi),%r10 + movq 16(%rsi),%r11 + movq 24(%rsi),%r12 + leaq -128(%rsi),%rsi + + call __ecp_nistz256_mul_montx + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lmulx_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_mul_mont_adx,.-ecp_nistz256_mul_mont_adx + +.type __ecp_nistz256_mul_montx,@function +.align 32 +__ecp_nistz256_mul_montx: +.cfi_startproc + + + mulxq %r9,%r8,%r9 + mulxq %r10,%rcx,%r10 + movq $32,%r14 + xorq %r13,%r13 + mulxq %r11,%rbp,%r11 + movq .Lpoly+24(%rip),%r15 + adcq %rcx,%r9 + mulxq %r12,%rcx,%r12 + movq %r8,%rdx + adcq %rbp,%r10 + shlxq %r14,%r8,%rbp + adcq %rcx,%r11 + shrxq %r14,%r8,%rcx + adcq $0,%r12 + + + + addq %rbp,%r9 + adcq %rcx,%r10 + + mulxq %r15,%rcx,%rbp + movq 8(%rbx),%rdx + adcq %rcx,%r11 + adcq %rbp,%r12 + adcq $0,%r13 + xorq %r8,%r8 + + + + mulxq 
0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r9 + adoxq %rbp,%r10 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r9,%rdx + adcxq %rcx,%r12 + shlxq %r14,%r9,%rcx + adoxq %rbp,%r13 + shrxq %r14,%r9,%rbp + + adcxq %r8,%r13 + adoxq %r8,%r8 + adcq $0,%r8 + + + + addq %rcx,%r10 + adcq %rbp,%r11 + + mulxq %r15,%rcx,%rbp + movq 16(%rbx),%rdx + adcq %rcx,%r12 + adcq %rbp,%r13 + adcq $0,%r8 + xorq %r9,%r9 + + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r10 + adoxq %rbp,%r11 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r10,%rdx + adcxq %rcx,%r13 + shlxq %r14,%r10,%rcx + adoxq %rbp,%r8 + shrxq %r14,%r10,%rbp + + adcxq %r9,%r8 + adoxq %r9,%r9 + adcq $0,%r9 + + + + addq %rcx,%r11 + adcq %rbp,%r12 + + mulxq %r15,%rcx,%rbp + movq 24(%rbx),%rdx + adcq %rcx,%r13 + adcq %rbp,%r8 + adcq $0,%r9 + xorq %r10,%r10 + + + + mulxq 0+128(%rsi),%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq 8+128(%rsi),%rcx,%rbp + adcxq %rcx,%r12 + adoxq %rbp,%r13 + + mulxq 16+128(%rsi),%rcx,%rbp + adcxq %rcx,%r13 + adoxq %rbp,%r8 + + mulxq 24+128(%rsi),%rcx,%rbp + movq %r11,%rdx + adcxq %rcx,%r8 + shlxq %r14,%r11,%rcx + adoxq %rbp,%r9 + shrxq %r14,%r11,%rbp + + adcxq %r10,%r9 + adoxq %r10,%r10 + adcq $0,%r10 + + + + addq %rcx,%r12 + adcq %rbp,%r13 + + mulxq %r15,%rcx,%rbp + movq %r12,%rbx + movq .Lpoly+8(%rip),%r14 + adcq %rcx,%r8 + movq %r13,%rdx + adcq %rbp,%r9 + adcq $0,%r10 + + + + xorl %eax,%eax + movq %r8,%rcx + sbbq $-1,%r12 + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%rbp + sbbq %r15,%r9 + sbbq $0,%r10 + + cmovcq %rbx,%r12 + cmovcq %rdx,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %rbp,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret +.cfi_endproc +.size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx + 
+.globl ecp_nistz256_sqr_mont_adx +.hidden ecp_nistz256_sqr_mont_adx +.type ecp_nistz256_sqr_mont_adx,@function +.align 32 +ecp_nistz256_sqr_mont_adx: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 +.Lsqrx_body: + movq 0(%rsi),%rdx + movq 8(%rsi),%r14 + movq 16(%rsi),%r15 + movq 24(%rsi),%r8 + leaq -128(%rsi),%rsi + + call __ecp_nistz256_sqr_montx + + movq 0(%rsp),%r15 +.cfi_restore %r15 + movq 8(%rsp),%r14 +.cfi_restore %r14 + movq 16(%rsp),%r13 +.cfi_restore %r13 + movq 24(%rsp),%r12 +.cfi_restore %r12 + movq 32(%rsp),%rbx +.cfi_restore %rbx + movq 40(%rsp),%rbp +.cfi_restore %rbp + leaq 48(%rsp),%rsp +.cfi_adjust_cfa_offset -48 +.Lsqrx_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_sqr_mont_adx,.-ecp_nistz256_sqr_mont_adx + +.type __ecp_nistz256_sqr_montx,@function +.align 32 +__ecp_nistz256_sqr_montx: +.cfi_startproc + mulxq %r14,%r9,%r10 + mulxq %r15,%rcx,%r11 + xorl %eax,%eax + adcq %rcx,%r10 + mulxq %r8,%rbp,%r12 + movq %r14,%rdx + adcq %rbp,%r11 + adcq $0,%r12 + xorq %r13,%r13 + + + mulxq %r15,%rcx,%rbp + adcxq %rcx,%r11 + adoxq %rbp,%r12 + + mulxq %r8,%rcx,%rbp + movq %r15,%rdx + adcxq %rcx,%r12 + adoxq %rbp,%r13 + adcq $0,%r13 + + + mulxq %r8,%rcx,%r14 + movq 0+128(%rsi),%rdx + xorq %r15,%r15 + adcxq %r9,%r9 + adoxq %rcx,%r13 + adcxq %r10,%r10 + adoxq %r15,%r14 + + mulxq %rdx,%r8,%rbp + movq 8+128(%rsi),%rdx + adcxq %r11,%r11 + adoxq %rbp,%r9 + adcxq %r12,%r12 + mulxq %rdx,%rcx,%rax + movq 16+128(%rsi),%rdx + adcxq %r13,%r13 + adoxq %rcx,%r10 + adcxq %r14,%r14 +.byte 0x67 + mulxq %rdx,%rcx,%rbp + movq 24+128(%rsi),%rdx + adoxq %rax,%r11 + adcxq %r15,%r15 + adoxq %rcx,%r12 + movq $32,%rsi + adoxq %rbp,%r13 +.byte 
0x67,0x67 + mulxq %rdx,%rcx,%rax + movq .Lpoly+24(%rip),%rdx + adoxq %rcx,%r14 + shlxq %rsi,%r8,%rcx + adoxq %rax,%r15 + shrxq %rsi,%r8,%rax + movq %rdx,%rbp + + + addq %rcx,%r9 + adcq %rax,%r10 + + mulxq %r8,%rcx,%r8 + adcq %rcx,%r11 + shlxq %rsi,%r9,%rcx + adcq $0,%r8 + shrxq %rsi,%r9,%rax + + + addq %rcx,%r10 + adcq %rax,%r11 + + mulxq %r9,%rcx,%r9 + adcq %rcx,%r8 + shlxq %rsi,%r10,%rcx + adcq $0,%r9 + shrxq %rsi,%r10,%rax + + + addq %rcx,%r11 + adcq %rax,%r8 + + mulxq %r10,%rcx,%r10 + adcq %rcx,%r9 + shlxq %rsi,%r11,%rcx + adcq $0,%r10 + shrxq %rsi,%r11,%rax + + + addq %rcx,%r8 + adcq %rax,%r9 + + mulxq %r11,%rcx,%r11 + adcq %rcx,%r10 + adcq $0,%r11 + + xorq %rdx,%rdx + addq %r8,%r12 + movq .Lpoly+8(%rip),%rsi + adcq %r9,%r13 + movq %r12,%r8 + adcq %r10,%r14 + adcq %r11,%r15 + movq %r13,%r9 + adcq $0,%rdx + + subq $-1,%r12 + movq %r14,%r10 + sbbq %rsi,%r13 + sbbq $0,%r14 + movq %r15,%r11 + sbbq %rbp,%r15 + sbbq $0,%rdx + + cmovcq %r8,%r12 + cmovcq %r9,%r13 + movq %r12,0(%rdi) + cmovcq %r10,%r14 + movq %r13,8(%rdi) + cmovcq %r11,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + + ret +.cfi_endproc +.size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx + + +.globl ecp_nistz256_select_w5_nohw +.hidden ecp_nistz256_select_w5_nohw +.type ecp_nistz256_select_w5_nohw,@function +.align 32 +ecp_nistz256_select_w5_nohw: +.cfi_startproc +_CET_ENDBR + movdqa .LOne(%rip),%xmm0 + movd %edx,%xmm1 + + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + + movdqa %xmm0,%xmm8 + pshufd $0,%xmm1,%xmm1 + + movq $16,%rax +.Lselect_loop_sse_w5: + + movdqa %xmm8,%xmm15 + paddd %xmm0,%xmm8 + pcmpeqd %xmm1,%xmm15 + + movdqa 0(%rsi),%xmm9 + movdqa 16(%rsi),%xmm10 + movdqa 32(%rsi),%xmm11 + movdqa 48(%rsi),%xmm12 + movdqa 64(%rsi),%xmm13 + movdqa 80(%rsi),%xmm14 + leaq 96(%rsi),%rsi + + pand %xmm15,%xmm9 + pand %xmm15,%xmm10 + por %xmm9,%xmm2 + pand %xmm15,%xmm11 + por %xmm10,%xmm3 + pand %xmm15,%xmm12 + por %xmm11,%xmm4 + 
pand %xmm15,%xmm13 + por %xmm12,%xmm5 + pand %xmm15,%xmm14 + por %xmm13,%xmm6 + por %xmm14,%xmm7 + + decq %rax + jnz .Lselect_loop_sse_w5 + + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + movdqu %xmm4,32(%rdi) + movdqu %xmm5,48(%rdi) + movdqu %xmm6,64(%rdi) + movdqu %xmm7,80(%rdi) + ret +.cfi_endproc +.LSEH_end_ecp_nistz256_select_w5_nohw: +.size ecp_nistz256_select_w5_nohw,.-ecp_nistz256_select_w5_nohw + + + +.globl ecp_nistz256_select_w7_nohw +.hidden ecp_nistz256_select_w7_nohw +.type ecp_nistz256_select_w7_nohw,@function +.align 32 +ecp_nistz256_select_w7_nohw: +.cfi_startproc +_CET_ENDBR + movdqa .LOne(%rip),%xmm8 + movd %edx,%xmm1 + + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + + movdqa %xmm8,%xmm0 + pshufd $0,%xmm1,%xmm1 + movq $64,%rax + +.Lselect_loop_sse_w7: + movdqa %xmm8,%xmm15 + paddd %xmm0,%xmm8 + movdqa 0(%rsi),%xmm9 + movdqa 16(%rsi),%xmm10 + pcmpeqd %xmm1,%xmm15 + movdqa 32(%rsi),%xmm11 + movdqa 48(%rsi),%xmm12 + leaq 64(%rsi),%rsi + + pand %xmm15,%xmm9 + pand %xmm15,%xmm10 + por %xmm9,%xmm2 + pand %xmm15,%xmm11 + por %xmm10,%xmm3 + pand %xmm15,%xmm12 + por %xmm11,%xmm4 + prefetcht0 255(%rsi) + por %xmm12,%xmm5 + + decq %rax + jnz .Lselect_loop_sse_w7 + + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + movdqu %xmm4,32(%rdi) + movdqu %xmm5,48(%rdi) + ret +.cfi_endproc +.LSEH_end_ecp_nistz256_select_w7_nohw: +.size ecp_nistz256_select_w7_nohw,.-ecp_nistz256_select_w7_nohw + + +.globl ecp_nistz256_select_w5_avx2 +.hidden ecp_nistz256_select_w5_avx2 +.type ecp_nistz256_select_w5_avx2,@function +.align 32 +ecp_nistz256_select_w5_avx2: +.cfi_startproc +_CET_ENDBR + vzeroupper + vmovdqa .LTwo(%rip),%ymm0 + + vpxor %ymm2,%ymm2,%ymm2 + vpxor %ymm3,%ymm3,%ymm3 + vpxor %ymm4,%ymm4,%ymm4 + + vmovdqa .LOne(%rip),%ymm5 + vmovdqa .LTwo(%rip),%ymm10 + + vmovd %edx,%xmm1 + vpermd %ymm1,%ymm2,%ymm1 + + movq $8,%rax +.Lselect_loop_avx2_w5: + + vmovdqa 0(%rsi),%ymm6 + vmovdqa 32(%rsi),%ymm7 + vmovdqa 64(%rsi),%ymm8 + + vmovdqa 
96(%rsi),%ymm11 + vmovdqa 128(%rsi),%ymm12 + vmovdqa 160(%rsi),%ymm13 + + vpcmpeqd %ymm1,%ymm5,%ymm9 + vpcmpeqd %ymm1,%ymm10,%ymm14 + + vpaddd %ymm0,%ymm5,%ymm5 + vpaddd %ymm0,%ymm10,%ymm10 + leaq 192(%rsi),%rsi + + vpand %ymm9,%ymm6,%ymm6 + vpand %ymm9,%ymm7,%ymm7 + vpand %ymm9,%ymm8,%ymm8 + vpand %ymm14,%ymm11,%ymm11 + vpand %ymm14,%ymm12,%ymm12 + vpand %ymm14,%ymm13,%ymm13 + + vpxor %ymm6,%ymm2,%ymm2 + vpxor %ymm7,%ymm3,%ymm3 + vpxor %ymm8,%ymm4,%ymm4 + vpxor %ymm11,%ymm2,%ymm2 + vpxor %ymm12,%ymm3,%ymm3 + vpxor %ymm13,%ymm4,%ymm4 + + decq %rax + jnz .Lselect_loop_avx2_w5 + + vmovdqu %ymm2,0(%rdi) + vmovdqu %ymm3,32(%rdi) + vmovdqu %ymm4,64(%rdi) + vzeroupper + ret +.cfi_endproc +.LSEH_end_ecp_nistz256_select_w5_avx2: +.size ecp_nistz256_select_w5_avx2,.-ecp_nistz256_select_w5_avx2 + + + +.globl ecp_nistz256_select_w7_avx2 +.hidden ecp_nistz256_select_w7_avx2 +.type ecp_nistz256_select_w7_avx2,@function +.align 32 +ecp_nistz256_select_w7_avx2: +.cfi_startproc +_CET_ENDBR + vzeroupper + vmovdqa .LThree(%rip),%ymm0 + + vpxor %ymm2,%ymm2,%ymm2 + vpxor %ymm3,%ymm3,%ymm3 + + vmovdqa .LOne(%rip),%ymm4 + vmovdqa .LTwo(%rip),%ymm8 + vmovdqa .LThree(%rip),%ymm12 + + vmovd %edx,%xmm1 + vpermd %ymm1,%ymm2,%ymm1 + + + movq $21,%rax +.Lselect_loop_avx2_w7: + + vmovdqa 0(%rsi),%ymm5 + vmovdqa 32(%rsi),%ymm6 + + vmovdqa 64(%rsi),%ymm9 + vmovdqa 96(%rsi),%ymm10 + + vmovdqa 128(%rsi),%ymm13 + vmovdqa 160(%rsi),%ymm14 + + vpcmpeqd %ymm1,%ymm4,%ymm7 + vpcmpeqd %ymm1,%ymm8,%ymm11 + vpcmpeqd %ymm1,%ymm12,%ymm15 + + vpaddd %ymm0,%ymm4,%ymm4 + vpaddd %ymm0,%ymm8,%ymm8 + vpaddd %ymm0,%ymm12,%ymm12 + leaq 192(%rsi),%rsi + + vpand %ymm7,%ymm5,%ymm5 + vpand %ymm7,%ymm6,%ymm6 + vpand %ymm11,%ymm9,%ymm9 + vpand %ymm11,%ymm10,%ymm10 + vpand %ymm15,%ymm13,%ymm13 + vpand %ymm15,%ymm14,%ymm14 + + vpxor %ymm5,%ymm2,%ymm2 + vpxor %ymm6,%ymm3,%ymm3 + vpxor %ymm9,%ymm2,%ymm2 + vpxor %ymm10,%ymm3,%ymm3 + vpxor %ymm13,%ymm2,%ymm2 + vpxor %ymm14,%ymm3,%ymm3 + + decq %rax + jnz .Lselect_loop_avx2_w7 + 
+ + vmovdqa 0(%rsi),%ymm5 + vmovdqa 32(%rsi),%ymm6 + + vpcmpeqd %ymm1,%ymm4,%ymm7 + + vpand %ymm7,%ymm5,%ymm5 + vpand %ymm7,%ymm6,%ymm6 + + vpxor %ymm5,%ymm2,%ymm2 + vpxor %ymm6,%ymm3,%ymm3 + + vmovdqu %ymm2,0(%rdi) + vmovdqu %ymm3,32(%rdi) + vzeroupper + ret +.cfi_endproc +.LSEH_end_ecp_nistz256_select_w7_avx2: +.size ecp_nistz256_select_w7_avx2,.-ecp_nistz256_select_w7_avx2 +.type __ecp_nistz256_add_toq,@function +.align 32 +__ecp_nistz256_add_toq: +.cfi_startproc + xorq %r11,%r11 + addq 0(%rbx),%r12 + adcq 8(%rbx),%r13 + movq %r12,%rax + adcq 16(%rbx),%r8 + adcq 24(%rbx),%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret +.cfi_endproc +.size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq + +.type __ecp_nistz256_sub_fromq,@function +.align 32 +__ecp_nistz256_sub_fromq: +.cfi_startproc + subq 0(%rbx),%r12 + sbbq 8(%rbx),%r13 + movq %r12,%rax + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + movq %r13,%rbp + sbbq %r11,%r11 + + addq $-1,%r12 + movq %r8,%rcx + adcq %r14,%r13 + adcq $0,%r8 + movq %r9,%r10 + adcq %r15,%r9 + testq %r11,%r11 + + cmovzq %rax,%r12 + cmovzq %rbp,%r13 + movq %r12,0(%rdi) + cmovzq %rcx,%r8 + movq %r13,8(%rdi) + cmovzq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret +.cfi_endproc +.size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq + +.type __ecp_nistz256_subq,@function +.align 32 +__ecp_nistz256_subq: +.cfi_startproc + subq %r12,%rax + sbbq %r13,%rbp + movq %rax,%r12 + sbbq %r8,%rcx + sbbq %r9,%r10 + movq %rbp,%r13 + sbbq %r11,%r11 + + addq $-1,%rax + movq %rcx,%r8 + adcq %r14,%rbp + adcq $0,%rcx + movq %r10,%r9 + adcq %r15,%r10 + testq %r11,%r11 + + cmovnzq %rax,%r12 + cmovnzq %rbp,%r13 + cmovnzq %rcx,%r8 + cmovnzq %r10,%r9 + + ret +.cfi_endproc +.size 
__ecp_nistz256_subq,.-__ecp_nistz256_subq + +.type __ecp_nistz256_mul_by_2q,@function +.align 32 +__ecp_nistz256_mul_by_2q: +.cfi_startproc + xorq %r11,%r11 + addq %r12,%r12 + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret +.cfi_endproc +.size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q +.globl ecp_nistz256_point_double_nohw +.hidden ecp_nistz256_point_double_nohw +.type ecp_nistz256_point_double_nohw,@function +.align 32 +ecp_nistz256_point_double_nohw: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $160+8,%rsp +.cfi_adjust_cfa_offset 32*5+8 +.Lpoint_doubleq_body: + +.Lpoint_double_shortcutq: + movdqu 0(%rsi),%xmm0 + movq %rsi,%rbx + movdqu 16(%rsi),%xmm1 + movq 32+0(%rsi),%r12 + movq 32+8(%rsi),%r13 + movq 32+16(%rsi),%r8 + movq 32+24(%rsi),%r9 + movq .Lpoly+8(%rip),%r14 + movq .Lpoly+24(%rip),%r15 + movdqa %xmm0,96(%rsp) + movdqa %xmm1,96+16(%rsp) + leaq 32(%rdi),%r10 + leaq 64(%rdi),%r11 + movq %rdi,%xmm0 + movq %r10,%xmm1 + movq %r11,%xmm2 + + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_by_2q + + movq 64+0(%rsi),%rax + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + leaq 64-0(%rsi),%rsi + leaq 64(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 0+0(%rsp),%rax + movq 8+0(%rsp),%r14 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 
0(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 32(%rbx),%rax + movq 64+0(%rbx),%r9 + movq 64+8(%rbx),%r10 + movq 64+16(%rbx),%r11 + movq 64+24(%rbx),%r12 + leaq 64-0(%rbx),%rsi + leaq 32(%rbx),%rbx + movq %xmm2,%rdi + call __ecp_nistz256_mul_montq + call __ecp_nistz256_mul_by_2q + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_toq + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 0+0(%rsp),%rax + movq 8+0(%rsp),%r14 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + movq %xmm1,%rdi + call __ecp_nistz256_sqr_montq + xorq %r9,%r9 + movq %r12,%rax + addq $-1,%r12 + movq %r13,%r10 + adcq %rsi,%r13 + movq %r14,%rcx + adcq $0,%r14 + movq %r15,%r8 + adcq %rbp,%r15 + adcq $0,%r9 + xorq %rsi,%rsi + testq $1,%rax + + cmovzq %rax,%r12 + cmovzq %r10,%r13 + cmovzq %rcx,%r14 + cmovzq %r8,%r15 + cmovzq %rsi,%r9 + + movq %r13,%rax + shrq $1,%r12 + shlq $63,%rax + movq %r14,%r10 + shrq $1,%r13 + orq %rax,%r12 + shlq $63,%r10 + movq %r15,%rcx + shrq $1,%r14 + orq %r10,%r13 + shlq $63,%rcx + movq %r12,0(%rdi) + shrq $1,%r15 + movq %r13,8(%rdi) + shlq $63,%r9 + orq %rcx,%r14 + orq %r9,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + movq 64(%rsp),%rax + leaq 64(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2q + + leaq 32(%rsp),%rbx + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_toq + + movq 96(%rsp),%rax + leaq 96(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2q + 
+ movq 0+32(%rsp),%rax + movq 8+32(%rsp),%r14 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r15 + movq 24+32(%rsp),%r8 + movq %xmm0,%rdi + call __ecp_nistz256_sqr_montq + + leaq 128(%rsp),%rbx + movq %r14,%r8 + movq %r15,%r9 + movq %rsi,%r14 + movq %rbp,%r15 + call __ecp_nistz256_sub_fromq + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 0(%rsp),%rdi + call __ecp_nistz256_subq + + movq 32(%rsp),%rax + leaq 32(%rsp),%rbx + movq %r12,%r14 + xorl %ecx,%ecx + movq %r12,0+0(%rsp) + movq %r13,%r10 + movq %r13,0+8(%rsp) + cmovzq %r8,%r11 + movq %r8,0+16(%rsp) + leaq 0-0(%rsp),%rsi + cmovzq %r9,%r12 + movq %r9,0+24(%rsp) + movq %r14,%r9 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq %xmm1,%rbx + movq %xmm1,%rdi + call __ecp_nistz256_sub_fromq + + leaq 160+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpoint_doubleq_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_point_double_nohw,.-ecp_nistz256_point_double_nohw +.globl ecp_nistz256_point_add_nohw +.hidden ecp_nistz256_point_add_nohw +.type ecp_nistz256_point_add_nohw,@function +.align 32 +ecp_nistz256_point_add_nohw: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $576+8,%rsp +.cfi_adjust_cfa_offset 32*18+8 +.Lpoint_addq_body: + + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 
64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq %rsi,%rbx + movq %rdx,%rsi + movdqa %xmm0,384(%rsp) + movdqa %xmm1,384+16(%rsp) + movdqa %xmm2,416(%rsp) + movdqa %xmm3,416+16(%rsp) + movdqa %xmm4,448(%rsp) + movdqa %xmm5,448+16(%rsp) + por %xmm4,%xmm5 + + movdqu 0(%rsi),%xmm0 + pshufd $0xb1,%xmm5,%xmm3 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rsi),%xmm3 + movq 64+0(%rsi),%rax + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,480(%rsp) + pshufd $0x1e,%xmm5,%xmm4 + movdqa %xmm1,480+16(%rsp) + movdqu 64(%rsi),%xmm0 + movdqu 80(%rsi),%xmm1 + movdqa %xmm2,512(%rsp) + movdqa %xmm3,512+16(%rsp) + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm0,%xmm1 + movq %rdi,%xmm0 + + leaq 64-0(%rsi),%rsi + movq %rax,544+0(%rsp) + movq %r14,544+8(%rsp) + movq %r15,544+16(%rsp) + movq %r8,544+24(%rsp) + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + pcmpeqd %xmm4,%xmm5 + pshufd $0xb1,%xmm1,%xmm4 + por %xmm1,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $0x1e,%xmm4,%xmm3 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + movq 64+0(%rbx),%rax + movq 64+8(%rbx),%r14 + movq 64+16(%rbx),%r15 + movq 64+24(%rbx),%r8 + movq %rbx,%xmm1 + + leaq 64-0(%rbx),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 544(%rsp),%rax + leaq 544(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq 0+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 448(%rsp),%rax + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 416(%rsp),%rax + leaq 416(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq 0+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 512(%rsp),%rax + leaq 
512(%rsp),%rbx + movq 0+256(%rsp),%r9 + movq 8+256(%rsp),%r10 + leaq 0+256(%rsp),%rsi + movq 16+256(%rsp),%r11 + movq 24+256(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 224(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + orq %r13,%r12 + movdqa %xmm4,%xmm2 + orq %r8,%r12 + orq %r9,%r12 + por %xmm5,%xmm2 + movq %r12,%xmm3 + + movq 384(%rsp),%rax + leaq 384(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq 0+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 480(%rsp),%rax + leaq 480(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 160(%rsp),%rbx + leaq 0(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + orq %r13,%r12 + orq %r8,%r12 + orq %r9,%r12 + + movq %xmm2,%r8 + movq %xmm3,%r9 + orq %r8,%r12 +.byte 0x3e + jnz .Ladd_proceedq + + + + testq %r9,%r9 + jz .Ladd_doubleq + + + + + + + movq %xmm0,%rdi + pxor %xmm0,%xmm0 + movdqu %xmm0,0(%rdi) + movdqu %xmm0,16(%rdi) + movdqu %xmm0,32(%rdi) + movdqu %xmm0,48(%rdi) + movdqu %xmm0,64(%rdi) + movdqu %xmm0,80(%rdi) + jmp .Ladd_doneq + +.align 32 +.Ladd_doubleq: + movq %xmm1,%rsi + movq %xmm0,%rdi + addq $416,%rsp +.cfi_adjust_cfa_offset -416 + jmp .Lpoint_double_shortcutq +.cfi_adjust_cfa_offset 416 + +.align 32 +.Ladd_proceedq: + movq 0+64(%rsp),%rax + movq 8+64(%rsp),%r14 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 448(%rsp),%rax + leaq 448(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 0+0(%rsp),%rax + movq 8+0(%rsp),%r14 + leaq 0+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 32(%rsp),%rdi + call 
__ecp_nistz256_sqr_montq + + movq 544(%rsp),%rax + leaq 544(%rsp),%rbx + movq 0+352(%rsp),%r9 + movq 8+352(%rsp),%r10 + leaq 0+352(%rsp),%rsi + movq 16+352(%rsp),%r11 + movq 24+352(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 0(%rsp),%rax + leaq 0(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 160(%rsp),%rax + leaq 160(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montq + + + + + xorq %r11,%r11 + addq %r12,%r12 + leaq 96(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + movq 0(%rsi),%rax + cmovcq %rbp,%r13 + movq 8(%rsi),%rbp + cmovcq %rcx,%r8 + movq 16(%rsi),%rcx + cmovcq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subq + + leaq 128(%rsp),%rbx + leaq 288(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 192+0(%rsp),%rax + movq 192+8(%rsp),%rbp + movq 192+16(%rsp),%rcx + movq 192+24(%rsp),%r10 + leaq 320(%rsp),%rdi + + call __ecp_nistz256_subq + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 128(%rsp),%rax + leaq 128(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq 0+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 320(%rsp),%rax + leaq 320(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 320(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 256(%rsp),%rbx + leaq 320(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq %xmm0,%rdi + + movdqa %xmm5,%xmm0 
+ movdqa %xmm5,%xmm1 + pandn 352(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 352+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 544(%rsp),%xmm2 + pand 544+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 480(%rsp),%xmm2 + pand 480+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 320(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 320+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 512(%rsp),%xmm2 + pand 512+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + +.Ladd_doneq: + leaq 576+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpoint_addq_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_point_add_nohw,.-ecp_nistz256_point_add_nohw +.globl ecp_nistz256_point_add_affine_nohw +.hidden ecp_nistz256_point_add_affine_nohw +.type 
ecp_nistz256_point_add_affine_nohw,@function +.align 32 +ecp_nistz256_point_add_affine_nohw: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $480+8,%rsp +.cfi_adjust_cfa_offset 32*15+8 +.Ladd_affineq_body: + + movdqu 0(%rsi),%xmm0 + movq %rdx,%rbx + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq 64+0(%rsi),%rax + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,320(%rsp) + movdqa %xmm1,320+16(%rsp) + movdqa %xmm2,352(%rsp) + movdqa %xmm3,352+16(%rsp) + movdqa %xmm4,384(%rsp) + movdqa %xmm5,384+16(%rsp) + por %xmm4,%xmm5 + + movdqu 0(%rbx),%xmm0 + pshufd $0xb1,%xmm5,%xmm3 + movdqu 16(%rbx),%xmm1 + movdqu 32(%rbx),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rbx),%xmm3 + movdqa %xmm0,416(%rsp) + pshufd $0x1e,%xmm5,%xmm4 + movdqa %xmm1,416+16(%rsp) + por %xmm0,%xmm1 + movq %rdi,%xmm0 + movdqa %xmm2,448(%rsp) + movdqa %xmm3,448+16(%rsp) + por %xmm2,%xmm3 + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm1,%xmm3 + + leaq 64-0(%rsi),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + pcmpeqd %xmm4,%xmm5 + pshufd $0xb1,%xmm3,%xmm4 + movq 0(%rbx),%rax + + movq %r12,%r9 + por %xmm3,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $0x1e,%xmm4,%xmm3 + movq %r13,%r10 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + movq %r14,%r11 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + + leaq 32-0(%rsp),%rsi + movq %r15,%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 320(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 384(%rsp),%rax + leaq 384(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 
0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 384(%rsp),%rax + leaq 384(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 288(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 448(%rsp),%rax + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq 0+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 352(%rsp),%rbx + leaq 96(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 0+64(%rsp),%rax + movq 8+64(%rsp),%r14 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 128(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 0+96(%rsp),%rax + movq 8+96(%rsp),%r14 + leaq 0+96(%rsp),%rsi + movq 16+96(%rsp),%r15 + movq 24+96(%rsp),%r8 + leaq 192(%rsp),%rdi + call __ecp_nistz256_sqr_montq + + movq 128(%rsp),%rax + leaq 128(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 320(%rsp),%rax + leaq 320(%rsp),%rbx + movq 0+128(%rsp),%r9 + movq 8+128(%rsp),%r10 + leaq 0+128(%rsp),%rsi + movq 16+128(%rsp),%r11 + movq 24+128(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montq + + + + + xorq %r11,%r11 + addq %r12,%r12 + leaq 192(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + movq 0(%rsi),%rax + cmovcq %rbp,%r13 + movq 8(%rsi),%rbp + cmovcq %rcx,%r8 + movq 16(%rsi),%rcx + cmovcq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subq + + leaq 160(%rsp),%rbx + leaq 224(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 
0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 64(%rsp),%rdi + + call __ecp_nistz256_subq + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 352(%rsp),%rax + leaq 352(%rsp),%rbx + movq 0+160(%rsp),%r9 + movq 8+160(%rsp),%r10 + leaq 0+160(%rsp),%rsi + movq 16+160(%rsp),%r11 + movq 24+160(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montq + + movq 96(%rsp),%rax + leaq 96(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq 0+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 64(%rsp),%rdi + call __ecp_nistz256_mul_montq + + leaq 32(%rsp),%rbx + leaq 256(%rsp),%rdi + call __ecp_nistz256_sub_fromq + + movq %xmm0,%rdi + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand .LONE_mont(%rip),%xmm2 + pand .LONE_mont+16(%rip),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 224(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 224+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 320(%rsp),%xmm2 + pand 320+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 256(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 256+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn 
%xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 352(%rsp),%xmm2 + pand 352+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + + leaq 480+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Ladd_affineq_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_point_add_affine_nohw,.-ecp_nistz256_point_add_affine_nohw +.type __ecp_nistz256_add_tox,@function +.align 32 +__ecp_nistz256_add_tox: +.cfi_startproc + xorq %r11,%r11 + adcq 0(%rbx),%r12 + adcq 8(%rbx),%r13 + movq %r12,%rax + adcq 16(%rbx),%r8 + adcq 24(%rbx),%r9 + movq %r13,%rbp + adcq $0,%r11 + + xorq %r10,%r10 + sbbq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret +.cfi_endproc +.size __ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox + +.type __ecp_nistz256_sub_fromx,@function +.align 32 +__ecp_nistz256_sub_fromx: +.cfi_startproc + xorq %r11,%r11 + sbbq 0(%rbx),%r12 + sbbq 8(%rbx),%r13 + movq %r12,%rax + sbbq 16(%rbx),%r8 + sbbq 24(%rbx),%r9 + movq %r13,%rbp + sbbq $0,%r11 + + xorq %r10,%r10 + adcq $-1,%r12 + movq %r8,%rcx + adcq %r14,%r13 + adcq $0,%r8 + movq %r9,%r10 + adcq %r15,%r9 + + btq $0,%r11 + cmovncq %rax,%r12 + cmovncq %rbp,%r13 + movq %r12,0(%rdi) + cmovncq %rcx,%r8 + movq %r13,8(%rdi) + cmovncq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret +.cfi_endproc +.size __ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx + +.type __ecp_nistz256_subx,@function +.align 32 +__ecp_nistz256_subx: +.cfi_startproc + xorq %r11,%r11 + sbbq %r12,%rax + sbbq %r13,%rbp + movq %rax,%r12 
+ sbbq %r8,%rcx + sbbq %r9,%r10 + movq %rbp,%r13 + sbbq $0,%r11 + + xorq %r9,%r9 + adcq $-1,%rax + movq %rcx,%r8 + adcq %r14,%rbp + adcq $0,%rcx + movq %r10,%r9 + adcq %r15,%r10 + + btq $0,%r11 + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + cmovcq %rcx,%r8 + cmovcq %r10,%r9 + + ret +.cfi_endproc +.size __ecp_nistz256_subx,.-__ecp_nistz256_subx + +.type __ecp_nistz256_mul_by_2x,@function +.align 32 +__ecp_nistz256_mul_by_2x: +.cfi_startproc + xorq %r11,%r11 + adcq %r12,%r12 + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + xorq %r10,%r10 + sbbq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + cmovcq %rbp,%r13 + movq %r12,0(%rdi) + cmovcq %rcx,%r8 + movq %r13,8(%rdi) + cmovcq %r10,%r9 + movq %r8,16(%rdi) + movq %r9,24(%rdi) + + ret +.cfi_endproc +.size __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x +.globl ecp_nistz256_point_double_adx +.hidden ecp_nistz256_point_double_adx +.type ecp_nistz256_point_double_adx,@function +.align 32 +ecp_nistz256_point_double_adx: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $160+8,%rsp +.cfi_adjust_cfa_offset 32*5+8 +.Lpoint_doublex_body: + +.Lpoint_double_shortcutx: + movdqu 0(%rsi),%xmm0 + movq %rsi,%rbx + movdqu 16(%rsi),%xmm1 + movq 32+0(%rsi),%r12 + movq 32+8(%rsi),%r13 + movq 32+16(%rsi),%r8 + movq 32+24(%rsi),%r9 + movq .Lpoly+8(%rip),%r14 + movq .Lpoly+24(%rip),%r15 + movdqa %xmm0,96(%rsp) + movdqa %xmm1,96+16(%rsp) + leaq 32(%rdi),%r10 + leaq 64(%rdi),%r11 + movq %rdi,%xmm0 + movq %r10,%xmm1 + movq %r11,%xmm2 + + leaq 0(%rsp),%rdi + call 
__ecp_nistz256_mul_by_2x + + movq 64+0(%rsi),%rdx + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + leaq 64-128(%rsi),%rsi + leaq 64(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 0+0(%rsp),%rdx + movq 8+0(%rsp),%r14 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 0(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 32(%rbx),%rdx + movq 64+0(%rbx),%r9 + movq 64+8(%rbx),%r10 + movq 64+16(%rbx),%r11 + movq 64+24(%rbx),%r12 + leaq 64-128(%rbx),%rsi + leaq 32(%rbx),%rbx + movq %xmm2,%rdi + call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_by_2x + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_tox + + movq 96+0(%rsp),%r12 + movq 96+8(%rsp),%r13 + leaq 64(%rsp),%rbx + movq 96+16(%rsp),%r8 + movq 96+24(%rsp),%r9 + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 0+0(%rsp),%rdx + movq 8+0(%rsp),%r14 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + movq %xmm1,%rdi + call __ecp_nistz256_sqr_montx + xorq %r9,%r9 + movq %r12,%rax + addq $-1,%r12 + movq %r13,%r10 + adcq %rsi,%r13 + movq %r14,%rcx + adcq $0,%r14 + movq %r15,%r8 + adcq %rbp,%r15 + adcq $0,%r9 + xorq %rsi,%rsi + testq $1,%rax + + cmovzq %rax,%r12 + cmovzq %r10,%r13 + cmovzq %rcx,%r14 + cmovzq %r8,%r15 + cmovzq %rsi,%r9 + + movq %r13,%rax + shrq $1,%r12 + shlq $63,%rax + movq %r14,%r10 + shrq $1,%r13 + orq %rax,%r12 + shlq $63,%r10 + movq %r15,%rcx + shrq $1,%r14 + orq %r10,%r13 + shlq $63,%rcx + movq %r12,0(%rdi) + shrq $1,%r15 + movq %r13,8(%rdi) + shlq $63,%r9 + orq %rcx,%r14 + orq %r9,%r15 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + movq 64(%rsp),%rdx + leaq 64(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2x + + leaq 
32(%rsp),%rbx + leaq 32(%rsp),%rdi + call __ecp_nistz256_add_tox + + movq 96(%rsp),%rdx + leaq 96(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_by_2x + + movq 0+32(%rsp),%rdx + movq 8+32(%rsp),%r14 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r15 + movq 24+32(%rsp),%r8 + movq %xmm0,%rdi + call __ecp_nistz256_sqr_montx + + leaq 128(%rsp),%rbx + movq %r14,%r8 + movq %r15,%r9 + movq %rsi,%r14 + movq %rbp,%r15 + call __ecp_nistz256_sub_fromx + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 0(%rsp),%rdi + call __ecp_nistz256_subx + + movq 32(%rsp),%rdx + leaq 32(%rsp),%rbx + movq %r12,%r14 + xorl %ecx,%ecx + movq %r12,0+0(%rsp) + movq %r13,%r10 + movq %r13,0+8(%rsp) + cmovzq %r8,%r11 + movq %r8,0+16(%rsp) + leaq 0-128(%rsp),%rsi + cmovzq %r9,%r12 + movq %r9,0+24(%rsp) + movq %r14,%r9 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq %xmm1,%rbx + movq %xmm1,%rdi + call __ecp_nistz256_sub_fromx + + leaq 160+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpoint_doublex_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_point_double_adx,.-ecp_nistz256_point_double_adx +.globl ecp_nistz256_point_add_adx +.hidden ecp_nistz256_point_add_adx +.type ecp_nistz256_point_add_adx,@function +.align 32 +ecp_nistz256_point_add_adx: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 
+.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $576+8,%rsp +.cfi_adjust_cfa_offset 32*18+8 +.Lpoint_addx_body: + + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq %rsi,%rbx + movq %rdx,%rsi + movdqa %xmm0,384(%rsp) + movdqa %xmm1,384+16(%rsp) + movdqa %xmm2,416(%rsp) + movdqa %xmm3,416+16(%rsp) + movdqa %xmm4,448(%rsp) + movdqa %xmm5,448+16(%rsp) + por %xmm4,%xmm5 + + movdqu 0(%rsi),%xmm0 + pshufd $0xb1,%xmm5,%xmm3 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rsi),%xmm3 + movq 64+0(%rsi),%rdx + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,480(%rsp) + pshufd $0x1e,%xmm5,%xmm4 + movdqa %xmm1,480+16(%rsp) + movdqu 64(%rsi),%xmm0 + movdqu 80(%rsi),%xmm1 + movdqa %xmm2,512(%rsp) + movdqa %xmm3,512+16(%rsp) + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm0,%xmm1 + movq %rdi,%xmm0 + + leaq 64-128(%rsi),%rsi + movq %rdx,544+0(%rsp) + movq %r14,544+8(%rsp) + movq %r15,544+16(%rsp) + movq %r8,544+24(%rsp) + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + pcmpeqd %xmm4,%xmm5 + pshufd $0xb1,%xmm1,%xmm4 + por %xmm1,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $0x1e,%xmm4,%xmm3 + por %xmm3,%xmm4 + pxor %xmm3,%xmm3 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + movq 64+0(%rbx),%rdx + movq 64+8(%rbx),%r14 + movq 64+16(%rbx),%r15 + movq 64+24(%rbx),%r8 + movq %rbx,%xmm1 + + leaq 64-128(%rbx),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 544(%rsp),%rdx + leaq 544(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq -128+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 448(%rsp),%rdx + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 
16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 416(%rsp),%rdx + leaq 416(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq -128+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 224(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 512(%rsp),%rdx + leaq 512(%rsp),%rbx + movq 0+256(%rsp),%r9 + movq 8+256(%rsp),%r10 + leaq -128+256(%rsp),%rsi + movq 16+256(%rsp),%r11 + movq 24+256(%rsp),%r12 + leaq 256(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 224(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + orq %r13,%r12 + movdqa %xmm4,%xmm2 + orq %r8,%r12 + orq %r9,%r12 + por %xmm5,%xmm2 + movq %r12,%xmm3 + + movq 384(%rsp),%rdx + leaq 384(%rsp),%rbx + movq 0+96(%rsp),%r9 + movq 8+96(%rsp),%r10 + leaq -128+96(%rsp),%rsi + movq 16+96(%rsp),%r11 + movq 24+96(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 480(%rsp),%rdx + leaq 480(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 160(%rsp),%rbx + leaq 0(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + orq %r13,%r12 + orq %r8,%r12 + orq %r9,%r12 + + movq %xmm2,%r8 + movq %xmm3,%r9 + orq %r8,%r12 +.byte 0x3e + jnz .Ladd_proceedx + + + + testq %r9,%r9 + jz .Ladd_doublex + + + + + + + movq %xmm0,%rdi + pxor %xmm0,%xmm0 + movdqu %xmm0,0(%rdi) + movdqu %xmm0,16(%rdi) + movdqu %xmm0,32(%rdi) + movdqu %xmm0,48(%rdi) + movdqu %xmm0,64(%rdi) + movdqu %xmm0,80(%rdi) + jmp .Ladd_donex + +.align 32 +.Ladd_doublex: + movq %xmm1,%rsi + movq %xmm0,%rdi + addq $416,%rsp +.cfi_adjust_cfa_offset -416 + jmp .Lpoint_double_shortcutx +.cfi_adjust_cfa_offset 416 + +.align 32 +.Ladd_proceedx: + movq 0+64(%rsp),%rdx + movq 8+64(%rsp),%r14 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 96(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + 
movq 448(%rsp),%rdx + leaq 448(%rsp),%rbx + movq 0+0(%rsp),%r9 + movq 8+0(%rsp),%r10 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r11 + movq 24+0(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 0+0(%rsp),%rdx + movq 8+0(%rsp),%r14 + leaq -128+0(%rsp),%rsi + movq 16+0(%rsp),%r15 + movq 24+0(%rsp),%r8 + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 544(%rsp),%rdx + leaq 544(%rsp),%rbx + movq 0+352(%rsp),%r9 + movq 8+352(%rsp),%r10 + leaq -128+352(%rsp),%rsi + movq 16+352(%rsp),%r11 + movq 24+352(%rsp),%r12 + leaq 352(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 0(%rsp),%rdx + leaq 0(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 128(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 160(%rsp),%rdx + leaq 160(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 192(%rsp),%rdi + call __ecp_nistz256_mul_montx + + + + + xorq %r11,%r11 + addq %r12,%r12 + leaq 96(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + movq 0(%rsi),%rax + cmovcq %rbp,%r13 + movq 8(%rsi),%rbp + cmovcq %rcx,%r8 + movq 16(%rsi),%rcx + cmovcq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subx + + leaq 128(%rsp),%rbx + leaq 288(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 192+0(%rsp),%rax + movq 192+8(%rsp),%rbp + movq 192+16(%rsp),%rcx + movq 192+24(%rsp),%r10 + leaq 320(%rsp),%rdi + + call __ecp_nistz256_subx + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 128(%rsp),%rdx + leaq 128(%rsp),%rbx + movq 0+224(%rsp),%r9 + movq 8+224(%rsp),%r10 + leaq -128+224(%rsp),%rsi + movq 16+224(%rsp),%r11 + movq 24+224(%rsp),%r12 + leaq 256(%rsp),%rdi + 
call __ecp_nistz256_mul_montx + + movq 320(%rsp),%rdx + leaq 320(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 320(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 256(%rsp),%rbx + leaq 320(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq %xmm0,%rdi + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 352(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 352+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 544(%rsp),%xmm2 + pand 544+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 480(%rsp),%xmm2 + pand 480+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 320(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 320+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 512(%rsp),%xmm2 + pand 512+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + +.Ladd_donex: + leaq 576+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 
+.cfi_restore %r12 + movq -16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpoint_addx_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_point_add_adx,.-ecp_nistz256_point_add_adx +.globl ecp_nistz256_point_add_affine_adx +.hidden ecp_nistz256_point_add_affine_adx +.type ecp_nistz256_point_add_affine_adx,@function +.align 32 +ecp_nistz256_point_add_affine_adx: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + subq $480+8,%rsp +.cfi_adjust_cfa_offset 32*15+8 +.Ladd_affinex_body: + + movdqu 0(%rsi),%xmm0 + movq %rdx,%rbx + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm3 + movdqu 64(%rsi),%xmm4 + movdqu 80(%rsi),%xmm5 + movq 64+0(%rsi),%rdx + movq 64+8(%rsi),%r14 + movq 64+16(%rsi),%r15 + movq 64+24(%rsi),%r8 + movdqa %xmm0,320(%rsp) + movdqa %xmm1,320+16(%rsp) + movdqa %xmm2,352(%rsp) + movdqa %xmm3,352+16(%rsp) + movdqa %xmm4,384(%rsp) + movdqa %xmm5,384+16(%rsp) + por %xmm4,%xmm5 + + movdqu 0(%rbx),%xmm0 + pshufd $0xb1,%xmm5,%xmm3 + movdqu 16(%rbx),%xmm1 + movdqu 32(%rbx),%xmm2 + por %xmm3,%xmm5 + movdqu 48(%rbx),%xmm3 + movdqa %xmm0,416(%rsp) + pshufd $0x1e,%xmm5,%xmm4 + movdqa %xmm1,416+16(%rsp) + por %xmm0,%xmm1 + movq %rdi,%xmm0 + movdqa %xmm2,448(%rsp) + movdqa %xmm3,448+16(%rsp) + por %xmm2,%xmm3 + por %xmm4,%xmm5 + pxor %xmm4,%xmm4 + por %xmm1,%xmm3 + + leaq 64-128(%rsi),%rsi + leaq 32(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + pcmpeqd %xmm4,%xmm5 + pshufd $0xb1,%xmm3,%xmm4 + movq 0(%rbx),%rdx + + movq %r12,%r9 + por %xmm3,%xmm4 + pshufd $0,%xmm5,%xmm5 + pshufd $0x1e,%xmm4,%xmm3 + movq %r13,%r10 + por %xmm3,%xmm4 + pxor 
%xmm3,%xmm3 + movq %r14,%r11 + pcmpeqd %xmm3,%xmm4 + pshufd $0,%xmm4,%xmm4 + + leaq 32-128(%rsp),%rsi + movq %r15,%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 320(%rsp),%rbx + leaq 64(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 384(%rsp),%rdx + leaq 384(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 384(%rsp),%rdx + leaq 384(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 288(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 448(%rsp),%rdx + leaq 448(%rsp),%rbx + movq 0+32(%rsp),%r9 + movq 8+32(%rsp),%r10 + leaq -128+32(%rsp),%rsi + movq 16+32(%rsp),%r11 + movq 24+32(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 352(%rsp),%rbx + leaq 96(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 0+64(%rsp),%rdx + movq 8+64(%rsp),%r14 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r15 + movq 24+64(%rsp),%r8 + leaq 128(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 0+96(%rsp),%rdx + movq 8+96(%rsp),%r14 + leaq -128+96(%rsp),%rsi + movq 16+96(%rsp),%r15 + movq 24+96(%rsp),%r8 + leaq 192(%rsp),%rdi + call __ecp_nistz256_sqr_montx + + movq 128(%rsp),%rdx + leaq 128(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 160(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 320(%rsp),%rdx + leaq 320(%rsp),%rbx + movq 0+128(%rsp),%r9 + movq 8+128(%rsp),%r10 + leaq -128+128(%rsp),%rsi + movq 16+128(%rsp),%r11 + movq 24+128(%rsp),%r12 + leaq 0(%rsp),%rdi + call __ecp_nistz256_mul_montx + + + + + xorq %r11,%r11 + addq %r12,%r12 + leaq 192(%rsp),%rsi + adcq %r13,%r13 + movq %r12,%rax + adcq %r8,%r8 + adcq %r9,%r9 + movq %r13,%rbp + adcq $0,%r11 + + subq $-1,%r12 + movq %r8,%rcx + sbbq %r14,%r13 + sbbq $0,%r8 + 
movq %r9,%r10 + sbbq %r15,%r9 + sbbq $0,%r11 + + cmovcq %rax,%r12 + movq 0(%rsi),%rax + cmovcq %rbp,%r13 + movq 8(%rsi),%rbp + cmovcq %rcx,%r8 + movq 16(%rsi),%rcx + cmovcq %r10,%r9 + movq 24(%rsi),%r10 + + call __ecp_nistz256_subx + + leaq 160(%rsp),%rbx + leaq 224(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq 0+0(%rsp),%rax + movq 0+8(%rsp),%rbp + movq 0+16(%rsp),%rcx + movq 0+24(%rsp),%r10 + leaq 64(%rsp),%rdi + + call __ecp_nistz256_subx + + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r8,16(%rdi) + movq %r9,24(%rdi) + movq 352(%rsp),%rdx + leaq 352(%rsp),%rbx + movq 0+160(%rsp),%r9 + movq 8+160(%rsp),%r10 + leaq -128+160(%rsp),%rsi + movq 16+160(%rsp),%r11 + movq 24+160(%rsp),%r12 + leaq 32(%rsp),%rdi + call __ecp_nistz256_mul_montx + + movq 96(%rsp),%rdx + leaq 96(%rsp),%rbx + movq 0+64(%rsp),%r9 + movq 8+64(%rsp),%r10 + leaq -128+64(%rsp),%rsi + movq 16+64(%rsp),%r11 + movq 24+64(%rsp),%r12 + leaq 64(%rsp),%rdi + call __ecp_nistz256_mul_montx + + leaq 32(%rsp),%rbx + leaq 256(%rsp),%rdi + call __ecp_nistz256_sub_fromx + + movq %xmm0,%rdi + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 288(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 288+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand .LONE_mont(%rip),%xmm2 + pand .LONE_mont+16(%rip),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 384(%rsp),%xmm2 + pand 384+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,64(%rdi) + movdqu %xmm3,80(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 224(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 224+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 416(%rsp),%xmm2 + pand 416+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 320(%rsp),%xmm2 + pand 320+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + 
movdqu %xmm2,0(%rdi) + movdqu %xmm3,16(%rdi) + + movdqa %xmm5,%xmm0 + movdqa %xmm5,%xmm1 + pandn 256(%rsp),%xmm0 + movdqa %xmm5,%xmm2 + pandn 256+16(%rsp),%xmm1 + movdqa %xmm5,%xmm3 + pand 448(%rsp),%xmm2 + pand 448+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + + movdqa %xmm4,%xmm0 + movdqa %xmm4,%xmm1 + pandn %xmm2,%xmm0 + movdqa %xmm4,%xmm2 + pandn %xmm3,%xmm1 + movdqa %xmm4,%xmm3 + pand 352(%rsp),%xmm2 + pand 352+16(%rsp),%xmm3 + por %xmm0,%xmm2 + por %xmm1,%xmm3 + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + + leaq 480+56(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbx +.cfi_restore %rbx + movq -8(%rsi),%rbp +.cfi_restore %rbp + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Ladd_affinex_epilogue: + ret +.cfi_endproc +.size ecp_nistz256_point_add_affine_adx,.-ecp_nistz256_point_add_affine_adx +#endif diff --git a/third_party/boringssl/gen/bcm/p256-x86_64-asm-win.asm b/third_party/boringssl/gen/bcm/p256-x86_64-asm-win.asm new file mode 100644 index 00000000..729e8ece --- /dev/null +++ b/third_party/boringssl/gen/bcm/p256-x86_64-asm-win.asm @@ -0,0 +1,5073 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_internal_x86_64_win_asm.inc" +%endif +section .text code align=64 + + + +section .rdata rdata align=8 +ALIGN 64 +$L$poly: + DQ 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001 + +$L$One: + DD 1,1,1,1,1,1,1,1 +$L$Two: + DD 2,2,2,2,2,2,2,2 +$L$Three: + DD 3,3,3,3,3,3,3,3 +$L$ONE_mont: + DQ 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe + + +$L$ord: + DQ 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000 +$L$ordK: + DQ 0xccd1c8aaee00bc4f +section .text + + + + +global ecp_nistz256_neg + +ALIGN 32 +ecp_nistz256_neg: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_neg: + mov rdi,rcx + mov rsi,rdx + + + +_CET_ENDBR + push r12 + + push r13 + +$L$neg_body: + + xor r8,r8 + xor r9,r9 + xor r10,r10 + xor r11,r11 + xor r13,r13 + + sub r8,QWORD[rsi] + sbb r9,QWORD[8+rsi] + sbb r10,QWORD[16+rsi] + mov rax,r8 + sbb r11,QWORD[24+rsi] + lea rsi,[$L$poly] + mov rdx,r9 + sbb r13,0 + + add r8,QWORD[rsi] + mov rcx,r10 + adc r9,QWORD[8+rsi] + adc r10,QWORD[16+rsi] + mov r12,r11 + adc r11,QWORD[24+rsi] + test r13,r13 + + cmovz r8,rax + cmovz r9,rdx + mov QWORD[rdi],r8 + cmovz r10,rcx + mov QWORD[8+rdi],r9 + cmovz r11,r12 + mov QWORD[16+rdi],r10 + mov QWORD[24+rdi],r11 + + mov r13,QWORD[rsp] + + mov r12,QWORD[8+rsp] + + lea rsp,[16+rsp] + +$L$neg_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_neg: + + + + + + +global ecp_nistz256_ord_mul_mont_nohw + +ALIGN 32 +ecp_nistz256_ord_mul_mont_nohw: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_ord_mul_mont_nohw: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push 
r13 + + push r14 + + push r15 + +$L$ord_mul_body: + + mov rax,QWORD[rdx] + mov rbx,rdx + lea r14,[$L$ord] + mov r15,QWORD[$L$ordK] + + + mov rcx,rax + mul QWORD[rsi] + mov r8,rax + mov rax,rcx + mov r9,rdx + + mul QWORD[8+rsi] + add r9,rax + mov rax,rcx + adc rdx,0 + mov r10,rdx + + mul QWORD[16+rsi] + add r10,rax + mov rax,rcx + adc rdx,0 + + mov r13,r8 + imul r8,r15 + + mov r11,rdx + mul QWORD[24+rsi] + add r11,rax + mov rax,r8 + adc rdx,0 + mov r12,rdx + + + mul QWORD[r14] + mov rbp,r8 + add r13,rax + mov rax,r8 + adc rdx,0 + mov rcx,rdx + + sub r10,r8 + sbb r8,0 + + mul QWORD[8+r14] + add r9,rcx + adc rdx,0 + add r9,rax + mov rax,rbp + adc r10,rdx + mov rdx,rbp + adc r8,0 + + shl rax,32 + shr rdx,32 + sub r11,rax + mov rax,QWORD[8+rbx] + sbb rbp,rdx + + add r11,r8 + adc r12,rbp + adc r13,0 + + + mov rcx,rax + mul QWORD[rsi] + add r9,rax + mov rax,rcx + adc rdx,0 + mov rbp,rdx + + mul QWORD[8+rsi] + add r10,rbp + adc rdx,0 + add r10,rax + mov rax,rcx + adc rdx,0 + mov rbp,rdx + + mul QWORD[16+rsi] + add r11,rbp + adc rdx,0 + add r11,rax + mov rax,rcx + adc rdx,0 + + mov rcx,r9 + imul r9,r15 + + mov rbp,rdx + mul QWORD[24+rsi] + add r12,rbp + adc rdx,0 + xor r8,r8 + add r12,rax + mov rax,r9 + adc r13,rdx + adc r8,0 + + + mul QWORD[r14] + mov rbp,r9 + add rcx,rax + mov rax,r9 + adc rcx,rdx + + sub r11,r9 + sbb r9,0 + + mul QWORD[8+r14] + add r10,rcx + adc rdx,0 + add r10,rax + mov rax,rbp + adc r11,rdx + mov rdx,rbp + adc r9,0 + + shl rax,32 + shr rdx,32 + sub r12,rax + mov rax,QWORD[16+rbx] + sbb rbp,rdx + + add r12,r9 + adc r13,rbp + adc r8,0 + + + mov rcx,rax + mul QWORD[rsi] + add r10,rax + mov rax,rcx + adc rdx,0 + mov rbp,rdx + + mul QWORD[8+rsi] + add r11,rbp + adc rdx,0 + add r11,rax + mov rax,rcx + adc rdx,0 + mov rbp,rdx + + mul QWORD[16+rsi] + add r12,rbp + adc rdx,0 + add r12,rax + mov rax,rcx + adc rdx,0 + + mov rcx,r10 + imul r10,r15 + + mov rbp,rdx + mul QWORD[24+rsi] + add r13,rbp + adc rdx,0 + xor r9,r9 + add r13,rax + mov rax,r10 + adc r8,rdx + 
adc r9,0 + + + mul QWORD[r14] + mov rbp,r10 + add rcx,rax + mov rax,r10 + adc rcx,rdx + + sub r12,r10 + sbb r10,0 + + mul QWORD[8+r14] + add r11,rcx + adc rdx,0 + add r11,rax + mov rax,rbp + adc r12,rdx + mov rdx,rbp + adc r10,0 + + shl rax,32 + shr rdx,32 + sub r13,rax + mov rax,QWORD[24+rbx] + sbb rbp,rdx + + add r13,r10 + adc r8,rbp + adc r9,0 + + + mov rcx,rax + mul QWORD[rsi] + add r11,rax + mov rax,rcx + adc rdx,0 + mov rbp,rdx + + mul QWORD[8+rsi] + add r12,rbp + adc rdx,0 + add r12,rax + mov rax,rcx + adc rdx,0 + mov rbp,rdx + + mul QWORD[16+rsi] + add r13,rbp + adc rdx,0 + add r13,rax + mov rax,rcx + adc rdx,0 + + mov rcx,r11 + imul r11,r15 + + mov rbp,rdx + mul QWORD[24+rsi] + add r8,rbp + adc rdx,0 + xor r10,r10 + add r8,rax + mov rax,r11 + adc r9,rdx + adc r10,0 + + + mul QWORD[r14] + mov rbp,r11 + add rcx,rax + mov rax,r11 + adc rcx,rdx + + sub r13,r11 + sbb r11,0 + + mul QWORD[8+r14] + add r12,rcx + adc rdx,0 + add r12,rax + mov rax,rbp + adc r13,rdx + mov rdx,rbp + adc r11,0 + + shl rax,32 + shr rdx,32 + sub r8,rax + sbb rbp,rdx + + add r8,r11 + adc r9,rbp + adc r10,0 + + + mov rsi,r12 + sub r12,QWORD[r14] + mov r11,r13 + sbb r13,QWORD[8+r14] + mov rcx,r8 + sbb r8,QWORD[16+r14] + mov rbp,r9 + sbb r9,QWORD[24+r14] + sbb r10,0 + + cmovc r12,rsi + cmovc r13,r11 + cmovc r8,rcx + cmovc r9,rbp + + mov QWORD[rdi],r12 + mov QWORD[8+rdi],r13 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$ord_mul_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_ord_mul_mont_nohw: + + + + + + + +global ecp_nistz256_ord_sqr_mont_nohw + +ALIGN 32 +ecp_nistz256_ord_sqr_mont_nohw: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_ord_sqr_mont_nohw: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + 
+_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$ord_sqr_body: + + mov r8,QWORD[rsi] + mov rax,QWORD[8+rsi] + mov r14,QWORD[16+rsi] + mov r15,QWORD[24+rsi] + lea rsi,[$L$ord] + mov rbx,rdx + jmp NEAR $L$oop_ord_sqr + +ALIGN 32 +$L$oop_ord_sqr: + + mov rbp,rax + mul r8 + mov r9,rax + movq xmm1,rbp + mov rax,r14 + mov r10,rdx + + mul r8 + add r10,rax + mov rax,r15 + movq xmm2,r14 + adc rdx,0 + mov r11,rdx + + mul r8 + add r11,rax + mov rax,r15 + movq xmm3,r15 + adc rdx,0 + mov r12,rdx + + + mul r14 + mov r13,rax + mov rax,r14 + mov r14,rdx + + + mul rbp + add r11,rax + mov rax,r15 + adc rdx,0 + mov r15,rdx + + mul rbp + add r12,rax + adc rdx,0 + + add r12,r15 + adc r13,rdx + adc r14,0 + + + xor r15,r15 + mov rax,r8 + add r9,r9 + adc r10,r10 + adc r11,r11 + adc r12,r12 + adc r13,r13 + adc r14,r14 + adc r15,0 + + + mul rax + mov r8,rax + movq rax,xmm1 + mov rbp,rdx + + mul rax + add r9,rbp + adc r10,rax + movq rax,xmm2 + adc rdx,0 + mov rbp,rdx + + mul rax + add r11,rbp + adc r12,rax + movq rax,xmm3 + adc rdx,0 + mov rbp,rdx + + mov rcx,r8 + imul r8,QWORD[32+rsi] + + mul rax + add r13,rbp + adc r14,rax + mov rax,QWORD[rsi] + adc r15,rdx + + + mul r8 + mov rbp,r8 + add rcx,rax + mov rax,QWORD[8+rsi] + adc rcx,rdx + + sub r10,r8 + sbb rbp,0 + + mul r8 + add r9,rcx + adc rdx,0 + add r9,rax + mov rax,r8 + adc r10,rdx + mov rdx,r8 + adc rbp,0 + + mov rcx,r9 + imul r9,QWORD[32+rsi] + + shl rax,32 + shr rdx,32 + sub r11,rax + mov rax,QWORD[rsi] + sbb r8,rdx + + add r11,rbp + adc r8,0 + + + mul r9 + mov rbp,r9 + add rcx,rax + mov rax,QWORD[8+rsi] + adc rcx,rdx + + sub r11,r9 + sbb rbp,0 + + mul r9 + add r10,rcx + adc rdx,0 + add r10,rax + mov rax,r9 + adc r11,rdx + mov rdx,r9 + adc rbp,0 + + mov rcx,r10 + imul r10,QWORD[32+rsi] + + shl rax,32 + shr rdx,32 + sub r8,rax + mov rax,QWORD[rsi] + sbb r9,rdx + + add r8,rbp + adc r9,0 + + + mul r10 + mov rbp,r10 + add rcx,rax + mov rax,QWORD[8+rsi] + adc rcx,rdx + + sub r8,r10 + sbb rbp,0 + 
+ mul r10 + add r11,rcx + adc rdx,0 + add r11,rax + mov rax,r10 + adc r8,rdx + mov rdx,r10 + adc rbp,0 + + mov rcx,r11 + imul r11,QWORD[32+rsi] + + shl rax,32 + shr rdx,32 + sub r9,rax + mov rax,QWORD[rsi] + sbb r10,rdx + + add r9,rbp + adc r10,0 + + + mul r11 + mov rbp,r11 + add rcx,rax + mov rax,QWORD[8+rsi] + adc rcx,rdx + + sub r9,r11 + sbb rbp,0 + + mul r11 + add r8,rcx + adc rdx,0 + add r8,rax + mov rax,r11 + adc r9,rdx + mov rdx,r11 + adc rbp,0 + + shl rax,32 + shr rdx,32 + sub r10,rax + sbb r11,rdx + + add r10,rbp + adc r11,0 + + + xor rdx,rdx + add r8,r12 + adc r9,r13 + mov r12,r8 + adc r10,r14 + adc r11,r15 + mov rax,r9 + adc rdx,0 + + + sub r8,QWORD[rsi] + mov r14,r10 + sbb r9,QWORD[8+rsi] + sbb r10,QWORD[16+rsi] + mov r15,r11 + sbb r11,QWORD[24+rsi] + sbb rdx,0 + + cmovc r8,r12 + cmovnc rax,r9 + cmovnc r14,r10 + cmovnc r15,r11 + + dec rbx + jnz NEAR $L$oop_ord_sqr + + mov QWORD[rdi],r8 + mov QWORD[8+rdi],rax + pxor xmm1,xmm1 + mov QWORD[16+rdi],r14 + pxor xmm2,xmm2 + mov QWORD[24+rdi],r15 + pxor xmm3,xmm3 + + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$ord_sqr_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_ord_sqr_mont_nohw: + +global ecp_nistz256_ord_mul_mont_adx + +ALIGN 32 +ecp_nistz256_ord_mul_mont_adx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_ord_mul_mont_adx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +$L$ecp_nistz256_ord_mul_mont_adx: +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$ord_mulx_body: + + mov rbx,rdx + mov rdx,QWORD[rdx] + mov r9,QWORD[rsi] + mov r10,QWORD[8+rsi] + mov r11,QWORD[16+rsi] + mov r12,QWORD[24+rsi] + lea rsi,[((-128))+rsi] + lea r14,[(($L$ord-128))] + mov r15,QWORD[$L$ordK] + + + mulx r9,r8,r9 + mulx r10,rcx,r10 + mulx 
r11,rbp,r11 + add r9,rcx + mulx r12,rcx,r12 + mov rdx,r8 + mulx rax,rdx,r15 + adc r10,rbp + adc r11,rcx + adc r12,0 + + + xor r13,r13 + mulx rbp,rcx,QWORD[((0+128))+r14] + adcx r8,rcx + adox r9,rbp + + mulx rbp,rcx,QWORD[((8+128))+r14] + adcx r9,rcx + adox r10,rbp + + mulx rbp,rcx,QWORD[((16+128))+r14] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((24+128))+r14] + mov rdx,QWORD[8+rbx] + adcx r11,rcx + adox r12,rbp + adcx r12,r8 + adox r13,r8 + adc r13,0 + + + mulx rbp,rcx,QWORD[((0+128))+rsi] + adcx r9,rcx + adox r10,rbp + + mulx rbp,rcx,QWORD[((8+128))+rsi] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((16+128))+rsi] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((24+128))+rsi] + mov rdx,r9 + mulx rax,rdx,r15 + adcx r12,rcx + adox r13,rbp + + adcx r13,r8 + adox r8,r8 + adc r8,0 + + + mulx rbp,rcx,QWORD[((0+128))+r14] + adcx r9,rcx + adox r10,rbp + + mulx rbp,rcx,QWORD[((8+128))+r14] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((16+128))+r14] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((24+128))+r14] + mov rdx,QWORD[16+rbx] + adcx r12,rcx + adox r13,rbp + adcx r13,r9 + adox r8,r9 + adc r8,0 + + + mulx rbp,rcx,QWORD[((0+128))+rsi] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((8+128))+rsi] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((16+128))+rsi] + adcx r12,rcx + adox r13,rbp + + mulx rbp,rcx,QWORD[((24+128))+rsi] + mov rdx,r10 + mulx rax,rdx,r15 + adcx r13,rcx + adox r8,rbp + + adcx r8,r9 + adox r9,r9 + adc r9,0 + + + mulx rbp,rcx,QWORD[((0+128))+r14] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((8+128))+r14] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((16+128))+r14] + adcx r12,rcx + adox r13,rbp + + mulx rbp,rcx,QWORD[((24+128))+r14] + mov rdx,QWORD[24+rbx] + adcx r13,rcx + adox r8,rbp + adcx r8,r10 + adox r9,r10 + adc r9,0 + + + mulx rbp,rcx,QWORD[((0+128))+rsi] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((8+128))+rsi] + adcx r12,rcx + adox r13,rbp + + mulx 
rbp,rcx,QWORD[((16+128))+rsi] + adcx r13,rcx + adox r8,rbp + + mulx rbp,rcx,QWORD[((24+128))+rsi] + mov rdx,r11 + mulx rax,rdx,r15 + adcx r8,rcx + adox r9,rbp + + adcx r9,r10 + adox r10,r10 + adc r10,0 + + + mulx rbp,rcx,QWORD[((0+128))+r14] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((8+128))+r14] + adcx r12,rcx + adox r13,rbp + + mulx rbp,rcx,QWORD[((16+128))+r14] + adcx r13,rcx + adox r8,rbp + + mulx rbp,rcx,QWORD[((24+128))+r14] + lea r14,[128+r14] + mov rbx,r12 + adcx r8,rcx + adox r9,rbp + mov rdx,r13 + adcx r9,r11 + adox r10,r11 + adc r10,0 + + + + mov rcx,r8 + sub r12,QWORD[r14] + sbb r13,QWORD[8+r14] + sbb r8,QWORD[16+r14] + mov rbp,r9 + sbb r9,QWORD[24+r14] + sbb r10,0 + + cmovc r12,rbx + cmovc r13,rdx + cmovc r8,rcx + cmovc r9,rbp + + mov QWORD[rdi],r12 + mov QWORD[8+rdi],r13 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$ord_mulx_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_ord_mul_mont_adx: + +global ecp_nistz256_ord_sqr_mont_adx + +ALIGN 32 +ecp_nistz256_ord_sqr_mont_adx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_ord_sqr_mont_adx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR +$L$ecp_nistz256_ord_sqr_mont_adx: + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$ord_sqrx_body: + + mov rbx,rdx + mov rdx,QWORD[rsi] + mov r14,QWORD[8+rsi] + mov r15,QWORD[16+rsi] + mov r8,QWORD[24+rsi] + lea rsi,[$L$ord] + jmp NEAR $L$oop_ord_sqrx + +ALIGN 32 +$L$oop_ord_sqrx: + mulx r10,r9,r14 + mulx r11,rcx,r15 + mov rax,rdx + movq xmm1,r14 + mulx r12,rbp,r8 + mov rdx,r14 + add r10,rcx + movq xmm2,r15 + adc r11,rbp + adc r12,0 + xor r13,r13 + + mulx rbp,rcx,r15 + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,r8 + mov rdx,r15 
+ adcx r12,rcx + adox r13,rbp + adc r13,0 + + mulx r14,rcx,r8 + mov rdx,rax + movq xmm3,r8 + xor r15,r15 + adcx r9,r9 + adox r13,rcx + adcx r10,r10 + adox r14,r15 + + + mulx rbp,r8,rdx + movq rdx,xmm1 + adcx r11,r11 + adox r9,rbp + adcx r12,r12 + mulx rax,rcx,rdx + movq rdx,xmm2 + adcx r13,r13 + adox r10,rcx + adcx r14,r14 + mulx rbp,rcx,rdx + DB 0x67 + movq rdx,xmm3 + adox r11,rax + adcx r15,r15 + adox r12,rcx + adox r13,rbp + mulx rax,rcx,rdx + adox r14,rcx + adox r15,rax + + + mov rdx,r8 + mulx rcx,rdx,QWORD[32+rsi] + + xor rax,rax + mulx rbp,rcx,QWORD[rsi] + adcx r8,rcx + adox r9,rbp + mulx rbp,rcx,QWORD[8+rsi] + adcx r9,rcx + adox r10,rbp + mulx rbp,rcx,QWORD[16+rsi] + adcx r10,rcx + adox r11,rbp + mulx rbp,rcx,QWORD[24+rsi] + adcx r11,rcx + adox r8,rbp + adcx r8,rax + + + mov rdx,r9 + mulx rcx,rdx,QWORD[32+rsi] + + mulx rbp,rcx,QWORD[rsi] + adox r9,rcx + adcx r10,rbp + mulx rbp,rcx,QWORD[8+rsi] + adox r10,rcx + adcx r11,rbp + mulx rbp,rcx,QWORD[16+rsi] + adox r11,rcx + adcx r8,rbp + mulx rbp,rcx,QWORD[24+rsi] + adox r8,rcx + adcx r9,rbp + adox r9,rax + + + mov rdx,r10 + mulx rcx,rdx,QWORD[32+rsi] + + mulx rbp,rcx,QWORD[rsi] + adcx r10,rcx + adox r11,rbp + mulx rbp,rcx,QWORD[8+rsi] + adcx r11,rcx + adox r8,rbp + mulx rbp,rcx,QWORD[16+rsi] + adcx r8,rcx + adox r9,rbp + mulx rbp,rcx,QWORD[24+rsi] + adcx r9,rcx + adox r10,rbp + adcx r10,rax + + + mov rdx,r11 + mulx rcx,rdx,QWORD[32+rsi] + + mulx rbp,rcx,QWORD[rsi] + adox r11,rcx + adcx r8,rbp + mulx rbp,rcx,QWORD[8+rsi] + adox r8,rcx + adcx r9,rbp + mulx rbp,rcx,QWORD[16+rsi] + adox r9,rcx + adcx r10,rbp + mulx rbp,rcx,QWORD[24+rsi] + adox r10,rcx + adcx r11,rbp + adox r11,rax + + + add r12,r8 + adc r9,r13 + mov rdx,r12 + adc r10,r14 + adc r11,r15 + mov r14,r9 + adc rax,0 + + + sub r12,QWORD[rsi] + mov r15,r10 + sbb r9,QWORD[8+rsi] + sbb r10,QWORD[16+rsi] + mov r8,r11 + sbb r11,QWORD[24+rsi] + sbb rax,0 + + cmovnc rdx,r12 + cmovnc r14,r9 + cmovnc r15,r10 + cmovnc r8,r11 + + dec rbx + jnz NEAR $L$oop_ord_sqrx + + 
mov QWORD[rdi],rdx + mov QWORD[8+rdi],r14 + pxor xmm1,xmm1 + mov QWORD[16+rdi],r15 + pxor xmm2,xmm2 + mov QWORD[24+rdi],r8 + pxor xmm3,xmm3 + + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$ord_sqrx_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_ord_sqr_mont_adx: + + + + + + +global ecp_nistz256_mul_mont_nohw + +ALIGN 32 +ecp_nistz256_mul_mont_nohw: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_mul_mont_nohw: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$mul_body: + mov rbx,rdx + mov rax,QWORD[rdx] + mov r9,QWORD[rsi] + mov r10,QWORD[8+rsi] + mov r11,QWORD[16+rsi] + mov r12,QWORD[24+rsi] + + call __ecp_nistz256_mul_montq + + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$mul_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_mul_mont_nohw: + + +ALIGN 32 +__ecp_nistz256_mul_montq: + + + + mov rbp,rax + mul r9 + mov r14,QWORD[(($L$poly+8))] + mov r8,rax + mov rax,rbp + mov r9,rdx + + mul r10 + mov r15,QWORD[(($L$poly+24))] + add r9,rax + mov rax,rbp + adc rdx,0 + mov r10,rdx + + mul r11 + add r10,rax + mov rax,rbp + adc rdx,0 + mov r11,rdx + + mul r12 + add r11,rax + mov rax,r8 + adc rdx,0 + xor r13,r13 + mov r12,rdx + + + + + + + + + + + mov rbp,r8 + shl r8,32 + mul r15 + shr rbp,32 + add r9,r8 + adc r10,rbp + adc r11,rax + mov rax,QWORD[8+rbx] + adc r12,rdx + adc r13,0 + xor r8,r8 + + + + mov rbp,rax + mul QWORD[rsi] + add r9,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[8+rsi] + add r10,rcx + adc rdx,0 + add r10,rax + mov rax,rbp + adc 
rdx,0 + mov rcx,rdx + + mul QWORD[16+rsi] + add r11,rcx + adc rdx,0 + add r11,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[24+rsi] + add r12,rcx + adc rdx,0 + add r12,rax + mov rax,r9 + adc r13,rdx + adc r8,0 + + + + mov rbp,r9 + shl r9,32 + mul r15 + shr rbp,32 + add r10,r9 + adc r11,rbp + adc r12,rax + mov rax,QWORD[16+rbx] + adc r13,rdx + adc r8,0 + xor r9,r9 + + + + mov rbp,rax + mul QWORD[rsi] + add r10,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[8+rsi] + add r11,rcx + adc rdx,0 + add r11,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[16+rsi] + add r12,rcx + adc rdx,0 + add r12,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[24+rsi] + add r13,rcx + adc rdx,0 + add r13,rax + mov rax,r10 + adc r8,rdx + adc r9,0 + + + + mov rbp,r10 + shl r10,32 + mul r15 + shr rbp,32 + add r11,r10 + adc r12,rbp + adc r13,rax + mov rax,QWORD[24+rbx] + adc r8,rdx + adc r9,0 + xor r10,r10 + + + + mov rbp,rax + mul QWORD[rsi] + add r11,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[8+rsi] + add r12,rcx + adc rdx,0 + add r12,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[16+rsi] + add r13,rcx + adc rdx,0 + add r13,rax + mov rax,rbp + adc rdx,0 + mov rcx,rdx + + mul QWORD[24+rsi] + add r8,rcx + adc rdx,0 + add r8,rax + mov rax,r11 + adc r9,rdx + adc r10,0 + + + + mov rbp,r11 + shl r11,32 + mul r15 + shr rbp,32 + add r12,r11 + adc r13,rbp + mov rcx,r12 + adc r8,rax + adc r9,rdx + mov rbp,r13 + adc r10,0 + + + + sub r12,-1 + mov rbx,r8 + sbb r13,r14 + sbb r8,0 + mov rdx,r9 + sbb r9,r15 + sbb r10,0 + + cmovc r12,rcx + cmovc r13,rbp + mov QWORD[rdi],r12 + cmovc r8,rbx + mov QWORD[8+rdi],r13 + cmovc r9,rdx + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + ret + + + + + + + + + + +global ecp_nistz256_sqr_mont_nohw + +ALIGN 32 +ecp_nistz256_sqr_mont_nohw: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_sqr_mont_nohw: + mov rdi,rcx + mov rsi,rdx + + + +_CET_ENDBR + push 
rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$sqr_body: + mov rax,QWORD[rsi] + mov r14,QWORD[8+rsi] + mov r15,QWORD[16+rsi] + mov r8,QWORD[24+rsi] + + call __ecp_nistz256_sqr_montq + + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$sqr_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_sqr_mont_nohw: + + +ALIGN 32 +__ecp_nistz256_sqr_montq: + + mov r13,rax + mul r14 + mov r9,rax + mov rax,r15 + mov r10,rdx + + mul r13 + add r10,rax + mov rax,r8 + adc rdx,0 + mov r11,rdx + + mul r13 + add r11,rax + mov rax,r15 + adc rdx,0 + mov r12,rdx + + + mul r14 + add r11,rax + mov rax,r8 + adc rdx,0 + mov rbp,rdx + + mul r14 + add r12,rax + mov rax,r8 + adc rdx,0 + add r12,rbp + mov r13,rdx + adc r13,0 + + + mul r15 + xor r15,r15 + add r13,rax + mov rax,QWORD[rsi] + mov r14,rdx + adc r14,0 + + add r9,r9 + adc r10,r10 + adc r11,r11 + adc r12,r12 + adc r13,r13 + adc r14,r14 + adc r15,0 + + mul rax + mov r8,rax + mov rax,QWORD[8+rsi] + mov rcx,rdx + + mul rax + add r9,rcx + adc r10,rax + mov rax,QWORD[16+rsi] + adc rdx,0 + mov rcx,rdx + + mul rax + add r11,rcx + adc r12,rax + mov rax,QWORD[24+rsi] + adc rdx,0 + mov rcx,rdx + + mul rax + add r13,rcx + adc r14,rax + mov rax,r8 + adc r15,rdx + + mov rsi,QWORD[(($L$poly+8))] + mov rbp,QWORD[(($L$poly+24))] + + + + + mov rcx,r8 + shl r8,32 + mul rbp + shr rcx,32 + add r9,r8 + adc r10,rcx + adc r11,rax + mov rax,r9 + adc rdx,0 + + + + mov rcx,r9 + shl r9,32 + mov r8,rdx + mul rbp + shr rcx,32 + add r10,r9 + adc r11,rcx + adc r8,rax + mov rax,r10 + adc rdx,0 + + + + mov rcx,r10 + shl r10,32 + mov r9,rdx + mul rbp + shr rcx,32 + add r11,r10 + adc r8,rcx + adc r9,rax + mov rax,r11 + adc rdx,0 + + + + mov rcx,r11 + shl r11,32 + mov r10,rdx + mul rbp + shr rcx,32 + add r8,r11 + adc r9,rcx + adc r10,rax + adc rdx,0 + xor r11,r11 + + + 
+ add r12,r8 + adc r13,r9 + mov r8,r12 + adc r14,r10 + adc r15,rdx + mov r9,r13 + adc r11,0 + + sub r12,-1 + mov r10,r14 + sbb r13,rsi + sbb r14,0 + mov rcx,r15 + sbb r15,rbp + sbb r11,0 + + cmovc r12,r8 + cmovc r13,r9 + mov QWORD[rdi],r12 + cmovc r14,r10 + mov QWORD[8+rdi],r13 + cmovc r15,rcx + mov QWORD[16+rdi],r14 + mov QWORD[24+rdi],r15 + + ret + + +global ecp_nistz256_mul_mont_adx + +ALIGN 32 +ecp_nistz256_mul_mont_adx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_mul_mont_adx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$mulx_body: + mov rbx,rdx + mov rdx,QWORD[rdx] + mov r9,QWORD[rsi] + mov r10,QWORD[8+rsi] + mov r11,QWORD[16+rsi] + mov r12,QWORD[24+rsi] + lea rsi,[((-128))+rsi] + + call __ecp_nistz256_mul_montx + + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] + + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$mulx_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_mul_mont_adx: + + +ALIGN 32 +__ecp_nistz256_mul_montx: + + + + mulx r9,r8,r9 + mulx r10,rcx,r10 + mov r14,32 + xor r13,r13 + mulx r11,rbp,r11 + mov r15,QWORD[(($L$poly+24))] + adc r9,rcx + mulx r12,rcx,r12 + mov rdx,r8 + adc r10,rbp + shlx rbp,r8,r14 + adc r11,rcx + shrx rcx,r8,r14 + adc r12,0 + + + + add r9,rbp + adc r10,rcx + + mulx rbp,rcx,r15 + mov rdx,QWORD[8+rbx] + adc r11,rcx + adc r12,rbp + adc r13,0 + xor r8,r8 + + + + mulx rbp,rcx,QWORD[((0+128))+rsi] + adcx r9,rcx + adox r10,rbp + + mulx rbp,rcx,QWORD[((8+128))+rsi] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((16+128))+rsi] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((24+128))+rsi] + mov rdx,r9 + adcx r12,rcx + shlx rcx,r9,r14 + adox r13,rbp + shrx rbp,r9,r14 + + adcx r13,r8 + adox r8,r8 + adc r8,0 + + + + add r10,rcx + adc r11,rbp + 
+ mulx rbp,rcx,r15 + mov rdx,QWORD[16+rbx] + adc r12,rcx + adc r13,rbp + adc r8,0 + xor r9,r9 + + + + mulx rbp,rcx,QWORD[((0+128))+rsi] + adcx r10,rcx + adox r11,rbp + + mulx rbp,rcx,QWORD[((8+128))+rsi] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((16+128))+rsi] + adcx r12,rcx + adox r13,rbp + + mulx rbp,rcx,QWORD[((24+128))+rsi] + mov rdx,r10 + adcx r13,rcx + shlx rcx,r10,r14 + adox r8,rbp + shrx rbp,r10,r14 + + adcx r8,r9 + adox r9,r9 + adc r9,0 + + + + add r11,rcx + adc r12,rbp + + mulx rbp,rcx,r15 + mov rdx,QWORD[24+rbx] + adc r13,rcx + adc r8,rbp + adc r9,0 + xor r10,r10 + + + + mulx rbp,rcx,QWORD[((0+128))+rsi] + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,QWORD[((8+128))+rsi] + adcx r12,rcx + adox r13,rbp + + mulx rbp,rcx,QWORD[((16+128))+rsi] + adcx r13,rcx + adox r8,rbp + + mulx rbp,rcx,QWORD[((24+128))+rsi] + mov rdx,r11 + adcx r8,rcx + shlx rcx,r11,r14 + adox r9,rbp + shrx rbp,r11,r14 + + adcx r9,r10 + adox r10,r10 + adc r10,0 + + + + add r12,rcx + adc r13,rbp + + mulx rbp,rcx,r15 + mov rbx,r12 + mov r14,QWORD[(($L$poly+8))] + adc r8,rcx + mov rdx,r13 + adc r9,rbp + adc r10,0 + + + + xor eax,eax + mov rcx,r8 + sbb r12,-1 + sbb r13,r14 + sbb r8,0 + mov rbp,r9 + sbb r9,r15 + sbb r10,0 + + cmovc r12,rbx + cmovc r13,rdx + mov QWORD[rdi],r12 + cmovc r8,rcx + mov QWORD[8+rdi],r13 + cmovc r9,rbp + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + ret + + + +global ecp_nistz256_sqr_mont_adx + +ALIGN 32 +ecp_nistz256_sqr_mont_adx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_sqr_mont_adx: + mov rdi,rcx + mov rsi,rdx + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + +$L$sqrx_body: + mov rdx,QWORD[rsi] + mov r14,QWORD[8+rsi] + mov r15,QWORD[16+rsi] + mov r8,QWORD[24+rsi] + lea rsi,[((-128))+rsi] + + call __ecp_nistz256_sqr_montx + + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r13,QWORD[16+rsp] + + mov r12,QWORD[24+rsp] + + mov rbx,QWORD[32+rsp] 
+ + mov rbp,QWORD[40+rsp] + + lea rsp,[48+rsp] + +$L$sqrx_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_sqr_mont_adx: + + +ALIGN 32 +__ecp_nistz256_sqr_montx: + + mulx r10,r9,r14 + mulx r11,rcx,r15 + xor eax,eax + adc r10,rcx + mulx r12,rbp,r8 + mov rdx,r14 + adc r11,rbp + adc r12,0 + xor r13,r13 + + + mulx rbp,rcx,r15 + adcx r11,rcx + adox r12,rbp + + mulx rbp,rcx,r8 + mov rdx,r15 + adcx r12,rcx + adox r13,rbp + adc r13,0 + + + mulx r14,rcx,r8 + mov rdx,QWORD[((0+128))+rsi] + xor r15,r15 + adcx r9,r9 + adox r13,rcx + adcx r10,r10 + adox r14,r15 + + mulx rbp,r8,rdx + mov rdx,QWORD[((8+128))+rsi] + adcx r11,r11 + adox r9,rbp + adcx r12,r12 + mulx rax,rcx,rdx + mov rdx,QWORD[((16+128))+rsi] + adcx r13,r13 + adox r10,rcx + adcx r14,r14 + DB 0x67 + mulx rbp,rcx,rdx + mov rdx,QWORD[((24+128))+rsi] + adox r11,rax + adcx r15,r15 + adox r12,rcx + mov rsi,32 + adox r13,rbp + DB 0x67,0x67 + mulx rax,rcx,rdx + mov rdx,QWORD[(($L$poly+24))] + adox r14,rcx + shlx rcx,r8,rsi + adox r15,rax + shrx rax,r8,rsi + mov rbp,rdx + + + add r9,rcx + adc r10,rax + + mulx r8,rcx,r8 + adc r11,rcx + shlx rcx,r9,rsi + adc r8,0 + shrx rax,r9,rsi + + + add r10,rcx + adc r11,rax + + mulx r9,rcx,r9 + adc r8,rcx + shlx rcx,r10,rsi + adc r9,0 + shrx rax,r10,rsi + + + add r11,rcx + adc r8,rax + + mulx r10,rcx,r10 + adc r9,rcx + shlx rcx,r11,rsi + adc r10,0 + shrx rax,r11,rsi + + + add r8,rcx + adc r9,rax + + mulx r11,rcx,r11 + adc r10,rcx + adc r11,0 + + xor rdx,rdx + add r12,r8 + mov rsi,QWORD[(($L$poly+8))] + adc r13,r9 + mov r8,r12 + adc r14,r10 + adc r15,r11 + mov r9,r13 + adc rdx,0 + + sub r12,-1 + mov r10,r14 + sbb r13,rsi + sbb r14,0 + mov r11,r15 + sbb r15,rbp + sbb rdx,0 + + cmovc r12,r8 + cmovc r13,r9 + mov QWORD[rdi],r12 + cmovc r14,r10 + mov QWORD[8+rdi],r13 + cmovc r15,r11 + mov QWORD[16+rdi],r14 + mov QWORD[24+rdi],r15 + + ret + + + + +global ecp_nistz256_select_w5_nohw + +ALIGN 32 +ecp_nistz256_select_w5_nohw: + +_CET_ENDBR + 
lea rax,[((-136))+rsp] +$L$SEH_begin_ecp_nistz256_select_w5_nohw: + DB 0x48,0x8d,0x60,0xe0 + DB 0x0f,0x29,0x70,0xe0 + DB 0x0f,0x29,0x78,0xf0 + DB 0x44,0x0f,0x29,0x00 + DB 0x44,0x0f,0x29,0x48,0x10 + DB 0x44,0x0f,0x29,0x50,0x20 + DB 0x44,0x0f,0x29,0x58,0x30 + DB 0x44,0x0f,0x29,0x60,0x40 + DB 0x44,0x0f,0x29,0x68,0x50 + DB 0x44,0x0f,0x29,0x70,0x60 + DB 0x44,0x0f,0x29,0x78,0x70 + movdqa xmm0,XMMWORD[$L$One] + movd xmm1,r8d + + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + pxor xmm6,xmm6 + pxor xmm7,xmm7 + + movdqa xmm8,xmm0 + pshufd xmm1,xmm1,0 + + mov rax,16 +$L$select_loop_sse_w5: + + movdqa xmm15,xmm8 + paddd xmm8,xmm0 + pcmpeqd xmm15,xmm1 + + movdqa xmm9,XMMWORD[rdx] + movdqa xmm10,XMMWORD[16+rdx] + movdqa xmm11,XMMWORD[32+rdx] + movdqa xmm12,XMMWORD[48+rdx] + movdqa xmm13,XMMWORD[64+rdx] + movdqa xmm14,XMMWORD[80+rdx] + lea rdx,[96+rdx] + + pand xmm9,xmm15 + pand xmm10,xmm15 + por xmm2,xmm9 + pand xmm11,xmm15 + por xmm3,xmm10 + pand xmm12,xmm15 + por xmm4,xmm11 + pand xmm13,xmm15 + por xmm5,xmm12 + pand xmm14,xmm15 + por xmm6,xmm13 + por xmm7,xmm14 + + dec rax + jnz NEAR $L$select_loop_sse_w5 + + movdqu XMMWORD[rcx],xmm2 + movdqu XMMWORD[16+rcx],xmm3 + movdqu XMMWORD[32+rcx],xmm4 + movdqu XMMWORD[48+rcx],xmm5 + movdqu XMMWORD[64+rcx],xmm6 + movdqu XMMWORD[80+rcx],xmm7 + movaps xmm6,XMMWORD[rsp] + movaps xmm7,XMMWORD[16+rsp] + movaps xmm8,XMMWORD[32+rsp] + movaps xmm9,XMMWORD[48+rsp] + movaps xmm10,XMMWORD[64+rsp] + movaps xmm11,XMMWORD[80+rsp] + movaps xmm12,XMMWORD[96+rsp] + movaps xmm13,XMMWORD[112+rsp] + movaps xmm14,XMMWORD[128+rsp] + movaps xmm15,XMMWORD[144+rsp] + lea rsp,[168+rsp] + ret + +$L$SEH_end_ecp_nistz256_select_w5_nohw: + + + + +global ecp_nistz256_select_w7_nohw + +ALIGN 32 +ecp_nistz256_select_w7_nohw: + +_CET_ENDBR + lea rax,[((-136))+rsp] +$L$SEH_begin_ecp_nistz256_select_w7_nohw: + DB 0x48,0x8d,0x60,0xe0 + DB 0x0f,0x29,0x70,0xe0 + DB 0x0f,0x29,0x78,0xf0 + DB 0x44,0x0f,0x29,0x00 + DB 0x44,0x0f,0x29,0x48,0x10 + DB 
0x44,0x0f,0x29,0x50,0x20 + DB 0x44,0x0f,0x29,0x58,0x30 + DB 0x44,0x0f,0x29,0x60,0x40 + DB 0x44,0x0f,0x29,0x68,0x50 + DB 0x44,0x0f,0x29,0x70,0x60 + DB 0x44,0x0f,0x29,0x78,0x70 + movdqa xmm8,XMMWORD[$L$One] + movd xmm1,r8d + + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + + movdqa xmm0,xmm8 + pshufd xmm1,xmm1,0 + mov rax,64 + +$L$select_loop_sse_w7: + movdqa xmm15,xmm8 + paddd xmm8,xmm0 + movdqa xmm9,XMMWORD[rdx] + movdqa xmm10,XMMWORD[16+rdx] + pcmpeqd xmm15,xmm1 + movdqa xmm11,XMMWORD[32+rdx] + movdqa xmm12,XMMWORD[48+rdx] + lea rdx,[64+rdx] + + pand xmm9,xmm15 + pand xmm10,xmm15 + por xmm2,xmm9 + pand xmm11,xmm15 + por xmm3,xmm10 + pand xmm12,xmm15 + por xmm4,xmm11 + prefetcht0 [255+rdx] + por xmm5,xmm12 + + dec rax + jnz NEAR $L$select_loop_sse_w7 + + movdqu XMMWORD[rcx],xmm2 + movdqu XMMWORD[16+rcx],xmm3 + movdqu XMMWORD[32+rcx],xmm4 + movdqu XMMWORD[48+rcx],xmm5 + movaps xmm6,XMMWORD[rsp] + movaps xmm7,XMMWORD[16+rsp] + movaps xmm8,XMMWORD[32+rsp] + movaps xmm9,XMMWORD[48+rsp] + movaps xmm10,XMMWORD[64+rsp] + movaps xmm11,XMMWORD[80+rsp] + movaps xmm12,XMMWORD[96+rsp] + movaps xmm13,XMMWORD[112+rsp] + movaps xmm14,XMMWORD[128+rsp] + movaps xmm15,XMMWORD[144+rsp] + lea rsp,[168+rsp] + ret + +$L$SEH_end_ecp_nistz256_select_w7_nohw: + + + +global ecp_nistz256_select_w5_avx2 + +ALIGN 32 +ecp_nistz256_select_w5_avx2: + +_CET_ENDBR + vzeroupper + lea rax,[((-136))+rsp] + mov r11,rsp +$L$SEH_begin_ecp_nistz256_select_w5_avx2: + DB 0x48,0x8d,0x60,0xe0 + DB 0xc5,0xf8,0x29,0x70,0xe0 + DB 0xc5,0xf8,0x29,0x78,0xf0 + DB 0xc5,0x78,0x29,0x40,0x00 + DB 0xc5,0x78,0x29,0x48,0x10 + DB 0xc5,0x78,0x29,0x50,0x20 + DB 0xc5,0x78,0x29,0x58,0x30 + DB 0xc5,0x78,0x29,0x60,0x40 + DB 0xc5,0x78,0x29,0x68,0x50 + DB 0xc5,0x78,0x29,0x70,0x60 + DB 0xc5,0x78,0x29,0x78,0x70 + vmovdqa ymm0,YMMWORD[$L$Two] + + vpxor ymm2,ymm2,ymm2 + vpxor ymm3,ymm3,ymm3 + vpxor ymm4,ymm4,ymm4 + + vmovdqa ymm5,YMMWORD[$L$One] + vmovdqa ymm10,YMMWORD[$L$Two] + + vmovd xmm1,r8d + vpermd 
ymm1,ymm2,ymm1 + + mov rax,8 +$L$select_loop_avx2_w5: + + vmovdqa ymm6,YMMWORD[rdx] + vmovdqa ymm7,YMMWORD[32+rdx] + vmovdqa ymm8,YMMWORD[64+rdx] + + vmovdqa ymm11,YMMWORD[96+rdx] + vmovdqa ymm12,YMMWORD[128+rdx] + vmovdqa ymm13,YMMWORD[160+rdx] + + vpcmpeqd ymm9,ymm5,ymm1 + vpcmpeqd ymm14,ymm10,ymm1 + + vpaddd ymm5,ymm5,ymm0 + vpaddd ymm10,ymm10,ymm0 + lea rdx,[192+rdx] + + vpand ymm6,ymm6,ymm9 + vpand ymm7,ymm7,ymm9 + vpand ymm8,ymm8,ymm9 + vpand ymm11,ymm11,ymm14 + vpand ymm12,ymm12,ymm14 + vpand ymm13,ymm13,ymm14 + + vpxor ymm2,ymm2,ymm6 + vpxor ymm3,ymm3,ymm7 + vpxor ymm4,ymm4,ymm8 + vpxor ymm2,ymm2,ymm11 + vpxor ymm3,ymm3,ymm12 + vpxor ymm4,ymm4,ymm13 + + dec rax + jnz NEAR $L$select_loop_avx2_w5 + + vmovdqu YMMWORD[rcx],ymm2 + vmovdqu YMMWORD[32+rcx],ymm3 + vmovdqu YMMWORD[64+rcx],ymm4 + vzeroupper + movaps xmm6,XMMWORD[rsp] + movaps xmm7,XMMWORD[16+rsp] + movaps xmm8,XMMWORD[32+rsp] + movaps xmm9,XMMWORD[48+rsp] + movaps xmm10,XMMWORD[64+rsp] + movaps xmm11,XMMWORD[80+rsp] + movaps xmm12,XMMWORD[96+rsp] + movaps xmm13,XMMWORD[112+rsp] + movaps xmm14,XMMWORD[128+rsp] + movaps xmm15,XMMWORD[144+rsp] + lea rsp,[r11] + ret + +$L$SEH_end_ecp_nistz256_select_w5_avx2: + + + + +global ecp_nistz256_select_w7_avx2 + +ALIGN 32 +ecp_nistz256_select_w7_avx2: + +_CET_ENDBR + vzeroupper + mov r11,rsp + lea rax,[((-136))+rsp] +$L$SEH_begin_ecp_nistz256_select_w7_avx2: + DB 0x48,0x8d,0x60,0xe0 + DB 0xc5,0xf8,0x29,0x70,0xe0 + DB 0xc5,0xf8,0x29,0x78,0xf0 + DB 0xc5,0x78,0x29,0x40,0x00 + DB 0xc5,0x78,0x29,0x48,0x10 + DB 0xc5,0x78,0x29,0x50,0x20 + DB 0xc5,0x78,0x29,0x58,0x30 + DB 0xc5,0x78,0x29,0x60,0x40 + DB 0xc5,0x78,0x29,0x68,0x50 + DB 0xc5,0x78,0x29,0x70,0x60 + DB 0xc5,0x78,0x29,0x78,0x70 + vmovdqa ymm0,YMMWORD[$L$Three] + + vpxor ymm2,ymm2,ymm2 + vpxor ymm3,ymm3,ymm3 + + vmovdqa ymm4,YMMWORD[$L$One] + vmovdqa ymm8,YMMWORD[$L$Two] + vmovdqa ymm12,YMMWORD[$L$Three] + + vmovd xmm1,r8d + vpermd ymm1,ymm2,ymm1 + + + mov rax,21 +$L$select_loop_avx2_w7: + + vmovdqa 
ymm5,YMMWORD[rdx] + vmovdqa ymm6,YMMWORD[32+rdx] + + vmovdqa ymm9,YMMWORD[64+rdx] + vmovdqa ymm10,YMMWORD[96+rdx] + + vmovdqa ymm13,YMMWORD[128+rdx] + vmovdqa ymm14,YMMWORD[160+rdx] + + vpcmpeqd ymm7,ymm4,ymm1 + vpcmpeqd ymm11,ymm8,ymm1 + vpcmpeqd ymm15,ymm12,ymm1 + + vpaddd ymm4,ymm4,ymm0 + vpaddd ymm8,ymm8,ymm0 + vpaddd ymm12,ymm12,ymm0 + lea rdx,[192+rdx] + + vpand ymm5,ymm5,ymm7 + vpand ymm6,ymm6,ymm7 + vpand ymm9,ymm9,ymm11 + vpand ymm10,ymm10,ymm11 + vpand ymm13,ymm13,ymm15 + vpand ymm14,ymm14,ymm15 + + vpxor ymm2,ymm2,ymm5 + vpxor ymm3,ymm3,ymm6 + vpxor ymm2,ymm2,ymm9 + vpxor ymm3,ymm3,ymm10 + vpxor ymm2,ymm2,ymm13 + vpxor ymm3,ymm3,ymm14 + + dec rax + jnz NEAR $L$select_loop_avx2_w7 + + + vmovdqa ymm5,YMMWORD[rdx] + vmovdqa ymm6,YMMWORD[32+rdx] + + vpcmpeqd ymm7,ymm4,ymm1 + + vpand ymm5,ymm5,ymm7 + vpand ymm6,ymm6,ymm7 + + vpxor ymm2,ymm2,ymm5 + vpxor ymm3,ymm3,ymm6 + + vmovdqu YMMWORD[rcx],ymm2 + vmovdqu YMMWORD[32+rcx],ymm3 + vzeroupper + movaps xmm6,XMMWORD[rsp] + movaps xmm7,XMMWORD[16+rsp] + movaps xmm8,XMMWORD[32+rsp] + movaps xmm9,XMMWORD[48+rsp] + movaps xmm10,XMMWORD[64+rsp] + movaps xmm11,XMMWORD[80+rsp] + movaps xmm12,XMMWORD[96+rsp] + movaps xmm13,XMMWORD[112+rsp] + movaps xmm14,XMMWORD[128+rsp] + movaps xmm15,XMMWORD[144+rsp] + lea rsp,[r11] + ret + +$L$SEH_end_ecp_nistz256_select_w7_avx2: + + +ALIGN 32 +__ecp_nistz256_add_toq: + + xor r11,r11 + add r12,QWORD[rbx] + adc r13,QWORD[8+rbx] + mov rax,r12 + adc r8,QWORD[16+rbx] + adc r9,QWORD[24+rbx] + mov rbp,r13 + adc r11,0 + + sub r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + sbb r11,0 + + cmovc r12,rax + cmovc r13,rbp + mov QWORD[rdi],r12 + cmovc r8,rcx + mov QWORD[8+rdi],r13 + cmovc r9,r10 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + ret + + + + +ALIGN 32 +__ecp_nistz256_sub_fromq: + + sub r12,QWORD[rbx] + sbb r13,QWORD[8+rbx] + mov rax,r12 + sbb r8,QWORD[16+rbx] + sbb r9,QWORD[24+rbx] + mov rbp,r13 + sbb r11,r11 + + add r12,-1 + mov rcx,r8 + adc r13,r14 + adc 
r8,0 + mov r10,r9 + adc r9,r15 + test r11,r11 + + cmovz r12,rax + cmovz r13,rbp + mov QWORD[rdi],r12 + cmovz r8,rcx + mov QWORD[8+rdi],r13 + cmovz r9,r10 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + ret + + + + +ALIGN 32 +__ecp_nistz256_subq: + + sub rax,r12 + sbb rbp,r13 + mov r12,rax + sbb rcx,r8 + sbb r10,r9 + mov r13,rbp + sbb r11,r11 + + add rax,-1 + mov r8,rcx + adc rbp,r14 + adc rcx,0 + mov r9,r10 + adc r10,r15 + test r11,r11 + + cmovnz r12,rax + cmovnz r13,rbp + cmovnz r8,rcx + cmovnz r9,r10 + + ret + + + + +ALIGN 32 +__ecp_nistz256_mul_by_2q: + + xor r11,r11 + add r12,r12 + adc r13,r13 + mov rax,r12 + adc r8,r8 + adc r9,r9 + mov rbp,r13 + adc r11,0 + + sub r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + sbb r11,0 + + cmovc r12,rax + cmovc r13,rbp + mov QWORD[rdi],r12 + cmovc r8,rcx + mov QWORD[8+rdi],r13 + cmovc r9,r10 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + ret + + +global ecp_nistz256_point_double_nohw + +ALIGN 32 +ecp_nistz256_point_double_nohw: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_point_double_nohw: + mov rdi,rcx + mov rsi,rdx + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,32*5+8 + +$L$point_doubleq_body: + +$L$point_double_shortcutq: + movdqu xmm0,XMMWORD[rsi] + mov rbx,rsi + movdqu xmm1,XMMWORD[16+rsi] + mov r12,QWORD[((32+0))+rsi] + mov r13,QWORD[((32+8))+rsi] + mov r8,QWORD[((32+16))+rsi] + mov r9,QWORD[((32+24))+rsi] + mov r14,QWORD[(($L$poly+8))] + mov r15,QWORD[(($L$poly+24))] + movdqa XMMWORD[96+rsp],xmm0 + movdqa XMMWORD[(96+16)+rsp],xmm1 + lea r10,[32+rdi] + lea r11,[64+rdi] + movq xmm0,rdi + movq xmm1,r10 + movq xmm2,r11 + + lea rdi,[rsp] + call __ecp_nistz256_mul_by_2q + + mov rax,QWORD[((64+0))+rsi] + mov r14,QWORD[((64+8))+rsi] + mov r15,QWORD[((64+16))+rsi] + mov r8,QWORD[((64+24))+rsi] + lea rsi,[((64-0))+rsi] + lea rdi,[64+rsp] + call __ecp_nistz256_sqr_montq + + mov 
rax,QWORD[((0+0))+rsp] + mov r14,QWORD[((8+0))+rsp] + lea rsi,[((0+0))+rsp] + mov r15,QWORD[((16+0))+rsp] + mov r8,QWORD[((24+0))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_sqr_montq + + mov rax,QWORD[32+rbx] + mov r9,QWORD[((64+0))+rbx] + mov r10,QWORD[((64+8))+rbx] + mov r11,QWORD[((64+16))+rbx] + mov r12,QWORD[((64+24))+rbx] + lea rsi,[((64-0))+rbx] + lea rbx,[32+rbx] + movq rdi,xmm2 + call __ecp_nistz256_mul_montq + call __ecp_nistz256_mul_by_2q + + mov r12,QWORD[((96+0))+rsp] + mov r13,QWORD[((96+8))+rsp] + lea rbx,[64+rsp] + mov r8,QWORD[((96+16))+rsp] + mov r9,QWORD[((96+24))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_add_toq + + mov r12,QWORD[((96+0))+rsp] + mov r13,QWORD[((96+8))+rsp] + lea rbx,[64+rsp] + mov r8,QWORD[((96+16))+rsp] + mov r9,QWORD[((96+24))+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_sub_fromq + + mov rax,QWORD[((0+0))+rsp] + mov r14,QWORD[((8+0))+rsp] + lea rsi,[((0+0))+rsp] + mov r15,QWORD[((16+0))+rsp] + mov r8,QWORD[((24+0))+rsp] + movq rdi,xmm1 + call __ecp_nistz256_sqr_montq + xor r9,r9 + mov rax,r12 + add r12,-1 + mov r10,r13 + adc r13,rsi + mov rcx,r14 + adc r14,0 + mov r8,r15 + adc r15,rbp + adc r9,0 + xor rsi,rsi + test rax,1 + + cmovz r12,rax + cmovz r13,r10 + cmovz r14,rcx + cmovz r15,r8 + cmovz r9,rsi + + mov rax,r13 + shr r12,1 + shl rax,63 + mov r10,r14 + shr r13,1 + or r12,rax + shl r10,63 + mov rcx,r15 + shr r14,1 + or r13,r10 + shl rcx,63 + mov QWORD[rdi],r12 + shr r15,1 + mov QWORD[8+rdi],r13 + shl r9,63 + or r14,rcx + or r15,r9 + mov QWORD[16+rdi],r14 + mov QWORD[24+rdi],r15 + mov rax,QWORD[64+rsp] + lea rbx,[64+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montq + + lea rdi,[128+rsp] + call __ecp_nistz256_mul_by_2q + + lea rbx,[32+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_add_toq + + mov rax,QWORD[96+rsp] + lea rbx,[96+rsp] + mov r9,QWORD[((0+0))+rsp] + mov 
r10,QWORD[((8+0))+rsp] + lea rsi,[((0+0))+rsp] + mov r11,QWORD[((16+0))+rsp] + mov r12,QWORD[((24+0))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_mul_montq + + lea rdi,[128+rsp] + call __ecp_nistz256_mul_by_2q + + mov rax,QWORD[((0+32))+rsp] + mov r14,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r15,QWORD[((16+32))+rsp] + mov r8,QWORD[((24+32))+rsp] + movq rdi,xmm0 + call __ecp_nistz256_sqr_montq + + lea rbx,[128+rsp] + mov r8,r14 + mov r9,r15 + mov r14,rsi + mov r15,rbp + call __ecp_nistz256_sub_fromq + + mov rax,QWORD[((0+0))+rsp] + mov rbp,QWORD[((0+8))+rsp] + mov rcx,QWORD[((0+16))+rsp] + mov r10,QWORD[((0+24))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_subq + + mov rax,QWORD[32+rsp] + lea rbx,[32+rsp] + mov r14,r12 + xor ecx,ecx + mov QWORD[((0+0))+rsp],r12 + mov r10,r13 + mov QWORD[((0+8))+rsp],r13 + cmovz r11,r8 + mov QWORD[((0+16))+rsp],r8 + lea rsi,[((0-0))+rsp] + cmovz r12,r9 + mov QWORD[((0+24))+rsp],r9 + mov r9,r14 + lea rdi,[rsp] + call __ecp_nistz256_mul_montq + + movq rbx,xmm1 + movq rdi,xmm1 + call __ecp_nistz256_sub_fromq + + lea rsi,[((160+56))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbx,QWORD[((-16))+rsi] + + mov rbp,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$point_doubleq_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_point_double_nohw: +global ecp_nistz256_point_add_nohw + +ALIGN 32 +ecp_nistz256_point_add_nohw: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_point_add_nohw: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,32*18+8 + +$L$point_addq_body: + + movdqu xmm0,XMMWORD[rsi] + movdqu xmm1,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm3,XMMWORD[48+rsi] + movdqu xmm4,XMMWORD[64+rsi] + movdqu xmm5,XMMWORD[80+rsi] + mov 
rbx,rsi + mov rsi,rdx + movdqa XMMWORD[384+rsp],xmm0 + movdqa XMMWORD[(384+16)+rsp],xmm1 + movdqa XMMWORD[416+rsp],xmm2 + movdqa XMMWORD[(416+16)+rsp],xmm3 + movdqa XMMWORD[448+rsp],xmm4 + movdqa XMMWORD[(448+16)+rsp],xmm5 + por xmm5,xmm4 + + movdqu xmm0,XMMWORD[rsi] + pshufd xmm3,xmm5,0xb1 + movdqu xmm1,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + por xmm5,xmm3 + movdqu xmm3,XMMWORD[48+rsi] + mov rax,QWORD[((64+0))+rsi] + mov r14,QWORD[((64+8))+rsi] + mov r15,QWORD[((64+16))+rsi] + mov r8,QWORD[((64+24))+rsi] + movdqa XMMWORD[480+rsp],xmm0 + pshufd xmm4,xmm5,0x1e + movdqa XMMWORD[(480+16)+rsp],xmm1 + movdqu xmm0,XMMWORD[64+rsi] + movdqu xmm1,XMMWORD[80+rsi] + movdqa XMMWORD[512+rsp],xmm2 + movdqa XMMWORD[(512+16)+rsp],xmm3 + por xmm5,xmm4 + pxor xmm4,xmm4 + por xmm1,xmm0 + movq xmm0,rdi + + lea rsi,[((64-0))+rsi] + mov QWORD[((544+0))+rsp],rax + mov QWORD[((544+8))+rsp],r14 + mov QWORD[((544+16))+rsp],r15 + mov QWORD[((544+24))+rsp],r8 + lea rdi,[96+rsp] + call __ecp_nistz256_sqr_montq + + pcmpeqd xmm5,xmm4 + pshufd xmm4,xmm1,0xb1 + por xmm4,xmm1 + pshufd xmm5,xmm5,0 + pshufd xmm3,xmm4,0x1e + por xmm4,xmm3 + pxor xmm3,xmm3 + pcmpeqd xmm4,xmm3 + pshufd xmm4,xmm4,0 + mov rax,QWORD[((64+0))+rbx] + mov r14,QWORD[((64+8))+rbx] + mov r15,QWORD[((64+16))+rbx] + mov r8,QWORD[((64+24))+rbx] + movq xmm1,rbx + + lea rsi,[((64-0))+rbx] + lea rdi,[32+rsp] + call __ecp_nistz256_sqr_montq + + mov rax,QWORD[544+rsp] + lea rbx,[544+rsp] + mov r9,QWORD[((0+96))+rsp] + mov r10,QWORD[((8+96))+rsp] + lea rsi,[((0+96))+rsp] + mov r11,QWORD[((16+96))+rsp] + mov r12,QWORD[((24+96))+rsp] + lea rdi,[224+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[448+rsp] + lea rbx,[448+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[416+rsp] + lea rbx,[416+rsp] + mov r9,QWORD[((0+224))+rsp] + mov r10,QWORD[((8+224))+rsp] 
+ lea rsi,[((0+224))+rsp] + mov r11,QWORD[((16+224))+rsp] + mov r12,QWORD[((24+224))+rsp] + lea rdi,[224+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[512+rsp] + lea rbx,[512+rsp] + mov r9,QWORD[((0+256))+rsp] + mov r10,QWORD[((8+256))+rsp] + lea rsi,[((0+256))+rsp] + mov r11,QWORD[((16+256))+rsp] + mov r12,QWORD[((24+256))+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_mul_montq + + lea rbx,[224+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_sub_fromq + + or r12,r13 + movdqa xmm2,xmm4 + or r12,r8 + or r12,r9 + por xmm2,xmm5 + movq xmm3,r12 + + mov rax,QWORD[384+rsp] + lea rbx,[384+rsp] + mov r9,QWORD[((0+96))+rsp] + mov r10,QWORD[((8+96))+rsp] + lea rsi,[((0+96))+rsp] + mov r11,QWORD[((16+96))+rsp] + mov r12,QWORD[((24+96))+rsp] + lea rdi,[160+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[480+rsp] + lea rbx,[480+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[192+rsp] + call __ecp_nistz256_mul_montq + + lea rbx,[160+rsp] + lea rdi,[rsp] + call __ecp_nistz256_sub_fromq + + or r12,r13 + or r12,r8 + or r12,r9 + + movq r8,xmm2 + movq r9,xmm3 + or r12,r8 + DB 0x3e + jnz NEAR $L$add_proceedq + + + + test r9,r9 + jz NEAR $L$add_doubleq + + + + + + + movq rdi,xmm0 + pxor xmm0,xmm0 + movdqu XMMWORD[rdi],xmm0 + movdqu XMMWORD[16+rdi],xmm0 + movdqu XMMWORD[32+rdi],xmm0 + movdqu XMMWORD[48+rdi],xmm0 + movdqu XMMWORD[64+rdi],xmm0 + movdqu XMMWORD[80+rdi],xmm0 + jmp NEAR $L$add_doneq + +ALIGN 32 +$L$add_doubleq: + movq rsi,xmm1 + movq rdi,xmm0 + add rsp,416 + + jmp NEAR $L$point_double_shortcutq + + +ALIGN 32 +$L$add_proceedq: + mov rax,QWORD[((0+64))+rsp] + mov r14,QWORD[((8+64))+rsp] + lea rsi,[((0+64))+rsp] + mov r15,QWORD[((16+64))+rsp] + mov r8,QWORD[((24+64))+rsp] + lea rdi,[96+rsp] + call __ecp_nistz256_sqr_montq + + mov rax,QWORD[448+rsp] + lea rbx,[448+rsp] + mov r9,QWORD[((0+0))+rsp] + mov r10,QWORD[((8+0))+rsp] + lea rsi,[((0+0))+rsp] + 
mov r11,QWORD[((16+0))+rsp] + mov r12,QWORD[((24+0))+rsp] + lea rdi,[352+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[((0+0))+rsp] + mov r14,QWORD[((8+0))+rsp] + lea rsi,[((0+0))+rsp] + mov r15,QWORD[((16+0))+rsp] + mov r8,QWORD[((24+0))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_sqr_montq + + mov rax,QWORD[544+rsp] + lea rbx,[544+rsp] + mov r9,QWORD[((0+352))+rsp] + mov r10,QWORD[((8+352))+rsp] + lea rsi,[((0+352))+rsp] + mov r11,QWORD[((16+352))+rsp] + mov r12,QWORD[((24+352))+rsp] + lea rdi,[352+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[rsp] + lea rbx,[rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[128+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[160+rsp] + lea rbx,[160+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[192+rsp] + call __ecp_nistz256_mul_montq + + + + + xor r11,r11 + add r12,r12 + lea rsi,[96+rsp] + adc r13,r13 + mov rax,r12 + adc r8,r8 + adc r9,r9 + mov rbp,r13 + adc r11,0 + + sub r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + sbb r11,0 + + cmovc r12,rax + mov rax,QWORD[rsi] + cmovc r13,rbp + mov rbp,QWORD[8+rsi] + cmovc r8,rcx + mov rcx,QWORD[16+rsi] + cmovc r9,r10 + mov r10,QWORD[24+rsi] + + call __ecp_nistz256_subq + + lea rbx,[128+rsp] + lea rdi,[288+rsp] + call __ecp_nistz256_sub_fromq + + mov rax,QWORD[((192+0))+rsp] + mov rbp,QWORD[((192+8))+rsp] + mov rcx,QWORD[((192+16))+rsp] + mov r10,QWORD[((192+24))+rsp] + lea rdi,[320+rsp] + + call __ecp_nistz256_subq + + mov QWORD[rdi],r12 + mov QWORD[8+rdi],r13 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + mov rax,QWORD[128+rsp] + lea rbx,[128+rsp] + mov r9,QWORD[((0+224))+rsp] + mov r10,QWORD[((8+224))+rsp] + lea rsi,[((0+224))+rsp] + mov r11,QWORD[((16+224))+rsp] + mov r12,QWORD[((24+224))+rsp] + lea 
rdi,[256+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[320+rsp] + lea rbx,[320+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((0+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[320+rsp] + call __ecp_nistz256_mul_montq + + lea rbx,[256+rsp] + lea rdi,[320+rsp] + call __ecp_nistz256_sub_fromq + + movq rdi,xmm0 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[352+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((352+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[544+rsp] + pand xmm3,XMMWORD[((544+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[448+rsp] + pand xmm3,XMMWORD[((448+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[64+rdi],xmm2 + movdqu XMMWORD[80+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[288+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((288+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[480+rsp] + pand xmm3,XMMWORD[((480+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[384+rsp] + pand xmm3,XMMWORD[((384+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[rdi],xmm2 + movdqu XMMWORD[16+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[320+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((320+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[512+rsp] + pand xmm3,XMMWORD[((512+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[416+rsp] + pand xmm3,XMMWORD[((416+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[32+rdi],xmm2 + movdqu XMMWORD[48+rdi],xmm3 + +$L$add_doneq: + lea rsi,[((576+56))+rsp] + + mov r15,QWORD[((-48))+rsi] + + 
mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbx,QWORD[((-16))+rsi] + + mov rbp,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$point_addq_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_point_add_nohw: +global ecp_nistz256_point_add_affine_nohw + +ALIGN 32 +ecp_nistz256_point_add_affine_nohw: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_point_add_affine_nohw: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,32*15+8 + +$L$add_affineq_body: + + movdqu xmm0,XMMWORD[rsi] + mov rbx,rdx + movdqu xmm1,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm3,XMMWORD[48+rsi] + movdqu xmm4,XMMWORD[64+rsi] + movdqu xmm5,XMMWORD[80+rsi] + mov rax,QWORD[((64+0))+rsi] + mov r14,QWORD[((64+8))+rsi] + mov r15,QWORD[((64+16))+rsi] + mov r8,QWORD[((64+24))+rsi] + movdqa XMMWORD[320+rsp],xmm0 + movdqa XMMWORD[(320+16)+rsp],xmm1 + movdqa XMMWORD[352+rsp],xmm2 + movdqa XMMWORD[(352+16)+rsp],xmm3 + movdqa XMMWORD[384+rsp],xmm4 + movdqa XMMWORD[(384+16)+rsp],xmm5 + por xmm5,xmm4 + + movdqu xmm0,XMMWORD[rbx] + pshufd xmm3,xmm5,0xb1 + movdqu xmm1,XMMWORD[16+rbx] + movdqu xmm2,XMMWORD[32+rbx] + por xmm5,xmm3 + movdqu xmm3,XMMWORD[48+rbx] + movdqa XMMWORD[416+rsp],xmm0 + pshufd xmm4,xmm5,0x1e + movdqa XMMWORD[(416+16)+rsp],xmm1 + por xmm1,xmm0 + movq xmm0,rdi + movdqa XMMWORD[448+rsp],xmm2 + movdqa XMMWORD[(448+16)+rsp],xmm3 + por xmm3,xmm2 + por xmm5,xmm4 + pxor xmm4,xmm4 + por xmm3,xmm1 + + lea rsi,[((64-0))+rsi] + lea rdi,[32+rsp] + call __ecp_nistz256_sqr_montq + + pcmpeqd xmm5,xmm4 + pshufd xmm4,xmm3,0xb1 + mov rax,QWORD[rbx] + + mov r9,r12 + por xmm4,xmm3 + pshufd xmm5,xmm5,0 + pshufd xmm3,xmm4,0x1e + mov r10,r13 + por xmm4,xmm3 + pxor xmm3,xmm3 + mov r11,r14 + pcmpeqd xmm4,xmm3 + pshufd xmm4,xmm4,0 + + lea rsi,[((32-0))+rsp] 
+ mov r12,r15 + lea rdi,[rsp] + call __ecp_nistz256_mul_montq + + lea rbx,[320+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_sub_fromq + + mov rax,QWORD[384+rsp] + lea rbx,[384+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[384+rsp] + lea rbx,[384+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((0+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[288+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[448+rsp] + lea rbx,[448+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((0+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montq + + lea rbx,[352+rsp] + lea rdi,[96+rsp] + call __ecp_nistz256_sub_fromq + + mov rax,QWORD[((0+64))+rsp] + mov r14,QWORD[((8+64))+rsp] + lea rsi,[((0+64))+rsp] + mov r15,QWORD[((16+64))+rsp] + mov r8,QWORD[((24+64))+rsp] + lea rdi,[128+rsp] + call __ecp_nistz256_sqr_montq + + mov rax,QWORD[((0+96))+rsp] + mov r14,QWORD[((8+96))+rsp] + lea rsi,[((0+96))+rsp] + mov r15,QWORD[((16+96))+rsp] + mov r8,QWORD[((24+96))+rsp] + lea rdi,[192+rsp] + call __ecp_nistz256_sqr_montq + + mov rax,QWORD[128+rsp] + lea rbx,[128+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((0+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[160+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[320+rsp] + lea rbx,[320+rsp] + mov r9,QWORD[((0+128))+rsp] + mov r10,QWORD[((8+128))+rsp] + lea rsi,[((0+128))+rsp] + mov r11,QWORD[((16+128))+rsp] + mov r12,QWORD[((24+128))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_mul_montq + + + + + xor r11,r11 + add r12,r12 + lea rsi,[192+rsp] + adc r13,r13 + mov rax,r12 + adc r8,r8 + adc r9,r9 + mov rbp,r13 + adc r11,0 + + sub r12,-1 + mov rcx,r8 + sbb 
r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + sbb r11,0 + + cmovc r12,rax + mov rax,QWORD[rsi] + cmovc r13,rbp + mov rbp,QWORD[8+rsi] + cmovc r8,rcx + mov rcx,QWORD[16+rsi] + cmovc r9,r10 + mov r10,QWORD[24+rsi] + + call __ecp_nistz256_subq + + lea rbx,[160+rsp] + lea rdi,[224+rsp] + call __ecp_nistz256_sub_fromq + + mov rax,QWORD[((0+0))+rsp] + mov rbp,QWORD[((0+8))+rsp] + mov rcx,QWORD[((0+16))+rsp] + mov r10,QWORD[((0+24))+rsp] + lea rdi,[64+rsp] + + call __ecp_nistz256_subq + + mov QWORD[rdi],r12 + mov QWORD[8+rdi],r13 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + mov rax,QWORD[352+rsp] + lea rbx,[352+rsp] + mov r9,QWORD[((0+160))+rsp] + mov r10,QWORD[((8+160))+rsp] + lea rsi,[((0+160))+rsp] + mov r11,QWORD[((16+160))+rsp] + mov r12,QWORD[((24+160))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montq + + mov rax,QWORD[96+rsp] + lea rbx,[96+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((0+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_mul_montq + + lea rbx,[32+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_sub_fromq + + movq rdi,xmm0 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[288+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((288+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[$L$ONE_mont] + pand xmm3,XMMWORD[(($L$ONE_mont+16))] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[384+rsp] + pand xmm3,XMMWORD[((384+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[64+rdi],xmm2 + movdqu XMMWORD[80+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[224+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((224+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[416+rsp] + pand xmm3,XMMWORD[((416+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + 
pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[320+rsp] + pand xmm3,XMMWORD[((320+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[rdi],xmm2 + movdqu XMMWORD[16+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[256+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((256+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[448+rsp] + pand xmm3,XMMWORD[((448+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[352+rsp] + pand xmm3,XMMWORD[((352+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[32+rdi],xmm2 + movdqu XMMWORD[48+rdi],xmm3 + + lea rsi,[((480+56))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbx,QWORD[((-16))+rsi] + + mov rbp,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$add_affineq_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_point_add_affine_nohw: + +ALIGN 32 +__ecp_nistz256_add_tox: + + xor r11,r11 + adc r12,QWORD[rbx] + adc r13,QWORD[8+rbx] + mov rax,r12 + adc r8,QWORD[16+rbx] + adc r9,QWORD[24+rbx] + mov rbp,r13 + adc r11,0 + + xor r10,r10 + sbb r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + sbb r11,0 + + cmovc r12,rax + cmovc r13,rbp + mov QWORD[rdi],r12 + cmovc r8,rcx + mov QWORD[8+rdi],r13 + cmovc r9,r10 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + ret + + + + +ALIGN 32 +__ecp_nistz256_sub_fromx: + + xor r11,r11 + sbb r12,QWORD[rbx] + sbb r13,QWORD[8+rbx] + mov rax,r12 + sbb r8,QWORD[16+rbx] + sbb r9,QWORD[24+rbx] + mov rbp,r13 + sbb r11,0 + + xor r10,r10 + adc r12,-1 + mov rcx,r8 + adc r13,r14 + adc r8,0 + mov r10,r9 + adc r9,r15 + + bt r11,0 + cmovnc r12,rax + cmovnc r13,rbp + mov QWORD[rdi],r12 + cmovnc r8,rcx + mov QWORD[8+rdi],r13 + cmovnc r9,r10 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + ret + + + + 
+ALIGN 32 +__ecp_nistz256_subx: + + xor r11,r11 + sbb rax,r12 + sbb rbp,r13 + mov r12,rax + sbb rcx,r8 + sbb r10,r9 + mov r13,rbp + sbb r11,0 + + xor r9,r9 + adc rax,-1 + mov r8,rcx + adc rbp,r14 + adc rcx,0 + mov r9,r10 + adc r10,r15 + + bt r11,0 + cmovc r12,rax + cmovc r13,rbp + cmovc r8,rcx + cmovc r9,r10 + + ret + + + + +ALIGN 32 +__ecp_nistz256_mul_by_2x: + + xor r11,r11 + adc r12,r12 + adc r13,r13 + mov rax,r12 + adc r8,r8 + adc r9,r9 + mov rbp,r13 + adc r11,0 + + xor r10,r10 + sbb r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + sbb r11,0 + + cmovc r12,rax + cmovc r13,rbp + mov QWORD[rdi],r12 + cmovc r8,rcx + mov QWORD[8+rdi],r13 + cmovc r9,r10 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + + ret + + +global ecp_nistz256_point_double_adx + +ALIGN 32 +ecp_nistz256_point_double_adx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_point_double_adx: + mov rdi,rcx + mov rsi,rdx + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,32*5+8 + +$L$point_doublex_body: + +$L$point_double_shortcutx: + movdqu xmm0,XMMWORD[rsi] + mov rbx,rsi + movdqu xmm1,XMMWORD[16+rsi] + mov r12,QWORD[((32+0))+rsi] + mov r13,QWORD[((32+8))+rsi] + mov r8,QWORD[((32+16))+rsi] + mov r9,QWORD[((32+24))+rsi] + mov r14,QWORD[(($L$poly+8))] + mov r15,QWORD[(($L$poly+24))] + movdqa XMMWORD[96+rsp],xmm0 + movdqa XMMWORD[(96+16)+rsp],xmm1 + lea r10,[32+rdi] + lea r11,[64+rdi] + movq xmm0,rdi + movq xmm1,r10 + movq xmm2,r11 + + lea rdi,[rsp] + call __ecp_nistz256_mul_by_2x + + mov rdx,QWORD[((64+0))+rsi] + mov r14,QWORD[((64+8))+rsi] + mov r15,QWORD[((64+16))+rsi] + mov r8,QWORD[((64+24))+rsi] + lea rsi,[((64-128))+rsi] + lea rdi,[64+rsp] + call __ecp_nistz256_sqr_montx + + mov rdx,QWORD[((0+0))+rsp] + mov r14,QWORD[((8+0))+rsp] + lea rsi,[((-128+0))+rsp] + mov r15,QWORD[((16+0))+rsp] + mov r8,QWORD[((24+0))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_sqr_montx + + 
mov rdx,QWORD[32+rbx] + mov r9,QWORD[((64+0))+rbx] + mov r10,QWORD[((64+8))+rbx] + mov r11,QWORD[((64+16))+rbx] + mov r12,QWORD[((64+24))+rbx] + lea rsi,[((64-128))+rbx] + lea rbx,[32+rbx] + movq rdi,xmm2 + call __ecp_nistz256_mul_montx + call __ecp_nistz256_mul_by_2x + + mov r12,QWORD[((96+0))+rsp] + mov r13,QWORD[((96+8))+rsp] + lea rbx,[64+rsp] + mov r8,QWORD[((96+16))+rsp] + mov r9,QWORD[((96+24))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_add_tox + + mov r12,QWORD[((96+0))+rsp] + mov r13,QWORD[((96+8))+rsp] + lea rbx,[64+rsp] + mov r8,QWORD[((96+16))+rsp] + mov r9,QWORD[((96+24))+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_sub_fromx + + mov rdx,QWORD[((0+0))+rsp] + mov r14,QWORD[((8+0))+rsp] + lea rsi,[((-128+0))+rsp] + mov r15,QWORD[((16+0))+rsp] + mov r8,QWORD[((24+0))+rsp] + movq rdi,xmm1 + call __ecp_nistz256_sqr_montx + xor r9,r9 + mov rax,r12 + add r12,-1 + mov r10,r13 + adc r13,rsi + mov rcx,r14 + adc r14,0 + mov r8,r15 + adc r15,rbp + adc r9,0 + xor rsi,rsi + test rax,1 + + cmovz r12,rax + cmovz r13,r10 + cmovz r14,rcx + cmovz r15,r8 + cmovz r9,rsi + + mov rax,r13 + shr r12,1 + shl rax,63 + mov r10,r14 + shr r13,1 + or r12,rax + shl r10,63 + mov rcx,r15 + shr r14,1 + or r13,r10 + shl rcx,63 + mov QWORD[rdi],r12 + shr r15,1 + mov QWORD[8+rdi],r13 + shl r9,63 + or r14,rcx + or r15,r9 + mov QWORD[16+rdi],r14 + mov QWORD[24+rdi],r15 + mov rdx,QWORD[64+rsp] + lea rbx,[64+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montx + + lea rdi,[128+rsp] + call __ecp_nistz256_mul_by_2x + + lea rbx,[32+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_add_tox + + mov rdx,QWORD[96+rsp] + lea rbx,[96+rsp] + mov r9,QWORD[((0+0))+rsp] + mov r10,QWORD[((8+0))+rsp] + lea rsi,[((-128+0))+rsp] + mov r11,QWORD[((16+0))+rsp] + mov r12,QWORD[((24+0))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_mul_montx + + lea rdi,[128+rsp] + 
call __ecp_nistz256_mul_by_2x + + mov rdx,QWORD[((0+32))+rsp] + mov r14,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r15,QWORD[((16+32))+rsp] + mov r8,QWORD[((24+32))+rsp] + movq rdi,xmm0 + call __ecp_nistz256_sqr_montx + + lea rbx,[128+rsp] + mov r8,r14 + mov r9,r15 + mov r14,rsi + mov r15,rbp + call __ecp_nistz256_sub_fromx + + mov rax,QWORD[((0+0))+rsp] + mov rbp,QWORD[((0+8))+rsp] + mov rcx,QWORD[((0+16))+rsp] + mov r10,QWORD[((0+24))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_subx + + mov rdx,QWORD[32+rsp] + lea rbx,[32+rsp] + mov r14,r12 + xor ecx,ecx + mov QWORD[((0+0))+rsp],r12 + mov r10,r13 + mov QWORD[((0+8))+rsp],r13 + cmovz r11,r8 + mov QWORD[((0+16))+rsp],r8 + lea rsi,[((0-128))+rsp] + cmovz r12,r9 + mov QWORD[((0+24))+rsp],r9 + mov r9,r14 + lea rdi,[rsp] + call __ecp_nistz256_mul_montx + + movq rbx,xmm1 + movq rdi,xmm1 + call __ecp_nistz256_sub_fromx + + lea rsi,[((160+56))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbx,QWORD[((-16))+rsi] + + mov rbp,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$point_doublex_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_point_double_adx: +global ecp_nistz256_point_add_adx + +ALIGN 32 +ecp_nistz256_point_add_adx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_point_add_adx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,32*18+8 + +$L$point_addx_body: + + movdqu xmm0,XMMWORD[rsi] + movdqu xmm1,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm3,XMMWORD[48+rsi] + movdqu xmm4,XMMWORD[64+rsi] + movdqu xmm5,XMMWORD[80+rsi] + mov rbx,rsi + mov rsi,rdx + movdqa XMMWORD[384+rsp],xmm0 + movdqa XMMWORD[(384+16)+rsp],xmm1 + movdqa XMMWORD[416+rsp],xmm2 + movdqa XMMWORD[(416+16)+rsp],xmm3 + movdqa 
XMMWORD[448+rsp],xmm4 + movdqa XMMWORD[(448+16)+rsp],xmm5 + por xmm5,xmm4 + + movdqu xmm0,XMMWORD[rsi] + pshufd xmm3,xmm5,0xb1 + movdqu xmm1,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + por xmm5,xmm3 + movdqu xmm3,XMMWORD[48+rsi] + mov rdx,QWORD[((64+0))+rsi] + mov r14,QWORD[((64+8))+rsi] + mov r15,QWORD[((64+16))+rsi] + mov r8,QWORD[((64+24))+rsi] + movdqa XMMWORD[480+rsp],xmm0 + pshufd xmm4,xmm5,0x1e + movdqa XMMWORD[(480+16)+rsp],xmm1 + movdqu xmm0,XMMWORD[64+rsi] + movdqu xmm1,XMMWORD[80+rsi] + movdqa XMMWORD[512+rsp],xmm2 + movdqa XMMWORD[(512+16)+rsp],xmm3 + por xmm5,xmm4 + pxor xmm4,xmm4 + por xmm1,xmm0 + movq xmm0,rdi + + lea rsi,[((64-128))+rsi] + mov QWORD[((544+0))+rsp],rdx + mov QWORD[((544+8))+rsp],r14 + mov QWORD[((544+16))+rsp],r15 + mov QWORD[((544+24))+rsp],r8 + lea rdi,[96+rsp] + call __ecp_nistz256_sqr_montx + + pcmpeqd xmm5,xmm4 + pshufd xmm4,xmm1,0xb1 + por xmm4,xmm1 + pshufd xmm5,xmm5,0 + pshufd xmm3,xmm4,0x1e + por xmm4,xmm3 + pxor xmm3,xmm3 + pcmpeqd xmm4,xmm3 + pshufd xmm4,xmm4,0 + mov rdx,QWORD[((64+0))+rbx] + mov r14,QWORD[((64+8))+rbx] + mov r15,QWORD[((64+16))+rbx] + mov r8,QWORD[((64+24))+rbx] + movq xmm1,rbx + + lea rsi,[((64-128))+rbx] + lea rdi,[32+rsp] + call __ecp_nistz256_sqr_montx + + mov rdx,QWORD[544+rsp] + lea rbx,[544+rsp] + mov r9,QWORD[((0+96))+rsp] + mov r10,QWORD[((8+96))+rsp] + lea rsi,[((-128+96))+rsp] + mov r11,QWORD[((16+96))+rsp] + mov r12,QWORD[((24+96))+rsp] + lea rdi,[224+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[448+rsp] + lea rbx,[448+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[416+rsp] + lea rbx,[416+rsp] + mov r9,QWORD[((0+224))+rsp] + mov r10,QWORD[((8+224))+rsp] + lea rsi,[((-128+224))+rsp] + mov r11,QWORD[((16+224))+rsp] + mov r12,QWORD[((24+224))+rsp] + lea rdi,[224+rsp] + call __ecp_nistz256_mul_montx + + mov 
rdx,QWORD[512+rsp] + lea rbx,[512+rsp] + mov r9,QWORD[((0+256))+rsp] + mov r10,QWORD[((8+256))+rsp] + lea rsi,[((-128+256))+rsp] + mov r11,QWORD[((16+256))+rsp] + mov r12,QWORD[((24+256))+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_mul_montx + + lea rbx,[224+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_sub_fromx + + or r12,r13 + movdqa xmm2,xmm4 + or r12,r8 + or r12,r9 + por xmm2,xmm5 + movq xmm3,r12 + + mov rdx,QWORD[384+rsp] + lea rbx,[384+rsp] + mov r9,QWORD[((0+96))+rsp] + mov r10,QWORD[((8+96))+rsp] + lea rsi,[((-128+96))+rsp] + mov r11,QWORD[((16+96))+rsp] + mov r12,QWORD[((24+96))+rsp] + lea rdi,[160+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[480+rsp] + lea rbx,[480+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[192+rsp] + call __ecp_nistz256_mul_montx + + lea rbx,[160+rsp] + lea rdi,[rsp] + call __ecp_nistz256_sub_fromx + + or r12,r13 + or r12,r8 + or r12,r9 + + movq r8,xmm2 + movq r9,xmm3 + or r12,r8 + DB 0x3e + jnz NEAR $L$add_proceedx + + + + test r9,r9 + jz NEAR $L$add_doublex + + + + + + + movq rdi,xmm0 + pxor xmm0,xmm0 + movdqu XMMWORD[rdi],xmm0 + movdqu XMMWORD[16+rdi],xmm0 + movdqu XMMWORD[32+rdi],xmm0 + movdqu XMMWORD[48+rdi],xmm0 + movdqu XMMWORD[64+rdi],xmm0 + movdqu XMMWORD[80+rdi],xmm0 + jmp NEAR $L$add_donex + +ALIGN 32 +$L$add_doublex: + movq rsi,xmm1 + movq rdi,xmm0 + add rsp,416 + + jmp NEAR $L$point_double_shortcutx + + +ALIGN 32 +$L$add_proceedx: + mov rdx,QWORD[((0+64))+rsp] + mov r14,QWORD[((8+64))+rsp] + lea rsi,[((-128+64))+rsp] + mov r15,QWORD[((16+64))+rsp] + mov r8,QWORD[((24+64))+rsp] + lea rdi,[96+rsp] + call __ecp_nistz256_sqr_montx + + mov rdx,QWORD[448+rsp] + lea rbx,[448+rsp] + mov r9,QWORD[((0+0))+rsp] + mov r10,QWORD[((8+0))+rsp] + lea rsi,[((-128+0))+rsp] + mov r11,QWORD[((16+0))+rsp] + mov r12,QWORD[((24+0))+rsp] + lea rdi,[352+rsp] + call __ecp_nistz256_mul_montx + + mov 
rdx,QWORD[((0+0))+rsp] + mov r14,QWORD[((8+0))+rsp] + lea rsi,[((-128+0))+rsp] + mov r15,QWORD[((16+0))+rsp] + mov r8,QWORD[((24+0))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_sqr_montx + + mov rdx,QWORD[544+rsp] + lea rbx,[544+rsp] + mov r9,QWORD[((0+352))+rsp] + mov r10,QWORD[((8+352))+rsp] + lea rsi,[((-128+352))+rsp] + mov r11,QWORD[((16+352))+rsp] + mov r12,QWORD[((24+352))+rsp] + lea rdi,[352+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[rsp] + lea rbx,[rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[128+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[160+rsp] + lea rbx,[160+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[192+rsp] + call __ecp_nistz256_mul_montx + + + + + xor r11,r11 + add r12,r12 + lea rsi,[96+rsp] + adc r13,r13 + mov rax,r12 + adc r8,r8 + adc r9,r9 + mov rbp,r13 + adc r11,0 + + sub r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + sbb r11,0 + + cmovc r12,rax + mov rax,QWORD[rsi] + cmovc r13,rbp + mov rbp,QWORD[8+rsi] + cmovc r8,rcx + mov rcx,QWORD[16+rsi] + cmovc r9,r10 + mov r10,QWORD[24+rsi] + + call __ecp_nistz256_subx + + lea rbx,[128+rsp] + lea rdi,[288+rsp] + call __ecp_nistz256_sub_fromx + + mov rax,QWORD[((192+0))+rsp] + mov rbp,QWORD[((192+8))+rsp] + mov rcx,QWORD[((192+16))+rsp] + mov r10,QWORD[((192+24))+rsp] + lea rdi,[320+rsp] + + call __ecp_nistz256_subx + + mov QWORD[rdi],r12 + mov QWORD[8+rdi],r13 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + mov rdx,QWORD[128+rsp] + lea rbx,[128+rsp] + mov r9,QWORD[((0+224))+rsp] + mov r10,QWORD[((8+224))+rsp] + lea rsi,[((-128+224))+rsp] + mov r11,QWORD[((16+224))+rsp] + mov r12,QWORD[((24+224))+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[320+rsp] + lea rbx,[320+rsp] + mov 
r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((-128+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[320+rsp] + call __ecp_nistz256_mul_montx + + lea rbx,[256+rsp] + lea rdi,[320+rsp] + call __ecp_nistz256_sub_fromx + + movq rdi,xmm0 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[352+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((352+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[544+rsp] + pand xmm3,XMMWORD[((544+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[448+rsp] + pand xmm3,XMMWORD[((448+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[64+rdi],xmm2 + movdqu XMMWORD[80+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[288+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((288+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[480+rsp] + pand xmm3,XMMWORD[((480+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[384+rsp] + pand xmm3,XMMWORD[((384+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[rdi],xmm2 + movdqu XMMWORD[16+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[320+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((320+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[512+rsp] + pand xmm3,XMMWORD[((512+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[416+rsp] + pand xmm3,XMMWORD[((416+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[32+rdi],xmm2 + movdqu XMMWORD[48+rdi],xmm3 + +$L$add_donex: + lea rsi,[((576+56))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov 
rbx,QWORD[((-16))+rsi] + + mov rbp,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$point_addx_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_point_add_adx: +global ecp_nistz256_point_add_affine_adx + +ALIGN 32 +ecp_nistz256_point_add_affine_adx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ecp_nistz256_point_add_affine_adx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,32*15+8 + +$L$add_affinex_body: + + movdqu xmm0,XMMWORD[rsi] + mov rbx,rdx + movdqu xmm1,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm3,XMMWORD[48+rsi] + movdqu xmm4,XMMWORD[64+rsi] + movdqu xmm5,XMMWORD[80+rsi] + mov rdx,QWORD[((64+0))+rsi] + mov r14,QWORD[((64+8))+rsi] + mov r15,QWORD[((64+16))+rsi] + mov r8,QWORD[((64+24))+rsi] + movdqa XMMWORD[320+rsp],xmm0 + movdqa XMMWORD[(320+16)+rsp],xmm1 + movdqa XMMWORD[352+rsp],xmm2 + movdqa XMMWORD[(352+16)+rsp],xmm3 + movdqa XMMWORD[384+rsp],xmm4 + movdqa XMMWORD[(384+16)+rsp],xmm5 + por xmm5,xmm4 + + movdqu xmm0,XMMWORD[rbx] + pshufd xmm3,xmm5,0xb1 + movdqu xmm1,XMMWORD[16+rbx] + movdqu xmm2,XMMWORD[32+rbx] + por xmm5,xmm3 + movdqu xmm3,XMMWORD[48+rbx] + movdqa XMMWORD[416+rsp],xmm0 + pshufd xmm4,xmm5,0x1e + movdqa XMMWORD[(416+16)+rsp],xmm1 + por xmm1,xmm0 + movq xmm0,rdi + movdqa XMMWORD[448+rsp],xmm2 + movdqa XMMWORD[(448+16)+rsp],xmm3 + por xmm3,xmm2 + por xmm5,xmm4 + pxor xmm4,xmm4 + por xmm3,xmm1 + + lea rsi,[((64-128))+rsi] + lea rdi,[32+rsp] + call __ecp_nistz256_sqr_montx + + pcmpeqd xmm5,xmm4 + pshufd xmm4,xmm3,0xb1 + mov rdx,QWORD[rbx] + + mov r9,r12 + por xmm4,xmm3 + pshufd xmm5,xmm5,0 + pshufd xmm3,xmm4,0x1e + mov r10,r13 + por xmm4,xmm3 + pxor xmm3,xmm3 + mov r11,r14 + pcmpeqd xmm4,xmm3 + pshufd xmm4,xmm4,0 + + lea rsi,[((32-128))+rsp] + mov r12,r15 + lea rdi,[rsp] + call __ecp_nistz256_mul_montx + + lea rbx,[320+rsp] + lea 
rdi,[64+rsp] + call __ecp_nistz256_sub_fromx + + mov rdx,QWORD[384+rsp] + lea rbx,[384+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[384+rsp] + lea rbx,[384+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((-128+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[288+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[448+rsp] + lea rbx,[448+rsp] + mov r9,QWORD[((0+32))+rsp] + mov r10,QWORD[((8+32))+rsp] + lea rsi,[((-128+32))+rsp] + mov r11,QWORD[((16+32))+rsp] + mov r12,QWORD[((24+32))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montx + + lea rbx,[352+rsp] + lea rdi,[96+rsp] + call __ecp_nistz256_sub_fromx + + mov rdx,QWORD[((0+64))+rsp] + mov r14,QWORD[((8+64))+rsp] + lea rsi,[((-128+64))+rsp] + mov r15,QWORD[((16+64))+rsp] + mov r8,QWORD[((24+64))+rsp] + lea rdi,[128+rsp] + call __ecp_nistz256_sqr_montx + + mov rdx,QWORD[((0+96))+rsp] + mov r14,QWORD[((8+96))+rsp] + lea rsi,[((-128+96))+rsp] + mov r15,QWORD[((16+96))+rsp] + mov r8,QWORD[((24+96))+rsp] + lea rdi,[192+rsp] + call __ecp_nistz256_sqr_montx + + mov rdx,QWORD[128+rsp] + lea rbx,[128+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((-128+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[160+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[320+rsp] + lea rbx,[320+rsp] + mov r9,QWORD[((0+128))+rsp] + mov r10,QWORD[((8+128))+rsp] + lea rsi,[((-128+128))+rsp] + mov r11,QWORD[((16+128))+rsp] + mov r12,QWORD[((24+128))+rsp] + lea rdi,[rsp] + call __ecp_nistz256_mul_montx + + + + + xor r11,r11 + add r12,r12 + lea rsi,[192+rsp] + adc r13,r13 + mov rax,r12 + adc r8,r8 + adc r9,r9 + mov rbp,r13 + adc r11,0 + + sub r12,-1 + mov rcx,r8 + sbb r13,r14 + sbb r8,0 + mov r10,r9 + sbb r9,r15 + sbb r11,0 + + cmovc 
r12,rax + mov rax,QWORD[rsi] + cmovc r13,rbp + mov rbp,QWORD[8+rsi] + cmovc r8,rcx + mov rcx,QWORD[16+rsi] + cmovc r9,r10 + mov r10,QWORD[24+rsi] + + call __ecp_nistz256_subx + + lea rbx,[160+rsp] + lea rdi,[224+rsp] + call __ecp_nistz256_sub_fromx + + mov rax,QWORD[((0+0))+rsp] + mov rbp,QWORD[((0+8))+rsp] + mov rcx,QWORD[((0+16))+rsp] + mov r10,QWORD[((0+24))+rsp] + lea rdi,[64+rsp] + + call __ecp_nistz256_subx + + mov QWORD[rdi],r12 + mov QWORD[8+rdi],r13 + mov QWORD[16+rdi],r8 + mov QWORD[24+rdi],r9 + mov rdx,QWORD[352+rsp] + lea rbx,[352+rsp] + mov r9,QWORD[((0+160))+rsp] + mov r10,QWORD[((8+160))+rsp] + lea rsi,[((-128+160))+rsp] + mov r11,QWORD[((16+160))+rsp] + mov r12,QWORD[((24+160))+rsp] + lea rdi,[32+rsp] + call __ecp_nistz256_mul_montx + + mov rdx,QWORD[96+rsp] + lea rbx,[96+rsp] + mov r9,QWORD[((0+64))+rsp] + mov r10,QWORD[((8+64))+rsp] + lea rsi,[((-128+64))+rsp] + mov r11,QWORD[((16+64))+rsp] + mov r12,QWORD[((24+64))+rsp] + lea rdi,[64+rsp] + call __ecp_nistz256_mul_montx + + lea rbx,[32+rsp] + lea rdi,[256+rsp] + call __ecp_nistz256_sub_fromx + + movq rdi,xmm0 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[288+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((288+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[$L$ONE_mont] + pand xmm3,XMMWORD[(($L$ONE_mont+16))] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[384+rsp] + pand xmm3,XMMWORD[((384+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[64+rdi],xmm2 + movdqu XMMWORD[80+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[224+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((224+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[416+rsp] + pand xmm3,XMMWORD[((416+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand 
xmm2,XMMWORD[320+rsp] + pand xmm3,XMMWORD[((320+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[rdi],xmm2 + movdqu XMMWORD[16+rdi],xmm3 + + movdqa xmm0,xmm5 + movdqa xmm1,xmm5 + pandn xmm0,XMMWORD[256+rsp] + movdqa xmm2,xmm5 + pandn xmm1,XMMWORD[((256+16))+rsp] + movdqa xmm3,xmm5 + pand xmm2,XMMWORD[448+rsp] + pand xmm3,XMMWORD[((448+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + + movdqa xmm0,xmm4 + movdqa xmm1,xmm4 + pandn xmm0,xmm2 + movdqa xmm2,xmm4 + pandn xmm1,xmm3 + movdqa xmm3,xmm4 + pand xmm2,XMMWORD[352+rsp] + pand xmm3,XMMWORD[((352+16))+rsp] + por xmm2,xmm0 + por xmm3,xmm1 + movdqu XMMWORD[32+rdi],xmm2 + movdqu XMMWORD[48+rdi],xmm3 + + lea rsi,[((480+56))+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbx,QWORD[((-16))+rsi] + + mov rbp,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$add_affinex_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ecp_nistz256_point_add_affine_adx: +EXTERN __imp_RtlVirtualUnwind + + +ALIGN 16 +short_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + lea rax,[16+rax] + + mov r12,QWORD[((-8))+rax] + mov r13,QWORD[((-16))+rax] + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + + jmp NEAR $L$common_seh_tail + + + +ALIGN 16 +full_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail 
+ + mov rax,QWORD[152+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + mov r10d,DWORD[8+r11] + lea rax,[r10*1+rax] + + mov rbp,QWORD[((-8))+rax] + mov rbx,QWORD[((-16))+rax] + mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + mov r15,QWORD[((-48))+rax] + mov QWORD[144+r8],rbx + mov QWORD[160+r8],rbp + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + mov QWORD[232+r8],r14 + mov QWORD[240+r8],r15 + +$L$common_seh_tail: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + ret + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_ecp_nistz256_neg wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_neg wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_neg wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_ord_mul_mont_nohw wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_ord_mul_mont_nohw wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_ord_mul_mont_nohw wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_ord_sqr_mont_nohw wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_ord_sqr_mont_nohw wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_ord_sqr_mont_nohw wrt ..imagebase + DD $L$SEH_begin_ecp_nistz256_ord_mul_mont_adx wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_ord_mul_mont_adx wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_ord_mul_mont_adx wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_ord_sqr_mont_adx wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_ord_sqr_mont_adx wrt 
..imagebase + DD $L$SEH_info_ecp_nistz256_ord_sqr_mont_adx wrt ..imagebase + DD $L$SEH_begin_ecp_nistz256_mul_mont_nohw wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_mul_mont_nohw wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_mul_mont_nohw wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_sqr_mont_nohw wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_sqr_mont_nohw wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_sqr_mont_nohw wrt ..imagebase + DD $L$SEH_begin_ecp_nistz256_mul_mont_adx wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_mul_mont_adx wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_mul_mont_adx wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_sqr_mont_adx wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_sqr_mont_adx wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_sqr_mont_adx wrt ..imagebase + DD $L$SEH_begin_ecp_nistz256_select_w5_nohw wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_select_w5_nohw wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_select_wX_nohw wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_select_w7_nohw wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_select_w7_nohw wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_select_wX_nohw wrt ..imagebase + DD $L$SEH_begin_ecp_nistz256_select_w5_avx2 wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_select_w5_avx2 wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_select_wX_avx2 wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_select_w7_avx2 wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_select_w7_avx2 wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_select_wX_avx2 wrt ..imagebase + DD $L$SEH_begin_ecp_nistz256_point_double_nohw wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_point_double_nohw wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_point_double_nohw wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_point_add_nohw wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_point_add_nohw wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_point_add_nohw wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_point_add_affine_nohw wrt ..imagebase + DD 
$L$SEH_end_ecp_nistz256_point_add_affine_nohw wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_point_add_affine_nohw wrt ..imagebase + DD $L$SEH_begin_ecp_nistz256_point_double_adx wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_point_double_adx wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_point_double_adx wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_point_add_adx wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_point_add_adx wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_point_add_adx wrt ..imagebase + + DD $L$SEH_begin_ecp_nistz256_point_add_affine_adx wrt ..imagebase + DD $L$SEH_end_ecp_nistz256_point_add_affine_adx wrt ..imagebase + DD $L$SEH_info_ecp_nistz256_point_add_affine_adx wrt ..imagebase + +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_ecp_nistz256_neg: + DB 9,0,0,0 + DD short_handler wrt ..imagebase + DD $L$neg_body wrt ..imagebase,$L$neg_epilogue wrt ..imagebase +$L$SEH_info_ecp_nistz256_ord_mul_mont_nohw: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$ord_mul_body wrt ..imagebase,$L$ord_mul_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_ord_sqr_mont_nohw: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$ord_sqr_body wrt ..imagebase,$L$ord_sqr_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_ord_mul_mont_adx: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$ord_mulx_body wrt ..imagebase,$L$ord_mulx_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_ord_sqr_mont_adx: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$ord_sqrx_body wrt ..imagebase,$L$ord_sqrx_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_mul_mont_nohw: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_sqr_mont_nohw: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$sqr_body wrt ..imagebase,$L$sqr_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_mul_mont_adx: + DB 9,0,0,0 + DD full_handler wrt 
..imagebase + DD $L$mulx_body wrt ..imagebase,$L$mulx_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_sqr_mont_adx: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$sqrx_body wrt ..imagebase,$L$sqrx_epilogue wrt ..imagebase + DD 48,0 +$L$SEH_info_ecp_nistz256_select_wX_nohw: + DB 0x01,0x33,0x16,0x00 + DB 0x33,0xf8,0x09,0x00 + DB 0x2e,0xe8,0x08,0x00 + DB 0x29,0xd8,0x07,0x00 + DB 0x24,0xc8,0x06,0x00 + DB 0x1f,0xb8,0x05,0x00 + DB 0x1a,0xa8,0x04,0x00 + DB 0x15,0x98,0x03,0x00 + DB 0x10,0x88,0x02,0x00 + DB 0x0c,0x78,0x01,0x00 + DB 0x08,0x68,0x00,0x00 + DB 0x04,0x01,0x15,0x00 +ALIGN 8 +$L$SEH_info_ecp_nistz256_select_wX_avx2: + DB 0x01,0x36,0x17,0x0b + DB 0x36,0xf8,0x09,0x00 + DB 0x31,0xe8,0x08,0x00 + DB 0x2c,0xd8,0x07,0x00 + DB 0x27,0xc8,0x06,0x00 + DB 0x22,0xb8,0x05,0x00 + DB 0x1d,0xa8,0x04,0x00 + DB 0x18,0x98,0x03,0x00 + DB 0x13,0x88,0x02,0x00 + DB 0x0e,0x78,0x01,0x00 + DB 0x09,0x68,0x00,0x00 + DB 0x04,0x01,0x15,0x00 + DB 0x00,0xb3,0x00,0x00 +ALIGN 8 +$L$SEH_info_ecp_nistz256_point_double_nohw: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$point_doubleq_body wrt ..imagebase,$L$point_doubleq_epilogue wrt ..imagebase + DD 32*5+56,0 +$L$SEH_info_ecp_nistz256_point_add_nohw: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$point_addq_body wrt ..imagebase,$L$point_addq_epilogue wrt ..imagebase + DD 32*18+56,0 +$L$SEH_info_ecp_nistz256_point_add_affine_nohw: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$add_affineq_body wrt ..imagebase,$L$add_affineq_epilogue wrt ..imagebase + DD 32*15+56,0 +ALIGN 8 +$L$SEH_info_ecp_nistz256_point_double_adx: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$point_doublex_body wrt ..imagebase,$L$point_doublex_epilogue wrt ..imagebase + DD 32*5+56,0 +$L$SEH_info_ecp_nistz256_point_add_adx: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$point_addx_body wrt ..imagebase,$L$point_addx_epilogue wrt ..imagebase + DD 32*18+56,0 +$L$SEH_info_ecp_nistz256_point_add_affine_adx: + DB 9,0,0,0 + DD 
full_handler wrt ..imagebase + DD $L$add_affinex_body wrt ..imagebase,$L$add_affinex_epilogue wrt ..imagebase + DD 32*15+56,0 +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/third_party/boringssl/gen/bcm/p256_beeu-armv8-asm-apple.S b/third_party/boringssl/gen/bcm/p256_beeu-armv8-asm-apple.S new file mode 100644 index 00000000..af2f0efa --- /dev/null +++ b/third_party/boringssl/gen/bcm/p256_beeu-armv8-asm-apple.S @@ -0,0 +1,310 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) +.text +.globl _beeu_mod_inverse_vartime +.private_extern _beeu_mod_inverse_vartime + +.align 4 +_beeu_mod_inverse_vartime: + // Reserve enough space for 14 8-byte registers on the stack + // in the first stp call for x29, x30. + // Then store the remaining callee-saved registers. + // + // | x29 | x30 | x19 | x20 | ... | x27 | x28 | x0 | x2 | + // ^ ^ + // sp <------------------- 112 bytes ----------------> old sp + // x29 (FP) + // + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-112]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x0,x2,[sp,#96] + + // B = b3..b0 := a + ldp x25,x26,[x1] + ldp x27,x28,[x1,#16] + + // n3..n0 := n + // Note: the value of input params are changed in the following. + ldp x0,x1,[x2] + ldp x2,x30,[x2,#16] + + // A = a3..a0 := n + mov x21, x0 + mov x22, x1 + mov x23, x2 + mov x24, x30 + + // X = x4..x0 := 1 + mov x3, #1 + eor x4, x4, x4 + eor x5, x5, x5 + eor x6, x6, x6 + eor x7, x7, x7 + + // Y = y4..y0 := 0 + eor x8, x8, x8 + eor x9, x9, x9 + eor x10, x10, x10 + eor x11, x11, x11 + eor x12, x12, x12 + +Lbeeu_loop: + // if B == 0, jump to .Lbeeu_loop_end + orr x14, x25, x26 + orr x14, x14, x27 + + // reverse the bit order of x25. 
This is needed for clz after this macro + rbit x15, x25 + + orr x14, x14, x28 + cbz x14,Lbeeu_loop_end + + + // 0 < B < |n|, + // 0 < A <= |n|, + // (1) X*a == B (mod |n|), + // (2) (-1)*Y*a == A (mod |n|) + + // Now divide B by the maximum possible power of two in the + // integers, and divide X by the same value mod |n|. + // When we're done, (1) still holds. + + // shift := number of trailing 0s in x25 + // ( = number of leading 0s in x15; see the "rbit" instruction in TEST_B_ZERO) + orr x15, x15, #1 // Clamp the shift to 63 bits (see SHIFT256) + clz x13, x15 + + // If there is no shift, goto shift_A_Y + cbz x13, Lbeeu_shift_A_Y + + // Shift B right by "x13" bits + neg x14, x13 + lsr x25, x25, x13 + lsl x15, x26, x14 + + lsr x26, x26, x13 + lsl x19, x27, x14 + + orr x25, x25, x15 + + lsr x27, x27, x13 + lsl x20, x28, x14 + + orr x26, x26, x19 + + lsr x28, x28, x13 + + orr x27, x27, x20 + + + // Shift X right by "x13" bits, adding n whenever X becomes odd. + // x13--; + // x14 := 0; needed in the addition to the most significant word in SHIFT1 + eor x14, x14, x14 +Lbeeu_shift_loop_X: + tbz x3, #0, Lshift1_0 + adds x3, x3, x0 + adcs x4, x4, x1 + adcs x5, x5, x2 + adcs x6, x6, x30 + adc x7, x7, x14 +Lshift1_0: + // var0 := [var1|var0]<64..1>; + // i.e. concatenate var1 and var0, + // extract bits <64..1> from the resulting 128-bit value + // and put them in var0 + extr x3, x4, x3, #1 + extr x4, x5, x4, #1 + extr x5, x6, x5, #1 + extr x6, x7, x6, #1 + lsr x7, x7, #1 + + subs x13, x13, #1 + bne Lbeeu_shift_loop_X + + // Note: the steps above perform the same sequence as in p256_beeu-x86_64-asm.pl + // with the following differences: + // - "x13" is set directly to the number of trailing 0s in B + // (using rbit and clz instructions) + // - The loop is only used to call SHIFT1(X) + // and x13 is decreased while executing the X loop. + // - SHIFT256(B, x13) is performed before right-shifting X; they are independent + // - "x13" is clamped to 63 bits. 
+ +Lbeeu_shift_A_Y: + // Same for A and Y. + // Afterwards, (2) still holds. + // Reverse the bit order of x21 + // x13 := number of trailing 0s in x21 (= number of leading 0s in x15) + rbit x15, x21 + orr x15, x15, #1 // Clamp the shift to 63 bits (see SHIFT256) + clz x13, x15 + + // If there is no shift, goto |B-A|, X+Y update + cbz x13, Lbeeu_update_B_X_or_A_Y + + // Shift A right by "x13" bits + neg x14, x13 + lsr x21, x21, x13 + lsl x15, x22, x14 + + lsr x22, x22, x13 + lsl x19, x23, x14 + + orr x21, x21, x15 + + lsr x23, x23, x13 + lsl x20, x24, x14 + + orr x22, x22, x19 + + lsr x24, x24, x13 + + orr x23, x23, x20 + + + // Shift Y right by "x13" bits, adding n whenever Y becomes odd. + // x13--; + // x14 := 0; needed in the addition to the most significant word in SHIFT1 + eor x14, x14, x14 +Lbeeu_shift_loop_Y: + tbz x8, #0, Lshift1_1 + adds x8, x8, x0 + adcs x9, x9, x1 + adcs x10, x10, x2 + adcs x11, x11, x30 + adc x12, x12, x14 +Lshift1_1: + // var0 := [var1|var0]<64..1>; + // i.e. concatenate var1 and var0, + // extract bits <64..1> from the resulting 128-bit value + // and put them in var0 + extr x8, x9, x8, #1 + extr x9, x10, x9, #1 + extr x10, x11, x10, #1 + extr x11, x12, x11, #1 + lsr x12, x12, #1 + + subs x13, x13, #1 + bne Lbeeu_shift_loop_Y + +Lbeeu_update_B_X_or_A_Y: + // Try T := B - A; if cs, continue with B > A (cs: carry set = no borrow) + // Note: this is a case of unsigned arithmetic, where T fits in 4 64-bit words + // without taking a sign bit if generated. The lack of a carry would + // indicate a negative result. 
See, for example, + // https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/condition-codes-1-condition-flags-and-codes + subs x14, x25, x21 + sbcs x15, x26, x22 + sbcs x19, x27, x23 + sbcs x20, x28, x24 + bcs Lbeeu_B_greater_than_A + + // Else A > B => + // A := A - B; Y := Y + X; goto beginning of the loop + subs x21, x21, x25 + sbcs x22, x22, x26 + sbcs x23, x23, x27 + sbcs x24, x24, x28 + + adds x8, x8, x3 + adcs x9, x9, x4 + adcs x10, x10, x5 + adcs x11, x11, x6 + adc x12, x12, x7 + b Lbeeu_loop + +Lbeeu_B_greater_than_A: + // Continue with B > A => + // B := B - A; X := X + Y; goto beginning of the loop + mov x25, x14 + mov x26, x15 + mov x27, x19 + mov x28, x20 + + adds x3, x3, x8 + adcs x4, x4, x9 + adcs x5, x5, x10 + adcs x6, x6, x11 + adc x7, x7, x12 + b Lbeeu_loop + +Lbeeu_loop_end: + // The Euclid's algorithm loop ends when A == gcd(a,n); + // this would be 1, when a and n are co-prime (i.e. do not have a common factor). + // Since (-1)*Y*a == A (mod |n|), Y>0 + // then out = -Y mod n + + // Verify that A = 1 ==> (-1)*Y*a = A = 1 (mod |n|) + // Is A-1 == 0? + // If not, fail. 
+ sub x14, x21, #1 + orr x14, x14, x22 + orr x14, x14, x23 + orr x14, x14, x24 + cbnz x14, Lbeeu_err + + // If Y>n ==> Y:=Y-n +Lbeeu_reduction_loop: + // x_i := y_i - n_i (X is no longer needed, use it as temp) + // (x14 = 0 from above) + subs x3, x8, x0 + sbcs x4, x9, x1 + sbcs x5, x10, x2 + sbcs x6, x11, x30 + sbcs x7, x12, x14 + + // If result is non-negative (i.e., cs = carry set = no borrow), + // y_i := x_i; goto reduce again + // else + // y_i := y_i; continue + csel x8, x3, x8, cs + csel x9, x4, x9, cs + csel x10, x5, x10, cs + csel x11, x6, x11, cs + csel x12, x7, x12, cs + bcs Lbeeu_reduction_loop + + // Now Y < n (Y cannot be equal to n, since the inverse cannot be 0) + // out = -Y = n-Y + subs x8, x0, x8 + sbcs x9, x1, x9 + sbcs x10, x2, x10 + sbcs x11, x30, x11 + + // Save Y in output (out (x0) was saved on the stack) + ldr x3, [sp,#96] + stp x8, x9, [x3] + stp x10, x11, [x3,#16] + // return 1 (success) + mov x0, #1 + b Lbeeu_finish + +Lbeeu_err: + // return 0 (error) + eor x0, x0, x0 + +Lbeeu_finish: + // Restore callee-saved registers, except x0, x2 + add sp,x29,#0 + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldp x25,x26,[sp,#64] + ldp x27,x28,[sp,#80] + ldp x29,x30,[sp],#112 + + AARCH64_VALIDATE_LINK_REGISTER + ret + +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) diff --git a/third_party/boringssl/gen/bcm/p256_beeu-armv8-asm-linux.S b/third_party/boringssl/gen/bcm/p256_beeu-armv8-asm-linux.S new file mode 100644 index 00000000..0c1cf05d --- /dev/null +++ b/third_party/boringssl/gen/bcm/p256_beeu-armv8-asm-linux.S @@ -0,0 +1,310 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) +.text +.globl beeu_mod_inverse_vartime +.hidden beeu_mod_inverse_vartime +.type beeu_mod_inverse_vartime, %function +.align 4 +beeu_mod_inverse_vartime: + // Reserve enough space for 14 8-byte registers on the stack + // in the first stp call for x29, x30. + // Then store the remaining callee-saved registers. + // + // | x29 | x30 | x19 | x20 | ... | x27 | x28 | x0 | x2 | + // ^ ^ + // sp <------------------- 112 bytes ----------------> old sp + // x29 (FP) + // + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-112]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x0,x2,[sp,#96] + + // B = b3..b0 := a + ldp x25,x26,[x1] + ldp x27,x28,[x1,#16] + + // n3..n0 := n + // Note: the value of input params are changed in the following. + ldp x0,x1,[x2] + ldp x2,x30,[x2,#16] + + // A = a3..a0 := n + mov x21, x0 + mov x22, x1 + mov x23, x2 + mov x24, x30 + + // X = x4..x0 := 1 + mov x3, #1 + eor x4, x4, x4 + eor x5, x5, x5 + eor x6, x6, x6 + eor x7, x7, x7 + + // Y = y4..y0 := 0 + eor x8, x8, x8 + eor x9, x9, x9 + eor x10, x10, x10 + eor x11, x11, x11 + eor x12, x12, x12 + +.Lbeeu_loop: + // if B == 0, jump to .Lbeeu_loop_end + orr x14, x25, x26 + orr x14, x14, x27 + + // reverse the bit order of x25. This is needed for clz after this macro + rbit x15, x25 + + orr x14, x14, x28 + cbz x14,.Lbeeu_loop_end + + + // 0 < B < |n|, + // 0 < A <= |n|, + // (1) X*a == B (mod |n|), + // (2) (-1)*Y*a == A (mod |n|) + + // Now divide B by the maximum possible power of two in the + // integers, and divide X by the same value mod |n|. + // When we're done, (1) still holds. 
+ + // shift := number of trailing 0s in x25 + // ( = number of leading 0s in x15; see the "rbit" instruction in TEST_B_ZERO) + orr x15, x15, #1 // Clamp the shift to 63 bits (see SHIFT256) + clz x13, x15 + + // If there is no shift, goto shift_A_Y + cbz x13, .Lbeeu_shift_A_Y + + // Shift B right by "x13" bits + neg x14, x13 + lsr x25, x25, x13 + lsl x15, x26, x14 + + lsr x26, x26, x13 + lsl x19, x27, x14 + + orr x25, x25, x15 + + lsr x27, x27, x13 + lsl x20, x28, x14 + + orr x26, x26, x19 + + lsr x28, x28, x13 + + orr x27, x27, x20 + + + // Shift X right by "x13" bits, adding n whenever X becomes odd. + // x13--; + // x14 := 0; needed in the addition to the most significant word in SHIFT1 + eor x14, x14, x14 +.Lbeeu_shift_loop_X: + tbz x3, #0, .Lshift1_0 + adds x3, x3, x0 + adcs x4, x4, x1 + adcs x5, x5, x2 + adcs x6, x6, x30 + adc x7, x7, x14 +.Lshift1_0: + // var0 := [var1|var0]<64..1>; + // i.e. concatenate var1 and var0, + // extract bits <64..1> from the resulting 128-bit value + // and put them in var0 + extr x3, x4, x3, #1 + extr x4, x5, x4, #1 + extr x5, x6, x5, #1 + extr x6, x7, x6, #1 + lsr x7, x7, #1 + + subs x13, x13, #1 + bne .Lbeeu_shift_loop_X + + // Note: the steps above perform the same sequence as in p256_beeu-x86_64-asm.pl + // with the following differences: + // - "x13" is set directly to the number of trailing 0s in B + // (using rbit and clz instructions) + // - The loop is only used to call SHIFT1(X) + // and x13 is decreased while executing the X loop. + // - SHIFT256(B, x13) is performed before right-shifting X; they are independent + // - "x13" is clamped to 63 bits. + +.Lbeeu_shift_A_Y: + // Same for A and Y. + // Afterwards, (2) still holds. 
+ // Reverse the bit order of x21 + // x13 := number of trailing 0s in x21 (= number of leading 0s in x15) + rbit x15, x21 + orr x15, x15, #1 // Clamp the shift to 63 bits (see SHIFT256) + clz x13, x15 + + // If there is no shift, goto |B-A|, X+Y update + cbz x13, .Lbeeu_update_B_X_or_A_Y + + // Shift A right by "x13" bits + neg x14, x13 + lsr x21, x21, x13 + lsl x15, x22, x14 + + lsr x22, x22, x13 + lsl x19, x23, x14 + + orr x21, x21, x15 + + lsr x23, x23, x13 + lsl x20, x24, x14 + + orr x22, x22, x19 + + lsr x24, x24, x13 + + orr x23, x23, x20 + + + // Shift Y right by "x13" bits, adding n whenever Y becomes odd. + // x13--; + // x14 := 0; needed in the addition to the most significant word in SHIFT1 + eor x14, x14, x14 +.Lbeeu_shift_loop_Y: + tbz x8, #0, .Lshift1_1 + adds x8, x8, x0 + adcs x9, x9, x1 + adcs x10, x10, x2 + adcs x11, x11, x30 + adc x12, x12, x14 +.Lshift1_1: + // var0 := [var1|var0]<64..1>; + // i.e. concatenate var1 and var0, + // extract bits <64..1> from the resulting 128-bit value + // and put them in var0 + extr x8, x9, x8, #1 + extr x9, x10, x9, #1 + extr x10, x11, x10, #1 + extr x11, x12, x11, #1 + lsr x12, x12, #1 + + subs x13, x13, #1 + bne .Lbeeu_shift_loop_Y + +.Lbeeu_update_B_X_or_A_Y: + // Try T := B - A; if cs, continue with B > A (cs: carry set = no borrow) + // Note: this is a case of unsigned arithmetic, where T fits in 4 64-bit words + // without taking a sign bit if generated. The lack of a carry would + // indicate a negative result. 
See, for example, + // https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/condition-codes-1-condition-flags-and-codes + subs x14, x25, x21 + sbcs x15, x26, x22 + sbcs x19, x27, x23 + sbcs x20, x28, x24 + bcs .Lbeeu_B_greater_than_A + + // Else A > B => + // A := A - B; Y := Y + X; goto beginning of the loop + subs x21, x21, x25 + sbcs x22, x22, x26 + sbcs x23, x23, x27 + sbcs x24, x24, x28 + + adds x8, x8, x3 + adcs x9, x9, x4 + adcs x10, x10, x5 + adcs x11, x11, x6 + adc x12, x12, x7 + b .Lbeeu_loop + +.Lbeeu_B_greater_than_A: + // Continue with B > A => + // B := B - A; X := X + Y; goto beginning of the loop + mov x25, x14 + mov x26, x15 + mov x27, x19 + mov x28, x20 + + adds x3, x3, x8 + adcs x4, x4, x9 + adcs x5, x5, x10 + adcs x6, x6, x11 + adc x7, x7, x12 + b .Lbeeu_loop + +.Lbeeu_loop_end: + // The Euclid's algorithm loop ends when A == gcd(a,n); + // this would be 1, when a and n are co-prime (i.e. do not have a common factor). + // Since (-1)*Y*a == A (mod |n|), Y>0 + // then out = -Y mod n + + // Verify that A = 1 ==> (-1)*Y*a = A = 1 (mod |n|) + // Is A-1 == 0? + // If not, fail. 
+ sub x14, x21, #1 + orr x14, x14, x22 + orr x14, x14, x23 + orr x14, x14, x24 + cbnz x14, .Lbeeu_err + + // If Y>n ==> Y:=Y-n +.Lbeeu_reduction_loop: + // x_i := y_i - n_i (X is no longer needed, use it as temp) + // (x14 = 0 from above) + subs x3, x8, x0 + sbcs x4, x9, x1 + sbcs x5, x10, x2 + sbcs x6, x11, x30 + sbcs x7, x12, x14 + + // If result is non-negative (i.e., cs = carry set = no borrow), + // y_i := x_i; goto reduce again + // else + // y_i := y_i; continue + csel x8, x3, x8, cs + csel x9, x4, x9, cs + csel x10, x5, x10, cs + csel x11, x6, x11, cs + csel x12, x7, x12, cs + bcs .Lbeeu_reduction_loop + + // Now Y < n (Y cannot be equal to n, since the inverse cannot be 0) + // out = -Y = n-Y + subs x8, x0, x8 + sbcs x9, x1, x9 + sbcs x10, x2, x10 + sbcs x11, x30, x11 + + // Save Y in output (out (x0) was saved on the stack) + ldr x3, [sp,#96] + stp x8, x9, [x3] + stp x10, x11, [x3,#16] + // return 1 (success) + mov x0, #1 + b .Lbeeu_finish + +.Lbeeu_err: + // return 0 (error) + eor x0, x0, x0 + +.Lbeeu_finish: + // Restore callee-saved registers, except x0, x2 + add sp,x29,#0 + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldp x25,x26,[sp,#64] + ldp x27,x28,[sp,#80] + ldp x29,x30,[sp],#112 + + AARCH64_VALIDATE_LINK_REGISTER + ret +.size beeu_mod_inverse_vartime,.-beeu_mod_inverse_vartime +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) diff --git a/third_party/boringssl/gen/bcm/p256_beeu-armv8-asm-win.S b/third_party/boringssl/gen/bcm/p256_beeu-armv8-asm-win.S new file mode 100644 index 00000000..70440a17 --- /dev/null +++ b/third_party/boringssl/gen/bcm/p256_beeu-armv8-asm-win.S @@ -0,0 +1,310 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +.text +.globl beeu_mod_inverse_vartime + + +.align 4 +beeu_mod_inverse_vartime: + // Reserve enough space for 14 8-byte registers on the stack + // in the first stp call for x29, x30. + // Then store the remaining callee-saved registers. + // + // | x29 | x30 | x19 | x20 | ... | x27 | x28 | x0 | x2 | + // ^ ^ + // sp <------------------- 112 bytes ----------------> old sp + // x29 (FP) + // + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-112]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + stp x0,x2,[sp,#96] + + // B = b3..b0 := a + ldp x25,x26,[x1] + ldp x27,x28,[x1,#16] + + // n3..n0 := n + // Note: the value of input params are changed in the following. + ldp x0,x1,[x2] + ldp x2,x30,[x2,#16] + + // A = a3..a0 := n + mov x21, x0 + mov x22, x1 + mov x23, x2 + mov x24, x30 + + // X = x4..x0 := 1 + mov x3, #1 + eor x4, x4, x4 + eor x5, x5, x5 + eor x6, x6, x6 + eor x7, x7, x7 + + // Y = y4..y0 := 0 + eor x8, x8, x8 + eor x9, x9, x9 + eor x10, x10, x10 + eor x11, x11, x11 + eor x12, x12, x12 + +Lbeeu_loop: + // if B == 0, jump to .Lbeeu_loop_end + orr x14, x25, x26 + orr x14, x14, x27 + + // reverse the bit order of x25. This is needed for clz after this macro + rbit x15, x25 + + orr x14, x14, x28 + cbz x14,Lbeeu_loop_end + + + // 0 < B < |n|, + // 0 < A <= |n|, + // (1) X*a == B (mod |n|), + // (2) (-1)*Y*a == A (mod |n|) + + // Now divide B by the maximum possible power of two in the + // integers, and divide X by the same value mod |n|. + // When we're done, (1) still holds. 
+ + // shift := number of trailing 0s in x25 + // ( = number of leading 0s in x15; see the "rbit" instruction in TEST_B_ZERO) + orr x15, x15, #1 // Clamp the shift to 63 bits (see SHIFT256) + clz x13, x15 + + // If there is no shift, goto shift_A_Y + cbz x13, Lbeeu_shift_A_Y + + // Shift B right by "x13" bits + neg x14, x13 + lsr x25, x25, x13 + lsl x15, x26, x14 + + lsr x26, x26, x13 + lsl x19, x27, x14 + + orr x25, x25, x15 + + lsr x27, x27, x13 + lsl x20, x28, x14 + + orr x26, x26, x19 + + lsr x28, x28, x13 + + orr x27, x27, x20 + + + // Shift X right by "x13" bits, adding n whenever X becomes odd. + // x13--; + // x14 := 0; needed in the addition to the most significant word in SHIFT1 + eor x14, x14, x14 +Lbeeu_shift_loop_X: + tbz x3, #0, Lshift1_0 + adds x3, x3, x0 + adcs x4, x4, x1 + adcs x5, x5, x2 + adcs x6, x6, x30 + adc x7, x7, x14 +Lshift1_0: + // var0 := [var1|var0]<64..1>; + // i.e. concatenate var1 and var0, + // extract bits <64..1> from the resulting 128-bit value + // and put them in var0 + extr x3, x4, x3, #1 + extr x4, x5, x4, #1 + extr x5, x6, x5, #1 + extr x6, x7, x6, #1 + lsr x7, x7, #1 + + subs x13, x13, #1 + bne Lbeeu_shift_loop_X + + // Note: the steps above perform the same sequence as in p256_beeu-x86_64-asm.pl + // with the following differences: + // - "x13" is set directly to the number of trailing 0s in B + // (using rbit and clz instructions) + // - The loop is only used to call SHIFT1(X) + // and x13 is decreased while executing the X loop. + // - SHIFT256(B, x13) is performed before right-shifting X; they are independent + // - "x13" is clamped to 63 bits. + +Lbeeu_shift_A_Y: + // Same for A and Y. + // Afterwards, (2) still holds. 
+ // Reverse the bit order of x21 + // x13 := number of trailing 0s in x21 (= number of leading 0s in x15) + rbit x15, x21 + orr x15, x15, #1 // Clamp the shift to 63 bits (see SHIFT256) + clz x13, x15 + + // If there is no shift, goto |B-A|, X+Y update + cbz x13, Lbeeu_update_B_X_or_A_Y + + // Shift A right by "x13" bits + neg x14, x13 + lsr x21, x21, x13 + lsl x15, x22, x14 + + lsr x22, x22, x13 + lsl x19, x23, x14 + + orr x21, x21, x15 + + lsr x23, x23, x13 + lsl x20, x24, x14 + + orr x22, x22, x19 + + lsr x24, x24, x13 + + orr x23, x23, x20 + + + // Shift Y right by "x13" bits, adding n whenever Y becomes odd. + // x13--; + // x14 := 0; needed in the addition to the most significant word in SHIFT1 + eor x14, x14, x14 +Lbeeu_shift_loop_Y: + tbz x8, #0, Lshift1_1 + adds x8, x8, x0 + adcs x9, x9, x1 + adcs x10, x10, x2 + adcs x11, x11, x30 + adc x12, x12, x14 +Lshift1_1: + // var0 := [var1|var0]<64..1>; + // i.e. concatenate var1 and var0, + // extract bits <64..1> from the resulting 128-bit value + // and put them in var0 + extr x8, x9, x8, #1 + extr x9, x10, x9, #1 + extr x10, x11, x10, #1 + extr x11, x12, x11, #1 + lsr x12, x12, #1 + + subs x13, x13, #1 + bne Lbeeu_shift_loop_Y + +Lbeeu_update_B_X_or_A_Y: + // Try T := B - A; if cs, continue with B > A (cs: carry set = no borrow) + // Note: this is a case of unsigned arithmetic, where T fits in 4 64-bit words + // without taking a sign bit if generated. The lack of a carry would + // indicate a negative result. 
See, for example, + // https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/condition-codes-1-condition-flags-and-codes + subs x14, x25, x21 + sbcs x15, x26, x22 + sbcs x19, x27, x23 + sbcs x20, x28, x24 + bcs Lbeeu_B_greater_than_A + + // Else A > B => + // A := A - B; Y := Y + X; goto beginning of the loop + subs x21, x21, x25 + sbcs x22, x22, x26 + sbcs x23, x23, x27 + sbcs x24, x24, x28 + + adds x8, x8, x3 + adcs x9, x9, x4 + adcs x10, x10, x5 + adcs x11, x11, x6 + adc x12, x12, x7 + b Lbeeu_loop + +Lbeeu_B_greater_than_A: + // Continue with B > A => + // B := B - A; X := X + Y; goto beginning of the loop + mov x25, x14 + mov x26, x15 + mov x27, x19 + mov x28, x20 + + adds x3, x3, x8 + adcs x4, x4, x9 + adcs x5, x5, x10 + adcs x6, x6, x11 + adc x7, x7, x12 + b Lbeeu_loop + +Lbeeu_loop_end: + // The Euclid's algorithm loop ends when A == gcd(a,n); + // this would be 1, when a and n are co-prime (i.e. do not have a common factor). + // Since (-1)*Y*a == A (mod |n|), Y>0 + // then out = -Y mod n + + // Verify that A = 1 ==> (-1)*Y*a = A = 1 (mod |n|) + // Is A-1 == 0? + // If not, fail. 
+ sub x14, x21, #1 + orr x14, x14, x22 + orr x14, x14, x23 + orr x14, x14, x24 + cbnz x14, Lbeeu_err + + // If Y>n ==> Y:=Y-n +Lbeeu_reduction_loop: + // x_i := y_i - n_i (X is no longer needed, use it as temp) + // (x14 = 0 from above) + subs x3, x8, x0 + sbcs x4, x9, x1 + sbcs x5, x10, x2 + sbcs x6, x11, x30 + sbcs x7, x12, x14 + + // If result is non-negative (i.e., cs = carry set = no borrow), + // y_i := x_i; goto reduce again + // else + // y_i := y_i; continue + csel x8, x3, x8, cs + csel x9, x4, x9, cs + csel x10, x5, x10, cs + csel x11, x6, x11, cs + csel x12, x7, x12, cs + bcs Lbeeu_reduction_loop + + // Now Y < n (Y cannot be equal to n, since the inverse cannot be 0) + // out = -Y = n-Y + subs x8, x0, x8 + sbcs x9, x1, x9 + sbcs x10, x2, x10 + sbcs x11, x30, x11 + + // Save Y in output (out (x0) was saved on the stack) + ldr x3, [sp,#96] + stp x8, x9, [x3] + stp x10, x11, [x3,#16] + // return 1 (success) + mov x0, #1 + b Lbeeu_finish + +Lbeeu_err: + // return 0 (error) + eor x0, x0, x0 + +Lbeeu_finish: + // Restore callee-saved registers, except x0, x2 + add sp,x29,#0 + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldp x25,x26,[sp,#64] + ldp x27,x28,[sp,#80] + ldp x29,x30,[sp],#112 + + AARCH64_VALIDATE_LINK_REGISTER + ret + +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) diff --git a/third_party/boringssl/gen/bcm/p256_beeu-x86_64-asm-apple.S b/third_party/boringssl/gen/bcm/p256_beeu-x86_64-asm-apple.S new file mode 100644 index 00000000..35461b1f --- /dev/null +++ b/third_party/boringssl/gen/bcm/p256_beeu-x86_64-asm-apple.S @@ -0,0 +1,321 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.text + +.globl _beeu_mod_inverse_vartime +.private_extern _beeu_mod_inverse_vartime + +.p2align 5 +_beeu_mod_inverse_vartime: + +_CET_ENDBR + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + pushq %rbx + + pushq %rsi + + + subq $80,%rsp + + movq %rdi,0(%rsp) + + + movq $1,%r8 + xorq %r9,%r9 + xorq %r10,%r10 + xorq %r11,%r11 + xorq %rdi,%rdi + + xorq %r12,%r12 + xorq %r13,%r13 + xorq %r14,%r14 + xorq %r15,%r15 + xorq %rbp,%rbp + + + vmovdqu 0(%rsi),%xmm0 + vmovdqu 16(%rsi),%xmm1 + vmovdqu %xmm0,48(%rsp) + vmovdqu %xmm1,64(%rsp) + + vmovdqu 0(%rdx),%xmm0 + vmovdqu 16(%rdx),%xmm1 + vmovdqu %xmm0,16(%rsp) + vmovdqu %xmm1,32(%rsp) + +L$beeu_loop: + xorq %rbx,%rbx + orq 48(%rsp),%rbx + orq 56(%rsp),%rbx + orq 64(%rsp),%rbx + orq 72(%rsp),%rbx + jz L$beeu_loop_end + + + + + + + + + + + movq $1,%rcx + + +L$beeu_shift_loop_XB: + movq %rcx,%rbx + andq 48(%rsp),%rbx + jnz L$beeu_shift_loop_end_XB + + + movq $1,%rbx + andq %r8,%rbx + jz L$shift1_0 + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + adcq 24(%rdx),%r11 + adcq $0,%rdi + +L$shift1_0: + shrdq $1,%r9,%r8 + shrdq $1,%r10,%r9 + shrdq $1,%r11,%r10 + shrdq $1,%rdi,%r11 + shrq $1,%rdi + + shlq $1,%rcx + + + + + + cmpq $0x8000000,%rcx + jne L$beeu_shift_loop_XB + +L$beeu_shift_loop_end_XB: + bsfq %rcx,%rcx + testq %rcx,%rcx + jz L$beeu_no_shift_XB + + + + movq 8+48(%rsp),%rax + movq 16+48(%rsp),%rbx + movq 24+48(%rsp),%rsi + + shrdq %cl,%rax,0+48(%rsp) + shrdq %cl,%rbx,8+48(%rsp) + shrdq %cl,%rsi,16+48(%rsp) + + shrq %cl,%rsi + movq %rsi,24+48(%rsp) + + +L$beeu_no_shift_XB: + + movq $1,%rcx + + +L$beeu_shift_loop_YA: + movq %rcx,%rbx + andq 16(%rsp),%rbx + jnz L$beeu_shift_loop_end_YA + + + movq $1,%rbx + andq %r12,%rbx + jz L$shift1_1 + addq 0(%rdx),%r12 + adcq 8(%rdx),%r13 + adcq 16(%rdx),%r14 + adcq 24(%rdx),%r15 + adcq $0,%rbp + +L$shift1_1: + shrdq $1,%r13,%r12 + shrdq $1,%r14,%r13 + shrdq 
$1,%r15,%r14 + shrdq $1,%rbp,%r15 + shrq $1,%rbp + + shlq $1,%rcx + + + + + + cmpq $0x8000000,%rcx + jne L$beeu_shift_loop_YA + +L$beeu_shift_loop_end_YA: + bsfq %rcx,%rcx + testq %rcx,%rcx + jz L$beeu_no_shift_YA + + + + movq 8+16(%rsp),%rax + movq 16+16(%rsp),%rbx + movq 24+16(%rsp),%rsi + + shrdq %cl,%rax,0+16(%rsp) + shrdq %cl,%rbx,8+16(%rsp) + shrdq %cl,%rsi,16+16(%rsp) + + shrq %cl,%rsi + movq %rsi,24+16(%rsp) + + +L$beeu_no_shift_YA: + + movq 48(%rsp),%rax + movq 56(%rsp),%rbx + movq 64(%rsp),%rsi + movq 72(%rsp),%rcx + subq 16(%rsp),%rax + sbbq 24(%rsp),%rbx + sbbq 32(%rsp),%rsi + sbbq 40(%rsp),%rcx + jnc L$beeu_B_bigger_than_A + + + movq 16(%rsp),%rax + movq 24(%rsp),%rbx + movq 32(%rsp),%rsi + movq 40(%rsp),%rcx + subq 48(%rsp),%rax + sbbq 56(%rsp),%rbx + sbbq 64(%rsp),%rsi + sbbq 72(%rsp),%rcx + movq %rax,16(%rsp) + movq %rbx,24(%rsp) + movq %rsi,32(%rsp) + movq %rcx,40(%rsp) + + + addq %r8,%r12 + adcq %r9,%r13 + adcq %r10,%r14 + adcq %r11,%r15 + adcq %rdi,%rbp + jmp L$beeu_loop + +L$beeu_B_bigger_than_A: + + movq %rax,48(%rsp) + movq %rbx,56(%rsp) + movq %rsi,64(%rsp) + movq %rcx,72(%rsp) + + + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + adcq %rbp,%rdi + + jmp L$beeu_loop + +L$beeu_loop_end: + + + + + movq 16(%rsp),%rbx + subq $1,%rbx + orq 24(%rsp),%rbx + orq 32(%rsp),%rbx + orq 40(%rsp),%rbx + + jnz L$beeu_err + + + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + xorq %rdi,%rdi + +L$beeu_reduction_loop: + movq %r12,16(%rsp) + movq %r13,24(%rsp) + movq %r14,32(%rsp) + movq %r15,40(%rsp) + movq %rbp,48(%rsp) + + + subq %r8,%r12 + sbbq %r9,%r13 + sbbq %r10,%r14 + sbbq %r11,%r15 + sbbq $0,%rbp + + + cmovcq 16(%rsp),%r12 + cmovcq 24(%rsp),%r13 + cmovcq 32(%rsp),%r14 + cmovcq 40(%rsp),%r15 + jnc L$beeu_reduction_loop + + + subq %r12,%r8 + sbbq %r13,%r9 + sbbq %r14,%r10 + sbbq %r15,%r11 + +L$beeu_save: + + movq 0(%rsp),%rdi + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq 
%r11,24(%rdi) + + + movq $1,%rax + jmp L$beeu_finish + +L$beeu_err: + + xorq %rax,%rax + +L$beeu_finish: + addq $80,%rsp + + popq %rsi + + popq %rbx + + popq %r15 + + popq %r14 + + popq %r13 + + popq %r12 + + popq %rbp + + ret + + + +#endif diff --git a/third_party/boringssl/gen/bcm/p256_beeu-x86_64-asm-linux.S b/third_party/boringssl/gen/bcm/p256_beeu-x86_64-asm-linux.S new file mode 100644 index 00000000..1056268c --- /dev/null +++ b/third_party/boringssl/gen/bcm/p256_beeu-x86_64-asm-linux.S @@ -0,0 +1,335 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.text + +.globl beeu_mod_inverse_vartime +.hidden beeu_mod_inverse_vartime +.type beeu_mod_inverse_vartime,@function +.align 32 +beeu_mod_inverse_vartime: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset rbp,-16 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset r12,-24 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset r13,-32 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset r14,-40 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset r15,-48 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset rbx,-56 + pushq %rsi +.cfi_adjust_cfa_offset 8 +.cfi_offset rsi,-64 + + subq $80,%rsp +.cfi_adjust_cfa_offset 80 + movq %rdi,0(%rsp) + + + movq $1,%r8 + xorq %r9,%r9 + xorq %r10,%r10 + xorq %r11,%r11 + xorq %rdi,%rdi + + xorq %r12,%r12 + xorq %r13,%r13 + xorq %r14,%r14 + xorq %r15,%r15 + xorq %rbp,%rbp + + + vmovdqu 0(%rsi),%xmm0 + vmovdqu 16(%rsi),%xmm1 + vmovdqu %xmm0,48(%rsp) + vmovdqu %xmm1,64(%rsp) + + vmovdqu 0(%rdx),%xmm0 + vmovdqu 16(%rdx),%xmm1 + vmovdqu %xmm0,16(%rsp) + vmovdqu %xmm1,32(%rsp) + +.Lbeeu_loop: + xorq %rbx,%rbx + orq 48(%rsp),%rbx + orq 56(%rsp),%rbx + orq 64(%rsp),%rbx + orq 72(%rsp),%rbx + jz .Lbeeu_loop_end + + + + + + + + + + + movq $1,%rcx + + +.Lbeeu_shift_loop_XB: + movq %rcx,%rbx + andq 
48(%rsp),%rbx + jnz .Lbeeu_shift_loop_end_XB + + + movq $1,%rbx + andq %r8,%rbx + jz .Lshift1_0 + addq 0(%rdx),%r8 + adcq 8(%rdx),%r9 + adcq 16(%rdx),%r10 + adcq 24(%rdx),%r11 + adcq $0,%rdi + +.Lshift1_0: + shrdq $1,%r9,%r8 + shrdq $1,%r10,%r9 + shrdq $1,%r11,%r10 + shrdq $1,%rdi,%r11 + shrq $1,%rdi + + shlq $1,%rcx + + + + + + cmpq $0x8000000,%rcx + jne .Lbeeu_shift_loop_XB + +.Lbeeu_shift_loop_end_XB: + bsfq %rcx,%rcx + testq %rcx,%rcx + jz .Lbeeu_no_shift_XB + + + + movq 8+48(%rsp),%rax + movq 16+48(%rsp),%rbx + movq 24+48(%rsp),%rsi + + shrdq %cl,%rax,0+48(%rsp) + shrdq %cl,%rbx,8+48(%rsp) + shrdq %cl,%rsi,16+48(%rsp) + + shrq %cl,%rsi + movq %rsi,24+48(%rsp) + + +.Lbeeu_no_shift_XB: + + movq $1,%rcx + + +.Lbeeu_shift_loop_YA: + movq %rcx,%rbx + andq 16(%rsp),%rbx + jnz .Lbeeu_shift_loop_end_YA + + + movq $1,%rbx + andq %r12,%rbx + jz .Lshift1_1 + addq 0(%rdx),%r12 + adcq 8(%rdx),%r13 + adcq 16(%rdx),%r14 + adcq 24(%rdx),%r15 + adcq $0,%rbp + +.Lshift1_1: + shrdq $1,%r13,%r12 + shrdq $1,%r14,%r13 + shrdq $1,%r15,%r14 + shrdq $1,%rbp,%r15 + shrq $1,%rbp + + shlq $1,%rcx + + + + + + cmpq $0x8000000,%rcx + jne .Lbeeu_shift_loop_YA + +.Lbeeu_shift_loop_end_YA: + bsfq %rcx,%rcx + testq %rcx,%rcx + jz .Lbeeu_no_shift_YA + + + + movq 8+16(%rsp),%rax + movq 16+16(%rsp),%rbx + movq 24+16(%rsp),%rsi + + shrdq %cl,%rax,0+16(%rsp) + shrdq %cl,%rbx,8+16(%rsp) + shrdq %cl,%rsi,16+16(%rsp) + + shrq %cl,%rsi + movq %rsi,24+16(%rsp) + + +.Lbeeu_no_shift_YA: + + movq 48(%rsp),%rax + movq 56(%rsp),%rbx + movq 64(%rsp),%rsi + movq 72(%rsp),%rcx + subq 16(%rsp),%rax + sbbq 24(%rsp),%rbx + sbbq 32(%rsp),%rsi + sbbq 40(%rsp),%rcx + jnc .Lbeeu_B_bigger_than_A + + + movq 16(%rsp),%rax + movq 24(%rsp),%rbx + movq 32(%rsp),%rsi + movq 40(%rsp),%rcx + subq 48(%rsp),%rax + sbbq 56(%rsp),%rbx + sbbq 64(%rsp),%rsi + sbbq 72(%rsp),%rcx + movq %rax,16(%rsp) + movq %rbx,24(%rsp) + movq %rsi,32(%rsp) + movq %rcx,40(%rsp) + + + addq %r8,%r12 + adcq %r9,%r13 + adcq %r10,%r14 + adcq %r11,%r15 + 
adcq %rdi,%rbp + jmp .Lbeeu_loop + +.Lbeeu_B_bigger_than_A: + + movq %rax,48(%rsp) + movq %rbx,56(%rsp) + movq %rsi,64(%rsp) + movq %rcx,72(%rsp) + + + addq %r12,%r8 + adcq %r13,%r9 + adcq %r14,%r10 + adcq %r15,%r11 + adcq %rbp,%rdi + + jmp .Lbeeu_loop + +.Lbeeu_loop_end: + + + + + movq 16(%rsp),%rbx + subq $1,%rbx + orq 24(%rsp),%rbx + orq 32(%rsp),%rbx + orq 40(%rsp),%rbx + + jnz .Lbeeu_err + + + + + movq 0(%rdx),%r8 + movq 8(%rdx),%r9 + movq 16(%rdx),%r10 + movq 24(%rdx),%r11 + xorq %rdi,%rdi + +.Lbeeu_reduction_loop: + movq %r12,16(%rsp) + movq %r13,24(%rsp) + movq %r14,32(%rsp) + movq %r15,40(%rsp) + movq %rbp,48(%rsp) + + + subq %r8,%r12 + sbbq %r9,%r13 + sbbq %r10,%r14 + sbbq %r11,%r15 + sbbq $0,%rbp + + + cmovcq 16(%rsp),%r12 + cmovcq 24(%rsp),%r13 + cmovcq 32(%rsp),%r14 + cmovcq 40(%rsp),%r15 + jnc .Lbeeu_reduction_loop + + + subq %r12,%r8 + sbbq %r13,%r9 + sbbq %r14,%r10 + sbbq %r15,%r11 + +.Lbeeu_save: + + movq 0(%rsp),%rdi + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + + + movq $1,%rax + jmp .Lbeeu_finish + +.Lbeeu_err: + + xorq %rax,%rax + +.Lbeeu_finish: + addq $80,%rsp +.cfi_adjust_cfa_offset -80 + popq %rsi +.cfi_adjust_cfa_offset -8 +.cfi_restore rsi + popq %rbx +.cfi_adjust_cfa_offset -8 +.cfi_restore rbx + popq %r15 +.cfi_adjust_cfa_offset -8 +.cfi_restore r15 + popq %r14 +.cfi_adjust_cfa_offset -8 +.cfi_restore r14 + popq %r13 +.cfi_adjust_cfa_offset -8 +.cfi_restore r13 + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore r12 + popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_restore rbp + ret +.cfi_endproc + +.size beeu_mod_inverse_vartime, .-beeu_mod_inverse_vartime +#endif diff --git a/third_party/boringssl/gen/bcm/p256_beeu-x86_64-asm-win.asm b/third_party/boringssl/gen/bcm/p256_beeu-x86_64-asm-win.asm new file mode 100644 index 00000000..5586c39d --- /dev/null +++ b/third_party/boringssl/gen/bcm/p256_beeu-x86_64-asm-win.asm @@ -0,0 +1,345 @@ +; This file is generated from a similarly-named Perl script in the 
BoringSSL +; source tree. Do not edit by hand. + +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_internal_x86_64_win_asm.inc" +%endif +section .text code align=64 + + +global beeu_mod_inverse_vartime + +ALIGN 32 +beeu_mod_inverse_vartime: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_beeu_mod_inverse_vartime: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +_CET_ENDBR + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + push rbx + + push rsi + + + sub rsp,80 + + mov QWORD[rsp],rdi + + + mov r8,1 + xor r9,r9 + xor r10,r10 + xor r11,r11 + xor rdi,rdi + + xor r12,r12 + xor r13,r13 + xor r14,r14 + xor r15,r15 + xor rbp,rbp + + + vmovdqu xmm0,XMMWORD[rsi] + vmovdqu xmm1,XMMWORD[16+rsi] + vmovdqu XMMWORD[48+rsp],xmm0 + vmovdqu XMMWORD[64+rsp],xmm1 + + vmovdqu xmm0,XMMWORD[rdx] + vmovdqu xmm1,XMMWORD[16+rdx] + vmovdqu XMMWORD[16+rsp],xmm0 + vmovdqu XMMWORD[32+rsp],xmm1 + +$L$beeu_loop: + xor rbx,rbx + or rbx,QWORD[48+rsp] + or rbx,QWORD[56+rsp] + or rbx,QWORD[64+rsp] + or rbx,QWORD[72+rsp] + jz NEAR $L$beeu_loop_end + + + + + + + + + + + mov rcx,1 + + +$L$beeu_shift_loop_XB: + mov rbx,rcx + and rbx,QWORD[48+rsp] + jnz NEAR $L$beeu_shift_loop_end_XB + + + mov rbx,1 + and rbx,r8 + jz NEAR $L$shift1_0 + add r8,QWORD[rdx] + adc r9,QWORD[8+rdx] + adc r10,QWORD[16+rdx] + adc r11,QWORD[24+rdx] + adc rdi,0 + +$L$shift1_0: + shrd r8,r9,1 + shrd r9,r10,1 + shrd r10,r11,1 + shrd r11,rdi,1 + shr rdi,1 + + shl rcx,1 + + + + + + cmp rcx,0x8000000 + jne NEAR $L$beeu_shift_loop_XB + +$L$beeu_shift_loop_end_XB: + bsf rcx,rcx + test rcx,rcx + jz NEAR $L$beeu_no_shift_XB + + + + mov rax,QWORD[((8+48))+rsp] + mov rbx,QWORD[((16+48))+rsp] + mov rsi,QWORD[((24+48))+rsp] + + shrd QWORD[((0+48))+rsp],rax,cl + shrd QWORD[((8+48))+rsp],rbx,cl + shrd 
QWORD[((16+48))+rsp],rsi,cl + + shr rsi,cl + mov QWORD[((24+48))+rsp],rsi + + +$L$beeu_no_shift_XB: + + mov rcx,1 + + +$L$beeu_shift_loop_YA: + mov rbx,rcx + and rbx,QWORD[16+rsp] + jnz NEAR $L$beeu_shift_loop_end_YA + + + mov rbx,1 + and rbx,r12 + jz NEAR $L$shift1_1 + add r12,QWORD[rdx] + adc r13,QWORD[8+rdx] + adc r14,QWORD[16+rdx] + adc r15,QWORD[24+rdx] + adc rbp,0 + +$L$shift1_1: + shrd r12,r13,1 + shrd r13,r14,1 + shrd r14,r15,1 + shrd r15,rbp,1 + shr rbp,1 + + shl rcx,1 + + + + + + cmp rcx,0x8000000 + jne NEAR $L$beeu_shift_loop_YA + +$L$beeu_shift_loop_end_YA: + bsf rcx,rcx + test rcx,rcx + jz NEAR $L$beeu_no_shift_YA + + + + mov rax,QWORD[((8+16))+rsp] + mov rbx,QWORD[((16+16))+rsp] + mov rsi,QWORD[((24+16))+rsp] + + shrd QWORD[((0+16))+rsp],rax,cl + shrd QWORD[((8+16))+rsp],rbx,cl + shrd QWORD[((16+16))+rsp],rsi,cl + + shr rsi,cl + mov QWORD[((24+16))+rsp],rsi + + +$L$beeu_no_shift_YA: + + mov rax,QWORD[48+rsp] + mov rbx,QWORD[56+rsp] + mov rsi,QWORD[64+rsp] + mov rcx,QWORD[72+rsp] + sub rax,QWORD[16+rsp] + sbb rbx,QWORD[24+rsp] + sbb rsi,QWORD[32+rsp] + sbb rcx,QWORD[40+rsp] + jnc NEAR $L$beeu_B_bigger_than_A + + + mov rax,QWORD[16+rsp] + mov rbx,QWORD[24+rsp] + mov rsi,QWORD[32+rsp] + mov rcx,QWORD[40+rsp] + sub rax,QWORD[48+rsp] + sbb rbx,QWORD[56+rsp] + sbb rsi,QWORD[64+rsp] + sbb rcx,QWORD[72+rsp] + mov QWORD[16+rsp],rax + mov QWORD[24+rsp],rbx + mov QWORD[32+rsp],rsi + mov QWORD[40+rsp],rcx + + + add r12,r8 + adc r13,r9 + adc r14,r10 + adc r15,r11 + adc rbp,rdi + jmp NEAR $L$beeu_loop + +$L$beeu_B_bigger_than_A: + + mov QWORD[48+rsp],rax + mov QWORD[56+rsp],rbx + mov QWORD[64+rsp],rsi + mov QWORD[72+rsp],rcx + + + add r8,r12 + adc r9,r13 + adc r10,r14 + adc r11,r15 + adc rdi,rbp + + jmp NEAR $L$beeu_loop + +$L$beeu_loop_end: + + + + + mov rbx,QWORD[16+rsp] + sub rbx,1 + or rbx,QWORD[24+rsp] + or rbx,QWORD[32+rsp] + or rbx,QWORD[40+rsp] + + jnz NEAR $L$beeu_err + + + + + mov r8,QWORD[rdx] + mov r9,QWORD[8+rdx] + mov r10,QWORD[16+rdx] + mov 
r11,QWORD[24+rdx] + xor rdi,rdi + +$L$beeu_reduction_loop: + mov QWORD[16+rsp],r12 + mov QWORD[24+rsp],r13 + mov QWORD[32+rsp],r14 + mov QWORD[40+rsp],r15 + mov QWORD[48+rsp],rbp + + + sub r12,r8 + sbb r13,r9 + sbb r14,r10 + sbb r15,r11 + sbb rbp,0 + + + cmovc r12,QWORD[16+rsp] + cmovc r13,QWORD[24+rsp] + cmovc r14,QWORD[32+rsp] + cmovc r15,QWORD[40+rsp] + jnc NEAR $L$beeu_reduction_loop + + + sub r8,r12 + sbb r9,r13 + sbb r10,r14 + sbb r11,r15 + +$L$beeu_save: + + mov rdi,QWORD[rsp] + + mov QWORD[rdi],r8 + mov QWORD[8+rdi],r9 + mov QWORD[16+rdi],r10 + mov QWORD[24+rdi],r11 + + + mov rax,1 + jmp NEAR $L$beeu_finish + +$L$beeu_err: + + xor rax,rax + +$L$beeu_finish: + add rsp,80 + + pop rsi + + pop rbx + + pop r15 + + pop r14 + + pop r13 + + pop r12 + + pop rbp + + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + + +$L$SEH_end_beeu_mod_inverse_vartime: +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/third_party/boringssl/gen/bcm/rdrand-x86_64-apple.S b/third_party/boringssl/gen/bcm/rdrand-x86_64-apple.S new file mode 100644 index 00000000..4f990d95 --- /dev/null +++ b/third_party/boringssl/gen/bcm/rdrand-x86_64-apple.S @@ -0,0 +1,57 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.text + + + + +.globl _CRYPTO_rdrand +.private_extern _CRYPTO_rdrand + +.p2align 4 +_CRYPTO_rdrand: + +_CET_ENDBR + xorq %rax,%rax + rdrand %rdx + + adcq %rax,%rax + movq %rdx,0(%rdi) + ret + + + + + + + +.globl _CRYPTO_rdrand_multiple8_buf +.private_extern _CRYPTO_rdrand_multiple8_buf + +.p2align 4 +_CRYPTO_rdrand_multiple8_buf: + +_CET_ENDBR + testq %rsi,%rsi + jz L$out + movq $8,%rdx +L$loop: + rdrand %rcx + jnc L$err + movq %rcx,0(%rdi) + addq %rdx,%rdi + subq %rdx,%rsi + jnz L$loop +L$out: + movq $1,%rax + ret +L$err: + xorq %rax,%rax + ret + + +#endif diff --git a/third_party/boringssl/gen/bcm/rdrand-x86_64-linux.S b/third_party/boringssl/gen/bcm/rdrand-x86_64-linux.S new file mode 100644 index 00000000..52a1eb20 --- /dev/null +++ b/third_party/boringssl/gen/bcm/rdrand-x86_64-linux.S @@ -0,0 +1,57 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.text + + + + +.globl CRYPTO_rdrand +.hidden CRYPTO_rdrand +.type CRYPTO_rdrand,@function +.align 16 +CRYPTO_rdrand: +.cfi_startproc +_CET_ENDBR + xorq %rax,%rax + rdrand %rdx + + adcq %rax,%rax + movq %rdx,0(%rdi) + ret +.cfi_endproc +.size CRYPTO_rdrand,.-CRYPTO_rdrand + + + + + +.globl CRYPTO_rdrand_multiple8_buf +.hidden CRYPTO_rdrand_multiple8_buf +.type CRYPTO_rdrand_multiple8_buf,@function +.align 16 +CRYPTO_rdrand_multiple8_buf: +.cfi_startproc +_CET_ENDBR + testq %rsi,%rsi + jz .Lout + movq $8,%rdx +.Lloop: + rdrand %rcx + jnc .Lerr + movq %rcx,0(%rdi) + addq %rdx,%rdi + subq %rdx,%rsi + jnz .Lloop +.Lout: + movq $1,%rax + ret +.Lerr: + xorq %rax,%rax + ret +.cfi_endproc +.size CRYPTO_rdrand_multiple8_buf,.-CRYPTO_rdrand_multiple8_buf +#endif diff --git a/third_party/boringssl/gen/bcm/rdrand-x86_64-win.asm b/third_party/boringssl/gen/bcm/rdrand-x86_64-win.asm new file mode 100644 index 00000000..39d402f2 --- /dev/null +++ b/third_party/boringssl/gen/bcm/rdrand-x86_64-win.asm @@ -0,0 +1,66 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_internal_x86_64_win_asm.inc" +%endif +section .text code align=64 + + + + + +global CRYPTO_rdrand + +ALIGN 16 +CRYPTO_rdrand: + +_CET_ENDBR + xor rax,rax + rdrand r8 + + adc rax,rax + mov QWORD[rcx],r8 + ret + + + + + + + +global CRYPTO_rdrand_multiple8_buf + +ALIGN 16 +CRYPTO_rdrand_multiple8_buf: + +_CET_ENDBR + test rdx,rdx + jz NEAR $L$out + mov r8,8 +$L$loop: + rdrand r9 + jnc NEAR $L$err + mov QWORD[rcx],r9 + add rcx,r8 + sub rdx,r8 + jnz NEAR $L$loop +$L$out: + mov rax,1 + ret +$L$err: + xor rax,rax + ret + + +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/third_party/boringssl/gen/bcm/rsaz-avx2-apple.S b/third_party/boringssl/gen/bcm/rsaz-avx2-apple.S new file mode 100644 index 00000000..36723091 --- /dev/null +++ b/third_party/boringssl/gen/bcm/rsaz-avx2-apple.S @@ -0,0 +1,1749 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.text + +.globl _rsaz_1024_sqr_avx2 +.private_extern _rsaz_1024_sqr_avx2 + +.p2align 6 +_rsaz_1024_sqr_avx2: + +_CET_ENDBR + leaq (%rsp),%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + vzeroupper + movq %rax,%rbp + + movq %rdx,%r13 + subq $832,%rsp + movq %r13,%r15 + subq $-128,%rdi + subq $-128,%rsi + subq $-128,%r13 + + andq $4095,%r15 + addq $320,%r15 + shrq $12,%r15 + vpxor %ymm9,%ymm9,%ymm9 + jz L$sqr_1024_no_n_copy + + + + + + subq $320,%rsp + vmovdqu 0-128(%r13),%ymm0 + andq $-2048,%rsp + vmovdqu 32-128(%r13),%ymm1 + vmovdqu 64-128(%r13),%ymm2 + vmovdqu 96-128(%r13),%ymm3 + vmovdqu 128-128(%r13),%ymm4 + vmovdqu 160-128(%r13),%ymm5 + vmovdqu 192-128(%r13),%ymm6 + vmovdqu 224-128(%r13),%ymm7 + vmovdqu 256-128(%r13),%ymm8 + leaq 832+128(%rsp),%r13 + vmovdqu %ymm0,0-128(%r13) + vmovdqu %ymm1,32-128(%r13) + vmovdqu %ymm2,64-128(%r13) + vmovdqu %ymm3,96-128(%r13) + vmovdqu %ymm4,128-128(%r13) + vmovdqu %ymm5,160-128(%r13) + vmovdqu %ymm6,192-128(%r13) + vmovdqu %ymm7,224-128(%r13) + vmovdqu %ymm8,256-128(%r13) + vmovdqu %ymm9,288-128(%r13) + +L$sqr_1024_no_n_copy: + andq $-1024,%rsp + + vmovdqu 32-128(%rsi),%ymm1 + vmovdqu 64-128(%rsi),%ymm2 + vmovdqu 96-128(%rsi),%ymm3 + vmovdqu 128-128(%rsi),%ymm4 + vmovdqu 160-128(%rsi),%ymm5 + vmovdqu 192-128(%rsi),%ymm6 + vmovdqu 224-128(%rsi),%ymm7 + vmovdqu 256-128(%rsi),%ymm8 + + leaq 192(%rsp),%rbx + vmovdqu L$and_mask(%rip),%ymm15 + jmp L$OOP_GRANDE_SQR_1024 + +.p2align 5 +L$OOP_GRANDE_SQR_1024: + leaq 576+128(%rsp),%r9 + leaq 448(%rsp),%r12 + + + + + vpaddq %ymm1,%ymm1,%ymm1 + vpbroadcastq 0-128(%rsi),%ymm10 + vpaddq %ymm2,%ymm2,%ymm2 + vmovdqa %ymm1,0-128(%r9) + vpaddq %ymm3,%ymm3,%ymm3 + vmovdqa %ymm2,32-128(%r9) + vpaddq %ymm4,%ymm4,%ymm4 + vmovdqa %ymm3,64-128(%r9) + vpaddq %ymm5,%ymm5,%ymm5 + vmovdqa %ymm4,96-128(%r9) + vpaddq %ymm6,%ymm6,%ymm6 + vmovdqa %ymm5,128-128(%r9) 
+ vpaddq %ymm7,%ymm7,%ymm7 + vmovdqa %ymm6,160-128(%r9) + vpaddq %ymm8,%ymm8,%ymm8 + vmovdqa %ymm7,192-128(%r9) + vpxor %ymm9,%ymm9,%ymm9 + vmovdqa %ymm8,224-128(%r9) + + vpmuludq 0-128(%rsi),%ymm10,%ymm0 + vpbroadcastq 32-128(%rsi),%ymm11 + vmovdqu %ymm9,288-192(%rbx) + vpmuludq %ymm10,%ymm1,%ymm1 + vmovdqu %ymm9,320-448(%r12) + vpmuludq %ymm10,%ymm2,%ymm2 + vmovdqu %ymm9,352-448(%r12) + vpmuludq %ymm10,%ymm3,%ymm3 + vmovdqu %ymm9,384-448(%r12) + vpmuludq %ymm10,%ymm4,%ymm4 + vmovdqu %ymm9,416-448(%r12) + vpmuludq %ymm10,%ymm5,%ymm5 + vmovdqu %ymm9,448-448(%r12) + vpmuludq %ymm10,%ymm6,%ymm6 + vmovdqu %ymm9,480-448(%r12) + vpmuludq %ymm10,%ymm7,%ymm7 + vmovdqu %ymm9,512-448(%r12) + vpmuludq %ymm10,%ymm8,%ymm8 + vpbroadcastq 64-128(%rsi),%ymm10 + vmovdqu %ymm9,544-448(%r12) + + movq %rsi,%r15 + movl $4,%r14d + jmp L$sqr_entry_1024 +.p2align 5 +L$OOP_SQR_1024: + vpbroadcastq 32-128(%r15),%ymm11 + vpmuludq 0-128(%rsi),%ymm10,%ymm0 + vpaddq 0-192(%rbx),%ymm0,%ymm0 + vpmuludq 0-128(%r9),%ymm10,%ymm1 + vpaddq 32-192(%rbx),%ymm1,%ymm1 + vpmuludq 32-128(%r9),%ymm10,%ymm2 + vpaddq 64-192(%rbx),%ymm2,%ymm2 + vpmuludq 64-128(%r9),%ymm10,%ymm3 + vpaddq 96-192(%rbx),%ymm3,%ymm3 + vpmuludq 96-128(%r9),%ymm10,%ymm4 + vpaddq 128-192(%rbx),%ymm4,%ymm4 + vpmuludq 128-128(%r9),%ymm10,%ymm5 + vpaddq 160-192(%rbx),%ymm5,%ymm5 + vpmuludq 160-128(%r9),%ymm10,%ymm6 + vpaddq 192-192(%rbx),%ymm6,%ymm6 + vpmuludq 192-128(%r9),%ymm10,%ymm7 + vpaddq 224-192(%rbx),%ymm7,%ymm7 + vpmuludq 224-128(%r9),%ymm10,%ymm8 + vpbroadcastq 64-128(%r15),%ymm10 + vpaddq 256-192(%rbx),%ymm8,%ymm8 +L$sqr_entry_1024: + vmovdqu %ymm0,0-192(%rbx) + vmovdqu %ymm1,32-192(%rbx) + + vpmuludq 32-128(%rsi),%ymm11,%ymm12 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq 32-128(%r9),%ymm11,%ymm14 + vpaddq %ymm14,%ymm3,%ymm3 + vpmuludq 64-128(%r9),%ymm11,%ymm13 + vpaddq %ymm13,%ymm4,%ymm4 + vpmuludq 96-128(%r9),%ymm11,%ymm12 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq 128-128(%r9),%ymm11,%ymm14 + vpaddq %ymm14,%ymm6,%ymm6 + vpmuludq 
160-128(%r9),%ymm11,%ymm13 + vpaddq %ymm13,%ymm7,%ymm7 + vpmuludq 192-128(%r9),%ymm11,%ymm12 + vpaddq %ymm12,%ymm8,%ymm8 + vpmuludq 224-128(%r9),%ymm11,%ymm0 + vpbroadcastq 96-128(%r15),%ymm11 + vpaddq 288-192(%rbx),%ymm0,%ymm0 + + vmovdqu %ymm2,64-192(%rbx) + vmovdqu %ymm3,96-192(%rbx) + + vpmuludq 64-128(%rsi),%ymm10,%ymm13 + vpaddq %ymm13,%ymm4,%ymm4 + vpmuludq 64-128(%r9),%ymm10,%ymm12 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq 96-128(%r9),%ymm10,%ymm14 + vpaddq %ymm14,%ymm6,%ymm6 + vpmuludq 128-128(%r9),%ymm10,%ymm13 + vpaddq %ymm13,%ymm7,%ymm7 + vpmuludq 160-128(%r9),%ymm10,%ymm12 + vpaddq %ymm12,%ymm8,%ymm8 + vpmuludq 192-128(%r9),%ymm10,%ymm14 + vpaddq %ymm14,%ymm0,%ymm0 + vpmuludq 224-128(%r9),%ymm10,%ymm1 + vpbroadcastq 128-128(%r15),%ymm10 + vpaddq 320-448(%r12),%ymm1,%ymm1 + + vmovdqu %ymm4,128-192(%rbx) + vmovdqu %ymm5,160-192(%rbx) + + vpmuludq 96-128(%rsi),%ymm11,%ymm12 + vpaddq %ymm12,%ymm6,%ymm6 + vpmuludq 96-128(%r9),%ymm11,%ymm14 + vpaddq %ymm14,%ymm7,%ymm7 + vpmuludq 128-128(%r9),%ymm11,%ymm13 + vpaddq %ymm13,%ymm8,%ymm8 + vpmuludq 160-128(%r9),%ymm11,%ymm12 + vpaddq %ymm12,%ymm0,%ymm0 + vpmuludq 192-128(%r9),%ymm11,%ymm14 + vpaddq %ymm14,%ymm1,%ymm1 + vpmuludq 224-128(%r9),%ymm11,%ymm2 + vpbroadcastq 160-128(%r15),%ymm11 + vpaddq 352-448(%r12),%ymm2,%ymm2 + + vmovdqu %ymm6,192-192(%rbx) + vmovdqu %ymm7,224-192(%rbx) + + vpmuludq 128-128(%rsi),%ymm10,%ymm12 + vpaddq %ymm12,%ymm8,%ymm8 + vpmuludq 128-128(%r9),%ymm10,%ymm14 + vpaddq %ymm14,%ymm0,%ymm0 + vpmuludq 160-128(%r9),%ymm10,%ymm13 + vpaddq %ymm13,%ymm1,%ymm1 + vpmuludq 192-128(%r9),%ymm10,%ymm12 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq 224-128(%r9),%ymm10,%ymm3 + vpbroadcastq 192-128(%r15),%ymm10 + vpaddq 384-448(%r12),%ymm3,%ymm3 + + vmovdqu %ymm8,256-192(%rbx) + vmovdqu %ymm0,288-192(%rbx) + leaq 8(%rbx),%rbx + + vpmuludq 160-128(%rsi),%ymm11,%ymm13 + vpaddq %ymm13,%ymm1,%ymm1 + vpmuludq 160-128(%r9),%ymm11,%ymm12 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq 192-128(%r9),%ymm11,%ymm14 + vpaddq 
%ymm14,%ymm3,%ymm3 + vpmuludq 224-128(%r9),%ymm11,%ymm4 + vpbroadcastq 224-128(%r15),%ymm11 + vpaddq 416-448(%r12),%ymm4,%ymm4 + + vmovdqu %ymm1,320-448(%r12) + vmovdqu %ymm2,352-448(%r12) + + vpmuludq 192-128(%rsi),%ymm10,%ymm12 + vpaddq %ymm12,%ymm3,%ymm3 + vpmuludq 192-128(%r9),%ymm10,%ymm14 + vpbroadcastq 256-128(%r15),%ymm0 + vpaddq %ymm14,%ymm4,%ymm4 + vpmuludq 224-128(%r9),%ymm10,%ymm5 + vpbroadcastq 0+8-128(%r15),%ymm10 + vpaddq 448-448(%r12),%ymm5,%ymm5 + + vmovdqu %ymm3,384-448(%r12) + vmovdqu %ymm4,416-448(%r12) + leaq 8(%r15),%r15 + + vpmuludq 224-128(%rsi),%ymm11,%ymm12 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq 224-128(%r9),%ymm11,%ymm6 + vpaddq 480-448(%r12),%ymm6,%ymm6 + + vpmuludq 256-128(%rsi),%ymm0,%ymm7 + vmovdqu %ymm5,448-448(%r12) + vpaddq 512-448(%r12),%ymm7,%ymm7 + vmovdqu %ymm6,480-448(%r12) + vmovdqu %ymm7,512-448(%r12) + leaq 8(%r12),%r12 + + decl %r14d + jnz L$OOP_SQR_1024 + + vmovdqu 256(%rsp),%ymm8 + vmovdqu 288(%rsp),%ymm1 + vmovdqu 320(%rsp),%ymm2 + leaq 192(%rsp),%rbx + + vpsrlq $29,%ymm8,%ymm14 + vpand %ymm15,%ymm8,%ymm8 + vpsrlq $29,%ymm1,%ymm11 + vpand %ymm15,%ymm1,%ymm1 + + vpermq $0x93,%ymm14,%ymm14 + vpxor %ymm9,%ymm9,%ymm9 + vpermq $0x93,%ymm11,%ymm11 + + vpblendd $3,%ymm9,%ymm14,%ymm10 + vpblendd $3,%ymm14,%ymm11,%ymm14 + vpaddq %ymm10,%ymm8,%ymm8 + vpblendd $3,%ymm11,%ymm9,%ymm11 + vpaddq %ymm14,%ymm1,%ymm1 + vpaddq %ymm11,%ymm2,%ymm2 + vmovdqu %ymm1,288-192(%rbx) + vmovdqu %ymm2,320-192(%rbx) + + movq (%rsp),%rax + movq 8(%rsp),%r10 + movq 16(%rsp),%r11 + movq 24(%rsp),%r12 + vmovdqu 32(%rsp),%ymm1 + vmovdqu 64-192(%rbx),%ymm2 + vmovdqu 96-192(%rbx),%ymm3 + vmovdqu 128-192(%rbx),%ymm4 + vmovdqu 160-192(%rbx),%ymm5 + vmovdqu 192-192(%rbx),%ymm6 + vmovdqu 224-192(%rbx),%ymm7 + + movq %rax,%r9 + imull %ecx,%eax + andl $0x1fffffff,%eax + vmovd %eax,%xmm12 + + movq %rax,%rdx + imulq -128(%r13),%rax + vpbroadcastq %xmm12,%ymm12 + addq %rax,%r9 + movq %rdx,%rax + imulq 8-128(%r13),%rax + shrq $29,%r9 + addq %rax,%r10 + movq %rdx,%rax 
+ imulq 16-128(%r13),%rax + addq %r9,%r10 + addq %rax,%r11 + imulq 24-128(%r13),%rdx + addq %rdx,%r12 + + movq %r10,%rax + imull %ecx,%eax + andl $0x1fffffff,%eax + + movl $9,%r14d + jmp L$OOP_REDUCE_1024 + +.p2align 5 +L$OOP_REDUCE_1024: + vmovd %eax,%xmm13 + vpbroadcastq %xmm13,%ymm13 + + vpmuludq 32-128(%r13),%ymm12,%ymm10 + movq %rax,%rdx + imulq -128(%r13),%rax + vpaddq %ymm10,%ymm1,%ymm1 + addq %rax,%r10 + vpmuludq 64-128(%r13),%ymm12,%ymm14 + movq %rdx,%rax + imulq 8-128(%r13),%rax + vpaddq %ymm14,%ymm2,%ymm2 + vpmuludq 96-128(%r13),%ymm12,%ymm11 +.byte 0x67 + addq %rax,%r11 +.byte 0x67 + movq %rdx,%rax + imulq 16-128(%r13),%rax + shrq $29,%r10 + vpaddq %ymm11,%ymm3,%ymm3 + vpmuludq 128-128(%r13),%ymm12,%ymm10 + addq %rax,%r12 + addq %r10,%r11 + vpaddq %ymm10,%ymm4,%ymm4 + vpmuludq 160-128(%r13),%ymm12,%ymm14 + movq %r11,%rax + imull %ecx,%eax + vpaddq %ymm14,%ymm5,%ymm5 + vpmuludq 192-128(%r13),%ymm12,%ymm11 + andl $0x1fffffff,%eax + vpaddq %ymm11,%ymm6,%ymm6 + vpmuludq 224-128(%r13),%ymm12,%ymm10 + vpaddq %ymm10,%ymm7,%ymm7 + vpmuludq 256-128(%r13),%ymm12,%ymm14 + vmovd %eax,%xmm12 + + vpaddq %ymm14,%ymm8,%ymm8 + + vpbroadcastq %xmm12,%ymm12 + + vpmuludq 32-8-128(%r13),%ymm13,%ymm11 + vmovdqu 96-8-128(%r13),%ymm14 + movq %rax,%rdx + imulq -128(%r13),%rax + vpaddq %ymm11,%ymm1,%ymm1 + vpmuludq 64-8-128(%r13),%ymm13,%ymm10 + vmovdqu 128-8-128(%r13),%ymm11 + addq %rax,%r11 + movq %rdx,%rax + imulq 8-128(%r13),%rax + vpaddq %ymm10,%ymm2,%ymm2 + addq %r12,%rax + shrq $29,%r11 + vpmuludq %ymm13,%ymm14,%ymm14 + vmovdqu 160-8-128(%r13),%ymm10 + addq %r11,%rax + vpaddq %ymm14,%ymm3,%ymm3 + vpmuludq %ymm13,%ymm11,%ymm11 + vmovdqu 192-8-128(%r13),%ymm14 +.byte 0x67 + movq %rax,%r12 + imull %ecx,%eax + vpaddq %ymm11,%ymm4,%ymm4 + vpmuludq %ymm13,%ymm10,%ymm10 +.byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00 + andl $0x1fffffff,%eax + vpaddq %ymm10,%ymm5,%ymm5 + vpmuludq %ymm13,%ymm14,%ymm14 + vmovdqu 256-8-128(%r13),%ymm10 + vpaddq %ymm14,%ymm6,%ymm6 + vpmuludq 
%ymm13,%ymm11,%ymm11 + vmovdqu 288-8-128(%r13),%ymm9 + vmovd %eax,%xmm0 + imulq -128(%r13),%rax + vpaddq %ymm11,%ymm7,%ymm7 + vpmuludq %ymm13,%ymm10,%ymm10 + vmovdqu 32-16-128(%r13),%ymm14 + vpbroadcastq %xmm0,%ymm0 + vpaddq %ymm10,%ymm8,%ymm8 + vpmuludq %ymm13,%ymm9,%ymm9 + vmovdqu 64-16-128(%r13),%ymm11 + addq %rax,%r12 + + vmovdqu 32-24-128(%r13),%ymm13 + vpmuludq %ymm12,%ymm14,%ymm14 + vmovdqu 96-16-128(%r13),%ymm10 + vpaddq %ymm14,%ymm1,%ymm1 + vpmuludq %ymm0,%ymm13,%ymm13 + vpmuludq %ymm12,%ymm11,%ymm11 +.byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff + vpaddq %ymm1,%ymm13,%ymm13 + vpaddq %ymm11,%ymm2,%ymm2 + vpmuludq %ymm12,%ymm10,%ymm10 + vmovdqu 160-16-128(%r13),%ymm11 +.byte 0x67 + vmovq %xmm13,%rax + vmovdqu %ymm13,(%rsp) + vpaddq %ymm10,%ymm3,%ymm3 + vpmuludq %ymm12,%ymm14,%ymm14 + vmovdqu 192-16-128(%r13),%ymm10 + vpaddq %ymm14,%ymm4,%ymm4 + vpmuludq %ymm12,%ymm11,%ymm11 + vmovdqu 224-16-128(%r13),%ymm14 + vpaddq %ymm11,%ymm5,%ymm5 + vpmuludq %ymm12,%ymm10,%ymm10 + vmovdqu 256-16-128(%r13),%ymm11 + vpaddq %ymm10,%ymm6,%ymm6 + vpmuludq %ymm12,%ymm14,%ymm14 + shrq $29,%r12 + vmovdqu 288-16-128(%r13),%ymm10 + addq %r12,%rax + vpaddq %ymm14,%ymm7,%ymm7 + vpmuludq %ymm12,%ymm11,%ymm11 + + movq %rax,%r9 + imull %ecx,%eax + vpaddq %ymm11,%ymm8,%ymm8 + vpmuludq %ymm12,%ymm10,%ymm10 + andl $0x1fffffff,%eax + vmovd %eax,%xmm12 + vmovdqu 96-24-128(%r13),%ymm11 +.byte 0x67 + vpaddq %ymm10,%ymm9,%ymm9 + vpbroadcastq %xmm12,%ymm12 + + vpmuludq 64-24-128(%r13),%ymm0,%ymm14 + vmovdqu 128-24-128(%r13),%ymm10 + movq %rax,%rdx + imulq -128(%r13),%rax + movq 8(%rsp),%r10 + vpaddq %ymm14,%ymm2,%ymm1 + vpmuludq %ymm0,%ymm11,%ymm11 + vmovdqu 160-24-128(%r13),%ymm14 + addq %rax,%r9 + movq %rdx,%rax + imulq 8-128(%r13),%rax +.byte 0x67 + shrq $29,%r9 + movq 16(%rsp),%r11 + vpaddq %ymm11,%ymm3,%ymm2 + vpmuludq %ymm0,%ymm10,%ymm10 + vmovdqu 192-24-128(%r13),%ymm11 + addq %rax,%r10 + movq %rdx,%rax + imulq 16-128(%r13),%rax + vpaddq %ymm10,%ymm4,%ymm3 + vpmuludq 
%ymm0,%ymm14,%ymm14 + vmovdqu 224-24-128(%r13),%ymm10 + imulq 24-128(%r13),%rdx + addq %rax,%r11 + leaq (%r9,%r10,1),%rax + vpaddq %ymm14,%ymm5,%ymm4 + vpmuludq %ymm0,%ymm11,%ymm11 + vmovdqu 256-24-128(%r13),%ymm14 + movq %rax,%r10 + imull %ecx,%eax + vpmuludq %ymm0,%ymm10,%ymm10 + vpaddq %ymm11,%ymm6,%ymm5 + vmovdqu 288-24-128(%r13),%ymm11 + andl $0x1fffffff,%eax + vpaddq %ymm10,%ymm7,%ymm6 + vpmuludq %ymm0,%ymm14,%ymm14 + addq 24(%rsp),%rdx + vpaddq %ymm14,%ymm8,%ymm7 + vpmuludq %ymm0,%ymm11,%ymm11 + vpaddq %ymm11,%ymm9,%ymm8 + vmovq %r12,%xmm9 + movq %rdx,%r12 + + decl %r14d + jnz L$OOP_REDUCE_1024 + leaq 448(%rsp),%r12 + vpaddq %ymm9,%ymm13,%ymm0 + vpxor %ymm9,%ymm9,%ymm9 + + vpaddq 288-192(%rbx),%ymm0,%ymm0 + vpaddq 320-448(%r12),%ymm1,%ymm1 + vpaddq 352-448(%r12),%ymm2,%ymm2 + vpaddq 384-448(%r12),%ymm3,%ymm3 + vpaddq 416-448(%r12),%ymm4,%ymm4 + vpaddq 448-448(%r12),%ymm5,%ymm5 + vpaddq 480-448(%r12),%ymm6,%ymm6 + vpaddq 512-448(%r12),%ymm7,%ymm7 + vpaddq 544-448(%r12),%ymm8,%ymm8 + + vpsrlq $29,%ymm0,%ymm14 + vpand %ymm15,%ymm0,%ymm0 + vpsrlq $29,%ymm1,%ymm11 + vpand %ymm15,%ymm1,%ymm1 + vpsrlq $29,%ymm2,%ymm12 + vpermq $0x93,%ymm14,%ymm14 + vpand %ymm15,%ymm2,%ymm2 + vpsrlq $29,%ymm3,%ymm13 + vpermq $0x93,%ymm11,%ymm11 + vpand %ymm15,%ymm3,%ymm3 + vpermq $0x93,%ymm12,%ymm12 + + vpblendd $3,%ymm9,%ymm14,%ymm10 + vpermq $0x93,%ymm13,%ymm13 + vpblendd $3,%ymm14,%ymm11,%ymm14 + vpaddq %ymm10,%ymm0,%ymm0 + vpblendd $3,%ymm11,%ymm12,%ymm11 + vpaddq %ymm14,%ymm1,%ymm1 + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpaddq %ymm11,%ymm2,%ymm2 + vpblendd $3,%ymm13,%ymm9,%ymm13 + vpaddq %ymm12,%ymm3,%ymm3 + vpaddq %ymm13,%ymm4,%ymm4 + + vpsrlq $29,%ymm0,%ymm14 + vpand %ymm15,%ymm0,%ymm0 + vpsrlq $29,%ymm1,%ymm11 + vpand %ymm15,%ymm1,%ymm1 + vpsrlq $29,%ymm2,%ymm12 + vpermq $0x93,%ymm14,%ymm14 + vpand %ymm15,%ymm2,%ymm2 + vpsrlq $29,%ymm3,%ymm13 + vpermq $0x93,%ymm11,%ymm11 + vpand %ymm15,%ymm3,%ymm3 + vpermq $0x93,%ymm12,%ymm12 + + vpblendd $3,%ymm9,%ymm14,%ymm10 + vpermq 
$0x93,%ymm13,%ymm13 + vpblendd $3,%ymm14,%ymm11,%ymm14 + vpaddq %ymm10,%ymm0,%ymm0 + vpblendd $3,%ymm11,%ymm12,%ymm11 + vpaddq %ymm14,%ymm1,%ymm1 + vmovdqu %ymm0,0-128(%rdi) + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpaddq %ymm11,%ymm2,%ymm2 + vmovdqu %ymm1,32-128(%rdi) + vpblendd $3,%ymm13,%ymm9,%ymm13 + vpaddq %ymm12,%ymm3,%ymm3 + vmovdqu %ymm2,64-128(%rdi) + vpaddq %ymm13,%ymm4,%ymm4 + vmovdqu %ymm3,96-128(%rdi) + vpsrlq $29,%ymm4,%ymm14 + vpand %ymm15,%ymm4,%ymm4 + vpsrlq $29,%ymm5,%ymm11 + vpand %ymm15,%ymm5,%ymm5 + vpsrlq $29,%ymm6,%ymm12 + vpermq $0x93,%ymm14,%ymm14 + vpand %ymm15,%ymm6,%ymm6 + vpsrlq $29,%ymm7,%ymm13 + vpermq $0x93,%ymm11,%ymm11 + vpand %ymm15,%ymm7,%ymm7 + vpsrlq $29,%ymm8,%ymm0 + vpermq $0x93,%ymm12,%ymm12 + vpand %ymm15,%ymm8,%ymm8 + vpermq $0x93,%ymm13,%ymm13 + + vpblendd $3,%ymm9,%ymm14,%ymm10 + vpermq $0x93,%ymm0,%ymm0 + vpblendd $3,%ymm14,%ymm11,%ymm14 + vpaddq %ymm10,%ymm4,%ymm4 + vpblendd $3,%ymm11,%ymm12,%ymm11 + vpaddq %ymm14,%ymm5,%ymm5 + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpaddq %ymm11,%ymm6,%ymm6 + vpblendd $3,%ymm13,%ymm0,%ymm13 + vpaddq %ymm12,%ymm7,%ymm7 + vpaddq %ymm13,%ymm8,%ymm8 + + vpsrlq $29,%ymm4,%ymm14 + vpand %ymm15,%ymm4,%ymm4 + vpsrlq $29,%ymm5,%ymm11 + vpand %ymm15,%ymm5,%ymm5 + vpsrlq $29,%ymm6,%ymm12 + vpermq $0x93,%ymm14,%ymm14 + vpand %ymm15,%ymm6,%ymm6 + vpsrlq $29,%ymm7,%ymm13 + vpermq $0x93,%ymm11,%ymm11 + vpand %ymm15,%ymm7,%ymm7 + vpsrlq $29,%ymm8,%ymm0 + vpermq $0x93,%ymm12,%ymm12 + vpand %ymm15,%ymm8,%ymm8 + vpermq $0x93,%ymm13,%ymm13 + + vpblendd $3,%ymm9,%ymm14,%ymm10 + vpermq $0x93,%ymm0,%ymm0 + vpblendd $3,%ymm14,%ymm11,%ymm14 + vpaddq %ymm10,%ymm4,%ymm4 + vpblendd $3,%ymm11,%ymm12,%ymm11 + vpaddq %ymm14,%ymm5,%ymm5 + vmovdqu %ymm4,128-128(%rdi) + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpaddq %ymm11,%ymm6,%ymm6 + vmovdqu %ymm5,160-128(%rdi) + vpblendd $3,%ymm13,%ymm0,%ymm13 + vpaddq %ymm12,%ymm7,%ymm7 + vmovdqu %ymm6,192-128(%rdi) + vpaddq %ymm13,%ymm8,%ymm8 + vmovdqu %ymm7,224-128(%rdi) + vmovdqu 
%ymm8,256-128(%rdi) + + movq %rdi,%rsi + decl %r8d + jne L$OOP_GRANDE_SQR_1024 + + vzeroall + movq %rbp,%rax + + movq -48(%rax),%r15 + + movq -40(%rax),%r14 + + movq -32(%rax),%r13 + + movq -24(%rax),%r12 + + movq -16(%rax),%rbp + + movq -8(%rax),%rbx + + leaq (%rax),%rsp + +L$sqr_1024_epilogue: + ret + + +.globl _rsaz_1024_mul_avx2 +.private_extern _rsaz_1024_mul_avx2 + +.p2align 6 +_rsaz_1024_mul_avx2: + +_CET_ENDBR + leaq (%rsp),%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + movq %rax,%rbp + + vzeroall + movq %rdx,%r13 + subq $64,%rsp + + + + + + +.byte 0x67,0x67 + movq %rsi,%r15 + andq $4095,%r15 + addq $320,%r15 + shrq $12,%r15 + movq %rsi,%r15 + cmovnzq %r13,%rsi + cmovnzq %r15,%r13 + + movq %rcx,%r15 + subq $-128,%rsi + subq $-128,%rcx + subq $-128,%rdi + + andq $4095,%r15 + addq $320,%r15 +.byte 0x67,0x67 + shrq $12,%r15 + jz L$mul_1024_no_n_copy + + + + + + subq $320,%rsp + vmovdqu 0-128(%rcx),%ymm0 + andq $-512,%rsp + vmovdqu 32-128(%rcx),%ymm1 + vmovdqu 64-128(%rcx),%ymm2 + vmovdqu 96-128(%rcx),%ymm3 + vmovdqu 128-128(%rcx),%ymm4 + vmovdqu 160-128(%rcx),%ymm5 + vmovdqu 192-128(%rcx),%ymm6 + vmovdqu 224-128(%rcx),%ymm7 + vmovdqu 256-128(%rcx),%ymm8 + leaq 64+128(%rsp),%rcx + vmovdqu %ymm0,0-128(%rcx) + vpxor %ymm0,%ymm0,%ymm0 + vmovdqu %ymm1,32-128(%rcx) + vpxor %ymm1,%ymm1,%ymm1 + vmovdqu %ymm2,64-128(%rcx) + vpxor %ymm2,%ymm2,%ymm2 + vmovdqu %ymm3,96-128(%rcx) + vpxor %ymm3,%ymm3,%ymm3 + vmovdqu %ymm4,128-128(%rcx) + vpxor %ymm4,%ymm4,%ymm4 + vmovdqu %ymm5,160-128(%rcx) + vpxor %ymm5,%ymm5,%ymm5 + vmovdqu %ymm6,192-128(%rcx) + vpxor %ymm6,%ymm6,%ymm6 + vmovdqu %ymm7,224-128(%rcx) + vpxor %ymm7,%ymm7,%ymm7 + vmovdqu %ymm8,256-128(%rcx) + vmovdqa %ymm0,%ymm8 + vmovdqu %ymm9,288-128(%rcx) +L$mul_1024_no_n_copy: + andq $-64,%rsp + + movq (%r13),%rbx + vpbroadcastq (%r13),%ymm10 + vmovdqu %ymm0,(%rsp) + xorq %r9,%r9 +.byte 0x67 + xorq %r10,%r10 + xorq %r11,%r11 + xorq %r12,%r12 + + vmovdqu 
L$and_mask(%rip),%ymm15 + movl $9,%r14d + vmovdqu %ymm9,288-128(%rdi) + jmp L$oop_mul_1024 + +.p2align 5 +L$oop_mul_1024: + vpsrlq $29,%ymm3,%ymm9 + movq %rbx,%rax + imulq -128(%rsi),%rax + addq %r9,%rax + movq %rbx,%r10 + imulq 8-128(%rsi),%r10 + addq 8(%rsp),%r10 + + movq %rax,%r9 + imull %r8d,%eax + andl $0x1fffffff,%eax + + movq %rbx,%r11 + imulq 16-128(%rsi),%r11 + addq 16(%rsp),%r11 + + movq %rbx,%r12 + imulq 24-128(%rsi),%r12 + addq 24(%rsp),%r12 + vpmuludq 32-128(%rsi),%ymm10,%ymm0 + vmovd %eax,%xmm11 + vpaddq %ymm0,%ymm1,%ymm1 + vpmuludq 64-128(%rsi),%ymm10,%ymm12 + vpbroadcastq %xmm11,%ymm11 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq 96-128(%rsi),%ymm10,%ymm13 + vpand %ymm15,%ymm3,%ymm3 + vpaddq %ymm13,%ymm3,%ymm3 + vpmuludq 128-128(%rsi),%ymm10,%ymm0 + vpaddq %ymm0,%ymm4,%ymm4 + vpmuludq 160-128(%rsi),%ymm10,%ymm12 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq 192-128(%rsi),%ymm10,%ymm13 + vpaddq %ymm13,%ymm6,%ymm6 + vpmuludq 224-128(%rsi),%ymm10,%ymm0 + vpermq $0x93,%ymm9,%ymm9 + vpaddq %ymm0,%ymm7,%ymm7 + vpmuludq 256-128(%rsi),%ymm10,%ymm12 + vpbroadcastq 8(%r13),%ymm10 + vpaddq %ymm12,%ymm8,%ymm8 + + movq %rax,%rdx + imulq -128(%rcx),%rax + addq %rax,%r9 + movq %rdx,%rax + imulq 8-128(%rcx),%rax + addq %rax,%r10 + movq %rdx,%rax + imulq 16-128(%rcx),%rax + addq %rax,%r11 + shrq $29,%r9 + imulq 24-128(%rcx),%rdx + addq %rdx,%r12 + addq %r9,%r10 + + vpmuludq 32-128(%rcx),%ymm11,%ymm13 + vmovq %xmm10,%rbx + vpaddq %ymm13,%ymm1,%ymm1 + vpmuludq 64-128(%rcx),%ymm11,%ymm0 + vpaddq %ymm0,%ymm2,%ymm2 + vpmuludq 96-128(%rcx),%ymm11,%ymm12 + vpaddq %ymm12,%ymm3,%ymm3 + vpmuludq 128-128(%rcx),%ymm11,%ymm13 + vpaddq %ymm13,%ymm4,%ymm4 + vpmuludq 160-128(%rcx),%ymm11,%ymm0 + vpaddq %ymm0,%ymm5,%ymm5 + vpmuludq 192-128(%rcx),%ymm11,%ymm12 + vpaddq %ymm12,%ymm6,%ymm6 + vpmuludq 224-128(%rcx),%ymm11,%ymm13 + vpblendd $3,%ymm14,%ymm9,%ymm12 + vpaddq %ymm13,%ymm7,%ymm7 + vpmuludq 256-128(%rcx),%ymm11,%ymm0 + vpaddq %ymm12,%ymm3,%ymm3 + vpaddq %ymm0,%ymm8,%ymm8 + + movq %rbx,%rax 
+ imulq -128(%rsi),%rax + addq %rax,%r10 + vmovdqu -8+32-128(%rsi),%ymm12 + movq %rbx,%rax + imulq 8-128(%rsi),%rax + addq %rax,%r11 + vmovdqu -8+64-128(%rsi),%ymm13 + + movq %r10,%rax + vpblendd $0xfc,%ymm14,%ymm9,%ymm9 + imull %r8d,%eax + vpaddq %ymm9,%ymm4,%ymm4 + andl $0x1fffffff,%eax + + imulq 16-128(%rsi),%rbx + addq %rbx,%r12 + vpmuludq %ymm10,%ymm12,%ymm12 + vmovd %eax,%xmm11 + vmovdqu -8+96-128(%rsi),%ymm0 + vpaddq %ymm12,%ymm1,%ymm1 + vpmuludq %ymm10,%ymm13,%ymm13 + vpbroadcastq %xmm11,%ymm11 + vmovdqu -8+128-128(%rsi),%ymm12 + vpaddq %ymm13,%ymm2,%ymm2 + vpmuludq %ymm10,%ymm0,%ymm0 + vmovdqu -8+160-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm3,%ymm3 + vpmuludq %ymm10,%ymm12,%ymm12 + vmovdqu -8+192-128(%rsi),%ymm0 + vpaddq %ymm12,%ymm4,%ymm4 + vpmuludq %ymm10,%ymm13,%ymm13 + vmovdqu -8+224-128(%rsi),%ymm12 + vpaddq %ymm13,%ymm5,%ymm5 + vpmuludq %ymm10,%ymm0,%ymm0 + vmovdqu -8+256-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm6,%ymm6 + vpmuludq %ymm10,%ymm12,%ymm12 + vmovdqu -8+288-128(%rsi),%ymm9 + vpaddq %ymm12,%ymm7,%ymm7 + vpmuludq %ymm10,%ymm13,%ymm13 + vpaddq %ymm13,%ymm8,%ymm8 + vpmuludq %ymm10,%ymm9,%ymm9 + vpbroadcastq 16(%r13),%ymm10 + + movq %rax,%rdx + imulq -128(%rcx),%rax + addq %rax,%r10 + vmovdqu -8+32-128(%rcx),%ymm0 + movq %rdx,%rax + imulq 8-128(%rcx),%rax + addq %rax,%r11 + vmovdqu -8+64-128(%rcx),%ymm12 + shrq $29,%r10 + imulq 16-128(%rcx),%rdx + addq %rdx,%r12 + addq %r10,%r11 + + vpmuludq %ymm11,%ymm0,%ymm0 + vmovq %xmm10,%rbx + vmovdqu -8+96-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm1,%ymm1 + vpmuludq %ymm11,%ymm12,%ymm12 + vmovdqu -8+128-128(%rcx),%ymm0 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovdqu -8+160-128(%rcx),%ymm12 + vpaddq %ymm13,%ymm3,%ymm3 + vpmuludq %ymm11,%ymm0,%ymm0 + vmovdqu -8+192-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm4,%ymm4 + vpmuludq %ymm11,%ymm12,%ymm12 + vmovdqu -8+224-128(%rcx),%ymm0 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovdqu -8+256-128(%rcx),%ymm12 + vpaddq %ymm13,%ymm6,%ymm6 + 
vpmuludq %ymm11,%ymm0,%ymm0 + vmovdqu -8+288-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm7,%ymm7 + vpmuludq %ymm11,%ymm12,%ymm12 + vpaddq %ymm12,%ymm8,%ymm8 + vpmuludq %ymm11,%ymm13,%ymm13 + vpaddq %ymm13,%ymm9,%ymm9 + + vmovdqu -16+32-128(%rsi),%ymm0 + movq %rbx,%rax + imulq -128(%rsi),%rax + addq %r11,%rax + + vmovdqu -16+64-128(%rsi),%ymm12 + movq %rax,%r11 + imull %r8d,%eax + andl $0x1fffffff,%eax + + imulq 8-128(%rsi),%rbx + addq %rbx,%r12 + vpmuludq %ymm10,%ymm0,%ymm0 + vmovd %eax,%xmm11 + vmovdqu -16+96-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm1,%ymm1 + vpmuludq %ymm10,%ymm12,%ymm12 + vpbroadcastq %xmm11,%ymm11 + vmovdqu -16+128-128(%rsi),%ymm0 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq %ymm10,%ymm13,%ymm13 + vmovdqu -16+160-128(%rsi),%ymm12 + vpaddq %ymm13,%ymm3,%ymm3 + vpmuludq %ymm10,%ymm0,%ymm0 + vmovdqu -16+192-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm4,%ymm4 + vpmuludq %ymm10,%ymm12,%ymm12 + vmovdqu -16+224-128(%rsi),%ymm0 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq %ymm10,%ymm13,%ymm13 + vmovdqu -16+256-128(%rsi),%ymm12 + vpaddq %ymm13,%ymm6,%ymm6 + vpmuludq %ymm10,%ymm0,%ymm0 + vmovdqu -16+288-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm7,%ymm7 + vpmuludq %ymm10,%ymm12,%ymm12 + vpaddq %ymm12,%ymm8,%ymm8 + vpmuludq %ymm10,%ymm13,%ymm13 + vpbroadcastq 24(%r13),%ymm10 + vpaddq %ymm13,%ymm9,%ymm9 + + vmovdqu -16+32-128(%rcx),%ymm0 + movq %rax,%rdx + imulq -128(%rcx),%rax + addq %rax,%r11 + vmovdqu -16+64-128(%rcx),%ymm12 + imulq 8-128(%rcx),%rdx + addq %rdx,%r12 + shrq $29,%r11 + + vpmuludq %ymm11,%ymm0,%ymm0 + vmovq %xmm10,%rbx + vmovdqu -16+96-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm1,%ymm1 + vpmuludq %ymm11,%ymm12,%ymm12 + vmovdqu -16+128-128(%rcx),%ymm0 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovdqu -16+160-128(%rcx),%ymm12 + vpaddq %ymm13,%ymm3,%ymm3 + vpmuludq %ymm11,%ymm0,%ymm0 + vmovdqu -16+192-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm4,%ymm4 + vpmuludq %ymm11,%ymm12,%ymm12 + vmovdqu -16+224-128(%rcx),%ymm0 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq %ymm11,%ymm13,%ymm13 
+ vmovdqu -16+256-128(%rcx),%ymm12 + vpaddq %ymm13,%ymm6,%ymm6 + vpmuludq %ymm11,%ymm0,%ymm0 + vmovdqu -16+288-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm7,%ymm7 + vpmuludq %ymm11,%ymm12,%ymm12 + vmovdqu -24+32-128(%rsi),%ymm0 + vpaddq %ymm12,%ymm8,%ymm8 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovdqu -24+64-128(%rsi),%ymm12 + vpaddq %ymm13,%ymm9,%ymm9 + + addq %r11,%r12 + imulq -128(%rsi),%rbx + addq %rbx,%r12 + + movq %r12,%rax + imull %r8d,%eax + andl $0x1fffffff,%eax + + vpmuludq %ymm10,%ymm0,%ymm0 + vmovd %eax,%xmm11 + vmovdqu -24+96-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm1,%ymm1 + vpmuludq %ymm10,%ymm12,%ymm12 + vpbroadcastq %xmm11,%ymm11 + vmovdqu -24+128-128(%rsi),%ymm0 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq %ymm10,%ymm13,%ymm13 + vmovdqu -24+160-128(%rsi),%ymm12 + vpaddq %ymm13,%ymm3,%ymm3 + vpmuludq %ymm10,%ymm0,%ymm0 + vmovdqu -24+192-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm4,%ymm4 + vpmuludq %ymm10,%ymm12,%ymm12 + vmovdqu -24+224-128(%rsi),%ymm0 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq %ymm10,%ymm13,%ymm13 + vmovdqu -24+256-128(%rsi),%ymm12 + vpaddq %ymm13,%ymm6,%ymm6 + vpmuludq %ymm10,%ymm0,%ymm0 + vmovdqu -24+288-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm7,%ymm7 + vpmuludq %ymm10,%ymm12,%ymm12 + vpaddq %ymm12,%ymm8,%ymm8 + vpmuludq %ymm10,%ymm13,%ymm13 + vpbroadcastq 32(%r13),%ymm10 + vpaddq %ymm13,%ymm9,%ymm9 + addq $32,%r13 + + vmovdqu -24+32-128(%rcx),%ymm0 + imulq -128(%rcx),%rax + addq %rax,%r12 + shrq $29,%r12 + + vmovdqu -24+64-128(%rcx),%ymm12 + vpmuludq %ymm11,%ymm0,%ymm0 + vmovq %xmm10,%rbx + vmovdqu -24+96-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm1,%ymm0 + vpmuludq %ymm11,%ymm12,%ymm12 + vmovdqu %ymm0,(%rsp) + vpaddq %ymm12,%ymm2,%ymm1 + vmovdqu -24+128-128(%rcx),%ymm0 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovdqu -24+160-128(%rcx),%ymm12 + vpaddq %ymm13,%ymm3,%ymm2 + vpmuludq %ymm11,%ymm0,%ymm0 + vmovdqu -24+192-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm4,%ymm3 + vpmuludq %ymm11,%ymm12,%ymm12 + vmovdqu -24+224-128(%rcx),%ymm0 + vpaddq %ymm12,%ymm5,%ymm4 + vpmuludq 
%ymm11,%ymm13,%ymm13 + vmovdqu -24+256-128(%rcx),%ymm12 + vpaddq %ymm13,%ymm6,%ymm5 + vpmuludq %ymm11,%ymm0,%ymm0 + vmovdqu -24+288-128(%rcx),%ymm13 + movq %r12,%r9 + vpaddq %ymm0,%ymm7,%ymm6 + vpmuludq %ymm11,%ymm12,%ymm12 + addq (%rsp),%r9 + vpaddq %ymm12,%ymm8,%ymm7 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovq %r12,%xmm12 + vpaddq %ymm13,%ymm9,%ymm8 + + decl %r14d + jnz L$oop_mul_1024 + vpaddq (%rsp),%ymm12,%ymm0 + + vpsrlq $29,%ymm0,%ymm12 + vpand %ymm15,%ymm0,%ymm0 + vpsrlq $29,%ymm1,%ymm13 + vpand %ymm15,%ymm1,%ymm1 + vpsrlq $29,%ymm2,%ymm10 + vpermq $0x93,%ymm12,%ymm12 + vpand %ymm15,%ymm2,%ymm2 + vpsrlq $29,%ymm3,%ymm11 + vpermq $0x93,%ymm13,%ymm13 + vpand %ymm15,%ymm3,%ymm3 + + vpblendd $3,%ymm14,%ymm12,%ymm9 + vpermq $0x93,%ymm10,%ymm10 + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpermq $0x93,%ymm11,%ymm11 + vpaddq %ymm9,%ymm0,%ymm0 + vpblendd $3,%ymm13,%ymm10,%ymm13 + vpaddq %ymm12,%ymm1,%ymm1 + vpblendd $3,%ymm10,%ymm11,%ymm10 + vpaddq %ymm13,%ymm2,%ymm2 + vpblendd $3,%ymm11,%ymm14,%ymm11 + vpaddq %ymm10,%ymm3,%ymm3 + vpaddq %ymm11,%ymm4,%ymm4 + + vpsrlq $29,%ymm0,%ymm12 + vpand %ymm15,%ymm0,%ymm0 + vpsrlq $29,%ymm1,%ymm13 + vpand %ymm15,%ymm1,%ymm1 + vpsrlq $29,%ymm2,%ymm10 + vpermq $0x93,%ymm12,%ymm12 + vpand %ymm15,%ymm2,%ymm2 + vpsrlq $29,%ymm3,%ymm11 + vpermq $0x93,%ymm13,%ymm13 + vpand %ymm15,%ymm3,%ymm3 + vpermq $0x93,%ymm10,%ymm10 + + vpblendd $3,%ymm14,%ymm12,%ymm9 + vpermq $0x93,%ymm11,%ymm11 + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpaddq %ymm9,%ymm0,%ymm0 + vpblendd $3,%ymm13,%ymm10,%ymm13 + vpaddq %ymm12,%ymm1,%ymm1 + vpblendd $3,%ymm10,%ymm11,%ymm10 + vpaddq %ymm13,%ymm2,%ymm2 + vpblendd $3,%ymm11,%ymm14,%ymm11 + vpaddq %ymm10,%ymm3,%ymm3 + vpaddq %ymm11,%ymm4,%ymm4 + + vmovdqu %ymm0,0-128(%rdi) + vmovdqu %ymm1,32-128(%rdi) + vmovdqu %ymm2,64-128(%rdi) + vmovdqu %ymm3,96-128(%rdi) + vpsrlq $29,%ymm4,%ymm12 + vpand %ymm15,%ymm4,%ymm4 + vpsrlq $29,%ymm5,%ymm13 + vpand %ymm15,%ymm5,%ymm5 + vpsrlq $29,%ymm6,%ymm10 + vpermq $0x93,%ymm12,%ymm12 + vpand 
%ymm15,%ymm6,%ymm6 + vpsrlq $29,%ymm7,%ymm11 + vpermq $0x93,%ymm13,%ymm13 + vpand %ymm15,%ymm7,%ymm7 + vpsrlq $29,%ymm8,%ymm0 + vpermq $0x93,%ymm10,%ymm10 + vpand %ymm15,%ymm8,%ymm8 + vpermq $0x93,%ymm11,%ymm11 + + vpblendd $3,%ymm14,%ymm12,%ymm9 + vpermq $0x93,%ymm0,%ymm0 + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpaddq %ymm9,%ymm4,%ymm4 + vpblendd $3,%ymm13,%ymm10,%ymm13 + vpaddq %ymm12,%ymm5,%ymm5 + vpblendd $3,%ymm10,%ymm11,%ymm10 + vpaddq %ymm13,%ymm6,%ymm6 + vpblendd $3,%ymm11,%ymm0,%ymm11 + vpaddq %ymm10,%ymm7,%ymm7 + vpaddq %ymm11,%ymm8,%ymm8 + + vpsrlq $29,%ymm4,%ymm12 + vpand %ymm15,%ymm4,%ymm4 + vpsrlq $29,%ymm5,%ymm13 + vpand %ymm15,%ymm5,%ymm5 + vpsrlq $29,%ymm6,%ymm10 + vpermq $0x93,%ymm12,%ymm12 + vpand %ymm15,%ymm6,%ymm6 + vpsrlq $29,%ymm7,%ymm11 + vpermq $0x93,%ymm13,%ymm13 + vpand %ymm15,%ymm7,%ymm7 + vpsrlq $29,%ymm8,%ymm0 + vpermq $0x93,%ymm10,%ymm10 + vpand %ymm15,%ymm8,%ymm8 + vpermq $0x93,%ymm11,%ymm11 + + vpblendd $3,%ymm14,%ymm12,%ymm9 + vpermq $0x93,%ymm0,%ymm0 + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpaddq %ymm9,%ymm4,%ymm4 + vpblendd $3,%ymm13,%ymm10,%ymm13 + vpaddq %ymm12,%ymm5,%ymm5 + vpblendd $3,%ymm10,%ymm11,%ymm10 + vpaddq %ymm13,%ymm6,%ymm6 + vpblendd $3,%ymm11,%ymm0,%ymm11 + vpaddq %ymm10,%ymm7,%ymm7 + vpaddq %ymm11,%ymm8,%ymm8 + + vmovdqu %ymm4,128-128(%rdi) + vmovdqu %ymm5,160-128(%rdi) + vmovdqu %ymm6,192-128(%rdi) + vmovdqu %ymm7,224-128(%rdi) + vmovdqu %ymm8,256-128(%rdi) + vzeroupper + + movq %rbp,%rax + + movq -48(%rax),%r15 + + movq -40(%rax),%r14 + + movq -32(%rax),%r13 + + movq -24(%rax),%r12 + + movq -16(%rax),%rbp + + movq -8(%rax),%rbx + + leaq (%rax),%rsp + +L$mul_1024_epilogue: + ret + + +.globl _rsaz_1024_red2norm_avx2 +.private_extern _rsaz_1024_red2norm_avx2 + +.p2align 5 +_rsaz_1024_red2norm_avx2: + +_CET_ENDBR + subq $-128,%rsi + xorq %rax,%rax + movq -128(%rsi),%r8 + movq -120(%rsi),%r9 + movq -112(%rsi),%r10 + shlq $0,%r8 + shlq $29,%r9 + movq %r10,%r11 + shlq $58,%r10 + shrq $6,%r11 + addq %r8,%rax + addq %r9,%rax + 
addq %r10,%rax + adcq $0,%r11 + movq %rax,0(%rdi) + movq %r11,%rax + movq -104(%rsi),%r8 + movq -96(%rsi),%r9 + shlq $23,%r8 + movq %r9,%r10 + shlq $52,%r9 + shrq $12,%r10 + addq %r8,%rax + addq %r9,%rax + adcq $0,%r10 + movq %rax,8(%rdi) + movq %r10,%rax + movq -88(%rsi),%r11 + movq -80(%rsi),%r8 + shlq $17,%r11 + movq %r8,%r9 + shlq $46,%r8 + shrq $18,%r9 + addq %r11,%rax + addq %r8,%rax + adcq $0,%r9 + movq %rax,16(%rdi) + movq %r9,%rax + movq -72(%rsi),%r10 + movq -64(%rsi),%r11 + shlq $11,%r10 + movq %r11,%r8 + shlq $40,%r11 + shrq $24,%r8 + addq %r10,%rax + addq %r11,%rax + adcq $0,%r8 + movq %rax,24(%rdi) + movq %r8,%rax + movq -56(%rsi),%r9 + movq -48(%rsi),%r10 + movq -40(%rsi),%r11 + shlq $5,%r9 + shlq $34,%r10 + movq %r11,%r8 + shlq $63,%r11 + shrq $1,%r8 + addq %r9,%rax + addq %r10,%rax + addq %r11,%rax + adcq $0,%r8 + movq %rax,32(%rdi) + movq %r8,%rax + movq -32(%rsi),%r9 + movq -24(%rsi),%r10 + shlq $28,%r9 + movq %r10,%r11 + shlq $57,%r10 + shrq $7,%r11 + addq %r9,%rax + addq %r10,%rax + adcq $0,%r11 + movq %rax,40(%rdi) + movq %r11,%rax + movq -16(%rsi),%r8 + movq -8(%rsi),%r9 + shlq $22,%r8 + movq %r9,%r10 + shlq $51,%r9 + shrq $13,%r10 + addq %r8,%rax + addq %r9,%rax + adcq $0,%r10 + movq %rax,48(%rdi) + movq %r10,%rax + movq 0(%rsi),%r11 + movq 8(%rsi),%r8 + shlq $16,%r11 + movq %r8,%r9 + shlq $45,%r8 + shrq $19,%r9 + addq %r11,%rax + addq %r8,%rax + adcq $0,%r9 + movq %rax,56(%rdi) + movq %r9,%rax + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + shlq $10,%r10 + movq %r11,%r8 + shlq $39,%r11 + shrq $25,%r8 + addq %r10,%rax + addq %r11,%rax + adcq $0,%r8 + movq %rax,64(%rdi) + movq %r8,%rax + movq 32(%rsi),%r9 + movq 40(%rsi),%r10 + movq 48(%rsi),%r11 + shlq $4,%r9 + shlq $33,%r10 + movq %r11,%r8 + shlq $62,%r11 + shrq $2,%r8 + addq %r9,%rax + addq %r10,%rax + addq %r11,%rax + adcq $0,%r8 + movq %rax,72(%rdi) + movq %r8,%rax + movq 56(%rsi),%r9 + movq 64(%rsi),%r10 + shlq $27,%r9 + movq %r10,%r11 + shlq $56,%r10 + shrq $8,%r11 + addq %r9,%rax + addq 
%r10,%rax + adcq $0,%r11 + movq %rax,80(%rdi) + movq %r11,%rax + movq 72(%rsi),%r8 + movq 80(%rsi),%r9 + shlq $21,%r8 + movq %r9,%r10 + shlq $50,%r9 + shrq $14,%r10 + addq %r8,%rax + addq %r9,%rax + adcq $0,%r10 + movq %rax,88(%rdi) + movq %r10,%rax + movq 88(%rsi),%r11 + movq 96(%rsi),%r8 + shlq $15,%r11 + movq %r8,%r9 + shlq $44,%r8 + shrq $20,%r9 + addq %r11,%rax + addq %r8,%rax + adcq $0,%r9 + movq %rax,96(%rdi) + movq %r9,%rax + movq 104(%rsi),%r10 + movq 112(%rsi),%r11 + shlq $9,%r10 + movq %r11,%r8 + shlq $38,%r11 + shrq $26,%r8 + addq %r10,%rax + addq %r11,%rax + adcq $0,%r8 + movq %rax,104(%rdi) + movq %r8,%rax + movq 120(%rsi),%r9 + movq 128(%rsi),%r10 + movq 136(%rsi),%r11 + shlq $3,%r9 + shlq $32,%r10 + movq %r11,%r8 + shlq $61,%r11 + shrq $3,%r8 + addq %r9,%rax + addq %r10,%rax + addq %r11,%rax + adcq $0,%r8 + movq %rax,112(%rdi) + movq %r8,%rax + movq 144(%rsi),%r9 + movq 152(%rsi),%r10 + shlq $26,%r9 + movq %r10,%r11 + shlq $55,%r10 + shrq $9,%r11 + addq %r9,%rax + addq %r10,%rax + adcq $0,%r11 + movq %rax,120(%rdi) + movq %r11,%rax + ret + + + +.globl _rsaz_1024_norm2red_avx2 +.private_extern _rsaz_1024_norm2red_avx2 + +.p2align 5 +_rsaz_1024_norm2red_avx2: + +_CET_ENDBR + subq $-128,%rdi + movq (%rsi),%r8 + movl $0x1fffffff,%eax + movq 8(%rsi),%r9 + movq %r8,%r11 + shrq $0,%r11 + andq %rax,%r11 + movq %r11,-128(%rdi) + movq %r8,%r10 + shrq $29,%r10 + andq %rax,%r10 + movq %r10,-120(%rdi) + shrdq $58,%r9,%r8 + andq %rax,%r8 + movq %r8,-112(%rdi) + movq 16(%rsi),%r10 + movq %r9,%r8 + shrq $23,%r8 + andq %rax,%r8 + movq %r8,-104(%rdi) + shrdq $52,%r10,%r9 + andq %rax,%r9 + movq %r9,-96(%rdi) + movq 24(%rsi),%r11 + movq %r10,%r9 + shrq $17,%r9 + andq %rax,%r9 + movq %r9,-88(%rdi) + shrdq $46,%r11,%r10 + andq %rax,%r10 + movq %r10,-80(%rdi) + movq 32(%rsi),%r8 + movq %r11,%r10 + shrq $11,%r10 + andq %rax,%r10 + movq %r10,-72(%rdi) + shrdq $40,%r8,%r11 + andq %rax,%r11 + movq %r11,-64(%rdi) + movq 40(%rsi),%r9 + movq %r8,%r11 + shrq $5,%r11 + andq 
%rax,%r11 + movq %r11,-56(%rdi) + movq %r8,%r10 + shrq $34,%r10 + andq %rax,%r10 + movq %r10,-48(%rdi) + shrdq $63,%r9,%r8 + andq %rax,%r8 + movq %r8,-40(%rdi) + movq 48(%rsi),%r10 + movq %r9,%r8 + shrq $28,%r8 + andq %rax,%r8 + movq %r8,-32(%rdi) + shrdq $57,%r10,%r9 + andq %rax,%r9 + movq %r9,-24(%rdi) + movq 56(%rsi),%r11 + movq %r10,%r9 + shrq $22,%r9 + andq %rax,%r9 + movq %r9,-16(%rdi) + shrdq $51,%r11,%r10 + andq %rax,%r10 + movq %r10,-8(%rdi) + movq 64(%rsi),%r8 + movq %r11,%r10 + shrq $16,%r10 + andq %rax,%r10 + movq %r10,0(%rdi) + shrdq $45,%r8,%r11 + andq %rax,%r11 + movq %r11,8(%rdi) + movq 72(%rsi),%r9 + movq %r8,%r11 + shrq $10,%r11 + andq %rax,%r11 + movq %r11,16(%rdi) + shrdq $39,%r9,%r8 + andq %rax,%r8 + movq %r8,24(%rdi) + movq 80(%rsi),%r10 + movq %r9,%r8 + shrq $4,%r8 + andq %rax,%r8 + movq %r8,32(%rdi) + movq %r9,%r11 + shrq $33,%r11 + andq %rax,%r11 + movq %r11,40(%rdi) + shrdq $62,%r10,%r9 + andq %rax,%r9 + movq %r9,48(%rdi) + movq 88(%rsi),%r11 + movq %r10,%r9 + shrq $27,%r9 + andq %rax,%r9 + movq %r9,56(%rdi) + shrdq $56,%r11,%r10 + andq %rax,%r10 + movq %r10,64(%rdi) + movq 96(%rsi),%r8 + movq %r11,%r10 + shrq $21,%r10 + andq %rax,%r10 + movq %r10,72(%rdi) + shrdq $50,%r8,%r11 + andq %rax,%r11 + movq %r11,80(%rdi) + movq 104(%rsi),%r9 + movq %r8,%r11 + shrq $15,%r11 + andq %rax,%r11 + movq %r11,88(%rdi) + shrdq $44,%r9,%r8 + andq %rax,%r8 + movq %r8,96(%rdi) + movq 112(%rsi),%r10 + movq %r9,%r8 + shrq $9,%r8 + andq %rax,%r8 + movq %r8,104(%rdi) + shrdq $38,%r10,%r9 + andq %rax,%r9 + movq %r9,112(%rdi) + movq 120(%rsi),%r11 + movq %r10,%r9 + shrq $3,%r9 + andq %rax,%r9 + movq %r9,120(%rdi) + movq %r10,%r8 + shrq $32,%r8 + andq %rax,%r8 + movq %r8,128(%rdi) + shrdq $61,%r11,%r10 + andq %rax,%r10 + movq %r10,136(%rdi) + xorq %r8,%r8 + movq %r11,%r10 + shrq $26,%r10 + andq %rax,%r10 + movq %r10,144(%rdi) + shrdq $55,%r8,%r11 + andq %rax,%r11 + movq %r11,152(%rdi) + movq %r8,160(%rdi) + movq %r8,168(%rdi) + movq %r8,176(%rdi) + movq 
%r8,184(%rdi) + ret + + +.globl _rsaz_1024_scatter5_avx2 +.private_extern _rsaz_1024_scatter5_avx2 + +.p2align 5 +_rsaz_1024_scatter5_avx2: + +_CET_ENDBR + vzeroupper + vmovdqu L$scatter_permd(%rip),%ymm5 + shll $4,%edx + leaq (%rdi,%rdx,1),%rdi + movl $9,%eax + jmp L$oop_scatter_1024 + +.p2align 5 +L$oop_scatter_1024: + vmovdqu (%rsi),%ymm0 + leaq 32(%rsi),%rsi + vpermd %ymm0,%ymm5,%ymm0 + vmovdqu %xmm0,(%rdi) + leaq 512(%rdi),%rdi + decl %eax + jnz L$oop_scatter_1024 + + vzeroupper + ret + + + +.globl _rsaz_1024_gather5_avx2 +.private_extern _rsaz_1024_gather5_avx2 + +.p2align 5 +_rsaz_1024_gather5_avx2: + +_CET_ENDBR + vzeroupper + movq %rsp,%r11 + + leaq -256(%rsp),%rsp + andq $-32,%rsp + leaq L$inc(%rip),%r10 + leaq -128(%rsp),%rax + + vmovd %edx,%xmm4 + vmovdqa (%r10),%ymm0 + vmovdqa 32(%r10),%ymm1 + vmovdqa 64(%r10),%ymm5 + vpbroadcastd %xmm4,%ymm4 + + vpaddd %ymm5,%ymm0,%ymm2 + vpcmpeqd %ymm4,%ymm0,%ymm0 + vpaddd %ymm5,%ymm1,%ymm3 + vpcmpeqd %ymm4,%ymm1,%ymm1 + vmovdqa %ymm0,0+128(%rax) + vpaddd %ymm5,%ymm2,%ymm0 + vpcmpeqd %ymm4,%ymm2,%ymm2 + vmovdqa %ymm1,32+128(%rax) + vpaddd %ymm5,%ymm3,%ymm1 + vpcmpeqd %ymm4,%ymm3,%ymm3 + vmovdqa %ymm2,64+128(%rax) + vpaddd %ymm5,%ymm0,%ymm2 + vpcmpeqd %ymm4,%ymm0,%ymm0 + vmovdqa %ymm3,96+128(%rax) + vpaddd %ymm5,%ymm1,%ymm3 + vpcmpeqd %ymm4,%ymm1,%ymm1 + vmovdqa %ymm0,128+128(%rax) + vpaddd %ymm5,%ymm2,%ymm8 + vpcmpeqd %ymm4,%ymm2,%ymm2 + vmovdqa %ymm1,160+128(%rax) + vpaddd %ymm5,%ymm3,%ymm9 + vpcmpeqd %ymm4,%ymm3,%ymm3 + vmovdqa %ymm2,192+128(%rax) + vpaddd %ymm5,%ymm8,%ymm10 + vpcmpeqd %ymm4,%ymm8,%ymm8 + vmovdqa %ymm3,224+128(%rax) + vpaddd %ymm5,%ymm9,%ymm11 + vpcmpeqd %ymm4,%ymm9,%ymm9 + vpaddd %ymm5,%ymm10,%ymm12 + vpcmpeqd %ymm4,%ymm10,%ymm10 + vpaddd %ymm5,%ymm11,%ymm13 + vpcmpeqd %ymm4,%ymm11,%ymm11 + vpaddd %ymm5,%ymm12,%ymm14 + vpcmpeqd %ymm4,%ymm12,%ymm12 + vpaddd %ymm5,%ymm13,%ymm15 + vpcmpeqd %ymm4,%ymm13,%ymm13 + vpcmpeqd %ymm4,%ymm14,%ymm14 + vpcmpeqd %ymm4,%ymm15,%ymm15 + + vmovdqa -32(%r10),%ymm7 + 
leaq 128(%rsi),%rsi + movl $9,%edx + +L$oop_gather_1024: + vmovdqa 0-128(%rsi),%ymm0 + vmovdqa 32-128(%rsi),%ymm1 + vmovdqa 64-128(%rsi),%ymm2 + vmovdqa 96-128(%rsi),%ymm3 + vpand 0+128(%rax),%ymm0,%ymm0 + vpand 32+128(%rax),%ymm1,%ymm1 + vpand 64+128(%rax),%ymm2,%ymm2 + vpor %ymm0,%ymm1,%ymm4 + vpand 96+128(%rax),%ymm3,%ymm3 + vmovdqa 128-128(%rsi),%ymm0 + vmovdqa 160-128(%rsi),%ymm1 + vpor %ymm2,%ymm3,%ymm5 + vmovdqa 192-128(%rsi),%ymm2 + vmovdqa 224-128(%rsi),%ymm3 + vpand 128+128(%rax),%ymm0,%ymm0 + vpand 160+128(%rax),%ymm1,%ymm1 + vpand 192+128(%rax),%ymm2,%ymm2 + vpor %ymm0,%ymm4,%ymm4 + vpand 224+128(%rax),%ymm3,%ymm3 + vpand 256-128(%rsi),%ymm8,%ymm0 + vpor %ymm1,%ymm5,%ymm5 + vpand 288-128(%rsi),%ymm9,%ymm1 + vpor %ymm2,%ymm4,%ymm4 + vpand 320-128(%rsi),%ymm10,%ymm2 + vpor %ymm3,%ymm5,%ymm5 + vpand 352-128(%rsi),%ymm11,%ymm3 + vpor %ymm0,%ymm4,%ymm4 + vpand 384-128(%rsi),%ymm12,%ymm0 + vpor %ymm1,%ymm5,%ymm5 + vpand 416-128(%rsi),%ymm13,%ymm1 + vpor %ymm2,%ymm4,%ymm4 + vpand 448-128(%rsi),%ymm14,%ymm2 + vpor %ymm3,%ymm5,%ymm5 + vpand 480-128(%rsi),%ymm15,%ymm3 + leaq 512(%rsi),%rsi + vpor %ymm0,%ymm4,%ymm4 + vpor %ymm1,%ymm5,%ymm5 + vpor %ymm2,%ymm4,%ymm4 + vpor %ymm3,%ymm5,%ymm5 + + vpor %ymm5,%ymm4,%ymm4 + vextracti128 $1,%ymm4,%xmm5 + vpor %xmm4,%xmm5,%xmm5 + vpermd %ymm5,%ymm7,%ymm5 + vmovdqu %ymm5,(%rdi) + leaq 32(%rdi),%rdi + decl %edx + jnz L$oop_gather_1024 + + vpxor %ymm0,%ymm0,%ymm0 + vmovdqu %ymm0,(%rdi) + vzeroupper + leaq (%r11),%rsp + + ret + +L$SEH_end_rsaz_1024_gather5: + +.section __DATA,__const +.p2align 6 +L$and_mask: +.quad 0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff +L$scatter_permd: +.long 0,2,4,6,7,7,7,7 +L$gather_permd: +.long 0,7,1,7,2,7,3,7 +L$inc: +.long 0,0,0,0, 1,1,1,1 +.long 2,2,2,2, 3,3,3,3 +.long 4,4,4,4, 4,4,4,4 +.p2align 6 +.text +#endif diff --git a/third_party/boringssl/gen/bcm/rsaz-avx2-linux.S b/third_party/boringssl/gen/bcm/rsaz-avx2-linux.S new file mode 100644 index 00000000..65a6c2e8 --- /dev/null +++ 
b/third_party/boringssl/gen/bcm/rsaz-avx2-linux.S @@ -0,0 +1,1749 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.text + +.globl rsaz_1024_sqr_avx2 +.hidden rsaz_1024_sqr_avx2 +.type rsaz_1024_sqr_avx2,@function +.align 64 +rsaz_1024_sqr_avx2: +.cfi_startproc +_CET_ENDBR + leaq (%rsp),%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + vzeroupper + movq %rax,%rbp +.cfi_def_cfa_register %rbp + movq %rdx,%r13 + subq $832,%rsp + movq %r13,%r15 + subq $-128,%rdi + subq $-128,%rsi + subq $-128,%r13 + + andq $4095,%r15 + addq $320,%r15 + shrq $12,%r15 + vpxor %ymm9,%ymm9,%ymm9 + jz .Lsqr_1024_no_n_copy + + + + + + subq $320,%rsp + vmovdqu 0-128(%r13),%ymm0 + andq $-2048,%rsp + vmovdqu 32-128(%r13),%ymm1 + vmovdqu 64-128(%r13),%ymm2 + vmovdqu 96-128(%r13),%ymm3 + vmovdqu 128-128(%r13),%ymm4 + vmovdqu 160-128(%r13),%ymm5 + vmovdqu 192-128(%r13),%ymm6 + vmovdqu 224-128(%r13),%ymm7 + vmovdqu 256-128(%r13),%ymm8 + leaq 832+128(%rsp),%r13 + vmovdqu %ymm0,0-128(%r13) + vmovdqu %ymm1,32-128(%r13) + vmovdqu %ymm2,64-128(%r13) + vmovdqu %ymm3,96-128(%r13) + vmovdqu %ymm4,128-128(%r13) + vmovdqu %ymm5,160-128(%r13) + vmovdqu %ymm6,192-128(%r13) + vmovdqu %ymm7,224-128(%r13) + vmovdqu %ymm8,256-128(%r13) + vmovdqu %ymm9,288-128(%r13) + +.Lsqr_1024_no_n_copy: + andq $-1024,%rsp + + vmovdqu 32-128(%rsi),%ymm1 + vmovdqu 64-128(%rsi),%ymm2 + vmovdqu 96-128(%rsi),%ymm3 + vmovdqu 128-128(%rsi),%ymm4 + vmovdqu 160-128(%rsi),%ymm5 + vmovdqu 192-128(%rsi),%ymm6 + vmovdqu 224-128(%rsi),%ymm7 + vmovdqu 256-128(%rsi),%ymm8 + + leaq 192(%rsp),%rbx + vmovdqu .Land_mask(%rip),%ymm15 + jmp .LOOP_GRANDE_SQR_1024 + +.align 32 +.LOOP_GRANDE_SQR_1024: 
+ leaq 576+128(%rsp),%r9 + leaq 448(%rsp),%r12 + + + + + vpaddq %ymm1,%ymm1,%ymm1 + vpbroadcastq 0-128(%rsi),%ymm10 + vpaddq %ymm2,%ymm2,%ymm2 + vmovdqa %ymm1,0-128(%r9) + vpaddq %ymm3,%ymm3,%ymm3 + vmovdqa %ymm2,32-128(%r9) + vpaddq %ymm4,%ymm4,%ymm4 + vmovdqa %ymm3,64-128(%r9) + vpaddq %ymm5,%ymm5,%ymm5 + vmovdqa %ymm4,96-128(%r9) + vpaddq %ymm6,%ymm6,%ymm6 + vmovdqa %ymm5,128-128(%r9) + vpaddq %ymm7,%ymm7,%ymm7 + vmovdqa %ymm6,160-128(%r9) + vpaddq %ymm8,%ymm8,%ymm8 + vmovdqa %ymm7,192-128(%r9) + vpxor %ymm9,%ymm9,%ymm9 + vmovdqa %ymm8,224-128(%r9) + + vpmuludq 0-128(%rsi),%ymm10,%ymm0 + vpbroadcastq 32-128(%rsi),%ymm11 + vmovdqu %ymm9,288-192(%rbx) + vpmuludq %ymm10,%ymm1,%ymm1 + vmovdqu %ymm9,320-448(%r12) + vpmuludq %ymm10,%ymm2,%ymm2 + vmovdqu %ymm9,352-448(%r12) + vpmuludq %ymm10,%ymm3,%ymm3 + vmovdqu %ymm9,384-448(%r12) + vpmuludq %ymm10,%ymm4,%ymm4 + vmovdqu %ymm9,416-448(%r12) + vpmuludq %ymm10,%ymm5,%ymm5 + vmovdqu %ymm9,448-448(%r12) + vpmuludq %ymm10,%ymm6,%ymm6 + vmovdqu %ymm9,480-448(%r12) + vpmuludq %ymm10,%ymm7,%ymm7 + vmovdqu %ymm9,512-448(%r12) + vpmuludq %ymm10,%ymm8,%ymm8 + vpbroadcastq 64-128(%rsi),%ymm10 + vmovdqu %ymm9,544-448(%r12) + + movq %rsi,%r15 + movl $4,%r14d + jmp .Lsqr_entry_1024 +.align 32 +.LOOP_SQR_1024: + vpbroadcastq 32-128(%r15),%ymm11 + vpmuludq 0-128(%rsi),%ymm10,%ymm0 + vpaddq 0-192(%rbx),%ymm0,%ymm0 + vpmuludq 0-128(%r9),%ymm10,%ymm1 + vpaddq 32-192(%rbx),%ymm1,%ymm1 + vpmuludq 32-128(%r9),%ymm10,%ymm2 + vpaddq 64-192(%rbx),%ymm2,%ymm2 + vpmuludq 64-128(%r9),%ymm10,%ymm3 + vpaddq 96-192(%rbx),%ymm3,%ymm3 + vpmuludq 96-128(%r9),%ymm10,%ymm4 + vpaddq 128-192(%rbx),%ymm4,%ymm4 + vpmuludq 128-128(%r9),%ymm10,%ymm5 + vpaddq 160-192(%rbx),%ymm5,%ymm5 + vpmuludq 160-128(%r9),%ymm10,%ymm6 + vpaddq 192-192(%rbx),%ymm6,%ymm6 + vpmuludq 192-128(%r9),%ymm10,%ymm7 + vpaddq 224-192(%rbx),%ymm7,%ymm7 + vpmuludq 224-128(%r9),%ymm10,%ymm8 + vpbroadcastq 64-128(%r15),%ymm10 + vpaddq 256-192(%rbx),%ymm8,%ymm8 +.Lsqr_entry_1024: + vmovdqu 
%ymm0,0-192(%rbx) + vmovdqu %ymm1,32-192(%rbx) + + vpmuludq 32-128(%rsi),%ymm11,%ymm12 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq 32-128(%r9),%ymm11,%ymm14 + vpaddq %ymm14,%ymm3,%ymm3 + vpmuludq 64-128(%r9),%ymm11,%ymm13 + vpaddq %ymm13,%ymm4,%ymm4 + vpmuludq 96-128(%r9),%ymm11,%ymm12 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq 128-128(%r9),%ymm11,%ymm14 + vpaddq %ymm14,%ymm6,%ymm6 + vpmuludq 160-128(%r9),%ymm11,%ymm13 + vpaddq %ymm13,%ymm7,%ymm7 + vpmuludq 192-128(%r9),%ymm11,%ymm12 + vpaddq %ymm12,%ymm8,%ymm8 + vpmuludq 224-128(%r9),%ymm11,%ymm0 + vpbroadcastq 96-128(%r15),%ymm11 + vpaddq 288-192(%rbx),%ymm0,%ymm0 + + vmovdqu %ymm2,64-192(%rbx) + vmovdqu %ymm3,96-192(%rbx) + + vpmuludq 64-128(%rsi),%ymm10,%ymm13 + vpaddq %ymm13,%ymm4,%ymm4 + vpmuludq 64-128(%r9),%ymm10,%ymm12 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq 96-128(%r9),%ymm10,%ymm14 + vpaddq %ymm14,%ymm6,%ymm6 + vpmuludq 128-128(%r9),%ymm10,%ymm13 + vpaddq %ymm13,%ymm7,%ymm7 + vpmuludq 160-128(%r9),%ymm10,%ymm12 + vpaddq %ymm12,%ymm8,%ymm8 + vpmuludq 192-128(%r9),%ymm10,%ymm14 + vpaddq %ymm14,%ymm0,%ymm0 + vpmuludq 224-128(%r9),%ymm10,%ymm1 + vpbroadcastq 128-128(%r15),%ymm10 + vpaddq 320-448(%r12),%ymm1,%ymm1 + + vmovdqu %ymm4,128-192(%rbx) + vmovdqu %ymm5,160-192(%rbx) + + vpmuludq 96-128(%rsi),%ymm11,%ymm12 + vpaddq %ymm12,%ymm6,%ymm6 + vpmuludq 96-128(%r9),%ymm11,%ymm14 + vpaddq %ymm14,%ymm7,%ymm7 + vpmuludq 128-128(%r9),%ymm11,%ymm13 + vpaddq %ymm13,%ymm8,%ymm8 + vpmuludq 160-128(%r9),%ymm11,%ymm12 + vpaddq %ymm12,%ymm0,%ymm0 + vpmuludq 192-128(%r9),%ymm11,%ymm14 + vpaddq %ymm14,%ymm1,%ymm1 + vpmuludq 224-128(%r9),%ymm11,%ymm2 + vpbroadcastq 160-128(%r15),%ymm11 + vpaddq 352-448(%r12),%ymm2,%ymm2 + + vmovdqu %ymm6,192-192(%rbx) + vmovdqu %ymm7,224-192(%rbx) + + vpmuludq 128-128(%rsi),%ymm10,%ymm12 + vpaddq %ymm12,%ymm8,%ymm8 + vpmuludq 128-128(%r9),%ymm10,%ymm14 + vpaddq %ymm14,%ymm0,%ymm0 + vpmuludq 160-128(%r9),%ymm10,%ymm13 + vpaddq %ymm13,%ymm1,%ymm1 + vpmuludq 192-128(%r9),%ymm10,%ymm12 + vpaddq 
%ymm12,%ymm2,%ymm2 + vpmuludq 224-128(%r9),%ymm10,%ymm3 + vpbroadcastq 192-128(%r15),%ymm10 + vpaddq 384-448(%r12),%ymm3,%ymm3 + + vmovdqu %ymm8,256-192(%rbx) + vmovdqu %ymm0,288-192(%rbx) + leaq 8(%rbx),%rbx + + vpmuludq 160-128(%rsi),%ymm11,%ymm13 + vpaddq %ymm13,%ymm1,%ymm1 + vpmuludq 160-128(%r9),%ymm11,%ymm12 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq 192-128(%r9),%ymm11,%ymm14 + vpaddq %ymm14,%ymm3,%ymm3 + vpmuludq 224-128(%r9),%ymm11,%ymm4 + vpbroadcastq 224-128(%r15),%ymm11 + vpaddq 416-448(%r12),%ymm4,%ymm4 + + vmovdqu %ymm1,320-448(%r12) + vmovdqu %ymm2,352-448(%r12) + + vpmuludq 192-128(%rsi),%ymm10,%ymm12 + vpaddq %ymm12,%ymm3,%ymm3 + vpmuludq 192-128(%r9),%ymm10,%ymm14 + vpbroadcastq 256-128(%r15),%ymm0 + vpaddq %ymm14,%ymm4,%ymm4 + vpmuludq 224-128(%r9),%ymm10,%ymm5 + vpbroadcastq 0+8-128(%r15),%ymm10 + vpaddq 448-448(%r12),%ymm5,%ymm5 + + vmovdqu %ymm3,384-448(%r12) + vmovdqu %ymm4,416-448(%r12) + leaq 8(%r15),%r15 + + vpmuludq 224-128(%rsi),%ymm11,%ymm12 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq 224-128(%r9),%ymm11,%ymm6 + vpaddq 480-448(%r12),%ymm6,%ymm6 + + vpmuludq 256-128(%rsi),%ymm0,%ymm7 + vmovdqu %ymm5,448-448(%r12) + vpaddq 512-448(%r12),%ymm7,%ymm7 + vmovdqu %ymm6,480-448(%r12) + vmovdqu %ymm7,512-448(%r12) + leaq 8(%r12),%r12 + + decl %r14d + jnz .LOOP_SQR_1024 + + vmovdqu 256(%rsp),%ymm8 + vmovdqu 288(%rsp),%ymm1 + vmovdqu 320(%rsp),%ymm2 + leaq 192(%rsp),%rbx + + vpsrlq $29,%ymm8,%ymm14 + vpand %ymm15,%ymm8,%ymm8 + vpsrlq $29,%ymm1,%ymm11 + vpand %ymm15,%ymm1,%ymm1 + + vpermq $0x93,%ymm14,%ymm14 + vpxor %ymm9,%ymm9,%ymm9 + vpermq $0x93,%ymm11,%ymm11 + + vpblendd $3,%ymm9,%ymm14,%ymm10 + vpblendd $3,%ymm14,%ymm11,%ymm14 + vpaddq %ymm10,%ymm8,%ymm8 + vpblendd $3,%ymm11,%ymm9,%ymm11 + vpaddq %ymm14,%ymm1,%ymm1 + vpaddq %ymm11,%ymm2,%ymm2 + vmovdqu %ymm1,288-192(%rbx) + vmovdqu %ymm2,320-192(%rbx) + + movq (%rsp),%rax + movq 8(%rsp),%r10 + movq 16(%rsp),%r11 + movq 24(%rsp),%r12 + vmovdqu 32(%rsp),%ymm1 + vmovdqu 64-192(%rbx),%ymm2 + vmovdqu 
96-192(%rbx),%ymm3 + vmovdqu 128-192(%rbx),%ymm4 + vmovdqu 160-192(%rbx),%ymm5 + vmovdqu 192-192(%rbx),%ymm6 + vmovdqu 224-192(%rbx),%ymm7 + + movq %rax,%r9 + imull %ecx,%eax + andl $0x1fffffff,%eax + vmovd %eax,%xmm12 + + movq %rax,%rdx + imulq -128(%r13),%rax + vpbroadcastq %xmm12,%ymm12 + addq %rax,%r9 + movq %rdx,%rax + imulq 8-128(%r13),%rax + shrq $29,%r9 + addq %rax,%r10 + movq %rdx,%rax + imulq 16-128(%r13),%rax + addq %r9,%r10 + addq %rax,%r11 + imulq 24-128(%r13),%rdx + addq %rdx,%r12 + + movq %r10,%rax + imull %ecx,%eax + andl $0x1fffffff,%eax + + movl $9,%r14d + jmp .LOOP_REDUCE_1024 + +.align 32 +.LOOP_REDUCE_1024: + vmovd %eax,%xmm13 + vpbroadcastq %xmm13,%ymm13 + + vpmuludq 32-128(%r13),%ymm12,%ymm10 + movq %rax,%rdx + imulq -128(%r13),%rax + vpaddq %ymm10,%ymm1,%ymm1 + addq %rax,%r10 + vpmuludq 64-128(%r13),%ymm12,%ymm14 + movq %rdx,%rax + imulq 8-128(%r13),%rax + vpaddq %ymm14,%ymm2,%ymm2 + vpmuludq 96-128(%r13),%ymm12,%ymm11 +.byte 0x67 + addq %rax,%r11 +.byte 0x67 + movq %rdx,%rax + imulq 16-128(%r13),%rax + shrq $29,%r10 + vpaddq %ymm11,%ymm3,%ymm3 + vpmuludq 128-128(%r13),%ymm12,%ymm10 + addq %rax,%r12 + addq %r10,%r11 + vpaddq %ymm10,%ymm4,%ymm4 + vpmuludq 160-128(%r13),%ymm12,%ymm14 + movq %r11,%rax + imull %ecx,%eax + vpaddq %ymm14,%ymm5,%ymm5 + vpmuludq 192-128(%r13),%ymm12,%ymm11 + andl $0x1fffffff,%eax + vpaddq %ymm11,%ymm6,%ymm6 + vpmuludq 224-128(%r13),%ymm12,%ymm10 + vpaddq %ymm10,%ymm7,%ymm7 + vpmuludq 256-128(%r13),%ymm12,%ymm14 + vmovd %eax,%xmm12 + + vpaddq %ymm14,%ymm8,%ymm8 + + vpbroadcastq %xmm12,%ymm12 + + vpmuludq 32-8-128(%r13),%ymm13,%ymm11 + vmovdqu 96-8-128(%r13),%ymm14 + movq %rax,%rdx + imulq -128(%r13),%rax + vpaddq %ymm11,%ymm1,%ymm1 + vpmuludq 64-8-128(%r13),%ymm13,%ymm10 + vmovdqu 128-8-128(%r13),%ymm11 + addq %rax,%r11 + movq %rdx,%rax + imulq 8-128(%r13),%rax + vpaddq %ymm10,%ymm2,%ymm2 + addq %r12,%rax + shrq $29,%r11 + vpmuludq %ymm13,%ymm14,%ymm14 + vmovdqu 160-8-128(%r13),%ymm10 + addq %r11,%rax + vpaddq 
%ymm14,%ymm3,%ymm3 + vpmuludq %ymm13,%ymm11,%ymm11 + vmovdqu 192-8-128(%r13),%ymm14 +.byte 0x67 + movq %rax,%r12 + imull %ecx,%eax + vpaddq %ymm11,%ymm4,%ymm4 + vpmuludq %ymm13,%ymm10,%ymm10 +.byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00 + andl $0x1fffffff,%eax + vpaddq %ymm10,%ymm5,%ymm5 + vpmuludq %ymm13,%ymm14,%ymm14 + vmovdqu 256-8-128(%r13),%ymm10 + vpaddq %ymm14,%ymm6,%ymm6 + vpmuludq %ymm13,%ymm11,%ymm11 + vmovdqu 288-8-128(%r13),%ymm9 + vmovd %eax,%xmm0 + imulq -128(%r13),%rax + vpaddq %ymm11,%ymm7,%ymm7 + vpmuludq %ymm13,%ymm10,%ymm10 + vmovdqu 32-16-128(%r13),%ymm14 + vpbroadcastq %xmm0,%ymm0 + vpaddq %ymm10,%ymm8,%ymm8 + vpmuludq %ymm13,%ymm9,%ymm9 + vmovdqu 64-16-128(%r13),%ymm11 + addq %rax,%r12 + + vmovdqu 32-24-128(%r13),%ymm13 + vpmuludq %ymm12,%ymm14,%ymm14 + vmovdqu 96-16-128(%r13),%ymm10 + vpaddq %ymm14,%ymm1,%ymm1 + vpmuludq %ymm0,%ymm13,%ymm13 + vpmuludq %ymm12,%ymm11,%ymm11 +.byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff + vpaddq %ymm1,%ymm13,%ymm13 + vpaddq %ymm11,%ymm2,%ymm2 + vpmuludq %ymm12,%ymm10,%ymm10 + vmovdqu 160-16-128(%r13),%ymm11 +.byte 0x67 + vmovq %xmm13,%rax + vmovdqu %ymm13,(%rsp) + vpaddq %ymm10,%ymm3,%ymm3 + vpmuludq %ymm12,%ymm14,%ymm14 + vmovdqu 192-16-128(%r13),%ymm10 + vpaddq %ymm14,%ymm4,%ymm4 + vpmuludq %ymm12,%ymm11,%ymm11 + vmovdqu 224-16-128(%r13),%ymm14 + vpaddq %ymm11,%ymm5,%ymm5 + vpmuludq %ymm12,%ymm10,%ymm10 + vmovdqu 256-16-128(%r13),%ymm11 + vpaddq %ymm10,%ymm6,%ymm6 + vpmuludq %ymm12,%ymm14,%ymm14 + shrq $29,%r12 + vmovdqu 288-16-128(%r13),%ymm10 + addq %r12,%rax + vpaddq %ymm14,%ymm7,%ymm7 + vpmuludq %ymm12,%ymm11,%ymm11 + + movq %rax,%r9 + imull %ecx,%eax + vpaddq %ymm11,%ymm8,%ymm8 + vpmuludq %ymm12,%ymm10,%ymm10 + andl $0x1fffffff,%eax + vmovd %eax,%xmm12 + vmovdqu 96-24-128(%r13),%ymm11 +.byte 0x67 + vpaddq %ymm10,%ymm9,%ymm9 + vpbroadcastq %xmm12,%ymm12 + + vpmuludq 64-24-128(%r13),%ymm0,%ymm14 + vmovdqu 128-24-128(%r13),%ymm10 + movq %rax,%rdx + imulq -128(%r13),%rax + movq 8(%rsp),%r10 + 
vpaddq %ymm14,%ymm2,%ymm1 + vpmuludq %ymm0,%ymm11,%ymm11 + vmovdqu 160-24-128(%r13),%ymm14 + addq %rax,%r9 + movq %rdx,%rax + imulq 8-128(%r13),%rax +.byte 0x67 + shrq $29,%r9 + movq 16(%rsp),%r11 + vpaddq %ymm11,%ymm3,%ymm2 + vpmuludq %ymm0,%ymm10,%ymm10 + vmovdqu 192-24-128(%r13),%ymm11 + addq %rax,%r10 + movq %rdx,%rax + imulq 16-128(%r13),%rax + vpaddq %ymm10,%ymm4,%ymm3 + vpmuludq %ymm0,%ymm14,%ymm14 + vmovdqu 224-24-128(%r13),%ymm10 + imulq 24-128(%r13),%rdx + addq %rax,%r11 + leaq (%r9,%r10,1),%rax + vpaddq %ymm14,%ymm5,%ymm4 + vpmuludq %ymm0,%ymm11,%ymm11 + vmovdqu 256-24-128(%r13),%ymm14 + movq %rax,%r10 + imull %ecx,%eax + vpmuludq %ymm0,%ymm10,%ymm10 + vpaddq %ymm11,%ymm6,%ymm5 + vmovdqu 288-24-128(%r13),%ymm11 + andl $0x1fffffff,%eax + vpaddq %ymm10,%ymm7,%ymm6 + vpmuludq %ymm0,%ymm14,%ymm14 + addq 24(%rsp),%rdx + vpaddq %ymm14,%ymm8,%ymm7 + vpmuludq %ymm0,%ymm11,%ymm11 + vpaddq %ymm11,%ymm9,%ymm8 + vmovq %r12,%xmm9 + movq %rdx,%r12 + + decl %r14d + jnz .LOOP_REDUCE_1024 + leaq 448(%rsp),%r12 + vpaddq %ymm9,%ymm13,%ymm0 + vpxor %ymm9,%ymm9,%ymm9 + + vpaddq 288-192(%rbx),%ymm0,%ymm0 + vpaddq 320-448(%r12),%ymm1,%ymm1 + vpaddq 352-448(%r12),%ymm2,%ymm2 + vpaddq 384-448(%r12),%ymm3,%ymm3 + vpaddq 416-448(%r12),%ymm4,%ymm4 + vpaddq 448-448(%r12),%ymm5,%ymm5 + vpaddq 480-448(%r12),%ymm6,%ymm6 + vpaddq 512-448(%r12),%ymm7,%ymm7 + vpaddq 544-448(%r12),%ymm8,%ymm8 + + vpsrlq $29,%ymm0,%ymm14 + vpand %ymm15,%ymm0,%ymm0 + vpsrlq $29,%ymm1,%ymm11 + vpand %ymm15,%ymm1,%ymm1 + vpsrlq $29,%ymm2,%ymm12 + vpermq $0x93,%ymm14,%ymm14 + vpand %ymm15,%ymm2,%ymm2 + vpsrlq $29,%ymm3,%ymm13 + vpermq $0x93,%ymm11,%ymm11 + vpand %ymm15,%ymm3,%ymm3 + vpermq $0x93,%ymm12,%ymm12 + + vpblendd $3,%ymm9,%ymm14,%ymm10 + vpermq $0x93,%ymm13,%ymm13 + vpblendd $3,%ymm14,%ymm11,%ymm14 + vpaddq %ymm10,%ymm0,%ymm0 + vpblendd $3,%ymm11,%ymm12,%ymm11 + vpaddq %ymm14,%ymm1,%ymm1 + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpaddq %ymm11,%ymm2,%ymm2 + vpblendd $3,%ymm13,%ymm9,%ymm13 + vpaddq 
%ymm12,%ymm3,%ymm3 + vpaddq %ymm13,%ymm4,%ymm4 + + vpsrlq $29,%ymm0,%ymm14 + vpand %ymm15,%ymm0,%ymm0 + vpsrlq $29,%ymm1,%ymm11 + vpand %ymm15,%ymm1,%ymm1 + vpsrlq $29,%ymm2,%ymm12 + vpermq $0x93,%ymm14,%ymm14 + vpand %ymm15,%ymm2,%ymm2 + vpsrlq $29,%ymm3,%ymm13 + vpermq $0x93,%ymm11,%ymm11 + vpand %ymm15,%ymm3,%ymm3 + vpermq $0x93,%ymm12,%ymm12 + + vpblendd $3,%ymm9,%ymm14,%ymm10 + vpermq $0x93,%ymm13,%ymm13 + vpblendd $3,%ymm14,%ymm11,%ymm14 + vpaddq %ymm10,%ymm0,%ymm0 + vpblendd $3,%ymm11,%ymm12,%ymm11 + vpaddq %ymm14,%ymm1,%ymm1 + vmovdqu %ymm0,0-128(%rdi) + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpaddq %ymm11,%ymm2,%ymm2 + vmovdqu %ymm1,32-128(%rdi) + vpblendd $3,%ymm13,%ymm9,%ymm13 + vpaddq %ymm12,%ymm3,%ymm3 + vmovdqu %ymm2,64-128(%rdi) + vpaddq %ymm13,%ymm4,%ymm4 + vmovdqu %ymm3,96-128(%rdi) + vpsrlq $29,%ymm4,%ymm14 + vpand %ymm15,%ymm4,%ymm4 + vpsrlq $29,%ymm5,%ymm11 + vpand %ymm15,%ymm5,%ymm5 + vpsrlq $29,%ymm6,%ymm12 + vpermq $0x93,%ymm14,%ymm14 + vpand %ymm15,%ymm6,%ymm6 + vpsrlq $29,%ymm7,%ymm13 + vpermq $0x93,%ymm11,%ymm11 + vpand %ymm15,%ymm7,%ymm7 + vpsrlq $29,%ymm8,%ymm0 + vpermq $0x93,%ymm12,%ymm12 + vpand %ymm15,%ymm8,%ymm8 + vpermq $0x93,%ymm13,%ymm13 + + vpblendd $3,%ymm9,%ymm14,%ymm10 + vpermq $0x93,%ymm0,%ymm0 + vpblendd $3,%ymm14,%ymm11,%ymm14 + vpaddq %ymm10,%ymm4,%ymm4 + vpblendd $3,%ymm11,%ymm12,%ymm11 + vpaddq %ymm14,%ymm5,%ymm5 + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpaddq %ymm11,%ymm6,%ymm6 + vpblendd $3,%ymm13,%ymm0,%ymm13 + vpaddq %ymm12,%ymm7,%ymm7 + vpaddq %ymm13,%ymm8,%ymm8 + + vpsrlq $29,%ymm4,%ymm14 + vpand %ymm15,%ymm4,%ymm4 + vpsrlq $29,%ymm5,%ymm11 + vpand %ymm15,%ymm5,%ymm5 + vpsrlq $29,%ymm6,%ymm12 + vpermq $0x93,%ymm14,%ymm14 + vpand %ymm15,%ymm6,%ymm6 + vpsrlq $29,%ymm7,%ymm13 + vpermq $0x93,%ymm11,%ymm11 + vpand %ymm15,%ymm7,%ymm7 + vpsrlq $29,%ymm8,%ymm0 + vpermq $0x93,%ymm12,%ymm12 + vpand %ymm15,%ymm8,%ymm8 + vpermq $0x93,%ymm13,%ymm13 + + vpblendd $3,%ymm9,%ymm14,%ymm10 + vpermq $0x93,%ymm0,%ymm0 + vpblendd 
$3,%ymm14,%ymm11,%ymm14 + vpaddq %ymm10,%ymm4,%ymm4 + vpblendd $3,%ymm11,%ymm12,%ymm11 + vpaddq %ymm14,%ymm5,%ymm5 + vmovdqu %ymm4,128-128(%rdi) + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpaddq %ymm11,%ymm6,%ymm6 + vmovdqu %ymm5,160-128(%rdi) + vpblendd $3,%ymm13,%ymm0,%ymm13 + vpaddq %ymm12,%ymm7,%ymm7 + vmovdqu %ymm6,192-128(%rdi) + vpaddq %ymm13,%ymm8,%ymm8 + vmovdqu %ymm7,224-128(%rdi) + vmovdqu %ymm8,256-128(%rdi) + + movq %rdi,%rsi + decl %r8d + jne .LOOP_GRANDE_SQR_1024 + + vzeroall + movq %rbp,%rax +.cfi_def_cfa_register %rax + movq -48(%rax),%r15 +.cfi_restore %r15 + movq -40(%rax),%r14 +.cfi_restore %r14 + movq -32(%rax),%r13 +.cfi_restore %r13 + movq -24(%rax),%r12 +.cfi_restore %r12 + movq -16(%rax),%rbp +.cfi_restore %rbp + movq -8(%rax),%rbx +.cfi_restore %rbx + leaq (%rax),%rsp +.cfi_def_cfa_register %rsp +.Lsqr_1024_epilogue: + ret +.cfi_endproc +.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2 +.globl rsaz_1024_mul_avx2 +.hidden rsaz_1024_mul_avx2 +.type rsaz_1024_mul_avx2,@function +.align 64 +rsaz_1024_mul_avx2: +.cfi_startproc +_CET_ENDBR + leaq (%rsp),%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + movq %rax,%rbp +.cfi_def_cfa_register %rbp + vzeroall + movq %rdx,%r13 + subq $64,%rsp + + + + + + +.byte 0x67,0x67 + movq %rsi,%r15 + andq $4095,%r15 + addq $320,%r15 + shrq $12,%r15 + movq %rsi,%r15 + cmovnzq %r13,%rsi + cmovnzq %r15,%r13 + + movq %rcx,%r15 + subq $-128,%rsi + subq $-128,%rcx + subq $-128,%rdi + + andq $4095,%r15 + addq $320,%r15 +.byte 0x67,0x67 + shrq $12,%r15 + jz .Lmul_1024_no_n_copy + + + + + + subq $320,%rsp + vmovdqu 0-128(%rcx),%ymm0 + andq $-512,%rsp + vmovdqu 32-128(%rcx),%ymm1 + vmovdqu 64-128(%rcx),%ymm2 + vmovdqu 96-128(%rcx),%ymm3 + vmovdqu 128-128(%rcx),%ymm4 + vmovdqu 160-128(%rcx),%ymm5 + vmovdqu 192-128(%rcx),%ymm6 + vmovdqu 
224-128(%rcx),%ymm7 + vmovdqu 256-128(%rcx),%ymm8 + leaq 64+128(%rsp),%rcx + vmovdqu %ymm0,0-128(%rcx) + vpxor %ymm0,%ymm0,%ymm0 + vmovdqu %ymm1,32-128(%rcx) + vpxor %ymm1,%ymm1,%ymm1 + vmovdqu %ymm2,64-128(%rcx) + vpxor %ymm2,%ymm2,%ymm2 + vmovdqu %ymm3,96-128(%rcx) + vpxor %ymm3,%ymm3,%ymm3 + vmovdqu %ymm4,128-128(%rcx) + vpxor %ymm4,%ymm4,%ymm4 + vmovdqu %ymm5,160-128(%rcx) + vpxor %ymm5,%ymm5,%ymm5 + vmovdqu %ymm6,192-128(%rcx) + vpxor %ymm6,%ymm6,%ymm6 + vmovdqu %ymm7,224-128(%rcx) + vpxor %ymm7,%ymm7,%ymm7 + vmovdqu %ymm8,256-128(%rcx) + vmovdqa %ymm0,%ymm8 + vmovdqu %ymm9,288-128(%rcx) +.Lmul_1024_no_n_copy: + andq $-64,%rsp + + movq (%r13),%rbx + vpbroadcastq (%r13),%ymm10 + vmovdqu %ymm0,(%rsp) + xorq %r9,%r9 +.byte 0x67 + xorq %r10,%r10 + xorq %r11,%r11 + xorq %r12,%r12 + + vmovdqu .Land_mask(%rip),%ymm15 + movl $9,%r14d + vmovdqu %ymm9,288-128(%rdi) + jmp .Loop_mul_1024 + +.align 32 +.Loop_mul_1024: + vpsrlq $29,%ymm3,%ymm9 + movq %rbx,%rax + imulq -128(%rsi),%rax + addq %r9,%rax + movq %rbx,%r10 + imulq 8-128(%rsi),%r10 + addq 8(%rsp),%r10 + + movq %rax,%r9 + imull %r8d,%eax + andl $0x1fffffff,%eax + + movq %rbx,%r11 + imulq 16-128(%rsi),%r11 + addq 16(%rsp),%r11 + + movq %rbx,%r12 + imulq 24-128(%rsi),%r12 + addq 24(%rsp),%r12 + vpmuludq 32-128(%rsi),%ymm10,%ymm0 + vmovd %eax,%xmm11 + vpaddq %ymm0,%ymm1,%ymm1 + vpmuludq 64-128(%rsi),%ymm10,%ymm12 + vpbroadcastq %xmm11,%ymm11 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq 96-128(%rsi),%ymm10,%ymm13 + vpand %ymm15,%ymm3,%ymm3 + vpaddq %ymm13,%ymm3,%ymm3 + vpmuludq 128-128(%rsi),%ymm10,%ymm0 + vpaddq %ymm0,%ymm4,%ymm4 + vpmuludq 160-128(%rsi),%ymm10,%ymm12 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq 192-128(%rsi),%ymm10,%ymm13 + vpaddq %ymm13,%ymm6,%ymm6 + vpmuludq 224-128(%rsi),%ymm10,%ymm0 + vpermq $0x93,%ymm9,%ymm9 + vpaddq %ymm0,%ymm7,%ymm7 + vpmuludq 256-128(%rsi),%ymm10,%ymm12 + vpbroadcastq 8(%r13),%ymm10 + vpaddq %ymm12,%ymm8,%ymm8 + + movq %rax,%rdx + imulq -128(%rcx),%rax + addq %rax,%r9 + movq %rdx,%rax + 
imulq 8-128(%rcx),%rax + addq %rax,%r10 + movq %rdx,%rax + imulq 16-128(%rcx),%rax + addq %rax,%r11 + shrq $29,%r9 + imulq 24-128(%rcx),%rdx + addq %rdx,%r12 + addq %r9,%r10 + + vpmuludq 32-128(%rcx),%ymm11,%ymm13 + vmovq %xmm10,%rbx + vpaddq %ymm13,%ymm1,%ymm1 + vpmuludq 64-128(%rcx),%ymm11,%ymm0 + vpaddq %ymm0,%ymm2,%ymm2 + vpmuludq 96-128(%rcx),%ymm11,%ymm12 + vpaddq %ymm12,%ymm3,%ymm3 + vpmuludq 128-128(%rcx),%ymm11,%ymm13 + vpaddq %ymm13,%ymm4,%ymm4 + vpmuludq 160-128(%rcx),%ymm11,%ymm0 + vpaddq %ymm0,%ymm5,%ymm5 + vpmuludq 192-128(%rcx),%ymm11,%ymm12 + vpaddq %ymm12,%ymm6,%ymm6 + vpmuludq 224-128(%rcx),%ymm11,%ymm13 + vpblendd $3,%ymm14,%ymm9,%ymm12 + vpaddq %ymm13,%ymm7,%ymm7 + vpmuludq 256-128(%rcx),%ymm11,%ymm0 + vpaddq %ymm12,%ymm3,%ymm3 + vpaddq %ymm0,%ymm8,%ymm8 + + movq %rbx,%rax + imulq -128(%rsi),%rax + addq %rax,%r10 + vmovdqu -8+32-128(%rsi),%ymm12 + movq %rbx,%rax + imulq 8-128(%rsi),%rax + addq %rax,%r11 + vmovdqu -8+64-128(%rsi),%ymm13 + + movq %r10,%rax + vpblendd $0xfc,%ymm14,%ymm9,%ymm9 + imull %r8d,%eax + vpaddq %ymm9,%ymm4,%ymm4 + andl $0x1fffffff,%eax + + imulq 16-128(%rsi),%rbx + addq %rbx,%r12 + vpmuludq %ymm10,%ymm12,%ymm12 + vmovd %eax,%xmm11 + vmovdqu -8+96-128(%rsi),%ymm0 + vpaddq %ymm12,%ymm1,%ymm1 + vpmuludq %ymm10,%ymm13,%ymm13 + vpbroadcastq %xmm11,%ymm11 + vmovdqu -8+128-128(%rsi),%ymm12 + vpaddq %ymm13,%ymm2,%ymm2 + vpmuludq %ymm10,%ymm0,%ymm0 + vmovdqu -8+160-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm3,%ymm3 + vpmuludq %ymm10,%ymm12,%ymm12 + vmovdqu -8+192-128(%rsi),%ymm0 + vpaddq %ymm12,%ymm4,%ymm4 + vpmuludq %ymm10,%ymm13,%ymm13 + vmovdqu -8+224-128(%rsi),%ymm12 + vpaddq %ymm13,%ymm5,%ymm5 + vpmuludq %ymm10,%ymm0,%ymm0 + vmovdqu -8+256-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm6,%ymm6 + vpmuludq %ymm10,%ymm12,%ymm12 + vmovdqu -8+288-128(%rsi),%ymm9 + vpaddq %ymm12,%ymm7,%ymm7 + vpmuludq %ymm10,%ymm13,%ymm13 + vpaddq %ymm13,%ymm8,%ymm8 + vpmuludq %ymm10,%ymm9,%ymm9 + vpbroadcastq 16(%r13),%ymm10 + + movq %rax,%rdx + imulq -128(%rcx),%rax 
+ addq %rax,%r10 + vmovdqu -8+32-128(%rcx),%ymm0 + movq %rdx,%rax + imulq 8-128(%rcx),%rax + addq %rax,%r11 + vmovdqu -8+64-128(%rcx),%ymm12 + shrq $29,%r10 + imulq 16-128(%rcx),%rdx + addq %rdx,%r12 + addq %r10,%r11 + + vpmuludq %ymm11,%ymm0,%ymm0 + vmovq %xmm10,%rbx + vmovdqu -8+96-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm1,%ymm1 + vpmuludq %ymm11,%ymm12,%ymm12 + vmovdqu -8+128-128(%rcx),%ymm0 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovdqu -8+160-128(%rcx),%ymm12 + vpaddq %ymm13,%ymm3,%ymm3 + vpmuludq %ymm11,%ymm0,%ymm0 + vmovdqu -8+192-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm4,%ymm4 + vpmuludq %ymm11,%ymm12,%ymm12 + vmovdqu -8+224-128(%rcx),%ymm0 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovdqu -8+256-128(%rcx),%ymm12 + vpaddq %ymm13,%ymm6,%ymm6 + vpmuludq %ymm11,%ymm0,%ymm0 + vmovdqu -8+288-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm7,%ymm7 + vpmuludq %ymm11,%ymm12,%ymm12 + vpaddq %ymm12,%ymm8,%ymm8 + vpmuludq %ymm11,%ymm13,%ymm13 + vpaddq %ymm13,%ymm9,%ymm9 + + vmovdqu -16+32-128(%rsi),%ymm0 + movq %rbx,%rax + imulq -128(%rsi),%rax + addq %r11,%rax + + vmovdqu -16+64-128(%rsi),%ymm12 + movq %rax,%r11 + imull %r8d,%eax + andl $0x1fffffff,%eax + + imulq 8-128(%rsi),%rbx + addq %rbx,%r12 + vpmuludq %ymm10,%ymm0,%ymm0 + vmovd %eax,%xmm11 + vmovdqu -16+96-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm1,%ymm1 + vpmuludq %ymm10,%ymm12,%ymm12 + vpbroadcastq %xmm11,%ymm11 + vmovdqu -16+128-128(%rsi),%ymm0 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq %ymm10,%ymm13,%ymm13 + vmovdqu -16+160-128(%rsi),%ymm12 + vpaddq %ymm13,%ymm3,%ymm3 + vpmuludq %ymm10,%ymm0,%ymm0 + vmovdqu -16+192-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm4,%ymm4 + vpmuludq %ymm10,%ymm12,%ymm12 + vmovdqu -16+224-128(%rsi),%ymm0 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq %ymm10,%ymm13,%ymm13 + vmovdqu -16+256-128(%rsi),%ymm12 + vpaddq %ymm13,%ymm6,%ymm6 + vpmuludq %ymm10,%ymm0,%ymm0 + vmovdqu -16+288-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm7,%ymm7 + vpmuludq %ymm10,%ymm12,%ymm12 + vpaddq %ymm12,%ymm8,%ymm8 
+ vpmuludq %ymm10,%ymm13,%ymm13 + vpbroadcastq 24(%r13),%ymm10 + vpaddq %ymm13,%ymm9,%ymm9 + + vmovdqu -16+32-128(%rcx),%ymm0 + movq %rax,%rdx + imulq -128(%rcx),%rax + addq %rax,%r11 + vmovdqu -16+64-128(%rcx),%ymm12 + imulq 8-128(%rcx),%rdx + addq %rdx,%r12 + shrq $29,%r11 + + vpmuludq %ymm11,%ymm0,%ymm0 + vmovq %xmm10,%rbx + vmovdqu -16+96-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm1,%ymm1 + vpmuludq %ymm11,%ymm12,%ymm12 + vmovdqu -16+128-128(%rcx),%ymm0 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovdqu -16+160-128(%rcx),%ymm12 + vpaddq %ymm13,%ymm3,%ymm3 + vpmuludq %ymm11,%ymm0,%ymm0 + vmovdqu -16+192-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm4,%ymm4 + vpmuludq %ymm11,%ymm12,%ymm12 + vmovdqu -16+224-128(%rcx),%ymm0 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovdqu -16+256-128(%rcx),%ymm12 + vpaddq %ymm13,%ymm6,%ymm6 + vpmuludq %ymm11,%ymm0,%ymm0 + vmovdqu -16+288-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm7,%ymm7 + vpmuludq %ymm11,%ymm12,%ymm12 + vmovdqu -24+32-128(%rsi),%ymm0 + vpaddq %ymm12,%ymm8,%ymm8 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovdqu -24+64-128(%rsi),%ymm12 + vpaddq %ymm13,%ymm9,%ymm9 + + addq %r11,%r12 + imulq -128(%rsi),%rbx + addq %rbx,%r12 + + movq %r12,%rax + imull %r8d,%eax + andl $0x1fffffff,%eax + + vpmuludq %ymm10,%ymm0,%ymm0 + vmovd %eax,%xmm11 + vmovdqu -24+96-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm1,%ymm1 + vpmuludq %ymm10,%ymm12,%ymm12 + vpbroadcastq %xmm11,%ymm11 + vmovdqu -24+128-128(%rsi),%ymm0 + vpaddq %ymm12,%ymm2,%ymm2 + vpmuludq %ymm10,%ymm13,%ymm13 + vmovdqu -24+160-128(%rsi),%ymm12 + vpaddq %ymm13,%ymm3,%ymm3 + vpmuludq %ymm10,%ymm0,%ymm0 + vmovdqu -24+192-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm4,%ymm4 + vpmuludq %ymm10,%ymm12,%ymm12 + vmovdqu -24+224-128(%rsi),%ymm0 + vpaddq %ymm12,%ymm5,%ymm5 + vpmuludq %ymm10,%ymm13,%ymm13 + vmovdqu -24+256-128(%rsi),%ymm12 + vpaddq %ymm13,%ymm6,%ymm6 + vpmuludq %ymm10,%ymm0,%ymm0 + vmovdqu -24+288-128(%rsi),%ymm13 + vpaddq %ymm0,%ymm7,%ymm7 + vpmuludq %ymm10,%ymm12,%ymm12 + 
vpaddq %ymm12,%ymm8,%ymm8 + vpmuludq %ymm10,%ymm13,%ymm13 + vpbroadcastq 32(%r13),%ymm10 + vpaddq %ymm13,%ymm9,%ymm9 + addq $32,%r13 + + vmovdqu -24+32-128(%rcx),%ymm0 + imulq -128(%rcx),%rax + addq %rax,%r12 + shrq $29,%r12 + + vmovdqu -24+64-128(%rcx),%ymm12 + vpmuludq %ymm11,%ymm0,%ymm0 + vmovq %xmm10,%rbx + vmovdqu -24+96-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm1,%ymm0 + vpmuludq %ymm11,%ymm12,%ymm12 + vmovdqu %ymm0,(%rsp) + vpaddq %ymm12,%ymm2,%ymm1 + vmovdqu -24+128-128(%rcx),%ymm0 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovdqu -24+160-128(%rcx),%ymm12 + vpaddq %ymm13,%ymm3,%ymm2 + vpmuludq %ymm11,%ymm0,%ymm0 + vmovdqu -24+192-128(%rcx),%ymm13 + vpaddq %ymm0,%ymm4,%ymm3 + vpmuludq %ymm11,%ymm12,%ymm12 + vmovdqu -24+224-128(%rcx),%ymm0 + vpaddq %ymm12,%ymm5,%ymm4 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovdqu -24+256-128(%rcx),%ymm12 + vpaddq %ymm13,%ymm6,%ymm5 + vpmuludq %ymm11,%ymm0,%ymm0 + vmovdqu -24+288-128(%rcx),%ymm13 + movq %r12,%r9 + vpaddq %ymm0,%ymm7,%ymm6 + vpmuludq %ymm11,%ymm12,%ymm12 + addq (%rsp),%r9 + vpaddq %ymm12,%ymm8,%ymm7 + vpmuludq %ymm11,%ymm13,%ymm13 + vmovq %r12,%xmm12 + vpaddq %ymm13,%ymm9,%ymm8 + + decl %r14d + jnz .Loop_mul_1024 + vpaddq (%rsp),%ymm12,%ymm0 + + vpsrlq $29,%ymm0,%ymm12 + vpand %ymm15,%ymm0,%ymm0 + vpsrlq $29,%ymm1,%ymm13 + vpand %ymm15,%ymm1,%ymm1 + vpsrlq $29,%ymm2,%ymm10 + vpermq $0x93,%ymm12,%ymm12 + vpand %ymm15,%ymm2,%ymm2 + vpsrlq $29,%ymm3,%ymm11 + vpermq $0x93,%ymm13,%ymm13 + vpand %ymm15,%ymm3,%ymm3 + + vpblendd $3,%ymm14,%ymm12,%ymm9 + vpermq $0x93,%ymm10,%ymm10 + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpermq $0x93,%ymm11,%ymm11 + vpaddq %ymm9,%ymm0,%ymm0 + vpblendd $3,%ymm13,%ymm10,%ymm13 + vpaddq %ymm12,%ymm1,%ymm1 + vpblendd $3,%ymm10,%ymm11,%ymm10 + vpaddq %ymm13,%ymm2,%ymm2 + vpblendd $3,%ymm11,%ymm14,%ymm11 + vpaddq %ymm10,%ymm3,%ymm3 + vpaddq %ymm11,%ymm4,%ymm4 + + vpsrlq $29,%ymm0,%ymm12 + vpand %ymm15,%ymm0,%ymm0 + vpsrlq $29,%ymm1,%ymm13 + vpand %ymm15,%ymm1,%ymm1 + vpsrlq $29,%ymm2,%ymm10 + vpermq 
$0x93,%ymm12,%ymm12 + vpand %ymm15,%ymm2,%ymm2 + vpsrlq $29,%ymm3,%ymm11 + vpermq $0x93,%ymm13,%ymm13 + vpand %ymm15,%ymm3,%ymm3 + vpermq $0x93,%ymm10,%ymm10 + + vpblendd $3,%ymm14,%ymm12,%ymm9 + vpermq $0x93,%ymm11,%ymm11 + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpaddq %ymm9,%ymm0,%ymm0 + vpblendd $3,%ymm13,%ymm10,%ymm13 + vpaddq %ymm12,%ymm1,%ymm1 + vpblendd $3,%ymm10,%ymm11,%ymm10 + vpaddq %ymm13,%ymm2,%ymm2 + vpblendd $3,%ymm11,%ymm14,%ymm11 + vpaddq %ymm10,%ymm3,%ymm3 + vpaddq %ymm11,%ymm4,%ymm4 + + vmovdqu %ymm0,0-128(%rdi) + vmovdqu %ymm1,32-128(%rdi) + vmovdqu %ymm2,64-128(%rdi) + vmovdqu %ymm3,96-128(%rdi) + vpsrlq $29,%ymm4,%ymm12 + vpand %ymm15,%ymm4,%ymm4 + vpsrlq $29,%ymm5,%ymm13 + vpand %ymm15,%ymm5,%ymm5 + vpsrlq $29,%ymm6,%ymm10 + vpermq $0x93,%ymm12,%ymm12 + vpand %ymm15,%ymm6,%ymm6 + vpsrlq $29,%ymm7,%ymm11 + vpermq $0x93,%ymm13,%ymm13 + vpand %ymm15,%ymm7,%ymm7 + vpsrlq $29,%ymm8,%ymm0 + vpermq $0x93,%ymm10,%ymm10 + vpand %ymm15,%ymm8,%ymm8 + vpermq $0x93,%ymm11,%ymm11 + + vpblendd $3,%ymm14,%ymm12,%ymm9 + vpermq $0x93,%ymm0,%ymm0 + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpaddq %ymm9,%ymm4,%ymm4 + vpblendd $3,%ymm13,%ymm10,%ymm13 + vpaddq %ymm12,%ymm5,%ymm5 + vpblendd $3,%ymm10,%ymm11,%ymm10 + vpaddq %ymm13,%ymm6,%ymm6 + vpblendd $3,%ymm11,%ymm0,%ymm11 + vpaddq %ymm10,%ymm7,%ymm7 + vpaddq %ymm11,%ymm8,%ymm8 + + vpsrlq $29,%ymm4,%ymm12 + vpand %ymm15,%ymm4,%ymm4 + vpsrlq $29,%ymm5,%ymm13 + vpand %ymm15,%ymm5,%ymm5 + vpsrlq $29,%ymm6,%ymm10 + vpermq $0x93,%ymm12,%ymm12 + vpand %ymm15,%ymm6,%ymm6 + vpsrlq $29,%ymm7,%ymm11 + vpermq $0x93,%ymm13,%ymm13 + vpand %ymm15,%ymm7,%ymm7 + vpsrlq $29,%ymm8,%ymm0 + vpermq $0x93,%ymm10,%ymm10 + vpand %ymm15,%ymm8,%ymm8 + vpermq $0x93,%ymm11,%ymm11 + + vpblendd $3,%ymm14,%ymm12,%ymm9 + vpermq $0x93,%ymm0,%ymm0 + vpblendd $3,%ymm12,%ymm13,%ymm12 + vpaddq %ymm9,%ymm4,%ymm4 + vpblendd $3,%ymm13,%ymm10,%ymm13 + vpaddq %ymm12,%ymm5,%ymm5 + vpblendd $3,%ymm10,%ymm11,%ymm10 + vpaddq %ymm13,%ymm6,%ymm6 + vpblendd 
$3,%ymm11,%ymm0,%ymm11 + vpaddq %ymm10,%ymm7,%ymm7 + vpaddq %ymm11,%ymm8,%ymm8 + + vmovdqu %ymm4,128-128(%rdi) + vmovdqu %ymm5,160-128(%rdi) + vmovdqu %ymm6,192-128(%rdi) + vmovdqu %ymm7,224-128(%rdi) + vmovdqu %ymm8,256-128(%rdi) + vzeroupper + + movq %rbp,%rax +.cfi_def_cfa_register %rax + movq -48(%rax),%r15 +.cfi_restore %r15 + movq -40(%rax),%r14 +.cfi_restore %r14 + movq -32(%rax),%r13 +.cfi_restore %r13 + movq -24(%rax),%r12 +.cfi_restore %r12 + movq -16(%rax),%rbp +.cfi_restore %rbp + movq -8(%rax),%rbx +.cfi_restore %rbx + leaq (%rax),%rsp +.cfi_def_cfa_register %rsp +.Lmul_1024_epilogue: + ret +.cfi_endproc +.size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2 +.globl rsaz_1024_red2norm_avx2 +.hidden rsaz_1024_red2norm_avx2 +.type rsaz_1024_red2norm_avx2,@function +.align 32 +rsaz_1024_red2norm_avx2: +.cfi_startproc +_CET_ENDBR + subq $-128,%rsi + xorq %rax,%rax + movq -128(%rsi),%r8 + movq -120(%rsi),%r9 + movq -112(%rsi),%r10 + shlq $0,%r8 + shlq $29,%r9 + movq %r10,%r11 + shlq $58,%r10 + shrq $6,%r11 + addq %r8,%rax + addq %r9,%rax + addq %r10,%rax + adcq $0,%r11 + movq %rax,0(%rdi) + movq %r11,%rax + movq -104(%rsi),%r8 + movq -96(%rsi),%r9 + shlq $23,%r8 + movq %r9,%r10 + shlq $52,%r9 + shrq $12,%r10 + addq %r8,%rax + addq %r9,%rax + adcq $0,%r10 + movq %rax,8(%rdi) + movq %r10,%rax + movq -88(%rsi),%r11 + movq -80(%rsi),%r8 + shlq $17,%r11 + movq %r8,%r9 + shlq $46,%r8 + shrq $18,%r9 + addq %r11,%rax + addq %r8,%rax + adcq $0,%r9 + movq %rax,16(%rdi) + movq %r9,%rax + movq -72(%rsi),%r10 + movq -64(%rsi),%r11 + shlq $11,%r10 + movq %r11,%r8 + shlq $40,%r11 + shrq $24,%r8 + addq %r10,%rax + addq %r11,%rax + adcq $0,%r8 + movq %rax,24(%rdi) + movq %r8,%rax + movq -56(%rsi),%r9 + movq -48(%rsi),%r10 + movq -40(%rsi),%r11 + shlq $5,%r9 + shlq $34,%r10 + movq %r11,%r8 + shlq $63,%r11 + shrq $1,%r8 + addq %r9,%rax + addq %r10,%rax + addq %r11,%rax + adcq $0,%r8 + movq %rax,32(%rdi) + movq %r8,%rax + movq -32(%rsi),%r9 + movq -24(%rsi),%r10 + shlq $28,%r9 + movq 
%r10,%r11 + shlq $57,%r10 + shrq $7,%r11 + addq %r9,%rax + addq %r10,%rax + adcq $0,%r11 + movq %rax,40(%rdi) + movq %r11,%rax + movq -16(%rsi),%r8 + movq -8(%rsi),%r9 + shlq $22,%r8 + movq %r9,%r10 + shlq $51,%r9 + shrq $13,%r10 + addq %r8,%rax + addq %r9,%rax + adcq $0,%r10 + movq %rax,48(%rdi) + movq %r10,%rax + movq 0(%rsi),%r11 + movq 8(%rsi),%r8 + shlq $16,%r11 + movq %r8,%r9 + shlq $45,%r8 + shrq $19,%r9 + addq %r11,%rax + addq %r8,%rax + adcq $0,%r9 + movq %rax,56(%rdi) + movq %r9,%rax + movq 16(%rsi),%r10 + movq 24(%rsi),%r11 + shlq $10,%r10 + movq %r11,%r8 + shlq $39,%r11 + shrq $25,%r8 + addq %r10,%rax + addq %r11,%rax + adcq $0,%r8 + movq %rax,64(%rdi) + movq %r8,%rax + movq 32(%rsi),%r9 + movq 40(%rsi),%r10 + movq 48(%rsi),%r11 + shlq $4,%r9 + shlq $33,%r10 + movq %r11,%r8 + shlq $62,%r11 + shrq $2,%r8 + addq %r9,%rax + addq %r10,%rax + addq %r11,%rax + adcq $0,%r8 + movq %rax,72(%rdi) + movq %r8,%rax + movq 56(%rsi),%r9 + movq 64(%rsi),%r10 + shlq $27,%r9 + movq %r10,%r11 + shlq $56,%r10 + shrq $8,%r11 + addq %r9,%rax + addq %r10,%rax + adcq $0,%r11 + movq %rax,80(%rdi) + movq %r11,%rax + movq 72(%rsi),%r8 + movq 80(%rsi),%r9 + shlq $21,%r8 + movq %r9,%r10 + shlq $50,%r9 + shrq $14,%r10 + addq %r8,%rax + addq %r9,%rax + adcq $0,%r10 + movq %rax,88(%rdi) + movq %r10,%rax + movq 88(%rsi),%r11 + movq 96(%rsi),%r8 + shlq $15,%r11 + movq %r8,%r9 + shlq $44,%r8 + shrq $20,%r9 + addq %r11,%rax + addq %r8,%rax + adcq $0,%r9 + movq %rax,96(%rdi) + movq %r9,%rax + movq 104(%rsi),%r10 + movq 112(%rsi),%r11 + shlq $9,%r10 + movq %r11,%r8 + shlq $38,%r11 + shrq $26,%r8 + addq %r10,%rax + addq %r11,%rax + adcq $0,%r8 + movq %rax,104(%rdi) + movq %r8,%rax + movq 120(%rsi),%r9 + movq 128(%rsi),%r10 + movq 136(%rsi),%r11 + shlq $3,%r9 + shlq $32,%r10 + movq %r11,%r8 + shlq $61,%r11 + shrq $3,%r8 + addq %r9,%rax + addq %r10,%rax + addq %r11,%rax + adcq $0,%r8 + movq %rax,112(%rdi) + movq %r8,%rax + movq 144(%rsi),%r9 + movq 152(%rsi),%r10 + shlq $26,%r9 + movq 
%r10,%r11 + shlq $55,%r10 + shrq $9,%r11 + addq %r9,%rax + addq %r10,%rax + adcq $0,%r11 + movq %rax,120(%rdi) + movq %r11,%rax + ret +.cfi_endproc +.size rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2 + +.globl rsaz_1024_norm2red_avx2 +.hidden rsaz_1024_norm2red_avx2 +.type rsaz_1024_norm2red_avx2,@function +.align 32 +rsaz_1024_norm2red_avx2: +.cfi_startproc +_CET_ENDBR + subq $-128,%rdi + movq (%rsi),%r8 + movl $0x1fffffff,%eax + movq 8(%rsi),%r9 + movq %r8,%r11 + shrq $0,%r11 + andq %rax,%r11 + movq %r11,-128(%rdi) + movq %r8,%r10 + shrq $29,%r10 + andq %rax,%r10 + movq %r10,-120(%rdi) + shrdq $58,%r9,%r8 + andq %rax,%r8 + movq %r8,-112(%rdi) + movq 16(%rsi),%r10 + movq %r9,%r8 + shrq $23,%r8 + andq %rax,%r8 + movq %r8,-104(%rdi) + shrdq $52,%r10,%r9 + andq %rax,%r9 + movq %r9,-96(%rdi) + movq 24(%rsi),%r11 + movq %r10,%r9 + shrq $17,%r9 + andq %rax,%r9 + movq %r9,-88(%rdi) + shrdq $46,%r11,%r10 + andq %rax,%r10 + movq %r10,-80(%rdi) + movq 32(%rsi),%r8 + movq %r11,%r10 + shrq $11,%r10 + andq %rax,%r10 + movq %r10,-72(%rdi) + shrdq $40,%r8,%r11 + andq %rax,%r11 + movq %r11,-64(%rdi) + movq 40(%rsi),%r9 + movq %r8,%r11 + shrq $5,%r11 + andq %rax,%r11 + movq %r11,-56(%rdi) + movq %r8,%r10 + shrq $34,%r10 + andq %rax,%r10 + movq %r10,-48(%rdi) + shrdq $63,%r9,%r8 + andq %rax,%r8 + movq %r8,-40(%rdi) + movq 48(%rsi),%r10 + movq %r9,%r8 + shrq $28,%r8 + andq %rax,%r8 + movq %r8,-32(%rdi) + shrdq $57,%r10,%r9 + andq %rax,%r9 + movq %r9,-24(%rdi) + movq 56(%rsi),%r11 + movq %r10,%r9 + shrq $22,%r9 + andq %rax,%r9 + movq %r9,-16(%rdi) + shrdq $51,%r11,%r10 + andq %rax,%r10 + movq %r10,-8(%rdi) + movq 64(%rsi),%r8 + movq %r11,%r10 + shrq $16,%r10 + andq %rax,%r10 + movq %r10,0(%rdi) + shrdq $45,%r8,%r11 + andq %rax,%r11 + movq %r11,8(%rdi) + movq 72(%rsi),%r9 + movq %r8,%r11 + shrq $10,%r11 + andq %rax,%r11 + movq %r11,16(%rdi) + shrdq $39,%r9,%r8 + andq %rax,%r8 + movq %r8,24(%rdi) + movq 80(%rsi),%r10 + movq %r9,%r8 + shrq $4,%r8 + andq %rax,%r8 + movq %r8,32(%rdi) 
+ movq %r9,%r11 + shrq $33,%r11 + andq %rax,%r11 + movq %r11,40(%rdi) + shrdq $62,%r10,%r9 + andq %rax,%r9 + movq %r9,48(%rdi) + movq 88(%rsi),%r11 + movq %r10,%r9 + shrq $27,%r9 + andq %rax,%r9 + movq %r9,56(%rdi) + shrdq $56,%r11,%r10 + andq %rax,%r10 + movq %r10,64(%rdi) + movq 96(%rsi),%r8 + movq %r11,%r10 + shrq $21,%r10 + andq %rax,%r10 + movq %r10,72(%rdi) + shrdq $50,%r8,%r11 + andq %rax,%r11 + movq %r11,80(%rdi) + movq 104(%rsi),%r9 + movq %r8,%r11 + shrq $15,%r11 + andq %rax,%r11 + movq %r11,88(%rdi) + shrdq $44,%r9,%r8 + andq %rax,%r8 + movq %r8,96(%rdi) + movq 112(%rsi),%r10 + movq %r9,%r8 + shrq $9,%r8 + andq %rax,%r8 + movq %r8,104(%rdi) + shrdq $38,%r10,%r9 + andq %rax,%r9 + movq %r9,112(%rdi) + movq 120(%rsi),%r11 + movq %r10,%r9 + shrq $3,%r9 + andq %rax,%r9 + movq %r9,120(%rdi) + movq %r10,%r8 + shrq $32,%r8 + andq %rax,%r8 + movq %r8,128(%rdi) + shrdq $61,%r11,%r10 + andq %rax,%r10 + movq %r10,136(%rdi) + xorq %r8,%r8 + movq %r11,%r10 + shrq $26,%r10 + andq %rax,%r10 + movq %r10,144(%rdi) + shrdq $55,%r8,%r11 + andq %rax,%r11 + movq %r11,152(%rdi) + movq %r8,160(%rdi) + movq %r8,168(%rdi) + movq %r8,176(%rdi) + movq %r8,184(%rdi) + ret +.cfi_endproc +.size rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2 +.globl rsaz_1024_scatter5_avx2 +.hidden rsaz_1024_scatter5_avx2 +.type rsaz_1024_scatter5_avx2,@function +.align 32 +rsaz_1024_scatter5_avx2: +.cfi_startproc +_CET_ENDBR + vzeroupper + vmovdqu .Lscatter_permd(%rip),%ymm5 + shll $4,%edx + leaq (%rdi,%rdx,1),%rdi + movl $9,%eax + jmp .Loop_scatter_1024 + +.align 32 +.Loop_scatter_1024: + vmovdqu (%rsi),%ymm0 + leaq 32(%rsi),%rsi + vpermd %ymm0,%ymm5,%ymm0 + vmovdqu %xmm0,(%rdi) + leaq 512(%rdi),%rdi + decl %eax + jnz .Loop_scatter_1024 + + vzeroupper + ret +.cfi_endproc +.size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2 + +.globl rsaz_1024_gather5_avx2 +.hidden rsaz_1024_gather5_avx2 +.type rsaz_1024_gather5_avx2,@function +.align 32 +rsaz_1024_gather5_avx2: +.cfi_startproc +_CET_ENDBR + 
vzeroupper + movq %rsp,%r11 +.cfi_def_cfa_register %r11 + leaq -256(%rsp),%rsp + andq $-32,%rsp + leaq .Linc(%rip),%r10 + leaq -128(%rsp),%rax + + vmovd %edx,%xmm4 + vmovdqa (%r10),%ymm0 + vmovdqa 32(%r10),%ymm1 + vmovdqa 64(%r10),%ymm5 + vpbroadcastd %xmm4,%ymm4 + + vpaddd %ymm5,%ymm0,%ymm2 + vpcmpeqd %ymm4,%ymm0,%ymm0 + vpaddd %ymm5,%ymm1,%ymm3 + vpcmpeqd %ymm4,%ymm1,%ymm1 + vmovdqa %ymm0,0+128(%rax) + vpaddd %ymm5,%ymm2,%ymm0 + vpcmpeqd %ymm4,%ymm2,%ymm2 + vmovdqa %ymm1,32+128(%rax) + vpaddd %ymm5,%ymm3,%ymm1 + vpcmpeqd %ymm4,%ymm3,%ymm3 + vmovdqa %ymm2,64+128(%rax) + vpaddd %ymm5,%ymm0,%ymm2 + vpcmpeqd %ymm4,%ymm0,%ymm0 + vmovdqa %ymm3,96+128(%rax) + vpaddd %ymm5,%ymm1,%ymm3 + vpcmpeqd %ymm4,%ymm1,%ymm1 + vmovdqa %ymm0,128+128(%rax) + vpaddd %ymm5,%ymm2,%ymm8 + vpcmpeqd %ymm4,%ymm2,%ymm2 + vmovdqa %ymm1,160+128(%rax) + vpaddd %ymm5,%ymm3,%ymm9 + vpcmpeqd %ymm4,%ymm3,%ymm3 + vmovdqa %ymm2,192+128(%rax) + vpaddd %ymm5,%ymm8,%ymm10 + vpcmpeqd %ymm4,%ymm8,%ymm8 + vmovdqa %ymm3,224+128(%rax) + vpaddd %ymm5,%ymm9,%ymm11 + vpcmpeqd %ymm4,%ymm9,%ymm9 + vpaddd %ymm5,%ymm10,%ymm12 + vpcmpeqd %ymm4,%ymm10,%ymm10 + vpaddd %ymm5,%ymm11,%ymm13 + vpcmpeqd %ymm4,%ymm11,%ymm11 + vpaddd %ymm5,%ymm12,%ymm14 + vpcmpeqd %ymm4,%ymm12,%ymm12 + vpaddd %ymm5,%ymm13,%ymm15 + vpcmpeqd %ymm4,%ymm13,%ymm13 + vpcmpeqd %ymm4,%ymm14,%ymm14 + vpcmpeqd %ymm4,%ymm15,%ymm15 + + vmovdqa -32(%r10),%ymm7 + leaq 128(%rsi),%rsi + movl $9,%edx + +.Loop_gather_1024: + vmovdqa 0-128(%rsi),%ymm0 + vmovdqa 32-128(%rsi),%ymm1 + vmovdqa 64-128(%rsi),%ymm2 + vmovdqa 96-128(%rsi),%ymm3 + vpand 0+128(%rax),%ymm0,%ymm0 + vpand 32+128(%rax),%ymm1,%ymm1 + vpand 64+128(%rax),%ymm2,%ymm2 + vpor %ymm0,%ymm1,%ymm4 + vpand 96+128(%rax),%ymm3,%ymm3 + vmovdqa 128-128(%rsi),%ymm0 + vmovdqa 160-128(%rsi),%ymm1 + vpor %ymm2,%ymm3,%ymm5 + vmovdqa 192-128(%rsi),%ymm2 + vmovdqa 224-128(%rsi),%ymm3 + vpand 128+128(%rax),%ymm0,%ymm0 + vpand 160+128(%rax),%ymm1,%ymm1 + vpand 192+128(%rax),%ymm2,%ymm2 + vpor %ymm0,%ymm4,%ymm4 + 
vpand 224+128(%rax),%ymm3,%ymm3 + vpand 256-128(%rsi),%ymm8,%ymm0 + vpor %ymm1,%ymm5,%ymm5 + vpand 288-128(%rsi),%ymm9,%ymm1 + vpor %ymm2,%ymm4,%ymm4 + vpand 320-128(%rsi),%ymm10,%ymm2 + vpor %ymm3,%ymm5,%ymm5 + vpand 352-128(%rsi),%ymm11,%ymm3 + vpor %ymm0,%ymm4,%ymm4 + vpand 384-128(%rsi),%ymm12,%ymm0 + vpor %ymm1,%ymm5,%ymm5 + vpand 416-128(%rsi),%ymm13,%ymm1 + vpor %ymm2,%ymm4,%ymm4 + vpand 448-128(%rsi),%ymm14,%ymm2 + vpor %ymm3,%ymm5,%ymm5 + vpand 480-128(%rsi),%ymm15,%ymm3 + leaq 512(%rsi),%rsi + vpor %ymm0,%ymm4,%ymm4 + vpor %ymm1,%ymm5,%ymm5 + vpor %ymm2,%ymm4,%ymm4 + vpor %ymm3,%ymm5,%ymm5 + + vpor %ymm5,%ymm4,%ymm4 + vextracti128 $1,%ymm4,%xmm5 + vpor %xmm4,%xmm5,%xmm5 + vpermd %ymm5,%ymm7,%ymm5 + vmovdqu %ymm5,(%rdi) + leaq 32(%rdi),%rdi + decl %edx + jnz .Loop_gather_1024 + + vpxor %ymm0,%ymm0,%ymm0 + vmovdqu %ymm0,(%rdi) + vzeroupper + leaq (%r11),%rsp +.cfi_def_cfa_register %rsp + ret +.cfi_endproc +.LSEH_end_rsaz_1024_gather5: +.size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2 +.section .rodata +.align 64 +.Land_mask: +.quad 0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff +.Lscatter_permd: +.long 0,2,4,6,7,7,7,7 +.Lgather_permd: +.long 0,7,1,7,2,7,3,7 +.Linc: +.long 0,0,0,0, 1,1,1,1 +.long 2,2,2,2, 3,3,3,3 +.long 4,4,4,4, 4,4,4,4 +.align 64 +.text +#endif diff --git a/third_party/boringssl/gen/bcm/rsaz-avx2-win.asm b/third_party/boringssl/gen/bcm/rsaz-avx2-win.asm new file mode 100644 index 00000000..a518277c --- /dev/null +++ b/third_party/boringssl/gen/bcm/rsaz-avx2-win.asm @@ -0,0 +1,1987 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_internal_x86_64_win_asm.inc" +%endif +section .text code align=64 + + +global rsaz_1024_sqr_avx2 + +ALIGN 64 +rsaz_1024_sqr_avx2: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_rsaz_1024_sqr_avx2: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + +_CET_ENDBR + lea rax,[rsp] + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + vzeroupper + lea rsp,[((-168))+rsp] + vmovaps XMMWORD[(-216)+rax],xmm6 + vmovaps XMMWORD[(-200)+rax],xmm7 + vmovaps XMMWORD[(-184)+rax],xmm8 + vmovaps XMMWORD[(-168)+rax],xmm9 + vmovaps XMMWORD[(-152)+rax],xmm10 + vmovaps XMMWORD[(-136)+rax],xmm11 + vmovaps XMMWORD[(-120)+rax],xmm12 + vmovaps XMMWORD[(-104)+rax],xmm13 + vmovaps XMMWORD[(-88)+rax],xmm14 + vmovaps XMMWORD[(-72)+rax],xmm15 +$L$sqr_1024_body: + mov rbp,rax + + mov r13,rdx + sub rsp,832 + mov r15,r13 + sub rdi,-128 + sub rsi,-128 + sub r13,-128 + + and r15,4095 + add r15,32*10 + shr r15,12 + vpxor ymm9,ymm9,ymm9 + jz NEAR $L$sqr_1024_no_n_copy + + + + + + sub rsp,32*10 + vmovdqu ymm0,YMMWORD[((0-128))+r13] + and rsp,-2048 + vmovdqu ymm1,YMMWORD[((32-128))+r13] + vmovdqu ymm2,YMMWORD[((64-128))+r13] + vmovdqu ymm3,YMMWORD[((96-128))+r13] + vmovdqu ymm4,YMMWORD[((128-128))+r13] + vmovdqu ymm5,YMMWORD[((160-128))+r13] + vmovdqu ymm6,YMMWORD[((192-128))+r13] + vmovdqu ymm7,YMMWORD[((224-128))+r13] + vmovdqu ymm8,YMMWORD[((256-128))+r13] + lea r13,[((832+128))+rsp] + vmovdqu YMMWORD[(0-128)+r13],ymm0 + vmovdqu YMMWORD[(32-128)+r13],ymm1 + vmovdqu YMMWORD[(64-128)+r13],ymm2 + vmovdqu YMMWORD[(96-128)+r13],ymm3 + vmovdqu YMMWORD[(128-128)+r13],ymm4 + vmovdqu YMMWORD[(160-128)+r13],ymm5 + vmovdqu YMMWORD[(192-128)+r13],ymm6 + vmovdqu YMMWORD[(224-128)+r13],ymm7 + vmovdqu YMMWORD[(256-128)+r13],ymm8 + 
vmovdqu YMMWORD[(288-128)+r13],ymm9 + +$L$sqr_1024_no_n_copy: + and rsp,-1024 + + vmovdqu ymm1,YMMWORD[((32-128))+rsi] + vmovdqu ymm2,YMMWORD[((64-128))+rsi] + vmovdqu ymm3,YMMWORD[((96-128))+rsi] + vmovdqu ymm4,YMMWORD[((128-128))+rsi] + vmovdqu ymm5,YMMWORD[((160-128))+rsi] + vmovdqu ymm6,YMMWORD[((192-128))+rsi] + vmovdqu ymm7,YMMWORD[((224-128))+rsi] + vmovdqu ymm8,YMMWORD[((256-128))+rsi] + + lea rbx,[192+rsp] + vmovdqu ymm15,YMMWORD[$L$and_mask] + jmp NEAR $L$OOP_GRANDE_SQR_1024 + +ALIGN 32 +$L$OOP_GRANDE_SQR_1024: + lea r9,[((576+128))+rsp] + lea r12,[448+rsp] + + + + + vpaddq ymm1,ymm1,ymm1 + vpbroadcastq ymm10,QWORD[((0-128))+rsi] + vpaddq ymm2,ymm2,ymm2 + vmovdqa YMMWORD[(0-128)+r9],ymm1 + vpaddq ymm3,ymm3,ymm3 + vmovdqa YMMWORD[(32-128)+r9],ymm2 + vpaddq ymm4,ymm4,ymm4 + vmovdqa YMMWORD[(64-128)+r9],ymm3 + vpaddq ymm5,ymm5,ymm5 + vmovdqa YMMWORD[(96-128)+r9],ymm4 + vpaddq ymm6,ymm6,ymm6 + vmovdqa YMMWORD[(128-128)+r9],ymm5 + vpaddq ymm7,ymm7,ymm7 + vmovdqa YMMWORD[(160-128)+r9],ymm6 + vpaddq ymm8,ymm8,ymm8 + vmovdqa YMMWORD[(192-128)+r9],ymm7 + vpxor ymm9,ymm9,ymm9 + vmovdqa YMMWORD[(224-128)+r9],ymm8 + + vpmuludq ymm0,ymm10,YMMWORD[((0-128))+rsi] + vpbroadcastq ymm11,QWORD[((32-128))+rsi] + vmovdqu YMMWORD[(288-192)+rbx],ymm9 + vpmuludq ymm1,ymm1,ymm10 + vmovdqu YMMWORD[(320-448)+r12],ymm9 + vpmuludq ymm2,ymm2,ymm10 + vmovdqu YMMWORD[(352-448)+r12],ymm9 + vpmuludq ymm3,ymm3,ymm10 + vmovdqu YMMWORD[(384-448)+r12],ymm9 + vpmuludq ymm4,ymm4,ymm10 + vmovdqu YMMWORD[(416-448)+r12],ymm9 + vpmuludq ymm5,ymm5,ymm10 + vmovdqu YMMWORD[(448-448)+r12],ymm9 + vpmuludq ymm6,ymm6,ymm10 + vmovdqu YMMWORD[(480-448)+r12],ymm9 + vpmuludq ymm7,ymm7,ymm10 + vmovdqu YMMWORD[(512-448)+r12],ymm9 + vpmuludq ymm8,ymm8,ymm10 + vpbroadcastq ymm10,QWORD[((64-128))+rsi] + vmovdqu YMMWORD[(544-448)+r12],ymm9 + + mov r15,rsi + mov r14d,4 + jmp NEAR $L$sqr_entry_1024 +ALIGN 32 +$L$OOP_SQR_1024: + vpbroadcastq ymm11,QWORD[((32-128))+r15] + vpmuludq ymm0,ymm10,YMMWORD[((0-128))+rsi] + 
vpaddq ymm0,ymm0,YMMWORD[((0-192))+rbx] + vpmuludq ymm1,ymm10,YMMWORD[((0-128))+r9] + vpaddq ymm1,ymm1,YMMWORD[((32-192))+rbx] + vpmuludq ymm2,ymm10,YMMWORD[((32-128))+r9] + vpaddq ymm2,ymm2,YMMWORD[((64-192))+rbx] + vpmuludq ymm3,ymm10,YMMWORD[((64-128))+r9] + vpaddq ymm3,ymm3,YMMWORD[((96-192))+rbx] + vpmuludq ymm4,ymm10,YMMWORD[((96-128))+r9] + vpaddq ymm4,ymm4,YMMWORD[((128-192))+rbx] + vpmuludq ymm5,ymm10,YMMWORD[((128-128))+r9] + vpaddq ymm5,ymm5,YMMWORD[((160-192))+rbx] + vpmuludq ymm6,ymm10,YMMWORD[((160-128))+r9] + vpaddq ymm6,ymm6,YMMWORD[((192-192))+rbx] + vpmuludq ymm7,ymm10,YMMWORD[((192-128))+r9] + vpaddq ymm7,ymm7,YMMWORD[((224-192))+rbx] + vpmuludq ymm8,ymm10,YMMWORD[((224-128))+r9] + vpbroadcastq ymm10,QWORD[((64-128))+r15] + vpaddq ymm8,ymm8,YMMWORD[((256-192))+rbx] +$L$sqr_entry_1024: + vmovdqu YMMWORD[(0-192)+rbx],ymm0 + vmovdqu YMMWORD[(32-192)+rbx],ymm1 + + vpmuludq ymm12,ymm11,YMMWORD[((32-128))+rsi] + vpaddq ymm2,ymm2,ymm12 + vpmuludq ymm14,ymm11,YMMWORD[((32-128))+r9] + vpaddq ymm3,ymm3,ymm14 + vpmuludq ymm13,ymm11,YMMWORD[((64-128))+r9] + vpaddq ymm4,ymm4,ymm13 + vpmuludq ymm12,ymm11,YMMWORD[((96-128))+r9] + vpaddq ymm5,ymm5,ymm12 + vpmuludq ymm14,ymm11,YMMWORD[((128-128))+r9] + vpaddq ymm6,ymm6,ymm14 + vpmuludq ymm13,ymm11,YMMWORD[((160-128))+r9] + vpaddq ymm7,ymm7,ymm13 + vpmuludq ymm12,ymm11,YMMWORD[((192-128))+r9] + vpaddq ymm8,ymm8,ymm12 + vpmuludq ymm0,ymm11,YMMWORD[((224-128))+r9] + vpbroadcastq ymm11,QWORD[((96-128))+r15] + vpaddq ymm0,ymm0,YMMWORD[((288-192))+rbx] + + vmovdqu YMMWORD[(64-192)+rbx],ymm2 + vmovdqu YMMWORD[(96-192)+rbx],ymm3 + + vpmuludq ymm13,ymm10,YMMWORD[((64-128))+rsi] + vpaddq ymm4,ymm4,ymm13 + vpmuludq ymm12,ymm10,YMMWORD[((64-128))+r9] + vpaddq ymm5,ymm5,ymm12 + vpmuludq ymm14,ymm10,YMMWORD[((96-128))+r9] + vpaddq ymm6,ymm6,ymm14 + vpmuludq ymm13,ymm10,YMMWORD[((128-128))+r9] + vpaddq ymm7,ymm7,ymm13 + vpmuludq ymm12,ymm10,YMMWORD[((160-128))+r9] + vpaddq ymm8,ymm8,ymm12 + vpmuludq 
ymm14,ymm10,YMMWORD[((192-128))+r9] + vpaddq ymm0,ymm0,ymm14 + vpmuludq ymm1,ymm10,YMMWORD[((224-128))+r9] + vpbroadcastq ymm10,QWORD[((128-128))+r15] + vpaddq ymm1,ymm1,YMMWORD[((320-448))+r12] + + vmovdqu YMMWORD[(128-192)+rbx],ymm4 + vmovdqu YMMWORD[(160-192)+rbx],ymm5 + + vpmuludq ymm12,ymm11,YMMWORD[((96-128))+rsi] + vpaddq ymm6,ymm6,ymm12 + vpmuludq ymm14,ymm11,YMMWORD[((96-128))+r9] + vpaddq ymm7,ymm7,ymm14 + vpmuludq ymm13,ymm11,YMMWORD[((128-128))+r9] + vpaddq ymm8,ymm8,ymm13 + vpmuludq ymm12,ymm11,YMMWORD[((160-128))+r9] + vpaddq ymm0,ymm0,ymm12 + vpmuludq ymm14,ymm11,YMMWORD[((192-128))+r9] + vpaddq ymm1,ymm1,ymm14 + vpmuludq ymm2,ymm11,YMMWORD[((224-128))+r9] + vpbroadcastq ymm11,QWORD[((160-128))+r15] + vpaddq ymm2,ymm2,YMMWORD[((352-448))+r12] + + vmovdqu YMMWORD[(192-192)+rbx],ymm6 + vmovdqu YMMWORD[(224-192)+rbx],ymm7 + + vpmuludq ymm12,ymm10,YMMWORD[((128-128))+rsi] + vpaddq ymm8,ymm8,ymm12 + vpmuludq ymm14,ymm10,YMMWORD[((128-128))+r9] + vpaddq ymm0,ymm0,ymm14 + vpmuludq ymm13,ymm10,YMMWORD[((160-128))+r9] + vpaddq ymm1,ymm1,ymm13 + vpmuludq ymm12,ymm10,YMMWORD[((192-128))+r9] + vpaddq ymm2,ymm2,ymm12 + vpmuludq ymm3,ymm10,YMMWORD[((224-128))+r9] + vpbroadcastq ymm10,QWORD[((192-128))+r15] + vpaddq ymm3,ymm3,YMMWORD[((384-448))+r12] + + vmovdqu YMMWORD[(256-192)+rbx],ymm8 + vmovdqu YMMWORD[(288-192)+rbx],ymm0 + lea rbx,[8+rbx] + + vpmuludq ymm13,ymm11,YMMWORD[((160-128))+rsi] + vpaddq ymm1,ymm1,ymm13 + vpmuludq ymm12,ymm11,YMMWORD[((160-128))+r9] + vpaddq ymm2,ymm2,ymm12 + vpmuludq ymm14,ymm11,YMMWORD[((192-128))+r9] + vpaddq ymm3,ymm3,ymm14 + vpmuludq ymm4,ymm11,YMMWORD[((224-128))+r9] + vpbroadcastq ymm11,QWORD[((224-128))+r15] + vpaddq ymm4,ymm4,YMMWORD[((416-448))+r12] + + vmovdqu YMMWORD[(320-448)+r12],ymm1 + vmovdqu YMMWORD[(352-448)+r12],ymm2 + + vpmuludq ymm12,ymm10,YMMWORD[((192-128))+rsi] + vpaddq ymm3,ymm3,ymm12 + vpmuludq ymm14,ymm10,YMMWORD[((192-128))+r9] + vpbroadcastq ymm0,QWORD[((256-128))+r15] + vpaddq ymm4,ymm4,ymm14 + vpmuludq 
ymm5,ymm10,YMMWORD[((224-128))+r9] + vpbroadcastq ymm10,QWORD[((0+8-128))+r15] + vpaddq ymm5,ymm5,YMMWORD[((448-448))+r12] + + vmovdqu YMMWORD[(384-448)+r12],ymm3 + vmovdqu YMMWORD[(416-448)+r12],ymm4 + lea r15,[8+r15] + + vpmuludq ymm12,ymm11,YMMWORD[((224-128))+rsi] + vpaddq ymm5,ymm5,ymm12 + vpmuludq ymm6,ymm11,YMMWORD[((224-128))+r9] + vpaddq ymm6,ymm6,YMMWORD[((480-448))+r12] + + vpmuludq ymm7,ymm0,YMMWORD[((256-128))+rsi] + vmovdqu YMMWORD[(448-448)+r12],ymm5 + vpaddq ymm7,ymm7,YMMWORD[((512-448))+r12] + vmovdqu YMMWORD[(480-448)+r12],ymm6 + vmovdqu YMMWORD[(512-448)+r12],ymm7 + lea r12,[8+r12] + + dec r14d + jnz NEAR $L$OOP_SQR_1024 + + vmovdqu ymm8,YMMWORD[256+rsp] + vmovdqu ymm1,YMMWORD[288+rsp] + vmovdqu ymm2,YMMWORD[320+rsp] + lea rbx,[192+rsp] + + vpsrlq ymm14,ymm8,29 + vpand ymm8,ymm8,ymm15 + vpsrlq ymm11,ymm1,29 + vpand ymm1,ymm1,ymm15 + + vpermq ymm14,ymm14,0x93 + vpxor ymm9,ymm9,ymm9 + vpermq ymm11,ymm11,0x93 + + vpblendd ymm10,ymm14,ymm9,3 + vpblendd ymm14,ymm11,ymm14,3 + vpaddq ymm8,ymm8,ymm10 + vpblendd ymm11,ymm9,ymm11,3 + vpaddq ymm1,ymm1,ymm14 + vpaddq ymm2,ymm2,ymm11 + vmovdqu YMMWORD[(288-192)+rbx],ymm1 + vmovdqu YMMWORD[(320-192)+rbx],ymm2 + + mov rax,QWORD[rsp] + mov r10,QWORD[8+rsp] + mov r11,QWORD[16+rsp] + mov r12,QWORD[24+rsp] + vmovdqu ymm1,YMMWORD[32+rsp] + vmovdqu ymm2,YMMWORD[((64-192))+rbx] + vmovdqu ymm3,YMMWORD[((96-192))+rbx] + vmovdqu ymm4,YMMWORD[((128-192))+rbx] + vmovdqu ymm5,YMMWORD[((160-192))+rbx] + vmovdqu ymm6,YMMWORD[((192-192))+rbx] + vmovdqu ymm7,YMMWORD[((224-192))+rbx] + + mov r9,rax + imul eax,ecx + and eax,0x1fffffff + vmovd xmm12,eax + + mov rdx,rax + imul rax,QWORD[((-128))+r13] + vpbroadcastq ymm12,xmm12 + add r9,rax + mov rax,rdx + imul rax,QWORD[((8-128))+r13] + shr r9,29 + add r10,rax + mov rax,rdx + imul rax,QWORD[((16-128))+r13] + add r10,r9 + add r11,rax + imul rdx,QWORD[((24-128))+r13] + add r12,rdx + + mov rax,r10 + imul eax,ecx + and eax,0x1fffffff + + mov r14d,9 + jmp NEAR $L$OOP_REDUCE_1024 + 
+ALIGN 32 +$L$OOP_REDUCE_1024: + vmovd xmm13,eax + vpbroadcastq ymm13,xmm13 + + vpmuludq ymm10,ymm12,YMMWORD[((32-128))+r13] + mov rdx,rax + imul rax,QWORD[((-128))+r13] + vpaddq ymm1,ymm1,ymm10 + add r10,rax + vpmuludq ymm14,ymm12,YMMWORD[((64-128))+r13] + mov rax,rdx + imul rax,QWORD[((8-128))+r13] + vpaddq ymm2,ymm2,ymm14 + vpmuludq ymm11,ymm12,YMMWORD[((96-128))+r13] + DB 0x67 + add r11,rax + DB 0x67 + mov rax,rdx + imul rax,QWORD[((16-128))+r13] + shr r10,29 + vpaddq ymm3,ymm3,ymm11 + vpmuludq ymm10,ymm12,YMMWORD[((128-128))+r13] + add r12,rax + add r11,r10 + vpaddq ymm4,ymm4,ymm10 + vpmuludq ymm14,ymm12,YMMWORD[((160-128))+r13] + mov rax,r11 + imul eax,ecx + vpaddq ymm5,ymm5,ymm14 + vpmuludq ymm11,ymm12,YMMWORD[((192-128))+r13] + and eax,0x1fffffff + vpaddq ymm6,ymm6,ymm11 + vpmuludq ymm10,ymm12,YMMWORD[((224-128))+r13] + vpaddq ymm7,ymm7,ymm10 + vpmuludq ymm14,ymm12,YMMWORD[((256-128))+r13] + vmovd xmm12,eax + + vpaddq ymm8,ymm8,ymm14 + + vpbroadcastq ymm12,xmm12 + + vpmuludq ymm11,ymm13,YMMWORD[((32-8-128))+r13] + vmovdqu ymm14,YMMWORD[((96-8-128))+r13] + mov rdx,rax + imul rax,QWORD[((-128))+r13] + vpaddq ymm1,ymm1,ymm11 + vpmuludq ymm10,ymm13,YMMWORD[((64-8-128))+r13] + vmovdqu ymm11,YMMWORD[((128-8-128))+r13] + add r11,rax + mov rax,rdx + imul rax,QWORD[((8-128))+r13] + vpaddq ymm2,ymm2,ymm10 + add rax,r12 + shr r11,29 + vpmuludq ymm14,ymm14,ymm13 + vmovdqu ymm10,YMMWORD[((160-8-128))+r13] + add rax,r11 + vpaddq ymm3,ymm3,ymm14 + vpmuludq ymm11,ymm11,ymm13 + vmovdqu ymm14,YMMWORD[((192-8-128))+r13] + DB 0x67 + mov r12,rax + imul eax,ecx + vpaddq ymm4,ymm4,ymm11 + vpmuludq ymm10,ymm10,ymm13 + DB 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00 + and eax,0x1fffffff + vpaddq ymm5,ymm5,ymm10 + vpmuludq ymm14,ymm14,ymm13 + vmovdqu ymm10,YMMWORD[((256-8-128))+r13] + vpaddq ymm6,ymm6,ymm14 + vpmuludq ymm11,ymm11,ymm13 + vmovdqu ymm9,YMMWORD[((288-8-128))+r13] + vmovd xmm0,eax + imul rax,QWORD[((-128))+r13] + vpaddq ymm7,ymm7,ymm11 + vpmuludq ymm10,ymm10,ymm13 + 
vmovdqu ymm14,YMMWORD[((32-16-128))+r13] + vpbroadcastq ymm0,xmm0 + vpaddq ymm8,ymm8,ymm10 + vpmuludq ymm9,ymm9,ymm13 + vmovdqu ymm11,YMMWORD[((64-16-128))+r13] + add r12,rax + + vmovdqu ymm13,YMMWORD[((32-24-128))+r13] + vpmuludq ymm14,ymm14,ymm12 + vmovdqu ymm10,YMMWORD[((96-16-128))+r13] + vpaddq ymm1,ymm1,ymm14 + vpmuludq ymm13,ymm13,ymm0 + vpmuludq ymm11,ymm11,ymm12 + DB 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff + vpaddq ymm13,ymm13,ymm1 + vpaddq ymm2,ymm2,ymm11 + vpmuludq ymm10,ymm10,ymm12 + vmovdqu ymm11,YMMWORD[((160-16-128))+r13] + DB 0x67 + vmovq rax,xmm13 + vmovdqu YMMWORD[rsp],ymm13 + vpaddq ymm3,ymm3,ymm10 + vpmuludq ymm14,ymm14,ymm12 + vmovdqu ymm10,YMMWORD[((192-16-128))+r13] + vpaddq ymm4,ymm4,ymm14 + vpmuludq ymm11,ymm11,ymm12 + vmovdqu ymm14,YMMWORD[((224-16-128))+r13] + vpaddq ymm5,ymm5,ymm11 + vpmuludq ymm10,ymm10,ymm12 + vmovdqu ymm11,YMMWORD[((256-16-128))+r13] + vpaddq ymm6,ymm6,ymm10 + vpmuludq ymm14,ymm14,ymm12 + shr r12,29 + vmovdqu ymm10,YMMWORD[((288-16-128))+r13] + add rax,r12 + vpaddq ymm7,ymm7,ymm14 + vpmuludq ymm11,ymm11,ymm12 + + mov r9,rax + imul eax,ecx + vpaddq ymm8,ymm8,ymm11 + vpmuludq ymm10,ymm10,ymm12 + and eax,0x1fffffff + vmovd xmm12,eax + vmovdqu ymm11,YMMWORD[((96-24-128))+r13] + DB 0x67 + vpaddq ymm9,ymm9,ymm10 + vpbroadcastq ymm12,xmm12 + + vpmuludq ymm14,ymm0,YMMWORD[((64-24-128))+r13] + vmovdqu ymm10,YMMWORD[((128-24-128))+r13] + mov rdx,rax + imul rax,QWORD[((-128))+r13] + mov r10,QWORD[8+rsp] + vpaddq ymm1,ymm2,ymm14 + vpmuludq ymm11,ymm11,ymm0 + vmovdqu ymm14,YMMWORD[((160-24-128))+r13] + add r9,rax + mov rax,rdx + imul rax,QWORD[((8-128))+r13] + DB 0x67 + shr r9,29 + mov r11,QWORD[16+rsp] + vpaddq ymm2,ymm3,ymm11 + vpmuludq ymm10,ymm10,ymm0 + vmovdqu ymm11,YMMWORD[((192-24-128))+r13] + add r10,rax + mov rax,rdx + imul rax,QWORD[((16-128))+r13] + vpaddq ymm3,ymm4,ymm10 + vpmuludq ymm14,ymm14,ymm0 + vmovdqu ymm10,YMMWORD[((224-24-128))+r13] + imul rdx,QWORD[((24-128))+r13] + add r11,rax + lea rax,[r10*1+r9] + 
vpaddq ymm4,ymm5,ymm14 + vpmuludq ymm11,ymm11,ymm0 + vmovdqu ymm14,YMMWORD[((256-24-128))+r13] + mov r10,rax + imul eax,ecx + vpmuludq ymm10,ymm10,ymm0 + vpaddq ymm5,ymm6,ymm11 + vmovdqu ymm11,YMMWORD[((288-24-128))+r13] + and eax,0x1fffffff + vpaddq ymm6,ymm7,ymm10 + vpmuludq ymm14,ymm14,ymm0 + add rdx,QWORD[24+rsp] + vpaddq ymm7,ymm8,ymm14 + vpmuludq ymm11,ymm11,ymm0 + vpaddq ymm8,ymm9,ymm11 + vmovq xmm9,r12 + mov r12,rdx + + dec r14d + jnz NEAR $L$OOP_REDUCE_1024 + lea r12,[448+rsp] + vpaddq ymm0,ymm13,ymm9 + vpxor ymm9,ymm9,ymm9 + + vpaddq ymm0,ymm0,YMMWORD[((288-192))+rbx] + vpaddq ymm1,ymm1,YMMWORD[((320-448))+r12] + vpaddq ymm2,ymm2,YMMWORD[((352-448))+r12] + vpaddq ymm3,ymm3,YMMWORD[((384-448))+r12] + vpaddq ymm4,ymm4,YMMWORD[((416-448))+r12] + vpaddq ymm5,ymm5,YMMWORD[((448-448))+r12] + vpaddq ymm6,ymm6,YMMWORD[((480-448))+r12] + vpaddq ymm7,ymm7,YMMWORD[((512-448))+r12] + vpaddq ymm8,ymm8,YMMWORD[((544-448))+r12] + + vpsrlq ymm14,ymm0,29 + vpand ymm0,ymm0,ymm15 + vpsrlq ymm11,ymm1,29 + vpand ymm1,ymm1,ymm15 + vpsrlq ymm12,ymm2,29 + vpermq ymm14,ymm14,0x93 + vpand ymm2,ymm2,ymm15 + vpsrlq ymm13,ymm3,29 + vpermq ymm11,ymm11,0x93 + vpand ymm3,ymm3,ymm15 + vpermq ymm12,ymm12,0x93 + + vpblendd ymm10,ymm14,ymm9,3 + vpermq ymm13,ymm13,0x93 + vpblendd ymm14,ymm11,ymm14,3 + vpaddq ymm0,ymm0,ymm10 + vpblendd ymm11,ymm12,ymm11,3 + vpaddq ymm1,ymm1,ymm14 + vpblendd ymm12,ymm13,ymm12,3 + vpaddq ymm2,ymm2,ymm11 + vpblendd ymm13,ymm9,ymm13,3 + vpaddq ymm3,ymm3,ymm12 + vpaddq ymm4,ymm4,ymm13 + + vpsrlq ymm14,ymm0,29 + vpand ymm0,ymm0,ymm15 + vpsrlq ymm11,ymm1,29 + vpand ymm1,ymm1,ymm15 + vpsrlq ymm12,ymm2,29 + vpermq ymm14,ymm14,0x93 + vpand ymm2,ymm2,ymm15 + vpsrlq ymm13,ymm3,29 + vpermq ymm11,ymm11,0x93 + vpand ymm3,ymm3,ymm15 + vpermq ymm12,ymm12,0x93 + + vpblendd ymm10,ymm14,ymm9,3 + vpermq ymm13,ymm13,0x93 + vpblendd ymm14,ymm11,ymm14,3 + vpaddq ymm0,ymm0,ymm10 + vpblendd ymm11,ymm12,ymm11,3 + vpaddq ymm1,ymm1,ymm14 + vmovdqu YMMWORD[(0-128)+rdi],ymm0 + vpblendd 
ymm12,ymm13,ymm12,3 + vpaddq ymm2,ymm2,ymm11 + vmovdqu YMMWORD[(32-128)+rdi],ymm1 + vpblendd ymm13,ymm9,ymm13,3 + vpaddq ymm3,ymm3,ymm12 + vmovdqu YMMWORD[(64-128)+rdi],ymm2 + vpaddq ymm4,ymm4,ymm13 + vmovdqu YMMWORD[(96-128)+rdi],ymm3 + vpsrlq ymm14,ymm4,29 + vpand ymm4,ymm4,ymm15 + vpsrlq ymm11,ymm5,29 + vpand ymm5,ymm5,ymm15 + vpsrlq ymm12,ymm6,29 + vpermq ymm14,ymm14,0x93 + vpand ymm6,ymm6,ymm15 + vpsrlq ymm13,ymm7,29 + vpermq ymm11,ymm11,0x93 + vpand ymm7,ymm7,ymm15 + vpsrlq ymm0,ymm8,29 + vpermq ymm12,ymm12,0x93 + vpand ymm8,ymm8,ymm15 + vpermq ymm13,ymm13,0x93 + + vpblendd ymm10,ymm14,ymm9,3 + vpermq ymm0,ymm0,0x93 + vpblendd ymm14,ymm11,ymm14,3 + vpaddq ymm4,ymm4,ymm10 + vpblendd ymm11,ymm12,ymm11,3 + vpaddq ymm5,ymm5,ymm14 + vpblendd ymm12,ymm13,ymm12,3 + vpaddq ymm6,ymm6,ymm11 + vpblendd ymm13,ymm0,ymm13,3 + vpaddq ymm7,ymm7,ymm12 + vpaddq ymm8,ymm8,ymm13 + + vpsrlq ymm14,ymm4,29 + vpand ymm4,ymm4,ymm15 + vpsrlq ymm11,ymm5,29 + vpand ymm5,ymm5,ymm15 + vpsrlq ymm12,ymm6,29 + vpermq ymm14,ymm14,0x93 + vpand ymm6,ymm6,ymm15 + vpsrlq ymm13,ymm7,29 + vpermq ymm11,ymm11,0x93 + vpand ymm7,ymm7,ymm15 + vpsrlq ymm0,ymm8,29 + vpermq ymm12,ymm12,0x93 + vpand ymm8,ymm8,ymm15 + vpermq ymm13,ymm13,0x93 + + vpblendd ymm10,ymm14,ymm9,3 + vpermq ymm0,ymm0,0x93 + vpblendd ymm14,ymm11,ymm14,3 + vpaddq ymm4,ymm4,ymm10 + vpblendd ymm11,ymm12,ymm11,3 + vpaddq ymm5,ymm5,ymm14 + vmovdqu YMMWORD[(128-128)+rdi],ymm4 + vpblendd ymm12,ymm13,ymm12,3 + vpaddq ymm6,ymm6,ymm11 + vmovdqu YMMWORD[(160-128)+rdi],ymm5 + vpblendd ymm13,ymm0,ymm13,3 + vpaddq ymm7,ymm7,ymm12 + vmovdqu YMMWORD[(192-128)+rdi],ymm6 + vpaddq ymm8,ymm8,ymm13 + vmovdqu YMMWORD[(224-128)+rdi],ymm7 + vmovdqu YMMWORD[(256-128)+rdi],ymm8 + + mov rsi,rdi + dec r8d + jne NEAR $L$OOP_GRANDE_SQR_1024 + + vzeroall + mov rax,rbp + +$L$sqr_1024_in_tail: + movaps xmm6,XMMWORD[((-216))+rax] + movaps xmm7,XMMWORD[((-200))+rax] + movaps xmm8,XMMWORD[((-184))+rax] + movaps xmm9,XMMWORD[((-168))+rax] + movaps 
xmm10,XMMWORD[((-152))+rax] + movaps xmm11,XMMWORD[((-136))+rax] + movaps xmm12,XMMWORD[((-120))+rax] + movaps xmm13,XMMWORD[((-104))+rax] + movaps xmm14,XMMWORD[((-88))+rax] + movaps xmm15,XMMWORD[((-72))+rax] + mov r15,QWORD[((-48))+rax] + + mov r14,QWORD[((-40))+rax] + + mov r13,QWORD[((-32))+rax] + + mov r12,QWORD[((-24))+rax] + + mov rbp,QWORD[((-16))+rax] + + mov rbx,QWORD[((-8))+rax] + + lea rsp,[rax] + +$L$sqr_1024_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_rsaz_1024_sqr_avx2: +global rsaz_1024_mul_avx2 + +ALIGN 64 +rsaz_1024_mul_avx2: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_rsaz_1024_mul_avx2: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + +_CET_ENDBR + lea rax,[rsp] + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + vzeroupper + lea rsp,[((-168))+rsp] + vmovaps XMMWORD[(-216)+rax],xmm6 + vmovaps XMMWORD[(-200)+rax],xmm7 + vmovaps XMMWORD[(-184)+rax],xmm8 + vmovaps XMMWORD[(-168)+rax],xmm9 + vmovaps XMMWORD[(-152)+rax],xmm10 + vmovaps XMMWORD[(-136)+rax],xmm11 + vmovaps XMMWORD[(-120)+rax],xmm12 + vmovaps XMMWORD[(-104)+rax],xmm13 + vmovaps XMMWORD[(-88)+rax],xmm14 + vmovaps XMMWORD[(-72)+rax],xmm15 +$L$mul_1024_body: + mov rbp,rax + + vzeroall + mov r13,rdx + sub rsp,64 + + + + + + + DB 0x67,0x67 + mov r15,rsi + and r15,4095 + add r15,32*10 + shr r15,12 + mov r15,rsi + cmovnz rsi,r13 + cmovnz r13,r15 + + mov r15,rcx + sub rsi,-128 + sub rcx,-128 + sub rdi,-128 + + and r15,4095 + add r15,32*10 + DB 0x67,0x67 + shr r15,12 + jz NEAR $L$mul_1024_no_n_copy + + + + + + sub rsp,32*10 + vmovdqu ymm0,YMMWORD[((0-128))+rcx] + and rsp,-512 + vmovdqu ymm1,YMMWORD[((32-128))+rcx] + vmovdqu ymm2,YMMWORD[((64-128))+rcx] + vmovdqu ymm3,YMMWORD[((96-128))+rcx] + vmovdqu ymm4,YMMWORD[((128-128))+rcx] + vmovdqu ymm5,YMMWORD[((160-128))+rcx] + vmovdqu ymm6,YMMWORD[((192-128))+rcx] + vmovdqu 
ymm7,YMMWORD[((224-128))+rcx] + vmovdqu ymm8,YMMWORD[((256-128))+rcx] + lea rcx,[((64+128))+rsp] + vmovdqu YMMWORD[(0-128)+rcx],ymm0 + vpxor ymm0,ymm0,ymm0 + vmovdqu YMMWORD[(32-128)+rcx],ymm1 + vpxor ymm1,ymm1,ymm1 + vmovdqu YMMWORD[(64-128)+rcx],ymm2 + vpxor ymm2,ymm2,ymm2 + vmovdqu YMMWORD[(96-128)+rcx],ymm3 + vpxor ymm3,ymm3,ymm3 + vmovdqu YMMWORD[(128-128)+rcx],ymm4 + vpxor ymm4,ymm4,ymm4 + vmovdqu YMMWORD[(160-128)+rcx],ymm5 + vpxor ymm5,ymm5,ymm5 + vmovdqu YMMWORD[(192-128)+rcx],ymm6 + vpxor ymm6,ymm6,ymm6 + vmovdqu YMMWORD[(224-128)+rcx],ymm7 + vpxor ymm7,ymm7,ymm7 + vmovdqu YMMWORD[(256-128)+rcx],ymm8 + vmovdqa ymm8,ymm0 + vmovdqu YMMWORD[(288-128)+rcx],ymm9 +$L$mul_1024_no_n_copy: + and rsp,-64 + + mov rbx,QWORD[r13] + vpbroadcastq ymm10,QWORD[r13] + vmovdqu YMMWORD[rsp],ymm0 + xor r9,r9 + DB 0x67 + xor r10,r10 + xor r11,r11 + xor r12,r12 + + vmovdqu ymm15,YMMWORD[$L$and_mask] + mov r14d,9 + vmovdqu YMMWORD[(288-128)+rdi],ymm9 + jmp NEAR $L$oop_mul_1024 + +ALIGN 32 +$L$oop_mul_1024: + vpsrlq ymm9,ymm3,29 + mov rax,rbx + imul rax,QWORD[((-128))+rsi] + add rax,r9 + mov r10,rbx + imul r10,QWORD[((8-128))+rsi] + add r10,QWORD[8+rsp] + + mov r9,rax + imul eax,r8d + and eax,0x1fffffff + + mov r11,rbx + imul r11,QWORD[((16-128))+rsi] + add r11,QWORD[16+rsp] + + mov r12,rbx + imul r12,QWORD[((24-128))+rsi] + add r12,QWORD[24+rsp] + vpmuludq ymm0,ymm10,YMMWORD[((32-128))+rsi] + vmovd xmm11,eax + vpaddq ymm1,ymm1,ymm0 + vpmuludq ymm12,ymm10,YMMWORD[((64-128))+rsi] + vpbroadcastq ymm11,xmm11 + vpaddq ymm2,ymm2,ymm12 + vpmuludq ymm13,ymm10,YMMWORD[((96-128))+rsi] + vpand ymm3,ymm3,ymm15 + vpaddq ymm3,ymm3,ymm13 + vpmuludq ymm0,ymm10,YMMWORD[((128-128))+rsi] + vpaddq ymm4,ymm4,ymm0 + vpmuludq ymm12,ymm10,YMMWORD[((160-128))+rsi] + vpaddq ymm5,ymm5,ymm12 + vpmuludq ymm13,ymm10,YMMWORD[((192-128))+rsi] + vpaddq ymm6,ymm6,ymm13 + vpmuludq ymm0,ymm10,YMMWORD[((224-128))+rsi] + vpermq ymm9,ymm9,0x93 + vpaddq ymm7,ymm7,ymm0 + vpmuludq ymm12,ymm10,YMMWORD[((256-128))+rsi] + 
vpbroadcastq ymm10,QWORD[8+r13] + vpaddq ymm8,ymm8,ymm12 + + mov rdx,rax + imul rax,QWORD[((-128))+rcx] + add r9,rax + mov rax,rdx + imul rax,QWORD[((8-128))+rcx] + add r10,rax + mov rax,rdx + imul rax,QWORD[((16-128))+rcx] + add r11,rax + shr r9,29 + imul rdx,QWORD[((24-128))+rcx] + add r12,rdx + add r10,r9 + + vpmuludq ymm13,ymm11,YMMWORD[((32-128))+rcx] + vmovq rbx,xmm10 + vpaddq ymm1,ymm1,ymm13 + vpmuludq ymm0,ymm11,YMMWORD[((64-128))+rcx] + vpaddq ymm2,ymm2,ymm0 + vpmuludq ymm12,ymm11,YMMWORD[((96-128))+rcx] + vpaddq ymm3,ymm3,ymm12 + vpmuludq ymm13,ymm11,YMMWORD[((128-128))+rcx] + vpaddq ymm4,ymm4,ymm13 + vpmuludq ymm0,ymm11,YMMWORD[((160-128))+rcx] + vpaddq ymm5,ymm5,ymm0 + vpmuludq ymm12,ymm11,YMMWORD[((192-128))+rcx] + vpaddq ymm6,ymm6,ymm12 + vpmuludq ymm13,ymm11,YMMWORD[((224-128))+rcx] + vpblendd ymm12,ymm9,ymm14,3 + vpaddq ymm7,ymm7,ymm13 + vpmuludq ymm0,ymm11,YMMWORD[((256-128))+rcx] + vpaddq ymm3,ymm3,ymm12 + vpaddq ymm8,ymm8,ymm0 + + mov rax,rbx + imul rax,QWORD[((-128))+rsi] + add r10,rax + vmovdqu ymm12,YMMWORD[((-8+32-128))+rsi] + mov rax,rbx + imul rax,QWORD[((8-128))+rsi] + add r11,rax + vmovdqu ymm13,YMMWORD[((-8+64-128))+rsi] + + mov rax,r10 + vpblendd ymm9,ymm9,ymm14,0xfc + imul eax,r8d + vpaddq ymm4,ymm4,ymm9 + and eax,0x1fffffff + + imul rbx,QWORD[((16-128))+rsi] + add r12,rbx + vpmuludq ymm12,ymm12,ymm10 + vmovd xmm11,eax + vmovdqu ymm0,YMMWORD[((-8+96-128))+rsi] + vpaddq ymm1,ymm1,ymm12 + vpmuludq ymm13,ymm13,ymm10 + vpbroadcastq ymm11,xmm11 + vmovdqu ymm12,YMMWORD[((-8+128-128))+rsi] + vpaddq ymm2,ymm2,ymm13 + vpmuludq ymm0,ymm0,ymm10 + vmovdqu ymm13,YMMWORD[((-8+160-128))+rsi] + vpaddq ymm3,ymm3,ymm0 + vpmuludq ymm12,ymm12,ymm10 + vmovdqu ymm0,YMMWORD[((-8+192-128))+rsi] + vpaddq ymm4,ymm4,ymm12 + vpmuludq ymm13,ymm13,ymm10 + vmovdqu ymm12,YMMWORD[((-8+224-128))+rsi] + vpaddq ymm5,ymm5,ymm13 + vpmuludq ymm0,ymm0,ymm10 + vmovdqu ymm13,YMMWORD[((-8+256-128))+rsi] + vpaddq ymm6,ymm6,ymm0 + vpmuludq ymm12,ymm12,ymm10 + vmovdqu 
ymm9,YMMWORD[((-8+288-128))+rsi] + vpaddq ymm7,ymm7,ymm12 + vpmuludq ymm13,ymm13,ymm10 + vpaddq ymm8,ymm8,ymm13 + vpmuludq ymm9,ymm9,ymm10 + vpbroadcastq ymm10,QWORD[16+r13] + + mov rdx,rax + imul rax,QWORD[((-128))+rcx] + add r10,rax + vmovdqu ymm0,YMMWORD[((-8+32-128))+rcx] + mov rax,rdx + imul rax,QWORD[((8-128))+rcx] + add r11,rax + vmovdqu ymm12,YMMWORD[((-8+64-128))+rcx] + shr r10,29 + imul rdx,QWORD[((16-128))+rcx] + add r12,rdx + add r11,r10 + + vpmuludq ymm0,ymm0,ymm11 + vmovq rbx,xmm10 + vmovdqu ymm13,YMMWORD[((-8+96-128))+rcx] + vpaddq ymm1,ymm1,ymm0 + vpmuludq ymm12,ymm12,ymm11 + vmovdqu ymm0,YMMWORD[((-8+128-128))+rcx] + vpaddq ymm2,ymm2,ymm12 + vpmuludq ymm13,ymm13,ymm11 + vmovdqu ymm12,YMMWORD[((-8+160-128))+rcx] + vpaddq ymm3,ymm3,ymm13 + vpmuludq ymm0,ymm0,ymm11 + vmovdqu ymm13,YMMWORD[((-8+192-128))+rcx] + vpaddq ymm4,ymm4,ymm0 + vpmuludq ymm12,ymm12,ymm11 + vmovdqu ymm0,YMMWORD[((-8+224-128))+rcx] + vpaddq ymm5,ymm5,ymm12 + vpmuludq ymm13,ymm13,ymm11 + vmovdqu ymm12,YMMWORD[((-8+256-128))+rcx] + vpaddq ymm6,ymm6,ymm13 + vpmuludq ymm0,ymm0,ymm11 + vmovdqu ymm13,YMMWORD[((-8+288-128))+rcx] + vpaddq ymm7,ymm7,ymm0 + vpmuludq ymm12,ymm12,ymm11 + vpaddq ymm8,ymm8,ymm12 + vpmuludq ymm13,ymm13,ymm11 + vpaddq ymm9,ymm9,ymm13 + + vmovdqu ymm0,YMMWORD[((-16+32-128))+rsi] + mov rax,rbx + imul rax,QWORD[((-128))+rsi] + add rax,r11 + + vmovdqu ymm12,YMMWORD[((-16+64-128))+rsi] + mov r11,rax + imul eax,r8d + and eax,0x1fffffff + + imul rbx,QWORD[((8-128))+rsi] + add r12,rbx + vpmuludq ymm0,ymm0,ymm10 + vmovd xmm11,eax + vmovdqu ymm13,YMMWORD[((-16+96-128))+rsi] + vpaddq ymm1,ymm1,ymm0 + vpmuludq ymm12,ymm12,ymm10 + vpbroadcastq ymm11,xmm11 + vmovdqu ymm0,YMMWORD[((-16+128-128))+rsi] + vpaddq ymm2,ymm2,ymm12 + vpmuludq ymm13,ymm13,ymm10 + vmovdqu ymm12,YMMWORD[((-16+160-128))+rsi] + vpaddq ymm3,ymm3,ymm13 + vpmuludq ymm0,ymm0,ymm10 + vmovdqu ymm13,YMMWORD[((-16+192-128))+rsi] + vpaddq ymm4,ymm4,ymm0 + vpmuludq ymm12,ymm12,ymm10 + vmovdqu 
ymm0,YMMWORD[((-16+224-128))+rsi] + vpaddq ymm5,ymm5,ymm12 + vpmuludq ymm13,ymm13,ymm10 + vmovdqu ymm12,YMMWORD[((-16+256-128))+rsi] + vpaddq ymm6,ymm6,ymm13 + vpmuludq ymm0,ymm0,ymm10 + vmovdqu ymm13,YMMWORD[((-16+288-128))+rsi] + vpaddq ymm7,ymm7,ymm0 + vpmuludq ymm12,ymm12,ymm10 + vpaddq ymm8,ymm8,ymm12 + vpmuludq ymm13,ymm13,ymm10 + vpbroadcastq ymm10,QWORD[24+r13] + vpaddq ymm9,ymm9,ymm13 + + vmovdqu ymm0,YMMWORD[((-16+32-128))+rcx] + mov rdx,rax + imul rax,QWORD[((-128))+rcx] + add r11,rax + vmovdqu ymm12,YMMWORD[((-16+64-128))+rcx] + imul rdx,QWORD[((8-128))+rcx] + add r12,rdx + shr r11,29 + + vpmuludq ymm0,ymm0,ymm11 + vmovq rbx,xmm10 + vmovdqu ymm13,YMMWORD[((-16+96-128))+rcx] + vpaddq ymm1,ymm1,ymm0 + vpmuludq ymm12,ymm12,ymm11 + vmovdqu ymm0,YMMWORD[((-16+128-128))+rcx] + vpaddq ymm2,ymm2,ymm12 + vpmuludq ymm13,ymm13,ymm11 + vmovdqu ymm12,YMMWORD[((-16+160-128))+rcx] + vpaddq ymm3,ymm3,ymm13 + vpmuludq ymm0,ymm0,ymm11 + vmovdqu ymm13,YMMWORD[((-16+192-128))+rcx] + vpaddq ymm4,ymm4,ymm0 + vpmuludq ymm12,ymm12,ymm11 + vmovdqu ymm0,YMMWORD[((-16+224-128))+rcx] + vpaddq ymm5,ymm5,ymm12 + vpmuludq ymm13,ymm13,ymm11 + vmovdqu ymm12,YMMWORD[((-16+256-128))+rcx] + vpaddq ymm6,ymm6,ymm13 + vpmuludq ymm0,ymm0,ymm11 + vmovdqu ymm13,YMMWORD[((-16+288-128))+rcx] + vpaddq ymm7,ymm7,ymm0 + vpmuludq ymm12,ymm12,ymm11 + vmovdqu ymm0,YMMWORD[((-24+32-128))+rsi] + vpaddq ymm8,ymm8,ymm12 + vpmuludq ymm13,ymm13,ymm11 + vmovdqu ymm12,YMMWORD[((-24+64-128))+rsi] + vpaddq ymm9,ymm9,ymm13 + + add r12,r11 + imul rbx,QWORD[((-128))+rsi] + add r12,rbx + + mov rax,r12 + imul eax,r8d + and eax,0x1fffffff + + vpmuludq ymm0,ymm0,ymm10 + vmovd xmm11,eax + vmovdqu ymm13,YMMWORD[((-24+96-128))+rsi] + vpaddq ymm1,ymm1,ymm0 + vpmuludq ymm12,ymm12,ymm10 + vpbroadcastq ymm11,xmm11 + vmovdqu ymm0,YMMWORD[((-24+128-128))+rsi] + vpaddq ymm2,ymm2,ymm12 + vpmuludq ymm13,ymm13,ymm10 + vmovdqu ymm12,YMMWORD[((-24+160-128))+rsi] + vpaddq ymm3,ymm3,ymm13 + vpmuludq ymm0,ymm0,ymm10 + vmovdqu 
ymm13,YMMWORD[((-24+192-128))+rsi] + vpaddq ymm4,ymm4,ymm0 + vpmuludq ymm12,ymm12,ymm10 + vmovdqu ymm0,YMMWORD[((-24+224-128))+rsi] + vpaddq ymm5,ymm5,ymm12 + vpmuludq ymm13,ymm13,ymm10 + vmovdqu ymm12,YMMWORD[((-24+256-128))+rsi] + vpaddq ymm6,ymm6,ymm13 + vpmuludq ymm0,ymm0,ymm10 + vmovdqu ymm13,YMMWORD[((-24+288-128))+rsi] + vpaddq ymm7,ymm7,ymm0 + vpmuludq ymm12,ymm12,ymm10 + vpaddq ymm8,ymm8,ymm12 + vpmuludq ymm13,ymm13,ymm10 + vpbroadcastq ymm10,QWORD[32+r13] + vpaddq ymm9,ymm9,ymm13 + add r13,32 + + vmovdqu ymm0,YMMWORD[((-24+32-128))+rcx] + imul rax,QWORD[((-128))+rcx] + add r12,rax + shr r12,29 + + vmovdqu ymm12,YMMWORD[((-24+64-128))+rcx] + vpmuludq ymm0,ymm0,ymm11 + vmovq rbx,xmm10 + vmovdqu ymm13,YMMWORD[((-24+96-128))+rcx] + vpaddq ymm0,ymm1,ymm0 + vpmuludq ymm12,ymm12,ymm11 + vmovdqu YMMWORD[rsp],ymm0 + vpaddq ymm1,ymm2,ymm12 + vmovdqu ymm0,YMMWORD[((-24+128-128))+rcx] + vpmuludq ymm13,ymm13,ymm11 + vmovdqu ymm12,YMMWORD[((-24+160-128))+rcx] + vpaddq ymm2,ymm3,ymm13 + vpmuludq ymm0,ymm0,ymm11 + vmovdqu ymm13,YMMWORD[((-24+192-128))+rcx] + vpaddq ymm3,ymm4,ymm0 + vpmuludq ymm12,ymm12,ymm11 + vmovdqu ymm0,YMMWORD[((-24+224-128))+rcx] + vpaddq ymm4,ymm5,ymm12 + vpmuludq ymm13,ymm13,ymm11 + vmovdqu ymm12,YMMWORD[((-24+256-128))+rcx] + vpaddq ymm5,ymm6,ymm13 + vpmuludq ymm0,ymm0,ymm11 + vmovdqu ymm13,YMMWORD[((-24+288-128))+rcx] + mov r9,r12 + vpaddq ymm6,ymm7,ymm0 + vpmuludq ymm12,ymm12,ymm11 + add r9,QWORD[rsp] + vpaddq ymm7,ymm8,ymm12 + vpmuludq ymm13,ymm13,ymm11 + vmovq xmm12,r12 + vpaddq ymm8,ymm9,ymm13 + + dec r14d + jnz NEAR $L$oop_mul_1024 + vpaddq ymm0,ymm12,YMMWORD[rsp] + + vpsrlq ymm12,ymm0,29 + vpand ymm0,ymm0,ymm15 + vpsrlq ymm13,ymm1,29 + vpand ymm1,ymm1,ymm15 + vpsrlq ymm10,ymm2,29 + vpermq ymm12,ymm12,0x93 + vpand ymm2,ymm2,ymm15 + vpsrlq ymm11,ymm3,29 + vpermq ymm13,ymm13,0x93 + vpand ymm3,ymm3,ymm15 + + vpblendd ymm9,ymm12,ymm14,3 + vpermq ymm10,ymm10,0x93 + vpblendd ymm12,ymm13,ymm12,3 + vpermq ymm11,ymm11,0x93 + vpaddq ymm0,ymm0,ymm9 + 
vpblendd ymm13,ymm10,ymm13,3 + vpaddq ymm1,ymm1,ymm12 + vpblendd ymm10,ymm11,ymm10,3 + vpaddq ymm2,ymm2,ymm13 + vpblendd ymm11,ymm14,ymm11,3 + vpaddq ymm3,ymm3,ymm10 + vpaddq ymm4,ymm4,ymm11 + + vpsrlq ymm12,ymm0,29 + vpand ymm0,ymm0,ymm15 + vpsrlq ymm13,ymm1,29 + vpand ymm1,ymm1,ymm15 + vpsrlq ymm10,ymm2,29 + vpermq ymm12,ymm12,0x93 + vpand ymm2,ymm2,ymm15 + vpsrlq ymm11,ymm3,29 + vpermq ymm13,ymm13,0x93 + vpand ymm3,ymm3,ymm15 + vpermq ymm10,ymm10,0x93 + + vpblendd ymm9,ymm12,ymm14,3 + vpermq ymm11,ymm11,0x93 + vpblendd ymm12,ymm13,ymm12,3 + vpaddq ymm0,ymm0,ymm9 + vpblendd ymm13,ymm10,ymm13,3 + vpaddq ymm1,ymm1,ymm12 + vpblendd ymm10,ymm11,ymm10,3 + vpaddq ymm2,ymm2,ymm13 + vpblendd ymm11,ymm14,ymm11,3 + vpaddq ymm3,ymm3,ymm10 + vpaddq ymm4,ymm4,ymm11 + + vmovdqu YMMWORD[(0-128)+rdi],ymm0 + vmovdqu YMMWORD[(32-128)+rdi],ymm1 + vmovdqu YMMWORD[(64-128)+rdi],ymm2 + vmovdqu YMMWORD[(96-128)+rdi],ymm3 + vpsrlq ymm12,ymm4,29 + vpand ymm4,ymm4,ymm15 + vpsrlq ymm13,ymm5,29 + vpand ymm5,ymm5,ymm15 + vpsrlq ymm10,ymm6,29 + vpermq ymm12,ymm12,0x93 + vpand ymm6,ymm6,ymm15 + vpsrlq ymm11,ymm7,29 + vpermq ymm13,ymm13,0x93 + vpand ymm7,ymm7,ymm15 + vpsrlq ymm0,ymm8,29 + vpermq ymm10,ymm10,0x93 + vpand ymm8,ymm8,ymm15 + vpermq ymm11,ymm11,0x93 + + vpblendd ymm9,ymm12,ymm14,3 + vpermq ymm0,ymm0,0x93 + vpblendd ymm12,ymm13,ymm12,3 + vpaddq ymm4,ymm4,ymm9 + vpblendd ymm13,ymm10,ymm13,3 + vpaddq ymm5,ymm5,ymm12 + vpblendd ymm10,ymm11,ymm10,3 + vpaddq ymm6,ymm6,ymm13 + vpblendd ymm11,ymm0,ymm11,3 + vpaddq ymm7,ymm7,ymm10 + vpaddq ymm8,ymm8,ymm11 + + vpsrlq ymm12,ymm4,29 + vpand ymm4,ymm4,ymm15 + vpsrlq ymm13,ymm5,29 + vpand ymm5,ymm5,ymm15 + vpsrlq ymm10,ymm6,29 + vpermq ymm12,ymm12,0x93 + vpand ymm6,ymm6,ymm15 + vpsrlq ymm11,ymm7,29 + vpermq ymm13,ymm13,0x93 + vpand ymm7,ymm7,ymm15 + vpsrlq ymm0,ymm8,29 + vpermq ymm10,ymm10,0x93 + vpand ymm8,ymm8,ymm15 + vpermq ymm11,ymm11,0x93 + + vpblendd ymm9,ymm12,ymm14,3 + vpermq ymm0,ymm0,0x93 + vpblendd ymm12,ymm13,ymm12,3 + vpaddq 
ymm4,ymm4,ymm9 + vpblendd ymm13,ymm10,ymm13,3 + vpaddq ymm5,ymm5,ymm12 + vpblendd ymm10,ymm11,ymm10,3 + vpaddq ymm6,ymm6,ymm13 + vpblendd ymm11,ymm0,ymm11,3 + vpaddq ymm7,ymm7,ymm10 + vpaddq ymm8,ymm8,ymm11 + + vmovdqu YMMWORD[(128-128)+rdi],ymm4 + vmovdqu YMMWORD[(160-128)+rdi],ymm5 + vmovdqu YMMWORD[(192-128)+rdi],ymm6 + vmovdqu YMMWORD[(224-128)+rdi],ymm7 + vmovdqu YMMWORD[(256-128)+rdi],ymm8 + vzeroupper + + mov rax,rbp + +$L$mul_1024_in_tail: + movaps xmm6,XMMWORD[((-216))+rax] + movaps xmm7,XMMWORD[((-200))+rax] + movaps xmm8,XMMWORD[((-184))+rax] + movaps xmm9,XMMWORD[((-168))+rax] + movaps xmm10,XMMWORD[((-152))+rax] + movaps xmm11,XMMWORD[((-136))+rax] + movaps xmm12,XMMWORD[((-120))+rax] + movaps xmm13,XMMWORD[((-104))+rax] + movaps xmm14,XMMWORD[((-88))+rax] + movaps xmm15,XMMWORD[((-72))+rax] + mov r15,QWORD[((-48))+rax] + + mov r14,QWORD[((-40))+rax] + + mov r13,QWORD[((-32))+rax] + + mov r12,QWORD[((-24))+rax] + + mov rbp,QWORD[((-16))+rax] + + mov rbx,QWORD[((-8))+rax] + + lea rsp,[rax] + +$L$mul_1024_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_rsaz_1024_mul_avx2: +global rsaz_1024_red2norm_avx2 + +ALIGN 32 +rsaz_1024_red2norm_avx2: + +_CET_ENDBR + sub rdx,-128 + xor rax,rax + mov r8,QWORD[((-128))+rdx] + mov r9,QWORD[((-120))+rdx] + mov r10,QWORD[((-112))+rdx] + shl r8,0 + shl r9,29 + mov r11,r10 + shl r10,58 + shr r11,6 + add rax,r8 + add rax,r9 + add rax,r10 + adc r11,0 + mov QWORD[rcx],rax + mov rax,r11 + mov r8,QWORD[((-104))+rdx] + mov r9,QWORD[((-96))+rdx] + shl r8,23 + mov r10,r9 + shl r9,52 + shr r10,12 + add rax,r8 + add rax,r9 + adc r10,0 + mov QWORD[8+rcx],rax + mov rax,r10 + mov r11,QWORD[((-88))+rdx] + mov r8,QWORD[((-80))+rdx] + shl r11,17 + mov r9,r8 + shl r8,46 + shr r9,18 + add rax,r11 + add rax,r8 + adc r9,0 + mov QWORD[16+rcx],rax + mov rax,r9 + mov r10,QWORD[((-72))+rdx] + mov r11,QWORD[((-64))+rdx] + shl r10,11 + mov r8,r11 + shl r11,40 + shr r8,24 + add rax,r10 + add rax,r11 + 
adc r8,0 + mov QWORD[24+rcx],rax + mov rax,r8 + mov r9,QWORD[((-56))+rdx] + mov r10,QWORD[((-48))+rdx] + mov r11,QWORD[((-40))+rdx] + shl r9,5 + shl r10,34 + mov r8,r11 + shl r11,63 + shr r8,1 + add rax,r9 + add rax,r10 + add rax,r11 + adc r8,0 + mov QWORD[32+rcx],rax + mov rax,r8 + mov r9,QWORD[((-32))+rdx] + mov r10,QWORD[((-24))+rdx] + shl r9,28 + mov r11,r10 + shl r10,57 + shr r11,7 + add rax,r9 + add rax,r10 + adc r11,0 + mov QWORD[40+rcx],rax + mov rax,r11 + mov r8,QWORD[((-16))+rdx] + mov r9,QWORD[((-8))+rdx] + shl r8,22 + mov r10,r9 + shl r9,51 + shr r10,13 + add rax,r8 + add rax,r9 + adc r10,0 + mov QWORD[48+rcx],rax + mov rax,r10 + mov r11,QWORD[rdx] + mov r8,QWORD[8+rdx] + shl r11,16 + mov r9,r8 + shl r8,45 + shr r9,19 + add rax,r11 + add rax,r8 + adc r9,0 + mov QWORD[56+rcx],rax + mov rax,r9 + mov r10,QWORD[16+rdx] + mov r11,QWORD[24+rdx] + shl r10,10 + mov r8,r11 + shl r11,39 + shr r8,25 + add rax,r10 + add rax,r11 + adc r8,0 + mov QWORD[64+rcx],rax + mov rax,r8 + mov r9,QWORD[32+rdx] + mov r10,QWORD[40+rdx] + mov r11,QWORD[48+rdx] + shl r9,4 + shl r10,33 + mov r8,r11 + shl r11,62 + shr r8,2 + add rax,r9 + add rax,r10 + add rax,r11 + adc r8,0 + mov QWORD[72+rcx],rax + mov rax,r8 + mov r9,QWORD[56+rdx] + mov r10,QWORD[64+rdx] + shl r9,27 + mov r11,r10 + shl r10,56 + shr r11,8 + add rax,r9 + add rax,r10 + adc r11,0 + mov QWORD[80+rcx],rax + mov rax,r11 + mov r8,QWORD[72+rdx] + mov r9,QWORD[80+rdx] + shl r8,21 + mov r10,r9 + shl r9,50 + shr r10,14 + add rax,r8 + add rax,r9 + adc r10,0 + mov QWORD[88+rcx],rax + mov rax,r10 + mov r11,QWORD[88+rdx] + mov r8,QWORD[96+rdx] + shl r11,15 + mov r9,r8 + shl r8,44 + shr r9,20 + add rax,r11 + add rax,r8 + adc r9,0 + mov QWORD[96+rcx],rax + mov rax,r9 + mov r10,QWORD[104+rdx] + mov r11,QWORD[112+rdx] + shl r10,9 + mov r8,r11 + shl r11,38 + shr r8,26 + add rax,r10 + add rax,r11 + adc r8,0 + mov QWORD[104+rcx],rax + mov rax,r8 + mov r9,QWORD[120+rdx] + mov r10,QWORD[128+rdx] + mov r11,QWORD[136+rdx] + shl r9,3 + shl 
r10,32 + mov r8,r11 + shl r11,61 + shr r8,3 + add rax,r9 + add rax,r10 + add rax,r11 + adc r8,0 + mov QWORD[112+rcx],rax + mov rax,r8 + mov r9,QWORD[144+rdx] + mov r10,QWORD[152+rdx] + shl r9,26 + mov r11,r10 + shl r10,55 + shr r11,9 + add rax,r9 + add rax,r10 + adc r11,0 + mov QWORD[120+rcx],rax + mov rax,r11 + ret + + + +global rsaz_1024_norm2red_avx2 + +ALIGN 32 +rsaz_1024_norm2red_avx2: + +_CET_ENDBR + sub rcx,-128 + mov r8,QWORD[rdx] + mov eax,0x1fffffff + mov r9,QWORD[8+rdx] + mov r11,r8 + shr r11,0 + and r11,rax + mov QWORD[((-128))+rcx],r11 + mov r10,r8 + shr r10,29 + and r10,rax + mov QWORD[((-120))+rcx],r10 + shrd r8,r9,58 + and r8,rax + mov QWORD[((-112))+rcx],r8 + mov r10,QWORD[16+rdx] + mov r8,r9 + shr r8,23 + and r8,rax + mov QWORD[((-104))+rcx],r8 + shrd r9,r10,52 + and r9,rax + mov QWORD[((-96))+rcx],r9 + mov r11,QWORD[24+rdx] + mov r9,r10 + shr r9,17 + and r9,rax + mov QWORD[((-88))+rcx],r9 + shrd r10,r11,46 + and r10,rax + mov QWORD[((-80))+rcx],r10 + mov r8,QWORD[32+rdx] + mov r10,r11 + shr r10,11 + and r10,rax + mov QWORD[((-72))+rcx],r10 + shrd r11,r8,40 + and r11,rax + mov QWORD[((-64))+rcx],r11 + mov r9,QWORD[40+rdx] + mov r11,r8 + shr r11,5 + and r11,rax + mov QWORD[((-56))+rcx],r11 + mov r10,r8 + shr r10,34 + and r10,rax + mov QWORD[((-48))+rcx],r10 + shrd r8,r9,63 + and r8,rax + mov QWORD[((-40))+rcx],r8 + mov r10,QWORD[48+rdx] + mov r8,r9 + shr r8,28 + and r8,rax + mov QWORD[((-32))+rcx],r8 + shrd r9,r10,57 + and r9,rax + mov QWORD[((-24))+rcx],r9 + mov r11,QWORD[56+rdx] + mov r9,r10 + shr r9,22 + and r9,rax + mov QWORD[((-16))+rcx],r9 + shrd r10,r11,51 + and r10,rax + mov QWORD[((-8))+rcx],r10 + mov r8,QWORD[64+rdx] + mov r10,r11 + shr r10,16 + and r10,rax + mov QWORD[rcx],r10 + shrd r11,r8,45 + and r11,rax + mov QWORD[8+rcx],r11 + mov r9,QWORD[72+rdx] + mov r11,r8 + shr r11,10 + and r11,rax + mov QWORD[16+rcx],r11 + shrd r8,r9,39 + and r8,rax + mov QWORD[24+rcx],r8 + mov r10,QWORD[80+rdx] + mov r8,r9 + shr r8,4 + and r8,rax + mov 
QWORD[32+rcx],r8 + mov r11,r9 + shr r11,33 + and r11,rax + mov QWORD[40+rcx],r11 + shrd r9,r10,62 + and r9,rax + mov QWORD[48+rcx],r9 + mov r11,QWORD[88+rdx] + mov r9,r10 + shr r9,27 + and r9,rax + mov QWORD[56+rcx],r9 + shrd r10,r11,56 + and r10,rax + mov QWORD[64+rcx],r10 + mov r8,QWORD[96+rdx] + mov r10,r11 + shr r10,21 + and r10,rax + mov QWORD[72+rcx],r10 + shrd r11,r8,50 + and r11,rax + mov QWORD[80+rcx],r11 + mov r9,QWORD[104+rdx] + mov r11,r8 + shr r11,15 + and r11,rax + mov QWORD[88+rcx],r11 + shrd r8,r9,44 + and r8,rax + mov QWORD[96+rcx],r8 + mov r10,QWORD[112+rdx] + mov r8,r9 + shr r8,9 + and r8,rax + mov QWORD[104+rcx],r8 + shrd r9,r10,38 + and r9,rax + mov QWORD[112+rcx],r9 + mov r11,QWORD[120+rdx] + mov r9,r10 + shr r9,3 + and r9,rax + mov QWORD[120+rcx],r9 + mov r8,r10 + shr r8,32 + and r8,rax + mov QWORD[128+rcx],r8 + shrd r10,r11,61 + and r10,rax + mov QWORD[136+rcx],r10 + xor r8,r8 + mov r10,r11 + shr r10,26 + and r10,rax + mov QWORD[144+rcx],r10 + shrd r11,r8,55 + and r11,rax + mov QWORD[152+rcx],r11 + mov QWORD[160+rcx],r8 + mov QWORD[168+rcx],r8 + mov QWORD[176+rcx],r8 + mov QWORD[184+rcx],r8 + ret + + +global rsaz_1024_scatter5_avx2 + +ALIGN 32 +rsaz_1024_scatter5_avx2: + +_CET_ENDBR + vzeroupper + vmovdqu ymm5,YMMWORD[$L$scatter_permd] + shl r8d,4 + lea rcx,[r8*1+rcx] + mov eax,9 + jmp NEAR $L$oop_scatter_1024 + +ALIGN 32 +$L$oop_scatter_1024: + vmovdqu ymm0,YMMWORD[rdx] + lea rdx,[32+rdx] + vpermd ymm0,ymm5,ymm0 + vmovdqu XMMWORD[rcx],xmm0 + lea rcx,[512+rcx] + dec eax + jnz NEAR $L$oop_scatter_1024 + + vzeroupper + ret + + + +global rsaz_1024_gather5_avx2 + +ALIGN 32 +rsaz_1024_gather5_avx2: + +_CET_ENDBR + vzeroupper + mov r11,rsp + + lea rax,[((-136))+rsp] +$L$SEH_begin_rsaz_1024_gather5: + + DB 0x48,0x8d,0x60,0xe0 + DB 0xc5,0xf8,0x29,0x70,0xe0 + DB 0xc5,0xf8,0x29,0x78,0xf0 + DB 0xc5,0x78,0x29,0x40,0x00 + DB 0xc5,0x78,0x29,0x48,0x10 + DB 0xc5,0x78,0x29,0x50,0x20 + DB 0xc5,0x78,0x29,0x58,0x30 + DB 0xc5,0x78,0x29,0x60,0x40 + DB 
0xc5,0x78,0x29,0x68,0x50 + DB 0xc5,0x78,0x29,0x70,0x60 + DB 0xc5,0x78,0x29,0x78,0x70 + lea rsp,[((-256))+rsp] + and rsp,-32 + lea r10,[$L$inc] + lea rax,[((-128))+rsp] + + vmovd xmm4,r8d + vmovdqa ymm0,YMMWORD[r10] + vmovdqa ymm1,YMMWORD[32+r10] + vmovdqa ymm5,YMMWORD[64+r10] + vpbroadcastd ymm4,xmm4 + + vpaddd ymm2,ymm0,ymm5 + vpcmpeqd ymm0,ymm0,ymm4 + vpaddd ymm3,ymm1,ymm5 + vpcmpeqd ymm1,ymm1,ymm4 + vmovdqa YMMWORD[(0+128)+rax],ymm0 + vpaddd ymm0,ymm2,ymm5 + vpcmpeqd ymm2,ymm2,ymm4 + vmovdqa YMMWORD[(32+128)+rax],ymm1 + vpaddd ymm1,ymm3,ymm5 + vpcmpeqd ymm3,ymm3,ymm4 + vmovdqa YMMWORD[(64+128)+rax],ymm2 + vpaddd ymm2,ymm0,ymm5 + vpcmpeqd ymm0,ymm0,ymm4 + vmovdqa YMMWORD[(96+128)+rax],ymm3 + vpaddd ymm3,ymm1,ymm5 + vpcmpeqd ymm1,ymm1,ymm4 + vmovdqa YMMWORD[(128+128)+rax],ymm0 + vpaddd ymm8,ymm2,ymm5 + vpcmpeqd ymm2,ymm2,ymm4 + vmovdqa YMMWORD[(160+128)+rax],ymm1 + vpaddd ymm9,ymm3,ymm5 + vpcmpeqd ymm3,ymm3,ymm4 + vmovdqa YMMWORD[(192+128)+rax],ymm2 + vpaddd ymm10,ymm8,ymm5 + vpcmpeqd ymm8,ymm8,ymm4 + vmovdqa YMMWORD[(224+128)+rax],ymm3 + vpaddd ymm11,ymm9,ymm5 + vpcmpeqd ymm9,ymm9,ymm4 + vpaddd ymm12,ymm10,ymm5 + vpcmpeqd ymm10,ymm10,ymm4 + vpaddd ymm13,ymm11,ymm5 + vpcmpeqd ymm11,ymm11,ymm4 + vpaddd ymm14,ymm12,ymm5 + vpcmpeqd ymm12,ymm12,ymm4 + vpaddd ymm15,ymm13,ymm5 + vpcmpeqd ymm13,ymm13,ymm4 + vpcmpeqd ymm14,ymm14,ymm4 + vpcmpeqd ymm15,ymm15,ymm4 + + vmovdqa ymm7,YMMWORD[((-32))+r10] + lea rdx,[128+rdx] + mov r8d,9 + +$L$oop_gather_1024: + vmovdqa ymm0,YMMWORD[((0-128))+rdx] + vmovdqa ymm1,YMMWORD[((32-128))+rdx] + vmovdqa ymm2,YMMWORD[((64-128))+rdx] + vmovdqa ymm3,YMMWORD[((96-128))+rdx] + vpand ymm0,ymm0,YMMWORD[((0+128))+rax] + vpand ymm1,ymm1,YMMWORD[((32+128))+rax] + vpand ymm2,ymm2,YMMWORD[((64+128))+rax] + vpor ymm4,ymm1,ymm0 + vpand ymm3,ymm3,YMMWORD[((96+128))+rax] + vmovdqa ymm0,YMMWORD[((128-128))+rdx] + vmovdqa ymm1,YMMWORD[((160-128))+rdx] + vpor ymm5,ymm3,ymm2 + vmovdqa ymm2,YMMWORD[((192-128))+rdx] + vmovdqa ymm3,YMMWORD[((224-128))+rdx] + 
vpand ymm0,ymm0,YMMWORD[((128+128))+rax] + vpand ymm1,ymm1,YMMWORD[((160+128))+rax] + vpand ymm2,ymm2,YMMWORD[((192+128))+rax] + vpor ymm4,ymm4,ymm0 + vpand ymm3,ymm3,YMMWORD[((224+128))+rax] + vpand ymm0,ymm8,YMMWORD[((256-128))+rdx] + vpor ymm5,ymm5,ymm1 + vpand ymm1,ymm9,YMMWORD[((288-128))+rdx] + vpor ymm4,ymm4,ymm2 + vpand ymm2,ymm10,YMMWORD[((320-128))+rdx] + vpor ymm5,ymm5,ymm3 + vpand ymm3,ymm11,YMMWORD[((352-128))+rdx] + vpor ymm4,ymm4,ymm0 + vpand ymm0,ymm12,YMMWORD[((384-128))+rdx] + vpor ymm5,ymm5,ymm1 + vpand ymm1,ymm13,YMMWORD[((416-128))+rdx] + vpor ymm4,ymm4,ymm2 + vpand ymm2,ymm14,YMMWORD[((448-128))+rdx] + vpor ymm5,ymm5,ymm3 + vpand ymm3,ymm15,YMMWORD[((480-128))+rdx] + lea rdx,[512+rdx] + vpor ymm4,ymm4,ymm0 + vpor ymm5,ymm5,ymm1 + vpor ymm4,ymm4,ymm2 + vpor ymm5,ymm5,ymm3 + + vpor ymm4,ymm4,ymm5 + vextracti128 xmm5,ymm4,1 + vpor xmm5,xmm5,xmm4 + vpermd ymm5,ymm7,ymm5 + vmovdqu YMMWORD[rcx],ymm5 + lea rcx,[32+rcx] + dec r8d + jnz NEAR $L$oop_gather_1024 + + vpxor ymm0,ymm0,ymm0 + vmovdqu YMMWORD[rcx],ymm0 + vzeroupper + movaps xmm6,XMMWORD[((-168))+r11] + movaps xmm7,XMMWORD[((-152))+r11] + movaps xmm8,XMMWORD[((-136))+r11] + movaps xmm9,XMMWORD[((-120))+r11] + movaps xmm10,XMMWORD[((-104))+r11] + movaps xmm11,XMMWORD[((-88))+r11] + movaps xmm12,XMMWORD[((-72))+r11] + movaps xmm13,XMMWORD[((-56))+r11] + movaps xmm14,XMMWORD[((-40))+r11] + movaps xmm15,XMMWORD[((-24))+r11] + lea rsp,[r11] + + ret + +$L$SEH_end_rsaz_1024_gather5: + +section .rdata rdata align=8 +ALIGN 64 +$L$and_mask: + DQ 0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff +$L$scatter_permd: + DD 0,2,4,6,7,7,7,7 +$L$gather_permd: + DD 0,7,1,7,2,7,3,7 +$L$inc: + DD 0,0,0,0,1,1,1,1 + DD 2,2,2,2,3,3,3,3 + DD 4,4,4,4,4,4,4,4 +ALIGN 64 +section .text + +EXTERN __imp_RtlVirtualUnwind + +ALIGN 16 +rsaz_se_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] 
+ mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + mov rbp,QWORD[160+r8] + + mov r10d,DWORD[8+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + cmovc rax,rbp + + mov r15,QWORD[((-48))+rax] + mov r14,QWORD[((-40))+rax] + mov r13,QWORD[((-32))+rax] + mov r12,QWORD[((-24))+rax] + mov rbp,QWORD[((-16))+rax] + mov rbx,QWORD[((-8))+rax] + mov QWORD[240+r8],r15 + mov QWORD[232+r8],r14 + mov QWORD[224+r8],r13 + mov QWORD[216+r8],r12 + mov QWORD[160+r8],rbp + mov QWORD[144+r8],rbx + + lea rsi,[((-216))+rax] + lea rdi,[512+r8] + mov ecx,20 + DD 0xa548f3fc + +$L$common_seh_tail: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + ret + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_rsaz_1024_sqr_avx2 wrt ..imagebase + DD $L$SEH_end_rsaz_1024_sqr_avx2 wrt ..imagebase + DD $L$SEH_info_rsaz_1024_sqr_avx2 wrt ..imagebase + + DD $L$SEH_begin_rsaz_1024_mul_avx2 wrt ..imagebase + DD $L$SEH_end_rsaz_1024_mul_avx2 wrt ..imagebase + DD $L$SEH_info_rsaz_1024_mul_avx2 wrt ..imagebase + + DD $L$SEH_begin_rsaz_1024_gather5 wrt ..imagebase + DD $L$SEH_end_rsaz_1024_gather5 wrt ..imagebase + DD $L$SEH_info_rsaz_1024_gather5 wrt ..imagebase +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_rsaz_1024_sqr_avx2: + DB 9,0,0,0 + DD rsaz_se_handler wrt ..imagebase + DD $L$sqr_1024_body wrt 
..imagebase,$L$sqr_1024_epilogue wrt ..imagebase,$L$sqr_1024_in_tail wrt ..imagebase + DD 0 +$L$SEH_info_rsaz_1024_mul_avx2: + DB 9,0,0,0 + DD rsaz_se_handler wrt ..imagebase + DD $L$mul_1024_body wrt ..imagebase,$L$mul_1024_epilogue wrt ..imagebase,$L$mul_1024_in_tail wrt ..imagebase + DD 0 +$L$SEH_info_rsaz_1024_gather5: + DB 0x01,0x36,0x17,0x0b + DB 0x36,0xf8,0x09,0x00 + DB 0x31,0xe8,0x08,0x00 + DB 0x2c,0xd8,0x07,0x00 + DB 0x27,0xc8,0x06,0x00 + DB 0x22,0xb8,0x05,0x00 + DB 0x1d,0xa8,0x04,0x00 + DB 0x18,0x98,0x03,0x00 + DB 0x13,0x88,0x02,0x00 + DB 0x0e,0x78,0x01,0x00 + DB 0x09,0x68,0x00,0x00 + DB 0x04,0x01,0x15,0x00 + DB 0x00,0xb3,0x00,0x00 +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/third_party/boringssl/gen/bcm/sha1-586-apple.S b/third_party/boringssl/gen/bcm/sha1-586-apple.S new file mode 100644 index 00000000..f2c45ec8 --- /dev/null +++ b/third_party/boringssl/gen/bcm/sha1-586-apple.S @@ -0,0 +1,3782 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +.text +.globl _sha1_block_data_order_nohw +.private_extern _sha1_block_data_order_nohw +.align 4 +_sha1_block_data_order_nohw: +L_sha1_block_data_order_nohw_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%ebp + movl 24(%esp),%esi + movl 28(%esp),%eax + subl $76,%esp + shll $6,%eax + addl %esi,%eax + movl %eax,104(%esp) + movl 16(%ebp),%edi + jmp L000loop +.align 4,0x90 +L000loop: + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + bswap %eax + bswap %ebx + bswap %ecx + bswap %edx + movl %eax,(%esp) + movl %ebx,4(%esp) + movl %ecx,8(%esp) + movl %edx,12(%esp) + movl 16(%esi),%eax + movl 20(%esi),%ebx + movl 24(%esi),%ecx + movl 28(%esi),%edx + bswap %eax + bswap %ebx + bswap %ecx + bswap %edx + movl %eax,16(%esp) + movl %ebx,20(%esp) + movl %ecx,24(%esp) + movl %edx,28(%esp) + movl 32(%esi),%eax + movl 36(%esi),%ebx + movl 40(%esi),%ecx + movl 44(%esi),%edx + bswap %eax + bswap %ebx + bswap %ecx + bswap %edx + movl %eax,32(%esp) + movl %ebx,36(%esp) + movl %ecx,40(%esp) + movl %edx,44(%esp) + movl 48(%esi),%eax + movl 52(%esi),%ebx + movl 56(%esi),%ecx + movl 60(%esi),%edx + bswap %eax + bswap %ebx + bswap %ecx + bswap %edx + movl %eax,48(%esp) + movl %ebx,52(%esp) + movl %ecx,56(%esp) + movl %edx,60(%esp) + movl %esi,100(%esp) + movl (%ebp),%eax + movl 4(%ebp),%ebx + movl 8(%ebp),%ecx + movl 12(%ebp),%edx + # 00_15 0 + movl %ecx,%esi + movl %eax,%ebp + roll $5,%ebp + xorl %edx,%esi + addl %edi,%ebp + movl (%esp),%edi + andl %ebx,%esi + rorl $2,%ebx + xorl %edx,%esi + leal 1518500249(%ebp,%edi,1),%ebp + addl %esi,%ebp + # 00_15 1 + movl %ebx,%edi + movl %ebp,%esi + roll $5,%ebp + xorl %ecx,%edi + addl %edx,%ebp + movl 4(%esp),%edx + andl %eax,%edi + rorl $2,%eax + xorl %ecx,%edi + leal 1518500249(%ebp,%edx,1),%ebp + addl %edi,%ebp + # 00_15 2 + movl %eax,%edx + movl %ebp,%edi + roll $5,%ebp + xorl %ebx,%edx + addl 
%ecx,%ebp + movl 8(%esp),%ecx + andl %esi,%edx + rorl $2,%esi + xorl %ebx,%edx + leal 1518500249(%ebp,%ecx,1),%ebp + addl %edx,%ebp + # 00_15 3 + movl %esi,%ecx + movl %ebp,%edx + roll $5,%ebp + xorl %eax,%ecx + addl %ebx,%ebp + movl 12(%esp),%ebx + andl %edi,%ecx + rorl $2,%edi + xorl %eax,%ecx + leal 1518500249(%ebp,%ebx,1),%ebp + addl %ecx,%ebp + # 00_15 4 + movl %edi,%ebx + movl %ebp,%ecx + roll $5,%ebp + xorl %esi,%ebx + addl %eax,%ebp + movl 16(%esp),%eax + andl %edx,%ebx + rorl $2,%edx + xorl %esi,%ebx + leal 1518500249(%ebp,%eax,1),%ebp + addl %ebx,%ebp + # 00_15 5 + movl %edx,%eax + movl %ebp,%ebx + roll $5,%ebp + xorl %edi,%eax + addl %esi,%ebp + movl 20(%esp),%esi + andl %ecx,%eax + rorl $2,%ecx + xorl %edi,%eax + leal 1518500249(%ebp,%esi,1),%ebp + addl %eax,%ebp + # 00_15 6 + movl %ecx,%esi + movl %ebp,%eax + roll $5,%ebp + xorl %edx,%esi + addl %edi,%ebp + movl 24(%esp),%edi + andl %ebx,%esi + rorl $2,%ebx + xorl %edx,%esi + leal 1518500249(%ebp,%edi,1),%ebp + addl %esi,%ebp + # 00_15 7 + movl %ebx,%edi + movl %ebp,%esi + roll $5,%ebp + xorl %ecx,%edi + addl %edx,%ebp + movl 28(%esp),%edx + andl %eax,%edi + rorl $2,%eax + xorl %ecx,%edi + leal 1518500249(%ebp,%edx,1),%ebp + addl %edi,%ebp + # 00_15 8 + movl %eax,%edx + movl %ebp,%edi + roll $5,%ebp + xorl %ebx,%edx + addl %ecx,%ebp + movl 32(%esp),%ecx + andl %esi,%edx + rorl $2,%esi + xorl %ebx,%edx + leal 1518500249(%ebp,%ecx,1),%ebp + addl %edx,%ebp + # 00_15 9 + movl %esi,%ecx + movl %ebp,%edx + roll $5,%ebp + xorl %eax,%ecx + addl %ebx,%ebp + movl 36(%esp),%ebx + andl %edi,%ecx + rorl $2,%edi + xorl %eax,%ecx + leal 1518500249(%ebp,%ebx,1),%ebp + addl %ecx,%ebp + # 00_15 10 + movl %edi,%ebx + movl %ebp,%ecx + roll $5,%ebp + xorl %esi,%ebx + addl %eax,%ebp + movl 40(%esp),%eax + andl %edx,%ebx + rorl $2,%edx + xorl %esi,%ebx + leal 1518500249(%ebp,%eax,1),%ebp + addl %ebx,%ebp + # 00_15 11 + movl %edx,%eax + movl %ebp,%ebx + roll $5,%ebp + xorl %edi,%eax + addl %esi,%ebp + movl 44(%esp),%esi + 
andl %ecx,%eax + rorl $2,%ecx + xorl %edi,%eax + leal 1518500249(%ebp,%esi,1),%ebp + addl %eax,%ebp + # 00_15 12 + movl %ecx,%esi + movl %ebp,%eax + roll $5,%ebp + xorl %edx,%esi + addl %edi,%ebp + movl 48(%esp),%edi + andl %ebx,%esi + rorl $2,%ebx + xorl %edx,%esi + leal 1518500249(%ebp,%edi,1),%ebp + addl %esi,%ebp + # 00_15 13 + movl %ebx,%edi + movl %ebp,%esi + roll $5,%ebp + xorl %ecx,%edi + addl %edx,%ebp + movl 52(%esp),%edx + andl %eax,%edi + rorl $2,%eax + xorl %ecx,%edi + leal 1518500249(%ebp,%edx,1),%ebp + addl %edi,%ebp + # 00_15 14 + movl %eax,%edx + movl %ebp,%edi + roll $5,%ebp + xorl %ebx,%edx + addl %ecx,%ebp + movl 56(%esp),%ecx + andl %esi,%edx + rorl $2,%esi + xorl %ebx,%edx + leal 1518500249(%ebp,%ecx,1),%ebp + addl %edx,%ebp + # 00_15 15 + movl %esi,%ecx + movl %ebp,%edx + roll $5,%ebp + xorl %eax,%ecx + addl %ebx,%ebp + movl 60(%esp),%ebx + andl %edi,%ecx + rorl $2,%edi + xorl %eax,%ecx + leal 1518500249(%ebp,%ebx,1),%ebp + movl (%esp),%ebx + addl %ebp,%ecx + # 16_19 16 + movl %edi,%ebp + xorl 8(%esp),%ebx + xorl %esi,%ebp + xorl 32(%esp),%ebx + andl %edx,%ebp + xorl 52(%esp),%ebx + roll $1,%ebx + xorl %esi,%ebp + addl %ebp,%eax + movl %ecx,%ebp + rorl $2,%edx + movl %ebx,(%esp) + roll $5,%ebp + leal 1518500249(%ebx,%eax,1),%ebx + movl 4(%esp),%eax + addl %ebp,%ebx + # 16_19 17 + movl %edx,%ebp + xorl 12(%esp),%eax + xorl %edi,%ebp + xorl 36(%esp),%eax + andl %ecx,%ebp + xorl 56(%esp),%eax + roll $1,%eax + xorl %edi,%ebp + addl %ebp,%esi + movl %ebx,%ebp + rorl $2,%ecx + movl %eax,4(%esp) + roll $5,%ebp + leal 1518500249(%eax,%esi,1),%eax + movl 8(%esp),%esi + addl %ebp,%eax + # 16_19 18 + movl %ecx,%ebp + xorl 16(%esp),%esi + xorl %edx,%ebp + xorl 40(%esp),%esi + andl %ebx,%ebp + xorl 60(%esp),%esi + roll $1,%esi + xorl %edx,%ebp + addl %ebp,%edi + movl %eax,%ebp + rorl $2,%ebx + movl %esi,8(%esp) + roll $5,%ebp + leal 1518500249(%esi,%edi,1),%esi + movl 12(%esp),%edi + addl %ebp,%esi + # 16_19 19 + movl %ebx,%ebp + xorl 20(%esp),%edi + xorl 
%ecx,%ebp + xorl 44(%esp),%edi + andl %eax,%ebp + xorl (%esp),%edi + roll $1,%edi + xorl %ecx,%ebp + addl %ebp,%edx + movl %esi,%ebp + rorl $2,%eax + movl %edi,12(%esp) + roll $5,%ebp + leal 1518500249(%edi,%edx,1),%edi + movl 16(%esp),%edx + addl %ebp,%edi + # 20_39 20 + movl %esi,%ebp + xorl 24(%esp),%edx + xorl %eax,%ebp + xorl 48(%esp),%edx + xorl %ebx,%ebp + xorl 4(%esp),%edx + roll $1,%edx + addl %ebp,%ecx + rorl $2,%esi + movl %edi,%ebp + roll $5,%ebp + movl %edx,16(%esp) + leal 1859775393(%edx,%ecx,1),%edx + movl 20(%esp),%ecx + addl %ebp,%edx + # 20_39 21 + movl %edi,%ebp + xorl 28(%esp),%ecx + xorl %esi,%ebp + xorl 52(%esp),%ecx + xorl %eax,%ebp + xorl 8(%esp),%ecx + roll $1,%ecx + addl %ebp,%ebx + rorl $2,%edi + movl %edx,%ebp + roll $5,%ebp + movl %ecx,20(%esp) + leal 1859775393(%ecx,%ebx,1),%ecx + movl 24(%esp),%ebx + addl %ebp,%ecx + # 20_39 22 + movl %edx,%ebp + xorl 32(%esp),%ebx + xorl %edi,%ebp + xorl 56(%esp),%ebx + xorl %esi,%ebp + xorl 12(%esp),%ebx + roll $1,%ebx + addl %ebp,%eax + rorl $2,%edx + movl %ecx,%ebp + roll $5,%ebp + movl %ebx,24(%esp) + leal 1859775393(%ebx,%eax,1),%ebx + movl 28(%esp),%eax + addl %ebp,%ebx + # 20_39 23 + movl %ecx,%ebp + xorl 36(%esp),%eax + xorl %edx,%ebp + xorl 60(%esp),%eax + xorl %edi,%ebp + xorl 16(%esp),%eax + roll $1,%eax + addl %ebp,%esi + rorl $2,%ecx + movl %ebx,%ebp + roll $5,%ebp + movl %eax,28(%esp) + leal 1859775393(%eax,%esi,1),%eax + movl 32(%esp),%esi + addl %ebp,%eax + # 20_39 24 + movl %ebx,%ebp + xorl 40(%esp),%esi + xorl %ecx,%ebp + xorl (%esp),%esi + xorl %edx,%ebp + xorl 20(%esp),%esi + roll $1,%esi + addl %ebp,%edi + rorl $2,%ebx + movl %eax,%ebp + roll $5,%ebp + movl %esi,32(%esp) + leal 1859775393(%esi,%edi,1),%esi + movl 36(%esp),%edi + addl %ebp,%esi + # 20_39 25 + movl %eax,%ebp + xorl 44(%esp),%edi + xorl %ebx,%ebp + xorl 4(%esp),%edi + xorl %ecx,%ebp + xorl 24(%esp),%edi + roll $1,%edi + addl %ebp,%edx + rorl $2,%eax + movl %esi,%ebp + roll $5,%ebp + movl %edi,36(%esp) + leal 
1859775393(%edi,%edx,1),%edi + movl 40(%esp),%edx + addl %ebp,%edi + # 20_39 26 + movl %esi,%ebp + xorl 48(%esp),%edx + xorl %eax,%ebp + xorl 8(%esp),%edx + xorl %ebx,%ebp + xorl 28(%esp),%edx + roll $1,%edx + addl %ebp,%ecx + rorl $2,%esi + movl %edi,%ebp + roll $5,%ebp + movl %edx,40(%esp) + leal 1859775393(%edx,%ecx,1),%edx + movl 44(%esp),%ecx + addl %ebp,%edx + # 20_39 27 + movl %edi,%ebp + xorl 52(%esp),%ecx + xorl %esi,%ebp + xorl 12(%esp),%ecx + xorl %eax,%ebp + xorl 32(%esp),%ecx + roll $1,%ecx + addl %ebp,%ebx + rorl $2,%edi + movl %edx,%ebp + roll $5,%ebp + movl %ecx,44(%esp) + leal 1859775393(%ecx,%ebx,1),%ecx + movl 48(%esp),%ebx + addl %ebp,%ecx + # 20_39 28 + movl %edx,%ebp + xorl 56(%esp),%ebx + xorl %edi,%ebp + xorl 16(%esp),%ebx + xorl %esi,%ebp + xorl 36(%esp),%ebx + roll $1,%ebx + addl %ebp,%eax + rorl $2,%edx + movl %ecx,%ebp + roll $5,%ebp + movl %ebx,48(%esp) + leal 1859775393(%ebx,%eax,1),%ebx + movl 52(%esp),%eax + addl %ebp,%ebx + # 20_39 29 + movl %ecx,%ebp + xorl 60(%esp),%eax + xorl %edx,%ebp + xorl 20(%esp),%eax + xorl %edi,%ebp + xorl 40(%esp),%eax + roll $1,%eax + addl %ebp,%esi + rorl $2,%ecx + movl %ebx,%ebp + roll $5,%ebp + movl %eax,52(%esp) + leal 1859775393(%eax,%esi,1),%eax + movl 56(%esp),%esi + addl %ebp,%eax + # 20_39 30 + movl %ebx,%ebp + xorl (%esp),%esi + xorl %ecx,%ebp + xorl 24(%esp),%esi + xorl %edx,%ebp + xorl 44(%esp),%esi + roll $1,%esi + addl %ebp,%edi + rorl $2,%ebx + movl %eax,%ebp + roll $5,%ebp + movl %esi,56(%esp) + leal 1859775393(%esi,%edi,1),%esi + movl 60(%esp),%edi + addl %ebp,%esi + # 20_39 31 + movl %eax,%ebp + xorl 4(%esp),%edi + xorl %ebx,%ebp + xorl 28(%esp),%edi + xorl %ecx,%ebp + xorl 48(%esp),%edi + roll $1,%edi + addl %ebp,%edx + rorl $2,%eax + movl %esi,%ebp + roll $5,%ebp + movl %edi,60(%esp) + leal 1859775393(%edi,%edx,1),%edi + movl (%esp),%edx + addl %ebp,%edi + # 20_39 32 + movl %esi,%ebp + xorl 8(%esp),%edx + xorl %eax,%ebp + xorl 32(%esp),%edx + xorl %ebx,%ebp + xorl 52(%esp),%edx + roll 
$1,%edx + addl %ebp,%ecx + rorl $2,%esi + movl %edi,%ebp + roll $5,%ebp + movl %edx,(%esp) + leal 1859775393(%edx,%ecx,1),%edx + movl 4(%esp),%ecx + addl %ebp,%edx + # 20_39 33 + movl %edi,%ebp + xorl 12(%esp),%ecx + xorl %esi,%ebp + xorl 36(%esp),%ecx + xorl %eax,%ebp + xorl 56(%esp),%ecx + roll $1,%ecx + addl %ebp,%ebx + rorl $2,%edi + movl %edx,%ebp + roll $5,%ebp + movl %ecx,4(%esp) + leal 1859775393(%ecx,%ebx,1),%ecx + movl 8(%esp),%ebx + addl %ebp,%ecx + # 20_39 34 + movl %edx,%ebp + xorl 16(%esp),%ebx + xorl %edi,%ebp + xorl 40(%esp),%ebx + xorl %esi,%ebp + xorl 60(%esp),%ebx + roll $1,%ebx + addl %ebp,%eax + rorl $2,%edx + movl %ecx,%ebp + roll $5,%ebp + movl %ebx,8(%esp) + leal 1859775393(%ebx,%eax,1),%ebx + movl 12(%esp),%eax + addl %ebp,%ebx + # 20_39 35 + movl %ecx,%ebp + xorl 20(%esp),%eax + xorl %edx,%ebp + xorl 44(%esp),%eax + xorl %edi,%ebp + xorl (%esp),%eax + roll $1,%eax + addl %ebp,%esi + rorl $2,%ecx + movl %ebx,%ebp + roll $5,%ebp + movl %eax,12(%esp) + leal 1859775393(%eax,%esi,1),%eax + movl 16(%esp),%esi + addl %ebp,%eax + # 20_39 36 + movl %ebx,%ebp + xorl 24(%esp),%esi + xorl %ecx,%ebp + xorl 48(%esp),%esi + xorl %edx,%ebp + xorl 4(%esp),%esi + roll $1,%esi + addl %ebp,%edi + rorl $2,%ebx + movl %eax,%ebp + roll $5,%ebp + movl %esi,16(%esp) + leal 1859775393(%esi,%edi,1),%esi + movl 20(%esp),%edi + addl %ebp,%esi + # 20_39 37 + movl %eax,%ebp + xorl 28(%esp),%edi + xorl %ebx,%ebp + xorl 52(%esp),%edi + xorl %ecx,%ebp + xorl 8(%esp),%edi + roll $1,%edi + addl %ebp,%edx + rorl $2,%eax + movl %esi,%ebp + roll $5,%ebp + movl %edi,20(%esp) + leal 1859775393(%edi,%edx,1),%edi + movl 24(%esp),%edx + addl %ebp,%edi + # 20_39 38 + movl %esi,%ebp + xorl 32(%esp),%edx + xorl %eax,%ebp + xorl 56(%esp),%edx + xorl %ebx,%ebp + xorl 12(%esp),%edx + roll $1,%edx + addl %ebp,%ecx + rorl $2,%esi + movl %edi,%ebp + roll $5,%ebp + movl %edx,24(%esp) + leal 1859775393(%edx,%ecx,1),%edx + movl 28(%esp),%ecx + addl %ebp,%edx + # 20_39 39 + movl %edi,%ebp + xorl 
36(%esp),%ecx + xorl %esi,%ebp + xorl 60(%esp),%ecx + xorl %eax,%ebp + xorl 16(%esp),%ecx + roll $1,%ecx + addl %ebp,%ebx + rorl $2,%edi + movl %edx,%ebp + roll $5,%ebp + movl %ecx,28(%esp) + leal 1859775393(%ecx,%ebx,1),%ecx + movl 32(%esp),%ebx + addl %ebp,%ecx + # 40_59 40 + movl %edi,%ebp + xorl 40(%esp),%ebx + xorl %esi,%ebp + xorl (%esp),%ebx + andl %edx,%ebp + xorl 20(%esp),%ebx + roll $1,%ebx + addl %eax,%ebp + rorl $2,%edx + movl %ecx,%eax + roll $5,%eax + movl %ebx,32(%esp) + leal 2400959708(%ebx,%ebp,1),%ebx + movl %edi,%ebp + addl %eax,%ebx + andl %esi,%ebp + movl 36(%esp),%eax + addl %ebp,%ebx + # 40_59 41 + movl %edx,%ebp + xorl 44(%esp),%eax + xorl %edi,%ebp + xorl 4(%esp),%eax + andl %ecx,%ebp + xorl 24(%esp),%eax + roll $1,%eax + addl %esi,%ebp + rorl $2,%ecx + movl %ebx,%esi + roll $5,%esi + movl %eax,36(%esp) + leal 2400959708(%eax,%ebp,1),%eax + movl %edx,%ebp + addl %esi,%eax + andl %edi,%ebp + movl 40(%esp),%esi + addl %ebp,%eax + # 40_59 42 + movl %ecx,%ebp + xorl 48(%esp),%esi + xorl %edx,%ebp + xorl 8(%esp),%esi + andl %ebx,%ebp + xorl 28(%esp),%esi + roll $1,%esi + addl %edi,%ebp + rorl $2,%ebx + movl %eax,%edi + roll $5,%edi + movl %esi,40(%esp) + leal 2400959708(%esi,%ebp,1),%esi + movl %ecx,%ebp + addl %edi,%esi + andl %edx,%ebp + movl 44(%esp),%edi + addl %ebp,%esi + # 40_59 43 + movl %ebx,%ebp + xorl 52(%esp),%edi + xorl %ecx,%ebp + xorl 12(%esp),%edi + andl %eax,%ebp + xorl 32(%esp),%edi + roll $1,%edi + addl %edx,%ebp + rorl $2,%eax + movl %esi,%edx + roll $5,%edx + movl %edi,44(%esp) + leal 2400959708(%edi,%ebp,1),%edi + movl %ebx,%ebp + addl %edx,%edi + andl %ecx,%ebp + movl 48(%esp),%edx + addl %ebp,%edi + # 40_59 44 + movl %eax,%ebp + xorl 56(%esp),%edx + xorl %ebx,%ebp + xorl 16(%esp),%edx + andl %esi,%ebp + xorl 36(%esp),%edx + roll $1,%edx + addl %ecx,%ebp + rorl $2,%esi + movl %edi,%ecx + roll $5,%ecx + movl %edx,48(%esp) + leal 2400959708(%edx,%ebp,1),%edx + movl %eax,%ebp + addl %ecx,%edx + andl %ebx,%ebp + movl 
52(%esp),%ecx + addl %ebp,%edx + # 40_59 45 + movl %esi,%ebp + xorl 60(%esp),%ecx + xorl %eax,%ebp + xorl 20(%esp),%ecx + andl %edi,%ebp + xorl 40(%esp),%ecx + roll $1,%ecx + addl %ebx,%ebp + rorl $2,%edi + movl %edx,%ebx + roll $5,%ebx + movl %ecx,52(%esp) + leal 2400959708(%ecx,%ebp,1),%ecx + movl %esi,%ebp + addl %ebx,%ecx + andl %eax,%ebp + movl 56(%esp),%ebx + addl %ebp,%ecx + # 40_59 46 + movl %edi,%ebp + xorl (%esp),%ebx + xorl %esi,%ebp + xorl 24(%esp),%ebx + andl %edx,%ebp + xorl 44(%esp),%ebx + roll $1,%ebx + addl %eax,%ebp + rorl $2,%edx + movl %ecx,%eax + roll $5,%eax + movl %ebx,56(%esp) + leal 2400959708(%ebx,%ebp,1),%ebx + movl %edi,%ebp + addl %eax,%ebx + andl %esi,%ebp + movl 60(%esp),%eax + addl %ebp,%ebx + # 40_59 47 + movl %edx,%ebp + xorl 4(%esp),%eax + xorl %edi,%ebp + xorl 28(%esp),%eax + andl %ecx,%ebp + xorl 48(%esp),%eax + roll $1,%eax + addl %esi,%ebp + rorl $2,%ecx + movl %ebx,%esi + roll $5,%esi + movl %eax,60(%esp) + leal 2400959708(%eax,%ebp,1),%eax + movl %edx,%ebp + addl %esi,%eax + andl %edi,%ebp + movl (%esp),%esi + addl %ebp,%eax + # 40_59 48 + movl %ecx,%ebp + xorl 8(%esp),%esi + xorl %edx,%ebp + xorl 32(%esp),%esi + andl %ebx,%ebp + xorl 52(%esp),%esi + roll $1,%esi + addl %edi,%ebp + rorl $2,%ebx + movl %eax,%edi + roll $5,%edi + movl %esi,(%esp) + leal 2400959708(%esi,%ebp,1),%esi + movl %ecx,%ebp + addl %edi,%esi + andl %edx,%ebp + movl 4(%esp),%edi + addl %ebp,%esi + # 40_59 49 + movl %ebx,%ebp + xorl 12(%esp),%edi + xorl %ecx,%ebp + xorl 36(%esp),%edi + andl %eax,%ebp + xorl 56(%esp),%edi + roll $1,%edi + addl %edx,%ebp + rorl $2,%eax + movl %esi,%edx + roll $5,%edx + movl %edi,4(%esp) + leal 2400959708(%edi,%ebp,1),%edi + movl %ebx,%ebp + addl %edx,%edi + andl %ecx,%ebp + movl 8(%esp),%edx + addl %ebp,%edi + # 40_59 50 + movl %eax,%ebp + xorl 16(%esp),%edx + xorl %ebx,%ebp + xorl 40(%esp),%edx + andl %esi,%ebp + xorl 60(%esp),%edx + roll $1,%edx + addl %ecx,%ebp + rorl $2,%esi + movl %edi,%ecx + roll $5,%ecx + movl 
%edx,8(%esp) + leal 2400959708(%edx,%ebp,1),%edx + movl %eax,%ebp + addl %ecx,%edx + andl %ebx,%ebp + movl 12(%esp),%ecx + addl %ebp,%edx + # 40_59 51 + movl %esi,%ebp + xorl 20(%esp),%ecx + xorl %eax,%ebp + xorl 44(%esp),%ecx + andl %edi,%ebp + xorl (%esp),%ecx + roll $1,%ecx + addl %ebx,%ebp + rorl $2,%edi + movl %edx,%ebx + roll $5,%ebx + movl %ecx,12(%esp) + leal 2400959708(%ecx,%ebp,1),%ecx + movl %esi,%ebp + addl %ebx,%ecx + andl %eax,%ebp + movl 16(%esp),%ebx + addl %ebp,%ecx + # 40_59 52 + movl %edi,%ebp + xorl 24(%esp),%ebx + xorl %esi,%ebp + xorl 48(%esp),%ebx + andl %edx,%ebp + xorl 4(%esp),%ebx + roll $1,%ebx + addl %eax,%ebp + rorl $2,%edx + movl %ecx,%eax + roll $5,%eax + movl %ebx,16(%esp) + leal 2400959708(%ebx,%ebp,1),%ebx + movl %edi,%ebp + addl %eax,%ebx + andl %esi,%ebp + movl 20(%esp),%eax + addl %ebp,%ebx + # 40_59 53 + movl %edx,%ebp + xorl 28(%esp),%eax + xorl %edi,%ebp + xorl 52(%esp),%eax + andl %ecx,%ebp + xorl 8(%esp),%eax + roll $1,%eax + addl %esi,%ebp + rorl $2,%ecx + movl %ebx,%esi + roll $5,%esi + movl %eax,20(%esp) + leal 2400959708(%eax,%ebp,1),%eax + movl %edx,%ebp + addl %esi,%eax + andl %edi,%ebp + movl 24(%esp),%esi + addl %ebp,%eax + # 40_59 54 + movl %ecx,%ebp + xorl 32(%esp),%esi + xorl %edx,%ebp + xorl 56(%esp),%esi + andl %ebx,%ebp + xorl 12(%esp),%esi + roll $1,%esi + addl %edi,%ebp + rorl $2,%ebx + movl %eax,%edi + roll $5,%edi + movl %esi,24(%esp) + leal 2400959708(%esi,%ebp,1),%esi + movl %ecx,%ebp + addl %edi,%esi + andl %edx,%ebp + movl 28(%esp),%edi + addl %ebp,%esi + # 40_59 55 + movl %ebx,%ebp + xorl 36(%esp),%edi + xorl %ecx,%ebp + xorl 60(%esp),%edi + andl %eax,%ebp + xorl 16(%esp),%edi + roll $1,%edi + addl %edx,%ebp + rorl $2,%eax + movl %esi,%edx + roll $5,%edx + movl %edi,28(%esp) + leal 2400959708(%edi,%ebp,1),%edi + movl %ebx,%ebp + addl %edx,%edi + andl %ecx,%ebp + movl 32(%esp),%edx + addl %ebp,%edi + # 40_59 56 + movl %eax,%ebp + xorl 40(%esp),%edx + xorl %ebx,%ebp + xorl (%esp),%edx + andl %esi,%ebp + 
xorl 20(%esp),%edx + roll $1,%edx + addl %ecx,%ebp + rorl $2,%esi + movl %edi,%ecx + roll $5,%ecx + movl %edx,32(%esp) + leal 2400959708(%edx,%ebp,1),%edx + movl %eax,%ebp + addl %ecx,%edx + andl %ebx,%ebp + movl 36(%esp),%ecx + addl %ebp,%edx + # 40_59 57 + movl %esi,%ebp + xorl 44(%esp),%ecx + xorl %eax,%ebp + xorl 4(%esp),%ecx + andl %edi,%ebp + xorl 24(%esp),%ecx + roll $1,%ecx + addl %ebx,%ebp + rorl $2,%edi + movl %edx,%ebx + roll $5,%ebx + movl %ecx,36(%esp) + leal 2400959708(%ecx,%ebp,1),%ecx + movl %esi,%ebp + addl %ebx,%ecx + andl %eax,%ebp + movl 40(%esp),%ebx + addl %ebp,%ecx + # 40_59 58 + movl %edi,%ebp + xorl 48(%esp),%ebx + xorl %esi,%ebp + xorl 8(%esp),%ebx + andl %edx,%ebp + xorl 28(%esp),%ebx + roll $1,%ebx + addl %eax,%ebp + rorl $2,%edx + movl %ecx,%eax + roll $5,%eax + movl %ebx,40(%esp) + leal 2400959708(%ebx,%ebp,1),%ebx + movl %edi,%ebp + addl %eax,%ebx + andl %esi,%ebp + movl 44(%esp),%eax + addl %ebp,%ebx + # 40_59 59 + movl %edx,%ebp + xorl 52(%esp),%eax + xorl %edi,%ebp + xorl 12(%esp),%eax + andl %ecx,%ebp + xorl 32(%esp),%eax + roll $1,%eax + addl %esi,%ebp + rorl $2,%ecx + movl %ebx,%esi + roll $5,%esi + movl %eax,44(%esp) + leal 2400959708(%eax,%ebp,1),%eax + movl %edx,%ebp + addl %esi,%eax + andl %edi,%ebp + movl 48(%esp),%esi + addl %ebp,%eax + # 20_39 60 + movl %ebx,%ebp + xorl 56(%esp),%esi + xorl %ecx,%ebp + xorl 16(%esp),%esi + xorl %edx,%ebp + xorl 36(%esp),%esi + roll $1,%esi + addl %ebp,%edi + rorl $2,%ebx + movl %eax,%ebp + roll $5,%ebp + movl %esi,48(%esp) + leal 3395469782(%esi,%edi,1),%esi + movl 52(%esp),%edi + addl %ebp,%esi + # 20_39 61 + movl %eax,%ebp + xorl 60(%esp),%edi + xorl %ebx,%ebp + xorl 20(%esp),%edi + xorl %ecx,%ebp + xorl 40(%esp),%edi + roll $1,%edi + addl %ebp,%edx + rorl $2,%eax + movl %esi,%ebp + roll $5,%ebp + movl %edi,52(%esp) + leal 3395469782(%edi,%edx,1),%edi + movl 56(%esp),%edx + addl %ebp,%edi + # 20_39 62 + movl %esi,%ebp + xorl (%esp),%edx + xorl %eax,%ebp + xorl 24(%esp),%edx + xorl 
%ebx,%ebp + xorl 44(%esp),%edx + roll $1,%edx + addl %ebp,%ecx + rorl $2,%esi + movl %edi,%ebp + roll $5,%ebp + movl %edx,56(%esp) + leal 3395469782(%edx,%ecx,1),%edx + movl 60(%esp),%ecx + addl %ebp,%edx + # 20_39 63 + movl %edi,%ebp + xorl 4(%esp),%ecx + xorl %esi,%ebp + xorl 28(%esp),%ecx + xorl %eax,%ebp + xorl 48(%esp),%ecx + roll $1,%ecx + addl %ebp,%ebx + rorl $2,%edi + movl %edx,%ebp + roll $5,%ebp + movl %ecx,60(%esp) + leal 3395469782(%ecx,%ebx,1),%ecx + movl (%esp),%ebx + addl %ebp,%ecx + # 20_39 64 + movl %edx,%ebp + xorl 8(%esp),%ebx + xorl %edi,%ebp + xorl 32(%esp),%ebx + xorl %esi,%ebp + xorl 52(%esp),%ebx + roll $1,%ebx + addl %ebp,%eax + rorl $2,%edx + movl %ecx,%ebp + roll $5,%ebp + movl %ebx,(%esp) + leal 3395469782(%ebx,%eax,1),%ebx + movl 4(%esp),%eax + addl %ebp,%ebx + # 20_39 65 + movl %ecx,%ebp + xorl 12(%esp),%eax + xorl %edx,%ebp + xorl 36(%esp),%eax + xorl %edi,%ebp + xorl 56(%esp),%eax + roll $1,%eax + addl %ebp,%esi + rorl $2,%ecx + movl %ebx,%ebp + roll $5,%ebp + movl %eax,4(%esp) + leal 3395469782(%eax,%esi,1),%eax + movl 8(%esp),%esi + addl %ebp,%eax + # 20_39 66 + movl %ebx,%ebp + xorl 16(%esp),%esi + xorl %ecx,%ebp + xorl 40(%esp),%esi + xorl %edx,%ebp + xorl 60(%esp),%esi + roll $1,%esi + addl %ebp,%edi + rorl $2,%ebx + movl %eax,%ebp + roll $5,%ebp + movl %esi,8(%esp) + leal 3395469782(%esi,%edi,1),%esi + movl 12(%esp),%edi + addl %ebp,%esi + # 20_39 67 + movl %eax,%ebp + xorl 20(%esp),%edi + xorl %ebx,%ebp + xorl 44(%esp),%edi + xorl %ecx,%ebp + xorl (%esp),%edi + roll $1,%edi + addl %ebp,%edx + rorl $2,%eax + movl %esi,%ebp + roll $5,%ebp + movl %edi,12(%esp) + leal 3395469782(%edi,%edx,1),%edi + movl 16(%esp),%edx + addl %ebp,%edi + # 20_39 68 + movl %esi,%ebp + xorl 24(%esp),%edx + xorl %eax,%ebp + xorl 48(%esp),%edx + xorl %ebx,%ebp + xorl 4(%esp),%edx + roll $1,%edx + addl %ebp,%ecx + rorl $2,%esi + movl %edi,%ebp + roll $5,%ebp + movl %edx,16(%esp) + leal 3395469782(%edx,%ecx,1),%edx + movl 20(%esp),%ecx + addl %ebp,%edx + 
# 20_39 69 + movl %edi,%ebp + xorl 28(%esp),%ecx + xorl %esi,%ebp + xorl 52(%esp),%ecx + xorl %eax,%ebp + xorl 8(%esp),%ecx + roll $1,%ecx + addl %ebp,%ebx + rorl $2,%edi + movl %edx,%ebp + roll $5,%ebp + movl %ecx,20(%esp) + leal 3395469782(%ecx,%ebx,1),%ecx + movl 24(%esp),%ebx + addl %ebp,%ecx + # 20_39 70 + movl %edx,%ebp + xorl 32(%esp),%ebx + xorl %edi,%ebp + xorl 56(%esp),%ebx + xorl %esi,%ebp + xorl 12(%esp),%ebx + roll $1,%ebx + addl %ebp,%eax + rorl $2,%edx + movl %ecx,%ebp + roll $5,%ebp + movl %ebx,24(%esp) + leal 3395469782(%ebx,%eax,1),%ebx + movl 28(%esp),%eax + addl %ebp,%ebx + # 20_39 71 + movl %ecx,%ebp + xorl 36(%esp),%eax + xorl %edx,%ebp + xorl 60(%esp),%eax + xorl %edi,%ebp + xorl 16(%esp),%eax + roll $1,%eax + addl %ebp,%esi + rorl $2,%ecx + movl %ebx,%ebp + roll $5,%ebp + movl %eax,28(%esp) + leal 3395469782(%eax,%esi,1),%eax + movl 32(%esp),%esi + addl %ebp,%eax + # 20_39 72 + movl %ebx,%ebp + xorl 40(%esp),%esi + xorl %ecx,%ebp + xorl (%esp),%esi + xorl %edx,%ebp + xorl 20(%esp),%esi + roll $1,%esi + addl %ebp,%edi + rorl $2,%ebx + movl %eax,%ebp + roll $5,%ebp + movl %esi,32(%esp) + leal 3395469782(%esi,%edi,1),%esi + movl 36(%esp),%edi + addl %ebp,%esi + # 20_39 73 + movl %eax,%ebp + xorl 44(%esp),%edi + xorl %ebx,%ebp + xorl 4(%esp),%edi + xorl %ecx,%ebp + xorl 24(%esp),%edi + roll $1,%edi + addl %ebp,%edx + rorl $2,%eax + movl %esi,%ebp + roll $5,%ebp + movl %edi,36(%esp) + leal 3395469782(%edi,%edx,1),%edi + movl 40(%esp),%edx + addl %ebp,%edi + # 20_39 74 + movl %esi,%ebp + xorl 48(%esp),%edx + xorl %eax,%ebp + xorl 8(%esp),%edx + xorl %ebx,%ebp + xorl 28(%esp),%edx + roll $1,%edx + addl %ebp,%ecx + rorl $2,%esi + movl %edi,%ebp + roll $5,%ebp + movl %edx,40(%esp) + leal 3395469782(%edx,%ecx,1),%edx + movl 44(%esp),%ecx + addl %ebp,%edx + # 20_39 75 + movl %edi,%ebp + xorl 52(%esp),%ecx + xorl %esi,%ebp + xorl 12(%esp),%ecx + xorl %eax,%ebp + xorl 32(%esp),%ecx + roll $1,%ecx + addl %ebp,%ebx + rorl $2,%edi + movl %edx,%ebp + roll 
$5,%ebp + movl %ecx,44(%esp) + leal 3395469782(%ecx,%ebx,1),%ecx + movl 48(%esp),%ebx + addl %ebp,%ecx + # 20_39 76 + movl %edx,%ebp + xorl 56(%esp),%ebx + xorl %edi,%ebp + xorl 16(%esp),%ebx + xorl %esi,%ebp + xorl 36(%esp),%ebx + roll $1,%ebx + addl %ebp,%eax + rorl $2,%edx + movl %ecx,%ebp + roll $5,%ebp + movl %ebx,48(%esp) + leal 3395469782(%ebx,%eax,1),%ebx + movl 52(%esp),%eax + addl %ebp,%ebx + # 20_39 77 + movl %ecx,%ebp + xorl 60(%esp),%eax + xorl %edx,%ebp + xorl 20(%esp),%eax + xorl %edi,%ebp + xorl 40(%esp),%eax + roll $1,%eax + addl %ebp,%esi + rorl $2,%ecx + movl %ebx,%ebp + roll $5,%ebp + leal 3395469782(%eax,%esi,1),%eax + movl 56(%esp),%esi + addl %ebp,%eax + # 20_39 78 + movl %ebx,%ebp + xorl (%esp),%esi + xorl %ecx,%ebp + xorl 24(%esp),%esi + xorl %edx,%ebp + xorl 44(%esp),%esi + roll $1,%esi + addl %ebp,%edi + rorl $2,%ebx + movl %eax,%ebp + roll $5,%ebp + leal 3395469782(%esi,%edi,1),%esi + movl 60(%esp),%edi + addl %ebp,%esi + # 20_39 79 + movl %eax,%ebp + xorl 4(%esp),%edi + xorl %ebx,%ebp + xorl 28(%esp),%edi + xorl %ecx,%ebp + xorl 48(%esp),%edi + roll $1,%edi + addl %ebp,%edx + rorl $2,%eax + movl %esi,%ebp + roll $5,%ebp + leal 3395469782(%edi,%edx,1),%edi + addl %ebp,%edi + movl 96(%esp),%ebp + movl 100(%esp),%edx + addl (%ebp),%edi + addl 4(%ebp),%esi + addl 8(%ebp),%eax + addl 12(%ebp),%ebx + addl 16(%ebp),%ecx + movl %edi,(%ebp) + addl $64,%edx + movl %esi,4(%ebp) + cmpl 104(%esp),%edx + movl %eax,8(%ebp) + movl %ecx,%edi + movl %ebx,12(%ebp) + movl %edx,%esi + movl %ecx,16(%ebp) + jb L000loop + addl $76,%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _sha1_block_data_order_ssse3 +.private_extern _sha1_block_data_order_ssse3 +.align 4 +_sha1_block_data_order_ssse3: /* SHA-1 block routine using SSE2/SSSE3 instructions (pshufb). Reads three stack arguments: (1) pointer to five 32-bit state words, updated in place; (2) pointer to the input; (3) a count that is multiplied by 64 and added to the input pointer to form the end pointer, i.e. a number of 64-byte blocks. One block is consumed per pass of L002loop until the next-block pointer equals the end pointer. NOTE(review): the end test only runs after a block has been processed, so a count of 0 is not handled here - presumably callers always pass at least 1; confirm. Saves and restores ebp, ebx, esi, edi and the stack pointer. */ +L_sha1_block_data_order_ssse3_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + call L001pic_point /* call/pop pair: the return address popped below is the address of L001pic_point */ +L001pic_point: + popl %ebp + leal LK_XX_XX-L001pic_point(%ebp),%ebp /* ebp = position-independent address of the LK_XX_XX table (defined outside this chunk) */ + movdqa (%ebp),%xmm7 + movdqa 16(%ebp),%xmm0 + movdqa 32(%ebp),%xmm1 + 
movdqa 48(%ebp),%xmm2 + movdqa 64(%ebp),%xmm6 /* five 16-byte LK_XX_XX entries are now in xmm7, xmm0, xmm1, xmm2, xmm6 */ + movl 20(%esp),%edi /* 1st stack argument (after 4 pushes and the return address): state pointer */ + movl 24(%esp),%ebp /* 2nd stack argument: input pointer */ + movl 28(%esp),%edx /* 3rd stack argument: count, scaled by 64 below */ + movl %esp,%esi /* remember the incoming stack pointer; stored at 204(%esp) below */ + subl $208,%esp + andl $-64,%esp /* 208-byte frame rounded down to a 64-byte boundary */ + movdqa %xmm0,112(%esp) + movdqa %xmm1,128(%esp) + movdqa %xmm2,144(%esp) + shll $6,%edx + movdqa %xmm7,160(%esp) + addl %ebp,%edx /* edx = input + 64*count = end-of-input pointer */ + movdqa %xmm6,176(%esp) + addl $64,%ebp /* ebp = address of the block after the current one */ + movl %edi,192(%esp) + movl %ebp,196(%esp) + movl %edx,200(%esp) + movl %esi,204(%esp) /* frame slots: 112..191 copies of the table entries, 192 state pointer, 196 next-block pointer, 200 end pointer, 204 saved esp */ + movl (%edi),%eax + movl 4(%edi),%ebx + movl 8(%edi),%ecx + movl 12(%edi),%edx + movl 16(%edi),%edi /* the five state words are now in eax, ebx, ecx, edx, edi */ + movl %ebx,%esi + movdqu -64(%ebp),%xmm0 + movdqu -48(%ebp),%xmm1 + movdqu -32(%ebp),%xmm2 + movdqu -16(%ebp),%xmm3 /* unaligned load of the current 64-byte block into xmm0..xmm3 */ + pshufb %xmm6,%xmm0 /* byte shuffle with the table+64 mask; presumably a per-word byte swap - the mask contents are not visible here */ + pshufb %xmm6,%xmm1 + pshufb %xmm6,%xmm2 + movdqa %xmm7,96(%esp) + pshufb %xmm6,%xmm3 + paddd %xmm7,%xmm0 + paddd %xmm7,%xmm1 + paddd %xmm7,%xmm2 + movdqa %xmm0,(%esp) /* words plus the xmm7 constant go to the frame; the psubd that follows restores the plain words in the register */ + psubd %xmm7,%xmm0 + movdqa %xmm1,16(%esp) + psubd %xmm7,%xmm1 + movdqa %xmm2,32(%esp) + movl %ecx,%ebp + psubd %xmm7,%xmm2 + xorl %edx,%ebp + pshufd $238,%xmm0,%xmm4 + andl %ebp,%esi + jmp L002loop +.align 4,0x90 +L002loop: /* per-block loop: integer round arithmetic consumes the 32-bit values at 0..60(%esp) via addl N(%esp),reg while the interleaved SSE instructions compute and store the next 16-byte groups into those same slots */ + rorl $2,%ebx + xorl %edx,%esi + movl %eax,%ebp + punpcklqdq %xmm1,%xmm4 + movdqa %xmm3,%xmm6 + addl (%esp),%edi + xorl %ecx,%ebx + paddd %xmm3,%xmm7 + movdqa %xmm0,64(%esp) + roll $5,%eax + addl %esi,%edi + psrldq $4,%xmm6 + andl %ebx,%ebp + xorl %ecx,%ebx + pxor %xmm0,%xmm4 + addl %eax,%edi + rorl $7,%eax + pxor %xmm2,%xmm6 + xorl %ecx,%ebp + movl %edi,%esi + addl 4(%esp),%edx + pxor %xmm6,%xmm4 + xorl %ebx,%eax + roll $5,%edi + movdqa %xmm7,48(%esp) + addl %ebp,%edx + andl %eax,%esi + movdqa %xmm4,%xmm0 + xorl %ebx,%eax + addl %edi,%edx + rorl $7,%edi + movdqa %xmm4,%xmm6 + xorl %ebx,%esi + pslldq $12,%xmm0 + paddd %xmm4,%xmm4 + movl %edx,%ebp + addl 8(%esp),%ecx + psrld $31,%xmm6 + xorl %eax,%edi + roll $5,%edx + movdqa %xmm0,%xmm7 + addl %esi,%ecx + andl %edi,%ebp + xorl %eax,%edi + psrld $30,%xmm0 + addl %edx,%ecx + rorl $7,%edx + por %xmm6,%xmm4 + xorl %eax,%ebp + movl %ecx,%esi + addl 12(%esp),%ebx + pslld $2,%xmm7 + xorl 
%edi,%edx + roll $5,%ecx + pxor %xmm0,%xmm4 + movdqa 96(%esp),%xmm0 + addl %ebp,%ebx + andl %edx,%esi + pxor %xmm7,%xmm4 + pshufd $238,%xmm1,%xmm5 + xorl %edi,%edx + addl %ecx,%ebx + rorl $7,%ecx + xorl %edi,%esi + movl %ebx,%ebp + punpcklqdq %xmm2,%xmm5 + movdqa %xmm4,%xmm7 + addl 16(%esp),%eax + xorl %edx,%ecx + paddd %xmm4,%xmm0 + movdqa %xmm1,80(%esp) + roll $5,%ebx + addl %esi,%eax + psrldq $4,%xmm7 + andl %ecx,%ebp + xorl %edx,%ecx + pxor %xmm1,%xmm5 + addl %ebx,%eax + rorl $7,%ebx + pxor %xmm3,%xmm7 + xorl %edx,%ebp + movl %eax,%esi + addl 20(%esp),%edi + pxor %xmm7,%xmm5 + xorl %ecx,%ebx + roll $5,%eax + movdqa %xmm0,(%esp) + addl %ebp,%edi + andl %ebx,%esi + movdqa %xmm5,%xmm1 + xorl %ecx,%ebx + addl %eax,%edi + rorl $7,%eax + movdqa %xmm5,%xmm7 + xorl %ecx,%esi + pslldq $12,%xmm1 + paddd %xmm5,%xmm5 + movl %edi,%ebp + addl 24(%esp),%edx + psrld $31,%xmm7 + xorl %ebx,%eax + roll $5,%edi + movdqa %xmm1,%xmm0 + addl %esi,%edx + andl %eax,%ebp + xorl %ebx,%eax + psrld $30,%xmm1 + addl %edi,%edx + rorl $7,%edi + por %xmm7,%xmm5 + xorl %ebx,%ebp + movl %edx,%esi + addl 28(%esp),%ecx + pslld $2,%xmm0 + xorl %eax,%edi + roll $5,%edx + pxor %xmm1,%xmm5 + movdqa 112(%esp),%xmm1 + addl %ebp,%ecx + andl %edi,%esi + pxor %xmm0,%xmm5 + pshufd $238,%xmm2,%xmm6 + xorl %eax,%edi + addl %edx,%ecx + rorl $7,%edx + xorl %eax,%esi + movl %ecx,%ebp + punpcklqdq %xmm3,%xmm6 + movdqa %xmm5,%xmm0 + addl 32(%esp),%ebx + xorl %edi,%edx + paddd %xmm5,%xmm1 + movdqa %xmm2,96(%esp) + roll $5,%ecx + addl %esi,%ebx + psrldq $4,%xmm0 + andl %edx,%ebp + xorl %edi,%edx + pxor %xmm2,%xmm6 + addl %ecx,%ebx + rorl $7,%ecx + pxor %xmm4,%xmm0 + xorl %edi,%ebp + movl %ebx,%esi + addl 36(%esp),%eax + pxor %xmm0,%xmm6 + xorl %edx,%ecx + roll $5,%ebx + movdqa %xmm1,16(%esp) + addl %ebp,%eax + andl %ecx,%esi + movdqa %xmm6,%xmm2 + xorl %edx,%ecx + addl %ebx,%eax + rorl $7,%ebx + movdqa %xmm6,%xmm0 + xorl %edx,%esi + pslldq $12,%xmm2 + paddd %xmm6,%xmm6 + movl %eax,%ebp + addl 40(%esp),%edi + psrld 
$31,%xmm0 + xorl %ecx,%ebx + roll $5,%eax + movdqa %xmm2,%xmm1 + addl %esi,%edi + andl %ebx,%ebp + xorl %ecx,%ebx + psrld $30,%xmm2 + addl %eax,%edi + rorl $7,%eax + por %xmm0,%xmm6 + xorl %ecx,%ebp + movdqa 64(%esp),%xmm0 + movl %edi,%esi + addl 44(%esp),%edx + pslld $2,%xmm1 + xorl %ebx,%eax + roll $5,%edi + pxor %xmm2,%xmm6 + movdqa 112(%esp),%xmm2 + addl %ebp,%edx + andl %eax,%esi + pxor %xmm1,%xmm6 + pshufd $238,%xmm3,%xmm7 + xorl %ebx,%eax + addl %edi,%edx + rorl $7,%edi + xorl %ebx,%esi + movl %edx,%ebp + punpcklqdq %xmm4,%xmm7 + movdqa %xmm6,%xmm1 + addl 48(%esp),%ecx + xorl %eax,%edi + paddd %xmm6,%xmm2 + movdqa %xmm3,64(%esp) + roll $5,%edx + addl %esi,%ecx + psrldq $4,%xmm1 + andl %edi,%ebp + xorl %eax,%edi + pxor %xmm3,%xmm7 + addl %edx,%ecx + rorl $7,%edx + pxor %xmm5,%xmm1 + xorl %eax,%ebp + movl %ecx,%esi + addl 52(%esp),%ebx + pxor %xmm1,%xmm7 + xorl %edi,%edx + roll $5,%ecx + movdqa %xmm2,32(%esp) + addl %ebp,%ebx + andl %edx,%esi + movdqa %xmm7,%xmm3 + xorl %edi,%edx + addl %ecx,%ebx + rorl $7,%ecx + movdqa %xmm7,%xmm1 + xorl %edi,%esi + pslldq $12,%xmm3 + paddd %xmm7,%xmm7 + movl %ebx,%ebp + addl 56(%esp),%eax + psrld $31,%xmm1 + xorl %edx,%ecx + roll $5,%ebx + movdqa %xmm3,%xmm2 + addl %esi,%eax + andl %ecx,%ebp + xorl %edx,%ecx + psrld $30,%xmm3 + addl %ebx,%eax + rorl $7,%ebx + por %xmm1,%xmm7 + xorl %edx,%ebp + movdqa 80(%esp),%xmm1 + movl %eax,%esi + addl 60(%esp),%edi + pslld $2,%xmm2 + xorl %ecx,%ebx + roll $5,%eax + pxor %xmm3,%xmm7 + movdqa 112(%esp),%xmm3 + addl %ebp,%edi + andl %ebx,%esi + pxor %xmm2,%xmm7 + pshufd $238,%xmm6,%xmm2 + xorl %ecx,%ebx + addl %eax,%edi + rorl $7,%eax + pxor %xmm4,%xmm0 + punpcklqdq %xmm7,%xmm2 + xorl %ecx,%esi + movl %edi,%ebp + addl (%esp),%edx + pxor %xmm1,%xmm0 + movdqa %xmm4,80(%esp) + xorl %ebx,%eax + roll $5,%edi + movdqa %xmm3,%xmm4 + addl %esi,%edx + paddd %xmm7,%xmm3 + andl %eax,%ebp + pxor %xmm2,%xmm0 + xorl %ebx,%eax + addl %edi,%edx + rorl $7,%edi + xorl %ebx,%ebp + movdqa %xmm0,%xmm2 + movdqa 
%xmm3,48(%esp) + movl %edx,%esi + addl 4(%esp),%ecx + xorl %eax,%edi + roll $5,%edx + pslld $2,%xmm0 + addl %ebp,%ecx + andl %edi,%esi + psrld $30,%xmm2 + xorl %eax,%edi + addl %edx,%ecx + rorl $7,%edx + xorl %eax,%esi + movl %ecx,%ebp + addl 8(%esp),%ebx + xorl %edi,%edx + roll $5,%ecx + por %xmm2,%xmm0 + addl %esi,%ebx + andl %edx,%ebp + movdqa 96(%esp),%xmm2 + xorl %edi,%edx + addl %ecx,%ebx + addl 12(%esp),%eax + xorl %edi,%ebp + movl %ebx,%esi + pshufd $238,%xmm7,%xmm3 + roll $5,%ebx + addl %ebp,%eax + xorl %edx,%esi + rorl $7,%ecx + addl %ebx,%eax + addl 16(%esp),%edi + pxor %xmm5,%xmm1 + punpcklqdq %xmm0,%xmm3 + xorl %ecx,%esi + movl %eax,%ebp + roll $5,%eax + pxor %xmm2,%xmm1 + movdqa %xmm5,96(%esp) + addl %esi,%edi + xorl %ecx,%ebp + movdqa %xmm4,%xmm5 + rorl $7,%ebx + paddd %xmm0,%xmm4 + addl %eax,%edi + pxor %xmm3,%xmm1 + addl 20(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + roll $5,%edi + movdqa %xmm1,%xmm3 + movdqa %xmm4,(%esp) + addl %ebp,%edx + xorl %ebx,%esi + rorl $7,%eax + addl %edi,%edx + pslld $2,%xmm1 + addl 24(%esp),%ecx + xorl %eax,%esi + psrld $30,%xmm3 + movl %edx,%ebp + roll $5,%edx + addl %esi,%ecx + xorl %eax,%ebp + rorl $7,%edi + addl %edx,%ecx + por %xmm3,%xmm1 + addl 28(%esp),%ebx + xorl %edi,%ebp + movdqa 64(%esp),%xmm3 + movl %ecx,%esi + roll $5,%ecx + addl %ebp,%ebx + xorl %edi,%esi + rorl $7,%edx + pshufd $238,%xmm0,%xmm4 + addl %ecx,%ebx + addl 32(%esp),%eax + pxor %xmm6,%xmm2 + punpcklqdq %xmm1,%xmm4 + xorl %edx,%esi + movl %ebx,%ebp + roll $5,%ebx + pxor %xmm3,%xmm2 + movdqa %xmm6,64(%esp) + addl %esi,%eax + xorl %edx,%ebp + movdqa 128(%esp),%xmm6 + rorl $7,%ecx + paddd %xmm1,%xmm5 + addl %ebx,%eax + pxor %xmm4,%xmm2 + addl 36(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + roll $5,%eax + movdqa %xmm2,%xmm4 + movdqa %xmm5,16(%esp) + addl %ebp,%edi + xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%edi + pslld $2,%xmm2 + addl 40(%esp),%edx + xorl %ebx,%esi + psrld $30,%xmm4 + movl %edi,%ebp + roll $5,%edi + addl %esi,%edx + xorl 
%ebx,%ebp + rorl $7,%eax + addl %edi,%edx + por %xmm4,%xmm2 + addl 44(%esp),%ecx + xorl %eax,%ebp + movdqa 80(%esp),%xmm4 + movl %edx,%esi + roll $5,%edx + addl %ebp,%ecx + xorl %eax,%esi + rorl $7,%edi + pshufd $238,%xmm1,%xmm5 + addl %edx,%ecx + addl 48(%esp),%ebx + pxor %xmm7,%xmm3 + punpcklqdq %xmm2,%xmm5 + xorl %edi,%esi + movl %ecx,%ebp + roll $5,%ecx + pxor %xmm4,%xmm3 + movdqa %xmm7,80(%esp) + addl %esi,%ebx + xorl %edi,%ebp + movdqa %xmm6,%xmm7 + rorl $7,%edx + paddd %xmm2,%xmm6 + addl %ecx,%ebx + pxor %xmm5,%xmm3 + addl 52(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + roll $5,%ebx + movdqa %xmm3,%xmm5 + movdqa %xmm6,32(%esp) + addl %ebp,%eax + xorl %edx,%esi + rorl $7,%ecx + addl %ebx,%eax + pslld $2,%xmm3 + addl 56(%esp),%edi + xorl %ecx,%esi + psrld $30,%xmm5 + movl %eax,%ebp + roll $5,%eax + addl %esi,%edi + xorl %ecx,%ebp + rorl $7,%ebx + addl %eax,%edi + por %xmm5,%xmm3 + addl 60(%esp),%edx + xorl %ebx,%ebp + movdqa 96(%esp),%xmm5 + movl %edi,%esi + roll $5,%edi + addl %ebp,%edx + xorl %ebx,%esi + rorl $7,%eax + pshufd $238,%xmm2,%xmm6 + addl %edi,%edx + addl (%esp),%ecx + pxor %xmm0,%xmm4 + punpcklqdq %xmm3,%xmm6 + xorl %eax,%esi + movl %edx,%ebp + roll $5,%edx + pxor %xmm5,%xmm4 + movdqa %xmm0,96(%esp) + addl %esi,%ecx + xorl %eax,%ebp + movdqa %xmm7,%xmm0 + rorl $7,%edi + paddd %xmm3,%xmm7 + addl %edx,%ecx + pxor %xmm6,%xmm4 + addl 4(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + roll $5,%ecx + movdqa %xmm4,%xmm6 + movdqa %xmm7,48(%esp) + addl %ebp,%ebx + xorl %edi,%esi + rorl $7,%edx + addl %ecx,%ebx + pslld $2,%xmm4 + addl 8(%esp),%eax + xorl %edx,%esi + psrld $30,%xmm6 + movl %ebx,%ebp + roll $5,%ebx + addl %esi,%eax + xorl %edx,%ebp + rorl $7,%ecx + addl %ebx,%eax + por %xmm6,%xmm4 + addl 12(%esp),%edi + xorl %ecx,%ebp + movdqa 64(%esp),%xmm6 + movl %eax,%esi + roll $5,%eax + addl %ebp,%edi + xorl %ecx,%esi + rorl $7,%ebx + pshufd $238,%xmm3,%xmm7 + addl %eax,%edi + addl 16(%esp),%edx + pxor %xmm1,%xmm5 + punpcklqdq %xmm4,%xmm7 + xorl 
%ebx,%esi + movl %edi,%ebp + roll $5,%edi + pxor %xmm6,%xmm5 + movdqa %xmm1,64(%esp) + addl %esi,%edx + xorl %ebx,%ebp + movdqa %xmm0,%xmm1 + rorl $7,%eax + paddd %xmm4,%xmm0 + addl %edi,%edx + pxor %xmm7,%xmm5 + addl 20(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + roll $5,%edx + movdqa %xmm5,%xmm7 + movdqa %xmm0,(%esp) + addl %ebp,%ecx + xorl %eax,%esi + rorl $7,%edi + addl %edx,%ecx + pslld $2,%xmm5 + addl 24(%esp),%ebx + xorl %edi,%esi + psrld $30,%xmm7 + movl %ecx,%ebp + roll $5,%ecx + addl %esi,%ebx + xorl %edi,%ebp + rorl $7,%edx + addl %ecx,%ebx + por %xmm7,%xmm5 + addl 28(%esp),%eax + movdqa 80(%esp),%xmm7 + rorl $7,%ecx + movl %ebx,%esi + xorl %edx,%ebp + roll $5,%ebx + pshufd $238,%xmm4,%xmm0 + addl %ebp,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + addl 32(%esp),%edi + pxor %xmm2,%xmm6 + punpcklqdq %xmm5,%xmm0 + andl %ecx,%esi + xorl %edx,%ecx + rorl $7,%ebx + pxor %xmm7,%xmm6 + movdqa %xmm2,80(%esp) + movl %eax,%ebp + xorl %ecx,%esi + roll $5,%eax + movdqa %xmm1,%xmm2 + addl %esi,%edi + paddd %xmm5,%xmm1 + xorl %ebx,%ebp + pxor %xmm0,%xmm6 + xorl %ecx,%ebx + addl %eax,%edi + addl 36(%esp),%edx + andl %ebx,%ebp + movdqa %xmm6,%xmm0 + movdqa %xmm1,16(%esp) + xorl %ecx,%ebx + rorl $7,%eax + movl %edi,%esi + xorl %ebx,%ebp + roll $5,%edi + pslld $2,%xmm6 + addl %ebp,%edx + xorl %eax,%esi + psrld $30,%xmm0 + xorl %ebx,%eax + addl %edi,%edx + addl 40(%esp),%ecx + andl %eax,%esi + xorl %ebx,%eax + rorl $7,%edi + por %xmm0,%xmm6 + movl %edx,%ebp + xorl %eax,%esi + movdqa 96(%esp),%xmm0 + roll $5,%edx + addl %esi,%ecx + xorl %edi,%ebp + xorl %eax,%edi + addl %edx,%ecx + pshufd $238,%xmm5,%xmm1 + addl 44(%esp),%ebx + andl %edi,%ebp + xorl %eax,%edi + rorl $7,%edx + movl %ecx,%esi + xorl %edi,%ebp + roll $5,%ecx + addl %ebp,%ebx + xorl %edx,%esi + xorl %edi,%edx + addl %ecx,%ebx + addl 48(%esp),%eax + pxor %xmm3,%xmm7 + punpcklqdq %xmm6,%xmm1 + andl %edx,%esi + xorl %edi,%edx + rorl $7,%ecx + pxor %xmm0,%xmm7 + movdqa %xmm3,96(%esp) + movl %ebx,%ebp 
+ xorl %edx,%esi + roll $5,%ebx + movdqa 144(%esp),%xmm3 + addl %esi,%eax + paddd %xmm6,%xmm2 + xorl %ecx,%ebp + pxor %xmm1,%xmm7 + xorl %edx,%ecx + addl %ebx,%eax + addl 52(%esp),%edi + andl %ecx,%ebp + movdqa %xmm7,%xmm1 + movdqa %xmm2,32(%esp) + xorl %edx,%ecx + rorl $7,%ebx + movl %eax,%esi + xorl %ecx,%ebp + roll $5,%eax + pslld $2,%xmm7 + addl %ebp,%edi + xorl %ebx,%esi + psrld $30,%xmm1 + xorl %ecx,%ebx + addl %eax,%edi + addl 56(%esp),%edx + andl %ebx,%esi + xorl %ecx,%ebx + rorl $7,%eax + por %xmm1,%xmm7 + movl %edi,%ebp + xorl %ebx,%esi + movdqa 64(%esp),%xmm1 + roll $5,%edi + addl %esi,%edx + xorl %eax,%ebp + xorl %ebx,%eax + addl %edi,%edx + pshufd $238,%xmm6,%xmm2 + addl 60(%esp),%ecx + andl %eax,%ebp + xorl %ebx,%eax + rorl $7,%edi + movl %edx,%esi + xorl %eax,%ebp + roll $5,%edx + addl %ebp,%ecx + xorl %edi,%esi + xorl %eax,%edi + addl %edx,%ecx + addl (%esp),%ebx + pxor %xmm4,%xmm0 + punpcklqdq %xmm7,%xmm2 + andl %edi,%esi + xorl %eax,%edi + rorl $7,%edx + pxor %xmm1,%xmm0 + movdqa %xmm4,64(%esp) + movl %ecx,%ebp + xorl %edi,%esi + roll $5,%ecx + movdqa %xmm3,%xmm4 + addl %esi,%ebx + paddd %xmm7,%xmm3 + xorl %edx,%ebp + pxor %xmm2,%xmm0 + xorl %edi,%edx + addl %ecx,%ebx + addl 4(%esp),%eax + andl %edx,%ebp + movdqa %xmm0,%xmm2 + movdqa %xmm3,48(%esp) + xorl %edi,%edx + rorl $7,%ecx + movl %ebx,%esi + xorl %edx,%ebp + roll $5,%ebx + pslld $2,%xmm0 + addl %ebp,%eax + xorl %ecx,%esi + psrld $30,%xmm2 + xorl %edx,%ecx + addl %ebx,%eax + addl 8(%esp),%edi + andl %ecx,%esi + xorl %edx,%ecx + rorl $7,%ebx + por %xmm2,%xmm0 + movl %eax,%ebp + xorl %ecx,%esi + movdqa 80(%esp),%xmm2 + roll $5,%eax + addl %esi,%edi + xorl %ebx,%ebp + xorl %ecx,%ebx + addl %eax,%edi + pshufd $238,%xmm7,%xmm3 + addl 12(%esp),%edx + andl %ebx,%ebp + xorl %ecx,%ebx + rorl $7,%eax + movl %edi,%esi + xorl %ebx,%ebp + roll $5,%edi + addl %ebp,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %edi,%edx + addl 16(%esp),%ecx + pxor %xmm5,%xmm1 + punpcklqdq %xmm0,%xmm3 + andl %eax,%esi + 
xorl %ebx,%eax + rorl $7,%edi + pxor %xmm2,%xmm1 + movdqa %xmm5,80(%esp) + movl %edx,%ebp + xorl %eax,%esi + roll $5,%edx + movdqa %xmm4,%xmm5 + addl %esi,%ecx + paddd %xmm0,%xmm4 + xorl %edi,%ebp + pxor %xmm3,%xmm1 + xorl %eax,%edi + addl %edx,%ecx + addl 20(%esp),%ebx + andl %edi,%ebp + movdqa %xmm1,%xmm3 + movdqa %xmm4,(%esp) + xorl %eax,%edi + rorl $7,%edx + movl %ecx,%esi + xorl %edi,%ebp + roll $5,%ecx + pslld $2,%xmm1 + addl %ebp,%ebx + xorl %edx,%esi + psrld $30,%xmm3 + xorl %edi,%edx + addl %ecx,%ebx + addl 24(%esp),%eax + andl %edx,%esi + xorl %edi,%edx + rorl $7,%ecx + por %xmm3,%xmm1 + movl %ebx,%ebp + xorl %edx,%esi + movdqa 96(%esp),%xmm3 + roll $5,%ebx + addl %esi,%eax + xorl %ecx,%ebp + xorl %edx,%ecx + addl %ebx,%eax + pshufd $238,%xmm0,%xmm4 + addl 28(%esp),%edi + andl %ecx,%ebp + xorl %edx,%ecx + rorl $7,%ebx + movl %eax,%esi + xorl %ecx,%ebp + roll $5,%eax + addl %ebp,%edi + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%edi + addl 32(%esp),%edx + pxor %xmm6,%xmm2 + punpcklqdq %xmm1,%xmm4 + andl %ebx,%esi + xorl %ecx,%ebx + rorl $7,%eax + pxor %xmm3,%xmm2 + movdqa %xmm6,96(%esp) + movl %edi,%ebp + xorl %ebx,%esi + roll $5,%edi + movdqa %xmm5,%xmm6 + addl %esi,%edx + paddd %xmm1,%xmm5 + xorl %eax,%ebp + pxor %xmm4,%xmm2 + xorl %ebx,%eax + addl %edi,%edx + addl 36(%esp),%ecx + andl %eax,%ebp + movdqa %xmm2,%xmm4 + movdqa %xmm5,16(%esp) + xorl %ebx,%eax + rorl $7,%edi + movl %edx,%esi + xorl %eax,%ebp + roll $5,%edx + pslld $2,%xmm2 + addl %ebp,%ecx + xorl %edi,%esi + psrld $30,%xmm4 + xorl %eax,%edi + addl %edx,%ecx + addl 40(%esp),%ebx + andl %edi,%esi + xorl %eax,%edi + rorl $7,%edx + por %xmm4,%xmm2 + movl %ecx,%ebp + xorl %edi,%esi + movdqa 64(%esp),%xmm4 + roll $5,%ecx + addl %esi,%ebx + xorl %edx,%ebp + xorl %edi,%edx + addl %ecx,%ebx + pshufd $238,%xmm1,%xmm5 + addl 44(%esp),%eax + andl %edx,%ebp + xorl %edi,%edx + rorl $7,%ecx + movl %ebx,%esi + xorl %edx,%ebp + roll $5,%ebx + addl %ebp,%eax + xorl %edx,%esi + addl %ebx,%eax + addl 
48(%esp),%edi + pxor %xmm7,%xmm3 + punpcklqdq %xmm2,%xmm5 + xorl %ecx,%esi + movl %eax,%ebp + roll $5,%eax + pxor %xmm4,%xmm3 + movdqa %xmm7,64(%esp) + addl %esi,%edi + xorl %ecx,%ebp + movdqa %xmm6,%xmm7 + rorl $7,%ebx + paddd %xmm2,%xmm6 + addl %eax,%edi + pxor %xmm5,%xmm3 + addl 52(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + roll $5,%edi + movdqa %xmm3,%xmm5 + movdqa %xmm6,32(%esp) + addl %ebp,%edx + xorl %ebx,%esi + rorl $7,%eax + addl %edi,%edx + pslld $2,%xmm3 + addl 56(%esp),%ecx + xorl %eax,%esi + psrld $30,%xmm5 + movl %edx,%ebp + roll $5,%edx + addl %esi,%ecx + xorl %eax,%ebp + rorl $7,%edi + addl %edx,%ecx + por %xmm5,%xmm3 + addl 60(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + roll $5,%ecx + addl %ebp,%ebx + xorl %edi,%esi + rorl $7,%edx + addl %ecx,%ebx + addl (%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + roll $5,%ebx + addl %esi,%eax + xorl %edx,%ebp + rorl $7,%ecx + paddd %xmm3,%xmm7 + addl %ebx,%eax + addl 4(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + movdqa %xmm7,48(%esp) + roll $5,%eax + addl %ebp,%edi + xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%edi + addl 8(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + roll $5,%edi + addl %esi,%edx + xorl %ebx,%ebp + rorl $7,%eax + addl %edi,%edx + addl 12(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + roll $5,%edx + addl %ebp,%ecx + xorl %eax,%esi + rorl $7,%edi + addl %edx,%ecx + movl 196(%esp),%ebp /* next-block pointer */ + cmpl 200(%esp),%ebp + je L003done /* next-block pointer equals the end pointer: no further input, finish at L003done */ + movdqa 160(%esp),%xmm7 + movdqa 176(%esp),%xmm6 /* reload the table+0 and table+64 entries saved in the frame at entry */ + movdqu (%ebp),%xmm0 + movdqu 16(%ebp),%xmm1 + movdqu 32(%ebp),%xmm2 + movdqu 48(%ebp),%xmm3 /* unaligned load of the following 64-byte block */ + addl $64,%ebp + pshufb %xmm6,%xmm0 + movl %ebp,196(%esp) /* advance the stored next-block pointer by 64 */ + movdqa %xmm7,96(%esp) + addl 16(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + roll $5,%ecx + addl %esi,%ebx + xorl %edi,%ebp + rorl $7,%edx + pshufb %xmm6,%xmm1 + addl %ecx,%ebx + addl 20(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + paddd %xmm7,%xmm0 + roll $5,%ebx + addl %ebp,%eax + xorl %edx,%esi + rorl $7,%ecx + movdqa %xmm0,(%esp) + addl %ebx,%eax + addl 
24(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + psubd %xmm7,%xmm0 + roll $5,%eax + addl %esi,%edi + xorl %ecx,%ebp + rorl $7,%ebx + addl %eax,%edi + addl 28(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + roll $5,%edi + addl %ebp,%edx + xorl %ebx,%esi + rorl $7,%eax + addl %edi,%edx + addl 32(%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + roll $5,%edx + addl %esi,%ecx + xorl %eax,%ebp + rorl $7,%edi + pshufb %xmm6,%xmm2 + addl %edx,%ecx + addl 36(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + paddd %xmm7,%xmm1 + roll $5,%ecx + addl %ebp,%ebx + xorl %edi,%esi + rorl $7,%edx + movdqa %xmm1,16(%esp) + addl %ecx,%ebx + addl 40(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + psubd %xmm7,%xmm1 + roll $5,%ebx + addl %esi,%eax + xorl %edx,%ebp + rorl $7,%ecx + addl %ebx,%eax + addl 44(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + roll $5,%eax + addl %ebp,%edi + xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%edi + addl 48(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + roll $5,%edi + addl %esi,%edx + xorl %ebx,%ebp + rorl $7,%eax + pshufb %xmm6,%xmm3 + addl %edi,%edx + addl 52(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + paddd %xmm7,%xmm2 + roll $5,%edx + addl %ebp,%ecx + xorl %eax,%esi + rorl $7,%edi + movdqa %xmm2,32(%esp) + addl %edx,%ecx + addl 56(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + psubd %xmm7,%xmm2 + roll $5,%ecx + addl %esi,%ebx + xorl %edi,%ebp + rorl $7,%edx + addl %ecx,%ebx + addl 60(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + roll $5,%ebx + addl %ebp,%eax + rorl $7,%ecx + addl %ebx,%eax + movl 192(%esp),%ebp /* state pointer saved at entry */ + addl (%ebp),%eax /* add the five previous state words to the working registers, then write the sums back */ + addl 4(%ebp),%esi + addl 8(%ebp),%ecx + movl %eax,(%ebp) + addl 12(%ebp),%edx + movl %esi,4(%ebp) + addl 16(%ebp),%edi + movl %ecx,8(%ebp) + movl %ecx,%ebx + movl %edx,12(%ebp) + xorl %edx,%ebx + movl %edi,16(%ebp) + movl %esi,%ebp + pshufd $238,%xmm0,%xmm4 + andl %ebx,%esi + movl %ebp,%ebx + jmp L002loop /* continue with the block that was just loaded */ +.align 4,0x90 +L003done: /* reached when no input remains: finishes the integer rounds with no further block loads or SSE work */ + addl 16(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + roll $5,%ecx + addl %esi,%ebx + xorl 
%edi,%ebp + rorl $7,%edx + addl %ecx,%ebx + addl 20(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + roll $5,%ebx + addl %ebp,%eax + xorl %edx,%esi + rorl $7,%ecx + addl %ebx,%eax + addl 24(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + roll $5,%eax + addl %esi,%edi + xorl %ecx,%ebp + rorl $7,%ebx + addl %eax,%edi + addl 28(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + roll $5,%edi + addl %ebp,%edx + xorl %ebx,%esi + rorl $7,%eax + addl %edi,%edx + addl 32(%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + roll $5,%edx + addl %esi,%ecx + xorl %eax,%ebp + rorl $7,%edi + addl %edx,%ecx + addl 36(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + roll $5,%ecx + addl %ebp,%ebx + xorl %edi,%esi + rorl $7,%edx + addl %ecx,%ebx + addl 40(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + roll $5,%ebx + addl %esi,%eax + xorl %edx,%ebp + rorl $7,%ecx + addl %ebx,%eax + addl 44(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + roll $5,%eax + addl %ebp,%edi + xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%edi + addl 48(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + roll $5,%edi + addl %esi,%edx + xorl %ebx,%ebp + rorl $7,%eax + addl %edi,%edx + addl 52(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + roll $5,%edx + addl %ebp,%ecx + xorl %eax,%esi + rorl $7,%edi + addl %edx,%ecx + addl 56(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + roll $5,%ecx + addl %esi,%ebx + xorl %edi,%ebp + rorl $7,%edx + addl %ecx,%ebx + addl 60(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + roll $5,%ebx + addl %ebp,%eax + rorl $7,%ecx + addl %ebx,%eax + movl 192(%esp),%ebp /* state pointer saved at entry */ + addl (%ebp),%eax + movl 204(%esp),%esp /* restore the stack pointer saved at entry; the frame is not addressed after this */ + addl 4(%ebp),%esi + addl 8(%ebp),%ecx + movl %eax,(%ebp) + addl 12(%ebp),%edx + movl %esi,4(%ebp) + addl 16(%ebp),%edi + movl %ecx,8(%ebp) + movl %edx,12(%ebp) + movl %edi,16(%ebp) /* all five updated state words are written back */ + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _sha1_block_data_order_avx +.private_extern _sha1_block_data_order_avx +.align 4 +_sha1_block_data_order_avx: +L_sha1_block_data_order_avx_begin: + pushl %ebp + pushl 
%ebx + pushl %esi + pushl %edi + call L004pic_point +L004pic_point: + popl %ebp + leal LK_XX_XX-L004pic_point(%ebp),%ebp + vzeroall + vmovdqa (%ebp),%xmm7 + vmovdqa 16(%ebp),%xmm0 + vmovdqa 32(%ebp),%xmm1 + vmovdqa 48(%ebp),%xmm2 + vmovdqa 64(%ebp),%xmm6 + movl 20(%esp),%edi + movl 24(%esp),%ebp + movl 28(%esp),%edx + movl %esp,%esi + subl $208,%esp + andl $-64,%esp + vmovdqa %xmm0,112(%esp) + vmovdqa %xmm1,128(%esp) + vmovdqa %xmm2,144(%esp) + shll $6,%edx + vmovdqa %xmm7,160(%esp) + addl %ebp,%edx + vmovdqa %xmm6,176(%esp) + addl $64,%ebp + movl %edi,192(%esp) + movl %ebp,196(%esp) + movl %edx,200(%esp) + movl %esi,204(%esp) + movl (%edi),%eax + movl 4(%edi),%ebx + movl 8(%edi),%ecx + movl 12(%edi),%edx + movl 16(%edi),%edi + movl %ebx,%esi + vmovdqu -64(%ebp),%xmm0 + vmovdqu -48(%ebp),%xmm1 + vmovdqu -32(%ebp),%xmm2 + vmovdqu -16(%ebp),%xmm3 + vpshufb %xmm6,%xmm0,%xmm0 + vpshufb %xmm6,%xmm1,%xmm1 + vpshufb %xmm6,%xmm2,%xmm2 + vmovdqa %xmm7,96(%esp) + vpshufb %xmm6,%xmm3,%xmm3 + vpaddd %xmm7,%xmm0,%xmm4 + vpaddd %xmm7,%xmm1,%xmm5 + vpaddd %xmm7,%xmm2,%xmm6 + vmovdqa %xmm4,(%esp) + movl %ecx,%ebp + vmovdqa %xmm5,16(%esp) + xorl %edx,%ebp + vmovdqa %xmm6,32(%esp) + andl %ebp,%esi + jmp L005loop +.align 4,0x90 +L005loop: + shrdl $2,%ebx,%ebx + xorl %edx,%esi + vpalignr $8,%xmm0,%xmm1,%xmm4 + movl %eax,%ebp + addl (%esp),%edi + vpaddd %xmm3,%xmm7,%xmm7 + vmovdqa %xmm0,64(%esp) + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrldq $4,%xmm3,%xmm6 + addl %esi,%edi + andl %ebx,%ebp + vpxor %xmm0,%xmm4,%xmm4 + xorl %ecx,%ebx + addl %eax,%edi + vpxor %xmm2,%xmm6,%xmm6 + shrdl $7,%eax,%eax + xorl %ecx,%ebp + vmovdqa %xmm7,48(%esp) + movl %edi,%esi + addl 4(%esp),%edx + vpxor %xmm6,%xmm4,%xmm4 + xorl %ebx,%eax + shldl $5,%edi,%edi + addl %ebp,%edx + andl %eax,%esi + vpsrld $31,%xmm4,%xmm6 + xorl %ebx,%eax + addl %edi,%edx + shrdl $7,%edi,%edi + xorl %ebx,%esi + vpslldq $12,%xmm4,%xmm0 + vpaddd %xmm4,%xmm4,%xmm4 + movl %edx,%ebp + addl 8(%esp),%ecx + xorl %eax,%edi + shldl 
$5,%edx,%edx + vpsrld $30,%xmm0,%xmm7 + vpor %xmm6,%xmm4,%xmm4 + addl %esi,%ecx + andl %edi,%ebp + xorl %eax,%edi + addl %edx,%ecx + vpslld $2,%xmm0,%xmm0 + shrdl $7,%edx,%edx + xorl %eax,%ebp + vpxor %xmm7,%xmm4,%xmm4 + movl %ecx,%esi + addl 12(%esp),%ebx + xorl %edi,%edx + shldl $5,%ecx,%ecx + vpxor %xmm0,%xmm4,%xmm4 + addl %ebp,%ebx + andl %edx,%esi + vmovdqa 96(%esp),%xmm0 + xorl %edi,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %edi,%esi + vpalignr $8,%xmm1,%xmm2,%xmm5 + movl %ebx,%ebp + addl 16(%esp),%eax + vpaddd %xmm4,%xmm0,%xmm0 + vmovdqa %xmm1,80(%esp) + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrldq $4,%xmm4,%xmm7 + addl %esi,%eax + andl %ecx,%ebp + vpxor %xmm1,%xmm5,%xmm5 + xorl %edx,%ecx + addl %ebx,%eax + vpxor %xmm3,%xmm7,%xmm7 + shrdl $7,%ebx,%ebx + xorl %edx,%ebp + vmovdqa %xmm0,(%esp) + movl %eax,%esi + addl 20(%esp),%edi + vpxor %xmm7,%xmm5,%xmm5 + xorl %ecx,%ebx + shldl $5,%eax,%eax + addl %ebp,%edi + andl %ebx,%esi + vpsrld $31,%xmm5,%xmm7 + xorl %ecx,%ebx + addl %eax,%edi + shrdl $7,%eax,%eax + xorl %ecx,%esi + vpslldq $12,%xmm5,%xmm1 + vpaddd %xmm5,%xmm5,%xmm5 + movl %edi,%ebp + addl 24(%esp),%edx + xorl %ebx,%eax + shldl $5,%edi,%edi + vpsrld $30,%xmm1,%xmm0 + vpor %xmm7,%xmm5,%xmm5 + addl %esi,%edx + andl %eax,%ebp + xorl %ebx,%eax + addl %edi,%edx + vpslld $2,%xmm1,%xmm1 + shrdl $7,%edi,%edi + xorl %ebx,%ebp + vpxor %xmm0,%xmm5,%xmm5 + movl %edx,%esi + addl 28(%esp),%ecx + xorl %eax,%edi + shldl $5,%edx,%edx + vpxor %xmm1,%xmm5,%xmm5 + addl %ebp,%ecx + andl %edi,%esi + vmovdqa 112(%esp),%xmm1 + xorl %eax,%edi + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + vpalignr $8,%xmm2,%xmm3,%xmm6 + movl %ecx,%ebp + addl 32(%esp),%ebx + vpaddd %xmm5,%xmm1,%xmm1 + vmovdqa %xmm2,96(%esp) + xorl %edi,%edx + shldl $5,%ecx,%ecx + vpsrldq $4,%xmm5,%xmm0 + addl %esi,%ebx + andl %edx,%ebp + vpxor %xmm2,%xmm6,%xmm6 + xorl %edi,%edx + addl %ecx,%ebx + vpxor %xmm4,%xmm0,%xmm0 + shrdl $7,%ecx,%ecx + xorl %edi,%ebp + vmovdqa %xmm1,16(%esp) + movl 
%ebx,%esi + addl 36(%esp),%eax + vpxor %xmm0,%xmm6,%xmm6 + xorl %edx,%ecx + shldl $5,%ebx,%ebx + addl %ebp,%eax + andl %ecx,%esi + vpsrld $31,%xmm6,%xmm0 + xorl %edx,%ecx + addl %ebx,%eax + shrdl $7,%ebx,%ebx + xorl %edx,%esi + vpslldq $12,%xmm6,%xmm2 + vpaddd %xmm6,%xmm6,%xmm6 + movl %eax,%ebp + addl 40(%esp),%edi + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrld $30,%xmm2,%xmm1 + vpor %xmm0,%xmm6,%xmm6 + addl %esi,%edi + andl %ebx,%ebp + xorl %ecx,%ebx + addl %eax,%edi + vpslld $2,%xmm2,%xmm2 + vmovdqa 64(%esp),%xmm0 + shrdl $7,%eax,%eax + xorl %ecx,%ebp + vpxor %xmm1,%xmm6,%xmm6 + movl %edi,%esi + addl 44(%esp),%edx + xorl %ebx,%eax + shldl $5,%edi,%edi + vpxor %xmm2,%xmm6,%xmm6 + addl %ebp,%edx + andl %eax,%esi + vmovdqa 112(%esp),%xmm2 + xorl %ebx,%eax + addl %edi,%edx + shrdl $7,%edi,%edi + xorl %ebx,%esi + vpalignr $8,%xmm3,%xmm4,%xmm7 + movl %edx,%ebp + addl 48(%esp),%ecx + vpaddd %xmm6,%xmm2,%xmm2 + vmovdqa %xmm3,64(%esp) + xorl %eax,%edi + shldl $5,%edx,%edx + vpsrldq $4,%xmm6,%xmm1 + addl %esi,%ecx + andl %edi,%ebp + vpxor %xmm3,%xmm7,%xmm7 + xorl %eax,%edi + addl %edx,%ecx + vpxor %xmm5,%xmm1,%xmm1 + shrdl $7,%edx,%edx + xorl %eax,%ebp + vmovdqa %xmm2,32(%esp) + movl %ecx,%esi + addl 52(%esp),%ebx + vpxor %xmm1,%xmm7,%xmm7 + xorl %edi,%edx + shldl $5,%ecx,%ecx + addl %ebp,%ebx + andl %edx,%esi + vpsrld $31,%xmm7,%xmm1 + xorl %edi,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %edi,%esi + vpslldq $12,%xmm7,%xmm3 + vpaddd %xmm7,%xmm7,%xmm7 + movl %ebx,%ebp + addl 56(%esp),%eax + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrld $30,%xmm3,%xmm2 + vpor %xmm1,%xmm7,%xmm7 + addl %esi,%eax + andl %ecx,%ebp + xorl %edx,%ecx + addl %ebx,%eax + vpslld $2,%xmm3,%xmm3 + vmovdqa 80(%esp),%xmm1 + shrdl $7,%ebx,%ebx + xorl %edx,%ebp + vpxor %xmm2,%xmm7,%xmm7 + movl %eax,%esi + addl 60(%esp),%edi + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpxor %xmm3,%xmm7,%xmm7 + addl %ebp,%edi + andl %ebx,%esi + vmovdqa 112(%esp),%xmm3 + xorl %ecx,%ebx + addl %eax,%edi + vpalignr 
$8,%xmm6,%xmm7,%xmm2 + vpxor %xmm4,%xmm0,%xmm0 + shrdl $7,%eax,%eax + xorl %ecx,%esi + movl %edi,%ebp + addl (%esp),%edx + vpxor %xmm1,%xmm0,%xmm0 + vmovdqa %xmm4,80(%esp) + xorl %ebx,%eax + shldl $5,%edi,%edi + vmovdqa %xmm3,%xmm4 + vpaddd %xmm7,%xmm3,%xmm3 + addl %esi,%edx + andl %eax,%ebp + vpxor %xmm2,%xmm0,%xmm0 + xorl %ebx,%eax + addl %edi,%edx + shrdl $7,%edi,%edi + xorl %ebx,%ebp + vpsrld $30,%xmm0,%xmm2 + vmovdqa %xmm3,48(%esp) + movl %edx,%esi + addl 4(%esp),%ecx + xorl %eax,%edi + shldl $5,%edx,%edx + vpslld $2,%xmm0,%xmm0 + addl %ebp,%ecx + andl %edi,%esi + xorl %eax,%edi + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + movl %ecx,%ebp + addl 8(%esp),%ebx + vpor %xmm2,%xmm0,%xmm0 + xorl %edi,%edx + shldl $5,%ecx,%ecx + vmovdqa 96(%esp),%xmm2 + addl %esi,%ebx + andl %edx,%ebp + xorl %edi,%edx + addl %ecx,%ebx + addl 12(%esp),%eax + xorl %edi,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm7,%xmm0,%xmm3 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm5,96(%esp) + addl %esi,%edi + xorl %ecx,%ebp + vmovdqa %xmm4,%xmm5 + vpaddd %xmm0,%xmm4,%xmm4 + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpxor %xmm3,%xmm1,%xmm1 + addl 20(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + shldl $5,%edi,%edi + vpsrld $30,%xmm1,%xmm3 + vmovdqa %xmm4,(%esp) + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + vpslld $2,%xmm1,%xmm1 + addl 24(%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%ebp + shrdl $7,%edi,%edi + addl %edx,%ecx + vpor %xmm3,%xmm1,%xmm1 + addl 28(%esp),%ebx + xorl %edi,%ebp + vmovdqa 64(%esp),%xmm3 + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpalignr $8,%xmm0,%xmm1,%xmm4 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + 
shldl $5,%ebx,%ebx + vpxor %xmm3,%xmm2,%xmm2 + vmovdqa %xmm6,64(%esp) + addl %esi,%eax + xorl %edx,%ebp + vmovdqa 128(%esp),%xmm6 + vpaddd %xmm1,%xmm5,%xmm5 + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpxor %xmm4,%xmm2,%xmm2 + addl 36(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + shldl $5,%eax,%eax + vpsrld $30,%xmm2,%xmm4 + vmovdqa %xmm5,16(%esp) + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpslld $2,%xmm2,%xmm2 + addl 40(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + shldl $5,%edi,%edi + addl %esi,%edx + xorl %ebx,%ebp + shrdl $7,%eax,%eax + addl %edi,%edx + vpor %xmm4,%xmm2,%xmm2 + addl 44(%esp),%ecx + xorl %eax,%ebp + vmovdqa 80(%esp),%xmm4 + movl %edx,%esi + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + vpalignr $8,%xmm1,%xmm2,%xmm5 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + vpxor %xmm4,%xmm3,%xmm3 + vmovdqa %xmm7,80(%esp) + addl %esi,%ebx + xorl %edi,%ebp + vmovdqa %xmm6,%xmm7 + vpaddd %xmm2,%xmm6,%xmm6 + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpxor %xmm5,%xmm3,%xmm3 + addl 52(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + vpsrld $30,%xmm3,%xmm5 + vmovdqa %xmm6,32(%esp) + addl %ebp,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpslld $2,%xmm3,%xmm3 + addl 56(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + addl %esi,%edi + xorl %ecx,%ebp + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpor %xmm5,%xmm3,%xmm3 + addl 60(%esp),%edx + xorl %ebx,%ebp + vmovdqa 96(%esp),%xmm5 + movl %edi,%esi + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + vpalignr $8,%xmm2,%xmm3,%xmm6 + vpxor %xmm0,%xmm4,%xmm4 + addl (%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + shldl $5,%edx,%edx + vpxor %xmm5,%xmm4,%xmm4 + vmovdqa %xmm0,96(%esp) + addl %esi,%ecx + xorl %eax,%ebp + vmovdqa %xmm7,%xmm0 + vpaddd %xmm3,%xmm7,%xmm7 + shrdl $7,%edi,%edi + addl %edx,%ecx + 
vpxor %xmm6,%xmm4,%xmm4 + addl 4(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + shldl $5,%ecx,%ecx + vpsrld $30,%xmm4,%xmm6 + vmovdqa %xmm7,48(%esp) + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpslld $2,%xmm4,%xmm4 + addl 8(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%ebp + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpor %xmm6,%xmm4,%xmm4 + addl 12(%esp),%edi + xorl %ecx,%ebp + vmovdqa 64(%esp),%xmm6 + movl %eax,%esi + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpalignr $8,%xmm3,%xmm4,%xmm7 + vpxor %xmm1,%xmm5,%xmm5 + addl 16(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + shldl $5,%edi,%edi + vpxor %xmm6,%xmm5,%xmm5 + vmovdqa %xmm1,64(%esp) + addl %esi,%edx + xorl %ebx,%ebp + vmovdqa %xmm0,%xmm1 + vpaddd %xmm4,%xmm0,%xmm0 + shrdl $7,%eax,%eax + addl %edi,%edx + vpxor %xmm7,%xmm5,%xmm5 + addl 20(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + shldl $5,%edx,%edx + vpsrld $30,%xmm5,%xmm7 + vmovdqa %xmm0,(%esp) + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + vpslld $2,%xmm5,%xmm5 + addl 24(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpor %xmm7,%xmm5,%xmm5 + addl 28(%esp),%eax + vmovdqa 80(%esp),%xmm7 + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%ebp + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm4,%xmm5,%xmm0 + vpxor %xmm2,%xmm6,%xmm6 + addl 32(%esp),%edi + andl %ecx,%esi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + vpxor %xmm7,%xmm6,%xmm6 + vmovdqa %xmm2,80(%esp) + movl %eax,%ebp + xorl %ecx,%esi + vmovdqa %xmm1,%xmm2 + vpaddd %xmm5,%xmm1,%xmm1 + shldl $5,%eax,%eax + addl %esi,%edi + vpxor %xmm0,%xmm6,%xmm6 + xorl %ebx,%ebp + xorl %ecx,%ebx + addl %eax,%edi + addl 36(%esp),%edx + vpsrld $30,%xmm6,%xmm0 + vmovdqa %xmm1,16(%esp) + andl %ebx,%ebp + xorl %ecx,%ebx + 
shrdl $7,%eax,%eax + movl %edi,%esi + vpslld $2,%xmm6,%xmm6 + xorl %ebx,%ebp + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %edi,%edx + addl 40(%esp),%ecx + andl %eax,%esi + vpor %xmm0,%xmm6,%xmm6 + xorl %ebx,%eax + shrdl $7,%edi,%edi + vmovdqa 96(%esp),%xmm0 + movl %edx,%ebp + xorl %eax,%esi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %edi,%ebp + xorl %eax,%edi + addl %edx,%ecx + addl 44(%esp),%ebx + andl %edi,%ebp + xorl %eax,%edi + shrdl $7,%edx,%edx + movl %ecx,%esi + xorl %edi,%ebp + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edx,%esi + xorl %edi,%edx + addl %ecx,%ebx + vpalignr $8,%xmm5,%xmm6,%xmm1 + vpxor %xmm3,%xmm7,%xmm7 + addl 48(%esp),%eax + andl %edx,%esi + xorl %edi,%edx + shrdl $7,%ecx,%ecx + vpxor %xmm0,%xmm7,%xmm7 + vmovdqa %xmm3,96(%esp) + movl %ebx,%ebp + xorl %edx,%esi + vmovdqa 144(%esp),%xmm3 + vpaddd %xmm6,%xmm2,%xmm2 + shldl $5,%ebx,%ebx + addl %esi,%eax + vpxor %xmm1,%xmm7,%xmm7 + xorl %ecx,%ebp + xorl %edx,%ecx + addl %ebx,%eax + addl 52(%esp),%edi + vpsrld $30,%xmm7,%xmm1 + vmovdqa %xmm2,32(%esp) + andl %ecx,%ebp + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + vpslld $2,%xmm7,%xmm7 + xorl %ecx,%ebp + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%edi + addl 56(%esp),%edx + andl %ebx,%esi + vpor %xmm1,%xmm7,%xmm7 + xorl %ecx,%ebx + shrdl $7,%eax,%eax + vmovdqa 64(%esp),%xmm1 + movl %edi,%ebp + xorl %ebx,%esi + shldl $5,%edi,%edi + addl %esi,%edx + xorl %eax,%ebp + xorl %ebx,%eax + addl %edi,%edx + addl 60(%esp),%ecx + andl %eax,%ebp + xorl %ebx,%eax + shrdl $7,%edi,%edi + movl %edx,%esi + xorl %eax,%ebp + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %edi,%esi + xorl %eax,%edi + addl %edx,%ecx + vpalignr $8,%xmm6,%xmm7,%xmm2 + vpxor %xmm4,%xmm0,%xmm0 + addl (%esp),%ebx + andl %edi,%esi + xorl %eax,%edi + shrdl $7,%edx,%edx + vpxor %xmm1,%xmm0,%xmm0 + vmovdqa %xmm4,64(%esp) + movl %ecx,%ebp + xorl %edi,%esi + vmovdqa %xmm3,%xmm4 + vpaddd %xmm7,%xmm3,%xmm3 + shldl 
$5,%ecx,%ecx + addl %esi,%ebx + vpxor %xmm2,%xmm0,%xmm0 + xorl %edx,%ebp + xorl %edi,%edx + addl %ecx,%ebx + addl 4(%esp),%eax + vpsrld $30,%xmm0,%xmm2 + vmovdqa %xmm3,48(%esp) + andl %edx,%ebp + xorl %edi,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + vpslld $2,%xmm0,%xmm0 + xorl %edx,%ebp + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + addl 8(%esp),%edi + andl %ecx,%esi + vpor %xmm2,%xmm0,%xmm0 + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + vmovdqa 80(%esp),%xmm2 + movl %eax,%ebp + xorl %ecx,%esi + shldl $5,%eax,%eax + addl %esi,%edi + xorl %ebx,%ebp + xorl %ecx,%ebx + addl %eax,%edi + addl 12(%esp),%edx + andl %ebx,%ebp + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %edi,%esi + xorl %ebx,%ebp + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %edi,%edx + vpalignr $8,%xmm7,%xmm0,%xmm3 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%esp),%ecx + andl %eax,%esi + xorl %ebx,%eax + shrdl $7,%edi,%edi + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm5,80(%esp) + movl %edx,%ebp + xorl %eax,%esi + vmovdqa %xmm4,%xmm5 + vpaddd %xmm0,%xmm4,%xmm4 + shldl $5,%edx,%edx + addl %esi,%ecx + vpxor %xmm3,%xmm1,%xmm1 + xorl %edi,%ebp + xorl %eax,%edi + addl %edx,%ecx + addl 20(%esp),%ebx + vpsrld $30,%xmm1,%xmm3 + vmovdqa %xmm4,(%esp) + andl %edi,%ebp + xorl %eax,%edi + shrdl $7,%edx,%edx + movl %ecx,%esi + vpslld $2,%xmm1,%xmm1 + xorl %edi,%ebp + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edx,%esi + xorl %edi,%edx + addl %ecx,%ebx + addl 24(%esp),%eax + andl %edx,%esi + vpor %xmm3,%xmm1,%xmm1 + xorl %edi,%edx + shrdl $7,%ecx,%ecx + vmovdqa 96(%esp),%xmm3 + movl %ebx,%ebp + xorl %edx,%esi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %ecx,%ebp + xorl %edx,%ecx + addl %ebx,%eax + addl 28(%esp),%edi + andl %ecx,%ebp + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + xorl %ecx,%ebp + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%edi + vpalignr $8,%xmm0,%xmm1,%xmm4 + vpxor %xmm6,%xmm2,%xmm2 + addl 
32(%esp),%edx + andl %ebx,%esi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + vpxor %xmm3,%xmm2,%xmm2 + vmovdqa %xmm6,96(%esp) + movl %edi,%ebp + xorl %ebx,%esi + vmovdqa %xmm5,%xmm6 + vpaddd %xmm1,%xmm5,%xmm5 + shldl $5,%edi,%edi + addl %esi,%edx + vpxor %xmm4,%xmm2,%xmm2 + xorl %eax,%ebp + xorl %ebx,%eax + addl %edi,%edx + addl 36(%esp),%ecx + vpsrld $30,%xmm2,%xmm4 + vmovdqa %xmm5,16(%esp) + andl %eax,%ebp + xorl %ebx,%eax + shrdl $7,%edi,%edi + movl %edx,%esi + vpslld $2,%xmm2,%xmm2 + xorl %eax,%ebp + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %edi,%esi + xorl %eax,%edi + addl %edx,%ecx + addl 40(%esp),%ebx + andl %edi,%esi + vpor %xmm4,%xmm2,%xmm2 + xorl %eax,%edi + shrdl $7,%edx,%edx + vmovdqa 64(%esp),%xmm4 + movl %ecx,%ebp + xorl %edi,%esi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edx,%ebp + xorl %edi,%edx + addl %ecx,%ebx + addl 44(%esp),%eax + andl %edx,%ebp + xorl %edi,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%ebp + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %edx,%esi + addl %ebx,%eax + vpalignr $8,%xmm1,%xmm2,%xmm5 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + vpxor %xmm4,%xmm3,%xmm3 + vmovdqa %xmm7,64(%esp) + addl %esi,%edi + xorl %ecx,%ebp + vmovdqa %xmm6,%xmm7 + vpaddd %xmm2,%xmm6,%xmm6 + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpxor %xmm5,%xmm3,%xmm3 + addl 52(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + shldl $5,%edi,%edi + vpsrld $30,%xmm3,%xmm5 + vmovdqa %xmm6,32(%esp) + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + vpslld $2,%xmm3,%xmm3 + addl 56(%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%ebp + shrdl $7,%edi,%edi + addl %edx,%ecx + vpor %xmm5,%xmm3,%xmm3 + addl 60(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl (%esp),%eax + vpaddd %xmm3,%xmm7,%xmm7 + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + 
addl %esi,%eax + vmovdqa %xmm7,48(%esp) + xorl %edx,%ebp + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 4(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 8(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + shldl $5,%edi,%edi + addl %esi,%edx + xorl %ebx,%ebp + shrdl $7,%eax,%eax + addl %edi,%edx + addl 12(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + movl 196(%esp),%ebp + cmpl 200(%esp),%ebp + je L006done + vmovdqa 160(%esp),%xmm7 + vmovdqa 176(%esp),%xmm6 + vmovdqu (%ebp),%xmm0 + vmovdqu 16(%ebp),%xmm1 + vmovdqu 32(%ebp),%xmm2 + vmovdqu 48(%ebp),%xmm3 + addl $64,%ebp + vpshufb %xmm6,%xmm0,%xmm0 + movl %ebp,196(%esp) + vmovdqa %xmm7,96(%esp) + addl 16(%esp),%ebx + xorl %edi,%esi + vpshufb %xmm6,%xmm1,%xmm1 + movl %ecx,%ebp + shldl $5,%ecx,%ecx + vpaddd %xmm7,%xmm0,%xmm4 + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + vmovdqa %xmm4,(%esp) + addl 20(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + addl %esi,%edi + xorl %ecx,%ebp + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 28(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + addl 32(%esp),%ecx + xorl %eax,%esi + vpshufb %xmm6,%xmm2,%xmm2 + movl %edx,%ebp + shldl $5,%edx,%edx + vpaddd %xmm7,%xmm1,%xmm5 + addl %esi,%ecx + xorl %eax,%ebp + shrdl $7,%edi,%edi + addl %edx,%ecx + vmovdqa %xmm5,16(%esp) + addl 36(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%ebp + shrdl 
$7,%ecx,%ecx + addl %ebx,%eax + addl 44(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 48(%esp),%edx + xorl %ebx,%esi + vpshufb %xmm6,%xmm3,%xmm3 + movl %edi,%ebp + shldl $5,%edi,%edi + vpaddd %xmm7,%xmm2,%xmm6 + addl %esi,%edx + xorl %ebx,%ebp + shrdl $7,%eax,%eax + addl %edi,%edx + vmovdqa %xmm6,32(%esp) + addl 52(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + addl 56(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + movl 192(%esp),%ebp + addl (%ebp),%eax + addl 4(%ebp),%esi + addl 8(%ebp),%ecx + movl %eax,(%ebp) + addl 12(%ebp),%edx + movl %esi,4(%ebp) + addl 16(%ebp),%edi + movl %ecx,%ebx + movl %ecx,8(%ebp) + xorl %edx,%ebx + movl %edx,12(%ebp) + movl %edi,16(%ebp) + movl %esi,%ebp + andl %ebx,%esi + movl %ebp,%ebx + jmp L005loop +.align 4,0x90 +L006done: + addl 16(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 20(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + addl %esi,%edi + xorl %ecx,%ebp + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 28(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + addl 32(%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%ebp + shrdl $7,%edi,%edi + addl %edx,%ecx + addl 36(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + shldl $5,%ecx,%ecx 
+ addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%ebp + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 48(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + shldl $5,%edi,%edi + addl %esi,%edx + xorl %ebx,%ebp + shrdl $7,%eax,%eax + addl %edi,%edx + addl 52(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + addl 56(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vzeroall + movl 192(%esp),%ebp + addl (%ebp),%eax + movl 204(%esp),%esp + addl 4(%ebp),%esi + addl 8(%ebp),%ecx + movl %eax,(%ebp) + addl 12(%ebp),%edx + movl %esi,4(%ebp) + addl 16(%ebp),%edi + movl %ecx,8(%ebp) + movl %edx,12(%ebp) + movl %edi,16(%ebp) + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 6,0x90 +LK_XX_XX: +.long 1518500249,1518500249,1518500249,1518500249 +.long 1859775393,1859775393,1859775393,1859775393 +.long 2400959708,2400959708,2400959708,2400959708 +.long 3395469782,3395469782,3395469782,3395469782 +.long 66051,67438087,134810123,202182159 +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115 +.byte 102,111,114,109,32,102,111,114,32,120,56,54,44,32,67,82 +.byte 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112 +.byte 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) diff --git a/third_party/boringssl/gen/bcm/sha1-586-linux.S 
b/third_party/boringssl/gen/bcm/sha1-586-linux.S new file mode 100644 index 00000000..3d8d2137 --- /dev/null +++ b/third_party/boringssl/gen/bcm/sha1-586-linux.S @@ -0,0 +1,3788 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) +.text +.globl sha1_block_data_order_nohw +.hidden sha1_block_data_order_nohw +.type sha1_block_data_order_nohw,@function +.align 16 +sha1_block_data_order_nohw: +.L_sha1_block_data_order_nohw_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%ebp + movl 24(%esp),%esi + movl 28(%esp),%eax + subl $76,%esp + shll $6,%eax + addl %esi,%eax + movl %eax,104(%esp) + movl 16(%ebp),%edi + jmp .L000loop +.align 16 +.L000loop: + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + bswap %eax + bswap %ebx + bswap %ecx + bswap %edx + movl %eax,(%esp) + movl %ebx,4(%esp) + movl %ecx,8(%esp) + movl %edx,12(%esp) + movl 16(%esi),%eax + movl 20(%esi),%ebx + movl 24(%esi),%ecx + movl 28(%esi),%edx + bswap %eax + bswap %ebx + bswap %ecx + bswap %edx + movl %eax,16(%esp) + movl %ebx,20(%esp) + movl %ecx,24(%esp) + movl %edx,28(%esp) + movl 32(%esi),%eax + movl 36(%esi),%ebx + movl 40(%esi),%ecx + movl 44(%esi),%edx + bswap %eax + bswap %ebx + bswap %ecx + bswap %edx + movl %eax,32(%esp) + movl %ebx,36(%esp) + movl %ecx,40(%esp) + movl %edx,44(%esp) + movl 48(%esi),%eax + movl 52(%esi),%ebx + movl 56(%esi),%ecx + movl 60(%esi),%edx + bswap %eax + bswap %ebx + bswap %ecx + bswap %edx + movl %eax,48(%esp) + movl %ebx,52(%esp) + movl %ecx,56(%esp) + movl %edx,60(%esp) + movl %esi,100(%esp) + movl (%ebp),%eax + movl 4(%ebp),%ebx + movl 8(%ebp),%ecx + movl 12(%ebp),%edx + + movl %ecx,%esi + movl %eax,%ebp + roll $5,%ebp + xorl %edx,%esi + addl %edi,%ebp + movl (%esp),%edi + andl %ebx,%esi + rorl $2,%ebx + xorl %edx,%esi + leal 1518500249(%ebp,%edi,1),%ebp + 
addl %esi,%ebp + + movl %ebx,%edi + movl %ebp,%esi + roll $5,%ebp + xorl %ecx,%edi + addl %edx,%ebp + movl 4(%esp),%edx + andl %eax,%edi + rorl $2,%eax + xorl %ecx,%edi + leal 1518500249(%ebp,%edx,1),%ebp + addl %edi,%ebp + + movl %eax,%edx + movl %ebp,%edi + roll $5,%ebp + xorl %ebx,%edx + addl %ecx,%ebp + movl 8(%esp),%ecx + andl %esi,%edx + rorl $2,%esi + xorl %ebx,%edx + leal 1518500249(%ebp,%ecx,1),%ebp + addl %edx,%ebp + + movl %esi,%ecx + movl %ebp,%edx + roll $5,%ebp + xorl %eax,%ecx + addl %ebx,%ebp + movl 12(%esp),%ebx + andl %edi,%ecx + rorl $2,%edi + xorl %eax,%ecx + leal 1518500249(%ebp,%ebx,1),%ebp + addl %ecx,%ebp + + movl %edi,%ebx + movl %ebp,%ecx + roll $5,%ebp + xorl %esi,%ebx + addl %eax,%ebp + movl 16(%esp),%eax + andl %edx,%ebx + rorl $2,%edx + xorl %esi,%ebx + leal 1518500249(%ebp,%eax,1),%ebp + addl %ebx,%ebp + + movl %edx,%eax + movl %ebp,%ebx + roll $5,%ebp + xorl %edi,%eax + addl %esi,%ebp + movl 20(%esp),%esi + andl %ecx,%eax + rorl $2,%ecx + xorl %edi,%eax + leal 1518500249(%ebp,%esi,1),%ebp + addl %eax,%ebp + + movl %ecx,%esi + movl %ebp,%eax + roll $5,%ebp + xorl %edx,%esi + addl %edi,%ebp + movl 24(%esp),%edi + andl %ebx,%esi + rorl $2,%ebx + xorl %edx,%esi + leal 1518500249(%ebp,%edi,1),%ebp + addl %esi,%ebp + + movl %ebx,%edi + movl %ebp,%esi + roll $5,%ebp + xorl %ecx,%edi + addl %edx,%ebp + movl 28(%esp),%edx + andl %eax,%edi + rorl $2,%eax + xorl %ecx,%edi + leal 1518500249(%ebp,%edx,1),%ebp + addl %edi,%ebp + + movl %eax,%edx + movl %ebp,%edi + roll $5,%ebp + xorl %ebx,%edx + addl %ecx,%ebp + movl 32(%esp),%ecx + andl %esi,%edx + rorl $2,%esi + xorl %ebx,%edx + leal 1518500249(%ebp,%ecx,1),%ebp + addl %edx,%ebp + + movl %esi,%ecx + movl %ebp,%edx + roll $5,%ebp + xorl %eax,%ecx + addl %ebx,%ebp + movl 36(%esp),%ebx + andl %edi,%ecx + rorl $2,%edi + xorl %eax,%ecx + leal 1518500249(%ebp,%ebx,1),%ebp + addl %ecx,%ebp + + movl %edi,%ebx + movl %ebp,%ecx + roll $5,%ebp + xorl %esi,%ebx + addl %eax,%ebp + movl 40(%esp),%eax + andl 
%edx,%ebx + rorl $2,%edx + xorl %esi,%ebx + leal 1518500249(%ebp,%eax,1),%ebp + addl %ebx,%ebp + + movl %edx,%eax + movl %ebp,%ebx + roll $5,%ebp + xorl %edi,%eax + addl %esi,%ebp + movl 44(%esp),%esi + andl %ecx,%eax + rorl $2,%ecx + xorl %edi,%eax + leal 1518500249(%ebp,%esi,1),%ebp + addl %eax,%ebp + + movl %ecx,%esi + movl %ebp,%eax + roll $5,%ebp + xorl %edx,%esi + addl %edi,%ebp + movl 48(%esp),%edi + andl %ebx,%esi + rorl $2,%ebx + xorl %edx,%esi + leal 1518500249(%ebp,%edi,1),%ebp + addl %esi,%ebp + + movl %ebx,%edi + movl %ebp,%esi + roll $5,%ebp + xorl %ecx,%edi + addl %edx,%ebp + movl 52(%esp),%edx + andl %eax,%edi + rorl $2,%eax + xorl %ecx,%edi + leal 1518500249(%ebp,%edx,1),%ebp + addl %edi,%ebp + + movl %eax,%edx + movl %ebp,%edi + roll $5,%ebp + xorl %ebx,%edx + addl %ecx,%ebp + movl 56(%esp),%ecx + andl %esi,%edx + rorl $2,%esi + xorl %ebx,%edx + leal 1518500249(%ebp,%ecx,1),%ebp + addl %edx,%ebp + + movl %esi,%ecx + movl %ebp,%edx + roll $5,%ebp + xorl %eax,%ecx + addl %ebx,%ebp + movl 60(%esp),%ebx + andl %edi,%ecx + rorl $2,%edi + xorl %eax,%ecx + leal 1518500249(%ebp,%ebx,1),%ebp + movl (%esp),%ebx + addl %ebp,%ecx + + movl %edi,%ebp + xorl 8(%esp),%ebx + xorl %esi,%ebp + xorl 32(%esp),%ebx + andl %edx,%ebp + xorl 52(%esp),%ebx + roll $1,%ebx + xorl %esi,%ebp + addl %ebp,%eax + movl %ecx,%ebp + rorl $2,%edx + movl %ebx,(%esp) + roll $5,%ebp + leal 1518500249(%ebx,%eax,1),%ebx + movl 4(%esp),%eax + addl %ebp,%ebx + + movl %edx,%ebp + xorl 12(%esp),%eax + xorl %edi,%ebp + xorl 36(%esp),%eax + andl %ecx,%ebp + xorl 56(%esp),%eax + roll $1,%eax + xorl %edi,%ebp + addl %ebp,%esi + movl %ebx,%ebp + rorl $2,%ecx + movl %eax,4(%esp) + roll $5,%ebp + leal 1518500249(%eax,%esi,1),%eax + movl 8(%esp),%esi + addl %ebp,%eax + + movl %ecx,%ebp + xorl 16(%esp),%esi + xorl %edx,%ebp + xorl 40(%esp),%esi + andl %ebx,%ebp + xorl 60(%esp),%esi + roll $1,%esi + xorl %edx,%ebp + addl %ebp,%edi + movl %eax,%ebp + rorl $2,%ebx + movl %esi,8(%esp) + roll $5,%ebp + 
leal 1518500249(%esi,%edi,1),%esi + movl 12(%esp),%edi + addl %ebp,%esi + + movl %ebx,%ebp + xorl 20(%esp),%edi + xorl %ecx,%ebp + xorl 44(%esp),%edi + andl %eax,%ebp + xorl (%esp),%edi + roll $1,%edi + xorl %ecx,%ebp + addl %ebp,%edx + movl %esi,%ebp + rorl $2,%eax + movl %edi,12(%esp) + roll $5,%ebp + leal 1518500249(%edi,%edx,1),%edi + movl 16(%esp),%edx + addl %ebp,%edi + + movl %esi,%ebp + xorl 24(%esp),%edx + xorl %eax,%ebp + xorl 48(%esp),%edx + xorl %ebx,%ebp + xorl 4(%esp),%edx + roll $1,%edx + addl %ebp,%ecx + rorl $2,%esi + movl %edi,%ebp + roll $5,%ebp + movl %edx,16(%esp) + leal 1859775393(%edx,%ecx,1),%edx + movl 20(%esp),%ecx + addl %ebp,%edx + + movl %edi,%ebp + xorl 28(%esp),%ecx + xorl %esi,%ebp + xorl 52(%esp),%ecx + xorl %eax,%ebp + xorl 8(%esp),%ecx + roll $1,%ecx + addl %ebp,%ebx + rorl $2,%edi + movl %edx,%ebp + roll $5,%ebp + movl %ecx,20(%esp) + leal 1859775393(%ecx,%ebx,1),%ecx + movl 24(%esp),%ebx + addl %ebp,%ecx + + movl %edx,%ebp + xorl 32(%esp),%ebx + xorl %edi,%ebp + xorl 56(%esp),%ebx + xorl %esi,%ebp + xorl 12(%esp),%ebx + roll $1,%ebx + addl %ebp,%eax + rorl $2,%edx + movl %ecx,%ebp + roll $5,%ebp + movl %ebx,24(%esp) + leal 1859775393(%ebx,%eax,1),%ebx + movl 28(%esp),%eax + addl %ebp,%ebx + + movl %ecx,%ebp + xorl 36(%esp),%eax + xorl %edx,%ebp + xorl 60(%esp),%eax + xorl %edi,%ebp + xorl 16(%esp),%eax + roll $1,%eax + addl %ebp,%esi + rorl $2,%ecx + movl %ebx,%ebp + roll $5,%ebp + movl %eax,28(%esp) + leal 1859775393(%eax,%esi,1),%eax + movl 32(%esp),%esi + addl %ebp,%eax + + movl %ebx,%ebp + xorl 40(%esp),%esi + xorl %ecx,%ebp + xorl (%esp),%esi + xorl %edx,%ebp + xorl 20(%esp),%esi + roll $1,%esi + addl %ebp,%edi + rorl $2,%ebx + movl %eax,%ebp + roll $5,%ebp + movl %esi,32(%esp) + leal 1859775393(%esi,%edi,1),%esi + movl 36(%esp),%edi + addl %ebp,%esi + + movl %eax,%ebp + xorl 44(%esp),%edi + xorl %ebx,%ebp + xorl 4(%esp),%edi + xorl %ecx,%ebp + xorl 24(%esp),%edi + roll $1,%edi + addl %ebp,%edx + rorl $2,%eax + movl 
%esi,%ebp + roll $5,%ebp + movl %edi,36(%esp) + leal 1859775393(%edi,%edx,1),%edi + movl 40(%esp),%edx + addl %ebp,%edi + + movl %esi,%ebp + xorl 48(%esp),%edx + xorl %eax,%ebp + xorl 8(%esp),%edx + xorl %ebx,%ebp + xorl 28(%esp),%edx + roll $1,%edx + addl %ebp,%ecx + rorl $2,%esi + movl %edi,%ebp + roll $5,%ebp + movl %edx,40(%esp) + leal 1859775393(%edx,%ecx,1),%edx + movl 44(%esp),%ecx + addl %ebp,%edx + + movl %edi,%ebp + xorl 52(%esp),%ecx + xorl %esi,%ebp + xorl 12(%esp),%ecx + xorl %eax,%ebp + xorl 32(%esp),%ecx + roll $1,%ecx + addl %ebp,%ebx + rorl $2,%edi + movl %edx,%ebp + roll $5,%ebp + movl %ecx,44(%esp) + leal 1859775393(%ecx,%ebx,1),%ecx + movl 48(%esp),%ebx + addl %ebp,%ecx + + movl %edx,%ebp + xorl 56(%esp),%ebx + xorl %edi,%ebp + xorl 16(%esp),%ebx + xorl %esi,%ebp + xorl 36(%esp),%ebx + roll $1,%ebx + addl %ebp,%eax + rorl $2,%edx + movl %ecx,%ebp + roll $5,%ebp + movl %ebx,48(%esp) + leal 1859775393(%ebx,%eax,1),%ebx + movl 52(%esp),%eax + addl %ebp,%ebx + + movl %ecx,%ebp + xorl 60(%esp),%eax + xorl %edx,%ebp + xorl 20(%esp),%eax + xorl %edi,%ebp + xorl 40(%esp),%eax + roll $1,%eax + addl %ebp,%esi + rorl $2,%ecx + movl %ebx,%ebp + roll $5,%ebp + movl %eax,52(%esp) + leal 1859775393(%eax,%esi,1),%eax + movl 56(%esp),%esi + addl %ebp,%eax + + movl %ebx,%ebp + xorl (%esp),%esi + xorl %ecx,%ebp + xorl 24(%esp),%esi + xorl %edx,%ebp + xorl 44(%esp),%esi + roll $1,%esi + addl %ebp,%edi + rorl $2,%ebx + movl %eax,%ebp + roll $5,%ebp + movl %esi,56(%esp) + leal 1859775393(%esi,%edi,1),%esi + movl 60(%esp),%edi + addl %ebp,%esi + + movl %eax,%ebp + xorl 4(%esp),%edi + xorl %ebx,%ebp + xorl 28(%esp),%edi + xorl %ecx,%ebp + xorl 48(%esp),%edi + roll $1,%edi + addl %ebp,%edx + rorl $2,%eax + movl %esi,%ebp + roll $5,%ebp + movl %edi,60(%esp) + leal 1859775393(%edi,%edx,1),%edi + movl (%esp),%edx + addl %ebp,%edi + + movl %esi,%ebp + xorl 8(%esp),%edx + xorl %eax,%ebp + xorl 32(%esp),%edx + xorl %ebx,%ebp + xorl 52(%esp),%edx + roll $1,%edx + addl 
%ebp,%ecx + rorl $2,%esi + movl %edi,%ebp + roll $5,%ebp + movl %edx,(%esp) + leal 1859775393(%edx,%ecx,1),%edx + movl 4(%esp),%ecx + addl %ebp,%edx + + movl %edi,%ebp + xorl 12(%esp),%ecx + xorl %esi,%ebp + xorl 36(%esp),%ecx + xorl %eax,%ebp + xorl 56(%esp),%ecx + roll $1,%ecx + addl %ebp,%ebx + rorl $2,%edi + movl %edx,%ebp + roll $5,%ebp + movl %ecx,4(%esp) + leal 1859775393(%ecx,%ebx,1),%ecx + movl 8(%esp),%ebx + addl %ebp,%ecx + + movl %edx,%ebp + xorl 16(%esp),%ebx + xorl %edi,%ebp + xorl 40(%esp),%ebx + xorl %esi,%ebp + xorl 60(%esp),%ebx + roll $1,%ebx + addl %ebp,%eax + rorl $2,%edx + movl %ecx,%ebp + roll $5,%ebp + movl %ebx,8(%esp) + leal 1859775393(%ebx,%eax,1),%ebx + movl 12(%esp),%eax + addl %ebp,%ebx + + movl %ecx,%ebp + xorl 20(%esp),%eax + xorl %edx,%ebp + xorl 44(%esp),%eax + xorl %edi,%ebp + xorl (%esp),%eax + roll $1,%eax + addl %ebp,%esi + rorl $2,%ecx + movl %ebx,%ebp + roll $5,%ebp + movl %eax,12(%esp) + leal 1859775393(%eax,%esi,1),%eax + movl 16(%esp),%esi + addl %ebp,%eax + + movl %ebx,%ebp + xorl 24(%esp),%esi + xorl %ecx,%ebp + xorl 48(%esp),%esi + xorl %edx,%ebp + xorl 4(%esp),%esi + roll $1,%esi + addl %ebp,%edi + rorl $2,%ebx + movl %eax,%ebp + roll $5,%ebp + movl %esi,16(%esp) + leal 1859775393(%esi,%edi,1),%esi + movl 20(%esp),%edi + addl %ebp,%esi + + movl %eax,%ebp + xorl 28(%esp),%edi + xorl %ebx,%ebp + xorl 52(%esp),%edi + xorl %ecx,%ebp + xorl 8(%esp),%edi + roll $1,%edi + addl %ebp,%edx + rorl $2,%eax + movl %esi,%ebp + roll $5,%ebp + movl %edi,20(%esp) + leal 1859775393(%edi,%edx,1),%edi + movl 24(%esp),%edx + addl %ebp,%edi + + movl %esi,%ebp + xorl 32(%esp),%edx + xorl %eax,%ebp + xorl 56(%esp),%edx + xorl %ebx,%ebp + xorl 12(%esp),%edx + roll $1,%edx + addl %ebp,%ecx + rorl $2,%esi + movl %edi,%ebp + roll $5,%ebp + movl %edx,24(%esp) + leal 1859775393(%edx,%ecx,1),%edx + movl 28(%esp),%ecx + addl %ebp,%edx + + movl %edi,%ebp + xorl 36(%esp),%ecx + xorl %esi,%ebp + xorl 60(%esp),%ecx + xorl %eax,%ebp + xorl 16(%esp),%ecx + 
roll $1,%ecx + addl %ebp,%ebx + rorl $2,%edi + movl %edx,%ebp + roll $5,%ebp + movl %ecx,28(%esp) + leal 1859775393(%ecx,%ebx,1),%ecx + movl 32(%esp),%ebx + addl %ebp,%ecx + + movl %edi,%ebp + xorl 40(%esp),%ebx + xorl %esi,%ebp + xorl (%esp),%ebx + andl %edx,%ebp + xorl 20(%esp),%ebx + roll $1,%ebx + addl %eax,%ebp + rorl $2,%edx + movl %ecx,%eax + roll $5,%eax + movl %ebx,32(%esp) + leal 2400959708(%ebx,%ebp,1),%ebx + movl %edi,%ebp + addl %eax,%ebx + andl %esi,%ebp + movl 36(%esp),%eax + addl %ebp,%ebx + + movl %edx,%ebp + xorl 44(%esp),%eax + xorl %edi,%ebp + xorl 4(%esp),%eax + andl %ecx,%ebp + xorl 24(%esp),%eax + roll $1,%eax + addl %esi,%ebp + rorl $2,%ecx + movl %ebx,%esi + roll $5,%esi + movl %eax,36(%esp) + leal 2400959708(%eax,%ebp,1),%eax + movl %edx,%ebp + addl %esi,%eax + andl %edi,%ebp + movl 40(%esp),%esi + addl %ebp,%eax + + movl %ecx,%ebp + xorl 48(%esp),%esi + xorl %edx,%ebp + xorl 8(%esp),%esi + andl %ebx,%ebp + xorl 28(%esp),%esi + roll $1,%esi + addl %edi,%ebp + rorl $2,%ebx + movl %eax,%edi + roll $5,%edi + movl %esi,40(%esp) + leal 2400959708(%esi,%ebp,1),%esi + movl %ecx,%ebp + addl %edi,%esi + andl %edx,%ebp + movl 44(%esp),%edi + addl %ebp,%esi + + movl %ebx,%ebp + xorl 52(%esp),%edi + xorl %ecx,%ebp + xorl 12(%esp),%edi + andl %eax,%ebp + xorl 32(%esp),%edi + roll $1,%edi + addl %edx,%ebp + rorl $2,%eax + movl %esi,%edx + roll $5,%edx + movl %edi,44(%esp) + leal 2400959708(%edi,%ebp,1),%edi + movl %ebx,%ebp + addl %edx,%edi + andl %ecx,%ebp + movl 48(%esp),%edx + addl %ebp,%edi + + movl %eax,%ebp + xorl 56(%esp),%edx + xorl %ebx,%ebp + xorl 16(%esp),%edx + andl %esi,%ebp + xorl 36(%esp),%edx + roll $1,%edx + addl %ecx,%ebp + rorl $2,%esi + movl %edi,%ecx + roll $5,%ecx + movl %edx,48(%esp) + leal 2400959708(%edx,%ebp,1),%edx + movl %eax,%ebp + addl %ecx,%edx + andl %ebx,%ebp + movl 52(%esp),%ecx + addl %ebp,%edx + + movl %esi,%ebp + xorl 60(%esp),%ecx + xorl %eax,%ebp + xorl 20(%esp),%ecx + andl %edi,%ebp + xorl 40(%esp),%ecx + roll 
$1,%ecx + addl %ebx,%ebp + rorl $2,%edi + movl %edx,%ebx + roll $5,%ebx + movl %ecx,52(%esp) + leal 2400959708(%ecx,%ebp,1),%ecx + movl %esi,%ebp + addl %ebx,%ecx + andl %eax,%ebp + movl 56(%esp),%ebx + addl %ebp,%ecx + + movl %edi,%ebp + xorl (%esp),%ebx + xorl %esi,%ebp + xorl 24(%esp),%ebx + andl %edx,%ebp + xorl 44(%esp),%ebx + roll $1,%ebx + addl %eax,%ebp + rorl $2,%edx + movl %ecx,%eax + roll $5,%eax + movl %ebx,56(%esp) + leal 2400959708(%ebx,%ebp,1),%ebx + movl %edi,%ebp + addl %eax,%ebx + andl %esi,%ebp + movl 60(%esp),%eax + addl %ebp,%ebx + + movl %edx,%ebp + xorl 4(%esp),%eax + xorl %edi,%ebp + xorl 28(%esp),%eax + andl %ecx,%ebp + xorl 48(%esp),%eax + roll $1,%eax + addl %esi,%ebp + rorl $2,%ecx + movl %ebx,%esi + roll $5,%esi + movl %eax,60(%esp) + leal 2400959708(%eax,%ebp,1),%eax + movl %edx,%ebp + addl %esi,%eax + andl %edi,%ebp + movl (%esp),%esi + addl %ebp,%eax + + movl %ecx,%ebp + xorl 8(%esp),%esi + xorl %edx,%ebp + xorl 32(%esp),%esi + andl %ebx,%ebp + xorl 52(%esp),%esi + roll $1,%esi + addl %edi,%ebp + rorl $2,%ebx + movl %eax,%edi + roll $5,%edi + movl %esi,(%esp) + leal 2400959708(%esi,%ebp,1),%esi + movl %ecx,%ebp + addl %edi,%esi + andl %edx,%ebp + movl 4(%esp),%edi + addl %ebp,%esi + + movl %ebx,%ebp + xorl 12(%esp),%edi + xorl %ecx,%ebp + xorl 36(%esp),%edi + andl %eax,%ebp + xorl 56(%esp),%edi + roll $1,%edi + addl %edx,%ebp + rorl $2,%eax + movl %esi,%edx + roll $5,%edx + movl %edi,4(%esp) + leal 2400959708(%edi,%ebp,1),%edi + movl %ebx,%ebp + addl %edx,%edi + andl %ecx,%ebp + movl 8(%esp),%edx + addl %ebp,%edi + + movl %eax,%ebp + xorl 16(%esp),%edx + xorl %ebx,%ebp + xorl 40(%esp),%edx + andl %esi,%ebp + xorl 60(%esp),%edx + roll $1,%edx + addl %ecx,%ebp + rorl $2,%esi + movl %edi,%ecx + roll $5,%ecx + movl %edx,8(%esp) + leal 2400959708(%edx,%ebp,1),%edx + movl %eax,%ebp + addl %ecx,%edx + andl %ebx,%ebp + movl 12(%esp),%ecx + addl %ebp,%edx + + movl %esi,%ebp + xorl 20(%esp),%ecx + xorl %eax,%ebp + xorl 44(%esp),%ecx + andl 
%edi,%ebp + xorl (%esp),%ecx + roll $1,%ecx + addl %ebx,%ebp + rorl $2,%edi + movl %edx,%ebx + roll $5,%ebx + movl %ecx,12(%esp) + leal 2400959708(%ecx,%ebp,1),%ecx + movl %esi,%ebp + addl %ebx,%ecx + andl %eax,%ebp + movl 16(%esp),%ebx + addl %ebp,%ecx + + movl %edi,%ebp + xorl 24(%esp),%ebx + xorl %esi,%ebp + xorl 48(%esp),%ebx + andl %edx,%ebp + xorl 4(%esp),%ebx + roll $1,%ebx + addl %eax,%ebp + rorl $2,%edx + movl %ecx,%eax + roll $5,%eax + movl %ebx,16(%esp) + leal 2400959708(%ebx,%ebp,1),%ebx + movl %edi,%ebp + addl %eax,%ebx + andl %esi,%ebp + movl 20(%esp),%eax + addl %ebp,%ebx + + movl %edx,%ebp + xorl 28(%esp),%eax + xorl %edi,%ebp + xorl 52(%esp),%eax + andl %ecx,%ebp + xorl 8(%esp),%eax + roll $1,%eax + addl %esi,%ebp + rorl $2,%ecx + movl %ebx,%esi + roll $5,%esi + movl %eax,20(%esp) + leal 2400959708(%eax,%ebp,1),%eax + movl %edx,%ebp + addl %esi,%eax + andl %edi,%ebp + movl 24(%esp),%esi + addl %ebp,%eax + + movl %ecx,%ebp + xorl 32(%esp),%esi + xorl %edx,%ebp + xorl 56(%esp),%esi + andl %ebx,%ebp + xorl 12(%esp),%esi + roll $1,%esi + addl %edi,%ebp + rorl $2,%ebx + movl %eax,%edi + roll $5,%edi + movl %esi,24(%esp) + leal 2400959708(%esi,%ebp,1),%esi + movl %ecx,%ebp + addl %edi,%esi + andl %edx,%ebp + movl 28(%esp),%edi + addl %ebp,%esi + + movl %ebx,%ebp + xorl 36(%esp),%edi + xorl %ecx,%ebp + xorl 60(%esp),%edi + andl %eax,%ebp + xorl 16(%esp),%edi + roll $1,%edi + addl %edx,%ebp + rorl $2,%eax + movl %esi,%edx + roll $5,%edx + movl %edi,28(%esp) + leal 2400959708(%edi,%ebp,1),%edi + movl %ebx,%ebp + addl %edx,%edi + andl %ecx,%ebp + movl 32(%esp),%edx + addl %ebp,%edi + + movl %eax,%ebp + xorl 40(%esp),%edx + xorl %ebx,%ebp + xorl (%esp),%edx + andl %esi,%ebp + xorl 20(%esp),%edx + roll $1,%edx + addl %ecx,%ebp + rorl $2,%esi + movl %edi,%ecx + roll $5,%ecx + movl %edx,32(%esp) + leal 2400959708(%edx,%ebp,1),%edx + movl %eax,%ebp + addl %ecx,%edx + andl %ebx,%ebp + movl 36(%esp),%ecx + addl %ebp,%edx + + movl %esi,%ebp + xorl 44(%esp),%ecx + 
xorl %eax,%ebp + xorl 4(%esp),%ecx + andl %edi,%ebp + xorl 24(%esp),%ecx + roll $1,%ecx + addl %ebx,%ebp + rorl $2,%edi + movl %edx,%ebx + roll $5,%ebx + movl %ecx,36(%esp) + leal 2400959708(%ecx,%ebp,1),%ecx + movl %esi,%ebp + addl %ebx,%ecx + andl %eax,%ebp + movl 40(%esp),%ebx + addl %ebp,%ecx + + movl %edi,%ebp + xorl 48(%esp),%ebx + xorl %esi,%ebp + xorl 8(%esp),%ebx + andl %edx,%ebp + xorl 28(%esp),%ebx + roll $1,%ebx + addl %eax,%ebp + rorl $2,%edx + movl %ecx,%eax + roll $5,%eax + movl %ebx,40(%esp) + leal 2400959708(%ebx,%ebp,1),%ebx + movl %edi,%ebp + addl %eax,%ebx + andl %esi,%ebp + movl 44(%esp),%eax + addl %ebp,%ebx + + movl %edx,%ebp + xorl 52(%esp),%eax + xorl %edi,%ebp + xorl 12(%esp),%eax + andl %ecx,%ebp + xorl 32(%esp),%eax + roll $1,%eax + addl %esi,%ebp + rorl $2,%ecx + movl %ebx,%esi + roll $5,%esi + movl %eax,44(%esp) + leal 2400959708(%eax,%ebp,1),%eax + movl %edx,%ebp + addl %esi,%eax + andl %edi,%ebp + movl 48(%esp),%esi + addl %ebp,%eax + + movl %ebx,%ebp + xorl 56(%esp),%esi + xorl %ecx,%ebp + xorl 16(%esp),%esi + xorl %edx,%ebp + xorl 36(%esp),%esi + roll $1,%esi + addl %ebp,%edi + rorl $2,%ebx + movl %eax,%ebp + roll $5,%ebp + movl %esi,48(%esp) + leal 3395469782(%esi,%edi,1),%esi + movl 52(%esp),%edi + addl %ebp,%esi + + movl %eax,%ebp + xorl 60(%esp),%edi + xorl %ebx,%ebp + xorl 20(%esp),%edi + xorl %ecx,%ebp + xorl 40(%esp),%edi + roll $1,%edi + addl %ebp,%edx + rorl $2,%eax + movl %esi,%ebp + roll $5,%ebp + movl %edi,52(%esp) + leal 3395469782(%edi,%edx,1),%edi + movl 56(%esp),%edx + addl %ebp,%edi + + movl %esi,%ebp + xorl (%esp),%edx + xorl %eax,%ebp + xorl 24(%esp),%edx + xorl %ebx,%ebp + xorl 44(%esp),%edx + roll $1,%edx + addl %ebp,%ecx + rorl $2,%esi + movl %edi,%ebp + roll $5,%ebp + movl %edx,56(%esp) + leal 3395469782(%edx,%ecx,1),%edx + movl 60(%esp),%ecx + addl %ebp,%edx + + movl %edi,%ebp + xorl 4(%esp),%ecx + xorl %esi,%ebp + xorl 28(%esp),%ecx + xorl %eax,%ebp + xorl 48(%esp),%ecx + roll $1,%ecx + addl %ebp,%ebx + 
rorl $2,%edi + movl %edx,%ebp + roll $5,%ebp + movl %ecx,60(%esp) + leal 3395469782(%ecx,%ebx,1),%ecx + movl (%esp),%ebx + addl %ebp,%ecx + + movl %edx,%ebp + xorl 8(%esp),%ebx + xorl %edi,%ebp + xorl 32(%esp),%ebx + xorl %esi,%ebp + xorl 52(%esp),%ebx + roll $1,%ebx + addl %ebp,%eax + rorl $2,%edx + movl %ecx,%ebp + roll $5,%ebp + movl %ebx,(%esp) + leal 3395469782(%ebx,%eax,1),%ebx + movl 4(%esp),%eax + addl %ebp,%ebx + + movl %ecx,%ebp + xorl 12(%esp),%eax + xorl %edx,%ebp + xorl 36(%esp),%eax + xorl %edi,%ebp + xorl 56(%esp),%eax + roll $1,%eax + addl %ebp,%esi + rorl $2,%ecx + movl %ebx,%ebp + roll $5,%ebp + movl %eax,4(%esp) + leal 3395469782(%eax,%esi,1),%eax + movl 8(%esp),%esi + addl %ebp,%eax + + movl %ebx,%ebp + xorl 16(%esp),%esi + xorl %ecx,%ebp + xorl 40(%esp),%esi + xorl %edx,%ebp + xorl 60(%esp),%esi + roll $1,%esi + addl %ebp,%edi + rorl $2,%ebx + movl %eax,%ebp + roll $5,%ebp + movl %esi,8(%esp) + leal 3395469782(%esi,%edi,1),%esi + movl 12(%esp),%edi + addl %ebp,%esi + + movl %eax,%ebp + xorl 20(%esp),%edi + xorl %ebx,%ebp + xorl 44(%esp),%edi + xorl %ecx,%ebp + xorl (%esp),%edi + roll $1,%edi + addl %ebp,%edx + rorl $2,%eax + movl %esi,%ebp + roll $5,%ebp + movl %edi,12(%esp) + leal 3395469782(%edi,%edx,1),%edi + movl 16(%esp),%edx + addl %ebp,%edi + + movl %esi,%ebp + xorl 24(%esp),%edx + xorl %eax,%ebp + xorl 48(%esp),%edx + xorl %ebx,%ebp + xorl 4(%esp),%edx + roll $1,%edx + addl %ebp,%ecx + rorl $2,%esi + movl %edi,%ebp + roll $5,%ebp + movl %edx,16(%esp) + leal 3395469782(%edx,%ecx,1),%edx + movl 20(%esp),%ecx + addl %ebp,%edx + + movl %edi,%ebp + xorl 28(%esp),%ecx + xorl %esi,%ebp + xorl 52(%esp),%ecx + xorl %eax,%ebp + xorl 8(%esp),%ecx + roll $1,%ecx + addl %ebp,%ebx + rorl $2,%edi + movl %edx,%ebp + roll $5,%ebp + movl %ecx,20(%esp) + leal 3395469782(%ecx,%ebx,1),%ecx + movl 24(%esp),%ebx + addl %ebp,%ecx + + movl %edx,%ebp + xorl 32(%esp),%ebx + xorl %edi,%ebp + xorl 56(%esp),%ebx + xorl %esi,%ebp + xorl 12(%esp),%ebx + roll $1,%ebx + 
addl %ebp,%eax + rorl $2,%edx + movl %ecx,%ebp + roll $5,%ebp + movl %ebx,24(%esp) + leal 3395469782(%ebx,%eax,1),%ebx + movl 28(%esp),%eax + addl %ebp,%ebx + + movl %ecx,%ebp + xorl 36(%esp),%eax + xorl %edx,%ebp + xorl 60(%esp),%eax + xorl %edi,%ebp + xorl 16(%esp),%eax + roll $1,%eax + addl %ebp,%esi + rorl $2,%ecx + movl %ebx,%ebp + roll $5,%ebp + movl %eax,28(%esp) + leal 3395469782(%eax,%esi,1),%eax + movl 32(%esp),%esi + addl %ebp,%eax + + movl %ebx,%ebp + xorl 40(%esp),%esi + xorl %ecx,%ebp + xorl (%esp),%esi + xorl %edx,%ebp + xorl 20(%esp),%esi + roll $1,%esi + addl %ebp,%edi + rorl $2,%ebx + movl %eax,%ebp + roll $5,%ebp + movl %esi,32(%esp) + leal 3395469782(%esi,%edi,1),%esi + movl 36(%esp),%edi + addl %ebp,%esi + + movl %eax,%ebp + xorl 44(%esp),%edi + xorl %ebx,%ebp + xorl 4(%esp),%edi + xorl %ecx,%ebp + xorl 24(%esp),%edi + roll $1,%edi + addl %ebp,%edx + rorl $2,%eax + movl %esi,%ebp + roll $5,%ebp + movl %edi,36(%esp) + leal 3395469782(%edi,%edx,1),%edi + movl 40(%esp),%edx + addl %ebp,%edi + + movl %esi,%ebp + xorl 48(%esp),%edx + xorl %eax,%ebp + xorl 8(%esp),%edx + xorl %ebx,%ebp + xorl 28(%esp),%edx + roll $1,%edx + addl %ebp,%ecx + rorl $2,%esi + movl %edi,%ebp + roll $5,%ebp + movl %edx,40(%esp) + leal 3395469782(%edx,%ecx,1),%edx + movl 44(%esp),%ecx + addl %ebp,%edx + + movl %edi,%ebp + xorl 52(%esp),%ecx + xorl %esi,%ebp + xorl 12(%esp),%ecx + xorl %eax,%ebp + xorl 32(%esp),%ecx + roll $1,%ecx + addl %ebp,%ebx + rorl $2,%edi + movl %edx,%ebp + roll $5,%ebp + movl %ecx,44(%esp) + leal 3395469782(%ecx,%ebx,1),%ecx + movl 48(%esp),%ebx + addl %ebp,%ecx + + movl %edx,%ebp + xorl 56(%esp),%ebx + xorl %edi,%ebp + xorl 16(%esp),%ebx + xorl %esi,%ebp + xorl 36(%esp),%ebx + roll $1,%ebx + addl %ebp,%eax + rorl $2,%edx + movl %ecx,%ebp + roll $5,%ebp + movl %ebx,48(%esp) + leal 3395469782(%ebx,%eax,1),%ebx + movl 52(%esp),%eax + addl %ebp,%ebx + + movl %ecx,%ebp + xorl 60(%esp),%eax + xorl %edx,%ebp + xorl 20(%esp),%eax + xorl %edi,%ebp + xorl 
40(%esp),%eax + roll $1,%eax + addl %ebp,%esi + rorl $2,%ecx + movl %ebx,%ebp + roll $5,%ebp + leal 3395469782(%eax,%esi,1),%eax + movl 56(%esp),%esi + addl %ebp,%eax + + movl %ebx,%ebp + xorl (%esp),%esi + xorl %ecx,%ebp + xorl 24(%esp),%esi + xorl %edx,%ebp + xorl 44(%esp),%esi + roll $1,%esi + addl %ebp,%edi + rorl $2,%ebx + movl %eax,%ebp + roll $5,%ebp + leal 3395469782(%esi,%edi,1),%esi + movl 60(%esp),%edi + addl %ebp,%esi + + movl %eax,%ebp + xorl 4(%esp),%edi + xorl %ebx,%ebp + xorl 28(%esp),%edi + xorl %ecx,%ebp + xorl 48(%esp),%edi + roll $1,%edi + addl %ebp,%edx + rorl $2,%eax + movl %esi,%ebp + roll $5,%ebp + leal 3395469782(%edi,%edx,1),%edi + addl %ebp,%edi + movl 96(%esp),%ebp + movl 100(%esp),%edx + addl (%ebp),%edi + addl 4(%ebp),%esi + addl 8(%ebp),%eax + addl 12(%ebp),%ebx + addl 16(%ebp),%ecx + movl %edi,(%ebp) + addl $64,%edx + movl %esi,4(%ebp) + cmpl 104(%esp),%edx + movl %eax,8(%ebp) + movl %ecx,%edi + movl %ebx,12(%ebp) + movl %edx,%esi + movl %ecx,16(%ebp) + jb .L000loop + addl $76,%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size sha1_block_data_order_nohw,.-.L_sha1_block_data_order_nohw_begin +.globl sha1_block_data_order_ssse3 +.hidden sha1_block_data_order_ssse3 +.type sha1_block_data_order_ssse3,@function +.align 16 +sha1_block_data_order_ssse3: +.L_sha1_block_data_order_ssse3_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + call .L001pic_point +.L001pic_point: + popl %ebp + leal .LK_XX_XX-.L001pic_point(%ebp),%ebp + movdqa (%ebp),%xmm7 + movdqa 16(%ebp),%xmm0 + movdqa 32(%ebp),%xmm1 + movdqa 48(%ebp),%xmm2 + movdqa 64(%ebp),%xmm6 + movl 20(%esp),%edi + movl 24(%esp),%ebp + movl 28(%esp),%edx + movl %esp,%esi + subl $208,%esp + andl $-64,%esp + movdqa %xmm0,112(%esp) + movdqa %xmm1,128(%esp) + movdqa %xmm2,144(%esp) + shll $6,%edx + movdqa %xmm7,160(%esp) + addl %ebp,%edx + movdqa %xmm6,176(%esp) + addl $64,%ebp + movl %edi,192(%esp) + movl %ebp,196(%esp) + movl %edx,200(%esp) + movl %esi,204(%esp) + movl 
(%edi),%eax + movl 4(%edi),%ebx + movl 8(%edi),%ecx + movl 12(%edi),%edx + movl 16(%edi),%edi + movl %ebx,%esi + movdqu -64(%ebp),%xmm0 + movdqu -48(%ebp),%xmm1 + movdqu -32(%ebp),%xmm2 + movdqu -16(%ebp),%xmm3 + pshufb %xmm6,%xmm0 + pshufb %xmm6,%xmm1 + pshufb %xmm6,%xmm2 + movdqa %xmm7,96(%esp) + pshufb %xmm6,%xmm3 + paddd %xmm7,%xmm0 + paddd %xmm7,%xmm1 + paddd %xmm7,%xmm2 + movdqa %xmm0,(%esp) + psubd %xmm7,%xmm0 + movdqa %xmm1,16(%esp) + psubd %xmm7,%xmm1 + movdqa %xmm2,32(%esp) + movl %ecx,%ebp + psubd %xmm7,%xmm2 + xorl %edx,%ebp + pshufd $238,%xmm0,%xmm4 + andl %ebp,%esi + jmp .L002loop +.align 16 +.L002loop: + rorl $2,%ebx + xorl %edx,%esi + movl %eax,%ebp + punpcklqdq %xmm1,%xmm4 + movdqa %xmm3,%xmm6 + addl (%esp),%edi + xorl %ecx,%ebx + paddd %xmm3,%xmm7 + movdqa %xmm0,64(%esp) + roll $5,%eax + addl %esi,%edi + psrldq $4,%xmm6 + andl %ebx,%ebp + xorl %ecx,%ebx + pxor %xmm0,%xmm4 + addl %eax,%edi + rorl $7,%eax + pxor %xmm2,%xmm6 + xorl %ecx,%ebp + movl %edi,%esi + addl 4(%esp),%edx + pxor %xmm6,%xmm4 + xorl %ebx,%eax + roll $5,%edi + movdqa %xmm7,48(%esp) + addl %ebp,%edx + andl %eax,%esi + movdqa %xmm4,%xmm0 + xorl %ebx,%eax + addl %edi,%edx + rorl $7,%edi + movdqa %xmm4,%xmm6 + xorl %ebx,%esi + pslldq $12,%xmm0 + paddd %xmm4,%xmm4 + movl %edx,%ebp + addl 8(%esp),%ecx + psrld $31,%xmm6 + xorl %eax,%edi + roll $5,%edx + movdqa %xmm0,%xmm7 + addl %esi,%ecx + andl %edi,%ebp + xorl %eax,%edi + psrld $30,%xmm0 + addl %edx,%ecx + rorl $7,%edx + por %xmm6,%xmm4 + xorl %eax,%ebp + movl %ecx,%esi + addl 12(%esp),%ebx + pslld $2,%xmm7 + xorl %edi,%edx + roll $5,%ecx + pxor %xmm0,%xmm4 + movdqa 96(%esp),%xmm0 + addl %ebp,%ebx + andl %edx,%esi + pxor %xmm7,%xmm4 + pshufd $238,%xmm1,%xmm5 + xorl %edi,%edx + addl %ecx,%ebx + rorl $7,%ecx + xorl %edi,%esi + movl %ebx,%ebp + punpcklqdq %xmm2,%xmm5 + movdqa %xmm4,%xmm7 + addl 16(%esp),%eax + xorl %edx,%ecx + paddd %xmm4,%xmm0 + movdqa %xmm1,80(%esp) + roll $5,%ebx + addl %esi,%eax + psrldq $4,%xmm7 + andl %ecx,%ebp + 
xorl %edx,%ecx + pxor %xmm1,%xmm5 + addl %ebx,%eax + rorl $7,%ebx + pxor %xmm3,%xmm7 + xorl %edx,%ebp + movl %eax,%esi + addl 20(%esp),%edi + pxor %xmm7,%xmm5 + xorl %ecx,%ebx + roll $5,%eax + movdqa %xmm0,(%esp) + addl %ebp,%edi + andl %ebx,%esi + movdqa %xmm5,%xmm1 + xorl %ecx,%ebx + addl %eax,%edi + rorl $7,%eax + movdqa %xmm5,%xmm7 + xorl %ecx,%esi + pslldq $12,%xmm1 + paddd %xmm5,%xmm5 + movl %edi,%ebp + addl 24(%esp),%edx + psrld $31,%xmm7 + xorl %ebx,%eax + roll $5,%edi + movdqa %xmm1,%xmm0 + addl %esi,%edx + andl %eax,%ebp + xorl %ebx,%eax + psrld $30,%xmm1 + addl %edi,%edx + rorl $7,%edi + por %xmm7,%xmm5 + xorl %ebx,%ebp + movl %edx,%esi + addl 28(%esp),%ecx + pslld $2,%xmm0 + xorl %eax,%edi + roll $5,%edx + pxor %xmm1,%xmm5 + movdqa 112(%esp),%xmm1 + addl %ebp,%ecx + andl %edi,%esi + pxor %xmm0,%xmm5 + pshufd $238,%xmm2,%xmm6 + xorl %eax,%edi + addl %edx,%ecx + rorl $7,%edx + xorl %eax,%esi + movl %ecx,%ebp + punpcklqdq %xmm3,%xmm6 + movdqa %xmm5,%xmm0 + addl 32(%esp),%ebx + xorl %edi,%edx + paddd %xmm5,%xmm1 + movdqa %xmm2,96(%esp) + roll $5,%ecx + addl %esi,%ebx + psrldq $4,%xmm0 + andl %edx,%ebp + xorl %edi,%edx + pxor %xmm2,%xmm6 + addl %ecx,%ebx + rorl $7,%ecx + pxor %xmm4,%xmm0 + xorl %edi,%ebp + movl %ebx,%esi + addl 36(%esp),%eax + pxor %xmm0,%xmm6 + xorl %edx,%ecx + roll $5,%ebx + movdqa %xmm1,16(%esp) + addl %ebp,%eax + andl %ecx,%esi + movdqa %xmm6,%xmm2 + xorl %edx,%ecx + addl %ebx,%eax + rorl $7,%ebx + movdqa %xmm6,%xmm0 + xorl %edx,%esi + pslldq $12,%xmm2 + paddd %xmm6,%xmm6 + movl %eax,%ebp + addl 40(%esp),%edi + psrld $31,%xmm0 + xorl %ecx,%ebx + roll $5,%eax + movdqa %xmm2,%xmm1 + addl %esi,%edi + andl %ebx,%ebp + xorl %ecx,%ebx + psrld $30,%xmm2 + addl %eax,%edi + rorl $7,%eax + por %xmm0,%xmm6 + xorl %ecx,%ebp + movdqa 64(%esp),%xmm0 + movl %edi,%esi + addl 44(%esp),%edx + pslld $2,%xmm1 + xorl %ebx,%eax + roll $5,%edi + pxor %xmm2,%xmm6 + movdqa 112(%esp),%xmm2 + addl %ebp,%edx + andl %eax,%esi + pxor %xmm1,%xmm6 + pshufd 
$238,%xmm3,%xmm7 + xorl %ebx,%eax + addl %edi,%edx + rorl $7,%edi + xorl %ebx,%esi + movl %edx,%ebp + punpcklqdq %xmm4,%xmm7 + movdqa %xmm6,%xmm1 + addl 48(%esp),%ecx + xorl %eax,%edi + paddd %xmm6,%xmm2 + movdqa %xmm3,64(%esp) + roll $5,%edx + addl %esi,%ecx + psrldq $4,%xmm1 + andl %edi,%ebp + xorl %eax,%edi + pxor %xmm3,%xmm7 + addl %edx,%ecx + rorl $7,%edx + pxor %xmm5,%xmm1 + xorl %eax,%ebp + movl %ecx,%esi + addl 52(%esp),%ebx + pxor %xmm1,%xmm7 + xorl %edi,%edx + roll $5,%ecx + movdqa %xmm2,32(%esp) + addl %ebp,%ebx + andl %edx,%esi + movdqa %xmm7,%xmm3 + xorl %edi,%edx + addl %ecx,%ebx + rorl $7,%ecx + movdqa %xmm7,%xmm1 + xorl %edi,%esi + pslldq $12,%xmm3 + paddd %xmm7,%xmm7 + movl %ebx,%ebp + addl 56(%esp),%eax + psrld $31,%xmm1 + xorl %edx,%ecx + roll $5,%ebx + movdqa %xmm3,%xmm2 + addl %esi,%eax + andl %ecx,%ebp + xorl %edx,%ecx + psrld $30,%xmm3 + addl %ebx,%eax + rorl $7,%ebx + por %xmm1,%xmm7 + xorl %edx,%ebp + movdqa 80(%esp),%xmm1 + movl %eax,%esi + addl 60(%esp),%edi + pslld $2,%xmm2 + xorl %ecx,%ebx + roll $5,%eax + pxor %xmm3,%xmm7 + movdqa 112(%esp),%xmm3 + addl %ebp,%edi + andl %ebx,%esi + pxor %xmm2,%xmm7 + pshufd $238,%xmm6,%xmm2 + xorl %ecx,%ebx + addl %eax,%edi + rorl $7,%eax + pxor %xmm4,%xmm0 + punpcklqdq %xmm7,%xmm2 + xorl %ecx,%esi + movl %edi,%ebp + addl (%esp),%edx + pxor %xmm1,%xmm0 + movdqa %xmm4,80(%esp) + xorl %ebx,%eax + roll $5,%edi + movdqa %xmm3,%xmm4 + addl %esi,%edx + paddd %xmm7,%xmm3 + andl %eax,%ebp + pxor %xmm2,%xmm0 + xorl %ebx,%eax + addl %edi,%edx + rorl $7,%edi + xorl %ebx,%ebp + movdqa %xmm0,%xmm2 + movdqa %xmm3,48(%esp) + movl %edx,%esi + addl 4(%esp),%ecx + xorl %eax,%edi + roll $5,%edx + pslld $2,%xmm0 + addl %ebp,%ecx + andl %edi,%esi + psrld $30,%xmm2 + xorl %eax,%edi + addl %edx,%ecx + rorl $7,%edx + xorl %eax,%esi + movl %ecx,%ebp + addl 8(%esp),%ebx + xorl %edi,%edx + roll $5,%ecx + por %xmm2,%xmm0 + addl %esi,%ebx + andl %edx,%ebp + movdqa 96(%esp),%xmm2 + xorl %edi,%edx + addl %ecx,%ebx + addl 
12(%esp),%eax + xorl %edi,%ebp + movl %ebx,%esi + pshufd $238,%xmm7,%xmm3 + roll $5,%ebx + addl %ebp,%eax + xorl %edx,%esi + rorl $7,%ecx + addl %ebx,%eax + addl 16(%esp),%edi + pxor %xmm5,%xmm1 + punpcklqdq %xmm0,%xmm3 + xorl %ecx,%esi + movl %eax,%ebp + roll $5,%eax + pxor %xmm2,%xmm1 + movdqa %xmm5,96(%esp) + addl %esi,%edi + xorl %ecx,%ebp + movdqa %xmm4,%xmm5 + rorl $7,%ebx + paddd %xmm0,%xmm4 + addl %eax,%edi + pxor %xmm3,%xmm1 + addl 20(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + roll $5,%edi + movdqa %xmm1,%xmm3 + movdqa %xmm4,(%esp) + addl %ebp,%edx + xorl %ebx,%esi + rorl $7,%eax + addl %edi,%edx + pslld $2,%xmm1 + addl 24(%esp),%ecx + xorl %eax,%esi + psrld $30,%xmm3 + movl %edx,%ebp + roll $5,%edx + addl %esi,%ecx + xorl %eax,%ebp + rorl $7,%edi + addl %edx,%ecx + por %xmm3,%xmm1 + addl 28(%esp),%ebx + xorl %edi,%ebp + movdqa 64(%esp),%xmm3 + movl %ecx,%esi + roll $5,%ecx + addl %ebp,%ebx + xorl %edi,%esi + rorl $7,%edx + pshufd $238,%xmm0,%xmm4 + addl %ecx,%ebx + addl 32(%esp),%eax + pxor %xmm6,%xmm2 + punpcklqdq %xmm1,%xmm4 + xorl %edx,%esi + movl %ebx,%ebp + roll $5,%ebx + pxor %xmm3,%xmm2 + movdqa %xmm6,64(%esp) + addl %esi,%eax + xorl %edx,%ebp + movdqa 128(%esp),%xmm6 + rorl $7,%ecx + paddd %xmm1,%xmm5 + addl %ebx,%eax + pxor %xmm4,%xmm2 + addl 36(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + roll $5,%eax + movdqa %xmm2,%xmm4 + movdqa %xmm5,16(%esp) + addl %ebp,%edi + xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%edi + pslld $2,%xmm2 + addl 40(%esp),%edx + xorl %ebx,%esi + psrld $30,%xmm4 + movl %edi,%ebp + roll $5,%edi + addl %esi,%edx + xorl %ebx,%ebp + rorl $7,%eax + addl %edi,%edx + por %xmm4,%xmm2 + addl 44(%esp),%ecx + xorl %eax,%ebp + movdqa 80(%esp),%xmm4 + movl %edx,%esi + roll $5,%edx + addl %ebp,%ecx + xorl %eax,%esi + rorl $7,%edi + pshufd $238,%xmm1,%xmm5 + addl %edx,%ecx + addl 48(%esp),%ebx + pxor %xmm7,%xmm3 + punpcklqdq %xmm2,%xmm5 + xorl %edi,%esi + movl %ecx,%ebp + roll $5,%ecx + pxor %xmm4,%xmm3 + movdqa %xmm7,80(%esp) + addl 
%esi,%ebx + xorl %edi,%ebp + movdqa %xmm6,%xmm7 + rorl $7,%edx + paddd %xmm2,%xmm6 + addl %ecx,%ebx + pxor %xmm5,%xmm3 + addl 52(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + roll $5,%ebx + movdqa %xmm3,%xmm5 + movdqa %xmm6,32(%esp) + addl %ebp,%eax + xorl %edx,%esi + rorl $7,%ecx + addl %ebx,%eax + pslld $2,%xmm3 + addl 56(%esp),%edi + xorl %ecx,%esi + psrld $30,%xmm5 + movl %eax,%ebp + roll $5,%eax + addl %esi,%edi + xorl %ecx,%ebp + rorl $7,%ebx + addl %eax,%edi + por %xmm5,%xmm3 + addl 60(%esp),%edx + xorl %ebx,%ebp + movdqa 96(%esp),%xmm5 + movl %edi,%esi + roll $5,%edi + addl %ebp,%edx + xorl %ebx,%esi + rorl $7,%eax + pshufd $238,%xmm2,%xmm6 + addl %edi,%edx + addl (%esp),%ecx + pxor %xmm0,%xmm4 + punpcklqdq %xmm3,%xmm6 + xorl %eax,%esi + movl %edx,%ebp + roll $5,%edx + pxor %xmm5,%xmm4 + movdqa %xmm0,96(%esp) + addl %esi,%ecx + xorl %eax,%ebp + movdqa %xmm7,%xmm0 + rorl $7,%edi + paddd %xmm3,%xmm7 + addl %edx,%ecx + pxor %xmm6,%xmm4 + addl 4(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + roll $5,%ecx + movdqa %xmm4,%xmm6 + movdqa %xmm7,48(%esp) + addl %ebp,%ebx + xorl %edi,%esi + rorl $7,%edx + addl %ecx,%ebx + pslld $2,%xmm4 + addl 8(%esp),%eax + xorl %edx,%esi + psrld $30,%xmm6 + movl %ebx,%ebp + roll $5,%ebx + addl %esi,%eax + xorl %edx,%ebp + rorl $7,%ecx + addl %ebx,%eax + por %xmm6,%xmm4 + addl 12(%esp),%edi + xorl %ecx,%ebp + movdqa 64(%esp),%xmm6 + movl %eax,%esi + roll $5,%eax + addl %ebp,%edi + xorl %ecx,%esi + rorl $7,%ebx + pshufd $238,%xmm3,%xmm7 + addl %eax,%edi + addl 16(%esp),%edx + pxor %xmm1,%xmm5 + punpcklqdq %xmm4,%xmm7 + xorl %ebx,%esi + movl %edi,%ebp + roll $5,%edi + pxor %xmm6,%xmm5 + movdqa %xmm1,64(%esp) + addl %esi,%edx + xorl %ebx,%ebp + movdqa %xmm0,%xmm1 + rorl $7,%eax + paddd %xmm4,%xmm0 + addl %edi,%edx + pxor %xmm7,%xmm5 + addl 20(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + roll $5,%edx + movdqa %xmm5,%xmm7 + movdqa %xmm0,(%esp) + addl %ebp,%ecx + xorl %eax,%esi + rorl $7,%edi + addl %edx,%ecx + pslld $2,%xmm5 + addl 
24(%esp),%ebx + xorl %edi,%esi + psrld $30,%xmm7 + movl %ecx,%ebp + roll $5,%ecx + addl %esi,%ebx + xorl %edi,%ebp + rorl $7,%edx + addl %ecx,%ebx + por %xmm7,%xmm5 + addl 28(%esp),%eax + movdqa 80(%esp),%xmm7 + rorl $7,%ecx + movl %ebx,%esi + xorl %edx,%ebp + roll $5,%ebx + pshufd $238,%xmm4,%xmm0 + addl %ebp,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + addl 32(%esp),%edi + pxor %xmm2,%xmm6 + punpcklqdq %xmm5,%xmm0 + andl %ecx,%esi + xorl %edx,%ecx + rorl $7,%ebx + pxor %xmm7,%xmm6 + movdqa %xmm2,80(%esp) + movl %eax,%ebp + xorl %ecx,%esi + roll $5,%eax + movdqa %xmm1,%xmm2 + addl %esi,%edi + paddd %xmm5,%xmm1 + xorl %ebx,%ebp + pxor %xmm0,%xmm6 + xorl %ecx,%ebx + addl %eax,%edi + addl 36(%esp),%edx + andl %ebx,%ebp + movdqa %xmm6,%xmm0 + movdqa %xmm1,16(%esp) + xorl %ecx,%ebx + rorl $7,%eax + movl %edi,%esi + xorl %ebx,%ebp + roll $5,%edi + pslld $2,%xmm6 + addl %ebp,%edx + xorl %eax,%esi + psrld $30,%xmm0 + xorl %ebx,%eax + addl %edi,%edx + addl 40(%esp),%ecx + andl %eax,%esi + xorl %ebx,%eax + rorl $7,%edi + por %xmm0,%xmm6 + movl %edx,%ebp + xorl %eax,%esi + movdqa 96(%esp),%xmm0 + roll $5,%edx + addl %esi,%ecx + xorl %edi,%ebp + xorl %eax,%edi + addl %edx,%ecx + pshufd $238,%xmm5,%xmm1 + addl 44(%esp),%ebx + andl %edi,%ebp + xorl %eax,%edi + rorl $7,%edx + movl %ecx,%esi + xorl %edi,%ebp + roll $5,%ecx + addl %ebp,%ebx + xorl %edx,%esi + xorl %edi,%edx + addl %ecx,%ebx + addl 48(%esp),%eax + pxor %xmm3,%xmm7 + punpcklqdq %xmm6,%xmm1 + andl %edx,%esi + xorl %edi,%edx + rorl $7,%ecx + pxor %xmm0,%xmm7 + movdqa %xmm3,96(%esp) + movl %ebx,%ebp + xorl %edx,%esi + roll $5,%ebx + movdqa 144(%esp),%xmm3 + addl %esi,%eax + paddd %xmm6,%xmm2 + xorl %ecx,%ebp + pxor %xmm1,%xmm7 + xorl %edx,%ecx + addl %ebx,%eax + addl 52(%esp),%edi + andl %ecx,%ebp + movdqa %xmm7,%xmm1 + movdqa %xmm2,32(%esp) + xorl %edx,%ecx + rorl $7,%ebx + movl %eax,%esi + xorl %ecx,%ebp + roll $5,%eax + pslld $2,%xmm7 + addl %ebp,%edi + xorl %ebx,%esi + psrld $30,%xmm1 + xorl %ecx,%ebx 
+ addl %eax,%edi + addl 56(%esp),%edx + andl %ebx,%esi + xorl %ecx,%ebx + rorl $7,%eax + por %xmm1,%xmm7 + movl %edi,%ebp + xorl %ebx,%esi + movdqa 64(%esp),%xmm1 + roll $5,%edi + addl %esi,%edx + xorl %eax,%ebp + xorl %ebx,%eax + addl %edi,%edx + pshufd $238,%xmm6,%xmm2 + addl 60(%esp),%ecx + andl %eax,%ebp + xorl %ebx,%eax + rorl $7,%edi + movl %edx,%esi + xorl %eax,%ebp + roll $5,%edx + addl %ebp,%ecx + xorl %edi,%esi + xorl %eax,%edi + addl %edx,%ecx + addl (%esp),%ebx + pxor %xmm4,%xmm0 + punpcklqdq %xmm7,%xmm2 + andl %edi,%esi + xorl %eax,%edi + rorl $7,%edx + pxor %xmm1,%xmm0 + movdqa %xmm4,64(%esp) + movl %ecx,%ebp + xorl %edi,%esi + roll $5,%ecx + movdqa %xmm3,%xmm4 + addl %esi,%ebx + paddd %xmm7,%xmm3 + xorl %edx,%ebp + pxor %xmm2,%xmm0 + xorl %edi,%edx + addl %ecx,%ebx + addl 4(%esp),%eax + andl %edx,%ebp + movdqa %xmm0,%xmm2 + movdqa %xmm3,48(%esp) + xorl %edi,%edx + rorl $7,%ecx + movl %ebx,%esi + xorl %edx,%ebp + roll $5,%ebx + pslld $2,%xmm0 + addl %ebp,%eax + xorl %ecx,%esi + psrld $30,%xmm2 + xorl %edx,%ecx + addl %ebx,%eax + addl 8(%esp),%edi + andl %ecx,%esi + xorl %edx,%ecx + rorl $7,%ebx + por %xmm2,%xmm0 + movl %eax,%ebp + xorl %ecx,%esi + movdqa 80(%esp),%xmm2 + roll $5,%eax + addl %esi,%edi + xorl %ebx,%ebp + xorl %ecx,%ebx + addl %eax,%edi + pshufd $238,%xmm7,%xmm3 + addl 12(%esp),%edx + andl %ebx,%ebp + xorl %ecx,%ebx + rorl $7,%eax + movl %edi,%esi + xorl %ebx,%ebp + roll $5,%edi + addl %ebp,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %edi,%edx + addl 16(%esp),%ecx + pxor %xmm5,%xmm1 + punpcklqdq %xmm0,%xmm3 + andl %eax,%esi + xorl %ebx,%eax + rorl $7,%edi + pxor %xmm2,%xmm1 + movdqa %xmm5,80(%esp) + movl %edx,%ebp + xorl %eax,%esi + roll $5,%edx + movdqa %xmm4,%xmm5 + addl %esi,%ecx + paddd %xmm0,%xmm4 + xorl %edi,%ebp + pxor %xmm3,%xmm1 + xorl %eax,%edi + addl %edx,%ecx + addl 20(%esp),%ebx + andl %edi,%ebp + movdqa %xmm1,%xmm3 + movdqa %xmm4,(%esp) + xorl %eax,%edi + rorl $7,%edx + movl %ecx,%esi + xorl %edi,%ebp + roll $5,%ecx + 
pslld $2,%xmm1 + addl %ebp,%ebx + xorl %edx,%esi + psrld $30,%xmm3 + xorl %edi,%edx + addl %ecx,%ebx + addl 24(%esp),%eax + andl %edx,%esi + xorl %edi,%edx + rorl $7,%ecx + por %xmm3,%xmm1 + movl %ebx,%ebp + xorl %edx,%esi + movdqa 96(%esp),%xmm3 + roll $5,%ebx + addl %esi,%eax + xorl %ecx,%ebp + xorl %edx,%ecx + addl %ebx,%eax + pshufd $238,%xmm0,%xmm4 + addl 28(%esp),%edi + andl %ecx,%ebp + xorl %edx,%ecx + rorl $7,%ebx + movl %eax,%esi + xorl %ecx,%ebp + roll $5,%eax + addl %ebp,%edi + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%edi + addl 32(%esp),%edx + pxor %xmm6,%xmm2 + punpcklqdq %xmm1,%xmm4 + andl %ebx,%esi + xorl %ecx,%ebx + rorl $7,%eax + pxor %xmm3,%xmm2 + movdqa %xmm6,96(%esp) + movl %edi,%ebp + xorl %ebx,%esi + roll $5,%edi + movdqa %xmm5,%xmm6 + addl %esi,%edx + paddd %xmm1,%xmm5 + xorl %eax,%ebp + pxor %xmm4,%xmm2 + xorl %ebx,%eax + addl %edi,%edx + addl 36(%esp),%ecx + andl %eax,%ebp + movdqa %xmm2,%xmm4 + movdqa %xmm5,16(%esp) + xorl %ebx,%eax + rorl $7,%edi + movl %edx,%esi + xorl %eax,%ebp + roll $5,%edx + pslld $2,%xmm2 + addl %ebp,%ecx + xorl %edi,%esi + psrld $30,%xmm4 + xorl %eax,%edi + addl %edx,%ecx + addl 40(%esp),%ebx + andl %edi,%esi + xorl %eax,%edi + rorl $7,%edx + por %xmm4,%xmm2 + movl %ecx,%ebp + xorl %edi,%esi + movdqa 64(%esp),%xmm4 + roll $5,%ecx + addl %esi,%ebx + xorl %edx,%ebp + xorl %edi,%edx + addl %ecx,%ebx + pshufd $238,%xmm1,%xmm5 + addl 44(%esp),%eax + andl %edx,%ebp + xorl %edi,%edx + rorl $7,%ecx + movl %ebx,%esi + xorl %edx,%ebp + roll $5,%ebx + addl %ebp,%eax + xorl %edx,%esi + addl %ebx,%eax + addl 48(%esp),%edi + pxor %xmm7,%xmm3 + punpcklqdq %xmm2,%xmm5 + xorl %ecx,%esi + movl %eax,%ebp + roll $5,%eax + pxor %xmm4,%xmm3 + movdqa %xmm7,64(%esp) + addl %esi,%edi + xorl %ecx,%ebp + movdqa %xmm6,%xmm7 + rorl $7,%ebx + paddd %xmm2,%xmm6 + addl %eax,%edi + pxor %xmm5,%xmm3 + addl 52(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + roll $5,%edi + movdqa %xmm3,%xmm5 + movdqa %xmm6,32(%esp) + addl %ebp,%edx + xorl 
%ebx,%esi + rorl $7,%eax + addl %edi,%edx + pslld $2,%xmm3 + addl 56(%esp),%ecx + xorl %eax,%esi + psrld $30,%xmm5 + movl %edx,%ebp + roll $5,%edx + addl %esi,%ecx + xorl %eax,%ebp + rorl $7,%edi + addl %edx,%ecx + por %xmm5,%xmm3 + addl 60(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + roll $5,%ecx + addl %ebp,%ebx + xorl %edi,%esi + rorl $7,%edx + addl %ecx,%ebx + addl (%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + roll $5,%ebx + addl %esi,%eax + xorl %edx,%ebp + rorl $7,%ecx + paddd %xmm3,%xmm7 + addl %ebx,%eax + addl 4(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + movdqa %xmm7,48(%esp) + roll $5,%eax + addl %ebp,%edi + xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%edi + addl 8(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + roll $5,%edi + addl %esi,%edx + xorl %ebx,%ebp + rorl $7,%eax + addl %edi,%edx + addl 12(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + roll $5,%edx + addl %ebp,%ecx + xorl %eax,%esi + rorl $7,%edi + addl %edx,%ecx + movl 196(%esp),%ebp + cmpl 200(%esp),%ebp + je .L003done + movdqa 160(%esp),%xmm7 + movdqa 176(%esp),%xmm6 + movdqu (%ebp),%xmm0 + movdqu 16(%ebp),%xmm1 + movdqu 32(%ebp),%xmm2 + movdqu 48(%ebp),%xmm3 + addl $64,%ebp + pshufb %xmm6,%xmm0 + movl %ebp,196(%esp) + movdqa %xmm7,96(%esp) + addl 16(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + roll $5,%ecx + addl %esi,%ebx + xorl %edi,%ebp + rorl $7,%edx + pshufb %xmm6,%xmm1 + addl %ecx,%ebx + addl 20(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + paddd %xmm7,%xmm0 + roll $5,%ebx + addl %ebp,%eax + xorl %edx,%esi + rorl $7,%ecx + movdqa %xmm0,(%esp) + addl %ebx,%eax + addl 24(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + psubd %xmm7,%xmm0 + roll $5,%eax + addl %esi,%edi + xorl %ecx,%ebp + rorl $7,%ebx + addl %eax,%edi + addl 28(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + roll $5,%edi + addl %ebp,%edx + xorl %ebx,%esi + rorl $7,%eax + addl %edi,%edx + addl 32(%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + roll $5,%edx + addl %esi,%ecx + xorl %eax,%ebp + rorl $7,%edi + pshufb 
%xmm6,%xmm2 + addl %edx,%ecx + addl 36(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + paddd %xmm7,%xmm1 + roll $5,%ecx + addl %ebp,%ebx + xorl %edi,%esi + rorl $7,%edx + movdqa %xmm1,16(%esp) + addl %ecx,%ebx + addl 40(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + psubd %xmm7,%xmm1 + roll $5,%ebx + addl %esi,%eax + xorl %edx,%ebp + rorl $7,%ecx + addl %ebx,%eax + addl 44(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + roll $5,%eax + addl %ebp,%edi + xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%edi + addl 48(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + roll $5,%edi + addl %esi,%edx + xorl %ebx,%ebp + rorl $7,%eax + pshufb %xmm6,%xmm3 + addl %edi,%edx + addl 52(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + paddd %xmm7,%xmm2 + roll $5,%edx + addl %ebp,%ecx + xorl %eax,%esi + rorl $7,%edi + movdqa %xmm2,32(%esp) + addl %edx,%ecx + addl 56(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + psubd %xmm7,%xmm2 + roll $5,%ecx + addl %esi,%ebx + xorl %edi,%ebp + rorl $7,%edx + addl %ecx,%ebx + addl 60(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + roll $5,%ebx + addl %ebp,%eax + rorl $7,%ecx + addl %ebx,%eax + movl 192(%esp),%ebp + addl (%ebp),%eax + addl 4(%ebp),%esi + addl 8(%ebp),%ecx + movl %eax,(%ebp) + addl 12(%ebp),%edx + movl %esi,4(%ebp) + addl 16(%ebp),%edi + movl %ecx,8(%ebp) + movl %ecx,%ebx + movl %edx,12(%ebp) + xorl %edx,%ebx + movl %edi,16(%ebp) + movl %esi,%ebp + pshufd $238,%xmm0,%xmm4 + andl %ebx,%esi + movl %ebp,%ebx + jmp .L002loop +.align 16 +.L003done: + addl 16(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + roll $5,%ecx + addl %esi,%ebx + xorl %edi,%ebp + rorl $7,%edx + addl %ecx,%ebx + addl 20(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + roll $5,%ebx + addl %ebp,%eax + xorl %edx,%esi + rorl $7,%ecx + addl %ebx,%eax + addl 24(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + roll $5,%eax + addl %esi,%edi + xorl %ecx,%ebp + rorl $7,%ebx + addl %eax,%edi + addl 28(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + roll $5,%edi + addl %ebp,%edx + xorl %ebx,%esi 
+ rorl $7,%eax + addl %edi,%edx + addl 32(%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + roll $5,%edx + addl %esi,%ecx + xorl %eax,%ebp + rorl $7,%edi + addl %edx,%ecx + addl 36(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + roll $5,%ecx + addl %ebp,%ebx + xorl %edi,%esi + rorl $7,%edx + addl %ecx,%ebx + addl 40(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + roll $5,%ebx + addl %esi,%eax + xorl %edx,%ebp + rorl $7,%ecx + addl %ebx,%eax + addl 44(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + roll $5,%eax + addl %ebp,%edi + xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%edi + addl 48(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + roll $5,%edi + addl %esi,%edx + xorl %ebx,%ebp + rorl $7,%eax + addl %edi,%edx + addl 52(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + roll $5,%edx + addl %ebp,%ecx + xorl %eax,%esi + rorl $7,%edi + addl %edx,%ecx + addl 56(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + roll $5,%ecx + addl %esi,%ebx + xorl %edi,%ebp + rorl $7,%edx + addl %ecx,%ebx + addl 60(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + roll $5,%ebx + addl %ebp,%eax + rorl $7,%ecx + addl %ebx,%eax + movl 192(%esp),%ebp + addl (%ebp),%eax + movl 204(%esp),%esp + addl 4(%ebp),%esi + addl 8(%ebp),%ecx + movl %eax,(%ebp) + addl 12(%ebp),%edx + movl %esi,4(%ebp) + addl 16(%ebp),%edi + movl %ecx,8(%ebp) + movl %edx,12(%ebp) + movl %edi,16(%ebp) + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size sha1_block_data_order_ssse3,.-.L_sha1_block_data_order_ssse3_begin +.globl sha1_block_data_order_avx +.hidden sha1_block_data_order_avx +.type sha1_block_data_order_avx,@function +.align 16 +sha1_block_data_order_avx: +.L_sha1_block_data_order_avx_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + call .L004pic_point +.L004pic_point: + popl %ebp + leal .LK_XX_XX-.L004pic_point(%ebp),%ebp + vzeroall + vmovdqa (%ebp),%xmm7 + vmovdqa 16(%ebp),%xmm0 + vmovdqa 32(%ebp),%xmm1 + vmovdqa 48(%ebp),%xmm2 + vmovdqa 64(%ebp),%xmm6 + movl 20(%esp),%edi + movl 24(%esp),%ebp + movl 
28(%esp),%edx + movl %esp,%esi + subl $208,%esp + andl $-64,%esp + vmovdqa %xmm0,112(%esp) + vmovdqa %xmm1,128(%esp) + vmovdqa %xmm2,144(%esp) + shll $6,%edx + vmovdqa %xmm7,160(%esp) + addl %ebp,%edx + vmovdqa %xmm6,176(%esp) + addl $64,%ebp + movl %edi,192(%esp) + movl %ebp,196(%esp) + movl %edx,200(%esp) + movl %esi,204(%esp) + movl (%edi),%eax + movl 4(%edi),%ebx + movl 8(%edi),%ecx + movl 12(%edi),%edx + movl 16(%edi),%edi + movl %ebx,%esi + vmovdqu -64(%ebp),%xmm0 + vmovdqu -48(%ebp),%xmm1 + vmovdqu -32(%ebp),%xmm2 + vmovdqu -16(%ebp),%xmm3 + vpshufb %xmm6,%xmm0,%xmm0 + vpshufb %xmm6,%xmm1,%xmm1 + vpshufb %xmm6,%xmm2,%xmm2 + vmovdqa %xmm7,96(%esp) + vpshufb %xmm6,%xmm3,%xmm3 + vpaddd %xmm7,%xmm0,%xmm4 + vpaddd %xmm7,%xmm1,%xmm5 + vpaddd %xmm7,%xmm2,%xmm6 + vmovdqa %xmm4,(%esp) + movl %ecx,%ebp + vmovdqa %xmm5,16(%esp) + xorl %edx,%ebp + vmovdqa %xmm6,32(%esp) + andl %ebp,%esi + jmp .L005loop +.align 16 +.L005loop: + shrdl $2,%ebx,%ebx + xorl %edx,%esi + vpalignr $8,%xmm0,%xmm1,%xmm4 + movl %eax,%ebp + addl (%esp),%edi + vpaddd %xmm3,%xmm7,%xmm7 + vmovdqa %xmm0,64(%esp) + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrldq $4,%xmm3,%xmm6 + addl %esi,%edi + andl %ebx,%ebp + vpxor %xmm0,%xmm4,%xmm4 + xorl %ecx,%ebx + addl %eax,%edi + vpxor %xmm2,%xmm6,%xmm6 + shrdl $7,%eax,%eax + xorl %ecx,%ebp + vmovdqa %xmm7,48(%esp) + movl %edi,%esi + addl 4(%esp),%edx + vpxor %xmm6,%xmm4,%xmm4 + xorl %ebx,%eax + shldl $5,%edi,%edi + addl %ebp,%edx + andl %eax,%esi + vpsrld $31,%xmm4,%xmm6 + xorl %ebx,%eax + addl %edi,%edx + shrdl $7,%edi,%edi + xorl %ebx,%esi + vpslldq $12,%xmm4,%xmm0 + vpaddd %xmm4,%xmm4,%xmm4 + movl %edx,%ebp + addl 8(%esp),%ecx + xorl %eax,%edi + shldl $5,%edx,%edx + vpsrld $30,%xmm0,%xmm7 + vpor %xmm6,%xmm4,%xmm4 + addl %esi,%ecx + andl %edi,%ebp + xorl %eax,%edi + addl %edx,%ecx + vpslld $2,%xmm0,%xmm0 + shrdl $7,%edx,%edx + xorl %eax,%ebp + vpxor %xmm7,%xmm4,%xmm4 + movl %ecx,%esi + addl 12(%esp),%ebx + xorl %edi,%edx + shldl $5,%ecx,%ecx + vpxor 
%xmm0,%xmm4,%xmm4 + addl %ebp,%ebx + andl %edx,%esi + vmovdqa 96(%esp),%xmm0 + xorl %edi,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %edi,%esi + vpalignr $8,%xmm1,%xmm2,%xmm5 + movl %ebx,%ebp + addl 16(%esp),%eax + vpaddd %xmm4,%xmm0,%xmm0 + vmovdqa %xmm1,80(%esp) + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrldq $4,%xmm4,%xmm7 + addl %esi,%eax + andl %ecx,%ebp + vpxor %xmm1,%xmm5,%xmm5 + xorl %edx,%ecx + addl %ebx,%eax + vpxor %xmm3,%xmm7,%xmm7 + shrdl $7,%ebx,%ebx + xorl %edx,%ebp + vmovdqa %xmm0,(%esp) + movl %eax,%esi + addl 20(%esp),%edi + vpxor %xmm7,%xmm5,%xmm5 + xorl %ecx,%ebx + shldl $5,%eax,%eax + addl %ebp,%edi + andl %ebx,%esi + vpsrld $31,%xmm5,%xmm7 + xorl %ecx,%ebx + addl %eax,%edi + shrdl $7,%eax,%eax + xorl %ecx,%esi + vpslldq $12,%xmm5,%xmm1 + vpaddd %xmm5,%xmm5,%xmm5 + movl %edi,%ebp + addl 24(%esp),%edx + xorl %ebx,%eax + shldl $5,%edi,%edi + vpsrld $30,%xmm1,%xmm0 + vpor %xmm7,%xmm5,%xmm5 + addl %esi,%edx + andl %eax,%ebp + xorl %ebx,%eax + addl %edi,%edx + vpslld $2,%xmm1,%xmm1 + shrdl $7,%edi,%edi + xorl %ebx,%ebp + vpxor %xmm0,%xmm5,%xmm5 + movl %edx,%esi + addl 28(%esp),%ecx + xorl %eax,%edi + shldl $5,%edx,%edx + vpxor %xmm1,%xmm5,%xmm5 + addl %ebp,%ecx + andl %edi,%esi + vmovdqa 112(%esp),%xmm1 + xorl %eax,%edi + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + vpalignr $8,%xmm2,%xmm3,%xmm6 + movl %ecx,%ebp + addl 32(%esp),%ebx + vpaddd %xmm5,%xmm1,%xmm1 + vmovdqa %xmm2,96(%esp) + xorl %edi,%edx + shldl $5,%ecx,%ecx + vpsrldq $4,%xmm5,%xmm0 + addl %esi,%ebx + andl %edx,%ebp + vpxor %xmm2,%xmm6,%xmm6 + xorl %edi,%edx + addl %ecx,%ebx + vpxor %xmm4,%xmm0,%xmm0 + shrdl $7,%ecx,%ecx + xorl %edi,%ebp + vmovdqa %xmm1,16(%esp) + movl %ebx,%esi + addl 36(%esp),%eax + vpxor %xmm0,%xmm6,%xmm6 + xorl %edx,%ecx + shldl $5,%ebx,%ebx + addl %ebp,%eax + andl %ecx,%esi + vpsrld $31,%xmm6,%xmm0 + xorl %edx,%ecx + addl %ebx,%eax + shrdl $7,%ebx,%ebx + xorl %edx,%esi + vpslldq $12,%xmm6,%xmm2 + vpaddd %xmm6,%xmm6,%xmm6 + movl %eax,%ebp + addl 
40(%esp),%edi + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrld $30,%xmm2,%xmm1 + vpor %xmm0,%xmm6,%xmm6 + addl %esi,%edi + andl %ebx,%ebp + xorl %ecx,%ebx + addl %eax,%edi + vpslld $2,%xmm2,%xmm2 + vmovdqa 64(%esp),%xmm0 + shrdl $7,%eax,%eax + xorl %ecx,%ebp + vpxor %xmm1,%xmm6,%xmm6 + movl %edi,%esi + addl 44(%esp),%edx + xorl %ebx,%eax + shldl $5,%edi,%edi + vpxor %xmm2,%xmm6,%xmm6 + addl %ebp,%edx + andl %eax,%esi + vmovdqa 112(%esp),%xmm2 + xorl %ebx,%eax + addl %edi,%edx + shrdl $7,%edi,%edi + xorl %ebx,%esi + vpalignr $8,%xmm3,%xmm4,%xmm7 + movl %edx,%ebp + addl 48(%esp),%ecx + vpaddd %xmm6,%xmm2,%xmm2 + vmovdqa %xmm3,64(%esp) + xorl %eax,%edi + shldl $5,%edx,%edx + vpsrldq $4,%xmm6,%xmm1 + addl %esi,%ecx + andl %edi,%ebp + vpxor %xmm3,%xmm7,%xmm7 + xorl %eax,%edi + addl %edx,%ecx + vpxor %xmm5,%xmm1,%xmm1 + shrdl $7,%edx,%edx + xorl %eax,%ebp + vmovdqa %xmm2,32(%esp) + movl %ecx,%esi + addl 52(%esp),%ebx + vpxor %xmm1,%xmm7,%xmm7 + xorl %edi,%edx + shldl $5,%ecx,%ecx + addl %ebp,%ebx + andl %edx,%esi + vpsrld $31,%xmm7,%xmm1 + xorl %edi,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %edi,%esi + vpslldq $12,%xmm7,%xmm3 + vpaddd %xmm7,%xmm7,%xmm7 + movl %ebx,%ebp + addl 56(%esp),%eax + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrld $30,%xmm3,%xmm2 + vpor %xmm1,%xmm7,%xmm7 + addl %esi,%eax + andl %ecx,%ebp + xorl %edx,%ecx + addl %ebx,%eax + vpslld $2,%xmm3,%xmm3 + vmovdqa 80(%esp),%xmm1 + shrdl $7,%ebx,%ebx + xorl %edx,%ebp + vpxor %xmm2,%xmm7,%xmm7 + movl %eax,%esi + addl 60(%esp),%edi + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpxor %xmm3,%xmm7,%xmm7 + addl %ebp,%edi + andl %ebx,%esi + vmovdqa 112(%esp),%xmm3 + xorl %ecx,%ebx + addl %eax,%edi + vpalignr $8,%xmm6,%xmm7,%xmm2 + vpxor %xmm4,%xmm0,%xmm0 + shrdl $7,%eax,%eax + xorl %ecx,%esi + movl %edi,%ebp + addl (%esp),%edx + vpxor %xmm1,%xmm0,%xmm0 + vmovdqa %xmm4,80(%esp) + xorl %ebx,%eax + shldl $5,%edi,%edi + vmovdqa %xmm3,%xmm4 + vpaddd %xmm7,%xmm3,%xmm3 + addl %esi,%edx + andl %eax,%ebp + vpxor 
%xmm2,%xmm0,%xmm0 + xorl %ebx,%eax + addl %edi,%edx + shrdl $7,%edi,%edi + xorl %ebx,%ebp + vpsrld $30,%xmm0,%xmm2 + vmovdqa %xmm3,48(%esp) + movl %edx,%esi + addl 4(%esp),%ecx + xorl %eax,%edi + shldl $5,%edx,%edx + vpslld $2,%xmm0,%xmm0 + addl %ebp,%ecx + andl %edi,%esi + xorl %eax,%edi + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + movl %ecx,%ebp + addl 8(%esp),%ebx + vpor %xmm2,%xmm0,%xmm0 + xorl %edi,%edx + shldl $5,%ecx,%ecx + vmovdqa 96(%esp),%xmm2 + addl %esi,%ebx + andl %edx,%ebp + xorl %edi,%edx + addl %ecx,%ebx + addl 12(%esp),%eax + xorl %edi,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm7,%xmm0,%xmm3 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm5,96(%esp) + addl %esi,%edi + xorl %ecx,%ebp + vmovdqa %xmm4,%xmm5 + vpaddd %xmm0,%xmm4,%xmm4 + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpxor %xmm3,%xmm1,%xmm1 + addl 20(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + shldl $5,%edi,%edi + vpsrld $30,%xmm1,%xmm3 + vmovdqa %xmm4,(%esp) + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + vpslld $2,%xmm1,%xmm1 + addl 24(%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%ebp + shrdl $7,%edi,%edi + addl %edx,%ecx + vpor %xmm3,%xmm1,%xmm1 + addl 28(%esp),%ebx + xorl %edi,%ebp + vmovdqa 64(%esp),%xmm3 + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpalignr $8,%xmm0,%xmm1,%xmm4 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + vpxor %xmm3,%xmm2,%xmm2 + vmovdqa %xmm6,64(%esp) + addl %esi,%eax + xorl %edx,%ebp + vmovdqa 128(%esp),%xmm6 + vpaddd %xmm1,%xmm5,%xmm5 + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpxor %xmm4,%xmm2,%xmm2 + addl 36(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + shldl $5,%eax,%eax + 
vpsrld $30,%xmm2,%xmm4 + vmovdqa %xmm5,16(%esp) + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpslld $2,%xmm2,%xmm2 + addl 40(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + shldl $5,%edi,%edi + addl %esi,%edx + xorl %ebx,%ebp + shrdl $7,%eax,%eax + addl %edi,%edx + vpor %xmm4,%xmm2,%xmm2 + addl 44(%esp),%ecx + xorl %eax,%ebp + vmovdqa 80(%esp),%xmm4 + movl %edx,%esi + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + vpalignr $8,%xmm1,%xmm2,%xmm5 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + vpxor %xmm4,%xmm3,%xmm3 + vmovdqa %xmm7,80(%esp) + addl %esi,%ebx + xorl %edi,%ebp + vmovdqa %xmm6,%xmm7 + vpaddd %xmm2,%xmm6,%xmm6 + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpxor %xmm5,%xmm3,%xmm3 + addl 52(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + vpsrld $30,%xmm3,%xmm5 + vmovdqa %xmm6,32(%esp) + addl %ebp,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpslld $2,%xmm3,%xmm3 + addl 56(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + addl %esi,%edi + xorl %ecx,%ebp + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpor %xmm5,%xmm3,%xmm3 + addl 60(%esp),%edx + xorl %ebx,%ebp + vmovdqa 96(%esp),%xmm5 + movl %edi,%esi + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + vpalignr $8,%xmm2,%xmm3,%xmm6 + vpxor %xmm0,%xmm4,%xmm4 + addl (%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + shldl $5,%edx,%edx + vpxor %xmm5,%xmm4,%xmm4 + vmovdqa %xmm0,96(%esp) + addl %esi,%ecx + xorl %eax,%ebp + vmovdqa %xmm7,%xmm0 + vpaddd %xmm3,%xmm7,%xmm7 + shrdl $7,%edi,%edi + addl %edx,%ecx + vpxor %xmm6,%xmm4,%xmm4 + addl 4(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + shldl $5,%ecx,%ecx + vpsrld $30,%xmm4,%xmm6 + vmovdqa %xmm7,48(%esp) + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpslld $2,%xmm4,%xmm4 + addl 8(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + 
shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%ebp + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpor %xmm6,%xmm4,%xmm4 + addl 12(%esp),%edi + xorl %ecx,%ebp + vmovdqa 64(%esp),%xmm6 + movl %eax,%esi + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpalignr $8,%xmm3,%xmm4,%xmm7 + vpxor %xmm1,%xmm5,%xmm5 + addl 16(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + shldl $5,%edi,%edi + vpxor %xmm6,%xmm5,%xmm5 + vmovdqa %xmm1,64(%esp) + addl %esi,%edx + xorl %ebx,%ebp + vmovdqa %xmm0,%xmm1 + vpaddd %xmm4,%xmm0,%xmm0 + shrdl $7,%eax,%eax + addl %edi,%edx + vpxor %xmm7,%xmm5,%xmm5 + addl 20(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + shldl $5,%edx,%edx + vpsrld $30,%xmm5,%xmm7 + vmovdqa %xmm0,(%esp) + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + vpslld $2,%xmm5,%xmm5 + addl 24(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpor %xmm7,%xmm5,%xmm5 + addl 28(%esp),%eax + vmovdqa 80(%esp),%xmm7 + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%ebp + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm4,%xmm5,%xmm0 + vpxor %xmm2,%xmm6,%xmm6 + addl 32(%esp),%edi + andl %ecx,%esi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + vpxor %xmm7,%xmm6,%xmm6 + vmovdqa %xmm2,80(%esp) + movl %eax,%ebp + xorl %ecx,%esi + vmovdqa %xmm1,%xmm2 + vpaddd %xmm5,%xmm1,%xmm1 + shldl $5,%eax,%eax + addl %esi,%edi + vpxor %xmm0,%xmm6,%xmm6 + xorl %ebx,%ebp + xorl %ecx,%ebx + addl %eax,%edi + addl 36(%esp),%edx + vpsrld $30,%xmm6,%xmm0 + vmovdqa %xmm1,16(%esp) + andl %ebx,%ebp + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %edi,%esi + vpslld $2,%xmm6,%xmm6 + xorl %ebx,%ebp + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %edi,%edx + addl 40(%esp),%ecx + andl %eax,%esi + vpor %xmm0,%xmm6,%xmm6 + xorl %ebx,%eax + shrdl $7,%edi,%edi + vmovdqa 96(%esp),%xmm0 + movl 
%edx,%ebp + xorl %eax,%esi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %edi,%ebp + xorl %eax,%edi + addl %edx,%ecx + addl 44(%esp),%ebx + andl %edi,%ebp + xorl %eax,%edi + shrdl $7,%edx,%edx + movl %ecx,%esi + xorl %edi,%ebp + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edx,%esi + xorl %edi,%edx + addl %ecx,%ebx + vpalignr $8,%xmm5,%xmm6,%xmm1 + vpxor %xmm3,%xmm7,%xmm7 + addl 48(%esp),%eax + andl %edx,%esi + xorl %edi,%edx + shrdl $7,%ecx,%ecx + vpxor %xmm0,%xmm7,%xmm7 + vmovdqa %xmm3,96(%esp) + movl %ebx,%ebp + xorl %edx,%esi + vmovdqa 144(%esp),%xmm3 + vpaddd %xmm6,%xmm2,%xmm2 + shldl $5,%ebx,%ebx + addl %esi,%eax + vpxor %xmm1,%xmm7,%xmm7 + xorl %ecx,%ebp + xorl %edx,%ecx + addl %ebx,%eax + addl 52(%esp),%edi + vpsrld $30,%xmm7,%xmm1 + vmovdqa %xmm2,32(%esp) + andl %ecx,%ebp + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + vpslld $2,%xmm7,%xmm7 + xorl %ecx,%ebp + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%edi + addl 56(%esp),%edx + andl %ebx,%esi + vpor %xmm1,%xmm7,%xmm7 + xorl %ecx,%ebx + shrdl $7,%eax,%eax + vmovdqa 64(%esp),%xmm1 + movl %edi,%ebp + xorl %ebx,%esi + shldl $5,%edi,%edi + addl %esi,%edx + xorl %eax,%ebp + xorl %ebx,%eax + addl %edi,%edx + addl 60(%esp),%ecx + andl %eax,%ebp + xorl %ebx,%eax + shrdl $7,%edi,%edi + movl %edx,%esi + xorl %eax,%ebp + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %edi,%esi + xorl %eax,%edi + addl %edx,%ecx + vpalignr $8,%xmm6,%xmm7,%xmm2 + vpxor %xmm4,%xmm0,%xmm0 + addl (%esp),%ebx + andl %edi,%esi + xorl %eax,%edi + shrdl $7,%edx,%edx + vpxor %xmm1,%xmm0,%xmm0 + vmovdqa %xmm4,64(%esp) + movl %ecx,%ebp + xorl %edi,%esi + vmovdqa %xmm3,%xmm4 + vpaddd %xmm7,%xmm3,%xmm3 + shldl $5,%ecx,%ecx + addl %esi,%ebx + vpxor %xmm2,%xmm0,%xmm0 + xorl %edx,%ebp + xorl %edi,%edx + addl %ecx,%ebx + addl 4(%esp),%eax + vpsrld $30,%xmm0,%xmm2 + vmovdqa %xmm3,48(%esp) + andl %edx,%ebp + xorl %edi,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + vpslld $2,%xmm0,%xmm0 + xorl %edx,%ebp + shldl 
$5,%ebx,%ebx + addl %ebp,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + addl 8(%esp),%edi + andl %ecx,%esi + vpor %xmm2,%xmm0,%xmm0 + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + vmovdqa 80(%esp),%xmm2 + movl %eax,%ebp + xorl %ecx,%esi + shldl $5,%eax,%eax + addl %esi,%edi + xorl %ebx,%ebp + xorl %ecx,%ebx + addl %eax,%edi + addl 12(%esp),%edx + andl %ebx,%ebp + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %edi,%esi + xorl %ebx,%ebp + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %edi,%edx + vpalignr $8,%xmm7,%xmm0,%xmm3 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%esp),%ecx + andl %eax,%esi + xorl %ebx,%eax + shrdl $7,%edi,%edi + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm5,80(%esp) + movl %edx,%ebp + xorl %eax,%esi + vmovdqa %xmm4,%xmm5 + vpaddd %xmm0,%xmm4,%xmm4 + shldl $5,%edx,%edx + addl %esi,%ecx + vpxor %xmm3,%xmm1,%xmm1 + xorl %edi,%ebp + xorl %eax,%edi + addl %edx,%ecx + addl 20(%esp),%ebx + vpsrld $30,%xmm1,%xmm3 + vmovdqa %xmm4,(%esp) + andl %edi,%ebp + xorl %eax,%edi + shrdl $7,%edx,%edx + movl %ecx,%esi + vpslld $2,%xmm1,%xmm1 + xorl %edi,%ebp + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edx,%esi + xorl %edi,%edx + addl %ecx,%ebx + addl 24(%esp),%eax + andl %edx,%esi + vpor %xmm3,%xmm1,%xmm1 + xorl %edi,%edx + shrdl $7,%ecx,%ecx + vmovdqa 96(%esp),%xmm3 + movl %ebx,%ebp + xorl %edx,%esi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %ecx,%ebp + xorl %edx,%ecx + addl %ebx,%eax + addl 28(%esp),%edi + andl %ecx,%ebp + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + xorl %ecx,%ebp + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%edi + vpalignr $8,%xmm0,%xmm1,%xmm4 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%esp),%edx + andl %ebx,%esi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + vpxor %xmm3,%xmm2,%xmm2 + vmovdqa %xmm6,96(%esp) + movl %edi,%ebp + xorl %ebx,%esi + vmovdqa %xmm5,%xmm6 + vpaddd %xmm1,%xmm5,%xmm5 + shldl $5,%edi,%edi + addl %esi,%edx + vpxor %xmm4,%xmm2,%xmm2 + xorl %eax,%ebp + xorl %ebx,%eax + 
addl %edi,%edx + addl 36(%esp),%ecx + vpsrld $30,%xmm2,%xmm4 + vmovdqa %xmm5,16(%esp) + andl %eax,%ebp + xorl %ebx,%eax + shrdl $7,%edi,%edi + movl %edx,%esi + vpslld $2,%xmm2,%xmm2 + xorl %eax,%ebp + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %edi,%esi + xorl %eax,%edi + addl %edx,%ecx + addl 40(%esp),%ebx + andl %edi,%esi + vpor %xmm4,%xmm2,%xmm2 + xorl %eax,%edi + shrdl $7,%edx,%edx + vmovdqa 64(%esp),%xmm4 + movl %ecx,%ebp + xorl %edi,%esi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edx,%ebp + xorl %edi,%edx + addl %ecx,%ebx + addl 44(%esp),%eax + andl %edx,%ebp + xorl %edi,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%ebp + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %edx,%esi + addl %ebx,%eax + vpalignr $8,%xmm1,%xmm2,%xmm5 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + vpxor %xmm4,%xmm3,%xmm3 + vmovdqa %xmm7,64(%esp) + addl %esi,%edi + xorl %ecx,%ebp + vmovdqa %xmm6,%xmm7 + vpaddd %xmm2,%xmm6,%xmm6 + shrdl $7,%ebx,%ebx + addl %eax,%edi + vpxor %xmm5,%xmm3,%xmm3 + addl 52(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + shldl $5,%edi,%edi + vpsrld $30,%xmm3,%xmm5 + vmovdqa %xmm6,32(%esp) + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + vpslld $2,%xmm3,%xmm3 + addl 56(%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%ebp + shrdl $7,%edi,%edi + addl %edx,%ecx + vpor %xmm5,%xmm3,%xmm3 + addl 60(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl (%esp),%eax + vpaddd %xmm3,%xmm7,%xmm7 + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + addl %esi,%eax + vmovdqa %xmm7,48(%esp) + xorl %edx,%ebp + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 4(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 8(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + 
shldl $5,%edi,%edi + addl %esi,%edx + xorl %ebx,%ebp + shrdl $7,%eax,%eax + addl %edi,%edx + addl 12(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + movl 196(%esp),%ebp + cmpl 200(%esp),%ebp + je .L006done + vmovdqa 160(%esp),%xmm7 + vmovdqa 176(%esp),%xmm6 + vmovdqu (%ebp),%xmm0 + vmovdqu 16(%ebp),%xmm1 + vmovdqu 32(%ebp),%xmm2 + vmovdqu 48(%ebp),%xmm3 + addl $64,%ebp + vpshufb %xmm6,%xmm0,%xmm0 + movl %ebp,196(%esp) + vmovdqa %xmm7,96(%esp) + addl 16(%esp),%ebx + xorl %edi,%esi + vpshufb %xmm6,%xmm1,%xmm1 + movl %ecx,%ebp + shldl $5,%ecx,%ecx + vpaddd %xmm7,%xmm0,%xmm4 + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + vmovdqa %xmm4,(%esp) + addl 20(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + addl %esi,%edi + xorl %ecx,%ebp + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 28(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + addl 32(%esp),%ecx + xorl %eax,%esi + vpshufb %xmm6,%xmm2,%xmm2 + movl %edx,%ebp + shldl $5,%edx,%edx + vpaddd %xmm7,%xmm1,%xmm5 + addl %esi,%ecx + xorl %eax,%ebp + shrdl $7,%edi,%edi + addl %edx,%ecx + vmovdqa %xmm5,16(%esp) + addl 36(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%ebp + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + shldl $5,%eax,%eax + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 48(%esp),%edx + xorl %ebx,%esi + vpshufb %xmm6,%xmm3,%xmm3 + movl %edi,%ebp + shldl $5,%edi,%edi + vpaddd 
%xmm7,%xmm2,%xmm6 + addl %esi,%edx + xorl %ebx,%ebp + shrdl $7,%eax,%eax + addl %edi,%edx + vmovdqa %xmm6,32(%esp) + addl 52(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + addl 56(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + movl 192(%esp),%ebp + addl (%ebp),%eax + addl 4(%ebp),%esi + addl 8(%ebp),%ecx + movl %eax,(%ebp) + addl 12(%ebp),%edx + movl %esi,4(%ebp) + addl 16(%ebp),%edi + movl %ecx,%ebx + movl %ecx,8(%ebp) + xorl %edx,%ebx + movl %edx,12(%ebp) + movl %edi,16(%ebp) + movl %esi,%ebp + andl %ebx,%esi + movl %ebp,%ebx + jmp .L005loop +.align 16 +.L006done: + addl 16(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 20(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%esp),%edi + xorl %ecx,%esi + movl %eax,%ebp + shldl $5,%eax,%eax + addl %esi,%edi + xorl %ecx,%ebp + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 28(%esp),%edx + xorl %ebx,%ebp + movl %edi,%esi + shldl $5,%edi,%edi + addl %ebp,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %edi,%edx + addl 32(%esp),%ecx + xorl %eax,%esi + movl %edx,%ebp + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%ebp + shrdl $7,%edi,%edi + addl %edx,%ecx + addl 36(%esp),%ebx + xorl %edi,%ebp + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %ebp,%ebx + xorl %edi,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%esp),%eax + xorl %edx,%esi + movl %ebx,%ebp + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%ebp + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%esp),%edi + xorl %ecx,%ebp + movl %eax,%esi + shldl 
$5,%eax,%eax + addl %ebp,%edi + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%edi + addl 48(%esp),%edx + xorl %ebx,%esi + movl %edi,%ebp + shldl $5,%edi,%edi + addl %esi,%edx + xorl %ebx,%ebp + shrdl $7,%eax,%eax + addl %edi,%edx + addl 52(%esp),%ecx + xorl %eax,%ebp + movl %edx,%esi + shldl $5,%edx,%edx + addl %ebp,%ecx + xorl %eax,%esi + shrdl $7,%edi,%edi + addl %edx,%ecx + addl 56(%esp),%ebx + xorl %edi,%esi + movl %ecx,%ebp + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edi,%ebp + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%esp),%eax + xorl %edx,%ebp + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %ebp,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vzeroall + movl 192(%esp),%ebp + addl (%ebp),%eax + movl 204(%esp),%esp + addl 4(%ebp),%esi + addl 8(%ebp),%ecx + movl %eax,(%ebp) + addl 12(%ebp),%edx + movl %esi,4(%ebp) + addl 16(%ebp),%edi + movl %ecx,8(%ebp) + movl %edx,12(%ebp) + movl %edi,16(%ebp) + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size sha1_block_data_order_avx,.-.L_sha1_block_data_order_avx_begin +.align 64 +.LK_XX_XX: +.long 1518500249,1518500249,1518500249,1518500249 +.long 1859775393,1859775393,1859775393,1859775393 +.long 2400959708,2400959708,2400959708,2400959708 +.long 3395469782,3395469782,3395469782,3395469782 +.long 66051,67438087,134810123,202182159 +.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115 +.byte 102,111,114,109,32,102,111,114,32,120,56,54,44,32,67,82 +.byte 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112 +.byte 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) diff --git a/third_party/boringssl/gen/bcm/sha1-586-win.asm b/third_party/boringssl/gen/bcm/sha1-586-win.asm new file mode 100644 index 00000000..7f0b44b3 --- /dev/null +++ b/third_party/boringssl/gen/bcm/sha1-586-win.asm @@ -0,0 +1,3790 @@ +; This file is generated from a similarly-named Perl script in the 
BoringSSL +; source tree. Do not edit by hand. + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_internal_x86_win_asm.inc" +%endif +%ifidn __OUTPUT_FORMAT__, win32 +%ifidn __OUTPUT_FORMAT__,obj +section code use32 class=code align=64 +%elifidn __OUTPUT_FORMAT__,win32 +$@feat.00 equ 1 +section .text code align=64 +%else +section .text code +%endif +global _sha1_block_data_order_nohw +align 16 +_sha1_block_data_order_nohw: +L$_sha1_block_data_order_nohw_begin: + push ebp + push ebx + push esi + push edi + mov ebp,DWORD [20+esp] + mov esi,DWORD [24+esp] + mov eax,DWORD [28+esp] + sub esp,76 + shl eax,6 + add eax,esi + mov DWORD [104+esp],eax + mov edi,DWORD [16+ebp] + jmp NEAR L$000loop +align 16 +L$000loop: + mov eax,DWORD [esi] + mov ebx,DWORD [4+esi] + mov ecx,DWORD [8+esi] + mov edx,DWORD [12+esi] + bswap eax + bswap ebx + bswap ecx + bswap edx + mov DWORD [esp],eax + mov DWORD [4+esp],ebx + mov DWORD [8+esp],ecx + mov DWORD [12+esp],edx + mov eax,DWORD [16+esi] + mov ebx,DWORD [20+esi] + mov ecx,DWORD [24+esi] + mov edx,DWORD [28+esi] + bswap eax + bswap ebx + bswap ecx + bswap edx + mov DWORD [16+esp],eax + mov DWORD [20+esp],ebx + mov DWORD [24+esp],ecx + mov DWORD [28+esp],edx + mov eax,DWORD [32+esi] + mov ebx,DWORD [36+esi] + mov ecx,DWORD [40+esi] + mov edx,DWORD [44+esi] + bswap eax + bswap ebx + bswap ecx + bswap edx + mov DWORD [32+esp],eax + mov DWORD [36+esp],ebx + mov DWORD [40+esp],ecx + mov DWORD [44+esp],edx + mov eax,DWORD [48+esi] + mov ebx,DWORD [52+esi] + mov ecx,DWORD [56+esi] + mov edx,DWORD [60+esi] + bswap eax + bswap ebx + bswap ecx + bswap edx + mov DWORD [48+esp],eax + mov DWORD [52+esp],ebx + mov DWORD [56+esp],ecx + mov DWORD [60+esp],edx + mov DWORD [100+esp],esi + mov eax,DWORD [ebp] + mov ebx,DWORD [4+ebp] + mov ecx,DWORD [8+ebp] + mov edx,DWORD [12+ebp] + ; 00_15 0 + mov esi,ecx + mov ebp,eax + rol ebp,5 + xor esi,edx + add ebp,edi + mov edi,DWORD [esp] + and esi,ebx + ror ebx,2 + xor esi,edx + lea 
ebp,[1518500249+edi*1+ebp] + add ebp,esi + ; 00_15 1 + mov edi,ebx + mov esi,ebp + rol ebp,5 + xor edi,ecx + add ebp,edx + mov edx,DWORD [4+esp] + and edi,eax + ror eax,2 + xor edi,ecx + lea ebp,[1518500249+edx*1+ebp] + add ebp,edi + ; 00_15 2 + mov edx,eax + mov edi,ebp + rol ebp,5 + xor edx,ebx + add ebp,ecx + mov ecx,DWORD [8+esp] + and edx,esi + ror esi,2 + xor edx,ebx + lea ebp,[1518500249+ecx*1+ebp] + add ebp,edx + ; 00_15 3 + mov ecx,esi + mov edx,ebp + rol ebp,5 + xor ecx,eax + add ebp,ebx + mov ebx,DWORD [12+esp] + and ecx,edi + ror edi,2 + xor ecx,eax + lea ebp,[1518500249+ebx*1+ebp] + add ebp,ecx + ; 00_15 4 + mov ebx,edi + mov ecx,ebp + rol ebp,5 + xor ebx,esi + add ebp,eax + mov eax,DWORD [16+esp] + and ebx,edx + ror edx,2 + xor ebx,esi + lea ebp,[1518500249+eax*1+ebp] + add ebp,ebx + ; 00_15 5 + mov eax,edx + mov ebx,ebp + rol ebp,5 + xor eax,edi + add ebp,esi + mov esi,DWORD [20+esp] + and eax,ecx + ror ecx,2 + xor eax,edi + lea ebp,[1518500249+esi*1+ebp] + add ebp,eax + ; 00_15 6 + mov esi,ecx + mov eax,ebp + rol ebp,5 + xor esi,edx + add ebp,edi + mov edi,DWORD [24+esp] + and esi,ebx + ror ebx,2 + xor esi,edx + lea ebp,[1518500249+edi*1+ebp] + add ebp,esi + ; 00_15 7 + mov edi,ebx + mov esi,ebp + rol ebp,5 + xor edi,ecx + add ebp,edx + mov edx,DWORD [28+esp] + and edi,eax + ror eax,2 + xor edi,ecx + lea ebp,[1518500249+edx*1+ebp] + add ebp,edi + ; 00_15 8 + mov edx,eax + mov edi,ebp + rol ebp,5 + xor edx,ebx + add ebp,ecx + mov ecx,DWORD [32+esp] + and edx,esi + ror esi,2 + xor edx,ebx + lea ebp,[1518500249+ecx*1+ebp] + add ebp,edx + ; 00_15 9 + mov ecx,esi + mov edx,ebp + rol ebp,5 + xor ecx,eax + add ebp,ebx + mov ebx,DWORD [36+esp] + and ecx,edi + ror edi,2 + xor ecx,eax + lea ebp,[1518500249+ebx*1+ebp] + add ebp,ecx + ; 00_15 10 + mov ebx,edi + mov ecx,ebp + rol ebp,5 + xor ebx,esi + add ebp,eax + mov eax,DWORD [40+esp] + and ebx,edx + ror edx,2 + xor ebx,esi + lea ebp,[1518500249+eax*1+ebp] + add ebp,ebx + ; 00_15 11 + mov eax,edx + mov 
ebx,ebp + rol ebp,5 + xor eax,edi + add ebp,esi + mov esi,DWORD [44+esp] + and eax,ecx + ror ecx,2 + xor eax,edi + lea ebp,[1518500249+esi*1+ebp] + add ebp,eax + ; 00_15 12 + mov esi,ecx + mov eax,ebp + rol ebp,5 + xor esi,edx + add ebp,edi + mov edi,DWORD [48+esp] + and esi,ebx + ror ebx,2 + xor esi,edx + lea ebp,[1518500249+edi*1+ebp] + add ebp,esi + ; 00_15 13 + mov edi,ebx + mov esi,ebp + rol ebp,5 + xor edi,ecx + add ebp,edx + mov edx,DWORD [52+esp] + and edi,eax + ror eax,2 + xor edi,ecx + lea ebp,[1518500249+edx*1+ebp] + add ebp,edi + ; 00_15 14 + mov edx,eax + mov edi,ebp + rol ebp,5 + xor edx,ebx + add ebp,ecx + mov ecx,DWORD [56+esp] + and edx,esi + ror esi,2 + xor edx,ebx + lea ebp,[1518500249+ecx*1+ebp] + add ebp,edx + ; 00_15 15 + mov ecx,esi + mov edx,ebp + rol ebp,5 + xor ecx,eax + add ebp,ebx + mov ebx,DWORD [60+esp] + and ecx,edi + ror edi,2 + xor ecx,eax + lea ebp,[1518500249+ebx*1+ebp] + mov ebx,DWORD [esp] + add ecx,ebp + ; 16_19 16 + mov ebp,edi + xor ebx,DWORD [8+esp] + xor ebp,esi + xor ebx,DWORD [32+esp] + and ebp,edx + xor ebx,DWORD [52+esp] + rol ebx,1 + xor ebp,esi + add eax,ebp + mov ebp,ecx + ror edx,2 + mov DWORD [esp],ebx + rol ebp,5 + lea ebx,[1518500249+eax*1+ebx] + mov eax,DWORD [4+esp] + add ebx,ebp + ; 16_19 17 + mov ebp,edx + xor eax,DWORD [12+esp] + xor ebp,edi + xor eax,DWORD [36+esp] + and ebp,ecx + xor eax,DWORD [56+esp] + rol eax,1 + xor ebp,edi + add esi,ebp + mov ebp,ebx + ror ecx,2 + mov DWORD [4+esp],eax + rol ebp,5 + lea eax,[1518500249+esi*1+eax] + mov esi,DWORD [8+esp] + add eax,ebp + ; 16_19 18 + mov ebp,ecx + xor esi,DWORD [16+esp] + xor ebp,edx + xor esi,DWORD [40+esp] + and ebp,ebx + xor esi,DWORD [60+esp] + rol esi,1 + xor ebp,edx + add edi,ebp + mov ebp,eax + ror ebx,2 + mov DWORD [8+esp],esi + rol ebp,5 + lea esi,[1518500249+edi*1+esi] + mov edi,DWORD [12+esp] + add esi,ebp + ; 16_19 19 + mov ebp,ebx + xor edi,DWORD [20+esp] + xor ebp,ecx + xor edi,DWORD [44+esp] + and ebp,eax + xor edi,DWORD [esp] + rol edi,1 
+ xor ebp,ecx + add edx,ebp + mov ebp,esi + ror eax,2 + mov DWORD [12+esp],edi + rol ebp,5 + lea edi,[1518500249+edx*1+edi] + mov edx,DWORD [16+esp] + add edi,ebp + ; 20_39 20 + mov ebp,esi + xor edx,DWORD [24+esp] + xor ebp,eax + xor edx,DWORD [48+esp] + xor ebp,ebx + xor edx,DWORD [4+esp] + rol edx,1 + add ecx,ebp + ror esi,2 + mov ebp,edi + rol ebp,5 + mov DWORD [16+esp],edx + lea edx,[1859775393+ecx*1+edx] + mov ecx,DWORD [20+esp] + add edx,ebp + ; 20_39 21 + mov ebp,edi + xor ecx,DWORD [28+esp] + xor ebp,esi + xor ecx,DWORD [52+esp] + xor ebp,eax + xor ecx,DWORD [8+esp] + rol ecx,1 + add ebx,ebp + ror edi,2 + mov ebp,edx + rol ebp,5 + mov DWORD [20+esp],ecx + lea ecx,[1859775393+ebx*1+ecx] + mov ebx,DWORD [24+esp] + add ecx,ebp + ; 20_39 22 + mov ebp,edx + xor ebx,DWORD [32+esp] + xor ebp,edi + xor ebx,DWORD [56+esp] + xor ebp,esi + xor ebx,DWORD [12+esp] + rol ebx,1 + add eax,ebp + ror edx,2 + mov ebp,ecx + rol ebp,5 + mov DWORD [24+esp],ebx + lea ebx,[1859775393+eax*1+ebx] + mov eax,DWORD [28+esp] + add ebx,ebp + ; 20_39 23 + mov ebp,ecx + xor eax,DWORD [36+esp] + xor ebp,edx + xor eax,DWORD [60+esp] + xor ebp,edi + xor eax,DWORD [16+esp] + rol eax,1 + add esi,ebp + ror ecx,2 + mov ebp,ebx + rol ebp,5 + mov DWORD [28+esp],eax + lea eax,[1859775393+esi*1+eax] + mov esi,DWORD [32+esp] + add eax,ebp + ; 20_39 24 + mov ebp,ebx + xor esi,DWORD [40+esp] + xor ebp,ecx + xor esi,DWORD [esp] + xor ebp,edx + xor esi,DWORD [20+esp] + rol esi,1 + add edi,ebp + ror ebx,2 + mov ebp,eax + rol ebp,5 + mov DWORD [32+esp],esi + lea esi,[1859775393+edi*1+esi] + mov edi,DWORD [36+esp] + add esi,ebp + ; 20_39 25 + mov ebp,eax + xor edi,DWORD [44+esp] + xor ebp,ebx + xor edi,DWORD [4+esp] + xor ebp,ecx + xor edi,DWORD [24+esp] + rol edi,1 + add edx,ebp + ror eax,2 + mov ebp,esi + rol ebp,5 + mov DWORD [36+esp],edi + lea edi,[1859775393+edx*1+edi] + mov edx,DWORD [40+esp] + add edi,ebp + ; 20_39 26 + mov ebp,esi + xor edx,DWORD [48+esp] + xor ebp,eax + xor edx,DWORD [8+esp] + xor 
ebp,ebx + xor edx,DWORD [28+esp] + rol edx,1 + add ecx,ebp + ror esi,2 + mov ebp,edi + rol ebp,5 + mov DWORD [40+esp],edx + lea edx,[1859775393+ecx*1+edx] + mov ecx,DWORD [44+esp] + add edx,ebp + ; 20_39 27 + mov ebp,edi + xor ecx,DWORD [52+esp] + xor ebp,esi + xor ecx,DWORD [12+esp] + xor ebp,eax + xor ecx,DWORD [32+esp] + rol ecx,1 + add ebx,ebp + ror edi,2 + mov ebp,edx + rol ebp,5 + mov DWORD [44+esp],ecx + lea ecx,[1859775393+ebx*1+ecx] + mov ebx,DWORD [48+esp] + add ecx,ebp + ; 20_39 28 + mov ebp,edx + xor ebx,DWORD [56+esp] + xor ebp,edi + xor ebx,DWORD [16+esp] + xor ebp,esi + xor ebx,DWORD [36+esp] + rol ebx,1 + add eax,ebp + ror edx,2 + mov ebp,ecx + rol ebp,5 + mov DWORD [48+esp],ebx + lea ebx,[1859775393+eax*1+ebx] + mov eax,DWORD [52+esp] + add ebx,ebp + ; 20_39 29 + mov ebp,ecx + xor eax,DWORD [60+esp] + xor ebp,edx + xor eax,DWORD [20+esp] + xor ebp,edi + xor eax,DWORD [40+esp] + rol eax,1 + add esi,ebp + ror ecx,2 + mov ebp,ebx + rol ebp,5 + mov DWORD [52+esp],eax + lea eax,[1859775393+esi*1+eax] + mov esi,DWORD [56+esp] + add eax,ebp + ; 20_39 30 + mov ebp,ebx + xor esi,DWORD [esp] + xor ebp,ecx + xor esi,DWORD [24+esp] + xor ebp,edx + xor esi,DWORD [44+esp] + rol esi,1 + add edi,ebp + ror ebx,2 + mov ebp,eax + rol ebp,5 + mov DWORD [56+esp],esi + lea esi,[1859775393+edi*1+esi] + mov edi,DWORD [60+esp] + add esi,ebp + ; 20_39 31 + mov ebp,eax + xor edi,DWORD [4+esp] + xor ebp,ebx + xor edi,DWORD [28+esp] + xor ebp,ecx + xor edi,DWORD [48+esp] + rol edi,1 + add edx,ebp + ror eax,2 + mov ebp,esi + rol ebp,5 + mov DWORD [60+esp],edi + lea edi,[1859775393+edx*1+edi] + mov edx,DWORD [esp] + add edi,ebp + ; 20_39 32 + mov ebp,esi + xor edx,DWORD [8+esp] + xor ebp,eax + xor edx,DWORD [32+esp] + xor ebp,ebx + xor edx,DWORD [52+esp] + rol edx,1 + add ecx,ebp + ror esi,2 + mov ebp,edi + rol ebp,5 + mov DWORD [esp],edx + lea edx,[1859775393+ecx*1+edx] + mov ecx,DWORD [4+esp] + add edx,ebp + ; 20_39 33 + mov ebp,edi + xor ecx,DWORD [12+esp] + xor ebp,esi + xor 
ecx,DWORD [36+esp] + xor ebp,eax + xor ecx,DWORD [56+esp] + rol ecx,1 + add ebx,ebp + ror edi,2 + mov ebp,edx + rol ebp,5 + mov DWORD [4+esp],ecx + lea ecx,[1859775393+ebx*1+ecx] + mov ebx,DWORD [8+esp] + add ecx,ebp + ; 20_39 34 + mov ebp,edx + xor ebx,DWORD [16+esp] + xor ebp,edi + xor ebx,DWORD [40+esp] + xor ebp,esi + xor ebx,DWORD [60+esp] + rol ebx,1 + add eax,ebp + ror edx,2 + mov ebp,ecx + rol ebp,5 + mov DWORD [8+esp],ebx + lea ebx,[1859775393+eax*1+ebx] + mov eax,DWORD [12+esp] + add ebx,ebp + ; 20_39 35 + mov ebp,ecx + xor eax,DWORD [20+esp] + xor ebp,edx + xor eax,DWORD [44+esp] + xor ebp,edi + xor eax,DWORD [esp] + rol eax,1 + add esi,ebp + ror ecx,2 + mov ebp,ebx + rol ebp,5 + mov DWORD [12+esp],eax + lea eax,[1859775393+esi*1+eax] + mov esi,DWORD [16+esp] + add eax,ebp + ; 20_39 36 + mov ebp,ebx + xor esi,DWORD [24+esp] + xor ebp,ecx + xor esi,DWORD [48+esp] + xor ebp,edx + xor esi,DWORD [4+esp] + rol esi,1 + add edi,ebp + ror ebx,2 + mov ebp,eax + rol ebp,5 + mov DWORD [16+esp],esi + lea esi,[1859775393+edi*1+esi] + mov edi,DWORD [20+esp] + add esi,ebp + ; 20_39 37 + mov ebp,eax + xor edi,DWORD [28+esp] + xor ebp,ebx + xor edi,DWORD [52+esp] + xor ebp,ecx + xor edi,DWORD [8+esp] + rol edi,1 + add edx,ebp + ror eax,2 + mov ebp,esi + rol ebp,5 + mov DWORD [20+esp],edi + lea edi,[1859775393+edx*1+edi] + mov edx,DWORD [24+esp] + add edi,ebp + ; 20_39 38 + mov ebp,esi + xor edx,DWORD [32+esp] + xor ebp,eax + xor edx,DWORD [56+esp] + xor ebp,ebx + xor edx,DWORD [12+esp] + rol edx,1 + add ecx,ebp + ror esi,2 + mov ebp,edi + rol ebp,5 + mov DWORD [24+esp],edx + lea edx,[1859775393+ecx*1+edx] + mov ecx,DWORD [28+esp] + add edx,ebp + ; 20_39 39 + mov ebp,edi + xor ecx,DWORD [36+esp] + xor ebp,esi + xor ecx,DWORD [60+esp] + xor ebp,eax + xor ecx,DWORD [16+esp] + rol ecx,1 + add ebx,ebp + ror edi,2 + mov ebp,edx + rol ebp,5 + mov DWORD [28+esp],ecx + lea ecx,[1859775393+ebx*1+ecx] + mov ebx,DWORD [32+esp] + add ecx,ebp + ; 40_59 40 + mov ebp,edi + xor ebx,DWORD 
[40+esp] + xor ebp,esi + xor ebx,DWORD [esp] + and ebp,edx + xor ebx,DWORD [20+esp] + rol ebx,1 + add ebp,eax + ror edx,2 + mov eax,ecx + rol eax,5 + mov DWORD [32+esp],ebx + lea ebx,[2400959708+ebp*1+ebx] + mov ebp,edi + add ebx,eax + and ebp,esi + mov eax,DWORD [36+esp] + add ebx,ebp + ; 40_59 41 + mov ebp,edx + xor eax,DWORD [44+esp] + xor ebp,edi + xor eax,DWORD [4+esp] + and ebp,ecx + xor eax,DWORD [24+esp] + rol eax,1 + add ebp,esi + ror ecx,2 + mov esi,ebx + rol esi,5 + mov DWORD [36+esp],eax + lea eax,[2400959708+ebp*1+eax] + mov ebp,edx + add eax,esi + and ebp,edi + mov esi,DWORD [40+esp] + add eax,ebp + ; 40_59 42 + mov ebp,ecx + xor esi,DWORD [48+esp] + xor ebp,edx + xor esi,DWORD [8+esp] + and ebp,ebx + xor esi,DWORD [28+esp] + rol esi,1 + add ebp,edi + ror ebx,2 + mov edi,eax + rol edi,5 + mov DWORD [40+esp],esi + lea esi,[2400959708+ebp*1+esi] + mov ebp,ecx + add esi,edi + and ebp,edx + mov edi,DWORD [44+esp] + add esi,ebp + ; 40_59 43 + mov ebp,ebx + xor edi,DWORD [52+esp] + xor ebp,ecx + xor edi,DWORD [12+esp] + and ebp,eax + xor edi,DWORD [32+esp] + rol edi,1 + add ebp,edx + ror eax,2 + mov edx,esi + rol edx,5 + mov DWORD [44+esp],edi + lea edi,[2400959708+ebp*1+edi] + mov ebp,ebx + add edi,edx + and ebp,ecx + mov edx,DWORD [48+esp] + add edi,ebp + ; 40_59 44 + mov ebp,eax + xor edx,DWORD [56+esp] + xor ebp,ebx + xor edx,DWORD [16+esp] + and ebp,esi + xor edx,DWORD [36+esp] + rol edx,1 + add ebp,ecx + ror esi,2 + mov ecx,edi + rol ecx,5 + mov DWORD [48+esp],edx + lea edx,[2400959708+ebp*1+edx] + mov ebp,eax + add edx,ecx + and ebp,ebx + mov ecx,DWORD [52+esp] + add edx,ebp + ; 40_59 45 + mov ebp,esi + xor ecx,DWORD [60+esp] + xor ebp,eax + xor ecx,DWORD [20+esp] + and ebp,edi + xor ecx,DWORD [40+esp] + rol ecx,1 + add ebp,ebx + ror edi,2 + mov ebx,edx + rol ebx,5 + mov DWORD [52+esp],ecx + lea ecx,[2400959708+ebp*1+ecx] + mov ebp,esi + add ecx,ebx + and ebp,eax + mov ebx,DWORD [56+esp] + add ecx,ebp + ; 40_59 46 + mov ebp,edi + xor ebx,DWORD [esp] 
+ xor ebp,esi + xor ebx,DWORD [24+esp] + and ebp,edx + xor ebx,DWORD [44+esp] + rol ebx,1 + add ebp,eax + ror edx,2 + mov eax,ecx + rol eax,5 + mov DWORD [56+esp],ebx + lea ebx,[2400959708+ebp*1+ebx] + mov ebp,edi + add ebx,eax + and ebp,esi + mov eax,DWORD [60+esp] + add ebx,ebp + ; 40_59 47 + mov ebp,edx + xor eax,DWORD [4+esp] + xor ebp,edi + xor eax,DWORD [28+esp] + and ebp,ecx + xor eax,DWORD [48+esp] + rol eax,1 + add ebp,esi + ror ecx,2 + mov esi,ebx + rol esi,5 + mov DWORD [60+esp],eax + lea eax,[2400959708+ebp*1+eax] + mov ebp,edx + add eax,esi + and ebp,edi + mov esi,DWORD [esp] + add eax,ebp + ; 40_59 48 + mov ebp,ecx + xor esi,DWORD [8+esp] + xor ebp,edx + xor esi,DWORD [32+esp] + and ebp,ebx + xor esi,DWORD [52+esp] + rol esi,1 + add ebp,edi + ror ebx,2 + mov edi,eax + rol edi,5 + mov DWORD [esp],esi + lea esi,[2400959708+ebp*1+esi] + mov ebp,ecx + add esi,edi + and ebp,edx + mov edi,DWORD [4+esp] + add esi,ebp + ; 40_59 49 + mov ebp,ebx + xor edi,DWORD [12+esp] + xor ebp,ecx + xor edi,DWORD [36+esp] + and ebp,eax + xor edi,DWORD [56+esp] + rol edi,1 + add ebp,edx + ror eax,2 + mov edx,esi + rol edx,5 + mov DWORD [4+esp],edi + lea edi,[2400959708+ebp*1+edi] + mov ebp,ebx + add edi,edx + and ebp,ecx + mov edx,DWORD [8+esp] + add edi,ebp + ; 40_59 50 + mov ebp,eax + xor edx,DWORD [16+esp] + xor ebp,ebx + xor edx,DWORD [40+esp] + and ebp,esi + xor edx,DWORD [60+esp] + rol edx,1 + add ebp,ecx + ror esi,2 + mov ecx,edi + rol ecx,5 + mov DWORD [8+esp],edx + lea edx,[2400959708+ebp*1+edx] + mov ebp,eax + add edx,ecx + and ebp,ebx + mov ecx,DWORD [12+esp] + add edx,ebp + ; 40_59 51 + mov ebp,esi + xor ecx,DWORD [20+esp] + xor ebp,eax + xor ecx,DWORD [44+esp] + and ebp,edi + xor ecx,DWORD [esp] + rol ecx,1 + add ebp,ebx + ror edi,2 + mov ebx,edx + rol ebx,5 + mov DWORD [12+esp],ecx + lea ecx,[2400959708+ebp*1+ecx] + mov ebp,esi + add ecx,ebx + and ebp,eax + mov ebx,DWORD [16+esp] + add ecx,ebp + ; 40_59 52 + mov ebp,edi + xor ebx,DWORD [24+esp] + xor ebp,esi + 
xor ebx,DWORD [48+esp] + and ebp,edx + xor ebx,DWORD [4+esp] + rol ebx,1 + add ebp,eax + ror edx,2 + mov eax,ecx + rol eax,5 + mov DWORD [16+esp],ebx + lea ebx,[2400959708+ebp*1+ebx] + mov ebp,edi + add ebx,eax + and ebp,esi + mov eax,DWORD [20+esp] + add ebx,ebp + ; 40_59 53 + mov ebp,edx + xor eax,DWORD [28+esp] + xor ebp,edi + xor eax,DWORD [52+esp] + and ebp,ecx + xor eax,DWORD [8+esp] + rol eax,1 + add ebp,esi + ror ecx,2 + mov esi,ebx + rol esi,5 + mov DWORD [20+esp],eax + lea eax,[2400959708+ebp*1+eax] + mov ebp,edx + add eax,esi + and ebp,edi + mov esi,DWORD [24+esp] + add eax,ebp + ; 40_59 54 + mov ebp,ecx + xor esi,DWORD [32+esp] + xor ebp,edx + xor esi,DWORD [56+esp] + and ebp,ebx + xor esi,DWORD [12+esp] + rol esi,1 + add ebp,edi + ror ebx,2 + mov edi,eax + rol edi,5 + mov DWORD [24+esp],esi + lea esi,[2400959708+ebp*1+esi] + mov ebp,ecx + add esi,edi + and ebp,edx + mov edi,DWORD [28+esp] + add esi,ebp + ; 40_59 55 + mov ebp,ebx + xor edi,DWORD [36+esp] + xor ebp,ecx + xor edi,DWORD [60+esp] + and ebp,eax + xor edi,DWORD [16+esp] + rol edi,1 + add ebp,edx + ror eax,2 + mov edx,esi + rol edx,5 + mov DWORD [28+esp],edi + lea edi,[2400959708+ebp*1+edi] + mov ebp,ebx + add edi,edx + and ebp,ecx + mov edx,DWORD [32+esp] + add edi,ebp + ; 40_59 56 + mov ebp,eax + xor edx,DWORD [40+esp] + xor ebp,ebx + xor edx,DWORD [esp] + and ebp,esi + xor edx,DWORD [20+esp] + rol edx,1 + add ebp,ecx + ror esi,2 + mov ecx,edi + rol ecx,5 + mov DWORD [32+esp],edx + lea edx,[2400959708+ebp*1+edx] + mov ebp,eax + add edx,ecx + and ebp,ebx + mov ecx,DWORD [36+esp] + add edx,ebp + ; 40_59 57 + mov ebp,esi + xor ecx,DWORD [44+esp] + xor ebp,eax + xor ecx,DWORD [4+esp] + and ebp,edi + xor ecx,DWORD [24+esp] + rol ecx,1 + add ebp,ebx + ror edi,2 + mov ebx,edx + rol ebx,5 + mov DWORD [36+esp],ecx + lea ecx,[2400959708+ebp*1+ecx] + mov ebp,esi + add ecx,ebx + and ebp,eax + mov ebx,DWORD [40+esp] + add ecx,ebp + ; 40_59 58 + mov ebp,edi + xor ebx,DWORD [48+esp] + xor ebp,esi + xor 
ebx,DWORD [8+esp] + and ebp,edx + xor ebx,DWORD [28+esp] + rol ebx,1 + add ebp,eax + ror edx,2 + mov eax,ecx + rol eax,5 + mov DWORD [40+esp],ebx + lea ebx,[2400959708+ebp*1+ebx] + mov ebp,edi + add ebx,eax + and ebp,esi + mov eax,DWORD [44+esp] + add ebx,ebp + ; 40_59 59 + mov ebp,edx + xor eax,DWORD [52+esp] + xor ebp,edi + xor eax,DWORD [12+esp] + and ebp,ecx + xor eax,DWORD [32+esp] + rol eax,1 + add ebp,esi + ror ecx,2 + mov esi,ebx + rol esi,5 + mov DWORD [44+esp],eax + lea eax,[2400959708+ebp*1+eax] + mov ebp,edx + add eax,esi + and ebp,edi + mov esi,DWORD [48+esp] + add eax,ebp + ; 20_39 60 + mov ebp,ebx + xor esi,DWORD [56+esp] + xor ebp,ecx + xor esi,DWORD [16+esp] + xor ebp,edx + xor esi,DWORD [36+esp] + rol esi,1 + add edi,ebp + ror ebx,2 + mov ebp,eax + rol ebp,5 + mov DWORD [48+esp],esi + lea esi,[3395469782+edi*1+esi] + mov edi,DWORD [52+esp] + add esi,ebp + ; 20_39 61 + mov ebp,eax + xor edi,DWORD [60+esp] + xor ebp,ebx + xor edi,DWORD [20+esp] + xor ebp,ecx + xor edi,DWORD [40+esp] + rol edi,1 + add edx,ebp + ror eax,2 + mov ebp,esi + rol ebp,5 + mov DWORD [52+esp],edi + lea edi,[3395469782+edx*1+edi] + mov edx,DWORD [56+esp] + add edi,ebp + ; 20_39 62 + mov ebp,esi + xor edx,DWORD [esp] + xor ebp,eax + xor edx,DWORD [24+esp] + xor ebp,ebx + xor edx,DWORD [44+esp] + rol edx,1 + add ecx,ebp + ror esi,2 + mov ebp,edi + rol ebp,5 + mov DWORD [56+esp],edx + lea edx,[3395469782+ecx*1+edx] + mov ecx,DWORD [60+esp] + add edx,ebp + ; 20_39 63 + mov ebp,edi + xor ecx,DWORD [4+esp] + xor ebp,esi + xor ecx,DWORD [28+esp] + xor ebp,eax + xor ecx,DWORD [48+esp] + rol ecx,1 + add ebx,ebp + ror edi,2 + mov ebp,edx + rol ebp,5 + mov DWORD [60+esp],ecx + lea ecx,[3395469782+ebx*1+ecx] + mov ebx,DWORD [esp] + add ecx,ebp + ; 20_39 64 + mov ebp,edx + xor ebx,DWORD [8+esp] + xor ebp,edi + xor ebx,DWORD [32+esp] + xor ebp,esi + xor ebx,DWORD [52+esp] + rol ebx,1 + add eax,ebp + ror edx,2 + mov ebp,ecx + rol ebp,5 + mov DWORD [esp],ebx + lea ebx,[3395469782+eax*1+ebx] + 
mov eax,DWORD [4+esp] + add ebx,ebp + ; 20_39 65 + mov ebp,ecx + xor eax,DWORD [12+esp] + xor ebp,edx + xor eax,DWORD [36+esp] + xor ebp,edi + xor eax,DWORD [56+esp] + rol eax,1 + add esi,ebp + ror ecx,2 + mov ebp,ebx + rol ebp,5 + mov DWORD [4+esp],eax + lea eax,[3395469782+esi*1+eax] + mov esi,DWORD [8+esp] + add eax,ebp + ; 20_39 66 + mov ebp,ebx + xor esi,DWORD [16+esp] + xor ebp,ecx + xor esi,DWORD [40+esp] + xor ebp,edx + xor esi,DWORD [60+esp] + rol esi,1 + add edi,ebp + ror ebx,2 + mov ebp,eax + rol ebp,5 + mov DWORD [8+esp],esi + lea esi,[3395469782+edi*1+esi] + mov edi,DWORD [12+esp] + add esi,ebp + ; 20_39 67 + mov ebp,eax + xor edi,DWORD [20+esp] + xor ebp,ebx + xor edi,DWORD [44+esp] + xor ebp,ecx + xor edi,DWORD [esp] + rol edi,1 + add edx,ebp + ror eax,2 + mov ebp,esi + rol ebp,5 + mov DWORD [12+esp],edi + lea edi,[3395469782+edx*1+edi] + mov edx,DWORD [16+esp] + add edi,ebp + ; 20_39 68 + mov ebp,esi + xor edx,DWORD [24+esp] + xor ebp,eax + xor edx,DWORD [48+esp] + xor ebp,ebx + xor edx,DWORD [4+esp] + rol edx,1 + add ecx,ebp + ror esi,2 + mov ebp,edi + rol ebp,5 + mov DWORD [16+esp],edx + lea edx,[3395469782+ecx*1+edx] + mov ecx,DWORD [20+esp] + add edx,ebp + ; 20_39 69 + mov ebp,edi + xor ecx,DWORD [28+esp] + xor ebp,esi + xor ecx,DWORD [52+esp] + xor ebp,eax + xor ecx,DWORD [8+esp] + rol ecx,1 + add ebx,ebp + ror edi,2 + mov ebp,edx + rol ebp,5 + mov DWORD [20+esp],ecx + lea ecx,[3395469782+ebx*1+ecx] + mov ebx,DWORD [24+esp] + add ecx,ebp + ; 20_39 70 + mov ebp,edx + xor ebx,DWORD [32+esp] + xor ebp,edi + xor ebx,DWORD [56+esp] + xor ebp,esi + xor ebx,DWORD [12+esp] + rol ebx,1 + add eax,ebp + ror edx,2 + mov ebp,ecx + rol ebp,5 + mov DWORD [24+esp],ebx + lea ebx,[3395469782+eax*1+ebx] + mov eax,DWORD [28+esp] + add ebx,ebp + ; 20_39 71 + mov ebp,ecx + xor eax,DWORD [36+esp] + xor ebp,edx + xor eax,DWORD [60+esp] + xor ebp,edi + xor eax,DWORD [16+esp] + rol eax,1 + add esi,ebp + ror ecx,2 + mov ebp,ebx + rol ebp,5 + mov DWORD [28+esp],eax + lea 
eax,[3395469782+esi*1+eax] + mov esi,DWORD [32+esp] + add eax,ebp + ; 20_39 72 + mov ebp,ebx + xor esi,DWORD [40+esp] + xor ebp,ecx + xor esi,DWORD [esp] + xor ebp,edx + xor esi,DWORD [20+esp] + rol esi,1 + add edi,ebp + ror ebx,2 + mov ebp,eax + rol ebp,5 + mov DWORD [32+esp],esi + lea esi,[3395469782+edi*1+esi] + mov edi,DWORD [36+esp] + add esi,ebp + ; 20_39 73 + mov ebp,eax + xor edi,DWORD [44+esp] + xor ebp,ebx + xor edi,DWORD [4+esp] + xor ebp,ecx + xor edi,DWORD [24+esp] + rol edi,1 + add edx,ebp + ror eax,2 + mov ebp,esi + rol ebp,5 + mov DWORD [36+esp],edi + lea edi,[3395469782+edx*1+edi] + mov edx,DWORD [40+esp] + add edi,ebp + ; 20_39 74 + mov ebp,esi + xor edx,DWORD [48+esp] + xor ebp,eax + xor edx,DWORD [8+esp] + xor ebp,ebx + xor edx,DWORD [28+esp] + rol edx,1 + add ecx,ebp + ror esi,2 + mov ebp,edi + rol ebp,5 + mov DWORD [40+esp],edx + lea edx,[3395469782+ecx*1+edx] + mov ecx,DWORD [44+esp] + add edx,ebp + ; 20_39 75 + mov ebp,edi + xor ecx,DWORD [52+esp] + xor ebp,esi + xor ecx,DWORD [12+esp] + xor ebp,eax + xor ecx,DWORD [32+esp] + rol ecx,1 + add ebx,ebp + ror edi,2 + mov ebp,edx + rol ebp,5 + mov DWORD [44+esp],ecx + lea ecx,[3395469782+ebx*1+ecx] + mov ebx,DWORD [48+esp] + add ecx,ebp + ; 20_39 76 + mov ebp,edx + xor ebx,DWORD [56+esp] + xor ebp,edi + xor ebx,DWORD [16+esp] + xor ebp,esi + xor ebx,DWORD [36+esp] + rol ebx,1 + add eax,ebp + ror edx,2 + mov ebp,ecx + rol ebp,5 + mov DWORD [48+esp],ebx + lea ebx,[3395469782+eax*1+ebx] + mov eax,DWORD [52+esp] + add ebx,ebp + ; 20_39 77 + mov ebp,ecx + xor eax,DWORD [60+esp] + xor ebp,edx + xor eax,DWORD [20+esp] + xor ebp,edi + xor eax,DWORD [40+esp] + rol eax,1 + add esi,ebp + ror ecx,2 + mov ebp,ebx + rol ebp,5 + lea eax,[3395469782+esi*1+eax] + mov esi,DWORD [56+esp] + add eax,ebp + ; 20_39 78 + mov ebp,ebx + xor esi,DWORD [esp] + xor ebp,ecx + xor esi,DWORD [24+esp] + xor ebp,edx + xor esi,DWORD [44+esp] + rol esi,1 + add edi,ebp + ror ebx,2 + mov ebp,eax + rol ebp,5 + lea 
esi,[3395469782+edi*1+esi] + mov edi,DWORD [60+esp] + add esi,ebp + ; 20_39 79 + mov ebp,eax + xor edi,DWORD [4+esp] + xor ebp,ebx + xor edi,DWORD [28+esp] + xor ebp,ecx + xor edi,DWORD [48+esp] + rol edi,1 + add edx,ebp + ror eax,2 + mov ebp,esi + rol ebp,5 + lea edi,[3395469782+edx*1+edi] + add edi,ebp + mov ebp,DWORD [96+esp] + mov edx,DWORD [100+esp] + add edi,DWORD [ebp] + add esi,DWORD [4+ebp] + add eax,DWORD [8+ebp] + add ebx,DWORD [12+ebp] + add ecx,DWORD [16+ebp] + mov DWORD [ebp],edi + add edx,64 + mov DWORD [4+ebp],esi + cmp edx,DWORD [104+esp] + mov DWORD [8+ebp],eax + mov edi,ecx + mov DWORD [12+ebp],ebx + mov esi,edx + mov DWORD [16+ebp],ecx + jb NEAR L$000loop + add esp,76 + pop edi + pop esi + pop ebx + pop ebp + ret +global _sha1_block_data_order_ssse3 +align 16 +_sha1_block_data_order_ssse3: +L$_sha1_block_data_order_ssse3_begin: + push ebp + push ebx + push esi + push edi + call L$001pic_point +L$001pic_point: + pop ebp + lea ebp,[(L$K_XX_XX-L$001pic_point)+ebp] + movdqa xmm7,[ebp] + movdqa xmm0,[16+ebp] + movdqa xmm1,[32+ebp] + movdqa xmm2,[48+ebp] + movdqa xmm6,[64+ebp] + mov edi,DWORD [20+esp] + mov ebp,DWORD [24+esp] + mov edx,DWORD [28+esp] + mov esi,esp + sub esp,208 + and esp,-64 + movdqa [112+esp],xmm0 + movdqa [128+esp],xmm1 + movdqa [144+esp],xmm2 + shl edx,6 + movdqa [160+esp],xmm7 + add edx,ebp + movdqa [176+esp],xmm6 + add ebp,64 + mov DWORD [192+esp],edi + mov DWORD [196+esp],ebp + mov DWORD [200+esp],edx + mov DWORD [204+esp],esi + mov eax,DWORD [edi] + mov ebx,DWORD [4+edi] + mov ecx,DWORD [8+edi] + mov edx,DWORD [12+edi] + mov edi,DWORD [16+edi] + mov esi,ebx + movdqu xmm0,[ebp-64] + movdqu xmm1,[ebp-48] + movdqu xmm2,[ebp-32] + movdqu xmm3,[ebp-16] + pshufb xmm0,xmm6 + pshufb xmm1,xmm6 + pshufb xmm2,xmm6 + movdqa [96+esp],xmm7 + pshufb xmm3,xmm6 + paddd xmm0,xmm7 + paddd xmm1,xmm7 + paddd xmm2,xmm7 + movdqa [esp],xmm0 + psubd xmm0,xmm7 + movdqa [16+esp],xmm1 + psubd xmm1,xmm7 + movdqa [32+esp],xmm2 + mov ebp,ecx + psubd 
xmm2,xmm7 + xor ebp,edx + pshufd xmm4,xmm0,238 + and esi,ebp + jmp NEAR L$002loop +align 16 +L$002loop: + ror ebx,2 + xor esi,edx + mov ebp,eax + punpcklqdq xmm4,xmm1 + movdqa xmm6,xmm3 + add edi,DWORD [esp] + xor ebx,ecx + paddd xmm7,xmm3 + movdqa [64+esp],xmm0 + rol eax,5 + add edi,esi + psrldq xmm6,4 + and ebp,ebx + xor ebx,ecx + pxor xmm4,xmm0 + add edi,eax + ror eax,7 + pxor xmm6,xmm2 + xor ebp,ecx + mov esi,edi + add edx,DWORD [4+esp] + pxor xmm4,xmm6 + xor eax,ebx + rol edi,5 + movdqa [48+esp],xmm7 + add edx,ebp + and esi,eax + movdqa xmm0,xmm4 + xor eax,ebx + add edx,edi + ror edi,7 + movdqa xmm6,xmm4 + xor esi,ebx + pslldq xmm0,12 + paddd xmm4,xmm4 + mov ebp,edx + add ecx,DWORD [8+esp] + psrld xmm6,31 + xor edi,eax + rol edx,5 + movdqa xmm7,xmm0 + add ecx,esi + and ebp,edi + xor edi,eax + psrld xmm0,30 + add ecx,edx + ror edx,7 + por xmm4,xmm6 + xor ebp,eax + mov esi,ecx + add ebx,DWORD [12+esp] + pslld xmm7,2 + xor edx,edi + rol ecx,5 + pxor xmm4,xmm0 + movdqa xmm0,[96+esp] + add ebx,ebp + and esi,edx + pxor xmm4,xmm7 + pshufd xmm5,xmm1,238 + xor edx,edi + add ebx,ecx + ror ecx,7 + xor esi,edi + mov ebp,ebx + punpcklqdq xmm5,xmm2 + movdqa xmm7,xmm4 + add eax,DWORD [16+esp] + xor ecx,edx + paddd xmm0,xmm4 + movdqa [80+esp],xmm1 + rol ebx,5 + add eax,esi + psrldq xmm7,4 + and ebp,ecx + xor ecx,edx + pxor xmm5,xmm1 + add eax,ebx + ror ebx,7 + pxor xmm7,xmm3 + xor ebp,edx + mov esi,eax + add edi,DWORD [20+esp] + pxor xmm5,xmm7 + xor ebx,ecx + rol eax,5 + movdqa [esp],xmm0 + add edi,ebp + and esi,ebx + movdqa xmm1,xmm5 + xor ebx,ecx + add edi,eax + ror eax,7 + movdqa xmm7,xmm5 + xor esi,ecx + pslldq xmm1,12 + paddd xmm5,xmm5 + mov ebp,edi + add edx,DWORD [24+esp] + psrld xmm7,31 + xor eax,ebx + rol edi,5 + movdqa xmm0,xmm1 + add edx,esi + and ebp,eax + xor eax,ebx + psrld xmm1,30 + add edx,edi + ror edi,7 + por xmm5,xmm7 + xor ebp,ebx + mov esi,edx + add ecx,DWORD [28+esp] + pslld xmm0,2 + xor edi,eax + rol edx,5 + pxor xmm5,xmm1 + movdqa xmm1,[112+esp] + add 
ecx,ebp + and esi,edi + pxor xmm5,xmm0 + pshufd xmm6,xmm2,238 + xor edi,eax + add ecx,edx + ror edx,7 + xor esi,eax + mov ebp,ecx + punpcklqdq xmm6,xmm3 + movdqa xmm0,xmm5 + add ebx,DWORD [32+esp] + xor edx,edi + paddd xmm1,xmm5 + movdqa [96+esp],xmm2 + rol ecx,5 + add ebx,esi + psrldq xmm0,4 + and ebp,edx + xor edx,edi + pxor xmm6,xmm2 + add ebx,ecx + ror ecx,7 + pxor xmm0,xmm4 + xor ebp,edi + mov esi,ebx + add eax,DWORD [36+esp] + pxor xmm6,xmm0 + xor ecx,edx + rol ebx,5 + movdqa [16+esp],xmm1 + add eax,ebp + and esi,ecx + movdqa xmm2,xmm6 + xor ecx,edx + add eax,ebx + ror ebx,7 + movdqa xmm0,xmm6 + xor esi,edx + pslldq xmm2,12 + paddd xmm6,xmm6 + mov ebp,eax + add edi,DWORD [40+esp] + psrld xmm0,31 + xor ebx,ecx + rol eax,5 + movdqa xmm1,xmm2 + add edi,esi + and ebp,ebx + xor ebx,ecx + psrld xmm2,30 + add edi,eax + ror eax,7 + por xmm6,xmm0 + xor ebp,ecx + movdqa xmm0,[64+esp] + mov esi,edi + add edx,DWORD [44+esp] + pslld xmm1,2 + xor eax,ebx + rol edi,5 + pxor xmm6,xmm2 + movdqa xmm2,[112+esp] + add edx,ebp + and esi,eax + pxor xmm6,xmm1 + pshufd xmm7,xmm3,238 + xor eax,ebx + add edx,edi + ror edi,7 + xor esi,ebx + mov ebp,edx + punpcklqdq xmm7,xmm4 + movdqa xmm1,xmm6 + add ecx,DWORD [48+esp] + xor edi,eax + paddd xmm2,xmm6 + movdqa [64+esp],xmm3 + rol edx,5 + add ecx,esi + psrldq xmm1,4 + and ebp,edi + xor edi,eax + pxor xmm7,xmm3 + add ecx,edx + ror edx,7 + pxor xmm1,xmm5 + xor ebp,eax + mov esi,ecx + add ebx,DWORD [52+esp] + pxor xmm7,xmm1 + xor edx,edi + rol ecx,5 + movdqa [32+esp],xmm2 + add ebx,ebp + and esi,edx + movdqa xmm3,xmm7 + xor edx,edi + add ebx,ecx + ror ecx,7 + movdqa xmm1,xmm7 + xor esi,edi + pslldq xmm3,12 + paddd xmm7,xmm7 + mov ebp,ebx + add eax,DWORD [56+esp] + psrld xmm1,31 + xor ecx,edx + rol ebx,5 + movdqa xmm2,xmm3 + add eax,esi + and ebp,ecx + xor ecx,edx + psrld xmm3,30 + add eax,ebx + ror ebx,7 + por xmm7,xmm1 + xor ebp,edx + movdqa xmm1,[80+esp] + mov esi,eax + add edi,DWORD [60+esp] + pslld xmm2,2 + xor ebx,ecx + rol eax,5 + pxor 
xmm7,xmm3 + movdqa xmm3,[112+esp] + add edi,ebp + and esi,ebx + pxor xmm7,xmm2 + pshufd xmm2,xmm6,238 + xor ebx,ecx + add edi,eax + ror eax,7 + pxor xmm0,xmm4 + punpcklqdq xmm2,xmm7 + xor esi,ecx + mov ebp,edi + add edx,DWORD [esp] + pxor xmm0,xmm1 + movdqa [80+esp],xmm4 + xor eax,ebx + rol edi,5 + movdqa xmm4,xmm3 + add edx,esi + paddd xmm3,xmm7 + and ebp,eax + pxor xmm0,xmm2 + xor eax,ebx + add edx,edi + ror edi,7 + xor ebp,ebx + movdqa xmm2,xmm0 + movdqa [48+esp],xmm3 + mov esi,edx + add ecx,DWORD [4+esp] + xor edi,eax + rol edx,5 + pslld xmm0,2 + add ecx,ebp + and esi,edi + psrld xmm2,30 + xor edi,eax + add ecx,edx + ror edx,7 + xor esi,eax + mov ebp,ecx + add ebx,DWORD [8+esp] + xor edx,edi + rol ecx,5 + por xmm0,xmm2 + add ebx,esi + and ebp,edx + movdqa xmm2,[96+esp] + xor edx,edi + add ebx,ecx + add eax,DWORD [12+esp] + xor ebp,edi + mov esi,ebx + pshufd xmm3,xmm7,238 + rol ebx,5 + add eax,ebp + xor esi,edx + ror ecx,7 + add eax,ebx + add edi,DWORD [16+esp] + pxor xmm1,xmm5 + punpcklqdq xmm3,xmm0 + xor esi,ecx + mov ebp,eax + rol eax,5 + pxor xmm1,xmm2 + movdqa [96+esp],xmm5 + add edi,esi + xor ebp,ecx + movdqa xmm5,xmm4 + ror ebx,7 + paddd xmm4,xmm0 + add edi,eax + pxor xmm1,xmm3 + add edx,DWORD [20+esp] + xor ebp,ebx + mov esi,edi + rol edi,5 + movdqa xmm3,xmm1 + movdqa [esp],xmm4 + add edx,ebp + xor esi,ebx + ror eax,7 + add edx,edi + pslld xmm1,2 + add ecx,DWORD [24+esp] + xor esi,eax + psrld xmm3,30 + mov ebp,edx + rol edx,5 + add ecx,esi + xor ebp,eax + ror edi,7 + add ecx,edx + por xmm1,xmm3 + add ebx,DWORD [28+esp] + xor ebp,edi + movdqa xmm3,[64+esp] + mov esi,ecx + rol ecx,5 + add ebx,ebp + xor esi,edi + ror edx,7 + pshufd xmm4,xmm0,238 + add ebx,ecx + add eax,DWORD [32+esp] + pxor xmm2,xmm6 + punpcklqdq xmm4,xmm1 + xor esi,edx + mov ebp,ebx + rol ebx,5 + pxor xmm2,xmm3 + movdqa [64+esp],xmm6 + add eax,esi + xor ebp,edx + movdqa xmm6,[128+esp] + ror ecx,7 + paddd xmm5,xmm1 + add eax,ebx + pxor xmm2,xmm4 + add edi,DWORD [36+esp] + xor ebp,ecx + mov 
esi,eax + rol eax,5 + movdqa xmm4,xmm2 + movdqa [16+esp],xmm5 + add edi,ebp + xor esi,ecx + ror ebx,7 + add edi,eax + pslld xmm2,2 + add edx,DWORD [40+esp] + xor esi,ebx + psrld xmm4,30 + mov ebp,edi + rol edi,5 + add edx,esi + xor ebp,ebx + ror eax,7 + add edx,edi + por xmm2,xmm4 + add ecx,DWORD [44+esp] + xor ebp,eax + movdqa xmm4,[80+esp] + mov esi,edx + rol edx,5 + add ecx,ebp + xor esi,eax + ror edi,7 + pshufd xmm5,xmm1,238 + add ecx,edx + add ebx,DWORD [48+esp] + pxor xmm3,xmm7 + punpcklqdq xmm5,xmm2 + xor esi,edi + mov ebp,ecx + rol ecx,5 + pxor xmm3,xmm4 + movdqa [80+esp],xmm7 + add ebx,esi + xor ebp,edi + movdqa xmm7,xmm6 + ror edx,7 + paddd xmm6,xmm2 + add ebx,ecx + pxor xmm3,xmm5 + add eax,DWORD [52+esp] + xor ebp,edx + mov esi,ebx + rol ebx,5 + movdqa xmm5,xmm3 + movdqa [32+esp],xmm6 + add eax,ebp + xor esi,edx + ror ecx,7 + add eax,ebx + pslld xmm3,2 + add edi,DWORD [56+esp] + xor esi,ecx + psrld xmm5,30 + mov ebp,eax + rol eax,5 + add edi,esi + xor ebp,ecx + ror ebx,7 + add edi,eax + por xmm3,xmm5 + add edx,DWORD [60+esp] + xor ebp,ebx + movdqa xmm5,[96+esp] + mov esi,edi + rol edi,5 + add edx,ebp + xor esi,ebx + ror eax,7 + pshufd xmm6,xmm2,238 + add edx,edi + add ecx,DWORD [esp] + pxor xmm4,xmm0 + punpcklqdq xmm6,xmm3 + xor esi,eax + mov ebp,edx + rol edx,5 + pxor xmm4,xmm5 + movdqa [96+esp],xmm0 + add ecx,esi + xor ebp,eax + movdqa xmm0,xmm7 + ror edi,7 + paddd xmm7,xmm3 + add ecx,edx + pxor xmm4,xmm6 + add ebx,DWORD [4+esp] + xor ebp,edi + mov esi,ecx + rol ecx,5 + movdqa xmm6,xmm4 + movdqa [48+esp],xmm7 + add ebx,ebp + xor esi,edi + ror edx,7 + add ebx,ecx + pslld xmm4,2 + add eax,DWORD [8+esp] + xor esi,edx + psrld xmm6,30 + mov ebp,ebx + rol ebx,5 + add eax,esi + xor ebp,edx + ror ecx,7 + add eax,ebx + por xmm4,xmm6 + add edi,DWORD [12+esp] + xor ebp,ecx + movdqa xmm6,[64+esp] + mov esi,eax + rol eax,5 + add edi,ebp + xor esi,ecx + ror ebx,7 + pshufd xmm7,xmm3,238 + add edi,eax + add edx,DWORD [16+esp] + pxor xmm5,xmm1 + punpcklqdq xmm7,xmm4 + 
xor esi,ebx + mov ebp,edi + rol edi,5 + pxor xmm5,xmm6 + movdqa [64+esp],xmm1 + add edx,esi + xor ebp,ebx + movdqa xmm1,xmm0 + ror eax,7 + paddd xmm0,xmm4 + add edx,edi + pxor xmm5,xmm7 + add ecx,DWORD [20+esp] + xor ebp,eax + mov esi,edx + rol edx,5 + movdqa xmm7,xmm5 + movdqa [esp],xmm0 + add ecx,ebp + xor esi,eax + ror edi,7 + add ecx,edx + pslld xmm5,2 + add ebx,DWORD [24+esp] + xor esi,edi + psrld xmm7,30 + mov ebp,ecx + rol ecx,5 + add ebx,esi + xor ebp,edi + ror edx,7 + add ebx,ecx + por xmm5,xmm7 + add eax,DWORD [28+esp] + movdqa xmm7,[80+esp] + ror ecx,7 + mov esi,ebx + xor ebp,edx + rol ebx,5 + pshufd xmm0,xmm4,238 + add eax,ebp + xor esi,ecx + xor ecx,edx + add eax,ebx + add edi,DWORD [32+esp] + pxor xmm6,xmm2 + punpcklqdq xmm0,xmm5 + and esi,ecx + xor ecx,edx + ror ebx,7 + pxor xmm6,xmm7 + movdqa [80+esp],xmm2 + mov ebp,eax + xor esi,ecx + rol eax,5 + movdqa xmm2,xmm1 + add edi,esi + paddd xmm1,xmm5 + xor ebp,ebx + pxor xmm6,xmm0 + xor ebx,ecx + add edi,eax + add edx,DWORD [36+esp] + and ebp,ebx + movdqa xmm0,xmm6 + movdqa [16+esp],xmm1 + xor ebx,ecx + ror eax,7 + mov esi,edi + xor ebp,ebx + rol edi,5 + pslld xmm6,2 + add edx,ebp + xor esi,eax + psrld xmm0,30 + xor eax,ebx + add edx,edi + add ecx,DWORD [40+esp] + and esi,eax + xor eax,ebx + ror edi,7 + por xmm6,xmm0 + mov ebp,edx + xor esi,eax + movdqa xmm0,[96+esp] + rol edx,5 + add ecx,esi + xor ebp,edi + xor edi,eax + add ecx,edx + pshufd xmm1,xmm5,238 + add ebx,DWORD [44+esp] + and ebp,edi + xor edi,eax + ror edx,7 + mov esi,ecx + xor ebp,edi + rol ecx,5 + add ebx,ebp + xor esi,edx + xor edx,edi + add ebx,ecx + add eax,DWORD [48+esp] + pxor xmm7,xmm3 + punpcklqdq xmm1,xmm6 + and esi,edx + xor edx,edi + ror ecx,7 + pxor xmm7,xmm0 + movdqa [96+esp],xmm3 + mov ebp,ebx + xor esi,edx + rol ebx,5 + movdqa xmm3,[144+esp] + add eax,esi + paddd xmm2,xmm6 + xor ebp,ecx + pxor xmm7,xmm1 + xor ecx,edx + add eax,ebx + add edi,DWORD [52+esp] + and ebp,ecx + movdqa xmm1,xmm7 + movdqa [32+esp],xmm2 + xor ecx,edx + 
ror ebx,7 + mov esi,eax + xor ebp,ecx + rol eax,5 + pslld xmm7,2 + add edi,ebp + xor esi,ebx + psrld xmm1,30 + xor ebx,ecx + add edi,eax + add edx,DWORD [56+esp] + and esi,ebx + xor ebx,ecx + ror eax,7 + por xmm7,xmm1 + mov ebp,edi + xor esi,ebx + movdqa xmm1,[64+esp] + rol edi,5 + add edx,esi + xor ebp,eax + xor eax,ebx + add edx,edi + pshufd xmm2,xmm6,238 + add ecx,DWORD [60+esp] + and ebp,eax + xor eax,ebx + ror edi,7 + mov esi,edx + xor ebp,eax + rol edx,5 + add ecx,ebp + xor esi,edi + xor edi,eax + add ecx,edx + add ebx,DWORD [esp] + pxor xmm0,xmm4 + punpcklqdq xmm2,xmm7 + and esi,edi + xor edi,eax + ror edx,7 + pxor xmm0,xmm1 + movdqa [64+esp],xmm4 + mov ebp,ecx + xor esi,edi + rol ecx,5 + movdqa xmm4,xmm3 + add ebx,esi + paddd xmm3,xmm7 + xor ebp,edx + pxor xmm0,xmm2 + xor edx,edi + add ebx,ecx + add eax,DWORD [4+esp] + and ebp,edx + movdqa xmm2,xmm0 + movdqa [48+esp],xmm3 + xor edx,edi + ror ecx,7 + mov esi,ebx + xor ebp,edx + rol ebx,5 + pslld xmm0,2 + add eax,ebp + xor esi,ecx + psrld xmm2,30 + xor ecx,edx + add eax,ebx + add edi,DWORD [8+esp] + and esi,ecx + xor ecx,edx + ror ebx,7 + por xmm0,xmm2 + mov ebp,eax + xor esi,ecx + movdqa xmm2,[80+esp] + rol eax,5 + add edi,esi + xor ebp,ebx + xor ebx,ecx + add edi,eax + pshufd xmm3,xmm7,238 + add edx,DWORD [12+esp] + and ebp,ebx + xor ebx,ecx + ror eax,7 + mov esi,edi + xor ebp,ebx + rol edi,5 + add edx,ebp + xor esi,eax + xor eax,ebx + add edx,edi + add ecx,DWORD [16+esp] + pxor xmm1,xmm5 + punpcklqdq xmm3,xmm0 + and esi,eax + xor eax,ebx + ror edi,7 + pxor xmm1,xmm2 + movdqa [80+esp],xmm5 + mov ebp,edx + xor esi,eax + rol edx,5 + movdqa xmm5,xmm4 + add ecx,esi + paddd xmm4,xmm0 + xor ebp,edi + pxor xmm1,xmm3 + xor edi,eax + add ecx,edx + add ebx,DWORD [20+esp] + and ebp,edi + movdqa xmm3,xmm1 + movdqa [esp],xmm4 + xor edi,eax + ror edx,7 + mov esi,ecx + xor ebp,edi + rol ecx,5 + pslld xmm1,2 + add ebx,ebp + xor esi,edx + psrld xmm3,30 + xor edx,edi + add ebx,ecx + add eax,DWORD [24+esp] + and esi,edx + xor 
edx,edi + ror ecx,7 + por xmm1,xmm3 + mov ebp,ebx + xor esi,edx + movdqa xmm3,[96+esp] + rol ebx,5 + add eax,esi + xor ebp,ecx + xor ecx,edx + add eax,ebx + pshufd xmm4,xmm0,238 + add edi,DWORD [28+esp] + and ebp,ecx + xor ecx,edx + ror ebx,7 + mov esi,eax + xor ebp,ecx + rol eax,5 + add edi,ebp + xor esi,ebx + xor ebx,ecx + add edi,eax + add edx,DWORD [32+esp] + pxor xmm2,xmm6 + punpcklqdq xmm4,xmm1 + and esi,ebx + xor ebx,ecx + ror eax,7 + pxor xmm2,xmm3 + movdqa [96+esp],xmm6 + mov ebp,edi + xor esi,ebx + rol edi,5 + movdqa xmm6,xmm5 + add edx,esi + paddd xmm5,xmm1 + xor ebp,eax + pxor xmm2,xmm4 + xor eax,ebx + add edx,edi + add ecx,DWORD [36+esp] + and ebp,eax + movdqa xmm4,xmm2 + movdqa [16+esp],xmm5 + xor eax,ebx + ror edi,7 + mov esi,edx + xor ebp,eax + rol edx,5 + pslld xmm2,2 + add ecx,ebp + xor esi,edi + psrld xmm4,30 + xor edi,eax + add ecx,edx + add ebx,DWORD [40+esp] + and esi,edi + xor edi,eax + ror edx,7 + por xmm2,xmm4 + mov ebp,ecx + xor esi,edi + movdqa xmm4,[64+esp] + rol ecx,5 + add ebx,esi + xor ebp,edx + xor edx,edi + add ebx,ecx + pshufd xmm5,xmm1,238 + add eax,DWORD [44+esp] + and ebp,edx + xor edx,edi + ror ecx,7 + mov esi,ebx + xor ebp,edx + rol ebx,5 + add eax,ebp + xor esi,edx + add eax,ebx + add edi,DWORD [48+esp] + pxor xmm3,xmm7 + punpcklqdq xmm5,xmm2 + xor esi,ecx + mov ebp,eax + rol eax,5 + pxor xmm3,xmm4 + movdqa [64+esp],xmm7 + add edi,esi + xor ebp,ecx + movdqa xmm7,xmm6 + ror ebx,7 + paddd xmm6,xmm2 + add edi,eax + pxor xmm3,xmm5 + add edx,DWORD [52+esp] + xor ebp,ebx + mov esi,edi + rol edi,5 + movdqa xmm5,xmm3 + movdqa [32+esp],xmm6 + add edx,ebp + xor esi,ebx + ror eax,7 + add edx,edi + pslld xmm3,2 + add ecx,DWORD [56+esp] + xor esi,eax + psrld xmm5,30 + mov ebp,edx + rol edx,5 + add ecx,esi + xor ebp,eax + ror edi,7 + add ecx,edx + por xmm3,xmm5 + add ebx,DWORD [60+esp] + xor ebp,edi + mov esi,ecx + rol ecx,5 + add ebx,ebp + xor esi,edi + ror edx,7 + add ebx,ecx + add eax,DWORD [esp] + xor esi,edx + mov ebp,ebx + rol ebx,5 
+ add eax,esi + xor ebp,edx + ror ecx,7 + paddd xmm7,xmm3 + add eax,ebx + add edi,DWORD [4+esp] + xor ebp,ecx + mov esi,eax + movdqa [48+esp],xmm7 + rol eax,5 + add edi,ebp + xor esi,ecx + ror ebx,7 + add edi,eax + add edx,DWORD [8+esp] + xor esi,ebx + mov ebp,edi + rol edi,5 + add edx,esi + xor ebp,ebx + ror eax,7 + add edx,edi + add ecx,DWORD [12+esp] + xor ebp,eax + mov esi,edx + rol edx,5 + add ecx,ebp + xor esi,eax + ror edi,7 + add ecx,edx + mov ebp,DWORD [196+esp] + cmp ebp,DWORD [200+esp] + je NEAR L$003done + movdqa xmm7,[160+esp] + movdqa xmm6,[176+esp] + movdqu xmm0,[ebp] + movdqu xmm1,[16+ebp] + movdqu xmm2,[32+ebp] + movdqu xmm3,[48+ebp] + add ebp,64 + pshufb xmm0,xmm6 + mov DWORD [196+esp],ebp + movdqa [96+esp],xmm7 + add ebx,DWORD [16+esp] + xor esi,edi + mov ebp,ecx + rol ecx,5 + add ebx,esi + xor ebp,edi + ror edx,7 + pshufb xmm1,xmm6 + add ebx,ecx + add eax,DWORD [20+esp] + xor ebp,edx + mov esi,ebx + paddd xmm0,xmm7 + rol ebx,5 + add eax,ebp + xor esi,edx + ror ecx,7 + movdqa [esp],xmm0 + add eax,ebx + add edi,DWORD [24+esp] + xor esi,ecx + mov ebp,eax + psubd xmm0,xmm7 + rol eax,5 + add edi,esi + xor ebp,ecx + ror ebx,7 + add edi,eax + add edx,DWORD [28+esp] + xor ebp,ebx + mov esi,edi + rol edi,5 + add edx,ebp + xor esi,ebx + ror eax,7 + add edx,edi + add ecx,DWORD [32+esp] + xor esi,eax + mov ebp,edx + rol edx,5 + add ecx,esi + xor ebp,eax + ror edi,7 + pshufb xmm2,xmm6 + add ecx,edx + add ebx,DWORD [36+esp] + xor ebp,edi + mov esi,ecx + paddd xmm1,xmm7 + rol ecx,5 + add ebx,ebp + xor esi,edi + ror edx,7 + movdqa [16+esp],xmm1 + add ebx,ecx + add eax,DWORD [40+esp] + xor esi,edx + mov ebp,ebx + psubd xmm1,xmm7 + rol ebx,5 + add eax,esi + xor ebp,edx + ror ecx,7 + add eax,ebx + add edi,DWORD [44+esp] + xor ebp,ecx + mov esi,eax + rol eax,5 + add edi,ebp + xor esi,ecx + ror ebx,7 + add edi,eax + add edx,DWORD [48+esp] + xor esi,ebx + mov ebp,edi + rol edi,5 + add edx,esi + xor ebp,ebx + ror eax,7 + pshufb xmm3,xmm6 + add edx,edi + add ecx,DWORD 
[52+esp] + xor ebp,eax + mov esi,edx + paddd xmm2,xmm7 + rol edx,5 + add ecx,ebp + xor esi,eax + ror edi,7 + movdqa [32+esp],xmm2 + add ecx,edx + add ebx,DWORD [56+esp] + xor esi,edi + mov ebp,ecx + psubd xmm2,xmm7 + rol ecx,5 + add ebx,esi + xor ebp,edi + ror edx,7 + add ebx,ecx + add eax,DWORD [60+esp] + xor ebp,edx + mov esi,ebx + rol ebx,5 + add eax,ebp + ror ecx,7 + add eax,ebx + mov ebp,DWORD [192+esp] + add eax,DWORD [ebp] + add esi,DWORD [4+ebp] + add ecx,DWORD [8+ebp] + mov DWORD [ebp],eax + add edx,DWORD [12+ebp] + mov DWORD [4+ebp],esi + add edi,DWORD [16+ebp] + mov DWORD [8+ebp],ecx + mov ebx,ecx + mov DWORD [12+ebp],edx + xor ebx,edx + mov DWORD [16+ebp],edi + mov ebp,esi + pshufd xmm4,xmm0,238 + and esi,ebx + mov ebx,ebp + jmp NEAR L$002loop +align 16 +L$003done: + add ebx,DWORD [16+esp] + xor esi,edi + mov ebp,ecx + rol ecx,5 + add ebx,esi + xor ebp,edi + ror edx,7 + add ebx,ecx + add eax,DWORD [20+esp] + xor ebp,edx + mov esi,ebx + rol ebx,5 + add eax,ebp + xor esi,edx + ror ecx,7 + add eax,ebx + add edi,DWORD [24+esp] + xor esi,ecx + mov ebp,eax + rol eax,5 + add edi,esi + xor ebp,ecx + ror ebx,7 + add edi,eax + add edx,DWORD [28+esp] + xor ebp,ebx + mov esi,edi + rol edi,5 + add edx,ebp + xor esi,ebx + ror eax,7 + add edx,edi + add ecx,DWORD [32+esp] + xor esi,eax + mov ebp,edx + rol edx,5 + add ecx,esi + xor ebp,eax + ror edi,7 + add ecx,edx + add ebx,DWORD [36+esp] + xor ebp,edi + mov esi,ecx + rol ecx,5 + add ebx,ebp + xor esi,edi + ror edx,7 + add ebx,ecx + add eax,DWORD [40+esp] + xor esi,edx + mov ebp,ebx + rol ebx,5 + add eax,esi + xor ebp,edx + ror ecx,7 + add eax,ebx + add edi,DWORD [44+esp] + xor ebp,ecx + mov esi,eax + rol eax,5 + add edi,ebp + xor esi,ecx + ror ebx,7 + add edi,eax + add edx,DWORD [48+esp] + xor esi,ebx + mov ebp,edi + rol edi,5 + add edx,esi + xor ebp,ebx + ror eax,7 + add edx,edi + add ecx,DWORD [52+esp] + xor ebp,eax + mov esi,edx + rol edx,5 + add ecx,ebp + xor esi,eax + ror edi,7 + add ecx,edx + add ebx,DWORD 
[56+esp] + xor esi,edi + mov ebp,ecx + rol ecx,5 + add ebx,esi + xor ebp,edi + ror edx,7 + add ebx,ecx + add eax,DWORD [60+esp] + xor ebp,edx + mov esi,ebx + rol ebx,5 + add eax,ebp + ror ecx,7 + add eax,ebx + mov ebp,DWORD [192+esp] + add eax,DWORD [ebp] + mov esp,DWORD [204+esp] + add esi,DWORD [4+ebp] + add ecx,DWORD [8+ebp] + mov DWORD [ebp],eax + add edx,DWORD [12+ebp] + mov DWORD [4+ebp],esi + add edi,DWORD [16+ebp] + mov DWORD [8+ebp],ecx + mov DWORD [12+ebp],edx + mov DWORD [16+ebp],edi + pop edi + pop esi + pop ebx + pop ebp + ret +global _sha1_block_data_order_avx +align 16 +_sha1_block_data_order_avx: +L$_sha1_block_data_order_avx_begin: + push ebp + push ebx + push esi + push edi + call L$004pic_point +L$004pic_point: + pop ebp + lea ebp,[(L$K_XX_XX-L$004pic_point)+ebp] + vzeroall + vmovdqa xmm7,[ebp] + vmovdqa xmm0,[16+ebp] + vmovdqa xmm1,[32+ebp] + vmovdqa xmm2,[48+ebp] + vmovdqa xmm6,[64+ebp] + mov edi,DWORD [20+esp] + mov ebp,DWORD [24+esp] + mov edx,DWORD [28+esp] + mov esi,esp + sub esp,208 + and esp,-64 + vmovdqa [112+esp],xmm0 + vmovdqa [128+esp],xmm1 + vmovdqa [144+esp],xmm2 + shl edx,6 + vmovdqa [160+esp],xmm7 + add edx,ebp + vmovdqa [176+esp],xmm6 + add ebp,64 + mov DWORD [192+esp],edi + mov DWORD [196+esp],ebp + mov DWORD [200+esp],edx + mov DWORD [204+esp],esi + mov eax,DWORD [edi] + mov ebx,DWORD [4+edi] + mov ecx,DWORD [8+edi] + mov edx,DWORD [12+edi] + mov edi,DWORD [16+edi] + mov esi,ebx + vmovdqu xmm0,[ebp-64] + vmovdqu xmm1,[ebp-48] + vmovdqu xmm2,[ebp-32] + vmovdqu xmm3,[ebp-16] + vpshufb xmm0,xmm0,xmm6 + vpshufb xmm1,xmm1,xmm6 + vpshufb xmm2,xmm2,xmm6 + vmovdqa [96+esp],xmm7 + vpshufb xmm3,xmm3,xmm6 + vpaddd xmm4,xmm0,xmm7 + vpaddd xmm5,xmm1,xmm7 + vpaddd xmm6,xmm2,xmm7 + vmovdqa [esp],xmm4 + mov ebp,ecx + vmovdqa [16+esp],xmm5 + xor ebp,edx + vmovdqa [32+esp],xmm6 + and esi,ebp + jmp NEAR L$005loop +align 16 +L$005loop: + shrd ebx,ebx,2 + xor esi,edx + vpalignr xmm4,xmm1,xmm0,8 + mov ebp,eax + add edi,DWORD [esp] + vpaddd 
xmm7,xmm7,xmm3 + vmovdqa [64+esp],xmm0 + xor ebx,ecx + shld eax,eax,5 + vpsrldq xmm6,xmm3,4 + add edi,esi + and ebp,ebx + vpxor xmm4,xmm4,xmm0 + xor ebx,ecx + add edi,eax + vpxor xmm6,xmm6,xmm2 + shrd eax,eax,7 + xor ebp,ecx + vmovdqa [48+esp],xmm7 + mov esi,edi + add edx,DWORD [4+esp] + vpxor xmm4,xmm4,xmm6 + xor eax,ebx + shld edi,edi,5 + add edx,ebp + and esi,eax + vpsrld xmm6,xmm4,31 + xor eax,ebx + add edx,edi + shrd edi,edi,7 + xor esi,ebx + vpslldq xmm0,xmm4,12 + vpaddd xmm4,xmm4,xmm4 + mov ebp,edx + add ecx,DWORD [8+esp] + xor edi,eax + shld edx,edx,5 + vpsrld xmm7,xmm0,30 + vpor xmm4,xmm4,xmm6 + add ecx,esi + and ebp,edi + xor edi,eax + add ecx,edx + vpslld xmm0,xmm0,2 + shrd edx,edx,7 + xor ebp,eax + vpxor xmm4,xmm4,xmm7 + mov esi,ecx + add ebx,DWORD [12+esp] + xor edx,edi + shld ecx,ecx,5 + vpxor xmm4,xmm4,xmm0 + add ebx,ebp + and esi,edx + vmovdqa xmm0,[96+esp] + xor edx,edi + add ebx,ecx + shrd ecx,ecx,7 + xor esi,edi + vpalignr xmm5,xmm2,xmm1,8 + mov ebp,ebx + add eax,DWORD [16+esp] + vpaddd xmm0,xmm0,xmm4 + vmovdqa [80+esp],xmm1 + xor ecx,edx + shld ebx,ebx,5 + vpsrldq xmm7,xmm4,4 + add eax,esi + and ebp,ecx + vpxor xmm5,xmm5,xmm1 + xor ecx,edx + add eax,ebx + vpxor xmm7,xmm7,xmm3 + shrd ebx,ebx,7 + xor ebp,edx + vmovdqa [esp],xmm0 + mov esi,eax + add edi,DWORD [20+esp] + vpxor xmm5,xmm5,xmm7 + xor ebx,ecx + shld eax,eax,5 + add edi,ebp + and esi,ebx + vpsrld xmm7,xmm5,31 + xor ebx,ecx + add edi,eax + shrd eax,eax,7 + xor esi,ecx + vpslldq xmm1,xmm5,12 + vpaddd xmm5,xmm5,xmm5 + mov ebp,edi + add edx,DWORD [24+esp] + xor eax,ebx + shld edi,edi,5 + vpsrld xmm0,xmm1,30 + vpor xmm5,xmm5,xmm7 + add edx,esi + and ebp,eax + xor eax,ebx + add edx,edi + vpslld xmm1,xmm1,2 + shrd edi,edi,7 + xor ebp,ebx + vpxor xmm5,xmm5,xmm0 + mov esi,edx + add ecx,DWORD [28+esp] + xor edi,eax + shld edx,edx,5 + vpxor xmm5,xmm5,xmm1 + add ecx,ebp + and esi,edi + vmovdqa xmm1,[112+esp] + xor edi,eax + add ecx,edx + shrd edx,edx,7 + xor esi,eax + vpalignr xmm6,xmm3,xmm2,8 + mov 
ebp,ecx + add ebx,DWORD [32+esp] + vpaddd xmm1,xmm1,xmm5 + vmovdqa [96+esp],xmm2 + xor edx,edi + shld ecx,ecx,5 + vpsrldq xmm0,xmm5,4 + add ebx,esi + and ebp,edx + vpxor xmm6,xmm6,xmm2 + xor edx,edi + add ebx,ecx + vpxor xmm0,xmm0,xmm4 + shrd ecx,ecx,7 + xor ebp,edi + vmovdqa [16+esp],xmm1 + mov esi,ebx + add eax,DWORD [36+esp] + vpxor xmm6,xmm6,xmm0 + xor ecx,edx + shld ebx,ebx,5 + add eax,ebp + and esi,ecx + vpsrld xmm0,xmm6,31 + xor ecx,edx + add eax,ebx + shrd ebx,ebx,7 + xor esi,edx + vpslldq xmm2,xmm6,12 + vpaddd xmm6,xmm6,xmm6 + mov ebp,eax + add edi,DWORD [40+esp] + xor ebx,ecx + shld eax,eax,5 + vpsrld xmm1,xmm2,30 + vpor xmm6,xmm6,xmm0 + add edi,esi + and ebp,ebx + xor ebx,ecx + add edi,eax + vpslld xmm2,xmm2,2 + vmovdqa xmm0,[64+esp] + shrd eax,eax,7 + xor ebp,ecx + vpxor xmm6,xmm6,xmm1 + mov esi,edi + add edx,DWORD [44+esp] + xor eax,ebx + shld edi,edi,5 + vpxor xmm6,xmm6,xmm2 + add edx,ebp + and esi,eax + vmovdqa xmm2,[112+esp] + xor eax,ebx + add edx,edi + shrd edi,edi,7 + xor esi,ebx + vpalignr xmm7,xmm4,xmm3,8 + mov ebp,edx + add ecx,DWORD [48+esp] + vpaddd xmm2,xmm2,xmm6 + vmovdqa [64+esp],xmm3 + xor edi,eax + shld edx,edx,5 + vpsrldq xmm1,xmm6,4 + add ecx,esi + and ebp,edi + vpxor xmm7,xmm7,xmm3 + xor edi,eax + add ecx,edx + vpxor xmm1,xmm1,xmm5 + shrd edx,edx,7 + xor ebp,eax + vmovdqa [32+esp],xmm2 + mov esi,ecx + add ebx,DWORD [52+esp] + vpxor xmm7,xmm7,xmm1 + xor edx,edi + shld ecx,ecx,5 + add ebx,ebp + and esi,edx + vpsrld xmm1,xmm7,31 + xor edx,edi + add ebx,ecx + shrd ecx,ecx,7 + xor esi,edi + vpslldq xmm3,xmm7,12 + vpaddd xmm7,xmm7,xmm7 + mov ebp,ebx + add eax,DWORD [56+esp] + xor ecx,edx + shld ebx,ebx,5 + vpsrld xmm2,xmm3,30 + vpor xmm7,xmm7,xmm1 + add eax,esi + and ebp,ecx + xor ecx,edx + add eax,ebx + vpslld xmm3,xmm3,2 + vmovdqa xmm1,[80+esp] + shrd ebx,ebx,7 + xor ebp,edx + vpxor xmm7,xmm7,xmm2 + mov esi,eax + add edi,DWORD [60+esp] + xor ebx,ecx + shld eax,eax,5 + vpxor xmm7,xmm7,xmm3 + add edi,ebp + and esi,ebx + vmovdqa 
xmm3,[112+esp] + xor ebx,ecx + add edi,eax + vpalignr xmm2,xmm7,xmm6,8 + vpxor xmm0,xmm0,xmm4 + shrd eax,eax,7 + xor esi,ecx + mov ebp,edi + add edx,DWORD [esp] + vpxor xmm0,xmm0,xmm1 + vmovdqa [80+esp],xmm4 + xor eax,ebx + shld edi,edi,5 + vmovdqa xmm4,xmm3 + vpaddd xmm3,xmm3,xmm7 + add edx,esi + and ebp,eax + vpxor xmm0,xmm0,xmm2 + xor eax,ebx + add edx,edi + shrd edi,edi,7 + xor ebp,ebx + vpsrld xmm2,xmm0,30 + vmovdqa [48+esp],xmm3 + mov esi,edx + add ecx,DWORD [4+esp] + xor edi,eax + shld edx,edx,5 + vpslld xmm0,xmm0,2 + add ecx,ebp + and esi,edi + xor edi,eax + add ecx,edx + shrd edx,edx,7 + xor esi,eax + mov ebp,ecx + add ebx,DWORD [8+esp] + vpor xmm0,xmm0,xmm2 + xor edx,edi + shld ecx,ecx,5 + vmovdqa xmm2,[96+esp] + add ebx,esi + and ebp,edx + xor edx,edi + add ebx,ecx + add eax,DWORD [12+esp] + xor ebp,edi + mov esi,ebx + shld ebx,ebx,5 + add eax,ebp + xor esi,edx + shrd ecx,ecx,7 + add eax,ebx + vpalignr xmm3,xmm0,xmm7,8 + vpxor xmm1,xmm1,xmm5 + add edi,DWORD [16+esp] + xor esi,ecx + mov ebp,eax + shld eax,eax,5 + vpxor xmm1,xmm1,xmm2 + vmovdqa [96+esp],xmm5 + add edi,esi + xor ebp,ecx + vmovdqa xmm5,xmm4 + vpaddd xmm4,xmm4,xmm0 + shrd ebx,ebx,7 + add edi,eax + vpxor xmm1,xmm1,xmm3 + add edx,DWORD [20+esp] + xor ebp,ebx + mov esi,edi + shld edi,edi,5 + vpsrld xmm3,xmm1,30 + vmovdqa [esp],xmm4 + add edx,ebp + xor esi,ebx + shrd eax,eax,7 + add edx,edi + vpslld xmm1,xmm1,2 + add ecx,DWORD [24+esp] + xor esi,eax + mov ebp,edx + shld edx,edx,5 + add ecx,esi + xor ebp,eax + shrd edi,edi,7 + add ecx,edx + vpor xmm1,xmm1,xmm3 + add ebx,DWORD [28+esp] + xor ebp,edi + vmovdqa xmm3,[64+esp] + mov esi,ecx + shld ecx,ecx,5 + add ebx,ebp + xor esi,edi + shrd edx,edx,7 + add ebx,ecx + vpalignr xmm4,xmm1,xmm0,8 + vpxor xmm2,xmm2,xmm6 + add eax,DWORD [32+esp] + xor esi,edx + mov ebp,ebx + shld ebx,ebx,5 + vpxor xmm2,xmm2,xmm3 + vmovdqa [64+esp],xmm6 + add eax,esi + xor ebp,edx + vmovdqa xmm6,[128+esp] + vpaddd xmm5,xmm5,xmm1 + shrd ecx,ecx,7 + add eax,ebx + vpxor 
xmm2,xmm2,xmm4 + add edi,DWORD [36+esp] + xor ebp,ecx + mov esi,eax + shld eax,eax,5 + vpsrld xmm4,xmm2,30 + vmovdqa [16+esp],xmm5 + add edi,ebp + xor esi,ecx + shrd ebx,ebx,7 + add edi,eax + vpslld xmm2,xmm2,2 + add edx,DWORD [40+esp] + xor esi,ebx + mov ebp,edi + shld edi,edi,5 + add edx,esi + xor ebp,ebx + shrd eax,eax,7 + add edx,edi + vpor xmm2,xmm2,xmm4 + add ecx,DWORD [44+esp] + xor ebp,eax + vmovdqa xmm4,[80+esp] + mov esi,edx + shld edx,edx,5 + add ecx,ebp + xor esi,eax + shrd edi,edi,7 + add ecx,edx + vpalignr xmm5,xmm2,xmm1,8 + vpxor xmm3,xmm3,xmm7 + add ebx,DWORD [48+esp] + xor esi,edi + mov ebp,ecx + shld ecx,ecx,5 + vpxor xmm3,xmm3,xmm4 + vmovdqa [80+esp],xmm7 + add ebx,esi + xor ebp,edi + vmovdqa xmm7,xmm6 + vpaddd xmm6,xmm6,xmm2 + shrd edx,edx,7 + add ebx,ecx + vpxor xmm3,xmm3,xmm5 + add eax,DWORD [52+esp] + xor ebp,edx + mov esi,ebx + shld ebx,ebx,5 + vpsrld xmm5,xmm3,30 + vmovdqa [32+esp],xmm6 + add eax,ebp + xor esi,edx + shrd ecx,ecx,7 + add eax,ebx + vpslld xmm3,xmm3,2 + add edi,DWORD [56+esp] + xor esi,ecx + mov ebp,eax + shld eax,eax,5 + add edi,esi + xor ebp,ecx + shrd ebx,ebx,7 + add edi,eax + vpor xmm3,xmm3,xmm5 + add edx,DWORD [60+esp] + xor ebp,ebx + vmovdqa xmm5,[96+esp] + mov esi,edi + shld edi,edi,5 + add edx,ebp + xor esi,ebx + shrd eax,eax,7 + add edx,edi + vpalignr xmm6,xmm3,xmm2,8 + vpxor xmm4,xmm4,xmm0 + add ecx,DWORD [esp] + xor esi,eax + mov ebp,edx + shld edx,edx,5 + vpxor xmm4,xmm4,xmm5 + vmovdqa [96+esp],xmm0 + add ecx,esi + xor ebp,eax + vmovdqa xmm0,xmm7 + vpaddd xmm7,xmm7,xmm3 + shrd edi,edi,7 + add ecx,edx + vpxor xmm4,xmm4,xmm6 + add ebx,DWORD [4+esp] + xor ebp,edi + mov esi,ecx + shld ecx,ecx,5 + vpsrld xmm6,xmm4,30 + vmovdqa [48+esp],xmm7 + add ebx,ebp + xor esi,edi + shrd edx,edx,7 + add ebx,ecx + vpslld xmm4,xmm4,2 + add eax,DWORD [8+esp] + xor esi,edx + mov ebp,ebx + shld ebx,ebx,5 + add eax,esi + xor ebp,edx + shrd ecx,ecx,7 + add eax,ebx + vpor xmm4,xmm4,xmm6 + add edi,DWORD [12+esp] + xor ebp,ecx + vmovdqa 
xmm6,[64+esp] + mov esi,eax + shld eax,eax,5 + add edi,ebp + xor esi,ecx + shrd ebx,ebx,7 + add edi,eax + vpalignr xmm7,xmm4,xmm3,8 + vpxor xmm5,xmm5,xmm1 + add edx,DWORD [16+esp] + xor esi,ebx + mov ebp,edi + shld edi,edi,5 + vpxor xmm5,xmm5,xmm6 + vmovdqa [64+esp],xmm1 + add edx,esi + xor ebp,ebx + vmovdqa xmm1,xmm0 + vpaddd xmm0,xmm0,xmm4 + shrd eax,eax,7 + add edx,edi + vpxor xmm5,xmm5,xmm7 + add ecx,DWORD [20+esp] + xor ebp,eax + mov esi,edx + shld edx,edx,5 + vpsrld xmm7,xmm5,30 + vmovdqa [esp],xmm0 + add ecx,ebp + xor esi,eax + shrd edi,edi,7 + add ecx,edx + vpslld xmm5,xmm5,2 + add ebx,DWORD [24+esp] + xor esi,edi + mov ebp,ecx + shld ecx,ecx,5 + add ebx,esi + xor ebp,edi + shrd edx,edx,7 + add ebx,ecx + vpor xmm5,xmm5,xmm7 + add eax,DWORD [28+esp] + vmovdqa xmm7,[80+esp] + shrd ecx,ecx,7 + mov esi,ebx + xor ebp,edx + shld ebx,ebx,5 + add eax,ebp + xor esi,ecx + xor ecx,edx + add eax,ebx + vpalignr xmm0,xmm5,xmm4,8 + vpxor xmm6,xmm6,xmm2 + add edi,DWORD [32+esp] + and esi,ecx + xor ecx,edx + shrd ebx,ebx,7 + vpxor xmm6,xmm6,xmm7 + vmovdqa [80+esp],xmm2 + mov ebp,eax + xor esi,ecx + vmovdqa xmm2,xmm1 + vpaddd xmm1,xmm1,xmm5 + shld eax,eax,5 + add edi,esi + vpxor xmm6,xmm6,xmm0 + xor ebp,ebx + xor ebx,ecx + add edi,eax + add edx,DWORD [36+esp] + vpsrld xmm0,xmm6,30 + vmovdqa [16+esp],xmm1 + and ebp,ebx + xor ebx,ecx + shrd eax,eax,7 + mov esi,edi + vpslld xmm6,xmm6,2 + xor ebp,ebx + shld edi,edi,5 + add edx,ebp + xor esi,eax + xor eax,ebx + add edx,edi + add ecx,DWORD [40+esp] + and esi,eax + vpor xmm6,xmm6,xmm0 + xor eax,ebx + shrd edi,edi,7 + vmovdqa xmm0,[96+esp] + mov ebp,edx + xor esi,eax + shld edx,edx,5 + add ecx,esi + xor ebp,edi + xor edi,eax + add ecx,edx + add ebx,DWORD [44+esp] + and ebp,edi + xor edi,eax + shrd edx,edx,7 + mov esi,ecx + xor ebp,edi + shld ecx,ecx,5 + add ebx,ebp + xor esi,edx + xor edx,edi + add ebx,ecx + vpalignr xmm1,xmm6,xmm5,8 + vpxor xmm7,xmm7,xmm3 + add eax,DWORD [48+esp] + and esi,edx + xor edx,edi + shrd ecx,ecx,7 + vpxor 
xmm7,xmm7,xmm0 + vmovdqa [96+esp],xmm3 + mov ebp,ebx + xor esi,edx + vmovdqa xmm3,[144+esp] + vpaddd xmm2,xmm2,xmm6 + shld ebx,ebx,5 + add eax,esi + vpxor xmm7,xmm7,xmm1 + xor ebp,ecx + xor ecx,edx + add eax,ebx + add edi,DWORD [52+esp] + vpsrld xmm1,xmm7,30 + vmovdqa [32+esp],xmm2 + and ebp,ecx + xor ecx,edx + shrd ebx,ebx,7 + mov esi,eax + vpslld xmm7,xmm7,2 + xor ebp,ecx + shld eax,eax,5 + add edi,ebp + xor esi,ebx + xor ebx,ecx + add edi,eax + add edx,DWORD [56+esp] + and esi,ebx + vpor xmm7,xmm7,xmm1 + xor ebx,ecx + shrd eax,eax,7 + vmovdqa xmm1,[64+esp] + mov ebp,edi + xor esi,ebx + shld edi,edi,5 + add edx,esi + xor ebp,eax + xor eax,ebx + add edx,edi + add ecx,DWORD [60+esp] + and ebp,eax + xor eax,ebx + shrd edi,edi,7 + mov esi,edx + xor ebp,eax + shld edx,edx,5 + add ecx,ebp + xor esi,edi + xor edi,eax + add ecx,edx + vpalignr xmm2,xmm7,xmm6,8 + vpxor xmm0,xmm0,xmm4 + add ebx,DWORD [esp] + and esi,edi + xor edi,eax + shrd edx,edx,7 + vpxor xmm0,xmm0,xmm1 + vmovdqa [64+esp],xmm4 + mov ebp,ecx + xor esi,edi + vmovdqa xmm4,xmm3 + vpaddd xmm3,xmm3,xmm7 + shld ecx,ecx,5 + add ebx,esi + vpxor xmm0,xmm0,xmm2 + xor ebp,edx + xor edx,edi + add ebx,ecx + add eax,DWORD [4+esp] + vpsrld xmm2,xmm0,30 + vmovdqa [48+esp],xmm3 + and ebp,edx + xor edx,edi + shrd ecx,ecx,7 + mov esi,ebx + vpslld xmm0,xmm0,2 + xor ebp,edx + shld ebx,ebx,5 + add eax,ebp + xor esi,ecx + xor ecx,edx + add eax,ebx + add edi,DWORD [8+esp] + and esi,ecx + vpor xmm0,xmm0,xmm2 + xor ecx,edx + shrd ebx,ebx,7 + vmovdqa xmm2,[80+esp] + mov ebp,eax + xor esi,ecx + shld eax,eax,5 + add edi,esi + xor ebp,ebx + xor ebx,ecx + add edi,eax + add edx,DWORD [12+esp] + and ebp,ebx + xor ebx,ecx + shrd eax,eax,7 + mov esi,edi + xor ebp,ebx + shld edi,edi,5 + add edx,ebp + xor esi,eax + xor eax,ebx + add edx,edi + vpalignr xmm3,xmm0,xmm7,8 + vpxor xmm1,xmm1,xmm5 + add ecx,DWORD [16+esp] + and esi,eax + xor eax,ebx + shrd edi,edi,7 + vpxor xmm1,xmm1,xmm2 + vmovdqa [80+esp],xmm5 + mov ebp,edx + xor esi,eax + 
vmovdqa xmm5,xmm4 + vpaddd xmm4,xmm4,xmm0 + shld edx,edx,5 + add ecx,esi + vpxor xmm1,xmm1,xmm3 + xor ebp,edi + xor edi,eax + add ecx,edx + add ebx,DWORD [20+esp] + vpsrld xmm3,xmm1,30 + vmovdqa [esp],xmm4 + and ebp,edi + xor edi,eax + shrd edx,edx,7 + mov esi,ecx + vpslld xmm1,xmm1,2 + xor ebp,edi + shld ecx,ecx,5 + add ebx,ebp + xor esi,edx + xor edx,edi + add ebx,ecx + add eax,DWORD [24+esp] + and esi,edx + vpor xmm1,xmm1,xmm3 + xor edx,edi + shrd ecx,ecx,7 + vmovdqa xmm3,[96+esp] + mov ebp,ebx + xor esi,edx + shld ebx,ebx,5 + add eax,esi + xor ebp,ecx + xor ecx,edx + add eax,ebx + add edi,DWORD [28+esp] + and ebp,ecx + xor ecx,edx + shrd ebx,ebx,7 + mov esi,eax + xor ebp,ecx + shld eax,eax,5 + add edi,ebp + xor esi,ebx + xor ebx,ecx + add edi,eax + vpalignr xmm4,xmm1,xmm0,8 + vpxor xmm2,xmm2,xmm6 + add edx,DWORD [32+esp] + and esi,ebx + xor ebx,ecx + shrd eax,eax,7 + vpxor xmm2,xmm2,xmm3 + vmovdqa [96+esp],xmm6 + mov ebp,edi + xor esi,ebx + vmovdqa xmm6,xmm5 + vpaddd xmm5,xmm5,xmm1 + shld edi,edi,5 + add edx,esi + vpxor xmm2,xmm2,xmm4 + xor ebp,eax + xor eax,ebx + add edx,edi + add ecx,DWORD [36+esp] + vpsrld xmm4,xmm2,30 + vmovdqa [16+esp],xmm5 + and ebp,eax + xor eax,ebx + shrd edi,edi,7 + mov esi,edx + vpslld xmm2,xmm2,2 + xor ebp,eax + shld edx,edx,5 + add ecx,ebp + xor esi,edi + xor edi,eax + add ecx,edx + add ebx,DWORD [40+esp] + and esi,edi + vpor xmm2,xmm2,xmm4 + xor edi,eax + shrd edx,edx,7 + vmovdqa xmm4,[64+esp] + mov ebp,ecx + xor esi,edi + shld ecx,ecx,5 + add ebx,esi + xor ebp,edx + xor edx,edi + add ebx,ecx + add eax,DWORD [44+esp] + and ebp,edx + xor edx,edi + shrd ecx,ecx,7 + mov esi,ebx + xor ebp,edx + shld ebx,ebx,5 + add eax,ebp + xor esi,edx + add eax,ebx + vpalignr xmm5,xmm2,xmm1,8 + vpxor xmm3,xmm3,xmm7 + add edi,DWORD [48+esp] + xor esi,ecx + mov ebp,eax + shld eax,eax,5 + vpxor xmm3,xmm3,xmm4 + vmovdqa [64+esp],xmm7 + add edi,esi + xor ebp,ecx + vmovdqa xmm7,xmm6 + vpaddd xmm6,xmm6,xmm2 + shrd ebx,ebx,7 + add edi,eax + vpxor 
xmm3,xmm3,xmm5 + add edx,DWORD [52+esp] + xor ebp,ebx + mov esi,edi + shld edi,edi,5 + vpsrld xmm5,xmm3,30 + vmovdqa [32+esp],xmm6 + add edx,ebp + xor esi,ebx + shrd eax,eax,7 + add edx,edi + vpslld xmm3,xmm3,2 + add ecx,DWORD [56+esp] + xor esi,eax + mov ebp,edx + shld edx,edx,5 + add ecx,esi + xor ebp,eax + shrd edi,edi,7 + add ecx,edx + vpor xmm3,xmm3,xmm5 + add ebx,DWORD [60+esp] + xor ebp,edi + mov esi,ecx + shld ecx,ecx,5 + add ebx,ebp + xor esi,edi + shrd edx,edx,7 + add ebx,ecx + add eax,DWORD [esp] + vpaddd xmm7,xmm7,xmm3 + xor esi,edx + mov ebp,ebx + shld ebx,ebx,5 + add eax,esi + vmovdqa [48+esp],xmm7 + xor ebp,edx + shrd ecx,ecx,7 + add eax,ebx + add edi,DWORD [4+esp] + xor ebp,ecx + mov esi,eax + shld eax,eax,5 + add edi,ebp + xor esi,ecx + shrd ebx,ebx,7 + add edi,eax + add edx,DWORD [8+esp] + xor esi,ebx + mov ebp,edi + shld edi,edi,5 + add edx,esi + xor ebp,ebx + shrd eax,eax,7 + add edx,edi + add ecx,DWORD [12+esp] + xor ebp,eax + mov esi,edx + shld edx,edx,5 + add ecx,ebp + xor esi,eax + shrd edi,edi,7 + add ecx,edx + mov ebp,DWORD [196+esp] + cmp ebp,DWORD [200+esp] + je NEAR L$006done + vmovdqa xmm7,[160+esp] + vmovdqa xmm6,[176+esp] + vmovdqu xmm0,[ebp] + vmovdqu xmm1,[16+ebp] + vmovdqu xmm2,[32+ebp] + vmovdqu xmm3,[48+ebp] + add ebp,64 + vpshufb xmm0,xmm0,xmm6 + mov DWORD [196+esp],ebp + vmovdqa [96+esp],xmm7 + add ebx,DWORD [16+esp] + xor esi,edi + vpshufb xmm1,xmm1,xmm6 + mov ebp,ecx + shld ecx,ecx,5 + vpaddd xmm4,xmm0,xmm7 + add ebx,esi + xor ebp,edi + shrd edx,edx,7 + add ebx,ecx + vmovdqa [esp],xmm4 + add eax,DWORD [20+esp] + xor ebp,edx + mov esi,ebx + shld ebx,ebx,5 + add eax,ebp + xor esi,edx + shrd ecx,ecx,7 + add eax,ebx + add edi,DWORD [24+esp] + xor esi,ecx + mov ebp,eax + shld eax,eax,5 + add edi,esi + xor ebp,ecx + shrd ebx,ebx,7 + add edi,eax + add edx,DWORD [28+esp] + xor ebp,ebx + mov esi,edi + shld edi,edi,5 + add edx,ebp + xor esi,ebx + shrd eax,eax,7 + add edx,edi + add ecx,DWORD [32+esp] + xor esi,eax + vpshufb 
xmm2,xmm2,xmm6 + mov ebp,edx + shld edx,edx,5 + vpaddd xmm5,xmm1,xmm7 + add ecx,esi + xor ebp,eax + shrd edi,edi,7 + add ecx,edx + vmovdqa [16+esp],xmm5 + add ebx,DWORD [36+esp] + xor ebp,edi + mov esi,ecx + shld ecx,ecx,5 + add ebx,ebp + xor esi,edi + shrd edx,edx,7 + add ebx,ecx + add eax,DWORD [40+esp] + xor esi,edx + mov ebp,ebx + shld ebx,ebx,5 + add eax,esi + xor ebp,edx + shrd ecx,ecx,7 + add eax,ebx + add edi,DWORD [44+esp] + xor ebp,ecx + mov esi,eax + shld eax,eax,5 + add edi,ebp + xor esi,ecx + shrd ebx,ebx,7 + add edi,eax + add edx,DWORD [48+esp] + xor esi,ebx + vpshufb xmm3,xmm3,xmm6 + mov ebp,edi + shld edi,edi,5 + vpaddd xmm6,xmm2,xmm7 + add edx,esi + xor ebp,ebx + shrd eax,eax,7 + add edx,edi + vmovdqa [32+esp],xmm6 + add ecx,DWORD [52+esp] + xor ebp,eax + mov esi,edx + shld edx,edx,5 + add ecx,ebp + xor esi,eax + shrd edi,edi,7 + add ecx,edx + add ebx,DWORD [56+esp] + xor esi,edi + mov ebp,ecx + shld ecx,ecx,5 + add ebx,esi + xor ebp,edi + shrd edx,edx,7 + add ebx,ecx + add eax,DWORD [60+esp] + xor ebp,edx + mov esi,ebx + shld ebx,ebx,5 + add eax,ebp + shrd ecx,ecx,7 + add eax,ebx + mov ebp,DWORD [192+esp] + add eax,DWORD [ebp] + add esi,DWORD [4+ebp] + add ecx,DWORD [8+ebp] + mov DWORD [ebp],eax + add edx,DWORD [12+ebp] + mov DWORD [4+ebp],esi + add edi,DWORD [16+ebp] + mov ebx,ecx + mov DWORD [8+ebp],ecx + xor ebx,edx + mov DWORD [12+ebp],edx + mov DWORD [16+ebp],edi + mov ebp,esi + and esi,ebx + mov ebx,ebp + jmp NEAR L$005loop +align 16 +L$006done: + add ebx,DWORD [16+esp] + xor esi,edi + mov ebp,ecx + shld ecx,ecx,5 + add ebx,esi + xor ebp,edi + shrd edx,edx,7 + add ebx,ecx + add eax,DWORD [20+esp] + xor ebp,edx + mov esi,ebx + shld ebx,ebx,5 + add eax,ebp + xor esi,edx + shrd ecx,ecx,7 + add eax,ebx + add edi,DWORD [24+esp] + xor esi,ecx + mov ebp,eax + shld eax,eax,5 + add edi,esi + xor ebp,ecx + shrd ebx,ebx,7 + add edi,eax + add edx,DWORD [28+esp] + xor ebp,ebx + mov esi,edi + shld edi,edi,5 + add edx,ebp + xor esi,ebx + shrd eax,eax,7 + 
add edx,edi + add ecx,DWORD [32+esp] + xor esi,eax + mov ebp,edx + shld edx,edx,5 + add ecx,esi + xor ebp,eax + shrd edi,edi,7 + add ecx,edx + add ebx,DWORD [36+esp] + xor ebp,edi + mov esi,ecx + shld ecx,ecx,5 + add ebx,ebp + xor esi,edi + shrd edx,edx,7 + add ebx,ecx + add eax,DWORD [40+esp] + xor esi,edx + mov ebp,ebx + shld ebx,ebx,5 + add eax,esi + xor ebp,edx + shrd ecx,ecx,7 + add eax,ebx + add edi,DWORD [44+esp] + xor ebp,ecx + mov esi,eax + shld eax,eax,5 + add edi,ebp + xor esi,ecx + shrd ebx,ebx,7 + add edi,eax + add edx,DWORD [48+esp] + xor esi,ebx + mov ebp,edi + shld edi,edi,5 + add edx,esi + xor ebp,ebx + shrd eax,eax,7 + add edx,edi + add ecx,DWORD [52+esp] + xor ebp,eax + mov esi,edx + shld edx,edx,5 + add ecx,ebp + xor esi,eax + shrd edi,edi,7 + add ecx,edx + add ebx,DWORD [56+esp] + xor esi,edi + mov ebp,ecx + shld ecx,ecx,5 + add ebx,esi + xor ebp,edi + shrd edx,edx,7 + add ebx,ecx + add eax,DWORD [60+esp] + xor ebp,edx + mov esi,ebx + shld ebx,ebx,5 + add eax,ebp + shrd ecx,ecx,7 + add eax,ebx + vzeroall + mov ebp,DWORD [192+esp] + add eax,DWORD [ebp] + mov esp,DWORD [204+esp] + add esi,DWORD [4+ebp] + add ecx,DWORD [8+ebp] + mov DWORD [ebp],eax + add edx,DWORD [12+ebp] + mov DWORD [4+ebp],esi + add edi,DWORD [16+ebp] + mov DWORD [8+ebp],ecx + mov DWORD [12+ebp],edx + mov DWORD [16+ebp],edi + pop edi + pop esi + pop ebx + pop ebp + ret +align 64 +L$K_XX_XX: +dd 1518500249,1518500249,1518500249,1518500249 +dd 1859775393,1859775393,1859775393,1859775393 +dd 2400959708,2400959708,2400959708,2400959708 +dd 3395469782,3395469782,3395469782,3395469782 +dd 66051,67438087,134810123,202182159 +db 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 +db 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115 +db 102,111,114,109,32,102,111,114,32,120,56,54,44,32,67,82 +db 89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112 +db 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff 
--git a/third_party/boringssl/gen/bcm/sha1-armv4-large-linux.S b/third_party/boringssl/gen/bcm/sha1-armv4-large-linux.S new file mode 100644 index 00000000..25a1ea48 --- /dev/null +++ b/third_party/boringssl/gen/bcm/sha1-armv4-large-linux.S @@ -0,0 +1,1479 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__) +.text +#if defined(__thumb2__) +.syntax unified +.thumb +#else +.code 32 +#endif + +.globl sha1_block_data_order_nohw +.hidden sha1_block_data_order_nohw +.type sha1_block_data_order_nohw,%function + +.align 5 +sha1_block_data_order_nohw: + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + add r2,r1,r2,lsl#6 @ r2 to point at the end of r1 + ldmia r0,{r3,r4,r5,r6,r7} +.Lloop: + ldr r8,.LK_00_19 + mov r14,sp + sub sp,sp,#15*4 + mov r5,r5,ror#30 + mov r6,r6,ror#30 + mov r7,r7,ror#30 @ [6] +.L_00_15: +#if __ARM_ARCH<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r7,r8,r7,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r5,r6 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r7,r7,r3,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r7,r8,r7,ror#2 @ E+=K_00_19 + eor r10,r5,r6 @ F_xx_xx + add r7,r7,r3,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r4,r10,ror#2 + add r7,r7,r9 @ E+=X[i] + eor r10,r10,r6,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! 
+ add r7,r7,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r6,r8,r6,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r4,r5 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r6,r8,r6,ror#2 @ E+=K_00_19 + eor r10,r4,r5 @ F_xx_xx + add r6,r6,r7,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r3,r10,ror#2 + add r6,r6,r9 @ E+=X[i] + eor r10,r10,r5,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r6,r6,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r5,r8,r5,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r3,r4 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r5,r8,r5,ror#2 @ E+=K_00_19 + eor r10,r3,r4 @ F_xx_xx + add r5,r5,r6,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r7,r10,ror#2 + add r5,r5,r9 @ E+=X[i] + eor r10,r10,r4,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r5,r5,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r4,r8,r4,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r7,r3 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r4,r8,r4,ror#2 @ E+=K_00_19 + eor r10,r7,r3 @ F_xx_xx + add r4,r4,r5,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r6,r10,ror#2 + add r4,r4,r9 @ E+=X[i] + eor r10,r10,r3,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! 
+ add r4,r4,r10 @ E+=F_00_19(B,C,D) +#if __ARM_ARCH<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r3,r8,r3,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r6,r7 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r3,r8,r3,ror#2 @ E+=K_00_19 + eor r10,r6,r7 @ F_xx_xx + add r3,r3,r4,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r5,r10,ror#2 + add r3,r3,r9 @ E+=X[i] + eor r10,r10,r7,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r3,r3,r10 @ E+=F_00_19(B,C,D) +#if defined(__thumb2__) + mov r12,sp + teq r14,r12 +#else + teq r14,sp +#endif + bne .L_00_15 @ [((11+4)*5+2)*3] + sub sp,sp,#25*4 +#if __ARM_ARCH<7 + ldrb r10,[r1,#2] + ldrb r9,[r1,#3] + ldrb r11,[r1,#1] + add r7,r8,r7,ror#2 @ E+=K_00_19 + ldrb r12,[r1],#4 + orr r9,r9,r10,lsl#8 + eor r10,r5,r6 @ F_xx_xx + orr r9,r9,r11,lsl#16 + add r7,r7,r3,ror#27 @ E+=ROR(A,27) + orr r9,r9,r12,lsl#24 +#else + ldr r9,[r1],#4 @ handles unaligned + add r7,r8,r7,ror#2 @ E+=K_00_19 + eor r10,r5,r6 @ F_xx_xx + add r7,r7,r3,ror#27 @ E+=ROR(A,27) +#ifdef __ARMEL__ + rev r9,r9 @ byte swap +#endif +#endif + and r10,r4,r10,ror#2 + add r7,r7,r9 @ E+=X[i] + eor r10,r10,r6,ror#2 @ F_00_19(B,C,D) + str r9,[r14,#-4]! + add r7,r7,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r6,r8,r6,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r4,r5 @ F_xx_xx + mov r9,r9,ror#31 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! 
+ and r10,r3,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r6,r6,r9 @ E+=X[i] + eor r10,r10,r5,ror#2 @ F_00_19(B,C,D) + add r6,r6,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r5,r8,r5,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r3,r4 @ F_xx_xx + mov r9,r9,ror#31 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r7,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r5,r5,r9 @ E+=X[i] + eor r10,r10,r4,ror#2 @ F_00_19(B,C,D) + add r5,r5,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r4,r8,r4,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r7,r3 @ F_xx_xx + mov r9,r9,ror#31 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r6,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r4,r4,r9 @ E+=X[i] + eor r10,r10,r3,ror#2 @ F_00_19(B,C,D) + add r4,r4,r10 @ E+=F_00_19(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r3,r8,r3,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r6,r7 @ F_xx_xx + mov r9,r9,ror#31 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r5,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r3,r3,r9 @ E+=X[i] + eor r10,r10,r7,ror#2 @ F_00_19(B,C,D) + add r3,r3,r10 @ E+=F_00_19(B,C,D) + + ldr r8,.LK_20_39 @ [+15+16*4] + cmn sp,#0 @ [+3], clear carry to denote 20_39 +.L_20_39_or_60_79: + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r7,r8,r7,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r5,r6 @ F_xx_xx + mov r9,r9,ror#31 + add r7,r7,r3,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! 
+ eor r10,r4,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r7,r7,r9 @ E+=X[i] + add r7,r7,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r6,r8,r6,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r4,r5 @ F_xx_xx + mov r9,r9,ror#31 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r3,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r6,r6,r9 @ E+=X[i] + add r6,r6,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r5,r8,r5,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r3,r4 @ F_xx_xx + mov r9,r9,ror#31 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r7,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r5,r5,r9 @ E+=X[i] + add r5,r5,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r4,r8,r4,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r7,r3 @ F_xx_xx + mov r9,r9,ror#31 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + eor r10,r6,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r4,r4,r9 @ E+=X[i] + add r4,r4,r10 @ E+=F_20_39(B,C,D) + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r3,r8,r3,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r6,r7 @ F_xx_xx + mov r9,r9,ror#31 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! 
+ eor r10,r5,r10,ror#2 @ F_xx_xx + @ F_xx_xx + add r3,r3,r9 @ E+=X[i] + add r3,r3,r10 @ E+=F_20_39(B,C,D) +#if defined(__thumb2__) + mov r12,sp + teq r14,r12 +#else + teq r14,sp @ preserve carry +#endif + bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4] + bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes + + ldr r8,.LK_40_59 + sub sp,sp,#20*4 @ [+2] +.L_40_59: + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r7,r8,r7,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r5,r6 @ F_xx_xx + mov r9,r9,ror#31 + add r7,r7,r3,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r4,r10,ror#2 @ F_xx_xx + and r11,r5,r6 @ F_xx_xx + add r7,r7,r9 @ E+=X[i] + add r7,r7,r10 @ E+=F_40_59(B,C,D) + add r7,r7,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r6,r8,r6,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r4,r5 @ F_xx_xx + mov r9,r9,ror#31 + add r6,r6,r7,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r3,r10,ror#2 @ F_xx_xx + and r11,r4,r5 @ F_xx_xx + add r6,r6,r9 @ E+=X[i] + add r6,r6,r10 @ E+=F_40_59(B,C,D) + add r6,r6,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r5,r8,r5,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r3,r4 @ F_xx_xx + mov r9,r9,ror#31 + add r5,r5,r6,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r7,r10,ror#2 @ F_xx_xx + and r11,r3,r4 @ F_xx_xx + add r5,r5,r9 @ E+=X[i] + add r5,r5,r10 @ E+=F_40_59(B,C,D) + add r5,r5,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r4,r8,r4,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r7,r3 @ F_xx_xx + mov r9,r9,ror#31 + add r4,r4,r5,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! 
+ and r10,r6,r10,ror#2 @ F_xx_xx + and r11,r7,r3 @ F_xx_xx + add r4,r4,r9 @ E+=X[i] + add r4,r4,r10 @ E+=F_40_59(B,C,D) + add r4,r4,r11,ror#2 + ldr r9,[r14,#15*4] + ldr r10,[r14,#13*4] + ldr r11,[r14,#7*4] + add r3,r8,r3,ror#2 @ E+=K_xx_xx + ldr r12,[r14,#2*4] + eor r9,r9,r10 + eor r11,r11,r12 @ 1 cycle stall + eor r10,r6,r7 @ F_xx_xx + mov r9,r9,ror#31 + add r3,r3,r4,ror#27 @ E+=ROR(A,27) + eor r9,r9,r11,ror#31 + str r9,[r14,#-4]! + and r10,r5,r10,ror#2 @ F_xx_xx + and r11,r6,r7 @ F_xx_xx + add r3,r3,r9 @ E+=X[i] + add r3,r3,r10 @ E+=F_40_59(B,C,D) + add r3,r3,r11,ror#2 +#if defined(__thumb2__) + mov r12,sp + teq r14,r12 +#else + teq r14,sp +#endif + bne .L_40_59 @ [+((12+5)*5+2)*4] + + ldr r8,.LK_60_79 + sub sp,sp,#20*4 + cmp sp,#0 @ set carry to denote 60_79 + b .L_20_39_or_60_79 @ [+4], spare 300 bytes +.L_done: + add sp,sp,#80*4 @ "deallocate" stack frame + ldmia r0,{r8,r9,r10,r11,r12} + add r3,r8,r3 + add r4,r9,r4 + add r5,r10,r5,ror#2 + add r6,r11,r6,ror#2 + add r7,r12,r7,ror#2 + stmia r0,{r3,r4,r5,r6,r7} + teq r1,r2 + bne .Lloop @ [+18], total 1307 + +#if __ARM_ARCH>=5 + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} +#else + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet +.word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +.size sha1_block_data_order_nohw,.-sha1_block_data_order_nohw + +.align 5 +.LK_00_19:.word 0x5a827999 +.LK_20_39:.word 0x6ed9eba1 +.LK_40_59:.word 0x8f1bbcdc +.LK_60_79:.word 0xca62c1d6 +.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,47,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 5 +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.globl sha1_block_data_order_neon +.hidden sha1_block_data_order_neon +.type sha1_block_data_order_neon,%function +.align 4 +sha1_block_data_order_neon: 
+ stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + add r2,r1,r2,lsl#6 @ r2 to point at the end of r1 + @ dmb @ errata #451034 on early Cortex A8 + @ vstmdb sp!,{d8-d15} @ ABI specification says so + mov r14,sp + sub r12,sp,#64 + adr r8,.LK_00_19 + bic r12,r12,#15 @ align for 128-bit stores + + ldmia r0,{r3,r4,r5,r6,r7} @ load context + mov sp,r12 @ alloca + + vld1.8 {q0,q1},[r1]! @ handles unaligned + veor q15,q15,q15 + vld1.8 {q2,q3},[r1]! + vld1.32 {d28[],d29[]},[r8,:32]! @ load K_00_19 + vrev32.8 q0,q0 @ yes, even on + vrev32.8 q1,q1 @ big-endian... + vrev32.8 q2,q2 + vadd.i32 q8,q0,q14 + vrev32.8 q3,q3 + vadd.i32 q9,q1,q14 + vst1.32 {q8},[r12,:128]! + vadd.i32 q10,q2,q14 + vst1.32 {q9},[r12,:128]! + vst1.32 {q10},[r12,:128]! + ldr r9,[sp] @ big RAW stall + +.Loop_neon: + vext.8 q8,q0,q1,#8 + bic r10,r6,r4 + add r7,r7,r9 + and r11,r5,r4 + vadd.i32 q13,q3,q14 + ldr r9,[sp,#4] + add r7,r7,r3,ror#27 + vext.8 q12,q3,q15,#4 + eor r11,r11,r10 + mov r4,r4,ror#2 + add r7,r7,r11 + veor q8,q8,q0 + bic r10,r5,r3 + add r6,r6,r9 + veor q12,q12,q2 + and r11,r4,r3 + ldr r9,[sp,#8] + veor q12,q12,q8 + add r6,r6,r7,ror#27 + eor r11,r11,r10 + vst1.32 {q13},[r12,:128]! + sub r12,r12,#64 + mov r3,r3,ror#2 + add r6,r6,r11 + vext.8 q13,q15,q12,#4 + bic r10,r4,r7 + add r5,r5,r9 + vadd.i32 q8,q12,q12 + and r11,r3,r7 + ldr r9,[sp,#12] + vsri.32 q8,q12,#31 + add r5,r5,r6,ror#27 + eor r11,r11,r10 + mov r7,r7,ror#2 + vshr.u32 q12,q13,#30 + add r5,r5,r11 + bic r10,r3,r6 + vshl.u32 q13,q13,#2 + add r4,r4,r9 + and r11,r7,r6 + veor q8,q8,q12 + ldr r9,[sp,#16] + add r4,r4,r5,ror#27 + veor q8,q8,q13 + eor r11,r11,r10 + mov r6,r6,ror#2 + add r4,r4,r11 + vext.8 q9,q1,q2,#8 + bic r10,r7,r5 + add r3,r3,r9 + and r11,r6,r5 + vadd.i32 q13,q8,q14 + ldr r9,[sp,#20] + vld1.32 {d28[],d29[]},[r8,:32]! 
+ add r3,r3,r4,ror#27 + vext.8 q12,q8,q15,#4 + eor r11,r11,r10 + mov r5,r5,ror#2 + add r3,r3,r11 + veor q9,q9,q1 + bic r10,r6,r4 + add r7,r7,r9 + veor q12,q12,q3 + and r11,r5,r4 + ldr r9,[sp,#24] + veor q12,q12,q9 + add r7,r7,r3,ror#27 + eor r11,r11,r10 + vst1.32 {q13},[r12,:128]! + mov r4,r4,ror#2 + add r7,r7,r11 + vext.8 q13,q15,q12,#4 + bic r10,r5,r3 + add r6,r6,r9 + vadd.i32 q9,q12,q12 + and r11,r4,r3 + ldr r9,[sp,#28] + vsri.32 q9,q12,#31 + add r6,r6,r7,ror#27 + eor r11,r11,r10 + mov r3,r3,ror#2 + vshr.u32 q12,q13,#30 + add r6,r6,r11 + bic r10,r4,r7 + vshl.u32 q13,q13,#2 + add r5,r5,r9 + and r11,r3,r7 + veor q9,q9,q12 + ldr r9,[sp,#32] + add r5,r5,r6,ror#27 + veor q9,q9,q13 + eor r11,r11,r10 + mov r7,r7,ror#2 + add r5,r5,r11 + vext.8 q10,q2,q3,#8 + bic r10,r3,r6 + add r4,r4,r9 + and r11,r7,r6 + vadd.i32 q13,q9,q14 + ldr r9,[sp,#36] + add r4,r4,r5,ror#27 + vext.8 q12,q9,q15,#4 + eor r11,r11,r10 + mov r6,r6,ror#2 + add r4,r4,r11 + veor q10,q10,q2 + bic r10,r7,r5 + add r3,r3,r9 + veor q12,q12,q8 + and r11,r6,r5 + ldr r9,[sp,#40] + veor q12,q12,q10 + add r3,r3,r4,ror#27 + eor r11,r11,r10 + vst1.32 {q13},[r12,:128]! + mov r5,r5,ror#2 + add r3,r3,r11 + vext.8 q13,q15,q12,#4 + bic r10,r6,r4 + add r7,r7,r9 + vadd.i32 q10,q12,q12 + and r11,r5,r4 + ldr r9,[sp,#44] + vsri.32 q10,q12,#31 + add r7,r7,r3,ror#27 + eor r11,r11,r10 + mov r4,r4,ror#2 + vshr.u32 q12,q13,#30 + add r7,r7,r11 + bic r10,r5,r3 + vshl.u32 q13,q13,#2 + add r6,r6,r9 + and r11,r4,r3 + veor q10,q10,q12 + ldr r9,[sp,#48] + add r6,r6,r7,ror#27 + veor q10,q10,q13 + eor r11,r11,r10 + mov r3,r3,ror#2 + add r6,r6,r11 + vext.8 q11,q3,q8,#8 + bic r10,r4,r7 + add r5,r5,r9 + and r11,r3,r7 + vadd.i32 q13,q10,q14 + ldr r9,[sp,#52] + add r5,r5,r6,ror#27 + vext.8 q12,q10,q15,#4 + eor r11,r11,r10 + mov r7,r7,ror#2 + add r5,r5,r11 + veor q11,q11,q3 + bic r10,r3,r6 + add r4,r4,r9 + veor q12,q12,q9 + and r11,r7,r6 + ldr r9,[sp,#56] + veor q12,q12,q11 + add r4,r4,r5,ror#27 + eor r11,r11,r10 + vst1.32 {q13},[r12,:128]! 
+ mov r6,r6,ror#2 + add r4,r4,r11 + vext.8 q13,q15,q12,#4 + bic r10,r7,r5 + add r3,r3,r9 + vadd.i32 q11,q12,q12 + and r11,r6,r5 + ldr r9,[sp,#60] + vsri.32 q11,q12,#31 + add r3,r3,r4,ror#27 + eor r11,r11,r10 + mov r5,r5,ror#2 + vshr.u32 q12,q13,#30 + add r3,r3,r11 + bic r10,r6,r4 + vshl.u32 q13,q13,#2 + add r7,r7,r9 + and r11,r5,r4 + veor q11,q11,q12 + ldr r9,[sp,#0] + add r7,r7,r3,ror#27 + veor q11,q11,q13 + eor r11,r11,r10 + mov r4,r4,ror#2 + add r7,r7,r11 + vext.8 q12,q10,q11,#8 + bic r10,r5,r3 + add r6,r6,r9 + and r11,r4,r3 + veor q0,q0,q8 + ldr r9,[sp,#4] + add r6,r6,r7,ror#27 + veor q0,q0,q1 + eor r11,r11,r10 + mov r3,r3,ror#2 + vadd.i32 q13,q11,q14 + add r6,r6,r11 + bic r10,r4,r7 + veor q12,q12,q0 + add r5,r5,r9 + and r11,r3,r7 + vshr.u32 q0,q12,#30 + ldr r9,[sp,#8] + add r5,r5,r6,ror#27 + vst1.32 {q13},[r12,:128]! + sub r12,r12,#64 + eor r11,r11,r10 + mov r7,r7,ror#2 + vsli.32 q0,q12,#2 + add r5,r5,r11 + bic r10,r3,r6 + add r4,r4,r9 + and r11,r7,r6 + ldr r9,[sp,#12] + add r4,r4,r5,ror#27 + eor r11,r11,r10 + mov r6,r6,ror#2 + add r4,r4,r11 + bic r10,r7,r5 + add r3,r3,r9 + and r11,r6,r5 + ldr r9,[sp,#16] + add r3,r3,r4,ror#27 + eor r11,r11,r10 + mov r5,r5,ror#2 + add r3,r3,r11 + vext.8 q12,q11,q0,#8 + eor r10,r4,r6 + add r7,r7,r9 + ldr r9,[sp,#20] + veor q1,q1,q9 + eor r11,r10,r5 + add r7,r7,r3,ror#27 + veor q1,q1,q2 + mov r4,r4,ror#2 + add r7,r7,r11 + vadd.i32 q13,q0,q14 + eor r10,r3,r5 + add r6,r6,r9 + veor q12,q12,q1 + ldr r9,[sp,#24] + eor r11,r10,r4 + vshr.u32 q1,q12,#30 + add r6,r6,r7,ror#27 + mov r3,r3,ror#2 + vst1.32 {q13},[r12,:128]! 
+ add r6,r6,r11 + eor r10,r7,r4 + vsli.32 q1,q12,#2 + add r5,r5,r9 + ldr r9,[sp,#28] + eor r11,r10,r3 + add r5,r5,r6,ror#27 + mov r7,r7,ror#2 + add r5,r5,r11 + eor r10,r6,r3 + add r4,r4,r9 + ldr r9,[sp,#32] + eor r11,r10,r7 + add r4,r4,r5,ror#27 + mov r6,r6,ror#2 + add r4,r4,r11 + vext.8 q12,q0,q1,#8 + eor r10,r5,r7 + add r3,r3,r9 + ldr r9,[sp,#36] + veor q2,q2,q10 + eor r11,r10,r6 + add r3,r3,r4,ror#27 + veor q2,q2,q3 + mov r5,r5,ror#2 + add r3,r3,r11 + vadd.i32 q13,q1,q14 + eor r10,r4,r6 + vld1.32 {d28[],d29[]},[r8,:32]! + add r7,r7,r9 + veor q12,q12,q2 + ldr r9,[sp,#40] + eor r11,r10,r5 + vshr.u32 q2,q12,#30 + add r7,r7,r3,ror#27 + mov r4,r4,ror#2 + vst1.32 {q13},[r12,:128]! + add r7,r7,r11 + eor r10,r3,r5 + vsli.32 q2,q12,#2 + add r6,r6,r9 + ldr r9,[sp,#44] + eor r11,r10,r4 + add r6,r6,r7,ror#27 + mov r3,r3,ror#2 + add r6,r6,r11 + eor r10,r7,r4 + add r5,r5,r9 + ldr r9,[sp,#48] + eor r11,r10,r3 + add r5,r5,r6,ror#27 + mov r7,r7,ror#2 + add r5,r5,r11 + vext.8 q12,q1,q2,#8 + eor r10,r6,r3 + add r4,r4,r9 + ldr r9,[sp,#52] + veor q3,q3,q11 + eor r11,r10,r7 + add r4,r4,r5,ror#27 + veor q3,q3,q8 + mov r6,r6,ror#2 + add r4,r4,r11 + vadd.i32 q13,q2,q14 + eor r10,r5,r7 + add r3,r3,r9 + veor q12,q12,q3 + ldr r9,[sp,#56] + eor r11,r10,r6 + vshr.u32 q3,q12,#30 + add r3,r3,r4,ror#27 + mov r5,r5,ror#2 + vst1.32 {q13},[r12,:128]! + add r3,r3,r11 + eor r10,r4,r6 + vsli.32 q3,q12,#2 + add r7,r7,r9 + ldr r9,[sp,#60] + eor r11,r10,r5 + add r7,r7,r3,ror#27 + mov r4,r4,ror#2 + add r7,r7,r11 + eor r10,r3,r5 + add r6,r6,r9 + ldr r9,[sp,#0] + eor r11,r10,r4 + add r6,r6,r7,ror#27 + mov r3,r3,ror#2 + add r6,r6,r11 + vext.8 q12,q2,q3,#8 + eor r10,r7,r4 + add r5,r5,r9 + ldr r9,[sp,#4] + veor q8,q8,q0 + eor r11,r10,r3 + add r5,r5,r6,ror#27 + veor q8,q8,q9 + mov r7,r7,ror#2 + add r5,r5,r11 + vadd.i32 q13,q3,q14 + eor r10,r6,r3 + add r4,r4,r9 + veor q12,q12,q8 + ldr r9,[sp,#8] + eor r11,r10,r7 + vshr.u32 q8,q12,#30 + add r4,r4,r5,ror#27 + mov r6,r6,ror#2 + vst1.32 {q13},[r12,:128]! 
+ sub r12,r12,#64 + add r4,r4,r11 + eor r10,r5,r7 + vsli.32 q8,q12,#2 + add r3,r3,r9 + ldr r9,[sp,#12] + eor r11,r10,r6 + add r3,r3,r4,ror#27 + mov r5,r5,ror#2 + add r3,r3,r11 + eor r10,r4,r6 + add r7,r7,r9 + ldr r9,[sp,#16] + eor r11,r10,r5 + add r7,r7,r3,ror#27 + mov r4,r4,ror#2 + add r7,r7,r11 + vext.8 q12,q3,q8,#8 + eor r10,r3,r5 + add r6,r6,r9 + ldr r9,[sp,#20] + veor q9,q9,q1 + eor r11,r10,r4 + add r6,r6,r7,ror#27 + veor q9,q9,q10 + mov r3,r3,ror#2 + add r6,r6,r11 + vadd.i32 q13,q8,q14 + eor r10,r7,r4 + add r5,r5,r9 + veor q12,q12,q9 + ldr r9,[sp,#24] + eor r11,r10,r3 + vshr.u32 q9,q12,#30 + add r5,r5,r6,ror#27 + mov r7,r7,ror#2 + vst1.32 {q13},[r12,:128]! + add r5,r5,r11 + eor r10,r6,r3 + vsli.32 q9,q12,#2 + add r4,r4,r9 + ldr r9,[sp,#28] + eor r11,r10,r7 + add r4,r4,r5,ror#27 + mov r6,r6,ror#2 + add r4,r4,r11 + eor r10,r5,r7 + add r3,r3,r9 + ldr r9,[sp,#32] + eor r11,r10,r6 + add r3,r3,r4,ror#27 + mov r5,r5,ror#2 + add r3,r3,r11 + vext.8 q12,q8,q9,#8 + add r7,r7,r9 + and r10,r5,r6 + ldr r9,[sp,#36] + veor q10,q10,q2 + add r7,r7,r3,ror#27 + eor r11,r5,r6 + veor q10,q10,q11 + add r7,r7,r10 + and r11,r11,r4 + vadd.i32 q13,q9,q14 + mov r4,r4,ror#2 + add r7,r7,r11 + veor q12,q12,q10 + add r6,r6,r9 + and r10,r4,r5 + vshr.u32 q10,q12,#30 + ldr r9,[sp,#40] + add r6,r6,r7,ror#27 + vst1.32 {q13},[r12,:128]! + eor r11,r4,r5 + add r6,r6,r10 + vsli.32 q10,q12,#2 + and r11,r11,r3 + mov r3,r3,ror#2 + add r6,r6,r11 + add r5,r5,r9 + and r10,r3,r4 + ldr r9,[sp,#44] + add r5,r5,r6,ror#27 + eor r11,r3,r4 + add r5,r5,r10 + and r11,r11,r7 + mov r7,r7,ror#2 + add r5,r5,r11 + add r4,r4,r9 + and r10,r7,r3 + ldr r9,[sp,#48] + add r4,r4,r5,ror#27 + eor r11,r7,r3 + add r4,r4,r10 + and r11,r11,r6 + mov r6,r6,ror#2 + add r4,r4,r11 + vext.8 q12,q9,q10,#8 + add r3,r3,r9 + and r10,r6,r7 + ldr r9,[sp,#52] + veor q11,q11,q3 + add r3,r3,r4,ror#27 + eor r11,r6,r7 + veor q11,q11,q0 + add r3,r3,r10 + and r11,r11,r5 + vadd.i32 q13,q10,q14 + mov r5,r5,ror#2 + vld1.32 {d28[],d29[]},[r8,:32]! 
+ add r3,r3,r11 + veor q12,q12,q11 + add r7,r7,r9 + and r10,r5,r6 + vshr.u32 q11,q12,#30 + ldr r9,[sp,#56] + add r7,r7,r3,ror#27 + vst1.32 {q13},[r12,:128]! + eor r11,r5,r6 + add r7,r7,r10 + vsli.32 q11,q12,#2 + and r11,r11,r4 + mov r4,r4,ror#2 + add r7,r7,r11 + add r6,r6,r9 + and r10,r4,r5 + ldr r9,[sp,#60] + add r6,r6,r7,ror#27 + eor r11,r4,r5 + add r6,r6,r10 + and r11,r11,r3 + mov r3,r3,ror#2 + add r6,r6,r11 + add r5,r5,r9 + and r10,r3,r4 + ldr r9,[sp,#0] + add r5,r5,r6,ror#27 + eor r11,r3,r4 + add r5,r5,r10 + and r11,r11,r7 + mov r7,r7,ror#2 + add r5,r5,r11 + vext.8 q12,q10,q11,#8 + add r4,r4,r9 + and r10,r7,r3 + ldr r9,[sp,#4] + veor q0,q0,q8 + add r4,r4,r5,ror#27 + eor r11,r7,r3 + veor q0,q0,q1 + add r4,r4,r10 + and r11,r11,r6 + vadd.i32 q13,q11,q14 + mov r6,r6,ror#2 + add r4,r4,r11 + veor q12,q12,q0 + add r3,r3,r9 + and r10,r6,r7 + vshr.u32 q0,q12,#30 + ldr r9,[sp,#8] + add r3,r3,r4,ror#27 + vst1.32 {q13},[r12,:128]! + sub r12,r12,#64 + eor r11,r6,r7 + add r3,r3,r10 + vsli.32 q0,q12,#2 + and r11,r11,r5 + mov r5,r5,ror#2 + add r3,r3,r11 + add r7,r7,r9 + and r10,r5,r6 + ldr r9,[sp,#12] + add r7,r7,r3,ror#27 + eor r11,r5,r6 + add r7,r7,r10 + and r11,r11,r4 + mov r4,r4,ror#2 + add r7,r7,r11 + add r6,r6,r9 + and r10,r4,r5 + ldr r9,[sp,#16] + add r6,r6,r7,ror#27 + eor r11,r4,r5 + add r6,r6,r10 + and r11,r11,r3 + mov r3,r3,ror#2 + add r6,r6,r11 + vext.8 q12,q11,q0,#8 + add r5,r5,r9 + and r10,r3,r4 + ldr r9,[sp,#20] + veor q1,q1,q9 + add r5,r5,r6,ror#27 + eor r11,r3,r4 + veor q1,q1,q2 + add r5,r5,r10 + and r11,r11,r7 + vadd.i32 q13,q0,q14 + mov r7,r7,ror#2 + add r5,r5,r11 + veor q12,q12,q1 + add r4,r4,r9 + and r10,r7,r3 + vshr.u32 q1,q12,#30 + ldr r9,[sp,#24] + add r4,r4,r5,ror#27 + vst1.32 {q13},[r12,:128]! 
+ eor r11,r7,r3 + add r4,r4,r10 + vsli.32 q1,q12,#2 + and r11,r11,r6 + mov r6,r6,ror#2 + add r4,r4,r11 + add r3,r3,r9 + and r10,r6,r7 + ldr r9,[sp,#28] + add r3,r3,r4,ror#27 + eor r11,r6,r7 + add r3,r3,r10 + and r11,r11,r5 + mov r5,r5,ror#2 + add r3,r3,r11 + add r7,r7,r9 + and r10,r5,r6 + ldr r9,[sp,#32] + add r7,r7,r3,ror#27 + eor r11,r5,r6 + add r7,r7,r10 + and r11,r11,r4 + mov r4,r4,ror#2 + add r7,r7,r11 + vext.8 q12,q0,q1,#8 + add r6,r6,r9 + and r10,r4,r5 + ldr r9,[sp,#36] + veor q2,q2,q10 + add r6,r6,r7,ror#27 + eor r11,r4,r5 + veor q2,q2,q3 + add r6,r6,r10 + and r11,r11,r3 + vadd.i32 q13,q1,q14 + mov r3,r3,ror#2 + add r6,r6,r11 + veor q12,q12,q2 + add r5,r5,r9 + and r10,r3,r4 + vshr.u32 q2,q12,#30 + ldr r9,[sp,#40] + add r5,r5,r6,ror#27 + vst1.32 {q13},[r12,:128]! + eor r11,r3,r4 + add r5,r5,r10 + vsli.32 q2,q12,#2 + and r11,r11,r7 + mov r7,r7,ror#2 + add r5,r5,r11 + add r4,r4,r9 + and r10,r7,r3 + ldr r9,[sp,#44] + add r4,r4,r5,ror#27 + eor r11,r7,r3 + add r4,r4,r10 + and r11,r11,r6 + mov r6,r6,ror#2 + add r4,r4,r11 + add r3,r3,r9 + and r10,r6,r7 + ldr r9,[sp,#48] + add r3,r3,r4,ror#27 + eor r11,r6,r7 + add r3,r3,r10 + and r11,r11,r5 + mov r5,r5,ror#2 + add r3,r3,r11 + vext.8 q12,q1,q2,#8 + eor r10,r4,r6 + add r7,r7,r9 + ldr r9,[sp,#52] + veor q3,q3,q11 + eor r11,r10,r5 + add r7,r7,r3,ror#27 + veor q3,q3,q8 + mov r4,r4,ror#2 + add r7,r7,r11 + vadd.i32 q13,q2,q14 + eor r10,r3,r5 + add r6,r6,r9 + veor q12,q12,q3 + ldr r9,[sp,#56] + eor r11,r10,r4 + vshr.u32 q3,q12,#30 + add r6,r6,r7,ror#27 + mov r3,r3,ror#2 + vst1.32 {q13},[r12,:128]! + add r6,r6,r11 + eor r10,r7,r4 + vsli.32 q3,q12,#2 + add r5,r5,r9 + ldr r9,[sp,#60] + eor r11,r10,r3 + add r5,r5,r6,ror#27 + mov r7,r7,ror#2 + add r5,r5,r11 + eor r10,r6,r3 + add r4,r4,r9 + ldr r9,[sp,#0] + eor r11,r10,r7 + add r4,r4,r5,ror#27 + mov r6,r6,ror#2 + add r4,r4,r11 + vadd.i32 q13,q3,q14 + eor r10,r5,r7 + add r3,r3,r9 + vst1.32 {q13},[r12,:128]! 
+ sub r12,r12,#64 + teq r1,r2 + sub r8,r8,#16 + it eq + subeq r1,r1,#64 + vld1.8 {q0,q1},[r1]! + ldr r9,[sp,#4] + eor r11,r10,r6 + vld1.8 {q2,q3},[r1]! + add r3,r3,r4,ror#27 + mov r5,r5,ror#2 + vld1.32 {d28[],d29[]},[r8,:32]! + add r3,r3,r11 + eor r10,r4,r6 + vrev32.8 q0,q0 + add r7,r7,r9 + ldr r9,[sp,#8] + eor r11,r10,r5 + add r7,r7,r3,ror#27 + mov r4,r4,ror#2 + add r7,r7,r11 + eor r10,r3,r5 + add r6,r6,r9 + ldr r9,[sp,#12] + eor r11,r10,r4 + add r6,r6,r7,ror#27 + mov r3,r3,ror#2 + add r6,r6,r11 + eor r10,r7,r4 + add r5,r5,r9 + ldr r9,[sp,#16] + eor r11,r10,r3 + add r5,r5,r6,ror#27 + mov r7,r7,ror#2 + add r5,r5,r11 + vrev32.8 q1,q1 + eor r10,r6,r3 + add r4,r4,r9 + vadd.i32 q8,q0,q14 + ldr r9,[sp,#20] + eor r11,r10,r7 + vst1.32 {q8},[r12,:128]! + add r4,r4,r5,ror#27 + mov r6,r6,ror#2 + add r4,r4,r11 + eor r10,r5,r7 + add r3,r3,r9 + ldr r9,[sp,#24] + eor r11,r10,r6 + add r3,r3,r4,ror#27 + mov r5,r5,ror#2 + add r3,r3,r11 + eor r10,r4,r6 + add r7,r7,r9 + ldr r9,[sp,#28] + eor r11,r10,r5 + add r7,r7,r3,ror#27 + mov r4,r4,ror#2 + add r7,r7,r11 + eor r10,r3,r5 + add r6,r6,r9 + ldr r9,[sp,#32] + eor r11,r10,r4 + add r6,r6,r7,ror#27 + mov r3,r3,ror#2 + add r6,r6,r11 + vrev32.8 q2,q2 + eor r10,r7,r4 + add r5,r5,r9 + vadd.i32 q9,q1,q14 + ldr r9,[sp,#36] + eor r11,r10,r3 + vst1.32 {q9},[r12,:128]! + add r5,r5,r6,ror#27 + mov r7,r7,ror#2 + add r5,r5,r11 + eor r10,r6,r3 + add r4,r4,r9 + ldr r9,[sp,#40] + eor r11,r10,r7 + add r4,r4,r5,ror#27 + mov r6,r6,ror#2 + add r4,r4,r11 + eor r10,r5,r7 + add r3,r3,r9 + ldr r9,[sp,#44] + eor r11,r10,r6 + add r3,r3,r4,ror#27 + mov r5,r5,ror#2 + add r3,r3,r11 + eor r10,r4,r6 + add r7,r7,r9 + ldr r9,[sp,#48] + eor r11,r10,r5 + add r7,r7,r3,ror#27 + mov r4,r4,ror#2 + add r7,r7,r11 + vrev32.8 q3,q3 + eor r10,r3,r5 + add r6,r6,r9 + vadd.i32 q10,q2,q14 + ldr r9,[sp,#52] + eor r11,r10,r4 + vst1.32 {q10},[r12,:128]! 
+ add r6,r6,r7,ror#27 + mov r3,r3,ror#2 + add r6,r6,r11 + eor r10,r7,r4 + add r5,r5,r9 + ldr r9,[sp,#56] + eor r11,r10,r3 + add r5,r5,r6,ror#27 + mov r7,r7,ror#2 + add r5,r5,r11 + eor r10,r6,r3 + add r4,r4,r9 + ldr r9,[sp,#60] + eor r11,r10,r7 + add r4,r4,r5,ror#27 + mov r6,r6,ror#2 + add r4,r4,r11 + eor r10,r5,r7 + add r3,r3,r9 + eor r11,r10,r6 + add r3,r3,r4,ror#27 + mov r5,r5,ror#2 + add r3,r3,r11 + ldmia r0,{r9,r10,r11,r12} @ accumulate context + add r3,r3,r9 + ldr r9,[r0,#16] + add r4,r4,r10 + add r5,r5,r11 + add r6,r6,r12 + it eq + moveq sp,r14 + add r7,r7,r9 + it ne + ldrne r9,[sp] + stmia r0,{r3,r4,r5,r6,r7} + itt ne + addne r12,sp,#3*16 + bne .Loop_neon + + @ vldmia sp!,{d8-d15} + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} +.size sha1_block_data_order_neon,.-sha1_block_data_order_neon +#endif +#if __ARM_MAX_ARCH__>=7 + +# if defined(__thumb2__) +# define INST(a,b,c,d) .byte c,d|0xf,a,b +# else +# define INST(a,b,c,d) .byte a,b,c,d|0x10 +# endif + +.globl sha1_block_data_order_hw +.hidden sha1_block_data_order_hw +.type sha1_block_data_order_hw,%function +.align 5 +sha1_block_data_order_hw: + vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so + + veor q1,q1,q1 + adr r3,.LK_00_19 + vld1.32 {q0},[r0]! + vld1.32 {d2[0]},[r0] + sub r0,r0,#16 + vld1.32 {d16[],d17[]},[r3,:32]! + vld1.32 {d18[],d19[]},[r3,:32]! + vld1.32 {d20[],d21[]},[r3,:32]! + vld1.32 {d22[],d23[]},[r3,:32] + +.Loop_v8: + vld1.8 {q4,q5},[r1]! + vld1.8 {q6,q7},[r1]! 
+ vrev32.8 q4,q4 + vrev32.8 q5,q5 + + vadd.i32 q12,q8,q4 + vrev32.8 q6,q6 + vmov q14,q0 @ offload + subs r2,r2,#1 + + vadd.i32 q13,q8,q5 + vrev32.8 q7,q7 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 0 + INST(0x68,0x0c,0x02,0xe2) @ sha1c q0,q1,q12 + vadd.i32 q12,q8,q6 + INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6 + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 1 + INST(0x6a,0x0c,0x06,0xe2) @ sha1c q0,q3,q13 + vadd.i32 q13,q8,q7 + INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7 + INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 2 + INST(0x68,0x0c,0x04,0xe2) @ sha1c q0,q2,q12 + vadd.i32 q12,q8,q4 + INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4 + INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4 + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 3 + INST(0x6a,0x0c,0x06,0xe2) @ sha1c q0,q3,q13 + vadd.i32 q13,q9,q5 + INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5 + INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 4 + INST(0x68,0x0c,0x04,0xe2) @ sha1c q0,q2,q12 + vadd.i32 q12,q9,q6 + INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6 + INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6 + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 5 + INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 + vadd.i32 q13,q9,q7 + INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7 + INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 6 + INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12 + vadd.i32 q12,q9,q4 + INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4 + INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4 + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 7 + INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 + vadd.i32 q13,q9,q5 + INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5 + INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 8 + INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12 + vadd.i32 q12,q10,q6 + INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6 + INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6 + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 9 + 
INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 + vadd.i32 q13,q10,q7 + INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7 + INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 10 + INST(0x68,0x0c,0x24,0xe2) @ sha1m q0,q2,q12 + vadd.i32 q12,q10,q4 + INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4 + INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4 + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 11 + INST(0x6a,0x0c,0x26,0xe2) @ sha1m q0,q3,q13 + vadd.i32 q13,q10,q5 + INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5 + INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 12 + INST(0x68,0x0c,0x24,0xe2) @ sha1m q0,q2,q12 + vadd.i32 q12,q10,q6 + INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6 + INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6 + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 13 + INST(0x6a,0x0c,0x26,0xe2) @ sha1m q0,q3,q13 + vadd.i32 q13,q11,q7 + INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7 + INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 14 + INST(0x68,0x0c,0x24,0xe2) @ sha1m q0,q2,q12 + vadd.i32 q12,q11,q4 + INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4 + INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4 + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 15 + INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 + vadd.i32 q13,q11,q5 + INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5 + INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5 + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 16 + INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12 + vadd.i32 q12,q11,q6 + INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6 + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 17 + INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 + vadd.i32 q13,q11,q7 + + INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 18 + INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12 + + INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 19 + INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 + + vadd.i32 q1,q1,q2 + vadd.i32 q0,q0,q14 + bne .Loop_v8 + + vst1.32 {q0},[r0]! 
+ vst1.32 {d2[0]},[r0] + + vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} + bx lr @ bx lr +.size sha1_block_data_order_hw,.-sha1_block_data_order_hw +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) diff --git a/third_party/boringssl/gen/bcm/sha1-armv8-apple.S b/third_party/boringssl/gen/bcm/sha1-armv8-apple.S new file mode 100644 index 00000000..5744697f --- /dev/null +++ b/third_party/boringssl/gen/bcm/sha1-armv8-apple.S @@ -0,0 +1,1216 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) +.text + +.globl _sha1_block_data_order_nohw +.private_extern _sha1_block_data_order_nohw + +.align 6 +_sha1_block_data_order_nohw: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp w20,w21,[x0] + ldp w22,w23,[x0,#8] + ldr w24,[x0,#16] + +Loop: + ldr x3,[x1],#64 + movz w28,#0x7999 + sub x2,x2,#1 + movk w28,#0x5a82,lsl#16 +#ifdef __AARCH64EB__ + ror x3,x3,#32 +#else + rev32 x3,x3 +#endif + add w24,w24,w28 // warm it up + add w24,w24,w3 + lsr x4,x3,#32 + ldr x5,[x1,#-56] + bic w25,w23,w21 + and w26,w22,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + orr w25,w25,w26 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + add w23,w23,w4 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) +#ifdef __AARCH64EB__ + ror x5,x5,#32 +#else + rev32 x5,x5 +#endif + bic w25,w22,w20 + and w26,w21,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + orr w25,w25,w26 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + add w22,w22,w5 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + lsr x6,x5,#32 + ldr x7,[x1,#-48] + bic w25,w21,w24 + and w26,w20,w24 + ror w27,w23,#27 + add 
w21,w21,w28 // future e+=K + orr w25,w25,w26 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + add w21,w21,w6 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) +#ifdef __AARCH64EB__ + ror x7,x7,#32 +#else + rev32 x7,x7 +#endif + bic w25,w20,w23 + and w26,w24,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + orr w25,w25,w26 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + add w20,w20,w7 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + lsr x8,x7,#32 + ldr x9,[x1,#-40] + bic w25,w24,w22 + and w26,w23,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + orr w25,w25,w26 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + add w24,w24,w8 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) +#ifdef __AARCH64EB__ + ror x9,x9,#32 +#else + rev32 x9,x9 +#endif + bic w25,w23,w21 + and w26,w22,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + orr w25,w25,w26 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + add w23,w23,w9 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + lsr x10,x9,#32 + ldr x11,[x1,#-32] + bic w25,w22,w20 + and w26,w21,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + orr w25,w25,w26 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + add w22,w22,w10 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) +#ifdef __AARCH64EB__ + ror x11,x11,#32 +#else + rev32 x11,x11 +#endif + bic w25,w21,w24 + and w26,w20,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + orr w25,w25,w26 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + add w21,w21,w11 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + lsr x12,x11,#32 + ldr x13,[x1,#-24] + bic w25,w20,w23 + and w26,w24,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + orr w25,w25,w26 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + add w20,w20,w12 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) +#ifdef __AARCH64EB__ + ror x13,x13,#32 +#else + rev32 x13,x13 +#endif + bic w25,w24,w22 + and w26,w23,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + orr w25,w25,w26 + 
add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + add w24,w24,w13 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + lsr x14,x13,#32 + ldr x15,[x1,#-16] + bic w25,w23,w21 + and w26,w22,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + orr w25,w25,w26 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + add w23,w23,w14 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) +#ifdef __AARCH64EB__ + ror x15,x15,#32 +#else + rev32 x15,x15 +#endif + bic w25,w22,w20 + and w26,w21,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + orr w25,w25,w26 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + add w22,w22,w15 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + lsr x16,x15,#32 + ldr x17,[x1,#-8] + bic w25,w21,w24 + and w26,w20,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + orr w25,w25,w26 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + add w21,w21,w16 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) +#ifdef __AARCH64EB__ + ror x17,x17,#32 +#else + rev32 x17,x17 +#endif + bic w25,w20,w23 + and w26,w24,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + orr w25,w25,w26 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + add w20,w20,w17 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + lsr x19,x17,#32 + eor w3,w3,w5 + bic w25,w24,w22 + and w26,w23,w22 + ror w27,w21,#27 + eor w3,w3,w11 + add w24,w24,w28 // future e+=K + orr w25,w25,w26 + add w20,w20,w27 // e+=rot(a,5) + eor w3,w3,w16 + ror w22,w22,#2 + add w24,w24,w19 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w3,w3,#31 + eor w4,w4,w6 + bic w25,w23,w21 + and w26,w22,w21 + ror w27,w20,#27 + eor w4,w4,w12 + add w23,w23,w28 // future e+=K + orr w25,w25,w26 + add w24,w24,w27 // e+=rot(a,5) + eor w4,w4,w17 + ror w21,w21,#2 + add w23,w23,w3 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w4,w4,#31 + eor w5,w5,w7 + bic w25,w22,w20 + and w26,w21,w20 + ror w27,w24,#27 + eor w5,w5,w13 + add w22,w22,w28 // future e+=K + orr w25,w25,w26 + add w23,w23,w27 // e+=rot(a,5) + eor 
w5,w5,w19 + ror w20,w20,#2 + add w22,w22,w4 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w5,w5,#31 + eor w6,w6,w8 + bic w25,w21,w24 + and w26,w20,w24 + ror w27,w23,#27 + eor w6,w6,w14 + add w21,w21,w28 // future e+=K + orr w25,w25,w26 + add w22,w22,w27 // e+=rot(a,5) + eor w6,w6,w3 + ror w24,w24,#2 + add w21,w21,w5 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w6,w6,#31 + eor w7,w7,w9 + bic w25,w20,w23 + and w26,w24,w23 + ror w27,w22,#27 + eor w7,w7,w15 + add w20,w20,w28 // future e+=K + orr w25,w25,w26 + add w21,w21,w27 // e+=rot(a,5) + eor w7,w7,w4 + ror w23,w23,#2 + add w20,w20,w6 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w7,w7,#31 + movz w28,#0xeba1 + movk w28,#0x6ed9,lsl#16 + eor w8,w8,w10 + bic w25,w24,w22 + and w26,w23,w22 + ror w27,w21,#27 + eor w8,w8,w16 + add w24,w24,w28 // future e+=K + orr w25,w25,w26 + add w20,w20,w27 // e+=rot(a,5) + eor w8,w8,w5 + ror w22,w22,#2 + add w24,w24,w7 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w8,w8,#31 + eor w9,w9,w11 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w9,w9,w17 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w9,w9,w6 + add w23,w23,w8 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w9,w9,#31 + eor w10,w10,w12 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w10,w10,w19 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w10,w10,w7 + add w22,w22,w9 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w10,w10,#31 + eor w11,w11,w13 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w11,w11,w3 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w11,w11,w8 + add w21,w21,w10 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w11,w11,#31 + eor w12,w12,w14 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w12,w12,w4 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror 
w23,w23,#2 + eor w12,w12,w9 + add w20,w20,w11 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w12,w12,#31 + eor w13,w13,w15 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w13,w13,w5 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w13,w13,w10 + add w24,w24,w12 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w13,w13,#31 + eor w14,w14,w16 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w14,w14,w6 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w14,w14,w11 + add w23,w23,w13 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w14,w14,#31 + eor w15,w15,w17 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w15,w15,w7 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w15,w15,w12 + add w22,w22,w14 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w15,w15,#31 + eor w16,w16,w19 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w16,w16,w8 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w16,w16,w13 + add w21,w21,w15 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w16,w16,#31 + eor w17,w17,w3 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w17,w17,w9 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w17,w17,w14 + add w20,w20,w16 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w17,w17,#31 + eor w19,w19,w4 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w19,w19,w10 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w19,w19,w15 + add w24,w24,w17 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w19,w19,#31 + eor w3,w3,w5 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w3,w3,w11 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w3,w3,w16 + add w23,w23,w19 // future e+=X[i] + 
add w24,w24,w25 // e+=F(b,c,d) + ror w3,w3,#31 + eor w4,w4,w6 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w4,w4,w12 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w4,w4,w17 + add w22,w22,w3 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w4,w4,#31 + eor w5,w5,w7 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w5,w5,w13 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w5,w5,w19 + add w21,w21,w4 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w5,w5,#31 + eor w6,w6,w8 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w6,w6,w14 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w6,w6,w3 + add w20,w20,w5 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w6,w6,#31 + eor w7,w7,w9 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w7,w7,w15 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w7,w7,w4 + add w24,w24,w6 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w7,w7,#31 + eor w8,w8,w10 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w8,w8,w16 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w8,w8,w5 + add w23,w23,w7 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w8,w8,#31 + eor w9,w9,w11 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w9,w9,w17 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w9,w9,w6 + add w22,w22,w8 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w9,w9,#31 + eor w10,w10,w12 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w10,w10,w19 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w10,w10,w7 + add w21,w21,w9 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w10,w10,#31 + eor w11,w11,w13 + eor w25,w20,w23 + ror w27,w22,#27 + add 
w20,w20,w28 // future e+=K + eor w11,w11,w3 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w11,w11,w8 + add w20,w20,w10 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w11,w11,#31 + movz w28,#0xbcdc + movk w28,#0x8f1b,lsl#16 + eor w12,w12,w14 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w12,w12,w4 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w12,w12,w9 + add w24,w24,w11 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w12,w12,#31 + orr w25,w21,w22 + and w26,w21,w22 + eor w13,w13,w15 + ror w27,w20,#27 + and w25,w25,w23 + add w23,w23,w28 // future e+=K + eor w13,w13,w5 + add w24,w24,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w21,w21,#2 + eor w13,w13,w10 + add w23,w23,w12 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w13,w13,#31 + orr w25,w20,w21 + and w26,w20,w21 + eor w14,w14,w16 + ror w27,w24,#27 + and w25,w25,w22 + add w22,w22,w28 // future e+=K + eor w14,w14,w6 + add w23,w23,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w20,w20,#2 + eor w14,w14,w11 + add w22,w22,w13 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w14,w14,#31 + orr w25,w24,w20 + and w26,w24,w20 + eor w15,w15,w17 + ror w27,w23,#27 + and w25,w25,w21 + add w21,w21,w28 // future e+=K + eor w15,w15,w7 + add w22,w22,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w24,w24,#2 + eor w15,w15,w12 + add w21,w21,w14 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w15,w15,#31 + orr w25,w23,w24 + and w26,w23,w24 + eor w16,w16,w19 + ror w27,w22,#27 + and w25,w25,w20 + add w20,w20,w28 // future e+=K + eor w16,w16,w8 + add w21,w21,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w23,w23,#2 + eor w16,w16,w13 + add w20,w20,w15 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w16,w16,#31 + orr w25,w22,w23 + and w26,w22,w23 + eor w17,w17,w3 + ror w27,w21,#27 + and w25,w25,w24 + add w24,w24,w28 // future e+=K + eor w17,w17,w9 + add w20,w20,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w22,w22,#2 + 
eor w17,w17,w14 + add w24,w24,w16 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w17,w17,#31 + orr w25,w21,w22 + and w26,w21,w22 + eor w19,w19,w4 + ror w27,w20,#27 + and w25,w25,w23 + add w23,w23,w28 // future e+=K + eor w19,w19,w10 + add w24,w24,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w21,w21,#2 + eor w19,w19,w15 + add w23,w23,w17 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w19,w19,#31 + orr w25,w20,w21 + and w26,w20,w21 + eor w3,w3,w5 + ror w27,w24,#27 + and w25,w25,w22 + add w22,w22,w28 // future e+=K + eor w3,w3,w11 + add w23,w23,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w20,w20,#2 + eor w3,w3,w16 + add w22,w22,w19 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w3,w3,#31 + orr w25,w24,w20 + and w26,w24,w20 + eor w4,w4,w6 + ror w27,w23,#27 + and w25,w25,w21 + add w21,w21,w28 // future e+=K + eor w4,w4,w12 + add w22,w22,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w24,w24,#2 + eor w4,w4,w17 + add w21,w21,w3 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w4,w4,#31 + orr w25,w23,w24 + and w26,w23,w24 + eor w5,w5,w7 + ror w27,w22,#27 + and w25,w25,w20 + add w20,w20,w28 // future e+=K + eor w5,w5,w13 + add w21,w21,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w23,w23,#2 + eor w5,w5,w19 + add w20,w20,w4 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w5,w5,#31 + orr w25,w22,w23 + and w26,w22,w23 + eor w6,w6,w8 + ror w27,w21,#27 + and w25,w25,w24 + add w24,w24,w28 // future e+=K + eor w6,w6,w14 + add w20,w20,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w22,w22,#2 + eor w6,w6,w3 + add w24,w24,w5 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w6,w6,#31 + orr w25,w21,w22 + and w26,w21,w22 + eor w7,w7,w9 + ror w27,w20,#27 + and w25,w25,w23 + add w23,w23,w28 // future e+=K + eor w7,w7,w15 + add w24,w24,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w21,w21,#2 + eor w7,w7,w4 + add w23,w23,w6 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w7,w7,#31 + orr w25,w20,w21 + and w26,w20,w21 + eor w8,w8,w10 + ror 
w27,w24,#27 + and w25,w25,w22 + add w22,w22,w28 // future e+=K + eor w8,w8,w16 + add w23,w23,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w20,w20,#2 + eor w8,w8,w5 + add w22,w22,w7 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w8,w8,#31 + orr w25,w24,w20 + and w26,w24,w20 + eor w9,w9,w11 + ror w27,w23,#27 + and w25,w25,w21 + add w21,w21,w28 // future e+=K + eor w9,w9,w17 + add w22,w22,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w24,w24,#2 + eor w9,w9,w6 + add w21,w21,w8 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w9,w9,#31 + orr w25,w23,w24 + and w26,w23,w24 + eor w10,w10,w12 + ror w27,w22,#27 + and w25,w25,w20 + add w20,w20,w28 // future e+=K + eor w10,w10,w19 + add w21,w21,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w23,w23,#2 + eor w10,w10,w7 + add w20,w20,w9 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w10,w10,#31 + orr w25,w22,w23 + and w26,w22,w23 + eor w11,w11,w13 + ror w27,w21,#27 + and w25,w25,w24 + add w24,w24,w28 // future e+=K + eor w11,w11,w3 + add w20,w20,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w22,w22,#2 + eor w11,w11,w8 + add w24,w24,w10 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w11,w11,#31 + orr w25,w21,w22 + and w26,w21,w22 + eor w12,w12,w14 + ror w27,w20,#27 + and w25,w25,w23 + add w23,w23,w28 // future e+=K + eor w12,w12,w4 + add w24,w24,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w21,w21,#2 + eor w12,w12,w9 + add w23,w23,w11 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w12,w12,#31 + orr w25,w20,w21 + and w26,w20,w21 + eor w13,w13,w15 + ror w27,w24,#27 + and w25,w25,w22 + add w22,w22,w28 // future e+=K + eor w13,w13,w5 + add w23,w23,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w20,w20,#2 + eor w13,w13,w10 + add w22,w22,w12 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w13,w13,#31 + orr w25,w24,w20 + and w26,w24,w20 + eor w14,w14,w16 + ror w27,w23,#27 + and w25,w25,w21 + add w21,w21,w28 // future e+=K + eor w14,w14,w6 + add w22,w22,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror 
w24,w24,#2 + eor w14,w14,w11 + add w21,w21,w13 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w14,w14,#31 + orr w25,w23,w24 + and w26,w23,w24 + eor w15,w15,w17 + ror w27,w22,#27 + and w25,w25,w20 + add w20,w20,w28 // future e+=K + eor w15,w15,w7 + add w21,w21,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w23,w23,#2 + eor w15,w15,w12 + add w20,w20,w14 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w15,w15,#31 + movz w28,#0xc1d6 + movk w28,#0xca62,lsl#16 + orr w25,w22,w23 + and w26,w22,w23 + eor w16,w16,w19 + ror w27,w21,#27 + and w25,w25,w24 + add w24,w24,w28 // future e+=K + eor w16,w16,w8 + add w20,w20,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w22,w22,#2 + eor w16,w16,w13 + add w24,w24,w15 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w16,w16,#31 + eor w17,w17,w3 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w17,w17,w9 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w17,w17,w14 + add w23,w23,w16 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w17,w17,#31 + eor w19,w19,w4 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w19,w19,w10 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w19,w19,w15 + add w22,w22,w17 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w19,w19,#31 + eor w3,w3,w5 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w3,w3,w11 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w3,w3,w16 + add w21,w21,w19 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w3,w3,#31 + eor w4,w4,w6 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w4,w4,w12 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w4,w4,w17 + add w20,w20,w3 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w4,w4,#31 + eor w5,w5,w7 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w5,w5,w13 + eor w25,w25,w23 + 
add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w5,w5,w19 + add w24,w24,w4 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w5,w5,#31 + eor w6,w6,w8 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w6,w6,w14 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w6,w6,w3 + add w23,w23,w5 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w6,w6,#31 + eor w7,w7,w9 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w7,w7,w15 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w7,w7,w4 + add w22,w22,w6 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w7,w7,#31 + eor w8,w8,w10 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w8,w8,w16 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w8,w8,w5 + add w21,w21,w7 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w8,w8,#31 + eor w9,w9,w11 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w9,w9,w17 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w9,w9,w6 + add w20,w20,w8 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w9,w9,#31 + eor w10,w10,w12 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w10,w10,w19 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w10,w10,w7 + add w24,w24,w9 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w10,w10,#31 + eor w11,w11,w13 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w11,w11,w3 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w11,w11,w8 + add w23,w23,w10 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w11,w11,#31 + eor w12,w12,w14 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w12,w12,w4 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w12,w12,w9 + add w22,w22,w11 // future e+=X[i] + 
add w23,w23,w25 // e+=F(b,c,d) + ror w12,w12,#31 + eor w13,w13,w15 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w13,w13,w5 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w13,w13,w10 + add w21,w21,w12 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w13,w13,#31 + eor w14,w14,w16 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w14,w14,w6 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w14,w14,w11 + add w20,w20,w13 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w14,w14,#31 + eor w15,w15,w17 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w15,w15,w7 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w15,w15,w12 + add w24,w24,w14 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w15,w15,#31 + eor w16,w16,w19 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w16,w16,w8 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w16,w16,w13 + add w23,w23,w15 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w16,w16,#31 + eor w17,w17,w3 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w17,w17,w9 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w17,w17,w14 + add w22,w22,w16 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w17,w17,#31 + eor w19,w19,w4 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w19,w19,w10 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w19,w19,w15 + add w21,w21,w17 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w19,w19,#31 + ldp w4,w5,[x0] + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + add w20,w20,w19 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ldp w6,w7,[x0,#8] + eor w25,w24,w22 + ror w27,w21,#27 + eor 
w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + ldr w8,[x0,#16] + add w20,w20,w25 // e+=F(b,c,d) + add w21,w21,w5 + add w22,w22,w6 + add w20,w20,w4 + add w23,w23,w7 + add w24,w24,w8 + stp w20,w21,[x0] + stp w22,w23,[x0,#8] + str w24,[x0,#16] + cbnz x2,Loop + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldp x25,x26,[sp,#64] + ldp x27,x28,[sp,#80] + ldr x29,[sp],#96 + ret + +.globl _sha1_block_data_order_hw +.private_extern _sha1_block_data_order_hw + +.align 6 +_sha1_block_data_order_hw: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + adrp x4,Lconst@PAGE + add x4,x4,Lconst@PAGEOFF + eor v1.16b,v1.16b,v1.16b + ld1 {v0.4s},[x0],#16 + ld1 {v1.s}[0],[x0] + sub x0,x0,#16 + ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x4] + +Loop_hw: + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + sub x2,x2,#1 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + + add v20.4s,v16.4s,v4.4s + rev32 v6.16b,v6.16b + orr v22.16b,v0.16b,v0.16b // offload + + add v21.4s,v16.4s,v5.4s + rev32 v7.16b,v7.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b +.long 0x5e140020 //sha1c v0.16b,v1.16b,v20.4s // 0 + add v20.4s,v16.4s,v6.4s +.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 1 +.long 0x5e150060 //sha1c v0.16b,v3.16b,v21.4s + add v21.4s,v16.4s,v7.4s +.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b +.long 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 2 +.long 0x5e140040 //sha1c v0.16b,v2.16b,v20.4s + add v20.4s,v16.4s,v4.4s +.long 0x5e281885 //sha1su1 v5.16b,v4.16b +.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 3 +.long 0x5e150060 //sha1c v0.16b,v3.16b,v21.4s + add v21.4s,v17.4s,v5.4s +.long 0x5e2818a6 //sha1su1 v6.16b,v5.16b +.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 4 +.long 0x5e140040 //sha1c 
v0.16b,v2.16b,v20.4s + add v20.4s,v17.4s,v6.4s +.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b +.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 5 +.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + add v21.4s,v17.4s,v7.4s +.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b +.long 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 6 +.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s + add v20.4s,v17.4s,v4.4s +.long 0x5e281885 //sha1su1 v5.16b,v4.16b +.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 7 +.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + add v21.4s,v17.4s,v5.4s +.long 0x5e2818a6 //sha1su1 v6.16b,v5.16b +.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 8 +.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s + add v20.4s,v18.4s,v6.4s +.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b +.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 9 +.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + add v21.4s,v18.4s,v7.4s +.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b +.long 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 10 +.long 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s + add v20.4s,v18.4s,v4.4s +.long 0x5e281885 //sha1su1 v5.16b,v4.16b +.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 11 +.long 0x5e152060 //sha1m v0.16b,v3.16b,v21.4s + add v21.4s,v18.4s,v5.4s +.long 0x5e2818a6 //sha1su1 v6.16b,v5.16b +.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 12 +.long 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s + add v20.4s,v18.4s,v6.4s +.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b +.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 13 +.long 0x5e152060 //sha1m v0.16b,v3.16b,v21.4s + add v21.4s,v19.4s,v7.4s +.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b +.long 0x5e0730c5 
//sha1su0 v5.16b,v6.16b,v7.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 14 +.long 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s + add v20.4s,v19.4s,v4.4s +.long 0x5e281885 //sha1su1 v5.16b,v4.16b +.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 15 +.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + add v21.4s,v19.4s,v5.4s +.long 0x5e2818a6 //sha1su1 v6.16b,v5.16b +.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 16 +.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s + add v20.4s,v19.4s,v6.4s +.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 17 +.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + add v21.4s,v19.4s,v7.4s + +.long 0x5e280803 //sha1h v3.16b,v0.16b // 18 +.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s + +.long 0x5e280802 //sha1h v2.16b,v0.16b // 19 +.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + + add v1.4s,v1.4s,v2.4s + add v0.4s,v0.4s,v22.4s + + cbnz x2,Loop_hw + + st1 {v0.4s},[x0],#16 + st1 {v1.s}[0],[x0] + + ldr x29,[sp],#16 + ret + +.section __TEXT,__const +.align 6 +Lconst: +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 //K_00_19 +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 //K_20_39 +.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc //K_40_59 +.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79 +.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) diff --git a/third_party/boringssl/gen/bcm/sha1-armv8-linux.S b/third_party/boringssl/gen/bcm/sha1-armv8-linux.S new file mode 100644 index 00000000..d6fb410f --- /dev/null +++ b/third_party/boringssl/gen/bcm/sha1-armv8-linux.S @@ -0,0 +1,1216 @@ +// This file is generated from a similarly-named Perl script in 
the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) +.text + +.globl sha1_block_data_order_nohw +.hidden sha1_block_data_order_nohw +.type sha1_block_data_order_nohw,%function +.align 6 +sha1_block_data_order_nohw: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp w20,w21,[x0] + ldp w22,w23,[x0,#8] + ldr w24,[x0,#16] + +.Loop: + ldr x3,[x1],#64 + movz w28,#0x7999 + sub x2,x2,#1 + movk w28,#0x5a82,lsl#16 +#ifdef __AARCH64EB__ + ror x3,x3,#32 +#else + rev32 x3,x3 +#endif + add w24,w24,w28 // warm it up + add w24,w24,w3 + lsr x4,x3,#32 + ldr x5,[x1,#-56] + bic w25,w23,w21 + and w26,w22,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + orr w25,w25,w26 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + add w23,w23,w4 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) +#ifdef __AARCH64EB__ + ror x5,x5,#32 +#else + rev32 x5,x5 +#endif + bic w25,w22,w20 + and w26,w21,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + orr w25,w25,w26 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + add w22,w22,w5 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + lsr x6,x5,#32 + ldr x7,[x1,#-48] + bic w25,w21,w24 + and w26,w20,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + orr w25,w25,w26 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + add w21,w21,w6 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) +#ifdef __AARCH64EB__ + ror x7,x7,#32 +#else + rev32 x7,x7 +#endif + bic w25,w20,w23 + and w26,w24,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + orr w25,w25,w26 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + add w20,w20,w7 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + lsr x8,x7,#32 + ldr x9,[x1,#-40] + bic w25,w24,w22 + and 
w26,w23,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + orr w25,w25,w26 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + add w24,w24,w8 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) +#ifdef __AARCH64EB__ + ror x9,x9,#32 +#else + rev32 x9,x9 +#endif + bic w25,w23,w21 + and w26,w22,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + orr w25,w25,w26 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + add w23,w23,w9 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + lsr x10,x9,#32 + ldr x11,[x1,#-32] + bic w25,w22,w20 + and w26,w21,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + orr w25,w25,w26 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + add w22,w22,w10 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) +#ifdef __AARCH64EB__ + ror x11,x11,#32 +#else + rev32 x11,x11 +#endif + bic w25,w21,w24 + and w26,w20,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + orr w25,w25,w26 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + add w21,w21,w11 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + lsr x12,x11,#32 + ldr x13,[x1,#-24] + bic w25,w20,w23 + and w26,w24,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + orr w25,w25,w26 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + add w20,w20,w12 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) +#ifdef __AARCH64EB__ + ror x13,x13,#32 +#else + rev32 x13,x13 +#endif + bic w25,w24,w22 + and w26,w23,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + orr w25,w25,w26 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + add w24,w24,w13 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + lsr x14,x13,#32 + ldr x15,[x1,#-16] + bic w25,w23,w21 + and w26,w22,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + orr w25,w25,w26 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + add w23,w23,w14 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) +#ifdef __AARCH64EB__ + ror x15,x15,#32 +#else + rev32 x15,x15 +#endif + bic w25,w22,w20 + and w26,w21,w20 + ror w27,w24,#27 + add 
w22,w22,w28 // future e+=K + orr w25,w25,w26 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + add w22,w22,w15 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + lsr x16,x15,#32 + ldr x17,[x1,#-8] + bic w25,w21,w24 + and w26,w20,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + orr w25,w25,w26 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + add w21,w21,w16 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) +#ifdef __AARCH64EB__ + ror x17,x17,#32 +#else + rev32 x17,x17 +#endif + bic w25,w20,w23 + and w26,w24,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + orr w25,w25,w26 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + add w20,w20,w17 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + lsr x19,x17,#32 + eor w3,w3,w5 + bic w25,w24,w22 + and w26,w23,w22 + ror w27,w21,#27 + eor w3,w3,w11 + add w24,w24,w28 // future e+=K + orr w25,w25,w26 + add w20,w20,w27 // e+=rot(a,5) + eor w3,w3,w16 + ror w22,w22,#2 + add w24,w24,w19 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w3,w3,#31 + eor w4,w4,w6 + bic w25,w23,w21 + and w26,w22,w21 + ror w27,w20,#27 + eor w4,w4,w12 + add w23,w23,w28 // future e+=K + orr w25,w25,w26 + add w24,w24,w27 // e+=rot(a,5) + eor w4,w4,w17 + ror w21,w21,#2 + add w23,w23,w3 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w4,w4,#31 + eor w5,w5,w7 + bic w25,w22,w20 + and w26,w21,w20 + ror w27,w24,#27 + eor w5,w5,w13 + add w22,w22,w28 // future e+=K + orr w25,w25,w26 + add w23,w23,w27 // e+=rot(a,5) + eor w5,w5,w19 + ror w20,w20,#2 + add w22,w22,w4 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w5,w5,#31 + eor w6,w6,w8 + bic w25,w21,w24 + and w26,w20,w24 + ror w27,w23,#27 + eor w6,w6,w14 + add w21,w21,w28 // future e+=K + orr w25,w25,w26 + add w22,w22,w27 // e+=rot(a,5) + eor w6,w6,w3 + ror w24,w24,#2 + add w21,w21,w5 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w6,w6,#31 + eor w7,w7,w9 + bic w25,w20,w23 + and w26,w24,w23 + ror w27,w22,#27 + eor w7,w7,w15 + add w20,w20,w28 // future e+=K 
+ orr w25,w25,w26 + add w21,w21,w27 // e+=rot(a,5) + eor w7,w7,w4 + ror w23,w23,#2 + add w20,w20,w6 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w7,w7,#31 + movz w28,#0xeba1 + movk w28,#0x6ed9,lsl#16 + eor w8,w8,w10 + bic w25,w24,w22 + and w26,w23,w22 + ror w27,w21,#27 + eor w8,w8,w16 + add w24,w24,w28 // future e+=K + orr w25,w25,w26 + add w20,w20,w27 // e+=rot(a,5) + eor w8,w8,w5 + ror w22,w22,#2 + add w24,w24,w7 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w8,w8,#31 + eor w9,w9,w11 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w9,w9,w17 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w9,w9,w6 + add w23,w23,w8 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w9,w9,#31 + eor w10,w10,w12 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w10,w10,w19 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w10,w10,w7 + add w22,w22,w9 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w10,w10,#31 + eor w11,w11,w13 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w11,w11,w3 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w11,w11,w8 + add w21,w21,w10 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w11,w11,#31 + eor w12,w12,w14 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w12,w12,w4 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w12,w12,w9 + add w20,w20,w11 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w12,w12,#31 + eor w13,w13,w15 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w13,w13,w5 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w13,w13,w10 + add w24,w24,w12 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w13,w13,#31 + eor w14,w14,w16 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w14,w14,w6 + eor w25,w25,w22 + add 
w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w14,w14,w11 + add w23,w23,w13 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w14,w14,#31 + eor w15,w15,w17 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w15,w15,w7 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w15,w15,w12 + add w22,w22,w14 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w15,w15,#31 + eor w16,w16,w19 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w16,w16,w8 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w16,w16,w13 + add w21,w21,w15 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w16,w16,#31 + eor w17,w17,w3 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w17,w17,w9 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w17,w17,w14 + add w20,w20,w16 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w17,w17,#31 + eor w19,w19,w4 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w19,w19,w10 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w19,w19,w15 + add w24,w24,w17 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w19,w19,#31 + eor w3,w3,w5 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w3,w3,w11 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w3,w3,w16 + add w23,w23,w19 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w3,w3,#31 + eor w4,w4,w6 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w4,w4,w12 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w4,w4,w17 + add w22,w22,w3 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w4,w4,#31 + eor w5,w5,w7 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w5,w5,w13 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w5,w5,w19 + add w21,w21,w4 // 
future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w5,w5,#31 + eor w6,w6,w8 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w6,w6,w14 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w6,w6,w3 + add w20,w20,w5 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w6,w6,#31 + eor w7,w7,w9 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w7,w7,w15 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w7,w7,w4 + add w24,w24,w6 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w7,w7,#31 + eor w8,w8,w10 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w8,w8,w16 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w8,w8,w5 + add w23,w23,w7 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w8,w8,#31 + eor w9,w9,w11 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w9,w9,w17 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w9,w9,w6 + add w22,w22,w8 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w9,w9,#31 + eor w10,w10,w12 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w10,w10,w19 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w10,w10,w7 + add w21,w21,w9 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w10,w10,#31 + eor w11,w11,w13 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w11,w11,w3 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w11,w11,w8 + add w20,w20,w10 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w11,w11,#31 + movz w28,#0xbcdc + movk w28,#0x8f1b,lsl#16 + eor w12,w12,w14 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w12,w12,w4 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w12,w12,w9 + add w24,w24,w11 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror 
w12,w12,#31 + orr w25,w21,w22 + and w26,w21,w22 + eor w13,w13,w15 + ror w27,w20,#27 + and w25,w25,w23 + add w23,w23,w28 // future e+=K + eor w13,w13,w5 + add w24,w24,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w21,w21,#2 + eor w13,w13,w10 + add w23,w23,w12 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w13,w13,#31 + orr w25,w20,w21 + and w26,w20,w21 + eor w14,w14,w16 + ror w27,w24,#27 + and w25,w25,w22 + add w22,w22,w28 // future e+=K + eor w14,w14,w6 + add w23,w23,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w20,w20,#2 + eor w14,w14,w11 + add w22,w22,w13 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w14,w14,#31 + orr w25,w24,w20 + and w26,w24,w20 + eor w15,w15,w17 + ror w27,w23,#27 + and w25,w25,w21 + add w21,w21,w28 // future e+=K + eor w15,w15,w7 + add w22,w22,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w24,w24,#2 + eor w15,w15,w12 + add w21,w21,w14 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w15,w15,#31 + orr w25,w23,w24 + and w26,w23,w24 + eor w16,w16,w19 + ror w27,w22,#27 + and w25,w25,w20 + add w20,w20,w28 // future e+=K + eor w16,w16,w8 + add w21,w21,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w23,w23,#2 + eor w16,w16,w13 + add w20,w20,w15 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w16,w16,#31 + orr w25,w22,w23 + and w26,w22,w23 + eor w17,w17,w3 + ror w27,w21,#27 + and w25,w25,w24 + add w24,w24,w28 // future e+=K + eor w17,w17,w9 + add w20,w20,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w22,w22,#2 + eor w17,w17,w14 + add w24,w24,w16 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w17,w17,#31 + orr w25,w21,w22 + and w26,w21,w22 + eor w19,w19,w4 + ror w27,w20,#27 + and w25,w25,w23 + add w23,w23,w28 // future e+=K + eor w19,w19,w10 + add w24,w24,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w21,w21,#2 + eor w19,w19,w15 + add w23,w23,w17 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w19,w19,#31 + orr w25,w20,w21 + and w26,w20,w21 + eor w3,w3,w5 + ror w27,w24,#27 + and w25,w25,w22 + add w22,w22,w28 // 
future e+=K + eor w3,w3,w11 + add w23,w23,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w20,w20,#2 + eor w3,w3,w16 + add w22,w22,w19 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w3,w3,#31 + orr w25,w24,w20 + and w26,w24,w20 + eor w4,w4,w6 + ror w27,w23,#27 + and w25,w25,w21 + add w21,w21,w28 // future e+=K + eor w4,w4,w12 + add w22,w22,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w24,w24,#2 + eor w4,w4,w17 + add w21,w21,w3 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w4,w4,#31 + orr w25,w23,w24 + and w26,w23,w24 + eor w5,w5,w7 + ror w27,w22,#27 + and w25,w25,w20 + add w20,w20,w28 // future e+=K + eor w5,w5,w13 + add w21,w21,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w23,w23,#2 + eor w5,w5,w19 + add w20,w20,w4 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w5,w5,#31 + orr w25,w22,w23 + and w26,w22,w23 + eor w6,w6,w8 + ror w27,w21,#27 + and w25,w25,w24 + add w24,w24,w28 // future e+=K + eor w6,w6,w14 + add w20,w20,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w22,w22,#2 + eor w6,w6,w3 + add w24,w24,w5 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w6,w6,#31 + orr w25,w21,w22 + and w26,w21,w22 + eor w7,w7,w9 + ror w27,w20,#27 + and w25,w25,w23 + add w23,w23,w28 // future e+=K + eor w7,w7,w15 + add w24,w24,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w21,w21,#2 + eor w7,w7,w4 + add w23,w23,w6 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w7,w7,#31 + orr w25,w20,w21 + and w26,w20,w21 + eor w8,w8,w10 + ror w27,w24,#27 + and w25,w25,w22 + add w22,w22,w28 // future e+=K + eor w8,w8,w16 + add w23,w23,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w20,w20,#2 + eor w8,w8,w5 + add w22,w22,w7 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w8,w8,#31 + orr w25,w24,w20 + and w26,w24,w20 + eor w9,w9,w11 + ror w27,w23,#27 + and w25,w25,w21 + add w21,w21,w28 // future e+=K + eor w9,w9,w17 + add w22,w22,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w24,w24,#2 + eor w9,w9,w6 + add w21,w21,w8 // future e+=X[i] + add w22,w22,w25 // 
e+=F(b,c,d) + ror w9,w9,#31 + orr w25,w23,w24 + and w26,w23,w24 + eor w10,w10,w12 + ror w27,w22,#27 + and w25,w25,w20 + add w20,w20,w28 // future e+=K + eor w10,w10,w19 + add w21,w21,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w23,w23,#2 + eor w10,w10,w7 + add w20,w20,w9 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w10,w10,#31 + orr w25,w22,w23 + and w26,w22,w23 + eor w11,w11,w13 + ror w27,w21,#27 + and w25,w25,w24 + add w24,w24,w28 // future e+=K + eor w11,w11,w3 + add w20,w20,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w22,w22,#2 + eor w11,w11,w8 + add w24,w24,w10 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w11,w11,#31 + orr w25,w21,w22 + and w26,w21,w22 + eor w12,w12,w14 + ror w27,w20,#27 + and w25,w25,w23 + add w23,w23,w28 // future e+=K + eor w12,w12,w4 + add w24,w24,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w21,w21,#2 + eor w12,w12,w9 + add w23,w23,w11 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w12,w12,#31 + orr w25,w20,w21 + and w26,w20,w21 + eor w13,w13,w15 + ror w27,w24,#27 + and w25,w25,w22 + add w22,w22,w28 // future e+=K + eor w13,w13,w5 + add w23,w23,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w20,w20,#2 + eor w13,w13,w10 + add w22,w22,w12 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w13,w13,#31 + orr w25,w24,w20 + and w26,w24,w20 + eor w14,w14,w16 + ror w27,w23,#27 + and w25,w25,w21 + add w21,w21,w28 // future e+=K + eor w14,w14,w6 + add w22,w22,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w24,w24,#2 + eor w14,w14,w11 + add w21,w21,w13 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w14,w14,#31 + orr w25,w23,w24 + and w26,w23,w24 + eor w15,w15,w17 + ror w27,w22,#27 + and w25,w25,w20 + add w20,w20,w28 // future e+=K + eor w15,w15,w7 + add w21,w21,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w23,w23,#2 + eor w15,w15,w12 + add w20,w20,w14 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w15,w15,#31 + movz w28,#0xc1d6 + movk w28,#0xca62,lsl#16 + orr w25,w22,w23 + and w26,w22,w23 + eor 
w16,w16,w19 + ror w27,w21,#27 + and w25,w25,w24 + add w24,w24,w28 // future e+=K + eor w16,w16,w8 + add w20,w20,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w22,w22,#2 + eor w16,w16,w13 + add w24,w24,w15 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w16,w16,#31 + eor w17,w17,w3 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w17,w17,w9 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w17,w17,w14 + add w23,w23,w16 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w17,w17,#31 + eor w19,w19,w4 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w19,w19,w10 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w19,w19,w15 + add w22,w22,w17 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w19,w19,#31 + eor w3,w3,w5 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w3,w3,w11 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w3,w3,w16 + add w21,w21,w19 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w3,w3,#31 + eor w4,w4,w6 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w4,w4,w12 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w4,w4,w17 + add w20,w20,w3 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w4,w4,#31 + eor w5,w5,w7 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w5,w5,w13 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w5,w5,w19 + add w24,w24,w4 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w5,w5,#31 + eor w6,w6,w8 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w6,w6,w14 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w6,w6,w3 + add w23,w23,w5 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w6,w6,#31 + eor w7,w7,w9 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w7,w7,w15 + 
eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w7,w7,w4 + add w22,w22,w6 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w7,w7,#31 + eor w8,w8,w10 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w8,w8,w16 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w8,w8,w5 + add w21,w21,w7 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w8,w8,#31 + eor w9,w9,w11 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w9,w9,w17 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w9,w9,w6 + add w20,w20,w8 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w9,w9,#31 + eor w10,w10,w12 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w10,w10,w19 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w10,w10,w7 + add w24,w24,w9 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w10,w10,#31 + eor w11,w11,w13 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w11,w11,w3 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w11,w11,w8 + add w23,w23,w10 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w11,w11,#31 + eor w12,w12,w14 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w12,w12,w4 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w12,w12,w9 + add w22,w22,w11 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w12,w12,#31 + eor w13,w13,w15 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w13,w13,w5 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w13,w13,w10 + add w21,w21,w12 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w13,w13,#31 + eor w14,w14,w16 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w14,w14,w6 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w14,w14,w11 + 
add w20,w20,w13 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w14,w14,#31 + eor w15,w15,w17 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w15,w15,w7 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w15,w15,w12 + add w24,w24,w14 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w15,w15,#31 + eor w16,w16,w19 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w16,w16,w8 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w16,w16,w13 + add w23,w23,w15 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w16,w16,#31 + eor w17,w17,w3 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w17,w17,w9 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w17,w17,w14 + add w22,w22,w16 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w17,w17,#31 + eor w19,w19,w4 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w19,w19,w10 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w19,w19,w15 + add w21,w21,w17 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w19,w19,#31 + ldp w4,w5,[x0] + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + add w20,w20,w19 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ldp w6,w7,[x0,#8] + eor w25,w24,w22 + ror w27,w21,#27 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + ldr w8,[x0,#16] + add w20,w20,w25 // e+=F(b,c,d) + add w21,w21,w5 + add w22,w22,w6 + add w20,w20,w4 + add w23,w23,w7 + add w24,w24,w8 + stp w20,w21,[x0] + stp w22,w23,[x0,#8] + str w24,[x0,#16] + cbnz x2,.Loop + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldp x25,x26,[sp,#64] + ldp x27,x28,[sp,#80] + ldr x29,[sp],#96 + ret +.size sha1_block_data_order_nohw,.-sha1_block_data_order_nohw +.globl sha1_block_data_order_hw +.hidden 
sha1_block_data_order_hw +.type sha1_block_data_order_hw,%function +.align 6 +sha1_block_data_order_hw: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + adrp x4,.Lconst + add x4,x4,:lo12:.Lconst + eor v1.16b,v1.16b,v1.16b + ld1 {v0.4s},[x0],#16 + ld1 {v1.s}[0],[x0] + sub x0,x0,#16 + ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x4] + +.Loop_hw: + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + sub x2,x2,#1 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + + add v20.4s,v16.4s,v4.4s + rev32 v6.16b,v6.16b + orr v22.16b,v0.16b,v0.16b // offload + + add v21.4s,v16.4s,v5.4s + rev32 v7.16b,v7.16b +.inst 0x5e280803 //sha1h v3.16b,v0.16b +.inst 0x5e140020 //sha1c v0.16b,v1.16b,v20.4s // 0 + add v20.4s,v16.4s,v6.4s +.inst 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b +.inst 0x5e280802 //sha1h v2.16b,v0.16b // 1 +.inst 0x5e150060 //sha1c v0.16b,v3.16b,v21.4s + add v21.4s,v16.4s,v7.4s +.inst 0x5e2818e4 //sha1su1 v4.16b,v7.16b +.inst 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b +.inst 0x5e280803 //sha1h v3.16b,v0.16b // 2 +.inst 0x5e140040 //sha1c v0.16b,v2.16b,v20.4s + add v20.4s,v16.4s,v4.4s +.inst 0x5e281885 //sha1su1 v5.16b,v4.16b +.inst 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b +.inst 0x5e280802 //sha1h v2.16b,v0.16b // 3 +.inst 0x5e150060 //sha1c v0.16b,v3.16b,v21.4s + add v21.4s,v17.4s,v5.4s +.inst 0x5e2818a6 //sha1su1 v6.16b,v5.16b +.inst 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b +.inst 0x5e280803 //sha1h v3.16b,v0.16b // 4 +.inst 0x5e140040 //sha1c v0.16b,v2.16b,v20.4s + add v20.4s,v17.4s,v6.4s +.inst 0x5e2818c7 //sha1su1 v7.16b,v6.16b +.inst 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b +.inst 0x5e280802 //sha1h v2.16b,v0.16b // 5 +.inst 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + add v21.4s,v17.4s,v7.4s +.inst 0x5e2818e4 //sha1su1 v4.16b,v7.16b +.inst 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b +.inst 0x5e280803 //sha1h v3.16b,v0.16b // 6 +.inst 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s + add 
v20.4s,v17.4s,v4.4s +.inst 0x5e281885 //sha1su1 v5.16b,v4.16b +.inst 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b +.inst 0x5e280802 //sha1h v2.16b,v0.16b // 7 +.inst 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + add v21.4s,v17.4s,v5.4s +.inst 0x5e2818a6 //sha1su1 v6.16b,v5.16b +.inst 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b +.inst 0x5e280803 //sha1h v3.16b,v0.16b // 8 +.inst 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s + add v20.4s,v18.4s,v6.4s +.inst 0x5e2818c7 //sha1su1 v7.16b,v6.16b +.inst 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b +.inst 0x5e280802 //sha1h v2.16b,v0.16b // 9 +.inst 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + add v21.4s,v18.4s,v7.4s +.inst 0x5e2818e4 //sha1su1 v4.16b,v7.16b +.inst 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b +.inst 0x5e280803 //sha1h v3.16b,v0.16b // 10 +.inst 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s + add v20.4s,v18.4s,v4.4s +.inst 0x5e281885 //sha1su1 v5.16b,v4.16b +.inst 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b +.inst 0x5e280802 //sha1h v2.16b,v0.16b // 11 +.inst 0x5e152060 //sha1m v0.16b,v3.16b,v21.4s + add v21.4s,v18.4s,v5.4s +.inst 0x5e2818a6 //sha1su1 v6.16b,v5.16b +.inst 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b +.inst 0x5e280803 //sha1h v3.16b,v0.16b // 12 +.inst 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s + add v20.4s,v18.4s,v6.4s +.inst 0x5e2818c7 //sha1su1 v7.16b,v6.16b +.inst 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b +.inst 0x5e280802 //sha1h v2.16b,v0.16b // 13 +.inst 0x5e152060 //sha1m v0.16b,v3.16b,v21.4s + add v21.4s,v19.4s,v7.4s +.inst 0x5e2818e4 //sha1su1 v4.16b,v7.16b +.inst 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b +.inst 0x5e280803 //sha1h v3.16b,v0.16b // 14 +.inst 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s + add v20.4s,v19.4s,v4.4s +.inst 0x5e281885 //sha1su1 v5.16b,v4.16b +.inst 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b +.inst 0x5e280802 //sha1h v2.16b,v0.16b // 15 +.inst 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + add v21.4s,v19.4s,v5.4s +.inst 0x5e2818a6 //sha1su1 v6.16b,v5.16b +.inst 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b 
+.inst 0x5e280803 //sha1h v3.16b,v0.16b // 16 +.inst 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s + add v20.4s,v19.4s,v6.4s +.inst 0x5e2818c7 //sha1su1 v7.16b,v6.16b +.inst 0x5e280802 //sha1h v2.16b,v0.16b // 17 +.inst 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + add v21.4s,v19.4s,v7.4s + +.inst 0x5e280803 //sha1h v3.16b,v0.16b // 18 +.inst 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s + +.inst 0x5e280802 //sha1h v2.16b,v0.16b // 19 +.inst 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + + add v1.4s,v1.4s,v2.4s + add v0.4s,v0.4s,v22.4s + + cbnz x2,.Loop_hw + + st1 {v0.4s},[x0],#16 + st1 {v1.s}[0],[x0] + + ldr x29,[sp],#16 + ret +.size sha1_block_data_order_hw,.-sha1_block_data_order_hw +.section .rodata +.align 6 +.Lconst: +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 //K_00_19 +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 //K_20_39 +.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc //K_40_59 +.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79 +.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) diff --git a/third_party/boringssl/gen/bcm/sha1-armv8-win.S b/third_party/boringssl/gen/bcm/sha1-armv8-win.S new file mode 100644 index 00000000..b8161b92 --- /dev/null +++ b/third_party/boringssl/gen/bcm/sha1-armv8-win.S @@ -0,0 +1,1220 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +.text + +.globl sha1_block_data_order_nohw + +.def sha1_block_data_order_nohw + .type 32 +.endef +.align 6 +sha1_block_data_order_nohw: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. 
+ AARCH64_VALID_CALL_TARGET + + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + + ldp w20,w21,[x0] + ldp w22,w23,[x0,#8] + ldr w24,[x0,#16] + +Loop: + ldr x3,[x1],#64 + movz w28,#0x7999 + sub x2,x2,#1 + movk w28,#0x5a82,lsl#16 +#ifdef __AARCH64EB__ + ror x3,x3,#32 +#else + rev32 x3,x3 +#endif + add w24,w24,w28 // warm it up + add w24,w24,w3 + lsr x4,x3,#32 + ldr x5,[x1,#-56] + bic w25,w23,w21 + and w26,w22,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + orr w25,w25,w26 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + add w23,w23,w4 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) +#ifdef __AARCH64EB__ + ror x5,x5,#32 +#else + rev32 x5,x5 +#endif + bic w25,w22,w20 + and w26,w21,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + orr w25,w25,w26 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + add w22,w22,w5 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + lsr x6,x5,#32 + ldr x7,[x1,#-48] + bic w25,w21,w24 + and w26,w20,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + orr w25,w25,w26 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + add w21,w21,w6 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) +#ifdef __AARCH64EB__ + ror x7,x7,#32 +#else + rev32 x7,x7 +#endif + bic w25,w20,w23 + and w26,w24,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + orr w25,w25,w26 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + add w20,w20,w7 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + lsr x8,x7,#32 + ldr x9,[x1,#-40] + bic w25,w24,w22 + and w26,w23,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + orr w25,w25,w26 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + add w24,w24,w8 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) +#ifdef __AARCH64EB__ + ror x9,x9,#32 +#else + rev32 x9,x9 +#endif + bic w25,w23,w21 + and w26,w22,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + orr w25,w25,w26 + add w24,w24,w27 // 
e+=rot(a,5) + ror w21,w21,#2 + add w23,w23,w9 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + lsr x10,x9,#32 + ldr x11,[x1,#-32] + bic w25,w22,w20 + and w26,w21,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + orr w25,w25,w26 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + add w22,w22,w10 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) +#ifdef __AARCH64EB__ + ror x11,x11,#32 +#else + rev32 x11,x11 +#endif + bic w25,w21,w24 + and w26,w20,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + orr w25,w25,w26 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + add w21,w21,w11 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + lsr x12,x11,#32 + ldr x13,[x1,#-24] + bic w25,w20,w23 + and w26,w24,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + orr w25,w25,w26 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + add w20,w20,w12 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) +#ifdef __AARCH64EB__ + ror x13,x13,#32 +#else + rev32 x13,x13 +#endif + bic w25,w24,w22 + and w26,w23,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + orr w25,w25,w26 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + add w24,w24,w13 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + lsr x14,x13,#32 + ldr x15,[x1,#-16] + bic w25,w23,w21 + and w26,w22,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + orr w25,w25,w26 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + add w23,w23,w14 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) +#ifdef __AARCH64EB__ + ror x15,x15,#32 +#else + rev32 x15,x15 +#endif + bic w25,w22,w20 + and w26,w21,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + orr w25,w25,w26 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + add w22,w22,w15 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + lsr x16,x15,#32 + ldr x17,[x1,#-8] + bic w25,w21,w24 + and w26,w20,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + orr w25,w25,w26 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + add w21,w21,w16 // future e+=X[i] + 
add w22,w22,w25 // e+=F(b,c,d) +#ifdef __AARCH64EB__ + ror x17,x17,#32 +#else + rev32 x17,x17 +#endif + bic w25,w20,w23 + and w26,w24,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + orr w25,w25,w26 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + add w20,w20,w17 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + lsr x19,x17,#32 + eor w3,w3,w5 + bic w25,w24,w22 + and w26,w23,w22 + ror w27,w21,#27 + eor w3,w3,w11 + add w24,w24,w28 // future e+=K + orr w25,w25,w26 + add w20,w20,w27 // e+=rot(a,5) + eor w3,w3,w16 + ror w22,w22,#2 + add w24,w24,w19 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w3,w3,#31 + eor w4,w4,w6 + bic w25,w23,w21 + and w26,w22,w21 + ror w27,w20,#27 + eor w4,w4,w12 + add w23,w23,w28 // future e+=K + orr w25,w25,w26 + add w24,w24,w27 // e+=rot(a,5) + eor w4,w4,w17 + ror w21,w21,#2 + add w23,w23,w3 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w4,w4,#31 + eor w5,w5,w7 + bic w25,w22,w20 + and w26,w21,w20 + ror w27,w24,#27 + eor w5,w5,w13 + add w22,w22,w28 // future e+=K + orr w25,w25,w26 + add w23,w23,w27 // e+=rot(a,5) + eor w5,w5,w19 + ror w20,w20,#2 + add w22,w22,w4 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w5,w5,#31 + eor w6,w6,w8 + bic w25,w21,w24 + and w26,w20,w24 + ror w27,w23,#27 + eor w6,w6,w14 + add w21,w21,w28 // future e+=K + orr w25,w25,w26 + add w22,w22,w27 // e+=rot(a,5) + eor w6,w6,w3 + ror w24,w24,#2 + add w21,w21,w5 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w6,w6,#31 + eor w7,w7,w9 + bic w25,w20,w23 + and w26,w24,w23 + ror w27,w22,#27 + eor w7,w7,w15 + add w20,w20,w28 // future e+=K + orr w25,w25,w26 + add w21,w21,w27 // e+=rot(a,5) + eor w7,w7,w4 + ror w23,w23,#2 + add w20,w20,w6 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w7,w7,#31 + movz w28,#0xeba1 + movk w28,#0x6ed9,lsl#16 + eor w8,w8,w10 + bic w25,w24,w22 + and w26,w23,w22 + ror w27,w21,#27 + eor w8,w8,w16 + add w24,w24,w28 // future e+=K + orr w25,w25,w26 + add w20,w20,w27 // e+=rot(a,5) + eor 
w8,w8,w5 + ror w22,w22,#2 + add w24,w24,w7 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w8,w8,#31 + eor w9,w9,w11 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w9,w9,w17 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w9,w9,w6 + add w23,w23,w8 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w9,w9,#31 + eor w10,w10,w12 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w10,w10,w19 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w10,w10,w7 + add w22,w22,w9 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w10,w10,#31 + eor w11,w11,w13 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w11,w11,w3 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w11,w11,w8 + add w21,w21,w10 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w11,w11,#31 + eor w12,w12,w14 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w12,w12,w4 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w12,w12,w9 + add w20,w20,w11 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w12,w12,#31 + eor w13,w13,w15 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w13,w13,w5 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w13,w13,w10 + add w24,w24,w12 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w13,w13,#31 + eor w14,w14,w16 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w14,w14,w6 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w14,w14,w11 + add w23,w23,w13 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w14,w14,#31 + eor w15,w15,w17 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w15,w15,w7 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w15,w15,w12 + add w22,w22,w14 // future e+=X[i] + add 
w23,w23,w25 // e+=F(b,c,d) + ror w15,w15,#31 + eor w16,w16,w19 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w16,w16,w8 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w16,w16,w13 + add w21,w21,w15 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w16,w16,#31 + eor w17,w17,w3 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w17,w17,w9 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w17,w17,w14 + add w20,w20,w16 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w17,w17,#31 + eor w19,w19,w4 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w19,w19,w10 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w19,w19,w15 + add w24,w24,w17 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w19,w19,#31 + eor w3,w3,w5 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w3,w3,w11 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w3,w3,w16 + add w23,w23,w19 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w3,w3,#31 + eor w4,w4,w6 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w4,w4,w12 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w4,w4,w17 + add w22,w22,w3 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w4,w4,#31 + eor w5,w5,w7 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w5,w5,w13 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w5,w5,w19 + add w21,w21,w4 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w5,w5,#31 + eor w6,w6,w8 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w6,w6,w14 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w6,w6,w3 + add w20,w20,w5 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w6,w6,#31 + eor w7,w7,w9 + eor w25,w24,w22 + ror w27,w21,#27 + 
add w24,w24,w28 // future e+=K + eor w7,w7,w15 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w7,w7,w4 + add w24,w24,w6 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w7,w7,#31 + eor w8,w8,w10 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w8,w8,w16 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w8,w8,w5 + add w23,w23,w7 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w8,w8,#31 + eor w9,w9,w11 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w9,w9,w17 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w9,w9,w6 + add w22,w22,w8 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w9,w9,#31 + eor w10,w10,w12 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w10,w10,w19 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w10,w10,w7 + add w21,w21,w9 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w10,w10,#31 + eor w11,w11,w13 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w11,w11,w3 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w11,w11,w8 + add w20,w20,w10 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w11,w11,#31 + movz w28,#0xbcdc + movk w28,#0x8f1b,lsl#16 + eor w12,w12,w14 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w12,w12,w4 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w12,w12,w9 + add w24,w24,w11 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w12,w12,#31 + orr w25,w21,w22 + and w26,w21,w22 + eor w13,w13,w15 + ror w27,w20,#27 + and w25,w25,w23 + add w23,w23,w28 // future e+=K + eor w13,w13,w5 + add w24,w24,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w21,w21,#2 + eor w13,w13,w10 + add w23,w23,w12 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w13,w13,#31 + orr w25,w20,w21 + and w26,w20,w21 + eor w14,w14,w16 + ror 
w27,w24,#27 + and w25,w25,w22 + add w22,w22,w28 // future e+=K + eor w14,w14,w6 + add w23,w23,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w20,w20,#2 + eor w14,w14,w11 + add w22,w22,w13 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w14,w14,#31 + orr w25,w24,w20 + and w26,w24,w20 + eor w15,w15,w17 + ror w27,w23,#27 + and w25,w25,w21 + add w21,w21,w28 // future e+=K + eor w15,w15,w7 + add w22,w22,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w24,w24,#2 + eor w15,w15,w12 + add w21,w21,w14 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w15,w15,#31 + orr w25,w23,w24 + and w26,w23,w24 + eor w16,w16,w19 + ror w27,w22,#27 + and w25,w25,w20 + add w20,w20,w28 // future e+=K + eor w16,w16,w8 + add w21,w21,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w23,w23,#2 + eor w16,w16,w13 + add w20,w20,w15 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w16,w16,#31 + orr w25,w22,w23 + and w26,w22,w23 + eor w17,w17,w3 + ror w27,w21,#27 + and w25,w25,w24 + add w24,w24,w28 // future e+=K + eor w17,w17,w9 + add w20,w20,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w22,w22,#2 + eor w17,w17,w14 + add w24,w24,w16 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w17,w17,#31 + orr w25,w21,w22 + and w26,w21,w22 + eor w19,w19,w4 + ror w27,w20,#27 + and w25,w25,w23 + add w23,w23,w28 // future e+=K + eor w19,w19,w10 + add w24,w24,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w21,w21,#2 + eor w19,w19,w15 + add w23,w23,w17 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w19,w19,#31 + orr w25,w20,w21 + and w26,w20,w21 + eor w3,w3,w5 + ror w27,w24,#27 + and w25,w25,w22 + add w22,w22,w28 // future e+=K + eor w3,w3,w11 + add w23,w23,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w20,w20,#2 + eor w3,w3,w16 + add w22,w22,w19 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w3,w3,#31 + orr w25,w24,w20 + and w26,w24,w20 + eor w4,w4,w6 + ror w27,w23,#27 + and w25,w25,w21 + add w21,w21,w28 // future e+=K + eor w4,w4,w12 + add w22,w22,w27 // e+=rot(a,5) + orr w25,w25,w26 + 
ror w24,w24,#2 + eor w4,w4,w17 + add w21,w21,w3 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w4,w4,#31 + orr w25,w23,w24 + and w26,w23,w24 + eor w5,w5,w7 + ror w27,w22,#27 + and w25,w25,w20 + add w20,w20,w28 // future e+=K + eor w5,w5,w13 + add w21,w21,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w23,w23,#2 + eor w5,w5,w19 + add w20,w20,w4 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w5,w5,#31 + orr w25,w22,w23 + and w26,w22,w23 + eor w6,w6,w8 + ror w27,w21,#27 + and w25,w25,w24 + add w24,w24,w28 // future e+=K + eor w6,w6,w14 + add w20,w20,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w22,w22,#2 + eor w6,w6,w3 + add w24,w24,w5 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w6,w6,#31 + orr w25,w21,w22 + and w26,w21,w22 + eor w7,w7,w9 + ror w27,w20,#27 + and w25,w25,w23 + add w23,w23,w28 // future e+=K + eor w7,w7,w15 + add w24,w24,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w21,w21,#2 + eor w7,w7,w4 + add w23,w23,w6 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w7,w7,#31 + orr w25,w20,w21 + and w26,w20,w21 + eor w8,w8,w10 + ror w27,w24,#27 + and w25,w25,w22 + add w22,w22,w28 // future e+=K + eor w8,w8,w16 + add w23,w23,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w20,w20,#2 + eor w8,w8,w5 + add w22,w22,w7 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w8,w8,#31 + orr w25,w24,w20 + and w26,w24,w20 + eor w9,w9,w11 + ror w27,w23,#27 + and w25,w25,w21 + add w21,w21,w28 // future e+=K + eor w9,w9,w17 + add w22,w22,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w24,w24,#2 + eor w9,w9,w6 + add w21,w21,w8 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w9,w9,#31 + orr w25,w23,w24 + and w26,w23,w24 + eor w10,w10,w12 + ror w27,w22,#27 + and w25,w25,w20 + add w20,w20,w28 // future e+=K + eor w10,w10,w19 + add w21,w21,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w23,w23,#2 + eor w10,w10,w7 + add w20,w20,w9 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w10,w10,#31 + orr w25,w22,w23 + and w26,w22,w23 + eor w11,w11,w13 
+ ror w27,w21,#27 + and w25,w25,w24 + add w24,w24,w28 // future e+=K + eor w11,w11,w3 + add w20,w20,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w22,w22,#2 + eor w11,w11,w8 + add w24,w24,w10 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w11,w11,#31 + orr w25,w21,w22 + and w26,w21,w22 + eor w12,w12,w14 + ror w27,w20,#27 + and w25,w25,w23 + add w23,w23,w28 // future e+=K + eor w12,w12,w4 + add w24,w24,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w21,w21,#2 + eor w12,w12,w9 + add w23,w23,w11 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w12,w12,#31 + orr w25,w20,w21 + and w26,w20,w21 + eor w13,w13,w15 + ror w27,w24,#27 + and w25,w25,w22 + add w22,w22,w28 // future e+=K + eor w13,w13,w5 + add w23,w23,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w20,w20,#2 + eor w13,w13,w10 + add w22,w22,w12 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w13,w13,#31 + orr w25,w24,w20 + and w26,w24,w20 + eor w14,w14,w16 + ror w27,w23,#27 + and w25,w25,w21 + add w21,w21,w28 // future e+=K + eor w14,w14,w6 + add w22,w22,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w24,w24,#2 + eor w14,w14,w11 + add w21,w21,w13 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w14,w14,#31 + orr w25,w23,w24 + and w26,w23,w24 + eor w15,w15,w17 + ror w27,w22,#27 + and w25,w25,w20 + add w20,w20,w28 // future e+=K + eor w15,w15,w7 + add w21,w21,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w23,w23,#2 + eor w15,w15,w12 + add w20,w20,w14 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w15,w15,#31 + movz w28,#0xc1d6 + movk w28,#0xca62,lsl#16 + orr w25,w22,w23 + and w26,w22,w23 + eor w16,w16,w19 + ror w27,w21,#27 + and w25,w25,w24 + add w24,w24,w28 // future e+=K + eor w16,w16,w8 + add w20,w20,w27 // e+=rot(a,5) + orr w25,w25,w26 + ror w22,w22,#2 + eor w16,w16,w13 + add w24,w24,w15 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w16,w16,#31 + eor w17,w17,w3 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w17,w17,w9 + eor w25,w25,w22 + add 
w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w17,w17,w14 + add w23,w23,w16 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w17,w17,#31 + eor w19,w19,w4 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w19,w19,w10 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w19,w19,w15 + add w22,w22,w17 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w19,w19,#31 + eor w3,w3,w5 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w3,w3,w11 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w3,w3,w16 + add w21,w21,w19 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w3,w3,#31 + eor w4,w4,w6 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w4,w4,w12 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w4,w4,w17 + add w20,w20,w3 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w4,w4,#31 + eor w5,w5,w7 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w5,w5,w13 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w5,w5,w19 + add w24,w24,w4 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w5,w5,#31 + eor w6,w6,w8 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w6,w6,w14 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w6,w6,w3 + add w23,w23,w5 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w6,w6,#31 + eor w7,w7,w9 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w7,w7,w15 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w7,w7,w4 + add w22,w22,w6 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w7,w7,#31 + eor w8,w8,w10 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w8,w8,w16 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w8,w8,w5 + add w21,w21,w7 // future e+=X[i] + add 
w22,w22,w25 // e+=F(b,c,d) + ror w8,w8,#31 + eor w9,w9,w11 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w9,w9,w17 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w9,w9,w6 + add w20,w20,w8 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w9,w9,#31 + eor w10,w10,w12 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w10,w10,w19 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w10,w10,w7 + add w24,w24,w9 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w10,w10,#31 + eor w11,w11,w13 + eor w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w11,w11,w3 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w11,w11,w8 + add w23,w23,w10 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w11,w11,#31 + eor w12,w12,w14 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w12,w12,w4 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w12,w12,w9 + add w22,w22,w11 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w12,w12,#31 + eor w13,w13,w15 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w13,w13,w5 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w13,w13,w10 + add w21,w21,w12 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w13,w13,#31 + eor w14,w14,w16 + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w14,w14,w6 + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + eor w14,w14,w11 + add w20,w20,w13 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ror w14,w14,#31 + eor w15,w15,w17 + eor w25,w24,w22 + ror w27,w21,#27 + add w24,w24,w28 // future e+=K + eor w15,w15,w7 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + eor w15,w15,w12 + add w24,w24,w14 // future e+=X[i] + add w20,w20,w25 // e+=F(b,c,d) + ror w15,w15,#31 + eor w16,w16,w19 + eor 
w25,w23,w21 + ror w27,w20,#27 + add w23,w23,w28 // future e+=K + eor w16,w16,w8 + eor w25,w25,w22 + add w24,w24,w27 // e+=rot(a,5) + ror w21,w21,#2 + eor w16,w16,w13 + add w23,w23,w15 // future e+=X[i] + add w24,w24,w25 // e+=F(b,c,d) + ror w16,w16,#31 + eor w17,w17,w3 + eor w25,w22,w20 + ror w27,w24,#27 + add w22,w22,w28 // future e+=K + eor w17,w17,w9 + eor w25,w25,w21 + add w23,w23,w27 // e+=rot(a,5) + ror w20,w20,#2 + eor w17,w17,w14 + add w22,w22,w16 // future e+=X[i] + add w23,w23,w25 // e+=F(b,c,d) + ror w17,w17,#31 + eor w19,w19,w4 + eor w25,w21,w24 + ror w27,w23,#27 + add w21,w21,w28 // future e+=K + eor w19,w19,w10 + eor w25,w25,w20 + add w22,w22,w27 // e+=rot(a,5) + ror w24,w24,#2 + eor w19,w19,w15 + add w21,w21,w17 // future e+=X[i] + add w22,w22,w25 // e+=F(b,c,d) + ror w19,w19,#31 + ldp w4,w5,[x0] + eor w25,w20,w23 + ror w27,w22,#27 + add w20,w20,w28 // future e+=K + eor w25,w25,w24 + add w21,w21,w27 // e+=rot(a,5) + ror w23,w23,#2 + add w20,w20,w19 // future e+=X[i] + add w21,w21,w25 // e+=F(b,c,d) + ldp w6,w7,[x0,#8] + eor w25,w24,w22 + ror w27,w21,#27 + eor w25,w25,w23 + add w20,w20,w27 // e+=rot(a,5) + ror w22,w22,#2 + ldr w8,[x0,#16] + add w20,w20,w25 // e+=F(b,c,d) + add w21,w21,w5 + add w22,w22,w6 + add w20,w20,w4 + add w23,w23,w7 + add w24,w24,w8 + stp w20,w21,[x0] + stp w22,w23,[x0,#8] + str w24,[x0,#16] + cbnz x2,Loop + + ldp x19,x20,[sp,#16] + ldp x21,x22,[sp,#32] + ldp x23,x24,[sp,#48] + ldp x25,x26,[sp,#64] + ldp x27,x28,[sp,#80] + ldr x29,[sp],#96 + ret + +.globl sha1_block_data_order_hw + +.def sha1_block_data_order_hw + .type 32 +.endef +.align 6 +sha1_block_data_order_hw: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + adrp x4,Lconst + add x4,x4,:lo12:Lconst + eor v1.16b,v1.16b,v1.16b + ld1 {v0.4s},[x0],#16 + ld1 {v1.s}[0],[x0] + sub x0,x0,#16 + ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x4] + +Loop_hw: + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + sub x2,x2,#1 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + + add v20.4s,v16.4s,v4.4s + rev32 v6.16b,v6.16b + orr v22.16b,v0.16b,v0.16b // offload + + add v21.4s,v16.4s,v5.4s + rev32 v7.16b,v7.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b +.long 0x5e140020 //sha1c v0.16b,v1.16b,v20.4s // 0 + add v20.4s,v16.4s,v6.4s +.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 1 +.long 0x5e150060 //sha1c v0.16b,v3.16b,v21.4s + add v21.4s,v16.4s,v7.4s +.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b +.long 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 2 +.long 0x5e140040 //sha1c v0.16b,v2.16b,v20.4s + add v20.4s,v16.4s,v4.4s +.long 0x5e281885 //sha1su1 v5.16b,v4.16b +.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 3 +.long 0x5e150060 //sha1c v0.16b,v3.16b,v21.4s + add v21.4s,v17.4s,v5.4s +.long 0x5e2818a6 //sha1su1 v6.16b,v5.16b +.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 4 +.long 0x5e140040 //sha1c v0.16b,v2.16b,v20.4s + add v20.4s,v17.4s,v6.4s +.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b +.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 5 +.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + add v21.4s,v17.4s,v7.4s +.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b +.long 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 6 +.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s + add v20.4s,v17.4s,v4.4s +.long 0x5e281885 //sha1su1 v5.16b,v4.16b +.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 7 +.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + add v21.4s,v17.4s,v5.4s +.long 0x5e2818a6 
//sha1su1 v6.16b,v5.16b +.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 8 +.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s + add v20.4s,v18.4s,v6.4s +.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b +.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 9 +.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + add v21.4s,v18.4s,v7.4s +.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b +.long 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 10 +.long 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s + add v20.4s,v18.4s,v4.4s +.long 0x5e281885 //sha1su1 v5.16b,v4.16b +.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 11 +.long 0x5e152060 //sha1m v0.16b,v3.16b,v21.4s + add v21.4s,v18.4s,v5.4s +.long 0x5e2818a6 //sha1su1 v6.16b,v5.16b +.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 12 +.long 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s + add v20.4s,v18.4s,v6.4s +.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b +.long 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 13 +.long 0x5e152060 //sha1m v0.16b,v3.16b,v21.4s + add v21.4s,v19.4s,v7.4s +.long 0x5e2818e4 //sha1su1 v4.16b,v7.16b +.long 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 14 +.long 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s + add v20.4s,v19.4s,v4.4s +.long 0x5e281885 //sha1su1 v5.16b,v4.16b +.long 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 15 +.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + add v21.4s,v19.4s,v5.4s +.long 0x5e2818a6 //sha1su1 v6.16b,v5.16b +.long 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b +.long 0x5e280803 //sha1h v3.16b,v0.16b // 16 +.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s + add v20.4s,v19.4s,v6.4s +.long 0x5e2818c7 //sha1su1 v7.16b,v6.16b +.long 0x5e280802 //sha1h v2.16b,v0.16b // 17 +.long 0x5e151060 //sha1p 
v0.16b,v3.16b,v21.4s + add v21.4s,v19.4s,v7.4s + +.long 0x5e280803 //sha1h v3.16b,v0.16b // 18 +.long 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s + +.long 0x5e280802 //sha1h v2.16b,v0.16b // 19 +.long 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s + + add v1.4s,v1.4s,v2.4s + add v0.4s,v0.4s,v22.4s + + cbnz x2,Loop_hw + + st1 {v0.4s},[x0],#16 + st1 {v1.s}[0],[x0] + + ldr x29,[sp],#16 + ret + +.section .rodata +.align 6 +Lconst: +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 //K_00_19 +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 //K_20_39 +.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc //K_40_59 +.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79 +.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) diff --git a/third_party/boringssl/gen/bcm/sha1-x86_64-apple.S b/third_party/boringssl/gen/bcm/sha1-x86_64-apple.S new file mode 100644 index 00000000..32b3bc76 --- /dev/null +++ b/third_party/boringssl/gen/bcm/sha1-x86_64-apple.S @@ -0,0 +1,5450 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.text + +.globl _sha1_block_data_order_nohw +.private_extern _sha1_block_data_order_nohw + +.p2align 4 +_sha1_block_data_order_nohw: + +_CET_ENDBR + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + movq %rdi,%r8 + subq $72,%rsp + movq %rsi,%r9 + andq $-64,%rsp + movq %rdx,%r10 + movq %rax,64(%rsp) + +L$prologue: + + movl 0(%r8),%esi + movl 4(%r8),%edi + movl 8(%r8),%r11d + movl 12(%r8),%r12d + movl 16(%r8),%r13d + jmp L$loop + +.p2align 4 +L$loop: + movl 0(%r9),%edx + bswapl %edx + movl 4(%r9),%ebp + movl %r12d,%eax + movl %edx,0(%rsp) + movl %esi,%ecx + bswapl %ebp + xorl %r11d,%eax + roll $5,%ecx + andl %edi,%eax + leal 1518500249(%rdx,%r13,1),%r13d + addl %ecx,%r13d + xorl %r12d,%eax + roll $30,%edi + addl %eax,%r13d + movl 8(%r9),%r14d + movl %r11d,%eax + movl %ebp,4(%rsp) + movl %r13d,%ecx + bswapl %r14d + xorl %edi,%eax + roll $5,%ecx + andl %esi,%eax + leal 1518500249(%rbp,%r12,1),%r12d + addl %ecx,%r12d + xorl %r11d,%eax + roll $30,%esi + addl %eax,%r12d + movl 12(%r9),%edx + movl %edi,%eax + movl %r14d,8(%rsp) + movl %r12d,%ecx + bswapl %edx + xorl %esi,%eax + roll $5,%ecx + andl %r13d,%eax + leal 1518500249(%r14,%r11,1),%r11d + addl %ecx,%r11d + xorl %edi,%eax + roll $30,%r13d + addl %eax,%r11d + movl 16(%r9),%ebp + movl %esi,%eax + movl %edx,12(%rsp) + movl %r11d,%ecx + bswapl %ebp + xorl %r13d,%eax + roll $5,%ecx + andl %r12d,%eax + leal 1518500249(%rdx,%rdi,1),%edi + addl %ecx,%edi + xorl %esi,%eax + roll $30,%r12d + addl %eax,%edi + movl 20(%r9),%r14d + movl %r13d,%eax + movl %ebp,16(%rsp) + movl %edi,%ecx + bswapl %r14d + xorl %r12d,%eax + roll $5,%ecx + andl %r11d,%eax + leal 1518500249(%rbp,%rsi,1),%esi + addl %ecx,%esi + xorl %r13d,%eax + roll $30,%r11d + addl %eax,%esi + movl 24(%r9),%edx + movl %r12d,%eax + movl %r14d,20(%rsp) + movl %esi,%ecx + bswapl %edx + xorl %r11d,%eax + roll $5,%ecx + andl %edi,%eax + 
leal 1518500249(%r14,%r13,1),%r13d + addl %ecx,%r13d + xorl %r12d,%eax + roll $30,%edi + addl %eax,%r13d + movl 28(%r9),%ebp + movl %r11d,%eax + movl %edx,24(%rsp) + movl %r13d,%ecx + bswapl %ebp + xorl %edi,%eax + roll $5,%ecx + andl %esi,%eax + leal 1518500249(%rdx,%r12,1),%r12d + addl %ecx,%r12d + xorl %r11d,%eax + roll $30,%esi + addl %eax,%r12d + movl 32(%r9),%r14d + movl %edi,%eax + movl %ebp,28(%rsp) + movl %r12d,%ecx + bswapl %r14d + xorl %esi,%eax + roll $5,%ecx + andl %r13d,%eax + leal 1518500249(%rbp,%r11,1),%r11d + addl %ecx,%r11d + xorl %edi,%eax + roll $30,%r13d + addl %eax,%r11d + movl 36(%r9),%edx + movl %esi,%eax + movl %r14d,32(%rsp) + movl %r11d,%ecx + bswapl %edx + xorl %r13d,%eax + roll $5,%ecx + andl %r12d,%eax + leal 1518500249(%r14,%rdi,1),%edi + addl %ecx,%edi + xorl %esi,%eax + roll $30,%r12d + addl %eax,%edi + movl 40(%r9),%ebp + movl %r13d,%eax + movl %edx,36(%rsp) + movl %edi,%ecx + bswapl %ebp + xorl %r12d,%eax + roll $5,%ecx + andl %r11d,%eax + leal 1518500249(%rdx,%rsi,1),%esi + addl %ecx,%esi + xorl %r13d,%eax + roll $30,%r11d + addl %eax,%esi + movl 44(%r9),%r14d + movl %r12d,%eax + movl %ebp,40(%rsp) + movl %esi,%ecx + bswapl %r14d + xorl %r11d,%eax + roll $5,%ecx + andl %edi,%eax + leal 1518500249(%rbp,%r13,1),%r13d + addl %ecx,%r13d + xorl %r12d,%eax + roll $30,%edi + addl %eax,%r13d + movl 48(%r9),%edx + movl %r11d,%eax + movl %r14d,44(%rsp) + movl %r13d,%ecx + bswapl %edx + xorl %edi,%eax + roll $5,%ecx + andl %esi,%eax + leal 1518500249(%r14,%r12,1),%r12d + addl %ecx,%r12d + xorl %r11d,%eax + roll $30,%esi + addl %eax,%r12d + movl 52(%r9),%ebp + movl %edi,%eax + movl %edx,48(%rsp) + movl %r12d,%ecx + bswapl %ebp + xorl %esi,%eax + roll $5,%ecx + andl %r13d,%eax + leal 1518500249(%rdx,%r11,1),%r11d + addl %ecx,%r11d + xorl %edi,%eax + roll $30,%r13d + addl %eax,%r11d + movl 56(%r9),%r14d + movl %esi,%eax + movl %ebp,52(%rsp) + movl %r11d,%ecx + bswapl %r14d + xorl %r13d,%eax + roll $5,%ecx + andl %r12d,%eax + leal 
1518500249(%rbp,%rdi,1),%edi + addl %ecx,%edi + xorl %esi,%eax + roll $30,%r12d + addl %eax,%edi + movl 60(%r9),%edx + movl %r13d,%eax + movl %r14d,56(%rsp) + movl %edi,%ecx + bswapl %edx + xorl %r12d,%eax + roll $5,%ecx + andl %r11d,%eax + leal 1518500249(%r14,%rsi,1),%esi + addl %ecx,%esi + xorl %r13d,%eax + roll $30,%r11d + addl %eax,%esi + xorl 0(%rsp),%ebp + movl %r12d,%eax + movl %edx,60(%rsp) + movl %esi,%ecx + xorl 8(%rsp),%ebp + xorl %r11d,%eax + roll $5,%ecx + xorl 32(%rsp),%ebp + andl %edi,%eax + leal 1518500249(%rdx,%r13,1),%r13d + roll $30,%edi + xorl %r12d,%eax + addl %ecx,%r13d + roll $1,%ebp + addl %eax,%r13d + xorl 4(%rsp),%r14d + movl %r11d,%eax + movl %ebp,0(%rsp) + movl %r13d,%ecx + xorl 12(%rsp),%r14d + xorl %edi,%eax + roll $5,%ecx + xorl 36(%rsp),%r14d + andl %esi,%eax + leal 1518500249(%rbp,%r12,1),%r12d + roll $30,%esi + xorl %r11d,%eax + addl %ecx,%r12d + roll $1,%r14d + addl %eax,%r12d + xorl 8(%rsp),%edx + movl %edi,%eax + movl %r14d,4(%rsp) + movl %r12d,%ecx + xorl 16(%rsp),%edx + xorl %esi,%eax + roll $5,%ecx + xorl 40(%rsp),%edx + andl %r13d,%eax + leal 1518500249(%r14,%r11,1),%r11d + roll $30,%r13d + xorl %edi,%eax + addl %ecx,%r11d + roll $1,%edx + addl %eax,%r11d + xorl 12(%rsp),%ebp + movl %esi,%eax + movl %edx,8(%rsp) + movl %r11d,%ecx + xorl 20(%rsp),%ebp + xorl %r13d,%eax + roll $5,%ecx + xorl 44(%rsp),%ebp + andl %r12d,%eax + leal 1518500249(%rdx,%rdi,1),%edi + roll $30,%r12d + xorl %esi,%eax + addl %ecx,%edi + roll $1,%ebp + addl %eax,%edi + xorl 16(%rsp),%r14d + movl %r13d,%eax + movl %ebp,12(%rsp) + movl %edi,%ecx + xorl 24(%rsp),%r14d + xorl %r12d,%eax + roll $5,%ecx + xorl 48(%rsp),%r14d + andl %r11d,%eax + leal 1518500249(%rbp,%rsi,1),%esi + roll $30,%r11d + xorl %r13d,%eax + addl %ecx,%esi + roll $1,%r14d + addl %eax,%esi + xorl 20(%rsp),%edx + movl %edi,%eax + movl %r14d,16(%rsp) + movl %esi,%ecx + xorl 28(%rsp),%edx + xorl %r12d,%eax + roll $5,%ecx + xorl 52(%rsp),%edx + leal 1859775393(%r14,%r13,1),%r13d + xorl 
%r11d,%eax + addl %ecx,%r13d + roll $30,%edi + addl %eax,%r13d + roll $1,%edx + xorl 24(%rsp),%ebp + movl %esi,%eax + movl %edx,20(%rsp) + movl %r13d,%ecx + xorl 32(%rsp),%ebp + xorl %r11d,%eax + roll $5,%ecx + xorl 56(%rsp),%ebp + leal 1859775393(%rdx,%r12,1),%r12d + xorl %edi,%eax + addl %ecx,%r12d + roll $30,%esi + addl %eax,%r12d + roll $1,%ebp + xorl 28(%rsp),%r14d + movl %r13d,%eax + movl %ebp,24(%rsp) + movl %r12d,%ecx + xorl 36(%rsp),%r14d + xorl %edi,%eax + roll $5,%ecx + xorl 60(%rsp),%r14d + leal 1859775393(%rbp,%r11,1),%r11d + xorl %esi,%eax + addl %ecx,%r11d + roll $30,%r13d + addl %eax,%r11d + roll $1,%r14d + xorl 32(%rsp),%edx + movl %r12d,%eax + movl %r14d,28(%rsp) + movl %r11d,%ecx + xorl 40(%rsp),%edx + xorl %esi,%eax + roll $5,%ecx + xorl 0(%rsp),%edx + leal 1859775393(%r14,%rdi,1),%edi + xorl %r13d,%eax + addl %ecx,%edi + roll $30,%r12d + addl %eax,%edi + roll $1,%edx + xorl 36(%rsp),%ebp + movl %r11d,%eax + movl %edx,32(%rsp) + movl %edi,%ecx + xorl 44(%rsp),%ebp + xorl %r13d,%eax + roll $5,%ecx + xorl 4(%rsp),%ebp + leal 1859775393(%rdx,%rsi,1),%esi + xorl %r12d,%eax + addl %ecx,%esi + roll $30,%r11d + addl %eax,%esi + roll $1,%ebp + xorl 40(%rsp),%r14d + movl %edi,%eax + movl %ebp,36(%rsp) + movl %esi,%ecx + xorl 48(%rsp),%r14d + xorl %r12d,%eax + roll $5,%ecx + xorl 8(%rsp),%r14d + leal 1859775393(%rbp,%r13,1),%r13d + xorl %r11d,%eax + addl %ecx,%r13d + roll $30,%edi + addl %eax,%r13d + roll $1,%r14d + xorl 44(%rsp),%edx + movl %esi,%eax + movl %r14d,40(%rsp) + movl %r13d,%ecx + xorl 52(%rsp),%edx + xorl %r11d,%eax + roll $5,%ecx + xorl 12(%rsp),%edx + leal 1859775393(%r14,%r12,1),%r12d + xorl %edi,%eax + addl %ecx,%r12d + roll $30,%esi + addl %eax,%r12d + roll $1,%edx + xorl 48(%rsp),%ebp + movl %r13d,%eax + movl %edx,44(%rsp) + movl %r12d,%ecx + xorl 56(%rsp),%ebp + xorl %edi,%eax + roll $5,%ecx + xorl 16(%rsp),%ebp + leal 1859775393(%rdx,%r11,1),%r11d + xorl %esi,%eax + addl %ecx,%r11d + roll $30,%r13d + addl %eax,%r11d + roll $1,%ebp + 
xorl 52(%rsp),%r14d + movl %r12d,%eax + movl %ebp,48(%rsp) + movl %r11d,%ecx + xorl 60(%rsp),%r14d + xorl %esi,%eax + roll $5,%ecx + xorl 20(%rsp),%r14d + leal 1859775393(%rbp,%rdi,1),%edi + xorl %r13d,%eax + addl %ecx,%edi + roll $30,%r12d + addl %eax,%edi + roll $1,%r14d + xorl 56(%rsp),%edx + movl %r11d,%eax + movl %r14d,52(%rsp) + movl %edi,%ecx + xorl 0(%rsp),%edx + xorl %r13d,%eax + roll $5,%ecx + xorl 24(%rsp),%edx + leal 1859775393(%r14,%rsi,1),%esi + xorl %r12d,%eax + addl %ecx,%esi + roll $30,%r11d + addl %eax,%esi + roll $1,%edx + xorl 60(%rsp),%ebp + movl %edi,%eax + movl %edx,56(%rsp) + movl %esi,%ecx + xorl 4(%rsp),%ebp + xorl %r12d,%eax + roll $5,%ecx + xorl 28(%rsp),%ebp + leal 1859775393(%rdx,%r13,1),%r13d + xorl %r11d,%eax + addl %ecx,%r13d + roll $30,%edi + addl %eax,%r13d + roll $1,%ebp + xorl 0(%rsp),%r14d + movl %esi,%eax + movl %ebp,60(%rsp) + movl %r13d,%ecx + xorl 8(%rsp),%r14d + xorl %r11d,%eax + roll $5,%ecx + xorl 32(%rsp),%r14d + leal 1859775393(%rbp,%r12,1),%r12d + xorl %edi,%eax + addl %ecx,%r12d + roll $30,%esi + addl %eax,%r12d + roll $1,%r14d + xorl 4(%rsp),%edx + movl %r13d,%eax + movl %r14d,0(%rsp) + movl %r12d,%ecx + xorl 12(%rsp),%edx + xorl %edi,%eax + roll $5,%ecx + xorl 36(%rsp),%edx + leal 1859775393(%r14,%r11,1),%r11d + xorl %esi,%eax + addl %ecx,%r11d + roll $30,%r13d + addl %eax,%r11d + roll $1,%edx + xorl 8(%rsp),%ebp + movl %r12d,%eax + movl %edx,4(%rsp) + movl %r11d,%ecx + xorl 16(%rsp),%ebp + xorl %esi,%eax + roll $5,%ecx + xorl 40(%rsp),%ebp + leal 1859775393(%rdx,%rdi,1),%edi + xorl %r13d,%eax + addl %ecx,%edi + roll $30,%r12d + addl %eax,%edi + roll $1,%ebp + xorl 12(%rsp),%r14d + movl %r11d,%eax + movl %ebp,8(%rsp) + movl %edi,%ecx + xorl 20(%rsp),%r14d + xorl %r13d,%eax + roll $5,%ecx + xorl 44(%rsp),%r14d + leal 1859775393(%rbp,%rsi,1),%esi + xorl %r12d,%eax + addl %ecx,%esi + roll $30,%r11d + addl %eax,%esi + roll $1,%r14d + xorl 16(%rsp),%edx + movl %edi,%eax + movl %r14d,12(%rsp) + movl %esi,%ecx + xorl 
24(%rsp),%edx + xorl %r12d,%eax + roll $5,%ecx + xorl 48(%rsp),%edx + leal 1859775393(%r14,%r13,1),%r13d + xorl %r11d,%eax + addl %ecx,%r13d + roll $30,%edi + addl %eax,%r13d + roll $1,%edx + xorl 20(%rsp),%ebp + movl %esi,%eax + movl %edx,16(%rsp) + movl %r13d,%ecx + xorl 28(%rsp),%ebp + xorl %r11d,%eax + roll $5,%ecx + xorl 52(%rsp),%ebp + leal 1859775393(%rdx,%r12,1),%r12d + xorl %edi,%eax + addl %ecx,%r12d + roll $30,%esi + addl %eax,%r12d + roll $1,%ebp + xorl 24(%rsp),%r14d + movl %r13d,%eax + movl %ebp,20(%rsp) + movl %r12d,%ecx + xorl 32(%rsp),%r14d + xorl %edi,%eax + roll $5,%ecx + xorl 56(%rsp),%r14d + leal 1859775393(%rbp,%r11,1),%r11d + xorl %esi,%eax + addl %ecx,%r11d + roll $30,%r13d + addl %eax,%r11d + roll $1,%r14d + xorl 28(%rsp),%edx + movl %r12d,%eax + movl %r14d,24(%rsp) + movl %r11d,%ecx + xorl 36(%rsp),%edx + xorl %esi,%eax + roll $5,%ecx + xorl 60(%rsp),%edx + leal 1859775393(%r14,%rdi,1),%edi + xorl %r13d,%eax + addl %ecx,%edi + roll $30,%r12d + addl %eax,%edi + roll $1,%edx + xorl 32(%rsp),%ebp + movl %r11d,%eax + movl %edx,28(%rsp) + movl %edi,%ecx + xorl 40(%rsp),%ebp + xorl %r13d,%eax + roll $5,%ecx + xorl 0(%rsp),%ebp + leal 1859775393(%rdx,%rsi,1),%esi + xorl %r12d,%eax + addl %ecx,%esi + roll $30,%r11d + addl %eax,%esi + roll $1,%ebp + xorl 36(%rsp),%r14d + movl %r12d,%eax + movl %ebp,32(%rsp) + movl %r12d,%ebx + xorl 44(%rsp),%r14d + andl %r11d,%eax + movl %esi,%ecx + xorl 4(%rsp),%r14d + leal -1894007588(%rbp,%r13,1),%r13d + xorl %r11d,%ebx + roll $5,%ecx + addl %eax,%r13d + roll $1,%r14d + andl %edi,%ebx + addl %ecx,%r13d + roll $30,%edi + addl %ebx,%r13d + xorl 40(%rsp),%edx + movl %r11d,%eax + movl %r14d,36(%rsp) + movl %r11d,%ebx + xorl 48(%rsp),%edx + andl %edi,%eax + movl %r13d,%ecx + xorl 8(%rsp),%edx + leal -1894007588(%r14,%r12,1),%r12d + xorl %edi,%ebx + roll $5,%ecx + addl %eax,%r12d + roll $1,%edx + andl %esi,%ebx + addl %ecx,%r12d + roll $30,%esi + addl %ebx,%r12d + xorl 44(%rsp),%ebp + movl %edi,%eax + movl 
%edx,40(%rsp) + movl %edi,%ebx + xorl 52(%rsp),%ebp + andl %esi,%eax + movl %r12d,%ecx + xorl 12(%rsp),%ebp + leal -1894007588(%rdx,%r11,1),%r11d + xorl %esi,%ebx + roll $5,%ecx + addl %eax,%r11d + roll $1,%ebp + andl %r13d,%ebx + addl %ecx,%r11d + roll $30,%r13d + addl %ebx,%r11d + xorl 48(%rsp),%r14d + movl %esi,%eax + movl %ebp,44(%rsp) + movl %esi,%ebx + xorl 56(%rsp),%r14d + andl %r13d,%eax + movl %r11d,%ecx + xorl 16(%rsp),%r14d + leal -1894007588(%rbp,%rdi,1),%edi + xorl %r13d,%ebx + roll $5,%ecx + addl %eax,%edi + roll $1,%r14d + andl %r12d,%ebx + addl %ecx,%edi + roll $30,%r12d + addl %ebx,%edi + xorl 52(%rsp),%edx + movl %r13d,%eax + movl %r14d,48(%rsp) + movl %r13d,%ebx + xorl 60(%rsp),%edx + andl %r12d,%eax + movl %edi,%ecx + xorl 20(%rsp),%edx + leal -1894007588(%r14,%rsi,1),%esi + xorl %r12d,%ebx + roll $5,%ecx + addl %eax,%esi + roll $1,%edx + andl %r11d,%ebx + addl %ecx,%esi + roll $30,%r11d + addl %ebx,%esi + xorl 56(%rsp),%ebp + movl %r12d,%eax + movl %edx,52(%rsp) + movl %r12d,%ebx + xorl 0(%rsp),%ebp + andl %r11d,%eax + movl %esi,%ecx + xorl 24(%rsp),%ebp + leal -1894007588(%rdx,%r13,1),%r13d + xorl %r11d,%ebx + roll $5,%ecx + addl %eax,%r13d + roll $1,%ebp + andl %edi,%ebx + addl %ecx,%r13d + roll $30,%edi + addl %ebx,%r13d + xorl 60(%rsp),%r14d + movl %r11d,%eax + movl %ebp,56(%rsp) + movl %r11d,%ebx + xorl 4(%rsp),%r14d + andl %edi,%eax + movl %r13d,%ecx + xorl 28(%rsp),%r14d + leal -1894007588(%rbp,%r12,1),%r12d + xorl %edi,%ebx + roll $5,%ecx + addl %eax,%r12d + roll $1,%r14d + andl %esi,%ebx + addl %ecx,%r12d + roll $30,%esi + addl %ebx,%r12d + xorl 0(%rsp),%edx + movl %edi,%eax + movl %r14d,60(%rsp) + movl %edi,%ebx + xorl 8(%rsp),%edx + andl %esi,%eax + movl %r12d,%ecx + xorl 32(%rsp),%edx + leal -1894007588(%r14,%r11,1),%r11d + xorl %esi,%ebx + roll $5,%ecx + addl %eax,%r11d + roll $1,%edx + andl %r13d,%ebx + addl %ecx,%r11d + roll $30,%r13d + addl %ebx,%r11d + xorl 4(%rsp),%ebp + movl %esi,%eax + movl %edx,0(%rsp) + movl %esi,%ebx + 
xorl 12(%rsp),%ebp + andl %r13d,%eax + movl %r11d,%ecx + xorl 36(%rsp),%ebp + leal -1894007588(%rdx,%rdi,1),%edi + xorl %r13d,%ebx + roll $5,%ecx + addl %eax,%edi + roll $1,%ebp + andl %r12d,%ebx + addl %ecx,%edi + roll $30,%r12d + addl %ebx,%edi + xorl 8(%rsp),%r14d + movl %r13d,%eax + movl %ebp,4(%rsp) + movl %r13d,%ebx + xorl 16(%rsp),%r14d + andl %r12d,%eax + movl %edi,%ecx + xorl 40(%rsp),%r14d + leal -1894007588(%rbp,%rsi,1),%esi + xorl %r12d,%ebx + roll $5,%ecx + addl %eax,%esi + roll $1,%r14d + andl %r11d,%ebx + addl %ecx,%esi + roll $30,%r11d + addl %ebx,%esi + xorl 12(%rsp),%edx + movl %r12d,%eax + movl %r14d,8(%rsp) + movl %r12d,%ebx + xorl 20(%rsp),%edx + andl %r11d,%eax + movl %esi,%ecx + xorl 44(%rsp),%edx + leal -1894007588(%r14,%r13,1),%r13d + xorl %r11d,%ebx + roll $5,%ecx + addl %eax,%r13d + roll $1,%edx + andl %edi,%ebx + addl %ecx,%r13d + roll $30,%edi + addl %ebx,%r13d + xorl 16(%rsp),%ebp + movl %r11d,%eax + movl %edx,12(%rsp) + movl %r11d,%ebx + xorl 24(%rsp),%ebp + andl %edi,%eax + movl %r13d,%ecx + xorl 48(%rsp),%ebp + leal -1894007588(%rdx,%r12,1),%r12d + xorl %edi,%ebx + roll $5,%ecx + addl %eax,%r12d + roll $1,%ebp + andl %esi,%ebx + addl %ecx,%r12d + roll $30,%esi + addl %ebx,%r12d + xorl 20(%rsp),%r14d + movl %edi,%eax + movl %ebp,16(%rsp) + movl %edi,%ebx + xorl 28(%rsp),%r14d + andl %esi,%eax + movl %r12d,%ecx + xorl 52(%rsp),%r14d + leal -1894007588(%rbp,%r11,1),%r11d + xorl %esi,%ebx + roll $5,%ecx + addl %eax,%r11d + roll $1,%r14d + andl %r13d,%ebx + addl %ecx,%r11d + roll $30,%r13d + addl %ebx,%r11d + xorl 24(%rsp),%edx + movl %esi,%eax + movl %r14d,20(%rsp) + movl %esi,%ebx + xorl 32(%rsp),%edx + andl %r13d,%eax + movl %r11d,%ecx + xorl 56(%rsp),%edx + leal -1894007588(%r14,%rdi,1),%edi + xorl %r13d,%ebx + roll $5,%ecx + addl %eax,%edi + roll $1,%edx + andl %r12d,%ebx + addl %ecx,%edi + roll $30,%r12d + addl %ebx,%edi + xorl 28(%rsp),%ebp + movl %r13d,%eax + movl %edx,24(%rsp) + movl %r13d,%ebx + xorl 36(%rsp),%ebp + andl 
%r12d,%eax + movl %edi,%ecx + xorl 60(%rsp),%ebp + leal -1894007588(%rdx,%rsi,1),%esi + xorl %r12d,%ebx + roll $5,%ecx + addl %eax,%esi + roll $1,%ebp + andl %r11d,%ebx + addl %ecx,%esi + roll $30,%r11d + addl %ebx,%esi + xorl 32(%rsp),%r14d + movl %r12d,%eax + movl %ebp,28(%rsp) + movl %r12d,%ebx + xorl 40(%rsp),%r14d + andl %r11d,%eax + movl %esi,%ecx + xorl 0(%rsp),%r14d + leal -1894007588(%rbp,%r13,1),%r13d + xorl %r11d,%ebx + roll $5,%ecx + addl %eax,%r13d + roll $1,%r14d + andl %edi,%ebx + addl %ecx,%r13d + roll $30,%edi + addl %ebx,%r13d + xorl 36(%rsp),%edx + movl %r11d,%eax + movl %r14d,32(%rsp) + movl %r11d,%ebx + xorl 44(%rsp),%edx + andl %edi,%eax + movl %r13d,%ecx + xorl 4(%rsp),%edx + leal -1894007588(%r14,%r12,1),%r12d + xorl %edi,%ebx + roll $5,%ecx + addl %eax,%r12d + roll $1,%edx + andl %esi,%ebx + addl %ecx,%r12d + roll $30,%esi + addl %ebx,%r12d + xorl 40(%rsp),%ebp + movl %edi,%eax + movl %edx,36(%rsp) + movl %edi,%ebx + xorl 48(%rsp),%ebp + andl %esi,%eax + movl %r12d,%ecx + xorl 8(%rsp),%ebp + leal -1894007588(%rdx,%r11,1),%r11d + xorl %esi,%ebx + roll $5,%ecx + addl %eax,%r11d + roll $1,%ebp + andl %r13d,%ebx + addl %ecx,%r11d + roll $30,%r13d + addl %ebx,%r11d + xorl 44(%rsp),%r14d + movl %esi,%eax + movl %ebp,40(%rsp) + movl %esi,%ebx + xorl 52(%rsp),%r14d + andl %r13d,%eax + movl %r11d,%ecx + xorl 12(%rsp),%r14d + leal -1894007588(%rbp,%rdi,1),%edi + xorl %r13d,%ebx + roll $5,%ecx + addl %eax,%edi + roll $1,%r14d + andl %r12d,%ebx + addl %ecx,%edi + roll $30,%r12d + addl %ebx,%edi + xorl 48(%rsp),%edx + movl %r13d,%eax + movl %r14d,44(%rsp) + movl %r13d,%ebx + xorl 56(%rsp),%edx + andl %r12d,%eax + movl %edi,%ecx + xorl 16(%rsp),%edx + leal -1894007588(%r14,%rsi,1),%esi + xorl %r12d,%ebx + roll $5,%ecx + addl %eax,%esi + roll $1,%edx + andl %r11d,%ebx + addl %ecx,%esi + roll $30,%r11d + addl %ebx,%esi + xorl 52(%rsp),%ebp + movl %edi,%eax + movl %edx,48(%rsp) + movl %esi,%ecx + xorl 60(%rsp),%ebp + xorl %r12d,%eax + roll $5,%ecx + xorl 
20(%rsp),%ebp + leal -899497514(%rdx,%r13,1),%r13d + xorl %r11d,%eax + addl %ecx,%r13d + roll $30,%edi + addl %eax,%r13d + roll $1,%ebp + xorl 56(%rsp),%r14d + movl %esi,%eax + movl %ebp,52(%rsp) + movl %r13d,%ecx + xorl 0(%rsp),%r14d + xorl %r11d,%eax + roll $5,%ecx + xorl 24(%rsp),%r14d + leal -899497514(%rbp,%r12,1),%r12d + xorl %edi,%eax + addl %ecx,%r12d + roll $30,%esi + addl %eax,%r12d + roll $1,%r14d + xorl 60(%rsp),%edx + movl %r13d,%eax + movl %r14d,56(%rsp) + movl %r12d,%ecx + xorl 4(%rsp),%edx + xorl %edi,%eax + roll $5,%ecx + xorl 28(%rsp),%edx + leal -899497514(%r14,%r11,1),%r11d + xorl %esi,%eax + addl %ecx,%r11d + roll $30,%r13d + addl %eax,%r11d + roll $1,%edx + xorl 0(%rsp),%ebp + movl %r12d,%eax + movl %edx,60(%rsp) + movl %r11d,%ecx + xorl 8(%rsp),%ebp + xorl %esi,%eax + roll $5,%ecx + xorl 32(%rsp),%ebp + leal -899497514(%rdx,%rdi,1),%edi + xorl %r13d,%eax + addl %ecx,%edi + roll $30,%r12d + addl %eax,%edi + roll $1,%ebp + xorl 4(%rsp),%r14d + movl %r11d,%eax + movl %ebp,0(%rsp) + movl %edi,%ecx + xorl 12(%rsp),%r14d + xorl %r13d,%eax + roll $5,%ecx + xorl 36(%rsp),%r14d + leal -899497514(%rbp,%rsi,1),%esi + xorl %r12d,%eax + addl %ecx,%esi + roll $30,%r11d + addl %eax,%esi + roll $1,%r14d + xorl 8(%rsp),%edx + movl %edi,%eax + movl %r14d,4(%rsp) + movl %esi,%ecx + xorl 16(%rsp),%edx + xorl %r12d,%eax + roll $5,%ecx + xorl 40(%rsp),%edx + leal -899497514(%r14,%r13,1),%r13d + xorl %r11d,%eax + addl %ecx,%r13d + roll $30,%edi + addl %eax,%r13d + roll $1,%edx + xorl 12(%rsp),%ebp + movl %esi,%eax + movl %edx,8(%rsp) + movl %r13d,%ecx + xorl 20(%rsp),%ebp + xorl %r11d,%eax + roll $5,%ecx + xorl 44(%rsp),%ebp + leal -899497514(%rdx,%r12,1),%r12d + xorl %edi,%eax + addl %ecx,%r12d + roll $30,%esi + addl %eax,%r12d + roll $1,%ebp + xorl 16(%rsp),%r14d + movl %r13d,%eax + movl %ebp,12(%rsp) + movl %r12d,%ecx + xorl 24(%rsp),%r14d + xorl %edi,%eax + roll $5,%ecx + xorl 48(%rsp),%r14d + leal -899497514(%rbp,%r11,1),%r11d + xorl %esi,%eax + addl 
%ecx,%r11d + roll $30,%r13d + addl %eax,%r11d + roll $1,%r14d + xorl 20(%rsp),%edx + movl %r12d,%eax + movl %r14d,16(%rsp) + movl %r11d,%ecx + xorl 28(%rsp),%edx + xorl %esi,%eax + roll $5,%ecx + xorl 52(%rsp),%edx + leal -899497514(%r14,%rdi,1),%edi + xorl %r13d,%eax + addl %ecx,%edi + roll $30,%r12d + addl %eax,%edi + roll $1,%edx + xorl 24(%rsp),%ebp + movl %r11d,%eax + movl %edx,20(%rsp) + movl %edi,%ecx + xorl 32(%rsp),%ebp + xorl %r13d,%eax + roll $5,%ecx + xorl 56(%rsp),%ebp + leal -899497514(%rdx,%rsi,1),%esi + xorl %r12d,%eax + addl %ecx,%esi + roll $30,%r11d + addl %eax,%esi + roll $1,%ebp + xorl 28(%rsp),%r14d + movl %edi,%eax + movl %ebp,24(%rsp) + movl %esi,%ecx + xorl 36(%rsp),%r14d + xorl %r12d,%eax + roll $5,%ecx + xorl 60(%rsp),%r14d + leal -899497514(%rbp,%r13,1),%r13d + xorl %r11d,%eax + addl %ecx,%r13d + roll $30,%edi + addl %eax,%r13d + roll $1,%r14d + xorl 32(%rsp),%edx + movl %esi,%eax + movl %r14d,28(%rsp) + movl %r13d,%ecx + xorl 40(%rsp),%edx + xorl %r11d,%eax + roll $5,%ecx + xorl 0(%rsp),%edx + leal -899497514(%r14,%r12,1),%r12d + xorl %edi,%eax + addl %ecx,%r12d + roll $30,%esi + addl %eax,%r12d + roll $1,%edx + xorl 36(%rsp),%ebp + movl %r13d,%eax + + movl %r12d,%ecx + xorl 44(%rsp),%ebp + xorl %edi,%eax + roll $5,%ecx + xorl 4(%rsp),%ebp + leal -899497514(%rdx,%r11,1),%r11d + xorl %esi,%eax + addl %ecx,%r11d + roll $30,%r13d + addl %eax,%r11d + roll $1,%ebp + xorl 40(%rsp),%r14d + movl %r12d,%eax + + movl %r11d,%ecx + xorl 48(%rsp),%r14d + xorl %esi,%eax + roll $5,%ecx + xorl 8(%rsp),%r14d + leal -899497514(%rbp,%rdi,1),%edi + xorl %r13d,%eax + addl %ecx,%edi + roll $30,%r12d + addl %eax,%edi + roll $1,%r14d + xorl 44(%rsp),%edx + movl %r11d,%eax + + movl %edi,%ecx + xorl 52(%rsp),%edx + xorl %r13d,%eax + roll $5,%ecx + xorl 12(%rsp),%edx + leal -899497514(%r14,%rsi,1),%esi + xorl %r12d,%eax + addl %ecx,%esi + roll $30,%r11d + addl %eax,%esi + roll $1,%edx + xorl 48(%rsp),%ebp + movl %edi,%eax + + movl %esi,%ecx + xorl 56(%rsp),%ebp + 
xorl %r12d,%eax + roll $5,%ecx + xorl 16(%rsp),%ebp + leal -899497514(%rdx,%r13,1),%r13d + xorl %r11d,%eax + addl %ecx,%r13d + roll $30,%edi + addl %eax,%r13d + roll $1,%ebp + xorl 52(%rsp),%r14d + movl %esi,%eax + + movl %r13d,%ecx + xorl 60(%rsp),%r14d + xorl %r11d,%eax + roll $5,%ecx + xorl 20(%rsp),%r14d + leal -899497514(%rbp,%r12,1),%r12d + xorl %edi,%eax + addl %ecx,%r12d + roll $30,%esi + addl %eax,%r12d + roll $1,%r14d + xorl 56(%rsp),%edx + movl %r13d,%eax + + movl %r12d,%ecx + xorl 0(%rsp),%edx + xorl %edi,%eax + roll $5,%ecx + xorl 24(%rsp),%edx + leal -899497514(%r14,%r11,1),%r11d + xorl %esi,%eax + addl %ecx,%r11d + roll $30,%r13d + addl %eax,%r11d + roll $1,%edx + xorl 60(%rsp),%ebp + movl %r12d,%eax + + movl %r11d,%ecx + xorl 4(%rsp),%ebp + xorl %esi,%eax + roll $5,%ecx + xorl 28(%rsp),%ebp + leal -899497514(%rdx,%rdi,1),%edi + xorl %r13d,%eax + addl %ecx,%edi + roll $30,%r12d + addl %eax,%edi + roll $1,%ebp + movl %r11d,%eax + movl %edi,%ecx + xorl %r13d,%eax + leal -899497514(%rbp,%rsi,1),%esi + roll $5,%ecx + xorl %r12d,%eax + addl %ecx,%esi + roll $30,%r11d + addl %eax,%esi + addl 0(%r8),%esi + addl 4(%r8),%edi + addl 8(%r8),%r11d + addl 12(%r8),%r12d + addl 16(%r8),%r13d + movl %esi,0(%r8) + movl %edi,4(%r8) + movl %r11d,8(%r8) + movl %r12d,12(%r8) + movl %r13d,16(%r8) + + subq $1,%r10 + leaq 64(%r9),%r9 + jnz L$loop + + movq 64(%rsp),%rsi + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$epilogue: + ret + + +.globl _sha1_block_data_order_hw +.private_extern _sha1_block_data_order_hw + +.p2align 5 +_sha1_block_data_order_hw: + +_CET_ENDBR + movdqu (%rdi),%xmm0 + movd 16(%rdi),%xmm1 + movdqa K_XX_XX+160(%rip),%xmm3 + + movdqu (%rsi),%xmm4 + pshufd $27,%xmm0,%xmm0 + movdqu 16(%rsi),%xmm5 + pshufd $27,%xmm1,%xmm1 + movdqu 32(%rsi),%xmm6 + pshufb %xmm3,%xmm4 + movdqu 48(%rsi),%xmm7 + pshufb %xmm3,%xmm5 + pshufb %xmm3,%xmm6 + movdqa %xmm1,%xmm9 + pshufb 
%xmm3,%xmm7 + jmp L$oop_shaext + +.p2align 4 +L$oop_shaext: + decq %rdx + leaq 64(%rsi),%r8 + paddd %xmm4,%xmm1 + cmovneq %r8,%rsi + prefetcht0 512(%rsi) + movdqa %xmm0,%xmm8 + sha1msg1 %xmm5,%xmm4 + movdqa %xmm0,%xmm2 + sha1rnds4 $0,%xmm1,%xmm0 + sha1nexte %xmm5,%xmm2 + pxor %xmm6,%xmm4 + sha1msg1 %xmm6,%xmm5 + sha1msg2 %xmm7,%xmm4 + + movdqa %xmm0,%xmm1 + sha1rnds4 $0,%xmm2,%xmm0 + sha1nexte %xmm6,%xmm1 + pxor %xmm7,%xmm5 + sha1msg2 %xmm4,%xmm5 + sha1msg1 %xmm7,%xmm6 + movdqa %xmm0,%xmm2 + sha1rnds4 $0,%xmm1,%xmm0 + sha1nexte %xmm7,%xmm2 + pxor %xmm4,%xmm6 + sha1msg1 %xmm4,%xmm7 + sha1msg2 %xmm5,%xmm6 + + movdqa %xmm0,%xmm1 + sha1rnds4 $0,%xmm2,%xmm0 + sha1nexte %xmm4,%xmm1 + pxor %xmm5,%xmm7 + sha1msg2 %xmm6,%xmm7 + sha1msg1 %xmm5,%xmm4 + movdqa %xmm0,%xmm2 + sha1rnds4 $0,%xmm1,%xmm0 + sha1nexte %xmm5,%xmm2 + pxor %xmm6,%xmm4 + sha1msg1 %xmm6,%xmm5 + sha1msg2 %xmm7,%xmm4 + + movdqa %xmm0,%xmm1 + sha1rnds4 $1,%xmm2,%xmm0 + sha1nexte %xmm6,%xmm1 + pxor %xmm7,%xmm5 + sha1msg2 %xmm4,%xmm5 + sha1msg1 %xmm7,%xmm6 + movdqa %xmm0,%xmm2 + sha1rnds4 $1,%xmm1,%xmm0 + sha1nexte %xmm7,%xmm2 + pxor %xmm4,%xmm6 + sha1msg1 %xmm4,%xmm7 + sha1msg2 %xmm5,%xmm6 + + movdqa %xmm0,%xmm1 + sha1rnds4 $1,%xmm2,%xmm0 + sha1nexte %xmm4,%xmm1 + pxor %xmm5,%xmm7 + sha1msg2 %xmm6,%xmm7 + sha1msg1 %xmm5,%xmm4 + movdqa %xmm0,%xmm2 + sha1rnds4 $1,%xmm1,%xmm0 + sha1nexte %xmm5,%xmm2 + pxor %xmm6,%xmm4 + sha1msg1 %xmm6,%xmm5 + sha1msg2 %xmm7,%xmm4 + + movdqa %xmm0,%xmm1 + sha1rnds4 $1,%xmm2,%xmm0 + sha1nexte %xmm6,%xmm1 + pxor %xmm7,%xmm5 + sha1msg2 %xmm4,%xmm5 + sha1msg1 %xmm7,%xmm6 + movdqa %xmm0,%xmm2 + sha1rnds4 $2,%xmm1,%xmm0 + sha1nexte %xmm7,%xmm2 + pxor %xmm4,%xmm6 + sha1msg1 %xmm4,%xmm7 + sha1msg2 %xmm5,%xmm6 + + movdqa %xmm0,%xmm1 + sha1rnds4 $2,%xmm2,%xmm0 + sha1nexte %xmm4,%xmm1 + pxor %xmm5,%xmm7 + sha1msg2 %xmm6,%xmm7 + sha1msg1 %xmm5,%xmm4 + movdqa %xmm0,%xmm2 + sha1rnds4 $2,%xmm1,%xmm0 + sha1nexte %xmm5,%xmm2 + pxor %xmm6,%xmm4 + sha1msg1 %xmm6,%xmm5 + sha1msg2 %xmm7,%xmm4 + + 
movdqa %xmm0,%xmm1 + sha1rnds4 $2,%xmm2,%xmm0 + sha1nexte %xmm6,%xmm1 + pxor %xmm7,%xmm5 + sha1msg2 %xmm4,%xmm5 + sha1msg1 %xmm7,%xmm6 + movdqa %xmm0,%xmm2 + sha1rnds4 $2,%xmm1,%xmm0 + sha1nexte %xmm7,%xmm2 + pxor %xmm4,%xmm6 + sha1msg1 %xmm4,%xmm7 + sha1msg2 %xmm5,%xmm6 + + movdqa %xmm0,%xmm1 + sha1rnds4 $3,%xmm2,%xmm0 + sha1nexte %xmm4,%xmm1 + pxor %xmm5,%xmm7 + sha1msg2 %xmm6,%xmm7 + movdqu (%rsi),%xmm4 + movdqa %xmm0,%xmm2 + sha1rnds4 $3,%xmm1,%xmm0 + sha1nexte %xmm5,%xmm2 + movdqu 16(%rsi),%xmm5 + pshufb %xmm3,%xmm4 + + movdqa %xmm0,%xmm1 + sha1rnds4 $3,%xmm2,%xmm0 + sha1nexte %xmm6,%xmm1 + movdqu 32(%rsi),%xmm6 + pshufb %xmm3,%xmm5 + + movdqa %xmm0,%xmm2 + sha1rnds4 $3,%xmm1,%xmm0 + sha1nexte %xmm7,%xmm2 + movdqu 48(%rsi),%xmm7 + pshufb %xmm3,%xmm6 + + movdqa %xmm0,%xmm1 + sha1rnds4 $3,%xmm2,%xmm0 + sha1nexte %xmm9,%xmm1 + pshufb %xmm3,%xmm7 + + paddd %xmm8,%xmm0 + movdqa %xmm1,%xmm9 + + jnz L$oop_shaext + + pshufd $27,%xmm0,%xmm0 + pshufd $27,%xmm1,%xmm1 + movdqu %xmm0,(%rdi) + movd %xmm1,16(%rdi) + ret + + +.globl _sha1_block_data_order_ssse3 +.private_extern _sha1_block_data_order_ssse3 + +.p2align 4 +_sha1_block_data_order_ssse3: + +_CET_ENDBR + movq %rsp,%r11 + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + leaq -64(%rsp),%rsp + andq $-64,%rsp + movq %rdi,%r8 + movq %rsi,%r9 + movq %rdx,%r10 + + shlq $6,%r10 + addq %r9,%r10 + leaq K_XX_XX+64(%rip),%r14 + + movl 0(%r8),%eax + movl 4(%r8),%ebx + movl 8(%r8),%ecx + movl 12(%r8),%edx + movl %ebx,%esi + movl 16(%r8),%ebp + movl %ecx,%edi + xorl %edx,%edi + andl %edi,%esi + + movdqa 64(%r14),%xmm6 + movdqa -64(%r14),%xmm9 + movdqu 0(%r9),%xmm0 + movdqu 16(%r9),%xmm1 + movdqu 32(%r9),%xmm2 + movdqu 48(%r9),%xmm3 + pshufb %xmm6,%xmm0 + pshufb %xmm6,%xmm1 + pshufb %xmm6,%xmm2 + addq $64,%r9 + paddd %xmm9,%xmm0 + pshufb %xmm6,%xmm3 + paddd %xmm9,%xmm1 + paddd %xmm9,%xmm2 + movdqa %xmm0,0(%rsp) + psubd %xmm9,%xmm0 + movdqa %xmm1,16(%rsp) + psubd %xmm9,%xmm1 + movdqa %xmm2,32(%rsp) + 
psubd %xmm9,%xmm2 + jmp L$oop_ssse3 +.p2align 4 +L$oop_ssse3: + rorl $2,%ebx + pshufd $238,%xmm0,%xmm4 + xorl %edx,%esi + movdqa %xmm3,%xmm8 + paddd %xmm3,%xmm9 + movl %eax,%edi + addl 0(%rsp),%ebp + punpcklqdq %xmm1,%xmm4 + xorl %ecx,%ebx + roll $5,%eax + addl %esi,%ebp + psrldq $4,%xmm8 + andl %ebx,%edi + xorl %ecx,%ebx + pxor %xmm0,%xmm4 + addl %eax,%ebp + rorl $7,%eax + pxor %xmm2,%xmm8 + xorl %ecx,%edi + movl %ebp,%esi + addl 4(%rsp),%edx + pxor %xmm8,%xmm4 + xorl %ebx,%eax + roll $5,%ebp + movdqa %xmm9,48(%rsp) + addl %edi,%edx + andl %eax,%esi + movdqa %xmm4,%xmm10 + xorl %ebx,%eax + addl %ebp,%edx + rorl $7,%ebp + movdqa %xmm4,%xmm8 + xorl %ebx,%esi + pslldq $12,%xmm10 + paddd %xmm4,%xmm4 + movl %edx,%edi + addl 8(%rsp),%ecx + psrld $31,%xmm8 + xorl %eax,%ebp + roll $5,%edx + addl %esi,%ecx + movdqa %xmm10,%xmm9 + andl %ebp,%edi + xorl %eax,%ebp + psrld $30,%xmm10 + addl %edx,%ecx + rorl $7,%edx + por %xmm8,%xmm4 + xorl %eax,%edi + movl %ecx,%esi + addl 12(%rsp),%ebx + pslld $2,%xmm9 + pxor %xmm10,%xmm4 + xorl %ebp,%edx + movdqa -64(%r14),%xmm10 + roll $5,%ecx + addl %edi,%ebx + andl %edx,%esi + pxor %xmm9,%xmm4 + xorl %ebp,%edx + addl %ecx,%ebx + rorl $7,%ecx + pshufd $238,%xmm1,%xmm5 + xorl %ebp,%esi + movdqa %xmm4,%xmm9 + paddd %xmm4,%xmm10 + movl %ebx,%edi + addl 16(%rsp),%eax + punpcklqdq %xmm2,%xmm5 + xorl %edx,%ecx + roll $5,%ebx + addl %esi,%eax + psrldq $4,%xmm9 + andl %ecx,%edi + xorl %edx,%ecx + pxor %xmm1,%xmm5 + addl %ebx,%eax + rorl $7,%ebx + pxor %xmm3,%xmm9 + xorl %edx,%edi + movl %eax,%esi + addl 20(%rsp),%ebp + pxor %xmm9,%xmm5 + xorl %ecx,%ebx + roll $5,%eax + movdqa %xmm10,0(%rsp) + addl %edi,%ebp + andl %ebx,%esi + movdqa %xmm5,%xmm8 + xorl %ecx,%ebx + addl %eax,%ebp + rorl $7,%eax + movdqa %xmm5,%xmm9 + xorl %ecx,%esi + pslldq $12,%xmm8 + paddd %xmm5,%xmm5 + movl %ebp,%edi + addl 24(%rsp),%edx + psrld $31,%xmm9 + xorl %ebx,%eax + roll $5,%ebp + addl %esi,%edx + movdqa %xmm8,%xmm10 + andl %eax,%edi + xorl %ebx,%eax + psrld $30,%xmm8 + 
addl %ebp,%edx + rorl $7,%ebp + por %xmm9,%xmm5 + xorl %ebx,%edi + movl %edx,%esi + addl 28(%rsp),%ecx + pslld $2,%xmm10 + pxor %xmm8,%xmm5 + xorl %eax,%ebp + movdqa -32(%r14),%xmm8 + roll $5,%edx + addl %edi,%ecx + andl %ebp,%esi + pxor %xmm10,%xmm5 + xorl %eax,%ebp + addl %edx,%ecx + rorl $7,%edx + pshufd $238,%xmm2,%xmm6 + xorl %eax,%esi + movdqa %xmm5,%xmm10 + paddd %xmm5,%xmm8 + movl %ecx,%edi + addl 32(%rsp),%ebx + punpcklqdq %xmm3,%xmm6 + xorl %ebp,%edx + roll $5,%ecx + addl %esi,%ebx + psrldq $4,%xmm10 + andl %edx,%edi + xorl %ebp,%edx + pxor %xmm2,%xmm6 + addl %ecx,%ebx + rorl $7,%ecx + pxor %xmm4,%xmm10 + xorl %ebp,%edi + movl %ebx,%esi + addl 36(%rsp),%eax + pxor %xmm10,%xmm6 + xorl %edx,%ecx + roll $5,%ebx + movdqa %xmm8,16(%rsp) + addl %edi,%eax + andl %ecx,%esi + movdqa %xmm6,%xmm9 + xorl %edx,%ecx + addl %ebx,%eax + rorl $7,%ebx + movdqa %xmm6,%xmm10 + xorl %edx,%esi + pslldq $12,%xmm9 + paddd %xmm6,%xmm6 + movl %eax,%edi + addl 40(%rsp),%ebp + psrld $31,%xmm10 + xorl %ecx,%ebx + roll $5,%eax + addl %esi,%ebp + movdqa %xmm9,%xmm8 + andl %ebx,%edi + xorl %ecx,%ebx + psrld $30,%xmm9 + addl %eax,%ebp + rorl $7,%eax + por %xmm10,%xmm6 + xorl %ecx,%edi + movl %ebp,%esi + addl 44(%rsp),%edx + pslld $2,%xmm8 + pxor %xmm9,%xmm6 + xorl %ebx,%eax + movdqa -32(%r14),%xmm9 + roll $5,%ebp + addl %edi,%edx + andl %eax,%esi + pxor %xmm8,%xmm6 + xorl %ebx,%eax + addl %ebp,%edx + rorl $7,%ebp + pshufd $238,%xmm3,%xmm7 + xorl %ebx,%esi + movdqa %xmm6,%xmm8 + paddd %xmm6,%xmm9 + movl %edx,%edi + addl 48(%rsp),%ecx + punpcklqdq %xmm4,%xmm7 + xorl %eax,%ebp + roll $5,%edx + addl %esi,%ecx + psrldq $4,%xmm8 + andl %ebp,%edi + xorl %eax,%ebp + pxor %xmm3,%xmm7 + addl %edx,%ecx + rorl $7,%edx + pxor %xmm5,%xmm8 + xorl %eax,%edi + movl %ecx,%esi + addl 52(%rsp),%ebx + pxor %xmm8,%xmm7 + xorl %ebp,%edx + roll $5,%ecx + movdqa %xmm9,32(%rsp) + addl %edi,%ebx + andl %edx,%esi + movdqa %xmm7,%xmm10 + xorl %ebp,%edx + addl %ecx,%ebx + rorl $7,%ecx + movdqa %xmm7,%xmm8 + xorl 
%ebp,%esi + pslldq $12,%xmm10 + paddd %xmm7,%xmm7 + movl %ebx,%edi + addl 56(%rsp),%eax + psrld $31,%xmm8 + xorl %edx,%ecx + roll $5,%ebx + addl %esi,%eax + movdqa %xmm10,%xmm9 + andl %ecx,%edi + xorl %edx,%ecx + psrld $30,%xmm10 + addl %ebx,%eax + rorl $7,%ebx + por %xmm8,%xmm7 + xorl %edx,%edi + movl %eax,%esi + addl 60(%rsp),%ebp + pslld $2,%xmm9 + pxor %xmm10,%xmm7 + xorl %ecx,%ebx + movdqa -32(%r14),%xmm10 + roll $5,%eax + addl %edi,%ebp + andl %ebx,%esi + pxor %xmm9,%xmm7 + pshufd $238,%xmm6,%xmm9 + xorl %ecx,%ebx + addl %eax,%ebp + rorl $7,%eax + pxor %xmm4,%xmm0 + xorl %ecx,%esi + movl %ebp,%edi + addl 0(%rsp),%edx + punpcklqdq %xmm7,%xmm9 + xorl %ebx,%eax + roll $5,%ebp + pxor %xmm1,%xmm0 + addl %esi,%edx + andl %eax,%edi + movdqa %xmm10,%xmm8 + xorl %ebx,%eax + paddd %xmm7,%xmm10 + addl %ebp,%edx + pxor %xmm9,%xmm0 + rorl $7,%ebp + xorl %ebx,%edi + movl %edx,%esi + addl 4(%rsp),%ecx + movdqa %xmm0,%xmm9 + xorl %eax,%ebp + roll $5,%edx + movdqa %xmm10,48(%rsp) + addl %edi,%ecx + andl %ebp,%esi + xorl %eax,%ebp + pslld $2,%xmm0 + addl %edx,%ecx + rorl $7,%edx + psrld $30,%xmm9 + xorl %eax,%esi + movl %ecx,%edi + addl 8(%rsp),%ebx + por %xmm9,%xmm0 + xorl %ebp,%edx + roll $5,%ecx + pshufd $238,%xmm7,%xmm10 + addl %esi,%ebx + andl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 12(%rsp),%eax + xorl %ebp,%edi + movl %ebx,%esi + roll $5,%ebx + addl %edi,%eax + xorl %edx,%esi + rorl $7,%ecx + addl %ebx,%eax + pxor %xmm5,%xmm1 + addl 16(%rsp),%ebp + xorl %ecx,%esi + punpcklqdq %xmm0,%xmm10 + movl %eax,%edi + roll $5,%eax + pxor %xmm2,%xmm1 + addl %esi,%ebp + xorl %ecx,%edi + movdqa %xmm8,%xmm9 + rorl $7,%ebx + paddd %xmm0,%xmm8 + addl %eax,%ebp + pxor %xmm10,%xmm1 + addl 20(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + roll $5,%ebp + movdqa %xmm1,%xmm10 + addl %edi,%edx + xorl %ebx,%esi + movdqa %xmm8,0(%rsp) + rorl $7,%eax + addl %ebp,%edx + addl 24(%rsp),%ecx + pslld $2,%xmm1 + xorl %eax,%esi + movl %edx,%edi + psrld $30,%xmm10 + roll $5,%edx + addl 
%esi,%ecx + xorl %eax,%edi + rorl $7,%ebp + por %xmm10,%xmm1 + addl %edx,%ecx + addl 28(%rsp),%ebx + pshufd $238,%xmm0,%xmm8 + xorl %ebp,%edi + movl %ecx,%esi + roll $5,%ecx + addl %edi,%ebx + xorl %ebp,%esi + rorl $7,%edx + addl %ecx,%ebx + pxor %xmm6,%xmm2 + addl 32(%rsp),%eax + xorl %edx,%esi + punpcklqdq %xmm1,%xmm8 + movl %ebx,%edi + roll $5,%ebx + pxor %xmm3,%xmm2 + addl %esi,%eax + xorl %edx,%edi + movdqa 0(%r14),%xmm10 + rorl $7,%ecx + paddd %xmm1,%xmm9 + addl %ebx,%eax + pxor %xmm8,%xmm2 + addl 36(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + roll $5,%eax + movdqa %xmm2,%xmm8 + addl %edi,%ebp + xorl %ecx,%esi + movdqa %xmm9,16(%rsp) + rorl $7,%ebx + addl %eax,%ebp + addl 40(%rsp),%edx + pslld $2,%xmm2 + xorl %ebx,%esi + movl %ebp,%edi + psrld $30,%xmm8 + roll $5,%ebp + addl %esi,%edx + xorl %ebx,%edi + rorl $7,%eax + por %xmm8,%xmm2 + addl %ebp,%edx + addl 44(%rsp),%ecx + pshufd $238,%xmm1,%xmm9 + xorl %eax,%edi + movl %edx,%esi + roll $5,%edx + addl %edi,%ecx + xorl %eax,%esi + rorl $7,%ebp + addl %edx,%ecx + pxor %xmm7,%xmm3 + addl 48(%rsp),%ebx + xorl %ebp,%esi + punpcklqdq %xmm2,%xmm9 + movl %ecx,%edi + roll $5,%ecx + pxor %xmm4,%xmm3 + addl %esi,%ebx + xorl %ebp,%edi + movdqa %xmm10,%xmm8 + rorl $7,%edx + paddd %xmm2,%xmm10 + addl %ecx,%ebx + pxor %xmm9,%xmm3 + addl 52(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + roll $5,%ebx + movdqa %xmm3,%xmm9 + addl %edi,%eax + xorl %edx,%esi + movdqa %xmm10,32(%rsp) + rorl $7,%ecx + addl %ebx,%eax + addl 56(%rsp),%ebp + pslld $2,%xmm3 + xorl %ecx,%esi + movl %eax,%edi + psrld $30,%xmm9 + roll $5,%eax + addl %esi,%ebp + xorl %ecx,%edi + rorl $7,%ebx + por %xmm9,%xmm3 + addl %eax,%ebp + addl 60(%rsp),%edx + pshufd $238,%xmm2,%xmm10 + xorl %ebx,%edi + movl %ebp,%esi + roll $5,%ebp + addl %edi,%edx + xorl %ebx,%esi + rorl $7,%eax + addl %ebp,%edx + pxor %xmm0,%xmm4 + addl 0(%rsp),%ecx + xorl %eax,%esi + punpcklqdq %xmm3,%xmm10 + movl %edx,%edi + roll $5,%edx + pxor %xmm5,%xmm4 + addl %esi,%ecx + xorl %eax,%edi + 
movdqa %xmm8,%xmm9 + rorl $7,%ebp + paddd %xmm3,%xmm8 + addl %edx,%ecx + pxor %xmm10,%xmm4 + addl 4(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + roll $5,%ecx + movdqa %xmm4,%xmm10 + addl %edi,%ebx + xorl %ebp,%esi + movdqa %xmm8,48(%rsp) + rorl $7,%edx + addl %ecx,%ebx + addl 8(%rsp),%eax + pslld $2,%xmm4 + xorl %edx,%esi + movl %ebx,%edi + psrld $30,%xmm10 + roll $5,%ebx + addl %esi,%eax + xorl %edx,%edi + rorl $7,%ecx + por %xmm10,%xmm4 + addl %ebx,%eax + addl 12(%rsp),%ebp + pshufd $238,%xmm3,%xmm8 + xorl %ecx,%edi + movl %eax,%esi + roll $5,%eax + addl %edi,%ebp + xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%ebp + pxor %xmm1,%xmm5 + addl 16(%rsp),%edx + xorl %ebx,%esi + punpcklqdq %xmm4,%xmm8 + movl %ebp,%edi + roll $5,%ebp + pxor %xmm6,%xmm5 + addl %esi,%edx + xorl %ebx,%edi + movdqa %xmm9,%xmm10 + rorl $7,%eax + paddd %xmm4,%xmm9 + addl %ebp,%edx + pxor %xmm8,%xmm5 + addl 20(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + roll $5,%edx + movdqa %xmm5,%xmm8 + addl %edi,%ecx + xorl %eax,%esi + movdqa %xmm9,0(%rsp) + rorl $7,%ebp + addl %edx,%ecx + addl 24(%rsp),%ebx + pslld $2,%xmm5 + xorl %ebp,%esi + movl %ecx,%edi + psrld $30,%xmm8 + roll $5,%ecx + addl %esi,%ebx + xorl %ebp,%edi + rorl $7,%edx + por %xmm8,%xmm5 + addl %ecx,%ebx + addl 28(%rsp),%eax + pshufd $238,%xmm4,%xmm9 + rorl $7,%ecx + movl %ebx,%esi + xorl %edx,%edi + roll $5,%ebx + addl %edi,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + pxor %xmm2,%xmm6 + addl 32(%rsp),%ebp + andl %ecx,%esi + xorl %edx,%ecx + rorl $7,%ebx + punpcklqdq %xmm5,%xmm9 + movl %eax,%edi + xorl %ecx,%esi + pxor %xmm7,%xmm6 + roll $5,%eax + addl %esi,%ebp + movdqa %xmm10,%xmm8 + xorl %ebx,%edi + paddd %xmm5,%xmm10 + xorl %ecx,%ebx + pxor %xmm9,%xmm6 + addl %eax,%ebp + addl 36(%rsp),%edx + andl %ebx,%edi + xorl %ecx,%ebx + rorl $7,%eax + movdqa %xmm6,%xmm9 + movl %ebp,%esi + xorl %ebx,%edi + movdqa %xmm10,16(%rsp) + roll $5,%ebp + addl %edi,%edx + xorl %eax,%esi + pslld $2,%xmm6 + xorl %ebx,%eax + addl %ebp,%edx + 
psrld $30,%xmm9 + addl 40(%rsp),%ecx + andl %eax,%esi + xorl %ebx,%eax + por %xmm9,%xmm6 + rorl $7,%ebp + movl %edx,%edi + xorl %eax,%esi + roll $5,%edx + pshufd $238,%xmm5,%xmm10 + addl %esi,%ecx + xorl %ebp,%edi + xorl %eax,%ebp + addl %edx,%ecx + addl 44(%rsp),%ebx + andl %ebp,%edi + xorl %eax,%ebp + rorl $7,%edx + movl %ecx,%esi + xorl %ebp,%edi + roll $5,%ecx + addl %edi,%ebx + xorl %edx,%esi + xorl %ebp,%edx + addl %ecx,%ebx + pxor %xmm3,%xmm7 + addl 48(%rsp),%eax + andl %edx,%esi + xorl %ebp,%edx + rorl $7,%ecx + punpcklqdq %xmm6,%xmm10 + movl %ebx,%edi + xorl %edx,%esi + pxor %xmm0,%xmm7 + roll $5,%ebx + addl %esi,%eax + movdqa 32(%r14),%xmm9 + xorl %ecx,%edi + paddd %xmm6,%xmm8 + xorl %edx,%ecx + pxor %xmm10,%xmm7 + addl %ebx,%eax + addl 52(%rsp),%ebp + andl %ecx,%edi + xorl %edx,%ecx + rorl $7,%ebx + movdqa %xmm7,%xmm10 + movl %eax,%esi + xorl %ecx,%edi + movdqa %xmm8,32(%rsp) + roll $5,%eax + addl %edi,%ebp + xorl %ebx,%esi + pslld $2,%xmm7 + xorl %ecx,%ebx + addl %eax,%ebp + psrld $30,%xmm10 + addl 56(%rsp),%edx + andl %ebx,%esi + xorl %ecx,%ebx + por %xmm10,%xmm7 + rorl $7,%eax + movl %ebp,%edi + xorl %ebx,%esi + roll $5,%ebp + pshufd $238,%xmm6,%xmm8 + addl %esi,%edx + xorl %eax,%edi + xorl %ebx,%eax + addl %ebp,%edx + addl 60(%rsp),%ecx + andl %eax,%edi + xorl %ebx,%eax + rorl $7,%ebp + movl %edx,%esi + xorl %eax,%edi + roll $5,%edx + addl %edi,%ecx + xorl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + pxor %xmm4,%xmm0 + addl 0(%rsp),%ebx + andl %ebp,%esi + xorl %eax,%ebp + rorl $7,%edx + punpcklqdq %xmm7,%xmm8 + movl %ecx,%edi + xorl %ebp,%esi + pxor %xmm1,%xmm0 + roll $5,%ecx + addl %esi,%ebx + movdqa %xmm9,%xmm10 + xorl %edx,%edi + paddd %xmm7,%xmm9 + xorl %ebp,%edx + pxor %xmm8,%xmm0 + addl %ecx,%ebx + addl 4(%rsp),%eax + andl %edx,%edi + xorl %ebp,%edx + rorl $7,%ecx + movdqa %xmm0,%xmm8 + movl %ebx,%esi + xorl %edx,%edi + movdqa %xmm9,48(%rsp) + roll $5,%ebx + addl %edi,%eax + xorl %ecx,%esi + pslld $2,%xmm0 + xorl %edx,%ecx + addl %ebx,%eax + 
psrld $30,%xmm8 + addl 8(%rsp),%ebp + andl %ecx,%esi + xorl %edx,%ecx + por %xmm8,%xmm0 + rorl $7,%ebx + movl %eax,%edi + xorl %ecx,%esi + roll $5,%eax + pshufd $238,%xmm7,%xmm9 + addl %esi,%ebp + xorl %ebx,%edi + xorl %ecx,%ebx + addl %eax,%ebp + addl 12(%rsp),%edx + andl %ebx,%edi + xorl %ecx,%ebx + rorl $7,%eax + movl %ebp,%esi + xorl %ebx,%edi + roll $5,%ebp + addl %edi,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %ebp,%edx + pxor %xmm5,%xmm1 + addl 16(%rsp),%ecx + andl %eax,%esi + xorl %ebx,%eax + rorl $7,%ebp + punpcklqdq %xmm0,%xmm9 + movl %edx,%edi + xorl %eax,%esi + pxor %xmm2,%xmm1 + roll $5,%edx + addl %esi,%ecx + movdqa %xmm10,%xmm8 + xorl %ebp,%edi + paddd %xmm0,%xmm10 + xorl %eax,%ebp + pxor %xmm9,%xmm1 + addl %edx,%ecx + addl 20(%rsp),%ebx + andl %ebp,%edi + xorl %eax,%ebp + rorl $7,%edx + movdqa %xmm1,%xmm9 + movl %ecx,%esi + xorl %ebp,%edi + movdqa %xmm10,0(%rsp) + roll $5,%ecx + addl %edi,%ebx + xorl %edx,%esi + pslld $2,%xmm1 + xorl %ebp,%edx + addl %ecx,%ebx + psrld $30,%xmm9 + addl 24(%rsp),%eax + andl %edx,%esi + xorl %ebp,%edx + por %xmm9,%xmm1 + rorl $7,%ecx + movl %ebx,%edi + xorl %edx,%esi + roll $5,%ebx + pshufd $238,%xmm0,%xmm10 + addl %esi,%eax + xorl %ecx,%edi + xorl %edx,%ecx + addl %ebx,%eax + addl 28(%rsp),%ebp + andl %ecx,%edi + xorl %edx,%ecx + rorl $7,%ebx + movl %eax,%esi + xorl %ecx,%edi + roll $5,%eax + addl %edi,%ebp + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%ebp + pxor %xmm6,%xmm2 + addl 32(%rsp),%edx + andl %ebx,%esi + xorl %ecx,%ebx + rorl $7,%eax + punpcklqdq %xmm1,%xmm10 + movl %ebp,%edi + xorl %ebx,%esi + pxor %xmm3,%xmm2 + roll $5,%ebp + addl %esi,%edx + movdqa %xmm8,%xmm9 + xorl %eax,%edi + paddd %xmm1,%xmm8 + xorl %ebx,%eax + pxor %xmm10,%xmm2 + addl %ebp,%edx + addl 36(%rsp),%ecx + andl %eax,%edi + xorl %ebx,%eax + rorl $7,%ebp + movdqa %xmm2,%xmm10 + movl %edx,%esi + xorl %eax,%edi + movdqa %xmm8,16(%rsp) + roll $5,%edx + addl %edi,%ecx + xorl %ebp,%esi + pslld $2,%xmm2 + xorl %eax,%ebp + addl %edx,%ecx + 
psrld $30,%xmm10 + addl 40(%rsp),%ebx + andl %ebp,%esi + xorl %eax,%ebp + por %xmm10,%xmm2 + rorl $7,%edx + movl %ecx,%edi + xorl %ebp,%esi + roll $5,%ecx + pshufd $238,%xmm1,%xmm8 + addl %esi,%ebx + xorl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 44(%rsp),%eax + andl %edx,%edi + xorl %ebp,%edx + rorl $7,%ecx + movl %ebx,%esi + xorl %edx,%edi + roll $5,%ebx + addl %edi,%eax + xorl %edx,%esi + addl %ebx,%eax + pxor %xmm7,%xmm3 + addl 48(%rsp),%ebp + xorl %ecx,%esi + punpcklqdq %xmm2,%xmm8 + movl %eax,%edi + roll $5,%eax + pxor %xmm4,%xmm3 + addl %esi,%ebp + xorl %ecx,%edi + movdqa %xmm9,%xmm10 + rorl $7,%ebx + paddd %xmm2,%xmm9 + addl %eax,%ebp + pxor %xmm8,%xmm3 + addl 52(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + roll $5,%ebp + movdqa %xmm3,%xmm8 + addl %edi,%edx + xorl %ebx,%esi + movdqa %xmm9,32(%rsp) + rorl $7,%eax + addl %ebp,%edx + addl 56(%rsp),%ecx + pslld $2,%xmm3 + xorl %eax,%esi + movl %edx,%edi + psrld $30,%xmm8 + roll $5,%edx + addl %esi,%ecx + xorl %eax,%edi + rorl $7,%ebp + por %xmm8,%xmm3 + addl %edx,%ecx + addl 60(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + roll $5,%ecx + addl %edi,%ebx + xorl %ebp,%esi + rorl $7,%edx + addl %ecx,%ebx + addl 0(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + roll $5,%ebx + paddd %xmm3,%xmm10 + addl %esi,%eax + xorl %edx,%edi + movdqa %xmm10,48(%rsp) + rorl $7,%ecx + addl %ebx,%eax + addl 4(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + roll $5,%eax + addl %edi,%ebp + xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%ebp + addl 8(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + roll $5,%ebp + addl %esi,%edx + xorl %ebx,%edi + rorl $7,%eax + addl %ebp,%edx + addl 12(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + roll $5,%edx + addl %edi,%ecx + xorl %eax,%esi + rorl $7,%ebp + addl %edx,%ecx + cmpq %r10,%r9 + je L$done_ssse3 + movdqa 64(%r14),%xmm6 + movdqa -64(%r14),%xmm9 + movdqu 0(%r9),%xmm0 + movdqu 16(%r9),%xmm1 + movdqu 32(%r9),%xmm2 + movdqu 48(%r9),%xmm3 + pshufb %xmm6,%xmm0 + addq $64,%r9 + addl 
16(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + pshufb %xmm6,%xmm1 + roll $5,%ecx + addl %esi,%ebx + xorl %ebp,%edi + rorl $7,%edx + paddd %xmm9,%xmm0 + addl %ecx,%ebx + addl 20(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + movdqa %xmm0,0(%rsp) + roll $5,%ebx + addl %edi,%eax + xorl %edx,%esi + rorl $7,%ecx + psubd %xmm9,%xmm0 + addl %ebx,%eax + addl 24(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + roll $5,%eax + addl %esi,%ebp + xorl %ecx,%edi + rorl $7,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + roll $5,%ebp + addl %edi,%edx + xorl %ebx,%esi + rorl $7,%eax + addl %ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + pshufb %xmm6,%xmm2 + roll $5,%edx + addl %esi,%ecx + xorl %eax,%edi + rorl $7,%ebp + paddd %xmm9,%xmm1 + addl %edx,%ecx + addl 36(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + movdqa %xmm1,16(%rsp) + roll $5,%ecx + addl %edi,%ebx + xorl %ebp,%esi + rorl $7,%edx + psubd %xmm9,%xmm1 + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + roll $5,%ebx + addl %esi,%eax + xorl %edx,%edi + rorl $7,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + roll $5,%eax + addl %edi,%ebp + xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + pshufb %xmm6,%xmm3 + roll $5,%ebp + addl %esi,%edx + xorl %ebx,%edi + rorl $7,%eax + paddd %xmm9,%xmm2 + addl %ebp,%edx + addl 52(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + movdqa %xmm2,32(%rsp) + roll $5,%edx + addl %edi,%ecx + xorl %eax,%esi + rorl $7,%ebp + psubd %xmm9,%xmm2 + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + roll $5,%ecx + addl %esi,%ebx + xorl %ebp,%edi + rorl $7,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + roll $5,%ebx + addl %edi,%eax + rorl $7,%ecx + addl %ebx,%eax + addl 0(%r8),%eax + addl 4(%r8),%esi + addl 8(%r8),%ecx + addl 12(%r8),%edx + movl %eax,0(%r8) + addl 16(%r8),%ebp + movl %esi,4(%r8) + 
movl %esi,%ebx + movl %ecx,8(%r8) + movl %ecx,%edi + movl %edx,12(%r8) + xorl %edx,%edi + movl %ebp,16(%r8) + andl %edi,%esi + jmp L$oop_ssse3 + +.p2align 4 +L$done_ssse3: + addl 16(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + roll $5,%ecx + addl %esi,%ebx + xorl %ebp,%edi + rorl $7,%edx + addl %ecx,%ebx + addl 20(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + roll $5,%ebx + addl %edi,%eax + xorl %edx,%esi + rorl $7,%ecx + addl %ebx,%eax + addl 24(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + roll $5,%eax + addl %esi,%ebp + xorl %ecx,%edi + rorl $7,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + roll $5,%ebp + addl %edi,%edx + xorl %ebx,%esi + rorl $7,%eax + addl %ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + roll $5,%edx + addl %esi,%ecx + xorl %eax,%edi + rorl $7,%ebp + addl %edx,%ecx + addl 36(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + roll $5,%ecx + addl %edi,%ebx + xorl %ebp,%esi + rorl $7,%edx + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + roll $5,%ebx + addl %esi,%eax + xorl %edx,%edi + rorl $7,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + roll $5,%eax + addl %edi,%ebp + xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + roll $5,%ebp + addl %esi,%edx + xorl %ebx,%edi + rorl $7,%eax + addl %ebp,%edx + addl 52(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + roll $5,%edx + addl %edi,%ecx + xorl %eax,%esi + rorl $7,%ebp + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + roll $5,%ecx + addl %esi,%ebx + xorl %ebp,%edi + rorl $7,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + roll $5,%ebx + addl %edi,%eax + rorl $7,%ecx + addl %ebx,%eax + addl 0(%r8),%eax + addl 4(%r8),%esi + addl 8(%r8),%ecx + movl %eax,0(%r8) + addl 12(%r8),%edx + movl %esi,4(%r8) + addl 16(%r8),%ebp + movl %ecx,8(%r8) + movl %edx,12(%r8) + movl %ebp,16(%r8) + movq 
-40(%r11),%r14 + + movq -32(%r11),%r13 + + movq -24(%r11),%r12 + + movq -16(%r11),%rbp + + movq -8(%r11),%rbx + + leaq (%r11),%rsp + +L$epilogue_ssse3: + ret + + +.globl _sha1_block_data_order_avx +.private_extern _sha1_block_data_order_avx + +.p2align 4 +_sha1_block_data_order_avx: + +_CET_ENDBR + movq %rsp,%r11 + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + leaq -64(%rsp),%rsp + vzeroupper + andq $-64,%rsp + movq %rdi,%r8 + movq %rsi,%r9 + movq %rdx,%r10 + + shlq $6,%r10 + addq %r9,%r10 + leaq K_XX_XX+64(%rip),%r14 + + movl 0(%r8),%eax + movl 4(%r8),%ebx + movl 8(%r8),%ecx + movl 12(%r8),%edx + movl %ebx,%esi + movl 16(%r8),%ebp + movl %ecx,%edi + xorl %edx,%edi + andl %edi,%esi + + vmovdqa 64(%r14),%xmm6 + vmovdqa -64(%r14),%xmm11 + vmovdqu 0(%r9),%xmm0 + vmovdqu 16(%r9),%xmm1 + vmovdqu 32(%r9),%xmm2 + vmovdqu 48(%r9),%xmm3 + vpshufb %xmm6,%xmm0,%xmm0 + addq $64,%r9 + vpshufb %xmm6,%xmm1,%xmm1 + vpshufb %xmm6,%xmm2,%xmm2 + vpshufb %xmm6,%xmm3,%xmm3 + vpaddd %xmm11,%xmm0,%xmm4 + vpaddd %xmm11,%xmm1,%xmm5 + vpaddd %xmm11,%xmm2,%xmm6 + vmovdqa %xmm4,0(%rsp) + vmovdqa %xmm5,16(%rsp) + vmovdqa %xmm6,32(%rsp) + jmp L$oop_avx +.p2align 4 +L$oop_avx: + shrdl $2,%ebx,%ebx + xorl %edx,%esi + vpalignr $8,%xmm0,%xmm1,%xmm4 + movl %eax,%edi + addl 0(%rsp),%ebp + vpaddd %xmm3,%xmm11,%xmm9 + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrldq $4,%xmm3,%xmm8 + addl %esi,%ebp + andl %ebx,%edi + vpxor %xmm0,%xmm4,%xmm4 + xorl %ecx,%ebx + addl %eax,%ebp + vpxor %xmm2,%xmm8,%xmm8 + shrdl $7,%eax,%eax + xorl %ecx,%edi + movl %ebp,%esi + addl 4(%rsp),%edx + vpxor %xmm8,%xmm4,%xmm4 + xorl %ebx,%eax + shldl $5,%ebp,%ebp + vmovdqa %xmm9,48(%rsp) + addl %edi,%edx + andl %eax,%esi + vpsrld $31,%xmm4,%xmm8 + xorl %ebx,%eax + addl %ebp,%edx + shrdl $7,%ebp,%ebp + xorl %ebx,%esi + vpslldq $12,%xmm4,%xmm10 + vpaddd %xmm4,%xmm4,%xmm4 + movl %edx,%edi + addl 8(%rsp),%ecx + xorl %eax,%ebp + shldl $5,%edx,%edx + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm4,%xmm4 + addl 
%esi,%ecx + andl %ebp,%edi + xorl %eax,%ebp + addl %edx,%ecx + vpslld $2,%xmm10,%xmm10 + vpxor %xmm9,%xmm4,%xmm4 + shrdl $7,%edx,%edx + xorl %eax,%edi + movl %ecx,%esi + addl 12(%rsp),%ebx + vpxor %xmm10,%xmm4,%xmm4 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + addl %edi,%ebx + andl %edx,%esi + xorl %ebp,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %ebp,%esi + vpalignr $8,%xmm1,%xmm2,%xmm5 + movl %ebx,%edi + addl 16(%rsp),%eax + vpaddd %xmm4,%xmm11,%xmm9 + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrldq $4,%xmm4,%xmm8 + addl %esi,%eax + andl %ecx,%edi + vpxor %xmm1,%xmm5,%xmm5 + xorl %edx,%ecx + addl %ebx,%eax + vpxor %xmm3,%xmm8,%xmm8 + shrdl $7,%ebx,%ebx + xorl %edx,%edi + movl %eax,%esi + addl 20(%rsp),%ebp + vpxor %xmm8,%xmm5,%xmm5 + xorl %ecx,%ebx + shldl $5,%eax,%eax + vmovdqa %xmm9,0(%rsp) + addl %edi,%ebp + andl %ebx,%esi + vpsrld $31,%xmm5,%xmm8 + xorl %ecx,%ebx + addl %eax,%ebp + shrdl $7,%eax,%eax + xorl %ecx,%esi + vpslldq $12,%xmm5,%xmm10 + vpaddd %xmm5,%xmm5,%xmm5 + movl %ebp,%edi + addl 24(%rsp),%edx + xorl %ebx,%eax + shldl $5,%ebp,%ebp + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm5,%xmm5 + addl %esi,%edx + andl %eax,%edi + xorl %ebx,%eax + addl %ebp,%edx + vpslld $2,%xmm10,%xmm10 + vpxor %xmm9,%xmm5,%xmm5 + shrdl $7,%ebp,%ebp + xorl %ebx,%edi + movl %edx,%esi + addl 28(%rsp),%ecx + vpxor %xmm10,%xmm5,%xmm5 + xorl %eax,%ebp + shldl $5,%edx,%edx + vmovdqa -32(%r14),%xmm11 + addl %edi,%ecx + andl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + vpalignr $8,%xmm2,%xmm3,%xmm6 + movl %ecx,%edi + addl 32(%rsp),%ebx + vpaddd %xmm5,%xmm11,%xmm9 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + vpsrldq $4,%xmm5,%xmm8 + addl %esi,%ebx + andl %edx,%edi + vpxor %xmm2,%xmm6,%xmm6 + xorl %ebp,%edx + addl %ecx,%ebx + vpxor %xmm4,%xmm8,%xmm8 + shrdl $7,%ecx,%ecx + xorl %ebp,%edi + movl %ebx,%esi + addl 36(%rsp),%eax + vpxor %xmm8,%xmm6,%xmm6 + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vmovdqa %xmm9,16(%rsp) + addl %edi,%eax + andl %ecx,%esi + vpsrld 
$31,%xmm6,%xmm8 + xorl %edx,%ecx + addl %ebx,%eax + shrdl $7,%ebx,%ebx + xorl %edx,%esi + vpslldq $12,%xmm6,%xmm10 + vpaddd %xmm6,%xmm6,%xmm6 + movl %eax,%edi + addl 40(%rsp),%ebp + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm6,%xmm6 + addl %esi,%ebp + andl %ebx,%edi + xorl %ecx,%ebx + addl %eax,%ebp + vpslld $2,%xmm10,%xmm10 + vpxor %xmm9,%xmm6,%xmm6 + shrdl $7,%eax,%eax + xorl %ecx,%edi + movl %ebp,%esi + addl 44(%rsp),%edx + vpxor %xmm10,%xmm6,%xmm6 + xorl %ebx,%eax + shldl $5,%ebp,%ebp + addl %edi,%edx + andl %eax,%esi + xorl %ebx,%eax + addl %ebp,%edx + shrdl $7,%ebp,%ebp + xorl %ebx,%esi + vpalignr $8,%xmm3,%xmm4,%xmm7 + movl %edx,%edi + addl 48(%rsp),%ecx + vpaddd %xmm6,%xmm11,%xmm9 + xorl %eax,%ebp + shldl $5,%edx,%edx + vpsrldq $4,%xmm6,%xmm8 + addl %esi,%ecx + andl %ebp,%edi + vpxor %xmm3,%xmm7,%xmm7 + xorl %eax,%ebp + addl %edx,%ecx + vpxor %xmm5,%xmm8,%xmm8 + shrdl $7,%edx,%edx + xorl %eax,%edi + movl %ecx,%esi + addl 52(%rsp),%ebx + vpxor %xmm8,%xmm7,%xmm7 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + vmovdqa %xmm9,32(%rsp) + addl %edi,%ebx + andl %edx,%esi + vpsrld $31,%xmm7,%xmm8 + xorl %ebp,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %ebp,%esi + vpslldq $12,%xmm7,%xmm10 + vpaddd %xmm7,%xmm7,%xmm7 + movl %ebx,%edi + addl 56(%rsp),%eax + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm7,%xmm7 + addl %esi,%eax + andl %ecx,%edi + xorl %edx,%ecx + addl %ebx,%eax + vpslld $2,%xmm10,%xmm10 + vpxor %xmm9,%xmm7,%xmm7 + shrdl $7,%ebx,%ebx + xorl %edx,%edi + movl %eax,%esi + addl 60(%rsp),%ebp + vpxor %xmm10,%xmm7,%xmm7 + xorl %ecx,%ebx + shldl $5,%eax,%eax + addl %edi,%ebp + andl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%ebp + vpalignr $8,%xmm6,%xmm7,%xmm8 + vpxor %xmm4,%xmm0,%xmm0 + shrdl $7,%eax,%eax + xorl %ecx,%esi + movl %ebp,%edi + addl 0(%rsp),%edx + vpxor %xmm1,%xmm0,%xmm0 + xorl %ebx,%eax + shldl $5,%ebp,%ebp + vpaddd %xmm7,%xmm11,%xmm9 + addl %esi,%edx + andl %eax,%edi + vpxor 
%xmm8,%xmm0,%xmm0 + xorl %ebx,%eax + addl %ebp,%edx + shrdl $7,%ebp,%ebp + xorl %ebx,%edi + vpsrld $30,%xmm0,%xmm8 + vmovdqa %xmm9,48(%rsp) + movl %edx,%esi + addl 4(%rsp),%ecx + xorl %eax,%ebp + shldl $5,%edx,%edx + vpslld $2,%xmm0,%xmm0 + addl %edi,%ecx + andl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + movl %ecx,%edi + addl 8(%rsp),%ebx + vpor %xmm8,%xmm0,%xmm0 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + addl %esi,%ebx + andl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 12(%rsp),%eax + xorl %ebp,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm7,%xmm0,%xmm8 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + vpxor %xmm2,%xmm1,%xmm1 + addl %esi,%ebp + xorl %ecx,%edi + vpaddd %xmm0,%xmm11,%xmm9 + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpxor %xmm8,%xmm1,%xmm1 + addl 20(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + vpsrld $30,%xmm1,%xmm8 + vmovdqa %xmm9,0(%rsp) + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpslld $2,%xmm1,%xmm1 + addl 24(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpor %xmm8,%xmm1,%xmm1 + addl 28(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpalignr $8,%xmm0,%xmm1,%xmm8 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + vpxor %xmm3,%xmm2,%xmm2 + addl %esi,%eax + xorl %edx,%edi + vpaddd %xmm1,%xmm11,%xmm9 + vmovdqa 0(%r14),%xmm11 + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpxor %xmm8,%xmm2,%xmm2 + addl 36(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + vpsrld $30,%xmm2,%xmm8 + vmovdqa %xmm9,16(%rsp) + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + 
vpslld $2,%xmm2,%xmm2 + addl 40(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpor %xmm8,%xmm2,%xmm2 + addl 44(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpalignr $8,%xmm1,%xmm2,%xmm8 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + vpxor %xmm4,%xmm3,%xmm3 + addl %esi,%ebx + xorl %ebp,%edi + vpaddd %xmm2,%xmm11,%xmm9 + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpxor %xmm8,%xmm3,%xmm3 + addl 52(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + vpsrld $30,%xmm3,%xmm8 + vmovdqa %xmm9,32(%rsp) + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpslld $2,%xmm3,%xmm3 + addl 56(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ecx,%edi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpor %xmm8,%xmm3,%xmm3 + addl 60(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpalignr $8,%xmm2,%xmm3,%xmm8 + vpxor %xmm0,%xmm4,%xmm4 + addl 0(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + vpxor %xmm5,%xmm4,%xmm4 + addl %esi,%ecx + xorl %eax,%edi + vpaddd %xmm3,%xmm11,%xmm9 + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpxor %xmm8,%xmm4,%xmm4 + addl 4(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + vpsrld $30,%xmm4,%xmm8 + vmovdqa %xmm9,48(%rsp) + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpslld $2,%xmm4,%xmm4 + addl 8(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpor %xmm8,%xmm4,%xmm4 + addl 12(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + 
vpalignr $8,%xmm3,%xmm4,%xmm8 + vpxor %xmm1,%xmm5,%xmm5 + addl 16(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + vpxor %xmm6,%xmm5,%xmm5 + addl %esi,%edx + xorl %ebx,%edi + vpaddd %xmm4,%xmm11,%xmm9 + shrdl $7,%eax,%eax + addl %ebp,%edx + vpxor %xmm8,%xmm5,%xmm5 + addl 20(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + vpsrld $30,%xmm5,%xmm8 + vmovdqa %xmm9,0(%rsp) + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpslld $2,%xmm5,%xmm5 + addl 24(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpor %xmm8,%xmm5,%xmm5 + addl 28(%rsp),%eax + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%edi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm4,%xmm5,%xmm8 + vpxor %xmm2,%xmm6,%xmm6 + addl 32(%rsp),%ebp + andl %ecx,%esi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + vpxor %xmm7,%xmm6,%xmm6 + movl %eax,%edi + xorl %ecx,%esi + vpaddd %xmm5,%xmm11,%xmm9 + shldl $5,%eax,%eax + addl %esi,%ebp + vpxor %xmm8,%xmm6,%xmm6 + xorl %ebx,%edi + xorl %ecx,%ebx + addl %eax,%ebp + addl 36(%rsp),%edx + vpsrld $30,%xmm6,%xmm8 + vmovdqa %xmm9,16(%rsp) + andl %ebx,%edi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %ebp,%esi + vpslld $2,%xmm6,%xmm6 + xorl %ebx,%edi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %ebp,%edx + addl 40(%rsp),%ecx + andl %eax,%esi + vpor %xmm8,%xmm6,%xmm6 + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + movl %edx,%edi + xorl %eax,%esi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %ebp,%edi + xorl %eax,%ebp + addl %edx,%ecx + addl 44(%rsp),%ebx + andl %ebp,%edi + xorl %eax,%ebp + shrdl $7,%edx,%edx + movl %ecx,%esi + xorl %ebp,%edi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %edx,%esi + xorl %ebp,%edx + addl %ecx,%ebx + vpalignr $8,%xmm5,%xmm6,%xmm8 + vpxor %xmm3,%xmm7,%xmm7 + addl 48(%rsp),%eax + andl %edx,%esi + xorl %ebp,%edx + shrdl 
$7,%ecx,%ecx + vpxor %xmm0,%xmm7,%xmm7 + movl %ebx,%edi + xorl %edx,%esi + vpaddd %xmm6,%xmm11,%xmm9 + vmovdqa 32(%r14),%xmm11 + shldl $5,%ebx,%ebx + addl %esi,%eax + vpxor %xmm8,%xmm7,%xmm7 + xorl %ecx,%edi + xorl %edx,%ecx + addl %ebx,%eax + addl 52(%rsp),%ebp + vpsrld $30,%xmm7,%xmm8 + vmovdqa %xmm9,32(%rsp) + andl %ecx,%edi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + vpslld $2,%xmm7,%xmm7 + xorl %ecx,%edi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%ebp + addl 56(%rsp),%edx + andl %ebx,%esi + vpor %xmm8,%xmm7,%xmm7 + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %ebp,%edi + xorl %ebx,%esi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %eax,%edi + xorl %ebx,%eax + addl %ebp,%edx + addl 60(%rsp),%ecx + andl %eax,%edi + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + movl %edx,%esi + xorl %eax,%edi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + vpalignr $8,%xmm6,%xmm7,%xmm8 + vpxor %xmm4,%xmm0,%xmm0 + addl 0(%rsp),%ebx + andl %ebp,%esi + xorl %eax,%ebp + shrdl $7,%edx,%edx + vpxor %xmm1,%xmm0,%xmm0 + movl %ecx,%edi + xorl %ebp,%esi + vpaddd %xmm7,%xmm11,%xmm9 + shldl $5,%ecx,%ecx + addl %esi,%ebx + vpxor %xmm8,%xmm0,%xmm0 + xorl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 4(%rsp),%eax + vpsrld $30,%xmm0,%xmm8 + vmovdqa %xmm9,48(%rsp) + andl %edx,%edi + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + vpslld $2,%xmm0,%xmm0 + xorl %edx,%edi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + addl 8(%rsp),%ebp + andl %ecx,%esi + vpor %xmm8,%xmm0,%xmm0 + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%edi + xorl %ecx,%esi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ebx,%edi + xorl %ecx,%ebx + addl %eax,%ebp + addl 12(%rsp),%edx + andl %ebx,%edi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %ebp,%esi + xorl %ebx,%edi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %ebp,%edx + vpalignr $8,%xmm7,%xmm0,%xmm8 
+ vpxor %xmm5,%xmm1,%xmm1 + addl 16(%rsp),%ecx + andl %eax,%esi + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + vpxor %xmm2,%xmm1,%xmm1 + movl %edx,%edi + xorl %eax,%esi + vpaddd %xmm0,%xmm11,%xmm9 + shldl $5,%edx,%edx + addl %esi,%ecx + vpxor %xmm8,%xmm1,%xmm1 + xorl %ebp,%edi + xorl %eax,%ebp + addl %edx,%ecx + addl 20(%rsp),%ebx + vpsrld $30,%xmm1,%xmm8 + vmovdqa %xmm9,0(%rsp) + andl %ebp,%edi + xorl %eax,%ebp + shrdl $7,%edx,%edx + movl %ecx,%esi + vpslld $2,%xmm1,%xmm1 + xorl %ebp,%edi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %edx,%esi + xorl %ebp,%edx + addl %ecx,%ebx + addl 24(%rsp),%eax + andl %edx,%esi + vpor %xmm8,%xmm1,%xmm1 + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%edi + xorl %edx,%esi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %ecx,%edi + xorl %edx,%ecx + addl %ebx,%eax + addl 28(%rsp),%ebp + andl %ecx,%edi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + xorl %ecx,%edi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%ebp + vpalignr $8,%xmm0,%xmm1,%xmm8 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%rsp),%edx + andl %ebx,%esi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + vpxor %xmm3,%xmm2,%xmm2 + movl %ebp,%edi + xorl %ebx,%esi + vpaddd %xmm1,%xmm11,%xmm9 + shldl $5,%ebp,%ebp + addl %esi,%edx + vpxor %xmm8,%xmm2,%xmm2 + xorl %eax,%edi + xorl %ebx,%eax + addl %ebp,%edx + addl 36(%rsp),%ecx + vpsrld $30,%xmm2,%xmm8 + vmovdqa %xmm9,16(%rsp) + andl %eax,%edi + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + movl %edx,%esi + vpslld $2,%xmm2,%xmm2 + xorl %eax,%edi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + addl 40(%rsp),%ebx + andl %ebp,%esi + vpor %xmm8,%xmm2,%xmm2 + xorl %eax,%ebp + shrdl $7,%edx,%edx + movl %ecx,%edi + xorl %ebp,%esi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 44(%rsp),%eax + andl %edx,%edi + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%edi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi 
+ addl %ebx,%eax + vpalignr $8,%xmm1,%xmm2,%xmm8 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + vpxor %xmm4,%xmm3,%xmm3 + addl %esi,%ebp + xorl %ecx,%edi + vpaddd %xmm2,%xmm11,%xmm9 + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpxor %xmm8,%xmm3,%xmm3 + addl 52(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + vpsrld $30,%xmm3,%xmm8 + vmovdqa %xmm9,32(%rsp) + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpslld $2,%xmm3,%xmm3 + addl 56(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpor %xmm8,%xmm3,%xmm3 + addl 60(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 0(%rsp),%eax + vpaddd %xmm3,%xmm11,%xmm9 + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + vmovdqa %xmm9,48(%rsp) + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 4(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 8(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 12(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + cmpq %r10,%r9 + je L$done_avx + vmovdqa 64(%r14),%xmm6 + vmovdqa -64(%r14),%xmm11 + vmovdqu 0(%r9),%xmm0 + vmovdqu 16(%r9),%xmm1 + vmovdqu 32(%r9),%xmm2 + vmovdqu 48(%r9),%xmm3 + vpshufb %xmm6,%xmm0,%xmm0 + addq $64,%r9 + addl 16(%rsp),%ebx + xorl %ebp,%esi + vpshufb %xmm6,%xmm1,%xmm1 + movl %ecx,%edi + shldl $5,%ecx,%ecx + vpaddd %xmm11,%xmm0,%xmm4 + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vmovdqa %xmm4,0(%rsp) + addl 20(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + 
shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ecx,%edi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi + vpshufb %xmm6,%xmm2,%xmm2 + movl %edx,%edi + shldl $5,%edx,%edx + vpaddd %xmm11,%xmm1,%xmm5 + addl %esi,%ecx + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vmovdqa %xmm5,16(%rsp) + addl 36(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi + vpshufb %xmm6,%xmm3,%xmm3 + movl %ebp,%edi + shldl $5,%ebp,%ebp + vpaddd %xmm11,%xmm2,%xmm6 + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + vmovdqa %xmm6,32(%rsp) + addl 52(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 0(%r8),%eax + addl 4(%r8),%esi + addl 8(%r8),%ecx + addl 12(%r8),%edx + movl %eax,0(%r8) + addl 16(%r8),%ebp + movl %esi,4(%r8) + movl %esi,%ebx + movl %ecx,8(%r8) + movl %ecx,%edi + movl %edx,12(%r8) + xorl %edx,%edi + movl %ebp,16(%r8) + andl %edi,%esi + jmp L$oop_avx + 
+.p2align 4 +L$done_avx: + addl 16(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 20(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ecx,%edi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + addl 36(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 52(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vzeroupper + + addl 0(%r8),%eax + addl 4(%r8),%esi + addl 8(%r8),%ecx + movl %eax,0(%r8) + addl 12(%r8),%edx + movl %esi,4(%r8) + addl 16(%r8),%ebp + movl %ecx,8(%r8) + movl %edx,12(%r8) + movl %ebp,16(%r8) + movq 
-40(%r11),%r14 + + movq -32(%r11),%r13 + + movq -24(%r11),%r12 + + movq -16(%r11),%rbp + + movq -8(%r11),%rbx + + leaq (%r11),%rsp + +L$epilogue_avx: + ret + + +.globl _sha1_block_data_order_avx2 +.private_extern _sha1_block_data_order_avx2 + +.p2align 4 +_sha1_block_data_order_avx2: + +_CET_ENDBR + movq %rsp,%r11 + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + vzeroupper + movq %rdi,%r8 + movq %rsi,%r9 + movq %rdx,%r10 + + leaq -640(%rsp),%rsp + shlq $6,%r10 + leaq 64(%r9),%r13 + andq $-128,%rsp + addq %r9,%r10 + leaq K_XX_XX+64(%rip),%r14 + + movl 0(%r8),%eax + cmpq %r10,%r13 + cmovaeq %r9,%r13 + movl 4(%r8),%ebp + movl 8(%r8),%ecx + movl 12(%r8),%edx + movl 16(%r8),%esi + vmovdqu 64(%r14),%ymm6 + + vmovdqu (%r9),%xmm0 + vmovdqu 16(%r9),%xmm1 + vmovdqu 32(%r9),%xmm2 + vmovdqu 48(%r9),%xmm3 + leaq 64(%r9),%r9 + vinserti128 $1,(%r13),%ymm0,%ymm0 + vinserti128 $1,16(%r13),%ymm1,%ymm1 + vpshufb %ymm6,%ymm0,%ymm0 + vinserti128 $1,32(%r13),%ymm2,%ymm2 + vpshufb %ymm6,%ymm1,%ymm1 + vinserti128 $1,48(%r13),%ymm3,%ymm3 + vpshufb %ymm6,%ymm2,%ymm2 + vmovdqu -64(%r14),%ymm11 + vpshufb %ymm6,%ymm3,%ymm3 + + vpaddd %ymm11,%ymm0,%ymm4 + vpaddd %ymm11,%ymm1,%ymm5 + vmovdqu %ymm4,0(%rsp) + vpaddd %ymm11,%ymm2,%ymm6 + vmovdqu %ymm5,32(%rsp) + vpaddd %ymm11,%ymm3,%ymm7 + vmovdqu %ymm6,64(%rsp) + vmovdqu %ymm7,96(%rsp) + vpalignr $8,%ymm0,%ymm1,%ymm4 + vpsrldq $4,%ymm3,%ymm8 + vpxor %ymm0,%ymm4,%ymm4 + vpxor %ymm2,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $31,%ymm4,%ymm8 + vpslldq $12,%ymm4,%ymm10 + vpaddd %ymm4,%ymm4,%ymm4 + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm4,%ymm4 + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm4,%ymm4 + vpxor %ymm10,%ymm4,%ymm4 + vpaddd %ymm11,%ymm4,%ymm9 + vmovdqu %ymm9,128(%rsp) + vpalignr $8,%ymm1,%ymm2,%ymm5 + vpsrldq $4,%ymm4,%ymm8 + vpxor %ymm1,%ymm5,%ymm5 + vpxor %ymm3,%ymm8,%ymm8 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $31,%ymm5,%ymm8 + vmovdqu -32(%r14),%ymm11 + vpslldq $12,%ymm5,%ymm10 + vpaddd %ymm5,%ymm5,%ymm5 + 
vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm5,%ymm5 + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm10,%ymm5,%ymm5 + vpaddd %ymm11,%ymm5,%ymm9 + vmovdqu %ymm9,160(%rsp) + vpalignr $8,%ymm2,%ymm3,%ymm6 + vpsrldq $4,%ymm5,%ymm8 + vpxor %ymm2,%ymm6,%ymm6 + vpxor %ymm4,%ymm8,%ymm8 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $31,%ymm6,%ymm8 + vpslldq $12,%ymm6,%ymm10 + vpaddd %ymm6,%ymm6,%ymm6 + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm6,%ymm6 + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm6,%ymm6 + vpxor %ymm10,%ymm6,%ymm6 + vpaddd %ymm11,%ymm6,%ymm9 + vmovdqu %ymm9,192(%rsp) + vpalignr $8,%ymm3,%ymm4,%ymm7 + vpsrldq $4,%ymm6,%ymm8 + vpxor %ymm3,%ymm7,%ymm7 + vpxor %ymm5,%ymm8,%ymm8 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm7,%ymm8 + vpslldq $12,%ymm7,%ymm10 + vpaddd %ymm7,%ymm7,%ymm7 + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm7,%ymm7 + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm7,%ymm7 + vpxor %ymm10,%ymm7,%ymm7 + vpaddd %ymm11,%ymm7,%ymm9 + vmovdqu %ymm9,224(%rsp) + leaq 128(%rsp),%r13 + jmp L$oop_avx2 +.p2align 5 +L$oop_avx2: + rorxl $2,%ebp,%ebx + andnl %edx,%ebp,%edi + andl %ecx,%ebp + xorl %edi,%ebp + jmp L$align32_1 +.p2align 5 +L$align32_1: + vpalignr $8,%ymm6,%ymm7,%ymm8 + vpxor %ymm4,%ymm0,%ymm0 + addl -128(%r13),%esi + andnl %ecx,%eax,%edi + vpxor %ymm1,%ymm0,%ymm0 + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + vpxor %ymm8,%ymm0,%ymm0 + andl %ebx,%eax + addl %r12d,%esi + xorl %edi,%eax + vpsrld $30,%ymm0,%ymm8 + vpslld $2,%ymm0,%ymm0 + addl -124(%r13),%edx + andnl %ebx,%esi,%edi + addl %eax,%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + andl %ebp,%esi + vpor %ymm8,%ymm0,%ymm0 + addl %r12d,%edx + xorl %edi,%esi + addl -120(%r13),%ecx + andnl %ebp,%edx,%edi + vpaddd %ymm11,%ymm0,%ymm9 + addl %esi,%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + andl %eax,%edx + vmovdqu %ymm9,256(%rsp) + addl %r12d,%ecx + xorl %edi,%edx + addl -116(%r13),%ebx + andnl %eax,%ecx,%edi + addl %edx,%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + andl 
%esi,%ecx + addl %r12d,%ebx + xorl %edi,%ecx + addl -96(%r13),%ebp + andnl %esi,%ebx,%edi + addl %ecx,%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + andl %edx,%ebx + addl %r12d,%ebp + xorl %edi,%ebx + vpalignr $8,%ymm7,%ymm0,%ymm8 + vpxor %ymm5,%ymm1,%ymm1 + addl -92(%r13),%eax + andnl %edx,%ebp,%edi + vpxor %ymm2,%ymm1,%ymm1 + addl %ebx,%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + vpxor %ymm8,%ymm1,%ymm1 + andl %ecx,%ebp + addl %r12d,%eax + xorl %edi,%ebp + vpsrld $30,%ymm1,%ymm8 + vpslld $2,%ymm1,%ymm1 + addl -88(%r13),%esi + andnl %ecx,%eax,%edi + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + vpor %ymm8,%ymm1,%ymm1 + addl %r12d,%esi + xorl %edi,%eax + addl -84(%r13),%edx + andnl %ebx,%esi,%edi + vpaddd %ymm11,%ymm1,%ymm9 + addl %eax,%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + andl %ebp,%esi + vmovdqu %ymm9,288(%rsp) + addl %r12d,%edx + xorl %edi,%esi + addl -64(%r13),%ecx + andnl %ebp,%edx,%edi + addl %esi,%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + andl %eax,%edx + addl %r12d,%ecx + xorl %edi,%edx + addl -60(%r13),%ebx + andnl %eax,%ecx,%edi + addl %edx,%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + andl %esi,%ecx + addl %r12d,%ebx + xorl %edi,%ecx + vpalignr $8,%ymm0,%ymm1,%ymm8 + vpxor %ymm6,%ymm2,%ymm2 + addl -56(%r13),%ebp + andnl %esi,%ebx,%edi + vpxor %ymm3,%ymm2,%ymm2 + vmovdqu 0(%r14),%ymm11 + addl %ecx,%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + vpxor %ymm8,%ymm2,%ymm2 + andl %edx,%ebx + addl %r12d,%ebp + xorl %edi,%ebx + vpsrld $30,%ymm2,%ymm8 + vpslld $2,%ymm2,%ymm2 + addl -52(%r13),%eax + andnl %edx,%ebp,%edi + addl %ebx,%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + andl %ecx,%ebp + vpor %ymm8,%ymm2,%ymm2 + addl %r12d,%eax + xorl %edi,%ebp + addl -32(%r13),%esi + andnl %ecx,%eax,%edi + vpaddd %ymm11,%ymm2,%ymm9 + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + vmovdqu %ymm9,320(%rsp) + addl %r12d,%esi + xorl %edi,%eax + addl -28(%r13),%edx + andnl 
%ebx,%esi,%edi + addl %eax,%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + andl %ebp,%esi + addl %r12d,%edx + xorl %edi,%esi + addl -24(%r13),%ecx + andnl %ebp,%edx,%edi + addl %esi,%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + andl %eax,%edx + addl %r12d,%ecx + xorl %edi,%edx + vpalignr $8,%ymm1,%ymm2,%ymm8 + vpxor %ymm7,%ymm3,%ymm3 + addl -20(%r13),%ebx + andnl %eax,%ecx,%edi + vpxor %ymm4,%ymm3,%ymm3 + addl %edx,%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + vpxor %ymm8,%ymm3,%ymm3 + andl %esi,%ecx + addl %r12d,%ebx + xorl %edi,%ecx + vpsrld $30,%ymm3,%ymm8 + vpslld $2,%ymm3,%ymm3 + addl 0(%r13),%ebp + andnl %esi,%ebx,%edi + addl %ecx,%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + andl %edx,%ebx + vpor %ymm8,%ymm3,%ymm3 + addl %r12d,%ebp + xorl %edi,%ebx + addl 4(%r13),%eax + andnl %edx,%ebp,%edi + vpaddd %ymm11,%ymm3,%ymm9 + addl %ebx,%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + andl %ecx,%ebp + vmovdqu %ymm9,352(%rsp) + addl %r12d,%eax + xorl %edi,%ebp + addl 8(%r13),%esi + andnl %ecx,%eax,%edi + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + addl %r12d,%esi + xorl %edi,%eax + addl 12(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + vpalignr $8,%ymm2,%ymm3,%ymm8 + vpxor %ymm0,%ymm4,%ymm4 + addl 32(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + vpxor %ymm5,%ymm4,%ymm4 + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + vpxor %ymm8,%ymm4,%ymm4 + addl %r12d,%ecx + xorl %ebp,%edx + addl 36(%r13),%ebx + vpsrld $30,%ymm4,%ymm8 + vpslld $2,%ymm4,%ymm4 + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + vpor %ymm8,%ymm4,%ymm4 + addl 40(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + vpaddd %ymm11,%ymm4,%ymm9 + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl 44(%r13),%eax + vmovdqu %ymm9,384(%rsp) + leal 
(%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl 64(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + vpalignr $8,%ymm3,%ymm4,%ymm8 + vpxor %ymm1,%ymm5,%ymm5 + addl 68(%r13),%edx + leal (%rdx,%rax,1),%edx + vpxor %ymm6,%ymm5,%ymm5 + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + vpxor %ymm8,%ymm5,%ymm5 + addl %r12d,%edx + xorl %ebx,%esi + addl 72(%r13),%ecx + vpsrld $30,%ymm5,%ymm8 + vpslld $2,%ymm5,%ymm5 + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + vpor %ymm8,%ymm5,%ymm5 + addl 76(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + vpaddd %ymm11,%ymm5,%ymm9 + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl 96(%r13),%ebp + vmovdqu %ymm9,416(%rsp) + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl 100(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + vpalignr $8,%ymm4,%ymm5,%ymm8 + vpxor %ymm2,%ymm6,%ymm6 + addl 104(%r13),%esi + leal (%rsi,%rbp,1),%esi + vpxor %ymm7,%ymm6,%ymm6 + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + vpxor %ymm8,%ymm6,%ymm6 + addl %r12d,%esi + xorl %ecx,%eax + addl 108(%r13),%edx + leaq 256(%r13),%r13 + vpsrld $30,%ymm6,%ymm8 + vpslld $2,%ymm6,%ymm6 + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + vpor %ymm8,%ymm6,%ymm6 + addl -128(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + vpaddd %ymm11,%ymm6,%ymm9 + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -124(%r13),%ebx + vmovdqu %ymm9,448(%rsp) + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + 
xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl -120(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + vpalignr $8,%ymm5,%ymm6,%ymm8 + vpxor %ymm3,%ymm7,%ymm7 + addl -116(%r13),%eax + leal (%rax,%rbx,1),%eax + vpxor %ymm0,%ymm7,%ymm7 + vmovdqu 32(%r14),%ymm11 + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + vpxor %ymm8,%ymm7,%ymm7 + addl %r12d,%eax + xorl %edx,%ebp + addl -96(%r13),%esi + vpsrld $30,%ymm7,%ymm8 + vpslld $2,%ymm7,%ymm7 + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + vpor %ymm8,%ymm7,%ymm7 + addl -92(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + vpaddd %ymm11,%ymm7,%ymm9 + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl -88(%r13),%ecx + vmovdqu %ymm9,480(%rsp) + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -84(%r13),%ebx + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + andl %edi,%ecx + jmp L$align32_2 +.p2align 5 +L$align32_2: + vpalignr $8,%ymm6,%ymm7,%ymm8 + vpxor %ymm4,%ymm0,%ymm0 + addl -64(%r13),%ebp + xorl %esi,%ecx + vpxor %ymm1,%ymm0,%ymm0 + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + vpxor %ymm8,%ymm0,%ymm0 + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + vpsrld $30,%ymm0,%ymm8 + vpslld $2,%ymm0,%ymm0 + addl %r12d,%ebp + andl %edi,%ebx + addl -60(%r13),%eax + xorl %edx,%ebx + movl %ecx,%edi + xorl %edx,%edi + vpor %ymm8,%ymm0,%ymm0 + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + vpaddd %ymm11,%ymm0,%ymm9 + addl %r12d,%eax + andl %edi,%ebp + addl -56(%r13),%esi + xorl %ecx,%ebp + vmovdqu %ymm9,512(%rsp) + movl %ebx,%edi + xorl %ecx,%edi + leal (%rsi,%rbp,1),%esi + rorxl 
$27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + andl %edi,%eax + addl -52(%r13),%edx + xorl %ebx,%eax + movl %ebp,%edi + xorl %ebx,%edi + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + andl %edi,%esi + addl -32(%r13),%ecx + xorl %ebp,%esi + movl %eax,%edi + xorl %ebp,%edi + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + andl %edi,%edx + vpalignr $8,%ymm7,%ymm0,%ymm8 + vpxor %ymm5,%ymm1,%ymm1 + addl -28(%r13),%ebx + xorl %eax,%edx + vpxor %ymm2,%ymm1,%ymm1 + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + vpxor %ymm8,%ymm1,%ymm1 + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + vpsrld $30,%ymm1,%ymm8 + vpslld $2,%ymm1,%ymm1 + addl %r12d,%ebx + andl %edi,%ecx + addl -24(%r13),%ebp + xorl %esi,%ecx + movl %edx,%edi + xorl %esi,%edi + vpor %ymm8,%ymm1,%ymm1 + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + vpaddd %ymm11,%ymm1,%ymm9 + addl %r12d,%ebp + andl %edi,%ebx + addl -20(%r13),%eax + xorl %edx,%ebx + vmovdqu %ymm9,544(%rsp) + movl %ecx,%edi + xorl %edx,%edi + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + andl %edi,%ebp + addl 0(%r13),%esi + xorl %ecx,%ebp + movl %ebx,%edi + xorl %ecx,%edi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + andl %edi,%eax + addl 4(%r13),%edx + xorl %ebx,%eax + movl %ebp,%edi + xorl %ebx,%edi + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + andl %edi,%esi + vpalignr $8,%ymm0,%ymm1,%ymm8 + vpxor %ymm6,%ymm2,%ymm2 + addl 8(%r13),%ecx + xorl %ebp,%esi + vpxor %ymm3,%ymm2,%ymm2 + movl %eax,%edi + xorl %ebp,%edi + leal (%rcx,%rsi,1),%ecx + vpxor %ymm8,%ymm2,%ymm2 + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + vpsrld $30,%ymm2,%ymm8 + vpslld 
$2,%ymm2,%ymm2 + addl %r12d,%ecx + andl %edi,%edx + addl 12(%r13),%ebx + xorl %eax,%edx + movl %esi,%edi + xorl %eax,%edi + vpor %ymm8,%ymm2,%ymm2 + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + vpaddd %ymm11,%ymm2,%ymm9 + addl %r12d,%ebx + andl %edi,%ecx + addl 32(%r13),%ebp + xorl %esi,%ecx + vmovdqu %ymm9,576(%rsp) + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + andl %edi,%ebx + addl 36(%r13),%eax + xorl %edx,%ebx + movl %ecx,%edi + xorl %edx,%edi + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + andl %edi,%ebp + addl 40(%r13),%esi + xorl %ecx,%ebp + movl %ebx,%edi + xorl %ecx,%edi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + andl %edi,%eax + vpalignr $8,%ymm1,%ymm2,%ymm8 + vpxor %ymm7,%ymm3,%ymm3 + addl 44(%r13),%edx + xorl %ebx,%eax + vpxor %ymm4,%ymm3,%ymm3 + movl %ebp,%edi + xorl %ebx,%edi + leal (%rdx,%rax,1),%edx + vpxor %ymm8,%ymm3,%ymm3 + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + vpsrld $30,%ymm3,%ymm8 + vpslld $2,%ymm3,%ymm3 + addl %r12d,%edx + andl %edi,%esi + addl 64(%r13),%ecx + xorl %ebp,%esi + movl %eax,%edi + xorl %ebp,%edi + vpor %ymm8,%ymm3,%ymm3 + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + vpaddd %ymm11,%ymm3,%ymm9 + addl %r12d,%ecx + andl %edi,%edx + addl 68(%r13),%ebx + xorl %eax,%edx + vmovdqu %ymm9,608(%rsp) + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + andl %edi,%ecx + addl 72(%r13),%ebp + xorl %esi,%ecx + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + andl %edi,%ebx + addl 76(%r13),%eax + xorl %edx,%ebx + leal (%rax,%rbx,1),%eax + rorxl 
$27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl 96(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl 100(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl 104(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl 108(%r13),%ebx + leaq 256(%r13),%r13 + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl -128(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl -124(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl -120(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl -116(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl -96(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -92(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl -88(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl -84(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl -64(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl 
-60(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl -56(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -52(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl -32(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl -28(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl -24(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl -20(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + addl %r12d,%edx + leaq 128(%r9),%r13 + leaq 128(%r9),%rdi + cmpq %r10,%r13 + cmovaeq %r9,%r13 + + + addl 0(%r8),%edx + addl 4(%r8),%esi + addl 8(%r8),%ebp + movl %edx,0(%r8) + addl 12(%r8),%ebx + movl %esi,4(%r8) + movl %edx,%eax + addl 16(%r8),%ecx + movl %ebp,%r12d + movl %ebp,8(%r8) + movl %ebx,%edx + + movl %ebx,12(%r8) + movl %esi,%ebp + movl %ecx,16(%r8) + + movl %ecx,%esi + movl %r12d,%ecx + + + cmpq %r10,%r9 + je L$done_avx2 + vmovdqu 64(%r14),%ymm6 + cmpq %r10,%rdi + ja L$ast_avx2 + + vmovdqu -64(%rdi),%xmm0 + vmovdqu -48(%rdi),%xmm1 + vmovdqu -32(%rdi),%xmm2 + vmovdqu -16(%rdi),%xmm3 + vinserti128 $1,0(%r13),%ymm0,%ymm0 + vinserti128 $1,16(%r13),%ymm1,%ymm1 + vinserti128 $1,32(%r13),%ymm2,%ymm2 + vinserti128 $1,48(%r13),%ymm3,%ymm3 + jmp L$ast_avx2 + +.p2align 5 +L$ast_avx2: + leaq 128+16(%rsp),%r13 + rorxl $2,%ebp,%ebx + andnl %edx,%ebp,%edi + andl %ecx,%ebp + xorl %edi,%ebp + subq $-128,%r9 + addl -128(%r13),%esi + andnl %ecx,%eax,%edi + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + addl %r12d,%esi + xorl 
%edi,%eax + addl -124(%r13),%edx + andnl %ebx,%esi,%edi + addl %eax,%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + andl %ebp,%esi + addl %r12d,%edx + xorl %edi,%esi + addl -120(%r13),%ecx + andnl %ebp,%edx,%edi + addl %esi,%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + andl %eax,%edx + addl %r12d,%ecx + xorl %edi,%edx + addl -116(%r13),%ebx + andnl %eax,%ecx,%edi + addl %edx,%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + andl %esi,%ecx + addl %r12d,%ebx + xorl %edi,%ecx + addl -96(%r13),%ebp + andnl %esi,%ebx,%edi + addl %ecx,%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + andl %edx,%ebx + addl %r12d,%ebp + xorl %edi,%ebx + addl -92(%r13),%eax + andnl %edx,%ebp,%edi + addl %ebx,%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + andl %ecx,%ebp + addl %r12d,%eax + xorl %edi,%ebp + addl -88(%r13),%esi + andnl %ecx,%eax,%edi + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + addl %r12d,%esi + xorl %edi,%eax + addl -84(%r13),%edx + andnl %ebx,%esi,%edi + addl %eax,%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + andl %ebp,%esi + addl %r12d,%edx + xorl %edi,%esi + addl -64(%r13),%ecx + andnl %ebp,%edx,%edi + addl %esi,%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + andl %eax,%edx + addl %r12d,%ecx + xorl %edi,%edx + addl -60(%r13),%ebx + andnl %eax,%ecx,%edi + addl %edx,%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + andl %esi,%ecx + addl %r12d,%ebx + xorl %edi,%ecx + addl -56(%r13),%ebp + andnl %esi,%ebx,%edi + addl %ecx,%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + andl %edx,%ebx + addl %r12d,%ebp + xorl %edi,%ebx + addl -52(%r13),%eax + andnl %edx,%ebp,%edi + addl %ebx,%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + andl %ecx,%ebp + addl %r12d,%eax + xorl %edi,%ebp + addl -32(%r13),%esi + andnl %ecx,%eax,%edi + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + addl %r12d,%esi + xorl %edi,%eax + addl -28(%r13),%edx + andnl %ebx,%esi,%edi + addl %eax,%edx + rorxl $27,%esi,%r12d + 
rorxl $2,%esi,%eax + andl %ebp,%esi + addl %r12d,%edx + xorl %edi,%esi + addl -24(%r13),%ecx + andnl %ebp,%edx,%edi + addl %esi,%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + andl %eax,%edx + addl %r12d,%ecx + xorl %edi,%edx + addl -20(%r13),%ebx + andnl %eax,%ecx,%edi + addl %edx,%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + andl %esi,%ecx + addl %r12d,%ebx + xorl %edi,%ecx + addl 0(%r13),%ebp + andnl %esi,%ebx,%edi + addl %ecx,%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + andl %edx,%ebx + addl %r12d,%ebp + xorl %edi,%ebx + addl 4(%r13),%eax + andnl %edx,%ebp,%edi + addl %ebx,%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + andl %ecx,%ebp + addl %r12d,%eax + xorl %edi,%ebp + addl 8(%r13),%esi + andnl %ecx,%eax,%edi + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + addl %r12d,%esi + xorl %edi,%eax + addl 12(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl 32(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl 36(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl 40(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl 44(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl 64(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + vmovdqu -64(%r14),%ymm11 + vpshufb %ymm6,%ymm0,%ymm0 + addl 68(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl 72(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl 
$2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl 76(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl 96(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl 100(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + vpshufb %ymm6,%ymm1,%ymm1 + vpaddd %ymm11,%ymm0,%ymm8 + addl 104(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl 108(%r13),%edx + leaq 256(%r13),%r13 + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl -128(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -124(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl -120(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + vmovdqu %ymm8,0(%rsp) + vpshufb %ymm6,%ymm2,%ymm2 + vpaddd %ymm11,%ymm1,%ymm9 + addl -116(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl -96(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl -92(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl -88(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -84(%r13),%ebx + movl %esi,%edi + xorl 
%eax,%edi + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + andl %edi,%ecx + vmovdqu %ymm9,32(%rsp) + vpshufb %ymm6,%ymm3,%ymm3 + vpaddd %ymm11,%ymm2,%ymm6 + addl -64(%r13),%ebp + xorl %esi,%ecx + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + andl %edi,%ebx + addl -60(%r13),%eax + xorl %edx,%ebx + movl %ecx,%edi + xorl %edx,%edi + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + andl %edi,%ebp + addl -56(%r13),%esi + xorl %ecx,%ebp + movl %ebx,%edi + xorl %ecx,%edi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + andl %edi,%eax + addl -52(%r13),%edx + xorl %ebx,%eax + movl %ebp,%edi + xorl %ebx,%edi + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + andl %edi,%esi + addl -32(%r13),%ecx + xorl %ebp,%esi + movl %eax,%edi + xorl %ebp,%edi + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + andl %edi,%edx + jmp L$align32_3 +.p2align 5 +L$align32_3: + vmovdqu %ymm6,64(%rsp) + vpaddd %ymm11,%ymm3,%ymm7 + addl -28(%r13),%ebx + xorl %eax,%edx + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + andl %edi,%ecx + addl -24(%r13),%ebp + xorl %esi,%ecx + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + andl %edi,%ebx + addl -20(%r13),%eax + xorl %edx,%ebx + movl %ecx,%edi + xorl %edx,%edi + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + andl %edi,%ebp + addl 0(%r13),%esi + xorl %ecx,%ebp + movl %ebx,%edi + xorl %ecx,%edi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl 
$2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + andl %edi,%eax + addl 4(%r13),%edx + xorl %ebx,%eax + movl %ebp,%edi + xorl %ebx,%edi + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + andl %edi,%esi + vmovdqu %ymm7,96(%rsp) + addl 8(%r13),%ecx + xorl %ebp,%esi + movl %eax,%edi + xorl %ebp,%edi + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + andl %edi,%edx + addl 12(%r13),%ebx + xorl %eax,%edx + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + andl %edi,%ecx + addl 32(%r13),%ebp + xorl %esi,%ecx + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + andl %edi,%ebx + addl 36(%r13),%eax + xorl %edx,%ebx + movl %ecx,%edi + xorl %edx,%edi + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + andl %edi,%ebp + addl 40(%r13),%esi + xorl %ecx,%ebp + movl %ebx,%edi + xorl %ecx,%edi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + andl %edi,%eax + vpalignr $8,%ymm0,%ymm1,%ymm4 + addl 44(%r13),%edx + xorl %ebx,%eax + movl %ebp,%edi + xorl %ebx,%edi + vpsrldq $4,%ymm3,%ymm8 + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + vpxor %ymm0,%ymm4,%ymm4 + vpxor %ymm2,%ymm8,%ymm8 + xorl %ebp,%esi + addl %r12d,%edx + vpxor %ymm8,%ymm4,%ymm4 + andl %edi,%esi + addl 64(%r13),%ecx + xorl %ebp,%esi + movl %eax,%edi + vpsrld $31,%ymm4,%ymm8 + xorl %ebp,%edi + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + vpslldq $12,%ymm4,%ymm10 + vpaddd %ymm4,%ymm4,%ymm4 + rorxl $2,%edx,%esi + xorl %eax,%edx + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm4,%ymm4 + addl %r12d,%ecx + andl %edi,%edx + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm4,%ymm4 + addl 68(%r13),%ebx + xorl %eax,%edx + 
vpxor %ymm10,%ymm4,%ymm4 + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + vpaddd %ymm11,%ymm4,%ymm9 + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + vmovdqu %ymm9,128(%rsp) + addl %r12d,%ebx + andl %edi,%ecx + addl 72(%r13),%ebp + xorl %esi,%ecx + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + andl %edi,%ebx + addl 76(%r13),%eax + xorl %edx,%ebx + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + vpalignr $8,%ymm1,%ymm2,%ymm5 + addl 96(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + vpsrldq $4,%ymm4,%ymm8 + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + vpxor %ymm1,%ymm5,%ymm5 + vpxor %ymm3,%ymm8,%ymm8 + addl 100(%r13),%edx + leal (%rdx,%rax,1),%edx + vpxor %ymm8,%ymm5,%ymm5 + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + vpsrld $31,%ymm5,%ymm8 + vmovdqu -32(%r14),%ymm11 + xorl %ebx,%esi + addl 104(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + vpslldq $12,%ymm5,%ymm10 + vpaddd %ymm5,%ymm5,%ymm5 + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm5,%ymm5 + xorl %eax,%edx + addl %r12d,%ecx + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm5,%ymm5 + xorl %ebp,%edx + addl 108(%r13),%ebx + leaq 256(%r13),%r13 + vpxor %ymm10,%ymm5,%ymm5 + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + vpaddd %ymm11,%ymm5,%ymm9 + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + vmovdqu %ymm9,160(%rsp) + addl -128(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + vpalignr $8,%ymm2,%ymm3,%ymm6 + addl -124(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + vpsrldq $4,%ymm5,%ymm8 + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + vpxor %ymm2,%ymm6,%ymm6 + vpxor 
%ymm4,%ymm8,%ymm8 + addl -120(%r13),%esi + leal (%rsi,%rbp,1),%esi + vpxor %ymm8,%ymm6,%ymm6 + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + vpsrld $31,%ymm6,%ymm8 + xorl %ecx,%eax + addl -116(%r13),%edx + leal (%rdx,%rax,1),%edx + vpslldq $12,%ymm6,%ymm10 + vpaddd %ymm6,%ymm6,%ymm6 + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm6,%ymm6 + xorl %ebp,%esi + addl %r12d,%edx + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm6,%ymm6 + xorl %ebx,%esi + addl -96(%r13),%ecx + vpxor %ymm10,%ymm6,%ymm6 + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + vpaddd %ymm11,%ymm6,%ymm9 + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + vmovdqu %ymm9,192(%rsp) + addl -92(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + vpalignr $8,%ymm3,%ymm4,%ymm7 + addl -88(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + vpsrldq $4,%ymm6,%ymm8 + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + vpxor %ymm3,%ymm7,%ymm7 + vpxor %ymm5,%ymm8,%ymm8 + addl -84(%r13),%eax + leal (%rax,%rbx,1),%eax + vpxor %ymm8,%ymm7,%ymm7 + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + vpsrld $31,%ymm7,%ymm8 + xorl %edx,%ebp + addl -64(%r13),%esi + leal (%rsi,%rbp,1),%esi + vpslldq $12,%ymm7,%ymm10 + vpaddd %ymm7,%ymm7,%ymm7 + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm7,%ymm7 + xorl %ebx,%eax + addl %r12d,%esi + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm7,%ymm7 + xorl %ecx,%eax + addl -60(%r13),%edx + vpxor %ymm10,%ymm7,%ymm7 + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + vpaddd %ymm11,%ymm7,%ymm9 + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + vmovdqu %ymm9,224(%rsp) + addl -56(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl 
%ebp,%edx + addl -52(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl -32(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl -28(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl -24(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl -20(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + addl %r12d,%edx + leaq 128(%rsp),%r13 + + + addl 0(%r8),%edx + addl 4(%r8),%esi + addl 8(%r8),%ebp + movl %edx,0(%r8) + addl 12(%r8),%ebx + movl %esi,4(%r8) + movl %edx,%eax + addl 16(%r8),%ecx + movl %ebp,%r12d + movl %ebp,8(%r8) + movl %ebx,%edx + + movl %ebx,12(%r8) + movl %esi,%ebp + movl %ecx,16(%r8) + + movl %ecx,%esi + movl %r12d,%ecx + + + cmpq %r10,%r9 + jbe L$oop_avx2 + +L$done_avx2: + vzeroupper + movq -40(%r11),%r14 + + movq -32(%r11),%r13 + + movq -24(%r11),%r12 + + movq -16(%r11),%rbp + + movq -8(%r11),%rbx + + leaq (%r11),%rsp + +L$epilogue_avx2: + ret + + +.section __DATA,__const +.p2align 6 +K_XX_XX: +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 +.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc +.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc +.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 +.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 +.byte 
83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.p2align 6 +.text +#endif diff --git a/third_party/boringssl/gen/bcm/sha1-x86_64-linux.S b/third_party/boringssl/gen/bcm/sha1-x86_64-linux.S new file mode 100644 index 00000000..1f4807a7 --- /dev/null +++ b/third_party/boringssl/gen/bcm/sha1-x86_64-linux.S @@ -0,0 +1,5450 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.text + +.globl sha1_block_data_order_nohw +.hidden sha1_block_data_order_nohw +.type sha1_block_data_order_nohw,@function +.align 16 +sha1_block_data_order_nohw: +.cfi_startproc +_CET_ENDBR + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + movq %rdi,%r8 + subq $72,%rsp + movq %rsi,%r9 + andq $-64,%rsp + movq %rdx,%r10 + movq %rax,64(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xc0,0x00,0x06,0x23,0x08 +.Lprologue: + + movl 0(%r8),%esi + movl 4(%r8),%edi + movl 8(%r8),%r11d + movl 12(%r8),%r12d + movl 16(%r8),%r13d + jmp .Lloop + +.align 16 +.Lloop: + movl 0(%r9),%edx + bswapl %edx + movl 4(%r9),%ebp + movl %r12d,%eax + movl %edx,0(%rsp) + movl %esi,%ecx + bswapl %ebp + xorl %r11d,%eax + roll $5,%ecx + andl %edi,%eax + leal 1518500249(%rdx,%r13,1),%r13d + addl %ecx,%r13d + xorl %r12d,%eax + roll $30,%edi + addl %eax,%r13d + movl 8(%r9),%r14d + movl %r11d,%eax + movl %ebp,4(%rsp) + movl %r13d,%ecx + bswapl %r14d + xorl %edi,%eax + roll $5,%ecx + andl %esi,%eax + leal 1518500249(%rbp,%r12,1),%r12d + addl %ecx,%r12d + xorl %r11d,%eax + roll $30,%esi + addl %eax,%r12d + movl 12(%r9),%edx + movl %edi,%eax + 
movl %r14d,8(%rsp) + movl %r12d,%ecx + bswapl %edx + xorl %esi,%eax + roll $5,%ecx + andl %r13d,%eax + leal 1518500249(%r14,%r11,1),%r11d + addl %ecx,%r11d + xorl %edi,%eax + roll $30,%r13d + addl %eax,%r11d + movl 16(%r9),%ebp + movl %esi,%eax + movl %edx,12(%rsp) + movl %r11d,%ecx + bswapl %ebp + xorl %r13d,%eax + roll $5,%ecx + andl %r12d,%eax + leal 1518500249(%rdx,%rdi,1),%edi + addl %ecx,%edi + xorl %esi,%eax + roll $30,%r12d + addl %eax,%edi + movl 20(%r9),%r14d + movl %r13d,%eax + movl %ebp,16(%rsp) + movl %edi,%ecx + bswapl %r14d + xorl %r12d,%eax + roll $5,%ecx + andl %r11d,%eax + leal 1518500249(%rbp,%rsi,1),%esi + addl %ecx,%esi + xorl %r13d,%eax + roll $30,%r11d + addl %eax,%esi + movl 24(%r9),%edx + movl %r12d,%eax + movl %r14d,20(%rsp) + movl %esi,%ecx + bswapl %edx + xorl %r11d,%eax + roll $5,%ecx + andl %edi,%eax + leal 1518500249(%r14,%r13,1),%r13d + addl %ecx,%r13d + xorl %r12d,%eax + roll $30,%edi + addl %eax,%r13d + movl 28(%r9),%ebp + movl %r11d,%eax + movl %edx,24(%rsp) + movl %r13d,%ecx + bswapl %ebp + xorl %edi,%eax + roll $5,%ecx + andl %esi,%eax + leal 1518500249(%rdx,%r12,1),%r12d + addl %ecx,%r12d + xorl %r11d,%eax + roll $30,%esi + addl %eax,%r12d + movl 32(%r9),%r14d + movl %edi,%eax + movl %ebp,28(%rsp) + movl %r12d,%ecx + bswapl %r14d + xorl %esi,%eax + roll $5,%ecx + andl %r13d,%eax + leal 1518500249(%rbp,%r11,1),%r11d + addl %ecx,%r11d + xorl %edi,%eax + roll $30,%r13d + addl %eax,%r11d + movl 36(%r9),%edx + movl %esi,%eax + movl %r14d,32(%rsp) + movl %r11d,%ecx + bswapl %edx + xorl %r13d,%eax + roll $5,%ecx + andl %r12d,%eax + leal 1518500249(%r14,%rdi,1),%edi + addl %ecx,%edi + xorl %esi,%eax + roll $30,%r12d + addl %eax,%edi + movl 40(%r9),%ebp + movl %r13d,%eax + movl %edx,36(%rsp) + movl %edi,%ecx + bswapl %ebp + xorl %r12d,%eax + roll $5,%ecx + andl %r11d,%eax + leal 1518500249(%rdx,%rsi,1),%esi + addl %ecx,%esi + xorl %r13d,%eax + roll $30,%r11d + addl %eax,%esi + movl 44(%r9),%r14d + movl %r12d,%eax + movl %ebp,40(%rsp) + 
movl %esi,%ecx + bswapl %r14d + xorl %r11d,%eax + roll $5,%ecx + andl %edi,%eax + leal 1518500249(%rbp,%r13,1),%r13d + addl %ecx,%r13d + xorl %r12d,%eax + roll $30,%edi + addl %eax,%r13d + movl 48(%r9),%edx + movl %r11d,%eax + movl %r14d,44(%rsp) + movl %r13d,%ecx + bswapl %edx + xorl %edi,%eax + roll $5,%ecx + andl %esi,%eax + leal 1518500249(%r14,%r12,1),%r12d + addl %ecx,%r12d + xorl %r11d,%eax + roll $30,%esi + addl %eax,%r12d + movl 52(%r9),%ebp + movl %edi,%eax + movl %edx,48(%rsp) + movl %r12d,%ecx + bswapl %ebp + xorl %esi,%eax + roll $5,%ecx + andl %r13d,%eax + leal 1518500249(%rdx,%r11,1),%r11d + addl %ecx,%r11d + xorl %edi,%eax + roll $30,%r13d + addl %eax,%r11d + movl 56(%r9),%r14d + movl %esi,%eax + movl %ebp,52(%rsp) + movl %r11d,%ecx + bswapl %r14d + xorl %r13d,%eax + roll $5,%ecx + andl %r12d,%eax + leal 1518500249(%rbp,%rdi,1),%edi + addl %ecx,%edi + xorl %esi,%eax + roll $30,%r12d + addl %eax,%edi + movl 60(%r9),%edx + movl %r13d,%eax + movl %r14d,56(%rsp) + movl %edi,%ecx + bswapl %edx + xorl %r12d,%eax + roll $5,%ecx + andl %r11d,%eax + leal 1518500249(%r14,%rsi,1),%esi + addl %ecx,%esi + xorl %r13d,%eax + roll $30,%r11d + addl %eax,%esi + xorl 0(%rsp),%ebp + movl %r12d,%eax + movl %edx,60(%rsp) + movl %esi,%ecx + xorl 8(%rsp),%ebp + xorl %r11d,%eax + roll $5,%ecx + xorl 32(%rsp),%ebp + andl %edi,%eax + leal 1518500249(%rdx,%r13,1),%r13d + roll $30,%edi + xorl %r12d,%eax + addl %ecx,%r13d + roll $1,%ebp + addl %eax,%r13d + xorl 4(%rsp),%r14d + movl %r11d,%eax + movl %ebp,0(%rsp) + movl %r13d,%ecx + xorl 12(%rsp),%r14d + xorl %edi,%eax + roll $5,%ecx + xorl 36(%rsp),%r14d + andl %esi,%eax + leal 1518500249(%rbp,%r12,1),%r12d + roll $30,%esi + xorl %r11d,%eax + addl %ecx,%r12d + roll $1,%r14d + addl %eax,%r12d + xorl 8(%rsp),%edx + movl %edi,%eax + movl %r14d,4(%rsp) + movl %r12d,%ecx + xorl 16(%rsp),%edx + xorl %esi,%eax + roll $5,%ecx + xorl 40(%rsp),%edx + andl %r13d,%eax + leal 1518500249(%r14,%r11,1),%r11d + roll $30,%r13d + xorl %edi,%eax + 
addl %ecx,%r11d + roll $1,%edx + addl %eax,%r11d + xorl 12(%rsp),%ebp + movl %esi,%eax + movl %edx,8(%rsp) + movl %r11d,%ecx + xorl 20(%rsp),%ebp + xorl %r13d,%eax + roll $5,%ecx + xorl 44(%rsp),%ebp + andl %r12d,%eax + leal 1518500249(%rdx,%rdi,1),%edi + roll $30,%r12d + xorl %esi,%eax + addl %ecx,%edi + roll $1,%ebp + addl %eax,%edi + xorl 16(%rsp),%r14d + movl %r13d,%eax + movl %ebp,12(%rsp) + movl %edi,%ecx + xorl 24(%rsp),%r14d + xorl %r12d,%eax + roll $5,%ecx + xorl 48(%rsp),%r14d + andl %r11d,%eax + leal 1518500249(%rbp,%rsi,1),%esi + roll $30,%r11d + xorl %r13d,%eax + addl %ecx,%esi + roll $1,%r14d + addl %eax,%esi + xorl 20(%rsp),%edx + movl %edi,%eax + movl %r14d,16(%rsp) + movl %esi,%ecx + xorl 28(%rsp),%edx + xorl %r12d,%eax + roll $5,%ecx + xorl 52(%rsp),%edx + leal 1859775393(%r14,%r13,1),%r13d + xorl %r11d,%eax + addl %ecx,%r13d + roll $30,%edi + addl %eax,%r13d + roll $1,%edx + xorl 24(%rsp),%ebp + movl %esi,%eax + movl %edx,20(%rsp) + movl %r13d,%ecx + xorl 32(%rsp),%ebp + xorl %r11d,%eax + roll $5,%ecx + xorl 56(%rsp),%ebp + leal 1859775393(%rdx,%r12,1),%r12d + xorl %edi,%eax + addl %ecx,%r12d + roll $30,%esi + addl %eax,%r12d + roll $1,%ebp + xorl 28(%rsp),%r14d + movl %r13d,%eax + movl %ebp,24(%rsp) + movl %r12d,%ecx + xorl 36(%rsp),%r14d + xorl %edi,%eax + roll $5,%ecx + xorl 60(%rsp),%r14d + leal 1859775393(%rbp,%r11,1),%r11d + xorl %esi,%eax + addl %ecx,%r11d + roll $30,%r13d + addl %eax,%r11d + roll $1,%r14d + xorl 32(%rsp),%edx + movl %r12d,%eax + movl %r14d,28(%rsp) + movl %r11d,%ecx + xorl 40(%rsp),%edx + xorl %esi,%eax + roll $5,%ecx + xorl 0(%rsp),%edx + leal 1859775393(%r14,%rdi,1),%edi + xorl %r13d,%eax + addl %ecx,%edi + roll $30,%r12d + addl %eax,%edi + roll $1,%edx + xorl 36(%rsp),%ebp + movl %r11d,%eax + movl %edx,32(%rsp) + movl %edi,%ecx + xorl 44(%rsp),%ebp + xorl %r13d,%eax + roll $5,%ecx + xorl 4(%rsp),%ebp + leal 1859775393(%rdx,%rsi,1),%esi + xorl %r12d,%eax + addl %ecx,%esi + roll $30,%r11d + addl %eax,%esi + roll $1,%ebp 
+ xorl 40(%rsp),%r14d + movl %edi,%eax + movl %ebp,36(%rsp) + movl %esi,%ecx + xorl 48(%rsp),%r14d + xorl %r12d,%eax + roll $5,%ecx + xorl 8(%rsp),%r14d + leal 1859775393(%rbp,%r13,1),%r13d + xorl %r11d,%eax + addl %ecx,%r13d + roll $30,%edi + addl %eax,%r13d + roll $1,%r14d + xorl 44(%rsp),%edx + movl %esi,%eax + movl %r14d,40(%rsp) + movl %r13d,%ecx + xorl 52(%rsp),%edx + xorl %r11d,%eax + roll $5,%ecx + xorl 12(%rsp),%edx + leal 1859775393(%r14,%r12,1),%r12d + xorl %edi,%eax + addl %ecx,%r12d + roll $30,%esi + addl %eax,%r12d + roll $1,%edx + xorl 48(%rsp),%ebp + movl %r13d,%eax + movl %edx,44(%rsp) + movl %r12d,%ecx + xorl 56(%rsp),%ebp + xorl %edi,%eax + roll $5,%ecx + xorl 16(%rsp),%ebp + leal 1859775393(%rdx,%r11,1),%r11d + xorl %esi,%eax + addl %ecx,%r11d + roll $30,%r13d + addl %eax,%r11d + roll $1,%ebp + xorl 52(%rsp),%r14d + movl %r12d,%eax + movl %ebp,48(%rsp) + movl %r11d,%ecx + xorl 60(%rsp),%r14d + xorl %esi,%eax + roll $5,%ecx + xorl 20(%rsp),%r14d + leal 1859775393(%rbp,%rdi,1),%edi + xorl %r13d,%eax + addl %ecx,%edi + roll $30,%r12d + addl %eax,%edi + roll $1,%r14d + xorl 56(%rsp),%edx + movl %r11d,%eax + movl %r14d,52(%rsp) + movl %edi,%ecx + xorl 0(%rsp),%edx + xorl %r13d,%eax + roll $5,%ecx + xorl 24(%rsp),%edx + leal 1859775393(%r14,%rsi,1),%esi + xorl %r12d,%eax + addl %ecx,%esi + roll $30,%r11d + addl %eax,%esi + roll $1,%edx + xorl 60(%rsp),%ebp + movl %edi,%eax + movl %edx,56(%rsp) + movl %esi,%ecx + xorl 4(%rsp),%ebp + xorl %r12d,%eax + roll $5,%ecx + xorl 28(%rsp),%ebp + leal 1859775393(%rdx,%r13,1),%r13d + xorl %r11d,%eax + addl %ecx,%r13d + roll $30,%edi + addl %eax,%r13d + roll $1,%ebp + xorl 0(%rsp),%r14d + movl %esi,%eax + movl %ebp,60(%rsp) + movl %r13d,%ecx + xorl 8(%rsp),%r14d + xorl %r11d,%eax + roll $5,%ecx + xorl 32(%rsp),%r14d + leal 1859775393(%rbp,%r12,1),%r12d + xorl %edi,%eax + addl %ecx,%r12d + roll $30,%esi + addl %eax,%r12d + roll $1,%r14d + xorl 4(%rsp),%edx + movl %r13d,%eax + movl %r14d,0(%rsp) + movl %r12d,%ecx + 
xorl 12(%rsp),%edx + xorl %edi,%eax + roll $5,%ecx + xorl 36(%rsp),%edx + leal 1859775393(%r14,%r11,1),%r11d + xorl %esi,%eax + addl %ecx,%r11d + roll $30,%r13d + addl %eax,%r11d + roll $1,%edx + xorl 8(%rsp),%ebp + movl %r12d,%eax + movl %edx,4(%rsp) + movl %r11d,%ecx + xorl 16(%rsp),%ebp + xorl %esi,%eax + roll $5,%ecx + xorl 40(%rsp),%ebp + leal 1859775393(%rdx,%rdi,1),%edi + xorl %r13d,%eax + addl %ecx,%edi + roll $30,%r12d + addl %eax,%edi + roll $1,%ebp + xorl 12(%rsp),%r14d + movl %r11d,%eax + movl %ebp,8(%rsp) + movl %edi,%ecx + xorl 20(%rsp),%r14d + xorl %r13d,%eax + roll $5,%ecx + xorl 44(%rsp),%r14d + leal 1859775393(%rbp,%rsi,1),%esi + xorl %r12d,%eax + addl %ecx,%esi + roll $30,%r11d + addl %eax,%esi + roll $1,%r14d + xorl 16(%rsp),%edx + movl %edi,%eax + movl %r14d,12(%rsp) + movl %esi,%ecx + xorl 24(%rsp),%edx + xorl %r12d,%eax + roll $5,%ecx + xorl 48(%rsp),%edx + leal 1859775393(%r14,%r13,1),%r13d + xorl %r11d,%eax + addl %ecx,%r13d + roll $30,%edi + addl %eax,%r13d + roll $1,%edx + xorl 20(%rsp),%ebp + movl %esi,%eax + movl %edx,16(%rsp) + movl %r13d,%ecx + xorl 28(%rsp),%ebp + xorl %r11d,%eax + roll $5,%ecx + xorl 52(%rsp),%ebp + leal 1859775393(%rdx,%r12,1),%r12d + xorl %edi,%eax + addl %ecx,%r12d + roll $30,%esi + addl %eax,%r12d + roll $1,%ebp + xorl 24(%rsp),%r14d + movl %r13d,%eax + movl %ebp,20(%rsp) + movl %r12d,%ecx + xorl 32(%rsp),%r14d + xorl %edi,%eax + roll $5,%ecx + xorl 56(%rsp),%r14d + leal 1859775393(%rbp,%r11,1),%r11d + xorl %esi,%eax + addl %ecx,%r11d + roll $30,%r13d + addl %eax,%r11d + roll $1,%r14d + xorl 28(%rsp),%edx + movl %r12d,%eax + movl %r14d,24(%rsp) + movl %r11d,%ecx + xorl 36(%rsp),%edx + xorl %esi,%eax + roll $5,%ecx + xorl 60(%rsp),%edx + leal 1859775393(%r14,%rdi,1),%edi + xorl %r13d,%eax + addl %ecx,%edi + roll $30,%r12d + addl %eax,%edi + roll $1,%edx + xorl 32(%rsp),%ebp + movl %r11d,%eax + movl %edx,28(%rsp) + movl %edi,%ecx + xorl 40(%rsp),%ebp + xorl %r13d,%eax + roll $5,%ecx + xorl 0(%rsp),%ebp + leal 
1859775393(%rdx,%rsi,1),%esi + xorl %r12d,%eax + addl %ecx,%esi + roll $30,%r11d + addl %eax,%esi + roll $1,%ebp + xorl 36(%rsp),%r14d + movl %r12d,%eax + movl %ebp,32(%rsp) + movl %r12d,%ebx + xorl 44(%rsp),%r14d + andl %r11d,%eax + movl %esi,%ecx + xorl 4(%rsp),%r14d + leal -1894007588(%rbp,%r13,1),%r13d + xorl %r11d,%ebx + roll $5,%ecx + addl %eax,%r13d + roll $1,%r14d + andl %edi,%ebx + addl %ecx,%r13d + roll $30,%edi + addl %ebx,%r13d + xorl 40(%rsp),%edx + movl %r11d,%eax + movl %r14d,36(%rsp) + movl %r11d,%ebx + xorl 48(%rsp),%edx + andl %edi,%eax + movl %r13d,%ecx + xorl 8(%rsp),%edx + leal -1894007588(%r14,%r12,1),%r12d + xorl %edi,%ebx + roll $5,%ecx + addl %eax,%r12d + roll $1,%edx + andl %esi,%ebx + addl %ecx,%r12d + roll $30,%esi + addl %ebx,%r12d + xorl 44(%rsp),%ebp + movl %edi,%eax + movl %edx,40(%rsp) + movl %edi,%ebx + xorl 52(%rsp),%ebp + andl %esi,%eax + movl %r12d,%ecx + xorl 12(%rsp),%ebp + leal -1894007588(%rdx,%r11,1),%r11d + xorl %esi,%ebx + roll $5,%ecx + addl %eax,%r11d + roll $1,%ebp + andl %r13d,%ebx + addl %ecx,%r11d + roll $30,%r13d + addl %ebx,%r11d + xorl 48(%rsp),%r14d + movl %esi,%eax + movl %ebp,44(%rsp) + movl %esi,%ebx + xorl 56(%rsp),%r14d + andl %r13d,%eax + movl %r11d,%ecx + xorl 16(%rsp),%r14d + leal -1894007588(%rbp,%rdi,1),%edi + xorl %r13d,%ebx + roll $5,%ecx + addl %eax,%edi + roll $1,%r14d + andl %r12d,%ebx + addl %ecx,%edi + roll $30,%r12d + addl %ebx,%edi + xorl 52(%rsp),%edx + movl %r13d,%eax + movl %r14d,48(%rsp) + movl %r13d,%ebx + xorl 60(%rsp),%edx + andl %r12d,%eax + movl %edi,%ecx + xorl 20(%rsp),%edx + leal -1894007588(%r14,%rsi,1),%esi + xorl %r12d,%ebx + roll $5,%ecx + addl %eax,%esi + roll $1,%edx + andl %r11d,%ebx + addl %ecx,%esi + roll $30,%r11d + addl %ebx,%esi + xorl 56(%rsp),%ebp + movl %r12d,%eax + movl %edx,52(%rsp) + movl %r12d,%ebx + xorl 0(%rsp),%ebp + andl %r11d,%eax + movl %esi,%ecx + xorl 24(%rsp),%ebp + leal -1894007588(%rdx,%r13,1),%r13d + xorl %r11d,%ebx + roll $5,%ecx + addl %eax,%r13d + 
roll $1,%ebp + andl %edi,%ebx + addl %ecx,%r13d + roll $30,%edi + addl %ebx,%r13d + xorl 60(%rsp),%r14d + movl %r11d,%eax + movl %ebp,56(%rsp) + movl %r11d,%ebx + xorl 4(%rsp),%r14d + andl %edi,%eax + movl %r13d,%ecx + xorl 28(%rsp),%r14d + leal -1894007588(%rbp,%r12,1),%r12d + xorl %edi,%ebx + roll $5,%ecx + addl %eax,%r12d + roll $1,%r14d + andl %esi,%ebx + addl %ecx,%r12d + roll $30,%esi + addl %ebx,%r12d + xorl 0(%rsp),%edx + movl %edi,%eax + movl %r14d,60(%rsp) + movl %edi,%ebx + xorl 8(%rsp),%edx + andl %esi,%eax + movl %r12d,%ecx + xorl 32(%rsp),%edx + leal -1894007588(%r14,%r11,1),%r11d + xorl %esi,%ebx + roll $5,%ecx + addl %eax,%r11d + roll $1,%edx + andl %r13d,%ebx + addl %ecx,%r11d + roll $30,%r13d + addl %ebx,%r11d + xorl 4(%rsp),%ebp + movl %esi,%eax + movl %edx,0(%rsp) + movl %esi,%ebx + xorl 12(%rsp),%ebp + andl %r13d,%eax + movl %r11d,%ecx + xorl 36(%rsp),%ebp + leal -1894007588(%rdx,%rdi,1),%edi + xorl %r13d,%ebx + roll $5,%ecx + addl %eax,%edi + roll $1,%ebp + andl %r12d,%ebx + addl %ecx,%edi + roll $30,%r12d + addl %ebx,%edi + xorl 8(%rsp),%r14d + movl %r13d,%eax + movl %ebp,4(%rsp) + movl %r13d,%ebx + xorl 16(%rsp),%r14d + andl %r12d,%eax + movl %edi,%ecx + xorl 40(%rsp),%r14d + leal -1894007588(%rbp,%rsi,1),%esi + xorl %r12d,%ebx + roll $5,%ecx + addl %eax,%esi + roll $1,%r14d + andl %r11d,%ebx + addl %ecx,%esi + roll $30,%r11d + addl %ebx,%esi + xorl 12(%rsp),%edx + movl %r12d,%eax + movl %r14d,8(%rsp) + movl %r12d,%ebx + xorl 20(%rsp),%edx + andl %r11d,%eax + movl %esi,%ecx + xorl 44(%rsp),%edx + leal -1894007588(%r14,%r13,1),%r13d + xorl %r11d,%ebx + roll $5,%ecx + addl %eax,%r13d + roll $1,%edx + andl %edi,%ebx + addl %ecx,%r13d + roll $30,%edi + addl %ebx,%r13d + xorl 16(%rsp),%ebp + movl %r11d,%eax + movl %edx,12(%rsp) + movl %r11d,%ebx + xorl 24(%rsp),%ebp + andl %edi,%eax + movl %r13d,%ecx + xorl 48(%rsp),%ebp + leal -1894007588(%rdx,%r12,1),%r12d + xorl %edi,%ebx + roll $5,%ecx + addl %eax,%r12d + roll $1,%ebp + andl %esi,%ebx + addl 
%ecx,%r12d + roll $30,%esi + addl %ebx,%r12d + xorl 20(%rsp),%r14d + movl %edi,%eax + movl %ebp,16(%rsp) + movl %edi,%ebx + xorl 28(%rsp),%r14d + andl %esi,%eax + movl %r12d,%ecx + xorl 52(%rsp),%r14d + leal -1894007588(%rbp,%r11,1),%r11d + xorl %esi,%ebx + roll $5,%ecx + addl %eax,%r11d + roll $1,%r14d + andl %r13d,%ebx + addl %ecx,%r11d + roll $30,%r13d + addl %ebx,%r11d + xorl 24(%rsp),%edx + movl %esi,%eax + movl %r14d,20(%rsp) + movl %esi,%ebx + xorl 32(%rsp),%edx + andl %r13d,%eax + movl %r11d,%ecx + xorl 56(%rsp),%edx + leal -1894007588(%r14,%rdi,1),%edi + xorl %r13d,%ebx + roll $5,%ecx + addl %eax,%edi + roll $1,%edx + andl %r12d,%ebx + addl %ecx,%edi + roll $30,%r12d + addl %ebx,%edi + xorl 28(%rsp),%ebp + movl %r13d,%eax + movl %edx,24(%rsp) + movl %r13d,%ebx + xorl 36(%rsp),%ebp + andl %r12d,%eax + movl %edi,%ecx + xorl 60(%rsp),%ebp + leal -1894007588(%rdx,%rsi,1),%esi + xorl %r12d,%ebx + roll $5,%ecx + addl %eax,%esi + roll $1,%ebp + andl %r11d,%ebx + addl %ecx,%esi + roll $30,%r11d + addl %ebx,%esi + xorl 32(%rsp),%r14d + movl %r12d,%eax + movl %ebp,28(%rsp) + movl %r12d,%ebx + xorl 40(%rsp),%r14d + andl %r11d,%eax + movl %esi,%ecx + xorl 0(%rsp),%r14d + leal -1894007588(%rbp,%r13,1),%r13d + xorl %r11d,%ebx + roll $5,%ecx + addl %eax,%r13d + roll $1,%r14d + andl %edi,%ebx + addl %ecx,%r13d + roll $30,%edi + addl %ebx,%r13d + xorl 36(%rsp),%edx + movl %r11d,%eax + movl %r14d,32(%rsp) + movl %r11d,%ebx + xorl 44(%rsp),%edx + andl %edi,%eax + movl %r13d,%ecx + xorl 4(%rsp),%edx + leal -1894007588(%r14,%r12,1),%r12d + xorl %edi,%ebx + roll $5,%ecx + addl %eax,%r12d + roll $1,%edx + andl %esi,%ebx + addl %ecx,%r12d + roll $30,%esi + addl %ebx,%r12d + xorl 40(%rsp),%ebp + movl %edi,%eax + movl %edx,36(%rsp) + movl %edi,%ebx + xorl 48(%rsp),%ebp + andl %esi,%eax + movl %r12d,%ecx + xorl 8(%rsp),%ebp + leal -1894007588(%rdx,%r11,1),%r11d + xorl %esi,%ebx + roll $5,%ecx + addl %eax,%r11d + roll $1,%ebp + andl %r13d,%ebx + addl %ecx,%r11d + roll $30,%r13d + 
addl %ebx,%r11d + xorl 44(%rsp),%r14d + movl %esi,%eax + movl %ebp,40(%rsp) + movl %esi,%ebx + xorl 52(%rsp),%r14d + andl %r13d,%eax + movl %r11d,%ecx + xorl 12(%rsp),%r14d + leal -1894007588(%rbp,%rdi,1),%edi + xorl %r13d,%ebx + roll $5,%ecx + addl %eax,%edi + roll $1,%r14d + andl %r12d,%ebx + addl %ecx,%edi + roll $30,%r12d + addl %ebx,%edi + xorl 48(%rsp),%edx + movl %r13d,%eax + movl %r14d,44(%rsp) + movl %r13d,%ebx + xorl 56(%rsp),%edx + andl %r12d,%eax + movl %edi,%ecx + xorl 16(%rsp),%edx + leal -1894007588(%r14,%rsi,1),%esi + xorl %r12d,%ebx + roll $5,%ecx + addl %eax,%esi + roll $1,%edx + andl %r11d,%ebx + addl %ecx,%esi + roll $30,%r11d + addl %ebx,%esi + xorl 52(%rsp),%ebp + movl %edi,%eax + movl %edx,48(%rsp) + movl %esi,%ecx + xorl 60(%rsp),%ebp + xorl %r12d,%eax + roll $5,%ecx + xorl 20(%rsp),%ebp + leal -899497514(%rdx,%r13,1),%r13d + xorl %r11d,%eax + addl %ecx,%r13d + roll $30,%edi + addl %eax,%r13d + roll $1,%ebp + xorl 56(%rsp),%r14d + movl %esi,%eax + movl %ebp,52(%rsp) + movl %r13d,%ecx + xorl 0(%rsp),%r14d + xorl %r11d,%eax + roll $5,%ecx + xorl 24(%rsp),%r14d + leal -899497514(%rbp,%r12,1),%r12d + xorl %edi,%eax + addl %ecx,%r12d + roll $30,%esi + addl %eax,%r12d + roll $1,%r14d + xorl 60(%rsp),%edx + movl %r13d,%eax + movl %r14d,56(%rsp) + movl %r12d,%ecx + xorl 4(%rsp),%edx + xorl %edi,%eax + roll $5,%ecx + xorl 28(%rsp),%edx + leal -899497514(%r14,%r11,1),%r11d + xorl %esi,%eax + addl %ecx,%r11d + roll $30,%r13d + addl %eax,%r11d + roll $1,%edx + xorl 0(%rsp),%ebp + movl %r12d,%eax + movl %edx,60(%rsp) + movl %r11d,%ecx + xorl 8(%rsp),%ebp + xorl %esi,%eax + roll $5,%ecx + xorl 32(%rsp),%ebp + leal -899497514(%rdx,%rdi,1),%edi + xorl %r13d,%eax + addl %ecx,%edi + roll $30,%r12d + addl %eax,%edi + roll $1,%ebp + xorl 4(%rsp),%r14d + movl %r11d,%eax + movl %ebp,0(%rsp) + movl %edi,%ecx + xorl 12(%rsp),%r14d + xorl %r13d,%eax + roll $5,%ecx + xorl 36(%rsp),%r14d + leal -899497514(%rbp,%rsi,1),%esi + xorl %r12d,%eax + addl %ecx,%esi + roll 
$30,%r11d + addl %eax,%esi + roll $1,%r14d + xorl 8(%rsp),%edx + movl %edi,%eax + movl %r14d,4(%rsp) + movl %esi,%ecx + xorl 16(%rsp),%edx + xorl %r12d,%eax + roll $5,%ecx + xorl 40(%rsp),%edx + leal -899497514(%r14,%r13,1),%r13d + xorl %r11d,%eax + addl %ecx,%r13d + roll $30,%edi + addl %eax,%r13d + roll $1,%edx + xorl 12(%rsp),%ebp + movl %esi,%eax + movl %edx,8(%rsp) + movl %r13d,%ecx + xorl 20(%rsp),%ebp + xorl %r11d,%eax + roll $5,%ecx + xorl 44(%rsp),%ebp + leal -899497514(%rdx,%r12,1),%r12d + xorl %edi,%eax + addl %ecx,%r12d + roll $30,%esi + addl %eax,%r12d + roll $1,%ebp + xorl 16(%rsp),%r14d + movl %r13d,%eax + movl %ebp,12(%rsp) + movl %r12d,%ecx + xorl 24(%rsp),%r14d + xorl %edi,%eax + roll $5,%ecx + xorl 48(%rsp),%r14d + leal -899497514(%rbp,%r11,1),%r11d + xorl %esi,%eax + addl %ecx,%r11d + roll $30,%r13d + addl %eax,%r11d + roll $1,%r14d + xorl 20(%rsp),%edx + movl %r12d,%eax + movl %r14d,16(%rsp) + movl %r11d,%ecx + xorl 28(%rsp),%edx + xorl %esi,%eax + roll $5,%ecx + xorl 52(%rsp),%edx + leal -899497514(%r14,%rdi,1),%edi + xorl %r13d,%eax + addl %ecx,%edi + roll $30,%r12d + addl %eax,%edi + roll $1,%edx + xorl 24(%rsp),%ebp + movl %r11d,%eax + movl %edx,20(%rsp) + movl %edi,%ecx + xorl 32(%rsp),%ebp + xorl %r13d,%eax + roll $5,%ecx + xorl 56(%rsp),%ebp + leal -899497514(%rdx,%rsi,1),%esi + xorl %r12d,%eax + addl %ecx,%esi + roll $30,%r11d + addl %eax,%esi + roll $1,%ebp + xorl 28(%rsp),%r14d + movl %edi,%eax + movl %ebp,24(%rsp) + movl %esi,%ecx + xorl 36(%rsp),%r14d + xorl %r12d,%eax + roll $5,%ecx + xorl 60(%rsp),%r14d + leal -899497514(%rbp,%r13,1),%r13d + xorl %r11d,%eax + addl %ecx,%r13d + roll $30,%edi + addl %eax,%r13d + roll $1,%r14d + xorl 32(%rsp),%edx + movl %esi,%eax + movl %r14d,28(%rsp) + movl %r13d,%ecx + xorl 40(%rsp),%edx + xorl %r11d,%eax + roll $5,%ecx + xorl 0(%rsp),%edx + leal -899497514(%r14,%r12,1),%r12d + xorl %edi,%eax + addl %ecx,%r12d + roll $30,%esi + addl %eax,%r12d + roll $1,%edx + xorl 36(%rsp),%ebp + movl %r13d,%eax 
+ + movl %r12d,%ecx + xorl 44(%rsp),%ebp + xorl %edi,%eax + roll $5,%ecx + xorl 4(%rsp),%ebp + leal -899497514(%rdx,%r11,1),%r11d + xorl %esi,%eax + addl %ecx,%r11d + roll $30,%r13d + addl %eax,%r11d + roll $1,%ebp + xorl 40(%rsp),%r14d + movl %r12d,%eax + + movl %r11d,%ecx + xorl 48(%rsp),%r14d + xorl %esi,%eax + roll $5,%ecx + xorl 8(%rsp),%r14d + leal -899497514(%rbp,%rdi,1),%edi + xorl %r13d,%eax + addl %ecx,%edi + roll $30,%r12d + addl %eax,%edi + roll $1,%r14d + xorl 44(%rsp),%edx + movl %r11d,%eax + + movl %edi,%ecx + xorl 52(%rsp),%edx + xorl %r13d,%eax + roll $5,%ecx + xorl 12(%rsp),%edx + leal -899497514(%r14,%rsi,1),%esi + xorl %r12d,%eax + addl %ecx,%esi + roll $30,%r11d + addl %eax,%esi + roll $1,%edx + xorl 48(%rsp),%ebp + movl %edi,%eax + + movl %esi,%ecx + xorl 56(%rsp),%ebp + xorl %r12d,%eax + roll $5,%ecx + xorl 16(%rsp),%ebp + leal -899497514(%rdx,%r13,1),%r13d + xorl %r11d,%eax + addl %ecx,%r13d + roll $30,%edi + addl %eax,%r13d + roll $1,%ebp + xorl 52(%rsp),%r14d + movl %esi,%eax + + movl %r13d,%ecx + xorl 60(%rsp),%r14d + xorl %r11d,%eax + roll $5,%ecx + xorl 20(%rsp),%r14d + leal -899497514(%rbp,%r12,1),%r12d + xorl %edi,%eax + addl %ecx,%r12d + roll $30,%esi + addl %eax,%r12d + roll $1,%r14d + xorl 56(%rsp),%edx + movl %r13d,%eax + + movl %r12d,%ecx + xorl 0(%rsp),%edx + xorl %edi,%eax + roll $5,%ecx + xorl 24(%rsp),%edx + leal -899497514(%r14,%r11,1),%r11d + xorl %esi,%eax + addl %ecx,%r11d + roll $30,%r13d + addl %eax,%r11d + roll $1,%edx + xorl 60(%rsp),%ebp + movl %r12d,%eax + + movl %r11d,%ecx + xorl 4(%rsp),%ebp + xorl %esi,%eax + roll $5,%ecx + xorl 28(%rsp),%ebp + leal -899497514(%rdx,%rdi,1),%edi + xorl %r13d,%eax + addl %ecx,%edi + roll $30,%r12d + addl %eax,%edi + roll $1,%ebp + movl %r11d,%eax + movl %edi,%ecx + xorl %r13d,%eax + leal -899497514(%rbp,%rsi,1),%esi + roll $5,%ecx + xorl %r12d,%eax + addl %ecx,%esi + roll $30,%r11d + addl %eax,%esi + addl 0(%r8),%esi + addl 4(%r8),%edi + addl 8(%r8),%r11d + addl 12(%r8),%r12d + 
addl 16(%r8),%r13d + movl %esi,0(%r8) + movl %edi,4(%r8) + movl %r11d,8(%r8) + movl %r12d,12(%r8) + movl %r13d,16(%r8) + + subq $1,%r10 + leaq 64(%r9),%r9 + jnz .Lloop + + movq 64(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue: + ret +.cfi_endproc +.size sha1_block_data_order_nohw,.-sha1_block_data_order_nohw +.globl sha1_block_data_order_hw +.hidden sha1_block_data_order_hw +.type sha1_block_data_order_hw,@function +.align 32 +sha1_block_data_order_hw: +.cfi_startproc +_CET_ENDBR + movdqu (%rdi),%xmm0 + movd 16(%rdi),%xmm1 + movdqa K_XX_XX+160(%rip),%xmm3 + + movdqu (%rsi),%xmm4 + pshufd $27,%xmm0,%xmm0 + movdqu 16(%rsi),%xmm5 + pshufd $27,%xmm1,%xmm1 + movdqu 32(%rsi),%xmm6 + pshufb %xmm3,%xmm4 + movdqu 48(%rsi),%xmm7 + pshufb %xmm3,%xmm5 + pshufb %xmm3,%xmm6 + movdqa %xmm1,%xmm9 + pshufb %xmm3,%xmm7 + jmp .Loop_shaext + +.align 16 +.Loop_shaext: + decq %rdx + leaq 64(%rsi),%r8 + paddd %xmm4,%xmm1 + cmovneq %r8,%rsi + prefetcht0 512(%rsi) + movdqa %xmm0,%xmm8 + sha1msg1 %xmm5,%xmm4 + movdqa %xmm0,%xmm2 + sha1rnds4 $0,%xmm1,%xmm0 + sha1nexte %xmm5,%xmm2 + pxor %xmm6,%xmm4 + sha1msg1 %xmm6,%xmm5 + sha1msg2 %xmm7,%xmm4 + + movdqa %xmm0,%xmm1 + sha1rnds4 $0,%xmm2,%xmm0 + sha1nexte %xmm6,%xmm1 + pxor %xmm7,%xmm5 + sha1msg2 %xmm4,%xmm5 + sha1msg1 %xmm7,%xmm6 + movdqa %xmm0,%xmm2 + sha1rnds4 $0,%xmm1,%xmm0 + sha1nexte %xmm7,%xmm2 + pxor %xmm4,%xmm6 + sha1msg1 %xmm4,%xmm7 + sha1msg2 %xmm5,%xmm6 + + movdqa %xmm0,%xmm1 + sha1rnds4 $0,%xmm2,%xmm0 + sha1nexte %xmm4,%xmm1 + pxor %xmm5,%xmm7 + sha1msg2 %xmm6,%xmm7 + sha1msg1 %xmm5,%xmm4 + movdqa %xmm0,%xmm2 + sha1rnds4 $0,%xmm1,%xmm0 + sha1nexte %xmm5,%xmm2 + pxor %xmm6,%xmm4 + sha1msg1 %xmm6,%xmm5 + sha1msg2 %xmm7,%xmm4 + + movdqa %xmm0,%xmm1 + sha1rnds4 $1,%xmm2,%xmm0 + sha1nexte 
%xmm6,%xmm1 + pxor %xmm7,%xmm5 + sha1msg2 %xmm4,%xmm5 + sha1msg1 %xmm7,%xmm6 + movdqa %xmm0,%xmm2 + sha1rnds4 $1,%xmm1,%xmm0 + sha1nexte %xmm7,%xmm2 + pxor %xmm4,%xmm6 + sha1msg1 %xmm4,%xmm7 + sha1msg2 %xmm5,%xmm6 + + movdqa %xmm0,%xmm1 + sha1rnds4 $1,%xmm2,%xmm0 + sha1nexte %xmm4,%xmm1 + pxor %xmm5,%xmm7 + sha1msg2 %xmm6,%xmm7 + sha1msg1 %xmm5,%xmm4 + movdqa %xmm0,%xmm2 + sha1rnds4 $1,%xmm1,%xmm0 + sha1nexte %xmm5,%xmm2 + pxor %xmm6,%xmm4 + sha1msg1 %xmm6,%xmm5 + sha1msg2 %xmm7,%xmm4 + + movdqa %xmm0,%xmm1 + sha1rnds4 $1,%xmm2,%xmm0 + sha1nexte %xmm6,%xmm1 + pxor %xmm7,%xmm5 + sha1msg2 %xmm4,%xmm5 + sha1msg1 %xmm7,%xmm6 + movdqa %xmm0,%xmm2 + sha1rnds4 $2,%xmm1,%xmm0 + sha1nexte %xmm7,%xmm2 + pxor %xmm4,%xmm6 + sha1msg1 %xmm4,%xmm7 + sha1msg2 %xmm5,%xmm6 + + movdqa %xmm0,%xmm1 + sha1rnds4 $2,%xmm2,%xmm0 + sha1nexte %xmm4,%xmm1 + pxor %xmm5,%xmm7 + sha1msg2 %xmm6,%xmm7 + sha1msg1 %xmm5,%xmm4 + movdqa %xmm0,%xmm2 + sha1rnds4 $2,%xmm1,%xmm0 + sha1nexte %xmm5,%xmm2 + pxor %xmm6,%xmm4 + sha1msg1 %xmm6,%xmm5 + sha1msg2 %xmm7,%xmm4 + + movdqa %xmm0,%xmm1 + sha1rnds4 $2,%xmm2,%xmm0 + sha1nexte %xmm6,%xmm1 + pxor %xmm7,%xmm5 + sha1msg2 %xmm4,%xmm5 + sha1msg1 %xmm7,%xmm6 + movdqa %xmm0,%xmm2 + sha1rnds4 $2,%xmm1,%xmm0 + sha1nexte %xmm7,%xmm2 + pxor %xmm4,%xmm6 + sha1msg1 %xmm4,%xmm7 + sha1msg2 %xmm5,%xmm6 + + movdqa %xmm0,%xmm1 + sha1rnds4 $3,%xmm2,%xmm0 + sha1nexte %xmm4,%xmm1 + pxor %xmm5,%xmm7 + sha1msg2 %xmm6,%xmm7 + movdqu (%rsi),%xmm4 + movdqa %xmm0,%xmm2 + sha1rnds4 $3,%xmm1,%xmm0 + sha1nexte %xmm5,%xmm2 + movdqu 16(%rsi),%xmm5 + pshufb %xmm3,%xmm4 + + movdqa %xmm0,%xmm1 + sha1rnds4 $3,%xmm2,%xmm0 + sha1nexte %xmm6,%xmm1 + movdqu 32(%rsi),%xmm6 + pshufb %xmm3,%xmm5 + + movdqa %xmm0,%xmm2 + sha1rnds4 $3,%xmm1,%xmm0 + sha1nexte %xmm7,%xmm2 + movdqu 48(%rsi),%xmm7 + pshufb %xmm3,%xmm6 + + movdqa %xmm0,%xmm1 + sha1rnds4 $3,%xmm2,%xmm0 + sha1nexte %xmm9,%xmm1 + pshufb %xmm3,%xmm7 + + paddd %xmm8,%xmm0 + movdqa %xmm1,%xmm9 + + jnz .Loop_shaext + + pshufd $27,%xmm0,%xmm0 + 
pshufd $27,%xmm1,%xmm1 + movdqu %xmm0,(%rdi) + movd %xmm1,16(%rdi) + ret +.cfi_endproc +.size sha1_block_data_order_hw,.-sha1_block_data_order_hw +.globl sha1_block_data_order_ssse3 +.hidden sha1_block_data_order_ssse3 +.type sha1_block_data_order_ssse3,@function +.align 16 +sha1_block_data_order_ssse3: +.cfi_startproc +_CET_ENDBR + movq %rsp,%r11 +.cfi_def_cfa_register %r11 + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + leaq -64(%rsp),%rsp + andq $-64,%rsp + movq %rdi,%r8 + movq %rsi,%r9 + movq %rdx,%r10 + + shlq $6,%r10 + addq %r9,%r10 + leaq K_XX_XX+64(%rip),%r14 + + movl 0(%r8),%eax + movl 4(%r8),%ebx + movl 8(%r8),%ecx + movl 12(%r8),%edx + movl %ebx,%esi + movl 16(%r8),%ebp + movl %ecx,%edi + xorl %edx,%edi + andl %edi,%esi + + movdqa 64(%r14),%xmm6 + movdqa -64(%r14),%xmm9 + movdqu 0(%r9),%xmm0 + movdqu 16(%r9),%xmm1 + movdqu 32(%r9),%xmm2 + movdqu 48(%r9),%xmm3 + pshufb %xmm6,%xmm0 + pshufb %xmm6,%xmm1 + pshufb %xmm6,%xmm2 + addq $64,%r9 + paddd %xmm9,%xmm0 + pshufb %xmm6,%xmm3 + paddd %xmm9,%xmm1 + paddd %xmm9,%xmm2 + movdqa %xmm0,0(%rsp) + psubd %xmm9,%xmm0 + movdqa %xmm1,16(%rsp) + psubd %xmm9,%xmm1 + movdqa %xmm2,32(%rsp) + psubd %xmm9,%xmm2 + jmp .Loop_ssse3 +.align 16 +.Loop_ssse3: + rorl $2,%ebx + pshufd $238,%xmm0,%xmm4 + xorl %edx,%esi + movdqa %xmm3,%xmm8 + paddd %xmm3,%xmm9 + movl %eax,%edi + addl 0(%rsp),%ebp + punpcklqdq %xmm1,%xmm4 + xorl %ecx,%ebx + roll $5,%eax + addl %esi,%ebp + psrldq $4,%xmm8 + andl %ebx,%edi + xorl %ecx,%ebx + pxor %xmm0,%xmm4 + addl %eax,%ebp + rorl $7,%eax + pxor %xmm2,%xmm8 + xorl %ecx,%edi + movl %ebp,%esi + addl 4(%rsp),%edx + pxor %xmm8,%xmm4 + xorl %ebx,%eax + roll $5,%ebp + movdqa %xmm9,48(%rsp) + addl %edi,%edx + andl %eax,%esi + movdqa %xmm4,%xmm10 + xorl %ebx,%eax + addl %ebp,%edx + rorl $7,%ebp + movdqa %xmm4,%xmm8 + xorl %ebx,%esi + pslldq $12,%xmm10 + paddd %xmm4,%xmm4 + movl 
%edx,%edi + addl 8(%rsp),%ecx + psrld $31,%xmm8 + xorl %eax,%ebp + roll $5,%edx + addl %esi,%ecx + movdqa %xmm10,%xmm9 + andl %ebp,%edi + xorl %eax,%ebp + psrld $30,%xmm10 + addl %edx,%ecx + rorl $7,%edx + por %xmm8,%xmm4 + xorl %eax,%edi + movl %ecx,%esi + addl 12(%rsp),%ebx + pslld $2,%xmm9 + pxor %xmm10,%xmm4 + xorl %ebp,%edx + movdqa -64(%r14),%xmm10 + roll $5,%ecx + addl %edi,%ebx + andl %edx,%esi + pxor %xmm9,%xmm4 + xorl %ebp,%edx + addl %ecx,%ebx + rorl $7,%ecx + pshufd $238,%xmm1,%xmm5 + xorl %ebp,%esi + movdqa %xmm4,%xmm9 + paddd %xmm4,%xmm10 + movl %ebx,%edi + addl 16(%rsp),%eax + punpcklqdq %xmm2,%xmm5 + xorl %edx,%ecx + roll $5,%ebx + addl %esi,%eax + psrldq $4,%xmm9 + andl %ecx,%edi + xorl %edx,%ecx + pxor %xmm1,%xmm5 + addl %ebx,%eax + rorl $7,%ebx + pxor %xmm3,%xmm9 + xorl %edx,%edi + movl %eax,%esi + addl 20(%rsp),%ebp + pxor %xmm9,%xmm5 + xorl %ecx,%ebx + roll $5,%eax + movdqa %xmm10,0(%rsp) + addl %edi,%ebp + andl %ebx,%esi + movdqa %xmm5,%xmm8 + xorl %ecx,%ebx + addl %eax,%ebp + rorl $7,%eax + movdqa %xmm5,%xmm9 + xorl %ecx,%esi + pslldq $12,%xmm8 + paddd %xmm5,%xmm5 + movl %ebp,%edi + addl 24(%rsp),%edx + psrld $31,%xmm9 + xorl %ebx,%eax + roll $5,%ebp + addl %esi,%edx + movdqa %xmm8,%xmm10 + andl %eax,%edi + xorl %ebx,%eax + psrld $30,%xmm8 + addl %ebp,%edx + rorl $7,%ebp + por %xmm9,%xmm5 + xorl %ebx,%edi + movl %edx,%esi + addl 28(%rsp),%ecx + pslld $2,%xmm10 + pxor %xmm8,%xmm5 + xorl %eax,%ebp + movdqa -32(%r14),%xmm8 + roll $5,%edx + addl %edi,%ecx + andl %ebp,%esi + pxor %xmm10,%xmm5 + xorl %eax,%ebp + addl %edx,%ecx + rorl $7,%edx + pshufd $238,%xmm2,%xmm6 + xorl %eax,%esi + movdqa %xmm5,%xmm10 + paddd %xmm5,%xmm8 + movl %ecx,%edi + addl 32(%rsp),%ebx + punpcklqdq %xmm3,%xmm6 + xorl %ebp,%edx + roll $5,%ecx + addl %esi,%ebx + psrldq $4,%xmm10 + andl %edx,%edi + xorl %ebp,%edx + pxor %xmm2,%xmm6 + addl %ecx,%ebx + rorl $7,%ecx + pxor %xmm4,%xmm10 + xorl %ebp,%edi + movl %ebx,%esi + addl 36(%rsp),%eax + pxor %xmm10,%xmm6 + xorl %edx,%ecx + 
roll $5,%ebx + movdqa %xmm8,16(%rsp) + addl %edi,%eax + andl %ecx,%esi + movdqa %xmm6,%xmm9 + xorl %edx,%ecx + addl %ebx,%eax + rorl $7,%ebx + movdqa %xmm6,%xmm10 + xorl %edx,%esi + pslldq $12,%xmm9 + paddd %xmm6,%xmm6 + movl %eax,%edi + addl 40(%rsp),%ebp + psrld $31,%xmm10 + xorl %ecx,%ebx + roll $5,%eax + addl %esi,%ebp + movdqa %xmm9,%xmm8 + andl %ebx,%edi + xorl %ecx,%ebx + psrld $30,%xmm9 + addl %eax,%ebp + rorl $7,%eax + por %xmm10,%xmm6 + xorl %ecx,%edi + movl %ebp,%esi + addl 44(%rsp),%edx + pslld $2,%xmm8 + pxor %xmm9,%xmm6 + xorl %ebx,%eax + movdqa -32(%r14),%xmm9 + roll $5,%ebp + addl %edi,%edx + andl %eax,%esi + pxor %xmm8,%xmm6 + xorl %ebx,%eax + addl %ebp,%edx + rorl $7,%ebp + pshufd $238,%xmm3,%xmm7 + xorl %ebx,%esi + movdqa %xmm6,%xmm8 + paddd %xmm6,%xmm9 + movl %edx,%edi + addl 48(%rsp),%ecx + punpcklqdq %xmm4,%xmm7 + xorl %eax,%ebp + roll $5,%edx + addl %esi,%ecx + psrldq $4,%xmm8 + andl %ebp,%edi + xorl %eax,%ebp + pxor %xmm3,%xmm7 + addl %edx,%ecx + rorl $7,%edx + pxor %xmm5,%xmm8 + xorl %eax,%edi + movl %ecx,%esi + addl 52(%rsp),%ebx + pxor %xmm8,%xmm7 + xorl %ebp,%edx + roll $5,%ecx + movdqa %xmm9,32(%rsp) + addl %edi,%ebx + andl %edx,%esi + movdqa %xmm7,%xmm10 + xorl %ebp,%edx + addl %ecx,%ebx + rorl $7,%ecx + movdqa %xmm7,%xmm8 + xorl %ebp,%esi + pslldq $12,%xmm10 + paddd %xmm7,%xmm7 + movl %ebx,%edi + addl 56(%rsp),%eax + psrld $31,%xmm8 + xorl %edx,%ecx + roll $5,%ebx + addl %esi,%eax + movdqa %xmm10,%xmm9 + andl %ecx,%edi + xorl %edx,%ecx + psrld $30,%xmm10 + addl %ebx,%eax + rorl $7,%ebx + por %xmm8,%xmm7 + xorl %edx,%edi + movl %eax,%esi + addl 60(%rsp),%ebp + pslld $2,%xmm9 + pxor %xmm10,%xmm7 + xorl %ecx,%ebx + movdqa -32(%r14),%xmm10 + roll $5,%eax + addl %edi,%ebp + andl %ebx,%esi + pxor %xmm9,%xmm7 + pshufd $238,%xmm6,%xmm9 + xorl %ecx,%ebx + addl %eax,%ebp + rorl $7,%eax + pxor %xmm4,%xmm0 + xorl %ecx,%esi + movl %ebp,%edi + addl 0(%rsp),%edx + punpcklqdq %xmm7,%xmm9 + xorl %ebx,%eax + roll $5,%ebp + pxor %xmm1,%xmm0 + addl 
%esi,%edx + andl %eax,%edi + movdqa %xmm10,%xmm8 + xorl %ebx,%eax + paddd %xmm7,%xmm10 + addl %ebp,%edx + pxor %xmm9,%xmm0 + rorl $7,%ebp + xorl %ebx,%edi + movl %edx,%esi + addl 4(%rsp),%ecx + movdqa %xmm0,%xmm9 + xorl %eax,%ebp + roll $5,%edx + movdqa %xmm10,48(%rsp) + addl %edi,%ecx + andl %ebp,%esi + xorl %eax,%ebp + pslld $2,%xmm0 + addl %edx,%ecx + rorl $7,%edx + psrld $30,%xmm9 + xorl %eax,%esi + movl %ecx,%edi + addl 8(%rsp),%ebx + por %xmm9,%xmm0 + xorl %ebp,%edx + roll $5,%ecx + pshufd $238,%xmm7,%xmm10 + addl %esi,%ebx + andl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 12(%rsp),%eax + xorl %ebp,%edi + movl %ebx,%esi + roll $5,%ebx + addl %edi,%eax + xorl %edx,%esi + rorl $7,%ecx + addl %ebx,%eax + pxor %xmm5,%xmm1 + addl 16(%rsp),%ebp + xorl %ecx,%esi + punpcklqdq %xmm0,%xmm10 + movl %eax,%edi + roll $5,%eax + pxor %xmm2,%xmm1 + addl %esi,%ebp + xorl %ecx,%edi + movdqa %xmm8,%xmm9 + rorl $7,%ebx + paddd %xmm0,%xmm8 + addl %eax,%ebp + pxor %xmm10,%xmm1 + addl 20(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + roll $5,%ebp + movdqa %xmm1,%xmm10 + addl %edi,%edx + xorl %ebx,%esi + movdqa %xmm8,0(%rsp) + rorl $7,%eax + addl %ebp,%edx + addl 24(%rsp),%ecx + pslld $2,%xmm1 + xorl %eax,%esi + movl %edx,%edi + psrld $30,%xmm10 + roll $5,%edx + addl %esi,%ecx + xorl %eax,%edi + rorl $7,%ebp + por %xmm10,%xmm1 + addl %edx,%ecx + addl 28(%rsp),%ebx + pshufd $238,%xmm0,%xmm8 + xorl %ebp,%edi + movl %ecx,%esi + roll $5,%ecx + addl %edi,%ebx + xorl %ebp,%esi + rorl $7,%edx + addl %ecx,%ebx + pxor %xmm6,%xmm2 + addl 32(%rsp),%eax + xorl %edx,%esi + punpcklqdq %xmm1,%xmm8 + movl %ebx,%edi + roll $5,%ebx + pxor %xmm3,%xmm2 + addl %esi,%eax + xorl %edx,%edi + movdqa 0(%r14),%xmm10 + rorl $7,%ecx + paddd %xmm1,%xmm9 + addl %ebx,%eax + pxor %xmm8,%xmm2 + addl 36(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + roll $5,%eax + movdqa %xmm2,%xmm8 + addl %edi,%ebp + xorl %ecx,%esi + movdqa %xmm9,16(%rsp) + rorl $7,%ebx + addl %eax,%ebp + addl 40(%rsp),%edx + pslld $2,%xmm2 
+ xorl %ebx,%esi + movl %ebp,%edi + psrld $30,%xmm8 + roll $5,%ebp + addl %esi,%edx + xorl %ebx,%edi + rorl $7,%eax + por %xmm8,%xmm2 + addl %ebp,%edx + addl 44(%rsp),%ecx + pshufd $238,%xmm1,%xmm9 + xorl %eax,%edi + movl %edx,%esi + roll $5,%edx + addl %edi,%ecx + xorl %eax,%esi + rorl $7,%ebp + addl %edx,%ecx + pxor %xmm7,%xmm3 + addl 48(%rsp),%ebx + xorl %ebp,%esi + punpcklqdq %xmm2,%xmm9 + movl %ecx,%edi + roll $5,%ecx + pxor %xmm4,%xmm3 + addl %esi,%ebx + xorl %ebp,%edi + movdqa %xmm10,%xmm8 + rorl $7,%edx + paddd %xmm2,%xmm10 + addl %ecx,%ebx + pxor %xmm9,%xmm3 + addl 52(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + roll $5,%ebx + movdqa %xmm3,%xmm9 + addl %edi,%eax + xorl %edx,%esi + movdqa %xmm10,32(%rsp) + rorl $7,%ecx + addl %ebx,%eax + addl 56(%rsp),%ebp + pslld $2,%xmm3 + xorl %ecx,%esi + movl %eax,%edi + psrld $30,%xmm9 + roll $5,%eax + addl %esi,%ebp + xorl %ecx,%edi + rorl $7,%ebx + por %xmm9,%xmm3 + addl %eax,%ebp + addl 60(%rsp),%edx + pshufd $238,%xmm2,%xmm10 + xorl %ebx,%edi + movl %ebp,%esi + roll $5,%ebp + addl %edi,%edx + xorl %ebx,%esi + rorl $7,%eax + addl %ebp,%edx + pxor %xmm0,%xmm4 + addl 0(%rsp),%ecx + xorl %eax,%esi + punpcklqdq %xmm3,%xmm10 + movl %edx,%edi + roll $5,%edx + pxor %xmm5,%xmm4 + addl %esi,%ecx + xorl %eax,%edi + movdqa %xmm8,%xmm9 + rorl $7,%ebp + paddd %xmm3,%xmm8 + addl %edx,%ecx + pxor %xmm10,%xmm4 + addl 4(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + roll $5,%ecx + movdqa %xmm4,%xmm10 + addl %edi,%ebx + xorl %ebp,%esi + movdqa %xmm8,48(%rsp) + rorl $7,%edx + addl %ecx,%ebx + addl 8(%rsp),%eax + pslld $2,%xmm4 + xorl %edx,%esi + movl %ebx,%edi + psrld $30,%xmm10 + roll $5,%ebx + addl %esi,%eax + xorl %edx,%edi + rorl $7,%ecx + por %xmm10,%xmm4 + addl %ebx,%eax + addl 12(%rsp),%ebp + pshufd $238,%xmm3,%xmm8 + xorl %ecx,%edi + movl %eax,%esi + roll $5,%eax + addl %edi,%ebp + xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%ebp + pxor %xmm1,%xmm5 + addl 16(%rsp),%edx + xorl %ebx,%esi + punpcklqdq %xmm4,%xmm8 + movl %ebp,%edi 
+ roll $5,%ebp + pxor %xmm6,%xmm5 + addl %esi,%edx + xorl %ebx,%edi + movdqa %xmm9,%xmm10 + rorl $7,%eax + paddd %xmm4,%xmm9 + addl %ebp,%edx + pxor %xmm8,%xmm5 + addl 20(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + roll $5,%edx + movdqa %xmm5,%xmm8 + addl %edi,%ecx + xorl %eax,%esi + movdqa %xmm9,0(%rsp) + rorl $7,%ebp + addl %edx,%ecx + addl 24(%rsp),%ebx + pslld $2,%xmm5 + xorl %ebp,%esi + movl %ecx,%edi + psrld $30,%xmm8 + roll $5,%ecx + addl %esi,%ebx + xorl %ebp,%edi + rorl $7,%edx + por %xmm8,%xmm5 + addl %ecx,%ebx + addl 28(%rsp),%eax + pshufd $238,%xmm4,%xmm9 + rorl $7,%ecx + movl %ebx,%esi + xorl %edx,%edi + roll $5,%ebx + addl %edi,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + pxor %xmm2,%xmm6 + addl 32(%rsp),%ebp + andl %ecx,%esi + xorl %edx,%ecx + rorl $7,%ebx + punpcklqdq %xmm5,%xmm9 + movl %eax,%edi + xorl %ecx,%esi + pxor %xmm7,%xmm6 + roll $5,%eax + addl %esi,%ebp + movdqa %xmm10,%xmm8 + xorl %ebx,%edi + paddd %xmm5,%xmm10 + xorl %ecx,%ebx + pxor %xmm9,%xmm6 + addl %eax,%ebp + addl 36(%rsp),%edx + andl %ebx,%edi + xorl %ecx,%ebx + rorl $7,%eax + movdqa %xmm6,%xmm9 + movl %ebp,%esi + xorl %ebx,%edi + movdqa %xmm10,16(%rsp) + roll $5,%ebp + addl %edi,%edx + xorl %eax,%esi + pslld $2,%xmm6 + xorl %ebx,%eax + addl %ebp,%edx + psrld $30,%xmm9 + addl 40(%rsp),%ecx + andl %eax,%esi + xorl %ebx,%eax + por %xmm9,%xmm6 + rorl $7,%ebp + movl %edx,%edi + xorl %eax,%esi + roll $5,%edx + pshufd $238,%xmm5,%xmm10 + addl %esi,%ecx + xorl %ebp,%edi + xorl %eax,%ebp + addl %edx,%ecx + addl 44(%rsp),%ebx + andl %ebp,%edi + xorl %eax,%ebp + rorl $7,%edx + movl %ecx,%esi + xorl %ebp,%edi + roll $5,%ecx + addl %edi,%ebx + xorl %edx,%esi + xorl %ebp,%edx + addl %ecx,%ebx + pxor %xmm3,%xmm7 + addl 48(%rsp),%eax + andl %edx,%esi + xorl %ebp,%edx + rorl $7,%ecx + punpcklqdq %xmm6,%xmm10 + movl %ebx,%edi + xorl %edx,%esi + pxor %xmm0,%xmm7 + roll $5,%ebx + addl %esi,%eax + movdqa 32(%r14),%xmm9 + xorl %ecx,%edi + paddd %xmm6,%xmm8 + xorl %edx,%ecx + pxor 
%xmm10,%xmm7 + addl %ebx,%eax + addl 52(%rsp),%ebp + andl %ecx,%edi + xorl %edx,%ecx + rorl $7,%ebx + movdqa %xmm7,%xmm10 + movl %eax,%esi + xorl %ecx,%edi + movdqa %xmm8,32(%rsp) + roll $5,%eax + addl %edi,%ebp + xorl %ebx,%esi + pslld $2,%xmm7 + xorl %ecx,%ebx + addl %eax,%ebp + psrld $30,%xmm10 + addl 56(%rsp),%edx + andl %ebx,%esi + xorl %ecx,%ebx + por %xmm10,%xmm7 + rorl $7,%eax + movl %ebp,%edi + xorl %ebx,%esi + roll $5,%ebp + pshufd $238,%xmm6,%xmm8 + addl %esi,%edx + xorl %eax,%edi + xorl %ebx,%eax + addl %ebp,%edx + addl 60(%rsp),%ecx + andl %eax,%edi + xorl %ebx,%eax + rorl $7,%ebp + movl %edx,%esi + xorl %eax,%edi + roll $5,%edx + addl %edi,%ecx + xorl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + pxor %xmm4,%xmm0 + addl 0(%rsp),%ebx + andl %ebp,%esi + xorl %eax,%ebp + rorl $7,%edx + punpcklqdq %xmm7,%xmm8 + movl %ecx,%edi + xorl %ebp,%esi + pxor %xmm1,%xmm0 + roll $5,%ecx + addl %esi,%ebx + movdqa %xmm9,%xmm10 + xorl %edx,%edi + paddd %xmm7,%xmm9 + xorl %ebp,%edx + pxor %xmm8,%xmm0 + addl %ecx,%ebx + addl 4(%rsp),%eax + andl %edx,%edi + xorl %ebp,%edx + rorl $7,%ecx + movdqa %xmm0,%xmm8 + movl %ebx,%esi + xorl %edx,%edi + movdqa %xmm9,48(%rsp) + roll $5,%ebx + addl %edi,%eax + xorl %ecx,%esi + pslld $2,%xmm0 + xorl %edx,%ecx + addl %ebx,%eax + psrld $30,%xmm8 + addl 8(%rsp),%ebp + andl %ecx,%esi + xorl %edx,%ecx + por %xmm8,%xmm0 + rorl $7,%ebx + movl %eax,%edi + xorl %ecx,%esi + roll $5,%eax + pshufd $238,%xmm7,%xmm9 + addl %esi,%ebp + xorl %ebx,%edi + xorl %ecx,%ebx + addl %eax,%ebp + addl 12(%rsp),%edx + andl %ebx,%edi + xorl %ecx,%ebx + rorl $7,%eax + movl %ebp,%esi + xorl %ebx,%edi + roll $5,%ebp + addl %edi,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %ebp,%edx + pxor %xmm5,%xmm1 + addl 16(%rsp),%ecx + andl %eax,%esi + xorl %ebx,%eax + rorl $7,%ebp + punpcklqdq %xmm0,%xmm9 + movl %edx,%edi + xorl %eax,%esi + pxor %xmm2,%xmm1 + roll $5,%edx + addl %esi,%ecx + movdqa %xmm10,%xmm8 + xorl %ebp,%edi + paddd %xmm0,%xmm10 + xorl %eax,%ebp + pxor 
%xmm9,%xmm1 + addl %edx,%ecx + addl 20(%rsp),%ebx + andl %ebp,%edi + xorl %eax,%ebp + rorl $7,%edx + movdqa %xmm1,%xmm9 + movl %ecx,%esi + xorl %ebp,%edi + movdqa %xmm10,0(%rsp) + roll $5,%ecx + addl %edi,%ebx + xorl %edx,%esi + pslld $2,%xmm1 + xorl %ebp,%edx + addl %ecx,%ebx + psrld $30,%xmm9 + addl 24(%rsp),%eax + andl %edx,%esi + xorl %ebp,%edx + por %xmm9,%xmm1 + rorl $7,%ecx + movl %ebx,%edi + xorl %edx,%esi + roll $5,%ebx + pshufd $238,%xmm0,%xmm10 + addl %esi,%eax + xorl %ecx,%edi + xorl %edx,%ecx + addl %ebx,%eax + addl 28(%rsp),%ebp + andl %ecx,%edi + xorl %edx,%ecx + rorl $7,%ebx + movl %eax,%esi + xorl %ecx,%edi + roll $5,%eax + addl %edi,%ebp + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%ebp + pxor %xmm6,%xmm2 + addl 32(%rsp),%edx + andl %ebx,%esi + xorl %ecx,%ebx + rorl $7,%eax + punpcklqdq %xmm1,%xmm10 + movl %ebp,%edi + xorl %ebx,%esi + pxor %xmm3,%xmm2 + roll $5,%ebp + addl %esi,%edx + movdqa %xmm8,%xmm9 + xorl %eax,%edi + paddd %xmm1,%xmm8 + xorl %ebx,%eax + pxor %xmm10,%xmm2 + addl %ebp,%edx + addl 36(%rsp),%ecx + andl %eax,%edi + xorl %ebx,%eax + rorl $7,%ebp + movdqa %xmm2,%xmm10 + movl %edx,%esi + xorl %eax,%edi + movdqa %xmm8,16(%rsp) + roll $5,%edx + addl %edi,%ecx + xorl %ebp,%esi + pslld $2,%xmm2 + xorl %eax,%ebp + addl %edx,%ecx + psrld $30,%xmm10 + addl 40(%rsp),%ebx + andl %ebp,%esi + xorl %eax,%ebp + por %xmm10,%xmm2 + rorl $7,%edx + movl %ecx,%edi + xorl %ebp,%esi + roll $5,%ecx + pshufd $238,%xmm1,%xmm8 + addl %esi,%ebx + xorl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 44(%rsp),%eax + andl %edx,%edi + xorl %ebp,%edx + rorl $7,%ecx + movl %ebx,%esi + xorl %edx,%edi + roll $5,%ebx + addl %edi,%eax + xorl %edx,%esi + addl %ebx,%eax + pxor %xmm7,%xmm3 + addl 48(%rsp),%ebp + xorl %ecx,%esi + punpcklqdq %xmm2,%xmm8 + movl %eax,%edi + roll $5,%eax + pxor %xmm4,%xmm3 + addl %esi,%ebp + xorl %ecx,%edi + movdqa %xmm9,%xmm10 + rorl $7,%ebx + paddd %xmm2,%xmm9 + addl %eax,%ebp + pxor %xmm8,%xmm3 + addl 52(%rsp),%edx + xorl %ebx,%edi + 
movl %ebp,%esi + roll $5,%ebp + movdqa %xmm3,%xmm8 + addl %edi,%edx + xorl %ebx,%esi + movdqa %xmm9,32(%rsp) + rorl $7,%eax + addl %ebp,%edx + addl 56(%rsp),%ecx + pslld $2,%xmm3 + xorl %eax,%esi + movl %edx,%edi + psrld $30,%xmm8 + roll $5,%edx + addl %esi,%ecx + xorl %eax,%edi + rorl $7,%ebp + por %xmm8,%xmm3 + addl %edx,%ecx + addl 60(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + roll $5,%ecx + addl %edi,%ebx + xorl %ebp,%esi + rorl $7,%edx + addl %ecx,%ebx + addl 0(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + roll $5,%ebx + paddd %xmm3,%xmm10 + addl %esi,%eax + xorl %edx,%edi + movdqa %xmm10,48(%rsp) + rorl $7,%ecx + addl %ebx,%eax + addl 4(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + roll $5,%eax + addl %edi,%ebp + xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%ebp + addl 8(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + roll $5,%ebp + addl %esi,%edx + xorl %ebx,%edi + rorl $7,%eax + addl %ebp,%edx + addl 12(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + roll $5,%edx + addl %edi,%ecx + xorl %eax,%esi + rorl $7,%ebp + addl %edx,%ecx + cmpq %r10,%r9 + je .Ldone_ssse3 + movdqa 64(%r14),%xmm6 + movdqa -64(%r14),%xmm9 + movdqu 0(%r9),%xmm0 + movdqu 16(%r9),%xmm1 + movdqu 32(%r9),%xmm2 + movdqu 48(%r9),%xmm3 + pshufb %xmm6,%xmm0 + addq $64,%r9 + addl 16(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + pshufb %xmm6,%xmm1 + roll $5,%ecx + addl %esi,%ebx + xorl %ebp,%edi + rorl $7,%edx + paddd %xmm9,%xmm0 + addl %ecx,%ebx + addl 20(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + movdqa %xmm0,0(%rsp) + roll $5,%ebx + addl %edi,%eax + xorl %edx,%esi + rorl $7,%ecx + psubd %xmm9,%xmm0 + addl %ebx,%eax + addl 24(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + roll $5,%eax + addl %esi,%ebp + xorl %ecx,%edi + rorl $7,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + roll $5,%ebp + addl %edi,%edx + xorl %ebx,%esi + rorl $7,%eax + addl %ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + pshufb %xmm6,%xmm2 + roll $5,%edx + addl %esi,%ecx 
+ xorl %eax,%edi + rorl $7,%ebp + paddd %xmm9,%xmm1 + addl %edx,%ecx + addl 36(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + movdqa %xmm1,16(%rsp) + roll $5,%ecx + addl %edi,%ebx + xorl %ebp,%esi + rorl $7,%edx + psubd %xmm9,%xmm1 + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + roll $5,%ebx + addl %esi,%eax + xorl %edx,%edi + rorl $7,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + roll $5,%eax + addl %edi,%ebp + xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + pshufb %xmm6,%xmm3 + roll $5,%ebp + addl %esi,%edx + xorl %ebx,%edi + rorl $7,%eax + paddd %xmm9,%xmm2 + addl %ebp,%edx + addl 52(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + movdqa %xmm2,32(%rsp) + roll $5,%edx + addl %edi,%ecx + xorl %eax,%esi + rorl $7,%ebp + psubd %xmm9,%xmm2 + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + roll $5,%ecx + addl %esi,%ebx + xorl %ebp,%edi + rorl $7,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + roll $5,%ebx + addl %edi,%eax + rorl $7,%ecx + addl %ebx,%eax + addl 0(%r8),%eax + addl 4(%r8),%esi + addl 8(%r8),%ecx + addl 12(%r8),%edx + movl %eax,0(%r8) + addl 16(%r8),%ebp + movl %esi,4(%r8) + movl %esi,%ebx + movl %ecx,8(%r8) + movl %ecx,%edi + movl %edx,12(%r8) + xorl %edx,%edi + movl %ebp,16(%r8) + andl %edi,%esi + jmp .Loop_ssse3 + +.align 16 +.Ldone_ssse3: + addl 16(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + roll $5,%ecx + addl %esi,%ebx + xorl %ebp,%edi + rorl $7,%edx + addl %ecx,%ebx + addl 20(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + roll $5,%ebx + addl %edi,%eax + xorl %edx,%esi + rorl $7,%ecx + addl %ebx,%eax + addl 24(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + roll $5,%eax + addl %esi,%ebp + xorl %ecx,%edi + rorl $7,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + roll $5,%ebp + addl %edi,%edx + xorl %ebx,%esi + rorl $7,%eax + addl %ebp,%edx + addl 
32(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + roll $5,%edx + addl %esi,%ecx + xorl %eax,%edi + rorl $7,%ebp + addl %edx,%ecx + addl 36(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + roll $5,%ecx + addl %edi,%ebx + xorl %ebp,%esi + rorl $7,%edx + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + roll $5,%ebx + addl %esi,%eax + xorl %edx,%edi + rorl $7,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + roll $5,%eax + addl %edi,%ebp + xorl %ecx,%esi + rorl $7,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + roll $5,%ebp + addl %esi,%edx + xorl %ebx,%edi + rorl $7,%eax + addl %ebp,%edx + addl 52(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + roll $5,%edx + addl %edi,%ecx + xorl %eax,%esi + rorl $7,%ebp + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + roll $5,%ecx + addl %esi,%ebx + xorl %ebp,%edi + rorl $7,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + roll $5,%ebx + addl %edi,%eax + rorl $7,%ecx + addl %ebx,%eax + addl 0(%r8),%eax + addl 4(%r8),%esi + addl 8(%r8),%ecx + movl %eax,0(%r8) + addl 12(%r8),%edx + movl %esi,4(%r8) + addl 16(%r8),%ebp + movl %ecx,8(%r8) + movl %edx,12(%r8) + movl %ebp,16(%r8) + movq -40(%r11),%r14 +.cfi_restore %r14 + movq -32(%r11),%r13 +.cfi_restore %r13 + movq -24(%r11),%r12 +.cfi_restore %r12 + movq -16(%r11),%rbp +.cfi_restore %rbp + movq -8(%r11),%rbx +.cfi_restore %rbx + leaq (%r11),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_ssse3: + ret +.cfi_endproc +.size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3 +.globl sha1_block_data_order_avx +.hidden sha1_block_data_order_avx +.type sha1_block_data_order_avx,@function +.align 16 +sha1_block_data_order_avx: +.cfi_startproc +_CET_ENDBR + movq %rsp,%r11 +.cfi_def_cfa_register %r11 + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 
+.cfi_offset %r14,-48 + leaq -64(%rsp),%rsp + vzeroupper + andq $-64,%rsp + movq %rdi,%r8 + movq %rsi,%r9 + movq %rdx,%r10 + + shlq $6,%r10 + addq %r9,%r10 + leaq K_XX_XX+64(%rip),%r14 + + movl 0(%r8),%eax + movl 4(%r8),%ebx + movl 8(%r8),%ecx + movl 12(%r8),%edx + movl %ebx,%esi + movl 16(%r8),%ebp + movl %ecx,%edi + xorl %edx,%edi + andl %edi,%esi + + vmovdqa 64(%r14),%xmm6 + vmovdqa -64(%r14),%xmm11 + vmovdqu 0(%r9),%xmm0 + vmovdqu 16(%r9),%xmm1 + vmovdqu 32(%r9),%xmm2 + vmovdqu 48(%r9),%xmm3 + vpshufb %xmm6,%xmm0,%xmm0 + addq $64,%r9 + vpshufb %xmm6,%xmm1,%xmm1 + vpshufb %xmm6,%xmm2,%xmm2 + vpshufb %xmm6,%xmm3,%xmm3 + vpaddd %xmm11,%xmm0,%xmm4 + vpaddd %xmm11,%xmm1,%xmm5 + vpaddd %xmm11,%xmm2,%xmm6 + vmovdqa %xmm4,0(%rsp) + vmovdqa %xmm5,16(%rsp) + vmovdqa %xmm6,32(%rsp) + jmp .Loop_avx +.align 16 +.Loop_avx: + shrdl $2,%ebx,%ebx + xorl %edx,%esi + vpalignr $8,%xmm0,%xmm1,%xmm4 + movl %eax,%edi + addl 0(%rsp),%ebp + vpaddd %xmm3,%xmm11,%xmm9 + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrldq $4,%xmm3,%xmm8 + addl %esi,%ebp + andl %ebx,%edi + vpxor %xmm0,%xmm4,%xmm4 + xorl %ecx,%ebx + addl %eax,%ebp + vpxor %xmm2,%xmm8,%xmm8 + shrdl $7,%eax,%eax + xorl %ecx,%edi + movl %ebp,%esi + addl 4(%rsp),%edx + vpxor %xmm8,%xmm4,%xmm4 + xorl %ebx,%eax + shldl $5,%ebp,%ebp + vmovdqa %xmm9,48(%rsp) + addl %edi,%edx + andl %eax,%esi + vpsrld $31,%xmm4,%xmm8 + xorl %ebx,%eax + addl %ebp,%edx + shrdl $7,%ebp,%ebp + xorl %ebx,%esi + vpslldq $12,%xmm4,%xmm10 + vpaddd %xmm4,%xmm4,%xmm4 + movl %edx,%edi + addl 8(%rsp),%ecx + xorl %eax,%ebp + shldl $5,%edx,%edx + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm4,%xmm4 + addl %esi,%ecx + andl %ebp,%edi + xorl %eax,%ebp + addl %edx,%ecx + vpslld $2,%xmm10,%xmm10 + vpxor %xmm9,%xmm4,%xmm4 + shrdl $7,%edx,%edx + xorl %eax,%edi + movl %ecx,%esi + addl 12(%rsp),%ebx + vpxor %xmm10,%xmm4,%xmm4 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + addl %edi,%ebx + andl %edx,%esi + xorl %ebp,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %ebp,%esi + vpalignr 
$8,%xmm1,%xmm2,%xmm5 + movl %ebx,%edi + addl 16(%rsp),%eax + vpaddd %xmm4,%xmm11,%xmm9 + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrldq $4,%xmm4,%xmm8 + addl %esi,%eax + andl %ecx,%edi + vpxor %xmm1,%xmm5,%xmm5 + xorl %edx,%ecx + addl %ebx,%eax + vpxor %xmm3,%xmm8,%xmm8 + shrdl $7,%ebx,%ebx + xorl %edx,%edi + movl %eax,%esi + addl 20(%rsp),%ebp + vpxor %xmm8,%xmm5,%xmm5 + xorl %ecx,%ebx + shldl $5,%eax,%eax + vmovdqa %xmm9,0(%rsp) + addl %edi,%ebp + andl %ebx,%esi + vpsrld $31,%xmm5,%xmm8 + xorl %ecx,%ebx + addl %eax,%ebp + shrdl $7,%eax,%eax + xorl %ecx,%esi + vpslldq $12,%xmm5,%xmm10 + vpaddd %xmm5,%xmm5,%xmm5 + movl %ebp,%edi + addl 24(%rsp),%edx + xorl %ebx,%eax + shldl $5,%ebp,%ebp + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm5,%xmm5 + addl %esi,%edx + andl %eax,%edi + xorl %ebx,%eax + addl %ebp,%edx + vpslld $2,%xmm10,%xmm10 + vpxor %xmm9,%xmm5,%xmm5 + shrdl $7,%ebp,%ebp + xorl %ebx,%edi + movl %edx,%esi + addl 28(%rsp),%ecx + vpxor %xmm10,%xmm5,%xmm5 + xorl %eax,%ebp + shldl $5,%edx,%edx + vmovdqa -32(%r14),%xmm11 + addl %edi,%ecx + andl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + vpalignr $8,%xmm2,%xmm3,%xmm6 + movl %ecx,%edi + addl 32(%rsp),%ebx + vpaddd %xmm5,%xmm11,%xmm9 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + vpsrldq $4,%xmm5,%xmm8 + addl %esi,%ebx + andl %edx,%edi + vpxor %xmm2,%xmm6,%xmm6 + xorl %ebp,%edx + addl %ecx,%ebx + vpxor %xmm4,%xmm8,%xmm8 + shrdl $7,%ecx,%ecx + xorl %ebp,%edi + movl %ebx,%esi + addl 36(%rsp),%eax + vpxor %xmm8,%xmm6,%xmm6 + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vmovdqa %xmm9,16(%rsp) + addl %edi,%eax + andl %ecx,%esi + vpsrld $31,%xmm6,%xmm8 + xorl %edx,%ecx + addl %ebx,%eax + shrdl $7,%ebx,%ebx + xorl %edx,%esi + vpslldq $12,%xmm6,%xmm10 + vpaddd %xmm6,%xmm6,%xmm6 + movl %eax,%edi + addl 40(%rsp),%ebp + xorl %ecx,%ebx + shldl $5,%eax,%eax + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm6,%xmm6 + addl %esi,%ebp + andl %ebx,%edi + xorl %ecx,%ebx + addl %eax,%ebp + vpslld $2,%xmm10,%xmm10 + vpxor 
%xmm9,%xmm6,%xmm6 + shrdl $7,%eax,%eax + xorl %ecx,%edi + movl %ebp,%esi + addl 44(%rsp),%edx + vpxor %xmm10,%xmm6,%xmm6 + xorl %ebx,%eax + shldl $5,%ebp,%ebp + addl %edi,%edx + andl %eax,%esi + xorl %ebx,%eax + addl %ebp,%edx + shrdl $7,%ebp,%ebp + xorl %ebx,%esi + vpalignr $8,%xmm3,%xmm4,%xmm7 + movl %edx,%edi + addl 48(%rsp),%ecx + vpaddd %xmm6,%xmm11,%xmm9 + xorl %eax,%ebp + shldl $5,%edx,%edx + vpsrldq $4,%xmm6,%xmm8 + addl %esi,%ecx + andl %ebp,%edi + vpxor %xmm3,%xmm7,%xmm7 + xorl %eax,%ebp + addl %edx,%ecx + vpxor %xmm5,%xmm8,%xmm8 + shrdl $7,%edx,%edx + xorl %eax,%edi + movl %ecx,%esi + addl 52(%rsp),%ebx + vpxor %xmm8,%xmm7,%xmm7 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + vmovdqa %xmm9,32(%rsp) + addl %edi,%ebx + andl %edx,%esi + vpsrld $31,%xmm7,%xmm8 + xorl %ebp,%edx + addl %ecx,%ebx + shrdl $7,%ecx,%ecx + xorl %ebp,%esi + vpslldq $12,%xmm7,%xmm10 + vpaddd %xmm7,%xmm7,%xmm7 + movl %ebx,%edi + addl 56(%rsp),%eax + xorl %edx,%ecx + shldl $5,%ebx,%ebx + vpsrld $30,%xmm10,%xmm9 + vpor %xmm8,%xmm7,%xmm7 + addl %esi,%eax + andl %ecx,%edi + xorl %edx,%ecx + addl %ebx,%eax + vpslld $2,%xmm10,%xmm10 + vpxor %xmm9,%xmm7,%xmm7 + shrdl $7,%ebx,%ebx + xorl %edx,%edi + movl %eax,%esi + addl 60(%rsp),%ebp + vpxor %xmm10,%xmm7,%xmm7 + xorl %ecx,%ebx + shldl $5,%eax,%eax + addl %edi,%ebp + andl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%ebp + vpalignr $8,%xmm6,%xmm7,%xmm8 + vpxor %xmm4,%xmm0,%xmm0 + shrdl $7,%eax,%eax + xorl %ecx,%esi + movl %ebp,%edi + addl 0(%rsp),%edx + vpxor %xmm1,%xmm0,%xmm0 + xorl %ebx,%eax + shldl $5,%ebp,%ebp + vpaddd %xmm7,%xmm11,%xmm9 + addl %esi,%edx + andl %eax,%edi + vpxor %xmm8,%xmm0,%xmm0 + xorl %ebx,%eax + addl %ebp,%edx + shrdl $7,%ebp,%ebp + xorl %ebx,%edi + vpsrld $30,%xmm0,%xmm8 + vmovdqa %xmm9,48(%rsp) + movl %edx,%esi + addl 4(%rsp),%ecx + xorl %eax,%ebp + shldl $5,%edx,%edx + vpslld $2,%xmm0,%xmm0 + addl %edi,%ecx + andl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + shrdl $7,%edx,%edx + xorl %eax,%esi + movl %ecx,%edi + addl 
8(%rsp),%ebx + vpor %xmm8,%xmm0,%xmm0 + xorl %ebp,%edx + shldl $5,%ecx,%ecx + addl %esi,%ebx + andl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 12(%rsp),%eax + xorl %ebp,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm7,%xmm0,%xmm8 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + vpxor %xmm2,%xmm1,%xmm1 + addl %esi,%ebp + xorl %ecx,%edi + vpaddd %xmm0,%xmm11,%xmm9 + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpxor %xmm8,%xmm1,%xmm1 + addl 20(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + vpsrld $30,%xmm1,%xmm8 + vmovdqa %xmm9,0(%rsp) + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpslld $2,%xmm1,%xmm1 + addl 24(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpor %xmm8,%xmm1,%xmm1 + addl 28(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpalignr $8,%xmm0,%xmm1,%xmm8 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + vpxor %xmm3,%xmm2,%xmm2 + addl %esi,%eax + xorl %edx,%edi + vpaddd %xmm1,%xmm11,%xmm9 + vmovdqa 0(%r14),%xmm11 + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpxor %xmm8,%xmm2,%xmm2 + addl 36(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + vpsrld $30,%xmm2,%xmm8 + vmovdqa %xmm9,16(%rsp) + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpslld $2,%xmm2,%xmm2 + addl 40(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpor %xmm8,%xmm2,%xmm2 + addl 44(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpalignr 
$8,%xmm1,%xmm2,%xmm8 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + vpxor %xmm4,%xmm3,%xmm3 + addl %esi,%ebx + xorl %ebp,%edi + vpaddd %xmm2,%xmm11,%xmm9 + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpxor %xmm8,%xmm3,%xmm3 + addl 52(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + vpsrld $30,%xmm3,%xmm8 + vmovdqa %xmm9,32(%rsp) + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpslld $2,%xmm3,%xmm3 + addl 56(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ecx,%edi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpor %xmm8,%xmm3,%xmm3 + addl 60(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpalignr $8,%xmm2,%xmm3,%xmm8 + vpxor %xmm0,%xmm4,%xmm4 + addl 0(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + vpxor %xmm5,%xmm4,%xmm4 + addl %esi,%ecx + xorl %eax,%edi + vpaddd %xmm3,%xmm11,%xmm9 + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpxor %xmm8,%xmm4,%xmm4 + addl 4(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + vpsrld $30,%xmm4,%xmm8 + vmovdqa %xmm9,48(%rsp) + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpslld $2,%xmm4,%xmm4 + addl 8(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vpor %xmm8,%xmm4,%xmm4 + addl 12(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpalignr $8,%xmm3,%xmm4,%xmm8 + vpxor %xmm1,%xmm5,%xmm5 + addl 16(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + vpxor %xmm6,%xmm5,%xmm5 + addl %esi,%edx + xorl %ebx,%edi + vpaddd %xmm4,%xmm11,%xmm9 + shrdl $7,%eax,%eax + addl %ebp,%edx + vpxor %xmm8,%xmm5,%xmm5 + addl 20(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl 
$5,%edx,%edx + vpsrld $30,%xmm5,%xmm8 + vmovdqa %xmm9,0(%rsp) + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpslld $2,%xmm5,%xmm5 + addl 24(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vpor %xmm8,%xmm5,%xmm5 + addl 28(%rsp),%eax + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%edi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + vpalignr $8,%xmm4,%xmm5,%xmm8 + vpxor %xmm2,%xmm6,%xmm6 + addl 32(%rsp),%ebp + andl %ecx,%esi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + vpxor %xmm7,%xmm6,%xmm6 + movl %eax,%edi + xorl %ecx,%esi + vpaddd %xmm5,%xmm11,%xmm9 + shldl $5,%eax,%eax + addl %esi,%ebp + vpxor %xmm8,%xmm6,%xmm6 + xorl %ebx,%edi + xorl %ecx,%ebx + addl %eax,%ebp + addl 36(%rsp),%edx + vpsrld $30,%xmm6,%xmm8 + vmovdqa %xmm9,16(%rsp) + andl %ebx,%edi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %ebp,%esi + vpslld $2,%xmm6,%xmm6 + xorl %ebx,%edi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %ebp,%edx + addl 40(%rsp),%ecx + andl %eax,%esi + vpor %xmm8,%xmm6,%xmm6 + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + movl %edx,%edi + xorl %eax,%esi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %ebp,%edi + xorl %eax,%ebp + addl %edx,%ecx + addl 44(%rsp),%ebx + andl %ebp,%edi + xorl %eax,%ebp + shrdl $7,%edx,%edx + movl %ecx,%esi + xorl %ebp,%edi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %edx,%esi + xorl %ebp,%edx + addl %ecx,%ebx + vpalignr $8,%xmm5,%xmm6,%xmm8 + vpxor %xmm3,%xmm7,%xmm7 + addl 48(%rsp),%eax + andl %edx,%esi + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + vpxor %xmm0,%xmm7,%xmm7 + movl %ebx,%edi + xorl %edx,%esi + vpaddd %xmm6,%xmm11,%xmm9 + vmovdqa 32(%r14),%xmm11 + shldl $5,%ebx,%ebx + addl %esi,%eax + vpxor %xmm8,%xmm7,%xmm7 + xorl %ecx,%edi + xorl %edx,%ecx + addl %ebx,%eax + addl 52(%rsp),%ebp + vpsrld $30,%xmm7,%xmm8 + vmovdqa %xmm9,32(%rsp) + andl %ecx,%edi + xorl %edx,%ecx + 
shrdl $7,%ebx,%ebx + movl %eax,%esi + vpslld $2,%xmm7,%xmm7 + xorl %ecx,%edi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%ebp + addl 56(%rsp),%edx + andl %ebx,%esi + vpor %xmm8,%xmm7,%xmm7 + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %ebp,%edi + xorl %ebx,%esi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %eax,%edi + xorl %ebx,%eax + addl %ebp,%edx + addl 60(%rsp),%ecx + andl %eax,%edi + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + movl %edx,%esi + xorl %eax,%edi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + vpalignr $8,%xmm6,%xmm7,%xmm8 + vpxor %xmm4,%xmm0,%xmm0 + addl 0(%rsp),%ebx + andl %ebp,%esi + xorl %eax,%ebp + shrdl $7,%edx,%edx + vpxor %xmm1,%xmm0,%xmm0 + movl %ecx,%edi + xorl %ebp,%esi + vpaddd %xmm7,%xmm11,%xmm9 + shldl $5,%ecx,%ecx + addl %esi,%ebx + vpxor %xmm8,%xmm0,%xmm0 + xorl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 4(%rsp),%eax + vpsrld $30,%xmm0,%xmm8 + vmovdqa %xmm9,48(%rsp) + andl %edx,%edi + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + vpslld $2,%xmm0,%xmm0 + xorl %edx,%edi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %ecx,%esi + xorl %edx,%ecx + addl %ebx,%eax + addl 8(%rsp),%ebp + andl %ecx,%esi + vpor %xmm8,%xmm0,%xmm0 + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%edi + xorl %ecx,%esi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ebx,%edi + xorl %ecx,%ebx + addl %eax,%ebp + addl 12(%rsp),%edx + andl %ebx,%edi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + movl %ebp,%esi + xorl %ebx,%edi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %eax,%esi + xorl %ebx,%eax + addl %ebp,%edx + vpalignr $8,%xmm7,%xmm0,%xmm8 + vpxor %xmm5,%xmm1,%xmm1 + addl 16(%rsp),%ecx + andl %eax,%esi + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + vpxor %xmm2,%xmm1,%xmm1 + movl %edx,%edi + xorl %eax,%esi + vpaddd %xmm0,%xmm11,%xmm9 + shldl $5,%edx,%edx + addl %esi,%ecx + vpxor %xmm8,%xmm1,%xmm1 + xorl %ebp,%edi + xorl %eax,%ebp + addl %edx,%ecx + addl 20(%rsp),%ebx + vpsrld 
$30,%xmm1,%xmm8 + vmovdqa %xmm9,0(%rsp) + andl %ebp,%edi + xorl %eax,%ebp + shrdl $7,%edx,%edx + movl %ecx,%esi + vpslld $2,%xmm1,%xmm1 + xorl %ebp,%edi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %edx,%esi + xorl %ebp,%edx + addl %ecx,%ebx + addl 24(%rsp),%eax + andl %edx,%esi + vpor %xmm8,%xmm1,%xmm1 + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%edi + xorl %edx,%esi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %ecx,%edi + xorl %edx,%ecx + addl %ebx,%eax + addl 28(%rsp),%ebp + andl %ecx,%edi + xorl %edx,%ecx + shrdl $7,%ebx,%ebx + movl %eax,%esi + xorl %ecx,%edi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ebx,%esi + xorl %ecx,%ebx + addl %eax,%ebp + vpalignr $8,%xmm0,%xmm1,%xmm8 + vpxor %xmm6,%xmm2,%xmm2 + addl 32(%rsp),%edx + andl %ebx,%esi + xorl %ecx,%ebx + shrdl $7,%eax,%eax + vpxor %xmm3,%xmm2,%xmm2 + movl %ebp,%edi + xorl %ebx,%esi + vpaddd %xmm1,%xmm11,%xmm9 + shldl $5,%ebp,%ebp + addl %esi,%edx + vpxor %xmm8,%xmm2,%xmm2 + xorl %eax,%edi + xorl %ebx,%eax + addl %ebp,%edx + addl 36(%rsp),%ecx + vpsrld $30,%xmm2,%xmm8 + vmovdqa %xmm9,16(%rsp) + andl %eax,%edi + xorl %ebx,%eax + shrdl $7,%ebp,%ebp + movl %edx,%esi + vpslld $2,%xmm2,%xmm2 + xorl %eax,%edi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %ebp,%esi + xorl %eax,%ebp + addl %edx,%ecx + addl 40(%rsp),%ebx + andl %ebp,%esi + vpor %xmm8,%xmm2,%xmm2 + xorl %eax,%ebp + shrdl $7,%edx,%edx + movl %ecx,%edi + xorl %ebp,%esi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %edx,%edi + xorl %ebp,%edx + addl %ecx,%ebx + addl 44(%rsp),%eax + andl %edx,%edi + xorl %ebp,%edx + shrdl $7,%ecx,%ecx + movl %ebx,%esi + xorl %edx,%edi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + addl %ebx,%eax + vpalignr $8,%xmm1,%xmm2,%xmm8 + vpxor %xmm7,%xmm3,%xmm3 + addl 48(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + vpxor %xmm4,%xmm3,%xmm3 + addl %esi,%ebp + xorl %ecx,%edi + vpaddd %xmm2,%xmm11,%xmm9 + shrdl $7,%ebx,%ebx + addl %eax,%ebp + vpxor %xmm8,%xmm3,%xmm3 + addl 52(%rsp),%edx + xorl 
%ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + vpsrld $30,%xmm3,%xmm8 + vmovdqa %xmm9,32(%rsp) + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + vpslld $2,%xmm3,%xmm3 + addl 56(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vpor %xmm8,%xmm3,%xmm3 + addl 60(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 0(%rsp),%eax + vpaddd %xmm3,%xmm11,%xmm9 + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + vmovdqa %xmm9,48(%rsp) + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 4(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 8(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 12(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + cmpq %r10,%r9 + je .Ldone_avx + vmovdqa 64(%r14),%xmm6 + vmovdqa -64(%r14),%xmm11 + vmovdqu 0(%r9),%xmm0 + vmovdqu 16(%r9),%xmm1 + vmovdqu 32(%r9),%xmm2 + vmovdqu 48(%r9),%xmm3 + vpshufb %xmm6,%xmm0,%xmm0 + addq $64,%r9 + addl 16(%rsp),%ebx + xorl %ebp,%esi + vpshufb %xmm6,%xmm1,%xmm1 + movl %ecx,%edi + shldl $5,%ecx,%ecx + vpaddd %xmm11,%xmm0,%xmm4 + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + vmovdqa %xmm4,0(%rsp) + addl 20(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 24(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ecx,%edi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + addl %edi,%edx + 
xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi + vpshufb %xmm6,%xmm2,%xmm2 + movl %edx,%edi + shldl $5,%edx,%edx + vpaddd %xmm11,%xmm1,%xmm5 + addl %esi,%ecx + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + vmovdqa %xmm5,16(%rsp) + addl 36(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi + vpshufb %xmm6,%xmm3,%xmm3 + movl %ebp,%edi + shldl $5,%ebp,%ebp + vpaddd %xmm11,%xmm2,%xmm6 + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + vmovdqa %xmm6,32(%rsp) + addl 52(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 0(%r8),%eax + addl 4(%r8),%esi + addl 8(%r8),%ecx + addl 12(%r8),%edx + movl %eax,0(%r8) + addl 16(%r8),%ebp + movl %esi,4(%r8) + movl %esi,%ebx + movl %ecx,8(%r8) + movl %ecx,%edi + movl %edx,12(%r8) + xorl %edx,%edi + movl %ebp,16(%r8) + andl %edi,%esi + jmp .Loop_avx + +.align 16 +.Ldone_avx: + addl 16(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 20(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + xorl %edx,%esi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 
24(%rsp),%ebp + xorl %ecx,%esi + movl %eax,%edi + shldl $5,%eax,%eax + addl %esi,%ebp + xorl %ecx,%edi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 28(%rsp),%edx + xorl %ebx,%edi + movl %ebp,%esi + shldl $5,%ebp,%ebp + addl %edi,%edx + xorl %ebx,%esi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 32(%rsp),%ecx + xorl %eax,%esi + movl %edx,%edi + shldl $5,%edx,%edx + addl %esi,%ecx + xorl %eax,%edi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + addl 36(%rsp),%ebx + xorl %ebp,%edi + movl %ecx,%esi + shldl $5,%ecx,%ecx + addl %edi,%ebx + xorl %ebp,%esi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 40(%rsp),%eax + xorl %edx,%esi + movl %ebx,%edi + shldl $5,%ebx,%ebx + addl %esi,%eax + xorl %edx,%edi + shrdl $7,%ecx,%ecx + addl %ebx,%eax + addl 44(%rsp),%ebp + xorl %ecx,%edi + movl %eax,%esi + shldl $5,%eax,%eax + addl %edi,%ebp + xorl %ecx,%esi + shrdl $7,%ebx,%ebx + addl %eax,%ebp + addl 48(%rsp),%edx + xorl %ebx,%esi + movl %ebp,%edi + shldl $5,%ebp,%ebp + addl %esi,%edx + xorl %ebx,%edi + shrdl $7,%eax,%eax + addl %ebp,%edx + addl 52(%rsp),%ecx + xorl %eax,%edi + movl %edx,%esi + shldl $5,%edx,%edx + addl %edi,%ecx + xorl %eax,%esi + shrdl $7,%ebp,%ebp + addl %edx,%ecx + addl 56(%rsp),%ebx + xorl %ebp,%esi + movl %ecx,%edi + shldl $5,%ecx,%ecx + addl %esi,%ebx + xorl %ebp,%edi + shrdl $7,%edx,%edx + addl %ecx,%ebx + addl 60(%rsp),%eax + xorl %edx,%edi + movl %ebx,%esi + shldl $5,%ebx,%ebx + addl %edi,%eax + shrdl $7,%ecx,%ecx + addl %ebx,%eax + vzeroupper + + addl 0(%r8),%eax + addl 4(%r8),%esi + addl 8(%r8),%ecx + movl %eax,0(%r8) + addl 12(%r8),%edx + movl %esi,4(%r8) + addl 16(%r8),%ebp + movl %ecx,8(%r8) + movl %edx,12(%r8) + movl %ebp,16(%r8) + movq -40(%r11),%r14 +.cfi_restore %r14 + movq -32(%r11),%r13 +.cfi_restore %r13 + movq -24(%r11),%r12 +.cfi_restore %r12 + movq -16(%r11),%rbp +.cfi_restore %rbp + movq -8(%r11),%rbx +.cfi_restore %rbx + leaq (%r11),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx: + ret +.cfi_endproc +.size 
sha1_block_data_order_avx,.-sha1_block_data_order_avx +.globl sha1_block_data_order_avx2 +.hidden sha1_block_data_order_avx2 +.type sha1_block_data_order_avx2,@function +.align 16 +sha1_block_data_order_avx2: +.cfi_startproc +_CET_ENDBR + movq %rsp,%r11 +.cfi_def_cfa_register %r11 + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + vzeroupper + movq %rdi,%r8 + movq %rsi,%r9 + movq %rdx,%r10 + + leaq -640(%rsp),%rsp + shlq $6,%r10 + leaq 64(%r9),%r13 + andq $-128,%rsp + addq %r9,%r10 + leaq K_XX_XX+64(%rip),%r14 + + movl 0(%r8),%eax + cmpq %r10,%r13 + cmovaeq %r9,%r13 + movl 4(%r8),%ebp + movl 8(%r8),%ecx + movl 12(%r8),%edx + movl 16(%r8),%esi + vmovdqu 64(%r14),%ymm6 + + vmovdqu (%r9),%xmm0 + vmovdqu 16(%r9),%xmm1 + vmovdqu 32(%r9),%xmm2 + vmovdqu 48(%r9),%xmm3 + leaq 64(%r9),%r9 + vinserti128 $1,(%r13),%ymm0,%ymm0 + vinserti128 $1,16(%r13),%ymm1,%ymm1 + vpshufb %ymm6,%ymm0,%ymm0 + vinserti128 $1,32(%r13),%ymm2,%ymm2 + vpshufb %ymm6,%ymm1,%ymm1 + vinserti128 $1,48(%r13),%ymm3,%ymm3 + vpshufb %ymm6,%ymm2,%ymm2 + vmovdqu -64(%r14),%ymm11 + vpshufb %ymm6,%ymm3,%ymm3 + + vpaddd %ymm11,%ymm0,%ymm4 + vpaddd %ymm11,%ymm1,%ymm5 + vmovdqu %ymm4,0(%rsp) + vpaddd %ymm11,%ymm2,%ymm6 + vmovdqu %ymm5,32(%rsp) + vpaddd %ymm11,%ymm3,%ymm7 + vmovdqu %ymm6,64(%rsp) + vmovdqu %ymm7,96(%rsp) + vpalignr $8,%ymm0,%ymm1,%ymm4 + vpsrldq $4,%ymm3,%ymm8 + vpxor %ymm0,%ymm4,%ymm4 + vpxor %ymm2,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $31,%ymm4,%ymm8 + vpslldq $12,%ymm4,%ymm10 + vpaddd %ymm4,%ymm4,%ymm4 + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm4,%ymm4 + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm4,%ymm4 + vpxor %ymm10,%ymm4,%ymm4 + vpaddd %ymm11,%ymm4,%ymm9 + vmovdqu %ymm9,128(%rsp) + vpalignr $8,%ymm1,%ymm2,%ymm5 + vpsrldq $4,%ymm4,%ymm8 + vpxor %ymm1,%ymm5,%ymm5 + vpxor %ymm3,%ymm8,%ymm8 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $31,%ymm5,%ymm8 + vmovdqu 
-32(%r14),%ymm11 + vpslldq $12,%ymm5,%ymm10 + vpaddd %ymm5,%ymm5,%ymm5 + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm5,%ymm5 + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm10,%ymm5,%ymm5 + vpaddd %ymm11,%ymm5,%ymm9 + vmovdqu %ymm9,160(%rsp) + vpalignr $8,%ymm2,%ymm3,%ymm6 + vpsrldq $4,%ymm5,%ymm8 + vpxor %ymm2,%ymm6,%ymm6 + vpxor %ymm4,%ymm8,%ymm8 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $31,%ymm6,%ymm8 + vpslldq $12,%ymm6,%ymm10 + vpaddd %ymm6,%ymm6,%ymm6 + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm6,%ymm6 + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm6,%ymm6 + vpxor %ymm10,%ymm6,%ymm6 + vpaddd %ymm11,%ymm6,%ymm9 + vmovdqu %ymm9,192(%rsp) + vpalignr $8,%ymm3,%ymm4,%ymm7 + vpsrldq $4,%ymm6,%ymm8 + vpxor %ymm3,%ymm7,%ymm7 + vpxor %ymm5,%ymm8,%ymm8 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $31,%ymm7,%ymm8 + vpslldq $12,%ymm7,%ymm10 + vpaddd %ymm7,%ymm7,%ymm7 + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm7,%ymm7 + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm7,%ymm7 + vpxor %ymm10,%ymm7,%ymm7 + vpaddd %ymm11,%ymm7,%ymm9 + vmovdqu %ymm9,224(%rsp) + leaq 128(%rsp),%r13 + jmp .Loop_avx2 +.align 32 +.Loop_avx2: + rorxl $2,%ebp,%ebx + andnl %edx,%ebp,%edi + andl %ecx,%ebp + xorl %edi,%ebp + jmp .Lalign32_1 +.align 32 +.Lalign32_1: + vpalignr $8,%ymm6,%ymm7,%ymm8 + vpxor %ymm4,%ymm0,%ymm0 + addl -128(%r13),%esi + andnl %ecx,%eax,%edi + vpxor %ymm1,%ymm0,%ymm0 + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + vpxor %ymm8,%ymm0,%ymm0 + andl %ebx,%eax + addl %r12d,%esi + xorl %edi,%eax + vpsrld $30,%ymm0,%ymm8 + vpslld $2,%ymm0,%ymm0 + addl -124(%r13),%edx + andnl %ebx,%esi,%edi + addl %eax,%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + andl %ebp,%esi + vpor %ymm8,%ymm0,%ymm0 + addl %r12d,%edx + xorl %edi,%esi + addl -120(%r13),%ecx + andnl %ebp,%edx,%edi + vpaddd %ymm11,%ymm0,%ymm9 + addl %esi,%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + andl %eax,%edx + vmovdqu %ymm9,256(%rsp) + addl %r12d,%ecx + xorl %edi,%edx + addl -116(%r13),%ebx + andnl %eax,%ecx,%edi + 
addl %edx,%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + andl %esi,%ecx + addl %r12d,%ebx + xorl %edi,%ecx + addl -96(%r13),%ebp + andnl %esi,%ebx,%edi + addl %ecx,%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + andl %edx,%ebx + addl %r12d,%ebp + xorl %edi,%ebx + vpalignr $8,%ymm7,%ymm0,%ymm8 + vpxor %ymm5,%ymm1,%ymm1 + addl -92(%r13),%eax + andnl %edx,%ebp,%edi + vpxor %ymm2,%ymm1,%ymm1 + addl %ebx,%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + vpxor %ymm8,%ymm1,%ymm1 + andl %ecx,%ebp + addl %r12d,%eax + xorl %edi,%ebp + vpsrld $30,%ymm1,%ymm8 + vpslld $2,%ymm1,%ymm1 + addl -88(%r13),%esi + andnl %ecx,%eax,%edi + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + vpor %ymm8,%ymm1,%ymm1 + addl %r12d,%esi + xorl %edi,%eax + addl -84(%r13),%edx + andnl %ebx,%esi,%edi + vpaddd %ymm11,%ymm1,%ymm9 + addl %eax,%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + andl %ebp,%esi + vmovdqu %ymm9,288(%rsp) + addl %r12d,%edx + xorl %edi,%esi + addl -64(%r13),%ecx + andnl %ebp,%edx,%edi + addl %esi,%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + andl %eax,%edx + addl %r12d,%ecx + xorl %edi,%edx + addl -60(%r13),%ebx + andnl %eax,%ecx,%edi + addl %edx,%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + andl %esi,%ecx + addl %r12d,%ebx + xorl %edi,%ecx + vpalignr $8,%ymm0,%ymm1,%ymm8 + vpxor %ymm6,%ymm2,%ymm2 + addl -56(%r13),%ebp + andnl %esi,%ebx,%edi + vpxor %ymm3,%ymm2,%ymm2 + vmovdqu 0(%r14),%ymm11 + addl %ecx,%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + vpxor %ymm8,%ymm2,%ymm2 + andl %edx,%ebx + addl %r12d,%ebp + xorl %edi,%ebx + vpsrld $30,%ymm2,%ymm8 + vpslld $2,%ymm2,%ymm2 + addl -52(%r13),%eax + andnl %edx,%ebp,%edi + addl %ebx,%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + andl %ecx,%ebp + vpor %ymm8,%ymm2,%ymm2 + addl %r12d,%eax + xorl %edi,%ebp + addl -32(%r13),%esi + andnl %ecx,%eax,%edi + vpaddd %ymm11,%ymm2,%ymm9 + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + vmovdqu %ymm9,320(%rsp) 
+ addl %r12d,%esi + xorl %edi,%eax + addl -28(%r13),%edx + andnl %ebx,%esi,%edi + addl %eax,%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + andl %ebp,%esi + addl %r12d,%edx + xorl %edi,%esi + addl -24(%r13),%ecx + andnl %ebp,%edx,%edi + addl %esi,%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + andl %eax,%edx + addl %r12d,%ecx + xorl %edi,%edx + vpalignr $8,%ymm1,%ymm2,%ymm8 + vpxor %ymm7,%ymm3,%ymm3 + addl -20(%r13),%ebx + andnl %eax,%ecx,%edi + vpxor %ymm4,%ymm3,%ymm3 + addl %edx,%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + vpxor %ymm8,%ymm3,%ymm3 + andl %esi,%ecx + addl %r12d,%ebx + xorl %edi,%ecx + vpsrld $30,%ymm3,%ymm8 + vpslld $2,%ymm3,%ymm3 + addl 0(%r13),%ebp + andnl %esi,%ebx,%edi + addl %ecx,%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + andl %edx,%ebx + vpor %ymm8,%ymm3,%ymm3 + addl %r12d,%ebp + xorl %edi,%ebx + addl 4(%r13),%eax + andnl %edx,%ebp,%edi + vpaddd %ymm11,%ymm3,%ymm9 + addl %ebx,%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + andl %ecx,%ebp + vmovdqu %ymm9,352(%rsp) + addl %r12d,%eax + xorl %edi,%ebp + addl 8(%r13),%esi + andnl %ecx,%eax,%edi + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + addl %r12d,%esi + xorl %edi,%eax + addl 12(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + vpalignr $8,%ymm2,%ymm3,%ymm8 + vpxor %ymm0,%ymm4,%ymm4 + addl 32(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + vpxor %ymm5,%ymm4,%ymm4 + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + vpxor %ymm8,%ymm4,%ymm4 + addl %r12d,%ecx + xorl %ebp,%edx + addl 36(%r13),%ebx + vpsrld $30,%ymm4,%ymm8 + vpslld $2,%ymm4,%ymm4 + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + vpor %ymm8,%ymm4,%ymm4 + addl 40(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + vpaddd %ymm11,%ymm4,%ymm9 + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx 
+ addl 44(%r13),%eax + vmovdqu %ymm9,384(%rsp) + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl 64(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + vpalignr $8,%ymm3,%ymm4,%ymm8 + vpxor %ymm1,%ymm5,%ymm5 + addl 68(%r13),%edx + leal (%rdx,%rax,1),%edx + vpxor %ymm6,%ymm5,%ymm5 + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + vpxor %ymm8,%ymm5,%ymm5 + addl %r12d,%edx + xorl %ebx,%esi + addl 72(%r13),%ecx + vpsrld $30,%ymm5,%ymm8 + vpslld $2,%ymm5,%ymm5 + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + vpor %ymm8,%ymm5,%ymm5 + addl 76(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + vpaddd %ymm11,%ymm5,%ymm9 + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl 96(%r13),%ebp + vmovdqu %ymm9,416(%rsp) + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl 100(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + vpalignr $8,%ymm4,%ymm5,%ymm8 + vpxor %ymm2,%ymm6,%ymm6 + addl 104(%r13),%esi + leal (%rsi,%rbp,1),%esi + vpxor %ymm7,%ymm6,%ymm6 + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + vpxor %ymm8,%ymm6,%ymm6 + addl %r12d,%esi + xorl %ecx,%eax + addl 108(%r13),%edx + leaq 256(%r13),%r13 + vpsrld $30,%ymm6,%ymm8 + vpslld $2,%ymm6,%ymm6 + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + vpor %ymm8,%ymm6,%ymm6 + addl -128(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + vpaddd %ymm11,%ymm6,%ymm9 + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -124(%r13),%ebx + vmovdqu %ymm9,448(%rsp) + leal 
(%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl -120(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + vpalignr $8,%ymm5,%ymm6,%ymm8 + vpxor %ymm3,%ymm7,%ymm7 + addl -116(%r13),%eax + leal (%rax,%rbx,1),%eax + vpxor %ymm0,%ymm7,%ymm7 + vmovdqu 32(%r14),%ymm11 + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + vpxor %ymm8,%ymm7,%ymm7 + addl %r12d,%eax + xorl %edx,%ebp + addl -96(%r13),%esi + vpsrld $30,%ymm7,%ymm8 + vpslld $2,%ymm7,%ymm7 + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + vpor %ymm8,%ymm7,%ymm7 + addl -92(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + vpaddd %ymm11,%ymm7,%ymm9 + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl -88(%r13),%ecx + vmovdqu %ymm9,480(%rsp) + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -84(%r13),%ebx + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + andl %edi,%ecx + jmp .Lalign32_2 +.align 32 +.Lalign32_2: + vpalignr $8,%ymm6,%ymm7,%ymm8 + vpxor %ymm4,%ymm0,%ymm0 + addl -64(%r13),%ebp + xorl %esi,%ecx + vpxor %ymm1,%ymm0,%ymm0 + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + vpxor %ymm8,%ymm0,%ymm0 + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + vpsrld $30,%ymm0,%ymm8 + vpslld $2,%ymm0,%ymm0 + addl %r12d,%ebp + andl %edi,%ebx + addl -60(%r13),%eax + xorl %edx,%ebx + movl %ecx,%edi + xorl %edx,%edi + vpor %ymm8,%ymm0,%ymm0 + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + vpaddd %ymm11,%ymm0,%ymm9 + addl %r12d,%eax + andl %edi,%ebp + addl -56(%r13),%esi + xorl %ecx,%ebp + vmovdqu %ymm9,512(%rsp) + movl 
%ebx,%edi + xorl %ecx,%edi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + andl %edi,%eax + addl -52(%r13),%edx + xorl %ebx,%eax + movl %ebp,%edi + xorl %ebx,%edi + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + andl %edi,%esi + addl -32(%r13),%ecx + xorl %ebp,%esi + movl %eax,%edi + xorl %ebp,%edi + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + andl %edi,%edx + vpalignr $8,%ymm7,%ymm0,%ymm8 + vpxor %ymm5,%ymm1,%ymm1 + addl -28(%r13),%ebx + xorl %eax,%edx + vpxor %ymm2,%ymm1,%ymm1 + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + vpxor %ymm8,%ymm1,%ymm1 + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + vpsrld $30,%ymm1,%ymm8 + vpslld $2,%ymm1,%ymm1 + addl %r12d,%ebx + andl %edi,%ecx + addl -24(%r13),%ebp + xorl %esi,%ecx + movl %edx,%edi + xorl %esi,%edi + vpor %ymm8,%ymm1,%ymm1 + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + vpaddd %ymm11,%ymm1,%ymm9 + addl %r12d,%ebp + andl %edi,%ebx + addl -20(%r13),%eax + xorl %edx,%ebx + vmovdqu %ymm9,544(%rsp) + movl %ecx,%edi + xorl %edx,%edi + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + andl %edi,%ebp + addl 0(%r13),%esi + xorl %ecx,%ebp + movl %ebx,%edi + xorl %ecx,%edi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + andl %edi,%eax + addl 4(%r13),%edx + xorl %ebx,%eax + movl %ebp,%edi + xorl %ebx,%edi + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + andl %edi,%esi + vpalignr $8,%ymm0,%ymm1,%ymm8 + vpxor %ymm6,%ymm2,%ymm2 + addl 8(%r13),%ecx + xorl %ebp,%esi + vpxor %ymm3,%ymm2,%ymm2 + movl %eax,%edi + xorl %ebp,%edi + leal (%rcx,%rsi,1),%ecx + vpxor %ymm8,%ymm2,%ymm2 + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi 
+ xorl %eax,%edx + vpsrld $30,%ymm2,%ymm8 + vpslld $2,%ymm2,%ymm2 + addl %r12d,%ecx + andl %edi,%edx + addl 12(%r13),%ebx + xorl %eax,%edx + movl %esi,%edi + xorl %eax,%edi + vpor %ymm8,%ymm2,%ymm2 + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + vpaddd %ymm11,%ymm2,%ymm9 + addl %r12d,%ebx + andl %edi,%ecx + addl 32(%r13),%ebp + xorl %esi,%ecx + vmovdqu %ymm9,576(%rsp) + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + andl %edi,%ebx + addl 36(%r13),%eax + xorl %edx,%ebx + movl %ecx,%edi + xorl %edx,%edi + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + andl %edi,%ebp + addl 40(%r13),%esi + xorl %ecx,%ebp + movl %ebx,%edi + xorl %ecx,%edi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + andl %edi,%eax + vpalignr $8,%ymm1,%ymm2,%ymm8 + vpxor %ymm7,%ymm3,%ymm3 + addl 44(%r13),%edx + xorl %ebx,%eax + vpxor %ymm4,%ymm3,%ymm3 + movl %ebp,%edi + xorl %ebx,%edi + leal (%rdx,%rax,1),%edx + vpxor %ymm8,%ymm3,%ymm3 + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + vpsrld $30,%ymm3,%ymm8 + vpslld $2,%ymm3,%ymm3 + addl %r12d,%edx + andl %edi,%esi + addl 64(%r13),%ecx + xorl %ebp,%esi + movl %eax,%edi + xorl %ebp,%edi + vpor %ymm8,%ymm3,%ymm3 + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + vpaddd %ymm11,%ymm3,%ymm9 + addl %r12d,%ecx + andl %edi,%edx + addl 68(%r13),%ebx + xorl %eax,%edx + vmovdqu %ymm9,608(%rsp) + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + andl %edi,%ecx + addl 72(%r13),%ebp + xorl %esi,%ecx + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + andl %edi,%ebx + addl 76(%r13),%eax + xorl 
%edx,%ebx + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl 96(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl 100(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl 104(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl 108(%r13),%ebx + leaq 256(%r13),%r13 + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl -128(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl -124(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl -120(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl -116(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl -96(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -92(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl -88(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl -84(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl -64(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + 
addl %r12d,%esi + xorl %ecx,%eax + addl -60(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl -56(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -52(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl -32(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl -28(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl -24(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl -20(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + addl %r12d,%edx + leaq 128(%r9),%r13 + leaq 128(%r9),%rdi + cmpq %r10,%r13 + cmovaeq %r9,%r13 + + + addl 0(%r8),%edx + addl 4(%r8),%esi + addl 8(%r8),%ebp + movl %edx,0(%r8) + addl 12(%r8),%ebx + movl %esi,4(%r8) + movl %edx,%eax + addl 16(%r8),%ecx + movl %ebp,%r12d + movl %ebp,8(%r8) + movl %ebx,%edx + + movl %ebx,12(%r8) + movl %esi,%ebp + movl %ecx,16(%r8) + + movl %ecx,%esi + movl %r12d,%ecx + + + cmpq %r10,%r9 + je .Ldone_avx2 + vmovdqu 64(%r14),%ymm6 + cmpq %r10,%rdi + ja .Last_avx2 + + vmovdqu -64(%rdi),%xmm0 + vmovdqu -48(%rdi),%xmm1 + vmovdqu -32(%rdi),%xmm2 + vmovdqu -16(%rdi),%xmm3 + vinserti128 $1,0(%r13),%ymm0,%ymm0 + vinserti128 $1,16(%r13),%ymm1,%ymm1 + vinserti128 $1,32(%r13),%ymm2,%ymm2 + vinserti128 $1,48(%r13),%ymm3,%ymm3 + jmp .Last_avx2 + +.align 32 +.Last_avx2: + leaq 128+16(%rsp),%r13 + rorxl $2,%ebp,%ebx + andnl %edx,%ebp,%edi + andl %ecx,%ebp + xorl %edi,%ebp + subq $-128,%r9 + addl -128(%r13),%esi + andnl %ecx,%eax,%edi + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl 
%ebx,%eax + addl %r12d,%esi + xorl %edi,%eax + addl -124(%r13),%edx + andnl %ebx,%esi,%edi + addl %eax,%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + andl %ebp,%esi + addl %r12d,%edx + xorl %edi,%esi + addl -120(%r13),%ecx + andnl %ebp,%edx,%edi + addl %esi,%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + andl %eax,%edx + addl %r12d,%ecx + xorl %edi,%edx + addl -116(%r13),%ebx + andnl %eax,%ecx,%edi + addl %edx,%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + andl %esi,%ecx + addl %r12d,%ebx + xorl %edi,%ecx + addl -96(%r13),%ebp + andnl %esi,%ebx,%edi + addl %ecx,%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + andl %edx,%ebx + addl %r12d,%ebp + xorl %edi,%ebx + addl -92(%r13),%eax + andnl %edx,%ebp,%edi + addl %ebx,%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + andl %ecx,%ebp + addl %r12d,%eax + xorl %edi,%ebp + addl -88(%r13),%esi + andnl %ecx,%eax,%edi + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + addl %r12d,%esi + xorl %edi,%eax + addl -84(%r13),%edx + andnl %ebx,%esi,%edi + addl %eax,%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + andl %ebp,%esi + addl %r12d,%edx + xorl %edi,%esi + addl -64(%r13),%ecx + andnl %ebp,%edx,%edi + addl %esi,%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + andl %eax,%edx + addl %r12d,%ecx + xorl %edi,%edx + addl -60(%r13),%ebx + andnl %eax,%ecx,%edi + addl %edx,%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + andl %esi,%ecx + addl %r12d,%ebx + xorl %edi,%ecx + addl -56(%r13),%ebp + andnl %esi,%ebx,%edi + addl %ecx,%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + andl %edx,%ebx + addl %r12d,%ebp + xorl %edi,%ebx + addl -52(%r13),%eax + andnl %edx,%ebp,%edi + addl %ebx,%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + andl %ecx,%ebp + addl %r12d,%eax + xorl %edi,%ebp + addl -32(%r13),%esi + andnl %ecx,%eax,%edi + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + addl %r12d,%esi + xorl %edi,%eax + addl -28(%r13),%edx + andnl %ebx,%esi,%edi + addl 
%eax,%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + andl %ebp,%esi + addl %r12d,%edx + xorl %edi,%esi + addl -24(%r13),%ecx + andnl %ebp,%edx,%edi + addl %esi,%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + andl %eax,%edx + addl %r12d,%ecx + xorl %edi,%edx + addl -20(%r13),%ebx + andnl %eax,%ecx,%edi + addl %edx,%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + andl %esi,%ecx + addl %r12d,%ebx + xorl %edi,%ecx + addl 0(%r13),%ebp + andnl %esi,%ebx,%edi + addl %ecx,%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + andl %edx,%ebx + addl %r12d,%ebp + xorl %edi,%ebx + addl 4(%r13),%eax + andnl %edx,%ebp,%edi + addl %ebx,%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + andl %ecx,%ebp + addl %r12d,%eax + xorl %edi,%ebp + addl 8(%r13),%esi + andnl %ecx,%eax,%edi + addl %ebp,%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + andl %ebx,%eax + addl %r12d,%esi + xorl %edi,%eax + addl 12(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl 32(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl 36(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl 40(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl 44(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl 64(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + vmovdqu -64(%r14),%ymm11 + vpshufb %ymm6,%ymm0,%ymm0 + addl 68(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl 72(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + 
rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl 76(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl 96(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl 100(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + vpshufb %ymm6,%ymm1,%ymm1 + vpaddd %ymm11,%ymm0,%ymm8 + addl 104(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl 108(%r13),%edx + leaq 256(%r13),%r13 + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl -128(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -124(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl -120(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + vmovdqu %ymm8,0(%rsp) + vpshufb %ymm6,%ymm2,%ymm2 + vpaddd %ymm11,%ymm1,%ymm9 + addl -116(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl -96(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl -92(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + addl -88(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl 
-84(%r13),%ebx + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + andl %edi,%ecx + vmovdqu %ymm9,32(%rsp) + vpshufb %ymm6,%ymm3,%ymm3 + vpaddd %ymm11,%ymm2,%ymm6 + addl -64(%r13),%ebp + xorl %esi,%ecx + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + andl %edi,%ebx + addl -60(%r13),%eax + xorl %edx,%ebx + movl %ecx,%edi + xorl %edx,%edi + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + andl %edi,%ebp + addl -56(%r13),%esi + xorl %ecx,%ebp + movl %ebx,%edi + xorl %ecx,%edi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + andl %edi,%eax + addl -52(%r13),%edx + xorl %ebx,%eax + movl %ebp,%edi + xorl %ebx,%edi + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + andl %edi,%esi + addl -32(%r13),%ecx + xorl %ebp,%esi + movl %eax,%edi + xorl %ebp,%edi + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + andl %edi,%edx + jmp .Lalign32_3 +.align 32 +.Lalign32_3: + vmovdqu %ymm6,64(%rsp) + vpaddd %ymm11,%ymm3,%ymm7 + addl -28(%r13),%ebx + xorl %eax,%edx + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + andl %edi,%ecx + addl -24(%r13),%ebp + xorl %esi,%ecx + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + andl %edi,%ebx + addl -20(%r13),%eax + xorl %edx,%ebx + movl %ecx,%edi + xorl %edx,%edi + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + andl %edi,%ebp + addl 0(%r13),%esi + xorl %ecx,%ebp + movl %ebx,%edi + xorl %ecx,%edi + leal 
(%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + andl %edi,%eax + addl 4(%r13),%edx + xorl %ebx,%eax + movl %ebp,%edi + xorl %ebx,%edi + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + andl %edi,%esi + vmovdqu %ymm7,96(%rsp) + addl 8(%r13),%ecx + xorl %ebp,%esi + movl %eax,%edi + xorl %ebp,%edi + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + andl %edi,%edx + addl 12(%r13),%ebx + xorl %eax,%edx + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + andl %edi,%ecx + addl 32(%r13),%ebp + xorl %esi,%ecx + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + andl %edi,%ebx + addl 36(%r13),%eax + xorl %edx,%ebx + movl %ecx,%edi + xorl %edx,%edi + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + andl %edi,%ebp + addl 40(%r13),%esi + xorl %ecx,%ebp + movl %ebx,%edi + xorl %ecx,%edi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + andl %edi,%eax + vpalignr $8,%ymm0,%ymm1,%ymm4 + addl 44(%r13),%edx + xorl %ebx,%eax + movl %ebp,%edi + xorl %ebx,%edi + vpsrldq $4,%ymm3,%ymm8 + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + vpxor %ymm0,%ymm4,%ymm4 + vpxor %ymm2,%ymm8,%ymm8 + xorl %ebp,%esi + addl %r12d,%edx + vpxor %ymm8,%ymm4,%ymm4 + andl %edi,%esi + addl 64(%r13),%ecx + xorl %ebp,%esi + movl %eax,%edi + vpsrld $31,%ymm4,%ymm8 + xorl %ebp,%edi + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + vpslldq $12,%ymm4,%ymm10 + vpaddd %ymm4,%ymm4,%ymm4 + rorxl $2,%edx,%esi + xorl %eax,%edx + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm4,%ymm4 + addl %r12d,%ecx + andl %edi,%edx + vpslld $2,%ymm10,%ymm10 + vpxor 
%ymm9,%ymm4,%ymm4 + addl 68(%r13),%ebx + xorl %eax,%edx + vpxor %ymm10,%ymm4,%ymm4 + movl %esi,%edi + xorl %eax,%edi + leal (%rbx,%rdx,1),%ebx + vpaddd %ymm11,%ymm4,%ymm9 + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + vmovdqu %ymm9,128(%rsp) + addl %r12d,%ebx + andl %edi,%ecx + addl 72(%r13),%ebp + xorl %esi,%ecx + movl %edx,%edi + xorl %esi,%edi + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + andl %edi,%ebx + addl 76(%r13),%eax + xorl %edx,%ebx + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + vpalignr $8,%ymm1,%ymm2,%ymm5 + addl 96(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + vpsrldq $4,%ymm4,%ymm8 + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + vpxor %ymm1,%ymm5,%ymm5 + vpxor %ymm3,%ymm8,%ymm8 + addl 100(%r13),%edx + leal (%rdx,%rax,1),%edx + vpxor %ymm8,%ymm5,%ymm5 + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + xorl %ebp,%esi + addl %r12d,%edx + vpsrld $31,%ymm5,%ymm8 + vmovdqu -32(%r14),%ymm11 + xorl %ebx,%esi + addl 104(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + vpslldq $12,%ymm5,%ymm10 + vpaddd %ymm5,%ymm5,%ymm5 + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm5,%ymm5 + xorl %eax,%edx + addl %r12d,%ecx + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm5,%ymm5 + xorl %ebp,%edx + addl 108(%r13),%ebx + leaq 256(%r13),%r13 + vpxor %ymm10,%ymm5,%ymm5 + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + vpaddd %ymm11,%ymm5,%ymm9 + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + vmovdqu %ymm9,160(%rsp) + addl -128(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + vpalignr $8,%ymm2,%ymm3,%ymm6 + addl -124(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + vpsrldq $4,%ymm5,%ymm8 + xorl %ecx,%ebp + addl %r12d,%eax 
+ xorl %edx,%ebp + vpxor %ymm2,%ymm6,%ymm6 + vpxor %ymm4,%ymm8,%ymm8 + addl -120(%r13),%esi + leal (%rsi,%rbp,1),%esi + vpxor %ymm8,%ymm6,%ymm6 + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + vpsrld $31,%ymm6,%ymm8 + xorl %ecx,%eax + addl -116(%r13),%edx + leal (%rdx,%rax,1),%edx + vpslldq $12,%ymm6,%ymm10 + vpaddd %ymm6,%ymm6,%ymm6 + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm6,%ymm6 + xorl %ebp,%esi + addl %r12d,%edx + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm6,%ymm6 + xorl %ebx,%esi + addl -96(%r13),%ecx + vpxor %ymm10,%ymm6,%ymm6 + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl $2,%edx,%esi + vpaddd %ymm11,%ymm6,%ymm9 + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + vmovdqu %ymm9,192(%rsp) + addl -92(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + vpalignr $8,%ymm3,%ymm4,%ymm7 + addl -88(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + vpsrldq $4,%ymm6,%ymm8 + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + vpxor %ymm3,%ymm7,%ymm7 + vpxor %ymm5,%ymm8,%ymm8 + addl -84(%r13),%eax + leal (%rax,%rbx,1),%eax + vpxor %ymm8,%ymm7,%ymm7 + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + vpsrld $31,%ymm7,%ymm8 + xorl %edx,%ebp + addl -64(%r13),%esi + leal (%rsi,%rbp,1),%esi + vpslldq $12,%ymm7,%ymm10 + vpaddd %ymm7,%ymm7,%ymm7 + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + vpsrld $30,%ymm10,%ymm9 + vpor %ymm8,%ymm7,%ymm7 + xorl %ebx,%eax + addl %r12d,%esi + vpslld $2,%ymm10,%ymm10 + vpxor %ymm9,%ymm7,%ymm7 + xorl %ecx,%eax + addl -60(%r13),%edx + vpxor %ymm10,%ymm7,%ymm7 + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + rorxl $2,%esi,%eax + vpaddd %ymm11,%ymm7,%ymm9 + xorl %ebp,%esi + addl %r12d,%edx + xorl %ebx,%esi + vmovdqu %ymm9,224(%rsp) + addl -56(%r13),%ecx + leal (%rcx,%rsi,1),%ecx + rorxl $27,%edx,%r12d + rorxl 
$2,%edx,%esi + xorl %eax,%edx + addl %r12d,%ecx + xorl %ebp,%edx + addl -52(%r13),%ebx + leal (%rbx,%rdx,1),%ebx + rorxl $27,%ecx,%r12d + rorxl $2,%ecx,%edx + xorl %esi,%ecx + addl %r12d,%ebx + xorl %eax,%ecx + addl -32(%r13),%ebp + leal (%rcx,%rbp,1),%ebp + rorxl $27,%ebx,%r12d + rorxl $2,%ebx,%ecx + xorl %edx,%ebx + addl %r12d,%ebp + xorl %esi,%ebx + addl -28(%r13),%eax + leal (%rax,%rbx,1),%eax + rorxl $27,%ebp,%r12d + rorxl $2,%ebp,%ebx + xorl %ecx,%ebp + addl %r12d,%eax + xorl %edx,%ebp + addl -24(%r13),%esi + leal (%rsi,%rbp,1),%esi + rorxl $27,%eax,%r12d + rorxl $2,%eax,%ebp + xorl %ebx,%eax + addl %r12d,%esi + xorl %ecx,%eax + addl -20(%r13),%edx + leal (%rdx,%rax,1),%edx + rorxl $27,%esi,%r12d + addl %r12d,%edx + leaq 128(%rsp),%r13 + + + addl 0(%r8),%edx + addl 4(%r8),%esi + addl 8(%r8),%ebp + movl %edx,0(%r8) + addl 12(%r8),%ebx + movl %esi,4(%r8) + movl %edx,%eax + addl 16(%r8),%ecx + movl %ebp,%r12d + movl %ebp,8(%r8) + movl %ebx,%edx + + movl %ebx,12(%r8) + movl %esi,%ebp + movl %ecx,16(%r8) + + movl %ecx,%esi + movl %r12d,%ecx + + + cmpq %r10,%r9 + jbe .Loop_avx2 + +.Ldone_avx2: + vzeroupper + movq -40(%r11),%r14 +.cfi_restore %r14 + movq -32(%r11),%r13 +.cfi_restore %r13 + movq -24(%r11),%r12 +.cfi_restore %r12 + movq -16(%r11),%rbp +.cfi_restore %rbp + movq -8(%r11),%rbx +.cfi_restore %rbx + leaq (%r11),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx2: + ret +.cfi_endproc +.size sha1_block_data_order_avx2,.-sha1_block_data_order_avx2 +.section .rodata +.align 64 +K_XX_XX: +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 +.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 +.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 +.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc +.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc +.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 +.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 
0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 +.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 64 +.text +#endif diff --git a/third_party/boringssl/gen/bcm/sha1-x86_64-win.asm b/third_party/boringssl/gen/bcm/sha1-x86_64-win.asm new file mode 100644 index 00000000..128fc376 --- /dev/null +++ b/third_party/boringssl/gen/bcm/sha1-x86_64-win.asm @@ -0,0 +1,5768 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_internal_x86_64_win_asm.inc" +%endif +section .text code align=64 + + +global sha1_block_data_order_nohw + +ALIGN 16 +sha1_block_data_order_nohw: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sha1_block_data_order_nohw: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + mov rax,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + mov r8,rdi + sub rsp,72 + mov r9,rsi + and rsp,-64 + mov r10,rdx + mov QWORD[64+rsp],rax + +$L$prologue: + + mov esi,DWORD[r8] + mov edi,DWORD[4+r8] + mov r11d,DWORD[8+r8] + mov r12d,DWORD[12+r8] + mov r13d,DWORD[16+r8] + jmp NEAR $L$loop + +ALIGN 16 +$L$loop: + mov edx,DWORD[r9] + bswap edx + mov ebp,DWORD[4+r9] + mov eax,r12d + mov DWORD[rsp],edx + mov ecx,esi + bswap ebp + xor eax,r11d + rol ecx,5 + and eax,edi + lea r13d,[1518500249+r13*1+rdx] + add r13d,ecx + xor eax,r12d + rol edi,30 + add r13d,eax + mov r14d,DWORD[8+r9] + mov eax,r11d + mov DWORD[4+rsp],ebp + mov ecx,r13d + bswap r14d + xor eax,edi + rol ecx,5 + and eax,esi + lea 
r12d,[1518500249+r12*1+rbp] + add r12d,ecx + xor eax,r11d + rol esi,30 + add r12d,eax + mov edx,DWORD[12+r9] + mov eax,edi + mov DWORD[8+rsp],r14d + mov ecx,r12d + bswap edx + xor eax,esi + rol ecx,5 + and eax,r13d + lea r11d,[1518500249+r11*1+r14] + add r11d,ecx + xor eax,edi + rol r13d,30 + add r11d,eax + mov ebp,DWORD[16+r9] + mov eax,esi + mov DWORD[12+rsp],edx + mov ecx,r11d + bswap ebp + xor eax,r13d + rol ecx,5 + and eax,r12d + lea edi,[1518500249+rdi*1+rdx] + add edi,ecx + xor eax,esi + rol r12d,30 + add edi,eax + mov r14d,DWORD[20+r9] + mov eax,r13d + mov DWORD[16+rsp],ebp + mov ecx,edi + bswap r14d + xor eax,r12d + rol ecx,5 + and eax,r11d + lea esi,[1518500249+rsi*1+rbp] + add esi,ecx + xor eax,r13d + rol r11d,30 + add esi,eax + mov edx,DWORD[24+r9] + mov eax,r12d + mov DWORD[20+rsp],r14d + mov ecx,esi + bswap edx + xor eax,r11d + rol ecx,5 + and eax,edi + lea r13d,[1518500249+r13*1+r14] + add r13d,ecx + xor eax,r12d + rol edi,30 + add r13d,eax + mov ebp,DWORD[28+r9] + mov eax,r11d + mov DWORD[24+rsp],edx + mov ecx,r13d + bswap ebp + xor eax,edi + rol ecx,5 + and eax,esi + lea r12d,[1518500249+r12*1+rdx] + add r12d,ecx + xor eax,r11d + rol esi,30 + add r12d,eax + mov r14d,DWORD[32+r9] + mov eax,edi + mov DWORD[28+rsp],ebp + mov ecx,r12d + bswap r14d + xor eax,esi + rol ecx,5 + and eax,r13d + lea r11d,[1518500249+r11*1+rbp] + add r11d,ecx + xor eax,edi + rol r13d,30 + add r11d,eax + mov edx,DWORD[36+r9] + mov eax,esi + mov DWORD[32+rsp],r14d + mov ecx,r11d + bswap edx + xor eax,r13d + rol ecx,5 + and eax,r12d + lea edi,[1518500249+rdi*1+r14] + add edi,ecx + xor eax,esi + rol r12d,30 + add edi,eax + mov ebp,DWORD[40+r9] + mov eax,r13d + mov DWORD[36+rsp],edx + mov ecx,edi + bswap ebp + xor eax,r12d + rol ecx,5 + and eax,r11d + lea esi,[1518500249+rsi*1+rdx] + add esi,ecx + xor eax,r13d + rol r11d,30 + add esi,eax + mov r14d,DWORD[44+r9] + mov eax,r12d + mov DWORD[40+rsp],ebp + mov ecx,esi + bswap r14d + xor eax,r11d + rol ecx,5 + and eax,edi + lea 
r13d,[1518500249+r13*1+rbp] + add r13d,ecx + xor eax,r12d + rol edi,30 + add r13d,eax + mov edx,DWORD[48+r9] + mov eax,r11d + mov DWORD[44+rsp],r14d + mov ecx,r13d + bswap edx + xor eax,edi + rol ecx,5 + and eax,esi + lea r12d,[1518500249+r12*1+r14] + add r12d,ecx + xor eax,r11d + rol esi,30 + add r12d,eax + mov ebp,DWORD[52+r9] + mov eax,edi + mov DWORD[48+rsp],edx + mov ecx,r12d + bswap ebp + xor eax,esi + rol ecx,5 + and eax,r13d + lea r11d,[1518500249+r11*1+rdx] + add r11d,ecx + xor eax,edi + rol r13d,30 + add r11d,eax + mov r14d,DWORD[56+r9] + mov eax,esi + mov DWORD[52+rsp],ebp + mov ecx,r11d + bswap r14d + xor eax,r13d + rol ecx,5 + and eax,r12d + lea edi,[1518500249+rdi*1+rbp] + add edi,ecx + xor eax,esi + rol r12d,30 + add edi,eax + mov edx,DWORD[60+r9] + mov eax,r13d + mov DWORD[56+rsp],r14d + mov ecx,edi + bswap edx + xor eax,r12d + rol ecx,5 + and eax,r11d + lea esi,[1518500249+rsi*1+r14] + add esi,ecx + xor eax,r13d + rol r11d,30 + add esi,eax + xor ebp,DWORD[rsp] + mov eax,r12d + mov DWORD[60+rsp],edx + mov ecx,esi + xor ebp,DWORD[8+rsp] + xor eax,r11d + rol ecx,5 + xor ebp,DWORD[32+rsp] + and eax,edi + lea r13d,[1518500249+r13*1+rdx] + rol edi,30 + xor eax,r12d + add r13d,ecx + rol ebp,1 + add r13d,eax + xor r14d,DWORD[4+rsp] + mov eax,r11d + mov DWORD[rsp],ebp + mov ecx,r13d + xor r14d,DWORD[12+rsp] + xor eax,edi + rol ecx,5 + xor r14d,DWORD[36+rsp] + and eax,esi + lea r12d,[1518500249+r12*1+rbp] + rol esi,30 + xor eax,r11d + add r12d,ecx + rol r14d,1 + add r12d,eax + xor edx,DWORD[8+rsp] + mov eax,edi + mov DWORD[4+rsp],r14d + mov ecx,r12d + xor edx,DWORD[16+rsp] + xor eax,esi + rol ecx,5 + xor edx,DWORD[40+rsp] + and eax,r13d + lea r11d,[1518500249+r11*1+r14] + rol r13d,30 + xor eax,edi + add r11d,ecx + rol edx,1 + add r11d,eax + xor ebp,DWORD[12+rsp] + mov eax,esi + mov DWORD[8+rsp],edx + mov ecx,r11d + xor ebp,DWORD[20+rsp] + xor eax,r13d + rol ecx,5 + xor ebp,DWORD[44+rsp] + and eax,r12d + lea edi,[1518500249+rdi*1+rdx] + rol r12d,30 + xor 
eax,esi + add edi,ecx + rol ebp,1 + add edi,eax + xor r14d,DWORD[16+rsp] + mov eax,r13d + mov DWORD[12+rsp],ebp + mov ecx,edi + xor r14d,DWORD[24+rsp] + xor eax,r12d + rol ecx,5 + xor r14d,DWORD[48+rsp] + and eax,r11d + lea esi,[1518500249+rsi*1+rbp] + rol r11d,30 + xor eax,r13d + add esi,ecx + rol r14d,1 + add esi,eax + xor edx,DWORD[20+rsp] + mov eax,edi + mov DWORD[16+rsp],r14d + mov ecx,esi + xor edx,DWORD[28+rsp] + xor eax,r12d + rol ecx,5 + xor edx,DWORD[52+rsp] + lea r13d,[1859775393+r13*1+r14] + xor eax,r11d + add r13d,ecx + rol edi,30 + add r13d,eax + rol edx,1 + xor ebp,DWORD[24+rsp] + mov eax,esi + mov DWORD[20+rsp],edx + mov ecx,r13d + xor ebp,DWORD[32+rsp] + xor eax,r11d + rol ecx,5 + xor ebp,DWORD[56+rsp] + lea r12d,[1859775393+r12*1+rdx] + xor eax,edi + add r12d,ecx + rol esi,30 + add r12d,eax + rol ebp,1 + xor r14d,DWORD[28+rsp] + mov eax,r13d + mov DWORD[24+rsp],ebp + mov ecx,r12d + xor r14d,DWORD[36+rsp] + xor eax,edi + rol ecx,5 + xor r14d,DWORD[60+rsp] + lea r11d,[1859775393+r11*1+rbp] + xor eax,esi + add r11d,ecx + rol r13d,30 + add r11d,eax + rol r14d,1 + xor edx,DWORD[32+rsp] + mov eax,r12d + mov DWORD[28+rsp],r14d + mov ecx,r11d + xor edx,DWORD[40+rsp] + xor eax,esi + rol ecx,5 + xor edx,DWORD[rsp] + lea edi,[1859775393+rdi*1+r14] + xor eax,r13d + add edi,ecx + rol r12d,30 + add edi,eax + rol edx,1 + xor ebp,DWORD[36+rsp] + mov eax,r11d + mov DWORD[32+rsp],edx + mov ecx,edi + xor ebp,DWORD[44+rsp] + xor eax,r13d + rol ecx,5 + xor ebp,DWORD[4+rsp] + lea esi,[1859775393+rsi*1+rdx] + xor eax,r12d + add esi,ecx + rol r11d,30 + add esi,eax + rol ebp,1 + xor r14d,DWORD[40+rsp] + mov eax,edi + mov DWORD[36+rsp],ebp + mov ecx,esi + xor r14d,DWORD[48+rsp] + xor eax,r12d + rol ecx,5 + xor r14d,DWORD[8+rsp] + lea r13d,[1859775393+r13*1+rbp] + xor eax,r11d + add r13d,ecx + rol edi,30 + add r13d,eax + rol r14d,1 + xor edx,DWORD[44+rsp] + mov eax,esi + mov DWORD[40+rsp],r14d + mov ecx,r13d + xor edx,DWORD[52+rsp] + xor eax,r11d + rol ecx,5 + xor 
edx,DWORD[12+rsp] + lea r12d,[1859775393+r12*1+r14] + xor eax,edi + add r12d,ecx + rol esi,30 + add r12d,eax + rol edx,1 + xor ebp,DWORD[48+rsp] + mov eax,r13d + mov DWORD[44+rsp],edx + mov ecx,r12d + xor ebp,DWORD[56+rsp] + xor eax,edi + rol ecx,5 + xor ebp,DWORD[16+rsp] + lea r11d,[1859775393+r11*1+rdx] + xor eax,esi + add r11d,ecx + rol r13d,30 + add r11d,eax + rol ebp,1 + xor r14d,DWORD[52+rsp] + mov eax,r12d + mov DWORD[48+rsp],ebp + mov ecx,r11d + xor r14d,DWORD[60+rsp] + xor eax,esi + rol ecx,5 + xor r14d,DWORD[20+rsp] + lea edi,[1859775393+rdi*1+rbp] + xor eax,r13d + add edi,ecx + rol r12d,30 + add edi,eax + rol r14d,1 + xor edx,DWORD[56+rsp] + mov eax,r11d + mov DWORD[52+rsp],r14d + mov ecx,edi + xor edx,DWORD[rsp] + xor eax,r13d + rol ecx,5 + xor edx,DWORD[24+rsp] + lea esi,[1859775393+rsi*1+r14] + xor eax,r12d + add esi,ecx + rol r11d,30 + add esi,eax + rol edx,1 + xor ebp,DWORD[60+rsp] + mov eax,edi + mov DWORD[56+rsp],edx + mov ecx,esi + xor ebp,DWORD[4+rsp] + xor eax,r12d + rol ecx,5 + xor ebp,DWORD[28+rsp] + lea r13d,[1859775393+r13*1+rdx] + xor eax,r11d + add r13d,ecx + rol edi,30 + add r13d,eax + rol ebp,1 + xor r14d,DWORD[rsp] + mov eax,esi + mov DWORD[60+rsp],ebp + mov ecx,r13d + xor r14d,DWORD[8+rsp] + xor eax,r11d + rol ecx,5 + xor r14d,DWORD[32+rsp] + lea r12d,[1859775393+r12*1+rbp] + xor eax,edi + add r12d,ecx + rol esi,30 + add r12d,eax + rol r14d,1 + xor edx,DWORD[4+rsp] + mov eax,r13d + mov DWORD[rsp],r14d + mov ecx,r12d + xor edx,DWORD[12+rsp] + xor eax,edi + rol ecx,5 + xor edx,DWORD[36+rsp] + lea r11d,[1859775393+r11*1+r14] + xor eax,esi + add r11d,ecx + rol r13d,30 + add r11d,eax + rol edx,1 + xor ebp,DWORD[8+rsp] + mov eax,r12d + mov DWORD[4+rsp],edx + mov ecx,r11d + xor ebp,DWORD[16+rsp] + xor eax,esi + rol ecx,5 + xor ebp,DWORD[40+rsp] + lea edi,[1859775393+rdi*1+rdx] + xor eax,r13d + add edi,ecx + rol r12d,30 + add edi,eax + rol ebp,1 + xor r14d,DWORD[12+rsp] + mov eax,r11d + mov DWORD[8+rsp],ebp + mov ecx,edi + xor 
r14d,DWORD[20+rsp] + xor eax,r13d + rol ecx,5 + xor r14d,DWORD[44+rsp] + lea esi,[1859775393+rsi*1+rbp] + xor eax,r12d + add esi,ecx + rol r11d,30 + add esi,eax + rol r14d,1 + xor edx,DWORD[16+rsp] + mov eax,edi + mov DWORD[12+rsp],r14d + mov ecx,esi + xor edx,DWORD[24+rsp] + xor eax,r12d + rol ecx,5 + xor edx,DWORD[48+rsp] + lea r13d,[1859775393+r13*1+r14] + xor eax,r11d + add r13d,ecx + rol edi,30 + add r13d,eax + rol edx,1 + xor ebp,DWORD[20+rsp] + mov eax,esi + mov DWORD[16+rsp],edx + mov ecx,r13d + xor ebp,DWORD[28+rsp] + xor eax,r11d + rol ecx,5 + xor ebp,DWORD[52+rsp] + lea r12d,[1859775393+r12*1+rdx] + xor eax,edi + add r12d,ecx + rol esi,30 + add r12d,eax + rol ebp,1 + xor r14d,DWORD[24+rsp] + mov eax,r13d + mov DWORD[20+rsp],ebp + mov ecx,r12d + xor r14d,DWORD[32+rsp] + xor eax,edi + rol ecx,5 + xor r14d,DWORD[56+rsp] + lea r11d,[1859775393+r11*1+rbp] + xor eax,esi + add r11d,ecx + rol r13d,30 + add r11d,eax + rol r14d,1 + xor edx,DWORD[28+rsp] + mov eax,r12d + mov DWORD[24+rsp],r14d + mov ecx,r11d + xor edx,DWORD[36+rsp] + xor eax,esi + rol ecx,5 + xor edx,DWORD[60+rsp] + lea edi,[1859775393+rdi*1+r14] + xor eax,r13d + add edi,ecx + rol r12d,30 + add edi,eax + rol edx,1 + xor ebp,DWORD[32+rsp] + mov eax,r11d + mov DWORD[28+rsp],edx + mov ecx,edi + xor ebp,DWORD[40+rsp] + xor eax,r13d + rol ecx,5 + xor ebp,DWORD[rsp] + lea esi,[1859775393+rsi*1+rdx] + xor eax,r12d + add esi,ecx + rol r11d,30 + add esi,eax + rol ebp,1 + xor r14d,DWORD[36+rsp] + mov eax,r12d + mov DWORD[32+rsp],ebp + mov ebx,r12d + xor r14d,DWORD[44+rsp] + and eax,r11d + mov ecx,esi + xor r14d,DWORD[4+rsp] + lea r13d,[((-1894007588))+r13*1+rbp] + xor ebx,r11d + rol ecx,5 + add r13d,eax + rol r14d,1 + and ebx,edi + add r13d,ecx + rol edi,30 + add r13d,ebx + xor edx,DWORD[40+rsp] + mov eax,r11d + mov DWORD[36+rsp],r14d + mov ebx,r11d + xor edx,DWORD[48+rsp] + and eax,edi + mov ecx,r13d + xor edx,DWORD[8+rsp] + lea r12d,[((-1894007588))+r12*1+r14] + xor ebx,edi + rol ecx,5 + add r12d,eax + rol 
edx,1 + and ebx,esi + add r12d,ecx + rol esi,30 + add r12d,ebx + xor ebp,DWORD[44+rsp] + mov eax,edi + mov DWORD[40+rsp],edx + mov ebx,edi + xor ebp,DWORD[52+rsp] + and eax,esi + mov ecx,r12d + xor ebp,DWORD[12+rsp] + lea r11d,[((-1894007588))+r11*1+rdx] + xor ebx,esi + rol ecx,5 + add r11d,eax + rol ebp,1 + and ebx,r13d + add r11d,ecx + rol r13d,30 + add r11d,ebx + xor r14d,DWORD[48+rsp] + mov eax,esi + mov DWORD[44+rsp],ebp + mov ebx,esi + xor r14d,DWORD[56+rsp] + and eax,r13d + mov ecx,r11d + xor r14d,DWORD[16+rsp] + lea edi,[((-1894007588))+rdi*1+rbp] + xor ebx,r13d + rol ecx,5 + add edi,eax + rol r14d,1 + and ebx,r12d + add edi,ecx + rol r12d,30 + add edi,ebx + xor edx,DWORD[52+rsp] + mov eax,r13d + mov DWORD[48+rsp],r14d + mov ebx,r13d + xor edx,DWORD[60+rsp] + and eax,r12d + mov ecx,edi + xor edx,DWORD[20+rsp] + lea esi,[((-1894007588))+rsi*1+r14] + xor ebx,r12d + rol ecx,5 + add esi,eax + rol edx,1 + and ebx,r11d + add esi,ecx + rol r11d,30 + add esi,ebx + xor ebp,DWORD[56+rsp] + mov eax,r12d + mov DWORD[52+rsp],edx + mov ebx,r12d + xor ebp,DWORD[rsp] + and eax,r11d + mov ecx,esi + xor ebp,DWORD[24+rsp] + lea r13d,[((-1894007588))+r13*1+rdx] + xor ebx,r11d + rol ecx,5 + add r13d,eax + rol ebp,1 + and ebx,edi + add r13d,ecx + rol edi,30 + add r13d,ebx + xor r14d,DWORD[60+rsp] + mov eax,r11d + mov DWORD[56+rsp],ebp + mov ebx,r11d + xor r14d,DWORD[4+rsp] + and eax,edi + mov ecx,r13d + xor r14d,DWORD[28+rsp] + lea r12d,[((-1894007588))+r12*1+rbp] + xor ebx,edi + rol ecx,5 + add r12d,eax + rol r14d,1 + and ebx,esi + add r12d,ecx + rol esi,30 + add r12d,ebx + xor edx,DWORD[rsp] + mov eax,edi + mov DWORD[60+rsp],r14d + mov ebx,edi + xor edx,DWORD[8+rsp] + and eax,esi + mov ecx,r12d + xor edx,DWORD[32+rsp] + lea r11d,[((-1894007588))+r11*1+r14] + xor ebx,esi + rol ecx,5 + add r11d,eax + rol edx,1 + and ebx,r13d + add r11d,ecx + rol r13d,30 + add r11d,ebx + xor ebp,DWORD[4+rsp] + mov eax,esi + mov DWORD[rsp],edx + mov ebx,esi + xor ebp,DWORD[12+rsp] + and eax,r13d + 
mov ecx,r11d + xor ebp,DWORD[36+rsp] + lea edi,[((-1894007588))+rdi*1+rdx] + xor ebx,r13d + rol ecx,5 + add edi,eax + rol ebp,1 + and ebx,r12d + add edi,ecx + rol r12d,30 + add edi,ebx + xor r14d,DWORD[8+rsp] + mov eax,r13d + mov DWORD[4+rsp],ebp + mov ebx,r13d + xor r14d,DWORD[16+rsp] + and eax,r12d + mov ecx,edi + xor r14d,DWORD[40+rsp] + lea esi,[((-1894007588))+rsi*1+rbp] + xor ebx,r12d + rol ecx,5 + add esi,eax + rol r14d,1 + and ebx,r11d + add esi,ecx + rol r11d,30 + add esi,ebx + xor edx,DWORD[12+rsp] + mov eax,r12d + mov DWORD[8+rsp],r14d + mov ebx,r12d + xor edx,DWORD[20+rsp] + and eax,r11d + mov ecx,esi + xor edx,DWORD[44+rsp] + lea r13d,[((-1894007588))+r13*1+r14] + xor ebx,r11d + rol ecx,5 + add r13d,eax + rol edx,1 + and ebx,edi + add r13d,ecx + rol edi,30 + add r13d,ebx + xor ebp,DWORD[16+rsp] + mov eax,r11d + mov DWORD[12+rsp],edx + mov ebx,r11d + xor ebp,DWORD[24+rsp] + and eax,edi + mov ecx,r13d + xor ebp,DWORD[48+rsp] + lea r12d,[((-1894007588))+r12*1+rdx] + xor ebx,edi + rol ecx,5 + add r12d,eax + rol ebp,1 + and ebx,esi + add r12d,ecx + rol esi,30 + add r12d,ebx + xor r14d,DWORD[20+rsp] + mov eax,edi + mov DWORD[16+rsp],ebp + mov ebx,edi + xor r14d,DWORD[28+rsp] + and eax,esi + mov ecx,r12d + xor r14d,DWORD[52+rsp] + lea r11d,[((-1894007588))+r11*1+rbp] + xor ebx,esi + rol ecx,5 + add r11d,eax + rol r14d,1 + and ebx,r13d + add r11d,ecx + rol r13d,30 + add r11d,ebx + xor edx,DWORD[24+rsp] + mov eax,esi + mov DWORD[20+rsp],r14d + mov ebx,esi + xor edx,DWORD[32+rsp] + and eax,r13d + mov ecx,r11d + xor edx,DWORD[56+rsp] + lea edi,[((-1894007588))+rdi*1+r14] + xor ebx,r13d + rol ecx,5 + add edi,eax + rol edx,1 + and ebx,r12d + add edi,ecx + rol r12d,30 + add edi,ebx + xor ebp,DWORD[28+rsp] + mov eax,r13d + mov DWORD[24+rsp],edx + mov ebx,r13d + xor ebp,DWORD[36+rsp] + and eax,r12d + mov ecx,edi + xor ebp,DWORD[60+rsp] + lea esi,[((-1894007588))+rsi*1+rdx] + xor ebx,r12d + rol ecx,5 + add esi,eax + rol ebp,1 + and ebx,r11d + add esi,ecx + rol r11d,30 
+ add esi,ebx + xor r14d,DWORD[32+rsp] + mov eax,r12d + mov DWORD[28+rsp],ebp + mov ebx,r12d + xor r14d,DWORD[40+rsp] + and eax,r11d + mov ecx,esi + xor r14d,DWORD[rsp] + lea r13d,[((-1894007588))+r13*1+rbp] + xor ebx,r11d + rol ecx,5 + add r13d,eax + rol r14d,1 + and ebx,edi + add r13d,ecx + rol edi,30 + add r13d,ebx + xor edx,DWORD[36+rsp] + mov eax,r11d + mov DWORD[32+rsp],r14d + mov ebx,r11d + xor edx,DWORD[44+rsp] + and eax,edi + mov ecx,r13d + xor edx,DWORD[4+rsp] + lea r12d,[((-1894007588))+r12*1+r14] + xor ebx,edi + rol ecx,5 + add r12d,eax + rol edx,1 + and ebx,esi + add r12d,ecx + rol esi,30 + add r12d,ebx + xor ebp,DWORD[40+rsp] + mov eax,edi + mov DWORD[36+rsp],edx + mov ebx,edi + xor ebp,DWORD[48+rsp] + and eax,esi + mov ecx,r12d + xor ebp,DWORD[8+rsp] + lea r11d,[((-1894007588))+r11*1+rdx] + xor ebx,esi + rol ecx,5 + add r11d,eax + rol ebp,1 + and ebx,r13d + add r11d,ecx + rol r13d,30 + add r11d,ebx + xor r14d,DWORD[44+rsp] + mov eax,esi + mov DWORD[40+rsp],ebp + mov ebx,esi + xor r14d,DWORD[52+rsp] + and eax,r13d + mov ecx,r11d + xor r14d,DWORD[12+rsp] + lea edi,[((-1894007588))+rdi*1+rbp] + xor ebx,r13d + rol ecx,5 + add edi,eax + rol r14d,1 + and ebx,r12d + add edi,ecx + rol r12d,30 + add edi,ebx + xor edx,DWORD[48+rsp] + mov eax,r13d + mov DWORD[44+rsp],r14d + mov ebx,r13d + xor edx,DWORD[56+rsp] + and eax,r12d + mov ecx,edi + xor edx,DWORD[16+rsp] + lea esi,[((-1894007588))+rsi*1+r14] + xor ebx,r12d + rol ecx,5 + add esi,eax + rol edx,1 + and ebx,r11d + add esi,ecx + rol r11d,30 + add esi,ebx + xor ebp,DWORD[52+rsp] + mov eax,edi + mov DWORD[48+rsp],edx + mov ecx,esi + xor ebp,DWORD[60+rsp] + xor eax,r12d + rol ecx,5 + xor ebp,DWORD[20+rsp] + lea r13d,[((-899497514))+r13*1+rdx] + xor eax,r11d + add r13d,ecx + rol edi,30 + add r13d,eax + rol ebp,1 + xor r14d,DWORD[56+rsp] + mov eax,esi + mov DWORD[52+rsp],ebp + mov ecx,r13d + xor r14d,DWORD[rsp] + xor eax,r11d + rol ecx,5 + xor r14d,DWORD[24+rsp] + lea r12d,[((-899497514))+r12*1+rbp] + xor eax,edi 
+ add r12d,ecx + rol esi,30 + add r12d,eax + rol r14d,1 + xor edx,DWORD[60+rsp] + mov eax,r13d + mov DWORD[56+rsp],r14d + mov ecx,r12d + xor edx,DWORD[4+rsp] + xor eax,edi + rol ecx,5 + xor edx,DWORD[28+rsp] + lea r11d,[((-899497514))+r11*1+r14] + xor eax,esi + add r11d,ecx + rol r13d,30 + add r11d,eax + rol edx,1 + xor ebp,DWORD[rsp] + mov eax,r12d + mov DWORD[60+rsp],edx + mov ecx,r11d + xor ebp,DWORD[8+rsp] + xor eax,esi + rol ecx,5 + xor ebp,DWORD[32+rsp] + lea edi,[((-899497514))+rdi*1+rdx] + xor eax,r13d + add edi,ecx + rol r12d,30 + add edi,eax + rol ebp,1 + xor r14d,DWORD[4+rsp] + mov eax,r11d + mov DWORD[rsp],ebp + mov ecx,edi + xor r14d,DWORD[12+rsp] + xor eax,r13d + rol ecx,5 + xor r14d,DWORD[36+rsp] + lea esi,[((-899497514))+rsi*1+rbp] + xor eax,r12d + add esi,ecx + rol r11d,30 + add esi,eax + rol r14d,1 + xor edx,DWORD[8+rsp] + mov eax,edi + mov DWORD[4+rsp],r14d + mov ecx,esi + xor edx,DWORD[16+rsp] + xor eax,r12d + rol ecx,5 + xor edx,DWORD[40+rsp] + lea r13d,[((-899497514))+r13*1+r14] + xor eax,r11d + add r13d,ecx + rol edi,30 + add r13d,eax + rol edx,1 + xor ebp,DWORD[12+rsp] + mov eax,esi + mov DWORD[8+rsp],edx + mov ecx,r13d + xor ebp,DWORD[20+rsp] + xor eax,r11d + rol ecx,5 + xor ebp,DWORD[44+rsp] + lea r12d,[((-899497514))+r12*1+rdx] + xor eax,edi + add r12d,ecx + rol esi,30 + add r12d,eax + rol ebp,1 + xor r14d,DWORD[16+rsp] + mov eax,r13d + mov DWORD[12+rsp],ebp + mov ecx,r12d + xor r14d,DWORD[24+rsp] + xor eax,edi + rol ecx,5 + xor r14d,DWORD[48+rsp] + lea r11d,[((-899497514))+r11*1+rbp] + xor eax,esi + add r11d,ecx + rol r13d,30 + add r11d,eax + rol r14d,1 + xor edx,DWORD[20+rsp] + mov eax,r12d + mov DWORD[16+rsp],r14d + mov ecx,r11d + xor edx,DWORD[28+rsp] + xor eax,esi + rol ecx,5 + xor edx,DWORD[52+rsp] + lea edi,[((-899497514))+rdi*1+r14] + xor eax,r13d + add edi,ecx + rol r12d,30 + add edi,eax + rol edx,1 + xor ebp,DWORD[24+rsp] + mov eax,r11d + mov DWORD[20+rsp],edx + mov ecx,edi + xor ebp,DWORD[32+rsp] + xor eax,r13d + rol ecx,5 + 
xor ebp,DWORD[56+rsp] + lea esi,[((-899497514))+rsi*1+rdx] + xor eax,r12d + add esi,ecx + rol r11d,30 + add esi,eax + rol ebp,1 + xor r14d,DWORD[28+rsp] + mov eax,edi + mov DWORD[24+rsp],ebp + mov ecx,esi + xor r14d,DWORD[36+rsp] + xor eax,r12d + rol ecx,5 + xor r14d,DWORD[60+rsp] + lea r13d,[((-899497514))+r13*1+rbp] + xor eax,r11d + add r13d,ecx + rol edi,30 + add r13d,eax + rol r14d,1 + xor edx,DWORD[32+rsp] + mov eax,esi + mov DWORD[28+rsp],r14d + mov ecx,r13d + xor edx,DWORD[40+rsp] + xor eax,r11d + rol ecx,5 + xor edx,DWORD[rsp] + lea r12d,[((-899497514))+r12*1+r14] + xor eax,edi + add r12d,ecx + rol esi,30 + add r12d,eax + rol edx,1 + xor ebp,DWORD[36+rsp] + mov eax,r13d + + mov ecx,r12d + xor ebp,DWORD[44+rsp] + xor eax,edi + rol ecx,5 + xor ebp,DWORD[4+rsp] + lea r11d,[((-899497514))+r11*1+rdx] + xor eax,esi + add r11d,ecx + rol r13d,30 + add r11d,eax + rol ebp,1 + xor r14d,DWORD[40+rsp] + mov eax,r12d + + mov ecx,r11d + xor r14d,DWORD[48+rsp] + xor eax,esi + rol ecx,5 + xor r14d,DWORD[8+rsp] + lea edi,[((-899497514))+rdi*1+rbp] + xor eax,r13d + add edi,ecx + rol r12d,30 + add edi,eax + rol r14d,1 + xor edx,DWORD[44+rsp] + mov eax,r11d + + mov ecx,edi + xor edx,DWORD[52+rsp] + xor eax,r13d + rol ecx,5 + xor edx,DWORD[12+rsp] + lea esi,[((-899497514))+rsi*1+r14] + xor eax,r12d + add esi,ecx + rol r11d,30 + add esi,eax + rol edx,1 + xor ebp,DWORD[48+rsp] + mov eax,edi + + mov ecx,esi + xor ebp,DWORD[56+rsp] + xor eax,r12d + rol ecx,5 + xor ebp,DWORD[16+rsp] + lea r13d,[((-899497514))+r13*1+rdx] + xor eax,r11d + add r13d,ecx + rol edi,30 + add r13d,eax + rol ebp,1 + xor r14d,DWORD[52+rsp] + mov eax,esi + + mov ecx,r13d + xor r14d,DWORD[60+rsp] + xor eax,r11d + rol ecx,5 + xor r14d,DWORD[20+rsp] + lea r12d,[((-899497514))+r12*1+rbp] + xor eax,edi + add r12d,ecx + rol esi,30 + add r12d,eax + rol r14d,1 + xor edx,DWORD[56+rsp] + mov eax,r13d + + mov ecx,r12d + xor edx,DWORD[rsp] + xor eax,edi + rol ecx,5 + xor edx,DWORD[24+rsp] + lea 
r11d,[((-899497514))+r11*1+r14] + xor eax,esi + add r11d,ecx + rol r13d,30 + add r11d,eax + rol edx,1 + xor ebp,DWORD[60+rsp] + mov eax,r12d + + mov ecx,r11d + xor ebp,DWORD[4+rsp] + xor eax,esi + rol ecx,5 + xor ebp,DWORD[28+rsp] + lea edi,[((-899497514))+rdi*1+rdx] + xor eax,r13d + add edi,ecx + rol r12d,30 + add edi,eax + rol ebp,1 + mov eax,r11d + mov ecx,edi + xor eax,r13d + lea esi,[((-899497514))+rsi*1+rbp] + rol ecx,5 + xor eax,r12d + add esi,ecx + rol r11d,30 + add esi,eax + add esi,DWORD[r8] + add edi,DWORD[4+r8] + add r11d,DWORD[8+r8] + add r12d,DWORD[12+r8] + add r13d,DWORD[16+r8] + mov DWORD[r8],esi + mov DWORD[4+r8],edi + mov DWORD[8+r8],r11d + mov DWORD[12+r8],r12d + mov DWORD[16+r8],r13d + + sub r10,1 + lea r9,[64+r9] + jnz NEAR $L$loop + + mov rsi,QWORD[64+rsp] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_sha1_block_data_order_nohw: +global sha1_block_data_order_hw + +ALIGN 32 +sha1_block_data_order_hw: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sha1_block_data_order_hw: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + lea rsp,[((-72))+rsp] + movaps XMMWORD[(-8-64)+rax],xmm6 + movaps XMMWORD[(-8-48)+rax],xmm7 + movaps XMMWORD[(-8-32)+rax],xmm8 + movaps XMMWORD[(-8-16)+rax],xmm9 +$L$prologue_shaext: + movdqu xmm0,XMMWORD[rdi] + movd xmm1,DWORD[16+rdi] + movdqa xmm3,XMMWORD[((K_XX_XX+160))] + + movdqu xmm4,XMMWORD[rsi] + pshufd xmm0,xmm0,27 + movdqu xmm5,XMMWORD[16+rsi] + pshufd xmm1,xmm1,27 + movdqu xmm6,XMMWORD[32+rsi] + pshufb xmm4,xmm3 + movdqu xmm7,XMMWORD[48+rsi] + pshufb xmm5,xmm3 + pshufb xmm6,xmm3 + movdqa xmm9,xmm1 + pshufb xmm7,xmm3 + jmp NEAR $L$oop_shaext + +ALIGN 16 +$L$oop_shaext: + dec rdx + lea r8,[64+rsi] + paddd xmm1,xmm4 + cmovne rsi,r8 + prefetcht0 
[512+rsi] + movdqa xmm8,xmm0 + sha1msg1 xmm4,xmm5 + movdqa xmm2,xmm0 + sha1rnds4 xmm0,xmm1,0 + sha1nexte xmm2,xmm5 + pxor xmm4,xmm6 + sha1msg1 xmm5,xmm6 + sha1msg2 xmm4,xmm7 + + movdqa xmm1,xmm0 + sha1rnds4 xmm0,xmm2,0 + sha1nexte xmm1,xmm6 + pxor xmm5,xmm7 + sha1msg2 xmm5,xmm4 + sha1msg1 xmm6,xmm7 + movdqa xmm2,xmm0 + sha1rnds4 xmm0,xmm1,0 + sha1nexte xmm2,xmm7 + pxor xmm6,xmm4 + sha1msg1 xmm7,xmm4 + sha1msg2 xmm6,xmm5 + + movdqa xmm1,xmm0 + sha1rnds4 xmm0,xmm2,0 + sha1nexte xmm1,xmm4 + pxor xmm7,xmm5 + sha1msg2 xmm7,xmm6 + sha1msg1 xmm4,xmm5 + movdqa xmm2,xmm0 + sha1rnds4 xmm0,xmm1,0 + sha1nexte xmm2,xmm5 + pxor xmm4,xmm6 + sha1msg1 xmm5,xmm6 + sha1msg2 xmm4,xmm7 + + movdqa xmm1,xmm0 + sha1rnds4 xmm0,xmm2,1 + sha1nexte xmm1,xmm6 + pxor xmm5,xmm7 + sha1msg2 xmm5,xmm4 + sha1msg1 xmm6,xmm7 + movdqa xmm2,xmm0 + sha1rnds4 xmm0,xmm1,1 + sha1nexte xmm2,xmm7 + pxor xmm6,xmm4 + sha1msg1 xmm7,xmm4 + sha1msg2 xmm6,xmm5 + + movdqa xmm1,xmm0 + sha1rnds4 xmm0,xmm2,1 + sha1nexte xmm1,xmm4 + pxor xmm7,xmm5 + sha1msg2 xmm7,xmm6 + sha1msg1 xmm4,xmm5 + movdqa xmm2,xmm0 + sha1rnds4 xmm0,xmm1,1 + sha1nexte xmm2,xmm5 + pxor xmm4,xmm6 + sha1msg1 xmm5,xmm6 + sha1msg2 xmm4,xmm7 + + movdqa xmm1,xmm0 + sha1rnds4 xmm0,xmm2,1 + sha1nexte xmm1,xmm6 + pxor xmm5,xmm7 + sha1msg2 xmm5,xmm4 + sha1msg1 xmm6,xmm7 + movdqa xmm2,xmm0 + sha1rnds4 xmm0,xmm1,2 + sha1nexte xmm2,xmm7 + pxor xmm6,xmm4 + sha1msg1 xmm7,xmm4 + sha1msg2 xmm6,xmm5 + + movdqa xmm1,xmm0 + sha1rnds4 xmm0,xmm2,2 + sha1nexte xmm1,xmm4 + pxor xmm7,xmm5 + sha1msg2 xmm7,xmm6 + sha1msg1 xmm4,xmm5 + movdqa xmm2,xmm0 + sha1rnds4 xmm0,xmm1,2 + sha1nexte xmm2,xmm5 + pxor xmm4,xmm6 + sha1msg1 xmm5,xmm6 + sha1msg2 xmm4,xmm7 + + movdqa xmm1,xmm0 + sha1rnds4 xmm0,xmm2,2 + sha1nexte xmm1,xmm6 + pxor xmm5,xmm7 + sha1msg2 xmm5,xmm4 + sha1msg1 xmm6,xmm7 + movdqa xmm2,xmm0 + sha1rnds4 xmm0,xmm1,2 + sha1nexte xmm2,xmm7 + pxor xmm6,xmm4 + sha1msg1 xmm7,xmm4 + sha1msg2 xmm6,xmm5 + + movdqa xmm1,xmm0 + sha1rnds4 xmm0,xmm2,3 + sha1nexte xmm1,xmm4 + pxor 
xmm7,xmm5 + sha1msg2 xmm7,xmm6 + movdqu xmm4,XMMWORD[rsi] + movdqa xmm2,xmm0 + sha1rnds4 xmm0,xmm1,3 + sha1nexte xmm2,xmm5 + movdqu xmm5,XMMWORD[16+rsi] + pshufb xmm4,xmm3 + + movdqa xmm1,xmm0 + sha1rnds4 xmm0,xmm2,3 + sha1nexte xmm1,xmm6 + movdqu xmm6,XMMWORD[32+rsi] + pshufb xmm5,xmm3 + + movdqa xmm2,xmm0 + sha1rnds4 xmm0,xmm1,3 + sha1nexte xmm2,xmm7 + movdqu xmm7,XMMWORD[48+rsi] + pshufb xmm6,xmm3 + + movdqa xmm1,xmm0 + sha1rnds4 xmm0,xmm2,3 + sha1nexte xmm1,xmm9 + pshufb xmm7,xmm3 + + paddd xmm0,xmm8 + movdqa xmm9,xmm1 + + jnz NEAR $L$oop_shaext + + pshufd xmm0,xmm0,27 + pshufd xmm1,xmm1,27 + movdqu XMMWORD[rdi],xmm0 + movd DWORD[16+rdi],xmm1 + movaps xmm6,XMMWORD[((-8-64))+rax] + movaps xmm7,XMMWORD[((-8-48))+rax] + movaps xmm8,XMMWORD[((-8-32))+rax] + movaps xmm9,XMMWORD[((-8-16))+rax] + mov rsp,rax +$L$epilogue_shaext: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_sha1_block_data_order_hw: +global sha1_block_data_order_ssse3 + +ALIGN 16 +sha1_block_data_order_ssse3: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sha1_block_data_order_ssse3: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + mov r11,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + lea rsp,[((-160))+rsp] + movaps XMMWORD[(-40-96)+r11],xmm6 + movaps XMMWORD[(-40-80)+r11],xmm7 + movaps XMMWORD[(-40-64)+r11],xmm8 + movaps XMMWORD[(-40-48)+r11],xmm9 + movaps XMMWORD[(-40-32)+r11],xmm10 + movaps XMMWORD[(-40-16)+r11],xmm11 +$L$prologue_ssse3: + and rsp,-64 + mov r8,rdi + mov r9,rsi + mov r10,rdx + + shl r10,6 + add r10,r9 + lea r14,[((K_XX_XX+64))] + + mov eax,DWORD[r8] + mov ebx,DWORD[4+r8] + mov ecx,DWORD[8+r8] + mov edx,DWORD[12+r8] + mov esi,ebx + mov ebp,DWORD[16+r8] + mov edi,ecx + xor edi,edx + and esi,edi + + movdqa xmm6,XMMWORD[64+r14] + movdqa xmm9,XMMWORD[((-64))+r14] + movdqu xmm0,XMMWORD[r9] + movdqu xmm1,XMMWORD[16+r9] + movdqu xmm2,XMMWORD[32+r9] + movdqu 
xmm3,XMMWORD[48+r9] + pshufb xmm0,xmm6 + pshufb xmm1,xmm6 + pshufb xmm2,xmm6 + add r9,64 + paddd xmm0,xmm9 + pshufb xmm3,xmm6 + paddd xmm1,xmm9 + paddd xmm2,xmm9 + movdqa XMMWORD[rsp],xmm0 + psubd xmm0,xmm9 + movdqa XMMWORD[16+rsp],xmm1 + psubd xmm1,xmm9 + movdqa XMMWORD[32+rsp],xmm2 + psubd xmm2,xmm9 + jmp NEAR $L$oop_ssse3 +ALIGN 16 +$L$oop_ssse3: + ror ebx,2 + pshufd xmm4,xmm0,238 + xor esi,edx + movdqa xmm8,xmm3 + paddd xmm9,xmm3 + mov edi,eax + add ebp,DWORD[rsp] + punpcklqdq xmm4,xmm1 + xor ebx,ecx + rol eax,5 + add ebp,esi + psrldq xmm8,4 + and edi,ebx + xor ebx,ecx + pxor xmm4,xmm0 + add ebp,eax + ror eax,7 + pxor xmm8,xmm2 + xor edi,ecx + mov esi,ebp + add edx,DWORD[4+rsp] + pxor xmm4,xmm8 + xor eax,ebx + rol ebp,5 + movdqa XMMWORD[48+rsp],xmm9 + add edx,edi + and esi,eax + movdqa xmm10,xmm4 + xor eax,ebx + add edx,ebp + ror ebp,7 + movdqa xmm8,xmm4 + xor esi,ebx + pslldq xmm10,12 + paddd xmm4,xmm4 + mov edi,edx + add ecx,DWORD[8+rsp] + psrld xmm8,31 + xor ebp,eax + rol edx,5 + add ecx,esi + movdqa xmm9,xmm10 + and edi,ebp + xor ebp,eax + psrld xmm10,30 + add ecx,edx + ror edx,7 + por xmm4,xmm8 + xor edi,eax + mov esi,ecx + add ebx,DWORD[12+rsp] + pslld xmm9,2 + pxor xmm4,xmm10 + xor edx,ebp + movdqa xmm10,XMMWORD[((-64))+r14] + rol ecx,5 + add ebx,edi + and esi,edx + pxor xmm4,xmm9 + xor edx,ebp + add ebx,ecx + ror ecx,7 + pshufd xmm5,xmm1,238 + xor esi,ebp + movdqa xmm9,xmm4 + paddd xmm10,xmm4 + mov edi,ebx + add eax,DWORD[16+rsp] + punpcklqdq xmm5,xmm2 + xor ecx,edx + rol ebx,5 + add eax,esi + psrldq xmm9,4 + and edi,ecx + xor ecx,edx + pxor xmm5,xmm1 + add eax,ebx + ror ebx,7 + pxor xmm9,xmm3 + xor edi,edx + mov esi,eax + add ebp,DWORD[20+rsp] + pxor xmm5,xmm9 + xor ebx,ecx + rol eax,5 + movdqa XMMWORD[rsp],xmm10 + add ebp,edi + and esi,ebx + movdqa xmm8,xmm5 + xor ebx,ecx + add ebp,eax + ror eax,7 + movdqa xmm9,xmm5 + xor esi,ecx + pslldq xmm8,12 + paddd xmm5,xmm5 + mov edi,ebp + add edx,DWORD[24+rsp] + psrld xmm9,31 + xor eax,ebx + rol ebp,5 + add 
edx,esi + movdqa xmm10,xmm8 + and edi,eax + xor eax,ebx + psrld xmm8,30 + add edx,ebp + ror ebp,7 + por xmm5,xmm9 + xor edi,ebx + mov esi,edx + add ecx,DWORD[28+rsp] + pslld xmm10,2 + pxor xmm5,xmm8 + xor ebp,eax + movdqa xmm8,XMMWORD[((-32))+r14] + rol edx,5 + add ecx,edi + and esi,ebp + pxor xmm5,xmm10 + xor ebp,eax + add ecx,edx + ror edx,7 + pshufd xmm6,xmm2,238 + xor esi,eax + movdqa xmm10,xmm5 + paddd xmm8,xmm5 + mov edi,ecx + add ebx,DWORD[32+rsp] + punpcklqdq xmm6,xmm3 + xor edx,ebp + rol ecx,5 + add ebx,esi + psrldq xmm10,4 + and edi,edx + xor edx,ebp + pxor xmm6,xmm2 + add ebx,ecx + ror ecx,7 + pxor xmm10,xmm4 + xor edi,ebp + mov esi,ebx + add eax,DWORD[36+rsp] + pxor xmm6,xmm10 + xor ecx,edx + rol ebx,5 + movdqa XMMWORD[16+rsp],xmm8 + add eax,edi + and esi,ecx + movdqa xmm9,xmm6 + xor ecx,edx + add eax,ebx + ror ebx,7 + movdqa xmm10,xmm6 + xor esi,edx + pslldq xmm9,12 + paddd xmm6,xmm6 + mov edi,eax + add ebp,DWORD[40+rsp] + psrld xmm10,31 + xor ebx,ecx + rol eax,5 + add ebp,esi + movdqa xmm8,xmm9 + and edi,ebx + xor ebx,ecx + psrld xmm9,30 + add ebp,eax + ror eax,7 + por xmm6,xmm10 + xor edi,ecx + mov esi,ebp + add edx,DWORD[44+rsp] + pslld xmm8,2 + pxor xmm6,xmm9 + xor eax,ebx + movdqa xmm9,XMMWORD[((-32))+r14] + rol ebp,5 + add edx,edi + and esi,eax + pxor xmm6,xmm8 + xor eax,ebx + add edx,ebp + ror ebp,7 + pshufd xmm7,xmm3,238 + xor esi,ebx + movdqa xmm8,xmm6 + paddd xmm9,xmm6 + mov edi,edx + add ecx,DWORD[48+rsp] + punpcklqdq xmm7,xmm4 + xor ebp,eax + rol edx,5 + add ecx,esi + psrldq xmm8,4 + and edi,ebp + xor ebp,eax + pxor xmm7,xmm3 + add ecx,edx + ror edx,7 + pxor xmm8,xmm5 + xor edi,eax + mov esi,ecx + add ebx,DWORD[52+rsp] + pxor xmm7,xmm8 + xor edx,ebp + rol ecx,5 + movdqa XMMWORD[32+rsp],xmm9 + add ebx,edi + and esi,edx + movdqa xmm10,xmm7 + xor edx,ebp + add ebx,ecx + ror ecx,7 + movdqa xmm8,xmm7 + xor esi,ebp + pslldq xmm10,12 + paddd xmm7,xmm7 + mov edi,ebx + add eax,DWORD[56+rsp] + psrld xmm8,31 + xor ecx,edx + rol ebx,5 + add eax,esi + 
movdqa xmm9,xmm10 + and edi,ecx + xor ecx,edx + psrld xmm10,30 + add eax,ebx + ror ebx,7 + por xmm7,xmm8 + xor edi,edx + mov esi,eax + add ebp,DWORD[60+rsp] + pslld xmm9,2 + pxor xmm7,xmm10 + xor ebx,ecx + movdqa xmm10,XMMWORD[((-32))+r14] + rol eax,5 + add ebp,edi + and esi,ebx + pxor xmm7,xmm9 + pshufd xmm9,xmm6,238 + xor ebx,ecx + add ebp,eax + ror eax,7 + pxor xmm0,xmm4 + xor esi,ecx + mov edi,ebp + add edx,DWORD[rsp] + punpcklqdq xmm9,xmm7 + xor eax,ebx + rol ebp,5 + pxor xmm0,xmm1 + add edx,esi + and edi,eax + movdqa xmm8,xmm10 + xor eax,ebx + paddd xmm10,xmm7 + add edx,ebp + pxor xmm0,xmm9 + ror ebp,7 + xor edi,ebx + mov esi,edx + add ecx,DWORD[4+rsp] + movdqa xmm9,xmm0 + xor ebp,eax + rol edx,5 + movdqa XMMWORD[48+rsp],xmm10 + add ecx,edi + and esi,ebp + xor ebp,eax + pslld xmm0,2 + add ecx,edx + ror edx,7 + psrld xmm9,30 + xor esi,eax + mov edi,ecx + add ebx,DWORD[8+rsp] + por xmm0,xmm9 + xor edx,ebp + rol ecx,5 + pshufd xmm10,xmm7,238 + add ebx,esi + and edi,edx + xor edx,ebp + add ebx,ecx + add eax,DWORD[12+rsp] + xor edi,ebp + mov esi,ebx + rol ebx,5 + add eax,edi + xor esi,edx + ror ecx,7 + add eax,ebx + pxor xmm1,xmm5 + add ebp,DWORD[16+rsp] + xor esi,ecx + punpcklqdq xmm10,xmm0 + mov edi,eax + rol eax,5 + pxor xmm1,xmm2 + add ebp,esi + xor edi,ecx + movdqa xmm9,xmm8 + ror ebx,7 + paddd xmm8,xmm0 + add ebp,eax + pxor xmm1,xmm10 + add edx,DWORD[20+rsp] + xor edi,ebx + mov esi,ebp + rol ebp,5 + movdqa xmm10,xmm1 + add edx,edi + xor esi,ebx + movdqa XMMWORD[rsp],xmm8 + ror eax,7 + add edx,ebp + add ecx,DWORD[24+rsp] + pslld xmm1,2 + xor esi,eax + mov edi,edx + psrld xmm10,30 + rol edx,5 + add ecx,esi + xor edi,eax + ror ebp,7 + por xmm1,xmm10 + add ecx,edx + add ebx,DWORD[28+rsp] + pshufd xmm8,xmm0,238 + xor edi,ebp + mov esi,ecx + rol ecx,5 + add ebx,edi + xor esi,ebp + ror edx,7 + add ebx,ecx + pxor xmm2,xmm6 + add eax,DWORD[32+rsp] + xor esi,edx + punpcklqdq xmm8,xmm1 + mov edi,ebx + rol ebx,5 + pxor xmm2,xmm3 + add eax,esi + xor edi,edx + movdqa 
xmm10,XMMWORD[r14] + ror ecx,7 + paddd xmm9,xmm1 + add eax,ebx + pxor xmm2,xmm8 + add ebp,DWORD[36+rsp] + xor edi,ecx + mov esi,eax + rol eax,5 + movdqa xmm8,xmm2 + add ebp,edi + xor esi,ecx + movdqa XMMWORD[16+rsp],xmm9 + ror ebx,7 + add ebp,eax + add edx,DWORD[40+rsp] + pslld xmm2,2 + xor esi,ebx + mov edi,ebp + psrld xmm8,30 + rol ebp,5 + add edx,esi + xor edi,ebx + ror eax,7 + por xmm2,xmm8 + add edx,ebp + add ecx,DWORD[44+rsp] + pshufd xmm9,xmm1,238 + xor edi,eax + mov esi,edx + rol edx,5 + add ecx,edi + xor esi,eax + ror ebp,7 + add ecx,edx + pxor xmm3,xmm7 + add ebx,DWORD[48+rsp] + xor esi,ebp + punpcklqdq xmm9,xmm2 + mov edi,ecx + rol ecx,5 + pxor xmm3,xmm4 + add ebx,esi + xor edi,ebp + movdqa xmm8,xmm10 + ror edx,7 + paddd xmm10,xmm2 + add ebx,ecx + pxor xmm3,xmm9 + add eax,DWORD[52+rsp] + xor edi,edx + mov esi,ebx + rol ebx,5 + movdqa xmm9,xmm3 + add eax,edi + xor esi,edx + movdqa XMMWORD[32+rsp],xmm10 + ror ecx,7 + add eax,ebx + add ebp,DWORD[56+rsp] + pslld xmm3,2 + xor esi,ecx + mov edi,eax + psrld xmm9,30 + rol eax,5 + add ebp,esi + xor edi,ecx + ror ebx,7 + por xmm3,xmm9 + add ebp,eax + add edx,DWORD[60+rsp] + pshufd xmm10,xmm2,238 + xor edi,ebx + mov esi,ebp + rol ebp,5 + add edx,edi + xor esi,ebx + ror eax,7 + add edx,ebp + pxor xmm4,xmm0 + add ecx,DWORD[rsp] + xor esi,eax + punpcklqdq xmm10,xmm3 + mov edi,edx + rol edx,5 + pxor xmm4,xmm5 + add ecx,esi + xor edi,eax + movdqa xmm9,xmm8 + ror ebp,7 + paddd xmm8,xmm3 + add ecx,edx + pxor xmm4,xmm10 + add ebx,DWORD[4+rsp] + xor edi,ebp + mov esi,ecx + rol ecx,5 + movdqa xmm10,xmm4 + add ebx,edi + xor esi,ebp + movdqa XMMWORD[48+rsp],xmm8 + ror edx,7 + add ebx,ecx + add eax,DWORD[8+rsp] + pslld xmm4,2 + xor esi,edx + mov edi,ebx + psrld xmm10,30 + rol ebx,5 + add eax,esi + xor edi,edx + ror ecx,7 + por xmm4,xmm10 + add eax,ebx + add ebp,DWORD[12+rsp] + pshufd xmm8,xmm3,238 + xor edi,ecx + mov esi,eax + rol eax,5 + add ebp,edi + xor esi,ecx + ror ebx,7 + add ebp,eax + pxor xmm5,xmm1 + add 
edx,DWORD[16+rsp] + xor esi,ebx + punpcklqdq xmm8,xmm4 + mov edi,ebp + rol ebp,5 + pxor xmm5,xmm6 + add edx,esi + xor edi,ebx + movdqa xmm10,xmm9 + ror eax,7 + paddd xmm9,xmm4 + add edx,ebp + pxor xmm5,xmm8 + add ecx,DWORD[20+rsp] + xor edi,eax + mov esi,edx + rol edx,5 + movdqa xmm8,xmm5 + add ecx,edi + xor esi,eax + movdqa XMMWORD[rsp],xmm9 + ror ebp,7 + add ecx,edx + add ebx,DWORD[24+rsp] + pslld xmm5,2 + xor esi,ebp + mov edi,ecx + psrld xmm8,30 + rol ecx,5 + add ebx,esi + xor edi,ebp + ror edx,7 + por xmm5,xmm8 + add ebx,ecx + add eax,DWORD[28+rsp] + pshufd xmm9,xmm4,238 + ror ecx,7 + mov esi,ebx + xor edi,edx + rol ebx,5 + add eax,edi + xor esi,ecx + xor ecx,edx + add eax,ebx + pxor xmm6,xmm2 + add ebp,DWORD[32+rsp] + and esi,ecx + xor ecx,edx + ror ebx,7 + punpcklqdq xmm9,xmm5 + mov edi,eax + xor esi,ecx + pxor xmm6,xmm7 + rol eax,5 + add ebp,esi + movdqa xmm8,xmm10 + xor edi,ebx + paddd xmm10,xmm5 + xor ebx,ecx + pxor xmm6,xmm9 + add ebp,eax + add edx,DWORD[36+rsp] + and edi,ebx + xor ebx,ecx + ror eax,7 + movdqa xmm9,xmm6 + mov esi,ebp + xor edi,ebx + movdqa XMMWORD[16+rsp],xmm10 + rol ebp,5 + add edx,edi + xor esi,eax + pslld xmm6,2 + xor eax,ebx + add edx,ebp + psrld xmm9,30 + add ecx,DWORD[40+rsp] + and esi,eax + xor eax,ebx + por xmm6,xmm9 + ror ebp,7 + mov edi,edx + xor esi,eax + rol edx,5 + pshufd xmm10,xmm5,238 + add ecx,esi + xor edi,ebp + xor ebp,eax + add ecx,edx + add ebx,DWORD[44+rsp] + and edi,ebp + xor ebp,eax + ror edx,7 + mov esi,ecx + xor edi,ebp + rol ecx,5 + add ebx,edi + xor esi,edx + xor edx,ebp + add ebx,ecx + pxor xmm7,xmm3 + add eax,DWORD[48+rsp] + and esi,edx + xor edx,ebp + ror ecx,7 + punpcklqdq xmm10,xmm6 + mov edi,ebx + xor esi,edx + pxor xmm7,xmm0 + rol ebx,5 + add eax,esi + movdqa xmm9,XMMWORD[32+r14] + xor edi,ecx + paddd xmm8,xmm6 + xor ecx,edx + pxor xmm7,xmm10 + add eax,ebx + add ebp,DWORD[52+rsp] + and edi,ecx + xor ecx,edx + ror ebx,7 + movdqa xmm10,xmm7 + mov esi,eax + xor edi,ecx + movdqa XMMWORD[32+rsp],xmm8 + rol 
eax,5 + add ebp,edi + xor esi,ebx + pslld xmm7,2 + xor ebx,ecx + add ebp,eax + psrld xmm10,30 + add edx,DWORD[56+rsp] + and esi,ebx + xor ebx,ecx + por xmm7,xmm10 + ror eax,7 + mov edi,ebp + xor esi,ebx + rol ebp,5 + pshufd xmm8,xmm6,238 + add edx,esi + xor edi,eax + xor eax,ebx + add edx,ebp + add ecx,DWORD[60+rsp] + and edi,eax + xor eax,ebx + ror ebp,7 + mov esi,edx + xor edi,eax + rol edx,5 + add ecx,edi + xor esi,ebp + xor ebp,eax + add ecx,edx + pxor xmm0,xmm4 + add ebx,DWORD[rsp] + and esi,ebp + xor ebp,eax + ror edx,7 + punpcklqdq xmm8,xmm7 + mov edi,ecx + xor esi,ebp + pxor xmm0,xmm1 + rol ecx,5 + add ebx,esi + movdqa xmm10,xmm9 + xor edi,edx + paddd xmm9,xmm7 + xor edx,ebp + pxor xmm0,xmm8 + add ebx,ecx + add eax,DWORD[4+rsp] + and edi,edx + xor edx,ebp + ror ecx,7 + movdqa xmm8,xmm0 + mov esi,ebx + xor edi,edx + movdqa XMMWORD[48+rsp],xmm9 + rol ebx,5 + add eax,edi + xor esi,ecx + pslld xmm0,2 + xor ecx,edx + add eax,ebx + psrld xmm8,30 + add ebp,DWORD[8+rsp] + and esi,ecx + xor ecx,edx + por xmm0,xmm8 + ror ebx,7 + mov edi,eax + xor esi,ecx + rol eax,5 + pshufd xmm9,xmm7,238 + add ebp,esi + xor edi,ebx + xor ebx,ecx + add ebp,eax + add edx,DWORD[12+rsp] + and edi,ebx + xor ebx,ecx + ror eax,7 + mov esi,ebp + xor edi,ebx + rol ebp,5 + add edx,edi + xor esi,eax + xor eax,ebx + add edx,ebp + pxor xmm1,xmm5 + add ecx,DWORD[16+rsp] + and esi,eax + xor eax,ebx + ror ebp,7 + punpcklqdq xmm9,xmm0 + mov edi,edx + xor esi,eax + pxor xmm1,xmm2 + rol edx,5 + add ecx,esi + movdqa xmm8,xmm10 + xor edi,ebp + paddd xmm10,xmm0 + xor ebp,eax + pxor xmm1,xmm9 + add ecx,edx + add ebx,DWORD[20+rsp] + and edi,ebp + xor ebp,eax + ror edx,7 + movdqa xmm9,xmm1 + mov esi,ecx + xor edi,ebp + movdqa XMMWORD[rsp],xmm10 + rol ecx,5 + add ebx,edi + xor esi,edx + pslld xmm1,2 + xor edx,ebp + add ebx,ecx + psrld xmm9,30 + add eax,DWORD[24+rsp] + and esi,edx + xor edx,ebp + por xmm1,xmm9 + ror ecx,7 + mov edi,ebx + xor esi,edx + rol ebx,5 + pshufd xmm10,xmm0,238 + add eax,esi + xor 
edi,ecx + xor ecx,edx + add eax,ebx + add ebp,DWORD[28+rsp] + and edi,ecx + xor ecx,edx + ror ebx,7 + mov esi,eax + xor edi,ecx + rol eax,5 + add ebp,edi + xor esi,ebx + xor ebx,ecx + add ebp,eax + pxor xmm2,xmm6 + add edx,DWORD[32+rsp] + and esi,ebx + xor ebx,ecx + ror eax,7 + punpcklqdq xmm10,xmm1 + mov edi,ebp + xor esi,ebx + pxor xmm2,xmm3 + rol ebp,5 + add edx,esi + movdqa xmm9,xmm8 + xor edi,eax + paddd xmm8,xmm1 + xor eax,ebx + pxor xmm2,xmm10 + add edx,ebp + add ecx,DWORD[36+rsp] + and edi,eax + xor eax,ebx + ror ebp,7 + movdqa xmm10,xmm2 + mov esi,edx + xor edi,eax + movdqa XMMWORD[16+rsp],xmm8 + rol edx,5 + add ecx,edi + xor esi,ebp + pslld xmm2,2 + xor ebp,eax + add ecx,edx + psrld xmm10,30 + add ebx,DWORD[40+rsp] + and esi,ebp + xor ebp,eax + por xmm2,xmm10 + ror edx,7 + mov edi,ecx + xor esi,ebp + rol ecx,5 + pshufd xmm8,xmm1,238 + add ebx,esi + xor edi,edx + xor edx,ebp + add ebx,ecx + add eax,DWORD[44+rsp] + and edi,edx + xor edx,ebp + ror ecx,7 + mov esi,ebx + xor edi,edx + rol ebx,5 + add eax,edi + xor esi,edx + add eax,ebx + pxor xmm3,xmm7 + add ebp,DWORD[48+rsp] + xor esi,ecx + punpcklqdq xmm8,xmm2 + mov edi,eax + rol eax,5 + pxor xmm3,xmm4 + add ebp,esi + xor edi,ecx + movdqa xmm10,xmm9 + ror ebx,7 + paddd xmm9,xmm2 + add ebp,eax + pxor xmm3,xmm8 + add edx,DWORD[52+rsp] + xor edi,ebx + mov esi,ebp + rol ebp,5 + movdqa xmm8,xmm3 + add edx,edi + xor esi,ebx + movdqa XMMWORD[32+rsp],xmm9 + ror eax,7 + add edx,ebp + add ecx,DWORD[56+rsp] + pslld xmm3,2 + xor esi,eax + mov edi,edx + psrld xmm8,30 + rol edx,5 + add ecx,esi + xor edi,eax + ror ebp,7 + por xmm3,xmm8 + add ecx,edx + add ebx,DWORD[60+rsp] + xor edi,ebp + mov esi,ecx + rol ecx,5 + add ebx,edi + xor esi,ebp + ror edx,7 + add ebx,ecx + add eax,DWORD[rsp] + xor esi,edx + mov edi,ebx + rol ebx,5 + paddd xmm10,xmm3 + add eax,esi + xor edi,edx + movdqa XMMWORD[48+rsp],xmm10 + ror ecx,7 + add eax,ebx + add ebp,DWORD[4+rsp] + xor edi,ecx + mov esi,eax + rol eax,5 + add ebp,edi + xor esi,ecx + ror 
ebx,7 + add ebp,eax + add edx,DWORD[8+rsp] + xor esi,ebx + mov edi,ebp + rol ebp,5 + add edx,esi + xor edi,ebx + ror eax,7 + add edx,ebp + add ecx,DWORD[12+rsp] + xor edi,eax + mov esi,edx + rol edx,5 + add ecx,edi + xor esi,eax + ror ebp,7 + add ecx,edx + cmp r9,r10 + je NEAR $L$done_ssse3 + movdqa xmm6,XMMWORD[64+r14] + movdqa xmm9,XMMWORD[((-64))+r14] + movdqu xmm0,XMMWORD[r9] + movdqu xmm1,XMMWORD[16+r9] + movdqu xmm2,XMMWORD[32+r9] + movdqu xmm3,XMMWORD[48+r9] + pshufb xmm0,xmm6 + add r9,64 + add ebx,DWORD[16+rsp] + xor esi,ebp + mov edi,ecx + pshufb xmm1,xmm6 + rol ecx,5 + add ebx,esi + xor edi,ebp + ror edx,7 + paddd xmm0,xmm9 + add ebx,ecx + add eax,DWORD[20+rsp] + xor edi,edx + mov esi,ebx + movdqa XMMWORD[rsp],xmm0 + rol ebx,5 + add eax,edi + xor esi,edx + ror ecx,7 + psubd xmm0,xmm9 + add eax,ebx + add ebp,DWORD[24+rsp] + xor esi,ecx + mov edi,eax + rol eax,5 + add ebp,esi + xor edi,ecx + ror ebx,7 + add ebp,eax + add edx,DWORD[28+rsp] + xor edi,ebx + mov esi,ebp + rol ebp,5 + add edx,edi + xor esi,ebx + ror eax,7 + add edx,ebp + add ecx,DWORD[32+rsp] + xor esi,eax + mov edi,edx + pshufb xmm2,xmm6 + rol edx,5 + add ecx,esi + xor edi,eax + ror ebp,7 + paddd xmm1,xmm9 + add ecx,edx + add ebx,DWORD[36+rsp] + xor edi,ebp + mov esi,ecx + movdqa XMMWORD[16+rsp],xmm1 + rol ecx,5 + add ebx,edi + xor esi,ebp + ror edx,7 + psubd xmm1,xmm9 + add ebx,ecx + add eax,DWORD[40+rsp] + xor esi,edx + mov edi,ebx + rol ebx,5 + add eax,esi + xor edi,edx + ror ecx,7 + add eax,ebx + add ebp,DWORD[44+rsp] + xor edi,ecx + mov esi,eax + rol eax,5 + add ebp,edi + xor esi,ecx + ror ebx,7 + add ebp,eax + add edx,DWORD[48+rsp] + xor esi,ebx + mov edi,ebp + pshufb xmm3,xmm6 + rol ebp,5 + add edx,esi + xor edi,ebx + ror eax,7 + paddd xmm2,xmm9 + add edx,ebp + add ecx,DWORD[52+rsp] + xor edi,eax + mov esi,edx + movdqa XMMWORD[32+rsp],xmm2 + rol edx,5 + add ecx,edi + xor esi,eax + ror ebp,7 + psubd xmm2,xmm9 + add ecx,edx + add ebx,DWORD[56+rsp] + xor esi,ebp + mov edi,ecx + rol ecx,5 + 
add ebx,esi + xor edi,ebp + ror edx,7 + add ebx,ecx + add eax,DWORD[60+rsp] + xor edi,edx + mov esi,ebx + rol ebx,5 + add eax,edi + ror ecx,7 + add eax,ebx + add eax,DWORD[r8] + add esi,DWORD[4+r8] + add ecx,DWORD[8+r8] + add edx,DWORD[12+r8] + mov DWORD[r8],eax + add ebp,DWORD[16+r8] + mov DWORD[4+r8],esi + mov ebx,esi + mov DWORD[8+r8],ecx + mov edi,ecx + mov DWORD[12+r8],edx + xor edi,edx + mov DWORD[16+r8],ebp + and esi,edi + jmp NEAR $L$oop_ssse3 + +ALIGN 16 +$L$done_ssse3: + add ebx,DWORD[16+rsp] + xor esi,ebp + mov edi,ecx + rol ecx,5 + add ebx,esi + xor edi,ebp + ror edx,7 + add ebx,ecx + add eax,DWORD[20+rsp] + xor edi,edx + mov esi,ebx + rol ebx,5 + add eax,edi + xor esi,edx + ror ecx,7 + add eax,ebx + add ebp,DWORD[24+rsp] + xor esi,ecx + mov edi,eax + rol eax,5 + add ebp,esi + xor edi,ecx + ror ebx,7 + add ebp,eax + add edx,DWORD[28+rsp] + xor edi,ebx + mov esi,ebp + rol ebp,5 + add edx,edi + xor esi,ebx + ror eax,7 + add edx,ebp + add ecx,DWORD[32+rsp] + xor esi,eax + mov edi,edx + rol edx,5 + add ecx,esi + xor edi,eax + ror ebp,7 + add ecx,edx + add ebx,DWORD[36+rsp] + xor edi,ebp + mov esi,ecx + rol ecx,5 + add ebx,edi + xor esi,ebp + ror edx,7 + add ebx,ecx + add eax,DWORD[40+rsp] + xor esi,edx + mov edi,ebx + rol ebx,5 + add eax,esi + xor edi,edx + ror ecx,7 + add eax,ebx + add ebp,DWORD[44+rsp] + xor edi,ecx + mov esi,eax + rol eax,5 + add ebp,edi + xor esi,ecx + ror ebx,7 + add ebp,eax + add edx,DWORD[48+rsp] + xor esi,ebx + mov edi,ebp + rol ebp,5 + add edx,esi + xor edi,ebx + ror eax,7 + add edx,ebp + add ecx,DWORD[52+rsp] + xor edi,eax + mov esi,edx + rol edx,5 + add ecx,edi + xor esi,eax + ror ebp,7 + add ecx,edx + add ebx,DWORD[56+rsp] + xor esi,ebp + mov edi,ecx + rol ecx,5 + add ebx,esi + xor edi,ebp + ror edx,7 + add ebx,ecx + add eax,DWORD[60+rsp] + xor edi,edx + mov esi,ebx + rol ebx,5 + add eax,edi + ror ecx,7 + add eax,ebx + add eax,DWORD[r8] + add esi,DWORD[4+r8] + add ecx,DWORD[8+r8] + mov DWORD[r8],eax + add edx,DWORD[12+r8] + mov 
DWORD[4+r8],esi + add ebp,DWORD[16+r8] + mov DWORD[8+r8],ecx + mov DWORD[12+r8],edx + mov DWORD[16+r8],ebp + movaps xmm6,XMMWORD[((-40-96))+r11] + movaps xmm7,XMMWORD[((-40-80))+r11] + movaps xmm8,XMMWORD[((-40-64))+r11] + movaps xmm9,XMMWORD[((-40-48))+r11] + movaps xmm10,XMMWORD[((-40-32))+r11] + movaps xmm11,XMMWORD[((-40-16))+r11] + mov r14,QWORD[((-40))+r11] + + mov r13,QWORD[((-32))+r11] + + mov r12,QWORD[((-24))+r11] + + mov rbp,QWORD[((-16))+r11] + + mov rbx,QWORD[((-8))+r11] + + lea rsp,[r11] + +$L$epilogue_ssse3: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_sha1_block_data_order_ssse3: +global sha1_block_data_order_avx + +ALIGN 16 +sha1_block_data_order_avx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sha1_block_data_order_avx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + mov r11,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + lea rsp,[((-160))+rsp] + vzeroupper + vmovaps XMMWORD[(-40-96)+r11],xmm6 + vmovaps XMMWORD[(-40-80)+r11],xmm7 + vmovaps XMMWORD[(-40-64)+r11],xmm8 + vmovaps XMMWORD[(-40-48)+r11],xmm9 + vmovaps XMMWORD[(-40-32)+r11],xmm10 + vmovaps XMMWORD[(-40-16)+r11],xmm11 +$L$prologue_avx: + and rsp,-64 + mov r8,rdi + mov r9,rsi + mov r10,rdx + + shl r10,6 + add r10,r9 + lea r14,[((K_XX_XX+64))] + + mov eax,DWORD[r8] + mov ebx,DWORD[4+r8] + mov ecx,DWORD[8+r8] + mov edx,DWORD[12+r8] + mov esi,ebx + mov ebp,DWORD[16+r8] + mov edi,ecx + xor edi,edx + and esi,edi + + vmovdqa xmm6,XMMWORD[64+r14] + vmovdqa xmm11,XMMWORD[((-64))+r14] + vmovdqu xmm0,XMMWORD[r9] + vmovdqu xmm1,XMMWORD[16+r9] + vmovdqu xmm2,XMMWORD[32+r9] + vmovdqu xmm3,XMMWORD[48+r9] + vpshufb xmm0,xmm0,xmm6 + add r9,64 + vpshufb xmm1,xmm1,xmm6 + vpshufb xmm2,xmm2,xmm6 + vpshufb xmm3,xmm3,xmm6 + vpaddd xmm4,xmm0,xmm11 + vpaddd xmm5,xmm1,xmm11 + vpaddd xmm6,xmm2,xmm11 + vmovdqa XMMWORD[rsp],xmm4 + vmovdqa XMMWORD[16+rsp],xmm5 + vmovdqa XMMWORD[32+rsp],xmm6 + jmp 
NEAR $L$oop_avx +ALIGN 16 +$L$oop_avx: + shrd ebx,ebx,2 + xor esi,edx + vpalignr xmm4,xmm1,xmm0,8 + mov edi,eax + add ebp,DWORD[rsp] + vpaddd xmm9,xmm11,xmm3 + xor ebx,ecx + shld eax,eax,5 + vpsrldq xmm8,xmm3,4 + add ebp,esi + and edi,ebx + vpxor xmm4,xmm4,xmm0 + xor ebx,ecx + add ebp,eax + vpxor xmm8,xmm8,xmm2 + shrd eax,eax,7 + xor edi,ecx + mov esi,ebp + add edx,DWORD[4+rsp] + vpxor xmm4,xmm4,xmm8 + xor eax,ebx + shld ebp,ebp,5 + vmovdqa XMMWORD[48+rsp],xmm9 + add edx,edi + and esi,eax + vpsrld xmm8,xmm4,31 + xor eax,ebx + add edx,ebp + shrd ebp,ebp,7 + xor esi,ebx + vpslldq xmm10,xmm4,12 + vpaddd xmm4,xmm4,xmm4 + mov edi,edx + add ecx,DWORD[8+rsp] + xor ebp,eax + shld edx,edx,5 + vpsrld xmm9,xmm10,30 + vpor xmm4,xmm4,xmm8 + add ecx,esi + and edi,ebp + xor ebp,eax + add ecx,edx + vpslld xmm10,xmm10,2 + vpxor xmm4,xmm4,xmm9 + shrd edx,edx,7 + xor edi,eax + mov esi,ecx + add ebx,DWORD[12+rsp] + vpxor xmm4,xmm4,xmm10 + xor edx,ebp + shld ecx,ecx,5 + add ebx,edi + and esi,edx + xor edx,ebp + add ebx,ecx + shrd ecx,ecx,7 + xor esi,ebp + vpalignr xmm5,xmm2,xmm1,8 + mov edi,ebx + add eax,DWORD[16+rsp] + vpaddd xmm9,xmm11,xmm4 + xor ecx,edx + shld ebx,ebx,5 + vpsrldq xmm8,xmm4,4 + add eax,esi + and edi,ecx + vpxor xmm5,xmm5,xmm1 + xor ecx,edx + add eax,ebx + vpxor xmm8,xmm8,xmm3 + shrd ebx,ebx,7 + xor edi,edx + mov esi,eax + add ebp,DWORD[20+rsp] + vpxor xmm5,xmm5,xmm8 + xor ebx,ecx + shld eax,eax,5 + vmovdqa XMMWORD[rsp],xmm9 + add ebp,edi + and esi,ebx + vpsrld xmm8,xmm5,31 + xor ebx,ecx + add ebp,eax + shrd eax,eax,7 + xor esi,ecx + vpslldq xmm10,xmm5,12 + vpaddd xmm5,xmm5,xmm5 + mov edi,ebp + add edx,DWORD[24+rsp] + xor eax,ebx + shld ebp,ebp,5 + vpsrld xmm9,xmm10,30 + vpor xmm5,xmm5,xmm8 + add edx,esi + and edi,eax + xor eax,ebx + add edx,ebp + vpslld xmm10,xmm10,2 + vpxor xmm5,xmm5,xmm9 + shrd ebp,ebp,7 + xor edi,ebx + mov esi,edx + add ecx,DWORD[28+rsp] + vpxor xmm5,xmm5,xmm10 + xor ebp,eax + shld edx,edx,5 + vmovdqa xmm11,XMMWORD[((-32))+r14] + add ecx,edi + and 
esi,ebp + xor ebp,eax + add ecx,edx + shrd edx,edx,7 + xor esi,eax + vpalignr xmm6,xmm3,xmm2,8 + mov edi,ecx + add ebx,DWORD[32+rsp] + vpaddd xmm9,xmm11,xmm5 + xor edx,ebp + shld ecx,ecx,5 + vpsrldq xmm8,xmm5,4 + add ebx,esi + and edi,edx + vpxor xmm6,xmm6,xmm2 + xor edx,ebp + add ebx,ecx + vpxor xmm8,xmm8,xmm4 + shrd ecx,ecx,7 + xor edi,ebp + mov esi,ebx + add eax,DWORD[36+rsp] + vpxor xmm6,xmm6,xmm8 + xor ecx,edx + shld ebx,ebx,5 + vmovdqa XMMWORD[16+rsp],xmm9 + add eax,edi + and esi,ecx + vpsrld xmm8,xmm6,31 + xor ecx,edx + add eax,ebx + shrd ebx,ebx,7 + xor esi,edx + vpslldq xmm10,xmm6,12 + vpaddd xmm6,xmm6,xmm6 + mov edi,eax + add ebp,DWORD[40+rsp] + xor ebx,ecx + shld eax,eax,5 + vpsrld xmm9,xmm10,30 + vpor xmm6,xmm6,xmm8 + add ebp,esi + and edi,ebx + xor ebx,ecx + add ebp,eax + vpslld xmm10,xmm10,2 + vpxor xmm6,xmm6,xmm9 + shrd eax,eax,7 + xor edi,ecx + mov esi,ebp + add edx,DWORD[44+rsp] + vpxor xmm6,xmm6,xmm10 + xor eax,ebx + shld ebp,ebp,5 + add edx,edi + and esi,eax + xor eax,ebx + add edx,ebp + shrd ebp,ebp,7 + xor esi,ebx + vpalignr xmm7,xmm4,xmm3,8 + mov edi,edx + add ecx,DWORD[48+rsp] + vpaddd xmm9,xmm11,xmm6 + xor ebp,eax + shld edx,edx,5 + vpsrldq xmm8,xmm6,4 + add ecx,esi + and edi,ebp + vpxor xmm7,xmm7,xmm3 + xor ebp,eax + add ecx,edx + vpxor xmm8,xmm8,xmm5 + shrd edx,edx,7 + xor edi,eax + mov esi,ecx + add ebx,DWORD[52+rsp] + vpxor xmm7,xmm7,xmm8 + xor edx,ebp + shld ecx,ecx,5 + vmovdqa XMMWORD[32+rsp],xmm9 + add ebx,edi + and esi,edx + vpsrld xmm8,xmm7,31 + xor edx,ebp + add ebx,ecx + shrd ecx,ecx,7 + xor esi,ebp + vpslldq xmm10,xmm7,12 + vpaddd xmm7,xmm7,xmm7 + mov edi,ebx + add eax,DWORD[56+rsp] + xor ecx,edx + shld ebx,ebx,5 + vpsrld xmm9,xmm10,30 + vpor xmm7,xmm7,xmm8 + add eax,esi + and edi,ecx + xor ecx,edx + add eax,ebx + vpslld xmm10,xmm10,2 + vpxor xmm7,xmm7,xmm9 + shrd ebx,ebx,7 + xor edi,edx + mov esi,eax + add ebp,DWORD[60+rsp] + vpxor xmm7,xmm7,xmm10 + xor ebx,ecx + shld eax,eax,5 + add ebp,edi + and esi,ebx + xor ebx,ecx + add 
ebp,eax + vpalignr xmm8,xmm7,xmm6,8 + vpxor xmm0,xmm0,xmm4 + shrd eax,eax,7 + xor esi,ecx + mov edi,ebp + add edx,DWORD[rsp] + vpxor xmm0,xmm0,xmm1 + xor eax,ebx + shld ebp,ebp,5 + vpaddd xmm9,xmm11,xmm7 + add edx,esi + and edi,eax + vpxor xmm0,xmm0,xmm8 + xor eax,ebx + add edx,ebp + shrd ebp,ebp,7 + xor edi,ebx + vpsrld xmm8,xmm0,30 + vmovdqa XMMWORD[48+rsp],xmm9 + mov esi,edx + add ecx,DWORD[4+rsp] + xor ebp,eax + shld edx,edx,5 + vpslld xmm0,xmm0,2 + add ecx,edi + and esi,ebp + xor ebp,eax + add ecx,edx + shrd edx,edx,7 + xor esi,eax + mov edi,ecx + add ebx,DWORD[8+rsp] + vpor xmm0,xmm0,xmm8 + xor edx,ebp + shld ecx,ecx,5 + add ebx,esi + and edi,edx + xor edx,ebp + add ebx,ecx + add eax,DWORD[12+rsp] + xor edi,ebp + mov esi,ebx + shld ebx,ebx,5 + add eax,edi + xor esi,edx + shrd ecx,ecx,7 + add eax,ebx + vpalignr xmm8,xmm0,xmm7,8 + vpxor xmm1,xmm1,xmm5 + add ebp,DWORD[16+rsp] + xor esi,ecx + mov edi,eax + shld eax,eax,5 + vpxor xmm1,xmm1,xmm2 + add ebp,esi + xor edi,ecx + vpaddd xmm9,xmm11,xmm0 + shrd ebx,ebx,7 + add ebp,eax + vpxor xmm1,xmm1,xmm8 + add edx,DWORD[20+rsp] + xor edi,ebx + mov esi,ebp + shld ebp,ebp,5 + vpsrld xmm8,xmm1,30 + vmovdqa XMMWORD[rsp],xmm9 + add edx,edi + xor esi,ebx + shrd eax,eax,7 + add edx,ebp + vpslld xmm1,xmm1,2 + add ecx,DWORD[24+rsp] + xor esi,eax + mov edi,edx + shld edx,edx,5 + add ecx,esi + xor edi,eax + shrd ebp,ebp,7 + add ecx,edx + vpor xmm1,xmm1,xmm8 + add ebx,DWORD[28+rsp] + xor edi,ebp + mov esi,ecx + shld ecx,ecx,5 + add ebx,edi + xor esi,ebp + shrd edx,edx,7 + add ebx,ecx + vpalignr xmm8,xmm1,xmm0,8 + vpxor xmm2,xmm2,xmm6 + add eax,DWORD[32+rsp] + xor esi,edx + mov edi,ebx + shld ebx,ebx,5 + vpxor xmm2,xmm2,xmm3 + add eax,esi + xor edi,edx + vpaddd xmm9,xmm11,xmm1 + vmovdqa xmm11,XMMWORD[r14] + shrd ecx,ecx,7 + add eax,ebx + vpxor xmm2,xmm2,xmm8 + add ebp,DWORD[36+rsp] + xor edi,ecx + mov esi,eax + shld eax,eax,5 + vpsrld xmm8,xmm2,30 + vmovdqa XMMWORD[16+rsp],xmm9 + add ebp,edi + xor esi,ecx + shrd ebx,ebx,7 + add 
ebp,eax + vpslld xmm2,xmm2,2 + add edx,DWORD[40+rsp] + xor esi,ebx + mov edi,ebp + shld ebp,ebp,5 + add edx,esi + xor edi,ebx + shrd eax,eax,7 + add edx,ebp + vpor xmm2,xmm2,xmm8 + add ecx,DWORD[44+rsp] + xor edi,eax + mov esi,edx + shld edx,edx,5 + add ecx,edi + xor esi,eax + shrd ebp,ebp,7 + add ecx,edx + vpalignr xmm8,xmm2,xmm1,8 + vpxor xmm3,xmm3,xmm7 + add ebx,DWORD[48+rsp] + xor esi,ebp + mov edi,ecx + shld ecx,ecx,5 + vpxor xmm3,xmm3,xmm4 + add ebx,esi + xor edi,ebp + vpaddd xmm9,xmm11,xmm2 + shrd edx,edx,7 + add ebx,ecx + vpxor xmm3,xmm3,xmm8 + add eax,DWORD[52+rsp] + xor edi,edx + mov esi,ebx + shld ebx,ebx,5 + vpsrld xmm8,xmm3,30 + vmovdqa XMMWORD[32+rsp],xmm9 + add eax,edi + xor esi,edx + shrd ecx,ecx,7 + add eax,ebx + vpslld xmm3,xmm3,2 + add ebp,DWORD[56+rsp] + xor esi,ecx + mov edi,eax + shld eax,eax,5 + add ebp,esi + xor edi,ecx + shrd ebx,ebx,7 + add ebp,eax + vpor xmm3,xmm3,xmm8 + add edx,DWORD[60+rsp] + xor edi,ebx + mov esi,ebp + shld ebp,ebp,5 + add edx,edi + xor esi,ebx + shrd eax,eax,7 + add edx,ebp + vpalignr xmm8,xmm3,xmm2,8 + vpxor xmm4,xmm4,xmm0 + add ecx,DWORD[rsp] + xor esi,eax + mov edi,edx + shld edx,edx,5 + vpxor xmm4,xmm4,xmm5 + add ecx,esi + xor edi,eax + vpaddd xmm9,xmm11,xmm3 + shrd ebp,ebp,7 + add ecx,edx + vpxor xmm4,xmm4,xmm8 + add ebx,DWORD[4+rsp] + xor edi,ebp + mov esi,ecx + shld ecx,ecx,5 + vpsrld xmm8,xmm4,30 + vmovdqa XMMWORD[48+rsp],xmm9 + add ebx,edi + xor esi,ebp + shrd edx,edx,7 + add ebx,ecx + vpslld xmm4,xmm4,2 + add eax,DWORD[8+rsp] + xor esi,edx + mov edi,ebx + shld ebx,ebx,5 + add eax,esi + xor edi,edx + shrd ecx,ecx,7 + add eax,ebx + vpor xmm4,xmm4,xmm8 + add ebp,DWORD[12+rsp] + xor edi,ecx + mov esi,eax + shld eax,eax,5 + add ebp,edi + xor esi,ecx + shrd ebx,ebx,7 + add ebp,eax + vpalignr xmm8,xmm4,xmm3,8 + vpxor xmm5,xmm5,xmm1 + add edx,DWORD[16+rsp] + xor esi,ebx + mov edi,ebp + shld ebp,ebp,5 + vpxor xmm5,xmm5,xmm6 + add edx,esi + xor edi,ebx + vpaddd xmm9,xmm11,xmm4 + shrd eax,eax,7 + add edx,ebp + vpxor 
xmm5,xmm5,xmm8 + add ecx,DWORD[20+rsp] + xor edi,eax + mov esi,edx + shld edx,edx,5 + vpsrld xmm8,xmm5,30 + vmovdqa XMMWORD[rsp],xmm9 + add ecx,edi + xor esi,eax + shrd ebp,ebp,7 + add ecx,edx + vpslld xmm5,xmm5,2 + add ebx,DWORD[24+rsp] + xor esi,ebp + mov edi,ecx + shld ecx,ecx,5 + add ebx,esi + xor edi,ebp + shrd edx,edx,7 + add ebx,ecx + vpor xmm5,xmm5,xmm8 + add eax,DWORD[28+rsp] + shrd ecx,ecx,7 + mov esi,ebx + xor edi,edx + shld ebx,ebx,5 + add eax,edi + xor esi,ecx + xor ecx,edx + add eax,ebx + vpalignr xmm8,xmm5,xmm4,8 + vpxor xmm6,xmm6,xmm2 + add ebp,DWORD[32+rsp] + and esi,ecx + xor ecx,edx + shrd ebx,ebx,7 + vpxor xmm6,xmm6,xmm7 + mov edi,eax + xor esi,ecx + vpaddd xmm9,xmm11,xmm5 + shld eax,eax,5 + add ebp,esi + vpxor xmm6,xmm6,xmm8 + xor edi,ebx + xor ebx,ecx + add ebp,eax + add edx,DWORD[36+rsp] + vpsrld xmm8,xmm6,30 + vmovdqa XMMWORD[16+rsp],xmm9 + and edi,ebx + xor ebx,ecx + shrd eax,eax,7 + mov esi,ebp + vpslld xmm6,xmm6,2 + xor edi,ebx + shld ebp,ebp,5 + add edx,edi + xor esi,eax + xor eax,ebx + add edx,ebp + add ecx,DWORD[40+rsp] + and esi,eax + vpor xmm6,xmm6,xmm8 + xor eax,ebx + shrd ebp,ebp,7 + mov edi,edx + xor esi,eax + shld edx,edx,5 + add ecx,esi + xor edi,ebp + xor ebp,eax + add ecx,edx + add ebx,DWORD[44+rsp] + and edi,ebp + xor ebp,eax + shrd edx,edx,7 + mov esi,ecx + xor edi,ebp + shld ecx,ecx,5 + add ebx,edi + xor esi,edx + xor edx,ebp + add ebx,ecx + vpalignr xmm8,xmm6,xmm5,8 + vpxor xmm7,xmm7,xmm3 + add eax,DWORD[48+rsp] + and esi,edx + xor edx,ebp + shrd ecx,ecx,7 + vpxor xmm7,xmm7,xmm0 + mov edi,ebx + xor esi,edx + vpaddd xmm9,xmm11,xmm6 + vmovdqa xmm11,XMMWORD[32+r14] + shld ebx,ebx,5 + add eax,esi + vpxor xmm7,xmm7,xmm8 + xor edi,ecx + xor ecx,edx + add eax,ebx + add ebp,DWORD[52+rsp] + vpsrld xmm8,xmm7,30 + vmovdqa XMMWORD[32+rsp],xmm9 + and edi,ecx + xor ecx,edx + shrd ebx,ebx,7 + mov esi,eax + vpslld xmm7,xmm7,2 + xor edi,ecx + shld eax,eax,5 + add ebp,edi + xor esi,ebx + xor ebx,ecx + add ebp,eax + add edx,DWORD[56+rsp] + 
and esi,ebx + vpor xmm7,xmm7,xmm8 + xor ebx,ecx + shrd eax,eax,7 + mov edi,ebp + xor esi,ebx + shld ebp,ebp,5 + add edx,esi + xor edi,eax + xor eax,ebx + add edx,ebp + add ecx,DWORD[60+rsp] + and edi,eax + xor eax,ebx + shrd ebp,ebp,7 + mov esi,edx + xor edi,eax + shld edx,edx,5 + add ecx,edi + xor esi,ebp + xor ebp,eax + add ecx,edx + vpalignr xmm8,xmm7,xmm6,8 + vpxor xmm0,xmm0,xmm4 + add ebx,DWORD[rsp] + and esi,ebp + xor ebp,eax + shrd edx,edx,7 + vpxor xmm0,xmm0,xmm1 + mov edi,ecx + xor esi,ebp + vpaddd xmm9,xmm11,xmm7 + shld ecx,ecx,5 + add ebx,esi + vpxor xmm0,xmm0,xmm8 + xor edi,edx + xor edx,ebp + add ebx,ecx + add eax,DWORD[4+rsp] + vpsrld xmm8,xmm0,30 + vmovdqa XMMWORD[48+rsp],xmm9 + and edi,edx + xor edx,ebp + shrd ecx,ecx,7 + mov esi,ebx + vpslld xmm0,xmm0,2 + xor edi,edx + shld ebx,ebx,5 + add eax,edi + xor esi,ecx + xor ecx,edx + add eax,ebx + add ebp,DWORD[8+rsp] + and esi,ecx + vpor xmm0,xmm0,xmm8 + xor ecx,edx + shrd ebx,ebx,7 + mov edi,eax + xor esi,ecx + shld eax,eax,5 + add ebp,esi + xor edi,ebx + xor ebx,ecx + add ebp,eax + add edx,DWORD[12+rsp] + and edi,ebx + xor ebx,ecx + shrd eax,eax,7 + mov esi,ebp + xor edi,ebx + shld ebp,ebp,5 + add edx,edi + xor esi,eax + xor eax,ebx + add edx,ebp + vpalignr xmm8,xmm0,xmm7,8 + vpxor xmm1,xmm1,xmm5 + add ecx,DWORD[16+rsp] + and esi,eax + xor eax,ebx + shrd ebp,ebp,7 + vpxor xmm1,xmm1,xmm2 + mov edi,edx + xor esi,eax + vpaddd xmm9,xmm11,xmm0 + shld edx,edx,5 + add ecx,esi + vpxor xmm1,xmm1,xmm8 + xor edi,ebp + xor ebp,eax + add ecx,edx + add ebx,DWORD[20+rsp] + vpsrld xmm8,xmm1,30 + vmovdqa XMMWORD[rsp],xmm9 + and edi,ebp + xor ebp,eax + shrd edx,edx,7 + mov esi,ecx + vpslld xmm1,xmm1,2 + xor edi,ebp + shld ecx,ecx,5 + add ebx,edi + xor esi,edx + xor edx,ebp + add ebx,ecx + add eax,DWORD[24+rsp] + and esi,edx + vpor xmm1,xmm1,xmm8 + xor edx,ebp + shrd ecx,ecx,7 + mov edi,ebx + xor esi,edx + shld ebx,ebx,5 + add eax,esi + xor edi,ecx + xor ecx,edx + add eax,ebx + add ebp,DWORD[28+rsp] + and edi,ecx + xor 
ecx,edx + shrd ebx,ebx,7 + mov esi,eax + xor edi,ecx + shld eax,eax,5 + add ebp,edi + xor esi,ebx + xor ebx,ecx + add ebp,eax + vpalignr xmm8,xmm1,xmm0,8 + vpxor xmm2,xmm2,xmm6 + add edx,DWORD[32+rsp] + and esi,ebx + xor ebx,ecx + shrd eax,eax,7 + vpxor xmm2,xmm2,xmm3 + mov edi,ebp + xor esi,ebx + vpaddd xmm9,xmm11,xmm1 + shld ebp,ebp,5 + add edx,esi + vpxor xmm2,xmm2,xmm8 + xor edi,eax + xor eax,ebx + add edx,ebp + add ecx,DWORD[36+rsp] + vpsrld xmm8,xmm2,30 + vmovdqa XMMWORD[16+rsp],xmm9 + and edi,eax + xor eax,ebx + shrd ebp,ebp,7 + mov esi,edx + vpslld xmm2,xmm2,2 + xor edi,eax + shld edx,edx,5 + add ecx,edi + xor esi,ebp + xor ebp,eax + add ecx,edx + add ebx,DWORD[40+rsp] + and esi,ebp + vpor xmm2,xmm2,xmm8 + xor ebp,eax + shrd edx,edx,7 + mov edi,ecx + xor esi,ebp + shld ecx,ecx,5 + add ebx,esi + xor edi,edx + xor edx,ebp + add ebx,ecx + add eax,DWORD[44+rsp] + and edi,edx + xor edx,ebp + shrd ecx,ecx,7 + mov esi,ebx + xor edi,edx + shld ebx,ebx,5 + add eax,edi + xor esi,edx + add eax,ebx + vpalignr xmm8,xmm2,xmm1,8 + vpxor xmm3,xmm3,xmm7 + add ebp,DWORD[48+rsp] + xor esi,ecx + mov edi,eax + shld eax,eax,5 + vpxor xmm3,xmm3,xmm4 + add ebp,esi + xor edi,ecx + vpaddd xmm9,xmm11,xmm2 + shrd ebx,ebx,7 + add ebp,eax + vpxor xmm3,xmm3,xmm8 + add edx,DWORD[52+rsp] + xor edi,ebx + mov esi,ebp + shld ebp,ebp,5 + vpsrld xmm8,xmm3,30 + vmovdqa XMMWORD[32+rsp],xmm9 + add edx,edi + xor esi,ebx + shrd eax,eax,7 + add edx,ebp + vpslld xmm3,xmm3,2 + add ecx,DWORD[56+rsp] + xor esi,eax + mov edi,edx + shld edx,edx,5 + add ecx,esi + xor edi,eax + shrd ebp,ebp,7 + add ecx,edx + vpor xmm3,xmm3,xmm8 + add ebx,DWORD[60+rsp] + xor edi,ebp + mov esi,ecx + shld ecx,ecx,5 + add ebx,edi + xor esi,ebp + shrd edx,edx,7 + add ebx,ecx + add eax,DWORD[rsp] + vpaddd xmm9,xmm11,xmm3 + xor esi,edx + mov edi,ebx + shld ebx,ebx,5 + add eax,esi + vmovdqa XMMWORD[48+rsp],xmm9 + xor edi,edx + shrd ecx,ecx,7 + add eax,ebx + add ebp,DWORD[4+rsp] + xor edi,ecx + mov esi,eax + shld eax,eax,5 + add 
ebp,edi + xor esi,ecx + shrd ebx,ebx,7 + add ebp,eax + add edx,DWORD[8+rsp] + xor esi,ebx + mov edi,ebp + shld ebp,ebp,5 + add edx,esi + xor edi,ebx + shrd eax,eax,7 + add edx,ebp + add ecx,DWORD[12+rsp] + xor edi,eax + mov esi,edx + shld edx,edx,5 + add ecx,edi + xor esi,eax + shrd ebp,ebp,7 + add ecx,edx + cmp r9,r10 + je NEAR $L$done_avx + vmovdqa xmm6,XMMWORD[64+r14] + vmovdqa xmm11,XMMWORD[((-64))+r14] + vmovdqu xmm0,XMMWORD[r9] + vmovdqu xmm1,XMMWORD[16+r9] + vmovdqu xmm2,XMMWORD[32+r9] + vmovdqu xmm3,XMMWORD[48+r9] + vpshufb xmm0,xmm0,xmm6 + add r9,64 + add ebx,DWORD[16+rsp] + xor esi,ebp + vpshufb xmm1,xmm1,xmm6 + mov edi,ecx + shld ecx,ecx,5 + vpaddd xmm4,xmm0,xmm11 + add ebx,esi + xor edi,ebp + shrd edx,edx,7 + add ebx,ecx + vmovdqa XMMWORD[rsp],xmm4 + add eax,DWORD[20+rsp] + xor edi,edx + mov esi,ebx + shld ebx,ebx,5 + add eax,edi + xor esi,edx + shrd ecx,ecx,7 + add eax,ebx + add ebp,DWORD[24+rsp] + xor esi,ecx + mov edi,eax + shld eax,eax,5 + add ebp,esi + xor edi,ecx + shrd ebx,ebx,7 + add ebp,eax + add edx,DWORD[28+rsp] + xor edi,ebx + mov esi,ebp + shld ebp,ebp,5 + add edx,edi + xor esi,ebx + shrd eax,eax,7 + add edx,ebp + add ecx,DWORD[32+rsp] + xor esi,eax + vpshufb xmm2,xmm2,xmm6 + mov edi,edx + shld edx,edx,5 + vpaddd xmm5,xmm1,xmm11 + add ecx,esi + xor edi,eax + shrd ebp,ebp,7 + add ecx,edx + vmovdqa XMMWORD[16+rsp],xmm5 + add ebx,DWORD[36+rsp] + xor edi,ebp + mov esi,ecx + shld ecx,ecx,5 + add ebx,edi + xor esi,ebp + shrd edx,edx,7 + add ebx,ecx + add eax,DWORD[40+rsp] + xor esi,edx + mov edi,ebx + shld ebx,ebx,5 + add eax,esi + xor edi,edx + shrd ecx,ecx,7 + add eax,ebx + add ebp,DWORD[44+rsp] + xor edi,ecx + mov esi,eax + shld eax,eax,5 + add ebp,edi + xor esi,ecx + shrd ebx,ebx,7 + add ebp,eax + add edx,DWORD[48+rsp] + xor esi,ebx + vpshufb xmm3,xmm3,xmm6 + mov edi,ebp + shld ebp,ebp,5 + vpaddd xmm6,xmm2,xmm11 + add edx,esi + xor edi,ebx + shrd eax,eax,7 + add edx,ebp + vmovdqa XMMWORD[32+rsp],xmm6 + add ecx,DWORD[52+rsp] + xor edi,eax + 
mov esi,edx + shld edx,edx,5 + add ecx,edi + xor esi,eax + shrd ebp,ebp,7 + add ecx,edx + add ebx,DWORD[56+rsp] + xor esi,ebp + mov edi,ecx + shld ecx,ecx,5 + add ebx,esi + xor edi,ebp + shrd edx,edx,7 + add ebx,ecx + add eax,DWORD[60+rsp] + xor edi,edx + mov esi,ebx + shld ebx,ebx,5 + add eax,edi + shrd ecx,ecx,7 + add eax,ebx + add eax,DWORD[r8] + add esi,DWORD[4+r8] + add ecx,DWORD[8+r8] + add edx,DWORD[12+r8] + mov DWORD[r8],eax + add ebp,DWORD[16+r8] + mov DWORD[4+r8],esi + mov ebx,esi + mov DWORD[8+r8],ecx + mov edi,ecx + mov DWORD[12+r8],edx + xor edi,edx + mov DWORD[16+r8],ebp + and esi,edi + jmp NEAR $L$oop_avx + +ALIGN 16 +$L$done_avx: + add ebx,DWORD[16+rsp] + xor esi,ebp + mov edi,ecx + shld ecx,ecx,5 + add ebx,esi + xor edi,ebp + shrd edx,edx,7 + add ebx,ecx + add eax,DWORD[20+rsp] + xor edi,edx + mov esi,ebx + shld ebx,ebx,5 + add eax,edi + xor esi,edx + shrd ecx,ecx,7 + add eax,ebx + add ebp,DWORD[24+rsp] + xor esi,ecx + mov edi,eax + shld eax,eax,5 + add ebp,esi + xor edi,ecx + shrd ebx,ebx,7 + add ebp,eax + add edx,DWORD[28+rsp] + xor edi,ebx + mov esi,ebp + shld ebp,ebp,5 + add edx,edi + xor esi,ebx + shrd eax,eax,7 + add edx,ebp + add ecx,DWORD[32+rsp] + xor esi,eax + mov edi,edx + shld edx,edx,5 + add ecx,esi + xor edi,eax + shrd ebp,ebp,7 + add ecx,edx + add ebx,DWORD[36+rsp] + xor edi,ebp + mov esi,ecx + shld ecx,ecx,5 + add ebx,edi + xor esi,ebp + shrd edx,edx,7 + add ebx,ecx + add eax,DWORD[40+rsp] + xor esi,edx + mov edi,ebx + shld ebx,ebx,5 + add eax,esi + xor edi,edx + shrd ecx,ecx,7 + add eax,ebx + add ebp,DWORD[44+rsp] + xor edi,ecx + mov esi,eax + shld eax,eax,5 + add ebp,edi + xor esi,ecx + shrd ebx,ebx,7 + add ebp,eax + add edx,DWORD[48+rsp] + xor esi,ebx + mov edi,ebp + shld ebp,ebp,5 + add edx,esi + xor edi,ebx + shrd eax,eax,7 + add edx,ebp + add ecx,DWORD[52+rsp] + xor edi,eax + mov esi,edx + shld edx,edx,5 + add ecx,edi + xor esi,eax + shrd ebp,ebp,7 + add ecx,edx + add ebx,DWORD[56+rsp] + xor esi,ebp + mov edi,ecx + shld 
ecx,ecx,5 + add ebx,esi + xor edi,ebp + shrd edx,edx,7 + add ebx,ecx + add eax,DWORD[60+rsp] + xor edi,edx + mov esi,ebx + shld ebx,ebx,5 + add eax,edi + shrd ecx,ecx,7 + add eax,ebx + vzeroupper + + add eax,DWORD[r8] + add esi,DWORD[4+r8] + add ecx,DWORD[8+r8] + mov DWORD[r8],eax + add edx,DWORD[12+r8] + mov DWORD[4+r8],esi + add ebp,DWORD[16+r8] + mov DWORD[8+r8],ecx + mov DWORD[12+r8],edx + mov DWORD[16+r8],ebp + movaps xmm6,XMMWORD[((-40-96))+r11] + movaps xmm7,XMMWORD[((-40-80))+r11] + movaps xmm8,XMMWORD[((-40-64))+r11] + movaps xmm9,XMMWORD[((-40-48))+r11] + movaps xmm10,XMMWORD[((-40-32))+r11] + movaps xmm11,XMMWORD[((-40-16))+r11] + mov r14,QWORD[((-40))+r11] + + mov r13,QWORD[((-32))+r11] + + mov r12,QWORD[((-24))+r11] + + mov rbp,QWORD[((-16))+r11] + + mov rbx,QWORD[((-8))+r11] + + lea rsp,[r11] + +$L$epilogue_avx: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_sha1_block_data_order_avx: +global sha1_block_data_order_avx2 + +ALIGN 16 +sha1_block_data_order_avx2: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sha1_block_data_order_avx2: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + mov r11,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + vzeroupper + lea rsp,[((-96))+rsp] + vmovaps XMMWORD[(-40-96)+r11],xmm6 + vmovaps XMMWORD[(-40-80)+r11],xmm7 + vmovaps XMMWORD[(-40-64)+r11],xmm8 + vmovaps XMMWORD[(-40-48)+r11],xmm9 + vmovaps XMMWORD[(-40-32)+r11],xmm10 + vmovaps XMMWORD[(-40-16)+r11],xmm11 +$L$prologue_avx2: + mov r8,rdi + mov r9,rsi + mov r10,rdx + + lea rsp,[((-640))+rsp] + shl r10,6 + lea r13,[64+r9] + and rsp,-128 + add r10,r9 + lea r14,[((K_XX_XX+64))] + + mov eax,DWORD[r8] + cmp r13,r10 + cmovae r13,r9 + mov ebp,DWORD[4+r8] + mov ecx,DWORD[8+r8] + mov edx,DWORD[12+r8] + mov esi,DWORD[16+r8] + vmovdqu ymm6,YMMWORD[64+r14] + + vmovdqu xmm0,XMMWORD[r9] + vmovdqu xmm1,XMMWORD[16+r9] + vmovdqu xmm2,XMMWORD[32+r9] + vmovdqu 
xmm3,XMMWORD[48+r9] + lea r9,[64+r9] + vinserti128 ymm0,ymm0,XMMWORD[r13],1 + vinserti128 ymm1,ymm1,XMMWORD[16+r13],1 + vpshufb ymm0,ymm0,ymm6 + vinserti128 ymm2,ymm2,XMMWORD[32+r13],1 + vpshufb ymm1,ymm1,ymm6 + vinserti128 ymm3,ymm3,XMMWORD[48+r13],1 + vpshufb ymm2,ymm2,ymm6 + vmovdqu ymm11,YMMWORD[((-64))+r14] + vpshufb ymm3,ymm3,ymm6 + + vpaddd ymm4,ymm0,ymm11 + vpaddd ymm5,ymm1,ymm11 + vmovdqu YMMWORD[rsp],ymm4 + vpaddd ymm6,ymm2,ymm11 + vmovdqu YMMWORD[32+rsp],ymm5 + vpaddd ymm7,ymm3,ymm11 + vmovdqu YMMWORD[64+rsp],ymm6 + vmovdqu YMMWORD[96+rsp],ymm7 + vpalignr ymm4,ymm1,ymm0,8 + vpsrldq ymm8,ymm3,4 + vpxor ymm4,ymm4,ymm0 + vpxor ymm8,ymm8,ymm2 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm8,ymm4,31 + vpslldq ymm10,ymm4,12 + vpaddd ymm4,ymm4,ymm4 + vpsrld ymm9,ymm10,30 + vpor ymm4,ymm4,ymm8 + vpslld ymm10,ymm10,2 + vpxor ymm4,ymm4,ymm9 + vpxor ymm4,ymm4,ymm10 + vpaddd ymm9,ymm4,ymm11 + vmovdqu YMMWORD[128+rsp],ymm9 + vpalignr ymm5,ymm2,ymm1,8 + vpsrldq ymm8,ymm4,4 + vpxor ymm5,ymm5,ymm1 + vpxor ymm8,ymm8,ymm3 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm5,31 + vmovdqu ymm11,YMMWORD[((-32))+r14] + vpslldq ymm10,ymm5,12 + vpaddd ymm5,ymm5,ymm5 + vpsrld ymm9,ymm10,30 + vpor ymm5,ymm5,ymm8 + vpslld ymm10,ymm10,2 + vpxor ymm5,ymm5,ymm9 + vpxor ymm5,ymm5,ymm10 + vpaddd ymm9,ymm5,ymm11 + vmovdqu YMMWORD[160+rsp],ymm9 + vpalignr ymm6,ymm3,ymm2,8 + vpsrldq ymm8,ymm5,4 + vpxor ymm6,ymm6,ymm2 + vpxor ymm8,ymm8,ymm4 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm6,31 + vpslldq ymm10,ymm6,12 + vpaddd ymm6,ymm6,ymm6 + vpsrld ymm9,ymm10,30 + vpor ymm6,ymm6,ymm8 + vpslld ymm10,ymm10,2 + vpxor ymm6,ymm6,ymm9 + vpxor ymm6,ymm6,ymm10 + vpaddd ymm9,ymm6,ymm11 + vmovdqu YMMWORD[192+rsp],ymm9 + vpalignr ymm7,ymm4,ymm3,8 + vpsrldq ymm8,ymm6,4 + vpxor ymm7,ymm7,ymm3 + vpxor ymm8,ymm8,ymm5 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm7,31 + vpslldq ymm10,ymm7,12 + vpaddd ymm7,ymm7,ymm7 + vpsrld ymm9,ymm10,30 + vpor ymm7,ymm7,ymm8 + vpslld ymm10,ymm10,2 + vpxor ymm7,ymm7,ymm9 + vpxor ymm7,ymm7,ymm10 + vpaddd 
ymm9,ymm7,ymm11 + vmovdqu YMMWORD[224+rsp],ymm9 + lea r13,[128+rsp] + jmp NEAR $L$oop_avx2 +ALIGN 32 +$L$oop_avx2: + rorx ebx,ebp,2 + andn edi,ebp,edx + and ebp,ecx + xor ebp,edi + jmp NEAR $L$align32_1 +ALIGN 32 +$L$align32_1: + vpalignr ymm8,ymm7,ymm6,8 + vpxor ymm0,ymm0,ymm4 + add esi,DWORD[((-128))+r13] + andn edi,eax,ecx + vpxor ymm0,ymm0,ymm1 + add esi,ebp + rorx r12d,eax,27 + rorx ebp,eax,2 + vpxor ymm0,ymm0,ymm8 + and eax,ebx + add esi,r12d + xor eax,edi + vpsrld ymm8,ymm0,30 + vpslld ymm0,ymm0,2 + add edx,DWORD[((-124))+r13] + andn edi,esi,ebx + add edx,eax + rorx r12d,esi,27 + rorx eax,esi,2 + and esi,ebp + vpor ymm0,ymm0,ymm8 + add edx,r12d + xor esi,edi + add ecx,DWORD[((-120))+r13] + andn edi,edx,ebp + vpaddd ymm9,ymm0,ymm11 + add ecx,esi + rorx r12d,edx,27 + rorx esi,edx,2 + and edx,eax + vmovdqu YMMWORD[256+rsp],ymm9 + add ecx,r12d + xor edx,edi + add ebx,DWORD[((-116))+r13] + andn edi,ecx,eax + add ebx,edx + rorx r12d,ecx,27 + rorx edx,ecx,2 + and ecx,esi + add ebx,r12d + xor ecx,edi + add ebp,DWORD[((-96))+r13] + andn edi,ebx,esi + add ebp,ecx + rorx r12d,ebx,27 + rorx ecx,ebx,2 + and ebx,edx + add ebp,r12d + xor ebx,edi + vpalignr ymm8,ymm0,ymm7,8 + vpxor ymm1,ymm1,ymm5 + add eax,DWORD[((-92))+r13] + andn edi,ebp,edx + vpxor ymm1,ymm1,ymm2 + add eax,ebx + rorx r12d,ebp,27 + rorx ebx,ebp,2 + vpxor ymm1,ymm1,ymm8 + and ebp,ecx + add eax,r12d + xor ebp,edi + vpsrld ymm8,ymm1,30 + vpslld ymm1,ymm1,2 + add esi,DWORD[((-88))+r13] + andn edi,eax,ecx + add esi,ebp + rorx r12d,eax,27 + rorx ebp,eax,2 + and eax,ebx + vpor ymm1,ymm1,ymm8 + add esi,r12d + xor eax,edi + add edx,DWORD[((-84))+r13] + andn edi,esi,ebx + vpaddd ymm9,ymm1,ymm11 + add edx,eax + rorx r12d,esi,27 + rorx eax,esi,2 + and esi,ebp + vmovdqu YMMWORD[288+rsp],ymm9 + add edx,r12d + xor esi,edi + add ecx,DWORD[((-64))+r13] + andn edi,edx,ebp + add ecx,esi + rorx r12d,edx,27 + rorx esi,edx,2 + and edx,eax + add ecx,r12d + xor edx,edi + add ebx,DWORD[((-60))+r13] + andn edi,ecx,eax + add 
ebx,edx + rorx r12d,ecx,27 + rorx edx,ecx,2 + and ecx,esi + add ebx,r12d + xor ecx,edi + vpalignr ymm8,ymm1,ymm0,8 + vpxor ymm2,ymm2,ymm6 + add ebp,DWORD[((-56))+r13] + andn edi,ebx,esi + vpxor ymm2,ymm2,ymm3 + vmovdqu ymm11,YMMWORD[r14] + add ebp,ecx + rorx r12d,ebx,27 + rorx ecx,ebx,2 + vpxor ymm2,ymm2,ymm8 + and ebx,edx + add ebp,r12d + xor ebx,edi + vpsrld ymm8,ymm2,30 + vpslld ymm2,ymm2,2 + add eax,DWORD[((-52))+r13] + andn edi,ebp,edx + add eax,ebx + rorx r12d,ebp,27 + rorx ebx,ebp,2 + and ebp,ecx + vpor ymm2,ymm2,ymm8 + add eax,r12d + xor ebp,edi + add esi,DWORD[((-32))+r13] + andn edi,eax,ecx + vpaddd ymm9,ymm2,ymm11 + add esi,ebp + rorx r12d,eax,27 + rorx ebp,eax,2 + and eax,ebx + vmovdqu YMMWORD[320+rsp],ymm9 + add esi,r12d + xor eax,edi + add edx,DWORD[((-28))+r13] + andn edi,esi,ebx + add edx,eax + rorx r12d,esi,27 + rorx eax,esi,2 + and esi,ebp + add edx,r12d + xor esi,edi + add ecx,DWORD[((-24))+r13] + andn edi,edx,ebp + add ecx,esi + rorx r12d,edx,27 + rorx esi,edx,2 + and edx,eax + add ecx,r12d + xor edx,edi + vpalignr ymm8,ymm2,ymm1,8 + vpxor ymm3,ymm3,ymm7 + add ebx,DWORD[((-20))+r13] + andn edi,ecx,eax + vpxor ymm3,ymm3,ymm4 + add ebx,edx + rorx r12d,ecx,27 + rorx edx,ecx,2 + vpxor ymm3,ymm3,ymm8 + and ecx,esi + add ebx,r12d + xor ecx,edi + vpsrld ymm8,ymm3,30 + vpslld ymm3,ymm3,2 + add ebp,DWORD[r13] + andn edi,ebx,esi + add ebp,ecx + rorx r12d,ebx,27 + rorx ecx,ebx,2 + and ebx,edx + vpor ymm3,ymm3,ymm8 + add ebp,r12d + xor ebx,edi + add eax,DWORD[4+r13] + andn edi,ebp,edx + vpaddd ymm9,ymm3,ymm11 + add eax,ebx + rorx r12d,ebp,27 + rorx ebx,ebp,2 + and ebp,ecx + vmovdqu YMMWORD[352+rsp],ymm9 + add eax,r12d + xor ebp,edi + add esi,DWORD[8+r13] + andn edi,eax,ecx + add esi,ebp + rorx r12d,eax,27 + rorx ebp,eax,2 + and eax,ebx + add esi,r12d + xor eax,edi + add edx,DWORD[12+r13] + lea edx,[rax*1+rdx] + rorx r12d,esi,27 + rorx eax,esi,2 + xor esi,ebp + add edx,r12d + xor esi,ebx + vpalignr ymm8,ymm3,ymm2,8 + vpxor ymm4,ymm4,ymm0 + add 
ecx,DWORD[32+r13] + lea ecx,[rsi*1+rcx] + vpxor ymm4,ymm4,ymm5 + rorx r12d,edx,27 + rorx esi,edx,2 + xor edx,eax + vpxor ymm4,ymm4,ymm8 + add ecx,r12d + xor edx,ebp + add ebx,DWORD[36+r13] + vpsrld ymm8,ymm4,30 + vpslld ymm4,ymm4,2 + lea ebx,[rdx*1+rbx] + rorx r12d,ecx,27 + rorx edx,ecx,2 + xor ecx,esi + add ebx,r12d + xor ecx,eax + vpor ymm4,ymm4,ymm8 + add ebp,DWORD[40+r13] + lea ebp,[rbp*1+rcx] + rorx r12d,ebx,27 + rorx ecx,ebx,2 + vpaddd ymm9,ymm4,ymm11 + xor ebx,edx + add ebp,r12d + xor ebx,esi + add eax,DWORD[44+r13] + vmovdqu YMMWORD[384+rsp],ymm9 + lea eax,[rbx*1+rax] + rorx r12d,ebp,27 + rorx ebx,ebp,2 + xor ebp,ecx + add eax,r12d + xor ebp,edx + add esi,DWORD[64+r13] + lea esi,[rbp*1+rsi] + rorx r12d,eax,27 + rorx ebp,eax,2 + xor eax,ebx + add esi,r12d + xor eax,ecx + vpalignr ymm8,ymm4,ymm3,8 + vpxor ymm5,ymm5,ymm1 + add edx,DWORD[68+r13] + lea edx,[rax*1+rdx] + vpxor ymm5,ymm5,ymm6 + rorx r12d,esi,27 + rorx eax,esi,2 + xor esi,ebp + vpxor ymm5,ymm5,ymm8 + add edx,r12d + xor esi,ebx + add ecx,DWORD[72+r13] + vpsrld ymm8,ymm5,30 + vpslld ymm5,ymm5,2 + lea ecx,[rsi*1+rcx] + rorx r12d,edx,27 + rorx esi,edx,2 + xor edx,eax + add ecx,r12d + xor edx,ebp + vpor ymm5,ymm5,ymm8 + add ebx,DWORD[76+r13] + lea ebx,[rdx*1+rbx] + rorx r12d,ecx,27 + rorx edx,ecx,2 + vpaddd ymm9,ymm5,ymm11 + xor ecx,esi + add ebx,r12d + xor ecx,eax + add ebp,DWORD[96+r13] + vmovdqu YMMWORD[416+rsp],ymm9 + lea ebp,[rbp*1+rcx] + rorx r12d,ebx,27 + rorx ecx,ebx,2 + xor ebx,edx + add ebp,r12d + xor ebx,esi + add eax,DWORD[100+r13] + lea eax,[rbx*1+rax] + rorx r12d,ebp,27 + rorx ebx,ebp,2 + xor ebp,ecx + add eax,r12d + xor ebp,edx + vpalignr ymm8,ymm5,ymm4,8 + vpxor ymm6,ymm6,ymm2 + add esi,DWORD[104+r13] + lea esi,[rbp*1+rsi] + vpxor ymm6,ymm6,ymm7 + rorx r12d,eax,27 + rorx ebp,eax,2 + xor eax,ebx + vpxor ymm6,ymm6,ymm8 + add esi,r12d + xor eax,ecx + add edx,DWORD[108+r13] + lea r13,[256+r13] + vpsrld ymm8,ymm6,30 + vpslld ymm6,ymm6,2 + lea edx,[rax*1+rdx] + rorx r12d,esi,27 + rorx 
eax,esi,2 + xor esi,ebp + add edx,r12d + xor esi,ebx + vpor ymm6,ymm6,ymm8 + add ecx,DWORD[((-128))+r13] + lea ecx,[rsi*1+rcx] + rorx r12d,edx,27 + rorx esi,edx,2 + vpaddd ymm9,ymm6,ymm11 + xor edx,eax + add ecx,r12d + xor edx,ebp + add ebx,DWORD[((-124))+r13] + vmovdqu YMMWORD[448+rsp],ymm9 + lea ebx,[rdx*1+rbx] + rorx r12d,ecx,27 + rorx edx,ecx,2 + xor ecx,esi + add ebx,r12d + xor ecx,eax + add ebp,DWORD[((-120))+r13] + lea ebp,[rbp*1+rcx] + rorx r12d,ebx,27 + rorx ecx,ebx,2 + xor ebx,edx + add ebp,r12d + xor ebx,esi + vpalignr ymm8,ymm6,ymm5,8 + vpxor ymm7,ymm7,ymm3 + add eax,DWORD[((-116))+r13] + lea eax,[rbx*1+rax] + vpxor ymm7,ymm7,ymm0 + vmovdqu ymm11,YMMWORD[32+r14] + rorx r12d,ebp,27 + rorx ebx,ebp,2 + xor ebp,ecx + vpxor ymm7,ymm7,ymm8 + add eax,r12d + xor ebp,edx + add esi,DWORD[((-96))+r13] + vpsrld ymm8,ymm7,30 + vpslld ymm7,ymm7,2 + lea esi,[rbp*1+rsi] + rorx r12d,eax,27 + rorx ebp,eax,2 + xor eax,ebx + add esi,r12d + xor eax,ecx + vpor ymm7,ymm7,ymm8 + add edx,DWORD[((-92))+r13] + lea edx,[rax*1+rdx] + rorx r12d,esi,27 + rorx eax,esi,2 + vpaddd ymm9,ymm7,ymm11 + xor esi,ebp + add edx,r12d + xor esi,ebx + add ecx,DWORD[((-88))+r13] + vmovdqu YMMWORD[480+rsp],ymm9 + lea ecx,[rsi*1+rcx] + rorx r12d,edx,27 + rorx esi,edx,2 + xor edx,eax + add ecx,r12d + xor edx,ebp + add ebx,DWORD[((-84))+r13] + mov edi,esi + xor edi,eax + lea ebx,[rdx*1+rbx] + rorx r12d,ecx,27 + rorx edx,ecx,2 + xor ecx,esi + add ebx,r12d + and ecx,edi + jmp NEAR $L$align32_2 +ALIGN 32 +$L$align32_2: + vpalignr ymm8,ymm7,ymm6,8 + vpxor ymm0,ymm0,ymm4 + add ebp,DWORD[((-64))+r13] + xor ecx,esi + vpxor ymm0,ymm0,ymm1 + mov edi,edx + xor edi,esi + lea ebp,[rbp*1+rcx] + vpxor ymm0,ymm0,ymm8 + rorx r12d,ebx,27 + rorx ecx,ebx,2 + xor ebx,edx + vpsrld ymm8,ymm0,30 + vpslld ymm0,ymm0,2 + add ebp,r12d + and ebx,edi + add eax,DWORD[((-60))+r13] + xor ebx,edx + mov edi,ecx + xor edi,edx + vpor ymm0,ymm0,ymm8 + lea eax,[rbx*1+rax] + rorx r12d,ebp,27 + rorx ebx,ebp,2 + xor ebp,ecx + vpaddd 
ymm9,ymm0,ymm11 + add eax,r12d + and ebp,edi + add esi,DWORD[((-56))+r13] + xor ebp,ecx + vmovdqu YMMWORD[512+rsp],ymm9 + mov edi,ebx + xor edi,ecx + lea esi,[rbp*1+rsi] + rorx r12d,eax,27 + rorx ebp,eax,2 + xor eax,ebx + add esi,r12d + and eax,edi + add edx,DWORD[((-52))+r13] + xor eax,ebx + mov edi,ebp + xor edi,ebx + lea edx,[rax*1+rdx] + rorx r12d,esi,27 + rorx eax,esi,2 + xor esi,ebp + add edx,r12d + and esi,edi + add ecx,DWORD[((-32))+r13] + xor esi,ebp + mov edi,eax + xor edi,ebp + lea ecx,[rsi*1+rcx] + rorx r12d,edx,27 + rorx esi,edx,2 + xor edx,eax + add ecx,r12d + and edx,edi + vpalignr ymm8,ymm0,ymm7,8 + vpxor ymm1,ymm1,ymm5 + add ebx,DWORD[((-28))+r13] + xor edx,eax + vpxor ymm1,ymm1,ymm2 + mov edi,esi + xor edi,eax + lea ebx,[rdx*1+rbx] + vpxor ymm1,ymm1,ymm8 + rorx r12d,ecx,27 + rorx edx,ecx,2 + xor ecx,esi + vpsrld ymm8,ymm1,30 + vpslld ymm1,ymm1,2 + add ebx,r12d + and ecx,edi + add ebp,DWORD[((-24))+r13] + xor ecx,esi + mov edi,edx + xor edi,esi + vpor ymm1,ymm1,ymm8 + lea ebp,[rbp*1+rcx] + rorx r12d,ebx,27 + rorx ecx,ebx,2 + xor ebx,edx + vpaddd ymm9,ymm1,ymm11 + add ebp,r12d + and ebx,edi + add eax,DWORD[((-20))+r13] + xor ebx,edx + vmovdqu YMMWORD[544+rsp],ymm9 + mov edi,ecx + xor edi,edx + lea eax,[rbx*1+rax] + rorx r12d,ebp,27 + rorx ebx,ebp,2 + xor ebp,ecx + add eax,r12d + and ebp,edi + add esi,DWORD[r13] + xor ebp,ecx + mov edi,ebx + xor edi,ecx + lea esi,[rbp*1+rsi] + rorx r12d,eax,27 + rorx ebp,eax,2 + xor eax,ebx + add esi,r12d + and eax,edi + add edx,DWORD[4+r13] + xor eax,ebx + mov edi,ebp + xor edi,ebx + lea edx,[rax*1+rdx] + rorx r12d,esi,27 + rorx eax,esi,2 + xor esi,ebp + add edx,r12d + and esi,edi + vpalignr ymm8,ymm1,ymm0,8 + vpxor ymm2,ymm2,ymm6 + add ecx,DWORD[8+r13] + xor esi,ebp + vpxor ymm2,ymm2,ymm3 + mov edi,eax + xor edi,ebp + lea ecx,[rsi*1+rcx] + vpxor ymm2,ymm2,ymm8 + rorx r12d,edx,27 + rorx esi,edx,2 + xor edx,eax + vpsrld ymm8,ymm2,30 + vpslld ymm2,ymm2,2 + add ecx,r12d + and edx,edi + add ebx,DWORD[12+r13] + xor 
edx,eax + mov edi,esi + xor edi,eax + vpor ymm2,ymm2,ymm8 + lea ebx,[rdx*1+rbx] + rorx r12d,ecx,27 + rorx edx,ecx,2 + xor ecx,esi + vpaddd ymm9,ymm2,ymm11 + add ebx,r12d + and ecx,edi + add ebp,DWORD[32+r13] + xor ecx,esi + vmovdqu YMMWORD[576+rsp],ymm9 + mov edi,edx + xor edi,esi + lea ebp,[rbp*1+rcx] + rorx r12d,ebx,27 + rorx ecx,ebx,2 + xor ebx,edx + add ebp,r12d + and ebx,edi + add eax,DWORD[36+r13] + xor ebx,edx + mov edi,ecx + xor edi,edx + lea eax,[rbx*1+rax] + rorx r12d,ebp,27 + rorx ebx,ebp,2 + xor ebp,ecx + add eax,r12d + and ebp,edi + add esi,DWORD[40+r13] + xor ebp,ecx + mov edi,ebx + xor edi,ecx + lea esi,[rbp*1+rsi] + rorx r12d,eax,27 + rorx ebp,eax,2 + xor eax,ebx + add esi,r12d + and eax,edi + vpalignr ymm8,ymm2,ymm1,8 + vpxor ymm3,ymm3,ymm7 + add edx,DWORD[44+r13] + xor eax,ebx + vpxor ymm3,ymm3,ymm4 + mov edi,ebp + xor edi,ebx + lea edx,[rax*1+rdx] + vpxor ymm3,ymm3,ymm8 + rorx r12d,esi,27 + rorx eax,esi,2 + xor esi,ebp + vpsrld ymm8,ymm3,30 + vpslld ymm3,ymm3,2 + add edx,r12d + and esi,edi + add ecx,DWORD[64+r13] + xor esi,ebp + mov edi,eax + xor edi,ebp + vpor ymm3,ymm3,ymm8 + lea ecx,[rsi*1+rcx] + rorx r12d,edx,27 + rorx esi,edx,2 + xor edx,eax + vpaddd ymm9,ymm3,ymm11 + add ecx,r12d + and edx,edi + add ebx,DWORD[68+r13] + xor edx,eax + vmovdqu YMMWORD[608+rsp],ymm9 + mov edi,esi + xor edi,eax + lea ebx,[rdx*1+rbx] + rorx r12d,ecx,27 + rorx edx,ecx,2 + xor ecx,esi + add ebx,r12d + and ecx,edi + add ebp,DWORD[72+r13] + xor ecx,esi + mov edi,edx + xor edi,esi + lea ebp,[rbp*1+rcx] + rorx r12d,ebx,27 + rorx ecx,ebx,2 + xor ebx,edx + add ebp,r12d + and ebx,edi + add eax,DWORD[76+r13] + xor ebx,edx + lea eax,[rbx*1+rax] + rorx r12d,ebp,27 + rorx ebx,ebp,2 + xor ebp,ecx + add eax,r12d + xor ebp,edx + add esi,DWORD[96+r13] + lea esi,[rbp*1+rsi] + rorx r12d,eax,27 + rorx ebp,eax,2 + xor eax,ebx + add esi,r12d + xor eax,ecx + add edx,DWORD[100+r13] + lea edx,[rax*1+rdx] + rorx r12d,esi,27 + rorx eax,esi,2 + xor esi,ebp + add edx,r12d + xor esi,ebx + add 
ecx,DWORD[104+r13] + lea ecx,[rsi*1+rcx] + rorx r12d,edx,27 + rorx esi,edx,2 + xor edx,eax + add ecx,r12d + xor edx,ebp + add ebx,DWORD[108+r13] + lea r13,[256+r13] + lea ebx,[rdx*1+rbx] + rorx r12d,ecx,27 + rorx edx,ecx,2 + xor ecx,esi + add ebx,r12d + xor ecx,eax + add ebp,DWORD[((-128))+r13] + lea ebp,[rbp*1+rcx] + rorx r12d,ebx,27 + rorx ecx,ebx,2 + xor ebx,edx + add ebp,r12d + xor ebx,esi + add eax,DWORD[((-124))+r13] + lea eax,[rbx*1+rax] + rorx r12d,ebp,27 + rorx ebx,ebp,2 + xor ebp,ecx + add eax,r12d + xor ebp,edx + add esi,DWORD[((-120))+r13] + lea esi,[rbp*1+rsi] + rorx r12d,eax,27 + rorx ebp,eax,2 + xor eax,ebx + add esi,r12d + xor eax,ecx + add edx,DWORD[((-116))+r13] + lea edx,[rax*1+rdx] + rorx r12d,esi,27 + rorx eax,esi,2 + xor esi,ebp + add edx,r12d + xor esi,ebx + add ecx,DWORD[((-96))+r13] + lea ecx,[rsi*1+rcx] + rorx r12d,edx,27 + rorx esi,edx,2 + xor edx,eax + add ecx,r12d + xor edx,ebp + add ebx,DWORD[((-92))+r13] + lea ebx,[rdx*1+rbx] + rorx r12d,ecx,27 + rorx edx,ecx,2 + xor ecx,esi + add ebx,r12d + xor ecx,eax + add ebp,DWORD[((-88))+r13] + lea ebp,[rbp*1+rcx] + rorx r12d,ebx,27 + rorx ecx,ebx,2 + xor ebx,edx + add ebp,r12d + xor ebx,esi + add eax,DWORD[((-84))+r13] + lea eax,[rbx*1+rax] + rorx r12d,ebp,27 + rorx ebx,ebp,2 + xor ebp,ecx + add eax,r12d + xor ebp,edx + add esi,DWORD[((-64))+r13] + lea esi,[rbp*1+rsi] + rorx r12d,eax,27 + rorx ebp,eax,2 + xor eax,ebx + add esi,r12d + xor eax,ecx + add edx,DWORD[((-60))+r13] + lea edx,[rax*1+rdx] + rorx r12d,esi,27 + rorx eax,esi,2 + xor esi,ebp + add edx,r12d + xor esi,ebx + add ecx,DWORD[((-56))+r13] + lea ecx,[rsi*1+rcx] + rorx r12d,edx,27 + rorx esi,edx,2 + xor edx,eax + add ecx,r12d + xor edx,ebp + add ebx,DWORD[((-52))+r13] + lea ebx,[rdx*1+rbx] + rorx r12d,ecx,27 + rorx edx,ecx,2 + xor ecx,esi + add ebx,r12d + xor ecx,eax + add ebp,DWORD[((-32))+r13] + lea ebp,[rbp*1+rcx] + rorx r12d,ebx,27 + rorx ecx,ebx,2 + xor ebx,edx + add ebp,r12d + xor ebx,esi + add eax,DWORD[((-28))+r13] + lea 
eax,[rbx*1+rax] + rorx r12d,ebp,27 + rorx ebx,ebp,2 + xor ebp,ecx + add eax,r12d + xor ebp,edx + add esi,DWORD[((-24))+r13] + lea esi,[rbp*1+rsi] + rorx r12d,eax,27 + rorx ebp,eax,2 + xor eax,ebx + add esi,r12d + xor eax,ecx + add edx,DWORD[((-20))+r13] + lea edx,[rax*1+rdx] + rorx r12d,esi,27 + add edx,r12d + lea r13,[128+r9] + lea rdi,[128+r9] + cmp r13,r10 + cmovae r13,r9 + + + add edx,DWORD[r8] + add esi,DWORD[4+r8] + add ebp,DWORD[8+r8] + mov DWORD[r8],edx + add ebx,DWORD[12+r8] + mov DWORD[4+r8],esi + mov eax,edx + add ecx,DWORD[16+r8] + mov r12d,ebp + mov DWORD[8+r8],ebp + mov edx,ebx + + mov DWORD[12+r8],ebx + mov ebp,esi + mov DWORD[16+r8],ecx + + mov esi,ecx + mov ecx,r12d + + + cmp r9,r10 + je NEAR $L$done_avx2 + vmovdqu ymm6,YMMWORD[64+r14] + cmp rdi,r10 + ja NEAR $L$ast_avx2 + + vmovdqu xmm0,XMMWORD[((-64))+rdi] + vmovdqu xmm1,XMMWORD[((-48))+rdi] + vmovdqu xmm2,XMMWORD[((-32))+rdi] + vmovdqu xmm3,XMMWORD[((-16))+rdi] + vinserti128 ymm0,ymm0,XMMWORD[r13],1 + vinserti128 ymm1,ymm1,XMMWORD[16+r13],1 + vinserti128 ymm2,ymm2,XMMWORD[32+r13],1 + vinserti128 ymm3,ymm3,XMMWORD[48+r13],1 + jmp NEAR $L$ast_avx2 + +ALIGN 32 +$L$ast_avx2: + lea r13,[((128+16))+rsp] + rorx ebx,ebp,2 + andn edi,ebp,edx + and ebp,ecx + xor ebp,edi + sub r9,-128 + add esi,DWORD[((-128))+r13] + andn edi,eax,ecx + add esi,ebp + rorx r12d,eax,27 + rorx ebp,eax,2 + and eax,ebx + add esi,r12d + xor eax,edi + add edx,DWORD[((-124))+r13] + andn edi,esi,ebx + add edx,eax + rorx r12d,esi,27 + rorx eax,esi,2 + and esi,ebp + add edx,r12d + xor esi,edi + add ecx,DWORD[((-120))+r13] + andn edi,edx,ebp + add ecx,esi + rorx r12d,edx,27 + rorx esi,edx,2 + and edx,eax + add ecx,r12d + xor edx,edi + add ebx,DWORD[((-116))+r13] + andn edi,ecx,eax + add ebx,edx + rorx r12d,ecx,27 + rorx edx,ecx,2 + and ecx,esi + add ebx,r12d + xor ecx,edi + add ebp,DWORD[((-96))+r13] + andn edi,ebx,esi + add ebp,ecx + rorx r12d,ebx,27 + rorx ecx,ebx,2 + and ebx,edx + add ebp,r12d + xor ebx,edi + add 
eax,DWORD[((-92))+r13] + andn edi,ebp,edx + add eax,ebx + rorx r12d,ebp,27 + rorx ebx,ebp,2 + and ebp,ecx + add eax,r12d + xor ebp,edi + add esi,DWORD[((-88))+r13] + andn edi,eax,ecx + add esi,ebp + rorx r12d,eax,27 + rorx ebp,eax,2 + and eax,ebx + add esi,r12d + xor eax,edi + add edx,DWORD[((-84))+r13] + andn edi,esi,ebx + add edx,eax + rorx r12d,esi,27 + rorx eax,esi,2 + and esi,ebp + add edx,r12d + xor esi,edi + add ecx,DWORD[((-64))+r13] + andn edi,edx,ebp + add ecx,esi + rorx r12d,edx,27 + rorx esi,edx,2 + and edx,eax + add ecx,r12d + xor edx,edi + add ebx,DWORD[((-60))+r13] + andn edi,ecx,eax + add ebx,edx + rorx r12d,ecx,27 + rorx edx,ecx,2 + and ecx,esi + add ebx,r12d + xor ecx,edi + add ebp,DWORD[((-56))+r13] + andn edi,ebx,esi + add ebp,ecx + rorx r12d,ebx,27 + rorx ecx,ebx,2 + and ebx,edx + add ebp,r12d + xor ebx,edi + add eax,DWORD[((-52))+r13] + andn edi,ebp,edx + add eax,ebx + rorx r12d,ebp,27 + rorx ebx,ebp,2 + and ebp,ecx + add eax,r12d + xor ebp,edi + add esi,DWORD[((-32))+r13] + andn edi,eax,ecx + add esi,ebp + rorx r12d,eax,27 + rorx ebp,eax,2 + and eax,ebx + add esi,r12d + xor eax,edi + add edx,DWORD[((-28))+r13] + andn edi,esi,ebx + add edx,eax + rorx r12d,esi,27 + rorx eax,esi,2 + and esi,ebp + add edx,r12d + xor esi,edi + add ecx,DWORD[((-24))+r13] + andn edi,edx,ebp + add ecx,esi + rorx r12d,edx,27 + rorx esi,edx,2 + and edx,eax + add ecx,r12d + xor edx,edi + add ebx,DWORD[((-20))+r13] + andn edi,ecx,eax + add ebx,edx + rorx r12d,ecx,27 + rorx edx,ecx,2 + and ecx,esi + add ebx,r12d + xor ecx,edi + add ebp,DWORD[r13] + andn edi,ebx,esi + add ebp,ecx + rorx r12d,ebx,27 + rorx ecx,ebx,2 + and ebx,edx + add ebp,r12d + xor ebx,edi + add eax,DWORD[4+r13] + andn edi,ebp,edx + add eax,ebx + rorx r12d,ebp,27 + rorx ebx,ebp,2 + and ebp,ecx + add eax,r12d + xor ebp,edi + add esi,DWORD[8+r13] + andn edi,eax,ecx + add esi,ebp + rorx r12d,eax,27 + rorx ebp,eax,2 + and eax,ebx + add esi,r12d + xor eax,edi + add edx,DWORD[12+r13] + lea edx,[rax*1+rdx] + 
rorx r12d,esi,27 + rorx eax,esi,2 + xor esi,ebp + add edx,r12d + xor esi,ebx + add ecx,DWORD[32+r13] + lea ecx,[rsi*1+rcx] + rorx r12d,edx,27 + rorx esi,edx,2 + xor edx,eax + add ecx,r12d + xor edx,ebp + add ebx,DWORD[36+r13] + lea ebx,[rdx*1+rbx] + rorx r12d,ecx,27 + rorx edx,ecx,2 + xor ecx,esi + add ebx,r12d + xor ecx,eax + add ebp,DWORD[40+r13] + lea ebp,[rbp*1+rcx] + rorx r12d,ebx,27 + rorx ecx,ebx,2 + xor ebx,edx + add ebp,r12d + xor ebx,esi + add eax,DWORD[44+r13] + lea eax,[rbx*1+rax] + rorx r12d,ebp,27 + rorx ebx,ebp,2 + xor ebp,ecx + add eax,r12d + xor ebp,edx + add esi,DWORD[64+r13] + lea esi,[rbp*1+rsi] + rorx r12d,eax,27 + rorx ebp,eax,2 + xor eax,ebx + add esi,r12d + xor eax,ecx + vmovdqu ymm11,YMMWORD[((-64))+r14] + vpshufb ymm0,ymm0,ymm6 + add edx,DWORD[68+r13] + lea edx,[rax*1+rdx] + rorx r12d,esi,27 + rorx eax,esi,2 + xor esi,ebp + add edx,r12d + xor esi,ebx + add ecx,DWORD[72+r13] + lea ecx,[rsi*1+rcx] + rorx r12d,edx,27 + rorx esi,edx,2 + xor edx,eax + add ecx,r12d + xor edx,ebp + add ebx,DWORD[76+r13] + lea ebx,[rdx*1+rbx] + rorx r12d,ecx,27 + rorx edx,ecx,2 + xor ecx,esi + add ebx,r12d + xor ecx,eax + add ebp,DWORD[96+r13] + lea ebp,[rbp*1+rcx] + rorx r12d,ebx,27 + rorx ecx,ebx,2 + xor ebx,edx + add ebp,r12d + xor ebx,esi + add eax,DWORD[100+r13] + lea eax,[rbx*1+rax] + rorx r12d,ebp,27 + rorx ebx,ebp,2 + xor ebp,ecx + add eax,r12d + xor ebp,edx + vpshufb ymm1,ymm1,ymm6 + vpaddd ymm8,ymm0,ymm11 + add esi,DWORD[104+r13] + lea esi,[rbp*1+rsi] + rorx r12d,eax,27 + rorx ebp,eax,2 + xor eax,ebx + add esi,r12d + xor eax,ecx + add edx,DWORD[108+r13] + lea r13,[256+r13] + lea edx,[rax*1+rdx] + rorx r12d,esi,27 + rorx eax,esi,2 + xor esi,ebp + add edx,r12d + xor esi,ebx + add ecx,DWORD[((-128))+r13] + lea ecx,[rsi*1+rcx] + rorx r12d,edx,27 + rorx esi,edx,2 + xor edx,eax + add ecx,r12d + xor edx,ebp + add ebx,DWORD[((-124))+r13] + lea ebx,[rdx*1+rbx] + rorx r12d,ecx,27 + rorx edx,ecx,2 + xor ecx,esi + add ebx,r12d + xor ecx,eax + add 
ebp,DWORD[((-120))+r13] + lea ebp,[rbp*1+rcx] + rorx r12d,ebx,27 + rorx ecx,ebx,2 + xor ebx,edx + add ebp,r12d + xor ebx,esi + vmovdqu YMMWORD[rsp],ymm8 + vpshufb ymm2,ymm2,ymm6 + vpaddd ymm9,ymm1,ymm11 + add eax,DWORD[((-116))+r13] + lea eax,[rbx*1+rax] + rorx r12d,ebp,27 + rorx ebx,ebp,2 + xor ebp,ecx + add eax,r12d + xor ebp,edx + add esi,DWORD[((-96))+r13] + lea esi,[rbp*1+rsi] + rorx r12d,eax,27 + rorx ebp,eax,2 + xor eax,ebx + add esi,r12d + xor eax,ecx + add edx,DWORD[((-92))+r13] + lea edx,[rax*1+rdx] + rorx r12d,esi,27 + rorx eax,esi,2 + xor esi,ebp + add edx,r12d + xor esi,ebx + add ecx,DWORD[((-88))+r13] + lea ecx,[rsi*1+rcx] + rorx r12d,edx,27 + rorx esi,edx,2 + xor edx,eax + add ecx,r12d + xor edx,ebp + add ebx,DWORD[((-84))+r13] + mov edi,esi + xor edi,eax + lea ebx,[rdx*1+rbx] + rorx r12d,ecx,27 + rorx edx,ecx,2 + xor ecx,esi + add ebx,r12d + and ecx,edi + vmovdqu YMMWORD[32+rsp],ymm9 + vpshufb ymm3,ymm3,ymm6 + vpaddd ymm6,ymm2,ymm11 + add ebp,DWORD[((-64))+r13] + xor ecx,esi + mov edi,edx + xor edi,esi + lea ebp,[rbp*1+rcx] + rorx r12d,ebx,27 + rorx ecx,ebx,2 + xor ebx,edx + add ebp,r12d + and ebx,edi + add eax,DWORD[((-60))+r13] + xor ebx,edx + mov edi,ecx + xor edi,edx + lea eax,[rbx*1+rax] + rorx r12d,ebp,27 + rorx ebx,ebp,2 + xor ebp,ecx + add eax,r12d + and ebp,edi + add esi,DWORD[((-56))+r13] + xor ebp,ecx + mov edi,ebx + xor edi,ecx + lea esi,[rbp*1+rsi] + rorx r12d,eax,27 + rorx ebp,eax,2 + xor eax,ebx + add esi,r12d + and eax,edi + add edx,DWORD[((-52))+r13] + xor eax,ebx + mov edi,ebp + xor edi,ebx + lea edx,[rax*1+rdx] + rorx r12d,esi,27 + rorx eax,esi,2 + xor esi,ebp + add edx,r12d + and esi,edi + add ecx,DWORD[((-32))+r13] + xor esi,ebp + mov edi,eax + xor edi,ebp + lea ecx,[rsi*1+rcx] + rorx r12d,edx,27 + rorx esi,edx,2 + xor edx,eax + add ecx,r12d + and edx,edi + jmp NEAR $L$align32_3 +ALIGN 32 +$L$align32_3: + vmovdqu YMMWORD[64+rsp],ymm6 + vpaddd ymm7,ymm3,ymm11 + add ebx,DWORD[((-28))+r13] + xor edx,eax + mov edi,esi + xor edi,eax 
+ lea ebx,[rdx*1+rbx] + rorx r12d,ecx,27 + rorx edx,ecx,2 + xor ecx,esi + add ebx,r12d + and ecx,edi + add ebp,DWORD[((-24))+r13] + xor ecx,esi + mov edi,edx + xor edi,esi + lea ebp,[rbp*1+rcx] + rorx r12d,ebx,27 + rorx ecx,ebx,2 + xor ebx,edx + add ebp,r12d + and ebx,edi + add eax,DWORD[((-20))+r13] + xor ebx,edx + mov edi,ecx + xor edi,edx + lea eax,[rbx*1+rax] + rorx r12d,ebp,27 + rorx ebx,ebp,2 + xor ebp,ecx + add eax,r12d + and ebp,edi + add esi,DWORD[r13] + xor ebp,ecx + mov edi,ebx + xor edi,ecx + lea esi,[rbp*1+rsi] + rorx r12d,eax,27 + rorx ebp,eax,2 + xor eax,ebx + add esi,r12d + and eax,edi + add edx,DWORD[4+r13] + xor eax,ebx + mov edi,ebp + xor edi,ebx + lea edx,[rax*1+rdx] + rorx r12d,esi,27 + rorx eax,esi,2 + xor esi,ebp + add edx,r12d + and esi,edi + vmovdqu YMMWORD[96+rsp],ymm7 + add ecx,DWORD[8+r13] + xor esi,ebp + mov edi,eax + xor edi,ebp + lea ecx,[rsi*1+rcx] + rorx r12d,edx,27 + rorx esi,edx,2 + xor edx,eax + add ecx,r12d + and edx,edi + add ebx,DWORD[12+r13] + xor edx,eax + mov edi,esi + xor edi,eax + lea ebx,[rdx*1+rbx] + rorx r12d,ecx,27 + rorx edx,ecx,2 + xor ecx,esi + add ebx,r12d + and ecx,edi + add ebp,DWORD[32+r13] + xor ecx,esi + mov edi,edx + xor edi,esi + lea ebp,[rbp*1+rcx] + rorx r12d,ebx,27 + rorx ecx,ebx,2 + xor ebx,edx + add ebp,r12d + and ebx,edi + add eax,DWORD[36+r13] + xor ebx,edx + mov edi,ecx + xor edi,edx + lea eax,[rbx*1+rax] + rorx r12d,ebp,27 + rorx ebx,ebp,2 + xor ebp,ecx + add eax,r12d + and ebp,edi + add esi,DWORD[40+r13] + xor ebp,ecx + mov edi,ebx + xor edi,ecx + lea esi,[rbp*1+rsi] + rorx r12d,eax,27 + rorx ebp,eax,2 + xor eax,ebx + add esi,r12d + and eax,edi + vpalignr ymm4,ymm1,ymm0,8 + add edx,DWORD[44+r13] + xor eax,ebx + mov edi,ebp + xor edi,ebx + vpsrldq ymm8,ymm3,4 + lea edx,[rax*1+rdx] + rorx r12d,esi,27 + rorx eax,esi,2 + vpxor ymm4,ymm4,ymm0 + vpxor ymm8,ymm8,ymm2 + xor esi,ebp + add edx,r12d + vpxor ymm4,ymm4,ymm8 + and esi,edi + add ecx,DWORD[64+r13] + xor esi,ebp + mov edi,eax + vpsrld ymm8,ymm4,31 
+ xor edi,ebp + lea ecx,[rsi*1+rcx] + rorx r12d,edx,27 + vpslldq ymm10,ymm4,12 + vpaddd ymm4,ymm4,ymm4 + rorx esi,edx,2 + xor edx,eax + vpsrld ymm9,ymm10,30 + vpor ymm4,ymm4,ymm8 + add ecx,r12d + and edx,edi + vpslld ymm10,ymm10,2 + vpxor ymm4,ymm4,ymm9 + add ebx,DWORD[68+r13] + xor edx,eax + vpxor ymm4,ymm4,ymm10 + mov edi,esi + xor edi,eax + lea ebx,[rdx*1+rbx] + vpaddd ymm9,ymm4,ymm11 + rorx r12d,ecx,27 + rorx edx,ecx,2 + xor ecx,esi + vmovdqu YMMWORD[128+rsp],ymm9 + add ebx,r12d + and ecx,edi + add ebp,DWORD[72+r13] + xor ecx,esi + mov edi,edx + xor edi,esi + lea ebp,[rbp*1+rcx] + rorx r12d,ebx,27 + rorx ecx,ebx,2 + xor ebx,edx + add ebp,r12d + and ebx,edi + add eax,DWORD[76+r13] + xor ebx,edx + lea eax,[rbx*1+rax] + rorx r12d,ebp,27 + rorx ebx,ebp,2 + xor ebp,ecx + add eax,r12d + xor ebp,edx + vpalignr ymm5,ymm2,ymm1,8 + add esi,DWORD[96+r13] + lea esi,[rbp*1+rsi] + rorx r12d,eax,27 + rorx ebp,eax,2 + vpsrldq ymm8,ymm4,4 + xor eax,ebx + add esi,r12d + xor eax,ecx + vpxor ymm5,ymm5,ymm1 + vpxor ymm8,ymm8,ymm3 + add edx,DWORD[100+r13] + lea edx,[rax*1+rdx] + vpxor ymm5,ymm5,ymm8 + rorx r12d,esi,27 + rorx eax,esi,2 + xor esi,ebp + add edx,r12d + vpsrld ymm8,ymm5,31 + vmovdqu ymm11,YMMWORD[((-32))+r14] + xor esi,ebx + add ecx,DWORD[104+r13] + lea ecx,[rsi*1+rcx] + vpslldq ymm10,ymm5,12 + vpaddd ymm5,ymm5,ymm5 + rorx r12d,edx,27 + rorx esi,edx,2 + vpsrld ymm9,ymm10,30 + vpor ymm5,ymm5,ymm8 + xor edx,eax + add ecx,r12d + vpslld ymm10,ymm10,2 + vpxor ymm5,ymm5,ymm9 + xor edx,ebp + add ebx,DWORD[108+r13] + lea r13,[256+r13] + vpxor ymm5,ymm5,ymm10 + lea ebx,[rdx*1+rbx] + rorx r12d,ecx,27 + rorx edx,ecx,2 + vpaddd ymm9,ymm5,ymm11 + xor ecx,esi + add ebx,r12d + xor ecx,eax + vmovdqu YMMWORD[160+rsp],ymm9 + add ebp,DWORD[((-128))+r13] + lea ebp,[rbp*1+rcx] + rorx r12d,ebx,27 + rorx ecx,ebx,2 + xor ebx,edx + add ebp,r12d + xor ebx,esi + vpalignr ymm6,ymm3,ymm2,8 + add eax,DWORD[((-124))+r13] + lea eax,[rbx*1+rax] + rorx r12d,ebp,27 + rorx ebx,ebp,2 + vpsrldq ymm8,ymm5,4 + 
xor ebp,ecx + add eax,r12d + xor ebp,edx + vpxor ymm6,ymm6,ymm2 + vpxor ymm8,ymm8,ymm4 + add esi,DWORD[((-120))+r13] + lea esi,[rbp*1+rsi] + vpxor ymm6,ymm6,ymm8 + rorx r12d,eax,27 + rorx ebp,eax,2 + xor eax,ebx + add esi,r12d + vpsrld ymm8,ymm6,31 + xor eax,ecx + add edx,DWORD[((-116))+r13] + lea edx,[rax*1+rdx] + vpslldq ymm10,ymm6,12 + vpaddd ymm6,ymm6,ymm6 + rorx r12d,esi,27 + rorx eax,esi,2 + vpsrld ymm9,ymm10,30 + vpor ymm6,ymm6,ymm8 + xor esi,ebp + add edx,r12d + vpslld ymm10,ymm10,2 + vpxor ymm6,ymm6,ymm9 + xor esi,ebx + add ecx,DWORD[((-96))+r13] + vpxor ymm6,ymm6,ymm10 + lea ecx,[rsi*1+rcx] + rorx r12d,edx,27 + rorx esi,edx,2 + vpaddd ymm9,ymm6,ymm11 + xor edx,eax + add ecx,r12d + xor edx,ebp + vmovdqu YMMWORD[192+rsp],ymm9 + add ebx,DWORD[((-92))+r13] + lea ebx,[rdx*1+rbx] + rorx r12d,ecx,27 + rorx edx,ecx,2 + xor ecx,esi + add ebx,r12d + xor ecx,eax + vpalignr ymm7,ymm4,ymm3,8 + add ebp,DWORD[((-88))+r13] + lea ebp,[rbp*1+rcx] + rorx r12d,ebx,27 + rorx ecx,ebx,2 + vpsrldq ymm8,ymm6,4 + xor ebx,edx + add ebp,r12d + xor ebx,esi + vpxor ymm7,ymm7,ymm3 + vpxor ymm8,ymm8,ymm5 + add eax,DWORD[((-84))+r13] + lea eax,[rbx*1+rax] + vpxor ymm7,ymm7,ymm8 + rorx r12d,ebp,27 + rorx ebx,ebp,2 + xor ebp,ecx + add eax,r12d + vpsrld ymm8,ymm7,31 + xor ebp,edx + add esi,DWORD[((-64))+r13] + lea esi,[rbp*1+rsi] + vpslldq ymm10,ymm7,12 + vpaddd ymm7,ymm7,ymm7 + rorx r12d,eax,27 + rorx ebp,eax,2 + vpsrld ymm9,ymm10,30 + vpor ymm7,ymm7,ymm8 + xor eax,ebx + add esi,r12d + vpslld ymm10,ymm10,2 + vpxor ymm7,ymm7,ymm9 + xor eax,ecx + add edx,DWORD[((-60))+r13] + vpxor ymm7,ymm7,ymm10 + lea edx,[rax*1+rdx] + rorx r12d,esi,27 + rorx eax,esi,2 + vpaddd ymm9,ymm7,ymm11 + xor esi,ebp + add edx,r12d + xor esi,ebx + vmovdqu YMMWORD[224+rsp],ymm9 + add ecx,DWORD[((-56))+r13] + lea ecx,[rsi*1+rcx] + rorx r12d,edx,27 + rorx esi,edx,2 + xor edx,eax + add ecx,r12d + xor edx,ebp + add ebx,DWORD[((-52))+r13] + lea ebx,[rdx*1+rbx] + rorx r12d,ecx,27 + rorx edx,ecx,2 + xor ecx,esi + add 
ebx,r12d + xor ecx,eax + add ebp,DWORD[((-32))+r13] + lea ebp,[rbp*1+rcx] + rorx r12d,ebx,27 + rorx ecx,ebx,2 + xor ebx,edx + add ebp,r12d + xor ebx,esi + add eax,DWORD[((-28))+r13] + lea eax,[rbx*1+rax] + rorx r12d,ebp,27 + rorx ebx,ebp,2 + xor ebp,ecx + add eax,r12d + xor ebp,edx + add esi,DWORD[((-24))+r13] + lea esi,[rbp*1+rsi] + rorx r12d,eax,27 + rorx ebp,eax,2 + xor eax,ebx + add esi,r12d + xor eax,ecx + add edx,DWORD[((-20))+r13] + lea edx,[rax*1+rdx] + rorx r12d,esi,27 + add edx,r12d + lea r13,[128+rsp] + + + add edx,DWORD[r8] + add esi,DWORD[4+r8] + add ebp,DWORD[8+r8] + mov DWORD[r8],edx + add ebx,DWORD[12+r8] + mov DWORD[4+r8],esi + mov eax,edx + add ecx,DWORD[16+r8] + mov r12d,ebp + mov DWORD[8+r8],ebp + mov edx,ebx + + mov DWORD[12+r8],ebx + mov ebp,esi + mov DWORD[16+r8],ecx + + mov esi,ecx + mov ecx,r12d + + + cmp r9,r10 + jbe NEAR $L$oop_avx2 + +$L$done_avx2: + vzeroupper + movaps xmm6,XMMWORD[((-40-96))+r11] + movaps xmm7,XMMWORD[((-40-80))+r11] + movaps xmm8,XMMWORD[((-40-64))+r11] + movaps xmm9,XMMWORD[((-40-48))+r11] + movaps xmm10,XMMWORD[((-40-32))+r11] + movaps xmm11,XMMWORD[((-40-16))+r11] + mov r14,QWORD[((-40))+r11] + + mov r13,QWORD[((-32))+r11] + + mov r12,QWORD[((-24))+r11] + + mov rbp,QWORD[((-16))+r11] + + mov rbx,QWORD[((-8))+r11] + + lea rsp,[r11] + +$L$epilogue_avx2: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_sha1_block_data_order_avx2: +section .rdata rdata align=8 +ALIGN 64 +K_XX_XX: + DD 0x5a827999,0x5a827999,0x5a827999,0x5a827999 + DD 0x5a827999,0x5a827999,0x5a827999,0x5a827999 + DD 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 + DD 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 + DD 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc + DD 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc + DD 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 + DD 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 + DD 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f + DD 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f + DB 
0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 + DB 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115 + DB 102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44 + DB 32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60 + DB 97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114 + DB 103,62,0 +ALIGN 64 +section .text + +EXTERN __imp_RtlVirtualUnwind + +ALIGN 16 +se_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + lea r10,[$L$prologue] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[152+r8] + + lea r10,[$L$epilogue] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + mov rax,QWORD[64+rax] + + mov rbx,QWORD[((-8))+rax] + mov rbp,QWORD[((-16))+rax] + mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + mov QWORD[144+r8],rbx + mov QWORD[160+r8],rbp + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + mov QWORD[232+r8],r14 + + jmp NEAR $L$common_seh_tail + + +ALIGN 16 +shaext_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + lea r10,[$L$prologue_shaext] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + lea r10,[$L$epilogue_shaext] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + lea rsi,[((-8-64))+rax] + lea rdi,[512+r8] + mov ecx,8 + DD 0xa548f3fc + + jmp NEAR $L$common_seh_tail + + +ALIGN 16 +ssse3_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[208+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + lea rsi,[((-40-96))+rax] + lea rdi,[512+r8] + mov 
ecx,12 + DD 0xa548f3fc + + mov rbx,QWORD[((-8))+rax] + mov rbp,QWORD[((-16))+rax] + mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + mov QWORD[144+r8],rbx + mov QWORD[160+r8],rbp + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + mov QWORD[232+r8],r14 + +$L$common_seh_tail: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + ret + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_sha1_block_data_order_nohw wrt ..imagebase + DD $L$SEH_end_sha1_block_data_order_nohw wrt ..imagebase + DD $L$SEH_info_sha1_block_data_order_nohw wrt ..imagebase + DD $L$SEH_begin_sha1_block_data_order_hw wrt ..imagebase + DD $L$SEH_end_sha1_block_data_order_hw wrt ..imagebase + DD $L$SEH_info_sha1_block_data_order_hw wrt ..imagebase + DD $L$SEH_begin_sha1_block_data_order_ssse3 wrt ..imagebase + DD $L$SEH_end_sha1_block_data_order_ssse3 wrt ..imagebase + DD $L$SEH_info_sha1_block_data_order_ssse3 wrt ..imagebase + DD $L$SEH_begin_sha1_block_data_order_avx wrt ..imagebase + DD $L$SEH_end_sha1_block_data_order_avx wrt ..imagebase + DD $L$SEH_info_sha1_block_data_order_avx wrt ..imagebase + DD $L$SEH_begin_sha1_block_data_order_avx2 wrt ..imagebase + DD $L$SEH_end_sha1_block_data_order_avx2 wrt ..imagebase + DD $L$SEH_info_sha1_block_data_order_avx2 wrt ..imagebase +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_sha1_block_data_order_nohw: + DB 9,0,0,0 + DD se_handler wrt ..imagebase 
+$L$SEH_info_sha1_block_data_order_hw: + DB 9,0,0,0 + DD shaext_handler wrt ..imagebase +$L$SEH_info_sha1_block_data_order_ssse3: + DB 9,0,0,0 + DD ssse3_handler wrt ..imagebase + DD $L$prologue_ssse3 wrt ..imagebase,$L$epilogue_ssse3 wrt ..imagebase +$L$SEH_info_sha1_block_data_order_avx: + DB 9,0,0,0 + DD ssse3_handler wrt ..imagebase + DD $L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase +$L$SEH_info_sha1_block_data_order_avx2: + DB 9,0,0,0 + DD ssse3_handler wrt ..imagebase + DD $L$prologue_avx2 wrt ..imagebase,$L$epilogue_avx2 wrt ..imagebase +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/third_party/boringssl/gen/bcm/sha256-586-apple.S b/third_party/boringssl/gen/bcm/sha256-586-apple.S new file mode 100644 index 00000000..66107d67 --- /dev/null +++ b/third_party/boringssl/gen/bcm/sha256-586-apple.S @@ -0,0 +1,5593 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +.text +.globl _sha256_block_data_order_nohw +.private_extern _sha256_block_data_order_nohw +.align 4 +_sha256_block_data_order_nohw: +L_sha256_block_data_order_nohw_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl %esp,%ebx + call L000pic_point +L000pic_point: + popl %ebp + leal LK256-L000pic_point(%ebp),%ebp + subl $16,%esp + andl $-64,%esp + shll $6,%eax + addl %edi,%eax + movl %esi,(%esp) + movl %edi,4(%esp) + movl %eax,8(%esp) + movl %ebx,12(%esp) +L001no_xmm: + subl %edi,%eax + cmpl $256,%eax + jae L002unrolled + jmp L003loop +.align 4,0x90 +L003loop: + movl (%edi),%eax + movl 4(%edi),%ebx + movl 8(%edi),%ecx + bswap %eax + movl 12(%edi),%edx + bswap %ebx + pushl %eax + bswap %ecx + pushl %ebx + bswap %edx + pushl %ecx + pushl %edx + movl 16(%edi),%eax + movl 20(%edi),%ebx + movl 24(%edi),%ecx + bswap %eax + movl 28(%edi),%edx + bswap %ebx + pushl %eax + bswap %ecx + pushl %ebx + bswap %edx + pushl %ecx + pushl %edx + movl 32(%edi),%eax + movl 36(%edi),%ebx + movl 40(%edi),%ecx + bswap %eax + movl 44(%edi),%edx + bswap %ebx + pushl %eax + bswap %ecx + pushl %ebx + bswap %edx + pushl %ecx + pushl %edx + movl 48(%edi),%eax + movl 52(%edi),%ebx + movl 56(%edi),%ecx + bswap %eax + movl 60(%edi),%edx + bswap %ebx + pushl %eax + bswap %ecx + pushl %ebx + bswap %edx + pushl %ecx + pushl %edx + addl $64,%edi + leal -36(%esp),%esp + movl %edi,104(%esp) + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edi + movl %ebx,8(%esp) + xorl %ecx,%ebx + movl %ecx,12(%esp) + movl %edi,16(%esp) + movl %ebx,(%esp) + movl 16(%esi),%edx + movl 20(%esi),%ebx + movl 24(%esi),%ecx + movl 28(%esi),%edi + movl %ebx,24(%esp) + movl %ecx,28(%esp) + movl %edi,32(%esp) +.align 4,0x90 +L00400_15: + movl %edx,%ecx + movl 24(%esp),%esi + rorl $14,%ecx + movl 28(%esp),%edi + xorl %edx,%ecx + xorl 
%edi,%esi + movl 96(%esp),%ebx + rorl $5,%ecx + andl %edx,%esi + movl %edx,20(%esp) + xorl %ecx,%edx + addl 32(%esp),%ebx + xorl %edi,%esi + rorl $6,%edx + movl %eax,%ecx + addl %esi,%ebx + rorl $9,%ecx + addl %edx,%ebx + movl 8(%esp),%edi + xorl %eax,%ecx + movl %eax,4(%esp) + leal -4(%esp),%esp + rorl $11,%ecx + movl (%ebp),%esi + xorl %eax,%ecx + movl 20(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %esi,%ebx + movl %eax,(%esp) + addl %ebx,%edx + andl 4(%esp),%eax + addl %ecx,%ebx + xorl %edi,%eax + addl $4,%ebp + addl %ebx,%eax + cmpl $3248222580,%esi + jne L00400_15 + movl 156(%esp),%ecx + jmp L00516_63 +.align 4,0x90 +L00516_63: + movl %ecx,%ebx + movl 104(%esp),%esi + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 160(%esp),%ebx + shrl $10,%edi + addl 124(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 24(%esp),%esi + rorl $14,%ecx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %edx,%ecx + xorl %edi,%esi + movl %ebx,96(%esp) + rorl $5,%ecx + andl %edx,%esi + movl %edx,20(%esp) + xorl %ecx,%edx + addl 32(%esp),%ebx + xorl %edi,%esi + rorl $6,%edx + movl %eax,%ecx + addl %esi,%ebx + rorl $9,%ecx + addl %edx,%ebx + movl 8(%esp),%edi + xorl %eax,%ecx + movl %eax,4(%esp) + leal -4(%esp),%esp + rorl $11,%ecx + movl (%ebp),%esi + xorl %eax,%ecx + movl 20(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %esi,%ebx + movl %eax,(%esp) + addl %ebx,%edx + andl 4(%esp),%eax + addl %ecx,%ebx + xorl %edi,%eax + movl 156(%esp),%ecx + addl $4,%ebp + addl %ebx,%eax + cmpl $3329325298,%esi + jne L00516_63 + movl 356(%esp),%esi + movl 8(%esp),%ebx + movl 16(%esp),%ecx + addl (%esi),%eax + addl 4(%esi),%ebx + addl 8(%esi),%edi + addl 12(%esi),%ecx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %edi,8(%esi) + movl %ecx,12(%esi) + movl 24(%esp),%eax + movl 28(%esp),%ebx + movl 32(%esp),%ecx + movl 360(%esp),%edi + addl 16(%esi),%edx + addl 20(%esi),%eax + addl 24(%esi),%ebx 
+ addl 28(%esi),%ecx + movl %edx,16(%esi) + movl %eax,20(%esi) + movl %ebx,24(%esi) + movl %ecx,28(%esi) + leal 356(%esp),%esp + subl $256,%ebp + cmpl 8(%esp),%edi + jb L003loop + movl 12(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 6,0x90 +LK256: +.long 1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298 +.long 66051,67438087,134810123,202182159 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97 +.byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32 +.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97 +.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 +.byte 62,0 +.align 4,0x90 +L002unrolled: + leal -96(%esp),%esp + movl (%esi),%eax + movl 4(%esi),%ebp + movl 8(%esi),%ecx + movl 12(%esi),%ebx + movl %ebp,4(%esp) + xorl %ecx,%ebp + movl %ecx,8(%esp) + movl %ebx,12(%esp) + movl 16(%esi),%edx + movl 20(%esi),%ebx + movl 24(%esi),%ecx + movl 28(%esi),%esi + movl %ebx,20(%esp) + movl %ecx,24(%esp) + movl %esi,28(%esp) + jmp L006grand_loop +.align 4,0x90 +L006grand_loop: + movl (%edi),%ebx + movl 4(%edi),%ecx + bswap %ebx + movl 8(%edi),%esi + bswap %ecx + movl %ebx,32(%esp) + bswap %esi + movl %ecx,36(%esp) + movl %esi,40(%esp) + movl 12(%edi),%ebx + movl 16(%edi),%ecx + bswap %ebx + movl 20(%edi),%esi + bswap %ecx + movl %ebx,44(%esp) + bswap %esi + movl %ecx,48(%esp) + movl 
%esi,52(%esp) + movl 24(%edi),%ebx + movl 28(%edi),%ecx + bswap %ebx + movl 32(%edi),%esi + bswap %ecx + movl %ebx,56(%esp) + bswap %esi + movl %ecx,60(%esp) + movl %esi,64(%esp) + movl 36(%edi),%ebx + movl 40(%edi),%ecx + bswap %ebx + movl 44(%edi),%esi + bswap %ecx + movl %ebx,68(%esp) + bswap %esi + movl %ecx,72(%esp) + movl %esi,76(%esp) + movl 48(%edi),%ebx + movl 52(%edi),%ecx + bswap %ebx + movl 56(%edi),%esi + bswap %ecx + movl %ebx,80(%esp) + bswap %esi + movl %ecx,84(%esp) + movl %esi,88(%esp) + movl 60(%edi),%ebx + addl $64,%edi + bswap %ebx + movl %edi,100(%esp) + movl %ebx,92(%esp) + movl %edx,%ecx + movl 20(%esp),%esi + rorl $14,%edx + movl 24(%esp),%edi + xorl %ecx,%edx + movl 32(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1116352408(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 16(%esp),%ecx + rorl $14,%edx + movl 20(%esp),%edi + xorl %esi,%edx + movl 36(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1899447441(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 12(%esp),%esi + rorl $14,%edx + movl 16(%esp),%edi + xorl %ecx,%edx + movl 40(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl 
%eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3049323471(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 8(%esp),%ecx + rorl $14,%edx + movl 12(%esp),%edi + xorl %esi,%edx + movl 44(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3921009573(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 4(%esp),%esi + rorl $14,%edx + movl 8(%esp),%edi + xorl %ecx,%edx + movl 48(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 961987163(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl (%esp),%ecx + rorl $14,%edx + movl 4(%esp),%edi + xorl %esi,%edx + movl 52(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1508970993(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + 
movl %edx,%ecx + movl 28(%esp),%esi + rorl $14,%edx + movl (%esp),%edi + xorl %ecx,%edx + movl 56(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2453635748(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 24(%esp),%ecx + rorl $14,%edx + movl 28(%esp),%edi + xorl %esi,%edx + movl 60(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2870763221(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 20(%esp),%esi + rorl $14,%edx + movl 24(%esp),%edi + xorl %ecx,%edx + movl 64(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3624381080(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 16(%esp),%ecx + rorl $14,%edx + movl 20(%esp),%edi + xorl %esi,%edx + movl 68(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl 
%ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 310598401(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 12(%esp),%esi + rorl $14,%edx + movl 16(%esp),%edi + xorl %ecx,%edx + movl 72(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 607225278(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 8(%esp),%ecx + rorl $14,%edx + movl 12(%esp),%edi + xorl %esi,%edx + movl 76(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1426881987(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 4(%esp),%esi + rorl $14,%edx + movl 8(%esp),%edi + xorl %ecx,%edx + movl 80(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1925078388(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl (%esp),%ecx + rorl $14,%edx 
+ movl 4(%esp),%edi + xorl %esi,%edx + movl 84(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2162078206(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 28(%esp),%esi + rorl $14,%edx + movl (%esp),%edi + xorl %ecx,%edx + movl 88(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2614888103(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 24(%esp),%ecx + rorl $14,%edx + movl 28(%esp),%edi + xorl %esi,%edx + movl 92(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3248222580(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 36(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 88(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 32(%esp),%ebx + shrl $10,%edi + addl 68(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 
24(%esp),%edi + xorl %ecx,%edx + movl %ebx,32(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3835390401(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 40(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 92(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 36(%esp),%ebx + shrl $10,%edi + addl 72(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,36(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 4022224774(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 44(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 32(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 40(%esp),%ebx + shrl $10,%edi + addl 76(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,40(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi 
+ xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 264347078(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 48(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 36(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 44(%esp),%ebx + shrl $10,%edi + addl 80(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,44(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 604807628(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 52(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 40(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 48(%esp),%ebx + shrl $10,%edi + addl 84(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,48(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 770255983(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 56(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 44(%esp),%ecx + movl 
%esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 52(%esp),%ebx + shrl $10,%edi + addl 88(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,52(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1249150122(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 60(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 48(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 56(%esp),%ebx + shrl $10,%edi + addl 92(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + movl %ebx,56(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1555081692(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 64(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 52(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 60(%esp),%ebx + shrl $10,%edi + addl 32(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl 
$14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + movl %ebx,60(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1996064986(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 68(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 56(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 64(%esp),%ebx + shrl $10,%edi + addl 36(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,64(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2554220882(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 72(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 60(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 68(%esp),%ebx + shrl $10,%edi + addl 40(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,68(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + 
movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2821834349(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 76(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 64(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 72(%esp),%ebx + shrl $10,%edi + addl 44(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,72(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2952996808(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 80(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 68(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 76(%esp),%ebx + shrl $10,%edi + addl 48(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,76(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3210313671(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 84(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl 
%esi,%eax + movl 72(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 80(%esp),%ebx + shrl $10,%edi + addl 52(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,80(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3336571891(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 88(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 76(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 84(%esp),%ebx + shrl $10,%edi + addl 56(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,84(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3584528711(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 92(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 80(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 88(%esp),%ebx + shrl $10,%edi + addl 60(%esp),%ebx + movl %edx,%ecx + xorl 
%esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + movl %ebx,88(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 113926993(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 32(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 84(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 92(%esp),%ebx + shrl $10,%edi + addl 64(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + movl %ebx,92(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 338241895(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 36(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 88(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 32(%esp),%ebx + shrl $10,%edi + addl 68(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,32(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + 
addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 666307205(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 40(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 92(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 36(%esp),%ebx + shrl $10,%edi + addl 72(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,36(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 773529912(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 44(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 32(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 40(%esp),%ebx + shrl $10,%edi + addl 76(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,40(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1294757372(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 48(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + 
addl 4(%esp),%edx + addl %ecx,%ebp + movl 36(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 44(%esp),%ebx + shrl $10,%edi + addl 80(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,44(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1396182291(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 52(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 40(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 48(%esp),%ebx + shrl $10,%edi + addl 84(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,48(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1695183700(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 56(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 44(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 52(%esp),%ebx + shrl $10,%edi + addl 88(%esp),%ebx + 
movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,52(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1986661051(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 60(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 48(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 56(%esp),%ebx + shrl $10,%edi + addl 92(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + movl %ebx,56(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2177026350(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 64(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 52(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 60(%esp),%ebx + shrl $10,%edi + addl 32(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + movl %ebx,60(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl 
$6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2456956037(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 68(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 56(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 64(%esp),%ebx + shrl $10,%edi + addl 36(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,64(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2730485921(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 72(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 60(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 68(%esp),%ebx + shrl $10,%edi + addl 40(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,68(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2820302411(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 76(%esp),%ecx + 
rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 64(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 72(%esp),%ebx + shrl $10,%edi + addl 44(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,72(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3259730800(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 80(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 68(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 76(%esp),%ebx + shrl $10,%edi + addl 48(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,76(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3345764771(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 84(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 72(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 80(%esp),%ebx + shrl 
$10,%edi + addl 52(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,80(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3516065817(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 88(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 76(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 84(%esp),%ebx + shrl $10,%edi + addl 56(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,84(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3600352804(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 92(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 80(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 88(%esp),%ebx + shrl $10,%edi + addl 60(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + movl %ebx,88(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 
4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 4094571909(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 32(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 84(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 92(%esp),%ebx + shrl $10,%edi + addl 64(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + movl %ebx,92(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 275423344(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 36(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 88(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 32(%esp),%ebx + shrl $10,%edi + addl 68(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,32(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 430227734(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl 
%edi,%ebp + movl 40(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 92(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 36(%esp),%ebx + shrl $10,%edi + addl 72(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,36(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 506948616(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 44(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 32(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 40(%esp),%ebx + shrl $10,%edi + addl 76(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,40(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 659060556(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 48(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 36(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl 
$17,%ecx + addl 44(%esp),%ebx + shrl $10,%edi + addl 80(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,44(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 883997877(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 52(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 40(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 48(%esp),%ebx + shrl $10,%edi + addl 84(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,48(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 958139571(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 56(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 44(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 52(%esp),%ebx + shrl $10,%edi + addl 88(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,52(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) 
+ xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1322822218(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 60(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 48(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 56(%esp),%ebx + shrl $10,%edi + addl 92(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + movl %ebx,56(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1537002063(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 64(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 52(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 60(%esp),%ebx + shrl $10,%edi + addl 32(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + movl %ebx,60(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1747873779(%ebx,%edx,1),%edx 
+ xorl %ecx,%esi + xorl %edi,%eax + movl 68(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 56(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 64(%esp),%ebx + shrl $10,%edi + addl 36(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,64(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1955562222(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 72(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 60(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 68(%esp),%ebx + shrl $10,%edi + addl 40(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,68(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2024104815(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 76(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 64(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + 
xorl %ecx,%ebx + rorl $17,%esi + addl 72(%esp),%ebx + shrl $10,%edi + addl 44(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,72(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2227730452(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 80(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 68(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 76(%esp),%ebx + shrl $10,%edi + addl 48(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,76(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2361852424(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 84(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 72(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 80(%esp),%ebx + shrl $10,%edi + addl 52(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,80(%esp) + xorl %edi,%esi + rorl $5,%edx + andl 
%ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2428436474(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 88(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 76(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 84(%esp),%ebx + shrl $10,%edi + addl 56(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,84(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2756734187(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 92(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 80(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 88(%esp),%ebx + shrl $10,%edi + addl 60(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 
3204031479(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 32(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 84(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 92(%esp),%ebx + shrl $10,%edi + addl 64(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3329325298(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 96(%esp),%esi + xorl %edi,%ebp + movl 12(%esp),%ecx + addl (%esi),%eax + addl 4(%esi),%ebp + addl 8(%esi),%edi + addl 12(%esi),%ecx + movl %eax,(%esi) + movl %ebp,4(%esi) + movl %edi,8(%esi) + movl %ecx,12(%esi) + movl %ebp,4(%esp) + xorl %edi,%ebp + movl %edi,8(%esp) + movl %ecx,12(%esp) + movl 20(%esp),%edi + movl 24(%esp),%ebx + movl 28(%esp),%ecx + addl 16(%esi),%edx + addl 20(%esi),%edi + addl 24(%esi),%ebx + addl 28(%esi),%ecx + movl %edx,16(%esi) + movl %edi,20(%esi) + movl %ebx,24(%esi) + movl %ecx,28(%esi) + movl %edi,20(%esp) + movl 100(%esp),%edi + movl %ebx,24(%esp) + movl %ecx,28(%esp) + cmpl 104(%esp),%edi + jb L006grand_loop + movl 108(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _sha256_block_data_order_ssse3 +.private_extern _sha256_block_data_order_ssse3 +.align 4 +_sha256_block_data_order_ssse3: +L_sha256_block_data_order_ssse3_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl 
%esp,%ebx + call L007pic_point +L007pic_point: + popl %ebp + leal LK256-L007pic_point(%ebp),%ebp + subl $16,%esp + andl $-64,%esp + shll $6,%eax + addl %edi,%eax + movl %esi,(%esp) + movl %edi,4(%esp) + movl %eax,8(%esp) + movl %ebx,12(%esp) + leal -96(%esp),%esp + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edi + movl %ebx,4(%esp) + xorl %ecx,%ebx + movl %ecx,8(%esp) + movl %edi,12(%esp) + movl 16(%esi),%edx + movl 20(%esi),%edi + movl 24(%esi),%ecx + movl 28(%esi),%esi + movl %edi,20(%esp) + movl 100(%esp),%edi + movl %ecx,24(%esp) + movl %esi,28(%esp) + movdqa 256(%ebp),%xmm7 + jmp L008grand_ssse3 +.align 4,0x90 +L008grand_ssse3: + movdqu (%edi),%xmm0 + movdqu 16(%edi),%xmm1 + movdqu 32(%edi),%xmm2 + movdqu 48(%edi),%xmm3 + addl $64,%edi + pshufb %xmm7,%xmm0 + movl %edi,100(%esp) + pshufb %xmm7,%xmm1 + movdqa (%ebp),%xmm4 + pshufb %xmm7,%xmm2 + movdqa 16(%ebp),%xmm5 + paddd %xmm0,%xmm4 + pshufb %xmm7,%xmm3 + movdqa 32(%ebp),%xmm6 + paddd %xmm1,%xmm5 + movdqa 48(%ebp),%xmm7 + movdqa %xmm4,32(%esp) + paddd %xmm2,%xmm6 + movdqa %xmm5,48(%esp) + paddd %xmm3,%xmm7 + movdqa %xmm6,64(%esp) + movdqa %xmm7,80(%esp) + jmp L009ssse3_00_47 +.align 4,0x90 +L009ssse3_00_47: + addl $64,%ebp + movl %edx,%ecx + movdqa %xmm1,%xmm4 + rorl $14,%edx + movl 20(%esp),%esi + movdqa %xmm3,%xmm7 + xorl %ecx,%edx + movl 24(%esp),%edi + palignr $4,%xmm0,%xmm4 + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + palignr $4,%xmm2,%xmm7 + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + movdqa %xmm4,%xmm5 + rorl $6,%edx + movl %eax,%ecx + movdqa %xmm4,%xmm6 + addl %edi,%edx + movl 4(%esp),%edi + psrld $3,%xmm4 + movl %eax,%esi + rorl $9,%ecx + paddd %xmm7,%xmm0 + movl %eax,(%esp) + xorl %eax,%ecx + psrld $7,%xmm6 + xorl %edi,%eax + addl 28(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + pshufd $250,%xmm3,%xmm7 + xorl %esi,%ecx + addl 32(%esp),%edx + pslld $14,%xmm5 + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm4 + addl %edx,%ebx + addl 12(%esp),%edx + psrld 
$11,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm5,%xmm4 + movl 16(%esp),%esi + xorl %ecx,%edx + pslld $11,%xmm5 + movl 20(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + pxor %xmm6,%xmm4 + andl %ecx,%esi + movl %ecx,12(%esp) + movdqa %xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + pxor %xmm5,%xmm4 + movl %ebx,%ecx + addl %edi,%edx + psrld $10,%xmm7 + movl (%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm4,%xmm0 + movl %ebx,28(%esp) + xorl %ebx,%ecx + psrlq $17,%xmm6 + xorl %edi,%ebx + addl 24(%esp),%edx + rorl $11,%ecx + pxor %xmm6,%xmm7 + andl %ebx,%eax + xorl %esi,%ecx + psrlq $2,%xmm6 + addl 36(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%eax + addl 8(%esp),%edx + pshufd $128,%xmm7,%xmm7 + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + psrldq $8,%xmm7 + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + paddd %xmm7,%xmm0 + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,24(%esp) + pshufd $80,%xmm0,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + movdqa %xmm7,%xmm6 + rorl $11,%ecx + psrld $10,%xmm7 + andl %eax,%ebx + psrlq $17,%xmm6 + xorl %esi,%ecx + addl 40(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%ebx + addl 4(%esp),%edx + psrlq $2,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm6,%xmm7 + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + pshufd $8,%xmm7,%xmm7 + xorl %edi,%esi + rorl $5,%edx + movdqa (%ebp),%xmm6 + andl %ecx,%esi + movl %ecx,4(%esp) + pslldq $8,%xmm7 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm7,%xmm0 + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + paddd %xmm0,%xmm6 + rorl $11,%ecx + andl 
%ebx,%eax + xorl %esi,%ecx + addl 44(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movdqa %xmm6,32(%esp) + movl %edx,%ecx + movdqa %xmm2,%xmm4 + rorl $14,%edx + movl 4(%esp),%esi + movdqa %xmm0,%xmm7 + xorl %ecx,%edx + movl 8(%esp),%edi + palignr $4,%xmm1,%xmm4 + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + palignr $4,%xmm3,%xmm7 + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + movdqa %xmm4,%xmm5 + rorl $6,%edx + movl %eax,%ecx + movdqa %xmm4,%xmm6 + addl %edi,%edx + movl 20(%esp),%edi + psrld $3,%xmm4 + movl %eax,%esi + rorl $9,%ecx + paddd %xmm7,%xmm1 + movl %eax,16(%esp) + xorl %eax,%ecx + psrld $7,%xmm6 + xorl %edi,%eax + addl 12(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + pshufd $250,%xmm0,%xmm7 + xorl %esi,%ecx + addl 48(%esp),%edx + pslld $14,%xmm5 + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm4 + addl %edx,%ebx + addl 28(%esp),%edx + psrld $11,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm5,%xmm4 + movl (%esp),%esi + xorl %ecx,%edx + pslld $11,%xmm5 + movl 4(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + pxor %xmm6,%xmm4 + andl %ecx,%esi + movl %ecx,28(%esp) + movdqa %xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + pxor %xmm5,%xmm4 + movl %ebx,%ecx + addl %edi,%edx + psrld $10,%xmm7 + movl 16(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm4,%xmm1 + movl %ebx,12(%esp) + xorl %ebx,%ecx + psrlq $17,%xmm6 + xorl %edi,%ebx + addl 8(%esp),%edx + rorl $11,%ecx + pxor %xmm6,%xmm7 + andl %ebx,%eax + xorl %esi,%ecx + psrlq $2,%xmm6 + addl 52(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%eax + addl 24(%esp),%edx + pshufd $128,%xmm7,%xmm7 + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + psrldq $8,%xmm7 + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + paddd %xmm7,%xmm1 + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + 
movl 12(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,8(%esp) + pshufd $80,%xmm1,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + movdqa %xmm7,%xmm6 + rorl $11,%ecx + psrld $10,%xmm7 + andl %eax,%ebx + psrlq $17,%xmm6 + xorl %esi,%ecx + addl 56(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%ebx + addl 20(%esp),%edx + psrlq $2,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm6,%xmm7 + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + pshufd $8,%xmm7,%xmm7 + xorl %edi,%esi + rorl $5,%edx + movdqa 16(%ebp),%xmm6 + andl %ecx,%esi + movl %ecx,20(%esp) + pslldq $8,%xmm7 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm7,%xmm1 + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + paddd %xmm1,%xmm6 + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 60(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movdqa %xmm6,48(%esp) + movl %edx,%ecx + movdqa %xmm3,%xmm4 + rorl $14,%edx + movl 20(%esp),%esi + movdqa %xmm1,%xmm7 + xorl %ecx,%edx + movl 24(%esp),%edi + palignr $4,%xmm2,%xmm4 + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + palignr $4,%xmm0,%xmm7 + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + movdqa %xmm4,%xmm5 + rorl $6,%edx + movl %eax,%ecx + movdqa %xmm4,%xmm6 + addl %edi,%edx + movl 4(%esp),%edi + psrld $3,%xmm4 + movl %eax,%esi + rorl $9,%ecx + paddd %xmm7,%xmm2 + movl %eax,(%esp) + xorl %eax,%ecx + psrld $7,%xmm6 + xorl %edi,%eax + addl 28(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + pshufd $250,%xmm1,%xmm7 + xorl %esi,%ecx + addl 64(%esp),%edx + pslld $14,%xmm5 + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm4 + addl %edx,%ebx + addl 12(%esp),%edx + psrld $11,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm5,%xmm4 + movl 16(%esp),%esi + xorl %ecx,%edx + pslld $11,%xmm5 + movl 
20(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + pxor %xmm6,%xmm4 + andl %ecx,%esi + movl %ecx,12(%esp) + movdqa %xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + pxor %xmm5,%xmm4 + movl %ebx,%ecx + addl %edi,%edx + psrld $10,%xmm7 + movl (%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm4,%xmm2 + movl %ebx,28(%esp) + xorl %ebx,%ecx + psrlq $17,%xmm6 + xorl %edi,%ebx + addl 24(%esp),%edx + rorl $11,%ecx + pxor %xmm6,%xmm7 + andl %ebx,%eax + xorl %esi,%ecx + psrlq $2,%xmm6 + addl 68(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%eax + addl 8(%esp),%edx + pshufd $128,%xmm7,%xmm7 + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + psrldq $8,%xmm7 + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + paddd %xmm7,%xmm2 + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,24(%esp) + pshufd $80,%xmm2,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + movdqa %xmm7,%xmm6 + rorl $11,%ecx + psrld $10,%xmm7 + andl %eax,%ebx + psrlq $17,%xmm6 + xorl %esi,%ecx + addl 72(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%ebx + addl 4(%esp),%edx + psrlq $2,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm6,%xmm7 + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + pshufd $8,%xmm7,%xmm7 + xorl %edi,%esi + rorl $5,%edx + movdqa 32(%ebp),%xmm6 + andl %ecx,%esi + movl %ecx,4(%esp) + pslldq $8,%xmm7 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm7,%xmm2 + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + paddd %xmm2,%xmm6 + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 76(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movdqa 
%xmm6,64(%esp) + movl %edx,%ecx + movdqa %xmm0,%xmm4 + rorl $14,%edx + movl 4(%esp),%esi + movdqa %xmm2,%xmm7 + xorl %ecx,%edx + movl 8(%esp),%edi + palignr $4,%xmm3,%xmm4 + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + palignr $4,%xmm1,%xmm7 + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + movdqa %xmm4,%xmm5 + rorl $6,%edx + movl %eax,%ecx + movdqa %xmm4,%xmm6 + addl %edi,%edx + movl 20(%esp),%edi + psrld $3,%xmm4 + movl %eax,%esi + rorl $9,%ecx + paddd %xmm7,%xmm3 + movl %eax,16(%esp) + xorl %eax,%ecx + psrld $7,%xmm6 + xorl %edi,%eax + addl 12(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + pshufd $250,%xmm2,%xmm7 + xorl %esi,%ecx + addl 80(%esp),%edx + pslld $14,%xmm5 + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm4 + addl %edx,%ebx + addl 28(%esp),%edx + psrld $11,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm5,%xmm4 + movl (%esp),%esi + xorl %ecx,%edx + pslld $11,%xmm5 + movl 4(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + pxor %xmm6,%xmm4 + andl %ecx,%esi + movl %ecx,28(%esp) + movdqa %xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + pxor %xmm5,%xmm4 + movl %ebx,%ecx + addl %edi,%edx + psrld $10,%xmm7 + movl 16(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm4,%xmm3 + movl %ebx,12(%esp) + xorl %ebx,%ecx + psrlq $17,%xmm6 + xorl %edi,%ebx + addl 8(%esp),%edx + rorl $11,%ecx + pxor %xmm6,%xmm7 + andl %ebx,%eax + xorl %esi,%ecx + psrlq $2,%xmm6 + addl 84(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%eax + addl 24(%esp),%edx + pshufd $128,%xmm7,%xmm7 + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + psrldq $8,%xmm7 + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + paddd %xmm7,%xmm3 + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,8(%esp) + pshufd $80,%xmm3,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 
4(%esp),%edx + movdqa %xmm7,%xmm6 + rorl $11,%ecx + psrld $10,%xmm7 + andl %eax,%ebx + psrlq $17,%xmm6 + xorl %esi,%ecx + addl 88(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%ebx + addl 20(%esp),%edx + psrlq $2,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm6,%xmm7 + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + pshufd $8,%xmm7,%xmm7 + xorl %edi,%esi + rorl $5,%edx + movdqa 48(%ebp),%xmm6 + andl %ecx,%esi + movl %ecx,20(%esp) + pslldq $8,%xmm7 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm7,%xmm3 + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + paddd %xmm3,%xmm6 + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 92(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movdqa %xmm6,80(%esp) + cmpl $66051,64(%ebp) + jne L009ssse3_00_47 + movl %edx,%ecx + rorl $14,%edx + movl 20(%esp),%esi + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 32(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl 16(%esp),%esi + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,28(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 36(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl 
%edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,24(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 40(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 44(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 4(%esp),%esi + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,16(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 48(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl (%esp),%esi + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,12(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 
8(%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 52(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,8(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 56(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 60(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 20(%esp),%esi + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 64(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl 16(%esp),%esi + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl 
%edi,%edx + movl (%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,28(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 68(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,24(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 72(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 76(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 4(%esp),%esi + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,16(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 80(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl (%esp),%esi + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + rorl 
$5,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,12(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 84(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,8(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 88(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 92(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movl 96(%esp),%esi + xorl %edi,%ebx + movl 12(%esp),%ecx + addl (%esi),%eax + addl 4(%esi),%ebx + addl 8(%esi),%edi + addl 12(%esi),%ecx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %edi,8(%esi) + movl %ecx,12(%esi) + movl %ebx,4(%esp) + xorl %edi,%ebx + movl %edi,8(%esp) + movl %ecx,12(%esp) + movl 20(%esp),%edi + movl 24(%esp),%ecx + addl 16(%esi),%edx + addl 20(%esi),%edi + addl 24(%esi),%ecx + movl %edx,16(%esi) + movl %edi,20(%esi) + movl %edi,20(%esp) + movl 28(%esp),%edi + movl %ecx,24(%esi) + addl 28(%esi),%edi 
+ movl %ecx,24(%esp) + movl %edi,28(%esi) + movl %edi,28(%esp) + movl 100(%esp),%edi + movdqa 64(%ebp),%xmm7 + subl $192,%ebp + cmpl 104(%esp),%edi + jb L008grand_ssse3 + movl 108(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _sha256_block_data_order_avx +.private_extern _sha256_block_data_order_avx +.align 4 +_sha256_block_data_order_avx: +L_sha256_block_data_order_avx_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl %esp,%ebx + call L010pic_point +L010pic_point: + popl %ebp + leal LK256-L010pic_point(%ebp),%ebp + subl $16,%esp + andl $-64,%esp + shll $6,%eax + addl %edi,%eax + movl %esi,(%esp) + movl %edi,4(%esp) + movl %eax,8(%esp) + movl %ebx,12(%esp) + leal -96(%esp),%esp + vzeroall + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edi + movl %ebx,4(%esp) + xorl %ecx,%ebx + movl %ecx,8(%esp) + movl %edi,12(%esp) + movl 16(%esi),%edx + movl 20(%esi),%edi + movl 24(%esi),%ecx + movl 28(%esi),%esi + movl %edi,20(%esp) + movl 100(%esp),%edi + movl %ecx,24(%esp) + movl %esi,28(%esp) + vmovdqa 256(%ebp),%xmm7 + jmp L011grand_avx +.align 5,0x90 +L011grand_avx: + vmovdqu (%edi),%xmm0 + vmovdqu 16(%edi),%xmm1 + vmovdqu 32(%edi),%xmm2 + vmovdqu 48(%edi),%xmm3 + addl $64,%edi + vpshufb %xmm7,%xmm0,%xmm0 + movl %edi,100(%esp) + vpshufb %xmm7,%xmm1,%xmm1 + vpshufb %xmm7,%xmm2,%xmm2 + vpaddd (%ebp),%xmm0,%xmm4 + vpshufb %xmm7,%xmm3,%xmm3 + vpaddd 16(%ebp),%xmm1,%xmm5 + vpaddd 32(%ebp),%xmm2,%xmm6 + vpaddd 48(%ebp),%xmm3,%xmm7 + vmovdqa %xmm4,32(%esp) + vmovdqa %xmm5,48(%esp) + vmovdqa %xmm6,64(%esp) + vmovdqa %xmm7,80(%esp) + jmp L012avx_00_47 +.align 4,0x90 +L012avx_00_47: + addl $64,%ebp + vpalignr $4,%xmm0,%xmm1,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + vpalignr $4,%xmm2,%xmm3,%xmm7 + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl 
%ecx,16(%esp) + vpaddd %xmm7,%xmm0,%xmm0 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + vpshufd $250,%xmm3,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 32(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + vpaddd %xmm4,%xmm0,%xmm0 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 36(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + vpaddd %xmm7,%xmm0,%xmm0 + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm0,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + vpxor 
%xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 40(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm0,%xmm0 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + vpaddd (%ebp),%xmm0,%xmm6 + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 44(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,32(%esp) + vpalignr $4,%xmm1,%xmm2,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + vpalignr $4,%xmm3,%xmm0,%xmm7 + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + vpaddd %xmm7,%xmm1,%xmm1 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + vpshufd $250,%xmm0,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 48(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl 
%ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + vpaddd %xmm4,%xmm1,%xmm1 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 52(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + vpaddd %xmm7,%xmm1,%xmm1 + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm1,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 56(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm1,%xmm1 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + vpaddd 16(%ebp),%xmm1,%xmm6 + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 60(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + vmovdqa 
%xmm6,48(%esp) + vpalignr $4,%xmm2,%xmm3,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + vpalignr $4,%xmm0,%xmm1,%xmm7 + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + vpaddd %xmm7,%xmm2,%xmm2 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + vpshufd $250,%xmm1,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 64(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + vpaddd %xmm4,%xmm2,%xmm2 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 68(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + vpaddd %xmm7,%xmm2,%xmm2 + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm2,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl 
$6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 72(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm2,%xmm2 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + vpaddd 32(%ebp),%xmm2,%xmm6 + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 76(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,64(%esp) + vpalignr $4,%xmm3,%xmm0,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + vpalignr $4,%xmm1,%xmm2,%xmm7 + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + vpaddd %xmm7,%xmm3,%xmm3 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + vpshufd $250,%xmm2,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 80(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + vpslld 
$11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + vpaddd %xmm4,%xmm3,%xmm3 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 84(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + vpaddd %xmm7,%xmm3,%xmm3 + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm3,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 88(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm3,%xmm3 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + vpaddd 48(%ebp),%xmm3,%xmm6 + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl 
$9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 92(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,80(%esp) + cmpl $66051,64(%ebp) + jne L012avx_00_47 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 32(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 36(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 40(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 
4(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 44(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 48(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 52(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 
12(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 56(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 60(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 64(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 68(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + movl 
%edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 72(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 76(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 80(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + movl %ebx,%esi + shrdl 
$9,%ecx,%ecx + movl %ebx,12(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 84(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 88(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 92(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movl 96(%esp),%esi + xorl %edi,%ebx + movl 12(%esp),%ecx + addl (%esi),%eax + addl 4(%esi),%ebx + addl 8(%esi),%edi + addl 12(%esi),%ecx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %edi,8(%esi) + movl %ecx,12(%esi) + movl %ebx,4(%esp) + xorl %edi,%ebx + movl %edi,8(%esp) + movl %ecx,12(%esp) + movl 20(%esp),%edi + movl 24(%esp),%ecx + addl 16(%esi),%edx + addl 20(%esi),%edi + addl 24(%esi),%ecx + movl %edx,16(%esi) + movl %edi,20(%esi) + movl %edi,20(%esp) + movl 28(%esp),%edi + movl %ecx,24(%esi) + addl 28(%esi),%edi + movl %ecx,24(%esp) + movl %edi,28(%esi) + movl %edi,28(%esp) + movl 100(%esp),%edi 
+ vmovdqa 64(%ebp),%xmm7 + subl $192,%ebp + cmpl 104(%esp),%edi + jb L011grand_avx + movl 108(%esp),%esp + vzeroall + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) diff --git a/third_party/boringssl/gen/bcm/sha256-586-linux.S b/third_party/boringssl/gen/bcm/sha256-586-linux.S new file mode 100644 index 00000000..d409651b --- /dev/null +++ b/third_party/boringssl/gen/bcm/sha256-586-linux.S @@ -0,0 +1,5599 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) +.text +.globl sha256_block_data_order_nohw +.hidden sha256_block_data_order_nohw +.type sha256_block_data_order_nohw,@function +.align 16 +sha256_block_data_order_nohw: +.L_sha256_block_data_order_nohw_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl %esp,%ebx + call .L000pic_point +.L000pic_point: + popl %ebp + leal .LK256-.L000pic_point(%ebp),%ebp + subl $16,%esp + andl $-64,%esp + shll $6,%eax + addl %edi,%eax + movl %esi,(%esp) + movl %edi,4(%esp) + movl %eax,8(%esp) + movl %ebx,12(%esp) +.L001no_xmm: + subl %edi,%eax + cmpl $256,%eax + jae .L002unrolled + jmp .L003loop +.align 16 +.L003loop: + movl (%edi),%eax + movl 4(%edi),%ebx + movl 8(%edi),%ecx + bswap %eax + movl 12(%edi),%edx + bswap %ebx + pushl %eax + bswap %ecx + pushl %ebx + bswap %edx + pushl %ecx + pushl %edx + movl 16(%edi),%eax + movl 20(%edi),%ebx + movl 24(%edi),%ecx + bswap %eax + movl 28(%edi),%edx + bswap %ebx + pushl %eax + bswap %ecx + pushl %ebx + bswap %edx + pushl %ecx + pushl %edx + movl 32(%edi),%eax + movl 36(%edi),%ebx + movl 40(%edi),%ecx + bswap %eax + movl 44(%edi),%edx + bswap %ebx + pushl %eax + bswap %ecx + pushl %ebx + bswap %edx + pushl %ecx + pushl %edx + movl 48(%edi),%eax + movl 52(%edi),%ebx + movl 
56(%edi),%ecx + bswap %eax + movl 60(%edi),%edx + bswap %ebx + pushl %eax + bswap %ecx + pushl %ebx + bswap %edx + pushl %ecx + pushl %edx + addl $64,%edi + leal -36(%esp),%esp + movl %edi,104(%esp) + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edi + movl %ebx,8(%esp) + xorl %ecx,%ebx + movl %ecx,12(%esp) + movl %edi,16(%esp) + movl %ebx,(%esp) + movl 16(%esi),%edx + movl 20(%esi),%ebx + movl 24(%esi),%ecx + movl 28(%esi),%edi + movl %ebx,24(%esp) + movl %ecx,28(%esp) + movl %edi,32(%esp) +.align 16 +.L00400_15: + movl %edx,%ecx + movl 24(%esp),%esi + rorl $14,%ecx + movl 28(%esp),%edi + xorl %edx,%ecx + xorl %edi,%esi + movl 96(%esp),%ebx + rorl $5,%ecx + andl %edx,%esi + movl %edx,20(%esp) + xorl %ecx,%edx + addl 32(%esp),%ebx + xorl %edi,%esi + rorl $6,%edx + movl %eax,%ecx + addl %esi,%ebx + rorl $9,%ecx + addl %edx,%ebx + movl 8(%esp),%edi + xorl %eax,%ecx + movl %eax,4(%esp) + leal -4(%esp),%esp + rorl $11,%ecx + movl (%ebp),%esi + xorl %eax,%ecx + movl 20(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %esi,%ebx + movl %eax,(%esp) + addl %ebx,%edx + andl 4(%esp),%eax + addl %ecx,%ebx + xorl %edi,%eax + addl $4,%ebp + addl %ebx,%eax + cmpl $3248222580,%esi + jne .L00400_15 + movl 156(%esp),%ecx + jmp .L00516_63 +.align 16 +.L00516_63: + movl %ecx,%ebx + movl 104(%esp),%esi + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 160(%esp),%ebx + shrl $10,%edi + addl 124(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 24(%esp),%esi + rorl $14,%ecx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %edx,%ecx + xorl %edi,%esi + movl %ebx,96(%esp) + rorl $5,%ecx + andl %edx,%esi + movl %edx,20(%esp) + xorl %ecx,%edx + addl 32(%esp),%ebx + xorl %edi,%esi + rorl $6,%edx + movl %eax,%ecx + addl %esi,%ebx + rorl $9,%ecx + addl %edx,%ebx + movl 8(%esp),%edi + xorl %eax,%ecx + movl %eax,4(%esp) + leal -4(%esp),%esp + rorl $11,%ecx + movl 
(%ebp),%esi + xorl %eax,%ecx + movl 20(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %esi,%ebx + movl %eax,(%esp) + addl %ebx,%edx + andl 4(%esp),%eax + addl %ecx,%ebx + xorl %edi,%eax + movl 156(%esp),%ecx + addl $4,%ebp + addl %ebx,%eax + cmpl $3329325298,%esi + jne .L00516_63 + movl 356(%esp),%esi + movl 8(%esp),%ebx + movl 16(%esp),%ecx + addl (%esi),%eax + addl 4(%esi),%ebx + addl 8(%esi),%edi + addl 12(%esi),%ecx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %edi,8(%esi) + movl %ecx,12(%esi) + movl 24(%esp),%eax + movl 28(%esp),%ebx + movl 32(%esp),%ecx + movl 360(%esp),%edi + addl 16(%esi),%edx + addl 20(%esi),%eax + addl 24(%esi),%ebx + addl 28(%esi),%ecx + movl %edx,16(%esi) + movl %eax,20(%esi) + movl %ebx,24(%esi) + movl %ecx,28(%esi) + leal 356(%esp),%esp + subl $256,%ebp + cmpl 8(%esp),%edi + jb .L003loop + movl 12(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 64 +.LK256: +.long 1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298 +.long 66051,67438087,134810123,202182159 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97 +.byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32 +.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97 +.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 +.byte 62,0 +.align 16 +.L002unrolled: + leal -96(%esp),%esp + movl (%esi),%eax + 
movl 4(%esi),%ebp + movl 8(%esi),%ecx + movl 12(%esi),%ebx + movl %ebp,4(%esp) + xorl %ecx,%ebp + movl %ecx,8(%esp) + movl %ebx,12(%esp) + movl 16(%esi),%edx + movl 20(%esi),%ebx + movl 24(%esi),%ecx + movl 28(%esi),%esi + movl %ebx,20(%esp) + movl %ecx,24(%esp) + movl %esi,28(%esp) + jmp .L006grand_loop +.align 16 +.L006grand_loop: + movl (%edi),%ebx + movl 4(%edi),%ecx + bswap %ebx + movl 8(%edi),%esi + bswap %ecx + movl %ebx,32(%esp) + bswap %esi + movl %ecx,36(%esp) + movl %esi,40(%esp) + movl 12(%edi),%ebx + movl 16(%edi),%ecx + bswap %ebx + movl 20(%edi),%esi + bswap %ecx + movl %ebx,44(%esp) + bswap %esi + movl %ecx,48(%esp) + movl %esi,52(%esp) + movl 24(%edi),%ebx + movl 28(%edi),%ecx + bswap %ebx + movl 32(%edi),%esi + bswap %ecx + movl %ebx,56(%esp) + bswap %esi + movl %ecx,60(%esp) + movl %esi,64(%esp) + movl 36(%edi),%ebx + movl 40(%edi),%ecx + bswap %ebx + movl 44(%edi),%esi + bswap %ecx + movl %ebx,68(%esp) + bswap %esi + movl %ecx,72(%esp) + movl %esi,76(%esp) + movl 48(%edi),%ebx + movl 52(%edi),%ecx + bswap %ebx + movl 56(%edi),%esi + bswap %ecx + movl %ebx,80(%esp) + bswap %esi + movl %ecx,84(%esp) + movl %esi,88(%esp) + movl 60(%edi),%ebx + addl $64,%edi + bswap %ebx + movl %edi,100(%esp) + movl %ebx,92(%esp) + movl %edx,%ecx + movl 20(%esp),%esi + rorl $14,%edx + movl 24(%esp),%edi + xorl %ecx,%edx + movl 32(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1116352408(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 16(%esp),%ecx + rorl $14,%edx + movl 20(%esp),%edi + xorl %esi,%edx + movl 36(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + 
xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1899447441(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 12(%esp),%esi + rorl $14,%edx + movl 16(%esp),%edi + xorl %ecx,%edx + movl 40(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3049323471(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 8(%esp),%ecx + rorl $14,%edx + movl 12(%esp),%edi + xorl %esi,%edx + movl 44(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3921009573(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 4(%esp),%esi + rorl $14,%edx + movl 8(%esp),%edi + xorl %ecx,%edx + movl 48(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 961987163(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl 
%edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl (%esp),%ecx + rorl $14,%edx + movl 4(%esp),%edi + xorl %esi,%edx + movl 52(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1508970993(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 28(%esp),%esi + rorl $14,%edx + movl (%esp),%edi + xorl %ecx,%edx + movl 56(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2453635748(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 24(%esp),%ecx + rorl $14,%edx + movl 28(%esp),%edi + xorl %esi,%edx + movl 60(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2870763221(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 20(%esp),%esi + rorl $14,%edx + movl 24(%esp),%edi + xorl %ecx,%edx + movl 64(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl 
%esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3624381080(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 16(%esp),%ecx + rorl $14,%edx + movl 20(%esp),%edi + xorl %esi,%edx + movl 68(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 310598401(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 12(%esp),%esi + rorl $14,%edx + movl 16(%esp),%edi + xorl %ecx,%edx + movl 72(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 607225278(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 8(%esp),%ecx + rorl $14,%edx + movl 12(%esp),%edi + xorl %esi,%edx + movl 76(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1426881987(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + 
addl (%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 4(%esp),%esi + rorl $14,%edx + movl 8(%esp),%edi + xorl %ecx,%edx + movl 80(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1925078388(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl (%esp),%ecx + rorl $14,%edx + movl 4(%esp),%edi + xorl %esi,%edx + movl 84(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2162078206(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl %edx,%ecx + movl 28(%esp),%esi + rorl $14,%edx + movl (%esp),%edi + xorl %ecx,%edx + movl 88(%esp),%ebx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2614888103(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl %edx,%esi + movl 24(%esp),%ecx + rorl $14,%edx + movl 28(%esp),%edi + xorl %esi,%edx + movl 92(%esp),%ebx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + 
addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3248222580(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 36(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 88(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 32(%esp),%ebx + shrl $10,%edi + addl 68(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,32(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3835390401(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 40(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 92(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 36(%esp),%ebx + shrl $10,%edi + addl 72(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,36(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 4022224774(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 44(%esp),%ecx + rorl $2,%esi + addl %edx,%eax 
+ addl 8(%esp),%edx + addl %esi,%eax + movl 32(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 40(%esp),%ebx + shrl $10,%edi + addl 76(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,40(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 264347078(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 48(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 36(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 44(%esp),%ebx + shrl $10,%edi + addl 80(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,44(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 604807628(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 52(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 40(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 48(%esp),%ebx + shrl $10,%edi + addl 84(%esp),%ebx + 
movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,48(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 770255983(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 56(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 44(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 52(%esp),%ebx + shrl $10,%edi + addl 88(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,52(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1249150122(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 60(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 48(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 56(%esp),%ebx + shrl $10,%edi + addl 92(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + movl %ebx,56(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl 
$6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1555081692(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 64(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 52(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 60(%esp),%ebx + shrl $10,%edi + addl 32(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + movl %ebx,60(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1996064986(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 68(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 56(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 64(%esp),%ebx + shrl $10,%edi + addl 36(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,64(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2554220882(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 72(%esp),%esi + 
rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 60(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 68(%esp),%ebx + shrl $10,%edi + addl 40(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,68(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2821834349(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 76(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 64(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 72(%esp),%ebx + shrl $10,%edi + addl 44(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,72(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2952996808(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 80(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 68(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 76(%esp),%ebx + 
shrl $10,%edi + addl 48(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,76(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3210313671(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 84(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 72(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 80(%esp),%ebx + shrl $10,%edi + addl 52(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,80(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3336571891(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 88(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 76(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 84(%esp),%ebx + shrl $10,%edi + addl 56(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,84(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 
8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3584528711(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 92(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 80(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 88(%esp),%ebx + shrl $10,%edi + addl 60(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + movl %ebx,88(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 113926993(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 32(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 84(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 92(%esp),%ebx + shrl $10,%edi + addl 64(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + movl %ebx,92(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 338241895(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl 
%edi,%eax + movl 36(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 88(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 32(%esp),%ebx + shrl $10,%edi + addl 68(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,32(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 666307205(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 40(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 92(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 36(%esp),%ebx + shrl $10,%edi + addl 72(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,36(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 773529912(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 44(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 32(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl 
$17,%esi + addl 40(%esp),%ebx + shrl $10,%edi + addl 76(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,40(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1294757372(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 48(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 36(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 44(%esp),%ebx + shrl $10,%edi + addl 80(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,44(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1396182291(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 52(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 40(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 48(%esp),%ebx + shrl $10,%edi + addl 84(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,48(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl 
%ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1695183700(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 56(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 44(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 52(%esp),%ebx + shrl $10,%edi + addl 88(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,52(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1986661051(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 60(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 48(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 56(%esp),%ebx + shrl $10,%edi + addl 92(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + movl %ebx,56(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 
2177026350(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 64(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 52(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 60(%esp),%ebx + shrl $10,%edi + addl 32(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + movl %ebx,60(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2456956037(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 68(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 56(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 64(%esp),%ebx + shrl $10,%edi + addl 36(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,64(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2730485921(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 72(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 60(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl 
$7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 68(%esp),%ebx + shrl $10,%edi + addl 40(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,68(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2820302411(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 76(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 64(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 72(%esp),%ebx + shrl $10,%edi + addl 44(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,72(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3259730800(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 80(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 68(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 76(%esp),%ebx + shrl $10,%edi + addl 48(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,76(%esp) + xorl 
%edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3345764771(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 84(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 72(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 80(%esp),%ebx + shrl $10,%edi + addl 52(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,80(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3516065817(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 88(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 76(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 84(%esp),%ebx + shrl $10,%edi + addl 56(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,84(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl 
$11,%esi + andl %ebp,%eax + leal 3600352804(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 92(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 80(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 88(%esp),%ebx + shrl $10,%edi + addl 60(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + movl %ebx,88(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 4094571909(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 32(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 84(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 92(%esp),%ebx + shrl $10,%edi + addl 64(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + movl %ebx,92(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 275423344(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 36(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 88(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl 
%ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 32(%esp),%ebx + shrl $10,%edi + addl 68(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,32(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 430227734(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 40(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 92(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 36(%esp),%ebx + shrl $10,%edi + addl 72(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 20(%esp),%edi + xorl %esi,%edx + movl %ebx,36(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 506948616(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 44(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 32(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 40(%esp),%ebx + shrl $10,%edi + addl 76(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + 
movl %ebx,40(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 659060556(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 48(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 36(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 44(%esp),%ebx + shrl $10,%edi + addl 80(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,44(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi + xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 883997877(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 52(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 40(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 48(%esp),%ebx + shrl $10,%edi + addl 84(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,48(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) 
+ xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 958139571(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 56(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 44(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 52(%esp),%ebx + shrl $10,%edi + addl 88(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,52(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1322822218(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 60(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 48(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 56(%esp),%ebx + shrl $10,%edi + addl 92(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + movl %ebx,56(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1537002063(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 64(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 52(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl 
%ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 60(%esp),%ebx + shrl $10,%edi + addl 32(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + movl %ebx,60(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 1747873779(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 68(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 56(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 64(%esp),%ebx + shrl $10,%edi + addl 36(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 20(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 24(%esp),%edi + xorl %ecx,%edx + movl %ebx,64(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + addl 28(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 4(%esp),%edi + xorl %eax,%ecx + movl %eax,(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 1955562222(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 72(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 12(%esp),%edx + addl %ecx,%ebp + movl 60(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 68(%esp),%ebx + shrl $10,%edi + addl 40(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 16(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 
20(%esp),%edi + xorl %esi,%edx + movl %ebx,68(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,12(%esp) + xorl %esi,%edx + addl 24(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl (%esp),%edi + xorl %ebp,%esi + movl %ebp,28(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2024104815(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 76(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 8(%esp),%edx + addl %esi,%eax + movl 64(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 72(%esp),%ebx + shrl $10,%edi + addl 44(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 12(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 16(%esp),%edi + xorl %ecx,%edx + movl %ebx,72(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + addl 20(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 28(%esp),%edi + xorl %eax,%ecx + movl %eax,24(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2227730452(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 80(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 4(%esp),%edx + addl %ecx,%ebp + movl 68(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 76(%esp),%ebx + shrl $10,%edi + addl 48(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 8(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 12(%esp),%edi + xorl %esi,%edx + movl %ebx,76(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,4(%esp) + xorl %esi,%edx + addl 16(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 24(%esp),%edi 
+ xorl %ebp,%esi + movl %ebp,20(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2361852424(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 84(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl (%esp),%edx + addl %esi,%eax + movl 72(%esp),%esi + movl %ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 80(%esp),%ebx + shrl $10,%edi + addl 52(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 4(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl 8(%esp),%edi + xorl %ecx,%edx + movl %ebx,80(%esp) + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + addl 12(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 20(%esp),%edi + xorl %eax,%ecx + movl %eax,16(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 2428436474(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 88(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 28(%esp),%edx + addl %ecx,%ebp + movl 76(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 84(%esp),%ebx + shrl $10,%edi + addl 56(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl (%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 4(%esp),%edi + xorl %esi,%edx + movl %ebx,84(%esp) + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,28(%esp) + xorl %esi,%edx + addl 8(%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 16(%esp),%edi + xorl %ebp,%esi + movl %ebp,12(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 2756734187(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + movl 92(%esp),%ecx + rorl $2,%esi + addl %edx,%eax + addl 24(%esp),%edx + addl %esi,%eax + movl 80(%esp),%esi + movl 
%ecx,%ebx + rorl $11,%ecx + movl %esi,%edi + rorl $2,%esi + xorl %ebx,%ecx + shrl $3,%ebx + rorl $7,%ecx + xorl %edi,%esi + xorl %ecx,%ebx + rorl $17,%esi + addl 88(%esp),%ebx + shrl $10,%edi + addl 60(%esp),%ebx + movl %edx,%ecx + xorl %esi,%edi + movl 28(%esp),%esi + rorl $14,%edx + addl %edi,%ebx + movl (%esp),%edi + xorl %ecx,%edx + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + addl 4(%esp),%ebx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%ebx + rorl $9,%ecx + movl %eax,%esi + movl 12(%esp),%edi + xorl %eax,%ecx + movl %eax,8(%esp) + xorl %edi,%eax + rorl $11,%ecx + andl %eax,%ebp + leal 3204031479(%ebx,%edx,1),%edx + xorl %esi,%ecx + xorl %edi,%ebp + movl 32(%esp),%esi + rorl $2,%ecx + addl %edx,%ebp + addl 20(%esp),%edx + addl %ecx,%ebp + movl 84(%esp),%ecx + movl %esi,%ebx + rorl $11,%esi + movl %ecx,%edi + rorl $2,%ecx + xorl %ebx,%esi + shrl $3,%ebx + rorl $7,%esi + xorl %edi,%ecx + xorl %esi,%ebx + rorl $17,%ecx + addl 92(%esp),%ebx + shrl $10,%edi + addl 64(%esp),%ebx + movl %edx,%esi + xorl %ecx,%edi + movl 24(%esp),%ecx + rorl $14,%edx + addl %edi,%ebx + movl 28(%esp),%edi + xorl %esi,%edx + xorl %edi,%ecx + rorl $5,%edx + andl %esi,%ecx + movl %esi,20(%esp) + xorl %esi,%edx + addl (%esp),%ebx + xorl %ecx,%edi + rorl $6,%edx + movl %ebp,%esi + addl %edi,%ebx + rorl $9,%esi + movl %ebp,%ecx + movl 8(%esp),%edi + xorl %ebp,%esi + movl %ebp,4(%esp) + xorl %edi,%ebp + rorl $11,%esi + andl %ebp,%eax + leal 3329325298(%ebx,%edx,1),%edx + xorl %ecx,%esi + xorl %edi,%eax + rorl $2,%esi + addl %edx,%eax + addl 16(%esp),%edx + addl %esi,%eax + movl 96(%esp),%esi + xorl %edi,%ebp + movl 12(%esp),%ecx + addl (%esi),%eax + addl 4(%esi),%ebp + addl 8(%esi),%edi + addl 12(%esi),%ecx + movl %eax,(%esi) + movl %ebp,4(%esi) + movl %edi,8(%esi) + movl %ecx,12(%esi) + movl %ebp,4(%esp) + xorl %edi,%ebp + movl %edi,8(%esp) + movl %ecx,12(%esp) + movl 20(%esp),%edi + movl 24(%esp),%ebx + movl 28(%esp),%ecx + addl 
16(%esi),%edx + addl 20(%esi),%edi + addl 24(%esi),%ebx + addl 28(%esi),%ecx + movl %edx,16(%esi) + movl %edi,20(%esi) + movl %ebx,24(%esi) + movl %ecx,28(%esi) + movl %edi,20(%esp) + movl 100(%esp),%edi + movl %ebx,24(%esp) + movl %ecx,28(%esp) + cmpl 104(%esp),%edi + jb .L006grand_loop + movl 108(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size sha256_block_data_order_nohw,.-.L_sha256_block_data_order_nohw_begin +.globl sha256_block_data_order_ssse3 +.hidden sha256_block_data_order_ssse3 +.type sha256_block_data_order_ssse3,@function +.align 16 +sha256_block_data_order_ssse3: +.L_sha256_block_data_order_ssse3_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl %esp,%ebx + call .L007pic_point +.L007pic_point: + popl %ebp + leal .LK256-.L007pic_point(%ebp),%ebp + subl $16,%esp + andl $-64,%esp + shll $6,%eax + addl %edi,%eax + movl %esi,(%esp) + movl %edi,4(%esp) + movl %eax,8(%esp) + movl %ebx,12(%esp) + leal -96(%esp),%esp + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edi + movl %ebx,4(%esp) + xorl %ecx,%ebx + movl %ecx,8(%esp) + movl %edi,12(%esp) + movl 16(%esi),%edx + movl 20(%esi),%edi + movl 24(%esi),%ecx + movl 28(%esi),%esi + movl %edi,20(%esp) + movl 100(%esp),%edi + movl %ecx,24(%esp) + movl %esi,28(%esp) + movdqa 256(%ebp),%xmm7 + jmp .L008grand_ssse3 +.align 16 +.L008grand_ssse3: + movdqu (%edi),%xmm0 + movdqu 16(%edi),%xmm1 + movdqu 32(%edi),%xmm2 + movdqu 48(%edi),%xmm3 + addl $64,%edi + pshufb %xmm7,%xmm0 + movl %edi,100(%esp) + pshufb %xmm7,%xmm1 + movdqa (%ebp),%xmm4 + pshufb %xmm7,%xmm2 + movdqa 16(%ebp),%xmm5 + paddd %xmm0,%xmm4 + pshufb %xmm7,%xmm3 + movdqa 32(%ebp),%xmm6 + paddd %xmm1,%xmm5 + movdqa 48(%ebp),%xmm7 + movdqa %xmm4,32(%esp) + paddd %xmm2,%xmm6 + movdqa %xmm5,48(%esp) + paddd %xmm3,%xmm7 + movdqa %xmm6,64(%esp) + movdqa %xmm7,80(%esp) + jmp .L009ssse3_00_47 +.align 16 +.L009ssse3_00_47: + addl $64,%ebp 
+ movl %edx,%ecx + movdqa %xmm1,%xmm4 + rorl $14,%edx + movl 20(%esp),%esi + movdqa %xmm3,%xmm7 + xorl %ecx,%edx + movl 24(%esp),%edi + palignr $4,%xmm0,%xmm4 + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + palignr $4,%xmm2,%xmm7 + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + movdqa %xmm4,%xmm5 + rorl $6,%edx + movl %eax,%ecx + movdqa %xmm4,%xmm6 + addl %edi,%edx + movl 4(%esp),%edi + psrld $3,%xmm4 + movl %eax,%esi + rorl $9,%ecx + paddd %xmm7,%xmm0 + movl %eax,(%esp) + xorl %eax,%ecx + psrld $7,%xmm6 + xorl %edi,%eax + addl 28(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + pshufd $250,%xmm3,%xmm7 + xorl %esi,%ecx + addl 32(%esp),%edx + pslld $14,%xmm5 + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm4 + addl %edx,%ebx + addl 12(%esp),%edx + psrld $11,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm5,%xmm4 + movl 16(%esp),%esi + xorl %ecx,%edx + pslld $11,%xmm5 + movl 20(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + pxor %xmm6,%xmm4 + andl %ecx,%esi + movl %ecx,12(%esp) + movdqa %xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + pxor %xmm5,%xmm4 + movl %ebx,%ecx + addl %edi,%edx + psrld $10,%xmm7 + movl (%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm4,%xmm0 + movl %ebx,28(%esp) + xorl %ebx,%ecx + psrlq $17,%xmm6 + xorl %edi,%ebx + addl 24(%esp),%edx + rorl $11,%ecx + pxor %xmm6,%xmm7 + andl %ebx,%eax + xorl %esi,%ecx + psrlq $2,%xmm6 + addl 36(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%eax + addl 8(%esp),%edx + pshufd $128,%xmm7,%xmm7 + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + psrldq $8,%xmm7 + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + paddd %xmm7,%xmm0 + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,24(%esp) + pshufd $80,%xmm0,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + 
movdqa %xmm7,%xmm6 + rorl $11,%ecx + psrld $10,%xmm7 + andl %eax,%ebx + psrlq $17,%xmm6 + xorl %esi,%ecx + addl 40(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%ebx + addl 4(%esp),%edx + psrlq $2,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm6,%xmm7 + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + pshufd $8,%xmm7,%xmm7 + xorl %edi,%esi + rorl $5,%edx + movdqa (%ebp),%xmm6 + andl %ecx,%esi + movl %ecx,4(%esp) + pslldq $8,%xmm7 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm7,%xmm0 + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + paddd %xmm0,%xmm6 + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 44(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movdqa %xmm6,32(%esp) + movl %edx,%ecx + movdqa %xmm2,%xmm4 + rorl $14,%edx + movl 4(%esp),%esi + movdqa %xmm0,%xmm7 + xorl %ecx,%edx + movl 8(%esp),%edi + palignr $4,%xmm1,%xmm4 + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + palignr $4,%xmm3,%xmm7 + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + movdqa %xmm4,%xmm5 + rorl $6,%edx + movl %eax,%ecx + movdqa %xmm4,%xmm6 + addl %edi,%edx + movl 20(%esp),%edi + psrld $3,%xmm4 + movl %eax,%esi + rorl $9,%ecx + paddd %xmm7,%xmm1 + movl %eax,16(%esp) + xorl %eax,%ecx + psrld $7,%xmm6 + xorl %edi,%eax + addl 12(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + pshufd $250,%xmm0,%xmm7 + xorl %esi,%ecx + addl 48(%esp),%edx + pslld $14,%xmm5 + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm4 + addl %edx,%ebx + addl 28(%esp),%edx + psrld $11,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm5,%xmm4 + movl (%esp),%esi + xorl %ecx,%edx + pslld $11,%xmm5 + movl 4(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + pxor %xmm6,%xmm4 + andl %ecx,%esi + movl %ecx,28(%esp) + movdqa %xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + rorl 
$6,%edx + pxor %xmm5,%xmm4 + movl %ebx,%ecx + addl %edi,%edx + psrld $10,%xmm7 + movl 16(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm4,%xmm1 + movl %ebx,12(%esp) + xorl %ebx,%ecx + psrlq $17,%xmm6 + xorl %edi,%ebx + addl 8(%esp),%edx + rorl $11,%ecx + pxor %xmm6,%xmm7 + andl %ebx,%eax + xorl %esi,%ecx + psrlq $2,%xmm6 + addl 52(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%eax + addl 24(%esp),%edx + pshufd $128,%xmm7,%xmm7 + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + psrldq $8,%xmm7 + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + paddd %xmm7,%xmm1 + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,8(%esp) + pshufd $80,%xmm1,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + movdqa %xmm7,%xmm6 + rorl $11,%ecx + psrld $10,%xmm7 + andl %eax,%ebx + psrlq $17,%xmm6 + xorl %esi,%ecx + addl 56(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%ebx + addl 20(%esp),%edx + psrlq $2,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm6,%xmm7 + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + pshufd $8,%xmm7,%xmm7 + xorl %edi,%esi + rorl $5,%edx + movdqa 16(%ebp),%xmm6 + andl %ecx,%esi + movl %ecx,20(%esp) + pslldq $8,%xmm7 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm7,%xmm1 + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + paddd %xmm1,%xmm6 + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 60(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movdqa %xmm6,48(%esp) + movl %edx,%ecx + movdqa %xmm3,%xmm4 + rorl $14,%edx + movl 20(%esp),%esi + movdqa %xmm1,%xmm7 + xorl %ecx,%edx + movl 24(%esp),%edi + palignr 
$4,%xmm2,%xmm4 + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + palignr $4,%xmm0,%xmm7 + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + movdqa %xmm4,%xmm5 + rorl $6,%edx + movl %eax,%ecx + movdqa %xmm4,%xmm6 + addl %edi,%edx + movl 4(%esp),%edi + psrld $3,%xmm4 + movl %eax,%esi + rorl $9,%ecx + paddd %xmm7,%xmm2 + movl %eax,(%esp) + xorl %eax,%ecx + psrld $7,%xmm6 + xorl %edi,%eax + addl 28(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + pshufd $250,%xmm1,%xmm7 + xorl %esi,%ecx + addl 64(%esp),%edx + pslld $14,%xmm5 + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm4 + addl %edx,%ebx + addl 12(%esp),%edx + psrld $11,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm5,%xmm4 + movl 16(%esp),%esi + xorl %ecx,%edx + pslld $11,%xmm5 + movl 20(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + pxor %xmm6,%xmm4 + andl %ecx,%esi + movl %ecx,12(%esp) + movdqa %xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + pxor %xmm5,%xmm4 + movl %ebx,%ecx + addl %edi,%edx + psrld $10,%xmm7 + movl (%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm4,%xmm2 + movl %ebx,28(%esp) + xorl %ebx,%ecx + psrlq $17,%xmm6 + xorl %edi,%ebx + addl 24(%esp),%edx + rorl $11,%ecx + pxor %xmm6,%xmm7 + andl %ebx,%eax + xorl %esi,%ecx + psrlq $2,%xmm6 + addl 68(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%eax + addl 8(%esp),%edx + pshufd $128,%xmm7,%xmm7 + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + psrldq $8,%xmm7 + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + paddd %xmm7,%xmm2 + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,24(%esp) + pshufd $80,%xmm2,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + movdqa %xmm7,%xmm6 + rorl $11,%ecx + psrld $10,%xmm7 + andl %eax,%ebx + psrlq $17,%xmm6 + xorl %esi,%ecx + addl 72(%esp),%edx + xorl %edi,%ebx + 
rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%ebx + addl 4(%esp),%edx + psrlq $2,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm6,%xmm7 + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + pshufd $8,%xmm7,%xmm7 + xorl %edi,%esi + rorl $5,%edx + movdqa 32(%ebp),%xmm6 + andl %ecx,%esi + movl %ecx,4(%esp) + pslldq $8,%xmm7 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm7,%xmm2 + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + paddd %xmm2,%xmm6 + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 76(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movdqa %xmm6,64(%esp) + movl %edx,%ecx + movdqa %xmm0,%xmm4 + rorl $14,%edx + movl 4(%esp),%esi + movdqa %xmm2,%xmm7 + xorl %ecx,%edx + movl 8(%esp),%edi + palignr $4,%xmm3,%xmm4 + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + palignr $4,%xmm1,%xmm7 + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + movdqa %xmm4,%xmm5 + rorl $6,%edx + movl %eax,%ecx + movdqa %xmm4,%xmm6 + addl %edi,%edx + movl 20(%esp),%edi + psrld $3,%xmm4 + movl %eax,%esi + rorl $9,%ecx + paddd %xmm7,%xmm3 + movl %eax,16(%esp) + xorl %eax,%ecx + psrld $7,%xmm6 + xorl %edi,%eax + addl 12(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + pshufd $250,%xmm2,%xmm7 + xorl %esi,%ecx + addl 80(%esp),%edx + pslld $14,%xmm5 + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm4 + addl %edx,%ebx + addl 28(%esp),%edx + psrld $11,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm5,%xmm4 + movl (%esp),%esi + xorl %ecx,%edx + pslld $11,%xmm5 + movl 4(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + pxor %xmm6,%xmm4 + andl %ecx,%esi + movl %ecx,28(%esp) + movdqa %xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + pxor %xmm5,%xmm4 + movl %ebx,%ecx + addl %edi,%edx + psrld $10,%xmm7 + movl 16(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd 
%xmm4,%xmm3 + movl %ebx,12(%esp) + xorl %ebx,%ecx + psrlq $17,%xmm6 + xorl %edi,%ebx + addl 8(%esp),%edx + rorl $11,%ecx + pxor %xmm6,%xmm7 + andl %ebx,%eax + xorl %esi,%ecx + psrlq $2,%xmm6 + addl 84(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%eax + addl 24(%esp),%edx + pshufd $128,%xmm7,%xmm7 + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + psrldq $8,%xmm7 + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + paddd %xmm7,%xmm3 + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,8(%esp) + pshufd $80,%xmm3,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + movdqa %xmm7,%xmm6 + rorl $11,%ecx + psrld $10,%xmm7 + andl %eax,%ebx + psrlq $17,%xmm6 + xorl %esi,%ecx + addl 88(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + pxor %xmm6,%xmm7 + addl %edx,%ebx + addl 20(%esp),%edx + psrlq $2,%xmm6 + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + pxor %xmm6,%xmm7 + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + pshufd $8,%xmm7,%xmm7 + xorl %edi,%esi + rorl $5,%edx + movdqa 48(%ebp),%xmm6 + andl %ecx,%esi + movl %ecx,20(%esp) + pslldq $8,%xmm7 + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + paddd %xmm7,%xmm3 + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + paddd %xmm3,%xmm6 + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 92(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movdqa %xmm6,80(%esp) + cmpl $66051,64(%ebp) + jne .L009ssse3_00_47 + movl %edx,%ecx + rorl $14,%edx + movl 20(%esp),%esi + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + 
movl 4(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 32(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl 16(%esp),%esi + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,28(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 36(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,24(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 40(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 44(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 4(%esp),%esi + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl 
%ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,16(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 48(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl (%esp),%esi + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,12(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 52(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,8(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 56(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 60(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx 
+ rorl $14,%edx + movl 20(%esp),%esi + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 64(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl 16(%esp),%esi + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,28(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 68(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,24(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 72(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 
76(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 4(%esp),%esi + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,16(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 80(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl (%esp),%esi + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,12(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 84(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + rorl $14,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + rorl $9,%ecx + movl %eax,8(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + rorl $11,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 88(%esp),%edx + xorl %edi,%ebx + rorl $2,%ecx + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + rorl $14,%edx + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + rorl $5,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + rorl $6,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + rorl $9,%ecx + movl %ebx,4(%esp) 
+ xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + rorl $11,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 92(%esp),%edx + xorl %edi,%eax + rorl $2,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movl 96(%esp),%esi + xorl %edi,%ebx + movl 12(%esp),%ecx + addl (%esi),%eax + addl 4(%esi),%ebx + addl 8(%esi),%edi + addl 12(%esi),%ecx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %edi,8(%esi) + movl %ecx,12(%esi) + movl %ebx,4(%esp) + xorl %edi,%ebx + movl %edi,8(%esp) + movl %ecx,12(%esp) + movl 20(%esp),%edi + movl 24(%esp),%ecx + addl 16(%esi),%edx + addl 20(%esi),%edi + addl 24(%esi),%ecx + movl %edx,16(%esi) + movl %edi,20(%esi) + movl %edi,20(%esp) + movl 28(%esp),%edi + movl %ecx,24(%esi) + addl 28(%esi),%edi + movl %ecx,24(%esp) + movl %edi,28(%esi) + movl %edi,28(%esp) + movl 100(%esp),%edi + movdqa 64(%ebp),%xmm7 + subl $192,%ebp + cmpl 104(%esp),%edi + jb .L008grand_ssse3 + movl 108(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size sha256_block_data_order_ssse3,.-.L_sha256_block_data_order_ssse3_begin +.globl sha256_block_data_order_avx +.hidden sha256_block_data_order_avx +.type sha256_block_data_order_avx,@function +.align 16 +sha256_block_data_order_avx: +.L_sha256_block_data_order_avx_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl %esp,%ebx + call .L010pic_point +.L010pic_point: + popl %ebp + leal .LK256-.L010pic_point(%ebp),%ebp + subl $16,%esp + andl $-64,%esp + shll $6,%eax + addl %edi,%eax + movl %esi,(%esp) + movl %edi,4(%esp) + movl %eax,8(%esp) + movl %ebx,12(%esp) + leal -96(%esp),%esp + vzeroall + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edi + movl %ebx,4(%esp) + xorl %ecx,%ebx + movl %ecx,8(%esp) + movl %edi,12(%esp) + movl 16(%esi),%edx + movl 20(%esi),%edi + movl 24(%esi),%ecx + movl 28(%esi),%esi + movl %edi,20(%esp) + movl 100(%esp),%edi + movl %ecx,24(%esp) + movl %esi,28(%esp) + vmovdqa 
256(%ebp),%xmm7 + jmp .L011grand_avx +.align 32 +.L011grand_avx: + vmovdqu (%edi),%xmm0 + vmovdqu 16(%edi),%xmm1 + vmovdqu 32(%edi),%xmm2 + vmovdqu 48(%edi),%xmm3 + addl $64,%edi + vpshufb %xmm7,%xmm0,%xmm0 + movl %edi,100(%esp) + vpshufb %xmm7,%xmm1,%xmm1 + vpshufb %xmm7,%xmm2,%xmm2 + vpaddd (%ebp),%xmm0,%xmm4 + vpshufb %xmm7,%xmm3,%xmm3 + vpaddd 16(%ebp),%xmm1,%xmm5 + vpaddd 32(%ebp),%xmm2,%xmm6 + vpaddd 48(%ebp),%xmm3,%xmm7 + vmovdqa %xmm4,32(%esp) + vmovdqa %xmm5,48(%esp) + vmovdqa %xmm6,64(%esp) + vmovdqa %xmm7,80(%esp) + jmp .L012avx_00_47 +.align 16 +.L012avx_00_47: + addl $64,%ebp + vpalignr $4,%xmm0,%xmm1,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + vpalignr $4,%xmm2,%xmm3,%xmm7 + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + vpaddd %xmm7,%xmm0,%xmm0 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + vpshufd $250,%xmm3,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 32(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + vpaddd %xmm4,%xmm0,%xmm0 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 
24(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 36(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + vpaddd %xmm7,%xmm0,%xmm0 + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm0,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 40(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm0,%xmm0 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + vpaddd (%ebp),%xmm0,%xmm6 + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 44(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,32(%esp) + vpalignr $4,%xmm1,%xmm2,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + vpalignr $4,%xmm3,%xmm0,%xmm7 + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + vpaddd 
%xmm7,%xmm1,%xmm1 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + vpshufd $250,%xmm0,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 48(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + vpaddd %xmm4,%xmm1,%xmm1 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 52(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + vpaddd %xmm7,%xmm1,%xmm1 + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm1,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl 
$11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 56(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm1,%xmm1 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + vpaddd 16(%ebp),%xmm1,%xmm6 + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 60(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,48(%esp) + vpalignr $4,%xmm2,%xmm3,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + vpalignr $4,%xmm0,%xmm1,%xmm7 + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + vpaddd %xmm7,%xmm2,%xmm2 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + vpshufd $250,%xmm1,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 64(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl 
%esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + vpaddd %xmm4,%xmm2,%xmm2 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 68(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + vpaddd %xmm7,%xmm2,%xmm2 + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm2,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 72(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm2,%xmm2 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + vpaddd 32(%ebp),%xmm2,%xmm6 + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 76(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,64(%esp) + 
vpalignr $4,%xmm3,%xmm0,%xmm4 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + vpalignr $4,%xmm1,%xmm2,%xmm7 + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + vpaddd %xmm7,%xmm3,%xmm3 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrld $3,%xmm4,%xmm7 + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + vpslld $14,%xmm4,%xmm5 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + vpshufd $250,%xmm2,%xmm7 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpsrld $11,%xmm6,%xmm6 + addl 80(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpxor %xmm5,%xmm4,%xmm4 + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + vpslld $11,%xmm5,%xmm5 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + vpxor %xmm6,%xmm4,%xmm4 + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + vpsrld $10,%xmm7,%xmm6 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + vpxor %xmm5,%xmm4,%xmm4 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq $17,%xmm7,%xmm5 + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + vpaddd %xmm4,%xmm3,%xmm3 + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + vpxor %xmm5,%xmm6,%xmm6 + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + vpsrlq $19,%xmm7,%xmm7 + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + vpxor %xmm7,%xmm6,%xmm6 + addl 84(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + vpshufd $132,%xmm6,%xmm7 + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + vpsrldq $8,%xmm7,%xmm7 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + vpaddd %xmm7,%xmm3,%xmm3 + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + vpshufd $80,%xmm3,%xmm7 + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + vpsrld $10,%xmm7,%xmm6 + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + vpsrlq 
$17,%xmm7,%xmm5 + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + vpxor %xmm5,%xmm6,%xmm6 + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + vpsrlq $19,%xmm7,%xmm7 + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + vpxor %xmm7,%xmm6,%xmm6 + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + vpshufd $232,%xmm6,%xmm7 + addl 88(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + vpslldq $8,%xmm7,%xmm7 + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + vpaddd %xmm7,%xmm3,%xmm3 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + vpaddd 48(%ebp),%xmm3,%xmm6 + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 92(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + vmovdqa %xmm6,80(%esp) + cmpl $66051,64(%ebp) + jne .L012avx_00_47 + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 32(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx 
+ movl (%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 36(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 40(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 44(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 4(%esp),%esi + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 48(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + 
movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 52(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 56(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 60(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 20(%esp),%esi + xorl %ecx,%edx + movl 24(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,16(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 4(%esp),%edi + movl %eax,%esi + shrdl 
$9,%ecx,%ecx + movl %eax,(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 28(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 64(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 12(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 16(%esp),%esi + xorl %ecx,%edx + movl 20(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,12(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl (%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,28(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 24(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 68(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 8(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 12(%esp),%esi + xorl %ecx,%edx + movl 16(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,8(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 28(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,24(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 20(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 72(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 4(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 8(%esp),%esi + xorl %ecx,%edx + movl 12(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,4(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 24(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,20(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 16(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 76(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl (%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 
4(%esp),%esi + xorl %ecx,%edx + movl 8(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 20(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,16(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 12(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 80(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 28(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl (%esp),%esi + xorl %ecx,%edx + movl 4(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,28(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 16(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,12(%esp) + xorl %ebx,%ecx + xorl %edi,%ebx + addl 8(%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 84(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 24(%esp),%edx + addl %ecx,%eax + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 28(%esp),%esi + xorl %ecx,%edx + movl (%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,24(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %eax,%ecx + addl %edi,%edx + movl 12(%esp),%edi + movl %eax,%esi + shrdl $9,%ecx,%ecx + movl %eax,8(%esp) + xorl %eax,%ecx + xorl %edi,%eax + addl 4(%esp),%edx + shrdl $11,%ecx,%ecx + andl %eax,%ebx + xorl %esi,%ecx + addl 88(%esp),%edx + xorl %edi,%ebx + shrdl $2,%ecx,%ecx + addl %edx,%ebx + addl 20(%esp),%edx + addl %ecx,%ebx + movl %edx,%ecx + shrdl $14,%edx,%edx + movl 24(%esp),%esi + xorl %ecx,%edx + movl 28(%esp),%edi + xorl %edi,%esi + shrdl $5,%edx,%edx + andl %ecx,%esi + movl %ecx,20(%esp) + xorl %ecx,%edx + xorl %esi,%edi + shrdl $6,%edx,%edx + movl %ebx,%ecx + addl %edi,%edx + movl 8(%esp),%edi + movl %ebx,%esi + shrdl $9,%ecx,%ecx + movl %ebx,4(%esp) + xorl 
%ebx,%ecx + xorl %edi,%ebx + addl (%esp),%edx + shrdl $11,%ecx,%ecx + andl %ebx,%eax + xorl %esi,%ecx + addl 92(%esp),%edx + xorl %edi,%eax + shrdl $2,%ecx,%ecx + addl %edx,%eax + addl 16(%esp),%edx + addl %ecx,%eax + movl 96(%esp),%esi + xorl %edi,%ebx + movl 12(%esp),%ecx + addl (%esi),%eax + addl 4(%esi),%ebx + addl 8(%esi),%edi + addl 12(%esi),%ecx + movl %eax,(%esi) + movl %ebx,4(%esi) + movl %edi,8(%esi) + movl %ecx,12(%esi) + movl %ebx,4(%esp) + xorl %edi,%ebx + movl %edi,8(%esp) + movl %ecx,12(%esp) + movl 20(%esp),%edi + movl 24(%esp),%ecx + addl 16(%esi),%edx + addl 20(%esi),%edi + addl 24(%esi),%ecx + movl %edx,16(%esi) + movl %edi,20(%esi) + movl %edi,20(%esp) + movl 28(%esp),%edi + movl %ecx,24(%esi) + addl 28(%esi),%edi + movl %ecx,24(%esp) + movl %edi,28(%esi) + movl %edi,28(%esp) + movl 100(%esp),%edi + vmovdqa 64(%ebp),%xmm7 + subl $192,%ebp + cmpl 104(%esp),%edi + jb .L011grand_avx + movl 108(%esp),%esp + vzeroall + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size sha256_block_data_order_avx,.-.L_sha256_block_data_order_avx_begin +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) diff --git a/third_party/boringssl/gen/bcm/sha256-586-win.asm b/third_party/boringssl/gen/bcm/sha256-586-win.asm new file mode 100644 index 00000000..b6fed17b --- /dev/null +++ b/third_party/boringssl/gen/bcm/sha256-586-win.asm @@ -0,0 +1,5601 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_internal_x86_win_asm.inc" +%endif +%ifidn __OUTPUT_FORMAT__, win32 +%ifidn __OUTPUT_FORMAT__,obj +section code use32 class=code align=64 +%elifidn __OUTPUT_FORMAT__,win32 +$@feat.00 equ 1 +section .text code align=64 +%else +section .text code +%endif +global _sha256_block_data_order_nohw +align 16 +_sha256_block_data_order_nohw: +L$_sha256_block_data_order_nohw_begin: + push ebp + push ebx + push esi + push edi + mov esi,DWORD [20+esp] + mov edi,DWORD [24+esp] + mov eax,DWORD [28+esp] + mov ebx,esp + call L$000pic_point +L$000pic_point: + pop ebp + lea ebp,[(L$K256-L$000pic_point)+ebp] + sub esp,16 + and esp,-64 + shl eax,6 + add eax,edi + mov DWORD [esp],esi + mov DWORD [4+esp],edi + mov DWORD [8+esp],eax + mov DWORD [12+esp],ebx +L$001no_xmm: + sub eax,edi + cmp eax,256 + jae NEAR L$002unrolled + jmp NEAR L$003loop +align 16 +L$003loop: + mov eax,DWORD [edi] + mov ebx,DWORD [4+edi] + mov ecx,DWORD [8+edi] + bswap eax + mov edx,DWORD [12+edi] + bswap ebx + push eax + bswap ecx + push ebx + bswap edx + push ecx + push edx + mov eax,DWORD [16+edi] + mov ebx,DWORD [20+edi] + mov ecx,DWORD [24+edi] + bswap eax + mov edx,DWORD [28+edi] + bswap ebx + push eax + bswap ecx + push ebx + bswap edx + push ecx + push edx + mov eax,DWORD [32+edi] + mov ebx,DWORD [36+edi] + mov ecx,DWORD [40+edi] + bswap eax + mov edx,DWORD [44+edi] + bswap ebx + push eax + bswap ecx + push ebx + bswap edx + push ecx + push edx + mov eax,DWORD [48+edi] + mov ebx,DWORD [52+edi] + mov ecx,DWORD [56+edi] + bswap eax + mov edx,DWORD [60+edi] + bswap ebx + push eax + bswap ecx + push ebx + bswap edx + push ecx + push edx + add edi,64 + lea esp,[esp-36] + mov DWORD [104+esp],edi + mov eax,DWORD [esi] + mov ebx,DWORD [4+esi] + mov ecx,DWORD [8+esi] + mov edi,DWORD [12+esi] + mov DWORD [8+esp],ebx + xor ebx,ecx + mov DWORD [12+esp],ecx + mov DWORD [16+esp],edi + mov DWORD [esp],ebx + mov edx,DWORD [16+esi] + mov ebx,DWORD [20+esi] + mov 
ecx,DWORD [24+esi] + mov edi,DWORD [28+esi] + mov DWORD [24+esp],ebx + mov DWORD [28+esp],ecx + mov DWORD [32+esp],edi +align 16 +L$00400_15: + mov ecx,edx + mov esi,DWORD [24+esp] + ror ecx,14 + mov edi,DWORD [28+esp] + xor ecx,edx + xor esi,edi + mov ebx,DWORD [96+esp] + ror ecx,5 + and esi,edx + mov DWORD [20+esp],edx + xor edx,ecx + add ebx,DWORD [32+esp] + xor esi,edi + ror edx,6 + mov ecx,eax + add ebx,esi + ror ecx,9 + add ebx,edx + mov edi,DWORD [8+esp] + xor ecx,eax + mov DWORD [4+esp],eax + lea esp,[esp-4] + ror ecx,11 + mov esi,DWORD [ebp] + xor ecx,eax + mov edx,DWORD [20+esp] + xor eax,edi + ror ecx,2 + add ebx,esi + mov DWORD [esp],eax + add edx,ebx + and eax,DWORD [4+esp] + add ebx,ecx + xor eax,edi + add ebp,4 + add eax,ebx + cmp esi,3248222580 + jne NEAR L$00400_15 + mov ecx,DWORD [156+esp] + jmp NEAR L$00516_63 +align 16 +L$00516_63: + mov ebx,ecx + mov esi,DWORD [104+esp] + ror ecx,11 + mov edi,esi + ror esi,2 + xor ecx,ebx + shr ebx,3 + ror ecx,7 + xor esi,edi + xor ebx,ecx + ror esi,17 + add ebx,DWORD [160+esp] + shr edi,10 + add ebx,DWORD [124+esp] + mov ecx,edx + xor edi,esi + mov esi,DWORD [24+esp] + ror ecx,14 + add ebx,edi + mov edi,DWORD [28+esp] + xor ecx,edx + xor esi,edi + mov DWORD [96+esp],ebx + ror ecx,5 + and esi,edx + mov DWORD [20+esp],edx + xor edx,ecx + add ebx,DWORD [32+esp] + xor esi,edi + ror edx,6 + mov ecx,eax + add ebx,esi + ror ecx,9 + add ebx,edx + mov edi,DWORD [8+esp] + xor ecx,eax + mov DWORD [4+esp],eax + lea esp,[esp-4] + ror ecx,11 + mov esi,DWORD [ebp] + xor ecx,eax + mov edx,DWORD [20+esp] + xor eax,edi + ror ecx,2 + add ebx,esi + mov DWORD [esp],eax + add edx,ebx + and eax,DWORD [4+esp] + add ebx,ecx + xor eax,edi + mov ecx,DWORD [156+esp] + add ebp,4 + add eax,ebx + cmp esi,3329325298 + jne NEAR L$00516_63 + mov esi,DWORD [356+esp] + mov ebx,DWORD [8+esp] + mov ecx,DWORD [16+esp] + add eax,DWORD [esi] + add ebx,DWORD [4+esi] + add edi,DWORD [8+esi] + add ecx,DWORD [12+esi] + mov DWORD [esi],eax + mov DWORD 
[4+esi],ebx + mov DWORD [8+esi],edi + mov DWORD [12+esi],ecx + mov eax,DWORD [24+esp] + mov ebx,DWORD [28+esp] + mov ecx,DWORD [32+esp] + mov edi,DWORD [360+esp] + add edx,DWORD [16+esi] + add eax,DWORD [20+esi] + add ebx,DWORD [24+esi] + add ecx,DWORD [28+esi] + mov DWORD [16+esi],edx + mov DWORD [20+esi],eax + mov DWORD [24+esi],ebx + mov DWORD [28+esi],ecx + lea esp,[356+esp] + sub ebp,256 + cmp edi,DWORD [8+esp] + jb NEAR L$003loop + mov esp,DWORD [12+esp] + pop edi + pop esi + pop ebx + pop ebp + ret +align 64 +L$K256: +dd 1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298 +dd 66051,67438087,134810123,202182159 +db 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97 +db 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32 +db 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97 +db 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 +db 62,0 +align 16 +L$002unrolled: + lea esp,[esp-96] + mov eax,DWORD [esi] + mov ebp,DWORD [4+esi] + mov ecx,DWORD [8+esi] + mov ebx,DWORD [12+esi] + mov DWORD [4+esp],ebp + xor ebp,ecx + mov DWORD [8+esp],ecx + mov DWORD [12+esp],ebx + mov edx,DWORD [16+esi] + mov ebx,DWORD [20+esi] + mov ecx,DWORD [24+esi] + mov esi,DWORD [28+esi] + mov DWORD [20+esp],ebx + mov DWORD [24+esp],ecx + mov DWORD [28+esp],esi + jmp NEAR L$006grand_loop +align 16 +L$006grand_loop: + mov ebx,DWORD [edi] + mov 
ecx,DWORD [4+edi] + bswap ebx + mov esi,DWORD [8+edi] + bswap ecx + mov DWORD [32+esp],ebx + bswap esi + mov DWORD [36+esp],ecx + mov DWORD [40+esp],esi + mov ebx,DWORD [12+edi] + mov ecx,DWORD [16+edi] + bswap ebx + mov esi,DWORD [20+edi] + bswap ecx + mov DWORD [44+esp],ebx + bswap esi + mov DWORD [48+esp],ecx + mov DWORD [52+esp],esi + mov ebx,DWORD [24+edi] + mov ecx,DWORD [28+edi] + bswap ebx + mov esi,DWORD [32+edi] + bswap ecx + mov DWORD [56+esp],ebx + bswap esi + mov DWORD [60+esp],ecx + mov DWORD [64+esp],esi + mov ebx,DWORD [36+edi] + mov ecx,DWORD [40+edi] + bswap ebx + mov esi,DWORD [44+edi] + bswap ecx + mov DWORD [68+esp],ebx + bswap esi + mov DWORD [72+esp],ecx + mov DWORD [76+esp],esi + mov ebx,DWORD [48+edi] + mov ecx,DWORD [52+edi] + bswap ebx + mov esi,DWORD [56+edi] + bswap ecx + mov DWORD [80+esp],ebx + bswap esi + mov DWORD [84+esp],ecx + mov DWORD [88+esp],esi + mov ebx,DWORD [60+edi] + add edi,64 + bswap ebx + mov DWORD [100+esp],edi + mov DWORD [92+esp],ebx + mov ecx,edx + mov esi,DWORD [20+esp] + ror edx,14 + mov edi,DWORD [24+esp] + xor edx,ecx + mov ebx,DWORD [32+esp] + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [16+esp],ecx + xor edx,ecx + add ebx,DWORD [28+esp] + xor edi,esi + ror edx,6 + mov ecx,eax + add ebx,edi + ror ecx,9 + mov esi,eax + mov edi,DWORD [4+esp] + xor ecx,eax + mov DWORD [esp],eax + xor eax,edi + ror ecx,11 + and ebp,eax + lea edx,[1116352408+edx*1+ebx] + xor ecx,esi + xor ebp,edi + ror ecx,2 + add ebp,edx + add edx,DWORD [12+esp] + add ebp,ecx + mov esi,edx + mov ecx,DWORD [16+esp] + ror edx,14 + mov edi,DWORD [20+esp] + xor edx,esi + mov ebx,DWORD [36+esp] + xor ecx,edi + ror edx,5 + and ecx,esi + mov DWORD [12+esp],esi + xor edx,esi + add ebx,DWORD [24+esp] + xor edi,ecx + ror edx,6 + mov esi,ebp + add ebx,edi + ror esi,9 + mov ecx,ebp + mov edi,DWORD [esp] + xor esi,ebp + mov DWORD [28+esp],ebp + xor ebp,edi + ror esi,11 + and eax,ebp + lea edx,[1899447441+edx*1+ebx] + xor esi,ecx + xor eax,edi + ror esi,2 
+ add eax,edx + add edx,DWORD [8+esp] + add eax,esi + mov ecx,edx + mov esi,DWORD [12+esp] + ror edx,14 + mov edi,DWORD [16+esp] + xor edx,ecx + mov ebx,DWORD [40+esp] + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [8+esp],ecx + xor edx,ecx + add ebx,DWORD [20+esp] + xor edi,esi + ror edx,6 + mov ecx,eax + add ebx,edi + ror ecx,9 + mov esi,eax + mov edi,DWORD [28+esp] + xor ecx,eax + mov DWORD [24+esp],eax + xor eax,edi + ror ecx,11 + and ebp,eax + lea edx,[3049323471+edx*1+ebx] + xor ecx,esi + xor ebp,edi + ror ecx,2 + add ebp,edx + add edx,DWORD [4+esp] + add ebp,ecx + mov esi,edx + mov ecx,DWORD [8+esp] + ror edx,14 + mov edi,DWORD [12+esp] + xor edx,esi + mov ebx,DWORD [44+esp] + xor ecx,edi + ror edx,5 + and ecx,esi + mov DWORD [4+esp],esi + xor edx,esi + add ebx,DWORD [16+esp] + xor edi,ecx + ror edx,6 + mov esi,ebp + add ebx,edi + ror esi,9 + mov ecx,ebp + mov edi,DWORD [24+esp] + xor esi,ebp + mov DWORD [20+esp],ebp + xor ebp,edi + ror esi,11 + and eax,ebp + lea edx,[3921009573+edx*1+ebx] + xor esi,ecx + xor eax,edi + ror esi,2 + add eax,edx + add edx,DWORD [esp] + add eax,esi + mov ecx,edx + mov esi,DWORD [4+esp] + ror edx,14 + mov edi,DWORD [8+esp] + xor edx,ecx + mov ebx,DWORD [48+esp] + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [esp],ecx + xor edx,ecx + add ebx,DWORD [12+esp] + xor edi,esi + ror edx,6 + mov ecx,eax + add ebx,edi + ror ecx,9 + mov esi,eax + mov edi,DWORD [20+esp] + xor ecx,eax + mov DWORD [16+esp],eax + xor eax,edi + ror ecx,11 + and ebp,eax + lea edx,[961987163+edx*1+ebx] + xor ecx,esi + xor ebp,edi + ror ecx,2 + add ebp,edx + add edx,DWORD [28+esp] + add ebp,ecx + mov esi,edx + mov ecx,DWORD [esp] + ror edx,14 + mov edi,DWORD [4+esp] + xor edx,esi + mov ebx,DWORD [52+esp] + xor ecx,edi + ror edx,5 + and ecx,esi + mov DWORD [28+esp],esi + xor edx,esi + add ebx,DWORD [8+esp] + xor edi,ecx + ror edx,6 + mov esi,ebp + add ebx,edi + ror esi,9 + mov ecx,ebp + mov edi,DWORD [16+esp] + xor esi,ebp + mov DWORD [12+esp],ebp + xor 
ebp,edi + ror esi,11 + and eax,ebp + lea edx,[1508970993+edx*1+ebx] + xor esi,ecx + xor eax,edi + ror esi,2 + add eax,edx + add edx,DWORD [24+esp] + add eax,esi + mov ecx,edx + mov esi,DWORD [28+esp] + ror edx,14 + mov edi,DWORD [esp] + xor edx,ecx + mov ebx,DWORD [56+esp] + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [24+esp],ecx + xor edx,ecx + add ebx,DWORD [4+esp] + xor edi,esi + ror edx,6 + mov ecx,eax + add ebx,edi + ror ecx,9 + mov esi,eax + mov edi,DWORD [12+esp] + xor ecx,eax + mov DWORD [8+esp],eax + xor eax,edi + ror ecx,11 + and ebp,eax + lea edx,[2453635748+edx*1+ebx] + xor ecx,esi + xor ebp,edi + ror ecx,2 + add ebp,edx + add edx,DWORD [20+esp] + add ebp,ecx + mov esi,edx + mov ecx,DWORD [24+esp] + ror edx,14 + mov edi,DWORD [28+esp] + xor edx,esi + mov ebx,DWORD [60+esp] + xor ecx,edi + ror edx,5 + and ecx,esi + mov DWORD [20+esp],esi + xor edx,esi + add ebx,DWORD [esp] + xor edi,ecx + ror edx,6 + mov esi,ebp + add ebx,edi + ror esi,9 + mov ecx,ebp + mov edi,DWORD [8+esp] + xor esi,ebp + mov DWORD [4+esp],ebp + xor ebp,edi + ror esi,11 + and eax,ebp + lea edx,[2870763221+edx*1+ebx] + xor esi,ecx + xor eax,edi + ror esi,2 + add eax,edx + add edx,DWORD [16+esp] + add eax,esi + mov ecx,edx + mov esi,DWORD [20+esp] + ror edx,14 + mov edi,DWORD [24+esp] + xor edx,ecx + mov ebx,DWORD [64+esp] + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [16+esp],ecx + xor edx,ecx + add ebx,DWORD [28+esp] + xor edi,esi + ror edx,6 + mov ecx,eax + add ebx,edi + ror ecx,9 + mov esi,eax + mov edi,DWORD [4+esp] + xor ecx,eax + mov DWORD [esp],eax + xor eax,edi + ror ecx,11 + and ebp,eax + lea edx,[3624381080+edx*1+ebx] + xor ecx,esi + xor ebp,edi + ror ecx,2 + add ebp,edx + add edx,DWORD [12+esp] + add ebp,ecx + mov esi,edx + mov ecx,DWORD [16+esp] + ror edx,14 + mov edi,DWORD [20+esp] + xor edx,esi + mov ebx,DWORD [68+esp] + xor ecx,edi + ror edx,5 + and ecx,esi + mov DWORD [12+esp],esi + xor edx,esi + add ebx,DWORD [24+esp] + xor edi,ecx + ror edx,6 + mov esi,ebp 
+ add ebx,edi + ror esi,9 + mov ecx,ebp + mov edi,DWORD [esp] + xor esi,ebp + mov DWORD [28+esp],ebp + xor ebp,edi + ror esi,11 + and eax,ebp + lea edx,[310598401+edx*1+ebx] + xor esi,ecx + xor eax,edi + ror esi,2 + add eax,edx + add edx,DWORD [8+esp] + add eax,esi + mov ecx,edx + mov esi,DWORD [12+esp] + ror edx,14 + mov edi,DWORD [16+esp] + xor edx,ecx + mov ebx,DWORD [72+esp] + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [8+esp],ecx + xor edx,ecx + add ebx,DWORD [20+esp] + xor edi,esi + ror edx,6 + mov ecx,eax + add ebx,edi + ror ecx,9 + mov esi,eax + mov edi,DWORD [28+esp] + xor ecx,eax + mov DWORD [24+esp],eax + xor eax,edi + ror ecx,11 + and ebp,eax + lea edx,[607225278+edx*1+ebx] + xor ecx,esi + xor ebp,edi + ror ecx,2 + add ebp,edx + add edx,DWORD [4+esp] + add ebp,ecx + mov esi,edx + mov ecx,DWORD [8+esp] + ror edx,14 + mov edi,DWORD [12+esp] + xor edx,esi + mov ebx,DWORD [76+esp] + xor ecx,edi + ror edx,5 + and ecx,esi + mov DWORD [4+esp],esi + xor edx,esi + add ebx,DWORD [16+esp] + xor edi,ecx + ror edx,6 + mov esi,ebp + add ebx,edi + ror esi,9 + mov ecx,ebp + mov edi,DWORD [24+esp] + xor esi,ebp + mov DWORD [20+esp],ebp + xor ebp,edi + ror esi,11 + and eax,ebp + lea edx,[1426881987+edx*1+ebx] + xor esi,ecx + xor eax,edi + ror esi,2 + add eax,edx + add edx,DWORD [esp] + add eax,esi + mov ecx,edx + mov esi,DWORD [4+esp] + ror edx,14 + mov edi,DWORD [8+esp] + xor edx,ecx + mov ebx,DWORD [80+esp] + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [esp],ecx + xor edx,ecx + add ebx,DWORD [12+esp] + xor edi,esi + ror edx,6 + mov ecx,eax + add ebx,edi + ror ecx,9 + mov esi,eax + mov edi,DWORD [20+esp] + xor ecx,eax + mov DWORD [16+esp],eax + xor eax,edi + ror ecx,11 + and ebp,eax + lea edx,[1925078388+edx*1+ebx] + xor ecx,esi + xor ebp,edi + ror ecx,2 + add ebp,edx + add edx,DWORD [28+esp] + add ebp,ecx + mov esi,edx + mov ecx,DWORD [esp] + ror edx,14 + mov edi,DWORD [4+esp] + xor edx,esi + mov ebx,DWORD [84+esp] + xor ecx,edi + ror edx,5 + and ecx,esi + 
mov DWORD [28+esp],esi + xor edx,esi + add ebx,DWORD [8+esp] + xor edi,ecx + ror edx,6 + mov esi,ebp + add ebx,edi + ror esi,9 + mov ecx,ebp + mov edi,DWORD [16+esp] + xor esi,ebp + mov DWORD [12+esp],ebp + xor ebp,edi + ror esi,11 + and eax,ebp + lea edx,[2162078206+edx*1+ebx] + xor esi,ecx + xor eax,edi + ror esi,2 + add eax,edx + add edx,DWORD [24+esp] + add eax,esi + mov ecx,edx + mov esi,DWORD [28+esp] + ror edx,14 + mov edi,DWORD [esp] + xor edx,ecx + mov ebx,DWORD [88+esp] + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [24+esp],ecx + xor edx,ecx + add ebx,DWORD [4+esp] + xor edi,esi + ror edx,6 + mov ecx,eax + add ebx,edi + ror ecx,9 + mov esi,eax + mov edi,DWORD [12+esp] + xor ecx,eax + mov DWORD [8+esp],eax + xor eax,edi + ror ecx,11 + and ebp,eax + lea edx,[2614888103+edx*1+ebx] + xor ecx,esi + xor ebp,edi + ror ecx,2 + add ebp,edx + add edx,DWORD [20+esp] + add ebp,ecx + mov esi,edx + mov ecx,DWORD [24+esp] + ror edx,14 + mov edi,DWORD [28+esp] + xor edx,esi + mov ebx,DWORD [92+esp] + xor ecx,edi + ror edx,5 + and ecx,esi + mov DWORD [20+esp],esi + xor edx,esi + add ebx,DWORD [esp] + xor edi,ecx + ror edx,6 + mov esi,ebp + add ebx,edi + ror esi,9 + mov ecx,ebp + mov edi,DWORD [8+esp] + xor esi,ebp + mov DWORD [4+esp],ebp + xor ebp,edi + ror esi,11 + and eax,ebp + lea edx,[3248222580+edx*1+ebx] + xor esi,ecx + xor eax,edi + mov ecx,DWORD [36+esp] + ror esi,2 + add eax,edx + add edx,DWORD [16+esp] + add eax,esi + mov esi,DWORD [88+esp] + mov ebx,ecx + ror ecx,11 + mov edi,esi + ror esi,2 + xor ecx,ebx + shr ebx,3 + ror ecx,7 + xor esi,edi + xor ebx,ecx + ror esi,17 + add ebx,DWORD [32+esp] + shr edi,10 + add ebx,DWORD [68+esp] + mov ecx,edx + xor edi,esi + mov esi,DWORD [20+esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [24+esp] + xor edx,ecx + mov DWORD [32+esp],ebx + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [16+esp],ecx + xor edx,ecx + add ebx,DWORD [28+esp] + xor edi,esi + ror edx,6 + mov ecx,eax + add ebx,edi + ror ecx,9 + mov esi,eax + 
mov edi,DWORD [4+esp] + xor ecx,eax + mov DWORD [esp],eax + xor eax,edi + ror ecx,11 + and ebp,eax + lea edx,[3835390401+edx*1+ebx] + xor ecx,esi + xor ebp,edi + mov esi,DWORD [40+esp] + ror ecx,2 + add ebp,edx + add edx,DWORD [12+esp] + add ebp,ecx + mov ecx,DWORD [92+esp] + mov ebx,esi + ror esi,11 + mov edi,ecx + ror ecx,2 + xor esi,ebx + shr ebx,3 + ror esi,7 + xor ecx,edi + xor ebx,esi + ror ecx,17 + add ebx,DWORD [36+esp] + shr edi,10 + add ebx,DWORD [72+esp] + mov esi,edx + xor edi,ecx + mov ecx,DWORD [16+esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [20+esp] + xor edx,esi + mov DWORD [36+esp],ebx + xor ecx,edi + ror edx,5 + and ecx,esi + mov DWORD [12+esp],esi + xor edx,esi + add ebx,DWORD [24+esp] + xor edi,ecx + ror edx,6 + mov esi,ebp + add ebx,edi + ror esi,9 + mov ecx,ebp + mov edi,DWORD [esp] + xor esi,ebp + mov DWORD [28+esp],ebp + xor ebp,edi + ror esi,11 + and eax,ebp + lea edx,[4022224774+edx*1+ebx] + xor esi,ecx + xor eax,edi + mov ecx,DWORD [44+esp] + ror esi,2 + add eax,edx + add edx,DWORD [8+esp] + add eax,esi + mov esi,DWORD [32+esp] + mov ebx,ecx + ror ecx,11 + mov edi,esi + ror esi,2 + xor ecx,ebx + shr ebx,3 + ror ecx,7 + xor esi,edi + xor ebx,ecx + ror esi,17 + add ebx,DWORD [40+esp] + shr edi,10 + add ebx,DWORD [76+esp] + mov ecx,edx + xor edi,esi + mov esi,DWORD [12+esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [16+esp] + xor edx,ecx + mov DWORD [40+esp],ebx + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [8+esp],ecx + xor edx,ecx + add ebx,DWORD [20+esp] + xor edi,esi + ror edx,6 + mov ecx,eax + add ebx,edi + ror ecx,9 + mov esi,eax + mov edi,DWORD [28+esp] + xor ecx,eax + mov DWORD [24+esp],eax + xor eax,edi + ror ecx,11 + and ebp,eax + lea edx,[264347078+edx*1+ebx] + xor ecx,esi + xor ebp,edi + mov esi,DWORD [48+esp] + ror ecx,2 + add ebp,edx + add edx,DWORD [4+esp] + add ebp,ecx + mov ecx,DWORD [36+esp] + mov ebx,esi + ror esi,11 + mov edi,ecx + ror ecx,2 + xor esi,ebx + shr ebx,3 + ror esi,7 + xor ecx,edi + xor ebx,esi + 
ror ecx,17 + add ebx,DWORD [44+esp] + shr edi,10 + add ebx,DWORD [80+esp] + mov esi,edx + xor edi,ecx + mov ecx,DWORD [8+esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [12+esp] + xor edx,esi + mov DWORD [44+esp],ebx + xor ecx,edi + ror edx,5 + and ecx,esi + mov DWORD [4+esp],esi + xor edx,esi + add ebx,DWORD [16+esp] + xor edi,ecx + ror edx,6 + mov esi,ebp + add ebx,edi + ror esi,9 + mov ecx,ebp + mov edi,DWORD [24+esp] + xor esi,ebp + mov DWORD [20+esp],ebp + xor ebp,edi + ror esi,11 + and eax,ebp + lea edx,[604807628+edx*1+ebx] + xor esi,ecx + xor eax,edi + mov ecx,DWORD [52+esp] + ror esi,2 + add eax,edx + add edx,DWORD [esp] + add eax,esi + mov esi,DWORD [40+esp] + mov ebx,ecx + ror ecx,11 + mov edi,esi + ror esi,2 + xor ecx,ebx + shr ebx,3 + ror ecx,7 + xor esi,edi + xor ebx,ecx + ror esi,17 + add ebx,DWORD [48+esp] + shr edi,10 + add ebx,DWORD [84+esp] + mov ecx,edx + xor edi,esi + mov esi,DWORD [4+esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [8+esp] + xor edx,ecx + mov DWORD [48+esp],ebx + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [esp],ecx + xor edx,ecx + add ebx,DWORD [12+esp] + xor edi,esi + ror edx,6 + mov ecx,eax + add ebx,edi + ror ecx,9 + mov esi,eax + mov edi,DWORD [20+esp] + xor ecx,eax + mov DWORD [16+esp],eax + xor eax,edi + ror ecx,11 + and ebp,eax + lea edx,[770255983+edx*1+ebx] + xor ecx,esi + xor ebp,edi + mov esi,DWORD [56+esp] + ror ecx,2 + add ebp,edx + add edx,DWORD [28+esp] + add ebp,ecx + mov ecx,DWORD [44+esp] + mov ebx,esi + ror esi,11 + mov edi,ecx + ror ecx,2 + xor esi,ebx + shr ebx,3 + ror esi,7 + xor ecx,edi + xor ebx,esi + ror ecx,17 + add ebx,DWORD [52+esp] + shr edi,10 + add ebx,DWORD [88+esp] + mov esi,edx + xor edi,ecx + mov ecx,DWORD [esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [4+esp] + xor edx,esi + mov DWORD [52+esp],ebx + xor ecx,edi + ror edx,5 + and ecx,esi + mov DWORD [28+esp],esi + xor edx,esi + add ebx,DWORD [8+esp] + xor edi,ecx + ror edx,6 + mov esi,ebp + add ebx,edi + ror esi,9 + mov ecx,ebp + mov 
edi,DWORD [16+esp] + xor esi,ebp + mov DWORD [12+esp],ebp + xor ebp,edi + ror esi,11 + and eax,ebp + lea edx,[1249150122+edx*1+ebx] + xor esi,ecx + xor eax,edi + mov ecx,DWORD [60+esp] + ror esi,2 + add eax,edx + add edx,DWORD [24+esp] + add eax,esi + mov esi,DWORD [48+esp] + mov ebx,ecx + ror ecx,11 + mov edi,esi + ror esi,2 + xor ecx,ebx + shr ebx,3 + ror ecx,7 + xor esi,edi + xor ebx,ecx + ror esi,17 + add ebx,DWORD [56+esp] + shr edi,10 + add ebx,DWORD [92+esp] + mov ecx,edx + xor edi,esi + mov esi,DWORD [28+esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [esp] + xor edx,ecx + mov DWORD [56+esp],ebx + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [24+esp],ecx + xor edx,ecx + add ebx,DWORD [4+esp] + xor edi,esi + ror edx,6 + mov ecx,eax + add ebx,edi + ror ecx,9 + mov esi,eax + mov edi,DWORD [12+esp] + xor ecx,eax + mov DWORD [8+esp],eax + xor eax,edi + ror ecx,11 + and ebp,eax + lea edx,[1555081692+edx*1+ebx] + xor ecx,esi + xor ebp,edi + mov esi,DWORD [64+esp] + ror ecx,2 + add ebp,edx + add edx,DWORD [20+esp] + add ebp,ecx + mov ecx,DWORD [52+esp] + mov ebx,esi + ror esi,11 + mov edi,ecx + ror ecx,2 + xor esi,ebx + shr ebx,3 + ror esi,7 + xor ecx,edi + xor ebx,esi + ror ecx,17 + add ebx,DWORD [60+esp] + shr edi,10 + add ebx,DWORD [32+esp] + mov esi,edx + xor edi,ecx + mov ecx,DWORD [24+esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [28+esp] + xor edx,esi + mov DWORD [60+esp],ebx + xor ecx,edi + ror edx,5 + and ecx,esi + mov DWORD [20+esp],esi + xor edx,esi + add ebx,DWORD [esp] + xor edi,ecx + ror edx,6 + mov esi,ebp + add ebx,edi + ror esi,9 + mov ecx,ebp + mov edi,DWORD [8+esp] + xor esi,ebp + mov DWORD [4+esp],ebp + xor ebp,edi + ror esi,11 + and eax,ebp + lea edx,[1996064986+edx*1+ebx] + xor esi,ecx + xor eax,edi + mov ecx,DWORD [68+esp] + ror esi,2 + add eax,edx + add edx,DWORD [16+esp] + add eax,esi + mov esi,DWORD [56+esp] + mov ebx,ecx + ror ecx,11 + mov edi,esi + ror esi,2 + xor ecx,ebx + shr ebx,3 + ror ecx,7 + xor esi,edi + xor ebx,ecx + ror 
esi,17 + add ebx,DWORD [64+esp] + shr edi,10 + add ebx,DWORD [36+esp] + mov ecx,edx + xor edi,esi + mov esi,DWORD [20+esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [24+esp] + xor edx,ecx + mov DWORD [64+esp],ebx + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [16+esp],ecx + xor edx,ecx + add ebx,DWORD [28+esp] + xor edi,esi + ror edx,6 + mov ecx,eax + add ebx,edi + ror ecx,9 + mov esi,eax + mov edi,DWORD [4+esp] + xor ecx,eax + mov DWORD [esp],eax + xor eax,edi + ror ecx,11 + and ebp,eax + lea edx,[2554220882+edx*1+ebx] + xor ecx,esi + xor ebp,edi + mov esi,DWORD [72+esp] + ror ecx,2 + add ebp,edx + add edx,DWORD [12+esp] + add ebp,ecx + mov ecx,DWORD [60+esp] + mov ebx,esi + ror esi,11 + mov edi,ecx + ror ecx,2 + xor esi,ebx + shr ebx,3 + ror esi,7 + xor ecx,edi + xor ebx,esi + ror ecx,17 + add ebx,DWORD [68+esp] + shr edi,10 + add ebx,DWORD [40+esp] + mov esi,edx + xor edi,ecx + mov ecx,DWORD [16+esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [20+esp] + xor edx,esi + mov DWORD [68+esp],ebx + xor ecx,edi + ror edx,5 + and ecx,esi + mov DWORD [12+esp],esi + xor edx,esi + add ebx,DWORD [24+esp] + xor edi,ecx + ror edx,6 + mov esi,ebp + add ebx,edi + ror esi,9 + mov ecx,ebp + mov edi,DWORD [esp] + xor esi,ebp + mov DWORD [28+esp],ebp + xor ebp,edi + ror esi,11 + and eax,ebp + lea edx,[2821834349+edx*1+ebx] + xor esi,ecx + xor eax,edi + mov ecx,DWORD [76+esp] + ror esi,2 + add eax,edx + add edx,DWORD [8+esp] + add eax,esi + mov esi,DWORD [64+esp] + mov ebx,ecx + ror ecx,11 + mov edi,esi + ror esi,2 + xor ecx,ebx + shr ebx,3 + ror ecx,7 + xor esi,edi + xor ebx,ecx + ror esi,17 + add ebx,DWORD [72+esp] + shr edi,10 + add ebx,DWORD [44+esp] + mov ecx,edx + xor edi,esi + mov esi,DWORD [12+esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [16+esp] + xor edx,ecx + mov DWORD [72+esp],ebx + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [8+esp],ecx + xor edx,ecx + add ebx,DWORD [20+esp] + xor edi,esi + ror edx,6 + mov ecx,eax + add ebx,edi + ror ecx,9 + mov esi,eax + 
mov edi,DWORD [28+esp] + xor ecx,eax + mov DWORD [24+esp],eax + xor eax,edi + ror ecx,11 + and ebp,eax + lea edx,[2952996808+edx*1+ebx] + xor ecx,esi + xor ebp,edi + mov esi,DWORD [80+esp] + ror ecx,2 + add ebp,edx + add edx,DWORD [4+esp] + add ebp,ecx + mov ecx,DWORD [68+esp] + mov ebx,esi + ror esi,11 + mov edi,ecx + ror ecx,2 + xor esi,ebx + shr ebx,3 + ror esi,7 + xor ecx,edi + xor ebx,esi + ror ecx,17 + add ebx,DWORD [76+esp] + shr edi,10 + add ebx,DWORD [48+esp] + mov esi,edx + xor edi,ecx + mov ecx,DWORD [8+esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [12+esp] + xor edx,esi + mov DWORD [76+esp],ebx + xor ecx,edi + ror edx,5 + and ecx,esi + mov DWORD [4+esp],esi + xor edx,esi + add ebx,DWORD [16+esp] + xor edi,ecx + ror edx,6 + mov esi,ebp + add ebx,edi + ror esi,9 + mov ecx,ebp + mov edi,DWORD [24+esp] + xor esi,ebp + mov DWORD [20+esp],ebp + xor ebp,edi + ror esi,11 + and eax,ebp + lea edx,[3210313671+edx*1+ebx] + xor esi,ecx + xor eax,edi + mov ecx,DWORD [84+esp] + ror esi,2 + add eax,edx + add edx,DWORD [esp] + add eax,esi + mov esi,DWORD [72+esp] + mov ebx,ecx + ror ecx,11 + mov edi,esi + ror esi,2 + xor ecx,ebx + shr ebx,3 + ror ecx,7 + xor esi,edi + xor ebx,ecx + ror esi,17 + add ebx,DWORD [80+esp] + shr edi,10 + add ebx,DWORD [52+esp] + mov ecx,edx + xor edi,esi + mov esi,DWORD [4+esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [8+esp] + xor edx,ecx + mov DWORD [80+esp],ebx + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [esp],ecx + xor edx,ecx + add ebx,DWORD [12+esp] + xor edi,esi + ror edx,6 + mov ecx,eax + add ebx,edi + ror ecx,9 + mov esi,eax + mov edi,DWORD [20+esp] + xor ecx,eax + mov DWORD [16+esp],eax + xor eax,edi + ror ecx,11 + and ebp,eax + lea edx,[3336571891+edx*1+ebx] + xor ecx,esi + xor ebp,edi + mov esi,DWORD [88+esp] + ror ecx,2 + add ebp,edx + add edx,DWORD [28+esp] + add ebp,ecx + mov ecx,DWORD [76+esp] + mov ebx,esi + ror esi,11 + mov edi,ecx + ror ecx,2 + xor esi,ebx + shr ebx,3 + ror esi,7 + xor ecx,edi + xor ebx,esi + 
ror ecx,17 + add ebx,DWORD [84+esp] + shr edi,10 + add ebx,DWORD [56+esp] + mov esi,edx + xor edi,ecx + mov ecx,DWORD [esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [4+esp] + xor edx,esi + mov DWORD [84+esp],ebx + xor ecx,edi + ror edx,5 + and ecx,esi + mov DWORD [28+esp],esi + xor edx,esi + add ebx,DWORD [8+esp] + xor edi,ecx + ror edx,6 + mov esi,ebp + add ebx,edi + ror esi,9 + mov ecx,ebp + mov edi,DWORD [16+esp] + xor esi,ebp + mov DWORD [12+esp],ebp + xor ebp,edi + ror esi,11 + and eax,ebp + lea edx,[3584528711+edx*1+ebx] + xor esi,ecx + xor eax,edi + mov ecx,DWORD [92+esp] + ror esi,2 + add eax,edx + add edx,DWORD [24+esp] + add eax,esi + mov esi,DWORD [80+esp] + mov ebx,ecx + ror ecx,11 + mov edi,esi + ror esi,2 + xor ecx,ebx + shr ebx,3 + ror ecx,7 + xor esi,edi + xor ebx,ecx + ror esi,17 + add ebx,DWORD [88+esp] + shr edi,10 + add ebx,DWORD [60+esp] + mov ecx,edx + xor edi,esi + mov esi,DWORD [28+esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [esp] + xor edx,ecx + mov DWORD [88+esp],ebx + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [24+esp],ecx + xor edx,ecx + add ebx,DWORD [4+esp] + xor edi,esi + ror edx,6 + mov ecx,eax + add ebx,edi + ror ecx,9 + mov esi,eax + mov edi,DWORD [12+esp] + xor ecx,eax + mov DWORD [8+esp],eax + xor eax,edi + ror ecx,11 + and ebp,eax + lea edx,[113926993+edx*1+ebx] + xor ecx,esi + xor ebp,edi + mov esi,DWORD [32+esp] + ror ecx,2 + add ebp,edx + add edx,DWORD [20+esp] + add ebp,ecx + mov ecx,DWORD [84+esp] + mov ebx,esi + ror esi,11 + mov edi,ecx + ror ecx,2 + xor esi,ebx + shr ebx,3 + ror esi,7 + xor ecx,edi + xor ebx,esi + ror ecx,17 + add ebx,DWORD [92+esp] + shr edi,10 + add ebx,DWORD [64+esp] + mov esi,edx + xor edi,ecx + mov ecx,DWORD [24+esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [28+esp] + xor edx,esi + mov DWORD [92+esp],ebx + xor ecx,edi + ror edx,5 + and ecx,esi + mov DWORD [20+esp],esi + xor edx,esi + add ebx,DWORD [esp] + xor edi,ecx + ror edx,6 + mov esi,ebp + add ebx,edi + ror esi,9 + mov ecx,ebp + 
mov edi,DWORD [8+esp] + xor esi,ebp + mov DWORD [4+esp],ebp + xor ebp,edi + ror esi,11 + and eax,ebp + lea edx,[338241895+edx*1+ebx] + xor esi,ecx + xor eax,edi + mov ecx,DWORD [36+esp] + ror esi,2 + add eax,edx + add edx,DWORD [16+esp] + add eax,esi + mov esi,DWORD [88+esp] + mov ebx,ecx + ror ecx,11 + mov edi,esi + ror esi,2 + xor ecx,ebx + shr ebx,3 + ror ecx,7 + xor esi,edi + xor ebx,ecx + ror esi,17 + add ebx,DWORD [32+esp] + shr edi,10 + add ebx,DWORD [68+esp] + mov ecx,edx + xor edi,esi + mov esi,DWORD [20+esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [24+esp] + xor edx,ecx + mov DWORD [32+esp],ebx + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [16+esp],ecx + xor edx,ecx + add ebx,DWORD [28+esp] + xor edi,esi + ror edx,6 + mov ecx,eax + add ebx,edi + ror ecx,9 + mov esi,eax + mov edi,DWORD [4+esp] + xor ecx,eax + mov DWORD [esp],eax + xor eax,edi + ror ecx,11 + and ebp,eax + lea edx,[666307205+edx*1+ebx] + xor ecx,esi + xor ebp,edi + mov esi,DWORD [40+esp] + ror ecx,2 + add ebp,edx + add edx,DWORD [12+esp] + add ebp,ecx + mov ecx,DWORD [92+esp] + mov ebx,esi + ror esi,11 + mov edi,ecx + ror ecx,2 + xor esi,ebx + shr ebx,3 + ror esi,7 + xor ecx,edi + xor ebx,esi + ror ecx,17 + add ebx,DWORD [36+esp] + shr edi,10 + add ebx,DWORD [72+esp] + mov esi,edx + xor edi,ecx + mov ecx,DWORD [16+esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [20+esp] + xor edx,esi + mov DWORD [36+esp],ebx + xor ecx,edi + ror edx,5 + and ecx,esi + mov DWORD [12+esp],esi + xor edx,esi + add ebx,DWORD [24+esp] + xor edi,ecx + ror edx,6 + mov esi,ebp + add ebx,edi + ror esi,9 + mov ecx,ebp + mov edi,DWORD [esp] + xor esi,ebp + mov DWORD [28+esp],ebp + xor ebp,edi + ror esi,11 + and eax,ebp + lea edx,[773529912+edx*1+ebx] + xor esi,ecx + xor eax,edi + mov ecx,DWORD [44+esp] + ror esi,2 + add eax,edx + add edx,DWORD [8+esp] + add eax,esi + mov esi,DWORD [32+esp] + mov ebx,ecx + ror ecx,11 + mov edi,esi + ror esi,2 + xor ecx,ebx + shr ebx,3 + ror ecx,7 + xor esi,edi + xor ebx,ecx + ror 
esi,17 + add ebx,DWORD [40+esp] + shr edi,10 + add ebx,DWORD [76+esp] + mov ecx,edx + xor edi,esi + mov esi,DWORD [12+esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [16+esp] + xor edx,ecx + mov DWORD [40+esp],ebx + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [8+esp],ecx + xor edx,ecx + add ebx,DWORD [20+esp] + xor edi,esi + ror edx,6 + mov ecx,eax + add ebx,edi + ror ecx,9 + mov esi,eax + mov edi,DWORD [28+esp] + xor ecx,eax + mov DWORD [24+esp],eax + xor eax,edi + ror ecx,11 + and ebp,eax + lea edx,[1294757372+edx*1+ebx] + xor ecx,esi + xor ebp,edi + mov esi,DWORD [48+esp] + ror ecx,2 + add ebp,edx + add edx,DWORD [4+esp] + add ebp,ecx + mov ecx,DWORD [36+esp] + mov ebx,esi + ror esi,11 + mov edi,ecx + ror ecx,2 + xor esi,ebx + shr ebx,3 + ror esi,7 + xor ecx,edi + xor ebx,esi + ror ecx,17 + add ebx,DWORD [44+esp] + shr edi,10 + add ebx,DWORD [80+esp] + mov esi,edx + xor edi,ecx + mov ecx,DWORD [8+esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [12+esp] + xor edx,esi + mov DWORD [44+esp],ebx + xor ecx,edi + ror edx,5 + and ecx,esi + mov DWORD [4+esp],esi + xor edx,esi + add ebx,DWORD [16+esp] + xor edi,ecx + ror edx,6 + mov esi,ebp + add ebx,edi + ror esi,9 + mov ecx,ebp + mov edi,DWORD [24+esp] + xor esi,ebp + mov DWORD [20+esp],ebp + xor ebp,edi + ror esi,11 + and eax,ebp + lea edx,[1396182291+edx*1+ebx] + xor esi,ecx + xor eax,edi + mov ecx,DWORD [52+esp] + ror esi,2 + add eax,edx + add edx,DWORD [esp] + add eax,esi + mov esi,DWORD [40+esp] + mov ebx,ecx + ror ecx,11 + mov edi,esi + ror esi,2 + xor ecx,ebx + shr ebx,3 + ror ecx,7 + xor esi,edi + xor ebx,ecx + ror esi,17 + add ebx,DWORD [48+esp] + shr edi,10 + add ebx,DWORD [84+esp] + mov ecx,edx + xor edi,esi + mov esi,DWORD [4+esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [8+esp] + xor edx,ecx + mov DWORD [48+esp],ebx + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [esp],ecx + xor edx,ecx + add ebx,DWORD [12+esp] + xor edi,esi + ror edx,6 + mov ecx,eax + add ebx,edi + ror ecx,9 + mov esi,eax + mov 
edi,DWORD [20+esp] + xor ecx,eax + mov DWORD [16+esp],eax + xor eax,edi + ror ecx,11 + and ebp,eax + lea edx,[1695183700+edx*1+ebx] + xor ecx,esi + xor ebp,edi + mov esi,DWORD [56+esp] + ror ecx,2 + add ebp,edx + add edx,DWORD [28+esp] + add ebp,ecx + mov ecx,DWORD [44+esp] + mov ebx,esi + ror esi,11 + mov edi,ecx + ror ecx,2 + xor esi,ebx + shr ebx,3 + ror esi,7 + xor ecx,edi + xor ebx,esi + ror ecx,17 + add ebx,DWORD [52+esp] + shr edi,10 + add ebx,DWORD [88+esp] + mov esi,edx + xor edi,ecx + mov ecx,DWORD [esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [4+esp] + xor edx,esi + mov DWORD [52+esp],ebx + xor ecx,edi + ror edx,5 + and ecx,esi + mov DWORD [28+esp],esi + xor edx,esi + add ebx,DWORD [8+esp] + xor edi,ecx + ror edx,6 + mov esi,ebp + add ebx,edi + ror esi,9 + mov ecx,ebp + mov edi,DWORD [16+esp] + xor esi,ebp + mov DWORD [12+esp],ebp + xor ebp,edi + ror esi,11 + and eax,ebp + lea edx,[1986661051+edx*1+ebx] + xor esi,ecx + xor eax,edi + mov ecx,DWORD [60+esp] + ror esi,2 + add eax,edx + add edx,DWORD [24+esp] + add eax,esi + mov esi,DWORD [48+esp] + mov ebx,ecx + ror ecx,11 + mov edi,esi + ror esi,2 + xor ecx,ebx + shr ebx,3 + ror ecx,7 + xor esi,edi + xor ebx,ecx + ror esi,17 + add ebx,DWORD [56+esp] + shr edi,10 + add ebx,DWORD [92+esp] + mov ecx,edx + xor edi,esi + mov esi,DWORD [28+esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [esp] + xor edx,ecx + mov DWORD [56+esp],ebx + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [24+esp],ecx + xor edx,ecx + add ebx,DWORD [4+esp] + xor edi,esi + ror edx,6 + mov ecx,eax + add ebx,edi + ror ecx,9 + mov esi,eax + mov edi,DWORD [12+esp] + xor ecx,eax + mov DWORD [8+esp],eax + xor eax,edi + ror ecx,11 + and ebp,eax + lea edx,[2177026350+edx*1+ebx] + xor ecx,esi + xor ebp,edi + mov esi,DWORD [64+esp] + ror ecx,2 + add ebp,edx + add edx,DWORD [20+esp] + add ebp,ecx + mov ecx,DWORD [52+esp] + mov ebx,esi + ror esi,11 + mov edi,ecx + ror ecx,2 + xor esi,ebx + shr ebx,3 + ror esi,7 + xor ecx,edi + xor ebx,esi + ror 
ecx,17 + add ebx,DWORD [60+esp] + shr edi,10 + add ebx,DWORD [32+esp] + mov esi,edx + xor edi,ecx + mov ecx,DWORD [24+esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [28+esp] + xor edx,esi + mov DWORD [60+esp],ebx + xor ecx,edi + ror edx,5 + and ecx,esi + mov DWORD [20+esp],esi + xor edx,esi + add ebx,DWORD [esp] + xor edi,ecx + ror edx,6 + mov esi,ebp + add ebx,edi + ror esi,9 + mov ecx,ebp + mov edi,DWORD [8+esp] + xor esi,ebp + mov DWORD [4+esp],ebp + xor ebp,edi + ror esi,11 + and eax,ebp + lea edx,[2456956037+edx*1+ebx] + xor esi,ecx + xor eax,edi + mov ecx,DWORD [68+esp] + ror esi,2 + add eax,edx + add edx,DWORD [16+esp] + add eax,esi + mov esi,DWORD [56+esp] + mov ebx,ecx + ror ecx,11 + mov edi,esi + ror esi,2 + xor ecx,ebx + shr ebx,3 + ror ecx,7 + xor esi,edi + xor ebx,ecx + ror esi,17 + add ebx,DWORD [64+esp] + shr edi,10 + add ebx,DWORD [36+esp] + mov ecx,edx + xor edi,esi + mov esi,DWORD [20+esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [24+esp] + xor edx,ecx + mov DWORD [64+esp],ebx + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [16+esp],ecx + xor edx,ecx + add ebx,DWORD [28+esp] + xor edi,esi + ror edx,6 + mov ecx,eax + add ebx,edi + ror ecx,9 + mov esi,eax + mov edi,DWORD [4+esp] + xor ecx,eax + mov DWORD [esp],eax + xor eax,edi + ror ecx,11 + and ebp,eax + lea edx,[2730485921+edx*1+ebx] + xor ecx,esi + xor ebp,edi + mov esi,DWORD [72+esp] + ror ecx,2 + add ebp,edx + add edx,DWORD [12+esp] + add ebp,ecx + mov ecx,DWORD [60+esp] + mov ebx,esi + ror esi,11 + mov edi,ecx + ror ecx,2 + xor esi,ebx + shr ebx,3 + ror esi,7 + xor ecx,edi + xor ebx,esi + ror ecx,17 + add ebx,DWORD [68+esp] + shr edi,10 + add ebx,DWORD [40+esp] + mov esi,edx + xor edi,ecx + mov ecx,DWORD [16+esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [20+esp] + xor edx,esi + mov DWORD [68+esp],ebx + xor ecx,edi + ror edx,5 + and ecx,esi + mov DWORD [12+esp],esi + xor edx,esi + add ebx,DWORD [24+esp] + xor edi,ecx + ror edx,6 + mov esi,ebp + add ebx,edi + ror esi,9 + mov ecx,ebp + 
mov edi,DWORD [esp] + xor esi,ebp + mov DWORD [28+esp],ebp + xor ebp,edi + ror esi,11 + and eax,ebp + lea edx,[2820302411+edx*1+ebx] + xor esi,ecx + xor eax,edi + mov ecx,DWORD [76+esp] + ror esi,2 + add eax,edx + add edx,DWORD [8+esp] + add eax,esi + mov esi,DWORD [64+esp] + mov ebx,ecx + ror ecx,11 + mov edi,esi + ror esi,2 + xor ecx,ebx + shr ebx,3 + ror ecx,7 + xor esi,edi + xor ebx,ecx + ror esi,17 + add ebx,DWORD [72+esp] + shr edi,10 + add ebx,DWORD [44+esp] + mov ecx,edx + xor edi,esi + mov esi,DWORD [12+esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [16+esp] + xor edx,ecx + mov DWORD [72+esp],ebx + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [8+esp],ecx + xor edx,ecx + add ebx,DWORD [20+esp] + xor edi,esi + ror edx,6 + mov ecx,eax + add ebx,edi + ror ecx,9 + mov esi,eax + mov edi,DWORD [28+esp] + xor ecx,eax + mov DWORD [24+esp],eax + xor eax,edi + ror ecx,11 + and ebp,eax + lea edx,[3259730800+edx*1+ebx] + xor ecx,esi + xor ebp,edi + mov esi,DWORD [80+esp] + ror ecx,2 + add ebp,edx + add edx,DWORD [4+esp] + add ebp,ecx + mov ecx,DWORD [68+esp] + mov ebx,esi + ror esi,11 + mov edi,ecx + ror ecx,2 + xor esi,ebx + shr ebx,3 + ror esi,7 + xor ecx,edi + xor ebx,esi + ror ecx,17 + add ebx,DWORD [76+esp] + shr edi,10 + add ebx,DWORD [48+esp] + mov esi,edx + xor edi,ecx + mov ecx,DWORD [8+esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [12+esp] + xor edx,esi + mov DWORD [76+esp],ebx + xor ecx,edi + ror edx,5 + and ecx,esi + mov DWORD [4+esp],esi + xor edx,esi + add ebx,DWORD [16+esp] + xor edi,ecx + ror edx,6 + mov esi,ebp + add ebx,edi + ror esi,9 + mov ecx,ebp + mov edi,DWORD [24+esp] + xor esi,ebp + mov DWORD [20+esp],ebp + xor ebp,edi + ror esi,11 + and eax,ebp + lea edx,[3345764771+edx*1+ebx] + xor esi,ecx + xor eax,edi + mov ecx,DWORD [84+esp] + ror esi,2 + add eax,edx + add edx,DWORD [esp] + add eax,esi + mov esi,DWORD [72+esp] + mov ebx,ecx + ror ecx,11 + mov edi,esi + ror esi,2 + xor ecx,ebx + shr ebx,3 + ror ecx,7 + xor esi,edi + xor ebx,ecx + 
ror esi,17 + add ebx,DWORD [80+esp] + shr edi,10 + add ebx,DWORD [52+esp] + mov ecx,edx + xor edi,esi + mov esi,DWORD [4+esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [8+esp] + xor edx,ecx + mov DWORD [80+esp],ebx + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [esp],ecx + xor edx,ecx + add ebx,DWORD [12+esp] + xor edi,esi + ror edx,6 + mov ecx,eax + add ebx,edi + ror ecx,9 + mov esi,eax + mov edi,DWORD [20+esp] + xor ecx,eax + mov DWORD [16+esp],eax + xor eax,edi + ror ecx,11 + and ebp,eax + lea edx,[3516065817+edx*1+ebx] + xor ecx,esi + xor ebp,edi + mov esi,DWORD [88+esp] + ror ecx,2 + add ebp,edx + add edx,DWORD [28+esp] + add ebp,ecx + mov ecx,DWORD [76+esp] + mov ebx,esi + ror esi,11 + mov edi,ecx + ror ecx,2 + xor esi,ebx + shr ebx,3 + ror esi,7 + xor ecx,edi + xor ebx,esi + ror ecx,17 + add ebx,DWORD [84+esp] + shr edi,10 + add ebx,DWORD [56+esp] + mov esi,edx + xor edi,ecx + mov ecx,DWORD [esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [4+esp] + xor edx,esi + mov DWORD [84+esp],ebx + xor ecx,edi + ror edx,5 + and ecx,esi + mov DWORD [28+esp],esi + xor edx,esi + add ebx,DWORD [8+esp] + xor edi,ecx + ror edx,6 + mov esi,ebp + add ebx,edi + ror esi,9 + mov ecx,ebp + mov edi,DWORD [16+esp] + xor esi,ebp + mov DWORD [12+esp],ebp + xor ebp,edi + ror esi,11 + and eax,ebp + lea edx,[3600352804+edx*1+ebx] + xor esi,ecx + xor eax,edi + mov ecx,DWORD [92+esp] + ror esi,2 + add eax,edx + add edx,DWORD [24+esp] + add eax,esi + mov esi,DWORD [80+esp] + mov ebx,ecx + ror ecx,11 + mov edi,esi + ror esi,2 + xor ecx,ebx + shr ebx,3 + ror ecx,7 + xor esi,edi + xor ebx,ecx + ror esi,17 + add ebx,DWORD [88+esp] + shr edi,10 + add ebx,DWORD [60+esp] + mov ecx,edx + xor edi,esi + mov esi,DWORD [28+esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [esp] + xor edx,ecx + mov DWORD [88+esp],ebx + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [24+esp],ecx + xor edx,ecx + add ebx,DWORD [4+esp] + xor edi,esi + ror edx,6 + mov ecx,eax + add ebx,edi + ror ecx,9 + mov esi,eax + 
mov edi,DWORD [12+esp] + xor ecx,eax + mov DWORD [8+esp],eax + xor eax,edi + ror ecx,11 + and ebp,eax + lea edx,[4094571909+edx*1+ebx] + xor ecx,esi + xor ebp,edi + mov esi,DWORD [32+esp] + ror ecx,2 + add ebp,edx + add edx,DWORD [20+esp] + add ebp,ecx + mov ecx,DWORD [84+esp] + mov ebx,esi + ror esi,11 + mov edi,ecx + ror ecx,2 + xor esi,ebx + shr ebx,3 + ror esi,7 + xor ecx,edi + xor ebx,esi + ror ecx,17 + add ebx,DWORD [92+esp] + shr edi,10 + add ebx,DWORD [64+esp] + mov esi,edx + xor edi,ecx + mov ecx,DWORD [24+esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [28+esp] + xor edx,esi + mov DWORD [92+esp],ebx + xor ecx,edi + ror edx,5 + and ecx,esi + mov DWORD [20+esp],esi + xor edx,esi + add ebx,DWORD [esp] + xor edi,ecx + ror edx,6 + mov esi,ebp + add ebx,edi + ror esi,9 + mov ecx,ebp + mov edi,DWORD [8+esp] + xor esi,ebp + mov DWORD [4+esp],ebp + xor ebp,edi + ror esi,11 + and eax,ebp + lea edx,[275423344+edx*1+ebx] + xor esi,ecx + xor eax,edi + mov ecx,DWORD [36+esp] + ror esi,2 + add eax,edx + add edx,DWORD [16+esp] + add eax,esi + mov esi,DWORD [88+esp] + mov ebx,ecx + ror ecx,11 + mov edi,esi + ror esi,2 + xor ecx,ebx + shr ebx,3 + ror ecx,7 + xor esi,edi + xor ebx,ecx + ror esi,17 + add ebx,DWORD [32+esp] + shr edi,10 + add ebx,DWORD [68+esp] + mov ecx,edx + xor edi,esi + mov esi,DWORD [20+esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [24+esp] + xor edx,ecx + mov DWORD [32+esp],ebx + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [16+esp],ecx + xor edx,ecx + add ebx,DWORD [28+esp] + xor edi,esi + ror edx,6 + mov ecx,eax + add ebx,edi + ror ecx,9 + mov esi,eax + mov edi,DWORD [4+esp] + xor ecx,eax + mov DWORD [esp],eax + xor eax,edi + ror ecx,11 + and ebp,eax + lea edx,[430227734+edx*1+ebx] + xor ecx,esi + xor ebp,edi + mov esi,DWORD [40+esp] + ror ecx,2 + add ebp,edx + add edx,DWORD [12+esp] + add ebp,ecx + mov ecx,DWORD [92+esp] + mov ebx,esi + ror esi,11 + mov edi,ecx + ror ecx,2 + xor esi,ebx + shr ebx,3 + ror esi,7 + xor ecx,edi + xor ebx,esi + ror 
ecx,17 + add ebx,DWORD [36+esp] + shr edi,10 + add ebx,DWORD [72+esp] + mov esi,edx + xor edi,ecx + mov ecx,DWORD [16+esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [20+esp] + xor edx,esi + mov DWORD [36+esp],ebx + xor ecx,edi + ror edx,5 + and ecx,esi + mov DWORD [12+esp],esi + xor edx,esi + add ebx,DWORD [24+esp] + xor edi,ecx + ror edx,6 + mov esi,ebp + add ebx,edi + ror esi,9 + mov ecx,ebp + mov edi,DWORD [esp] + xor esi,ebp + mov DWORD [28+esp],ebp + xor ebp,edi + ror esi,11 + and eax,ebp + lea edx,[506948616+edx*1+ebx] + xor esi,ecx + xor eax,edi + mov ecx,DWORD [44+esp] + ror esi,2 + add eax,edx + add edx,DWORD [8+esp] + add eax,esi + mov esi,DWORD [32+esp] + mov ebx,ecx + ror ecx,11 + mov edi,esi + ror esi,2 + xor ecx,ebx + shr ebx,3 + ror ecx,7 + xor esi,edi + xor ebx,ecx + ror esi,17 + add ebx,DWORD [40+esp] + shr edi,10 + add ebx,DWORD [76+esp] + mov ecx,edx + xor edi,esi + mov esi,DWORD [12+esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [16+esp] + xor edx,ecx + mov DWORD [40+esp],ebx + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [8+esp],ecx + xor edx,ecx + add ebx,DWORD [20+esp] + xor edi,esi + ror edx,6 + mov ecx,eax + add ebx,edi + ror ecx,9 + mov esi,eax + mov edi,DWORD [28+esp] + xor ecx,eax + mov DWORD [24+esp],eax + xor eax,edi + ror ecx,11 + and ebp,eax + lea edx,[659060556+edx*1+ebx] + xor ecx,esi + xor ebp,edi + mov esi,DWORD [48+esp] + ror ecx,2 + add ebp,edx + add edx,DWORD [4+esp] + add ebp,ecx + mov ecx,DWORD [36+esp] + mov ebx,esi + ror esi,11 + mov edi,ecx + ror ecx,2 + xor esi,ebx + shr ebx,3 + ror esi,7 + xor ecx,edi + xor ebx,esi + ror ecx,17 + add ebx,DWORD [44+esp] + shr edi,10 + add ebx,DWORD [80+esp] + mov esi,edx + xor edi,ecx + mov ecx,DWORD [8+esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [12+esp] + xor edx,esi + mov DWORD [44+esp],ebx + xor ecx,edi + ror edx,5 + and ecx,esi + mov DWORD [4+esp],esi + xor edx,esi + add ebx,DWORD [16+esp] + xor edi,ecx + ror edx,6 + mov esi,ebp + add ebx,edi + ror esi,9 + mov ecx,ebp + 
mov edi,DWORD [24+esp] + xor esi,ebp + mov DWORD [20+esp],ebp + xor ebp,edi + ror esi,11 + and eax,ebp + lea edx,[883997877+edx*1+ebx] + xor esi,ecx + xor eax,edi + mov ecx,DWORD [52+esp] + ror esi,2 + add eax,edx + add edx,DWORD [esp] + add eax,esi + mov esi,DWORD [40+esp] + mov ebx,ecx + ror ecx,11 + mov edi,esi + ror esi,2 + xor ecx,ebx + shr ebx,3 + ror ecx,7 + xor esi,edi + xor ebx,ecx + ror esi,17 + add ebx,DWORD [48+esp] + shr edi,10 + add ebx,DWORD [84+esp] + mov ecx,edx + xor edi,esi + mov esi,DWORD [4+esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [8+esp] + xor edx,ecx + mov DWORD [48+esp],ebx + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [esp],ecx + xor edx,ecx + add ebx,DWORD [12+esp] + xor edi,esi + ror edx,6 + mov ecx,eax + add ebx,edi + ror ecx,9 + mov esi,eax + mov edi,DWORD [20+esp] + xor ecx,eax + mov DWORD [16+esp],eax + xor eax,edi + ror ecx,11 + and ebp,eax + lea edx,[958139571+edx*1+ebx] + xor ecx,esi + xor ebp,edi + mov esi,DWORD [56+esp] + ror ecx,2 + add ebp,edx + add edx,DWORD [28+esp] + add ebp,ecx + mov ecx,DWORD [44+esp] + mov ebx,esi + ror esi,11 + mov edi,ecx + ror ecx,2 + xor esi,ebx + shr ebx,3 + ror esi,7 + xor ecx,edi + xor ebx,esi + ror ecx,17 + add ebx,DWORD [52+esp] + shr edi,10 + add ebx,DWORD [88+esp] + mov esi,edx + xor edi,ecx + mov ecx,DWORD [esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [4+esp] + xor edx,esi + mov DWORD [52+esp],ebx + xor ecx,edi + ror edx,5 + and ecx,esi + mov DWORD [28+esp],esi + xor edx,esi + add ebx,DWORD [8+esp] + xor edi,ecx + ror edx,6 + mov esi,ebp + add ebx,edi + ror esi,9 + mov ecx,ebp + mov edi,DWORD [16+esp] + xor esi,ebp + mov DWORD [12+esp],ebp + xor ebp,edi + ror esi,11 + and eax,ebp + lea edx,[1322822218+edx*1+ebx] + xor esi,ecx + xor eax,edi + mov ecx,DWORD [60+esp] + ror esi,2 + add eax,edx + add edx,DWORD [24+esp] + add eax,esi + mov esi,DWORD [48+esp] + mov ebx,ecx + ror ecx,11 + mov edi,esi + ror esi,2 + xor ecx,ebx + shr ebx,3 + ror ecx,7 + xor esi,edi + xor ebx,ecx + ror 
esi,17 + add ebx,DWORD [56+esp] + shr edi,10 + add ebx,DWORD [92+esp] + mov ecx,edx + xor edi,esi + mov esi,DWORD [28+esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [esp] + xor edx,ecx + mov DWORD [56+esp],ebx + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [24+esp],ecx + xor edx,ecx + add ebx,DWORD [4+esp] + xor edi,esi + ror edx,6 + mov ecx,eax + add ebx,edi + ror ecx,9 + mov esi,eax + mov edi,DWORD [12+esp] + xor ecx,eax + mov DWORD [8+esp],eax + xor eax,edi + ror ecx,11 + and ebp,eax + lea edx,[1537002063+edx*1+ebx] + xor ecx,esi + xor ebp,edi + mov esi,DWORD [64+esp] + ror ecx,2 + add ebp,edx + add edx,DWORD [20+esp] + add ebp,ecx + mov ecx,DWORD [52+esp] + mov ebx,esi + ror esi,11 + mov edi,ecx + ror ecx,2 + xor esi,ebx + shr ebx,3 + ror esi,7 + xor ecx,edi + xor ebx,esi + ror ecx,17 + add ebx,DWORD [60+esp] + shr edi,10 + add ebx,DWORD [32+esp] + mov esi,edx + xor edi,ecx + mov ecx,DWORD [24+esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [28+esp] + xor edx,esi + mov DWORD [60+esp],ebx + xor ecx,edi + ror edx,5 + and ecx,esi + mov DWORD [20+esp],esi + xor edx,esi + add ebx,DWORD [esp] + xor edi,ecx + ror edx,6 + mov esi,ebp + add ebx,edi + ror esi,9 + mov ecx,ebp + mov edi,DWORD [8+esp] + xor esi,ebp + mov DWORD [4+esp],ebp + xor ebp,edi + ror esi,11 + and eax,ebp + lea edx,[1747873779+edx*1+ebx] + xor esi,ecx + xor eax,edi + mov ecx,DWORD [68+esp] + ror esi,2 + add eax,edx + add edx,DWORD [16+esp] + add eax,esi + mov esi,DWORD [56+esp] + mov ebx,ecx + ror ecx,11 + mov edi,esi + ror esi,2 + xor ecx,ebx + shr ebx,3 + ror ecx,7 + xor esi,edi + xor ebx,ecx + ror esi,17 + add ebx,DWORD [64+esp] + shr edi,10 + add ebx,DWORD [36+esp] + mov ecx,edx + xor edi,esi + mov esi,DWORD [20+esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [24+esp] + xor edx,ecx + mov DWORD [64+esp],ebx + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [16+esp],ecx + xor edx,ecx + add ebx,DWORD [28+esp] + xor edi,esi + ror edx,6 + mov ecx,eax + add ebx,edi + ror ecx,9 + mov esi,eax + 
mov edi,DWORD [4+esp] + xor ecx,eax + mov DWORD [esp],eax + xor eax,edi + ror ecx,11 + and ebp,eax + lea edx,[1955562222+edx*1+ebx] + xor ecx,esi + xor ebp,edi + mov esi,DWORD [72+esp] + ror ecx,2 + add ebp,edx + add edx,DWORD [12+esp] + add ebp,ecx + mov ecx,DWORD [60+esp] + mov ebx,esi + ror esi,11 + mov edi,ecx + ror ecx,2 + xor esi,ebx + shr ebx,3 + ror esi,7 + xor ecx,edi + xor ebx,esi + ror ecx,17 + add ebx,DWORD [68+esp] + shr edi,10 + add ebx,DWORD [40+esp] + mov esi,edx + xor edi,ecx + mov ecx,DWORD [16+esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [20+esp] + xor edx,esi + mov DWORD [68+esp],ebx + xor ecx,edi + ror edx,5 + and ecx,esi + mov DWORD [12+esp],esi + xor edx,esi + add ebx,DWORD [24+esp] + xor edi,ecx + ror edx,6 + mov esi,ebp + add ebx,edi + ror esi,9 + mov ecx,ebp + mov edi,DWORD [esp] + xor esi,ebp + mov DWORD [28+esp],ebp + xor ebp,edi + ror esi,11 + and eax,ebp + lea edx,[2024104815+edx*1+ebx] + xor esi,ecx + xor eax,edi + mov ecx,DWORD [76+esp] + ror esi,2 + add eax,edx + add edx,DWORD [8+esp] + add eax,esi + mov esi,DWORD [64+esp] + mov ebx,ecx + ror ecx,11 + mov edi,esi + ror esi,2 + xor ecx,ebx + shr ebx,3 + ror ecx,7 + xor esi,edi + xor ebx,ecx + ror esi,17 + add ebx,DWORD [72+esp] + shr edi,10 + add ebx,DWORD [44+esp] + mov ecx,edx + xor edi,esi + mov esi,DWORD [12+esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [16+esp] + xor edx,ecx + mov DWORD [72+esp],ebx + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [8+esp],ecx + xor edx,ecx + add ebx,DWORD [20+esp] + xor edi,esi + ror edx,6 + mov ecx,eax + add ebx,edi + ror ecx,9 + mov esi,eax + mov edi,DWORD [28+esp] + xor ecx,eax + mov DWORD [24+esp],eax + xor eax,edi + ror ecx,11 + and ebp,eax + lea edx,[2227730452+edx*1+ebx] + xor ecx,esi + xor ebp,edi + mov esi,DWORD [80+esp] + ror ecx,2 + add ebp,edx + add edx,DWORD [4+esp] + add ebp,ecx + mov ecx,DWORD [68+esp] + mov ebx,esi + ror esi,11 + mov edi,ecx + ror ecx,2 + xor esi,ebx + shr ebx,3 + ror esi,7 + xor ecx,edi + xor ebx,esi + 
ror ecx,17 + add ebx,DWORD [76+esp] + shr edi,10 + add ebx,DWORD [48+esp] + mov esi,edx + xor edi,ecx + mov ecx,DWORD [8+esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [12+esp] + xor edx,esi + mov DWORD [76+esp],ebx + xor ecx,edi + ror edx,5 + and ecx,esi + mov DWORD [4+esp],esi + xor edx,esi + add ebx,DWORD [16+esp] + xor edi,ecx + ror edx,6 + mov esi,ebp + add ebx,edi + ror esi,9 + mov ecx,ebp + mov edi,DWORD [24+esp] + xor esi,ebp + mov DWORD [20+esp],ebp + xor ebp,edi + ror esi,11 + and eax,ebp + lea edx,[2361852424+edx*1+ebx] + xor esi,ecx + xor eax,edi + mov ecx,DWORD [84+esp] + ror esi,2 + add eax,edx + add edx,DWORD [esp] + add eax,esi + mov esi,DWORD [72+esp] + mov ebx,ecx + ror ecx,11 + mov edi,esi + ror esi,2 + xor ecx,ebx + shr ebx,3 + ror ecx,7 + xor esi,edi + xor ebx,ecx + ror esi,17 + add ebx,DWORD [80+esp] + shr edi,10 + add ebx,DWORD [52+esp] + mov ecx,edx + xor edi,esi + mov esi,DWORD [4+esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [8+esp] + xor edx,ecx + mov DWORD [80+esp],ebx + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [esp],ecx + xor edx,ecx + add ebx,DWORD [12+esp] + xor edi,esi + ror edx,6 + mov ecx,eax + add ebx,edi + ror ecx,9 + mov esi,eax + mov edi,DWORD [20+esp] + xor ecx,eax + mov DWORD [16+esp],eax + xor eax,edi + ror ecx,11 + and ebp,eax + lea edx,[2428436474+edx*1+ebx] + xor ecx,esi + xor ebp,edi + mov esi,DWORD [88+esp] + ror ecx,2 + add ebp,edx + add edx,DWORD [28+esp] + add ebp,ecx + mov ecx,DWORD [76+esp] + mov ebx,esi + ror esi,11 + mov edi,ecx + ror ecx,2 + xor esi,ebx + shr ebx,3 + ror esi,7 + xor ecx,edi + xor ebx,esi + ror ecx,17 + add ebx,DWORD [84+esp] + shr edi,10 + add ebx,DWORD [56+esp] + mov esi,edx + xor edi,ecx + mov ecx,DWORD [esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [4+esp] + xor edx,esi + mov DWORD [84+esp],ebx + xor ecx,edi + ror edx,5 + and ecx,esi + mov DWORD [28+esp],esi + xor edx,esi + add ebx,DWORD [8+esp] + xor edi,ecx + ror edx,6 + mov esi,ebp + add ebx,edi + ror esi,9 + mov ecx,ebp + 
mov edi,DWORD [16+esp] + xor esi,ebp + mov DWORD [12+esp],ebp + xor ebp,edi + ror esi,11 + and eax,ebp + lea edx,[2756734187+edx*1+ebx] + xor esi,ecx + xor eax,edi + mov ecx,DWORD [92+esp] + ror esi,2 + add eax,edx + add edx,DWORD [24+esp] + add eax,esi + mov esi,DWORD [80+esp] + mov ebx,ecx + ror ecx,11 + mov edi,esi + ror esi,2 + xor ecx,ebx + shr ebx,3 + ror ecx,7 + xor esi,edi + xor ebx,ecx + ror esi,17 + add ebx,DWORD [88+esp] + shr edi,10 + add ebx,DWORD [60+esp] + mov ecx,edx + xor edi,esi + mov esi,DWORD [28+esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [esp] + xor edx,ecx + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [24+esp],ecx + xor edx,ecx + add ebx,DWORD [4+esp] + xor edi,esi + ror edx,6 + mov ecx,eax + add ebx,edi + ror ecx,9 + mov esi,eax + mov edi,DWORD [12+esp] + xor ecx,eax + mov DWORD [8+esp],eax + xor eax,edi + ror ecx,11 + and ebp,eax + lea edx,[3204031479+edx*1+ebx] + xor ecx,esi + xor ebp,edi + mov esi,DWORD [32+esp] + ror ecx,2 + add ebp,edx + add edx,DWORD [20+esp] + add ebp,ecx + mov ecx,DWORD [84+esp] + mov ebx,esi + ror esi,11 + mov edi,ecx + ror ecx,2 + xor esi,ebx + shr ebx,3 + ror esi,7 + xor ecx,edi + xor ebx,esi + ror ecx,17 + add ebx,DWORD [92+esp] + shr edi,10 + add ebx,DWORD [64+esp] + mov esi,edx + xor edi,ecx + mov ecx,DWORD [24+esp] + ror edx,14 + add ebx,edi + mov edi,DWORD [28+esp] + xor edx,esi + xor ecx,edi + ror edx,5 + and ecx,esi + mov DWORD [20+esp],esi + xor edx,esi + add ebx,DWORD [esp] + xor edi,ecx + ror edx,6 + mov esi,ebp + add ebx,edi + ror esi,9 + mov ecx,ebp + mov edi,DWORD [8+esp] + xor esi,ebp + mov DWORD [4+esp],ebp + xor ebp,edi + ror esi,11 + and eax,ebp + lea edx,[3329325298+edx*1+ebx] + xor esi,ecx + xor eax,edi + ror esi,2 + add eax,edx + add edx,DWORD [16+esp] + add eax,esi + mov esi,DWORD [96+esp] + xor ebp,edi + mov ecx,DWORD [12+esp] + add eax,DWORD [esi] + add ebp,DWORD [4+esi] + add edi,DWORD [8+esi] + add ecx,DWORD [12+esi] + mov DWORD [esi],eax + mov DWORD [4+esi],ebp + mov DWORD 
[8+esi],edi + mov DWORD [12+esi],ecx + mov DWORD [4+esp],ebp + xor ebp,edi + mov DWORD [8+esp],edi + mov DWORD [12+esp],ecx + mov edi,DWORD [20+esp] + mov ebx,DWORD [24+esp] + mov ecx,DWORD [28+esp] + add edx,DWORD [16+esi] + add edi,DWORD [20+esi] + add ebx,DWORD [24+esi] + add ecx,DWORD [28+esi] + mov DWORD [16+esi],edx + mov DWORD [20+esi],edi + mov DWORD [24+esi],ebx + mov DWORD [28+esi],ecx + mov DWORD [20+esp],edi + mov edi,DWORD [100+esp] + mov DWORD [24+esp],ebx + mov DWORD [28+esp],ecx + cmp edi,DWORD [104+esp] + jb NEAR L$006grand_loop + mov esp,DWORD [108+esp] + pop edi + pop esi + pop ebx + pop ebp + ret +global _sha256_block_data_order_ssse3 +align 16 +_sha256_block_data_order_ssse3: +L$_sha256_block_data_order_ssse3_begin: + push ebp + push ebx + push esi + push edi + mov esi,DWORD [20+esp] + mov edi,DWORD [24+esp] + mov eax,DWORD [28+esp] + mov ebx,esp + call L$007pic_point +L$007pic_point: + pop ebp + lea ebp,[(L$K256-L$007pic_point)+ebp] + sub esp,16 + and esp,-64 + shl eax,6 + add eax,edi + mov DWORD [esp],esi + mov DWORD [4+esp],edi + mov DWORD [8+esp],eax + mov DWORD [12+esp],ebx + lea esp,[esp-96] + mov eax,DWORD [esi] + mov ebx,DWORD [4+esi] + mov ecx,DWORD [8+esi] + mov edi,DWORD [12+esi] + mov DWORD [4+esp],ebx + xor ebx,ecx + mov DWORD [8+esp],ecx + mov DWORD [12+esp],edi + mov edx,DWORD [16+esi] + mov edi,DWORD [20+esi] + mov ecx,DWORD [24+esi] + mov esi,DWORD [28+esi] + mov DWORD [20+esp],edi + mov edi,DWORD [100+esp] + mov DWORD [24+esp],ecx + mov DWORD [28+esp],esi + movdqa xmm7,[256+ebp] + jmp NEAR L$008grand_ssse3 +align 16 +L$008grand_ssse3: + movdqu xmm0,[edi] + movdqu xmm1,[16+edi] + movdqu xmm2,[32+edi] + movdqu xmm3,[48+edi] + add edi,64 + pshufb xmm0,xmm7 + mov DWORD [100+esp],edi + pshufb xmm1,xmm7 + movdqa xmm4,[ebp] + pshufb xmm2,xmm7 + movdqa xmm5,[16+ebp] + paddd xmm4,xmm0 + pshufb xmm3,xmm7 + movdqa xmm6,[32+ebp] + paddd xmm5,xmm1 + movdqa xmm7,[48+ebp] + movdqa [32+esp],xmm4 + paddd xmm6,xmm2 + movdqa [48+esp],xmm5 + 
paddd xmm7,xmm3 + movdqa [64+esp],xmm6 + movdqa [80+esp],xmm7 + jmp NEAR L$009ssse3_00_47 +align 16 +L$009ssse3_00_47: + add ebp,64 + mov ecx,edx + movdqa xmm4,xmm1 + ror edx,14 + mov esi,DWORD [20+esp] + movdqa xmm7,xmm3 + xor edx,ecx + mov edi,DWORD [24+esp] + palignr xmm4,xmm0,4 + xor esi,edi + ror edx,5 + and esi,ecx + palignr xmm7,xmm2,4 + mov DWORD [16+esp],ecx + xor edx,ecx + xor edi,esi + movdqa xmm5,xmm4 + ror edx,6 + mov ecx,eax + movdqa xmm6,xmm4 + add edx,edi + mov edi,DWORD [4+esp] + psrld xmm4,3 + mov esi,eax + ror ecx,9 + paddd xmm0,xmm7 + mov DWORD [esp],eax + xor ecx,eax + psrld xmm6,7 + xor eax,edi + add edx,DWORD [28+esp] + ror ecx,11 + and ebx,eax + pshufd xmm7,xmm3,250 + xor ecx,esi + add edx,DWORD [32+esp] + pslld xmm5,14 + xor ebx,edi + ror ecx,2 + pxor xmm4,xmm6 + add ebx,edx + add edx,DWORD [12+esp] + psrld xmm6,11 + add ebx,ecx + mov ecx,edx + ror edx,14 + pxor xmm4,xmm5 + mov esi,DWORD [16+esp] + xor edx,ecx + pslld xmm5,11 + mov edi,DWORD [20+esp] + xor esi,edi + ror edx,5 + pxor xmm4,xmm6 + and esi,ecx + mov DWORD [12+esp],ecx + movdqa xmm6,xmm7 + xor edx,ecx + xor edi,esi + ror edx,6 + pxor xmm4,xmm5 + mov ecx,ebx + add edx,edi + psrld xmm7,10 + mov edi,DWORD [esp] + mov esi,ebx + ror ecx,9 + paddd xmm0,xmm4 + mov DWORD [28+esp],ebx + xor ecx,ebx + psrlq xmm6,17 + xor ebx,edi + add edx,DWORD [24+esp] + ror ecx,11 + pxor xmm7,xmm6 + and eax,ebx + xor ecx,esi + psrlq xmm6,2 + add edx,DWORD [36+esp] + xor eax,edi + ror ecx,2 + pxor xmm7,xmm6 + add eax,edx + add edx,DWORD [8+esp] + pshufd xmm7,xmm7,128 + add eax,ecx + mov ecx,edx + ror edx,14 + mov esi,DWORD [12+esp] + xor edx,ecx + mov edi,DWORD [16+esp] + xor esi,edi + ror edx,5 + and esi,ecx + psrldq xmm7,8 + mov DWORD [8+esp],ecx + xor edx,ecx + xor edi,esi + paddd xmm0,xmm7 + ror edx,6 + mov ecx,eax + add edx,edi + mov edi,DWORD [28+esp] + mov esi,eax + ror ecx,9 + mov DWORD [24+esp],eax + pshufd xmm7,xmm0,80 + xor ecx,eax + xor eax,edi + add edx,DWORD [20+esp] + movdqa xmm6,xmm7 + 
ror ecx,11 + psrld xmm7,10 + and ebx,eax + psrlq xmm6,17 + xor ecx,esi + add edx,DWORD [40+esp] + xor ebx,edi + ror ecx,2 + pxor xmm7,xmm6 + add ebx,edx + add edx,DWORD [4+esp] + psrlq xmm6,2 + add ebx,ecx + mov ecx,edx + ror edx,14 + pxor xmm7,xmm6 + mov esi,DWORD [8+esp] + xor edx,ecx + mov edi,DWORD [12+esp] + pshufd xmm7,xmm7,8 + xor esi,edi + ror edx,5 + movdqa xmm6,[ebp] + and esi,ecx + mov DWORD [4+esp],ecx + pslldq xmm7,8 + xor edx,ecx + xor edi,esi + ror edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [24+esp] + mov esi,ebx + ror ecx,9 + paddd xmm0,xmm7 + mov DWORD [20+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [16+esp] + paddd xmm6,xmm0 + ror ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [44+esp] + xor eax,edi + ror ecx,2 + add eax,edx + add edx,DWORD [esp] + add eax,ecx + movdqa [32+esp],xmm6 + mov ecx,edx + movdqa xmm4,xmm2 + ror edx,14 + mov esi,DWORD [4+esp] + movdqa xmm7,xmm0 + xor edx,ecx + mov edi,DWORD [8+esp] + palignr xmm4,xmm1,4 + xor esi,edi + ror edx,5 + and esi,ecx + palignr xmm7,xmm3,4 + mov DWORD [esp],ecx + xor edx,ecx + xor edi,esi + movdqa xmm5,xmm4 + ror edx,6 + mov ecx,eax + movdqa xmm6,xmm4 + add edx,edi + mov edi,DWORD [20+esp] + psrld xmm4,3 + mov esi,eax + ror ecx,9 + paddd xmm1,xmm7 + mov DWORD [16+esp],eax + xor ecx,eax + psrld xmm6,7 + xor eax,edi + add edx,DWORD [12+esp] + ror ecx,11 + and ebx,eax + pshufd xmm7,xmm0,250 + xor ecx,esi + add edx,DWORD [48+esp] + pslld xmm5,14 + xor ebx,edi + ror ecx,2 + pxor xmm4,xmm6 + add ebx,edx + add edx,DWORD [28+esp] + psrld xmm6,11 + add ebx,ecx + mov ecx,edx + ror edx,14 + pxor xmm4,xmm5 + mov esi,DWORD [esp] + xor edx,ecx + pslld xmm5,11 + mov edi,DWORD [4+esp] + xor esi,edi + ror edx,5 + pxor xmm4,xmm6 + and esi,ecx + mov DWORD [28+esp],ecx + movdqa xmm6,xmm7 + xor edx,ecx + xor edi,esi + ror edx,6 + pxor xmm4,xmm5 + mov ecx,ebx + add edx,edi + psrld xmm7,10 + mov edi,DWORD [16+esp] + mov esi,ebx + ror ecx,9 + paddd xmm1,xmm4 + mov DWORD [12+esp],ebx + xor ecx,ebx + 
psrlq xmm6,17 + xor ebx,edi + add edx,DWORD [8+esp] + ror ecx,11 + pxor xmm7,xmm6 + and eax,ebx + xor ecx,esi + psrlq xmm6,2 + add edx,DWORD [52+esp] + xor eax,edi + ror ecx,2 + pxor xmm7,xmm6 + add eax,edx + add edx,DWORD [24+esp] + pshufd xmm7,xmm7,128 + add eax,ecx + mov ecx,edx + ror edx,14 + mov esi,DWORD [28+esp] + xor edx,ecx + mov edi,DWORD [esp] + xor esi,edi + ror edx,5 + and esi,ecx + psrldq xmm7,8 + mov DWORD [24+esp],ecx + xor edx,ecx + xor edi,esi + paddd xmm1,xmm7 + ror edx,6 + mov ecx,eax + add edx,edi + mov edi,DWORD [12+esp] + mov esi,eax + ror ecx,9 + mov DWORD [8+esp],eax + pshufd xmm7,xmm1,80 + xor ecx,eax + xor eax,edi + add edx,DWORD [4+esp] + movdqa xmm6,xmm7 + ror ecx,11 + psrld xmm7,10 + and ebx,eax + psrlq xmm6,17 + xor ecx,esi + add edx,DWORD [56+esp] + xor ebx,edi + ror ecx,2 + pxor xmm7,xmm6 + add ebx,edx + add edx,DWORD [20+esp] + psrlq xmm6,2 + add ebx,ecx + mov ecx,edx + ror edx,14 + pxor xmm7,xmm6 + mov esi,DWORD [24+esp] + xor edx,ecx + mov edi,DWORD [28+esp] + pshufd xmm7,xmm7,8 + xor esi,edi + ror edx,5 + movdqa xmm6,[16+ebp] + and esi,ecx + mov DWORD [20+esp],ecx + pslldq xmm7,8 + xor edx,ecx + xor edi,esi + ror edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [8+esp] + mov esi,ebx + ror ecx,9 + paddd xmm1,xmm7 + mov DWORD [4+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [esp] + paddd xmm6,xmm1 + ror ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [60+esp] + xor eax,edi + ror ecx,2 + add eax,edx + add edx,DWORD [16+esp] + add eax,ecx + movdqa [48+esp],xmm6 + mov ecx,edx + movdqa xmm4,xmm3 + ror edx,14 + mov esi,DWORD [20+esp] + movdqa xmm7,xmm1 + xor edx,ecx + mov edi,DWORD [24+esp] + palignr xmm4,xmm2,4 + xor esi,edi + ror edx,5 + and esi,ecx + palignr xmm7,xmm0,4 + mov DWORD [16+esp],ecx + xor edx,ecx + xor edi,esi + movdqa xmm5,xmm4 + ror edx,6 + mov ecx,eax + movdqa xmm6,xmm4 + add edx,edi + mov edi,DWORD [4+esp] + psrld xmm4,3 + mov esi,eax + ror ecx,9 + paddd xmm2,xmm7 + mov DWORD [esp],eax + xor ecx,eax + 
psrld xmm6,7 + xor eax,edi + add edx,DWORD [28+esp] + ror ecx,11 + and ebx,eax + pshufd xmm7,xmm1,250 + xor ecx,esi + add edx,DWORD [64+esp] + pslld xmm5,14 + xor ebx,edi + ror ecx,2 + pxor xmm4,xmm6 + add ebx,edx + add edx,DWORD [12+esp] + psrld xmm6,11 + add ebx,ecx + mov ecx,edx + ror edx,14 + pxor xmm4,xmm5 + mov esi,DWORD [16+esp] + xor edx,ecx + pslld xmm5,11 + mov edi,DWORD [20+esp] + xor esi,edi + ror edx,5 + pxor xmm4,xmm6 + and esi,ecx + mov DWORD [12+esp],ecx + movdqa xmm6,xmm7 + xor edx,ecx + xor edi,esi + ror edx,6 + pxor xmm4,xmm5 + mov ecx,ebx + add edx,edi + psrld xmm7,10 + mov edi,DWORD [esp] + mov esi,ebx + ror ecx,9 + paddd xmm2,xmm4 + mov DWORD [28+esp],ebx + xor ecx,ebx + psrlq xmm6,17 + xor ebx,edi + add edx,DWORD [24+esp] + ror ecx,11 + pxor xmm7,xmm6 + and eax,ebx + xor ecx,esi + psrlq xmm6,2 + add edx,DWORD [68+esp] + xor eax,edi + ror ecx,2 + pxor xmm7,xmm6 + add eax,edx + add edx,DWORD [8+esp] + pshufd xmm7,xmm7,128 + add eax,ecx + mov ecx,edx + ror edx,14 + mov esi,DWORD [12+esp] + xor edx,ecx + mov edi,DWORD [16+esp] + xor esi,edi + ror edx,5 + and esi,ecx + psrldq xmm7,8 + mov DWORD [8+esp],ecx + xor edx,ecx + xor edi,esi + paddd xmm2,xmm7 + ror edx,6 + mov ecx,eax + add edx,edi + mov edi,DWORD [28+esp] + mov esi,eax + ror ecx,9 + mov DWORD [24+esp],eax + pshufd xmm7,xmm2,80 + xor ecx,eax + xor eax,edi + add edx,DWORD [20+esp] + movdqa xmm6,xmm7 + ror ecx,11 + psrld xmm7,10 + and ebx,eax + psrlq xmm6,17 + xor ecx,esi + add edx,DWORD [72+esp] + xor ebx,edi + ror ecx,2 + pxor xmm7,xmm6 + add ebx,edx + add edx,DWORD [4+esp] + psrlq xmm6,2 + add ebx,ecx + mov ecx,edx + ror edx,14 + pxor xmm7,xmm6 + mov esi,DWORD [8+esp] + xor edx,ecx + mov edi,DWORD [12+esp] + pshufd xmm7,xmm7,8 + xor esi,edi + ror edx,5 + movdqa xmm6,[32+ebp] + and esi,ecx + mov DWORD [4+esp],ecx + pslldq xmm7,8 + xor edx,ecx + xor edi,esi + ror edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [24+esp] + mov esi,ebx + ror ecx,9 + paddd xmm2,xmm7 + mov DWORD [20+esp],ebx 
+ xor ecx,ebx + xor ebx,edi + add edx,DWORD [16+esp] + paddd xmm6,xmm2 + ror ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [76+esp] + xor eax,edi + ror ecx,2 + add eax,edx + add edx,DWORD [esp] + add eax,ecx + movdqa [64+esp],xmm6 + mov ecx,edx + movdqa xmm4,xmm0 + ror edx,14 + mov esi,DWORD [4+esp] + movdqa xmm7,xmm2 + xor edx,ecx + mov edi,DWORD [8+esp] + palignr xmm4,xmm3,4 + xor esi,edi + ror edx,5 + and esi,ecx + palignr xmm7,xmm1,4 + mov DWORD [esp],ecx + xor edx,ecx + xor edi,esi + movdqa xmm5,xmm4 + ror edx,6 + mov ecx,eax + movdqa xmm6,xmm4 + add edx,edi + mov edi,DWORD [20+esp] + psrld xmm4,3 + mov esi,eax + ror ecx,9 + paddd xmm3,xmm7 + mov DWORD [16+esp],eax + xor ecx,eax + psrld xmm6,7 + xor eax,edi + add edx,DWORD [12+esp] + ror ecx,11 + and ebx,eax + pshufd xmm7,xmm2,250 + xor ecx,esi + add edx,DWORD [80+esp] + pslld xmm5,14 + xor ebx,edi + ror ecx,2 + pxor xmm4,xmm6 + add ebx,edx + add edx,DWORD [28+esp] + psrld xmm6,11 + add ebx,ecx + mov ecx,edx + ror edx,14 + pxor xmm4,xmm5 + mov esi,DWORD [esp] + xor edx,ecx + pslld xmm5,11 + mov edi,DWORD [4+esp] + xor esi,edi + ror edx,5 + pxor xmm4,xmm6 + and esi,ecx + mov DWORD [28+esp],ecx + movdqa xmm6,xmm7 + xor edx,ecx + xor edi,esi + ror edx,6 + pxor xmm4,xmm5 + mov ecx,ebx + add edx,edi + psrld xmm7,10 + mov edi,DWORD [16+esp] + mov esi,ebx + ror ecx,9 + paddd xmm3,xmm4 + mov DWORD [12+esp],ebx + xor ecx,ebx + psrlq xmm6,17 + xor ebx,edi + add edx,DWORD [8+esp] + ror ecx,11 + pxor xmm7,xmm6 + and eax,ebx + xor ecx,esi + psrlq xmm6,2 + add edx,DWORD [84+esp] + xor eax,edi + ror ecx,2 + pxor xmm7,xmm6 + add eax,edx + add edx,DWORD [24+esp] + pshufd xmm7,xmm7,128 + add eax,ecx + mov ecx,edx + ror edx,14 + mov esi,DWORD [28+esp] + xor edx,ecx + mov edi,DWORD [esp] + xor esi,edi + ror edx,5 + and esi,ecx + psrldq xmm7,8 + mov DWORD [24+esp],ecx + xor edx,ecx + xor edi,esi + paddd xmm3,xmm7 + ror edx,6 + mov ecx,eax + add edx,edi + mov edi,DWORD [12+esp] + mov esi,eax + ror ecx,9 + mov DWORD [8+esp],eax 
+ pshufd xmm7,xmm3,80 + xor ecx,eax + xor eax,edi + add edx,DWORD [4+esp] + movdqa xmm6,xmm7 + ror ecx,11 + psrld xmm7,10 + and ebx,eax + psrlq xmm6,17 + xor ecx,esi + add edx,DWORD [88+esp] + xor ebx,edi + ror ecx,2 + pxor xmm7,xmm6 + add ebx,edx + add edx,DWORD [20+esp] + psrlq xmm6,2 + add ebx,ecx + mov ecx,edx + ror edx,14 + pxor xmm7,xmm6 + mov esi,DWORD [24+esp] + xor edx,ecx + mov edi,DWORD [28+esp] + pshufd xmm7,xmm7,8 + xor esi,edi + ror edx,5 + movdqa xmm6,[48+ebp] + and esi,ecx + mov DWORD [20+esp],ecx + pslldq xmm7,8 + xor edx,ecx + xor edi,esi + ror edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [8+esp] + mov esi,ebx + ror ecx,9 + paddd xmm3,xmm7 + mov DWORD [4+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [esp] + paddd xmm6,xmm3 + ror ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [92+esp] + xor eax,edi + ror ecx,2 + add eax,edx + add edx,DWORD [16+esp] + add eax,ecx + movdqa [80+esp],xmm6 + cmp DWORD [64+ebp],66051 + jne NEAR L$009ssse3_00_47 + mov ecx,edx + ror edx,14 + mov esi,DWORD [20+esp] + xor edx,ecx + mov edi,DWORD [24+esp] + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [16+esp],ecx + xor edx,ecx + xor edi,esi + ror edx,6 + mov ecx,eax + add edx,edi + mov edi,DWORD [4+esp] + mov esi,eax + ror ecx,9 + mov DWORD [esp],eax + xor ecx,eax + xor eax,edi + add edx,DWORD [28+esp] + ror ecx,11 + and ebx,eax + xor ecx,esi + add edx,DWORD [32+esp] + xor ebx,edi + ror ecx,2 + add ebx,edx + add edx,DWORD [12+esp] + add ebx,ecx + mov ecx,edx + ror edx,14 + mov esi,DWORD [16+esp] + xor edx,ecx + mov edi,DWORD [20+esp] + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [12+esp],ecx + xor edx,ecx + xor edi,esi + ror edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [esp] + mov esi,ebx + ror ecx,9 + mov DWORD [28+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [24+esp] + ror ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [36+esp] + xor eax,edi + ror ecx,2 + add eax,edx + add edx,DWORD [8+esp] + add eax,ecx + mov ecx,edx + ror 
edx,14 + mov esi,DWORD [12+esp] + xor edx,ecx + mov edi,DWORD [16+esp] + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [8+esp],ecx + xor edx,ecx + xor edi,esi + ror edx,6 + mov ecx,eax + add edx,edi + mov edi,DWORD [28+esp] + mov esi,eax + ror ecx,9 + mov DWORD [24+esp],eax + xor ecx,eax + xor eax,edi + add edx,DWORD [20+esp] + ror ecx,11 + and ebx,eax + xor ecx,esi + add edx,DWORD [40+esp] + xor ebx,edi + ror ecx,2 + add ebx,edx + add edx,DWORD [4+esp] + add ebx,ecx + mov ecx,edx + ror edx,14 + mov esi,DWORD [8+esp] + xor edx,ecx + mov edi,DWORD [12+esp] + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [4+esp],ecx + xor edx,ecx + xor edi,esi + ror edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [24+esp] + mov esi,ebx + ror ecx,9 + mov DWORD [20+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [16+esp] + ror ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [44+esp] + xor eax,edi + ror ecx,2 + add eax,edx + add edx,DWORD [esp] + add eax,ecx + mov ecx,edx + ror edx,14 + mov esi,DWORD [4+esp] + xor edx,ecx + mov edi,DWORD [8+esp] + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [esp],ecx + xor edx,ecx + xor edi,esi + ror edx,6 + mov ecx,eax + add edx,edi + mov edi,DWORD [20+esp] + mov esi,eax + ror ecx,9 + mov DWORD [16+esp],eax + xor ecx,eax + xor eax,edi + add edx,DWORD [12+esp] + ror ecx,11 + and ebx,eax + xor ecx,esi + add edx,DWORD [48+esp] + xor ebx,edi + ror ecx,2 + add ebx,edx + add edx,DWORD [28+esp] + add ebx,ecx + mov ecx,edx + ror edx,14 + mov esi,DWORD [esp] + xor edx,ecx + mov edi,DWORD [4+esp] + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [28+esp],ecx + xor edx,ecx + xor edi,esi + ror edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [16+esp] + mov esi,ebx + ror ecx,9 + mov DWORD [12+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [8+esp] + ror ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [52+esp] + xor eax,edi + ror ecx,2 + add eax,edx + add edx,DWORD [24+esp] + add eax,ecx + mov ecx,edx + ror edx,14 + mov esi,DWORD 
[28+esp] + xor edx,ecx + mov edi,DWORD [esp] + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [24+esp],ecx + xor edx,ecx + xor edi,esi + ror edx,6 + mov ecx,eax + add edx,edi + mov edi,DWORD [12+esp] + mov esi,eax + ror ecx,9 + mov DWORD [8+esp],eax + xor ecx,eax + xor eax,edi + add edx,DWORD [4+esp] + ror ecx,11 + and ebx,eax + xor ecx,esi + add edx,DWORD [56+esp] + xor ebx,edi + ror ecx,2 + add ebx,edx + add edx,DWORD [20+esp] + add ebx,ecx + mov ecx,edx + ror edx,14 + mov esi,DWORD [24+esp] + xor edx,ecx + mov edi,DWORD [28+esp] + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [20+esp],ecx + xor edx,ecx + xor edi,esi + ror edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [8+esp] + mov esi,ebx + ror ecx,9 + mov DWORD [4+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [esp] + ror ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [60+esp] + xor eax,edi + ror ecx,2 + add eax,edx + add edx,DWORD [16+esp] + add eax,ecx + mov ecx,edx + ror edx,14 + mov esi,DWORD [20+esp] + xor edx,ecx + mov edi,DWORD [24+esp] + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [16+esp],ecx + xor edx,ecx + xor edi,esi + ror edx,6 + mov ecx,eax + add edx,edi + mov edi,DWORD [4+esp] + mov esi,eax + ror ecx,9 + mov DWORD [esp],eax + xor ecx,eax + xor eax,edi + add edx,DWORD [28+esp] + ror ecx,11 + and ebx,eax + xor ecx,esi + add edx,DWORD [64+esp] + xor ebx,edi + ror ecx,2 + add ebx,edx + add edx,DWORD [12+esp] + add ebx,ecx + mov ecx,edx + ror edx,14 + mov esi,DWORD [16+esp] + xor edx,ecx + mov edi,DWORD [20+esp] + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [12+esp],ecx + xor edx,ecx + xor edi,esi + ror edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [esp] + mov esi,ebx + ror ecx,9 + mov DWORD [28+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [24+esp] + ror ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [68+esp] + xor eax,edi + ror ecx,2 + add eax,edx + add edx,DWORD [8+esp] + add eax,ecx + mov ecx,edx + ror edx,14 + mov esi,DWORD [12+esp] + xor edx,ecx + 
mov edi,DWORD [16+esp] + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [8+esp],ecx + xor edx,ecx + xor edi,esi + ror edx,6 + mov ecx,eax + add edx,edi + mov edi,DWORD [28+esp] + mov esi,eax + ror ecx,9 + mov DWORD [24+esp],eax + xor ecx,eax + xor eax,edi + add edx,DWORD [20+esp] + ror ecx,11 + and ebx,eax + xor ecx,esi + add edx,DWORD [72+esp] + xor ebx,edi + ror ecx,2 + add ebx,edx + add edx,DWORD [4+esp] + add ebx,ecx + mov ecx,edx + ror edx,14 + mov esi,DWORD [8+esp] + xor edx,ecx + mov edi,DWORD [12+esp] + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [4+esp],ecx + xor edx,ecx + xor edi,esi + ror edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [24+esp] + mov esi,ebx + ror ecx,9 + mov DWORD [20+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [16+esp] + ror ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [76+esp] + xor eax,edi + ror ecx,2 + add eax,edx + add edx,DWORD [esp] + add eax,ecx + mov ecx,edx + ror edx,14 + mov esi,DWORD [4+esp] + xor edx,ecx + mov edi,DWORD [8+esp] + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [esp],ecx + xor edx,ecx + xor edi,esi + ror edx,6 + mov ecx,eax + add edx,edi + mov edi,DWORD [20+esp] + mov esi,eax + ror ecx,9 + mov DWORD [16+esp],eax + xor ecx,eax + xor eax,edi + add edx,DWORD [12+esp] + ror ecx,11 + and ebx,eax + xor ecx,esi + add edx,DWORD [80+esp] + xor ebx,edi + ror ecx,2 + add ebx,edx + add edx,DWORD [28+esp] + add ebx,ecx + mov ecx,edx + ror edx,14 + mov esi,DWORD [esp] + xor edx,ecx + mov edi,DWORD [4+esp] + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [28+esp],ecx + xor edx,ecx + xor edi,esi + ror edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [16+esp] + mov esi,ebx + ror ecx,9 + mov DWORD [12+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [8+esp] + ror ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [84+esp] + xor eax,edi + ror ecx,2 + add eax,edx + add edx,DWORD [24+esp] + add eax,ecx + mov ecx,edx + ror edx,14 + mov esi,DWORD [28+esp] + xor edx,ecx + mov edi,DWORD [esp] + 
xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [24+esp],ecx + xor edx,ecx + xor edi,esi + ror edx,6 + mov ecx,eax + add edx,edi + mov edi,DWORD [12+esp] + mov esi,eax + ror ecx,9 + mov DWORD [8+esp],eax + xor ecx,eax + xor eax,edi + add edx,DWORD [4+esp] + ror ecx,11 + and ebx,eax + xor ecx,esi + add edx,DWORD [88+esp] + xor ebx,edi + ror ecx,2 + add ebx,edx + add edx,DWORD [20+esp] + add ebx,ecx + mov ecx,edx + ror edx,14 + mov esi,DWORD [24+esp] + xor edx,ecx + mov edi,DWORD [28+esp] + xor esi,edi + ror edx,5 + and esi,ecx + mov DWORD [20+esp],ecx + xor edx,ecx + xor edi,esi + ror edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [8+esp] + mov esi,ebx + ror ecx,9 + mov DWORD [4+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [esp] + ror ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [92+esp] + xor eax,edi + ror ecx,2 + add eax,edx + add edx,DWORD [16+esp] + add eax,ecx + mov esi,DWORD [96+esp] + xor ebx,edi + mov ecx,DWORD [12+esp] + add eax,DWORD [esi] + add ebx,DWORD [4+esi] + add edi,DWORD [8+esi] + add ecx,DWORD [12+esi] + mov DWORD [esi],eax + mov DWORD [4+esi],ebx + mov DWORD [8+esi],edi + mov DWORD [12+esi],ecx + mov DWORD [4+esp],ebx + xor ebx,edi + mov DWORD [8+esp],edi + mov DWORD [12+esp],ecx + mov edi,DWORD [20+esp] + mov ecx,DWORD [24+esp] + add edx,DWORD [16+esi] + add edi,DWORD [20+esi] + add ecx,DWORD [24+esi] + mov DWORD [16+esi],edx + mov DWORD [20+esi],edi + mov DWORD [20+esp],edi + mov edi,DWORD [28+esp] + mov DWORD [24+esi],ecx + add edi,DWORD [28+esi] + mov DWORD [24+esp],ecx + mov DWORD [28+esi],edi + mov DWORD [28+esp],edi + mov edi,DWORD [100+esp] + movdqa xmm7,[64+ebp] + sub ebp,192 + cmp edi,DWORD [104+esp] + jb NEAR L$008grand_ssse3 + mov esp,DWORD [108+esp] + pop edi + pop esi + pop ebx + pop ebp + ret +global _sha256_block_data_order_avx +align 16 +_sha256_block_data_order_avx: +L$_sha256_block_data_order_avx_begin: + push ebp + push ebx + push esi + push edi + mov esi,DWORD [20+esp] + mov edi,DWORD [24+esp] + mov 
eax,DWORD [28+esp] + mov ebx,esp + call L$010pic_point +L$010pic_point: + pop ebp + lea ebp,[(L$K256-L$010pic_point)+ebp] + sub esp,16 + and esp,-64 + shl eax,6 + add eax,edi + mov DWORD [esp],esi + mov DWORD [4+esp],edi + mov DWORD [8+esp],eax + mov DWORD [12+esp],ebx + lea esp,[esp-96] + vzeroall + mov eax,DWORD [esi] + mov ebx,DWORD [4+esi] + mov ecx,DWORD [8+esi] + mov edi,DWORD [12+esi] + mov DWORD [4+esp],ebx + xor ebx,ecx + mov DWORD [8+esp],ecx + mov DWORD [12+esp],edi + mov edx,DWORD [16+esi] + mov edi,DWORD [20+esi] + mov ecx,DWORD [24+esi] + mov esi,DWORD [28+esi] + mov DWORD [20+esp],edi + mov edi,DWORD [100+esp] + mov DWORD [24+esp],ecx + mov DWORD [28+esp],esi + vmovdqa xmm7,[256+ebp] + jmp NEAR L$011grand_avx +align 32 +L$011grand_avx: + vmovdqu xmm0,[edi] + vmovdqu xmm1,[16+edi] + vmovdqu xmm2,[32+edi] + vmovdqu xmm3,[48+edi] + add edi,64 + vpshufb xmm0,xmm0,xmm7 + mov DWORD [100+esp],edi + vpshufb xmm1,xmm1,xmm7 + vpshufb xmm2,xmm2,xmm7 + vpaddd xmm4,xmm0,[ebp] + vpshufb xmm3,xmm3,xmm7 + vpaddd xmm5,xmm1,[16+ebp] + vpaddd xmm6,xmm2,[32+ebp] + vpaddd xmm7,xmm3,[48+ebp] + vmovdqa [32+esp],xmm4 + vmovdqa [48+esp],xmm5 + vmovdqa [64+esp],xmm6 + vmovdqa [80+esp],xmm7 + jmp NEAR L$012avx_00_47 +align 16 +L$012avx_00_47: + add ebp,64 + vpalignr xmm4,xmm1,xmm0,4 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [20+esp] + vpalignr xmm7,xmm3,xmm2,4 + xor edx,ecx + mov edi,DWORD [24+esp] + xor esi,edi + vpsrld xmm6,xmm4,7 + shrd edx,edx,5 + and esi,ecx + mov DWORD [16+esp],ecx + vpaddd xmm0,xmm0,xmm7 + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + vpsrld xmm7,xmm4,3 + mov ecx,eax + add edx,edi + mov edi,DWORD [4+esp] + vpslld xmm5,xmm4,14 + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [esp],eax + vpxor xmm4,xmm7,xmm6 + xor ecx,eax + xor eax,edi + add edx,DWORD [28+esp] + vpshufd xmm7,xmm3,250 + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + vpsrld xmm6,xmm6,11 + add edx,DWORD [32+esp] + xor ebx,edi + shrd ecx,ecx,2 + vpxor xmm4,xmm4,xmm5 + add ebx,edx + add 
edx,DWORD [12+esp] + add ebx,ecx + vpslld xmm5,xmm5,11 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [16+esp] + vpxor xmm4,xmm4,xmm6 + xor edx,ecx + mov edi,DWORD [20+esp] + xor esi,edi + vpsrld xmm6,xmm7,10 + shrd edx,edx,5 + and esi,ecx + mov DWORD [12+esp],ecx + vpxor xmm4,xmm4,xmm5 + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + vpsrlq xmm5,xmm7,17 + mov ecx,ebx + add edx,edi + mov edi,DWORD [esp] + vpaddd xmm0,xmm0,xmm4 + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [28+esp],ebx + vpxor xmm6,xmm6,xmm5 + xor ecx,ebx + xor ebx,edi + add edx,DWORD [24+esp] + vpsrlq xmm7,xmm7,19 + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + vpxor xmm6,xmm6,xmm7 + add edx,DWORD [36+esp] + xor eax,edi + shrd ecx,ecx,2 + vpshufd xmm7,xmm6,132 + add eax,edx + add edx,DWORD [8+esp] + add eax,ecx + vpsrldq xmm7,xmm7,8 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [12+esp] + vpaddd xmm0,xmm0,xmm7 + xor edx,ecx + mov edi,DWORD [16+esp] + xor esi,edi + vpshufd xmm7,xmm0,80 + shrd edx,edx,5 + and esi,ecx + mov DWORD [8+esp],ecx + vpsrld xmm6,xmm7,10 + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + vpsrlq xmm5,xmm7,17 + mov ecx,eax + add edx,edi + mov edi,DWORD [28+esp] + vpxor xmm6,xmm6,xmm5 + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [24+esp],eax + vpsrlq xmm7,xmm7,19 + xor ecx,eax + xor eax,edi + add edx,DWORD [20+esp] + vpxor xmm6,xmm6,xmm7 + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + vpshufd xmm7,xmm6,232 + add edx,DWORD [40+esp] + xor ebx,edi + shrd ecx,ecx,2 + vpslldq xmm7,xmm7,8 + add ebx,edx + add edx,DWORD [4+esp] + add ebx,ecx + vpaddd xmm0,xmm0,xmm7 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [8+esp] + vpaddd xmm6,xmm0,[ebp] + xor edx,ecx + mov edi,DWORD [12+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [4+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [24+esp] + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [20+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [16+esp] + shrd ecx,ecx,11 + and eax,ebx + xor 
ecx,esi + add edx,DWORD [44+esp] + xor eax,edi + shrd ecx,ecx,2 + add eax,edx + add edx,DWORD [esp] + add eax,ecx + vmovdqa [32+esp],xmm6 + vpalignr xmm4,xmm2,xmm1,4 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [4+esp] + vpalignr xmm7,xmm0,xmm3,4 + xor edx,ecx + mov edi,DWORD [8+esp] + xor esi,edi + vpsrld xmm6,xmm4,7 + shrd edx,edx,5 + and esi,ecx + mov DWORD [esp],ecx + vpaddd xmm1,xmm1,xmm7 + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + vpsrld xmm7,xmm4,3 + mov ecx,eax + add edx,edi + mov edi,DWORD [20+esp] + vpslld xmm5,xmm4,14 + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [16+esp],eax + vpxor xmm4,xmm7,xmm6 + xor ecx,eax + xor eax,edi + add edx,DWORD [12+esp] + vpshufd xmm7,xmm0,250 + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + vpsrld xmm6,xmm6,11 + add edx,DWORD [48+esp] + xor ebx,edi + shrd ecx,ecx,2 + vpxor xmm4,xmm4,xmm5 + add ebx,edx + add edx,DWORD [28+esp] + add ebx,ecx + vpslld xmm5,xmm5,11 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [esp] + vpxor xmm4,xmm4,xmm6 + xor edx,ecx + mov edi,DWORD [4+esp] + xor esi,edi + vpsrld xmm6,xmm7,10 + shrd edx,edx,5 + and esi,ecx + mov DWORD [28+esp],ecx + vpxor xmm4,xmm4,xmm5 + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + vpsrlq xmm5,xmm7,17 + mov ecx,ebx + add edx,edi + mov edi,DWORD [16+esp] + vpaddd xmm1,xmm1,xmm4 + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [12+esp],ebx + vpxor xmm6,xmm6,xmm5 + xor ecx,ebx + xor ebx,edi + add edx,DWORD [8+esp] + vpsrlq xmm7,xmm7,19 + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + vpxor xmm6,xmm6,xmm7 + add edx,DWORD [52+esp] + xor eax,edi + shrd ecx,ecx,2 + vpshufd xmm7,xmm6,132 + add eax,edx + add edx,DWORD [24+esp] + add eax,ecx + vpsrldq xmm7,xmm7,8 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [28+esp] + vpaddd xmm1,xmm1,xmm7 + xor edx,ecx + mov edi,DWORD [esp] + xor esi,edi + vpshufd xmm7,xmm1,80 + shrd edx,edx,5 + and esi,ecx + mov DWORD [24+esp],ecx + vpsrld xmm6,xmm7,10 + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + vpsrlq xmm5,xmm7,17 + mov ecx,eax + add edx,edi + 
mov edi,DWORD [12+esp] + vpxor xmm6,xmm6,xmm5 + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [8+esp],eax + vpsrlq xmm7,xmm7,19 + xor ecx,eax + xor eax,edi + add edx,DWORD [4+esp] + vpxor xmm6,xmm6,xmm7 + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + vpshufd xmm7,xmm6,232 + add edx,DWORD [56+esp] + xor ebx,edi + shrd ecx,ecx,2 + vpslldq xmm7,xmm7,8 + add ebx,edx + add edx,DWORD [20+esp] + add ebx,ecx + vpaddd xmm1,xmm1,xmm7 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [24+esp] + vpaddd xmm6,xmm1,[16+ebp] + xor edx,ecx + mov edi,DWORD [28+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [20+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [8+esp] + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [4+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [esp] + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [60+esp] + xor eax,edi + shrd ecx,ecx,2 + add eax,edx + add edx,DWORD [16+esp] + add eax,ecx + vmovdqa [48+esp],xmm6 + vpalignr xmm4,xmm3,xmm2,4 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [20+esp] + vpalignr xmm7,xmm1,xmm0,4 + xor edx,ecx + mov edi,DWORD [24+esp] + xor esi,edi + vpsrld xmm6,xmm4,7 + shrd edx,edx,5 + and esi,ecx + mov DWORD [16+esp],ecx + vpaddd xmm2,xmm2,xmm7 + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + vpsrld xmm7,xmm4,3 + mov ecx,eax + add edx,edi + mov edi,DWORD [4+esp] + vpslld xmm5,xmm4,14 + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [esp],eax + vpxor xmm4,xmm7,xmm6 + xor ecx,eax + xor eax,edi + add edx,DWORD [28+esp] + vpshufd xmm7,xmm1,250 + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + vpsrld xmm6,xmm6,11 + add edx,DWORD [64+esp] + xor ebx,edi + shrd ecx,ecx,2 + vpxor xmm4,xmm4,xmm5 + add ebx,edx + add edx,DWORD [12+esp] + add ebx,ecx + vpslld xmm5,xmm5,11 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [16+esp] + vpxor xmm4,xmm4,xmm6 + xor edx,ecx + mov edi,DWORD [20+esp] + xor esi,edi + vpsrld xmm6,xmm7,10 + shrd edx,edx,5 + and esi,ecx + mov DWORD [12+esp],ecx + vpxor 
xmm4,xmm4,xmm5 + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + vpsrlq xmm5,xmm7,17 + mov ecx,ebx + add edx,edi + mov edi,DWORD [esp] + vpaddd xmm2,xmm2,xmm4 + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [28+esp],ebx + vpxor xmm6,xmm6,xmm5 + xor ecx,ebx + xor ebx,edi + add edx,DWORD [24+esp] + vpsrlq xmm7,xmm7,19 + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + vpxor xmm6,xmm6,xmm7 + add edx,DWORD [68+esp] + xor eax,edi + shrd ecx,ecx,2 + vpshufd xmm7,xmm6,132 + add eax,edx + add edx,DWORD [8+esp] + add eax,ecx + vpsrldq xmm7,xmm7,8 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [12+esp] + vpaddd xmm2,xmm2,xmm7 + xor edx,ecx + mov edi,DWORD [16+esp] + xor esi,edi + vpshufd xmm7,xmm2,80 + shrd edx,edx,5 + and esi,ecx + mov DWORD [8+esp],ecx + vpsrld xmm6,xmm7,10 + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + vpsrlq xmm5,xmm7,17 + mov ecx,eax + add edx,edi + mov edi,DWORD [28+esp] + vpxor xmm6,xmm6,xmm5 + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [24+esp],eax + vpsrlq xmm7,xmm7,19 + xor ecx,eax + xor eax,edi + add edx,DWORD [20+esp] + vpxor xmm6,xmm6,xmm7 + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + vpshufd xmm7,xmm6,232 + add edx,DWORD [72+esp] + xor ebx,edi + shrd ecx,ecx,2 + vpslldq xmm7,xmm7,8 + add ebx,edx + add edx,DWORD [4+esp] + add ebx,ecx + vpaddd xmm2,xmm2,xmm7 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [8+esp] + vpaddd xmm6,xmm2,[32+ebp] + xor edx,ecx + mov edi,DWORD [12+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [4+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [24+esp] + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [20+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [16+esp] + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [76+esp] + xor eax,edi + shrd ecx,ecx,2 + add eax,edx + add edx,DWORD [esp] + add eax,ecx + vmovdqa [64+esp],xmm6 + vpalignr xmm4,xmm0,xmm3,4 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [4+esp] + vpalignr xmm7,xmm2,xmm1,4 + xor edx,ecx + mov 
edi,DWORD [8+esp] + xor esi,edi + vpsrld xmm6,xmm4,7 + shrd edx,edx,5 + and esi,ecx + mov DWORD [esp],ecx + vpaddd xmm3,xmm3,xmm7 + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + vpsrld xmm7,xmm4,3 + mov ecx,eax + add edx,edi + mov edi,DWORD [20+esp] + vpslld xmm5,xmm4,14 + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [16+esp],eax + vpxor xmm4,xmm7,xmm6 + xor ecx,eax + xor eax,edi + add edx,DWORD [12+esp] + vpshufd xmm7,xmm2,250 + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + vpsrld xmm6,xmm6,11 + add edx,DWORD [80+esp] + xor ebx,edi + shrd ecx,ecx,2 + vpxor xmm4,xmm4,xmm5 + add ebx,edx + add edx,DWORD [28+esp] + add ebx,ecx + vpslld xmm5,xmm5,11 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [esp] + vpxor xmm4,xmm4,xmm6 + xor edx,ecx + mov edi,DWORD [4+esp] + xor esi,edi + vpsrld xmm6,xmm7,10 + shrd edx,edx,5 + and esi,ecx + mov DWORD [28+esp],ecx + vpxor xmm4,xmm4,xmm5 + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + vpsrlq xmm5,xmm7,17 + mov ecx,ebx + add edx,edi + mov edi,DWORD [16+esp] + vpaddd xmm3,xmm3,xmm4 + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [12+esp],ebx + vpxor xmm6,xmm6,xmm5 + xor ecx,ebx + xor ebx,edi + add edx,DWORD [8+esp] + vpsrlq xmm7,xmm7,19 + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + vpxor xmm6,xmm6,xmm7 + add edx,DWORD [84+esp] + xor eax,edi + shrd ecx,ecx,2 + vpshufd xmm7,xmm6,132 + add eax,edx + add edx,DWORD [24+esp] + add eax,ecx + vpsrldq xmm7,xmm7,8 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [28+esp] + vpaddd xmm3,xmm3,xmm7 + xor edx,ecx + mov edi,DWORD [esp] + xor esi,edi + vpshufd xmm7,xmm3,80 + shrd edx,edx,5 + and esi,ecx + mov DWORD [24+esp],ecx + vpsrld xmm6,xmm7,10 + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + vpsrlq xmm5,xmm7,17 + mov ecx,eax + add edx,edi + mov edi,DWORD [12+esp] + vpxor xmm6,xmm6,xmm5 + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [8+esp],eax + vpsrlq xmm7,xmm7,19 + xor ecx,eax + xor eax,edi + add edx,DWORD [4+esp] + vpxor xmm6,xmm6,xmm7 + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + vpshufd xmm7,xmm6,232 + 
add edx,DWORD [88+esp] + xor ebx,edi + shrd ecx,ecx,2 + vpslldq xmm7,xmm7,8 + add ebx,edx + add edx,DWORD [20+esp] + add ebx,ecx + vpaddd xmm3,xmm3,xmm7 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [24+esp] + vpaddd xmm6,xmm3,[48+ebp] + xor edx,ecx + mov edi,DWORD [28+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [20+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [8+esp] + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [4+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [esp] + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [92+esp] + xor eax,edi + shrd ecx,ecx,2 + add eax,edx + add edx,DWORD [16+esp] + add eax,ecx + vmovdqa [80+esp],xmm6 + cmp DWORD [64+ebp],66051 + jne NEAR L$012avx_00_47 + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [20+esp] + xor edx,ecx + mov edi,DWORD [24+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [16+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,eax + add edx,edi + mov edi,DWORD [4+esp] + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [esp],eax + xor ecx,eax + xor eax,edi + add edx,DWORD [28+esp] + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + add edx,DWORD [32+esp] + xor ebx,edi + shrd ecx,ecx,2 + add ebx,edx + add edx,DWORD [12+esp] + add ebx,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [16+esp] + xor edx,ecx + mov edi,DWORD [20+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [12+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [esp] + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [28+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [24+esp] + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [36+esp] + xor eax,edi + shrd ecx,ecx,2 + add eax,edx + add edx,DWORD [8+esp] + add eax,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [12+esp] + xor edx,ecx + mov edi,DWORD [16+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [8+esp],ecx + xor 
edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,eax + add edx,edi + mov edi,DWORD [28+esp] + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [24+esp],eax + xor ecx,eax + xor eax,edi + add edx,DWORD [20+esp] + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + add edx,DWORD [40+esp] + xor ebx,edi + shrd ecx,ecx,2 + add ebx,edx + add edx,DWORD [4+esp] + add ebx,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [8+esp] + xor edx,ecx + mov edi,DWORD [12+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [4+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [24+esp] + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [20+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [16+esp] + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [44+esp] + xor eax,edi + shrd ecx,ecx,2 + add eax,edx + add edx,DWORD [esp] + add eax,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [4+esp] + xor edx,ecx + mov edi,DWORD [8+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,eax + add edx,edi + mov edi,DWORD [20+esp] + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [16+esp],eax + xor ecx,eax + xor eax,edi + add edx,DWORD [12+esp] + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + add edx,DWORD [48+esp] + xor ebx,edi + shrd ecx,ecx,2 + add ebx,edx + add edx,DWORD [28+esp] + add ebx,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [esp] + xor edx,ecx + mov edi,DWORD [4+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [28+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [16+esp] + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [12+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [8+esp] + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [52+esp] + xor eax,edi + shrd ecx,ecx,2 + add eax,edx + add edx,DWORD [24+esp] + add eax,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [28+esp] + xor edx,ecx + 
mov edi,DWORD [esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [24+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,eax + add edx,edi + mov edi,DWORD [12+esp] + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [8+esp],eax + xor ecx,eax + xor eax,edi + add edx,DWORD [4+esp] + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + add edx,DWORD [56+esp] + xor ebx,edi + shrd ecx,ecx,2 + add ebx,edx + add edx,DWORD [20+esp] + add ebx,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [24+esp] + xor edx,ecx + mov edi,DWORD [28+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [20+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [8+esp] + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [4+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [esp] + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [60+esp] + xor eax,edi + shrd ecx,ecx,2 + add eax,edx + add edx,DWORD [16+esp] + add eax,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [20+esp] + xor edx,ecx + mov edi,DWORD [24+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [16+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,eax + add edx,edi + mov edi,DWORD [4+esp] + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [esp],eax + xor ecx,eax + xor eax,edi + add edx,DWORD [28+esp] + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + add edx,DWORD [64+esp] + xor ebx,edi + shrd ecx,ecx,2 + add ebx,edx + add edx,DWORD [12+esp] + add ebx,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [16+esp] + xor edx,ecx + mov edi,DWORD [20+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [12+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [esp] + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [28+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [24+esp] + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [68+esp] + xor eax,edi + shrd ecx,ecx,2 + add eax,edx + add edx,DWORD 
[8+esp] + add eax,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [12+esp] + xor edx,ecx + mov edi,DWORD [16+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [8+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,eax + add edx,edi + mov edi,DWORD [28+esp] + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [24+esp],eax + xor ecx,eax + xor eax,edi + add edx,DWORD [20+esp] + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + add edx,DWORD [72+esp] + xor ebx,edi + shrd ecx,ecx,2 + add ebx,edx + add edx,DWORD [4+esp] + add ebx,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [8+esp] + xor edx,ecx + mov edi,DWORD [12+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [4+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [24+esp] + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [20+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [16+esp] + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [76+esp] + xor eax,edi + shrd ecx,ecx,2 + add eax,edx + add edx,DWORD [esp] + add eax,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [4+esp] + xor edx,ecx + mov edi,DWORD [8+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,eax + add edx,edi + mov edi,DWORD [20+esp] + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [16+esp],eax + xor ecx,eax + xor eax,edi + add edx,DWORD [12+esp] + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + add edx,DWORD [80+esp] + xor ebx,edi + shrd ecx,ecx,2 + add ebx,edx + add edx,DWORD [28+esp] + add ebx,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [esp] + xor edx,ecx + mov edi,DWORD [4+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [28+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [16+esp] + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [12+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [8+esp] + shrd ecx,ecx,11 + and eax,ebx + xor 
ecx,esi + add edx,DWORD [84+esp] + xor eax,edi + shrd ecx,ecx,2 + add eax,edx + add edx,DWORD [24+esp] + add eax,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [28+esp] + xor edx,ecx + mov edi,DWORD [esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [24+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,eax + add edx,edi + mov edi,DWORD [12+esp] + mov esi,eax + shrd ecx,ecx,9 + mov DWORD [8+esp],eax + xor ecx,eax + xor eax,edi + add edx,DWORD [4+esp] + shrd ecx,ecx,11 + and ebx,eax + xor ecx,esi + add edx,DWORD [88+esp] + xor ebx,edi + shrd ecx,ecx,2 + add ebx,edx + add edx,DWORD [20+esp] + add ebx,ecx + mov ecx,edx + shrd edx,edx,14 + mov esi,DWORD [24+esp] + xor edx,ecx + mov edi,DWORD [28+esp] + xor esi,edi + shrd edx,edx,5 + and esi,ecx + mov DWORD [20+esp],ecx + xor edx,ecx + xor edi,esi + shrd edx,edx,6 + mov ecx,ebx + add edx,edi + mov edi,DWORD [8+esp] + mov esi,ebx + shrd ecx,ecx,9 + mov DWORD [4+esp],ebx + xor ecx,ebx + xor ebx,edi + add edx,DWORD [esp] + shrd ecx,ecx,11 + and eax,ebx + xor ecx,esi + add edx,DWORD [92+esp] + xor eax,edi + shrd ecx,ecx,2 + add eax,edx + add edx,DWORD [16+esp] + add eax,ecx + mov esi,DWORD [96+esp] + xor ebx,edi + mov ecx,DWORD [12+esp] + add eax,DWORD [esi] + add ebx,DWORD [4+esi] + add edi,DWORD [8+esi] + add ecx,DWORD [12+esi] + mov DWORD [esi],eax + mov DWORD [4+esi],ebx + mov DWORD [8+esi],edi + mov DWORD [12+esi],ecx + mov DWORD [4+esp],ebx + xor ebx,edi + mov DWORD [8+esp],edi + mov DWORD [12+esp],ecx + mov edi,DWORD [20+esp] + mov ecx,DWORD [24+esp] + add edx,DWORD [16+esi] + add edi,DWORD [20+esi] + add ecx,DWORD [24+esi] + mov DWORD [16+esi],edx + mov DWORD [20+esi],edi + mov DWORD [20+esp],edi + mov edi,DWORD [28+esp] + mov DWORD [24+esi],ecx + add edi,DWORD [28+esi] + mov DWORD [24+esp],ecx + mov DWORD [28+esi],edi + mov DWORD [28+esp],edi + mov edi,DWORD [100+esp] + vmovdqa xmm7,[64+ebp] + sub ebp,192 + cmp edi,DWORD [104+esp] + jb NEAR L$011grand_avx + mov esp,DWORD [108+esp] + 
vzeroall + pop edi + pop esi + pop ebx + pop ebp + ret +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/third_party/boringssl/gen/bcm/sha256-armv4-linux.S b/third_party/boringssl/gen/bcm/sha256-armv4-linux.S new file mode 100644 index 00000000..b04f4639 --- /dev/null +++ b/third_party/boringssl/gen/bcm/sha256-armv4-linux.S @@ -0,0 +1,2840 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__) +@ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +@ +@ Licensed under the Apache License, Version 2.0 (the "License"); +@ you may not use this file except in compliance with the License. +@ You may obtain a copy of the License at +@ +@ https://www.apache.org/licenses/LICENSE-2.0 +@ +@ Unless required by applicable law or agreed to in writing, software +@ distributed under the License is distributed on an "AS IS" BASIS, +@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ See the License for the specific language governing permissions and +@ limitations under the License. + + +@ ==================================================================== +@ Written by Andy Polyakov for the OpenSSL +@ project. +@ ==================================================================== + +@ SHA256 block procedure for ARMv4. May 2007. + +@ Performance is ~2x better than gcc 3.4 generated code and in "abso- +@ lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per +@ byte [on single-issue Xscale PXA250 core]. + +@ July 2010. +@ +@ Rescheduling for dual-issue pipeline resulted in 22% improvement on +@ Cortex A8 core and ~20 cycles per processed byte. + +@ February 2011. +@ +@ Profiler-assisted and platform-specific optimization resulted in 16% +@ improvement on Cortex A8 core and ~15.4 cycles per processed byte. + +@ September 2013. 
+@ +@ Add NEON implementation. On Cortex A8 it was measured to process one +@ byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon +@ S4 does it in 12.5 cycles too, but it's 50% faster than integer-only +@ code (meaning that latter performs sub-optimally, nothing was done +@ about it). + +@ May 2014. +@ +@ Add ARMv8 code path performing at 2.0 cpb on Apple A7. + +#ifdef __KERNEL__ +# define __ARM_ARCH __LINUX_ARM_ARCH__ +# define __ARM_MAX_ARCH__ 7 +#endif + +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors. It does have ARMv8-only code, but those +@ instructions are manually-encoded. (See unsha256.) +.arch armv7-a + +.text +#if defined(__thumb2__) +.syntax unified +.thumb +#else +.code 32 +#endif + +.type K256,%object +.align 5 +K256: +.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.size K256,.-K256 +.word 0 @ terminator +.align 5 + +.globl sha256_block_data_order_nohw +.hidden sha256_block_data_order_nohw +.type sha256_block_data_order_nohw,%function +sha256_block_data_order_nohw: + add r2,r1,r2,lsl#6 @ len to point at the end of inp + stmdb sp!,{r0,r1,r2,r4-r11,lr} + ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11} + adr r14,K256 + sub 
sp,sp,#16*4 @ alloca(X[16]) +.Loop: +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ magic + eor r12,r12,r12 +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 0 +# if 0==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r8,r8,ror#5 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r8,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 0 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 0==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r8,r8,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r8,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r11,r11,r2 @ h+=X[i] + str r2,[sp,#0*4] + eor r2,r9,r10 + add r11,r11,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r8 + add r11,r11,r12 @ h+=K256[i] + eor r2,r2,r10 @ Ch(e,f,g) + eor r0,r4,r4,ror#11 + add r11,r11,r2 @ h+=Ch(e,f,g) +#if 0==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 0<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r4,r5 @ a^b, b^c in next round +#else + ldr r2,[sp,#2*4] @ from future BODY_16_xx + eor r12,r4,r5 @ a^b, b^c in next round + ldr r1,[sp,#15*4] @ from future BODY_16_xx +#endif + eor r0,r0,r4,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r7,r7,r11 @ d+=h + eor r3,r3,r5 @ Maj(a,b,c) + add r11,r11,r0,ror#2 @ h+=Sigma0(a) + @ add r11,r11,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 1 +# if 1==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r7,r7,ror#5 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r7,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 1 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 1==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r7,r7,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r7,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r10,r10,r2 @ h+=X[i] + str r2,[sp,#1*4] + eor r2,r8,r9 + add r10,r10,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r7 + add r10,r10,r3 @ h+=K256[i] + eor r2,r2,r9 @ Ch(e,f,g) + eor r0,r11,r11,ror#11 + add r10,r10,r2 @ h+=Ch(e,f,g) +#if 1==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? 
+#endif +#if 1<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r11,r4 @ a^b, b^c in next round +#else + ldr r2,[sp,#3*4] @ from future BODY_16_xx + eor r3,r11,r4 @ a^b, b^c in next round + ldr r1,[sp,#0*4] @ from future BODY_16_xx +#endif + eor r0,r0,r11,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r6,r6,r10 @ d+=h + eor r12,r12,r4 @ Maj(a,b,c) + add r10,r10,r0,ror#2 @ h+=Sigma0(a) + @ add r10,r10,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 2 +# if 2==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r6,r6,ror#5 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r6,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 2 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 2==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r6,r6,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r6,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r9,r9,r2 @ h+=X[i] + str r2,[sp,#2*4] + eor r2,r7,r8 + add r9,r9,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r6 + add r9,r9,r12 @ h+=K256[i] + eor r2,r2,r8 @ Ch(e,f,g) + eor r0,r10,r10,ror#11 + add r9,r9,r2 @ h+=Ch(e,f,g) +#if 2==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 2<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r10,r11 @ a^b, b^c in next round +#else + ldr r2,[sp,#4*4] @ from future BODY_16_xx + eor r12,r10,r11 @ a^b, b^c in next round + ldr r1,[sp,#1*4] @ from future BODY_16_xx +#endif + eor r0,r0,r10,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r5,r5,r9 @ d+=h + eor r3,r3,r11 @ Maj(a,b,c) + add r9,r9,r0,ror#2 @ h+=Sigma0(a) + @ add r9,r9,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 3 +# if 3==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r5,r5,ror#5 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r5,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 3 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 3==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r5,r5,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r5,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r8,r8,r2 @ h+=X[i] + str r2,[sp,#3*4] + eor r2,r6,r7 + add r8,r8,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r5 + add r8,r8,r3 @ h+=K256[i] + eor r2,r2,r7 @ Ch(e,f,g) + eor r0,r9,r9,ror#11 + add r8,r8,r2 @ h+=Ch(e,f,g) +#if 3==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? 
+#endif +#if 3<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r9,r10 @ a^b, b^c in next round +#else + ldr r2,[sp,#5*4] @ from future BODY_16_xx + eor r3,r9,r10 @ a^b, b^c in next round + ldr r1,[sp,#2*4] @ from future BODY_16_xx +#endif + eor r0,r0,r9,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r4,r4,r8 @ d+=h + eor r12,r12,r10 @ Maj(a,b,c) + add r8,r8,r0,ror#2 @ h+=Sigma0(a) + @ add r8,r8,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 4 +# if 4==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r4,r4,ror#5 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r4,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 4 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 4==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r4,r4,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r4,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r7,r7,r2 @ h+=X[i] + str r2,[sp,#4*4] + eor r2,r5,r6 + add r7,r7,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r4 + add r7,r7,r12 @ h+=K256[i] + eor r2,r2,r6 @ Ch(e,f,g) + eor r0,r8,r8,ror#11 + add r7,r7,r2 @ h+=Ch(e,f,g) +#if 4==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 4<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r8,r9 @ a^b, b^c in next round +#else + ldr r2,[sp,#6*4] @ from future BODY_16_xx + eor r12,r8,r9 @ a^b, b^c in next round + ldr r1,[sp,#3*4] @ from future BODY_16_xx +#endif + eor r0,r0,r8,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r11,r11,r7 @ d+=h + eor r3,r3,r9 @ Maj(a,b,c) + add r7,r7,r0,ror#2 @ h+=Sigma0(a) + @ add r7,r7,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 5 +# if 5==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r11,r11,ror#5 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r11,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 5 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 5==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r11,r11,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r11,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r6,r6,r2 @ h+=X[i] + str r2,[sp,#5*4] + eor r2,r4,r5 + add r6,r6,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r11 + add r6,r6,r3 @ h+=K256[i] + eor r2,r2,r5 @ Ch(e,f,g) + eor r0,r7,r7,ror#11 + add r6,r6,r2 @ h+=Ch(e,f,g) +#if 5==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? 
+#endif +#if 5<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r7,r8 @ a^b, b^c in next round +#else + ldr r2,[sp,#7*4] @ from future BODY_16_xx + eor r3,r7,r8 @ a^b, b^c in next round + ldr r1,[sp,#4*4] @ from future BODY_16_xx +#endif + eor r0,r0,r7,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r10,r10,r6 @ d+=h + eor r12,r12,r8 @ Maj(a,b,c) + add r6,r6,r0,ror#2 @ h+=Sigma0(a) + @ add r6,r6,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 6 +# if 6==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r10,r10,ror#5 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r10,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 6 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 6==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r10,r10,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r10,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r5,r5,r2 @ h+=X[i] + str r2,[sp,#6*4] + eor r2,r11,r4 + add r5,r5,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r10 + add r5,r5,r12 @ h+=K256[i] + eor r2,r2,r4 @ Ch(e,f,g) + eor r0,r6,r6,ror#11 + add r5,r5,r2 @ h+=Ch(e,f,g) +#if 6==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 6<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r6,r7 @ a^b, b^c in next round +#else + ldr r2,[sp,#8*4] @ from future BODY_16_xx + eor r12,r6,r7 @ a^b, b^c in next round + ldr r1,[sp,#5*4] @ from future BODY_16_xx +#endif + eor r0,r0,r6,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r9,r9,r5 @ d+=h + eor r3,r3,r7 @ Maj(a,b,c) + add r5,r5,r0,ror#2 @ h+=Sigma0(a) + @ add r5,r5,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 7 +# if 7==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r9,r9,ror#5 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r9,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 7 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 7==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r9,r9,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r9,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r4,r4,r2 @ h+=X[i] + str r2,[sp,#7*4] + eor r2,r10,r11 + add r4,r4,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r9 + add r4,r4,r3 @ h+=K256[i] + eor r2,r2,r11 @ Ch(e,f,g) + eor r0,r5,r5,ror#11 + add r4,r4,r2 @ h+=Ch(e,f,g) +#if 7==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? 
+#endif +#if 7<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ a^b, b^c in next round +#else + ldr r2,[sp,#9*4] @ from future BODY_16_xx + eor r3,r5,r6 @ a^b, b^c in next round + ldr r1,[sp,#6*4] @ from future BODY_16_xx +#endif + eor r0,r0,r5,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r8,r8,r4 @ d+=h + eor r12,r12,r6 @ Maj(a,b,c) + add r4,r4,r0,ror#2 @ h+=Sigma0(a) + @ add r4,r4,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 8 +# if 8==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r8,r8,ror#5 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r8,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 8 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 8==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r8,r8,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r8,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r11,r11,r2 @ h+=X[i] + str r2,[sp,#8*4] + eor r2,r9,r10 + add r11,r11,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r8 + add r11,r11,r12 @ h+=K256[i] + eor r2,r2,r10 @ Ch(e,f,g) + eor r0,r4,r4,ror#11 + add r11,r11,r2 @ h+=Ch(e,f,g) +#if 8==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 8<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r4,r5 @ a^b, b^c in next round +#else + ldr r2,[sp,#10*4] @ from future BODY_16_xx + eor r12,r4,r5 @ a^b, b^c in next round + ldr r1,[sp,#7*4] @ from future BODY_16_xx +#endif + eor r0,r0,r4,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r7,r7,r11 @ d+=h + eor r3,r3,r5 @ Maj(a,b,c) + add r11,r11,r0,ror#2 @ h+=Sigma0(a) + @ add r11,r11,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 9 +# if 9==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r7,r7,ror#5 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r7,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 9 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 9==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r7,r7,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r7,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r10,r10,r2 @ h+=X[i] + str r2,[sp,#9*4] + eor r2,r8,r9 + add r10,r10,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r7 + add r10,r10,r3 @ h+=K256[i] + eor r2,r2,r9 @ Ch(e,f,g) + eor r0,r11,r11,ror#11 + add r10,r10,r2 @ h+=Ch(e,f,g) +#if 9==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? 
+#endif +#if 9<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r11,r4 @ a^b, b^c in next round +#else + ldr r2,[sp,#11*4] @ from future BODY_16_xx + eor r3,r11,r4 @ a^b, b^c in next round + ldr r1,[sp,#8*4] @ from future BODY_16_xx +#endif + eor r0,r0,r11,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r6,r6,r10 @ d+=h + eor r12,r12,r4 @ Maj(a,b,c) + add r10,r10,r0,ror#2 @ h+=Sigma0(a) + @ add r10,r10,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 10 +# if 10==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r6,r6,ror#5 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r6,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 10 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 10==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r6,r6,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r6,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r9,r9,r2 @ h+=X[i] + str r2,[sp,#10*4] + eor r2,r7,r8 + add r9,r9,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r6 + add r9,r9,r12 @ h+=K256[i] + eor r2,r2,r8 @ Ch(e,f,g) + eor r0,r10,r10,ror#11 + add r9,r9,r2 @ h+=Ch(e,f,g) +#if 10==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 10<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r10,r11 @ a^b, b^c in next round +#else + ldr r2,[sp,#12*4] @ from future BODY_16_xx + eor r12,r10,r11 @ a^b, b^c in next round + ldr r1,[sp,#9*4] @ from future BODY_16_xx +#endif + eor r0,r0,r10,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r5,r5,r9 @ d+=h + eor r3,r3,r11 @ Maj(a,b,c) + add r9,r9,r0,ror#2 @ h+=Sigma0(a) + @ add r9,r9,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 11 +# if 11==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r5,r5,ror#5 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r5,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 11 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 11==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r5,r5,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r5,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r8,r8,r2 @ h+=X[i] + str r2,[sp,#11*4] + eor r2,r6,r7 + add r8,r8,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r5 + add r8,r8,r3 @ h+=K256[i] + eor r2,r2,r7 @ Ch(e,f,g) + eor r0,r9,r9,ror#11 + add r8,r8,r2 @ h+=Ch(e,f,g) +#if 11==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? 
+#endif +#if 11<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r9,r10 @ a^b, b^c in next round +#else + ldr r2,[sp,#13*4] @ from future BODY_16_xx + eor r3,r9,r10 @ a^b, b^c in next round + ldr r1,[sp,#10*4] @ from future BODY_16_xx +#endif + eor r0,r0,r9,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r4,r4,r8 @ d+=h + eor r12,r12,r10 @ Maj(a,b,c) + add r8,r8,r0,ror#2 @ h+=Sigma0(a) + @ add r8,r8,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 12 +# if 12==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r4,r4,ror#5 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r4,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 12 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 12==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r4,r4,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r4,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r7,r7,r2 @ h+=X[i] + str r2,[sp,#12*4] + eor r2,r5,r6 + add r7,r7,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r4 + add r7,r7,r12 @ h+=K256[i] + eor r2,r2,r6 @ Ch(e,f,g) + eor r0,r8,r8,ror#11 + add r7,r7,r2 @ h+=Ch(e,f,g) +#if 12==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 12<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r8,r9 @ a^b, b^c in next round +#else + ldr r2,[sp,#14*4] @ from future BODY_16_xx + eor r12,r8,r9 @ a^b, b^c in next round + ldr r1,[sp,#11*4] @ from future BODY_16_xx +#endif + eor r0,r0,r8,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r11,r11,r7 @ d+=h + eor r3,r3,r9 @ Maj(a,b,c) + add r7,r7,r0,ror#2 @ h+=Sigma0(a) + @ add r7,r7,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 13 +# if 13==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r11,r11,ror#5 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r11,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 13 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 13==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r11,r11,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r11,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r6,r6,r2 @ h+=X[i] + str r2,[sp,#13*4] + eor r2,r4,r5 + add r6,r6,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r11 + add r6,r6,r3 @ h+=K256[i] + eor r2,r2,r5 @ Ch(e,f,g) + eor r0,r7,r7,ror#11 + add r6,r6,r2 @ h+=Ch(e,f,g) +#if 13==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? 
+#endif +#if 13<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r7,r8 @ a^b, b^c in next round +#else + ldr r2,[sp,#15*4] @ from future BODY_16_xx + eor r3,r7,r8 @ a^b, b^c in next round + ldr r1,[sp,#12*4] @ from future BODY_16_xx +#endif + eor r0,r0,r7,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r10,r10,r6 @ d+=h + eor r12,r12,r8 @ Maj(a,b,c) + add r6,r6,r0,ror#2 @ h+=Sigma0(a) + @ add r6,r6,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 14 +# if 14==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r10,r10,ror#5 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + eor r0,r0,r10,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 14 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + ldrb r12,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r12,lsl#8 + ldrb r12,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 14==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r10,r10,ror#5 + orr r2,r2,r12,lsl#24 + eor r0,r0,r10,ror#19 @ Sigma1(e) +#endif + ldr r12,[r14],#4 @ *K256++ + add r5,r5,r2 @ h+=X[i] + str r2,[sp,#14*4] + eor r2,r11,r4 + add r5,r5,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r10 + add r5,r5,r12 @ h+=K256[i] + eor r2,r2,r4 @ Ch(e,f,g) + eor r0,r6,r6,ror#11 + add r5,r5,r2 @ h+=Ch(e,f,g) +#if 14==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 14<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r6,r7 @ a^b, b^c in next round +#else + ldr r2,[sp,#0*4] @ from future BODY_16_xx + eor r12,r6,r7 @ a^b, b^c in next round + ldr r1,[sp,#13*4] @ from future BODY_16_xx +#endif + eor r0,r0,r6,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r9,r9,r5 @ d+=h + eor r3,r3,r7 @ Maj(a,b,c) + add r5,r5,r0,ror#2 @ h+=Sigma0(a) + @ add r5,r5,r3 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + @ ldr r2,[r1],#4 @ 15 +# if 15==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r9,r9,ror#5 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + eor r0,r0,r9,ror#19 @ Sigma1(e) +# ifndef __ARMEB__ + rev r2,r2 +# endif +#else + @ ldrb r2,[r1,#3] @ 15 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + ldrb r3,[r1,#2] + ldrb r0,[r1,#1] + orr r2,r2,r3,lsl#8 + ldrb r3,[r1],#4 + orr r2,r2,r0,lsl#16 +# if 15==15 + str r1,[sp,#17*4] @ make room for r1 +# endif + eor r0,r9,r9,ror#5 + orr r2,r2,r3,lsl#24 + eor r0,r0,r9,ror#19 @ Sigma1(e) +#endif + ldr r3,[r14],#4 @ *K256++ + add r4,r4,r2 @ h+=X[i] + str r2,[sp,#15*4] + eor r2,r10,r11 + add r4,r4,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r9 + add r4,r4,r3 @ h+=K256[i] + eor r2,r2,r11 @ Ch(e,f,g) + eor r0,r5,r5,ror#11 + add r4,r4,r2 @ h+=Ch(e,f,g) +#if 15==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? 
+#endif +#if 15<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ a^b, b^c in next round +#else + ldr r2,[sp,#1*4] @ from future BODY_16_xx + eor r3,r5,r6 @ a^b, b^c in next round + ldr r1,[sp,#14*4] @ from future BODY_16_xx +#endif + eor r0,r0,r5,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r8,r8,r4 @ d+=h + eor r12,r12,r6 @ Maj(a,b,c) + add r4,r4,r0,ror#2 @ h+=Sigma0(a) + @ add r4,r4,r12 @ h+=Maj(a,b,c) +.Lrounds_16_xx: + @ ldr r2,[sp,#1*4] @ 16 + @ ldr r1,[sp,#14*4] + mov r0,r2,ror#7 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#0*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#9*4] + + add r12,r12,r0 + eor r0,r8,r8,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r8,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r11,r11,r2 @ h+=X[i] + str r2,[sp,#0*4] + eor r2,r9,r10 + add r11,r11,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r8 + add r11,r11,r12 @ h+=K256[i] + eor r2,r2,r10 @ Ch(e,f,g) + eor r0,r4,r4,ror#11 + add r11,r11,r2 @ h+=Ch(e,f,g) +#if 16==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 16<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r4,r5 @ a^b, b^c in next round +#else + ldr r2,[sp,#2*4] @ from future BODY_16_xx + eor r12,r4,r5 @ a^b, b^c in next round + ldr r1,[sp,#15*4] @ from future BODY_16_xx +#endif + eor r0,r0,r4,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r7,r7,r11 @ d+=h + eor r3,r3,r5 @ Maj(a,b,c) + add r11,r11,r0,ror#2 @ h+=Sigma0(a) + @ add r11,r11,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#2*4] @ 17 + @ ldr r1,[sp,#15*4] + mov r0,r2,ror#7 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#1*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#10*4] + + add r3,r3,r0 + eor r0,r7,r7,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r7,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r10,r10,r2 @ h+=X[i] + str r2,[sp,#1*4] + eor r2,r8,r9 + add r10,r10,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r7 + add r10,r10,r3 @ h+=K256[i] + eor r2,r2,r9 @ Ch(e,f,g) + eor r0,r11,r11,ror#11 + add r10,r10,r2 @ h+=Ch(e,f,g) +#if 17==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? 
+#endif +#if 17<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r11,r4 @ a^b, b^c in next round +#else + ldr r2,[sp,#3*4] @ from future BODY_16_xx + eor r3,r11,r4 @ a^b, b^c in next round + ldr r1,[sp,#0*4] @ from future BODY_16_xx +#endif + eor r0,r0,r11,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r6,r6,r10 @ d+=h + eor r12,r12,r4 @ Maj(a,b,c) + add r10,r10,r0,ror#2 @ h+=Sigma0(a) + @ add r10,r10,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#3*4] @ 18 + @ ldr r1,[sp,#0*4] + mov r0,r2,ror#7 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#2*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#11*4] + + add r12,r12,r0 + eor r0,r6,r6,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r6,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r9,r9,r2 @ h+=X[i] + str r2,[sp,#2*4] + eor r2,r7,r8 + add r9,r9,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r6 + add r9,r9,r12 @ h+=K256[i] + eor r2,r2,r8 @ Ch(e,f,g) + eor r0,r10,r10,ror#11 + add r9,r9,r2 @ h+=Ch(e,f,g) +#if 18==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 18<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r10,r11 @ a^b, b^c in next round +#else + ldr r2,[sp,#4*4] @ from future BODY_16_xx + eor r12,r10,r11 @ a^b, b^c in next round + ldr r1,[sp,#1*4] @ from future BODY_16_xx +#endif + eor r0,r0,r10,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r5,r5,r9 @ d+=h + eor r3,r3,r11 @ Maj(a,b,c) + add r9,r9,r0,ror#2 @ h+=Sigma0(a) + @ add r9,r9,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#4*4] @ 19 + @ ldr r1,[sp,#1*4] + mov r0,r2,ror#7 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#3*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#12*4] + + add r3,r3,r0 + eor r0,r5,r5,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r5,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r8,r8,r2 @ h+=X[i] + str r2,[sp,#3*4] + eor r2,r6,r7 + add r8,r8,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r5 + add r8,r8,r3 @ h+=K256[i] + eor r2,r2,r7 @ Ch(e,f,g) + eor r0,r9,r9,ror#11 + add r8,r8,r2 @ h+=Ch(e,f,g) +#if 19==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? 
+#endif +#if 19<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r9,r10 @ a^b, b^c in next round +#else + ldr r2,[sp,#5*4] @ from future BODY_16_xx + eor r3,r9,r10 @ a^b, b^c in next round + ldr r1,[sp,#2*4] @ from future BODY_16_xx +#endif + eor r0,r0,r9,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r4,r4,r8 @ d+=h + eor r12,r12,r10 @ Maj(a,b,c) + add r8,r8,r0,ror#2 @ h+=Sigma0(a) + @ add r8,r8,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#5*4] @ 20 + @ ldr r1,[sp,#2*4] + mov r0,r2,ror#7 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#4*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#13*4] + + add r12,r12,r0 + eor r0,r4,r4,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r4,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r7,r7,r2 @ h+=X[i] + str r2,[sp,#4*4] + eor r2,r5,r6 + add r7,r7,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r4 + add r7,r7,r12 @ h+=K256[i] + eor r2,r2,r6 @ Ch(e,f,g) + eor r0,r8,r8,ror#11 + add r7,r7,r2 @ h+=Ch(e,f,g) +#if 20==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 20<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r8,r9 @ a^b, b^c in next round +#else + ldr r2,[sp,#6*4] @ from future BODY_16_xx + eor r12,r8,r9 @ a^b, b^c in next round + ldr r1,[sp,#3*4] @ from future BODY_16_xx +#endif + eor r0,r0,r8,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r11,r11,r7 @ d+=h + eor r3,r3,r9 @ Maj(a,b,c) + add r7,r7,r0,ror#2 @ h+=Sigma0(a) + @ add r7,r7,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#6*4] @ 21 + @ ldr r1,[sp,#3*4] + mov r0,r2,ror#7 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#5*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#14*4] + + add r3,r3,r0 + eor r0,r11,r11,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r11,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r6,r6,r2 @ h+=X[i] + str r2,[sp,#5*4] + eor r2,r4,r5 + add r6,r6,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r11 + add r6,r6,r3 @ h+=K256[i] + eor r2,r2,r5 @ Ch(e,f,g) + eor r0,r7,r7,ror#11 + add r6,r6,r2 @ h+=Ch(e,f,g) +#if 21==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? 
+#endif +#if 21<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r7,r8 @ a^b, b^c in next round +#else + ldr r2,[sp,#7*4] @ from future BODY_16_xx + eor r3,r7,r8 @ a^b, b^c in next round + ldr r1,[sp,#4*4] @ from future BODY_16_xx +#endif + eor r0,r0,r7,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r10,r10,r6 @ d+=h + eor r12,r12,r8 @ Maj(a,b,c) + add r6,r6,r0,ror#2 @ h+=Sigma0(a) + @ add r6,r6,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#7*4] @ 22 + @ ldr r1,[sp,#4*4] + mov r0,r2,ror#7 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#6*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#15*4] + + add r12,r12,r0 + eor r0,r10,r10,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r10,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r5,r5,r2 @ h+=X[i] + str r2,[sp,#6*4] + eor r2,r11,r4 + add r5,r5,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r10 + add r5,r5,r12 @ h+=K256[i] + eor r2,r2,r4 @ Ch(e,f,g) + eor r0,r6,r6,ror#11 + add r5,r5,r2 @ h+=Ch(e,f,g) +#if 22==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 22<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r6,r7 @ a^b, b^c in next round +#else + ldr r2,[sp,#8*4] @ from future BODY_16_xx + eor r12,r6,r7 @ a^b, b^c in next round + ldr r1,[sp,#5*4] @ from future BODY_16_xx +#endif + eor r0,r0,r6,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r9,r9,r5 @ d+=h + eor r3,r3,r7 @ Maj(a,b,c) + add r5,r5,r0,ror#2 @ h+=Sigma0(a) + @ add r5,r5,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#8*4] @ 23 + @ ldr r1,[sp,#5*4] + mov r0,r2,ror#7 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#7*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#0*4] + + add r3,r3,r0 + eor r0,r9,r9,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r9,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r4,r4,r2 @ h+=X[i] + str r2,[sp,#7*4] + eor r2,r10,r11 + add r4,r4,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r9 + add r4,r4,r3 @ h+=K256[i] + eor r2,r2,r11 @ Ch(e,f,g) + eor r0,r5,r5,ror#11 + add r4,r4,r2 @ h+=Ch(e,f,g) +#if 23==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? 
+#endif +#if 23<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ a^b, b^c in next round +#else + ldr r2,[sp,#9*4] @ from future BODY_16_xx + eor r3,r5,r6 @ a^b, b^c in next round + ldr r1,[sp,#6*4] @ from future BODY_16_xx +#endif + eor r0,r0,r5,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r8,r8,r4 @ d+=h + eor r12,r12,r6 @ Maj(a,b,c) + add r4,r4,r0,ror#2 @ h+=Sigma0(a) + @ add r4,r4,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#9*4] @ 24 + @ ldr r1,[sp,#6*4] + mov r0,r2,ror#7 + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#8*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#1*4] + + add r12,r12,r0 + eor r0,r8,r8,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r8,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r11,r11,r2 @ h+=X[i] + str r2,[sp,#8*4] + eor r2,r9,r10 + add r11,r11,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r8 + add r11,r11,r12 @ h+=K256[i] + eor r2,r2,r10 @ Ch(e,f,g) + eor r0,r4,r4,ror#11 + add r11,r11,r2 @ h+=Ch(e,f,g) +#if 24==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 24<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r4,r5 @ a^b, b^c in next round +#else + ldr r2,[sp,#10*4] @ from future BODY_16_xx + eor r12,r4,r5 @ a^b, b^c in next round + ldr r1,[sp,#7*4] @ from future BODY_16_xx +#endif + eor r0,r0,r4,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r7,r7,r11 @ d+=h + eor r3,r3,r5 @ Maj(a,b,c) + add r11,r11,r0,ror#2 @ h+=Sigma0(a) + @ add r11,r11,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#10*4] @ 25 + @ ldr r1,[sp,#7*4] + mov r0,r2,ror#7 + add r11,r11,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#9*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#2*4] + + add r3,r3,r0 + eor r0,r7,r7,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r7,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r10,r10,r2 @ h+=X[i] + str r2,[sp,#9*4] + eor r2,r8,r9 + add r10,r10,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r7 + add r10,r10,r3 @ h+=K256[i] + eor r2,r2,r9 @ Ch(e,f,g) + eor r0,r11,r11,ror#11 + add r10,r10,r2 @ h+=Ch(e,f,g) +#if 25==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? 
+#endif +#if 25<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r11,r4 @ a^b, b^c in next round +#else + ldr r2,[sp,#11*4] @ from future BODY_16_xx + eor r3,r11,r4 @ a^b, b^c in next round + ldr r1,[sp,#8*4] @ from future BODY_16_xx +#endif + eor r0,r0,r11,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r6,r6,r10 @ d+=h + eor r12,r12,r4 @ Maj(a,b,c) + add r10,r10,r0,ror#2 @ h+=Sigma0(a) + @ add r10,r10,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#11*4] @ 26 + @ ldr r1,[sp,#8*4] + mov r0,r2,ror#7 + add r10,r10,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#10*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#3*4] + + add r12,r12,r0 + eor r0,r6,r6,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r6,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r9,r9,r2 @ h+=X[i] + str r2,[sp,#10*4] + eor r2,r7,r8 + add r9,r9,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r6 + add r9,r9,r12 @ h+=K256[i] + eor r2,r2,r8 @ Ch(e,f,g) + eor r0,r10,r10,ror#11 + add r9,r9,r2 @ h+=Ch(e,f,g) +#if 26==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 26<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r10,r11 @ a^b, b^c in next round +#else + ldr r2,[sp,#12*4] @ from future BODY_16_xx + eor r12,r10,r11 @ a^b, b^c in next round + ldr r1,[sp,#9*4] @ from future BODY_16_xx +#endif + eor r0,r0,r10,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r5,r5,r9 @ d+=h + eor r3,r3,r11 @ Maj(a,b,c) + add r9,r9,r0,ror#2 @ h+=Sigma0(a) + @ add r9,r9,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#12*4] @ 27 + @ ldr r1,[sp,#9*4] + mov r0,r2,ror#7 + add r9,r9,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#11*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#4*4] + + add r3,r3,r0 + eor r0,r5,r5,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r5,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r8,r8,r2 @ h+=X[i] + str r2,[sp,#11*4] + eor r2,r6,r7 + add r8,r8,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r5 + add r8,r8,r3 @ h+=K256[i] + eor r2,r2,r7 @ Ch(e,f,g) + eor r0,r9,r9,ror#11 + add r8,r8,r2 @ h+=Ch(e,f,g) +#if 27==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? 
+#endif +#if 27<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r9,r10 @ a^b, b^c in next round +#else + ldr r2,[sp,#13*4] @ from future BODY_16_xx + eor r3,r9,r10 @ a^b, b^c in next round + ldr r1,[sp,#10*4] @ from future BODY_16_xx +#endif + eor r0,r0,r9,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r4,r4,r8 @ d+=h + eor r12,r12,r10 @ Maj(a,b,c) + add r8,r8,r0,ror#2 @ h+=Sigma0(a) + @ add r8,r8,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#13*4] @ 28 + @ ldr r1,[sp,#10*4] + mov r0,r2,ror#7 + add r8,r8,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#12*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#5*4] + + add r12,r12,r0 + eor r0,r4,r4,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r4,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r7,r7,r2 @ h+=X[i] + str r2,[sp,#12*4] + eor r2,r5,r6 + add r7,r7,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r4 + add r7,r7,r12 @ h+=K256[i] + eor r2,r2,r6 @ Ch(e,f,g) + eor r0,r8,r8,ror#11 + add r7,r7,r2 @ h+=Ch(e,f,g) +#if 28==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 28<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r8,r9 @ a^b, b^c in next round +#else + ldr r2,[sp,#14*4] @ from future BODY_16_xx + eor r12,r8,r9 @ a^b, b^c in next round + ldr r1,[sp,#11*4] @ from future BODY_16_xx +#endif + eor r0,r0,r8,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r11,r11,r7 @ d+=h + eor r3,r3,r9 @ Maj(a,b,c) + add r7,r7,r0,ror#2 @ h+=Sigma0(a) + @ add r7,r7,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#14*4] @ 29 + @ ldr r1,[sp,#11*4] + mov r0,r2,ror#7 + add r7,r7,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#13*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#6*4] + + add r3,r3,r0 + eor r0,r11,r11,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r11,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r6,r6,r2 @ h+=X[i] + str r2,[sp,#13*4] + eor r2,r4,r5 + add r6,r6,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r11 + add r6,r6,r3 @ h+=K256[i] + eor r2,r2,r5 @ Ch(e,f,g) + eor r0,r7,r7,ror#11 + add r6,r6,r2 @ h+=Ch(e,f,g) +#if 29==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? 
+#endif +#if 29<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r7,r8 @ a^b, b^c in next round +#else + ldr r2,[sp,#15*4] @ from future BODY_16_xx + eor r3,r7,r8 @ a^b, b^c in next round + ldr r1,[sp,#12*4] @ from future BODY_16_xx +#endif + eor r0,r0,r7,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r10,r10,r6 @ d+=h + eor r12,r12,r8 @ Maj(a,b,c) + add r6,r6,r0,ror#2 @ h+=Sigma0(a) + @ add r6,r6,r12 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#15*4] @ 30 + @ ldr r1,[sp,#12*4] + mov r0,r2,ror#7 + add r6,r6,r12 @ h+=Maj(a,b,c) from the past + mov r12,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r12,r12,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#14*4] + eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#7*4] + + add r12,r12,r0 + eor r0,r10,r10,ror#5 @ from BODY_00_15 + add r2,r2,r12 + eor r0,r0,r10,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r12,[r14],#4 @ *K256++ + add r5,r5,r2 @ h+=X[i] + str r2,[sp,#14*4] + eor r2,r11,r4 + add r5,r5,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r10 + add r5,r5,r12 @ h+=K256[i] + eor r2,r2,r4 @ Ch(e,f,g) + eor r0,r6,r6,ror#11 + add r5,r5,r2 @ h+=Ch(e,f,g) +#if 30==31 + and r12,r12,#0xff + cmp r12,#0xf2 @ done? 
+#endif +#if 30<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r12,r6,r7 @ a^b, b^c in next round +#else + ldr r2,[sp,#0*4] @ from future BODY_16_xx + eor r12,r6,r7 @ a^b, b^c in next round + ldr r1,[sp,#13*4] @ from future BODY_16_xx +#endif + eor r0,r0,r6,ror#20 @ Sigma0(a) + and r3,r3,r12 @ (b^c)&=(a^b) + add r9,r9,r5 @ d+=h + eor r3,r3,r7 @ Maj(a,b,c) + add r5,r5,r0,ror#2 @ h+=Sigma0(a) + @ add r5,r5,r3 @ h+=Maj(a,b,c) + @ ldr r2,[sp,#0*4] @ 31 + @ ldr r1,[sp,#13*4] + mov r0,r2,ror#7 + add r5,r5,r3 @ h+=Maj(a,b,c) from the past + mov r3,r1,ror#17 + eor r0,r0,r2,ror#18 + eor r3,r3,r1,ror#19 + eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) + ldr r2,[sp,#15*4] + eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) + ldr r1,[sp,#8*4] + + add r3,r3,r0 + eor r0,r9,r9,ror#5 @ from BODY_00_15 + add r2,r2,r3 + eor r0,r0,r9,ror#19 @ Sigma1(e) + add r2,r2,r1 @ X[i] + ldr r3,[r14],#4 @ *K256++ + add r4,r4,r2 @ h+=X[i] + str r2,[sp,#15*4] + eor r2,r10,r11 + add r4,r4,r0,ror#6 @ h+=Sigma1(e) + and r2,r2,r9 + add r4,r4,r3 @ h+=K256[i] + eor r2,r2,r11 @ Ch(e,f,g) + eor r0,r5,r5,ror#11 + add r4,r4,r2 @ h+=Ch(e,f,g) +#if 31==31 + and r3,r3,#0xff + cmp r3,#0xf2 @ done? 
+#endif +#if 31<15 +# if __ARM_ARCH>=7 + ldr r2,[r1],#4 @ prefetch +# else + ldrb r2,[r1,#3] +# endif + eor r3,r5,r6 @ a^b, b^c in next round +#else + ldr r2,[sp,#1*4] @ from future BODY_16_xx + eor r3,r5,r6 @ a^b, b^c in next round + ldr r1,[sp,#14*4] @ from future BODY_16_xx +#endif + eor r0,r0,r5,ror#20 @ Sigma0(a) + and r12,r12,r3 @ (b^c)&=(a^b) + add r8,r8,r4 @ d+=h + eor r12,r12,r6 @ Maj(a,b,c) + add r4,r4,r0,ror#2 @ h+=Sigma0(a) + @ add r4,r4,r12 @ h+=Maj(a,b,c) +#if __ARM_ARCH>=7 + ite eq @ Thumb2 thing, sanity check in ARM +#endif + ldreq r3,[sp,#16*4] @ pull ctx + bne .Lrounds_16_xx + + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + ldr r0,[r3,#0] + ldr r2,[r3,#4] + ldr r12,[r3,#8] + add r4,r4,r0 + ldr r0,[r3,#12] + add r5,r5,r2 + ldr r2,[r3,#16] + add r6,r6,r12 + ldr r12,[r3,#20] + add r7,r7,r0 + ldr r0,[r3,#24] + add r8,r8,r2 + ldr r2,[r3,#28] + add r9,r9,r12 + ldr r1,[sp,#17*4] @ pull inp + ldr r12,[sp,#18*4] @ pull inp+len + add r10,r10,r0 + add r11,r11,r2 + stmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} + cmp r1,r12 + sub r14,r14,#256 @ rewind Ktbl + bne .Loop + + add sp,sp,#19*4 @ destroy frame +#if __ARM_ARCH>=5 + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc} +#else + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet +.word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +.size sha256_block_data_order_nohw,.-sha256_block_data_order_nohw +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.LK256_shortcut_neon: +@ PC is 8 bytes ahead in Arm mode and 4 bytes ahead in Thumb mode. 
+#if defined(__thumb2__) +.word K256-(.LK256_add_neon+4) +#else +.word K256-(.LK256_add_neon+8) +#endif + +.globl sha256_block_data_order_neon +.hidden sha256_block_data_order_neon +.type sha256_block_data_order_neon,%function +.align 5 +.skip 16 +sha256_block_data_order_neon: + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + + sub r11,sp,#16*4+16 + + @ K256 is just at the boundary of being easily referenced by an ADR from + @ this function. In Arm mode, when building with __ARM_ARCH=6, it does + @ not fit. By moving code around, we could make it fit, but this is too + @ fragile. For simplicity, just load the offset from + @ .LK256_shortcut_neon. + @ + @ TODO(davidben): adrl would avoid a load, but clang-assembler does not + @ support it. We might be able to emulate it with a macro, but Android's + @ did not work when I tried it. + @ https://android.googlesource.com/platform/ndk/+/refs/heads/main/docs/ClangMigration.md#arm + ldr r14,.LK256_shortcut_neon +.LK256_add_neon: + add r14,pc,r14 + + bic r11,r11,#15 @ align for 128-bit stores + mov r12,sp + mov sp,r11 @ alloca + add r2,r1,r2,lsl#6 @ len to point at the end of inp + + vld1.8 {q0},[r1]! + vld1.8 {q1},[r1]! + vld1.8 {q2},[r1]! + vld1.8 {q3},[r1]! + vld1.32 {q8},[r14,:128]! + vld1.32 {q9},[r14,:128]! + vld1.32 {q10},[r14,:128]! + vld1.32 {q11},[r14,:128]! + vrev32.8 q0,q0 @ yes, even on + str r0,[sp,#64] + vrev32.8 q1,q1 @ big-endian + str r1,[sp,#68] + mov r1,sp + vrev32.8 q2,q2 + str r2,[sp,#72] + vrev32.8 q3,q3 + str r12,[sp,#76] @ save original sp + vadd.i32 q8,q8,q0 + vadd.i32 q9,q9,q1 + vst1.32 {q8},[r1,:128]! + vadd.i32 q10,q10,q2 + vst1.32 {q9},[r1,:128]! + vadd.i32 q11,q11,q3 + vst1.32 {q10},[r1,:128]! + vst1.32 {q11},[r1,:128]! 
+ + ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11} + sub r1,r1,#64 + ldr r2,[sp,#0] + eor r12,r12,r12 + eor r3,r5,r6 + b .L_00_48 + +.align 4 +.L_00_48: + vext.8 q8,q0,q1,#4 + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + vext.8 q9,q2,q3,#4 + add r4,r4,r12 + and r2,r2,r8 + eor r12,r0,r8,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vadd.i32 q0,q0,q9 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + vshr.u32 q9,q8,#3 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#4] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + veor q9,q9,q10 + add r10,r10,r2 + vsli.32 q11,q8,#14 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + vshr.u32 d24,d7,#17 + add r11,r11,r3 + and r2,r2,r7 + veor q9,q9,q11 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + vsli.32 d24,d7,#15 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + vshr.u32 d25,d7,#10 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + vadd.i32 q0,q0,q9 + add r10,r10,r2 + ldr r2,[sp,#8] + veor d25,d25,d24 + and r12,r12,r3 + add r6,r6,r10 + vshr.u32 d24,d7,#19 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + vsli.32 d24,d7,#13 + add r9,r9,r2 + eor r2,r7,r8 + veor d25,d25,d24 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + vadd.i32 d0,d0,d25 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + vshr.u32 d24,d0,#17 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + vsli.32 d24,d0,#15 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + vshr.u32 d25,d0,#10 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + veor d25,d25,d24 + ldr r2,[sp,#12] + and r3,r3,r12 + vshr.u32 d24,d0,#19 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + vld1.32 {q8},[r14,:128]! + add r8,r8,r2 + vsli.32 d24,d0,#13 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + veor d25,d25,d24 + add r9,r9,r3 + and r2,r2,r5 + vadd.i32 d1,d1,d25 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + vadd.i32 q8,q8,q0 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#16] + and r12,r12,r3 + add r4,r4,r8 + vst1.32 {q8},[r1,:128]! 
+ add r8,r8,r0,ror#2 + eor r12,r12,r10 + vext.8 q8,q1,q2,#4 + add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + vext.8 q9,q3,q0,#4 + add r8,r8,r12 + and r2,r2,r4 + eor r12,r0,r4,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vadd.i32 q1,q1,q9 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + vshr.u32 q9,q8,#3 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#20] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + veor q9,q9,q10 + add r6,r6,r2 + vsli.32 q11,q8,#14 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + vshr.u32 d24,d1,#17 + add r7,r7,r3 + and r2,r2,r11 + veor q9,q9,q11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + vsli.32 d24,d1,#15 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + vshr.u32 d25,d1,#10 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + vadd.i32 q1,q1,q9 + add r6,r6,r2 + ldr r2,[sp,#24] + veor d25,d25,d24 + and r12,r12,r3 + add r10,r10,r6 + vshr.u32 d24,d1,#19 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + vsli.32 d24,d1,#13 + add r5,r5,r2 + eor r2,r11,r4 + veor d25,d25,d24 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + vadd.i32 d2,d2,d25 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + vshr.u32 d24,d2,#17 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + vsli.32 d24,d2,#15 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + vshr.u32 d25,d2,#10 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + veor d25,d25,d24 + ldr r2,[sp,#28] + and r3,r3,r12 + vshr.u32 d24,d2,#19 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + vld1.32 {q8},[r14,:128]! + add r4,r4,r2 + vsli.32 d24,d2,#13 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + veor d25,d25,d24 + add r5,r5,r3 + and r2,r2,r9 + vadd.i32 d3,d3,d25 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + vadd.i32 q8,q8,q1 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[sp,#32] + and r12,r12,r3 + add r8,r8,r4 + vst1.32 {q8},[r1,:128]! 
+ add r4,r4,r0,ror#2 + eor r12,r12,r6 + vext.8 q8,q2,q3,#4 + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + vext.8 q9,q0,q1,#4 + add r4,r4,r12 + and r2,r2,r8 + eor r12,r0,r8,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vadd.i32 q2,q2,q9 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + vshr.u32 q9,q8,#3 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#36] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + veor q9,q9,q10 + add r10,r10,r2 + vsli.32 q11,q8,#14 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + vshr.u32 d24,d3,#17 + add r11,r11,r3 + and r2,r2,r7 + veor q9,q9,q11 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + vsli.32 d24,d3,#15 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + vshr.u32 d25,d3,#10 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + vadd.i32 q2,q2,q9 + add r10,r10,r2 + ldr r2,[sp,#40] + veor d25,d25,d24 + and r12,r12,r3 + add r6,r6,r10 + vshr.u32 d24,d3,#19 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + vsli.32 d24,d3,#13 + add r9,r9,r2 + eor r2,r7,r8 + veor d25,d25,d24 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + vadd.i32 d4,d4,d25 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + vshr.u32 d24,d4,#17 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + vsli.32 d24,d4,#15 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + vshr.u32 d25,d4,#10 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + veor d25,d25,d24 + ldr r2,[sp,#44] + and r3,r3,r12 + vshr.u32 d24,d4,#19 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + vld1.32 {q8},[r14,:128]! + add r8,r8,r2 + vsli.32 d24,d4,#13 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + veor d25,d25,d24 + add r9,r9,r3 + and r2,r2,r5 + vadd.i32 d5,d5,d25 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + vadd.i32 q8,q8,q2 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#48] + and r12,r12,r3 + add r4,r4,r8 + vst1.32 {q8},[r1,:128]! 
+ add r8,r8,r0,ror#2 + eor r12,r12,r10 + vext.8 q8,q3,q0,#4 + add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + vext.8 q9,q1,q2,#4 + add r8,r8,r12 + and r2,r2,r4 + eor r12,r0,r4,ror#19 + vshr.u32 q10,q8,#7 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vadd.i32 q3,q3,q9 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + vshr.u32 q9,q8,#3 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vsli.32 q10,q8,#25 + ldr r2,[sp,#52] + and r3,r3,r12 + vshr.u32 q11,q8,#18 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + veor q9,q9,q10 + add r6,r6,r2 + vsli.32 q11,q8,#14 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + vshr.u32 d24,d5,#17 + add r7,r7,r3 + and r2,r2,r11 + veor q9,q9,q11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + vsli.32 d24,d5,#15 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + vshr.u32 d25,d5,#10 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + vadd.i32 q3,q3,q9 + add r6,r6,r2 + ldr r2,[sp,#56] + veor d25,d25,d24 + and r12,r12,r3 + add r10,r10,r6 + vshr.u32 d24,d5,#19 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + vsli.32 d24,d5,#13 + add r5,r5,r2 + eor r2,r11,r4 + veor d25,d25,d24 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + vadd.i32 d6,d6,d25 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + vshr.u32 d24,d6,#17 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + vsli.32 d24,d6,#15 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + vshr.u32 d25,d6,#10 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + veor d25,d25,d24 + ldr r2,[sp,#60] + and r3,r3,r12 + vshr.u32 d24,d6,#19 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + vld1.32 {q8},[r14,:128]! + add r4,r4,r2 + vsli.32 d24,d6,#13 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + veor d25,d25,d24 + add r5,r5,r3 + and r2,r2,r9 + vadd.i32 d7,d7,d25 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + vadd.i32 q8,q8,q3 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[r14] + and r12,r12,r3 + add r8,r8,r4 + vst1.32 {q8},[r1,:128]! 
+ add r4,r4,r0,ror#2 + eor r12,r12,r6 + teq r2,#0 @ check for K256 terminator + ldr r2,[sp,#0] + sub r1,r1,#64 + bne .L_00_48 + + ldr r1,[sp,#68] + ldr r0,[sp,#72] + sub r14,r14,#256 @ rewind r14 + teq r1,r0 + it eq + subeq r1,r1,#64 @ avoid SEGV + vld1.8 {q0},[r1]! @ load next input block + vld1.8 {q1},[r1]! + vld1.8 {q2},[r1]! + vld1.8 {q3},[r1]! + it ne + strne r1,[sp,#68] + mov r1,sp + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + add r4,r4,r12 + vld1.32 {q8},[r14,:128]! + and r2,r2,r8 + eor r12,r0,r8,ror#19 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vrev32.8 q0,q0 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vadd.i32 q8,q8,q0 + ldr r2,[sp,#4] + and r3,r3,r12 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + add r10,r10,r2 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + add r11,r11,r3 + and r2,r2,r7 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + add r10,r10,r2 + ldr r2,[sp,#8] + and r12,r12,r3 + add r6,r6,r10 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + add r9,r9,r2 + eor r2,r7,r8 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + ldr r2,[sp,#12] + and r3,r3,r12 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + add r8,r8,r2 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + add r9,r9,r3 + and r2,r2,r5 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#16] + and r12,r12,r3 + add r4,r4,r8 + add r8,r8,r0,ror#2 + eor r12,r12,r10 + vst1.32 {q8},[r1,:128]! + add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + add r8,r8,r12 + vld1.32 {q8},[r14,:128]! 
+ and r2,r2,r4 + eor r12,r0,r4,ror#19 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vrev32.8 q1,q1 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vadd.i32 q8,q8,q1 + ldr r2,[sp,#20] + and r3,r3,r12 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + add r6,r6,r2 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + add r7,r7,r3 + and r2,r2,r11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + add r6,r6,r2 + ldr r2,[sp,#24] + and r12,r12,r3 + add r10,r10,r6 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + add r5,r5,r2 + eor r2,r11,r4 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + ldr r2,[sp,#28] + and r3,r3,r12 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + add r4,r4,r2 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + add r5,r5,r3 + and r2,r2,r9 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[sp,#32] + and r12,r12,r3 + add r8,r8,r4 + add r4,r4,r0,ror#2 + eor r12,r12,r6 + vst1.32 {q8},[r1,:128]! + add r11,r11,r2 + eor r2,r9,r10 + eor r0,r8,r8,ror#5 + add r4,r4,r12 + vld1.32 {q8},[r14,:128]! 
+ and r2,r2,r8 + eor r12,r0,r8,ror#19 + eor r0,r4,r4,ror#11 + eor r2,r2,r10 + vrev32.8 q2,q2 + add r11,r11,r12,ror#6 + eor r12,r4,r5 + eor r0,r0,r4,ror#20 + add r11,r11,r2 + vadd.i32 q8,q8,q2 + ldr r2,[sp,#36] + and r3,r3,r12 + add r7,r7,r11 + add r11,r11,r0,ror#2 + eor r3,r3,r5 + add r10,r10,r2 + eor r2,r8,r9 + eor r0,r7,r7,ror#5 + add r11,r11,r3 + and r2,r2,r7 + eor r3,r0,r7,ror#19 + eor r0,r11,r11,ror#11 + eor r2,r2,r9 + add r10,r10,r3,ror#6 + eor r3,r11,r4 + eor r0,r0,r11,ror#20 + add r10,r10,r2 + ldr r2,[sp,#40] + and r12,r12,r3 + add r6,r6,r10 + add r10,r10,r0,ror#2 + eor r12,r12,r4 + add r9,r9,r2 + eor r2,r7,r8 + eor r0,r6,r6,ror#5 + add r10,r10,r12 + and r2,r2,r6 + eor r12,r0,r6,ror#19 + eor r0,r10,r10,ror#11 + eor r2,r2,r8 + add r9,r9,r12,ror#6 + eor r12,r10,r11 + eor r0,r0,r10,ror#20 + add r9,r9,r2 + ldr r2,[sp,#44] + and r3,r3,r12 + add r5,r5,r9 + add r9,r9,r0,ror#2 + eor r3,r3,r11 + add r8,r8,r2 + eor r2,r6,r7 + eor r0,r5,r5,ror#5 + add r9,r9,r3 + and r2,r2,r5 + eor r3,r0,r5,ror#19 + eor r0,r9,r9,ror#11 + eor r2,r2,r7 + add r8,r8,r3,ror#6 + eor r3,r9,r10 + eor r0,r0,r9,ror#20 + add r8,r8,r2 + ldr r2,[sp,#48] + and r12,r12,r3 + add r4,r4,r8 + add r8,r8,r0,ror#2 + eor r12,r12,r10 + vst1.32 {q8},[r1,:128]! + add r7,r7,r2 + eor r2,r5,r6 + eor r0,r4,r4,ror#5 + add r8,r8,r12 + vld1.32 {q8},[r14,:128]! 
+ and r2,r2,r4 + eor r12,r0,r4,ror#19 + eor r0,r8,r8,ror#11 + eor r2,r2,r6 + vrev32.8 q3,q3 + add r7,r7,r12,ror#6 + eor r12,r8,r9 + eor r0,r0,r8,ror#20 + add r7,r7,r2 + vadd.i32 q8,q8,q3 + ldr r2,[sp,#52] + and r3,r3,r12 + add r11,r11,r7 + add r7,r7,r0,ror#2 + eor r3,r3,r9 + add r6,r6,r2 + eor r2,r4,r5 + eor r0,r11,r11,ror#5 + add r7,r7,r3 + and r2,r2,r11 + eor r3,r0,r11,ror#19 + eor r0,r7,r7,ror#11 + eor r2,r2,r5 + add r6,r6,r3,ror#6 + eor r3,r7,r8 + eor r0,r0,r7,ror#20 + add r6,r6,r2 + ldr r2,[sp,#56] + and r12,r12,r3 + add r10,r10,r6 + add r6,r6,r0,ror#2 + eor r12,r12,r8 + add r5,r5,r2 + eor r2,r11,r4 + eor r0,r10,r10,ror#5 + add r6,r6,r12 + and r2,r2,r10 + eor r12,r0,r10,ror#19 + eor r0,r6,r6,ror#11 + eor r2,r2,r4 + add r5,r5,r12,ror#6 + eor r12,r6,r7 + eor r0,r0,r6,ror#20 + add r5,r5,r2 + ldr r2,[sp,#60] + and r3,r3,r12 + add r9,r9,r5 + add r5,r5,r0,ror#2 + eor r3,r3,r7 + add r4,r4,r2 + eor r2,r10,r11 + eor r0,r9,r9,ror#5 + add r5,r5,r3 + and r2,r2,r9 + eor r3,r0,r9,ror#19 + eor r0,r5,r5,ror#11 + eor r2,r2,r11 + add r4,r4,r3,ror#6 + eor r3,r5,r6 + eor r0,r0,r5,ror#20 + add r4,r4,r2 + ldr r2,[sp,#64] + and r12,r12,r3 + add r8,r8,r4 + add r4,r4,r0,ror#2 + eor r12,r12,r6 + vst1.32 {q8},[r1,:128]! 
+ ldr r0,[r2,#0] + add r4,r4,r12 @ h+=Maj(a,b,c) from the past + ldr r12,[r2,#4] + ldr r3,[r2,#8] + ldr r1,[r2,#12] + add r4,r4,r0 @ accumulate + ldr r0,[r2,#16] + add r5,r5,r12 + ldr r12,[r2,#20] + add r6,r6,r3 + ldr r3,[r2,#24] + add r7,r7,r1 + ldr r1,[r2,#28] + add r8,r8,r0 + str r4,[r2],#4 + add r9,r9,r12 + str r5,[r2],#4 + add r10,r10,r3 + str r6,[r2],#4 + add r11,r11,r1 + str r7,[r2],#4 + stmia r2,{r8,r9,r10,r11} + + ittte ne + movne r1,sp + ldrne r2,[sp,#0] + eorne r12,r12,r12 + ldreq sp,[sp,#76] @ restore original sp + itt ne + eorne r3,r5,r6 + bne .L_00_48 + + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} +.size sha256_block_data_order_neon,.-sha256_block_data_order_neon +#endif +#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) + +# if defined(__thumb2__) +# define INST(a,b,c,d) .byte c,d|0xc,a,b +# else +# define INST(a,b,c,d) .byte a,b,c,d +# endif + +.LK256_shortcut_hw: +@ PC is 8 bytes ahead in Arm mode and 4 bytes ahead in Thumb mode. +#if defined(__thumb2__) +.word K256-(.LK256_add_hw+4) +#else +.word K256-(.LK256_add_hw+8) +#endif + +.globl sha256_block_data_order_hw +.hidden sha256_block_data_order_hw +.type sha256_block_data_order_hw,%function +.align 5 +sha256_block_data_order_hw: + @ K256 is too far to reference from one ADR command in Thumb mode. In + @ Arm mode, we could make it fit by aligning the ADR offset to a 64-byte + @ boundary. For simplicity, just load the offset from .LK256_shortcut_hw. + ldr r3,.LK256_shortcut_hw +.LK256_add_hw: + add r3,pc,r3 + + vld1.32 {q0,q1},[r0] + add r2,r1,r2,lsl#6 @ len to point at the end of inp + b .Loop_v8 + +.align 4 +.Loop_v8: + vld1.8 {q8,q9},[r1]! + vld1.8 {q10,q11},[r1]! + vld1.32 {q12},[r3]! + vrev32.8 q8,q8 + vrev32.8 q9,q9 + vrev32.8 q10,q10 + vrev32.8 q11,q11 + vmov q14,q0 @ offload + vmov q15,q1 + teq r1,r2 + vld1.32 {q13},[r3]! 
+ vadd.i32 q12,q12,q8 + INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q9 + INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q10 + INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q11 + INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q8 + INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q9 + INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q10 + INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 + vld1.32 {q12},[r3]! 
+ vadd.i32 q13,q13,q11 + INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q8 + INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q9 + INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q10 + INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 + vld1.32 {q12},[r3]! + vadd.i32 q13,q13,q11 + INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 + vld1.32 {q13},[r3]! + vadd.i32 q12,q12,q8 + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + + vld1.32 {q12},[r3]! 
+ vadd.i32 q13,q13,q9 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + + vld1.32 {q13},[r3] + vadd.i32 q12,q12,q10 + sub r3,r3,#256-16 @ rewind + vmov q2,q0 + INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 + INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 + + vadd.i32 q13,q13,q11 + vmov q2,q0 + INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 + INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 + + vadd.i32 q0,q0,q14 + vadd.i32 q1,q1,q15 + it ne + bne .Loop_v8 + + vst1.32 {q0,q1},[r0] + + bx lr @ bx lr +.size sha256_block_data_order_hw,.-sha256_block_data_order_hw +#endif +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,47,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) diff --git a/third_party/boringssl/gen/bcm/sha256-armv8-apple.S b/third_party/boringssl/gen/bcm/sha256-armv8-apple.S new file mode 100644 index 00000000..0cb6f72b --- /dev/null +++ b/third_party/boringssl/gen/bcm/sha256-armv8-apple.S @@ -0,0 +1,1192 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) +// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +// ==================================================================== +// Written by Andy Polyakov for the OpenSSL +// project. +// ==================================================================== +// +// SHA256/512 for ARMv8. +// +// Performance in cycles per processed byte and improvement coefficient +// over code generated with "default" compiler: +// +// SHA256-hw SHA256(*) SHA512 +// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) +// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +// Denver 2.01 10.5 (+26%) 6.70 (+8%) +// X-Gene 20.0 (+100%) 12.8 (+300%(***)) +// Mongoose 2.36 13.0 (+50%) 8.36 (+33%) +// Kryo 1.92 17.4 (+30%) 11.2 (+8%) +// +// (*) Software SHA256 results are of lesser relevance, presented +// mostly for informational purposes. +// (**) The result is a trade-off: it's possible to improve it by +// 10% (or by 1 cycle per round), but at the cost of 20% loss +// on Cortex-A53 (or by 4 cycles per round). +// (***) Super-impressive coefficients over gcc-generated code are +// indication of some compiler "pathology", most notably code +// generated with -mgeneral-regs-only is significantly faster +// and the gap is only 40-90%. + +.text + +.globl _sha256_block_data_order_nohw +.private_extern _sha256_block_data_order_nohw + +.align 6 +_sha256_block_data_order_nohw: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*4 + + ldp w20,w21,[x0] // load context + ldp w22,w23,[x0,#2*4] + ldp w24,w25,[x0,#4*4] + add x2,x1,x2,lsl#6 // end of input + ldp w26,w27,[x0,#6*4] + adrp x30,LK256@PAGE + add x30,x30,LK256@PAGEOFF + stp x0,x2,[x29,#96] + +Loop: + ldp w3,w4,[x1],#2*4 + ldr w19,[x30],#4 // *K++ + eor w28,w21,w22 // magic seed + str x1,[x29,#112] +#ifndef __AARCH64EB__ + rev w3,w3 // 0 +#endif + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + eor w6,w24,w24,ror#14 + and w17,w25,w24 + bic w19,w26,w24 + add w27,w27,w3 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w6,ror#11 // Sigma1(e) + ror w6,w20,#2 + add w27,w27,w17 // h+=Ch(e,f,g) + eor w17,w20,w20,ror#9 + add w27,w27,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w23,w23,w27 // d+=h + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w6,w17,ror#13 // Sigma0(a) + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w27,w27,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w4,w4 // 1 +#endif + ldp w5,w6,[x1],#2*4 + add w27,w27,w17 // h+=Sigma0(a) + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + eor w7,w23,w23,ror#14 + and w17,w24,w23 + bic w28,w25,w23 + add w26,w26,w4 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w7,ror#11 // Sigma1(e) + ror w7,w27,#2 + add w26,w26,w17 // h+=Ch(e,f,g) + eor w17,w27,w27,ror#9 + add w26,w26,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w22,w22,w26 // d+=h + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w7,w17,ror#13 // Sigma0(a) + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w26,w26,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w5,w5 // 2 +#endif + add w26,w26,w17 // h+=Sigma0(a) + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + eor w8,w22,w22,ror#14 + and w17,w23,w22 
+ bic w19,w24,w22 + add w25,w25,w5 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w8,ror#11 // Sigma1(e) + ror w8,w26,#2 + add w25,w25,w17 // h+=Ch(e,f,g) + eor w17,w26,w26,ror#9 + add w25,w25,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w21,w21,w25 // d+=h + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w8,w17,ror#13 // Sigma0(a) + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w25,w25,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w6,w6 // 3 +#endif + ldp w7,w8,[x1],#2*4 + add w25,w25,w17 // h+=Sigma0(a) + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + eor w9,w21,w21,ror#14 + and w17,w22,w21 + bic w28,w23,w21 + add w24,w24,w6 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w9,ror#11 // Sigma1(e) + ror w9,w25,#2 + add w24,w24,w17 // h+=Ch(e,f,g) + eor w17,w25,w25,ror#9 + add w24,w24,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w20,w20,w24 // d+=h + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w9,w17,ror#13 // Sigma0(a) + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w24,w24,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w7,w7 // 4 +#endif + add w24,w24,w17 // h+=Sigma0(a) + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + eor w10,w20,w20,ror#14 + and w17,w21,w20 + bic w19,w22,w20 + add w23,w23,w7 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w10,ror#11 // Sigma1(e) + ror w10,w24,#2 + add w23,w23,w17 // h+=Ch(e,f,g) + eor w17,w24,w24,ror#9 + add w23,w23,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w27,w27,w23 // d+=h + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w10,w17,ror#13 // Sigma0(a) + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w23,w23,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w8,w8 // 5 +#endif + ldp w9,w10,[x1],#2*4 + add w23,w23,w17 // 
h+=Sigma0(a) + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + eor w11,w27,w27,ror#14 + and w17,w20,w27 + bic w28,w21,w27 + add w22,w22,w8 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w11,ror#11 // Sigma1(e) + ror w11,w23,#2 + add w22,w22,w17 // h+=Ch(e,f,g) + eor w17,w23,w23,ror#9 + add w22,w22,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w26,w26,w22 // d+=h + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w11,w17,ror#13 // Sigma0(a) + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w22,w22,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w9,w9 // 6 +#endif + add w22,w22,w17 // h+=Sigma0(a) + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + eor w12,w26,w26,ror#14 + and w17,w27,w26 + bic w19,w20,w26 + add w21,w21,w9 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w12,ror#11 // Sigma1(e) + ror w12,w22,#2 + add w21,w21,w17 // h+=Ch(e,f,g) + eor w17,w22,w22,ror#9 + add w21,w21,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w25,w25,w21 // d+=h + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w12,w17,ror#13 // Sigma0(a) + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w21,w21,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w10,w10 // 7 +#endif + ldp w11,w12,[x1],#2*4 + add w21,w21,w17 // h+=Sigma0(a) + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + eor w13,w25,w25,ror#14 + and w17,w26,w25 + bic w28,w27,w25 + add w20,w20,w10 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w13,ror#11 // Sigma1(e) + ror w13,w21,#2 + add w20,w20,w17 // h+=Ch(e,f,g) + eor w17,w21,w21,ror#9 + add w20,w20,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w24,w24,w20 // d+=h + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w13,w17,ror#13 // Sigma0(a) + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w20,w20,w17 
// h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w11,w11 // 8 +#endif + add w20,w20,w17 // h+=Sigma0(a) + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + eor w14,w24,w24,ror#14 + and w17,w25,w24 + bic w19,w26,w24 + add w27,w27,w11 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w14,ror#11 // Sigma1(e) + ror w14,w20,#2 + add w27,w27,w17 // h+=Ch(e,f,g) + eor w17,w20,w20,ror#9 + add w27,w27,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w23,w23,w27 // d+=h + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w14,w17,ror#13 // Sigma0(a) + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w27,w27,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w12,w12 // 9 +#endif + ldp w13,w14,[x1],#2*4 + add w27,w27,w17 // h+=Sigma0(a) + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + eor w15,w23,w23,ror#14 + and w17,w24,w23 + bic w28,w25,w23 + add w26,w26,w12 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w15,ror#11 // Sigma1(e) + ror w15,w27,#2 + add w26,w26,w17 // h+=Ch(e,f,g) + eor w17,w27,w27,ror#9 + add w26,w26,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w22,w22,w26 // d+=h + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w15,w17,ror#13 // Sigma0(a) + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w26,w26,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w13,w13 // 10 +#endif + add w26,w26,w17 // h+=Sigma0(a) + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + eor w0,w22,w22,ror#14 + and w17,w23,w22 + bic w19,w24,w22 + add w25,w25,w13 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w0,ror#11 // Sigma1(e) + ror w0,w26,#2 + add w25,w25,w17 // h+=Ch(e,f,g) + eor w17,w26,w26,ror#9 + add w25,w25,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w21,w21,w25 // d+=h + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w0,w17,ror#13 // Sigma0(a) + add 
w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w25,w25,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w14,w14 // 11 +#endif + ldp w15,w0,[x1],#2*4 + add w25,w25,w17 // h+=Sigma0(a) + str w6,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + eor w6,w21,w21,ror#14 + and w17,w22,w21 + bic w28,w23,w21 + add w24,w24,w14 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w6,ror#11 // Sigma1(e) + ror w6,w25,#2 + add w24,w24,w17 // h+=Ch(e,f,g) + eor w17,w25,w25,ror#9 + add w24,w24,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w20,w20,w24 // d+=h + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w6,w17,ror#13 // Sigma0(a) + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w24,w24,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w15,w15 // 12 +#endif + add w24,w24,w17 // h+=Sigma0(a) + str w7,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + eor w7,w20,w20,ror#14 + and w17,w21,w20 + bic w19,w22,w20 + add w23,w23,w15 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w7,ror#11 // Sigma1(e) + ror w7,w24,#2 + add w23,w23,w17 // h+=Ch(e,f,g) + eor w17,w24,w24,ror#9 + add w23,w23,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w27,w27,w23 // d+=h + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w7,w17,ror#13 // Sigma0(a) + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w23,w23,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w0,w0 // 13 +#endif + ldp w1,w2,[x1] + add w23,w23,w17 // h+=Sigma0(a) + str w8,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + eor w8,w27,w27,ror#14 + and w17,w20,w27 + bic w28,w21,w27 + add w22,w22,w0 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w8,ror#11 // Sigma1(e) + ror w8,w23,#2 + add w22,w22,w17 // h+=Ch(e,f,g) + eor w17,w23,w23,ror#9 + add w22,w22,w16 // 
h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w26,w26,w22 // d+=h + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w8,w17,ror#13 // Sigma0(a) + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w22,w22,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w1,w1 // 14 +#endif + ldr w6,[sp,#12] + add w22,w22,w17 // h+=Sigma0(a) + str w9,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + eor w9,w26,w26,ror#14 + and w17,w27,w26 + bic w19,w20,w26 + add w21,w21,w1 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w9,ror#11 // Sigma1(e) + ror w9,w22,#2 + add w21,w21,w17 // h+=Ch(e,f,g) + eor w17,w22,w22,ror#9 + add w21,w21,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w25,w25,w21 // d+=h + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w9,w17,ror#13 // Sigma0(a) + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w21,w21,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w2,w2 // 15 +#endif + ldr w7,[sp,#0] + add w21,w21,w17 // h+=Sigma0(a) + str w10,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w9,w4,#7 + and w17,w26,w25 + ror w8,w1,#17 + bic w28,w27,w25 + ror w10,w21,#2 + add w20,w20,w2 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w9,w9,w4,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w10,w10,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w8,w8,w1,ror#19 + eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w10,w21,ror#22 // Sigma0(a) + eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) + add w3,w3,w12 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w3,w3,w9 + add w20,w20,w17 // h+=Sigma0(a) + add w3,w3,w8 +Loop_16_xx: + ldr w8,[sp,#4] + str w11,[sp,#0] + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + ror 
w10,w5,#7 + and w17,w25,w24 + ror w9,w2,#17 + bic w19,w26,w24 + ror w11,w20,#2 + add w27,w27,w3 // h+=X[i] + eor w16,w16,w24,ror#11 + eor w10,w10,w5,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w24,ror#25 // Sigma1(e) + eor w11,w11,w20,ror#13 + add w27,w27,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w9,w9,w2,ror#19 + eor w10,w10,w5,lsr#3 // sigma0(X[i+1]) + add w27,w27,w16 // h+=Sigma1(e) + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w11,w20,ror#22 // Sigma0(a) + eor w9,w9,w2,lsr#10 // sigma1(X[i+14]) + add w4,w4,w13 + add w23,w23,w27 // d+=h + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w4,w4,w10 + add w27,w27,w17 // h+=Sigma0(a) + add w4,w4,w9 + ldr w9,[sp,#8] + str w12,[sp,#4] + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + ror w11,w6,#7 + and w17,w24,w23 + ror w10,w3,#17 + bic w28,w25,w23 + ror w12,w27,#2 + add w26,w26,w4 // h+=X[i] + eor w16,w16,w23,ror#11 + eor w11,w11,w6,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w23,ror#25 // Sigma1(e) + eor w12,w12,w27,ror#13 + add w26,w26,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w10,w10,w3,ror#19 + eor w11,w11,w6,lsr#3 // sigma0(X[i+1]) + add w26,w26,w16 // h+=Sigma1(e) + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w12,w27,ror#22 // Sigma0(a) + eor w10,w10,w3,lsr#10 // sigma1(X[i+14]) + add w5,w5,w14 + add w22,w22,w26 // d+=h + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w5,w5,w11 + add w26,w26,w17 // h+=Sigma0(a) + add w5,w5,w10 + ldr w10,[sp,#12] + str w13,[sp,#8] + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + ror w12,w7,#7 + and w17,w23,w22 + ror w11,w4,#17 + bic w19,w24,w22 + ror w13,w26,#2 + add w25,w25,w5 // h+=X[i] + eor w16,w16,w22,ror#11 + eor w12,w12,w7,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w22,ror#25 // Sigma1(e) + eor w13,w13,w26,ror#13 + add 
w25,w25,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w11,w11,w4,ror#19 + eor w12,w12,w7,lsr#3 // sigma0(X[i+1]) + add w25,w25,w16 // h+=Sigma1(e) + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w13,w26,ror#22 // Sigma0(a) + eor w11,w11,w4,lsr#10 // sigma1(X[i+14]) + add w6,w6,w15 + add w21,w21,w25 // d+=h + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w6,w6,w12 + add w25,w25,w17 // h+=Sigma0(a) + add w6,w6,w11 + ldr w11,[sp,#0] + str w14,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + ror w13,w8,#7 + and w17,w22,w21 + ror w12,w5,#17 + bic w28,w23,w21 + ror w14,w25,#2 + add w24,w24,w6 // h+=X[i] + eor w16,w16,w21,ror#11 + eor w13,w13,w8,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w21,ror#25 // Sigma1(e) + eor w14,w14,w25,ror#13 + add w24,w24,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w12,w12,w5,ror#19 + eor w13,w13,w8,lsr#3 // sigma0(X[i+1]) + add w24,w24,w16 // h+=Sigma1(e) + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w14,w25,ror#22 // Sigma0(a) + eor w12,w12,w5,lsr#10 // sigma1(X[i+14]) + add w7,w7,w0 + add w20,w20,w24 // d+=h + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w7,w7,w13 + add w24,w24,w17 // h+=Sigma0(a) + add w7,w7,w12 + ldr w12,[sp,#4] + str w15,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + ror w14,w9,#7 + and w17,w21,w20 + ror w13,w6,#17 + bic w19,w22,w20 + ror w15,w24,#2 + add w23,w23,w7 // h+=X[i] + eor w16,w16,w20,ror#11 + eor w14,w14,w9,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w20,ror#25 // Sigma1(e) + eor w15,w15,w24,ror#13 + add w23,w23,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w13,w13,w6,ror#19 + eor w14,w14,w9,lsr#3 // sigma0(X[i+1]) + add w23,w23,w16 // h+=Sigma1(e) + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w15,w24,ror#22 // Sigma0(a) + eor w13,w13,w6,lsr#10 // sigma1(X[i+14]) + add w8,w8,w1 + add 
w27,w27,w23 // d+=h + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w8,w8,w14 + add w23,w23,w17 // h+=Sigma0(a) + add w8,w8,w13 + ldr w13,[sp,#8] + str w0,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + ror w15,w10,#7 + and w17,w20,w27 + ror w14,w7,#17 + bic w28,w21,w27 + ror w0,w23,#2 + add w22,w22,w8 // h+=X[i] + eor w16,w16,w27,ror#11 + eor w15,w15,w10,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w27,ror#25 // Sigma1(e) + eor w0,w0,w23,ror#13 + add w22,w22,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w14,w14,w7,ror#19 + eor w15,w15,w10,lsr#3 // sigma0(X[i+1]) + add w22,w22,w16 // h+=Sigma1(e) + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w0,w23,ror#22 // Sigma0(a) + eor w14,w14,w7,lsr#10 // sigma1(X[i+14]) + add w9,w9,w2 + add w26,w26,w22 // d+=h + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w9,w9,w15 + add w22,w22,w17 // h+=Sigma0(a) + add w9,w9,w14 + ldr w14,[sp,#12] + str w1,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + ror w0,w11,#7 + and w17,w27,w26 + ror w15,w8,#17 + bic w19,w20,w26 + ror w1,w22,#2 + add w21,w21,w9 // h+=X[i] + eor w16,w16,w26,ror#11 + eor w0,w0,w11,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w26,ror#25 // Sigma1(e) + eor w1,w1,w22,ror#13 + add w21,w21,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w15,w15,w8,ror#19 + eor w0,w0,w11,lsr#3 // sigma0(X[i+1]) + add w21,w21,w16 // h+=Sigma1(e) + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w1,w22,ror#22 // Sigma0(a) + eor w15,w15,w8,lsr#10 // sigma1(X[i+14]) + add w10,w10,w3 + add w25,w25,w21 // d+=h + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w10,w10,w0 + add w21,w21,w17 // h+=Sigma0(a) + add w10,w10,w15 + ldr w15,[sp,#0] + str w2,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w1,w12,#7 + and w17,w26,w25 + ror w0,w9,#17 + 
bic w28,w27,w25 + ror w2,w21,#2 + add w20,w20,w10 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w1,w1,w12,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w2,w2,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w0,w0,w9,ror#19 + eor w1,w1,w12,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w2,w21,ror#22 // Sigma0(a) + eor w0,w0,w9,lsr#10 // sigma1(X[i+14]) + add w11,w11,w4 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w11,w11,w1 + add w20,w20,w17 // h+=Sigma0(a) + add w11,w11,w0 + ldr w0,[sp,#4] + str w3,[sp,#0] + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + ror w2,w13,#7 + and w17,w25,w24 + ror w1,w10,#17 + bic w19,w26,w24 + ror w3,w20,#2 + add w27,w27,w11 // h+=X[i] + eor w16,w16,w24,ror#11 + eor w2,w2,w13,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w24,ror#25 // Sigma1(e) + eor w3,w3,w20,ror#13 + add w27,w27,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w1,w1,w10,ror#19 + eor w2,w2,w13,lsr#3 // sigma0(X[i+1]) + add w27,w27,w16 // h+=Sigma1(e) + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w3,w20,ror#22 // Sigma0(a) + eor w1,w1,w10,lsr#10 // sigma1(X[i+14]) + add w12,w12,w5 + add w23,w23,w27 // d+=h + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w12,w12,w2 + add w27,w27,w17 // h+=Sigma0(a) + add w12,w12,w1 + ldr w1,[sp,#8] + str w4,[sp,#4] + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + ror w3,w14,#7 + and w17,w24,w23 + ror w2,w11,#17 + bic w28,w25,w23 + ror w4,w27,#2 + add w26,w26,w12 // h+=X[i] + eor w16,w16,w23,ror#11 + eor w3,w3,w14,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w23,ror#25 // Sigma1(e) + eor w4,w4,w27,ror#13 + add w26,w26,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // 
(b^c)&=(a^b) + eor w2,w2,w11,ror#19 + eor w3,w3,w14,lsr#3 // sigma0(X[i+1]) + add w26,w26,w16 // h+=Sigma1(e) + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w4,w27,ror#22 // Sigma0(a) + eor w2,w2,w11,lsr#10 // sigma1(X[i+14]) + add w13,w13,w6 + add w22,w22,w26 // d+=h + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w13,w13,w3 + add w26,w26,w17 // h+=Sigma0(a) + add w13,w13,w2 + ldr w2,[sp,#12] + str w5,[sp,#8] + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + ror w4,w15,#7 + and w17,w23,w22 + ror w3,w12,#17 + bic w19,w24,w22 + ror w5,w26,#2 + add w25,w25,w13 // h+=X[i] + eor w16,w16,w22,ror#11 + eor w4,w4,w15,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w22,ror#25 // Sigma1(e) + eor w5,w5,w26,ror#13 + add w25,w25,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w3,w3,w12,ror#19 + eor w4,w4,w15,lsr#3 // sigma0(X[i+1]) + add w25,w25,w16 // h+=Sigma1(e) + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w5,w26,ror#22 // Sigma0(a) + eor w3,w3,w12,lsr#10 // sigma1(X[i+14]) + add w14,w14,w7 + add w21,w21,w25 // d+=h + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w14,w14,w4 + add w25,w25,w17 // h+=Sigma0(a) + add w14,w14,w3 + ldr w3,[sp,#0] + str w6,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + ror w5,w0,#7 + and w17,w22,w21 + ror w4,w13,#17 + bic w28,w23,w21 + ror w6,w25,#2 + add w24,w24,w14 // h+=X[i] + eor w16,w16,w21,ror#11 + eor w5,w5,w0,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w21,ror#25 // Sigma1(e) + eor w6,w6,w25,ror#13 + add w24,w24,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w4,w4,w13,ror#19 + eor w5,w5,w0,lsr#3 // sigma0(X[i+1]) + add w24,w24,w16 // h+=Sigma1(e) + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w6,w25,ror#22 // Sigma0(a) + eor w4,w4,w13,lsr#10 // sigma1(X[i+14]) + add w15,w15,w8 + add w20,w20,w24 // d+=h + add w24,w24,w19 // h+=Maj(a,b,c) + ldr 
w19,[x30],#4 // *K++, w28 in next round + add w15,w15,w5 + add w24,w24,w17 // h+=Sigma0(a) + add w15,w15,w4 + ldr w4,[sp,#4] + str w7,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + ror w6,w1,#7 + and w17,w21,w20 + ror w5,w14,#17 + bic w19,w22,w20 + ror w7,w24,#2 + add w23,w23,w15 // h+=X[i] + eor w16,w16,w20,ror#11 + eor w6,w6,w1,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w20,ror#25 // Sigma1(e) + eor w7,w7,w24,ror#13 + add w23,w23,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w5,w5,w14,ror#19 + eor w6,w6,w1,lsr#3 // sigma0(X[i+1]) + add w23,w23,w16 // h+=Sigma1(e) + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w7,w24,ror#22 // Sigma0(a) + eor w5,w5,w14,lsr#10 // sigma1(X[i+14]) + add w0,w0,w9 + add w27,w27,w23 // d+=h + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w0,w0,w6 + add w23,w23,w17 // h+=Sigma0(a) + add w0,w0,w5 + ldr w5,[sp,#8] + str w8,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + ror w7,w2,#7 + and w17,w20,w27 + ror w6,w15,#17 + bic w28,w21,w27 + ror w8,w23,#2 + add w22,w22,w0 // h+=X[i] + eor w16,w16,w27,ror#11 + eor w7,w7,w2,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w27,ror#25 // Sigma1(e) + eor w8,w8,w23,ror#13 + add w22,w22,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w6,w6,w15,ror#19 + eor w7,w7,w2,lsr#3 // sigma0(X[i+1]) + add w22,w22,w16 // h+=Sigma1(e) + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w8,w23,ror#22 // Sigma0(a) + eor w6,w6,w15,lsr#10 // sigma1(X[i+14]) + add w1,w1,w10 + add w26,w26,w22 // d+=h + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w1,w1,w7 + add w22,w22,w17 // h+=Sigma0(a) + add w1,w1,w6 + ldr w6,[sp,#12] + str w9,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + ror w8,w3,#7 + and w17,w27,w26 + ror w7,w0,#17 + bic w19,w20,w26 + ror w9,w22,#2 + add w21,w21,w1 // h+=X[i] + eor w16,w16,w26,ror#11 
+ eor w8,w8,w3,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w26,ror#25 // Sigma1(e) + eor w9,w9,w22,ror#13 + add w21,w21,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w7,w7,w0,ror#19 + eor w8,w8,w3,lsr#3 // sigma0(X[i+1]) + add w21,w21,w16 // h+=Sigma1(e) + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w9,w22,ror#22 // Sigma0(a) + eor w7,w7,w0,lsr#10 // sigma1(X[i+14]) + add w2,w2,w11 + add w25,w25,w21 // d+=h + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w2,w2,w8 + add w21,w21,w17 // h+=Sigma0(a) + add w2,w2,w7 + ldr w7,[sp,#0] + str w10,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w9,w4,#7 + and w17,w26,w25 + ror w8,w1,#17 + bic w28,w27,w25 + ror w10,w21,#2 + add w20,w20,w2 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w9,w9,w4,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w10,w10,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w8,w8,w1,ror#19 + eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w10,w21,ror#22 // Sigma0(a) + eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) + add w3,w3,w12 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w3,w3,w9 + add w20,w20,w17 // h+=Sigma0(a) + add w3,w3,w8 + cbnz w19,Loop_16_xx + + ldp x0,x2,[x29,#96] + ldr x1,[x29,#112] + sub x30,x30,#260 // rewind + + ldp w3,w4,[x0] + ldp w5,w6,[x0,#2*4] + add x1,x1,#14*4 // advance input pointer + ldp w7,w8,[x0,#4*4] + add w20,w20,w3 + ldp w9,w10,[x0,#6*4] + add w21,w21,w4 + add w22,w22,w5 + add w23,w23,w6 + stp w20,w21,[x0] + add w24,w24,w7 + add w25,w25,w8 + stp w22,w23,[x0,#2*4] + add w26,w26,w9 + add w27,w27,w10 + cmp x1,x2 + stp w24,w25,[x0,#4*4] + stp w26,w27,[x0,#6*4] + b.ne Loop + + ldp x19,x20,[x29,#16] + add sp,sp,#4*4 + ldp 
x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +.section __TEXT,__const +.align 6 + +LK256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0 //terminator + +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +.text +#ifndef __KERNEL__ +.globl _sha256_block_data_order_hw +.private_extern _sha256_block_data_order_hw + +.align 6 +_sha256_block_data_order_hw: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + ld1 {v0.4s,v1.4s},[x0] + adrp x3,LK256@PAGE + add x3,x3,LK256@PAGEOFF + +Loop_hw: + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + sub x2,x2,#1 + ld1 {v16.4s},[x3],#16 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + rev32 v6.16b,v6.16b + rev32 v7.16b,v7.16b + orr v18.16b,v0.16b,v0.16b // offload + orr v19.16b,v1.16b,v1.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.4s,v5.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h q0,q1,v16.4s +.long 0x5e105041 //sha256h2 q1,q2,v16.4s +.long 0x5e0760c4 //sha256su1 v4.4s,v6.4s,v7.4s + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.4s,v6.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h q0,q1,v17.4s +.long 0x5e115041 //sha256h2 q1,q2,v17.4s +.long 0x5e0460e5 //sha256su1 v5.4s,v7.4s,v4.4s + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.4s,v7.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h q0,q1,v16.4s +.long 0x5e105041 //sha256h2 q1,q2,v16.4s +.long 0x5e056086 //sha256su1 v6.4s,v4.4s,v5.4s + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.4s,v4.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h q0,q1,v17.4s +.long 0x5e115041 //sha256h2 q1,q2,v17.4s +.long 0x5e0660a7 //sha256su1 v7.4s,v5.4s,v6.4s + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.4s,v5.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h q0,q1,v16.4s +.long 0x5e105041 //sha256h2 q1,q2,v16.4s +.long 0x5e0760c4 //sha256su1 v4.4s,v6.4s,v7.4s + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.4s,v6.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h q0,q1,v17.4s +.long 0x5e115041 //sha256h2 q1,q2,v17.4s +.long 0x5e0460e5 //sha256su1 v5.4s,v7.4s,v4.4s + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.4s,v7.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h 
q0,q1,v16.4s +.long 0x5e105041 //sha256h2 q1,q2,v16.4s +.long 0x5e056086 //sha256su1 v6.4s,v4.4s,v5.4s + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.4s,v4.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h q0,q1,v17.4s +.long 0x5e115041 //sha256h2 q1,q2,v17.4s +.long 0x5e0660a7 //sha256su1 v7.4s,v5.4s,v6.4s + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.4s,v5.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h q0,q1,v16.4s +.long 0x5e105041 //sha256h2 q1,q2,v16.4s +.long 0x5e0760c4 //sha256su1 v4.4s,v6.4s,v7.4s + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.4s,v6.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h q0,q1,v17.4s +.long 0x5e115041 //sha256h2 q1,q2,v17.4s +.long 0x5e0460e5 //sha256su1 v5.4s,v7.4s,v4.4s + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.4s,v7.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h q0,q1,v16.4s +.long 0x5e105041 //sha256h2 q1,q2,v16.4s +.long 0x5e056086 //sha256su1 v6.4s,v4.4s,v5.4s + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.4s,v4.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h q0,q1,v17.4s +.long 0x5e115041 //sha256h2 q1,q2,v17.4s +.long 0x5e0660a7 //sha256su1 v7.4s,v5.4s,v6.4s + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h q0,q1,v16.4s +.long 0x5e105041 //sha256h2 q1,q2,v16.4s + + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h q0,q1,v17.4s +.long 0x5e115041 //sha256h2 q1,q2,v17.4s + + ld1 {v17.4s},[x3] + add v16.4s,v16.4s,v6.4s + sub x3,x3,#64*4-16 // rewind + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h q0,q1,v16.4s +.long 0x5e105041 //sha256h2 q1,q2,v16.4s + + add v17.4s,v17.4s,v7.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h q0,q1,v17.4s +.long 0x5e115041 //sha256h2 
q1,q2,v17.4s + + add v0.4s,v0.4s,v18.4s + add v1.4s,v1.4s,v19.4s + + cbnz x2,Loop_hw + + st1 {v0.4s,v1.4s},[x0] + + ldr x29,[sp],#16 + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) diff --git a/third_party/boringssl/gen/bcm/sha256-armv8-linux.S b/third_party/boringssl/gen/bcm/sha256-armv8-linux.S new file mode 100644 index 00000000..d2901be0 --- /dev/null +++ b/third_party/boringssl/gen/bcm/sha256-armv8-linux.S @@ -0,0 +1,1192 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include <openssl/asm_base.h> + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) +// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// ==================================================================== +// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +// project. +// ==================================================================== +// +// SHA256/512 for ARMv8. 
+// +// Performance in cycles per processed byte and improvement coefficient +// over code generated with "default" compiler: +// +// SHA256-hw SHA256(*) SHA512 +// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) +// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +// Denver 2.01 10.5 (+26%) 6.70 (+8%) +// X-Gene 20.0 (+100%) 12.8 (+300%(***)) +// Mongoose 2.36 13.0 (+50%) 8.36 (+33%) +// Kryo 1.92 17.4 (+30%) 11.2 (+8%) +// +// (*) Software SHA256 results are of lesser relevance, presented +// mostly for informational purposes. +// (**) The result is a trade-off: it's possible to improve it by +// 10% (or by 1 cycle per round), but at the cost of 20% loss +// on Cortex-A53 (or by 4 cycles per round). +// (***) Super-impressive coefficients over gcc-generated code are +// indication of some compiler "pathology", most notably code +// generated with -mgeneral-regs-only is significantly faster +// and the gap is only 40-90%. + +.text + +.globl sha256_block_data_order_nohw +.hidden sha256_block_data_order_nohw +.type sha256_block_data_order_nohw,%function +.align 6 +sha256_block_data_order_nohw: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*4 + + ldp w20,w21,[x0] // load context + ldp w22,w23,[x0,#2*4] + ldp w24,w25,[x0,#4*4] + add x2,x1,x2,lsl#6 // end of input + ldp w26,w27,[x0,#6*4] + adrp x30,.LK256 + add x30,x30,:lo12:.LK256 + stp x0,x2,[x29,#96] + +.Loop: + ldp w3,w4,[x1],#2*4 + ldr w19,[x30],#4 // *K++ + eor w28,w21,w22 // magic seed + str x1,[x29,#112] +#ifndef __AARCH64EB__ + rev w3,w3 // 0 +#endif + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + eor w6,w24,w24,ror#14 + and w17,w25,w24 + bic w19,w26,w24 + add w27,w27,w3 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w6,ror#11 // Sigma1(e) + ror w6,w20,#2 + add w27,w27,w17 // h+=Ch(e,f,g) + eor w17,w20,w20,ror#9 + add w27,w27,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w23,w23,w27 // d+=h + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w6,w17,ror#13 // Sigma0(a) + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w27,w27,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w4,w4 // 1 +#endif + ldp w5,w6,[x1],#2*4 + add w27,w27,w17 // h+=Sigma0(a) + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + eor w7,w23,w23,ror#14 + and w17,w24,w23 + bic w28,w25,w23 + add w26,w26,w4 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w7,ror#11 // Sigma1(e) + ror w7,w27,#2 + add w26,w26,w17 // h+=Ch(e,f,g) + eor w17,w27,w27,ror#9 + add w26,w26,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w22,w22,w26 // d+=h + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w7,w17,ror#13 // Sigma0(a) + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w26,w26,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w5,w5 // 2 +#endif + add w26,w26,w17 // h+=Sigma0(a) + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + eor w8,w22,w22,ror#14 + and w17,w23,w22 + 
bic w19,w24,w22 + add w25,w25,w5 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w8,ror#11 // Sigma1(e) + ror w8,w26,#2 + add w25,w25,w17 // h+=Ch(e,f,g) + eor w17,w26,w26,ror#9 + add w25,w25,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w21,w21,w25 // d+=h + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w8,w17,ror#13 // Sigma0(a) + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w25,w25,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w6,w6 // 3 +#endif + ldp w7,w8,[x1],#2*4 + add w25,w25,w17 // h+=Sigma0(a) + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + eor w9,w21,w21,ror#14 + and w17,w22,w21 + bic w28,w23,w21 + add w24,w24,w6 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w9,ror#11 // Sigma1(e) + ror w9,w25,#2 + add w24,w24,w17 // h+=Ch(e,f,g) + eor w17,w25,w25,ror#9 + add w24,w24,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w20,w20,w24 // d+=h + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w9,w17,ror#13 // Sigma0(a) + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w24,w24,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w7,w7 // 4 +#endif + add w24,w24,w17 // h+=Sigma0(a) + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + eor w10,w20,w20,ror#14 + and w17,w21,w20 + bic w19,w22,w20 + add w23,w23,w7 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w10,ror#11 // Sigma1(e) + ror w10,w24,#2 + add w23,w23,w17 // h+=Ch(e,f,g) + eor w17,w24,w24,ror#9 + add w23,w23,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w27,w27,w23 // d+=h + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w10,w17,ror#13 // Sigma0(a) + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w23,w23,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w8,w8 // 5 +#endif + ldp w9,w10,[x1],#2*4 + add w23,w23,w17 // 
h+=Sigma0(a) + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + eor w11,w27,w27,ror#14 + and w17,w20,w27 + bic w28,w21,w27 + add w22,w22,w8 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w11,ror#11 // Sigma1(e) + ror w11,w23,#2 + add w22,w22,w17 // h+=Ch(e,f,g) + eor w17,w23,w23,ror#9 + add w22,w22,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w26,w26,w22 // d+=h + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w11,w17,ror#13 // Sigma0(a) + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w22,w22,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w9,w9 // 6 +#endif + add w22,w22,w17 // h+=Sigma0(a) + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + eor w12,w26,w26,ror#14 + and w17,w27,w26 + bic w19,w20,w26 + add w21,w21,w9 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w12,ror#11 // Sigma1(e) + ror w12,w22,#2 + add w21,w21,w17 // h+=Ch(e,f,g) + eor w17,w22,w22,ror#9 + add w21,w21,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w25,w25,w21 // d+=h + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w12,w17,ror#13 // Sigma0(a) + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w21,w21,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w10,w10 // 7 +#endif + ldp w11,w12,[x1],#2*4 + add w21,w21,w17 // h+=Sigma0(a) + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + eor w13,w25,w25,ror#14 + and w17,w26,w25 + bic w28,w27,w25 + add w20,w20,w10 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w13,ror#11 // Sigma1(e) + ror w13,w21,#2 + add w20,w20,w17 // h+=Ch(e,f,g) + eor w17,w21,w21,ror#9 + add w20,w20,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w24,w24,w20 // d+=h + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w13,w17,ror#13 // Sigma0(a) + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w20,w20,w17 
// h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w11,w11 // 8 +#endif + add w20,w20,w17 // h+=Sigma0(a) + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + eor w14,w24,w24,ror#14 + and w17,w25,w24 + bic w19,w26,w24 + add w27,w27,w11 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w14,ror#11 // Sigma1(e) + ror w14,w20,#2 + add w27,w27,w17 // h+=Ch(e,f,g) + eor w17,w20,w20,ror#9 + add w27,w27,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w23,w23,w27 // d+=h + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w14,w17,ror#13 // Sigma0(a) + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w27,w27,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w12,w12 // 9 +#endif + ldp w13,w14,[x1],#2*4 + add w27,w27,w17 // h+=Sigma0(a) + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + eor w15,w23,w23,ror#14 + and w17,w24,w23 + bic w28,w25,w23 + add w26,w26,w12 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w15,ror#11 // Sigma1(e) + ror w15,w27,#2 + add w26,w26,w17 // h+=Ch(e,f,g) + eor w17,w27,w27,ror#9 + add w26,w26,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w22,w22,w26 // d+=h + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w15,w17,ror#13 // Sigma0(a) + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w26,w26,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w13,w13 // 10 +#endif + add w26,w26,w17 // h+=Sigma0(a) + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + eor w0,w22,w22,ror#14 + and w17,w23,w22 + bic w19,w24,w22 + add w25,w25,w13 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w0,ror#11 // Sigma1(e) + ror w0,w26,#2 + add w25,w25,w17 // h+=Ch(e,f,g) + eor w17,w26,w26,ror#9 + add w25,w25,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w21,w21,w25 // d+=h + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w0,w17,ror#13 // Sigma0(a) + add 
w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w25,w25,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w14,w14 // 11 +#endif + ldp w15,w0,[x1],#2*4 + add w25,w25,w17 // h+=Sigma0(a) + str w6,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + eor w6,w21,w21,ror#14 + and w17,w22,w21 + bic w28,w23,w21 + add w24,w24,w14 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w6,ror#11 // Sigma1(e) + ror w6,w25,#2 + add w24,w24,w17 // h+=Ch(e,f,g) + eor w17,w25,w25,ror#9 + add w24,w24,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w20,w20,w24 // d+=h + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w6,w17,ror#13 // Sigma0(a) + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w24,w24,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w15,w15 // 12 +#endif + add w24,w24,w17 // h+=Sigma0(a) + str w7,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + eor w7,w20,w20,ror#14 + and w17,w21,w20 + bic w19,w22,w20 + add w23,w23,w15 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w7,ror#11 // Sigma1(e) + ror w7,w24,#2 + add w23,w23,w17 // h+=Ch(e,f,g) + eor w17,w24,w24,ror#9 + add w23,w23,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w27,w27,w23 // d+=h + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w7,w17,ror#13 // Sigma0(a) + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w23,w23,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w0,w0 // 13 +#endif + ldp w1,w2,[x1] + add w23,w23,w17 // h+=Sigma0(a) + str w8,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + eor w8,w27,w27,ror#14 + and w17,w20,w27 + bic w28,w21,w27 + add w22,w22,w0 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w8,ror#11 // Sigma1(e) + ror w8,w23,#2 + add w22,w22,w17 // h+=Ch(e,f,g) + eor w17,w23,w23,ror#9 + add w22,w22,w16 // 
h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w26,w26,w22 // d+=h + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w8,w17,ror#13 // Sigma0(a) + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w22,w22,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w1,w1 // 14 +#endif + ldr w6,[sp,#12] + add w22,w22,w17 // h+=Sigma0(a) + str w9,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + eor w9,w26,w26,ror#14 + and w17,w27,w26 + bic w19,w20,w26 + add w21,w21,w1 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w9,ror#11 // Sigma1(e) + ror w9,w22,#2 + add w21,w21,w17 // h+=Ch(e,f,g) + eor w17,w22,w22,ror#9 + add w21,w21,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w25,w25,w21 // d+=h + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w9,w17,ror#13 // Sigma0(a) + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w21,w21,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w2,w2 // 15 +#endif + ldr w7,[sp,#0] + add w21,w21,w17 // h+=Sigma0(a) + str w10,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w9,w4,#7 + and w17,w26,w25 + ror w8,w1,#17 + bic w28,w27,w25 + ror w10,w21,#2 + add w20,w20,w2 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w9,w9,w4,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w10,w10,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w8,w8,w1,ror#19 + eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w10,w21,ror#22 // Sigma0(a) + eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) + add w3,w3,w12 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w3,w3,w9 + add w20,w20,w17 // h+=Sigma0(a) + add w3,w3,w8 +.Loop_16_xx: + ldr w8,[sp,#4] + str w11,[sp,#0] + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + 
ror w10,w5,#7 + and w17,w25,w24 + ror w9,w2,#17 + bic w19,w26,w24 + ror w11,w20,#2 + add w27,w27,w3 // h+=X[i] + eor w16,w16,w24,ror#11 + eor w10,w10,w5,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w24,ror#25 // Sigma1(e) + eor w11,w11,w20,ror#13 + add w27,w27,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w9,w9,w2,ror#19 + eor w10,w10,w5,lsr#3 // sigma0(X[i+1]) + add w27,w27,w16 // h+=Sigma1(e) + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w11,w20,ror#22 // Sigma0(a) + eor w9,w9,w2,lsr#10 // sigma1(X[i+14]) + add w4,w4,w13 + add w23,w23,w27 // d+=h + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w4,w4,w10 + add w27,w27,w17 // h+=Sigma0(a) + add w4,w4,w9 + ldr w9,[sp,#8] + str w12,[sp,#4] + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + ror w11,w6,#7 + and w17,w24,w23 + ror w10,w3,#17 + bic w28,w25,w23 + ror w12,w27,#2 + add w26,w26,w4 // h+=X[i] + eor w16,w16,w23,ror#11 + eor w11,w11,w6,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w23,ror#25 // Sigma1(e) + eor w12,w12,w27,ror#13 + add w26,w26,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w10,w10,w3,ror#19 + eor w11,w11,w6,lsr#3 // sigma0(X[i+1]) + add w26,w26,w16 // h+=Sigma1(e) + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w12,w27,ror#22 // Sigma0(a) + eor w10,w10,w3,lsr#10 // sigma1(X[i+14]) + add w5,w5,w14 + add w22,w22,w26 // d+=h + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w5,w5,w11 + add w26,w26,w17 // h+=Sigma0(a) + add w5,w5,w10 + ldr w10,[sp,#12] + str w13,[sp,#8] + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + ror w12,w7,#7 + and w17,w23,w22 + ror w11,w4,#17 + bic w19,w24,w22 + ror w13,w26,#2 + add w25,w25,w5 // h+=X[i] + eor w16,w16,w22,ror#11 + eor w12,w12,w7,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w22,ror#25 // Sigma1(e) + eor w13,w13,w26,ror#13 + 
add w25,w25,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w11,w11,w4,ror#19 + eor w12,w12,w7,lsr#3 // sigma0(X[i+1]) + add w25,w25,w16 // h+=Sigma1(e) + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w13,w26,ror#22 // Sigma0(a) + eor w11,w11,w4,lsr#10 // sigma1(X[i+14]) + add w6,w6,w15 + add w21,w21,w25 // d+=h + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w6,w6,w12 + add w25,w25,w17 // h+=Sigma0(a) + add w6,w6,w11 + ldr w11,[sp,#0] + str w14,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + ror w13,w8,#7 + and w17,w22,w21 + ror w12,w5,#17 + bic w28,w23,w21 + ror w14,w25,#2 + add w24,w24,w6 // h+=X[i] + eor w16,w16,w21,ror#11 + eor w13,w13,w8,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w21,ror#25 // Sigma1(e) + eor w14,w14,w25,ror#13 + add w24,w24,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w12,w12,w5,ror#19 + eor w13,w13,w8,lsr#3 // sigma0(X[i+1]) + add w24,w24,w16 // h+=Sigma1(e) + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w14,w25,ror#22 // Sigma0(a) + eor w12,w12,w5,lsr#10 // sigma1(X[i+14]) + add w7,w7,w0 + add w20,w20,w24 // d+=h + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w7,w7,w13 + add w24,w24,w17 // h+=Sigma0(a) + add w7,w7,w12 + ldr w12,[sp,#4] + str w15,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + ror w14,w9,#7 + and w17,w21,w20 + ror w13,w6,#17 + bic w19,w22,w20 + ror w15,w24,#2 + add w23,w23,w7 // h+=X[i] + eor w16,w16,w20,ror#11 + eor w14,w14,w9,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w20,ror#25 // Sigma1(e) + eor w15,w15,w24,ror#13 + add w23,w23,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w13,w13,w6,ror#19 + eor w14,w14,w9,lsr#3 // sigma0(X[i+1]) + add w23,w23,w16 // h+=Sigma1(e) + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w15,w24,ror#22 // Sigma0(a) + eor w13,w13,w6,lsr#10 // sigma1(X[i+14]) + add w8,w8,w1 + 
add w27,w27,w23 // d+=h + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w8,w8,w14 + add w23,w23,w17 // h+=Sigma0(a) + add w8,w8,w13 + ldr w13,[sp,#8] + str w0,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + ror w15,w10,#7 + and w17,w20,w27 + ror w14,w7,#17 + bic w28,w21,w27 + ror w0,w23,#2 + add w22,w22,w8 // h+=X[i] + eor w16,w16,w27,ror#11 + eor w15,w15,w10,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w27,ror#25 // Sigma1(e) + eor w0,w0,w23,ror#13 + add w22,w22,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w14,w14,w7,ror#19 + eor w15,w15,w10,lsr#3 // sigma0(X[i+1]) + add w22,w22,w16 // h+=Sigma1(e) + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w0,w23,ror#22 // Sigma0(a) + eor w14,w14,w7,lsr#10 // sigma1(X[i+14]) + add w9,w9,w2 + add w26,w26,w22 // d+=h + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w9,w9,w15 + add w22,w22,w17 // h+=Sigma0(a) + add w9,w9,w14 + ldr w14,[sp,#12] + str w1,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + ror w0,w11,#7 + and w17,w27,w26 + ror w15,w8,#17 + bic w19,w20,w26 + ror w1,w22,#2 + add w21,w21,w9 // h+=X[i] + eor w16,w16,w26,ror#11 + eor w0,w0,w11,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w26,ror#25 // Sigma1(e) + eor w1,w1,w22,ror#13 + add w21,w21,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w15,w15,w8,ror#19 + eor w0,w0,w11,lsr#3 // sigma0(X[i+1]) + add w21,w21,w16 // h+=Sigma1(e) + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w1,w22,ror#22 // Sigma0(a) + eor w15,w15,w8,lsr#10 // sigma1(X[i+14]) + add w10,w10,w3 + add w25,w25,w21 // d+=h + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w10,w10,w0 + add w21,w21,w17 // h+=Sigma0(a) + add w10,w10,w15 + ldr w15,[sp,#0] + str w2,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w1,w12,#7 + and w17,w26,w25 + ror 
w0,w9,#17 + bic w28,w27,w25 + ror w2,w21,#2 + add w20,w20,w10 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w1,w1,w12,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w2,w2,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w0,w0,w9,ror#19 + eor w1,w1,w12,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w2,w21,ror#22 // Sigma0(a) + eor w0,w0,w9,lsr#10 // sigma1(X[i+14]) + add w11,w11,w4 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w11,w11,w1 + add w20,w20,w17 // h+=Sigma0(a) + add w11,w11,w0 + ldr w0,[sp,#4] + str w3,[sp,#0] + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + ror w2,w13,#7 + and w17,w25,w24 + ror w1,w10,#17 + bic w19,w26,w24 + ror w3,w20,#2 + add w27,w27,w11 // h+=X[i] + eor w16,w16,w24,ror#11 + eor w2,w2,w13,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w24,ror#25 // Sigma1(e) + eor w3,w3,w20,ror#13 + add w27,w27,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w1,w1,w10,ror#19 + eor w2,w2,w13,lsr#3 // sigma0(X[i+1]) + add w27,w27,w16 // h+=Sigma1(e) + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w3,w20,ror#22 // Sigma0(a) + eor w1,w1,w10,lsr#10 // sigma1(X[i+14]) + add w12,w12,w5 + add w23,w23,w27 // d+=h + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w12,w12,w2 + add w27,w27,w17 // h+=Sigma0(a) + add w12,w12,w1 + ldr w1,[sp,#8] + str w4,[sp,#4] + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + ror w3,w14,#7 + and w17,w24,w23 + ror w2,w11,#17 + bic w28,w25,w23 + ror w4,w27,#2 + add w26,w26,w12 // h+=X[i] + eor w16,w16,w23,ror#11 + eor w3,w3,w14,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w23,ror#25 // Sigma1(e) + eor w4,w4,w27,ror#13 + add w26,w26,w17 // h+=Ch(e,f,g) + and w19,w19,w28 
// (b^c)&=(a^b) + eor w2,w2,w11,ror#19 + eor w3,w3,w14,lsr#3 // sigma0(X[i+1]) + add w26,w26,w16 // h+=Sigma1(e) + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w4,w27,ror#22 // Sigma0(a) + eor w2,w2,w11,lsr#10 // sigma1(X[i+14]) + add w13,w13,w6 + add w22,w22,w26 // d+=h + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w13,w13,w3 + add w26,w26,w17 // h+=Sigma0(a) + add w13,w13,w2 + ldr w2,[sp,#12] + str w5,[sp,#8] + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + ror w4,w15,#7 + and w17,w23,w22 + ror w3,w12,#17 + bic w19,w24,w22 + ror w5,w26,#2 + add w25,w25,w13 // h+=X[i] + eor w16,w16,w22,ror#11 + eor w4,w4,w15,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w22,ror#25 // Sigma1(e) + eor w5,w5,w26,ror#13 + add w25,w25,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w3,w3,w12,ror#19 + eor w4,w4,w15,lsr#3 // sigma0(X[i+1]) + add w25,w25,w16 // h+=Sigma1(e) + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w5,w26,ror#22 // Sigma0(a) + eor w3,w3,w12,lsr#10 // sigma1(X[i+14]) + add w14,w14,w7 + add w21,w21,w25 // d+=h + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w14,w14,w4 + add w25,w25,w17 // h+=Sigma0(a) + add w14,w14,w3 + ldr w3,[sp,#0] + str w6,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + ror w5,w0,#7 + and w17,w22,w21 + ror w4,w13,#17 + bic w28,w23,w21 + ror w6,w25,#2 + add w24,w24,w14 // h+=X[i] + eor w16,w16,w21,ror#11 + eor w5,w5,w0,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w21,ror#25 // Sigma1(e) + eor w6,w6,w25,ror#13 + add w24,w24,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w4,w4,w13,ror#19 + eor w5,w5,w0,lsr#3 // sigma0(X[i+1]) + add w24,w24,w16 // h+=Sigma1(e) + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w6,w25,ror#22 // Sigma0(a) + eor w4,w4,w13,lsr#10 // sigma1(X[i+14]) + add w15,w15,w8 + add w20,w20,w24 // d+=h + add w24,w24,w19 // h+=Maj(a,b,c) + ldr 
w19,[x30],#4 // *K++, w28 in next round + add w15,w15,w5 + add w24,w24,w17 // h+=Sigma0(a) + add w15,w15,w4 + ldr w4,[sp,#4] + str w7,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + ror w6,w1,#7 + and w17,w21,w20 + ror w5,w14,#17 + bic w19,w22,w20 + ror w7,w24,#2 + add w23,w23,w15 // h+=X[i] + eor w16,w16,w20,ror#11 + eor w6,w6,w1,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w20,ror#25 // Sigma1(e) + eor w7,w7,w24,ror#13 + add w23,w23,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w5,w5,w14,ror#19 + eor w6,w6,w1,lsr#3 // sigma0(X[i+1]) + add w23,w23,w16 // h+=Sigma1(e) + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w7,w24,ror#22 // Sigma0(a) + eor w5,w5,w14,lsr#10 // sigma1(X[i+14]) + add w0,w0,w9 + add w27,w27,w23 // d+=h + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w0,w0,w6 + add w23,w23,w17 // h+=Sigma0(a) + add w0,w0,w5 + ldr w5,[sp,#8] + str w8,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + ror w7,w2,#7 + and w17,w20,w27 + ror w6,w15,#17 + bic w28,w21,w27 + ror w8,w23,#2 + add w22,w22,w0 // h+=X[i] + eor w16,w16,w27,ror#11 + eor w7,w7,w2,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w27,ror#25 // Sigma1(e) + eor w8,w8,w23,ror#13 + add w22,w22,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w6,w6,w15,ror#19 + eor w7,w7,w2,lsr#3 // sigma0(X[i+1]) + add w22,w22,w16 // h+=Sigma1(e) + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w8,w23,ror#22 // Sigma0(a) + eor w6,w6,w15,lsr#10 // sigma1(X[i+14]) + add w1,w1,w10 + add w26,w26,w22 // d+=h + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w1,w1,w7 + add w22,w22,w17 // h+=Sigma0(a) + add w1,w1,w6 + ldr w6,[sp,#12] + str w9,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + ror w8,w3,#7 + and w17,w27,w26 + ror w7,w0,#17 + bic w19,w20,w26 + ror w9,w22,#2 + add w21,w21,w1 // h+=X[i] + eor w16,w16,w26,ror#11 
+ eor w8,w8,w3,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w26,ror#25 // Sigma1(e) + eor w9,w9,w22,ror#13 + add w21,w21,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w7,w7,w0,ror#19 + eor w8,w8,w3,lsr#3 // sigma0(X[i+1]) + add w21,w21,w16 // h+=Sigma1(e) + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w9,w22,ror#22 // Sigma0(a) + eor w7,w7,w0,lsr#10 // sigma1(X[i+14]) + add w2,w2,w11 + add w25,w25,w21 // d+=h + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w2,w2,w8 + add w21,w21,w17 // h+=Sigma0(a) + add w2,w2,w7 + ldr w7,[sp,#0] + str w10,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w9,w4,#7 + and w17,w26,w25 + ror w8,w1,#17 + bic w28,w27,w25 + ror w10,w21,#2 + add w20,w20,w2 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w9,w9,w4,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w10,w10,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w8,w8,w1,ror#19 + eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w10,w21,ror#22 // Sigma0(a) + eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) + add w3,w3,w12 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w3,w3,w9 + add w20,w20,w17 // h+=Sigma0(a) + add w3,w3,w8 + cbnz w19,.Loop_16_xx + + ldp x0,x2,[x29,#96] + ldr x1,[x29,#112] + sub x30,x30,#260 // rewind + + ldp w3,w4,[x0] + ldp w5,w6,[x0,#2*4] + add x1,x1,#14*4 // advance input pointer + ldp w7,w8,[x0,#4*4] + add w20,w20,w3 + ldp w9,w10,[x0,#6*4] + add w21,w21,w4 + add w22,w22,w5 + add w23,w23,w6 + stp w20,w21,[x0] + add w24,w24,w7 + add w25,w25,w8 + stp w22,w23,[x0,#2*4] + add w26,w26,w9 + add w27,w27,w10 + cmp x1,x2 + stp w24,w25,[x0,#4*4] + stp w26,w27,[x0,#6*4] + b.ne .Loop + + ldp x19,x20,[x29,#16] + add sp,sp,#4*4 + ldp 
x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size sha256_block_data_order_nohw,.-sha256_block_data_order_nohw + +.section .rodata +.align 6 +.type .LK256,%object +.LK256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0 //terminator +.size .LK256,.-.LK256 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +.text +#ifndef __KERNEL__ +.globl sha256_block_data_order_hw +.hidden sha256_block_data_order_hw +.type sha256_block_data_order_hw,%function +.align 6 +sha256_block_data_order_hw: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + ld1 {v0.4s,v1.4s},[x0] + adrp x3,.LK256 + add x3,x3,:lo12:.LK256 + +.Loop_hw: + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + sub x2,x2,#1 + ld1 {v16.4s},[x3],#16 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + rev32 v6.16b,v6.16b + rev32 v7.16b,v7.16b + orr v18.16b,v0.16b,v0.16b // offload + orr v19.16b,v1.16b,v1.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.inst 0x5e2828a4 //sha256su0 v4.4s,v5.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h q0,q1,v16.4s +.inst 0x5e105041 //sha256h2 q1,q2,v16.4s +.inst 0x5e0760c4 //sha256su1 v4.4s,v6.4s,v7.4s + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.inst 0x5e2828c5 //sha256su0 v5.4s,v6.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h q0,q1,v17.4s +.inst 0x5e115041 //sha256h2 q1,q2,v17.4s +.inst 0x5e0460e5 //sha256su1 v5.4s,v7.4s,v4.4s + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.inst 0x5e2828e6 //sha256su0 v6.4s,v7.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h q0,q1,v16.4s +.inst 0x5e105041 //sha256h2 q1,q2,v16.4s +.inst 0x5e056086 //sha256su1 v6.4s,v4.4s,v5.4s + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.inst 0x5e282887 //sha256su0 v7.4s,v4.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h q0,q1,v17.4s +.inst 0x5e115041 //sha256h2 q1,q2,v17.4s +.inst 0x5e0660a7 //sha256su1 v7.4s,v5.4s,v6.4s + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.inst 0x5e2828a4 //sha256su0 v4.4s,v5.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h q0,q1,v16.4s +.inst 0x5e105041 //sha256h2 q1,q2,v16.4s +.inst 0x5e0760c4 //sha256su1 v4.4s,v6.4s,v7.4s + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.inst 0x5e2828c5 //sha256su0 v5.4s,v6.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h q0,q1,v17.4s +.inst 0x5e115041 //sha256h2 q1,q2,v17.4s +.inst 0x5e0460e5 //sha256su1 v5.4s,v7.4s,v4.4s + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.inst 0x5e2828e6 //sha256su0 v6.4s,v7.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h q0,q1,v16.4s 
+.inst 0x5e105041 //sha256h2 q1,q2,v16.4s +.inst 0x5e056086 //sha256su1 v6.4s,v4.4s,v5.4s + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.inst 0x5e282887 //sha256su0 v7.4s,v4.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h q0,q1,v17.4s +.inst 0x5e115041 //sha256h2 q1,q2,v17.4s +.inst 0x5e0660a7 //sha256su1 v7.4s,v5.4s,v6.4s + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.inst 0x5e2828a4 //sha256su0 v4.4s,v5.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h q0,q1,v16.4s +.inst 0x5e105041 //sha256h2 q1,q2,v16.4s +.inst 0x5e0760c4 //sha256su1 v4.4s,v6.4s,v7.4s + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.inst 0x5e2828c5 //sha256su0 v5.4s,v6.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h q0,q1,v17.4s +.inst 0x5e115041 //sha256h2 q1,q2,v17.4s +.inst 0x5e0460e5 //sha256su1 v5.4s,v7.4s,v4.4s + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.inst 0x5e2828e6 //sha256su0 v6.4s,v7.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h q0,q1,v16.4s +.inst 0x5e105041 //sha256h2 q1,q2,v16.4s +.inst 0x5e056086 //sha256su1 v6.4s,v4.4s,v5.4s + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.inst 0x5e282887 //sha256su0 v7.4s,v4.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h q0,q1,v17.4s +.inst 0x5e115041 //sha256h2 q1,q2,v17.4s +.inst 0x5e0660a7 //sha256su1 v7.4s,v5.4s,v6.4s + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h q0,q1,v16.4s +.inst 0x5e105041 //sha256h2 q1,q2,v16.4s + + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h q0,q1,v17.4s +.inst 0x5e115041 //sha256h2 q1,q2,v17.4s + + ld1 {v17.4s},[x3] + add v16.4s,v16.4s,v6.4s + sub x3,x3,#64*4-16 // rewind + orr v2.16b,v0.16b,v0.16b +.inst 0x5e104020 //sha256h q0,q1,v16.4s +.inst 0x5e105041 //sha256h2 q1,q2,v16.4s + + add v17.4s,v17.4s,v7.4s + orr v2.16b,v0.16b,v0.16b +.inst 0x5e114020 //sha256h q0,q1,v17.4s +.inst 0x5e115041 //sha256h2 q1,q2,v17.4s + + add 
v0.4s,v0.4s,v18.4s + add v1.4s,v1.4s,v19.4s + + cbnz x2,.Loop_hw + + st1 {v0.4s,v1.4s},[x0] + + ldr x29,[sp],#16 + ret +.size sha256_block_data_order_hw,.-sha256_block_data_order_hw +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) diff --git a/third_party/boringssl/gen/bcm/sha256-armv8-win.S b/third_party/boringssl/gen/bcm/sha256-armv8-win.S new file mode 100644 index 00000000..c603cf8b --- /dev/null +++ b/third_party/boringssl/gen/bcm/sha256-armv8-win.S @@ -0,0 +1,1196 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include <openssl/asm_base.h> + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// ==================================================================== +// Written by Andy Polyakov <appro@openssl.org> for the OpenSSL +// project. +// ==================================================================== +// +// SHA256/512 for ARMv8. 
+// +// Performance in cycles per processed byte and improvement coefficient +// over code generated with "default" compiler: +// +// SHA256-hw SHA256(*) SHA512 +// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) +// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +// Denver 2.01 10.5 (+26%) 6.70 (+8%) +// X-Gene 20.0 (+100%) 12.8 (+300%(***)) +// Mongoose 2.36 13.0 (+50%) 8.36 (+33%) +// Kryo 1.92 17.4 (+30%) 11.2 (+8%) +// +// (*) Software SHA256 results are of lesser relevance, presented +// mostly for informational purposes. +// (**) The result is a trade-off: it's possible to improve it by +// 10% (or by 1 cycle per round), but at the cost of 20% loss +// on Cortex-A53 (or by 4 cycles per round). +// (***) Super-impressive coefficients over gcc-generated code are +// indication of some compiler "pathology", most notably code +// generated with -mgeneral-regs-only is significantly faster +// and the gap is only 40-90%. + +.text + +.globl sha256_block_data_order_nohw + +.def sha256_block_data_order_nohw + .type 32 +.endef +.align 6 +sha256_block_data_order_nohw: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*4 + + ldp w20,w21,[x0] // load context + ldp w22,w23,[x0,#2*4] + ldp w24,w25,[x0,#4*4] + add x2,x1,x2,lsl#6 // end of input + ldp w26,w27,[x0,#6*4] + adrp x30,LK256 + add x30,x30,:lo12:LK256 + stp x0,x2,[x29,#96] + +Loop: + ldp w3,w4,[x1],#2*4 + ldr w19,[x30],#4 // *K++ + eor w28,w21,w22 // magic seed + str x1,[x29,#112] +#ifndef __AARCH64EB__ + rev w3,w3 // 0 +#endif + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + eor w6,w24,w24,ror#14 + and w17,w25,w24 + bic w19,w26,w24 + add w27,w27,w3 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w6,ror#11 // Sigma1(e) + ror w6,w20,#2 + add w27,w27,w17 // h+=Ch(e,f,g) + eor w17,w20,w20,ror#9 + add w27,w27,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w23,w23,w27 // d+=h + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w6,w17,ror#13 // Sigma0(a) + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w27,w27,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w4,w4 // 1 +#endif + ldp w5,w6,[x1],#2*4 + add w27,w27,w17 // h+=Sigma0(a) + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + eor w7,w23,w23,ror#14 + and w17,w24,w23 + bic w28,w25,w23 + add w26,w26,w4 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w7,ror#11 // Sigma1(e) + ror w7,w27,#2 + add w26,w26,w17 // h+=Ch(e,f,g) + eor w17,w27,w27,ror#9 + add w26,w26,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w22,w22,w26 // d+=h + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w7,w17,ror#13 // Sigma0(a) + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w26,w26,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w5,w5 // 2 +#endif + add w26,w26,w17 // h+=Sigma0(a) + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + eor w8,w22,w22,ror#14 + and w17,w23,w22 + bic 
w19,w24,w22 + add w25,w25,w5 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w8,ror#11 // Sigma1(e) + ror w8,w26,#2 + add w25,w25,w17 // h+=Ch(e,f,g) + eor w17,w26,w26,ror#9 + add w25,w25,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w21,w21,w25 // d+=h + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w8,w17,ror#13 // Sigma0(a) + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w25,w25,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w6,w6 // 3 +#endif + ldp w7,w8,[x1],#2*4 + add w25,w25,w17 // h+=Sigma0(a) + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + eor w9,w21,w21,ror#14 + and w17,w22,w21 + bic w28,w23,w21 + add w24,w24,w6 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w9,ror#11 // Sigma1(e) + ror w9,w25,#2 + add w24,w24,w17 // h+=Ch(e,f,g) + eor w17,w25,w25,ror#9 + add w24,w24,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w20,w20,w24 // d+=h + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w9,w17,ror#13 // Sigma0(a) + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w24,w24,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w7,w7 // 4 +#endif + add w24,w24,w17 // h+=Sigma0(a) + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + eor w10,w20,w20,ror#14 + and w17,w21,w20 + bic w19,w22,w20 + add w23,w23,w7 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w10,ror#11 // Sigma1(e) + ror w10,w24,#2 + add w23,w23,w17 // h+=Ch(e,f,g) + eor w17,w24,w24,ror#9 + add w23,w23,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w27,w27,w23 // d+=h + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w10,w17,ror#13 // Sigma0(a) + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w23,w23,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w8,w8 // 5 +#endif + ldp w9,w10,[x1],#2*4 + add w23,w23,w17 // 
h+=Sigma0(a) + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + eor w11,w27,w27,ror#14 + and w17,w20,w27 + bic w28,w21,w27 + add w22,w22,w8 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w11,ror#11 // Sigma1(e) + ror w11,w23,#2 + add w22,w22,w17 // h+=Ch(e,f,g) + eor w17,w23,w23,ror#9 + add w22,w22,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w26,w26,w22 // d+=h + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w11,w17,ror#13 // Sigma0(a) + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w22,w22,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w9,w9 // 6 +#endif + add w22,w22,w17 // h+=Sigma0(a) + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + eor w12,w26,w26,ror#14 + and w17,w27,w26 + bic w19,w20,w26 + add w21,w21,w9 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w12,ror#11 // Sigma1(e) + ror w12,w22,#2 + add w21,w21,w17 // h+=Ch(e,f,g) + eor w17,w22,w22,ror#9 + add w21,w21,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w25,w25,w21 // d+=h + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w12,w17,ror#13 // Sigma0(a) + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w21,w21,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w10,w10 // 7 +#endif + ldp w11,w12,[x1],#2*4 + add w21,w21,w17 // h+=Sigma0(a) + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + eor w13,w25,w25,ror#14 + and w17,w26,w25 + bic w28,w27,w25 + add w20,w20,w10 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w13,ror#11 // Sigma1(e) + ror w13,w21,#2 + add w20,w20,w17 // h+=Ch(e,f,g) + eor w17,w21,w21,ror#9 + add w20,w20,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w24,w24,w20 // d+=h + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w13,w17,ror#13 // Sigma0(a) + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w20,w20,w17 
// h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w11,w11 // 8 +#endif + add w20,w20,w17 // h+=Sigma0(a) + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + eor w14,w24,w24,ror#14 + and w17,w25,w24 + bic w19,w26,w24 + add w27,w27,w11 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w14,ror#11 // Sigma1(e) + ror w14,w20,#2 + add w27,w27,w17 // h+=Ch(e,f,g) + eor w17,w20,w20,ror#9 + add w27,w27,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w23,w23,w27 // d+=h + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w14,w17,ror#13 // Sigma0(a) + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w27,w27,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w12,w12 // 9 +#endif + ldp w13,w14,[x1],#2*4 + add w27,w27,w17 // h+=Sigma0(a) + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + eor w15,w23,w23,ror#14 + and w17,w24,w23 + bic w28,w25,w23 + add w26,w26,w12 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w15,ror#11 // Sigma1(e) + ror w15,w27,#2 + add w26,w26,w17 // h+=Ch(e,f,g) + eor w17,w27,w27,ror#9 + add w26,w26,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w22,w22,w26 // d+=h + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w15,w17,ror#13 // Sigma0(a) + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w26,w26,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w13,w13 // 10 +#endif + add w26,w26,w17 // h+=Sigma0(a) + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + eor w0,w22,w22,ror#14 + and w17,w23,w22 + bic w19,w24,w22 + add w25,w25,w13 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w0,ror#11 // Sigma1(e) + ror w0,w26,#2 + add w25,w25,w17 // h+=Ch(e,f,g) + eor w17,w26,w26,ror#9 + add w25,w25,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w21,w21,w25 // d+=h + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w0,w17,ror#13 // Sigma0(a) + add 
w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w25,w25,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w14,w14 // 11 +#endif + ldp w15,w0,[x1],#2*4 + add w25,w25,w17 // h+=Sigma0(a) + str w6,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + eor w6,w21,w21,ror#14 + and w17,w22,w21 + bic w28,w23,w21 + add w24,w24,w14 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w6,ror#11 // Sigma1(e) + ror w6,w25,#2 + add w24,w24,w17 // h+=Ch(e,f,g) + eor w17,w25,w25,ror#9 + add w24,w24,w16 // h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w20,w20,w24 // d+=h + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w6,w17,ror#13 // Sigma0(a) + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w24,w24,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w15,w15 // 12 +#endif + add w24,w24,w17 // h+=Sigma0(a) + str w7,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + eor w7,w20,w20,ror#14 + and w17,w21,w20 + bic w19,w22,w20 + add w23,w23,w15 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w7,ror#11 // Sigma1(e) + ror w7,w24,#2 + add w23,w23,w17 // h+=Ch(e,f,g) + eor w17,w24,w24,ror#9 + add w23,w23,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w27,w27,w23 // d+=h + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w7,w17,ror#13 // Sigma0(a) + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w23,w23,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w0,w0 // 13 +#endif + ldp w1,w2,[x1] + add w23,w23,w17 // h+=Sigma0(a) + str w8,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + eor w8,w27,w27,ror#14 + and w17,w20,w27 + bic w28,w21,w27 + add w22,w22,w0 // h+=X[i] + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w8,ror#11 // Sigma1(e) + ror w8,w23,#2 + add w22,w22,w17 // h+=Ch(e,f,g) + eor w17,w23,w23,ror#9 + add w22,w22,w16 // 
h+=Sigma1(e) + and w19,w19,w28 // (b^c)&=(a^b) + add w26,w26,w22 // d+=h + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w8,w17,ror#13 // Sigma0(a) + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + //add w22,w22,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w1,w1 // 14 +#endif + ldr w6,[sp,#12] + add w22,w22,w17 // h+=Sigma0(a) + str w9,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + eor w9,w26,w26,ror#14 + and w17,w27,w26 + bic w19,w20,w26 + add w21,w21,w1 // h+=X[i] + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w9,ror#11 // Sigma1(e) + ror w9,w22,#2 + add w21,w21,w17 // h+=Ch(e,f,g) + eor w17,w22,w22,ror#9 + add w21,w21,w16 // h+=Sigma1(e) + and w28,w28,w19 // (b^c)&=(a^b) + add w25,w25,w21 // d+=h + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w9,w17,ror#13 // Sigma0(a) + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + //add w21,w21,w17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev w2,w2 // 15 +#endif + ldr w7,[sp,#0] + add w21,w21,w17 // h+=Sigma0(a) + str w10,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w9,w4,#7 + and w17,w26,w25 + ror w8,w1,#17 + bic w28,w27,w25 + ror w10,w21,#2 + add w20,w20,w2 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w9,w9,w4,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w10,w10,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w8,w8,w1,ror#19 + eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w10,w21,ror#22 // Sigma0(a) + eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) + add w3,w3,w12 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w3,w3,w9 + add w20,w20,w17 // h+=Sigma0(a) + add w3,w3,w8 +Loop_16_xx: + ldr w8,[sp,#4] + str w11,[sp,#0] + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + ror 
w10,w5,#7 + and w17,w25,w24 + ror w9,w2,#17 + bic w19,w26,w24 + ror w11,w20,#2 + add w27,w27,w3 // h+=X[i] + eor w16,w16,w24,ror#11 + eor w10,w10,w5,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w24,ror#25 // Sigma1(e) + eor w11,w11,w20,ror#13 + add w27,w27,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w9,w9,w2,ror#19 + eor w10,w10,w5,lsr#3 // sigma0(X[i+1]) + add w27,w27,w16 // h+=Sigma1(e) + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w11,w20,ror#22 // Sigma0(a) + eor w9,w9,w2,lsr#10 // sigma1(X[i+14]) + add w4,w4,w13 + add w23,w23,w27 // d+=h + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w4,w4,w10 + add w27,w27,w17 // h+=Sigma0(a) + add w4,w4,w9 + ldr w9,[sp,#8] + str w12,[sp,#4] + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + ror w11,w6,#7 + and w17,w24,w23 + ror w10,w3,#17 + bic w28,w25,w23 + ror w12,w27,#2 + add w26,w26,w4 // h+=X[i] + eor w16,w16,w23,ror#11 + eor w11,w11,w6,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w23,ror#25 // Sigma1(e) + eor w12,w12,w27,ror#13 + add w26,w26,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w10,w10,w3,ror#19 + eor w11,w11,w6,lsr#3 // sigma0(X[i+1]) + add w26,w26,w16 // h+=Sigma1(e) + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w12,w27,ror#22 // Sigma0(a) + eor w10,w10,w3,lsr#10 // sigma1(X[i+14]) + add w5,w5,w14 + add w22,w22,w26 // d+=h + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w5,w5,w11 + add w26,w26,w17 // h+=Sigma0(a) + add w5,w5,w10 + ldr w10,[sp,#12] + str w13,[sp,#8] + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + ror w12,w7,#7 + and w17,w23,w22 + ror w11,w4,#17 + bic w19,w24,w22 + ror w13,w26,#2 + add w25,w25,w5 // h+=X[i] + eor w16,w16,w22,ror#11 + eor w12,w12,w7,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w22,ror#25 // Sigma1(e) + eor w13,w13,w26,ror#13 + add 
w25,w25,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w11,w11,w4,ror#19 + eor w12,w12,w7,lsr#3 // sigma0(X[i+1]) + add w25,w25,w16 // h+=Sigma1(e) + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w13,w26,ror#22 // Sigma0(a) + eor w11,w11,w4,lsr#10 // sigma1(X[i+14]) + add w6,w6,w15 + add w21,w21,w25 // d+=h + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w6,w6,w12 + add w25,w25,w17 // h+=Sigma0(a) + add w6,w6,w11 + ldr w11,[sp,#0] + str w14,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + ror w13,w8,#7 + and w17,w22,w21 + ror w12,w5,#17 + bic w28,w23,w21 + ror w14,w25,#2 + add w24,w24,w6 // h+=X[i] + eor w16,w16,w21,ror#11 + eor w13,w13,w8,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w21,ror#25 // Sigma1(e) + eor w14,w14,w25,ror#13 + add w24,w24,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w12,w12,w5,ror#19 + eor w13,w13,w8,lsr#3 // sigma0(X[i+1]) + add w24,w24,w16 // h+=Sigma1(e) + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w14,w25,ror#22 // Sigma0(a) + eor w12,w12,w5,lsr#10 // sigma1(X[i+14]) + add w7,w7,w0 + add w20,w20,w24 // d+=h + add w24,w24,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w7,w7,w13 + add w24,w24,w17 // h+=Sigma0(a) + add w7,w7,w12 + ldr w12,[sp,#4] + str w15,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + ror w14,w9,#7 + and w17,w21,w20 + ror w13,w6,#17 + bic w19,w22,w20 + ror w15,w24,#2 + add w23,w23,w7 // h+=X[i] + eor w16,w16,w20,ror#11 + eor w14,w14,w9,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w20,ror#25 // Sigma1(e) + eor w15,w15,w24,ror#13 + add w23,w23,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w13,w13,w6,ror#19 + eor w14,w14,w9,lsr#3 // sigma0(X[i+1]) + add w23,w23,w16 // h+=Sigma1(e) + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w15,w24,ror#22 // Sigma0(a) + eor w13,w13,w6,lsr#10 // sigma1(X[i+14]) + add w8,w8,w1 + add 
w27,w27,w23 // d+=h + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w8,w8,w14 + add w23,w23,w17 // h+=Sigma0(a) + add w8,w8,w13 + ldr w13,[sp,#8] + str w0,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + ror w15,w10,#7 + and w17,w20,w27 + ror w14,w7,#17 + bic w28,w21,w27 + ror w0,w23,#2 + add w22,w22,w8 // h+=X[i] + eor w16,w16,w27,ror#11 + eor w15,w15,w10,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w27,ror#25 // Sigma1(e) + eor w0,w0,w23,ror#13 + add w22,w22,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w14,w14,w7,ror#19 + eor w15,w15,w10,lsr#3 // sigma0(X[i+1]) + add w22,w22,w16 // h+=Sigma1(e) + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w0,w23,ror#22 // Sigma0(a) + eor w14,w14,w7,lsr#10 // sigma1(X[i+14]) + add w9,w9,w2 + add w26,w26,w22 // d+=h + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w9,w9,w15 + add w22,w22,w17 // h+=Sigma0(a) + add w9,w9,w14 + ldr w14,[sp,#12] + str w1,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + ror w0,w11,#7 + and w17,w27,w26 + ror w15,w8,#17 + bic w19,w20,w26 + ror w1,w22,#2 + add w21,w21,w9 // h+=X[i] + eor w16,w16,w26,ror#11 + eor w0,w0,w11,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w26,ror#25 // Sigma1(e) + eor w1,w1,w22,ror#13 + add w21,w21,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w15,w15,w8,ror#19 + eor w0,w0,w11,lsr#3 // sigma0(X[i+1]) + add w21,w21,w16 // h+=Sigma1(e) + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w1,w22,ror#22 // Sigma0(a) + eor w15,w15,w8,lsr#10 // sigma1(X[i+14]) + add w10,w10,w3 + add w25,w25,w21 // d+=h + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w10,w10,w0 + add w21,w21,w17 // h+=Sigma0(a) + add w10,w10,w15 + ldr w15,[sp,#0] + str w2,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w1,w12,#7 + and w17,w26,w25 + ror w0,w9,#17 + 
bic w28,w27,w25 + ror w2,w21,#2 + add w20,w20,w10 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w1,w1,w12,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w2,w2,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w0,w0,w9,ror#19 + eor w1,w1,w12,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w2,w21,ror#22 // Sigma0(a) + eor w0,w0,w9,lsr#10 // sigma1(X[i+14]) + add w11,w11,w4 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w11,w11,w1 + add w20,w20,w17 // h+=Sigma0(a) + add w11,w11,w0 + ldr w0,[sp,#4] + str w3,[sp,#0] + ror w16,w24,#6 + add w27,w27,w19 // h+=K[i] + ror w2,w13,#7 + and w17,w25,w24 + ror w1,w10,#17 + bic w19,w26,w24 + ror w3,w20,#2 + add w27,w27,w11 // h+=X[i] + eor w16,w16,w24,ror#11 + eor w2,w2,w13,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w20,w21 // a^b, b^c in next round + eor w16,w16,w24,ror#25 // Sigma1(e) + eor w3,w3,w20,ror#13 + add w27,w27,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w1,w1,w10,ror#19 + eor w2,w2,w13,lsr#3 // sigma0(X[i+1]) + add w27,w27,w16 // h+=Sigma1(e) + eor w28,w28,w21 // Maj(a,b,c) + eor w17,w3,w20,ror#22 // Sigma0(a) + eor w1,w1,w10,lsr#10 // sigma1(X[i+14]) + add w12,w12,w5 + add w23,w23,w27 // d+=h + add w27,w27,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w12,w12,w2 + add w27,w27,w17 // h+=Sigma0(a) + add w12,w12,w1 + ldr w1,[sp,#8] + str w4,[sp,#4] + ror w16,w23,#6 + add w26,w26,w28 // h+=K[i] + ror w3,w14,#7 + and w17,w24,w23 + ror w2,w11,#17 + bic w28,w25,w23 + ror w4,w27,#2 + add w26,w26,w12 // h+=X[i] + eor w16,w16,w23,ror#11 + eor w3,w3,w14,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w27,w20 // a^b, b^c in next round + eor w16,w16,w23,ror#25 // Sigma1(e) + eor w4,w4,w27,ror#13 + add w26,w26,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // 
(b^c)&=(a^b) + eor w2,w2,w11,ror#19 + eor w3,w3,w14,lsr#3 // sigma0(X[i+1]) + add w26,w26,w16 // h+=Sigma1(e) + eor w19,w19,w20 // Maj(a,b,c) + eor w17,w4,w27,ror#22 // Sigma0(a) + eor w2,w2,w11,lsr#10 // sigma1(X[i+14]) + add w13,w13,w6 + add w22,w22,w26 // d+=h + add w26,w26,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w13,w13,w3 + add w26,w26,w17 // h+=Sigma0(a) + add w13,w13,w2 + ldr w2,[sp,#12] + str w5,[sp,#8] + ror w16,w22,#6 + add w25,w25,w19 // h+=K[i] + ror w4,w15,#7 + and w17,w23,w22 + ror w3,w12,#17 + bic w19,w24,w22 + ror w5,w26,#2 + add w25,w25,w13 // h+=X[i] + eor w16,w16,w22,ror#11 + eor w4,w4,w15,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w26,w27 // a^b, b^c in next round + eor w16,w16,w22,ror#25 // Sigma1(e) + eor w5,w5,w26,ror#13 + add w25,w25,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w3,w3,w12,ror#19 + eor w4,w4,w15,lsr#3 // sigma0(X[i+1]) + add w25,w25,w16 // h+=Sigma1(e) + eor w28,w28,w27 // Maj(a,b,c) + eor w17,w5,w26,ror#22 // Sigma0(a) + eor w3,w3,w12,lsr#10 // sigma1(X[i+14]) + add w14,w14,w7 + add w21,w21,w25 // d+=h + add w25,w25,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w14,w14,w4 + add w25,w25,w17 // h+=Sigma0(a) + add w14,w14,w3 + ldr w3,[sp,#0] + str w6,[sp,#12] + ror w16,w21,#6 + add w24,w24,w28 // h+=K[i] + ror w5,w0,#7 + and w17,w22,w21 + ror w4,w13,#17 + bic w28,w23,w21 + ror w6,w25,#2 + add w24,w24,w14 // h+=X[i] + eor w16,w16,w21,ror#11 + eor w5,w5,w0,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w25,w26 // a^b, b^c in next round + eor w16,w16,w21,ror#25 // Sigma1(e) + eor w6,w6,w25,ror#13 + add w24,w24,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w4,w4,w13,ror#19 + eor w5,w5,w0,lsr#3 // sigma0(X[i+1]) + add w24,w24,w16 // h+=Sigma1(e) + eor w19,w19,w26 // Maj(a,b,c) + eor w17,w6,w25,ror#22 // Sigma0(a) + eor w4,w4,w13,lsr#10 // sigma1(X[i+14]) + add w15,w15,w8 + add w20,w20,w24 // d+=h + add w24,w24,w19 // h+=Maj(a,b,c) + ldr 
w19,[x30],#4 // *K++, w28 in next round + add w15,w15,w5 + add w24,w24,w17 // h+=Sigma0(a) + add w15,w15,w4 + ldr w4,[sp,#4] + str w7,[sp,#0] + ror w16,w20,#6 + add w23,w23,w19 // h+=K[i] + ror w6,w1,#7 + and w17,w21,w20 + ror w5,w14,#17 + bic w19,w22,w20 + ror w7,w24,#2 + add w23,w23,w15 // h+=X[i] + eor w16,w16,w20,ror#11 + eor w6,w6,w1,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w24,w25 // a^b, b^c in next round + eor w16,w16,w20,ror#25 // Sigma1(e) + eor w7,w7,w24,ror#13 + add w23,w23,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w5,w5,w14,ror#19 + eor w6,w6,w1,lsr#3 // sigma0(X[i+1]) + add w23,w23,w16 // h+=Sigma1(e) + eor w28,w28,w25 // Maj(a,b,c) + eor w17,w7,w24,ror#22 // Sigma0(a) + eor w5,w5,w14,lsr#10 // sigma1(X[i+14]) + add w0,w0,w9 + add w27,w27,w23 // d+=h + add w23,w23,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w0,w0,w6 + add w23,w23,w17 // h+=Sigma0(a) + add w0,w0,w5 + ldr w5,[sp,#8] + str w8,[sp,#4] + ror w16,w27,#6 + add w22,w22,w28 // h+=K[i] + ror w7,w2,#7 + and w17,w20,w27 + ror w6,w15,#17 + bic w28,w21,w27 + ror w8,w23,#2 + add w22,w22,w0 // h+=X[i] + eor w16,w16,w27,ror#11 + eor w7,w7,w2,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w23,w24 // a^b, b^c in next round + eor w16,w16,w27,ror#25 // Sigma1(e) + eor w8,w8,w23,ror#13 + add w22,w22,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w6,w6,w15,ror#19 + eor w7,w7,w2,lsr#3 // sigma0(X[i+1]) + add w22,w22,w16 // h+=Sigma1(e) + eor w19,w19,w24 // Maj(a,b,c) + eor w17,w8,w23,ror#22 // Sigma0(a) + eor w6,w6,w15,lsr#10 // sigma1(X[i+14]) + add w1,w1,w10 + add w26,w26,w22 // d+=h + add w22,w22,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w1,w1,w7 + add w22,w22,w17 // h+=Sigma0(a) + add w1,w1,w6 + ldr w6,[sp,#12] + str w9,[sp,#8] + ror w16,w26,#6 + add w21,w21,w19 // h+=K[i] + ror w8,w3,#7 + and w17,w27,w26 + ror w7,w0,#17 + bic w19,w20,w26 + ror w9,w22,#2 + add w21,w21,w1 // h+=X[i] + eor w16,w16,w26,ror#11 
+ eor w8,w8,w3,ror#18 + orr w17,w17,w19 // Ch(e,f,g) + eor w19,w22,w23 // a^b, b^c in next round + eor w16,w16,w26,ror#25 // Sigma1(e) + eor w9,w9,w22,ror#13 + add w21,w21,w17 // h+=Ch(e,f,g) + and w28,w28,w19 // (b^c)&=(a^b) + eor w7,w7,w0,ror#19 + eor w8,w8,w3,lsr#3 // sigma0(X[i+1]) + add w21,w21,w16 // h+=Sigma1(e) + eor w28,w28,w23 // Maj(a,b,c) + eor w17,w9,w22,ror#22 // Sigma0(a) + eor w7,w7,w0,lsr#10 // sigma1(X[i+14]) + add w2,w2,w11 + add w25,w25,w21 // d+=h + add w21,w21,w28 // h+=Maj(a,b,c) + ldr w28,[x30],#4 // *K++, w19 in next round + add w2,w2,w8 + add w21,w21,w17 // h+=Sigma0(a) + add w2,w2,w7 + ldr w7,[sp,#0] + str w10,[sp,#12] + ror w16,w25,#6 + add w20,w20,w28 // h+=K[i] + ror w9,w4,#7 + and w17,w26,w25 + ror w8,w1,#17 + bic w28,w27,w25 + ror w10,w21,#2 + add w20,w20,w2 // h+=X[i] + eor w16,w16,w25,ror#11 + eor w9,w9,w4,ror#18 + orr w17,w17,w28 // Ch(e,f,g) + eor w28,w21,w22 // a^b, b^c in next round + eor w16,w16,w25,ror#25 // Sigma1(e) + eor w10,w10,w21,ror#13 + add w20,w20,w17 // h+=Ch(e,f,g) + and w19,w19,w28 // (b^c)&=(a^b) + eor w8,w8,w1,ror#19 + eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) + add w20,w20,w16 // h+=Sigma1(e) + eor w19,w19,w22 // Maj(a,b,c) + eor w17,w10,w21,ror#22 // Sigma0(a) + eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) + add w3,w3,w12 + add w24,w24,w20 // d+=h + add w20,w20,w19 // h+=Maj(a,b,c) + ldr w19,[x30],#4 // *K++, w28 in next round + add w3,w3,w9 + add w20,w20,w17 // h+=Sigma0(a) + add w3,w3,w8 + cbnz w19,Loop_16_xx + + ldp x0,x2,[x29,#96] + ldr x1,[x29,#112] + sub x30,x30,#260 // rewind + + ldp w3,w4,[x0] + ldp w5,w6,[x0,#2*4] + add x1,x1,#14*4 // advance input pointer + ldp w7,w8,[x0,#4*4] + add w20,w20,w3 + ldp w9,w10,[x0,#6*4] + add w21,w21,w4 + add w22,w22,w5 + add w23,w23,w6 + stp w20,w21,[x0] + add w24,w24,w7 + add w25,w25,w8 + stp w22,w23,[x0,#2*4] + add w26,w26,w9 + add w27,w27,w10 + cmp x1,x2 + stp w24,w25,[x0,#4*4] + stp w26,w27,[x0,#6*4] + b.ne Loop + + ldp x19,x20,[x29,#16] + add sp,sp,#4*4 + ldp 
x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +.section .rodata +.align 6 + +LK256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0 //terminator + +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +.text +#ifndef __KERNEL__ +.globl sha256_block_data_order_hw + +.def sha256_block_data_order_hw + .type 32 +.endef +.align 6 +sha256_block_data_order_hw: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + ld1 {v0.4s,v1.4s},[x0] + adrp x3,LK256 + add x3,x3,:lo12:LK256 + +Loop_hw: + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + sub x2,x2,#1 + ld1 {v16.4s},[x3],#16 + rev32 v4.16b,v4.16b + rev32 v5.16b,v5.16b + rev32 v6.16b,v6.16b + rev32 v7.16b,v7.16b + orr v18.16b,v0.16b,v0.16b // offload + orr v19.16b,v1.16b,v1.16b + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.4s,v5.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h q0,q1,v16.4s +.long 0x5e105041 //sha256h2 q1,q2,v16.4s +.long 0x5e0760c4 //sha256su1 v4.4s,v6.4s,v7.4s + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.4s,v6.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h q0,q1,v17.4s +.long 0x5e115041 //sha256h2 q1,q2,v17.4s +.long 0x5e0460e5 //sha256su1 v5.4s,v7.4s,v4.4s + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.4s,v7.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h q0,q1,v16.4s +.long 0x5e105041 //sha256h2 q1,q2,v16.4s +.long 0x5e056086 //sha256su1 v6.4s,v4.4s,v5.4s + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.4s,v4.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h q0,q1,v17.4s +.long 0x5e115041 //sha256h2 q1,q2,v17.4s +.long 0x5e0660a7 //sha256su1 v7.4s,v5.4s,v6.4s + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.4s,v5.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h q0,q1,v16.4s +.long 0x5e105041 //sha256h2 q1,q2,v16.4s +.long 0x5e0760c4 //sha256su1 v4.4s,v6.4s,v7.4s + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.4s,v6.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h q0,q1,v17.4s +.long 0x5e115041 //sha256h2 q1,q2,v17.4s +.long 0x5e0460e5 //sha256su1 v5.4s,v7.4s,v4.4s + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.4s,v7.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h q0,q1,v16.4s 
+.long 0x5e105041 //sha256h2 q1,q2,v16.4s +.long 0x5e056086 //sha256su1 v6.4s,v4.4s,v5.4s + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.4s,v4.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h q0,q1,v17.4s +.long 0x5e115041 //sha256h2 q1,q2,v17.4s +.long 0x5e0660a7 //sha256su1 v7.4s,v5.4s,v6.4s + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s +.long 0x5e2828a4 //sha256su0 v4.4s,v5.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h q0,q1,v16.4s +.long 0x5e105041 //sha256h2 q1,q2,v16.4s +.long 0x5e0760c4 //sha256su1 v4.4s,v6.4s,v7.4s + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s +.long 0x5e2828c5 //sha256su0 v5.4s,v6.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h q0,q1,v17.4s +.long 0x5e115041 //sha256h2 q1,q2,v17.4s +.long 0x5e0460e5 //sha256su1 v5.4s,v7.4s,v4.4s + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v6.4s +.long 0x5e2828e6 //sha256su0 v6.4s,v7.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h q0,q1,v16.4s +.long 0x5e105041 //sha256h2 q1,q2,v16.4s +.long 0x5e056086 //sha256su1 v6.4s,v4.4s,v5.4s + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v7.4s +.long 0x5e282887 //sha256su0 v7.4s,v4.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h q0,q1,v17.4s +.long 0x5e115041 //sha256h2 q1,q2,v17.4s +.long 0x5e0660a7 //sha256su1 v7.4s,v5.4s,v6.4s + ld1 {v17.4s},[x3],#16 + add v16.4s,v16.4s,v4.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h q0,q1,v16.4s +.long 0x5e105041 //sha256h2 q1,q2,v16.4s + + ld1 {v16.4s},[x3],#16 + add v17.4s,v17.4s,v5.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h q0,q1,v17.4s +.long 0x5e115041 //sha256h2 q1,q2,v17.4s + + ld1 {v17.4s},[x3] + add v16.4s,v16.4s,v6.4s + sub x3,x3,#64*4-16 // rewind + orr v2.16b,v0.16b,v0.16b +.long 0x5e104020 //sha256h q0,q1,v16.4s +.long 0x5e105041 //sha256h2 q1,q2,v16.4s + + add v17.4s,v17.4s,v7.4s + orr v2.16b,v0.16b,v0.16b +.long 0x5e114020 //sha256h q0,q1,v17.4s +.long 0x5e115041 //sha256h2 q1,q2,v17.4s + + add 
v0.4s,v0.4s,v18.4s + add v1.4s,v1.4s,v19.4s + + cbnz x2,Loop_hw + + st1 {v0.4s,v1.4s},[x0] + + ldr x29,[sp],#16 + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) diff --git a/third_party/boringssl/gen/bcm/sha256-x86_64-apple.S b/third_party/boringssl/gen/bcm/sha256-x86_64-apple.S new file mode 100644 index 00000000..367f0d33 --- /dev/null +++ b/third_party/boringssl/gen/bcm/sha256-x86_64-apple.S @@ -0,0 +1,4170 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.text + +.globl _sha256_block_data_order_nohw +.private_extern _sha256_block_data_order_nohw + +.p2align 4 +_sha256_block_data_order_nohw: + +_CET_ENDBR + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + shlq $4,%rdx + subq $64+32,%rsp + leaq (%rsi,%rdx,4),%rdx + andq $-64,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %rax,88(%rsp) + +L$prologue: + + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + jmp L$loop + +.p2align 4 +L$loop: + movl %ebx,%edi + leaq K256(%rip),%rbp + xorl %ecx,%edi + movl 0(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl (%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + + leaq 4(%rbp),%rbp + addl %r14d,%r11d + movl 4(%rsi),%r12d + 
movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + + leaq 4(%rbp),%rbp + addl %r14d,%r10d + movl 8(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl (%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + + leaq 4(%rbp),%rbp + addl %r14d,%r9d + movl 12(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + + leaq 20(%rbp),%rbp + addl %r14d,%r8d + movl 16(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + 
addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl (%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + + leaq 4(%rbp),%rbp + addl %r14d,%edx + movl 20(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl (%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + + leaq 4(%rbp),%rbp + addl %r14d,%ecx + movl 24(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl (%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + + leaq 4(%rbp),%rbp + addl %r14d,%ebx + movl 28(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl 
%edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + + leaq 20(%rbp),%rbp + addl %r14d,%eax + movl 32(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl (%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + + leaq 4(%rbp),%rbp + addl %r14d,%r11d + movl 36(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + + leaq 4(%rbp),%rbp + addl %r14d,%r10d + movl 40(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl (%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + + leaq 4(%rbp),%rbp + addl %r14d,%r9d + movl 44(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + 
bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + + leaq 20(%rbp),%rbp + addl %r14d,%r8d + movl 48(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl (%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + + leaq 4(%rbp),%rbp + addl %r14d,%edx + movl 52(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl (%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + + leaq 4(%rbp),%rbp + addl %r14d,%ecx + movl 56(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + 
rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl (%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + + leaq 4(%rbp),%rbp + addl %r14d,%ebx + movl 60(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + + leaq 20(%rbp),%rbp + jmp L$rounds_16_xx +.p2align 4 +L$rounds_16_xx: + movl 4(%rsp),%r13d + movl 56(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 36(%rsp),%r12d + + addl 0(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl (%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + + leaq 4(%rbp),%rbp + movl 8(%rsp),%r13d + movl 60(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl 
$7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 40(%rsp),%r12d + + addl 4(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + + leaq 4(%rbp),%rbp + movl 12(%rsp),%r13d + movl 0(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 44(%rsp),%r12d + + addl 8(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl (%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + + leaq 4(%rbp),%rbp + movl 16(%rsp),%r13d + movl 4(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 48(%rsp),%r12d + + addl 12(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + 
+ xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + + leaq 20(%rbp),%rbp + movl 20(%rsp),%r13d + movl 8(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 52(%rsp),%r12d + + addl 16(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl (%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + + leaq 4(%rbp),%rbp + movl 24(%rsp),%r13d + movl 12(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 56(%rsp),%r12d + + addl 20(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi 
+ addl (%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + + leaq 4(%rbp),%rbp + movl 28(%rsp),%r13d + movl 16(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 60(%rsp),%r12d + + addl 24(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl (%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + + leaq 4(%rbp),%rbp + movl 32(%rsp),%r13d + movl 20(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 0(%rsp),%r12d + + addl 28(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + + leaq 20(%rbp),%rbp + movl 
36(%rsp),%r13d + movl 24(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 4(%rsp),%r12d + + addl 32(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl (%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + + leaq 4(%rbp),%rbp + movl 40(%rsp),%r13d + movl 28(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 8(%rsp),%r12d + + addl 36(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + + leaq 4(%rbp),%rbp + movl 44(%rsp),%r13d + movl 32(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl 
$17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 12(%rsp),%r12d + + addl 40(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl (%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + + leaq 4(%rbp),%rbp + movl 48(%rsp),%r13d + movl 36(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 16(%rsp),%r12d + + addl 44(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + + leaq 20(%rbp),%rbp + movl 52(%rsp),%r13d + movl 40(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 20(%rsp),%r12d + + addl 48(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl 
%ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl (%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + + leaq 4(%rbp),%rbp + movl 56(%rsp),%r13d + movl 44(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 24(%rsp),%r12d + + addl 52(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl (%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + + leaq 4(%rbp),%rbp + movl 60(%rsp),%r13d + movl 48(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 28(%rsp),%r12d + + addl 56(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl (%rbp),%r12d + xorl 
%ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + + leaq 4(%rbp),%rbp + movl 0(%rsp),%r13d + movl 52(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 32(%rsp),%r12d + + addl 60(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + + leaq 20(%rbp),%rbp + cmpb $0,3(%rbp) + jnz L$rounds_16_xx + + movq 64+0(%rsp),%rdi + addl %r14d,%eax + leaq 64(%rsi),%rsi + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb L$loop + + movq 88(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$epilogue: + ret + + +.section __DATA,__const +.p2align 6 + +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 
0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.byte 
83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.text +.globl _sha256_block_data_order_hw +.private_extern _sha256_block_data_order_hw + +.p2align 6 +_sha256_block_data_order_hw: + +_CET_ENDBR + leaq K256+128(%rip),%rcx + movdqu (%rdi),%xmm1 + movdqu 16(%rdi),%xmm2 + movdqa 512-128(%rcx),%xmm7 + + pshufd $0x1b,%xmm1,%xmm0 + pshufd $0xb1,%xmm1,%xmm1 + pshufd $0x1b,%xmm2,%xmm2 + movdqa %xmm7,%xmm8 + palignr $8,%xmm2,%xmm1 + punpcklqdq %xmm0,%xmm2 + jmp L$oop_shaext + +.p2align 4 +L$oop_shaext: + movdqu (%rsi),%xmm3 + movdqu 16(%rsi),%xmm4 + movdqu 32(%rsi),%xmm5 + pshufb %xmm7,%xmm3 + movdqu 48(%rsi),%xmm6 + + movdqa 0-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 + pshufb %xmm7,%xmm4 + movdqa %xmm2,%xmm10 + sha256rnds2 %xmm1,%xmm2 + pshufd $0x0e,%xmm0,%xmm0 + nop + movdqa %xmm1,%xmm9 + sha256rnds2 %xmm2,%xmm1 + + movdqa 32-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 + pshufb %xmm7,%xmm5 + sha256rnds2 %xmm1,%xmm2 + pshufd $0x0e,%xmm0,%xmm0 + leaq 64(%rsi),%rsi + sha256msg1 %xmm4,%xmm3 + sha256rnds2 %xmm2,%xmm1 + + movdqa 64-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 + pshufb %xmm7,%xmm6 + sha256rnds2 %xmm1,%xmm2 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 + palignr $4,%xmm5,%xmm7 + nop + paddd %xmm7,%xmm3 + sha256msg1 %xmm5,%xmm4 + sha256rnds2 %xmm2,%xmm1 + + movdqa 96-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 + sha256msg2 %xmm6,%xmm3 + sha256rnds2 %xmm1,%xmm2 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 + palignr $4,%xmm6,%xmm7 + nop + paddd %xmm7,%xmm4 + sha256msg1 %xmm6,%xmm5 + sha256rnds2 %xmm2,%xmm1 + movdqa 128-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 + sha256msg2 %xmm3,%xmm4 + sha256rnds2 %xmm1,%xmm2 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 + palignr $4,%xmm3,%xmm7 + nop + paddd %xmm7,%xmm5 + sha256msg1 %xmm3,%xmm6 + sha256rnds2 %xmm2,%xmm1 + movdqa 160-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 + 
sha256msg2 %xmm4,%xmm5 + sha256rnds2 %xmm1,%xmm2 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 + palignr $4,%xmm4,%xmm7 + nop + paddd %xmm7,%xmm6 + sha256msg1 %xmm4,%xmm3 + sha256rnds2 %xmm2,%xmm1 + movdqa 192-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 + sha256msg2 %xmm5,%xmm6 + sha256rnds2 %xmm1,%xmm2 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 + palignr $4,%xmm5,%xmm7 + nop + paddd %xmm7,%xmm3 + sha256msg1 %xmm5,%xmm4 + sha256rnds2 %xmm2,%xmm1 + movdqa 224-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 + sha256msg2 %xmm6,%xmm3 + sha256rnds2 %xmm1,%xmm2 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 + palignr $4,%xmm6,%xmm7 + nop + paddd %xmm7,%xmm4 + sha256msg1 %xmm6,%xmm5 + sha256rnds2 %xmm2,%xmm1 + movdqa 256-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 + sha256msg2 %xmm3,%xmm4 + sha256rnds2 %xmm1,%xmm2 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 + palignr $4,%xmm3,%xmm7 + nop + paddd %xmm7,%xmm5 + sha256msg1 %xmm3,%xmm6 + sha256rnds2 %xmm2,%xmm1 + movdqa 288-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 + sha256msg2 %xmm4,%xmm5 + sha256rnds2 %xmm1,%xmm2 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 + palignr $4,%xmm4,%xmm7 + nop + paddd %xmm7,%xmm6 + sha256msg1 %xmm4,%xmm3 + sha256rnds2 %xmm2,%xmm1 + movdqa 320-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 + sha256msg2 %xmm5,%xmm6 + sha256rnds2 %xmm1,%xmm2 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 + palignr $4,%xmm5,%xmm7 + nop + paddd %xmm7,%xmm3 + sha256msg1 %xmm5,%xmm4 + sha256rnds2 %xmm2,%xmm1 + movdqa 352-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 + sha256msg2 %xmm6,%xmm3 + sha256rnds2 %xmm1,%xmm2 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 + palignr $4,%xmm6,%xmm7 + nop + paddd %xmm7,%xmm4 + sha256msg1 %xmm6,%xmm5 + sha256rnds2 %xmm2,%xmm1 + movdqa 384-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 + sha256msg2 %xmm3,%xmm4 + sha256rnds2 %xmm1,%xmm2 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 + palignr $4,%xmm3,%xmm7 + nop + paddd %xmm7,%xmm5 + sha256msg1 %xmm3,%xmm6 + sha256rnds2 %xmm2,%xmm1 + movdqa 416-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 + 
sha256msg2 %xmm4,%xmm5 + sha256rnds2 %xmm1,%xmm2 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 + palignr $4,%xmm4,%xmm7 + sha256rnds2 %xmm2,%xmm1 + paddd %xmm7,%xmm6 + + movdqa 448-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 + sha256rnds2 %xmm1,%xmm2 + pshufd $0x0e,%xmm0,%xmm0 + sha256msg2 %xmm5,%xmm6 + movdqa %xmm8,%xmm7 + sha256rnds2 %xmm2,%xmm1 + + movdqa 480-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 + nop + sha256rnds2 %xmm1,%xmm2 + pshufd $0x0e,%xmm0,%xmm0 + decq %rdx + nop + sha256rnds2 %xmm2,%xmm1 + + paddd %xmm10,%xmm2 + paddd %xmm9,%xmm1 + jnz L$oop_shaext + + pshufd $0xb1,%xmm2,%xmm2 + pshufd $0x1b,%xmm1,%xmm7 + pshufd $0xb1,%xmm1,%xmm1 + punpckhqdq %xmm2,%xmm1 + palignr $8,%xmm7,%xmm2 + + movdqu %xmm1,(%rdi) + movdqu %xmm2,16(%rdi) + ret + + +.globl _sha256_block_data_order_ssse3 +.private_extern _sha256_block_data_order_ssse3 + +.p2align 6 +_sha256_block_data_order_ssse3: + +_CET_ENDBR + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + shlq $4,%rdx + subq $96,%rsp + leaq (%rsi,%rdx,4),%rdx + andq $-64,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %rax,88(%rsp) + +L$prologue_ssse3: + + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + + + jmp L$loop_ssse3 +.p2align 4 +L$loop_ssse3: + movdqa K256+512(%rip),%xmm7 + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + pshufb %xmm7,%xmm0 + movdqu 48(%rsi),%xmm3 + leaq K256(%rip),%rbp + pshufb %xmm7,%xmm1 + movdqa 0(%rbp),%xmm4 + movdqa 32(%rbp),%xmm5 + pshufb %xmm7,%xmm2 + paddd %xmm0,%xmm4 + movdqa 64(%rbp),%xmm6 + pshufb %xmm7,%xmm3 + movdqa 96(%rbp),%xmm7 + paddd %xmm1,%xmm5 + paddd %xmm2,%xmm6 + paddd %xmm3,%xmm7 + movdqa %xmm4,0(%rsp) + movl %eax,%r14d + movdqa %xmm5,16(%rsp) + movl %ebx,%edi + movdqa %xmm6,32(%rsp) + xorl %ecx,%edi + movdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp 
L$ssse3_00_47 + +.p2align 4 +L$ssse3_00_47: + subq $-128,%rbp + rorl $14,%r13d + movdqa %xmm1,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm3,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + palignr $4,%xmm0,%xmm4 + andl %r8d,%r12d + xorl %r8d,%r13d + palignr $4,%xmm2,%xmm7 + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm3,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 4(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm0 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm0 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm0,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl 
%edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 0(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm0,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,0(%rsp) + rorl $14,%r13d + movdqa %xmm2,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm0,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + palignr $4,%xmm1,%xmm4 + andl %eax,%r12d + xorl %eax,%r13d + palignr $4,%xmm3,%xmm7 + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm0,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 20(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm1 + rorl $2,%r14d + addl %ecx,%r10d + 
psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm1 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm1,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 32(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm1,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,16(%rsp) + rorl $14,%r13d + movdqa %xmm3,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm1,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + palignr $4,%xmm2,%xmm4 + andl %r8d,%r12d + xorl %r8d,%r13d + palignr $4,%xmm0,%xmm7 + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + 
movl %edx,%r13d + pshufd $250,%xmm1,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 36(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm2 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm2 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm2,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 64(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm2,%xmm6 + movl 
%eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,32(%rsp) + rorl $14,%r13d + movdqa %xmm0,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm2,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + palignr $4,%xmm3,%xmm4 + andl %eax,%r12d + xorl %eax,%r13d + palignr $4,%xmm1,%xmm7 + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm2,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 52(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm3 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm3 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm3,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl 
%r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 96(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm3,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,48(%rsp) + cmpb $0,131(%rbp) + jne L$ssse3_00_47 + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl 
%r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + 
rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + 
andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl 
$6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 64+0(%rsp),%rdi + movl %r14d,%eax + + addl 0(%rdi),%eax + leaq 64(%rsi),%rsi + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb L$loop_ssse3 + + movq 88(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$epilogue_ssse3: + ret + + +.globl _sha256_block_data_order_avx +.private_extern _sha256_block_data_order_avx + +.p2align 6 +_sha256_block_data_order_avx: + +_CET_ENDBR + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + shlq $4,%rdx + subq $96,%rsp + leaq (%rsi,%rdx,4),%rdx + andq $-64,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %rax,88(%rsp) + +L$prologue_avx: + + vzeroupper + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + vmovdqa K256+512+32(%rip),%xmm8 + vmovdqa K256+512+64(%rip),%xmm9 + jmp L$loop_avx +.p2align 4 +L$loop_avx: + vmovdqa K256+512(%rip),%xmm7 + vmovdqu 0(%rsi),%xmm0 + vmovdqu 16(%rsi),%xmm1 + vmovdqu 32(%rsi),%xmm2 + vmovdqu 48(%rsi),%xmm3 + vpshufb %xmm7,%xmm0,%xmm0 + leaq K256(%rip),%rbp + vpshufb %xmm7,%xmm1,%xmm1 + vpshufb %xmm7,%xmm2,%xmm2 + vpaddd 0(%rbp),%xmm0,%xmm4 + vpshufb %xmm7,%xmm3,%xmm3 + vpaddd 32(%rbp),%xmm1,%xmm5 + vpaddd 64(%rbp),%xmm2,%xmm6 + vpaddd 96(%rbp),%xmm3,%xmm7 + vmovdqa %xmm4,0(%rsp) + movl %eax,%r14d + vmovdqa 
%xmm5,16(%rsp) + movl %ebx,%edi + vmovdqa %xmm6,32(%rsp) + xorl %ecx,%edi + vmovdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp L$avx_00_47 + +.p2align 4 +L$avx_00_47: + subq $-128,%rbp + vpalignr $4,%xmm0,%xmm1,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + vpalignr $4,%xmm2,%xmm3,%xmm7 + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vpaddd %xmm7,%xmm0,%xmm0 + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + vpshufd $250,%xmm3,%xmm7 + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + vpsrld $11,%xmm6,%xmm6 + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + vpaddd %xmm4,%xmm0,%xmm0 + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + vpxor %xmm7,%xmm6,%xmm6 + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + vpaddd %xmm6,%xmm0,%xmm0 + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + vpshufd $80,%xmm0,%xmm7 + 
movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + vpxor %xmm7,%xmm6,%xmm6 + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + vpaddd %xmm6,%xmm0,%xmm0 + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vpaddd 0(%rbp),%xmm0,%xmm6 + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + vmovdqa %xmm6,0(%rsp) + vpalignr $4,%xmm1,%xmm2,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + vpalignr $4,%xmm3,%xmm0,%xmm7 + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vpaddd %xmm7,%xmm1,%xmm1 + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + vpshufd $250,%xmm0,%xmm7 + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + vpsrld $11,%xmm6,%xmm6 + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl 
%edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + vpaddd %xmm4,%xmm1,%xmm1 + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + vpxor %xmm7,%xmm6,%xmm6 + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + vpaddd %xmm6,%xmm1,%xmm1 + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + vpshufd $80,%xmm1,%xmm7 + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + vpxor %xmm7,%xmm6,%xmm6 + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + vpaddd %xmm6,%xmm1,%xmm1 + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpaddd 32(%rbp),%xmm1,%xmm6 + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + vmovdqa %xmm6,16(%rsp) + vpalignr $4,%xmm2,%xmm3,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + vpalignr $4,%xmm0,%xmm1,%xmm7 + shrdl 
$9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vpaddd %xmm7,%xmm2,%xmm2 + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + vpshufd $250,%xmm1,%xmm7 + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + vpsrld $11,%xmm6,%xmm6 + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + vpaddd %xmm4,%xmm2,%xmm2 + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + vpxor %xmm7,%xmm6,%xmm6 + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + vpaddd %xmm6,%xmm2,%xmm2 + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + vpshufd $80,%xmm2,%xmm7 + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + vpxor %xmm7,%xmm6,%xmm6 + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + 
vpsrlq $2,%xmm7,%xmm7 + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + vpaddd %xmm6,%xmm2,%xmm2 + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vpaddd 64(%rbp),%xmm2,%xmm6 + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + vmovdqa %xmm6,32(%rsp) + vpalignr $4,%xmm3,%xmm0,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + vpalignr $4,%xmm1,%xmm2,%xmm7 + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vpaddd %xmm7,%xmm3,%xmm3 + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + vpshufd $250,%xmm2,%xmm7 + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + vpsrld $11,%xmm6,%xmm6 + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + vpaddd 
%xmm4,%xmm3,%xmm3 + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + vpxor %xmm7,%xmm6,%xmm6 + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + vpaddd %xmm6,%xmm3,%xmm3 + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + vpshufd $80,%xmm3,%xmm7 + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + vpxor %xmm7,%xmm6,%xmm6 + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + vpaddd %xmm6,%xmm3,%xmm3 + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpaddd 96(%rbp),%xmm3,%xmm6 + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + vmovdqa %xmm6,48(%rsp) + cmpb $0,131(%rbp) + jne L$avx_00_47 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi 
+ shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl 
%r13d,%edx + xorl %r9d,%edi + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl 
%eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + shrdl $6,%r13d,%r13d + 
andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 64+0(%rsp),%rdi + movl %r14d,%eax + + addl 0(%rdi),%eax + leaq 64(%rsi),%rsi + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + 
movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb L$loop_avx + + movq 88(%rsp),%rsi + + vzeroupper + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$epilogue_avx: + ret + + +#endif diff --git a/third_party/boringssl/gen/bcm/sha256-x86_64-linux.S b/third_party/boringssl/gen/bcm/sha256-x86_64-linux.S new file mode 100644 index 00000000..938f5316 --- /dev/null +++ b/third_party/boringssl/gen/bcm/sha256-x86_64-linux.S @@ -0,0 +1,4170 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.text + +.globl sha256_block_data_order_nohw +.hidden sha256_block_data_order_nohw +.type sha256_block_data_order_nohw,@function +.align 16 +sha256_block_data_order_nohw: +.cfi_startproc +_CET_ENDBR + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $64+32,%rsp + leaq (%rsi,%rdx,4),%rdx + andq $-64,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %rax,88(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 +.Lprologue: + + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + jmp .Lloop + +.align 16 +.Lloop: + movl %ebx,%edi + leaq K256(%rip),%rbp + xorl %ecx,%edi + movl 0(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + 
rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl (%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + + leaq 4(%rbp),%rbp + addl %r14d,%r11d + movl 4(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + + leaq 4(%rbp),%rbp + addl %r14d,%r10d + movl 8(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl (%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + + leaq 4(%rbp),%rbp + addl %r14d,%r9d + movl 12(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl 
%r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + + leaq 20(%rbp),%rbp + addl %r14d,%r8d + movl 16(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl (%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + + leaq 4(%rbp),%rbp + addl %r14d,%edx + movl 20(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl (%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + + leaq 4(%rbp),%rbp + addl %r14d,%ecx + movl 24(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl (%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + + leaq 4(%rbp),%rbp + addl %r14d,%ebx + movl 28(%rsi),%r12d + movl %r9d,%r13d + movl 
%ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + + leaq 20(%rbp),%rbp + addl %r14d,%eax + movl 32(%rsi),%r12d + movl %r8d,%r13d + movl %eax,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl (%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + + leaq 4(%rbp),%rbp + addl %r14d,%r11d + movl 36(%rsi),%r12d + movl %edx,%r13d + movl %r11d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + + leaq 4(%rbp),%rbp + addl %r14d,%r10d + movl 40(%rsi),%r12d + movl %ecx,%r13d + movl %r10d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d 
+ xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl (%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + + leaq 4(%rbp),%rbp + addl %r14d,%r9d + movl 44(%rsi),%r12d + movl %ebx,%r13d + movl %r9d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + + leaq 20(%rbp),%rbp + addl %r14d,%r8d + movl 48(%rsi),%r12d + movl %eax,%r13d + movl %r8d,%r14d + bswapl %r12d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl (%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + + leaq 4(%rbp),%rbp + addl %r14d,%edx + movl 52(%rsi),%r12d + movl %r11d,%r13d + movl %edx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl (%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d 
+ addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + + leaq 4(%rbp),%rbp + addl %r14d,%ecx + movl 56(%rsi),%r12d + movl %r10d,%r13d + movl %ecx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl (%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + + leaq 4(%rbp),%rbp + addl %r14d,%ebx + movl 60(%rsi),%r12d + movl %r9d,%r13d + movl %ebx,%r14d + bswapl %r12d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + + leaq 20(%rbp),%rbp + jmp .Lrounds_16_xx +.align 16 +.Lrounds_16_xx: + movl 4(%rsp),%r13d + movl 56(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 36(%rsp),%r12d + + addl 0(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,0(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + 
addl (%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + + leaq 4(%rbp),%rbp + movl 8(%rsp),%r13d + movl 60(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 40(%rsp),%r12d + + addl 4(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,4(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + + xorl %eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + + leaq 4(%rbp),%rbp + movl 12(%rsp),%r13d + movl 0(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 44(%rsp),%r12d + + addl 8(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,8(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl (%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + + leaq 4(%rbp),%rbp + movl 
16(%rsp),%r13d + movl 4(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 48(%rsp),%r12d + + addl 12(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,12(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + + leaq 20(%rbp),%rbp + movl 20(%rsp),%r13d + movl 8(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 52(%rsp),%r12d + + addl 16(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,16(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl (%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + + leaq 4(%rbp),%rbp + movl 24(%rsp),%r13d + movl 12(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl 
%r13d,%r12d + xorl %r14d,%edi + addl 56(%rsp),%r12d + + addl 20(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,20(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl (%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + + leaq 4(%rbp),%rbp + movl 28(%rsp),%r13d + movl 16(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 60(%rsp),%r12d + + addl 24(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,24(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl (%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + + leaq 4(%rbp),%rbp + movl 32(%rsp),%r13d + movl 20(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 0(%rsp),%r12d + + addl 28(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl 
%r12d,28(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + + leaq 20(%rbp),%rbp + movl 36(%rsp),%r13d + movl 24(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%eax + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 4(%rsp),%r12d + + addl 32(%rsp),%r12d + movl %r8d,%r13d + addl %r15d,%r12d + movl %eax,%r14d + rorl $14,%r13d + movl %r9d,%r15d + + xorl %r8d,%r13d + rorl $9,%r14d + xorl %r10d,%r15d + + movl %r12d,32(%rsp) + xorl %eax,%r14d + andl %r8d,%r15d + + rorl $5,%r13d + addl %r11d,%r12d + xorl %r10d,%r15d + + rorl $11,%r14d + xorl %r8d,%r13d + addl %r15d,%r12d + + movl %eax,%r15d + addl (%rbp),%r12d + xorl %eax,%r14d + + xorl %ebx,%r15d + rorl $6,%r13d + movl %ebx,%r11d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r11d + addl %r12d,%edx + addl %r12d,%r11d + + leaq 4(%rbp),%rbp + movl 40(%rsp),%r13d + movl 28(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r11d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 8(%rsp),%r12d + + addl 36(%rsp),%r12d + movl %edx,%r13d + addl %edi,%r12d + movl %r11d,%r14d + rorl $14,%r13d + movl %r8d,%edi + + xorl %edx,%r13d + rorl $9,%r14d + xorl %r9d,%edi + + movl %r12d,36(%rsp) + xorl %r11d,%r14d + andl %edx,%edi + + rorl $5,%r13d + addl %r10d,%r12d + xorl %r9d,%edi + + rorl $11,%r14d + xorl %edx,%r13d + addl %edi,%r12d + + movl %r11d,%edi + addl (%rbp),%r12d + xorl %r11d,%r14d + + xorl 
%eax,%edi + rorl $6,%r13d + movl %eax,%r10d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r10d + addl %r12d,%ecx + addl %r12d,%r10d + + leaq 4(%rbp),%rbp + movl 44(%rsp),%r13d + movl 32(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r10d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 12(%rsp),%r12d + + addl 40(%rsp),%r12d + movl %ecx,%r13d + addl %r15d,%r12d + movl %r10d,%r14d + rorl $14,%r13d + movl %edx,%r15d + + xorl %ecx,%r13d + rorl $9,%r14d + xorl %r8d,%r15d + + movl %r12d,40(%rsp) + xorl %r10d,%r14d + andl %ecx,%r15d + + rorl $5,%r13d + addl %r9d,%r12d + xorl %r8d,%r15d + + rorl $11,%r14d + xorl %ecx,%r13d + addl %r15d,%r12d + + movl %r10d,%r15d + addl (%rbp),%r12d + xorl %r10d,%r14d + + xorl %r11d,%r15d + rorl $6,%r13d + movl %r11d,%r9d + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%r9d + addl %r12d,%ebx + addl %r12d,%r9d + + leaq 4(%rbp),%rbp + movl 48(%rsp),%r13d + movl 36(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%r9d + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 16(%rsp),%r12d + + addl 44(%rsp),%r12d + movl %ebx,%r13d + addl %edi,%r12d + movl %r9d,%r14d + rorl $14,%r13d + movl %ecx,%edi + + xorl %ebx,%r13d + rorl $9,%r14d + xorl %edx,%edi + + movl %r12d,44(%rsp) + xorl %r9d,%r14d + andl %ebx,%edi + + rorl $5,%r13d + addl %r8d,%r12d + xorl %edx,%edi + + rorl $11,%r14d + xorl %ebx,%r13d + addl %edi,%r12d + + movl %r9d,%edi + addl (%rbp),%r12d + xorl %r9d,%r14d + + xorl %r10d,%edi + rorl $6,%r13d + movl %r10d,%r8d + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%r8d + addl %r12d,%eax + addl %r12d,%r8d + + leaq 20(%rbp),%rbp + movl 52(%rsp),%r13d + movl 40(%rsp),%r15d + + movl 
%r13d,%r12d + rorl $11,%r13d + addl %r14d,%r8d + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 20(%rsp),%r12d + + addl 48(%rsp),%r12d + movl %eax,%r13d + addl %r15d,%r12d + movl %r8d,%r14d + rorl $14,%r13d + movl %ebx,%r15d + + xorl %eax,%r13d + rorl $9,%r14d + xorl %ecx,%r15d + + movl %r12d,48(%rsp) + xorl %r8d,%r14d + andl %eax,%r15d + + rorl $5,%r13d + addl %edx,%r12d + xorl %ecx,%r15d + + rorl $11,%r14d + xorl %eax,%r13d + addl %r15d,%r12d + + movl %r8d,%r15d + addl (%rbp),%r12d + xorl %r8d,%r14d + + xorl %r9d,%r15d + rorl $6,%r13d + movl %r9d,%edx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%edx + addl %r12d,%r11d + addl %r12d,%edx + + leaq 4(%rbp),%rbp + movl 56(%rsp),%r13d + movl 44(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%edx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 24(%rsp),%r12d + + addl 52(%rsp),%r12d + movl %r11d,%r13d + addl %edi,%r12d + movl %edx,%r14d + rorl $14,%r13d + movl %eax,%edi + + xorl %r11d,%r13d + rorl $9,%r14d + xorl %ebx,%edi + + movl %r12d,52(%rsp) + xorl %edx,%r14d + andl %r11d,%edi + + rorl $5,%r13d + addl %ecx,%r12d + xorl %ebx,%edi + + rorl $11,%r14d + xorl %r11d,%r13d + addl %edi,%r12d + + movl %edx,%edi + addl (%rbp),%r12d + xorl %edx,%r14d + + xorl %r8d,%edi + rorl $6,%r13d + movl %r8d,%ecx + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%ecx + addl %r12d,%r10d + addl %r12d,%ecx + + leaq 4(%rbp),%rbp + movl 60(%rsp),%r13d + movl 48(%rsp),%r15d + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ecx + movl %r15d,%r14d + rorl $2,%r15d + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%r15d + shrl $10,%r14d + + rorl $17,%r15d + xorl %r13d,%r12d + xorl %r14d,%r15d + addl 
28(%rsp),%r12d + + addl 56(%rsp),%r12d + movl %r10d,%r13d + addl %r15d,%r12d + movl %ecx,%r14d + rorl $14,%r13d + movl %r11d,%r15d + + xorl %r10d,%r13d + rorl $9,%r14d + xorl %eax,%r15d + + movl %r12d,56(%rsp) + xorl %ecx,%r14d + andl %r10d,%r15d + + rorl $5,%r13d + addl %ebx,%r12d + xorl %eax,%r15d + + rorl $11,%r14d + xorl %r10d,%r13d + addl %r15d,%r12d + + movl %ecx,%r15d + addl (%rbp),%r12d + xorl %ecx,%r14d + + xorl %edx,%r15d + rorl $6,%r13d + movl %edx,%ebx + + andl %r15d,%edi + rorl $2,%r14d + addl %r13d,%r12d + + xorl %edi,%ebx + addl %r12d,%r9d + addl %r12d,%ebx + + leaq 4(%rbp),%rbp + movl 0(%rsp),%r13d + movl 52(%rsp),%edi + + movl %r13d,%r12d + rorl $11,%r13d + addl %r14d,%ebx + movl %edi,%r14d + rorl $2,%edi + + xorl %r12d,%r13d + shrl $3,%r12d + rorl $7,%r13d + xorl %r14d,%edi + shrl $10,%r14d + + rorl $17,%edi + xorl %r13d,%r12d + xorl %r14d,%edi + addl 32(%rsp),%r12d + + addl 60(%rsp),%r12d + movl %r9d,%r13d + addl %edi,%r12d + movl %ebx,%r14d + rorl $14,%r13d + movl %r10d,%edi + + xorl %r9d,%r13d + rorl $9,%r14d + xorl %r11d,%edi + + movl %r12d,60(%rsp) + xorl %ebx,%r14d + andl %r9d,%edi + + rorl $5,%r13d + addl %eax,%r12d + xorl %r11d,%edi + + rorl $11,%r14d + xorl %r9d,%r13d + addl %edi,%r12d + + movl %ebx,%edi + addl (%rbp),%r12d + xorl %ebx,%r14d + + xorl %ecx,%edi + rorl $6,%r13d + movl %ecx,%eax + + andl %edi,%r15d + rorl $2,%r14d + addl %r13d,%r12d + + xorl %r15d,%eax + addl %r12d,%r8d + addl %r12d,%eax + + leaq 20(%rbp),%rbp + cmpb $0,3(%rbp) + jnz .Lrounds_16_xx + + movq 64+0(%rsp),%rdi + addl %r14d,%eax + leaq 64(%rsi),%rsi + + addl 0(%rdi),%eax + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop + + movq 88(%rsp),%rsi +.cfi_def_cfa %rsi,8 + 
movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue: + ret +.cfi_endproc +.size sha256_block_data_order_nohw,.-sha256_block_data_order_nohw +.section .rodata +.align 64 +.type K256,@object +K256: +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 +.long 
0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 +.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 +.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.text +.globl sha256_block_data_order_hw +.hidden sha256_block_data_order_hw +.type sha256_block_data_order_hw,@function +.align 64 +sha256_block_data_order_hw: +.cfi_startproc +_CET_ENDBR + leaq K256+128(%rip),%rcx + movdqu (%rdi),%xmm1 + movdqu 16(%rdi),%xmm2 + movdqa 512-128(%rcx),%xmm7 + + pshufd $0x1b,%xmm1,%xmm0 + pshufd $0xb1,%xmm1,%xmm1 + pshufd $0x1b,%xmm2,%xmm2 + movdqa %xmm7,%xmm8 + palignr $8,%xmm2,%xmm1 + punpcklqdq %xmm0,%xmm2 + jmp .Loop_shaext + +.align 16 +.Loop_shaext: + movdqu (%rsi),%xmm3 + movdqu 16(%rsi),%xmm4 + movdqu 32(%rsi),%xmm5 + pshufb %xmm7,%xmm3 + movdqu 48(%rsi),%xmm6 + + movdqa 0-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 + pshufb %xmm7,%xmm4 + movdqa %xmm2,%xmm10 + sha256rnds2 %xmm1,%xmm2 + pshufd $0x0e,%xmm0,%xmm0 + nop + movdqa %xmm1,%xmm9 + sha256rnds2 %xmm2,%xmm1 + + movdqa 32-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 + pshufb %xmm7,%xmm5 + sha256rnds2 %xmm1,%xmm2 + pshufd $0x0e,%xmm0,%xmm0 + leaq 64(%rsi),%rsi + sha256msg1 %xmm4,%xmm3 + sha256rnds2 %xmm2,%xmm1 + + movdqa 64-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 + pshufb %xmm7,%xmm6 + sha256rnds2 %xmm1,%xmm2 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 + palignr $4,%xmm5,%xmm7 + nop + paddd %xmm7,%xmm3 + sha256msg1 %xmm5,%xmm4 + sha256rnds2 %xmm2,%xmm1 + + movdqa 96-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 + sha256msg2 %xmm6,%xmm3 + sha256rnds2 %xmm1,%xmm2 + 
pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 + palignr $4,%xmm6,%xmm7 + nop + paddd %xmm7,%xmm4 + sha256msg1 %xmm6,%xmm5 + sha256rnds2 %xmm2,%xmm1 + movdqa 128-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 + sha256msg2 %xmm3,%xmm4 + sha256rnds2 %xmm1,%xmm2 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 + palignr $4,%xmm3,%xmm7 + nop + paddd %xmm7,%xmm5 + sha256msg1 %xmm3,%xmm6 + sha256rnds2 %xmm2,%xmm1 + movdqa 160-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 + sha256msg2 %xmm4,%xmm5 + sha256rnds2 %xmm1,%xmm2 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 + palignr $4,%xmm4,%xmm7 + nop + paddd %xmm7,%xmm6 + sha256msg1 %xmm4,%xmm3 + sha256rnds2 %xmm2,%xmm1 + movdqa 192-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 + sha256msg2 %xmm5,%xmm6 + sha256rnds2 %xmm1,%xmm2 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 + palignr $4,%xmm5,%xmm7 + nop + paddd %xmm7,%xmm3 + sha256msg1 %xmm5,%xmm4 + sha256rnds2 %xmm2,%xmm1 + movdqa 224-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 + sha256msg2 %xmm6,%xmm3 + sha256rnds2 %xmm1,%xmm2 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 + palignr $4,%xmm6,%xmm7 + nop + paddd %xmm7,%xmm4 + sha256msg1 %xmm6,%xmm5 + sha256rnds2 %xmm2,%xmm1 + movdqa 256-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 + sha256msg2 %xmm3,%xmm4 + sha256rnds2 %xmm1,%xmm2 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 + palignr $4,%xmm3,%xmm7 + nop + paddd %xmm7,%xmm5 + sha256msg1 %xmm3,%xmm6 + sha256rnds2 %xmm2,%xmm1 + movdqa 288-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 + sha256msg2 %xmm4,%xmm5 + sha256rnds2 %xmm1,%xmm2 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 + palignr $4,%xmm4,%xmm7 + nop + paddd %xmm7,%xmm6 + sha256msg1 %xmm4,%xmm3 + sha256rnds2 %xmm2,%xmm1 + movdqa 320-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 + sha256msg2 %xmm5,%xmm6 + sha256rnds2 %xmm1,%xmm2 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm6,%xmm7 + palignr $4,%xmm5,%xmm7 + nop + paddd %xmm7,%xmm3 + sha256msg1 %xmm5,%xmm4 + sha256rnds2 %xmm2,%xmm1 + movdqa 352-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 + sha256msg2 %xmm6,%xmm3 + sha256rnds2 %xmm1,%xmm2 + 
pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm3,%xmm7 + palignr $4,%xmm6,%xmm7 + nop + paddd %xmm7,%xmm4 + sha256msg1 %xmm6,%xmm5 + sha256rnds2 %xmm2,%xmm1 + movdqa 384-128(%rcx),%xmm0 + paddd %xmm3,%xmm0 + sha256msg2 %xmm3,%xmm4 + sha256rnds2 %xmm1,%xmm2 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm4,%xmm7 + palignr $4,%xmm3,%xmm7 + nop + paddd %xmm7,%xmm5 + sha256msg1 %xmm3,%xmm6 + sha256rnds2 %xmm2,%xmm1 + movdqa 416-128(%rcx),%xmm0 + paddd %xmm4,%xmm0 + sha256msg2 %xmm4,%xmm5 + sha256rnds2 %xmm1,%xmm2 + pshufd $0x0e,%xmm0,%xmm0 + movdqa %xmm5,%xmm7 + palignr $4,%xmm4,%xmm7 + sha256rnds2 %xmm2,%xmm1 + paddd %xmm7,%xmm6 + + movdqa 448-128(%rcx),%xmm0 + paddd %xmm5,%xmm0 + sha256rnds2 %xmm1,%xmm2 + pshufd $0x0e,%xmm0,%xmm0 + sha256msg2 %xmm5,%xmm6 + movdqa %xmm8,%xmm7 + sha256rnds2 %xmm2,%xmm1 + + movdqa 480-128(%rcx),%xmm0 + paddd %xmm6,%xmm0 + nop + sha256rnds2 %xmm1,%xmm2 + pshufd $0x0e,%xmm0,%xmm0 + decq %rdx + nop + sha256rnds2 %xmm2,%xmm1 + + paddd %xmm10,%xmm2 + paddd %xmm9,%xmm1 + jnz .Loop_shaext + + pshufd $0xb1,%xmm2,%xmm2 + pshufd $0x1b,%xmm1,%xmm7 + pshufd $0xb1,%xmm1,%xmm1 + punpckhqdq %xmm2,%xmm1 + palignr $8,%xmm7,%xmm2 + + movdqu %xmm1,(%rdi) + movdqu %xmm2,16(%rdi) + ret +.cfi_endproc +.size sha256_block_data_order_hw,.-sha256_block_data_order_hw +.globl sha256_block_data_order_ssse3 +.hidden sha256_block_data_order_ssse3 +.type sha256_block_data_order_ssse3,@function +.align 64 +sha256_block_data_order_ssse3: +.cfi_startproc +_CET_ENDBR + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $96,%rsp + leaq (%rsi,%rdx,4),%rdx + andq $-64,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %rax,88(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 +.Lprologue_ssse3: + + movl 0(%rdi),%eax + movl 
4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + + + jmp .Lloop_ssse3 +.align 16 +.Lloop_ssse3: + movdqa K256+512(%rip),%xmm7 + movdqu 0(%rsi),%xmm0 + movdqu 16(%rsi),%xmm1 + movdqu 32(%rsi),%xmm2 + pshufb %xmm7,%xmm0 + movdqu 48(%rsi),%xmm3 + leaq K256(%rip),%rbp + pshufb %xmm7,%xmm1 + movdqa 0(%rbp),%xmm4 + movdqa 32(%rbp),%xmm5 + pshufb %xmm7,%xmm2 + paddd %xmm0,%xmm4 + movdqa 64(%rbp),%xmm6 + pshufb %xmm7,%xmm3 + movdqa 96(%rbp),%xmm7 + paddd %xmm1,%xmm5 + paddd %xmm2,%xmm6 + paddd %xmm3,%xmm7 + movdqa %xmm4,0(%rsp) + movl %eax,%r14d + movdqa %xmm5,16(%rsp) + movl %ebx,%edi + movdqa %xmm6,32(%rsp) + xorl %ecx,%edi + movdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp .Lssse3_00_47 + +.align 16 +.Lssse3_00_47: + subq $-128,%rbp + rorl $14,%r13d + movdqa %xmm1,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm3,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + palignr $4,%xmm0,%xmm4 + andl %r8d,%r12d + xorl %r8d,%r13d + palignr $4,%xmm2,%xmm7 + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm3,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 4(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + 
psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm0 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm0 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd $80,%xmm0,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 0(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm0 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm0,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,0(%rsp) + rorl $14,%r13d + movdqa %xmm2,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm0,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + palignr $4,%xmm1,%xmm4 + andl %eax,%r12d + xorl %eax,%r13d + palignr $4,%xmm3,%xmm7 + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + 
xorl %r9d,%edi + paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm0,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 20(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld $10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm1 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm1 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm1,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 32(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + 
paddd %xmm7,%xmm1 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm1,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,16(%rsp) + rorl $14,%r13d + movdqa %xmm3,%xmm4 + movl %r14d,%eax + movl %r9d,%r12d + movdqa %xmm1,%xmm7 + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + palignr $4,%xmm2,%xmm4 + andl %r8d,%r12d + xorl %r8d,%r13d + palignr $4,%xmm0,%xmm7 + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %ebx,%r15d + addl %r12d,%r11d + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r11d,%edx + psrld $7,%xmm6 + addl %edi,%r11d + movl %edx,%r13d + pshufd $250,%xmm1,%xmm7 + addl %r11d,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%r11d + movl %r8d,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %r11d,%r14d + pxor %xmm5,%xmm4 + andl %edx,%r12d + xorl %edx,%r13d + pslld $11,%xmm5 + addl 36(%rsp),%r10d + movl %r11d,%edi + pxor %xmm6,%xmm4 + xorl %r9d,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %eax,%edi + addl %r12d,%r10d + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + psrld $10,%xmm7 + addl %r13d,%r10d + xorl %eax,%r15d + paddd %xmm4,%xmm2 + rorl $2,%r14d + addl %r10d,%ecx + psrlq $17,%xmm6 + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %ecx,%r13d + xorl %r8d,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + psrldq $8,%xmm7 + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + paddd %xmm7,%xmm2 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + pshufd 
$80,%xmm2,%xmm7 + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + movdqa %xmm7,%xmm6 + addl %edi,%r9d + movl %ebx,%r13d + psrld $10,%xmm7 + addl %r9d,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%r9d + movl %ecx,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + psrlq $2,%xmm6 + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + pxor %xmm6,%xmm7 + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %r10d,%edi + addl %r12d,%r8d + movdqa 64(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + paddd %xmm7,%xmm2 + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + paddd %xmm2,%xmm6 + movl %eax,%r13d + addl %r8d,%r14d + movdqa %xmm6,32(%rsp) + rorl $14,%r13d + movdqa %xmm0,%xmm4 + movl %r14d,%r8d + movl %ebx,%r12d + movdqa %xmm2,%xmm7 + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + palignr $4,%xmm3,%xmm4 + andl %eax,%r12d + xorl %eax,%r13d + palignr $4,%xmm1,%xmm7 + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + movdqa %xmm4,%xmm5 + xorl %r9d,%r15d + addl %r12d,%edx + movdqa %xmm4,%xmm6 + rorl $6,%r13d + andl %r15d,%edi + psrld $3,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %edx,%r11d + psrld $7,%xmm6 + addl %edi,%edx + movl %r11d,%r13d + pshufd $250,%xmm2,%xmm7 + addl %edx,%r14d + rorl $14,%r13d + pslld $14,%xmm5 + movl %r14d,%edx + movl %eax,%r12d + pxor %xmm6,%xmm4 + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + psrld $11,%xmm6 + xorl %edx,%r14d + pxor %xmm5,%xmm4 + andl %r11d,%r12d + xorl %r11d,%r13d + pslld $11,%xmm5 + addl 52(%rsp),%ecx + movl %edx,%edi + pxor %xmm6,%xmm4 + xorl %ebx,%r12d + rorl $11,%r14d + movdqa %xmm7,%xmm6 + xorl %r8d,%edi + addl %r12d,%ecx + pxor %xmm5,%xmm4 + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + psrld 
$10,%xmm7 + addl %r13d,%ecx + xorl %r8d,%r15d + paddd %xmm4,%xmm3 + rorl $2,%r14d + addl %ecx,%r10d + psrlq $17,%xmm6 + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + pxor %xmm6,%xmm7 + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + psrlq $2,%xmm6 + xorl %r10d,%r13d + xorl %eax,%r12d + pxor %xmm6,%xmm7 + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + pshufd $128,%xmm7,%xmm7 + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + psrldq $8,%xmm7 + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + paddd %xmm7,%xmm3 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + pshufd $80,%xmm3,%xmm7 + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + movdqa %xmm7,%xmm6 + addl %edi,%ebx + movl %r9d,%r13d + psrld $10,%xmm7 + addl %ebx,%r14d + rorl $14,%r13d + psrlq $17,%xmm6 + movl %r14d,%ebx + movl %r10d,%r12d + pxor %xmm6,%xmm7 + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + psrlq $2,%xmm6 + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + pxor %xmm6,%xmm7 + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + pshufd $8,%xmm7,%xmm7 + xorl %ecx,%edi + addl %r12d,%eax + movdqa 96(%rbp),%xmm6 + rorl $6,%r13d + andl %edi,%r15d + pslldq $8,%xmm7 + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + paddd %xmm7,%xmm3 + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + paddd %xmm3,%xmm6 + movl %r8d,%r13d + addl %eax,%r14d + movdqa %xmm6,48(%rsp) + cmpb $0,131(%rbp) + jne .Lssse3_00_47 + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl 
$14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl $6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl 
%r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + rorl $14,%r13d + movl %r14d,%eax + movl %r9d,%r12d + rorl $9,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + rorl $5,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + rorl $11,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + rorl $6,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + rorl $2,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + rorl $14,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + rorl $9,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + rorl $5,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + rorl $11,%r14d + xorl %eax,%edi + addl %r12d,%r10d + rorl 
$6,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + rorl $2,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + rorl $14,%r13d + movl %r14d,%r10d + movl %edx,%r12d + rorl $9,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + rorl $5,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + rorl $11,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + rorl $6,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + rorl $2,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + rorl $14,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + rorl $9,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + rorl $5,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + rorl $11,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + rorl $6,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + rorl $2,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + rorl $14,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + rorl $9,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + rorl $5,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + rorl $11,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + rorl $6,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + rorl $2,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + rorl $14,%r13d + movl %r14d,%edx + movl %eax,%r12d + rorl $9,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + rorl $5,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + rorl $11,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + rorl $6,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + rorl $2,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl 
%r10d,%r13d + addl %ecx,%r14d + rorl $14,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + rorl $9,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + rorl $5,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + rorl $11,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + rorl $6,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + rorl $2,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + rorl $14,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + rorl $9,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + rorl $5,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + rorl $11,%r14d + xorl %ecx,%edi + addl %r12d,%eax + rorl $6,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + rorl $2,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 64+0(%rsp),%rdi + movl %r14d,%eax + + addl 0(%rdi),%eax + leaq 64(%rsi),%rsi + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop_ssse3 + + movq 88(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_ssse3: + ret +.cfi_endproc +.size sha256_block_data_order_ssse3,.-sha256_block_data_order_ssse3 +.globl sha256_block_data_order_avx +.hidden sha256_block_data_order_avx +.type sha256_block_data_order_avx,@function +.align 64 +sha256_block_data_order_avx: 
+.cfi_startproc +_CET_ENDBR + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $96,%rsp + leaq (%rsi,%rdx,4),%rdx + andq $-64,%rsp + movq %rdi,64+0(%rsp) + movq %rsi,64+8(%rsp) + movq %rdx,64+16(%rsp) + movq %rax,88(%rsp) +.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 +.Lprologue_avx: + + vzeroupper + movl 0(%rdi),%eax + movl 4(%rdi),%ebx + movl 8(%rdi),%ecx + movl 12(%rdi),%edx + movl 16(%rdi),%r8d + movl 20(%rdi),%r9d + movl 24(%rdi),%r10d + movl 28(%rdi),%r11d + vmovdqa K256+512+32(%rip),%xmm8 + vmovdqa K256+512+64(%rip),%xmm9 + jmp .Lloop_avx +.align 16 +.Lloop_avx: + vmovdqa K256+512(%rip),%xmm7 + vmovdqu 0(%rsi),%xmm0 + vmovdqu 16(%rsi),%xmm1 + vmovdqu 32(%rsi),%xmm2 + vmovdqu 48(%rsi),%xmm3 + vpshufb %xmm7,%xmm0,%xmm0 + leaq K256(%rip),%rbp + vpshufb %xmm7,%xmm1,%xmm1 + vpshufb %xmm7,%xmm2,%xmm2 + vpaddd 0(%rbp),%xmm0,%xmm4 + vpshufb %xmm7,%xmm3,%xmm3 + vpaddd 32(%rbp),%xmm1,%xmm5 + vpaddd 64(%rbp),%xmm2,%xmm6 + vpaddd 96(%rbp),%xmm3,%xmm7 + vmovdqa %xmm4,0(%rsp) + movl %eax,%r14d + vmovdqa %xmm5,16(%rsp) + movl %ebx,%edi + vmovdqa %xmm6,32(%rsp) + xorl %ecx,%edi + vmovdqa %xmm7,48(%rsp) + movl %r8d,%r13d + jmp .Lavx_00_47 + +.align 16 +.Lavx_00_47: + subq $-128,%rbp + vpalignr $4,%xmm0,%xmm1,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + vpalignr $4,%xmm2,%xmm3,%xmm7 + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vpaddd %xmm7,%xmm0,%xmm0 + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%r14d + addl 
%r13d,%r11d + xorl %ebx,%edi + vpshufd $250,%xmm3,%xmm7 + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + vpsrld $11,%xmm6,%xmm6 + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + vpaddd %xmm4,%xmm0,%xmm0 + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + vpxor %xmm7,%xmm6,%xmm6 + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + vpaddd %xmm6,%xmm0,%xmm0 + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + vpshufd $80,%xmm0,%xmm7 + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + vpxor %xmm7,%xmm6,%xmm6 + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + vpaddd %xmm6,%xmm0,%xmm0 + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vpaddd 0(%rbp),%xmm0,%xmm6 + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl 
$11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + vmovdqa %xmm6,0(%rsp) + vpalignr $4,%xmm1,%xmm2,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + vpalignr $4,%xmm3,%xmm0,%xmm7 + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vpaddd %xmm7,%xmm1,%xmm1 + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + vpshufd $250,%xmm0,%xmm7 + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + vpsrld $11,%xmm6,%xmm6 + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + vpaddd %xmm4,%xmm1,%xmm1 + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + vpxor %xmm7,%xmm6,%xmm6 + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + vpaddd %xmm6,%xmm1,%xmm1 + andl 
%r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + vpshufd $80,%xmm1,%xmm7 + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + vpxor %xmm7,%xmm6,%xmm6 + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + vpaddd %xmm6,%xmm1,%xmm1 + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpaddd 32(%rbp),%xmm1,%xmm6 + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + vmovdqa %xmm6,16(%rsp) + vpalignr $4,%xmm2,%xmm3,%xmm4 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + vpalignr $4,%xmm0,%xmm1,%xmm7 + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + vpaddd %xmm7,%xmm2,%xmm2 + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + vpshufd $250,%xmm1,%xmm7 + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + vpsrld $11,%xmm6,%xmm6 + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl 
%edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + vpaddd %xmm4,%xmm2,%xmm2 + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + vpxor %xmm7,%xmm6,%xmm6 + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + vpaddd %xmm6,%xmm2,%xmm2 + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + vpshufd $80,%xmm2,%xmm7 + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + vpxor %xmm7,%xmm6,%xmm6 + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + vpaddd %xmm6,%xmm2,%xmm2 + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + vpaddd 64(%rbp),%xmm2,%xmm6 + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + vmovdqa %xmm6,32(%rsp) + vpalignr $4,%xmm3,%xmm0,%xmm4 + shrdl 
$14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + vpalignr $4,%xmm1,%xmm2,%xmm7 + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + vpsrld $7,%xmm4,%xmm6 + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + vpaddd %xmm7,%xmm3,%xmm3 + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + vpsrld $3,%xmm4,%xmm7 + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + vpslld $14,%xmm4,%xmm5 + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + vpxor %xmm6,%xmm7,%xmm4 + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + vpshufd $250,%xmm2,%xmm7 + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + vpsrld $11,%xmm6,%xmm6 + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + vpxor %xmm5,%xmm4,%xmm4 + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + vpslld $11,%xmm5,%xmm5 + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + vpxor %xmm6,%xmm4,%xmm4 + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + vpsrld $10,%xmm7,%xmm6 + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + vpxor %xmm5,%xmm4,%xmm4 + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + vpsrlq $17,%xmm7,%xmm7 + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + vpaddd %xmm4,%xmm3,%xmm3 + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + vpxor %xmm7,%xmm6,%xmm6 + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + vpsrlq $2,%xmm7,%xmm7 + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + vpxor %xmm7,%xmm6,%xmm6 + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + vpshufb %xmm8,%xmm6,%xmm6 + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + vpaddd %xmm6,%xmm3,%xmm3 + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + vpshufd $80,%xmm3,%xmm7 + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + vpsrld $10,%xmm7,%xmm6 + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + vpsrlq $17,%xmm7,%xmm7 + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + vpxor 
%xmm7,%xmm6,%xmm6 + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + vpsrlq $2,%xmm7,%xmm7 + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + vpxor %xmm7,%xmm6,%xmm6 + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + vpshufb %xmm9,%xmm6,%xmm6 + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + vpaddd %xmm6,%xmm3,%xmm3 + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + vpaddd 96(%rbp),%xmm3,%xmm6 + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + vmovdqa %xmm6,48(%rsp) + cmpb $0,131(%rbp) + jne .Lavx_00_47 + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 0(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 4(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl 
%r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 8(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 12(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 16(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 20(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + xorl %r10d,%r13d + xorl %eax,%r12d + shrdl 
$5,%r13d,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 24(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 28(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%eax + movl %r9d,%r12d + shrdl $9,%r14d,%r14d + xorl %r8d,%r13d + xorl %r10d,%r12d + shrdl $5,%r13d,%r13d + xorl %eax,%r14d + andl %r8d,%r12d + xorl %r8d,%r13d + addl 32(%rsp),%r11d + movl %eax,%r15d + xorl %r10d,%r12d + shrdl $11,%r14d,%r14d + xorl %ebx,%r15d + addl %r12d,%r11d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %eax,%r14d + addl %r13d,%r11d + xorl %ebx,%edi + shrdl $2,%r14d,%r14d + addl %r11d,%edx + addl %edi,%r11d + movl %edx,%r13d + addl %r11d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r11d + movl %r8d,%r12d + shrdl $9,%r14d,%r14d + xorl %edx,%r13d + xorl %r9d,%r12d + shrdl $5,%r13d,%r13d + xorl %r11d,%r14d + andl %edx,%r12d + xorl %edx,%r13d + addl 36(%rsp),%r10d + movl %r11d,%edi + xorl %r9d,%r12d + shrdl $11,%r14d,%r14d + xorl %eax,%edi + addl %r12d,%r10d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r11d,%r14d + addl %r13d,%r10d + xorl %eax,%r15d + shrdl $2,%r14d,%r14d + addl %r10d,%ecx + addl %r15d,%r10d + movl %ecx,%r13d + addl %r10d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r10d + movl %edx,%r12d + shrdl $9,%r14d,%r14d + xorl %ecx,%r13d + 
xorl %r8d,%r12d + shrdl $5,%r13d,%r13d + xorl %r10d,%r14d + andl %ecx,%r12d + xorl %ecx,%r13d + addl 40(%rsp),%r9d + movl %r10d,%r15d + xorl %r8d,%r12d + shrdl $11,%r14d,%r14d + xorl %r11d,%r15d + addl %r12d,%r9d + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r10d,%r14d + addl %r13d,%r9d + xorl %r11d,%edi + shrdl $2,%r14d,%r14d + addl %r9d,%ebx + addl %edi,%r9d + movl %ebx,%r13d + addl %r9d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r9d + movl %ecx,%r12d + shrdl $9,%r14d,%r14d + xorl %ebx,%r13d + xorl %edx,%r12d + shrdl $5,%r13d,%r13d + xorl %r9d,%r14d + andl %ebx,%r12d + xorl %ebx,%r13d + addl 44(%rsp),%r8d + movl %r9d,%edi + xorl %edx,%r12d + shrdl $11,%r14d,%r14d + xorl %r10d,%edi + addl %r12d,%r8d + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %r9d,%r14d + addl %r13d,%r8d + xorl %r10d,%r15d + shrdl $2,%r14d,%r14d + addl %r8d,%eax + addl %r15d,%r8d + movl %eax,%r13d + addl %r8d,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%r8d + movl %ebx,%r12d + shrdl $9,%r14d,%r14d + xorl %eax,%r13d + xorl %ecx,%r12d + shrdl $5,%r13d,%r13d + xorl %r8d,%r14d + andl %eax,%r12d + xorl %eax,%r13d + addl 48(%rsp),%edx + movl %r8d,%r15d + xorl %ecx,%r12d + shrdl $11,%r14d,%r14d + xorl %r9d,%r15d + addl %r12d,%edx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %r8d,%r14d + addl %r13d,%edx + xorl %r9d,%edi + shrdl $2,%r14d,%r14d + addl %edx,%r11d + addl %edi,%edx + movl %r11d,%r13d + addl %edx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%edx + movl %eax,%r12d + shrdl $9,%r14d,%r14d + xorl %r11d,%r13d + xorl %ebx,%r12d + shrdl $5,%r13d,%r13d + xorl %edx,%r14d + andl %r11d,%r12d + xorl %r11d,%r13d + addl 52(%rsp),%ecx + movl %edx,%edi + xorl %ebx,%r12d + shrdl $11,%r14d,%r14d + xorl %r8d,%edi + addl %r12d,%ecx + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %edx,%r14d + addl %r13d,%ecx + xorl %r8d,%r15d + shrdl $2,%r14d,%r14d + addl %ecx,%r10d + addl %r15d,%ecx + movl %r10d,%r13d + addl %ecx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ecx + movl %r11d,%r12d + shrdl $9,%r14d,%r14d + 
xorl %r10d,%r13d + xorl %eax,%r12d + shrdl $5,%r13d,%r13d + xorl %ecx,%r14d + andl %r10d,%r12d + xorl %r10d,%r13d + addl 56(%rsp),%ebx + movl %ecx,%r15d + xorl %eax,%r12d + shrdl $11,%r14d,%r14d + xorl %edx,%r15d + addl %r12d,%ebx + shrdl $6,%r13d,%r13d + andl %r15d,%edi + xorl %ecx,%r14d + addl %r13d,%ebx + xorl %edx,%edi + shrdl $2,%r14d,%r14d + addl %ebx,%r9d + addl %edi,%ebx + movl %r9d,%r13d + addl %ebx,%r14d + shrdl $14,%r13d,%r13d + movl %r14d,%ebx + movl %r10d,%r12d + shrdl $9,%r14d,%r14d + xorl %r9d,%r13d + xorl %r11d,%r12d + shrdl $5,%r13d,%r13d + xorl %ebx,%r14d + andl %r9d,%r12d + xorl %r9d,%r13d + addl 60(%rsp),%eax + movl %ebx,%edi + xorl %r11d,%r12d + shrdl $11,%r14d,%r14d + xorl %ecx,%edi + addl %r12d,%eax + shrdl $6,%r13d,%r13d + andl %edi,%r15d + xorl %ebx,%r14d + addl %r13d,%eax + xorl %ecx,%r15d + shrdl $2,%r14d,%r14d + addl %eax,%r8d + addl %r15d,%eax + movl %r8d,%r13d + addl %eax,%r14d + movq 64+0(%rsp),%rdi + movl %r14d,%eax + + addl 0(%rdi),%eax + leaq 64(%rsi),%rsi + addl 4(%rdi),%ebx + addl 8(%rdi),%ecx + addl 12(%rdi),%edx + addl 16(%rdi),%r8d + addl 20(%rdi),%r9d + addl 24(%rdi),%r10d + addl 28(%rdi),%r11d + + cmpq 64+16(%rsp),%rsi + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + jb .Lloop_avx + + movq 88(%rsp),%rsi +.cfi_def_cfa %rsi,8 + vzeroupper + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx: + ret +.cfi_endproc +.size sha256_block_data_order_avx,.-sha256_block_data_order_avx +#endif diff --git a/third_party/boringssl/gen/bcm/sha256-x86_64-win.asm b/third_party/boringssl/gen/bcm/sha256-x86_64-win.asm new file mode 100644 index 00000000..61b11d8a --- 
/dev/null +++ b/third_party/boringssl/gen/bcm/sha256-x86_64-win.asm @@ -0,0 +1,4415 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_internal_x86_64_win_asm.inc" +%endif +section .text code align=64 + + +global sha256_block_data_order_nohw + +ALIGN 16 +sha256_block_data_order_nohw: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sha256_block_data_order_nohw: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + mov rax,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + shl rdx,4 + sub rsp,16*4+4*8 + lea rdx,[rdx*4+rsi] + and rsp,-64 + mov QWORD[((64+0))+rsp],rdi + mov QWORD[((64+8))+rsp],rsi + mov QWORD[((64+16))+rsp],rdx + mov QWORD[88+rsp],rax + +$L$prologue: + + mov eax,DWORD[rdi] + mov ebx,DWORD[4+rdi] + mov ecx,DWORD[8+rdi] + mov edx,DWORD[12+rdi] + mov r8d,DWORD[16+rdi] + mov r9d,DWORD[20+rdi] + mov r10d,DWORD[24+rdi] + mov r11d,DWORD[28+rdi] + jmp NEAR $L$loop + +ALIGN 16 +$L$loop: + mov edi,ebx + lea rbp,[K256] + xor edi,ecx + mov r12d,DWORD[rsi] + mov r13d,r8d + mov r14d,eax + bswap r12d + ror r13d,14 + mov r15d,r9d + + xor r13d,r8d + ror r14d,9 + xor r15d,r10d + + mov DWORD[rsp],r12d + xor r14d,eax + and r15d,r8d + + ror r13d,5 + add r12d,r11d + xor r15d,r10d + + ror r14d,11 + xor r13d,r8d + add r12d,r15d + + mov r15d,eax + add r12d,DWORD[rbp] + xor r14d,eax + + xor r15d,ebx + ror r13d,6 + mov r11d,ebx + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor r11d,edi + add edx,r12d + add r11d,r12d + + lea rbp,[4+rbp] + add r11d,r14d + mov r12d,DWORD[4+rsi] + mov r13d,edx + mov r14d,r11d + bswap r12d + ror r13d,14 + mov edi,r8d + + xor r13d,edx + ror r14d,9 + xor edi,r9d + + mov DWORD[4+rsp],r12d + xor r14d,r11d + and edi,edx + + 
ror r13d,5 + add r12d,r10d + xor edi,r9d + + ror r14d,11 + xor r13d,edx + add r12d,edi + + mov edi,r11d + add r12d,DWORD[rbp] + xor r14d,r11d + + xor edi,eax + ror r13d,6 + mov r10d,eax + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor r10d,r15d + add ecx,r12d + add r10d,r12d + + lea rbp,[4+rbp] + add r10d,r14d + mov r12d,DWORD[8+rsi] + mov r13d,ecx + mov r14d,r10d + bswap r12d + ror r13d,14 + mov r15d,edx + + xor r13d,ecx + ror r14d,9 + xor r15d,r8d + + mov DWORD[8+rsp],r12d + xor r14d,r10d + and r15d,ecx + + ror r13d,5 + add r12d,r9d + xor r15d,r8d + + ror r14d,11 + xor r13d,ecx + add r12d,r15d + + mov r15d,r10d + add r12d,DWORD[rbp] + xor r14d,r10d + + xor r15d,r11d + ror r13d,6 + mov r9d,r11d + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor r9d,edi + add ebx,r12d + add r9d,r12d + + lea rbp,[4+rbp] + add r9d,r14d + mov r12d,DWORD[12+rsi] + mov r13d,ebx + mov r14d,r9d + bswap r12d + ror r13d,14 + mov edi,ecx + + xor r13d,ebx + ror r14d,9 + xor edi,edx + + mov DWORD[12+rsp],r12d + xor r14d,r9d + and edi,ebx + + ror r13d,5 + add r12d,r8d + xor edi,edx + + ror r14d,11 + xor r13d,ebx + add r12d,edi + + mov edi,r9d + add r12d,DWORD[rbp] + xor r14d,r9d + + xor edi,r10d + ror r13d,6 + mov r8d,r10d + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor r8d,r15d + add eax,r12d + add r8d,r12d + + lea rbp,[20+rbp] + add r8d,r14d + mov r12d,DWORD[16+rsi] + mov r13d,eax + mov r14d,r8d + bswap r12d + ror r13d,14 + mov r15d,ebx + + xor r13d,eax + ror r14d,9 + xor r15d,ecx + + mov DWORD[16+rsp],r12d + xor r14d,r8d + and r15d,eax + + ror r13d,5 + add r12d,edx + xor r15d,ecx + + ror r14d,11 + xor r13d,eax + add r12d,r15d + + mov r15d,r8d + add r12d,DWORD[rbp] + xor r14d,r8d + + xor r15d,r9d + ror r13d,6 + mov edx,r9d + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor edx,edi + add r11d,r12d + add edx,r12d + + lea rbp,[4+rbp] + add edx,r14d + mov r12d,DWORD[20+rsi] + mov r13d,r11d + mov r14d,edx + bswap r12d + ror r13d,14 + mov edi,eax + + xor r13d,r11d + ror r14d,9 + 
xor edi,ebx + + mov DWORD[20+rsp],r12d + xor r14d,edx + and edi,r11d + + ror r13d,5 + add r12d,ecx + xor edi,ebx + + ror r14d,11 + xor r13d,r11d + add r12d,edi + + mov edi,edx + add r12d,DWORD[rbp] + xor r14d,edx + + xor edi,r8d + ror r13d,6 + mov ecx,r8d + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor ecx,r15d + add r10d,r12d + add ecx,r12d + + lea rbp,[4+rbp] + add ecx,r14d + mov r12d,DWORD[24+rsi] + mov r13d,r10d + mov r14d,ecx + bswap r12d + ror r13d,14 + mov r15d,r11d + + xor r13d,r10d + ror r14d,9 + xor r15d,eax + + mov DWORD[24+rsp],r12d + xor r14d,ecx + and r15d,r10d + + ror r13d,5 + add r12d,ebx + xor r15d,eax + + ror r14d,11 + xor r13d,r10d + add r12d,r15d + + mov r15d,ecx + add r12d,DWORD[rbp] + xor r14d,ecx + + xor r15d,edx + ror r13d,6 + mov ebx,edx + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor ebx,edi + add r9d,r12d + add ebx,r12d + + lea rbp,[4+rbp] + add ebx,r14d + mov r12d,DWORD[28+rsi] + mov r13d,r9d + mov r14d,ebx + bswap r12d + ror r13d,14 + mov edi,r10d + + xor r13d,r9d + ror r14d,9 + xor edi,r11d + + mov DWORD[28+rsp],r12d + xor r14d,ebx + and edi,r9d + + ror r13d,5 + add r12d,eax + xor edi,r11d + + ror r14d,11 + xor r13d,r9d + add r12d,edi + + mov edi,ebx + add r12d,DWORD[rbp] + xor r14d,ebx + + xor edi,ecx + ror r13d,6 + mov eax,ecx + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor eax,r15d + add r8d,r12d + add eax,r12d + + lea rbp,[20+rbp] + add eax,r14d + mov r12d,DWORD[32+rsi] + mov r13d,r8d + mov r14d,eax + bswap r12d + ror r13d,14 + mov r15d,r9d + + xor r13d,r8d + ror r14d,9 + xor r15d,r10d + + mov DWORD[32+rsp],r12d + xor r14d,eax + and r15d,r8d + + ror r13d,5 + add r12d,r11d + xor r15d,r10d + + ror r14d,11 + xor r13d,r8d + add r12d,r15d + + mov r15d,eax + add r12d,DWORD[rbp] + xor r14d,eax + + xor r15d,ebx + ror r13d,6 + mov r11d,ebx + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor r11d,edi + add edx,r12d + add r11d,r12d + + lea rbp,[4+rbp] + add r11d,r14d + mov r12d,DWORD[36+rsi] + mov r13d,edx + mov r14d,r11d 
+ bswap r12d + ror r13d,14 + mov edi,r8d + + xor r13d,edx + ror r14d,9 + xor edi,r9d + + mov DWORD[36+rsp],r12d + xor r14d,r11d + and edi,edx + + ror r13d,5 + add r12d,r10d + xor edi,r9d + + ror r14d,11 + xor r13d,edx + add r12d,edi + + mov edi,r11d + add r12d,DWORD[rbp] + xor r14d,r11d + + xor edi,eax + ror r13d,6 + mov r10d,eax + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor r10d,r15d + add ecx,r12d + add r10d,r12d + + lea rbp,[4+rbp] + add r10d,r14d + mov r12d,DWORD[40+rsi] + mov r13d,ecx + mov r14d,r10d + bswap r12d + ror r13d,14 + mov r15d,edx + + xor r13d,ecx + ror r14d,9 + xor r15d,r8d + + mov DWORD[40+rsp],r12d + xor r14d,r10d + and r15d,ecx + + ror r13d,5 + add r12d,r9d + xor r15d,r8d + + ror r14d,11 + xor r13d,ecx + add r12d,r15d + + mov r15d,r10d + add r12d,DWORD[rbp] + xor r14d,r10d + + xor r15d,r11d + ror r13d,6 + mov r9d,r11d + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor r9d,edi + add ebx,r12d + add r9d,r12d + + lea rbp,[4+rbp] + add r9d,r14d + mov r12d,DWORD[44+rsi] + mov r13d,ebx + mov r14d,r9d + bswap r12d + ror r13d,14 + mov edi,ecx + + xor r13d,ebx + ror r14d,9 + xor edi,edx + + mov DWORD[44+rsp],r12d + xor r14d,r9d + and edi,ebx + + ror r13d,5 + add r12d,r8d + xor edi,edx + + ror r14d,11 + xor r13d,ebx + add r12d,edi + + mov edi,r9d + add r12d,DWORD[rbp] + xor r14d,r9d + + xor edi,r10d + ror r13d,6 + mov r8d,r10d + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor r8d,r15d + add eax,r12d + add r8d,r12d + + lea rbp,[20+rbp] + add r8d,r14d + mov r12d,DWORD[48+rsi] + mov r13d,eax + mov r14d,r8d + bswap r12d + ror r13d,14 + mov r15d,ebx + + xor r13d,eax + ror r14d,9 + xor r15d,ecx + + mov DWORD[48+rsp],r12d + xor r14d,r8d + and r15d,eax + + ror r13d,5 + add r12d,edx + xor r15d,ecx + + ror r14d,11 + xor r13d,eax + add r12d,r15d + + mov r15d,r8d + add r12d,DWORD[rbp] + xor r14d,r8d + + xor r15d,r9d + ror r13d,6 + mov edx,r9d + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor edx,edi + add r11d,r12d + add edx,r12d + + lea rbp,[4+rbp] 
+ add edx,r14d + mov r12d,DWORD[52+rsi] + mov r13d,r11d + mov r14d,edx + bswap r12d + ror r13d,14 + mov edi,eax + + xor r13d,r11d + ror r14d,9 + xor edi,ebx + + mov DWORD[52+rsp],r12d + xor r14d,edx + and edi,r11d + + ror r13d,5 + add r12d,ecx + xor edi,ebx + + ror r14d,11 + xor r13d,r11d + add r12d,edi + + mov edi,edx + add r12d,DWORD[rbp] + xor r14d,edx + + xor edi,r8d + ror r13d,6 + mov ecx,r8d + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor ecx,r15d + add r10d,r12d + add ecx,r12d + + lea rbp,[4+rbp] + add ecx,r14d + mov r12d,DWORD[56+rsi] + mov r13d,r10d + mov r14d,ecx + bswap r12d + ror r13d,14 + mov r15d,r11d + + xor r13d,r10d + ror r14d,9 + xor r15d,eax + + mov DWORD[56+rsp],r12d + xor r14d,ecx + and r15d,r10d + + ror r13d,5 + add r12d,ebx + xor r15d,eax + + ror r14d,11 + xor r13d,r10d + add r12d,r15d + + mov r15d,ecx + add r12d,DWORD[rbp] + xor r14d,ecx + + xor r15d,edx + ror r13d,6 + mov ebx,edx + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor ebx,edi + add r9d,r12d + add ebx,r12d + + lea rbp,[4+rbp] + add ebx,r14d + mov r12d,DWORD[60+rsi] + mov r13d,r9d + mov r14d,ebx + bswap r12d + ror r13d,14 + mov edi,r10d + + xor r13d,r9d + ror r14d,9 + xor edi,r11d + + mov DWORD[60+rsp],r12d + xor r14d,ebx + and edi,r9d + + ror r13d,5 + add r12d,eax + xor edi,r11d + + ror r14d,11 + xor r13d,r9d + add r12d,edi + + mov edi,ebx + add r12d,DWORD[rbp] + xor r14d,ebx + + xor edi,ecx + ror r13d,6 + mov eax,ecx + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor eax,r15d + add r8d,r12d + add eax,r12d + + lea rbp,[20+rbp] + jmp NEAR $L$rounds_16_xx +ALIGN 16 +$L$rounds_16_xx: + mov r13d,DWORD[4+rsp] + mov r15d,DWORD[56+rsp] + + mov r12d,r13d + ror r13d,11 + add eax,r14d + mov r14d,r15d + ror r15d,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor r15d,r14d + shr r14d,10 + + ror r15d,17 + xor r12d,r13d + xor r15d,r14d + add r12d,DWORD[36+rsp] + + add r12d,DWORD[rsp] + mov r13d,r8d + add r12d,r15d + mov r14d,eax + ror r13d,14 + mov r15d,r9d + + xor r13d,r8d + ror 
r14d,9 + xor r15d,r10d + + mov DWORD[rsp],r12d + xor r14d,eax + and r15d,r8d + + ror r13d,5 + add r12d,r11d + xor r15d,r10d + + ror r14d,11 + xor r13d,r8d + add r12d,r15d + + mov r15d,eax + add r12d,DWORD[rbp] + xor r14d,eax + + xor r15d,ebx + ror r13d,6 + mov r11d,ebx + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor r11d,edi + add edx,r12d + add r11d,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[8+rsp] + mov edi,DWORD[60+rsp] + + mov r12d,r13d + ror r13d,11 + add r11d,r14d + mov r14d,edi + ror edi,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor edi,r14d + shr r14d,10 + + ror edi,17 + xor r12d,r13d + xor edi,r14d + add r12d,DWORD[40+rsp] + + add r12d,DWORD[4+rsp] + mov r13d,edx + add r12d,edi + mov r14d,r11d + ror r13d,14 + mov edi,r8d + + xor r13d,edx + ror r14d,9 + xor edi,r9d + + mov DWORD[4+rsp],r12d + xor r14d,r11d + and edi,edx + + ror r13d,5 + add r12d,r10d + xor edi,r9d + + ror r14d,11 + xor r13d,edx + add r12d,edi + + mov edi,r11d + add r12d,DWORD[rbp] + xor r14d,r11d + + xor edi,eax + ror r13d,6 + mov r10d,eax + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor r10d,r15d + add ecx,r12d + add r10d,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[12+rsp] + mov r15d,DWORD[rsp] + + mov r12d,r13d + ror r13d,11 + add r10d,r14d + mov r14d,r15d + ror r15d,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor r15d,r14d + shr r14d,10 + + ror r15d,17 + xor r12d,r13d + xor r15d,r14d + add r12d,DWORD[44+rsp] + + add r12d,DWORD[8+rsp] + mov r13d,ecx + add r12d,r15d + mov r14d,r10d + ror r13d,14 + mov r15d,edx + + xor r13d,ecx + ror r14d,9 + xor r15d,r8d + + mov DWORD[8+rsp],r12d + xor r14d,r10d + and r15d,ecx + + ror r13d,5 + add r12d,r9d + xor r15d,r8d + + ror r14d,11 + xor r13d,ecx + add r12d,r15d + + mov r15d,r10d + add r12d,DWORD[rbp] + xor r14d,r10d + + xor r15d,r11d + ror r13d,6 + mov r9d,r11d + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor r9d,edi + add ebx,r12d + add r9d,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[16+rsp] + mov edi,DWORD[4+rsp] + + mov 
r12d,r13d + ror r13d,11 + add r9d,r14d + mov r14d,edi + ror edi,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor edi,r14d + shr r14d,10 + + ror edi,17 + xor r12d,r13d + xor edi,r14d + add r12d,DWORD[48+rsp] + + add r12d,DWORD[12+rsp] + mov r13d,ebx + add r12d,edi + mov r14d,r9d + ror r13d,14 + mov edi,ecx + + xor r13d,ebx + ror r14d,9 + xor edi,edx + + mov DWORD[12+rsp],r12d + xor r14d,r9d + and edi,ebx + + ror r13d,5 + add r12d,r8d + xor edi,edx + + ror r14d,11 + xor r13d,ebx + add r12d,edi + + mov edi,r9d + add r12d,DWORD[rbp] + xor r14d,r9d + + xor edi,r10d + ror r13d,6 + mov r8d,r10d + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor r8d,r15d + add eax,r12d + add r8d,r12d + + lea rbp,[20+rbp] + mov r13d,DWORD[20+rsp] + mov r15d,DWORD[8+rsp] + + mov r12d,r13d + ror r13d,11 + add r8d,r14d + mov r14d,r15d + ror r15d,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor r15d,r14d + shr r14d,10 + + ror r15d,17 + xor r12d,r13d + xor r15d,r14d + add r12d,DWORD[52+rsp] + + add r12d,DWORD[16+rsp] + mov r13d,eax + add r12d,r15d + mov r14d,r8d + ror r13d,14 + mov r15d,ebx + + xor r13d,eax + ror r14d,9 + xor r15d,ecx + + mov DWORD[16+rsp],r12d + xor r14d,r8d + and r15d,eax + + ror r13d,5 + add r12d,edx + xor r15d,ecx + + ror r14d,11 + xor r13d,eax + add r12d,r15d + + mov r15d,r8d + add r12d,DWORD[rbp] + xor r14d,r8d + + xor r15d,r9d + ror r13d,6 + mov edx,r9d + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor edx,edi + add r11d,r12d + add edx,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[24+rsp] + mov edi,DWORD[12+rsp] + + mov r12d,r13d + ror r13d,11 + add edx,r14d + mov r14d,edi + ror edi,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor edi,r14d + shr r14d,10 + + ror edi,17 + xor r12d,r13d + xor edi,r14d + add r12d,DWORD[56+rsp] + + add r12d,DWORD[20+rsp] + mov r13d,r11d + add r12d,edi + mov r14d,edx + ror r13d,14 + mov edi,eax + + xor r13d,r11d + ror r14d,9 + xor edi,ebx + + mov DWORD[20+rsp],r12d + xor r14d,edx + and edi,r11d + + ror r13d,5 + add r12d,ecx + xor 
edi,ebx + + ror r14d,11 + xor r13d,r11d + add r12d,edi + + mov edi,edx + add r12d,DWORD[rbp] + xor r14d,edx + + xor edi,r8d + ror r13d,6 + mov ecx,r8d + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor ecx,r15d + add r10d,r12d + add ecx,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[28+rsp] + mov r15d,DWORD[16+rsp] + + mov r12d,r13d + ror r13d,11 + add ecx,r14d + mov r14d,r15d + ror r15d,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor r15d,r14d + shr r14d,10 + + ror r15d,17 + xor r12d,r13d + xor r15d,r14d + add r12d,DWORD[60+rsp] + + add r12d,DWORD[24+rsp] + mov r13d,r10d + add r12d,r15d + mov r14d,ecx + ror r13d,14 + mov r15d,r11d + + xor r13d,r10d + ror r14d,9 + xor r15d,eax + + mov DWORD[24+rsp],r12d + xor r14d,ecx + and r15d,r10d + + ror r13d,5 + add r12d,ebx + xor r15d,eax + + ror r14d,11 + xor r13d,r10d + add r12d,r15d + + mov r15d,ecx + add r12d,DWORD[rbp] + xor r14d,ecx + + xor r15d,edx + ror r13d,6 + mov ebx,edx + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor ebx,edi + add r9d,r12d + add ebx,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[32+rsp] + mov edi,DWORD[20+rsp] + + mov r12d,r13d + ror r13d,11 + add ebx,r14d + mov r14d,edi + ror edi,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor edi,r14d + shr r14d,10 + + ror edi,17 + xor r12d,r13d + xor edi,r14d + add r12d,DWORD[rsp] + + add r12d,DWORD[28+rsp] + mov r13d,r9d + add r12d,edi + mov r14d,ebx + ror r13d,14 + mov edi,r10d + + xor r13d,r9d + ror r14d,9 + xor edi,r11d + + mov DWORD[28+rsp],r12d + xor r14d,ebx + and edi,r9d + + ror r13d,5 + add r12d,eax + xor edi,r11d + + ror r14d,11 + xor r13d,r9d + add r12d,edi + + mov edi,ebx + add r12d,DWORD[rbp] + xor r14d,ebx + + xor edi,ecx + ror r13d,6 + mov eax,ecx + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor eax,r15d + add r8d,r12d + add eax,r12d + + lea rbp,[20+rbp] + mov r13d,DWORD[36+rsp] + mov r15d,DWORD[24+rsp] + + mov r12d,r13d + ror r13d,11 + add eax,r14d + mov r14d,r15d + ror r15d,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor 
r15d,r14d + shr r14d,10 + + ror r15d,17 + xor r12d,r13d + xor r15d,r14d + add r12d,DWORD[4+rsp] + + add r12d,DWORD[32+rsp] + mov r13d,r8d + add r12d,r15d + mov r14d,eax + ror r13d,14 + mov r15d,r9d + + xor r13d,r8d + ror r14d,9 + xor r15d,r10d + + mov DWORD[32+rsp],r12d + xor r14d,eax + and r15d,r8d + + ror r13d,5 + add r12d,r11d + xor r15d,r10d + + ror r14d,11 + xor r13d,r8d + add r12d,r15d + + mov r15d,eax + add r12d,DWORD[rbp] + xor r14d,eax + + xor r15d,ebx + ror r13d,6 + mov r11d,ebx + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor r11d,edi + add edx,r12d + add r11d,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[40+rsp] + mov edi,DWORD[28+rsp] + + mov r12d,r13d + ror r13d,11 + add r11d,r14d + mov r14d,edi + ror edi,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor edi,r14d + shr r14d,10 + + ror edi,17 + xor r12d,r13d + xor edi,r14d + add r12d,DWORD[8+rsp] + + add r12d,DWORD[36+rsp] + mov r13d,edx + add r12d,edi + mov r14d,r11d + ror r13d,14 + mov edi,r8d + + xor r13d,edx + ror r14d,9 + xor edi,r9d + + mov DWORD[36+rsp],r12d + xor r14d,r11d + and edi,edx + + ror r13d,5 + add r12d,r10d + xor edi,r9d + + ror r14d,11 + xor r13d,edx + add r12d,edi + + mov edi,r11d + add r12d,DWORD[rbp] + xor r14d,r11d + + xor edi,eax + ror r13d,6 + mov r10d,eax + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor r10d,r15d + add ecx,r12d + add r10d,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[44+rsp] + mov r15d,DWORD[32+rsp] + + mov r12d,r13d + ror r13d,11 + add r10d,r14d + mov r14d,r15d + ror r15d,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor r15d,r14d + shr r14d,10 + + ror r15d,17 + xor r12d,r13d + xor r15d,r14d + add r12d,DWORD[12+rsp] + + add r12d,DWORD[40+rsp] + mov r13d,ecx + add r12d,r15d + mov r14d,r10d + ror r13d,14 + mov r15d,edx + + xor r13d,ecx + ror r14d,9 + xor r15d,r8d + + mov DWORD[40+rsp],r12d + xor r14d,r10d + and r15d,ecx + + ror r13d,5 + add r12d,r9d + xor r15d,r8d + + ror r14d,11 + xor r13d,ecx + add r12d,r15d + + mov r15d,r10d + add r12d,DWORD[rbp] + xor 
r14d,r10d + + xor r15d,r11d + ror r13d,6 + mov r9d,r11d + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor r9d,edi + add ebx,r12d + add r9d,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[48+rsp] + mov edi,DWORD[36+rsp] + + mov r12d,r13d + ror r13d,11 + add r9d,r14d + mov r14d,edi + ror edi,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor edi,r14d + shr r14d,10 + + ror edi,17 + xor r12d,r13d + xor edi,r14d + add r12d,DWORD[16+rsp] + + add r12d,DWORD[44+rsp] + mov r13d,ebx + add r12d,edi + mov r14d,r9d + ror r13d,14 + mov edi,ecx + + xor r13d,ebx + ror r14d,9 + xor edi,edx + + mov DWORD[44+rsp],r12d + xor r14d,r9d + and edi,ebx + + ror r13d,5 + add r12d,r8d + xor edi,edx + + ror r14d,11 + xor r13d,ebx + add r12d,edi + + mov edi,r9d + add r12d,DWORD[rbp] + xor r14d,r9d + + xor edi,r10d + ror r13d,6 + mov r8d,r10d + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor r8d,r15d + add eax,r12d + add r8d,r12d + + lea rbp,[20+rbp] + mov r13d,DWORD[52+rsp] + mov r15d,DWORD[40+rsp] + + mov r12d,r13d + ror r13d,11 + add r8d,r14d + mov r14d,r15d + ror r15d,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor r15d,r14d + shr r14d,10 + + ror r15d,17 + xor r12d,r13d + xor r15d,r14d + add r12d,DWORD[20+rsp] + + add r12d,DWORD[48+rsp] + mov r13d,eax + add r12d,r15d + mov r14d,r8d + ror r13d,14 + mov r15d,ebx + + xor r13d,eax + ror r14d,9 + xor r15d,ecx + + mov DWORD[48+rsp],r12d + xor r14d,r8d + and r15d,eax + + ror r13d,5 + add r12d,edx + xor r15d,ecx + + ror r14d,11 + xor r13d,eax + add r12d,r15d + + mov r15d,r8d + add r12d,DWORD[rbp] + xor r14d,r8d + + xor r15d,r9d + ror r13d,6 + mov edx,r9d + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor edx,edi + add r11d,r12d + add edx,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[56+rsp] + mov edi,DWORD[44+rsp] + + mov r12d,r13d + ror r13d,11 + add edx,r14d + mov r14d,edi + ror edi,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor edi,r14d + shr r14d,10 + + ror edi,17 + xor r12d,r13d + xor edi,r14d + add r12d,DWORD[24+rsp] + + add 
r12d,DWORD[52+rsp] + mov r13d,r11d + add r12d,edi + mov r14d,edx + ror r13d,14 + mov edi,eax + + xor r13d,r11d + ror r14d,9 + xor edi,ebx + + mov DWORD[52+rsp],r12d + xor r14d,edx + and edi,r11d + + ror r13d,5 + add r12d,ecx + xor edi,ebx + + ror r14d,11 + xor r13d,r11d + add r12d,edi + + mov edi,edx + add r12d,DWORD[rbp] + xor r14d,edx + + xor edi,r8d + ror r13d,6 + mov ecx,r8d + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor ecx,r15d + add r10d,r12d + add ecx,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[60+rsp] + mov r15d,DWORD[48+rsp] + + mov r12d,r13d + ror r13d,11 + add ecx,r14d + mov r14d,r15d + ror r15d,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor r15d,r14d + shr r14d,10 + + ror r15d,17 + xor r12d,r13d + xor r15d,r14d + add r12d,DWORD[28+rsp] + + add r12d,DWORD[56+rsp] + mov r13d,r10d + add r12d,r15d + mov r14d,ecx + ror r13d,14 + mov r15d,r11d + + xor r13d,r10d + ror r14d,9 + xor r15d,eax + + mov DWORD[56+rsp],r12d + xor r14d,ecx + and r15d,r10d + + ror r13d,5 + add r12d,ebx + xor r15d,eax + + ror r14d,11 + xor r13d,r10d + add r12d,r15d + + mov r15d,ecx + add r12d,DWORD[rbp] + xor r14d,ecx + + xor r15d,edx + ror r13d,6 + mov ebx,edx + + and edi,r15d + ror r14d,2 + add r12d,r13d + + xor ebx,edi + add r9d,r12d + add ebx,r12d + + lea rbp,[4+rbp] + mov r13d,DWORD[rsp] + mov edi,DWORD[52+rsp] + + mov r12d,r13d + ror r13d,11 + add ebx,r14d + mov r14d,edi + ror edi,2 + + xor r13d,r12d + shr r12d,3 + ror r13d,7 + xor edi,r14d + shr r14d,10 + + ror edi,17 + xor r12d,r13d + xor edi,r14d + add r12d,DWORD[32+rsp] + + add r12d,DWORD[60+rsp] + mov r13d,r9d + add r12d,edi + mov r14d,ebx + ror r13d,14 + mov edi,r10d + + xor r13d,r9d + ror r14d,9 + xor edi,r11d + + mov DWORD[60+rsp],r12d + xor r14d,ebx + and edi,r9d + + ror r13d,5 + add r12d,eax + xor edi,r11d + + ror r14d,11 + xor r13d,r9d + add r12d,edi + + mov edi,ebx + add r12d,DWORD[rbp] + xor r14d,ebx + + xor edi,ecx + ror r13d,6 + mov eax,ecx + + and r15d,edi + ror r14d,2 + add r12d,r13d + + xor eax,r15d + 
add r8d,r12d + add eax,r12d + + lea rbp,[20+rbp] + cmp BYTE[3+rbp],0 + jnz NEAR $L$rounds_16_xx + + mov rdi,QWORD[((64+0))+rsp] + add eax,r14d + lea rsi,[64+rsi] + + add eax,DWORD[rdi] + add ebx,DWORD[4+rdi] + add ecx,DWORD[8+rdi] + add edx,DWORD[12+rdi] + add r8d,DWORD[16+rdi] + add r9d,DWORD[20+rdi] + add r10d,DWORD[24+rdi] + add r11d,DWORD[28+rdi] + + cmp rsi,QWORD[((64+16))+rsp] + + mov DWORD[rdi],eax + mov DWORD[4+rdi],ebx + mov DWORD[8+rdi],ecx + mov DWORD[12+rdi],edx + mov DWORD[16+rdi],r8d + mov DWORD[20+rdi],r9d + mov DWORD[24+rdi],r10d + mov DWORD[28+rdi],r11d + jb NEAR $L$loop + + mov rsi,QWORD[88+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_sha256_block_data_order_nohw: +section .rdata rdata align=8 +ALIGN 64 + +K256: + DD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + DD 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 + DD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + DD 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 + DD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + DD 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 + DD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + DD 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 + DD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + DD 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc + DD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + DD 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da + DD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + DD 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 + DD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + DD 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 + DD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + DD 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 + DD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + DD 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 + DD 
0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + DD 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 + DD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + DD 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 + DD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + DD 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 + DD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + DD 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 + DD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + DD 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 + DD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + DD 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 + + DD 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f + DD 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f + DD 0x03020100,0x0b0a0908,0xffffffff,0xffffffff + DD 0x03020100,0x0b0a0908,0xffffffff,0xffffffff + DD 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 + DD 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 + DB 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97 + DB 110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54 + DB 52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 + DB 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 + DB 111,114,103,62,0 +section .text + +global sha256_block_data_order_hw + +ALIGN 64 +sha256_block_data_order_hw: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sha256_block_data_order_hw: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + lea rsp,[((-88))+rsp] + movaps XMMWORD[(-8-80)+rax],xmm6 + movaps XMMWORD[(-8-64)+rax],xmm7 + movaps XMMWORD[(-8-48)+rax],xmm8 + movaps XMMWORD[(-8-32)+rax],xmm9 + movaps XMMWORD[(-8-16)+rax],xmm10 +$L$prologue_shaext: + lea rcx,[((K256+128))] + movdqu xmm1,XMMWORD[rdi] + movdqu xmm2,XMMWORD[16+rdi] + movdqa xmm7,XMMWORD[((512-128))+rcx] + + pshufd xmm0,xmm1,0x1b + pshufd xmm1,xmm1,0xb1 + pshufd xmm2,xmm2,0x1b + movdqa xmm8,xmm7 + palignr xmm1,xmm2,8 + punpcklqdq xmm2,xmm0 + jmp NEAR $L$oop_shaext + +ALIGN 16 +$L$oop_shaext: + movdqu xmm3,XMMWORD[rsi] + movdqu 
xmm4,XMMWORD[16+rsi] + movdqu xmm5,XMMWORD[32+rsi] + pshufb xmm3,xmm7 + movdqu xmm6,XMMWORD[48+rsi] + + movdqa xmm0,XMMWORD[((0-128))+rcx] + paddd xmm0,xmm3 + pshufb xmm4,xmm7 + movdqa xmm10,xmm2 + sha256rnds2 xmm2,xmm1 + pshufd xmm0,xmm0,0x0e + nop + movdqa xmm9,xmm1 + sha256rnds2 xmm1,xmm2 + + movdqa xmm0,XMMWORD[((32-128))+rcx] + paddd xmm0,xmm4 + pshufb xmm5,xmm7 + sha256rnds2 xmm2,xmm1 + pshufd xmm0,xmm0,0x0e + lea rsi,[64+rsi] + sha256msg1 xmm3,xmm4 + sha256rnds2 xmm1,xmm2 + + movdqa xmm0,XMMWORD[((64-128))+rcx] + paddd xmm0,xmm5 + pshufb xmm6,xmm7 + sha256rnds2 xmm2,xmm1 + pshufd xmm0,xmm0,0x0e + movdqa xmm7,xmm6 + palignr xmm7,xmm5,4 + nop + paddd xmm3,xmm7 + sha256msg1 xmm4,xmm5 + sha256rnds2 xmm1,xmm2 + + movdqa xmm0,XMMWORD[((96-128))+rcx] + paddd xmm0,xmm6 + sha256msg2 xmm3,xmm6 + sha256rnds2 xmm2,xmm1 + pshufd xmm0,xmm0,0x0e + movdqa xmm7,xmm3 + palignr xmm7,xmm6,4 + nop + paddd xmm4,xmm7 + sha256msg1 xmm5,xmm6 + sha256rnds2 xmm1,xmm2 + movdqa xmm0,XMMWORD[((128-128))+rcx] + paddd xmm0,xmm3 + sha256msg2 xmm4,xmm3 + sha256rnds2 xmm2,xmm1 + pshufd xmm0,xmm0,0x0e + movdqa xmm7,xmm4 + palignr xmm7,xmm3,4 + nop + paddd xmm5,xmm7 + sha256msg1 xmm6,xmm3 + sha256rnds2 xmm1,xmm2 + movdqa xmm0,XMMWORD[((160-128))+rcx] + paddd xmm0,xmm4 + sha256msg2 xmm5,xmm4 + sha256rnds2 xmm2,xmm1 + pshufd xmm0,xmm0,0x0e + movdqa xmm7,xmm5 + palignr xmm7,xmm4,4 + nop + paddd xmm6,xmm7 + sha256msg1 xmm3,xmm4 + sha256rnds2 xmm1,xmm2 + movdqa xmm0,XMMWORD[((192-128))+rcx] + paddd xmm0,xmm5 + sha256msg2 xmm6,xmm5 + sha256rnds2 xmm2,xmm1 + pshufd xmm0,xmm0,0x0e + movdqa xmm7,xmm6 + palignr xmm7,xmm5,4 + nop + paddd xmm3,xmm7 + sha256msg1 xmm4,xmm5 + sha256rnds2 xmm1,xmm2 + movdqa xmm0,XMMWORD[((224-128))+rcx] + paddd xmm0,xmm6 + sha256msg2 xmm3,xmm6 + sha256rnds2 xmm2,xmm1 + pshufd xmm0,xmm0,0x0e + movdqa xmm7,xmm3 + palignr xmm7,xmm6,4 + nop + paddd xmm4,xmm7 + sha256msg1 xmm5,xmm6 + sha256rnds2 xmm1,xmm2 + movdqa xmm0,XMMWORD[((256-128))+rcx] + paddd xmm0,xmm3 + sha256msg2 
xmm4,xmm3 + sha256rnds2 xmm2,xmm1 + pshufd xmm0,xmm0,0x0e + movdqa xmm7,xmm4 + palignr xmm7,xmm3,4 + nop + paddd xmm5,xmm7 + sha256msg1 xmm6,xmm3 + sha256rnds2 xmm1,xmm2 + movdqa xmm0,XMMWORD[((288-128))+rcx] + paddd xmm0,xmm4 + sha256msg2 xmm5,xmm4 + sha256rnds2 xmm2,xmm1 + pshufd xmm0,xmm0,0x0e + movdqa xmm7,xmm5 + palignr xmm7,xmm4,4 + nop + paddd xmm6,xmm7 + sha256msg1 xmm3,xmm4 + sha256rnds2 xmm1,xmm2 + movdqa xmm0,XMMWORD[((320-128))+rcx] + paddd xmm0,xmm5 + sha256msg2 xmm6,xmm5 + sha256rnds2 xmm2,xmm1 + pshufd xmm0,xmm0,0x0e + movdqa xmm7,xmm6 + palignr xmm7,xmm5,4 + nop + paddd xmm3,xmm7 + sha256msg1 xmm4,xmm5 + sha256rnds2 xmm1,xmm2 + movdqa xmm0,XMMWORD[((352-128))+rcx] + paddd xmm0,xmm6 + sha256msg2 xmm3,xmm6 + sha256rnds2 xmm2,xmm1 + pshufd xmm0,xmm0,0x0e + movdqa xmm7,xmm3 + palignr xmm7,xmm6,4 + nop + paddd xmm4,xmm7 + sha256msg1 xmm5,xmm6 + sha256rnds2 xmm1,xmm2 + movdqa xmm0,XMMWORD[((384-128))+rcx] + paddd xmm0,xmm3 + sha256msg2 xmm4,xmm3 + sha256rnds2 xmm2,xmm1 + pshufd xmm0,xmm0,0x0e + movdqa xmm7,xmm4 + palignr xmm7,xmm3,4 + nop + paddd xmm5,xmm7 + sha256msg1 xmm6,xmm3 + sha256rnds2 xmm1,xmm2 + movdqa xmm0,XMMWORD[((416-128))+rcx] + paddd xmm0,xmm4 + sha256msg2 xmm5,xmm4 + sha256rnds2 xmm2,xmm1 + pshufd xmm0,xmm0,0x0e + movdqa xmm7,xmm5 + palignr xmm7,xmm4,4 + sha256rnds2 xmm1,xmm2 + paddd xmm6,xmm7 + + movdqa xmm0,XMMWORD[((448-128))+rcx] + paddd xmm0,xmm5 + sha256rnds2 xmm2,xmm1 + pshufd xmm0,xmm0,0x0e + sha256msg2 xmm6,xmm5 + movdqa xmm7,xmm8 + sha256rnds2 xmm1,xmm2 + + movdqa xmm0,XMMWORD[((480-128))+rcx] + paddd xmm0,xmm6 + nop + sha256rnds2 xmm2,xmm1 + pshufd xmm0,xmm0,0x0e + dec rdx + nop + sha256rnds2 xmm1,xmm2 + + paddd xmm2,xmm10 + paddd xmm1,xmm9 + jnz NEAR $L$oop_shaext + + pshufd xmm2,xmm2,0xb1 + pshufd xmm7,xmm1,0x1b + pshufd xmm1,xmm1,0xb1 + punpckhqdq xmm1,xmm2 + palignr xmm2,xmm7,8 + + movdqu XMMWORD[rdi],xmm1 + movdqu XMMWORD[16+rdi],xmm2 + movaps xmm6,XMMWORD[((-8-80))+rax] + movaps xmm7,XMMWORD[((-8-64))+rax] + movaps 
xmm8,XMMWORD[((-8-48))+rax] + movaps xmm9,XMMWORD[((-8-32))+rax] + movaps xmm10,XMMWORD[((-8-16))+rax] + mov rsp,rax +$L$epilogue_shaext: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_sha256_block_data_order_hw: +global sha256_block_data_order_ssse3 + +ALIGN 64 +sha256_block_data_order_ssse3: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sha256_block_data_order_ssse3: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + mov rax,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + shl rdx,4 + sub rsp,160 + lea rdx,[rdx*4+rsi] + and rsp,-64 + mov QWORD[((64+0))+rsp],rdi + mov QWORD[((64+8))+rsp],rsi + mov QWORD[((64+16))+rsp],rdx + mov QWORD[88+rsp],rax + + movaps XMMWORD[(64+32)+rsp],xmm6 + movaps XMMWORD[(64+48)+rsp],xmm7 + movaps XMMWORD[(64+64)+rsp],xmm8 + movaps XMMWORD[(64+80)+rsp],xmm9 +$L$prologue_ssse3: + + mov eax,DWORD[rdi] + mov ebx,DWORD[4+rdi] + mov ecx,DWORD[8+rdi] + mov edx,DWORD[12+rdi] + mov r8d,DWORD[16+rdi] + mov r9d,DWORD[20+rdi] + mov r10d,DWORD[24+rdi] + mov r11d,DWORD[28+rdi] + + + jmp NEAR $L$loop_ssse3 +ALIGN 16 +$L$loop_ssse3: + movdqa xmm7,XMMWORD[((K256+512))] + movdqu xmm0,XMMWORD[rsi] + movdqu xmm1,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + pshufb xmm0,xmm7 + movdqu xmm3,XMMWORD[48+rsi] + lea rbp,[K256] + pshufb xmm1,xmm7 + movdqa xmm4,XMMWORD[rbp] + movdqa xmm5,XMMWORD[32+rbp] + pshufb xmm2,xmm7 + paddd xmm4,xmm0 + movdqa xmm6,XMMWORD[64+rbp] + pshufb xmm3,xmm7 + movdqa xmm7,XMMWORD[96+rbp] + paddd xmm5,xmm1 + paddd xmm6,xmm2 + paddd xmm7,xmm3 + movdqa XMMWORD[rsp],xmm4 + mov r14d,eax + movdqa XMMWORD[16+rsp],xmm5 + mov edi,ebx + movdqa XMMWORD[32+rsp],xmm6 + xor edi,ecx + movdqa XMMWORD[48+rsp],xmm7 + mov r13d,r8d + jmp NEAR $L$ssse3_00_47 + +ALIGN 16 +$L$ssse3_00_47: + sub rbp,-128 + ror r13d,14 + movdqa xmm4,xmm1 + mov eax,r14d + mov r12d,r9d + movdqa xmm7,xmm3 + ror r14d,9 + xor r13d,r8d + xor r12d,r10d + 
ror r13d,5 + xor r14d,eax + palignr xmm4,xmm0,4 + and r12d,r8d + xor r13d,r8d + palignr xmm7,xmm2,4 + add r11d,DWORD[rsp] + mov r15d,eax + xor r12d,r10d + ror r14d,11 + movdqa xmm5,xmm4 + xor r15d,ebx + add r11d,r12d + movdqa xmm6,xmm4 + ror r13d,6 + and edi,r15d + psrld xmm4,3 + xor r14d,eax + add r11d,r13d + xor edi,ebx + paddd xmm0,xmm7 + ror r14d,2 + add edx,r11d + psrld xmm6,7 + add r11d,edi + mov r13d,edx + pshufd xmm7,xmm3,250 + add r14d,r11d + ror r13d,14 + pslld xmm5,14 + mov r11d,r14d + mov r12d,r8d + pxor xmm4,xmm6 + ror r14d,9 + xor r13d,edx + xor r12d,r9d + ror r13d,5 + psrld xmm6,11 + xor r14d,r11d + pxor xmm4,xmm5 + and r12d,edx + xor r13d,edx + pslld xmm5,11 + add r10d,DWORD[4+rsp] + mov edi,r11d + pxor xmm4,xmm6 + xor r12d,r9d + ror r14d,11 + movdqa xmm6,xmm7 + xor edi,eax + add r10d,r12d + pxor xmm4,xmm5 + ror r13d,6 + and r15d,edi + xor r14d,r11d + psrld xmm7,10 + add r10d,r13d + xor r15d,eax + paddd xmm0,xmm4 + ror r14d,2 + add ecx,r10d + psrlq xmm6,17 + add r10d,r15d + mov r13d,ecx + add r14d,r10d + pxor xmm7,xmm6 + ror r13d,14 + mov r10d,r14d + mov r12d,edx + ror r14d,9 + psrlq xmm6,2 + xor r13d,ecx + xor r12d,r8d + pxor xmm7,xmm6 + ror r13d,5 + xor r14d,r10d + and r12d,ecx + pshufd xmm7,xmm7,128 + xor r13d,ecx + add r9d,DWORD[8+rsp] + mov r15d,r10d + psrldq xmm7,8 + xor r12d,r8d + ror r14d,11 + xor r15d,r11d + add r9d,r12d + ror r13d,6 + paddd xmm0,xmm7 + and edi,r15d + xor r14d,r10d + add r9d,r13d + pshufd xmm7,xmm0,80 + xor edi,r11d + ror r14d,2 + add ebx,r9d + movdqa xmm6,xmm7 + add r9d,edi + mov r13d,ebx + psrld xmm7,10 + add r14d,r9d + ror r13d,14 + psrlq xmm6,17 + mov r9d,r14d + mov r12d,ecx + pxor xmm7,xmm6 + ror r14d,9 + xor r13d,ebx + xor r12d,edx + ror r13d,5 + xor r14d,r9d + psrlq xmm6,2 + and r12d,ebx + xor r13d,ebx + add r8d,DWORD[12+rsp] + pxor xmm7,xmm6 + mov edi,r9d + xor r12d,edx + ror r14d,11 + pshufd xmm7,xmm7,8 + xor edi,r10d + add r8d,r12d + movdqa xmm6,XMMWORD[rbp] + ror r13d,6 + and r15d,edi + pslldq xmm7,8 + xor 
r14d,r9d + add r8d,r13d + xor r15d,r10d + paddd xmm0,xmm7 + ror r14d,2 + add eax,r8d + add r8d,r15d + paddd xmm6,xmm0 + mov r13d,eax + add r14d,r8d + movdqa XMMWORD[rsp],xmm6 + ror r13d,14 + movdqa xmm4,xmm2 + mov r8d,r14d + mov r12d,ebx + movdqa xmm7,xmm0 + ror r14d,9 + xor r13d,eax + xor r12d,ecx + ror r13d,5 + xor r14d,r8d + palignr xmm4,xmm1,4 + and r12d,eax + xor r13d,eax + palignr xmm7,xmm3,4 + add edx,DWORD[16+rsp] + mov r15d,r8d + xor r12d,ecx + ror r14d,11 + movdqa xmm5,xmm4 + xor r15d,r9d + add edx,r12d + movdqa xmm6,xmm4 + ror r13d,6 + and edi,r15d + psrld xmm4,3 + xor r14d,r8d + add edx,r13d + xor edi,r9d + paddd xmm1,xmm7 + ror r14d,2 + add r11d,edx + psrld xmm6,7 + add edx,edi + mov r13d,r11d + pshufd xmm7,xmm0,250 + add r14d,edx + ror r13d,14 + pslld xmm5,14 + mov edx,r14d + mov r12d,eax + pxor xmm4,xmm6 + ror r14d,9 + xor r13d,r11d + xor r12d,ebx + ror r13d,5 + psrld xmm6,11 + xor r14d,edx + pxor xmm4,xmm5 + and r12d,r11d + xor r13d,r11d + pslld xmm5,11 + add ecx,DWORD[20+rsp] + mov edi,edx + pxor xmm4,xmm6 + xor r12d,ebx + ror r14d,11 + movdqa xmm6,xmm7 + xor edi,r8d + add ecx,r12d + pxor xmm4,xmm5 + ror r13d,6 + and r15d,edi + xor r14d,edx + psrld xmm7,10 + add ecx,r13d + xor r15d,r8d + paddd xmm1,xmm4 + ror r14d,2 + add r10d,ecx + psrlq xmm6,17 + add ecx,r15d + mov r13d,r10d + add r14d,ecx + pxor xmm7,xmm6 + ror r13d,14 + mov ecx,r14d + mov r12d,r11d + ror r14d,9 + psrlq xmm6,2 + xor r13d,r10d + xor r12d,eax + pxor xmm7,xmm6 + ror r13d,5 + xor r14d,ecx + and r12d,r10d + pshufd xmm7,xmm7,128 + xor r13d,r10d + add ebx,DWORD[24+rsp] + mov r15d,ecx + psrldq xmm7,8 + xor r12d,eax + ror r14d,11 + xor r15d,edx + add ebx,r12d + ror r13d,6 + paddd xmm1,xmm7 + and edi,r15d + xor r14d,ecx + add ebx,r13d + pshufd xmm7,xmm1,80 + xor edi,edx + ror r14d,2 + add r9d,ebx + movdqa xmm6,xmm7 + add ebx,edi + mov r13d,r9d + psrld xmm7,10 + add r14d,ebx + ror r13d,14 + psrlq xmm6,17 + mov ebx,r14d + mov r12d,r10d + pxor xmm7,xmm6 + ror r14d,9 + xor r13d,r9d + xor 
r12d,r11d + ror r13d,5 + xor r14d,ebx + psrlq xmm6,2 + and r12d,r9d + xor r13d,r9d + add eax,DWORD[28+rsp] + pxor xmm7,xmm6 + mov edi,ebx + xor r12d,r11d + ror r14d,11 + pshufd xmm7,xmm7,8 + xor edi,ecx + add eax,r12d + movdqa xmm6,XMMWORD[32+rbp] + ror r13d,6 + and r15d,edi + pslldq xmm7,8 + xor r14d,ebx + add eax,r13d + xor r15d,ecx + paddd xmm1,xmm7 + ror r14d,2 + add r8d,eax + add eax,r15d + paddd xmm6,xmm1 + mov r13d,r8d + add r14d,eax + movdqa XMMWORD[16+rsp],xmm6 + ror r13d,14 + movdqa xmm4,xmm3 + mov eax,r14d + mov r12d,r9d + movdqa xmm7,xmm1 + ror r14d,9 + xor r13d,r8d + xor r12d,r10d + ror r13d,5 + xor r14d,eax + palignr xmm4,xmm2,4 + and r12d,r8d + xor r13d,r8d + palignr xmm7,xmm0,4 + add r11d,DWORD[32+rsp] + mov r15d,eax + xor r12d,r10d + ror r14d,11 + movdqa xmm5,xmm4 + xor r15d,ebx + add r11d,r12d + movdqa xmm6,xmm4 + ror r13d,6 + and edi,r15d + psrld xmm4,3 + xor r14d,eax + add r11d,r13d + xor edi,ebx + paddd xmm2,xmm7 + ror r14d,2 + add edx,r11d + psrld xmm6,7 + add r11d,edi + mov r13d,edx + pshufd xmm7,xmm1,250 + add r14d,r11d + ror r13d,14 + pslld xmm5,14 + mov r11d,r14d + mov r12d,r8d + pxor xmm4,xmm6 + ror r14d,9 + xor r13d,edx + xor r12d,r9d + ror r13d,5 + psrld xmm6,11 + xor r14d,r11d + pxor xmm4,xmm5 + and r12d,edx + xor r13d,edx + pslld xmm5,11 + add r10d,DWORD[36+rsp] + mov edi,r11d + pxor xmm4,xmm6 + xor r12d,r9d + ror r14d,11 + movdqa xmm6,xmm7 + xor edi,eax + add r10d,r12d + pxor xmm4,xmm5 + ror r13d,6 + and r15d,edi + xor r14d,r11d + psrld xmm7,10 + add r10d,r13d + xor r15d,eax + paddd xmm2,xmm4 + ror r14d,2 + add ecx,r10d + psrlq xmm6,17 + add r10d,r15d + mov r13d,ecx + add r14d,r10d + pxor xmm7,xmm6 + ror r13d,14 + mov r10d,r14d + mov r12d,edx + ror r14d,9 + psrlq xmm6,2 + xor r13d,ecx + xor r12d,r8d + pxor xmm7,xmm6 + ror r13d,5 + xor r14d,r10d + and r12d,ecx + pshufd xmm7,xmm7,128 + xor r13d,ecx + add r9d,DWORD[40+rsp] + mov r15d,r10d + psrldq xmm7,8 + xor r12d,r8d + ror r14d,11 + xor r15d,r11d + add r9d,r12d + ror r13d,6 + paddd 
xmm2,xmm7 + and edi,r15d + xor r14d,r10d + add r9d,r13d + pshufd xmm7,xmm2,80 + xor edi,r11d + ror r14d,2 + add ebx,r9d + movdqa xmm6,xmm7 + add r9d,edi + mov r13d,ebx + psrld xmm7,10 + add r14d,r9d + ror r13d,14 + psrlq xmm6,17 + mov r9d,r14d + mov r12d,ecx + pxor xmm7,xmm6 + ror r14d,9 + xor r13d,ebx + xor r12d,edx + ror r13d,5 + xor r14d,r9d + psrlq xmm6,2 + and r12d,ebx + xor r13d,ebx + add r8d,DWORD[44+rsp] + pxor xmm7,xmm6 + mov edi,r9d + xor r12d,edx + ror r14d,11 + pshufd xmm7,xmm7,8 + xor edi,r10d + add r8d,r12d + movdqa xmm6,XMMWORD[64+rbp] + ror r13d,6 + and r15d,edi + pslldq xmm7,8 + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + paddd xmm2,xmm7 + ror r14d,2 + add eax,r8d + add r8d,r15d + paddd xmm6,xmm2 + mov r13d,eax + add r14d,r8d + movdqa XMMWORD[32+rsp],xmm6 + ror r13d,14 + movdqa xmm4,xmm0 + mov r8d,r14d + mov r12d,ebx + movdqa xmm7,xmm2 + ror r14d,9 + xor r13d,eax + xor r12d,ecx + ror r13d,5 + xor r14d,r8d + palignr xmm4,xmm3,4 + and r12d,eax + xor r13d,eax + palignr xmm7,xmm1,4 + add edx,DWORD[48+rsp] + mov r15d,r8d + xor r12d,ecx + ror r14d,11 + movdqa xmm5,xmm4 + xor r15d,r9d + add edx,r12d + movdqa xmm6,xmm4 + ror r13d,6 + and edi,r15d + psrld xmm4,3 + xor r14d,r8d + add edx,r13d + xor edi,r9d + paddd xmm3,xmm7 + ror r14d,2 + add r11d,edx + psrld xmm6,7 + add edx,edi + mov r13d,r11d + pshufd xmm7,xmm2,250 + add r14d,edx + ror r13d,14 + pslld xmm5,14 + mov edx,r14d + mov r12d,eax + pxor xmm4,xmm6 + ror r14d,9 + xor r13d,r11d + xor r12d,ebx + ror r13d,5 + psrld xmm6,11 + xor r14d,edx + pxor xmm4,xmm5 + and r12d,r11d + xor r13d,r11d + pslld xmm5,11 + add ecx,DWORD[52+rsp] + mov edi,edx + pxor xmm4,xmm6 + xor r12d,ebx + ror r14d,11 + movdqa xmm6,xmm7 + xor edi,r8d + add ecx,r12d + pxor xmm4,xmm5 + ror r13d,6 + and r15d,edi + xor r14d,edx + psrld xmm7,10 + add ecx,r13d + xor r15d,r8d + paddd xmm3,xmm4 + ror r14d,2 + add r10d,ecx + psrlq xmm6,17 + add ecx,r15d + mov r13d,r10d + add r14d,ecx + pxor xmm7,xmm6 + ror r13d,14 + mov ecx,r14d + mov 
r12d,r11d + ror r14d,9 + psrlq xmm6,2 + xor r13d,r10d + xor r12d,eax + pxor xmm7,xmm6 + ror r13d,5 + xor r14d,ecx + and r12d,r10d + pshufd xmm7,xmm7,128 + xor r13d,r10d + add ebx,DWORD[56+rsp] + mov r15d,ecx + psrldq xmm7,8 + xor r12d,eax + ror r14d,11 + xor r15d,edx + add ebx,r12d + ror r13d,6 + paddd xmm3,xmm7 + and edi,r15d + xor r14d,ecx + add ebx,r13d + pshufd xmm7,xmm3,80 + xor edi,edx + ror r14d,2 + add r9d,ebx + movdqa xmm6,xmm7 + add ebx,edi + mov r13d,r9d + psrld xmm7,10 + add r14d,ebx + ror r13d,14 + psrlq xmm6,17 + mov ebx,r14d + mov r12d,r10d + pxor xmm7,xmm6 + ror r14d,9 + xor r13d,r9d + xor r12d,r11d + ror r13d,5 + xor r14d,ebx + psrlq xmm6,2 + and r12d,r9d + xor r13d,r9d + add eax,DWORD[60+rsp] + pxor xmm7,xmm6 + mov edi,ebx + xor r12d,r11d + ror r14d,11 + pshufd xmm7,xmm7,8 + xor edi,ecx + add eax,r12d + movdqa xmm6,XMMWORD[96+rbp] + ror r13d,6 + and r15d,edi + pslldq xmm7,8 + xor r14d,ebx + add eax,r13d + xor r15d,ecx + paddd xmm3,xmm7 + ror r14d,2 + add r8d,eax + add eax,r15d + paddd xmm6,xmm3 + mov r13d,r8d + add r14d,eax + movdqa XMMWORD[48+rsp],xmm6 + cmp BYTE[131+rbp],0 + jne NEAR $L$ssse3_00_47 + ror r13d,14 + mov eax,r14d + mov r12d,r9d + ror r14d,9 + xor r13d,r8d + xor r12d,r10d + ror r13d,5 + xor r14d,eax + and r12d,r8d + xor r13d,r8d + add r11d,DWORD[rsp] + mov r15d,eax + xor r12d,r10d + ror r14d,11 + xor r15d,ebx + add r11d,r12d + ror r13d,6 + and edi,r15d + xor r14d,eax + add r11d,r13d + xor edi,ebx + ror r14d,2 + add edx,r11d + add r11d,edi + mov r13d,edx + add r14d,r11d + ror r13d,14 + mov r11d,r14d + mov r12d,r8d + ror r14d,9 + xor r13d,edx + xor r12d,r9d + ror r13d,5 + xor r14d,r11d + and r12d,edx + xor r13d,edx + add r10d,DWORD[4+rsp] + mov edi,r11d + xor r12d,r9d + ror r14d,11 + xor edi,eax + add r10d,r12d + ror r13d,6 + and r15d,edi + xor r14d,r11d + add r10d,r13d + xor r15d,eax + ror r14d,2 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + add r14d,r10d + ror r13d,14 + mov r10d,r14d + mov r12d,edx + ror r14d,9 + xor r13d,ecx + 
xor r12d,r8d + ror r13d,5 + xor r14d,r10d + and r12d,ecx + xor r13d,ecx + add r9d,DWORD[8+rsp] + mov r15d,r10d + xor r12d,r8d + ror r14d,11 + xor r15d,r11d + add r9d,r12d + ror r13d,6 + and edi,r15d + xor r14d,r10d + add r9d,r13d + xor edi,r11d + ror r14d,2 + add ebx,r9d + add r9d,edi + mov r13d,ebx + add r14d,r9d + ror r13d,14 + mov r9d,r14d + mov r12d,ecx + ror r14d,9 + xor r13d,ebx + xor r12d,edx + ror r13d,5 + xor r14d,r9d + and r12d,ebx + xor r13d,ebx + add r8d,DWORD[12+rsp] + mov edi,r9d + xor r12d,edx + ror r14d,11 + xor edi,r10d + add r8d,r12d + ror r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + ror r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + ror r13d,14 + mov r8d,r14d + mov r12d,ebx + ror r14d,9 + xor r13d,eax + xor r12d,ecx + ror r13d,5 + xor r14d,r8d + and r12d,eax + xor r13d,eax + add edx,DWORD[16+rsp] + mov r15d,r8d + xor r12d,ecx + ror r14d,11 + xor r15d,r9d + add edx,r12d + ror r13d,6 + and edi,r15d + xor r14d,r8d + add edx,r13d + xor edi,r9d + ror r14d,2 + add r11d,edx + add edx,edi + mov r13d,r11d + add r14d,edx + ror r13d,14 + mov edx,r14d + mov r12d,eax + ror r14d,9 + xor r13d,r11d + xor r12d,ebx + ror r13d,5 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + add ecx,DWORD[20+rsp] + mov edi,edx + xor r12d,ebx + ror r14d,11 + xor edi,r8d + add ecx,r12d + ror r13d,6 + and r15d,edi + xor r14d,edx + add ecx,r13d + xor r15d,r8d + ror r14d,2 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + add r14d,ecx + ror r13d,14 + mov ecx,r14d + mov r12d,r11d + ror r14d,9 + xor r13d,r10d + xor r12d,eax + ror r13d,5 + xor r14d,ecx + and r12d,r10d + xor r13d,r10d + add ebx,DWORD[24+rsp] + mov r15d,ecx + xor r12d,eax + ror r14d,11 + xor r15d,edx + add ebx,r12d + ror r13d,6 + and edi,r15d + xor r14d,ecx + add ebx,r13d + xor edi,edx + ror r14d,2 + add r9d,ebx + add ebx,edi + mov r13d,r9d + add r14d,ebx + ror r13d,14 + mov ebx,r14d + mov r12d,r10d + ror r14d,9 + xor r13d,r9d + xor r12d,r11d + ror r13d,5 + xor r14d,ebx + and 
r12d,r9d + xor r13d,r9d + add eax,DWORD[28+rsp] + mov edi,ebx + xor r12d,r11d + ror r14d,11 + xor edi,ecx + add eax,r12d + ror r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + ror r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + ror r13d,14 + mov eax,r14d + mov r12d,r9d + ror r14d,9 + xor r13d,r8d + xor r12d,r10d + ror r13d,5 + xor r14d,eax + and r12d,r8d + xor r13d,r8d + add r11d,DWORD[32+rsp] + mov r15d,eax + xor r12d,r10d + ror r14d,11 + xor r15d,ebx + add r11d,r12d + ror r13d,6 + and edi,r15d + xor r14d,eax + add r11d,r13d + xor edi,ebx + ror r14d,2 + add edx,r11d + add r11d,edi + mov r13d,edx + add r14d,r11d + ror r13d,14 + mov r11d,r14d + mov r12d,r8d + ror r14d,9 + xor r13d,edx + xor r12d,r9d + ror r13d,5 + xor r14d,r11d + and r12d,edx + xor r13d,edx + add r10d,DWORD[36+rsp] + mov edi,r11d + xor r12d,r9d + ror r14d,11 + xor edi,eax + add r10d,r12d + ror r13d,6 + and r15d,edi + xor r14d,r11d + add r10d,r13d + xor r15d,eax + ror r14d,2 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + add r14d,r10d + ror r13d,14 + mov r10d,r14d + mov r12d,edx + ror r14d,9 + xor r13d,ecx + xor r12d,r8d + ror r13d,5 + xor r14d,r10d + and r12d,ecx + xor r13d,ecx + add r9d,DWORD[40+rsp] + mov r15d,r10d + xor r12d,r8d + ror r14d,11 + xor r15d,r11d + add r9d,r12d + ror r13d,6 + and edi,r15d + xor r14d,r10d + add r9d,r13d + xor edi,r11d + ror r14d,2 + add ebx,r9d + add r9d,edi + mov r13d,ebx + add r14d,r9d + ror r13d,14 + mov r9d,r14d + mov r12d,ecx + ror r14d,9 + xor r13d,ebx + xor r12d,edx + ror r13d,5 + xor r14d,r9d + and r12d,ebx + xor r13d,ebx + add r8d,DWORD[44+rsp] + mov edi,r9d + xor r12d,edx + ror r14d,11 + xor edi,r10d + add r8d,r12d + ror r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + ror r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + ror r13d,14 + mov r8d,r14d + mov r12d,ebx + ror r14d,9 + xor r13d,eax + xor r12d,ecx + ror r13d,5 + xor r14d,r8d + and r12d,eax + xor r13d,eax + add 
edx,DWORD[48+rsp] + mov r15d,r8d + xor r12d,ecx + ror r14d,11 + xor r15d,r9d + add edx,r12d + ror r13d,6 + and edi,r15d + xor r14d,r8d + add edx,r13d + xor edi,r9d + ror r14d,2 + add r11d,edx + add edx,edi + mov r13d,r11d + add r14d,edx + ror r13d,14 + mov edx,r14d + mov r12d,eax + ror r14d,9 + xor r13d,r11d + xor r12d,ebx + ror r13d,5 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + add ecx,DWORD[52+rsp] + mov edi,edx + xor r12d,ebx + ror r14d,11 + xor edi,r8d + add ecx,r12d + ror r13d,6 + and r15d,edi + xor r14d,edx + add ecx,r13d + xor r15d,r8d + ror r14d,2 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + add r14d,ecx + ror r13d,14 + mov ecx,r14d + mov r12d,r11d + ror r14d,9 + xor r13d,r10d + xor r12d,eax + ror r13d,5 + xor r14d,ecx + and r12d,r10d + xor r13d,r10d + add ebx,DWORD[56+rsp] + mov r15d,ecx + xor r12d,eax + ror r14d,11 + xor r15d,edx + add ebx,r12d + ror r13d,6 + and edi,r15d + xor r14d,ecx + add ebx,r13d + xor edi,edx + ror r14d,2 + add r9d,ebx + add ebx,edi + mov r13d,r9d + add r14d,ebx + ror r13d,14 + mov ebx,r14d + mov r12d,r10d + ror r14d,9 + xor r13d,r9d + xor r12d,r11d + ror r13d,5 + xor r14d,ebx + and r12d,r9d + xor r13d,r9d + add eax,DWORD[60+rsp] + mov edi,ebx + xor r12d,r11d + ror r14d,11 + xor edi,ecx + add eax,r12d + ror r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + ror r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + mov rdi,QWORD[((64+0))+rsp] + mov eax,r14d + + add eax,DWORD[rdi] + lea rsi,[64+rsi] + add ebx,DWORD[4+rdi] + add ecx,DWORD[8+rdi] + add edx,DWORD[12+rdi] + add r8d,DWORD[16+rdi] + add r9d,DWORD[20+rdi] + add r10d,DWORD[24+rdi] + add r11d,DWORD[28+rdi] + + cmp rsi,QWORD[((64+16))+rsp] + + mov DWORD[rdi],eax + mov DWORD[4+rdi],ebx + mov DWORD[8+rdi],ecx + mov DWORD[12+rdi],edx + mov DWORD[16+rdi],r8d + mov DWORD[20+rdi],r9d + mov DWORD[24+rdi],r10d + mov DWORD[28+rdi],r11d + jb NEAR $L$loop_ssse3 + + mov rsi,QWORD[88+rsp] + + movaps xmm6,XMMWORD[((64+32))+rsp] + movaps 
xmm7,XMMWORD[((64+48))+rsp] + movaps xmm8,XMMWORD[((64+64))+rsp] + movaps xmm9,XMMWORD[((64+80))+rsp] + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$epilogue_ssse3: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_sha256_block_data_order_ssse3: +global sha256_block_data_order_avx + +ALIGN 64 +sha256_block_data_order_avx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sha256_block_data_order_avx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + mov rax,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + shl rdx,4 + sub rsp,160 + lea rdx,[rdx*4+rsi] + and rsp,-64 + mov QWORD[((64+0))+rsp],rdi + mov QWORD[((64+8))+rsp],rsi + mov QWORD[((64+16))+rsp],rdx + mov QWORD[88+rsp],rax + + movaps XMMWORD[(64+32)+rsp],xmm6 + movaps XMMWORD[(64+48)+rsp],xmm7 + movaps XMMWORD[(64+64)+rsp],xmm8 + movaps XMMWORD[(64+80)+rsp],xmm9 +$L$prologue_avx: + + vzeroupper + mov eax,DWORD[rdi] + mov ebx,DWORD[4+rdi] + mov ecx,DWORD[8+rdi] + mov edx,DWORD[12+rdi] + mov r8d,DWORD[16+rdi] + mov r9d,DWORD[20+rdi] + mov r10d,DWORD[24+rdi] + mov r11d,DWORD[28+rdi] + vmovdqa xmm8,XMMWORD[((K256+512+32))] + vmovdqa xmm9,XMMWORD[((K256+512+64))] + jmp NEAR $L$loop_avx +ALIGN 16 +$L$loop_avx: + vmovdqa xmm7,XMMWORD[((K256+512))] + vmovdqu xmm0,XMMWORD[rsi] + vmovdqu xmm1,XMMWORD[16+rsi] + vmovdqu xmm2,XMMWORD[32+rsi] + vmovdqu xmm3,XMMWORD[48+rsi] + vpshufb xmm0,xmm0,xmm7 + lea rbp,[K256] + vpshufb xmm1,xmm1,xmm7 + vpshufb xmm2,xmm2,xmm7 + vpaddd xmm4,xmm0,XMMWORD[rbp] + vpshufb xmm3,xmm3,xmm7 + vpaddd xmm5,xmm1,XMMWORD[32+rbp] + vpaddd xmm6,xmm2,XMMWORD[64+rbp] + vpaddd xmm7,xmm3,XMMWORD[96+rbp] + vmovdqa XMMWORD[rsp],xmm4 + mov r14d,eax + vmovdqa XMMWORD[16+rsp],xmm5 + mov edi,ebx + vmovdqa XMMWORD[32+rsp],xmm6 + xor 
edi,ecx + vmovdqa XMMWORD[48+rsp],xmm7 + mov r13d,r8d + jmp NEAR $L$avx_00_47 + +ALIGN 16 +$L$avx_00_47: + sub rbp,-128 + vpalignr xmm4,xmm1,xmm0,4 + shrd r13d,r13d,14 + mov eax,r14d + mov r12d,r9d + vpalignr xmm7,xmm3,xmm2,4 + shrd r14d,r14d,9 + xor r13d,r8d + xor r12d,r10d + vpsrld xmm6,xmm4,7 + shrd r13d,r13d,5 + xor r14d,eax + and r12d,r8d + vpaddd xmm0,xmm0,xmm7 + xor r13d,r8d + add r11d,DWORD[rsp] + mov r15d,eax + vpsrld xmm7,xmm4,3 + xor r12d,r10d + shrd r14d,r14d,11 + xor r15d,ebx + vpslld xmm5,xmm4,14 + add r11d,r12d + shrd r13d,r13d,6 + and edi,r15d + vpxor xmm4,xmm7,xmm6 + xor r14d,eax + add r11d,r13d + xor edi,ebx + vpshufd xmm7,xmm3,250 + shrd r14d,r14d,2 + add edx,r11d + add r11d,edi + vpsrld xmm6,xmm6,11 + mov r13d,edx + add r14d,r11d + shrd r13d,r13d,14 + vpxor xmm4,xmm4,xmm5 + mov r11d,r14d + mov r12d,r8d + shrd r14d,r14d,9 + vpslld xmm5,xmm5,11 + xor r13d,edx + xor r12d,r9d + shrd r13d,r13d,5 + vpxor xmm4,xmm4,xmm6 + xor r14d,r11d + and r12d,edx + xor r13d,edx + vpsrld xmm6,xmm7,10 + add r10d,DWORD[4+rsp] + mov edi,r11d + xor r12d,r9d + vpxor xmm4,xmm4,xmm5 + shrd r14d,r14d,11 + xor edi,eax + add r10d,r12d + vpsrlq xmm7,xmm7,17 + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r11d + vpaddd xmm0,xmm0,xmm4 + add r10d,r13d + xor r15d,eax + shrd r14d,r14d,2 + vpxor xmm6,xmm6,xmm7 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + vpsrlq xmm7,xmm7,2 + add r14d,r10d + shrd r13d,r13d,14 + mov r10d,r14d + vpxor xmm6,xmm6,xmm7 + mov r12d,edx + shrd r14d,r14d,9 + xor r13d,ecx + vpshufb xmm6,xmm6,xmm8 + xor r12d,r8d + shrd r13d,r13d,5 + xor r14d,r10d + vpaddd xmm0,xmm0,xmm6 + and r12d,ecx + xor r13d,ecx + add r9d,DWORD[8+rsp] + vpshufd xmm7,xmm0,80 + mov r15d,r10d + xor r12d,r8d + shrd r14d,r14d,11 + vpsrld xmm6,xmm7,10 + xor r15d,r11d + add r9d,r12d + shrd r13d,r13d,6 + vpsrlq xmm7,xmm7,17 + and edi,r15d + xor r14d,r10d + add r9d,r13d + vpxor xmm6,xmm6,xmm7 + xor edi,r11d + shrd r14d,r14d,2 + add ebx,r9d + vpsrlq xmm7,xmm7,2 + add r9d,edi + mov r13d,ebx + add 
r14d,r9d + vpxor xmm6,xmm6,xmm7 + shrd r13d,r13d,14 + mov r9d,r14d + mov r12d,ecx + vpshufb xmm6,xmm6,xmm9 + shrd r14d,r14d,9 + xor r13d,ebx + xor r12d,edx + vpaddd xmm0,xmm0,xmm6 + shrd r13d,r13d,5 + xor r14d,r9d + and r12d,ebx + vpaddd xmm6,xmm0,XMMWORD[rbp] + xor r13d,ebx + add r8d,DWORD[12+rsp] + mov edi,r9d + xor r12d,edx + shrd r14d,r14d,11 + xor edi,r10d + add r8d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + shrd r14d,r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + vmovdqa XMMWORD[rsp],xmm6 + vpalignr xmm4,xmm2,xmm1,4 + shrd r13d,r13d,14 + mov r8d,r14d + mov r12d,ebx + vpalignr xmm7,xmm0,xmm3,4 + shrd r14d,r14d,9 + xor r13d,eax + xor r12d,ecx + vpsrld xmm6,xmm4,7 + shrd r13d,r13d,5 + xor r14d,r8d + and r12d,eax + vpaddd xmm1,xmm1,xmm7 + xor r13d,eax + add edx,DWORD[16+rsp] + mov r15d,r8d + vpsrld xmm7,xmm4,3 + xor r12d,ecx + shrd r14d,r14d,11 + xor r15d,r9d + vpslld xmm5,xmm4,14 + add edx,r12d + shrd r13d,r13d,6 + and edi,r15d + vpxor xmm4,xmm7,xmm6 + xor r14d,r8d + add edx,r13d + xor edi,r9d + vpshufd xmm7,xmm0,250 + shrd r14d,r14d,2 + add r11d,edx + add edx,edi + vpsrld xmm6,xmm6,11 + mov r13d,r11d + add r14d,edx + shrd r13d,r13d,14 + vpxor xmm4,xmm4,xmm5 + mov edx,r14d + mov r12d,eax + shrd r14d,r14d,9 + vpslld xmm5,xmm5,11 + xor r13d,r11d + xor r12d,ebx + shrd r13d,r13d,5 + vpxor xmm4,xmm4,xmm6 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + vpsrld xmm6,xmm7,10 + add ecx,DWORD[20+rsp] + mov edi,edx + xor r12d,ebx + vpxor xmm4,xmm4,xmm5 + shrd r14d,r14d,11 + xor edi,r8d + add ecx,r12d + vpsrlq xmm7,xmm7,17 + shrd r13d,r13d,6 + and r15d,edi + xor r14d,edx + vpaddd xmm1,xmm1,xmm4 + add ecx,r13d + xor r15d,r8d + shrd r14d,r14d,2 + vpxor xmm6,xmm6,xmm7 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + vpsrlq xmm7,xmm7,2 + add r14d,ecx + shrd r13d,r13d,14 + mov ecx,r14d + vpxor xmm6,xmm6,xmm7 + mov r12d,r11d + shrd r14d,r14d,9 + xor r13d,r10d + vpshufb xmm6,xmm6,xmm8 + xor r12d,eax + shrd r13d,r13d,5 
+ xor r14d,ecx + vpaddd xmm1,xmm1,xmm6 + and r12d,r10d + xor r13d,r10d + add ebx,DWORD[24+rsp] + vpshufd xmm7,xmm1,80 + mov r15d,ecx + xor r12d,eax + shrd r14d,r14d,11 + vpsrld xmm6,xmm7,10 + xor r15d,edx + add ebx,r12d + shrd r13d,r13d,6 + vpsrlq xmm7,xmm7,17 + and edi,r15d + xor r14d,ecx + add ebx,r13d + vpxor xmm6,xmm6,xmm7 + xor edi,edx + shrd r14d,r14d,2 + add r9d,ebx + vpsrlq xmm7,xmm7,2 + add ebx,edi + mov r13d,r9d + add r14d,ebx + vpxor xmm6,xmm6,xmm7 + shrd r13d,r13d,14 + mov ebx,r14d + mov r12d,r10d + vpshufb xmm6,xmm6,xmm9 + shrd r14d,r14d,9 + xor r13d,r9d + xor r12d,r11d + vpaddd xmm1,xmm1,xmm6 + shrd r13d,r13d,5 + xor r14d,ebx + and r12d,r9d + vpaddd xmm6,xmm1,XMMWORD[32+rbp] + xor r13d,r9d + add eax,DWORD[28+rsp] + mov edi,ebx + xor r12d,r11d + shrd r14d,r14d,11 + xor edi,ecx + add eax,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + shrd r14d,r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + vmovdqa XMMWORD[16+rsp],xmm6 + vpalignr xmm4,xmm3,xmm2,4 + shrd r13d,r13d,14 + mov eax,r14d + mov r12d,r9d + vpalignr xmm7,xmm1,xmm0,4 + shrd r14d,r14d,9 + xor r13d,r8d + xor r12d,r10d + vpsrld xmm6,xmm4,7 + shrd r13d,r13d,5 + xor r14d,eax + and r12d,r8d + vpaddd xmm2,xmm2,xmm7 + xor r13d,r8d + add r11d,DWORD[32+rsp] + mov r15d,eax + vpsrld xmm7,xmm4,3 + xor r12d,r10d + shrd r14d,r14d,11 + xor r15d,ebx + vpslld xmm5,xmm4,14 + add r11d,r12d + shrd r13d,r13d,6 + and edi,r15d + vpxor xmm4,xmm7,xmm6 + xor r14d,eax + add r11d,r13d + xor edi,ebx + vpshufd xmm7,xmm1,250 + shrd r14d,r14d,2 + add edx,r11d + add r11d,edi + vpsrld xmm6,xmm6,11 + mov r13d,edx + add r14d,r11d + shrd r13d,r13d,14 + vpxor xmm4,xmm4,xmm5 + mov r11d,r14d + mov r12d,r8d + shrd r14d,r14d,9 + vpslld xmm5,xmm5,11 + xor r13d,edx + xor r12d,r9d + shrd r13d,r13d,5 + vpxor xmm4,xmm4,xmm6 + xor r14d,r11d + and r12d,edx + xor r13d,edx + vpsrld xmm6,xmm7,10 + add r10d,DWORD[36+rsp] + mov edi,r11d + xor r12d,r9d + vpxor xmm4,xmm4,xmm5 + shrd r14d,r14d,11 
+ xor edi,eax + add r10d,r12d + vpsrlq xmm7,xmm7,17 + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r11d + vpaddd xmm2,xmm2,xmm4 + add r10d,r13d + xor r15d,eax + shrd r14d,r14d,2 + vpxor xmm6,xmm6,xmm7 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + vpsrlq xmm7,xmm7,2 + add r14d,r10d + shrd r13d,r13d,14 + mov r10d,r14d + vpxor xmm6,xmm6,xmm7 + mov r12d,edx + shrd r14d,r14d,9 + xor r13d,ecx + vpshufb xmm6,xmm6,xmm8 + xor r12d,r8d + shrd r13d,r13d,5 + xor r14d,r10d + vpaddd xmm2,xmm2,xmm6 + and r12d,ecx + xor r13d,ecx + add r9d,DWORD[40+rsp] + vpshufd xmm7,xmm2,80 + mov r15d,r10d + xor r12d,r8d + shrd r14d,r14d,11 + vpsrld xmm6,xmm7,10 + xor r15d,r11d + add r9d,r12d + shrd r13d,r13d,6 + vpsrlq xmm7,xmm7,17 + and edi,r15d + xor r14d,r10d + add r9d,r13d + vpxor xmm6,xmm6,xmm7 + xor edi,r11d + shrd r14d,r14d,2 + add ebx,r9d + vpsrlq xmm7,xmm7,2 + add r9d,edi + mov r13d,ebx + add r14d,r9d + vpxor xmm6,xmm6,xmm7 + shrd r13d,r13d,14 + mov r9d,r14d + mov r12d,ecx + vpshufb xmm6,xmm6,xmm9 + shrd r14d,r14d,9 + xor r13d,ebx + xor r12d,edx + vpaddd xmm2,xmm2,xmm6 + shrd r13d,r13d,5 + xor r14d,r9d + and r12d,ebx + vpaddd xmm6,xmm2,XMMWORD[64+rbp] + xor r13d,ebx + add r8d,DWORD[44+rsp] + mov edi,r9d + xor r12d,edx + shrd r14d,r14d,11 + xor edi,r10d + add r8d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + shrd r14d,r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + vmovdqa XMMWORD[32+rsp],xmm6 + vpalignr xmm4,xmm0,xmm3,4 + shrd r13d,r13d,14 + mov r8d,r14d + mov r12d,ebx + vpalignr xmm7,xmm2,xmm1,4 + shrd r14d,r14d,9 + xor r13d,eax + xor r12d,ecx + vpsrld xmm6,xmm4,7 + shrd r13d,r13d,5 + xor r14d,r8d + and r12d,eax + vpaddd xmm3,xmm3,xmm7 + xor r13d,eax + add edx,DWORD[48+rsp] + mov r15d,r8d + vpsrld xmm7,xmm4,3 + xor r12d,ecx + shrd r14d,r14d,11 + xor r15d,r9d + vpslld xmm5,xmm4,14 + add edx,r12d + shrd r13d,r13d,6 + and edi,r15d + vpxor xmm4,xmm7,xmm6 + xor r14d,r8d + add edx,r13d + xor edi,r9d + vpshufd xmm7,xmm2,250 + shrd 
r14d,r14d,2 + add r11d,edx + add edx,edi + vpsrld xmm6,xmm6,11 + mov r13d,r11d + add r14d,edx + shrd r13d,r13d,14 + vpxor xmm4,xmm4,xmm5 + mov edx,r14d + mov r12d,eax + shrd r14d,r14d,9 + vpslld xmm5,xmm5,11 + xor r13d,r11d + xor r12d,ebx + shrd r13d,r13d,5 + vpxor xmm4,xmm4,xmm6 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + vpsrld xmm6,xmm7,10 + add ecx,DWORD[52+rsp] + mov edi,edx + xor r12d,ebx + vpxor xmm4,xmm4,xmm5 + shrd r14d,r14d,11 + xor edi,r8d + add ecx,r12d + vpsrlq xmm7,xmm7,17 + shrd r13d,r13d,6 + and r15d,edi + xor r14d,edx + vpaddd xmm3,xmm3,xmm4 + add ecx,r13d + xor r15d,r8d + shrd r14d,r14d,2 + vpxor xmm6,xmm6,xmm7 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + vpsrlq xmm7,xmm7,2 + add r14d,ecx + shrd r13d,r13d,14 + mov ecx,r14d + vpxor xmm6,xmm6,xmm7 + mov r12d,r11d + shrd r14d,r14d,9 + xor r13d,r10d + vpshufb xmm6,xmm6,xmm8 + xor r12d,eax + shrd r13d,r13d,5 + xor r14d,ecx + vpaddd xmm3,xmm3,xmm6 + and r12d,r10d + xor r13d,r10d + add ebx,DWORD[56+rsp] + vpshufd xmm7,xmm3,80 + mov r15d,ecx + xor r12d,eax + shrd r14d,r14d,11 + vpsrld xmm6,xmm7,10 + xor r15d,edx + add ebx,r12d + shrd r13d,r13d,6 + vpsrlq xmm7,xmm7,17 + and edi,r15d + xor r14d,ecx + add ebx,r13d + vpxor xmm6,xmm6,xmm7 + xor edi,edx + shrd r14d,r14d,2 + add r9d,ebx + vpsrlq xmm7,xmm7,2 + add ebx,edi + mov r13d,r9d + add r14d,ebx + vpxor xmm6,xmm6,xmm7 + shrd r13d,r13d,14 + mov ebx,r14d + mov r12d,r10d + vpshufb xmm6,xmm6,xmm9 + shrd r14d,r14d,9 + xor r13d,r9d + xor r12d,r11d + vpaddd xmm3,xmm3,xmm6 + shrd r13d,r13d,5 + xor r14d,ebx + and r12d,r9d + vpaddd xmm6,xmm3,XMMWORD[96+rbp] + xor r13d,r9d + add eax,DWORD[60+rsp] + mov edi,ebx + xor r12d,r11d + shrd r14d,r14d,11 + xor edi,ecx + add eax,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + shrd r14d,r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + vmovdqa XMMWORD[48+rsp],xmm6 + cmp BYTE[131+rbp],0 + jne NEAR $L$avx_00_47 + shrd r13d,r13d,14 + mov eax,r14d + mov r12d,r9d + shrd 
r14d,r14d,9 + xor r13d,r8d + xor r12d,r10d + shrd r13d,r13d,5 + xor r14d,eax + and r12d,r8d + xor r13d,r8d + add r11d,DWORD[rsp] + mov r15d,eax + xor r12d,r10d + shrd r14d,r14d,11 + xor r15d,ebx + add r11d,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,eax + add r11d,r13d + xor edi,ebx + shrd r14d,r14d,2 + add edx,r11d + add r11d,edi + mov r13d,edx + add r14d,r11d + shrd r13d,r13d,14 + mov r11d,r14d + mov r12d,r8d + shrd r14d,r14d,9 + xor r13d,edx + xor r12d,r9d + shrd r13d,r13d,5 + xor r14d,r11d + and r12d,edx + xor r13d,edx + add r10d,DWORD[4+rsp] + mov edi,r11d + xor r12d,r9d + shrd r14d,r14d,11 + xor edi,eax + add r10d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r11d + add r10d,r13d + xor r15d,eax + shrd r14d,r14d,2 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + add r14d,r10d + shrd r13d,r13d,14 + mov r10d,r14d + mov r12d,edx + shrd r14d,r14d,9 + xor r13d,ecx + xor r12d,r8d + shrd r13d,r13d,5 + xor r14d,r10d + and r12d,ecx + xor r13d,ecx + add r9d,DWORD[8+rsp] + mov r15d,r10d + xor r12d,r8d + shrd r14d,r14d,11 + xor r15d,r11d + add r9d,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,r10d + add r9d,r13d + xor edi,r11d + shrd r14d,r14d,2 + add ebx,r9d + add r9d,edi + mov r13d,ebx + add r14d,r9d + shrd r13d,r13d,14 + mov r9d,r14d + mov r12d,ecx + shrd r14d,r14d,9 + xor r13d,ebx + xor r12d,edx + shrd r13d,r13d,5 + xor r14d,r9d + and r12d,ebx + xor r13d,ebx + add r8d,DWORD[12+rsp] + mov edi,r9d + xor r12d,edx + shrd r14d,r14d,11 + xor edi,r10d + add r8d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + shrd r14d,r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + shrd r13d,r13d,14 + mov r8d,r14d + mov r12d,ebx + shrd r14d,r14d,9 + xor r13d,eax + xor r12d,ecx + shrd r13d,r13d,5 + xor r14d,r8d + and r12d,eax + xor r13d,eax + add edx,DWORD[16+rsp] + mov r15d,r8d + xor r12d,ecx + shrd r14d,r14d,11 + xor r15d,r9d + add edx,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,r8d + add edx,r13d + xor edi,r9d + 
shrd r14d,r14d,2 + add r11d,edx + add edx,edi + mov r13d,r11d + add r14d,edx + shrd r13d,r13d,14 + mov edx,r14d + mov r12d,eax + shrd r14d,r14d,9 + xor r13d,r11d + xor r12d,ebx + shrd r13d,r13d,5 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + add ecx,DWORD[20+rsp] + mov edi,edx + xor r12d,ebx + shrd r14d,r14d,11 + xor edi,r8d + add ecx,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,edx + add ecx,r13d + xor r15d,r8d + shrd r14d,r14d,2 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + add r14d,ecx + shrd r13d,r13d,14 + mov ecx,r14d + mov r12d,r11d + shrd r14d,r14d,9 + xor r13d,r10d + xor r12d,eax + shrd r13d,r13d,5 + xor r14d,ecx + and r12d,r10d + xor r13d,r10d + add ebx,DWORD[24+rsp] + mov r15d,ecx + xor r12d,eax + shrd r14d,r14d,11 + xor r15d,edx + add ebx,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,ecx + add ebx,r13d + xor edi,edx + shrd r14d,r14d,2 + add r9d,ebx + add ebx,edi + mov r13d,r9d + add r14d,ebx + shrd r13d,r13d,14 + mov ebx,r14d + mov r12d,r10d + shrd r14d,r14d,9 + xor r13d,r9d + xor r12d,r11d + shrd r13d,r13d,5 + xor r14d,ebx + and r12d,r9d + xor r13d,r9d + add eax,DWORD[28+rsp] + mov edi,ebx + xor r12d,r11d + shrd r14d,r14d,11 + xor edi,ecx + add eax,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + shrd r14d,r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + shrd r13d,r13d,14 + mov eax,r14d + mov r12d,r9d + shrd r14d,r14d,9 + xor r13d,r8d + xor r12d,r10d + shrd r13d,r13d,5 + xor r14d,eax + and r12d,r8d + xor r13d,r8d + add r11d,DWORD[32+rsp] + mov r15d,eax + xor r12d,r10d + shrd r14d,r14d,11 + xor r15d,ebx + add r11d,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,eax + add r11d,r13d + xor edi,ebx + shrd r14d,r14d,2 + add edx,r11d + add r11d,edi + mov r13d,edx + add r14d,r11d + shrd r13d,r13d,14 + mov r11d,r14d + mov r12d,r8d + shrd r14d,r14d,9 + xor r13d,edx + xor r12d,r9d + shrd r13d,r13d,5 + xor r14d,r11d + and r12d,edx + xor r13d,edx + add r10d,DWORD[36+rsp] + mov edi,r11d + xor 
r12d,r9d + shrd r14d,r14d,11 + xor edi,eax + add r10d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r11d + add r10d,r13d + xor r15d,eax + shrd r14d,r14d,2 + add ecx,r10d + add r10d,r15d + mov r13d,ecx + add r14d,r10d + shrd r13d,r13d,14 + mov r10d,r14d + mov r12d,edx + shrd r14d,r14d,9 + xor r13d,ecx + xor r12d,r8d + shrd r13d,r13d,5 + xor r14d,r10d + and r12d,ecx + xor r13d,ecx + add r9d,DWORD[40+rsp] + mov r15d,r10d + xor r12d,r8d + shrd r14d,r14d,11 + xor r15d,r11d + add r9d,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,r10d + add r9d,r13d + xor edi,r11d + shrd r14d,r14d,2 + add ebx,r9d + add r9d,edi + mov r13d,ebx + add r14d,r9d + shrd r13d,r13d,14 + mov r9d,r14d + mov r12d,ecx + shrd r14d,r14d,9 + xor r13d,ebx + xor r12d,edx + shrd r13d,r13d,5 + xor r14d,r9d + and r12d,ebx + xor r13d,ebx + add r8d,DWORD[44+rsp] + mov edi,r9d + xor r12d,edx + shrd r14d,r14d,11 + xor edi,r10d + add r8d,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,r9d + add r8d,r13d + xor r15d,r10d + shrd r14d,r14d,2 + add eax,r8d + add r8d,r15d + mov r13d,eax + add r14d,r8d + shrd r13d,r13d,14 + mov r8d,r14d + mov r12d,ebx + shrd r14d,r14d,9 + xor r13d,eax + xor r12d,ecx + shrd r13d,r13d,5 + xor r14d,r8d + and r12d,eax + xor r13d,eax + add edx,DWORD[48+rsp] + mov r15d,r8d + xor r12d,ecx + shrd r14d,r14d,11 + xor r15d,r9d + add edx,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,r8d + add edx,r13d + xor edi,r9d + shrd r14d,r14d,2 + add r11d,edx + add edx,edi + mov r13d,r11d + add r14d,edx + shrd r13d,r13d,14 + mov edx,r14d + mov r12d,eax + shrd r14d,r14d,9 + xor r13d,r11d + xor r12d,ebx + shrd r13d,r13d,5 + xor r14d,edx + and r12d,r11d + xor r13d,r11d + add ecx,DWORD[52+rsp] + mov edi,edx + xor r12d,ebx + shrd r14d,r14d,11 + xor edi,r8d + add ecx,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,edx + add ecx,r13d + xor r15d,r8d + shrd r14d,r14d,2 + add r10d,ecx + add ecx,r15d + mov r13d,r10d + add r14d,ecx + shrd r13d,r13d,14 + mov ecx,r14d + mov r12d,r11d + shrd r14d,r14d,9 + xor 
r13d,r10d + xor r12d,eax + shrd r13d,r13d,5 + xor r14d,ecx + and r12d,r10d + xor r13d,r10d + add ebx,DWORD[56+rsp] + mov r15d,ecx + xor r12d,eax + shrd r14d,r14d,11 + xor r15d,edx + add ebx,r12d + shrd r13d,r13d,6 + and edi,r15d + xor r14d,ecx + add ebx,r13d + xor edi,edx + shrd r14d,r14d,2 + add r9d,ebx + add ebx,edi + mov r13d,r9d + add r14d,ebx + shrd r13d,r13d,14 + mov ebx,r14d + mov r12d,r10d + shrd r14d,r14d,9 + xor r13d,r9d + xor r12d,r11d + shrd r13d,r13d,5 + xor r14d,ebx + and r12d,r9d + xor r13d,r9d + add eax,DWORD[60+rsp] + mov edi,ebx + xor r12d,r11d + shrd r14d,r14d,11 + xor edi,ecx + add eax,r12d + shrd r13d,r13d,6 + and r15d,edi + xor r14d,ebx + add eax,r13d + xor r15d,ecx + shrd r14d,r14d,2 + add r8d,eax + add eax,r15d + mov r13d,r8d + add r14d,eax + mov rdi,QWORD[((64+0))+rsp] + mov eax,r14d + + add eax,DWORD[rdi] + lea rsi,[64+rsi] + add ebx,DWORD[4+rdi] + add ecx,DWORD[8+rdi] + add edx,DWORD[12+rdi] + add r8d,DWORD[16+rdi] + add r9d,DWORD[20+rdi] + add r10d,DWORD[24+rdi] + add r11d,DWORD[28+rdi] + + cmp rsi,QWORD[((64+16))+rsp] + + mov DWORD[rdi],eax + mov DWORD[4+rdi],ebx + mov DWORD[8+rdi],ecx + mov DWORD[12+rdi],edx + mov DWORD[16+rdi],r8d + mov DWORD[20+rdi],r9d + mov DWORD[24+rdi],r10d + mov DWORD[28+rdi],r11d + jb NEAR $L$loop_avx + + mov rsi,QWORD[88+rsp] + + vzeroupper + movaps xmm6,XMMWORD[((64+32))+rsp] + movaps xmm7,XMMWORD[((64+48))+rsp] + movaps xmm8,XMMWORD[((64+64))+rsp] + movaps xmm9,XMMWORD[((64+80))+rsp] + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$epilogue_avx: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_sha256_block_data_order_avx: +EXTERN __imp_RtlVirtualUnwind + +ALIGN 16 +se_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov 
rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$in_prologue + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$in_prologue + mov rsi,rax + mov rax,QWORD[((64+24))+rax] + + mov rbx,QWORD[((-8))+rax] + mov rbp,QWORD[((-16))+rax] + mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + mov r15,QWORD[((-48))+rax] + mov QWORD[144+r8],rbx + mov QWORD[160+r8],rbp + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + mov QWORD[232+r8],r14 + mov QWORD[240+r8],r15 + + lea r10,[$L$epilogue] + cmp rbx,r10 + jb NEAR $L$in_prologue + + lea rsi,[((64+32))+rsi] + lea rdi,[512+r8] + mov ecx,8 + DD 0xa548f3fc + +$L$in_prologue: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + ret + + +ALIGN 16 +shaext_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + lea r10,[$L$prologue_shaext] + cmp rbx,r10 + jb NEAR $L$in_prologue + + lea r10,[$L$epilogue_shaext] + cmp rbx,r10 + jae NEAR $L$in_prologue + + lea rsi,[((-8-80))+rax] + lea rdi,[512+r8] + mov ecx,10 + DD 0xa548f3fc + + jmp NEAR $L$in_prologue + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_sha256_block_data_order_nohw wrt ..imagebase + DD $L$SEH_end_sha256_block_data_order_nohw wrt ..imagebase + DD 
$L$SEH_info_sha256_block_data_order_nohw wrt ..imagebase + DD $L$SEH_begin_sha256_block_data_order_hw wrt ..imagebase + DD $L$SEH_end_sha256_block_data_order_hw wrt ..imagebase + DD $L$SEH_info_sha256_block_data_order_hw wrt ..imagebase + DD $L$SEH_begin_sha256_block_data_order_ssse3 wrt ..imagebase + DD $L$SEH_end_sha256_block_data_order_ssse3 wrt ..imagebase + DD $L$SEH_info_sha256_block_data_order_ssse3 wrt ..imagebase + DD $L$SEH_begin_sha256_block_data_order_avx wrt ..imagebase + DD $L$SEH_end_sha256_block_data_order_avx wrt ..imagebase + DD $L$SEH_info_sha256_block_data_order_avx wrt ..imagebase +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_sha256_block_data_order_nohw: + DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$prologue wrt ..imagebase,$L$epilogue wrt ..imagebase +$L$SEH_info_sha256_block_data_order_hw: + DB 9,0,0,0 + DD shaext_handler wrt ..imagebase +$L$SEH_info_sha256_block_data_order_ssse3: + DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$prologue_ssse3 wrt ..imagebase,$L$epilogue_ssse3 wrt ..imagebase +$L$SEH_info_sha256_block_data_order_avx: + DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/third_party/boringssl/gen/bcm/sha512-586-apple.S b/third_party/boringssl/gen/bcm/sha512-586-apple.S new file mode 100644 index 00000000..be41827c --- /dev/null +++ b/third_party/boringssl/gen/bcm/sha512-586-apple.S @@ -0,0 +1,2406 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +.text +.globl _sha512_block_data_order_nohw +.private_extern _sha512_block_data_order_nohw +.align 4 +_sha512_block_data_order_nohw: +L_sha512_block_data_order_nohw_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl %esp,%ebx + call L000pic_point +L000pic_point: + popl %ebp + leal LK512-L000pic_point(%ebp),%ebp + subl $16,%esp + andl $-64,%esp + shll $7,%eax + addl %edi,%eax + movl %esi,(%esp) + movl %edi,4(%esp) + movl %eax,8(%esp) + movl %ebx,12(%esp) + movq (%esi),%mm0 + movq 8(%esi),%mm1 + movq 16(%esi),%mm2 + movq 24(%esi),%mm3 + movq 32(%esi),%mm4 + movq 40(%esi),%mm5 + movq 48(%esi),%mm6 + movq 56(%esi),%mm7 + subl $80,%esp + jmp L001loop_sse2 +.align 4,0x90 +L001loop_sse2: + movq %mm1,8(%esp) + movq %mm2,16(%esp) + movq %mm3,24(%esp) + movq %mm5,40(%esp) + movq %mm6,48(%esp) + pxor %mm1,%mm2 + movq %mm7,56(%esp) + movq %mm0,%mm3 + movl (%edi),%eax + movl 4(%edi),%ebx + addl $8,%edi + movl $15,%edx + bswap %eax + bswap %ebx + jmp L00200_14_sse2 +.align 4,0x90 +L00200_14_sse2: + movd %eax,%mm1 + movl (%edi),%eax + movd %ebx,%mm7 + movl 4(%edi),%ebx + addl $8,%edi + bswap %eax + bswap %ebx + punpckldq %mm1,%mm7 + movq %mm4,%mm1 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,32(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + movq %mm3,%mm0 + movq %mm7,72(%esp) + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 56(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + paddq (%ebp),%mm7 + pxor %mm4,%mm3 + movq 24(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 8(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + subl $8,%esp + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor 
%mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 40(%esp),%mm5 + paddq %mm2,%mm3 + movq %mm0,%mm2 + addl $8,%ebp + paddq %mm6,%mm3 + movq 48(%esp),%mm6 + decl %edx + jnz L00200_14_sse2 + movd %eax,%mm1 + movd %ebx,%mm7 + punpckldq %mm1,%mm7 + movq %mm4,%mm1 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,32(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + movq %mm3,%mm0 + movq %mm7,72(%esp) + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 56(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + paddq (%ebp),%mm7 + pxor %mm4,%mm3 + movq 24(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 8(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + subl $8,%esp + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 192(%esp),%mm7 + paddq %mm2,%mm3 + movq %mm0,%mm2 + addl $8,%ebp + paddq %mm6,%mm3 + pxor %mm0,%mm0 + movl $32,%edx + jmp L00316_79_sse2 +.align 4,0x90 +L00316_79_sse2: + movq 88(%esp),%mm5 + movq %mm7,%mm1 + psrlq $1,%mm7 + movq %mm5,%mm6 + psrlq $6,%mm5 + psllq $56,%mm1 + paddq %mm3,%mm0 + movq %mm7,%mm3 + psrlq $6,%mm7 + pxor %mm1,%mm3 + psllq $7,%mm1 + pxor %mm7,%mm3 + psrlq $1,%mm7 + pxor %mm1,%mm3 + movq %mm5,%mm1 + psrlq $13,%mm5 + pxor %mm3,%mm7 + psllq $3,%mm6 + pxor %mm5,%mm1 + paddq 200(%esp),%mm7 + pxor %mm6,%mm1 + psrlq $42,%mm5 + paddq 128(%esp),%mm7 + pxor %mm5,%mm1 + psllq $42,%mm6 + movq 40(%esp),%mm5 + pxor %mm6,%mm1 + movq 48(%esp),%mm6 + paddq %mm1,%mm7 + movq %mm4,%mm1 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,32(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + movq %mm7,72(%esp) + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + 
movq %mm0,(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 56(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + paddq (%ebp),%mm7 + pxor %mm4,%mm3 + movq 24(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 8(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + subl $8,%esp + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 192(%esp),%mm7 + paddq %mm6,%mm2 + addl $8,%ebp + movq 88(%esp),%mm5 + movq %mm7,%mm1 + psrlq $1,%mm7 + movq %mm5,%mm6 + psrlq $6,%mm5 + psllq $56,%mm1 + paddq %mm3,%mm2 + movq %mm7,%mm3 + psrlq $6,%mm7 + pxor %mm1,%mm3 + psllq $7,%mm1 + pxor %mm7,%mm3 + psrlq $1,%mm7 + pxor %mm1,%mm3 + movq %mm5,%mm1 + psrlq $13,%mm5 + pxor %mm3,%mm7 + psllq $3,%mm6 + pxor %mm5,%mm1 + paddq 200(%esp),%mm7 + pxor %mm6,%mm1 + psrlq $42,%mm5 + paddq 128(%esp),%mm7 + pxor %mm5,%mm1 + psllq $42,%mm6 + movq 40(%esp),%mm5 + pxor %mm6,%mm1 + movq 48(%esp),%mm6 + paddq %mm1,%mm7 + movq %mm4,%mm1 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,32(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + movq %mm7,72(%esp) + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 56(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + paddq (%ebp),%mm7 + pxor %mm4,%mm3 + movq 24(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 8(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + subl $8,%esp + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 192(%esp),%mm7 + paddq %mm6,%mm0 + addl $8,%ebp + decl %edx + jnz L00316_79_sse2 + paddq %mm3,%mm0 + movq 8(%esp),%mm1 
+ movq 24(%esp),%mm3 + movq 40(%esp),%mm5 + movq 48(%esp),%mm6 + movq 56(%esp),%mm7 + pxor %mm1,%mm2 + paddq (%esi),%mm0 + paddq 8(%esi),%mm1 + paddq 16(%esi),%mm2 + paddq 24(%esi),%mm3 + paddq 32(%esi),%mm4 + paddq 40(%esi),%mm5 + paddq 48(%esi),%mm6 + paddq 56(%esi),%mm7 + movl $640,%eax + movq %mm0,(%esi) + movq %mm1,8(%esi) + movq %mm2,16(%esi) + movq %mm3,24(%esi) + movq %mm4,32(%esi) + movq %mm5,40(%esi) + movq %mm6,48(%esi) + movq %mm7,56(%esi) + leal (%esp,%eax,1),%esp + subl %eax,%ebp + cmpl 88(%esp),%edi + jb L001loop_sse2 + movl 92(%esp),%esp + emms + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _sha512_block_data_order_ssse3 +.private_extern _sha512_block_data_order_ssse3 +.align 4 +_sha512_block_data_order_ssse3: +L_sha512_block_data_order_ssse3_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl %esp,%ebx + call L004pic_point +L004pic_point: + popl %ebp + leal LK512-L004pic_point(%ebp),%ebp + subl $16,%esp + andl $-64,%esp + shll $7,%eax + addl %edi,%eax + movl %esi,(%esp) + movl %edi,4(%esp) + movl %eax,8(%esp) + movl %ebx,12(%esp) + movq (%esi),%mm0 + movq 8(%esi),%mm1 + movq 16(%esi),%mm2 + movq 24(%esi),%mm3 + movq 32(%esi),%mm4 + movq 40(%esi),%mm5 + movq 48(%esi),%mm6 + movq 56(%esi),%mm7 + leal -64(%esp),%edx + subl $256,%esp + movdqa 640(%ebp),%xmm1 + movdqu (%edi),%xmm0 + pshufb %xmm1,%xmm0 + movdqa (%ebp),%xmm3 + movdqa %xmm1,%xmm2 + movdqu 16(%edi),%xmm1 + paddq %xmm0,%xmm3 + pshufb %xmm2,%xmm1 + movdqa %xmm3,-128(%edx) + movdqa 16(%ebp),%xmm4 + movdqa %xmm2,%xmm3 + movdqu 32(%edi),%xmm2 + paddq %xmm1,%xmm4 + pshufb %xmm3,%xmm2 + movdqa %xmm4,-112(%edx) + movdqa 32(%ebp),%xmm5 + movdqa %xmm3,%xmm4 + movdqu 48(%edi),%xmm3 + paddq %xmm2,%xmm5 + pshufb %xmm4,%xmm3 + movdqa %xmm5,-96(%edx) + movdqa 48(%ebp),%xmm6 + movdqa %xmm4,%xmm5 + movdqu 64(%edi),%xmm4 + paddq %xmm3,%xmm6 + pshufb %xmm5,%xmm4 + movdqa %xmm6,-80(%edx) + movdqa 64(%ebp),%xmm7 + movdqa 
%xmm5,%xmm6 + movdqu 80(%edi),%xmm5 + paddq %xmm4,%xmm7 + pshufb %xmm6,%xmm5 + movdqa %xmm7,-64(%edx) + movdqa %xmm0,(%edx) + movdqa 80(%ebp),%xmm0 + movdqa %xmm6,%xmm7 + movdqu 96(%edi),%xmm6 + paddq %xmm5,%xmm0 + pshufb %xmm7,%xmm6 + movdqa %xmm0,-48(%edx) + movdqa %xmm1,16(%edx) + movdqa 96(%ebp),%xmm1 + movdqa %xmm7,%xmm0 + movdqu 112(%edi),%xmm7 + paddq %xmm6,%xmm1 + pshufb %xmm0,%xmm7 + movdqa %xmm1,-32(%edx) + movdqa %xmm2,32(%edx) + movdqa 112(%ebp),%xmm2 + movdqa (%edx),%xmm0 + paddq %xmm7,%xmm2 + movdqa %xmm2,-16(%edx) + nop +.align 5,0x90 +L005loop_ssse3: + movdqa 16(%edx),%xmm2 + movdqa %xmm3,48(%edx) + leal 128(%ebp),%ebp + movq %mm1,8(%esp) + movl %edi,%ebx + movq %mm2,16(%esp) + leal 128(%edi),%edi + movq %mm3,24(%esp) + cmpl %eax,%edi + movq %mm5,40(%esp) + cmovbl %edi,%ebx + movq %mm6,48(%esp) + movl $4,%ecx + pxor %mm1,%mm2 + movq %mm7,56(%esp) + pxor %mm3,%mm3 + jmp L00600_47_ssse3 +.align 5,0x90 +L00600_47_ssse3: + movdqa %xmm5,%xmm3 + movdqa %xmm2,%xmm1 + palignr $8,%xmm0,%xmm2 + movdqa %xmm4,(%edx) + palignr $8,%xmm4,%xmm3 + movdqa %xmm2,%xmm4 + psrlq $7,%xmm2 + paddq %xmm3,%xmm0 + movdqa %xmm4,%xmm3 + psrlq $1,%xmm4 + psllq $56,%xmm3 + pxor %xmm4,%xmm2 + psrlq $7,%xmm4 + pxor %xmm3,%xmm2 + psllq $7,%xmm3 + pxor %xmm4,%xmm2 + movdqa %xmm7,%xmm4 + pxor %xmm3,%xmm2 + movdqa %xmm7,%xmm3 + psrlq $6,%xmm4 + paddq %xmm2,%xmm0 + movdqa %xmm7,%xmm2 + psrlq $19,%xmm3 + psllq $3,%xmm2 + pxor %xmm3,%xmm4 + psrlq $42,%xmm3 + pxor %xmm2,%xmm4 + psllq $42,%xmm2 + pxor %xmm3,%xmm4 + movdqa 32(%edx),%xmm3 + pxor %xmm2,%xmm4 + movdqa (%ebp),%xmm2 + movq %mm4,%mm1 + paddq %xmm4,%xmm0 + movq -128(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,32(%esp) + paddq %xmm0,%xmm2 + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 56(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor 
%mm4,%mm3 + movq 24(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 8(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 32(%esp),%mm5 + paddq %mm6,%mm2 + movq 40(%esp),%mm6 + movq %mm4,%mm1 + movq -120(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,24(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,56(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 48(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 16(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq (%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 24(%esp),%mm5 + paddq %mm6,%mm0 + movq 32(%esp),%mm6 + movdqa %xmm2,-128(%edx) + movdqa %xmm6,%xmm4 + movdqa %xmm3,%xmm2 + palignr $8,%xmm1,%xmm3 + movdqa %xmm5,16(%edx) + palignr $8,%xmm5,%xmm4 + movdqa %xmm3,%xmm5 + psrlq $7,%xmm3 + paddq %xmm4,%xmm1 + movdqa %xmm5,%xmm4 + psrlq $1,%xmm5 + psllq $56,%xmm4 + pxor %xmm5,%xmm3 + psrlq $7,%xmm5 + pxor %xmm4,%xmm3 + psllq $7,%xmm4 + pxor %xmm5,%xmm3 + movdqa %xmm0,%xmm5 + pxor %xmm4,%xmm3 + movdqa %xmm0,%xmm4 + psrlq $6,%xmm5 + paddq %xmm3,%xmm1 + movdqa %xmm0,%xmm3 + psrlq $19,%xmm4 + psllq $3,%xmm3 + pxor %xmm4,%xmm5 + psrlq $42,%xmm4 + pxor %xmm3,%xmm5 + psllq $42,%xmm3 + pxor %xmm4,%xmm5 + movdqa 48(%edx),%xmm4 + pxor %xmm3,%xmm5 + movdqa 16(%ebp),%xmm3 + movq %mm4,%mm1 + paddq %xmm5,%xmm1 + movq -112(%edx),%mm7 + pxor %mm6,%mm5 + psrlq 
$14,%mm1 + movq %mm4,16(%esp) + paddq %xmm1,%xmm3 + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,48(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 40(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 8(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 56(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 16(%esp),%mm5 + paddq %mm6,%mm2 + movq 24(%esp),%mm6 + movq %mm4,%mm1 + movq -104(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,8(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,40(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 32(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq (%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 48(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 8(%esp),%mm5 + paddq %mm6,%mm0 + movq 16(%esp),%mm6 + movdqa %xmm3,-112(%edx) + movdqa %xmm7,%xmm5 + movdqa %xmm4,%xmm3 + palignr $8,%xmm2,%xmm4 + movdqa %xmm6,32(%edx) + palignr $8,%xmm6,%xmm5 + movdqa %xmm4,%xmm6 + psrlq $7,%xmm4 + paddq %xmm5,%xmm2 + movdqa %xmm6,%xmm5 + psrlq $1,%xmm6 + psllq $56,%xmm5 + pxor %xmm6,%xmm4 + psrlq $7,%xmm6 + pxor %xmm5,%xmm4 + psllq $7,%xmm5 + pxor %xmm6,%xmm4 + movdqa %xmm1,%xmm6 + pxor %xmm5,%xmm4 + movdqa %xmm1,%xmm5 + psrlq 
$6,%xmm6 + paddq %xmm4,%xmm2 + movdqa %xmm1,%xmm4 + psrlq $19,%xmm5 + psllq $3,%xmm4 + pxor %xmm5,%xmm6 + psrlq $42,%xmm5 + pxor %xmm4,%xmm6 + psllq $42,%xmm4 + pxor %xmm5,%xmm6 + movdqa (%edx),%xmm5 + pxor %xmm4,%xmm6 + movdqa 32(%ebp),%xmm4 + movq %mm4,%mm1 + paddq %xmm6,%xmm2 + movq -96(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,(%esp) + paddq %xmm2,%xmm4 + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,32(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 24(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 56(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 40(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq (%esp),%mm5 + paddq %mm6,%mm2 + movq 8(%esp),%mm6 + movq %mm4,%mm1 + movq -88(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,56(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,24(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 16(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 48(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 32(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 56(%esp),%mm5 + paddq %mm6,%mm0 + movq (%esp),%mm6 + movdqa %xmm4,-96(%edx) + movdqa %xmm0,%xmm6 + movdqa %xmm5,%xmm4 + palignr $8,%xmm3,%xmm5 + 
movdqa %xmm7,48(%edx) + palignr $8,%xmm7,%xmm6 + movdqa %xmm5,%xmm7 + psrlq $7,%xmm5 + paddq %xmm6,%xmm3 + movdqa %xmm7,%xmm6 + psrlq $1,%xmm7 + psllq $56,%xmm6 + pxor %xmm7,%xmm5 + psrlq $7,%xmm7 + pxor %xmm6,%xmm5 + psllq $7,%xmm6 + pxor %xmm7,%xmm5 + movdqa %xmm2,%xmm7 + pxor %xmm6,%xmm5 + movdqa %xmm2,%xmm6 + psrlq $6,%xmm7 + paddq %xmm5,%xmm3 + movdqa %xmm2,%xmm5 + psrlq $19,%xmm6 + psllq $3,%xmm5 + pxor %xmm6,%xmm7 + psrlq $42,%xmm6 + pxor %xmm5,%xmm7 + psllq $42,%xmm5 + pxor %xmm6,%xmm7 + movdqa 16(%edx),%xmm6 + pxor %xmm5,%xmm7 + movdqa 48(%ebp),%xmm5 + movq %mm4,%mm1 + paddq %xmm7,%xmm3 + movq -80(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,48(%esp) + paddq %xmm3,%xmm5 + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,16(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 8(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 40(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 24(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 48(%esp),%mm5 + paddq %mm6,%mm2 + movq 56(%esp),%mm6 + movq %mm4,%mm1 + movq -72(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,40(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,8(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq (%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 32(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 16(%esp),%mm1 + psrlq $6,%mm5 + pxor 
%mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 40(%esp),%mm5 + paddq %mm6,%mm0 + movq 48(%esp),%mm6 + movdqa %xmm5,-80(%edx) + movdqa %xmm1,%xmm7 + movdqa %xmm6,%xmm5 + palignr $8,%xmm4,%xmm6 + movdqa %xmm0,(%edx) + palignr $8,%xmm0,%xmm7 + movdqa %xmm6,%xmm0 + psrlq $7,%xmm6 + paddq %xmm7,%xmm4 + movdqa %xmm0,%xmm7 + psrlq $1,%xmm0 + psllq $56,%xmm7 + pxor %xmm0,%xmm6 + psrlq $7,%xmm0 + pxor %xmm7,%xmm6 + psllq $7,%xmm7 + pxor %xmm0,%xmm6 + movdqa %xmm3,%xmm0 + pxor %xmm7,%xmm6 + movdqa %xmm3,%xmm7 + psrlq $6,%xmm0 + paddq %xmm6,%xmm4 + movdqa %xmm3,%xmm6 + psrlq $19,%xmm7 + psllq $3,%xmm6 + pxor %xmm7,%xmm0 + psrlq $42,%xmm7 + pxor %xmm6,%xmm0 + psllq $42,%xmm6 + pxor %xmm7,%xmm0 + movdqa 32(%edx),%xmm7 + pxor %xmm6,%xmm0 + movdqa 64(%ebp),%xmm6 + movq %mm4,%mm1 + paddq %xmm0,%xmm4 + movq -64(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,32(%esp) + paddq %xmm4,%xmm6 + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 56(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 24(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 8(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 32(%esp),%mm5 + paddq %mm6,%mm2 + movq 40(%esp),%mm6 + movq %mm4,%mm1 + movq -56(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,24(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + 
movq %mm2,56(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 48(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 16(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq (%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 24(%esp),%mm5 + paddq %mm6,%mm0 + movq 32(%esp),%mm6 + movdqa %xmm6,-64(%edx) + movdqa %xmm2,%xmm0 + movdqa %xmm7,%xmm6 + palignr $8,%xmm5,%xmm7 + movdqa %xmm1,16(%edx) + palignr $8,%xmm1,%xmm0 + movdqa %xmm7,%xmm1 + psrlq $7,%xmm7 + paddq %xmm0,%xmm5 + movdqa %xmm1,%xmm0 + psrlq $1,%xmm1 + psllq $56,%xmm0 + pxor %xmm1,%xmm7 + psrlq $7,%xmm1 + pxor %xmm0,%xmm7 + psllq $7,%xmm0 + pxor %xmm1,%xmm7 + movdqa %xmm4,%xmm1 + pxor %xmm0,%xmm7 + movdqa %xmm4,%xmm0 + psrlq $6,%xmm1 + paddq %xmm7,%xmm5 + movdqa %xmm4,%xmm7 + psrlq $19,%xmm0 + psllq $3,%xmm7 + pxor %xmm0,%xmm1 + psrlq $42,%xmm0 + pxor %xmm7,%xmm1 + psllq $42,%xmm7 + pxor %xmm0,%xmm1 + movdqa 48(%edx),%xmm0 + pxor %xmm7,%xmm1 + movdqa 80(%ebp),%xmm7 + movq %mm4,%mm1 + paddq %xmm1,%xmm5 + movq -48(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,16(%esp) + paddq %xmm5,%xmm7 + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,48(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 40(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 8(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 56(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor 
%mm1,%mm2 + pxor %mm7,%mm6 + movq 16(%esp),%mm5 + paddq %mm6,%mm2 + movq 24(%esp),%mm6 + movq %mm4,%mm1 + movq -40(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,8(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,40(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 32(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq (%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 48(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 8(%esp),%mm5 + paddq %mm6,%mm0 + movq 16(%esp),%mm6 + movdqa %xmm7,-48(%edx) + movdqa %xmm3,%xmm1 + movdqa %xmm0,%xmm7 + palignr $8,%xmm6,%xmm0 + movdqa %xmm2,32(%edx) + palignr $8,%xmm2,%xmm1 + movdqa %xmm0,%xmm2 + psrlq $7,%xmm0 + paddq %xmm1,%xmm6 + movdqa %xmm2,%xmm1 + psrlq $1,%xmm2 + psllq $56,%xmm1 + pxor %xmm2,%xmm0 + psrlq $7,%xmm2 + pxor %xmm1,%xmm0 + psllq $7,%xmm1 + pxor %xmm2,%xmm0 + movdqa %xmm5,%xmm2 + pxor %xmm1,%xmm0 + movdqa %xmm5,%xmm1 + psrlq $6,%xmm2 + paddq %xmm0,%xmm6 + movdqa %xmm5,%xmm0 + psrlq $19,%xmm1 + psllq $3,%xmm0 + pxor %xmm1,%xmm2 + psrlq $42,%xmm1 + pxor %xmm0,%xmm2 + psllq $42,%xmm0 + pxor %xmm1,%xmm2 + movdqa (%edx),%xmm1 + pxor %xmm0,%xmm2 + movdqa 96(%ebp),%xmm0 + movq %mm4,%mm1 + paddq %xmm2,%xmm6 + movq -32(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,(%esp) + paddq %xmm6,%xmm0 + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,32(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 24(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 
56(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 40(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq (%esp),%mm5 + paddq %mm6,%mm2 + movq 8(%esp),%mm6 + movq %mm4,%mm1 + movq -24(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,56(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,24(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 16(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 48(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 32(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 56(%esp),%mm5 + paddq %mm6,%mm0 + movq (%esp),%mm6 + movdqa %xmm0,-32(%edx) + movdqa %xmm4,%xmm2 + movdqa %xmm1,%xmm0 + palignr $8,%xmm7,%xmm1 + movdqa %xmm3,48(%edx) + palignr $8,%xmm3,%xmm2 + movdqa %xmm1,%xmm3 + psrlq $7,%xmm1 + paddq %xmm2,%xmm7 + movdqa %xmm3,%xmm2 + psrlq $1,%xmm3 + psllq $56,%xmm2 + pxor %xmm3,%xmm1 + psrlq $7,%xmm3 + pxor %xmm2,%xmm1 + psllq $7,%xmm2 + pxor %xmm3,%xmm1 + movdqa %xmm6,%xmm3 + pxor %xmm2,%xmm1 + movdqa %xmm6,%xmm2 + psrlq $6,%xmm3 + paddq %xmm1,%xmm7 + movdqa %xmm6,%xmm1 + psrlq $19,%xmm2 + psllq $3,%xmm1 + pxor %xmm2,%xmm3 + psrlq $42,%xmm2 + pxor %xmm1,%xmm3 + psllq $42,%xmm1 + pxor %xmm2,%xmm3 + movdqa 16(%edx),%xmm2 + pxor %xmm1,%xmm3 + movdqa 112(%ebp),%xmm1 + movq %mm4,%mm1 + paddq %xmm3,%xmm7 + movq -16(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq 
%mm4,48(%esp) + paddq %xmm7,%xmm1 + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,16(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 8(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 40(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 24(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 48(%esp),%mm5 + paddq %mm6,%mm2 + movq 56(%esp),%mm6 + movq %mm4,%mm1 + movq -8(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,40(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,8(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq (%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 32(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 16(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 40(%esp),%mm5 + paddq %mm6,%mm0 + movq 48(%esp),%mm6 + movdqa %xmm1,-16(%edx) + leal 128(%ebp),%ebp + decl %ecx + jnz L00600_47_ssse3 + movdqa (%ebp),%xmm1 + leal -640(%ebp),%ebp + movdqu (%ebx),%xmm0 + pshufb %xmm1,%xmm0 + movdqa (%ebp),%xmm3 + movdqa %xmm1,%xmm2 + movdqu 16(%ebx),%xmm1 + paddq %xmm0,%xmm3 + pshufb %xmm2,%xmm1 + movq %mm4,%mm1 + movq -128(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,32(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq 
%mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 56(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 24(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 8(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 32(%esp),%mm5 + paddq %mm6,%mm2 + movq 40(%esp),%mm6 + movq %mm4,%mm1 + movq -120(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,24(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,56(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 48(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 16(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq (%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 24(%esp),%mm5 + paddq %mm6,%mm0 + movq 32(%esp),%mm6 + movdqa %xmm3,-128(%edx) + movdqa 16(%ebp),%xmm4 + movdqa %xmm2,%xmm3 + movdqu 32(%ebx),%xmm2 + paddq %xmm1,%xmm4 + pshufb %xmm3,%xmm2 + movq %mm4,%mm1 + movq -112(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,16(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,48(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 40(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor 
%mm4,%mm3 + movq 8(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 56(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 16(%esp),%mm5 + paddq %mm6,%mm2 + movq 24(%esp),%mm6 + movq %mm4,%mm1 + movq -104(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,8(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,40(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 32(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq (%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 48(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 8(%esp),%mm5 + paddq %mm6,%mm0 + movq 16(%esp),%mm6 + movdqa %xmm4,-112(%edx) + movdqa 32(%ebp),%xmm5 + movdqa %xmm3,%xmm4 + movdqu 48(%ebx),%xmm3 + paddq %xmm2,%xmm5 + pshufb %xmm4,%xmm3 + movq %mm4,%mm1 + movq -96(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,32(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 24(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 56(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 40(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 
+ pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq (%esp),%mm5 + paddq %mm6,%mm2 + movq 8(%esp),%mm6 + movq %mm4,%mm1 + movq -88(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,56(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,24(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 16(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 48(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 32(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 56(%esp),%mm5 + paddq %mm6,%mm0 + movq (%esp),%mm6 + movdqa %xmm5,-96(%edx) + movdqa 48(%ebp),%xmm6 + movdqa %xmm4,%xmm5 + movdqu 64(%ebx),%xmm4 + paddq %xmm3,%xmm6 + pshufb %xmm5,%xmm4 + movq %mm4,%mm1 + movq -80(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,48(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,16(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 8(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 40(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 24(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 48(%esp),%mm5 + paddq %mm6,%mm2 + movq 56(%esp),%mm6 + movq %mm4,%mm1 + movq -72(%edx),%mm7 + pxor 
%mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,40(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,8(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq (%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 32(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 16(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 40(%esp),%mm5 + paddq %mm6,%mm0 + movq 48(%esp),%mm6 + movdqa %xmm6,-80(%edx) + movdqa 64(%ebp),%xmm7 + movdqa %xmm5,%xmm6 + movdqu 80(%ebx),%xmm5 + paddq %xmm4,%xmm7 + pshufb %xmm6,%xmm5 + movq %mm4,%mm1 + movq -64(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,32(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 56(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 24(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 8(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 32(%esp),%mm5 + paddq %mm6,%mm2 + movq 40(%esp),%mm6 + movq %mm4,%mm1 + movq -56(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,24(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,56(%esp) + paddq %mm5,%mm7 
+ pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 48(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 16(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq (%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 24(%esp),%mm5 + paddq %mm6,%mm0 + movq 32(%esp),%mm6 + movdqa %xmm7,-64(%edx) + movdqa %xmm0,(%edx) + movdqa 80(%ebp),%xmm0 + movdqa %xmm6,%xmm7 + movdqu 96(%ebx),%xmm6 + paddq %xmm5,%xmm0 + pshufb %xmm7,%xmm6 + movq %mm4,%mm1 + movq -48(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,16(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,48(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 40(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 8(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 56(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 16(%esp),%mm5 + paddq %mm6,%mm2 + movq 24(%esp),%mm6 + movq %mm4,%mm1 + movq -40(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,8(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,40(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 32(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq (%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + 
movq %mm5,%mm7 + psllq $25,%mm6 + movq 48(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 8(%esp),%mm5 + paddq %mm6,%mm0 + movq 16(%esp),%mm6 + movdqa %xmm0,-48(%edx) + movdqa %xmm1,16(%edx) + movdqa 96(%ebp),%xmm1 + movdqa %xmm7,%xmm0 + movdqu 112(%ebx),%xmm7 + paddq %xmm6,%xmm1 + pshufb %xmm0,%xmm7 + movq %mm4,%mm1 + movq -32(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,32(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 24(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 56(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 40(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq (%esp),%mm5 + paddq %mm6,%mm2 + movq 8(%esp),%mm6 + movq %mm4,%mm1 + movq -24(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,56(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,24(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 16(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 48(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 32(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor 
%mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 56(%esp),%mm5 + paddq %mm6,%mm0 + movq (%esp),%mm6 + movdqa %xmm1,-32(%edx) + movdqa %xmm2,32(%edx) + movdqa 112(%ebp),%xmm2 + movdqa (%edx),%xmm0 + paddq %xmm7,%xmm2 + movq %mm4,%mm1 + movq -16(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,48(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,16(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 8(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 40(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 24(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 48(%esp),%mm5 + paddq %mm6,%mm2 + movq 56(%esp),%mm6 + movq %mm4,%mm1 + movq -8(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,40(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,8(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq (%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 32(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 16(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 40(%esp),%mm5 + paddq %mm6,%mm0 + movq 48(%esp),%mm6 + movdqa %xmm2,-16(%edx) + movq 8(%esp),%mm1 + paddq %mm3,%mm0 + movq 24(%esp),%mm3 + movq 56(%esp),%mm7 + pxor %mm1,%mm2 + paddq (%esi),%mm0 + paddq 
8(%esi),%mm1 + paddq 16(%esi),%mm2 + paddq 24(%esi),%mm3 + paddq 32(%esi),%mm4 + paddq 40(%esi),%mm5 + paddq 48(%esi),%mm6 + paddq 56(%esi),%mm7 + movq %mm0,(%esi) + movq %mm1,8(%esi) + movq %mm2,16(%esi) + movq %mm3,24(%esi) + movq %mm4,32(%esi) + movq %mm5,40(%esi) + movq %mm6,48(%esi) + movq %mm7,56(%esi) + cmpl %eax,%edi + jb L005loop_ssse3 + movl 76(%edx),%esp + emms + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 6,0x90 +LK512: +.long 3609767458,1116352408 +.long 602891725,1899447441 +.long 3964484399,3049323471 +.long 2173295548,3921009573 +.long 4081628472,961987163 +.long 3053834265,1508970993 +.long 2937671579,2453635748 +.long 3664609560,2870763221 +.long 2734883394,3624381080 +.long 1164996542,310598401 +.long 1323610764,607225278 +.long 3590304994,1426881987 +.long 4068182383,1925078388 +.long 991336113,2162078206 +.long 633803317,2614888103 +.long 3479774868,3248222580 +.long 2666613458,3835390401 +.long 944711139,4022224774 +.long 2341262773,264347078 +.long 2007800933,604807628 +.long 1495990901,770255983 +.long 1856431235,1249150122 +.long 3175218132,1555081692 +.long 2198950837,1996064986 +.long 3999719339,2554220882 +.long 766784016,2821834349 +.long 2566594879,2952996808 +.long 3203337956,3210313671 +.long 1034457026,3336571891 +.long 2466948901,3584528711 +.long 3758326383,113926993 +.long 168717936,338241895 +.long 1188179964,666307205 +.long 1546045734,773529912 +.long 1522805485,1294757372 +.long 2643833823,1396182291 +.long 2343527390,1695183700 +.long 1014477480,1986661051 +.long 1206759142,2177026350 +.long 344077627,2456956037 +.long 1290863460,2730485921 +.long 3158454273,2820302411 +.long 3505952657,3259730800 +.long 106217008,3345764771 +.long 3606008344,3516065817 +.long 1432725776,3600352804 +.long 1467031594,4094571909 +.long 851169720,275423344 +.long 3100823752,430227734 +.long 1363258195,506948616 +.long 3750685593,659060556 +.long 3785050280,883997877 +.long 3318307427,958139571 +.long 3812723403,1322822218 +.long 
2003034995,1537002063 +.long 3602036899,1747873779 +.long 1575990012,1955562222 +.long 1125592928,2024104815 +.long 2716904306,2227730452 +.long 442776044,2361852424 +.long 593698344,2428436474 +.long 3733110249,2756734187 +.long 2999351573,3204031479 +.long 3815920427,3329325298 +.long 3928383900,3391569614 +.long 566280711,3515267271 +.long 3454069534,3940187606 +.long 4000239992,4118630271 +.long 1914138554,116418474 +.long 2731055270,174292421 +.long 3203993006,289380356 +.long 320620315,460393269 +.long 587496836,685471733 +.long 1086792851,852142971 +.long 365543100,1017036298 +.long 2618297676,1126000580 +.long 3409855158,1288033470 +.long 4234509866,1501505948 +.long 987167468,1607167915 +.long 1246189591,1816402316 +.long 67438087,66051 +.long 202182159,134810123 +.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97 +.byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32 +.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97 +.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 +.byte 62,0 +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) diff --git a/third_party/boringssl/gen/bcm/sha512-586-linux.S b/third_party/boringssl/gen/bcm/sha512-586-linux.S new file mode 100644 index 00000000..ebeb87df --- /dev/null +++ b/third_party/boringssl/gen/bcm/sha512-586-linux.S @@ -0,0 +1,2410 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) +.text +.globl sha512_block_data_order_nohw +.hidden sha512_block_data_order_nohw +.type sha512_block_data_order_nohw,@function +.align 16 +sha512_block_data_order_nohw: +.L_sha512_block_data_order_nohw_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl %esp,%ebx + call .L000pic_point +.L000pic_point: + popl %ebp + leal .LK512-.L000pic_point(%ebp),%ebp + subl $16,%esp + andl $-64,%esp + shll $7,%eax + addl %edi,%eax + movl %esi,(%esp) + movl %edi,4(%esp) + movl %eax,8(%esp) + movl %ebx,12(%esp) + movq (%esi),%mm0 + movq 8(%esi),%mm1 + movq 16(%esi),%mm2 + movq 24(%esi),%mm3 + movq 32(%esi),%mm4 + movq 40(%esi),%mm5 + movq 48(%esi),%mm6 + movq 56(%esi),%mm7 + subl $80,%esp + jmp .L001loop_sse2 +.align 16 +.L001loop_sse2: + movq %mm1,8(%esp) + movq %mm2,16(%esp) + movq %mm3,24(%esp) + movq %mm5,40(%esp) + movq %mm6,48(%esp) + pxor %mm1,%mm2 + movq %mm7,56(%esp) + movq %mm0,%mm3 + movl (%edi),%eax + movl 4(%edi),%ebx + addl $8,%edi + movl $15,%edx + bswap %eax + bswap %ebx + jmp .L00200_14_sse2 +.align 16 +.L00200_14_sse2: + movd %eax,%mm1 + movl (%edi),%eax + movd %ebx,%mm7 + movl 4(%edi),%ebx + addl $8,%edi + bswap %eax + bswap %ebx + punpckldq %mm1,%mm7 + movq %mm4,%mm1 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,32(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + movq %mm3,%mm0 + movq %mm7,72(%esp) + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 56(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + paddq (%ebp),%mm7 + pxor %mm4,%mm3 + movq 24(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 8(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + subl $8,%esp + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor 
%mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 40(%esp),%mm5 + paddq %mm2,%mm3 + movq %mm0,%mm2 + addl $8,%ebp + paddq %mm6,%mm3 + movq 48(%esp),%mm6 + decl %edx + jnz .L00200_14_sse2 + movd %eax,%mm1 + movd %ebx,%mm7 + punpckldq %mm1,%mm7 + movq %mm4,%mm1 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,32(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + movq %mm3,%mm0 + movq %mm7,72(%esp) + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 56(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + paddq (%ebp),%mm7 + pxor %mm4,%mm3 + movq 24(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 8(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + subl $8,%esp + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 192(%esp),%mm7 + paddq %mm2,%mm3 + movq %mm0,%mm2 + addl $8,%ebp + paddq %mm6,%mm3 + pxor %mm0,%mm0 + movl $32,%edx + jmp .L00316_79_sse2 +.align 16 +.L00316_79_sse2: + movq 88(%esp),%mm5 + movq %mm7,%mm1 + psrlq $1,%mm7 + movq %mm5,%mm6 + psrlq $6,%mm5 + psllq $56,%mm1 + paddq %mm3,%mm0 + movq %mm7,%mm3 + psrlq $6,%mm7 + pxor %mm1,%mm3 + psllq $7,%mm1 + pxor %mm7,%mm3 + psrlq $1,%mm7 + pxor %mm1,%mm3 + movq %mm5,%mm1 + psrlq $13,%mm5 + pxor %mm3,%mm7 + psllq $3,%mm6 + pxor %mm5,%mm1 + paddq 200(%esp),%mm7 + pxor %mm6,%mm1 + psrlq $42,%mm5 + paddq 128(%esp),%mm7 + pxor %mm5,%mm1 + psllq $42,%mm6 + movq 40(%esp),%mm5 + pxor %mm6,%mm1 + movq 48(%esp),%mm6 + paddq %mm1,%mm7 + movq %mm4,%mm1 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,32(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + movq %mm7,72(%esp) + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + 
psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 56(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + paddq (%ebp),%mm7 + pxor %mm4,%mm3 + movq 24(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 8(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + subl $8,%esp + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 192(%esp),%mm7 + paddq %mm6,%mm2 + addl $8,%ebp + movq 88(%esp),%mm5 + movq %mm7,%mm1 + psrlq $1,%mm7 + movq %mm5,%mm6 + psrlq $6,%mm5 + psllq $56,%mm1 + paddq %mm3,%mm2 + movq %mm7,%mm3 + psrlq $6,%mm7 + pxor %mm1,%mm3 + psllq $7,%mm1 + pxor %mm7,%mm3 + psrlq $1,%mm7 + pxor %mm1,%mm3 + movq %mm5,%mm1 + psrlq $13,%mm5 + pxor %mm3,%mm7 + psllq $3,%mm6 + pxor %mm5,%mm1 + paddq 200(%esp),%mm7 + pxor %mm6,%mm1 + psrlq $42,%mm5 + paddq 128(%esp),%mm7 + pxor %mm5,%mm1 + psllq $42,%mm6 + movq 40(%esp),%mm5 + pxor %mm6,%mm1 + movq 48(%esp),%mm6 + paddq %mm1,%mm7 + movq %mm4,%mm1 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,32(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + movq %mm7,72(%esp) + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 56(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + paddq (%ebp),%mm7 + pxor %mm4,%mm3 + movq 24(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 8(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + subl $8,%esp + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 192(%esp),%mm7 + paddq %mm6,%mm0 + addl $8,%ebp + decl %edx + jnz .L00316_79_sse2 + 
paddq %mm3,%mm0 + movq 8(%esp),%mm1 + movq 24(%esp),%mm3 + movq 40(%esp),%mm5 + movq 48(%esp),%mm6 + movq 56(%esp),%mm7 + pxor %mm1,%mm2 + paddq (%esi),%mm0 + paddq 8(%esi),%mm1 + paddq 16(%esi),%mm2 + paddq 24(%esi),%mm3 + paddq 32(%esi),%mm4 + paddq 40(%esi),%mm5 + paddq 48(%esi),%mm6 + paddq 56(%esi),%mm7 + movl $640,%eax + movq %mm0,(%esi) + movq %mm1,8(%esi) + movq %mm2,16(%esi) + movq %mm3,24(%esi) + movq %mm4,32(%esi) + movq %mm5,40(%esi) + movq %mm6,48(%esi) + movq %mm7,56(%esi) + leal (%esp,%eax,1),%esp + subl %eax,%ebp + cmpl 88(%esp),%edi + jb .L001loop_sse2 + movl 92(%esp),%esp + emms + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size sha512_block_data_order_nohw,.-.L_sha512_block_data_order_nohw_begin +.globl sha512_block_data_order_ssse3 +.hidden sha512_block_data_order_ssse3 +.type sha512_block_data_order_ssse3,@function +.align 16 +sha512_block_data_order_ssse3: +.L_sha512_block_data_order_ssse3_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl %esp,%ebx + call .L004pic_point +.L004pic_point: + popl %ebp + leal .LK512-.L004pic_point(%ebp),%ebp + subl $16,%esp + andl $-64,%esp + shll $7,%eax + addl %edi,%eax + movl %esi,(%esp) + movl %edi,4(%esp) + movl %eax,8(%esp) + movl %ebx,12(%esp) + movq (%esi),%mm0 + movq 8(%esi),%mm1 + movq 16(%esi),%mm2 + movq 24(%esi),%mm3 + movq 32(%esi),%mm4 + movq 40(%esi),%mm5 + movq 48(%esi),%mm6 + movq 56(%esi),%mm7 + leal -64(%esp),%edx + subl $256,%esp + movdqa 640(%ebp),%xmm1 + movdqu (%edi),%xmm0 + pshufb %xmm1,%xmm0 + movdqa (%ebp),%xmm3 + movdqa %xmm1,%xmm2 + movdqu 16(%edi),%xmm1 + paddq %xmm0,%xmm3 + pshufb %xmm2,%xmm1 + movdqa %xmm3,-128(%edx) + movdqa 16(%ebp),%xmm4 + movdqa %xmm2,%xmm3 + movdqu 32(%edi),%xmm2 + paddq %xmm1,%xmm4 + pshufb %xmm3,%xmm2 + movdqa %xmm4,-112(%edx) + movdqa 32(%ebp),%xmm5 + movdqa %xmm3,%xmm4 + movdqu 48(%edi),%xmm3 + paddq %xmm2,%xmm5 + pshufb %xmm4,%xmm3 + movdqa %xmm5,-96(%edx) + movdqa 
48(%ebp),%xmm6 + movdqa %xmm4,%xmm5 + movdqu 64(%edi),%xmm4 + paddq %xmm3,%xmm6 + pshufb %xmm5,%xmm4 + movdqa %xmm6,-80(%edx) + movdqa 64(%ebp),%xmm7 + movdqa %xmm5,%xmm6 + movdqu 80(%edi),%xmm5 + paddq %xmm4,%xmm7 + pshufb %xmm6,%xmm5 + movdqa %xmm7,-64(%edx) + movdqa %xmm0,(%edx) + movdqa 80(%ebp),%xmm0 + movdqa %xmm6,%xmm7 + movdqu 96(%edi),%xmm6 + paddq %xmm5,%xmm0 + pshufb %xmm7,%xmm6 + movdqa %xmm0,-48(%edx) + movdqa %xmm1,16(%edx) + movdqa 96(%ebp),%xmm1 + movdqa %xmm7,%xmm0 + movdqu 112(%edi),%xmm7 + paddq %xmm6,%xmm1 + pshufb %xmm0,%xmm7 + movdqa %xmm1,-32(%edx) + movdqa %xmm2,32(%edx) + movdqa 112(%ebp),%xmm2 + movdqa (%edx),%xmm0 + paddq %xmm7,%xmm2 + movdqa %xmm2,-16(%edx) + nop +.align 32 +.L005loop_ssse3: + movdqa 16(%edx),%xmm2 + movdqa %xmm3,48(%edx) + leal 128(%ebp),%ebp + movq %mm1,8(%esp) + movl %edi,%ebx + movq %mm2,16(%esp) + leal 128(%edi),%edi + movq %mm3,24(%esp) + cmpl %eax,%edi + movq %mm5,40(%esp) + cmovbl %edi,%ebx + movq %mm6,48(%esp) + movl $4,%ecx + pxor %mm1,%mm2 + movq %mm7,56(%esp) + pxor %mm3,%mm3 + jmp .L00600_47_ssse3 +.align 32 +.L00600_47_ssse3: + movdqa %xmm5,%xmm3 + movdqa %xmm2,%xmm1 + palignr $8,%xmm0,%xmm2 + movdqa %xmm4,(%edx) + palignr $8,%xmm4,%xmm3 + movdqa %xmm2,%xmm4 + psrlq $7,%xmm2 + paddq %xmm3,%xmm0 + movdqa %xmm4,%xmm3 + psrlq $1,%xmm4 + psllq $56,%xmm3 + pxor %xmm4,%xmm2 + psrlq $7,%xmm4 + pxor %xmm3,%xmm2 + psllq $7,%xmm3 + pxor %xmm4,%xmm2 + movdqa %xmm7,%xmm4 + pxor %xmm3,%xmm2 + movdqa %xmm7,%xmm3 + psrlq $6,%xmm4 + paddq %xmm2,%xmm0 + movdqa %xmm7,%xmm2 + psrlq $19,%xmm3 + psllq $3,%xmm2 + pxor %xmm3,%xmm4 + psrlq $42,%xmm3 + pxor %xmm2,%xmm4 + psllq $42,%xmm2 + pxor %xmm3,%xmm4 + movdqa 32(%edx),%xmm3 + pxor %xmm2,%xmm4 + movdqa (%ebp),%xmm2 + movq %mm4,%mm1 + paddq %xmm4,%xmm0 + movq -128(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,32(%esp) + paddq %xmm0,%xmm2 + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq 
$23,%mm4 + pxor %mm1,%mm3 + movq %mm0,(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 56(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 24(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 8(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 32(%esp),%mm5 + paddq %mm6,%mm2 + movq 40(%esp),%mm6 + movq %mm4,%mm1 + movq -120(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,24(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,56(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 48(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 16(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq (%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 24(%esp),%mm5 + paddq %mm6,%mm0 + movq 32(%esp),%mm6 + movdqa %xmm2,-128(%edx) + movdqa %xmm6,%xmm4 + movdqa %xmm3,%xmm2 + palignr $8,%xmm1,%xmm3 + movdqa %xmm5,16(%edx) + palignr $8,%xmm5,%xmm4 + movdqa %xmm3,%xmm5 + psrlq $7,%xmm3 + paddq %xmm4,%xmm1 + movdqa %xmm5,%xmm4 + psrlq $1,%xmm5 + psllq $56,%xmm4 + pxor %xmm5,%xmm3 + psrlq $7,%xmm5 + pxor %xmm4,%xmm3 + psllq $7,%xmm4 + pxor %xmm5,%xmm3 + movdqa %xmm0,%xmm5 + pxor %xmm4,%xmm3 + movdqa %xmm0,%xmm4 + psrlq $6,%xmm5 + paddq %xmm3,%xmm1 + movdqa %xmm0,%xmm3 + psrlq $19,%xmm4 + psllq $3,%xmm3 + pxor %xmm4,%xmm5 + psrlq $42,%xmm4 + pxor %xmm3,%xmm5 + psllq $42,%xmm3 + pxor %xmm4,%xmm5 
+ movdqa 48(%edx),%xmm4 + pxor %xmm3,%xmm5 + movdqa 16(%ebp),%xmm3 + movq %mm4,%mm1 + paddq %xmm5,%xmm1 + movq -112(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,16(%esp) + paddq %xmm1,%xmm3 + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,48(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 40(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 8(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 56(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 16(%esp),%mm5 + paddq %mm6,%mm2 + movq 24(%esp),%mm6 + movq %mm4,%mm1 + movq -104(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,8(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,40(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 32(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq (%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 48(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 8(%esp),%mm5 + paddq %mm6,%mm0 + movq 16(%esp),%mm6 + movdqa %xmm3,-112(%edx) + movdqa %xmm7,%xmm5 + movdqa %xmm4,%xmm3 + palignr $8,%xmm2,%xmm4 + movdqa %xmm6,32(%edx) + palignr $8,%xmm6,%xmm5 + movdqa %xmm4,%xmm6 + psrlq $7,%xmm4 + paddq %xmm5,%xmm2 + movdqa %xmm6,%xmm5 + psrlq $1,%xmm6 + psllq $56,%xmm5 + pxor 
%xmm6,%xmm4 + psrlq $7,%xmm6 + pxor %xmm5,%xmm4 + psllq $7,%xmm5 + pxor %xmm6,%xmm4 + movdqa %xmm1,%xmm6 + pxor %xmm5,%xmm4 + movdqa %xmm1,%xmm5 + psrlq $6,%xmm6 + paddq %xmm4,%xmm2 + movdqa %xmm1,%xmm4 + psrlq $19,%xmm5 + psllq $3,%xmm4 + pxor %xmm5,%xmm6 + psrlq $42,%xmm5 + pxor %xmm4,%xmm6 + psllq $42,%xmm4 + pxor %xmm5,%xmm6 + movdqa (%edx),%xmm5 + pxor %xmm4,%xmm6 + movdqa 32(%ebp),%xmm4 + movq %mm4,%mm1 + paddq %xmm6,%xmm2 + movq -96(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,(%esp) + paddq %xmm2,%xmm4 + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,32(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 24(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 56(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 40(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq (%esp),%mm5 + paddq %mm6,%mm2 + movq 8(%esp),%mm6 + movq %mm4,%mm1 + movq -88(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,56(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,24(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 16(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 48(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 32(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor 
%mm7,%mm6 + movq 56(%esp),%mm5 + paddq %mm6,%mm0 + movq (%esp),%mm6 + movdqa %xmm4,-96(%edx) + movdqa %xmm0,%xmm6 + movdqa %xmm5,%xmm4 + palignr $8,%xmm3,%xmm5 + movdqa %xmm7,48(%edx) + palignr $8,%xmm7,%xmm6 + movdqa %xmm5,%xmm7 + psrlq $7,%xmm5 + paddq %xmm6,%xmm3 + movdqa %xmm7,%xmm6 + psrlq $1,%xmm7 + psllq $56,%xmm6 + pxor %xmm7,%xmm5 + psrlq $7,%xmm7 + pxor %xmm6,%xmm5 + psllq $7,%xmm6 + pxor %xmm7,%xmm5 + movdqa %xmm2,%xmm7 + pxor %xmm6,%xmm5 + movdqa %xmm2,%xmm6 + psrlq $6,%xmm7 + paddq %xmm5,%xmm3 + movdqa %xmm2,%xmm5 + psrlq $19,%xmm6 + psllq $3,%xmm5 + pxor %xmm6,%xmm7 + psrlq $42,%xmm6 + pxor %xmm5,%xmm7 + psllq $42,%xmm5 + pxor %xmm6,%xmm7 + movdqa 16(%edx),%xmm6 + pxor %xmm5,%xmm7 + movdqa 48(%ebp),%xmm5 + movq %mm4,%mm1 + paddq %xmm7,%xmm3 + movq -80(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,48(%esp) + paddq %xmm3,%xmm5 + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,16(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 8(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 40(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 24(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 48(%esp),%mm5 + paddq %mm6,%mm2 + movq 56(%esp),%mm6 + movq %mm4,%mm1 + movq -72(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,40(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,8(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq (%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 32(%esp),%mm4 + paddq 
%mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 16(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 40(%esp),%mm5 + paddq %mm6,%mm0 + movq 48(%esp),%mm6 + movdqa %xmm5,-80(%edx) + movdqa %xmm1,%xmm7 + movdqa %xmm6,%xmm5 + palignr $8,%xmm4,%xmm6 + movdqa %xmm0,(%edx) + palignr $8,%xmm0,%xmm7 + movdqa %xmm6,%xmm0 + psrlq $7,%xmm6 + paddq %xmm7,%xmm4 + movdqa %xmm0,%xmm7 + psrlq $1,%xmm0 + psllq $56,%xmm7 + pxor %xmm0,%xmm6 + psrlq $7,%xmm0 + pxor %xmm7,%xmm6 + psllq $7,%xmm7 + pxor %xmm0,%xmm6 + movdqa %xmm3,%xmm0 + pxor %xmm7,%xmm6 + movdqa %xmm3,%xmm7 + psrlq $6,%xmm0 + paddq %xmm6,%xmm4 + movdqa %xmm3,%xmm6 + psrlq $19,%xmm7 + psllq $3,%xmm6 + pxor %xmm7,%xmm0 + psrlq $42,%xmm7 + pxor %xmm6,%xmm0 + psllq $42,%xmm6 + pxor %xmm7,%xmm0 + movdqa 32(%edx),%xmm7 + pxor %xmm6,%xmm0 + movdqa 64(%ebp),%xmm6 + movq %mm4,%mm1 + paddq %xmm0,%xmm4 + movq -64(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,32(%esp) + paddq %xmm4,%xmm6 + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 56(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 24(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 8(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 32(%esp),%mm5 + paddq %mm6,%mm2 + movq 40(%esp),%mm6 + movq %mm4,%mm1 + movq -56(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,24(%esp) + 
pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,56(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 48(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 16(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq (%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 24(%esp),%mm5 + paddq %mm6,%mm0 + movq 32(%esp),%mm6 + movdqa %xmm6,-64(%edx) + movdqa %xmm2,%xmm0 + movdqa %xmm7,%xmm6 + palignr $8,%xmm5,%xmm7 + movdqa %xmm1,16(%edx) + palignr $8,%xmm1,%xmm0 + movdqa %xmm7,%xmm1 + psrlq $7,%xmm7 + paddq %xmm0,%xmm5 + movdqa %xmm1,%xmm0 + psrlq $1,%xmm1 + psllq $56,%xmm0 + pxor %xmm1,%xmm7 + psrlq $7,%xmm1 + pxor %xmm0,%xmm7 + psllq $7,%xmm0 + pxor %xmm1,%xmm7 + movdqa %xmm4,%xmm1 + pxor %xmm0,%xmm7 + movdqa %xmm4,%xmm0 + psrlq $6,%xmm1 + paddq %xmm7,%xmm5 + movdqa %xmm4,%xmm7 + psrlq $19,%xmm0 + psllq $3,%xmm7 + pxor %xmm0,%xmm1 + psrlq $42,%xmm0 + pxor %xmm7,%xmm1 + psllq $42,%xmm7 + pxor %xmm0,%xmm1 + movdqa 48(%edx),%xmm0 + pxor %xmm7,%xmm1 + movdqa 80(%ebp),%xmm7 + movq %mm4,%mm1 + paddq %xmm1,%xmm5 + movq -48(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,16(%esp) + paddq %xmm5,%xmm7 + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,48(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 40(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 8(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 56(%esp),%mm1 + psrlq $6,%mm5 + pxor 
%mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 16(%esp),%mm5 + paddq %mm6,%mm2 + movq 24(%esp),%mm6 + movq %mm4,%mm1 + movq -40(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,8(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,40(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 32(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq (%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 48(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 8(%esp),%mm5 + paddq %mm6,%mm0 + movq 16(%esp),%mm6 + movdqa %xmm7,-48(%edx) + movdqa %xmm3,%xmm1 + movdqa %xmm0,%xmm7 + palignr $8,%xmm6,%xmm0 + movdqa %xmm2,32(%edx) + palignr $8,%xmm2,%xmm1 + movdqa %xmm0,%xmm2 + psrlq $7,%xmm0 + paddq %xmm1,%xmm6 + movdqa %xmm2,%xmm1 + psrlq $1,%xmm2 + psllq $56,%xmm1 + pxor %xmm2,%xmm0 + psrlq $7,%xmm2 + pxor %xmm1,%xmm0 + psllq $7,%xmm1 + pxor %xmm2,%xmm0 + movdqa %xmm5,%xmm2 + pxor %xmm1,%xmm0 + movdqa %xmm5,%xmm1 + psrlq $6,%xmm2 + paddq %xmm0,%xmm6 + movdqa %xmm5,%xmm0 + psrlq $19,%xmm1 + psllq $3,%xmm0 + pxor %xmm1,%xmm2 + psrlq $42,%xmm1 + pxor %xmm0,%xmm2 + psllq $42,%xmm0 + pxor %xmm1,%xmm2 + movdqa (%edx),%xmm1 + pxor %xmm0,%xmm2 + movdqa 96(%ebp),%xmm0 + movq %mm4,%mm1 + paddq %xmm2,%xmm6 + movq -32(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,(%esp) + paddq %xmm6,%xmm0 + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq 
%mm0,32(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 24(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 56(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 40(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq (%esp),%mm5 + paddq %mm6,%mm2 + movq 8(%esp),%mm6 + movq %mm4,%mm1 + movq -24(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,56(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,24(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 16(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 48(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 32(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 56(%esp),%mm5 + paddq %mm6,%mm0 + movq (%esp),%mm6 + movdqa %xmm0,-32(%edx) + movdqa %xmm4,%xmm2 + movdqa %xmm1,%xmm0 + palignr $8,%xmm7,%xmm1 + movdqa %xmm3,48(%edx) + palignr $8,%xmm3,%xmm2 + movdqa %xmm1,%xmm3 + psrlq $7,%xmm1 + paddq %xmm2,%xmm7 + movdqa %xmm3,%xmm2 + psrlq $1,%xmm3 + psllq $56,%xmm2 + pxor %xmm3,%xmm1 + psrlq $7,%xmm3 + pxor %xmm2,%xmm1 + psllq $7,%xmm2 + pxor %xmm3,%xmm1 + movdqa %xmm6,%xmm3 + pxor %xmm2,%xmm1 + movdqa %xmm6,%xmm2 + psrlq $6,%xmm3 + paddq %xmm1,%xmm7 + movdqa %xmm6,%xmm1 + psrlq $19,%xmm2 + psllq $3,%xmm1 + pxor %xmm2,%xmm3 + psrlq $42,%xmm2 + pxor %xmm1,%xmm3 + psllq $42,%xmm1 + pxor %xmm2,%xmm3 + movdqa 16(%edx),%xmm2 + pxor 
%xmm1,%xmm3 + movdqa 112(%ebp),%xmm1 + movq %mm4,%mm1 + paddq %xmm3,%xmm7 + movq -16(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,48(%esp) + paddq %xmm7,%xmm1 + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,16(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 8(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 40(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 24(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 48(%esp),%mm5 + paddq %mm6,%mm2 + movq 56(%esp),%mm6 + movq %mm4,%mm1 + movq -8(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,40(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,8(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq (%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 32(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 16(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 40(%esp),%mm5 + paddq %mm6,%mm0 + movq 48(%esp),%mm6 + movdqa %xmm1,-16(%edx) + leal 128(%ebp),%ebp + decl %ecx + jnz .L00600_47_ssse3 + movdqa (%ebp),%xmm1 + leal -640(%ebp),%ebp + movdqu (%ebx),%xmm0 + pshufb %xmm1,%xmm0 + movdqa (%ebp),%xmm3 + movdqa %xmm1,%xmm2 + movdqu 16(%ebx),%xmm1 + paddq %xmm0,%xmm3 + pshufb %xmm2,%xmm1 + movq %mm4,%mm1 + 
movq -128(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,32(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 56(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 24(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 8(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 32(%esp),%mm5 + paddq %mm6,%mm2 + movq 40(%esp),%mm6 + movq %mm4,%mm1 + movq -120(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,24(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,56(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 48(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 16(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq (%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 24(%esp),%mm5 + paddq %mm6,%mm0 + movq 32(%esp),%mm6 + movdqa %xmm3,-128(%edx) + movdqa 16(%ebp),%xmm4 + movdqa %xmm2,%xmm3 + movdqu 32(%ebx),%xmm2 + paddq %xmm1,%xmm4 + pshufb %xmm3,%xmm2 + movq %mm4,%mm1 + movq -112(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,16(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq 
%mm0,48(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 40(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 8(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 56(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 16(%esp),%mm5 + paddq %mm6,%mm2 + movq 24(%esp),%mm6 + movq %mm4,%mm1 + movq -104(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,8(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,40(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 32(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq (%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 48(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 8(%esp),%mm5 + paddq %mm6,%mm0 + movq 16(%esp),%mm6 + movdqa %xmm4,-112(%edx) + movdqa 32(%ebp),%xmm5 + movdqa %xmm3,%xmm4 + movdqu 48(%ebx),%xmm3 + paddq %xmm2,%xmm5 + pshufb %xmm4,%xmm3 + movq %mm4,%mm1 + movq -96(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,32(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 24(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 56(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq 
%mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 40(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq (%esp),%mm5 + paddq %mm6,%mm2 + movq 8(%esp),%mm6 + movq %mm4,%mm1 + movq -88(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,56(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,24(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 16(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 48(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 32(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 56(%esp),%mm5 + paddq %mm6,%mm0 + movq (%esp),%mm6 + movdqa %xmm5,-96(%edx) + movdqa 48(%ebp),%xmm6 + movdqa %xmm4,%xmm5 + movdqu 64(%ebx),%xmm4 + paddq %xmm3,%xmm6 + pshufb %xmm5,%xmm4 + movq %mm4,%mm1 + movq -80(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,48(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,16(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 8(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 40(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 24(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor 
%mm1,%mm2 + pxor %mm7,%mm6 + movq 48(%esp),%mm5 + paddq %mm6,%mm2 + movq 56(%esp),%mm6 + movq %mm4,%mm1 + movq -72(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,40(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,8(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq (%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 32(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 16(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 40(%esp),%mm5 + paddq %mm6,%mm0 + movq 48(%esp),%mm6 + movdqa %xmm6,-80(%edx) + movdqa 64(%ebp),%xmm7 + movdqa %xmm5,%xmm6 + movdqu 80(%ebx),%xmm5 + paddq %xmm4,%xmm7 + pshufb %xmm6,%xmm5 + movq %mm4,%mm1 + movq -64(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,32(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 56(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 24(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 8(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 32(%esp),%mm5 + paddq %mm6,%mm2 + movq 40(%esp),%mm6 + movq %mm4,%mm1 + movq -56(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,24(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq 
%mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,56(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 48(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 16(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq (%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 24(%esp),%mm5 + paddq %mm6,%mm0 + movq 32(%esp),%mm6 + movdqa %xmm7,-64(%edx) + movdqa %xmm0,(%edx) + movdqa 80(%ebp),%xmm0 + movdqa %xmm6,%xmm7 + movdqu 96(%ebx),%xmm6 + paddq %xmm5,%xmm0 + pshufb %xmm7,%xmm6 + movq %mm4,%mm1 + movq -48(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,16(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,48(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 40(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 8(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 56(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 16(%esp),%mm5 + paddq %mm6,%mm2 + movq 24(%esp),%mm6 + movq %mm4,%mm1 + movq -40(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,8(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,40(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 32(%esp),%mm7 + pxor %mm1,%mm3 + psllq 
$4,%mm4 + pxor %mm4,%mm3 + movq (%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 48(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 8(%esp),%mm5 + paddq %mm6,%mm0 + movq 16(%esp),%mm6 + movdqa %xmm0,-48(%edx) + movdqa %xmm1,16(%edx) + movdqa 96(%ebp),%xmm1 + movdqa %xmm7,%xmm0 + movdqu 112(%ebx),%xmm7 + paddq %xmm6,%xmm1 + pshufb %xmm0,%xmm7 + movq %mm4,%mm1 + movq -32(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,32(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 24(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 56(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 40(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq (%esp),%mm5 + paddq %mm6,%mm2 + movq 8(%esp),%mm6 + movq %mm4,%mm1 + movq -24(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,56(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,24(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 16(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 48(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 32(%esp),%mm1 + psrlq $6,%mm5 + pxor 
%mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 56(%esp),%mm5 + paddq %mm6,%mm0 + movq (%esp),%mm6 + movdqa %xmm1,-32(%edx) + movdqa %xmm2,32(%edx) + movdqa 112(%ebp),%xmm2 + movdqa (%edx),%xmm0 + paddq %xmm7,%xmm2 + movq %mm4,%mm1 + movq -16(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,48(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm0 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm0,16(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq 8(%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 40(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm0,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm0,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 24(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm0,%mm2 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + pxor %mm7,%mm6 + movq 48(%esp),%mm5 + paddq %mm6,%mm2 + movq 56(%esp),%mm6 + movq %mm4,%mm1 + movq -8(%edx),%mm7 + pxor %mm6,%mm5 + psrlq $14,%mm1 + movq %mm4,40(%esp) + pand %mm4,%mm5 + psllq $23,%mm4 + paddq %mm3,%mm2 + movq %mm1,%mm3 + psrlq $4,%mm1 + pxor %mm6,%mm5 + pxor %mm4,%mm3 + psllq $23,%mm4 + pxor %mm1,%mm3 + movq %mm2,8(%esp) + paddq %mm5,%mm7 + pxor %mm4,%mm3 + psrlq $23,%mm1 + paddq (%esp),%mm7 + pxor %mm1,%mm3 + psllq $4,%mm4 + pxor %mm4,%mm3 + movq 32(%esp),%mm4 + paddq %mm7,%mm3 + movq %mm2,%mm5 + psrlq $28,%mm5 + paddq %mm3,%mm4 + movq %mm2,%mm6 + movq %mm5,%mm7 + psllq $25,%mm6 + movq 16(%esp),%mm1 + psrlq $6,%mm5 + pxor %mm6,%mm7 + psllq $5,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm2 + psrlq $5,%mm5 + pxor %mm6,%mm7 + pand %mm2,%mm0 + psllq $6,%mm6 + pxor %mm5,%mm7 + pxor %mm1,%mm0 + pxor %mm7,%mm6 + movq 40(%esp),%mm5 + paddq %mm6,%mm0 + movq 48(%esp),%mm6 + movdqa 
%xmm2,-16(%edx) + movq 8(%esp),%mm1 + paddq %mm3,%mm0 + movq 24(%esp),%mm3 + movq 56(%esp),%mm7 + pxor %mm1,%mm2 + paddq (%esi),%mm0 + paddq 8(%esi),%mm1 + paddq 16(%esi),%mm2 + paddq 24(%esi),%mm3 + paddq 32(%esi),%mm4 + paddq 40(%esi),%mm5 + paddq 48(%esi),%mm6 + paddq 56(%esi),%mm7 + movq %mm0,(%esi) + movq %mm1,8(%esi) + movq %mm2,16(%esi) + movq %mm3,24(%esi) + movq %mm4,32(%esi) + movq %mm5,40(%esi) + movq %mm6,48(%esi) + movq %mm7,56(%esi) + cmpl %eax,%edi + jb .L005loop_ssse3 + movl 76(%edx),%esp + emms + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size sha512_block_data_order_ssse3,.-.L_sha512_block_data_order_ssse3_begin +.align 64 +.LK512: +.long 3609767458,1116352408 +.long 602891725,1899447441 +.long 3964484399,3049323471 +.long 2173295548,3921009573 +.long 4081628472,961987163 +.long 3053834265,1508970993 +.long 2937671579,2453635748 +.long 3664609560,2870763221 +.long 2734883394,3624381080 +.long 1164996542,310598401 +.long 1323610764,607225278 +.long 3590304994,1426881987 +.long 4068182383,1925078388 +.long 991336113,2162078206 +.long 633803317,2614888103 +.long 3479774868,3248222580 +.long 2666613458,3835390401 +.long 944711139,4022224774 +.long 2341262773,264347078 +.long 2007800933,604807628 +.long 1495990901,770255983 +.long 1856431235,1249150122 +.long 3175218132,1555081692 +.long 2198950837,1996064986 +.long 3999719339,2554220882 +.long 766784016,2821834349 +.long 2566594879,2952996808 +.long 3203337956,3210313671 +.long 1034457026,3336571891 +.long 2466948901,3584528711 +.long 3758326383,113926993 +.long 168717936,338241895 +.long 1188179964,666307205 +.long 1546045734,773529912 +.long 1522805485,1294757372 +.long 2643833823,1396182291 +.long 2343527390,1695183700 +.long 1014477480,1986661051 +.long 1206759142,2177026350 +.long 344077627,2456956037 +.long 1290863460,2730485921 +.long 3158454273,2820302411 +.long 3505952657,3259730800 +.long 106217008,3345764771 +.long 3606008344,3516065817 +.long 1432725776,3600352804 +.long 
1467031594,4094571909 +.long 851169720,275423344 +.long 3100823752,430227734 +.long 1363258195,506948616 +.long 3750685593,659060556 +.long 3785050280,883997877 +.long 3318307427,958139571 +.long 3812723403,1322822218 +.long 2003034995,1537002063 +.long 3602036899,1747873779 +.long 1575990012,1955562222 +.long 1125592928,2024104815 +.long 2716904306,2227730452 +.long 442776044,2361852424 +.long 593698344,2428436474 +.long 3733110249,2756734187 +.long 2999351573,3204031479 +.long 3815920427,3329325298 +.long 3928383900,3391569614 +.long 566280711,3515267271 +.long 3454069534,3940187606 +.long 4000239992,4118630271 +.long 1914138554,116418474 +.long 2731055270,174292421 +.long 3203993006,289380356 +.long 320620315,460393269 +.long 587496836,685471733 +.long 1086792851,852142971 +.long 365543100,1017036298 +.long 2618297676,1126000580 +.long 3409855158,1288033470 +.long 4234509866,1501505948 +.long 987167468,1607167915 +.long 1246189591,1816402316 +.long 67438087,66051 +.long 202182159,134810123 +.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97 +.byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32 +.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97 +.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 +.byte 62,0 +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) diff --git a/third_party/boringssl/gen/bcm/sha512-586-win.asm b/third_party/boringssl/gen/bcm/sha512-586-win.asm new file mode 100644 index 00000000..2089cf8c --- /dev/null +++ b/third_party/boringssl/gen/bcm/sha512-586-win.asm @@ -0,0 +1,2415 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_internal_x86_win_asm.inc" +%endif +%ifidn __OUTPUT_FORMAT__, win32 +%ifidn __OUTPUT_FORMAT__,obj +section code use32 class=code align=64 +%elifidn __OUTPUT_FORMAT__,win32 +$@feat.00 equ 1 +section .text code align=64 +%else +section .text code +%endif +global _sha512_block_data_order_nohw +align 16 +_sha512_block_data_order_nohw: +L$_sha512_block_data_order_nohw_begin: + push ebp + push ebx + push esi + push edi + mov esi,DWORD [20+esp] + mov edi,DWORD [24+esp] + mov eax,DWORD [28+esp] + mov ebx,esp + call L$000pic_point +L$000pic_point: + pop ebp + lea ebp,[(L$K512-L$000pic_point)+ebp] + sub esp,16 + and esp,-64 + shl eax,7 + add eax,edi + mov DWORD [esp],esi + mov DWORD [4+esp],edi + mov DWORD [8+esp],eax + mov DWORD [12+esp],ebx + movq mm0,[esi] + movq mm1,[8+esi] + movq mm2,[16+esi] + movq mm3,[24+esi] + movq mm4,[32+esi] + movq mm5,[40+esi] + movq mm6,[48+esi] + movq mm7,[56+esi] + sub esp,80 + jmp NEAR L$001loop_sse2 +align 16 +L$001loop_sse2: + movq [8+esp],mm1 + movq [16+esp],mm2 + movq [24+esp],mm3 + movq [40+esp],mm5 + movq [48+esp],mm6 + pxor mm2,mm1 + movq [56+esp],mm7 + movq mm3,mm0 + mov eax,DWORD [edi] + mov ebx,DWORD [4+edi] + add edi,8 + mov edx,15 + bswap eax + bswap ebx + jmp NEAR L$00200_14_sse2 +align 16 +L$00200_14_sse2: + movd mm1,eax + mov eax,DWORD [edi] + movd mm7,ebx + mov ebx,DWORD [4+edi] + add edi,8 + bswap eax + bswap ebx + punpckldq mm7,mm1 + movq mm1,mm4 + pxor mm5,mm6 + psrlq mm1,14 + movq [32+esp],mm4 + pand mm5,mm4 + psllq mm4,23 + movq mm0,mm3 + movq [72+esp],mm7 + movq mm3,mm1 + psrlq mm1,4 + pxor mm5,mm6 + pxor mm3,mm4 + psllq mm4,23 + pxor mm3,mm1 + movq [esp],mm0 + paddq mm7,mm5 + pxor mm3,mm4 + psrlq mm1,23 + paddq mm7,[56+esp] + pxor mm3,mm1 + psllq mm4,4 + paddq mm7,[ebp] + pxor mm3,mm4 + movq mm4,[24+esp] + paddq mm3,mm7 + movq mm5,mm0 + psrlq mm5,28 + paddq mm4,mm3 + movq mm6,mm0 + movq mm7,mm5 + psllq mm6,25 + movq mm1,[8+esp] + psrlq mm5,6 + pxor mm7,mm6 + 
sub esp,8 + psllq mm6,5 + pxor mm7,mm5 + pxor mm0,mm1 + psrlq mm5,5 + pxor mm7,mm6 + pand mm2,mm0 + psllq mm6,6 + pxor mm7,mm5 + pxor mm2,mm1 + pxor mm6,mm7 + movq mm5,[40+esp] + paddq mm3,mm2 + movq mm2,mm0 + add ebp,8 + paddq mm3,mm6 + movq mm6,[48+esp] + dec edx + jnz NEAR L$00200_14_sse2 + movd mm1,eax + movd mm7,ebx + punpckldq mm7,mm1 + movq mm1,mm4 + pxor mm5,mm6 + psrlq mm1,14 + movq [32+esp],mm4 + pand mm5,mm4 + psllq mm4,23 + movq mm0,mm3 + movq [72+esp],mm7 + movq mm3,mm1 + psrlq mm1,4 + pxor mm5,mm6 + pxor mm3,mm4 + psllq mm4,23 + pxor mm3,mm1 + movq [esp],mm0 + paddq mm7,mm5 + pxor mm3,mm4 + psrlq mm1,23 + paddq mm7,[56+esp] + pxor mm3,mm1 + psllq mm4,4 + paddq mm7,[ebp] + pxor mm3,mm4 + movq mm4,[24+esp] + paddq mm3,mm7 + movq mm5,mm0 + psrlq mm5,28 + paddq mm4,mm3 + movq mm6,mm0 + movq mm7,mm5 + psllq mm6,25 + movq mm1,[8+esp] + psrlq mm5,6 + pxor mm7,mm6 + sub esp,8 + psllq mm6,5 + pxor mm7,mm5 + pxor mm0,mm1 + psrlq mm5,5 + pxor mm7,mm6 + pand mm2,mm0 + psllq mm6,6 + pxor mm7,mm5 + pxor mm2,mm1 + pxor mm6,mm7 + movq mm7,[192+esp] + paddq mm3,mm2 + movq mm2,mm0 + add ebp,8 + paddq mm3,mm6 + pxor mm0,mm0 + mov edx,32 + jmp NEAR L$00316_79_sse2 +align 16 +L$00316_79_sse2: + movq mm5,[88+esp] + movq mm1,mm7 + psrlq mm7,1 + movq mm6,mm5 + psrlq mm5,6 + psllq mm1,56 + paddq mm0,mm3 + movq mm3,mm7 + psrlq mm7,6 + pxor mm3,mm1 + psllq mm1,7 + pxor mm3,mm7 + psrlq mm7,1 + pxor mm3,mm1 + movq mm1,mm5 + psrlq mm5,13 + pxor mm7,mm3 + psllq mm6,3 + pxor mm1,mm5 + paddq mm7,[200+esp] + pxor mm1,mm6 + psrlq mm5,42 + paddq mm7,[128+esp] + pxor mm1,mm5 + psllq mm6,42 + movq mm5,[40+esp] + pxor mm1,mm6 + movq mm6,[48+esp] + paddq mm7,mm1 + movq mm1,mm4 + pxor mm5,mm6 + psrlq mm1,14 + movq [32+esp],mm4 + pand mm5,mm4 + psllq mm4,23 + movq [72+esp],mm7 + movq mm3,mm1 + psrlq mm1,4 + pxor mm5,mm6 + pxor mm3,mm4 + psllq mm4,23 + pxor mm3,mm1 + movq [esp],mm0 + paddq mm7,mm5 + pxor mm3,mm4 + psrlq mm1,23 + paddq mm7,[56+esp] + pxor mm3,mm1 + psllq mm4,4 + paddq mm7,[ebp] 
+ pxor mm3,mm4 + movq mm4,[24+esp] + paddq mm3,mm7 + movq mm5,mm0 + psrlq mm5,28 + paddq mm4,mm3 + movq mm6,mm0 + movq mm7,mm5 + psllq mm6,25 + movq mm1,[8+esp] + psrlq mm5,6 + pxor mm7,mm6 + sub esp,8 + psllq mm6,5 + pxor mm7,mm5 + pxor mm0,mm1 + psrlq mm5,5 + pxor mm7,mm6 + pand mm2,mm0 + psllq mm6,6 + pxor mm7,mm5 + pxor mm2,mm1 + pxor mm6,mm7 + movq mm7,[192+esp] + paddq mm2,mm6 + add ebp,8 + movq mm5,[88+esp] + movq mm1,mm7 + psrlq mm7,1 + movq mm6,mm5 + psrlq mm5,6 + psllq mm1,56 + paddq mm2,mm3 + movq mm3,mm7 + psrlq mm7,6 + pxor mm3,mm1 + psllq mm1,7 + pxor mm3,mm7 + psrlq mm7,1 + pxor mm3,mm1 + movq mm1,mm5 + psrlq mm5,13 + pxor mm7,mm3 + psllq mm6,3 + pxor mm1,mm5 + paddq mm7,[200+esp] + pxor mm1,mm6 + psrlq mm5,42 + paddq mm7,[128+esp] + pxor mm1,mm5 + psllq mm6,42 + movq mm5,[40+esp] + pxor mm1,mm6 + movq mm6,[48+esp] + paddq mm7,mm1 + movq mm1,mm4 + pxor mm5,mm6 + psrlq mm1,14 + movq [32+esp],mm4 + pand mm5,mm4 + psllq mm4,23 + movq [72+esp],mm7 + movq mm3,mm1 + psrlq mm1,4 + pxor mm5,mm6 + pxor mm3,mm4 + psllq mm4,23 + pxor mm3,mm1 + movq [esp],mm2 + paddq mm7,mm5 + pxor mm3,mm4 + psrlq mm1,23 + paddq mm7,[56+esp] + pxor mm3,mm1 + psllq mm4,4 + paddq mm7,[ebp] + pxor mm3,mm4 + movq mm4,[24+esp] + paddq mm3,mm7 + movq mm5,mm2 + psrlq mm5,28 + paddq mm4,mm3 + movq mm6,mm2 + movq mm7,mm5 + psllq mm6,25 + movq mm1,[8+esp] + psrlq mm5,6 + pxor mm7,mm6 + sub esp,8 + psllq mm6,5 + pxor mm7,mm5 + pxor mm2,mm1 + psrlq mm5,5 + pxor mm7,mm6 + pand mm0,mm2 + psllq mm6,6 + pxor mm7,mm5 + pxor mm0,mm1 + pxor mm6,mm7 + movq mm7,[192+esp] + paddq mm0,mm6 + add ebp,8 + dec edx + jnz NEAR L$00316_79_sse2 + paddq mm0,mm3 + movq mm1,[8+esp] + movq mm3,[24+esp] + movq mm5,[40+esp] + movq mm6,[48+esp] + movq mm7,[56+esp] + pxor mm2,mm1 + paddq mm0,[esi] + paddq mm1,[8+esi] + paddq mm2,[16+esi] + paddq mm3,[24+esi] + paddq mm4,[32+esi] + paddq mm5,[40+esi] + paddq mm6,[48+esi] + paddq mm7,[56+esi] + mov eax,640 + movq [esi],mm0 + movq [8+esi],mm1 + movq [16+esi],mm2 + movq 
[24+esi],mm3 + movq [32+esi],mm4 + movq [40+esi],mm5 + movq [48+esi],mm6 + movq [56+esi],mm7 + lea esp,[eax*1+esp] + sub ebp,eax + cmp edi,DWORD [88+esp] + jb NEAR L$001loop_sse2 + mov esp,DWORD [92+esp] + emms + pop edi + pop esi + pop ebx + pop ebp + ret +global _sha512_block_data_order_ssse3 +align 16 +_sha512_block_data_order_ssse3: +L$_sha512_block_data_order_ssse3_begin: + push ebp + push ebx + push esi + push edi + mov esi,DWORD [20+esp] + mov edi,DWORD [24+esp] + mov eax,DWORD [28+esp] + mov ebx,esp + call L$004pic_point +L$004pic_point: + pop ebp + lea ebp,[(L$K512-L$004pic_point)+ebp] + sub esp,16 + and esp,-64 + shl eax,7 + add eax,edi + mov DWORD [esp],esi + mov DWORD [4+esp],edi + mov DWORD [8+esp],eax + mov DWORD [12+esp],ebx + movq mm0,[esi] + movq mm1,[8+esi] + movq mm2,[16+esi] + movq mm3,[24+esi] + movq mm4,[32+esi] + movq mm5,[40+esi] + movq mm6,[48+esi] + movq mm7,[56+esi] + lea edx,[esp-64] + sub esp,256 + movdqa xmm1,[640+ebp] + movdqu xmm0,[edi] + pshufb xmm0,xmm1 + movdqa xmm3,[ebp] + movdqa xmm2,xmm1 + movdqu xmm1,[16+edi] + paddq xmm3,xmm0 + pshufb xmm1,xmm2 + movdqa [edx-128],xmm3 + movdqa xmm4,[16+ebp] + movdqa xmm3,xmm2 + movdqu xmm2,[32+edi] + paddq xmm4,xmm1 + pshufb xmm2,xmm3 + movdqa [edx-112],xmm4 + movdqa xmm5,[32+ebp] + movdqa xmm4,xmm3 + movdqu xmm3,[48+edi] + paddq xmm5,xmm2 + pshufb xmm3,xmm4 + movdqa [edx-96],xmm5 + movdqa xmm6,[48+ebp] + movdqa xmm5,xmm4 + movdqu xmm4,[64+edi] + paddq xmm6,xmm3 + pshufb xmm4,xmm5 + movdqa [edx-80],xmm6 + movdqa xmm7,[64+ebp] + movdqa xmm6,xmm5 + movdqu xmm5,[80+edi] + paddq xmm7,xmm4 + pshufb xmm5,xmm6 + movdqa [edx-64],xmm7 + movdqa [edx],xmm0 + movdqa xmm0,[80+ebp] + movdqa xmm7,xmm6 + movdqu xmm6,[96+edi] + paddq xmm0,xmm5 + pshufb xmm6,xmm7 + movdqa [edx-48],xmm0 + movdqa [16+edx],xmm1 + movdqa xmm1,[96+ebp] + movdqa xmm0,xmm7 + movdqu xmm7,[112+edi] + paddq xmm1,xmm6 + pshufb xmm7,xmm0 + movdqa [edx-32],xmm1 + movdqa [32+edx],xmm2 + movdqa xmm2,[112+ebp] + movdqa xmm0,[edx] + paddq 
xmm2,xmm7 + movdqa [edx-16],xmm2 + nop +align 32 +L$005loop_ssse3: + movdqa xmm2,[16+edx] + movdqa [48+edx],xmm3 + lea ebp,[128+ebp] + movq [8+esp],mm1 + mov ebx,edi + movq [16+esp],mm2 + lea edi,[128+edi] + movq [24+esp],mm3 + cmp edi,eax + movq [40+esp],mm5 + cmovb ebx,edi + movq [48+esp],mm6 + mov ecx,4 + pxor mm2,mm1 + movq [56+esp],mm7 + pxor mm3,mm3 + jmp NEAR L$00600_47_ssse3 +align 32 +L$00600_47_ssse3: + movdqa xmm3,xmm5 + movdqa xmm1,xmm2 + palignr xmm2,xmm0,8 + movdqa [edx],xmm4 + palignr xmm3,xmm4,8 + movdqa xmm4,xmm2 + psrlq xmm2,7 + paddq xmm0,xmm3 + movdqa xmm3,xmm4 + psrlq xmm4,1 + psllq xmm3,56 + pxor xmm2,xmm4 + psrlq xmm4,7 + pxor xmm2,xmm3 + psllq xmm3,7 + pxor xmm2,xmm4 + movdqa xmm4,xmm7 + pxor xmm2,xmm3 + movdqa xmm3,xmm7 + psrlq xmm4,6 + paddq xmm0,xmm2 + movdqa xmm2,xmm7 + psrlq xmm3,19 + psllq xmm2,3 + pxor xmm4,xmm3 + psrlq xmm3,42 + pxor xmm4,xmm2 + psllq xmm2,42 + pxor xmm4,xmm3 + movdqa xmm3,[32+edx] + pxor xmm4,xmm2 + movdqa xmm2,[ebp] + movq mm1,mm4 + paddq xmm0,xmm4 + movq mm7,[edx-128] + pxor mm5,mm6 + psrlq mm1,14 + movq [32+esp],mm4 + paddq xmm2,xmm0 + pand mm5,mm4 + psllq mm4,23 + paddq mm0,mm3 + movq mm3,mm1 + psrlq mm1,4 + pxor mm5,mm6 + pxor mm3,mm4 + psllq mm4,23 + pxor mm3,mm1 + movq [esp],mm0 + paddq mm7,mm5 + pxor mm3,mm4 + psrlq mm1,23 + paddq mm7,[56+esp] + pxor mm3,mm1 + psllq mm4,4 + pxor mm3,mm4 + movq mm4,[24+esp] + paddq mm3,mm7 + movq mm5,mm0 + psrlq mm5,28 + paddq mm4,mm3 + movq mm6,mm0 + movq mm7,mm5 + psllq mm6,25 + movq mm1,[8+esp] + psrlq mm5,6 + pxor mm7,mm6 + psllq mm6,5 + pxor mm7,mm5 + pxor mm0,mm1 + psrlq mm5,5 + pxor mm7,mm6 + pand mm2,mm0 + psllq mm6,6 + pxor mm7,mm5 + pxor mm2,mm1 + pxor mm6,mm7 + movq mm5,[32+esp] + paddq mm2,mm6 + movq mm6,[40+esp] + movq mm1,mm4 + movq mm7,[edx-120] + pxor mm5,mm6 + psrlq mm1,14 + movq [24+esp],mm4 + pand mm5,mm4 + psllq mm4,23 + paddq mm2,mm3 + movq mm3,mm1 + psrlq mm1,4 + pxor mm5,mm6 + pxor mm3,mm4 + psllq mm4,23 + pxor mm3,mm1 + movq [56+esp],mm2 + paddq 
mm7,mm5 + pxor mm3,mm4 + psrlq mm1,23 + paddq mm7,[48+esp] + pxor mm3,mm1 + psllq mm4,4 + pxor mm3,mm4 + movq mm4,[16+esp] + paddq mm3,mm7 + movq mm5,mm2 + psrlq mm5,28 + paddq mm4,mm3 + movq mm6,mm2 + movq mm7,mm5 + psllq mm6,25 + movq mm1,[esp] + psrlq mm5,6 + pxor mm7,mm6 + psllq mm6,5 + pxor mm7,mm5 + pxor mm2,mm1 + psrlq mm5,5 + pxor mm7,mm6 + pand mm0,mm2 + psllq mm6,6 + pxor mm7,mm5 + pxor mm0,mm1 + pxor mm6,mm7 + movq mm5,[24+esp] + paddq mm0,mm6 + movq mm6,[32+esp] + movdqa [edx-128],xmm2 + movdqa xmm4,xmm6 + movdqa xmm2,xmm3 + palignr xmm3,xmm1,8 + movdqa [16+edx],xmm5 + palignr xmm4,xmm5,8 + movdqa xmm5,xmm3 + psrlq xmm3,7 + paddq xmm1,xmm4 + movdqa xmm4,xmm5 + psrlq xmm5,1 + psllq xmm4,56 + pxor xmm3,xmm5 + psrlq xmm5,7 + pxor xmm3,xmm4 + psllq xmm4,7 + pxor xmm3,xmm5 + movdqa xmm5,xmm0 + pxor xmm3,xmm4 + movdqa xmm4,xmm0 + psrlq xmm5,6 + paddq xmm1,xmm3 + movdqa xmm3,xmm0 + psrlq xmm4,19 + psllq xmm3,3 + pxor xmm5,xmm4 + psrlq xmm4,42 + pxor xmm5,xmm3 + psllq xmm3,42 + pxor xmm5,xmm4 + movdqa xmm4,[48+edx] + pxor xmm5,xmm3 + movdqa xmm3,[16+ebp] + movq mm1,mm4 + paddq xmm1,xmm5 + movq mm7,[edx-112] + pxor mm5,mm6 + psrlq mm1,14 + movq [16+esp],mm4 + paddq xmm3,xmm1 + pand mm5,mm4 + psllq mm4,23 + paddq mm0,mm3 + movq mm3,mm1 + psrlq mm1,4 + pxor mm5,mm6 + pxor mm3,mm4 + psllq mm4,23 + pxor mm3,mm1 + movq [48+esp],mm0 + paddq mm7,mm5 + pxor mm3,mm4 + psrlq mm1,23 + paddq mm7,[40+esp] + pxor mm3,mm1 + psllq mm4,4 + pxor mm3,mm4 + movq mm4,[8+esp] + paddq mm3,mm7 + movq mm5,mm0 + psrlq mm5,28 + paddq mm4,mm3 + movq mm6,mm0 + movq mm7,mm5 + psllq mm6,25 + movq mm1,[56+esp] + psrlq mm5,6 + pxor mm7,mm6 + psllq mm6,5 + pxor mm7,mm5 + pxor mm0,mm1 + psrlq mm5,5 + pxor mm7,mm6 + pand mm2,mm0 + psllq mm6,6 + pxor mm7,mm5 + pxor mm2,mm1 + pxor mm6,mm7 + movq mm5,[16+esp] + paddq mm2,mm6 + movq mm6,[24+esp] + movq mm1,mm4 + movq mm7,[edx-104] + pxor mm5,mm6 + psrlq mm1,14 + movq [8+esp],mm4 + pand mm5,mm4 + psllq mm4,23 + paddq mm2,mm3 + movq mm3,mm1 + psrlq 
mm1,4 + pxor mm5,mm6 + pxor mm3,mm4 + psllq mm4,23 + pxor mm3,mm1 + movq [40+esp],mm2 + paddq mm7,mm5 + pxor mm3,mm4 + psrlq mm1,23 + paddq mm7,[32+esp] + pxor mm3,mm1 + psllq mm4,4 + pxor mm3,mm4 + movq mm4,[esp] + paddq mm3,mm7 + movq mm5,mm2 + psrlq mm5,28 + paddq mm4,mm3 + movq mm6,mm2 + movq mm7,mm5 + psllq mm6,25 + movq mm1,[48+esp] + psrlq mm5,6 + pxor mm7,mm6 + psllq mm6,5 + pxor mm7,mm5 + pxor mm2,mm1 + psrlq mm5,5 + pxor mm7,mm6 + pand mm0,mm2 + psllq mm6,6 + pxor mm7,mm5 + pxor mm0,mm1 + pxor mm6,mm7 + movq mm5,[8+esp] + paddq mm0,mm6 + movq mm6,[16+esp] + movdqa [edx-112],xmm3 + movdqa xmm5,xmm7 + movdqa xmm3,xmm4 + palignr xmm4,xmm2,8 + movdqa [32+edx],xmm6 + palignr xmm5,xmm6,8 + movdqa xmm6,xmm4 + psrlq xmm4,7 + paddq xmm2,xmm5 + movdqa xmm5,xmm6 + psrlq xmm6,1 + psllq xmm5,56 + pxor xmm4,xmm6 + psrlq xmm6,7 + pxor xmm4,xmm5 + psllq xmm5,7 + pxor xmm4,xmm6 + movdqa xmm6,xmm1 + pxor xmm4,xmm5 + movdqa xmm5,xmm1 + psrlq xmm6,6 + paddq xmm2,xmm4 + movdqa xmm4,xmm1 + psrlq xmm5,19 + psllq xmm4,3 + pxor xmm6,xmm5 + psrlq xmm5,42 + pxor xmm6,xmm4 + psllq xmm4,42 + pxor xmm6,xmm5 + movdqa xmm5,[edx] + pxor xmm6,xmm4 + movdqa xmm4,[32+ebp] + movq mm1,mm4 + paddq xmm2,xmm6 + movq mm7,[edx-96] + pxor mm5,mm6 + psrlq mm1,14 + movq [esp],mm4 + paddq xmm4,xmm2 + pand mm5,mm4 + psllq mm4,23 + paddq mm0,mm3 + movq mm3,mm1 + psrlq mm1,4 + pxor mm5,mm6 + pxor mm3,mm4 + psllq mm4,23 + pxor mm3,mm1 + movq [32+esp],mm0 + paddq mm7,mm5 + pxor mm3,mm4 + psrlq mm1,23 + paddq mm7,[24+esp] + pxor mm3,mm1 + psllq mm4,4 + pxor mm3,mm4 + movq mm4,[56+esp] + paddq mm3,mm7 + movq mm5,mm0 + psrlq mm5,28 + paddq mm4,mm3 + movq mm6,mm0 + movq mm7,mm5 + psllq mm6,25 + movq mm1,[40+esp] + psrlq mm5,6 + pxor mm7,mm6 + psllq mm6,5 + pxor mm7,mm5 + pxor mm0,mm1 + psrlq mm5,5 + pxor mm7,mm6 + pand mm2,mm0 + psllq mm6,6 + pxor mm7,mm5 + pxor mm2,mm1 + pxor mm6,mm7 + movq mm5,[esp] + paddq mm2,mm6 + movq mm6,[8+esp] + movq mm1,mm4 + movq mm7,[edx-88] + pxor mm5,mm6 + psrlq mm1,14 + movq 
[56+esp],mm4 + pand mm5,mm4 + psllq mm4,23 + paddq mm2,mm3 + movq mm3,mm1 + psrlq mm1,4 + pxor mm5,mm6 + pxor mm3,mm4 + psllq mm4,23 + pxor mm3,mm1 + movq [24+esp],mm2 + paddq mm7,mm5 + pxor mm3,mm4 + psrlq mm1,23 + paddq mm7,[16+esp] + pxor mm3,mm1 + psllq mm4,4 + pxor mm3,mm4 + movq mm4,[48+esp] + paddq mm3,mm7 + movq mm5,mm2 + psrlq mm5,28 + paddq mm4,mm3 + movq mm6,mm2 + movq mm7,mm5 + psllq mm6,25 + movq mm1,[32+esp] + psrlq mm5,6 + pxor mm7,mm6 + psllq mm6,5 + pxor mm7,mm5 + pxor mm2,mm1 + psrlq mm5,5 + pxor mm7,mm6 + pand mm0,mm2 + psllq mm6,6 + pxor mm7,mm5 + pxor mm0,mm1 + pxor mm6,mm7 + movq mm5,[56+esp] + paddq mm0,mm6 + movq mm6,[esp] + movdqa [edx-96],xmm4 + movdqa xmm6,xmm0 + movdqa xmm4,xmm5 + palignr xmm5,xmm3,8 + movdqa [48+edx],xmm7 + palignr xmm6,xmm7,8 + movdqa xmm7,xmm5 + psrlq xmm5,7 + paddq xmm3,xmm6 + movdqa xmm6,xmm7 + psrlq xmm7,1 + psllq xmm6,56 + pxor xmm5,xmm7 + psrlq xmm7,7 + pxor xmm5,xmm6 + psllq xmm6,7 + pxor xmm5,xmm7 + movdqa xmm7,xmm2 + pxor xmm5,xmm6 + movdqa xmm6,xmm2 + psrlq xmm7,6 + paddq xmm3,xmm5 + movdqa xmm5,xmm2 + psrlq xmm6,19 + psllq xmm5,3 + pxor xmm7,xmm6 + psrlq xmm6,42 + pxor xmm7,xmm5 + psllq xmm5,42 + pxor xmm7,xmm6 + movdqa xmm6,[16+edx] + pxor xmm7,xmm5 + movdqa xmm5,[48+ebp] + movq mm1,mm4 + paddq xmm3,xmm7 + movq mm7,[edx-80] + pxor mm5,mm6 + psrlq mm1,14 + movq [48+esp],mm4 + paddq xmm5,xmm3 + pand mm5,mm4 + psllq mm4,23 + paddq mm0,mm3 + movq mm3,mm1 + psrlq mm1,4 + pxor mm5,mm6 + pxor mm3,mm4 + psllq mm4,23 + pxor mm3,mm1 + movq [16+esp],mm0 + paddq mm7,mm5 + pxor mm3,mm4 + psrlq mm1,23 + paddq mm7,[8+esp] + pxor mm3,mm1 + psllq mm4,4 + pxor mm3,mm4 + movq mm4,[40+esp] + paddq mm3,mm7 + movq mm5,mm0 + psrlq mm5,28 + paddq mm4,mm3 + movq mm6,mm0 + movq mm7,mm5 + psllq mm6,25 + movq mm1,[24+esp] + psrlq mm5,6 + pxor mm7,mm6 + psllq mm6,5 + pxor mm7,mm5 + pxor mm0,mm1 + psrlq mm5,5 + pxor mm7,mm6 + pand mm2,mm0 + psllq mm6,6 + pxor mm7,mm5 + pxor mm2,mm1 + pxor mm6,mm7 + movq mm5,[48+esp] + paddq mm2,mm6 + 
movq mm6,[56+esp] + movq mm1,mm4 + movq mm7,[edx-72] + pxor mm5,mm6 + psrlq mm1,14 + movq [40+esp],mm4 + pand mm5,mm4 + psllq mm4,23 + paddq mm2,mm3 + movq mm3,mm1 + psrlq mm1,4 + pxor mm5,mm6 + pxor mm3,mm4 + psllq mm4,23 + pxor mm3,mm1 + movq [8+esp],mm2 + paddq mm7,mm5 + pxor mm3,mm4 + psrlq mm1,23 + paddq mm7,[esp] + pxor mm3,mm1 + psllq mm4,4 + pxor mm3,mm4 + movq mm4,[32+esp] + paddq mm3,mm7 + movq mm5,mm2 + psrlq mm5,28 + paddq mm4,mm3 + movq mm6,mm2 + movq mm7,mm5 + psllq mm6,25 + movq mm1,[16+esp] + psrlq mm5,6 + pxor mm7,mm6 + psllq mm6,5 + pxor mm7,mm5 + pxor mm2,mm1 + psrlq mm5,5 + pxor mm7,mm6 + pand mm0,mm2 + psllq mm6,6 + pxor mm7,mm5 + pxor mm0,mm1 + pxor mm6,mm7 + movq mm5,[40+esp] + paddq mm0,mm6 + movq mm6,[48+esp] + movdqa [edx-80],xmm5 + movdqa xmm7,xmm1 + movdqa xmm5,xmm6 + palignr xmm6,xmm4,8 + movdqa [edx],xmm0 + palignr xmm7,xmm0,8 + movdqa xmm0,xmm6 + psrlq xmm6,7 + paddq xmm4,xmm7 + movdqa xmm7,xmm0 + psrlq xmm0,1 + psllq xmm7,56 + pxor xmm6,xmm0 + psrlq xmm0,7 + pxor xmm6,xmm7 + psllq xmm7,7 + pxor xmm6,xmm0 + movdqa xmm0,xmm3 + pxor xmm6,xmm7 + movdqa xmm7,xmm3 + psrlq xmm0,6 + paddq xmm4,xmm6 + movdqa xmm6,xmm3 + psrlq xmm7,19 + psllq xmm6,3 + pxor xmm0,xmm7 + psrlq xmm7,42 + pxor xmm0,xmm6 + psllq xmm6,42 + pxor xmm0,xmm7 + movdqa xmm7,[32+edx] + pxor xmm0,xmm6 + movdqa xmm6,[64+ebp] + movq mm1,mm4 + paddq xmm4,xmm0 + movq mm7,[edx-64] + pxor mm5,mm6 + psrlq mm1,14 + movq [32+esp],mm4 + paddq xmm6,xmm4 + pand mm5,mm4 + psllq mm4,23 + paddq mm0,mm3 + movq mm3,mm1 + psrlq mm1,4 + pxor mm5,mm6 + pxor mm3,mm4 + psllq mm4,23 + pxor mm3,mm1 + movq [esp],mm0 + paddq mm7,mm5 + pxor mm3,mm4 + psrlq mm1,23 + paddq mm7,[56+esp] + pxor mm3,mm1 + psllq mm4,4 + pxor mm3,mm4 + movq mm4,[24+esp] + paddq mm3,mm7 + movq mm5,mm0 + psrlq mm5,28 + paddq mm4,mm3 + movq mm6,mm0 + movq mm7,mm5 + psllq mm6,25 + movq mm1,[8+esp] + psrlq mm5,6 + pxor mm7,mm6 + psllq mm6,5 + pxor mm7,mm5 + pxor mm0,mm1 + psrlq mm5,5 + pxor mm7,mm6 + pand mm2,mm0 + psllq mm6,6 + 
pxor mm7,mm5 + pxor mm2,mm1 + pxor mm6,mm7 + movq mm5,[32+esp] + paddq mm2,mm6 + movq mm6,[40+esp] + movq mm1,mm4 + movq mm7,[edx-56] + pxor mm5,mm6 + psrlq mm1,14 + movq [24+esp],mm4 + pand mm5,mm4 + psllq mm4,23 + paddq mm2,mm3 + movq mm3,mm1 + psrlq mm1,4 + pxor mm5,mm6 + pxor mm3,mm4 + psllq mm4,23 + pxor mm3,mm1 + movq [56+esp],mm2 + paddq mm7,mm5 + pxor mm3,mm4 + psrlq mm1,23 + paddq mm7,[48+esp] + pxor mm3,mm1 + psllq mm4,4 + pxor mm3,mm4 + movq mm4,[16+esp] + paddq mm3,mm7 + movq mm5,mm2 + psrlq mm5,28 + paddq mm4,mm3 + movq mm6,mm2 + movq mm7,mm5 + psllq mm6,25 + movq mm1,[esp] + psrlq mm5,6 + pxor mm7,mm6 + psllq mm6,5 + pxor mm7,mm5 + pxor mm2,mm1 + psrlq mm5,5 + pxor mm7,mm6 + pand mm0,mm2 + psllq mm6,6 + pxor mm7,mm5 + pxor mm0,mm1 + pxor mm6,mm7 + movq mm5,[24+esp] + paddq mm0,mm6 + movq mm6,[32+esp] + movdqa [edx-64],xmm6 + movdqa xmm0,xmm2 + movdqa xmm6,xmm7 + palignr xmm7,xmm5,8 + movdqa [16+edx],xmm1 + palignr xmm0,xmm1,8 + movdqa xmm1,xmm7 + psrlq xmm7,7 + paddq xmm5,xmm0 + movdqa xmm0,xmm1 + psrlq xmm1,1 + psllq xmm0,56 + pxor xmm7,xmm1 + psrlq xmm1,7 + pxor xmm7,xmm0 + psllq xmm0,7 + pxor xmm7,xmm1 + movdqa xmm1,xmm4 + pxor xmm7,xmm0 + movdqa xmm0,xmm4 + psrlq xmm1,6 + paddq xmm5,xmm7 + movdqa xmm7,xmm4 + psrlq xmm0,19 + psllq xmm7,3 + pxor xmm1,xmm0 + psrlq xmm0,42 + pxor xmm1,xmm7 + psllq xmm7,42 + pxor xmm1,xmm0 + movdqa xmm0,[48+edx] + pxor xmm1,xmm7 + movdqa xmm7,[80+ebp] + movq mm1,mm4 + paddq xmm5,xmm1 + movq mm7,[edx-48] + pxor mm5,mm6 + psrlq mm1,14 + movq [16+esp],mm4 + paddq xmm7,xmm5 + pand mm5,mm4 + psllq mm4,23 + paddq mm0,mm3 + movq mm3,mm1 + psrlq mm1,4 + pxor mm5,mm6 + pxor mm3,mm4 + psllq mm4,23 + pxor mm3,mm1 + movq [48+esp],mm0 + paddq mm7,mm5 + pxor mm3,mm4 + psrlq mm1,23 + paddq mm7,[40+esp] + pxor mm3,mm1 + psllq mm4,4 + pxor mm3,mm4 + movq mm4,[8+esp] + paddq mm3,mm7 + movq mm5,mm0 + psrlq mm5,28 + paddq mm4,mm3 + movq mm6,mm0 + movq mm7,mm5 + psllq mm6,25 + movq mm1,[56+esp] + psrlq mm5,6 + pxor mm7,mm6 + psllq mm6,5 + 
pxor mm7,mm5 + pxor mm0,mm1 + psrlq mm5,5 + pxor mm7,mm6 + pand mm2,mm0 + psllq mm6,6 + pxor mm7,mm5 + pxor mm2,mm1 + pxor mm6,mm7 + movq mm5,[16+esp] + paddq mm2,mm6 + movq mm6,[24+esp] + movq mm1,mm4 + movq mm7,[edx-40] + pxor mm5,mm6 + psrlq mm1,14 + movq [8+esp],mm4 + pand mm5,mm4 + psllq mm4,23 + paddq mm2,mm3 + movq mm3,mm1 + psrlq mm1,4 + pxor mm5,mm6 + pxor mm3,mm4 + psllq mm4,23 + pxor mm3,mm1 + movq [40+esp],mm2 + paddq mm7,mm5 + pxor mm3,mm4 + psrlq mm1,23 + paddq mm7,[32+esp] + pxor mm3,mm1 + psllq mm4,4 + pxor mm3,mm4 + movq mm4,[esp] + paddq mm3,mm7 + movq mm5,mm2 + psrlq mm5,28 + paddq mm4,mm3 + movq mm6,mm2 + movq mm7,mm5 + psllq mm6,25 + movq mm1,[48+esp] + psrlq mm5,6 + pxor mm7,mm6 + psllq mm6,5 + pxor mm7,mm5 + pxor mm2,mm1 + psrlq mm5,5 + pxor mm7,mm6 + pand mm0,mm2 + psllq mm6,6 + pxor mm7,mm5 + pxor mm0,mm1 + pxor mm6,mm7 + movq mm5,[8+esp] + paddq mm0,mm6 + movq mm6,[16+esp] + movdqa [edx-48],xmm7 + movdqa xmm1,xmm3 + movdqa xmm7,xmm0 + palignr xmm0,xmm6,8 + movdqa [32+edx],xmm2 + palignr xmm1,xmm2,8 + movdqa xmm2,xmm0 + psrlq xmm0,7 + paddq xmm6,xmm1 + movdqa xmm1,xmm2 + psrlq xmm2,1 + psllq xmm1,56 + pxor xmm0,xmm2 + psrlq xmm2,7 + pxor xmm0,xmm1 + psllq xmm1,7 + pxor xmm0,xmm2 + movdqa xmm2,xmm5 + pxor xmm0,xmm1 + movdqa xmm1,xmm5 + psrlq xmm2,6 + paddq xmm6,xmm0 + movdqa xmm0,xmm5 + psrlq xmm1,19 + psllq xmm0,3 + pxor xmm2,xmm1 + psrlq xmm1,42 + pxor xmm2,xmm0 + psllq xmm0,42 + pxor xmm2,xmm1 + movdqa xmm1,[edx] + pxor xmm2,xmm0 + movdqa xmm0,[96+ebp] + movq mm1,mm4 + paddq xmm6,xmm2 + movq mm7,[edx-32] + pxor mm5,mm6 + psrlq mm1,14 + movq [esp],mm4 + paddq xmm0,xmm6 + pand mm5,mm4 + psllq mm4,23 + paddq mm0,mm3 + movq mm3,mm1 + psrlq mm1,4 + pxor mm5,mm6 + pxor mm3,mm4 + psllq mm4,23 + pxor mm3,mm1 + movq [32+esp],mm0 + paddq mm7,mm5 + pxor mm3,mm4 + psrlq mm1,23 + paddq mm7,[24+esp] + pxor mm3,mm1 + psllq mm4,4 + pxor mm3,mm4 + movq mm4,[56+esp] + paddq mm3,mm7 + movq mm5,mm0 + psrlq mm5,28 + paddq mm4,mm3 + movq mm6,mm0 + movq mm7,mm5 
+ psllq mm6,25 + movq mm1,[40+esp] + psrlq mm5,6 + pxor mm7,mm6 + psllq mm6,5 + pxor mm7,mm5 + pxor mm0,mm1 + psrlq mm5,5 + pxor mm7,mm6 + pand mm2,mm0 + psllq mm6,6 + pxor mm7,mm5 + pxor mm2,mm1 + pxor mm6,mm7 + movq mm5,[esp] + paddq mm2,mm6 + movq mm6,[8+esp] + movq mm1,mm4 + movq mm7,[edx-24] + pxor mm5,mm6 + psrlq mm1,14 + movq [56+esp],mm4 + pand mm5,mm4 + psllq mm4,23 + paddq mm2,mm3 + movq mm3,mm1 + psrlq mm1,4 + pxor mm5,mm6 + pxor mm3,mm4 + psllq mm4,23 + pxor mm3,mm1 + movq [24+esp],mm2 + paddq mm7,mm5 + pxor mm3,mm4 + psrlq mm1,23 + paddq mm7,[16+esp] + pxor mm3,mm1 + psllq mm4,4 + pxor mm3,mm4 + movq mm4,[48+esp] + paddq mm3,mm7 + movq mm5,mm2 + psrlq mm5,28 + paddq mm4,mm3 + movq mm6,mm2 + movq mm7,mm5 + psllq mm6,25 + movq mm1,[32+esp] + psrlq mm5,6 + pxor mm7,mm6 + psllq mm6,5 + pxor mm7,mm5 + pxor mm2,mm1 + psrlq mm5,5 + pxor mm7,mm6 + pand mm0,mm2 + psllq mm6,6 + pxor mm7,mm5 + pxor mm0,mm1 + pxor mm6,mm7 + movq mm5,[56+esp] + paddq mm0,mm6 + movq mm6,[esp] + movdqa [edx-32],xmm0 + movdqa xmm2,xmm4 + movdqa xmm0,xmm1 + palignr xmm1,xmm7,8 + movdqa [48+edx],xmm3 + palignr xmm2,xmm3,8 + movdqa xmm3,xmm1 + psrlq xmm1,7 + paddq xmm7,xmm2 + movdqa xmm2,xmm3 + psrlq xmm3,1 + psllq xmm2,56 + pxor xmm1,xmm3 + psrlq xmm3,7 + pxor xmm1,xmm2 + psllq xmm2,7 + pxor xmm1,xmm3 + movdqa xmm3,xmm6 + pxor xmm1,xmm2 + movdqa xmm2,xmm6 + psrlq xmm3,6 + paddq xmm7,xmm1 + movdqa xmm1,xmm6 + psrlq xmm2,19 + psllq xmm1,3 + pxor xmm3,xmm2 + psrlq xmm2,42 + pxor xmm3,xmm1 + psllq xmm1,42 + pxor xmm3,xmm2 + movdqa xmm2,[16+edx] + pxor xmm3,xmm1 + movdqa xmm1,[112+ebp] + movq mm1,mm4 + paddq xmm7,xmm3 + movq mm7,[edx-16] + pxor mm5,mm6 + psrlq mm1,14 + movq [48+esp],mm4 + paddq xmm1,xmm7 + pand mm5,mm4 + psllq mm4,23 + paddq mm0,mm3 + movq mm3,mm1 + psrlq mm1,4 + pxor mm5,mm6 + pxor mm3,mm4 + psllq mm4,23 + pxor mm3,mm1 + movq [16+esp],mm0 + paddq mm7,mm5 + pxor mm3,mm4 + psrlq mm1,23 + paddq mm7,[8+esp] + pxor mm3,mm1 + psllq mm4,4 + pxor mm3,mm4 + movq mm4,[40+esp] + paddq 
mm3,mm7 + movq mm5,mm0 + psrlq mm5,28 + paddq mm4,mm3 + movq mm6,mm0 + movq mm7,mm5 + psllq mm6,25 + movq mm1,[24+esp] + psrlq mm5,6 + pxor mm7,mm6 + psllq mm6,5 + pxor mm7,mm5 + pxor mm0,mm1 + psrlq mm5,5 + pxor mm7,mm6 + pand mm2,mm0 + psllq mm6,6 + pxor mm7,mm5 + pxor mm2,mm1 + pxor mm6,mm7 + movq mm5,[48+esp] + paddq mm2,mm6 + movq mm6,[56+esp] + movq mm1,mm4 + movq mm7,[edx-8] + pxor mm5,mm6 + psrlq mm1,14 + movq [40+esp],mm4 + pand mm5,mm4 + psllq mm4,23 + paddq mm2,mm3 + movq mm3,mm1 + psrlq mm1,4 + pxor mm5,mm6 + pxor mm3,mm4 + psllq mm4,23 + pxor mm3,mm1 + movq [8+esp],mm2 + paddq mm7,mm5 + pxor mm3,mm4 + psrlq mm1,23 + paddq mm7,[esp] + pxor mm3,mm1 + psllq mm4,4 + pxor mm3,mm4 + movq mm4,[32+esp] + paddq mm3,mm7 + movq mm5,mm2 + psrlq mm5,28 + paddq mm4,mm3 + movq mm6,mm2 + movq mm7,mm5 + psllq mm6,25 + movq mm1,[16+esp] + psrlq mm5,6 + pxor mm7,mm6 + psllq mm6,5 + pxor mm7,mm5 + pxor mm2,mm1 + psrlq mm5,5 + pxor mm7,mm6 + pand mm0,mm2 + psllq mm6,6 + pxor mm7,mm5 + pxor mm0,mm1 + pxor mm6,mm7 + movq mm5,[40+esp] + paddq mm0,mm6 + movq mm6,[48+esp] + movdqa [edx-16],xmm1 + lea ebp,[128+ebp] + dec ecx + jnz NEAR L$00600_47_ssse3 + movdqa xmm1,[ebp] + lea ebp,[ebp-640] + movdqu xmm0,[ebx] + pshufb xmm0,xmm1 + movdqa xmm3,[ebp] + movdqa xmm2,xmm1 + movdqu xmm1,[16+ebx] + paddq xmm3,xmm0 + pshufb xmm1,xmm2 + movq mm1,mm4 + movq mm7,[edx-128] + pxor mm5,mm6 + psrlq mm1,14 + movq [32+esp],mm4 + pand mm5,mm4 + psllq mm4,23 + paddq mm0,mm3 + movq mm3,mm1 + psrlq mm1,4 + pxor mm5,mm6 + pxor mm3,mm4 + psllq mm4,23 + pxor mm3,mm1 + movq [esp],mm0 + paddq mm7,mm5 + pxor mm3,mm4 + psrlq mm1,23 + paddq mm7,[56+esp] + pxor mm3,mm1 + psllq mm4,4 + pxor mm3,mm4 + movq mm4,[24+esp] + paddq mm3,mm7 + movq mm5,mm0 + psrlq mm5,28 + paddq mm4,mm3 + movq mm6,mm0 + movq mm7,mm5 + psllq mm6,25 + movq mm1,[8+esp] + psrlq mm5,6 + pxor mm7,mm6 + psllq mm6,5 + pxor mm7,mm5 + pxor mm0,mm1 + psrlq mm5,5 + pxor mm7,mm6 + pand mm2,mm0 + psllq mm6,6 + pxor mm7,mm5 + pxor mm2,mm1 + pxor 
mm6,mm7 + movq mm5,[32+esp] + paddq mm2,mm6 + movq mm6,[40+esp] + movq mm1,mm4 + movq mm7,[edx-120] + pxor mm5,mm6 + psrlq mm1,14 + movq [24+esp],mm4 + pand mm5,mm4 + psllq mm4,23 + paddq mm2,mm3 + movq mm3,mm1 + psrlq mm1,4 + pxor mm5,mm6 + pxor mm3,mm4 + psllq mm4,23 + pxor mm3,mm1 + movq [56+esp],mm2 + paddq mm7,mm5 + pxor mm3,mm4 + psrlq mm1,23 + paddq mm7,[48+esp] + pxor mm3,mm1 + psllq mm4,4 + pxor mm3,mm4 + movq mm4,[16+esp] + paddq mm3,mm7 + movq mm5,mm2 + psrlq mm5,28 + paddq mm4,mm3 + movq mm6,mm2 + movq mm7,mm5 + psllq mm6,25 + movq mm1,[esp] + psrlq mm5,6 + pxor mm7,mm6 + psllq mm6,5 + pxor mm7,mm5 + pxor mm2,mm1 + psrlq mm5,5 + pxor mm7,mm6 + pand mm0,mm2 + psllq mm6,6 + pxor mm7,mm5 + pxor mm0,mm1 + pxor mm6,mm7 + movq mm5,[24+esp] + paddq mm0,mm6 + movq mm6,[32+esp] + movdqa [edx-128],xmm3 + movdqa xmm4,[16+ebp] + movdqa xmm3,xmm2 + movdqu xmm2,[32+ebx] + paddq xmm4,xmm1 + pshufb xmm2,xmm3 + movq mm1,mm4 + movq mm7,[edx-112] + pxor mm5,mm6 + psrlq mm1,14 + movq [16+esp],mm4 + pand mm5,mm4 + psllq mm4,23 + paddq mm0,mm3 + movq mm3,mm1 + psrlq mm1,4 + pxor mm5,mm6 + pxor mm3,mm4 + psllq mm4,23 + pxor mm3,mm1 + movq [48+esp],mm0 + paddq mm7,mm5 + pxor mm3,mm4 + psrlq mm1,23 + paddq mm7,[40+esp] + pxor mm3,mm1 + psllq mm4,4 + pxor mm3,mm4 + movq mm4,[8+esp] + paddq mm3,mm7 + movq mm5,mm0 + psrlq mm5,28 + paddq mm4,mm3 + movq mm6,mm0 + movq mm7,mm5 + psllq mm6,25 + movq mm1,[56+esp] + psrlq mm5,6 + pxor mm7,mm6 + psllq mm6,5 + pxor mm7,mm5 + pxor mm0,mm1 + psrlq mm5,5 + pxor mm7,mm6 + pand mm2,mm0 + psllq mm6,6 + pxor mm7,mm5 + pxor mm2,mm1 + pxor mm6,mm7 + movq mm5,[16+esp] + paddq mm2,mm6 + movq mm6,[24+esp] + movq mm1,mm4 + movq mm7,[edx-104] + pxor mm5,mm6 + psrlq mm1,14 + movq [8+esp],mm4 + pand mm5,mm4 + psllq mm4,23 + paddq mm2,mm3 + movq mm3,mm1 + psrlq mm1,4 + pxor mm5,mm6 + pxor mm3,mm4 + psllq mm4,23 + pxor mm3,mm1 + movq [40+esp],mm2 + paddq mm7,mm5 + pxor mm3,mm4 + psrlq mm1,23 + paddq mm7,[32+esp] + pxor mm3,mm1 + psllq mm4,4 + pxor mm3,mm4 
+ movq mm4,[esp] + paddq mm3,mm7 + movq mm5,mm2 + psrlq mm5,28 + paddq mm4,mm3 + movq mm6,mm2 + movq mm7,mm5 + psllq mm6,25 + movq mm1,[48+esp] + psrlq mm5,6 + pxor mm7,mm6 + psllq mm6,5 + pxor mm7,mm5 + pxor mm2,mm1 + psrlq mm5,5 + pxor mm7,mm6 + pand mm0,mm2 + psllq mm6,6 + pxor mm7,mm5 + pxor mm0,mm1 + pxor mm6,mm7 + movq mm5,[8+esp] + paddq mm0,mm6 + movq mm6,[16+esp] + movdqa [edx-112],xmm4 + movdqa xmm5,[32+ebp] + movdqa xmm4,xmm3 + movdqu xmm3,[48+ebx] + paddq xmm5,xmm2 + pshufb xmm3,xmm4 + movq mm1,mm4 + movq mm7,[edx-96] + pxor mm5,mm6 + psrlq mm1,14 + movq [esp],mm4 + pand mm5,mm4 + psllq mm4,23 + paddq mm0,mm3 + movq mm3,mm1 + psrlq mm1,4 + pxor mm5,mm6 + pxor mm3,mm4 + psllq mm4,23 + pxor mm3,mm1 + movq [32+esp],mm0 + paddq mm7,mm5 + pxor mm3,mm4 + psrlq mm1,23 + paddq mm7,[24+esp] + pxor mm3,mm1 + psllq mm4,4 + pxor mm3,mm4 + movq mm4,[56+esp] + paddq mm3,mm7 + movq mm5,mm0 + psrlq mm5,28 + paddq mm4,mm3 + movq mm6,mm0 + movq mm7,mm5 + psllq mm6,25 + movq mm1,[40+esp] + psrlq mm5,6 + pxor mm7,mm6 + psllq mm6,5 + pxor mm7,mm5 + pxor mm0,mm1 + psrlq mm5,5 + pxor mm7,mm6 + pand mm2,mm0 + psllq mm6,6 + pxor mm7,mm5 + pxor mm2,mm1 + pxor mm6,mm7 + movq mm5,[esp] + paddq mm2,mm6 + movq mm6,[8+esp] + movq mm1,mm4 + movq mm7,[edx-88] + pxor mm5,mm6 + psrlq mm1,14 + movq [56+esp],mm4 + pand mm5,mm4 + psllq mm4,23 + paddq mm2,mm3 + movq mm3,mm1 + psrlq mm1,4 + pxor mm5,mm6 + pxor mm3,mm4 + psllq mm4,23 + pxor mm3,mm1 + movq [24+esp],mm2 + paddq mm7,mm5 + pxor mm3,mm4 + psrlq mm1,23 + paddq mm7,[16+esp] + pxor mm3,mm1 + psllq mm4,4 + pxor mm3,mm4 + movq mm4,[48+esp] + paddq mm3,mm7 + movq mm5,mm2 + psrlq mm5,28 + paddq mm4,mm3 + movq mm6,mm2 + movq mm7,mm5 + psllq mm6,25 + movq mm1,[32+esp] + psrlq mm5,6 + pxor mm7,mm6 + psllq mm6,5 + pxor mm7,mm5 + pxor mm2,mm1 + psrlq mm5,5 + pxor mm7,mm6 + pand mm0,mm2 + psllq mm6,6 + pxor mm7,mm5 + pxor mm0,mm1 + pxor mm6,mm7 + movq mm5,[56+esp] + paddq mm0,mm6 + movq mm6,[esp] + movdqa [edx-96],xmm5 + movdqa xmm6,[48+ebp] + 
movdqa xmm5,xmm4 + movdqu xmm4,[64+ebx] + paddq xmm6,xmm3 + pshufb xmm4,xmm5 + movq mm1,mm4 + movq mm7,[edx-80] + pxor mm5,mm6 + psrlq mm1,14 + movq [48+esp],mm4 + pand mm5,mm4 + psllq mm4,23 + paddq mm0,mm3 + movq mm3,mm1 + psrlq mm1,4 + pxor mm5,mm6 + pxor mm3,mm4 + psllq mm4,23 + pxor mm3,mm1 + movq [16+esp],mm0 + paddq mm7,mm5 + pxor mm3,mm4 + psrlq mm1,23 + paddq mm7,[8+esp] + pxor mm3,mm1 + psllq mm4,4 + pxor mm3,mm4 + movq mm4,[40+esp] + paddq mm3,mm7 + movq mm5,mm0 + psrlq mm5,28 + paddq mm4,mm3 + movq mm6,mm0 + movq mm7,mm5 + psllq mm6,25 + movq mm1,[24+esp] + psrlq mm5,6 + pxor mm7,mm6 + psllq mm6,5 + pxor mm7,mm5 + pxor mm0,mm1 + psrlq mm5,5 + pxor mm7,mm6 + pand mm2,mm0 + psllq mm6,6 + pxor mm7,mm5 + pxor mm2,mm1 + pxor mm6,mm7 + movq mm5,[48+esp] + paddq mm2,mm6 + movq mm6,[56+esp] + movq mm1,mm4 + movq mm7,[edx-72] + pxor mm5,mm6 + psrlq mm1,14 + movq [40+esp],mm4 + pand mm5,mm4 + psllq mm4,23 + paddq mm2,mm3 + movq mm3,mm1 + psrlq mm1,4 + pxor mm5,mm6 + pxor mm3,mm4 + psllq mm4,23 + pxor mm3,mm1 + movq [8+esp],mm2 + paddq mm7,mm5 + pxor mm3,mm4 + psrlq mm1,23 + paddq mm7,[esp] + pxor mm3,mm1 + psllq mm4,4 + pxor mm3,mm4 + movq mm4,[32+esp] + paddq mm3,mm7 + movq mm5,mm2 + psrlq mm5,28 + paddq mm4,mm3 + movq mm6,mm2 + movq mm7,mm5 + psllq mm6,25 + movq mm1,[16+esp] + psrlq mm5,6 + pxor mm7,mm6 + psllq mm6,5 + pxor mm7,mm5 + pxor mm2,mm1 + psrlq mm5,5 + pxor mm7,mm6 + pand mm0,mm2 + psllq mm6,6 + pxor mm7,mm5 + pxor mm0,mm1 + pxor mm6,mm7 + movq mm5,[40+esp] + paddq mm0,mm6 + movq mm6,[48+esp] + movdqa [edx-80],xmm6 + movdqa xmm7,[64+ebp] + movdqa xmm6,xmm5 + movdqu xmm5,[80+ebx] + paddq xmm7,xmm4 + pshufb xmm5,xmm6 + movq mm1,mm4 + movq mm7,[edx-64] + pxor mm5,mm6 + psrlq mm1,14 + movq [32+esp],mm4 + pand mm5,mm4 + psllq mm4,23 + paddq mm0,mm3 + movq mm3,mm1 + psrlq mm1,4 + pxor mm5,mm6 + pxor mm3,mm4 + psllq mm4,23 + pxor mm3,mm1 + movq [esp],mm0 + paddq mm7,mm5 + pxor mm3,mm4 + psrlq mm1,23 + paddq mm7,[56+esp] + pxor mm3,mm1 + psllq mm4,4 + pxor 
mm3,mm4 + movq mm4,[24+esp] + paddq mm3,mm7 + movq mm5,mm0 + psrlq mm5,28 + paddq mm4,mm3 + movq mm6,mm0 + movq mm7,mm5 + psllq mm6,25 + movq mm1,[8+esp] + psrlq mm5,6 + pxor mm7,mm6 + psllq mm6,5 + pxor mm7,mm5 + pxor mm0,mm1 + psrlq mm5,5 + pxor mm7,mm6 + pand mm2,mm0 + psllq mm6,6 + pxor mm7,mm5 + pxor mm2,mm1 + pxor mm6,mm7 + movq mm5,[32+esp] + paddq mm2,mm6 + movq mm6,[40+esp] + movq mm1,mm4 + movq mm7,[edx-56] + pxor mm5,mm6 + psrlq mm1,14 + movq [24+esp],mm4 + pand mm5,mm4 + psllq mm4,23 + paddq mm2,mm3 + movq mm3,mm1 + psrlq mm1,4 + pxor mm5,mm6 + pxor mm3,mm4 + psllq mm4,23 + pxor mm3,mm1 + movq [56+esp],mm2 + paddq mm7,mm5 + pxor mm3,mm4 + psrlq mm1,23 + paddq mm7,[48+esp] + pxor mm3,mm1 + psllq mm4,4 + pxor mm3,mm4 + movq mm4,[16+esp] + paddq mm3,mm7 + movq mm5,mm2 + psrlq mm5,28 + paddq mm4,mm3 + movq mm6,mm2 + movq mm7,mm5 + psllq mm6,25 + movq mm1,[esp] + psrlq mm5,6 + pxor mm7,mm6 + psllq mm6,5 + pxor mm7,mm5 + pxor mm2,mm1 + psrlq mm5,5 + pxor mm7,mm6 + pand mm0,mm2 + psllq mm6,6 + pxor mm7,mm5 + pxor mm0,mm1 + pxor mm6,mm7 + movq mm5,[24+esp] + paddq mm0,mm6 + movq mm6,[32+esp] + movdqa [edx-64],xmm7 + movdqa [edx],xmm0 + movdqa xmm0,[80+ebp] + movdqa xmm7,xmm6 + movdqu xmm6,[96+ebx] + paddq xmm0,xmm5 + pshufb xmm6,xmm7 + movq mm1,mm4 + movq mm7,[edx-48] + pxor mm5,mm6 + psrlq mm1,14 + movq [16+esp],mm4 + pand mm5,mm4 + psllq mm4,23 + paddq mm0,mm3 + movq mm3,mm1 + psrlq mm1,4 + pxor mm5,mm6 + pxor mm3,mm4 + psllq mm4,23 + pxor mm3,mm1 + movq [48+esp],mm0 + paddq mm7,mm5 + pxor mm3,mm4 + psrlq mm1,23 + paddq mm7,[40+esp] + pxor mm3,mm1 + psllq mm4,4 + pxor mm3,mm4 + movq mm4,[8+esp] + paddq mm3,mm7 + movq mm5,mm0 + psrlq mm5,28 + paddq mm4,mm3 + movq mm6,mm0 + movq mm7,mm5 + psllq mm6,25 + movq mm1,[56+esp] + psrlq mm5,6 + pxor mm7,mm6 + psllq mm6,5 + pxor mm7,mm5 + pxor mm0,mm1 + psrlq mm5,5 + pxor mm7,mm6 + pand mm2,mm0 + psllq mm6,6 + pxor mm7,mm5 + pxor mm2,mm1 + pxor mm6,mm7 + movq mm5,[16+esp] + paddq mm2,mm6 + movq mm6,[24+esp] + movq 
mm1,mm4 + movq mm7,[edx-40] + pxor mm5,mm6 + psrlq mm1,14 + movq [8+esp],mm4 + pand mm5,mm4 + psllq mm4,23 + paddq mm2,mm3 + movq mm3,mm1 + psrlq mm1,4 + pxor mm5,mm6 + pxor mm3,mm4 + psllq mm4,23 + pxor mm3,mm1 + movq [40+esp],mm2 + paddq mm7,mm5 + pxor mm3,mm4 + psrlq mm1,23 + paddq mm7,[32+esp] + pxor mm3,mm1 + psllq mm4,4 + pxor mm3,mm4 + movq mm4,[esp] + paddq mm3,mm7 + movq mm5,mm2 + psrlq mm5,28 + paddq mm4,mm3 + movq mm6,mm2 + movq mm7,mm5 + psllq mm6,25 + movq mm1,[48+esp] + psrlq mm5,6 + pxor mm7,mm6 + psllq mm6,5 + pxor mm7,mm5 + pxor mm2,mm1 + psrlq mm5,5 + pxor mm7,mm6 + pand mm0,mm2 + psllq mm6,6 + pxor mm7,mm5 + pxor mm0,mm1 + pxor mm6,mm7 + movq mm5,[8+esp] + paddq mm0,mm6 + movq mm6,[16+esp] + movdqa [edx-48],xmm0 + movdqa [16+edx],xmm1 + movdqa xmm1,[96+ebp] + movdqa xmm0,xmm7 + movdqu xmm7,[112+ebx] + paddq xmm1,xmm6 + pshufb xmm7,xmm0 + movq mm1,mm4 + movq mm7,[edx-32] + pxor mm5,mm6 + psrlq mm1,14 + movq [esp],mm4 + pand mm5,mm4 + psllq mm4,23 + paddq mm0,mm3 + movq mm3,mm1 + psrlq mm1,4 + pxor mm5,mm6 + pxor mm3,mm4 + psllq mm4,23 + pxor mm3,mm1 + movq [32+esp],mm0 + paddq mm7,mm5 + pxor mm3,mm4 + psrlq mm1,23 + paddq mm7,[24+esp] + pxor mm3,mm1 + psllq mm4,4 + pxor mm3,mm4 + movq mm4,[56+esp] + paddq mm3,mm7 + movq mm5,mm0 + psrlq mm5,28 + paddq mm4,mm3 + movq mm6,mm0 + movq mm7,mm5 + psllq mm6,25 + movq mm1,[40+esp] + psrlq mm5,6 + pxor mm7,mm6 + psllq mm6,5 + pxor mm7,mm5 + pxor mm0,mm1 + psrlq mm5,5 + pxor mm7,mm6 + pand mm2,mm0 + psllq mm6,6 + pxor mm7,mm5 + pxor mm2,mm1 + pxor mm6,mm7 + movq mm5,[esp] + paddq mm2,mm6 + movq mm6,[8+esp] + movq mm1,mm4 + movq mm7,[edx-24] + pxor mm5,mm6 + psrlq mm1,14 + movq [56+esp],mm4 + pand mm5,mm4 + psllq mm4,23 + paddq mm2,mm3 + movq mm3,mm1 + psrlq mm1,4 + pxor mm5,mm6 + pxor mm3,mm4 + psllq mm4,23 + pxor mm3,mm1 + movq [24+esp],mm2 + paddq mm7,mm5 + pxor mm3,mm4 + psrlq mm1,23 + paddq mm7,[16+esp] + pxor mm3,mm1 + psllq mm4,4 + pxor mm3,mm4 + movq mm4,[48+esp] + paddq mm3,mm7 + movq mm5,mm2 + psrlq 
mm5,28 + paddq mm4,mm3 + movq mm6,mm2 + movq mm7,mm5 + psllq mm6,25 + movq mm1,[32+esp] + psrlq mm5,6 + pxor mm7,mm6 + psllq mm6,5 + pxor mm7,mm5 + pxor mm2,mm1 + psrlq mm5,5 + pxor mm7,mm6 + pand mm0,mm2 + psllq mm6,6 + pxor mm7,mm5 + pxor mm0,mm1 + pxor mm6,mm7 + movq mm5,[56+esp] + paddq mm0,mm6 + movq mm6,[esp] + movdqa [edx-32],xmm1 + movdqa [32+edx],xmm2 + movdqa xmm2,[112+ebp] + movdqa xmm0,[edx] + paddq xmm2,xmm7 + movq mm1,mm4 + movq mm7,[edx-16] + pxor mm5,mm6 + psrlq mm1,14 + movq [48+esp],mm4 + pand mm5,mm4 + psllq mm4,23 + paddq mm0,mm3 + movq mm3,mm1 + psrlq mm1,4 + pxor mm5,mm6 + pxor mm3,mm4 + psllq mm4,23 + pxor mm3,mm1 + movq [16+esp],mm0 + paddq mm7,mm5 + pxor mm3,mm4 + psrlq mm1,23 + paddq mm7,[8+esp] + pxor mm3,mm1 + psllq mm4,4 + pxor mm3,mm4 + movq mm4,[40+esp] + paddq mm3,mm7 + movq mm5,mm0 + psrlq mm5,28 + paddq mm4,mm3 + movq mm6,mm0 + movq mm7,mm5 + psllq mm6,25 + movq mm1,[24+esp] + psrlq mm5,6 + pxor mm7,mm6 + psllq mm6,5 + pxor mm7,mm5 + pxor mm0,mm1 + psrlq mm5,5 + pxor mm7,mm6 + pand mm2,mm0 + psllq mm6,6 + pxor mm7,mm5 + pxor mm2,mm1 + pxor mm6,mm7 + movq mm5,[48+esp] + paddq mm2,mm6 + movq mm6,[56+esp] + movq mm1,mm4 + movq mm7,[edx-8] + pxor mm5,mm6 + psrlq mm1,14 + movq [40+esp],mm4 + pand mm5,mm4 + psllq mm4,23 + paddq mm2,mm3 + movq mm3,mm1 + psrlq mm1,4 + pxor mm5,mm6 + pxor mm3,mm4 + psllq mm4,23 + pxor mm3,mm1 + movq [8+esp],mm2 + paddq mm7,mm5 + pxor mm3,mm4 + psrlq mm1,23 + paddq mm7,[esp] + pxor mm3,mm1 + psllq mm4,4 + pxor mm3,mm4 + movq mm4,[32+esp] + paddq mm3,mm7 + movq mm5,mm2 + psrlq mm5,28 + paddq mm4,mm3 + movq mm6,mm2 + movq mm7,mm5 + psllq mm6,25 + movq mm1,[16+esp] + psrlq mm5,6 + pxor mm7,mm6 + psllq mm6,5 + pxor mm7,mm5 + pxor mm2,mm1 + psrlq mm5,5 + pxor mm7,mm6 + pand mm0,mm2 + psllq mm6,6 + pxor mm7,mm5 + pxor mm0,mm1 + pxor mm6,mm7 + movq mm5,[40+esp] + paddq mm0,mm6 + movq mm6,[48+esp] + movdqa [edx-16],xmm2 + movq mm1,[8+esp] + paddq mm0,mm3 + movq mm3,[24+esp] + movq mm7,[56+esp] + pxor mm2,mm1 + paddq 
mm0,[esi] + paddq mm1,[8+esi] + paddq mm2,[16+esi] + paddq mm3,[24+esi] + paddq mm4,[32+esi] + paddq mm5,[40+esi] + paddq mm6,[48+esi] + paddq mm7,[56+esi] + movq [esi],mm0 + movq [8+esi],mm1 + movq [16+esi],mm2 + movq [24+esi],mm3 + movq [32+esi],mm4 + movq [40+esi],mm5 + movq [48+esi],mm6 + movq [56+esi],mm7 + cmp edi,eax + jb NEAR L$005loop_ssse3 + mov esp,DWORD [76+edx] + emms + pop edi + pop esi + pop ebx + pop ebp + ret +align 64 +L$K512: +dd 3609767458,1116352408 +dd 602891725,1899447441 +dd 3964484399,3049323471 +dd 2173295548,3921009573 +dd 4081628472,961987163 +dd 3053834265,1508970993 +dd 2937671579,2453635748 +dd 3664609560,2870763221 +dd 2734883394,3624381080 +dd 1164996542,310598401 +dd 1323610764,607225278 +dd 3590304994,1426881987 +dd 4068182383,1925078388 +dd 991336113,2162078206 +dd 633803317,2614888103 +dd 3479774868,3248222580 +dd 2666613458,3835390401 +dd 944711139,4022224774 +dd 2341262773,264347078 +dd 2007800933,604807628 +dd 1495990901,770255983 +dd 1856431235,1249150122 +dd 3175218132,1555081692 +dd 2198950837,1996064986 +dd 3999719339,2554220882 +dd 766784016,2821834349 +dd 2566594879,2952996808 +dd 3203337956,3210313671 +dd 1034457026,3336571891 +dd 2466948901,3584528711 +dd 3758326383,113926993 +dd 168717936,338241895 +dd 1188179964,666307205 +dd 1546045734,773529912 +dd 1522805485,1294757372 +dd 2643833823,1396182291 +dd 2343527390,1695183700 +dd 1014477480,1986661051 +dd 1206759142,2177026350 +dd 344077627,2456956037 +dd 1290863460,2730485921 +dd 3158454273,2820302411 +dd 3505952657,3259730800 +dd 106217008,3345764771 +dd 3606008344,3516065817 +dd 1432725776,3600352804 +dd 1467031594,4094571909 +dd 851169720,275423344 +dd 3100823752,430227734 +dd 1363258195,506948616 +dd 3750685593,659060556 +dd 3785050280,883997877 +dd 3318307427,958139571 +dd 3812723403,1322822218 +dd 2003034995,1537002063 +dd 3602036899,1747873779 +dd 1575990012,1955562222 +dd 1125592928,2024104815 +dd 2716904306,2227730452 +dd 442776044,2361852424 +dd 
593698344,2428436474 +dd 3733110249,2756734187 +dd 2999351573,3204031479 +dd 3815920427,3329325298 +dd 3928383900,3391569614 +dd 566280711,3515267271 +dd 3454069534,3940187606 +dd 4000239992,4118630271 +dd 1914138554,116418474 +dd 2731055270,174292421 +dd 3203993006,289380356 +dd 320620315,460393269 +dd 587496836,685471733 +dd 1086792851,852142971 +dd 365543100,1017036298 +dd 2618297676,1126000580 +dd 3409855158,1288033470 +dd 4234509866,1501505948 +dd 987167468,1607167915 +dd 1246189591,1816402316 +dd 67438087,66051 +dd 202182159,134810123 +db 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97 +db 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32 +db 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97 +db 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 +db 62,0 +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/third_party/boringssl/gen/bcm/sha512-armv4-linux.S b/third_party/boringssl/gen/bcm/sha512-armv4-linux.S new file mode 100644 index 00000000..f95fc9a2 --- /dev/null +++ b/third_party/boringssl/gen/bcm/sha512-armv4-linux.S @@ -0,0 +1,1857 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__) +@ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. +@ +@ Licensed under the Apache License, Version 2.0 (the "License"); +@ you may not use this file except in compliance with the License. +@ You may obtain a copy of the License at +@ +@ https://www.apache.org/licenses/LICENSE-2.0 +@ +@ Unless required by applicable law or agreed to in writing, software +@ distributed under the License is distributed on an "AS IS" BASIS, +@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@ See the License for the specific language governing permissions and +@ limitations under the License. 
+ + +@ ==================================================================== +@ Written by Andy Polyakov for the OpenSSL +@ project. +@ ==================================================================== + +@ SHA512 block procedure for ARMv4. September 2007. + +@ This code is ~4.5 (four and a half) times faster than code generated +@ by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue +@ Xscale PXA250 core]. +@ +@ July 2010. +@ +@ Rescheduling for dual-issue pipeline resulted in 6% improvement on +@ Cortex A8 core and ~40 cycles per processed byte. + +@ February 2011. +@ +@ Profiler-assisted and platform-specific optimization resulted in 7% +@ improvement on Coxtex A8 core and ~38 cycles per byte. + +@ March 2011. +@ +@ Add NEON implementation. On Cortex A8 it was measured to process +@ one byte in 23.3 cycles or ~60% faster than integer-only code. + +@ August 2012. +@ +@ Improve NEON performance by 12% on Snapdragon S4. In absolute +@ terms it's 22.6 cycles per byte, which is disappointing result. +@ Technical writers asserted that 3-way S4 pipeline can sustain +@ multiple NEON instructions per cycle, but dual NEON issue could +@ not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html +@ for further details. On side note Cortex-A15 processes one byte in +@ 16 cycles. + +@ Byte order [in]dependence. ========================================= +@ +@ Originally caller was expected to maintain specific *dword* order in +@ h[0-7], namely with most significant dword at *lower* address, which +@ was reflected in below two parameters as 0 and 4. Now caller is +@ expected to maintain native byte order for whole 64-bit values. +#ifndef __KERNEL__ +# define VFP_ABI_PUSH vstmdb sp!,{d8-d15} +# define VFP_ABI_POP vldmia sp!,{d8-d15} +#else +# define __ARM_MAX_ARCH__ 7 +# define VFP_ABI_PUSH +# define VFP_ABI_POP +#endif + +@ Silence ARMv8 deprecated IT instruction warnings. 
This file is used by both +@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. +.arch armv7-a + +#ifdef __ARMEL__ +# define LO 0 +# define HI 4 +# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1 +#else +# define HI 0 +# define LO 4 +# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1 +#endif + +.text +#if defined(__thumb2__) +.syntax unified +.thumb +# define adrl adr +#else +.code 32 +#endif + +.type K512,%object +.align 5 +K512: + WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd) + WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc) + WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019) + WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118) + WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe) + WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2) + WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1) + WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694) + WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3) + WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65) + WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483) + WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5) + WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210) + WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4) + WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725) + WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70) + WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926) + WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df) + WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8) + WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b) + WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001) + WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30) + WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910) + WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8) + WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53) + WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8) + WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb) + WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3) 
+ WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60) + WORD64(0x84c87814,0xa1f0ab72, 0x8cc70208,0x1a6439ec) + WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9) + WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b) + WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207) + WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178) + WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6) + WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b) + WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493) + WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c) + WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a) + WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) +.size K512,.-K512 + +.globl sha512_block_data_order_nohw +.hidden sha512_block_data_order_nohw +.type sha512_block_data_order_nohw,%function +sha512_block_data_order_nohw: + add r2,r1,r2,lsl#7 @ len to point at the end of inp + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + adr r14,K512 + sub sp,sp,#9*8 + + ldr r7,[r0,#32+LO] + ldr r8,[r0,#32+HI] + ldr r9, [r0,#48+LO] + ldr r10, [r0,#48+HI] + ldr r11, [r0,#56+LO] + ldr r12, [r0,#56+HI] +.Loop: + str r9, [sp,#48+0] + str r10, [sp,#48+4] + str r11, [sp,#56+0] + str r12, [sp,#56+4] + ldr r5,[r0,#0+LO] + ldr r6,[r0,#0+HI] + ldr r3,[r0,#8+LO] + ldr r4,[r0,#8+HI] + ldr r9, [r0,#16+LO] + ldr r10, [r0,#16+HI] + ldr r11, [r0,#24+LO] + ldr r12, [r0,#24+HI] + str r3,[sp,#8+0] + str r4,[sp,#8+4] + str r9, [sp,#16+0] + str r10, [sp,#16+4] + str r11, [sp,#24+0] + str r12, [sp,#24+4] + ldr r3,[r0,#40+LO] + ldr r4,[r0,#40+HI] + str r3,[sp,#40+0] + str r4,[sp,#40+4] + +.L00_15: +#if __ARM_ARCH<7 + ldrb r3,[r1,#7] + ldrb r9, [r1,#6] + ldrb r10, [r1,#5] + ldrb r11, [r1,#4] + ldrb r4,[r1,#3] + ldrb r12, [r1,#2] + orr r3,r3,r9,lsl#8 + ldrb r9, [r1,#1] + orr r3,r3,r10,lsl#16 + ldrb r10, [r1],#8 + orr r3,r3,r11,lsl#24 + orr r4,r4,r12,lsl#8 + orr r4,r4,r9,lsl#16 + orr r4,r4,r10,lsl#24 +#else + ldr r3,[r1,#4] + ldr r4,[r1],#8 +#ifdef __ARMEL__ + rev r3,r3 + rev r4,r4 +#endif +#endif + @ 
Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) + @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 + @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 + mov r9,r7,lsr#14 + str r3,[sp,#64+0] + mov r10,r8,lsr#14 + str r4,[sp,#64+4] + eor r9,r9,r8,lsl#18 + ldr r11,[sp,#56+0] @ h.lo + eor r10,r10,r7,lsl#18 + ldr r12,[sp,#56+4] @ h.hi + eor r9,r9,r7,lsr#18 + eor r10,r10,r8,lsr#18 + eor r9,r9,r8,lsl#14 + eor r10,r10,r7,lsl#14 + eor r9,r9,r8,lsr#9 + eor r10,r10,r7,lsr#9 + eor r9,r9,r7,lsl#23 + eor r10,r10,r8,lsl#23 @ Sigma1(e) + adds r3,r3,r9 + ldr r9,[sp,#40+0] @ f.lo + adc r4,r4,r10 @ T += Sigma1(e) + ldr r10,[sp,#40+4] @ f.hi + adds r3,r3,r11 + ldr r11,[sp,#48+0] @ g.lo + adc r4,r4,r12 @ T += h + ldr r12,[sp,#48+4] @ g.hi + + eor r9,r9,r11 + str r7,[sp,#32+0] + eor r10,r10,r12 + str r8,[sp,#32+4] + and r9,r9,r7 + str r5,[sp,#0+0] + and r10,r10,r8 + str r6,[sp,#0+4] + eor r9,r9,r11 + ldr r11,[r14,#LO] @ K[i].lo + eor r10,r10,r12 @ Ch(e,f,g) + ldr r12,[r14,#HI] @ K[i].hi + + adds r3,r3,r9 + ldr r7,[sp,#24+0] @ d.lo + adc r4,r4,r10 @ T += Ch(e,f,g) + ldr r8,[sp,#24+4] @ d.hi + adds r3,r3,r11 + and r9,r11,#0xff + adc r4,r4,r12 @ T += K[i] + adds r7,r7,r3 + ldr r11,[sp,#8+0] @ b.lo + adc r8,r8,r4 @ d += T + teq r9,#148 + + ldr r12,[sp,#16+0] @ c.lo +#if __ARM_ARCH>=7 + it eq @ Thumb2 thing, sanity check in ARM +#endif + orreq r14,r14,#1 + @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) + @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 + @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 + mov r9,r5,lsr#28 + mov r10,r6,lsr#28 + eor r9,r9,r6,lsl#4 + eor r10,r10,r5,lsl#4 + eor r9,r9,r6,lsr#2 + eor r10,r10,r5,lsr#2 + eor r9,r9,r5,lsl#30 + eor r10,r10,r6,lsl#30 + eor r9,r9,r6,lsr#7 + eor r10,r10,r5,lsr#7 + eor r9,r9,r5,lsl#25 + eor r10,r10,r6,lsl#25 @ Sigma0(a) + adds r3,r3,r9 + and r9,r5,r11 + adc r4,r4,r10 @ T += Sigma0(a) + + ldr r10,[sp,#8+4] @ b.hi + orr r5,r5,r11 + ldr r11,[sp,#16+4] @ c.hi + and r5,r5,r12 + and r12,r6,r10 + orr r6,r6,r10 + orr r5,r5,r9 @ 
Maj(a,b,c).lo + and r6,r6,r11 + adds r5,r5,r3 + orr r6,r6,r12 @ Maj(a,b,c).hi + sub sp,sp,#8 + adc r6,r6,r4 @ h += T + tst r14,#1 + add r14,r14,#8 + tst r14,#1 + beq .L00_15 + ldr r9,[sp,#184+0] + ldr r10,[sp,#184+4] + bic r14,r14,#1 +.L16_79: + @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) + @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25 + @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7 + mov r3,r9,lsr#1 + ldr r11,[sp,#80+0] + mov r4,r10,lsr#1 + ldr r12,[sp,#80+4] + eor r3,r3,r10,lsl#31 + eor r4,r4,r9,lsl#31 + eor r3,r3,r9,lsr#8 + eor r4,r4,r10,lsr#8 + eor r3,r3,r10,lsl#24 + eor r4,r4,r9,lsl#24 + eor r3,r3,r9,lsr#7 + eor r4,r4,r10,lsr#7 + eor r3,r3,r10,lsl#25 + + @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6)) + @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26 + @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6 + mov r9,r11,lsr#19 + mov r10,r12,lsr#19 + eor r9,r9,r12,lsl#13 + eor r10,r10,r11,lsl#13 + eor r9,r9,r12,lsr#29 + eor r10,r10,r11,lsr#29 + eor r9,r9,r11,lsl#3 + eor r10,r10,r12,lsl#3 + eor r9,r9,r11,lsr#6 + eor r10,r10,r12,lsr#6 + ldr r11,[sp,#120+0] + eor r9,r9,r12,lsl#26 + + ldr r12,[sp,#120+4] + adds r3,r3,r9 + ldr r9,[sp,#192+0] + adc r4,r4,r10 + + ldr r10,[sp,#192+4] + adds r3,r3,r11 + adc r4,r4,r12 + adds r3,r3,r9 + adc r4,r4,r10 + @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) + @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 + @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 + mov r9,r7,lsr#14 + str r3,[sp,#64+0] + mov r10,r8,lsr#14 + str r4,[sp,#64+4] + eor r9,r9,r8,lsl#18 + ldr r11,[sp,#56+0] @ h.lo + eor r10,r10,r7,lsl#18 + ldr r12,[sp,#56+4] @ h.hi + eor r9,r9,r7,lsr#18 + eor r10,r10,r8,lsr#18 + eor r9,r9,r8,lsl#14 + eor r10,r10,r7,lsl#14 + eor r9,r9,r8,lsr#9 + eor r10,r10,r7,lsr#9 + eor r9,r9,r7,lsl#23 + eor r10,r10,r8,lsl#23 @ Sigma1(e) + adds r3,r3,r9 + ldr r9,[sp,#40+0] @ f.lo + adc r4,r4,r10 @ T += Sigma1(e) + ldr r10,[sp,#40+4] @ f.hi + adds r3,r3,r11 + ldr r11,[sp,#48+0] @ g.lo + adc r4,r4,r12 @ T += h + ldr r12,[sp,#48+4] @ g.hi 
+ + eor r9,r9,r11 + str r7,[sp,#32+0] + eor r10,r10,r12 + str r8,[sp,#32+4] + and r9,r9,r7 + str r5,[sp,#0+0] + and r10,r10,r8 + str r6,[sp,#0+4] + eor r9,r9,r11 + ldr r11,[r14,#LO] @ K[i].lo + eor r10,r10,r12 @ Ch(e,f,g) + ldr r12,[r14,#HI] @ K[i].hi + + adds r3,r3,r9 + ldr r7,[sp,#24+0] @ d.lo + adc r4,r4,r10 @ T += Ch(e,f,g) + ldr r8,[sp,#24+4] @ d.hi + adds r3,r3,r11 + and r9,r11,#0xff + adc r4,r4,r12 @ T += K[i] + adds r7,r7,r3 + ldr r11,[sp,#8+0] @ b.lo + adc r8,r8,r4 @ d += T + teq r9,#23 + + ldr r12,[sp,#16+0] @ c.lo +#if __ARM_ARCH>=7 + it eq @ Thumb2 thing, sanity check in ARM +#endif + orreq r14,r14,#1 + @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) + @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 + @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 + mov r9,r5,lsr#28 + mov r10,r6,lsr#28 + eor r9,r9,r6,lsl#4 + eor r10,r10,r5,lsl#4 + eor r9,r9,r6,lsr#2 + eor r10,r10,r5,lsr#2 + eor r9,r9,r5,lsl#30 + eor r10,r10,r6,lsl#30 + eor r9,r9,r6,lsr#7 + eor r10,r10,r5,lsr#7 + eor r9,r9,r5,lsl#25 + eor r10,r10,r6,lsl#25 @ Sigma0(a) + adds r3,r3,r9 + and r9,r5,r11 + adc r4,r4,r10 @ T += Sigma0(a) + + ldr r10,[sp,#8+4] @ b.hi + orr r5,r5,r11 + ldr r11,[sp,#16+4] @ c.hi + and r5,r5,r12 + and r12,r6,r10 + orr r6,r6,r10 + orr r5,r5,r9 @ Maj(a,b,c).lo + and r6,r6,r11 + adds r5,r5,r3 + orr r6,r6,r12 @ Maj(a,b,c).hi + sub sp,sp,#8 + adc r6,r6,r4 @ h += T + tst r14,#1 + add r14,r14,#8 +#if __ARM_ARCH>=7 + ittt eq @ Thumb2 thing, sanity check in ARM +#endif + ldreq r9,[sp,#184+0] + ldreq r10,[sp,#184+4] + beq .L16_79 + bic r14,r14,#1 + + ldr r3,[sp,#8+0] + ldr r4,[sp,#8+4] + ldr r9, [r0,#0+LO] + ldr r10, [r0,#0+HI] + ldr r11, [r0,#8+LO] + ldr r12, [r0,#8+HI] + adds r9,r5,r9 + str r9, [r0,#0+LO] + adc r10,r6,r10 + str r10, [r0,#0+HI] + adds r11,r3,r11 + str r11, [r0,#8+LO] + adc r12,r4,r12 + str r12, [r0,#8+HI] + + ldr r5,[sp,#16+0] + ldr r6,[sp,#16+4] + ldr r3,[sp,#24+0] + ldr r4,[sp,#24+4] + ldr r9, [r0,#16+LO] + ldr r10, [r0,#16+HI] + ldr r11, [r0,#24+LO] + ldr r12, 
[r0,#24+HI] + adds r9,r5,r9 + str r9, [r0,#16+LO] + adc r10,r6,r10 + str r10, [r0,#16+HI] + adds r11,r3,r11 + str r11, [r0,#24+LO] + adc r12,r4,r12 + str r12, [r0,#24+HI] + + ldr r3,[sp,#40+0] + ldr r4,[sp,#40+4] + ldr r9, [r0,#32+LO] + ldr r10, [r0,#32+HI] + ldr r11, [r0,#40+LO] + ldr r12, [r0,#40+HI] + adds r7,r7,r9 + str r7,[r0,#32+LO] + adc r8,r8,r10 + str r8,[r0,#32+HI] + adds r11,r3,r11 + str r11, [r0,#40+LO] + adc r12,r4,r12 + str r12, [r0,#40+HI] + + ldr r5,[sp,#48+0] + ldr r6,[sp,#48+4] + ldr r3,[sp,#56+0] + ldr r4,[sp,#56+4] + ldr r9, [r0,#48+LO] + ldr r10, [r0,#48+HI] + ldr r11, [r0,#56+LO] + ldr r12, [r0,#56+HI] + adds r9,r5,r9 + str r9, [r0,#48+LO] + adc r10,r6,r10 + str r10, [r0,#48+HI] + adds r11,r3,r11 + str r11, [r0,#56+LO] + adc r12,r4,r12 + str r12, [r0,#56+HI] + + add sp,sp,#640 + sub r14,r14,#640 + + teq r1,r2 + bne .Loop + + add sp,sp,#8*9 @ destroy frame +#if __ARM_ARCH>=5 + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} +#else + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} + tst lr,#1 + moveq pc,lr @ be binary compatible with V4, yet +.word 0xe12fff1e @ interoperable with Thumb ISA:-) +#endif +.size sha512_block_data_order_nohw,.-sha512_block_data_order_nohw +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.globl sha512_block_data_order_neon +.hidden sha512_block_data_order_neon +.type sha512_block_data_order_neon,%function +.align 4 +sha512_block_data_order_neon: + dmb @ errata #451034 on early Cortex A8 + add r2,r1,r2,lsl#7 @ len to point at the end of inp + adr r3,K512 + VFP_ABI_PUSH + vldmia r0,{d16,d17,d18,d19,d20,d21,d22,d23} @ load context +.Loop_neon: + vshr.u64 d24,d20,#14 @ 0 +#if 0<16 + vld1.64 {d0},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d20,#18 +#if 0>0 + vadd.i64 d16,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d20,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d20,#50 + vsli.64 d25,d20,#46 + vmov d29,d20 + vsli.64 d26,d20,#23 +#if 0<16 && defined(__ARMEL__) + vrev64.8 d0,d0 +#endif + veor d25,d24 + vbsl d29,d21,d22 @ Ch(e,f,g) + vshr.u64 d24,d16,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d23 + vshr.u64 d25,d16,#34 + vsli.64 d24,d16,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d16,#39 + vadd.i64 d28,d0 + vsli.64 d25,d16,#30 + veor d30,d16,d17 + vsli.64 d26,d16,#25 + veor d23,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d18,d17 @ Maj(a,b,c) + veor d23,d26 @ Sigma0(a) + vadd.i64 d19,d27 + vadd.i64 d30,d27 + @ vadd.i64 d23,d30 + vshr.u64 d24,d19,#14 @ 1 +#if 1<16 + vld1.64 {d1},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d19,#18 +#if 1>0 + vadd.i64 d23,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d19,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d19,#50 + vsli.64 d25,d19,#46 + vmov d29,d19 + vsli.64 d26,d19,#23 +#if 1<16 && defined(__ARMEL__) + vrev64.8 d1,d1 +#endif + veor d25,d24 + vbsl d29,d20,d21 @ Ch(e,f,g) + vshr.u64 d24,d23,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d22 + vshr.u64 d25,d23,#34 + vsli.64 d24,d23,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d23,#39 + vadd.i64 d28,d1 + vsli.64 d25,d23,#30 + veor d30,d23,d16 + vsli.64 d26,d23,#25 + veor d22,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d17,d16 @ Maj(a,b,c) + veor d22,d26 @ Sigma0(a) + vadd.i64 d18,d27 + vadd.i64 d30,d27 + @ vadd.i64 d22,d30 + vshr.u64 d24,d18,#14 @ 2 +#if 2<16 + vld1.64 {d2},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d18,#18 +#if 2>0 + vadd.i64 d22,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d18,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d18,#50 + vsli.64 d25,d18,#46 + vmov d29,d18 + vsli.64 d26,d18,#23 +#if 2<16 && defined(__ARMEL__) + vrev64.8 d2,d2 +#endif + veor d25,d24 + vbsl d29,d19,d20 @ Ch(e,f,g) + vshr.u64 d24,d22,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d21 + vshr.u64 d25,d22,#34 + vsli.64 d24,d22,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d22,#39 + vadd.i64 d28,d2 + vsli.64 d25,d22,#30 + veor d30,d22,d23 + vsli.64 d26,d22,#25 + veor d21,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d16,d23 @ Maj(a,b,c) + veor d21,d26 @ Sigma0(a) + vadd.i64 d17,d27 + vadd.i64 d30,d27 + @ vadd.i64 d21,d30 + vshr.u64 d24,d17,#14 @ 3 +#if 3<16 + vld1.64 {d3},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d17,#18 +#if 3>0 + vadd.i64 d21,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d17,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d17,#50 + vsli.64 d25,d17,#46 + vmov d29,d17 + vsli.64 d26,d17,#23 +#if 3<16 && defined(__ARMEL__) + vrev64.8 d3,d3 +#endif + veor d25,d24 + vbsl d29,d18,d19 @ Ch(e,f,g) + vshr.u64 d24,d21,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d20 + vshr.u64 d25,d21,#34 + vsli.64 d24,d21,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d21,#39 + vadd.i64 d28,d3 + vsli.64 d25,d21,#30 + veor d30,d21,d22 + vsli.64 d26,d21,#25 + veor d20,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d23,d22 @ Maj(a,b,c) + veor d20,d26 @ Sigma0(a) + vadd.i64 d16,d27 + vadd.i64 d30,d27 + @ vadd.i64 d20,d30 + vshr.u64 d24,d16,#14 @ 4 +#if 4<16 + vld1.64 {d4},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d16,#18 +#if 4>0 + vadd.i64 d20,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d16,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d16,#50 + vsli.64 d25,d16,#46 + vmov d29,d16 + vsli.64 d26,d16,#23 +#if 4<16 && defined(__ARMEL__) + vrev64.8 d4,d4 +#endif + veor d25,d24 + vbsl d29,d17,d18 @ Ch(e,f,g) + vshr.u64 d24,d20,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d19 + vshr.u64 d25,d20,#34 + vsli.64 d24,d20,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d20,#39 + vadd.i64 d28,d4 + vsli.64 d25,d20,#30 + veor d30,d20,d21 + vsli.64 d26,d20,#25 + veor d19,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d22,d21 @ Maj(a,b,c) + veor d19,d26 @ Sigma0(a) + vadd.i64 d23,d27 + vadd.i64 d30,d27 + @ vadd.i64 d19,d30 + vshr.u64 d24,d23,#14 @ 5 +#if 5<16 + vld1.64 {d5},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d23,#18 +#if 5>0 + vadd.i64 d19,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d23,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d23,#50 + vsli.64 d25,d23,#46 + vmov d29,d23 + vsli.64 d26,d23,#23 +#if 5<16 && defined(__ARMEL__) + vrev64.8 d5,d5 +#endif + veor d25,d24 + vbsl d29,d16,d17 @ Ch(e,f,g) + vshr.u64 d24,d19,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d18 + vshr.u64 d25,d19,#34 + vsli.64 d24,d19,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d19,#39 + vadd.i64 d28,d5 + vsli.64 d25,d19,#30 + veor d30,d19,d20 + vsli.64 d26,d19,#25 + veor d18,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d21,d20 @ Maj(a,b,c) + veor d18,d26 @ Sigma0(a) + vadd.i64 d22,d27 + vadd.i64 d30,d27 + @ vadd.i64 d18,d30 + vshr.u64 d24,d22,#14 @ 6 +#if 6<16 + vld1.64 {d6},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d22,#18 +#if 6>0 + vadd.i64 d18,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d22,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d22,#50 + vsli.64 d25,d22,#46 + vmov d29,d22 + vsli.64 d26,d22,#23 +#if 6<16 && defined(__ARMEL__) + vrev64.8 d6,d6 +#endif + veor d25,d24 + vbsl d29,d23,d16 @ Ch(e,f,g) + vshr.u64 d24,d18,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d17 + vshr.u64 d25,d18,#34 + vsli.64 d24,d18,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d18,#39 + vadd.i64 d28,d6 + vsli.64 d25,d18,#30 + veor d30,d18,d19 + vsli.64 d26,d18,#25 + veor d17,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d20,d19 @ Maj(a,b,c) + veor d17,d26 @ Sigma0(a) + vadd.i64 d21,d27 + vadd.i64 d30,d27 + @ vadd.i64 d17,d30 + vshr.u64 d24,d21,#14 @ 7 +#if 7<16 + vld1.64 {d7},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d21,#18 +#if 7>0 + vadd.i64 d17,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d21,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d21,#50 + vsli.64 d25,d21,#46 + vmov d29,d21 + vsli.64 d26,d21,#23 +#if 7<16 && defined(__ARMEL__) + vrev64.8 d7,d7 +#endif + veor d25,d24 + vbsl d29,d22,d23 @ Ch(e,f,g) + vshr.u64 d24,d17,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d16 + vshr.u64 d25,d17,#34 + vsli.64 d24,d17,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d17,#39 + vadd.i64 d28,d7 + vsli.64 d25,d17,#30 + veor d30,d17,d18 + vsli.64 d26,d17,#25 + veor d16,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d19,d18 @ Maj(a,b,c) + veor d16,d26 @ Sigma0(a) + vadd.i64 d20,d27 + vadd.i64 d30,d27 + @ vadd.i64 d16,d30 + vshr.u64 d24,d20,#14 @ 8 +#if 8<16 + vld1.64 {d8},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d20,#18 +#if 8>0 + vadd.i64 d16,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d20,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d20,#50 + vsli.64 d25,d20,#46 + vmov d29,d20 + vsli.64 d26,d20,#23 +#if 8<16 && defined(__ARMEL__) + vrev64.8 d8,d8 +#endif + veor d25,d24 + vbsl d29,d21,d22 @ Ch(e,f,g) + vshr.u64 d24,d16,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d23 + vshr.u64 d25,d16,#34 + vsli.64 d24,d16,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d16,#39 + vadd.i64 d28,d8 + vsli.64 d25,d16,#30 + veor d30,d16,d17 + vsli.64 d26,d16,#25 + veor d23,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d18,d17 @ Maj(a,b,c) + veor d23,d26 @ Sigma0(a) + vadd.i64 d19,d27 + vadd.i64 d30,d27 + @ vadd.i64 d23,d30 + vshr.u64 d24,d19,#14 @ 9 +#if 9<16 + vld1.64 {d9},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d19,#18 +#if 9>0 + vadd.i64 d23,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d19,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d19,#50 + vsli.64 d25,d19,#46 + vmov d29,d19 + vsli.64 d26,d19,#23 +#if 9<16 && defined(__ARMEL__) + vrev64.8 d9,d9 +#endif + veor d25,d24 + vbsl d29,d20,d21 @ Ch(e,f,g) + vshr.u64 d24,d23,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d22 + vshr.u64 d25,d23,#34 + vsli.64 d24,d23,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d23,#39 + vadd.i64 d28,d9 + vsli.64 d25,d23,#30 + veor d30,d23,d16 + vsli.64 d26,d23,#25 + veor d22,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d17,d16 @ Maj(a,b,c) + veor d22,d26 @ Sigma0(a) + vadd.i64 d18,d27 + vadd.i64 d30,d27 + @ vadd.i64 d22,d30 + vshr.u64 d24,d18,#14 @ 10 +#if 10<16 + vld1.64 {d10},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d18,#18 +#if 10>0 + vadd.i64 d22,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d18,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d18,#50 + vsli.64 d25,d18,#46 + vmov d29,d18 + vsli.64 d26,d18,#23 +#if 10<16 && defined(__ARMEL__) + vrev64.8 d10,d10 +#endif + veor d25,d24 + vbsl d29,d19,d20 @ Ch(e,f,g) + vshr.u64 d24,d22,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d21 + vshr.u64 d25,d22,#34 + vsli.64 d24,d22,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d22,#39 + vadd.i64 d28,d10 + vsli.64 d25,d22,#30 + veor d30,d22,d23 + vsli.64 d26,d22,#25 + veor d21,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d16,d23 @ Maj(a,b,c) + veor d21,d26 @ Sigma0(a) + vadd.i64 d17,d27 + vadd.i64 d30,d27 + @ vadd.i64 d21,d30 + vshr.u64 d24,d17,#14 @ 11 +#if 11<16 + vld1.64 {d11},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d17,#18 +#if 11>0 + vadd.i64 d21,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d17,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d17,#50 + vsli.64 d25,d17,#46 + vmov d29,d17 + vsli.64 d26,d17,#23 +#if 11<16 && defined(__ARMEL__) + vrev64.8 d11,d11 +#endif + veor d25,d24 + vbsl d29,d18,d19 @ Ch(e,f,g) + vshr.u64 d24,d21,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d20 + vshr.u64 d25,d21,#34 + vsli.64 d24,d21,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d21,#39 + vadd.i64 d28,d11 + vsli.64 d25,d21,#30 + veor d30,d21,d22 + vsli.64 d26,d21,#25 + veor d20,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d23,d22 @ Maj(a,b,c) + veor d20,d26 @ Sigma0(a) + vadd.i64 d16,d27 + vadd.i64 d30,d27 + @ vadd.i64 d20,d30 + vshr.u64 d24,d16,#14 @ 12 +#if 12<16 + vld1.64 {d12},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d16,#18 +#if 12>0 + vadd.i64 d20,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d16,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d16,#50 + vsli.64 d25,d16,#46 + vmov d29,d16 + vsli.64 d26,d16,#23 +#if 12<16 && defined(__ARMEL__) + vrev64.8 d12,d12 +#endif + veor d25,d24 + vbsl d29,d17,d18 @ Ch(e,f,g) + vshr.u64 d24,d20,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d19 + vshr.u64 d25,d20,#34 + vsli.64 d24,d20,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d20,#39 + vadd.i64 d28,d12 + vsli.64 d25,d20,#30 + veor d30,d20,d21 + vsli.64 d26,d20,#25 + veor d19,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d22,d21 @ Maj(a,b,c) + veor d19,d26 @ Sigma0(a) + vadd.i64 d23,d27 + vadd.i64 d30,d27 + @ vadd.i64 d19,d30 + vshr.u64 d24,d23,#14 @ 13 +#if 13<16 + vld1.64 {d13},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d23,#18 +#if 13>0 + vadd.i64 d19,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d23,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d23,#50 + vsli.64 d25,d23,#46 + vmov d29,d23 + vsli.64 d26,d23,#23 +#if 13<16 && defined(__ARMEL__) + vrev64.8 d13,d13 +#endif + veor d25,d24 + vbsl d29,d16,d17 @ Ch(e,f,g) + vshr.u64 d24,d19,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d18 + vshr.u64 d25,d19,#34 + vsli.64 d24,d19,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d19,#39 + vadd.i64 d28,d13 + vsli.64 d25,d19,#30 + veor d30,d19,d20 + vsli.64 d26,d19,#25 + veor d18,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d21,d20 @ Maj(a,b,c) + veor d18,d26 @ Sigma0(a) + vadd.i64 d22,d27 + vadd.i64 d30,d27 + @ vadd.i64 d18,d30 + vshr.u64 d24,d22,#14 @ 14 +#if 14<16 + vld1.64 {d14},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d22,#18 +#if 14>0 + vadd.i64 d18,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d22,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d22,#50 + vsli.64 d25,d22,#46 + vmov d29,d22 + vsli.64 d26,d22,#23 +#if 14<16 && defined(__ARMEL__) + vrev64.8 d14,d14 +#endif + veor d25,d24 + vbsl d29,d23,d16 @ Ch(e,f,g) + vshr.u64 d24,d18,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d17 + vshr.u64 d25,d18,#34 + vsli.64 d24,d18,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d18,#39 + vadd.i64 d28,d14 + vsli.64 d25,d18,#30 + veor d30,d18,d19 + vsli.64 d26,d18,#25 + veor d17,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d20,d19 @ Maj(a,b,c) + veor d17,d26 @ Sigma0(a) + vadd.i64 d21,d27 + vadd.i64 d30,d27 + @ vadd.i64 d17,d30 + vshr.u64 d24,d21,#14 @ 15 +#if 15<16 + vld1.64 {d15},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d21,#18 +#if 15>0 + vadd.i64 d17,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d21,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d21,#50 + vsli.64 d25,d21,#46 + vmov d29,d21 + vsli.64 d26,d21,#23 +#if 15<16 && defined(__ARMEL__) + vrev64.8 d15,d15 +#endif + veor d25,d24 + vbsl d29,d22,d23 @ Ch(e,f,g) + vshr.u64 d24,d17,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d16 + vshr.u64 d25,d17,#34 + vsli.64 d24,d17,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d17,#39 + vadd.i64 d28,d15 + vsli.64 d25,d17,#30 + veor d30,d17,d18 + vsli.64 d26,d17,#25 + veor d16,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d19,d18 @ Maj(a,b,c) + veor d16,d26 @ Sigma0(a) + vadd.i64 d20,d27 + vadd.i64 d30,d27 + @ vadd.i64 d16,d30 + mov r12,#4 +.L16_79_neon: + subs r12,#1 + vshr.u64 q12,q7,#19 + vshr.u64 q13,q7,#61 + vadd.i64 d16,d30 @ h+=Maj from the past + vshr.u64 q15,q7,#6 + vsli.64 q12,q7,#45 + vext.8 q14,q0,q1,#8 @ X[i+1] + vsli.64 q13,q7,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q0,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q4,q5,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d20,#14 @ from NEON_00_15 + vadd.i64 q0,q14 + vshr.u64 d25,d20,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 
d26,d20,#41 @ from NEON_00_15 + vadd.i64 q0,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d20,#50 + vsli.64 d25,d20,#46 + vmov d29,d20 + vsli.64 d26,d20,#23 +#if 16<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d21,d22 @ Ch(e,f,g) + vshr.u64 d24,d16,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d23 + vshr.u64 d25,d16,#34 + vsli.64 d24,d16,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d16,#39 + vadd.i64 d28,d0 + vsli.64 d25,d16,#30 + veor d30,d16,d17 + vsli.64 d26,d16,#25 + veor d23,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d18,d17 @ Maj(a,b,c) + veor d23,d26 @ Sigma0(a) + vadd.i64 d19,d27 + vadd.i64 d30,d27 + @ vadd.i64 d23,d30 + vshr.u64 d24,d19,#14 @ 17 +#if 17<16 + vld1.64 {d1},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d19,#18 +#if 17>0 + vadd.i64 d23,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d19,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d19,#50 + vsli.64 d25,d19,#46 + vmov d29,d19 + vsli.64 d26,d19,#23 +#if 17<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d20,d21 @ Ch(e,f,g) + vshr.u64 d24,d23,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d22 + vshr.u64 d25,d23,#34 + vsli.64 d24,d23,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d23,#39 + vadd.i64 d28,d1 + vsli.64 d25,d23,#30 + veor d30,d23,d16 + vsli.64 d26,d23,#25 + veor d22,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d17,d16 @ Maj(a,b,c) + veor d22,d26 @ Sigma0(a) + vadd.i64 d18,d27 + vadd.i64 d30,d27 + @ vadd.i64 d22,d30 + vshr.u64 q12,q0,#19 + vshr.u64 q13,q0,#61 + vadd.i64 d22,d30 @ h+=Maj from the past + vshr.u64 q15,q0,#6 + vsli.64 q12,q0,#45 + vext.8 q14,q1,q2,#8 @ X[i+1] + vsli.64 q13,q0,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q1,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q5,q6,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d18,#14 @ from NEON_00_15 + vadd.i64 q1,q14 + vshr.u64 d25,d18,#18 @ from NEON_00_15 + veor q15,q13 @ 
sigma0(X[i+1]) + vshr.u64 d26,d18,#41 @ from NEON_00_15 + vadd.i64 q1,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d18,#50 + vsli.64 d25,d18,#46 + vmov d29,d18 + vsli.64 d26,d18,#23 +#if 18<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d19,d20 @ Ch(e,f,g) + vshr.u64 d24,d22,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d21 + vshr.u64 d25,d22,#34 + vsli.64 d24,d22,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d22,#39 + vadd.i64 d28,d2 + vsli.64 d25,d22,#30 + veor d30,d22,d23 + vsli.64 d26,d22,#25 + veor d21,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d16,d23 @ Maj(a,b,c) + veor d21,d26 @ Sigma0(a) + vadd.i64 d17,d27 + vadd.i64 d30,d27 + @ vadd.i64 d21,d30 + vshr.u64 d24,d17,#14 @ 19 +#if 19<16 + vld1.64 {d3},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d17,#18 +#if 19>0 + vadd.i64 d21,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d17,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d17,#50 + vsli.64 d25,d17,#46 + vmov d29,d17 + vsli.64 d26,d17,#23 +#if 19<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d18,d19 @ Ch(e,f,g) + vshr.u64 d24,d21,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d20 + vshr.u64 d25,d21,#34 + vsli.64 d24,d21,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d21,#39 + vadd.i64 d28,d3 + vsli.64 d25,d21,#30 + veor d30,d21,d22 + vsli.64 d26,d21,#25 + veor d20,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d23,d22 @ Maj(a,b,c) + veor d20,d26 @ Sigma0(a) + vadd.i64 d16,d27 + vadd.i64 d30,d27 + @ vadd.i64 d20,d30 + vshr.u64 q12,q1,#19 + vshr.u64 q13,q1,#61 + vadd.i64 d20,d30 @ h+=Maj from the past + vshr.u64 q15,q1,#6 + vsli.64 q12,q1,#45 + vext.8 q14,q2,q3,#8 @ X[i+1] + vsli.64 q13,q1,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q2,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q6,q7,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d16,#14 @ from NEON_00_15 + vadd.i64 q2,q14 + vshr.u64 d25,d16,#18 @ from NEON_00_15 + 
veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d16,#41 @ from NEON_00_15 + vadd.i64 q2,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d16,#50 + vsli.64 d25,d16,#46 + vmov d29,d16 + vsli.64 d26,d16,#23 +#if 20<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d17,d18 @ Ch(e,f,g) + vshr.u64 d24,d20,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d19 + vshr.u64 d25,d20,#34 + vsli.64 d24,d20,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d20,#39 + vadd.i64 d28,d4 + vsli.64 d25,d20,#30 + veor d30,d20,d21 + vsli.64 d26,d20,#25 + veor d19,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d22,d21 @ Maj(a,b,c) + veor d19,d26 @ Sigma0(a) + vadd.i64 d23,d27 + vadd.i64 d30,d27 + @ vadd.i64 d19,d30 + vshr.u64 d24,d23,#14 @ 21 +#if 21<16 + vld1.64 {d5},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d23,#18 +#if 21>0 + vadd.i64 d19,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d23,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d23,#50 + vsli.64 d25,d23,#46 + vmov d29,d23 + vsli.64 d26,d23,#23 +#if 21<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d16,d17 @ Ch(e,f,g) + vshr.u64 d24,d19,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d18 + vshr.u64 d25,d19,#34 + vsli.64 d24,d19,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d19,#39 + vadd.i64 d28,d5 + vsli.64 d25,d19,#30 + veor d30,d19,d20 + vsli.64 d26,d19,#25 + veor d18,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d21,d20 @ Maj(a,b,c) + veor d18,d26 @ Sigma0(a) + vadd.i64 d22,d27 + vadd.i64 d30,d27 + @ vadd.i64 d18,d30 + vshr.u64 q12,q2,#19 + vshr.u64 q13,q2,#61 + vadd.i64 d18,d30 @ h+=Maj from the past + vshr.u64 q15,q2,#6 + vsli.64 q12,q2,#45 + vext.8 q14,q3,q4,#8 @ X[i+1] + vsli.64 q13,q2,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q3,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q7,q0,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d22,#14 @ from NEON_00_15 + vadd.i64 q3,q14 + vshr.u64 d25,d22,#18 @ from 
NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d22,#41 @ from NEON_00_15 + vadd.i64 q3,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d22,#50 + vsli.64 d25,d22,#46 + vmov d29,d22 + vsli.64 d26,d22,#23 +#if 22<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d23,d16 @ Ch(e,f,g) + vshr.u64 d24,d18,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d17 + vshr.u64 d25,d18,#34 + vsli.64 d24,d18,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d18,#39 + vadd.i64 d28,d6 + vsli.64 d25,d18,#30 + veor d30,d18,d19 + vsli.64 d26,d18,#25 + veor d17,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d20,d19 @ Maj(a,b,c) + veor d17,d26 @ Sigma0(a) + vadd.i64 d21,d27 + vadd.i64 d30,d27 + @ vadd.i64 d17,d30 + vshr.u64 d24,d21,#14 @ 23 +#if 23<16 + vld1.64 {d7},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d21,#18 +#if 23>0 + vadd.i64 d17,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d21,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d21,#50 + vsli.64 d25,d21,#46 + vmov d29,d21 + vsli.64 d26,d21,#23 +#if 23<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d22,d23 @ Ch(e,f,g) + vshr.u64 d24,d17,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d16 + vshr.u64 d25,d17,#34 + vsli.64 d24,d17,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d17,#39 + vadd.i64 d28,d7 + vsli.64 d25,d17,#30 + veor d30,d17,d18 + vsli.64 d26,d17,#25 + veor d16,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d19,d18 @ Maj(a,b,c) + veor d16,d26 @ Sigma0(a) + vadd.i64 d20,d27 + vadd.i64 d30,d27 + @ vadd.i64 d16,d30 + vshr.u64 q12,q3,#19 + vshr.u64 q13,q3,#61 + vadd.i64 d16,d30 @ h+=Maj from the past + vshr.u64 q15,q3,#6 + vsli.64 q12,q3,#45 + vext.8 q14,q4,q5,#8 @ X[i+1] + vsli.64 q13,q3,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q4,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q0,q1,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d20,#14 @ from NEON_00_15 + vadd.i64 q4,q14 + vshr.u64 
d25,d20,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d20,#41 @ from NEON_00_15 + vadd.i64 q4,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d20,#50 + vsli.64 d25,d20,#46 + vmov d29,d20 + vsli.64 d26,d20,#23 +#if 24<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d21,d22 @ Ch(e,f,g) + vshr.u64 d24,d16,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d23 + vshr.u64 d25,d16,#34 + vsli.64 d24,d16,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d16,#39 + vadd.i64 d28,d8 + vsli.64 d25,d16,#30 + veor d30,d16,d17 + vsli.64 d26,d16,#25 + veor d23,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d18,d17 @ Maj(a,b,c) + veor d23,d26 @ Sigma0(a) + vadd.i64 d19,d27 + vadd.i64 d30,d27 + @ vadd.i64 d23,d30 + vshr.u64 d24,d19,#14 @ 25 +#if 25<16 + vld1.64 {d9},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d19,#18 +#if 25>0 + vadd.i64 d23,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d19,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d19,#50 + vsli.64 d25,d19,#46 + vmov d29,d19 + vsli.64 d26,d19,#23 +#if 25<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d20,d21 @ Ch(e,f,g) + vshr.u64 d24,d23,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d22 + vshr.u64 d25,d23,#34 + vsli.64 d24,d23,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d23,#39 + vadd.i64 d28,d9 + vsli.64 d25,d23,#30 + veor d30,d23,d16 + vsli.64 d26,d23,#25 + veor d22,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d17,d16 @ Maj(a,b,c) + veor d22,d26 @ Sigma0(a) + vadd.i64 d18,d27 + vadd.i64 d30,d27 + @ vadd.i64 d22,d30 + vshr.u64 q12,q4,#19 + vshr.u64 q13,q4,#61 + vadd.i64 d22,d30 @ h+=Maj from the past + vshr.u64 q15,q4,#6 + vsli.64 q12,q4,#45 + vext.8 q14,q5,q6,#8 @ X[i+1] + vsli.64 q13,q4,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q5,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q1,q2,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d18,#14 @ from NEON_00_15 + vadd.i64 
q5,q14 + vshr.u64 d25,d18,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d18,#41 @ from NEON_00_15 + vadd.i64 q5,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d18,#50 + vsli.64 d25,d18,#46 + vmov d29,d18 + vsli.64 d26,d18,#23 +#if 26<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d19,d20 @ Ch(e,f,g) + vshr.u64 d24,d22,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d21 + vshr.u64 d25,d22,#34 + vsli.64 d24,d22,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d22,#39 + vadd.i64 d28,d10 + vsli.64 d25,d22,#30 + veor d30,d22,d23 + vsli.64 d26,d22,#25 + veor d21,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d16,d23 @ Maj(a,b,c) + veor d21,d26 @ Sigma0(a) + vadd.i64 d17,d27 + vadd.i64 d30,d27 + @ vadd.i64 d21,d30 + vshr.u64 d24,d17,#14 @ 27 +#if 27<16 + vld1.64 {d11},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d17,#18 +#if 27>0 + vadd.i64 d21,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d17,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d17,#50 + vsli.64 d25,d17,#46 + vmov d29,d17 + vsli.64 d26,d17,#23 +#if 27<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d18,d19 @ Ch(e,f,g) + vshr.u64 d24,d21,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d20 + vshr.u64 d25,d21,#34 + vsli.64 d24,d21,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d21,#39 + vadd.i64 d28,d11 + vsli.64 d25,d21,#30 + veor d30,d21,d22 + vsli.64 d26,d21,#25 + veor d20,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d23,d22 @ Maj(a,b,c) + veor d20,d26 @ Sigma0(a) + vadd.i64 d16,d27 + vadd.i64 d30,d27 + @ vadd.i64 d20,d30 + vshr.u64 q12,q5,#19 + vshr.u64 q13,q5,#61 + vadd.i64 d20,d30 @ h+=Maj from the past + vshr.u64 q15,q5,#6 + vsli.64 q12,q5,#45 + vext.8 q14,q6,q7,#8 @ X[i+1] + vsli.64 q13,q5,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q6,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q2,q3,#8 @ X[i+9] + veor q15,q12 + vshr.u64 d24,d16,#14 @ from 
NEON_00_15 + vadd.i64 q6,q14 + vshr.u64 d25,d16,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d16,#41 @ from NEON_00_15 + vadd.i64 q6,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d16,#50 + vsli.64 d25,d16,#46 + vmov d29,d16 + vsli.64 d26,d16,#23 +#if 28<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d17,d18 @ Ch(e,f,g) + vshr.u64 d24,d20,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d19 + vshr.u64 d25,d20,#34 + vsli.64 d24,d20,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d20,#39 + vadd.i64 d28,d12 + vsli.64 d25,d20,#30 + veor d30,d20,d21 + vsli.64 d26,d20,#25 + veor d19,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d22,d21 @ Maj(a,b,c) + veor d19,d26 @ Sigma0(a) + vadd.i64 d23,d27 + vadd.i64 d30,d27 + @ vadd.i64 d19,d30 + vshr.u64 d24,d23,#14 @ 29 +#if 29<16 + vld1.64 {d13},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d23,#18 +#if 29>0 + vadd.i64 d19,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d23,#41 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d23,#50 + vsli.64 d25,d23,#46 + vmov d29,d23 + vsli.64 d26,d23,#23 +#if 29<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d16,d17 @ Ch(e,f,g) + vshr.u64 d24,d19,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d18 + vshr.u64 d25,d19,#34 + vsli.64 d24,d19,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d19,#39 + vadd.i64 d28,d13 + vsli.64 d25,d19,#30 + veor d30,d19,d20 + vsli.64 d26,d19,#25 + veor d18,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d21,d20 @ Maj(a,b,c) + veor d18,d26 @ Sigma0(a) + vadd.i64 d22,d27 + vadd.i64 d30,d27 + @ vadd.i64 d18,d30 + vshr.u64 q12,q6,#19 + vshr.u64 q13,q6,#61 + vadd.i64 d18,d30 @ h+=Maj from the past + vshr.u64 q15,q6,#6 + vsli.64 q12,q6,#45 + vext.8 q14,q7,q0,#8 @ X[i+1] + vsli.64 q13,q6,#3 + veor q15,q12 + vshr.u64 q12,q14,#1 + veor q15,q13 @ sigma1(X[i+14]) + vshr.u64 q13,q14,#8 + vadd.i64 q7,q15 + vshr.u64 q15,q14,#7 + vsli.64 q12,q14,#63 + vsli.64 q13,q14,#56 + vext.8 q14,q3,q4,#8 @ X[i+9] + veor q15,q12 + vshr.u64 
d24,d22,#14 @ from NEON_00_15 + vadd.i64 q7,q14 + vshr.u64 d25,d22,#18 @ from NEON_00_15 + veor q15,q13 @ sigma0(X[i+1]) + vshr.u64 d26,d22,#41 @ from NEON_00_15 + vadd.i64 q7,q15 + vld1.64 {d28},[r3,:64]! @ K[i++] + vsli.64 d24,d22,#50 + vsli.64 d25,d22,#46 + vmov d29,d22 + vsli.64 d26,d22,#23 +#if 30<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d23,d16 @ Ch(e,f,g) + vshr.u64 d24,d18,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d17 + vshr.u64 d25,d18,#34 + vsli.64 d24,d18,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d18,#39 + vadd.i64 d28,d14 + vsli.64 d25,d18,#30 + veor d30,d18,d19 + vsli.64 d26,d18,#25 + veor d17,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d20,d19 @ Maj(a,b,c) + veor d17,d26 @ Sigma0(a) + vadd.i64 d21,d27 + vadd.i64 d30,d27 + @ vadd.i64 d17,d30 + vshr.u64 d24,d21,#14 @ 31 +#if 31<16 + vld1.64 {d15},[r1]! @ handles unaligned +#endif + vshr.u64 d25,d21,#18 +#if 31>0 + vadd.i64 d17,d30 @ h+=Maj from the past +#endif + vshr.u64 d26,d21,#41 + vld1.64 {d28},[r3,:64]! 
@ K[i++] + vsli.64 d24,d21,#50 + vsli.64 d25,d21,#46 + vmov d29,d21 + vsli.64 d26,d21,#23 +#if 31<16 && defined(__ARMEL__) + vrev64.8 , +#endif + veor d25,d24 + vbsl d29,d22,d23 @ Ch(e,f,g) + vshr.u64 d24,d17,#28 + veor d26,d25 @ Sigma1(e) + vadd.i64 d27,d29,d16 + vshr.u64 d25,d17,#34 + vsli.64 d24,d17,#36 + vadd.i64 d27,d26 + vshr.u64 d26,d17,#39 + vadd.i64 d28,d15 + vsli.64 d25,d17,#30 + veor d30,d17,d18 + vsli.64 d26,d17,#25 + veor d16,d24,d25 + vadd.i64 d27,d28 + vbsl d30,d19,d18 @ Maj(a,b,c) + veor d16,d26 @ Sigma0(a) + vadd.i64 d20,d27 + vadd.i64 d30,d27 + @ vadd.i64 d16,d30 + bne .L16_79_neon + + vadd.i64 d16,d30 @ h+=Maj from the past + vldmia r0,{d24,d25,d26,d27,d28,d29,d30,d31} @ load context to temp + vadd.i64 q8,q12 @ vectorized accumulate + vadd.i64 q9,q13 + vadd.i64 q10,q14 + vadd.i64 q11,q15 + vstmia r0,{d16,d17,d18,d19,d20,d21,d22,d23} @ save context + teq r1,r2 + sub r3,#640 @ rewind K512 + bne .Loop_neon + + VFP_ABI_POP + bx lr @ .word 0xe12fff1e +.size sha512_block_data_order_neon,.-sha512_block_data_order_neon +#endif +.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) diff --git a/third_party/boringssl/gen/bcm/sha512-armv8-apple.S b/third_party/boringssl/gen/bcm/sha512-armv8-apple.S new file mode 100644 index 00000000..b6403a9c --- /dev/null +++ b/third_party/boringssl/gen/bcm/sha512-armv8-apple.S @@ -0,0 +1,1595 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) +// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// ==================================================================== +// Written by Andy Polyakov for the OpenSSL +// project. +// ==================================================================== +// +// SHA256/512 for ARMv8. +// +// Performance in cycles per processed byte and improvement coefficient +// over code generated with "default" compiler: +// +// SHA256-hw SHA256(*) SHA512 +// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) +// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +// Denver 2.01 10.5 (+26%) 6.70 (+8%) +// X-Gene 20.0 (+100%) 12.8 (+300%(***)) +// Mongoose 2.36 13.0 (+50%) 8.36 (+33%) +// Kryo 1.92 17.4 (+30%) 11.2 (+8%) +// +// (*) Software SHA256 results are of lesser relevance, presented +// mostly for informational purposes. +// (**) The result is a trade-off: it's possible to improve it by +// 10% (or by 1 cycle per round), but at the cost of 20% loss +// on Cortex-A53 (or by 4 cycles per round). +// (***) Super-impressive coefficients over gcc-generated code are +// indication of some compiler "pathology", most notably code +// generated with -mgeneral-regs-only is significantly faster +// and the gap is only 40-90%. + +.text + +.globl _sha512_block_data_order_nohw +.private_extern _sha512_block_data_order_nohw + +.align 6 +_sha512_block_data_order_nohw: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*8 + + ldp x20,x21,[x0] // load context + ldp x22,x23,[x0,#2*8] + ldp x24,x25,[x0,#4*8] + add x2,x1,x2,lsl#7 // end of input + ldp x26,x27,[x0,#6*8] + adrp x30,LK512@PAGE + add x30,x30,LK512@PAGEOFF + stp x0,x2,[x29,#96] + +Loop: + ldp x3,x4,[x1],#2*8 + ldr x19,[x30],#8 // *K++ + eor x28,x21,x22 // magic seed + str x1,[x29,#112] +#ifndef __AARCH64EB__ + rev x3,x3 // 0 +#endif + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + eor x6,x24,x24,ror#23 + and x17,x25,x24 + bic x19,x26,x24 + add x27,x27,x3 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x6,ror#18 // Sigma1(e) + ror x6,x20,#28 + add x27,x27,x17 // h+=Ch(e,f,g) + eor x17,x20,x20,ror#5 + add x27,x27,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x23,x23,x27 // d+=h + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x6,x17,ror#34 // Sigma0(a) + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x27,x27,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x4,x4 // 1 +#endif + ldp x5,x6,[x1],#2*8 + add x27,x27,x17 // h+=Sigma0(a) + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + eor x7,x23,x23,ror#23 + and x17,x24,x23 + bic x28,x25,x23 + add x26,x26,x4 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x7,ror#18 // Sigma1(e) + ror x7,x27,#28 + add x26,x26,x17 // h+=Ch(e,f,g) + eor x17,x27,x27,ror#5 + add x26,x26,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x22,x22,x26 // d+=h + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x7,x17,ror#34 // Sigma0(a) + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x26,x26,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x5,x5 // 2 +#endif + add x26,x26,x17 // h+=Sigma0(a) + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + eor x8,x22,x22,ror#23 + and 
x17,x23,x22 + bic x19,x24,x22 + add x25,x25,x5 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x8,ror#18 // Sigma1(e) + ror x8,x26,#28 + add x25,x25,x17 // h+=Ch(e,f,g) + eor x17,x26,x26,ror#5 + add x25,x25,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x21,x21,x25 // d+=h + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x8,x17,ror#34 // Sigma0(a) + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x25,x25,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x6,x6 // 3 +#endif + ldp x7,x8,[x1],#2*8 + add x25,x25,x17 // h+=Sigma0(a) + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + eor x9,x21,x21,ror#23 + and x17,x22,x21 + bic x28,x23,x21 + add x24,x24,x6 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x9,ror#18 // Sigma1(e) + ror x9,x25,#28 + add x24,x24,x17 // h+=Ch(e,f,g) + eor x17,x25,x25,ror#5 + add x24,x24,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x20,x20,x24 // d+=h + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x9,x17,ror#34 // Sigma0(a) + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x24,x24,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x7,x7 // 4 +#endif + add x24,x24,x17 // h+=Sigma0(a) + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + eor x10,x20,x20,ror#23 + and x17,x21,x20 + bic x19,x22,x20 + add x23,x23,x7 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x10,ror#18 // Sigma1(e) + ror x10,x24,#28 + add x23,x23,x17 // h+=Ch(e,f,g) + eor x17,x24,x24,ror#5 + add x23,x23,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x27,x27,x23 // d+=h + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x10,x17,ror#34 // Sigma0(a) + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x23,x23,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x8,x8 // 5 +#endif + ldp x9,x10,[x1],#2*8 + add 
x23,x23,x17 // h+=Sigma0(a) + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + eor x11,x27,x27,ror#23 + and x17,x20,x27 + bic x28,x21,x27 + add x22,x22,x8 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x11,ror#18 // Sigma1(e) + ror x11,x23,#28 + add x22,x22,x17 // h+=Ch(e,f,g) + eor x17,x23,x23,ror#5 + add x22,x22,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x26,x26,x22 // d+=h + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x11,x17,ror#34 // Sigma0(a) + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x22,x22,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x9,x9 // 6 +#endif + add x22,x22,x17 // h+=Sigma0(a) + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + eor x12,x26,x26,ror#23 + and x17,x27,x26 + bic x19,x20,x26 + add x21,x21,x9 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x12,ror#18 // Sigma1(e) + ror x12,x22,#28 + add x21,x21,x17 // h+=Ch(e,f,g) + eor x17,x22,x22,ror#5 + add x21,x21,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x25,x25,x21 // d+=h + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x12,x17,ror#34 // Sigma0(a) + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x21,x21,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x10,x10 // 7 +#endif + ldp x11,x12,[x1],#2*8 + add x21,x21,x17 // h+=Sigma0(a) + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + eor x13,x25,x25,ror#23 + and x17,x26,x25 + bic x28,x27,x25 + add x20,x20,x10 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x13,ror#18 // Sigma1(e) + ror x13,x21,#28 + add x20,x20,x17 // h+=Ch(e,f,g) + eor x17,x21,x21,ror#5 + add x20,x20,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x24,x24,x20 // d+=h + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x13,x17,ror#34 // Sigma0(a) + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next 
round + //add x20,x20,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x11,x11 // 8 +#endif + add x20,x20,x17 // h+=Sigma0(a) + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + eor x14,x24,x24,ror#23 + and x17,x25,x24 + bic x19,x26,x24 + add x27,x27,x11 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x14,ror#18 // Sigma1(e) + ror x14,x20,#28 + add x27,x27,x17 // h+=Ch(e,f,g) + eor x17,x20,x20,ror#5 + add x27,x27,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x23,x23,x27 // d+=h + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x14,x17,ror#34 // Sigma0(a) + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x27,x27,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x12,x12 // 9 +#endif + ldp x13,x14,[x1],#2*8 + add x27,x27,x17 // h+=Sigma0(a) + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + eor x15,x23,x23,ror#23 + and x17,x24,x23 + bic x28,x25,x23 + add x26,x26,x12 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x15,ror#18 // Sigma1(e) + ror x15,x27,#28 + add x26,x26,x17 // h+=Ch(e,f,g) + eor x17,x27,x27,ror#5 + add x26,x26,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x22,x22,x26 // d+=h + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x15,x17,ror#34 // Sigma0(a) + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x26,x26,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x13,x13 // 10 +#endif + add x26,x26,x17 // h+=Sigma0(a) + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + eor x0,x22,x22,ror#23 + and x17,x23,x22 + bic x19,x24,x22 + add x25,x25,x13 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x0,ror#18 // Sigma1(e) + ror x0,x26,#28 + add x25,x25,x17 // h+=Ch(e,f,g) + eor x17,x26,x26,ror#5 + add x25,x25,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x21,x21,x25 // d+=h + eor x28,x28,x27 // Maj(a,b,c) + eor 
x17,x0,x17,ror#34 // Sigma0(a) + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x25,x25,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x14,x14 // 11 +#endif + ldp x15,x0,[x1],#2*8 + add x25,x25,x17 // h+=Sigma0(a) + str x6,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + eor x6,x21,x21,ror#23 + and x17,x22,x21 + bic x28,x23,x21 + add x24,x24,x14 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x6,ror#18 // Sigma1(e) + ror x6,x25,#28 + add x24,x24,x17 // h+=Ch(e,f,g) + eor x17,x25,x25,ror#5 + add x24,x24,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x20,x20,x24 // d+=h + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x6,x17,ror#34 // Sigma0(a) + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x24,x24,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x15,x15 // 12 +#endif + add x24,x24,x17 // h+=Sigma0(a) + str x7,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + eor x7,x20,x20,ror#23 + and x17,x21,x20 + bic x19,x22,x20 + add x23,x23,x15 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x7,ror#18 // Sigma1(e) + ror x7,x24,#28 + add x23,x23,x17 // h+=Ch(e,f,g) + eor x17,x24,x24,ror#5 + add x23,x23,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x27,x27,x23 // d+=h + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x7,x17,ror#34 // Sigma0(a) + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x23,x23,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x0,x0 // 13 +#endif + ldp x1,x2,[x1] + add x23,x23,x17 // h+=Sigma0(a) + str x8,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + eor x8,x27,x27,ror#23 + and x17,x20,x27 + bic x28,x21,x27 + add x22,x22,x0 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x8,ror#18 // Sigma1(e) + ror x8,x23,#28 + add x22,x22,x17 // h+=Ch(e,f,g) + 
eor x17,x23,x23,ror#5 + add x22,x22,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x26,x26,x22 // d+=h + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x8,x17,ror#34 // Sigma0(a) + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x22,x22,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x1,x1 // 14 +#endif + ldr x6,[sp,#24] + add x22,x22,x17 // h+=Sigma0(a) + str x9,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + eor x9,x26,x26,ror#23 + and x17,x27,x26 + bic x19,x20,x26 + add x21,x21,x1 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x9,ror#18 // Sigma1(e) + ror x9,x22,#28 + add x21,x21,x17 // h+=Ch(e,f,g) + eor x17,x22,x22,ror#5 + add x21,x21,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x25,x25,x21 // d+=h + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x9,x17,ror#34 // Sigma0(a) + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x21,x21,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x2,x2 // 15 +#endif + ldr x7,[sp,#0] + add x21,x21,x17 // h+=Sigma0(a) + str x10,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x9,x4,#1 + and x17,x26,x25 + ror x8,x1,#19 + bic x28,x27,x25 + ror x10,x21,#28 + add x20,x20,x2 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x9,x9,x4,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x10,x10,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x8,x8,x1,ror#61 + eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x10,x21,ror#39 // Sigma0(a) + eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) + add x3,x3,x12 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x3,x3,x9 + add x20,x20,x17 // h+=Sigma0(a) + add x3,x3,x8 +Loop_16_xx: + ldr x8,[sp,#8] + str x11,[sp,#0] + ror 
x16,x24,#14 + add x27,x27,x19 // h+=K[i] + ror x10,x5,#1 + and x17,x25,x24 + ror x9,x2,#19 + bic x19,x26,x24 + ror x11,x20,#28 + add x27,x27,x3 // h+=X[i] + eor x16,x16,x24,ror#18 + eor x10,x10,x5,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x24,ror#41 // Sigma1(e) + eor x11,x11,x20,ror#34 + add x27,x27,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x9,x9,x2,ror#61 + eor x10,x10,x5,lsr#7 // sigma0(X[i+1]) + add x27,x27,x16 // h+=Sigma1(e) + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x11,x20,ror#39 // Sigma0(a) + eor x9,x9,x2,lsr#6 // sigma1(X[i+14]) + add x4,x4,x13 + add x23,x23,x27 // d+=h + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x4,x4,x10 + add x27,x27,x17 // h+=Sigma0(a) + add x4,x4,x9 + ldr x9,[sp,#16] + str x12,[sp,#8] + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + ror x11,x6,#1 + and x17,x24,x23 + ror x10,x3,#19 + bic x28,x25,x23 + ror x12,x27,#28 + add x26,x26,x4 // h+=X[i] + eor x16,x16,x23,ror#18 + eor x11,x11,x6,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x23,ror#41 // Sigma1(e) + eor x12,x12,x27,ror#34 + add x26,x26,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x10,x10,x3,ror#61 + eor x11,x11,x6,lsr#7 // sigma0(X[i+1]) + add x26,x26,x16 // h+=Sigma1(e) + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x12,x27,ror#39 // Sigma0(a) + eor x10,x10,x3,lsr#6 // sigma1(X[i+14]) + add x5,x5,x14 + add x22,x22,x26 // d+=h + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x5,x5,x11 + add x26,x26,x17 // h+=Sigma0(a) + add x5,x5,x10 + ldr x10,[sp,#24] + str x13,[sp,#16] + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + ror x12,x7,#1 + and x17,x23,x22 + ror x11,x4,#19 + bic x19,x24,x22 + ror x13,x26,#28 + add x25,x25,x5 // h+=X[i] + eor x16,x16,x22,ror#18 + eor x12,x12,x7,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor 
x16,x16,x22,ror#41 // Sigma1(e) + eor x13,x13,x26,ror#34 + add x25,x25,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x11,x11,x4,ror#61 + eor x12,x12,x7,lsr#7 // sigma0(X[i+1]) + add x25,x25,x16 // h+=Sigma1(e) + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x13,x26,ror#39 // Sigma0(a) + eor x11,x11,x4,lsr#6 // sigma1(X[i+14]) + add x6,x6,x15 + add x21,x21,x25 // d+=h + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x6,x6,x12 + add x25,x25,x17 // h+=Sigma0(a) + add x6,x6,x11 + ldr x11,[sp,#0] + str x14,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + ror x13,x8,#1 + and x17,x22,x21 + ror x12,x5,#19 + bic x28,x23,x21 + ror x14,x25,#28 + add x24,x24,x6 // h+=X[i] + eor x16,x16,x21,ror#18 + eor x13,x13,x8,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x21,ror#41 // Sigma1(e) + eor x14,x14,x25,ror#34 + add x24,x24,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x12,x12,x5,ror#61 + eor x13,x13,x8,lsr#7 // sigma0(X[i+1]) + add x24,x24,x16 // h+=Sigma1(e) + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x14,x25,ror#39 // Sigma0(a) + eor x12,x12,x5,lsr#6 // sigma1(X[i+14]) + add x7,x7,x0 + add x20,x20,x24 // d+=h + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x7,x7,x13 + add x24,x24,x17 // h+=Sigma0(a) + add x7,x7,x12 + ldr x12,[sp,#8] + str x15,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + ror x14,x9,#1 + and x17,x21,x20 + ror x13,x6,#19 + bic x19,x22,x20 + ror x15,x24,#28 + add x23,x23,x7 // h+=X[i] + eor x16,x16,x20,ror#18 + eor x14,x14,x9,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x20,ror#41 // Sigma1(e) + eor x15,x15,x24,ror#34 + add x23,x23,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x13,x13,x6,ror#61 + eor x14,x14,x9,lsr#7 // sigma0(X[i+1]) + add x23,x23,x16 // h+=Sigma1(e) + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x15,x24,ror#39 // Sigma0(a) + 
eor x13,x13,x6,lsr#6 // sigma1(X[i+14]) + add x8,x8,x1 + add x27,x27,x23 // d+=h + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x8,x8,x14 + add x23,x23,x17 // h+=Sigma0(a) + add x8,x8,x13 + ldr x13,[sp,#16] + str x0,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + ror x15,x10,#1 + and x17,x20,x27 + ror x14,x7,#19 + bic x28,x21,x27 + ror x0,x23,#28 + add x22,x22,x8 // h+=X[i] + eor x16,x16,x27,ror#18 + eor x15,x15,x10,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x27,ror#41 // Sigma1(e) + eor x0,x0,x23,ror#34 + add x22,x22,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x14,x14,x7,ror#61 + eor x15,x15,x10,lsr#7 // sigma0(X[i+1]) + add x22,x22,x16 // h+=Sigma1(e) + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x0,x23,ror#39 // Sigma0(a) + eor x14,x14,x7,lsr#6 // sigma1(X[i+14]) + add x9,x9,x2 + add x26,x26,x22 // d+=h + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x9,x9,x15 + add x22,x22,x17 // h+=Sigma0(a) + add x9,x9,x14 + ldr x14,[sp,#24] + str x1,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + ror x0,x11,#1 + and x17,x27,x26 + ror x15,x8,#19 + bic x19,x20,x26 + ror x1,x22,#28 + add x21,x21,x9 // h+=X[i] + eor x16,x16,x26,ror#18 + eor x0,x0,x11,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x26,ror#41 // Sigma1(e) + eor x1,x1,x22,ror#34 + add x21,x21,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x15,x15,x8,ror#61 + eor x0,x0,x11,lsr#7 // sigma0(X[i+1]) + add x21,x21,x16 // h+=Sigma1(e) + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x1,x22,ror#39 // Sigma0(a) + eor x15,x15,x8,lsr#6 // sigma1(X[i+14]) + add x10,x10,x3 + add x25,x25,x21 // d+=h + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x10,x10,x0 + add x21,x21,x17 // h+=Sigma0(a) + add x10,x10,x15 + ldr x15,[sp,#0] + str x2,[sp,#24] + ror x16,x25,#14 + add 
x20,x20,x28 // h+=K[i] + ror x1,x12,#1 + and x17,x26,x25 + ror x0,x9,#19 + bic x28,x27,x25 + ror x2,x21,#28 + add x20,x20,x10 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x1,x1,x12,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x2,x2,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x0,x0,x9,ror#61 + eor x1,x1,x12,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x2,x21,ror#39 // Sigma0(a) + eor x0,x0,x9,lsr#6 // sigma1(X[i+14]) + add x11,x11,x4 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x11,x11,x1 + add x20,x20,x17 // h+=Sigma0(a) + add x11,x11,x0 + ldr x0,[sp,#8] + str x3,[sp,#0] + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + ror x2,x13,#1 + and x17,x25,x24 + ror x1,x10,#19 + bic x19,x26,x24 + ror x3,x20,#28 + add x27,x27,x11 // h+=X[i] + eor x16,x16,x24,ror#18 + eor x2,x2,x13,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x24,ror#41 // Sigma1(e) + eor x3,x3,x20,ror#34 + add x27,x27,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x1,x1,x10,ror#61 + eor x2,x2,x13,lsr#7 // sigma0(X[i+1]) + add x27,x27,x16 // h+=Sigma1(e) + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x3,x20,ror#39 // Sigma0(a) + eor x1,x1,x10,lsr#6 // sigma1(X[i+14]) + add x12,x12,x5 + add x23,x23,x27 // d+=h + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x12,x12,x2 + add x27,x27,x17 // h+=Sigma0(a) + add x12,x12,x1 + ldr x1,[sp,#16] + str x4,[sp,#8] + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + ror x3,x14,#1 + and x17,x24,x23 + ror x2,x11,#19 + bic x28,x25,x23 + ror x4,x27,#28 + add x26,x26,x12 // h+=X[i] + eor x16,x16,x23,ror#18 + eor x3,x3,x14,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x23,ror#41 // Sigma1(e) + eor 
x4,x4,x27,ror#34 + add x26,x26,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x2,x2,x11,ror#61 + eor x3,x3,x14,lsr#7 // sigma0(X[i+1]) + add x26,x26,x16 // h+=Sigma1(e) + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x4,x27,ror#39 // Sigma0(a) + eor x2,x2,x11,lsr#6 // sigma1(X[i+14]) + add x13,x13,x6 + add x22,x22,x26 // d+=h + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x13,x13,x3 + add x26,x26,x17 // h+=Sigma0(a) + add x13,x13,x2 + ldr x2,[sp,#24] + str x5,[sp,#16] + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + ror x4,x15,#1 + and x17,x23,x22 + ror x3,x12,#19 + bic x19,x24,x22 + ror x5,x26,#28 + add x25,x25,x13 // h+=X[i] + eor x16,x16,x22,ror#18 + eor x4,x4,x15,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x22,ror#41 // Sigma1(e) + eor x5,x5,x26,ror#34 + add x25,x25,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x3,x3,x12,ror#61 + eor x4,x4,x15,lsr#7 // sigma0(X[i+1]) + add x25,x25,x16 // h+=Sigma1(e) + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x5,x26,ror#39 // Sigma0(a) + eor x3,x3,x12,lsr#6 // sigma1(X[i+14]) + add x14,x14,x7 + add x21,x21,x25 // d+=h + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x14,x14,x4 + add x25,x25,x17 // h+=Sigma0(a) + add x14,x14,x3 + ldr x3,[sp,#0] + str x6,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + ror x5,x0,#1 + and x17,x22,x21 + ror x4,x13,#19 + bic x28,x23,x21 + ror x6,x25,#28 + add x24,x24,x14 // h+=X[i] + eor x16,x16,x21,ror#18 + eor x5,x5,x0,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x21,ror#41 // Sigma1(e) + eor x6,x6,x25,ror#34 + add x24,x24,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x4,x4,x13,ror#61 + eor x5,x5,x0,lsr#7 // sigma0(X[i+1]) + add x24,x24,x16 // h+=Sigma1(e) + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x6,x25,ror#39 // Sigma0(a) + eor x4,x4,x13,lsr#6 // sigma1(X[i+14]) + add 
x15,x15,x8 + add x20,x20,x24 // d+=h + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x15,x15,x5 + add x24,x24,x17 // h+=Sigma0(a) + add x15,x15,x4 + ldr x4,[sp,#8] + str x7,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + ror x6,x1,#1 + and x17,x21,x20 + ror x5,x14,#19 + bic x19,x22,x20 + ror x7,x24,#28 + add x23,x23,x15 // h+=X[i] + eor x16,x16,x20,ror#18 + eor x6,x6,x1,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x20,ror#41 // Sigma1(e) + eor x7,x7,x24,ror#34 + add x23,x23,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x5,x5,x14,ror#61 + eor x6,x6,x1,lsr#7 // sigma0(X[i+1]) + add x23,x23,x16 // h+=Sigma1(e) + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x7,x24,ror#39 // Sigma0(a) + eor x5,x5,x14,lsr#6 // sigma1(X[i+14]) + add x0,x0,x9 + add x27,x27,x23 // d+=h + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x0,x0,x6 + add x23,x23,x17 // h+=Sigma0(a) + add x0,x0,x5 + ldr x5,[sp,#16] + str x8,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + ror x7,x2,#1 + and x17,x20,x27 + ror x6,x15,#19 + bic x28,x21,x27 + ror x8,x23,#28 + add x22,x22,x0 // h+=X[i] + eor x16,x16,x27,ror#18 + eor x7,x7,x2,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x27,ror#41 // Sigma1(e) + eor x8,x8,x23,ror#34 + add x22,x22,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x6,x6,x15,ror#61 + eor x7,x7,x2,lsr#7 // sigma0(X[i+1]) + add x22,x22,x16 // h+=Sigma1(e) + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x8,x23,ror#39 // Sigma0(a) + eor x6,x6,x15,lsr#6 // sigma1(X[i+14]) + add x1,x1,x10 + add x26,x26,x22 // d+=h + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x1,x1,x7 + add x22,x22,x17 // h+=Sigma0(a) + add x1,x1,x6 + ldr x6,[sp,#24] + str x9,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + ror x8,x3,#1 + and x17,x27,x26 + ror x7,x0,#19 + bic 
x19,x20,x26 + ror x9,x22,#28 + add x21,x21,x1 // h+=X[i] + eor x16,x16,x26,ror#18 + eor x8,x8,x3,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x26,ror#41 // Sigma1(e) + eor x9,x9,x22,ror#34 + add x21,x21,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x7,x7,x0,ror#61 + eor x8,x8,x3,lsr#7 // sigma0(X[i+1]) + add x21,x21,x16 // h+=Sigma1(e) + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x9,x22,ror#39 // Sigma0(a) + eor x7,x7,x0,lsr#6 // sigma1(X[i+14]) + add x2,x2,x11 + add x25,x25,x21 // d+=h + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x2,x2,x8 + add x21,x21,x17 // h+=Sigma0(a) + add x2,x2,x7 + ldr x7,[sp,#0] + str x10,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x9,x4,#1 + and x17,x26,x25 + ror x8,x1,#19 + bic x28,x27,x25 + ror x10,x21,#28 + add x20,x20,x2 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x9,x9,x4,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x10,x10,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x8,x8,x1,ror#61 + eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x10,x21,ror#39 // Sigma0(a) + eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) + add x3,x3,x12 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x3,x3,x9 + add x20,x20,x17 // h+=Sigma0(a) + add x3,x3,x8 + cbnz x19,Loop_16_xx + + ldp x0,x2,[x29,#96] + ldr x1,[x29,#112] + sub x30,x30,#648 // rewind + + ldp x3,x4,[x0] + ldp x5,x6,[x0,#2*8] + add x1,x1,#14*8 // advance input pointer + ldp x7,x8,[x0,#4*8] + add x20,x20,x3 + ldp x9,x10,[x0,#6*8] + add x21,x21,x4 + add x22,x22,x5 + add x23,x23,x6 + stp x20,x21,[x0] + add x24,x24,x7 + add x25,x25,x8 + stp x22,x23,[x0,#2*8] + add x26,x26,x9 + add x27,x27,x10 + cmp x1,x2 + stp x24,x25,[x0,#4*8] + stp 
x26,x27,[x0,#6*8] + b.ne Loop + + ldp x19,x20,[x29,#16] + add sp,sp,#4*8 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +.section __TEXT,__const +.align 6 + +LK512: +.quad 0x428a2f98d728ae22,0x7137449123ef65cd +.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc +.quad 0x3956c25bf348b538,0x59f111f1b605d019 +.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 +.quad 0xd807aa98a3030242,0x12835b0145706fbe +.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 +.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 +.quad 0x9bdc06a725c71235,0xc19bf174cf692694 +.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 +.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 +.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 +.quad 0x983e5152ee66dfab,0xa831c66d2db43210 +.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 +.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 +.quad 0x06ca6351e003826f,0x142929670a0e6e70 +.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 +.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df +.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 +.quad 0x81c2c92e47edaee6,0x92722c851482353b +.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 +.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 +.quad 0xd192e819d6ef5218,0xd69906245565a910 +.quad 0xf40e35855771202a,0x106aa07032bbd1b8 +.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 +.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb +.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 +.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 +.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec +.quad 0x90befffa23631e28,0xa4506cebde82bde9 +.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b +.quad 0xca273eceea26619c,0xd186b8c721c0c207 +.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 +.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 +.quad 0x113f9804bef90dae,0x1b710b35131c471b +.quad 0x28db77f523047d84,0x32caab7b40c72493 +.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c +.quad 
0x4cc5d4becb3e42b6,0x597f299cfc657e2a +.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 +.quad 0 // terminator + +.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +.text +#ifndef __KERNEL__ +.globl _sha512_block_data_order_hw +.private_extern _sha512_block_data_order_hw + +.align 6 +_sha512_block_data_order_hw: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 // load input + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + + ld1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // load context + adrp x3,LK512@PAGE + add x3,x3,LK512@PAGEOFF + + rev64 v16.16b,v16.16b + rev64 v17.16b,v17.16b + rev64 v18.16b,v18.16b + rev64 v19.16b,v19.16b + rev64 v20.16b,v20.16b + rev64 v21.16b,v21.16b + rev64 v22.16b,v22.16b + rev64 v23.16b,v23.16b + b Loop_hw + +.align 4 +Loop_hw: + ld1 {v24.2d},[x3],#16 + subs x2,x2,#1 + sub x4,x1,#128 + orr v26.16b,v0.16b,v0.16b // offload + orr v27.16b,v1.16b,v1.16b + orr v28.16b,v2.16b,v2.16b + orr v29.16b,v3.16b,v3.16b + csel x1,x1,x4,ne // conditional rewind + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08230 //sha512su0 v16.2d,v17.2d + ext v7.16b,v20.16b,v21.16b,#8 +.long 0xce6680a3 //sha512h q3,q5,v6.2d +.long 0xce678af0 //sha512su1 v16.2d,v23.2d,v7.2d + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 q3,q1,v0.2d + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08251 //sha512su0 
v17.2d,v18.2d + ext v7.16b,v21.16b,v22.16b,#8 +.long 0xce6680a2 //sha512h q2,q5,v6.2d +.long 0xce678a11 //sha512su1 v17.2d,v16.2d,v7.2d + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 q2,q0,v3.2d + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08272 //sha512su0 v18.2d,v19.2d + ext v7.16b,v22.16b,v23.16b,#8 +.long 0xce6680a4 //sha512h q4,q5,v6.2d +.long 0xce678a32 //sha512su1 v18.2d,v17.2d,v7.2d + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 q4,q3,v2.2d + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08293 //sha512su0 v19.2d,v20.2d + ext v7.16b,v23.16b,v16.16b,#8 +.long 0xce6680a1 //sha512h q1,q5,v6.2d +.long 0xce678a53 //sha512su1 v19.2d,v18.2d,v7.2d + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 q1,q2,v4.2d + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082b4 //sha512su0 v20.2d,v21.2d + ext v7.16b,v16.16b,v17.16b,#8 +.long 0xce6680a0 //sha512h q0,q5,v6.2d +.long 0xce678a74 //sha512su1 v20.2d,v19.2d,v7.2d + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 q0,q4,v1.2d + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec082d5 //sha512su0 v21.2d,v22.2d + ext v7.16b,v17.16b,v18.16b,#8 +.long 0xce6680a3 //sha512h q3,q5,v6.2d +.long 0xce678a95 //sha512su1 v21.2d,v20.2d,v7.2d + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 q3,q1,v0.2d + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 
+ ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082f6 //sha512su0 v22.2d,v23.2d + ext v7.16b,v18.16b,v19.16b,#8 +.long 0xce6680a2 //sha512h q2,q5,v6.2d +.long 0xce678ab6 //sha512su1 v22.2d,v21.2d,v7.2d + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 q2,q0,v3.2d + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08217 //sha512su0 v23.2d,v16.2d + ext v7.16b,v19.16b,v20.16b,#8 +.long 0xce6680a4 //sha512h q4,q5,v6.2d +.long 0xce678ad7 //sha512su1 v23.2d,v22.2d,v7.2d + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 q4,q3,v2.2d + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08230 //sha512su0 v16.2d,v17.2d + ext v7.16b,v20.16b,v21.16b,#8 +.long 0xce6680a1 //sha512h q1,q5,v6.2d +.long 0xce678af0 //sha512su1 v16.2d,v23.2d,v7.2d + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 q1,q2,v4.2d + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08251 //sha512su0 v17.2d,v18.2d + ext v7.16b,v21.16b,v22.16b,#8 +.long 0xce6680a0 //sha512h q0,q5,v6.2d +.long 0xce678a11 //sha512su1 v17.2d,v16.2d,v7.2d + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 q0,q4,v1.2d + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08272 //sha512su0 v18.2d,v19.2d + ext v7.16b,v22.16b,v23.16b,#8 +.long 0xce6680a3 //sha512h q3,q5,v6.2d +.long 
0xce678a32 //sha512su1 v18.2d,v17.2d,v7.2d + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 q3,q1,v0.2d + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08293 //sha512su0 v19.2d,v20.2d + ext v7.16b,v23.16b,v16.16b,#8 +.long 0xce6680a2 //sha512h q2,q5,v6.2d +.long 0xce678a53 //sha512su1 v19.2d,v18.2d,v7.2d + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 q2,q0,v3.2d + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082b4 //sha512su0 v20.2d,v21.2d + ext v7.16b,v16.16b,v17.16b,#8 +.long 0xce6680a4 //sha512h q4,q5,v6.2d +.long 0xce678a74 //sha512su1 v20.2d,v19.2d,v7.2d + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 q4,q3,v2.2d + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec082d5 //sha512su0 v21.2d,v22.2d + ext v7.16b,v17.16b,v18.16b,#8 +.long 0xce6680a1 //sha512h q1,q5,v6.2d +.long 0xce678a95 //sha512su1 v21.2d,v20.2d,v7.2d + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 q1,q2,v4.2d + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082f6 //sha512su0 v22.2d,v23.2d + ext v7.16b,v18.16b,v19.16b,#8 +.long 0xce6680a0 //sha512h q0,q5,v6.2d +.long 0xce678ab6 //sha512su1 v22.2d,v21.2d,v7.2d + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 q0,q4,v1.2d + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 
+ add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08217 //sha512su0 v23.2d,v16.2d + ext v7.16b,v19.16b,v20.16b,#8 +.long 0xce6680a3 //sha512h q3,q5,v6.2d +.long 0xce678ad7 //sha512su1 v23.2d,v22.2d,v7.2d + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 q3,q1,v0.2d + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08230 //sha512su0 v16.2d,v17.2d + ext v7.16b,v20.16b,v21.16b,#8 +.long 0xce6680a2 //sha512h q2,q5,v6.2d +.long 0xce678af0 //sha512su1 v16.2d,v23.2d,v7.2d + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 q2,q0,v3.2d + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08251 //sha512su0 v17.2d,v18.2d + ext v7.16b,v21.16b,v22.16b,#8 +.long 0xce6680a4 //sha512h q4,q5,v6.2d +.long 0xce678a11 //sha512su1 v17.2d,v16.2d,v7.2d + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 q4,q3,v2.2d + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08272 //sha512su0 v18.2d,v19.2d + ext v7.16b,v22.16b,v23.16b,#8 +.long 0xce6680a1 //sha512h q1,q5,v6.2d +.long 0xce678a32 //sha512su1 v18.2d,v17.2d,v7.2d + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 q1,q2,v4.2d + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08293 //sha512su0 v19.2d,v20.2d + ext v7.16b,v23.16b,v16.16b,#8 +.long 0xce6680a0 //sha512h q0,q5,v6.2d +.long 0xce678a53 //sha512su1 v19.2d,v18.2d,v7.2d + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 
0xce618480 //sha512h2 q0,q4,v1.2d + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082b4 //sha512su0 v20.2d,v21.2d + ext v7.16b,v16.16b,v17.16b,#8 +.long 0xce6680a3 //sha512h q3,q5,v6.2d +.long 0xce678a74 //sha512su1 v20.2d,v19.2d,v7.2d + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 q3,q1,v0.2d + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec082d5 //sha512su0 v21.2d,v22.2d + ext v7.16b,v17.16b,v18.16b,#8 +.long 0xce6680a2 //sha512h q2,q5,v6.2d +.long 0xce678a95 //sha512su1 v21.2d,v20.2d,v7.2d + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 q2,q0,v3.2d + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082f6 //sha512su0 v22.2d,v23.2d + ext v7.16b,v18.16b,v19.16b,#8 +.long 0xce6680a4 //sha512h q4,q5,v6.2d +.long 0xce678ab6 //sha512su1 v22.2d,v21.2d,v7.2d + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 q4,q3,v2.2d + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08217 //sha512su0 v23.2d,v16.2d + ext v7.16b,v19.16b,v20.16b,#8 +.long 0xce6680a1 //sha512h q1,q5,v6.2d +.long 0xce678ad7 //sha512su1 v23.2d,v22.2d,v7.2d + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 q1,q2,v4.2d + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08230 //sha512su0 
v16.2d,v17.2d + ext v7.16b,v20.16b,v21.16b,#8 +.long 0xce6680a0 //sha512h q0,q5,v6.2d +.long 0xce678af0 //sha512su1 v16.2d,v23.2d,v7.2d + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 q0,q4,v1.2d + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08251 //sha512su0 v17.2d,v18.2d + ext v7.16b,v21.16b,v22.16b,#8 +.long 0xce6680a3 //sha512h q3,q5,v6.2d +.long 0xce678a11 //sha512su1 v17.2d,v16.2d,v7.2d + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 q3,q1,v0.2d + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08272 //sha512su0 v18.2d,v19.2d + ext v7.16b,v22.16b,v23.16b,#8 +.long 0xce6680a2 //sha512h q2,q5,v6.2d +.long 0xce678a32 //sha512su1 v18.2d,v17.2d,v7.2d + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 q2,q0,v3.2d + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08293 //sha512su0 v19.2d,v20.2d + ext v7.16b,v23.16b,v16.16b,#8 +.long 0xce6680a4 //sha512h q4,q5,v6.2d +.long 0xce678a53 //sha512su1 v19.2d,v18.2d,v7.2d + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 q4,q3,v2.2d + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082b4 //sha512su0 v20.2d,v21.2d + ext v7.16b,v16.16b,v17.16b,#8 +.long 0xce6680a1 //sha512h q1,q5,v6.2d +.long 0xce678a74 //sha512su1 v20.2d,v19.2d,v7.2d + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 q1,q2,v4.2d + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 
+ ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec082d5 //sha512su0 v21.2d,v22.2d + ext v7.16b,v17.16b,v18.16b,#8 +.long 0xce6680a0 //sha512h q0,q5,v6.2d +.long 0xce678a95 //sha512su1 v21.2d,v20.2d,v7.2d + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 q0,q4,v1.2d + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082f6 //sha512su0 v22.2d,v23.2d + ext v7.16b,v18.16b,v19.16b,#8 +.long 0xce6680a3 //sha512h q3,q5,v6.2d +.long 0xce678ab6 //sha512su1 v22.2d,v21.2d,v7.2d + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 q3,q1,v0.2d + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08217 //sha512su0 v23.2d,v16.2d + ext v7.16b,v19.16b,v20.16b,#8 +.long 0xce6680a2 //sha512h q2,q5,v6.2d +.long 0xce678ad7 //sha512su1 v23.2d,v22.2d,v7.2d + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 q2,q0,v3.2d + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v16.2d + ld1 {v16.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.long 0xce6680a4 //sha512h q4,q5,v6.2d + rev64 v16.16b,v16.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 q4,q3,v2.2d + ld1 {v24.2d},[x3],#16 + add v25.2d,v25.2d,v17.2d + ld1 {v17.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.long 0xce6680a1 //sha512h q1,q5,v6.2d + rev64 v17.16b,v17.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 
q1,q2,v4.2d + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v18.2d + ld1 {v18.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.long 0xce6680a0 //sha512h q0,q5,v6.2d + rev64 v18.16b,v18.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 q0,q4,v1.2d + ld1 {v24.2d},[x3],#16 + add v25.2d,v25.2d,v19.2d + ld1 {v19.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.long 0xce6680a3 //sha512h q3,q5,v6.2d + rev64 v19.16b,v19.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 q3,q1,v0.2d + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v20.2d + ld1 {v20.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.long 0xce6680a2 //sha512h q2,q5,v6.2d + rev64 v20.16b,v20.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 q2,q0,v3.2d + ld1 {v24.2d},[x3],#16 + add v25.2d,v25.2d,v21.2d + ld1 {v21.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.long 0xce6680a4 //sha512h q4,q5,v6.2d + rev64 v21.16b,v21.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 q4,q3,v2.2d + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v22.2d + ld1 {v22.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.long 0xce6680a1 //sha512h q1,q5,v6.2d + rev64 v22.16b,v22.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 q1,q2,v4.2d + sub x3,x3,#80*8 // rewind + add v25.2d,v25.2d,v23.2d + ld1 {v23.16b},[x1],#16 // load next input + ext 
v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.long 0xce6680a0 //sha512h q0,q5,v6.2d + rev64 v23.16b,v23.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 q0,q4,v1.2d + add v0.2d,v0.2d,v26.2d // accumulate + add v1.2d,v1.2d,v27.2d + add v2.2d,v2.2d,v28.2d + add v3.2d,v3.2d,v29.2d + + cbnz x2,Loop_hw + + st1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // store context + + ldr x29,[sp],#16 + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) diff --git a/third_party/boringssl/gen/bcm/sha512-armv8-linux.S b/third_party/boringssl/gen/bcm/sha512-armv8-linux.S new file mode 100644 index 00000000..f046a5c4 --- /dev/null +++ b/third_party/boringssl/gen/bcm/sha512-armv8-linux.S @@ -0,0 +1,1595 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) +// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// ==================================================================== +// Written by Andy Polyakov for the OpenSSL +// project. +// ==================================================================== +// +// SHA256/512 for ARMv8. 
+// +// Performance in cycles per processed byte and improvement coefficient +// over code generated with "default" compiler: +// +// SHA256-hw SHA256(*) SHA512 +// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) +// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +// Denver 2.01 10.5 (+26%) 6.70 (+8%) +// X-Gene 20.0 (+100%) 12.8 (+300%(***)) +// Mongoose 2.36 13.0 (+50%) 8.36 (+33%) +// Kryo 1.92 17.4 (+30%) 11.2 (+8%) +// +// (*) Software SHA256 results are of lesser relevance, presented +// mostly for informational purposes. +// (**) The result is a trade-off: it's possible to improve it by +// 10% (or by 1 cycle per round), but at the cost of 20% loss +// on Cortex-A53 (or by 4 cycles per round). +// (***) Super-impressive coefficients over gcc-generated code are +// indication of some compiler "pathology", most notably code +// generated with -mgeneral-regs-only is significantly faster +// and the gap is only 40-90%. + +.text + +.globl sha512_block_data_order_nohw +.hidden sha512_block_data_order_nohw +.type sha512_block_data_order_nohw,%function +.align 6 +sha512_block_data_order_nohw: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*8 + + ldp x20,x21,[x0] // load context + ldp x22,x23,[x0,#2*8] + ldp x24,x25,[x0,#4*8] + add x2,x1,x2,lsl#7 // end of input + ldp x26,x27,[x0,#6*8] + adrp x30,.LK512 + add x30,x30,:lo12:.LK512 + stp x0,x2,[x29,#96] + +.Loop: + ldp x3,x4,[x1],#2*8 + ldr x19,[x30],#8 // *K++ + eor x28,x21,x22 // magic seed + str x1,[x29,#112] +#ifndef __AARCH64EB__ + rev x3,x3 // 0 +#endif + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + eor x6,x24,x24,ror#23 + and x17,x25,x24 + bic x19,x26,x24 + add x27,x27,x3 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x6,ror#18 // Sigma1(e) + ror x6,x20,#28 + add x27,x27,x17 // h+=Ch(e,f,g) + eor x17,x20,x20,ror#5 + add x27,x27,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x23,x23,x27 // d+=h + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x6,x17,ror#34 // Sigma0(a) + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x27,x27,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x4,x4 // 1 +#endif + ldp x5,x6,[x1],#2*8 + add x27,x27,x17 // h+=Sigma0(a) + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + eor x7,x23,x23,ror#23 + and x17,x24,x23 + bic x28,x25,x23 + add x26,x26,x4 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x7,ror#18 // Sigma1(e) + ror x7,x27,#28 + add x26,x26,x17 // h+=Ch(e,f,g) + eor x17,x27,x27,ror#5 + add x26,x26,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x22,x22,x26 // d+=h + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x7,x17,ror#34 // Sigma0(a) + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x26,x26,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x5,x5 // 2 +#endif + add x26,x26,x17 // h+=Sigma0(a) + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + eor x8,x22,x22,ror#23 + and 
x17,x23,x22 + bic x19,x24,x22 + add x25,x25,x5 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x8,ror#18 // Sigma1(e) + ror x8,x26,#28 + add x25,x25,x17 // h+=Ch(e,f,g) + eor x17,x26,x26,ror#5 + add x25,x25,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x21,x21,x25 // d+=h + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x8,x17,ror#34 // Sigma0(a) + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x25,x25,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x6,x6 // 3 +#endif + ldp x7,x8,[x1],#2*8 + add x25,x25,x17 // h+=Sigma0(a) + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + eor x9,x21,x21,ror#23 + and x17,x22,x21 + bic x28,x23,x21 + add x24,x24,x6 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x9,ror#18 // Sigma1(e) + ror x9,x25,#28 + add x24,x24,x17 // h+=Ch(e,f,g) + eor x17,x25,x25,ror#5 + add x24,x24,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x20,x20,x24 // d+=h + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x9,x17,ror#34 // Sigma0(a) + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x24,x24,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x7,x7 // 4 +#endif + add x24,x24,x17 // h+=Sigma0(a) + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + eor x10,x20,x20,ror#23 + and x17,x21,x20 + bic x19,x22,x20 + add x23,x23,x7 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x10,ror#18 // Sigma1(e) + ror x10,x24,#28 + add x23,x23,x17 // h+=Ch(e,f,g) + eor x17,x24,x24,ror#5 + add x23,x23,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x27,x27,x23 // d+=h + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x10,x17,ror#34 // Sigma0(a) + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x23,x23,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x8,x8 // 5 +#endif + ldp x9,x10,[x1],#2*8 + add 
x23,x23,x17 // h+=Sigma0(a) + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + eor x11,x27,x27,ror#23 + and x17,x20,x27 + bic x28,x21,x27 + add x22,x22,x8 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x11,ror#18 // Sigma1(e) + ror x11,x23,#28 + add x22,x22,x17 // h+=Ch(e,f,g) + eor x17,x23,x23,ror#5 + add x22,x22,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x26,x26,x22 // d+=h + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x11,x17,ror#34 // Sigma0(a) + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x22,x22,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x9,x9 // 6 +#endif + add x22,x22,x17 // h+=Sigma0(a) + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + eor x12,x26,x26,ror#23 + and x17,x27,x26 + bic x19,x20,x26 + add x21,x21,x9 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x12,ror#18 // Sigma1(e) + ror x12,x22,#28 + add x21,x21,x17 // h+=Ch(e,f,g) + eor x17,x22,x22,ror#5 + add x21,x21,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x25,x25,x21 // d+=h + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x12,x17,ror#34 // Sigma0(a) + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x21,x21,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x10,x10 // 7 +#endif + ldp x11,x12,[x1],#2*8 + add x21,x21,x17 // h+=Sigma0(a) + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + eor x13,x25,x25,ror#23 + and x17,x26,x25 + bic x28,x27,x25 + add x20,x20,x10 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x13,ror#18 // Sigma1(e) + ror x13,x21,#28 + add x20,x20,x17 // h+=Ch(e,f,g) + eor x17,x21,x21,ror#5 + add x20,x20,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x24,x24,x20 // d+=h + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x13,x17,ror#34 // Sigma0(a) + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next 
round + //add x20,x20,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x11,x11 // 8 +#endif + add x20,x20,x17 // h+=Sigma0(a) + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + eor x14,x24,x24,ror#23 + and x17,x25,x24 + bic x19,x26,x24 + add x27,x27,x11 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x14,ror#18 // Sigma1(e) + ror x14,x20,#28 + add x27,x27,x17 // h+=Ch(e,f,g) + eor x17,x20,x20,ror#5 + add x27,x27,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x23,x23,x27 // d+=h + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x14,x17,ror#34 // Sigma0(a) + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x27,x27,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x12,x12 // 9 +#endif + ldp x13,x14,[x1],#2*8 + add x27,x27,x17 // h+=Sigma0(a) + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + eor x15,x23,x23,ror#23 + and x17,x24,x23 + bic x28,x25,x23 + add x26,x26,x12 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x15,ror#18 // Sigma1(e) + ror x15,x27,#28 + add x26,x26,x17 // h+=Ch(e,f,g) + eor x17,x27,x27,ror#5 + add x26,x26,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x22,x22,x26 // d+=h + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x15,x17,ror#34 // Sigma0(a) + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x26,x26,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x13,x13 // 10 +#endif + add x26,x26,x17 // h+=Sigma0(a) + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + eor x0,x22,x22,ror#23 + and x17,x23,x22 + bic x19,x24,x22 + add x25,x25,x13 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x0,ror#18 // Sigma1(e) + ror x0,x26,#28 + add x25,x25,x17 // h+=Ch(e,f,g) + eor x17,x26,x26,ror#5 + add x25,x25,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x21,x21,x25 // d+=h + eor x28,x28,x27 // Maj(a,b,c) + eor 
x17,x0,x17,ror#34 // Sigma0(a) + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x25,x25,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x14,x14 // 11 +#endif + ldp x15,x0,[x1],#2*8 + add x25,x25,x17 // h+=Sigma0(a) + str x6,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + eor x6,x21,x21,ror#23 + and x17,x22,x21 + bic x28,x23,x21 + add x24,x24,x14 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x6,ror#18 // Sigma1(e) + ror x6,x25,#28 + add x24,x24,x17 // h+=Ch(e,f,g) + eor x17,x25,x25,ror#5 + add x24,x24,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x20,x20,x24 // d+=h + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x6,x17,ror#34 // Sigma0(a) + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x24,x24,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x15,x15 // 12 +#endif + add x24,x24,x17 // h+=Sigma0(a) + str x7,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + eor x7,x20,x20,ror#23 + and x17,x21,x20 + bic x19,x22,x20 + add x23,x23,x15 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x7,ror#18 // Sigma1(e) + ror x7,x24,#28 + add x23,x23,x17 // h+=Ch(e,f,g) + eor x17,x24,x24,ror#5 + add x23,x23,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x27,x27,x23 // d+=h + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x7,x17,ror#34 // Sigma0(a) + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x23,x23,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x0,x0 // 13 +#endif + ldp x1,x2,[x1] + add x23,x23,x17 // h+=Sigma0(a) + str x8,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + eor x8,x27,x27,ror#23 + and x17,x20,x27 + bic x28,x21,x27 + add x22,x22,x0 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x8,ror#18 // Sigma1(e) + ror x8,x23,#28 + add x22,x22,x17 // h+=Ch(e,f,g) + 
eor x17,x23,x23,ror#5 + add x22,x22,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x26,x26,x22 // d+=h + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x8,x17,ror#34 // Sigma0(a) + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x22,x22,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x1,x1 // 14 +#endif + ldr x6,[sp,#24] + add x22,x22,x17 // h+=Sigma0(a) + str x9,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + eor x9,x26,x26,ror#23 + and x17,x27,x26 + bic x19,x20,x26 + add x21,x21,x1 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x9,ror#18 // Sigma1(e) + ror x9,x22,#28 + add x21,x21,x17 // h+=Ch(e,f,g) + eor x17,x22,x22,ror#5 + add x21,x21,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x25,x25,x21 // d+=h + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x9,x17,ror#34 // Sigma0(a) + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x21,x21,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x2,x2 // 15 +#endif + ldr x7,[sp,#0] + add x21,x21,x17 // h+=Sigma0(a) + str x10,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x9,x4,#1 + and x17,x26,x25 + ror x8,x1,#19 + bic x28,x27,x25 + ror x10,x21,#28 + add x20,x20,x2 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x9,x9,x4,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x10,x10,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x8,x8,x1,ror#61 + eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x10,x21,ror#39 // Sigma0(a) + eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) + add x3,x3,x12 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x3,x3,x9 + add x20,x20,x17 // h+=Sigma0(a) + add x3,x3,x8 +.Loop_16_xx: + ldr x8,[sp,#8] + str x11,[sp,#0] + 
ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + ror x10,x5,#1 + and x17,x25,x24 + ror x9,x2,#19 + bic x19,x26,x24 + ror x11,x20,#28 + add x27,x27,x3 // h+=X[i] + eor x16,x16,x24,ror#18 + eor x10,x10,x5,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x24,ror#41 // Sigma1(e) + eor x11,x11,x20,ror#34 + add x27,x27,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x9,x9,x2,ror#61 + eor x10,x10,x5,lsr#7 // sigma0(X[i+1]) + add x27,x27,x16 // h+=Sigma1(e) + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x11,x20,ror#39 // Sigma0(a) + eor x9,x9,x2,lsr#6 // sigma1(X[i+14]) + add x4,x4,x13 + add x23,x23,x27 // d+=h + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x4,x4,x10 + add x27,x27,x17 // h+=Sigma0(a) + add x4,x4,x9 + ldr x9,[sp,#16] + str x12,[sp,#8] + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + ror x11,x6,#1 + and x17,x24,x23 + ror x10,x3,#19 + bic x28,x25,x23 + ror x12,x27,#28 + add x26,x26,x4 // h+=X[i] + eor x16,x16,x23,ror#18 + eor x11,x11,x6,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x23,ror#41 // Sigma1(e) + eor x12,x12,x27,ror#34 + add x26,x26,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x10,x10,x3,ror#61 + eor x11,x11,x6,lsr#7 // sigma0(X[i+1]) + add x26,x26,x16 // h+=Sigma1(e) + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x12,x27,ror#39 // Sigma0(a) + eor x10,x10,x3,lsr#6 // sigma1(X[i+14]) + add x5,x5,x14 + add x22,x22,x26 // d+=h + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x5,x5,x11 + add x26,x26,x17 // h+=Sigma0(a) + add x5,x5,x10 + ldr x10,[sp,#24] + str x13,[sp,#16] + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + ror x12,x7,#1 + and x17,x23,x22 + ror x11,x4,#19 + bic x19,x24,x22 + ror x13,x26,#28 + add x25,x25,x5 // h+=X[i] + eor x16,x16,x22,ror#18 + eor x12,x12,x7,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor 
x16,x16,x22,ror#41 // Sigma1(e) + eor x13,x13,x26,ror#34 + add x25,x25,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x11,x11,x4,ror#61 + eor x12,x12,x7,lsr#7 // sigma0(X[i+1]) + add x25,x25,x16 // h+=Sigma1(e) + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x13,x26,ror#39 // Sigma0(a) + eor x11,x11,x4,lsr#6 // sigma1(X[i+14]) + add x6,x6,x15 + add x21,x21,x25 // d+=h + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x6,x6,x12 + add x25,x25,x17 // h+=Sigma0(a) + add x6,x6,x11 + ldr x11,[sp,#0] + str x14,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + ror x13,x8,#1 + and x17,x22,x21 + ror x12,x5,#19 + bic x28,x23,x21 + ror x14,x25,#28 + add x24,x24,x6 // h+=X[i] + eor x16,x16,x21,ror#18 + eor x13,x13,x8,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x21,ror#41 // Sigma1(e) + eor x14,x14,x25,ror#34 + add x24,x24,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x12,x12,x5,ror#61 + eor x13,x13,x8,lsr#7 // sigma0(X[i+1]) + add x24,x24,x16 // h+=Sigma1(e) + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x14,x25,ror#39 // Sigma0(a) + eor x12,x12,x5,lsr#6 // sigma1(X[i+14]) + add x7,x7,x0 + add x20,x20,x24 // d+=h + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x7,x7,x13 + add x24,x24,x17 // h+=Sigma0(a) + add x7,x7,x12 + ldr x12,[sp,#8] + str x15,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + ror x14,x9,#1 + and x17,x21,x20 + ror x13,x6,#19 + bic x19,x22,x20 + ror x15,x24,#28 + add x23,x23,x7 // h+=X[i] + eor x16,x16,x20,ror#18 + eor x14,x14,x9,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x20,ror#41 // Sigma1(e) + eor x15,x15,x24,ror#34 + add x23,x23,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x13,x13,x6,ror#61 + eor x14,x14,x9,lsr#7 // sigma0(X[i+1]) + add x23,x23,x16 // h+=Sigma1(e) + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x15,x24,ror#39 // Sigma0(a) + 
eor x13,x13,x6,lsr#6 // sigma1(X[i+14]) + add x8,x8,x1 + add x27,x27,x23 // d+=h + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x8,x8,x14 + add x23,x23,x17 // h+=Sigma0(a) + add x8,x8,x13 + ldr x13,[sp,#16] + str x0,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + ror x15,x10,#1 + and x17,x20,x27 + ror x14,x7,#19 + bic x28,x21,x27 + ror x0,x23,#28 + add x22,x22,x8 // h+=X[i] + eor x16,x16,x27,ror#18 + eor x15,x15,x10,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x27,ror#41 // Sigma1(e) + eor x0,x0,x23,ror#34 + add x22,x22,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x14,x14,x7,ror#61 + eor x15,x15,x10,lsr#7 // sigma0(X[i+1]) + add x22,x22,x16 // h+=Sigma1(e) + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x0,x23,ror#39 // Sigma0(a) + eor x14,x14,x7,lsr#6 // sigma1(X[i+14]) + add x9,x9,x2 + add x26,x26,x22 // d+=h + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x9,x9,x15 + add x22,x22,x17 // h+=Sigma0(a) + add x9,x9,x14 + ldr x14,[sp,#24] + str x1,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + ror x0,x11,#1 + and x17,x27,x26 + ror x15,x8,#19 + bic x19,x20,x26 + ror x1,x22,#28 + add x21,x21,x9 // h+=X[i] + eor x16,x16,x26,ror#18 + eor x0,x0,x11,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x26,ror#41 // Sigma1(e) + eor x1,x1,x22,ror#34 + add x21,x21,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x15,x15,x8,ror#61 + eor x0,x0,x11,lsr#7 // sigma0(X[i+1]) + add x21,x21,x16 // h+=Sigma1(e) + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x1,x22,ror#39 // Sigma0(a) + eor x15,x15,x8,lsr#6 // sigma1(X[i+14]) + add x10,x10,x3 + add x25,x25,x21 // d+=h + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x10,x10,x0 + add x21,x21,x17 // h+=Sigma0(a) + add x10,x10,x15 + ldr x15,[sp,#0] + str x2,[sp,#24] + ror x16,x25,#14 + add 
x20,x20,x28 // h+=K[i] + ror x1,x12,#1 + and x17,x26,x25 + ror x0,x9,#19 + bic x28,x27,x25 + ror x2,x21,#28 + add x20,x20,x10 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x1,x1,x12,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x2,x2,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x0,x0,x9,ror#61 + eor x1,x1,x12,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x2,x21,ror#39 // Sigma0(a) + eor x0,x0,x9,lsr#6 // sigma1(X[i+14]) + add x11,x11,x4 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x11,x11,x1 + add x20,x20,x17 // h+=Sigma0(a) + add x11,x11,x0 + ldr x0,[sp,#8] + str x3,[sp,#0] + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + ror x2,x13,#1 + and x17,x25,x24 + ror x1,x10,#19 + bic x19,x26,x24 + ror x3,x20,#28 + add x27,x27,x11 // h+=X[i] + eor x16,x16,x24,ror#18 + eor x2,x2,x13,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x24,ror#41 // Sigma1(e) + eor x3,x3,x20,ror#34 + add x27,x27,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x1,x1,x10,ror#61 + eor x2,x2,x13,lsr#7 // sigma0(X[i+1]) + add x27,x27,x16 // h+=Sigma1(e) + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x3,x20,ror#39 // Sigma0(a) + eor x1,x1,x10,lsr#6 // sigma1(X[i+14]) + add x12,x12,x5 + add x23,x23,x27 // d+=h + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x12,x12,x2 + add x27,x27,x17 // h+=Sigma0(a) + add x12,x12,x1 + ldr x1,[sp,#16] + str x4,[sp,#8] + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + ror x3,x14,#1 + and x17,x24,x23 + ror x2,x11,#19 + bic x28,x25,x23 + ror x4,x27,#28 + add x26,x26,x12 // h+=X[i] + eor x16,x16,x23,ror#18 + eor x3,x3,x14,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x23,ror#41 // Sigma1(e) + eor 
x4,x4,x27,ror#34 + add x26,x26,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x2,x2,x11,ror#61 + eor x3,x3,x14,lsr#7 // sigma0(X[i+1]) + add x26,x26,x16 // h+=Sigma1(e) + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x4,x27,ror#39 // Sigma0(a) + eor x2,x2,x11,lsr#6 // sigma1(X[i+14]) + add x13,x13,x6 + add x22,x22,x26 // d+=h + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x13,x13,x3 + add x26,x26,x17 // h+=Sigma0(a) + add x13,x13,x2 + ldr x2,[sp,#24] + str x5,[sp,#16] + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + ror x4,x15,#1 + and x17,x23,x22 + ror x3,x12,#19 + bic x19,x24,x22 + ror x5,x26,#28 + add x25,x25,x13 // h+=X[i] + eor x16,x16,x22,ror#18 + eor x4,x4,x15,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x22,ror#41 // Sigma1(e) + eor x5,x5,x26,ror#34 + add x25,x25,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x3,x3,x12,ror#61 + eor x4,x4,x15,lsr#7 // sigma0(X[i+1]) + add x25,x25,x16 // h+=Sigma1(e) + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x5,x26,ror#39 // Sigma0(a) + eor x3,x3,x12,lsr#6 // sigma1(X[i+14]) + add x14,x14,x7 + add x21,x21,x25 // d+=h + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x14,x14,x4 + add x25,x25,x17 // h+=Sigma0(a) + add x14,x14,x3 + ldr x3,[sp,#0] + str x6,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + ror x5,x0,#1 + and x17,x22,x21 + ror x4,x13,#19 + bic x28,x23,x21 + ror x6,x25,#28 + add x24,x24,x14 // h+=X[i] + eor x16,x16,x21,ror#18 + eor x5,x5,x0,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x21,ror#41 // Sigma1(e) + eor x6,x6,x25,ror#34 + add x24,x24,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x4,x4,x13,ror#61 + eor x5,x5,x0,lsr#7 // sigma0(X[i+1]) + add x24,x24,x16 // h+=Sigma1(e) + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x6,x25,ror#39 // Sigma0(a) + eor x4,x4,x13,lsr#6 // sigma1(X[i+14]) + add 
x15,x15,x8 + add x20,x20,x24 // d+=h + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x15,x15,x5 + add x24,x24,x17 // h+=Sigma0(a) + add x15,x15,x4 + ldr x4,[sp,#8] + str x7,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + ror x6,x1,#1 + and x17,x21,x20 + ror x5,x14,#19 + bic x19,x22,x20 + ror x7,x24,#28 + add x23,x23,x15 // h+=X[i] + eor x16,x16,x20,ror#18 + eor x6,x6,x1,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x20,ror#41 // Sigma1(e) + eor x7,x7,x24,ror#34 + add x23,x23,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x5,x5,x14,ror#61 + eor x6,x6,x1,lsr#7 // sigma0(X[i+1]) + add x23,x23,x16 // h+=Sigma1(e) + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x7,x24,ror#39 // Sigma0(a) + eor x5,x5,x14,lsr#6 // sigma1(X[i+14]) + add x0,x0,x9 + add x27,x27,x23 // d+=h + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x0,x0,x6 + add x23,x23,x17 // h+=Sigma0(a) + add x0,x0,x5 + ldr x5,[sp,#16] + str x8,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + ror x7,x2,#1 + and x17,x20,x27 + ror x6,x15,#19 + bic x28,x21,x27 + ror x8,x23,#28 + add x22,x22,x0 // h+=X[i] + eor x16,x16,x27,ror#18 + eor x7,x7,x2,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x27,ror#41 // Sigma1(e) + eor x8,x8,x23,ror#34 + add x22,x22,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x6,x6,x15,ror#61 + eor x7,x7,x2,lsr#7 // sigma0(X[i+1]) + add x22,x22,x16 // h+=Sigma1(e) + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x8,x23,ror#39 // Sigma0(a) + eor x6,x6,x15,lsr#6 // sigma1(X[i+14]) + add x1,x1,x10 + add x26,x26,x22 // d+=h + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x1,x1,x7 + add x22,x22,x17 // h+=Sigma0(a) + add x1,x1,x6 + ldr x6,[sp,#24] + str x9,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + ror x8,x3,#1 + and x17,x27,x26 + ror x7,x0,#19 + bic 
x19,x20,x26 + ror x9,x22,#28 + add x21,x21,x1 // h+=X[i] + eor x16,x16,x26,ror#18 + eor x8,x8,x3,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x26,ror#41 // Sigma1(e) + eor x9,x9,x22,ror#34 + add x21,x21,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x7,x7,x0,ror#61 + eor x8,x8,x3,lsr#7 // sigma0(X[i+1]) + add x21,x21,x16 // h+=Sigma1(e) + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x9,x22,ror#39 // Sigma0(a) + eor x7,x7,x0,lsr#6 // sigma1(X[i+14]) + add x2,x2,x11 + add x25,x25,x21 // d+=h + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x2,x2,x8 + add x21,x21,x17 // h+=Sigma0(a) + add x2,x2,x7 + ldr x7,[sp,#0] + str x10,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x9,x4,#1 + and x17,x26,x25 + ror x8,x1,#19 + bic x28,x27,x25 + ror x10,x21,#28 + add x20,x20,x2 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x9,x9,x4,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x10,x10,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x8,x8,x1,ror#61 + eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x10,x21,ror#39 // Sigma0(a) + eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) + add x3,x3,x12 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x3,x3,x9 + add x20,x20,x17 // h+=Sigma0(a) + add x3,x3,x8 + cbnz x19,.Loop_16_xx + + ldp x0,x2,[x29,#96] + ldr x1,[x29,#112] + sub x30,x30,#648 // rewind + + ldp x3,x4,[x0] + ldp x5,x6,[x0,#2*8] + add x1,x1,#14*8 // advance input pointer + ldp x7,x8,[x0,#4*8] + add x20,x20,x3 + ldp x9,x10,[x0,#6*8] + add x21,x21,x4 + add x22,x22,x5 + add x23,x23,x6 + stp x20,x21,[x0] + add x24,x24,x7 + add x25,x25,x8 + stp x22,x23,[x0,#2*8] + add x26,x26,x9 + add x27,x27,x10 + cmp x1,x2 + stp x24,x25,[x0,#4*8] + stp 
x26,x27,[x0,#6*8] + b.ne .Loop + + ldp x19,x20,[x29,#16] + add sp,sp,#4*8 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size sha512_block_data_order_nohw,.-sha512_block_data_order_nohw + +.section .rodata +.align 6 +.type .LK512,%object +.LK512: +.quad 0x428a2f98d728ae22,0x7137449123ef65cd +.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc +.quad 0x3956c25bf348b538,0x59f111f1b605d019 +.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 +.quad 0xd807aa98a3030242,0x12835b0145706fbe +.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 +.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 +.quad 0x9bdc06a725c71235,0xc19bf174cf692694 +.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 +.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 +.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 +.quad 0x983e5152ee66dfab,0xa831c66d2db43210 +.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 +.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 +.quad 0x06ca6351e003826f,0x142929670a0e6e70 +.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 +.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df +.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 +.quad 0x81c2c92e47edaee6,0x92722c851482353b +.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 +.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 +.quad 0xd192e819d6ef5218,0xd69906245565a910 +.quad 0xf40e35855771202a,0x106aa07032bbd1b8 +.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 +.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb +.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 +.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 +.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec +.quad 0x90befffa23631e28,0xa4506cebde82bde9 +.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b +.quad 0xca273eceea26619c,0xd186b8c721c0c207 +.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 +.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 +.quad 0x113f9804bef90dae,0x1b710b35131c471b +.quad 
0x28db77f523047d84,0x32caab7b40c72493 +.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c +.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a +.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 +.quad 0 // terminator +.size .LK512,.-.LK512 +.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +.text +#ifndef __KERNEL__ +.globl sha512_block_data_order_hw +.hidden sha512_block_data_order_hw +.type sha512_block_data_order_hw,%function +.align 6 +sha512_block_data_order_hw: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 // load input + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + + ld1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // load context + adrp x3,.LK512 + add x3,x3,:lo12:.LK512 + + rev64 v16.16b,v16.16b + rev64 v17.16b,v17.16b + rev64 v18.16b,v18.16b + rev64 v19.16b,v19.16b + rev64 v20.16b,v20.16b + rev64 v21.16b,v21.16b + rev64 v22.16b,v22.16b + rev64 v23.16b,v23.16b + b .Loop_hw + +.align 4 +.Loop_hw: + ld1 {v24.2d},[x3],#16 + subs x2,x2,#1 + sub x4,x1,#128 + orr v26.16b,v0.16b,v0.16b // offload + orr v27.16b,v1.16b,v1.16b + orr v28.16b,v2.16b,v2.16b + orr v29.16b,v3.16b,v3.16b + csel x1,x1,x4,ne // conditional rewind + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec08230 //sha512su0 v16.2d,v17.2d + ext v7.16b,v20.16b,v21.16b,#8 +.inst 0xce6680a3 //sha512h q3,q5,v6.2d +.inst 0xce678af0 //sha512su1 v16.2d,v23.2d,v7.2d + add v4.2d,v1.2d,v3.2d // "D + T1" +.inst 0xce608423 //sha512h2 q3,q1,v0.2d + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext 
v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08251 //sha512su0 v17.2d,v18.2d + ext v7.16b,v21.16b,v22.16b,#8 +.inst 0xce6680a2 //sha512h q2,q5,v6.2d +.inst 0xce678a11 //sha512su1 v17.2d,v16.2d,v7.2d + add v1.2d,v0.2d,v2.2d // "D + T1" +.inst 0xce638402 //sha512h2 q2,q0,v3.2d + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec08272 //sha512su0 v18.2d,v19.2d + ext v7.16b,v22.16b,v23.16b,#8 +.inst 0xce6680a4 //sha512h q4,q5,v6.2d +.inst 0xce678a32 //sha512su1 v18.2d,v17.2d,v7.2d + add v0.2d,v3.2d,v4.2d // "D + T1" +.inst 0xce628464 //sha512h2 q4,q3,v2.2d + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08293 //sha512su0 v19.2d,v20.2d + ext v7.16b,v23.16b,v16.16b,#8 +.inst 0xce6680a1 //sha512h q1,q5,v6.2d +.inst 0xce678a53 //sha512su1 v19.2d,v18.2d,v7.2d + add v3.2d,v2.2d,v1.2d // "D + T1" +.inst 0xce648441 //sha512h2 q1,q2,v4.2d + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec082b4 //sha512su0 v20.2d,v21.2d + ext v7.16b,v16.16b,v17.16b,#8 +.inst 0xce6680a0 //sha512h q0,q5,v6.2d +.inst 0xce678a74 //sha512su1 v20.2d,v19.2d,v7.2d + add v2.2d,v4.2d,v0.2d // "D + T1" +.inst 0xce618480 //sha512h2 q0,q4,v1.2d + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec082d5 //sha512su0 v21.2d,v22.2d + ext v7.16b,v17.16b,v18.16b,#8 +.inst 0xce6680a3 //sha512h q3,q5,v6.2d +.inst 0xce678a95 //sha512su1 
v21.2d,v20.2d,v7.2d + add v4.2d,v1.2d,v3.2d // "D + T1" +.inst 0xce608423 //sha512h2 q3,q1,v0.2d + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec082f6 //sha512su0 v22.2d,v23.2d + ext v7.16b,v18.16b,v19.16b,#8 +.inst 0xce6680a2 //sha512h q2,q5,v6.2d +.inst 0xce678ab6 //sha512su1 v22.2d,v21.2d,v7.2d + add v1.2d,v0.2d,v2.2d // "D + T1" +.inst 0xce638402 //sha512h2 q2,q0,v3.2d + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08217 //sha512su0 v23.2d,v16.2d + ext v7.16b,v19.16b,v20.16b,#8 +.inst 0xce6680a4 //sha512h q4,q5,v6.2d +.inst 0xce678ad7 //sha512su1 v23.2d,v22.2d,v7.2d + add v0.2d,v3.2d,v4.2d // "D + T1" +.inst 0xce628464 //sha512h2 q4,q3,v2.2d + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec08230 //sha512su0 v16.2d,v17.2d + ext v7.16b,v20.16b,v21.16b,#8 +.inst 0xce6680a1 //sha512h q1,q5,v6.2d +.inst 0xce678af0 //sha512su1 v16.2d,v23.2d,v7.2d + add v3.2d,v2.2d,v1.2d // "D + T1" +.inst 0xce648441 //sha512h2 q1,q2,v4.2d + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08251 //sha512su0 v17.2d,v18.2d + ext v7.16b,v21.16b,v22.16b,#8 +.inst 0xce6680a0 //sha512h q0,q5,v6.2d +.inst 0xce678a11 //sha512su1 v17.2d,v16.2d,v7.2d + add v2.2d,v4.2d,v0.2d // "D + T1" +.inst 0xce618480 //sha512h2 q0,q4,v1.2d + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add 
v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec08272 //sha512su0 v18.2d,v19.2d + ext v7.16b,v22.16b,v23.16b,#8 +.inst 0xce6680a3 //sha512h q3,q5,v6.2d +.inst 0xce678a32 //sha512su1 v18.2d,v17.2d,v7.2d + add v4.2d,v1.2d,v3.2d // "D + T1" +.inst 0xce608423 //sha512h2 q3,q1,v0.2d + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08293 //sha512su0 v19.2d,v20.2d + ext v7.16b,v23.16b,v16.16b,#8 +.inst 0xce6680a2 //sha512h q2,q5,v6.2d +.inst 0xce678a53 //sha512su1 v19.2d,v18.2d,v7.2d + add v1.2d,v0.2d,v2.2d // "D + T1" +.inst 0xce638402 //sha512h2 q2,q0,v3.2d + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec082b4 //sha512su0 v20.2d,v21.2d + ext v7.16b,v16.16b,v17.16b,#8 +.inst 0xce6680a4 //sha512h q4,q5,v6.2d +.inst 0xce678a74 //sha512su1 v20.2d,v19.2d,v7.2d + add v0.2d,v3.2d,v4.2d // "D + T1" +.inst 0xce628464 //sha512h2 q4,q3,v2.2d + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec082d5 //sha512su0 v21.2d,v22.2d + ext v7.16b,v17.16b,v18.16b,#8 +.inst 0xce6680a1 //sha512h q1,q5,v6.2d +.inst 0xce678a95 //sha512su1 v21.2d,v20.2d,v7.2d + add v3.2d,v2.2d,v1.2d // "D + T1" +.inst 0xce648441 //sha512h2 q1,q2,v4.2d + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec082f6 //sha512su0 v22.2d,v23.2d + ext v7.16b,v18.16b,v19.16b,#8 +.inst 0xce6680a0 //sha512h q0,q5,v6.2d +.inst 0xce678ab6 //sha512su1 v22.2d,v21.2d,v7.2d + add v2.2d,v4.2d,v0.2d // "D + T1" +.inst 0xce618480 
//sha512h2 q0,q4,v1.2d + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08217 //sha512su0 v23.2d,v16.2d + ext v7.16b,v19.16b,v20.16b,#8 +.inst 0xce6680a3 //sha512h q3,q5,v6.2d +.inst 0xce678ad7 //sha512su1 v23.2d,v22.2d,v7.2d + add v4.2d,v1.2d,v3.2d // "D + T1" +.inst 0xce608423 //sha512h2 q3,q1,v0.2d + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec08230 //sha512su0 v16.2d,v17.2d + ext v7.16b,v20.16b,v21.16b,#8 +.inst 0xce6680a2 //sha512h q2,q5,v6.2d +.inst 0xce678af0 //sha512su1 v16.2d,v23.2d,v7.2d + add v1.2d,v0.2d,v2.2d // "D + T1" +.inst 0xce638402 //sha512h2 q2,q0,v3.2d + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08251 //sha512su0 v17.2d,v18.2d + ext v7.16b,v21.16b,v22.16b,#8 +.inst 0xce6680a4 //sha512h q4,q5,v6.2d +.inst 0xce678a11 //sha512su1 v17.2d,v16.2d,v7.2d + add v0.2d,v3.2d,v4.2d // "D + T1" +.inst 0xce628464 //sha512h2 q4,q3,v2.2d + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec08272 //sha512su0 v18.2d,v19.2d + ext v7.16b,v22.16b,v23.16b,#8 +.inst 0xce6680a1 //sha512h q1,q5,v6.2d +.inst 0xce678a32 //sha512su1 v18.2d,v17.2d,v7.2d + add v3.2d,v2.2d,v1.2d // "D + T1" +.inst 0xce648441 //sha512h2 q1,q2,v4.2d + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08293 //sha512su0 v19.2d,v20.2d + ext 
v7.16b,v23.16b,v16.16b,#8 +.inst 0xce6680a0 //sha512h q0,q5,v6.2d +.inst 0xce678a53 //sha512su1 v19.2d,v18.2d,v7.2d + add v2.2d,v4.2d,v0.2d // "D + T1" +.inst 0xce618480 //sha512h2 q0,q4,v1.2d + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec082b4 //sha512su0 v20.2d,v21.2d + ext v7.16b,v16.16b,v17.16b,#8 +.inst 0xce6680a3 //sha512h q3,q5,v6.2d +.inst 0xce678a74 //sha512su1 v20.2d,v19.2d,v7.2d + add v4.2d,v1.2d,v3.2d // "D + T1" +.inst 0xce608423 //sha512h2 q3,q1,v0.2d + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec082d5 //sha512su0 v21.2d,v22.2d + ext v7.16b,v17.16b,v18.16b,#8 +.inst 0xce6680a2 //sha512h q2,q5,v6.2d +.inst 0xce678a95 //sha512su1 v21.2d,v20.2d,v7.2d + add v1.2d,v0.2d,v2.2d // "D + T1" +.inst 0xce638402 //sha512h2 q2,q0,v3.2d + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec082f6 //sha512su0 v22.2d,v23.2d + ext v7.16b,v18.16b,v19.16b,#8 +.inst 0xce6680a4 //sha512h q4,q5,v6.2d +.inst 0xce678ab6 //sha512su1 v22.2d,v21.2d,v7.2d + add v0.2d,v3.2d,v4.2d // "D + T1" +.inst 0xce628464 //sha512h2 q4,q3,v2.2d + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08217 //sha512su0 v23.2d,v16.2d + ext v7.16b,v19.16b,v20.16b,#8 +.inst 0xce6680a1 //sha512h q1,q5,v6.2d +.inst 0xce678ad7 //sha512su1 v23.2d,v22.2d,v7.2d + add v3.2d,v2.2d,v1.2d // "D + T1" +.inst 0xce648441 //sha512h2 q1,q2,v4.2d + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext 
v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec08230 //sha512su0 v16.2d,v17.2d + ext v7.16b,v20.16b,v21.16b,#8 +.inst 0xce6680a0 //sha512h q0,q5,v6.2d +.inst 0xce678af0 //sha512su1 v16.2d,v23.2d,v7.2d + add v2.2d,v4.2d,v0.2d // "D + T1" +.inst 0xce618480 //sha512h2 q0,q4,v1.2d + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08251 //sha512su0 v17.2d,v18.2d + ext v7.16b,v21.16b,v22.16b,#8 +.inst 0xce6680a3 //sha512h q3,q5,v6.2d +.inst 0xce678a11 //sha512su1 v17.2d,v16.2d,v7.2d + add v4.2d,v1.2d,v3.2d // "D + T1" +.inst 0xce608423 //sha512h2 q3,q1,v0.2d + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec08272 //sha512su0 v18.2d,v19.2d + ext v7.16b,v22.16b,v23.16b,#8 +.inst 0xce6680a2 //sha512h q2,q5,v6.2d +.inst 0xce678a32 //sha512su1 v18.2d,v17.2d,v7.2d + add v1.2d,v0.2d,v2.2d // "D + T1" +.inst 0xce638402 //sha512h2 q2,q0,v3.2d + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08293 //sha512su0 v19.2d,v20.2d + ext v7.16b,v23.16b,v16.16b,#8 +.inst 0xce6680a4 //sha512h q4,q5,v6.2d +.inst 0xce678a53 //sha512su1 v19.2d,v18.2d,v7.2d + add v0.2d,v3.2d,v4.2d // "D + T1" +.inst 0xce628464 //sha512h2 q4,q3,v2.2d + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec082b4 //sha512su0 v20.2d,v21.2d + ext v7.16b,v16.16b,v17.16b,#8 +.inst 0xce6680a1 //sha512h q1,q5,v6.2d +.inst 
0xce678a74 //sha512su1 v20.2d,v19.2d,v7.2d + add v3.2d,v2.2d,v1.2d // "D + T1" +.inst 0xce648441 //sha512h2 q1,q2,v4.2d + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec082d5 //sha512su0 v21.2d,v22.2d + ext v7.16b,v17.16b,v18.16b,#8 +.inst 0xce6680a0 //sha512h q0,q5,v6.2d +.inst 0xce678a95 //sha512su1 v21.2d,v20.2d,v7.2d + add v2.2d,v4.2d,v0.2d // "D + T1" +.inst 0xce618480 //sha512h2 q0,q4,v1.2d + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xcec082f6 //sha512su0 v22.2d,v23.2d + ext v7.16b,v18.16b,v19.16b,#8 +.inst 0xce6680a3 //sha512h q3,q5,v6.2d +.inst 0xce678ab6 //sha512su1 v22.2d,v21.2d,v7.2d + add v4.2d,v1.2d,v3.2d // "D + T1" +.inst 0xce608423 //sha512h2 q3,q1,v0.2d + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xcec08217 //sha512su0 v23.2d,v16.2d + ext v7.16b,v19.16b,v20.16b,#8 +.inst 0xce6680a2 //sha512h q2,q5,v6.2d +.inst 0xce678ad7 //sha512su1 v23.2d,v22.2d,v7.2d + add v1.2d,v0.2d,v2.2d // "D + T1" +.inst 0xce638402 //sha512h2 q2,q0,v3.2d + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v16.2d + ld1 {v16.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xce6680a4 //sha512h q4,q5,v6.2d + rev64 v16.16b,v16.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.inst 0xce628464 //sha512h2 q4,q3,v2.2d + ld1 {v24.2d},[x3],#16 + add v25.2d,v25.2d,v17.2d + ld1 {v17.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add 
v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xce6680a1 //sha512h q1,q5,v6.2d + rev64 v17.16b,v17.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.inst 0xce648441 //sha512h2 q1,q2,v4.2d + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v18.2d + ld1 {v18.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xce6680a0 //sha512h q0,q5,v6.2d + rev64 v18.16b,v18.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.inst 0xce618480 //sha512h2 q0,q4,v1.2d + ld1 {v24.2d},[x3],#16 + add v25.2d,v25.2d,v19.2d + ld1 {v19.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xce6680a3 //sha512h q3,q5,v6.2d + rev64 v19.16b,v19.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.inst 0xce608423 //sha512h2 q3,q1,v0.2d + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v20.2d + ld1 {v20.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xce6680a2 //sha512h q2,q5,v6.2d + rev64 v20.16b,v20.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.inst 0xce638402 //sha512h2 q2,q0,v3.2d + ld1 {v24.2d},[x3],#16 + add v25.2d,v25.2d,v21.2d + ld1 {v21.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xce6680a4 //sha512h q4,q5,v6.2d + rev64 v21.16b,v21.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.inst 0xce628464 //sha512h2 q4,q3,v2.2d + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v22.2d + ld1 {v22.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.inst 0xce6680a1 //sha512h q1,q5,v6.2d + rev64 v22.16b,v22.16b + add 
v3.2d,v2.2d,v1.2d // "D + T1" +.inst 0xce648441 //sha512h2 q1,q2,v4.2d + sub x3,x3,#80*8 // rewind + add v25.2d,v25.2d,v23.2d + ld1 {v23.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.inst 0xce6680a0 //sha512h q0,q5,v6.2d + rev64 v23.16b,v23.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.inst 0xce618480 //sha512h2 q0,q4,v1.2d + add v0.2d,v0.2d,v26.2d // accumulate + add v1.2d,v1.2d,v27.2d + add v2.2d,v2.2d,v28.2d + add v3.2d,v3.2d,v29.2d + + cbnz x2,.Loop_hw + + st1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // store context + + ldr x29,[sp],#16 + ret +.size sha512_block_data_order_hw,.-sha512_block_data_order_hw +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) diff --git a/third_party/boringssl/gen/bcm/sha512-armv8-win.S b/third_party/boringssl/gen/bcm/sha512-armv8-win.S new file mode 100644 index 00000000..0d5ad5b9 --- /dev/null +++ b/third_party/boringssl/gen/bcm/sha512-armv8-win.S @@ -0,0 +1,1599 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// ==================================================================== +// Written by Andy Polyakov for the OpenSSL +// project. +// ==================================================================== +// +// SHA256/512 for ARMv8. +// +// Performance in cycles per processed byte and improvement coefficient +// over code generated with "default" compiler: +// +// SHA256-hw SHA256(*) SHA512 +// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) +// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) +// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) +// Denver 2.01 10.5 (+26%) 6.70 (+8%) +// X-Gene 20.0 (+100%) 12.8 (+300%(***)) +// Mongoose 2.36 13.0 (+50%) 8.36 (+33%) +// Kryo 1.92 17.4 (+30%) 11.2 (+8%) +// +// (*) Software SHA256 results are of lesser relevance, presented +// mostly for informational purposes. +// (**) The result is a trade-off: it's possible to improve it by +// 10% (or by 1 cycle per round), but at the cost of 20% loss +// on Cortex-A53 (or by 4 cycles per round). +// (***) Super-impressive coefficients over gcc-generated code are +// indication of some compiler "pathology", most notably code +// generated with -mgeneral-regs-only is significantly faster +// and the gap is only 40-90%. + +.text + +.globl sha512_block_data_order_nohw + +.def sha512_block_data_order_nohw + .type 32 +.endef +.align 6 +sha512_block_data_order_nohw: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-128]! 
+ add x29,sp,#0 + + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#4*8 + + ldp x20,x21,[x0] // load context + ldp x22,x23,[x0,#2*8] + ldp x24,x25,[x0,#4*8] + add x2,x1,x2,lsl#7 // end of input + ldp x26,x27,[x0,#6*8] + adrp x30,LK512 + add x30,x30,:lo12:LK512 + stp x0,x2,[x29,#96] + +Loop: + ldp x3,x4,[x1],#2*8 + ldr x19,[x30],#8 // *K++ + eor x28,x21,x22 // magic seed + str x1,[x29,#112] +#ifndef __AARCH64EB__ + rev x3,x3 // 0 +#endif + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + eor x6,x24,x24,ror#23 + and x17,x25,x24 + bic x19,x26,x24 + add x27,x27,x3 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x6,ror#18 // Sigma1(e) + ror x6,x20,#28 + add x27,x27,x17 // h+=Ch(e,f,g) + eor x17,x20,x20,ror#5 + add x27,x27,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x23,x23,x27 // d+=h + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x6,x17,ror#34 // Sigma0(a) + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x27,x27,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x4,x4 // 1 +#endif + ldp x5,x6,[x1],#2*8 + add x27,x27,x17 // h+=Sigma0(a) + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + eor x7,x23,x23,ror#23 + and x17,x24,x23 + bic x28,x25,x23 + add x26,x26,x4 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x7,ror#18 // Sigma1(e) + ror x7,x27,#28 + add x26,x26,x17 // h+=Ch(e,f,g) + eor x17,x27,x27,ror#5 + add x26,x26,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x22,x22,x26 // d+=h + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x7,x17,ror#34 // Sigma0(a) + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x26,x26,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x5,x5 // 2 +#endif + add x26,x26,x17 // h+=Sigma0(a) + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + eor x8,x22,x22,ror#23 + and x17,x23,x22 + 
bic x19,x24,x22 + add x25,x25,x5 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x8,ror#18 // Sigma1(e) + ror x8,x26,#28 + add x25,x25,x17 // h+=Ch(e,f,g) + eor x17,x26,x26,ror#5 + add x25,x25,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x21,x21,x25 // d+=h + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x8,x17,ror#34 // Sigma0(a) + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x25,x25,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x6,x6 // 3 +#endif + ldp x7,x8,[x1],#2*8 + add x25,x25,x17 // h+=Sigma0(a) + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + eor x9,x21,x21,ror#23 + and x17,x22,x21 + bic x28,x23,x21 + add x24,x24,x6 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x9,ror#18 // Sigma1(e) + ror x9,x25,#28 + add x24,x24,x17 // h+=Ch(e,f,g) + eor x17,x25,x25,ror#5 + add x24,x24,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x20,x20,x24 // d+=h + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x9,x17,ror#34 // Sigma0(a) + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x24,x24,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x7,x7 // 4 +#endif + add x24,x24,x17 // h+=Sigma0(a) + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + eor x10,x20,x20,ror#23 + and x17,x21,x20 + bic x19,x22,x20 + add x23,x23,x7 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x10,ror#18 // Sigma1(e) + ror x10,x24,#28 + add x23,x23,x17 // h+=Ch(e,f,g) + eor x17,x24,x24,ror#5 + add x23,x23,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x27,x27,x23 // d+=h + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x10,x17,ror#34 // Sigma0(a) + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x23,x23,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x8,x8 // 5 +#endif + ldp x9,x10,[x1],#2*8 + add x23,x23,x17 // 
h+=Sigma0(a) + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + eor x11,x27,x27,ror#23 + and x17,x20,x27 + bic x28,x21,x27 + add x22,x22,x8 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x11,ror#18 // Sigma1(e) + ror x11,x23,#28 + add x22,x22,x17 // h+=Ch(e,f,g) + eor x17,x23,x23,ror#5 + add x22,x22,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x26,x26,x22 // d+=h + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x11,x17,ror#34 // Sigma0(a) + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x22,x22,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x9,x9 // 6 +#endif + add x22,x22,x17 // h+=Sigma0(a) + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + eor x12,x26,x26,ror#23 + and x17,x27,x26 + bic x19,x20,x26 + add x21,x21,x9 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x12,ror#18 // Sigma1(e) + ror x12,x22,#28 + add x21,x21,x17 // h+=Ch(e,f,g) + eor x17,x22,x22,ror#5 + add x21,x21,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x25,x25,x21 // d+=h + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x12,x17,ror#34 // Sigma0(a) + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x21,x21,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x10,x10 // 7 +#endif + ldp x11,x12,[x1],#2*8 + add x21,x21,x17 // h+=Sigma0(a) + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + eor x13,x25,x25,ror#23 + and x17,x26,x25 + bic x28,x27,x25 + add x20,x20,x10 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x13,ror#18 // Sigma1(e) + ror x13,x21,#28 + add x20,x20,x17 // h+=Ch(e,f,g) + eor x17,x21,x21,ror#5 + add x20,x20,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x24,x24,x20 // d+=h + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x13,x17,ror#34 // Sigma0(a) + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add 
x20,x20,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x11,x11 // 8 +#endif + add x20,x20,x17 // h+=Sigma0(a) + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + eor x14,x24,x24,ror#23 + and x17,x25,x24 + bic x19,x26,x24 + add x27,x27,x11 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x14,ror#18 // Sigma1(e) + ror x14,x20,#28 + add x27,x27,x17 // h+=Ch(e,f,g) + eor x17,x20,x20,ror#5 + add x27,x27,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x23,x23,x27 // d+=h + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x14,x17,ror#34 // Sigma0(a) + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x27,x27,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x12,x12 // 9 +#endif + ldp x13,x14,[x1],#2*8 + add x27,x27,x17 // h+=Sigma0(a) + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + eor x15,x23,x23,ror#23 + and x17,x24,x23 + bic x28,x25,x23 + add x26,x26,x12 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x15,ror#18 // Sigma1(e) + ror x15,x27,#28 + add x26,x26,x17 // h+=Ch(e,f,g) + eor x17,x27,x27,ror#5 + add x26,x26,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x22,x22,x26 // d+=h + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x15,x17,ror#34 // Sigma0(a) + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x26,x26,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x13,x13 // 10 +#endif + add x26,x26,x17 // h+=Sigma0(a) + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + eor x0,x22,x22,ror#23 + and x17,x23,x22 + bic x19,x24,x22 + add x25,x25,x13 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x0,ror#18 // Sigma1(e) + ror x0,x26,#28 + add x25,x25,x17 // h+=Ch(e,f,g) + eor x17,x26,x26,ror#5 + add x25,x25,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x21,x21,x25 // d+=h + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x0,x17,ror#34 // 
Sigma0(a) + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x25,x25,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x14,x14 // 11 +#endif + ldp x15,x0,[x1],#2*8 + add x25,x25,x17 // h+=Sigma0(a) + str x6,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + eor x6,x21,x21,ror#23 + and x17,x22,x21 + bic x28,x23,x21 + add x24,x24,x14 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x6,ror#18 // Sigma1(e) + ror x6,x25,#28 + add x24,x24,x17 // h+=Ch(e,f,g) + eor x17,x25,x25,ror#5 + add x24,x24,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x20,x20,x24 // d+=h + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x6,x17,ror#34 // Sigma0(a) + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x24,x24,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x15,x15 // 12 +#endif + add x24,x24,x17 // h+=Sigma0(a) + str x7,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + eor x7,x20,x20,ror#23 + and x17,x21,x20 + bic x19,x22,x20 + add x23,x23,x15 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x7,ror#18 // Sigma1(e) + ror x7,x24,#28 + add x23,x23,x17 // h+=Ch(e,f,g) + eor x17,x24,x24,ror#5 + add x23,x23,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x27,x27,x23 // d+=h + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x7,x17,ror#34 // Sigma0(a) + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x23,x23,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x0,x0 // 13 +#endif + ldp x1,x2,[x1] + add x23,x23,x17 // h+=Sigma0(a) + str x8,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + eor x8,x27,x27,ror#23 + and x17,x20,x27 + bic x28,x21,x27 + add x22,x22,x0 // h+=X[i] + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x8,ror#18 // Sigma1(e) + ror x8,x23,#28 + add x22,x22,x17 // h+=Ch(e,f,g) + eor x17,x23,x23,ror#5 
+ add x22,x22,x16 // h+=Sigma1(e) + and x19,x19,x28 // (b^c)&=(a^b) + add x26,x26,x22 // d+=h + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x8,x17,ror#34 // Sigma0(a) + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + //add x22,x22,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x1,x1 // 14 +#endif + ldr x6,[sp,#24] + add x22,x22,x17 // h+=Sigma0(a) + str x9,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + eor x9,x26,x26,ror#23 + and x17,x27,x26 + bic x19,x20,x26 + add x21,x21,x1 // h+=X[i] + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x9,ror#18 // Sigma1(e) + ror x9,x22,#28 + add x21,x21,x17 // h+=Ch(e,f,g) + eor x17,x22,x22,ror#5 + add x21,x21,x16 // h+=Sigma1(e) + and x28,x28,x19 // (b^c)&=(a^b) + add x25,x25,x21 // d+=h + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x9,x17,ror#34 // Sigma0(a) + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + //add x21,x21,x17 // h+=Sigma0(a) +#ifndef __AARCH64EB__ + rev x2,x2 // 15 +#endif + ldr x7,[sp,#0] + add x21,x21,x17 // h+=Sigma0(a) + str x10,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x9,x4,#1 + and x17,x26,x25 + ror x8,x1,#19 + bic x28,x27,x25 + ror x10,x21,#28 + add x20,x20,x2 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x9,x9,x4,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x10,x10,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x8,x8,x1,ror#61 + eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x10,x21,ror#39 // Sigma0(a) + eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) + add x3,x3,x12 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x3,x3,x9 + add x20,x20,x17 // h+=Sigma0(a) + add x3,x3,x8 +Loop_16_xx: + ldr x8,[sp,#8] + str x11,[sp,#0] + ror x16,x24,#14 + add 
x27,x27,x19 // h+=K[i] + ror x10,x5,#1 + and x17,x25,x24 + ror x9,x2,#19 + bic x19,x26,x24 + ror x11,x20,#28 + add x27,x27,x3 // h+=X[i] + eor x16,x16,x24,ror#18 + eor x10,x10,x5,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x24,ror#41 // Sigma1(e) + eor x11,x11,x20,ror#34 + add x27,x27,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x9,x9,x2,ror#61 + eor x10,x10,x5,lsr#7 // sigma0(X[i+1]) + add x27,x27,x16 // h+=Sigma1(e) + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x11,x20,ror#39 // Sigma0(a) + eor x9,x9,x2,lsr#6 // sigma1(X[i+14]) + add x4,x4,x13 + add x23,x23,x27 // d+=h + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x4,x4,x10 + add x27,x27,x17 // h+=Sigma0(a) + add x4,x4,x9 + ldr x9,[sp,#16] + str x12,[sp,#8] + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + ror x11,x6,#1 + and x17,x24,x23 + ror x10,x3,#19 + bic x28,x25,x23 + ror x12,x27,#28 + add x26,x26,x4 // h+=X[i] + eor x16,x16,x23,ror#18 + eor x11,x11,x6,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x23,ror#41 // Sigma1(e) + eor x12,x12,x27,ror#34 + add x26,x26,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x10,x10,x3,ror#61 + eor x11,x11,x6,lsr#7 // sigma0(X[i+1]) + add x26,x26,x16 // h+=Sigma1(e) + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x12,x27,ror#39 // Sigma0(a) + eor x10,x10,x3,lsr#6 // sigma1(X[i+14]) + add x5,x5,x14 + add x22,x22,x26 // d+=h + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x5,x5,x11 + add x26,x26,x17 // h+=Sigma0(a) + add x5,x5,x10 + ldr x10,[sp,#24] + str x13,[sp,#16] + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + ror x12,x7,#1 + and x17,x23,x22 + ror x11,x4,#19 + bic x19,x24,x22 + ror x13,x26,#28 + add x25,x25,x5 // h+=X[i] + eor x16,x16,x22,ror#18 + eor x12,x12,x7,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x22,ror#41 // Sigma1(e) 
+ eor x13,x13,x26,ror#34 + add x25,x25,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x11,x11,x4,ror#61 + eor x12,x12,x7,lsr#7 // sigma0(X[i+1]) + add x25,x25,x16 // h+=Sigma1(e) + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x13,x26,ror#39 // Sigma0(a) + eor x11,x11,x4,lsr#6 // sigma1(X[i+14]) + add x6,x6,x15 + add x21,x21,x25 // d+=h + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x6,x6,x12 + add x25,x25,x17 // h+=Sigma0(a) + add x6,x6,x11 + ldr x11,[sp,#0] + str x14,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + ror x13,x8,#1 + and x17,x22,x21 + ror x12,x5,#19 + bic x28,x23,x21 + ror x14,x25,#28 + add x24,x24,x6 // h+=X[i] + eor x16,x16,x21,ror#18 + eor x13,x13,x8,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x21,ror#41 // Sigma1(e) + eor x14,x14,x25,ror#34 + add x24,x24,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x12,x12,x5,ror#61 + eor x13,x13,x8,lsr#7 // sigma0(X[i+1]) + add x24,x24,x16 // h+=Sigma1(e) + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x14,x25,ror#39 // Sigma0(a) + eor x12,x12,x5,lsr#6 // sigma1(X[i+14]) + add x7,x7,x0 + add x20,x20,x24 // d+=h + add x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x7,x7,x13 + add x24,x24,x17 // h+=Sigma0(a) + add x7,x7,x12 + ldr x12,[sp,#8] + str x15,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + ror x14,x9,#1 + and x17,x21,x20 + ror x13,x6,#19 + bic x19,x22,x20 + ror x15,x24,#28 + add x23,x23,x7 // h+=X[i] + eor x16,x16,x20,ror#18 + eor x14,x14,x9,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x20,ror#41 // Sigma1(e) + eor x15,x15,x24,ror#34 + add x23,x23,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x13,x13,x6,ror#61 + eor x14,x14,x9,lsr#7 // sigma0(X[i+1]) + add x23,x23,x16 // h+=Sigma1(e) + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x15,x24,ror#39 // Sigma0(a) + eor x13,x13,x6,lsr#6 // 
sigma1(X[i+14]) + add x8,x8,x1 + add x27,x27,x23 // d+=h + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x8,x8,x14 + add x23,x23,x17 // h+=Sigma0(a) + add x8,x8,x13 + ldr x13,[sp,#16] + str x0,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + ror x15,x10,#1 + and x17,x20,x27 + ror x14,x7,#19 + bic x28,x21,x27 + ror x0,x23,#28 + add x22,x22,x8 // h+=X[i] + eor x16,x16,x27,ror#18 + eor x15,x15,x10,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x27,ror#41 // Sigma1(e) + eor x0,x0,x23,ror#34 + add x22,x22,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x14,x14,x7,ror#61 + eor x15,x15,x10,lsr#7 // sigma0(X[i+1]) + add x22,x22,x16 // h+=Sigma1(e) + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x0,x23,ror#39 // Sigma0(a) + eor x14,x14,x7,lsr#6 // sigma1(X[i+14]) + add x9,x9,x2 + add x26,x26,x22 // d+=h + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x9,x9,x15 + add x22,x22,x17 // h+=Sigma0(a) + add x9,x9,x14 + ldr x14,[sp,#24] + str x1,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + ror x0,x11,#1 + and x17,x27,x26 + ror x15,x8,#19 + bic x19,x20,x26 + ror x1,x22,#28 + add x21,x21,x9 // h+=X[i] + eor x16,x16,x26,ror#18 + eor x0,x0,x11,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x26,ror#41 // Sigma1(e) + eor x1,x1,x22,ror#34 + add x21,x21,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x15,x15,x8,ror#61 + eor x0,x0,x11,lsr#7 // sigma0(X[i+1]) + add x21,x21,x16 // h+=Sigma1(e) + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x1,x22,ror#39 // Sigma0(a) + eor x15,x15,x8,lsr#6 // sigma1(X[i+14]) + add x10,x10,x3 + add x25,x25,x21 // d+=h + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x10,x10,x0 + add x21,x21,x17 // h+=Sigma0(a) + add x10,x10,x15 + ldr x15,[sp,#0] + str x2,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror 
x1,x12,#1 + and x17,x26,x25 + ror x0,x9,#19 + bic x28,x27,x25 + ror x2,x21,#28 + add x20,x20,x10 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x1,x1,x12,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x2,x2,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x0,x0,x9,ror#61 + eor x1,x1,x12,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x2,x21,ror#39 // Sigma0(a) + eor x0,x0,x9,lsr#6 // sigma1(X[i+14]) + add x11,x11,x4 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x11,x11,x1 + add x20,x20,x17 // h+=Sigma0(a) + add x11,x11,x0 + ldr x0,[sp,#8] + str x3,[sp,#0] + ror x16,x24,#14 + add x27,x27,x19 // h+=K[i] + ror x2,x13,#1 + and x17,x25,x24 + ror x1,x10,#19 + bic x19,x26,x24 + ror x3,x20,#28 + add x27,x27,x11 // h+=X[i] + eor x16,x16,x24,ror#18 + eor x2,x2,x13,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x20,x21 // a^b, b^c in next round + eor x16,x16,x24,ror#41 // Sigma1(e) + eor x3,x3,x20,ror#34 + add x27,x27,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x1,x1,x10,ror#61 + eor x2,x2,x13,lsr#7 // sigma0(X[i+1]) + add x27,x27,x16 // h+=Sigma1(e) + eor x28,x28,x21 // Maj(a,b,c) + eor x17,x3,x20,ror#39 // Sigma0(a) + eor x1,x1,x10,lsr#6 // sigma1(X[i+14]) + add x12,x12,x5 + add x23,x23,x27 // d+=h + add x27,x27,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x12,x12,x2 + add x27,x27,x17 // h+=Sigma0(a) + add x12,x12,x1 + ldr x1,[sp,#16] + str x4,[sp,#8] + ror x16,x23,#14 + add x26,x26,x28 // h+=K[i] + ror x3,x14,#1 + and x17,x24,x23 + ror x2,x11,#19 + bic x28,x25,x23 + ror x4,x27,#28 + add x26,x26,x12 // h+=X[i] + eor x16,x16,x23,ror#18 + eor x3,x3,x14,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x27,x20 // a^b, b^c in next round + eor x16,x16,x23,ror#41 // Sigma1(e) + eor x4,x4,x27,ror#34 + add x26,x26,x17 
// h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x2,x2,x11,ror#61 + eor x3,x3,x14,lsr#7 // sigma0(X[i+1]) + add x26,x26,x16 // h+=Sigma1(e) + eor x19,x19,x20 // Maj(a,b,c) + eor x17,x4,x27,ror#39 // Sigma0(a) + eor x2,x2,x11,lsr#6 // sigma1(X[i+14]) + add x13,x13,x6 + add x22,x22,x26 // d+=h + add x26,x26,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x13,x13,x3 + add x26,x26,x17 // h+=Sigma0(a) + add x13,x13,x2 + ldr x2,[sp,#24] + str x5,[sp,#16] + ror x16,x22,#14 + add x25,x25,x19 // h+=K[i] + ror x4,x15,#1 + and x17,x23,x22 + ror x3,x12,#19 + bic x19,x24,x22 + ror x5,x26,#28 + add x25,x25,x13 // h+=X[i] + eor x16,x16,x22,ror#18 + eor x4,x4,x15,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x26,x27 // a^b, b^c in next round + eor x16,x16,x22,ror#41 // Sigma1(e) + eor x5,x5,x26,ror#34 + add x25,x25,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x3,x3,x12,ror#61 + eor x4,x4,x15,lsr#7 // sigma0(X[i+1]) + add x25,x25,x16 // h+=Sigma1(e) + eor x28,x28,x27 // Maj(a,b,c) + eor x17,x5,x26,ror#39 // Sigma0(a) + eor x3,x3,x12,lsr#6 // sigma1(X[i+14]) + add x14,x14,x7 + add x21,x21,x25 // d+=h + add x25,x25,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x14,x14,x4 + add x25,x25,x17 // h+=Sigma0(a) + add x14,x14,x3 + ldr x3,[sp,#0] + str x6,[sp,#24] + ror x16,x21,#14 + add x24,x24,x28 // h+=K[i] + ror x5,x0,#1 + and x17,x22,x21 + ror x4,x13,#19 + bic x28,x23,x21 + ror x6,x25,#28 + add x24,x24,x14 // h+=X[i] + eor x16,x16,x21,ror#18 + eor x5,x5,x0,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x25,x26 // a^b, b^c in next round + eor x16,x16,x21,ror#41 // Sigma1(e) + eor x6,x6,x25,ror#34 + add x24,x24,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x4,x4,x13,ror#61 + eor x5,x5,x0,lsr#7 // sigma0(X[i+1]) + add x24,x24,x16 // h+=Sigma1(e) + eor x19,x19,x26 // Maj(a,b,c) + eor x17,x6,x25,ror#39 // Sigma0(a) + eor x4,x4,x13,lsr#6 // sigma1(X[i+14]) + add x15,x15,x8 + add x20,x20,x24 // d+=h + add 
x24,x24,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x15,x15,x5 + add x24,x24,x17 // h+=Sigma0(a) + add x15,x15,x4 + ldr x4,[sp,#8] + str x7,[sp,#0] + ror x16,x20,#14 + add x23,x23,x19 // h+=K[i] + ror x6,x1,#1 + and x17,x21,x20 + ror x5,x14,#19 + bic x19,x22,x20 + ror x7,x24,#28 + add x23,x23,x15 // h+=X[i] + eor x16,x16,x20,ror#18 + eor x6,x6,x1,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x24,x25 // a^b, b^c in next round + eor x16,x16,x20,ror#41 // Sigma1(e) + eor x7,x7,x24,ror#34 + add x23,x23,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x5,x5,x14,ror#61 + eor x6,x6,x1,lsr#7 // sigma0(X[i+1]) + add x23,x23,x16 // h+=Sigma1(e) + eor x28,x28,x25 // Maj(a,b,c) + eor x17,x7,x24,ror#39 // Sigma0(a) + eor x5,x5,x14,lsr#6 // sigma1(X[i+14]) + add x0,x0,x9 + add x27,x27,x23 // d+=h + add x23,x23,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x0,x0,x6 + add x23,x23,x17 // h+=Sigma0(a) + add x0,x0,x5 + ldr x5,[sp,#16] + str x8,[sp,#8] + ror x16,x27,#14 + add x22,x22,x28 // h+=K[i] + ror x7,x2,#1 + and x17,x20,x27 + ror x6,x15,#19 + bic x28,x21,x27 + ror x8,x23,#28 + add x22,x22,x0 // h+=X[i] + eor x16,x16,x27,ror#18 + eor x7,x7,x2,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x23,x24 // a^b, b^c in next round + eor x16,x16,x27,ror#41 // Sigma1(e) + eor x8,x8,x23,ror#34 + add x22,x22,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x6,x6,x15,ror#61 + eor x7,x7,x2,lsr#7 // sigma0(X[i+1]) + add x22,x22,x16 // h+=Sigma1(e) + eor x19,x19,x24 // Maj(a,b,c) + eor x17,x8,x23,ror#39 // Sigma0(a) + eor x6,x6,x15,lsr#6 // sigma1(X[i+14]) + add x1,x1,x10 + add x26,x26,x22 // d+=h + add x22,x22,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x1,x1,x7 + add x22,x22,x17 // h+=Sigma0(a) + add x1,x1,x6 + ldr x6,[sp,#24] + str x9,[sp,#16] + ror x16,x26,#14 + add x21,x21,x19 // h+=K[i] + ror x8,x3,#1 + and x17,x27,x26 + ror x7,x0,#19 + bic x19,x20,x26 + ror x9,x22,#28 + add 
x21,x21,x1 // h+=X[i] + eor x16,x16,x26,ror#18 + eor x8,x8,x3,ror#8 + orr x17,x17,x19 // Ch(e,f,g) + eor x19,x22,x23 // a^b, b^c in next round + eor x16,x16,x26,ror#41 // Sigma1(e) + eor x9,x9,x22,ror#34 + add x21,x21,x17 // h+=Ch(e,f,g) + and x28,x28,x19 // (b^c)&=(a^b) + eor x7,x7,x0,ror#61 + eor x8,x8,x3,lsr#7 // sigma0(X[i+1]) + add x21,x21,x16 // h+=Sigma1(e) + eor x28,x28,x23 // Maj(a,b,c) + eor x17,x9,x22,ror#39 // Sigma0(a) + eor x7,x7,x0,lsr#6 // sigma1(X[i+14]) + add x2,x2,x11 + add x25,x25,x21 // d+=h + add x21,x21,x28 // h+=Maj(a,b,c) + ldr x28,[x30],#8 // *K++, x19 in next round + add x2,x2,x8 + add x21,x21,x17 // h+=Sigma0(a) + add x2,x2,x7 + ldr x7,[sp,#0] + str x10,[sp,#24] + ror x16,x25,#14 + add x20,x20,x28 // h+=K[i] + ror x9,x4,#1 + and x17,x26,x25 + ror x8,x1,#19 + bic x28,x27,x25 + ror x10,x21,#28 + add x20,x20,x2 // h+=X[i] + eor x16,x16,x25,ror#18 + eor x9,x9,x4,ror#8 + orr x17,x17,x28 // Ch(e,f,g) + eor x28,x21,x22 // a^b, b^c in next round + eor x16,x16,x25,ror#41 // Sigma1(e) + eor x10,x10,x21,ror#34 + add x20,x20,x17 // h+=Ch(e,f,g) + and x19,x19,x28 // (b^c)&=(a^b) + eor x8,x8,x1,ror#61 + eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) + add x20,x20,x16 // h+=Sigma1(e) + eor x19,x19,x22 // Maj(a,b,c) + eor x17,x10,x21,ror#39 // Sigma0(a) + eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) + add x3,x3,x12 + add x24,x24,x20 // d+=h + add x20,x20,x19 // h+=Maj(a,b,c) + ldr x19,[x30],#8 // *K++, x28 in next round + add x3,x3,x9 + add x20,x20,x17 // h+=Sigma0(a) + add x3,x3,x8 + cbnz x19,Loop_16_xx + + ldp x0,x2,[x29,#96] + ldr x1,[x29,#112] + sub x30,x30,#648 // rewind + + ldp x3,x4,[x0] + ldp x5,x6,[x0,#2*8] + add x1,x1,#14*8 // advance input pointer + ldp x7,x8,[x0,#4*8] + add x20,x20,x3 + ldp x9,x10,[x0,#6*8] + add x21,x21,x4 + add x22,x22,x5 + add x23,x23,x6 + stp x20,x21,[x0] + add x24,x24,x7 + add x25,x25,x8 + stp x22,x23,[x0,#2*8] + add x26,x26,x9 + add x27,x27,x10 + cmp x1,x2 + stp x24,x25,[x0,#4*8] + stp x26,x27,[x0,#6*8] + b.ne Loop + + ldp 
x19,x20,[x29,#16] + add sp,sp,#4*8 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#128 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +.section .rodata +.align 6 + +LK512: +.quad 0x428a2f98d728ae22,0x7137449123ef65cd +.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc +.quad 0x3956c25bf348b538,0x59f111f1b605d019 +.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 +.quad 0xd807aa98a3030242,0x12835b0145706fbe +.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 +.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 +.quad 0x9bdc06a725c71235,0xc19bf174cf692694 +.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 +.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 +.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 +.quad 0x983e5152ee66dfab,0xa831c66d2db43210 +.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 +.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 +.quad 0x06ca6351e003826f,0x142929670a0e6e70 +.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 +.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df +.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 +.quad 0x81c2c92e47edaee6,0x92722c851482353b +.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 +.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 +.quad 0xd192e819d6ef5218,0xd69906245565a910 +.quad 0xf40e35855771202a,0x106aa07032bbd1b8 +.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 +.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb +.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 +.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 +.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec +.quad 0x90befffa23631e28,0xa4506cebde82bde9 +.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b +.quad 0xca273eceea26619c,0xd186b8c721c0c207 +.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 +.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 +.quad 0x113f9804bef90dae,0x1b710b35131c471b +.quad 0x28db77f523047d84,0x32caab7b40c72493 +.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c +.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a +.quad 
0x5fcb6fab3ad6faec,0x6c44198c4a475817 +.quad 0 // terminator + +.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 +.align 2 +.text +#ifndef __KERNEL__ +.globl sha512_block_data_order_hw + +.def sha512_block_data_order_hw + .type 32 +.endef +.align 6 +sha512_block_data_order_hw: + // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. + AARCH64_VALID_CALL_TARGET + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 // load input + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + + ld1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // load context + adrp x3,LK512 + add x3,x3,:lo12:LK512 + + rev64 v16.16b,v16.16b + rev64 v17.16b,v17.16b + rev64 v18.16b,v18.16b + rev64 v19.16b,v19.16b + rev64 v20.16b,v20.16b + rev64 v21.16b,v21.16b + rev64 v22.16b,v22.16b + rev64 v23.16b,v23.16b + b Loop_hw + +.align 4 +Loop_hw: + ld1 {v24.2d},[x3],#16 + subs x2,x2,#1 + sub x4,x1,#128 + orr v26.16b,v0.16b,v0.16b // offload + orr v27.16b,v1.16b,v1.16b + orr v28.16b,v2.16b,v2.16b + orr v29.16b,v3.16b,v3.16b + csel x1,x1,x4,ne // conditional rewind + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08230 //sha512su0 v16.2d,v17.2d + ext v7.16b,v20.16b,v21.16b,#8 +.long 0xce6680a3 //sha512h q3,q5,v6.2d +.long 0xce678af0 //sha512su1 v16.2d,v23.2d,v7.2d + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 q3,q1,v0.2d + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08251 //sha512su0 v17.2d,v18.2d + ext v7.16b,v21.16b,v22.16b,#8 +.long 
0xce6680a2 //sha512h q2,q5,v6.2d +.long 0xce678a11 //sha512su1 v17.2d,v16.2d,v7.2d + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 q2,q0,v3.2d + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08272 //sha512su0 v18.2d,v19.2d + ext v7.16b,v22.16b,v23.16b,#8 +.long 0xce6680a4 //sha512h q4,q5,v6.2d +.long 0xce678a32 //sha512su1 v18.2d,v17.2d,v7.2d + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 q4,q3,v2.2d + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08293 //sha512su0 v19.2d,v20.2d + ext v7.16b,v23.16b,v16.16b,#8 +.long 0xce6680a1 //sha512h q1,q5,v6.2d +.long 0xce678a53 //sha512su1 v19.2d,v18.2d,v7.2d + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 q1,q2,v4.2d + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082b4 //sha512su0 v20.2d,v21.2d + ext v7.16b,v16.16b,v17.16b,#8 +.long 0xce6680a0 //sha512h q0,q5,v6.2d +.long 0xce678a74 //sha512su1 v20.2d,v19.2d,v7.2d + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 q0,q4,v1.2d + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec082d5 //sha512su0 v21.2d,v22.2d + ext v7.16b,v17.16b,v18.16b,#8 +.long 0xce6680a3 //sha512h q3,q5,v6.2d +.long 0xce678a95 //sha512su1 v21.2d,v20.2d,v7.2d + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 q3,q1,v0.2d + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext 
v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082f6 //sha512su0 v22.2d,v23.2d + ext v7.16b,v18.16b,v19.16b,#8 +.long 0xce6680a2 //sha512h q2,q5,v6.2d +.long 0xce678ab6 //sha512su1 v22.2d,v21.2d,v7.2d + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 q2,q0,v3.2d + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08217 //sha512su0 v23.2d,v16.2d + ext v7.16b,v19.16b,v20.16b,#8 +.long 0xce6680a4 //sha512h q4,q5,v6.2d +.long 0xce678ad7 //sha512su1 v23.2d,v22.2d,v7.2d + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 q4,q3,v2.2d + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08230 //sha512su0 v16.2d,v17.2d + ext v7.16b,v20.16b,v21.16b,#8 +.long 0xce6680a1 //sha512h q1,q5,v6.2d +.long 0xce678af0 //sha512su1 v16.2d,v23.2d,v7.2d + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 q1,q2,v4.2d + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08251 //sha512su0 v17.2d,v18.2d + ext v7.16b,v21.16b,v22.16b,#8 +.long 0xce6680a0 //sha512h q0,q5,v6.2d +.long 0xce678a11 //sha512su1 v17.2d,v16.2d,v7.2d + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 q0,q4,v1.2d + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08272 //sha512su0 v18.2d,v19.2d + ext v7.16b,v22.16b,v23.16b,#8 +.long 0xce6680a3 //sha512h q3,q5,v6.2d +.long 0xce678a32 //sha512su1 
v18.2d,v17.2d,v7.2d + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 q3,q1,v0.2d + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08293 //sha512su0 v19.2d,v20.2d + ext v7.16b,v23.16b,v16.16b,#8 +.long 0xce6680a2 //sha512h q2,q5,v6.2d +.long 0xce678a53 //sha512su1 v19.2d,v18.2d,v7.2d + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 q2,q0,v3.2d + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082b4 //sha512su0 v20.2d,v21.2d + ext v7.16b,v16.16b,v17.16b,#8 +.long 0xce6680a4 //sha512h q4,q5,v6.2d +.long 0xce678a74 //sha512su1 v20.2d,v19.2d,v7.2d + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 q4,q3,v2.2d + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec082d5 //sha512su0 v21.2d,v22.2d + ext v7.16b,v17.16b,v18.16b,#8 +.long 0xce6680a1 //sha512h q1,q5,v6.2d +.long 0xce678a95 //sha512su1 v21.2d,v20.2d,v7.2d + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 q1,q2,v4.2d + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082f6 //sha512su0 v22.2d,v23.2d + ext v7.16b,v18.16b,v19.16b,#8 +.long 0xce6680a0 //sha512h q0,q5,v6.2d +.long 0xce678ab6 //sha512su1 v22.2d,v21.2d,v7.2d + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 q0,q4,v1.2d + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add 
v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08217 //sha512su0 v23.2d,v16.2d + ext v7.16b,v19.16b,v20.16b,#8 +.long 0xce6680a3 //sha512h q3,q5,v6.2d +.long 0xce678ad7 //sha512su1 v23.2d,v22.2d,v7.2d + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 q3,q1,v0.2d + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08230 //sha512su0 v16.2d,v17.2d + ext v7.16b,v20.16b,v21.16b,#8 +.long 0xce6680a2 //sha512h q2,q5,v6.2d +.long 0xce678af0 //sha512su1 v16.2d,v23.2d,v7.2d + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 q2,q0,v3.2d + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08251 //sha512su0 v17.2d,v18.2d + ext v7.16b,v21.16b,v22.16b,#8 +.long 0xce6680a4 //sha512h q4,q5,v6.2d +.long 0xce678a11 //sha512su1 v17.2d,v16.2d,v7.2d + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 q4,q3,v2.2d + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08272 //sha512su0 v18.2d,v19.2d + ext v7.16b,v22.16b,v23.16b,#8 +.long 0xce6680a1 //sha512h q1,q5,v6.2d +.long 0xce678a32 //sha512su1 v18.2d,v17.2d,v7.2d + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 q1,q2,v4.2d + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08293 //sha512su0 v19.2d,v20.2d + ext v7.16b,v23.16b,v16.16b,#8 +.long 0xce6680a0 //sha512h q0,q5,v6.2d +.long 0xce678a53 //sha512su1 v19.2d,v18.2d,v7.2d + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 
//sha512h2 q0,q4,v1.2d + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082b4 //sha512su0 v20.2d,v21.2d + ext v7.16b,v16.16b,v17.16b,#8 +.long 0xce6680a3 //sha512h q3,q5,v6.2d +.long 0xce678a74 //sha512su1 v20.2d,v19.2d,v7.2d + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 q3,q1,v0.2d + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec082d5 //sha512su0 v21.2d,v22.2d + ext v7.16b,v17.16b,v18.16b,#8 +.long 0xce6680a2 //sha512h q2,q5,v6.2d +.long 0xce678a95 //sha512su1 v21.2d,v20.2d,v7.2d + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 q2,q0,v3.2d + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082f6 //sha512su0 v22.2d,v23.2d + ext v7.16b,v18.16b,v19.16b,#8 +.long 0xce6680a4 //sha512h q4,q5,v6.2d +.long 0xce678ab6 //sha512su1 v22.2d,v21.2d,v7.2d + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 q4,q3,v2.2d + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08217 //sha512su0 v23.2d,v16.2d + ext v7.16b,v19.16b,v20.16b,#8 +.long 0xce6680a1 //sha512h q1,q5,v6.2d +.long 0xce678ad7 //sha512su1 v23.2d,v22.2d,v7.2d + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 q1,q2,v4.2d + add v24.2d,v24.2d,v16.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08230 //sha512su0 v16.2d,v17.2d + ext 
v7.16b,v20.16b,v21.16b,#8 +.long 0xce6680a0 //sha512h q0,q5,v6.2d +.long 0xce678af0 //sha512su1 v16.2d,v23.2d,v7.2d + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 q0,q4,v1.2d + add v25.2d,v25.2d,v17.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08251 //sha512su0 v17.2d,v18.2d + ext v7.16b,v21.16b,v22.16b,#8 +.long 0xce6680a3 //sha512h q3,q5,v6.2d +.long 0xce678a11 //sha512su1 v17.2d,v16.2d,v7.2d + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 q3,q1,v0.2d + add v24.2d,v24.2d,v18.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec08272 //sha512su0 v18.2d,v19.2d + ext v7.16b,v22.16b,v23.16b,#8 +.long 0xce6680a2 //sha512h q2,q5,v6.2d +.long 0xce678a32 //sha512su1 v18.2d,v17.2d,v7.2d + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 q2,q0,v3.2d + add v25.2d,v25.2d,v19.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08293 //sha512su0 v19.2d,v20.2d + ext v7.16b,v23.16b,v16.16b,#8 +.long 0xce6680a4 //sha512h q4,q5,v6.2d +.long 0xce678a53 //sha512su1 v19.2d,v18.2d,v7.2d + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 q4,q3,v2.2d + add v24.2d,v24.2d,v20.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082b4 //sha512su0 v20.2d,v21.2d + ext v7.16b,v16.16b,v17.16b,#8 +.long 0xce6680a1 //sha512h q1,q5,v6.2d +.long 0xce678a74 //sha512su1 v20.2d,v19.2d,v7.2d + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 q1,q2,v4.2d + add v25.2d,v25.2d,v21.2d + ld1 {v24.2d},[x3],#16 + ext 
v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec082d5 //sha512su0 v21.2d,v22.2d + ext v7.16b,v17.16b,v18.16b,#8 +.long 0xce6680a0 //sha512h q0,q5,v6.2d +.long 0xce678a95 //sha512su1 v21.2d,v20.2d,v7.2d + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 q0,q4,v1.2d + add v24.2d,v24.2d,v22.2d + ld1 {v25.2d},[x3],#16 + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" +.long 0xcec082f6 //sha512su0 v22.2d,v23.2d + ext v7.16b,v18.16b,v19.16b,#8 +.long 0xce6680a3 //sha512h q3,q5,v6.2d +.long 0xce678ab6 //sha512su1 v22.2d,v21.2d,v7.2d + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 q3,q1,v0.2d + add v25.2d,v25.2d,v23.2d + ld1 {v24.2d},[x3],#16 + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" +.long 0xcec08217 //sha512su0 v23.2d,v16.2d + ext v7.16b,v19.16b,v20.16b,#8 +.long 0xce6680a2 //sha512h q2,q5,v6.2d +.long 0xce678ad7 //sha512su1 v23.2d,v22.2d,v7.2d + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 q2,q0,v3.2d + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v16.2d + ld1 {v16.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" +.long 0xce6680a4 //sha512h q4,q5,v6.2d + rev64 v16.16b,v16.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 q4,q3,v2.2d + ld1 {v24.2d},[x3],#16 + add v25.2d,v25.2d,v17.2d + ld1 {v17.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" +.long 0xce6680a1 //sha512h q1,q5,v6.2d + rev64 v17.16b,v17.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 q1,q2,v4.2d + 
ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v18.2d + ld1 {v18.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" +.long 0xce6680a0 //sha512h q0,q5,v6.2d + rev64 v18.16b,v18.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 q0,q4,v1.2d + ld1 {v24.2d},[x3],#16 + add v25.2d,v25.2d,v19.2d + ld1 {v19.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v2.16b,v3.16b,#8 + ext v6.16b,v1.16b,v2.16b,#8 + add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" +.long 0xce6680a3 //sha512h q3,q5,v6.2d + rev64 v19.16b,v19.16b + add v4.2d,v1.2d,v3.2d // "D + T1" +.long 0xce608423 //sha512h2 q3,q1,v0.2d + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v20.2d + ld1 {v20.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v4.16b,v2.16b,#8 + ext v6.16b,v0.16b,v4.16b,#8 + add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" +.long 0xce6680a2 //sha512h q2,q5,v6.2d + rev64 v20.16b,v20.16b + add v1.2d,v0.2d,v2.2d // "D + T1" +.long 0xce638402 //sha512h2 q2,q0,v3.2d + ld1 {v24.2d},[x3],#16 + add v25.2d,v25.2d,v21.2d + ld1 {v21.16b},[x1],#16 // load next input + ext v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v1.16b,v4.16b,#8 + ext v6.16b,v3.16b,v1.16b,#8 + add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" +.long 0xce6680a4 //sha512h q4,q5,v6.2d + rev64 v21.16b,v21.16b + add v0.2d,v3.2d,v4.2d // "D + T1" +.long 0xce628464 //sha512h2 q4,q3,v2.2d + ld1 {v25.2d},[x3],#16 + add v24.2d,v24.2d,v22.2d + ld1 {v22.16b},[x1],#16 // load next input + ext v24.16b,v24.16b,v24.16b,#8 + ext v5.16b,v0.16b,v1.16b,#8 + ext v6.16b,v2.16b,v0.16b,#8 + add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" +.long 0xce6680a1 //sha512h q1,q5,v6.2d + rev64 v22.16b,v22.16b + add v3.2d,v2.2d,v1.2d // "D + T1" +.long 0xce648441 //sha512h2 q1,q2,v4.2d + sub x3,x3,#80*8 // rewind + add v25.2d,v25.2d,v23.2d + ld1 {v23.16b},[x1],#16 // load next input + ext 
v25.16b,v25.16b,v25.16b,#8 + ext v5.16b,v3.16b,v0.16b,#8 + ext v6.16b,v4.16b,v3.16b,#8 + add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" +.long 0xce6680a0 //sha512h q0,q5,v6.2d + rev64 v23.16b,v23.16b + add v2.2d,v4.2d,v0.2d // "D + T1" +.long 0xce618480 //sha512h2 q0,q4,v1.2d + add v0.2d,v0.2d,v26.2d // accumulate + add v1.2d,v1.2d,v27.2d + add v2.2d,v2.2d,v28.2d + add v3.2d,v3.2d,v29.2d + + cbnz x2,Loop_hw + + st1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // store context + + ldr x29,[sp],#16 + ret + +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) diff --git a/third_party/boringssl/gen/bcm/sha512-x86_64-apple.S b/third_party/boringssl/gen/bcm/sha512-x86_64-apple.S new file mode 100644 index 00000000..58f27a48 --- /dev/null +++ b/third_party/boringssl/gen/bcm/sha512-x86_64-apple.S @@ -0,0 +1,2978 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.text + +.globl _sha512_block_data_order_nohw +.private_extern _sha512_block_data_order_nohw + +.p2align 4 +_sha512_block_data_order_nohw: + +_CET_ENDBR + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + shlq $4,%rdx + subq $128+32,%rsp + leaq (%rsi,%rdx,8),%rdx + andq $-64,%rsp + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %rax,152(%rsp) + +L$prologue: + + movq 0(%rdi),%rax + movq 8(%rdi),%rbx + movq 16(%rdi),%rcx + movq 24(%rdi),%rdx + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 + jmp L$loop + +.p2align 4 +L$loop: + movq %rbx,%rdi + leaq K512(%rip),%rbp + xorq %rcx,%rdi + movq 0(%rsi),%r12 + movq %r8,%r13 + movq %rax,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r9,%r15 + + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + + movq %r12,0(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + + rorq $4,%r13 + addq %r11,%r12 + xorq 
%r10,%r15 + + rorq $6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + + movq %rax,%r15 + addq (%rbp),%r12 + xorq %rax,%r14 + + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + + leaq 8(%rbp),%rbp + addq %r14,%r11 + movq 8(%rsi),%r12 + movq %rdx,%r13 + movq %r11,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r8,%rdi + + xorq %rdx,%r13 + rorq $5,%r14 + xorq %r9,%rdi + + movq %r12,8(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi + + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi + + rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + + movq %r11,%rdi + addq (%rbp),%r12 + xorq %r11,%r14 + + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r10 + addq %r12,%rcx + addq %r12,%r10 + + leaq 24(%rbp),%rbp + addq %r14,%r10 + movq 16(%rsi),%r12 + movq %rcx,%r13 + movq %r10,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rdx,%r15 + + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + + movq %r12,16(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + + leaq 8(%rbp),%rbp + addq %r14,%r9 + movq 24(%rsi),%r12 + movq %rbx,%r13 + movq %r9,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rcx,%rdi + + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + + movq %r12,24(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + + movq %r9,%rdi + addq (%rbp),%r12 + xorq %r9,%r14 + + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r8 + addq %r12,%rax + addq %r12,%r8 + + leaq 24(%rbp),%rbp + addq %r14,%r8 + movq 
32(%rsi),%r12 + movq %rax,%r13 + movq %r8,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rbx,%r15 + + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + + movq %r12,32(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 + + rorq $6,%r14 + xorq %rax,%r13 + addq %r15,%r12 + + movq %r8,%r15 + addq (%rbp),%r12 + xorq %r8,%r14 + + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + + leaq 8(%rbp),%rbp + addq %r14,%rdx + movq 40(%rsi),%r12 + movq %r11,%r13 + movq %rdx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rax,%rdi + + xorq %r11,%r13 + rorq $5,%r14 + xorq %rbx,%rdi + + movq %r12,40(%rsp) + xorq %rdx,%r14 + andq %r11,%rdi + + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + + rorq $6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + + xorq %r8,%rdi + rorq $14,%r13 + movq %r8,%rcx + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rcx + addq %r12,%r10 + addq %r12,%rcx + + leaq 24(%rbp),%rbp + addq %r14,%rcx + movq 48(%rsi),%r12 + movq %r10,%r13 + movq %rcx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r11,%r15 + + xorq %r10,%r13 + rorq $5,%r14 + xorq %rax,%r15 + + movq %r12,48(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + + xorq %rdx,%r15 + rorq $14,%r13 + movq %rdx,%rbx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + + leaq 8(%rbp),%rbp + addq %r14,%rbx + movq 56(%rsi),%r12 + movq %r9,%r13 + movq %rbx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r10,%rdi + + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + + movq %r12,56(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + + movq 
%rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rax + addq %r12,%r8 + addq %r12,%rax + + leaq 24(%rbp),%rbp + addq %r14,%rax + movq 64(%rsi),%r12 + movq %r8,%r13 + movq %rax,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r9,%r15 + + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + + movq %r12,64(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 + + rorq $6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + + movq %rax,%r15 + addq (%rbp),%r12 + xorq %rax,%r14 + + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + + leaq 8(%rbp),%rbp + addq %r14,%r11 + movq 72(%rsi),%r12 + movq %rdx,%r13 + movq %r11,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r8,%rdi + + xorq %rdx,%r13 + rorq $5,%r14 + xorq %r9,%rdi + + movq %r12,72(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi + + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi + + rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + + movq %r11,%rdi + addq (%rbp),%r12 + xorq %r11,%r14 + + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r10 + addq %r12,%rcx + addq %r12,%r10 + + leaq 24(%rbp),%rbp + addq %r14,%r10 + movq 80(%rsi),%r12 + movq %rcx,%r13 + movq %r10,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rdx,%r15 + + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + + movq %r12,80(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + + leaq 8(%rbp),%rbp + addq %r14,%r9 + movq 88(%rsi),%r12 + movq %rbx,%r13 + movq %r9,%r14 + bswapq %r12 + rorq 
$23,%r13 + movq %rcx,%rdi + + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + + movq %r12,88(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + + movq %r9,%rdi + addq (%rbp),%r12 + xorq %r9,%r14 + + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r8 + addq %r12,%rax + addq %r12,%r8 + + leaq 24(%rbp),%rbp + addq %r14,%r8 + movq 96(%rsi),%r12 + movq %rax,%r13 + movq %r8,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rbx,%r15 + + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + + movq %r12,96(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 + + rorq $6,%r14 + xorq %rax,%r13 + addq %r15,%r12 + + movq %r8,%r15 + addq (%rbp),%r12 + xorq %r8,%r14 + + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + + leaq 8(%rbp),%rbp + addq %r14,%rdx + movq 104(%rsi),%r12 + movq %r11,%r13 + movq %rdx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rax,%rdi + + xorq %r11,%r13 + rorq $5,%r14 + xorq %rbx,%rdi + + movq %r12,104(%rsp) + xorq %rdx,%r14 + andq %r11,%rdi + + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + + rorq $6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + + xorq %r8,%rdi + rorq $14,%r13 + movq %r8,%rcx + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rcx + addq %r12,%r10 + addq %r12,%rcx + + leaq 24(%rbp),%rbp + addq %r14,%rcx + movq 112(%rsi),%r12 + movq %r10,%r13 + movq %rcx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r11,%r15 + + xorq %r10,%r13 + rorq $5,%r14 + xorq %rax,%r15 + + movq %r12,112(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + + xorq %rdx,%r15 + rorq 
$14,%r13 + movq %rdx,%rbx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + + leaq 8(%rbp),%rbp + addq %r14,%rbx + movq 120(%rsi),%r12 + movq %r9,%r13 + movq %rbx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r10,%rdi + + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + + movq %r12,120(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + + movq %rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rax + addq %r12,%r8 + addq %r12,%rax + + leaq 24(%rbp),%rbp + jmp L$rounds_16_xx +.p2align 4 +L$rounds_16_xx: + movq 8(%rsp),%r13 + movq 112(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rax + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 72(%rsp),%r12 + + addq 0(%rsp),%r12 + movq %r8,%r13 + addq %r15,%r12 + movq %rax,%r14 + rorq $23,%r13 + movq %r9,%r15 + + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + + movq %r12,0(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 + + rorq $6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + + movq %rax,%r15 + addq (%rbp),%r12 + xorq %rax,%r14 + + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + + leaq 8(%rbp),%rbp + movq 16(%rsp),%r13 + movq 120(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r11 + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 80(%rsp),%r12 + + addq 8(%rsp),%r12 + movq %rdx,%r13 + addq %rdi,%r12 + movq %r11,%r14 + rorq $23,%r13 + movq %r8,%rdi + + xorq %rdx,%r13 + rorq 
$5,%r14 + xorq %r9,%rdi + + movq %r12,8(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi + + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi + + rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + + movq %r11,%rdi + addq (%rbp),%r12 + xorq %r11,%r14 + + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r10 + addq %r12,%rcx + addq %r12,%r10 + + leaq 24(%rbp),%rbp + movq 24(%rsp),%r13 + movq 0(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r10 + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 88(%rsp),%r12 + + addq 16(%rsp),%r12 + movq %rcx,%r13 + addq %r15,%r12 + movq %r10,%r14 + rorq $23,%r13 + movq %rdx,%r15 + + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + + movq %r12,16(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + + leaq 8(%rbp),%rbp + movq 32(%rsp),%r13 + movq 8(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r9 + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 96(%rsp),%r12 + + addq 24(%rsp),%r12 + movq %rbx,%r13 + addq %rdi,%r12 + movq %r9,%r14 + rorq $23,%r13 + movq %rcx,%rdi + + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + + movq %r12,24(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + + movq %r9,%rdi + addq (%rbp),%r12 + xorq %r9,%r14 + + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq 
%r15,%r8 + addq %r12,%rax + addq %r12,%r8 + + leaq 24(%rbp),%rbp + movq 40(%rsp),%r13 + movq 16(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r8 + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 104(%rsp),%r12 + + addq 32(%rsp),%r12 + movq %rax,%r13 + addq %r15,%r12 + movq %r8,%r14 + rorq $23,%r13 + movq %rbx,%r15 + + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + + movq %r12,32(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 + + rorq $6,%r14 + xorq %rax,%r13 + addq %r15,%r12 + + movq %r8,%r15 + addq (%rbp),%r12 + xorq %r8,%r14 + + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + + leaq 8(%rbp),%rbp + movq 48(%rsp),%r13 + movq 24(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rdx + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 112(%rsp),%r12 + + addq 40(%rsp),%r12 + movq %r11,%r13 + addq %rdi,%r12 + movq %rdx,%r14 + rorq $23,%r13 + movq %rax,%rdi + + xorq %r11,%r13 + rorq $5,%r14 + xorq %rbx,%rdi + + movq %r12,40(%rsp) + xorq %rdx,%r14 + andq %r11,%rdi + + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + + rorq $6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + + xorq %r8,%rdi + rorq $14,%r13 + movq %r8,%rcx + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rcx + addq %r12,%r10 + addq %r12,%rcx + + leaq 24(%rbp),%rbp + movq 56(%rsp),%r13 + movq 32(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rcx + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 
120(%rsp),%r12 + + addq 48(%rsp),%r12 + movq %r10,%r13 + addq %r15,%r12 + movq %rcx,%r14 + rorq $23,%r13 + movq %r11,%r15 + + xorq %r10,%r13 + rorq $5,%r14 + xorq %rax,%r15 + + movq %r12,48(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + + xorq %rdx,%r15 + rorq $14,%r13 + movq %rdx,%rbx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + + leaq 8(%rbp),%rbp + movq 64(%rsp),%r13 + movq 40(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rbx + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 0(%rsp),%r12 + + addq 56(%rsp),%r12 + movq %r9,%r13 + addq %rdi,%r12 + movq %rbx,%r14 + rorq $23,%r13 + movq %r10,%rdi + + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + + movq %r12,56(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + + movq %rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rax + addq %r12,%r8 + addq %r12,%rax + + leaq 24(%rbp),%rbp + movq 72(%rsp),%r13 + movq 48(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rax + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 8(%rsp),%r12 + + addq 64(%rsp),%r12 + movq %r8,%r13 + addq %r15,%r12 + movq %rax,%r14 + rorq $23,%r13 + movq %r9,%r15 + + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + + movq %r12,64(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 + + rorq $6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + + movq 
%rax,%r15 + addq (%rbp),%r12 + xorq %rax,%r14 + + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + + leaq 8(%rbp),%rbp + movq 80(%rsp),%r13 + movq 56(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r11 + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 16(%rsp),%r12 + + addq 72(%rsp),%r12 + movq %rdx,%r13 + addq %rdi,%r12 + movq %r11,%r14 + rorq $23,%r13 + movq %r8,%rdi + + xorq %rdx,%r13 + rorq $5,%r14 + xorq %r9,%rdi + + movq %r12,72(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi + + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi + + rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + + movq %r11,%rdi + addq (%rbp),%r12 + xorq %r11,%r14 + + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r10 + addq %r12,%rcx + addq %r12,%r10 + + leaq 24(%rbp),%rbp + movq 88(%rsp),%r13 + movq 64(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r10 + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 24(%rsp),%r12 + + addq 80(%rsp),%r12 + movq %rcx,%r13 + addq %r15,%r12 + movq %r10,%r14 + rorq $23,%r13 + movq %rdx,%r15 + + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + + movq %r12,80(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + + leaq 8(%rbp),%rbp + movq 96(%rsp),%r13 + movq 72(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r9 + movq %rdi,%r14 + rorq 
$42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 32(%rsp),%r12 + + addq 88(%rsp),%r12 + movq %rbx,%r13 + addq %rdi,%r12 + movq %r9,%r14 + rorq $23,%r13 + movq %rcx,%rdi + + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + + movq %r12,88(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + + movq %r9,%rdi + addq (%rbp),%r12 + xorq %r9,%r14 + + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r8 + addq %r12,%rax + addq %r12,%r8 + + leaq 24(%rbp),%rbp + movq 104(%rsp),%r13 + movq 80(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r8 + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 40(%rsp),%r12 + + addq 96(%rsp),%r12 + movq %rax,%r13 + addq %r15,%r12 + movq %r8,%r14 + rorq $23,%r13 + movq %rbx,%r15 + + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + + movq %r12,96(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 + + rorq $6,%r14 + xorq %rax,%r13 + addq %r15,%r12 + + movq %r8,%r15 + addq (%rbp),%r12 + xorq %r8,%r14 + + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + + leaq 8(%rbp),%rbp + movq 112(%rsp),%r13 + movq 88(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rdx + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 48(%rsp),%r12 + + addq 104(%rsp),%r12 + movq %r11,%r13 + addq %rdi,%r12 + movq %rdx,%r14 + rorq $23,%r13 + movq %rax,%rdi + + xorq %r11,%r13 + rorq $5,%r14 + xorq %rbx,%rdi + + movq %r12,104(%rsp) + 
xorq %rdx,%r14 + andq %r11,%rdi + + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + + rorq $6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + + xorq %r8,%rdi + rorq $14,%r13 + movq %r8,%rcx + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rcx + addq %r12,%r10 + addq %r12,%rcx + + leaq 24(%rbp),%rbp + movq 120(%rsp),%r13 + movq 96(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rcx + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 56(%rsp),%r12 + + addq 112(%rsp),%r12 + movq %r10,%r13 + addq %r15,%r12 + movq %rcx,%r14 + rorq $23,%r13 + movq %r11,%r15 + + xorq %r10,%r13 + rorq $5,%r14 + xorq %rax,%r15 + + movq %r12,112(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + + xorq %rdx,%r15 + rorq $14,%r13 + movq %rdx,%rbx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + + leaq 8(%rbp),%rbp + movq 0(%rsp),%r13 + movq 104(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rbx + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 64(%rsp),%r12 + + addq 120(%rsp),%r12 + movq %r9,%r13 + addq %rdi,%r12 + movq %rbx,%r14 + rorq $23,%r13 + movq %r10,%rdi + + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + + movq %r12,120(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + + movq %rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rax + addq %r12,%r8 + addq 
%r12,%rax + + leaq 24(%rbp),%rbp + cmpb $0,7(%rbp) + jnz L$rounds_16_xx + + movq 128+0(%rsp),%rdi + addq %r14,%rax + leaq 128(%rsi),%rsi + + addq 0(%rdi),%rax + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + + cmpq 128+16(%rsp),%rsi + + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + jb L$loop + + movq 152(%rsp),%rsi + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$epilogue: + ret + + +.section __DATA,__const +.p2align 6 + +K512: +.quad 0x428a2f98d728ae22,0x7137449123ef65cd +.quad 0x428a2f98d728ae22,0x7137449123ef65cd +.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc +.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc +.quad 0x3956c25bf348b538,0x59f111f1b605d019 +.quad 0x3956c25bf348b538,0x59f111f1b605d019 +.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 +.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 +.quad 0xd807aa98a3030242,0x12835b0145706fbe +.quad 0xd807aa98a3030242,0x12835b0145706fbe +.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 +.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 +.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 +.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 +.quad 0x9bdc06a725c71235,0xc19bf174cf692694 +.quad 0x9bdc06a725c71235,0xc19bf174cf692694 +.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 +.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 +.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 +.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 +.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 +.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 +.quad 0x983e5152ee66dfab,0xa831c66d2db43210 +.quad 0x983e5152ee66dfab,0xa831c66d2db43210 +.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 +.quad 
0xb00327c898fb213f,0xbf597fc7beef0ee4 +.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 +.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 +.quad 0x06ca6351e003826f,0x142929670a0e6e70 +.quad 0x06ca6351e003826f,0x142929670a0e6e70 +.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 +.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 +.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df +.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df +.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 +.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 +.quad 0x81c2c92e47edaee6,0x92722c851482353b +.quad 0x81c2c92e47edaee6,0x92722c851482353b +.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 +.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 +.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 +.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 +.quad 0xd192e819d6ef5218,0xd69906245565a910 +.quad 0xd192e819d6ef5218,0xd69906245565a910 +.quad 0xf40e35855771202a,0x106aa07032bbd1b8 +.quad 0xf40e35855771202a,0x106aa07032bbd1b8 +.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 +.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 +.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb +.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb +.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 +.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 +.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 +.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 +.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec +.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec +.quad 0x90befffa23631e28,0xa4506cebde82bde9 +.quad 0x90befffa23631e28,0xa4506cebde82bde9 +.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b +.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b +.quad 0xca273eceea26619c,0xd186b8c721c0c207 +.quad 0xca273eceea26619c,0xd186b8c721c0c207 +.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 +.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 +.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 +.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 +.quad 0x113f9804bef90dae,0x1b710b35131c471b +.quad 
0x113f9804bef90dae,0x1b710b35131c471b +.quad 0x28db77f523047d84,0x32caab7b40c72493 +.quad 0x28db77f523047d84,0x32caab7b40c72493 +.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c +.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c +.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a +.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a +.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 +.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + +.quad 0x0001020304050607,0x08090a0b0c0d0e0f +.quad 0x0001020304050607,0x08090a0b0c0d0e0f +.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.text +.globl _sha512_block_data_order_avx +.private_extern _sha512_block_data_order_avx + +.p2align 6 +_sha512_block_data_order_avx: + +_CET_ENDBR + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + shlq $4,%rdx + subq $160,%rsp + leaq (%rsi,%rdx,8),%rdx + andq $-64,%rsp + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %rax,152(%rsp) + +L$prologue_avx: + + vzeroupper + movq 0(%rdi),%rax + movq 8(%rdi),%rbx + movq 16(%rdi),%rcx + movq 24(%rdi),%rdx + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 + jmp L$loop_avx +.p2align 4 +L$loop_avx: + vmovdqa K512+1280(%rip),%xmm11 + vmovdqu 0(%rsi),%xmm0 + leaq K512+128(%rip),%rbp + vmovdqu 16(%rsi),%xmm1 + vmovdqu 32(%rsi),%xmm2 + vpshufb %xmm11,%xmm0,%xmm0 + vmovdqu 48(%rsi),%xmm3 + vpshufb %xmm11,%xmm1,%xmm1 + vmovdqu 64(%rsi),%xmm4 + vpshufb %xmm11,%xmm2,%xmm2 + vmovdqu 80(%rsi),%xmm5 + vpshufb %xmm11,%xmm3,%xmm3 + vmovdqu 96(%rsi),%xmm6 + vpshufb %xmm11,%xmm4,%xmm4 + vmovdqu 112(%rsi),%xmm7 + vpshufb %xmm11,%xmm5,%xmm5 + vpaddq -128(%rbp),%xmm0,%xmm8 + vpshufb %xmm11,%xmm6,%xmm6 + vpaddq -96(%rbp),%xmm1,%xmm9 + vpshufb %xmm11,%xmm7,%xmm7 + vpaddq -64(%rbp),%xmm2,%xmm10 + vpaddq 
-32(%rbp),%xmm3,%xmm11 + vmovdqa %xmm8,0(%rsp) + vpaddq 0(%rbp),%xmm4,%xmm8 + vmovdqa %xmm9,16(%rsp) + vpaddq 32(%rbp),%xmm5,%xmm9 + vmovdqa %xmm10,32(%rsp) + vpaddq 64(%rbp),%xmm6,%xmm10 + vmovdqa %xmm11,48(%rsp) + vpaddq 96(%rbp),%xmm7,%xmm11 + vmovdqa %xmm8,64(%rsp) + movq %rax,%r14 + vmovdqa %xmm9,80(%rsp) + movq %rbx,%rdi + vmovdqa %xmm10,96(%rsp) + xorq %rcx,%rdi + vmovdqa %xmm11,112(%rsp) + movq %r8,%r13 + jmp L$avx_00_47 + +.p2align 4 +L$avx_00_47: + addq $256,%rbp + vpalignr $8,%xmm0,%xmm1,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rax + vpalignr $8,%xmm4,%xmm5,%xmm11 + movq %r9,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r8,%r13 + xorq %r10,%r12 + vpaddq %xmm11,%xmm0,%xmm0 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r8,%r12 + xorq %r8,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 0(%rsp),%r11 + movq %rax,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rbx,%r15 + addq %r12,%r11 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm7,%xmm11 + addq %r11,%rdx + addq %rdi,%r11 + vpxor %xmm9,%xmm8,%xmm8 + movq %rdx,%r13 + addq %r11,%r14 + vpsllq $3,%xmm7,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r11 + vpaddq %xmm8,%xmm0,%xmm0 + movq %r8,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm7,%xmm9 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rdx,%r12 + xorq %rdx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 8(%rsp),%r10 + movq %r11,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r9,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rax,%rdi + addq %r12,%r10 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm0,%xmm0 + xorq %r11,%r14 + addq %r13,%r10 + vpaddq -128(%rbp),%xmm0,%xmm10 + xorq %rax,%r15 + shrdq 
$28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,0(%rsp) + vpalignr $8,%xmm1,%xmm2,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r10 + vpalignr $8,%xmm5,%xmm6,%xmm11 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rcx,%r13 + xorq %r8,%r12 + vpaddq %xmm11,%xmm1,%xmm1 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rcx,%r12 + xorq %rcx,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 16(%rsp),%r9 + movq %r10,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r11,%r15 + addq %r12,%r9 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm0,%xmm11 + addq %r9,%rbx + addq %rdi,%r9 + vpxor %xmm9,%xmm8,%xmm8 + movq %rbx,%r13 + addq %r9,%r14 + vpsllq $3,%xmm0,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r9 + vpaddq %xmm8,%xmm1,%xmm1 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm0,%xmm9 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rbx,%r12 + xorq %rbx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 24(%rsp),%r8 + movq %r9,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r10,%rdi + addq %r12,%r8 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm1,%xmm1 + xorq %r9,%r14 + addq %r13,%r8 + vpaddq -96(%rbp),%xmm1,%xmm10 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,16(%rsp) + vpalignr $8,%xmm2,%xmm3,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r8 + vpalignr $8,%xmm6,%xmm7,%xmm11 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rax,%r13 + xorq %rcx,%r12 + vpaddq %xmm11,%xmm2,%xmm2 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + vpsrlq 
$7,%xmm8,%xmm11 + andq %rax,%r12 + xorq %rax,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 32(%rsp),%rdx + movq %r8,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r9,%r15 + addq %r12,%rdx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm1,%xmm11 + addq %rdx,%r11 + addq %rdi,%rdx + vpxor %xmm9,%xmm8,%xmm8 + movq %r11,%r13 + addq %rdx,%r14 + vpsllq $3,%xmm1,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rdx + vpaddq %xmm8,%xmm2,%xmm2 + movq %rax,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm1,%xmm9 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r11,%r12 + xorq %r11,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 40(%rsp),%rcx + movq %rdx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r8,%rdi + addq %r12,%rcx + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm2,%xmm2 + xorq %rdx,%r14 + addq %r13,%rcx + vpaddq -64(%rbp),%xmm2,%xmm10 + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,32(%rsp) + vpalignr $8,%xmm3,%xmm4,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rcx + vpalignr $8,%xmm7,%xmm0,%xmm11 + movq %r11,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r10,%r13 + xorq %rax,%r12 + vpaddq %xmm11,%xmm3,%xmm3 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r10,%r12 + xorq %r10,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 48(%rsp),%rbx + movq %rcx,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rdx,%r15 + addq %r12,%rbx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor 
%xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm2,%xmm11 + addq %rbx,%r9 + addq %rdi,%rbx + vpxor %xmm9,%xmm8,%xmm8 + movq %r9,%r13 + addq %rbx,%r14 + vpsllq $3,%xmm2,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rbx + vpaddq %xmm8,%xmm3,%xmm3 + movq %r10,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm2,%xmm9 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r9,%r12 + xorq %r9,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 56(%rsp),%rax + movq %rbx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r11,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rcx,%rdi + addq %r12,%rax + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm3,%xmm3 + xorq %rbx,%r14 + addq %r13,%rax + vpaddq -32(%rbp),%xmm3,%xmm10 + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,48(%rsp) + vpalignr $8,%xmm4,%xmm5,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rax + vpalignr $8,%xmm0,%xmm1,%xmm11 + movq %r9,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r8,%r13 + xorq %r10,%r12 + vpaddq %xmm11,%xmm4,%xmm4 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r8,%r12 + xorq %r8,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 64(%rsp),%r11 + movq %rax,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rbx,%r15 + addq %r12,%r11 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm3,%xmm11 + addq %r11,%rdx + addq %rdi,%r11 + vpxor %xmm9,%xmm8,%xmm8 + movq %rdx,%r13 + addq %r11,%r14 + vpsllq $3,%xmm3,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r11 + vpaddq %xmm8,%xmm4,%xmm4 + movq %r8,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm3,%xmm9 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor 
%xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rdx,%r12 + xorq %rdx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 72(%rsp),%r10 + movq %r11,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r9,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rax,%rdi + addq %r12,%r10 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm4,%xmm4 + xorq %r11,%r14 + addq %r13,%r10 + vpaddq 0(%rbp),%xmm4,%xmm10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,64(%rsp) + vpalignr $8,%xmm5,%xmm6,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r10 + vpalignr $8,%xmm1,%xmm2,%xmm11 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rcx,%r13 + xorq %r8,%r12 + vpaddq %xmm11,%xmm5,%xmm5 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rcx,%r12 + xorq %rcx,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 80(%rsp),%r9 + movq %r10,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r11,%r15 + addq %r12,%r9 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm4,%xmm11 + addq %r9,%rbx + addq %rdi,%r9 + vpxor %xmm9,%xmm8,%xmm8 + movq %rbx,%r13 + addq %r9,%r14 + vpsllq $3,%xmm4,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r9 + vpaddq %xmm8,%xmm5,%xmm5 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm4,%xmm9 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rbx,%r12 + xorq %rbx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 88(%rsp),%r8 + movq %r9,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r10,%rdi + addq %r12,%r8 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq 
%xmm11,%xmm5,%xmm5 + xorq %r9,%r14 + addq %r13,%r8 + vpaddq 32(%rbp),%xmm5,%xmm10 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,80(%rsp) + vpalignr $8,%xmm6,%xmm7,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r8 + vpalignr $8,%xmm2,%xmm3,%xmm11 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rax,%r13 + xorq %rcx,%r12 + vpaddq %xmm11,%xmm6,%xmm6 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rax,%r12 + xorq %rax,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 96(%rsp),%rdx + movq %r8,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r9,%r15 + addq %r12,%rdx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm5,%xmm11 + addq %rdx,%r11 + addq %rdi,%rdx + vpxor %xmm9,%xmm8,%xmm8 + movq %r11,%r13 + addq %rdx,%r14 + vpsllq $3,%xmm5,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rdx + vpaddq %xmm8,%xmm6,%xmm6 + movq %rax,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm5,%xmm9 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r11,%r12 + xorq %r11,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 104(%rsp),%rcx + movq %rdx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r8,%rdi + addq %r12,%rcx + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm6,%xmm6 + xorq %rdx,%r14 + addq %r13,%rcx + vpaddq 64(%rbp),%xmm6,%xmm10 + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,96(%rsp) + vpalignr $8,%xmm7,%xmm0,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rcx + vpalignr $8,%xmm3,%xmm4,%xmm11 + movq %r11,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + 
xorq %r10,%r13 + xorq %rax,%r12 + vpaddq %xmm11,%xmm7,%xmm7 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r10,%r12 + xorq %r10,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 112(%rsp),%rbx + movq %rcx,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rdx,%r15 + addq %r12,%rbx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm6,%xmm11 + addq %rbx,%r9 + addq %rdi,%rbx + vpxor %xmm9,%xmm8,%xmm8 + movq %r9,%r13 + addq %rbx,%r14 + vpsllq $3,%xmm6,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rbx + vpaddq %xmm8,%xmm7,%xmm7 + movq %r10,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm6,%xmm9 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r9,%r12 + xorq %r9,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 120(%rsp),%rax + movq %rbx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r11,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rcx,%rdi + addq %r12,%rax + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm7,%xmm7 + xorq %rbx,%r14 + addq %r13,%rax + vpaddq 96(%rbp),%xmm7,%xmm10 + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,112(%rsp) + cmpb $0,135(%rbp) + jne L$avx_00_47 + shrdq $23,%r13,%r13 + movq %r14,%rax + movq %r9,%r12 + shrdq $5,%r14,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 0(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r11 + 
movq %r8,%r12 + shrdq $5,%r14,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 8(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + shrdq $6,%r14,%r14 + xorq %rax,%rdi + addq %r12,%r10 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r10 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 16(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + xorq %r11,%r15 + addq %r12,%r9 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r9 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 24(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + xorq %r10,%rdi + addq %r12,%r8 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r8 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 32(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + xorq %r9,%r15 + addq %r12,%rdx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + shrdq $28,%r14,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rdx + movq %rax,%r12 + shrdq $5,%r14,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq 
%r11,%r13 + addq 40(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + xorq %r8,%rdi + addq %r12,%rcx + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rcx + movq %r11,%r12 + shrdq $5,%r14,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 48(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rbx + movq %r10,%r12 + shrdq $5,%r14,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 56(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + shrdq $6,%r14,%r14 + xorq %rcx,%rdi + addq %r12,%rax + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rax + movq %r9,%r12 + shrdq $5,%r14,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 64(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r11 + movq %r8,%r12 + shrdq $5,%r14,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 72(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + shrdq $6,%r14,%r14 + xorq %rax,%rdi + addq %r12,%r10 + 
shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r10 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 80(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + xorq %r11,%r15 + addq %r12,%r9 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r9 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 88(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + xorq %r10,%rdi + addq %r12,%r8 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r8 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 96(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + xorq %r9,%r15 + addq %r12,%rdx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + shrdq $28,%r14,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rdx + movq %rax,%r12 + shrdq $5,%r14,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 104(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + xorq %r8,%rdi + addq %r12,%rcx + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + 
addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rcx + movq %r11,%r12 + shrdq $5,%r14,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 112(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rbx + movq %r10,%r12 + shrdq $5,%r14,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 120(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + shrdq $6,%r14,%r14 + xorq %rcx,%rdi + addq %r12,%rax + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + movq 128+0(%rsp),%rdi + movq %r14,%rax + + addq 0(%rdi),%rax + leaq 128(%rsi),%rsi + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + + cmpq 128+16(%rsp),%rsi + + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + jb L$loop_avx + + movq 152(%rsp),%rsi + + vzeroupper + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$epilogue_avx: + ret + + +#endif diff --git a/third_party/boringssl/gen/bcm/sha512-x86_64-linux.S b/third_party/boringssl/gen/bcm/sha512-x86_64-linux.S new file mode 100644 index 00000000..bbef9430 --- /dev/null +++ b/third_party/boringssl/gen/bcm/sha512-x86_64-linux.S @@ -0,0 +1,2978 @@ +// This file is generated from a similarly-named Perl script in the 
BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.text + +.globl sha512_block_data_order_nohw +.hidden sha512_block_data_order_nohw +.type sha512_block_data_order_nohw,@function +.align 16 +sha512_block_data_order_nohw: +.cfi_startproc +_CET_ENDBR + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $128+32,%rsp + leaq (%rsi,%rdx,8),%rdx + andq $-64,%rsp + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %rax,152(%rsp) +.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08 +.Lprologue: + + movq 0(%rdi),%rax + movq 8(%rdi),%rbx + movq 16(%rdi),%rcx + movq 24(%rdi),%rdx + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 + jmp .Lloop + +.align 16 +.Lloop: + movq %rbx,%rdi + leaq K512(%rip),%rbp + xorq %rcx,%rdi + movq 0(%rsi),%r12 + movq %r8,%r13 + movq %rax,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r9,%r15 + + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + + movq %r12,0(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 + + rorq $6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + + movq %rax,%r15 + addq (%rbp),%r12 + xorq %rax,%r14 + + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + + leaq 8(%rbp),%rbp + addq %r14,%r11 + movq 8(%rsi),%r12 + movq %rdx,%r13 + movq %r11,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r8,%rdi + + xorq %rdx,%r13 + rorq $5,%r14 + xorq %r9,%rdi + + movq %r12,8(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi + + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi + + rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + + movq %r11,%rdi + addq (%rbp),%r12 + xorq 
%r11,%r14 + + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r10 + addq %r12,%rcx + addq %r12,%r10 + + leaq 24(%rbp),%rbp + addq %r14,%r10 + movq 16(%rsi),%r12 + movq %rcx,%r13 + movq %r10,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rdx,%r15 + + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + + movq %r12,16(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + + leaq 8(%rbp),%rbp + addq %r14,%r9 + movq 24(%rsi),%r12 + movq %rbx,%r13 + movq %r9,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rcx,%rdi + + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + + movq %r12,24(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + + movq %r9,%rdi + addq (%rbp),%r12 + xorq %r9,%r14 + + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r8 + addq %r12,%rax + addq %r12,%r8 + + leaq 24(%rbp),%rbp + addq %r14,%r8 + movq 32(%rsi),%r12 + movq %rax,%r13 + movq %r8,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rbx,%r15 + + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + + movq %r12,32(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 + + rorq $6,%r14 + xorq %rax,%r13 + addq %r15,%r12 + + movq %r8,%r15 + addq (%rbp),%r12 + xorq %r8,%r14 + + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + + leaq 8(%rbp),%rbp + addq %r14,%rdx + movq 40(%rsi),%r12 + movq %r11,%r13 + movq %rdx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rax,%rdi + + xorq %r11,%r13 + 
rorq $5,%r14 + xorq %rbx,%rdi + + movq %r12,40(%rsp) + xorq %rdx,%r14 + andq %r11,%rdi + + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + + rorq $6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + + xorq %r8,%rdi + rorq $14,%r13 + movq %r8,%rcx + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rcx + addq %r12,%r10 + addq %r12,%rcx + + leaq 24(%rbp),%rbp + addq %r14,%rcx + movq 48(%rsi),%r12 + movq %r10,%r13 + movq %rcx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r11,%r15 + + xorq %r10,%r13 + rorq $5,%r14 + xorq %rax,%r15 + + movq %r12,48(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + + xorq %rdx,%r15 + rorq $14,%r13 + movq %rdx,%rbx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + + leaq 8(%rbp),%rbp + addq %r14,%rbx + movq 56(%rsi),%r12 + movq %r9,%r13 + movq %rbx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r10,%rdi + + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + + movq %r12,56(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + + movq %rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rax + addq %r12,%r8 + addq %r12,%rax + + leaq 24(%rbp),%rbp + addq %r14,%rax + movq 64(%rsi),%r12 + movq %r8,%r13 + movq %rax,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r9,%r15 + + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + + movq %r12,64(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 + + rorq $6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + + movq %rax,%r15 + addq (%rbp),%r12 + xorq %rax,%r14 + + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + + andq %r15,%rdi + rorq 
$28,%r14 + addq %r13,%r12 + + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + + leaq 8(%rbp),%rbp + addq %r14,%r11 + movq 72(%rsi),%r12 + movq %rdx,%r13 + movq %r11,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r8,%rdi + + xorq %rdx,%r13 + rorq $5,%r14 + xorq %r9,%rdi + + movq %r12,72(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi + + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi + + rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + + movq %r11,%rdi + addq (%rbp),%r12 + xorq %r11,%r14 + + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r10 + addq %r12,%rcx + addq %r12,%r10 + + leaq 24(%rbp),%rbp + addq %r14,%r10 + movq 80(%rsi),%r12 + movq %rcx,%r13 + movq %r10,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rdx,%r15 + + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + + movq %r12,80(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + + leaq 8(%rbp),%rbp + addq %r14,%r9 + movq 88(%rsi),%r12 + movq %rbx,%r13 + movq %r9,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rcx,%rdi + + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + + movq %r12,88(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + + movq %r9,%rdi + addq (%rbp),%r12 + xorq %r9,%r14 + + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r8 + addq %r12,%rax + addq %r12,%r8 + + leaq 24(%rbp),%rbp + addq %r14,%r8 + movq 96(%rsi),%r12 + movq %rax,%r13 + movq %r8,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rbx,%r15 + + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + + movq %r12,96(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + 
+ rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 + + rorq $6,%r14 + xorq %rax,%r13 + addq %r15,%r12 + + movq %r8,%r15 + addq (%rbp),%r12 + xorq %r8,%r14 + + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + + leaq 8(%rbp),%rbp + addq %r14,%rdx + movq 104(%rsi),%r12 + movq %r11,%r13 + movq %rdx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %rax,%rdi + + xorq %r11,%r13 + rorq $5,%r14 + xorq %rbx,%rdi + + movq %r12,104(%rsp) + xorq %rdx,%r14 + andq %r11,%rdi + + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + + rorq $6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + + xorq %r8,%rdi + rorq $14,%r13 + movq %r8,%rcx + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rcx + addq %r12,%r10 + addq %r12,%rcx + + leaq 24(%rbp),%rbp + addq %r14,%rcx + movq 112(%rsi),%r12 + movq %r10,%r13 + movq %rcx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r11,%r15 + + xorq %r10,%r13 + rorq $5,%r14 + xorq %rax,%r15 + + movq %r12,112(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + + xorq %rdx,%r15 + rorq $14,%r13 + movq %rdx,%rbx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + + leaq 8(%rbp),%rbp + addq %r14,%rbx + movq 120(%rsi),%r12 + movq %r9,%r13 + movq %rbx,%r14 + bswapq %r12 + rorq $23,%r13 + movq %r10,%rdi + + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + + movq %r12,120(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + + movq %rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rax + addq %r12,%r8 + addq %r12,%rax + + 
leaq 24(%rbp),%rbp + jmp .Lrounds_16_xx +.align 16 +.Lrounds_16_xx: + movq 8(%rsp),%r13 + movq 112(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rax + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 72(%rsp),%r12 + + addq 0(%rsp),%r12 + movq %r8,%r13 + addq %r15,%r12 + movq %rax,%r14 + rorq $23,%r13 + movq %r9,%r15 + + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + + movq %r12,0(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 + + rorq $6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + + movq %rax,%r15 + addq (%rbp),%r12 + xorq %rax,%r14 + + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + + leaq 8(%rbp),%rbp + movq 16(%rsp),%r13 + movq 120(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r11 + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 80(%rsp),%r12 + + addq 8(%rsp),%r12 + movq %rdx,%r13 + addq %rdi,%r12 + movq %r11,%r14 + rorq $23,%r13 + movq %r8,%rdi + + xorq %rdx,%r13 + rorq $5,%r14 + xorq %r9,%rdi + + movq %r12,8(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi + + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi + + rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + + movq %r11,%rdi + addq (%rbp),%r12 + xorq %r11,%r14 + + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r10 + addq %r12,%rcx + addq %r12,%r10 + + leaq 24(%rbp),%rbp + movq 24(%rsp),%r13 + movq 0(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r10 + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 88(%rsp),%r12 
+ + addq 16(%rsp),%r12 + movq %rcx,%r13 + addq %r15,%r12 + movq %r10,%r14 + rorq $23,%r13 + movq %rdx,%r15 + + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + + movq %r12,16(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + + leaq 8(%rbp),%rbp + movq 32(%rsp),%r13 + movq 8(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r9 + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 96(%rsp),%r12 + + addq 24(%rsp),%r12 + movq %rbx,%r13 + addq %rdi,%r12 + movq %r9,%r14 + rorq $23,%r13 + movq %rcx,%rdi + + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + + movq %r12,24(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + + movq %r9,%rdi + addq (%rbp),%r12 + xorq %r9,%r14 + + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r8 + addq %r12,%rax + addq %r12,%r8 + + leaq 24(%rbp),%rbp + movq 40(%rsp),%r13 + movq 16(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r8 + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 104(%rsp),%r12 + + addq 32(%rsp),%r12 + movq %rax,%r13 + addq %r15,%r12 + movq %r8,%r14 + rorq $23,%r13 + movq %rbx,%r15 + + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + + movq %r12,32(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 + + rorq $6,%r14 + xorq %rax,%r13 + addq %r15,%r12 + + movq %r8,%r15 + addq (%rbp),%r12 
+ xorq %r8,%r14 + + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + + leaq 8(%rbp),%rbp + movq 48(%rsp),%r13 + movq 24(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rdx + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 112(%rsp),%r12 + + addq 40(%rsp),%r12 + movq %r11,%r13 + addq %rdi,%r12 + movq %rdx,%r14 + rorq $23,%r13 + movq %rax,%rdi + + xorq %r11,%r13 + rorq $5,%r14 + xorq %rbx,%rdi + + movq %r12,40(%rsp) + xorq %rdx,%r14 + andq %r11,%rdi + + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + + rorq $6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + + xorq %r8,%rdi + rorq $14,%r13 + movq %r8,%rcx + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rcx + addq %r12,%r10 + addq %r12,%rcx + + leaq 24(%rbp),%rbp + movq 56(%rsp),%r13 + movq 32(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rcx + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 120(%rsp),%r12 + + addq 48(%rsp),%r12 + movq %r10,%r13 + addq %r15,%r12 + movq %rcx,%r14 + rorq $23,%r13 + movq %r11,%r15 + + xorq %r10,%r13 + rorq $5,%r14 + xorq %rax,%r15 + + movq %r12,48(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + + xorq %rdx,%r15 + rorq $14,%r13 + movq %rdx,%rbx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + + leaq 8(%rbp),%rbp + movq 64(%rsp),%r13 + movq 40(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rbx + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 
+ shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 0(%rsp),%r12 + + addq 56(%rsp),%r12 + movq %r9,%r13 + addq %rdi,%r12 + movq %rbx,%r14 + rorq $23,%r13 + movq %r10,%rdi + + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + + movq %r12,56(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + + movq %rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rax + addq %r12,%r8 + addq %r12,%rax + + leaq 24(%rbp),%rbp + movq 72(%rsp),%r13 + movq 48(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rax + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 8(%rsp),%r12 + + addq 64(%rsp),%r12 + movq %r8,%r13 + addq %r15,%r12 + movq %rax,%r14 + rorq $23,%r13 + movq %r9,%r15 + + xorq %r8,%r13 + rorq $5,%r14 + xorq %r10,%r15 + + movq %r12,64(%rsp) + xorq %rax,%r14 + andq %r8,%r15 + + rorq $4,%r13 + addq %r11,%r12 + xorq %r10,%r15 + + rorq $6,%r14 + xorq %r8,%r13 + addq %r15,%r12 + + movq %rax,%r15 + addq (%rbp),%r12 + xorq %rax,%r14 + + xorq %rbx,%r15 + rorq $14,%r13 + movq %rbx,%r11 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r11 + addq %r12,%rdx + addq %r12,%r11 + + leaq 8(%rbp),%rbp + movq 80(%rsp),%r13 + movq 56(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r11 + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 16(%rsp),%r12 + + addq 72(%rsp),%r12 + movq %rdx,%r13 + addq %rdi,%r12 + movq %r11,%r14 + rorq $23,%r13 + movq %r8,%rdi + + xorq %rdx,%r13 + rorq $5,%r14 + xorq %r9,%rdi + + movq %r12,72(%rsp) + xorq %r11,%r14 + andq %rdx,%rdi 
+ + rorq $4,%r13 + addq %r10,%r12 + xorq %r9,%rdi + + rorq $6,%r14 + xorq %rdx,%r13 + addq %rdi,%r12 + + movq %r11,%rdi + addq (%rbp),%r12 + xorq %r11,%r14 + + xorq %rax,%rdi + rorq $14,%r13 + movq %rax,%r10 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r10 + addq %r12,%rcx + addq %r12,%r10 + + leaq 24(%rbp),%rbp + movq 88(%rsp),%r13 + movq 64(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r10 + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 24(%rsp),%r12 + + addq 80(%rsp),%r12 + movq %rcx,%r13 + addq %r15,%r12 + movq %r10,%r14 + rorq $23,%r13 + movq %rdx,%r15 + + xorq %rcx,%r13 + rorq $5,%r14 + xorq %r8,%r15 + + movq %r12,80(%rsp) + xorq %r10,%r14 + andq %rcx,%r15 + + rorq $4,%r13 + addq %r9,%r12 + xorq %r8,%r15 + + rorq $6,%r14 + xorq %rcx,%r13 + addq %r15,%r12 + + movq %r10,%r15 + addq (%rbp),%r12 + xorq %r10,%r14 + + xorq %r11,%r15 + rorq $14,%r13 + movq %r11,%r9 + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%r9 + addq %r12,%rbx + addq %r12,%r9 + + leaq 8(%rbp),%rbp + movq 96(%rsp),%r13 + movq 72(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r9 + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 32(%rsp),%r12 + + addq 88(%rsp),%r12 + movq %rbx,%r13 + addq %rdi,%r12 + movq %r9,%r14 + rorq $23,%r13 + movq %rcx,%rdi + + xorq %rbx,%r13 + rorq $5,%r14 + xorq %rdx,%rdi + + movq %r12,88(%rsp) + xorq %r9,%r14 + andq %rbx,%rdi + + rorq $4,%r13 + addq %r8,%r12 + xorq %rdx,%rdi + + rorq $6,%r14 + xorq %rbx,%r13 + addq %rdi,%r12 + + movq %r9,%rdi + addq (%rbp),%r12 + xorq %r9,%r14 + + xorq %r10,%rdi + rorq $14,%r13 + movq %r10,%r8 + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%r8 + addq %r12,%rax + addq %r12,%r8 + + leaq 24(%rbp),%rbp + movq 
104(%rsp),%r13 + movq 80(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%r8 + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 40(%rsp),%r12 + + addq 96(%rsp),%r12 + movq %rax,%r13 + addq %r15,%r12 + movq %r8,%r14 + rorq $23,%r13 + movq %rbx,%r15 + + xorq %rax,%r13 + rorq $5,%r14 + xorq %rcx,%r15 + + movq %r12,96(%rsp) + xorq %r8,%r14 + andq %rax,%r15 + + rorq $4,%r13 + addq %rdx,%r12 + xorq %rcx,%r15 + + rorq $6,%r14 + xorq %rax,%r13 + addq %r15,%r12 + + movq %r8,%r15 + addq (%rbp),%r12 + xorq %r8,%r14 + + xorq %r9,%r15 + rorq $14,%r13 + movq %r9,%rdx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rdx + addq %r12,%r11 + addq %r12,%rdx + + leaq 8(%rbp),%rbp + movq 112(%rsp),%r13 + movq 88(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rdx + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 48(%rsp),%r12 + + addq 104(%rsp),%r12 + movq %r11,%r13 + addq %rdi,%r12 + movq %rdx,%r14 + rorq $23,%r13 + movq %rax,%rdi + + xorq %r11,%r13 + rorq $5,%r14 + xorq %rbx,%rdi + + movq %r12,104(%rsp) + xorq %rdx,%r14 + andq %r11,%rdi + + rorq $4,%r13 + addq %rcx,%r12 + xorq %rbx,%rdi + + rorq $6,%r14 + xorq %r11,%r13 + addq %rdi,%r12 + + movq %rdx,%rdi + addq (%rbp),%r12 + xorq %rdx,%r14 + + xorq %r8,%rdi + rorq $14,%r13 + movq %r8,%rcx + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rcx + addq %r12,%r10 + addq %r12,%rcx + + leaq 24(%rbp),%rbp + movq 120(%rsp),%r13 + movq 96(%rsp),%r15 + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rcx + movq %r15,%r14 + rorq $42,%r15 + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%r15 + shrq $6,%r14 + + rorq $19,%r15 + xorq %r13,%r12 + xorq %r14,%r15 + addq 56(%rsp),%r12 + + addq 112(%rsp),%r12 + movq %r10,%r13 + addq %r15,%r12 + movq 
%rcx,%r14 + rorq $23,%r13 + movq %r11,%r15 + + xorq %r10,%r13 + rorq $5,%r14 + xorq %rax,%r15 + + movq %r12,112(%rsp) + xorq %rcx,%r14 + andq %r10,%r15 + + rorq $4,%r13 + addq %rbx,%r12 + xorq %rax,%r15 + + rorq $6,%r14 + xorq %r10,%r13 + addq %r15,%r12 + + movq %rcx,%r15 + addq (%rbp),%r12 + xorq %rcx,%r14 + + xorq %rdx,%r15 + rorq $14,%r13 + movq %rdx,%rbx + + andq %r15,%rdi + rorq $28,%r14 + addq %r13,%r12 + + xorq %rdi,%rbx + addq %r12,%r9 + addq %r12,%rbx + + leaq 8(%rbp),%rbp + movq 0(%rsp),%r13 + movq 104(%rsp),%rdi + + movq %r13,%r12 + rorq $7,%r13 + addq %r14,%rbx + movq %rdi,%r14 + rorq $42,%rdi + + xorq %r12,%r13 + shrq $7,%r12 + rorq $1,%r13 + xorq %r14,%rdi + shrq $6,%r14 + + rorq $19,%rdi + xorq %r13,%r12 + xorq %r14,%rdi + addq 64(%rsp),%r12 + + addq 120(%rsp),%r12 + movq %r9,%r13 + addq %rdi,%r12 + movq %rbx,%r14 + rorq $23,%r13 + movq %r10,%rdi + + xorq %r9,%r13 + rorq $5,%r14 + xorq %r11,%rdi + + movq %r12,120(%rsp) + xorq %rbx,%r14 + andq %r9,%rdi + + rorq $4,%r13 + addq %rax,%r12 + xorq %r11,%rdi + + rorq $6,%r14 + xorq %r9,%r13 + addq %rdi,%r12 + + movq %rbx,%rdi + addq (%rbp),%r12 + xorq %rbx,%r14 + + xorq %rcx,%rdi + rorq $14,%r13 + movq %rcx,%rax + + andq %rdi,%r15 + rorq $28,%r14 + addq %r13,%r12 + + xorq %r15,%rax + addq %r12,%r8 + addq %r12,%rax + + leaq 24(%rbp),%rbp + cmpb $0,7(%rbp) + jnz .Lrounds_16_xx + + movq 128+0(%rsp),%rdi + addq %r14,%rax + leaq 128(%rsi),%rsi + + addq 0(%rdi),%rax + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + + cmpq 128+16(%rsp),%rsi + + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + jb .Lloop + + movq 152(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 
+ movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue: + ret +.cfi_endproc +.size sha512_block_data_order_nohw,.-sha512_block_data_order_nohw +.section .rodata +.align 64 +.type K512,@object +K512: +.quad 0x428a2f98d728ae22,0x7137449123ef65cd +.quad 0x428a2f98d728ae22,0x7137449123ef65cd +.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc +.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc +.quad 0x3956c25bf348b538,0x59f111f1b605d019 +.quad 0x3956c25bf348b538,0x59f111f1b605d019 +.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 +.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 +.quad 0xd807aa98a3030242,0x12835b0145706fbe +.quad 0xd807aa98a3030242,0x12835b0145706fbe +.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 +.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 +.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 +.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 +.quad 0x9bdc06a725c71235,0xc19bf174cf692694 +.quad 0x9bdc06a725c71235,0xc19bf174cf692694 +.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 +.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 +.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 +.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 +.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 +.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 +.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 +.quad 0x983e5152ee66dfab,0xa831c66d2db43210 +.quad 0x983e5152ee66dfab,0xa831c66d2db43210 +.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 +.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 +.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 +.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 +.quad 0x06ca6351e003826f,0x142929670a0e6e70 +.quad 0x06ca6351e003826f,0x142929670a0e6e70 +.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 +.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 +.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df +.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df +.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 +.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 
+.quad 0x81c2c92e47edaee6,0x92722c851482353b +.quad 0x81c2c92e47edaee6,0x92722c851482353b +.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 +.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 +.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 +.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 +.quad 0xd192e819d6ef5218,0xd69906245565a910 +.quad 0xd192e819d6ef5218,0xd69906245565a910 +.quad 0xf40e35855771202a,0x106aa07032bbd1b8 +.quad 0xf40e35855771202a,0x106aa07032bbd1b8 +.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 +.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 +.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 +.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb +.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb +.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 +.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 +.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 +.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 +.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec +.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec +.quad 0x90befffa23631e28,0xa4506cebde82bde9 +.quad 0x90befffa23631e28,0xa4506cebde82bde9 +.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b +.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b +.quad 0xca273eceea26619c,0xd186b8c721c0c207 +.quad 0xca273eceea26619c,0xd186b8c721c0c207 +.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 +.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 +.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 +.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 +.quad 0x113f9804bef90dae,0x1b710b35131c471b +.quad 0x113f9804bef90dae,0x1b710b35131c471b +.quad 0x28db77f523047d84,0x32caab7b40c72493 +.quad 0x28db77f523047d84,0x32caab7b40c72493 +.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c +.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c +.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a +.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a +.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 +.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + +.quad 0x0001020304050607,0x08090a0b0c0d0e0f +.quad 0x0001020304050607,0x08090a0b0c0d0e0f +.byte 
83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.text +.globl sha512_block_data_order_avx +.hidden sha512_block_data_order_avx +.type sha512_block_data_order_avx,@function +.align 64 +sha512_block_data_order_avx: +.cfi_startproc +_CET_ENDBR + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + shlq $4,%rdx + subq $160,%rsp + leaq (%rsi,%rdx,8),%rdx + andq $-64,%rsp + movq %rdi,128+0(%rsp) + movq %rsi,128+8(%rsp) + movq %rdx,128+16(%rsp) + movq %rax,152(%rsp) +.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08 +.Lprologue_avx: + + vzeroupper + movq 0(%rdi),%rax + movq 8(%rdi),%rbx + movq 16(%rdi),%rcx + movq 24(%rdi),%rdx + movq 32(%rdi),%r8 + movq 40(%rdi),%r9 + movq 48(%rdi),%r10 + movq 56(%rdi),%r11 + jmp .Lloop_avx +.align 16 +.Lloop_avx: + vmovdqa K512+1280(%rip),%xmm11 + vmovdqu 0(%rsi),%xmm0 + leaq K512+128(%rip),%rbp + vmovdqu 16(%rsi),%xmm1 + vmovdqu 32(%rsi),%xmm2 + vpshufb %xmm11,%xmm0,%xmm0 + vmovdqu 48(%rsi),%xmm3 + vpshufb %xmm11,%xmm1,%xmm1 + vmovdqu 64(%rsi),%xmm4 + vpshufb %xmm11,%xmm2,%xmm2 + vmovdqu 80(%rsi),%xmm5 + vpshufb %xmm11,%xmm3,%xmm3 + vmovdqu 96(%rsi),%xmm6 + vpshufb %xmm11,%xmm4,%xmm4 + vmovdqu 112(%rsi),%xmm7 + vpshufb %xmm11,%xmm5,%xmm5 + vpaddq -128(%rbp),%xmm0,%xmm8 + vpshufb %xmm11,%xmm6,%xmm6 + vpaddq -96(%rbp),%xmm1,%xmm9 + vpshufb %xmm11,%xmm7,%xmm7 + vpaddq -64(%rbp),%xmm2,%xmm10 + vpaddq -32(%rbp),%xmm3,%xmm11 + vmovdqa %xmm8,0(%rsp) + vpaddq 0(%rbp),%xmm4,%xmm8 + vmovdqa %xmm9,16(%rsp) + vpaddq 32(%rbp),%xmm5,%xmm9 + vmovdqa %xmm10,32(%rsp) + vpaddq 64(%rbp),%xmm6,%xmm10 + vmovdqa %xmm11,48(%rsp) + vpaddq 96(%rbp),%xmm7,%xmm11 + vmovdqa 
%xmm8,64(%rsp) + movq %rax,%r14 + vmovdqa %xmm9,80(%rsp) + movq %rbx,%rdi + vmovdqa %xmm10,96(%rsp) + xorq %rcx,%rdi + vmovdqa %xmm11,112(%rsp) + movq %r8,%r13 + jmp .Lavx_00_47 + +.align 16 +.Lavx_00_47: + addq $256,%rbp + vpalignr $8,%xmm0,%xmm1,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rax + vpalignr $8,%xmm4,%xmm5,%xmm11 + movq %r9,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r8,%r13 + xorq %r10,%r12 + vpaddq %xmm11,%xmm0,%xmm0 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r8,%r12 + xorq %r8,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 0(%rsp),%r11 + movq %rax,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rbx,%r15 + addq %r12,%r11 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm7,%xmm11 + addq %r11,%rdx + addq %rdi,%r11 + vpxor %xmm9,%xmm8,%xmm8 + movq %rdx,%r13 + addq %r11,%r14 + vpsllq $3,%xmm7,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r11 + vpaddq %xmm8,%xmm0,%xmm0 + movq %r8,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm7,%xmm9 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rdx,%r12 + xorq %rdx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 8(%rsp),%r10 + movq %r11,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r9,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rax,%rdi + addq %r12,%r10 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm0,%xmm0 + xorq %r11,%r14 + addq %r13,%r10 + vpaddq -128(%rbp),%xmm0,%xmm10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,0(%rsp) + vpalignr $8,%xmm1,%xmm2,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r10 + vpalignr $8,%xmm5,%xmm6,%xmm11 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + vpsrlq 
$1,%xmm8,%xmm10 + xorq %rcx,%r13 + xorq %r8,%r12 + vpaddq %xmm11,%xmm1,%xmm1 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rcx,%r12 + xorq %rcx,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 16(%rsp),%r9 + movq %r10,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r11,%r15 + addq %r12,%r9 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm0,%xmm11 + addq %r9,%rbx + addq %rdi,%r9 + vpxor %xmm9,%xmm8,%xmm8 + movq %rbx,%r13 + addq %r9,%r14 + vpsllq $3,%xmm0,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r9 + vpaddq %xmm8,%xmm1,%xmm1 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm0,%xmm9 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rbx,%r12 + xorq %rbx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 24(%rsp),%r8 + movq %r9,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r10,%rdi + addq %r12,%r8 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm1,%xmm1 + xorq %r9,%r14 + addq %r13,%r8 + vpaddq -96(%rbp),%xmm1,%xmm10 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,16(%rsp) + vpalignr $8,%xmm2,%xmm3,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r8 + vpalignr $8,%xmm6,%xmm7,%xmm11 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rax,%r13 + xorq %rcx,%r12 + vpaddq %xmm11,%xmm2,%xmm2 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rax,%r12 + xorq %rax,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 32(%rsp),%rdx + movq %r8,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r9,%r15 + addq %r12,%rdx + vpxor %xmm9,%xmm8,%xmm8 + 
shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm1,%xmm11 + addq %rdx,%r11 + addq %rdi,%rdx + vpxor %xmm9,%xmm8,%xmm8 + movq %r11,%r13 + addq %rdx,%r14 + vpsllq $3,%xmm1,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rdx + vpaddq %xmm8,%xmm2,%xmm2 + movq %rax,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm1,%xmm9 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r11,%r12 + xorq %r11,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 40(%rsp),%rcx + movq %rdx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r8,%rdi + addq %r12,%rcx + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm2,%xmm2 + xorq %rdx,%r14 + addq %r13,%rcx + vpaddq -64(%rbp),%xmm2,%xmm10 + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,32(%rsp) + vpalignr $8,%xmm3,%xmm4,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rcx + vpalignr $8,%xmm7,%xmm0,%xmm11 + movq %r11,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r10,%r13 + xorq %rax,%r12 + vpaddq %xmm11,%xmm3,%xmm3 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r10,%r12 + xorq %r10,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 48(%rsp),%rbx + movq %rcx,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rdx,%r15 + addq %r12,%rbx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm2,%xmm11 + addq %rbx,%r9 + addq %rdi,%rbx + vpxor %xmm9,%xmm8,%xmm8 + movq %r9,%r13 + addq %rbx,%r14 + vpsllq $3,%xmm2,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rbx + vpaddq %xmm8,%xmm3,%xmm3 + movq 
%r10,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm2,%xmm9 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r9,%r12 + xorq %r9,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 56(%rsp),%rax + movq %rbx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r11,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rcx,%rdi + addq %r12,%rax + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm3,%xmm3 + xorq %rbx,%r14 + addq %r13,%rax + vpaddq -32(%rbp),%xmm3,%xmm10 + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,48(%rsp) + vpalignr $8,%xmm4,%xmm5,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rax + vpalignr $8,%xmm0,%xmm1,%xmm11 + movq %r9,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r8,%r13 + xorq %r10,%r12 + vpaddq %xmm11,%xmm4,%xmm4 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r8,%r12 + xorq %r8,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 64(%rsp),%r11 + movq %rax,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rbx,%r15 + addq %r12,%r11 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rax,%r14 + addq %r13,%r11 + vpxor %xmm10,%xmm8,%xmm8 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm3,%xmm11 + addq %r11,%rdx + addq %rdi,%r11 + vpxor %xmm9,%xmm8,%xmm8 + movq %rdx,%r13 + addq %r11,%r14 + vpsllq $3,%xmm3,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r11 + vpaddq %xmm8,%xmm4,%xmm4 + movq %r8,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm3,%xmm9 + xorq %rdx,%r13 + xorq %r9,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rdx,%r12 + xorq %rdx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 72(%rsp),%r10 + movq %r11,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r9,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq 
%rax,%rdi + addq %r12,%r10 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm4,%xmm4 + xorq %r11,%r14 + addq %r13,%r10 + vpaddq 0(%rbp),%xmm4,%xmm10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + vmovdqa %xmm10,64(%rsp) + vpalignr $8,%xmm5,%xmm6,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r10 + vpalignr $8,%xmm1,%xmm2,%xmm11 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rcx,%r13 + xorq %r8,%r12 + vpaddq %xmm11,%xmm5,%xmm5 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rcx,%r12 + xorq %rcx,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 80(%rsp),%r9 + movq %r10,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r11,%r15 + addq %r12,%r9 + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r10,%r14 + addq %r13,%r9 + vpxor %xmm10,%xmm8,%xmm8 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm4,%xmm11 + addq %r9,%rbx + addq %rdi,%r9 + vpxor %xmm9,%xmm8,%xmm8 + movq %rbx,%r13 + addq %r9,%r14 + vpsllq $3,%xmm4,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%r9 + vpaddq %xmm8,%xmm5,%xmm5 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm4,%xmm9 + xorq %rbx,%r13 + xorq %rdx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %rbx,%r12 + xorq %rbx,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 88(%rsp),%r8 + movq %r9,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r10,%rdi + addq %r12,%r8 + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm5,%xmm5 + xorq %r9,%r14 + addq %r13,%r8 + vpaddq 32(%rbp),%xmm5,%xmm10 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + vmovdqa %xmm10,80(%rsp) + vpalignr $8,%xmm6,%xmm7,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%r8 + 
vpalignr $8,%xmm2,%xmm3,%xmm11 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %rax,%r13 + xorq %rcx,%r12 + vpaddq %xmm11,%xmm6,%xmm6 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %rax,%r12 + xorq %rax,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 96(%rsp),%rdx + movq %r8,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %r9,%r15 + addq %r12,%rdx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %r8,%r14 + addq %r13,%rdx + vpxor %xmm10,%xmm8,%xmm8 + xorq %r9,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm5,%xmm11 + addq %rdx,%r11 + addq %rdi,%rdx + vpxor %xmm9,%xmm8,%xmm8 + movq %r11,%r13 + addq %rdx,%r14 + vpsllq $3,%xmm5,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rdx + vpaddq %xmm8,%xmm6,%xmm6 + movq %rax,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm5,%xmm9 + xorq %r11,%r13 + xorq %rbx,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r11,%r12 + xorq %r11,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 104(%rsp),%rcx + movq %rdx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %r8,%rdi + addq %r12,%rcx + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm6,%xmm6 + xorq %rdx,%r14 + addq %r13,%rcx + vpaddq 64(%rbp),%xmm6,%xmm10 + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + vmovdqa %xmm10,96(%rsp) + vpalignr $8,%xmm7,%xmm0,%xmm8 + shrdq $23,%r13,%r13 + movq %r14,%rcx + vpalignr $8,%xmm3,%xmm4,%xmm11 + movq %r11,%r12 + shrdq $5,%r14,%r14 + vpsrlq $1,%xmm8,%xmm10 + xorq %r10,%r13 + xorq %rax,%r12 + vpaddq %xmm11,%xmm7,%xmm7 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + vpsrlq $7,%xmm8,%xmm11 + andq %r10,%r12 + xorq %r10,%r13 + vpsllq $56,%xmm8,%xmm9 + addq 112(%rsp),%rbx + movq %rcx,%r15 + vpxor %xmm10,%xmm11,%xmm8 + xorq %rax,%r12 + shrdq 
$6,%r14,%r14 + vpsrlq $7,%xmm10,%xmm10 + xorq %rdx,%r15 + addq %r12,%rbx + vpxor %xmm9,%xmm8,%xmm8 + shrdq $14,%r13,%r13 + andq %r15,%rdi + vpsllq $7,%xmm9,%xmm9 + xorq %rcx,%r14 + addq %r13,%rbx + vpxor %xmm10,%xmm8,%xmm8 + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + vpsrlq $6,%xmm6,%xmm11 + addq %rbx,%r9 + addq %rdi,%rbx + vpxor %xmm9,%xmm8,%xmm8 + movq %r9,%r13 + addq %rbx,%r14 + vpsllq $3,%xmm6,%xmm10 + shrdq $23,%r13,%r13 + movq %r14,%rbx + vpaddq %xmm8,%xmm7,%xmm7 + movq %r10,%r12 + shrdq $5,%r14,%r14 + vpsrlq $19,%xmm6,%xmm9 + xorq %r9,%r13 + xorq %r11,%r12 + vpxor %xmm10,%xmm11,%xmm11 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + vpsllq $42,%xmm10,%xmm10 + andq %r9,%r12 + xorq %r9,%r13 + vpxor %xmm9,%xmm11,%xmm11 + addq 120(%rsp),%rax + movq %rbx,%rdi + vpsrlq $42,%xmm9,%xmm9 + xorq %r11,%r12 + shrdq $6,%r14,%r14 + vpxor %xmm10,%xmm11,%xmm11 + xorq %rcx,%rdi + addq %r12,%rax + vpxor %xmm9,%xmm11,%xmm11 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + vpaddq %xmm11,%xmm7,%xmm7 + xorq %rbx,%r14 + addq %r13,%rax + vpaddq 96(%rbp),%xmm7,%xmm10 + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + vmovdqa %xmm10,112(%rsp) + cmpb $0,135(%rbp) + jne .Lavx_00_47 + shrdq $23,%r13,%r13 + movq %r14,%rax + movq %r9,%r12 + shrdq $5,%r14,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 0(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r11 + movq %r8,%r12 + shrdq $5,%r14,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 8(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + shrdq $6,%r14,%r14 + xorq %rax,%rdi + addq %r12,%r10 + shrdq $14,%r13,%r13 + 
andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r10 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + xorq %rcx,%r13 + xorq %r8,%r12 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 16(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + xorq %r11,%r15 + addq %r12,%r9 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r9 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 24(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + xorq %r10,%rdi + addq %r12,%r8 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r8 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 32(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + xorq %r9,%r15 + addq %r12,%rdx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + shrdq $28,%r14,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rdx + movq %rax,%r12 + shrdq $5,%r14,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 40(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + xorq %r8,%rdi + addq %r12,%rcx + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq 
%r10,%r13 + addq %rcx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rcx + movq %r11,%r12 + shrdq $5,%r14,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 48(%rsp),%rbx + movq %rcx,%r15 + xorq %rax,%r12 + shrdq $6,%r14,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rbx + movq %r10,%r12 + shrdq $5,%r14,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 56(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + shrdq $6,%r14,%r14 + xorq %rcx,%rdi + addq %r12,%rax + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rax + movq %r9,%r12 + shrdq $5,%r14,%r14 + xorq %r8,%r13 + xorq %r10,%r12 + shrdq $4,%r13,%r13 + xorq %rax,%r14 + andq %r8,%r12 + xorq %r8,%r13 + addq 64(%rsp),%r11 + movq %rax,%r15 + xorq %r10,%r12 + shrdq $6,%r14,%r14 + xorq %rbx,%r15 + addq %r12,%r11 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rax,%r14 + addq %r13,%r11 + xorq %rbx,%rdi + shrdq $28,%r14,%r14 + addq %r11,%rdx + addq %rdi,%r11 + movq %rdx,%r13 + addq %r11,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r11 + movq %r8,%r12 + shrdq $5,%r14,%r14 + xorq %rdx,%r13 + xorq %r9,%r12 + shrdq $4,%r13,%r13 + xorq %r11,%r14 + andq %rdx,%r12 + xorq %rdx,%r13 + addq 72(%rsp),%r10 + movq %r11,%rdi + xorq %r9,%r12 + shrdq $6,%r14,%r14 + xorq %rax,%rdi + addq %r12,%r10 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r11,%r14 + addq %r13,%r10 + xorq %rax,%r15 + shrdq $28,%r14,%r14 + addq %r10,%rcx + addq %r15,%r10 + movq %rcx,%r13 + addq %r10,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r10 + movq %rdx,%r12 + shrdq $5,%r14,%r14 + xorq %rcx,%r13 
+ xorq %r8,%r12 + shrdq $4,%r13,%r13 + xorq %r10,%r14 + andq %rcx,%r12 + xorq %rcx,%r13 + addq 80(%rsp),%r9 + movq %r10,%r15 + xorq %r8,%r12 + shrdq $6,%r14,%r14 + xorq %r11,%r15 + addq %r12,%r9 + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r10,%r14 + addq %r13,%r9 + xorq %r11,%rdi + shrdq $28,%r14,%r14 + addq %r9,%rbx + addq %rdi,%r9 + movq %rbx,%r13 + addq %r9,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r9 + movq %rcx,%r12 + shrdq $5,%r14,%r14 + xorq %rbx,%r13 + xorq %rdx,%r12 + shrdq $4,%r13,%r13 + xorq %r9,%r14 + andq %rbx,%r12 + xorq %rbx,%r13 + addq 88(%rsp),%r8 + movq %r9,%rdi + xorq %rdx,%r12 + shrdq $6,%r14,%r14 + xorq %r10,%rdi + addq %r12,%r8 + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %r9,%r14 + addq %r13,%r8 + xorq %r10,%r15 + shrdq $28,%r14,%r14 + addq %r8,%rax + addq %r15,%r8 + movq %rax,%r13 + addq %r8,%r14 + shrdq $23,%r13,%r13 + movq %r14,%r8 + movq %rbx,%r12 + shrdq $5,%r14,%r14 + xorq %rax,%r13 + xorq %rcx,%r12 + shrdq $4,%r13,%r13 + xorq %r8,%r14 + andq %rax,%r12 + xorq %rax,%r13 + addq 96(%rsp),%rdx + movq %r8,%r15 + xorq %rcx,%r12 + shrdq $6,%r14,%r14 + xorq %r9,%r15 + addq %r12,%rdx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %r8,%r14 + addq %r13,%rdx + xorq %r9,%rdi + shrdq $28,%r14,%r14 + addq %rdx,%r11 + addq %rdi,%rdx + movq %r11,%r13 + addq %rdx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rdx + movq %rax,%r12 + shrdq $5,%r14,%r14 + xorq %r11,%r13 + xorq %rbx,%r12 + shrdq $4,%r13,%r13 + xorq %rdx,%r14 + andq %r11,%r12 + xorq %r11,%r13 + addq 104(%rsp),%rcx + movq %rdx,%rdi + xorq %rbx,%r12 + shrdq $6,%r14,%r14 + xorq %r8,%rdi + addq %r12,%rcx + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rdx,%r14 + addq %r13,%rcx + xorq %r8,%r15 + shrdq $28,%r14,%r14 + addq %rcx,%r10 + addq %r15,%rcx + movq %r10,%r13 + addq %rcx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rcx + movq %r11,%r12 + shrdq $5,%r14,%r14 + xorq %r10,%r13 + xorq %rax,%r12 + shrdq $4,%r13,%r13 + xorq %rcx,%r14 + andq %r10,%r12 + xorq %r10,%r13 + addq 112(%rsp),%rbx + movq %rcx,%r15 + 
xorq %rax,%r12 + shrdq $6,%r14,%r14 + xorq %rdx,%r15 + addq %r12,%rbx + shrdq $14,%r13,%r13 + andq %r15,%rdi + xorq %rcx,%r14 + addq %r13,%rbx + xorq %rdx,%rdi + shrdq $28,%r14,%r14 + addq %rbx,%r9 + addq %rdi,%rbx + movq %r9,%r13 + addq %rbx,%r14 + shrdq $23,%r13,%r13 + movq %r14,%rbx + movq %r10,%r12 + shrdq $5,%r14,%r14 + xorq %r9,%r13 + xorq %r11,%r12 + shrdq $4,%r13,%r13 + xorq %rbx,%r14 + andq %r9,%r12 + xorq %r9,%r13 + addq 120(%rsp),%rax + movq %rbx,%rdi + xorq %r11,%r12 + shrdq $6,%r14,%r14 + xorq %rcx,%rdi + addq %r12,%rax + shrdq $14,%r13,%r13 + andq %rdi,%r15 + xorq %rbx,%r14 + addq %r13,%rax + xorq %rcx,%r15 + shrdq $28,%r14,%r14 + addq %rax,%r8 + addq %r15,%rax + movq %r8,%r13 + addq %rax,%r14 + movq 128+0(%rsp),%rdi + movq %r14,%rax + + addq 0(%rdi),%rax + leaq 128(%rsi),%rsi + addq 8(%rdi),%rbx + addq 16(%rdi),%rcx + addq 24(%rdi),%rdx + addq 32(%rdi),%r8 + addq 40(%rdi),%r9 + addq 48(%rdi),%r10 + addq 56(%rdi),%r11 + + cmpq 128+16(%rsp),%rsi + + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + movq %rcx,16(%rdi) + movq %rdx,24(%rdi) + movq %r8,32(%rdi) + movq %r9,40(%rdi) + movq %r10,48(%rdi) + movq %r11,56(%rdi) + jb .Lloop_avx + + movq 152(%rsp),%rsi +.cfi_def_cfa %rsi,8 + vzeroupper + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lepilogue_avx: + ret +.cfi_endproc +.size sha512_block_data_order_avx,.-sha512_block_data_order_avx +#endif diff --git a/third_party/boringssl/gen/bcm/sha512-x86_64-win.asm b/third_party/boringssl/gen/bcm/sha512-x86_64-win.asm new file mode 100644 index 00000000..53cabe62 --- /dev/null +++ b/third_party/boringssl/gen/bcm/sha512-x86_64-win.asm @@ -0,0 +1,3140 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_internal_x86_64_win_asm.inc" +%endif +section .text code align=64 + + +global sha512_block_data_order_nohw + +ALIGN 16 +sha512_block_data_order_nohw: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sha512_block_data_order_nohw: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + mov rax,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + shl rdx,4 + sub rsp,16*8+4*8 + lea rdx,[rdx*8+rsi] + and rsp,-64 + mov QWORD[((128+0))+rsp],rdi + mov QWORD[((128+8))+rsp],rsi + mov QWORD[((128+16))+rsp],rdx + mov QWORD[152+rsp],rax + +$L$prologue: + + mov rax,QWORD[rdi] + mov rbx,QWORD[8+rdi] + mov rcx,QWORD[16+rdi] + mov rdx,QWORD[24+rdi] + mov r8,QWORD[32+rdi] + mov r9,QWORD[40+rdi] + mov r10,QWORD[48+rdi] + mov r11,QWORD[56+rdi] + jmp NEAR $L$loop + +ALIGN 16 +$L$loop: + mov rdi,rbx + lea rbp,[K512] + xor rdi,rcx + mov r12,QWORD[rsi] + mov r13,r8 + mov r14,rax + bswap r12 + ror r13,23 + mov r15,r9 + + xor r13,r8 + ror r14,5 + xor r15,r10 + + mov QWORD[rsp],r12 + xor r14,rax + and r15,r8 + + ror r13,4 + add r12,r11 + xor r15,r10 + + ror r14,6 + xor r13,r8 + add r12,r15 + + mov r15,rax + add r12,QWORD[rbp] + xor r14,rax + + xor r15,rbx + ror r13,14 + mov r11,rbx + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor r11,rdi + add rdx,r12 + add r11,r12 + + lea rbp,[8+rbp] + add r11,r14 + mov r12,QWORD[8+rsi] + mov r13,rdx + mov r14,r11 + bswap r12 + ror r13,23 + mov rdi,r8 + + xor r13,rdx + ror r14,5 + xor rdi,r9 + + mov QWORD[8+rsp],r12 + xor r14,r11 + and rdi,rdx + + ror r13,4 + add r12,r10 + xor rdi,r9 + + ror r14,6 + xor r13,rdx + add r12,rdi + + mov rdi,r11 + add r12,QWORD[rbp] + xor r14,r11 + + xor rdi,rax + ror r13,14 + mov r10,rax + + and r15,rdi + ror r14,28 + add r12,r13 + + xor r10,r15 + add rcx,r12 + add 
r10,r12 + + lea rbp,[24+rbp] + add r10,r14 + mov r12,QWORD[16+rsi] + mov r13,rcx + mov r14,r10 + bswap r12 + ror r13,23 + mov r15,rdx + + xor r13,rcx + ror r14,5 + xor r15,r8 + + mov QWORD[16+rsp],r12 + xor r14,r10 + and r15,rcx + + ror r13,4 + add r12,r9 + xor r15,r8 + + ror r14,6 + xor r13,rcx + add r12,r15 + + mov r15,r10 + add r12,QWORD[rbp] + xor r14,r10 + + xor r15,r11 + ror r13,14 + mov r9,r11 + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor r9,rdi + add rbx,r12 + add r9,r12 + + lea rbp,[8+rbp] + add r9,r14 + mov r12,QWORD[24+rsi] + mov r13,rbx + mov r14,r9 + bswap r12 + ror r13,23 + mov rdi,rcx + + xor r13,rbx + ror r14,5 + xor rdi,rdx + + mov QWORD[24+rsp],r12 + xor r14,r9 + and rdi,rbx + + ror r13,4 + add r12,r8 + xor rdi,rdx + + ror r14,6 + xor r13,rbx + add r12,rdi + + mov rdi,r9 + add r12,QWORD[rbp] + xor r14,r9 + + xor rdi,r10 + ror r13,14 + mov r8,r10 + + and r15,rdi + ror r14,28 + add r12,r13 + + xor r8,r15 + add rax,r12 + add r8,r12 + + lea rbp,[24+rbp] + add r8,r14 + mov r12,QWORD[32+rsi] + mov r13,rax + mov r14,r8 + bswap r12 + ror r13,23 + mov r15,rbx + + xor r13,rax + ror r14,5 + xor r15,rcx + + mov QWORD[32+rsp],r12 + xor r14,r8 + and r15,rax + + ror r13,4 + add r12,rdx + xor r15,rcx + + ror r14,6 + xor r13,rax + add r12,r15 + + mov r15,r8 + add r12,QWORD[rbp] + xor r14,r8 + + xor r15,r9 + ror r13,14 + mov rdx,r9 + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor rdx,rdi + add r11,r12 + add rdx,r12 + + lea rbp,[8+rbp] + add rdx,r14 + mov r12,QWORD[40+rsi] + mov r13,r11 + mov r14,rdx + bswap r12 + ror r13,23 + mov rdi,rax + + xor r13,r11 + ror r14,5 + xor rdi,rbx + + mov QWORD[40+rsp],r12 + xor r14,rdx + and rdi,r11 + + ror r13,4 + add r12,rcx + xor rdi,rbx + + ror r14,6 + xor r13,r11 + add r12,rdi + + mov rdi,rdx + add r12,QWORD[rbp] + xor r14,rdx + + xor rdi,r8 + ror r13,14 + mov rcx,r8 + + and r15,rdi + ror r14,28 + add r12,r13 + + xor rcx,r15 + add r10,r12 + add rcx,r12 + + lea rbp,[24+rbp] + add rcx,r14 + mov r12,QWORD[48+rsi] + mov 
r13,r10 + mov r14,rcx + bswap r12 + ror r13,23 + mov r15,r11 + + xor r13,r10 + ror r14,5 + xor r15,rax + + mov QWORD[48+rsp],r12 + xor r14,rcx + and r15,r10 + + ror r13,4 + add r12,rbx + xor r15,rax + + ror r14,6 + xor r13,r10 + add r12,r15 + + mov r15,rcx + add r12,QWORD[rbp] + xor r14,rcx + + xor r15,rdx + ror r13,14 + mov rbx,rdx + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor rbx,rdi + add r9,r12 + add rbx,r12 + + lea rbp,[8+rbp] + add rbx,r14 + mov r12,QWORD[56+rsi] + mov r13,r9 + mov r14,rbx + bswap r12 + ror r13,23 + mov rdi,r10 + + xor r13,r9 + ror r14,5 + xor rdi,r11 + + mov QWORD[56+rsp],r12 + xor r14,rbx + and rdi,r9 + + ror r13,4 + add r12,rax + xor rdi,r11 + + ror r14,6 + xor r13,r9 + add r12,rdi + + mov rdi,rbx + add r12,QWORD[rbp] + xor r14,rbx + + xor rdi,rcx + ror r13,14 + mov rax,rcx + + and r15,rdi + ror r14,28 + add r12,r13 + + xor rax,r15 + add r8,r12 + add rax,r12 + + lea rbp,[24+rbp] + add rax,r14 + mov r12,QWORD[64+rsi] + mov r13,r8 + mov r14,rax + bswap r12 + ror r13,23 + mov r15,r9 + + xor r13,r8 + ror r14,5 + xor r15,r10 + + mov QWORD[64+rsp],r12 + xor r14,rax + and r15,r8 + + ror r13,4 + add r12,r11 + xor r15,r10 + + ror r14,6 + xor r13,r8 + add r12,r15 + + mov r15,rax + add r12,QWORD[rbp] + xor r14,rax + + xor r15,rbx + ror r13,14 + mov r11,rbx + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor r11,rdi + add rdx,r12 + add r11,r12 + + lea rbp,[8+rbp] + add r11,r14 + mov r12,QWORD[72+rsi] + mov r13,rdx + mov r14,r11 + bswap r12 + ror r13,23 + mov rdi,r8 + + xor r13,rdx + ror r14,5 + xor rdi,r9 + + mov QWORD[72+rsp],r12 + xor r14,r11 + and rdi,rdx + + ror r13,4 + add r12,r10 + xor rdi,r9 + + ror r14,6 + xor r13,rdx + add r12,rdi + + mov rdi,r11 + add r12,QWORD[rbp] + xor r14,r11 + + xor rdi,rax + ror r13,14 + mov r10,rax + + and r15,rdi + ror r14,28 + add r12,r13 + + xor r10,r15 + add rcx,r12 + add r10,r12 + + lea rbp,[24+rbp] + add r10,r14 + mov r12,QWORD[80+rsi] + mov r13,rcx + mov r14,r10 + bswap r12 + ror r13,23 + mov r15,rdx + + 
xor r13,rcx + ror r14,5 + xor r15,r8 + + mov QWORD[80+rsp],r12 + xor r14,r10 + and r15,rcx + + ror r13,4 + add r12,r9 + xor r15,r8 + + ror r14,6 + xor r13,rcx + add r12,r15 + + mov r15,r10 + add r12,QWORD[rbp] + xor r14,r10 + + xor r15,r11 + ror r13,14 + mov r9,r11 + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor r9,rdi + add rbx,r12 + add r9,r12 + + lea rbp,[8+rbp] + add r9,r14 + mov r12,QWORD[88+rsi] + mov r13,rbx + mov r14,r9 + bswap r12 + ror r13,23 + mov rdi,rcx + + xor r13,rbx + ror r14,5 + xor rdi,rdx + + mov QWORD[88+rsp],r12 + xor r14,r9 + and rdi,rbx + + ror r13,4 + add r12,r8 + xor rdi,rdx + + ror r14,6 + xor r13,rbx + add r12,rdi + + mov rdi,r9 + add r12,QWORD[rbp] + xor r14,r9 + + xor rdi,r10 + ror r13,14 + mov r8,r10 + + and r15,rdi + ror r14,28 + add r12,r13 + + xor r8,r15 + add rax,r12 + add r8,r12 + + lea rbp,[24+rbp] + add r8,r14 + mov r12,QWORD[96+rsi] + mov r13,rax + mov r14,r8 + bswap r12 + ror r13,23 + mov r15,rbx + + xor r13,rax + ror r14,5 + xor r15,rcx + + mov QWORD[96+rsp],r12 + xor r14,r8 + and r15,rax + + ror r13,4 + add r12,rdx + xor r15,rcx + + ror r14,6 + xor r13,rax + add r12,r15 + + mov r15,r8 + add r12,QWORD[rbp] + xor r14,r8 + + xor r15,r9 + ror r13,14 + mov rdx,r9 + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor rdx,rdi + add r11,r12 + add rdx,r12 + + lea rbp,[8+rbp] + add rdx,r14 + mov r12,QWORD[104+rsi] + mov r13,r11 + mov r14,rdx + bswap r12 + ror r13,23 + mov rdi,rax + + xor r13,r11 + ror r14,5 + xor rdi,rbx + + mov QWORD[104+rsp],r12 + xor r14,rdx + and rdi,r11 + + ror r13,4 + add r12,rcx + xor rdi,rbx + + ror r14,6 + xor r13,r11 + add r12,rdi + + mov rdi,rdx + add r12,QWORD[rbp] + xor r14,rdx + + xor rdi,r8 + ror r13,14 + mov rcx,r8 + + and r15,rdi + ror r14,28 + add r12,r13 + + xor rcx,r15 + add r10,r12 + add rcx,r12 + + lea rbp,[24+rbp] + add rcx,r14 + mov r12,QWORD[112+rsi] + mov r13,r10 + mov r14,rcx + bswap r12 + ror r13,23 + mov r15,r11 + + xor r13,r10 + ror r14,5 + xor r15,rax + + mov QWORD[112+rsp],r12 + xor 
r14,rcx + and r15,r10 + + ror r13,4 + add r12,rbx + xor r15,rax + + ror r14,6 + xor r13,r10 + add r12,r15 + + mov r15,rcx + add r12,QWORD[rbp] + xor r14,rcx + + xor r15,rdx + ror r13,14 + mov rbx,rdx + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor rbx,rdi + add r9,r12 + add rbx,r12 + + lea rbp,[8+rbp] + add rbx,r14 + mov r12,QWORD[120+rsi] + mov r13,r9 + mov r14,rbx + bswap r12 + ror r13,23 + mov rdi,r10 + + xor r13,r9 + ror r14,5 + xor rdi,r11 + + mov QWORD[120+rsp],r12 + xor r14,rbx + and rdi,r9 + + ror r13,4 + add r12,rax + xor rdi,r11 + + ror r14,6 + xor r13,r9 + add r12,rdi + + mov rdi,rbx + add r12,QWORD[rbp] + xor r14,rbx + + xor rdi,rcx + ror r13,14 + mov rax,rcx + + and r15,rdi + ror r14,28 + add r12,r13 + + xor rax,r15 + add r8,r12 + add rax,r12 + + lea rbp,[24+rbp] + jmp NEAR $L$rounds_16_xx +ALIGN 16 +$L$rounds_16_xx: + mov r13,QWORD[8+rsp] + mov r15,QWORD[112+rsp] + + mov r12,r13 + ror r13,7 + add rax,r14 + mov r14,r15 + ror r15,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor r15,r14 + shr r14,6 + + ror r15,19 + xor r12,r13 + xor r15,r14 + add r12,QWORD[72+rsp] + + add r12,QWORD[rsp] + mov r13,r8 + add r12,r15 + mov r14,rax + ror r13,23 + mov r15,r9 + + xor r13,r8 + ror r14,5 + xor r15,r10 + + mov QWORD[rsp],r12 + xor r14,rax + and r15,r8 + + ror r13,4 + add r12,r11 + xor r15,r10 + + ror r14,6 + xor r13,r8 + add r12,r15 + + mov r15,rax + add r12,QWORD[rbp] + xor r14,rax + + xor r15,rbx + ror r13,14 + mov r11,rbx + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor r11,rdi + add rdx,r12 + add r11,r12 + + lea rbp,[8+rbp] + mov r13,QWORD[16+rsp] + mov rdi,QWORD[120+rsp] + + mov r12,r13 + ror r13,7 + add r11,r14 + mov r14,rdi + ror rdi,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor rdi,r14 + shr r14,6 + + ror rdi,19 + xor r12,r13 + xor rdi,r14 + add r12,QWORD[80+rsp] + + add r12,QWORD[8+rsp] + mov r13,rdx + add r12,rdi + mov r14,r11 + ror r13,23 + mov rdi,r8 + + xor r13,rdx + ror r14,5 + xor rdi,r9 + + mov QWORD[8+rsp],r12 + xor r14,r11 + and rdi,rdx + 
+ ror r13,4 + add r12,r10 + xor rdi,r9 + + ror r14,6 + xor r13,rdx + add r12,rdi + + mov rdi,r11 + add r12,QWORD[rbp] + xor r14,r11 + + xor rdi,rax + ror r13,14 + mov r10,rax + + and r15,rdi + ror r14,28 + add r12,r13 + + xor r10,r15 + add rcx,r12 + add r10,r12 + + lea rbp,[24+rbp] + mov r13,QWORD[24+rsp] + mov r15,QWORD[rsp] + + mov r12,r13 + ror r13,7 + add r10,r14 + mov r14,r15 + ror r15,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor r15,r14 + shr r14,6 + + ror r15,19 + xor r12,r13 + xor r15,r14 + add r12,QWORD[88+rsp] + + add r12,QWORD[16+rsp] + mov r13,rcx + add r12,r15 + mov r14,r10 + ror r13,23 + mov r15,rdx + + xor r13,rcx + ror r14,5 + xor r15,r8 + + mov QWORD[16+rsp],r12 + xor r14,r10 + and r15,rcx + + ror r13,4 + add r12,r9 + xor r15,r8 + + ror r14,6 + xor r13,rcx + add r12,r15 + + mov r15,r10 + add r12,QWORD[rbp] + xor r14,r10 + + xor r15,r11 + ror r13,14 + mov r9,r11 + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor r9,rdi + add rbx,r12 + add r9,r12 + + lea rbp,[8+rbp] + mov r13,QWORD[32+rsp] + mov rdi,QWORD[8+rsp] + + mov r12,r13 + ror r13,7 + add r9,r14 + mov r14,rdi + ror rdi,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor rdi,r14 + shr r14,6 + + ror rdi,19 + xor r12,r13 + xor rdi,r14 + add r12,QWORD[96+rsp] + + add r12,QWORD[24+rsp] + mov r13,rbx + add r12,rdi + mov r14,r9 + ror r13,23 + mov rdi,rcx + + xor r13,rbx + ror r14,5 + xor rdi,rdx + + mov QWORD[24+rsp],r12 + xor r14,r9 + and rdi,rbx + + ror r13,4 + add r12,r8 + xor rdi,rdx + + ror r14,6 + xor r13,rbx + add r12,rdi + + mov rdi,r9 + add r12,QWORD[rbp] + xor r14,r9 + + xor rdi,r10 + ror r13,14 + mov r8,r10 + + and r15,rdi + ror r14,28 + add r12,r13 + + xor r8,r15 + add rax,r12 + add r8,r12 + + lea rbp,[24+rbp] + mov r13,QWORD[40+rsp] + mov r15,QWORD[16+rsp] + + mov r12,r13 + ror r13,7 + add r8,r14 + mov r14,r15 + ror r15,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor r15,r14 + shr r14,6 + + ror r15,19 + xor r12,r13 + xor r15,r14 + add r12,QWORD[104+rsp] + + add r12,QWORD[32+rsp] + mov 
r13,rax + add r12,r15 + mov r14,r8 + ror r13,23 + mov r15,rbx + + xor r13,rax + ror r14,5 + xor r15,rcx + + mov QWORD[32+rsp],r12 + xor r14,r8 + and r15,rax + + ror r13,4 + add r12,rdx + xor r15,rcx + + ror r14,6 + xor r13,rax + add r12,r15 + + mov r15,r8 + add r12,QWORD[rbp] + xor r14,r8 + + xor r15,r9 + ror r13,14 + mov rdx,r9 + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor rdx,rdi + add r11,r12 + add rdx,r12 + + lea rbp,[8+rbp] + mov r13,QWORD[48+rsp] + mov rdi,QWORD[24+rsp] + + mov r12,r13 + ror r13,7 + add rdx,r14 + mov r14,rdi + ror rdi,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor rdi,r14 + shr r14,6 + + ror rdi,19 + xor r12,r13 + xor rdi,r14 + add r12,QWORD[112+rsp] + + add r12,QWORD[40+rsp] + mov r13,r11 + add r12,rdi + mov r14,rdx + ror r13,23 + mov rdi,rax + + xor r13,r11 + ror r14,5 + xor rdi,rbx + + mov QWORD[40+rsp],r12 + xor r14,rdx + and rdi,r11 + + ror r13,4 + add r12,rcx + xor rdi,rbx + + ror r14,6 + xor r13,r11 + add r12,rdi + + mov rdi,rdx + add r12,QWORD[rbp] + xor r14,rdx + + xor rdi,r8 + ror r13,14 + mov rcx,r8 + + and r15,rdi + ror r14,28 + add r12,r13 + + xor rcx,r15 + add r10,r12 + add rcx,r12 + + lea rbp,[24+rbp] + mov r13,QWORD[56+rsp] + mov r15,QWORD[32+rsp] + + mov r12,r13 + ror r13,7 + add rcx,r14 + mov r14,r15 + ror r15,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor r15,r14 + shr r14,6 + + ror r15,19 + xor r12,r13 + xor r15,r14 + add r12,QWORD[120+rsp] + + add r12,QWORD[48+rsp] + mov r13,r10 + add r12,r15 + mov r14,rcx + ror r13,23 + mov r15,r11 + + xor r13,r10 + ror r14,5 + xor r15,rax + + mov QWORD[48+rsp],r12 + xor r14,rcx + and r15,r10 + + ror r13,4 + add r12,rbx + xor r15,rax + + ror r14,6 + xor r13,r10 + add r12,r15 + + mov r15,rcx + add r12,QWORD[rbp] + xor r14,rcx + + xor r15,rdx + ror r13,14 + mov rbx,rdx + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor rbx,rdi + add r9,r12 + add rbx,r12 + + lea rbp,[8+rbp] + mov r13,QWORD[64+rsp] + mov rdi,QWORD[40+rsp] + + mov r12,r13 + ror r13,7 + add rbx,r14 + mov r14,rdi + ror 
rdi,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor rdi,r14 + shr r14,6 + + ror rdi,19 + xor r12,r13 + xor rdi,r14 + add r12,QWORD[rsp] + + add r12,QWORD[56+rsp] + mov r13,r9 + add r12,rdi + mov r14,rbx + ror r13,23 + mov rdi,r10 + + xor r13,r9 + ror r14,5 + xor rdi,r11 + + mov QWORD[56+rsp],r12 + xor r14,rbx + and rdi,r9 + + ror r13,4 + add r12,rax + xor rdi,r11 + + ror r14,6 + xor r13,r9 + add r12,rdi + + mov rdi,rbx + add r12,QWORD[rbp] + xor r14,rbx + + xor rdi,rcx + ror r13,14 + mov rax,rcx + + and r15,rdi + ror r14,28 + add r12,r13 + + xor rax,r15 + add r8,r12 + add rax,r12 + + lea rbp,[24+rbp] + mov r13,QWORD[72+rsp] + mov r15,QWORD[48+rsp] + + mov r12,r13 + ror r13,7 + add rax,r14 + mov r14,r15 + ror r15,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor r15,r14 + shr r14,6 + + ror r15,19 + xor r12,r13 + xor r15,r14 + add r12,QWORD[8+rsp] + + add r12,QWORD[64+rsp] + mov r13,r8 + add r12,r15 + mov r14,rax + ror r13,23 + mov r15,r9 + + xor r13,r8 + ror r14,5 + xor r15,r10 + + mov QWORD[64+rsp],r12 + xor r14,rax + and r15,r8 + + ror r13,4 + add r12,r11 + xor r15,r10 + + ror r14,6 + xor r13,r8 + add r12,r15 + + mov r15,rax + add r12,QWORD[rbp] + xor r14,rax + + xor r15,rbx + ror r13,14 + mov r11,rbx + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor r11,rdi + add rdx,r12 + add r11,r12 + + lea rbp,[8+rbp] + mov r13,QWORD[80+rsp] + mov rdi,QWORD[56+rsp] + + mov r12,r13 + ror r13,7 + add r11,r14 + mov r14,rdi + ror rdi,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor rdi,r14 + shr r14,6 + + ror rdi,19 + xor r12,r13 + xor rdi,r14 + add r12,QWORD[16+rsp] + + add r12,QWORD[72+rsp] + mov r13,rdx + add r12,rdi + mov r14,r11 + ror r13,23 + mov rdi,r8 + + xor r13,rdx + ror r14,5 + xor rdi,r9 + + mov QWORD[72+rsp],r12 + xor r14,r11 + and rdi,rdx + + ror r13,4 + add r12,r10 + xor rdi,r9 + + ror r14,6 + xor r13,rdx + add r12,rdi + + mov rdi,r11 + add r12,QWORD[rbp] + xor r14,r11 + + xor rdi,rax + ror r13,14 + mov r10,rax + + and r15,rdi + ror r14,28 + add r12,r13 + + xor 
r10,r15 + add rcx,r12 + add r10,r12 + + lea rbp,[24+rbp] + mov r13,QWORD[88+rsp] + mov r15,QWORD[64+rsp] + + mov r12,r13 + ror r13,7 + add r10,r14 + mov r14,r15 + ror r15,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor r15,r14 + shr r14,6 + + ror r15,19 + xor r12,r13 + xor r15,r14 + add r12,QWORD[24+rsp] + + add r12,QWORD[80+rsp] + mov r13,rcx + add r12,r15 + mov r14,r10 + ror r13,23 + mov r15,rdx + + xor r13,rcx + ror r14,5 + xor r15,r8 + + mov QWORD[80+rsp],r12 + xor r14,r10 + and r15,rcx + + ror r13,4 + add r12,r9 + xor r15,r8 + + ror r14,6 + xor r13,rcx + add r12,r15 + + mov r15,r10 + add r12,QWORD[rbp] + xor r14,r10 + + xor r15,r11 + ror r13,14 + mov r9,r11 + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor r9,rdi + add rbx,r12 + add r9,r12 + + lea rbp,[8+rbp] + mov r13,QWORD[96+rsp] + mov rdi,QWORD[72+rsp] + + mov r12,r13 + ror r13,7 + add r9,r14 + mov r14,rdi + ror rdi,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor rdi,r14 + shr r14,6 + + ror rdi,19 + xor r12,r13 + xor rdi,r14 + add r12,QWORD[32+rsp] + + add r12,QWORD[88+rsp] + mov r13,rbx + add r12,rdi + mov r14,r9 + ror r13,23 + mov rdi,rcx + + xor r13,rbx + ror r14,5 + xor rdi,rdx + + mov QWORD[88+rsp],r12 + xor r14,r9 + and rdi,rbx + + ror r13,4 + add r12,r8 + xor rdi,rdx + + ror r14,6 + xor r13,rbx + add r12,rdi + + mov rdi,r9 + add r12,QWORD[rbp] + xor r14,r9 + + xor rdi,r10 + ror r13,14 + mov r8,r10 + + and r15,rdi + ror r14,28 + add r12,r13 + + xor r8,r15 + add rax,r12 + add r8,r12 + + lea rbp,[24+rbp] + mov r13,QWORD[104+rsp] + mov r15,QWORD[80+rsp] + + mov r12,r13 + ror r13,7 + add r8,r14 + mov r14,r15 + ror r15,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor r15,r14 + shr r14,6 + + ror r15,19 + xor r12,r13 + xor r15,r14 + add r12,QWORD[40+rsp] + + add r12,QWORD[96+rsp] + mov r13,rax + add r12,r15 + mov r14,r8 + ror r13,23 + mov r15,rbx + + xor r13,rax + ror r14,5 + xor r15,rcx + + mov QWORD[96+rsp],r12 + xor r14,r8 + and r15,rax + + ror r13,4 + add r12,rdx + xor r15,rcx + + ror r14,6 + xor 
r13,rax + add r12,r15 + + mov r15,r8 + add r12,QWORD[rbp] + xor r14,r8 + + xor r15,r9 + ror r13,14 + mov rdx,r9 + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor rdx,rdi + add r11,r12 + add rdx,r12 + + lea rbp,[8+rbp] + mov r13,QWORD[112+rsp] + mov rdi,QWORD[88+rsp] + + mov r12,r13 + ror r13,7 + add rdx,r14 + mov r14,rdi + ror rdi,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor rdi,r14 + shr r14,6 + + ror rdi,19 + xor r12,r13 + xor rdi,r14 + add r12,QWORD[48+rsp] + + add r12,QWORD[104+rsp] + mov r13,r11 + add r12,rdi + mov r14,rdx + ror r13,23 + mov rdi,rax + + xor r13,r11 + ror r14,5 + xor rdi,rbx + + mov QWORD[104+rsp],r12 + xor r14,rdx + and rdi,r11 + + ror r13,4 + add r12,rcx + xor rdi,rbx + + ror r14,6 + xor r13,r11 + add r12,rdi + + mov rdi,rdx + add r12,QWORD[rbp] + xor r14,rdx + + xor rdi,r8 + ror r13,14 + mov rcx,r8 + + and r15,rdi + ror r14,28 + add r12,r13 + + xor rcx,r15 + add r10,r12 + add rcx,r12 + + lea rbp,[24+rbp] + mov r13,QWORD[120+rsp] + mov r15,QWORD[96+rsp] + + mov r12,r13 + ror r13,7 + add rcx,r14 + mov r14,r15 + ror r15,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor r15,r14 + shr r14,6 + + ror r15,19 + xor r12,r13 + xor r15,r14 + add r12,QWORD[56+rsp] + + add r12,QWORD[112+rsp] + mov r13,r10 + add r12,r15 + mov r14,rcx + ror r13,23 + mov r15,r11 + + xor r13,r10 + ror r14,5 + xor r15,rax + + mov QWORD[112+rsp],r12 + xor r14,rcx + and r15,r10 + + ror r13,4 + add r12,rbx + xor r15,rax + + ror r14,6 + xor r13,r10 + add r12,r15 + + mov r15,rcx + add r12,QWORD[rbp] + xor r14,rcx + + xor r15,rdx + ror r13,14 + mov rbx,rdx + + and rdi,r15 + ror r14,28 + add r12,r13 + + xor rbx,rdi + add r9,r12 + add rbx,r12 + + lea rbp,[8+rbp] + mov r13,QWORD[rsp] + mov rdi,QWORD[104+rsp] + + mov r12,r13 + ror r13,7 + add rbx,r14 + mov r14,rdi + ror rdi,42 + + xor r13,r12 + shr r12,7 + ror r13,1 + xor rdi,r14 + shr r14,6 + + ror rdi,19 + xor r12,r13 + xor rdi,r14 + add r12,QWORD[64+rsp] + + add r12,QWORD[120+rsp] + mov r13,r9 + add r12,rdi + mov r14,rbx + ror 
r13,23 + mov rdi,r10 + + xor r13,r9 + ror r14,5 + xor rdi,r11 + + mov QWORD[120+rsp],r12 + xor r14,rbx + and rdi,r9 + + ror r13,4 + add r12,rax + xor rdi,r11 + + ror r14,6 + xor r13,r9 + add r12,rdi + + mov rdi,rbx + add r12,QWORD[rbp] + xor r14,rbx + + xor rdi,rcx + ror r13,14 + mov rax,rcx + + and r15,rdi + ror r14,28 + add r12,r13 + + xor rax,r15 + add r8,r12 + add rax,r12 + + lea rbp,[24+rbp] + cmp BYTE[7+rbp],0 + jnz NEAR $L$rounds_16_xx + + mov rdi,QWORD[((128+0))+rsp] + add rax,r14 + lea rsi,[128+rsi] + + add rax,QWORD[rdi] + add rbx,QWORD[8+rdi] + add rcx,QWORD[16+rdi] + add rdx,QWORD[24+rdi] + add r8,QWORD[32+rdi] + add r9,QWORD[40+rdi] + add r10,QWORD[48+rdi] + add r11,QWORD[56+rdi] + + cmp rsi,QWORD[((128+16))+rsp] + + mov QWORD[rdi],rax + mov QWORD[8+rdi],rbx + mov QWORD[16+rdi],rcx + mov QWORD[24+rdi],rdx + mov QWORD[32+rdi],r8 + mov QWORD[40+rdi],r9 + mov QWORD[48+rdi],r10 + mov QWORD[56+rdi],r11 + jb NEAR $L$loop + + mov rsi,QWORD[152+rsp] + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_sha512_block_data_order_nohw: +section .rdata rdata align=8 +ALIGN 64 + +K512: + DQ 0x428a2f98d728ae22,0x7137449123ef65cd + DQ 0x428a2f98d728ae22,0x7137449123ef65cd + DQ 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + DQ 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc + DQ 0x3956c25bf348b538,0x59f111f1b605d019 + DQ 0x3956c25bf348b538,0x59f111f1b605d019 + DQ 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + DQ 0x923f82a4af194f9b,0xab1c5ed5da6d8118 + DQ 0xd807aa98a3030242,0x12835b0145706fbe + DQ 0xd807aa98a3030242,0x12835b0145706fbe + DQ 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + DQ 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 + DQ 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + DQ 0x72be5d74f27b896f,0x80deb1fe3b1696b1 + DQ 0x9bdc06a725c71235,0xc19bf174cf692694 + DQ 
0x9bdc06a725c71235,0xc19bf174cf692694 + DQ 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + DQ 0xe49b69c19ef14ad2,0xefbe4786384f25e3 + DQ 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + DQ 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 + DQ 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + DQ 0x2de92c6f592b0275,0x4a7484aa6ea6e483 + DQ 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + DQ 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 + DQ 0x983e5152ee66dfab,0xa831c66d2db43210 + DQ 0x983e5152ee66dfab,0xa831c66d2db43210 + DQ 0xb00327c898fb213f,0xbf597fc7beef0ee4 + DQ 0xb00327c898fb213f,0xbf597fc7beef0ee4 + DQ 0xc6e00bf33da88fc2,0xd5a79147930aa725 + DQ 0xc6e00bf33da88fc2,0xd5a79147930aa725 + DQ 0x06ca6351e003826f,0x142929670a0e6e70 + DQ 0x06ca6351e003826f,0x142929670a0e6e70 + DQ 0x27b70a8546d22ffc,0x2e1b21385c26c926 + DQ 0x27b70a8546d22ffc,0x2e1b21385c26c926 + DQ 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + DQ 0x4d2c6dfc5ac42aed,0x53380d139d95b3df + DQ 0x650a73548baf63de,0x766a0abb3c77b2a8 + DQ 0x650a73548baf63de,0x766a0abb3c77b2a8 + DQ 0x81c2c92e47edaee6,0x92722c851482353b + DQ 0x81c2c92e47edaee6,0x92722c851482353b + DQ 0xa2bfe8a14cf10364,0xa81a664bbc423001 + DQ 0xa2bfe8a14cf10364,0xa81a664bbc423001 + DQ 0xc24b8b70d0f89791,0xc76c51a30654be30 + DQ 0xc24b8b70d0f89791,0xc76c51a30654be30 + DQ 0xd192e819d6ef5218,0xd69906245565a910 + DQ 0xd192e819d6ef5218,0xd69906245565a910 + DQ 0xf40e35855771202a,0x106aa07032bbd1b8 + DQ 0xf40e35855771202a,0x106aa07032bbd1b8 + DQ 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + DQ 0x19a4c116b8d2d0c8,0x1e376c085141ab53 + DQ 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + DQ 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 + DQ 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + DQ 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb + DQ 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + DQ 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 + DQ 0x748f82ee5defb2fc,0x78a5636f43172f60 + DQ 0x748f82ee5defb2fc,0x78a5636f43172f60 + DQ 0x84c87814a1f0ab72,0x8cc702081a6439ec + DQ 0x84c87814a1f0ab72,0x8cc702081a6439ec + DQ 0x90befffa23631e28,0xa4506cebde82bde9 + DQ 
0x90befffa23631e28,0xa4506cebde82bde9 + DQ 0xbef9a3f7b2c67915,0xc67178f2e372532b + DQ 0xbef9a3f7b2c67915,0xc67178f2e372532b + DQ 0xca273eceea26619c,0xd186b8c721c0c207 + DQ 0xca273eceea26619c,0xd186b8c721c0c207 + DQ 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + DQ 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 + DQ 0x06f067aa72176fba,0x0a637dc5a2c898a6 + DQ 0x06f067aa72176fba,0x0a637dc5a2c898a6 + DQ 0x113f9804bef90dae,0x1b710b35131c471b + DQ 0x113f9804bef90dae,0x1b710b35131c471b + DQ 0x28db77f523047d84,0x32caab7b40c72493 + DQ 0x28db77f523047d84,0x32caab7b40c72493 + DQ 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + DQ 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c + DQ 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + DQ 0x4cc5d4becb3e42b6,0x597f299cfc657e2a + DQ 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + DQ 0x5fcb6fab3ad6faec,0x6c44198c4a475817 + + DQ 0x0001020304050607,0x08090a0b0c0d0e0f + DQ 0x0001020304050607,0x08090a0b0c0d0e0f + DB 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97 + DB 110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54 + DB 52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 + DB 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 + DB 111,114,103,62,0 +section .text + +global sha512_block_data_order_avx + +ALIGN 64 +sha512_block_data_order_avx: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_sha512_block_data_order_avx: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + mov rax,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + shl rdx,4 + sub rsp,256 + lea rdx,[rdx*8+rsi] + and rsp,-64 + mov QWORD[((128+0))+rsp],rdi + mov QWORD[((128+8))+rsp],rsi + mov QWORD[((128+16))+rsp],rdx + mov QWORD[152+rsp],rax + + movaps XMMWORD[(128+32)+rsp],xmm6 + movaps XMMWORD[(128+48)+rsp],xmm7 + movaps XMMWORD[(128+64)+rsp],xmm8 + movaps XMMWORD[(128+80)+rsp],xmm9 + movaps XMMWORD[(128+96)+rsp],xmm10 + movaps XMMWORD[(128+112)+rsp],xmm11 +$L$prologue_avx: + + vzeroupper + mov rax,QWORD[rdi] + mov 
rbx,QWORD[8+rdi] + mov rcx,QWORD[16+rdi] + mov rdx,QWORD[24+rdi] + mov r8,QWORD[32+rdi] + mov r9,QWORD[40+rdi] + mov r10,QWORD[48+rdi] + mov r11,QWORD[56+rdi] + jmp NEAR $L$loop_avx +ALIGN 16 +$L$loop_avx: + vmovdqa xmm11,XMMWORD[((K512+1280))] + vmovdqu xmm0,XMMWORD[rsi] + lea rbp,[((K512+128))] + vmovdqu xmm1,XMMWORD[16+rsi] + vmovdqu xmm2,XMMWORD[32+rsi] + vpshufb xmm0,xmm0,xmm11 + vmovdqu xmm3,XMMWORD[48+rsi] + vpshufb xmm1,xmm1,xmm11 + vmovdqu xmm4,XMMWORD[64+rsi] + vpshufb xmm2,xmm2,xmm11 + vmovdqu xmm5,XMMWORD[80+rsi] + vpshufb xmm3,xmm3,xmm11 + vmovdqu xmm6,XMMWORD[96+rsi] + vpshufb xmm4,xmm4,xmm11 + vmovdqu xmm7,XMMWORD[112+rsi] + vpshufb xmm5,xmm5,xmm11 + vpaddq xmm8,xmm0,XMMWORD[((-128))+rbp] + vpshufb xmm6,xmm6,xmm11 + vpaddq xmm9,xmm1,XMMWORD[((-96))+rbp] + vpshufb xmm7,xmm7,xmm11 + vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp] + vpaddq xmm11,xmm3,XMMWORD[((-32))+rbp] + vmovdqa XMMWORD[rsp],xmm8 + vpaddq xmm8,xmm4,XMMWORD[rbp] + vmovdqa XMMWORD[16+rsp],xmm9 + vpaddq xmm9,xmm5,XMMWORD[32+rbp] + vmovdqa XMMWORD[32+rsp],xmm10 + vpaddq xmm10,xmm6,XMMWORD[64+rbp] + vmovdqa XMMWORD[48+rsp],xmm11 + vpaddq xmm11,xmm7,XMMWORD[96+rbp] + vmovdqa XMMWORD[64+rsp],xmm8 + mov r14,rax + vmovdqa XMMWORD[80+rsp],xmm9 + mov rdi,rbx + vmovdqa XMMWORD[96+rsp],xmm10 + xor rdi,rcx + vmovdqa XMMWORD[112+rsp],xmm11 + mov r13,r8 + jmp NEAR $L$avx_00_47 + +ALIGN 16 +$L$avx_00_47: + add rbp,256 + vpalignr xmm8,xmm1,xmm0,8 + shrd r13,r13,23 + mov rax,r14 + vpalignr xmm11,xmm5,xmm4,8 + mov r12,r9 + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,r8 + xor r12,r10 + vpaddq xmm0,xmm0,xmm11 + shrd r13,r13,4 + xor r14,rax + vpsrlq xmm11,xmm8,7 + and r12,r8 + xor r13,r8 + vpsllq xmm9,xmm8,56 + add r11,QWORD[rsp] + mov r15,rax + vpxor xmm8,xmm11,xmm10 + xor r12,r10 + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,rbx + add r11,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,rax + add r11,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,rbx + shrd r14,r14,28 + 
vpsrlq xmm11,xmm7,6 + add rdx,r11 + add r11,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,rdx + add r14,r11 + vpsllq xmm10,xmm7,3 + shrd r13,r13,23 + mov r11,r14 + vpaddq xmm0,xmm0,xmm8 + mov r12,r8 + shrd r14,r14,5 + vpsrlq xmm9,xmm7,19 + xor r13,rdx + xor r12,r9 + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,r11 + vpsllq xmm10,xmm10,42 + and r12,rdx + xor r13,rdx + vpxor xmm11,xmm11,xmm9 + add r10,QWORD[8+rsp] + mov rdi,r11 + vpsrlq xmm9,xmm9,42 + xor r12,r9 + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,rax + add r10,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm0,xmm0,xmm11 + xor r14,r11 + add r10,r13 + vpaddq xmm10,xmm0,XMMWORD[((-128))+rbp] + xor r15,rax + shrd r14,r14,28 + add rcx,r10 + add r10,r15 + mov r13,rcx + add r14,r10 + vmovdqa XMMWORD[rsp],xmm10 + vpalignr xmm8,xmm2,xmm1,8 + shrd r13,r13,23 + mov r10,r14 + vpalignr xmm11,xmm6,xmm5,8 + mov r12,rdx + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,rcx + xor r12,r8 + vpaddq xmm1,xmm1,xmm11 + shrd r13,r13,4 + xor r14,r10 + vpsrlq xmm11,xmm8,7 + and r12,rcx + xor r13,rcx + vpsllq xmm9,xmm8,56 + add r9,QWORD[16+rsp] + mov r15,r10 + vpxor xmm8,xmm11,xmm10 + xor r12,r8 + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,r11 + add r9,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,r10 + add r9,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,r11 + shrd r14,r14,28 + vpsrlq xmm11,xmm0,6 + add rbx,r9 + add r9,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,rbx + add r14,r9 + vpsllq xmm10,xmm0,3 + shrd r13,r13,23 + mov r9,r14 + vpaddq xmm1,xmm1,xmm8 + mov r12,rcx + shrd r14,r14,5 + vpsrlq xmm9,xmm0,19 + xor r13,rbx + xor r12,rdx + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,r9 + vpsllq xmm10,xmm10,42 + and r12,rbx + xor r13,rbx + vpxor xmm11,xmm11,xmm9 + add r8,QWORD[24+rsp] + mov rdi,r9 + vpsrlq xmm9,xmm9,42 + xor r12,rdx + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,r10 + add r8,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq 
xmm1,xmm1,xmm11 + xor r14,r9 + add r8,r13 + vpaddq xmm10,xmm1,XMMWORD[((-96))+rbp] + xor r15,r10 + shrd r14,r14,28 + add rax,r8 + add r8,r15 + mov r13,rax + add r14,r8 + vmovdqa XMMWORD[16+rsp],xmm10 + vpalignr xmm8,xmm3,xmm2,8 + shrd r13,r13,23 + mov r8,r14 + vpalignr xmm11,xmm7,xmm6,8 + mov r12,rbx + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,rax + xor r12,rcx + vpaddq xmm2,xmm2,xmm11 + shrd r13,r13,4 + xor r14,r8 + vpsrlq xmm11,xmm8,7 + and r12,rax + xor r13,rax + vpsllq xmm9,xmm8,56 + add rdx,QWORD[32+rsp] + mov r15,r8 + vpxor xmm8,xmm11,xmm10 + xor r12,rcx + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,r9 + add rdx,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,r8 + add rdx,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,r9 + shrd r14,r14,28 + vpsrlq xmm11,xmm1,6 + add r11,rdx + add rdx,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,r11 + add r14,rdx + vpsllq xmm10,xmm1,3 + shrd r13,r13,23 + mov rdx,r14 + vpaddq xmm2,xmm2,xmm8 + mov r12,rax + shrd r14,r14,5 + vpsrlq xmm9,xmm1,19 + xor r13,r11 + xor r12,rbx + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,rdx + vpsllq xmm10,xmm10,42 + and r12,r11 + xor r13,r11 + vpxor xmm11,xmm11,xmm9 + add rcx,QWORD[40+rsp] + mov rdi,rdx + vpsrlq xmm9,xmm9,42 + xor r12,rbx + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,r8 + add rcx,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm2,xmm2,xmm11 + xor r14,rdx + add rcx,r13 + vpaddq xmm10,xmm2,XMMWORD[((-64))+rbp] + xor r15,r8 + shrd r14,r14,28 + add r10,rcx + add rcx,r15 + mov r13,r10 + add r14,rcx + vmovdqa XMMWORD[32+rsp],xmm10 + vpalignr xmm8,xmm4,xmm3,8 + shrd r13,r13,23 + mov rcx,r14 + vpalignr xmm11,xmm0,xmm7,8 + mov r12,r11 + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,r10 + xor r12,rax + vpaddq xmm3,xmm3,xmm11 + shrd r13,r13,4 + xor r14,rcx + vpsrlq xmm11,xmm8,7 + and r12,r10 + xor r13,r10 + vpsllq xmm9,xmm8,56 + add rbx,QWORD[48+rsp] + mov r15,rcx + vpxor xmm8,xmm11,xmm10 + xor r12,rax + shrd r14,r14,6 + 
vpsrlq xmm10,xmm10,7 + xor r15,rdx + add rbx,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,rcx + add rbx,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,rdx + shrd r14,r14,28 + vpsrlq xmm11,xmm2,6 + add r9,rbx + add rbx,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,r9 + add r14,rbx + vpsllq xmm10,xmm2,3 + shrd r13,r13,23 + mov rbx,r14 + vpaddq xmm3,xmm3,xmm8 + mov r12,r10 + shrd r14,r14,5 + vpsrlq xmm9,xmm2,19 + xor r13,r9 + xor r12,r11 + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,rbx + vpsllq xmm10,xmm10,42 + and r12,r9 + xor r13,r9 + vpxor xmm11,xmm11,xmm9 + add rax,QWORD[56+rsp] + mov rdi,rbx + vpsrlq xmm9,xmm9,42 + xor r12,r11 + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,rcx + add rax,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm3,xmm3,xmm11 + xor r14,rbx + add rax,r13 + vpaddq xmm10,xmm3,XMMWORD[((-32))+rbp] + xor r15,rcx + shrd r14,r14,28 + add r8,rax + add rax,r15 + mov r13,r8 + add r14,rax + vmovdqa XMMWORD[48+rsp],xmm10 + vpalignr xmm8,xmm5,xmm4,8 + shrd r13,r13,23 + mov rax,r14 + vpalignr xmm11,xmm1,xmm0,8 + mov r12,r9 + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,r8 + xor r12,r10 + vpaddq xmm4,xmm4,xmm11 + shrd r13,r13,4 + xor r14,rax + vpsrlq xmm11,xmm8,7 + and r12,r8 + xor r13,r8 + vpsllq xmm9,xmm8,56 + add r11,QWORD[64+rsp] + mov r15,rax + vpxor xmm8,xmm11,xmm10 + xor r12,r10 + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,rbx + add r11,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,rax + add r11,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,rbx + shrd r14,r14,28 + vpsrlq xmm11,xmm3,6 + add rdx,r11 + add r11,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,rdx + add r14,r11 + vpsllq xmm10,xmm3,3 + shrd r13,r13,23 + mov r11,r14 + vpaddq xmm4,xmm4,xmm8 + mov r12,r8 + shrd r14,r14,5 + vpsrlq xmm9,xmm3,19 + xor r13,rdx + xor r12,r9 + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,r11 + vpsllq xmm10,xmm10,42 + and r12,rdx + xor r13,rdx + vpxor 
xmm11,xmm11,xmm9 + add r10,QWORD[72+rsp] + mov rdi,r11 + vpsrlq xmm9,xmm9,42 + xor r12,r9 + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,rax + add r10,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm4,xmm4,xmm11 + xor r14,r11 + add r10,r13 + vpaddq xmm10,xmm4,XMMWORD[rbp] + xor r15,rax + shrd r14,r14,28 + add rcx,r10 + add r10,r15 + mov r13,rcx + add r14,r10 + vmovdqa XMMWORD[64+rsp],xmm10 + vpalignr xmm8,xmm6,xmm5,8 + shrd r13,r13,23 + mov r10,r14 + vpalignr xmm11,xmm2,xmm1,8 + mov r12,rdx + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,rcx + xor r12,r8 + vpaddq xmm5,xmm5,xmm11 + shrd r13,r13,4 + xor r14,r10 + vpsrlq xmm11,xmm8,7 + and r12,rcx + xor r13,rcx + vpsllq xmm9,xmm8,56 + add r9,QWORD[80+rsp] + mov r15,r10 + vpxor xmm8,xmm11,xmm10 + xor r12,r8 + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,r11 + add r9,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,r10 + add r9,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,r11 + shrd r14,r14,28 + vpsrlq xmm11,xmm4,6 + add rbx,r9 + add r9,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,rbx + add r14,r9 + vpsllq xmm10,xmm4,3 + shrd r13,r13,23 + mov r9,r14 + vpaddq xmm5,xmm5,xmm8 + mov r12,rcx + shrd r14,r14,5 + vpsrlq xmm9,xmm4,19 + xor r13,rbx + xor r12,rdx + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,r9 + vpsllq xmm10,xmm10,42 + and r12,rbx + xor r13,rbx + vpxor xmm11,xmm11,xmm9 + add r8,QWORD[88+rsp] + mov rdi,r9 + vpsrlq xmm9,xmm9,42 + xor r12,rdx + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,r10 + add r8,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm5,xmm5,xmm11 + xor r14,r9 + add r8,r13 + vpaddq xmm10,xmm5,XMMWORD[32+rbp] + xor r15,r10 + shrd r14,r14,28 + add rax,r8 + add r8,r15 + mov r13,rax + add r14,r8 + vmovdqa XMMWORD[80+rsp],xmm10 + vpalignr xmm8,xmm7,xmm6,8 + shrd r13,r13,23 + mov r8,r14 + vpalignr xmm11,xmm3,xmm2,8 + mov r12,rbx + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,rax + xor r12,rcx + vpaddq 
xmm6,xmm6,xmm11 + shrd r13,r13,4 + xor r14,r8 + vpsrlq xmm11,xmm8,7 + and r12,rax + xor r13,rax + vpsllq xmm9,xmm8,56 + add rdx,QWORD[96+rsp] + mov r15,r8 + vpxor xmm8,xmm11,xmm10 + xor r12,rcx + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,r9 + add rdx,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,r8 + add rdx,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,r9 + shrd r14,r14,28 + vpsrlq xmm11,xmm5,6 + add r11,rdx + add rdx,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,r11 + add r14,rdx + vpsllq xmm10,xmm5,3 + shrd r13,r13,23 + mov rdx,r14 + vpaddq xmm6,xmm6,xmm8 + mov r12,rax + shrd r14,r14,5 + vpsrlq xmm9,xmm5,19 + xor r13,r11 + xor r12,rbx + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,rdx + vpsllq xmm10,xmm10,42 + and r12,r11 + xor r13,r11 + vpxor xmm11,xmm11,xmm9 + add rcx,QWORD[104+rsp] + mov rdi,rdx + vpsrlq xmm9,xmm9,42 + xor r12,rbx + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,r8 + add rcx,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm6,xmm6,xmm11 + xor r14,rdx + add rcx,r13 + vpaddq xmm10,xmm6,XMMWORD[64+rbp] + xor r15,r8 + shrd r14,r14,28 + add r10,rcx + add rcx,r15 + mov r13,r10 + add r14,rcx + vmovdqa XMMWORD[96+rsp],xmm10 + vpalignr xmm8,xmm0,xmm7,8 + shrd r13,r13,23 + mov rcx,r14 + vpalignr xmm11,xmm4,xmm3,8 + mov r12,r11 + shrd r14,r14,5 + vpsrlq xmm10,xmm8,1 + xor r13,r10 + xor r12,rax + vpaddq xmm7,xmm7,xmm11 + shrd r13,r13,4 + xor r14,rcx + vpsrlq xmm11,xmm8,7 + and r12,r10 + xor r13,r10 + vpsllq xmm9,xmm8,56 + add rbx,QWORD[112+rsp] + mov r15,rcx + vpxor xmm8,xmm11,xmm10 + xor r12,rax + shrd r14,r14,6 + vpsrlq xmm10,xmm10,7 + xor r15,rdx + add rbx,r12 + vpxor xmm8,xmm8,xmm9 + shrd r13,r13,14 + and rdi,r15 + vpsllq xmm9,xmm9,7 + xor r14,rcx + add rbx,r13 + vpxor xmm8,xmm8,xmm10 + xor rdi,rdx + shrd r14,r14,28 + vpsrlq xmm11,xmm6,6 + add r9,rbx + add rbx,rdi + vpxor xmm8,xmm8,xmm9 + mov r13,r9 + add r14,rbx + vpsllq xmm10,xmm6,3 + shrd r13,r13,23 + mov rbx,r14 + vpaddq 
xmm7,xmm7,xmm8 + mov r12,r10 + shrd r14,r14,5 + vpsrlq xmm9,xmm6,19 + xor r13,r9 + xor r12,r11 + vpxor xmm11,xmm11,xmm10 + shrd r13,r13,4 + xor r14,rbx + vpsllq xmm10,xmm10,42 + and r12,r9 + xor r13,r9 + vpxor xmm11,xmm11,xmm9 + add rax,QWORD[120+rsp] + mov rdi,rbx + vpsrlq xmm9,xmm9,42 + xor r12,r11 + shrd r14,r14,6 + vpxor xmm11,xmm11,xmm10 + xor rdi,rcx + add rax,r12 + vpxor xmm11,xmm11,xmm9 + shrd r13,r13,14 + and r15,rdi + vpaddq xmm7,xmm7,xmm11 + xor r14,rbx + add rax,r13 + vpaddq xmm10,xmm7,XMMWORD[96+rbp] + xor r15,rcx + shrd r14,r14,28 + add r8,rax + add rax,r15 + mov r13,r8 + add r14,rax + vmovdqa XMMWORD[112+rsp],xmm10 + cmp BYTE[135+rbp],0 + jne NEAR $L$avx_00_47 + shrd r13,r13,23 + mov rax,r14 + mov r12,r9 + shrd r14,r14,5 + xor r13,r8 + xor r12,r10 + shrd r13,r13,4 + xor r14,rax + and r12,r8 + xor r13,r8 + add r11,QWORD[rsp] + mov r15,rax + xor r12,r10 + shrd r14,r14,6 + xor r15,rbx + add r11,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,rax + add r11,r13 + xor rdi,rbx + shrd r14,r14,28 + add rdx,r11 + add r11,rdi + mov r13,rdx + add r14,r11 + shrd r13,r13,23 + mov r11,r14 + mov r12,r8 + shrd r14,r14,5 + xor r13,rdx + xor r12,r9 + shrd r13,r13,4 + xor r14,r11 + and r12,rdx + xor r13,rdx + add r10,QWORD[8+rsp] + mov rdi,r11 + xor r12,r9 + shrd r14,r14,6 + xor rdi,rax + add r10,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,r11 + add r10,r13 + xor r15,rax + shrd r14,r14,28 + add rcx,r10 + add r10,r15 + mov r13,rcx + add r14,r10 + shrd r13,r13,23 + mov r10,r14 + mov r12,rdx + shrd r14,r14,5 + xor r13,rcx + xor r12,r8 + shrd r13,r13,4 + xor r14,r10 + and r12,rcx + xor r13,rcx + add r9,QWORD[16+rsp] + mov r15,r10 + xor r12,r8 + shrd r14,r14,6 + xor r15,r11 + add r9,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,r10 + add r9,r13 + xor rdi,r11 + shrd r14,r14,28 + add rbx,r9 + add r9,rdi + mov r13,rbx + add r14,r9 + shrd r13,r13,23 + mov r9,r14 + mov r12,rcx + shrd r14,r14,5 + xor r13,rbx + xor r12,rdx + shrd r13,r13,4 + xor r14,r9 + and r12,rbx + xor r13,rbx + 
add r8,QWORD[24+rsp] + mov rdi,r9 + xor r12,rdx + shrd r14,r14,6 + xor rdi,r10 + add r8,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,r9 + add r8,r13 + xor r15,r10 + shrd r14,r14,28 + add rax,r8 + add r8,r15 + mov r13,rax + add r14,r8 + shrd r13,r13,23 + mov r8,r14 + mov r12,rbx + shrd r14,r14,5 + xor r13,rax + xor r12,rcx + shrd r13,r13,4 + xor r14,r8 + and r12,rax + xor r13,rax + add rdx,QWORD[32+rsp] + mov r15,r8 + xor r12,rcx + shrd r14,r14,6 + xor r15,r9 + add rdx,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,r8 + add rdx,r13 + xor rdi,r9 + shrd r14,r14,28 + add r11,rdx + add rdx,rdi + mov r13,r11 + add r14,rdx + shrd r13,r13,23 + mov rdx,r14 + mov r12,rax + shrd r14,r14,5 + xor r13,r11 + xor r12,rbx + shrd r13,r13,4 + xor r14,rdx + and r12,r11 + xor r13,r11 + add rcx,QWORD[40+rsp] + mov rdi,rdx + xor r12,rbx + shrd r14,r14,6 + xor rdi,r8 + add rcx,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,rdx + add rcx,r13 + xor r15,r8 + shrd r14,r14,28 + add r10,rcx + add rcx,r15 + mov r13,r10 + add r14,rcx + shrd r13,r13,23 + mov rcx,r14 + mov r12,r11 + shrd r14,r14,5 + xor r13,r10 + xor r12,rax + shrd r13,r13,4 + xor r14,rcx + and r12,r10 + xor r13,r10 + add rbx,QWORD[48+rsp] + mov r15,rcx + xor r12,rax + shrd r14,r14,6 + xor r15,rdx + add rbx,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,rcx + add rbx,r13 + xor rdi,rdx + shrd r14,r14,28 + add r9,rbx + add rbx,rdi + mov r13,r9 + add r14,rbx + shrd r13,r13,23 + mov rbx,r14 + mov r12,r10 + shrd r14,r14,5 + xor r13,r9 + xor r12,r11 + shrd r13,r13,4 + xor r14,rbx + and r12,r9 + xor r13,r9 + add rax,QWORD[56+rsp] + mov rdi,rbx + xor r12,r11 + shrd r14,r14,6 + xor rdi,rcx + add rax,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,rbx + add rax,r13 + xor r15,rcx + shrd r14,r14,28 + add r8,rax + add rax,r15 + mov r13,r8 + add r14,rax + shrd r13,r13,23 + mov rax,r14 + mov r12,r9 + shrd r14,r14,5 + xor r13,r8 + xor r12,r10 + shrd r13,r13,4 + xor r14,rax + and r12,r8 + xor r13,r8 + add r11,QWORD[64+rsp] + mov r15,rax + xor r12,r10 + 
shrd r14,r14,6 + xor r15,rbx + add r11,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,rax + add r11,r13 + xor rdi,rbx + shrd r14,r14,28 + add rdx,r11 + add r11,rdi + mov r13,rdx + add r14,r11 + shrd r13,r13,23 + mov r11,r14 + mov r12,r8 + shrd r14,r14,5 + xor r13,rdx + xor r12,r9 + shrd r13,r13,4 + xor r14,r11 + and r12,rdx + xor r13,rdx + add r10,QWORD[72+rsp] + mov rdi,r11 + xor r12,r9 + shrd r14,r14,6 + xor rdi,rax + add r10,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,r11 + add r10,r13 + xor r15,rax + shrd r14,r14,28 + add rcx,r10 + add r10,r15 + mov r13,rcx + add r14,r10 + shrd r13,r13,23 + mov r10,r14 + mov r12,rdx + shrd r14,r14,5 + xor r13,rcx + xor r12,r8 + shrd r13,r13,4 + xor r14,r10 + and r12,rcx + xor r13,rcx + add r9,QWORD[80+rsp] + mov r15,r10 + xor r12,r8 + shrd r14,r14,6 + xor r15,r11 + add r9,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,r10 + add r9,r13 + xor rdi,r11 + shrd r14,r14,28 + add rbx,r9 + add r9,rdi + mov r13,rbx + add r14,r9 + shrd r13,r13,23 + mov r9,r14 + mov r12,rcx + shrd r14,r14,5 + xor r13,rbx + xor r12,rdx + shrd r13,r13,4 + xor r14,r9 + and r12,rbx + xor r13,rbx + add r8,QWORD[88+rsp] + mov rdi,r9 + xor r12,rdx + shrd r14,r14,6 + xor rdi,r10 + add r8,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,r9 + add r8,r13 + xor r15,r10 + shrd r14,r14,28 + add rax,r8 + add r8,r15 + mov r13,rax + add r14,r8 + shrd r13,r13,23 + mov r8,r14 + mov r12,rbx + shrd r14,r14,5 + xor r13,rax + xor r12,rcx + shrd r13,r13,4 + xor r14,r8 + and r12,rax + xor r13,rax + add rdx,QWORD[96+rsp] + mov r15,r8 + xor r12,rcx + shrd r14,r14,6 + xor r15,r9 + add rdx,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,r8 + add rdx,r13 + xor rdi,r9 + shrd r14,r14,28 + add r11,rdx + add rdx,rdi + mov r13,r11 + add r14,rdx + shrd r13,r13,23 + mov rdx,r14 + mov r12,rax + shrd r14,r14,5 + xor r13,r11 + xor r12,rbx + shrd r13,r13,4 + xor r14,rdx + and r12,r11 + xor r13,r11 + add rcx,QWORD[104+rsp] + mov rdi,rdx + xor r12,rbx + shrd r14,r14,6 + xor rdi,r8 + add rcx,r12 + shrd 
r13,r13,14 + and r15,rdi + xor r14,rdx + add rcx,r13 + xor r15,r8 + shrd r14,r14,28 + add r10,rcx + add rcx,r15 + mov r13,r10 + add r14,rcx + shrd r13,r13,23 + mov rcx,r14 + mov r12,r11 + shrd r14,r14,5 + xor r13,r10 + xor r12,rax + shrd r13,r13,4 + xor r14,rcx + and r12,r10 + xor r13,r10 + add rbx,QWORD[112+rsp] + mov r15,rcx + xor r12,rax + shrd r14,r14,6 + xor r15,rdx + add rbx,r12 + shrd r13,r13,14 + and rdi,r15 + xor r14,rcx + add rbx,r13 + xor rdi,rdx + shrd r14,r14,28 + add r9,rbx + add rbx,rdi + mov r13,r9 + add r14,rbx + shrd r13,r13,23 + mov rbx,r14 + mov r12,r10 + shrd r14,r14,5 + xor r13,r9 + xor r12,r11 + shrd r13,r13,4 + xor r14,rbx + and r12,r9 + xor r13,r9 + add rax,QWORD[120+rsp] + mov rdi,rbx + xor r12,r11 + shrd r14,r14,6 + xor rdi,rcx + add rax,r12 + shrd r13,r13,14 + and r15,rdi + xor r14,rbx + add rax,r13 + xor r15,rcx + shrd r14,r14,28 + add r8,rax + add rax,r15 + mov r13,r8 + add r14,rax + mov rdi,QWORD[((128+0))+rsp] + mov rax,r14 + + add rax,QWORD[rdi] + lea rsi,[128+rsi] + add rbx,QWORD[8+rdi] + add rcx,QWORD[16+rdi] + add rdx,QWORD[24+rdi] + add r8,QWORD[32+rdi] + add r9,QWORD[40+rdi] + add r10,QWORD[48+rdi] + add r11,QWORD[56+rdi] + + cmp rsi,QWORD[((128+16))+rsp] + + mov QWORD[rdi],rax + mov QWORD[8+rdi],rbx + mov QWORD[16+rdi],rcx + mov QWORD[24+rdi],rdx + mov QWORD[32+rdi],r8 + mov QWORD[40+rdi],r9 + mov QWORD[48+rdi],r10 + mov QWORD[56+rdi],r11 + jb NEAR $L$loop_avx + + mov rsi,QWORD[152+rsp] + + vzeroupper + movaps xmm6,XMMWORD[((128+32))+rsp] + movaps xmm7,XMMWORD[((128+48))+rsp] + movaps xmm8,XMMWORD[((128+64))+rsp] + movaps xmm9,XMMWORD[((128+80))+rsp] + movaps xmm10,XMMWORD[((128+96))+rsp] + movaps xmm11,XMMWORD[((128+112))+rsp] + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$epilogue_avx: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + 
+$L$SEH_end_sha512_block_data_order_avx: +EXTERN __imp_RtlVirtualUnwind + +ALIGN 16 +se_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$in_prologue + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$in_prologue + mov rsi,rax + mov rax,QWORD[((128+24))+rax] + + mov rbx,QWORD[((-8))+rax] + mov rbp,QWORD[((-16))+rax] + mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + mov r15,QWORD[((-48))+rax] + mov QWORD[144+r8],rbx + mov QWORD[160+r8],rbp + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + mov QWORD[232+r8],r14 + mov QWORD[240+r8],r15 + + lea r10,[$L$epilogue] + cmp rbx,r10 + jb NEAR $L$in_prologue + + lea rsi,[((128+32))+rsi] + lea rdi,[512+r8] + mov ecx,12 + DD 0xa548f3fc + +$L$in_prologue: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + ret + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_sha512_block_data_order_nohw wrt ..imagebase + DD $L$SEH_end_sha512_block_data_order_nohw wrt ..imagebase + DD $L$SEH_info_sha512_block_data_order_nohw wrt ..imagebase + DD $L$SEH_begin_sha512_block_data_order_avx wrt ..imagebase + DD $L$SEH_end_sha512_block_data_order_avx wrt ..imagebase + DD 
$L$SEH_info_sha512_block_data_order_avx wrt ..imagebase +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_sha512_block_data_order_nohw: + DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$prologue wrt ..imagebase,$L$epilogue wrt ..imagebase +$L$SEH_info_sha512_block_data_order_avx: + DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$prologue_avx wrt ..imagebase,$L$epilogue_avx wrt ..imagebase +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/third_party/boringssl/gen/bcm/vpaes-armv7-linux.S b/third_party/boringssl/gen/bcm/vpaes-armv7-linux.S new file mode 100644 index 00000000..6e7898ad --- /dev/null +++ b/third_party/boringssl/gen/bcm/vpaes-armv7-linux.S @@ -0,0 +1,1225 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__) +.syntax unified + +.arch armv7-a +.fpu neon + +#if defined(__thumb2__) +.thumb +#else +.code 32 +#endif + +.text + +.type _vpaes_consts,%object +.align 7 @ totally strategic alignment +_vpaes_consts: +.Lk_mc_forward:@ mc_forward +.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 +.quad 0x080B0A0904070605, 0x000302010C0F0E0D +.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 +.quad 0x000302010C0F0E0D, 0x080B0A0904070605 +.Lk_mc_backward:@ mc_backward +.quad 0x0605040702010003, 0x0E0D0C0F0A09080B +.quad 0x020100030E0D0C0F, 0x0A09080B06050407 +.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 +.quad 0x0A09080B06050407, 0x020100030E0D0C0F +.Lk_sr:@ sr +.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 +.quad 0x030E09040F0A0500, 0x0B06010C07020D08 +.quad 0x0F060D040B020900, 0x070E050C030A0108 +.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +@ +@ "Hot" constants +@ +.Lk_inv:@ inv, inva +.quad 0x0E05060F0D080180, 0x040703090A0B0C02 +.quad 0x01040A060F0B0780, 0x030D0E0C02050809 +.Lk_ipt:@ input transform (lo, hi) +.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 +.quad 
0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 +.Lk_sbo:@ sbou, sbot +.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 +.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA +.Lk_sb1:@ sb1u, sb1t +.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF +.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +.Lk_sb2:@ sb2u, sb2t +.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A +.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD + +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,55,32,78,69,79,78,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 +.align 2 +.size _vpaes_consts,.-_vpaes_consts +.align 6 +@@ +@@ _aes_preheat +@@ +@@ Fills q9-q15 as specified below. +@@ +.type _vpaes_preheat,%function +.align 4 +_vpaes_preheat: + adr r10, .Lk_inv + vmov.i8 q9, #0x0f @ .Lk_s0F + vld1.64 {q10,q11}, [r10]! @ .Lk_inv + add r10, r10, #64 @ Skip .Lk_ipt, .Lk_sbo + vld1.64 {q12,q13}, [r10]! @ .Lk_sb1 + vld1.64 {q14,q15}, [r10] @ .Lk_sb2 + bx lr + +@@ +@@ _aes_encrypt_core +@@ +@@ AES-encrypt q0. +@@ +@@ Inputs: +@@ q0 = input +@@ q9-q15 as in _vpaes_preheat +@@ [r2] = scheduled keys +@@ +@@ Output in q0 +@@ Clobbers q1-q5, r8-r11 +@@ Preserves q6-q8 so you get some local vectors +@@ +@@ +.type _vpaes_encrypt_core,%function +.align 4 +_vpaes_encrypt_core: + mov r9, r2 + ldr r8, [r2,#240] @ pull rounds + adr r11, .Lk_ipt + @ vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + @ vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + vld1.64 {q2, q3}, [r11] + adr r11, .Lk_mc_forward+16 + vld1.64 {q5}, [r9]! 
@ vmovdqu (%r9), %xmm5 # round0 key + vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 + vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 + vtbl.8 d2, {q2}, d2 @ vpshufb %xmm1, %xmm2, %xmm1 + vtbl.8 d3, {q2}, d3 + vtbl.8 d4, {q3}, d0 @ vpshufb %xmm0, %xmm3, %xmm2 + vtbl.8 d5, {q3}, d1 + veor q0, q1, q5 @ vpxor %xmm5, %xmm1, %xmm0 + veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0 + + @ .Lenc_entry ends with a bnz instruction which is normally paired with + @ subs in .Lenc_loop. + tst r8, r8 + b .Lenc_entry + +.align 4 +.Lenc_loop: + @ middle of middle round + add r10, r11, #0x40 + vtbl.8 d8, {q13}, d4 @ vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + vtbl.8 d9, {q13}, d5 + vld1.64 {q1}, [r11]! @ vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + vtbl.8 d0, {q12}, d6 @ vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + vtbl.8 d1, {q12}, d7 + veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + vtbl.8 d10, {q15}, d4 @ vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + vtbl.8 d11, {q15}, d5 + veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A + vtbl.8 d4, {q14}, d6 @ vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + vtbl.8 d5, {q14}, d7 + vld1.64 {q4}, [r10] @ vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + vtbl.8 d6, {q0}, d2 @ vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + vtbl.8 d7, {q0}, d3 + veor q2, q2, q5 @ vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + @ Write to q5 instead of q0, so the table and destination registers do + @ not overlap. + vtbl.8 d10, {q0}, d8 @ vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + vtbl.8 d11, {q0}, d9 + veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + vtbl.8 d8, {q3}, d2 @ vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + vtbl.8 d9, {q3}, d3 + @ Here we restore the original q0/q5 usage. + veor q0, q5, q3 @ vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + and r11, r11, #~(1<<6) @ and $0x30, %r11 # ... 
mod 4 + veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + subs r8, r8, #1 @ nr-- + +.Lenc_entry: + @ top of round + vand q1, q0, q9 @ vpand %xmm0, %xmm9, %xmm1 # 0 = k + vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i + vtbl.8 d10, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + vtbl.8 d11, {q11}, d3 + veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j + vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + vtbl.8 d7, {q10}, d1 + vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + vtbl.8 d9, {q10}, d3 + veor q3, q3, q5 @ vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + vtbl.8 d4, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + vtbl.8 d5, {q10}, d7 + vtbl.8 d6, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + vtbl.8 d7, {q10}, d9 + veor q2, q2, q1 @ vpxor %xmm1, %xmm2, %xmm2 # 2 = io + veor q3, q3, q0 @ vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5 + bne .Lenc_loop + + @ middle of last round + add r10, r11, #0x80 + + adr r11, .Lk_sbo + @ Read to q1 instead of q4, so the vtbl.8 instruction below does not + @ overlap table and destination registers. + vld1.64 {q1}, [r11]! @ vmovdqa -0x60(%r10), %xmm4 # 3 : sbou + vld1.64 {q0}, [r11] @ vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + vtbl.8 d8, {q1}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + vtbl.8 d9, {q1}, d5 + vld1.64 {q1}, [r10] @ vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + @ Write to q2 instead of q0 below, to avoid overlapping table and + @ destination registers. + vtbl.8 d4, {q0}, d6 @ vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + vtbl.8 d5, {q0}, d7 + veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + veor q2, q2, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A + @ Here we restore the original q0/q2 usage. 
+ vtbl.8 d0, {q2}, d2 @ vpshufb %xmm1, %xmm0, %xmm0 + vtbl.8 d1, {q2}, d3 + bx lr +.size _vpaes_encrypt_core,.-_vpaes_encrypt_core + +.globl vpaes_encrypt +.hidden vpaes_encrypt +.type vpaes_encrypt,%function +.align 4 +vpaes_encrypt: + @ _vpaes_encrypt_core uses r8-r11. Round up to r7-r11 to maintain stack + @ alignment. + stmdb sp!, {r7,r8,r9,r10,r11,lr} + @ _vpaes_encrypt_core uses q4-q5 (d8-d11), which are callee-saved. + vstmdb sp!, {d8,d9,d10,d11} + + vld1.64 {q0}, [r0] + bl _vpaes_preheat + bl _vpaes_encrypt_core + vst1.64 {q0}, [r1] + + vldmia sp!, {d8,d9,d10,d11} + ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return +.size vpaes_encrypt,.-vpaes_encrypt + +@ +@ Decryption stuff +@ +.type _vpaes_decrypt_consts,%object +.align 4 +_vpaes_decrypt_consts: +.Lk_dipt:@ decryption input transform +.quad 0x0F505B040B545F00, 0x154A411E114E451A +.quad 0x86E383E660056500, 0x12771772F491F194 +.Lk_dsbo:@ decryption sbox final output +.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D +.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C +.Lk_dsb9:@ decryption sbox output *9*u, *9*t +.quad 0x851C03539A86D600, 0xCAD51F504F994CC9 +.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 +.Lk_dsbd:@ decryption sbox output *D*u, *D*t +.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 +.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 +.Lk_dsbb:@ decryption sbox output *B*u, *B*t +.quad 0xD022649296B44200, 0x602646F6B0F2D404 +.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B +.Lk_dsbe:@ decryption sbox output *E*u, *E*t +.quad 0x46F2929626D4D000, 0x2242600464B4F6B0 +.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 +.size _vpaes_decrypt_consts,.-_vpaes_decrypt_consts + +@@ +@@ Decryption core +@@ +@@ Same API as encryption core, except it clobbers q12-q15 rather than using +@@ the values from _vpaes_preheat. q9-q11 must still be set from +@@ _vpaes_preheat. 
+@@ +.type _vpaes_decrypt_core,%function +.align 4 +_vpaes_decrypt_core: + mov r9, r2 + ldr r8, [r2,#240] @ pull rounds + + @ This function performs shuffles with various constants. The x86_64 + @ version loads them on-demand into %xmm0-%xmm5. This does not work well + @ for ARMv7 because those registers are shuffle destinations. The ARMv8 + @ version preloads those constants into registers, but ARMv7 has half + @ the registers to work with. Instead, we load them on-demand into + @ q12-q15, registers normally use for preloaded constants. This is fine + @ because decryption doesn't use those constants. The values are + @ constant, so this does not interfere with potential 2x optimizations. + adr r7, .Lk_dipt + + vld1.64 {q12,q13}, [r7] @ vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo + lsl r11, r8, #4 @ mov %rax, %r11; shl $4, %r11 + eor r11, r11, #0x30 @ xor $0x30, %r11 + adr r10, .Lk_sr + and r11, r11, #0x30 @ and $0x30, %r11 + add r11, r11, r10 + adr r10, .Lk_mc_forward+48 + + vld1.64 {q4}, [r9]! @ vmovdqu (%r9), %xmm4 # round0 key + vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 + vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 + vtbl.8 d4, {q12}, d2 @ vpshufb %xmm1, %xmm2, %xmm2 + vtbl.8 d5, {q12}, d3 + vld1.64 {q5}, [r10] @ vmovdqa .Lk_mc_forward+48(%rip), %xmm5 + @ vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi + vtbl.8 d0, {q13}, d0 @ vpshufb %xmm0, %xmm1, %xmm0 + vtbl.8 d1, {q13}, d1 + veor q2, q2, q4 @ vpxor %xmm4, %xmm2, %xmm2 + veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0 + + @ .Ldec_entry ends with a bnz instruction which is normally paired with + @ subs in .Ldec_loop. + tst r8, r8 + b .Ldec_entry + +.align 4 +.Ldec_loop: +@ +@ Inverse mix columns +@ + + @ We load .Lk_dsb* into q12-q15 on-demand. See the comment at the top of + @ the function. + adr r10, .Lk_dsb9 + vld1.64 {q12,q13}, [r10]! @ vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u + @ vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t + @ Load sbd* ahead of time. + vld1.64 {q14,q15}, [r10]! 
@ vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu + @ vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt + vtbl.8 d8, {q12}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u + vtbl.8 d9, {q12}, d5 + vtbl.8 d2, {q13}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t + vtbl.8 d3, {q13}, d7 + veor q0, q4, q0 @ vpxor %xmm4, %xmm0, %xmm0 + + veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + + @ Load sbb* ahead of time. + vld1.64 {q12,q13}, [r10]! @ vmovdqa 0x20(%r10),%xmm4 # 4 : sbbu + @ vmovdqa 0x30(%r10),%xmm1 # 0 : sbbt + + vtbl.8 d8, {q14}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu + vtbl.8 d9, {q14}, d5 + @ Write to q1 instead of q0, so the table and destination registers do + @ not overlap. + vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch + vtbl.8 d3, {q0}, d11 + @ Here we restore the original q0/q1 usage. This instruction is + @ reordered from the ARMv8 version so we do not clobber the vtbl.8 + @ below. + veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + vtbl.8 d2, {q15}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt + vtbl.8 d3, {q15}, d7 + @ vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu + veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + @ vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt + + @ Load sbd* ahead of time. + vld1.64 {q14,q15}, [r10]! @ vmovdqa 0x40(%r10),%xmm4 # 4 : sbeu + @ vmovdqa 0x50(%r10),%xmm1 # 0 : sbet + + vtbl.8 d8, {q12}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu + vtbl.8 d9, {q12}, d5 + @ Write to q1 instead of q0, so the table and destination registers do + @ not overlap. + vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch + vtbl.8 d3, {q0}, d11 + @ Here we restore the original q0/q1 usage. This instruction is + @ reordered from the ARMv8 version so we do not clobber the vtbl.8 + @ below. 
+ veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + vtbl.8 d2, {q13}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt + vtbl.8 d3, {q13}, d7 + veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + + vtbl.8 d8, {q14}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu + vtbl.8 d9, {q14}, d5 + @ Write to q1 instead of q0, so the table and destination registers do + @ not overlap. + vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch + vtbl.8 d3, {q0}, d11 + @ Here we restore the original q0/q1 usage. This instruction is + @ reordered from the ARMv8 version so we do not clobber the vtbl.8 + @ below. + veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + vtbl.8 d2, {q15}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet + vtbl.8 d3, {q15}, d7 + vext.8 q5, q5, q5, #12 @ vpalignr $12, %xmm5, %xmm5, %xmm5 + veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + subs r8, r8, #1 @ sub $1,%rax # nr-- + +.Ldec_entry: + @ top of round + vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 # 0 = k + vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i + vtbl.8 d4, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + vtbl.8 d5, {q11}, d3 + veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j + vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + vtbl.8 d7, {q10}, d1 + vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + vtbl.8 d9, {q10}, d3 + veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + veor q4, q4, q2 @ vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + vtbl.8 d4, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + vtbl.8 d5, {q10}, d7 + vtbl.8 d6, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + vtbl.8 d7, {q10}, d9 + veor q2, q2, q1 @ vpxor %xmm1, %xmm2, %xmm2 # 2 = io + veor q3, q3, q0 @ vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + vld1.64 {q0}, [r9]! @ vmovdqu (%r9), %xmm0 + bne .Ldec_loop + + @ middle of last round + + adr r10, .Lk_dsbo + + @ Write to q1 rather than q4 to avoid overlapping table and destination. + vld1.64 {q1}, [r10]! 
@ vmovdqa 0x60(%r10), %xmm4 # 3 : sbou + vtbl.8 d8, {q1}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + vtbl.8 d9, {q1}, d5 + @ Write to q2 rather than q1 to avoid overlapping table and destination. + vld1.64 {q2}, [r10] @ vmovdqa 0x70(%r10), %xmm1 # 0 : sbot + vtbl.8 d2, {q2}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t + vtbl.8 d3, {q2}, d7 + vld1.64 {q2}, [r11] @ vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 + veor q4, q4, q0 @ vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k + @ Write to q1 rather than q0 so the table and destination registers + @ below do not overlap. + veor q1, q1, q4 @ vpxor %xmm4, %xmm1, %xmm0 # 0 = A + vtbl.8 d0, {q1}, d4 @ vpshufb %xmm2, %xmm0, %xmm0 + vtbl.8 d1, {q1}, d5 + bx lr +.size _vpaes_decrypt_core,.-_vpaes_decrypt_core + +.globl vpaes_decrypt +.hidden vpaes_decrypt +.type vpaes_decrypt,%function +.align 4 +vpaes_decrypt: + @ _vpaes_decrypt_core uses r7-r11. + stmdb sp!, {r7,r8,r9,r10,r11,lr} + @ _vpaes_decrypt_core uses q4-q5 (d8-d11), which are callee-saved. + vstmdb sp!, {d8,d9,d10,d11} + + vld1.64 {q0}, [r0] + bl _vpaes_preheat + bl _vpaes_decrypt_core + vst1.64 {q0}, [r1] + + vldmia sp!, {d8,d9,d10,d11} + ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return +.size vpaes_decrypt,.-vpaes_decrypt +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ +@@ @@ +@@ AES key schedule @@ +@@ @@ +@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ + +@ This function diverges from both x86_64 and armv7 in which constants are +@ pinned. x86_64 has a common preheat function for all operations. aarch64 +@ separates them because it has enough registers to pin nearly all constants. +@ armv7 does not have enough registers, but needing explicit loads and stores +@ also complicates using x86_64's register allocation directly. +@ +@ We pin some constants for convenience and leave q14 and q15 free to load +@ others on demand. 
+ +@ +@ Key schedule constants +@ +.type _vpaes_key_consts,%object +.align 4 +_vpaes_key_consts: +.Lk_dksd:@ decryption key schedule: invskew x*D +.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 +.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E +.Lk_dksb:@ decryption key schedule: invskew x*B +.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 +.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 +.Lk_dkse:@ decryption key schedule: invskew x*E + 0x63 +.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 +.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 +.Lk_dks9:@ decryption key schedule: invskew x*9 +.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC +.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE + +.Lk_rcon:@ rcon +.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +.Lk_opt:@ output transform +.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 +.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 +.Lk_deskew:@ deskew tables: inverts the sbox's "skew" +.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A +.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 +.size _vpaes_key_consts,.-_vpaes_key_consts + +.type _vpaes_key_preheat,%function +.align 4 +_vpaes_key_preheat: + adr r11, .Lk_rcon + vmov.i8 q12, #0x5b @ .Lk_s63 + adr r10, .Lk_inv @ Must be aligned to 8 mod 16. + vmov.i8 q9, #0x0f @ .Lk_s0F + vld1.64 {q10,q11}, [r10] @ .Lk_inv + vld1.64 {q8}, [r11] @ .Lk_rcon + bx lr +.size _vpaes_key_preheat,.-_vpaes_key_preheat + +.type _vpaes_schedule_core,%function +.align 4 +_vpaes_schedule_core: + @ We only need to save lr, but ARM requires an 8-byte stack alignment, + @ so save an extra register. + stmdb sp!, {r3,lr} + + bl _vpaes_key_preheat @ load the tables + + adr r11, .Lk_ipt @ Must be aligned to 8 mod 16. + vld1.64 {q0}, [r0]! @ vmovdqu (%rdi), %xmm0 # load key (unaligned) + + @ input transform + @ Use q4 here rather than q3 so .Lschedule_am_decrypting does not + @ overlap table and destination. + vmov q4, q0 @ vmovdqa %xmm0, %xmm3 + bl _vpaes_schedule_transform + adr r10, .Lk_sr @ Must be aligned to 8 mod 16. 
+ vmov q7, q0 @ vmovdqa %xmm0, %xmm7 + + add r8, r8, r10 + tst r3, r3 + bne .Lschedule_am_decrypting + + @ encrypting, output zeroth round key after transform + vst1.64 {q0}, [r2] @ vmovdqu %xmm0, (%rdx) + b .Lschedule_go + +.Lschedule_am_decrypting: + @ decrypting, output zeroth round key after shiftrows + vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1 + vtbl.8 d6, {q4}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 + vtbl.8 d7, {q4}, d3 + vst1.64 {q3}, [r2] @ vmovdqu %xmm3, (%rdx) + eor r8, r8, #0x30 @ xor $0x30, %r8 + +.Lschedule_go: + cmp r1, #192 @ cmp $192, %esi + bhi .Lschedule_256 + beq .Lschedule_192 + @ 128: fall though + +@@ +@@ .schedule_128 +@@ +@@ 128-bit specific part of key schedule. +@@ +@@ This schedule is really simple, because all its parts +@@ are accomplished by the subroutines. +@@ +.Lschedule_128: + mov r0, #10 @ mov $10, %esi + +.Loop_schedule_128: + bl _vpaes_schedule_round + subs r0, r0, #1 @ dec %esi + beq .Lschedule_mangle_last + bl _vpaes_schedule_mangle @ write output + b .Loop_schedule_128 + +@@ +@@ .aes_schedule_192 +@@ +@@ 192-bit specific part of key schedule. +@@ +@@ The main body of this schedule is the same as the 128-bit +@@ schedule, but with more smearing. The long, high side is +@@ stored in q7 as before, and the short, low side is in +@@ the high bits of q6. +@@ +@@ This schedule is somewhat nastier, however, because each +@@ round produces 192 bits of key material, or 1.5 round keys. +@@ Therefore, on each cycle we do 2 rounds and produce 3 round +@@ keys. 
+@@ +.align 4 +.Lschedule_192: + sub r0, r0, #8 + vld1.64 {q0}, [r0] @ vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) + bl _vpaes_schedule_transform @ input transform + vmov q6, q0 @ vmovdqa %xmm0, %xmm6 # save short part + vmov.i8 d12, #0 @ vpxor %xmm4, %xmm4, %xmm4 # clear 4 + @ vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros + mov r0, #4 @ mov $4, %esi + +.Loop_schedule_192: + bl _vpaes_schedule_round + vext.8 q0, q6, q0, #8 @ vpalignr $8,%xmm6,%xmm0,%xmm0 + bl _vpaes_schedule_mangle @ save key n + bl _vpaes_schedule_192_smear + bl _vpaes_schedule_mangle @ save key n+1 + bl _vpaes_schedule_round + subs r0, r0, #1 @ dec %esi + beq .Lschedule_mangle_last + bl _vpaes_schedule_mangle @ save key n+2 + bl _vpaes_schedule_192_smear + b .Loop_schedule_192 + +@@ +@@ .aes_schedule_256 +@@ +@@ 256-bit specific part of key schedule. +@@ +@@ The structure here is very similar to the 128-bit +@@ schedule, but with an additional "low side" in +@@ q6. The low side's rounds are the same as the +@@ high side's, except no rcon and no rotation. +@@ +.align 4 +.Lschedule_256: + vld1.64 {q0}, [r0] @ vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) + bl _vpaes_schedule_transform @ input transform + mov r0, #7 @ mov $7, %esi + +.Loop_schedule_256: + bl _vpaes_schedule_mangle @ output low result + vmov q6, q0 @ vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 + + @ high round + bl _vpaes_schedule_round + subs r0, r0, #1 @ dec %esi + beq .Lschedule_mangle_last + bl _vpaes_schedule_mangle + + @ low round. 
swap xmm7 and xmm6 + vdup.32 q0, d1[1] @ vpshufd $0xFF, %xmm0, %xmm0 + vmov.i8 q4, #0 + vmov q5, q7 @ vmovdqa %xmm7, %xmm5 + vmov q7, q6 @ vmovdqa %xmm6, %xmm7 + bl _vpaes_schedule_low_round + vmov q7, q5 @ vmovdqa %xmm5, %xmm7 + + b .Loop_schedule_256 + +@@ +@@ .aes_schedule_mangle_last +@@ +@@ Mangler for last round of key schedule +@@ Mangles q0 +@@ when encrypting, outputs out(q0) ^ 63 +@@ when decrypting, outputs unskew(q0) +@@ +@@ Always called right before return... jumps to cleanup and exits +@@ +.align 4 +.Lschedule_mangle_last: + @ schedule last round key from xmm0 + adr r11, .Lk_deskew @ lea .Lk_deskew(%rip),%r11 # prepare to deskew + tst r3, r3 + bne .Lschedule_mangle_last_dec + + @ encrypting + vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10),%xmm1 + adr r11, .Lk_opt @ lea .Lk_opt(%rip), %r11 # prepare to output transform + add r2, r2, #32 @ add $32, %rdx + vmov q2, q0 + vtbl.8 d0, {q2}, d2 @ vpshufb %xmm1, %xmm0, %xmm0 # output permute + vtbl.8 d1, {q2}, d3 + +.Lschedule_mangle_last_dec: + sub r2, r2, #16 @ add $-16, %rdx + veor q0, q0, q12 @ vpxor .Lk_s63(%rip), %xmm0, %xmm0 + bl _vpaes_schedule_transform @ output transform + vst1.64 {q0}, [r2] @ vmovdqu %xmm0, (%rdx) # save last key + + @ cleanup + veor q0, q0, q0 @ vpxor %xmm0, %xmm0, %xmm0 + veor q1, q1, q1 @ vpxor %xmm1, %xmm1, %xmm1 + veor q2, q2, q2 @ vpxor %xmm2, %xmm2, %xmm2 + veor q3, q3, q3 @ vpxor %xmm3, %xmm3, %xmm3 + veor q4, q4, q4 @ vpxor %xmm4, %xmm4, %xmm4 + veor q5, q5, q5 @ vpxor %xmm5, %xmm5, %xmm5 + veor q6, q6, q6 @ vpxor %xmm6, %xmm6, %xmm6 + veor q7, q7, q7 @ vpxor %xmm7, %xmm7, %xmm7 + ldmia sp!, {r3,pc} @ return +.size _vpaes_schedule_core,.-_vpaes_schedule_core + +@@ +@@ .aes_schedule_192_smear +@@ +@@ Smear the short, low side in the 192-bit key schedule. 
+@@ +@@ Inputs: +@@ q7: high side, b a x y +@@ q6: low side, d c 0 0 +@@ +@@ Outputs: +@@ q6: b+c+d b+c 0 0 +@@ q0: b+c+d b+c b a +@@ +.type _vpaes_schedule_192_smear,%function +.align 4 +_vpaes_schedule_192_smear: + vmov.i8 q1, #0 + vdup.32 q0, d15[1] + vshl.i64 q1, q6, #32 @ vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 + vmov d0, d15 @ vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a + veor q6, q6, q1 @ vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 + veor q1, q1, q1 @ vpxor %xmm1, %xmm1, %xmm1 + veor q6, q6, q0 @ vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a + vmov q0, q6 @ vmovdqa %xmm6, %xmm0 + vmov d12, d2 @ vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros + bx lr +.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear + +@@ +@@ .aes_schedule_round +@@ +@@ Runs one main round of the key schedule on q0, q7 +@@ +@@ Specifically, runs subbytes on the high dword of q0 +@@ then rotates it by one byte and xors into the low dword of +@@ q7. +@@ +@@ Adds rcon from low byte of q8, then rotates q8 for +@@ next rcon. +@@ +@@ Smears the dwords of q7 by xoring the low into the +@@ second low, result into third, result into highest. +@@ +@@ Returns results in q7 = q0. +@@ Clobbers q1-q4, r11. +@@ +.type _vpaes_schedule_round,%function +.align 4 +_vpaes_schedule_round: + @ extract rcon from xmm8 + vmov.i8 q4, #0 @ vpxor %xmm4, %xmm4, %xmm4 + vext.8 q1, q8, q4, #15 @ vpalignr $15, %xmm8, %xmm4, %xmm1 + vext.8 q8, q8, q8, #15 @ vpalignr $15, %xmm8, %xmm8, %xmm8 + veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7 + + @ rotate + vdup.32 q0, d1[1] @ vpshufd $0xFF, %xmm0, %xmm0 + vext.8 q0, q0, q0, #1 @ vpalignr $1, %xmm0, %xmm0, %xmm0 + + @ fall through... + + @ low round: same as high round, but no rotation and no rcon. +_vpaes_schedule_low_round: + @ The x86_64 version pins .Lk_sb1 in %xmm13 and .Lk_sb1+16 in %xmm12. + @ We pin other values in _vpaes_key_preheat, so load them now. 
+ adr r11, .Lk_sb1 + vld1.64 {q14,q15}, [r11] + + @ smear xmm7 + vext.8 q1, q4, q7, #12 @ vpslldq $4, %xmm7, %xmm1 + veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7 + vext.8 q4, q4, q7, #8 @ vpslldq $8, %xmm7, %xmm4 + + @ subbytes + vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 # 0 = k + vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i + veor q7, q7, q4 @ vpxor %xmm4, %xmm7, %xmm7 + vtbl.8 d4, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + vtbl.8 d5, {q11}, d3 + veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j + vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + vtbl.8 d7, {q10}, d1 + veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + vtbl.8 d9, {q10}, d3 + veor q7, q7, q12 @ vpxor .Lk_s63(%rip), %xmm7, %xmm7 + vtbl.8 d6, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak + vtbl.8 d7, {q10}, d7 + veor q4, q4, q2 @ vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + vtbl.8 d4, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak + vtbl.8 d5, {q10}, d9 + veor q3, q3, q1 @ vpxor %xmm1, %xmm3, %xmm3 # 2 = io + veor q2, q2, q0 @ vpxor %xmm0, %xmm2, %xmm2 # 3 = jo + vtbl.8 d8, {q15}, d6 @ vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou + vtbl.8 d9, {q15}, d7 + vtbl.8 d2, {q14}, d4 @ vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t + vtbl.8 d3, {q14}, d5 + veor q1, q1, q4 @ vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output + + @ add in smeared stuff + veor q0, q1, q7 @ vpxor %xmm7, %xmm1, %xmm0 + veor q7, q1, q7 @ vmovdqa %xmm0, %xmm7 + bx lr +.size _vpaes_schedule_round,.-_vpaes_schedule_round + +@@ +@@ .aes_schedule_transform +@@ +@@ Linear-transform q0 according to tables at [r11] +@@ +@@ Requires that q9 = 0x0F0F... 
as in preheat +@@ Output in q0 +@@ Clobbers q1, q2, q14, q15 +@@ +.type _vpaes_schedule_transform,%function +.align 4 +_vpaes_schedule_transform: + vld1.64 {q14,q15}, [r11] @ vmovdqa (%r11), %xmm2 # lo + @ vmovdqa 16(%r11), %xmm1 # hi + vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 + vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 + vtbl.8 d4, {q14}, d2 @ vpshufb %xmm1, %xmm2, %xmm2 + vtbl.8 d5, {q14}, d3 + vtbl.8 d0, {q15}, d0 @ vpshufb %xmm0, %xmm1, %xmm0 + vtbl.8 d1, {q15}, d1 + veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0 + bx lr +.size _vpaes_schedule_transform,.-_vpaes_schedule_transform + +@@ +@@ .aes_schedule_mangle +@@ +@@ Mangles q0 from (basis-transformed) standard version +@@ to our version. +@@ +@@ On encrypt, +@@ xor with 0x63 +@@ multiply by circulant 0,1,1,1 +@@ apply shiftrows transform +@@ +@@ On decrypt, +@@ xor with 0x63 +@@ multiply by "inverse mixcolumns" circulant E,B,D,9 +@@ deskew +@@ apply shiftrows transform +@@ +@@ +@@ Writes out to [r2], and increments or decrements it +@@ Keeps track of round number mod 4 in r8 +@@ Preserves q0 +@@ Clobbers q1-q5 +@@ +.type _vpaes_schedule_mangle,%function +.align 4 +_vpaes_schedule_mangle: + tst r3, r3 + vmov q4, q0 @ vmovdqa %xmm0, %xmm4 # save xmm0 for later + adr r11, .Lk_mc_forward @ Must be aligned to 8 mod 16. + vld1.64 {q5}, [r11] @ vmovdqa .Lk_mc_forward(%rip),%xmm5 + bne .Lschedule_mangle_dec + + @ encrypting + @ Write to q2 so we do not overlap table and destination below. 
+ veor q2, q0, q12 @ vpxor .Lk_s63(%rip), %xmm0, %xmm4 + add r2, r2, #16 @ add $16, %rdx + vtbl.8 d8, {q2}, d10 @ vpshufb %xmm5, %xmm4, %xmm4 + vtbl.8 d9, {q2}, d11 + vtbl.8 d2, {q4}, d10 @ vpshufb %xmm5, %xmm4, %xmm1 + vtbl.8 d3, {q4}, d11 + vtbl.8 d6, {q1}, d10 @ vpshufb %xmm5, %xmm1, %xmm3 + vtbl.8 d7, {q1}, d11 + veor q4, q4, q1 @ vpxor %xmm1, %xmm4, %xmm4 + vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1 + veor q3, q3, q4 @ vpxor %xmm4, %xmm3, %xmm3 + + b .Lschedule_mangle_both +.align 4 +.Lschedule_mangle_dec: + @ inverse mix columns + adr r11, .Lk_dksd @ lea .Lk_dksd(%rip),%r11 + vshr.u8 q1, q4, #4 @ vpsrlb $4, %xmm4, %xmm1 # 1 = hi + vand q4, q4, q9 @ vpand %xmm9, %xmm4, %xmm4 # 4 = lo + + vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x00(%r11), %xmm2 + @ vmovdqa 0x10(%r11), %xmm3 + vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2 + vtbl.8 d5, {q14}, d9 + vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 + vtbl.8 d7, {q15}, d3 + @ Load .Lk_dksb ahead of time. + vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x20(%r11), %xmm2 + @ vmovdqa 0x30(%r11), %xmm3 + @ Write to q13 so we do not overlap table and destination. + veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 + vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3 + vtbl.8 d7, {q13}, d11 + + vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2 + vtbl.8 d5, {q14}, d9 + veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2 + vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 + vtbl.8 d7, {q15}, d3 + @ Load .Lk_dkse ahead of time. + vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x40(%r11), %xmm2 + @ vmovdqa 0x50(%r11), %xmm3 + @ Write to q13 so we do not overlap table and destination. + veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 + vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3 + vtbl.8 d7, {q13}, d11 + + vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2 + vtbl.8 d5, {q14}, d9 + veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2 + vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 + vtbl.8 d7, {q15}, d3 + @ Load .Lk_dkse ahead of time. 
+ vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x60(%r11), %xmm2 + @ vmovdqa 0x70(%r11), %xmm4 + @ Write to q13 so we do not overlap table and destination. + veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 + + vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2 + vtbl.8 d5, {q14}, d9 + vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3 + vtbl.8 d7, {q13}, d11 + vtbl.8 d8, {q15}, d2 @ vpshufb %xmm1, %xmm4, %xmm4 + vtbl.8 d9, {q15}, d3 + vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1 + veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2 + veor q3, q4, q2 @ vpxor %xmm2, %xmm4, %xmm3 + + sub r2, r2, #16 @ add $-16, %rdx + +.Lschedule_mangle_both: + @ Write to q2 so table and destination do not overlap. + vtbl.8 d4, {q3}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 + vtbl.8 d5, {q3}, d3 + add r8, r8, #64-16 @ add $-16, %r8 + and r8, r8, #~(1<<6) @ and $0x30, %r8 + vst1.64 {q2}, [r2] @ vmovdqu %xmm3, (%rdx) + bx lr +.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle + +.globl vpaes_set_encrypt_key +.hidden vpaes_set_encrypt_key +.type vpaes_set_encrypt_key,%function +.align 4 +vpaes_set_encrypt_key: + stmdb sp!, {r7,r8,r9,r10,r11, lr} + vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + + lsr r9, r1, #5 @ shr $5,%eax + add r9, r9, #5 @ $5,%eax + str r9, [r2,#240] @ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + + mov r3, #0 @ mov $0,%ecx + mov r8, #0x30 @ mov $0x30,%r8d + bl _vpaes_schedule_core + eor r0, r0, r0 + + vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return +.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key + +.globl vpaes_set_decrypt_key +.hidden vpaes_set_decrypt_key +.type vpaes_set_decrypt_key,%function +.align 4 +vpaes_set_decrypt_key: + stmdb sp!, {r7,r8,r9,r10,r11, lr} + vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + + lsr r9, r1, #5 @ shr $5,%eax + add r9, r9, #5 @ $5,%eax + str r9, [r2,#240] @ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + lsl r9, r9, #4 @ shl $4,%eax + add r2, r2, #16 @ lea 16(%rdx,%rax),%rdx + add r2, r2, r9 + + mov 
r3, #1 @ mov $1,%ecx + lsr r8, r1, #1 @ shr $1,%r8d + and r8, r8, #32 @ and $32,%r8d + eor r8, r8, #32 @ xor $32,%r8d # nbits==192?0:32 + bl _vpaes_schedule_core + + vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return +.size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key + +@ Additional constants for converting to bsaes. +.type _vpaes_convert_consts,%object +.align 4 +_vpaes_convert_consts: +@ .Lk_opt_then_skew applies skew(opt(x)) XOR 0x63, where skew is the linear +@ transform in the AES S-box. 0x63 is incorporated into the low half of the +@ table. This was computed with the following script: +@ +@ def u64s_to_u128(x, y): +@ return x | (y << 64) +@ def u128_to_u64s(w): +@ return w & ((1<<64)-1), w >> 64 +@ def get_byte(w, i): +@ return (w >> (i*8)) & 0xff +@ def apply_table(table, b): +@ lo = b & 0xf +@ hi = b >> 4 +@ return get_byte(table[0], lo) ^ get_byte(table[1], hi) +@ def opt(b): +@ table = [ +@ u64s_to_u128(0xFF9F4929D6B66000, 0xF7974121DEBE6808), +@ u64s_to_u128(0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0), +@ ] +@ return apply_table(table, b) +@ def rot_byte(b, n): +@ return 0xff & ((b << n) | (b >> (8-n))) +@ def skew(x): +@ return (x ^ rot_byte(x, 1) ^ rot_byte(x, 2) ^ rot_byte(x, 3) ^ +@ rot_byte(x, 4)) +@ table = [0, 0] +@ for i in range(16): +@ table[0] |= (skew(opt(i)) ^ 0x63) << (i*8) +@ table[1] |= skew(opt(i<<4)) << (i*8) +@ print(" .quad 0x%016x, 0x%016x" % u128_to_u64s(table[0])) +@ print(" .quad 0x%016x, 0x%016x" % u128_to_u64s(table[1])) +.Lk_opt_then_skew: +.quad 0x9cb8436798bc4763, 0x6440bb9f6044bf9b +.quad 0x1f30062936192f00, 0xb49bad829db284ab + +@ .Lk_decrypt_transform is a permutation which performs an 8-bit left-rotation +@ followed by a byte-swap on each 32-bit word of a vector. E.g., 0x11223344 +@ becomes 0x22334411 and then 0x11443322. 
+.Lk_decrypt_transform: +.quad 0x0704050603000102, 0x0f0c0d0e0b08090a +.size _vpaes_convert_consts,.-_vpaes_convert_consts + +@ void vpaes_encrypt_key_to_bsaes(AES_KEY *bsaes, const AES_KEY *vpaes); +.globl vpaes_encrypt_key_to_bsaes +.hidden vpaes_encrypt_key_to_bsaes +.type vpaes_encrypt_key_to_bsaes,%function +.align 4 +vpaes_encrypt_key_to_bsaes: + stmdb sp!, {r11, lr} + + @ See _vpaes_schedule_core for the key schedule logic. In particular, + @ _vpaes_schedule_transform(.Lk_ipt) (section 2.2 of the paper), + @ _vpaes_schedule_mangle (section 4.3), and .Lschedule_mangle_last + @ contain the transformations not in the bsaes representation. This + @ function inverts those transforms. + @ + @ Note also that bsaes-armv7.pl expects aes-armv4.pl's key + @ representation, which does not match the other aes_nohw_* + @ implementations. The ARM aes_nohw_* stores each 32-bit word + @ byteswapped, as a convenience for (unsupported) big-endian ARM, at the + @ cost of extra REV and VREV32 operations in little-endian ARM. + + vmov.i8 q9, #0x0f @ Required by _vpaes_schedule_transform + adr r2, .Lk_mc_forward @ Must be aligned to 8 mod 16. + add r3, r2, 0x90 @ .Lk_sr+0x10-.Lk_mc_forward = 0x90 (Apple's toolchain doesn't support the expression) + + vld1.64 {q12}, [r2] + vmov.i8 q10, #0x5b @ .Lk_s63 from vpaes-x86_64 + adr r11, .Lk_opt @ Must be aligned to 8 mod 16. + vmov.i8 q11, #0x63 @ .LK_s63 without .Lk_ipt applied + + @ vpaes stores one fewer round count than bsaes, but the number of keys + @ is the same. + ldr r2, [r1,#240] + add r2, r2, #1 + str r2, [r0,#240] + + @ The first key is transformed with _vpaes_schedule_transform(.Lk_ipt). + @ Invert this with .Lk_opt. + vld1.64 {q0}, [r1]! + bl _vpaes_schedule_transform + vrev32.8 q0, q0 + vst1.64 {q0}, [r0]! + + @ The middle keys have _vpaes_schedule_transform(.Lk_ipt) applied, + @ followed by _vpaes_schedule_mangle. _vpaes_schedule_mangle XORs 0x63, + @ multiplies by the circulant 0,1,1,1, then applies ShiftRows. 
+.Loop_enc_key_to_bsaes: + vld1.64 {q0}, [r1]! + + @ Invert the ShiftRows step (see .Lschedule_mangle_both). Note we cycle + @ r3 in the opposite direction and start at .Lk_sr+0x10 instead of 0x30. + @ We use r3 rather than r8 to avoid a callee-saved register. + vld1.64 {q1}, [r3] + vtbl.8 d4, {q0}, d2 + vtbl.8 d5, {q0}, d3 + add r3, r3, #16 + and r3, r3, #~(1<<6) + vmov q0, q2 + + @ Handle the last key differently. + subs r2, r2, #1 + beq .Loop_enc_key_to_bsaes_last + + @ Multiply by the circulant. This is its own inverse. + vtbl.8 d2, {q0}, d24 + vtbl.8 d3, {q0}, d25 + vmov q0, q1 + vtbl.8 d4, {q1}, d24 + vtbl.8 d5, {q1}, d25 + veor q0, q0, q2 + vtbl.8 d2, {q2}, d24 + vtbl.8 d3, {q2}, d25 + veor q0, q0, q1 + + @ XOR and finish. + veor q0, q0, q10 + bl _vpaes_schedule_transform + vrev32.8 q0, q0 + vst1.64 {q0}, [r0]! + b .Loop_enc_key_to_bsaes + +.Loop_enc_key_to_bsaes_last: + @ The final key does not have a basis transform (note + @ .Lschedule_mangle_last inverts the original transform). It only XORs + @ 0x63 and applies ShiftRows. The latter was already inverted in the + @ loop. Note that, because we act on the original representation, we use + @ q11, not q10. + veor q0, q0, q11 + vrev32.8 q0, q0 + vst1.64 {q0}, [r0] + + @ Wipe registers which contained key material. + veor q0, q0, q0 + veor q1, q1, q1 + veor q2, q2, q2 + + ldmia sp!, {r11, pc} @ return +.size vpaes_encrypt_key_to_bsaes,.-vpaes_encrypt_key_to_bsaes + +@ void vpaes_decrypt_key_to_bsaes(AES_KEY *vpaes, const AES_KEY *bsaes); +.globl vpaes_decrypt_key_to_bsaes +.hidden vpaes_decrypt_key_to_bsaes +.type vpaes_decrypt_key_to_bsaes,%function +.align 4 +vpaes_decrypt_key_to_bsaes: + stmdb sp!, {r11, lr} + + @ See _vpaes_schedule_core for the key schedule logic. Note vpaes + @ computes the decryption key schedule in reverse. Additionally, + @ aes-x86_64.pl shares some transformations, so we must only partially + @ invert vpaes's transformations. 
In general, vpaes computes in a + @ different basis (.Lk_ipt and .Lk_opt) and applies the inverses of + @ MixColumns, ShiftRows, and the affine part of the AES S-box (which is + @ split into a linear skew and XOR of 0x63). We undo all but MixColumns. + @ + @ Note also that bsaes-armv7.pl expects aes-armv4.pl's key + @ representation, which does not match the other aes_nohw_* + @ implementations. The ARM aes_nohw_* stores each 32-bit word + @ byteswapped, as a convenience for (unsupported) big-endian ARM, at the + @ cost of extra REV and VREV32 operations in little-endian ARM. + + adr r2, .Lk_decrypt_transform + adr r3, .Lk_sr+0x30 + adr r11, .Lk_opt_then_skew @ Input to _vpaes_schedule_transform. + vld1.64 {q12}, [r2] @ Reuse q12 from encryption. + vmov.i8 q9, #0x0f @ Required by _vpaes_schedule_transform + + @ vpaes stores one fewer round count than bsaes, but the number of keys + @ is the same. + ldr r2, [r1,#240] + add r2, r2, #1 + str r2, [r0,#240] + + @ Undo the basis change and reapply the S-box affine transform. See + @ .Lschedule_mangle_last. + vld1.64 {q0}, [r1]! + bl _vpaes_schedule_transform + vrev32.8 q0, q0 + vst1.64 {q0}, [r0]! + + @ See _vpaes_schedule_mangle for the transform on the middle keys. Note + @ it simultaneously inverts MixColumns and the S-box affine transform. + @ See .Lk_dksd through .Lk_dks9. +.Loop_dec_key_to_bsaes: + vld1.64 {q0}, [r1]! + + @ Invert the ShiftRows step (see .Lschedule_mangle_both). Note going + @ forwards cancels inverting for which direction we cycle r3. We use r3 + @ rather than r8 to avoid a callee-saved register. + vld1.64 {q1}, [r3] + vtbl.8 d4, {q0}, d2 + vtbl.8 d5, {q0}, d3 + add r3, r3, #64-16 + and r3, r3, #~(1<<6) + vmov q0, q2 + + @ Handle the last key differently. + subs r2, r2, #1 + beq .Loop_dec_key_to_bsaes_last + + @ Undo the basis change and reapply the S-box affine transform. + bl _vpaes_schedule_transform + + @ Rotate each word by 8 bytes (cycle the rows) and then byte-swap. 
We + @ combine the two operations in .Lk_decrypt_transform. + @ + @ TODO(davidben): Where does the rotation come from? + vtbl.8 d2, {q0}, d24 + vtbl.8 d3, {q0}, d25 + + vst1.64 {q1}, [r0]! + b .Loop_dec_key_to_bsaes + +.Loop_dec_key_to_bsaes_last: + @ The final key only inverts ShiftRows (already done in the loop). See + @ .Lschedule_am_decrypting. Its basis is not transformed. + vrev32.8 q0, q0 + vst1.64 {q0}, [r0]! + + @ Wipe registers which contained key material. + veor q0, q0, q0 + veor q1, q1, q1 + veor q2, q2, q2 + + ldmia sp!, {r11, pc} @ return +.size vpaes_decrypt_key_to_bsaes,.-vpaes_decrypt_key_to_bsaes +.globl vpaes_ctr32_encrypt_blocks +.hidden vpaes_ctr32_encrypt_blocks +.type vpaes_ctr32_encrypt_blocks,%function +.align 4 +vpaes_ctr32_encrypt_blocks: + mov ip, sp + stmdb sp!, {r7,r8,r9,r10,r11, lr} + @ This function uses q4-q7 (d8-d15), which are callee-saved. + vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + + cmp r2, #0 + @ r8 is passed on the stack. + ldr r8, [ip] + beq .Lctr32_done + + @ _vpaes_encrypt_core expects the key in r2, so swap r2 and r3. + mov r9, r3 + mov r3, r2 + mov r2, r9 + + @ Load the IV and counter portion. + ldr r7, [r8, #12] + vld1.8 {q7}, [r8] + + bl _vpaes_preheat + rev r7, r7 @ The counter is big-endian. + +.Lctr32_loop: + vmov q0, q7 + vld1.8 {q6}, [r0]! @ .Load input ahead of time + bl _vpaes_encrypt_core + veor q0, q0, q6 @ XOR input and result + vst1.8 {q0}, [r1]! + subs r3, r3, #1 + @ Update the counter. 
+ add r7, r7, #1 + rev r9, r7 + vmov.32 d15[1], r9 + bne .Lctr32_loop + +.Lctr32_done: + vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return +.size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) diff --git a/third_party/boringssl/gen/bcm/vpaes-armv8-apple.S b/third_party/boringssl/gen/bcm/vpaes-armv8-apple.S new file mode 100644 index 00000000..d932f510 --- /dev/null +++ b/third_party/boringssl/gen/bcm/vpaes-armv8-apple.S @@ -0,0 +1,1222 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) +.section __TEXT,__const + + +.align 7 // totally strategic alignment +_vpaes_consts: +Lk_mc_forward: // mc_forward +.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 +.quad 0x080B0A0904070605, 0x000302010C0F0E0D +.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 +.quad 0x000302010C0F0E0D, 0x080B0A0904070605 +Lk_mc_backward: // mc_backward +.quad 0x0605040702010003, 0x0E0D0C0F0A09080B +.quad 0x020100030E0D0C0F, 0x0A09080B06050407 +.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 +.quad 0x0A09080B06050407, 0x020100030E0D0C0F +Lk_sr: // sr +.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 +.quad 0x030E09040F0A0500, 0x0B06010C07020D08 +.quad 0x0F060D040B020900, 0x070E050C030A0108 +.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +// +// "Hot" constants +// +Lk_inv: // inv, inva +.quad 0x0E05060F0D080180, 0x040703090A0B0C02 +.quad 0x01040A060F0B0780, 0x030D0E0C02050809 +Lk_ipt: // input transform (lo, hi) +.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 +.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 +Lk_sbo: // sbou, sbot +.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 +.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA +Lk_sb1: // sb1u, sb1t +.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF +.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +Lk_sb2: 
// sb2u, sb2t +.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A +.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD + +// +// Decryption stuff +// +Lk_dipt: // decryption input transform +.quad 0x0F505B040B545F00, 0x154A411E114E451A +.quad 0x86E383E660056500, 0x12771772F491F194 +Lk_dsbo: // decryption sbox final output +.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D +.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C +Lk_dsb9: // decryption sbox output *9*u, *9*t +.quad 0x851C03539A86D600, 0xCAD51F504F994CC9 +.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 +Lk_dsbd: // decryption sbox output *D*u, *D*t +.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 +.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 +Lk_dsbb: // decryption sbox output *B*u, *B*t +.quad 0xD022649296B44200, 0x602646F6B0F2D404 +.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B +Lk_dsbe: // decryption sbox output *E*u, *E*t +.quad 0x46F2929626D4D000, 0x2242600464B4F6B0 +.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 + +// +// Key schedule constants +// +Lk_dksd: // decryption key schedule: invskew x*D +.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 +.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E +Lk_dksb: // decryption key schedule: invskew x*B +.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 +.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 +Lk_dkse: // decryption key schedule: invskew x*E + 0x63 +.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 +.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 +Lk_dks9: // decryption key schedule: invskew x*9 +.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC +.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE + +Lk_rcon: // rcon +.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +Lk_opt: // output transform +.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 +.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 +Lk_deskew: // deskew tables: inverts the sbox's "skew" +.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A +.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 + +.byte 
86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 +.align 2 + +.align 6 + +.text +## +## _aes_preheat +## +## Fills register %r10 -> .aes_consts (so you can -fPIC) +## and %xmm9-%xmm15 as specified below. +## + +.align 4 +_vpaes_encrypt_preheat: + adrp x10, Lk_inv@PAGE + add x10, x10, Lk_inv@PAGEOFF + movi v17.16b, #0x0f + ld1 {v18.2d,v19.2d}, [x10],#32 // Lk_inv + ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64 // Lk_ipt, Lk_sbo + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10] // Lk_sb1, Lk_sb2 + ret + + +## +## _aes_encrypt_core +## +## AES-encrypt %xmm0. +## +## Inputs: +## %xmm0 = input +## %xmm9-%xmm15 as in _vpaes_preheat +## (%rdx) = scheduled keys +## +## Output in %xmm0 +## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax +## Preserves %xmm6 - %xmm8 so you get some local vectors +## +## + +.align 4 +_vpaes_encrypt_core: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + adrp x11, Lk_mc_forward@PAGE+16 + add x11, x11, Lk_mc_forward@PAGEOFF+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + b Lenc_entry + +.align 4 +Lenc_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[] + tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b 
// vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + sub w8, w8, #1 // nr-- + +Lenc_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, Lenc_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), 
%xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[] + tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + ret + + +.globl _vpaes_encrypt +.private_extern _vpaes_encrypt + +.align 4 +_vpaes_encrypt: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1 {v7.16b}, [x0] + bl _vpaes_encrypt_preheat + bl _vpaes_encrypt_core + st1 {v0.16b}, [x1] + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + + +.align 4 +_vpaes_encrypt_2x: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + adrp x11, Lk_mc_forward@PAGE+16 + add x11, x11, Lk_mc_forward@PAGEOFF+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + and v9.16b, v15.16b, v17.16b + ushr v8.16b, v15.16b, #4 + tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + tbl v9.16b, {v20.16b}, v9.16b + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + tbl v10.16b, {v21.16b}, v8.16b + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v8.16b, v9.16b, v16.16b + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + eor v8.16b, v8.16b, v10.16b + b Lenc_2x_entry + +.align 4 +Lenc_2x_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + tbl v12.16b, {v25.16b}, v10.16b + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[] + tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + tbl v8.16b, {v24.16b}, 
v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + tbl v13.16b, {v27.16b}, v10.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + tbl v10.16b, {v26.16b}, v11.16b + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + tbl v11.16b, {v8.16b}, v1.16b + eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + eor v10.16b, v10.16b, v13.16b + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + tbl v8.16b, {v8.16b}, v4.16b + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + eor v11.16b, v11.16b, v10.16b + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + tbl v12.16b, {v11.16b},v1.16b + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + eor v8.16b, v8.16b, v11.16b + and x11, x11, #~(1<<6) // and $0x30, %r11 # ... 
mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + eor v8.16b, v8.16b, v12.16b + sub w8, w8, #1 // nr-- + +Lenc_2x_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + and v9.16b, v8.16b, v17.16b + ushr v8.16b, v8.16b, #4 + tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + tbl v13.16b, {v19.16b},v9.16b + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + eor v9.16b, v9.16b, v8.16b + tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v11.16b, {v18.16b},v8.16b + tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + tbl v12.16b, {v18.16b},v9.16b + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v11.16b, v11.16b, v13.16b + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + eor v12.16b, v12.16b, v13.16b + tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v10.16b, {v18.16b},v11.16b + tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + tbl v11.16b, {v18.16b},v12.16b + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v10.16b, v10.16b, v9.16b + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + eor v11.16b, v11.16b, v8.16b + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, Lenc_2x_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + tbl v12.16b, {v22.16b}, v10.16b + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[] + tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + tbl v8.16b, {v23.16b}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, 
v12.16b, v16.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + tbl v1.16b, {v8.16b},v1.16b + ret + + + +.align 4 +_vpaes_decrypt_preheat: + adrp x10, Lk_inv@PAGE + add x10, x10, Lk_inv@PAGEOFF + movi v17.16b, #0x0f + adrp x11, Lk_dipt@PAGE + add x11, x11, Lk_dipt@PAGEOFF + ld1 {v18.2d,v19.2d}, [x10],#32 // Lk_inv + ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64 // Lk_dipt, Lk_dsbo + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64 // Lk_dsb9, Lk_dsbd + ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x11] // Lk_dsbb, Lk_dsbe + ret + + +## +## Decryption core +## +## Same API as encryption core. +## + +.align 4 +_vpaes_decrypt_core: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + + // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo + lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11 + eor x11, x11, #0x30 // xor $0x30, %r11 + adrp x10, Lk_sr@PAGE + add x10, x10, Lk_sr@PAGEOFF + and x11, x11, #0x30 // and $0x30, %r11 + add x11, x11, x10 + adrp x10, Lk_mc_forward@PAGE+48 + add x10, x10, Lk_mc_forward@PAGEOFF+48 + + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key + and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + ld1 {v5.2d}, [x10] // vmovdqa Lk_mc_forward+48(%rip), %xmm5 + // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi + tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + b Ldec_entry + +.align 4 +Ldec_loop: +// +// Inverse mix columns +// + // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u + // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t + tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u + tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t + eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 + 
// vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt + + tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu + tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt + + tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu + tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet + + tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu + tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5 + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + sub w8, w8, #1 // sub $1,%rax # nr-- + +Ldec_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = 
iak = 1/i + a/k + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 + cbnz w8, Ldec_loop + + // middle of last round + // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot + ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # Lk_sr-Lk_dsbd=-0x160 + tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k + eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A + tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0 + ret + + +.globl _vpaes_decrypt +.private_extern _vpaes_decrypt + +.align 4 +_vpaes_decrypt: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + ld1 {v7.16b}, [x0] + bl _vpaes_decrypt_preheat + bl _vpaes_decrypt_core + st1 {v0.16b}, [x1] + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// v14-v15 input, v0-v1 output + +.align 4 +_vpaes_decrypt_2x: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + + // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo + lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11 + eor x11, x11, #0x30 // xor $0x30, %r11 + adrp x10, Lk_sr@PAGE + add x10, x10, Lk_sr@PAGEOFF + and x11, x11, #0x30 // and $0x30, %r11 + add x11, x11, x10 + adrp x10, Lk_mc_forward@PAGE+48 + add x10, x10, Lk_mc_forward@PAGEOFF+48 + + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key + and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + and v9.16b, v15.16b, v17.16b + ushr v8.16b, v15.16b, #4 + tbl v2.16b, {v20.16b},v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + tbl v10.16b, {v20.16b},v9.16b + ld1 {v5.2d}, [x10] // vmovdqa Lk_mc_forward+48(%rip), %xmm5 + // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi + tbl v0.16b, {v21.16b},v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + tbl v8.16b, {v21.16b},v8.16b + eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 + eor v10.16b, v10.16b, v16.16b + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + eor v8.16b, v8.16b, v10.16b + b Ldec_2x_entry + +.align 4 +Ldec_2x_loop: +// +// Inverse mix columns +// + // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u + // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t + tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u + tbl v12.16b, {v24.16b}, v10.16b + tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t + tbl v9.16b, {v25.16b}, v11.16b + eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 + eor v8.16b, v12.16b, v16.16b + // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x10(%r10),%xmm1 
# 0 : sbdt + + tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu + tbl v12.16b, {v26.16b}, v10.16b + tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v8.16b, {v8.16b},v5.16b + tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt + tbl v9.16b, {v27.16b}, v11.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + eor v8.16b, v8.16b, v12.16b + // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b + // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt + + tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu + tbl v12.16b, {v28.16b}, v10.16b + tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v8.16b, {v8.16b},v5.16b + tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt + tbl v9.16b, {v29.16b}, v11.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + eor v8.16b, v8.16b, v12.16b + // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b + // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet + + tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu + tbl v12.16b, {v30.16b}, v10.16b + tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v8.16b, {v8.16b},v5.16b + tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet + tbl v9.16b, {v31.16b}, v11.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + eor v8.16b, v8.16b, v12.16b + ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5 + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b + sub w8, w8, #1 // sub $1,%rax # nr-- + +Ldec_2x_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + and v9.16b, 
v8.16b, v17.16b + ushr v8.16b, v8.16b, #4 + tbl v2.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + tbl v10.16b, {v19.16b},v9.16b + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + eor v9.16b, v9.16b, v8.16b + tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v11.16b, {v18.16b},v8.16b + tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + tbl v12.16b, {v18.16b},v9.16b + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v11.16b, v11.16b, v10.16b + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + eor v12.16b, v12.16b, v10.16b + tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v10.16b, {v18.16b},v11.16b + tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + tbl v11.16b, {v18.16b},v12.16b + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v10.16b, v10.16b, v9.16b + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + eor v11.16b, v11.16b, v8.16b + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 + cbnz w8, Ldec_2x_loop + + // middle of last round + // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + tbl v12.16b, {v22.16b}, v10.16b + // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot + tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t + tbl v9.16b, {v23.16b}, v11.16b + ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # Lk_sr-Lk_dsbd=-0x160 + eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A + eor v8.16b, v9.16b, v12.16b + tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0 + tbl v1.16b, {v8.16b},v2.16b + ret + +######################################################## +## ## +## AES key schedule ## +## ## 
+######################################################## + +.align 4 +_vpaes_key_preheat: + adrp x10, Lk_inv@PAGE + add x10, x10, Lk_inv@PAGEOFF + movi v16.16b, #0x5b // Lk_s63 + adrp x11, Lk_sb1@PAGE + add x11, x11, Lk_sb1@PAGEOFF + movi v17.16b, #0x0f // Lk_s0F + ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10] // Lk_inv, Lk_ipt + adrp x10, Lk_dksd@PAGE + add x10, x10, Lk_dksd@PAGEOFF + ld1 {v22.2d,v23.2d}, [x11] // Lk_sb1 + adrp x11, Lk_mc_forward@PAGE + add x11, x11, Lk_mc_forward@PAGEOFF + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64 // Lk_dksd, Lk_dksb + ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64 // Lk_dkse, Lk_dks9 + ld1 {v8.2d}, [x10] // Lk_rcon + ld1 {v9.2d}, [x11] // Lk_mc_forward[0] + ret + + + +.align 4 +_vpaes_schedule_core: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp,#-16]! + add x29,sp,#0 + + bl _vpaes_key_preheat // load the tables + + ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned) + + // input transform + mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3 + bl _vpaes_schedule_transform + mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7 + + adrp x10, Lk_sr@PAGE // lea Lk_sr(%rip),%r10 + add x10, x10, Lk_sr@PAGEOFF + + add x8, x8, x10 + cbnz w3, Lschedule_am_decrypting + + // encrypting, output zeroth round key after transform + st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) + b Lschedule_go + +Lschedule_am_decrypting: + // decrypting, output zeroth round key after shiftrows + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx) + eor x8, x8, #0x30 // xor $0x30, %r8 + +Lschedule_go: + cmp w1, #192 // cmp $192, %esi + b.hi Lschedule_256 + b.eq Lschedule_192 + // 128: fall though + +## +## .schedule_128 +## +## 128-bit specific part of key schedule. +## +## This schedule is really simple, because all its parts +## are accomplished by the subroutines. 
+## +Lschedule_128: + mov x0, #10 // mov $10, %esi + +Loop_schedule_128: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_round + cbz x0, Lschedule_mangle_last + bl _vpaes_schedule_mangle // write output + b Loop_schedule_128 + +## +## .aes_schedule_192 +## +## 192-bit specific part of key schedule. +## +## The main body of this schedule is the same as the 128-bit +## schedule, but with more smearing. The long, high side is +## stored in %xmm7 as before, and the short, low side is in +## the high bits of %xmm6. +## +## This schedule is somewhat nastier, however, because each +## round produces 192 bits of key material, or 1.5 round keys. +## Therefore, on each cycle we do 2 rounds and produce 3 round +## keys. +## +.align 4 +Lschedule_192: + sub x0, x0, #8 + ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) + bl _vpaes_schedule_transform // input transform + mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part + eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4 + ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros + mov x0, #4 // mov $4, %esi + +Loop_schedule_192: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_round + ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0 + bl _vpaes_schedule_mangle // save key n + bl _vpaes_schedule_192_smear + bl _vpaes_schedule_mangle // save key n+1 + bl _vpaes_schedule_round + cbz x0, Lschedule_mangle_last + bl _vpaes_schedule_mangle // save key n+2 + bl _vpaes_schedule_192_smear + b Loop_schedule_192 + +## +## .aes_schedule_256 +## +## 256-bit specific part of key schedule. +## +## The structure here is very similar to the 128-bit +## schedule, but with an additional "low side" in +## %xmm6. The low side's rounds are the same as the +## high side's, except no rcon and no rotation. 
+## +.align 4 +Lschedule_256: + ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) + bl _vpaes_schedule_transform // input transform + mov x0, #7 // mov $7, %esi + +Loop_schedule_256: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_mangle // output low result + mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 + + // high round + bl _vpaes_schedule_round + cbz x0, Lschedule_mangle_last + bl _vpaes_schedule_mangle + + // low round. swap xmm7 and xmm6 + dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 + movi v4.16b, #0 + mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5 + mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7 + bl _vpaes_schedule_low_round + mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7 + + b Loop_schedule_256 + +## +## .aes_schedule_mangle_last +## +## Mangler for last round of key schedule +## Mangles %xmm0 +## when encrypting, outputs out(%xmm0) ^ 63 +## when decrypting, outputs unskew(%xmm0) +## +## Always called right before return... jumps to cleanup and exits +## +.align 4 +Lschedule_mangle_last: + // schedule last round key from xmm0 + adrp x11, Lk_deskew@PAGE // lea Lk_deskew(%rip),%r11 # prepare to deskew + add x11, x11, Lk_deskew@PAGEOFF + + cbnz w3, Lschedule_mangle_last_dec + + // encrypting + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1 + adrp x11, Lk_opt@PAGE // lea Lk_opt(%rip), %r11 # prepare to output transform + add x11, x11, Lk_opt@PAGEOFF + add x2, x2, #32 // add $32, %rdx + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute + +Lschedule_mangle_last_dec: + ld1 {v20.2d,v21.2d}, [x11] // reload constants + sub x2, x2, #16 // add $-16, %rdx + eor v0.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm0 + bl _vpaes_schedule_transform // output transform + st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) # save last key + + // cleanup + eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0 + eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 + eor v2.16b, v2.16b, v2.16b // vpxor 
%xmm2, %xmm2, %xmm2 + eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3 + eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 + eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5 + eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6 + eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7 + ldp x29, x30, [sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +## +## .aes_schedule_192_smear +## +## Smear the short, low side in the 192-bit key schedule. +## +## Inputs: +## %xmm7: high side, b a x y +## %xmm6: low side, d c 0 0 +## %xmm13: 0 +## +## Outputs: +## %xmm6: b+c+d b+c 0 0 +## %xmm0: b+c+d b+c b a +## + +.align 4 +_vpaes_schedule_192_smear: + movi v1.16b, #0 + dup v0.4s, v7.s[3] + ins v1.s[3], v6.s[2] // vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 + ins v0.s[0], v7.s[2] // vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a + eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 + eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 + eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a + mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0 + ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros + ret + + +## +## .aes_schedule_round +## +## Runs one main round of the key schedule on %xmm0, %xmm7 +## +## Specifically, runs subbytes on the high dword of %xmm0 +## then rotates it by one byte and xors into the low dword of +## %xmm7. +## +## Adds rcon from low byte of %xmm8, then rotates %xmm8 for +## next rcon. +## +## Smears the dwords of %xmm7 by xoring the low into the +## second low, result into third, result into highest. +## +## Returns results in %xmm7 = %xmm0. +## Clobbers %xmm1-%xmm4, %r11. 
+## + +.align 4 +_vpaes_schedule_round: + // extract rcon from xmm8 + movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4 + ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1 + ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8 + eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 + + // rotate + dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 + ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0 + + // fall through... + + // low round: same as high round, but no rotation and no rcon. +_vpaes_schedule_low_round: + // smear xmm7 + ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1 + eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 + ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4 + + // subbytes + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7 + tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v7.16b, v7.16b, v16.16b // vpxor Lk_s63(%rip), %xmm7, %xmm7 + tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak + eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io + eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo + tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou + tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t + eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output + + // 
add in smeared stuff + eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0 + eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7 + ret + + +## +## .aes_schedule_transform +## +## Linear-transform %xmm0 according to tables at (%r11) +## +## Requires that %xmm9 = 0x0F0F... as in preheat +## Output in %xmm0 +## Clobbers %xmm1, %xmm2 +## + +.align 4 +_vpaes_schedule_transform: + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + // vmovdqa (%r11), %xmm2 # lo + tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + // vmovdqa 16(%r11), %xmm1 # hi + tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + ret + + +## +## .aes_schedule_mangle +## +## Mangle xmm0 from (basis-transformed) standard version +## to our version. +## +## On encrypt, +## xor with 0x63 +## multiply by circulant 0,1,1,1 +## apply shiftrows transform +## +## On decrypt, +## xor with 0x63 +## multiply by "inverse mixcolumns" circulant E,B,D,9 +## deskew +## apply shiftrows transform +## +## +## Writes out to (%rdx), and increments or decrements it +## Keeps track of round number mod 4 in %r8 +## Preserves xmm0 +## Clobbers xmm1-xmm5 +## + +.align 4 +_vpaes_schedule_mangle: + mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later + // vmovdqa .Lk_mc_forward(%rip),%xmm5 + cbnz w3, Lschedule_mangle_dec + + // encrypting + eor v4.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm4 + add x2, x2, #16 // add $16, %rdx + tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4 + tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1 + tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3 + eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4 + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3 + + b Lschedule_mangle_both +.align 4 +Lschedule_mangle_dec: + // inverse mix columns + // lea 
.Lk_dksd(%rip),%r11 + ushr v1.16b, v4.16b, #4 // vpsrlb $4, %xmm4, %xmm1 # 1 = hi + and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo + + // vmovdqa 0x00(%r11), %xmm2 + tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + // vmovdqa 0x10(%r11), %xmm3 + tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 + tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 + + // vmovdqa 0x20(%r11), %xmm2 + tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 + // vmovdqa 0x30(%r11), %xmm3 + tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 + tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 + + // vmovdqa 0x40(%r11), %xmm2 + tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 + // vmovdqa 0x50(%r11), %xmm3 + tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 + + // vmovdqa 0x60(%r11), %xmm2 + tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 + // vmovdqa 0x70(%r11), %xmm4 + tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4 + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 + eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3 + + sub x2, x2, #16 // add $-16, %rdx + +Lschedule_mangle_both: + tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + add x8, x8, #48 // add $-16, %r8 + and x8, x8, #~(1<<6) // and $0x30, %r8 + st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx) + ret + + +.globl _vpaes_set_encrypt_key +.private_extern _vpaes_set_encrypt_key + +.align 4 +_vpaes_set_encrypt_key: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + + lsr w9, w1, #5 // shr $5,%eax + add w9, w9, #5 // $5,%eax + str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + + mov w3, #0 // mov $0,%ecx + mov x8, #0x30 // mov $0x30,%r8d + bl _vpaes_schedule_core + eor x0, x0, x0 + + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +.globl _vpaes_set_decrypt_key +.private_extern _vpaes_set_decrypt_key + +.align 4 +_vpaes_set_decrypt_key: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + + lsr w9, w1, #5 // shr $5,%eax + add w9, w9, #5 // $5,%eax + str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + lsl w9, w9, #4 // shl $4,%eax + add x2, x2, #16 // lea 16(%rdx,%rax),%rdx + add x2, x2, x9 + + mov w3, #1 // mov $1,%ecx + lsr w8, w1, #1 // shr $1,%r8d + and x8, x8, #32 // and $32,%r8d + eor x8, x8, #32 // xor $32,%r8d # nbits==192?0:32 + bl _vpaes_schedule_core + + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl _vpaes_cbc_encrypt +.private_extern _vpaes_cbc_encrypt + +.align 4 +_vpaes_cbc_encrypt: + AARCH64_SIGN_LINK_REGISTER + cbz x2, Lcbc_abort + cmp w5, #0 // check direction + b.eq vpaes_cbc_decrypt + + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + mov x17, x2 // reassign + mov x2, x3 // reassign + + ld1 {v0.16b}, [x4] // load ivec + bl _vpaes_encrypt_preheat + b Lcbc_enc_loop + +.align 4 +Lcbc_enc_loop: + ld1 {v7.16b}, [x0],#16 // load input + eor v7.16b, v7.16b, v0.16b // xor with ivec + bl _vpaes_encrypt_core + st1 {v0.16b}, [x1],#16 // save output + subs x17, x17, #16 + b.hi Lcbc_enc_loop + + st1 {v0.16b}, [x4] // write ivec + + ldp x29,x30,[sp],#16 +Lcbc_abort: + AARCH64_VALIDATE_LINK_REGISTER + ret + + + +.align 4 +vpaes_cbc_decrypt: + // Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to + // only from vpaes_cbc_encrypt which has already signed the return address. + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + stp d10,d11,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! + + mov x17, x2 // reassign + mov x2, x3 // reassign + ld1 {v6.16b}, [x4] // load ivec + bl _vpaes_decrypt_preheat + tst x17, #16 + b.eq Lcbc_dec_loop2x + + ld1 {v7.16b}, [x0], #16 // load input + bl _vpaes_decrypt_core + eor v0.16b, v0.16b, v6.16b // xor with ivec + orr v6.16b, v7.16b, v7.16b // next ivec value + st1 {v0.16b}, [x1], #16 + subs x17, x17, #16 + b.ls Lcbc_dec_done + +.align 4 +Lcbc_dec_loop2x: + ld1 {v14.16b,v15.16b}, [x0], #32 + bl _vpaes_decrypt_2x + eor v0.16b, v0.16b, v6.16b // xor with ivec + eor v1.16b, v1.16b, v14.16b + orr v6.16b, v15.16b, v15.16b + st1 {v0.16b,v1.16b}, [x1], #32 + subs x17, x17, #32 + b.hi Lcbc_dec_loop2x + +Lcbc_dec_done: + st1 {v6.16b}, [x4] + + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 + ldp d10,d11,[sp],#16 + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl _vpaes_ctr32_encrypt_blocks +.private_extern _vpaes_ctr32_encrypt_blocks + +.align 4 +_vpaes_ctr32_encrypt_blocks: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + stp d10,d11,[sp,#-16]! + stp d12,d13,[sp,#-16]! 
+ stp d14,d15,[sp,#-16]! + + cbz x2, Lctr32_done + + // Note, unlike the other functions, x2 here is measured in blocks, + // not bytes. + mov x17, x2 + mov x2, x3 + + // Load the IV and counter portion. + ldr w6, [x4, #12] + ld1 {v7.16b}, [x4] + + bl _vpaes_encrypt_preheat + tst x17, #1 + rev w6, w6 // The counter is big-endian. + b.eq Lctr32_prep_loop + + // Handle one block so the remaining block count is even for + // _vpaes_encrypt_2x. + ld1 {v6.16b}, [x0], #16 // Load input ahead of time + bl _vpaes_encrypt_core + eor v0.16b, v0.16b, v6.16b // XOR input and result + st1 {v0.16b}, [x1], #16 + subs x17, x17, #1 + // Update the counter. + add w6, w6, #1 + rev w7, w6 + mov v7.s[3], w7 + b.ls Lctr32_done + +Lctr32_prep_loop: + // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x + // uses v14 and v15. + mov v15.16b, v7.16b + mov v14.16b, v7.16b + add w6, w6, #1 + rev w7, w6 + mov v15.s[3], w7 + +Lctr32_loop: + ld1 {v6.16b,v7.16b}, [x0], #32 // Load input ahead of time + bl _vpaes_encrypt_2x + eor v0.16b, v0.16b, v6.16b // XOR input and result + eor v1.16b, v1.16b, v7.16b // XOR input and result (#2) + st1 {v0.16b,v1.16b}, [x1], #32 + subs x17, x17, #2 + // Update the counter. + add w7, w6, #1 + add w6, w6, #2 + rev w7, w7 + mov v14.s[3], w7 + rev w7, w6 + mov v15.s[3], w7 + b.hi Lctr32_loop + +Lctr32_done: + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 + ldp d10,d11,[sp],#16 + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) diff --git a/third_party/boringssl/gen/bcm/vpaes-armv8-linux.S b/third_party/boringssl/gen/bcm/vpaes-armv8-linux.S new file mode 100644 index 00000000..010ccd2a --- /dev/null +++ b/third_party/boringssl/gen/bcm/vpaes-armv8-linux.S @@ -0,0 +1,1222 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) +.section .rodata + +.type _vpaes_consts,%object +.align 7 // totally strategic alignment +_vpaes_consts: +.Lk_mc_forward: // mc_forward +.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 +.quad 0x080B0A0904070605, 0x000302010C0F0E0D +.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 +.quad 0x000302010C0F0E0D, 0x080B0A0904070605 +.Lk_mc_backward: // mc_backward +.quad 0x0605040702010003, 0x0E0D0C0F0A09080B +.quad 0x020100030E0D0C0F, 0x0A09080B06050407 +.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 +.quad 0x0A09080B06050407, 0x020100030E0D0C0F +.Lk_sr: // sr +.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 +.quad 0x030E09040F0A0500, 0x0B06010C07020D08 +.quad 0x0F060D040B020900, 0x070E050C030A0108 +.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +// +// "Hot" constants +// +.Lk_inv: // inv, inva +.quad 0x0E05060F0D080180, 0x040703090A0B0C02 +.quad 0x01040A060F0B0780, 0x030D0E0C02050809 +.Lk_ipt: // input transform (lo, hi) +.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 +.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 +.Lk_sbo: // sbou, sbot +.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 +.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA +.Lk_sb1: // sb1u, sb1t +.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF +.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +.Lk_sb2: // sb2u, sb2t +.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A +.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD + +// +// Decryption stuff +// +.Lk_dipt: // decryption input transform +.quad 0x0F505B040B545F00, 0x154A411E114E451A +.quad 0x86E383E660056500, 0x12771772F491F194 +.Lk_dsbo: // decryption sbox final output +.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D +.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C +.Lk_dsb9: // decryption sbox output *9*u, *9*t +.quad 0x851C03539A86D600, 0xCAD51F504F994CC9 +.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 +.Lk_dsbd: // decryption sbox output *D*u, *D*t +.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 +.quad 
0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 +.Lk_dsbb: // decryption sbox output *B*u, *B*t +.quad 0xD022649296B44200, 0x602646F6B0F2D404 +.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B +.Lk_dsbe: // decryption sbox output *E*u, *E*t +.quad 0x46F2929626D4D000, 0x2242600464B4F6B0 +.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 + +// +// Key schedule constants +// +.Lk_dksd: // decryption key schedule: invskew x*D +.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 +.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E +.Lk_dksb: // decryption key schedule: invskew x*B +.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 +.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 +.Lk_dkse: // decryption key schedule: invskew x*E + 0x63 +.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 +.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 +.Lk_dks9: // decryption key schedule: invskew x*9 +.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC +.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE + +.Lk_rcon: // rcon +.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +.Lk_opt: // output transform +.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 +.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 +.Lk_deskew: // deskew tables: inverts the sbox's "skew" +.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A +.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 + +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 +.align 2 +.size _vpaes_consts,.-_vpaes_consts +.align 6 + +.text +## +## _aes_preheat +## +## Fills register %r10 -> .aes_consts (so you can -fPIC) +## and %xmm9-%xmm15 as specified below. 
+## +.type _vpaes_encrypt_preheat,%function +.align 4 +_vpaes_encrypt_preheat: + adrp x10, .Lk_inv + add x10, x10, :lo12:.Lk_inv + movi v17.16b, #0x0f + ld1 {v18.2d,v19.2d}, [x10],#32 // .Lk_inv + ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64 // .Lk_ipt, .Lk_sbo + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10] // .Lk_sb1, .Lk_sb2 + ret +.size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat + +## +## _aes_encrypt_core +## +## AES-encrypt %xmm0. +## +## Inputs: +## %xmm0 = input +## %xmm9-%xmm15 as in _vpaes_preheat +## (%rdx) = scheduled keys +## +## Output in %xmm0 +## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax +## Preserves %xmm6 - %xmm8 so you get some local vectors +## +## +.type _vpaes_encrypt_core,%function +.align 4 +_vpaes_encrypt_core: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + adrp x11, .Lk_mc_forward+16 + add x11, x11, :lo12:.Lk_mc_forward+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + b .Lenc_entry + +.align 4 +.Lenc_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + ld1 {v4.2d}, 
[x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + sub w8, w8, #1 // nr-- + +.Lenc_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, .Lenc_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // 
vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + ret +.size _vpaes_encrypt_core,.-_vpaes_encrypt_core + +.globl vpaes_encrypt +.hidden vpaes_encrypt +.type vpaes_encrypt,%function +.align 4 +vpaes_encrypt: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1 {v7.16b}, [x0] + bl _vpaes_encrypt_preheat + bl _vpaes_encrypt_core + st1 {v0.16b}, [x1] + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size vpaes_encrypt,.-vpaes_encrypt + +.type _vpaes_encrypt_2x,%function +.align 4 +_vpaes_encrypt_2x: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + adrp x11, .Lk_mc_forward+16 + add x11, x11, :lo12:.Lk_mc_forward+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + and v9.16b, v15.16b, v17.16b + ushr v8.16b, v15.16b, #4 + tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + tbl v9.16b, {v20.16b}, v9.16b + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + tbl v10.16b, {v21.16b}, v8.16b + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v8.16b, v9.16b, v16.16b + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + eor v8.16b, v8.16b, v10.16b + b .Lenc_2x_entry + +.align 4 +.Lenc_2x_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + tbl v12.16b, {v25.16b}, v10.16b + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] + tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + tbl v8.16b, {v24.16b}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + tbl v5.16b, {v27.16b}, 
v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + tbl v13.16b, {v27.16b}, v10.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + tbl v10.16b, {v26.16b}, v11.16b + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + tbl v11.16b, {v8.16b}, v1.16b + eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + eor v10.16b, v10.16b, v13.16b + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + tbl v8.16b, {v8.16b}, v4.16b + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + eor v11.16b, v11.16b, v10.16b + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + tbl v12.16b, {v11.16b},v1.16b + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + eor v8.16b, v8.16b, v11.16b + and x11, x11, #~(1<<6) // and $0x30, %r11 # ... 
mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + eor v8.16b, v8.16b, v12.16b + sub w8, w8, #1 // nr-- + +.Lenc_2x_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + and v9.16b, v8.16b, v17.16b + ushr v8.16b, v8.16b, #4 + tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + tbl v13.16b, {v19.16b},v9.16b + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + eor v9.16b, v9.16b, v8.16b + tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v11.16b, {v18.16b},v8.16b + tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + tbl v12.16b, {v18.16b},v9.16b + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v11.16b, v11.16b, v13.16b + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + eor v12.16b, v12.16b, v13.16b + tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v10.16b, {v18.16b},v11.16b + tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + tbl v11.16b, {v18.16b},v12.16b + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v10.16b, v10.16b, v9.16b + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + eor v11.16b, v11.16b, v8.16b + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, .Lenc_2x_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + tbl v12.16b, {v22.16b}, v10.16b + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] + tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + tbl v8.16b, {v23.16b}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor 
v12.16b, v12.16b, v16.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + tbl v1.16b, {v8.16b},v1.16b + ret +.size _vpaes_encrypt_2x,.-_vpaes_encrypt_2x + +.type _vpaes_decrypt_preheat,%function +.align 4 +_vpaes_decrypt_preheat: + adrp x10, .Lk_inv + add x10, x10, :lo12:.Lk_inv + movi v17.16b, #0x0f + adrp x11, .Lk_dipt + add x11, x11, :lo12:.Lk_dipt + ld1 {v18.2d,v19.2d}, [x10],#32 // .Lk_inv + ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64 // .Lk_dipt, .Lk_dsbo + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64 // .Lk_dsb9, .Lk_dsbd + ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x11] // .Lk_dsbb, .Lk_dsbe + ret +.size _vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat + +## +## Decryption core +## +## Same API as encryption core. +## +.type _vpaes_decrypt_core,%function +.align 4 +_vpaes_decrypt_core: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + + // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo + lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11 + eor x11, x11, #0x30 // xor $0x30, %r11 + adrp x10, .Lk_sr + add x10, x10, :lo12:.Lk_sr + and x11, x11, #0x30 // and $0x30, %r11 + add x11, x11, x10 + adrp x10, .Lk_mc_forward+48 + add x10, x10, :lo12:.Lk_mc_forward+48 + + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key + and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5 + // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi + tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + b .Ldec_entry + +.align 4 +.Ldec_loop: +// +// Inverse mix columns +// + // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u + // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t + tbl v4.16b, {v24.16b}, v2.16b // vpshufb 
%xmm2, %xmm4, %xmm4 # 4 = sb9u + tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t + eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 + // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt + + tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu + tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt + + tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu + tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet + + tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu + tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5 + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + sub w8, w8, #1 // sub $1,%rax # nr-- + +.Ldec_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb 
%xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 + cbnz w8, .Ldec_loop + + // middle of last round + // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot + ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 + tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k + eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A + tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0 + ret +.size _vpaes_decrypt_core,.-_vpaes_decrypt_core + +.globl vpaes_decrypt +.hidden vpaes_decrypt +.type vpaes_decrypt,%function +.align 4 +vpaes_decrypt: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + ld1 {v7.16b}, [x0] + bl _vpaes_decrypt_preheat + bl _vpaes_decrypt_core + st1 {v0.16b}, [x1] + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size vpaes_decrypt,.-vpaes_decrypt + +// v14-v15 input, v0-v1 output +.type _vpaes_decrypt_2x,%function +.align 4 +_vpaes_decrypt_2x: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + + // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo + lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11 + eor x11, x11, #0x30 // xor $0x30, %r11 + adrp x10, .Lk_sr + add x10, x10, :lo12:.Lk_sr + and x11, x11, #0x30 // and $0x30, %r11 + add x11, x11, x10 + adrp x10, .Lk_mc_forward+48 + add x10, x10, :lo12:.Lk_mc_forward+48 + + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key + and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + and v9.16b, v15.16b, v17.16b + ushr v8.16b, v15.16b, #4 + tbl v2.16b, {v20.16b},v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + tbl v10.16b, {v20.16b},v9.16b + ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5 + // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi + tbl v0.16b, {v21.16b},v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + tbl v8.16b, {v21.16b},v8.16b + eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 + eor v10.16b, v10.16b, v16.16b + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + eor v8.16b, v8.16b, v10.16b + b .Ldec_2x_entry + +.align 4 +.Ldec_2x_loop: +// +// Inverse mix columns +// + // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u + // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t + tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u + tbl v12.16b, {v24.16b}, v10.16b + tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t + tbl v9.16b, {v25.16b}, v11.16b + eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 + eor v8.16b, v12.16b, v16.16b + // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b // 
vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt + + tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu + tbl v12.16b, {v26.16b}, v10.16b + tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v8.16b, {v8.16b},v5.16b + tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt + tbl v9.16b, {v27.16b}, v11.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + eor v8.16b, v8.16b, v12.16b + // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b + // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt + + tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu + tbl v12.16b, {v28.16b}, v10.16b + tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v8.16b, {v8.16b},v5.16b + tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt + tbl v9.16b, {v29.16b}, v11.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + eor v8.16b, v8.16b, v12.16b + // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b + // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet + + tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu + tbl v12.16b, {v30.16b}, v10.16b + tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v8.16b, {v8.16b},v5.16b + tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet + tbl v9.16b, {v31.16b}, v11.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + eor v8.16b, v8.16b, v12.16b + ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5 + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b + sub w8, w8, #1 // sub $1,%rax # nr-- + +.Ldec_2x_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, 
v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + and v9.16b, v8.16b, v17.16b + ushr v8.16b, v8.16b, #4 + tbl v2.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + tbl v10.16b, {v19.16b},v9.16b + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + eor v9.16b, v9.16b, v8.16b + tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v11.16b, {v18.16b},v8.16b + tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + tbl v12.16b, {v18.16b},v9.16b + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v11.16b, v11.16b, v10.16b + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + eor v12.16b, v12.16b, v10.16b + tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v10.16b, {v18.16b},v11.16b + tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + tbl v11.16b, {v18.16b},v12.16b + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v10.16b, v10.16b, v9.16b + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + eor v11.16b, v11.16b, v8.16b + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 + cbnz w8, .Ldec_2x_loop + + // middle of last round + // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + tbl v12.16b, {v22.16b}, v10.16b + // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot + tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t + tbl v9.16b, {v23.16b}, v11.16b + ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 + eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A + eor v8.16b, v9.16b, v12.16b + tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0 + tbl v1.16b, {v8.16b},v2.16b + ret +.size _vpaes_decrypt_2x,.-_vpaes_decrypt_2x 
+######################################################## +## ## +## AES key schedule ## +## ## +######################################################## +.type _vpaes_key_preheat,%function +.align 4 +_vpaes_key_preheat: + adrp x10, .Lk_inv + add x10, x10, :lo12:.Lk_inv + movi v16.16b, #0x5b // .Lk_s63 + adrp x11, .Lk_sb1 + add x11, x11, :lo12:.Lk_sb1 + movi v17.16b, #0x0f // .Lk_s0F + ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10] // .Lk_inv, .Lk_ipt + adrp x10, .Lk_dksd + add x10, x10, :lo12:.Lk_dksd + ld1 {v22.2d,v23.2d}, [x11] // .Lk_sb1 + adrp x11, .Lk_mc_forward + add x11, x11, :lo12:.Lk_mc_forward + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64 // .Lk_dksd, .Lk_dksb + ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64 // .Lk_dkse, .Lk_dks9 + ld1 {v8.2d}, [x10] // .Lk_rcon + ld1 {v9.2d}, [x11] // .Lk_mc_forward[0] + ret +.size _vpaes_key_preheat,.-_vpaes_key_preheat + +.type _vpaes_schedule_core,%function +.align 4 +_vpaes_schedule_core: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp,#-16]! + add x29,sp,#0 + + bl _vpaes_key_preheat // load the tables + + ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned) + + // input transform + mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3 + bl _vpaes_schedule_transform + mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7 + + adrp x10, .Lk_sr // lea .Lk_sr(%rip),%r10 + add x10, x10, :lo12:.Lk_sr + + add x8, x8, x10 + cbnz w3, .Lschedule_am_decrypting + + // encrypting, output zeroth round key after transform + st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) + b .Lschedule_go + +.Lschedule_am_decrypting: + // decrypting, output zeroth round key after shiftrows + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx) + eor x8, x8, #0x30 // xor $0x30, %r8 + +.Lschedule_go: + cmp w1, #192 // cmp $192, %esi + b.hi .Lschedule_256 + b.eq .Lschedule_192 + // 128: fall though + +## +## .schedule_128 +## +## 128-bit specific part of key 
schedule. +## +## This schedule is really simple, because all its parts +## are accomplished by the subroutines. +## +.Lschedule_128: + mov x0, #10 // mov $10, %esi + +.Loop_schedule_128: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_round + cbz x0, .Lschedule_mangle_last + bl _vpaes_schedule_mangle // write output + b .Loop_schedule_128 + +## +## .aes_schedule_192 +## +## 192-bit specific part of key schedule. +## +## The main body of this schedule is the same as the 128-bit +## schedule, but with more smearing. The long, high side is +## stored in %xmm7 as before, and the short, low side is in +## the high bits of %xmm6. +## +## This schedule is somewhat nastier, however, because each +## round produces 192 bits of key material, or 1.5 round keys. +## Therefore, on each cycle we do 2 rounds and produce 3 round +## keys. +## +.align 4 +.Lschedule_192: + sub x0, x0, #8 + ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) + bl _vpaes_schedule_transform // input transform + mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part + eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4 + ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros + mov x0, #4 // mov $4, %esi + +.Loop_schedule_192: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_round + ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0 + bl _vpaes_schedule_mangle // save key n + bl _vpaes_schedule_192_smear + bl _vpaes_schedule_mangle // save key n+1 + bl _vpaes_schedule_round + cbz x0, .Lschedule_mangle_last + bl _vpaes_schedule_mangle // save key n+2 + bl _vpaes_schedule_192_smear + b .Loop_schedule_192 + +## +## .aes_schedule_256 +## +## 256-bit specific part of key schedule. +## +## The structure here is very similar to the 128-bit +## schedule, but with an additional "low side" in +## %xmm6. The low side's rounds are the same as the +## high side's, except no rcon and no rotation. 
+## +.align 4 +.Lschedule_256: + ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) + bl _vpaes_schedule_transform // input transform + mov x0, #7 // mov $7, %esi + +.Loop_schedule_256: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_mangle // output low result + mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 + + // high round + bl _vpaes_schedule_round + cbz x0, .Lschedule_mangle_last + bl _vpaes_schedule_mangle + + // low round. swap xmm7 and xmm6 + dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 + movi v4.16b, #0 + mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5 + mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7 + bl _vpaes_schedule_low_round + mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7 + + b .Loop_schedule_256 + +## +## .aes_schedule_mangle_last +## +## Mangler for last round of key schedule +## Mangles %xmm0 +## when encrypting, outputs out(%xmm0) ^ 63 +## when decrypting, outputs unskew(%xmm0) +## +## Always called right before return... jumps to cleanup and exits +## +.align 4 +.Lschedule_mangle_last: + // schedule last round key from xmm0 + adrp x11, .Lk_deskew // lea .Lk_deskew(%rip),%r11 # prepare to deskew + add x11, x11, :lo12:.Lk_deskew + + cbnz w3, .Lschedule_mangle_last_dec + + // encrypting + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1 + adrp x11, .Lk_opt // lea .Lk_opt(%rip), %r11 # prepare to output transform + add x11, x11, :lo12:.Lk_opt + add x2, x2, #32 // add $32, %rdx + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute + +.Lschedule_mangle_last_dec: + ld1 {v20.2d,v21.2d}, [x11] // reload constants + sub x2, x2, #16 // add $-16, %rdx + eor v0.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm0 + bl _vpaes_schedule_transform // output transform + st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) # save last key + + // cleanup + eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0 + eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 + eor v2.16b, v2.16b, v2.16b // vpxor 
%xmm2, %xmm2, %xmm2 + eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3 + eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 + eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5 + eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6 + eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7 + ldp x29, x30, [sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size _vpaes_schedule_core,.-_vpaes_schedule_core + +## +## .aes_schedule_192_smear +## +## Smear the short, low side in the 192-bit key schedule. +## +## Inputs: +## %xmm7: high side, b a x y +## %xmm6: low side, d c 0 0 +## %xmm13: 0 +## +## Outputs: +## %xmm6: b+c+d b+c 0 0 +## %xmm0: b+c+d b+c b a +## +.type _vpaes_schedule_192_smear,%function +.align 4 +_vpaes_schedule_192_smear: + movi v1.16b, #0 + dup v0.4s, v7.s[3] + ins v1.s[3], v6.s[2] // vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 + ins v0.s[0], v7.s[2] // vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a + eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 + eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 + eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a + mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0 + ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros + ret +.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear + +## +## .aes_schedule_round +## +## Runs one main round of the key schedule on %xmm0, %xmm7 +## +## Specifically, runs subbytes on the high dword of %xmm0 +## then rotates it by one byte and xors into the low dword of +## %xmm7. +## +## Adds rcon from low byte of %xmm8, then rotates %xmm8 for +## next rcon. +## +## Smears the dwords of %xmm7 by xoring the low into the +## second low, result into third, result into highest. +## +## Returns results in %xmm7 = %xmm0. +## Clobbers %xmm1-%xmm4, %r11. 
+## +.type _vpaes_schedule_round,%function +.align 4 +_vpaes_schedule_round: + // extract rcon from xmm8 + movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4 + ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1 + ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8 + eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 + + // rotate + dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 + ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0 + + // fall through... + + // low round: same as high round, but no rotation and no rcon. +_vpaes_schedule_low_round: + // smear xmm7 + ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1 + eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 + ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4 + + // subbytes + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7 + tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v7.16b, v7.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm7, %xmm7 + tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak + eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io + eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo + tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou + tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t + eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, 
%xmm1, %xmm1 # 0 = sbox output + + // add in smeared stuff + eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0 + eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7 + ret +.size _vpaes_schedule_round,.-_vpaes_schedule_round + +## +## .aes_schedule_transform +## +## Linear-transform %xmm0 according to tables at (%r11) +## +## Requires that %xmm9 = 0x0F0F... as in preheat +## Output in %xmm0 +## Clobbers %xmm1, %xmm2 +## +.type _vpaes_schedule_transform,%function +.align 4 +_vpaes_schedule_transform: + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + // vmovdqa (%r11), %xmm2 # lo + tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + // vmovdqa 16(%r11), %xmm1 # hi + tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + ret +.size _vpaes_schedule_transform,.-_vpaes_schedule_transform + +## +## .aes_schedule_mangle +## +## Mangle xmm0 from (basis-transformed) standard version +## to our version. 
+## +## On encrypt, +## xor with 0x63 +## multiply by circulant 0,1,1,1 +## apply shiftrows transform +## +## On decrypt, +## xor with 0x63 +## multiply by "inverse mixcolumns" circulant E,B,D,9 +## deskew +## apply shiftrows transform +## +## +## Writes out to (%rdx), and increments or decrements it +## Keeps track of round number mod 4 in %r8 +## Preserves xmm0 +## Clobbers xmm1-xmm5 +## +.type _vpaes_schedule_mangle,%function +.align 4 +_vpaes_schedule_mangle: + mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later + // vmovdqa .Lk_mc_forward(%rip),%xmm5 + cbnz w3, .Lschedule_mangle_dec + + // encrypting + eor v4.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm4 + add x2, x2, #16 // add $16, %rdx + tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4 + tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1 + tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3 + eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4 + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3 + + b .Lschedule_mangle_both +.align 4 +.Lschedule_mangle_dec: + // inverse mix columns + // lea .Lk_dksd(%rip),%r11 + ushr v1.16b, v4.16b, #4 // vpsrlb $4, %xmm4, %xmm1 # 1 = hi + and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo + + // vmovdqa 0x00(%r11), %xmm2 + tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + // vmovdqa 0x10(%r11), %xmm3 + tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 + tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 + + // vmovdqa 0x20(%r11), %xmm2 + tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 + // vmovdqa 0x30(%r11), %xmm3 + tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 + tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, 
%xmm3 + + // vmovdqa 0x40(%r11), %xmm2 + tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 + // vmovdqa 0x50(%r11), %xmm3 + tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 + + // vmovdqa 0x60(%r11), %xmm2 + tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 + // vmovdqa 0x70(%r11), %xmm4 + tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4 + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 + eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3 + + sub x2, x2, #16 // add $-16, %rdx + +.Lschedule_mangle_both: + tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + add x8, x8, #48 // add $-16, %r8 + and x8, x8, #~(1<<6) // and $0x30, %r8 + st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx) + ret +.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle + +.globl vpaes_set_encrypt_key +.hidden vpaes_set_encrypt_key +.type vpaes_set_encrypt_key,%function +.align 4 +vpaes_set_encrypt_key: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + + lsr w9, w1, #5 // shr $5,%eax + add w9, w9, #5 // $5,%eax + str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + + mov w3, #0 // mov $0,%ecx + mov x8, #0x30 // mov $0x30,%r8d + bl _vpaes_schedule_core + eor x0, x0, x0 + + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key + +.globl vpaes_set_decrypt_key +.hidden vpaes_set_decrypt_key +.type vpaes_set_decrypt_key,%function +.align 4 +vpaes_set_decrypt_key: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! 
// ABI spec says so + + lsr w9, w1, #5 // shr $5,%eax + add w9, w9, #5 // $5,%eax + str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + lsl w9, w9, #4 // shl $4,%eax + add x2, x2, #16 // lea 16(%rdx,%rax),%rdx + add x2, x2, x9 + + mov w3, #1 // mov $1,%ecx + lsr w8, w1, #1 // shr $1,%r8d + and x8, x8, #32 // and $32,%r8d + eor x8, x8, #32 // xor $32,%r8d # nbits==192?0:32 + bl _vpaes_schedule_core + + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key +.globl vpaes_cbc_encrypt +.hidden vpaes_cbc_encrypt +.type vpaes_cbc_encrypt,%function +.align 4 +vpaes_cbc_encrypt: + AARCH64_SIGN_LINK_REGISTER + cbz x2, .Lcbc_abort + cmp w5, #0 // check direction + b.eq vpaes_cbc_decrypt + + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + mov x17, x2 // reassign + mov x2, x3 // reassign + + ld1 {v0.16b}, [x4] // load ivec + bl _vpaes_encrypt_preheat + b .Lcbc_enc_loop + +.align 4 +.Lcbc_enc_loop: + ld1 {v7.16b}, [x0],#16 // load input + eor v7.16b, v7.16b, v0.16b // xor with ivec + bl _vpaes_encrypt_core + st1 {v0.16b}, [x1],#16 // save output + subs x17, x17, #16 + b.hi .Lcbc_enc_loop + + st1 {v0.16b}, [x4] // write ivec + + ldp x29,x30,[sp],#16 +.Lcbc_abort: + AARCH64_VALIDATE_LINK_REGISTER + ret +.size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt + +.type vpaes_cbc_decrypt,%function +.align 4 +vpaes_cbc_decrypt: + // Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to + // only from vpaes_cbc_encrypt which has already signed the return address. + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + stp d10,d11,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! 
+ + mov x17, x2 // reassign + mov x2, x3 // reassign + ld1 {v6.16b}, [x4] // load ivec + bl _vpaes_decrypt_preheat + tst x17, #16 + b.eq .Lcbc_dec_loop2x + + ld1 {v7.16b}, [x0], #16 // load input + bl _vpaes_decrypt_core + eor v0.16b, v0.16b, v6.16b // xor with ivec + orr v6.16b, v7.16b, v7.16b // next ivec value + st1 {v0.16b}, [x1], #16 + subs x17, x17, #16 + b.ls .Lcbc_dec_done + +.align 4 +.Lcbc_dec_loop2x: + ld1 {v14.16b,v15.16b}, [x0], #32 + bl _vpaes_decrypt_2x + eor v0.16b, v0.16b, v6.16b // xor with ivec + eor v1.16b, v1.16b, v14.16b + orr v6.16b, v15.16b, v15.16b + st1 {v0.16b,v1.16b}, [x1], #32 + subs x17, x17, #32 + b.hi .Lcbc_dec_loop2x + +.Lcbc_dec_done: + st1 {v6.16b}, [x4] + + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 + ldp d10,d11,[sp],#16 + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size vpaes_cbc_decrypt,.-vpaes_cbc_decrypt +.globl vpaes_ctr32_encrypt_blocks +.hidden vpaes_ctr32_encrypt_blocks +.type vpaes_ctr32_encrypt_blocks,%function +.align 4 +vpaes_ctr32_encrypt_blocks: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + stp d10,d11,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! + + cbz x2, .Lctr32_done + + // Note, unlike the other functions, x2 here is measured in blocks, + // not bytes. + mov x17, x2 + mov x2, x3 + + // Load the IV and counter portion. + ldr w6, [x4, #12] + ld1 {v7.16b}, [x4] + + bl _vpaes_encrypt_preheat + tst x17, #1 + rev w6, w6 // The counter is big-endian. + b.eq .Lctr32_prep_loop + + // Handle one block so the remaining block count is even for + // _vpaes_encrypt_2x. + ld1 {v6.16b}, [x0], #16 // .Load input ahead of time + bl _vpaes_encrypt_core + eor v0.16b, v0.16b, v6.16b // XOR input and result + st1 {v0.16b}, [x1], #16 + subs x17, x17, #1 + // Update the counter. 
+ add w6, w6, #1 + rev w7, w6 + mov v7.s[3], w7 + b.ls .Lctr32_done + +.Lctr32_prep_loop: + // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x + // uses v14 and v15. + mov v15.16b, v7.16b + mov v14.16b, v7.16b + add w6, w6, #1 + rev w7, w6 + mov v15.s[3], w7 + +.Lctr32_loop: + ld1 {v6.16b,v7.16b}, [x0], #32 // .Load input ahead of time + bl _vpaes_encrypt_2x + eor v0.16b, v0.16b, v6.16b // XOR input and result + eor v1.16b, v1.16b, v7.16b // XOR input and result (#2) + st1 {v0.16b,v1.16b}, [x1], #32 + subs x17, x17, #2 + // Update the counter. + add w7, w6, #1 + add w6, w6, #2 + rev w7, w7 + mov v14.s[3], w7 + rev w7, w6 + mov v15.s[3], w7 + b.hi .Lctr32_loop + +.Lctr32_done: + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 + ldp d10,d11,[sp],#16 + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) diff --git a/third_party/boringssl/gen/bcm/vpaes-armv8-win.S b/third_party/boringssl/gen/bcm/vpaes-armv8-win.S new file mode 100644 index 00000000..6160e9f7 --- /dev/null +++ b/third_party/boringssl/gen/bcm/vpaes-armv8-win.S @@ -0,0 +1,1260 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +.section .rodata + + +.align 7 // totally strategic alignment +_vpaes_consts: +Lk_mc_forward: // mc_forward +.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 +.quad 0x080B0A0904070605, 0x000302010C0F0E0D +.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 +.quad 0x000302010C0F0E0D, 0x080B0A0904070605 +Lk_mc_backward: // mc_backward +.quad 0x0605040702010003, 0x0E0D0C0F0A09080B +.quad 0x020100030E0D0C0F, 0x0A09080B06050407 +.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 +.quad 0x0A09080B06050407, 0x020100030E0D0C0F +Lk_sr: // sr +.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 +.quad 0x030E09040F0A0500, 0x0B06010C07020D08 +.quad 0x0F060D040B020900, 0x070E050C030A0108 +.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +// +// "Hot" constants +// +Lk_inv: // inv, inva +.quad 0x0E05060F0D080180, 0x040703090A0B0C02 +.quad 0x01040A060F0B0780, 0x030D0E0C02050809 +Lk_ipt: // input transform (lo, hi) +.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 +.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 +Lk_sbo: // sbou, sbot +.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 +.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA +Lk_sb1: // sb1u, sb1t +.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF +.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +Lk_sb2: // sb2u, sb2t +.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A +.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD + +// +// Decryption stuff +// +Lk_dipt: // decryption input transform +.quad 0x0F505B040B545F00, 0x154A411E114E451A +.quad 0x86E383E660056500, 0x12771772F491F194 +Lk_dsbo: // decryption sbox final output +.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D +.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C +Lk_dsb9: // decryption sbox output *9*u, *9*t +.quad 0x851C03539A86D600, 0xCAD51F504F994CC9 +.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 +Lk_dsbd: // decryption sbox output *D*u, *D*t +.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 +.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 
+Lk_dsbb: // decryption sbox output *B*u, *B*t +.quad 0xD022649296B44200, 0x602646F6B0F2D404 +.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B +Lk_dsbe: // decryption sbox output *E*u, *E*t +.quad 0x46F2929626D4D000, 0x2242600464B4F6B0 +.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 + +// +// Key schedule constants +// +Lk_dksd: // decryption key schedule: invskew x*D +.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 +.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E +Lk_dksb: // decryption key schedule: invskew x*B +.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 +.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 +Lk_dkse: // decryption key schedule: invskew x*E + 0x63 +.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 +.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 +Lk_dks9: // decryption key schedule: invskew x*9 +.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC +.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE + +Lk_rcon: // rcon +.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +Lk_opt: // output transform +.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 +.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 +Lk_deskew: // deskew tables: inverts the sbox's "skew" +.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A +.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 + +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 +.align 2 + +.align 6 + +.text +## +## _aes_preheat +## +## Fills register %r10 -> .aes_consts (so you can -fPIC) +## and %xmm9-%xmm15 as specified below. 
+## +.def _vpaes_encrypt_preheat + .type 32 +.endef +.align 4 +_vpaes_encrypt_preheat: + adrp x10, Lk_inv + add x10, x10, :lo12:Lk_inv + movi v17.16b, #0x0f + ld1 {v18.2d,v19.2d}, [x10],#32 // Lk_inv + ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64 // Lk_ipt, Lk_sbo + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10] // Lk_sb1, Lk_sb2 + ret + + +## +## _aes_encrypt_core +## +## AES-encrypt %xmm0. +## +## Inputs: +## %xmm0 = input +## %xmm9-%xmm15 as in _vpaes_preheat +## (%rdx) = scheduled keys +## +## Output in %xmm0 +## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax +## Preserves %xmm6 - %xmm8 so you get some local vectors +## +## +.def _vpaes_encrypt_core + .type 32 +.endef +.align 4 +_vpaes_encrypt_core: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + adrp x11, Lk_mc_forward+16 + add x11, x11, :lo12:Lk_mc_forward+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + b Lenc_entry + +.align 4 +Lenc_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[] + tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # 
Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + sub w8, w8, #1 // nr-- + +Lenc_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, Lenc_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[] + tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k 
+ eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + ret + + +.globl vpaes_encrypt + +.def vpaes_encrypt + .type 32 +.endef +.align 4 +vpaes_encrypt: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1 {v7.16b}, [x0] + bl _vpaes_encrypt_preheat + bl _vpaes_encrypt_core + st1 {v0.16b}, [x1] + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +.def _vpaes_encrypt_2x + .type 32 +.endef +.align 4 +_vpaes_encrypt_2x: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + adrp x11, Lk_mc_forward+16 + add x11, x11, :lo12:Lk_mc_forward+16 + // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo + ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key + and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + and v9.16b, v15.16b, v17.16b + ushr v8.16b, v15.16b, #4 + tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 + tbl v9.16b, {v20.16b}, v9.16b + // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi + tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 + tbl v10.16b, {v21.16b}, v8.16b + eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 + eor v8.16b, v9.16b, v16.16b + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + eor v8.16b, v8.16b, v10.16b + b Lenc_2x_entry + +.align 4 +Lenc_2x_loop: + // middle of middle round + add x10, x11, #0x40 + tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u + tbl v12.16b, {v25.16b}, v10.16b + ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # Lk_mc_forward[] + tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t + tbl v8.16b, {v24.16b}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u + tbl v13.16b, {v27.16b}, v10.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, 
%xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t + tbl v10.16b, {v26.16b}, v11.16b + ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # Lk_mc_backward[] + tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B + tbl v11.16b, {v8.16b}, v1.16b + eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A + eor v10.16b, v10.16b, v13.16b + tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D + tbl v8.16b, {v8.16b}, v4.16b + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B + eor v11.16b, v11.16b, v10.16b + tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C + tbl v12.16b, {v11.16b},v1.16b + eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D + eor v8.16b, v8.16b, v11.16b + and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4 + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D + eor v8.16b, v8.16b, v12.16b + sub w8, w8, #1 // nr-- + +Lenc_2x_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + and v9.16b, v8.16b, v17.16b + ushr v8.16b, v8.16b, #4 + tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k + tbl v13.16b, {v19.16b},v9.16b + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + eor v9.16b, v9.16b, v8.16b + tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v11.16b, {v18.16b},v8.16b + tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + tbl v12.16b, {v18.16b},v9.16b + eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v11.16b, v11.16b, v13.16b + eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + eor v12.16b, v12.16b, v13.16b + tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v10.16b, {v18.16b},v11.16b + tbl v3.16b, 
{v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + tbl v11.16b, {v18.16b},v12.16b + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v10.16b, v10.16b, v9.16b + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + eor v11.16b, v11.16b, v8.16b + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 + cbnz w8, Lenc_2x_loop + + // middle of last round + add x10, x11, #0x80 + // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo + // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + tbl v12.16b, {v22.16b}, v10.16b + ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # Lk_sr[] + tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t + tbl v8.16b, {v23.16b}, v11.16b + eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A + eor v8.16b, v8.16b, v12.16b + tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0 + tbl v1.16b, {v8.16b},v1.16b + ret + + +.def _vpaes_decrypt_preheat + .type 32 +.endef +.align 4 +_vpaes_decrypt_preheat: + adrp x10, Lk_inv + add x10, x10, :lo12:Lk_inv + movi v17.16b, #0x0f + adrp x11, Lk_dipt + add x11, x11, :lo12:Lk_dipt + ld1 {v18.2d,v19.2d}, [x10],#32 // Lk_inv + ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64 // Lk_dipt, Lk_dsbo + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64 // Lk_dsb9, Lk_dsbd + ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x11] // Lk_dsbb, Lk_dsbe + ret + + +## +## Decryption core +## +## Same API as encryption core. 
+## +.def _vpaes_decrypt_core + .type 32 +.endef +.align 4 +_vpaes_decrypt_core: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + + // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo + lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11 + eor x11, x11, #0x30 // xor $0x30, %r11 + adrp x10, Lk_sr + add x10, x10, :lo12:Lk_sr + and x11, x11, #0x30 // and $0x30, %r11 + add x11, x11, x10 + adrp x10, Lk_mc_forward+48 + add x10, x10, :lo12:Lk_mc_forward+48 + + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key + and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + ld1 {v5.2d}, [x10] // vmovdqa Lk_mc_forward+48(%rip), %xmm5 + // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi + tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + b Ldec_entry + +.align 4 +Ldec_loop: +// +// Inverse mix columns +// + // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u + // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t + tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u + tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t + eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 + // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt + + tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu + tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt + + tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu + tbl 
v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet + + tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu + tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5 + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + sub w8, w8, #1 // sub $1,%rax # nr-- + +Ldec_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 + cbnz w8, Ldec_loop + + // middle of last round + // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot + ld1 {v2.2d}, [x11] // 
vmovdqa -0x160(%r11), %xmm2 # Lk_sr-Lk_dsbd=-0x160 + tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t + eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k + eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A + tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0 + ret + + +.globl vpaes_decrypt + +.def vpaes_decrypt + .type 32 +.endef +.align 4 +vpaes_decrypt: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + + ld1 {v7.16b}, [x0] + bl _vpaes_decrypt_preheat + bl _vpaes_decrypt_core + st1 {v0.16b}, [x1] + + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +// v14-v15 input, v0-v1 output +.def _vpaes_decrypt_2x + .type 32 +.endef +.align 4 +_vpaes_decrypt_2x: + mov x9, x2 + ldr w8, [x2,#240] // pull rounds + + // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo + lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11 + eor x11, x11, #0x30 // xor $0x30, %r11 + adrp x10, Lk_sr + add x10, x10, :lo12:Lk_sr + and x11, x11, #0x30 // and $0x30, %r11 + add x11, x11, x10 + adrp x10, Lk_mc_forward+48 + add x10, x10, :lo12:Lk_mc_forward+48 + + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key + and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + and v9.16b, v15.16b, v17.16b + ushr v8.16b, v15.16b, #4 + tbl v2.16b, {v20.16b},v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + tbl v10.16b, {v20.16b},v9.16b + ld1 {v5.2d}, [x10] // vmovdqa Lk_mc_forward+48(%rip), %xmm5 + // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi + tbl v0.16b, {v21.16b},v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + tbl v8.16b, {v21.16b},v8.16b + eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 + eor v10.16b, v10.16b, v16.16b + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + eor v8.16b, v8.16b, v10.16b + b Ldec_2x_entry + +.align 4 +Ldec_2x_loop: +// +// Inverse mix columns +// + // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u + // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t + 
tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u + tbl v12.16b, {v24.16b}, v10.16b + tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t + tbl v9.16b, {v25.16b}, v11.16b + eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 + eor v8.16b, v12.16b, v16.16b + // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt + + tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu + tbl v12.16b, {v26.16b}, v10.16b + tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v8.16b, {v8.16b},v5.16b + tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt + tbl v9.16b, {v27.16b}, v11.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + eor v8.16b, v8.16b, v12.16b + // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b + // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt + + tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu + tbl v12.16b, {v28.16b}, v10.16b + tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v8.16b, {v8.16b},v5.16b + tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt + tbl v9.16b, {v29.16b}, v11.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + eor v8.16b, v8.16b, v12.16b + // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b + // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet + + tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu + tbl v12.16b, {v30.16b}, v10.16b + tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch + tbl v8.16b, {v8.16b},v5.16b + tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet + tbl 
v9.16b, {v31.16b}, v11.16b + eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch + eor v8.16b, v8.16b, v12.16b + ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5 + eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch + eor v8.16b, v8.16b, v9.16b + sub w8, w8, #1 // sub $1,%rax # nr-- + +Ldec_2x_entry: + // top of round + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + and v9.16b, v8.16b, v17.16b + ushr v8.16b, v8.16b, #4 + tbl v2.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + tbl v10.16b, {v19.16b},v9.16b + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + eor v9.16b, v9.16b, v8.16b + tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + tbl v11.16b, {v18.16b},v8.16b + tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + tbl v12.16b, {v18.16b},v9.16b + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + eor v11.16b, v11.16b, v10.16b + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + eor v12.16b, v12.16b, v10.16b + tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak + tbl v10.16b, {v18.16b},v11.16b + tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak + tbl v11.16b, {v18.16b},v12.16b + eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io + eor v10.16b, v10.16b, v9.16b + eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo + eor v11.16b, v11.16b, v8.16b + ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 + cbnz w8, Ldec_2x_loop + + // middle of last round + // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou + tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou + tbl v12.16b, {v22.16b}, v10.16b + // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot + tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t + tbl v9.16b, {v23.16b}, v11.16b + 
ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # Lk_sr-Lk_dsbd=-0x160 + eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k + eor v12.16b, v12.16b, v16.16b + eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A + eor v8.16b, v9.16b, v12.16b + tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0 + tbl v1.16b, {v8.16b},v2.16b + ret + +######################################################## +## ## +## AES key schedule ## +## ## +######################################################## +.def _vpaes_key_preheat + .type 32 +.endef +.align 4 +_vpaes_key_preheat: + adrp x10, Lk_inv + add x10, x10, :lo12:Lk_inv + movi v16.16b, #0x5b // Lk_s63 + adrp x11, Lk_sb1 + add x11, x11, :lo12:Lk_sb1 + movi v17.16b, #0x0f // Lk_s0F + ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10] // Lk_inv, Lk_ipt + adrp x10, Lk_dksd + add x10, x10, :lo12:Lk_dksd + ld1 {v22.2d,v23.2d}, [x11] // Lk_sb1 + adrp x11, Lk_mc_forward + add x11, x11, :lo12:Lk_mc_forward + ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64 // Lk_dksd, Lk_dksb + ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64 // Lk_dkse, Lk_dks9 + ld1 {v8.2d}, [x10] // Lk_rcon + ld1 {v9.2d}, [x11] // Lk_mc_forward[0] + ret + + +.def _vpaes_schedule_core + .type 32 +.endef +.align 4 +_vpaes_schedule_core: + AARCH64_SIGN_LINK_REGISTER + stp x29, x30, [sp,#-16]! 
+ add x29,sp,#0 + + bl _vpaes_key_preheat // load the tables + + ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned) + + // input transform + mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3 + bl _vpaes_schedule_transform + mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7 + + adrp x10, Lk_sr // lea Lk_sr(%rip),%r10 + add x10, x10, :lo12:Lk_sr + + add x8, x8, x10 + cbnz w3, Lschedule_am_decrypting + + // encrypting, output zeroth round key after transform + st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) + b Lschedule_go + +Lschedule_am_decrypting: + // decrypting, output zeroth round key after shiftrows + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx) + eor x8, x8, #0x30 // xor $0x30, %r8 + +Lschedule_go: + cmp w1, #192 // cmp $192, %esi + b.hi Lschedule_256 + b.eq Lschedule_192 + // 128: fall though + +## +## .schedule_128 +## +## 128-bit specific part of key schedule. +## +## This schedule is really simple, because all its parts +## are accomplished by the subroutines. +## +Lschedule_128: + mov x0, #10 // mov $10, %esi + +Loop_schedule_128: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_round + cbz x0, Lschedule_mangle_last + bl _vpaes_schedule_mangle // write output + b Loop_schedule_128 + +## +## .aes_schedule_192 +## +## 192-bit specific part of key schedule. +## +## The main body of this schedule is the same as the 128-bit +## schedule, but with more smearing. The long, high side is +## stored in %xmm7 as before, and the short, low side is in +## the high bits of %xmm6. +## +## This schedule is somewhat nastier, however, because each +## round produces 192 bits of key material, or 1.5 round keys. +## Therefore, on each cycle we do 2 rounds and produce 3 round +## keys. 
+## +.align 4 +Lschedule_192: + sub x0, x0, #8 + ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) + bl _vpaes_schedule_transform // input transform + mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part + eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4 + ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros + mov x0, #4 // mov $4, %esi + +Loop_schedule_192: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_round + ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0 + bl _vpaes_schedule_mangle // save key n + bl _vpaes_schedule_192_smear + bl _vpaes_schedule_mangle // save key n+1 + bl _vpaes_schedule_round + cbz x0, Lschedule_mangle_last + bl _vpaes_schedule_mangle // save key n+2 + bl _vpaes_schedule_192_smear + b Loop_schedule_192 + +## +## .aes_schedule_256 +## +## 256-bit specific part of key schedule. +## +## The structure here is very similar to the 128-bit +## schedule, but with an additional "low side" in +## %xmm6. The low side's rounds are the same as the +## high side's, except no rcon and no rotation. +## +.align 4 +Lschedule_256: + ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) + bl _vpaes_schedule_transform // input transform + mov x0, #7 // mov $7, %esi + +Loop_schedule_256: + sub x0, x0, #1 // dec %esi + bl _vpaes_schedule_mangle // output low result + mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 + + // high round + bl _vpaes_schedule_round + cbz x0, Lschedule_mangle_last + bl _vpaes_schedule_mangle + + // low round. 
swap xmm7 and xmm6 + dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 + movi v4.16b, #0 + mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5 + mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7 + bl _vpaes_schedule_low_round + mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7 + + b Loop_schedule_256 + +## +## .aes_schedule_mangle_last +## +## Mangler for last round of key schedule +## Mangles %xmm0 +## when encrypting, outputs out(%xmm0) ^ 63 +## when decrypting, outputs unskew(%xmm0) +## +## Always called right before return... jumps to cleanup and exits +## +.align 4 +Lschedule_mangle_last: + // schedule last round key from xmm0 + adrp x11, Lk_deskew // lea Lk_deskew(%rip),%r11 # prepare to deskew + add x11, x11, :lo12:Lk_deskew + + cbnz w3, Lschedule_mangle_last_dec + + // encrypting + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1 + adrp x11, Lk_opt // lea Lk_opt(%rip), %r11 # prepare to output transform + add x11, x11, :lo12:Lk_opt + add x2, x2, #32 // add $32, %rdx + tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute + +Lschedule_mangle_last_dec: + ld1 {v20.2d,v21.2d}, [x11] // reload constants + sub x2, x2, #16 // add $-16, %rdx + eor v0.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm0 + bl _vpaes_schedule_transform // output transform + st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) # save last key + + // cleanup + eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0 + eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 + eor v2.16b, v2.16b, v2.16b // vpxor %xmm2, %xmm2, %xmm2 + eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3 + eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 + eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5 + eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6 + eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7 + ldp x29, x30, [sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +## +## .aes_schedule_192_smear +## +## Smear the short, low side in the 192-bit key schedule. 
+## +## Inputs: +## %xmm7: high side, b a x y +## %xmm6: low side, d c 0 0 +## %xmm13: 0 +## +## Outputs: +## %xmm6: b+c+d b+c 0 0 +## %xmm0: b+c+d b+c b a +## +.def _vpaes_schedule_192_smear + .type 32 +.endef +.align 4 +_vpaes_schedule_192_smear: + movi v1.16b, #0 + dup v0.4s, v7.s[3] + ins v1.s[3], v6.s[2] // vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 + ins v0.s[0], v7.s[2] // vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a + eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 + eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 + eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a + mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0 + ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros + ret + + +## +## .aes_schedule_round +## +## Runs one main round of the key schedule on %xmm0, %xmm7 +## +## Specifically, runs subbytes on the high dword of %xmm0 +## then rotates it by one byte and xors into the low dword of +## %xmm7. +## +## Adds rcon from low byte of %xmm8, then rotates %xmm8 for +## next rcon. +## +## Smears the dwords of %xmm7 by xoring the low into the +## second low, result into third, result into highest. +## +## Returns results in %xmm7 = %xmm0. +## Clobbers %xmm1-%xmm4, %r11. +## +.def _vpaes_schedule_round + .type 32 +.endef +.align 4 +_vpaes_schedule_round: + // extract rcon from xmm8 + movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4 + ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1 + ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8 + eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 + + // rotate + dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 + ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0 + + // fall through... + + // low round: same as high round, but no rotation and no rcon. 
+_vpaes_schedule_low_round: + // smear xmm7 + ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1 + eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 + ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4 + + // subbytes + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i + eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7 + tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k + eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j + tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k + tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j + eor v7.16b, v7.16b, v16.16b // vpxor Lk_s63(%rip), %xmm7, %xmm7 + tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak + eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k + tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak + eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io + eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo + tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou + tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t + eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output + + // add in smeared stuff + eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0 + eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7 + ret + + +## +## .aes_schedule_transform +## +## Linear-transform %xmm0 according to tables at (%r11) +## +## Requires that %xmm9 = 0x0F0F... 
as in preheat +## Output in %xmm0 +## Clobbers %xmm1, %xmm2 +## +.def _vpaes_schedule_transform + .type 32 +.endef +.align 4 +_vpaes_schedule_transform: + and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 + ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 + // vmovdqa (%r11), %xmm2 # lo + tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 + // vmovdqa 16(%r11), %xmm1 # hi + tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 + eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 + ret + + +## +## .aes_schedule_mangle +## +## Mangle xmm0 from (basis-transformed) standard version +## to our version. +## +## On encrypt, +## xor with 0x63 +## multiply by circulant 0,1,1,1 +## apply shiftrows transform +## +## On decrypt, +## xor with 0x63 +## multiply by "inverse mixcolumns" circulant E,B,D,9 +## deskew +## apply shiftrows transform +## +## +## Writes out to (%rdx), and increments or decrements it +## Keeps track of round number mod 4 in %r8 +## Preserves xmm0 +## Clobbers xmm1-xmm5 +## +.def _vpaes_schedule_mangle + .type 32 +.endef +.align 4 +_vpaes_schedule_mangle: + mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later + // vmovdqa .Lk_mc_forward(%rip),%xmm5 + cbnz w3, Lschedule_mangle_dec + + // encrypting + eor v4.16b, v0.16b, v16.16b // vpxor Lk_s63(%rip), %xmm0, %xmm4 + add x2, x2, #16 // add $16, %rdx + tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4 + tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1 + tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3 + eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4 + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3 + + b Lschedule_mangle_both +.align 4 +Lschedule_mangle_dec: + // inverse mix columns + // lea .Lk_dksd(%rip),%r11 + ushr v1.16b, v4.16b, #4 // vpsrlb $4, %xmm4, %xmm1 # 1 = hi + and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo + + // vmovdqa 0x00(%r11), %xmm2 
+ tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + // vmovdqa 0x10(%r11), %xmm3 + tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 + tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 + + // vmovdqa 0x20(%r11), %xmm2 + tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 + // vmovdqa 0x30(%r11), %xmm3 + tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 + tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 + + // vmovdqa 0x40(%r11), %xmm2 + tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 + // vmovdqa 0x50(%r11), %xmm3 + tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 + + // vmovdqa 0x60(%r11), %xmm2 + tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 + tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 + // vmovdqa 0x70(%r11), %xmm4 + tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4 + ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 + eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 + eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3 + + sub x2, x2, #16 // add $-16, %rdx + +Lschedule_mangle_both: + tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 + add x8, x8, #48 // add $-16, %r8 + and x8, x8, #~(1<<6) // and $0x30, %r8 + st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx) + ret + + +.globl vpaes_set_encrypt_key + +.def vpaes_set_encrypt_key + .type 32 +.endef +.align 4 +vpaes_set_encrypt_key: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! 
// ABI spec says so + + lsr w9, w1, #5 // shr $5,%eax + add w9, w9, #5 // $5,%eax + str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + + mov w3, #0 // mov $0,%ecx + mov x8, #0x30 // mov $0x30,%r8d + bl _vpaes_schedule_core + eor x0, x0, x0 + + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +.globl vpaes_set_decrypt_key + +.def vpaes_set_decrypt_key + .type 32 +.endef +.align 4 +vpaes_set_decrypt_key: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + + lsr w9, w1, #5 // shr $5,%eax + add w9, w9, #5 // $5,%eax + str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; + lsl w9, w9, #4 // shl $4,%eax + add x2, x2, #16 // lea 16(%rdx,%rax),%rdx + add x2, x2, x9 + + mov w3, #1 // mov $1,%ecx + lsr w8, w1, #1 // shr $1,%r8d + and x8, x8, #32 // and $32,%r8d + eor x8, x8, #32 // xor $32,%r8d # nbits==192?0:32 + bl _vpaes_schedule_core + + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl vpaes_cbc_encrypt + +.def vpaes_cbc_encrypt + .type 32 +.endef +.align 4 +vpaes_cbc_encrypt: + AARCH64_SIGN_LINK_REGISTER + cbz x2, Lcbc_abort + cmp w5, #0 // check direction + b.eq vpaes_cbc_decrypt + + stp x29,x30,[sp,#-16]! 
+ add x29,sp,#0 + + mov x17, x2 // reassign + mov x2, x3 // reassign + + ld1 {v0.16b}, [x4] // load ivec + bl _vpaes_encrypt_preheat + b Lcbc_enc_loop + +.align 4 +Lcbc_enc_loop: + ld1 {v7.16b}, [x0],#16 // load input + eor v7.16b, v7.16b, v0.16b // xor with ivec + bl _vpaes_encrypt_core + st1 {v0.16b}, [x1],#16 // save output + subs x17, x17, #16 + b.hi Lcbc_enc_loop + + st1 {v0.16b}, [x4] // write ivec + + ldp x29,x30,[sp],#16 +Lcbc_abort: + AARCH64_VALIDATE_LINK_REGISTER + ret + + +.def vpaes_cbc_decrypt + .type 32 +.endef +.align 4 +vpaes_cbc_decrypt: + // Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to + // only from vpaes_cbc_encrypt which has already signed the return address. + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! // ABI spec says so + stp d10,d11,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! + + mov x17, x2 // reassign + mov x2, x3 // reassign + ld1 {v6.16b}, [x4] // load ivec + bl _vpaes_decrypt_preheat + tst x17, #16 + b.eq Lcbc_dec_loop2x + + ld1 {v7.16b}, [x0], #16 // load input + bl _vpaes_decrypt_core + eor v0.16b, v0.16b, v6.16b // xor with ivec + orr v6.16b, v7.16b, v7.16b // next ivec value + st1 {v0.16b}, [x1], #16 + subs x17, x17, #16 + b.ls Lcbc_dec_done + +.align 4 +Lcbc_dec_loop2x: + ld1 {v14.16b,v15.16b}, [x0], #32 + bl _vpaes_decrypt_2x + eor v0.16b, v0.16b, v6.16b // xor with ivec + eor v1.16b, v1.16b, v14.16b + orr v6.16b, v15.16b, v15.16b + st1 {v0.16b,v1.16b}, [x1], #32 + subs x17, x17, #32 + b.hi Lcbc_dec_loop2x + +Lcbc_dec_done: + st1 {v6.16b}, [x4] + + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 + ldp d10,d11,[sp],#16 + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.globl vpaes_ctr32_encrypt_blocks + +.def vpaes_ctr32_encrypt_blocks + .type 32 +.endef +.align 4 +vpaes_ctr32_encrypt_blocks: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-16]! + add x29,sp,#0 + stp d8,d9,[sp,#-16]! 
// ABI spec says so + stp d10,d11,[sp,#-16]! + stp d12,d13,[sp,#-16]! + stp d14,d15,[sp,#-16]! + + cbz x2, Lctr32_done + + // Note, unlike the other functions, x2 here is measured in blocks, + // not bytes. + mov x17, x2 + mov x2, x3 + + // Load the IV and counter portion. + ldr w6, [x4, #12] + ld1 {v7.16b}, [x4] + + bl _vpaes_encrypt_preheat + tst x17, #1 + rev w6, w6 // The counter is big-endian. + b.eq Lctr32_prep_loop + + // Handle one block so the remaining block count is even for + // _vpaes_encrypt_2x. + ld1 {v6.16b}, [x0], #16 // Load input ahead of time + bl _vpaes_encrypt_core + eor v0.16b, v0.16b, v6.16b // XOR input and result + st1 {v0.16b}, [x1], #16 + subs x17, x17, #1 + // Update the counter. + add w6, w6, #1 + rev w7, w6 + mov v7.s[3], w7 + b.ls Lctr32_done + +Lctr32_prep_loop: + // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x + // uses v14 and v15. + mov v15.16b, v7.16b + mov v14.16b, v7.16b + add w6, w6, #1 + rev w7, w6 + mov v15.s[3], w7 + +Lctr32_loop: + ld1 {v6.16b,v7.16b}, [x0], #32 // Load input ahead of time + bl _vpaes_encrypt_2x + eor v0.16b, v0.16b, v6.16b // XOR input and result + eor v1.16b, v1.16b, v7.16b // XOR input and result (#2) + st1 {v0.16b,v1.16b}, [x1], #32 + subs x17, x17, #2 + // Update the counter. + add w7, w6, #1 + add w6, w6, #2 + rev w7, w7 + mov v14.s[3], w7 + rev w7, w6 + mov v15.s[3], w7 + b.hi Lctr32_loop + +Lctr32_done: + ldp d14,d15,[sp],#16 + ldp d12,d13,[sp],#16 + ldp d10,d11,[sp],#16 + ldp d8,d9,[sp],#16 + ldp x29,x30,[sp],#16 + AARCH64_VALIDATE_LINK_REGISTER + ret + +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) diff --git a/third_party/boringssl/gen/bcm/vpaes-x86-apple.S b/third_party/boringssl/gen/bcm/vpaes-x86-apple.S new file mode 100644 index 00000000..b6717d57 --- /dev/null +++ b/third_party/boringssl/gen/bcm/vpaes-x86-apple.S @@ -0,0 +1,680 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. 
Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +.text +#ifdef BORINGSSL_DISPATCH_TEST +#endif +.align 6,0x90 +L_vpaes_consts: +.long 218628480,235210255,168496130,67568393 +.long 252381056,17041926,33884169,51187212 +.long 252645135,252645135,252645135,252645135 +.long 1512730624,3266504856,1377990664,3401244816 +.long 830229760,1275146365,2969422977,3447763452 +.long 3411033600,2979783055,338359620,2782886510 +.long 4209124096,907596821,221174255,1006095553 +.long 191964160,3799684038,3164090317,1589111125 +.long 182528256,1777043520,2877432650,3265356744 +.long 1874708224,3503451415,3305285752,363511674 +.long 1606117888,3487855781,1093350906,2384367825 +.long 197121,67569157,134941193,202313229 +.long 67569157,134941193,202313229,197121 +.long 134941193,202313229,197121,67569157 +.long 202313229,197121,67569157,134941193 +.long 33619971,100992007,168364043,235736079 +.long 235736079,33619971,100992007,168364043 +.long 168364043,235736079,33619971,100992007 +.long 100992007,168364043,235736079,33619971 +.long 50462976,117835012,185207048,252579084 +.long 252314880,51251460,117574920,184942860 +.long 184682752,252054788,50987272,118359308 +.long 118099200,185467140,251790600,50727180 +.long 2946363062,528716217,1300004225,1881839624 +.long 1532713819,1532713819,1532713819,1532713819 +.long 3602276352,4288629033,3737020424,4153884961 +.long 1354558464,32357713,2958822624,3775749553 +.long 1201988352,132424512,1572796698,503232858 +.long 2213177600,1597421020,4103937655,675398315 +.long 2749646592,4273543773,1511898873,121693092 +.long 3040248576,1103263732,2871565598,1608280554 +.long 2236667136,2588920351,482954393,64377734 +.long 3069987328,291237287,2117370568,3650299247 +.long 533321216,3573750986,2572112006,1401264716 +.long 1339849704,2721158661,548607111,3445553514 +.long 2128193280,3054596040,2183486460,1257083700 +.long 655635200,1165381986,3923443150,2344132524 +.long 
190078720,256924420,290342170,357187870 +.long 1610966272,2263057382,4103205268,309794674 +.long 2592527872,2233205587,1335446729,3402964816 +.long 3973531904,3225098121,3002836325,1918774430 +.long 3870401024,2102906079,2284471353,4117666579 +.long 617007872,1021508343,366931923,691083277 +.long 2528395776,3491914898,2968704004,1613121270 +.long 3445188352,3247741094,844474987,4093578302 +.long 651481088,1190302358,1689581232,574775300 +.long 4289380608,206939853,2555985458,2489840491 +.long 2130264064,327674451,3566485037,3349835193 +.long 2470714624,316102159,3636825756,3393945945 +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105 +.byte 111,110,32,65,69,83,32,102,111,114,32,120,56,54,47,83 +.byte 83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117 +.byte 114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105 +.byte 118,101,114,115,105,116,121,41,0 +.align 6,0x90 +.private_extern __vpaes_preheat +.align 4 +__vpaes_preheat: + addl (%esp),%ebp + movdqa -48(%ebp),%xmm7 + movdqa -16(%ebp),%xmm6 + ret +.private_extern __vpaes_encrypt_core +.align 4 +__vpaes_encrypt_core: + movl $16,%ecx + movl 240(%edx),%eax + movdqa %xmm6,%xmm1 + movdqa (%ebp),%xmm2 + pandn %xmm0,%xmm1 + pand %xmm6,%xmm0 + movdqu (%edx),%xmm5 + pshufb %xmm0,%xmm2 + movdqa 16(%ebp),%xmm0 + pxor %xmm5,%xmm2 + psrld $4,%xmm1 + addl $16,%edx + pshufb %xmm1,%xmm0 + leal 192(%ebp),%ebx + pxor %xmm2,%xmm0 + jmp L000enc_entry +.align 4,0x90 +L001enc_loop: + movdqa 32(%ebp),%xmm4 + movdqa 48(%ebp),%xmm0 + pshufb %xmm2,%xmm4 + pshufb %xmm3,%xmm0 + pxor %xmm5,%xmm4 + movdqa 64(%ebp),%xmm5 + pxor %xmm4,%xmm0 + movdqa -64(%ebx,%ecx,1),%xmm1 + pshufb %xmm2,%xmm5 + movdqa 80(%ebp),%xmm2 + movdqa (%ebx,%ecx,1),%xmm4 + pshufb %xmm3,%xmm2 + movdqa %xmm0,%xmm3 + pxor %xmm5,%xmm2 + pshufb %xmm1,%xmm0 + addl $16,%edx + pxor %xmm2,%xmm0 + pshufb %xmm4,%xmm3 + addl $16,%ecx + pxor %xmm0,%xmm3 + pshufb %xmm1,%xmm0 + andl $48,%ecx + subl $1,%eax + pxor %xmm3,%xmm0 +L000enc_entry: + movdqa %xmm6,%xmm1 + 
movdqa -32(%ebp),%xmm5 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm6,%xmm0 + pshufb %xmm0,%xmm5 + movdqa %xmm7,%xmm3 + pxor %xmm1,%xmm0 + pshufb %xmm1,%xmm3 + movdqa %xmm7,%xmm4 + pxor %xmm5,%xmm3 + pshufb %xmm0,%xmm4 + movdqa %xmm7,%xmm2 + pxor %xmm5,%xmm4 + pshufb %xmm3,%xmm2 + movdqa %xmm7,%xmm3 + pxor %xmm0,%xmm2 + pshufb %xmm4,%xmm3 + movdqu (%edx),%xmm5 + pxor %xmm1,%xmm3 + jnz L001enc_loop + movdqa 96(%ebp),%xmm4 + movdqa 112(%ebp),%xmm0 + pshufb %xmm2,%xmm4 + pxor %xmm5,%xmm4 + pshufb %xmm3,%xmm0 + movdqa 64(%ebx,%ecx,1),%xmm1 + pxor %xmm4,%xmm0 + pshufb %xmm1,%xmm0 + ret +.private_extern __vpaes_decrypt_core +.align 4 +__vpaes_decrypt_core: + leal 608(%ebp),%ebx + movl 240(%edx),%eax + movdqa %xmm6,%xmm1 + movdqa -64(%ebx),%xmm2 + pandn %xmm0,%xmm1 + movl %eax,%ecx + psrld $4,%xmm1 + movdqu (%edx),%xmm5 + shll $4,%ecx + pand %xmm6,%xmm0 + pshufb %xmm0,%xmm2 + movdqa -48(%ebx),%xmm0 + xorl $48,%ecx + pshufb %xmm1,%xmm0 + andl $48,%ecx + pxor %xmm5,%xmm2 + movdqa 176(%ebp),%xmm5 + pxor %xmm2,%xmm0 + addl $16,%edx + leal -352(%ebx,%ecx,1),%ecx + jmp L002dec_entry +.align 4,0x90 +L003dec_loop: + movdqa -32(%ebx),%xmm4 + movdqa -16(%ebx),%xmm1 + pshufb %xmm2,%xmm4 + pshufb %xmm3,%xmm1 + pxor %xmm4,%xmm0 + movdqa (%ebx),%xmm4 + pxor %xmm1,%xmm0 + movdqa 16(%ebx),%xmm1 + pshufb %xmm2,%xmm4 + pshufb %xmm5,%xmm0 + pshufb %xmm3,%xmm1 + pxor %xmm4,%xmm0 + movdqa 32(%ebx),%xmm4 + pxor %xmm1,%xmm0 + movdqa 48(%ebx),%xmm1 + pshufb %xmm2,%xmm4 + pshufb %xmm5,%xmm0 + pshufb %xmm3,%xmm1 + pxor %xmm4,%xmm0 + movdqa 64(%ebx),%xmm4 + pxor %xmm1,%xmm0 + movdqa 80(%ebx),%xmm1 + pshufb %xmm2,%xmm4 + pshufb %xmm5,%xmm0 + pshufb %xmm3,%xmm1 + pxor %xmm4,%xmm0 + addl $16,%edx + palignr $12,%xmm5,%xmm5 + pxor %xmm1,%xmm0 + subl $1,%eax +L002dec_entry: + movdqa %xmm6,%xmm1 + movdqa -32(%ebp),%xmm2 + pandn %xmm0,%xmm1 + pand %xmm6,%xmm0 + psrld $4,%xmm1 + pshufb %xmm0,%xmm2 + movdqa %xmm7,%xmm3 + pxor %xmm1,%xmm0 + pshufb %xmm1,%xmm3 + movdqa %xmm7,%xmm4 + pxor %xmm2,%xmm3 + 
pshufb %xmm0,%xmm4 + pxor %xmm2,%xmm4 + movdqa %xmm7,%xmm2 + pshufb %xmm3,%xmm2 + movdqa %xmm7,%xmm3 + pxor %xmm0,%xmm2 + pshufb %xmm4,%xmm3 + movdqu (%edx),%xmm0 + pxor %xmm1,%xmm3 + jnz L003dec_loop + movdqa 96(%ebx),%xmm4 + pshufb %xmm2,%xmm4 + pxor %xmm0,%xmm4 + movdqa 112(%ebx),%xmm0 + movdqa (%ecx),%xmm2 + pshufb %xmm3,%xmm0 + pxor %xmm4,%xmm0 + pshufb %xmm2,%xmm0 + ret +.private_extern __vpaes_schedule_core +.align 4 +__vpaes_schedule_core: + addl (%esp),%ebp + movdqu (%esi),%xmm0 + movdqa 320(%ebp),%xmm2 + movdqa %xmm0,%xmm3 + leal (%ebp),%ebx + movdqa %xmm2,4(%esp) + call __vpaes_schedule_transform + movdqa %xmm0,%xmm7 + testl %edi,%edi + jnz L004schedule_am_decrypting + movdqu %xmm0,(%edx) + jmp L005schedule_go +L004schedule_am_decrypting: + movdqa 256(%ebp,%ecx,1),%xmm1 + pshufb %xmm1,%xmm3 + movdqu %xmm3,(%edx) + xorl $48,%ecx +L005schedule_go: + cmpl $192,%eax + ja L006schedule_256 + je L007schedule_192 +L008schedule_128: + movl $10,%eax +L009loop_schedule_128: + call __vpaes_schedule_round + decl %eax + jz L010schedule_mangle_last + call __vpaes_schedule_mangle + jmp L009loop_schedule_128 +.align 4,0x90 +L007schedule_192: + movdqu 8(%esi),%xmm0 + call __vpaes_schedule_transform + movdqa %xmm0,%xmm6 + pxor %xmm4,%xmm4 + movhlps %xmm4,%xmm6 + movl $4,%eax +L011loop_schedule_192: + call __vpaes_schedule_round + palignr $8,%xmm6,%xmm0 + call __vpaes_schedule_mangle + call __vpaes_schedule_192_smear + call __vpaes_schedule_mangle + call __vpaes_schedule_round + decl %eax + jz L010schedule_mangle_last + call __vpaes_schedule_mangle + call __vpaes_schedule_192_smear + jmp L011loop_schedule_192 +.align 4,0x90 +L006schedule_256: + movdqu 16(%esi),%xmm0 + call __vpaes_schedule_transform + movl $7,%eax +L012loop_schedule_256: + call __vpaes_schedule_mangle + movdqa %xmm0,%xmm6 + call __vpaes_schedule_round + decl %eax + jz L010schedule_mangle_last + call __vpaes_schedule_mangle + pshufd $255,%xmm0,%xmm0 + movdqa %xmm7,20(%esp) + movdqa %xmm6,%xmm7 + call 
L_vpaes_schedule_low_round + movdqa 20(%esp),%xmm7 + jmp L012loop_schedule_256 +.align 4,0x90 +L010schedule_mangle_last: + leal 384(%ebp),%ebx + testl %edi,%edi + jnz L013schedule_mangle_last_dec + movdqa 256(%ebp,%ecx,1),%xmm1 + pshufb %xmm1,%xmm0 + leal 352(%ebp),%ebx + addl $32,%edx +L013schedule_mangle_last_dec: + addl $-16,%edx + pxor 336(%ebp),%xmm0 + call __vpaes_schedule_transform + movdqu %xmm0,(%edx) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + ret +.private_extern __vpaes_schedule_192_smear +.align 4 +__vpaes_schedule_192_smear: + pshufd $128,%xmm6,%xmm1 + pshufd $254,%xmm7,%xmm0 + pxor %xmm1,%xmm6 + pxor %xmm1,%xmm1 + pxor %xmm0,%xmm6 + movdqa %xmm6,%xmm0 + movhlps %xmm1,%xmm6 + ret +.private_extern __vpaes_schedule_round +.align 4 +__vpaes_schedule_round: + movdqa 8(%esp),%xmm2 + pxor %xmm1,%xmm1 + palignr $15,%xmm2,%xmm1 + palignr $15,%xmm2,%xmm2 + pxor %xmm1,%xmm7 + pshufd $255,%xmm0,%xmm0 + palignr $1,%xmm0,%xmm0 + movdqa %xmm2,8(%esp) +L_vpaes_schedule_low_round: + movdqa %xmm7,%xmm1 + pslldq $4,%xmm7 + pxor %xmm1,%xmm7 + movdqa %xmm7,%xmm1 + pslldq $8,%xmm7 + pxor %xmm1,%xmm7 + pxor 336(%ebp),%xmm7 + movdqa -16(%ebp),%xmm4 + movdqa -48(%ebp),%xmm5 + movdqa %xmm4,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm4,%xmm0 + movdqa -32(%ebp),%xmm2 + pshufb %xmm0,%xmm2 + pxor %xmm1,%xmm0 + movdqa %xmm5,%xmm3 + pshufb %xmm1,%xmm3 + pxor %xmm2,%xmm3 + movdqa %xmm5,%xmm4 + pshufb %xmm0,%xmm4 + pxor %xmm2,%xmm4 + movdqa %xmm5,%xmm2 + pshufb %xmm3,%xmm2 + pxor %xmm0,%xmm2 + movdqa %xmm5,%xmm3 + pshufb %xmm4,%xmm3 + pxor %xmm1,%xmm3 + movdqa 32(%ebp),%xmm4 + pshufb %xmm2,%xmm4 + movdqa 48(%ebp),%xmm0 + pshufb %xmm3,%xmm0 + pxor %xmm4,%xmm0 + pxor %xmm7,%xmm0 + movdqa %xmm0,%xmm7 + ret +.private_extern __vpaes_schedule_transform +.align 4 +__vpaes_schedule_transform: + movdqa -16(%ebp),%xmm2 + movdqa %xmm2,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 
+ pand %xmm2,%xmm0 + movdqa (%ebx),%xmm2 + pshufb %xmm0,%xmm2 + movdqa 16(%ebx),%xmm0 + pshufb %xmm1,%xmm0 + pxor %xmm2,%xmm0 + ret +.private_extern __vpaes_schedule_mangle +.align 4 +__vpaes_schedule_mangle: + movdqa %xmm0,%xmm4 + movdqa 128(%ebp),%xmm5 + testl %edi,%edi + jnz L014schedule_mangle_dec + addl $16,%edx + pxor 336(%ebp),%xmm4 + pshufb %xmm5,%xmm4 + movdqa %xmm4,%xmm3 + pshufb %xmm5,%xmm4 + pxor %xmm4,%xmm3 + pshufb %xmm5,%xmm4 + pxor %xmm4,%xmm3 + jmp L015schedule_mangle_both +.align 4,0x90 +L014schedule_mangle_dec: + movdqa -16(%ebp),%xmm2 + leal 416(%ebp),%esi + movdqa %xmm2,%xmm1 + pandn %xmm4,%xmm1 + psrld $4,%xmm1 + pand %xmm2,%xmm4 + movdqa (%esi),%xmm2 + pshufb %xmm4,%xmm2 + movdqa 16(%esi),%xmm3 + pshufb %xmm1,%xmm3 + pxor %xmm2,%xmm3 + pshufb %xmm5,%xmm3 + movdqa 32(%esi),%xmm2 + pshufb %xmm4,%xmm2 + pxor %xmm3,%xmm2 + movdqa 48(%esi),%xmm3 + pshufb %xmm1,%xmm3 + pxor %xmm2,%xmm3 + pshufb %xmm5,%xmm3 + movdqa 64(%esi),%xmm2 + pshufb %xmm4,%xmm2 + pxor %xmm3,%xmm2 + movdqa 80(%esi),%xmm3 + pshufb %xmm1,%xmm3 + pxor %xmm2,%xmm3 + pshufb %xmm5,%xmm3 + movdqa 96(%esi),%xmm2 + pshufb %xmm4,%xmm2 + pxor %xmm3,%xmm2 + movdqa 112(%esi),%xmm3 + pshufb %xmm1,%xmm3 + pxor %xmm2,%xmm3 + addl $-16,%edx +L015schedule_mangle_both: + movdqa 256(%ebp,%ecx,1),%xmm1 + pshufb %xmm1,%xmm3 + addl $-16,%ecx + andl $48,%ecx + movdqu %xmm3,(%edx) + ret +.globl _vpaes_set_encrypt_key +.private_extern _vpaes_set_encrypt_key +.align 4 +_vpaes_set_encrypt_key: +L_vpaes_set_encrypt_key_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call L016pic_for_function_hit +L016pic_for_function_hit: + popl %ebx + leal _BORINGSSL_function_hit+5-L016pic_for_function_hit(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif + movl 20(%esp),%esi + leal -56(%esp),%ebx + movl 24(%esp),%eax + andl $-16,%ebx + movl 28(%esp),%edx + xchgl %esp,%ebx + movl %ebx,48(%esp) + movl %eax,%ebx + shrl $5,%ebx 
+ addl $5,%ebx + movl %ebx,240(%edx) + movl $48,%ecx + movl $0,%edi + leal L_vpaes_consts+0x30-L017pic_point,%ebp + call __vpaes_schedule_core +L017pic_point: + movl 48(%esp),%esp + xorl %eax,%eax + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _vpaes_set_decrypt_key +.private_extern _vpaes_set_decrypt_key +.align 4 +_vpaes_set_decrypt_key: +L_vpaes_set_decrypt_key_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + leal -56(%esp),%ebx + movl 24(%esp),%eax + andl $-16,%ebx + movl 28(%esp),%edx + xchgl %esp,%ebx + movl %ebx,48(%esp) + movl %eax,%ebx + shrl $5,%ebx + addl $5,%ebx + movl %ebx,240(%edx) + shll $4,%ebx + leal 16(%edx,%ebx,1),%edx + movl $1,%edi + movl %eax,%ecx + shrl $1,%ecx + andl $32,%ecx + xorl $32,%ecx + leal L_vpaes_consts+0x30-L018pic_point,%ebp + call __vpaes_schedule_core +L018pic_point: + movl 48(%esp),%esp + xorl %eax,%eax + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _vpaes_encrypt +.private_extern _vpaes_encrypt +.align 4 +_vpaes_encrypt: +L_vpaes_encrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call L019pic_for_function_hit +L019pic_for_function_hit: + popl %ebx + leal _BORINGSSL_function_hit+4-L019pic_for_function_hit(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif + leal L_vpaes_consts+0x30-L020pic_point,%ebp + call __vpaes_preheat +L020pic_point: + movl 20(%esp),%esi + leal -56(%esp),%ebx + movl 24(%esp),%edi + andl $-16,%ebx + movl 28(%esp),%edx + xchgl %esp,%ebx + movl %ebx,48(%esp) + movdqu (%esi),%xmm0 + call __vpaes_encrypt_core + movdqu %xmm0,(%edi) + movl 48(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _vpaes_decrypt +.private_extern _vpaes_decrypt +.align 4 +_vpaes_decrypt: +L_vpaes_decrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + leal L_vpaes_consts+0x30-L021pic_point,%ebp + call __vpaes_preheat +L021pic_point: + movl 
20(%esp),%esi + leal -56(%esp),%ebx + movl 24(%esp),%edi + andl $-16,%ebx + movl 28(%esp),%edx + xchgl %esp,%ebx + movl %ebx,48(%esp) + movdqu (%esi),%xmm0 + call __vpaes_decrypt_core + movdqu %xmm0,(%edi) + movl 48(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _vpaes_cbc_encrypt +.private_extern _vpaes_cbc_encrypt +.align 4 +_vpaes_cbc_encrypt: +L_vpaes_cbc_encrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl 32(%esp),%edx + subl $16,%eax + jc L022cbc_abort + leal -56(%esp),%ebx + movl 36(%esp),%ebp + andl $-16,%ebx + movl 40(%esp),%ecx + xchgl %esp,%ebx + movdqu (%ebp),%xmm1 + subl %esi,%edi + movl %ebx,48(%esp) + movl %edi,(%esp) + movl %edx,4(%esp) + movl %ebp,8(%esp) + movl %eax,%edi + leal L_vpaes_consts+0x30-L023pic_point,%ebp + call __vpaes_preheat +L023pic_point: + cmpl $0,%ecx + je L024cbc_dec_loop + jmp L025cbc_enc_loop +.align 4,0x90 +L025cbc_enc_loop: + movdqu (%esi),%xmm0 + pxor %xmm1,%xmm0 + call __vpaes_encrypt_core + movl (%esp),%ebx + movl 4(%esp),%edx + movdqa %xmm0,%xmm1 + movdqu %xmm0,(%ebx,%esi,1) + leal 16(%esi),%esi + subl $16,%edi + jnc L025cbc_enc_loop + jmp L026cbc_done +.align 4,0x90 +L024cbc_dec_loop: + movdqu (%esi),%xmm0 + movdqa %xmm1,16(%esp) + movdqa %xmm0,32(%esp) + call __vpaes_decrypt_core + movl (%esp),%ebx + movl 4(%esp),%edx + pxor 16(%esp),%xmm0 + movdqa 32(%esp),%xmm1 + movdqu %xmm0,(%ebx,%esi,1) + leal 16(%esi),%esi + subl $16,%edi + jnc L024cbc_dec_loop +L026cbc_done: + movl 8(%esp),%ebx + movl 48(%esp),%esp + movdqu %xmm1,(%ebx) +L022cbc_abort: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) diff --git a/third_party/boringssl/gen/bcm/vpaes-x86-linux.S b/third_party/boringssl/gen/bcm/vpaes-x86-linux.S new file mode 100644 index 00000000..13da4aad --- /dev/null +++ b/third_party/boringssl/gen/bcm/vpaes-x86-linux.S @@ -0,0 
+1,706 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) +.text +#ifdef BORINGSSL_DISPATCH_TEST +#endif +.align 64 +.L_vpaes_consts: +.long 218628480,235210255,168496130,67568393 +.long 252381056,17041926,33884169,51187212 +.long 252645135,252645135,252645135,252645135 +.long 1512730624,3266504856,1377990664,3401244816 +.long 830229760,1275146365,2969422977,3447763452 +.long 3411033600,2979783055,338359620,2782886510 +.long 4209124096,907596821,221174255,1006095553 +.long 191964160,3799684038,3164090317,1589111125 +.long 182528256,1777043520,2877432650,3265356744 +.long 1874708224,3503451415,3305285752,363511674 +.long 1606117888,3487855781,1093350906,2384367825 +.long 197121,67569157,134941193,202313229 +.long 67569157,134941193,202313229,197121 +.long 134941193,202313229,197121,67569157 +.long 202313229,197121,67569157,134941193 +.long 33619971,100992007,168364043,235736079 +.long 235736079,33619971,100992007,168364043 +.long 168364043,235736079,33619971,100992007 +.long 100992007,168364043,235736079,33619971 +.long 50462976,117835012,185207048,252579084 +.long 252314880,51251460,117574920,184942860 +.long 184682752,252054788,50987272,118359308 +.long 118099200,185467140,251790600,50727180 +.long 2946363062,528716217,1300004225,1881839624 +.long 1532713819,1532713819,1532713819,1532713819 +.long 3602276352,4288629033,3737020424,4153884961 +.long 1354558464,32357713,2958822624,3775749553 +.long 1201988352,132424512,1572796698,503232858 +.long 2213177600,1597421020,4103937655,675398315 +.long 2749646592,4273543773,1511898873,121693092 +.long 3040248576,1103263732,2871565598,1608280554 +.long 2236667136,2588920351,482954393,64377734 +.long 3069987328,291237287,2117370568,3650299247 +.long 533321216,3573750986,2572112006,1401264716 +.long 1339849704,2721158661,548607111,3445553514 +.long 
2128193280,3054596040,2183486460,1257083700 +.long 655635200,1165381986,3923443150,2344132524 +.long 190078720,256924420,290342170,357187870 +.long 1610966272,2263057382,4103205268,309794674 +.long 2592527872,2233205587,1335446729,3402964816 +.long 3973531904,3225098121,3002836325,1918774430 +.long 3870401024,2102906079,2284471353,4117666579 +.long 617007872,1021508343,366931923,691083277 +.long 2528395776,3491914898,2968704004,1613121270 +.long 3445188352,3247741094,844474987,4093578302 +.long 651481088,1190302358,1689581232,574775300 +.long 4289380608,206939853,2555985458,2489840491 +.long 2130264064,327674451,3566485037,3349835193 +.long 2470714624,316102159,3636825756,3393945945 +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105 +.byte 111,110,32,65,69,83,32,102,111,114,32,120,56,54,47,83 +.byte 83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117 +.byte 114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105 +.byte 118,101,114,115,105,116,121,41,0 +.align 64 +.hidden _vpaes_preheat +.type _vpaes_preheat,@function +.align 16 +_vpaes_preheat: + addl (%esp),%ebp + movdqa -48(%ebp),%xmm7 + movdqa -16(%ebp),%xmm6 + ret +.size _vpaes_preheat,.-_vpaes_preheat +.hidden _vpaes_encrypt_core +.type _vpaes_encrypt_core,@function +.align 16 +_vpaes_encrypt_core: + movl $16,%ecx + movl 240(%edx),%eax + movdqa %xmm6,%xmm1 + movdqa (%ebp),%xmm2 + pandn %xmm0,%xmm1 + pand %xmm6,%xmm0 + movdqu (%edx),%xmm5 + pshufb %xmm0,%xmm2 + movdqa 16(%ebp),%xmm0 + pxor %xmm5,%xmm2 + psrld $4,%xmm1 + addl $16,%edx + pshufb %xmm1,%xmm0 + leal 192(%ebp),%ebx + pxor %xmm2,%xmm0 + jmp .L000enc_entry +.align 16 +.L001enc_loop: + movdqa 32(%ebp),%xmm4 + movdqa 48(%ebp),%xmm0 + pshufb %xmm2,%xmm4 + pshufb %xmm3,%xmm0 + pxor %xmm5,%xmm4 + movdqa 64(%ebp),%xmm5 + pxor %xmm4,%xmm0 + movdqa -64(%ebx,%ecx,1),%xmm1 + pshufb %xmm2,%xmm5 + movdqa 80(%ebp),%xmm2 + movdqa (%ebx,%ecx,1),%xmm4 + pshufb %xmm3,%xmm2 + movdqa %xmm0,%xmm3 + pxor %xmm5,%xmm2 + pshufb %xmm1,%xmm0 + addl $16,%edx + 
pxor %xmm2,%xmm0 + pshufb %xmm4,%xmm3 + addl $16,%ecx + pxor %xmm0,%xmm3 + pshufb %xmm1,%xmm0 + andl $48,%ecx + subl $1,%eax + pxor %xmm3,%xmm0 +.L000enc_entry: + movdqa %xmm6,%xmm1 + movdqa -32(%ebp),%xmm5 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm6,%xmm0 + pshufb %xmm0,%xmm5 + movdqa %xmm7,%xmm3 + pxor %xmm1,%xmm0 + pshufb %xmm1,%xmm3 + movdqa %xmm7,%xmm4 + pxor %xmm5,%xmm3 + pshufb %xmm0,%xmm4 + movdqa %xmm7,%xmm2 + pxor %xmm5,%xmm4 + pshufb %xmm3,%xmm2 + movdqa %xmm7,%xmm3 + pxor %xmm0,%xmm2 + pshufb %xmm4,%xmm3 + movdqu (%edx),%xmm5 + pxor %xmm1,%xmm3 + jnz .L001enc_loop + movdqa 96(%ebp),%xmm4 + movdqa 112(%ebp),%xmm0 + pshufb %xmm2,%xmm4 + pxor %xmm5,%xmm4 + pshufb %xmm3,%xmm0 + movdqa 64(%ebx,%ecx,1),%xmm1 + pxor %xmm4,%xmm0 + pshufb %xmm1,%xmm0 + ret +.size _vpaes_encrypt_core,.-_vpaes_encrypt_core +.hidden _vpaes_decrypt_core +.type _vpaes_decrypt_core,@function +.align 16 +_vpaes_decrypt_core: + leal 608(%ebp),%ebx + movl 240(%edx),%eax + movdqa %xmm6,%xmm1 + movdqa -64(%ebx),%xmm2 + pandn %xmm0,%xmm1 + movl %eax,%ecx + psrld $4,%xmm1 + movdqu (%edx),%xmm5 + shll $4,%ecx + pand %xmm6,%xmm0 + pshufb %xmm0,%xmm2 + movdqa -48(%ebx),%xmm0 + xorl $48,%ecx + pshufb %xmm1,%xmm0 + andl $48,%ecx + pxor %xmm5,%xmm2 + movdqa 176(%ebp),%xmm5 + pxor %xmm2,%xmm0 + addl $16,%edx + leal -352(%ebx,%ecx,1),%ecx + jmp .L002dec_entry +.align 16 +.L003dec_loop: + movdqa -32(%ebx),%xmm4 + movdqa -16(%ebx),%xmm1 + pshufb %xmm2,%xmm4 + pshufb %xmm3,%xmm1 + pxor %xmm4,%xmm0 + movdqa (%ebx),%xmm4 + pxor %xmm1,%xmm0 + movdqa 16(%ebx),%xmm1 + pshufb %xmm2,%xmm4 + pshufb %xmm5,%xmm0 + pshufb %xmm3,%xmm1 + pxor %xmm4,%xmm0 + movdqa 32(%ebx),%xmm4 + pxor %xmm1,%xmm0 + movdqa 48(%ebx),%xmm1 + pshufb %xmm2,%xmm4 + pshufb %xmm5,%xmm0 + pshufb %xmm3,%xmm1 + pxor %xmm4,%xmm0 + movdqa 64(%ebx),%xmm4 + pxor %xmm1,%xmm0 + movdqa 80(%ebx),%xmm1 + pshufb %xmm2,%xmm4 + pshufb %xmm5,%xmm0 + pshufb %xmm3,%xmm1 + pxor %xmm4,%xmm0 + addl $16,%edx + palignr $12,%xmm5,%xmm5 + pxor %xmm1,%xmm0 + 
subl $1,%eax +.L002dec_entry: + movdqa %xmm6,%xmm1 + movdqa -32(%ebp),%xmm2 + pandn %xmm0,%xmm1 + pand %xmm6,%xmm0 + psrld $4,%xmm1 + pshufb %xmm0,%xmm2 + movdqa %xmm7,%xmm3 + pxor %xmm1,%xmm0 + pshufb %xmm1,%xmm3 + movdqa %xmm7,%xmm4 + pxor %xmm2,%xmm3 + pshufb %xmm0,%xmm4 + pxor %xmm2,%xmm4 + movdqa %xmm7,%xmm2 + pshufb %xmm3,%xmm2 + movdqa %xmm7,%xmm3 + pxor %xmm0,%xmm2 + pshufb %xmm4,%xmm3 + movdqu (%edx),%xmm0 + pxor %xmm1,%xmm3 + jnz .L003dec_loop + movdqa 96(%ebx),%xmm4 + pshufb %xmm2,%xmm4 + pxor %xmm0,%xmm4 + movdqa 112(%ebx),%xmm0 + movdqa (%ecx),%xmm2 + pshufb %xmm3,%xmm0 + pxor %xmm4,%xmm0 + pshufb %xmm2,%xmm0 + ret +.size _vpaes_decrypt_core,.-_vpaes_decrypt_core +.hidden _vpaes_schedule_core +.type _vpaes_schedule_core,@function +.align 16 +_vpaes_schedule_core: + addl (%esp),%ebp + movdqu (%esi),%xmm0 + movdqa 320(%ebp),%xmm2 + movdqa %xmm0,%xmm3 + leal (%ebp),%ebx + movdqa %xmm2,4(%esp) + call _vpaes_schedule_transform + movdqa %xmm0,%xmm7 + testl %edi,%edi + jnz .L004schedule_am_decrypting + movdqu %xmm0,(%edx) + jmp .L005schedule_go +.L004schedule_am_decrypting: + movdqa 256(%ebp,%ecx,1),%xmm1 + pshufb %xmm1,%xmm3 + movdqu %xmm3,(%edx) + xorl $48,%ecx +.L005schedule_go: + cmpl $192,%eax + ja .L006schedule_256 + je .L007schedule_192 +.L008schedule_128: + movl $10,%eax +.L009loop_schedule_128: + call _vpaes_schedule_round + decl %eax + jz .L010schedule_mangle_last + call _vpaes_schedule_mangle + jmp .L009loop_schedule_128 +.align 16 +.L007schedule_192: + movdqu 8(%esi),%xmm0 + call _vpaes_schedule_transform + movdqa %xmm0,%xmm6 + pxor %xmm4,%xmm4 + movhlps %xmm4,%xmm6 + movl $4,%eax +.L011loop_schedule_192: + call _vpaes_schedule_round + palignr $8,%xmm6,%xmm0 + call _vpaes_schedule_mangle + call _vpaes_schedule_192_smear + call _vpaes_schedule_mangle + call _vpaes_schedule_round + decl %eax + jz .L010schedule_mangle_last + call _vpaes_schedule_mangle + call _vpaes_schedule_192_smear + jmp .L011loop_schedule_192 +.align 16 +.L006schedule_256: + 
movdqu 16(%esi),%xmm0 + call _vpaes_schedule_transform + movl $7,%eax +.L012loop_schedule_256: + call _vpaes_schedule_mangle + movdqa %xmm0,%xmm6 + call _vpaes_schedule_round + decl %eax + jz .L010schedule_mangle_last + call _vpaes_schedule_mangle + pshufd $255,%xmm0,%xmm0 + movdqa %xmm7,20(%esp) + movdqa %xmm6,%xmm7 + call .L_vpaes_schedule_low_round + movdqa 20(%esp),%xmm7 + jmp .L012loop_schedule_256 +.align 16 +.L010schedule_mangle_last: + leal 384(%ebp),%ebx + testl %edi,%edi + jnz .L013schedule_mangle_last_dec + movdqa 256(%ebp,%ecx,1),%xmm1 + pshufb %xmm1,%xmm0 + leal 352(%ebp),%ebx + addl $32,%edx +.L013schedule_mangle_last_dec: + addl $-16,%edx + pxor 336(%ebp),%xmm0 + call _vpaes_schedule_transform + movdqu %xmm0,(%edx) + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + ret +.size _vpaes_schedule_core,.-_vpaes_schedule_core +.hidden _vpaes_schedule_192_smear +.type _vpaes_schedule_192_smear,@function +.align 16 +_vpaes_schedule_192_smear: + pshufd $128,%xmm6,%xmm1 + pshufd $254,%xmm7,%xmm0 + pxor %xmm1,%xmm6 + pxor %xmm1,%xmm1 + pxor %xmm0,%xmm6 + movdqa %xmm6,%xmm0 + movhlps %xmm1,%xmm6 + ret +.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear +.hidden _vpaes_schedule_round +.type _vpaes_schedule_round,@function +.align 16 +_vpaes_schedule_round: + movdqa 8(%esp),%xmm2 + pxor %xmm1,%xmm1 + palignr $15,%xmm2,%xmm1 + palignr $15,%xmm2,%xmm2 + pxor %xmm1,%xmm7 + pshufd $255,%xmm0,%xmm0 + palignr $1,%xmm0,%xmm0 + movdqa %xmm2,8(%esp) +.L_vpaes_schedule_low_round: + movdqa %xmm7,%xmm1 + pslldq $4,%xmm7 + pxor %xmm1,%xmm7 + movdqa %xmm7,%xmm1 + pslldq $8,%xmm7 + pxor %xmm1,%xmm7 + pxor 336(%ebp),%xmm7 + movdqa -16(%ebp),%xmm4 + movdqa -48(%ebp),%xmm5 + movdqa %xmm4,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm4,%xmm0 + movdqa -32(%ebp),%xmm2 + pshufb %xmm0,%xmm2 + pxor %xmm1,%xmm0 + movdqa %xmm5,%xmm3 + pshufb %xmm1,%xmm3 + pxor %xmm2,%xmm3 
+ movdqa %xmm5,%xmm4 + pshufb %xmm0,%xmm4 + pxor %xmm2,%xmm4 + movdqa %xmm5,%xmm2 + pshufb %xmm3,%xmm2 + pxor %xmm0,%xmm2 + movdqa %xmm5,%xmm3 + pshufb %xmm4,%xmm3 + pxor %xmm1,%xmm3 + movdqa 32(%ebp),%xmm4 + pshufb %xmm2,%xmm4 + movdqa 48(%ebp),%xmm0 + pshufb %xmm3,%xmm0 + pxor %xmm4,%xmm0 + pxor %xmm7,%xmm0 + movdqa %xmm0,%xmm7 + ret +.size _vpaes_schedule_round,.-_vpaes_schedule_round +.hidden _vpaes_schedule_transform +.type _vpaes_schedule_transform,@function +.align 16 +_vpaes_schedule_transform: + movdqa -16(%ebp),%xmm2 + movdqa %xmm2,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm2,%xmm0 + movdqa (%ebx),%xmm2 + pshufb %xmm0,%xmm2 + movdqa 16(%ebx),%xmm0 + pshufb %xmm1,%xmm0 + pxor %xmm2,%xmm0 + ret +.size _vpaes_schedule_transform,.-_vpaes_schedule_transform +.hidden _vpaes_schedule_mangle +.type _vpaes_schedule_mangle,@function +.align 16 +_vpaes_schedule_mangle: + movdqa %xmm0,%xmm4 + movdqa 128(%ebp),%xmm5 + testl %edi,%edi + jnz .L014schedule_mangle_dec + addl $16,%edx + pxor 336(%ebp),%xmm4 + pshufb %xmm5,%xmm4 + movdqa %xmm4,%xmm3 + pshufb %xmm5,%xmm4 + pxor %xmm4,%xmm3 + pshufb %xmm5,%xmm4 + pxor %xmm4,%xmm3 + jmp .L015schedule_mangle_both +.align 16 +.L014schedule_mangle_dec: + movdqa -16(%ebp),%xmm2 + leal 416(%ebp),%esi + movdqa %xmm2,%xmm1 + pandn %xmm4,%xmm1 + psrld $4,%xmm1 + pand %xmm2,%xmm4 + movdqa (%esi),%xmm2 + pshufb %xmm4,%xmm2 + movdqa 16(%esi),%xmm3 + pshufb %xmm1,%xmm3 + pxor %xmm2,%xmm3 + pshufb %xmm5,%xmm3 + movdqa 32(%esi),%xmm2 + pshufb %xmm4,%xmm2 + pxor %xmm3,%xmm2 + movdqa 48(%esi),%xmm3 + pshufb %xmm1,%xmm3 + pxor %xmm2,%xmm3 + pshufb %xmm5,%xmm3 + movdqa 64(%esi),%xmm2 + pshufb %xmm4,%xmm2 + pxor %xmm3,%xmm2 + movdqa 80(%esi),%xmm3 + pshufb %xmm1,%xmm3 + pxor %xmm2,%xmm3 + pshufb %xmm5,%xmm3 + movdqa 96(%esi),%xmm2 + pshufb %xmm4,%xmm2 + pxor %xmm3,%xmm2 + movdqa 112(%esi),%xmm3 + pshufb %xmm1,%xmm3 + pxor %xmm2,%xmm3 + addl $-16,%edx +.L015schedule_mangle_both: + movdqa 256(%ebp,%ecx,1),%xmm1 + pshufb %xmm1,%xmm3 + 
addl $-16,%ecx + andl $48,%ecx + movdqu %xmm3,(%edx) + ret +.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle +.globl vpaes_set_encrypt_key +.hidden vpaes_set_encrypt_key +.type vpaes_set_encrypt_key,@function +.align 16 +vpaes_set_encrypt_key: +.L_vpaes_set_encrypt_key_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call .L016pic_for_function_hit +.L016pic_for_function_hit: + popl %ebx + leal BORINGSSL_function_hit+5-.L016pic_for_function_hit(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif + movl 20(%esp),%esi + leal -56(%esp),%ebx + movl 24(%esp),%eax + andl $-16,%ebx + movl 28(%esp),%edx + xchgl %esp,%ebx + movl %ebx,48(%esp) + movl %eax,%ebx + shrl $5,%ebx + addl $5,%ebx + movl %ebx,240(%edx) + movl $48,%ecx + movl $0,%edi + leal .L_vpaes_consts+0x30-.L017pic_point,%ebp + call _vpaes_schedule_core +.L017pic_point: + movl 48(%esp),%esp + xorl %eax,%eax + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size vpaes_set_encrypt_key,.-.L_vpaes_set_encrypt_key_begin +.globl vpaes_set_decrypt_key +.hidden vpaes_set_decrypt_key +.type vpaes_set_decrypt_key,@function +.align 16 +vpaes_set_decrypt_key: +.L_vpaes_set_decrypt_key_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + leal -56(%esp),%ebx + movl 24(%esp),%eax + andl $-16,%ebx + movl 28(%esp),%edx + xchgl %esp,%ebx + movl %ebx,48(%esp) + movl %eax,%ebx + shrl $5,%ebx + addl $5,%ebx + movl %ebx,240(%edx) + shll $4,%ebx + leal 16(%edx,%ebx,1),%edx + movl $1,%edi + movl %eax,%ecx + shrl $1,%ecx + andl $32,%ecx + xorl $32,%ecx + leal .L_vpaes_consts+0x30-.L018pic_point,%ebp + call _vpaes_schedule_core +.L018pic_point: + movl 48(%esp),%esp + xorl %eax,%eax + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size vpaes_set_decrypt_key,.-.L_vpaes_set_decrypt_key_begin +.globl vpaes_encrypt +.hidden vpaes_encrypt +.type vpaes_encrypt,@function +.align 16 +vpaes_encrypt: 
+.L_vpaes_encrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi +#ifdef BORINGSSL_DISPATCH_TEST + pushl %ebx + pushl %edx + call .L019pic_for_function_hit +.L019pic_for_function_hit: + popl %ebx + leal BORINGSSL_function_hit+4-.L019pic_for_function_hit(%ebx),%ebx + movl $1,%edx + movb %dl,(%ebx) + popl %edx + popl %ebx +#endif + leal .L_vpaes_consts+0x30-.L020pic_point,%ebp + call _vpaes_preheat +.L020pic_point: + movl 20(%esp),%esi + leal -56(%esp),%ebx + movl 24(%esp),%edi + andl $-16,%ebx + movl 28(%esp),%edx + xchgl %esp,%ebx + movl %ebx,48(%esp) + movdqu (%esi),%xmm0 + call _vpaes_encrypt_core + movdqu %xmm0,(%edi) + movl 48(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size vpaes_encrypt,.-.L_vpaes_encrypt_begin +.globl vpaes_decrypt +.hidden vpaes_decrypt +.type vpaes_decrypt,@function +.align 16 +vpaes_decrypt: +.L_vpaes_decrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + leal .L_vpaes_consts+0x30-.L021pic_point,%ebp + call _vpaes_preheat +.L021pic_point: + movl 20(%esp),%esi + leal -56(%esp),%ebx + movl 24(%esp),%edi + andl $-16,%ebx + movl 28(%esp),%edx + xchgl %esp,%ebx + movl %ebx,48(%esp) + movdqu (%esi),%xmm0 + call _vpaes_decrypt_core + movdqu %xmm0,(%edi) + movl 48(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size vpaes_decrypt,.-.L_vpaes_decrypt_begin +.globl vpaes_cbc_encrypt +.hidden vpaes_cbc_encrypt +.type vpaes_cbc_encrypt,@function +.align 16 +vpaes_cbc_encrypt: +.L_vpaes_cbc_encrypt_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 20(%esp),%esi + movl 24(%esp),%edi + movl 28(%esp),%eax + movl 32(%esp),%edx + subl $16,%eax + jc .L022cbc_abort + leal -56(%esp),%ebx + movl 36(%esp),%ebp + andl $-16,%ebx + movl 40(%esp),%ecx + xchgl %esp,%ebx + movdqu (%ebp),%xmm1 + subl %esi,%edi + movl %ebx,48(%esp) + movl %edi,(%esp) + movl %edx,4(%esp) + movl %ebp,8(%esp) + movl %eax,%edi + leal .L_vpaes_consts+0x30-.L023pic_point,%ebp + call _vpaes_preheat 
+.L023pic_point: + cmpl $0,%ecx + je .L024cbc_dec_loop + jmp .L025cbc_enc_loop +.align 16 +.L025cbc_enc_loop: + movdqu (%esi),%xmm0 + pxor %xmm1,%xmm0 + call _vpaes_encrypt_core + movl (%esp),%ebx + movl 4(%esp),%edx + movdqa %xmm0,%xmm1 + movdqu %xmm0,(%ebx,%esi,1) + leal 16(%esi),%esi + subl $16,%edi + jnc .L025cbc_enc_loop + jmp .L026cbc_done +.align 16 +.L024cbc_dec_loop: + movdqu (%esi),%xmm0 + movdqa %xmm1,16(%esp) + movdqa %xmm0,32(%esp) + call _vpaes_decrypt_core + movl (%esp),%ebx + movl 4(%esp),%edx + pxor 16(%esp),%xmm0 + movdqa 32(%esp),%xmm1 + movdqu %xmm0,(%ebx,%esi,1) + leal 16(%esi),%esi + subl $16,%edi + jnc .L024cbc_dec_loop +.L026cbc_done: + movl 8(%esp),%ebx + movl 48(%esp),%esp + movdqu %xmm1,(%ebx) +.L022cbc_abort: + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size vpaes_cbc_encrypt,.-.L_vpaes_cbc_encrypt_begin +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) diff --git a/third_party/boringssl/gen/bcm/vpaes-x86-win.asm b/third_party/boringssl/gen/bcm/vpaes-x86-win.asm new file mode 100644 index 00000000..30ba96cf --- /dev/null +++ b/third_party/boringssl/gen/bcm/vpaes-x86-win.asm @@ -0,0 +1,679 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_internal_x86_win_asm.inc" +%endif +%ifidn __OUTPUT_FORMAT__, win32 +%ifidn __OUTPUT_FORMAT__,obj +section code use32 class=code align=64 +%elifidn __OUTPUT_FORMAT__,win32 +$@feat.00 equ 1 +section .text code align=64 +%else +section .text code +%endif +%ifdef BORINGSSL_DISPATCH_TEST +extern _BORINGSSL_function_hit +%endif +align 64 +L$_vpaes_consts: +dd 218628480,235210255,168496130,67568393 +dd 252381056,17041926,33884169,51187212 +dd 252645135,252645135,252645135,252645135 +dd 1512730624,3266504856,1377990664,3401244816 +dd 830229760,1275146365,2969422977,3447763452 +dd 3411033600,2979783055,338359620,2782886510 +dd 4209124096,907596821,221174255,1006095553 +dd 191964160,3799684038,3164090317,1589111125 +dd 182528256,1777043520,2877432650,3265356744 +dd 1874708224,3503451415,3305285752,363511674 +dd 1606117888,3487855781,1093350906,2384367825 +dd 197121,67569157,134941193,202313229 +dd 67569157,134941193,202313229,197121 +dd 134941193,202313229,197121,67569157 +dd 202313229,197121,67569157,134941193 +dd 33619971,100992007,168364043,235736079 +dd 235736079,33619971,100992007,168364043 +dd 168364043,235736079,33619971,100992007 +dd 100992007,168364043,235736079,33619971 +dd 50462976,117835012,185207048,252579084 +dd 252314880,51251460,117574920,184942860 +dd 184682752,252054788,50987272,118359308 +dd 118099200,185467140,251790600,50727180 +dd 2946363062,528716217,1300004225,1881839624 +dd 1532713819,1532713819,1532713819,1532713819 +dd 3602276352,4288629033,3737020424,4153884961 +dd 1354558464,32357713,2958822624,3775749553 +dd 1201988352,132424512,1572796698,503232858 +dd 2213177600,1597421020,4103937655,675398315 +dd 2749646592,4273543773,1511898873,121693092 +dd 3040248576,1103263732,2871565598,1608280554 +dd 2236667136,2588920351,482954393,64377734 +dd 3069987328,291237287,2117370568,3650299247 +dd 533321216,3573750986,2572112006,1401264716 +dd 1339849704,2721158661,548607111,3445553514 +dd 
2128193280,3054596040,2183486460,1257083700 +dd 655635200,1165381986,3923443150,2344132524 +dd 190078720,256924420,290342170,357187870 +dd 1610966272,2263057382,4103205268,309794674 +dd 2592527872,2233205587,1335446729,3402964816 +dd 3973531904,3225098121,3002836325,1918774430 +dd 3870401024,2102906079,2284471353,4117666579 +dd 617007872,1021508343,366931923,691083277 +dd 2528395776,3491914898,2968704004,1613121270 +dd 3445188352,3247741094,844474987,4093578302 +dd 651481088,1190302358,1689581232,574775300 +dd 4289380608,206939853,2555985458,2489840491 +dd 2130264064,327674451,3566485037,3349835193 +dd 2470714624,316102159,3636825756,3393945945 +db 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105 +db 111,110,32,65,69,83,32,102,111,114,32,120,56,54,47,83 +db 83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117 +db 114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105 +db 118,101,114,115,105,116,121,41,0 +align 64 +align 16 +__vpaes_preheat: + add ebp,DWORD [esp] + movdqa xmm7,[ebp-48] + movdqa xmm6,[ebp-16] + ret +align 16 +__vpaes_encrypt_core: + mov ecx,16 + mov eax,DWORD [240+edx] + movdqa xmm1,xmm6 + movdqa xmm2,[ebp] + pandn xmm1,xmm0 + pand xmm0,xmm6 + movdqu xmm5,[edx] + pshufb xmm2,xmm0 + movdqa xmm0,[16+ebp] + pxor xmm2,xmm5 + psrld xmm1,4 + add edx,16 + pshufb xmm0,xmm1 + lea ebx,[192+ebp] + pxor xmm0,xmm2 + jmp NEAR L$000enc_entry +align 16 +L$001enc_loop: + movdqa xmm4,[32+ebp] + movdqa xmm0,[48+ebp] + pshufb xmm4,xmm2 + pshufb xmm0,xmm3 + pxor xmm4,xmm5 + movdqa xmm5,[64+ebp] + pxor xmm0,xmm4 + movdqa xmm1,[ecx*1+ebx-64] + pshufb xmm5,xmm2 + movdqa xmm2,[80+ebp] + movdqa xmm4,[ecx*1+ebx] + pshufb xmm2,xmm3 + movdqa xmm3,xmm0 + pxor xmm2,xmm5 + pshufb xmm0,xmm1 + add edx,16 + pxor xmm0,xmm2 + pshufb xmm3,xmm4 + add ecx,16 + pxor xmm3,xmm0 + pshufb xmm0,xmm1 + and ecx,48 + sub eax,1 + pxor xmm0,xmm3 +L$000enc_entry: + movdqa xmm1,xmm6 + movdqa xmm5,[ebp-32] + pandn xmm1,xmm0 + psrld xmm1,4 + pand xmm0,xmm6 + pshufb xmm5,xmm0 + movdqa 
xmm3,xmm7 + pxor xmm0,xmm1 + pshufb xmm3,xmm1 + movdqa xmm4,xmm7 + pxor xmm3,xmm5 + pshufb xmm4,xmm0 + movdqa xmm2,xmm7 + pxor xmm4,xmm5 + pshufb xmm2,xmm3 + movdqa xmm3,xmm7 + pxor xmm2,xmm0 + pshufb xmm3,xmm4 + movdqu xmm5,[edx] + pxor xmm3,xmm1 + jnz NEAR L$001enc_loop + movdqa xmm4,[96+ebp] + movdqa xmm0,[112+ebp] + pshufb xmm4,xmm2 + pxor xmm4,xmm5 + pshufb xmm0,xmm3 + movdqa xmm1,[64+ecx*1+ebx] + pxor xmm0,xmm4 + pshufb xmm0,xmm1 + ret +align 16 +__vpaes_decrypt_core: + lea ebx,[608+ebp] + mov eax,DWORD [240+edx] + movdqa xmm1,xmm6 + movdqa xmm2,[ebx-64] + pandn xmm1,xmm0 + mov ecx,eax + psrld xmm1,4 + movdqu xmm5,[edx] + shl ecx,4 + pand xmm0,xmm6 + pshufb xmm2,xmm0 + movdqa xmm0,[ebx-48] + xor ecx,48 + pshufb xmm0,xmm1 + and ecx,48 + pxor xmm2,xmm5 + movdqa xmm5,[176+ebp] + pxor xmm0,xmm2 + add edx,16 + lea ecx,[ecx*1+ebx-352] + jmp NEAR L$002dec_entry +align 16 +L$003dec_loop: + movdqa xmm4,[ebx-32] + movdqa xmm1,[ebx-16] + pshufb xmm4,xmm2 + pshufb xmm1,xmm3 + pxor xmm0,xmm4 + movdqa xmm4,[ebx] + pxor xmm0,xmm1 + movdqa xmm1,[16+ebx] + pshufb xmm4,xmm2 + pshufb xmm0,xmm5 + pshufb xmm1,xmm3 + pxor xmm0,xmm4 + movdqa xmm4,[32+ebx] + pxor xmm0,xmm1 + movdqa xmm1,[48+ebx] + pshufb xmm4,xmm2 + pshufb xmm0,xmm5 + pshufb xmm1,xmm3 + pxor xmm0,xmm4 + movdqa xmm4,[64+ebx] + pxor xmm0,xmm1 + movdqa xmm1,[80+ebx] + pshufb xmm4,xmm2 + pshufb xmm0,xmm5 + pshufb xmm1,xmm3 + pxor xmm0,xmm4 + add edx,16 + palignr xmm5,xmm5,12 + pxor xmm0,xmm1 + sub eax,1 +L$002dec_entry: + movdqa xmm1,xmm6 + movdqa xmm2,[ebp-32] + pandn xmm1,xmm0 + pand xmm0,xmm6 + psrld xmm1,4 + pshufb xmm2,xmm0 + movdqa xmm3,xmm7 + pxor xmm0,xmm1 + pshufb xmm3,xmm1 + movdqa xmm4,xmm7 + pxor xmm3,xmm2 + pshufb xmm4,xmm0 + pxor xmm4,xmm2 + movdqa xmm2,xmm7 + pshufb xmm2,xmm3 + movdqa xmm3,xmm7 + pxor xmm2,xmm0 + pshufb xmm3,xmm4 + movdqu xmm0,[edx] + pxor xmm3,xmm1 + jnz NEAR L$003dec_loop + movdqa xmm4,[96+ebx] + pshufb xmm4,xmm2 + pxor xmm4,xmm0 + movdqa xmm0,[112+ebx] + movdqa xmm2,[ecx] + pshufb 
xmm0,xmm3 + pxor xmm0,xmm4 + pshufb xmm0,xmm2 + ret +align 16 +__vpaes_schedule_core: + add ebp,DWORD [esp] + movdqu xmm0,[esi] + movdqa xmm2,[320+ebp] + movdqa xmm3,xmm0 + lea ebx,[ebp] + movdqa [4+esp],xmm2 + call __vpaes_schedule_transform + movdqa xmm7,xmm0 + test edi,edi + jnz NEAR L$004schedule_am_decrypting + movdqu [edx],xmm0 + jmp NEAR L$005schedule_go +L$004schedule_am_decrypting: + movdqa xmm1,[256+ecx*1+ebp] + pshufb xmm3,xmm1 + movdqu [edx],xmm3 + xor ecx,48 +L$005schedule_go: + cmp eax,192 + ja NEAR L$006schedule_256 + je NEAR L$007schedule_192 +L$008schedule_128: + mov eax,10 +L$009loop_schedule_128: + call __vpaes_schedule_round + dec eax + jz NEAR L$010schedule_mangle_last + call __vpaes_schedule_mangle + jmp NEAR L$009loop_schedule_128 +align 16 +L$007schedule_192: + movdqu xmm0,[8+esi] + call __vpaes_schedule_transform + movdqa xmm6,xmm0 + pxor xmm4,xmm4 + movhlps xmm6,xmm4 + mov eax,4 +L$011loop_schedule_192: + call __vpaes_schedule_round + palignr xmm0,xmm6,8 + call __vpaes_schedule_mangle + call __vpaes_schedule_192_smear + call __vpaes_schedule_mangle + call __vpaes_schedule_round + dec eax + jz NEAR L$010schedule_mangle_last + call __vpaes_schedule_mangle + call __vpaes_schedule_192_smear + jmp NEAR L$011loop_schedule_192 +align 16 +L$006schedule_256: + movdqu xmm0,[16+esi] + call __vpaes_schedule_transform + mov eax,7 +L$012loop_schedule_256: + call __vpaes_schedule_mangle + movdqa xmm6,xmm0 + call __vpaes_schedule_round + dec eax + jz NEAR L$010schedule_mangle_last + call __vpaes_schedule_mangle + pshufd xmm0,xmm0,255 + movdqa [20+esp],xmm7 + movdqa xmm7,xmm6 + call L$_vpaes_schedule_low_round + movdqa xmm7,[20+esp] + jmp NEAR L$012loop_schedule_256 +align 16 +L$010schedule_mangle_last: + lea ebx,[384+ebp] + test edi,edi + jnz NEAR L$013schedule_mangle_last_dec + movdqa xmm1,[256+ecx*1+ebp] + pshufb xmm0,xmm1 + lea ebx,[352+ebp] + add edx,32 +L$013schedule_mangle_last_dec: + add edx,-16 + pxor xmm0,[336+ebp] + call 
__vpaes_schedule_transform + movdqu [edx],xmm0 + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + pxor xmm6,xmm6 + pxor xmm7,xmm7 + ret +align 16 +__vpaes_schedule_192_smear: + pshufd xmm1,xmm6,128 + pshufd xmm0,xmm7,254 + pxor xmm6,xmm1 + pxor xmm1,xmm1 + pxor xmm6,xmm0 + movdqa xmm0,xmm6 + movhlps xmm6,xmm1 + ret +align 16 +__vpaes_schedule_round: + movdqa xmm2,[8+esp] + pxor xmm1,xmm1 + palignr xmm1,xmm2,15 + palignr xmm2,xmm2,15 + pxor xmm7,xmm1 + pshufd xmm0,xmm0,255 + palignr xmm0,xmm0,1 + movdqa [8+esp],xmm2 +L$_vpaes_schedule_low_round: + movdqa xmm1,xmm7 + pslldq xmm7,4 + pxor xmm7,xmm1 + movdqa xmm1,xmm7 + pslldq xmm7,8 + pxor xmm7,xmm1 + pxor xmm7,[336+ebp] + movdqa xmm4,[ebp-16] + movdqa xmm5,[ebp-48] + movdqa xmm1,xmm4 + pandn xmm1,xmm0 + psrld xmm1,4 + pand xmm0,xmm4 + movdqa xmm2,[ebp-32] + pshufb xmm2,xmm0 + pxor xmm0,xmm1 + movdqa xmm3,xmm5 + pshufb xmm3,xmm1 + pxor xmm3,xmm2 + movdqa xmm4,xmm5 + pshufb xmm4,xmm0 + pxor xmm4,xmm2 + movdqa xmm2,xmm5 + pshufb xmm2,xmm3 + pxor xmm2,xmm0 + movdqa xmm3,xmm5 + pshufb xmm3,xmm4 + pxor xmm3,xmm1 + movdqa xmm4,[32+ebp] + pshufb xmm4,xmm2 + movdqa xmm0,[48+ebp] + pshufb xmm0,xmm3 + pxor xmm0,xmm4 + pxor xmm0,xmm7 + movdqa xmm7,xmm0 + ret +align 16 +__vpaes_schedule_transform: + movdqa xmm2,[ebp-16] + movdqa xmm1,xmm2 + pandn xmm1,xmm0 + psrld xmm1,4 + pand xmm0,xmm2 + movdqa xmm2,[ebx] + pshufb xmm2,xmm0 + movdqa xmm0,[16+ebx] + pshufb xmm0,xmm1 + pxor xmm0,xmm2 + ret +align 16 +__vpaes_schedule_mangle: + movdqa xmm4,xmm0 + movdqa xmm5,[128+ebp] + test edi,edi + jnz NEAR L$014schedule_mangle_dec + add edx,16 + pxor xmm4,[336+ebp] + pshufb xmm4,xmm5 + movdqa xmm3,xmm4 + pshufb xmm4,xmm5 + pxor xmm3,xmm4 + pshufb xmm4,xmm5 + pxor xmm3,xmm4 + jmp NEAR L$015schedule_mangle_both +align 16 +L$014schedule_mangle_dec: + movdqa xmm2,[ebp-16] + lea esi,[416+ebp] + movdqa xmm1,xmm2 + pandn xmm1,xmm4 + psrld xmm1,4 + pand xmm4,xmm2 + movdqa xmm2,[esi] + pshufb 
xmm2,xmm4 + movdqa xmm3,[16+esi] + pshufb xmm3,xmm1 + pxor xmm3,xmm2 + pshufb xmm3,xmm5 + movdqa xmm2,[32+esi] + pshufb xmm2,xmm4 + pxor xmm2,xmm3 + movdqa xmm3,[48+esi] + pshufb xmm3,xmm1 + pxor xmm3,xmm2 + pshufb xmm3,xmm5 + movdqa xmm2,[64+esi] + pshufb xmm2,xmm4 + pxor xmm2,xmm3 + movdqa xmm3,[80+esi] + pshufb xmm3,xmm1 + pxor xmm3,xmm2 + pshufb xmm3,xmm5 + movdqa xmm2,[96+esi] + pshufb xmm2,xmm4 + pxor xmm2,xmm3 + movdqa xmm3,[112+esi] + pshufb xmm3,xmm1 + pxor xmm3,xmm2 + add edx,-16 +L$015schedule_mangle_both: + movdqa xmm1,[256+ecx*1+ebp] + pshufb xmm3,xmm1 + add ecx,-16 + and ecx,48 + movdqu [edx],xmm3 + ret +global _vpaes_set_encrypt_key +align 16 +_vpaes_set_encrypt_key: +L$_vpaes_set_encrypt_key_begin: + push ebp + push ebx + push esi + push edi +%ifdef BORINGSSL_DISPATCH_TEST + push ebx + push edx + call L$016pic_for_function_hit +L$016pic_for_function_hit: + pop ebx + lea ebx,[(_BORINGSSL_function_hit+5-L$016pic_for_function_hit)+ebx] + mov edx,1 + mov BYTE [ebx],dl + pop edx + pop ebx +%endif + mov esi,DWORD [20+esp] + lea ebx,[esp-56] + mov eax,DWORD [24+esp] + and ebx,-16 + mov edx,DWORD [28+esp] + xchg ebx,esp + mov DWORD [48+esp],ebx + mov ebx,eax + shr ebx,5 + add ebx,5 + mov DWORD [240+edx],ebx + mov ecx,48 + mov edi,0 + lea ebp,[(L$_vpaes_consts+0x30-L$017pic_point)] + call __vpaes_schedule_core +L$017pic_point: + mov esp,DWORD [48+esp] + xor eax,eax + pop edi + pop esi + pop ebx + pop ebp + ret +global _vpaes_set_decrypt_key +align 16 +_vpaes_set_decrypt_key: +L$_vpaes_set_decrypt_key_begin: + push ebp + push ebx + push esi + push edi + mov esi,DWORD [20+esp] + lea ebx,[esp-56] + mov eax,DWORD [24+esp] + and ebx,-16 + mov edx,DWORD [28+esp] + xchg ebx,esp + mov DWORD [48+esp],ebx + mov ebx,eax + shr ebx,5 + add ebx,5 + mov DWORD [240+edx],ebx + shl ebx,4 + lea edx,[16+ebx*1+edx] + mov edi,1 + mov ecx,eax + shr ecx,1 + and ecx,32 + xor ecx,32 + lea ebp,[(L$_vpaes_consts+0x30-L$018pic_point)] + call __vpaes_schedule_core +L$018pic_point: + mov 
esp,DWORD [48+esp] + xor eax,eax + pop edi + pop esi + pop ebx + pop ebp + ret +global _vpaes_encrypt +align 16 +_vpaes_encrypt: +L$_vpaes_encrypt_begin: + push ebp + push ebx + push esi + push edi +%ifdef BORINGSSL_DISPATCH_TEST + push ebx + push edx + call L$019pic_for_function_hit +L$019pic_for_function_hit: + pop ebx + lea ebx,[(_BORINGSSL_function_hit+4-L$019pic_for_function_hit)+ebx] + mov edx,1 + mov BYTE [ebx],dl + pop edx + pop ebx +%endif + lea ebp,[(L$_vpaes_consts+0x30-L$020pic_point)] + call __vpaes_preheat +L$020pic_point: + mov esi,DWORD [20+esp] + lea ebx,[esp-56] + mov edi,DWORD [24+esp] + and ebx,-16 + mov edx,DWORD [28+esp] + xchg ebx,esp + mov DWORD [48+esp],ebx + movdqu xmm0,[esi] + call __vpaes_encrypt_core + movdqu [edi],xmm0 + mov esp,DWORD [48+esp] + pop edi + pop esi + pop ebx + pop ebp + ret +global _vpaes_decrypt +align 16 +_vpaes_decrypt: +L$_vpaes_decrypt_begin: + push ebp + push ebx + push esi + push edi + lea ebp,[(L$_vpaes_consts+0x30-L$021pic_point)] + call __vpaes_preheat +L$021pic_point: + mov esi,DWORD [20+esp] + lea ebx,[esp-56] + mov edi,DWORD [24+esp] + and ebx,-16 + mov edx,DWORD [28+esp] + xchg ebx,esp + mov DWORD [48+esp],ebx + movdqu xmm0,[esi] + call __vpaes_decrypt_core + movdqu [edi],xmm0 + mov esp,DWORD [48+esp] + pop edi + pop esi + pop ebx + pop ebp + ret +global _vpaes_cbc_encrypt +align 16 +_vpaes_cbc_encrypt: +L$_vpaes_cbc_encrypt_begin: + push ebp + push ebx + push esi + push edi + mov esi,DWORD [20+esp] + mov edi,DWORD [24+esp] + mov eax,DWORD [28+esp] + mov edx,DWORD [32+esp] + sub eax,16 + jc NEAR L$022cbc_abort + lea ebx,[esp-56] + mov ebp,DWORD [36+esp] + and ebx,-16 + mov ecx,DWORD [40+esp] + xchg ebx,esp + movdqu xmm1,[ebp] + sub edi,esi + mov DWORD [48+esp],ebx + mov DWORD [esp],edi + mov DWORD [4+esp],edx + mov DWORD [8+esp],ebp + mov edi,eax + lea ebp,[(L$_vpaes_consts+0x30-L$023pic_point)] + call __vpaes_preheat +L$023pic_point: + cmp ecx,0 + je NEAR L$024cbc_dec_loop + jmp NEAR L$025cbc_enc_loop 
+align 16 +L$025cbc_enc_loop: + movdqu xmm0,[esi] + pxor xmm0,xmm1 + call __vpaes_encrypt_core + mov ebx,DWORD [esp] + mov edx,DWORD [4+esp] + movdqa xmm1,xmm0 + movdqu [esi*1+ebx],xmm0 + lea esi,[16+esi] + sub edi,16 + jnc NEAR L$025cbc_enc_loop + jmp NEAR L$026cbc_done +align 16 +L$024cbc_dec_loop: + movdqu xmm0,[esi] + movdqa [16+esp],xmm1 + movdqa [32+esp],xmm0 + call __vpaes_decrypt_core + mov ebx,DWORD [esp] + mov edx,DWORD [4+esp] + pxor xmm0,[16+esp] + movdqa xmm1,[32+esp] + movdqu [esi*1+ebx],xmm0 + lea esi,[16+esi] + sub edi,16 + jnc NEAR L$024cbc_dec_loop +L$026cbc_done: + mov ebx,DWORD [8+esp] + mov esp,DWORD [48+esp] + movdqu [ebx],xmm1 +L$022cbc_abort: + pop edi + pop esi + pop ebx + pop ebp + ret +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/third_party/boringssl/gen/bcm/vpaes-x86_64-apple.S b/third_party/boringssl/gen/bcm/vpaes-x86_64-apple.S new file mode 100644 index 00000000..bfcc030f --- /dev/null +++ b/third_party/boringssl/gen/bcm/vpaes-x86_64-apple.S @@ -0,0 +1,1131 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.text + + + + + + + + + + + + + + + + + +.p2align 4 +_vpaes_encrypt_core: + + movq %rdx,%r9 + movq $16,%r11 + movl 240(%rdx),%eax + movdqa %xmm9,%xmm1 + movdqa L$k_ipt(%rip),%xmm2 + pandn %xmm0,%xmm1 + movdqu (%r9),%xmm5 + psrld $4,%xmm1 + pand %xmm9,%xmm0 + pshufb %xmm0,%xmm2 + movdqa L$k_ipt+16(%rip),%xmm0 + pshufb %xmm1,%xmm0 + pxor %xmm5,%xmm2 + addq $16,%r9 + pxor %xmm2,%xmm0 + leaq L$k_mc_backward(%rip),%r10 + jmp L$enc_entry + +.p2align 4 +L$enc_loop: + + movdqa %xmm13,%xmm4 + movdqa %xmm12,%xmm0 + pshufb %xmm2,%xmm4 + pshufb %xmm3,%xmm0 + pxor %xmm5,%xmm4 + movdqa %xmm15,%xmm5 + pxor %xmm4,%xmm0 + movdqa -64(%r11,%r10,1),%xmm1 + pshufb %xmm2,%xmm5 + movdqa (%r11,%r10,1),%xmm4 + movdqa %xmm14,%xmm2 + pshufb %xmm3,%xmm2 + movdqa %xmm0,%xmm3 + pxor %xmm5,%xmm2 + pshufb %xmm1,%xmm0 + addq $16,%r9 + pxor %xmm2,%xmm0 + pshufb %xmm4,%xmm3 + addq $16,%r11 + pxor %xmm0,%xmm3 + pshufb %xmm1,%xmm0 + andq $0x30,%r11 + subq $1,%rax + pxor %xmm3,%xmm0 + +L$enc_entry: + + movdqa %xmm9,%xmm1 + movdqa %xmm11,%xmm5 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm0 + pshufb %xmm0,%xmm5 + movdqa %xmm10,%xmm3 + pxor %xmm1,%xmm0 + pshufb %xmm1,%xmm3 + movdqa %xmm10,%xmm4 + pxor %xmm5,%xmm3 + pshufb %xmm0,%xmm4 + movdqa %xmm10,%xmm2 + pxor %xmm5,%xmm4 + pshufb %xmm3,%xmm2 + movdqa %xmm10,%xmm3 + pxor %xmm0,%xmm2 + pshufb %xmm4,%xmm3 + movdqu (%r9),%xmm5 + pxor %xmm1,%xmm3 + jnz L$enc_loop + + + movdqa -96(%r10),%xmm4 + movdqa -80(%r10),%xmm0 + pshufb %xmm2,%xmm4 + pxor %xmm5,%xmm4 + pshufb %xmm3,%xmm0 + movdqa 64(%r11,%r10,1),%xmm1 + pxor %xmm4,%xmm0 + pshufb %xmm1,%xmm0 + ret + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +.p2align 4 +_vpaes_encrypt_core_2x: + + movq %rdx,%r9 + movq $16,%r11 + movl 240(%rdx),%eax + movdqa %xmm9,%xmm1 + movdqa %xmm9,%xmm7 + movdqa L$k_ipt(%rip),%xmm2 + movdqa %xmm2,%xmm8 + pandn %xmm0,%xmm1 + pandn %xmm6,%xmm7 + movdqu 
(%r9),%xmm5 + + psrld $4,%xmm1 + psrld $4,%xmm7 + pand %xmm9,%xmm0 + pand %xmm9,%xmm6 + pshufb %xmm0,%xmm2 + pshufb %xmm6,%xmm8 + movdqa L$k_ipt+16(%rip),%xmm0 + movdqa %xmm0,%xmm6 + pshufb %xmm1,%xmm0 + pshufb %xmm7,%xmm6 + pxor %xmm5,%xmm2 + pxor %xmm5,%xmm8 + addq $16,%r9 + pxor %xmm2,%xmm0 + pxor %xmm8,%xmm6 + leaq L$k_mc_backward(%rip),%r10 + jmp L$enc2x_entry + +.p2align 4 +L$enc2x_loop: + + movdqa L$k_sb1(%rip),%xmm4 + movdqa L$k_sb1+16(%rip),%xmm0 + movdqa %xmm4,%xmm12 + movdqa %xmm0,%xmm6 + pshufb %xmm2,%xmm4 + pshufb %xmm8,%xmm12 + pshufb %xmm3,%xmm0 + pshufb %xmm11,%xmm6 + pxor %xmm5,%xmm4 + pxor %xmm5,%xmm12 + movdqa L$k_sb2(%rip),%xmm5 + movdqa %xmm5,%xmm13 + pxor %xmm4,%xmm0 + pxor %xmm12,%xmm6 + movdqa -64(%r11,%r10,1),%xmm1 + + pshufb %xmm2,%xmm5 + pshufb %xmm8,%xmm13 + movdqa (%r11,%r10,1),%xmm4 + + movdqa L$k_sb2+16(%rip),%xmm2 + movdqa %xmm2,%xmm8 + pshufb %xmm3,%xmm2 + pshufb %xmm11,%xmm8 + movdqa %xmm0,%xmm3 + movdqa %xmm6,%xmm11 + pxor %xmm5,%xmm2 + pxor %xmm13,%xmm8 + pshufb %xmm1,%xmm0 + pshufb %xmm1,%xmm6 + addq $16,%r9 + pxor %xmm2,%xmm0 + pxor %xmm8,%xmm6 + pshufb %xmm4,%xmm3 + pshufb %xmm4,%xmm11 + addq $16,%r11 + pxor %xmm0,%xmm3 + pxor %xmm6,%xmm11 + pshufb %xmm1,%xmm0 + pshufb %xmm1,%xmm6 + andq $0x30,%r11 + subq $1,%rax + pxor %xmm3,%xmm0 + pxor %xmm11,%xmm6 + +L$enc2x_entry: + + movdqa %xmm9,%xmm1 + movdqa %xmm9,%xmm7 + movdqa L$k_inv+16(%rip),%xmm5 + movdqa %xmm5,%xmm13 + pandn %xmm0,%xmm1 + pandn %xmm6,%xmm7 + psrld $4,%xmm1 + psrld $4,%xmm7 + pand %xmm9,%xmm0 + pand %xmm9,%xmm6 + pshufb %xmm0,%xmm5 + pshufb %xmm6,%xmm13 + movdqa %xmm10,%xmm3 + movdqa %xmm10,%xmm11 + pxor %xmm1,%xmm0 + pxor %xmm7,%xmm6 + pshufb %xmm1,%xmm3 + pshufb %xmm7,%xmm11 + movdqa %xmm10,%xmm4 + movdqa %xmm10,%xmm12 + pxor %xmm5,%xmm3 + pxor %xmm13,%xmm11 + pshufb %xmm0,%xmm4 + pshufb %xmm6,%xmm12 + movdqa %xmm10,%xmm2 + movdqa %xmm10,%xmm8 + pxor %xmm5,%xmm4 + pxor %xmm13,%xmm12 + pshufb %xmm3,%xmm2 + pshufb %xmm11,%xmm8 + movdqa %xmm10,%xmm3 + movdqa 
%xmm10,%xmm11 + pxor %xmm0,%xmm2 + pxor %xmm6,%xmm8 + pshufb %xmm4,%xmm3 + pshufb %xmm12,%xmm11 + movdqu (%r9),%xmm5 + + pxor %xmm1,%xmm3 + pxor %xmm7,%xmm11 + jnz L$enc2x_loop + + + movdqa -96(%r10),%xmm4 + movdqa -80(%r10),%xmm0 + movdqa %xmm4,%xmm12 + movdqa %xmm0,%xmm6 + pshufb %xmm2,%xmm4 + pshufb %xmm8,%xmm12 + pxor %xmm5,%xmm4 + pxor %xmm5,%xmm12 + pshufb %xmm3,%xmm0 + pshufb %xmm11,%xmm6 + movdqa 64(%r11,%r10,1),%xmm1 + + pxor %xmm4,%xmm0 + pxor %xmm12,%xmm6 + pshufb %xmm1,%xmm0 + pshufb %xmm1,%xmm6 + ret + + + + + + + + + +.p2align 4 +_vpaes_decrypt_core: + + movq %rdx,%r9 + movl 240(%rdx),%eax + movdqa %xmm9,%xmm1 + movdqa L$k_dipt(%rip),%xmm2 + pandn %xmm0,%xmm1 + movq %rax,%r11 + psrld $4,%xmm1 + movdqu (%r9),%xmm5 + shlq $4,%r11 + pand %xmm9,%xmm0 + pshufb %xmm0,%xmm2 + movdqa L$k_dipt+16(%rip),%xmm0 + xorq $0x30,%r11 + leaq L$k_dsbd(%rip),%r10 + pshufb %xmm1,%xmm0 + andq $0x30,%r11 + pxor %xmm5,%xmm2 + movdqa L$k_mc_forward+48(%rip),%xmm5 + pxor %xmm2,%xmm0 + addq $16,%r9 + addq %r10,%r11 + jmp L$dec_entry + +.p2align 4 +L$dec_loop: + + + + movdqa -32(%r10),%xmm4 + movdqa -16(%r10),%xmm1 + pshufb %xmm2,%xmm4 + pshufb %xmm3,%xmm1 + pxor %xmm4,%xmm0 + movdqa 0(%r10),%xmm4 + pxor %xmm1,%xmm0 + movdqa 16(%r10),%xmm1 + + pshufb %xmm2,%xmm4 + pshufb %xmm5,%xmm0 + pshufb %xmm3,%xmm1 + pxor %xmm4,%xmm0 + movdqa 32(%r10),%xmm4 + pxor %xmm1,%xmm0 + movdqa 48(%r10),%xmm1 + + pshufb %xmm2,%xmm4 + pshufb %xmm5,%xmm0 + pshufb %xmm3,%xmm1 + pxor %xmm4,%xmm0 + movdqa 64(%r10),%xmm4 + pxor %xmm1,%xmm0 + movdqa 80(%r10),%xmm1 + + pshufb %xmm2,%xmm4 + pshufb %xmm5,%xmm0 + pshufb %xmm3,%xmm1 + pxor %xmm4,%xmm0 + addq $16,%r9 + palignr $12,%xmm5,%xmm5 + pxor %xmm1,%xmm0 + subq $1,%rax + +L$dec_entry: + + movdqa %xmm9,%xmm1 + pandn %xmm0,%xmm1 + movdqa %xmm11,%xmm2 + psrld $4,%xmm1 + pand %xmm9,%xmm0 + pshufb %xmm0,%xmm2 + movdqa %xmm10,%xmm3 + pxor %xmm1,%xmm0 + pshufb %xmm1,%xmm3 + movdqa %xmm10,%xmm4 + pxor %xmm2,%xmm3 + pshufb %xmm0,%xmm4 + pxor %xmm2,%xmm4 + movdqa 
%xmm10,%xmm2 + pshufb %xmm3,%xmm2 + movdqa %xmm10,%xmm3 + pxor %xmm0,%xmm2 + pshufb %xmm4,%xmm3 + movdqu (%r9),%xmm0 + pxor %xmm1,%xmm3 + jnz L$dec_loop + + + movdqa 96(%r10),%xmm4 + pshufb %xmm2,%xmm4 + pxor %xmm0,%xmm4 + movdqa 112(%r10),%xmm0 + movdqa -352(%r11),%xmm2 + pshufb %xmm3,%xmm0 + pxor %xmm4,%xmm0 + pshufb %xmm2,%xmm0 + ret + + + + + + + + + +.p2align 4 +_vpaes_schedule_core: + + + + + + + call _vpaes_preheat + movdqa L$k_rcon(%rip),%xmm8 + movdqu (%rdi),%xmm0 + + + movdqa %xmm0,%xmm3 + leaq L$k_ipt(%rip),%r11 + call _vpaes_schedule_transform + movdqa %xmm0,%xmm7 + + leaq L$k_sr(%rip),%r10 + testq %rcx,%rcx + jnz L$schedule_am_decrypting + + + movdqu %xmm0,(%rdx) + jmp L$schedule_go + +L$schedule_am_decrypting: + + movdqa (%r8,%r10,1),%xmm1 + pshufb %xmm1,%xmm3 + movdqu %xmm3,(%rdx) + xorq $0x30,%r8 + +L$schedule_go: + cmpl $192,%esi + ja L$schedule_256 + je L$schedule_192 + + + + + + + + + + +L$schedule_128: + movl $10,%esi + +L$oop_schedule_128: + call _vpaes_schedule_round + decq %rsi + jz L$schedule_mangle_last + call _vpaes_schedule_mangle + jmp L$oop_schedule_128 + + + + + + + + + + + + + + + + +.p2align 4 +L$schedule_192: + movdqu 8(%rdi),%xmm0 + call _vpaes_schedule_transform + movdqa %xmm0,%xmm6 + pxor %xmm4,%xmm4 + movhlps %xmm4,%xmm6 + movl $4,%esi + +L$oop_schedule_192: + call _vpaes_schedule_round + palignr $8,%xmm6,%xmm0 + call _vpaes_schedule_mangle + call _vpaes_schedule_192_smear + call _vpaes_schedule_mangle + call _vpaes_schedule_round + decq %rsi + jz L$schedule_mangle_last + call _vpaes_schedule_mangle + call _vpaes_schedule_192_smear + jmp L$oop_schedule_192 + + + + + + + + + + + +.p2align 4 +L$schedule_256: + movdqu 16(%rdi),%xmm0 + call _vpaes_schedule_transform + movl $7,%esi + +L$oop_schedule_256: + call _vpaes_schedule_mangle + movdqa %xmm0,%xmm6 + + + call _vpaes_schedule_round + decq %rsi + jz L$schedule_mangle_last + call _vpaes_schedule_mangle + + + pshufd $0xFF,%xmm0,%xmm0 + movdqa %xmm7,%xmm5 + movdqa %xmm6,%xmm7 + call 
_vpaes_schedule_low_round + movdqa %xmm5,%xmm7 + + jmp L$oop_schedule_256 + + + + + + + + + + + + +.p2align 4 +L$schedule_mangle_last: + + leaq L$k_deskew(%rip),%r11 + testq %rcx,%rcx + jnz L$schedule_mangle_last_dec + + + movdqa (%r8,%r10,1),%xmm1 + pshufb %xmm1,%xmm0 + leaq L$k_opt(%rip),%r11 + addq $32,%rdx + +L$schedule_mangle_last_dec: + addq $-16,%rdx + pxor L$k_s63(%rip),%xmm0 + call _vpaes_schedule_transform + movdqu %xmm0,(%rdx) + + + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + ret + + + + + + + + + + + + + + + + + + +.p2align 4 +_vpaes_schedule_192_smear: + + pshufd $0x80,%xmm6,%xmm1 + pshufd $0xFE,%xmm7,%xmm0 + pxor %xmm1,%xmm6 + pxor %xmm1,%xmm1 + pxor %xmm0,%xmm6 + movdqa %xmm6,%xmm0 + movhlps %xmm1,%xmm6 + ret + + + + + + + + + + + + + + + + + + + + + + +.p2align 4 +_vpaes_schedule_round: + + + pxor %xmm1,%xmm1 + palignr $15,%xmm8,%xmm1 + palignr $15,%xmm8,%xmm8 + pxor %xmm1,%xmm7 + + + pshufd $0xFF,%xmm0,%xmm0 + palignr $1,%xmm0,%xmm0 + + + + +_vpaes_schedule_low_round: + + movdqa %xmm7,%xmm1 + pslldq $4,%xmm7 + pxor %xmm1,%xmm7 + movdqa %xmm7,%xmm1 + pslldq $8,%xmm7 + pxor %xmm1,%xmm7 + pxor L$k_s63(%rip),%xmm7 + + + movdqa %xmm9,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm0 + movdqa %xmm11,%xmm2 + pshufb %xmm0,%xmm2 + pxor %xmm1,%xmm0 + movdqa %xmm10,%xmm3 + pshufb %xmm1,%xmm3 + pxor %xmm2,%xmm3 + movdqa %xmm10,%xmm4 + pshufb %xmm0,%xmm4 + pxor %xmm2,%xmm4 + movdqa %xmm10,%xmm2 + pshufb %xmm3,%xmm2 + pxor %xmm0,%xmm2 + movdqa %xmm10,%xmm3 + pshufb %xmm4,%xmm3 + pxor %xmm1,%xmm3 + movdqa %xmm13,%xmm4 + pshufb %xmm2,%xmm4 + movdqa %xmm12,%xmm0 + pshufb %xmm3,%xmm0 + pxor %xmm4,%xmm0 + + + pxor %xmm7,%xmm0 + movdqa %xmm0,%xmm7 + ret + + + + + + + + + + + + + +.p2align 4 +_vpaes_schedule_transform: + + movdqa %xmm9,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm0 + movdqa (%r11),%xmm2 + pshufb %xmm0,%xmm2 + movdqa 
16(%r11),%xmm0 + pshufb %xmm1,%xmm0 + pxor %xmm2,%xmm0 + ret + + + + + + + + + + + + + + + + + + + + + + + + + + + +.p2align 4 +_vpaes_schedule_mangle: + + movdqa %xmm0,%xmm4 + movdqa L$k_mc_forward(%rip),%xmm5 + testq %rcx,%rcx + jnz L$schedule_mangle_dec + + + addq $16,%rdx + pxor L$k_s63(%rip),%xmm4 + pshufb %xmm5,%xmm4 + movdqa %xmm4,%xmm3 + pshufb %xmm5,%xmm4 + pxor %xmm4,%xmm3 + pshufb %xmm5,%xmm4 + pxor %xmm4,%xmm3 + + jmp L$schedule_mangle_both +.p2align 4 +L$schedule_mangle_dec: + + leaq L$k_dksd(%rip),%r11 + movdqa %xmm9,%xmm1 + pandn %xmm4,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm4 + + movdqa 0(%r11),%xmm2 + pshufb %xmm4,%xmm2 + movdqa 16(%r11),%xmm3 + pshufb %xmm1,%xmm3 + pxor %xmm2,%xmm3 + pshufb %xmm5,%xmm3 + + movdqa 32(%r11),%xmm2 + pshufb %xmm4,%xmm2 + pxor %xmm3,%xmm2 + movdqa 48(%r11),%xmm3 + pshufb %xmm1,%xmm3 + pxor %xmm2,%xmm3 + pshufb %xmm5,%xmm3 + + movdqa 64(%r11),%xmm2 + pshufb %xmm4,%xmm2 + pxor %xmm3,%xmm2 + movdqa 80(%r11),%xmm3 + pshufb %xmm1,%xmm3 + pxor %xmm2,%xmm3 + pshufb %xmm5,%xmm3 + + movdqa 96(%r11),%xmm2 + pshufb %xmm4,%xmm2 + pxor %xmm3,%xmm2 + movdqa 112(%r11),%xmm3 + pshufb %xmm1,%xmm3 + pxor %xmm2,%xmm3 + + addq $-16,%rdx + +L$schedule_mangle_both: + movdqa (%r8,%r10,1),%xmm1 + pshufb %xmm1,%xmm3 + addq $-16,%r8 + andq $0x30,%r8 + movdqu %xmm3,(%rdx) + ret + + + + + + +.globl _vpaes_set_encrypt_key +.private_extern _vpaes_set_encrypt_key + +.p2align 4 +_vpaes_set_encrypt_key: + +_CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST + + movb $1,_BORINGSSL_function_hit+5(%rip) +#endif + + movl %esi,%eax + shrl $5,%eax + addl $5,%eax + movl %eax,240(%rdx) + + movl $0,%ecx + movl $0x30,%r8d + call _vpaes_schedule_core + xorl %eax,%eax + ret + + + +.globl _vpaes_set_decrypt_key +.private_extern _vpaes_set_decrypt_key + +.p2align 4 +_vpaes_set_decrypt_key: + +_CET_ENDBR + movl %esi,%eax + shrl $5,%eax + addl $5,%eax + movl %eax,240(%rdx) + shll $4,%eax + leaq 16(%rdx,%rax,1),%rdx + + movl $1,%ecx + movl %esi,%r8d + shrl $1,%r8d + andl 
$32,%r8d + xorl $32,%r8d + call _vpaes_schedule_core + xorl %eax,%eax + ret + + + +.globl _vpaes_encrypt +.private_extern _vpaes_encrypt + +.p2align 4 +_vpaes_encrypt: + +_CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST + + movb $1,_BORINGSSL_function_hit+4(%rip) +#endif + movdqu (%rdi),%xmm0 + call _vpaes_preheat + call _vpaes_encrypt_core + movdqu %xmm0,(%rsi) + ret + + + +.globl _vpaes_decrypt +.private_extern _vpaes_decrypt + +.p2align 4 +_vpaes_decrypt: + +_CET_ENDBR + movdqu (%rdi),%xmm0 + call _vpaes_preheat + call _vpaes_decrypt_core + movdqu %xmm0,(%rsi) + ret + + +.globl _vpaes_cbc_encrypt +.private_extern _vpaes_cbc_encrypt + +.p2align 4 +_vpaes_cbc_encrypt: + +_CET_ENDBR + xchgq %rcx,%rdx + subq $16,%rcx + jc L$cbc_abort + movdqu (%r8),%xmm6 + subq %rdi,%rsi + call _vpaes_preheat + cmpl $0,%r9d + je L$cbc_dec_loop + jmp L$cbc_enc_loop +.p2align 4 +L$cbc_enc_loop: + movdqu (%rdi),%xmm0 + pxor %xmm6,%xmm0 + call _vpaes_encrypt_core + movdqa %xmm0,%xmm6 + movdqu %xmm0,(%rsi,%rdi,1) + leaq 16(%rdi),%rdi + subq $16,%rcx + jnc L$cbc_enc_loop + jmp L$cbc_done +.p2align 4 +L$cbc_dec_loop: + movdqu (%rdi),%xmm0 + movdqa %xmm0,%xmm7 + call _vpaes_decrypt_core + pxor %xmm6,%xmm0 + movdqa %xmm7,%xmm6 + movdqu %xmm0,(%rsi,%rdi,1) + leaq 16(%rdi),%rdi + subq $16,%rcx + jnc L$cbc_dec_loop +L$cbc_done: + movdqu %xmm6,(%r8) +L$cbc_abort: + ret + + +.globl _vpaes_ctr32_encrypt_blocks +.private_extern _vpaes_ctr32_encrypt_blocks + +.p2align 4 +_vpaes_ctr32_encrypt_blocks: + +_CET_ENDBR + + xchgq %rcx,%rdx + testq %rcx,%rcx + jz L$ctr32_abort + movdqu (%r8),%xmm0 + movdqa L$ctr_add_one(%rip),%xmm8 + subq %rdi,%rsi + call _vpaes_preheat + movdqa %xmm0,%xmm6 + pshufb L$rev_ctr(%rip),%xmm6 + + testq $1,%rcx + jz L$ctr32_prep_loop + + + + movdqu (%rdi),%xmm7 + call _vpaes_encrypt_core + pxor %xmm7,%xmm0 + paddd %xmm8,%xmm6 + movdqu %xmm0,(%rsi,%rdi,1) + subq $1,%rcx + leaq 16(%rdi),%rdi + jz L$ctr32_done + +L$ctr32_prep_loop: + + + movdqa %xmm6,%xmm14 + movdqa %xmm6,%xmm15 + paddd 
%xmm8,%xmm15 + +L$ctr32_loop: + movdqa L$rev_ctr(%rip),%xmm1 + movdqa %xmm14,%xmm0 + movdqa %xmm15,%xmm6 + pshufb %xmm1,%xmm0 + pshufb %xmm1,%xmm6 + call _vpaes_encrypt_core_2x + movdqu (%rdi),%xmm1 + movdqu 16(%rdi),%xmm2 + movdqa L$ctr_add_two(%rip),%xmm3 + pxor %xmm1,%xmm0 + pxor %xmm2,%xmm6 + paddd %xmm3,%xmm14 + paddd %xmm3,%xmm15 + movdqu %xmm0,(%rsi,%rdi,1) + movdqu %xmm6,16(%rsi,%rdi,1) + subq $2,%rcx + leaq 32(%rdi),%rdi + jnz L$ctr32_loop + +L$ctr32_done: +L$ctr32_abort: + ret + + + + + + + + + +.p2align 4 +_vpaes_preheat: + + leaq L$k_s0F(%rip),%r10 + movdqa -32(%r10),%xmm10 + movdqa -16(%r10),%xmm11 + movdqa 0(%r10),%xmm9 + movdqa 48(%r10),%xmm13 + movdqa 64(%r10),%xmm12 + movdqa 80(%r10),%xmm15 + movdqa 96(%r10),%xmm14 + ret + + + + + + + + +.section __DATA,__const +.p2align 6 +_vpaes_consts: +L$k_inv: +.quad 0x0E05060F0D080180, 0x040703090A0B0C02 +.quad 0x01040A060F0B0780, 0x030D0E0C02050809 + +L$k_s0F: +.quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F + +L$k_ipt: +.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 +.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 + +L$k_sb1: +.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF +L$k_sb2: +.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD +.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A +L$k_sbo: +.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 +.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA + +L$k_mc_forward: +.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 +.quad 0x080B0A0904070605, 0x000302010C0F0E0D +.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 +.quad 0x000302010C0F0E0D, 0x080B0A0904070605 + +L$k_mc_backward: +.quad 0x0605040702010003, 0x0E0D0C0F0A09080B +.quad 0x020100030E0D0C0F, 0x0A09080B06050407 +.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 +.quad 0x0A09080B06050407, 0x020100030E0D0C0F + +L$k_sr: +.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 +.quad 0x030E09040F0A0500, 0x0B06010C07020D08 +.quad 0x0F060D040B020900, 0x070E050C030A0108 +.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + 
+L$k_rcon: +.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +L$k_s63: +.quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B + +L$k_opt: +.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 +.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 + +L$k_deskew: +.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A +.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 + + + + + +L$k_dksd: +.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 +.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E +L$k_dksb: +.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 +.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 +L$k_dkse: +.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 +.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 +L$k_dks9: +.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC +.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE + + + + + +L$k_dipt: +.quad 0x0F505B040B545F00, 0x154A411E114E451A +.quad 0x86E383E660056500, 0x12771772F491F194 + +L$k_dsb9: +.quad 0x851C03539A86D600, 0xCAD51F504F994CC9 +.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 +L$k_dsbd: +.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 +.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 +L$k_dsbb: +.quad 0xD022649296B44200, 0x602646F6B0F2D404 +.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B +L$k_dsbe: +.quad 0x46F2929626D4D000, 0x2242600464B4F6B0 +.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 +L$k_dsbo: +.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D +.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C + + +L$rev_ctr: +.quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 + + +L$ctr_add_one: +.quad 0x0000000000000000, 0x0000000100000000 +L$ctr_add_two: +.quad 0x0000000000000000, 0x0000000200000000 + +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 +.p2align 6 + +.text +#endif diff --git a/third_party/boringssl/gen/bcm/vpaes-x86_64-linux.S b/third_party/boringssl/gen/bcm/vpaes-x86_64-linux.S new file 
mode 100644 index 00000000..e7884646 --- /dev/null +++ b/third_party/boringssl/gen/bcm/vpaes-x86_64-linux.S @@ -0,0 +1,1133 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.text + + + + + + + + + + + + + + + + +.type _vpaes_encrypt_core,@function +.align 16 +_vpaes_encrypt_core: +.cfi_startproc + movq %rdx,%r9 + movq $16,%r11 + movl 240(%rdx),%eax + movdqa %xmm9,%xmm1 + movdqa .Lk_ipt(%rip),%xmm2 + pandn %xmm0,%xmm1 + movdqu (%r9),%xmm5 + psrld $4,%xmm1 + pand %xmm9,%xmm0 + pshufb %xmm0,%xmm2 + movdqa .Lk_ipt+16(%rip),%xmm0 + pshufb %xmm1,%xmm0 + pxor %xmm5,%xmm2 + addq $16,%r9 + pxor %xmm2,%xmm0 + leaq .Lk_mc_backward(%rip),%r10 + jmp .Lenc_entry + +.align 16 +.Lenc_loop: + + movdqa %xmm13,%xmm4 + movdqa %xmm12,%xmm0 + pshufb %xmm2,%xmm4 + pshufb %xmm3,%xmm0 + pxor %xmm5,%xmm4 + movdqa %xmm15,%xmm5 + pxor %xmm4,%xmm0 + movdqa -64(%r11,%r10,1),%xmm1 + pshufb %xmm2,%xmm5 + movdqa (%r11,%r10,1),%xmm4 + movdqa %xmm14,%xmm2 + pshufb %xmm3,%xmm2 + movdqa %xmm0,%xmm3 + pxor %xmm5,%xmm2 + pshufb %xmm1,%xmm0 + addq $16,%r9 + pxor %xmm2,%xmm0 + pshufb %xmm4,%xmm3 + addq $16,%r11 + pxor %xmm0,%xmm3 + pshufb %xmm1,%xmm0 + andq $0x30,%r11 + subq $1,%rax + pxor %xmm3,%xmm0 + +.Lenc_entry: + + movdqa %xmm9,%xmm1 + movdqa %xmm11,%xmm5 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm0 + pshufb %xmm0,%xmm5 + movdqa %xmm10,%xmm3 + pxor %xmm1,%xmm0 + pshufb %xmm1,%xmm3 + movdqa %xmm10,%xmm4 + pxor %xmm5,%xmm3 + pshufb %xmm0,%xmm4 + movdqa %xmm10,%xmm2 + pxor %xmm5,%xmm4 + pshufb %xmm3,%xmm2 + movdqa %xmm10,%xmm3 + pxor %xmm0,%xmm2 + pshufb %xmm4,%xmm3 + movdqu (%r9),%xmm5 + pxor %xmm1,%xmm3 + jnz .Lenc_loop + + + movdqa -96(%r10),%xmm4 + movdqa -80(%r10),%xmm0 + pshufb %xmm2,%xmm4 + pxor %xmm5,%xmm4 + pshufb %xmm3,%xmm0 + movdqa 64(%r11,%r10,1),%xmm1 + pxor %xmm4,%xmm0 + pshufb %xmm1,%xmm0 + ret +.cfi_endproc +.size 
_vpaes_encrypt_core,.-_vpaes_encrypt_core + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +.type _vpaes_encrypt_core_2x,@function +.align 16 +_vpaes_encrypt_core_2x: +.cfi_startproc + movq %rdx,%r9 + movq $16,%r11 + movl 240(%rdx),%eax + movdqa %xmm9,%xmm1 + movdqa %xmm9,%xmm7 + movdqa .Lk_ipt(%rip),%xmm2 + movdqa %xmm2,%xmm8 + pandn %xmm0,%xmm1 + pandn %xmm6,%xmm7 + movdqu (%r9),%xmm5 + + psrld $4,%xmm1 + psrld $4,%xmm7 + pand %xmm9,%xmm0 + pand %xmm9,%xmm6 + pshufb %xmm0,%xmm2 + pshufb %xmm6,%xmm8 + movdqa .Lk_ipt+16(%rip),%xmm0 + movdqa %xmm0,%xmm6 + pshufb %xmm1,%xmm0 + pshufb %xmm7,%xmm6 + pxor %xmm5,%xmm2 + pxor %xmm5,%xmm8 + addq $16,%r9 + pxor %xmm2,%xmm0 + pxor %xmm8,%xmm6 + leaq .Lk_mc_backward(%rip),%r10 + jmp .Lenc2x_entry + +.align 16 +.Lenc2x_loop: + + movdqa .Lk_sb1(%rip),%xmm4 + movdqa .Lk_sb1+16(%rip),%xmm0 + movdqa %xmm4,%xmm12 + movdqa %xmm0,%xmm6 + pshufb %xmm2,%xmm4 + pshufb %xmm8,%xmm12 + pshufb %xmm3,%xmm0 + pshufb %xmm11,%xmm6 + pxor %xmm5,%xmm4 + pxor %xmm5,%xmm12 + movdqa .Lk_sb2(%rip),%xmm5 + movdqa %xmm5,%xmm13 + pxor %xmm4,%xmm0 + pxor %xmm12,%xmm6 + movdqa -64(%r11,%r10,1),%xmm1 + + pshufb %xmm2,%xmm5 + pshufb %xmm8,%xmm13 + movdqa (%r11,%r10,1),%xmm4 + + movdqa .Lk_sb2+16(%rip),%xmm2 + movdqa %xmm2,%xmm8 + pshufb %xmm3,%xmm2 + pshufb %xmm11,%xmm8 + movdqa %xmm0,%xmm3 + movdqa %xmm6,%xmm11 + pxor %xmm5,%xmm2 + pxor %xmm13,%xmm8 + pshufb %xmm1,%xmm0 + pshufb %xmm1,%xmm6 + addq $16,%r9 + pxor %xmm2,%xmm0 + pxor %xmm8,%xmm6 + pshufb %xmm4,%xmm3 + pshufb %xmm4,%xmm11 + addq $16,%r11 + pxor %xmm0,%xmm3 + pxor %xmm6,%xmm11 + pshufb %xmm1,%xmm0 + pshufb %xmm1,%xmm6 + andq $0x30,%r11 + subq $1,%rax + pxor %xmm3,%xmm0 + pxor %xmm11,%xmm6 + +.Lenc2x_entry: + + movdqa %xmm9,%xmm1 + movdqa %xmm9,%xmm7 + movdqa .Lk_inv+16(%rip),%xmm5 + movdqa %xmm5,%xmm13 + pandn %xmm0,%xmm1 + pandn %xmm6,%xmm7 + psrld $4,%xmm1 + psrld $4,%xmm7 + pand %xmm9,%xmm0 + pand %xmm9,%xmm6 + pshufb %xmm0,%xmm5 + pshufb %xmm6,%xmm13 + movdqa %xmm10,%xmm3 + 
movdqa %xmm10,%xmm11 + pxor %xmm1,%xmm0 + pxor %xmm7,%xmm6 + pshufb %xmm1,%xmm3 + pshufb %xmm7,%xmm11 + movdqa %xmm10,%xmm4 + movdqa %xmm10,%xmm12 + pxor %xmm5,%xmm3 + pxor %xmm13,%xmm11 + pshufb %xmm0,%xmm4 + pshufb %xmm6,%xmm12 + movdqa %xmm10,%xmm2 + movdqa %xmm10,%xmm8 + pxor %xmm5,%xmm4 + pxor %xmm13,%xmm12 + pshufb %xmm3,%xmm2 + pshufb %xmm11,%xmm8 + movdqa %xmm10,%xmm3 + movdqa %xmm10,%xmm11 + pxor %xmm0,%xmm2 + pxor %xmm6,%xmm8 + pshufb %xmm4,%xmm3 + pshufb %xmm12,%xmm11 + movdqu (%r9),%xmm5 + + pxor %xmm1,%xmm3 + pxor %xmm7,%xmm11 + jnz .Lenc2x_loop + + + movdqa -96(%r10),%xmm4 + movdqa -80(%r10),%xmm0 + movdqa %xmm4,%xmm12 + movdqa %xmm0,%xmm6 + pshufb %xmm2,%xmm4 + pshufb %xmm8,%xmm12 + pxor %xmm5,%xmm4 + pxor %xmm5,%xmm12 + pshufb %xmm3,%xmm0 + pshufb %xmm11,%xmm6 + movdqa 64(%r11,%r10,1),%xmm1 + + pxor %xmm4,%xmm0 + pxor %xmm12,%xmm6 + pshufb %xmm1,%xmm0 + pshufb %xmm1,%xmm6 + ret +.cfi_endproc +.size _vpaes_encrypt_core_2x,.-_vpaes_encrypt_core_2x + + + + + + +.type _vpaes_decrypt_core,@function +.align 16 +_vpaes_decrypt_core: +.cfi_startproc + movq %rdx,%r9 + movl 240(%rdx),%eax + movdqa %xmm9,%xmm1 + movdqa .Lk_dipt(%rip),%xmm2 + pandn %xmm0,%xmm1 + movq %rax,%r11 + psrld $4,%xmm1 + movdqu (%r9),%xmm5 + shlq $4,%r11 + pand %xmm9,%xmm0 + pshufb %xmm0,%xmm2 + movdqa .Lk_dipt+16(%rip),%xmm0 + xorq $0x30,%r11 + leaq .Lk_dsbd(%rip),%r10 + pshufb %xmm1,%xmm0 + andq $0x30,%r11 + pxor %xmm5,%xmm2 + movdqa .Lk_mc_forward+48(%rip),%xmm5 + pxor %xmm2,%xmm0 + addq $16,%r9 + addq %r10,%r11 + jmp .Ldec_entry + +.align 16 +.Ldec_loop: + + + + movdqa -32(%r10),%xmm4 + movdqa -16(%r10),%xmm1 + pshufb %xmm2,%xmm4 + pshufb %xmm3,%xmm1 + pxor %xmm4,%xmm0 + movdqa 0(%r10),%xmm4 + pxor %xmm1,%xmm0 + movdqa 16(%r10),%xmm1 + + pshufb %xmm2,%xmm4 + pshufb %xmm5,%xmm0 + pshufb %xmm3,%xmm1 + pxor %xmm4,%xmm0 + movdqa 32(%r10),%xmm4 + pxor %xmm1,%xmm0 + movdqa 48(%r10),%xmm1 + + pshufb %xmm2,%xmm4 + pshufb %xmm5,%xmm0 + pshufb %xmm3,%xmm1 + pxor %xmm4,%xmm0 + movdqa 
64(%r10),%xmm4 + pxor %xmm1,%xmm0 + movdqa 80(%r10),%xmm1 + + pshufb %xmm2,%xmm4 + pshufb %xmm5,%xmm0 + pshufb %xmm3,%xmm1 + pxor %xmm4,%xmm0 + addq $16,%r9 + palignr $12,%xmm5,%xmm5 + pxor %xmm1,%xmm0 + subq $1,%rax + +.Ldec_entry: + + movdqa %xmm9,%xmm1 + pandn %xmm0,%xmm1 + movdqa %xmm11,%xmm2 + psrld $4,%xmm1 + pand %xmm9,%xmm0 + pshufb %xmm0,%xmm2 + movdqa %xmm10,%xmm3 + pxor %xmm1,%xmm0 + pshufb %xmm1,%xmm3 + movdqa %xmm10,%xmm4 + pxor %xmm2,%xmm3 + pshufb %xmm0,%xmm4 + pxor %xmm2,%xmm4 + movdqa %xmm10,%xmm2 + pshufb %xmm3,%xmm2 + movdqa %xmm10,%xmm3 + pxor %xmm0,%xmm2 + pshufb %xmm4,%xmm3 + movdqu (%r9),%xmm0 + pxor %xmm1,%xmm3 + jnz .Ldec_loop + + + movdqa 96(%r10),%xmm4 + pshufb %xmm2,%xmm4 + pxor %xmm0,%xmm4 + movdqa 112(%r10),%xmm0 + movdqa -352(%r11),%xmm2 + pshufb %xmm3,%xmm0 + pxor %xmm4,%xmm0 + pshufb %xmm2,%xmm0 + ret +.cfi_endproc +.size _vpaes_decrypt_core,.-_vpaes_decrypt_core + + + + + + +.type _vpaes_schedule_core,@function +.align 16 +_vpaes_schedule_core: +.cfi_startproc + + + + + + call _vpaes_preheat + movdqa .Lk_rcon(%rip),%xmm8 + movdqu (%rdi),%xmm0 + + + movdqa %xmm0,%xmm3 + leaq .Lk_ipt(%rip),%r11 + call _vpaes_schedule_transform + movdqa %xmm0,%xmm7 + + leaq .Lk_sr(%rip),%r10 + testq %rcx,%rcx + jnz .Lschedule_am_decrypting + + + movdqu %xmm0,(%rdx) + jmp .Lschedule_go + +.Lschedule_am_decrypting: + + movdqa (%r8,%r10,1),%xmm1 + pshufb %xmm1,%xmm3 + movdqu %xmm3,(%rdx) + xorq $0x30,%r8 + +.Lschedule_go: + cmpl $192,%esi + ja .Lschedule_256 + je .Lschedule_192 + + + + + + + + + + +.Lschedule_128: + movl $10,%esi + +.Loop_schedule_128: + call _vpaes_schedule_round + decq %rsi + jz .Lschedule_mangle_last + call _vpaes_schedule_mangle + jmp .Loop_schedule_128 + + + + + + + + + + + + + + + + +.align 16 +.Lschedule_192: + movdqu 8(%rdi),%xmm0 + call _vpaes_schedule_transform + movdqa %xmm0,%xmm6 + pxor %xmm4,%xmm4 + movhlps %xmm4,%xmm6 + movl $4,%esi + +.Loop_schedule_192: + call _vpaes_schedule_round + palignr $8,%xmm6,%xmm0 + call 
_vpaes_schedule_mangle + call _vpaes_schedule_192_smear + call _vpaes_schedule_mangle + call _vpaes_schedule_round + decq %rsi + jz .Lschedule_mangle_last + call _vpaes_schedule_mangle + call _vpaes_schedule_192_smear + jmp .Loop_schedule_192 + + + + + + + + + + + +.align 16 +.Lschedule_256: + movdqu 16(%rdi),%xmm0 + call _vpaes_schedule_transform + movl $7,%esi + +.Loop_schedule_256: + call _vpaes_schedule_mangle + movdqa %xmm0,%xmm6 + + + call _vpaes_schedule_round + decq %rsi + jz .Lschedule_mangle_last + call _vpaes_schedule_mangle + + + pshufd $0xFF,%xmm0,%xmm0 + movdqa %xmm7,%xmm5 + movdqa %xmm6,%xmm7 + call _vpaes_schedule_low_round + movdqa %xmm5,%xmm7 + + jmp .Loop_schedule_256 + + + + + + + + + + + + +.align 16 +.Lschedule_mangle_last: + + leaq .Lk_deskew(%rip),%r11 + testq %rcx,%rcx + jnz .Lschedule_mangle_last_dec + + + movdqa (%r8,%r10,1),%xmm1 + pshufb %xmm1,%xmm0 + leaq .Lk_opt(%rip),%r11 + addq $32,%rdx + +.Lschedule_mangle_last_dec: + addq $-16,%rdx + pxor .Lk_s63(%rip),%xmm0 + call _vpaes_schedule_transform + movdqu %xmm0,(%rdx) + + + pxor %xmm0,%xmm0 + pxor %xmm1,%xmm1 + pxor %xmm2,%xmm2 + pxor %xmm3,%xmm3 + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + pxor %xmm6,%xmm6 + pxor %xmm7,%xmm7 + ret +.cfi_endproc +.size _vpaes_schedule_core,.-_vpaes_schedule_core + + + + + + + + + + + + + + + +.type _vpaes_schedule_192_smear,@function +.align 16 +_vpaes_schedule_192_smear: +.cfi_startproc + pshufd $0x80,%xmm6,%xmm1 + pshufd $0xFE,%xmm7,%xmm0 + pxor %xmm1,%xmm6 + pxor %xmm1,%xmm1 + pxor %xmm0,%xmm6 + movdqa %xmm6,%xmm0 + movhlps %xmm1,%xmm6 + ret +.cfi_endproc +.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear + + + + + + + + + + + + + + + + + + + +.type _vpaes_schedule_round,@function +.align 16 +_vpaes_schedule_round: +.cfi_startproc + + pxor %xmm1,%xmm1 + palignr $15,%xmm8,%xmm1 + palignr $15,%xmm8,%xmm8 + pxor %xmm1,%xmm7 + + + pshufd $0xFF,%xmm0,%xmm0 + palignr $1,%xmm0,%xmm0 + + + + +_vpaes_schedule_low_round: + + movdqa %xmm7,%xmm1 + pslldq 
$4,%xmm7 + pxor %xmm1,%xmm7 + movdqa %xmm7,%xmm1 + pslldq $8,%xmm7 + pxor %xmm1,%xmm7 + pxor .Lk_s63(%rip),%xmm7 + + + movdqa %xmm9,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm0 + movdqa %xmm11,%xmm2 + pshufb %xmm0,%xmm2 + pxor %xmm1,%xmm0 + movdqa %xmm10,%xmm3 + pshufb %xmm1,%xmm3 + pxor %xmm2,%xmm3 + movdqa %xmm10,%xmm4 + pshufb %xmm0,%xmm4 + pxor %xmm2,%xmm4 + movdqa %xmm10,%xmm2 + pshufb %xmm3,%xmm2 + pxor %xmm0,%xmm2 + movdqa %xmm10,%xmm3 + pshufb %xmm4,%xmm3 + pxor %xmm1,%xmm3 + movdqa %xmm13,%xmm4 + pshufb %xmm2,%xmm4 + movdqa %xmm12,%xmm0 + pshufb %xmm3,%xmm0 + pxor %xmm4,%xmm0 + + + pxor %xmm7,%xmm0 + movdqa %xmm0,%xmm7 + ret +.cfi_endproc +.size _vpaes_schedule_round,.-_vpaes_schedule_round + + + + + + + + + + +.type _vpaes_schedule_transform,@function +.align 16 +_vpaes_schedule_transform: +.cfi_startproc + movdqa %xmm9,%xmm1 + pandn %xmm0,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm0 + movdqa (%r11),%xmm2 + pshufb %xmm0,%xmm2 + movdqa 16(%r11),%xmm0 + pshufb %xmm1,%xmm0 + pxor %xmm2,%xmm0 + ret +.cfi_endproc +.size _vpaes_schedule_transform,.-_vpaes_schedule_transform + + + + + + + + + + + + + + + + + + + + + + + + +.type _vpaes_schedule_mangle,@function +.align 16 +_vpaes_schedule_mangle: +.cfi_startproc + movdqa %xmm0,%xmm4 + movdqa .Lk_mc_forward(%rip),%xmm5 + testq %rcx,%rcx + jnz .Lschedule_mangle_dec + + + addq $16,%rdx + pxor .Lk_s63(%rip),%xmm4 + pshufb %xmm5,%xmm4 + movdqa %xmm4,%xmm3 + pshufb %xmm5,%xmm4 + pxor %xmm4,%xmm3 + pshufb %xmm5,%xmm4 + pxor %xmm4,%xmm3 + + jmp .Lschedule_mangle_both +.align 16 +.Lschedule_mangle_dec: + + leaq .Lk_dksd(%rip),%r11 + movdqa %xmm9,%xmm1 + pandn %xmm4,%xmm1 + psrld $4,%xmm1 + pand %xmm9,%xmm4 + + movdqa 0(%r11),%xmm2 + pshufb %xmm4,%xmm2 + movdqa 16(%r11),%xmm3 + pshufb %xmm1,%xmm3 + pxor %xmm2,%xmm3 + pshufb %xmm5,%xmm3 + + movdqa 32(%r11),%xmm2 + pshufb %xmm4,%xmm2 + pxor %xmm3,%xmm2 + movdqa 48(%r11),%xmm3 + pshufb %xmm1,%xmm3 + pxor %xmm2,%xmm3 + pshufb %xmm5,%xmm3 + + movdqa 
64(%r11),%xmm2 + pshufb %xmm4,%xmm2 + pxor %xmm3,%xmm2 + movdqa 80(%r11),%xmm3 + pshufb %xmm1,%xmm3 + pxor %xmm2,%xmm3 + pshufb %xmm5,%xmm3 + + movdqa 96(%r11),%xmm2 + pshufb %xmm4,%xmm2 + pxor %xmm3,%xmm2 + movdqa 112(%r11),%xmm3 + pshufb %xmm1,%xmm3 + pxor %xmm2,%xmm3 + + addq $-16,%rdx + +.Lschedule_mangle_both: + movdqa (%r8,%r10,1),%xmm1 + pshufb %xmm1,%xmm3 + addq $-16,%r8 + andq $0x30,%r8 + movdqu %xmm3,(%rdx) + ret +.cfi_endproc +.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle + + + + +.globl vpaes_set_encrypt_key +.hidden vpaes_set_encrypt_key +.type vpaes_set_encrypt_key,@function +.align 16 +vpaes_set_encrypt_key: +.cfi_startproc +_CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST +.extern BORINGSSL_function_hit +.hidden BORINGSSL_function_hit + movb $1,BORINGSSL_function_hit+5(%rip) +#endif + + movl %esi,%eax + shrl $5,%eax + addl $5,%eax + movl %eax,240(%rdx) + + movl $0,%ecx + movl $0x30,%r8d + call _vpaes_schedule_core + xorl %eax,%eax + ret +.cfi_endproc +.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key + +.globl vpaes_set_decrypt_key +.hidden vpaes_set_decrypt_key +.type vpaes_set_decrypt_key,@function +.align 16 +vpaes_set_decrypt_key: +.cfi_startproc +_CET_ENDBR + movl %esi,%eax + shrl $5,%eax + addl $5,%eax + movl %eax,240(%rdx) + shll $4,%eax + leaq 16(%rdx,%rax,1),%rdx + + movl $1,%ecx + movl %esi,%r8d + shrl $1,%r8d + andl $32,%r8d + xorl $32,%r8d + call _vpaes_schedule_core + xorl %eax,%eax + ret +.cfi_endproc +.size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key + +.globl vpaes_encrypt +.hidden vpaes_encrypt +.type vpaes_encrypt,@function +.align 16 +vpaes_encrypt: +.cfi_startproc +_CET_ENDBR +#ifdef BORINGSSL_DISPATCH_TEST +.extern BORINGSSL_function_hit +.hidden BORINGSSL_function_hit + movb $1,BORINGSSL_function_hit+4(%rip) +#endif + movdqu (%rdi),%xmm0 + call _vpaes_preheat + call _vpaes_encrypt_core + movdqu %xmm0,(%rsi) + ret +.cfi_endproc +.size vpaes_encrypt,.-vpaes_encrypt + +.globl vpaes_decrypt +.hidden vpaes_decrypt +.type 
vpaes_decrypt,@function +.align 16 +vpaes_decrypt: +.cfi_startproc +_CET_ENDBR + movdqu (%rdi),%xmm0 + call _vpaes_preheat + call _vpaes_decrypt_core + movdqu %xmm0,(%rsi) + ret +.cfi_endproc +.size vpaes_decrypt,.-vpaes_decrypt +.globl vpaes_cbc_encrypt +.hidden vpaes_cbc_encrypt +.type vpaes_cbc_encrypt,@function +.align 16 +vpaes_cbc_encrypt: +.cfi_startproc +_CET_ENDBR + xchgq %rcx,%rdx + subq $16,%rcx + jc .Lcbc_abort + movdqu (%r8),%xmm6 + subq %rdi,%rsi + call _vpaes_preheat + cmpl $0,%r9d + je .Lcbc_dec_loop + jmp .Lcbc_enc_loop +.align 16 +.Lcbc_enc_loop: + movdqu (%rdi),%xmm0 + pxor %xmm6,%xmm0 + call _vpaes_encrypt_core + movdqa %xmm0,%xmm6 + movdqu %xmm0,(%rsi,%rdi,1) + leaq 16(%rdi),%rdi + subq $16,%rcx + jnc .Lcbc_enc_loop + jmp .Lcbc_done +.align 16 +.Lcbc_dec_loop: + movdqu (%rdi),%xmm0 + movdqa %xmm0,%xmm7 + call _vpaes_decrypt_core + pxor %xmm6,%xmm0 + movdqa %xmm7,%xmm6 + movdqu %xmm0,(%rsi,%rdi,1) + leaq 16(%rdi),%rdi + subq $16,%rcx + jnc .Lcbc_dec_loop +.Lcbc_done: + movdqu %xmm6,(%r8) +.Lcbc_abort: + ret +.cfi_endproc +.size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt +.globl vpaes_ctr32_encrypt_blocks +.hidden vpaes_ctr32_encrypt_blocks +.type vpaes_ctr32_encrypt_blocks,@function +.align 16 +vpaes_ctr32_encrypt_blocks: +.cfi_startproc +_CET_ENDBR + + xchgq %rcx,%rdx + testq %rcx,%rcx + jz .Lctr32_abort + movdqu (%r8),%xmm0 + movdqa .Lctr_add_one(%rip),%xmm8 + subq %rdi,%rsi + call _vpaes_preheat + movdqa %xmm0,%xmm6 + pshufb .Lrev_ctr(%rip),%xmm6 + + testq $1,%rcx + jz .Lctr32_prep_loop + + + + movdqu (%rdi),%xmm7 + call _vpaes_encrypt_core + pxor %xmm7,%xmm0 + paddd %xmm8,%xmm6 + movdqu %xmm0,(%rsi,%rdi,1) + subq $1,%rcx + leaq 16(%rdi),%rdi + jz .Lctr32_done + +.Lctr32_prep_loop: + + + movdqa %xmm6,%xmm14 + movdqa %xmm6,%xmm15 + paddd %xmm8,%xmm15 + +.Lctr32_loop: + movdqa .Lrev_ctr(%rip),%xmm1 + movdqa %xmm14,%xmm0 + movdqa %xmm15,%xmm6 + pshufb %xmm1,%xmm0 + pshufb %xmm1,%xmm6 + call _vpaes_encrypt_core_2x + movdqu (%rdi),%xmm1 + movdqu 
16(%rdi),%xmm2 + movdqa .Lctr_add_two(%rip),%xmm3 + pxor %xmm1,%xmm0 + pxor %xmm2,%xmm6 + paddd %xmm3,%xmm14 + paddd %xmm3,%xmm15 + movdqu %xmm0,(%rsi,%rdi,1) + movdqu %xmm6,16(%rsi,%rdi,1) + subq $2,%rcx + leaq 32(%rdi),%rdi + jnz .Lctr32_loop + +.Lctr32_done: +.Lctr32_abort: + ret +.cfi_endproc +.size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks + + + + + + +.type _vpaes_preheat,@function +.align 16 +_vpaes_preheat: +.cfi_startproc + leaq .Lk_s0F(%rip),%r10 + movdqa -32(%r10),%xmm10 + movdqa -16(%r10),%xmm11 + movdqa 0(%r10),%xmm9 + movdqa 48(%r10),%xmm13 + movdqa 64(%r10),%xmm12 + movdqa 80(%r10),%xmm15 + movdqa 96(%r10),%xmm14 + ret +.cfi_endproc +.size _vpaes_preheat,.-_vpaes_preheat + + + + + +.type _vpaes_consts,@object +.section .rodata +.align 64 +_vpaes_consts: +.Lk_inv: +.quad 0x0E05060F0D080180, 0x040703090A0B0C02 +.quad 0x01040A060F0B0780, 0x030D0E0C02050809 + +.Lk_s0F: +.quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F + +.Lk_ipt: +.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 +.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 + +.Lk_sb1: +.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 +.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF +.Lk_sb2: +.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD +.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A +.Lk_sbo: +.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 +.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA + +.Lk_mc_forward: +.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 +.quad 0x080B0A0904070605, 0x000302010C0F0E0D +.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 +.quad 0x000302010C0F0E0D, 0x080B0A0904070605 + +.Lk_mc_backward: +.quad 0x0605040702010003, 0x0E0D0C0F0A09080B +.quad 0x020100030E0D0C0F, 0x0A09080B06050407 +.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 +.quad 0x0A09080B06050407, 0x020100030E0D0C0F + +.Lk_sr: +.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 +.quad 0x030E09040F0A0500, 0x0B06010C07020D08 +.quad 0x0F060D040B020900, 0x070E050C030A0108 +.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 + +.Lk_rcon: +.quad 
0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 + +.Lk_s63: +.quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B + +.Lk_opt: +.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 +.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 + +.Lk_deskew: +.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A +.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 + + + + + +.Lk_dksd: +.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 +.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E +.Lk_dksb: +.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 +.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 +.Lk_dkse: +.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 +.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 +.Lk_dks9: +.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC +.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE + + + + + +.Lk_dipt: +.quad 0x0F505B040B545F00, 0x154A411E114E451A +.quad 0x86E383E660056500, 0x12771772F491F194 + +.Lk_dsb9: +.quad 0x851C03539A86D600, 0xCAD51F504F994CC9 +.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 +.Lk_dsbd: +.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 +.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 +.Lk_dsbb: +.quad 0xD022649296B44200, 0x602646F6B0F2D404 +.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B +.Lk_dsbe: +.quad 0x46F2929626D4D000, 0x2242600464B4F6B0 +.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 +.Lk_dsbo: +.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D +.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C + + +.Lrev_ctr: +.quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 + + +.Lctr_add_one: +.quad 0x0000000000000000, 0x0000000100000000 +.Lctr_add_two: +.quad 0x0000000000000000, 0x0000000200000000 + +.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 +.align 64 +.size _vpaes_consts,.-_vpaes_consts +.text +#endif diff --git a/third_party/boringssl/gen/bcm/vpaes-x86_64-win.asm 
b/third_party/boringssl/gen/bcm/vpaes-x86_64-win.asm new file mode 100644 index 00000000..cddb9fa3 --- /dev/null +++ b/third_party/boringssl/gen/bcm/vpaes-x86_64-win.asm @@ -0,0 +1,1487 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_internal_x86_64_win_asm.inc" +%endif +section .text code align=64 + + + + + + + + + + + + + + + + + + +ALIGN 16 +_vpaes_encrypt_core: + + mov r9,rdx + mov r11,16 + mov eax,DWORD[240+rdx] + movdqa xmm1,xmm9 + movdqa xmm2,XMMWORD[$L$k_ipt] + pandn xmm1,xmm0 + movdqu xmm5,XMMWORD[r9] + psrld xmm1,4 + pand xmm0,xmm9 + pshufb xmm2,xmm0 + movdqa xmm0,XMMWORD[(($L$k_ipt+16))] + pshufb xmm0,xmm1 + pxor xmm2,xmm5 + add r9,16 + pxor xmm0,xmm2 + lea r10,[$L$k_mc_backward] + jmp NEAR $L$enc_entry + +ALIGN 16 +$L$enc_loop: + + movdqa xmm4,xmm13 + movdqa xmm0,xmm12 + pshufb xmm4,xmm2 + pshufb xmm0,xmm3 + pxor xmm4,xmm5 + movdqa xmm5,xmm15 + pxor xmm0,xmm4 + movdqa xmm1,XMMWORD[((-64))+r10*1+r11] + pshufb xmm5,xmm2 + movdqa xmm4,XMMWORD[r10*1+r11] + movdqa xmm2,xmm14 + pshufb xmm2,xmm3 + movdqa xmm3,xmm0 + pxor xmm2,xmm5 + pshufb xmm0,xmm1 + add r9,16 + pxor xmm0,xmm2 + pshufb xmm3,xmm4 + add r11,16 + pxor xmm3,xmm0 + pshufb xmm0,xmm1 + and r11,0x30 + sub rax,1 + pxor xmm0,xmm3 + +$L$enc_entry: + + movdqa xmm1,xmm9 + movdqa xmm5,xmm11 + pandn xmm1,xmm0 + psrld xmm1,4 + pand xmm0,xmm9 + pshufb xmm5,xmm0 + movdqa xmm3,xmm10 + pxor xmm0,xmm1 + pshufb xmm3,xmm1 + movdqa xmm4,xmm10 + pxor xmm3,xmm5 + pshufb xmm4,xmm0 + movdqa xmm2,xmm10 + pxor xmm4,xmm5 + pshufb xmm2,xmm3 + movdqa xmm3,xmm10 + pxor xmm2,xmm0 + pshufb xmm3,xmm4 + movdqu xmm5,XMMWORD[r9] + pxor xmm3,xmm1 + jnz NEAR $L$enc_loop + + + movdqa xmm4,XMMWORD[((-96))+r10] + movdqa xmm0,XMMWORD[((-80))+r10] + pshufb xmm4,xmm2 + pxor xmm4,xmm5 + pshufb 
xmm0,xmm3 + movdqa xmm1,XMMWORD[64+r10*1+r11] + pxor xmm0,xmm4 + pshufb xmm0,xmm1 + ret + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +ALIGN 16 +_vpaes_encrypt_core_2x: + + mov r9,rdx + mov r11,16 + mov eax,DWORD[240+rdx] + movdqa xmm1,xmm9 + movdqa xmm7,xmm9 + movdqa xmm2,XMMWORD[$L$k_ipt] + movdqa xmm8,xmm2 + pandn xmm1,xmm0 + pandn xmm7,xmm6 + movdqu xmm5,XMMWORD[r9] + + psrld xmm1,4 + psrld xmm7,4 + pand xmm0,xmm9 + pand xmm6,xmm9 + pshufb xmm2,xmm0 + pshufb xmm8,xmm6 + movdqa xmm0,XMMWORD[(($L$k_ipt+16))] + movdqa xmm6,xmm0 + pshufb xmm0,xmm1 + pshufb xmm6,xmm7 + pxor xmm2,xmm5 + pxor xmm8,xmm5 + add r9,16 + pxor xmm0,xmm2 + pxor xmm6,xmm8 + lea r10,[$L$k_mc_backward] + jmp NEAR $L$enc2x_entry + +ALIGN 16 +$L$enc2x_loop: + + movdqa xmm4,XMMWORD[$L$k_sb1] + movdqa xmm0,XMMWORD[(($L$k_sb1+16))] + movdqa xmm12,xmm4 + movdqa xmm6,xmm0 + pshufb xmm4,xmm2 + pshufb xmm12,xmm8 + pshufb xmm0,xmm3 + pshufb xmm6,xmm11 + pxor xmm4,xmm5 + pxor xmm12,xmm5 + movdqa xmm5,XMMWORD[$L$k_sb2] + movdqa xmm13,xmm5 + pxor xmm0,xmm4 + pxor xmm6,xmm12 + movdqa xmm1,XMMWORD[((-64))+r10*1+r11] + + pshufb xmm5,xmm2 + pshufb xmm13,xmm8 + movdqa xmm4,XMMWORD[r10*1+r11] + + movdqa xmm2,XMMWORD[(($L$k_sb2+16))] + movdqa xmm8,xmm2 + pshufb xmm2,xmm3 + pshufb xmm8,xmm11 + movdqa xmm3,xmm0 + movdqa xmm11,xmm6 + pxor xmm2,xmm5 + pxor xmm8,xmm13 + pshufb xmm0,xmm1 + pshufb xmm6,xmm1 + add r9,16 + pxor xmm0,xmm2 + pxor xmm6,xmm8 + pshufb xmm3,xmm4 + pshufb xmm11,xmm4 + add r11,16 + pxor xmm3,xmm0 + pxor xmm11,xmm6 + pshufb xmm0,xmm1 + pshufb xmm6,xmm1 + and r11,0x30 + sub rax,1 + pxor xmm0,xmm3 + pxor xmm6,xmm11 + +$L$enc2x_entry: + + movdqa xmm1,xmm9 + movdqa xmm7,xmm9 + movdqa xmm5,XMMWORD[(($L$k_inv+16))] + movdqa xmm13,xmm5 + pandn xmm1,xmm0 + pandn xmm7,xmm6 + psrld xmm1,4 + psrld xmm7,4 + pand xmm0,xmm9 + pand xmm6,xmm9 + pshufb xmm5,xmm0 + pshufb xmm13,xmm6 + movdqa xmm3,xmm10 + movdqa xmm11,xmm10 + pxor xmm0,xmm1 + pxor xmm6,xmm7 + pshufb xmm3,xmm1 + pshufb xmm11,xmm7 + 
movdqa xmm4,xmm10 + movdqa xmm12,xmm10 + pxor xmm3,xmm5 + pxor xmm11,xmm13 + pshufb xmm4,xmm0 + pshufb xmm12,xmm6 + movdqa xmm2,xmm10 + movdqa xmm8,xmm10 + pxor xmm4,xmm5 + pxor xmm12,xmm13 + pshufb xmm2,xmm3 + pshufb xmm8,xmm11 + movdqa xmm3,xmm10 + movdqa xmm11,xmm10 + pxor xmm2,xmm0 + pxor xmm8,xmm6 + pshufb xmm3,xmm4 + pshufb xmm11,xmm12 + movdqu xmm5,XMMWORD[r9] + + pxor xmm3,xmm1 + pxor xmm11,xmm7 + jnz NEAR $L$enc2x_loop + + + movdqa xmm4,XMMWORD[((-96))+r10] + movdqa xmm0,XMMWORD[((-80))+r10] + movdqa xmm12,xmm4 + movdqa xmm6,xmm0 + pshufb xmm4,xmm2 + pshufb xmm12,xmm8 + pxor xmm4,xmm5 + pxor xmm12,xmm5 + pshufb xmm0,xmm3 + pshufb xmm6,xmm11 + movdqa xmm1,XMMWORD[64+r10*1+r11] + + pxor xmm0,xmm4 + pxor xmm6,xmm12 + pshufb xmm0,xmm1 + pshufb xmm6,xmm1 + ret + + + + + + + + + +ALIGN 16 +_vpaes_decrypt_core: + + mov r9,rdx + mov eax,DWORD[240+rdx] + movdqa xmm1,xmm9 + movdqa xmm2,XMMWORD[$L$k_dipt] + pandn xmm1,xmm0 + mov r11,rax + psrld xmm1,4 + movdqu xmm5,XMMWORD[r9] + shl r11,4 + pand xmm0,xmm9 + pshufb xmm2,xmm0 + movdqa xmm0,XMMWORD[(($L$k_dipt+16))] + xor r11,0x30 + lea r10,[$L$k_dsbd] + pshufb xmm0,xmm1 + and r11,0x30 + pxor xmm2,xmm5 + movdqa xmm5,XMMWORD[(($L$k_mc_forward+48))] + pxor xmm0,xmm2 + add r9,16 + add r11,r10 + jmp NEAR $L$dec_entry + +ALIGN 16 +$L$dec_loop: + + + + movdqa xmm4,XMMWORD[((-32))+r10] + movdqa xmm1,XMMWORD[((-16))+r10] + pshufb xmm4,xmm2 + pshufb xmm1,xmm3 + pxor xmm0,xmm4 + movdqa xmm4,XMMWORD[r10] + pxor xmm0,xmm1 + movdqa xmm1,XMMWORD[16+r10] + + pshufb xmm4,xmm2 + pshufb xmm0,xmm5 + pshufb xmm1,xmm3 + pxor xmm0,xmm4 + movdqa xmm4,XMMWORD[32+r10] + pxor xmm0,xmm1 + movdqa xmm1,XMMWORD[48+r10] + + pshufb xmm4,xmm2 + pshufb xmm0,xmm5 + pshufb xmm1,xmm3 + pxor xmm0,xmm4 + movdqa xmm4,XMMWORD[64+r10] + pxor xmm0,xmm1 + movdqa xmm1,XMMWORD[80+r10] + + pshufb xmm4,xmm2 + pshufb xmm0,xmm5 + pshufb xmm1,xmm3 + pxor xmm0,xmm4 + add r9,16 + palignr xmm5,xmm5,12 + pxor xmm0,xmm1 + sub rax,1 + +$L$dec_entry: + + movdqa xmm1,xmm9 + 
pandn xmm1,xmm0 + movdqa xmm2,xmm11 + psrld xmm1,4 + pand xmm0,xmm9 + pshufb xmm2,xmm0 + movdqa xmm3,xmm10 + pxor xmm0,xmm1 + pshufb xmm3,xmm1 + movdqa xmm4,xmm10 + pxor xmm3,xmm2 + pshufb xmm4,xmm0 + pxor xmm4,xmm2 + movdqa xmm2,xmm10 + pshufb xmm2,xmm3 + movdqa xmm3,xmm10 + pxor xmm2,xmm0 + pshufb xmm3,xmm4 + movdqu xmm0,XMMWORD[r9] + pxor xmm3,xmm1 + jnz NEAR $L$dec_loop + + + movdqa xmm4,XMMWORD[96+r10] + pshufb xmm4,xmm2 + pxor xmm4,xmm0 + movdqa xmm0,XMMWORD[112+r10] + movdqa xmm2,XMMWORD[((-352))+r11] + pshufb xmm0,xmm3 + pxor xmm0,xmm4 + pshufb xmm0,xmm2 + ret + + + + + + + + + +ALIGN 16 +_vpaes_schedule_core: + + + + + + + call _vpaes_preheat + movdqa xmm8,XMMWORD[$L$k_rcon] + movdqu xmm0,XMMWORD[rdi] + + + movdqa xmm3,xmm0 + lea r11,[$L$k_ipt] + call _vpaes_schedule_transform + movdqa xmm7,xmm0 + + lea r10,[$L$k_sr] + test rcx,rcx + jnz NEAR $L$schedule_am_decrypting + + + movdqu XMMWORD[rdx],xmm0 + jmp NEAR $L$schedule_go + +$L$schedule_am_decrypting: + + movdqa xmm1,XMMWORD[r10*1+r8] + pshufb xmm3,xmm1 + movdqu XMMWORD[rdx],xmm3 + xor r8,0x30 + +$L$schedule_go: + cmp esi,192 + ja NEAR $L$schedule_256 + je NEAR $L$schedule_192 + + + + + + + + + + +$L$schedule_128: + mov esi,10 + +$L$oop_schedule_128: + call _vpaes_schedule_round + dec rsi + jz NEAR $L$schedule_mangle_last + call _vpaes_schedule_mangle + jmp NEAR $L$oop_schedule_128 + + + + + + + + + + + + + + + + +ALIGN 16 +$L$schedule_192: + movdqu xmm0,XMMWORD[8+rdi] + call _vpaes_schedule_transform + movdqa xmm6,xmm0 + pxor xmm4,xmm4 + movhlps xmm6,xmm4 + mov esi,4 + +$L$oop_schedule_192: + call _vpaes_schedule_round + palignr xmm0,xmm6,8 + call _vpaes_schedule_mangle + call _vpaes_schedule_192_smear + call _vpaes_schedule_mangle + call _vpaes_schedule_round + dec rsi + jz NEAR $L$schedule_mangle_last + call _vpaes_schedule_mangle + call _vpaes_schedule_192_smear + jmp NEAR $L$oop_schedule_192 + + + + + + + + + + + +ALIGN 16 +$L$schedule_256: + movdqu xmm0,XMMWORD[16+rdi] + call 
_vpaes_schedule_transform + mov esi,7 + +$L$oop_schedule_256: + call _vpaes_schedule_mangle + movdqa xmm6,xmm0 + + + call _vpaes_schedule_round + dec rsi + jz NEAR $L$schedule_mangle_last + call _vpaes_schedule_mangle + + + pshufd xmm0,xmm0,0xFF + movdqa xmm5,xmm7 + movdqa xmm7,xmm6 + call _vpaes_schedule_low_round + movdqa xmm7,xmm5 + + jmp NEAR $L$oop_schedule_256 + + + + + + + + + + + + +ALIGN 16 +$L$schedule_mangle_last: + + lea r11,[$L$k_deskew] + test rcx,rcx + jnz NEAR $L$schedule_mangle_last_dec + + + movdqa xmm1,XMMWORD[r10*1+r8] + pshufb xmm0,xmm1 + lea r11,[$L$k_opt] + add rdx,32 + +$L$schedule_mangle_last_dec: + add rdx,-16 + pxor xmm0,XMMWORD[$L$k_s63] + call _vpaes_schedule_transform + movdqu XMMWORD[rdx],xmm0 + + + pxor xmm0,xmm0 + pxor xmm1,xmm1 + pxor xmm2,xmm2 + pxor xmm3,xmm3 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + pxor xmm6,xmm6 + pxor xmm7,xmm7 + ret + + + + + + + + + + + + + + + + + + +ALIGN 16 +_vpaes_schedule_192_smear: + + pshufd xmm1,xmm6,0x80 + pshufd xmm0,xmm7,0xFE + pxor xmm6,xmm1 + pxor xmm1,xmm1 + pxor xmm6,xmm0 + movdqa xmm0,xmm6 + movhlps xmm6,xmm1 + ret + + + + + + + + + + + + + + + + + + + + + + +ALIGN 16 +_vpaes_schedule_round: + + + pxor xmm1,xmm1 + palignr xmm1,xmm8,15 + palignr xmm8,xmm8,15 + pxor xmm7,xmm1 + + + pshufd xmm0,xmm0,0xFF + palignr xmm0,xmm0,1 + + + + +_vpaes_schedule_low_round: + + movdqa xmm1,xmm7 + pslldq xmm7,4 + pxor xmm7,xmm1 + movdqa xmm1,xmm7 + pslldq xmm7,8 + pxor xmm7,xmm1 + pxor xmm7,XMMWORD[$L$k_s63] + + + movdqa xmm1,xmm9 + pandn xmm1,xmm0 + psrld xmm1,4 + pand xmm0,xmm9 + movdqa xmm2,xmm11 + pshufb xmm2,xmm0 + pxor xmm0,xmm1 + movdqa xmm3,xmm10 + pshufb xmm3,xmm1 + pxor xmm3,xmm2 + movdqa xmm4,xmm10 + pshufb xmm4,xmm0 + pxor xmm4,xmm2 + movdqa xmm2,xmm10 + pshufb xmm2,xmm3 + pxor xmm2,xmm0 + movdqa xmm3,xmm10 + pshufb xmm3,xmm4 + pxor xmm3,xmm1 + movdqa xmm4,xmm13 + pshufb xmm4,xmm2 + movdqa xmm0,xmm12 + pshufb xmm0,xmm3 + pxor xmm0,xmm4 + + + pxor xmm0,xmm7 + movdqa xmm7,xmm0 + ret + + + + + + + + + + + 
+ + +ALIGN 16 +_vpaes_schedule_transform: + + movdqa xmm1,xmm9 + pandn xmm1,xmm0 + psrld xmm1,4 + pand xmm0,xmm9 + movdqa xmm2,XMMWORD[r11] + pshufb xmm2,xmm0 + movdqa xmm0,XMMWORD[16+r11] + pshufb xmm0,xmm1 + pxor xmm0,xmm2 + ret + + + + + + + + + + + + + + + + + + + + + + + + + + + +ALIGN 16 +_vpaes_schedule_mangle: + + movdqa xmm4,xmm0 + movdqa xmm5,XMMWORD[$L$k_mc_forward] + test rcx,rcx + jnz NEAR $L$schedule_mangle_dec + + + add rdx,16 + pxor xmm4,XMMWORD[$L$k_s63] + pshufb xmm4,xmm5 + movdqa xmm3,xmm4 + pshufb xmm4,xmm5 + pxor xmm3,xmm4 + pshufb xmm4,xmm5 + pxor xmm3,xmm4 + + jmp NEAR $L$schedule_mangle_both +ALIGN 16 +$L$schedule_mangle_dec: + + lea r11,[$L$k_dksd] + movdqa xmm1,xmm9 + pandn xmm1,xmm4 + psrld xmm1,4 + pand xmm4,xmm9 + + movdqa xmm2,XMMWORD[r11] + pshufb xmm2,xmm4 + movdqa xmm3,XMMWORD[16+r11] + pshufb xmm3,xmm1 + pxor xmm3,xmm2 + pshufb xmm3,xmm5 + + movdqa xmm2,XMMWORD[32+r11] + pshufb xmm2,xmm4 + pxor xmm2,xmm3 + movdqa xmm3,XMMWORD[48+r11] + pshufb xmm3,xmm1 + pxor xmm3,xmm2 + pshufb xmm3,xmm5 + + movdqa xmm2,XMMWORD[64+r11] + pshufb xmm2,xmm4 + pxor xmm2,xmm3 + movdqa xmm3,XMMWORD[80+r11] + pshufb xmm3,xmm1 + pxor xmm3,xmm2 + pshufb xmm3,xmm5 + + movdqa xmm2,XMMWORD[96+r11] + pshufb xmm2,xmm4 + pxor xmm2,xmm3 + movdqa xmm3,XMMWORD[112+r11] + pshufb xmm3,xmm1 + pxor xmm3,xmm2 + + add rdx,-16 + +$L$schedule_mangle_both: + movdqa xmm1,XMMWORD[r10*1+r8] + pshufb xmm3,xmm1 + add r8,-16 + and r8,0x30 + movdqu XMMWORD[rdx],xmm3 + ret + + + + + + +global vpaes_set_encrypt_key + +ALIGN 16 +vpaes_set_encrypt_key: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_vpaes_set_encrypt_key: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR +%ifdef BORINGSSL_DISPATCH_TEST +EXTERN BORINGSSL_function_hit + mov BYTE[((BORINGSSL_function_hit+5))],1 +%endif + + lea rsp,[((-184))+rsp] + movaps XMMWORD[16+rsp],xmm6 + movaps XMMWORD[32+rsp],xmm7 + movaps XMMWORD[48+rsp],xmm8 + movaps XMMWORD[64+rsp],xmm9 + 
movaps XMMWORD[80+rsp],xmm10 + movaps XMMWORD[96+rsp],xmm11 + movaps XMMWORD[112+rsp],xmm12 + movaps XMMWORD[128+rsp],xmm13 + movaps XMMWORD[144+rsp],xmm14 + movaps XMMWORD[160+rsp],xmm15 +$L$enc_key_body: + mov eax,esi + shr eax,5 + add eax,5 + mov DWORD[240+rdx],eax + + mov ecx,0 + mov r8d,0x30 + call _vpaes_schedule_core + movaps xmm6,XMMWORD[16+rsp] + movaps xmm7,XMMWORD[32+rsp] + movaps xmm8,XMMWORD[48+rsp] + movaps xmm9,XMMWORD[64+rsp] + movaps xmm10,XMMWORD[80+rsp] + movaps xmm11,XMMWORD[96+rsp] + movaps xmm12,XMMWORD[112+rsp] + movaps xmm13,XMMWORD[128+rsp] + movaps xmm14,XMMWORD[144+rsp] + movaps xmm15,XMMWORD[160+rsp] + lea rsp,[184+rsp] +$L$enc_key_epilogue: + xor eax,eax + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_vpaes_set_encrypt_key: + +global vpaes_set_decrypt_key + +ALIGN 16 +vpaes_set_decrypt_key: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_vpaes_set_decrypt_key: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + lea rsp,[((-184))+rsp] + movaps XMMWORD[16+rsp],xmm6 + movaps XMMWORD[32+rsp],xmm7 + movaps XMMWORD[48+rsp],xmm8 + movaps XMMWORD[64+rsp],xmm9 + movaps XMMWORD[80+rsp],xmm10 + movaps XMMWORD[96+rsp],xmm11 + movaps XMMWORD[112+rsp],xmm12 + movaps XMMWORD[128+rsp],xmm13 + movaps XMMWORD[144+rsp],xmm14 + movaps XMMWORD[160+rsp],xmm15 +$L$dec_key_body: + mov eax,esi + shr eax,5 + add eax,5 + mov DWORD[240+rdx],eax + shl eax,4 + lea rdx,[16+rax*1+rdx] + + mov ecx,1 + mov r8d,esi + shr r8d,1 + and r8d,32 + xor r8d,32 + call _vpaes_schedule_core + movaps xmm6,XMMWORD[16+rsp] + movaps xmm7,XMMWORD[32+rsp] + movaps xmm8,XMMWORD[48+rsp] + movaps xmm9,XMMWORD[64+rsp] + movaps xmm10,XMMWORD[80+rsp] + movaps xmm11,XMMWORD[96+rsp] + movaps xmm12,XMMWORD[112+rsp] + movaps xmm13,XMMWORD[128+rsp] + movaps xmm14,XMMWORD[144+rsp] + movaps xmm15,XMMWORD[160+rsp] + lea rsp,[184+rsp] +$L$dec_key_epilogue: + xor eax,eax + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov 
rsi,QWORD[16+rsp] + ret + +$L$SEH_end_vpaes_set_decrypt_key: + +global vpaes_encrypt + +ALIGN 16 +vpaes_encrypt: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_vpaes_encrypt: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR +%ifdef BORINGSSL_DISPATCH_TEST +EXTERN BORINGSSL_function_hit + mov BYTE[((BORINGSSL_function_hit+4))],1 +%endif + lea rsp,[((-184))+rsp] + movaps XMMWORD[16+rsp],xmm6 + movaps XMMWORD[32+rsp],xmm7 + movaps XMMWORD[48+rsp],xmm8 + movaps XMMWORD[64+rsp],xmm9 + movaps XMMWORD[80+rsp],xmm10 + movaps XMMWORD[96+rsp],xmm11 + movaps XMMWORD[112+rsp],xmm12 + movaps XMMWORD[128+rsp],xmm13 + movaps XMMWORD[144+rsp],xmm14 + movaps XMMWORD[160+rsp],xmm15 +$L$enc_body: + movdqu xmm0,XMMWORD[rdi] + call _vpaes_preheat + call _vpaes_encrypt_core + movdqu XMMWORD[rsi],xmm0 + movaps xmm6,XMMWORD[16+rsp] + movaps xmm7,XMMWORD[32+rsp] + movaps xmm8,XMMWORD[48+rsp] + movaps xmm9,XMMWORD[64+rsp] + movaps xmm10,XMMWORD[80+rsp] + movaps xmm11,XMMWORD[96+rsp] + movaps xmm12,XMMWORD[112+rsp] + movaps xmm13,XMMWORD[128+rsp] + movaps xmm14,XMMWORD[144+rsp] + movaps xmm15,XMMWORD[160+rsp] + lea rsp,[184+rsp] +$L$enc_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_vpaes_encrypt: + +global vpaes_decrypt + +ALIGN 16 +vpaes_decrypt: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_vpaes_decrypt: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + lea rsp,[((-184))+rsp] + movaps XMMWORD[16+rsp],xmm6 + movaps XMMWORD[32+rsp],xmm7 + movaps XMMWORD[48+rsp],xmm8 + movaps XMMWORD[64+rsp],xmm9 + movaps XMMWORD[80+rsp],xmm10 + movaps XMMWORD[96+rsp],xmm11 + movaps XMMWORD[112+rsp],xmm12 + movaps XMMWORD[128+rsp],xmm13 + movaps XMMWORD[144+rsp],xmm14 + movaps XMMWORD[160+rsp],xmm15 +$L$dec_body: + movdqu xmm0,XMMWORD[rdi] + call _vpaes_preheat + call _vpaes_decrypt_core + movdqu XMMWORD[rsi],xmm0 + movaps xmm6,XMMWORD[16+rsp] + movaps 
xmm7,XMMWORD[32+rsp] + movaps xmm8,XMMWORD[48+rsp] + movaps xmm9,XMMWORD[64+rsp] + movaps xmm10,XMMWORD[80+rsp] + movaps xmm11,XMMWORD[96+rsp] + movaps xmm12,XMMWORD[112+rsp] + movaps xmm13,XMMWORD[128+rsp] + movaps xmm14,XMMWORD[144+rsp] + movaps xmm15,XMMWORD[160+rsp] + lea rsp,[184+rsp] +$L$dec_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_vpaes_decrypt: +global vpaes_cbc_encrypt + +ALIGN 16 +vpaes_cbc_encrypt: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_vpaes_cbc_encrypt: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +_CET_ENDBR + xchg rdx,rcx + sub rcx,16 + jc NEAR $L$cbc_abort + lea rsp,[((-184))+rsp] + movaps XMMWORD[16+rsp],xmm6 + movaps XMMWORD[32+rsp],xmm7 + movaps XMMWORD[48+rsp],xmm8 + movaps XMMWORD[64+rsp],xmm9 + movaps XMMWORD[80+rsp],xmm10 + movaps XMMWORD[96+rsp],xmm11 + movaps XMMWORD[112+rsp],xmm12 + movaps XMMWORD[128+rsp],xmm13 + movaps XMMWORD[144+rsp],xmm14 + movaps XMMWORD[160+rsp],xmm15 +$L$cbc_body: + movdqu xmm6,XMMWORD[r8] + sub rsi,rdi + call _vpaes_preheat + cmp r9d,0 + je NEAR $L$cbc_dec_loop + jmp NEAR $L$cbc_enc_loop +ALIGN 16 +$L$cbc_enc_loop: + movdqu xmm0,XMMWORD[rdi] + pxor xmm0,xmm6 + call _vpaes_encrypt_core + movdqa xmm6,xmm0 + movdqu XMMWORD[rdi*1+rsi],xmm0 + lea rdi,[16+rdi] + sub rcx,16 + jnc NEAR $L$cbc_enc_loop + jmp NEAR $L$cbc_done +ALIGN 16 +$L$cbc_dec_loop: + movdqu xmm0,XMMWORD[rdi] + movdqa xmm7,xmm0 + call _vpaes_decrypt_core + pxor xmm0,xmm6 + movdqa xmm6,xmm7 + movdqu XMMWORD[rdi*1+rsi],xmm0 + lea rdi,[16+rdi] + sub rcx,16 + jnc NEAR $L$cbc_dec_loop +$L$cbc_done: + movdqu XMMWORD[r8],xmm6 + movaps xmm6,XMMWORD[16+rsp] + movaps xmm7,XMMWORD[32+rsp] + movaps xmm8,XMMWORD[48+rsp] + movaps xmm9,XMMWORD[64+rsp] + movaps xmm10,XMMWORD[80+rsp] + movaps xmm11,XMMWORD[96+rsp] + movaps xmm12,XMMWORD[112+rsp] + movaps xmm13,XMMWORD[128+rsp] + movaps xmm14,XMMWORD[144+rsp] 
+ movaps xmm15,XMMWORD[160+rsp] + lea rsp,[184+rsp] +$L$cbc_epilogue: +$L$cbc_abort: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_vpaes_cbc_encrypt: +global vpaes_ctr32_encrypt_blocks + +ALIGN 16 +vpaes_ctr32_encrypt_blocks: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_vpaes_ctr32_encrypt_blocks: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + +_CET_ENDBR + + xchg rdx,rcx + test rcx,rcx + jz NEAR $L$ctr32_abort + lea rsp,[((-184))+rsp] + movaps XMMWORD[16+rsp],xmm6 + movaps XMMWORD[32+rsp],xmm7 + movaps XMMWORD[48+rsp],xmm8 + movaps XMMWORD[64+rsp],xmm9 + movaps XMMWORD[80+rsp],xmm10 + movaps XMMWORD[96+rsp],xmm11 + movaps XMMWORD[112+rsp],xmm12 + movaps XMMWORD[128+rsp],xmm13 + movaps XMMWORD[144+rsp],xmm14 + movaps XMMWORD[160+rsp],xmm15 +$L$ctr32_body: + movdqu xmm0,XMMWORD[r8] + movdqa xmm8,XMMWORD[$L$ctr_add_one] + sub rsi,rdi + call _vpaes_preheat + movdqa xmm6,xmm0 + pshufb xmm6,XMMWORD[$L$rev_ctr] + + test rcx,1 + jz NEAR $L$ctr32_prep_loop + + + + movdqu xmm7,XMMWORD[rdi] + call _vpaes_encrypt_core + pxor xmm0,xmm7 + paddd xmm6,xmm8 + movdqu XMMWORD[rdi*1+rsi],xmm0 + sub rcx,1 + lea rdi,[16+rdi] + jz NEAR $L$ctr32_done + +$L$ctr32_prep_loop: + + + movdqa xmm14,xmm6 + movdqa xmm15,xmm6 + paddd xmm15,xmm8 + +$L$ctr32_loop: + movdqa xmm1,XMMWORD[$L$rev_ctr] + movdqa xmm0,xmm14 + movdqa xmm6,xmm15 + pshufb xmm0,xmm1 + pshufb xmm6,xmm1 + call _vpaes_encrypt_core_2x + movdqu xmm1,XMMWORD[rdi] + movdqu xmm2,XMMWORD[16+rdi] + movdqa xmm3,XMMWORD[$L$ctr_add_two] + pxor xmm0,xmm1 + pxor xmm6,xmm2 + paddd xmm14,xmm3 + paddd xmm15,xmm3 + movdqu XMMWORD[rdi*1+rsi],xmm0 + movdqu XMMWORD[16+rdi*1+rsi],xmm6 + sub rcx,2 + lea rdi,[32+rdi] + jnz NEAR $L$ctr32_loop + +$L$ctr32_done: + movaps xmm6,XMMWORD[16+rsp] + movaps xmm7,XMMWORD[32+rsp] + movaps xmm8,XMMWORD[48+rsp] + movaps xmm9,XMMWORD[64+rsp] + movaps xmm10,XMMWORD[80+rsp] + movaps 
xmm11,XMMWORD[96+rsp] + movaps xmm12,XMMWORD[112+rsp] + movaps xmm13,XMMWORD[128+rsp] + movaps xmm14,XMMWORD[144+rsp] + movaps xmm15,XMMWORD[160+rsp] + lea rsp,[184+rsp] +$L$ctr32_epilogue: +$L$ctr32_abort: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_vpaes_ctr32_encrypt_blocks: + + + + + + + +ALIGN 16 +_vpaes_preheat: + + lea r10,[$L$k_s0F] + movdqa xmm10,XMMWORD[((-32))+r10] + movdqa xmm11,XMMWORD[((-16))+r10] + movdqa xmm9,XMMWORD[r10] + movdqa xmm13,XMMWORD[48+r10] + movdqa xmm12,XMMWORD[64+r10] + movdqa xmm15,XMMWORD[80+r10] + movdqa xmm14,XMMWORD[96+r10] + ret + + + + + + + + +section .rdata rdata align=8 +ALIGN 64 +_vpaes_consts: +$L$k_inv: + DQ 0x0E05060F0D080180,0x040703090A0B0C02 + DQ 0x01040A060F0B0780,0x030D0E0C02050809 + +$L$k_s0F: + DQ 0x0F0F0F0F0F0F0F0F,0x0F0F0F0F0F0F0F0F + +$L$k_ipt: + DQ 0xC2B2E8985A2A7000,0xCABAE09052227808 + DQ 0x4C01307D317C4D00,0xCD80B1FCB0FDCC81 + +$L$k_sb1: + DQ 0xB19BE18FCB503E00,0xA5DF7A6E142AF544 + DQ 0x3618D415FAE22300,0x3BF7CCC10D2ED9EF +$L$k_sb2: + DQ 0xE27A93C60B712400,0x5EB7E955BC982FCD + DQ 0x69EB88400AE12900,0xC2A163C8AB82234A +$L$k_sbo: + DQ 0xD0D26D176FBDC700,0x15AABF7AC502A878 + DQ 0xCFE474A55FBB6A00,0x8E1E90D1412B35FA + +$L$k_mc_forward: + DQ 0x0407060500030201,0x0C0F0E0D080B0A09 + DQ 0x080B0A0904070605,0x000302010C0F0E0D + DQ 0x0C0F0E0D080B0A09,0x0407060500030201 + DQ 0x000302010C0F0E0D,0x080B0A0904070605 + +$L$k_mc_backward: + DQ 0x0605040702010003,0x0E0D0C0F0A09080B + DQ 0x020100030E0D0C0F,0x0A09080B06050407 + DQ 0x0E0D0C0F0A09080B,0x0605040702010003 + DQ 0x0A09080B06050407,0x020100030E0D0C0F + +$L$k_sr: + DQ 0x0706050403020100,0x0F0E0D0C0B0A0908 + DQ 0x030E09040F0A0500,0x0B06010C07020D08 + DQ 0x0F060D040B020900,0x070E050C030A0108 + DQ 0x0B0E0104070A0D00,0x0306090C0F020508 + +$L$k_rcon: + DQ 0x1F8391B9AF9DEEB6,0x702A98084D7C7D81 + +$L$k_s63: + DQ 0x5B5B5B5B5B5B5B5B,0x5B5B5B5B5B5B5B5B + +$L$k_opt: + DQ 0xFF9F4929D6B66000,0xF7974121DEBE6808 + DQ 
0x01EDBD5150BCEC00,0xE10D5DB1B05C0CE0 + +$L$k_deskew: + DQ 0x07E4A34047A4E300,0x1DFEB95A5DBEF91A + DQ 0x5F36B5DC83EA6900,0x2841C2ABF49D1E77 + + + + + +$L$k_dksd: + DQ 0xFEB91A5DA3E44700,0x0740E3A45A1DBEF9 + DQ 0x41C277F4B5368300,0x5FDC69EAAB289D1E +$L$k_dksb: + DQ 0x9A4FCA1F8550D500,0x03D653861CC94C99 + DQ 0x115BEDA7B6FC4A00,0xD993256F7E3482C8 +$L$k_dkse: + DQ 0xD5031CCA1FC9D600,0x53859A4C994F5086 + DQ 0xA23196054FDC7BE8,0xCD5EF96A20B31487 +$L$k_dks9: + DQ 0xB6116FC87ED9A700,0x4AED933482255BFC + DQ 0x4576516227143300,0x8BB89FACE9DAFDCE + + + + + +$L$k_dipt: + DQ 0x0F505B040B545F00,0x154A411E114E451A + DQ 0x86E383E660056500,0x12771772F491F194 + +$L$k_dsb9: + DQ 0x851C03539A86D600,0xCAD51F504F994CC9 + DQ 0xC03B1789ECD74900,0x725E2C9EB2FBA565 +$L$k_dsbd: + DQ 0x7D57CCDFE6B1A200,0xF56E9B13882A4439 + DQ 0x3CE2FAF724C6CB00,0x2931180D15DEEFD3 +$L$k_dsbb: + DQ 0xD022649296B44200,0x602646F6B0F2D404 + DQ 0xC19498A6CD596700,0xF3FF0C3E3255AA6B +$L$k_dsbe: + DQ 0x46F2929626D4D000,0x2242600464B4F6B0 + DQ 0x0C55A6CDFFAAC100,0x9467F36B98593E32 +$L$k_dsbo: + DQ 0x1387EA537EF94000,0xC7AA6DB9D4943E2D + DQ 0x12D7560F93441D00,0xCA4B8159D8C58E9C + + +$L$rev_ctr: + DQ 0x0706050403020100,0x0c0d0e0f0b0a0908 + + +$L$ctr_add_one: + DQ 0x0000000000000000,0x0000000100000000 +$L$ctr_add_two: + DQ 0x0000000000000000,0x0000000200000000 + + DB 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105 + DB 111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54 + DB 52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97 + DB 109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32 + DB 85,110,105,118,101,114,115,105,116,121,41,0 +ALIGN 64 + +section .text + +EXTERN __imp_RtlVirtualUnwind + +ALIGN 16 +se_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$in_prologue + + 
mov rax,QWORD[152+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$in_prologue + + lea rsi,[16+rax] + lea rdi,[512+r8] + mov ecx,20 + DD 0xa548f3fc + lea rax,[184+rax] + +$L$in_prologue: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + ret + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_vpaes_set_encrypt_key wrt ..imagebase + DD $L$SEH_end_vpaes_set_encrypt_key wrt ..imagebase + DD $L$SEH_info_vpaes_set_encrypt_key wrt ..imagebase + + DD $L$SEH_begin_vpaes_set_decrypt_key wrt ..imagebase + DD $L$SEH_end_vpaes_set_decrypt_key wrt ..imagebase + DD $L$SEH_info_vpaes_set_decrypt_key wrt ..imagebase + + DD $L$SEH_begin_vpaes_encrypt wrt ..imagebase + DD $L$SEH_end_vpaes_encrypt wrt ..imagebase + DD $L$SEH_info_vpaes_encrypt wrt ..imagebase + + DD $L$SEH_begin_vpaes_decrypt wrt ..imagebase + DD $L$SEH_end_vpaes_decrypt wrt ..imagebase + DD $L$SEH_info_vpaes_decrypt wrt ..imagebase + + DD $L$SEH_begin_vpaes_cbc_encrypt wrt ..imagebase + DD $L$SEH_end_vpaes_cbc_encrypt wrt ..imagebase + DD $L$SEH_info_vpaes_cbc_encrypt wrt ..imagebase + + DD $L$SEH_begin_vpaes_ctr32_encrypt_blocks wrt ..imagebase + DD $L$SEH_end_vpaes_ctr32_encrypt_blocks wrt ..imagebase + DD $L$SEH_info_vpaes_ctr32_encrypt_blocks wrt ..imagebase + +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_vpaes_set_encrypt_key: + DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$enc_key_body wrt ..imagebase,$L$enc_key_epilogue wrt 
..imagebase +$L$SEH_info_vpaes_set_decrypt_key: + DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$dec_key_body wrt ..imagebase,$L$dec_key_epilogue wrt ..imagebase +$L$SEH_info_vpaes_encrypt: + DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$enc_body wrt ..imagebase,$L$enc_epilogue wrt ..imagebase +$L$SEH_info_vpaes_decrypt: + DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$dec_body wrt ..imagebase,$L$dec_epilogue wrt ..imagebase +$L$SEH_info_vpaes_cbc_encrypt: + DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$cbc_body wrt ..imagebase,$L$cbc_epilogue wrt ..imagebase +$L$SEH_info_vpaes_ctr32_encrypt_blocks: + DB 9,0,0,0 + DD se_handler wrt ..imagebase + DD $L$ctr32_body wrt ..imagebase,$L$ctr32_epilogue wrt ..imagebase +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/third_party/boringssl/gen/bcm/x86-mont-apple.S b/third_party/boringssl/gen/bcm/x86-mont-apple.S new file mode 100644 index 00000000..89e439ba --- /dev/null +++ b/third_party/boringssl/gen/bcm/x86-mont-apple.S @@ -0,0 +1,216 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +.text +.globl _bn_mul_mont_words +.private_extern _bn_mul_mont_words +.align 4 +_bn_mul_mont_words: +L_bn_mul_mont_words_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 40(%esp),%edi + leal 20(%esp),%esi + leal 24(%esp),%edx + addl $2,%edi + negl %edi + leal -32(%esp,%edi,4),%ebp + negl %edi + movl %ebp,%eax + subl %edx,%eax + andl $2047,%eax + subl %eax,%ebp + xorl %ebp,%edx + andl $2048,%edx + xorl $2048,%edx + subl %edx,%ebp + andl $-64,%ebp + movl %esp,%eax + subl %ebp,%eax + andl $-4096,%eax + movl %esp,%edx + leal (%ebp,%eax,1),%esp + movl (%esp),%eax + cmpl %ebp,%esp + ja L000page_walk + jmp L001page_walk_done +.align 4,0x90 +L000page_walk: + leal -4096(%esp),%esp + movl (%esp),%eax + cmpl %ebp,%esp + ja L000page_walk +L001page_walk_done: + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%ebp + movl 16(%esi),%esi + movl (%esi),%esi + movl %eax,4(%esp) + movl %ebx,8(%esp) + movl %ecx,12(%esp) + movl %ebp,16(%esp) + movl %esi,20(%esp) + leal -3(%edi),%ebx + movl %edx,24(%esp) + movl $-1,%eax + movd %eax,%mm7 + movl 8(%esp),%esi + movl 12(%esp),%edi + movl 16(%esp),%ebp + xorl %edx,%edx + xorl %ecx,%ecx + movd (%edi),%mm4 + movd (%esi),%mm5 + movd (%ebp),%mm3 + pmuludq %mm4,%mm5 + movq %mm5,%mm2 + movq %mm5,%mm0 + pand %mm7,%mm0 + pmuludq 20(%esp),%mm5 + pmuludq %mm5,%mm3 + paddq %mm0,%mm3 + movd 4(%ebp),%mm1 + movd 4(%esi),%mm0 + psrlq $32,%mm2 + psrlq $32,%mm3 + incl %ecx +.align 4,0x90 +L0021st: + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + pand %mm7,%mm0 + movd 4(%ebp,%ecx,4),%mm1 + paddq %mm0,%mm3 + movd 4(%esi,%ecx,4),%mm0 + psrlq $32,%mm2 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm3 + leal 1(%ecx),%ecx + cmpl %ebx,%ecx + jl L0021st + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + pand %mm7,%mm0 + paddq %mm0,%mm3 + movd 
%mm3,28(%esp,%ecx,4) + psrlq $32,%mm2 + psrlq $32,%mm3 + paddq %mm2,%mm3 + movq %mm3,32(%esp,%ebx,4) + incl %edx +L003outer: + xorl %ecx,%ecx + movd (%edi,%edx,4),%mm4 + movd (%esi),%mm5 + movd 32(%esp),%mm6 + movd (%ebp),%mm3 + pmuludq %mm4,%mm5 + paddq %mm6,%mm5 + movq %mm5,%mm0 + movq %mm5,%mm2 + pand %mm7,%mm0 + pmuludq 20(%esp),%mm5 + pmuludq %mm5,%mm3 + paddq %mm0,%mm3 + movd 36(%esp),%mm6 + movd 4(%ebp),%mm1 + movd 4(%esi),%mm0 + psrlq $32,%mm2 + psrlq $32,%mm3 + paddq %mm6,%mm2 + incl %ecx + decl %ebx +L004inner: + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + movd 36(%esp,%ecx,4),%mm6 + pand %mm7,%mm0 + movd 4(%ebp,%ecx,4),%mm1 + paddq %mm0,%mm3 + movd 4(%esi,%ecx,4),%mm0 + psrlq $32,%mm2 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm3 + paddq %mm6,%mm2 + decl %ebx + leal 1(%ecx),%ecx + jnz L004inner + movl %ecx,%ebx + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + pand %mm7,%mm0 + paddq %mm0,%mm3 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm2 + psrlq $32,%mm3 + movd 36(%esp,%ebx,4),%mm6 + paddq %mm2,%mm3 + paddq %mm6,%mm3 + movq %mm3,32(%esp,%ebx,4) + leal 1(%edx),%edx + cmpl %ebx,%edx + jle L003outer + emms + jmp L005common_tail +.align 4,0x90 +L005common_tail: + movl 16(%esp),%ebp + movl 4(%esp),%edi + leal 32(%esp),%esi + movl (%esi),%eax + movl %ebx,%ecx + xorl %edx,%edx +.align 4,0x90 +L006sub: + sbbl (%ebp,%edx,4),%eax + movl %eax,(%edi,%edx,4) + decl %ecx + movl 4(%esi,%edx,4),%eax + leal 1(%edx),%edx + jge L006sub + sbbl $0,%eax + movl $-1,%edx + xorl %eax,%edx + jmp L007copy +.align 4,0x90 +L007copy: + movl 32(%esp,%ebx,4),%esi + movl (%edi,%ebx,4),%ebp + movl %ecx,32(%esp,%ebx,4) + andl %eax,%esi + andl %edx,%ebp + orl %esi,%ebp + movl %ebp,(%edi,%ebx,4) + decl %ebx + jge L007copy + movl 24(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 +.byte 
112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56 +.byte 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 +.byte 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 +.byte 111,114,103,62,0 +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) diff --git a/third_party/boringssl/gen/bcm/x86-mont-linux.S b/third_party/boringssl/gen/bcm/x86-mont-linux.S new file mode 100644 index 00000000..edf98a27 --- /dev/null +++ b/third_party/boringssl/gen/bcm/x86-mont-linux.S @@ -0,0 +1,218 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) +.text +.globl bn_mul_mont_words +.hidden bn_mul_mont_words +.type bn_mul_mont_words,@function +.align 16 +bn_mul_mont_words: +.L_bn_mul_mont_words_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 40(%esp),%edi + leal 20(%esp),%esi + leal 24(%esp),%edx + addl $2,%edi + negl %edi + leal -32(%esp,%edi,4),%ebp + negl %edi + movl %ebp,%eax + subl %edx,%eax + andl $2047,%eax + subl %eax,%ebp + xorl %ebp,%edx + andl $2048,%edx + xorl $2048,%edx + subl %edx,%ebp + andl $-64,%ebp + movl %esp,%eax + subl %ebp,%eax + andl $-4096,%eax + movl %esp,%edx + leal (%ebp,%eax,1),%esp + movl (%esp),%eax + cmpl %ebp,%esp + ja .L000page_walk + jmp .L001page_walk_done +.align 16 +.L000page_walk: + leal -4096(%esp),%esp + movl (%esp),%eax + cmpl %ebp,%esp + ja .L000page_walk +.L001page_walk_done: + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%ebp + movl 16(%esi),%esi + movl (%esi),%esi + movl %eax,4(%esp) + movl %ebx,8(%esp) + movl %ecx,12(%esp) + movl %ebp,16(%esp) + movl %esi,20(%esp) + leal -3(%edi),%ebx + movl %edx,24(%esp) + movl $-1,%eax + movd %eax,%mm7 + movl 8(%esp),%esi + movl 12(%esp),%edi + movl 16(%esp),%ebp + xorl %edx,%edx + xorl %ecx,%ecx + movd (%edi),%mm4 + movd (%esi),%mm5 + movd (%ebp),%mm3 + pmuludq 
%mm4,%mm5 + movq %mm5,%mm2 + movq %mm5,%mm0 + pand %mm7,%mm0 + pmuludq 20(%esp),%mm5 + pmuludq %mm5,%mm3 + paddq %mm0,%mm3 + movd 4(%ebp),%mm1 + movd 4(%esi),%mm0 + psrlq $32,%mm2 + psrlq $32,%mm3 + incl %ecx +.align 16 +.L0021st: + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + pand %mm7,%mm0 + movd 4(%ebp,%ecx,4),%mm1 + paddq %mm0,%mm3 + movd 4(%esi,%ecx,4),%mm0 + psrlq $32,%mm2 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm3 + leal 1(%ecx),%ecx + cmpl %ebx,%ecx + jl .L0021st + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + pand %mm7,%mm0 + paddq %mm0,%mm3 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm2 + psrlq $32,%mm3 + paddq %mm2,%mm3 + movq %mm3,32(%esp,%ebx,4) + incl %edx +.L003outer: + xorl %ecx,%ecx + movd (%edi,%edx,4),%mm4 + movd (%esi),%mm5 + movd 32(%esp),%mm6 + movd (%ebp),%mm3 + pmuludq %mm4,%mm5 + paddq %mm6,%mm5 + movq %mm5,%mm0 + movq %mm5,%mm2 + pand %mm7,%mm0 + pmuludq 20(%esp),%mm5 + pmuludq %mm5,%mm3 + paddq %mm0,%mm3 + movd 36(%esp),%mm6 + movd 4(%ebp),%mm1 + movd 4(%esi),%mm0 + psrlq $32,%mm2 + psrlq $32,%mm3 + paddq %mm6,%mm2 + incl %ecx + decl %ebx +.L004inner: + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + movd 36(%esp,%ecx,4),%mm6 + pand %mm7,%mm0 + movd 4(%ebp,%ecx,4),%mm1 + paddq %mm0,%mm3 + movd 4(%esi,%ecx,4),%mm0 + psrlq $32,%mm2 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm3 + paddq %mm6,%mm2 + decl %ebx + leal 1(%ecx),%ecx + jnz .L004inner + movl %ecx,%ebx + pmuludq %mm4,%mm0 + pmuludq %mm5,%mm1 + paddq %mm0,%mm2 + paddq %mm1,%mm3 + movq %mm2,%mm0 + pand %mm7,%mm0 + paddq %mm0,%mm3 + movd %mm3,28(%esp,%ecx,4) + psrlq $32,%mm2 + psrlq $32,%mm3 + movd 36(%esp,%ebx,4),%mm6 + paddq %mm2,%mm3 + paddq %mm6,%mm3 + movq %mm3,32(%esp,%ebx,4) + leal 1(%edx),%edx + cmpl %ebx,%edx + jle .L003outer + emms + jmp .L005common_tail +.align 16 +.L005common_tail: + movl 16(%esp),%ebp + movl 4(%esp),%edi + leal 
32(%esp),%esi + movl (%esi),%eax + movl %ebx,%ecx + xorl %edx,%edx +.align 16 +.L006sub: + sbbl (%ebp,%edx,4),%eax + movl %eax,(%edi,%edx,4) + decl %ecx + movl 4(%esi,%edx,4),%eax + leal 1(%edx),%edx + jge .L006sub + sbbl $0,%eax + movl $-1,%edx + xorl %eax,%edx + jmp .L007copy +.align 16 +.L007copy: + movl 32(%esp,%ebx,4),%esi + movl (%edi,%ebx,4),%ebp + movl %ecx,32(%esp,%ebx,4) + andl %eax,%esi + andl %edx,%ebp + orl %esi,%ebp + movl %ebp,(%edi,%ebx,4) + decl %ebx + jge .L007copy + movl 24(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size bn_mul_mont_words,.-.L_bn_mul_mont_words_begin +.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 +.byte 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56 +.byte 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 +.byte 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 +.byte 111,114,103,62,0 +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) diff --git a/third_party/boringssl/gen/bcm/x86-mont-win.asm b/third_party/boringssl/gen/bcm/x86-mont-win.asm new file mode 100644 index 00000000..47c7e85b --- /dev/null +++ b/third_party/boringssl/gen/bcm/x86-mont-win.asm @@ -0,0 +1,226 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_internal_x86_win_asm.inc" +%endif +%ifidn __OUTPUT_FORMAT__, win32 +%ifidn __OUTPUT_FORMAT__,obj +section code use32 class=code align=64 +%elifidn __OUTPUT_FORMAT__,win32 +$@feat.00 equ 1 +section .text code align=64 +%else +section .text code +%endif +global _bn_mul_mont_words +align 16 +_bn_mul_mont_words: +L$_bn_mul_mont_words_begin: + push ebp + push ebx + push esi + push edi + mov edi,DWORD [40+esp] + lea esi,[20+esp] + lea edx,[24+esp] + add edi,2 + neg edi + lea ebp,[edi*4+esp-32] + neg edi + mov eax,ebp + sub eax,edx + and eax,2047 + sub ebp,eax + xor edx,ebp + and edx,2048 + xor edx,2048 + sub ebp,edx + and ebp,-64 + mov eax,esp + sub eax,ebp + and eax,-4096 + mov edx,esp + lea esp,[eax*1+ebp] + mov eax,DWORD [esp] + cmp esp,ebp + ja NEAR L$000page_walk + jmp NEAR L$001page_walk_done +align 16 +L$000page_walk: + lea esp,[esp-4096] + mov eax,DWORD [esp] + cmp esp,ebp + ja NEAR L$000page_walk +L$001page_walk_done: + mov eax,DWORD [esi] + mov ebx,DWORD [4+esi] + mov ecx,DWORD [8+esi] + mov ebp,DWORD [12+esi] + mov esi,DWORD [16+esi] + mov esi,DWORD [esi] + mov DWORD [4+esp],eax + mov DWORD [8+esp],ebx + mov DWORD [12+esp],ecx + mov DWORD [16+esp],ebp + mov DWORD [20+esp],esi + lea ebx,[edi-3] + mov DWORD [24+esp],edx + mov eax,-1 + movd mm7,eax + mov esi,DWORD [8+esp] + mov edi,DWORD [12+esp] + mov ebp,DWORD [16+esp] + xor edx,edx + xor ecx,ecx + movd mm4,DWORD [edi] + movd mm5,DWORD [esi] + movd mm3,DWORD [ebp] + pmuludq mm5,mm4 + movq mm2,mm5 + movq mm0,mm5 + pand mm0,mm7 + pmuludq mm5,[20+esp] + pmuludq mm3,mm5 + paddq mm3,mm0 + movd mm1,DWORD [4+ebp] + movd mm0,DWORD [4+esi] + psrlq mm2,32 + psrlq mm3,32 + inc ecx +align 16 +L$0021st: + pmuludq mm0,mm4 + pmuludq mm1,mm5 + paddq mm2,mm0 + paddq mm3,mm1 + movq mm0,mm2 + pand mm0,mm7 + movd mm1,DWORD [4+ecx*4+ebp] + paddq mm3,mm0 + movd mm0,DWORD [4+ecx*4+esi] + psrlq mm2,32 + movd DWORD [28+ecx*4+esp],mm3 + psrlq mm3,32 + lea ecx,[1+ecx] + cmp 
ecx,ebx + jl NEAR L$0021st + pmuludq mm0,mm4 + pmuludq mm1,mm5 + paddq mm2,mm0 + paddq mm3,mm1 + movq mm0,mm2 + pand mm0,mm7 + paddq mm3,mm0 + movd DWORD [28+ecx*4+esp],mm3 + psrlq mm2,32 + psrlq mm3,32 + paddq mm3,mm2 + movq [32+ebx*4+esp],mm3 + inc edx +L$003outer: + xor ecx,ecx + movd mm4,DWORD [edx*4+edi] + movd mm5,DWORD [esi] + movd mm6,DWORD [32+esp] + movd mm3,DWORD [ebp] + pmuludq mm5,mm4 + paddq mm5,mm6 + movq mm0,mm5 + movq mm2,mm5 + pand mm0,mm7 + pmuludq mm5,[20+esp] + pmuludq mm3,mm5 + paddq mm3,mm0 + movd mm6,DWORD [36+esp] + movd mm1,DWORD [4+ebp] + movd mm0,DWORD [4+esi] + psrlq mm2,32 + psrlq mm3,32 + paddq mm2,mm6 + inc ecx + dec ebx +L$004inner: + pmuludq mm0,mm4 + pmuludq mm1,mm5 + paddq mm2,mm0 + paddq mm3,mm1 + movq mm0,mm2 + movd mm6,DWORD [36+ecx*4+esp] + pand mm0,mm7 + movd mm1,DWORD [4+ecx*4+ebp] + paddq mm3,mm0 + movd mm0,DWORD [4+ecx*4+esi] + psrlq mm2,32 + movd DWORD [28+ecx*4+esp],mm3 + psrlq mm3,32 + paddq mm2,mm6 + dec ebx + lea ecx,[1+ecx] + jnz NEAR L$004inner + mov ebx,ecx + pmuludq mm0,mm4 + pmuludq mm1,mm5 + paddq mm2,mm0 + paddq mm3,mm1 + movq mm0,mm2 + pand mm0,mm7 + paddq mm3,mm0 + movd DWORD [28+ecx*4+esp],mm3 + psrlq mm2,32 + psrlq mm3,32 + movd mm6,DWORD [36+ebx*4+esp] + paddq mm3,mm2 + paddq mm3,mm6 + movq [32+ebx*4+esp],mm3 + lea edx,[1+edx] + cmp edx,ebx + jle NEAR L$003outer + emms + jmp NEAR L$005common_tail +align 16 +L$005common_tail: + mov ebp,DWORD [16+esp] + mov edi,DWORD [4+esp] + lea esi,[32+esp] + mov eax,DWORD [esi] + mov ecx,ebx + xor edx,edx +align 16 +L$006sub: + sbb eax,DWORD [edx*4+ebp] + mov DWORD [edx*4+edi],eax + dec ecx + mov eax,DWORD [4+edx*4+esi] + lea edx,[1+edx] + jge NEAR L$006sub + sbb eax,0 + mov edx,-1 + xor edx,eax + jmp NEAR L$007copy +align 16 +L$007copy: + mov esi,DWORD [32+ebx*4+esp] + mov ebp,DWORD [ebx*4+edi] + mov DWORD [32+ebx*4+esp],ecx + and esi,eax + and ebp,edx + or ebp,esi + mov DWORD [ebx*4+edi],ebp + dec ebx + jge NEAR L$007copy + mov esp,DWORD [24+esp] + pop edi + pop esi + 
pop ebx + pop ebp + ret +db 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 +db 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56 +db 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 +db 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 +db 111,114,103,62,0 +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/third_party/boringssl/gen/bcm/x86_64-mont-apple.S b/third_party/boringssl/gen/bcm/x86_64-mont-apple.S new file mode 100644 index 00000000..27a168d7 --- /dev/null +++ b/third_party/boringssl/gen/bcm/x86_64-mont-apple.S @@ -0,0 +1,1235 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.text + +.globl _bn_mul_mont_nohw +.private_extern _bn_mul_mont_nohw + +.p2align 4 +_bn_mul_mont_nohw: + +_CET_ENDBR + movl %r9d,%r9d + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + + negq %r9 + movq %rsp,%r11 + leaq -16(%rsp,%r9,8),%r10 + negq %r9 + andq $-1024,%r10 + + + + + + + + + + subq %r10,%r11 + andq $-4096,%r11 + leaq (%r10,%r11,1),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja L$mul_page_walk + jmp L$mul_page_walk_done + +.p2align 4 +L$mul_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja L$mul_page_walk +L$mul_page_walk_done: + + movq %rax,8(%rsp,%r9,8) + +L$mul_body: + movq %rdx,%r12 + movq (%r8),%r8 + movq (%r12),%rbx + movq (%rsi),%rax + + xorq %r14,%r14 + xorq %r15,%r15 + + movq %r8,%rbp + mulq %rbx + movq %rax,%r10 + movq (%rcx),%rax + + imulq %r10,%rbp + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq %rdx,%r13 + + leaq 1(%r15),%r15 + jmp L$1st_enter + +.p2align 4 +L$1st: + addq %rax,%r13 + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%r13 + movq %r10,%r11 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) 
+ movq %rdx,%r13 + +L$1st_enter: + mulq %rbx + addq %rax,%r11 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + leaq 1(%r15),%r15 + movq %rdx,%r10 + + mulq %rbp + cmpq %r9,%r15 + jne L$1st + + addq %rax,%r13 + movq (%rsi),%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + movq %r10,%r11 + + xorq %rdx,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r9,8) + movq %rdx,(%rsp,%r9,8) + + leaq 1(%r14),%r14 + jmp L$outer +.p2align 4 +L$outer: + movq (%r12,%r14,8),%rbx + xorq %r15,%r15 + movq %r8,%rbp + movq (%rsp),%r10 + mulq %rbx + addq %rax,%r10 + movq (%rcx),%rax + adcq $0,%rdx + + imulq %r10,%rbp + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq 8(%rsp),%r10 + movq %rdx,%r13 + + leaq 1(%r15),%r15 + jmp L$inner_enter + +.p2align 4 +L$inner: + addq %rax,%r13 + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + movq (%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + +L$inner_enter: + mulq %rbx + addq %rax,%r11 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + addq %r11,%r10 + movq %rdx,%r11 + adcq $0,%r11 + leaq 1(%r15),%r15 + + mulq %rbp + cmpq %r9,%r15 + jne L$inner + + addq %rax,%r13 + movq (%rsi),%rax + adcq $0,%rdx + addq %r10,%r13 + movq (%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + + xorq %rdx,%rdx + addq %r11,%r13 + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r9,8) + movq %rdx,(%rsp,%r9,8) + + leaq 1(%r14),%r14 + cmpq %r9,%r14 + jb L$outer + + xorq %r14,%r14 + movq (%rsp),%rax + movq %r9,%r15 + +.p2align 4 +L$sub: sbbq (%rcx,%r14,8),%rax + movq %rax,(%rdi,%r14,8) + movq 8(%rsp,%r14,8),%rax + leaq 1(%r14),%r14 + decq %r15 + jnz L$sub + + sbbq $0,%rax + movq $-1,%rbx + xorq %rax,%rbx + xorq %r14,%r14 + movq %r9,%r15 + +L$copy: + movq (%rdi,%r14,8),%rcx + movq (%rsp,%r14,8),%rdx + andq %rbx,%rcx + andq %rax,%rdx + movq %r9,(%rsp,%r14,8) + orq %rcx,%rdx + movq %rdx,(%rdi,%r14,8) + leaq 
1(%r14),%r14 + subq $1,%r15 + jnz L$copy + + movq 8(%rsp,%r9,8),%rsi + + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$mul_epilogue: + ret + + +.globl _bn_mul4x_mont +.private_extern _bn_mul4x_mont + +.p2align 4 +_bn_mul4x_mont: + +_CET_ENDBR + movl %r9d,%r9d + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + + negq %r9 + movq %rsp,%r11 + leaq -32(%rsp,%r9,8),%r10 + negq %r9 + andq $-1024,%r10 + + subq %r10,%r11 + andq $-4096,%r11 + leaq (%r10,%r11,1),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja L$mul4x_page_walk + jmp L$mul4x_page_walk_done + +L$mul4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja L$mul4x_page_walk +L$mul4x_page_walk_done: + + movq %rax,8(%rsp,%r9,8) + +L$mul4x_body: + movq %rdi,16(%rsp,%r9,8) + movq %rdx,%r12 + movq (%r8),%r8 + movq (%r12),%rbx + movq (%rsi),%rax + + xorq %r14,%r14 + xorq %r15,%r15 + + movq %r8,%rbp + mulq %rbx + movq %rax,%r10 + movq (%rcx),%rax + + imulq %r10,%rbp + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 4(%r15),%r15 + adcq $0,%rdx + movq %rdi,(%rsp) + movq %rdx,%r13 + jmp L$1st4x +.p2align 4 +L$1st4x: + mulq %rbx + addq %rax,%r10 + movq -16(%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%rsp,%r15,8) + movq %rdx,%r13 + + mulq %rbx + 
addq %rax,%r10 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq 8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx,%r15,8),%rax + adcq $0,%rdx + leaq 4(%r15),%r15 + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq -16(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-32(%rsp,%r15,8) + movq %rdx,%r13 + cmpq %r9,%r15 + jb L$1st4x + + mulq %rbx + addq %rax,%r10 + movq -16(%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%rsp,%r15,8) + movq %rdx,%r13 + + xorq %rdi,%rdi + addq %r10,%r13 + adcq $0,%rdi + movq %r13,-8(%rsp,%r15,8) + movq %rdi,(%rsp,%r15,8) + + leaq 1(%r14),%r14 +.p2align 2 +L$outer4x: + movq (%r12,%r14,8),%rbx + xorq %r15,%r15 + movq (%rsp),%r10 + movq %r8,%rbp + mulq %rbx + addq %rax,%r10 + movq (%rcx),%rax + adcq $0,%rdx + + imulq %r10,%rbp + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx),%rax + adcq $0,%rdx + addq 8(%rsp),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 4(%r15),%r15 + adcq $0,%rdx + movq %rdi,(%rsp) + movq %rdx,%r13 + jmp L$inner4x +.p2align 4 +L$inner4x: + mulq %rbx + addq %rax,%r10 + movq -16(%rcx,%r15,8),%rax + adcq $0,%rdx + addq -16(%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%rsp,%r15,8) + movq 
%rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx,%r15,8),%rax + adcq $0,%rdx + addq -8(%rsp,%r15,8),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%rsp,%r15,8) + movq %rdx,%r13 + + mulq %rbx + addq %rax,%r10 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + addq (%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq 8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx,%r15,8),%rax + adcq $0,%rdx + addq 8(%rsp,%r15,8),%r11 + adcq $0,%rdx + leaq 4(%r15),%r15 + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq -16(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-32(%rsp,%r15,8) + movq %rdx,%r13 + cmpq %r9,%r15 + jb L$inner4x + + mulq %rbx + addq %rax,%r10 + movq -16(%rcx,%r15,8),%rax + adcq $0,%rdx + addq -16(%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx,%r15,8),%rax + adcq $0,%rdx + addq -8(%rsp,%r15,8),%r11 + adcq $0,%rdx + leaq 1(%r14),%r14 + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%rsp,%r15,8) + movq %rdx,%r13 + + xorq %rdi,%rdi + addq %r10,%r13 + adcq $0,%rdi + addq (%rsp,%r9,8),%r13 + adcq $0,%rdi + movq %r13,-8(%rsp,%r15,8) + movq %rdi,(%rsp,%r15,8) + + cmpq %r9,%r14 + jb L$outer4x + movq 16(%rsp,%r9,8),%rdi + leaq -4(%r9),%r15 + movq 0(%rsp),%rax + movq 8(%rsp),%rdx + shrq $2,%r15 + leaq (%rsp),%rsi + xorq %r14,%r14 + + subq 0(%rcx),%rax + movq 16(%rsi),%rbx + movq 24(%rsi),%rbp + sbbq 8(%rcx),%rdx + +L$sub4x: + movq %rax,0(%rdi,%r14,8) + movq %rdx,8(%rdi,%r14,8) + sbbq 16(%rcx,%r14,8),%rbx + movq 
32(%rsi,%r14,8),%rax + movq 40(%rsi,%r14,8),%rdx + sbbq 24(%rcx,%r14,8),%rbp + movq %rbx,16(%rdi,%r14,8) + movq %rbp,24(%rdi,%r14,8) + sbbq 32(%rcx,%r14,8),%rax + movq 48(%rsi,%r14,8),%rbx + movq 56(%rsi,%r14,8),%rbp + sbbq 40(%rcx,%r14,8),%rdx + leaq 4(%r14),%r14 + decq %r15 + jnz L$sub4x + + movq %rax,0(%rdi,%r14,8) + movq 32(%rsi,%r14,8),%rax + sbbq 16(%rcx,%r14,8),%rbx + movq %rdx,8(%rdi,%r14,8) + sbbq 24(%rcx,%r14,8),%rbp + movq %rbx,16(%rdi,%r14,8) + + sbbq $0,%rax + movq %rbp,24(%rdi,%r14,8) + pxor %xmm0,%xmm0 + movq %rax,%xmm4 + pcmpeqd %xmm5,%xmm5 + pshufd $0,%xmm4,%xmm4 + movq %r9,%r15 + pxor %xmm4,%xmm5 + shrq $2,%r15 + xorl %eax,%eax + + jmp L$copy4x +.p2align 4 +L$copy4x: + movdqa (%rsp,%rax,1),%xmm1 + movdqu (%rdi,%rax,1),%xmm2 + pand %xmm4,%xmm1 + pand %xmm5,%xmm2 + movdqa 16(%rsp,%rax,1),%xmm3 + movdqa %xmm0,(%rsp,%rax,1) + por %xmm2,%xmm1 + movdqu 16(%rdi,%rax,1),%xmm2 + movdqu %xmm1,(%rdi,%rax,1) + pand %xmm4,%xmm3 + pand %xmm5,%xmm2 + movdqa %xmm0,16(%rsp,%rax,1) + por %xmm2,%xmm3 + movdqu %xmm3,16(%rdi,%rax,1) + leaq 32(%rax),%rax + decq %r15 + jnz L$copy4x + movq 8(%rsp,%r9,8),%rsi + + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$mul4x_epilogue: + ret + + + + + +.globl _bn_sqr8x_mont +.private_extern _bn_sqr8x_mont + +.p2align 5 +_bn_sqr8x_mont: + +_CET_ENDBR + movl %r9d,%r9d + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$sqr8x_prologue: + + movl %r9d,%r10d + shll $3,%r9d + shlq $3+2,%r10 + negq %r9 + + + + + + + leaq -64(%rsp,%r9,2),%r11 + movq %rsp,%rbp + movq (%r8),%r8 + subq %rsi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb L$sqr8x_sp_alt + subq %r11,%rbp + leaq -64(%rbp,%r9,2),%rbp + jmp L$sqr8x_sp_done + +.p2align 5 +L$sqr8x_sp_alt: + leaq 4096-64(,%r9,2),%r10 + leaq -64(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp 
+L$sqr8x_sp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$sqr8x_page_walk + jmp L$sqr8x_page_walk_done + +.p2align 4 +L$sqr8x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$sqr8x_page_walk +L$sqr8x_page_walk_done: + + movq %r9,%r10 + negq %r9 + + movq %r8,32(%rsp) + movq %rax,40(%rsp) + +L$sqr8x_body: + + movq %rcx,%xmm2 + pxor %xmm0,%xmm0 + movq %rdi,%xmm1 + movq %r10,%xmm3 + testq %rdx,%rdx + jz L$sqr8x_nox + + call _bn_sqrx8x_internal + + + + + leaq (%r8,%rcx,1),%rbx + movq %rcx,%r9 + movq %rcx,%rdx + movq %xmm1,%rdi + sarq $3+2,%rcx + jmp L$sqr8x_sub + +.p2align 5 +L$sqr8x_nox: + call _bn_sqr8x_internal + + + + + leaq (%rdi,%r9,1),%rbx + movq %r9,%rcx + movq %r9,%rdx + movq %xmm1,%rdi + sarq $3+2,%rcx + jmp L$sqr8x_sub + +.p2align 5 +L$sqr8x_sub: + movq 0(%rbx),%r12 + movq 8(%rbx),%r13 + movq 16(%rbx),%r14 + movq 24(%rbx),%r15 + leaq 32(%rbx),%rbx + sbbq 0(%rbp),%r12 + sbbq 8(%rbp),%r13 + sbbq 16(%rbp),%r14 + sbbq 24(%rbp),%r15 + leaq 32(%rbp),%rbp + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r14,16(%rdi) + movq %r15,24(%rdi) + leaq 32(%rdi),%rdi + incq %rcx + jnz L$sqr8x_sub + + sbbq $0,%rax + leaq (%rbx,%r9,1),%rbx + leaq (%rdi,%r9,1),%rdi + + movq %rax,%xmm1 + pxor %xmm0,%xmm0 + pshufd $0,%xmm1,%xmm1 + movq 40(%rsp),%rsi + + jmp L$sqr8x_cond_copy + +.p2align 5 +L$sqr8x_cond_copy: + movdqa 0(%rbx),%xmm2 + movdqa 16(%rbx),%xmm3 + leaq 32(%rbx),%rbx + movdqu 0(%rdi),%xmm4 + movdqu 16(%rdi),%xmm5 + leaq 32(%rdi),%rdi + movdqa %xmm0,-32(%rbx) + movdqa %xmm0,-16(%rbx) + movdqa %xmm0,-32(%rbx,%rdx,1) + movdqa %xmm0,-16(%rbx,%rdx,1) + pcmpeqd %xmm1,%xmm0 + pand %xmm1,%xmm2 + pand %xmm1,%xmm3 + pand %xmm0,%xmm4 + pand %xmm0,%xmm5 + pxor %xmm0,%xmm0 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqu %xmm4,-32(%rdi) + movdqu %xmm5,-16(%rdi) + addq $32,%r9 + jnz L$sqr8x_cond_copy + + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq 
-32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$sqr8x_epilogue: + ret + + +.globl _bn_mulx4x_mont +.private_extern _bn_mulx4x_mont + +.p2align 5 +_bn_mulx4x_mont: + +_CET_ENDBR + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$mulx4x_prologue: + + shll $3,%r9d + xorq %r10,%r10 + subq %r9,%r10 + movq (%r8),%r8 + leaq -72(%rsp,%r10,1),%rbp + andq $-128,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$mulx4x_page_walk + jmp L$mulx4x_page_walk_done + +.p2align 4 +L$mulx4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$mulx4x_page_walk +L$mulx4x_page_walk_done: + + leaq (%rdx,%r9,1),%r10 + + + + + + + + + + + + + movq %r9,0(%rsp) + shrq $5,%r9 + movq %r10,16(%rsp) + subq $1,%r9 + movq %r8,24(%rsp) + movq %rdi,32(%rsp) + movq %rax,40(%rsp) + + movq %r9,48(%rsp) + jmp L$mulx4x_body + +.p2align 5 +L$mulx4x_body: + leaq 8(%rdx),%rdi + movq (%rdx),%rdx + leaq 64+32(%rsp),%rbx + movq %rdx,%r9 + + mulxq 0(%rsi),%r8,%rax + mulxq 8(%rsi),%r11,%r14 + addq %rax,%r11 + movq %rdi,8(%rsp) + mulxq 16(%rsi),%r12,%r13 + adcq %r14,%r12 + adcq $0,%r13 + + movq %r8,%rdi + imulq 24(%rsp),%r8 + xorq %rbp,%rbp + + mulxq 24(%rsi),%rax,%r14 + movq %r8,%rdx + leaq 32(%rsi),%rsi + adcxq %rax,%r13 + adcxq %rbp,%r14 + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%rdi + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 +.byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 + movq 48(%rsp),%rdi + movq %r10,-32(%rbx) + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-24(%rbx) + adcxq %rax,%r12 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r12,-16(%rbx) + + jmp L$mulx4x_1st + +.p2align 5 +L$mulx4x_1st: + adcxq %rbp,%r15 + mulxq 0(%rsi),%r10,%rax + adcxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq %rax,%r11 + 
mulxq 16(%rsi),%r12,%rax + adcxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 +.byte 0x67,0x67 + movq %r8,%rdx + adcxq %rax,%r13 + adcxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + movq %r11,-32(%rbx) + adoxq %r15,%r13 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r13,-16(%rbx) + + decq %rdi + jnz L$mulx4x_1st + + movq 0(%rsp),%rax + movq 8(%rsp),%rdi + adcq %rbp,%r15 + addq %r15,%r14 + sbbq %r15,%r15 + movq %r14,-8(%rbx) + jmp L$mulx4x_outer + +.p2align 5 +L$mulx4x_outer: + movq (%rdi),%rdx + leaq 8(%rdi),%rdi + subq %rax,%rsi + movq %r15,(%rbx) + leaq 64+32(%rsp),%rbx + subq %rax,%rcx + + mulxq 0(%rsi),%r8,%r11 + xorl %ebp,%ebp + movq %rdx,%r9 + mulxq 8(%rsi),%r14,%r12 + adoxq -32(%rbx),%r8 + adcxq %r14,%r11 + mulxq 16(%rsi),%r15,%r13 + adoxq -24(%rbx),%r11 + adcxq %r15,%r12 + adoxq -16(%rbx),%r12 + adcxq %rbp,%r13 + adoxq %rbp,%r13 + + movq %rdi,8(%rsp) + movq %r8,%r15 + imulq 24(%rsp),%r8 + xorl %ebp,%ebp + + mulxq 24(%rsi),%rax,%r14 + movq %r8,%rdx + adcxq %rax,%r13 + adoxq -8(%rbx),%r13 + adcxq %rbp,%r14 + leaq 32(%rsi),%rsi + adoxq %rbp,%r14 + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%r15 + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + mulxq 16(%rcx),%rax,%r12 + movq %r10,-32(%rbx) + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-24(%rbx) + leaq 32(%rcx),%rcx + adcxq %rax,%r12 + adoxq %rbp,%r15 + movq 48(%rsp),%rdi + movq %r12,-16(%rbx) + + jmp L$mulx4x_inner + +.p2align 5 +L$mulx4x_inner: + mulxq 0(%rsi),%r10,%rax + adcxq %rbp,%r15 + adoxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq 0(%rbx),%r10 + adoxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq 8(%rbx),%r11 + adoxq %r14,%r12 + mulxq 
24(%rsi),%r13,%r14 + movq %r8,%rdx + adcxq 16(%rbx),%r12 + adoxq %rax,%r13 + adcxq 24(%rbx),%r13 + adoxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + adcxq %rbp,%r14 + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + adoxq %r15,%r13 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-32(%rbx) + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r13,-16(%rbx) + + decq %rdi + jnz L$mulx4x_inner + + movq 0(%rsp),%rax + movq 8(%rsp),%rdi + adcq %rbp,%r15 + subq 0(%rbx),%rbp + adcq %r15,%r14 + sbbq %r15,%r15 + movq %r14,-8(%rbx) + + cmpq 16(%rsp),%rdi + jne L$mulx4x_outer + + leaq 64(%rsp),%rbx + subq %rax,%rcx + negq %r15 + movq %rax,%rdx + shrq $3+2,%rax + movq 32(%rsp),%rdi + jmp L$mulx4x_sub + +.p2align 5 +L$mulx4x_sub: + movq 0(%rbx),%r11 + movq 8(%rbx),%r12 + movq 16(%rbx),%r13 + movq 24(%rbx),%r14 + leaq 32(%rbx),%rbx + sbbq 0(%rcx),%r11 + sbbq 8(%rcx),%r12 + sbbq 16(%rcx),%r13 + sbbq 24(%rcx),%r14 + leaq 32(%rcx),%rcx + movq %r11,0(%rdi) + movq %r12,8(%rdi) + movq %r13,16(%rdi) + movq %r14,24(%rdi) + leaq 32(%rdi),%rdi + decq %rax + jnz L$mulx4x_sub + + sbbq $0,%r15 + leaq 64(%rsp),%rbx + subq %rdx,%rdi + + movq %r15,%xmm1 + pxor %xmm0,%xmm0 + pshufd $0,%xmm1,%xmm1 + movq 40(%rsp),%rsi + + jmp L$mulx4x_cond_copy + +.p2align 5 +L$mulx4x_cond_copy: + movdqa 0(%rbx),%xmm2 + movdqa 16(%rbx),%xmm3 + leaq 32(%rbx),%rbx + movdqu 0(%rdi),%xmm4 + movdqu 16(%rdi),%xmm5 + leaq 32(%rdi),%rdi + movdqa %xmm0,-32(%rbx) + movdqa %xmm0,-16(%rbx) + pcmpeqd %xmm1,%xmm0 + pand %xmm1,%xmm2 + pand %xmm1,%xmm3 + pand %xmm0,%xmm4 + pand %xmm0,%xmm5 + pxor %xmm0,%xmm0 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqu %xmm4,-32(%rdi) + movdqu %xmm5,-16(%rdi) + subq $32,%rdx + jnz L$mulx4x_cond_copy + + movq %rdx,(%rbx) + + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq 
-32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$mulx4x_epilogue: + ret + + +.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.p2align 4 +#endif diff --git a/third_party/boringssl/gen/bcm/x86_64-mont-linux.S b/third_party/boringssl/gen/bcm/x86_64-mont-linux.S new file mode 100644 index 00000000..51c4b6cf --- /dev/null +++ b/third_party/boringssl/gen/bcm/x86_64-mont-linux.S @@ -0,0 +1,1237 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.text + +.globl bn_mul_mont_nohw +.hidden bn_mul_mont_nohw +.type bn_mul_mont_nohw,@function +.align 16 +bn_mul_mont_nohw: +.cfi_startproc +_CET_ENDBR + movl %r9d,%r9d + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + + negq %r9 + movq %rsp,%r11 + leaq -16(%rsp,%r9,8),%r10 + negq %r9 + andq $-1024,%r10 + + + + + + + + + + subq %r10,%r11 + andq $-4096,%r11 + leaq (%r10,%r11,1),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja .Lmul_page_walk + jmp .Lmul_page_walk_done + +.align 16 +.Lmul_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja .Lmul_page_walk +.Lmul_page_walk_done: + + movq %rax,8(%rsp,%r9,8) +.cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08 +.Lmul_body: + movq %rdx,%r12 + movq (%r8),%r8 + movq (%r12),%rbx + movq (%rsi),%rax + + xorq %r14,%r14 + xorq %r15,%r15 + + movq %r8,%rbp + mulq %rbx + movq %rax,%r10 + movq (%rcx),%rax + + imulq %r10,%rbp + 
movq %rdx,%r11 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq %rdx,%r13 + + leaq 1(%r15),%r15 + jmp .L1st_enter + +.align 16 +.L1st: + addq %rax,%r13 + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%r13 + movq %r10,%r11 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + +.L1st_enter: + mulq %rbx + addq %rax,%r11 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + leaq 1(%r15),%r15 + movq %rdx,%r10 + + mulq %rbp + cmpq %r9,%r15 + jne .L1st + + addq %rax,%r13 + movq (%rsi),%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + movq %r10,%r11 + + xorq %rdx,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r9,8) + movq %rdx,(%rsp,%r9,8) + + leaq 1(%r14),%r14 + jmp .Louter +.align 16 +.Louter: + movq (%r12,%r14,8),%rbx + xorq %r15,%r15 + movq %r8,%rbp + movq (%rsp),%r10 + mulq %rbx + addq %rax,%r10 + movq (%rcx),%rax + adcq $0,%rdx + + imulq %r10,%rbp + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq 8(%rsp),%r10 + movq %rdx,%r13 + + leaq 1(%r15),%r15 + jmp .Linner_enter + +.align 16 +.Linner: + addq %rax,%r13 + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + movq (%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + +.Linner_enter: + mulq %rbx + addq %rax,%r11 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + addq %r11,%r10 + movq %rdx,%r11 + adcq $0,%r11 + leaq 1(%r15),%r15 + + mulq %rbp + cmpq %r9,%r15 + jne .Linner + + addq %rax,%r13 + movq (%rsi),%rax + adcq $0,%rdx + addq %r10,%r13 + movq (%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + + xorq %rdx,%rdx + addq %r11,%r13 + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r9,8) + movq %rdx,(%rsp,%r9,8) + + leaq 1(%r14),%r14 + cmpq %r9,%r14 + jb .Louter + + xorq %r14,%r14 + movq (%rsp),%rax + movq %r9,%r15 + +.align 16 +.Lsub: sbbq (%rcx,%r14,8),%rax + movq %rax,(%rdi,%r14,8) + movq 8(%rsp,%r14,8),%rax + leaq 
1(%r14),%r14 + decq %r15 + jnz .Lsub + + sbbq $0,%rax + movq $-1,%rbx + xorq %rax,%rbx + xorq %r14,%r14 + movq %r9,%r15 + +.Lcopy: + movq (%rdi,%r14,8),%rcx + movq (%rsp,%r14,8),%rdx + andq %rbx,%rcx + andq %rax,%rdx + movq %r9,(%rsp,%r14,8) + orq %rcx,%rdx + movq %rdx,(%rdi,%r14,8) + leaq 1(%r14),%r14 + subq $1,%r15 + jnz .Lcopy + + movq 8(%rsp,%r9,8),%rsi +.cfi_def_cfa %rsi,8 + + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lmul_epilogue: + ret +.cfi_endproc +.size bn_mul_mont_nohw,.-bn_mul_mont_nohw +.globl bn_mul4x_mont +.hidden bn_mul4x_mont +.type bn_mul4x_mont,@function +.align 16 +bn_mul4x_mont: +.cfi_startproc +_CET_ENDBR + movl %r9d,%r9d + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + + negq %r9 + movq %rsp,%r11 + leaq -32(%rsp,%r9,8),%r10 + negq %r9 + andq $-1024,%r10 + + subq %r10,%r11 + andq $-4096,%r11 + leaq (%r10,%r11,1),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja .Lmul4x_page_walk + jmp .Lmul4x_page_walk_done + +.Lmul4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja .Lmul4x_page_walk +.Lmul4x_page_walk_done: + + movq %rax,8(%rsp,%r9,8) +.cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08 +.Lmul4x_body: + movq %rdi,16(%rsp,%r9,8) + movq %rdx,%r12 + movq (%r8),%r8 + movq (%r12),%rbx + movq (%rsi),%rax + + xorq %r14,%r14 + xorq %r15,%r15 + + movq %r8,%rbp + mulq %rbx + movq %rax,%r10 + movq (%rcx),%rax + + imulq %r10,%rbp + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + 
movq 8(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 4(%r15),%r15 + adcq $0,%rdx + movq %rdi,(%rsp) + movq %rdx,%r13 + jmp .L1st4x +.align 16 +.L1st4x: + mulq %rbx + addq %rax,%r10 + movq -16(%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%rsp,%r15,8) + movq %rdx,%r13 + + mulq %rbx + addq %rax,%r10 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq 8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx,%r15,8),%rax + adcq $0,%rdx + leaq 4(%r15),%r15 + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq -16(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-32(%rsp,%r15,8) + movq %rdx,%r13 + cmpq %r9,%r15 + jb .L1st4x + + mulq %rbx + addq %rax,%r10 + movq -16(%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx,%r15,8),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%rsp,%r15,8) + movq %rdx,%r13 + + xorq %rdi,%rdi + addq %r10,%r13 + adcq $0,%rdi + movq %r13,-8(%rsp,%r15,8) + movq %rdi,(%rsp,%r15,8) + + leaq 1(%r14),%r14 +.align 4 +.Louter4x: + movq (%r12,%r14,8),%rbx + xorq %r15,%r15 + movq (%rsp),%r10 + movq %r8,%rbp + mulq %rbx + addq %rax,%r10 + movq 
(%rcx),%rax + adcq $0,%rdx + + imulq %r10,%rbp + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx),%rax + adcq $0,%rdx + addq 8(%rsp),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 4(%r15),%r15 + adcq $0,%rdx + movq %rdi,(%rsp) + movq %rdx,%r13 + jmp .Linner4x +.align 16 +.Linner4x: + mulq %rbx + addq %rax,%r10 + movq -16(%rcx,%r15,8),%rax + adcq $0,%rdx + addq -16(%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx,%r15,8),%rax + adcq $0,%rdx + addq -8(%rsp,%r15,8),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%rsp,%r15,8) + movq %rdx,%r13 + + mulq %rbx + addq %rax,%r10 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + addq (%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq 8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx,%r15,8),%rax + adcq $0,%rdx + addq 8(%rsp,%r15,8),%r11 + adcq $0,%rdx + leaq 4(%r15),%r15 + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq -16(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-32(%rsp,%r15,8) + movq %rdx,%r13 + cmpq %r9,%r15 + jb .Linner4x + + mulq %rbx + addq %rax,%r10 + movq -16(%rcx,%r15,8),%rax + adcq $0,%rdx + addq -16(%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%rsp,%r15,8) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx,%r15,8),%rax + adcq 
$0,%rdx + addq -8(%rsp,%r15,8),%r11 + adcq $0,%rdx + leaq 1(%r14),%r14 + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%rsp,%r15,8) + movq %rdx,%r13 + + xorq %rdi,%rdi + addq %r10,%r13 + adcq $0,%rdi + addq (%rsp,%r9,8),%r13 + adcq $0,%rdi + movq %r13,-8(%rsp,%r15,8) + movq %rdi,(%rsp,%r15,8) + + cmpq %r9,%r14 + jb .Louter4x + movq 16(%rsp,%r9,8),%rdi + leaq -4(%r9),%r15 + movq 0(%rsp),%rax + movq 8(%rsp),%rdx + shrq $2,%r15 + leaq (%rsp),%rsi + xorq %r14,%r14 + + subq 0(%rcx),%rax + movq 16(%rsi),%rbx + movq 24(%rsi),%rbp + sbbq 8(%rcx),%rdx + +.Lsub4x: + movq %rax,0(%rdi,%r14,8) + movq %rdx,8(%rdi,%r14,8) + sbbq 16(%rcx,%r14,8),%rbx + movq 32(%rsi,%r14,8),%rax + movq 40(%rsi,%r14,8),%rdx + sbbq 24(%rcx,%r14,8),%rbp + movq %rbx,16(%rdi,%r14,8) + movq %rbp,24(%rdi,%r14,8) + sbbq 32(%rcx,%r14,8),%rax + movq 48(%rsi,%r14,8),%rbx + movq 56(%rsi,%r14,8),%rbp + sbbq 40(%rcx,%r14,8),%rdx + leaq 4(%r14),%r14 + decq %r15 + jnz .Lsub4x + + movq %rax,0(%rdi,%r14,8) + movq 32(%rsi,%r14,8),%rax + sbbq 16(%rcx,%r14,8),%rbx + movq %rdx,8(%rdi,%r14,8) + sbbq 24(%rcx,%r14,8),%rbp + movq %rbx,16(%rdi,%r14,8) + + sbbq $0,%rax + movq %rbp,24(%rdi,%r14,8) + pxor %xmm0,%xmm0 + movq %rax,%xmm4 + pcmpeqd %xmm5,%xmm5 + pshufd $0,%xmm4,%xmm4 + movq %r9,%r15 + pxor %xmm4,%xmm5 + shrq $2,%r15 + xorl %eax,%eax + + jmp .Lcopy4x +.align 16 +.Lcopy4x: + movdqa (%rsp,%rax,1),%xmm1 + movdqu (%rdi,%rax,1),%xmm2 + pand %xmm4,%xmm1 + pand %xmm5,%xmm2 + movdqa 16(%rsp,%rax,1),%xmm3 + movdqa %xmm0,(%rsp,%rax,1) + por %xmm2,%xmm1 + movdqu 16(%rdi,%rax,1),%xmm2 + movdqu %xmm1,(%rdi,%rax,1) + pand %xmm4,%xmm3 + pand %xmm5,%xmm2 + movdqa %xmm0,16(%rsp,%rax,1) + por %xmm2,%xmm3 + movdqu %xmm3,16(%rdi,%rax,1) + leaq 32(%rax),%rax + decq %r15 + jnz .Lcopy4x + movq 8(%rsp,%r9,8),%rsi +.cfi_def_cfa %rsi, 8 + + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + 
movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lmul4x_epilogue: + ret +.cfi_endproc +.size bn_mul4x_mont,.-bn_mul4x_mont +.extern bn_sqrx8x_internal +.hidden bn_sqrx8x_internal +.extern bn_sqr8x_internal +.hidden bn_sqr8x_internal + +.globl bn_sqr8x_mont +.hidden bn_sqr8x_mont +.type bn_sqr8x_mont,@function +.align 32 +bn_sqr8x_mont: +.cfi_startproc +_CET_ENDBR + movl %r9d,%r9d + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 +.Lsqr8x_prologue: + + movl %r9d,%r10d + shll $3,%r9d + shlq $3+2,%r10 + negq %r9 + + + + + + + leaq -64(%rsp,%r9,2),%r11 + movq %rsp,%rbp + movq (%r8),%r8 + subq %rsi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb .Lsqr8x_sp_alt + subq %r11,%rbp + leaq -64(%rbp,%r9,2),%rbp + jmp .Lsqr8x_sp_done + +.align 32 +.Lsqr8x_sp_alt: + leaq 4096-64(,%r9,2),%r10 + leaq -64(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp +.Lsqr8x_sp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lsqr8x_page_walk + jmp .Lsqr8x_page_walk_done + +.align 16 +.Lsqr8x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lsqr8x_page_walk +.Lsqr8x_page_walk_done: + + movq %r9,%r10 + negq %r9 + + movq %r8,32(%rsp) + movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 +.Lsqr8x_body: + + movq %rcx,%xmm2 + pxor %xmm0,%xmm0 + movq %rdi,%xmm1 + movq %r10,%xmm3 + testq %rdx,%rdx + jz .Lsqr8x_nox + + call bn_sqrx8x_internal + + + + + leaq (%r8,%rcx,1),%rbx + movq %rcx,%r9 + movq %rcx,%rdx + movq %xmm1,%rdi + sarq $3+2,%rcx + jmp .Lsqr8x_sub + +.align 32 +.Lsqr8x_nox: + call bn_sqr8x_internal + + + 
+ + leaq (%rdi,%r9,1),%rbx + movq %r9,%rcx + movq %r9,%rdx + movq %xmm1,%rdi + sarq $3+2,%rcx + jmp .Lsqr8x_sub + +.align 32 +.Lsqr8x_sub: + movq 0(%rbx),%r12 + movq 8(%rbx),%r13 + movq 16(%rbx),%r14 + movq 24(%rbx),%r15 + leaq 32(%rbx),%rbx + sbbq 0(%rbp),%r12 + sbbq 8(%rbp),%r13 + sbbq 16(%rbp),%r14 + sbbq 24(%rbp),%r15 + leaq 32(%rbp),%rbp + movq %r12,0(%rdi) + movq %r13,8(%rdi) + movq %r14,16(%rdi) + movq %r15,24(%rdi) + leaq 32(%rdi),%rdi + incq %rcx + jnz .Lsqr8x_sub + + sbbq $0,%rax + leaq (%rbx,%r9,1),%rbx + leaq (%rdi,%r9,1),%rdi + + movq %rax,%xmm1 + pxor %xmm0,%xmm0 + pshufd $0,%xmm1,%xmm1 + movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 + jmp .Lsqr8x_cond_copy + +.align 32 +.Lsqr8x_cond_copy: + movdqa 0(%rbx),%xmm2 + movdqa 16(%rbx),%xmm3 + leaq 32(%rbx),%rbx + movdqu 0(%rdi),%xmm4 + movdqu 16(%rdi),%xmm5 + leaq 32(%rdi),%rdi + movdqa %xmm0,-32(%rbx) + movdqa %xmm0,-16(%rbx) + movdqa %xmm0,-32(%rbx,%rdx,1) + movdqa %xmm0,-16(%rbx,%rdx,1) + pcmpeqd %xmm1,%xmm0 + pand %xmm1,%xmm2 + pand %xmm1,%xmm3 + pand %xmm0,%xmm4 + pand %xmm0,%xmm5 + pxor %xmm0,%xmm0 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqu %xmm4,-32(%rdi) + movdqu %xmm5,-16(%rdi) + addq $32,%r9 + jnz .Lsqr8x_cond_copy + + + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lsqr8x_epilogue: + ret +.cfi_endproc +.size bn_sqr8x_mont,.-bn_sqr8x_mont +.globl bn_mulx4x_mont +.hidden bn_mulx4x_mont +.type bn_mulx4x_mont,@function +.align 32 +bn_mulx4x_mont: +.cfi_startproc +_CET_ENDBR + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 +.Lmulx4x_prologue: + + shll $3,%r9d + 
xorq %r10,%r10 + subq %r9,%r10 + movq (%r8),%r8 + leaq -72(%rsp,%r10,1),%rbp + andq $-128,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lmulx4x_page_walk + jmp .Lmulx4x_page_walk_done + +.align 16 +.Lmulx4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lmulx4x_page_walk +.Lmulx4x_page_walk_done: + + leaq (%rdx,%r9,1),%r10 + + + + + + + + + + + + + movq %r9,0(%rsp) + shrq $5,%r9 + movq %r10,16(%rsp) + subq $1,%r9 + movq %r8,24(%rsp) + movq %rdi,32(%rsp) + movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 + movq %r9,48(%rsp) + jmp .Lmulx4x_body + +.align 32 +.Lmulx4x_body: + leaq 8(%rdx),%rdi + movq (%rdx),%rdx + leaq 64+32(%rsp),%rbx + movq %rdx,%r9 + + mulxq 0(%rsi),%r8,%rax + mulxq 8(%rsi),%r11,%r14 + addq %rax,%r11 + movq %rdi,8(%rsp) + mulxq 16(%rsi),%r12,%r13 + adcq %r14,%r12 + adcq $0,%r13 + + movq %r8,%rdi + imulq 24(%rsp),%r8 + xorq %rbp,%rbp + + mulxq 24(%rsi),%rax,%r14 + movq %r8,%rdx + leaq 32(%rsi),%rsi + adcxq %rax,%r13 + adcxq %rbp,%r14 + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%rdi + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 +.byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 + movq 48(%rsp),%rdi + movq %r10,-32(%rbx) + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-24(%rbx) + adcxq %rax,%r12 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r12,-16(%rbx) + + jmp .Lmulx4x_1st + +.align 32 +.Lmulx4x_1st: + adcxq %rbp,%r15 + mulxq 0(%rsi),%r10,%rax + adcxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 +.byte 0x67,0x67 + movq %r8,%rdx + adcxq %rax,%r13 + adcxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 
16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + movq %r11,-32(%rbx) + adoxq %r15,%r13 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r13,-16(%rbx) + + decq %rdi + jnz .Lmulx4x_1st + + movq 0(%rsp),%rax + movq 8(%rsp),%rdi + adcq %rbp,%r15 + addq %r15,%r14 + sbbq %r15,%r15 + movq %r14,-8(%rbx) + jmp .Lmulx4x_outer + +.align 32 +.Lmulx4x_outer: + movq (%rdi),%rdx + leaq 8(%rdi),%rdi + subq %rax,%rsi + movq %r15,(%rbx) + leaq 64+32(%rsp),%rbx + subq %rax,%rcx + + mulxq 0(%rsi),%r8,%r11 + xorl %ebp,%ebp + movq %rdx,%r9 + mulxq 8(%rsi),%r14,%r12 + adoxq -32(%rbx),%r8 + adcxq %r14,%r11 + mulxq 16(%rsi),%r15,%r13 + adoxq -24(%rbx),%r11 + adcxq %r15,%r12 + adoxq -16(%rbx),%r12 + adcxq %rbp,%r13 + adoxq %rbp,%r13 + + movq %rdi,8(%rsp) + movq %r8,%r15 + imulq 24(%rsp),%r8 + xorl %ebp,%ebp + + mulxq 24(%rsi),%rax,%r14 + movq %r8,%rdx + adcxq %rax,%r13 + adoxq -8(%rbx),%r13 + adcxq %rbp,%r14 + leaq 32(%rsi),%rsi + adoxq %rbp,%r14 + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%r15 + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + mulxq 16(%rcx),%rax,%r12 + movq %r10,-32(%rbx) + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-24(%rbx) + leaq 32(%rcx),%rcx + adcxq %rax,%r12 + adoxq %rbp,%r15 + movq 48(%rsp),%rdi + movq %r12,-16(%rbx) + + jmp .Lmulx4x_inner + +.align 32 +.Lmulx4x_inner: + mulxq 0(%rsi),%r10,%rax + adcxq %rbp,%r15 + adoxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq 0(%rbx),%r10 + adoxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq 8(%rbx),%r11 + adoxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 + movq %r8,%rdx + adcxq 16(%rbx),%r12 + adoxq %rax,%r13 + adcxq 24(%rbx),%r13 + adoxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + adcxq %rbp,%r14 + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 
16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + adoxq %r15,%r13 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-32(%rbx) + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r13,-16(%rbx) + + decq %rdi + jnz .Lmulx4x_inner + + movq 0(%rsp),%rax + movq 8(%rsp),%rdi + adcq %rbp,%r15 + subq 0(%rbx),%rbp + adcq %r15,%r14 + sbbq %r15,%r15 + movq %r14,-8(%rbx) + + cmpq 16(%rsp),%rdi + jne .Lmulx4x_outer + + leaq 64(%rsp),%rbx + subq %rax,%rcx + negq %r15 + movq %rax,%rdx + shrq $3+2,%rax + movq 32(%rsp),%rdi + jmp .Lmulx4x_sub + +.align 32 +.Lmulx4x_sub: + movq 0(%rbx),%r11 + movq 8(%rbx),%r12 + movq 16(%rbx),%r13 + movq 24(%rbx),%r14 + leaq 32(%rbx),%rbx + sbbq 0(%rcx),%r11 + sbbq 8(%rcx),%r12 + sbbq 16(%rcx),%r13 + sbbq 24(%rcx),%r14 + leaq 32(%rcx),%rcx + movq %r11,0(%rdi) + movq %r12,8(%rdi) + movq %r13,16(%rdi) + movq %r14,24(%rdi) + leaq 32(%rdi),%rdi + decq %rax + jnz .Lmulx4x_sub + + sbbq $0,%r15 + leaq 64(%rsp),%rbx + subq %rdx,%rdi + + movq %r15,%xmm1 + pxor %xmm0,%xmm0 + pshufd $0,%xmm1,%xmm1 + movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 + jmp .Lmulx4x_cond_copy + +.align 32 +.Lmulx4x_cond_copy: + movdqa 0(%rbx),%xmm2 + movdqa 16(%rbx),%xmm3 + leaq 32(%rbx),%rbx + movdqu 0(%rdi),%xmm4 + movdqu 16(%rdi),%xmm5 + leaq 32(%rdi),%rdi + movdqa %xmm0,-32(%rbx) + movdqa %xmm0,-16(%rbx) + pcmpeqd %xmm1,%xmm0 + pand %xmm1,%xmm2 + pand %xmm1,%xmm3 + pand %xmm0,%xmm4 + pand %xmm0,%xmm5 + pxor %xmm0,%xmm0 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqu %xmm4,-32(%rdi) + movdqu %xmm5,-16(%rdi) + subq $32,%rdx + jnz .Lmulx4x_cond_copy + + movq %rdx,(%rbx) + + + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lmulx4x_epilogue: + ret +.cfi_endproc +.size bn_mulx4x_mont,.-bn_mulx4x_mont 
+.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 16 +#endif diff --git a/third_party/boringssl/gen/bcm/x86_64-mont-win.asm b/third_party/boringssl/gen/bcm/x86_64-mont-win.asm new file mode 100644 index 00000000..721d5dc6 --- /dev/null +++ b/third_party/boringssl/gen/bcm/x86_64-mont-win.asm @@ -0,0 +1,1470 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_internal_x86_64_win_asm.inc" +%endif +section .text code align=64 + + +global bn_mul_mont_nohw + +ALIGN 16 +bn_mul_mont_nohw: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_bn_mul_mont_nohw: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +_CET_ENDBR + mov r9d,r9d + mov rax,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + + neg r9 + mov r11,rsp + lea r10,[((-16))+r9*8+rsp] + neg r9 + and r10,-1024 + + + + + + + + + + sub r11,r10 + and r11,-4096 + lea rsp,[r11*1+r10] + mov r11,QWORD[rsp] + cmp rsp,r10 + ja NEAR $L$mul_page_walk + jmp NEAR $L$mul_page_walk_done + +ALIGN 16 +$L$mul_page_walk: + lea rsp,[((-4096))+rsp] + mov r11,QWORD[rsp] + cmp rsp,r10 + ja NEAR $L$mul_page_walk +$L$mul_page_walk_done: + + mov QWORD[8+r9*8+rsp],rax + +$L$mul_body: + mov r12,rdx + mov r8,QWORD[r8] + mov rbx,QWORD[r12] + mov rax,QWORD[rsi] + + xor r14,r14 + xor r15,r15 + + mov rbp,r8 + mul rbx + mov r10,rax + mov rax,QWORD[rcx] + + imul rbp,r10 + mov r11,rdx + + mul rbp + add r10,rax + mov rax,QWORD[8+rsi] + adc rdx,0 + mov r13,rdx + + lea 
r15,[1+r15] + jmp NEAR $L$1st_enter + +ALIGN 16 +$L$1st: + add r13,rax + mov rax,QWORD[r15*8+rsi] + adc rdx,0 + add r13,r11 + mov r11,r10 + adc rdx,0 + mov QWORD[((-16))+r15*8+rsp],r13 + mov r13,rdx + +$L$1st_enter: + mul rbx + add r11,rax + mov rax,QWORD[r15*8+rcx] + adc rdx,0 + lea r15,[1+r15] + mov r10,rdx + + mul rbp + cmp r15,r9 + jne NEAR $L$1st + + add r13,rax + mov rax,QWORD[rsi] + adc rdx,0 + add r13,r11 + adc rdx,0 + mov QWORD[((-16))+r15*8+rsp],r13 + mov r13,rdx + mov r11,r10 + + xor rdx,rdx + add r13,r11 + adc rdx,0 + mov QWORD[((-8))+r9*8+rsp],r13 + mov QWORD[r9*8+rsp],rdx + + lea r14,[1+r14] + jmp NEAR $L$outer +ALIGN 16 +$L$outer: + mov rbx,QWORD[r14*8+r12] + xor r15,r15 + mov rbp,r8 + mov r10,QWORD[rsp] + mul rbx + add r10,rax + mov rax,QWORD[rcx] + adc rdx,0 + + imul rbp,r10 + mov r11,rdx + + mul rbp + add r10,rax + mov rax,QWORD[8+rsi] + adc rdx,0 + mov r10,QWORD[8+rsp] + mov r13,rdx + + lea r15,[1+r15] + jmp NEAR $L$inner_enter + +ALIGN 16 +$L$inner: + add r13,rax + mov rax,QWORD[r15*8+rsi] + adc rdx,0 + add r13,r10 + mov r10,QWORD[r15*8+rsp] + adc rdx,0 + mov QWORD[((-16))+r15*8+rsp],r13 + mov r13,rdx + +$L$inner_enter: + mul rbx + add r11,rax + mov rax,QWORD[r15*8+rcx] + adc rdx,0 + add r10,r11 + mov r11,rdx + adc r11,0 + lea r15,[1+r15] + + mul rbp + cmp r15,r9 + jne NEAR $L$inner + + add r13,rax + mov rax,QWORD[rsi] + adc rdx,0 + add r13,r10 + mov r10,QWORD[r15*8+rsp] + adc rdx,0 + mov QWORD[((-16))+r15*8+rsp],r13 + mov r13,rdx + + xor rdx,rdx + add r13,r11 + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-8))+r9*8+rsp],r13 + mov QWORD[r9*8+rsp],rdx + + lea r14,[1+r14] + cmp r14,r9 + jb NEAR $L$outer + + xor r14,r14 + mov rax,QWORD[rsp] + mov r15,r9 + +ALIGN 16 +$L$sub: sbb rax,QWORD[r14*8+rcx] + mov QWORD[r14*8+rdi],rax + mov rax,QWORD[8+r14*8+rsp] + lea r14,[1+r14] + dec r15 + jnz NEAR $L$sub + + sbb rax,0 + mov rbx,-1 + xor rbx,rax + xor r14,r14 + mov r15,r9 + +$L$copy: + mov rcx,QWORD[r14*8+rdi] + mov rdx,QWORD[r14*8+rsp] + and rcx,rbx 
+ and rdx,rax + mov QWORD[r14*8+rsp],r9 + or rdx,rcx + mov QWORD[r14*8+rdi],rdx + lea r14,[1+r14] + sub r15,1 + jnz NEAR $L$copy + + mov rsi,QWORD[8+r9*8+rsp] + + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$mul_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_bn_mul_mont_nohw: +global bn_mul4x_mont + +ALIGN 16 +bn_mul4x_mont: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_bn_mul4x_mont: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +_CET_ENDBR + mov r9d,r9d + mov rax,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + + neg r9 + mov r11,rsp + lea r10,[((-32))+r9*8+rsp] + neg r9 + and r10,-1024 + + sub r11,r10 + and r11,-4096 + lea rsp,[r11*1+r10] + mov r11,QWORD[rsp] + cmp rsp,r10 + ja NEAR $L$mul4x_page_walk + jmp NEAR $L$mul4x_page_walk_done + +$L$mul4x_page_walk: + lea rsp,[((-4096))+rsp] + mov r11,QWORD[rsp] + cmp rsp,r10 + ja NEAR $L$mul4x_page_walk +$L$mul4x_page_walk_done: + + mov QWORD[8+r9*8+rsp],rax + +$L$mul4x_body: + mov QWORD[16+r9*8+rsp],rdi + mov r12,rdx + mov r8,QWORD[r8] + mov rbx,QWORD[r12] + mov rax,QWORD[rsi] + + xor r14,r14 + xor r15,r15 + + mov rbp,r8 + mul rbx + mov r10,rax + mov rax,QWORD[rcx] + + imul rbp,r10 + mov r11,rdx + + mul rbp + add r10,rax + mov rax,QWORD[8+rsi] + adc rdx,0 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[8+rcx] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[16+rsi] + adc rdx,0 + add rdi,r11 + lea r15,[4+r15] + adc rdx,0 + mov QWORD[rsp],rdi + mov r13,rdx + jmp NEAR $L$1st4x +ALIGN 16 +$L$1st4x: + mul rbx + add r10,rax + mov rax,QWORD[((-16))+r15*8+rcx] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[((-8))+r15*8+rsi] + adc rdx,0 
+ add r13,r10 + adc rdx,0 + mov QWORD[((-24))+r15*8+rsp],r13 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[((-8))+r15*8+rcx] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[r15*8+rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD[((-16))+r15*8+rsp],rdi + mov r13,rdx + + mul rbx + add r10,rax + mov rax,QWORD[r15*8+rcx] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[8+r15*8+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-8))+r15*8+rsp],r13 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[8+r15*8+rcx] + adc rdx,0 + lea r15,[4+r15] + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[((-16))+r15*8+rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD[((-32))+r15*8+rsp],rdi + mov r13,rdx + cmp r15,r9 + jb NEAR $L$1st4x + + mul rbx + add r10,rax + mov rax,QWORD[((-16))+r15*8+rcx] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[((-8))+r15*8+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-24))+r15*8+rsp],r13 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[((-8))+r15*8+rcx] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD[((-16))+r15*8+rsp],rdi + mov r13,rdx + + xor rdi,rdi + add r13,r10 + adc rdi,0 + mov QWORD[((-8))+r15*8+rsp],r13 + mov QWORD[r15*8+rsp],rdi + + lea r14,[1+r14] +ALIGN 4 +$L$outer4x: + mov rbx,QWORD[r14*8+r12] + xor r15,r15 + mov r10,QWORD[rsp] + mov rbp,r8 + mul rbx + add r10,rax + mov rax,QWORD[rcx] + adc rdx,0 + + imul rbp,r10 + mov r11,rdx + + mul rbp + add r10,rax + mov rax,QWORD[8+rsi] + adc rdx,0 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[8+rcx] + adc rdx,0 + add r11,QWORD[8+rsp] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[16+rsi] + adc rdx,0 + add rdi,r11 + lea r15,[4+r15] + adc rdx,0 + mov QWORD[rsp],rdi + mov r13,rdx + jmp NEAR $L$inner4x +ALIGN 16 +$L$inner4x: + mul rbx + add r10,rax + mov rax,QWORD[((-16))+r15*8+rcx] + adc 
rdx,0 + add r10,QWORD[((-16))+r15*8+rsp] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[((-8))+r15*8+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-24))+r15*8+rsp],r13 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[((-8))+r15*8+rcx] + adc rdx,0 + add r11,QWORD[((-8))+r15*8+rsp] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[r15*8+rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD[((-16))+r15*8+rsp],rdi + mov r13,rdx + + mul rbx + add r10,rax + mov rax,QWORD[r15*8+rcx] + adc rdx,0 + add r10,QWORD[r15*8+rsp] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[8+r15*8+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-8))+r15*8+rsp],r13 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[8+r15*8+rcx] + adc rdx,0 + add r11,QWORD[8+r15*8+rsp] + adc rdx,0 + lea r15,[4+r15] + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[((-16))+r15*8+rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD[((-32))+r15*8+rsp],rdi + mov r13,rdx + cmp r15,r9 + jb NEAR $L$inner4x + + mul rbx + add r10,rax + mov rax,QWORD[((-16))+r15*8+rcx] + adc rdx,0 + add r10,QWORD[((-16))+r15*8+rsp] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[((-8))+r15*8+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-24))+r15*8+rsp],r13 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[((-8))+r15*8+rcx] + adc rdx,0 + add r11,QWORD[((-8))+r15*8+rsp] + adc rdx,0 + lea r14,[1+r14] + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD[((-16))+r15*8+rsp],rdi + mov r13,rdx + + xor rdi,rdi + add r13,r10 + adc rdi,0 + add r13,QWORD[r9*8+rsp] + adc rdi,0 + mov QWORD[((-8))+r15*8+rsp],r13 + mov QWORD[r15*8+rsp],rdi + + cmp r14,r9 + jb NEAR $L$outer4x + mov rdi,QWORD[16+r9*8+rsp] + lea r15,[((-4))+r9] + mov rax,QWORD[rsp] + mov rdx,QWORD[8+rsp] + shr r15,2 + lea rsi,[rsp] + xor r14,r14 + + sub rax,QWORD[rcx] + mov rbx,QWORD[16+rsi] + mov 
rbp,QWORD[24+rsi] + sbb rdx,QWORD[8+rcx] + +$L$sub4x: + mov QWORD[r14*8+rdi],rax + mov QWORD[8+r14*8+rdi],rdx + sbb rbx,QWORD[16+r14*8+rcx] + mov rax,QWORD[32+r14*8+rsi] + mov rdx,QWORD[40+r14*8+rsi] + sbb rbp,QWORD[24+r14*8+rcx] + mov QWORD[16+r14*8+rdi],rbx + mov QWORD[24+r14*8+rdi],rbp + sbb rax,QWORD[32+r14*8+rcx] + mov rbx,QWORD[48+r14*8+rsi] + mov rbp,QWORD[56+r14*8+rsi] + sbb rdx,QWORD[40+r14*8+rcx] + lea r14,[4+r14] + dec r15 + jnz NEAR $L$sub4x + + mov QWORD[r14*8+rdi],rax + mov rax,QWORD[32+r14*8+rsi] + sbb rbx,QWORD[16+r14*8+rcx] + mov QWORD[8+r14*8+rdi],rdx + sbb rbp,QWORD[24+r14*8+rcx] + mov QWORD[16+r14*8+rdi],rbx + + sbb rax,0 + mov QWORD[24+r14*8+rdi],rbp + pxor xmm0,xmm0 + movq xmm4,rax + pcmpeqd xmm5,xmm5 + pshufd xmm4,xmm4,0 + mov r15,r9 + pxor xmm5,xmm4 + shr r15,2 + xor eax,eax + + jmp NEAR $L$copy4x +ALIGN 16 +$L$copy4x: + movdqa xmm1,XMMWORD[rax*1+rsp] + movdqu xmm2,XMMWORD[rax*1+rdi] + pand xmm1,xmm4 + pand xmm2,xmm5 + movdqa xmm3,XMMWORD[16+rax*1+rsp] + movdqa XMMWORD[rax*1+rsp],xmm0 + por xmm1,xmm2 + movdqu xmm2,XMMWORD[16+rax*1+rdi] + movdqu XMMWORD[rax*1+rdi],xmm1 + pand xmm3,xmm4 + pand xmm2,xmm5 + movdqa XMMWORD[16+rax*1+rsp],xmm0 + por xmm3,xmm2 + movdqu XMMWORD[16+rax*1+rdi],xmm3 + lea rax,[32+rax] + dec r15 + jnz NEAR $L$copy4x + mov rsi,QWORD[8+r9*8+rsp] + + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$mul4x_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_bn_mul4x_mont: +EXTERN bn_sqrx8x_internal +EXTERN bn_sqr8x_internal + +global bn_sqr8x_mont + +ALIGN 32 +bn_sqr8x_mont: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_bn_sqr8x_mont: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +_CET_ENDBR + mov r9d,r9d + mov rax,rsp + + push 
rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + +$L$sqr8x_prologue: + + mov r10d,r9d + shl r9d,3 + shl r10,3+2 + neg r9 + + + + + + + lea r11,[((-64))+r9*2+rsp] + mov rbp,rsp + mov r8,QWORD[r8] + sub r11,rsi + and r11,4095 + cmp r10,r11 + jb NEAR $L$sqr8x_sp_alt + sub rbp,r11 + lea rbp,[((-64))+r9*2+rbp] + jmp NEAR $L$sqr8x_sp_done + +ALIGN 32 +$L$sqr8x_sp_alt: + lea r10,[((4096-64))+r9*2] + lea rbp,[((-64))+r9*2+rbp] + sub r11,r10 + mov r10,0 + cmovc r11,r10 + sub rbp,r11 +$L$sqr8x_sp_done: + and rbp,-64 + mov r11,rsp + sub r11,rbp + and r11,-4096 + lea rsp,[rbp*1+r11] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$sqr8x_page_walk + jmp NEAR $L$sqr8x_page_walk_done + +ALIGN 16 +$L$sqr8x_page_walk: + lea rsp,[((-4096))+rsp] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$sqr8x_page_walk +$L$sqr8x_page_walk_done: + + mov r10,r9 + neg r9 + + mov QWORD[32+rsp],r8 + mov QWORD[40+rsp],rax + +$L$sqr8x_body: + + movq xmm2,rcx + pxor xmm0,xmm0 + movq xmm1,rdi + movq xmm3,r10 + test rdx,rdx + jz NEAR $L$sqr8x_nox + + call bn_sqrx8x_internal + + + + + lea rbx,[rcx*1+r8] + mov r9,rcx + mov rdx,rcx + movq rdi,xmm1 + sar rcx,3+2 + jmp NEAR $L$sqr8x_sub + +ALIGN 32 +$L$sqr8x_nox: + call bn_sqr8x_internal + + + + + lea rbx,[r9*1+rdi] + mov rcx,r9 + mov rdx,r9 + movq rdi,xmm1 + sar rcx,3+2 + jmp NEAR $L$sqr8x_sub + +ALIGN 32 +$L$sqr8x_sub: + mov r12,QWORD[rbx] + mov r13,QWORD[8+rbx] + mov r14,QWORD[16+rbx] + mov r15,QWORD[24+rbx] + lea rbx,[32+rbx] + sbb r12,QWORD[rbp] + sbb r13,QWORD[8+rbp] + sbb r14,QWORD[16+rbp] + sbb r15,QWORD[24+rbp] + lea rbp,[32+rbp] + mov QWORD[rdi],r12 + mov QWORD[8+rdi],r13 + mov QWORD[16+rdi],r14 + mov QWORD[24+rdi],r15 + lea rdi,[32+rdi] + inc rcx + jnz NEAR $L$sqr8x_sub + + sbb rax,0 + lea rbx,[r9*1+rbx] + lea rdi,[r9*1+rdi] + + movq xmm1,rax + pxor xmm0,xmm0 + pshufd xmm1,xmm1,0 + mov rsi,QWORD[40+rsp] + + jmp NEAR $L$sqr8x_cond_copy + +ALIGN 32 +$L$sqr8x_cond_copy: + movdqa xmm2,XMMWORD[rbx] + movdqa xmm3,XMMWORD[16+rbx] + 
lea rbx,[32+rbx] + movdqu xmm4,XMMWORD[rdi] + movdqu xmm5,XMMWORD[16+rdi] + lea rdi,[32+rdi] + movdqa XMMWORD[(-32)+rbx],xmm0 + movdqa XMMWORD[(-16)+rbx],xmm0 + movdqa XMMWORD[(-32)+rdx*1+rbx],xmm0 + movdqa XMMWORD[(-16)+rdx*1+rbx],xmm0 + pcmpeqd xmm0,xmm1 + pand xmm2,xmm1 + pand xmm3,xmm1 + pand xmm4,xmm0 + pand xmm5,xmm0 + pxor xmm0,xmm0 + por xmm4,xmm2 + por xmm5,xmm3 + movdqu XMMWORD[(-32)+rdi],xmm4 + movdqu XMMWORD[(-16)+rdi],xmm5 + add r9,32 + jnz NEAR $L$sqr8x_cond_copy + + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$sqr8x_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_bn_sqr8x_mont: +global bn_mulx4x_mont + +ALIGN 32 +bn_mulx4x_mont: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_bn_mulx4x_mont: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +_CET_ENDBR + mov rax,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + +$L$mulx4x_prologue: + + shl r9d,3 + xor r10,r10 + sub r10,r9 + mov r8,QWORD[r8] + lea rbp,[((-72))+r10*1+rsp] + and rbp,-128 + mov r11,rsp + sub r11,rbp + and r11,-4096 + lea rsp,[rbp*1+r11] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$mulx4x_page_walk + jmp NEAR $L$mulx4x_page_walk_done + +ALIGN 16 +$L$mulx4x_page_walk: + lea rsp,[((-4096))+rsp] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$mulx4x_page_walk +$L$mulx4x_page_walk_done: + + lea r10,[r9*1+rdx] + + + + + + + + + + + + + mov QWORD[rsp],r9 + shr r9,5 + mov QWORD[16+rsp],r10 + sub r9,1 + mov QWORD[24+rsp],r8 + mov QWORD[32+rsp],rdi + mov QWORD[40+rsp],rax + + mov QWORD[48+rsp],r9 + jmp NEAR $L$mulx4x_body + +ALIGN 32 +$L$mulx4x_body: + lea rdi,[8+rdx] + mov rdx,QWORD[rdx] + lea rbx,[((64+32))+rsp] + mov r9,rdx + + mulx rax,r8,QWORD[rsi] + mulx 
r14,r11,QWORD[8+rsi] + add r11,rax + mov QWORD[8+rsp],rdi + mulx r13,r12,QWORD[16+rsi] + adc r12,r14 + adc r13,0 + + mov rdi,r8 + imul r8,QWORD[24+rsp] + xor rbp,rbp + + mulx r14,rax,QWORD[24+rsi] + mov rdx,r8 + lea rsi,[32+rsi] + adcx r13,rax + adcx r14,rbp + + mulx r10,rax,QWORD[rcx] + adcx rdi,rax + adox r10,r11 + mulx r11,rax,QWORD[8+rcx] + adcx r10,rax + adox r11,r12 + DB 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 + mov rdi,QWORD[48+rsp] + mov QWORD[((-32))+rbx],r10 + adcx r11,rax + adox r12,r13 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + mov QWORD[((-24))+rbx],r11 + adcx r12,rax + adox r15,rbp + lea rcx,[32+rcx] + mov QWORD[((-16))+rbx],r12 + + jmp NEAR $L$mulx4x_1st + +ALIGN 32 +$L$mulx4x_1st: + adcx r15,rbp + mulx rax,r10,QWORD[rsi] + adcx r10,r14 + mulx r14,r11,QWORD[8+rsi] + adcx r11,rax + mulx rax,r12,QWORD[16+rsi] + adcx r12,r14 + mulx r14,r13,QWORD[24+rsi] + DB 0x67,0x67 + mov rdx,r8 + adcx r13,rax + adcx r14,rbp + lea rsi,[32+rsi] + lea rbx,[32+rbx] + + adox r10,r15 + mulx r15,rax,QWORD[rcx] + adcx r10,rax + adox r11,r15 + mulx r15,rax,QWORD[8+rcx] + adcx r11,rax + adox r12,r15 + mulx r15,rax,QWORD[16+rcx] + mov QWORD[((-40))+rbx],r10 + adcx r12,rax + mov QWORD[((-32))+rbx],r11 + adox r13,r15 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + mov QWORD[((-24))+rbx],r12 + adcx r13,rax + adox r15,rbp + lea rcx,[32+rcx] + mov QWORD[((-16))+rbx],r13 + + dec rdi + jnz NEAR $L$mulx4x_1st + + mov rax,QWORD[rsp] + mov rdi,QWORD[8+rsp] + adc r15,rbp + add r14,r15 + sbb r15,r15 + mov QWORD[((-8))+rbx],r14 + jmp NEAR $L$mulx4x_outer + +ALIGN 32 +$L$mulx4x_outer: + mov rdx,QWORD[rdi] + lea rdi,[8+rdi] + sub rsi,rax + mov QWORD[rbx],r15 + lea rbx,[((64+32))+rsp] + sub rcx,rax + + mulx r11,r8,QWORD[rsi] + xor ebp,ebp + mov r9,rdx + mulx r12,r14,QWORD[8+rsi] + adox r8,QWORD[((-32))+rbx] + adcx r11,r14 + mulx r13,r15,QWORD[16+rsi] + adox r11,QWORD[((-24))+rbx] + adcx r12,r15 + adox r12,QWORD[((-16))+rbx] + adcx r13,rbp + adox r13,rbp + + mov QWORD[8+rsp],rdi + mov r15,r8 
+ imul r8,QWORD[24+rsp] + xor ebp,ebp + + mulx r14,rax,QWORD[24+rsi] + mov rdx,r8 + adcx r13,rax + adox r13,QWORD[((-8))+rbx] + adcx r14,rbp + lea rsi,[32+rsi] + adox r14,rbp + + mulx r10,rax,QWORD[rcx] + adcx r15,rax + adox r10,r11 + mulx r11,rax,QWORD[8+rcx] + adcx r10,rax + adox r11,r12 + mulx r12,rax,QWORD[16+rcx] + mov QWORD[((-32))+rbx],r10 + adcx r11,rax + adox r12,r13 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + mov QWORD[((-24))+rbx],r11 + lea rcx,[32+rcx] + adcx r12,rax + adox r15,rbp + mov rdi,QWORD[48+rsp] + mov QWORD[((-16))+rbx],r12 + + jmp NEAR $L$mulx4x_inner + +ALIGN 32 +$L$mulx4x_inner: + mulx rax,r10,QWORD[rsi] + adcx r15,rbp + adox r10,r14 + mulx r14,r11,QWORD[8+rsi] + adcx r10,QWORD[rbx] + adox r11,rax + mulx rax,r12,QWORD[16+rsi] + adcx r11,QWORD[8+rbx] + adox r12,r14 + mulx r14,r13,QWORD[24+rsi] + mov rdx,r8 + adcx r12,QWORD[16+rbx] + adox r13,rax + adcx r13,QWORD[24+rbx] + adox r14,rbp + lea rsi,[32+rsi] + lea rbx,[32+rbx] + adcx r14,rbp + + adox r10,r15 + mulx r15,rax,QWORD[rcx] + adcx r10,rax + adox r11,r15 + mulx r15,rax,QWORD[8+rcx] + adcx r11,rax + adox r12,r15 + mulx r15,rax,QWORD[16+rcx] + mov QWORD[((-40))+rbx],r10 + adcx r12,rax + adox r13,r15 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + mov QWORD[((-32))+rbx],r11 + mov QWORD[((-24))+rbx],r12 + adcx r13,rax + adox r15,rbp + lea rcx,[32+rcx] + mov QWORD[((-16))+rbx],r13 + + dec rdi + jnz NEAR $L$mulx4x_inner + + mov rax,QWORD[rsp] + mov rdi,QWORD[8+rsp] + adc r15,rbp + sub rbp,QWORD[rbx] + adc r14,r15 + sbb r15,r15 + mov QWORD[((-8))+rbx],r14 + + cmp rdi,QWORD[16+rsp] + jne NEAR $L$mulx4x_outer + + lea rbx,[64+rsp] + sub rcx,rax + neg r15 + mov rdx,rax + shr rax,3+2 + mov rdi,QWORD[32+rsp] + jmp NEAR $L$mulx4x_sub + +ALIGN 32 +$L$mulx4x_sub: + mov r11,QWORD[rbx] + mov r12,QWORD[8+rbx] + mov r13,QWORD[16+rbx] + mov r14,QWORD[24+rbx] + lea rbx,[32+rbx] + sbb r11,QWORD[rcx] + sbb r12,QWORD[8+rcx] + sbb r13,QWORD[16+rcx] + sbb r14,QWORD[24+rcx] + lea rcx,[32+rcx] + mov QWORD[rdi],r11 + mov 
QWORD[8+rdi],r12 + mov QWORD[16+rdi],r13 + mov QWORD[24+rdi],r14 + lea rdi,[32+rdi] + dec rax + jnz NEAR $L$mulx4x_sub + + sbb r15,0 + lea rbx,[64+rsp] + sub rdi,rdx + + movq xmm1,r15 + pxor xmm0,xmm0 + pshufd xmm1,xmm1,0 + mov rsi,QWORD[40+rsp] + + jmp NEAR $L$mulx4x_cond_copy + +ALIGN 32 +$L$mulx4x_cond_copy: + movdqa xmm2,XMMWORD[rbx] + movdqa xmm3,XMMWORD[16+rbx] + lea rbx,[32+rbx] + movdqu xmm4,XMMWORD[rdi] + movdqu xmm5,XMMWORD[16+rdi] + lea rdi,[32+rdi] + movdqa XMMWORD[(-32)+rbx],xmm0 + movdqa XMMWORD[(-16)+rbx],xmm0 + pcmpeqd xmm0,xmm1 + pand xmm2,xmm1 + pand xmm3,xmm1 + pand xmm4,xmm0 + pand xmm5,xmm0 + pxor xmm0,xmm0 + por xmm4,xmm2 + por xmm5,xmm3 + movdqu XMMWORD[(-32)+rdi],xmm4 + movdqu XMMWORD[(-16)+rdi],xmm5 + sub rdx,32 + jnz NEAR $L$mulx4x_cond_copy + + mov QWORD[rbx],rdx + + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$mulx4x_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_bn_mulx4x_mont: + DB 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 + DB 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56 + DB 54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83 + DB 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 + DB 115,108,46,111,114,103,62,0 +ALIGN 16 +EXTERN __imp_RtlVirtualUnwind + +ALIGN 16 +mul_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + mov r10,QWORD[192+r8] + mov rax,QWORD[8+r10*8+rax] + + jmp NEAR $L$common_pop_regs + + + +ALIGN 16 
+sqr_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_pop_regs + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[8+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + mov rax,QWORD[40+rax] + +$L$common_pop_regs: + mov rbx,QWORD[((-8))+rax] + mov rbp,QWORD[((-16))+rax] + mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + mov r15,QWORD[((-48))+rax] + mov QWORD[144+r8],rbx + mov QWORD[160+r8],rbp + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + mov QWORD[232+r8],r14 + mov QWORD[240+r8],r15 + +$L$common_seh_tail: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + ret + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_bn_mul_mont_nohw wrt ..imagebase + DD $L$SEH_end_bn_mul_mont_nohw wrt ..imagebase + DD $L$SEH_info_bn_mul_mont_nohw wrt ..imagebase + + DD $L$SEH_begin_bn_mul4x_mont wrt ..imagebase + DD $L$SEH_end_bn_mul4x_mont wrt ..imagebase + DD $L$SEH_info_bn_mul4x_mont wrt ..imagebase + + DD $L$SEH_begin_bn_sqr8x_mont wrt ..imagebase + DD $L$SEH_end_bn_sqr8x_mont wrt ..imagebase + DD $L$SEH_info_bn_sqr8x_mont wrt ..imagebase + DD 
$L$SEH_begin_bn_mulx4x_mont wrt ..imagebase + DD $L$SEH_end_bn_mulx4x_mont wrt ..imagebase + DD $L$SEH_info_bn_mulx4x_mont wrt ..imagebase +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_bn_mul_mont_nohw: + DB 9,0,0,0 + DD mul_handler wrt ..imagebase + DD $L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase +$L$SEH_info_bn_mul4x_mont: + DB 9,0,0,0 + DD mul_handler wrt ..imagebase + DD $L$mul4x_body wrt ..imagebase,$L$mul4x_epilogue wrt ..imagebase +$L$SEH_info_bn_sqr8x_mont: + DB 9,0,0,0 + DD sqr_handler wrt ..imagebase + DD $L$sqr8x_prologue wrt ..imagebase,$L$sqr8x_body wrt ..imagebase,$L$sqr8x_epilogue wrt ..imagebase +ALIGN 8 +$L$SEH_info_bn_mulx4x_mont: + DB 9,0,0,0 + DD sqr_handler wrt ..imagebase + DD $L$mulx4x_prologue wrt ..imagebase,$L$mulx4x_body wrt ..imagebase,$L$mulx4x_epilogue wrt ..imagebase +ALIGN 8 +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/third_party/boringssl/gen/bcm/x86_64-mont5-apple.S b/third_party/boringssl/gen/bcm/x86_64-mont5-apple.S new file mode 100644 index 00000000..cd7d797b --- /dev/null +++ b/third_party/boringssl/gen/bcm/x86_64-mont5-apple.S @@ -0,0 +1,3626 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.text + +.globl _bn_mul_mont_gather5_nohw +.private_extern _bn_mul_mont_gather5_nohw + +.p2align 6 +_bn_mul_mont_gather5_nohw: + +_CET_ENDBR + + + movl %r9d,%r9d + movq %rsp,%rax + + movd 8(%rsp),%xmm5 + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + + negq %r9 + movq %rsp,%r11 + leaq -280(%rsp,%r9,8),%r10 + negq %r9 + andq $-1024,%r10 + + + + + + + + + + subq %r10,%r11 + andq $-4096,%r11 + leaq (%r10,%r11,1),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja L$mul_page_walk + jmp L$mul_page_walk_done + +L$mul_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja L$mul_page_walk +L$mul_page_walk_done: + + leaq L$inc(%rip),%r10 + movq %rax,8(%rsp,%r9,8) + +L$mul_body: + + leaq 128(%rdx),%r12 + movdqa 0(%r10),%xmm0 + movdqa 16(%r10),%xmm1 + leaq 24-112(%rsp,%r9,8),%r10 + andq $-16,%r10 + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 +.byte 0x67 + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,112(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,128(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,144(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,160(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,176(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,192(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,208(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,224(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,240(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,256(%r10) + movdqa %xmm4,%xmm1 + + paddd 
%xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,272(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,288(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,304(%r10) + + paddd %xmm2,%xmm3 +.byte 0x67 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,320(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,336(%r10) + pand 64(%r12),%xmm0 + + pand 80(%r12),%xmm1 + pand 96(%r12),%xmm2 + movdqa %xmm3,352(%r10) + pand 112(%r12),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -128(%r12),%xmm4 + movdqa -112(%r12),%xmm5 + movdqa -96(%r12),%xmm2 + pand 112(%r10),%xmm4 + movdqa -80(%r12),%xmm3 + pand 128(%r10),%xmm5 + por %xmm4,%xmm0 + pand 144(%r10),%xmm2 + por %xmm5,%xmm1 + pand 160(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -64(%r12),%xmm4 + movdqa -48(%r12),%xmm5 + movdqa -32(%r12),%xmm2 + pand 176(%r10),%xmm4 + movdqa -16(%r12),%xmm3 + pand 192(%r10),%xmm5 + por %xmm4,%xmm0 + pand 208(%r10),%xmm2 + por %xmm5,%xmm1 + pand 224(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa 0(%r12),%xmm4 + movdqa 16(%r12),%xmm5 + movdqa 32(%r12),%xmm2 + pand 240(%r10),%xmm4 + movdqa 48(%r12),%xmm3 + pand 256(%r10),%xmm5 + por %xmm4,%xmm0 + pand 272(%r10),%xmm2 + por %xmm5,%xmm1 + pand 288(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + por %xmm1,%xmm0 + + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 + leaq 256(%r12),%r12 + movq %xmm0,%rbx + + movq (%r8),%r8 + movq (%rsi),%rax + + xorq %r14,%r14 + xorq %r15,%r15 + + movq %r8,%rbp + mulq %rbx + movq %rax,%r10 + movq (%rcx),%rax + + imulq %r10,%rbp + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq %rdx,%r13 + + leaq 1(%r15),%r15 + jmp L$1st_enter + +.p2align 4 +L$1st: + addq %rax,%r13 + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%r13 + movq %r10,%r11 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + +L$1st_enter: + mulq %rbx + addq %rax,%r11 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + leaq 1(%r15),%r15 + 
movq %rdx,%r10 + + mulq %rbp + cmpq %r9,%r15 + jne L$1st + + + addq %rax,%r13 + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %r13,-16(%rsp,%r9,8) + movq %rdx,%r13 + movq %r10,%r11 + + xorq %rdx,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r9,8) + movq %rdx,(%rsp,%r9,8) + + leaq 1(%r14),%r14 + jmp L$outer +.p2align 4 +L$outer: + leaq 24+128(%rsp,%r9,8),%rdx + andq $-16,%rdx + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + movdqa -128(%r12),%xmm0 + movdqa -112(%r12),%xmm1 + movdqa -96(%r12),%xmm2 + movdqa -80(%r12),%xmm3 + pand -128(%rdx),%xmm0 + pand -112(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand -80(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%r12),%xmm0 + movdqa -48(%r12),%xmm1 + movdqa -32(%r12),%xmm2 + movdqa -16(%r12),%xmm3 + pand -64(%rdx),%xmm0 + pand -48(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -32(%rdx),%xmm2 + por %xmm1,%xmm5 + pand -16(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%r12),%xmm0 + movdqa 16(%r12),%xmm1 + movdqa 32(%r12),%xmm2 + movdqa 48(%r12),%xmm3 + pand 0(%rdx),%xmm0 + pand 16(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 32(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 48(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%r12),%xmm0 + movdqa 80(%r12),%xmm1 + movdqa 96(%r12),%xmm2 + movdqa 112(%r12),%xmm3 + pand 64(%rdx),%xmm0 + pand 80(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 112(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + leaq 256(%r12),%r12 + + movq (%rsi),%rax + movq %xmm0,%rbx + + xorq %r15,%r15 + movq %r8,%rbp + movq (%rsp),%r10 + + mulq %rbx + addq %rax,%r10 + movq (%rcx),%rax + adcq $0,%rdx + + imulq %r10,%rbp + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq 8(%rsp),%r10 + movq %rdx,%r13 + + leaq 1(%r15),%r15 + jmp L$inner_enter + +.p2align 4 +L$inner: + addq %rax,%r13 + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq 
%r10,%r13 + movq (%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + +L$inner_enter: + mulq %rbx + addq %rax,%r11 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + addq %r11,%r10 + movq %rdx,%r11 + adcq $0,%r11 + leaq 1(%r15),%r15 + + mulq %rbp + cmpq %r9,%r15 + jne L$inner + + addq %rax,%r13 + adcq $0,%rdx + addq %r10,%r13 + movq (%rsp,%r9,8),%r10 + adcq $0,%rdx + movq %r13,-16(%rsp,%r9,8) + movq %rdx,%r13 + + xorq %rdx,%rdx + addq %r11,%r13 + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r9,8) + movq %rdx,(%rsp,%r9,8) + + leaq 1(%r14),%r14 + cmpq %r9,%r14 + jb L$outer + + xorq %r14,%r14 + movq (%rsp),%rax + leaq (%rsp),%rsi + movq %r9,%r15 + jmp L$sub +.p2align 4 +L$sub: sbbq (%rcx,%r14,8),%rax + movq %rax,(%rdi,%r14,8) + movq 8(%rsi,%r14,8),%rax + leaq 1(%r14),%r14 + decq %r15 + jnz L$sub + + sbbq $0,%rax + movq $-1,%rbx + xorq %rax,%rbx + xorq %r14,%r14 + movq %r9,%r15 + +L$copy: + movq (%rdi,%r14,8),%rcx + movq (%rsp,%r14,8),%rdx + andq %rbx,%rcx + andq %rax,%rdx + movq %r14,(%rsp,%r14,8) + orq %rcx,%rdx + movq %rdx,(%rdi,%r14,8) + leaq 1(%r14),%r14 + subq $1,%r15 + jnz L$copy + + movq 8(%rsp,%r9,8),%rsi + + movq $1,%rax + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$mul_epilogue: + ret + + +.globl _bn_mul4x_mont_gather5 +.private_extern _bn_mul4x_mont_gather5 + +.p2align 5 +_bn_mul4x_mont_gather5: + +_CET_ENDBR +.byte 0x67 + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$mul4x_prologue: + +.byte 0x67 + + + + shll $3,%r9d + leaq (%r9,%r9,2),%r10 + negq %r9 + + + + + + + + + + + leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp + subq %rdi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb L$mul4xsp_alt + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp + jmp L$mul4xsp_done + +.p2align 5 +L$mul4xsp_alt: + leaq 4096-320(,%r9,2),%r10 + leaq 
-320(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp +L$mul4xsp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$mul4x_page_walk + jmp L$mul4x_page_walk_done + +L$mul4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$mul4x_page_walk +L$mul4x_page_walk_done: + + negq %r9 + + movq %rax,40(%rsp) + +L$mul4x_body: + + call mul4x_internal + + movq 40(%rsp),%rsi + + movq $1,%rax + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$mul4x_epilogue: + ret + + + + +.p2align 5 +mul4x_internal: + + shlq $5,%r9 + movd 8(%rax),%xmm5 + leaq L$inc(%rip),%rax + leaq 128(%rdx,%r9,1),%r13 + shrq $5,%r9 + movdqa 0(%rax),%xmm0 + movdqa 16(%rax),%xmm1 + leaq 88-112(%rsp,%r9,1),%r10 + leaq 128(%rdx),%r12 + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 +.byte 0x67,0x67 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 +.byte 0x67 + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,112(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,128(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,144(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,160(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,176(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,192(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,208(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,224(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,240(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa 
%xmm1,256(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,272(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,288(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,304(%r10) + + paddd %xmm2,%xmm3 +.byte 0x67 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,320(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,336(%r10) + pand 64(%r12),%xmm0 + + pand 80(%r12),%xmm1 + pand 96(%r12),%xmm2 + movdqa %xmm3,352(%r10) + pand 112(%r12),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -128(%r12),%xmm4 + movdqa -112(%r12),%xmm5 + movdqa -96(%r12),%xmm2 + pand 112(%r10),%xmm4 + movdqa -80(%r12),%xmm3 + pand 128(%r10),%xmm5 + por %xmm4,%xmm0 + pand 144(%r10),%xmm2 + por %xmm5,%xmm1 + pand 160(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -64(%r12),%xmm4 + movdqa -48(%r12),%xmm5 + movdqa -32(%r12),%xmm2 + pand 176(%r10),%xmm4 + movdqa -16(%r12),%xmm3 + pand 192(%r10),%xmm5 + por %xmm4,%xmm0 + pand 208(%r10),%xmm2 + por %xmm5,%xmm1 + pand 224(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa 0(%r12),%xmm4 + movdqa 16(%r12),%xmm5 + movdqa 32(%r12),%xmm2 + pand 240(%r10),%xmm4 + movdqa 48(%r12),%xmm3 + pand 256(%r10),%xmm5 + por %xmm4,%xmm0 + pand 272(%r10),%xmm2 + por %xmm5,%xmm1 + pand 288(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + por %xmm1,%xmm0 + + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 + leaq 256(%r12),%r12 + movq %xmm0,%rbx + + movq %r13,16+8(%rsp) + movq %rdi,56+8(%rsp) + + movq (%r8),%r8 + movq (%rsi),%rax + leaq (%rsi,%r9,1),%rsi + negq %r9 + + movq %r8,%rbp + mulq %rbx + movq %rax,%r10 + movq (%rcx),%rax + + imulq %r10,%rbp + leaq 64+8(%rsp),%r14 + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi,%r9,1),%rax + adcq $0,%rdx + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi,%r9,1),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 32(%r9),%r15 + leaq 
32(%rcx),%rcx + adcq $0,%rdx + movq %rdi,(%r14) + movq %rdx,%r13 + jmp L$1st4x + +.p2align 5 +L$1st4x: + mulq %rbx + addq %rax,%r10 + movq -16(%rcx),%rax + leaq 32(%r14),%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%r14) + movq %rdx,%r13 + + mulq %rbx + addq %rax,%r10 + movq 0(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq 8(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-8(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 32(%rcx),%rcx + adcq $0,%rdx + movq %rdi,(%r14) + movq %rdx,%r13 + + addq $32,%r15 + jnz L$1st4x + + mulq %rbx + addq %rax,%r10 + movq -16(%rcx),%rax + leaq 32(%r14),%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r9,1),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%r14) + movq %rdx,%r13 + + leaq (%rcx,%r9,1),%rcx + + xorq %rdi,%rdi + addq %r10,%r13 + adcq $0,%rdi + movq %r13,-8(%r14) + + jmp L$outer4x + +.p2align 5 +L$outer4x: + leaq 16+128(%r14),%rdx + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + movdqa -128(%r12),%xmm0 + movdqa -112(%r12),%xmm1 + movdqa -96(%r12),%xmm2 + movdqa -80(%r12),%xmm3 + pand -128(%rdx),%xmm0 + pand -112(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand -80(%rdx),%xmm3 + por 
%xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%r12),%xmm0 + movdqa -48(%r12),%xmm1 + movdqa -32(%r12),%xmm2 + movdqa -16(%r12),%xmm3 + pand -64(%rdx),%xmm0 + pand -48(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -32(%rdx),%xmm2 + por %xmm1,%xmm5 + pand -16(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%r12),%xmm0 + movdqa 16(%r12),%xmm1 + movdqa 32(%r12),%xmm2 + movdqa 48(%r12),%xmm3 + pand 0(%rdx),%xmm0 + pand 16(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 32(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 48(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%r12),%xmm0 + movdqa 80(%r12),%xmm1 + movdqa 96(%r12),%xmm2 + movdqa 112(%r12),%xmm3 + pand 64(%rdx),%xmm0 + pand 80(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 112(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + leaq 256(%r12),%r12 + movq %xmm0,%rbx + + movq (%r14,%r9,1),%r10 + movq %r8,%rbp + mulq %rbx + addq %rax,%r10 + movq (%rcx),%rax + adcq $0,%rdx + + imulq %r10,%rbp + movq %rdx,%r11 + movq %rdi,(%r14) + + leaq (%r14,%r9,1),%r14 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi,%r9,1),%rax + adcq $0,%rdx + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx),%rax + adcq $0,%rdx + addq 8(%r14),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi,%r9,1),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 32(%r9),%r15 + leaq 32(%rcx),%rcx + adcq $0,%rdx + movq %rdx,%r13 + jmp L$inner4x + +.p2align 5 +L$inner4x: + mulq %rbx + addq %rax,%r10 + movq -16(%rcx),%rax + adcq $0,%rdx + addq 16(%r14),%r10 + leaq 32(%r14),%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdi,-32(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx),%rax + adcq $0,%rdx + addq -8(%r14),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r15,1),%rax + adcq $0,%rdx + addq 
%r11,%rdi + adcq $0,%rdx + movq %r13,-24(%r14) + movq %rdx,%r13 + + mulq %rbx + addq %rax,%r10 + movq 0(%rcx),%rax + adcq $0,%rdx + addq (%r14),%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq 8(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdi,-16(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx),%rax + adcq $0,%rdx + addq 8(%r14),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 32(%rcx),%rcx + adcq $0,%rdx + movq %r13,-8(%r14) + movq %rdx,%r13 + + addq $32,%r15 + jnz L$inner4x + + mulq %rbx + addq %rax,%r10 + movq -16(%rcx),%rax + adcq $0,%rdx + addq 16(%r14),%r10 + leaq 32(%r14),%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdi,-32(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq %rbp,%rax + movq -8(%rcx),%rbp + adcq $0,%rdx + addq -8(%r14),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r9,1),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %r13,-24(%r14) + movq %rdx,%r13 + + movq %rdi,-16(%r14) + leaq (%rcx,%r9,1),%rcx + + xorq %rdi,%rdi + addq %r10,%r13 + adcq $0,%rdi + addq (%r14),%r13 + adcq $0,%rdi + movq %r13,-8(%r14) + + cmpq 16+8(%rsp),%r12 + jb L$outer4x + xorq %rax,%rax + subq %r13,%rbp + adcq %r15,%r15 + orq %r15,%rdi + subq %rdi,%rax + leaq (%r14,%r9,1),%rbx + movq (%rcx),%r12 + leaq (%rcx),%rbp + movq %r9,%rcx + sarq $3+2,%rcx + movq 56+8(%rsp),%rdi + decq %r12 + xorq %r10,%r10 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp L$sqr4x_sub_entry + + +.globl _bn_power5_nohw +.private_extern _bn_power5_nohw + +.p2align 5 +_bn_power5_nohw: + +_CET_ENDBR + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$power5_prologue: + + + + + shll $3,%r9d + leal (%r9,%r9,2),%r10d + 
negq %r9 + movq (%r8),%r8 + + + + + + + + + leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp + subq %rdi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb L$pwr_sp_alt + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp + jmp L$pwr_sp_done + +.p2align 5 +L$pwr_sp_alt: + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp +L$pwr_sp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$pwr_page_walk + jmp L$pwr_page_walk_done + +L$pwr_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$pwr_page_walk +L$pwr_page_walk_done: + + movq %r9,%r10 + negq %r9 + + + + + + + + + + + movq %r8,32(%rsp) + movq %rax,40(%rsp) + +L$power5_body: + movq %rdi,%xmm1 + movq %rcx,%xmm2 + movq %r10,%xmm3 + movq %rdx,%xmm4 + + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + + movq %xmm2,%rcx + movq %xmm4,%rdx + movq %rsi,%rdi + movq 40(%rsp),%rax + leaq 32(%rsp),%r8 + + call mul4x_internal + + movq 40(%rsp),%rsi + + movq $1,%rax + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$power5_epilogue: + ret + + + +.globl _bn_sqr8x_internal +.private_extern _bn_sqr8x_internal +.private_extern _bn_sqr8x_internal + +.p2align 5 +_bn_sqr8x_internal: +__bn_sqr8x_internal: + +_CET_ENDBR + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + leaq 32(%r10),%rbp + leaq (%rsi,%r9,1),%rsi + + movq %r9,%rcx + + + movq -32(%rsi,%rbp,1),%r14 + leaq 48+8(%rsp,%r9,2),%rdi + movq -24(%rsi,%rbp,1),%rax + leaq 
-32(%rdi,%rbp,1),%rdi + movq -16(%rsi,%rbp,1),%rbx + movq %rax,%r15 + + mulq %r14 + movq %rax,%r10 + movq %rbx,%rax + movq %rdx,%r11 + movq %r10,-24(%rdi,%rbp,1) + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + adcq $0,%rdx + movq %r11,-16(%rdi,%rbp,1) + movq %rdx,%r10 + + + movq -8(%rsi,%rbp,1),%rbx + mulq %r15 + movq %rax,%r12 + movq %rbx,%rax + movq %rdx,%r13 + + leaq (%rbp),%rcx + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq %rdx,%r11 + adcq $0,%r11 + addq %r12,%r10 + adcq $0,%r11 + movq %r10,-8(%rdi,%rcx,1) + jmp L$sqr4x_1st + +.p2align 5 +L$sqr4x_1st: + movq (%rsi,%rcx,1),%rbx + mulq %r15 + addq %rax,%r13 + movq %rbx,%rax + movq %rdx,%r12 + adcq $0,%r12 + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + movq 8(%rsi,%rcx,1),%rbx + movq %rdx,%r10 + adcq $0,%r10 + addq %r13,%r11 + adcq $0,%r10 + + + mulq %r15 + addq %rax,%r12 + movq %rbx,%rax + movq %r11,(%rdi,%rcx,1) + movq %rdx,%r13 + adcq $0,%r13 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq 16(%rsi,%rcx,1),%rbx + movq %rdx,%r11 + adcq $0,%r11 + addq %r12,%r10 + adcq $0,%r11 + + mulq %r15 + addq %rax,%r13 + movq %rbx,%rax + movq %r10,8(%rdi,%rcx,1) + movq %rdx,%r12 + adcq $0,%r12 + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + movq 24(%rsi,%rcx,1),%rbx + movq %rdx,%r10 + adcq $0,%r10 + addq %r13,%r11 + adcq $0,%r10 + + + mulq %r15 + addq %rax,%r12 + movq %rbx,%rax + movq %r11,16(%rdi,%rcx,1) + movq %rdx,%r13 + adcq $0,%r13 + leaq 32(%rcx),%rcx + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq %rdx,%r11 + adcq $0,%r11 + addq %r12,%r10 + adcq $0,%r11 + movq %r10,-8(%rdi,%rcx,1) + + cmpq $0,%rcx + jne L$sqr4x_1st + + mulq %r15 + addq %rax,%r13 + leaq 16(%rbp),%rbp + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + + movq %r13,(%rdi) + movq %rdx,%r12 + movq %rdx,8(%rdi) + jmp L$sqr4x_outer + +.p2align 5 +L$sqr4x_outer: + movq -32(%rsi,%rbp,1),%r14 + leaq 48+8(%rsp,%r9,2),%rdi + movq -24(%rsi,%rbp,1),%rax + leaq -32(%rdi,%rbp,1),%rdi + movq -16(%rsi,%rbp,1),%rbx + movq %rax,%r15 + + mulq %r14 + 
movq -24(%rdi,%rbp,1),%r10 + addq %rax,%r10 + movq %rbx,%rax + adcq $0,%rdx + movq %r10,-24(%rdi,%rbp,1) + movq %rdx,%r11 + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + adcq $0,%rdx + addq -16(%rdi,%rbp,1),%r11 + movq %rdx,%r10 + adcq $0,%r10 + movq %r11,-16(%rdi,%rbp,1) + + xorq %r12,%r12 + + movq -8(%rsi,%rbp,1),%rbx + mulq %r15 + addq %rax,%r12 + movq %rbx,%rax + adcq $0,%rdx + addq -8(%rdi,%rbp,1),%r12 + movq %rdx,%r13 + adcq $0,%r13 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + adcq $0,%rdx + addq %r12,%r10 + movq %rdx,%r11 + adcq $0,%r11 + movq %r10,-8(%rdi,%rbp,1) + + leaq (%rbp),%rcx + jmp L$sqr4x_inner + +.p2align 5 +L$sqr4x_inner: + movq (%rsi,%rcx,1),%rbx + mulq %r15 + addq %rax,%r13 + movq %rbx,%rax + movq %rdx,%r12 + adcq $0,%r12 + addq (%rdi,%rcx,1),%r13 + adcq $0,%r12 + +.byte 0x67 + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + movq 8(%rsi,%rcx,1),%rbx + movq %rdx,%r10 + adcq $0,%r10 + addq %r13,%r11 + adcq $0,%r10 + + mulq %r15 + addq %rax,%r12 + movq %r11,(%rdi,%rcx,1) + movq %rbx,%rax + movq %rdx,%r13 + adcq $0,%r13 + addq 8(%rdi,%rcx,1),%r12 + leaq 16(%rcx),%rcx + adcq $0,%r13 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + adcq $0,%rdx + addq %r12,%r10 + movq %rdx,%r11 + adcq $0,%r11 + movq %r10,-8(%rdi,%rcx,1) + + cmpq $0,%rcx + jne L$sqr4x_inner + +.byte 0x67 + mulq %r15 + addq %rax,%r13 + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + + movq %r13,(%rdi) + movq %rdx,%r12 + movq %rdx,8(%rdi) + + addq $16,%rbp + jnz L$sqr4x_outer + + + movq -32(%rsi),%r14 + leaq 48+8(%rsp,%r9,2),%rdi + movq -24(%rsi),%rax + leaq -32(%rdi,%rbp,1),%rdi + movq -16(%rsi),%rbx + movq %rax,%r15 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq %rdx,%r11 + adcq $0,%r11 + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + movq %r10,-24(%rdi) + movq %rdx,%r10 + adcq $0,%r10 + addq %r13,%r11 + movq -8(%rsi),%rbx + adcq $0,%r10 + + mulq %r15 + addq %rax,%r12 + movq %rbx,%rax + movq %r11,-16(%rdi) + movq %rdx,%r13 + adcq $0,%r13 + + mulq %r14 + addq %rax,%r10 + 
movq %rbx,%rax + movq %rdx,%r11 + adcq $0,%r11 + addq %r12,%r10 + adcq $0,%r11 + movq %r10,-8(%rdi) + + mulq %r15 + addq %rax,%r13 + movq -16(%rsi),%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + + movq %r13,(%rdi) + movq %rdx,%r12 + movq %rdx,8(%rdi) + + mulq %rbx + addq $16,%rbp + xorq %r14,%r14 + subq %r9,%rbp + xorq %r15,%r15 + + addq %r12,%rax + adcq $0,%rdx + movq %rax,8(%rdi) + movq %rdx,16(%rdi) + movq %r15,24(%rdi) + + movq -16(%rsi,%rbp,1),%rax + leaq 48+8(%rsp),%rdi + xorq %r10,%r10 + movq 8(%rdi),%r11 + + leaq (%r14,%r10,2),%r12 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r13 + shrq $63,%r11 + orq %r10,%r13 + movq 16(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 24(%rdi),%r11 + adcq %rax,%r12 + movq -8(%rsi,%rbp,1),%rax + movq %r12,(%rdi) + adcq %rdx,%r13 + + leaq (%r14,%r10,2),%rbx + movq %r13,8(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r8 + shrq $63,%r11 + orq %r10,%r8 + movq 32(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 40(%rdi),%r11 + adcq %rax,%rbx + movq 0(%rsi,%rbp,1),%rax + movq %rbx,16(%rdi) + adcq %rdx,%r8 + leaq 16(%rbp),%rbp + movq %r8,24(%rdi) + sbbq %r15,%r15 + leaq 64(%rdi),%rdi + jmp L$sqr4x_shift_n_add + +.p2align 5 +L$sqr4x_shift_n_add: + leaq (%r14,%r10,2),%r12 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r13 + shrq $63,%r11 + orq %r10,%r13 + movq -16(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq -8(%rdi),%r11 + adcq %rax,%r12 + movq -8(%rsi,%rbp,1),%rax + movq %r12,-32(%rdi) + adcq %rdx,%r13 + + leaq (%r14,%r10,2),%rbx + movq %r13,-24(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r8 + shrq $63,%r11 + orq %r10,%r8 + movq 0(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 8(%rdi),%r11 + adcq %rax,%rbx + movq 0(%rsi,%rbp,1),%rax + movq %rbx,-16(%rdi) + adcq %rdx,%r8 + + leaq (%r14,%r10,2),%r12 + movq %r8,-8(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r13 + shrq $63,%r11 + orq %r10,%r13 + movq 16(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq 
%r15 + movq 24(%rdi),%r11 + adcq %rax,%r12 + movq 8(%rsi,%rbp,1),%rax + movq %r12,0(%rdi) + adcq %rdx,%r13 + + leaq (%r14,%r10,2),%rbx + movq %r13,8(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r8 + shrq $63,%r11 + orq %r10,%r8 + movq 32(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 40(%rdi),%r11 + adcq %rax,%rbx + movq 16(%rsi,%rbp,1),%rax + movq %rbx,16(%rdi) + adcq %rdx,%r8 + movq %r8,24(%rdi) + sbbq %r15,%r15 + leaq 64(%rdi),%rdi + addq $32,%rbp + jnz L$sqr4x_shift_n_add + + leaq (%r14,%r10,2),%r12 +.byte 0x67 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r13 + shrq $63,%r11 + orq %r10,%r13 + movq -16(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq -8(%rdi),%r11 + adcq %rax,%r12 + movq -8(%rsi),%rax + movq %r12,-32(%rdi) + adcq %rdx,%r13 + + leaq (%r14,%r10,2),%rbx + movq %r13,-24(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r8 + shrq $63,%r11 + orq %r10,%r8 + mulq %rax + negq %r15 + adcq %rax,%rbx + adcq %rdx,%r8 + movq %rbx,-16(%rdi) + movq %r8,-8(%rdi) + movq %xmm2,%rbp +__bn_sqr8x_reduction: + xorq %rax,%rax + leaq (%r9,%rbp,1),%rcx + leaq 48+8(%rsp,%r9,2),%rdx + movq %rcx,0+8(%rsp) + leaq 48+8(%rsp,%r9,1),%rdi + movq %rdx,8+8(%rsp) + negq %r9 + jmp L$8x_reduction_loop + +.p2align 5 +L$8x_reduction_loop: + leaq (%rdi,%r9,1),%rdi +.byte 0x66 + movq 0(%rdi),%rbx + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq %rax,(%rdx) + leaq 64(%rdi),%rdi + +.byte 0x67 + movq %rbx,%r8 + imulq 32+8(%rsp),%rbx + movq 0(%rbp),%rax + movl $8,%ecx + jmp L$8x_reduce + +.p2align 5 +L$8x_reduce: + mulq %rbx + movq 8(%rbp),%rax + negq %r8 + movq %rdx,%r8 + adcq $0,%r8 + + mulq %rbx + addq %rax,%r9 + movq 16(%rbp),%rax + adcq $0,%rdx + addq %r9,%r8 + movq %rbx,48-8+8(%rsp,%rcx,8) + movq %rdx,%r9 + adcq $0,%r9 + + mulq %rbx + addq %rax,%r10 + movq 24(%rbp),%rax + adcq $0,%rdx + addq %r10,%r9 + movq 32+8(%rsp),%rsi + movq %rdx,%r10 + 
adcq $0,%r10 + + mulq %rbx + addq %rax,%r11 + movq 32(%rbp),%rax + adcq $0,%rdx + imulq %r8,%rsi + addq %r11,%r10 + movq %rdx,%r11 + adcq $0,%r11 + + mulq %rbx + addq %rax,%r12 + movq 40(%rbp),%rax + adcq $0,%rdx + addq %r12,%r11 + movq %rdx,%r12 + adcq $0,%r12 + + mulq %rbx + addq %rax,%r13 + movq 48(%rbp),%rax + adcq $0,%rdx + addq %r13,%r12 + movq %rdx,%r13 + adcq $0,%r13 + + mulq %rbx + addq %rax,%r14 + movq 56(%rbp),%rax + adcq $0,%rdx + addq %r14,%r13 + movq %rdx,%r14 + adcq $0,%r14 + + mulq %rbx + movq %rsi,%rbx + addq %rax,%r15 + movq 0(%rbp),%rax + adcq $0,%rdx + addq %r15,%r14 + movq %rdx,%r15 + adcq $0,%r15 + + decl %ecx + jnz L$8x_reduce + + leaq 64(%rbp),%rbp + xorq %rax,%rax + movq 8+8(%rsp),%rdx + cmpq 0+8(%rsp),%rbp + jae L$8x_no_tail + +.byte 0x66 + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + sbbq %rsi,%rsi + + movq 48+56+8(%rsp),%rbx + movl $8,%ecx + movq 0(%rbp),%rax + jmp L$8x_tail + +.p2align 5 +L$8x_tail: + mulq %rbx + addq %rax,%r8 + movq 8(%rbp),%rax + movq %r8,(%rdi) + movq %rdx,%r8 + adcq $0,%r8 + + mulq %rbx + addq %rax,%r9 + movq 16(%rbp),%rax + adcq $0,%rdx + addq %r9,%r8 + leaq 8(%rdi),%rdi + movq %rdx,%r9 + adcq $0,%r9 + + mulq %rbx + addq %rax,%r10 + movq 24(%rbp),%rax + adcq $0,%rdx + addq %r10,%r9 + movq %rdx,%r10 + adcq $0,%r10 + + mulq %rbx + addq %rax,%r11 + movq 32(%rbp),%rax + adcq $0,%rdx + addq %r11,%r10 + movq %rdx,%r11 + adcq $0,%r11 + + mulq %rbx + addq %rax,%r12 + movq 40(%rbp),%rax + adcq $0,%rdx + addq %r12,%r11 + movq %rdx,%r12 + adcq $0,%r12 + + mulq %rbx + addq %rax,%r13 + movq 48(%rbp),%rax + adcq $0,%rdx + addq %r13,%r12 + movq %rdx,%r13 + adcq $0,%r13 + + mulq %rbx + addq %rax,%r14 + movq 56(%rbp),%rax + adcq $0,%rdx + addq %r14,%r13 + movq %rdx,%r14 + adcq $0,%r14 + + mulq %rbx + movq 48-16+8(%rsp,%rcx,8),%rbx + addq %rax,%r15 + adcq $0,%rdx + addq %r15,%r14 + movq 0(%rbp),%rax + movq %rdx,%r15 
+ adcq $0,%r15 + + decl %ecx + jnz L$8x_tail + + leaq 64(%rbp),%rbp + movq 8+8(%rsp),%rdx + cmpq 0+8(%rsp),%rbp + jae L$8x_tail_done + + movq 48+56+8(%rsp),%rbx + negq %rsi + movq 0(%rbp),%rax + adcq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + sbbq %rsi,%rsi + + movl $8,%ecx + jmp L$8x_tail + +.p2align 5 +L$8x_tail_done: + xorq %rax,%rax + addq (%rdx),%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rax + + negq %rsi +L$8x_no_tail: + adcq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + adcq $0,%rax + movq -8(%rbp),%rcx + xorq %rsi,%rsi + + movq %xmm2,%rbp + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %xmm3,%r9 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + leaq 64(%rdi),%rdi + + cmpq %rdx,%rdi + jb L$8x_reduction_loop + ret + + + +.p2align 5 +__bn_post4x_internal: + + movq 0(%rbp),%r12 + leaq (%rdi,%r9,1),%rbx + movq %r9,%rcx + movq %xmm1,%rdi + negq %rax + movq %xmm1,%rsi + sarq $3+2,%rcx + decq %r12 + xorq %r10,%r10 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp L$sqr4x_sub_entry + +.p2align 4 +L$sqr4x_sub: + movq 0(%rbp),%r12 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 +L$sqr4x_sub_entry: + leaq 32(%rbp),%rbp + notq %r12 + notq %r13 + notq %r14 + notq %r15 + andq %rax,%r12 + andq %rax,%r13 + andq %rax,%r14 + andq %rax,%r15 + + negq %r10 + adcq 0(%rbx),%r12 + adcq 8(%rbx),%r13 + adcq 16(%rbx),%r14 + adcq 24(%rbx),%r15 + movq %r12,0(%rdi) + leaq 32(%rbx),%rbx + movq %r13,8(%rdi) + sbbq %r10,%r10 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + leaq 32(%rdi),%rdi + + incq %rcx + jnz L$sqr4x_sub + + movq %r9,%r10 + negq %r9 + ret + + +.globl 
_bn_mulx4x_mont_gather5 +.private_extern _bn_mulx4x_mont_gather5 + +.p2align 5 +_bn_mulx4x_mont_gather5: + +_CET_ENDBR + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$mulx4x_prologue: + + + + + shll $3,%r9d + leaq (%r9,%r9,2),%r10 + negq %r9 + movq (%r8),%r8 + + + + + + + + + + + leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp + subq %rdi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb L$mulx4xsp_alt + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp + jmp L$mulx4xsp_done + +L$mulx4xsp_alt: + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp +L$mulx4xsp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$mulx4x_page_walk + jmp L$mulx4x_page_walk_done + +L$mulx4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$mulx4x_page_walk +L$mulx4x_page_walk_done: + + + + + + + + + + + + + + movq %r8,32(%rsp) + movq %rax,40(%rsp) + +L$mulx4x_body: + call mulx4x_internal + + movq 40(%rsp),%rsi + + movq $1,%rax + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$mulx4x_epilogue: + ret + + + + +.p2align 5 +mulx4x_internal: + + movq %r9,8(%rsp) + movq %r9,%r10 + negq %r9 + shlq $5,%r9 + negq %r10 + leaq 128(%rdx,%r9,1),%r13 + shrq $5+5,%r9 + movd 8(%rax),%xmm5 + subq $1,%r9 + leaq L$inc(%rip),%rax + movq %r13,16+8(%rsp) + movq %r9,24+8(%rsp) + movq %rdi,56+8(%rsp) + movdqa 0(%rax),%xmm0 + movdqa 16(%rax),%xmm1 + leaq 88-112(%rsp,%r10,1),%r10 + leaq 128(%rdx),%rdi + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 +.byte 0x67 + movdqa %xmm1,%xmm2 +.byte 0x67 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,112(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 
+ pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,128(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,144(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,160(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,176(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,192(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,208(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,224(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,240(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,256(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,272(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,288(%r10) + movdqa %xmm4,%xmm3 +.byte 0x67 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,304(%r10) + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,320(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,336(%r10) + + pand 64(%rdi),%xmm0 + pand 80(%rdi),%xmm1 + pand 96(%rdi),%xmm2 + movdqa %xmm3,352(%r10) + pand 112(%rdi),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -128(%rdi),%xmm4 + movdqa -112(%rdi),%xmm5 + movdqa -96(%rdi),%xmm2 + pand 112(%r10),%xmm4 + movdqa -80(%rdi),%xmm3 + pand 128(%r10),%xmm5 + por %xmm4,%xmm0 + pand 144(%r10),%xmm2 + por %xmm5,%xmm1 + pand 160(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -64(%rdi),%xmm4 + movdqa -48(%rdi),%xmm5 + movdqa -32(%rdi),%xmm2 + pand 176(%r10),%xmm4 + movdqa -16(%rdi),%xmm3 + pand 192(%r10),%xmm5 + por %xmm4,%xmm0 + pand 208(%r10),%xmm2 + por %xmm5,%xmm1 + pand 224(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa 0(%rdi),%xmm4 + movdqa 16(%rdi),%xmm5 + movdqa 32(%rdi),%xmm2 + pand 240(%r10),%xmm4 + movdqa 48(%rdi),%xmm3 + pand 256(%r10),%xmm5 + por 
%xmm4,%xmm0 + pand 272(%r10),%xmm2 + por %xmm5,%xmm1 + pand 288(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + pxor %xmm1,%xmm0 + + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 + leaq 256(%rdi),%rdi + movq %xmm0,%rdx + leaq 64+32+8(%rsp),%rbx + + movq %rdx,%r9 + mulxq 0(%rsi),%r8,%rax + mulxq 8(%rsi),%r11,%r12 + addq %rax,%r11 + mulxq 16(%rsi),%rax,%r13 + adcq %rax,%r12 + adcq $0,%r13 + mulxq 24(%rsi),%rax,%r14 + + movq %r8,%r15 + imulq 32+8(%rsp),%r8 + xorq %rbp,%rbp + movq %r8,%rdx + + movq %rdi,8+8(%rsp) + + leaq 32(%rsi),%rsi + adcxq %rax,%r13 + adcxq %rbp,%r14 + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%r15 + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + mulxq 16(%rcx),%rax,%r12 + movq 24+8(%rsp),%rdi + movq %r10,-32(%rbx) + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-24(%rbx) + adcxq %rax,%r12 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r12,-16(%rbx) + jmp L$mulx4x_1st + +.p2align 5 +L$mulx4x_1st: + adcxq %rbp,%r15 + mulxq 0(%rsi),%r10,%rax + adcxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 +.byte 0x67,0x67 + movq %r8,%rdx + adcxq %rax,%r13 + adcxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + movq %r11,-32(%rbx) + adoxq %r15,%r13 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r13,-16(%rbx) + + decq %rdi + jnz L$mulx4x_1st + + movq 8(%rsp),%rax + adcq %rbp,%r15 + leaq (%rsi,%rax,1),%rsi + addq %r15,%r14 + movq 8+8(%rsp),%rdi + adcq %rbp,%rbp + movq %r14,-8(%rbx) + jmp L$mulx4x_outer + +.p2align 5 +L$mulx4x_outer: + leaq 16-256(%rbx),%r10 + pxor %xmm4,%xmm4 +.byte 0x67,0x67 + pxor %xmm5,%xmm5 + movdqa 
-128(%rdi),%xmm0 + movdqa -112(%rdi),%xmm1 + movdqa -96(%rdi),%xmm2 + pand 256(%r10),%xmm0 + movdqa -80(%rdi),%xmm3 + pand 272(%r10),%xmm1 + por %xmm0,%xmm4 + pand 288(%r10),%xmm2 + por %xmm1,%xmm5 + pand 304(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%rdi),%xmm0 + movdqa -48(%rdi),%xmm1 + movdqa -32(%rdi),%xmm2 + pand 320(%r10),%xmm0 + movdqa -16(%rdi),%xmm3 + pand 336(%r10),%xmm1 + por %xmm0,%xmm4 + pand 352(%r10),%xmm2 + por %xmm1,%xmm5 + pand 368(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%rdi),%xmm0 + movdqa 16(%rdi),%xmm1 + movdqa 32(%rdi),%xmm2 + pand 384(%r10),%xmm0 + movdqa 48(%rdi),%xmm3 + pand 400(%r10),%xmm1 + por %xmm0,%xmm4 + pand 416(%r10),%xmm2 + por %xmm1,%xmm5 + pand 432(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%rdi),%xmm0 + movdqa 80(%rdi),%xmm1 + movdqa 96(%rdi),%xmm2 + pand 448(%r10),%xmm0 + movdqa 112(%rdi),%xmm3 + pand 464(%r10),%xmm1 + por %xmm0,%xmm4 + pand 480(%r10),%xmm2 + por %xmm1,%xmm5 + pand 496(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + leaq 256(%rdi),%rdi + movq %xmm0,%rdx + + movq %rbp,(%rbx) + leaq 32(%rbx,%rax,1),%rbx + mulxq 0(%rsi),%r8,%r11 + xorq %rbp,%rbp + movq %rdx,%r9 + mulxq 8(%rsi),%r14,%r12 + adoxq -32(%rbx),%r8 + adcxq %r14,%r11 + mulxq 16(%rsi),%r15,%r13 + adoxq -24(%rbx),%r11 + adcxq %r15,%r12 + mulxq 24(%rsi),%rdx,%r14 + adoxq -16(%rbx),%r12 + adcxq %rdx,%r13 + leaq (%rcx,%rax,1),%rcx + leaq 32(%rsi),%rsi + adoxq -8(%rbx),%r13 + adcxq %rbp,%r14 + adoxq %rbp,%r14 + + movq %r8,%r15 + imulq 32+8(%rsp),%r8 + + movq %r8,%rdx + xorq %rbp,%rbp + movq %rdi,8+8(%rsp) + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%r15 + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + mulxq 16(%rcx),%rax,%r12 + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq 24+8(%rsp),%rdi + movq %r10,-32(%rbx) + adcxq %rax,%r12 + movq %r11,-24(%rbx) + adoxq %rbp,%r15 + movq 
%r12,-16(%rbx) + leaq 32(%rcx),%rcx + jmp L$mulx4x_inner + +.p2align 5 +L$mulx4x_inner: + mulxq 0(%rsi),%r10,%rax + adcxq %rbp,%r15 + adoxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq 0(%rbx),%r10 + adoxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq 8(%rbx),%r11 + adoxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 + movq %r8,%rdx + adcxq 16(%rbx),%r12 + adoxq %rax,%r13 + adcxq 24(%rbx),%r13 + adoxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + adcxq %rbp,%r14 + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + adoxq %r15,%r13 + movq %r11,-32(%rbx) + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + leaq 32(%rcx),%rcx + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + movq %r13,-16(%rbx) + + decq %rdi + jnz L$mulx4x_inner + + movq 0+8(%rsp),%rax + adcq %rbp,%r15 + subq 0(%rbx),%rdi + movq 8+8(%rsp),%rdi + movq 16+8(%rsp),%r10 + adcq %r15,%r14 + leaq (%rsi,%rax,1),%rsi + adcq %rbp,%rbp + movq %r14,-8(%rbx) + + cmpq %r10,%rdi + jb L$mulx4x_outer + + movq -8(%rcx),%r10 + movq %rbp,%r8 + movq (%rcx,%rax,1),%r12 + leaq (%rcx,%rax,1),%rbp + movq %rax,%rcx + leaq (%rbx,%rax,1),%rdi + xorl %eax,%eax + xorq %r15,%r15 + subq %r14,%r10 + adcq %r15,%r15 + orq %r15,%r8 + sarq $3+2,%rcx + subq %r8,%rax + movq 56+8(%rsp),%rdx + decq %r12 + movq 8(%rbp),%r13 + xorq %r8,%r8 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp L$sqrx4x_sub_entry + + +.globl _bn_powerx5 +.private_extern _bn_powerx5 + +.p2align 5 +_bn_powerx5: + +_CET_ENDBR + movq %rsp,%rax + + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + +L$powerx5_prologue: + + + + + shll $3,%r9d + leaq (%r9,%r9,2),%r10 + negq %r9 + movq (%r8),%r8 + + + + + + + + + leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp + subq %rdi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb L$pwrx_sp_alt + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp + jmp 
L$pwrx_sp_done + +.p2align 5 +L$pwrx_sp_alt: + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp +L$pwrx_sp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$pwrx_page_walk + jmp L$pwrx_page_walk_done + +L$pwrx_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja L$pwrx_page_walk +L$pwrx_page_walk_done: + + movq %r9,%r10 + negq %r9 + + + + + + + + + + + + + pxor %xmm0,%xmm0 + movq %rdi,%xmm1 + movq %rcx,%xmm2 + movq %r10,%xmm3 + movq %rdx,%xmm4 + movq %r8,32(%rsp) + movq %rax,40(%rsp) + +L$powerx5_body: + + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + + movq %r10,%r9 + movq %rsi,%rdi + movq %xmm2,%rcx + movq %xmm4,%rdx + movq 40(%rsp),%rax + + call mulx4x_internal + + movq 40(%rsp),%rsi + + movq $1,%rax + + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$powerx5_epilogue: + ret + + + +.globl _bn_sqrx8x_internal +.private_extern _bn_sqrx8x_internal +.private_extern _bn_sqrx8x_internal + +.p2align 5 +_bn_sqrx8x_internal: +__bn_sqrx8x_internal: + +_CET_ENDBR + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + leaq 48+8(%rsp),%rdi + leaq (%rsi,%r9,1),%rbp + movq %r9,0+8(%rsp) + movq %rbp,8+8(%rsp) + jmp L$sqr8x_zero_start + +.p2align 5 +.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 +L$sqrx8x_zero: +.byte 0x3e + movdqa %xmm0,0(%rdi) + movdqa %xmm0,16(%rdi) + movdqa %xmm0,32(%rdi) + movdqa %xmm0,48(%rdi) +L$sqr8x_zero_start: + movdqa %xmm0,64(%rdi) + movdqa %xmm0,80(%rdi) + movdqa 
%xmm0,96(%rdi) + movdqa %xmm0,112(%rdi) + leaq 128(%rdi),%rdi + subq $64,%r9 + jnz L$sqrx8x_zero + + movq 0(%rsi),%rdx + + xorq %r10,%r10 + xorq %r11,%r11 + xorq %r12,%r12 + xorq %r13,%r13 + xorq %r14,%r14 + xorq %r15,%r15 + leaq 48+8(%rsp),%rdi + xorq %rbp,%rbp + jmp L$sqrx8x_outer_loop + +.p2align 5 +L$sqrx8x_outer_loop: + mulxq 8(%rsi),%r8,%rax + adcxq %r9,%r8 + adoxq %rax,%r10 + mulxq 16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 +.byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 + adcxq %r11,%r10 + adoxq %rax,%r12 +.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 + adcxq %r12,%r11 + adoxq %rax,%r13 + mulxq 40(%rsi),%r12,%rax + adcxq %r13,%r12 + adoxq %rax,%r14 + mulxq 48(%rsi),%r13,%rax + adcxq %r14,%r13 + adoxq %r15,%rax + mulxq 56(%rsi),%r14,%r15 + movq 8(%rsi),%rdx + adcxq %rax,%r14 + adoxq %rbp,%r15 + adcq 64(%rdi),%r15 + movq %r8,8(%rdi) + movq %r9,16(%rdi) + sbbq %rcx,%rcx + xorq %rbp,%rbp + + + mulxq 16(%rsi),%r8,%rbx + mulxq 24(%rsi),%r9,%rax + adcxq %r10,%r8 + adoxq %rbx,%r9 + mulxq 32(%rsi),%r10,%rbx + adcxq %r11,%r9 + adoxq %rax,%r10 +.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 + adcxq %r12,%r10 + adoxq %rbx,%r11 +.byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 + adcxq %r13,%r11 + adoxq %r14,%r12 +.byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 + movq 16(%rsi),%rdx + adcxq %rax,%r12 + adoxq %rbx,%r13 + adcxq %r15,%r13 + adoxq %rbp,%r14 + adcxq %rbp,%r14 + + movq %r8,24(%rdi) + movq %r9,32(%rdi) + + mulxq 24(%rsi),%r8,%rbx + mulxq 32(%rsi),%r9,%rax + adcxq %r10,%r8 + adoxq %rbx,%r9 + mulxq 40(%rsi),%r10,%rbx + adcxq %r11,%r9 + adoxq %rax,%r10 +.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 + adcxq %r12,%r10 + adoxq %r13,%r11 +.byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 +.byte 0x3e + movq 24(%rsi),%rdx + adcxq %rbx,%r11 + adoxq %rax,%r12 + adcxq %r14,%r12 + movq %r8,40(%rdi) + movq %r9,48(%rdi) + mulxq 32(%rsi),%r8,%rax + adoxq %rbp,%r13 + adcxq %rbp,%r13 + + mulxq 40(%rsi),%r9,%rbx + adcxq %r10,%r8 + adoxq %rax,%r9 + 
mulxq 48(%rsi),%r10,%rax + adcxq %r11,%r9 + adoxq %r12,%r10 + mulxq 56(%rsi),%r11,%r12 + movq 32(%rsi),%rdx + movq 40(%rsi),%r14 + adcxq %rbx,%r10 + adoxq %rax,%r11 + movq 48(%rsi),%r15 + adcxq %r13,%r11 + adoxq %rbp,%r12 + adcxq %rbp,%r12 + + movq %r8,56(%rdi) + movq %r9,64(%rdi) + + mulxq %r14,%r9,%rax + movq 56(%rsi),%r8 + adcxq %r10,%r9 + mulxq %r15,%r10,%rbx + adoxq %rax,%r10 + adcxq %r11,%r10 + mulxq %r8,%r11,%rax + movq %r14,%rdx + adoxq %rbx,%r11 + adcxq %r12,%r11 + + adcxq %rbp,%rax + + mulxq %r15,%r14,%rbx + mulxq %r8,%r12,%r13 + movq %r15,%rdx + leaq 64(%rsi),%rsi + adcxq %r14,%r11 + adoxq %rbx,%r12 + adcxq %rax,%r12 + adoxq %rbp,%r13 + +.byte 0x67,0x67 + mulxq %r8,%r8,%r14 + adcxq %r8,%r13 + adcxq %rbp,%r14 + + cmpq 8+8(%rsp),%rsi + je L$sqrx8x_outer_break + + negq %rcx + movq $-8,%rcx + movq %rbp,%r15 + movq 64(%rdi),%r8 + adcxq 72(%rdi),%r9 + adcxq 80(%rdi),%r10 + adcxq 88(%rdi),%r11 + adcq 96(%rdi),%r12 + adcq 104(%rdi),%r13 + adcq 112(%rdi),%r14 + adcq 120(%rdi),%r15 + leaq (%rsi),%rbp + leaq 128(%rdi),%rdi + sbbq %rax,%rax + + movq -64(%rsi),%rdx + movq %rax,16+8(%rsp) + movq %rdi,24+8(%rsp) + + + xorl %eax,%eax + jmp L$sqrx8x_loop + +.p2align 5 +L$sqrx8x_loop: + movq %r8,%rbx + mulxq 0(%rbp),%rax,%r8 + adcxq %rax,%rbx + adoxq %r9,%r8 + + mulxq 8(%rbp),%rax,%r9 + adcxq %rax,%r8 + adoxq %r10,%r9 + + mulxq 16(%rbp),%rax,%r10 + adcxq %rax,%r9 + adoxq %r11,%r10 + + mulxq 24(%rbp),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + +.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 + adcxq %rax,%r11 + adoxq %r13,%r12 + + mulxq 40(%rbp),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + + mulxq 48(%rbp),%rax,%r14 + movq %rbx,(%rdi,%rcx,8) + movl $0,%ebx + adcxq %rax,%r13 + adoxq %r15,%r14 + +.byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 + movq 8(%rsi,%rcx,8),%rdx + adcxq %rax,%r14 + adoxq %rbx,%r15 + adcxq %rbx,%r15 + +.byte 0x67 + incq %rcx + jnz L$sqrx8x_loop + + leaq 64(%rbp),%rbp + movq $-8,%rcx + cmpq 8+8(%rsp),%rbp + je L$sqrx8x_break + + subq 
16+8(%rsp),%rbx +.byte 0x66 + movq -64(%rsi),%rdx + adcxq 0(%rdi),%r8 + adcxq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + leaq 64(%rdi),%rdi +.byte 0x67 + sbbq %rax,%rax + xorl %ebx,%ebx + movq %rax,16+8(%rsp) + jmp L$sqrx8x_loop + +.p2align 5 +L$sqrx8x_break: + xorq %rbp,%rbp + subq 16+8(%rsp),%rbx + adcxq %rbp,%r8 + movq 24+8(%rsp),%rcx + adcxq %rbp,%r9 + movq 0(%rsi),%rdx + adcq $0,%r10 + movq %r8,0(%rdi) + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + cmpq %rcx,%rdi + je L$sqrx8x_outer_loop + + movq %r9,8(%rdi) + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + movq 40(%rcx),%r13 + movq %r14,48(%rdi) + movq 48(%rcx),%r14 + movq %r15,56(%rdi) + movq 56(%rcx),%r15 + movq %rcx,%rdi + jmp L$sqrx8x_outer_loop + +.p2align 5 +L$sqrx8x_outer_break: + movq %r9,72(%rdi) + movq %xmm3,%rcx + movq %r10,80(%rdi) + movq %r11,88(%rdi) + movq %r12,96(%rdi) + movq %r13,104(%rdi) + movq %r14,112(%rdi) + leaq 48+8(%rsp),%rdi + movq (%rsi,%rcx,1),%rdx + + movq 8(%rdi),%r11 + xorq %r10,%r10 + movq 0+8(%rsp),%r9 + adoxq %r11,%r11 + movq 16(%rdi),%r12 + movq 24(%rdi),%r13 + + +.p2align 5 +L$sqrx4x_shift_n_add: + mulxq %rdx,%rax,%rbx + adoxq %r12,%r12 + adcxq %r10,%rax +.byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 +.byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 + adoxq %r13,%r13 + adcxq %r11,%rbx + movq 40(%rdi),%r11 + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + + mulxq %rdx,%rax,%rbx + adoxq %r10,%r10 + adcxq %r12,%rax + movq 16(%rsi,%rcx,1),%rdx + movq 48(%rdi),%r12 + adoxq %r11,%r11 + adcxq %r13,%rbx + movq 56(%rdi),%r13 + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + + mulxq %rdx,%rax,%rbx + adoxq %r12,%r12 + adcxq %r10,%rax + movq 24(%rsi,%rcx,1),%rdx + leaq 32(%rcx),%rcx + movq 64(%rdi),%r10 + adoxq %r13,%r13 + adcxq %r11,%rbx + movq 72(%rdi),%r11 + movq 
%rax,32(%rdi) + movq %rbx,40(%rdi) + + mulxq %rdx,%rax,%rbx + adoxq %r10,%r10 + adcxq %r12,%rax + jrcxz L$sqrx4x_shift_n_add_break +.byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 + adoxq %r11,%r11 + adcxq %r13,%rbx + movq 80(%rdi),%r12 + movq 88(%rdi),%r13 + movq %rax,48(%rdi) + movq %rbx,56(%rdi) + leaq 64(%rdi),%rdi + nop + jmp L$sqrx4x_shift_n_add + +.p2align 5 +L$sqrx4x_shift_n_add_break: + adcxq %r13,%rbx + movq %rax,48(%rdi) + movq %rbx,56(%rdi) + leaq 64(%rdi),%rdi + movq %xmm2,%rbp +__bn_sqrx8x_reduction: + xorl %eax,%eax + movq 32+8(%rsp),%rbx + movq 48+8(%rsp),%rdx + leaq -64(%rbp,%r9,1),%rcx + + movq %rcx,0+8(%rsp) + movq %rdi,8+8(%rsp) + + leaq 48+8(%rsp),%rdi + jmp L$sqrx8x_reduction_loop + +.p2align 5 +L$sqrx8x_reduction_loop: + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq %rdx,%r8 + imulq %rbx,%rdx + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq %rax,24+8(%rsp) + + leaq 64(%rdi),%rdi + xorq %rsi,%rsi + movq $-8,%rcx + jmp L$sqrx8x_reduce + +.p2align 5 +L$sqrx8x_reduce: + movq %r8,%rbx + mulxq 0(%rbp),%rax,%r8 + adcxq %rbx,%rax + adoxq %r9,%r8 + + mulxq 8(%rbp),%rbx,%r9 + adcxq %rbx,%r8 + adoxq %r10,%r9 + + mulxq 16(%rbp),%rbx,%r10 + adcxq %rbx,%r9 + adoxq %r11,%r10 + + mulxq 24(%rbp),%rbx,%r11 + adcxq %rbx,%r10 + adoxq %r12,%r11 + +.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 + movq %rdx,%rax + movq %r8,%rdx + adcxq %rbx,%r11 + adoxq %r13,%r12 + + mulxq 32+8(%rsp),%rbx,%rdx + movq %rax,%rdx + movq %rax,64+48+8(%rsp,%rcx,8) + + mulxq 40(%rbp),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + + mulxq 48(%rbp),%rax,%r14 + adcxq %rax,%r13 + adoxq %r15,%r14 + + mulxq 56(%rbp),%rax,%r15 + movq %rbx,%rdx + adcxq %rax,%r14 + adoxq %rsi,%r15 + adcxq %rsi,%r15 + +.byte 0x67,0x67,0x67 + incq %rcx + jnz L$sqrx8x_reduce + + movq %rsi,%rax + cmpq 0+8(%rsp),%rbp + jae L$sqrx8x_no_tail + + movq 48+8(%rsp),%rdx + addq 0(%rdi),%r8 + leaq 64(%rbp),%rbp + movq $-8,%rcx + adcxq 8(%rdi),%r9 + adcxq 
16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + leaq 64(%rdi),%rdi + sbbq %rax,%rax + + xorq %rsi,%rsi + movq %rax,16+8(%rsp) + jmp L$sqrx8x_tail + +.p2align 5 +L$sqrx8x_tail: + movq %r8,%rbx + mulxq 0(%rbp),%rax,%r8 + adcxq %rax,%rbx + adoxq %r9,%r8 + + mulxq 8(%rbp),%rax,%r9 + adcxq %rax,%r8 + adoxq %r10,%r9 + + mulxq 16(%rbp),%rax,%r10 + adcxq %rax,%r9 + adoxq %r11,%r10 + + mulxq 24(%rbp),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + +.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 + adcxq %rax,%r11 + adoxq %r13,%r12 + + mulxq 40(%rbp),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + + mulxq 48(%rbp),%rax,%r14 + adcxq %rax,%r13 + adoxq %r15,%r14 + + mulxq 56(%rbp),%rax,%r15 + movq 72+48+8(%rsp,%rcx,8),%rdx + adcxq %rax,%r14 + adoxq %rsi,%r15 + movq %rbx,(%rdi,%rcx,8) + movq %r8,%rbx + adcxq %rsi,%r15 + + incq %rcx + jnz L$sqrx8x_tail + + cmpq 0+8(%rsp),%rbp + jae L$sqrx8x_tail_done + + subq 16+8(%rsp),%rsi + movq 48+8(%rsp),%rdx + leaq 64(%rbp),%rbp + adcq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + leaq 64(%rdi),%rdi + sbbq %rax,%rax + subq $8,%rcx + + xorq %rsi,%rsi + movq %rax,16+8(%rsp) + jmp L$sqrx8x_tail + +.p2align 5 +L$sqrx8x_tail_done: + xorq %rax,%rax + addq 24+8(%rsp),%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rax + + subq 16+8(%rsp),%rsi +L$sqrx8x_no_tail: + adcq 0(%rdi),%r8 + movq %xmm3,%rcx + adcq 8(%rdi),%r9 + movq 56(%rbp),%rsi + movq %xmm2,%rbp + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + adcq $0,%rax + + movq 32+8(%rsp),%rbx + movq 64(%rdi,%rcx,1),%rdx + + movq %r8,0(%rdi) + leaq 64(%rdi),%r8 + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq 
%r14,48(%rdi) + movq %r15,56(%rdi) + + leaq 64(%rdi,%rcx,1),%rdi + cmpq 8+8(%rsp),%r8 + jb L$sqrx8x_reduction_loop + ret + + +.p2align 5 + +__bn_postx4x_internal: + + movq 0(%rbp),%r12 + movq %rcx,%r10 + movq %rcx,%r9 + negq %rax + sarq $3+2,%rcx + + movq %xmm1,%rdx + movq %xmm1,%rsi + decq %r12 + movq 8(%rbp),%r13 + xorq %r8,%r8 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp L$sqrx4x_sub_entry + +.p2align 4 +L$sqrx4x_sub: + movq 0(%rbp),%r12 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 +L$sqrx4x_sub_entry: + andnq %rax,%r12,%r12 + leaq 32(%rbp),%rbp + andnq %rax,%r13,%r13 + andnq %rax,%r14,%r14 + andnq %rax,%r15,%r15 + + negq %r8 + adcq 0(%rdi),%r12 + adcq 8(%rdi),%r13 + adcq 16(%rdi),%r14 + adcq 24(%rdi),%r15 + movq %r12,0(%rdx) + leaq 32(%rdi),%rdi + movq %r13,8(%rdx) + sbbq %r8,%r8 + movq %r14,16(%rdx) + movq %r15,24(%rdx) + leaq 32(%rdx),%rdx + + incq %rcx + jnz L$sqrx4x_sub + + negq %r9 + + ret + + +.globl _bn_scatter5 +.private_extern _bn_scatter5 + +.p2align 4 +_bn_scatter5: + +_CET_ENDBR + cmpl $0,%esi + jz L$scatter_epilogue + + + + + + + + + + leaq (%rdx,%rcx,8),%rdx +L$scatter: + movq (%rdi),%rax + leaq 8(%rdi),%rdi + movq %rax,(%rdx) + leaq 256(%rdx),%rdx + subl $1,%esi + jnz L$scatter +L$scatter_epilogue: + ret + + + +.globl _bn_gather5 +.private_extern _bn_gather5 + +.p2align 5 +_bn_gather5: + +L$SEH_begin_bn_gather5: +_CET_ENDBR + +.byte 0x4c,0x8d,0x14,0x24 + +.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 + leaq L$inc(%rip),%rax + andq $-16,%rsp + + movd %ecx,%xmm5 + movdqa 0(%rax),%xmm0 + movdqa 16(%rax),%xmm1 + leaq 128(%rdx),%r11 + leaq 128(%rsp),%rax + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,-128(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,-112(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,-96(%rax) + 
movdqa %xmm4,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,-80(%rax) + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,-64(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,-48(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,-32(%rax) + movdqa %xmm4,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,-16(%rax) + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,0(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,16(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,32(%rax) + movdqa %xmm4,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,48(%rax) + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,64(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,80(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,96(%rax) + movdqa %xmm4,%xmm2 + movdqa %xmm3,112(%rax) + jmp L$gather + +.p2align 5 +L$gather: + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + movdqa -128(%r11),%xmm0 + movdqa -112(%r11),%xmm1 + movdqa -96(%r11),%xmm2 + pand -128(%rax),%xmm0 + movdqa -80(%r11),%xmm3 + pand -112(%rax),%xmm1 + por %xmm0,%xmm4 + pand -96(%rax),%xmm2 + por %xmm1,%xmm5 + pand -80(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%r11),%xmm0 + movdqa -48(%r11),%xmm1 + movdqa -32(%r11),%xmm2 + pand -64(%rax),%xmm0 + movdqa -16(%r11),%xmm3 + pand -48(%rax),%xmm1 + por %xmm0,%xmm4 + pand -32(%rax),%xmm2 + por %xmm1,%xmm5 + pand -16(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%r11),%xmm0 + movdqa 16(%r11),%xmm1 + movdqa 32(%r11),%xmm2 + pand 0(%rax),%xmm0 + movdqa 48(%r11),%xmm3 + pand 16(%rax),%xmm1 + por %xmm0,%xmm4 + pand 32(%rax),%xmm2 + por %xmm1,%xmm5 + pand 48(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%r11),%xmm0 + 
movdqa 80(%r11),%xmm1 + movdqa 96(%r11),%xmm2 + pand 64(%rax),%xmm0 + movdqa 112(%r11),%xmm3 + pand 80(%rax),%xmm1 + por %xmm0,%xmm4 + pand 96(%rax),%xmm2 + por %xmm1,%xmm5 + pand 112(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + leaq 256(%r11),%r11 + + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + movq %xmm0,(%rdi) + leaq 8(%rdi),%rdi + subl $1,%esi + jnz L$gather + + leaq (%r10),%rsp + + ret +L$SEH_end_bn_gather5: + + +.section __DATA,__const +.p2align 6 +L$inc: +.long 0,0, 1,1 +.long 2,2, 2,2 +.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.text +#endif diff --git a/third_party/boringssl/gen/bcm/x86_64-mont5-linux.S b/third_party/boringssl/gen/bcm/x86_64-mont5-linux.S new file mode 100644 index 00000000..a147041d --- /dev/null +++ b/third_party/boringssl/gen/bcm/x86_64-mont5-linux.S @@ -0,0 +1,3626 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.text + +.globl bn_mul_mont_gather5_nohw +.hidden bn_mul_mont_gather5_nohw +.type bn_mul_mont_gather5_nohw,@function +.align 64 +bn_mul_mont_gather5_nohw: +.cfi_startproc +_CET_ENDBR + + + movl %r9d,%r9d + movq %rsp,%rax +.cfi_def_cfa_register %rax + movd 8(%rsp),%xmm5 + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 + + negq %r9 + movq %rsp,%r11 + leaq -280(%rsp,%r9,8),%r10 + negq %r9 + andq $-1024,%r10 + + + + + + + + + + subq %r10,%r11 + andq $-4096,%r11 + leaq (%r10,%r11,1),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja .Lmul_page_walk + jmp .Lmul_page_walk_done + +.Lmul_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r11 + cmpq %r10,%rsp + ja .Lmul_page_walk +.Lmul_page_walk_done: + + leaq .Linc(%rip),%r10 + movq %rax,8(%rsp,%r9,8) +.cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08 +.Lmul_body: + + leaq 128(%rdx),%r12 + movdqa 0(%r10),%xmm0 + movdqa 16(%r10),%xmm1 + leaq 24-112(%rsp,%r9,8),%r10 + andq $-16,%r10 + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 +.byte 0x67 + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,112(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,128(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,144(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,160(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,176(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,192(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,208(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + 
pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,224(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,240(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,256(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,272(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,288(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,304(%r10) + + paddd %xmm2,%xmm3 +.byte 0x67 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,320(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,336(%r10) + pand 64(%r12),%xmm0 + + pand 80(%r12),%xmm1 + pand 96(%r12),%xmm2 + movdqa %xmm3,352(%r10) + pand 112(%r12),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -128(%r12),%xmm4 + movdqa -112(%r12),%xmm5 + movdqa -96(%r12),%xmm2 + pand 112(%r10),%xmm4 + movdqa -80(%r12),%xmm3 + pand 128(%r10),%xmm5 + por %xmm4,%xmm0 + pand 144(%r10),%xmm2 + por %xmm5,%xmm1 + pand 160(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -64(%r12),%xmm4 + movdqa -48(%r12),%xmm5 + movdqa -32(%r12),%xmm2 + pand 176(%r10),%xmm4 + movdqa -16(%r12),%xmm3 + pand 192(%r10),%xmm5 + por %xmm4,%xmm0 + pand 208(%r10),%xmm2 + por %xmm5,%xmm1 + pand 224(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa 0(%r12),%xmm4 + movdqa 16(%r12),%xmm5 + movdqa 32(%r12),%xmm2 + pand 240(%r10),%xmm4 + movdqa 48(%r12),%xmm3 + pand 256(%r10),%xmm5 + por %xmm4,%xmm0 + pand 272(%r10),%xmm2 + por %xmm5,%xmm1 + pand 288(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + por %xmm1,%xmm0 + + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 + leaq 256(%r12),%r12 + movq %xmm0,%rbx + + movq (%r8),%r8 + movq (%rsi),%rax + + xorq %r14,%r14 + xorq %r15,%r15 + + movq %r8,%rbp + mulq %rbx + movq %rax,%r10 + movq (%rcx),%rax + + imulq %r10,%rbp + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq %rdx,%r13 + + leaq 1(%r15),%r15 + jmp .L1st_enter + +.align 16 +.L1st: + addq 
%rax,%r13 + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r11,%r13 + movq %r10,%r11 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + +.L1st_enter: + mulq %rbx + addq %rax,%r11 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + leaq 1(%r15),%r15 + movq %rdx,%r10 + + mulq %rbp + cmpq %r9,%r15 + jne .L1st + + + addq %rax,%r13 + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %r13,-16(%rsp,%r9,8) + movq %rdx,%r13 + movq %r10,%r11 + + xorq %rdx,%rdx + addq %r11,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r9,8) + movq %rdx,(%rsp,%r9,8) + + leaq 1(%r14),%r14 + jmp .Louter +.align 16 +.Louter: + leaq 24+128(%rsp,%r9,8),%rdx + andq $-16,%rdx + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + movdqa -128(%r12),%xmm0 + movdqa -112(%r12),%xmm1 + movdqa -96(%r12),%xmm2 + movdqa -80(%r12),%xmm3 + pand -128(%rdx),%xmm0 + pand -112(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand -80(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%r12),%xmm0 + movdqa -48(%r12),%xmm1 + movdqa -32(%r12),%xmm2 + movdqa -16(%r12),%xmm3 + pand -64(%rdx),%xmm0 + pand -48(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -32(%rdx),%xmm2 + por %xmm1,%xmm5 + pand -16(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%r12),%xmm0 + movdqa 16(%r12),%xmm1 + movdqa 32(%r12),%xmm2 + movdqa 48(%r12),%xmm3 + pand 0(%rdx),%xmm0 + pand 16(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 32(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 48(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%r12),%xmm0 + movdqa 80(%r12),%xmm1 + movdqa 96(%r12),%xmm2 + movdqa 112(%r12),%xmm3 + pand 64(%rdx),%xmm0 + pand 80(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 112(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + leaq 256(%r12),%r12 + + movq (%rsi),%rax + movq %xmm0,%rbx + + xorq %r15,%r15 + movq %r8,%rbp + movq (%rsp),%r10 + + mulq %rbx + addq %rax,%r10 + movq (%rcx),%rax + adcq $0,%rdx + + imulq %r10,%rbp + 
movq %rdx,%r11 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi),%rax + adcq $0,%rdx + movq 8(%rsp),%r10 + movq %rdx,%r13 + + leaq 1(%r15),%r15 + jmp .Linner_enter + +.align 16 +.Linner: + addq %rax,%r13 + movq (%rsi,%r15,8),%rax + adcq $0,%rdx + addq %r10,%r13 + movq (%rsp,%r15,8),%r10 + adcq $0,%rdx + movq %r13,-16(%rsp,%r15,8) + movq %rdx,%r13 + +.Linner_enter: + mulq %rbx + addq %rax,%r11 + movq (%rcx,%r15,8),%rax + adcq $0,%rdx + addq %r11,%r10 + movq %rdx,%r11 + adcq $0,%r11 + leaq 1(%r15),%r15 + + mulq %rbp + cmpq %r9,%r15 + jne .Linner + + addq %rax,%r13 + adcq $0,%rdx + addq %r10,%r13 + movq (%rsp,%r9,8),%r10 + adcq $0,%rdx + movq %r13,-16(%rsp,%r9,8) + movq %rdx,%r13 + + xorq %rdx,%rdx + addq %r11,%r13 + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-8(%rsp,%r9,8) + movq %rdx,(%rsp,%r9,8) + + leaq 1(%r14),%r14 + cmpq %r9,%r14 + jb .Louter + + xorq %r14,%r14 + movq (%rsp),%rax + leaq (%rsp),%rsi + movq %r9,%r15 + jmp .Lsub +.align 16 +.Lsub: sbbq (%rcx,%r14,8),%rax + movq %rax,(%rdi,%r14,8) + movq 8(%rsi,%r14,8),%rax + leaq 1(%r14),%r14 + decq %r15 + jnz .Lsub + + sbbq $0,%rax + movq $-1,%rbx + xorq %rax,%rbx + xorq %r14,%r14 + movq %r9,%r15 + +.Lcopy: + movq (%rdi,%r14,8),%rcx + movq (%rsp,%r14,8),%rdx + andq %rbx,%rcx + andq %rax,%rdx + movq %r14,(%rsp,%r14,8) + orq %rcx,%rdx + movq %rdx,(%rdi,%r14,8) + leaq 1(%r14),%r14 + subq $1,%r15 + jnz .Lcopy + + movq 8(%rsp,%r9,8),%rsi +.cfi_def_cfa %rsi,8 + movq $1,%rax + + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lmul_epilogue: + ret +.cfi_endproc +.size bn_mul_mont_gather5_nohw,.-bn_mul_mont_gather5_nohw +.globl bn_mul4x_mont_gather5 +.hidden bn_mul4x_mont_gather5 +.type bn_mul4x_mont_gather5,@function +.align 32 +bn_mul4x_mont_gather5: +.cfi_startproc 
+_CET_ENDBR +.byte 0x67 + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 +.Lmul4x_prologue: + +.byte 0x67 + + + + shll $3,%r9d + leaq (%r9,%r9,2),%r10 + negq %r9 + + + + + + + + + + + leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp + subq %rdi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb .Lmul4xsp_alt + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp + jmp .Lmul4xsp_done + +.align 32 +.Lmul4xsp_alt: + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp +.Lmul4xsp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lmul4x_page_walk + jmp .Lmul4x_page_walk_done + +.Lmul4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lmul4x_page_walk +.Lmul4x_page_walk_done: + + negq %r9 + + movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 +.Lmul4x_body: + + call mul4x_internal + + movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq $1,%rax + + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lmul4x_epilogue: + ret +.cfi_endproc +.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 + +.type mul4x_internal,@function +.align 32 +mul4x_internal: +.cfi_startproc + shlq $5,%r9 + movd 8(%rax),%xmm5 + leaq .Linc(%rip),%rax + leaq 128(%rdx,%r9,1),%r13 + shrq $5,%r9 + movdqa 0(%rax),%xmm0 + movdqa 16(%rax),%xmm1 + leaq 88-112(%rsp,%r9,1),%r10 + leaq 128(%rdx),%r12 + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 +.byte 0x67,0x67 + movdqa %xmm1,%xmm2 + paddd 
%xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 +.byte 0x67 + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,112(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,128(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,144(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,160(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,176(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,192(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,208(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,224(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,240(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,256(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,272(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,288(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,304(%r10) + + paddd %xmm2,%xmm3 +.byte 0x67 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,320(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,336(%r10) + pand 64(%r12),%xmm0 + + pand 80(%r12),%xmm1 + pand 96(%r12),%xmm2 + movdqa %xmm3,352(%r10) + pand 112(%r12),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -128(%r12),%xmm4 + movdqa -112(%r12),%xmm5 + movdqa -96(%r12),%xmm2 + pand 112(%r10),%xmm4 + movdqa -80(%r12),%xmm3 + pand 128(%r10),%xmm5 + por %xmm4,%xmm0 + pand 144(%r10),%xmm2 + por %xmm5,%xmm1 + pand 160(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -64(%r12),%xmm4 + movdqa -48(%r12),%xmm5 + movdqa -32(%r12),%xmm2 + pand 176(%r10),%xmm4 + movdqa -16(%r12),%xmm3 + pand 192(%r10),%xmm5 + por %xmm4,%xmm0 + pand 208(%r10),%xmm2 + por %xmm5,%xmm1 + pand 224(%r10),%xmm3 + por 
%xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa 0(%r12),%xmm4 + movdqa 16(%r12),%xmm5 + movdqa 32(%r12),%xmm2 + pand 240(%r10),%xmm4 + movdqa 48(%r12),%xmm3 + pand 256(%r10),%xmm5 + por %xmm4,%xmm0 + pand 272(%r10),%xmm2 + por %xmm5,%xmm1 + pand 288(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + por %xmm1,%xmm0 + + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 + leaq 256(%r12),%r12 + movq %xmm0,%rbx + + movq %r13,16+8(%rsp) + movq %rdi,56+8(%rsp) + + movq (%r8),%r8 + movq (%rsi),%rax + leaq (%rsi,%r9,1),%rsi + negq %r9 + + movq %r8,%rbp + mulq %rbx + movq %rax,%r10 + movq (%rcx),%rax + + imulq %r10,%rbp + leaq 64+8(%rsp),%r14 + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi,%r9,1),%rax + adcq $0,%rdx + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi,%r9,1),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 32(%r9),%r15 + leaq 32(%rcx),%rcx + adcq $0,%rdx + movq %rdi,(%r14) + movq %rdx,%r13 + jmp .L1st4x + +.align 32 +.L1st4x: + mulq %rbx + addq %rax,%r10 + movq -16(%rcx),%rax + leaq 32(%r14),%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%r14) + movq %rdx,%r13 + + mulq %rbx + addq %rax,%r10 + movq 0(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq 8(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-8(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 32(%rcx),%rcx + adcq $0,%rdx + movq %rdi,(%r14) + movq %rdx,%r13 + + addq $32,%r15 
+ jnz .L1st4x + + mulq %rbx + addq %rax,%r10 + movq -16(%rcx),%rax + leaq 32(%r14),%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %r13,-24(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx),%rax + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r9,1),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %rdi,-16(%r14) + movq %rdx,%r13 + + leaq (%rcx,%r9,1),%rcx + + xorq %rdi,%rdi + addq %r10,%r13 + adcq $0,%rdi + movq %r13,-8(%r14) + + jmp .Louter4x + +.align 32 +.Louter4x: + leaq 16+128(%r14),%rdx + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + movdqa -128(%r12),%xmm0 + movdqa -112(%r12),%xmm1 + movdqa -96(%r12),%xmm2 + movdqa -80(%r12),%xmm3 + pand -128(%rdx),%xmm0 + pand -112(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand -80(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%r12),%xmm0 + movdqa -48(%r12),%xmm1 + movdqa -32(%r12),%xmm2 + movdqa -16(%r12),%xmm3 + pand -64(%rdx),%xmm0 + pand -48(%rdx),%xmm1 + por %xmm0,%xmm4 + pand -32(%rdx),%xmm2 + por %xmm1,%xmm5 + pand -16(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%r12),%xmm0 + movdqa 16(%r12),%xmm1 + movdqa 32(%r12),%xmm2 + movdqa 48(%r12),%xmm3 + pand 0(%rdx),%xmm0 + pand 16(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 32(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 48(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%r12),%xmm0 + movdqa 80(%r12),%xmm1 + movdqa 96(%r12),%xmm2 + movdqa 112(%r12),%xmm3 + pand 64(%rdx),%xmm0 + pand 80(%rdx),%xmm1 + por %xmm0,%xmm4 + pand 96(%rdx),%xmm2 + por %xmm1,%xmm5 + pand 112(%rdx),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + leaq 256(%r12),%r12 + movq %xmm0,%rbx + + movq (%r14,%r9,1),%r10 + movq %r8,%rbp + mulq %rbx + addq %rax,%r10 + movq (%rcx),%rax + adcq $0,%rdx + + imulq %r10,%rbp + movq %rdx,%r11 + movq %rdi,(%r14) 
+ + leaq (%r14,%r9,1),%r14 + + mulq %rbp + addq %rax,%r10 + movq 8(%rsi,%r9,1),%rax + adcq $0,%rdx + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx),%rax + adcq $0,%rdx + addq 8(%r14),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi,%r9,1),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 32(%r9),%r15 + leaq 32(%rcx),%rcx + adcq $0,%rdx + movq %rdx,%r13 + jmp .Linner4x + +.align 32 +.Linner4x: + mulq %rbx + addq %rax,%r10 + movq -16(%rcx),%rax + adcq $0,%rdx + addq 16(%r14),%r10 + leaq 32(%r14),%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdi,-32(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq -8(%rcx),%rax + adcq $0,%rdx + addq -8(%r14),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq (%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %r13,-24(%r14) + movq %rdx,%r13 + + mulq %rbx + addq %rax,%r10 + movq 0(%rcx),%rax + adcq $0,%rdx + addq (%r14),%r10 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq 8(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdi,-16(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq 8(%rcx),%rax + adcq $0,%rdx + addq 8(%r14),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + addq %rax,%rdi + movq 16(%rsi,%r15,1),%rax + adcq $0,%rdx + addq %r11,%rdi + leaq 32(%rcx),%rcx + adcq $0,%rdx + movq %r13,-8(%r14) + movq %rdx,%r13 + + addq $32,%r15 + jnz .Linner4x + + mulq %rbx + addq %rax,%r10 + movq -16(%rcx),%rax + adcq $0,%rdx + addq 16(%r14),%r10 + leaq 32(%r14),%r14 + adcq $0,%rdx + movq %rdx,%r11 + + mulq %rbp + addq %rax,%r13 + movq -8(%rsi),%rax + adcq $0,%rdx + addq %r10,%r13 + adcq $0,%rdx + movq %rdi,-32(%r14) + movq %rdx,%rdi + + mulq %rbx + addq %rax,%r11 + movq %rbp,%rax + movq -8(%rcx),%rbp + adcq $0,%rdx + addq -8(%r14),%r11 + adcq $0,%rdx + movq %rdx,%r10 + + mulq %rbp + 
addq %rax,%rdi + movq (%rsi,%r9,1),%rax + adcq $0,%rdx + addq %r11,%rdi + adcq $0,%rdx + movq %r13,-24(%r14) + movq %rdx,%r13 + + movq %rdi,-16(%r14) + leaq (%rcx,%r9,1),%rcx + + xorq %rdi,%rdi + addq %r10,%r13 + adcq $0,%rdi + addq (%r14),%r13 + adcq $0,%rdi + movq %r13,-8(%r14) + + cmpq 16+8(%rsp),%r12 + jb .Louter4x + xorq %rax,%rax + subq %r13,%rbp + adcq %r15,%r15 + orq %r15,%rdi + subq %rdi,%rax + leaq (%r14,%r9,1),%rbx + movq (%rcx),%r12 + leaq (%rcx),%rbp + movq %r9,%rcx + sarq $3+2,%rcx + movq 56+8(%rsp),%rdi + decq %r12 + xorq %r10,%r10 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp .Lsqr4x_sub_entry +.cfi_endproc +.size mul4x_internal,.-mul4x_internal +.globl bn_power5_nohw +.hidden bn_power5_nohw +.type bn_power5_nohw,@function +.align 32 +bn_power5_nohw: +.cfi_startproc +_CET_ENDBR + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset %r15,-56 +.Lpower5_prologue: + + + + + shll $3,%r9d + leal (%r9,%r9,2),%r10d + negq %r9 + movq (%r8),%r8 + + + + + + + + + leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp + subq %rdi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb .Lpwr_sp_alt + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp + jmp .Lpwr_sp_done + +.align 32 +.Lpwr_sp_alt: + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp +.Lpwr_sp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lpwr_page_walk + jmp .Lpwr_page_walk_done + +.Lpwr_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lpwr_page_walk +.Lpwr_page_walk_done: + + movq %r9,%r10 + negq %r9 + + + + + + + + + + + movq %r8,32(%rsp) + movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 
+.Lpower5_body: + movq %rdi,%xmm1 + movq %rcx,%xmm2 + movq %r10,%xmm3 + movq %rdx,%xmm4 + + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + + movq %xmm2,%rcx + movq %xmm4,%rdx + movq %rsi,%rdi + movq 40(%rsp),%rax + leaq 32(%rsp),%r8 + + call mul4x_internal + + movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq $1,%rax + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpower5_epilogue: + ret +.cfi_endproc +.size bn_power5_nohw,.-bn_power5_nohw + +.globl bn_sqr8x_internal +.hidden bn_sqr8x_internal +.hidden bn_sqr8x_internal +.type bn_sqr8x_internal,@function +.align 32 +bn_sqr8x_internal: +__bn_sqr8x_internal: +.cfi_startproc +_CET_ENDBR + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + leaq 32(%r10),%rbp + leaq (%rsi,%r9,1),%rsi + + movq %r9,%rcx + + + movq -32(%rsi,%rbp,1),%r14 + leaq 48+8(%rsp,%r9,2),%rdi + movq -24(%rsi,%rbp,1),%rax + leaq -32(%rdi,%rbp,1),%rdi + movq -16(%rsi,%rbp,1),%rbx + movq %rax,%r15 + + mulq %r14 + movq %rax,%r10 + movq %rbx,%rax + movq %rdx,%r11 + movq %r10,-24(%rdi,%rbp,1) + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + adcq $0,%rdx + movq %r11,-16(%rdi,%rbp,1) + movq %rdx,%r10 + + + movq -8(%rsi,%rbp,1),%rbx + mulq %r15 + movq %rax,%r12 + movq %rbx,%rax + movq %rdx,%r13 + + leaq (%rbp),%rcx + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq %rdx,%r11 + adcq $0,%r11 + addq %r12,%r10 + adcq $0,%r11 + movq %r10,-8(%rdi,%rcx,1) + jmp .Lsqr4x_1st + +.align 32 +.Lsqr4x_1st: + movq 
(%rsi,%rcx,1),%rbx + mulq %r15 + addq %rax,%r13 + movq %rbx,%rax + movq %rdx,%r12 + adcq $0,%r12 + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + movq 8(%rsi,%rcx,1),%rbx + movq %rdx,%r10 + adcq $0,%r10 + addq %r13,%r11 + adcq $0,%r10 + + + mulq %r15 + addq %rax,%r12 + movq %rbx,%rax + movq %r11,(%rdi,%rcx,1) + movq %rdx,%r13 + adcq $0,%r13 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq 16(%rsi,%rcx,1),%rbx + movq %rdx,%r11 + adcq $0,%r11 + addq %r12,%r10 + adcq $0,%r11 + + mulq %r15 + addq %rax,%r13 + movq %rbx,%rax + movq %r10,8(%rdi,%rcx,1) + movq %rdx,%r12 + adcq $0,%r12 + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + movq 24(%rsi,%rcx,1),%rbx + movq %rdx,%r10 + adcq $0,%r10 + addq %r13,%r11 + adcq $0,%r10 + + + mulq %r15 + addq %rax,%r12 + movq %rbx,%rax + movq %r11,16(%rdi,%rcx,1) + movq %rdx,%r13 + adcq $0,%r13 + leaq 32(%rcx),%rcx + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq %rdx,%r11 + adcq $0,%r11 + addq %r12,%r10 + adcq $0,%r11 + movq %r10,-8(%rdi,%rcx,1) + + cmpq $0,%rcx + jne .Lsqr4x_1st + + mulq %r15 + addq %rax,%r13 + leaq 16(%rbp),%rbp + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + + movq %r13,(%rdi) + movq %rdx,%r12 + movq %rdx,8(%rdi) + jmp .Lsqr4x_outer + +.align 32 +.Lsqr4x_outer: + movq -32(%rsi,%rbp,1),%r14 + leaq 48+8(%rsp,%r9,2),%rdi + movq -24(%rsi,%rbp,1),%rax + leaq -32(%rdi,%rbp,1),%rdi + movq -16(%rsi,%rbp,1),%rbx + movq %rax,%r15 + + mulq %r14 + movq -24(%rdi,%rbp,1),%r10 + addq %rax,%r10 + movq %rbx,%rax + adcq $0,%rdx + movq %r10,-24(%rdi,%rbp,1) + movq %rdx,%r11 + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + adcq $0,%rdx + addq -16(%rdi,%rbp,1),%r11 + movq %rdx,%r10 + adcq $0,%r10 + movq %r11,-16(%rdi,%rbp,1) + + xorq %r12,%r12 + + movq -8(%rsi,%rbp,1),%rbx + mulq %r15 + addq %rax,%r12 + movq %rbx,%rax + adcq $0,%rdx + addq -8(%rdi,%rbp,1),%r12 + movq %rdx,%r13 + adcq $0,%r13 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + adcq $0,%rdx + addq %r12,%r10 + movq %rdx,%r11 + adcq $0,%r11 + movq 
%r10,-8(%rdi,%rbp,1) + + leaq (%rbp),%rcx + jmp .Lsqr4x_inner + +.align 32 +.Lsqr4x_inner: + movq (%rsi,%rcx,1),%rbx + mulq %r15 + addq %rax,%r13 + movq %rbx,%rax + movq %rdx,%r12 + adcq $0,%r12 + addq (%rdi,%rcx,1),%r13 + adcq $0,%r12 + +.byte 0x67 + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + movq 8(%rsi,%rcx,1),%rbx + movq %rdx,%r10 + adcq $0,%r10 + addq %r13,%r11 + adcq $0,%r10 + + mulq %r15 + addq %rax,%r12 + movq %r11,(%rdi,%rcx,1) + movq %rbx,%rax + movq %rdx,%r13 + adcq $0,%r13 + addq 8(%rdi,%rcx,1),%r12 + leaq 16(%rcx),%rcx + adcq $0,%r13 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + adcq $0,%rdx + addq %r12,%r10 + movq %rdx,%r11 + adcq $0,%r11 + movq %r10,-8(%rdi,%rcx,1) + + cmpq $0,%rcx + jne .Lsqr4x_inner + +.byte 0x67 + mulq %r15 + addq %rax,%r13 + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + + movq %r13,(%rdi) + movq %rdx,%r12 + movq %rdx,8(%rdi) + + addq $16,%rbp + jnz .Lsqr4x_outer + + + movq -32(%rsi),%r14 + leaq 48+8(%rsp,%r9,2),%rdi + movq -24(%rsi),%rax + leaq -32(%rdi,%rbp,1),%rdi + movq -16(%rsi),%rbx + movq %rax,%r15 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq %rdx,%r11 + adcq $0,%r11 + + mulq %r14 + addq %rax,%r11 + movq %rbx,%rax + movq %r10,-24(%rdi) + movq %rdx,%r10 + adcq $0,%r10 + addq %r13,%r11 + movq -8(%rsi),%rbx + adcq $0,%r10 + + mulq %r15 + addq %rax,%r12 + movq %rbx,%rax + movq %r11,-16(%rdi) + movq %rdx,%r13 + adcq $0,%r13 + + mulq %r14 + addq %rax,%r10 + movq %rbx,%rax + movq %rdx,%r11 + adcq $0,%r11 + addq %r12,%r10 + adcq $0,%r11 + movq %r10,-8(%rdi) + + mulq %r15 + addq %rax,%r13 + movq -16(%rsi),%rax + adcq $0,%rdx + addq %r11,%r13 + adcq $0,%rdx + + movq %r13,(%rdi) + movq %rdx,%r12 + movq %rdx,8(%rdi) + + mulq %rbx + addq $16,%rbp + xorq %r14,%r14 + subq %r9,%rbp + xorq %r15,%r15 + + addq %r12,%rax + adcq $0,%rdx + movq %rax,8(%rdi) + movq %rdx,16(%rdi) + movq %r15,24(%rdi) + + movq -16(%rsi,%rbp,1),%rax + leaq 48+8(%rsp),%rdi + xorq %r10,%r10 + movq 8(%rdi),%r11 + + leaq (%r14,%r10,2),%r12 + shrq 
$63,%r10 + leaq (%rcx,%r11,2),%r13 + shrq $63,%r11 + orq %r10,%r13 + movq 16(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 24(%rdi),%r11 + adcq %rax,%r12 + movq -8(%rsi,%rbp,1),%rax + movq %r12,(%rdi) + adcq %rdx,%r13 + + leaq (%r14,%r10,2),%rbx + movq %r13,8(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r8 + shrq $63,%r11 + orq %r10,%r8 + movq 32(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 40(%rdi),%r11 + adcq %rax,%rbx + movq 0(%rsi,%rbp,1),%rax + movq %rbx,16(%rdi) + adcq %rdx,%r8 + leaq 16(%rbp),%rbp + movq %r8,24(%rdi) + sbbq %r15,%r15 + leaq 64(%rdi),%rdi + jmp .Lsqr4x_shift_n_add + +.align 32 +.Lsqr4x_shift_n_add: + leaq (%r14,%r10,2),%r12 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r13 + shrq $63,%r11 + orq %r10,%r13 + movq -16(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq -8(%rdi),%r11 + adcq %rax,%r12 + movq -8(%rsi,%rbp,1),%rax + movq %r12,-32(%rdi) + adcq %rdx,%r13 + + leaq (%r14,%r10,2),%rbx + movq %r13,-24(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r8 + shrq $63,%r11 + orq %r10,%r8 + movq 0(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 8(%rdi),%r11 + adcq %rax,%rbx + movq 0(%rsi,%rbp,1),%rax + movq %rbx,-16(%rdi) + adcq %rdx,%r8 + + leaq (%r14,%r10,2),%r12 + movq %r8,-8(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r13 + shrq $63,%r11 + orq %r10,%r13 + movq 16(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 24(%rdi),%r11 + adcq %rax,%r12 + movq 8(%rsi,%rbp,1),%rax + movq %r12,0(%rdi) + adcq %rdx,%r13 + + leaq (%r14,%r10,2),%rbx + movq %r13,8(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r8 + shrq $63,%r11 + orq %r10,%r8 + movq 32(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq 40(%rdi),%r11 + adcq %rax,%rbx + movq 16(%rsi,%rbp,1),%rax + movq %rbx,16(%rdi) + adcq %rdx,%r8 + movq %r8,24(%rdi) + sbbq %r15,%r15 + leaq 64(%rdi),%rdi + addq $32,%rbp + jnz .Lsqr4x_shift_n_add + + leaq (%r14,%r10,2),%r12 +.byte 0x67 + shrq 
$63,%r10 + leaq (%rcx,%r11,2),%r13 + shrq $63,%r11 + orq %r10,%r13 + movq -16(%rdi),%r10 + movq %r11,%r14 + mulq %rax + negq %r15 + movq -8(%rdi),%r11 + adcq %rax,%r12 + movq -8(%rsi),%rax + movq %r12,-32(%rdi) + adcq %rdx,%r13 + + leaq (%r14,%r10,2),%rbx + movq %r13,-24(%rdi) + sbbq %r15,%r15 + shrq $63,%r10 + leaq (%rcx,%r11,2),%r8 + shrq $63,%r11 + orq %r10,%r8 + mulq %rax + negq %r15 + adcq %rax,%rbx + adcq %rdx,%r8 + movq %rbx,-16(%rdi) + movq %r8,-8(%rdi) + movq %xmm2,%rbp +__bn_sqr8x_reduction: + xorq %rax,%rax + leaq (%r9,%rbp,1),%rcx + leaq 48+8(%rsp,%r9,2),%rdx + movq %rcx,0+8(%rsp) + leaq 48+8(%rsp,%r9,1),%rdi + movq %rdx,8+8(%rsp) + negq %r9 + jmp .L8x_reduction_loop + +.align 32 +.L8x_reduction_loop: + leaq (%rdi,%r9,1),%rdi +.byte 0x66 + movq 0(%rdi),%rbx + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq %rax,(%rdx) + leaq 64(%rdi),%rdi + +.byte 0x67 + movq %rbx,%r8 + imulq 32+8(%rsp),%rbx + movq 0(%rbp),%rax + movl $8,%ecx + jmp .L8x_reduce + +.align 32 +.L8x_reduce: + mulq %rbx + movq 8(%rbp),%rax + negq %r8 + movq %rdx,%r8 + adcq $0,%r8 + + mulq %rbx + addq %rax,%r9 + movq 16(%rbp),%rax + adcq $0,%rdx + addq %r9,%r8 + movq %rbx,48-8+8(%rsp,%rcx,8) + movq %rdx,%r9 + adcq $0,%r9 + + mulq %rbx + addq %rax,%r10 + movq 24(%rbp),%rax + adcq $0,%rdx + addq %r10,%r9 + movq 32+8(%rsp),%rsi + movq %rdx,%r10 + adcq $0,%r10 + + mulq %rbx + addq %rax,%r11 + movq 32(%rbp),%rax + adcq $0,%rdx + imulq %r8,%rsi + addq %r11,%r10 + movq %rdx,%r11 + adcq $0,%r11 + + mulq %rbx + addq %rax,%r12 + movq 40(%rbp),%rax + adcq $0,%rdx + addq %r12,%r11 + movq %rdx,%r12 + adcq $0,%r12 + + mulq %rbx + addq %rax,%r13 + movq 48(%rbp),%rax + adcq $0,%rdx + addq %r13,%r12 + movq %rdx,%r13 + adcq $0,%r13 + + mulq %rbx + addq %rax,%r14 + movq 56(%rbp),%rax + adcq $0,%rdx + addq %r14,%r13 + movq %rdx,%r14 + adcq $0,%r14 + + mulq %rbx + movq %rsi,%rbx + addq %rax,%r15 + movq 
0(%rbp),%rax + adcq $0,%rdx + addq %r15,%r14 + movq %rdx,%r15 + adcq $0,%r15 + + decl %ecx + jnz .L8x_reduce + + leaq 64(%rbp),%rbp + xorq %rax,%rax + movq 8+8(%rsp),%rdx + cmpq 0+8(%rsp),%rbp + jae .L8x_no_tail + +.byte 0x66 + addq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + sbbq %rsi,%rsi + + movq 48+56+8(%rsp),%rbx + movl $8,%ecx + movq 0(%rbp),%rax + jmp .L8x_tail + +.align 32 +.L8x_tail: + mulq %rbx + addq %rax,%r8 + movq 8(%rbp),%rax + movq %r8,(%rdi) + movq %rdx,%r8 + adcq $0,%r8 + + mulq %rbx + addq %rax,%r9 + movq 16(%rbp),%rax + adcq $0,%rdx + addq %r9,%r8 + leaq 8(%rdi),%rdi + movq %rdx,%r9 + adcq $0,%r9 + + mulq %rbx + addq %rax,%r10 + movq 24(%rbp),%rax + adcq $0,%rdx + addq %r10,%r9 + movq %rdx,%r10 + adcq $0,%r10 + + mulq %rbx + addq %rax,%r11 + movq 32(%rbp),%rax + adcq $0,%rdx + addq %r11,%r10 + movq %rdx,%r11 + adcq $0,%r11 + + mulq %rbx + addq %rax,%r12 + movq 40(%rbp),%rax + adcq $0,%rdx + addq %r12,%r11 + movq %rdx,%r12 + adcq $0,%r12 + + mulq %rbx + addq %rax,%r13 + movq 48(%rbp),%rax + adcq $0,%rdx + addq %r13,%r12 + movq %rdx,%r13 + adcq $0,%r13 + + mulq %rbx + addq %rax,%r14 + movq 56(%rbp),%rax + adcq $0,%rdx + addq %r14,%r13 + movq %rdx,%r14 + adcq $0,%r14 + + mulq %rbx + movq 48-16+8(%rsp,%rcx,8),%rbx + addq %rax,%r15 + adcq $0,%rdx + addq %r15,%r14 + movq 0(%rbp),%rax + movq %rdx,%r15 + adcq $0,%r15 + + decl %ecx + jnz .L8x_tail + + leaq 64(%rbp),%rbp + movq 8+8(%rsp),%rdx + cmpq 0+8(%rsp),%rbp + jae .L8x_tail_done + + movq 48+56+8(%rsp),%rbx + negq %rsi + movq 0(%rbp),%rax + adcq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + sbbq %rsi,%rsi + + movl $8,%ecx + jmp .L8x_tail + +.align 32 +.L8x_tail_done: + xorq %rax,%rax + addq (%rdx),%r8 + adcq $0,%r9 + adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + 
adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rax + + negq %rsi +.L8x_no_tail: + adcq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + adcq $0,%rax + movq -8(%rbp),%rcx + xorq %rsi,%rsi + + movq %xmm2,%rbp + + movq %r8,0(%rdi) + movq %r9,8(%rdi) + movq %xmm3,%r9 + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + leaq 64(%rdi),%rdi + + cmpq %rdx,%rdi + jb .L8x_reduction_loop + ret +.cfi_endproc +.size bn_sqr8x_internal,.-bn_sqr8x_internal +.type __bn_post4x_internal,@function +.align 32 +__bn_post4x_internal: +.cfi_startproc + movq 0(%rbp),%r12 + leaq (%rdi,%r9,1),%rbx + movq %r9,%rcx + movq %xmm1,%rdi + negq %rax + movq %xmm1,%rsi + sarq $3+2,%rcx + decq %r12 + xorq %r10,%r10 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp .Lsqr4x_sub_entry + +.align 16 +.Lsqr4x_sub: + movq 0(%rbp),%r12 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 +.Lsqr4x_sub_entry: + leaq 32(%rbp),%rbp + notq %r12 + notq %r13 + notq %r14 + notq %r15 + andq %rax,%r12 + andq %rax,%r13 + andq %rax,%r14 + andq %rax,%r15 + + negq %r10 + adcq 0(%rbx),%r12 + adcq 8(%rbx),%r13 + adcq 16(%rbx),%r14 + adcq 24(%rbx),%r15 + movq %r12,0(%rdi) + leaq 32(%rbx),%rbx + movq %r13,8(%rdi) + sbbq %r10,%r10 + movq %r14,16(%rdi) + movq %r15,24(%rdi) + leaq 32(%rdi),%rdi + + incq %rcx + jnz .Lsqr4x_sub + + movq %r9,%r10 + negq %r9 + ret +.cfi_endproc +.size __bn_post4x_internal,.-__bn_post4x_internal +.globl bn_mulx4x_mont_gather5 +.hidden bn_mulx4x_mont_gather5 +.type bn_mulx4x_mont_gather5,@function +.align 32 +bn_mulx4x_mont_gather5: +.cfi_startproc +_CET_ENDBR + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_offset 
%r15,-56 +.Lmulx4x_prologue: + + + + + shll $3,%r9d + leaq (%r9,%r9,2),%r10 + negq %r9 + movq (%r8),%r8 + + + + + + + + + + + leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp + subq %rdi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb .Lmulx4xsp_alt + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp + jmp .Lmulx4xsp_done + +.Lmulx4xsp_alt: + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp +.Lmulx4xsp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lmulx4x_page_walk + jmp .Lmulx4x_page_walk_done + +.Lmulx4x_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lmulx4x_page_walk +.Lmulx4x_page_walk_done: + + + + + + + + + + + + + + movq %r8,32(%rsp) + movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 +.Lmulx4x_body: + call mulx4x_internal + + movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq $1,%rax + + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lmulx4x_epilogue: + ret +.cfi_endproc +.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5 + +.type mulx4x_internal,@function +.align 32 +mulx4x_internal: +.cfi_startproc + movq %r9,8(%rsp) + movq %r9,%r10 + negq %r9 + shlq $5,%r9 + negq %r10 + leaq 128(%rdx,%r9,1),%r13 + shrq $5+5,%r9 + movd 8(%rax),%xmm5 + subq $1,%r9 + leaq .Linc(%rip),%rax + movq %r13,16+8(%rsp) + movq %r9,24+8(%rsp) + movq %rdi,56+8(%rsp) + movdqa 0(%rax),%xmm0 + movdqa 16(%rax),%xmm1 + leaq 88-112(%rsp,%r10,1),%r10 + leaq 128(%rdx),%rdi + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 +.byte 0x67 + movdqa %xmm1,%xmm2 +.byte 0x67 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + 
pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,112(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,128(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,144(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,160(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,176(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,192(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,208(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,224(%r10) + movdqa %xmm4,%xmm3 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,240(%r10) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,256(%r10) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,272(%r10) + movdqa %xmm4,%xmm2 + + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,288(%r10) + movdqa %xmm4,%xmm3 +.byte 0x67 + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,304(%r10) + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,320(%r10) + + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,336(%r10) + + pand 64(%rdi),%xmm0 + pand 80(%rdi),%xmm1 + pand 96(%rdi),%xmm2 + movdqa %xmm3,352(%r10) + pand 112(%rdi),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -128(%rdi),%xmm4 + movdqa -112(%rdi),%xmm5 + movdqa -96(%rdi),%xmm2 + pand 112(%r10),%xmm4 + movdqa -80(%rdi),%xmm3 + pand 128(%r10),%xmm5 + por %xmm4,%xmm0 + pand 144(%r10),%xmm2 + por %xmm5,%xmm1 + pand 160(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa -64(%rdi),%xmm4 + movdqa -48(%rdi),%xmm5 + movdqa -32(%rdi),%xmm2 + pand 176(%r10),%xmm4 + movdqa -16(%rdi),%xmm3 + pand 192(%r10),%xmm5 + por %xmm4,%xmm0 + pand 208(%r10),%xmm2 + por %xmm5,%xmm1 + pand 224(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + movdqa 0(%rdi),%xmm4 + movdqa 16(%rdi),%xmm5 + movdqa 
32(%rdi),%xmm2 + pand 240(%r10),%xmm4 + movdqa 48(%rdi),%xmm3 + pand 256(%r10),%xmm5 + por %xmm4,%xmm0 + pand 272(%r10),%xmm2 + por %xmm5,%xmm1 + pand 288(%r10),%xmm3 + por %xmm2,%xmm0 + por %xmm3,%xmm1 + pxor %xmm1,%xmm0 + + pshufd $0x4e,%xmm0,%xmm1 + por %xmm1,%xmm0 + leaq 256(%rdi),%rdi + movq %xmm0,%rdx + leaq 64+32+8(%rsp),%rbx + + movq %rdx,%r9 + mulxq 0(%rsi),%r8,%rax + mulxq 8(%rsi),%r11,%r12 + addq %rax,%r11 + mulxq 16(%rsi),%rax,%r13 + adcq %rax,%r12 + adcq $0,%r13 + mulxq 24(%rsi),%rax,%r14 + + movq %r8,%r15 + imulq 32+8(%rsp),%r8 + xorq %rbp,%rbp + movq %r8,%rdx + + movq %rdi,8+8(%rsp) + + leaq 32(%rsi),%rsi + adcxq %rax,%r13 + adcxq %rbp,%r14 + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%r15 + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + mulxq 16(%rcx),%rax,%r12 + movq 24+8(%rsp),%rdi + movq %r10,-32(%rbx) + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r11,-24(%rbx) + adcxq %rax,%r12 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r12,-16(%rbx) + jmp .Lmulx4x_1st + +.align 32 +.Lmulx4x_1st: + adcxq %rbp,%r15 + mulxq 0(%rsi),%r10,%rax + adcxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 +.byte 0x67,0x67 + movq %r8,%rdx + adcxq %rax,%r13 + adcxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + movq %r11,-32(%rbx) + adoxq %r15,%r13 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + leaq 32(%rcx),%rcx + movq %r13,-16(%rbx) + + decq %rdi + jnz .Lmulx4x_1st + + movq 8(%rsp),%rax + adcq %rbp,%r15 + leaq (%rsi,%rax,1),%rsi + addq %r15,%r14 + movq 8+8(%rsp),%rdi + adcq %rbp,%rbp + movq %r14,-8(%rbx) + jmp .Lmulx4x_outer + +.align 32 +.Lmulx4x_outer: 
+ leaq 16-256(%rbx),%r10 + pxor %xmm4,%xmm4 +.byte 0x67,0x67 + pxor %xmm5,%xmm5 + movdqa -128(%rdi),%xmm0 + movdqa -112(%rdi),%xmm1 + movdqa -96(%rdi),%xmm2 + pand 256(%r10),%xmm0 + movdqa -80(%rdi),%xmm3 + pand 272(%r10),%xmm1 + por %xmm0,%xmm4 + pand 288(%r10),%xmm2 + por %xmm1,%xmm5 + pand 304(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%rdi),%xmm0 + movdqa -48(%rdi),%xmm1 + movdqa -32(%rdi),%xmm2 + pand 320(%r10),%xmm0 + movdqa -16(%rdi),%xmm3 + pand 336(%r10),%xmm1 + por %xmm0,%xmm4 + pand 352(%r10),%xmm2 + por %xmm1,%xmm5 + pand 368(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%rdi),%xmm0 + movdqa 16(%rdi),%xmm1 + movdqa 32(%rdi),%xmm2 + pand 384(%r10),%xmm0 + movdqa 48(%rdi),%xmm3 + pand 400(%r10),%xmm1 + por %xmm0,%xmm4 + pand 416(%r10),%xmm2 + por %xmm1,%xmm5 + pand 432(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%rdi),%xmm0 + movdqa 80(%rdi),%xmm1 + movdqa 96(%rdi),%xmm2 + pand 448(%r10),%xmm0 + movdqa 112(%rdi),%xmm3 + pand 464(%r10),%xmm1 + por %xmm0,%xmm4 + pand 480(%r10),%xmm2 + por %xmm1,%xmm5 + pand 496(%r10),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + leaq 256(%rdi),%rdi + movq %xmm0,%rdx + + movq %rbp,(%rbx) + leaq 32(%rbx,%rax,1),%rbx + mulxq 0(%rsi),%r8,%r11 + xorq %rbp,%rbp + movq %rdx,%r9 + mulxq 8(%rsi),%r14,%r12 + adoxq -32(%rbx),%r8 + adcxq %r14,%r11 + mulxq 16(%rsi),%r15,%r13 + adoxq -24(%rbx),%r11 + adcxq %r15,%r12 + mulxq 24(%rsi),%rdx,%r14 + adoxq -16(%rbx),%r12 + adcxq %rdx,%r13 + leaq (%rcx,%rax,1),%rcx + leaq 32(%rsi),%rsi + adoxq -8(%rbx),%r13 + adcxq %rbp,%r14 + adoxq %rbp,%r14 + + movq %r8,%r15 + imulq 32+8(%rsp),%r8 + + movq %r8,%rdx + xorq %rbp,%rbp + movq %rdi,8+8(%rsp) + + mulxq 0(%rcx),%rax,%r10 + adcxq %rax,%r15 + adoxq %r11,%r10 + mulxq 8(%rcx),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + mulxq 16(%rcx),%rax,%r12 + adcxq %rax,%r11 + adoxq %r13,%r12 + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + movq 
24+8(%rsp),%rdi + movq %r10,-32(%rbx) + adcxq %rax,%r12 + movq %r11,-24(%rbx) + adoxq %rbp,%r15 + movq %r12,-16(%rbx) + leaq 32(%rcx),%rcx + jmp .Lmulx4x_inner + +.align 32 +.Lmulx4x_inner: + mulxq 0(%rsi),%r10,%rax + adcxq %rbp,%r15 + adoxq %r14,%r10 + mulxq 8(%rsi),%r11,%r14 + adcxq 0(%rbx),%r10 + adoxq %rax,%r11 + mulxq 16(%rsi),%r12,%rax + adcxq 8(%rbx),%r11 + adoxq %r14,%r12 + mulxq 24(%rsi),%r13,%r14 + movq %r8,%rdx + adcxq 16(%rbx),%r12 + adoxq %rax,%r13 + adcxq 24(%rbx),%r13 + adoxq %rbp,%r14 + leaq 32(%rsi),%rsi + leaq 32(%rbx),%rbx + adcxq %rbp,%r14 + + adoxq %r15,%r10 + mulxq 0(%rcx),%rax,%r15 + adcxq %rax,%r10 + adoxq %r15,%r11 + mulxq 8(%rcx),%rax,%r15 + adcxq %rax,%r11 + adoxq %r15,%r12 + mulxq 16(%rcx),%rax,%r15 + movq %r10,-40(%rbx) + adcxq %rax,%r12 + adoxq %r15,%r13 + movq %r11,-32(%rbx) + mulxq 24(%rcx),%rax,%r15 + movq %r9,%rdx + leaq 32(%rcx),%rcx + movq %r12,-24(%rbx) + adcxq %rax,%r13 + adoxq %rbp,%r15 + movq %r13,-16(%rbx) + + decq %rdi + jnz .Lmulx4x_inner + + movq 0+8(%rsp),%rax + adcq %rbp,%r15 + subq 0(%rbx),%rdi + movq 8+8(%rsp),%rdi + movq 16+8(%rsp),%r10 + adcq %r15,%r14 + leaq (%rsi,%rax,1),%rsi + adcq %rbp,%rbp + movq %r14,-8(%rbx) + + cmpq %r10,%rdi + jb .Lmulx4x_outer + + movq -8(%rcx),%r10 + movq %rbp,%r8 + movq (%rcx,%rax,1),%r12 + leaq (%rcx,%rax,1),%rbp + movq %rax,%rcx + leaq (%rbx,%rax,1),%rdi + xorl %eax,%eax + xorq %r15,%r15 + subq %r14,%r10 + adcq %r15,%r15 + orq %r15,%r8 + sarq $3+2,%rcx + subq %r8,%rax + movq 56+8(%rsp),%rdx + decq %r12 + movq 8(%rbp),%r13 + xorq %r8,%r8 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp .Lsqrx4x_sub_entry +.cfi_endproc +.size mulx4x_internal,.-mulx4x_internal +.globl bn_powerx5 +.hidden bn_powerx5 +.type bn_powerx5,@function +.align 32 +bn_powerx5: +.cfi_startproc +_CET_ENDBR + movq %rsp,%rax +.cfi_def_cfa_register %rax + pushq %rbx +.cfi_offset %rbx,-16 + pushq %rbp +.cfi_offset %rbp,-24 + pushq %r12 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_offset 
%r14,-48 + pushq %r15 +.cfi_offset %r15,-56 +.Lpowerx5_prologue: + + + + + shll $3,%r9d + leaq (%r9,%r9,2),%r10 + negq %r9 + movq (%r8),%r8 + + + + + + + + + leaq -320(%rsp,%r9,2),%r11 + movq %rsp,%rbp + subq %rdi,%r11 + andq $4095,%r11 + cmpq %r11,%r10 + jb .Lpwrx_sp_alt + subq %r11,%rbp + leaq -320(%rbp,%r9,2),%rbp + jmp .Lpwrx_sp_done + +.align 32 +.Lpwrx_sp_alt: + leaq 4096-320(,%r9,2),%r10 + leaq -320(%rbp,%r9,2),%rbp + subq %r10,%r11 + movq $0,%r10 + cmovcq %r10,%r11 + subq %r11,%rbp +.Lpwrx_sp_done: + andq $-64,%rbp + movq %rsp,%r11 + subq %rbp,%r11 + andq $-4096,%r11 + leaq (%r11,%rbp,1),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lpwrx_page_walk + jmp .Lpwrx_page_walk_done + +.Lpwrx_page_walk: + leaq -4096(%rsp),%rsp + movq (%rsp),%r10 + cmpq %rbp,%rsp + ja .Lpwrx_page_walk +.Lpwrx_page_walk_done: + + movq %r9,%r10 + negq %r9 + + + + + + + + + + + + + pxor %xmm0,%xmm0 + movq %rdi,%xmm1 + movq %rcx,%xmm2 + movq %r10,%xmm3 + movq %rdx,%xmm4 + movq %r8,32(%rsp) + movq %rax,40(%rsp) +.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 +.Lpowerx5_body: + + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + + movq %r10,%r9 + movq %rsi,%rdi + movq %xmm2,%rcx + movq %xmm4,%rdx + movq 40(%rsp),%rax + + call mulx4x_internal + + movq 40(%rsp),%rsi +.cfi_def_cfa %rsi,8 + movq $1,%rax + + movq -48(%rsi),%r15 +.cfi_restore %r15 + movq -40(%rsi),%r14 +.cfi_restore %r14 + movq -32(%rsi),%r13 +.cfi_restore %r13 + movq -24(%rsi),%r12 +.cfi_restore %r12 + movq -16(%rsi),%rbp +.cfi_restore %rbp + movq -8(%rsi),%rbx +.cfi_restore %rbx + leaq (%rsi),%rsp +.cfi_def_cfa_register %rsp +.Lpowerx5_epilogue: + ret +.cfi_endproc +.size bn_powerx5,.-bn_powerx5 + +.globl bn_sqrx8x_internal +.hidden bn_sqrx8x_internal +.hidden bn_sqrx8x_internal +.type 
bn_sqrx8x_internal,@function +.align 32 +bn_sqrx8x_internal: +__bn_sqrx8x_internal: +.cfi_startproc +_CET_ENDBR + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + leaq 48+8(%rsp),%rdi + leaq (%rsi,%r9,1),%rbp + movq %r9,0+8(%rsp) + movq %rbp,8+8(%rsp) + jmp .Lsqr8x_zero_start + +.align 32 +.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 +.Lsqrx8x_zero: +.byte 0x3e + movdqa %xmm0,0(%rdi) + movdqa %xmm0,16(%rdi) + movdqa %xmm0,32(%rdi) + movdqa %xmm0,48(%rdi) +.Lsqr8x_zero_start: + movdqa %xmm0,64(%rdi) + movdqa %xmm0,80(%rdi) + movdqa %xmm0,96(%rdi) + movdqa %xmm0,112(%rdi) + leaq 128(%rdi),%rdi + subq $64,%r9 + jnz .Lsqrx8x_zero + + movq 0(%rsi),%rdx + + xorq %r10,%r10 + xorq %r11,%r11 + xorq %r12,%r12 + xorq %r13,%r13 + xorq %r14,%r14 + xorq %r15,%r15 + leaq 48+8(%rsp),%rdi + xorq %rbp,%rbp + jmp .Lsqrx8x_outer_loop + +.align 32 +.Lsqrx8x_outer_loop: + mulxq 8(%rsi),%r8,%rax + adcxq %r9,%r8 + adoxq %rax,%r10 + mulxq 16(%rsi),%r9,%rax + adcxq %r10,%r9 + adoxq %rax,%r11 +.byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 + adcxq %r11,%r10 + adoxq %rax,%r12 +.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 + adcxq %r12,%r11 + adoxq %rax,%r13 + mulxq 40(%rsi),%r12,%rax + adcxq %r13,%r12 + adoxq %rax,%r14 + mulxq 48(%rsi),%r13,%rax + adcxq %r14,%r13 + adoxq %r15,%rax + mulxq 56(%rsi),%r14,%r15 + movq 8(%rsi),%rdx + adcxq %rax,%r14 + adoxq %rbp,%r15 + adcq 64(%rdi),%r15 + movq %r8,8(%rdi) + movq %r9,16(%rdi) + sbbq %rcx,%rcx + xorq %rbp,%rbp + + + mulxq 16(%rsi),%r8,%rbx + mulxq 24(%rsi),%r9,%rax + adcxq %r10,%r8 + adoxq %rbx,%r9 + mulxq 32(%rsi),%r10,%rbx + adcxq %r11,%r9 + adoxq %rax,%r10 +.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 + adcxq %r12,%r10 + adoxq %rbx,%r11 +.byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 + adcxq %r13,%r11 + adoxq %r14,%r12 +.byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 + movq 16(%rsi),%rdx + adcxq %rax,%r12 + adoxq %rbx,%r13 + adcxq %r15,%r13 + adoxq %rbp,%r14 + adcxq 
%rbp,%r14 + + movq %r8,24(%rdi) + movq %r9,32(%rdi) + + mulxq 24(%rsi),%r8,%rbx + mulxq 32(%rsi),%r9,%rax + adcxq %r10,%r8 + adoxq %rbx,%r9 + mulxq 40(%rsi),%r10,%rbx + adcxq %r11,%r9 + adoxq %rax,%r10 +.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 + adcxq %r12,%r10 + adoxq %r13,%r11 +.byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 +.byte 0x3e + movq 24(%rsi),%rdx + adcxq %rbx,%r11 + adoxq %rax,%r12 + adcxq %r14,%r12 + movq %r8,40(%rdi) + movq %r9,48(%rdi) + mulxq 32(%rsi),%r8,%rax + adoxq %rbp,%r13 + adcxq %rbp,%r13 + + mulxq 40(%rsi),%r9,%rbx + adcxq %r10,%r8 + adoxq %rax,%r9 + mulxq 48(%rsi),%r10,%rax + adcxq %r11,%r9 + adoxq %r12,%r10 + mulxq 56(%rsi),%r11,%r12 + movq 32(%rsi),%rdx + movq 40(%rsi),%r14 + adcxq %rbx,%r10 + adoxq %rax,%r11 + movq 48(%rsi),%r15 + adcxq %r13,%r11 + adoxq %rbp,%r12 + adcxq %rbp,%r12 + + movq %r8,56(%rdi) + movq %r9,64(%rdi) + + mulxq %r14,%r9,%rax + movq 56(%rsi),%r8 + adcxq %r10,%r9 + mulxq %r15,%r10,%rbx + adoxq %rax,%r10 + adcxq %r11,%r10 + mulxq %r8,%r11,%rax + movq %r14,%rdx + adoxq %rbx,%r11 + adcxq %r12,%r11 + + adcxq %rbp,%rax + + mulxq %r15,%r14,%rbx + mulxq %r8,%r12,%r13 + movq %r15,%rdx + leaq 64(%rsi),%rsi + adcxq %r14,%r11 + adoxq %rbx,%r12 + adcxq %rax,%r12 + adoxq %rbp,%r13 + +.byte 0x67,0x67 + mulxq %r8,%r8,%r14 + adcxq %r8,%r13 + adcxq %rbp,%r14 + + cmpq 8+8(%rsp),%rsi + je .Lsqrx8x_outer_break + + negq %rcx + movq $-8,%rcx + movq %rbp,%r15 + movq 64(%rdi),%r8 + adcxq 72(%rdi),%r9 + adcxq 80(%rdi),%r10 + adcxq 88(%rdi),%r11 + adcq 96(%rdi),%r12 + adcq 104(%rdi),%r13 + adcq 112(%rdi),%r14 + adcq 120(%rdi),%r15 + leaq (%rsi),%rbp + leaq 128(%rdi),%rdi + sbbq %rax,%rax + + movq -64(%rsi),%rdx + movq %rax,16+8(%rsp) + movq %rdi,24+8(%rsp) + + + xorl %eax,%eax + jmp .Lsqrx8x_loop + +.align 32 +.Lsqrx8x_loop: + movq %r8,%rbx + mulxq 0(%rbp),%rax,%r8 + adcxq %rax,%rbx + adoxq %r9,%r8 + + mulxq 8(%rbp),%rax,%r9 + adcxq %rax,%r8 + adoxq %r10,%r9 + + mulxq 16(%rbp),%rax,%r10 + adcxq %rax,%r9 + adoxq %r11,%r10 + + 
mulxq 24(%rbp),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + +.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 + adcxq %rax,%r11 + adoxq %r13,%r12 + + mulxq 40(%rbp),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + + mulxq 48(%rbp),%rax,%r14 + movq %rbx,(%rdi,%rcx,8) + movl $0,%ebx + adcxq %rax,%r13 + adoxq %r15,%r14 + +.byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 + movq 8(%rsi,%rcx,8),%rdx + adcxq %rax,%r14 + adoxq %rbx,%r15 + adcxq %rbx,%r15 + +.byte 0x67 + incq %rcx + jnz .Lsqrx8x_loop + + leaq 64(%rbp),%rbp + movq $-8,%rcx + cmpq 8+8(%rsp),%rbp + je .Lsqrx8x_break + + subq 16+8(%rsp),%rbx +.byte 0x66 + movq -64(%rsi),%rdx + adcxq 0(%rdi),%r8 + adcxq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + leaq 64(%rdi),%rdi +.byte 0x67 + sbbq %rax,%rax + xorl %ebx,%ebx + movq %rax,16+8(%rsp) + jmp .Lsqrx8x_loop + +.align 32 +.Lsqrx8x_break: + xorq %rbp,%rbp + subq 16+8(%rsp),%rbx + adcxq %rbp,%r8 + movq 24+8(%rsp),%rcx + adcxq %rbp,%r9 + movq 0(%rsi),%rdx + adcq $0,%r10 + movq %r8,0(%rdi) + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + cmpq %rcx,%rdi + je .Lsqrx8x_outer_loop + + movq %r9,8(%rdi) + movq 8(%rcx),%r9 + movq %r10,16(%rdi) + movq 16(%rcx),%r10 + movq %r11,24(%rdi) + movq 24(%rcx),%r11 + movq %r12,32(%rdi) + movq 32(%rcx),%r12 + movq %r13,40(%rdi) + movq 40(%rcx),%r13 + movq %r14,48(%rdi) + movq 48(%rcx),%r14 + movq %r15,56(%rdi) + movq 56(%rcx),%r15 + movq %rcx,%rdi + jmp .Lsqrx8x_outer_loop + +.align 32 +.Lsqrx8x_outer_break: + movq %r9,72(%rdi) + movq %xmm3,%rcx + movq %r10,80(%rdi) + movq %r11,88(%rdi) + movq %r12,96(%rdi) + movq %r13,104(%rdi) + movq %r14,112(%rdi) + leaq 48+8(%rsp),%rdi + movq (%rsi,%rcx,1),%rdx + + movq 8(%rdi),%r11 + xorq %r10,%r10 + movq 0+8(%rsp),%r9 + adoxq %r11,%r11 + movq 16(%rdi),%r12 + movq 24(%rdi),%r13 + + +.align 32 +.Lsqrx4x_shift_n_add: + mulxq %rdx,%rax,%rbx + adoxq %r12,%r12 + adcxq %r10,%rax 
+.byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 +.byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 + adoxq %r13,%r13 + adcxq %r11,%rbx + movq 40(%rdi),%r11 + movq %rax,0(%rdi) + movq %rbx,8(%rdi) + + mulxq %rdx,%rax,%rbx + adoxq %r10,%r10 + adcxq %r12,%rax + movq 16(%rsi,%rcx,1),%rdx + movq 48(%rdi),%r12 + adoxq %r11,%r11 + adcxq %r13,%rbx + movq 56(%rdi),%r13 + movq %rax,16(%rdi) + movq %rbx,24(%rdi) + + mulxq %rdx,%rax,%rbx + adoxq %r12,%r12 + adcxq %r10,%rax + movq 24(%rsi,%rcx,1),%rdx + leaq 32(%rcx),%rcx + movq 64(%rdi),%r10 + adoxq %r13,%r13 + adcxq %r11,%rbx + movq 72(%rdi),%r11 + movq %rax,32(%rdi) + movq %rbx,40(%rdi) + + mulxq %rdx,%rax,%rbx + adoxq %r10,%r10 + adcxq %r12,%rax + jrcxz .Lsqrx4x_shift_n_add_break +.byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 + adoxq %r11,%r11 + adcxq %r13,%rbx + movq 80(%rdi),%r12 + movq 88(%rdi),%r13 + movq %rax,48(%rdi) + movq %rbx,56(%rdi) + leaq 64(%rdi),%rdi + nop + jmp .Lsqrx4x_shift_n_add + +.align 32 +.Lsqrx4x_shift_n_add_break: + adcxq %r13,%rbx + movq %rax,48(%rdi) + movq %rbx,56(%rdi) + leaq 64(%rdi),%rdi + movq %xmm2,%rbp +__bn_sqrx8x_reduction: + xorl %eax,%eax + movq 32+8(%rsp),%rbx + movq 48+8(%rsp),%rdx + leaq -64(%rbp,%r9,1),%rcx + + movq %rcx,0+8(%rsp) + movq %rdi,8+8(%rsp) + + leaq 48+8(%rsp),%rdi + jmp .Lsqrx8x_reduction_loop + +.align 32 +.Lsqrx8x_reduction_loop: + movq 8(%rdi),%r9 + movq 16(%rdi),%r10 + movq 24(%rdi),%r11 + movq 32(%rdi),%r12 + movq %rdx,%r8 + imulq %rbx,%rdx + movq 40(%rdi),%r13 + movq 48(%rdi),%r14 + movq 56(%rdi),%r15 + movq %rax,24+8(%rsp) + + leaq 64(%rdi),%rdi + xorq %rsi,%rsi + movq $-8,%rcx + jmp .Lsqrx8x_reduce + +.align 32 +.Lsqrx8x_reduce: + movq %r8,%rbx + mulxq 0(%rbp),%rax,%r8 + adcxq %rbx,%rax + adoxq %r9,%r8 + + mulxq 8(%rbp),%rbx,%r9 + adcxq %rbx,%r8 + adoxq %r10,%r9 + + mulxq 16(%rbp),%rbx,%r10 + adcxq %rbx,%r9 + adoxq %r11,%r10 + + mulxq 24(%rbp),%rbx,%r11 + adcxq %rbx,%r10 + adoxq %r12,%r11 + +.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 + movq %rdx,%rax + movq 
%r8,%rdx + adcxq %rbx,%r11 + adoxq %r13,%r12 + + mulxq 32+8(%rsp),%rbx,%rdx + movq %rax,%rdx + movq %rax,64+48+8(%rsp,%rcx,8) + + mulxq 40(%rbp),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + + mulxq 48(%rbp),%rax,%r14 + adcxq %rax,%r13 + adoxq %r15,%r14 + + mulxq 56(%rbp),%rax,%r15 + movq %rbx,%rdx + adcxq %rax,%r14 + adoxq %rsi,%r15 + adcxq %rsi,%r15 + +.byte 0x67,0x67,0x67 + incq %rcx + jnz .Lsqrx8x_reduce + + movq %rsi,%rax + cmpq 0+8(%rsp),%rbp + jae .Lsqrx8x_no_tail + + movq 48+8(%rsp),%rdx + addq 0(%rdi),%r8 + leaq 64(%rbp),%rbp + movq $-8,%rcx + adcxq 8(%rdi),%r9 + adcxq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + leaq 64(%rdi),%rdi + sbbq %rax,%rax + + xorq %rsi,%rsi + movq %rax,16+8(%rsp) + jmp .Lsqrx8x_tail + +.align 32 +.Lsqrx8x_tail: + movq %r8,%rbx + mulxq 0(%rbp),%rax,%r8 + adcxq %rax,%rbx + adoxq %r9,%r8 + + mulxq 8(%rbp),%rax,%r9 + adcxq %rax,%r8 + adoxq %r10,%r9 + + mulxq 16(%rbp),%rax,%r10 + adcxq %rax,%r9 + adoxq %r11,%r10 + + mulxq 24(%rbp),%rax,%r11 + adcxq %rax,%r10 + adoxq %r12,%r11 + +.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 + adcxq %rax,%r11 + adoxq %r13,%r12 + + mulxq 40(%rbp),%rax,%r13 + adcxq %rax,%r12 + adoxq %r14,%r13 + + mulxq 48(%rbp),%rax,%r14 + adcxq %rax,%r13 + adoxq %r15,%r14 + + mulxq 56(%rbp),%rax,%r15 + movq 72+48+8(%rsp,%rcx,8),%rdx + adcxq %rax,%r14 + adoxq %rsi,%r15 + movq %rbx,(%rdi,%rcx,8) + movq %r8,%rbx + adcxq %rsi,%r15 + + incq %rcx + jnz .Lsqrx8x_tail + + cmpq 0+8(%rsp),%rbp + jae .Lsqrx8x_tail_done + + subq 16+8(%rsp),%rsi + movq 48+8(%rsp),%rdx + leaq 64(%rbp),%rbp + adcq 0(%rdi),%r8 + adcq 8(%rdi),%r9 + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + leaq 64(%rdi),%rdi + sbbq %rax,%rax + subq $8,%rcx + + xorq %rsi,%rsi + movq %rax,16+8(%rsp) + jmp .Lsqrx8x_tail + +.align 32 +.Lsqrx8x_tail_done: + xorq %rax,%rax + addq 24+8(%rsp),%r8 + adcq $0,%r9 + 
adcq $0,%r10 + adcq $0,%r11 + adcq $0,%r12 + adcq $0,%r13 + adcq $0,%r14 + adcq $0,%r15 + adcq $0,%rax + + subq 16+8(%rsp),%rsi +.Lsqrx8x_no_tail: + adcq 0(%rdi),%r8 + movq %xmm3,%rcx + adcq 8(%rdi),%r9 + movq 56(%rbp),%rsi + movq %xmm2,%rbp + adcq 16(%rdi),%r10 + adcq 24(%rdi),%r11 + adcq 32(%rdi),%r12 + adcq 40(%rdi),%r13 + adcq 48(%rdi),%r14 + adcq 56(%rdi),%r15 + adcq $0,%rax + + movq 32+8(%rsp),%rbx + movq 64(%rdi,%rcx,1),%rdx + + movq %r8,0(%rdi) + leaq 64(%rdi),%r8 + movq %r9,8(%rdi) + movq %r10,16(%rdi) + movq %r11,24(%rdi) + movq %r12,32(%rdi) + movq %r13,40(%rdi) + movq %r14,48(%rdi) + movq %r15,56(%rdi) + + leaq 64(%rdi,%rcx,1),%rdi + cmpq 8+8(%rsp),%r8 + jb .Lsqrx8x_reduction_loop + ret +.cfi_endproc +.size bn_sqrx8x_internal,.-bn_sqrx8x_internal +.align 32 +.type __bn_postx4x_internal,@function +__bn_postx4x_internal: +.cfi_startproc + movq 0(%rbp),%r12 + movq %rcx,%r10 + movq %rcx,%r9 + negq %rax + sarq $3+2,%rcx + + movq %xmm1,%rdx + movq %xmm1,%rsi + decq %r12 + movq 8(%rbp),%r13 + xorq %r8,%r8 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 + jmp .Lsqrx4x_sub_entry + +.align 16 +.Lsqrx4x_sub: + movq 0(%rbp),%r12 + movq 8(%rbp),%r13 + movq 16(%rbp),%r14 + movq 24(%rbp),%r15 +.Lsqrx4x_sub_entry: + andnq %rax,%r12,%r12 + leaq 32(%rbp),%rbp + andnq %rax,%r13,%r13 + andnq %rax,%r14,%r14 + andnq %rax,%r15,%r15 + + negq %r8 + adcq 0(%rdi),%r12 + adcq 8(%rdi),%r13 + adcq 16(%rdi),%r14 + adcq 24(%rdi),%r15 + movq %r12,0(%rdx) + leaq 32(%rdi),%rdi + movq %r13,8(%rdx) + sbbq %r8,%r8 + movq %r14,16(%rdx) + movq %r15,24(%rdx) + leaq 32(%rdx),%rdx + + incq %rcx + jnz .Lsqrx4x_sub + + negq %r9 + + ret +.cfi_endproc +.size __bn_postx4x_internal,.-__bn_postx4x_internal +.globl bn_scatter5 +.hidden bn_scatter5 +.type bn_scatter5,@function +.align 16 +bn_scatter5: +.cfi_startproc +_CET_ENDBR + cmpl $0,%esi + jz .Lscatter_epilogue + + + + + + + + + + leaq (%rdx,%rcx,8),%rdx +.Lscatter: + movq (%rdi),%rax + leaq 8(%rdi),%rdi + movq %rax,(%rdx) + leaq 256(%rdx),%rdx + subl 
$1,%esi + jnz .Lscatter +.Lscatter_epilogue: + ret +.cfi_endproc +.size bn_scatter5,.-bn_scatter5 + +.globl bn_gather5 +.hidden bn_gather5 +.type bn_gather5,@function +.align 32 +bn_gather5: +.cfi_startproc +.LSEH_begin_bn_gather5: +_CET_ENDBR + +.byte 0x4c,0x8d,0x14,0x24 +.cfi_def_cfa_register %r10 +.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 + leaq .Linc(%rip),%rax + andq $-16,%rsp + + movd %ecx,%xmm5 + movdqa 0(%rax),%xmm0 + movdqa 16(%rax),%xmm1 + leaq 128(%rdx),%r11 + leaq 128(%rsp),%rax + + pshufd $0,%xmm5,%xmm5 + movdqa %xmm1,%xmm4 + movdqa %xmm1,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,-128(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,-112(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,-96(%rax) + movdqa %xmm4,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,-80(%rax) + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,-64(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,-48(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,-32(%rax) + movdqa %xmm4,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,-16(%rax) + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,0(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,16(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,32(%rax) + movdqa %xmm4,%xmm2 + paddd %xmm0,%xmm1 + pcmpeqd %xmm5,%xmm0 + movdqa %xmm3,48(%rax) + movdqa %xmm4,%xmm3 + + paddd %xmm1,%xmm2 + pcmpeqd %xmm5,%xmm1 + movdqa %xmm0,64(%rax) + movdqa %xmm4,%xmm0 + + paddd %xmm2,%xmm3 + pcmpeqd %xmm5,%xmm2 + movdqa %xmm1,80(%rax) + movdqa %xmm4,%xmm1 + + paddd %xmm3,%xmm0 + pcmpeqd %xmm5,%xmm3 + movdqa %xmm2,96(%rax) + movdqa %xmm4,%xmm2 + movdqa %xmm3,112(%rax) + jmp 
.Lgather + +.align 32 +.Lgather: + pxor %xmm4,%xmm4 + pxor %xmm5,%xmm5 + movdqa -128(%r11),%xmm0 + movdqa -112(%r11),%xmm1 + movdqa -96(%r11),%xmm2 + pand -128(%rax),%xmm0 + movdqa -80(%r11),%xmm3 + pand -112(%rax),%xmm1 + por %xmm0,%xmm4 + pand -96(%rax),%xmm2 + por %xmm1,%xmm5 + pand -80(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa -64(%r11),%xmm0 + movdqa -48(%r11),%xmm1 + movdqa -32(%r11),%xmm2 + pand -64(%rax),%xmm0 + movdqa -16(%r11),%xmm3 + pand -48(%rax),%xmm1 + por %xmm0,%xmm4 + pand -32(%rax),%xmm2 + por %xmm1,%xmm5 + pand -16(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 0(%r11),%xmm0 + movdqa 16(%r11),%xmm1 + movdqa 32(%r11),%xmm2 + pand 0(%rax),%xmm0 + movdqa 48(%r11),%xmm3 + pand 16(%rax),%xmm1 + por %xmm0,%xmm4 + pand 32(%rax),%xmm2 + por %xmm1,%xmm5 + pand 48(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + movdqa 64(%r11),%xmm0 + movdqa 80(%r11),%xmm1 + movdqa 96(%r11),%xmm2 + pand 64(%rax),%xmm0 + movdqa 112(%r11),%xmm3 + pand 80(%rax),%xmm1 + por %xmm0,%xmm4 + pand 96(%rax),%xmm2 + por %xmm1,%xmm5 + pand 112(%rax),%xmm3 + por %xmm2,%xmm4 + por %xmm3,%xmm5 + por %xmm5,%xmm4 + leaq 256(%r11),%r11 + + pshufd $0x4e,%xmm4,%xmm0 + por %xmm4,%xmm0 + movq %xmm0,(%rdi) + leaq 8(%rdi),%rdi + subl $1,%esi + jnz .Lgather + + leaq (%r10),%rsp +.cfi_def_cfa_register %rsp + ret +.LSEH_end_bn_gather5: +.cfi_endproc +.size bn_gather5,.-bn_gather5 +.section .rodata +.align 64 +.Linc: +.long 0,0, 1,1 +.long 2,2, 2,2 +.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.text +#endif diff --git a/third_party/boringssl/gen/bcm/x86_64-mont5-win.asm b/third_party/boringssl/gen/bcm/x86_64-mont5-win.asm new file mode 100644 index 00000000..5d5ec364 --- /dev/null +++ 
b/third_party/boringssl/gen/bcm/x86_64-mont5-win.asm @@ -0,0 +1,3863 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_internal_x86_64_win_asm.inc" +%endif +section .text code align=64 + + +global bn_mul_mont_gather5_nohw + +ALIGN 64 +bn_mul_mont_gather5_nohw: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_bn_mul_mont_gather5_nohw: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +_CET_ENDBR + + + mov r9d,r9d + mov rax,rsp + + movd xmm5,DWORD[56+rsp] + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + + neg r9 + mov r11,rsp + lea r10,[((-280))+r9*8+rsp] + neg r9 + and r10,-1024 + + + + + + + + + + sub r11,r10 + and r11,-4096 + lea rsp,[r11*1+r10] + mov r11,QWORD[rsp] + cmp rsp,r10 + ja NEAR $L$mul_page_walk + jmp NEAR $L$mul_page_walk_done + +$L$mul_page_walk: + lea rsp,[((-4096))+rsp] + mov r11,QWORD[rsp] + cmp rsp,r10 + ja NEAR $L$mul_page_walk +$L$mul_page_walk_done: + + lea r10,[$L$inc] + mov QWORD[8+r9*8+rsp],rax + +$L$mul_body: + + lea r12,[128+rdx] + movdqa xmm0,XMMWORD[r10] + movdqa xmm1,XMMWORD[16+r10] + lea r10,[((24-112))+r9*8+rsp] + and r10,-16 + + pshufd xmm5,xmm5,0 + movdqa xmm4,xmm1 + movdqa xmm2,xmm1 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + DB 0x67 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[112+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[128+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[144+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[160+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa 
XMMWORD[176+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[192+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[208+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[224+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[240+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[256+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[272+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[288+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[304+r10],xmm0 + + paddd xmm3,xmm2 + DB 0x67 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[320+r10],xmm1 + + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[336+r10],xmm2 + pand xmm0,XMMWORD[64+r12] + + pand xmm1,XMMWORD[80+r12] + pand xmm2,XMMWORD[96+r12] + movdqa XMMWORD[352+r10],xmm3 + pand xmm3,XMMWORD[112+r12] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD[((-128))+r12] + movdqa xmm5,XMMWORD[((-112))+r12] + movdqa xmm2,XMMWORD[((-96))+r12] + pand xmm4,XMMWORD[112+r10] + movdqa xmm3,XMMWORD[((-80))+r12] + pand xmm5,XMMWORD[128+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD[144+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD[160+r10] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD[((-64))+r12] + movdqa xmm5,XMMWORD[((-48))+r12] + movdqa xmm2,XMMWORD[((-32))+r12] + pand xmm4,XMMWORD[176+r10] + movdqa xmm3,XMMWORD[((-16))+r12] + pand xmm5,XMMWORD[192+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD[208+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD[224+r10] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD[r12] + movdqa xmm5,XMMWORD[16+r12] + movdqa xmm2,XMMWORD[32+r12] + pand xmm4,XMMWORD[240+r10] + movdqa xmm3,XMMWORD[48+r12] + pand xmm5,XMMWORD[256+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD[272+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD[288+r10] + por xmm0,xmm2 + por xmm1,xmm3 + por 
xmm0,xmm1 + + pshufd xmm1,xmm0,0x4e + por xmm0,xmm1 + lea r12,[256+r12] + movq rbx,xmm0 + + mov r8,QWORD[r8] + mov rax,QWORD[rsi] + + xor r14,r14 + xor r15,r15 + + mov rbp,r8 + mul rbx + mov r10,rax + mov rax,QWORD[rcx] + + imul rbp,r10 + mov r11,rdx + + mul rbp + add r10,rax + mov rax,QWORD[8+rsi] + adc rdx,0 + mov r13,rdx + + lea r15,[1+r15] + jmp NEAR $L$1st_enter + +ALIGN 16 +$L$1st: + add r13,rax + mov rax,QWORD[r15*8+rsi] + adc rdx,0 + add r13,r11 + mov r11,r10 + adc rdx,0 + mov QWORD[((-16))+r15*8+rsp],r13 + mov r13,rdx + +$L$1st_enter: + mul rbx + add r11,rax + mov rax,QWORD[r15*8+rcx] + adc rdx,0 + lea r15,[1+r15] + mov r10,rdx + + mul rbp + cmp r15,r9 + jne NEAR $L$1st + + + add r13,rax + adc rdx,0 + add r13,r11 + adc rdx,0 + mov QWORD[((-16))+r9*8+rsp],r13 + mov r13,rdx + mov r11,r10 + + xor rdx,rdx + add r13,r11 + adc rdx,0 + mov QWORD[((-8))+r9*8+rsp],r13 + mov QWORD[r9*8+rsp],rdx + + lea r14,[1+r14] + jmp NEAR $L$outer +ALIGN 16 +$L$outer: + lea rdx,[((24+128))+r9*8+rsp] + and rdx,-16 + pxor xmm4,xmm4 + pxor xmm5,xmm5 + movdqa xmm0,XMMWORD[((-128))+r12] + movdqa xmm1,XMMWORD[((-112))+r12] + movdqa xmm2,XMMWORD[((-96))+r12] + movdqa xmm3,XMMWORD[((-80))+r12] + pand xmm0,XMMWORD[((-128))+rdx] + pand xmm1,XMMWORD[((-112))+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD[((-96))+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD[((-80))+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[((-64))+r12] + movdqa xmm1,XMMWORD[((-48))+r12] + movdqa xmm2,XMMWORD[((-32))+r12] + movdqa xmm3,XMMWORD[((-16))+r12] + pand xmm0,XMMWORD[((-64))+rdx] + pand xmm1,XMMWORD[((-48))+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD[((-32))+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD[((-16))+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[r12] + movdqa xmm1,XMMWORD[16+r12] + movdqa xmm2,XMMWORD[32+r12] + movdqa xmm3,XMMWORD[48+r12] + pand xmm0,XMMWORD[rdx] + pand xmm1,XMMWORD[16+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD[32+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD[48+rdx] + por xmm4,xmm2 + 
por xmm5,xmm3 + movdqa xmm0,XMMWORD[64+r12] + movdqa xmm1,XMMWORD[80+r12] + movdqa xmm2,XMMWORD[96+r12] + movdqa xmm3,XMMWORD[112+r12] + pand xmm0,XMMWORD[64+rdx] + pand xmm1,XMMWORD[80+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD[96+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD[112+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + por xmm4,xmm5 + + pshufd xmm0,xmm4,0x4e + por xmm0,xmm4 + lea r12,[256+r12] + + mov rax,QWORD[rsi] + movq rbx,xmm0 + + xor r15,r15 + mov rbp,r8 + mov r10,QWORD[rsp] + + mul rbx + add r10,rax + mov rax,QWORD[rcx] + adc rdx,0 + + imul rbp,r10 + mov r11,rdx + + mul rbp + add r10,rax + mov rax,QWORD[8+rsi] + adc rdx,0 + mov r10,QWORD[8+rsp] + mov r13,rdx + + lea r15,[1+r15] + jmp NEAR $L$inner_enter + +ALIGN 16 +$L$inner: + add r13,rax + mov rax,QWORD[r15*8+rsi] + adc rdx,0 + add r13,r10 + mov r10,QWORD[r15*8+rsp] + adc rdx,0 + mov QWORD[((-16))+r15*8+rsp],r13 + mov r13,rdx + +$L$inner_enter: + mul rbx + add r11,rax + mov rax,QWORD[r15*8+rcx] + adc rdx,0 + add r10,r11 + mov r11,rdx + adc r11,0 + lea r15,[1+r15] + + mul rbp + cmp r15,r9 + jne NEAR $L$inner + + add r13,rax + adc rdx,0 + add r13,r10 + mov r10,QWORD[r9*8+rsp] + adc rdx,0 + mov QWORD[((-16))+r9*8+rsp],r13 + mov r13,rdx + + xor rdx,rdx + add r13,r11 + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-8))+r9*8+rsp],r13 + mov QWORD[r9*8+rsp],rdx + + lea r14,[1+r14] + cmp r14,r9 + jb NEAR $L$outer + + xor r14,r14 + mov rax,QWORD[rsp] + lea rsi,[rsp] + mov r15,r9 + jmp NEAR $L$sub +ALIGN 16 +$L$sub: sbb rax,QWORD[r14*8+rcx] + mov QWORD[r14*8+rdi],rax + mov rax,QWORD[8+r14*8+rsi] + lea r14,[1+r14] + dec r15 + jnz NEAR $L$sub + + sbb rax,0 + mov rbx,-1 + xor rbx,rax + xor r14,r14 + mov r15,r9 + +$L$copy: + mov rcx,QWORD[r14*8+rdi] + mov rdx,QWORD[r14*8+rsp] + and rcx,rbx + and rdx,rax + mov QWORD[r14*8+rsp],r14 + or rdx,rcx + mov QWORD[r14*8+rdi],rdx + lea r14,[1+r14] + sub r15,1 + jnz NEAR $L$copy + + mov rsi,QWORD[8+r9*8+rsp] + + mov rax,1 + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + 
+ mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$mul_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_bn_mul_mont_gather5_nohw: +global bn_mul4x_mont_gather5 + +ALIGN 32 +bn_mul4x_mont_gather5: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_bn_mul4x_mont_gather5: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +_CET_ENDBR + DB 0x67 + mov rax,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + +$L$mul4x_prologue: + + DB 0x67 + + + + shl r9d,3 + lea r10,[r9*2+r9] + neg r9 + + + + + + + + + + + lea r11,[((-320))+r9*2+rsp] + mov rbp,rsp + sub r11,rdi + and r11,4095 + cmp r10,r11 + jb NEAR $L$mul4xsp_alt + sub rbp,r11 + lea rbp,[((-320))+r9*2+rbp] + jmp NEAR $L$mul4xsp_done + +ALIGN 32 +$L$mul4xsp_alt: + lea r10,[((4096-320))+r9*2] + lea rbp,[((-320))+r9*2+rbp] + sub r11,r10 + mov r10,0 + cmovc r11,r10 + sub rbp,r11 +$L$mul4xsp_done: + and rbp,-64 + mov r11,rsp + sub r11,rbp + and r11,-4096 + lea rsp,[rbp*1+r11] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$mul4x_page_walk + jmp NEAR $L$mul4x_page_walk_done + +$L$mul4x_page_walk: + lea rsp,[((-4096))+rsp] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$mul4x_page_walk +$L$mul4x_page_walk_done: + + neg r9 + + mov QWORD[40+rsp],rax + +$L$mul4x_body: + + call mul4x_internal + + mov rsi,QWORD[40+rsp] + + mov rax,1 + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$mul4x_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_bn_mul4x_mont_gather5: + + +ALIGN 32 +mul4x_internal: + + shl r9,5 + movd xmm5,DWORD[56+rax] + lea rax,[$L$inc] + lea r13,[128+r9*1+rdx] + 
shr r9,5 + movdqa xmm0,XMMWORD[rax] + movdqa xmm1,XMMWORD[16+rax] + lea r10,[((88-112))+r9*1+rsp] + lea r12,[128+rdx] + + pshufd xmm5,xmm5,0 + movdqa xmm4,xmm1 + DB 0x67,0x67 + movdqa xmm2,xmm1 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + DB 0x67 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[112+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[128+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[144+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[160+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[176+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[192+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[208+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[224+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[240+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[256+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[272+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[288+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[304+r10],xmm0 + + paddd xmm3,xmm2 + DB 0x67 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[320+r10],xmm1 + + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[336+r10],xmm2 + pand xmm0,XMMWORD[64+r12] + + pand xmm1,XMMWORD[80+r12] + pand xmm2,XMMWORD[96+r12] + movdqa XMMWORD[352+r10],xmm3 + pand xmm3,XMMWORD[112+r12] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD[((-128))+r12] + movdqa xmm5,XMMWORD[((-112))+r12] + movdqa xmm2,XMMWORD[((-96))+r12] + pand xmm4,XMMWORD[112+r10] + movdqa xmm3,XMMWORD[((-80))+r12] + pand xmm5,XMMWORD[128+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD[144+r10] + por xmm1,xmm5 + pand 
xmm3,XMMWORD[160+r10] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD[((-64))+r12] + movdqa xmm5,XMMWORD[((-48))+r12] + movdqa xmm2,XMMWORD[((-32))+r12] + pand xmm4,XMMWORD[176+r10] + movdqa xmm3,XMMWORD[((-16))+r12] + pand xmm5,XMMWORD[192+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD[208+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD[224+r10] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD[r12] + movdqa xmm5,XMMWORD[16+r12] + movdqa xmm2,XMMWORD[32+r12] + pand xmm4,XMMWORD[240+r10] + movdqa xmm3,XMMWORD[48+r12] + pand xmm5,XMMWORD[256+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD[272+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD[288+r10] + por xmm0,xmm2 + por xmm1,xmm3 + por xmm0,xmm1 + + pshufd xmm1,xmm0,0x4e + por xmm0,xmm1 + lea r12,[256+r12] + movq rbx,xmm0 + + mov QWORD[((16+8))+rsp],r13 + mov QWORD[((56+8))+rsp],rdi + + mov r8,QWORD[r8] + mov rax,QWORD[rsi] + lea rsi,[r9*1+rsi] + neg r9 + + mov rbp,r8 + mul rbx + mov r10,rax + mov rax,QWORD[rcx] + + imul rbp,r10 + lea r14,[((64+8))+rsp] + mov r11,rdx + + mul rbp + add r10,rax + mov rax,QWORD[8+r9*1+rsi] + adc rdx,0 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[8+rcx] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[16+r9*1+rsi] + adc rdx,0 + add rdi,r11 + lea r15,[32+r9] + lea rcx,[32+rcx] + adc rdx,0 + mov QWORD[r14],rdi + mov r13,rdx + jmp NEAR $L$1st4x + +ALIGN 32 +$L$1st4x: + mul rbx + add r10,rax + mov rax,QWORD[((-16))+rcx] + lea r14,[32+r14] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[((-8))+r15*1+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-24))+r14],r13 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[((-8))+rcx] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[r15*1+rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD[((-16))+r14],rdi + mov r13,rdx + + mul rbx + add r10,rax + mov rax,QWORD[rcx] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[8+r15*1+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + 
mov QWORD[((-8))+r14],r13 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[8+rcx] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[16+r15*1+rsi] + adc rdx,0 + add rdi,r11 + lea rcx,[32+rcx] + adc rdx,0 + mov QWORD[r14],rdi + mov r13,rdx + + add r15,32 + jnz NEAR $L$1st4x + + mul rbx + add r10,rax + mov rax,QWORD[((-16))+rcx] + lea r14,[32+r14] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[((-8))+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-24))+r14],r13 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[((-8))+rcx] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[r9*1+rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD[((-16))+r14],rdi + mov r13,rdx + + lea rcx,[r9*1+rcx] + + xor rdi,rdi + add r13,r10 + adc rdi,0 + mov QWORD[((-8))+r14],r13 + + jmp NEAR $L$outer4x + +ALIGN 32 +$L$outer4x: + lea rdx,[((16+128))+r14] + pxor xmm4,xmm4 + pxor xmm5,xmm5 + movdqa xmm0,XMMWORD[((-128))+r12] + movdqa xmm1,XMMWORD[((-112))+r12] + movdqa xmm2,XMMWORD[((-96))+r12] + movdqa xmm3,XMMWORD[((-80))+r12] + pand xmm0,XMMWORD[((-128))+rdx] + pand xmm1,XMMWORD[((-112))+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD[((-96))+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD[((-80))+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[((-64))+r12] + movdqa xmm1,XMMWORD[((-48))+r12] + movdqa xmm2,XMMWORD[((-32))+r12] + movdqa xmm3,XMMWORD[((-16))+r12] + pand xmm0,XMMWORD[((-64))+rdx] + pand xmm1,XMMWORD[((-48))+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD[((-32))+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD[((-16))+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[r12] + movdqa xmm1,XMMWORD[16+r12] + movdqa xmm2,XMMWORD[32+r12] + movdqa xmm3,XMMWORD[48+r12] + pand xmm0,XMMWORD[rdx] + pand xmm1,XMMWORD[16+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD[32+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD[48+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[64+r12] + movdqa xmm1,XMMWORD[80+r12] + movdqa 
xmm2,XMMWORD[96+r12] + movdqa xmm3,XMMWORD[112+r12] + pand xmm0,XMMWORD[64+rdx] + pand xmm1,XMMWORD[80+rdx] + por xmm4,xmm0 + pand xmm2,XMMWORD[96+rdx] + por xmm5,xmm1 + pand xmm3,XMMWORD[112+rdx] + por xmm4,xmm2 + por xmm5,xmm3 + por xmm4,xmm5 + + pshufd xmm0,xmm4,0x4e + por xmm0,xmm4 + lea r12,[256+r12] + movq rbx,xmm0 + + mov r10,QWORD[r9*1+r14] + mov rbp,r8 + mul rbx + add r10,rax + mov rax,QWORD[rcx] + adc rdx,0 + + imul rbp,r10 + mov r11,rdx + mov QWORD[r14],rdi + + lea r14,[r9*1+r14] + + mul rbp + add r10,rax + mov rax,QWORD[8+r9*1+rsi] + adc rdx,0 + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[8+rcx] + adc rdx,0 + add r11,QWORD[8+r14] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[16+r9*1+rsi] + adc rdx,0 + add rdi,r11 + lea r15,[32+r9] + lea rcx,[32+rcx] + adc rdx,0 + mov r13,rdx + jmp NEAR $L$inner4x + +ALIGN 32 +$L$inner4x: + mul rbx + add r10,rax + mov rax,QWORD[((-16))+rcx] + adc rdx,0 + add r10,QWORD[16+r14] + lea r14,[32+r14] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[((-8))+r15*1+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-32))+r14],rdi + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[((-8))+rcx] + adc rdx,0 + add r11,QWORD[((-8))+r14] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[r15*1+rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD[((-24))+r14],r13 + mov r13,rdx + + mul rbx + add r10,rax + mov rax,QWORD[rcx] + adc rdx,0 + add r10,QWORD[r14] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[8+r15*1+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-16))+r14],rdi + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,QWORD[8+rcx] + adc rdx,0 + add r11,QWORD[8+r14] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[16+r15*1+rsi] + adc rdx,0 + add rdi,r11 + lea rcx,[32+rcx] + adc rdx,0 + mov QWORD[((-8))+r14],r13 + mov r13,rdx + + add r15,32 + jnz NEAR $L$inner4x + + mul rbx + add r10,rax + mov 
rax,QWORD[((-16))+rcx] + adc rdx,0 + add r10,QWORD[16+r14] + lea r14,[32+r14] + adc rdx,0 + mov r11,rdx + + mul rbp + add r13,rax + mov rax,QWORD[((-8))+rsi] + adc rdx,0 + add r13,r10 + adc rdx,0 + mov QWORD[((-32))+r14],rdi + mov rdi,rdx + + mul rbx + add r11,rax + mov rax,rbp + mov rbp,QWORD[((-8))+rcx] + adc rdx,0 + add r11,QWORD[((-8))+r14] + adc rdx,0 + mov r10,rdx + + mul rbp + add rdi,rax + mov rax,QWORD[r9*1+rsi] + adc rdx,0 + add rdi,r11 + adc rdx,0 + mov QWORD[((-24))+r14],r13 + mov r13,rdx + + mov QWORD[((-16))+r14],rdi + lea rcx,[r9*1+rcx] + + xor rdi,rdi + add r13,r10 + adc rdi,0 + add r13,QWORD[r14] + adc rdi,0 + mov QWORD[((-8))+r14],r13 + + cmp r12,QWORD[((16+8))+rsp] + jb NEAR $L$outer4x + xor rax,rax + sub rbp,r13 + adc r15,r15 + or rdi,r15 + sub rax,rdi + lea rbx,[r9*1+r14] + mov r12,QWORD[rcx] + lea rbp,[rcx] + mov rcx,r9 + sar rcx,3+2 + mov rdi,QWORD[((56+8))+rsp] + dec r12 + xor r10,r10 + mov r13,QWORD[8+rbp] + mov r14,QWORD[16+rbp] + mov r15,QWORD[24+rbp] + jmp NEAR $L$sqr4x_sub_entry + + +global bn_power5_nohw + +ALIGN 32 +bn_power5_nohw: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_bn_power5_nohw: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +_CET_ENDBR + mov rax,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + +$L$power5_prologue: + + + + + shl r9d,3 + lea r10d,[r9*2+r9] + neg r9 + mov r8,QWORD[r8] + + + + + + + + + lea r11,[((-320))+r9*2+rsp] + mov rbp,rsp + sub r11,rdi + and r11,4095 + cmp r10,r11 + jb NEAR $L$pwr_sp_alt + sub rbp,r11 + lea rbp,[((-320))+r9*2+rbp] + jmp NEAR $L$pwr_sp_done + +ALIGN 32 +$L$pwr_sp_alt: + lea r10,[((4096-320))+r9*2] + lea rbp,[((-320))+r9*2+rbp] + sub r11,r10 + mov r10,0 + cmovc r11,r10 + sub rbp,r11 +$L$pwr_sp_done: + and rbp,-64 + mov r11,rsp + sub r11,rbp + and r11,-4096 + lea rsp,[rbp*1+r11] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$pwr_page_walk + jmp NEAR 
$L$pwr_page_walk_done + +$L$pwr_page_walk: + lea rsp,[((-4096))+rsp] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$pwr_page_walk +$L$pwr_page_walk_done: + + mov r10,r9 + neg r9 + + + + + + + + + + + mov QWORD[32+rsp],r8 + mov QWORD[40+rsp],rax + +$L$power5_body: + movq xmm1,rdi + movq xmm2,rcx + movq xmm3,r10 + movq xmm4,rdx + + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + call __bn_sqr8x_internal + call __bn_post4x_internal + + movq rcx,xmm2 + movq rdx,xmm4 + mov rdi,rsi + mov rax,QWORD[40+rsp] + lea r8,[32+rsp] + + call mul4x_internal + + mov rsi,QWORD[40+rsp] + + mov rax,1 + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$power5_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_bn_power5_nohw: + +global bn_sqr8x_internal + + +ALIGN 32 +bn_sqr8x_internal: +__bn_sqr8x_internal: + +_CET_ENDBR + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + lea rbp,[32+r10] + lea rsi,[r9*1+rsi] + + mov rcx,r9 + + + mov r14,QWORD[((-32))+rbp*1+rsi] + lea rdi,[((48+8))+r9*2+rsp] + mov rax,QWORD[((-24))+rbp*1+rsi] + lea rdi,[((-32))+rbp*1+rdi] + mov rbx,QWORD[((-16))+rbp*1+rsi] + mov r15,rax + + mul r14 + mov r10,rax + mov rax,rbx + mov r11,rdx + mov QWORD[((-24))+rbp*1+rdi],r10 + + mul r14 + add r11,rax + mov rax,rbx + adc rdx,0 + mov QWORD[((-16))+rbp*1+rdi],r11 + mov r10,rdx + + + mov rbx,QWORD[((-8))+rbp*1+rsi] + mul r15 + mov r12,rax + mov rax,rbx + mov r13,rdx + + lea rcx,[rbp] + mul r14 + add r10,rax + mov rax,rbx + mov r11,rdx + adc r11,0 + add r10,r12 + adc r11,0 + mov QWORD[((-8))+rcx*1+rdi],r10 + jmp NEAR 
$L$sqr4x_1st + +ALIGN 32 +$L$sqr4x_1st: + mov rbx,QWORD[rcx*1+rsi] + mul r15 + add r13,rax + mov rax,rbx + mov r12,rdx + adc r12,0 + + mul r14 + add r11,rax + mov rax,rbx + mov rbx,QWORD[8+rcx*1+rsi] + mov r10,rdx + adc r10,0 + add r11,r13 + adc r10,0 + + + mul r15 + add r12,rax + mov rax,rbx + mov QWORD[rcx*1+rdi],r11 + mov r13,rdx + adc r13,0 + + mul r14 + add r10,rax + mov rax,rbx + mov rbx,QWORD[16+rcx*1+rsi] + mov r11,rdx + adc r11,0 + add r10,r12 + adc r11,0 + + mul r15 + add r13,rax + mov rax,rbx + mov QWORD[8+rcx*1+rdi],r10 + mov r12,rdx + adc r12,0 + + mul r14 + add r11,rax + mov rax,rbx + mov rbx,QWORD[24+rcx*1+rsi] + mov r10,rdx + adc r10,0 + add r11,r13 + adc r10,0 + + + mul r15 + add r12,rax + mov rax,rbx + mov QWORD[16+rcx*1+rdi],r11 + mov r13,rdx + adc r13,0 + lea rcx,[32+rcx] + + mul r14 + add r10,rax + mov rax,rbx + mov r11,rdx + adc r11,0 + add r10,r12 + adc r11,0 + mov QWORD[((-8))+rcx*1+rdi],r10 + + cmp rcx,0 + jne NEAR $L$sqr4x_1st + + mul r15 + add r13,rax + lea rbp,[16+rbp] + adc rdx,0 + add r13,r11 + adc rdx,0 + + mov QWORD[rdi],r13 + mov r12,rdx + mov QWORD[8+rdi],rdx + jmp NEAR $L$sqr4x_outer + +ALIGN 32 +$L$sqr4x_outer: + mov r14,QWORD[((-32))+rbp*1+rsi] + lea rdi,[((48+8))+r9*2+rsp] + mov rax,QWORD[((-24))+rbp*1+rsi] + lea rdi,[((-32))+rbp*1+rdi] + mov rbx,QWORD[((-16))+rbp*1+rsi] + mov r15,rax + + mul r14 + mov r10,QWORD[((-24))+rbp*1+rdi] + add r10,rax + mov rax,rbx + adc rdx,0 + mov QWORD[((-24))+rbp*1+rdi],r10 + mov r11,rdx + + mul r14 + add r11,rax + mov rax,rbx + adc rdx,0 + add r11,QWORD[((-16))+rbp*1+rdi] + mov r10,rdx + adc r10,0 + mov QWORD[((-16))+rbp*1+rdi],r11 + + xor r12,r12 + + mov rbx,QWORD[((-8))+rbp*1+rsi] + mul r15 + add r12,rax + mov rax,rbx + adc rdx,0 + add r12,QWORD[((-8))+rbp*1+rdi] + mov r13,rdx + adc r13,0 + + mul r14 + add r10,rax + mov rax,rbx + adc rdx,0 + add r10,r12 + mov r11,rdx + adc r11,0 + mov QWORD[((-8))+rbp*1+rdi],r10 + + lea rcx,[rbp] + jmp NEAR $L$sqr4x_inner + +ALIGN 32 +$L$sqr4x_inner: + mov 
rbx,QWORD[rcx*1+rsi] + mul r15 + add r13,rax + mov rax,rbx + mov r12,rdx + adc r12,0 + add r13,QWORD[rcx*1+rdi] + adc r12,0 + + DB 0x67 + mul r14 + add r11,rax + mov rax,rbx + mov rbx,QWORD[8+rcx*1+rsi] + mov r10,rdx + adc r10,0 + add r11,r13 + adc r10,0 + + mul r15 + add r12,rax + mov QWORD[rcx*1+rdi],r11 + mov rax,rbx + mov r13,rdx + adc r13,0 + add r12,QWORD[8+rcx*1+rdi] + lea rcx,[16+rcx] + adc r13,0 + + mul r14 + add r10,rax + mov rax,rbx + adc rdx,0 + add r10,r12 + mov r11,rdx + adc r11,0 + mov QWORD[((-8))+rcx*1+rdi],r10 + + cmp rcx,0 + jne NEAR $L$sqr4x_inner + + DB 0x67 + mul r15 + add r13,rax + adc rdx,0 + add r13,r11 + adc rdx,0 + + mov QWORD[rdi],r13 + mov r12,rdx + mov QWORD[8+rdi],rdx + + add rbp,16 + jnz NEAR $L$sqr4x_outer + + + mov r14,QWORD[((-32))+rsi] + lea rdi,[((48+8))+r9*2+rsp] + mov rax,QWORD[((-24))+rsi] + lea rdi,[((-32))+rbp*1+rdi] + mov rbx,QWORD[((-16))+rsi] + mov r15,rax + + mul r14 + add r10,rax + mov rax,rbx + mov r11,rdx + adc r11,0 + + mul r14 + add r11,rax + mov rax,rbx + mov QWORD[((-24))+rdi],r10 + mov r10,rdx + adc r10,0 + add r11,r13 + mov rbx,QWORD[((-8))+rsi] + adc r10,0 + + mul r15 + add r12,rax + mov rax,rbx + mov QWORD[((-16))+rdi],r11 + mov r13,rdx + adc r13,0 + + mul r14 + add r10,rax + mov rax,rbx + mov r11,rdx + adc r11,0 + add r10,r12 + adc r11,0 + mov QWORD[((-8))+rdi],r10 + + mul r15 + add r13,rax + mov rax,QWORD[((-16))+rsi] + adc rdx,0 + add r13,r11 + adc rdx,0 + + mov QWORD[rdi],r13 + mov r12,rdx + mov QWORD[8+rdi],rdx + + mul rbx + add rbp,16 + xor r14,r14 + sub rbp,r9 + xor r15,r15 + + add rax,r12 + adc rdx,0 + mov QWORD[8+rdi],rax + mov QWORD[16+rdi],rdx + mov QWORD[24+rdi],r15 + + mov rax,QWORD[((-16))+rbp*1+rsi] + lea rdi,[((48+8))+rsp] + xor r10,r10 + mov r11,QWORD[8+rdi] + + lea r12,[r10*2+r14] + shr r10,63 + lea r13,[r11*2+rcx] + shr r11,63 + or r13,r10 + mov r10,QWORD[16+rdi] + mov r14,r11 + mul rax + neg r15 + mov r11,QWORD[24+rdi] + adc r12,rax + mov rax,QWORD[((-8))+rbp*1+rsi] + mov QWORD[rdi],r12 + 
adc r13,rdx + + lea rbx,[r10*2+r14] + mov QWORD[8+rdi],r13 + sbb r15,r15 + shr r10,63 + lea r8,[r11*2+rcx] + shr r11,63 + or r8,r10 + mov r10,QWORD[32+rdi] + mov r14,r11 + mul rax + neg r15 + mov r11,QWORD[40+rdi] + adc rbx,rax + mov rax,QWORD[rbp*1+rsi] + mov QWORD[16+rdi],rbx + adc r8,rdx + lea rbp,[16+rbp] + mov QWORD[24+rdi],r8 + sbb r15,r15 + lea rdi,[64+rdi] + jmp NEAR $L$sqr4x_shift_n_add + +ALIGN 32 +$L$sqr4x_shift_n_add: + lea r12,[r10*2+r14] + shr r10,63 + lea r13,[r11*2+rcx] + shr r11,63 + or r13,r10 + mov r10,QWORD[((-16))+rdi] + mov r14,r11 + mul rax + neg r15 + mov r11,QWORD[((-8))+rdi] + adc r12,rax + mov rax,QWORD[((-8))+rbp*1+rsi] + mov QWORD[((-32))+rdi],r12 + adc r13,rdx + + lea rbx,[r10*2+r14] + mov QWORD[((-24))+rdi],r13 + sbb r15,r15 + shr r10,63 + lea r8,[r11*2+rcx] + shr r11,63 + or r8,r10 + mov r10,QWORD[rdi] + mov r14,r11 + mul rax + neg r15 + mov r11,QWORD[8+rdi] + adc rbx,rax + mov rax,QWORD[rbp*1+rsi] + mov QWORD[((-16))+rdi],rbx + adc r8,rdx + + lea r12,[r10*2+r14] + mov QWORD[((-8))+rdi],r8 + sbb r15,r15 + shr r10,63 + lea r13,[r11*2+rcx] + shr r11,63 + or r13,r10 + mov r10,QWORD[16+rdi] + mov r14,r11 + mul rax + neg r15 + mov r11,QWORD[24+rdi] + adc r12,rax + mov rax,QWORD[8+rbp*1+rsi] + mov QWORD[rdi],r12 + adc r13,rdx + + lea rbx,[r10*2+r14] + mov QWORD[8+rdi],r13 + sbb r15,r15 + shr r10,63 + lea r8,[r11*2+rcx] + shr r11,63 + or r8,r10 + mov r10,QWORD[32+rdi] + mov r14,r11 + mul rax + neg r15 + mov r11,QWORD[40+rdi] + adc rbx,rax + mov rax,QWORD[16+rbp*1+rsi] + mov QWORD[16+rdi],rbx + adc r8,rdx + mov QWORD[24+rdi],r8 + sbb r15,r15 + lea rdi,[64+rdi] + add rbp,32 + jnz NEAR $L$sqr4x_shift_n_add + + lea r12,[r10*2+r14] + DB 0x67 + shr r10,63 + lea r13,[r11*2+rcx] + shr r11,63 + or r13,r10 + mov r10,QWORD[((-16))+rdi] + mov r14,r11 + mul rax + neg r15 + mov r11,QWORD[((-8))+rdi] + adc r12,rax + mov rax,QWORD[((-8))+rsi] + mov QWORD[((-32))+rdi],r12 + adc r13,rdx + + lea rbx,[r10*2+r14] + mov QWORD[((-24))+rdi],r13 + sbb r15,r15 + shr 
r10,63 + lea r8,[r11*2+rcx] + shr r11,63 + or r8,r10 + mul rax + neg r15 + adc rbx,rax + adc r8,rdx + mov QWORD[((-16))+rdi],rbx + mov QWORD[((-8))+rdi],r8 + movq rbp,xmm2 +__bn_sqr8x_reduction: + xor rax,rax + lea rcx,[rbp*1+r9] + lea rdx,[((48+8))+r9*2+rsp] + mov QWORD[((0+8))+rsp],rcx + lea rdi,[((48+8))+r9*1+rsp] + mov QWORD[((8+8))+rsp],rdx + neg r9 + jmp NEAR $L$8x_reduction_loop + +ALIGN 32 +$L$8x_reduction_loop: + lea rdi,[r9*1+rdi] + DB 0x66 + mov rbx,QWORD[rdi] + mov r9,QWORD[8+rdi] + mov r10,QWORD[16+rdi] + mov r11,QWORD[24+rdi] + mov r12,QWORD[32+rdi] + mov r13,QWORD[40+rdi] + mov r14,QWORD[48+rdi] + mov r15,QWORD[56+rdi] + mov QWORD[rdx],rax + lea rdi,[64+rdi] + + DB 0x67 + mov r8,rbx + imul rbx,QWORD[((32+8))+rsp] + mov rax,QWORD[rbp] + mov ecx,8 + jmp NEAR $L$8x_reduce + +ALIGN 32 +$L$8x_reduce: + mul rbx + mov rax,QWORD[8+rbp] + neg r8 + mov r8,rdx + adc r8,0 + + mul rbx + add r9,rax + mov rax,QWORD[16+rbp] + adc rdx,0 + add r8,r9 + mov QWORD[((48-8+8))+rcx*8+rsp],rbx + mov r9,rdx + adc r9,0 + + mul rbx + add r10,rax + mov rax,QWORD[24+rbp] + adc rdx,0 + add r9,r10 + mov rsi,QWORD[((32+8))+rsp] + mov r10,rdx + adc r10,0 + + mul rbx + add r11,rax + mov rax,QWORD[32+rbp] + adc rdx,0 + imul rsi,r8 + add r10,r11 + mov r11,rdx + adc r11,0 + + mul rbx + add r12,rax + mov rax,QWORD[40+rbp] + adc rdx,0 + add r11,r12 + mov r12,rdx + adc r12,0 + + mul rbx + add r13,rax + mov rax,QWORD[48+rbp] + adc rdx,0 + add r12,r13 + mov r13,rdx + adc r13,0 + + mul rbx + add r14,rax + mov rax,QWORD[56+rbp] + adc rdx,0 + add r13,r14 + mov r14,rdx + adc r14,0 + + mul rbx + mov rbx,rsi + add r15,rax + mov rax,QWORD[rbp] + adc rdx,0 + add r14,r15 + mov r15,rdx + adc r15,0 + + dec ecx + jnz NEAR $L$8x_reduce + + lea rbp,[64+rbp] + xor rax,rax + mov rdx,QWORD[((8+8))+rsp] + cmp rbp,QWORD[((0+8))+rsp] + jae NEAR $L$8x_no_tail + + DB 0x66 + add r8,QWORD[rdi] + adc r9,QWORD[8+rdi] + adc r10,QWORD[16+rdi] + adc r11,QWORD[24+rdi] + adc r12,QWORD[32+rdi] + adc r13,QWORD[40+rdi] + adc 
r14,QWORD[48+rdi] + adc r15,QWORD[56+rdi] + sbb rsi,rsi + + mov rbx,QWORD[((48+56+8))+rsp] + mov ecx,8 + mov rax,QWORD[rbp] + jmp NEAR $L$8x_tail + +ALIGN 32 +$L$8x_tail: + mul rbx + add r8,rax + mov rax,QWORD[8+rbp] + mov QWORD[rdi],r8 + mov r8,rdx + adc r8,0 + + mul rbx + add r9,rax + mov rax,QWORD[16+rbp] + adc rdx,0 + add r8,r9 + lea rdi,[8+rdi] + mov r9,rdx + adc r9,0 + + mul rbx + add r10,rax + mov rax,QWORD[24+rbp] + adc rdx,0 + add r9,r10 + mov r10,rdx + adc r10,0 + + mul rbx + add r11,rax + mov rax,QWORD[32+rbp] + adc rdx,0 + add r10,r11 + mov r11,rdx + adc r11,0 + + mul rbx + add r12,rax + mov rax,QWORD[40+rbp] + adc rdx,0 + add r11,r12 + mov r12,rdx + adc r12,0 + + mul rbx + add r13,rax + mov rax,QWORD[48+rbp] + adc rdx,0 + add r12,r13 + mov r13,rdx + adc r13,0 + + mul rbx + add r14,rax + mov rax,QWORD[56+rbp] + adc rdx,0 + add r13,r14 + mov r14,rdx + adc r14,0 + + mul rbx + mov rbx,QWORD[((48-16+8))+rcx*8+rsp] + add r15,rax + adc rdx,0 + add r14,r15 + mov rax,QWORD[rbp] + mov r15,rdx + adc r15,0 + + dec ecx + jnz NEAR $L$8x_tail + + lea rbp,[64+rbp] + mov rdx,QWORD[((8+8))+rsp] + cmp rbp,QWORD[((0+8))+rsp] + jae NEAR $L$8x_tail_done + + mov rbx,QWORD[((48+56+8))+rsp] + neg rsi + mov rax,QWORD[rbp] + adc r8,QWORD[rdi] + adc r9,QWORD[8+rdi] + adc r10,QWORD[16+rdi] + adc r11,QWORD[24+rdi] + adc r12,QWORD[32+rdi] + adc r13,QWORD[40+rdi] + adc r14,QWORD[48+rdi] + adc r15,QWORD[56+rdi] + sbb rsi,rsi + + mov ecx,8 + jmp NEAR $L$8x_tail + +ALIGN 32 +$L$8x_tail_done: + xor rax,rax + add r8,QWORD[rdx] + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + adc r14,0 + adc r15,0 + adc rax,0 + + neg rsi +$L$8x_no_tail: + adc r8,QWORD[rdi] + adc r9,QWORD[8+rdi] + adc r10,QWORD[16+rdi] + adc r11,QWORD[24+rdi] + adc r12,QWORD[32+rdi] + adc r13,QWORD[40+rdi] + adc r14,QWORD[48+rdi] + adc r15,QWORD[56+rdi] + adc rax,0 + mov rcx,QWORD[((-8))+rbp] + xor rsi,rsi + + movq rbp,xmm2 + + mov QWORD[rdi],r8 + mov QWORD[8+rdi],r9 + movq r9,xmm3 + mov QWORD[16+rdi],r10 + mov 
QWORD[24+rdi],r11 + mov QWORD[32+rdi],r12 + mov QWORD[40+rdi],r13 + mov QWORD[48+rdi],r14 + mov QWORD[56+rdi],r15 + lea rdi,[64+rdi] + + cmp rdi,rdx + jb NEAR $L$8x_reduction_loop + ret + + + +ALIGN 32 +__bn_post4x_internal: + + mov r12,QWORD[rbp] + lea rbx,[r9*1+rdi] + mov rcx,r9 + movq rdi,xmm1 + neg rax + movq rsi,xmm1 + sar rcx,3+2 + dec r12 + xor r10,r10 + mov r13,QWORD[8+rbp] + mov r14,QWORD[16+rbp] + mov r15,QWORD[24+rbp] + jmp NEAR $L$sqr4x_sub_entry + +ALIGN 16 +$L$sqr4x_sub: + mov r12,QWORD[rbp] + mov r13,QWORD[8+rbp] + mov r14,QWORD[16+rbp] + mov r15,QWORD[24+rbp] +$L$sqr4x_sub_entry: + lea rbp,[32+rbp] + not r12 + not r13 + not r14 + not r15 + and r12,rax + and r13,rax + and r14,rax + and r15,rax + + neg r10 + adc r12,QWORD[rbx] + adc r13,QWORD[8+rbx] + adc r14,QWORD[16+rbx] + adc r15,QWORD[24+rbx] + mov QWORD[rdi],r12 + lea rbx,[32+rbx] + mov QWORD[8+rdi],r13 + sbb r10,r10 + mov QWORD[16+rdi],r14 + mov QWORD[24+rdi],r15 + lea rdi,[32+rdi] + + inc rcx + jnz NEAR $L$sqr4x_sub + + mov r10,r9 + neg r9 + ret + + +global bn_mulx4x_mont_gather5 + +ALIGN 32 +bn_mulx4x_mont_gather5: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_bn_mulx4x_mont_gather5: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +_CET_ENDBR + mov rax,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + +$L$mulx4x_prologue: + + + + + shl r9d,3 + lea r10,[r9*2+r9] + neg r9 + mov r8,QWORD[r8] + + + + + + + + + + + lea r11,[((-320))+r9*2+rsp] + mov rbp,rsp + sub r11,rdi + and r11,4095 + cmp r10,r11 + jb NEAR $L$mulx4xsp_alt + sub rbp,r11 + lea rbp,[((-320))+r9*2+rbp] + jmp NEAR $L$mulx4xsp_done + +$L$mulx4xsp_alt: + lea r10,[((4096-320))+r9*2] + lea rbp,[((-320))+r9*2+rbp] + sub r11,r10 + mov r10,0 + cmovc r11,r10 + sub rbp,r11 +$L$mulx4xsp_done: + and rbp,-64 + mov r11,rsp + sub r11,rbp + and r11,-4096 + lea rsp,[rbp*1+r11] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja 
NEAR $L$mulx4x_page_walk + jmp NEAR $L$mulx4x_page_walk_done + +$L$mulx4x_page_walk: + lea rsp,[((-4096))+rsp] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$mulx4x_page_walk +$L$mulx4x_page_walk_done: + + + + + + + + + + + + + + mov QWORD[32+rsp],r8 + mov QWORD[40+rsp],rax + +$L$mulx4x_body: + call mulx4x_internal + + mov rsi,QWORD[40+rsp] + + mov rax,1 + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$mulx4x_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_bn_mulx4x_mont_gather5: + + +ALIGN 32 +mulx4x_internal: + + mov QWORD[8+rsp],r9 + mov r10,r9 + neg r9 + shl r9,5 + neg r10 + lea r13,[128+r9*1+rdx] + shr r9,5+5 + movd xmm5,DWORD[56+rax] + sub r9,1 + lea rax,[$L$inc] + mov QWORD[((16+8))+rsp],r13 + mov QWORD[((24+8))+rsp],r9 + mov QWORD[((56+8))+rsp],rdi + movdqa xmm0,XMMWORD[rax] + movdqa xmm1,XMMWORD[16+rax] + lea r10,[((88-112))+r10*1+rsp] + lea rdi,[128+rdx] + + pshufd xmm5,xmm5,0 + movdqa xmm4,xmm1 + DB 0x67 + movdqa xmm2,xmm1 + DB 0x67 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[112+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[128+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[144+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[160+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[176+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[192+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[208+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[224+r10],xmm3 + movdqa xmm3,xmm4 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa 
XMMWORD[240+r10],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[256+r10],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[272+r10],xmm2 + movdqa xmm2,xmm4 + + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[288+r10],xmm3 + movdqa xmm3,xmm4 + DB 0x67 + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[304+r10],xmm0 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[320+r10],xmm1 + + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[336+r10],xmm2 + + pand xmm0,XMMWORD[64+rdi] + pand xmm1,XMMWORD[80+rdi] + pand xmm2,XMMWORD[96+rdi] + movdqa XMMWORD[352+r10],xmm3 + pand xmm3,XMMWORD[112+rdi] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD[((-128))+rdi] + movdqa xmm5,XMMWORD[((-112))+rdi] + movdqa xmm2,XMMWORD[((-96))+rdi] + pand xmm4,XMMWORD[112+r10] + movdqa xmm3,XMMWORD[((-80))+rdi] + pand xmm5,XMMWORD[128+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD[144+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD[160+r10] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD[((-64))+rdi] + movdqa xmm5,XMMWORD[((-48))+rdi] + movdqa xmm2,XMMWORD[((-32))+rdi] + pand xmm4,XMMWORD[176+r10] + movdqa xmm3,XMMWORD[((-16))+rdi] + pand xmm5,XMMWORD[192+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD[208+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD[224+r10] + por xmm0,xmm2 + por xmm1,xmm3 + movdqa xmm4,XMMWORD[rdi] + movdqa xmm5,XMMWORD[16+rdi] + movdqa xmm2,XMMWORD[32+rdi] + pand xmm4,XMMWORD[240+r10] + movdqa xmm3,XMMWORD[48+rdi] + pand xmm5,XMMWORD[256+r10] + por xmm0,xmm4 + pand xmm2,XMMWORD[272+r10] + por xmm1,xmm5 + pand xmm3,XMMWORD[288+r10] + por xmm0,xmm2 + por xmm1,xmm3 + pxor xmm0,xmm1 + + pshufd xmm1,xmm0,0x4e + por xmm0,xmm1 + lea rdi,[256+rdi] + movq rdx,xmm0 + lea rbx,[((64+32+8))+rsp] + + mov r9,rdx + mulx rax,r8,QWORD[rsi] + mulx r12,r11,QWORD[8+rsi] + add r11,rax + mulx r13,rax,QWORD[16+rsi] + adc r12,rax + adc r13,0 + mulx r14,rax,QWORD[24+rsi] + + mov r15,r8 + imul r8,QWORD[((32+8))+rsp] + xor rbp,rbp + mov rdx,r8 + + mov 
QWORD[((8+8))+rsp],rdi + + lea rsi,[32+rsi] + adcx r13,rax + adcx r14,rbp + + mulx r10,rax,QWORD[rcx] + adcx r15,rax + adox r10,r11 + mulx r11,rax,QWORD[8+rcx] + adcx r10,rax + adox r11,r12 + mulx r12,rax,QWORD[16+rcx] + mov rdi,QWORD[((24+8))+rsp] + mov QWORD[((-32))+rbx],r10 + adcx r11,rax + adox r12,r13 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + mov QWORD[((-24))+rbx],r11 + adcx r12,rax + adox r15,rbp + lea rcx,[32+rcx] + mov QWORD[((-16))+rbx],r12 + jmp NEAR $L$mulx4x_1st + +ALIGN 32 +$L$mulx4x_1st: + adcx r15,rbp + mulx rax,r10,QWORD[rsi] + adcx r10,r14 + mulx r14,r11,QWORD[8+rsi] + adcx r11,rax + mulx rax,r12,QWORD[16+rsi] + adcx r12,r14 + mulx r14,r13,QWORD[24+rsi] + DB 0x67,0x67 + mov rdx,r8 + adcx r13,rax + adcx r14,rbp + lea rsi,[32+rsi] + lea rbx,[32+rbx] + + adox r10,r15 + mulx r15,rax,QWORD[rcx] + adcx r10,rax + adox r11,r15 + mulx r15,rax,QWORD[8+rcx] + adcx r11,rax + adox r12,r15 + mulx r15,rax,QWORD[16+rcx] + mov QWORD[((-40))+rbx],r10 + adcx r12,rax + mov QWORD[((-32))+rbx],r11 + adox r13,r15 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + mov QWORD[((-24))+rbx],r12 + adcx r13,rax + adox r15,rbp + lea rcx,[32+rcx] + mov QWORD[((-16))+rbx],r13 + + dec rdi + jnz NEAR $L$mulx4x_1st + + mov rax,QWORD[8+rsp] + adc r15,rbp + lea rsi,[rax*1+rsi] + add r14,r15 + mov rdi,QWORD[((8+8))+rsp] + adc rbp,rbp + mov QWORD[((-8))+rbx],r14 + jmp NEAR $L$mulx4x_outer + +ALIGN 32 +$L$mulx4x_outer: + lea r10,[((16-256))+rbx] + pxor xmm4,xmm4 + DB 0x67,0x67 + pxor xmm5,xmm5 + movdqa xmm0,XMMWORD[((-128))+rdi] + movdqa xmm1,XMMWORD[((-112))+rdi] + movdqa xmm2,XMMWORD[((-96))+rdi] + pand xmm0,XMMWORD[256+r10] + movdqa xmm3,XMMWORD[((-80))+rdi] + pand xmm1,XMMWORD[272+r10] + por xmm4,xmm0 + pand xmm2,XMMWORD[288+r10] + por xmm5,xmm1 + pand xmm3,XMMWORD[304+r10] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[((-64))+rdi] + movdqa xmm1,XMMWORD[((-48))+rdi] + movdqa xmm2,XMMWORD[((-32))+rdi] + pand xmm0,XMMWORD[320+r10] + movdqa xmm3,XMMWORD[((-16))+rdi] + pand 
xmm1,XMMWORD[336+r10] + por xmm4,xmm0 + pand xmm2,XMMWORD[352+r10] + por xmm5,xmm1 + pand xmm3,XMMWORD[368+r10] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[rdi] + movdqa xmm1,XMMWORD[16+rdi] + movdqa xmm2,XMMWORD[32+rdi] + pand xmm0,XMMWORD[384+r10] + movdqa xmm3,XMMWORD[48+rdi] + pand xmm1,XMMWORD[400+r10] + por xmm4,xmm0 + pand xmm2,XMMWORD[416+r10] + por xmm5,xmm1 + pand xmm3,XMMWORD[432+r10] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[64+rdi] + movdqa xmm1,XMMWORD[80+rdi] + movdqa xmm2,XMMWORD[96+rdi] + pand xmm0,XMMWORD[448+r10] + movdqa xmm3,XMMWORD[112+rdi] + pand xmm1,XMMWORD[464+r10] + por xmm4,xmm0 + pand xmm2,XMMWORD[480+r10] + por xmm5,xmm1 + pand xmm3,XMMWORD[496+r10] + por xmm4,xmm2 + por xmm5,xmm3 + por xmm4,xmm5 + + pshufd xmm0,xmm4,0x4e + por xmm0,xmm4 + lea rdi,[256+rdi] + movq rdx,xmm0 + + mov QWORD[rbx],rbp + lea rbx,[32+rax*1+rbx] + mulx r11,r8,QWORD[rsi] + xor rbp,rbp + mov r9,rdx + mulx r12,r14,QWORD[8+rsi] + adox r8,QWORD[((-32))+rbx] + adcx r11,r14 + mulx r13,r15,QWORD[16+rsi] + adox r11,QWORD[((-24))+rbx] + adcx r12,r15 + mulx r14,rdx,QWORD[24+rsi] + adox r12,QWORD[((-16))+rbx] + adcx r13,rdx + lea rcx,[rax*1+rcx] + lea rsi,[32+rsi] + adox r13,QWORD[((-8))+rbx] + adcx r14,rbp + adox r14,rbp + + mov r15,r8 + imul r8,QWORD[((32+8))+rsp] + + mov rdx,r8 + xor rbp,rbp + mov QWORD[((8+8))+rsp],rdi + + mulx r10,rax,QWORD[rcx] + adcx r15,rax + adox r10,r11 + mulx r11,rax,QWORD[8+rcx] + adcx r10,rax + adox r11,r12 + mulx r12,rax,QWORD[16+rcx] + adcx r11,rax + adox r12,r13 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + mov rdi,QWORD[((24+8))+rsp] + mov QWORD[((-32))+rbx],r10 + adcx r12,rax + mov QWORD[((-24))+rbx],r11 + adox r15,rbp + mov QWORD[((-16))+rbx],r12 + lea rcx,[32+rcx] + jmp NEAR $L$mulx4x_inner + +ALIGN 32 +$L$mulx4x_inner: + mulx rax,r10,QWORD[rsi] + adcx r15,rbp + adox r10,r14 + mulx r14,r11,QWORD[8+rsi] + adcx r10,QWORD[rbx] + adox r11,rax + mulx rax,r12,QWORD[16+rsi] + adcx r11,QWORD[8+rbx] + adox r12,r14 + mulx 
r14,r13,QWORD[24+rsi] + mov rdx,r8 + adcx r12,QWORD[16+rbx] + adox r13,rax + adcx r13,QWORD[24+rbx] + adox r14,rbp + lea rsi,[32+rsi] + lea rbx,[32+rbx] + adcx r14,rbp + + adox r10,r15 + mulx r15,rax,QWORD[rcx] + adcx r10,rax + adox r11,r15 + mulx r15,rax,QWORD[8+rcx] + adcx r11,rax + adox r12,r15 + mulx r15,rax,QWORD[16+rcx] + mov QWORD[((-40))+rbx],r10 + adcx r12,rax + adox r13,r15 + mov QWORD[((-32))+rbx],r11 + mulx r15,rax,QWORD[24+rcx] + mov rdx,r9 + lea rcx,[32+rcx] + mov QWORD[((-24))+rbx],r12 + adcx r13,rax + adox r15,rbp + mov QWORD[((-16))+rbx],r13 + + dec rdi + jnz NEAR $L$mulx4x_inner + + mov rax,QWORD[((0+8))+rsp] + adc r15,rbp + sub rdi,QWORD[rbx] + mov rdi,QWORD[((8+8))+rsp] + mov r10,QWORD[((16+8))+rsp] + adc r14,r15 + lea rsi,[rax*1+rsi] + adc rbp,rbp + mov QWORD[((-8))+rbx],r14 + + cmp rdi,r10 + jb NEAR $L$mulx4x_outer + + mov r10,QWORD[((-8))+rcx] + mov r8,rbp + mov r12,QWORD[rax*1+rcx] + lea rbp,[rax*1+rcx] + mov rcx,rax + lea rdi,[rax*1+rbx] + xor eax,eax + xor r15,r15 + sub r10,r14 + adc r15,r15 + or r8,r15 + sar rcx,3+2 + sub rax,r8 + mov rdx,QWORD[((56+8))+rsp] + dec r12 + mov r13,QWORD[8+rbp] + xor r8,r8 + mov r14,QWORD[16+rbp] + mov r15,QWORD[24+rbp] + jmp NEAR $L$sqrx4x_sub_entry + + +global bn_powerx5 + +ALIGN 32 +bn_powerx5: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_bn_powerx5: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +_CET_ENDBR + mov rax,rsp + + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + +$L$powerx5_prologue: + + + + + shl r9d,3 + lea r10,[r9*2+r9] + neg r9 + mov r8,QWORD[r8] + + + + + + + + + lea r11,[((-320))+r9*2+rsp] + mov rbp,rsp + sub r11,rdi + and r11,4095 + cmp r10,r11 + jb NEAR $L$pwrx_sp_alt + sub rbp,r11 + lea rbp,[((-320))+r9*2+rbp] + jmp NEAR $L$pwrx_sp_done + +ALIGN 32 +$L$pwrx_sp_alt: + lea r10,[((4096-320))+r9*2] + lea rbp,[((-320))+r9*2+rbp] + sub r11,r10 + mov r10,0 + 
cmovc r11,r10 + sub rbp,r11 +$L$pwrx_sp_done: + and rbp,-64 + mov r11,rsp + sub r11,rbp + and r11,-4096 + lea rsp,[rbp*1+r11] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$pwrx_page_walk + jmp NEAR $L$pwrx_page_walk_done + +$L$pwrx_page_walk: + lea rsp,[((-4096))+rsp] + mov r10,QWORD[rsp] + cmp rsp,rbp + ja NEAR $L$pwrx_page_walk +$L$pwrx_page_walk_done: + + mov r10,r9 + neg r9 + + + + + + + + + + + + + pxor xmm0,xmm0 + movq xmm1,rdi + movq xmm2,rcx + movq xmm3,r10 + movq xmm4,rdx + mov QWORD[32+rsp],r8 + mov QWORD[40+rsp],rax + +$L$powerx5_body: + + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + call __bn_sqrx8x_internal + call __bn_postx4x_internal + + mov r9,r10 + mov rdi,rsi + movq rcx,xmm2 + movq rdx,xmm4 + mov rax,QWORD[40+rsp] + + call mulx4x_internal + + mov rsi,QWORD[40+rsp] + + mov rax,1 + + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$powerx5_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_bn_powerx5: + +global bn_sqrx8x_internal + + +ALIGN 32 +bn_sqrx8x_internal: +__bn_sqrx8x_internal: + +_CET_ENDBR + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + lea rdi,[((48+8))+rsp] + lea rbp,[r9*1+rsi] + mov QWORD[((0+8))+rsp],r9 + mov QWORD[((8+8))+rsp],rbp + jmp NEAR $L$sqr8x_zero_start + +ALIGN 32 + DB 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 +$L$sqrx8x_zero: + DB 0x3e + movdqa XMMWORD[rdi],xmm0 + movdqa XMMWORD[16+rdi],xmm0 + movdqa XMMWORD[32+rdi],xmm0 + movdqa XMMWORD[48+rdi],xmm0 +$L$sqr8x_zero_start: + movdqa XMMWORD[64+rdi],xmm0 + movdqa XMMWORD[80+rdi],xmm0 + movdqa XMMWORD[96+rdi],xmm0 + movdqa XMMWORD[112+rdi],xmm0 + lea 
rdi,[128+rdi] + sub r9,64 + jnz NEAR $L$sqrx8x_zero + + mov rdx,QWORD[rsi] + + xor r10,r10 + xor r11,r11 + xor r12,r12 + xor r13,r13 + xor r14,r14 + xor r15,r15 + lea rdi,[((48+8))+rsp] + xor rbp,rbp + jmp NEAR $L$sqrx8x_outer_loop + +ALIGN 32 +$L$sqrx8x_outer_loop: + mulx rax,r8,QWORD[8+rsi] + adcx r8,r9 + adox r10,rax + mulx rax,r9,QWORD[16+rsi] + adcx r9,r10 + adox r11,rax + DB 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 + adcx r10,r11 + adox r12,rax + DB 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 + adcx r11,r12 + adox r13,rax + mulx rax,r12,QWORD[40+rsi] + adcx r12,r13 + adox r14,rax + mulx rax,r13,QWORD[48+rsi] + adcx r13,r14 + adox rax,r15 + mulx r15,r14,QWORD[56+rsi] + mov rdx,QWORD[8+rsi] + adcx r14,rax + adox r15,rbp + adc r15,QWORD[64+rdi] + mov QWORD[8+rdi],r8 + mov QWORD[16+rdi],r9 + sbb rcx,rcx + xor rbp,rbp + + + mulx rbx,r8,QWORD[16+rsi] + mulx rax,r9,QWORD[24+rsi] + adcx r8,r10 + adox r9,rbx + mulx rbx,r10,QWORD[32+rsi] + adcx r9,r11 + adox r10,rax + DB 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 + adcx r10,r12 + adox r11,rbx + DB 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 + adcx r11,r13 + adox r12,r14 + DB 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 + mov rdx,QWORD[16+rsi] + adcx r12,rax + adox r13,rbx + adcx r13,r15 + adox r14,rbp + adcx r14,rbp + + mov QWORD[24+rdi],r8 + mov QWORD[32+rdi],r9 + + mulx rbx,r8,QWORD[24+rsi] + mulx rax,r9,QWORD[32+rsi] + adcx r8,r10 + adox r9,rbx + mulx rbx,r10,QWORD[40+rsi] + adcx r9,r11 + adox r10,rax + DB 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 + adcx r10,r12 + adox r11,r13 + DB 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 + DB 0x3e + mov rdx,QWORD[24+rsi] + adcx r11,rbx + adox r12,rax + adcx r12,r14 + mov QWORD[40+rdi],r8 + mov QWORD[48+rdi],r9 + mulx rax,r8,QWORD[32+rsi] + adox r13,rbp + adcx r13,rbp + + mulx rbx,r9,QWORD[40+rsi] + adcx r8,r10 + adox r9,rax + mulx rax,r10,QWORD[48+rsi] + adcx r9,r11 + adox r10,r12 + mulx r12,r11,QWORD[56+rsi] + mov rdx,QWORD[32+rsi] + mov r14,QWORD[40+rsi] + adcx 
r10,rbx + adox r11,rax + mov r15,QWORD[48+rsi] + adcx r11,r13 + adox r12,rbp + adcx r12,rbp + + mov QWORD[56+rdi],r8 + mov QWORD[64+rdi],r9 + + mulx rax,r9,r14 + mov r8,QWORD[56+rsi] + adcx r9,r10 + mulx rbx,r10,r15 + adox r10,rax + adcx r10,r11 + mulx rax,r11,r8 + mov rdx,r14 + adox r11,rbx + adcx r11,r12 + + adcx rax,rbp + + mulx rbx,r14,r15 + mulx r13,r12,r8 + mov rdx,r15 + lea rsi,[64+rsi] + adcx r11,r14 + adox r12,rbx + adcx r12,rax + adox r13,rbp + + DB 0x67,0x67 + mulx r14,r8,r8 + adcx r13,r8 + adcx r14,rbp + + cmp rsi,QWORD[((8+8))+rsp] + je NEAR $L$sqrx8x_outer_break + + neg rcx + mov rcx,-8 + mov r15,rbp + mov r8,QWORD[64+rdi] + adcx r9,QWORD[72+rdi] + adcx r10,QWORD[80+rdi] + adcx r11,QWORD[88+rdi] + adc r12,QWORD[96+rdi] + adc r13,QWORD[104+rdi] + adc r14,QWORD[112+rdi] + adc r15,QWORD[120+rdi] + lea rbp,[rsi] + lea rdi,[128+rdi] + sbb rax,rax + + mov rdx,QWORD[((-64))+rsi] + mov QWORD[((16+8))+rsp],rax + mov QWORD[((24+8))+rsp],rdi + + + xor eax,eax + jmp NEAR $L$sqrx8x_loop + +ALIGN 32 +$L$sqrx8x_loop: + mov rbx,r8 + mulx r8,rax,QWORD[rbp] + adcx rbx,rax + adox r8,r9 + + mulx r9,rax,QWORD[8+rbp] + adcx r8,rax + adox r9,r10 + + mulx r10,rax,QWORD[16+rbp] + adcx r9,rax + adox r10,r11 + + mulx r11,rax,QWORD[24+rbp] + adcx r10,rax + adox r11,r12 + + DB 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 + adcx r11,rax + adox r12,r13 + + mulx r13,rax,QWORD[40+rbp] + adcx r12,rax + adox r13,r14 + + mulx r14,rax,QWORD[48+rbp] + mov QWORD[rcx*8+rdi],rbx + mov ebx,0 + adcx r13,rax + adox r14,r15 + + DB 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 + mov rdx,QWORD[8+rcx*8+rsi] + adcx r14,rax + adox r15,rbx + adcx r15,rbx + + DB 0x67 + inc rcx + jnz NEAR $L$sqrx8x_loop + + lea rbp,[64+rbp] + mov rcx,-8 + cmp rbp,QWORD[((8+8))+rsp] + je NEAR $L$sqrx8x_break + + sub rbx,QWORD[((16+8))+rsp] + DB 0x66 + mov rdx,QWORD[((-64))+rsi] + adcx r8,QWORD[rdi] + adcx r9,QWORD[8+rdi] + adc r10,QWORD[16+rdi] + adc r11,QWORD[24+rdi] + adc r12,QWORD[32+rdi] + adc r13,QWORD[40+rdi] + adc 
r14,QWORD[48+rdi] + adc r15,QWORD[56+rdi] + lea rdi,[64+rdi] + DB 0x67 + sbb rax,rax + xor ebx,ebx + mov QWORD[((16+8))+rsp],rax + jmp NEAR $L$sqrx8x_loop + +ALIGN 32 +$L$sqrx8x_break: + xor rbp,rbp + sub rbx,QWORD[((16+8))+rsp] + adcx r8,rbp + mov rcx,QWORD[((24+8))+rsp] + adcx r9,rbp + mov rdx,QWORD[rsi] + adc r10,0 + mov QWORD[rdi],r8 + adc r11,0 + adc r12,0 + adc r13,0 + adc r14,0 + adc r15,0 + cmp rdi,rcx + je NEAR $L$sqrx8x_outer_loop + + mov QWORD[8+rdi],r9 + mov r9,QWORD[8+rcx] + mov QWORD[16+rdi],r10 + mov r10,QWORD[16+rcx] + mov QWORD[24+rdi],r11 + mov r11,QWORD[24+rcx] + mov QWORD[32+rdi],r12 + mov r12,QWORD[32+rcx] + mov QWORD[40+rdi],r13 + mov r13,QWORD[40+rcx] + mov QWORD[48+rdi],r14 + mov r14,QWORD[48+rcx] + mov QWORD[56+rdi],r15 + mov r15,QWORD[56+rcx] + mov rdi,rcx + jmp NEAR $L$sqrx8x_outer_loop + +ALIGN 32 +$L$sqrx8x_outer_break: + mov QWORD[72+rdi],r9 + movq rcx,xmm3 + mov QWORD[80+rdi],r10 + mov QWORD[88+rdi],r11 + mov QWORD[96+rdi],r12 + mov QWORD[104+rdi],r13 + mov QWORD[112+rdi],r14 + lea rdi,[((48+8))+rsp] + mov rdx,QWORD[rcx*1+rsi] + + mov r11,QWORD[8+rdi] + xor r10,r10 + mov r9,QWORD[((0+8))+rsp] + adox r11,r11 + mov r12,QWORD[16+rdi] + mov r13,QWORD[24+rdi] + + +ALIGN 32 +$L$sqrx4x_shift_n_add: + mulx rbx,rax,rdx + adox r12,r12 + adcx rax,r10 + DB 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 + DB 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 + adox r13,r13 + adcx rbx,r11 + mov r11,QWORD[40+rdi] + mov QWORD[rdi],rax + mov QWORD[8+rdi],rbx + + mulx rbx,rax,rdx + adox r10,r10 + adcx rax,r12 + mov rdx,QWORD[16+rcx*1+rsi] + mov r12,QWORD[48+rdi] + adox r11,r11 + adcx rbx,r13 + mov r13,QWORD[56+rdi] + mov QWORD[16+rdi],rax + mov QWORD[24+rdi],rbx + + mulx rbx,rax,rdx + adox r12,r12 + adcx rax,r10 + mov rdx,QWORD[24+rcx*1+rsi] + lea rcx,[32+rcx] + mov r10,QWORD[64+rdi] + adox r13,r13 + adcx rbx,r11 + mov r11,QWORD[72+rdi] + mov QWORD[32+rdi],rax + mov QWORD[40+rdi],rbx + + mulx rbx,rax,rdx + adox r10,r10 + adcx rax,r12 + jrcxz $L$sqrx4x_shift_n_add_break + DB 
0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 + adox r11,r11 + adcx rbx,r13 + mov r12,QWORD[80+rdi] + mov r13,QWORD[88+rdi] + mov QWORD[48+rdi],rax + mov QWORD[56+rdi],rbx + lea rdi,[64+rdi] + nop + jmp NEAR $L$sqrx4x_shift_n_add + +ALIGN 32 +$L$sqrx4x_shift_n_add_break: + adcx rbx,r13 + mov QWORD[48+rdi],rax + mov QWORD[56+rdi],rbx + lea rdi,[64+rdi] + movq rbp,xmm2 +__bn_sqrx8x_reduction: + xor eax,eax + mov rbx,QWORD[((32+8))+rsp] + mov rdx,QWORD[((48+8))+rsp] + lea rcx,[((-64))+r9*1+rbp] + + mov QWORD[((0+8))+rsp],rcx + mov QWORD[((8+8))+rsp],rdi + + lea rdi,[((48+8))+rsp] + jmp NEAR $L$sqrx8x_reduction_loop + +ALIGN 32 +$L$sqrx8x_reduction_loop: + mov r9,QWORD[8+rdi] + mov r10,QWORD[16+rdi] + mov r11,QWORD[24+rdi] + mov r12,QWORD[32+rdi] + mov r8,rdx + imul rdx,rbx + mov r13,QWORD[40+rdi] + mov r14,QWORD[48+rdi] + mov r15,QWORD[56+rdi] + mov QWORD[((24+8))+rsp],rax + + lea rdi,[64+rdi] + xor rsi,rsi + mov rcx,-8 + jmp NEAR $L$sqrx8x_reduce + +ALIGN 32 +$L$sqrx8x_reduce: + mov rbx,r8 + mulx r8,rax,QWORD[rbp] + adcx rax,rbx + adox r8,r9 + + mulx r9,rbx,QWORD[8+rbp] + adcx r8,rbx + adox r9,r10 + + mulx r10,rbx,QWORD[16+rbp] + adcx r9,rbx + adox r10,r11 + + mulx r11,rbx,QWORD[24+rbp] + adcx r10,rbx + adox r11,r12 + + DB 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 + mov rax,rdx + mov rdx,r8 + adcx r11,rbx + adox r12,r13 + + mulx rdx,rbx,QWORD[((32+8))+rsp] + mov rdx,rax + mov QWORD[((64+48+8))+rcx*8+rsp],rax + + mulx r13,rax,QWORD[40+rbp] + adcx r12,rax + adox r13,r14 + + mulx r14,rax,QWORD[48+rbp] + adcx r13,rax + adox r14,r15 + + mulx r15,rax,QWORD[56+rbp] + mov rdx,rbx + adcx r14,rax + adox r15,rsi + adcx r15,rsi + + DB 0x67,0x67,0x67 + inc rcx + jnz NEAR $L$sqrx8x_reduce + + mov rax,rsi + cmp rbp,QWORD[((0+8))+rsp] + jae NEAR $L$sqrx8x_no_tail + + mov rdx,QWORD[((48+8))+rsp] + add r8,QWORD[rdi] + lea rbp,[64+rbp] + mov rcx,-8 + adcx r9,QWORD[8+rdi] + adcx r10,QWORD[16+rdi] + adc r11,QWORD[24+rdi] + adc r12,QWORD[32+rdi] + adc r13,QWORD[40+rdi] + adc 
r14,QWORD[48+rdi] + adc r15,QWORD[56+rdi] + lea rdi,[64+rdi] + sbb rax,rax + + xor rsi,rsi + mov QWORD[((16+8))+rsp],rax + jmp NEAR $L$sqrx8x_tail + +ALIGN 32 +$L$sqrx8x_tail: + mov rbx,r8 + mulx r8,rax,QWORD[rbp] + adcx rbx,rax + adox r8,r9 + + mulx r9,rax,QWORD[8+rbp] + adcx r8,rax + adox r9,r10 + + mulx r10,rax,QWORD[16+rbp] + adcx r9,rax + adox r10,r11 + + mulx r11,rax,QWORD[24+rbp] + adcx r10,rax + adox r11,r12 + + DB 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 + adcx r11,rax + adox r12,r13 + + mulx r13,rax,QWORD[40+rbp] + adcx r12,rax + adox r13,r14 + + mulx r14,rax,QWORD[48+rbp] + adcx r13,rax + adox r14,r15 + + mulx r15,rax,QWORD[56+rbp] + mov rdx,QWORD[((72+48+8))+rcx*8+rsp] + adcx r14,rax + adox r15,rsi + mov QWORD[rcx*8+rdi],rbx + mov rbx,r8 + adcx r15,rsi + + inc rcx + jnz NEAR $L$sqrx8x_tail + + cmp rbp,QWORD[((0+8))+rsp] + jae NEAR $L$sqrx8x_tail_done + + sub rsi,QWORD[((16+8))+rsp] + mov rdx,QWORD[((48+8))+rsp] + lea rbp,[64+rbp] + adc r8,QWORD[rdi] + adc r9,QWORD[8+rdi] + adc r10,QWORD[16+rdi] + adc r11,QWORD[24+rdi] + adc r12,QWORD[32+rdi] + adc r13,QWORD[40+rdi] + adc r14,QWORD[48+rdi] + adc r15,QWORD[56+rdi] + lea rdi,[64+rdi] + sbb rax,rax + sub rcx,8 + + xor rsi,rsi + mov QWORD[((16+8))+rsp],rax + jmp NEAR $L$sqrx8x_tail + +ALIGN 32 +$L$sqrx8x_tail_done: + xor rax,rax + add r8,QWORD[((24+8))+rsp] + adc r9,0 + adc r10,0 + adc r11,0 + adc r12,0 + adc r13,0 + adc r14,0 + adc r15,0 + adc rax,0 + + sub rsi,QWORD[((16+8))+rsp] +$L$sqrx8x_no_tail: + adc r8,QWORD[rdi] + movq rcx,xmm3 + adc r9,QWORD[8+rdi] + mov rsi,QWORD[56+rbp] + movq rbp,xmm2 + adc r10,QWORD[16+rdi] + adc r11,QWORD[24+rdi] + adc r12,QWORD[32+rdi] + adc r13,QWORD[40+rdi] + adc r14,QWORD[48+rdi] + adc r15,QWORD[56+rdi] + adc rax,0 + + mov rbx,QWORD[((32+8))+rsp] + mov rdx,QWORD[64+rcx*1+rdi] + + mov QWORD[rdi],r8 + lea r8,[64+rdi] + mov QWORD[8+rdi],r9 + mov QWORD[16+rdi],r10 + mov QWORD[24+rdi],r11 + mov QWORD[32+rdi],r12 + mov QWORD[40+rdi],r13 + mov QWORD[48+rdi],r14 + mov 
QWORD[56+rdi],r15 + + lea rdi,[64+rcx*1+rdi] + cmp r8,QWORD[((8+8))+rsp] + jb NEAR $L$sqrx8x_reduction_loop + ret + + +ALIGN 32 + +__bn_postx4x_internal: + + mov r12,QWORD[rbp] + mov r10,rcx + mov r9,rcx + neg rax + sar rcx,3+2 + + movq rdx,xmm1 + movq rsi,xmm1 + dec r12 + mov r13,QWORD[8+rbp] + xor r8,r8 + mov r14,QWORD[16+rbp] + mov r15,QWORD[24+rbp] + jmp NEAR $L$sqrx4x_sub_entry + +ALIGN 16 +$L$sqrx4x_sub: + mov r12,QWORD[rbp] + mov r13,QWORD[8+rbp] + mov r14,QWORD[16+rbp] + mov r15,QWORD[24+rbp] +$L$sqrx4x_sub_entry: + andn r12,r12,rax + lea rbp,[32+rbp] + andn r13,r13,rax + andn r14,r14,rax + andn r15,r15,rax + + neg r8 + adc r12,QWORD[rdi] + adc r13,QWORD[8+rdi] + adc r14,QWORD[16+rdi] + adc r15,QWORD[24+rdi] + mov QWORD[rdx],r12 + lea rdi,[32+rdi] + mov QWORD[8+rdx],r13 + sbb r8,r8 + mov QWORD[16+rdx],r14 + mov QWORD[24+rdx],r15 + lea rdx,[32+rdx] + + inc rcx + jnz NEAR $L$sqrx4x_sub + + neg r9 + + ret + + +global bn_scatter5 + +ALIGN 16 +bn_scatter5: + +_CET_ENDBR + cmp edx,0 + jz NEAR $L$scatter_epilogue + + + + + + + + + + lea r8,[r9*8+r8] +$L$scatter: + mov rax,QWORD[rcx] + lea rcx,[8+rcx] + mov QWORD[r8],rax + lea r8,[256+r8] + sub edx,1 + jnz NEAR $L$scatter +$L$scatter_epilogue: + ret + + + +global bn_gather5 + +ALIGN 32 +bn_gather5: + +$L$SEH_begin_bn_gather5: +_CET_ENDBR + + DB 0x4c,0x8d,0x14,0x24 + + DB 0x48,0x81,0xec,0x08,0x01,0x00,0x00 + lea rax,[$L$inc] + and rsp,-16 + + movd xmm5,r9d + movdqa xmm0,XMMWORD[rax] + movdqa xmm1,XMMWORD[16+rax] + lea r11,[128+r8] + lea rax,[128+rsp] + + pshufd xmm5,xmm5,0 + movdqa xmm4,xmm1 + movdqa xmm2,xmm1 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa xmm3,xmm4 + + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[(-128)+rax],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[(-112)+rax],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[(-96)+rax],xmm2 + movdqa xmm2,xmm4 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[(-80)+rax],xmm3 + 
movdqa xmm3,xmm4 + + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[(-64)+rax],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[(-48)+rax],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[(-32)+rax],xmm2 + movdqa xmm2,xmm4 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[(-16)+rax],xmm3 + movdqa xmm3,xmm4 + + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[rax],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[16+rax],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[32+rax],xmm2 + movdqa xmm2,xmm4 + paddd xmm1,xmm0 + pcmpeqd xmm0,xmm5 + movdqa XMMWORD[48+rax],xmm3 + movdqa xmm3,xmm4 + + paddd xmm2,xmm1 + pcmpeqd xmm1,xmm5 + movdqa XMMWORD[64+rax],xmm0 + movdqa xmm0,xmm4 + + paddd xmm3,xmm2 + pcmpeqd xmm2,xmm5 + movdqa XMMWORD[80+rax],xmm1 + movdqa xmm1,xmm4 + + paddd xmm0,xmm3 + pcmpeqd xmm3,xmm5 + movdqa XMMWORD[96+rax],xmm2 + movdqa xmm2,xmm4 + movdqa XMMWORD[112+rax],xmm3 + jmp NEAR $L$gather + +ALIGN 32 +$L$gather: + pxor xmm4,xmm4 + pxor xmm5,xmm5 + movdqa xmm0,XMMWORD[((-128))+r11] + movdqa xmm1,XMMWORD[((-112))+r11] + movdqa xmm2,XMMWORD[((-96))+r11] + pand xmm0,XMMWORD[((-128))+rax] + movdqa xmm3,XMMWORD[((-80))+r11] + pand xmm1,XMMWORD[((-112))+rax] + por xmm4,xmm0 + pand xmm2,XMMWORD[((-96))+rax] + por xmm5,xmm1 + pand xmm3,XMMWORD[((-80))+rax] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[((-64))+r11] + movdqa xmm1,XMMWORD[((-48))+r11] + movdqa xmm2,XMMWORD[((-32))+r11] + pand xmm0,XMMWORD[((-64))+rax] + movdqa xmm3,XMMWORD[((-16))+r11] + pand xmm1,XMMWORD[((-48))+rax] + por xmm4,xmm0 + pand xmm2,XMMWORD[((-32))+rax] + por xmm5,xmm1 + pand xmm3,XMMWORD[((-16))+rax] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[r11] + movdqa xmm1,XMMWORD[16+r11] + movdqa xmm2,XMMWORD[32+r11] + pand xmm0,XMMWORD[rax] + movdqa xmm3,XMMWORD[48+r11] + pand xmm1,XMMWORD[16+rax] + por xmm4,xmm0 + pand 
xmm2,XMMWORD[32+rax] + por xmm5,xmm1 + pand xmm3,XMMWORD[48+rax] + por xmm4,xmm2 + por xmm5,xmm3 + movdqa xmm0,XMMWORD[64+r11] + movdqa xmm1,XMMWORD[80+r11] + movdqa xmm2,XMMWORD[96+r11] + pand xmm0,XMMWORD[64+rax] + movdqa xmm3,XMMWORD[112+r11] + pand xmm1,XMMWORD[80+rax] + por xmm4,xmm0 + pand xmm2,XMMWORD[96+rax] + por xmm5,xmm1 + pand xmm3,XMMWORD[112+rax] + por xmm4,xmm2 + por xmm5,xmm3 + por xmm4,xmm5 + lea r11,[256+r11] + + pshufd xmm0,xmm4,0x4e + por xmm0,xmm4 + movq QWORD[rcx],xmm0 + lea rcx,[8+rcx] + sub edx,1 + jnz NEAR $L$gather + + lea rsp,[r10] + + ret +$L$SEH_end_bn_gather5: + + +section .rdata rdata align=8 +ALIGN 64 +$L$inc: + DD 0,0,1,1 + DD 2,2,2,2 + DB 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 + DB 112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115 + DB 99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111 + DB 114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79 + DB 71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111 + DB 112,101,110,115,115,108,46,111,114,103,62,0 +section .text + +EXTERN __imp_RtlVirtualUnwind + +ALIGN 16 +mul_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_pop_regs + + mov rax,QWORD[152+r8] + + mov r10d,DWORD[8+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + lea r10,[$L$mul_epilogue] + cmp rbx,r10 + ja NEAR $L$body_40 + + mov r10,QWORD[192+r8] + mov rax,QWORD[8+r10*8+rax] + + jmp NEAR $L$common_pop_regs + +$L$body_40: + mov rax,QWORD[40+rax] +$L$common_pop_regs: + mov rbx,QWORD[((-8))+rax] + mov rbp,QWORD[((-16))+rax] + mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + mov r15,QWORD[((-48))+rax] + 
mov QWORD[144+r8],rbx + mov QWORD[160+r8],rbp + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + mov QWORD[232+r8],r14 + mov QWORD[240+r8],r15 + +$L$common_seh_tail: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + ret + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_bn_mul_mont_gather5_nohw wrt ..imagebase + DD $L$SEH_end_bn_mul_mont_gather5_nohw wrt ..imagebase + DD $L$SEH_info_bn_mul_mont_gather5_nohw wrt ..imagebase + + DD $L$SEH_begin_bn_mul4x_mont_gather5 wrt ..imagebase + DD $L$SEH_end_bn_mul4x_mont_gather5 wrt ..imagebase + DD $L$SEH_info_bn_mul4x_mont_gather5 wrt ..imagebase + + DD $L$SEH_begin_bn_power5_nohw wrt ..imagebase + DD $L$SEH_end_bn_power5_nohw wrt ..imagebase + DD $L$SEH_info_bn_power5_nohw wrt ..imagebase + DD $L$SEH_begin_bn_mulx4x_mont_gather5 wrt ..imagebase + DD $L$SEH_end_bn_mulx4x_mont_gather5 wrt ..imagebase + DD $L$SEH_info_bn_mulx4x_mont_gather5 wrt ..imagebase + + DD $L$SEH_begin_bn_powerx5 wrt ..imagebase + DD $L$SEH_end_bn_powerx5 wrt ..imagebase + DD $L$SEH_info_bn_powerx5 wrt ..imagebase + DD $L$SEH_begin_bn_gather5 wrt ..imagebase + DD $L$SEH_end_bn_gather5 wrt ..imagebase + DD $L$SEH_info_bn_gather5 wrt ..imagebase + +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_bn_mul_mont_gather5_nohw: + DB 9,0,0,0 + DD mul_handler wrt ..imagebase + DD $L$mul_body wrt ..imagebase,$L$mul_body wrt ..imagebase,$L$mul_epilogue wrt ..imagebase +ALIGN 8 +$L$SEH_info_bn_mul4x_mont_gather5: + DB 
9,0,0,0 + DD mul_handler wrt ..imagebase + DD $L$mul4x_prologue wrt ..imagebase,$L$mul4x_body wrt ..imagebase,$L$mul4x_epilogue wrt ..imagebase +ALIGN 8 +$L$SEH_info_bn_power5_nohw: + DB 9,0,0,0 + DD mul_handler wrt ..imagebase + DD $L$power5_prologue wrt ..imagebase,$L$power5_body wrt ..imagebase,$L$power5_epilogue wrt ..imagebase +ALIGN 8 +$L$SEH_info_bn_mulx4x_mont_gather5: + DB 9,0,0,0 + DD mul_handler wrt ..imagebase + DD $L$mulx4x_prologue wrt ..imagebase,$L$mulx4x_body wrt ..imagebase,$L$mulx4x_epilogue wrt ..imagebase +ALIGN 8 +$L$SEH_info_bn_powerx5: + DB 9,0,0,0 + DD mul_handler wrt ..imagebase + DD $L$powerx5_prologue wrt ..imagebase,$L$powerx5_body wrt ..imagebase,$L$powerx5_epilogue wrt ..imagebase +ALIGN 8 +$L$SEH_info_bn_gather5: + DB 0x01,0x0b,0x03,0x0a + DB 0x0b,0x01,0x21,0x00 + DB 0x04,0xa3,0x00,0x00 +ALIGN 8 +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/third_party/boringssl/gen/crypto/aes128gcmsiv-x86_64-apple.S b/third_party/boringssl/gen/crypto/aes128gcmsiv-x86_64-apple.S new file mode 100644 index 00000000..81e2f071 --- /dev/null +++ b/third_party/boringssl/gen/crypto/aes128gcmsiv-x86_64-apple.S @@ -0,0 +1,3081 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.section __DATA,__const + +.p2align 4 +one: +.quad 1,0 +two: +.quad 2,0 +three: +.quad 3,0 +four: +.quad 4,0 +five: +.quad 5,0 +six: +.quad 6,0 +seven: +.quad 7,0 +eight: +.quad 8,0 + +OR_MASK: +.long 0x00000000,0x00000000,0x00000000,0x80000000 +poly: +.quad 0x1, 0xc200000000000000 +mask: +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d +con1: +.long 1,1,1,1 +con2: +.long 0x1b,0x1b,0x1b,0x1b +con3: +.byte -1,-1,-1,-1,-1,-1,-1,-1,4,5,6,7,4,5,6,7 +and_mask: +.long 0,0xffffffff, 0xffffffff, 0xffffffff +.text + +.p2align 4 +GFMUL: + + vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2 + vpclmulqdq $0x11,%xmm1,%xmm0,%xmm5 + vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3 + vpclmulqdq $0x01,%xmm1,%xmm0,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $8,%xmm3,%xmm4 + vpsrldq $8,%xmm3,%xmm3 + vpxor %xmm4,%xmm2,%xmm2 + vpxor %xmm3,%xmm5,%xmm5 + + vpclmulqdq $0x10,poly(%rip),%xmm2,%xmm3 + vpshufd $78,%xmm2,%xmm4 + vpxor %xmm4,%xmm3,%xmm2 + + vpclmulqdq $0x10,poly(%rip),%xmm2,%xmm3 + vpshufd $78,%xmm2,%xmm4 + vpxor %xmm4,%xmm3,%xmm2 + + vpxor %xmm5,%xmm2,%xmm0 + ret + + +.globl _aesgcmsiv_htable_init +.private_extern _aesgcmsiv_htable_init + +.p2align 4 +_aesgcmsiv_htable_init: + +_CET_ENDBR + vmovdqa (%rsi),%xmm0 + vmovdqa %xmm0,%xmm1 + vmovdqa %xmm0,(%rdi) + call GFMUL + vmovdqa %xmm0,16(%rdi) + call GFMUL + vmovdqa %xmm0,32(%rdi) + call GFMUL + vmovdqa %xmm0,48(%rdi) + call GFMUL + vmovdqa %xmm0,64(%rdi) + call GFMUL + vmovdqa %xmm0,80(%rdi) + call GFMUL + vmovdqa %xmm0,96(%rdi) + call GFMUL + vmovdqa %xmm0,112(%rdi) + ret + + +.globl _aesgcmsiv_htable6_init +.private_extern _aesgcmsiv_htable6_init + +.p2align 4 +_aesgcmsiv_htable6_init: + +_CET_ENDBR + vmovdqa (%rsi),%xmm0 + vmovdqa %xmm0,%xmm1 + vmovdqa %xmm0,(%rdi) + call GFMUL + vmovdqa %xmm0,16(%rdi) + call GFMUL + vmovdqa %xmm0,32(%rdi) + call GFMUL + vmovdqa %xmm0,48(%rdi) + call GFMUL + vmovdqa %xmm0,64(%rdi) + call GFMUL + vmovdqa %xmm0,80(%rdi) + ret + 
+ +.globl _aesgcmsiv_htable_polyval +.private_extern _aesgcmsiv_htable_polyval + +.p2align 4 +_aesgcmsiv_htable_polyval: + +_CET_ENDBR + testq %rdx,%rdx + jnz L$htable_polyval_start + ret + +L$htable_polyval_start: + vzeroall + + + + movq %rdx,%r11 + andq $127,%r11 + + jz L$htable_polyval_no_prefix + + vpxor %xmm9,%xmm9,%xmm9 + vmovdqa (%rcx),%xmm1 + subq %r11,%rdx + + subq $16,%r11 + + + vmovdqu (%rsi),%xmm0 + vpxor %xmm1,%xmm0,%xmm0 + + vpclmulqdq $0x01,(%rdi,%r11,1),%xmm0,%xmm5 + vpclmulqdq $0x00,(%rdi,%r11,1),%xmm0,%xmm3 + vpclmulqdq $0x11,(%rdi,%r11,1),%xmm0,%xmm4 + vpclmulqdq $0x10,(%rdi,%r11,1),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + + leaq 16(%rsi),%rsi + testq %r11,%r11 + jnz L$htable_polyval_prefix_loop + jmp L$htable_polyval_prefix_complete + + +.p2align 6 +L$htable_polyval_prefix_loop: + subq $16,%r11 + + vmovdqu (%rsi),%xmm0 + + vpclmulqdq $0x00,(%rdi,%r11,1),%xmm0,%xmm6 + vpxor %xmm6,%xmm3,%xmm3 + vpclmulqdq $0x11,(%rdi,%r11,1),%xmm0,%xmm6 + vpxor %xmm6,%xmm4,%xmm4 + vpclmulqdq $0x01,(%rdi,%r11,1),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x10,(%rdi,%r11,1),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + + testq %r11,%r11 + + leaq 16(%rsi),%rsi + + jnz L$htable_polyval_prefix_loop + +L$htable_polyval_prefix_complete: + vpsrldq $8,%xmm5,%xmm6 + vpslldq $8,%xmm5,%xmm5 + + vpxor %xmm6,%xmm4,%xmm9 + vpxor %xmm5,%xmm3,%xmm1 + + jmp L$htable_polyval_main_loop + +L$htable_polyval_no_prefix: + + + + + vpxor %xmm1,%xmm1,%xmm1 + vmovdqa (%rcx),%xmm9 + +.p2align 6 +L$htable_polyval_main_loop: + subq $0x80,%rdx + jb L$htable_polyval_out + + vmovdqu 112(%rsi),%xmm0 + + vpclmulqdq $0x01,(%rdi),%xmm0,%xmm5 + vpclmulqdq $0x00,(%rdi),%xmm0,%xmm3 + vpclmulqdq $0x11,(%rdi),%xmm0,%xmm4 + vpclmulqdq $0x10,(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + + + vmovdqu 96(%rsi),%xmm0 + vpclmulqdq $0x01,16(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x00,16(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm3,%xmm3 + vpclmulqdq $0x11,16(%rdi),%xmm0,%xmm6 + vpxor 
%xmm6,%xmm4,%xmm4 + vpclmulqdq $0x10,16(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + + + + vmovdqu 80(%rsi),%xmm0 + + vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm7 + vpalignr $8,%xmm1,%xmm1,%xmm1 + + vpclmulqdq $0x01,32(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x00,32(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm3,%xmm3 + vpclmulqdq $0x11,32(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm4,%xmm4 + vpclmulqdq $0x10,32(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + + + vpxor %xmm7,%xmm1,%xmm1 + + vmovdqu 64(%rsi),%xmm0 + + vpclmulqdq $0x01,48(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x00,48(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm3,%xmm3 + vpclmulqdq $0x11,48(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm4,%xmm4 + vpclmulqdq $0x10,48(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + + + vmovdqu 48(%rsi),%xmm0 + + vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm7 + vpalignr $8,%xmm1,%xmm1,%xmm1 + + vpclmulqdq $0x01,64(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x00,64(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm3,%xmm3 + vpclmulqdq $0x11,64(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm4,%xmm4 + vpclmulqdq $0x10,64(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + + + vpxor %xmm7,%xmm1,%xmm1 + + vmovdqu 32(%rsi),%xmm0 + + vpclmulqdq $0x01,80(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x00,80(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm3,%xmm3 + vpclmulqdq $0x11,80(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm4,%xmm4 + vpclmulqdq $0x10,80(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + + + vpxor %xmm9,%xmm1,%xmm1 + + vmovdqu 16(%rsi),%xmm0 + + vpclmulqdq $0x01,96(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x00,96(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm3,%xmm3 + vpclmulqdq $0x11,96(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm4,%xmm4 + vpclmulqdq $0x10,96(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + + + vmovdqu 0(%rsi),%xmm0 + vpxor %xmm1,%xmm0,%xmm0 + + vpclmulqdq $0x01,112(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x00,112(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm3,%xmm3 + vpclmulqdq 
$0x11,112(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm4,%xmm4 + vpclmulqdq $0x10,112(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + + + vpsrldq $8,%xmm5,%xmm6 + vpslldq $8,%xmm5,%xmm5 + + vpxor %xmm6,%xmm4,%xmm9 + vpxor %xmm5,%xmm3,%xmm1 + + leaq 128(%rsi),%rsi + jmp L$htable_polyval_main_loop + + + +L$htable_polyval_out: + vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm6 + vpalignr $8,%xmm1,%xmm1,%xmm1 + vpxor %xmm6,%xmm1,%xmm1 + + vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm6 + vpalignr $8,%xmm1,%xmm1,%xmm1 + vpxor %xmm6,%xmm1,%xmm1 + vpxor %xmm9,%xmm1,%xmm1 + + vmovdqu %xmm1,(%rcx) + vzeroupper + ret + + +.globl _aesgcmsiv_polyval_horner +.private_extern _aesgcmsiv_polyval_horner + +.p2align 4 +_aesgcmsiv_polyval_horner: + +_CET_ENDBR + testq %rcx,%rcx + jnz L$polyval_horner_start + ret + +L$polyval_horner_start: + + + + xorq %r10,%r10 + shlq $4,%rcx + + vmovdqa (%rsi),%xmm1 + vmovdqa (%rdi),%xmm0 + +L$polyval_horner_loop: + vpxor (%rdx,%r10,1),%xmm0,%xmm0 + call GFMUL + + addq $16,%r10 + cmpq %r10,%rcx + jne L$polyval_horner_loop + + + vmovdqa %xmm0,(%rdi) + ret + + +.globl _aes128gcmsiv_aes_ks +.private_extern _aes128gcmsiv_aes_ks + +.p2align 4 +_aes128gcmsiv_aes_ks: + +_CET_ENDBR + vmovdqu (%rdi),%xmm1 + vmovdqa %xmm1,(%rsi) + + vmovdqa con1(%rip),%xmm0 + vmovdqa mask(%rip),%xmm15 + + movq $8,%rax + +L$ks128_loop: + addq $16,%rsi + subq $1,%rax + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpslldq $4,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpslldq $4,%xmm3,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpslldq $4,%xmm3,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm1,(%rsi) + jne L$ks128_loop + + vmovdqa con2(%rip),%xmm0 + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpslldq $4,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpslldq $4,%xmm3,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpslldq $4,%xmm3,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm1,16(%rsi) + + vpshufb 
%xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslldq $4,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpslldq $4,%xmm3,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpslldq $4,%xmm3,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm1,32(%rsi) + ret + + +.globl _aes256gcmsiv_aes_ks +.private_extern _aes256gcmsiv_aes_ks + +.p2align 4 +_aes256gcmsiv_aes_ks: + +_CET_ENDBR + vmovdqu (%rdi),%xmm1 + vmovdqu 16(%rdi),%xmm3 + vmovdqa %xmm1,(%rsi) + vmovdqa %xmm3,16(%rsi) + vmovdqa con1(%rip),%xmm0 + vmovdqa mask(%rip),%xmm15 + vpxor %xmm14,%xmm14,%xmm14 + movq $6,%rax + +L$ks256_loop: + addq $32,%rsi + subq $1,%rax + vpshufb %xmm15,%xmm3,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm1,(%rsi) + vpshufd $0xff,%xmm1,%xmm2 + vaesenclast %xmm14,%xmm2,%xmm2 + vpsllq $32,%xmm3,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpshufb con3(%rip),%xmm3,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpxor %xmm2,%xmm3,%xmm3 + vmovdqa %xmm3,16(%rsi) + jne L$ks256_loop + + vpshufb %xmm15,%xmm3,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpsllq $32,%xmm1,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm1,32(%rsi) + ret + +.globl _aes128gcmsiv_aes_ks_enc_x1 +.private_extern _aes128gcmsiv_aes_ks_enc_x1 + +.p2align 4 +_aes128gcmsiv_aes_ks_enc_x1: + +_CET_ENDBR + vmovdqa (%rcx),%xmm1 + vmovdqa 0(%rdi),%xmm4 + + vmovdqa %xmm1,(%rdx) + vpxor %xmm1,%xmm4,%xmm4 + + vmovdqa con1(%rip),%xmm0 + vmovdqa mask(%rip),%xmm15 + + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + + vaesenc %xmm1,%xmm4,%xmm4 + vmovdqa %xmm1,16(%rdx) + + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld 
$1,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + + vaesenc %xmm1,%xmm4,%xmm4 + vmovdqa %xmm1,32(%rdx) + + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + + vaesenc %xmm1,%xmm4,%xmm4 + vmovdqa %xmm1,48(%rdx) + + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + + vaesenc %xmm1,%xmm4,%xmm4 + vmovdqa %xmm1,64(%rdx) + + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + + vaesenc %xmm1,%xmm4,%xmm4 + vmovdqa %xmm1,80(%rdx) + + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + + vaesenc %xmm1,%xmm4,%xmm4 + vmovdqa %xmm1,96(%rdx) + + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + + vaesenc %xmm1,%xmm4,%xmm4 + vmovdqa %xmm1,112(%rdx) + + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + + vaesenc %xmm1,%xmm4,%xmm4 + vmovdqa %xmm1,128(%rdx) + + + vmovdqa con2(%rip),%xmm0 + + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + 
vpsllq $32,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + + vaesenc %xmm1,%xmm4,%xmm4 + vmovdqa %xmm1,144(%rdx) + + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpsllq $32,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + + vaesenclast %xmm1,%xmm4,%xmm4 + vmovdqa %xmm1,160(%rdx) + + + vmovdqa %xmm4,0(%rsi) + ret + + +.globl _aes128gcmsiv_kdf +.private_extern _aes128gcmsiv_kdf + +.p2align 4 +_aes128gcmsiv_kdf: + +_CET_ENDBR + + + + + vmovdqa (%rdx),%xmm1 + vmovdqa 0(%rdi),%xmm9 + vmovdqa and_mask(%rip),%xmm12 + vmovdqa one(%rip),%xmm13 + vpshufd $0x90,%xmm9,%xmm9 + vpand %xmm12,%xmm9,%xmm9 + vpaddd %xmm13,%xmm9,%xmm10 + vpaddd %xmm13,%xmm10,%xmm11 + vpaddd %xmm13,%xmm11,%xmm12 + + vpxor %xmm1,%xmm9,%xmm9 + vpxor %xmm1,%xmm10,%xmm10 + vpxor %xmm1,%xmm11,%xmm11 + vpxor %xmm1,%xmm12,%xmm12 + + vmovdqa 16(%rdx),%xmm1 + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + + vmovdqa 32(%rdx),%xmm2 + vaesenc %xmm2,%xmm9,%xmm9 + vaesenc %xmm2,%xmm10,%xmm10 + vaesenc %xmm2,%xmm11,%xmm11 + vaesenc %xmm2,%xmm12,%xmm12 + + vmovdqa 48(%rdx),%xmm1 + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + + vmovdqa 64(%rdx),%xmm2 + vaesenc %xmm2,%xmm9,%xmm9 + vaesenc %xmm2,%xmm10,%xmm10 + vaesenc %xmm2,%xmm11,%xmm11 + vaesenc %xmm2,%xmm12,%xmm12 + + vmovdqa 80(%rdx),%xmm1 + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + + vmovdqa 96(%rdx),%xmm2 + vaesenc %xmm2,%xmm9,%xmm9 + vaesenc %xmm2,%xmm10,%xmm10 + vaesenc %xmm2,%xmm11,%xmm11 + vaesenc %xmm2,%xmm12,%xmm12 + + vmovdqa 112(%rdx),%xmm1 + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + + vmovdqa 
128(%rdx),%xmm2 + vaesenc %xmm2,%xmm9,%xmm9 + vaesenc %xmm2,%xmm10,%xmm10 + vaesenc %xmm2,%xmm11,%xmm11 + vaesenc %xmm2,%xmm12,%xmm12 + + vmovdqa 144(%rdx),%xmm1 + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + + vmovdqa 160(%rdx),%xmm2 + vaesenclast %xmm2,%xmm9,%xmm9 + vaesenclast %xmm2,%xmm10,%xmm10 + vaesenclast %xmm2,%xmm11,%xmm11 + vaesenclast %xmm2,%xmm12,%xmm12 + + + vmovdqa %xmm9,0(%rsi) + vmovdqa %xmm10,16(%rsi) + vmovdqa %xmm11,32(%rsi) + vmovdqa %xmm12,48(%rsi) + ret + + +.globl _aes128gcmsiv_enc_msg_x4 +.private_extern _aes128gcmsiv_enc_msg_x4 + +.p2align 4 +_aes128gcmsiv_enc_msg_x4: + +_CET_ENDBR + testq %r8,%r8 + jnz L$128_enc_msg_x4_start + ret + +L$128_enc_msg_x4_start: + pushq %r12 + + pushq %r13 + + + shrq $4,%r8 + movq %r8,%r10 + shlq $62,%r10 + shrq $62,%r10 + + + vmovdqa (%rdx),%xmm15 + vpor OR_MASK(%rip),%xmm15,%xmm15 + + vmovdqu four(%rip),%xmm4 + vmovdqa %xmm15,%xmm0 + vpaddd one(%rip),%xmm15,%xmm1 + vpaddd two(%rip),%xmm15,%xmm2 + vpaddd three(%rip),%xmm15,%xmm3 + + shrq $2,%r8 + je L$128_enc_msg_x4_check_remainder + + subq $64,%rsi + subq $64,%rdi + +L$128_enc_msg_x4_loop1: + addq $64,%rsi + addq $64,%rdi + + vmovdqa %xmm0,%xmm5 + vmovdqa %xmm1,%xmm6 + vmovdqa %xmm2,%xmm7 + vmovdqa %xmm3,%xmm8 + + vpxor (%rcx),%xmm5,%xmm5 + vpxor (%rcx),%xmm6,%xmm6 + vpxor (%rcx),%xmm7,%xmm7 + vpxor (%rcx),%xmm8,%xmm8 + + vmovdqu 16(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vpaddd %xmm4,%xmm0,%xmm0 + vmovdqu 32(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vpaddd %xmm4,%xmm1,%xmm1 + vmovdqu 48(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vpaddd %xmm4,%xmm2,%xmm2 + vmovdqu 64(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc 
%xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vpaddd %xmm4,%xmm3,%xmm3 + + vmovdqu 80(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 96(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 112(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 128(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 144(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 160(%rcx),%xmm12 + vaesenclast %xmm12,%xmm5,%xmm5 + vaesenclast %xmm12,%xmm6,%xmm6 + vaesenclast %xmm12,%xmm7,%xmm7 + vaesenclast %xmm12,%xmm8,%xmm8 + + + + vpxor 0(%rdi),%xmm5,%xmm5 + vpxor 16(%rdi),%xmm6,%xmm6 + vpxor 32(%rdi),%xmm7,%xmm7 + vpxor 48(%rdi),%xmm8,%xmm8 + + subq $1,%r8 + + vmovdqu %xmm5,0(%rsi) + vmovdqu %xmm6,16(%rsi) + vmovdqu %xmm7,32(%rsi) + vmovdqu %xmm8,48(%rsi) + + jne L$128_enc_msg_x4_loop1 + + addq $64,%rsi + addq $64,%rdi + +L$128_enc_msg_x4_check_remainder: + cmpq $0,%r10 + je L$128_enc_msg_x4_out + +L$128_enc_msg_x4_loop2: + + + vmovdqa %xmm0,%xmm5 + vpaddd one(%rip),%xmm0,%xmm0 + + vpxor (%rcx),%xmm5,%xmm5 + vaesenc 16(%rcx),%xmm5,%xmm5 + vaesenc 32(%rcx),%xmm5,%xmm5 + vaesenc 48(%rcx),%xmm5,%xmm5 + vaesenc 64(%rcx),%xmm5,%xmm5 + vaesenc 80(%rcx),%xmm5,%xmm5 + vaesenc 96(%rcx),%xmm5,%xmm5 + vaesenc 112(%rcx),%xmm5,%xmm5 + vaesenc 128(%rcx),%xmm5,%xmm5 + vaesenc 144(%rcx),%xmm5,%xmm5 + vaesenclast 160(%rcx),%xmm5,%xmm5 + + + vpxor (%rdi),%xmm5,%xmm5 + vmovdqu %xmm5,(%rsi) + + addq $16,%rdi + addq $16,%rsi + + subq $1,%r10 + jne L$128_enc_msg_x4_loop2 + +L$128_enc_msg_x4_out: + popq %r13 + + popq %r12 + + ret + + 
+.globl _aes128gcmsiv_enc_msg_x8 +.private_extern _aes128gcmsiv_enc_msg_x8 + +.p2align 4 +_aes128gcmsiv_enc_msg_x8: + +_CET_ENDBR + testq %r8,%r8 + jnz L$128_enc_msg_x8_start + ret + +L$128_enc_msg_x8_start: + pushq %r12 + + pushq %r13 + + pushq %rbp + + movq %rsp,%rbp + + + + subq $128,%rsp + andq $-64,%rsp + + shrq $4,%r8 + movq %r8,%r10 + shlq $61,%r10 + shrq $61,%r10 + + + vmovdqu (%rdx),%xmm1 + vpor OR_MASK(%rip),%xmm1,%xmm1 + + + vpaddd seven(%rip),%xmm1,%xmm0 + vmovdqu %xmm0,(%rsp) + vpaddd one(%rip),%xmm1,%xmm9 + vpaddd two(%rip),%xmm1,%xmm10 + vpaddd three(%rip),%xmm1,%xmm11 + vpaddd four(%rip),%xmm1,%xmm12 + vpaddd five(%rip),%xmm1,%xmm13 + vpaddd six(%rip),%xmm1,%xmm14 + vmovdqa %xmm1,%xmm0 + + shrq $3,%r8 + je L$128_enc_msg_x8_check_remainder + + subq $128,%rsi + subq $128,%rdi + +L$128_enc_msg_x8_loop1: + addq $128,%rsi + addq $128,%rdi + + vmovdqa %xmm0,%xmm1 + vmovdqa %xmm9,%xmm2 + vmovdqa %xmm10,%xmm3 + vmovdqa %xmm11,%xmm4 + vmovdqa %xmm12,%xmm5 + vmovdqa %xmm13,%xmm6 + vmovdqa %xmm14,%xmm7 + + vmovdqu (%rsp),%xmm8 + + vpxor (%rcx),%xmm1,%xmm1 + vpxor (%rcx),%xmm2,%xmm2 + vpxor (%rcx),%xmm3,%xmm3 + vpxor (%rcx),%xmm4,%xmm4 + vpxor (%rcx),%xmm5,%xmm5 + vpxor (%rcx),%xmm6,%xmm6 + vpxor (%rcx),%xmm7,%xmm7 + vpxor (%rcx),%xmm8,%xmm8 + + vmovdqu 16(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vmovdqu (%rsp),%xmm14 + vpaddd eight(%rip),%xmm14,%xmm14 + vmovdqu %xmm14,(%rsp) + vmovdqu 32(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpsubd one(%rip),%xmm14,%xmm14 + vmovdqu 48(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc 
%xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm0,%xmm0 + vmovdqu 64(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm9,%xmm9 + vmovdqu 80(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm10,%xmm10 + vmovdqu 96(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm11,%xmm11 + vmovdqu 112(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm12,%xmm12 + vmovdqu 128(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm13,%xmm13 + vmovdqu 144(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vmovdqu 160(%rcx),%xmm15 + vaesenclast %xmm15,%xmm1,%xmm1 + 
vaesenclast %xmm15,%xmm2,%xmm2 + vaesenclast %xmm15,%xmm3,%xmm3 + vaesenclast %xmm15,%xmm4,%xmm4 + vaesenclast %xmm15,%xmm5,%xmm5 + vaesenclast %xmm15,%xmm6,%xmm6 + vaesenclast %xmm15,%xmm7,%xmm7 + vaesenclast %xmm15,%xmm8,%xmm8 + + + + vpxor 0(%rdi),%xmm1,%xmm1 + vpxor 16(%rdi),%xmm2,%xmm2 + vpxor 32(%rdi),%xmm3,%xmm3 + vpxor 48(%rdi),%xmm4,%xmm4 + vpxor 64(%rdi),%xmm5,%xmm5 + vpxor 80(%rdi),%xmm6,%xmm6 + vpxor 96(%rdi),%xmm7,%xmm7 + vpxor 112(%rdi),%xmm8,%xmm8 + + decq %r8 + + vmovdqu %xmm1,0(%rsi) + vmovdqu %xmm2,16(%rsi) + vmovdqu %xmm3,32(%rsi) + vmovdqu %xmm4,48(%rsi) + vmovdqu %xmm5,64(%rsi) + vmovdqu %xmm6,80(%rsi) + vmovdqu %xmm7,96(%rsi) + vmovdqu %xmm8,112(%rsi) + + jne L$128_enc_msg_x8_loop1 + + addq $128,%rsi + addq $128,%rdi + +L$128_enc_msg_x8_check_remainder: + cmpq $0,%r10 + je L$128_enc_msg_x8_out + +L$128_enc_msg_x8_loop2: + + + vmovdqa %xmm0,%xmm1 + vpaddd one(%rip),%xmm0,%xmm0 + + vpxor (%rcx),%xmm1,%xmm1 + vaesenc 16(%rcx),%xmm1,%xmm1 + vaesenc 32(%rcx),%xmm1,%xmm1 + vaesenc 48(%rcx),%xmm1,%xmm1 + vaesenc 64(%rcx),%xmm1,%xmm1 + vaesenc 80(%rcx),%xmm1,%xmm1 + vaesenc 96(%rcx),%xmm1,%xmm1 + vaesenc 112(%rcx),%xmm1,%xmm1 + vaesenc 128(%rcx),%xmm1,%xmm1 + vaesenc 144(%rcx),%xmm1,%xmm1 + vaesenclast 160(%rcx),%xmm1,%xmm1 + + + vpxor (%rdi),%xmm1,%xmm1 + + vmovdqu %xmm1,(%rsi) + + addq $16,%rdi + addq $16,%rsi + + decq %r10 + jne L$128_enc_msg_x8_loop2 + +L$128_enc_msg_x8_out: + movq %rbp,%rsp + + popq %rbp + + popq %r13 + + popq %r12 + + ret + + +.globl _aes128gcmsiv_dec +.private_extern _aes128gcmsiv_dec + +.p2align 4 +_aes128gcmsiv_dec: + +_CET_ENDBR + testq $~15,%r9 + jnz L$128_dec_start + ret + +L$128_dec_start: + vzeroupper + vmovdqa (%rdx),%xmm0 + + + vmovdqu 16(%rdx),%xmm15 + vpor OR_MASK(%rip),%xmm15,%xmm15 + movq %rdx,%rax + + leaq 32(%rax),%rax + leaq 32(%rcx),%rcx + + andq $~15,%r9 + + + cmpq $96,%r9 + jb L$128_dec_loop2 + + + subq $96,%r9 + vmovdqa %xmm15,%xmm7 + vpaddd one(%rip),%xmm7,%xmm8 + vpaddd two(%rip),%xmm7,%xmm9 + vpaddd 
one(%rip),%xmm9,%xmm10 + vpaddd two(%rip),%xmm9,%xmm11 + vpaddd one(%rip),%xmm11,%xmm12 + vpaddd two(%rip),%xmm11,%xmm15 + + vpxor (%r8),%xmm7,%xmm7 + vpxor (%r8),%xmm8,%xmm8 + vpxor (%r8),%xmm9,%xmm9 + vpxor (%r8),%xmm10,%xmm10 + vpxor (%r8),%xmm11,%xmm11 + vpxor (%r8),%xmm12,%xmm12 + + vmovdqu 16(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 32(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 48(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 64(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 80(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 96(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 112(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 128(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 144(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + 
vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 160(%r8),%xmm4 + vaesenclast %xmm4,%xmm7,%xmm7 + vaesenclast %xmm4,%xmm8,%xmm8 + vaesenclast %xmm4,%xmm9,%xmm9 + vaesenclast %xmm4,%xmm10,%xmm10 + vaesenclast %xmm4,%xmm11,%xmm11 + vaesenclast %xmm4,%xmm12,%xmm12 + + + vpxor 0(%rdi),%xmm7,%xmm7 + vpxor 16(%rdi),%xmm8,%xmm8 + vpxor 32(%rdi),%xmm9,%xmm9 + vpxor 48(%rdi),%xmm10,%xmm10 + vpxor 64(%rdi),%xmm11,%xmm11 + vpxor 80(%rdi),%xmm12,%xmm12 + + vmovdqu %xmm7,0(%rsi) + vmovdqu %xmm8,16(%rsi) + vmovdqu %xmm9,32(%rsi) + vmovdqu %xmm10,48(%rsi) + vmovdqu %xmm11,64(%rsi) + vmovdqu %xmm12,80(%rsi) + + addq $96,%rdi + addq $96,%rsi + jmp L$128_dec_loop1 + + +.p2align 6 +L$128_dec_loop1: + cmpq $96,%r9 + jb L$128_dec_finish_96 + subq $96,%r9 + + vmovdqa %xmm12,%xmm6 + vmovdqa %xmm11,16-32(%rax) + vmovdqa %xmm10,32-32(%rax) + vmovdqa %xmm9,48-32(%rax) + vmovdqa %xmm8,64-32(%rax) + vmovdqa %xmm7,80-32(%rax) + + vmovdqa %xmm15,%xmm7 + vpaddd one(%rip),%xmm7,%xmm8 + vpaddd two(%rip),%xmm7,%xmm9 + vpaddd one(%rip),%xmm9,%xmm10 + vpaddd two(%rip),%xmm9,%xmm11 + vpaddd one(%rip),%xmm11,%xmm12 + vpaddd two(%rip),%xmm11,%xmm15 + + vmovdqa (%r8),%xmm4 + vpxor %xmm4,%xmm7,%xmm7 + vpxor %xmm4,%xmm8,%xmm8 + vpxor %xmm4,%xmm9,%xmm9 + vpxor %xmm4,%xmm10,%xmm10 + vpxor %xmm4,%xmm11,%xmm11 + vpxor %xmm4,%xmm12,%xmm12 + + vmovdqu 0-32(%rcx),%xmm4 + vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2 + vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3 + vpclmulqdq $0x01,%xmm4,%xmm6,%xmm1 + vpclmulqdq $0x10,%xmm4,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu 16(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu -16(%rax),%xmm6 + vmovdqu -16(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor 
%xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + + vmovdqu 32(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 0(%rax),%xmm6 + vmovdqu 0(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + + vmovdqu 48(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 16(%rax),%xmm6 + vmovdqu 16(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + + vmovdqu 64(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 32(%rax),%xmm6 + vmovdqu 32(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + + vmovdqu 80(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 96(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + 
vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 112(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + + vmovdqa 80-32(%rax),%xmm6 + vpxor %xmm0,%xmm6,%xmm6 + vmovdqu 80-32(%rcx),%xmm5 + + vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu 128(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + + vpsrldq $8,%xmm1,%xmm4 + vpxor %xmm4,%xmm2,%xmm5 + vpslldq $8,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm0 + + vmovdqa poly(%rip),%xmm3 + + vmovdqu 144(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 160(%r8),%xmm6 + vpalignr $8,%xmm0,%xmm0,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 + vpxor %xmm0,%xmm2,%xmm0 + + vpxor 0(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm7,%xmm7 + vpxor 16(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm8,%xmm8 + vpxor 32(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm9,%xmm9 + vpxor 48(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm10,%xmm10 + vpxor 64(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm11,%xmm11 + vpxor 80(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm12,%xmm12 + + vpalignr $8,%xmm0,%xmm0,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 + vpxor %xmm0,%xmm2,%xmm0 + + vmovdqu %xmm7,0(%rsi) + vmovdqu %xmm8,16(%rsi) + vmovdqu %xmm9,32(%rsi) + vmovdqu %xmm10,48(%rsi) + vmovdqu %xmm11,64(%rsi) + vmovdqu %xmm12,80(%rsi) + + vpxor %xmm5,%xmm0,%xmm0 + + leaq 96(%rdi),%rdi + leaq 96(%rsi),%rsi + jmp L$128_dec_loop1 + +L$128_dec_finish_96: + vmovdqa %xmm12,%xmm6 + 
vmovdqa %xmm11,16-32(%rax) + vmovdqa %xmm10,32-32(%rax) + vmovdqa %xmm9,48-32(%rax) + vmovdqa %xmm8,64-32(%rax) + vmovdqa %xmm7,80-32(%rax) + + vmovdqu 0-32(%rcx),%xmm4 + vpclmulqdq $0x10,%xmm4,%xmm6,%xmm1 + vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2 + vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3 + vpclmulqdq $0x01,%xmm4,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu -16(%rax),%xmm6 + vmovdqu -16(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu 0(%rax),%xmm6 + vmovdqu 0(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu 16(%rax),%xmm6 + vmovdqu 16(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu 32(%rax),%xmm6 + vmovdqu 32(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + + vmovdqu 80-32(%rax),%xmm6 + vpxor %xmm0,%xmm6,%xmm6 + vmovdqu 80-32(%rcx),%xmm5 + vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vpsrldq $8,%xmm1,%xmm4 + vpxor %xmm4,%xmm2,%xmm5 + vpslldq 
$8,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm0 + + vmovdqa poly(%rip),%xmm3 + + vpalignr $8,%xmm0,%xmm0,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 + vpxor %xmm0,%xmm2,%xmm0 + + vpalignr $8,%xmm0,%xmm0,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 + vpxor %xmm0,%xmm2,%xmm0 + + vpxor %xmm5,%xmm0,%xmm0 + +L$128_dec_loop2: + + + + cmpq $16,%r9 + jb L$128_dec_out + subq $16,%r9 + + vmovdqa %xmm15,%xmm2 + vpaddd one(%rip),%xmm15,%xmm15 + + vpxor 0(%r8),%xmm2,%xmm2 + vaesenc 16(%r8),%xmm2,%xmm2 + vaesenc 32(%r8),%xmm2,%xmm2 + vaesenc 48(%r8),%xmm2,%xmm2 + vaesenc 64(%r8),%xmm2,%xmm2 + vaesenc 80(%r8),%xmm2,%xmm2 + vaesenc 96(%r8),%xmm2,%xmm2 + vaesenc 112(%r8),%xmm2,%xmm2 + vaesenc 128(%r8),%xmm2,%xmm2 + vaesenc 144(%r8),%xmm2,%xmm2 + vaesenclast 160(%r8),%xmm2,%xmm2 + vpxor (%rdi),%xmm2,%xmm2 + vmovdqu %xmm2,(%rsi) + addq $16,%rdi + addq $16,%rsi + + vpxor %xmm2,%xmm0,%xmm0 + vmovdqa -32(%rcx),%xmm1 + call GFMUL + + jmp L$128_dec_loop2 + +L$128_dec_out: + vmovdqu %xmm0,(%rdx) + ret + + +.globl _aes128gcmsiv_ecb_enc_block +.private_extern _aes128gcmsiv_ecb_enc_block + +.p2align 4 +_aes128gcmsiv_ecb_enc_block: + +_CET_ENDBR + vmovdqa (%rdi),%xmm1 + + vpxor (%rdx),%xmm1,%xmm1 + vaesenc 16(%rdx),%xmm1,%xmm1 + vaesenc 32(%rdx),%xmm1,%xmm1 + vaesenc 48(%rdx),%xmm1,%xmm1 + vaesenc 64(%rdx),%xmm1,%xmm1 + vaesenc 80(%rdx),%xmm1,%xmm1 + vaesenc 96(%rdx),%xmm1,%xmm1 + vaesenc 112(%rdx),%xmm1,%xmm1 + vaesenc 128(%rdx),%xmm1,%xmm1 + vaesenc 144(%rdx),%xmm1,%xmm1 + vaesenclast 160(%rdx),%xmm1,%xmm1 + + vmovdqa %xmm1,(%rsi) + + ret + + +.globl _aes256gcmsiv_aes_ks_enc_x1 +.private_extern _aes256gcmsiv_aes_ks_enc_x1 + +.p2align 4 +_aes256gcmsiv_aes_ks_enc_x1: + +_CET_ENDBR + vmovdqa con1(%rip),%xmm0 + vmovdqa mask(%rip),%xmm15 + vmovdqa (%rdi),%xmm8 + vmovdqa (%rcx),%xmm1 + vmovdqa 16(%rcx),%xmm3 + vpxor %xmm1,%xmm8,%xmm8 + vaesenc %xmm3,%xmm8,%xmm8 + vmovdqu %xmm1,(%rdx) + vmovdqu %xmm3,16(%rdx) + vpxor %xmm14,%xmm14,%xmm14 + + vpshufb %xmm15,%xmm3,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld 
$1,%xmm0,%xmm0 + vpslldq $4,%xmm1,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vaesenc %xmm1,%xmm8,%xmm8 + vmovdqu %xmm1,32(%rdx) + + vpshufd $0xff,%xmm1,%xmm2 + vaesenclast %xmm14,%xmm2,%xmm2 + vpslldq $4,%xmm3,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpxor %xmm2,%xmm3,%xmm3 + vaesenc %xmm3,%xmm8,%xmm8 + vmovdqu %xmm3,48(%rdx) + + vpshufb %xmm15,%xmm3,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpslldq $4,%xmm1,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vaesenc %xmm1,%xmm8,%xmm8 + vmovdqu %xmm1,64(%rdx) + + vpshufd $0xff,%xmm1,%xmm2 + vaesenclast %xmm14,%xmm2,%xmm2 + vpslldq $4,%xmm3,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpxor %xmm2,%xmm3,%xmm3 + vaesenc %xmm3,%xmm8,%xmm8 + vmovdqu %xmm3,80(%rdx) + + vpshufb %xmm15,%xmm3,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpslldq $4,%xmm1,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vaesenc %xmm1,%xmm8,%xmm8 + vmovdqu %xmm1,96(%rdx) + + vpshufd $0xff,%xmm1,%xmm2 + vaesenclast %xmm14,%xmm2,%xmm2 + vpslldq $4,%xmm3,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpxor %xmm2,%xmm3,%xmm3 + vaesenc %xmm3,%xmm8,%xmm8 + vmovdqu %xmm3,112(%rdx) + + vpshufb %xmm15,%xmm3,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpslldq $4,%xmm1,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor 
%xmm4,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vaesenc %xmm1,%xmm8,%xmm8 + vmovdqu %xmm1,128(%rdx) + + vpshufd $0xff,%xmm1,%xmm2 + vaesenclast %xmm14,%xmm2,%xmm2 + vpslldq $4,%xmm3,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpxor %xmm2,%xmm3,%xmm3 + vaesenc %xmm3,%xmm8,%xmm8 + vmovdqu %xmm3,144(%rdx) + + vpshufb %xmm15,%xmm3,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpslldq $4,%xmm1,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vaesenc %xmm1,%xmm8,%xmm8 + vmovdqu %xmm1,160(%rdx) + + vpshufd $0xff,%xmm1,%xmm2 + vaesenclast %xmm14,%xmm2,%xmm2 + vpslldq $4,%xmm3,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpxor %xmm2,%xmm3,%xmm3 + vaesenc %xmm3,%xmm8,%xmm8 + vmovdqu %xmm3,176(%rdx) + + vpshufb %xmm15,%xmm3,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpslldq $4,%xmm1,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vaesenc %xmm1,%xmm8,%xmm8 + vmovdqu %xmm1,192(%rdx) + + vpshufd $0xff,%xmm1,%xmm2 + vaesenclast %xmm14,%xmm2,%xmm2 + vpslldq $4,%xmm3,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpxor %xmm2,%xmm3,%xmm3 + vaesenc %xmm3,%xmm8,%xmm8 + vmovdqu %xmm3,208(%rdx) + + vpshufb %xmm15,%xmm3,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslldq $4,%xmm1,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vaesenclast %xmm1,%xmm8,%xmm8 + vmovdqu %xmm1,224(%rdx) + + vmovdqa %xmm8,(%rsi) + ret + + +.globl _aes256gcmsiv_ecb_enc_block 
+.private_extern _aes256gcmsiv_ecb_enc_block + +.p2align 4 +_aes256gcmsiv_ecb_enc_block: + +_CET_ENDBR + vmovdqa (%rdi),%xmm1 + vpxor (%rdx),%xmm1,%xmm1 + vaesenc 16(%rdx),%xmm1,%xmm1 + vaesenc 32(%rdx),%xmm1,%xmm1 + vaesenc 48(%rdx),%xmm1,%xmm1 + vaesenc 64(%rdx),%xmm1,%xmm1 + vaesenc 80(%rdx),%xmm1,%xmm1 + vaesenc 96(%rdx),%xmm1,%xmm1 + vaesenc 112(%rdx),%xmm1,%xmm1 + vaesenc 128(%rdx),%xmm1,%xmm1 + vaesenc 144(%rdx),%xmm1,%xmm1 + vaesenc 160(%rdx),%xmm1,%xmm1 + vaesenc 176(%rdx),%xmm1,%xmm1 + vaesenc 192(%rdx),%xmm1,%xmm1 + vaesenc 208(%rdx),%xmm1,%xmm1 + vaesenclast 224(%rdx),%xmm1,%xmm1 + vmovdqa %xmm1,(%rsi) + ret + + +.globl _aes256gcmsiv_enc_msg_x4 +.private_extern _aes256gcmsiv_enc_msg_x4 + +.p2align 4 +_aes256gcmsiv_enc_msg_x4: + +_CET_ENDBR + testq %r8,%r8 + jnz L$256_enc_msg_x4_start + ret + +L$256_enc_msg_x4_start: + movq %r8,%r10 + shrq $4,%r8 + shlq $60,%r10 + jz L$256_enc_msg_x4_start2 + addq $1,%r8 + +L$256_enc_msg_x4_start2: + movq %r8,%r10 + shlq $62,%r10 + shrq $62,%r10 + + + vmovdqa (%rdx),%xmm15 + vpor OR_MASK(%rip),%xmm15,%xmm15 + + vmovdqa four(%rip),%xmm4 + vmovdqa %xmm15,%xmm0 + vpaddd one(%rip),%xmm15,%xmm1 + vpaddd two(%rip),%xmm15,%xmm2 + vpaddd three(%rip),%xmm15,%xmm3 + + shrq $2,%r8 + je L$256_enc_msg_x4_check_remainder + + subq $64,%rsi + subq $64,%rdi + +L$256_enc_msg_x4_loop1: + addq $64,%rsi + addq $64,%rdi + + vmovdqa %xmm0,%xmm5 + vmovdqa %xmm1,%xmm6 + vmovdqa %xmm2,%xmm7 + vmovdqa %xmm3,%xmm8 + + vpxor (%rcx),%xmm5,%xmm5 + vpxor (%rcx),%xmm6,%xmm6 + vpxor (%rcx),%xmm7,%xmm7 + vpxor (%rcx),%xmm8,%xmm8 + + vmovdqu 16(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vpaddd %xmm4,%xmm0,%xmm0 + vmovdqu 32(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vpaddd %xmm4,%xmm1,%xmm1 + vmovdqu 48(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + 
vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vpaddd %xmm4,%xmm2,%xmm2 + vmovdqu 64(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vpaddd %xmm4,%xmm3,%xmm3 + + vmovdqu 80(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 96(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 112(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 128(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 144(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 160(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 176(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 192(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 208(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 224(%rcx),%xmm12 + vaesenclast %xmm12,%xmm5,%xmm5 + vaesenclast %xmm12,%xmm6,%xmm6 + vaesenclast %xmm12,%xmm7,%xmm7 + vaesenclast %xmm12,%xmm8,%xmm8 + + + + vpxor 0(%rdi),%xmm5,%xmm5 + vpxor 16(%rdi),%xmm6,%xmm6 + vpxor 32(%rdi),%xmm7,%xmm7 + vpxor 48(%rdi),%xmm8,%xmm8 + + subq $1,%r8 + + vmovdqu %xmm5,0(%rsi) + vmovdqu %xmm6,16(%rsi) + vmovdqu %xmm7,32(%rsi) + vmovdqu %xmm8,48(%rsi) + + jne L$256_enc_msg_x4_loop1 + + addq 
$64,%rsi + addq $64,%rdi + +L$256_enc_msg_x4_check_remainder: + cmpq $0,%r10 + je L$256_enc_msg_x4_out + +L$256_enc_msg_x4_loop2: + + + + vmovdqa %xmm0,%xmm5 + vpaddd one(%rip),%xmm0,%xmm0 + vpxor (%rcx),%xmm5,%xmm5 + vaesenc 16(%rcx),%xmm5,%xmm5 + vaesenc 32(%rcx),%xmm5,%xmm5 + vaesenc 48(%rcx),%xmm5,%xmm5 + vaesenc 64(%rcx),%xmm5,%xmm5 + vaesenc 80(%rcx),%xmm5,%xmm5 + vaesenc 96(%rcx),%xmm5,%xmm5 + vaesenc 112(%rcx),%xmm5,%xmm5 + vaesenc 128(%rcx),%xmm5,%xmm5 + vaesenc 144(%rcx),%xmm5,%xmm5 + vaesenc 160(%rcx),%xmm5,%xmm5 + vaesenc 176(%rcx),%xmm5,%xmm5 + vaesenc 192(%rcx),%xmm5,%xmm5 + vaesenc 208(%rcx),%xmm5,%xmm5 + vaesenclast 224(%rcx),%xmm5,%xmm5 + + + vpxor (%rdi),%xmm5,%xmm5 + + vmovdqu %xmm5,(%rsi) + + addq $16,%rdi + addq $16,%rsi + + subq $1,%r10 + jne L$256_enc_msg_x4_loop2 + +L$256_enc_msg_x4_out: + ret + + +.globl _aes256gcmsiv_enc_msg_x8 +.private_extern _aes256gcmsiv_enc_msg_x8 + +.p2align 4 +_aes256gcmsiv_enc_msg_x8: + +_CET_ENDBR + testq %r8,%r8 + jnz L$256_enc_msg_x8_start + ret + +L$256_enc_msg_x8_start: + + movq %rsp,%r11 + subq $16,%r11 + andq $-64,%r11 + + movq %r8,%r10 + shrq $4,%r8 + shlq $60,%r10 + jz L$256_enc_msg_x8_start2 + addq $1,%r8 + +L$256_enc_msg_x8_start2: + movq %r8,%r10 + shlq $61,%r10 + shrq $61,%r10 + + + vmovdqa (%rdx),%xmm1 + vpor OR_MASK(%rip),%xmm1,%xmm1 + + + vpaddd seven(%rip),%xmm1,%xmm0 + vmovdqa %xmm0,(%r11) + vpaddd one(%rip),%xmm1,%xmm9 + vpaddd two(%rip),%xmm1,%xmm10 + vpaddd three(%rip),%xmm1,%xmm11 + vpaddd four(%rip),%xmm1,%xmm12 + vpaddd five(%rip),%xmm1,%xmm13 + vpaddd six(%rip),%xmm1,%xmm14 + vmovdqa %xmm1,%xmm0 + + shrq $3,%r8 + jz L$256_enc_msg_x8_check_remainder + + subq $128,%rsi + subq $128,%rdi + +L$256_enc_msg_x8_loop1: + addq $128,%rsi + addq $128,%rdi + + vmovdqa %xmm0,%xmm1 + vmovdqa %xmm9,%xmm2 + vmovdqa %xmm10,%xmm3 + vmovdqa %xmm11,%xmm4 + vmovdqa %xmm12,%xmm5 + vmovdqa %xmm13,%xmm6 + vmovdqa %xmm14,%xmm7 + + vmovdqa (%r11),%xmm8 + + vpxor (%rcx),%xmm1,%xmm1 + vpxor (%rcx),%xmm2,%xmm2 + vpxor 
(%rcx),%xmm3,%xmm3 + vpxor (%rcx),%xmm4,%xmm4 + vpxor (%rcx),%xmm5,%xmm5 + vpxor (%rcx),%xmm6,%xmm6 + vpxor (%rcx),%xmm7,%xmm7 + vpxor (%rcx),%xmm8,%xmm8 + + vmovdqu 16(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vmovdqa (%r11),%xmm14 + vpaddd eight(%rip),%xmm14,%xmm14 + vmovdqa %xmm14,(%r11) + vmovdqu 32(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpsubd one(%rip),%xmm14,%xmm14 + vmovdqu 48(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm0,%xmm0 + vmovdqu 64(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm9,%xmm9 + vmovdqu 80(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm10,%xmm10 + vmovdqu 96(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm11,%xmm11 + vmovdqu 112(%rcx),%xmm15 + 
vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm12,%xmm12 + vmovdqu 128(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm13,%xmm13 + vmovdqu 144(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vmovdqu 160(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vmovdqu 176(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vmovdqu 192(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vmovdqu 208(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vmovdqu 224(%rcx),%xmm15 + vaesenclast %xmm15,%xmm1,%xmm1 + vaesenclast %xmm15,%xmm2,%xmm2 + vaesenclast %xmm15,%xmm3,%xmm3 + 
vaesenclast %xmm15,%xmm4,%xmm4 + vaesenclast %xmm15,%xmm5,%xmm5 + vaesenclast %xmm15,%xmm6,%xmm6 + vaesenclast %xmm15,%xmm7,%xmm7 + vaesenclast %xmm15,%xmm8,%xmm8 + + + + vpxor 0(%rdi),%xmm1,%xmm1 + vpxor 16(%rdi),%xmm2,%xmm2 + vpxor 32(%rdi),%xmm3,%xmm3 + vpxor 48(%rdi),%xmm4,%xmm4 + vpxor 64(%rdi),%xmm5,%xmm5 + vpxor 80(%rdi),%xmm6,%xmm6 + vpxor 96(%rdi),%xmm7,%xmm7 + vpxor 112(%rdi),%xmm8,%xmm8 + + subq $1,%r8 + + vmovdqu %xmm1,0(%rsi) + vmovdqu %xmm2,16(%rsi) + vmovdqu %xmm3,32(%rsi) + vmovdqu %xmm4,48(%rsi) + vmovdqu %xmm5,64(%rsi) + vmovdqu %xmm6,80(%rsi) + vmovdqu %xmm7,96(%rsi) + vmovdqu %xmm8,112(%rsi) + + jne L$256_enc_msg_x8_loop1 + + addq $128,%rsi + addq $128,%rdi + +L$256_enc_msg_x8_check_remainder: + cmpq $0,%r10 + je L$256_enc_msg_x8_out + +L$256_enc_msg_x8_loop2: + + + vmovdqa %xmm0,%xmm1 + vpaddd one(%rip),%xmm0,%xmm0 + + vpxor (%rcx),%xmm1,%xmm1 + vaesenc 16(%rcx),%xmm1,%xmm1 + vaesenc 32(%rcx),%xmm1,%xmm1 + vaesenc 48(%rcx),%xmm1,%xmm1 + vaesenc 64(%rcx),%xmm1,%xmm1 + vaesenc 80(%rcx),%xmm1,%xmm1 + vaesenc 96(%rcx),%xmm1,%xmm1 + vaesenc 112(%rcx),%xmm1,%xmm1 + vaesenc 128(%rcx),%xmm1,%xmm1 + vaesenc 144(%rcx),%xmm1,%xmm1 + vaesenc 160(%rcx),%xmm1,%xmm1 + vaesenc 176(%rcx),%xmm1,%xmm1 + vaesenc 192(%rcx),%xmm1,%xmm1 + vaesenc 208(%rcx),%xmm1,%xmm1 + vaesenclast 224(%rcx),%xmm1,%xmm1 + + + vpxor (%rdi),%xmm1,%xmm1 + + vmovdqu %xmm1,(%rsi) + + addq $16,%rdi + addq $16,%rsi + subq $1,%r10 + jnz L$256_enc_msg_x8_loop2 + +L$256_enc_msg_x8_out: + ret + + + +.globl _aes256gcmsiv_dec +.private_extern _aes256gcmsiv_dec + +.p2align 4 +_aes256gcmsiv_dec: + +_CET_ENDBR + testq $~15,%r9 + jnz L$256_dec_start + ret + +L$256_dec_start: + vzeroupper + vmovdqa (%rdx),%xmm0 + + + vmovdqu 16(%rdx),%xmm15 + vpor OR_MASK(%rip),%xmm15,%xmm15 + movq %rdx,%rax + + leaq 32(%rax),%rax + leaq 32(%rcx),%rcx + + andq $~15,%r9 + + + cmpq $96,%r9 + jb L$256_dec_loop2 + + + subq $96,%r9 + vmovdqa %xmm15,%xmm7 + vpaddd one(%rip),%xmm7,%xmm8 + vpaddd two(%rip),%xmm7,%xmm9 + 
vpaddd one(%rip),%xmm9,%xmm10 + vpaddd two(%rip),%xmm9,%xmm11 + vpaddd one(%rip),%xmm11,%xmm12 + vpaddd two(%rip),%xmm11,%xmm15 + + vpxor (%r8),%xmm7,%xmm7 + vpxor (%r8),%xmm8,%xmm8 + vpxor (%r8),%xmm9,%xmm9 + vpxor (%r8),%xmm10,%xmm10 + vpxor (%r8),%xmm11,%xmm11 + vpxor (%r8),%xmm12,%xmm12 + + vmovdqu 16(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 32(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 48(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 64(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 80(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 96(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 112(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 128(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 144(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc 
%xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 160(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 176(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 192(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 208(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 224(%r8),%xmm4 + vaesenclast %xmm4,%xmm7,%xmm7 + vaesenclast %xmm4,%xmm8,%xmm8 + vaesenclast %xmm4,%xmm9,%xmm9 + vaesenclast %xmm4,%xmm10,%xmm10 + vaesenclast %xmm4,%xmm11,%xmm11 + vaesenclast %xmm4,%xmm12,%xmm12 + + + vpxor 0(%rdi),%xmm7,%xmm7 + vpxor 16(%rdi),%xmm8,%xmm8 + vpxor 32(%rdi),%xmm9,%xmm9 + vpxor 48(%rdi),%xmm10,%xmm10 + vpxor 64(%rdi),%xmm11,%xmm11 + vpxor 80(%rdi),%xmm12,%xmm12 + + vmovdqu %xmm7,0(%rsi) + vmovdqu %xmm8,16(%rsi) + vmovdqu %xmm9,32(%rsi) + vmovdqu %xmm10,48(%rsi) + vmovdqu %xmm11,64(%rsi) + vmovdqu %xmm12,80(%rsi) + + addq $96,%rdi + addq $96,%rsi + jmp L$256_dec_loop1 + + +.p2align 6 +L$256_dec_loop1: + cmpq $96,%r9 + jb L$256_dec_finish_96 + subq $96,%r9 + + vmovdqa %xmm12,%xmm6 + vmovdqa %xmm11,16-32(%rax) + vmovdqa %xmm10,32-32(%rax) + vmovdqa %xmm9,48-32(%rax) + vmovdqa %xmm8,64-32(%rax) + vmovdqa %xmm7,80-32(%rax) + + vmovdqa %xmm15,%xmm7 + vpaddd one(%rip),%xmm7,%xmm8 + vpaddd two(%rip),%xmm7,%xmm9 + vpaddd one(%rip),%xmm9,%xmm10 + vpaddd two(%rip),%xmm9,%xmm11 + vpaddd one(%rip),%xmm11,%xmm12 + vpaddd 
two(%rip),%xmm11,%xmm15 + + vmovdqa (%r8),%xmm4 + vpxor %xmm4,%xmm7,%xmm7 + vpxor %xmm4,%xmm8,%xmm8 + vpxor %xmm4,%xmm9,%xmm9 + vpxor %xmm4,%xmm10,%xmm10 + vpxor %xmm4,%xmm11,%xmm11 + vpxor %xmm4,%xmm12,%xmm12 + + vmovdqu 0-32(%rcx),%xmm4 + vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2 + vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3 + vpclmulqdq $0x01,%xmm4,%xmm6,%xmm1 + vpclmulqdq $0x10,%xmm4,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu 16(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu -16(%rax),%xmm6 + vmovdqu -16(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + + vmovdqu 32(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 0(%rax),%xmm6 + vmovdqu 0(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + + vmovdqu 48(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 16(%rax),%xmm6 + vmovdqu 16(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + + vmovdqu 64(%r8),%xmm4 + vaesenc 
%xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 32(%rax),%xmm6 + vmovdqu 32(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + + vmovdqu 80(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 96(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 112(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + + vmovdqa 80-32(%rax),%xmm6 + vpxor %xmm0,%xmm6,%xmm6 + vmovdqu 80-32(%rcx),%xmm5 + + vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu 128(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + + vpsrldq $8,%xmm1,%xmm4 + vpxor %xmm4,%xmm2,%xmm5 + vpslldq $8,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm0 + + vmovdqa poly(%rip),%xmm3 + + vmovdqu 144(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 160(%r8),%xmm4 + vaesenc 
%xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 176(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 192(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 208(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 224(%r8),%xmm6 + vpalignr $8,%xmm0,%xmm0,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 + vpxor %xmm0,%xmm2,%xmm0 + + vpxor 0(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm7,%xmm7 + vpxor 16(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm8,%xmm8 + vpxor 32(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm9,%xmm9 + vpxor 48(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm10,%xmm10 + vpxor 64(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm11,%xmm11 + vpxor 80(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm12,%xmm12 + + vpalignr $8,%xmm0,%xmm0,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 + vpxor %xmm0,%xmm2,%xmm0 + + vmovdqu %xmm7,0(%rsi) + vmovdqu %xmm8,16(%rsi) + vmovdqu %xmm9,32(%rsi) + vmovdqu %xmm10,48(%rsi) + vmovdqu %xmm11,64(%rsi) + vmovdqu %xmm12,80(%rsi) + + vpxor %xmm5,%xmm0,%xmm0 + + leaq 96(%rdi),%rdi + leaq 96(%rsi),%rsi + jmp L$256_dec_loop1 + +L$256_dec_finish_96: + vmovdqa %xmm12,%xmm6 + vmovdqa %xmm11,16-32(%rax) + vmovdqa %xmm10,32-32(%rax) + vmovdqa %xmm9,48-32(%rax) + vmovdqa %xmm8,64-32(%rax) + vmovdqa %xmm7,80-32(%rax) + + vmovdqu 0-32(%rcx),%xmm4 + vpclmulqdq $0x10,%xmm4,%xmm6,%xmm1 + vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2 + vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3 + vpclmulqdq $0x01,%xmm4,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 
+ + vmovdqu -16(%rax),%xmm6 + vmovdqu -16(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu 0(%rax),%xmm6 + vmovdqu 0(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu 16(%rax),%xmm6 + vmovdqu 16(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu 32(%rax),%xmm6 + vmovdqu 32(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + + vmovdqu 80-32(%rax),%xmm6 + vpxor %xmm0,%xmm6,%xmm6 + vmovdqu 80-32(%rcx),%xmm5 + vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vpsrldq $8,%xmm1,%xmm4 + vpxor %xmm4,%xmm2,%xmm5 + vpslldq $8,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm0 + + vmovdqa poly(%rip),%xmm3 + + vpalignr $8,%xmm0,%xmm0,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 + vpxor %xmm0,%xmm2,%xmm0 + + vpalignr $8,%xmm0,%xmm0,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 + vpxor %xmm0,%xmm2,%xmm0 + + vpxor %xmm5,%xmm0,%xmm0 + +L$256_dec_loop2: + + + + cmpq $16,%r9 + jb L$256_dec_out + 
subq $16,%r9 + + vmovdqa %xmm15,%xmm2 + vpaddd one(%rip),%xmm15,%xmm15 + + vpxor 0(%r8),%xmm2,%xmm2 + vaesenc 16(%r8),%xmm2,%xmm2 + vaesenc 32(%r8),%xmm2,%xmm2 + vaesenc 48(%r8),%xmm2,%xmm2 + vaesenc 64(%r8),%xmm2,%xmm2 + vaesenc 80(%r8),%xmm2,%xmm2 + vaesenc 96(%r8),%xmm2,%xmm2 + vaesenc 112(%r8),%xmm2,%xmm2 + vaesenc 128(%r8),%xmm2,%xmm2 + vaesenc 144(%r8),%xmm2,%xmm2 + vaesenc 160(%r8),%xmm2,%xmm2 + vaesenc 176(%r8),%xmm2,%xmm2 + vaesenc 192(%r8),%xmm2,%xmm2 + vaesenc 208(%r8),%xmm2,%xmm2 + vaesenclast 224(%r8),%xmm2,%xmm2 + vpxor (%rdi),%xmm2,%xmm2 + vmovdqu %xmm2,(%rsi) + addq $16,%rdi + addq $16,%rsi + + vpxor %xmm2,%xmm0,%xmm0 + vmovdqa -32(%rcx),%xmm1 + call GFMUL + + jmp L$256_dec_loop2 + +L$256_dec_out: + vmovdqu %xmm0,(%rdx) + ret + + +.globl _aes256gcmsiv_kdf +.private_extern _aes256gcmsiv_kdf + +.p2align 4 +_aes256gcmsiv_kdf: + +_CET_ENDBR + + + + + vmovdqa (%rdx),%xmm1 + vmovdqa 0(%rdi),%xmm4 + vmovdqa and_mask(%rip),%xmm11 + vmovdqa one(%rip),%xmm8 + vpshufd $0x90,%xmm4,%xmm4 + vpand %xmm11,%xmm4,%xmm4 + vpaddd %xmm8,%xmm4,%xmm6 + vpaddd %xmm8,%xmm6,%xmm7 + vpaddd %xmm8,%xmm7,%xmm11 + vpaddd %xmm8,%xmm11,%xmm12 + vpaddd %xmm8,%xmm12,%xmm13 + + vpxor %xmm1,%xmm4,%xmm4 + vpxor %xmm1,%xmm6,%xmm6 + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm1,%xmm11,%xmm11 + vpxor %xmm1,%xmm12,%xmm12 + vpxor %xmm1,%xmm13,%xmm13 + + vmovdqa 16(%rdx),%xmm1 + vaesenc %xmm1,%xmm4,%xmm4 + vaesenc %xmm1,%xmm6,%xmm6 + vaesenc %xmm1,%xmm7,%xmm7 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + + vmovdqa 32(%rdx),%xmm2 + vaesenc %xmm2,%xmm4,%xmm4 + vaesenc %xmm2,%xmm6,%xmm6 + vaesenc %xmm2,%xmm7,%xmm7 + vaesenc %xmm2,%xmm11,%xmm11 + vaesenc %xmm2,%xmm12,%xmm12 + vaesenc %xmm2,%xmm13,%xmm13 + + vmovdqa 48(%rdx),%xmm1 + vaesenc %xmm1,%xmm4,%xmm4 + vaesenc %xmm1,%xmm6,%xmm6 + vaesenc %xmm1,%xmm7,%xmm7 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + + vmovdqa 64(%rdx),%xmm2 + vaesenc %xmm2,%xmm4,%xmm4 + 
vaesenc %xmm2,%xmm6,%xmm6 + vaesenc %xmm2,%xmm7,%xmm7 + vaesenc %xmm2,%xmm11,%xmm11 + vaesenc %xmm2,%xmm12,%xmm12 + vaesenc %xmm2,%xmm13,%xmm13 + + vmovdqa 80(%rdx),%xmm1 + vaesenc %xmm1,%xmm4,%xmm4 + vaesenc %xmm1,%xmm6,%xmm6 + vaesenc %xmm1,%xmm7,%xmm7 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + + vmovdqa 96(%rdx),%xmm2 + vaesenc %xmm2,%xmm4,%xmm4 + vaesenc %xmm2,%xmm6,%xmm6 + vaesenc %xmm2,%xmm7,%xmm7 + vaesenc %xmm2,%xmm11,%xmm11 + vaesenc %xmm2,%xmm12,%xmm12 + vaesenc %xmm2,%xmm13,%xmm13 + + vmovdqa 112(%rdx),%xmm1 + vaesenc %xmm1,%xmm4,%xmm4 + vaesenc %xmm1,%xmm6,%xmm6 + vaesenc %xmm1,%xmm7,%xmm7 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + + vmovdqa 128(%rdx),%xmm2 + vaesenc %xmm2,%xmm4,%xmm4 + vaesenc %xmm2,%xmm6,%xmm6 + vaesenc %xmm2,%xmm7,%xmm7 + vaesenc %xmm2,%xmm11,%xmm11 + vaesenc %xmm2,%xmm12,%xmm12 + vaesenc %xmm2,%xmm13,%xmm13 + + vmovdqa 144(%rdx),%xmm1 + vaesenc %xmm1,%xmm4,%xmm4 + vaesenc %xmm1,%xmm6,%xmm6 + vaesenc %xmm1,%xmm7,%xmm7 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + + vmovdqa 160(%rdx),%xmm2 + vaesenc %xmm2,%xmm4,%xmm4 + vaesenc %xmm2,%xmm6,%xmm6 + vaesenc %xmm2,%xmm7,%xmm7 + vaesenc %xmm2,%xmm11,%xmm11 + vaesenc %xmm2,%xmm12,%xmm12 + vaesenc %xmm2,%xmm13,%xmm13 + + vmovdqa 176(%rdx),%xmm1 + vaesenc %xmm1,%xmm4,%xmm4 + vaesenc %xmm1,%xmm6,%xmm6 + vaesenc %xmm1,%xmm7,%xmm7 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + + vmovdqa 192(%rdx),%xmm2 + vaesenc %xmm2,%xmm4,%xmm4 + vaesenc %xmm2,%xmm6,%xmm6 + vaesenc %xmm2,%xmm7,%xmm7 + vaesenc %xmm2,%xmm11,%xmm11 + vaesenc %xmm2,%xmm12,%xmm12 + vaesenc %xmm2,%xmm13,%xmm13 + + vmovdqa 208(%rdx),%xmm1 + vaesenc %xmm1,%xmm4,%xmm4 + vaesenc %xmm1,%xmm6,%xmm6 + vaesenc %xmm1,%xmm7,%xmm7 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + + vmovdqa 224(%rdx),%xmm2 + 
vaesenclast %xmm2,%xmm4,%xmm4 + vaesenclast %xmm2,%xmm6,%xmm6 + vaesenclast %xmm2,%xmm7,%xmm7 + vaesenclast %xmm2,%xmm11,%xmm11 + vaesenclast %xmm2,%xmm12,%xmm12 + vaesenclast %xmm2,%xmm13,%xmm13 + + + vmovdqa %xmm4,0(%rsi) + vmovdqa %xmm6,16(%rsi) + vmovdqa %xmm7,32(%rsi) + vmovdqa %xmm11,48(%rsi) + vmovdqa %xmm12,64(%rsi) + vmovdqa %xmm13,80(%rsi) + ret + + +#endif diff --git a/third_party/boringssl/gen/crypto/aes128gcmsiv-x86_64-linux.S b/third_party/boringssl/gen/crypto/aes128gcmsiv-x86_64-linux.S new file mode 100644 index 00000000..a8de4a9a --- /dev/null +++ b/third_party/boringssl/gen/crypto/aes128gcmsiv-x86_64-linux.S @@ -0,0 +1,3091 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.section .rodata + +.align 16 +one: +.quad 1,0 +two: +.quad 2,0 +three: +.quad 3,0 +four: +.quad 4,0 +five: +.quad 5,0 +six: +.quad 6,0 +seven: +.quad 7,0 +eight: +.quad 8,0 + +OR_MASK: +.long 0x00000000,0x00000000,0x00000000,0x80000000 +poly: +.quad 0x1, 0xc200000000000000 +mask: +.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d +con1: +.long 1,1,1,1 +con2: +.long 0x1b,0x1b,0x1b,0x1b +con3: +.byte -1,-1,-1,-1,-1,-1,-1,-1,4,5,6,7,4,5,6,7 +and_mask: +.long 0,0xffffffff, 0xffffffff, 0xffffffff +.text +.type GFMUL,@function +.align 16 +GFMUL: +.cfi_startproc + vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2 + vpclmulqdq $0x11,%xmm1,%xmm0,%xmm5 + vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3 + vpclmulqdq $0x01,%xmm1,%xmm0,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $8,%xmm3,%xmm4 + vpsrldq $8,%xmm3,%xmm3 + vpxor %xmm4,%xmm2,%xmm2 + vpxor %xmm3,%xmm5,%xmm5 + + vpclmulqdq $0x10,poly(%rip),%xmm2,%xmm3 + vpshufd $78,%xmm2,%xmm4 + vpxor %xmm4,%xmm3,%xmm2 + + vpclmulqdq $0x10,poly(%rip),%xmm2,%xmm3 + vpshufd $78,%xmm2,%xmm4 + vpxor %xmm4,%xmm3,%xmm2 + + vpxor %xmm5,%xmm2,%xmm0 + ret +.cfi_endproc +.size GFMUL, .-GFMUL +.globl aesgcmsiv_htable_init 
+.hidden aesgcmsiv_htable_init +.type aesgcmsiv_htable_init,@function +.align 16 +aesgcmsiv_htable_init: +.cfi_startproc +_CET_ENDBR + vmovdqa (%rsi),%xmm0 + vmovdqa %xmm0,%xmm1 + vmovdqa %xmm0,(%rdi) + call GFMUL + vmovdqa %xmm0,16(%rdi) + call GFMUL + vmovdqa %xmm0,32(%rdi) + call GFMUL + vmovdqa %xmm0,48(%rdi) + call GFMUL + vmovdqa %xmm0,64(%rdi) + call GFMUL + vmovdqa %xmm0,80(%rdi) + call GFMUL + vmovdqa %xmm0,96(%rdi) + call GFMUL + vmovdqa %xmm0,112(%rdi) + ret +.cfi_endproc +.size aesgcmsiv_htable_init, .-aesgcmsiv_htable_init +.globl aesgcmsiv_htable6_init +.hidden aesgcmsiv_htable6_init +.type aesgcmsiv_htable6_init,@function +.align 16 +aesgcmsiv_htable6_init: +.cfi_startproc +_CET_ENDBR + vmovdqa (%rsi),%xmm0 + vmovdqa %xmm0,%xmm1 + vmovdqa %xmm0,(%rdi) + call GFMUL + vmovdqa %xmm0,16(%rdi) + call GFMUL + vmovdqa %xmm0,32(%rdi) + call GFMUL + vmovdqa %xmm0,48(%rdi) + call GFMUL + vmovdqa %xmm0,64(%rdi) + call GFMUL + vmovdqa %xmm0,80(%rdi) + ret +.cfi_endproc +.size aesgcmsiv_htable6_init, .-aesgcmsiv_htable6_init +.globl aesgcmsiv_htable_polyval +.hidden aesgcmsiv_htable_polyval +.type aesgcmsiv_htable_polyval,@function +.align 16 +aesgcmsiv_htable_polyval: +.cfi_startproc +_CET_ENDBR + testq %rdx,%rdx + jnz .Lhtable_polyval_start + ret + +.Lhtable_polyval_start: + vzeroall + + + + movq %rdx,%r11 + andq $127,%r11 + + jz .Lhtable_polyval_no_prefix + + vpxor %xmm9,%xmm9,%xmm9 + vmovdqa (%rcx),%xmm1 + subq %r11,%rdx + + subq $16,%r11 + + + vmovdqu (%rsi),%xmm0 + vpxor %xmm1,%xmm0,%xmm0 + + vpclmulqdq $0x01,(%rdi,%r11,1),%xmm0,%xmm5 + vpclmulqdq $0x00,(%rdi,%r11,1),%xmm0,%xmm3 + vpclmulqdq $0x11,(%rdi,%r11,1),%xmm0,%xmm4 + vpclmulqdq $0x10,(%rdi,%r11,1),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + + leaq 16(%rsi),%rsi + testq %r11,%r11 + jnz .Lhtable_polyval_prefix_loop + jmp .Lhtable_polyval_prefix_complete + + +.align 64 +.Lhtable_polyval_prefix_loop: + subq $16,%r11 + + vmovdqu (%rsi),%xmm0 + + vpclmulqdq $0x00,(%rdi,%r11,1),%xmm0,%xmm6 + vpxor 
%xmm6,%xmm3,%xmm3 + vpclmulqdq $0x11,(%rdi,%r11,1),%xmm0,%xmm6 + vpxor %xmm6,%xmm4,%xmm4 + vpclmulqdq $0x01,(%rdi,%r11,1),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x10,(%rdi,%r11,1),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + + testq %r11,%r11 + + leaq 16(%rsi),%rsi + + jnz .Lhtable_polyval_prefix_loop + +.Lhtable_polyval_prefix_complete: + vpsrldq $8,%xmm5,%xmm6 + vpslldq $8,%xmm5,%xmm5 + + vpxor %xmm6,%xmm4,%xmm9 + vpxor %xmm5,%xmm3,%xmm1 + + jmp .Lhtable_polyval_main_loop + +.Lhtable_polyval_no_prefix: + + + + + vpxor %xmm1,%xmm1,%xmm1 + vmovdqa (%rcx),%xmm9 + +.align 64 +.Lhtable_polyval_main_loop: + subq $0x80,%rdx + jb .Lhtable_polyval_out + + vmovdqu 112(%rsi),%xmm0 + + vpclmulqdq $0x01,(%rdi),%xmm0,%xmm5 + vpclmulqdq $0x00,(%rdi),%xmm0,%xmm3 + vpclmulqdq $0x11,(%rdi),%xmm0,%xmm4 + vpclmulqdq $0x10,(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + + + vmovdqu 96(%rsi),%xmm0 + vpclmulqdq $0x01,16(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x00,16(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm3,%xmm3 + vpclmulqdq $0x11,16(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm4,%xmm4 + vpclmulqdq $0x10,16(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + + + + vmovdqu 80(%rsi),%xmm0 + + vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm7 + vpalignr $8,%xmm1,%xmm1,%xmm1 + + vpclmulqdq $0x01,32(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x00,32(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm3,%xmm3 + vpclmulqdq $0x11,32(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm4,%xmm4 + vpclmulqdq $0x10,32(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + + + vpxor %xmm7,%xmm1,%xmm1 + + vmovdqu 64(%rsi),%xmm0 + + vpclmulqdq $0x01,48(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x00,48(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm3,%xmm3 + vpclmulqdq $0x11,48(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm4,%xmm4 + vpclmulqdq $0x10,48(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + + + vmovdqu 48(%rsi),%xmm0 + + vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm7 + vpalignr $8,%xmm1,%xmm1,%xmm1 + + vpclmulqdq 
$0x01,64(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x00,64(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm3,%xmm3 + vpclmulqdq $0x11,64(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm4,%xmm4 + vpclmulqdq $0x10,64(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + + + vpxor %xmm7,%xmm1,%xmm1 + + vmovdqu 32(%rsi),%xmm0 + + vpclmulqdq $0x01,80(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x00,80(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm3,%xmm3 + vpclmulqdq $0x11,80(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm4,%xmm4 + vpclmulqdq $0x10,80(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + + + vpxor %xmm9,%xmm1,%xmm1 + + vmovdqu 16(%rsi),%xmm0 + + vpclmulqdq $0x01,96(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x00,96(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm3,%xmm3 + vpclmulqdq $0x11,96(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm4,%xmm4 + vpclmulqdq $0x10,96(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + + + vmovdqu 0(%rsi),%xmm0 + vpxor %xmm1,%xmm0,%xmm0 + + vpclmulqdq $0x01,112(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + vpclmulqdq $0x00,112(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm3,%xmm3 + vpclmulqdq $0x11,112(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm4,%xmm4 + vpclmulqdq $0x10,112(%rdi),%xmm0,%xmm6 + vpxor %xmm6,%xmm5,%xmm5 + + + vpsrldq $8,%xmm5,%xmm6 + vpslldq $8,%xmm5,%xmm5 + + vpxor %xmm6,%xmm4,%xmm9 + vpxor %xmm5,%xmm3,%xmm1 + + leaq 128(%rsi),%rsi + jmp .Lhtable_polyval_main_loop + + + +.Lhtable_polyval_out: + vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm6 + vpalignr $8,%xmm1,%xmm1,%xmm1 + vpxor %xmm6,%xmm1,%xmm1 + + vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm6 + vpalignr $8,%xmm1,%xmm1,%xmm1 + vpxor %xmm6,%xmm1,%xmm1 + vpxor %xmm9,%xmm1,%xmm1 + + vmovdqu %xmm1,(%rcx) + vzeroupper + ret +.cfi_endproc +.size aesgcmsiv_htable_polyval,.-aesgcmsiv_htable_polyval +.globl aesgcmsiv_polyval_horner +.hidden aesgcmsiv_polyval_horner +.type aesgcmsiv_polyval_horner,@function +.align 16 +aesgcmsiv_polyval_horner: +.cfi_startproc +_CET_ENDBR + testq %rcx,%rcx + jnz .Lpolyval_horner_start + ret + 
+.Lpolyval_horner_start: + + + + xorq %r10,%r10 + shlq $4,%rcx + + vmovdqa (%rsi),%xmm1 + vmovdqa (%rdi),%xmm0 + +.Lpolyval_horner_loop: + vpxor (%rdx,%r10,1),%xmm0,%xmm0 + call GFMUL + + addq $16,%r10 + cmpq %r10,%rcx + jne .Lpolyval_horner_loop + + + vmovdqa %xmm0,(%rdi) + ret +.cfi_endproc +.size aesgcmsiv_polyval_horner,.-aesgcmsiv_polyval_horner +.globl aes128gcmsiv_aes_ks +.hidden aes128gcmsiv_aes_ks +.type aes128gcmsiv_aes_ks,@function +.align 16 +aes128gcmsiv_aes_ks: +.cfi_startproc +_CET_ENDBR + vmovdqu (%rdi),%xmm1 + vmovdqa %xmm1,(%rsi) + + vmovdqa con1(%rip),%xmm0 + vmovdqa mask(%rip),%xmm15 + + movq $8,%rax + +.Lks128_loop: + addq $16,%rsi + subq $1,%rax + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpslldq $4,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpslldq $4,%xmm3,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpslldq $4,%xmm3,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm1,(%rsi) + jne .Lks128_loop + + vmovdqa con2(%rip),%xmm0 + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpslldq $4,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpslldq $4,%xmm3,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpslldq $4,%xmm3,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm1,16(%rsi) + + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslldq $4,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpslldq $4,%xmm3,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpslldq $4,%xmm3,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm1,32(%rsi) + ret +.cfi_endproc +.size aes128gcmsiv_aes_ks,.-aes128gcmsiv_aes_ks +.globl aes256gcmsiv_aes_ks +.hidden aes256gcmsiv_aes_ks +.type aes256gcmsiv_aes_ks,@function +.align 16 +aes256gcmsiv_aes_ks: +.cfi_startproc +_CET_ENDBR + vmovdqu (%rdi),%xmm1 + vmovdqu 16(%rdi),%xmm3 + vmovdqa %xmm1,(%rsi) + vmovdqa %xmm3,16(%rsi) + vmovdqa con1(%rip),%xmm0 + vmovdqa mask(%rip),%xmm15 + vpxor %xmm14,%xmm14,%xmm14 + movq $6,%rax + 
+.Lks256_loop: + addq $32,%rsi + subq $1,%rax + vpshufb %xmm15,%xmm3,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm1,(%rsi) + vpshufd $0xff,%xmm1,%xmm2 + vaesenclast %xmm14,%xmm2,%xmm2 + vpsllq $32,%xmm3,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpshufb con3(%rip),%xmm3,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpxor %xmm2,%xmm3,%xmm3 + vmovdqa %xmm3,16(%rsi) + jne .Lks256_loop + + vpshufb %xmm15,%xmm3,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpsllq $32,%xmm1,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vmovdqa %xmm1,32(%rsi) + ret +.cfi_endproc +.globl aes128gcmsiv_aes_ks_enc_x1 +.hidden aes128gcmsiv_aes_ks_enc_x1 +.type aes128gcmsiv_aes_ks_enc_x1,@function +.align 16 +aes128gcmsiv_aes_ks_enc_x1: +.cfi_startproc +_CET_ENDBR + vmovdqa (%rcx),%xmm1 + vmovdqa 0(%rdi),%xmm4 + + vmovdqa %xmm1,(%rdx) + vpxor %xmm1,%xmm4,%xmm4 + + vmovdqa con1(%rip),%xmm0 + vmovdqa mask(%rip),%xmm15 + + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + + vaesenc %xmm1,%xmm4,%xmm4 + vmovdqa %xmm1,16(%rdx) + + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + + vaesenc %xmm1,%xmm4,%xmm4 + vmovdqa %xmm1,32(%rdx) + + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + + vaesenc %xmm1,%xmm4,%xmm4 + vmovdqa %xmm1,48(%rdx) + + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast 
%xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + + vaesenc %xmm1,%xmm4,%xmm4 + vmovdqa %xmm1,64(%rdx) + + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + + vaesenc %xmm1,%xmm4,%xmm4 + vmovdqa %xmm1,80(%rdx) + + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + + vaesenc %xmm1,%xmm4,%xmm4 + vmovdqa %xmm1,96(%rdx) + + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + + vaesenc %xmm1,%xmm4,%xmm4 + vmovdqa %xmm1,112(%rdx) + + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + + vaesenc %xmm1,%xmm4,%xmm4 + vmovdqa %xmm1,128(%rdx) + + + vmovdqa con2(%rip),%xmm0 + + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpsllq $32,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + + vaesenc %xmm1,%xmm4,%xmm4 + vmovdqa %xmm1,144(%rdx) + + vpshufb %xmm15,%xmm1,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpsllq $32,%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpshufb con3(%rip),%xmm1,%xmm3 + vpxor %xmm3,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + + vaesenclast %xmm1,%xmm4,%xmm4 + vmovdqa %xmm1,160(%rdx) + + + vmovdqa %xmm4,0(%rsi) + ret +.cfi_endproc +.size 
aes128gcmsiv_aes_ks_enc_x1,.-aes128gcmsiv_aes_ks_enc_x1 +.globl aes128gcmsiv_kdf +.hidden aes128gcmsiv_kdf +.type aes128gcmsiv_kdf,@function +.align 16 +aes128gcmsiv_kdf: +.cfi_startproc +_CET_ENDBR + + + + + vmovdqa (%rdx),%xmm1 + vmovdqa 0(%rdi),%xmm9 + vmovdqa and_mask(%rip),%xmm12 + vmovdqa one(%rip),%xmm13 + vpshufd $0x90,%xmm9,%xmm9 + vpand %xmm12,%xmm9,%xmm9 + vpaddd %xmm13,%xmm9,%xmm10 + vpaddd %xmm13,%xmm10,%xmm11 + vpaddd %xmm13,%xmm11,%xmm12 + + vpxor %xmm1,%xmm9,%xmm9 + vpxor %xmm1,%xmm10,%xmm10 + vpxor %xmm1,%xmm11,%xmm11 + vpxor %xmm1,%xmm12,%xmm12 + + vmovdqa 16(%rdx),%xmm1 + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + + vmovdqa 32(%rdx),%xmm2 + vaesenc %xmm2,%xmm9,%xmm9 + vaesenc %xmm2,%xmm10,%xmm10 + vaesenc %xmm2,%xmm11,%xmm11 + vaesenc %xmm2,%xmm12,%xmm12 + + vmovdqa 48(%rdx),%xmm1 + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + + vmovdqa 64(%rdx),%xmm2 + vaesenc %xmm2,%xmm9,%xmm9 + vaesenc %xmm2,%xmm10,%xmm10 + vaesenc %xmm2,%xmm11,%xmm11 + vaesenc %xmm2,%xmm12,%xmm12 + + vmovdqa 80(%rdx),%xmm1 + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + + vmovdqa 96(%rdx),%xmm2 + vaesenc %xmm2,%xmm9,%xmm9 + vaesenc %xmm2,%xmm10,%xmm10 + vaesenc %xmm2,%xmm11,%xmm11 + vaesenc %xmm2,%xmm12,%xmm12 + + vmovdqa 112(%rdx),%xmm1 + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + + vmovdqa 128(%rdx),%xmm2 + vaesenc %xmm2,%xmm9,%xmm9 + vaesenc %xmm2,%xmm10,%xmm10 + vaesenc %xmm2,%xmm11,%xmm11 + vaesenc %xmm2,%xmm12,%xmm12 + + vmovdqa 144(%rdx),%xmm1 + vaesenc %xmm1,%xmm9,%xmm9 + vaesenc %xmm1,%xmm10,%xmm10 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + + vmovdqa 160(%rdx),%xmm2 + vaesenclast %xmm2,%xmm9,%xmm9 + vaesenclast %xmm2,%xmm10,%xmm10 + vaesenclast 
%xmm2,%xmm11,%xmm11 + vaesenclast %xmm2,%xmm12,%xmm12 + + + vmovdqa %xmm9,0(%rsi) + vmovdqa %xmm10,16(%rsi) + vmovdqa %xmm11,32(%rsi) + vmovdqa %xmm12,48(%rsi) + ret +.cfi_endproc +.size aes128gcmsiv_kdf,.-aes128gcmsiv_kdf +.globl aes128gcmsiv_enc_msg_x4 +.hidden aes128gcmsiv_enc_msg_x4 +.type aes128gcmsiv_enc_msg_x4,@function +.align 16 +aes128gcmsiv_enc_msg_x4: +.cfi_startproc +_CET_ENDBR + testq %r8,%r8 + jnz .L128_enc_msg_x4_start + ret + +.L128_enc_msg_x4_start: + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-16 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-24 + + shrq $4,%r8 + movq %r8,%r10 + shlq $62,%r10 + shrq $62,%r10 + + + vmovdqa (%rdx),%xmm15 + vpor OR_MASK(%rip),%xmm15,%xmm15 + + vmovdqu four(%rip),%xmm4 + vmovdqa %xmm15,%xmm0 + vpaddd one(%rip),%xmm15,%xmm1 + vpaddd two(%rip),%xmm15,%xmm2 + vpaddd three(%rip),%xmm15,%xmm3 + + shrq $2,%r8 + je .L128_enc_msg_x4_check_remainder + + subq $64,%rsi + subq $64,%rdi + +.L128_enc_msg_x4_loop1: + addq $64,%rsi + addq $64,%rdi + + vmovdqa %xmm0,%xmm5 + vmovdqa %xmm1,%xmm6 + vmovdqa %xmm2,%xmm7 + vmovdqa %xmm3,%xmm8 + + vpxor (%rcx),%xmm5,%xmm5 + vpxor (%rcx),%xmm6,%xmm6 + vpxor (%rcx),%xmm7,%xmm7 + vpxor (%rcx),%xmm8,%xmm8 + + vmovdqu 16(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vpaddd %xmm4,%xmm0,%xmm0 + vmovdqu 32(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vpaddd %xmm4,%xmm1,%xmm1 + vmovdqu 48(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vpaddd %xmm4,%xmm2,%xmm2 + vmovdqu 64(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vpaddd %xmm4,%xmm3,%xmm3 + + vmovdqu 80(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc 
%xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 96(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 112(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 128(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 144(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 160(%rcx),%xmm12 + vaesenclast %xmm12,%xmm5,%xmm5 + vaesenclast %xmm12,%xmm6,%xmm6 + vaesenclast %xmm12,%xmm7,%xmm7 + vaesenclast %xmm12,%xmm8,%xmm8 + + + + vpxor 0(%rdi),%xmm5,%xmm5 + vpxor 16(%rdi),%xmm6,%xmm6 + vpxor 32(%rdi),%xmm7,%xmm7 + vpxor 48(%rdi),%xmm8,%xmm8 + + subq $1,%r8 + + vmovdqu %xmm5,0(%rsi) + vmovdqu %xmm6,16(%rsi) + vmovdqu %xmm7,32(%rsi) + vmovdqu %xmm8,48(%rsi) + + jne .L128_enc_msg_x4_loop1 + + addq $64,%rsi + addq $64,%rdi + +.L128_enc_msg_x4_check_remainder: + cmpq $0,%r10 + je .L128_enc_msg_x4_out + +.L128_enc_msg_x4_loop2: + + + vmovdqa %xmm0,%xmm5 + vpaddd one(%rip),%xmm0,%xmm0 + + vpxor (%rcx),%xmm5,%xmm5 + vaesenc 16(%rcx),%xmm5,%xmm5 + vaesenc 32(%rcx),%xmm5,%xmm5 + vaesenc 48(%rcx),%xmm5,%xmm5 + vaesenc 64(%rcx),%xmm5,%xmm5 + vaesenc 80(%rcx),%xmm5,%xmm5 + vaesenc 96(%rcx),%xmm5,%xmm5 + vaesenc 112(%rcx),%xmm5,%xmm5 + vaesenc 128(%rcx),%xmm5,%xmm5 + vaesenc 144(%rcx),%xmm5,%xmm5 + vaesenclast 160(%rcx),%xmm5,%xmm5 + + + vpxor (%rdi),%xmm5,%xmm5 + vmovdqu %xmm5,(%rsi) + + addq $16,%rdi + addq $16,%rsi + + subq $1,%r10 + jne .L128_enc_msg_x4_loop2 + +.L128_enc_msg_x4_out: + popq %r13 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r13 + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + ret +.cfi_endproc +.size aes128gcmsiv_enc_msg_x4,.-aes128gcmsiv_enc_msg_x4 +.globl aes128gcmsiv_enc_msg_x8 +.hidden 
aes128gcmsiv_enc_msg_x8 +.type aes128gcmsiv_enc_msg_x8,@function +.align 16 +aes128gcmsiv_enc_msg_x8: +.cfi_startproc +_CET_ENDBR + testq %r8,%r8 + jnz .L128_enc_msg_x8_start + ret + +.L128_enc_msg_x8_start: + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-16 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-24 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-32 + movq %rsp,%rbp +.cfi_def_cfa_register rbp + + + subq $128,%rsp + andq $-64,%rsp + + shrq $4,%r8 + movq %r8,%r10 + shlq $61,%r10 + shrq $61,%r10 + + + vmovdqu (%rdx),%xmm1 + vpor OR_MASK(%rip),%xmm1,%xmm1 + + + vpaddd seven(%rip),%xmm1,%xmm0 + vmovdqu %xmm0,(%rsp) + vpaddd one(%rip),%xmm1,%xmm9 + vpaddd two(%rip),%xmm1,%xmm10 + vpaddd three(%rip),%xmm1,%xmm11 + vpaddd four(%rip),%xmm1,%xmm12 + vpaddd five(%rip),%xmm1,%xmm13 + vpaddd six(%rip),%xmm1,%xmm14 + vmovdqa %xmm1,%xmm0 + + shrq $3,%r8 + je .L128_enc_msg_x8_check_remainder + + subq $128,%rsi + subq $128,%rdi + +.L128_enc_msg_x8_loop1: + addq $128,%rsi + addq $128,%rdi + + vmovdqa %xmm0,%xmm1 + vmovdqa %xmm9,%xmm2 + vmovdqa %xmm10,%xmm3 + vmovdqa %xmm11,%xmm4 + vmovdqa %xmm12,%xmm5 + vmovdqa %xmm13,%xmm6 + vmovdqa %xmm14,%xmm7 + + vmovdqu (%rsp),%xmm8 + + vpxor (%rcx),%xmm1,%xmm1 + vpxor (%rcx),%xmm2,%xmm2 + vpxor (%rcx),%xmm3,%xmm3 + vpxor (%rcx),%xmm4,%xmm4 + vpxor (%rcx),%xmm5,%xmm5 + vpxor (%rcx),%xmm6,%xmm6 + vpxor (%rcx),%xmm7,%xmm7 + vpxor (%rcx),%xmm8,%xmm8 + + vmovdqu 16(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vmovdqu (%rsp),%xmm14 + vpaddd eight(%rip),%xmm14,%xmm14 + vmovdqu %xmm14,(%rsp) + vmovdqu 32(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc 
%xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpsubd one(%rip),%xmm14,%xmm14 + vmovdqu 48(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm0,%xmm0 + vmovdqu 64(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm9,%xmm9 + vmovdqu 80(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm10,%xmm10 + vmovdqu 96(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm11,%xmm11 + vmovdqu 112(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm12,%xmm12 + vmovdqu 128(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm13,%xmm13 + vmovdqu 144(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + 
vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vmovdqu 160(%rcx),%xmm15 + vaesenclast %xmm15,%xmm1,%xmm1 + vaesenclast %xmm15,%xmm2,%xmm2 + vaesenclast %xmm15,%xmm3,%xmm3 + vaesenclast %xmm15,%xmm4,%xmm4 + vaesenclast %xmm15,%xmm5,%xmm5 + vaesenclast %xmm15,%xmm6,%xmm6 + vaesenclast %xmm15,%xmm7,%xmm7 + vaesenclast %xmm15,%xmm8,%xmm8 + + + + vpxor 0(%rdi),%xmm1,%xmm1 + vpxor 16(%rdi),%xmm2,%xmm2 + vpxor 32(%rdi),%xmm3,%xmm3 + vpxor 48(%rdi),%xmm4,%xmm4 + vpxor 64(%rdi),%xmm5,%xmm5 + vpxor 80(%rdi),%xmm6,%xmm6 + vpxor 96(%rdi),%xmm7,%xmm7 + vpxor 112(%rdi),%xmm8,%xmm8 + + decq %r8 + + vmovdqu %xmm1,0(%rsi) + vmovdqu %xmm2,16(%rsi) + vmovdqu %xmm3,32(%rsi) + vmovdqu %xmm4,48(%rsi) + vmovdqu %xmm5,64(%rsi) + vmovdqu %xmm6,80(%rsi) + vmovdqu %xmm7,96(%rsi) + vmovdqu %xmm8,112(%rsi) + + jne .L128_enc_msg_x8_loop1 + + addq $128,%rsi + addq $128,%rdi + +.L128_enc_msg_x8_check_remainder: + cmpq $0,%r10 + je .L128_enc_msg_x8_out + +.L128_enc_msg_x8_loop2: + + + vmovdqa %xmm0,%xmm1 + vpaddd one(%rip),%xmm0,%xmm0 + + vpxor (%rcx),%xmm1,%xmm1 + vaesenc 16(%rcx),%xmm1,%xmm1 + vaesenc 32(%rcx),%xmm1,%xmm1 + vaesenc 48(%rcx),%xmm1,%xmm1 + vaesenc 64(%rcx),%xmm1,%xmm1 + vaesenc 80(%rcx),%xmm1,%xmm1 + vaesenc 96(%rcx),%xmm1,%xmm1 + vaesenc 112(%rcx),%xmm1,%xmm1 + vaesenc 128(%rcx),%xmm1,%xmm1 + vaesenc 144(%rcx),%xmm1,%xmm1 + vaesenclast 160(%rcx),%xmm1,%xmm1 + + + vpxor (%rdi),%xmm1,%xmm1 + + vmovdqu %xmm1,(%rsi) + + addq $16,%rdi + addq $16,%rsi + + decq %r10 + jne .L128_enc_msg_x8_loop2 + +.L128_enc_msg_x8_out: + movq %rbp,%rsp +.cfi_def_cfa_register %rsp + popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbp + popq %r13 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r13 + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + ret +.cfi_endproc +.size aes128gcmsiv_enc_msg_x8,.-aes128gcmsiv_enc_msg_x8 +.globl aes128gcmsiv_dec +.hidden aes128gcmsiv_dec +.type aes128gcmsiv_dec,@function +.align 16 
+aes128gcmsiv_dec: +.cfi_startproc +_CET_ENDBR + testq $~15,%r9 + jnz .L128_dec_start + ret + +.L128_dec_start: + vzeroupper + vmovdqa (%rdx),%xmm0 + + + vmovdqu 16(%rdx),%xmm15 + vpor OR_MASK(%rip),%xmm15,%xmm15 + movq %rdx,%rax + + leaq 32(%rax),%rax + leaq 32(%rcx),%rcx + + andq $~15,%r9 + + + cmpq $96,%r9 + jb .L128_dec_loop2 + + + subq $96,%r9 + vmovdqa %xmm15,%xmm7 + vpaddd one(%rip),%xmm7,%xmm8 + vpaddd two(%rip),%xmm7,%xmm9 + vpaddd one(%rip),%xmm9,%xmm10 + vpaddd two(%rip),%xmm9,%xmm11 + vpaddd one(%rip),%xmm11,%xmm12 + vpaddd two(%rip),%xmm11,%xmm15 + + vpxor (%r8),%xmm7,%xmm7 + vpxor (%r8),%xmm8,%xmm8 + vpxor (%r8),%xmm9,%xmm9 + vpxor (%r8),%xmm10,%xmm10 + vpxor (%r8),%xmm11,%xmm11 + vpxor (%r8),%xmm12,%xmm12 + + vmovdqu 16(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 32(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 48(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 64(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 80(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 96(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 112(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc 
%xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 128(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 144(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 160(%r8),%xmm4 + vaesenclast %xmm4,%xmm7,%xmm7 + vaesenclast %xmm4,%xmm8,%xmm8 + vaesenclast %xmm4,%xmm9,%xmm9 + vaesenclast %xmm4,%xmm10,%xmm10 + vaesenclast %xmm4,%xmm11,%xmm11 + vaesenclast %xmm4,%xmm12,%xmm12 + + + vpxor 0(%rdi),%xmm7,%xmm7 + vpxor 16(%rdi),%xmm8,%xmm8 + vpxor 32(%rdi),%xmm9,%xmm9 + vpxor 48(%rdi),%xmm10,%xmm10 + vpxor 64(%rdi),%xmm11,%xmm11 + vpxor 80(%rdi),%xmm12,%xmm12 + + vmovdqu %xmm7,0(%rsi) + vmovdqu %xmm8,16(%rsi) + vmovdqu %xmm9,32(%rsi) + vmovdqu %xmm10,48(%rsi) + vmovdqu %xmm11,64(%rsi) + vmovdqu %xmm12,80(%rsi) + + addq $96,%rdi + addq $96,%rsi + jmp .L128_dec_loop1 + + +.align 64 +.L128_dec_loop1: + cmpq $96,%r9 + jb .L128_dec_finish_96 + subq $96,%r9 + + vmovdqa %xmm12,%xmm6 + vmovdqa %xmm11,16-32(%rax) + vmovdqa %xmm10,32-32(%rax) + vmovdqa %xmm9,48-32(%rax) + vmovdqa %xmm8,64-32(%rax) + vmovdqa %xmm7,80-32(%rax) + + vmovdqa %xmm15,%xmm7 + vpaddd one(%rip),%xmm7,%xmm8 + vpaddd two(%rip),%xmm7,%xmm9 + vpaddd one(%rip),%xmm9,%xmm10 + vpaddd two(%rip),%xmm9,%xmm11 + vpaddd one(%rip),%xmm11,%xmm12 + vpaddd two(%rip),%xmm11,%xmm15 + + vmovdqa (%r8),%xmm4 + vpxor %xmm4,%xmm7,%xmm7 + vpxor %xmm4,%xmm8,%xmm8 + vpxor %xmm4,%xmm9,%xmm9 + vpxor %xmm4,%xmm10,%xmm10 + vpxor %xmm4,%xmm11,%xmm11 + vpxor %xmm4,%xmm12,%xmm12 + + vmovdqu 0-32(%rcx),%xmm4 + vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2 + vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3 + vpclmulqdq $0x01,%xmm4,%xmm6,%xmm1 + vpclmulqdq $0x10,%xmm4,%xmm6,%xmm4 + vpxor 
%xmm4,%xmm1,%xmm1 + + vmovdqu 16(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu -16(%rax),%xmm6 + vmovdqu -16(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + + vmovdqu 32(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 0(%rax),%xmm6 + vmovdqu 0(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + + vmovdqu 48(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 16(%rax),%xmm6 + vmovdqu 16(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + + vmovdqu 64(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 32(%rax),%xmm6 + vmovdqu 32(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor 
%xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + + vmovdqu 80(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 96(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 112(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + + vmovdqa 80-32(%rax),%xmm6 + vpxor %xmm0,%xmm6,%xmm6 + vmovdqu 80-32(%rcx),%xmm5 + + vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu 128(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + + vpsrldq $8,%xmm1,%xmm4 + vpxor %xmm4,%xmm2,%xmm5 + vpslldq $8,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm0 + + vmovdqa poly(%rip),%xmm3 + + vmovdqu 144(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 160(%r8),%xmm6 + vpalignr $8,%xmm0,%xmm0,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 + vpxor %xmm0,%xmm2,%xmm0 + + vpxor 0(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm7,%xmm7 + vpxor 16(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm8,%xmm8 + vpxor 32(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm9,%xmm9 + vpxor 48(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm10,%xmm10 + vpxor 64(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm11,%xmm11 + 
vpxor 80(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm12,%xmm12 + + vpalignr $8,%xmm0,%xmm0,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 + vpxor %xmm0,%xmm2,%xmm0 + + vmovdqu %xmm7,0(%rsi) + vmovdqu %xmm8,16(%rsi) + vmovdqu %xmm9,32(%rsi) + vmovdqu %xmm10,48(%rsi) + vmovdqu %xmm11,64(%rsi) + vmovdqu %xmm12,80(%rsi) + + vpxor %xmm5,%xmm0,%xmm0 + + leaq 96(%rdi),%rdi + leaq 96(%rsi),%rsi + jmp .L128_dec_loop1 + +.L128_dec_finish_96: + vmovdqa %xmm12,%xmm6 + vmovdqa %xmm11,16-32(%rax) + vmovdqa %xmm10,32-32(%rax) + vmovdqa %xmm9,48-32(%rax) + vmovdqa %xmm8,64-32(%rax) + vmovdqa %xmm7,80-32(%rax) + + vmovdqu 0-32(%rcx),%xmm4 + vpclmulqdq $0x10,%xmm4,%xmm6,%xmm1 + vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2 + vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3 + vpclmulqdq $0x01,%xmm4,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu -16(%rax),%xmm6 + vmovdqu -16(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu 0(%rax),%xmm6 + vmovdqu 0(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu 16(%rax),%xmm6 + vmovdqu 16(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu 32(%rax),%xmm6 + vmovdqu 32(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq 
$0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + + vmovdqu 80-32(%rax),%xmm6 + vpxor %xmm0,%xmm6,%xmm6 + vmovdqu 80-32(%rcx),%xmm5 + vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vpsrldq $8,%xmm1,%xmm4 + vpxor %xmm4,%xmm2,%xmm5 + vpslldq $8,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm0 + + vmovdqa poly(%rip),%xmm3 + + vpalignr $8,%xmm0,%xmm0,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 + vpxor %xmm0,%xmm2,%xmm0 + + vpalignr $8,%xmm0,%xmm0,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 + vpxor %xmm0,%xmm2,%xmm0 + + vpxor %xmm5,%xmm0,%xmm0 + +.L128_dec_loop2: + + + + cmpq $16,%r9 + jb .L128_dec_out + subq $16,%r9 + + vmovdqa %xmm15,%xmm2 + vpaddd one(%rip),%xmm15,%xmm15 + + vpxor 0(%r8),%xmm2,%xmm2 + vaesenc 16(%r8),%xmm2,%xmm2 + vaesenc 32(%r8),%xmm2,%xmm2 + vaesenc 48(%r8),%xmm2,%xmm2 + vaesenc 64(%r8),%xmm2,%xmm2 + vaesenc 80(%r8),%xmm2,%xmm2 + vaesenc 96(%r8),%xmm2,%xmm2 + vaesenc 112(%r8),%xmm2,%xmm2 + vaesenc 128(%r8),%xmm2,%xmm2 + vaesenc 144(%r8),%xmm2,%xmm2 + vaesenclast 160(%r8),%xmm2,%xmm2 + vpxor (%rdi),%xmm2,%xmm2 + vmovdqu %xmm2,(%rsi) + addq $16,%rdi + addq $16,%rsi + + vpxor %xmm2,%xmm0,%xmm0 + vmovdqa -32(%rcx),%xmm1 + call GFMUL + + jmp .L128_dec_loop2 + +.L128_dec_out: + vmovdqu %xmm0,(%rdx) + ret +.cfi_endproc +.size aes128gcmsiv_dec, .-aes128gcmsiv_dec +.globl aes128gcmsiv_ecb_enc_block +.hidden aes128gcmsiv_ecb_enc_block +.type aes128gcmsiv_ecb_enc_block,@function +.align 16 +aes128gcmsiv_ecb_enc_block: +.cfi_startproc +_CET_ENDBR + vmovdqa (%rdi),%xmm1 + + vpxor (%rdx),%xmm1,%xmm1 + vaesenc 16(%rdx),%xmm1,%xmm1 + vaesenc 32(%rdx),%xmm1,%xmm1 + vaesenc 48(%rdx),%xmm1,%xmm1 + vaesenc 64(%rdx),%xmm1,%xmm1 + vaesenc 80(%rdx),%xmm1,%xmm1 + vaesenc 96(%rdx),%xmm1,%xmm1 + vaesenc 112(%rdx),%xmm1,%xmm1 + vaesenc 128(%rdx),%xmm1,%xmm1 + vaesenc 
144(%rdx),%xmm1,%xmm1 + vaesenclast 160(%rdx),%xmm1,%xmm1 + + vmovdqa %xmm1,(%rsi) + + ret +.cfi_endproc +.size aes128gcmsiv_ecb_enc_block,.-aes128gcmsiv_ecb_enc_block +.globl aes256gcmsiv_aes_ks_enc_x1 +.hidden aes256gcmsiv_aes_ks_enc_x1 +.type aes256gcmsiv_aes_ks_enc_x1,@function +.align 16 +aes256gcmsiv_aes_ks_enc_x1: +.cfi_startproc +_CET_ENDBR + vmovdqa con1(%rip),%xmm0 + vmovdqa mask(%rip),%xmm15 + vmovdqa (%rdi),%xmm8 + vmovdqa (%rcx),%xmm1 + vmovdqa 16(%rcx),%xmm3 + vpxor %xmm1,%xmm8,%xmm8 + vaesenc %xmm3,%xmm8,%xmm8 + vmovdqu %xmm1,(%rdx) + vmovdqu %xmm3,16(%rdx) + vpxor %xmm14,%xmm14,%xmm14 + + vpshufb %xmm15,%xmm3,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpslldq $4,%xmm1,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vaesenc %xmm1,%xmm8,%xmm8 + vmovdqu %xmm1,32(%rdx) + + vpshufd $0xff,%xmm1,%xmm2 + vaesenclast %xmm14,%xmm2,%xmm2 + vpslldq $4,%xmm3,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpxor %xmm2,%xmm3,%xmm3 + vaesenc %xmm3,%xmm8,%xmm8 + vmovdqu %xmm3,48(%rdx) + + vpshufb %xmm15,%xmm3,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpslldq $4,%xmm1,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vaesenc %xmm1,%xmm8,%xmm8 + vmovdqu %xmm1,64(%rdx) + + vpshufd $0xff,%xmm1,%xmm2 + vaesenclast %xmm14,%xmm2,%xmm2 + vpslldq $4,%xmm3,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpxor %xmm2,%xmm3,%xmm3 + vaesenc %xmm3,%xmm8,%xmm8 + vmovdqu %xmm3,80(%rdx) + + vpshufb %xmm15,%xmm3,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpslldq $4,%xmm1,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 
+ vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vaesenc %xmm1,%xmm8,%xmm8 + vmovdqu %xmm1,96(%rdx) + + vpshufd $0xff,%xmm1,%xmm2 + vaesenclast %xmm14,%xmm2,%xmm2 + vpslldq $4,%xmm3,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpxor %xmm2,%xmm3,%xmm3 + vaesenc %xmm3,%xmm8,%xmm8 + vmovdqu %xmm3,112(%rdx) + + vpshufb %xmm15,%xmm3,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpslldq $4,%xmm1,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vaesenc %xmm1,%xmm8,%xmm8 + vmovdqu %xmm1,128(%rdx) + + vpshufd $0xff,%xmm1,%xmm2 + vaesenclast %xmm14,%xmm2,%xmm2 + vpslldq $4,%xmm3,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpxor %xmm2,%xmm3,%xmm3 + vaesenc %xmm3,%xmm8,%xmm8 + vmovdqu %xmm3,144(%rdx) + + vpshufb %xmm15,%xmm3,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpslldq $4,%xmm1,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vaesenc %xmm1,%xmm8,%xmm8 + vmovdqu %xmm1,160(%rdx) + + vpshufd $0xff,%xmm1,%xmm2 + vaesenclast %xmm14,%xmm2,%xmm2 + vpslldq $4,%xmm3,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpxor %xmm2,%xmm3,%xmm3 + vaesenc %xmm3,%xmm8,%xmm8 + vmovdqu %xmm3,176(%rdx) + + vpshufb %xmm15,%xmm3,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslld $1,%xmm0,%xmm0 + vpslldq $4,%xmm1,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vaesenc %xmm1,%xmm8,%xmm8 + vmovdqu 
%xmm1,192(%rdx) + + vpshufd $0xff,%xmm1,%xmm2 + vaesenclast %xmm14,%xmm2,%xmm2 + vpslldq $4,%xmm3,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpxor %xmm2,%xmm3,%xmm3 + vaesenc %xmm3,%xmm8,%xmm8 + vmovdqu %xmm3,208(%rdx) + + vpshufb %xmm15,%xmm3,%xmm2 + vaesenclast %xmm0,%xmm2,%xmm2 + vpslldq $4,%xmm1,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpslldq $4,%xmm4,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpxor %xmm2,%xmm1,%xmm1 + vaesenclast %xmm1,%xmm8,%xmm8 + vmovdqu %xmm1,224(%rdx) + + vmovdqa %xmm8,(%rsi) + ret +.cfi_endproc +.size aes256gcmsiv_aes_ks_enc_x1,.-aes256gcmsiv_aes_ks_enc_x1 +.globl aes256gcmsiv_ecb_enc_block +.hidden aes256gcmsiv_ecb_enc_block +.type aes256gcmsiv_ecb_enc_block,@function +.align 16 +aes256gcmsiv_ecb_enc_block: +.cfi_startproc +_CET_ENDBR + vmovdqa (%rdi),%xmm1 + vpxor (%rdx),%xmm1,%xmm1 + vaesenc 16(%rdx),%xmm1,%xmm1 + vaesenc 32(%rdx),%xmm1,%xmm1 + vaesenc 48(%rdx),%xmm1,%xmm1 + vaesenc 64(%rdx),%xmm1,%xmm1 + vaesenc 80(%rdx),%xmm1,%xmm1 + vaesenc 96(%rdx),%xmm1,%xmm1 + vaesenc 112(%rdx),%xmm1,%xmm1 + vaesenc 128(%rdx),%xmm1,%xmm1 + vaesenc 144(%rdx),%xmm1,%xmm1 + vaesenc 160(%rdx),%xmm1,%xmm1 + vaesenc 176(%rdx),%xmm1,%xmm1 + vaesenc 192(%rdx),%xmm1,%xmm1 + vaesenc 208(%rdx),%xmm1,%xmm1 + vaesenclast 224(%rdx),%xmm1,%xmm1 + vmovdqa %xmm1,(%rsi) + ret +.cfi_endproc +.size aes256gcmsiv_ecb_enc_block,.-aes256gcmsiv_ecb_enc_block +.globl aes256gcmsiv_enc_msg_x4 +.hidden aes256gcmsiv_enc_msg_x4 +.type aes256gcmsiv_enc_msg_x4,@function +.align 16 +aes256gcmsiv_enc_msg_x4: +.cfi_startproc +_CET_ENDBR + testq %r8,%r8 + jnz .L256_enc_msg_x4_start + ret + +.L256_enc_msg_x4_start: + movq %r8,%r10 + shrq $4,%r8 + shlq $60,%r10 + jz .L256_enc_msg_x4_start2 + addq $1,%r8 + +.L256_enc_msg_x4_start2: + movq %r8,%r10 + shlq $62,%r10 + shrq $62,%r10 + + + vmovdqa (%rdx),%xmm15 + vpor OR_MASK(%rip),%xmm15,%xmm15 + + vmovdqa 
four(%rip),%xmm4 + vmovdqa %xmm15,%xmm0 + vpaddd one(%rip),%xmm15,%xmm1 + vpaddd two(%rip),%xmm15,%xmm2 + vpaddd three(%rip),%xmm15,%xmm3 + + shrq $2,%r8 + je .L256_enc_msg_x4_check_remainder + + subq $64,%rsi + subq $64,%rdi + +.L256_enc_msg_x4_loop1: + addq $64,%rsi + addq $64,%rdi + + vmovdqa %xmm0,%xmm5 + vmovdqa %xmm1,%xmm6 + vmovdqa %xmm2,%xmm7 + vmovdqa %xmm3,%xmm8 + + vpxor (%rcx),%xmm5,%xmm5 + vpxor (%rcx),%xmm6,%xmm6 + vpxor (%rcx),%xmm7,%xmm7 + vpxor (%rcx),%xmm8,%xmm8 + + vmovdqu 16(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vpaddd %xmm4,%xmm0,%xmm0 + vmovdqu 32(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vpaddd %xmm4,%xmm1,%xmm1 + vmovdqu 48(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vpaddd %xmm4,%xmm2,%xmm2 + vmovdqu 64(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vpaddd %xmm4,%xmm3,%xmm3 + + vmovdqu 80(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 96(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 112(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 128(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 144(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 160(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc 
%xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 176(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 192(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 208(%rcx),%xmm12 + vaesenc %xmm12,%xmm5,%xmm5 + vaesenc %xmm12,%xmm6,%xmm6 + vaesenc %xmm12,%xmm7,%xmm7 + vaesenc %xmm12,%xmm8,%xmm8 + + vmovdqu 224(%rcx),%xmm12 + vaesenclast %xmm12,%xmm5,%xmm5 + vaesenclast %xmm12,%xmm6,%xmm6 + vaesenclast %xmm12,%xmm7,%xmm7 + vaesenclast %xmm12,%xmm8,%xmm8 + + + + vpxor 0(%rdi),%xmm5,%xmm5 + vpxor 16(%rdi),%xmm6,%xmm6 + vpxor 32(%rdi),%xmm7,%xmm7 + vpxor 48(%rdi),%xmm8,%xmm8 + + subq $1,%r8 + + vmovdqu %xmm5,0(%rsi) + vmovdqu %xmm6,16(%rsi) + vmovdqu %xmm7,32(%rsi) + vmovdqu %xmm8,48(%rsi) + + jne .L256_enc_msg_x4_loop1 + + addq $64,%rsi + addq $64,%rdi + +.L256_enc_msg_x4_check_remainder: + cmpq $0,%r10 + je .L256_enc_msg_x4_out + +.L256_enc_msg_x4_loop2: + + + + vmovdqa %xmm0,%xmm5 + vpaddd one(%rip),%xmm0,%xmm0 + vpxor (%rcx),%xmm5,%xmm5 + vaesenc 16(%rcx),%xmm5,%xmm5 + vaesenc 32(%rcx),%xmm5,%xmm5 + vaesenc 48(%rcx),%xmm5,%xmm5 + vaesenc 64(%rcx),%xmm5,%xmm5 + vaesenc 80(%rcx),%xmm5,%xmm5 + vaesenc 96(%rcx),%xmm5,%xmm5 + vaesenc 112(%rcx),%xmm5,%xmm5 + vaesenc 128(%rcx),%xmm5,%xmm5 + vaesenc 144(%rcx),%xmm5,%xmm5 + vaesenc 160(%rcx),%xmm5,%xmm5 + vaesenc 176(%rcx),%xmm5,%xmm5 + vaesenc 192(%rcx),%xmm5,%xmm5 + vaesenc 208(%rcx),%xmm5,%xmm5 + vaesenclast 224(%rcx),%xmm5,%xmm5 + + + vpxor (%rdi),%xmm5,%xmm5 + + vmovdqu %xmm5,(%rsi) + + addq $16,%rdi + addq $16,%rsi + + subq $1,%r10 + jne .L256_enc_msg_x4_loop2 + +.L256_enc_msg_x4_out: + ret +.cfi_endproc +.size aes256gcmsiv_enc_msg_x4,.-aes256gcmsiv_enc_msg_x4 +.globl aes256gcmsiv_enc_msg_x8 +.hidden aes256gcmsiv_enc_msg_x8 +.type aes256gcmsiv_enc_msg_x8,@function +.align 16 +aes256gcmsiv_enc_msg_x8: +.cfi_startproc +_CET_ENDBR + 
testq %r8,%r8 + jnz .L256_enc_msg_x8_start + ret + +.L256_enc_msg_x8_start: + + movq %rsp,%r11 + subq $16,%r11 + andq $-64,%r11 + + movq %r8,%r10 + shrq $4,%r8 + shlq $60,%r10 + jz .L256_enc_msg_x8_start2 + addq $1,%r8 + +.L256_enc_msg_x8_start2: + movq %r8,%r10 + shlq $61,%r10 + shrq $61,%r10 + + + vmovdqa (%rdx),%xmm1 + vpor OR_MASK(%rip),%xmm1,%xmm1 + + + vpaddd seven(%rip),%xmm1,%xmm0 + vmovdqa %xmm0,(%r11) + vpaddd one(%rip),%xmm1,%xmm9 + vpaddd two(%rip),%xmm1,%xmm10 + vpaddd three(%rip),%xmm1,%xmm11 + vpaddd four(%rip),%xmm1,%xmm12 + vpaddd five(%rip),%xmm1,%xmm13 + vpaddd six(%rip),%xmm1,%xmm14 + vmovdqa %xmm1,%xmm0 + + shrq $3,%r8 + jz .L256_enc_msg_x8_check_remainder + + subq $128,%rsi + subq $128,%rdi + +.L256_enc_msg_x8_loop1: + addq $128,%rsi + addq $128,%rdi + + vmovdqa %xmm0,%xmm1 + vmovdqa %xmm9,%xmm2 + vmovdqa %xmm10,%xmm3 + vmovdqa %xmm11,%xmm4 + vmovdqa %xmm12,%xmm5 + vmovdqa %xmm13,%xmm6 + vmovdqa %xmm14,%xmm7 + + vmovdqa (%r11),%xmm8 + + vpxor (%rcx),%xmm1,%xmm1 + vpxor (%rcx),%xmm2,%xmm2 + vpxor (%rcx),%xmm3,%xmm3 + vpxor (%rcx),%xmm4,%xmm4 + vpxor (%rcx),%xmm5,%xmm5 + vpxor (%rcx),%xmm6,%xmm6 + vpxor (%rcx),%xmm7,%xmm7 + vpxor (%rcx),%xmm8,%xmm8 + + vmovdqu 16(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vmovdqa (%r11),%xmm14 + vpaddd eight(%rip),%xmm14,%xmm14 + vmovdqa %xmm14,(%r11) + vmovdqu 32(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpsubd one(%rip),%xmm14,%xmm14 + vmovdqu 48(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + 
vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm0,%xmm0 + vmovdqu 64(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm9,%xmm9 + vmovdqu 80(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm10,%xmm10 + vmovdqu 96(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm11,%xmm11 + vmovdqu 112(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm12,%xmm12 + vmovdqu 128(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vpaddd eight(%rip),%xmm13,%xmm13 + vmovdqu 144(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vmovdqu 160(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 
+ vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vmovdqu 176(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vmovdqu 192(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vmovdqu 208(%rcx),%xmm15 + vaesenc %xmm15,%xmm1,%xmm1 + vaesenc %xmm15,%xmm2,%xmm2 + vaesenc %xmm15,%xmm3,%xmm3 + vaesenc %xmm15,%xmm4,%xmm4 + vaesenc %xmm15,%xmm5,%xmm5 + vaesenc %xmm15,%xmm6,%xmm6 + vaesenc %xmm15,%xmm7,%xmm7 + vaesenc %xmm15,%xmm8,%xmm8 + + vmovdqu 224(%rcx),%xmm15 + vaesenclast %xmm15,%xmm1,%xmm1 + vaesenclast %xmm15,%xmm2,%xmm2 + vaesenclast %xmm15,%xmm3,%xmm3 + vaesenclast %xmm15,%xmm4,%xmm4 + vaesenclast %xmm15,%xmm5,%xmm5 + vaesenclast %xmm15,%xmm6,%xmm6 + vaesenclast %xmm15,%xmm7,%xmm7 + vaesenclast %xmm15,%xmm8,%xmm8 + + + + vpxor 0(%rdi),%xmm1,%xmm1 + vpxor 16(%rdi),%xmm2,%xmm2 + vpxor 32(%rdi),%xmm3,%xmm3 + vpxor 48(%rdi),%xmm4,%xmm4 + vpxor 64(%rdi),%xmm5,%xmm5 + vpxor 80(%rdi),%xmm6,%xmm6 + vpxor 96(%rdi),%xmm7,%xmm7 + vpxor 112(%rdi),%xmm8,%xmm8 + + subq $1,%r8 + + vmovdqu %xmm1,0(%rsi) + vmovdqu %xmm2,16(%rsi) + vmovdqu %xmm3,32(%rsi) + vmovdqu %xmm4,48(%rsi) + vmovdqu %xmm5,64(%rsi) + vmovdqu %xmm6,80(%rsi) + vmovdqu %xmm7,96(%rsi) + vmovdqu %xmm8,112(%rsi) + + jne .L256_enc_msg_x8_loop1 + + addq $128,%rsi + addq $128,%rdi + +.L256_enc_msg_x8_check_remainder: + cmpq $0,%r10 + je .L256_enc_msg_x8_out + +.L256_enc_msg_x8_loop2: + + + vmovdqa %xmm0,%xmm1 + vpaddd one(%rip),%xmm0,%xmm0 + + vpxor (%rcx),%xmm1,%xmm1 + vaesenc 16(%rcx),%xmm1,%xmm1 + vaesenc 32(%rcx),%xmm1,%xmm1 + vaesenc 48(%rcx),%xmm1,%xmm1 
+ vaesenc 64(%rcx),%xmm1,%xmm1 + vaesenc 80(%rcx),%xmm1,%xmm1 + vaesenc 96(%rcx),%xmm1,%xmm1 + vaesenc 112(%rcx),%xmm1,%xmm1 + vaesenc 128(%rcx),%xmm1,%xmm1 + vaesenc 144(%rcx),%xmm1,%xmm1 + vaesenc 160(%rcx),%xmm1,%xmm1 + vaesenc 176(%rcx),%xmm1,%xmm1 + vaesenc 192(%rcx),%xmm1,%xmm1 + vaesenc 208(%rcx),%xmm1,%xmm1 + vaesenclast 224(%rcx),%xmm1,%xmm1 + + + vpxor (%rdi),%xmm1,%xmm1 + + vmovdqu %xmm1,(%rsi) + + addq $16,%rdi + addq $16,%rsi + subq $1,%r10 + jnz .L256_enc_msg_x8_loop2 + +.L256_enc_msg_x8_out: + ret + +.cfi_endproc +.size aes256gcmsiv_enc_msg_x8,.-aes256gcmsiv_enc_msg_x8 +.globl aes256gcmsiv_dec +.hidden aes256gcmsiv_dec +.type aes256gcmsiv_dec,@function +.align 16 +aes256gcmsiv_dec: +.cfi_startproc +_CET_ENDBR + testq $~15,%r9 + jnz .L256_dec_start + ret + +.L256_dec_start: + vzeroupper + vmovdqa (%rdx),%xmm0 + + + vmovdqu 16(%rdx),%xmm15 + vpor OR_MASK(%rip),%xmm15,%xmm15 + movq %rdx,%rax + + leaq 32(%rax),%rax + leaq 32(%rcx),%rcx + + andq $~15,%r9 + + + cmpq $96,%r9 + jb .L256_dec_loop2 + + + subq $96,%r9 + vmovdqa %xmm15,%xmm7 + vpaddd one(%rip),%xmm7,%xmm8 + vpaddd two(%rip),%xmm7,%xmm9 + vpaddd one(%rip),%xmm9,%xmm10 + vpaddd two(%rip),%xmm9,%xmm11 + vpaddd one(%rip),%xmm11,%xmm12 + vpaddd two(%rip),%xmm11,%xmm15 + + vpxor (%r8),%xmm7,%xmm7 + vpxor (%r8),%xmm8,%xmm8 + vpxor (%r8),%xmm9,%xmm9 + vpxor (%r8),%xmm10,%xmm10 + vpxor (%r8),%xmm11,%xmm11 + vpxor (%r8),%xmm12,%xmm12 + + vmovdqu 16(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 32(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 48(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc 
%xmm4,%xmm12,%xmm12 + + vmovdqu 64(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 80(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 96(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 112(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 128(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 144(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 160(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 176(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 192(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 208(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + 
vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 224(%r8),%xmm4 + vaesenclast %xmm4,%xmm7,%xmm7 + vaesenclast %xmm4,%xmm8,%xmm8 + vaesenclast %xmm4,%xmm9,%xmm9 + vaesenclast %xmm4,%xmm10,%xmm10 + vaesenclast %xmm4,%xmm11,%xmm11 + vaesenclast %xmm4,%xmm12,%xmm12 + + + vpxor 0(%rdi),%xmm7,%xmm7 + vpxor 16(%rdi),%xmm8,%xmm8 + vpxor 32(%rdi),%xmm9,%xmm9 + vpxor 48(%rdi),%xmm10,%xmm10 + vpxor 64(%rdi),%xmm11,%xmm11 + vpxor 80(%rdi),%xmm12,%xmm12 + + vmovdqu %xmm7,0(%rsi) + vmovdqu %xmm8,16(%rsi) + vmovdqu %xmm9,32(%rsi) + vmovdqu %xmm10,48(%rsi) + vmovdqu %xmm11,64(%rsi) + vmovdqu %xmm12,80(%rsi) + + addq $96,%rdi + addq $96,%rsi + jmp .L256_dec_loop1 + + +.align 64 +.L256_dec_loop1: + cmpq $96,%r9 + jb .L256_dec_finish_96 + subq $96,%r9 + + vmovdqa %xmm12,%xmm6 + vmovdqa %xmm11,16-32(%rax) + vmovdqa %xmm10,32-32(%rax) + vmovdqa %xmm9,48-32(%rax) + vmovdqa %xmm8,64-32(%rax) + vmovdqa %xmm7,80-32(%rax) + + vmovdqa %xmm15,%xmm7 + vpaddd one(%rip),%xmm7,%xmm8 + vpaddd two(%rip),%xmm7,%xmm9 + vpaddd one(%rip),%xmm9,%xmm10 + vpaddd two(%rip),%xmm9,%xmm11 + vpaddd one(%rip),%xmm11,%xmm12 + vpaddd two(%rip),%xmm11,%xmm15 + + vmovdqa (%r8),%xmm4 + vpxor %xmm4,%xmm7,%xmm7 + vpxor %xmm4,%xmm8,%xmm8 + vpxor %xmm4,%xmm9,%xmm9 + vpxor %xmm4,%xmm10,%xmm10 + vpxor %xmm4,%xmm11,%xmm11 + vpxor %xmm4,%xmm12,%xmm12 + + vmovdqu 0-32(%rcx),%xmm4 + vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2 + vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3 + vpclmulqdq $0x01,%xmm4,%xmm6,%xmm1 + vpclmulqdq $0x10,%xmm4,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu 16(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu -16(%rax),%xmm6 + vmovdqu -16(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + 
vpxor %xmm4,%xmm1,%xmm1 + + + vmovdqu 32(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 0(%rax),%xmm6 + vmovdqu 0(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + + vmovdqu 48(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 16(%rax),%xmm6 + vmovdqu 16(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + + vmovdqu 64(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 32(%rax),%xmm6 + vmovdqu 32(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + + vmovdqu 80(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 96(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 112(%r8),%xmm4 + vaesenc 
%xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + + vmovdqa 80-32(%rax),%xmm6 + vpxor %xmm0,%xmm6,%xmm6 + vmovdqu 80-32(%rcx),%xmm5 + + vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu 128(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + + vpsrldq $8,%xmm1,%xmm4 + vpxor %xmm4,%xmm2,%xmm5 + vpslldq $8,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm0 + + vmovdqa poly(%rip),%xmm3 + + vmovdqu 144(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 160(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 176(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 192(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 208(%r8),%xmm4 + vaesenc %xmm4,%xmm7,%xmm7 + vaesenc %xmm4,%xmm8,%xmm8 + vaesenc %xmm4,%xmm9,%xmm9 + vaesenc %xmm4,%xmm10,%xmm10 + vaesenc %xmm4,%xmm11,%xmm11 + vaesenc %xmm4,%xmm12,%xmm12 + + vmovdqu 224(%r8),%xmm6 + vpalignr $8,%xmm0,%xmm0,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 + vpxor %xmm0,%xmm2,%xmm0 + + vpxor 0(%rdi),%xmm6,%xmm4 + 
vaesenclast %xmm4,%xmm7,%xmm7 + vpxor 16(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm8,%xmm8 + vpxor 32(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm9,%xmm9 + vpxor 48(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm10,%xmm10 + vpxor 64(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm11,%xmm11 + vpxor 80(%rdi),%xmm6,%xmm4 + vaesenclast %xmm4,%xmm12,%xmm12 + + vpalignr $8,%xmm0,%xmm0,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 + vpxor %xmm0,%xmm2,%xmm0 + + vmovdqu %xmm7,0(%rsi) + vmovdqu %xmm8,16(%rsi) + vmovdqu %xmm9,32(%rsi) + vmovdqu %xmm10,48(%rsi) + vmovdqu %xmm11,64(%rsi) + vmovdqu %xmm12,80(%rsi) + + vpxor %xmm5,%xmm0,%xmm0 + + leaq 96(%rdi),%rdi + leaq 96(%rsi),%rsi + jmp .L256_dec_loop1 + +.L256_dec_finish_96: + vmovdqa %xmm12,%xmm6 + vmovdqa %xmm11,16-32(%rax) + vmovdqa %xmm10,32-32(%rax) + vmovdqa %xmm9,48-32(%rax) + vmovdqa %xmm8,64-32(%rax) + vmovdqa %xmm7,80-32(%rax) + + vmovdqu 0-32(%rcx),%xmm4 + vpclmulqdq $0x10,%xmm4,%xmm6,%xmm1 + vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2 + vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3 + vpclmulqdq $0x01,%xmm4,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu -16(%rax),%xmm6 + vmovdqu -16(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu 0(%rax),%xmm6 + vmovdqu 0(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vmovdqu 16(%rax),%xmm6 + vmovdqu 16(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor 
%xmm4,%xmm1,%xmm1 + + vmovdqu 32(%rax),%xmm6 + vmovdqu 32(%rcx),%xmm13 + + vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + + vmovdqu 80-32(%rax),%xmm6 + vpxor %xmm0,%xmm6,%xmm6 + vmovdqu 80-32(%rcx),%xmm5 + vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm2,%xmm2 + vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm3,%xmm3 + vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4 + vpxor %xmm4,%xmm1,%xmm1 + + vpsrldq $8,%xmm1,%xmm4 + vpxor %xmm4,%xmm2,%xmm5 + vpslldq $8,%xmm1,%xmm4 + vpxor %xmm4,%xmm3,%xmm0 + + vmovdqa poly(%rip),%xmm3 + + vpalignr $8,%xmm0,%xmm0,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 + vpxor %xmm0,%xmm2,%xmm0 + + vpalignr $8,%xmm0,%xmm0,%xmm2 + vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 + vpxor %xmm0,%xmm2,%xmm0 + + vpxor %xmm5,%xmm0,%xmm0 + +.L256_dec_loop2: + + + + cmpq $16,%r9 + jb .L256_dec_out + subq $16,%r9 + + vmovdqa %xmm15,%xmm2 + vpaddd one(%rip),%xmm15,%xmm15 + + vpxor 0(%r8),%xmm2,%xmm2 + vaesenc 16(%r8),%xmm2,%xmm2 + vaesenc 32(%r8),%xmm2,%xmm2 + vaesenc 48(%r8),%xmm2,%xmm2 + vaesenc 64(%r8),%xmm2,%xmm2 + vaesenc 80(%r8),%xmm2,%xmm2 + vaesenc 96(%r8),%xmm2,%xmm2 + vaesenc 112(%r8),%xmm2,%xmm2 + vaesenc 128(%r8),%xmm2,%xmm2 + vaesenc 144(%r8),%xmm2,%xmm2 + vaesenc 160(%r8),%xmm2,%xmm2 + vaesenc 176(%r8),%xmm2,%xmm2 + vaesenc 192(%r8),%xmm2,%xmm2 + vaesenc 208(%r8),%xmm2,%xmm2 + vaesenclast 224(%r8),%xmm2,%xmm2 + vpxor (%rdi),%xmm2,%xmm2 + vmovdqu %xmm2,(%rsi) + addq $16,%rdi + addq $16,%rsi + + vpxor %xmm2,%xmm0,%xmm0 + vmovdqa -32(%rcx),%xmm1 + call GFMUL + + jmp .L256_dec_loop2 + +.L256_dec_out: + vmovdqu %xmm0,(%rdx) + ret +.cfi_endproc +.size aes256gcmsiv_dec, .-aes256gcmsiv_dec +.globl aes256gcmsiv_kdf +.hidden aes256gcmsiv_kdf +.type aes256gcmsiv_kdf,@function +.align 16 
+aes256gcmsiv_kdf: +.cfi_startproc +_CET_ENDBR + + + + + vmovdqa (%rdx),%xmm1 + vmovdqa 0(%rdi),%xmm4 + vmovdqa and_mask(%rip),%xmm11 + vmovdqa one(%rip),%xmm8 + vpshufd $0x90,%xmm4,%xmm4 + vpand %xmm11,%xmm4,%xmm4 + vpaddd %xmm8,%xmm4,%xmm6 + vpaddd %xmm8,%xmm6,%xmm7 + vpaddd %xmm8,%xmm7,%xmm11 + vpaddd %xmm8,%xmm11,%xmm12 + vpaddd %xmm8,%xmm12,%xmm13 + + vpxor %xmm1,%xmm4,%xmm4 + vpxor %xmm1,%xmm6,%xmm6 + vpxor %xmm1,%xmm7,%xmm7 + vpxor %xmm1,%xmm11,%xmm11 + vpxor %xmm1,%xmm12,%xmm12 + vpxor %xmm1,%xmm13,%xmm13 + + vmovdqa 16(%rdx),%xmm1 + vaesenc %xmm1,%xmm4,%xmm4 + vaesenc %xmm1,%xmm6,%xmm6 + vaesenc %xmm1,%xmm7,%xmm7 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + + vmovdqa 32(%rdx),%xmm2 + vaesenc %xmm2,%xmm4,%xmm4 + vaesenc %xmm2,%xmm6,%xmm6 + vaesenc %xmm2,%xmm7,%xmm7 + vaesenc %xmm2,%xmm11,%xmm11 + vaesenc %xmm2,%xmm12,%xmm12 + vaesenc %xmm2,%xmm13,%xmm13 + + vmovdqa 48(%rdx),%xmm1 + vaesenc %xmm1,%xmm4,%xmm4 + vaesenc %xmm1,%xmm6,%xmm6 + vaesenc %xmm1,%xmm7,%xmm7 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + + vmovdqa 64(%rdx),%xmm2 + vaesenc %xmm2,%xmm4,%xmm4 + vaesenc %xmm2,%xmm6,%xmm6 + vaesenc %xmm2,%xmm7,%xmm7 + vaesenc %xmm2,%xmm11,%xmm11 + vaesenc %xmm2,%xmm12,%xmm12 + vaesenc %xmm2,%xmm13,%xmm13 + + vmovdqa 80(%rdx),%xmm1 + vaesenc %xmm1,%xmm4,%xmm4 + vaesenc %xmm1,%xmm6,%xmm6 + vaesenc %xmm1,%xmm7,%xmm7 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + + vmovdqa 96(%rdx),%xmm2 + vaesenc %xmm2,%xmm4,%xmm4 + vaesenc %xmm2,%xmm6,%xmm6 + vaesenc %xmm2,%xmm7,%xmm7 + vaesenc %xmm2,%xmm11,%xmm11 + vaesenc %xmm2,%xmm12,%xmm12 + vaesenc %xmm2,%xmm13,%xmm13 + + vmovdqa 112(%rdx),%xmm1 + vaesenc %xmm1,%xmm4,%xmm4 + vaesenc %xmm1,%xmm6,%xmm6 + vaesenc %xmm1,%xmm7,%xmm7 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + + vmovdqa 128(%rdx),%xmm2 + vaesenc %xmm2,%xmm4,%xmm4 + vaesenc 
%xmm2,%xmm6,%xmm6 + vaesenc %xmm2,%xmm7,%xmm7 + vaesenc %xmm2,%xmm11,%xmm11 + vaesenc %xmm2,%xmm12,%xmm12 + vaesenc %xmm2,%xmm13,%xmm13 + + vmovdqa 144(%rdx),%xmm1 + vaesenc %xmm1,%xmm4,%xmm4 + vaesenc %xmm1,%xmm6,%xmm6 + vaesenc %xmm1,%xmm7,%xmm7 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + + vmovdqa 160(%rdx),%xmm2 + vaesenc %xmm2,%xmm4,%xmm4 + vaesenc %xmm2,%xmm6,%xmm6 + vaesenc %xmm2,%xmm7,%xmm7 + vaesenc %xmm2,%xmm11,%xmm11 + vaesenc %xmm2,%xmm12,%xmm12 + vaesenc %xmm2,%xmm13,%xmm13 + + vmovdqa 176(%rdx),%xmm1 + vaesenc %xmm1,%xmm4,%xmm4 + vaesenc %xmm1,%xmm6,%xmm6 + vaesenc %xmm1,%xmm7,%xmm7 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + + vmovdqa 192(%rdx),%xmm2 + vaesenc %xmm2,%xmm4,%xmm4 + vaesenc %xmm2,%xmm6,%xmm6 + vaesenc %xmm2,%xmm7,%xmm7 + vaesenc %xmm2,%xmm11,%xmm11 + vaesenc %xmm2,%xmm12,%xmm12 + vaesenc %xmm2,%xmm13,%xmm13 + + vmovdqa 208(%rdx),%xmm1 + vaesenc %xmm1,%xmm4,%xmm4 + vaesenc %xmm1,%xmm6,%xmm6 + vaesenc %xmm1,%xmm7,%xmm7 + vaesenc %xmm1,%xmm11,%xmm11 + vaesenc %xmm1,%xmm12,%xmm12 + vaesenc %xmm1,%xmm13,%xmm13 + + vmovdqa 224(%rdx),%xmm2 + vaesenclast %xmm2,%xmm4,%xmm4 + vaesenclast %xmm2,%xmm6,%xmm6 + vaesenclast %xmm2,%xmm7,%xmm7 + vaesenclast %xmm2,%xmm11,%xmm11 + vaesenclast %xmm2,%xmm12,%xmm12 + vaesenclast %xmm2,%xmm13,%xmm13 + + + vmovdqa %xmm4,0(%rsi) + vmovdqa %xmm6,16(%rsi) + vmovdqa %xmm7,32(%rsi) + vmovdqa %xmm11,48(%rsi) + vmovdqa %xmm12,64(%rsi) + vmovdqa %xmm13,80(%rsi) + ret +.cfi_endproc +.size aes256gcmsiv_kdf, .-aes256gcmsiv_kdf +#endif diff --git a/third_party/boringssl/gen/crypto/aes128gcmsiv-x86_64-win.asm b/third_party/boringssl/gen/crypto/aes128gcmsiv-x86_64-win.asm new file mode 100644 index 00000000..e875c0a6 --- /dev/null +++ b/third_party/boringssl/gen/crypto/aes128gcmsiv-x86_64-win.asm @@ -0,0 +1,3302 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_internal_x86_64_win_asm.inc" +%endif +section .rdata rdata align=8 + +ALIGN 16 +one: + DQ 1,0 +two: + DQ 2,0 +three: + DQ 3,0 +four: + DQ 4,0 +five: + DQ 5,0 +six: + DQ 6,0 +seven: + DQ 7,0 +eight: + DQ 8,0 + +OR_MASK: + DD 0x00000000,0x00000000,0x00000000,0x80000000 +poly: + DQ 0x1,0xc200000000000000 +mask: + DD 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d +con1: + DD 1,1,1,1 +con2: + DD 0x1b,0x1b,0x1b,0x1b +con3: + DB -1,-1,-1,-1,-1,-1,-1,-1,4,5,6,7,4,5,6,7 +and_mask: + DD 0,0xffffffff,0xffffffff,0xffffffff +section .text code align=64 + + +ALIGN 16 +GFMUL: + + vpclmulqdq xmm2,xmm0,xmm1,0x00 + vpclmulqdq xmm5,xmm0,xmm1,0x11 + vpclmulqdq xmm3,xmm0,xmm1,0x10 + vpclmulqdq xmm4,xmm0,xmm1,0x01 + vpxor xmm3,xmm3,xmm4 + vpslldq xmm4,xmm3,8 + vpsrldq xmm3,xmm3,8 + vpxor xmm2,xmm2,xmm4 + vpxor xmm5,xmm5,xmm3 + + vpclmulqdq xmm3,xmm2,XMMWORD[poly],0x10 + vpshufd xmm4,xmm2,78 + vpxor xmm2,xmm3,xmm4 + + vpclmulqdq xmm3,xmm2,XMMWORD[poly],0x10 + vpshufd xmm4,xmm2,78 + vpxor xmm2,xmm3,xmm4 + + vpxor xmm0,xmm2,xmm5 + ret + + +global aesgcmsiv_htable_init + +ALIGN 16 +aesgcmsiv_htable_init: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aesgcmsiv_htable_init: + mov rdi,rcx + mov rsi,rdx + + + +_CET_ENDBR + vmovdqa xmm0,XMMWORD[rsi] + vmovdqa xmm1,xmm0 + vmovdqa XMMWORD[rdi],xmm0 + call GFMUL + vmovdqa XMMWORD[16+rdi],xmm0 + call GFMUL + vmovdqa XMMWORD[32+rdi],xmm0 + call GFMUL + vmovdqa XMMWORD[48+rdi],xmm0 + call GFMUL + vmovdqa XMMWORD[64+rdi],xmm0 + call GFMUL + vmovdqa XMMWORD[80+rdi],xmm0 + call GFMUL + vmovdqa XMMWORD[96+rdi],xmm0 + call GFMUL + vmovdqa XMMWORD[112+rdi],xmm0 + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_aesgcmsiv_htable_init: +global aesgcmsiv_htable6_init + +ALIGN 16 +aesgcmsiv_htable6_init: + mov 
QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aesgcmsiv_htable6_init: + mov rdi,rcx + mov rsi,rdx + + + +_CET_ENDBR + vmovdqa xmm0,XMMWORD[rsi] + vmovdqa xmm1,xmm0 + vmovdqa XMMWORD[rdi],xmm0 + call GFMUL + vmovdqa XMMWORD[16+rdi],xmm0 + call GFMUL + vmovdqa XMMWORD[32+rdi],xmm0 + call GFMUL + vmovdqa XMMWORD[48+rdi],xmm0 + call GFMUL + vmovdqa XMMWORD[64+rdi],xmm0 + call GFMUL + vmovdqa XMMWORD[80+rdi],xmm0 + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_aesgcmsiv_htable6_init: +global aesgcmsiv_htable_polyval + +ALIGN 16 +aesgcmsiv_htable_polyval: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aesgcmsiv_htable_polyval: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + +_CET_ENDBR + test rdx,rdx + jnz NEAR $L$htable_polyval_start + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$htable_polyval_start: + vzeroall + + + + mov r11,rdx + and r11,127 + + jz NEAR $L$htable_polyval_no_prefix + + vpxor xmm9,xmm9,xmm9 + vmovdqa xmm1,XMMWORD[rcx] + sub rdx,r11 + + sub r11,16 + + + vmovdqu xmm0,XMMWORD[rsi] + vpxor xmm0,xmm0,xmm1 + + vpclmulqdq xmm5,xmm0,XMMWORD[r11*1+rdi],0x01 + vpclmulqdq xmm3,xmm0,XMMWORD[r11*1+rdi],0x00 + vpclmulqdq xmm4,xmm0,XMMWORD[r11*1+rdi],0x11 + vpclmulqdq xmm6,xmm0,XMMWORD[r11*1+rdi],0x10 + vpxor xmm5,xmm5,xmm6 + + lea rsi,[16+rsi] + test r11,r11 + jnz NEAR $L$htable_polyval_prefix_loop + jmp NEAR $L$htable_polyval_prefix_complete + + +ALIGN 64 +$L$htable_polyval_prefix_loop: + sub r11,16 + + vmovdqu xmm0,XMMWORD[rsi] + + vpclmulqdq xmm6,xmm0,XMMWORD[r11*1+rdi],0x00 + vpxor xmm3,xmm3,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[r11*1+rdi],0x11 + vpxor xmm4,xmm4,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[r11*1+rdi],0x01 + vpxor xmm5,xmm5,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[r11*1+rdi],0x10 + vpxor xmm5,xmm5,xmm6 + + test r11,r11 + + lea rsi,[16+rsi] + + jnz NEAR $L$htable_polyval_prefix_loop + 
+$L$htable_polyval_prefix_complete: + vpsrldq xmm6,xmm5,8 + vpslldq xmm5,xmm5,8 + + vpxor xmm9,xmm4,xmm6 + vpxor xmm1,xmm3,xmm5 + + jmp NEAR $L$htable_polyval_main_loop + +$L$htable_polyval_no_prefix: + + + + + vpxor xmm1,xmm1,xmm1 + vmovdqa xmm9,XMMWORD[rcx] + +ALIGN 64 +$L$htable_polyval_main_loop: + sub rdx,0x80 + jb NEAR $L$htable_polyval_out + + vmovdqu xmm0,XMMWORD[112+rsi] + + vpclmulqdq xmm5,xmm0,XMMWORD[rdi],0x01 + vpclmulqdq xmm3,xmm0,XMMWORD[rdi],0x00 + vpclmulqdq xmm4,xmm0,XMMWORD[rdi],0x11 + vpclmulqdq xmm6,xmm0,XMMWORD[rdi],0x10 + vpxor xmm5,xmm5,xmm6 + + + vmovdqu xmm0,XMMWORD[96+rsi] + vpclmulqdq xmm6,xmm0,XMMWORD[16+rdi],0x01 + vpxor xmm5,xmm5,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[16+rdi],0x00 + vpxor xmm3,xmm3,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[16+rdi],0x11 + vpxor xmm4,xmm4,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[16+rdi],0x10 + vpxor xmm5,xmm5,xmm6 + + + + vmovdqu xmm0,XMMWORD[80+rsi] + + vpclmulqdq xmm7,xmm1,XMMWORD[poly],0x10 + vpalignr xmm1,xmm1,xmm1,8 + + vpclmulqdq xmm6,xmm0,XMMWORD[32+rdi],0x01 + vpxor xmm5,xmm5,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[32+rdi],0x00 + vpxor xmm3,xmm3,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[32+rdi],0x11 + vpxor xmm4,xmm4,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[32+rdi],0x10 + vpxor xmm5,xmm5,xmm6 + + + vpxor xmm1,xmm1,xmm7 + + vmovdqu xmm0,XMMWORD[64+rsi] + + vpclmulqdq xmm6,xmm0,XMMWORD[48+rdi],0x01 + vpxor xmm5,xmm5,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[48+rdi],0x00 + vpxor xmm3,xmm3,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[48+rdi],0x11 + vpxor xmm4,xmm4,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[48+rdi],0x10 + vpxor xmm5,xmm5,xmm6 + + + vmovdqu xmm0,XMMWORD[48+rsi] + + vpclmulqdq xmm7,xmm1,XMMWORD[poly],0x10 + vpalignr xmm1,xmm1,xmm1,8 + + vpclmulqdq xmm6,xmm0,XMMWORD[64+rdi],0x01 + vpxor xmm5,xmm5,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[64+rdi],0x00 + vpxor xmm3,xmm3,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[64+rdi],0x11 + vpxor xmm4,xmm4,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[64+rdi],0x10 + vpxor xmm5,xmm5,xmm6 + + + vpxor xmm1,xmm1,xmm7 + + vmovdqu 
xmm0,XMMWORD[32+rsi] + + vpclmulqdq xmm6,xmm0,XMMWORD[80+rdi],0x01 + vpxor xmm5,xmm5,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[80+rdi],0x00 + vpxor xmm3,xmm3,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[80+rdi],0x11 + vpxor xmm4,xmm4,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[80+rdi],0x10 + vpxor xmm5,xmm5,xmm6 + + + vpxor xmm1,xmm1,xmm9 + + vmovdqu xmm0,XMMWORD[16+rsi] + + vpclmulqdq xmm6,xmm0,XMMWORD[96+rdi],0x01 + vpxor xmm5,xmm5,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[96+rdi],0x00 + vpxor xmm3,xmm3,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[96+rdi],0x11 + vpxor xmm4,xmm4,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[96+rdi],0x10 + vpxor xmm5,xmm5,xmm6 + + + vmovdqu xmm0,XMMWORD[rsi] + vpxor xmm0,xmm0,xmm1 + + vpclmulqdq xmm6,xmm0,XMMWORD[112+rdi],0x01 + vpxor xmm5,xmm5,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[112+rdi],0x00 + vpxor xmm3,xmm3,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[112+rdi],0x11 + vpxor xmm4,xmm4,xmm6 + vpclmulqdq xmm6,xmm0,XMMWORD[112+rdi],0x10 + vpxor xmm5,xmm5,xmm6 + + + vpsrldq xmm6,xmm5,8 + vpslldq xmm5,xmm5,8 + + vpxor xmm9,xmm4,xmm6 + vpxor xmm1,xmm3,xmm5 + + lea rsi,[128+rsi] + jmp NEAR $L$htable_polyval_main_loop + + + +$L$htable_polyval_out: + vpclmulqdq xmm6,xmm1,XMMWORD[poly],0x10 + vpalignr xmm1,xmm1,xmm1,8 + vpxor xmm1,xmm1,xmm6 + + vpclmulqdq xmm6,xmm1,XMMWORD[poly],0x10 + vpalignr xmm1,xmm1,xmm1,8 + vpxor xmm1,xmm1,xmm6 + vpxor xmm1,xmm1,xmm9 + + vmovdqu XMMWORD[rcx],xmm1 + vzeroupper + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_aesgcmsiv_htable_polyval: +global aesgcmsiv_polyval_horner + +ALIGN 16 +aesgcmsiv_polyval_horner: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aesgcmsiv_polyval_horner: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + +_CET_ENDBR + test rcx,rcx + jnz NEAR $L$polyval_horner_start + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$polyval_horner_start: + + + + xor r10,r10 + shl rcx,4 + + vmovdqa xmm1,XMMWORD[rsi] + vmovdqa xmm0,XMMWORD[rdi] + 
+$L$polyval_horner_loop: + vpxor xmm0,xmm0,XMMWORD[r10*1+rdx] + call GFMUL + + add r10,16 + cmp rcx,r10 + jne NEAR $L$polyval_horner_loop + + + vmovdqa XMMWORD[rdi],xmm0 + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_aesgcmsiv_polyval_horner: +global aes128gcmsiv_aes_ks + +ALIGN 16 +aes128gcmsiv_aes_ks: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aes128gcmsiv_aes_ks: + mov rdi,rcx + mov rsi,rdx + + + +_CET_ENDBR + vmovdqu xmm1,XMMWORD[rdi] + vmovdqa XMMWORD[rsi],xmm1 + + vmovdqa xmm0,XMMWORD[con1] + vmovdqa xmm15,XMMWORD[mask] + + mov rax,8 + +$L$ks128_loop: + add rsi,16 + sub rax,1 + vpshufb xmm2,xmm1,xmm15 + vaesenclast xmm2,xmm2,xmm0 + vpslld xmm0,xmm0,1 + vpslldq xmm3,xmm1,4 + vpxor xmm1,xmm1,xmm3 + vpslldq xmm3,xmm3,4 + vpxor xmm1,xmm1,xmm3 + vpslldq xmm3,xmm3,4 + vpxor xmm1,xmm1,xmm3 + vpxor xmm1,xmm1,xmm2 + vmovdqa XMMWORD[rsi],xmm1 + jne NEAR $L$ks128_loop + + vmovdqa xmm0,XMMWORD[con2] + vpshufb xmm2,xmm1,xmm15 + vaesenclast xmm2,xmm2,xmm0 + vpslld xmm0,xmm0,1 + vpslldq xmm3,xmm1,4 + vpxor xmm1,xmm1,xmm3 + vpslldq xmm3,xmm3,4 + vpxor xmm1,xmm1,xmm3 + vpslldq xmm3,xmm3,4 + vpxor xmm1,xmm1,xmm3 + vpxor xmm1,xmm1,xmm2 + vmovdqa XMMWORD[16+rsi],xmm1 + + vpshufb xmm2,xmm1,xmm15 + vaesenclast xmm2,xmm2,xmm0 + vpslldq xmm3,xmm1,4 + vpxor xmm1,xmm1,xmm3 + vpslldq xmm3,xmm3,4 + vpxor xmm1,xmm1,xmm3 + vpslldq xmm3,xmm3,4 + vpxor xmm1,xmm1,xmm3 + vpxor xmm1,xmm1,xmm2 + vmovdqa XMMWORD[32+rsi],xmm1 + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_aes128gcmsiv_aes_ks: +global aes256gcmsiv_aes_ks + +ALIGN 16 +aes256gcmsiv_aes_ks: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aes256gcmsiv_aes_ks: + mov rdi,rcx + mov rsi,rdx + + + +_CET_ENDBR + vmovdqu xmm1,XMMWORD[rdi] + vmovdqu xmm3,XMMWORD[16+rdi] + vmovdqa XMMWORD[rsi],xmm1 + vmovdqa XMMWORD[16+rsi],xmm3 + vmovdqa xmm0,XMMWORD[con1] + vmovdqa 
xmm15,XMMWORD[mask] + vpxor xmm14,xmm14,xmm14 + mov rax,6 + +$L$ks256_loop: + add rsi,32 + sub rax,1 + vpshufb xmm2,xmm3,xmm15 + vaesenclast xmm2,xmm2,xmm0 + vpslld xmm0,xmm0,1 + vpsllq xmm4,xmm1,32 + vpxor xmm1,xmm1,xmm4 + vpshufb xmm4,xmm1,XMMWORD[con3] + vpxor xmm1,xmm1,xmm4 + vpxor xmm1,xmm1,xmm2 + vmovdqa XMMWORD[rsi],xmm1 + vpshufd xmm2,xmm1,0xff + vaesenclast xmm2,xmm2,xmm14 + vpsllq xmm4,xmm3,32 + vpxor xmm3,xmm3,xmm4 + vpshufb xmm4,xmm3,XMMWORD[con3] + vpxor xmm3,xmm3,xmm4 + vpxor xmm3,xmm3,xmm2 + vmovdqa XMMWORD[16+rsi],xmm3 + jne NEAR $L$ks256_loop + + vpshufb xmm2,xmm3,xmm15 + vaesenclast xmm2,xmm2,xmm0 + vpsllq xmm4,xmm1,32 + vpxor xmm1,xmm1,xmm4 + vpshufb xmm4,xmm1,XMMWORD[con3] + vpxor xmm1,xmm1,xmm4 + vpxor xmm1,xmm1,xmm2 + vmovdqa XMMWORD[32+rsi],xmm1 + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +global aes128gcmsiv_aes_ks_enc_x1 + +ALIGN 16 +aes128gcmsiv_aes_ks_enc_x1: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aes128gcmsiv_aes_ks_enc_x1: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + +_CET_ENDBR + vmovdqa xmm1,XMMWORD[rcx] + vmovdqa xmm4,XMMWORD[rdi] + + vmovdqa XMMWORD[rdx],xmm1 + vpxor xmm4,xmm4,xmm1 + + vmovdqa xmm0,XMMWORD[con1] + vmovdqa xmm15,XMMWORD[mask] + + vpshufb xmm2,xmm1,xmm15 + vaesenclast xmm2,xmm2,xmm0 + vpslld xmm0,xmm0,1 + vpsllq xmm3,xmm1,32 + vpxor xmm1,xmm1,xmm3 + vpshufb xmm3,xmm1,XMMWORD[con3] + vpxor xmm1,xmm1,xmm3 + vpxor xmm1,xmm1,xmm2 + + vaesenc xmm4,xmm4,xmm1 + vmovdqa XMMWORD[16+rdx],xmm1 + + vpshufb xmm2,xmm1,xmm15 + vaesenclast xmm2,xmm2,xmm0 + vpslld xmm0,xmm0,1 + vpsllq xmm3,xmm1,32 + vpxor xmm1,xmm1,xmm3 + vpshufb xmm3,xmm1,XMMWORD[con3] + vpxor xmm1,xmm1,xmm3 + vpxor xmm1,xmm1,xmm2 + + vaesenc xmm4,xmm4,xmm1 + vmovdqa XMMWORD[32+rdx],xmm1 + + vpshufb xmm2,xmm1,xmm15 + vaesenclast xmm2,xmm2,xmm0 + vpslld xmm0,xmm0,1 + vpsllq xmm3,xmm1,32 + vpxor xmm1,xmm1,xmm3 + vpshufb xmm3,xmm1,XMMWORD[con3] + vpxor xmm1,xmm1,xmm3 + vpxor 
xmm1,xmm1,xmm2 + + vaesenc xmm4,xmm4,xmm1 + vmovdqa XMMWORD[48+rdx],xmm1 + + vpshufb xmm2,xmm1,xmm15 + vaesenclast xmm2,xmm2,xmm0 + vpslld xmm0,xmm0,1 + vpsllq xmm3,xmm1,32 + vpxor xmm1,xmm1,xmm3 + vpshufb xmm3,xmm1,XMMWORD[con3] + vpxor xmm1,xmm1,xmm3 + vpxor xmm1,xmm1,xmm2 + + vaesenc xmm4,xmm4,xmm1 + vmovdqa XMMWORD[64+rdx],xmm1 + + vpshufb xmm2,xmm1,xmm15 + vaesenclast xmm2,xmm2,xmm0 + vpslld xmm0,xmm0,1 + vpsllq xmm3,xmm1,32 + vpxor xmm1,xmm1,xmm3 + vpshufb xmm3,xmm1,XMMWORD[con3] + vpxor xmm1,xmm1,xmm3 + vpxor xmm1,xmm1,xmm2 + + vaesenc xmm4,xmm4,xmm1 + vmovdqa XMMWORD[80+rdx],xmm1 + + vpshufb xmm2,xmm1,xmm15 + vaesenclast xmm2,xmm2,xmm0 + vpslld xmm0,xmm0,1 + vpsllq xmm3,xmm1,32 + vpxor xmm1,xmm1,xmm3 + vpshufb xmm3,xmm1,XMMWORD[con3] + vpxor xmm1,xmm1,xmm3 + vpxor xmm1,xmm1,xmm2 + + vaesenc xmm4,xmm4,xmm1 + vmovdqa XMMWORD[96+rdx],xmm1 + + vpshufb xmm2,xmm1,xmm15 + vaesenclast xmm2,xmm2,xmm0 + vpslld xmm0,xmm0,1 + vpsllq xmm3,xmm1,32 + vpxor xmm1,xmm1,xmm3 + vpshufb xmm3,xmm1,XMMWORD[con3] + vpxor xmm1,xmm1,xmm3 + vpxor xmm1,xmm1,xmm2 + + vaesenc xmm4,xmm4,xmm1 + vmovdqa XMMWORD[112+rdx],xmm1 + + vpshufb xmm2,xmm1,xmm15 + vaesenclast xmm2,xmm2,xmm0 + vpslld xmm0,xmm0,1 + vpsllq xmm3,xmm1,32 + vpxor xmm1,xmm1,xmm3 + vpshufb xmm3,xmm1,XMMWORD[con3] + vpxor xmm1,xmm1,xmm3 + vpxor xmm1,xmm1,xmm2 + + vaesenc xmm4,xmm4,xmm1 + vmovdqa XMMWORD[128+rdx],xmm1 + + + vmovdqa xmm0,XMMWORD[con2] + + vpshufb xmm2,xmm1,xmm15 + vaesenclast xmm2,xmm2,xmm0 + vpslld xmm0,xmm0,1 + vpsllq xmm3,xmm1,32 + vpxor xmm1,xmm1,xmm3 + vpshufb xmm3,xmm1,XMMWORD[con3] + vpxor xmm1,xmm1,xmm3 + vpxor xmm1,xmm1,xmm2 + + vaesenc xmm4,xmm4,xmm1 + vmovdqa XMMWORD[144+rdx],xmm1 + + vpshufb xmm2,xmm1,xmm15 + vaesenclast xmm2,xmm2,xmm0 + vpsllq xmm3,xmm1,32 + vpxor xmm1,xmm1,xmm3 + vpshufb xmm3,xmm1,XMMWORD[con3] + vpxor xmm1,xmm1,xmm3 + vpxor xmm1,xmm1,xmm2 + + vaesenclast xmm4,xmm4,xmm1 + vmovdqa XMMWORD[160+rdx],xmm1 + + + vmovdqa XMMWORD[rsi],xmm4 + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov 
rsi,QWORD[16+rsp] + ret + +$L$SEH_end_aes128gcmsiv_aes_ks_enc_x1: +global aes128gcmsiv_kdf + +ALIGN 16 +aes128gcmsiv_kdf: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aes128gcmsiv_kdf: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + + + + + vmovdqa xmm1,XMMWORD[rdx] + vmovdqa xmm9,XMMWORD[rdi] + vmovdqa xmm12,XMMWORD[and_mask] + vmovdqa xmm13,XMMWORD[one] + vpshufd xmm9,xmm9,0x90 + vpand xmm9,xmm9,xmm12 + vpaddd xmm10,xmm9,xmm13 + vpaddd xmm11,xmm10,xmm13 + vpaddd xmm12,xmm11,xmm13 + + vpxor xmm9,xmm9,xmm1 + vpxor xmm10,xmm10,xmm1 + vpxor xmm11,xmm11,xmm1 + vpxor xmm12,xmm12,xmm1 + + vmovdqa xmm1,XMMWORD[16+rdx] + vaesenc xmm9,xmm9,xmm1 + vaesenc xmm10,xmm10,xmm1 + vaesenc xmm11,xmm11,xmm1 + vaesenc xmm12,xmm12,xmm1 + + vmovdqa xmm2,XMMWORD[32+rdx] + vaesenc xmm9,xmm9,xmm2 + vaesenc xmm10,xmm10,xmm2 + vaesenc xmm11,xmm11,xmm2 + vaesenc xmm12,xmm12,xmm2 + + vmovdqa xmm1,XMMWORD[48+rdx] + vaesenc xmm9,xmm9,xmm1 + vaesenc xmm10,xmm10,xmm1 + vaesenc xmm11,xmm11,xmm1 + vaesenc xmm12,xmm12,xmm1 + + vmovdqa xmm2,XMMWORD[64+rdx] + vaesenc xmm9,xmm9,xmm2 + vaesenc xmm10,xmm10,xmm2 + vaesenc xmm11,xmm11,xmm2 + vaesenc xmm12,xmm12,xmm2 + + vmovdqa xmm1,XMMWORD[80+rdx] + vaesenc xmm9,xmm9,xmm1 + vaesenc xmm10,xmm10,xmm1 + vaesenc xmm11,xmm11,xmm1 + vaesenc xmm12,xmm12,xmm1 + + vmovdqa xmm2,XMMWORD[96+rdx] + vaesenc xmm9,xmm9,xmm2 + vaesenc xmm10,xmm10,xmm2 + vaesenc xmm11,xmm11,xmm2 + vaesenc xmm12,xmm12,xmm2 + + vmovdqa xmm1,XMMWORD[112+rdx] + vaesenc xmm9,xmm9,xmm1 + vaesenc xmm10,xmm10,xmm1 + vaesenc xmm11,xmm11,xmm1 + vaesenc xmm12,xmm12,xmm1 + + vmovdqa xmm2,XMMWORD[128+rdx] + vaesenc xmm9,xmm9,xmm2 + vaesenc xmm10,xmm10,xmm2 + vaesenc xmm11,xmm11,xmm2 + vaesenc xmm12,xmm12,xmm2 + + vmovdqa xmm1,XMMWORD[144+rdx] + vaesenc xmm9,xmm9,xmm1 + vaesenc xmm10,xmm10,xmm1 + vaesenc xmm11,xmm11,xmm1 + vaesenc xmm12,xmm12,xmm1 + + vmovdqa xmm2,XMMWORD[160+rdx] + vaesenclast xmm9,xmm9,xmm2 + vaesenclast xmm10,xmm10,xmm2 + 
vaesenclast xmm11,xmm11,xmm2 + vaesenclast xmm12,xmm12,xmm2 + + + vmovdqa XMMWORD[rsi],xmm9 + vmovdqa XMMWORD[16+rsi],xmm10 + vmovdqa XMMWORD[32+rsi],xmm11 + vmovdqa XMMWORD[48+rsi],xmm12 + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_aes128gcmsiv_kdf: +global aes128gcmsiv_enc_msg_x4 + +ALIGN 16 +aes128gcmsiv_enc_msg_x4: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aes128gcmsiv_enc_msg_x4: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + +_CET_ENDBR + test r8,r8 + jnz NEAR $L$128_enc_msg_x4_start + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$128_enc_msg_x4_start: + push r12 + + push r13 + + + shr r8,4 + mov r10,r8 + shl r10,62 + shr r10,62 + + + vmovdqa xmm15,XMMWORD[rdx] + vpor xmm15,xmm15,XMMWORD[OR_MASK] + + vmovdqu xmm4,XMMWORD[four] + vmovdqa xmm0,xmm15 + vpaddd xmm1,xmm15,XMMWORD[one] + vpaddd xmm2,xmm15,XMMWORD[two] + vpaddd xmm3,xmm15,XMMWORD[three] + + shr r8,2 + je NEAR $L$128_enc_msg_x4_check_remainder + + sub rsi,64 + sub rdi,64 + +$L$128_enc_msg_x4_loop1: + add rsi,64 + add rdi,64 + + vmovdqa xmm5,xmm0 + vmovdqa xmm6,xmm1 + vmovdqa xmm7,xmm2 + vmovdqa xmm8,xmm3 + + vpxor xmm5,xmm5,XMMWORD[rcx] + vpxor xmm6,xmm6,XMMWORD[rcx] + vpxor xmm7,xmm7,XMMWORD[rcx] + vpxor xmm8,xmm8,XMMWORD[rcx] + + vmovdqu xmm12,XMMWORD[16+rcx] + vaesenc xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vpaddd xmm0,xmm0,xmm4 + vmovdqu xmm12,XMMWORD[32+rcx] + vaesenc xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vpaddd xmm1,xmm1,xmm4 + vmovdqu xmm12,XMMWORD[48+rcx] + vaesenc xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vpaddd xmm2,xmm2,xmm4 + vmovdqu xmm12,XMMWORD[64+rcx] + vaesenc xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vpaddd 
xmm3,xmm3,xmm4 + + vmovdqu xmm12,XMMWORD[80+rcx] + vaesenc xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vmovdqu xmm12,XMMWORD[96+rcx] + vaesenc xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vmovdqu xmm12,XMMWORD[112+rcx] + vaesenc xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vmovdqu xmm12,XMMWORD[128+rcx] + vaesenc xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vmovdqu xmm12,XMMWORD[144+rcx] + vaesenc xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vmovdqu xmm12,XMMWORD[160+rcx] + vaesenclast xmm5,xmm5,xmm12 + vaesenclast xmm6,xmm6,xmm12 + vaesenclast xmm7,xmm7,xmm12 + vaesenclast xmm8,xmm8,xmm12 + + + + vpxor xmm5,xmm5,XMMWORD[rdi] + vpxor xmm6,xmm6,XMMWORD[16+rdi] + vpxor xmm7,xmm7,XMMWORD[32+rdi] + vpxor xmm8,xmm8,XMMWORD[48+rdi] + + sub r8,1 + + vmovdqu XMMWORD[rsi],xmm5 + vmovdqu XMMWORD[16+rsi],xmm6 + vmovdqu XMMWORD[32+rsi],xmm7 + vmovdqu XMMWORD[48+rsi],xmm8 + + jne NEAR $L$128_enc_msg_x4_loop1 + + add rsi,64 + add rdi,64 + +$L$128_enc_msg_x4_check_remainder: + cmp r10,0 + je NEAR $L$128_enc_msg_x4_out + +$L$128_enc_msg_x4_loop2: + + + vmovdqa xmm5,xmm0 + vpaddd xmm0,xmm0,XMMWORD[one] + + vpxor xmm5,xmm5,XMMWORD[rcx] + vaesenc xmm5,xmm5,XMMWORD[16+rcx] + vaesenc xmm5,xmm5,XMMWORD[32+rcx] + vaesenc xmm5,xmm5,XMMWORD[48+rcx] + vaesenc xmm5,xmm5,XMMWORD[64+rcx] + vaesenc xmm5,xmm5,XMMWORD[80+rcx] + vaesenc xmm5,xmm5,XMMWORD[96+rcx] + vaesenc xmm5,xmm5,XMMWORD[112+rcx] + vaesenc xmm5,xmm5,XMMWORD[128+rcx] + vaesenc xmm5,xmm5,XMMWORD[144+rcx] + vaesenclast xmm5,xmm5,XMMWORD[160+rcx] + + + vpxor xmm5,xmm5,XMMWORD[rdi] + vmovdqu XMMWORD[rsi],xmm5 + + add rdi,16 + add rsi,16 + + sub r10,1 + jne NEAR $L$128_enc_msg_x4_loop2 + +$L$128_enc_msg_x4_out: + pop r13 + + pop r12 + + mov rdi,QWORD[8+rsp] ;WIN64 epilogue 
+ mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_aes128gcmsiv_enc_msg_x4: +global aes128gcmsiv_enc_msg_x8 + +ALIGN 16 +aes128gcmsiv_enc_msg_x8: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aes128gcmsiv_enc_msg_x8: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + +_CET_ENDBR + test r8,r8 + jnz NEAR $L$128_enc_msg_x8_start + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$128_enc_msg_x8_start: + push r12 + + push r13 + + push rbp + + mov rbp,rsp + + + + sub rsp,128 + and rsp,-64 + + shr r8,4 + mov r10,r8 + shl r10,61 + shr r10,61 + + + vmovdqu xmm1,XMMWORD[rdx] + vpor xmm1,xmm1,XMMWORD[OR_MASK] + + + vpaddd xmm0,xmm1,XMMWORD[seven] + vmovdqu XMMWORD[rsp],xmm0 + vpaddd xmm9,xmm1,XMMWORD[one] + vpaddd xmm10,xmm1,XMMWORD[two] + vpaddd xmm11,xmm1,XMMWORD[three] + vpaddd xmm12,xmm1,XMMWORD[four] + vpaddd xmm13,xmm1,XMMWORD[five] + vpaddd xmm14,xmm1,XMMWORD[six] + vmovdqa xmm0,xmm1 + + shr r8,3 + je NEAR $L$128_enc_msg_x8_check_remainder + + sub rsi,128 + sub rdi,128 + +$L$128_enc_msg_x8_loop1: + add rsi,128 + add rdi,128 + + vmovdqa xmm1,xmm0 + vmovdqa xmm2,xmm9 + vmovdqa xmm3,xmm10 + vmovdqa xmm4,xmm11 + vmovdqa xmm5,xmm12 + vmovdqa xmm6,xmm13 + vmovdqa xmm7,xmm14 + + vmovdqu xmm8,XMMWORD[rsp] + + vpxor xmm1,xmm1,XMMWORD[rcx] + vpxor xmm2,xmm2,XMMWORD[rcx] + vpxor xmm3,xmm3,XMMWORD[rcx] + vpxor xmm4,xmm4,XMMWORD[rcx] + vpxor xmm5,xmm5,XMMWORD[rcx] + vpxor xmm6,xmm6,XMMWORD[rcx] + vpxor xmm7,xmm7,XMMWORD[rcx] + vpxor xmm8,xmm8,XMMWORD[rcx] + + vmovdqu xmm15,XMMWORD[16+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc xmm5,xmm5,xmm15 + vaesenc xmm6,xmm6,xmm15 + vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vmovdqu xmm14,XMMWORD[rsp] + vpaddd xmm14,xmm14,XMMWORD[eight] + vmovdqu XMMWORD[rsp],xmm14 + vmovdqu xmm15,XMMWORD[32+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc 
xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc xmm5,xmm5,xmm15 + vaesenc xmm6,xmm6,xmm15 + vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vpsubd xmm14,xmm14,XMMWORD[one] + vmovdqu xmm15,XMMWORD[48+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc xmm5,xmm5,xmm15 + vaesenc xmm6,xmm6,xmm15 + vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vpaddd xmm0,xmm0,XMMWORD[eight] + vmovdqu xmm15,XMMWORD[64+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc xmm5,xmm5,xmm15 + vaesenc xmm6,xmm6,xmm15 + vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vpaddd xmm9,xmm9,XMMWORD[eight] + vmovdqu xmm15,XMMWORD[80+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc xmm5,xmm5,xmm15 + vaesenc xmm6,xmm6,xmm15 + vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vpaddd xmm10,xmm10,XMMWORD[eight] + vmovdqu xmm15,XMMWORD[96+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc xmm5,xmm5,xmm15 + vaesenc xmm6,xmm6,xmm15 + vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vpaddd xmm11,xmm11,XMMWORD[eight] + vmovdqu xmm15,XMMWORD[112+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc xmm5,xmm5,xmm15 + vaesenc xmm6,xmm6,xmm15 + vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vpaddd xmm12,xmm12,XMMWORD[eight] + vmovdqu xmm15,XMMWORD[128+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc xmm5,xmm5,xmm15 + vaesenc xmm6,xmm6,xmm15 + vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vpaddd xmm13,xmm13,XMMWORD[eight] + vmovdqu xmm15,XMMWORD[144+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc 
xmm5,xmm5,xmm15 + vaesenc xmm6,xmm6,xmm15 + vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vmovdqu xmm15,XMMWORD[160+rcx] + vaesenclast xmm1,xmm1,xmm15 + vaesenclast xmm2,xmm2,xmm15 + vaesenclast xmm3,xmm3,xmm15 + vaesenclast xmm4,xmm4,xmm15 + vaesenclast xmm5,xmm5,xmm15 + vaesenclast xmm6,xmm6,xmm15 + vaesenclast xmm7,xmm7,xmm15 + vaesenclast xmm8,xmm8,xmm15 + + + + vpxor xmm1,xmm1,XMMWORD[rdi] + vpxor xmm2,xmm2,XMMWORD[16+rdi] + vpxor xmm3,xmm3,XMMWORD[32+rdi] + vpxor xmm4,xmm4,XMMWORD[48+rdi] + vpxor xmm5,xmm5,XMMWORD[64+rdi] + vpxor xmm6,xmm6,XMMWORD[80+rdi] + vpxor xmm7,xmm7,XMMWORD[96+rdi] + vpxor xmm8,xmm8,XMMWORD[112+rdi] + + dec r8 + + vmovdqu XMMWORD[rsi],xmm1 + vmovdqu XMMWORD[16+rsi],xmm2 + vmovdqu XMMWORD[32+rsi],xmm3 + vmovdqu XMMWORD[48+rsi],xmm4 + vmovdqu XMMWORD[64+rsi],xmm5 + vmovdqu XMMWORD[80+rsi],xmm6 + vmovdqu XMMWORD[96+rsi],xmm7 + vmovdqu XMMWORD[112+rsi],xmm8 + + jne NEAR $L$128_enc_msg_x8_loop1 + + add rsi,128 + add rdi,128 + +$L$128_enc_msg_x8_check_remainder: + cmp r10,0 + je NEAR $L$128_enc_msg_x8_out + +$L$128_enc_msg_x8_loop2: + + + vmovdqa xmm1,xmm0 + vpaddd xmm0,xmm0,XMMWORD[one] + + vpxor xmm1,xmm1,XMMWORD[rcx] + vaesenc xmm1,xmm1,XMMWORD[16+rcx] + vaesenc xmm1,xmm1,XMMWORD[32+rcx] + vaesenc xmm1,xmm1,XMMWORD[48+rcx] + vaesenc xmm1,xmm1,XMMWORD[64+rcx] + vaesenc xmm1,xmm1,XMMWORD[80+rcx] + vaesenc xmm1,xmm1,XMMWORD[96+rcx] + vaesenc xmm1,xmm1,XMMWORD[112+rcx] + vaesenc xmm1,xmm1,XMMWORD[128+rcx] + vaesenc xmm1,xmm1,XMMWORD[144+rcx] + vaesenclast xmm1,xmm1,XMMWORD[160+rcx] + + + vpxor xmm1,xmm1,XMMWORD[rdi] + + vmovdqu XMMWORD[rsi],xmm1 + + add rdi,16 + add rsi,16 + + dec r10 + jne NEAR $L$128_enc_msg_x8_loop2 + +$L$128_enc_msg_x8_out: + mov rsp,rbp + + pop rbp + + pop r13 + + pop r12 + + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_aes128gcmsiv_enc_msg_x8: +global aes128gcmsiv_dec + +ALIGN 16 +aes128gcmsiv_dec: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp 
+$L$SEH_begin_aes128gcmsiv_dec: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +_CET_ENDBR + test r9,~15 + jnz NEAR $L$128_dec_start + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$128_dec_start: + vzeroupper + vmovdqa xmm0,XMMWORD[rdx] + + + vmovdqu xmm15,XMMWORD[16+rdx] + vpor xmm15,xmm15,XMMWORD[OR_MASK] + mov rax,rdx + + lea rax,[32+rax] + lea rcx,[32+rcx] + + and r9,~15 + + + cmp r9,96 + jb NEAR $L$128_dec_loop2 + + + sub r9,96 + vmovdqa xmm7,xmm15 + vpaddd xmm8,xmm7,XMMWORD[one] + vpaddd xmm9,xmm7,XMMWORD[two] + vpaddd xmm10,xmm9,XMMWORD[one] + vpaddd xmm11,xmm9,XMMWORD[two] + vpaddd xmm12,xmm11,XMMWORD[one] + vpaddd xmm15,xmm11,XMMWORD[two] + + vpxor xmm7,xmm7,XMMWORD[r8] + vpxor xmm8,xmm8,XMMWORD[r8] + vpxor xmm9,xmm9,XMMWORD[r8] + vpxor xmm10,xmm10,XMMWORD[r8] + vpxor xmm11,xmm11,XMMWORD[r8] + vpxor xmm12,xmm12,XMMWORD[r8] + + vmovdqu xmm4,XMMWORD[16+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[32+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[48+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[64+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[80+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[96+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc 
xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[112+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[128+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[144+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[160+r8] + vaesenclast xmm7,xmm7,xmm4 + vaesenclast xmm8,xmm8,xmm4 + vaesenclast xmm9,xmm9,xmm4 + vaesenclast xmm10,xmm10,xmm4 + vaesenclast xmm11,xmm11,xmm4 + vaesenclast xmm12,xmm12,xmm4 + + + vpxor xmm7,xmm7,XMMWORD[rdi] + vpxor xmm8,xmm8,XMMWORD[16+rdi] + vpxor xmm9,xmm9,XMMWORD[32+rdi] + vpxor xmm10,xmm10,XMMWORD[48+rdi] + vpxor xmm11,xmm11,XMMWORD[64+rdi] + vpxor xmm12,xmm12,XMMWORD[80+rdi] + + vmovdqu XMMWORD[rsi],xmm7 + vmovdqu XMMWORD[16+rsi],xmm8 + vmovdqu XMMWORD[32+rsi],xmm9 + vmovdqu XMMWORD[48+rsi],xmm10 + vmovdqu XMMWORD[64+rsi],xmm11 + vmovdqu XMMWORD[80+rsi],xmm12 + + add rdi,96 + add rsi,96 + jmp NEAR $L$128_dec_loop1 + + +ALIGN 64 +$L$128_dec_loop1: + cmp r9,96 + jb NEAR $L$128_dec_finish_96 + sub r9,96 + + vmovdqa xmm6,xmm12 + vmovdqa XMMWORD[(16-32)+rax],xmm11 + vmovdqa XMMWORD[(32-32)+rax],xmm10 + vmovdqa XMMWORD[(48-32)+rax],xmm9 + vmovdqa XMMWORD[(64-32)+rax],xmm8 + vmovdqa XMMWORD[(80-32)+rax],xmm7 + + vmovdqa xmm7,xmm15 + vpaddd xmm8,xmm7,XMMWORD[one] + vpaddd xmm9,xmm7,XMMWORD[two] + vpaddd xmm10,xmm9,XMMWORD[one] + vpaddd xmm11,xmm9,XMMWORD[two] + vpaddd xmm12,xmm11,XMMWORD[one] + vpaddd xmm15,xmm11,XMMWORD[two] + + vmovdqa xmm4,XMMWORD[r8] + vpxor xmm7,xmm7,xmm4 + vpxor xmm8,xmm8,xmm4 + vpxor xmm9,xmm9,xmm4 + vpxor xmm10,xmm10,xmm4 + vpxor xmm11,xmm11,xmm4 + vpxor xmm12,xmm12,xmm4 + + 
vmovdqu xmm4,XMMWORD[((0-32))+rcx] + vpclmulqdq xmm2,xmm6,xmm4,0x11 + vpclmulqdq xmm3,xmm6,xmm4,0x00 + vpclmulqdq xmm1,xmm6,xmm4,0x01 + vpclmulqdq xmm4,xmm6,xmm4,0x10 + vpxor xmm1,xmm1,xmm4 + + vmovdqu xmm4,XMMWORD[16+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm6,XMMWORD[((-16))+rax] + vmovdqu xmm13,XMMWORD[((-16))+rcx] + + vpclmulqdq xmm4,xmm6,xmm13,0x10 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x11 + vpxor xmm2,xmm2,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x00 + vpxor xmm3,xmm3,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x01 + vpxor xmm1,xmm1,xmm4 + + + vmovdqu xmm4,XMMWORD[32+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm6,XMMWORD[rax] + vmovdqu xmm13,XMMWORD[rcx] + + vpclmulqdq xmm4,xmm6,xmm13,0x10 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x11 + vpxor xmm2,xmm2,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x00 + vpxor xmm3,xmm3,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x01 + vpxor xmm1,xmm1,xmm4 + + + vmovdqu xmm4,XMMWORD[48+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm6,XMMWORD[16+rax] + vmovdqu xmm13,XMMWORD[16+rcx] + + vpclmulqdq xmm4,xmm6,xmm13,0x10 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x11 + vpxor xmm2,xmm2,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x00 + vpxor xmm3,xmm3,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x01 + vpxor xmm1,xmm1,xmm4 + + + vmovdqu xmm4,XMMWORD[64+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm6,XMMWORD[32+rax] + vmovdqu xmm13,XMMWORD[32+rcx] + + vpclmulqdq xmm4,xmm6,xmm13,0x10 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x11 + 
vpxor xmm2,xmm2,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x00 + vpxor xmm3,xmm3,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x01 + vpxor xmm1,xmm1,xmm4 + + + vmovdqu xmm4,XMMWORD[80+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[96+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[112+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + + vmovdqa xmm6,XMMWORD[((80-32))+rax] + vpxor xmm6,xmm6,xmm0 + vmovdqu xmm5,XMMWORD[((80-32))+rcx] + + vpclmulqdq xmm4,xmm6,xmm5,0x01 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm4,xmm6,xmm5,0x11 + vpxor xmm2,xmm2,xmm4 + vpclmulqdq xmm4,xmm6,xmm5,0x00 + vpxor xmm3,xmm3,xmm4 + vpclmulqdq xmm4,xmm6,xmm5,0x10 + vpxor xmm1,xmm1,xmm4 + + vmovdqu xmm4,XMMWORD[128+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + + vpsrldq xmm4,xmm1,8 + vpxor xmm5,xmm2,xmm4 + vpslldq xmm4,xmm1,8 + vpxor xmm0,xmm3,xmm4 + + vmovdqa xmm3,XMMWORD[poly] + + vmovdqu xmm4,XMMWORD[144+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm6,XMMWORD[160+r8] + vpalignr xmm2,xmm0,xmm0,8 + vpclmulqdq xmm0,xmm0,xmm3,0x10 + vpxor xmm0,xmm2,xmm0 + + vpxor xmm4,xmm6,XMMWORD[rdi] + vaesenclast xmm7,xmm7,xmm4 + vpxor xmm4,xmm6,XMMWORD[16+rdi] + vaesenclast xmm8,xmm8,xmm4 + vpxor xmm4,xmm6,XMMWORD[32+rdi] + vaesenclast xmm9,xmm9,xmm4 + vpxor xmm4,xmm6,XMMWORD[48+rdi] + vaesenclast xmm10,xmm10,xmm4 + vpxor xmm4,xmm6,XMMWORD[64+rdi] + vaesenclast xmm11,xmm11,xmm4 + vpxor 
xmm4,xmm6,XMMWORD[80+rdi] + vaesenclast xmm12,xmm12,xmm4 + + vpalignr xmm2,xmm0,xmm0,8 + vpclmulqdq xmm0,xmm0,xmm3,0x10 + vpxor xmm0,xmm2,xmm0 + + vmovdqu XMMWORD[rsi],xmm7 + vmovdqu XMMWORD[16+rsi],xmm8 + vmovdqu XMMWORD[32+rsi],xmm9 + vmovdqu XMMWORD[48+rsi],xmm10 + vmovdqu XMMWORD[64+rsi],xmm11 + vmovdqu XMMWORD[80+rsi],xmm12 + + vpxor xmm0,xmm0,xmm5 + + lea rdi,[96+rdi] + lea rsi,[96+rsi] + jmp NEAR $L$128_dec_loop1 + +$L$128_dec_finish_96: + vmovdqa xmm6,xmm12 + vmovdqa XMMWORD[(16-32)+rax],xmm11 + vmovdqa XMMWORD[(32-32)+rax],xmm10 + vmovdqa XMMWORD[(48-32)+rax],xmm9 + vmovdqa XMMWORD[(64-32)+rax],xmm8 + vmovdqa XMMWORD[(80-32)+rax],xmm7 + + vmovdqu xmm4,XMMWORD[((0-32))+rcx] + vpclmulqdq xmm1,xmm6,xmm4,0x10 + vpclmulqdq xmm2,xmm6,xmm4,0x11 + vpclmulqdq xmm3,xmm6,xmm4,0x00 + vpclmulqdq xmm4,xmm6,xmm4,0x01 + vpxor xmm1,xmm1,xmm4 + + vmovdqu xmm6,XMMWORD[((-16))+rax] + vmovdqu xmm13,XMMWORD[((-16))+rcx] + + vpclmulqdq xmm4,xmm6,xmm13,0x10 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x11 + vpxor xmm2,xmm2,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x00 + vpxor xmm3,xmm3,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x01 + vpxor xmm1,xmm1,xmm4 + + vmovdqu xmm6,XMMWORD[rax] + vmovdqu xmm13,XMMWORD[rcx] + + vpclmulqdq xmm4,xmm6,xmm13,0x10 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x11 + vpxor xmm2,xmm2,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x00 + vpxor xmm3,xmm3,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x01 + vpxor xmm1,xmm1,xmm4 + + vmovdqu xmm6,XMMWORD[16+rax] + vmovdqu xmm13,XMMWORD[16+rcx] + + vpclmulqdq xmm4,xmm6,xmm13,0x10 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x11 + vpxor xmm2,xmm2,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x00 + vpxor xmm3,xmm3,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x01 + vpxor xmm1,xmm1,xmm4 + + vmovdqu xmm6,XMMWORD[32+rax] + vmovdqu xmm13,XMMWORD[32+rcx] + + vpclmulqdq xmm4,xmm6,xmm13,0x10 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x11 + vpxor xmm2,xmm2,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x00 + vpxor xmm3,xmm3,xmm4 + vpclmulqdq 
xmm4,xmm6,xmm13,0x01 + vpxor xmm1,xmm1,xmm4 + + + vmovdqu xmm6,XMMWORD[((80-32))+rax] + vpxor xmm6,xmm6,xmm0 + vmovdqu xmm5,XMMWORD[((80-32))+rcx] + vpclmulqdq xmm4,xmm6,xmm5,0x11 + vpxor xmm2,xmm2,xmm4 + vpclmulqdq xmm4,xmm6,xmm5,0x00 + vpxor xmm3,xmm3,xmm4 + vpclmulqdq xmm4,xmm6,xmm5,0x10 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm4,xmm6,xmm5,0x01 + vpxor xmm1,xmm1,xmm4 + + vpsrldq xmm4,xmm1,8 + vpxor xmm5,xmm2,xmm4 + vpslldq xmm4,xmm1,8 + vpxor xmm0,xmm3,xmm4 + + vmovdqa xmm3,XMMWORD[poly] + + vpalignr xmm2,xmm0,xmm0,8 + vpclmulqdq xmm0,xmm0,xmm3,0x10 + vpxor xmm0,xmm2,xmm0 + + vpalignr xmm2,xmm0,xmm0,8 + vpclmulqdq xmm0,xmm0,xmm3,0x10 + vpxor xmm0,xmm2,xmm0 + + vpxor xmm0,xmm0,xmm5 + +$L$128_dec_loop2: + + + + cmp r9,16 + jb NEAR $L$128_dec_out + sub r9,16 + + vmovdqa xmm2,xmm15 + vpaddd xmm15,xmm15,XMMWORD[one] + + vpxor xmm2,xmm2,XMMWORD[r8] + vaesenc xmm2,xmm2,XMMWORD[16+r8] + vaesenc xmm2,xmm2,XMMWORD[32+r8] + vaesenc xmm2,xmm2,XMMWORD[48+r8] + vaesenc xmm2,xmm2,XMMWORD[64+r8] + vaesenc xmm2,xmm2,XMMWORD[80+r8] + vaesenc xmm2,xmm2,XMMWORD[96+r8] + vaesenc xmm2,xmm2,XMMWORD[112+r8] + vaesenc xmm2,xmm2,XMMWORD[128+r8] + vaesenc xmm2,xmm2,XMMWORD[144+r8] + vaesenclast xmm2,xmm2,XMMWORD[160+r8] + vpxor xmm2,xmm2,XMMWORD[rdi] + vmovdqu XMMWORD[rsi],xmm2 + add rdi,16 + add rsi,16 + + vpxor xmm0,xmm0,xmm2 + vmovdqa xmm1,XMMWORD[((-32))+rcx] + call GFMUL + + jmp NEAR $L$128_dec_loop2 + +$L$128_dec_out: + vmovdqu XMMWORD[rdx],xmm0 + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_aes128gcmsiv_dec: +global aes128gcmsiv_ecb_enc_block + +ALIGN 16 +aes128gcmsiv_ecb_enc_block: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aes128gcmsiv_ecb_enc_block: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + vmovdqa xmm1,XMMWORD[rdi] + + vpxor xmm1,xmm1,XMMWORD[rdx] + vaesenc xmm1,xmm1,XMMWORD[16+rdx] + vaesenc xmm1,xmm1,XMMWORD[32+rdx] + vaesenc xmm1,xmm1,XMMWORD[48+rdx] + vaesenc 
xmm1,xmm1,XMMWORD[64+rdx] + vaesenc xmm1,xmm1,XMMWORD[80+rdx] + vaesenc xmm1,xmm1,XMMWORD[96+rdx] + vaesenc xmm1,xmm1,XMMWORD[112+rdx] + vaesenc xmm1,xmm1,XMMWORD[128+rdx] + vaesenc xmm1,xmm1,XMMWORD[144+rdx] + vaesenclast xmm1,xmm1,XMMWORD[160+rdx] + + vmovdqa XMMWORD[rsi],xmm1 + + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_aes128gcmsiv_ecb_enc_block: +global aes256gcmsiv_aes_ks_enc_x1 + +ALIGN 16 +aes256gcmsiv_aes_ks_enc_x1: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aes256gcmsiv_aes_ks_enc_x1: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + + + +_CET_ENDBR + vmovdqa xmm0,XMMWORD[con1] + vmovdqa xmm15,XMMWORD[mask] + vmovdqa xmm8,XMMWORD[rdi] + vmovdqa xmm1,XMMWORD[rcx] + vmovdqa xmm3,XMMWORD[16+rcx] + vpxor xmm8,xmm8,xmm1 + vaesenc xmm8,xmm8,xmm3 + vmovdqu XMMWORD[rdx],xmm1 + vmovdqu XMMWORD[16+rdx],xmm3 + vpxor xmm14,xmm14,xmm14 + + vpshufb xmm2,xmm3,xmm15 + vaesenclast xmm2,xmm2,xmm0 + vpslld xmm0,xmm0,1 + vpslldq xmm4,xmm1,4 + vpxor xmm1,xmm1,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm1,xmm1,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm1,xmm1,xmm4 + vpxor xmm1,xmm1,xmm2 + vaesenc xmm8,xmm8,xmm1 + vmovdqu XMMWORD[32+rdx],xmm1 + + vpshufd xmm2,xmm1,0xff + vaesenclast xmm2,xmm2,xmm14 + vpslldq xmm4,xmm3,4 + vpxor xmm3,xmm3,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm3,xmm3,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm3,xmm3,xmm4 + vpxor xmm3,xmm3,xmm2 + vaesenc xmm8,xmm8,xmm3 + vmovdqu XMMWORD[48+rdx],xmm3 + + vpshufb xmm2,xmm3,xmm15 + vaesenclast xmm2,xmm2,xmm0 + vpslld xmm0,xmm0,1 + vpslldq xmm4,xmm1,4 + vpxor xmm1,xmm1,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm1,xmm1,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm1,xmm1,xmm4 + vpxor xmm1,xmm1,xmm2 + vaesenc xmm8,xmm8,xmm1 + vmovdqu XMMWORD[64+rdx],xmm1 + + vpshufd xmm2,xmm1,0xff + vaesenclast xmm2,xmm2,xmm14 + vpslldq xmm4,xmm3,4 + vpxor xmm3,xmm3,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm3,xmm3,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm3,xmm3,xmm4 + vpxor 
xmm3,xmm3,xmm2 + vaesenc xmm8,xmm8,xmm3 + vmovdqu XMMWORD[80+rdx],xmm3 + + vpshufb xmm2,xmm3,xmm15 + vaesenclast xmm2,xmm2,xmm0 + vpslld xmm0,xmm0,1 + vpslldq xmm4,xmm1,4 + vpxor xmm1,xmm1,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm1,xmm1,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm1,xmm1,xmm4 + vpxor xmm1,xmm1,xmm2 + vaesenc xmm8,xmm8,xmm1 + vmovdqu XMMWORD[96+rdx],xmm1 + + vpshufd xmm2,xmm1,0xff + vaesenclast xmm2,xmm2,xmm14 + vpslldq xmm4,xmm3,4 + vpxor xmm3,xmm3,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm3,xmm3,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm3,xmm3,xmm4 + vpxor xmm3,xmm3,xmm2 + vaesenc xmm8,xmm8,xmm3 + vmovdqu XMMWORD[112+rdx],xmm3 + + vpshufb xmm2,xmm3,xmm15 + vaesenclast xmm2,xmm2,xmm0 + vpslld xmm0,xmm0,1 + vpslldq xmm4,xmm1,4 + vpxor xmm1,xmm1,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm1,xmm1,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm1,xmm1,xmm4 + vpxor xmm1,xmm1,xmm2 + vaesenc xmm8,xmm8,xmm1 + vmovdqu XMMWORD[128+rdx],xmm1 + + vpshufd xmm2,xmm1,0xff + vaesenclast xmm2,xmm2,xmm14 + vpslldq xmm4,xmm3,4 + vpxor xmm3,xmm3,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm3,xmm3,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm3,xmm3,xmm4 + vpxor xmm3,xmm3,xmm2 + vaesenc xmm8,xmm8,xmm3 + vmovdqu XMMWORD[144+rdx],xmm3 + + vpshufb xmm2,xmm3,xmm15 + vaesenclast xmm2,xmm2,xmm0 + vpslld xmm0,xmm0,1 + vpslldq xmm4,xmm1,4 + vpxor xmm1,xmm1,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm1,xmm1,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm1,xmm1,xmm4 + vpxor xmm1,xmm1,xmm2 + vaesenc xmm8,xmm8,xmm1 + vmovdqu XMMWORD[160+rdx],xmm1 + + vpshufd xmm2,xmm1,0xff + vaesenclast xmm2,xmm2,xmm14 + vpslldq xmm4,xmm3,4 + vpxor xmm3,xmm3,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm3,xmm3,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm3,xmm3,xmm4 + vpxor xmm3,xmm3,xmm2 + vaesenc xmm8,xmm8,xmm3 + vmovdqu XMMWORD[176+rdx],xmm3 + + vpshufb xmm2,xmm3,xmm15 + vaesenclast xmm2,xmm2,xmm0 + vpslld xmm0,xmm0,1 + vpslldq xmm4,xmm1,4 + vpxor xmm1,xmm1,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm1,xmm1,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm1,xmm1,xmm4 + vpxor 
xmm1,xmm1,xmm2 + vaesenc xmm8,xmm8,xmm1 + vmovdqu XMMWORD[192+rdx],xmm1 + + vpshufd xmm2,xmm1,0xff + vaesenclast xmm2,xmm2,xmm14 + vpslldq xmm4,xmm3,4 + vpxor xmm3,xmm3,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm3,xmm3,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm3,xmm3,xmm4 + vpxor xmm3,xmm3,xmm2 + vaesenc xmm8,xmm8,xmm3 + vmovdqu XMMWORD[208+rdx],xmm3 + + vpshufb xmm2,xmm3,xmm15 + vaesenclast xmm2,xmm2,xmm0 + vpslldq xmm4,xmm1,4 + vpxor xmm1,xmm1,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm1,xmm1,xmm4 + vpslldq xmm4,xmm4,4 + vpxor xmm1,xmm1,xmm4 + vpxor xmm1,xmm1,xmm2 + vaesenclast xmm8,xmm8,xmm1 + vmovdqu XMMWORD[224+rdx],xmm1 + + vmovdqa XMMWORD[rsi],xmm8 + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_aes256gcmsiv_aes_ks_enc_x1: +global aes256gcmsiv_ecb_enc_block + +ALIGN 16 +aes256gcmsiv_ecb_enc_block: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aes256gcmsiv_ecb_enc_block: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + vmovdqa xmm1,XMMWORD[rdi] + vpxor xmm1,xmm1,XMMWORD[rdx] + vaesenc xmm1,xmm1,XMMWORD[16+rdx] + vaesenc xmm1,xmm1,XMMWORD[32+rdx] + vaesenc xmm1,xmm1,XMMWORD[48+rdx] + vaesenc xmm1,xmm1,XMMWORD[64+rdx] + vaesenc xmm1,xmm1,XMMWORD[80+rdx] + vaesenc xmm1,xmm1,XMMWORD[96+rdx] + vaesenc xmm1,xmm1,XMMWORD[112+rdx] + vaesenc xmm1,xmm1,XMMWORD[128+rdx] + vaesenc xmm1,xmm1,XMMWORD[144+rdx] + vaesenc xmm1,xmm1,XMMWORD[160+rdx] + vaesenc xmm1,xmm1,XMMWORD[176+rdx] + vaesenc xmm1,xmm1,XMMWORD[192+rdx] + vaesenc xmm1,xmm1,XMMWORD[208+rdx] + vaesenclast xmm1,xmm1,XMMWORD[224+rdx] + vmovdqa XMMWORD[rsi],xmm1 + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_aes256gcmsiv_ecb_enc_block: +global aes256gcmsiv_enc_msg_x4 + +ALIGN 16 +aes256gcmsiv_enc_msg_x4: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aes256gcmsiv_enc_msg_x4: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + 
+_CET_ENDBR + test r8,r8 + jnz NEAR $L$256_enc_msg_x4_start + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$256_enc_msg_x4_start: + mov r10,r8 + shr r8,4 + shl r10,60 + jz NEAR $L$256_enc_msg_x4_start2 + add r8,1 + +$L$256_enc_msg_x4_start2: + mov r10,r8 + shl r10,62 + shr r10,62 + + + vmovdqa xmm15,XMMWORD[rdx] + vpor xmm15,xmm15,XMMWORD[OR_MASK] + + vmovdqa xmm4,XMMWORD[four] + vmovdqa xmm0,xmm15 + vpaddd xmm1,xmm15,XMMWORD[one] + vpaddd xmm2,xmm15,XMMWORD[two] + vpaddd xmm3,xmm15,XMMWORD[three] + + shr r8,2 + je NEAR $L$256_enc_msg_x4_check_remainder + + sub rsi,64 + sub rdi,64 + +$L$256_enc_msg_x4_loop1: + add rsi,64 + add rdi,64 + + vmovdqa xmm5,xmm0 + vmovdqa xmm6,xmm1 + vmovdqa xmm7,xmm2 + vmovdqa xmm8,xmm3 + + vpxor xmm5,xmm5,XMMWORD[rcx] + vpxor xmm6,xmm6,XMMWORD[rcx] + vpxor xmm7,xmm7,XMMWORD[rcx] + vpxor xmm8,xmm8,XMMWORD[rcx] + + vmovdqu xmm12,XMMWORD[16+rcx] + vaesenc xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vpaddd xmm0,xmm0,xmm4 + vmovdqu xmm12,XMMWORD[32+rcx] + vaesenc xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vpaddd xmm1,xmm1,xmm4 + vmovdqu xmm12,XMMWORD[48+rcx] + vaesenc xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vpaddd xmm2,xmm2,xmm4 + vmovdqu xmm12,XMMWORD[64+rcx] + vaesenc xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vpaddd xmm3,xmm3,xmm4 + + vmovdqu xmm12,XMMWORD[80+rcx] + vaesenc xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vmovdqu xmm12,XMMWORD[96+rcx] + vaesenc xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vmovdqu xmm12,XMMWORD[112+rcx] + vaesenc xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vmovdqu xmm12,XMMWORD[128+rcx] + vaesenc 
xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vmovdqu xmm12,XMMWORD[144+rcx] + vaesenc xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vmovdqu xmm12,XMMWORD[160+rcx] + vaesenc xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vmovdqu xmm12,XMMWORD[176+rcx] + vaesenc xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vmovdqu xmm12,XMMWORD[192+rcx] + vaesenc xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vmovdqu xmm12,XMMWORD[208+rcx] + vaesenc xmm5,xmm5,xmm12 + vaesenc xmm6,xmm6,xmm12 + vaesenc xmm7,xmm7,xmm12 + vaesenc xmm8,xmm8,xmm12 + + vmovdqu xmm12,XMMWORD[224+rcx] + vaesenclast xmm5,xmm5,xmm12 + vaesenclast xmm6,xmm6,xmm12 + vaesenclast xmm7,xmm7,xmm12 + vaesenclast xmm8,xmm8,xmm12 + + + + vpxor xmm5,xmm5,XMMWORD[rdi] + vpxor xmm6,xmm6,XMMWORD[16+rdi] + vpxor xmm7,xmm7,XMMWORD[32+rdi] + vpxor xmm8,xmm8,XMMWORD[48+rdi] + + sub r8,1 + + vmovdqu XMMWORD[rsi],xmm5 + vmovdqu XMMWORD[16+rsi],xmm6 + vmovdqu XMMWORD[32+rsi],xmm7 + vmovdqu XMMWORD[48+rsi],xmm8 + + jne NEAR $L$256_enc_msg_x4_loop1 + + add rsi,64 + add rdi,64 + +$L$256_enc_msg_x4_check_remainder: + cmp r10,0 + je NEAR $L$256_enc_msg_x4_out + +$L$256_enc_msg_x4_loop2: + + + + vmovdqa xmm5,xmm0 + vpaddd xmm0,xmm0,XMMWORD[one] + vpxor xmm5,xmm5,XMMWORD[rcx] + vaesenc xmm5,xmm5,XMMWORD[16+rcx] + vaesenc xmm5,xmm5,XMMWORD[32+rcx] + vaesenc xmm5,xmm5,XMMWORD[48+rcx] + vaesenc xmm5,xmm5,XMMWORD[64+rcx] + vaesenc xmm5,xmm5,XMMWORD[80+rcx] + vaesenc xmm5,xmm5,XMMWORD[96+rcx] + vaesenc xmm5,xmm5,XMMWORD[112+rcx] + vaesenc xmm5,xmm5,XMMWORD[128+rcx] + vaesenc xmm5,xmm5,XMMWORD[144+rcx] + vaesenc xmm5,xmm5,XMMWORD[160+rcx] + vaesenc xmm5,xmm5,XMMWORD[176+rcx] + vaesenc xmm5,xmm5,XMMWORD[192+rcx] + vaesenc xmm5,xmm5,XMMWORD[208+rcx] + vaesenclast xmm5,xmm5,XMMWORD[224+rcx] 
+ + + vpxor xmm5,xmm5,XMMWORD[rdi] + + vmovdqu XMMWORD[rsi],xmm5 + + add rdi,16 + add rsi,16 + + sub r10,1 + jne NEAR $L$256_enc_msg_x4_loop2 + +$L$256_enc_msg_x4_out: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_aes256gcmsiv_enc_msg_x4: +global aes256gcmsiv_enc_msg_x8 + +ALIGN 16 +aes256gcmsiv_enc_msg_x8: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aes256gcmsiv_enc_msg_x8: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + +_CET_ENDBR + test r8,r8 + jnz NEAR $L$256_enc_msg_x8_start + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$256_enc_msg_x8_start: + + mov r11,rsp + sub r11,16 + and r11,-64 + + mov r10,r8 + shr r8,4 + shl r10,60 + jz NEAR $L$256_enc_msg_x8_start2 + add r8,1 + +$L$256_enc_msg_x8_start2: + mov r10,r8 + shl r10,61 + shr r10,61 + + + vmovdqa xmm1,XMMWORD[rdx] + vpor xmm1,xmm1,XMMWORD[OR_MASK] + + + vpaddd xmm0,xmm1,XMMWORD[seven] + vmovdqa XMMWORD[r11],xmm0 + vpaddd xmm9,xmm1,XMMWORD[one] + vpaddd xmm10,xmm1,XMMWORD[two] + vpaddd xmm11,xmm1,XMMWORD[three] + vpaddd xmm12,xmm1,XMMWORD[four] + vpaddd xmm13,xmm1,XMMWORD[five] + vpaddd xmm14,xmm1,XMMWORD[six] + vmovdqa xmm0,xmm1 + + shr r8,3 + jz NEAR $L$256_enc_msg_x8_check_remainder + + sub rsi,128 + sub rdi,128 + +$L$256_enc_msg_x8_loop1: + add rsi,128 + add rdi,128 + + vmovdqa xmm1,xmm0 + vmovdqa xmm2,xmm9 + vmovdqa xmm3,xmm10 + vmovdqa xmm4,xmm11 + vmovdqa xmm5,xmm12 + vmovdqa xmm6,xmm13 + vmovdqa xmm7,xmm14 + + vmovdqa xmm8,XMMWORD[r11] + + vpxor xmm1,xmm1,XMMWORD[rcx] + vpxor xmm2,xmm2,XMMWORD[rcx] + vpxor xmm3,xmm3,XMMWORD[rcx] + vpxor xmm4,xmm4,XMMWORD[rcx] + vpxor xmm5,xmm5,XMMWORD[rcx] + vpxor xmm6,xmm6,XMMWORD[rcx] + vpxor xmm7,xmm7,XMMWORD[rcx] + vpxor xmm8,xmm8,XMMWORD[rcx] + + vmovdqu xmm15,XMMWORD[16+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc xmm5,xmm5,xmm15 + vaesenc 
xmm6,xmm6,xmm15 + vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vmovdqa xmm14,XMMWORD[r11] + vpaddd xmm14,xmm14,XMMWORD[eight] + vmovdqa XMMWORD[r11],xmm14 + vmovdqu xmm15,XMMWORD[32+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc xmm5,xmm5,xmm15 + vaesenc xmm6,xmm6,xmm15 + vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vpsubd xmm14,xmm14,XMMWORD[one] + vmovdqu xmm15,XMMWORD[48+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc xmm5,xmm5,xmm15 + vaesenc xmm6,xmm6,xmm15 + vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vpaddd xmm0,xmm0,XMMWORD[eight] + vmovdqu xmm15,XMMWORD[64+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc xmm5,xmm5,xmm15 + vaesenc xmm6,xmm6,xmm15 + vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vpaddd xmm9,xmm9,XMMWORD[eight] + vmovdqu xmm15,XMMWORD[80+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc xmm5,xmm5,xmm15 + vaesenc xmm6,xmm6,xmm15 + vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vpaddd xmm10,xmm10,XMMWORD[eight] + vmovdqu xmm15,XMMWORD[96+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc xmm5,xmm5,xmm15 + vaesenc xmm6,xmm6,xmm15 + vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vpaddd xmm11,xmm11,XMMWORD[eight] + vmovdqu xmm15,XMMWORD[112+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc xmm5,xmm5,xmm15 + vaesenc xmm6,xmm6,xmm15 + vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vpaddd xmm12,xmm12,XMMWORD[eight] + vmovdqu xmm15,XMMWORD[128+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc xmm5,xmm5,xmm15 + vaesenc 
xmm6,xmm6,xmm15 + vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vpaddd xmm13,xmm13,XMMWORD[eight] + vmovdqu xmm15,XMMWORD[144+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc xmm5,xmm5,xmm15 + vaesenc xmm6,xmm6,xmm15 + vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vmovdqu xmm15,XMMWORD[160+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc xmm5,xmm5,xmm15 + vaesenc xmm6,xmm6,xmm15 + vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vmovdqu xmm15,XMMWORD[176+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc xmm5,xmm5,xmm15 + vaesenc xmm6,xmm6,xmm15 + vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vmovdqu xmm15,XMMWORD[192+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc xmm5,xmm5,xmm15 + vaesenc xmm6,xmm6,xmm15 + vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vmovdqu xmm15,XMMWORD[208+rcx] + vaesenc xmm1,xmm1,xmm15 + vaesenc xmm2,xmm2,xmm15 + vaesenc xmm3,xmm3,xmm15 + vaesenc xmm4,xmm4,xmm15 + vaesenc xmm5,xmm5,xmm15 + vaesenc xmm6,xmm6,xmm15 + vaesenc xmm7,xmm7,xmm15 + vaesenc xmm8,xmm8,xmm15 + + vmovdqu xmm15,XMMWORD[224+rcx] + vaesenclast xmm1,xmm1,xmm15 + vaesenclast xmm2,xmm2,xmm15 + vaesenclast xmm3,xmm3,xmm15 + vaesenclast xmm4,xmm4,xmm15 + vaesenclast xmm5,xmm5,xmm15 + vaesenclast xmm6,xmm6,xmm15 + vaesenclast xmm7,xmm7,xmm15 + vaesenclast xmm8,xmm8,xmm15 + + + + vpxor xmm1,xmm1,XMMWORD[rdi] + vpxor xmm2,xmm2,XMMWORD[16+rdi] + vpxor xmm3,xmm3,XMMWORD[32+rdi] + vpxor xmm4,xmm4,XMMWORD[48+rdi] + vpxor xmm5,xmm5,XMMWORD[64+rdi] + vpxor xmm6,xmm6,XMMWORD[80+rdi] + vpxor xmm7,xmm7,XMMWORD[96+rdi] + vpxor xmm8,xmm8,XMMWORD[112+rdi] + + sub r8,1 + + vmovdqu XMMWORD[rsi],xmm1 + vmovdqu XMMWORD[16+rsi],xmm2 + vmovdqu XMMWORD[32+rsi],xmm3 + vmovdqu 
XMMWORD[48+rsi],xmm4 + vmovdqu XMMWORD[64+rsi],xmm5 + vmovdqu XMMWORD[80+rsi],xmm6 + vmovdqu XMMWORD[96+rsi],xmm7 + vmovdqu XMMWORD[112+rsi],xmm8 + + jne NEAR $L$256_enc_msg_x8_loop1 + + add rsi,128 + add rdi,128 + +$L$256_enc_msg_x8_check_remainder: + cmp r10,0 + je NEAR $L$256_enc_msg_x8_out + +$L$256_enc_msg_x8_loop2: + + + vmovdqa xmm1,xmm0 + vpaddd xmm0,xmm0,XMMWORD[one] + + vpxor xmm1,xmm1,XMMWORD[rcx] + vaesenc xmm1,xmm1,XMMWORD[16+rcx] + vaesenc xmm1,xmm1,XMMWORD[32+rcx] + vaesenc xmm1,xmm1,XMMWORD[48+rcx] + vaesenc xmm1,xmm1,XMMWORD[64+rcx] + vaesenc xmm1,xmm1,XMMWORD[80+rcx] + vaesenc xmm1,xmm1,XMMWORD[96+rcx] + vaesenc xmm1,xmm1,XMMWORD[112+rcx] + vaesenc xmm1,xmm1,XMMWORD[128+rcx] + vaesenc xmm1,xmm1,XMMWORD[144+rcx] + vaesenc xmm1,xmm1,XMMWORD[160+rcx] + vaesenc xmm1,xmm1,XMMWORD[176+rcx] + vaesenc xmm1,xmm1,XMMWORD[192+rcx] + vaesenc xmm1,xmm1,XMMWORD[208+rcx] + vaesenclast xmm1,xmm1,XMMWORD[224+rcx] + + + vpxor xmm1,xmm1,XMMWORD[rdi] + + vmovdqu XMMWORD[rsi],xmm1 + + add rdi,16 + add rsi,16 + sub r10,1 + jnz NEAR $L$256_enc_msg_x8_loop2 + +$L$256_enc_msg_x8_out: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + + +$L$SEH_end_aes256gcmsiv_enc_msg_x8: +global aes256gcmsiv_dec + +ALIGN 16 +aes256gcmsiv_dec: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aes256gcmsiv_dec: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +_CET_ENDBR + test r9,~15 + jnz NEAR $L$256_dec_start + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$256_dec_start: + vzeroupper + vmovdqa xmm0,XMMWORD[rdx] + + + vmovdqu xmm15,XMMWORD[16+rdx] + vpor xmm15,xmm15,XMMWORD[OR_MASK] + mov rax,rdx + + lea rax,[32+rax] + lea rcx,[32+rcx] + + and r9,~15 + + + cmp r9,96 + jb NEAR $L$256_dec_loop2 + + + sub r9,96 + vmovdqa xmm7,xmm15 + vpaddd xmm8,xmm7,XMMWORD[one] + vpaddd xmm9,xmm7,XMMWORD[two] + vpaddd xmm10,xmm9,XMMWORD[one] + vpaddd 
xmm11,xmm9,XMMWORD[two] + vpaddd xmm12,xmm11,XMMWORD[one] + vpaddd xmm15,xmm11,XMMWORD[two] + + vpxor xmm7,xmm7,XMMWORD[r8] + vpxor xmm8,xmm8,XMMWORD[r8] + vpxor xmm9,xmm9,XMMWORD[r8] + vpxor xmm10,xmm10,XMMWORD[r8] + vpxor xmm11,xmm11,XMMWORD[r8] + vpxor xmm12,xmm12,XMMWORD[r8] + + vmovdqu xmm4,XMMWORD[16+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[32+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[48+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[64+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[80+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[96+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[112+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[128+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[144+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu 
xmm4,XMMWORD[160+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[176+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[192+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[208+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[224+r8] + vaesenclast xmm7,xmm7,xmm4 + vaesenclast xmm8,xmm8,xmm4 + vaesenclast xmm9,xmm9,xmm4 + vaesenclast xmm10,xmm10,xmm4 + vaesenclast xmm11,xmm11,xmm4 + vaesenclast xmm12,xmm12,xmm4 + + + vpxor xmm7,xmm7,XMMWORD[rdi] + vpxor xmm8,xmm8,XMMWORD[16+rdi] + vpxor xmm9,xmm9,XMMWORD[32+rdi] + vpxor xmm10,xmm10,XMMWORD[48+rdi] + vpxor xmm11,xmm11,XMMWORD[64+rdi] + vpxor xmm12,xmm12,XMMWORD[80+rdi] + + vmovdqu XMMWORD[rsi],xmm7 + vmovdqu XMMWORD[16+rsi],xmm8 + vmovdqu XMMWORD[32+rsi],xmm9 + vmovdqu XMMWORD[48+rsi],xmm10 + vmovdqu XMMWORD[64+rsi],xmm11 + vmovdqu XMMWORD[80+rsi],xmm12 + + add rdi,96 + add rsi,96 + jmp NEAR $L$256_dec_loop1 + + +ALIGN 64 +$L$256_dec_loop1: + cmp r9,96 + jb NEAR $L$256_dec_finish_96 + sub r9,96 + + vmovdqa xmm6,xmm12 + vmovdqa XMMWORD[(16-32)+rax],xmm11 + vmovdqa XMMWORD[(32-32)+rax],xmm10 + vmovdqa XMMWORD[(48-32)+rax],xmm9 + vmovdqa XMMWORD[(64-32)+rax],xmm8 + vmovdqa XMMWORD[(80-32)+rax],xmm7 + + vmovdqa xmm7,xmm15 + vpaddd xmm8,xmm7,XMMWORD[one] + vpaddd xmm9,xmm7,XMMWORD[two] + vpaddd xmm10,xmm9,XMMWORD[one] + vpaddd xmm11,xmm9,XMMWORD[two] + vpaddd xmm12,xmm11,XMMWORD[one] + vpaddd xmm15,xmm11,XMMWORD[two] + + vmovdqa xmm4,XMMWORD[r8] + vpxor xmm7,xmm7,xmm4 + vpxor 
xmm8,xmm8,xmm4 + vpxor xmm9,xmm9,xmm4 + vpxor xmm10,xmm10,xmm4 + vpxor xmm11,xmm11,xmm4 + vpxor xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[((0-32))+rcx] + vpclmulqdq xmm2,xmm6,xmm4,0x11 + vpclmulqdq xmm3,xmm6,xmm4,0x00 + vpclmulqdq xmm1,xmm6,xmm4,0x01 + vpclmulqdq xmm4,xmm6,xmm4,0x10 + vpxor xmm1,xmm1,xmm4 + + vmovdqu xmm4,XMMWORD[16+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm6,XMMWORD[((-16))+rax] + vmovdqu xmm13,XMMWORD[((-16))+rcx] + + vpclmulqdq xmm4,xmm6,xmm13,0x10 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x11 + vpxor xmm2,xmm2,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x00 + vpxor xmm3,xmm3,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x01 + vpxor xmm1,xmm1,xmm4 + + + vmovdqu xmm4,XMMWORD[32+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm6,XMMWORD[rax] + vmovdqu xmm13,XMMWORD[rcx] + + vpclmulqdq xmm4,xmm6,xmm13,0x10 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x11 + vpxor xmm2,xmm2,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x00 + vpxor xmm3,xmm3,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x01 + vpxor xmm1,xmm1,xmm4 + + + vmovdqu xmm4,XMMWORD[48+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm6,XMMWORD[16+rax] + vmovdqu xmm13,XMMWORD[16+rcx] + + vpclmulqdq xmm4,xmm6,xmm13,0x10 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x11 + vpxor xmm2,xmm2,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x00 + vpxor xmm3,xmm3,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x01 + vpxor xmm1,xmm1,xmm4 + + + vmovdqu xmm4,XMMWORD[64+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm6,XMMWORD[32+rax] + vmovdqu 
xmm13,XMMWORD[32+rcx] + + vpclmulqdq xmm4,xmm6,xmm13,0x10 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x11 + vpxor xmm2,xmm2,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x00 + vpxor xmm3,xmm3,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x01 + vpxor xmm1,xmm1,xmm4 + + + vmovdqu xmm4,XMMWORD[80+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[96+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[112+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + + vmovdqa xmm6,XMMWORD[((80-32))+rax] + vpxor xmm6,xmm6,xmm0 + vmovdqu xmm5,XMMWORD[((80-32))+rcx] + + vpclmulqdq xmm4,xmm6,xmm5,0x01 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm4,xmm6,xmm5,0x11 + vpxor xmm2,xmm2,xmm4 + vpclmulqdq xmm4,xmm6,xmm5,0x00 + vpxor xmm3,xmm3,xmm4 + vpclmulqdq xmm4,xmm6,xmm5,0x10 + vpxor xmm1,xmm1,xmm4 + + vmovdqu xmm4,XMMWORD[128+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + + vpsrldq xmm4,xmm1,8 + vpxor xmm5,xmm2,xmm4 + vpslldq xmm4,xmm1,8 + vpxor xmm0,xmm3,xmm4 + + vmovdqa xmm3,XMMWORD[poly] + + vmovdqu xmm4,XMMWORD[144+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[160+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[176+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc 
xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[192+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm4,XMMWORD[208+r8] + vaesenc xmm7,xmm7,xmm4 + vaesenc xmm8,xmm8,xmm4 + vaesenc xmm9,xmm9,xmm4 + vaesenc xmm10,xmm10,xmm4 + vaesenc xmm11,xmm11,xmm4 + vaesenc xmm12,xmm12,xmm4 + + vmovdqu xmm6,XMMWORD[224+r8] + vpalignr xmm2,xmm0,xmm0,8 + vpclmulqdq xmm0,xmm0,xmm3,0x10 + vpxor xmm0,xmm2,xmm0 + + vpxor xmm4,xmm6,XMMWORD[rdi] + vaesenclast xmm7,xmm7,xmm4 + vpxor xmm4,xmm6,XMMWORD[16+rdi] + vaesenclast xmm8,xmm8,xmm4 + vpxor xmm4,xmm6,XMMWORD[32+rdi] + vaesenclast xmm9,xmm9,xmm4 + vpxor xmm4,xmm6,XMMWORD[48+rdi] + vaesenclast xmm10,xmm10,xmm4 + vpxor xmm4,xmm6,XMMWORD[64+rdi] + vaesenclast xmm11,xmm11,xmm4 + vpxor xmm4,xmm6,XMMWORD[80+rdi] + vaesenclast xmm12,xmm12,xmm4 + + vpalignr xmm2,xmm0,xmm0,8 + vpclmulqdq xmm0,xmm0,xmm3,0x10 + vpxor xmm0,xmm2,xmm0 + + vmovdqu XMMWORD[rsi],xmm7 + vmovdqu XMMWORD[16+rsi],xmm8 + vmovdqu XMMWORD[32+rsi],xmm9 + vmovdqu XMMWORD[48+rsi],xmm10 + vmovdqu XMMWORD[64+rsi],xmm11 + vmovdqu XMMWORD[80+rsi],xmm12 + + vpxor xmm0,xmm0,xmm5 + + lea rdi,[96+rdi] + lea rsi,[96+rsi] + jmp NEAR $L$256_dec_loop1 + +$L$256_dec_finish_96: + vmovdqa xmm6,xmm12 + vmovdqa XMMWORD[(16-32)+rax],xmm11 + vmovdqa XMMWORD[(32-32)+rax],xmm10 + vmovdqa XMMWORD[(48-32)+rax],xmm9 + vmovdqa XMMWORD[(64-32)+rax],xmm8 + vmovdqa XMMWORD[(80-32)+rax],xmm7 + + vmovdqu xmm4,XMMWORD[((0-32))+rcx] + vpclmulqdq xmm1,xmm6,xmm4,0x10 + vpclmulqdq xmm2,xmm6,xmm4,0x11 + vpclmulqdq xmm3,xmm6,xmm4,0x00 + vpclmulqdq xmm4,xmm6,xmm4,0x01 + vpxor xmm1,xmm1,xmm4 + + vmovdqu xmm6,XMMWORD[((-16))+rax] + vmovdqu xmm13,XMMWORD[((-16))+rcx] + + vpclmulqdq xmm4,xmm6,xmm13,0x10 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x11 + vpxor xmm2,xmm2,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x00 + vpxor xmm3,xmm3,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x01 + 
vpxor xmm1,xmm1,xmm4 + + vmovdqu xmm6,XMMWORD[rax] + vmovdqu xmm13,XMMWORD[rcx] + + vpclmulqdq xmm4,xmm6,xmm13,0x10 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x11 + vpxor xmm2,xmm2,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x00 + vpxor xmm3,xmm3,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x01 + vpxor xmm1,xmm1,xmm4 + + vmovdqu xmm6,XMMWORD[16+rax] + vmovdqu xmm13,XMMWORD[16+rcx] + + vpclmulqdq xmm4,xmm6,xmm13,0x10 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x11 + vpxor xmm2,xmm2,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x00 + vpxor xmm3,xmm3,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x01 + vpxor xmm1,xmm1,xmm4 + + vmovdqu xmm6,XMMWORD[32+rax] + vmovdqu xmm13,XMMWORD[32+rcx] + + vpclmulqdq xmm4,xmm6,xmm13,0x10 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x11 + vpxor xmm2,xmm2,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x00 + vpxor xmm3,xmm3,xmm4 + vpclmulqdq xmm4,xmm6,xmm13,0x01 + vpxor xmm1,xmm1,xmm4 + + + vmovdqu xmm6,XMMWORD[((80-32))+rax] + vpxor xmm6,xmm6,xmm0 + vmovdqu xmm5,XMMWORD[((80-32))+rcx] + vpclmulqdq xmm4,xmm6,xmm5,0x11 + vpxor xmm2,xmm2,xmm4 + vpclmulqdq xmm4,xmm6,xmm5,0x00 + vpxor xmm3,xmm3,xmm4 + vpclmulqdq xmm4,xmm6,xmm5,0x10 + vpxor xmm1,xmm1,xmm4 + vpclmulqdq xmm4,xmm6,xmm5,0x01 + vpxor xmm1,xmm1,xmm4 + + vpsrldq xmm4,xmm1,8 + vpxor xmm5,xmm2,xmm4 + vpslldq xmm4,xmm1,8 + vpxor xmm0,xmm3,xmm4 + + vmovdqa xmm3,XMMWORD[poly] + + vpalignr xmm2,xmm0,xmm0,8 + vpclmulqdq xmm0,xmm0,xmm3,0x10 + vpxor xmm0,xmm2,xmm0 + + vpalignr xmm2,xmm0,xmm0,8 + vpclmulqdq xmm0,xmm0,xmm3,0x10 + vpxor xmm0,xmm2,xmm0 + + vpxor xmm0,xmm0,xmm5 + +$L$256_dec_loop2: + + + + cmp r9,16 + jb NEAR $L$256_dec_out + sub r9,16 + + vmovdqa xmm2,xmm15 + vpaddd xmm15,xmm15,XMMWORD[one] + + vpxor xmm2,xmm2,XMMWORD[r8] + vaesenc xmm2,xmm2,XMMWORD[16+r8] + vaesenc xmm2,xmm2,XMMWORD[32+r8] + vaesenc xmm2,xmm2,XMMWORD[48+r8] + vaesenc xmm2,xmm2,XMMWORD[64+r8] + vaesenc xmm2,xmm2,XMMWORD[80+r8] + vaesenc xmm2,xmm2,XMMWORD[96+r8] + vaesenc xmm2,xmm2,XMMWORD[112+r8] + vaesenc xmm2,xmm2,XMMWORD[128+r8] + 
vaesenc xmm2,xmm2,XMMWORD[144+r8] + vaesenc xmm2,xmm2,XMMWORD[160+r8] + vaesenc xmm2,xmm2,XMMWORD[176+r8] + vaesenc xmm2,xmm2,XMMWORD[192+r8] + vaesenc xmm2,xmm2,XMMWORD[208+r8] + vaesenclast xmm2,xmm2,XMMWORD[224+r8] + vpxor xmm2,xmm2,XMMWORD[rdi] + vmovdqu XMMWORD[rsi],xmm2 + add rdi,16 + add rsi,16 + + vpxor xmm0,xmm0,xmm2 + vmovdqa xmm1,XMMWORD[((-32))+rcx] + call GFMUL + + jmp NEAR $L$256_dec_loop2 + +$L$256_dec_out: + vmovdqu XMMWORD[rdx],xmm0 + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_aes256gcmsiv_dec: +global aes256gcmsiv_kdf + +ALIGN 16 +aes256gcmsiv_kdf: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_aes256gcmsiv_kdf: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + + + + + vmovdqa xmm1,XMMWORD[rdx] + vmovdqa xmm4,XMMWORD[rdi] + vmovdqa xmm11,XMMWORD[and_mask] + vmovdqa xmm8,XMMWORD[one] + vpshufd xmm4,xmm4,0x90 + vpand xmm4,xmm4,xmm11 + vpaddd xmm6,xmm4,xmm8 + vpaddd xmm7,xmm6,xmm8 + vpaddd xmm11,xmm7,xmm8 + vpaddd xmm12,xmm11,xmm8 + vpaddd xmm13,xmm12,xmm8 + + vpxor xmm4,xmm4,xmm1 + vpxor xmm6,xmm6,xmm1 + vpxor xmm7,xmm7,xmm1 + vpxor xmm11,xmm11,xmm1 + vpxor xmm12,xmm12,xmm1 + vpxor xmm13,xmm13,xmm1 + + vmovdqa xmm1,XMMWORD[16+rdx] + vaesenc xmm4,xmm4,xmm1 + vaesenc xmm6,xmm6,xmm1 + vaesenc xmm7,xmm7,xmm1 + vaesenc xmm11,xmm11,xmm1 + vaesenc xmm12,xmm12,xmm1 + vaesenc xmm13,xmm13,xmm1 + + vmovdqa xmm2,XMMWORD[32+rdx] + vaesenc xmm4,xmm4,xmm2 + vaesenc xmm6,xmm6,xmm2 + vaesenc xmm7,xmm7,xmm2 + vaesenc xmm11,xmm11,xmm2 + vaesenc xmm12,xmm12,xmm2 + vaesenc xmm13,xmm13,xmm2 + + vmovdqa xmm1,XMMWORD[48+rdx] + vaesenc xmm4,xmm4,xmm1 + vaesenc xmm6,xmm6,xmm1 + vaesenc xmm7,xmm7,xmm1 + vaesenc xmm11,xmm11,xmm1 + vaesenc xmm12,xmm12,xmm1 + vaesenc xmm13,xmm13,xmm1 + + vmovdqa xmm2,XMMWORD[64+rdx] + vaesenc xmm4,xmm4,xmm2 + vaesenc xmm6,xmm6,xmm2 + vaesenc xmm7,xmm7,xmm2 + vaesenc xmm11,xmm11,xmm2 + vaesenc xmm12,xmm12,xmm2 + vaesenc xmm13,xmm13,xmm2 + + vmovdqa 
xmm1,XMMWORD[80+rdx] + vaesenc xmm4,xmm4,xmm1 + vaesenc xmm6,xmm6,xmm1 + vaesenc xmm7,xmm7,xmm1 + vaesenc xmm11,xmm11,xmm1 + vaesenc xmm12,xmm12,xmm1 + vaesenc xmm13,xmm13,xmm1 + + vmovdqa xmm2,XMMWORD[96+rdx] + vaesenc xmm4,xmm4,xmm2 + vaesenc xmm6,xmm6,xmm2 + vaesenc xmm7,xmm7,xmm2 + vaesenc xmm11,xmm11,xmm2 + vaesenc xmm12,xmm12,xmm2 + vaesenc xmm13,xmm13,xmm2 + + vmovdqa xmm1,XMMWORD[112+rdx] + vaesenc xmm4,xmm4,xmm1 + vaesenc xmm6,xmm6,xmm1 + vaesenc xmm7,xmm7,xmm1 + vaesenc xmm11,xmm11,xmm1 + vaesenc xmm12,xmm12,xmm1 + vaesenc xmm13,xmm13,xmm1 + + vmovdqa xmm2,XMMWORD[128+rdx] + vaesenc xmm4,xmm4,xmm2 + vaesenc xmm6,xmm6,xmm2 + vaesenc xmm7,xmm7,xmm2 + vaesenc xmm11,xmm11,xmm2 + vaesenc xmm12,xmm12,xmm2 + vaesenc xmm13,xmm13,xmm2 + + vmovdqa xmm1,XMMWORD[144+rdx] + vaesenc xmm4,xmm4,xmm1 + vaesenc xmm6,xmm6,xmm1 + vaesenc xmm7,xmm7,xmm1 + vaesenc xmm11,xmm11,xmm1 + vaesenc xmm12,xmm12,xmm1 + vaesenc xmm13,xmm13,xmm1 + + vmovdqa xmm2,XMMWORD[160+rdx] + vaesenc xmm4,xmm4,xmm2 + vaesenc xmm6,xmm6,xmm2 + vaesenc xmm7,xmm7,xmm2 + vaesenc xmm11,xmm11,xmm2 + vaesenc xmm12,xmm12,xmm2 + vaesenc xmm13,xmm13,xmm2 + + vmovdqa xmm1,XMMWORD[176+rdx] + vaesenc xmm4,xmm4,xmm1 + vaesenc xmm6,xmm6,xmm1 + vaesenc xmm7,xmm7,xmm1 + vaesenc xmm11,xmm11,xmm1 + vaesenc xmm12,xmm12,xmm1 + vaesenc xmm13,xmm13,xmm1 + + vmovdqa xmm2,XMMWORD[192+rdx] + vaesenc xmm4,xmm4,xmm2 + vaesenc xmm6,xmm6,xmm2 + vaesenc xmm7,xmm7,xmm2 + vaesenc xmm11,xmm11,xmm2 + vaesenc xmm12,xmm12,xmm2 + vaesenc xmm13,xmm13,xmm2 + + vmovdqa xmm1,XMMWORD[208+rdx] + vaesenc xmm4,xmm4,xmm1 + vaesenc xmm6,xmm6,xmm1 + vaesenc xmm7,xmm7,xmm1 + vaesenc xmm11,xmm11,xmm1 + vaesenc xmm12,xmm12,xmm1 + vaesenc xmm13,xmm13,xmm1 + + vmovdqa xmm2,XMMWORD[224+rdx] + vaesenclast xmm4,xmm4,xmm2 + vaesenclast xmm6,xmm6,xmm2 + vaesenclast xmm7,xmm7,xmm2 + vaesenclast xmm11,xmm11,xmm2 + vaesenclast xmm12,xmm12,xmm2 + vaesenclast xmm13,xmm13,xmm2 + + + vmovdqa XMMWORD[rsi],xmm4 + vmovdqa XMMWORD[16+rsi],xmm6 + vmovdqa 
XMMWORD[32+rsi],xmm7 + vmovdqa XMMWORD[48+rsi],xmm11 + vmovdqa XMMWORD[64+rsi],xmm12 + vmovdqa XMMWORD[80+rsi],xmm13 + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_aes256gcmsiv_kdf: +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/third_party/boringssl/gen/crypto/chacha-armv4-linux.S b/third_party/boringssl/gen/crypto/chacha-armv4-linux.S new file mode 100644 index 00000000..6532e00d --- /dev/null +++ b/third_party/boringssl/gen/crypto/chacha-armv4-linux.S @@ -0,0 +1,1449 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__) +@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both +@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. +.arch armv7-a + +.text +#if defined(__thumb2__) || defined(__clang__) +.syntax unified +#endif +#if defined(__thumb2__) +.thumb +#else +.code 32 +#endif + +#if defined(__thumb2__) || defined(__clang__) +#define ldrhsb ldrbhs +#endif + +.align 5 +.Lsigma: +.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral +.Lone: +.long 1,0,0,0 + +.globl ChaCha20_ctr32_nohw +.hidden ChaCha20_ctr32_nohw +.type ChaCha20_ctr32_nohw,%function +.align 5 +ChaCha20_ctr32_nohw: + ldr r12,[sp,#0] @ pull pointer to counter and nonce + stmdb sp!,{r0,r1,r2,r4-r11,lr} + adr r14,.Lsigma + ldmia r12,{r4,r5,r6,r7} @ load counter and nonce + sub sp,sp,#4*(16) @ off-load area + stmdb sp!,{r4,r5,r6,r7} @ copy counter and nonce + ldmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} @ load key + ldmia r14,{r0,r1,r2,r3} @ load sigma + stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11} @ copy key + stmdb sp!,{r0,r1,r2,r3} @ copy sigma + str r10,[sp,#4*(16+10)] @ off-load "rx" + str r11,[sp,#4*(16+11)] @ off-load "rx" + b .Loop_outer_enter + +.align 4 +.Loop_outer: + ldmia sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} @ load key 
material + str r11,[sp,#4*(32+2)] @ save len + str r12, [sp,#4*(32+1)] @ save inp + str r14, [sp,#4*(32+0)] @ save out +.Loop_outer_enter: + ldr r11, [sp,#4*(15)] + ldr r12,[sp,#4*(12)] @ modulo-scheduled load + ldr r10, [sp,#4*(13)] + ldr r14,[sp,#4*(14)] + str r11, [sp,#4*(16+15)] + mov r11,#10 + b .Loop + +.align 4 +.Loop: + subs r11,r11,#1 + add r0,r0,r4 + mov r12,r12,ror#16 + add r1,r1,r5 + mov r10,r10,ror#16 + eor r12,r12,r0,ror#16 + eor r10,r10,r1,ror#16 + add r8,r8,r12 + mov r4,r4,ror#20 + add r9,r9,r10 + mov r5,r5,ror#20 + eor r4,r4,r8,ror#20 + eor r5,r5,r9,ror#20 + add r0,r0,r4 + mov r12,r12,ror#24 + add r1,r1,r5 + mov r10,r10,ror#24 + eor r12,r12,r0,ror#24 + eor r10,r10,r1,ror#24 + add r8,r8,r12 + mov r4,r4,ror#25 + add r9,r9,r10 + mov r5,r5,ror#25 + str r10,[sp,#4*(16+13)] + ldr r10,[sp,#4*(16+15)] + eor r4,r4,r8,ror#25 + eor r5,r5,r9,ror#25 + str r8,[sp,#4*(16+8)] + ldr r8,[sp,#4*(16+10)] + add r2,r2,r6 + mov r14,r14,ror#16 + str r9,[sp,#4*(16+9)] + ldr r9,[sp,#4*(16+11)] + add r3,r3,r7 + mov r10,r10,ror#16 + eor r14,r14,r2,ror#16 + eor r10,r10,r3,ror#16 + add r8,r8,r14 + mov r6,r6,ror#20 + add r9,r9,r10 + mov r7,r7,ror#20 + eor r6,r6,r8,ror#20 + eor r7,r7,r9,ror#20 + add r2,r2,r6 + mov r14,r14,ror#24 + add r3,r3,r7 + mov r10,r10,ror#24 + eor r14,r14,r2,ror#24 + eor r10,r10,r3,ror#24 + add r8,r8,r14 + mov r6,r6,ror#25 + add r9,r9,r10 + mov r7,r7,ror#25 + eor r6,r6,r8,ror#25 + eor r7,r7,r9,ror#25 + add r0,r0,r5 + mov r10,r10,ror#16 + add r1,r1,r6 + mov r12,r12,ror#16 + eor r10,r10,r0,ror#16 + eor r12,r12,r1,ror#16 + add r8,r8,r10 + mov r5,r5,ror#20 + add r9,r9,r12 + mov r6,r6,ror#20 + eor r5,r5,r8,ror#20 + eor r6,r6,r9,ror#20 + add r0,r0,r5 + mov r10,r10,ror#24 + add r1,r1,r6 + mov r12,r12,ror#24 + eor r10,r10,r0,ror#24 + eor r12,r12,r1,ror#24 + add r8,r8,r10 + mov r5,r5,ror#25 + str r10,[sp,#4*(16+15)] + ldr r10,[sp,#4*(16+13)] + add r9,r9,r12 + mov r6,r6,ror#25 + eor r5,r5,r8,ror#25 + eor r6,r6,r9,ror#25 + str r8,[sp,#4*(16+10)] + ldr 
r8,[sp,#4*(16+8)] + add r2,r2,r7 + mov r10,r10,ror#16 + str r9,[sp,#4*(16+11)] + ldr r9,[sp,#4*(16+9)] + add r3,r3,r4 + mov r14,r14,ror#16 + eor r10,r10,r2,ror#16 + eor r14,r14,r3,ror#16 + add r8,r8,r10 + mov r7,r7,ror#20 + add r9,r9,r14 + mov r4,r4,ror#20 + eor r7,r7,r8,ror#20 + eor r4,r4,r9,ror#20 + add r2,r2,r7 + mov r10,r10,ror#24 + add r3,r3,r4 + mov r14,r14,ror#24 + eor r10,r10,r2,ror#24 + eor r14,r14,r3,ror#24 + add r8,r8,r10 + mov r7,r7,ror#25 + add r9,r9,r14 + mov r4,r4,ror#25 + eor r7,r7,r8,ror#25 + eor r4,r4,r9,ror#25 + bne .Loop + + ldr r11,[sp,#4*(32+2)] @ load len + + str r8, [sp,#4*(16+8)] @ modulo-scheduled store + str r9, [sp,#4*(16+9)] + str r12,[sp,#4*(16+12)] + str r10, [sp,#4*(16+13)] + str r14,[sp,#4*(16+14)] + + @ at this point we have first half of 512-bit result in + @ rx and second half at sp+4*(16+8) + + cmp r11,#64 @ done yet? +#ifdef __thumb2__ + itete lo +#endif + addlo r12,sp,#4*(0) @ shortcut or ... + ldrhs r12,[sp,#4*(32+1)] @ ... load inp + addlo r14,sp,#4*(0) @ shortcut or ... + ldrhs r14,[sp,#4*(32+0)] @ ... load out + + ldr r8,[sp,#4*(0)] @ load key material + ldr r9,[sp,#4*(1)] + +#if __ARM_ARCH>=6 || !defined(__ARMEB__) +# if __ARM_ARCH<7 + orr r10,r12,r14 + tst r10,#3 @ are input and output aligned? 
+ ldr r10,[sp,#4*(2)] + bne .Lunaligned + cmp r11,#64 @ restore flags +# else + ldr r10,[sp,#4*(2)] +# endif + ldr r11,[sp,#4*(3)] + + add r0,r0,r8 @ accumulate key material + add r1,r1,r9 +# ifdef __thumb2__ + itt hs +# endif + ldrhs r8,[r12],#16 @ load input + ldrhs r9,[r12,#-12] + + add r2,r2,r10 + add r3,r3,r11 +# ifdef __thumb2__ + itt hs +# endif + ldrhs r10,[r12,#-8] + ldrhs r11,[r12,#-4] +# if __ARM_ARCH>=6 && defined(__ARMEB__) + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs r0,r0,r8 @ xor with input + eorhs r1,r1,r9 + add r8,sp,#4*(4) + str r0,[r14],#16 @ store output +# ifdef __thumb2__ + itt hs +# endif + eorhs r2,r2,r10 + eorhs r3,r3,r11 + ldmia r8,{r8,r9,r10,r11} @ load key material + str r1,[r14,#-12] + str r2,[r14,#-8] + str r3,[r14,#-4] + + add r4,r4,r8 @ accumulate key material + add r5,r5,r9 +# ifdef __thumb2__ + itt hs +# endif + ldrhs r8,[r12],#16 @ load input + ldrhs r9,[r12,#-12] + add r6,r6,r10 + add r7,r7,r11 +# ifdef __thumb2__ + itt hs +# endif + ldrhs r10,[r12,#-8] + ldrhs r11,[r12,#-4] +# if __ARM_ARCH>=6 && defined(__ARMEB__) + rev r4,r4 + rev r5,r5 + rev r6,r6 + rev r7,r7 +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs r4,r4,r8 + eorhs r5,r5,r9 + add r8,sp,#4*(8) + str r4,[r14],#16 @ store output +# ifdef __thumb2__ + itt hs +# endif + eorhs r6,r6,r10 + eorhs r7,r7,r11 + str r5,[r14,#-12] + ldmia r8,{r8,r9,r10,r11} @ load key material + str r6,[r14,#-8] + add r0,sp,#4*(16+8) + str r7,[r14,#-4] + + ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half + + add r0,r0,r8 @ accumulate key material + add r1,r1,r9 +# ifdef __thumb2__ + itt hs +# endif + ldrhs r8,[r12],#16 @ load input + ldrhs r9,[r12,#-12] +# ifdef __thumb2__ + itt hi +# endif + strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it + strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it + add r2,r2,r10 + add r3,r3,r11 +# ifdef __thumb2__ + itt hs +# endif + ldrhs r10,[r12,#-8] + ldrhs r11,[r12,#-4] +# if __ARM_ARCH>=6 && 
defined(__ARMEB__) + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs r0,r0,r8 + eorhs r1,r1,r9 + add r8,sp,#4*(12) + str r0,[r14],#16 @ store output +# ifdef __thumb2__ + itt hs +# endif + eorhs r2,r2,r10 + eorhs r3,r3,r11 + str r1,[r14,#-12] + ldmia r8,{r8,r9,r10,r11} @ load key material + str r2,[r14,#-8] + str r3,[r14,#-4] + + add r4,r4,r8 @ accumulate key material + add r5,r5,r9 +# ifdef __thumb2__ + itt hi +# endif + addhi r8,r8,#1 @ next counter value + strhi r8,[sp,#4*(12)] @ save next counter value +# ifdef __thumb2__ + itt hs +# endif + ldrhs r8,[r12],#16 @ load input + ldrhs r9,[r12,#-12] + add r6,r6,r10 + add r7,r7,r11 +# ifdef __thumb2__ + itt hs +# endif + ldrhs r10,[r12,#-8] + ldrhs r11,[r12,#-4] +# if __ARM_ARCH>=6 && defined(__ARMEB__) + rev r4,r4 + rev r5,r5 + rev r6,r6 + rev r7,r7 +# endif +# ifdef __thumb2__ + itt hs +# endif + eorhs r4,r4,r8 + eorhs r5,r5,r9 +# ifdef __thumb2__ + it ne +# endif + ldrne r8,[sp,#4*(32+2)] @ re-load len +# ifdef __thumb2__ + itt hs +# endif + eorhs r6,r6,r10 + eorhs r7,r7,r11 + str r4,[r14],#16 @ store output + str r5,[r14,#-12] +# ifdef __thumb2__ + it hs +# endif + subhs r11,r8,#64 @ len-=64 + str r6,[r14,#-8] + str r7,[r14,#-4] + bhi .Loop_outer + + beq .Ldone +# if __ARM_ARCH<7 + b .Ltail + +.align 4 +.Lunaligned:@ unaligned endian-neutral path + cmp r11,#64 @ restore flags +# endif +#endif +#if __ARM_ARCH<7 + ldr r11,[sp,#4*(3)] + add r0,r0,r8 @ accumulate key material + add r1,r1,r9 + add r2,r2,r10 +# ifdef __thumb2__ + itete lo +# endif + eorlo r8,r8,r8 @ zero or ... + ldrhsb r8,[r12],#16 @ ... 
load input + eorlo r9,r9,r9 + ldrhsb r9,[r12,#-12] + + add r3,r3,r11 +# ifdef __thumb2__ + itete lo +# endif + eorlo r10,r10,r10 + ldrhsb r10,[r12,#-8] + eorlo r11,r11,r11 + ldrhsb r11,[r12,#-4] + + eor r0,r8,r0 @ xor with input (or zero) + eor r1,r9,r1 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-15] @ load more input + ldrhsb r9,[r12,#-11] + eor r2,r10,r2 + strb r0,[r14],#16 @ store output + eor r3,r11,r3 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-7] + ldrhsb r11,[r12,#-3] + strb r1,[r14,#-12] + eor r0,r8,r0,lsr#8 + strb r2,[r14,#-8] + eor r1,r9,r1,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-14] @ load more input + ldrhsb r9,[r12,#-10] + strb r3,[r14,#-4] + eor r2,r10,r2,lsr#8 + strb r0,[r14,#-15] + eor r3,r11,r3,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-6] + ldrhsb r11,[r12,#-2] + strb r1,[r14,#-11] + eor r0,r8,r0,lsr#8 + strb r2,[r14,#-7] + eor r1,r9,r1,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-13] @ load more input + ldrhsb r9,[r12,#-9] + strb r3,[r14,#-3] + eor r2,r10,r2,lsr#8 + strb r0,[r14,#-14] + eor r3,r11,r3,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-5] + ldrhsb r11,[r12,#-1] + strb r1,[r14,#-10] + strb r2,[r14,#-6] + eor r0,r8,r0,lsr#8 + strb r3,[r14,#-2] + eor r1,r9,r1,lsr#8 + strb r0,[r14,#-13] + eor r2,r10,r2,lsr#8 + strb r1,[r14,#-9] + eor r3,r11,r3,lsr#8 + strb r2,[r14,#-5] + strb r3,[r14,#-1] + add r8,sp,#4*(4+0) + ldmia r8,{r8,r9,r10,r11} @ load key material + add r0,sp,#4*(16+8) + add r4,r4,r8 @ accumulate key material + add r5,r5,r9 + add r6,r6,r10 +# ifdef __thumb2__ + itete lo +# endif + eorlo r8,r8,r8 @ zero or ... + ldrhsb r8,[r12],#16 @ ... 
load input + eorlo r9,r9,r9 + ldrhsb r9,[r12,#-12] + + add r7,r7,r11 +# ifdef __thumb2__ + itete lo +# endif + eorlo r10,r10,r10 + ldrhsb r10,[r12,#-8] + eorlo r11,r11,r11 + ldrhsb r11,[r12,#-4] + + eor r4,r8,r4 @ xor with input (or zero) + eor r5,r9,r5 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-15] @ load more input + ldrhsb r9,[r12,#-11] + eor r6,r10,r6 + strb r4,[r14],#16 @ store output + eor r7,r11,r7 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-7] + ldrhsb r11,[r12,#-3] + strb r5,[r14,#-12] + eor r4,r8,r4,lsr#8 + strb r6,[r14,#-8] + eor r5,r9,r5,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-14] @ load more input + ldrhsb r9,[r12,#-10] + strb r7,[r14,#-4] + eor r6,r10,r6,lsr#8 + strb r4,[r14,#-15] + eor r7,r11,r7,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-6] + ldrhsb r11,[r12,#-2] + strb r5,[r14,#-11] + eor r4,r8,r4,lsr#8 + strb r6,[r14,#-7] + eor r5,r9,r5,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-13] @ load more input + ldrhsb r9,[r12,#-9] + strb r7,[r14,#-3] + eor r6,r10,r6,lsr#8 + strb r4,[r14,#-14] + eor r7,r11,r7,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-5] + ldrhsb r11,[r12,#-1] + strb r5,[r14,#-10] + strb r6,[r14,#-6] + eor r4,r8,r4,lsr#8 + strb r7,[r14,#-2] + eor r5,r9,r5,lsr#8 + strb r4,[r14,#-13] + eor r6,r10,r6,lsr#8 + strb r5,[r14,#-9] + eor r7,r11,r7,lsr#8 + strb r6,[r14,#-5] + strb r7,[r14,#-1] + add r8,sp,#4*(4+4) + ldmia r8,{r8,r9,r10,r11} @ load key material + ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half +# ifdef __thumb2__ + itt hi +# endif + strhi r10,[sp,#4*(16+10)] @ copy "rx" + strhi r11,[sp,#4*(16+11)] @ copy "rx" + add r0,r0,r8 @ accumulate key material + add r1,r1,r9 + add r2,r2,r10 +# ifdef __thumb2__ + itete lo +# endif + eorlo r8,r8,r8 @ zero or ... + ldrhsb r8,[r12],#16 @ ... 
load input + eorlo r9,r9,r9 + ldrhsb r9,[r12,#-12] + + add r3,r3,r11 +# ifdef __thumb2__ + itete lo +# endif + eorlo r10,r10,r10 + ldrhsb r10,[r12,#-8] + eorlo r11,r11,r11 + ldrhsb r11,[r12,#-4] + + eor r0,r8,r0 @ xor with input (or zero) + eor r1,r9,r1 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-15] @ load more input + ldrhsb r9,[r12,#-11] + eor r2,r10,r2 + strb r0,[r14],#16 @ store output + eor r3,r11,r3 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-7] + ldrhsb r11,[r12,#-3] + strb r1,[r14,#-12] + eor r0,r8,r0,lsr#8 + strb r2,[r14,#-8] + eor r1,r9,r1,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-14] @ load more input + ldrhsb r9,[r12,#-10] + strb r3,[r14,#-4] + eor r2,r10,r2,lsr#8 + strb r0,[r14,#-15] + eor r3,r11,r3,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-6] + ldrhsb r11,[r12,#-2] + strb r1,[r14,#-11] + eor r0,r8,r0,lsr#8 + strb r2,[r14,#-7] + eor r1,r9,r1,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-13] @ load more input + ldrhsb r9,[r12,#-9] + strb r3,[r14,#-3] + eor r2,r10,r2,lsr#8 + strb r0,[r14,#-14] + eor r3,r11,r3,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-5] + ldrhsb r11,[r12,#-1] + strb r1,[r14,#-10] + strb r2,[r14,#-6] + eor r0,r8,r0,lsr#8 + strb r3,[r14,#-2] + eor r1,r9,r1,lsr#8 + strb r0,[r14,#-13] + eor r2,r10,r2,lsr#8 + strb r1,[r14,#-9] + eor r3,r11,r3,lsr#8 + strb r2,[r14,#-5] + strb r3,[r14,#-1] + add r8,sp,#4*(4+8) + ldmia r8,{r8,r9,r10,r11} @ load key material + add r4,r4,r8 @ accumulate key material +# ifdef __thumb2__ + itt hi +# endif + addhi r8,r8,#1 @ next counter value + strhi r8,[sp,#4*(12)] @ save next counter value + add r5,r5,r9 + add r6,r6,r10 +# ifdef __thumb2__ + itete lo +# endif + eorlo r8,r8,r8 @ zero or ... + ldrhsb r8,[r12],#16 @ ... 
load input + eorlo r9,r9,r9 + ldrhsb r9,[r12,#-12] + + add r7,r7,r11 +# ifdef __thumb2__ + itete lo +# endif + eorlo r10,r10,r10 + ldrhsb r10,[r12,#-8] + eorlo r11,r11,r11 + ldrhsb r11,[r12,#-4] + + eor r4,r8,r4 @ xor with input (or zero) + eor r5,r9,r5 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-15] @ load more input + ldrhsb r9,[r12,#-11] + eor r6,r10,r6 + strb r4,[r14],#16 @ store output + eor r7,r11,r7 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-7] + ldrhsb r11,[r12,#-3] + strb r5,[r14,#-12] + eor r4,r8,r4,lsr#8 + strb r6,[r14,#-8] + eor r5,r9,r5,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-14] @ load more input + ldrhsb r9,[r12,#-10] + strb r7,[r14,#-4] + eor r6,r10,r6,lsr#8 + strb r4,[r14,#-15] + eor r7,r11,r7,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-6] + ldrhsb r11,[r12,#-2] + strb r5,[r14,#-11] + eor r4,r8,r4,lsr#8 + strb r6,[r14,#-7] + eor r5,r9,r5,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r8,[r12,#-13] @ load more input + ldrhsb r9,[r12,#-9] + strb r7,[r14,#-3] + eor r6,r10,r6,lsr#8 + strb r4,[r14,#-14] + eor r7,r11,r7,lsr#8 +# ifdef __thumb2__ + itt hs +# endif + ldrhsb r10,[r12,#-5] + ldrhsb r11,[r12,#-1] + strb r5,[r14,#-10] + strb r6,[r14,#-6] + eor r4,r8,r4,lsr#8 + strb r7,[r14,#-2] + eor r5,r9,r5,lsr#8 + strb r4,[r14,#-13] + eor r6,r10,r6,lsr#8 + strb r5,[r14,#-9] + eor r7,r11,r7,lsr#8 + strb r6,[r14,#-5] + strb r7,[r14,#-1] +# ifdef __thumb2__ + it ne +# endif + ldrne r8,[sp,#4*(32+2)] @ re-load len +# ifdef __thumb2__ + it hs +# endif + subhs r11,r8,#64 @ len-=64 + bhi .Loop_outer + + beq .Ldone +#endif + +.Ltail: + ldr r12,[sp,#4*(32+1)] @ load inp + add r9,sp,#4*(0) + ldr r14,[sp,#4*(32+0)] @ load out + +.Loop_tail: + ldrb r10,[r9],#1 @ read buffer on stack + ldrb r11,[r12],#1 @ read input + subs r8,r8,#1 + eor r11,r11,r10 + strb r11,[r14],#1 @ store output + bne .Loop_tail + +.Ldone: + add sp,sp,#4*(32+3) + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc} +.size 
ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw +#if __ARM_MAX_ARCH__>=7 +.arch armv7-a +.fpu neon + +.globl ChaCha20_ctr32_neon +.hidden ChaCha20_ctr32_neon +.type ChaCha20_ctr32_neon,%function +.align 5 +ChaCha20_ctr32_neon: + ldr r12,[sp,#0] @ pull pointer to counter and nonce + stmdb sp!,{r0,r1,r2,r4-r11,lr} + adr r14,.Lsigma + vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI spec says so + stmdb sp!,{r0,r1,r2,r3} + + vld1.32 {q1,q2},[r3] @ load key + ldmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} @ load key + + sub sp,sp,#4*(16+16) + vld1.32 {q3},[r12] @ load counter and nonce + add r12,sp,#4*8 + ldmia r14,{r0,r1,r2,r3} @ load sigma + vld1.32 {q0},[r14]! @ load sigma + vld1.32 {q12},[r14] @ one + vst1.32 {q2,q3},[r12] @ copy 1/2key|counter|nonce + vst1.32 {q0,q1},[sp] @ copy sigma|1/2key + + str r10,[sp,#4*(16+10)] @ off-load "rx" + str r11,[sp,#4*(16+11)] @ off-load "rx" + vshl.i32 d26,d24,#1 @ two + vstr d24,[sp,#4*(16+0)] + vshl.i32 d28,d24,#2 @ four + vstr d26,[sp,#4*(16+2)] + vmov q4,q0 + vstr d28,[sp,#4*(16+4)] + vmov q8,q0 + vmov q5,q1 + vmov q9,q1 + b .Loop_neon_enter + +.align 4 +.Loop_neon_outer: + ldmia sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} @ load key material + cmp r11,#64*2 @ if len<=64*2 + bls .Lbreak_neon @ switch to integer-only + vmov q4,q0 + str r11,[sp,#4*(32+2)] @ save len + vmov q8,q0 + str r12, [sp,#4*(32+1)] @ save inp + vmov q5,q1 + str r14, [sp,#4*(32+0)] @ save out + vmov q9,q1 +.Loop_neon_enter: + ldr r11, [sp,#4*(15)] + vadd.i32 q7,q3,q12 @ counter+1 + ldr r12,[sp,#4*(12)] @ modulo-scheduled load + vmov q6,q2 + ldr r10, [sp,#4*(13)] + vmov q10,q2 + ldr r14,[sp,#4*(14)] + vadd.i32 q11,q7,q12 @ counter+2 + str r11, [sp,#4*(16+15)] + mov r11,#10 + add r12,r12,#3 @ counter+3 + b .Loop_neon + +.align 4 +.Loop_neon: + subs r11,r11,#1 + vadd.i32 q0,q0,q1 + add r0,r0,r4 + vadd.i32 q4,q4,q5 + mov r12,r12,ror#16 + vadd.i32 q8,q8,q9 + add r1,r1,r5 + veor q3,q3,q0 + mov r10,r10,ror#16 + veor q7,q7,q4 + eor r12,r12,r0,ror#16 + veor q11,q11,q8 + eor 
r10,r10,r1,ror#16 + vrev32.16 q3,q3 + add r8,r8,r12 + vrev32.16 q7,q7 + mov r4,r4,ror#20 + vrev32.16 q11,q11 + add r9,r9,r10 + vadd.i32 q2,q2,q3 + mov r5,r5,ror#20 + vadd.i32 q6,q6,q7 + eor r4,r4,r8,ror#20 + vadd.i32 q10,q10,q11 + eor r5,r5,r9,ror#20 + veor q12,q1,q2 + add r0,r0,r4 + veor q13,q5,q6 + mov r12,r12,ror#24 + veor q14,q9,q10 + add r1,r1,r5 + vshr.u32 q1,q12,#20 + mov r10,r10,ror#24 + vshr.u32 q5,q13,#20 + eor r12,r12,r0,ror#24 + vshr.u32 q9,q14,#20 + eor r10,r10,r1,ror#24 + vsli.32 q1,q12,#12 + add r8,r8,r12 + vsli.32 q5,q13,#12 + mov r4,r4,ror#25 + vsli.32 q9,q14,#12 + add r9,r9,r10 + vadd.i32 q0,q0,q1 + mov r5,r5,ror#25 + vadd.i32 q4,q4,q5 + str r10,[sp,#4*(16+13)] + vadd.i32 q8,q8,q9 + ldr r10,[sp,#4*(16+15)] + veor q12,q3,q0 + eor r4,r4,r8,ror#25 + veor q13,q7,q4 + eor r5,r5,r9,ror#25 + veor q14,q11,q8 + str r8,[sp,#4*(16+8)] + vshr.u32 q3,q12,#24 + ldr r8,[sp,#4*(16+10)] + vshr.u32 q7,q13,#24 + add r2,r2,r6 + vshr.u32 q11,q14,#24 + mov r14,r14,ror#16 + vsli.32 q3,q12,#8 + str r9,[sp,#4*(16+9)] + vsli.32 q7,q13,#8 + ldr r9,[sp,#4*(16+11)] + vsli.32 q11,q14,#8 + add r3,r3,r7 + vadd.i32 q2,q2,q3 + mov r10,r10,ror#16 + vadd.i32 q6,q6,q7 + eor r14,r14,r2,ror#16 + vadd.i32 q10,q10,q11 + eor r10,r10,r3,ror#16 + veor q12,q1,q2 + add r8,r8,r14 + veor q13,q5,q6 + mov r6,r6,ror#20 + veor q14,q9,q10 + add r9,r9,r10 + vshr.u32 q1,q12,#25 + mov r7,r7,ror#20 + vshr.u32 q5,q13,#25 + eor r6,r6,r8,ror#20 + vshr.u32 q9,q14,#25 + eor r7,r7,r9,ror#20 + vsli.32 q1,q12,#7 + add r2,r2,r6 + vsli.32 q5,q13,#7 + mov r14,r14,ror#24 + vsli.32 q9,q14,#7 + add r3,r3,r7 + vext.8 q2,q2,q2,#8 + mov r10,r10,ror#24 + vext.8 q6,q6,q6,#8 + eor r14,r14,r2,ror#24 + vext.8 q10,q10,q10,#8 + eor r10,r10,r3,ror#24 + vext.8 q1,q1,q1,#4 + add r8,r8,r14 + vext.8 q5,q5,q5,#4 + mov r6,r6,ror#25 + vext.8 q9,q9,q9,#4 + add r9,r9,r10 + vext.8 q3,q3,q3,#12 + mov r7,r7,ror#25 + vext.8 q7,q7,q7,#12 + eor r6,r6,r8,ror#25 + vext.8 q11,q11,q11,#12 + eor r7,r7,r9,ror#25 + vadd.i32 q0,q0,q1 + add r0,r0,r5 + 
vadd.i32 q4,q4,q5 + mov r10,r10,ror#16 + vadd.i32 q8,q8,q9 + add r1,r1,r6 + veor q3,q3,q0 + mov r12,r12,ror#16 + veor q7,q7,q4 + eor r10,r10,r0,ror#16 + veor q11,q11,q8 + eor r12,r12,r1,ror#16 + vrev32.16 q3,q3 + add r8,r8,r10 + vrev32.16 q7,q7 + mov r5,r5,ror#20 + vrev32.16 q11,q11 + add r9,r9,r12 + vadd.i32 q2,q2,q3 + mov r6,r6,ror#20 + vadd.i32 q6,q6,q7 + eor r5,r5,r8,ror#20 + vadd.i32 q10,q10,q11 + eor r6,r6,r9,ror#20 + veor q12,q1,q2 + add r0,r0,r5 + veor q13,q5,q6 + mov r10,r10,ror#24 + veor q14,q9,q10 + add r1,r1,r6 + vshr.u32 q1,q12,#20 + mov r12,r12,ror#24 + vshr.u32 q5,q13,#20 + eor r10,r10,r0,ror#24 + vshr.u32 q9,q14,#20 + eor r12,r12,r1,ror#24 + vsli.32 q1,q12,#12 + add r8,r8,r10 + vsli.32 q5,q13,#12 + mov r5,r5,ror#25 + vsli.32 q9,q14,#12 + str r10,[sp,#4*(16+15)] + vadd.i32 q0,q0,q1 + ldr r10,[sp,#4*(16+13)] + vadd.i32 q4,q4,q5 + add r9,r9,r12 + vadd.i32 q8,q8,q9 + mov r6,r6,ror#25 + veor q12,q3,q0 + eor r5,r5,r8,ror#25 + veor q13,q7,q4 + eor r6,r6,r9,ror#25 + veor q14,q11,q8 + str r8,[sp,#4*(16+10)] + vshr.u32 q3,q12,#24 + ldr r8,[sp,#4*(16+8)] + vshr.u32 q7,q13,#24 + add r2,r2,r7 + vshr.u32 q11,q14,#24 + mov r10,r10,ror#16 + vsli.32 q3,q12,#8 + str r9,[sp,#4*(16+11)] + vsli.32 q7,q13,#8 + ldr r9,[sp,#4*(16+9)] + vsli.32 q11,q14,#8 + add r3,r3,r4 + vadd.i32 q2,q2,q3 + mov r14,r14,ror#16 + vadd.i32 q6,q6,q7 + eor r10,r10,r2,ror#16 + vadd.i32 q10,q10,q11 + eor r14,r14,r3,ror#16 + veor q12,q1,q2 + add r8,r8,r10 + veor q13,q5,q6 + mov r7,r7,ror#20 + veor q14,q9,q10 + add r9,r9,r14 + vshr.u32 q1,q12,#25 + mov r4,r4,ror#20 + vshr.u32 q5,q13,#25 + eor r7,r7,r8,ror#20 + vshr.u32 q9,q14,#25 + eor r4,r4,r9,ror#20 + vsli.32 q1,q12,#7 + add r2,r2,r7 + vsli.32 q5,q13,#7 + mov r10,r10,ror#24 + vsli.32 q9,q14,#7 + add r3,r3,r4 + vext.8 q2,q2,q2,#8 + mov r14,r14,ror#24 + vext.8 q6,q6,q6,#8 + eor r10,r10,r2,ror#24 + vext.8 q10,q10,q10,#8 + eor r14,r14,r3,ror#24 + vext.8 q1,q1,q1,#12 + add r8,r8,r10 + vext.8 q5,q5,q5,#12 + mov r7,r7,ror#25 + vext.8 q9,q9,q9,#12 + add 
r9,r9,r14 + vext.8 q3,q3,q3,#4 + mov r4,r4,ror#25 + vext.8 q7,q7,q7,#4 + eor r7,r7,r8,ror#25 + vext.8 q11,q11,q11,#4 + eor r4,r4,r9,ror#25 + bne .Loop_neon + + add r11,sp,#32 + vld1.32 {q12,q13},[sp] @ load key material + vld1.32 {q14,q15},[r11] + + ldr r11,[sp,#4*(32+2)] @ load len + + str r8, [sp,#4*(16+8)] @ modulo-scheduled store + str r9, [sp,#4*(16+9)] + str r12,[sp,#4*(16+12)] + str r10, [sp,#4*(16+13)] + str r14,[sp,#4*(16+14)] + + @ at this point we have first half of 512-bit result in + @ rx and second half at sp+4*(16+8) + + ldr r12,[sp,#4*(32+1)] @ load inp + ldr r14,[sp,#4*(32+0)] @ load out + + vadd.i32 q0,q0,q12 @ accumulate key material + vadd.i32 q4,q4,q12 + vadd.i32 q8,q8,q12 + vldr d24,[sp,#4*(16+0)] @ one + + vadd.i32 q1,q1,q13 + vadd.i32 q5,q5,q13 + vadd.i32 q9,q9,q13 + vldr d26,[sp,#4*(16+2)] @ two + + vadd.i32 q2,q2,q14 + vadd.i32 q6,q6,q14 + vadd.i32 q10,q10,q14 + vadd.i32 d14,d14,d24 @ counter+1 + vadd.i32 d22,d22,d26 @ counter+2 + + vadd.i32 q3,q3,q15 + vadd.i32 q7,q7,q15 + vadd.i32 q11,q11,q15 + + cmp r11,#64*4 + blo .Ltail_neon + + vld1.8 {q12,q13},[r12]! @ load input + mov r11,sp + vld1.8 {q14,q15},[r12]! + veor q0,q0,q12 @ xor with input + veor q1,q1,q13 + vld1.8 {q12,q13},[r12]! + veor q2,q2,q14 + veor q3,q3,q15 + vld1.8 {q14,q15},[r12]! + + veor q4,q4,q12 + vst1.8 {q0,q1},[r14]! @ store output + veor q5,q5,q13 + vld1.8 {q12,q13},[r12]! + veor q6,q6,q14 + vst1.8 {q2,q3},[r14]! + veor q7,q7,q15 + vld1.8 {q14,q15},[r12]! + + veor q8,q8,q12 + vld1.32 {q0,q1},[r11]! @ load for next iteration + veor d25,d25,d25 + vldr d24,[sp,#4*(16+4)] @ four + veor q9,q9,q13 + vld1.32 {q2,q3},[r11] + veor q10,q10,q14 + vst1.8 {q4,q5},[r14]! + veor q11,q11,q15 + vst1.8 {q6,q7},[r14]! + + vadd.i32 d6,d6,d24 @ next counter value + vldr d24,[sp,#4*(16+0)] @ one + + ldmia sp,{r8,r9,r10,r11} @ load key material + add r0,r0,r8 @ accumulate key material + ldr r8,[r12],#16 @ load input + vst1.8 {q8,q9},[r14]! 
+ add r1,r1,r9 + ldr r9,[r12,#-12] + vst1.8 {q10,q11},[r14]! + add r2,r2,r10 + ldr r10,[r12,#-8] + add r3,r3,r11 + ldr r11,[r12,#-4] +# ifdef __ARMEB__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +# endif + eor r0,r0,r8 @ xor with input + add r8,sp,#4*(4) + eor r1,r1,r9 + str r0,[r14],#16 @ store output + eor r2,r2,r10 + str r1,[r14,#-12] + eor r3,r3,r11 + ldmia r8,{r8,r9,r10,r11} @ load key material + str r2,[r14,#-8] + str r3,[r14,#-4] + + add r4,r4,r8 @ accumulate key material + ldr r8,[r12],#16 @ load input + add r5,r5,r9 + ldr r9,[r12,#-12] + add r6,r6,r10 + ldr r10,[r12,#-8] + add r7,r7,r11 + ldr r11,[r12,#-4] +# ifdef __ARMEB__ + rev r4,r4 + rev r5,r5 + rev r6,r6 + rev r7,r7 +# endif + eor r4,r4,r8 + add r8,sp,#4*(8) + eor r5,r5,r9 + str r4,[r14],#16 @ store output + eor r6,r6,r10 + str r5,[r14,#-12] + eor r7,r7,r11 + ldmia r8,{r8,r9,r10,r11} @ load key material + str r6,[r14,#-8] + add r0,sp,#4*(16+8) + str r7,[r14,#-4] + + ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half + + add r0,r0,r8 @ accumulate key material + ldr r8,[r12],#16 @ load input + add r1,r1,r9 + ldr r9,[r12,#-12] +# ifdef __thumb2__ + it hi +# endif + strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it + add r2,r2,r10 + ldr r10,[r12,#-8] +# ifdef __thumb2__ + it hi +# endif + strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it + add r3,r3,r11 + ldr r11,[r12,#-4] +# ifdef __ARMEB__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 +# endif + eor r0,r0,r8 + add r8,sp,#4*(12) + eor r1,r1,r9 + str r0,[r14],#16 @ store output + eor r2,r2,r10 + str r1,[r14,#-12] + eor r3,r3,r11 + ldmia r8,{r8,r9,r10,r11} @ load key material + str r2,[r14,#-8] + str r3,[r14,#-4] + + add r4,r4,r8 @ accumulate key material + add r8,r8,#4 @ next counter value + add r5,r5,r9 + str r8,[sp,#4*(12)] @ save next counter value + ldr r8,[r12],#16 @ load input + add r6,r6,r10 + add r4,r4,#3 @ counter+3 + ldr r9,[r12,#-12] + add r7,r7,r11 + ldr r10,[r12,#-8] + ldr r11,[r12,#-4] +# ifdef __ARMEB__ + rev r4,r4 + rev r5,r5 + 
rev r6,r6 + rev r7,r7 +# endif + eor r4,r4,r8 +# ifdef __thumb2__ + it hi +# endif + ldrhi r8,[sp,#4*(32+2)] @ re-load len + eor r5,r5,r9 + eor r6,r6,r10 + str r4,[r14],#16 @ store output + eor r7,r7,r11 + str r5,[r14,#-12] + sub r11,r8,#64*4 @ len-=64*4 + str r6,[r14,#-8] + str r7,[r14,#-4] + bhi .Loop_neon_outer + + b .Ldone_neon + +.align 4 +.Lbreak_neon: + @ harmonize NEON and integer-only stack frames: load data + @ from NEON frame, but save to integer-only one; distance + @ between the two is 4*(32+4+16-32)=4*(20). + + str r11, [sp,#4*(20+32+2)] @ save len + add r11,sp,#4*(32+4) + str r12, [sp,#4*(20+32+1)] @ save inp + str r14, [sp,#4*(20+32+0)] @ save out + + ldr r12,[sp,#4*(16+10)] + ldr r14,[sp,#4*(16+11)] + vldmia r11,{d8,d9,d10,d11,d12,d13,d14,d15} @ fulfill ABI requirement + str r12,[sp,#4*(20+16+10)] @ copy "rx" + str r14,[sp,#4*(20+16+11)] @ copy "rx" + + ldr r11, [sp,#4*(15)] + ldr r12,[sp,#4*(12)] @ modulo-scheduled load + ldr r10, [sp,#4*(13)] + ldr r14,[sp,#4*(14)] + str r11, [sp,#4*(20+16+15)] + add r11,sp,#4*(20) + vst1.32 {q0,q1},[r11]! @ copy key + add sp,sp,#4*(20) @ switch frame + vst1.32 {q2,q3},[r11] + mov r11,#10 + b .Loop @ go integer-only + +.align 4 +.Ltail_neon: + cmp r11,#64*3 + bhs .L192_or_more_neon + cmp r11,#64*2 + bhs .L128_or_more_neon + cmp r11,#64*1 + bhs .L64_or_more_neon + + add r8,sp,#4*(8) + vst1.8 {q0,q1},[sp] + add r10,sp,#4*(0) + vst1.8 {q2,q3},[r8] + b .Loop_tail_neon + +.align 4 +.L64_or_more_neon: + vld1.8 {q12,q13},[r12]! + vld1.8 {q14,q15},[r12]! + veor q0,q0,q12 + veor q1,q1,q13 + veor q2,q2,q14 + veor q3,q3,q15 + vst1.8 {q0,q1},[r14]! + vst1.8 {q2,q3},[r14]! + + beq .Ldone_neon + + add r8,sp,#4*(8) + vst1.8 {q4,q5},[sp] + add r10,sp,#4*(0) + vst1.8 {q6,q7},[r8] + sub r11,r11,#64*1 @ len-=64*1 + b .Loop_tail_neon + +.align 4 +.L128_or_more_neon: + vld1.8 {q12,q13},[r12]! + vld1.8 {q14,q15},[r12]! + veor q0,q0,q12 + veor q1,q1,q13 + vld1.8 {q12,q13},[r12]! 
+ veor q2,q2,q14 + veor q3,q3,q15 + vld1.8 {q14,q15},[r12]! + + veor q4,q4,q12 + veor q5,q5,q13 + vst1.8 {q0,q1},[r14]! + veor q6,q6,q14 + vst1.8 {q2,q3},[r14]! + veor q7,q7,q15 + vst1.8 {q4,q5},[r14]! + vst1.8 {q6,q7},[r14]! + + beq .Ldone_neon + + add r8,sp,#4*(8) + vst1.8 {q8,q9},[sp] + add r10,sp,#4*(0) + vst1.8 {q10,q11},[r8] + sub r11,r11,#64*2 @ len-=64*2 + b .Loop_tail_neon + +.align 4 +.L192_or_more_neon: + vld1.8 {q12,q13},[r12]! + vld1.8 {q14,q15},[r12]! + veor q0,q0,q12 + veor q1,q1,q13 + vld1.8 {q12,q13},[r12]! + veor q2,q2,q14 + veor q3,q3,q15 + vld1.8 {q14,q15},[r12]! + + veor q4,q4,q12 + veor q5,q5,q13 + vld1.8 {q12,q13},[r12]! + veor q6,q6,q14 + vst1.8 {q0,q1},[r14]! + veor q7,q7,q15 + vld1.8 {q14,q15},[r12]! + + veor q8,q8,q12 + vst1.8 {q2,q3},[r14]! + veor q9,q9,q13 + vst1.8 {q4,q5},[r14]! + veor q10,q10,q14 + vst1.8 {q6,q7},[r14]! + veor q11,q11,q15 + vst1.8 {q8,q9},[r14]! + vst1.8 {q10,q11},[r14]! + + beq .Ldone_neon + + ldmia sp,{r8,r9,r10,r11} @ load key material + add r0,r0,r8 @ accumulate key material + add r8,sp,#4*(4) + add r1,r1,r9 + add r2,r2,r10 + add r3,r3,r11 + ldmia r8,{r8,r9,r10,r11} @ load key material + + add r4,r4,r8 @ accumulate key material + add r8,sp,#4*(8) + add r5,r5,r9 + add r6,r6,r10 + add r7,r7,r11 + ldmia r8,{r8,r9,r10,r11} @ load key material +# ifdef __ARMEB__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 + rev r4,r4 + rev r5,r5 + rev r6,r6 + rev r7,r7 +# endif + stmia sp,{r0,r1,r2,r3,r4,r5,r6,r7} + add r0,sp,#4*(16+8) + + ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half + + add r0,r0,r8 @ accumulate key material + add r8,sp,#4*(12) + add r1,r1,r9 + add r2,r2,r10 + add r3,r3,r11 + ldmia r8,{r8,r9,r10,r11} @ load key material + + add r4,r4,r8 @ accumulate key material + add r8,sp,#4*(8) + add r5,r5,r9 + add r4,r4,#3 @ counter+3 + add r6,r6,r10 + add r7,r7,r11 + ldr r11,[sp,#4*(32+2)] @ re-load len +# ifdef __ARMEB__ + rev r0,r0 + rev r1,r1 + rev r2,r2 + rev r3,r3 + rev r4,r4 + rev r5,r5 + rev r6,r6 + rev r7,r7 
+# endif + stmia r8,{r0,r1,r2,r3,r4,r5,r6,r7} + add r10,sp,#4*(0) + sub r11,r11,#64*3 @ len-=64*3 + +.Loop_tail_neon: + ldrb r8,[r10],#1 @ read buffer on stack + ldrb r9,[r12],#1 @ read input + subs r11,r11,#1 + eor r8,r8,r9 + strb r8,[r14],#1 @ store output + bne .Loop_tail_neon + +.Ldone_neon: + add sp,sp,#4*(32+4) + vldmia sp,{d8,d9,d10,d11,d12,d13,d14,d15} + add sp,sp,#4*(16+3) + ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc} +.size ChaCha20_ctr32_neon,.-ChaCha20_ctr32_neon +#endif +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) diff --git a/third_party/boringssl/gen/crypto/chacha-armv8-apple.S b/third_party/boringssl/gen/crypto/chacha-armv8-apple.S new file mode 100644 index 00000000..452bf496 --- /dev/null +++ b/third_party/boringssl/gen/crypto/chacha-armv8-apple.S @@ -0,0 +1,1966 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) +.section __TEXT,__const + +.align 5 +Lsigma: +.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral +Lone: +.long 1,0,0,0 +.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 + +.text + +.globl _ChaCha20_ctr32_nohw +.private_extern _ChaCha20_ctr32_nohw + +.align 5 +_ChaCha20_ctr32_nohw: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + + adrp x5,Lsigma@PAGE + add x5,x5,Lsigma@PAGEOFF + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#64 + + ldp x22,x23,[x5] // load sigma + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ldp x28,x30,[x4] // load counter +#ifdef __AARCH64EB__ + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + +Loop_outer: + mov w5,w22 // unpack key block + lsr x6,x22,#32 + mov w7,w23 + lsr x8,x23,#32 + mov w9,w24 + lsr x10,x24,#32 + mov w11,w25 + lsr x12,x25,#32 + mov w13,w26 + lsr x14,x26,#32 + mov w15,w27 + lsr x16,x27,#32 + mov w17,w28 + lsr x19,x28,#32 + mov w20,w30 + lsr x21,x30,#32 + + mov x4,#10 + subs x2,x2,#64 +Loop: + sub x4,x4,#1 + add w5,w5,w9 + add w6,w6,w10 + add w7,w7,w11 + add w8,w8,w12 + eor w17,w17,w5 + eor w19,w19,w6 + eor w20,w20,w7 + eor w21,w21,w8 + ror w17,w17,#16 + ror w19,w19,#16 + ror w20,w20,#16 + ror w21,w21,#16 + add w13,w13,w17 + add w14,w14,w19 + add w15,w15,w20 + add w16,w16,w21 + eor w9,w9,w13 + eor w10,w10,w14 + eor w11,w11,w15 + eor w12,w12,w16 + ror w9,w9,#20 + ror w10,w10,#20 + ror w11,w11,#20 + ror w12,w12,#20 + add w5,w5,w9 + add w6,w6,w10 + add w7,w7,w11 + add w8,w8,w12 + eor w17,w17,w5 + eor w19,w19,w6 + eor w20,w20,w7 + eor w21,w21,w8 + ror w17,w17,#24 + ror w19,w19,#24 + ror w20,w20,#24 + ror w21,w21,#24 + add w13,w13,w17 + add w14,w14,w19 + add w15,w15,w20 + add w16,w16,w21 + eor w9,w9,w13 + eor w10,w10,w14 + eor w11,w11,w15 + eor w12,w12,w16 + ror w9,w9,#25 + ror w10,w10,#25 + ror w11,w11,#25 + ror w12,w12,#25 + add w5,w5,w10 + add w6,w6,w11 + add w7,w7,w12 + add w8,w8,w9 + eor w21,w21,w5 + eor w17,w17,w6 + eor w19,w19,w7 + eor w20,w20,w8 + ror w21,w21,#16 + ror w17,w17,#16 + ror w19,w19,#16 + ror w20,w20,#16 + add w15,w15,w21 + add w16,w16,w17 + add w13,w13,w19 + add w14,w14,w20 + eor w10,w10,w15 + eor w11,w11,w16 + eor w12,w12,w13 + eor w9,w9,w14 + ror w10,w10,#20 + ror 
w11,w11,#20 + ror w12,w12,#20 + ror w9,w9,#20 + add w5,w5,w10 + add w6,w6,w11 + add w7,w7,w12 + add w8,w8,w9 + eor w21,w21,w5 + eor w17,w17,w6 + eor w19,w19,w7 + eor w20,w20,w8 + ror w21,w21,#24 + ror w17,w17,#24 + ror w19,w19,#24 + ror w20,w20,#24 + add w15,w15,w21 + add w16,w16,w17 + add w13,w13,w19 + add w14,w14,w20 + eor w10,w10,w15 + eor w11,w11,w16 + eor w12,w12,w13 + eor w9,w9,w14 + ror w10,w10,#25 + ror w11,w11,#25 + ror w12,w12,#25 + ror w9,w9,#25 + cbnz x4,Loop + + add w5,w5,w22 // accumulate key block + add x6,x6,x22,lsr#32 + add w7,w7,w23 + add x8,x8,x23,lsr#32 + add w9,w9,w24 + add x10,x10,x24,lsr#32 + add w11,w11,w25 + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add x21,x21,x30,lsr#32 + + b.lo Ltail + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#1 // increment counter + stp x9,x11,[x0,#16] + stp x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + + b.hi Loop_outer + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.align 4 +Ltail: + add x2,x2,#64 +Less_than_64: + sub x0,x0,#1 + add x1,x1,x2 + add x0,x0,x2 + add x4,sp,x2 + neg x2,x2 + + add x5,x5,x6,lsl#32 // pack + add 
x7,x7,x8,lsl#32 + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + stp x5,x7,[sp,#0] + stp x9,x11,[sp,#16] + stp x13,x15,[sp,#32] + stp x17,x20,[sp,#48] + +Loop_tail: + ldrb w10,[x1,x2] + ldrb w11,[x4,x2] + add x2,x2,#1 + eor w10,w10,w11 + strb w10,[x0,x2] + cbnz x2,Loop_tail + + stp xzr,xzr,[sp,#0] + stp xzr,xzr,[sp,#16] + stp xzr,xzr,[sp,#32] + stp xzr,xzr,[sp,#48] + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +.globl _ChaCha20_ctr32_neon +.private_extern _ChaCha20_ctr32_neon + +.align 5 +_ChaCha20_ctr32_neon: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + + adrp x5,Lsigma@PAGE + add x5,x5,Lsigma@PAGEOFF + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + cmp x2,#512 + b.hs L512_or_more_neon + + sub sp,sp,#64 + + ldp x22,x23,[x5] // load sigma + ld1 {v24.4s},[x5],#16 + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ld1 {v25.4s,v26.4s},[x3] + ldp x28,x30,[x4] // load counter + ld1 {v27.4s},[x4] + ld1 {v31.4s},[x5] +#ifdef __AARCH64EB__ + rev64 v24.4s,v24.4s + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + add v27.4s,v27.4s,v31.4s // += 1 + add v28.4s,v27.4s,v31.4s + add v29.4s,v28.4s,v31.4s + shl v31.4s,v31.4s,#2 // 1 -> 4 + +Loop_outer_neon: + mov w5,w22 // unpack key block + lsr x6,x22,#32 + mov v0.16b,v24.16b + mov w7,w23 + lsr x8,x23,#32 + mov v4.16b,v24.16b + mov w9,w24 + lsr x10,x24,#32 + mov v16.16b,v24.16b + mov w11,w25 + mov v1.16b,v25.16b + lsr x12,x25,#32 + mov v5.16b,v25.16b + mov w13,w26 + mov 
v17.16b,v25.16b + lsr x14,x26,#32 + mov v3.16b,v27.16b + mov w15,w27 + mov v7.16b,v28.16b + lsr x16,x27,#32 + mov v19.16b,v29.16b + mov w17,w28 + mov v2.16b,v26.16b + lsr x19,x28,#32 + mov v6.16b,v26.16b + mov w20,w30 + mov v18.16b,v26.16b + lsr x21,x30,#32 + + mov x4,#10 + subs x2,x2,#256 +Loop_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v16.4s,v16.4s,v17.4s + add w7,w7,w11 + eor v3.16b,v3.16b,v0.16b + add w8,w8,w12 + eor v7.16b,v7.16b,v4.16b + eor w17,w17,w5 + eor v19.16b,v19.16b,v16.16b + eor w19,w19,w6 + rev32 v3.8h,v3.8h + eor w20,w20,w7 + rev32 v7.8h,v7.8h + eor w21,w21,w8 + rev32 v19.8h,v19.8h + ror w17,w17,#16 + add v2.4s,v2.4s,v3.4s + ror w19,w19,#16 + add v6.4s,v6.4s,v7.4s + ror w20,w20,#16 + add v18.4s,v18.4s,v19.4s + ror w21,w21,#16 + eor v20.16b,v1.16b,v2.16b + add w13,w13,w17 + eor v21.16b,v5.16b,v6.16b + add w14,w14,w19 + eor v22.16b,v17.16b,v18.16b + add w15,w15,w20 + ushr v1.4s,v20.4s,#20 + add w16,w16,w21 + ushr v5.4s,v21.4s,#20 + eor w9,w9,w13 + ushr v17.4s,v22.4s,#20 + eor w10,w10,w14 + sli v1.4s,v20.4s,#12 + eor w11,w11,w15 + sli v5.4s,v21.4s,#12 + eor w12,w12,w16 + sli v17.4s,v22.4s,#12 + ror w9,w9,#20 + add v0.4s,v0.4s,v1.4s + ror w10,w10,#20 + add v4.4s,v4.4s,v5.4s + ror w11,w11,#20 + add v16.4s,v16.4s,v17.4s + ror w12,w12,#20 + eor v20.16b,v3.16b,v0.16b + add w5,w5,w9 + eor v21.16b,v7.16b,v4.16b + add w6,w6,w10 + eor v22.16b,v19.16b,v16.16b + add w7,w7,w11 + ushr v3.4s,v20.4s,#24 + add w8,w8,w12 + ushr v7.4s,v21.4s,#24 + eor w17,w17,w5 + ushr v19.4s,v22.4s,#24 + eor w19,w19,w6 + sli v3.4s,v20.4s,#8 + eor w20,w20,w7 + sli v7.4s,v21.4s,#8 + eor w21,w21,w8 + sli v19.4s,v22.4s,#8 + ror w17,w17,#24 + add v2.4s,v2.4s,v3.4s + ror w19,w19,#24 + add v6.4s,v6.4s,v7.4s + ror w20,w20,#24 + add v18.4s,v18.4s,v19.4s + ror w21,w21,#24 + eor v20.16b,v1.16b,v2.16b + add w13,w13,w17 + eor v21.16b,v5.16b,v6.16b + add w14,w14,w19 + eor v22.16b,v17.16b,v18.16b + add w15,w15,w20 + ushr 
v1.4s,v20.4s,#25 + add w16,w16,w21 + ushr v5.4s,v21.4s,#25 + eor w9,w9,w13 + ushr v17.4s,v22.4s,#25 + eor w10,w10,w14 + sli v1.4s,v20.4s,#7 + eor w11,w11,w15 + sli v5.4s,v21.4s,#7 + eor w12,w12,w16 + sli v17.4s,v22.4s,#7 + ror w9,w9,#25 + ext v2.16b,v2.16b,v2.16b,#8 + ror w10,w10,#25 + ext v6.16b,v6.16b,v6.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w10 + add v4.4s,v4.4s,v5.4s + add w6,w6,w11 + add v16.4s,v16.4s,v17.4s + add w7,w7,w12 + eor v3.16b,v3.16b,v0.16b + add w8,w8,w9 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w5 + eor v19.16b,v19.16b,v16.16b + eor w17,w17,w6 + rev32 v3.8h,v3.8h + eor w19,w19,w7 + rev32 v7.8h,v7.8h + eor w20,w20,w8 + rev32 v19.8h,v19.8h + ror w21,w21,#16 + add v2.4s,v2.4s,v3.4s + ror w17,w17,#16 + add v6.4s,v6.4s,v7.4s + ror w19,w19,#16 + add v18.4s,v18.4s,v19.4s + ror w20,w20,#16 + eor v20.16b,v1.16b,v2.16b + add w15,w15,w21 + eor v21.16b,v5.16b,v6.16b + add w16,w16,w17 + eor v22.16b,v17.16b,v18.16b + add w13,w13,w19 + ushr v1.4s,v20.4s,#20 + add w14,w14,w20 + ushr v5.4s,v21.4s,#20 + eor w10,w10,w15 + ushr v17.4s,v22.4s,#20 + eor w11,w11,w16 + sli v1.4s,v20.4s,#12 + eor w12,w12,w13 + sli v5.4s,v21.4s,#12 + eor w9,w9,w14 + sli v17.4s,v22.4s,#12 + ror w10,w10,#20 + add v0.4s,v0.4s,v1.4s + ror w11,w11,#20 + add v4.4s,v4.4s,v5.4s + ror w12,w12,#20 + add v16.4s,v16.4s,v17.4s + ror w9,w9,#20 + eor v20.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v21.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v22.16b,v19.16b,v16.16b + add w7,w7,w12 + ushr v3.4s,v20.4s,#24 + add w8,w8,w9 + ushr v7.4s,v21.4s,#24 + eor w21,w21,w5 + ushr v19.4s,v22.4s,#24 + eor w17,w17,w6 + sli v3.4s,v20.4s,#8 + eor w19,w19,w7 + sli v7.4s,v21.4s,#8 + eor w20,w20,w8 + sli v19.4s,v22.4s,#8 + ror w21,w21,#24 + add v2.4s,v2.4s,v3.4s + ror 
w17,w17,#24 + add v6.4s,v6.4s,v7.4s + ror w19,w19,#24 + add v18.4s,v18.4s,v19.4s + ror w20,w20,#24 + eor v20.16b,v1.16b,v2.16b + add w15,w15,w21 + eor v21.16b,v5.16b,v6.16b + add w16,w16,w17 + eor v22.16b,v17.16b,v18.16b + add w13,w13,w19 + ushr v1.4s,v20.4s,#25 + add w14,w14,w20 + ushr v5.4s,v21.4s,#25 + eor w10,w10,w15 + ushr v17.4s,v22.4s,#25 + eor w11,w11,w16 + sli v1.4s,v20.4s,#7 + eor w12,w12,w13 + sli v5.4s,v21.4s,#7 + eor w9,w9,w14 + sli v17.4s,v22.4s,#7 + ror w10,w10,#25 + ext v2.16b,v2.16b,v2.16b,#8 + ror w11,w11,#25 + ext v6.16b,v6.16b,v6.16b,#8 + ror w12,w12,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext v19.16b,v19.16b,v19.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + cbnz x4,Loop_neon + + add w5,w5,w22 // accumulate key block + add v0.4s,v0.4s,v24.4s + add x6,x6,x22,lsr#32 + add v4.4s,v4.4s,v24.4s + add w7,w7,w23 + add v16.4s,v16.4s,v24.4s + add x8,x8,x23,lsr#32 + add v2.4s,v2.4s,v26.4s + add w9,w9,w24 + add v6.4s,v6.4s,v26.4s + add x10,x10,x24,lsr#32 + add v18.4s,v18.4s,v26.4s + add w11,w11,w25 + add v3.4s,v3.4s,v27.4s + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add v7.4s,v7.4s,v28.4s + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add v19.4s,v19.4s,v29.4s + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add v1.4s,v1.4s,v25.4s + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add v5.4s,v5.4s,v25.4s + add x21,x21,x30,lsr#32 + add v17.4s,v17.4s,v25.4s + + b.lo Ltail_neon + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + ld1 
{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor v0.16b,v0.16b,v20.16b + eor x15,x15,x16 + eor v1.16b,v1.16b,v21.16b + eor x17,x17,x19 + eor v2.16b,v2.16b,v22.16b + eor x20,x20,x21 + eor v3.16b,v3.16b,v23.16b + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#4 // increment counter + stp x9,x11,[x0,#16] + add v27.4s,v27.4s,v31.4s // += 4 + stp x13,x15,[x0,#32] + add v28.4s,v28.4s,v31.4s + stp x17,x20,[x0,#48] + add v29.4s,v29.4s,v31.4s + add x0,x0,#64 + + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 + + eor v4.16b,v4.16b,v20.16b + eor v5.16b,v5.16b,v21.16b + eor v6.16b,v6.16b,v22.16b + eor v7.16b,v7.16b,v23.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + + eor v16.16b,v16.16b,v0.16b + eor v17.16b,v17.16b,v1.16b + eor v18.16b,v18.16b,v2.16b + eor v19.16b,v19.16b,v3.16b + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 + + b.hi Loop_outer_neon + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +Ltail_neon: + add x2,x2,#256 + cmp x2,#64 + b.lo Less_than_64 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#4 // increment counter + stp x9,x11,[x0,#16] + stp 
x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + b.eq Ldone_neon + sub x2,x2,#64 + cmp x2,#64 + b.lo Less_than_128 + + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor v0.16b,v0.16b,v20.16b + eor v1.16b,v1.16b,v21.16b + eor v2.16b,v2.16b,v22.16b + eor v3.16b,v3.16b,v23.16b + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + b.eq Ldone_neon + sub x2,x2,#64 + cmp x2,#64 + b.lo Less_than_192 + + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor v4.16b,v4.16b,v20.16b + eor v5.16b,v5.16b,v21.16b + eor v6.16b,v6.16b,v22.16b + eor v7.16b,v7.16b,v23.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + b.eq Ldone_neon + sub x2,x2,#64 + + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] + b Last_neon + +Less_than_128: + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp] + b Last_neon +Less_than_192: + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp] + b Last_neon + +.align 4 +Last_neon: + sub x0,x0,#1 + add x1,x1,x2 + add x0,x0,x2 + add x4,sp,x2 + neg x2,x2 + +Loop_tail_neon: + ldrb w10,[x1,x2] + ldrb w11,[x4,x2] + add x2,x2,#1 + eor w10,w10,w11 + strb w10,[x0,x2] + cbnz x2,Loop_tail_neon + + stp xzr,xzr,[sp,#0] + stp xzr,xzr,[sp,#16] + stp xzr,xzr,[sp,#32] + stp xzr,xzr,[sp,#48] + +Ldone_neon: + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +.align 5 +ChaCha20_512_neon: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + + adrp x5,Lsigma@PAGE + add x5,x5,Lsigma@PAGEOFF + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + +L512_or_more_neon: + sub sp,sp,#128+64 + + ldp x22,x23,[x5] // load sigma + ld1 {v24.4s},[x5],#16 + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ld1 {v25.4s,v26.4s},[x3] + ldp x28,x30,[x4] // load counter + ld1 {v27.4s},[x4] + ld1 {v31.4s},[x5] +#ifdef __AARCH64EB__ + rev64 v24.4s,v24.4s + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + add v27.4s,v27.4s,v31.4s // += 1 + stp q24,q25,[sp,#0] // off-load key block, invariant part + add v27.4s,v27.4s,v31.4s // not typo + str q26,[sp,#32] + add v28.4s,v27.4s,v31.4s + add v29.4s,v28.4s,v31.4s + add v30.4s,v29.4s,v31.4s + shl v31.4s,v31.4s,#2 // 1 -> 4 + + stp d8,d9,[sp,#128+0] // meet ABI requirements + stp d10,d11,[sp,#128+16] + stp d12,d13,[sp,#128+32] + stp d14,d15,[sp,#128+48] + + sub x2,x2,#512 // not typo + +Loop_outer_512_neon: + mov v0.16b,v24.16b + mov v4.16b,v24.16b + mov v8.16b,v24.16b + mov v12.16b,v24.16b + mov v16.16b,v24.16b + mov v20.16b,v24.16b + mov v1.16b,v25.16b + mov w5,w22 // unpack key block + mov v5.16b,v25.16b + lsr x6,x22,#32 + mov v9.16b,v25.16b + mov w7,w23 + mov v13.16b,v25.16b + lsr x8,x23,#32 + mov v17.16b,v25.16b + mov w9,w24 + mov v21.16b,v25.16b + lsr x10,x24,#32 + mov v3.16b,v27.16b + mov w11,w25 + mov v7.16b,v28.16b + lsr x12,x25,#32 + mov v11.16b,v29.16b + mov w13,w26 + mov v15.16b,v30.16b + lsr x14,x26,#32 + mov v2.16b,v26.16b + mov w15,w27 + mov v6.16b,v26.16b + lsr x16,x27,#32 + add v19.4s,v3.4s,v31.4s // +4 + mov w17,w28 + add v23.4s,v7.4s,v31.4s // +4 + lsr x19,x28,#32 + mov v10.16b,v26.16b + mov w20,w30 + mov v14.16b,v26.16b + lsr x21,x30,#32 + mov v18.16b,v26.16b + stp q27,q28,[sp,#48] // off-load key block, variable part + mov v22.16b,v26.16b + str q29,[sp,#80] + + mov x4,#5 + subs x2,x2,#512 +Loop_upper_neon: + sub 
x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add 
v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext 
v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v11.16b,v11.16b,v11.16b,#12 + ext v15.16b,v15.16b,v15.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v23.16b,v23.16b,v23.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v9.16b,v9.16b,v9.16b,#4 + ext v13.16b,v13.16b,v13.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + ext v21.16b,v21.16b,v21.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror 
w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr 
v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext v11.16b,v11.16b,v11.16b,#4 + ext v15.16b,v15.16b,v15.16b,#4 + ext v19.16b,v19.16b,v19.16b,#4 + ext v23.16b,v23.16b,v23.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v9.16b,v9.16b,v9.16b,#12 + ext v13.16b,v13.16b,v13.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + ext v21.16b,v21.16b,v21.16b,#12 + cbnz x4,Loop_upper_neon + + add w5,w5,w22 // accumulate key block + add x6,x6,x22,lsr#32 + add w7,w7,w23 + add x8,x8,x23,lsr#32 + add w9,w9,w24 + add x10,x10,x24,lsr#32 + add w11,w11,w25 + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add x21,x21,x30,lsr#32 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor 
x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#1 // increment counter + mov w5,w22 // unpack key block + lsr x6,x22,#32 + stp x9,x11,[x0,#16] + mov w7,w23 + lsr x8,x23,#32 + stp x13,x15,[x0,#32] + mov w9,w24 + lsr x10,x24,#32 + stp x17,x20,[x0,#48] + add x0,x0,#64 + mov w11,w25 + lsr x12,x25,#32 + mov w13,w26 + lsr x14,x26,#32 + mov w15,w27 + lsr x16,x27,#32 + mov w17,w28 + lsr x19,x28,#32 + mov w20,w30 + lsr x21,x30,#32 + + mov x4,#5 +Loop_lower_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + 
ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + 
ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v11.16b,v11.16b,v11.16b,#12 + ext v15.16b,v15.16b,v15.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v23.16b,v23.16b,v23.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v9.16b,v9.16b,v9.16b,#4 + ext v13.16b,v13.16b,v13.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + ext v21.16b,v21.16b,v21.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add 
v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + 
eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext v11.16b,v11.16b,v11.16b,#4 + ext v15.16b,v15.16b,v15.16b,#4 + ext v19.16b,v19.16b,v19.16b,#4 + ext v23.16b,v23.16b,v23.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v9.16b,v9.16b,v9.16b,#12 + ext v13.16b,v13.16b,v13.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + ext v21.16b,v21.16b,v21.16b,#12 + cbnz x4,Loop_lower_neon + + add w5,w5,w22 // accumulate key block + ldp q24,q25,[sp,#0] + add x6,x6,x22,lsr#32 + ldp q26,q27,[sp,#32] + add w7,w7,w23 + ldp q28,q29,[sp,#64] + add x8,x8,x23,lsr#32 + add v0.4s,v0.4s,v24.4s + add w9,w9,w24 + add v4.4s,v4.4s,v24.4s + add x10,x10,x24,lsr#32 + add v8.4s,v8.4s,v24.4s 
+ add w11,w11,w25 + add v12.4s,v12.4s,v24.4s + add x12,x12,x25,lsr#32 + add v16.4s,v16.4s,v24.4s + add w13,w13,w26 + add v20.4s,v20.4s,v24.4s + add x14,x14,x26,lsr#32 + add v2.4s,v2.4s,v26.4s + add w15,w15,w27 + add v6.4s,v6.4s,v26.4s + add x16,x16,x27,lsr#32 + add v10.4s,v10.4s,v26.4s + add w17,w17,w28 + add v14.4s,v14.4s,v26.4s + add x19,x19,x28,lsr#32 + add v18.4s,v18.4s,v26.4s + add w20,w20,w30 + add v22.4s,v22.4s,v26.4s + add x21,x21,x30,lsr#32 + add v19.4s,v19.4s,v31.4s // +4 + add x5,x5,x6,lsl#32 // pack + add v23.4s,v23.4s,v31.4s // +4 + add x7,x7,x8,lsl#32 + add v3.4s,v3.4s,v27.4s + ldp x6,x8,[x1,#0] // load input + add v7.4s,v7.4s,v28.4s + add x9,x9,x10,lsl#32 + add v11.4s,v11.4s,v29.4s + add x11,x11,x12,lsl#32 + add v15.4s,v15.4s,v30.4s + ldp x10,x12,[x1,#16] + add v19.4s,v19.4s,v27.4s + add x13,x13,x14,lsl#32 + add v23.4s,v23.4s,v28.4s + add x15,x15,x16,lsl#32 + add v1.4s,v1.4s,v25.4s + ldp x14,x16,[x1,#32] + add v5.4s,v5.4s,v25.4s + add x17,x17,x19,lsl#32 + add v9.4s,v9.4s,v25.4s + add x20,x20,x21,lsl#32 + add v13.4s,v13.4s,v25.4s + ldp x19,x21,[x1,#48] + add v17.4s,v17.4s,v25.4s + add x1,x1,#64 + add v21.4s,v21.4s,v25.4s + +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor v0.16b,v0.16b,v24.16b + eor x15,x15,x16 + eor v1.16b,v1.16b,v25.16b + eor x17,x17,x19 + eor v2.16b,v2.16b,v26.16b + eor x20,x20,x21 + eor v3.16b,v3.16b,v27.16b + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#7 // increment counter + stp x9,x11,[x0,#16] + stp x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + + ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 + eor v4.16b,v4.16b,v24.16b + eor v5.16b,v5.16b,v25.16b + eor v6.16b,v6.16b,v26.16b + eor 
v7.16b,v7.16b,v27.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + eor v8.16b,v8.16b,v0.16b + ldp q24,q25,[sp,#0] + eor v9.16b,v9.16b,v1.16b + ldp q26,q27,[sp,#32] + eor v10.16b,v10.16b,v2.16b + eor v11.16b,v11.16b,v3.16b + st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 + + ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 + eor v12.16b,v12.16b,v4.16b + eor v13.16b,v13.16b,v5.16b + eor v14.16b,v14.16b,v6.16b + eor v15.16b,v15.16b,v7.16b + st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 + + ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 + eor v16.16b,v16.16b,v8.16b + eor v17.16b,v17.16b,v9.16b + eor v18.16b,v18.16b,v10.16b + eor v19.16b,v19.16b,v11.16b + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 + + shl v0.4s,v31.4s,#1 // 4 -> 8 + eor v20.16b,v20.16b,v12.16b + eor v21.16b,v21.16b,v13.16b + eor v22.16b,v22.16b,v14.16b + eor v23.16b,v23.16b,v15.16b + st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 + + add v27.4s,v27.4s,v0.4s // += 8 + add v28.4s,v28.4s,v0.4s + add v29.4s,v29.4s,v0.4s + add v30.4s,v30.4s,v0.4s + + b.hs Loop_outer_512_neon + + adds x2,x2,#512 + ushr v0.4s,v31.4s,#2 // 4 -> 1 + + ldp d8,d9,[sp,#128+0] // meet ABI requirements + ldp d10,d11,[sp,#128+16] + ldp d12,d13,[sp,#128+32] + ldp d14,d15,[sp,#128+48] + + stp q24,q31,[sp,#0] // wipe off-load area + stp q24,q31,[sp,#32] + stp q24,q31,[sp,#64] + + b.eq Ldone_512_neon + + cmp x2,#192 + sub v27.4s,v27.4s,v0.4s // -= 1 + sub v28.4s,v28.4s,v0.4s + sub v29.4s,v29.4s,v0.4s + add sp,sp,#128 + b.hs Loop_outer_neon + + eor v25.16b,v25.16b,v25.16b + eor v26.16b,v26.16b,v26.16b + eor v27.16b,v27.16b,v27.16b + eor v28.16b,v28.16b,v28.16b + eor v29.16b,v29.16b,v29.16b + eor v30.16b,v30.16b,v30.16b + b Loop_outer + +Ldone_512_neon: + ldp x19,x20,[x29,#16] + add sp,sp,#128+64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +#endif // !OPENSSL_NO_ASM && 
defined(OPENSSL_AARCH64) && defined(__APPLE__) diff --git a/third_party/boringssl/gen/crypto/chacha-armv8-linux.S b/third_party/boringssl/gen/crypto/chacha-armv8-linux.S new file mode 100644 index 00000000..6ecff537 --- /dev/null +++ b/third_party/boringssl/gen/crypto/chacha-armv8-linux.S @@ -0,0 +1,1966 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include <openssl/asm_base.h> + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) +.section .rodata + +.align 5 +.Lsigma: +.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral +.Lone: +.long 1,0,0,0 +.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 + +.text + +.globl ChaCha20_ctr32_nohw +.hidden ChaCha20_ctr32_nohw +.type ChaCha20_ctr32_nohw,%function +.align 5 +ChaCha20_ctr32_nohw: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + + adrp x5,.Lsigma + add x5,x5,:lo12:.Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#64 + + ldp x22,x23,[x5] // load sigma + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ldp x28,x30,[x4] // load counter +#ifdef __AARCH64EB__ + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + +.Loop_outer: + mov w5,w22 // unpack key block + lsr x6,x22,#32 + mov w7,w23 + lsr x8,x23,#32 + mov w9,w24 + lsr x10,x24,#32 + mov w11,w25 + lsr x12,x25,#32 + mov w13,w26 + lsr x14,x26,#32 + mov w15,w27 + lsr x16,x27,#32 + mov w17,w28 + lsr x19,x28,#32 + mov w20,w30 + lsr x21,x30,#32 + + mov x4,#10 + subs x2,x2,#64 +.Loop: + sub x4,x4,#1 + add w5,w5,w9 + add w6,w6,w10 + add w7,w7,w11 + add w8,w8,w12 + eor w17,w17,w5 + eor w19,w19,w6 + eor w20,w20,w7 + eor w21,w21,w8 + ror w17,w17,#16 + ror w19,w19,#16 + ror w20,w20,#16 + ror w21,w21,#16 + add w13,w13,w17 + add w14,w14,w19 + add w15,w15,w20 + add w16,w16,w21 + eor w9,w9,w13 + eor w10,w10,w14 + eor w11,w11,w15 + eor w12,w12,w16 + ror w9,w9,#20 + ror w10,w10,#20 + ror w11,w11,#20 + ror w12,w12,#20 + add w5,w5,w9 + add w6,w6,w10 + add w7,w7,w11 + add w8,w8,w12 + eor w17,w17,w5 + eor w19,w19,w6 + eor w20,w20,w7 + eor w21,w21,w8 + ror w17,w17,#24 + ror w19,w19,#24 + ror w20,w20,#24 + ror w21,w21,#24 + add w13,w13,w17 + add w14,w14,w19 + add w15,w15,w20 + add w16,w16,w21 + eor w9,w9,w13 + eor w10,w10,w14 + eor w11,w11,w15 + eor w12,w12,w16 + ror w9,w9,#25 + ror w10,w10,#25 + ror w11,w11,#25 + ror w12,w12,#25 + add w5,w5,w10 + add w6,w6,w11 + add w7,w7,w12 + add w8,w8,w9 + eor w21,w21,w5 + eor w17,w17,w6 + eor w19,w19,w7 + eor w20,w20,w8 + ror w21,w21,#16 + ror w17,w17,#16 + ror w19,w19,#16 + ror w20,w20,#16 + add w15,w15,w21 + add w16,w16,w17 + add w13,w13,w19 + add w14,w14,w20 + eor w10,w10,w15 + eor w11,w11,w16 + eor w12,w12,w13 + eor w9,w9,w14 + ror w10,w10,#20 + ror 
w11,w11,#20 + ror w12,w12,#20 + ror w9,w9,#20 + add w5,w5,w10 + add w6,w6,w11 + add w7,w7,w12 + add w8,w8,w9 + eor w21,w21,w5 + eor w17,w17,w6 + eor w19,w19,w7 + eor w20,w20,w8 + ror w21,w21,#24 + ror w17,w17,#24 + ror w19,w19,#24 + ror w20,w20,#24 + add w15,w15,w21 + add w16,w16,w17 + add w13,w13,w19 + add w14,w14,w20 + eor w10,w10,w15 + eor w11,w11,w16 + eor w12,w12,w13 + eor w9,w9,w14 + ror w10,w10,#25 + ror w11,w11,#25 + ror w12,w12,#25 + ror w9,w9,#25 + cbnz x4,.Loop + + add w5,w5,w22 // accumulate key block + add x6,x6,x22,lsr#32 + add w7,w7,w23 + add x8,x8,x23,lsr#32 + add w9,w9,w24 + add x10,x10,x24,lsr#32 + add w11,w11,w25 + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add x21,x21,x30,lsr#32 + + b.lo .Ltail + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#1 // increment counter + stp x9,x11,[x0,#16] + stp x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + + b.hi .Loop_outer + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.align 4 +.Ltail: + add x2,x2,#64 +.Less_than_64: + sub x0,x0,#1 + add x1,x1,x2 + add x0,x0,x2 + add x4,sp,x2 + neg x2,x2 + + add x5,x5,x6,lsl#32 // pack + add 
x7,x7,x8,lsl#32 + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + stp x5,x7,[sp,#0] + stp x9,x11,[sp,#16] + stp x13,x15,[sp,#32] + stp x17,x20,[sp,#48] + +.Loop_tail: + ldrb w10,[x1,x2] + ldrb w11,[x4,x2] + add x2,x2,#1 + eor w10,w10,w11 + strb w10,[x0,x2] + cbnz x2,.Loop_tail + + stp xzr,xzr,[sp,#0] + stp xzr,xzr,[sp,#16] + stp xzr,xzr,[sp,#32] + stp xzr,xzr,[sp,#48] + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw + +.globl ChaCha20_ctr32_neon +.hidden ChaCha20_ctr32_neon +.type ChaCha20_ctr32_neon,%function +.align 5 +ChaCha20_ctr32_neon: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + + adrp x5,.Lsigma + add x5,x5,:lo12:.Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + cmp x2,#512 + b.hs .L512_or_more_neon + + sub sp,sp,#64 + + ldp x22,x23,[x5] // load sigma + ld1 {v24.4s},[x5],#16 + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ld1 {v25.4s,v26.4s},[x3] + ldp x28,x30,[x4] // load counter + ld1 {v27.4s},[x4] + ld1 {v31.4s},[x5] +#ifdef __AARCH64EB__ + rev64 v24.4s,v24.4s + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + add v27.4s,v27.4s,v31.4s // += 1 + add v28.4s,v27.4s,v31.4s + add v29.4s,v28.4s,v31.4s + shl v31.4s,v31.4s,#2 // 1 -> 4 + +.Loop_outer_neon: + mov w5,w22 // unpack key block + lsr x6,x22,#32 + mov v0.16b,v24.16b + mov w7,w23 + lsr x8,x23,#32 + mov v4.16b,v24.16b + mov w9,w24 + lsr x10,x24,#32 + mov v16.16b,v24.16b + mov w11,w25 + mov v1.16b,v25.16b + lsr x12,x25,#32 + mov v5.16b,v25.16b + mov w13,w26 + mov v17.16b,v25.16b + lsr x14,x26,#32 + mov v3.16b,v27.16b + mov w15,w27 + mov v7.16b,v28.16b + lsr x16,x27,#32 + mov v19.16b,v29.16b + mov w17,w28 + mov v2.16b,v26.16b + lsr x19,x28,#32 + mov v6.16b,v26.16b + mov w20,w30 + mov v18.16b,v26.16b + lsr x21,x30,#32 + + mov x4,#10 + subs x2,x2,#256 +.Loop_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v16.4s,v16.4s,v17.4s + add w7,w7,w11 + eor v3.16b,v3.16b,v0.16b + add w8,w8,w12 + eor v7.16b,v7.16b,v4.16b + eor w17,w17,w5 + eor v19.16b,v19.16b,v16.16b + eor w19,w19,w6 + rev32 v3.8h,v3.8h + eor w20,w20,w7 + rev32 v7.8h,v7.8h + eor w21,w21,w8 + rev32 v19.8h,v19.8h + ror w17,w17,#16 + add v2.4s,v2.4s,v3.4s + ror w19,w19,#16 + add v6.4s,v6.4s,v7.4s + ror w20,w20,#16 + add v18.4s,v18.4s,v19.4s + ror w21,w21,#16 + eor v20.16b,v1.16b,v2.16b + add w13,w13,w17 + eor v21.16b,v5.16b,v6.16b + add w14,w14,w19 + eor v22.16b,v17.16b,v18.16b + add w15,w15,w20 + ushr 
v1.4s,v20.4s,#20 + add w16,w16,w21 + ushr v5.4s,v21.4s,#20 + eor w9,w9,w13 + ushr v17.4s,v22.4s,#20 + eor w10,w10,w14 + sli v1.4s,v20.4s,#12 + eor w11,w11,w15 + sli v5.4s,v21.4s,#12 + eor w12,w12,w16 + sli v17.4s,v22.4s,#12 + ror w9,w9,#20 + add v0.4s,v0.4s,v1.4s + ror w10,w10,#20 + add v4.4s,v4.4s,v5.4s + ror w11,w11,#20 + add v16.4s,v16.4s,v17.4s + ror w12,w12,#20 + eor v20.16b,v3.16b,v0.16b + add w5,w5,w9 + eor v21.16b,v7.16b,v4.16b + add w6,w6,w10 + eor v22.16b,v19.16b,v16.16b + add w7,w7,w11 + ushr v3.4s,v20.4s,#24 + add w8,w8,w12 + ushr v7.4s,v21.4s,#24 + eor w17,w17,w5 + ushr v19.4s,v22.4s,#24 + eor w19,w19,w6 + sli v3.4s,v20.4s,#8 + eor w20,w20,w7 + sli v7.4s,v21.4s,#8 + eor w21,w21,w8 + sli v19.4s,v22.4s,#8 + ror w17,w17,#24 + add v2.4s,v2.4s,v3.4s + ror w19,w19,#24 + add v6.4s,v6.4s,v7.4s + ror w20,w20,#24 + add v18.4s,v18.4s,v19.4s + ror w21,w21,#24 + eor v20.16b,v1.16b,v2.16b + add w13,w13,w17 + eor v21.16b,v5.16b,v6.16b + add w14,w14,w19 + eor v22.16b,v17.16b,v18.16b + add w15,w15,w20 + ushr v1.4s,v20.4s,#25 + add w16,w16,w21 + ushr v5.4s,v21.4s,#25 + eor w9,w9,w13 + ushr v17.4s,v22.4s,#25 + eor w10,w10,w14 + sli v1.4s,v20.4s,#7 + eor w11,w11,w15 + sli v5.4s,v21.4s,#7 + eor w12,w12,w16 + sli v17.4s,v22.4s,#7 + ror w9,w9,#25 + ext v2.16b,v2.16b,v2.16b,#8 + ror w10,w10,#25 + ext v6.16b,v6.16b,v6.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w10 + add v4.4s,v4.4s,v5.4s + add w6,w6,w11 + add v16.4s,v16.4s,v17.4s + add w7,w7,w12 + eor v3.16b,v3.16b,v0.16b + add w8,w8,w9 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w5 + eor v19.16b,v19.16b,v16.16b + eor w17,w17,w6 + rev32 v3.8h,v3.8h + eor w19,w19,w7 + rev32 v7.8h,v7.8h + eor w20,w20,w8 + rev32 v19.8h,v19.8h + ror w21,w21,#16 + add v2.4s,v2.4s,v3.4s + ror 
w17,w17,#16 + add v6.4s,v6.4s,v7.4s + ror w19,w19,#16 + add v18.4s,v18.4s,v19.4s + ror w20,w20,#16 + eor v20.16b,v1.16b,v2.16b + add w15,w15,w21 + eor v21.16b,v5.16b,v6.16b + add w16,w16,w17 + eor v22.16b,v17.16b,v18.16b + add w13,w13,w19 + ushr v1.4s,v20.4s,#20 + add w14,w14,w20 + ushr v5.4s,v21.4s,#20 + eor w10,w10,w15 + ushr v17.4s,v22.4s,#20 + eor w11,w11,w16 + sli v1.4s,v20.4s,#12 + eor w12,w12,w13 + sli v5.4s,v21.4s,#12 + eor w9,w9,w14 + sli v17.4s,v22.4s,#12 + ror w10,w10,#20 + add v0.4s,v0.4s,v1.4s + ror w11,w11,#20 + add v4.4s,v4.4s,v5.4s + ror w12,w12,#20 + add v16.4s,v16.4s,v17.4s + ror w9,w9,#20 + eor v20.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v21.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v22.16b,v19.16b,v16.16b + add w7,w7,w12 + ushr v3.4s,v20.4s,#24 + add w8,w8,w9 + ushr v7.4s,v21.4s,#24 + eor w21,w21,w5 + ushr v19.4s,v22.4s,#24 + eor w17,w17,w6 + sli v3.4s,v20.4s,#8 + eor w19,w19,w7 + sli v7.4s,v21.4s,#8 + eor w20,w20,w8 + sli v19.4s,v22.4s,#8 + ror w21,w21,#24 + add v2.4s,v2.4s,v3.4s + ror w17,w17,#24 + add v6.4s,v6.4s,v7.4s + ror w19,w19,#24 + add v18.4s,v18.4s,v19.4s + ror w20,w20,#24 + eor v20.16b,v1.16b,v2.16b + add w15,w15,w21 + eor v21.16b,v5.16b,v6.16b + add w16,w16,w17 + eor v22.16b,v17.16b,v18.16b + add w13,w13,w19 + ushr v1.4s,v20.4s,#25 + add w14,w14,w20 + ushr v5.4s,v21.4s,#25 + eor w10,w10,w15 + ushr v17.4s,v22.4s,#25 + eor w11,w11,w16 + sli v1.4s,v20.4s,#7 + eor w12,w12,w13 + sli v5.4s,v21.4s,#7 + eor w9,w9,w14 + sli v17.4s,v22.4s,#7 + ror w10,w10,#25 + ext v2.16b,v2.16b,v2.16b,#8 + ror w11,w11,#25 + ext v6.16b,v6.16b,v6.16b,#8 + ror w12,w12,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext v19.16b,v19.16b,v19.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + cbnz x4,.Loop_neon + + add w5,w5,w22 // accumulate key block + add v0.4s,v0.4s,v24.4s + add x6,x6,x22,lsr#32 + add v4.4s,v4.4s,v24.4s + add w7,w7,w23 + add 
v16.4s,v16.4s,v24.4s + add x8,x8,x23,lsr#32 + add v2.4s,v2.4s,v26.4s + add w9,w9,w24 + add v6.4s,v6.4s,v26.4s + add x10,x10,x24,lsr#32 + add v18.4s,v18.4s,v26.4s + add w11,w11,w25 + add v3.4s,v3.4s,v27.4s + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add v7.4s,v7.4s,v28.4s + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add v19.4s,v19.4s,v29.4s + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add v1.4s,v1.4s,v25.4s + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add v5.4s,v5.4s,v25.4s + add x21,x21,x30,lsr#32 + add v17.4s,v17.4s,v25.4s + + b.lo .Ltail_neon + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor v0.16b,v0.16b,v20.16b + eor x15,x15,x16 + eor v1.16b,v1.16b,v21.16b + eor x17,x17,x19 + eor v2.16b,v2.16b,v22.16b + eor x20,x20,x21 + eor v3.16b,v3.16b,v23.16b + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#4 // increment counter + stp x9,x11,[x0,#16] + add v27.4s,v27.4s,v31.4s // += 4 + stp x13,x15,[x0,#32] + add v28.4s,v28.4s,v31.4s + stp x17,x20,[x0,#48] + add v29.4s,v29.4s,v31.4s + add x0,x0,#64 + + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 + + eor v4.16b,v4.16b,v20.16b + eor v5.16b,v5.16b,v21.16b + eor v6.16b,v6.16b,v22.16b + eor v7.16b,v7.16b,v23.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + + eor v16.16b,v16.16b,v0.16b + eor v17.16b,v17.16b,v1.16b + eor v18.16b,v18.16b,v2.16b + eor v19.16b,v19.16b,v3.16b + st1 
{v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 + + b.hi .Loop_outer_neon + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.Ltail_neon: + add x2,x2,#256 + cmp x2,#64 + b.lo .Less_than_64 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#4 // increment counter + stp x9,x11,[x0,#16] + stp x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + b.eq .Ldone_neon + sub x2,x2,#64 + cmp x2,#64 + b.lo .Less_than_128 + + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor v0.16b,v0.16b,v20.16b + eor v1.16b,v1.16b,v21.16b + eor v2.16b,v2.16b,v22.16b + eor v3.16b,v3.16b,v23.16b + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + b.eq .Ldone_neon + sub x2,x2,#64 + cmp x2,#64 + b.lo .Less_than_192 + + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor v4.16b,v4.16b,v20.16b + eor v5.16b,v5.16b,v21.16b + eor v6.16b,v6.16b,v22.16b + eor v7.16b,v7.16b,v23.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + b.eq .Ldone_neon + sub x2,x2,#64 + + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] + b .Last_neon + +.Less_than_128: + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp] + b .Last_neon +.Less_than_192: + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp] + b .Last_neon + +.align 4 +.Last_neon: + sub x0,x0,#1 + add x1,x1,x2 + add x0,x0,x2 + add x4,sp,x2 + neg x2,x2 + 
+.Loop_tail_neon: + ldrb w10,[x1,x2] + ldrb w11,[x4,x2] + add x2,x2,#1 + eor w10,w10,w11 + strb w10,[x0,x2] + cbnz x2,.Loop_tail_neon + + stp xzr,xzr,[sp,#0] + stp xzr,xzr,[sp,#16] + stp xzr,xzr,[sp,#32] + stp xzr,xzr,[sp,#48] + +.Ldone_neon: + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ChaCha20_ctr32_neon,.-ChaCha20_ctr32_neon +.type ChaCha20_512_neon,%function +.align 5 +ChaCha20_512_neon: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + + adrp x5,.Lsigma + add x5,x5,:lo12:.Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + +.L512_or_more_neon: + sub sp,sp,#128+64 + + ldp x22,x23,[x5] // load sigma + ld1 {v24.4s},[x5],#16 + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ld1 {v25.4s,v26.4s},[x3] + ldp x28,x30,[x4] // load counter + ld1 {v27.4s},[x4] + ld1 {v31.4s},[x5] +#ifdef __AARCH64EB__ + rev64 v24.4s,v24.4s + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + add v27.4s,v27.4s,v31.4s // += 1 + stp q24,q25,[sp,#0] // off-load key block, invariant part + add v27.4s,v27.4s,v31.4s // not typo + str q26,[sp,#32] + add v28.4s,v27.4s,v31.4s + add v29.4s,v28.4s,v31.4s + add v30.4s,v29.4s,v31.4s + shl v31.4s,v31.4s,#2 // 1 -> 4 + + stp d8,d9,[sp,#128+0] // meet ABI requirements + stp d10,d11,[sp,#128+16] + stp d12,d13,[sp,#128+32] + stp d14,d15,[sp,#128+48] + + sub x2,x2,#512 // not typo + +.Loop_outer_512_neon: + mov v0.16b,v24.16b + mov v4.16b,v24.16b + mov v8.16b,v24.16b + mov v12.16b,v24.16b + mov v16.16b,v24.16b + mov v20.16b,v24.16b + mov v1.16b,v25.16b + mov w5,w22 // unpack key block + mov v5.16b,v25.16b + lsr x6,x22,#32 + mov v9.16b,v25.16b + mov w7,w23 + mov v13.16b,v25.16b + lsr x8,x23,#32 + mov v17.16b,v25.16b + mov w9,w24 + mov 
v21.16b,v25.16b + lsr x10,x24,#32 + mov v3.16b,v27.16b + mov w11,w25 + mov v7.16b,v28.16b + lsr x12,x25,#32 + mov v11.16b,v29.16b + mov w13,w26 + mov v15.16b,v30.16b + lsr x14,x26,#32 + mov v2.16b,v26.16b + mov w15,w27 + mov v6.16b,v26.16b + lsr x16,x27,#32 + add v19.4s,v3.4s,v31.4s // +4 + mov w17,w28 + add v23.4s,v7.4s,v31.4s // +4 + lsr x19,x28,#32 + mov v10.16b,v26.16b + mov w20,w30 + mov v14.16b,v26.16b + lsr x21,x30,#32 + mov v18.16b,v26.16b + stp q27,q28,[sp,#48] // off-load key block, variable part + mov v22.16b,v26.16b + str q29,[sp,#80] + + mov x4,#5 + subs x2,x2,#512 +.Loop_upper_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + 
ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr 
v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v11.16b,v11.16b,v11.16b,#12 + ext v15.16b,v15.16b,v15.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v23.16b,v23.16b,v23.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v9.16b,v9.16b,v9.16b,#4 + ext v13.16b,v13.16b,v13.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + ext v21.16b,v21.16b,v21.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add 
v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + 
eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext v11.16b,v11.16b,v11.16b,#4 + ext v15.16b,v15.16b,v15.16b,#4 + ext v19.16b,v19.16b,v19.16b,#4 + ext v23.16b,v23.16b,v23.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v9.16b,v9.16b,v9.16b,#12 + ext v13.16b,v13.16b,v13.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + ext v21.16b,v21.16b,v21.16b,#12 + cbnz x4,.Loop_upper_neon + + add w5,w5,w22 // accumulate key block + add x6,x6,x22,lsr#32 + add w7,w7,w23 + add x8,x8,x23,lsr#32 + add w9,w9,w24 + add x10,x10,x24,lsr#32 + add w11,w11,w25 + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add x14,x14,x26,lsr#32 + add 
w15,w15,w27 + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add x21,x21,x30,lsr#32 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#1 // increment counter + mov w5,w22 // unpack key block + lsr x6,x22,#32 + stp x9,x11,[x0,#16] + mov w7,w23 + lsr x8,x23,#32 + stp x13,x15,[x0,#32] + mov w9,w24 + lsr x10,x24,#32 + stp x17,x20,[x0,#48] + add x0,x0,#64 + mov w11,w25 + lsr x12,x25,#32 + mov w13,w26 + lsr x14,x26,#32 + mov w15,w27 + lsr x16,x27,#32 + mov w17,w28 + lsr x19,x28,#32 + mov w20,w30 + lsr x21,x30,#32 + + mov x4,#5 +.Loop_lower_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 
+ add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli 
v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v11.16b,v11.16b,v11.16b,#12 + ext v15.16b,v15.16b,v15.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v23.16b,v23.16b,v23.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v9.16b,v9.16b,v9.16b,#4 + ext v13.16b,v13.16b,v13.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + ext v21.16b,v21.16b,v21.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + 
eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor 
v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext 
v11.16b,v11.16b,v11.16b,#4 + ext v15.16b,v15.16b,v15.16b,#4 + ext v19.16b,v19.16b,v19.16b,#4 + ext v23.16b,v23.16b,v23.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v9.16b,v9.16b,v9.16b,#12 + ext v13.16b,v13.16b,v13.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + ext v21.16b,v21.16b,v21.16b,#12 + cbnz x4,.Loop_lower_neon + + add w5,w5,w22 // accumulate key block + ldp q24,q25,[sp,#0] + add x6,x6,x22,lsr#32 + ldp q26,q27,[sp,#32] + add w7,w7,w23 + ldp q28,q29,[sp,#64] + add x8,x8,x23,lsr#32 + add v0.4s,v0.4s,v24.4s + add w9,w9,w24 + add v4.4s,v4.4s,v24.4s + add x10,x10,x24,lsr#32 + add v8.4s,v8.4s,v24.4s + add w11,w11,w25 + add v12.4s,v12.4s,v24.4s + add x12,x12,x25,lsr#32 + add v16.4s,v16.4s,v24.4s + add w13,w13,w26 + add v20.4s,v20.4s,v24.4s + add x14,x14,x26,lsr#32 + add v2.4s,v2.4s,v26.4s + add w15,w15,w27 + add v6.4s,v6.4s,v26.4s + add x16,x16,x27,lsr#32 + add v10.4s,v10.4s,v26.4s + add w17,w17,w28 + add v14.4s,v14.4s,v26.4s + add x19,x19,x28,lsr#32 + add v18.4s,v18.4s,v26.4s + add w20,w20,w30 + add v22.4s,v22.4s,v26.4s + add x21,x21,x30,lsr#32 + add v19.4s,v19.4s,v31.4s // +4 + add x5,x5,x6,lsl#32 // pack + add v23.4s,v23.4s,v31.4s // +4 + add x7,x7,x8,lsl#32 + add v3.4s,v3.4s,v27.4s + ldp x6,x8,[x1,#0] // load input + add v7.4s,v7.4s,v28.4s + add x9,x9,x10,lsl#32 + add v11.4s,v11.4s,v29.4s + add x11,x11,x12,lsl#32 + add v15.4s,v15.4s,v30.4s + ldp x10,x12,[x1,#16] + add v19.4s,v19.4s,v27.4s + add x13,x13,x14,lsl#32 + add v23.4s,v23.4s,v28.4s + add x15,x15,x16,lsl#32 + add v1.4s,v1.4s,v25.4s + ldp x14,x16,[x1,#32] + add v5.4s,v5.4s,v25.4s + add x17,x17,x19,lsl#32 + add v9.4s,v9.4s,v25.4s + add x20,x20,x21,lsl#32 + add v13.4s,v13.4s,v25.4s + ldp x19,x21,[x1,#48] + add v17.4s,v17.4s,v25.4s + add x1,x1,#64 + add v21.4s,v21.4s,v25.4s + +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 + eor x5,x5,x6 + eor 
x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor v0.16b,v0.16b,v24.16b + eor x15,x15,x16 + eor v1.16b,v1.16b,v25.16b + eor x17,x17,x19 + eor v2.16b,v2.16b,v26.16b + eor x20,x20,x21 + eor v3.16b,v3.16b,v27.16b + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#7 // increment counter + stp x9,x11,[x0,#16] + stp x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + + ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 + eor v4.16b,v4.16b,v24.16b + eor v5.16b,v5.16b,v25.16b + eor v6.16b,v6.16b,v26.16b + eor v7.16b,v7.16b,v27.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + eor v8.16b,v8.16b,v0.16b + ldp q24,q25,[sp,#0] + eor v9.16b,v9.16b,v1.16b + ldp q26,q27,[sp,#32] + eor v10.16b,v10.16b,v2.16b + eor v11.16b,v11.16b,v3.16b + st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 + + ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 + eor v12.16b,v12.16b,v4.16b + eor v13.16b,v13.16b,v5.16b + eor v14.16b,v14.16b,v6.16b + eor v15.16b,v15.16b,v7.16b + st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 + + ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 + eor v16.16b,v16.16b,v8.16b + eor v17.16b,v17.16b,v9.16b + eor v18.16b,v18.16b,v10.16b + eor v19.16b,v19.16b,v11.16b + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 + + shl v0.4s,v31.4s,#1 // 4 -> 8 + eor v20.16b,v20.16b,v12.16b + eor v21.16b,v21.16b,v13.16b + eor v22.16b,v22.16b,v14.16b + eor v23.16b,v23.16b,v15.16b + st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 + + add v27.4s,v27.4s,v0.4s // += 8 + add v28.4s,v28.4s,v0.4s + add v29.4s,v29.4s,v0.4s + add v30.4s,v30.4s,v0.4s + + b.hs .Loop_outer_512_neon + + adds x2,x2,#512 + ushr v0.4s,v31.4s,#2 // 4 -> 1 + + ldp d8,d9,[sp,#128+0] // meet ABI requirements + ldp d10,d11,[sp,#128+16] + ldp d12,d13,[sp,#128+32] + ldp d14,d15,[sp,#128+48] + + stp q24,q31,[sp,#0] // wipe off-load area + stp q24,q31,[sp,#32] + stp q24,q31,[sp,#64] + + 
b.eq .Ldone_512_neon + + cmp x2,#192 + sub v27.4s,v27.4s,v0.4s // -= 1 + sub v28.4s,v28.4s,v0.4s + sub v29.4s,v29.4s,v0.4s + add sp,sp,#128 + b.hs .Loop_outer_neon + + eor v25.16b,v25.16b,v25.16b + eor v26.16b,v26.16b,v26.16b + eor v27.16b,v27.16b,v27.16b + eor v28.16b,v28.16b,v28.16b + eor v29.16b,v29.16b,v29.16b + eor v30.16b,v30.16b,v30.16b + b .Loop_outer + +.Ldone_512_neon: + ldp x19,x20,[x29,#16] + add sp,sp,#128+64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size ChaCha20_512_neon,.-ChaCha20_512_neon +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) diff --git a/third_party/boringssl/gen/crypto/chacha-armv8-win.S b/third_party/boringssl/gen/crypto/chacha-armv8-win.S new file mode 100644 index 00000000..ea1da282 --- /dev/null +++ b/third_party/boringssl/gen/crypto/chacha-armv8-win.S @@ -0,0 +1,1972 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +.section .rodata + +.align 5 +Lsigma: +.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral +Lone: +.long 1,0,0,0 +.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.align 2 + +.text + +.globl ChaCha20_ctr32_nohw + +.def ChaCha20_ctr32_nohw + .type 32 +.endef +.align 5 +ChaCha20_ctr32_nohw: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + + adrp x5,Lsigma + add x5,x5,:lo12:Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + sub sp,sp,#64 + + ldp x22,x23,[x5] // load sigma + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ldp x28,x30,[x4] // load counter +#ifdef __AARCH64EB__ + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + +Loop_outer: + mov w5,w22 // unpack key block + lsr x6,x22,#32 + mov w7,w23 + lsr x8,x23,#32 + mov w9,w24 + lsr x10,x24,#32 + mov w11,w25 + lsr x12,x25,#32 + mov w13,w26 + lsr x14,x26,#32 + mov w15,w27 + lsr x16,x27,#32 + mov w17,w28 + lsr x19,x28,#32 + mov w20,w30 + lsr x21,x30,#32 + + mov x4,#10 + subs x2,x2,#64 +Loop: + sub x4,x4,#1 + add w5,w5,w9 + add w6,w6,w10 + add w7,w7,w11 + add w8,w8,w12 + eor w17,w17,w5 + eor w19,w19,w6 + eor w20,w20,w7 + eor w21,w21,w8 + ror w17,w17,#16 + ror w19,w19,#16 + ror w20,w20,#16 + ror w21,w21,#16 + add w13,w13,w17 + add w14,w14,w19 + add w15,w15,w20 + add w16,w16,w21 + eor w9,w9,w13 + eor w10,w10,w14 + eor w11,w11,w15 + eor w12,w12,w16 + ror w9,w9,#20 + ror w10,w10,#20 + ror w11,w11,#20 + ror w12,w12,#20 + add w5,w5,w9 + add w6,w6,w10 + add w7,w7,w11 + add w8,w8,w12 + eor w17,w17,w5 + eor w19,w19,w6 + eor w20,w20,w7 + eor w21,w21,w8 + ror w17,w17,#24 + ror w19,w19,#24 + ror w20,w20,#24 + ror w21,w21,#24 + add w13,w13,w17 + add w14,w14,w19 + add w15,w15,w20 + add w16,w16,w21 + eor w9,w9,w13 + eor w10,w10,w14 + eor w11,w11,w15 + eor w12,w12,w16 + ror w9,w9,#25 + ror w10,w10,#25 + ror w11,w11,#25 + ror w12,w12,#25 + add w5,w5,w10 + add w6,w6,w11 + add w7,w7,w12 + add w8,w8,w9 + eor w21,w21,w5 + eor w17,w17,w6 + eor w19,w19,w7 + eor w20,w20,w8 + ror w21,w21,#16 + ror w17,w17,#16 + ror w19,w19,#16 + ror w20,w20,#16 + add w15,w15,w21 + add w16,w16,w17 + add w13,w13,w19 + add w14,w14,w20 + eor w10,w10,w15 + eor w11,w11,w16 + eor w12,w12,w13 + eor w9,w9,w14 + ror w10,w10,#20 + ror 
w11,w11,#20 + ror w12,w12,#20 + ror w9,w9,#20 + add w5,w5,w10 + add w6,w6,w11 + add w7,w7,w12 + add w8,w8,w9 + eor w21,w21,w5 + eor w17,w17,w6 + eor w19,w19,w7 + eor w20,w20,w8 + ror w21,w21,#24 + ror w17,w17,#24 + ror w19,w19,#24 + ror w20,w20,#24 + add w15,w15,w21 + add w16,w16,w17 + add w13,w13,w19 + add w14,w14,w20 + eor w10,w10,w15 + eor w11,w11,w16 + eor w12,w12,w13 + eor w9,w9,w14 + ror w10,w10,#25 + ror w11,w11,#25 + ror w12,w12,#25 + ror w9,w9,#25 + cbnz x4,Loop + + add w5,w5,w22 // accumulate key block + add x6,x6,x22,lsr#32 + add w7,w7,w23 + add x8,x8,x23,lsr#32 + add w9,w9,w24 + add x10,x10,x24,lsr#32 + add w11,w11,w25 + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add x21,x21,x30,lsr#32 + + b.lo Ltail + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#1 // increment counter + stp x9,x11,[x0,#16] + stp x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + + b.hi Loop_outer + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.align 4 +Ltail: + add x2,x2,#64 +Less_than_64: + sub x0,x0,#1 + add x1,x1,x2 + add x0,x0,x2 + add x4,sp,x2 + neg x2,x2 + + add x5,x5,x6,lsl#32 // pack + add 
x7,x7,x8,lsl#32 + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + stp x5,x7,[sp,#0] + stp x9,x11,[sp,#16] + stp x13,x15,[sp,#32] + stp x17,x20,[sp,#48] + +Loop_tail: + ldrb w10,[x1,x2] + ldrb w11,[x4,x2] + add x2,x2,#1 + eor w10,w10,w11 + strb w10,[x0,x2] + cbnz x2,Loop_tail + + stp xzr,xzr,[sp,#0] + stp xzr,xzr,[sp,#16] + stp xzr,xzr,[sp,#32] + stp xzr,xzr,[sp,#48] + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +.globl ChaCha20_ctr32_neon + +.def ChaCha20_ctr32_neon + .type 32 +.endef +.align 5 +ChaCha20_ctr32_neon: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! + add x29,sp,#0 + + adrp x5,Lsigma + add x5,x5,:lo12:Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + cmp x2,#512 + b.hs L512_or_more_neon + + sub sp,sp,#64 + + ldp x22,x23,[x5] // load sigma + ld1 {v24.4s},[x5],#16 + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ld1 {v25.4s,v26.4s},[x3] + ldp x28,x30,[x4] // load counter + ld1 {v27.4s},[x4] + ld1 {v31.4s},[x5] +#ifdef __AARCH64EB__ + rev64 v24.4s,v24.4s + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + add v27.4s,v27.4s,v31.4s // += 1 + add v28.4s,v27.4s,v31.4s + add v29.4s,v28.4s,v31.4s + shl v31.4s,v31.4s,#2 // 1 -> 4 + +Loop_outer_neon: + mov w5,w22 // unpack key block + lsr x6,x22,#32 + mov v0.16b,v24.16b + mov w7,w23 + lsr x8,x23,#32 + mov v4.16b,v24.16b + mov w9,w24 + lsr x10,x24,#32 + mov v16.16b,v24.16b + mov w11,w25 + mov v1.16b,v25.16b + lsr x12,x25,#32 + mov v5.16b,v25.16b + mov w13,w26 + mov 
v17.16b,v25.16b + lsr x14,x26,#32 + mov v3.16b,v27.16b + mov w15,w27 + mov v7.16b,v28.16b + lsr x16,x27,#32 + mov v19.16b,v29.16b + mov w17,w28 + mov v2.16b,v26.16b + lsr x19,x28,#32 + mov v6.16b,v26.16b + mov w20,w30 + mov v18.16b,v26.16b + lsr x21,x30,#32 + + mov x4,#10 + subs x2,x2,#256 +Loop_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v16.4s,v16.4s,v17.4s + add w7,w7,w11 + eor v3.16b,v3.16b,v0.16b + add w8,w8,w12 + eor v7.16b,v7.16b,v4.16b + eor w17,w17,w5 + eor v19.16b,v19.16b,v16.16b + eor w19,w19,w6 + rev32 v3.8h,v3.8h + eor w20,w20,w7 + rev32 v7.8h,v7.8h + eor w21,w21,w8 + rev32 v19.8h,v19.8h + ror w17,w17,#16 + add v2.4s,v2.4s,v3.4s + ror w19,w19,#16 + add v6.4s,v6.4s,v7.4s + ror w20,w20,#16 + add v18.4s,v18.4s,v19.4s + ror w21,w21,#16 + eor v20.16b,v1.16b,v2.16b + add w13,w13,w17 + eor v21.16b,v5.16b,v6.16b + add w14,w14,w19 + eor v22.16b,v17.16b,v18.16b + add w15,w15,w20 + ushr v1.4s,v20.4s,#20 + add w16,w16,w21 + ushr v5.4s,v21.4s,#20 + eor w9,w9,w13 + ushr v17.4s,v22.4s,#20 + eor w10,w10,w14 + sli v1.4s,v20.4s,#12 + eor w11,w11,w15 + sli v5.4s,v21.4s,#12 + eor w12,w12,w16 + sli v17.4s,v22.4s,#12 + ror w9,w9,#20 + add v0.4s,v0.4s,v1.4s + ror w10,w10,#20 + add v4.4s,v4.4s,v5.4s + ror w11,w11,#20 + add v16.4s,v16.4s,v17.4s + ror w12,w12,#20 + eor v20.16b,v3.16b,v0.16b + add w5,w5,w9 + eor v21.16b,v7.16b,v4.16b + add w6,w6,w10 + eor v22.16b,v19.16b,v16.16b + add w7,w7,w11 + ushr v3.4s,v20.4s,#24 + add w8,w8,w12 + ushr v7.4s,v21.4s,#24 + eor w17,w17,w5 + ushr v19.4s,v22.4s,#24 + eor w19,w19,w6 + sli v3.4s,v20.4s,#8 + eor w20,w20,w7 + sli v7.4s,v21.4s,#8 + eor w21,w21,w8 + sli v19.4s,v22.4s,#8 + ror w17,w17,#24 + add v2.4s,v2.4s,v3.4s + ror w19,w19,#24 + add v6.4s,v6.4s,v7.4s + ror w20,w20,#24 + add v18.4s,v18.4s,v19.4s + ror w21,w21,#24 + eor v20.16b,v1.16b,v2.16b + add w13,w13,w17 + eor v21.16b,v5.16b,v6.16b + add w14,w14,w19 + eor v22.16b,v17.16b,v18.16b + add w15,w15,w20 + ushr 
v1.4s,v20.4s,#25 + add w16,w16,w21 + ushr v5.4s,v21.4s,#25 + eor w9,w9,w13 + ushr v17.4s,v22.4s,#25 + eor w10,w10,w14 + sli v1.4s,v20.4s,#7 + eor w11,w11,w15 + sli v5.4s,v21.4s,#7 + eor w12,w12,w16 + sli v17.4s,v22.4s,#7 + ror w9,w9,#25 + ext v2.16b,v2.16b,v2.16b,#8 + ror w10,w10,#25 + ext v6.16b,v6.16b,v6.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w10 + add v4.4s,v4.4s,v5.4s + add w6,w6,w11 + add v16.4s,v16.4s,v17.4s + add w7,w7,w12 + eor v3.16b,v3.16b,v0.16b + add w8,w8,w9 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w5 + eor v19.16b,v19.16b,v16.16b + eor w17,w17,w6 + rev32 v3.8h,v3.8h + eor w19,w19,w7 + rev32 v7.8h,v7.8h + eor w20,w20,w8 + rev32 v19.8h,v19.8h + ror w21,w21,#16 + add v2.4s,v2.4s,v3.4s + ror w17,w17,#16 + add v6.4s,v6.4s,v7.4s + ror w19,w19,#16 + add v18.4s,v18.4s,v19.4s + ror w20,w20,#16 + eor v20.16b,v1.16b,v2.16b + add w15,w15,w21 + eor v21.16b,v5.16b,v6.16b + add w16,w16,w17 + eor v22.16b,v17.16b,v18.16b + add w13,w13,w19 + ushr v1.4s,v20.4s,#20 + add w14,w14,w20 + ushr v5.4s,v21.4s,#20 + eor w10,w10,w15 + ushr v17.4s,v22.4s,#20 + eor w11,w11,w16 + sli v1.4s,v20.4s,#12 + eor w12,w12,w13 + sli v5.4s,v21.4s,#12 + eor w9,w9,w14 + sli v17.4s,v22.4s,#12 + ror w10,w10,#20 + add v0.4s,v0.4s,v1.4s + ror w11,w11,#20 + add v4.4s,v4.4s,v5.4s + ror w12,w12,#20 + add v16.4s,v16.4s,v17.4s + ror w9,w9,#20 + eor v20.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v21.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v22.16b,v19.16b,v16.16b + add w7,w7,w12 + ushr v3.4s,v20.4s,#24 + add w8,w8,w9 + ushr v7.4s,v21.4s,#24 + eor w21,w21,w5 + ushr v19.4s,v22.4s,#24 + eor w17,w17,w6 + sli v3.4s,v20.4s,#8 + eor w19,w19,w7 + sli v7.4s,v21.4s,#8 + eor w20,w20,w8 + sli v19.4s,v22.4s,#8 + ror w21,w21,#24 + add v2.4s,v2.4s,v3.4s + ror 
w17,w17,#24 + add v6.4s,v6.4s,v7.4s + ror w19,w19,#24 + add v18.4s,v18.4s,v19.4s + ror w20,w20,#24 + eor v20.16b,v1.16b,v2.16b + add w15,w15,w21 + eor v21.16b,v5.16b,v6.16b + add w16,w16,w17 + eor v22.16b,v17.16b,v18.16b + add w13,w13,w19 + ushr v1.4s,v20.4s,#25 + add w14,w14,w20 + ushr v5.4s,v21.4s,#25 + eor w10,w10,w15 + ushr v17.4s,v22.4s,#25 + eor w11,w11,w16 + sli v1.4s,v20.4s,#7 + eor w12,w12,w13 + sli v5.4s,v21.4s,#7 + eor w9,w9,w14 + sli v17.4s,v22.4s,#7 + ror w10,w10,#25 + ext v2.16b,v2.16b,v2.16b,#8 + ror w11,w11,#25 + ext v6.16b,v6.16b,v6.16b,#8 + ror w12,w12,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext v19.16b,v19.16b,v19.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + cbnz x4,Loop_neon + + add w5,w5,w22 // accumulate key block + add v0.4s,v0.4s,v24.4s + add x6,x6,x22,lsr#32 + add v4.4s,v4.4s,v24.4s + add w7,w7,w23 + add v16.4s,v16.4s,v24.4s + add x8,x8,x23,lsr#32 + add v2.4s,v2.4s,v26.4s + add w9,w9,w24 + add v6.4s,v6.4s,v26.4s + add x10,x10,x24,lsr#32 + add v18.4s,v18.4s,v26.4s + add w11,w11,w25 + add v3.4s,v3.4s,v27.4s + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add v7.4s,v7.4s,v28.4s + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add v19.4s,v19.4s,v29.4s + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add v1.4s,v1.4s,v25.4s + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add v5.4s,v5.4s,v25.4s + add x21,x21,x30,lsr#32 + add v17.4s,v17.4s,v25.4s + + b.lo Ltail_neon + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + ld1 
{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor v0.16b,v0.16b,v20.16b + eor x15,x15,x16 + eor v1.16b,v1.16b,v21.16b + eor x17,x17,x19 + eor v2.16b,v2.16b,v22.16b + eor x20,x20,x21 + eor v3.16b,v3.16b,v23.16b + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#4 // increment counter + stp x9,x11,[x0,#16] + add v27.4s,v27.4s,v31.4s // += 4 + stp x13,x15,[x0,#32] + add v28.4s,v28.4s,v31.4s + stp x17,x20,[x0,#48] + add v29.4s,v29.4s,v31.4s + add x0,x0,#64 + + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 + + eor v4.16b,v4.16b,v20.16b + eor v5.16b,v5.16b,v21.16b + eor v6.16b,v6.16b,v22.16b + eor v7.16b,v7.16b,v23.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + + eor v16.16b,v16.16b,v0.16b + eor v17.16b,v17.16b,v1.16b + eor v18.16b,v18.16b,v2.16b + eor v19.16b,v19.16b,v3.16b + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 + + b.hi Loop_outer_neon + + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +Ltail_neon: + add x2,x2,#256 + cmp x2,#64 + b.lo Less_than_64 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#4 // increment counter + stp x9,x11,[x0,#16] + stp 
x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + b.eq Ldone_neon + sub x2,x2,#64 + cmp x2,#64 + b.lo Less_than_128 + + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor v0.16b,v0.16b,v20.16b + eor v1.16b,v1.16b,v21.16b + eor v2.16b,v2.16b,v22.16b + eor v3.16b,v3.16b,v23.16b + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + b.eq Ldone_neon + sub x2,x2,#64 + cmp x2,#64 + b.lo Less_than_192 + + ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 + eor v4.16b,v4.16b,v20.16b + eor v5.16b,v5.16b,v21.16b + eor v6.16b,v6.16b,v22.16b + eor v7.16b,v7.16b,v23.16b + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + b.eq Ldone_neon + sub x2,x2,#64 + + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] + b Last_neon + +Less_than_128: + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp] + b Last_neon +Less_than_192: + st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp] + b Last_neon + +.align 4 +Last_neon: + sub x0,x0,#1 + add x1,x1,x2 + add x0,x0,x2 + add x4,sp,x2 + neg x2,x2 + +Loop_tail_neon: + ldrb w10,[x1,x2] + ldrb w11,[x4,x2] + add x2,x2,#1 + eor w10,w10,w11 + strb w10,[x0,x2] + cbnz x2,Loop_tail_neon + + stp xzr,xzr,[sp,#0] + stp xzr,xzr,[sp,#16] + stp xzr,xzr,[sp,#32] + stp xzr,xzr,[sp,#48] + +Ldone_neon: + ldp x19,x20,[x29,#16] + add sp,sp,#64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.def ChaCha20_512_neon + .type 32 +.endef +.align 5 +ChaCha20_512_neon: + AARCH64_SIGN_LINK_REGISTER + stp x29,x30,[sp,#-96]! 
+ add x29,sp,#0 + + adrp x5,Lsigma + add x5,x5,:lo12:Lsigma + stp x19,x20,[sp,#16] + stp x21,x22,[sp,#32] + stp x23,x24,[sp,#48] + stp x25,x26,[sp,#64] + stp x27,x28,[sp,#80] + +L512_or_more_neon: + sub sp,sp,#128+64 + + ldp x22,x23,[x5] // load sigma + ld1 {v24.4s},[x5],#16 + ldp x24,x25,[x3] // load key + ldp x26,x27,[x3,#16] + ld1 {v25.4s,v26.4s},[x3] + ldp x28,x30,[x4] // load counter + ld1 {v27.4s},[x4] + ld1 {v31.4s},[x5] +#ifdef __AARCH64EB__ + rev64 v24.4s,v24.4s + ror x24,x24,#32 + ror x25,x25,#32 + ror x26,x26,#32 + ror x27,x27,#32 + ror x28,x28,#32 + ror x30,x30,#32 +#endif + add v27.4s,v27.4s,v31.4s // += 1 + stp q24,q25,[sp,#0] // off-load key block, invariant part + add v27.4s,v27.4s,v31.4s // not typo + str q26,[sp,#32] + add v28.4s,v27.4s,v31.4s + add v29.4s,v28.4s,v31.4s + add v30.4s,v29.4s,v31.4s + shl v31.4s,v31.4s,#2 // 1 -> 4 + + stp d8,d9,[sp,#128+0] // meet ABI requirements + stp d10,d11,[sp,#128+16] + stp d12,d13,[sp,#128+32] + stp d14,d15,[sp,#128+48] + + sub x2,x2,#512 // not typo + +Loop_outer_512_neon: + mov v0.16b,v24.16b + mov v4.16b,v24.16b + mov v8.16b,v24.16b + mov v12.16b,v24.16b + mov v16.16b,v24.16b + mov v20.16b,v24.16b + mov v1.16b,v25.16b + mov w5,w22 // unpack key block + mov v5.16b,v25.16b + lsr x6,x22,#32 + mov v9.16b,v25.16b + mov w7,w23 + mov v13.16b,v25.16b + lsr x8,x23,#32 + mov v17.16b,v25.16b + mov w9,w24 + mov v21.16b,v25.16b + lsr x10,x24,#32 + mov v3.16b,v27.16b + mov w11,w25 + mov v7.16b,v28.16b + lsr x12,x25,#32 + mov v11.16b,v29.16b + mov w13,w26 + mov v15.16b,v30.16b + lsr x14,x26,#32 + mov v2.16b,v26.16b + mov w15,w27 + mov v6.16b,v26.16b + lsr x16,x27,#32 + add v19.4s,v3.4s,v31.4s // +4 + mov w17,w28 + add v23.4s,v7.4s,v31.4s // +4 + lsr x19,x28,#32 + mov v10.16b,v26.16b + mov w20,w30 + mov v14.16b,v26.16b + lsr x21,x30,#32 + mov v18.16b,v26.16b + stp q27,q28,[sp,#48] // off-load key block, variable part + mov v22.16b,v26.16b + str q29,[sp,#80] + + mov x4,#5 + subs x2,x2,#512 +Loop_upper_neon: + sub x4,x4,#1 
+ add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror 
w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 
+ ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v11.16b,v11.16b,v11.16b,#12 + ext v15.16b,v15.16b,v15.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v23.16b,v23.16b,v23.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v9.16b,v9.16b,v9.16b,#4 + ext v13.16b,v13.16b,v13.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + ext v21.16b,v21.16b,v21.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror 
w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr 
v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext v11.16b,v11.16b,v11.16b,#4 + ext v15.16b,v15.16b,v15.16b,#4 + ext v19.16b,v19.16b,v19.16b,#4 + ext v23.16b,v23.16b,v23.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v9.16b,v9.16b,v9.16b,#12 + ext v13.16b,v13.16b,v13.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + ext v21.16b,v21.16b,v21.16b,#12 + cbnz x4,Loop_upper_neon + + add w5,w5,w22 // accumulate key block + add x6,x6,x22,lsr#32 + add w7,w7,w23 + add x8,x8,x23,lsr#32 + add w9,w9,w24 + add x10,x10,x24,lsr#32 + add w11,w11,w25 + add x12,x12,x25,lsr#32 + add w13,w13,w26 + add x14,x14,x26,lsr#32 + add w15,w15,w27 + add x16,x16,x27,lsr#32 + add w17,w17,w28 + add x19,x19,x28,lsr#32 + add w20,w20,w30 + add x21,x21,x30,lsr#32 + + add x5,x5,x6,lsl#32 // pack + add x7,x7,x8,lsl#32 + ldp x6,x8,[x1,#0] // load input + add x9,x9,x10,lsl#32 + add x11,x11,x12,lsl#32 + ldp x10,x12,[x1,#16] + add x13,x13,x14,lsl#32 + add x15,x15,x16,lsl#32 + ldp x14,x16,[x1,#32] + add x17,x17,x19,lsl#32 + add x20,x20,x21,lsl#32 + ldp x19,x21,[x1,#48] + add x1,x1,#64 +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + 
eor x13,x13,x14 + eor x15,x15,x16 + eor x17,x17,x19 + eor x20,x20,x21 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#1 // increment counter + mov w5,w22 // unpack key block + lsr x6,x22,#32 + stp x9,x11,[x0,#16] + mov w7,w23 + lsr x8,x23,#32 + stp x13,x15,[x0,#32] + mov w9,w24 + lsr x10,x24,#32 + stp x17,x20,[x0,#48] + add x0,x0,#64 + mov w11,w25 + lsr x12,x25,#32 + mov w13,w26 + lsr x14,x26,#32 + mov w15,w27 + lsr x16,x27,#32 + mov w17,w28 + lsr x19,x28,#32 + mov w20,w30 + lsr x21,x30,#32 + + mov x4,#5 +Loop_lower_neon: + sub x4,x4,#1 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror 
w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr 
v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#12 + ext v7.16b,v7.16b,v7.16b,#12 + ext v11.16b,v11.16b,v11.16b,#12 + ext v15.16b,v15.16b,v15.16b,#12 + ext v19.16b,v19.16b,v19.16b,#12 + ext v23.16b,v23.16b,v23.16b,#12 + ext v1.16b,v1.16b,v1.16b,#4 + ext v5.16b,v5.16b,v5.16b,#4 + ext v9.16b,v9.16b,v9.16b,#4 + ext v13.16b,v13.16b,v13.16b,#4 + ext v17.16b,v17.16b,v17.16b,#4 + ext v21.16b,v21.16b,v21.16b,#4 + add v0.4s,v0.4s,v1.4s + add w5,w5,w9 + add v4.4s,v4.4s,v5.4s + add w6,w6,w10 + add v8.4s,v8.4s,v9.4s + add w7,w7,w11 + add v12.4s,v12.4s,v13.4s + add w8,w8,w12 + add v16.4s,v16.4s,v17.4s + eor w17,w17,w5 + add v20.4s,v20.4s,v21.4s + eor w19,w19,w6 + eor v3.16b,v3.16b,v0.16b + eor w20,w20,w7 + eor v7.16b,v7.16b,v4.16b + eor w21,w21,w8 + eor v11.16b,v11.16b,v8.16b + ror w17,w17,#16 + eor v15.16b,v15.16b,v12.16b + ror w19,w19,#16 + eor v19.16b,v19.16b,v16.16b + ror w20,w20,#16 + eor v23.16b,v23.16b,v20.16b + ror w21,w21,#16 + rev32 v3.8h,v3.8h + add w13,w13,w17 + rev32 v7.8h,v7.8h + add w14,w14,w19 + rev32 v11.8h,v11.8h + add w15,w15,w20 + rev32 v15.8h,v15.8h + add w16,w16,w21 + rev32 v19.8h,v19.8h + eor w9,w9,w13 + rev32 v23.8h,v23.8h + eor w10,w10,w14 + add v2.4s,v2.4s,v3.4s + eor w11,w11,w15 + add v6.4s,v6.4s,v7.4s + eor w12,w12,w16 + add v10.4s,v10.4s,v11.4s + ror w9,w9,#20 + add 
v14.4s,v14.4s,v15.4s + ror w10,w10,#20 + add v18.4s,v18.4s,v19.4s + ror w11,w11,#20 + add v22.4s,v22.4s,v23.4s + ror w12,w12,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w9 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w10 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w11 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w12 + eor v28.16b,v17.16b,v18.16b + eor w17,w17,w5 + eor v29.16b,v21.16b,v22.16b + eor w19,w19,w6 + ushr v1.4s,v24.4s,#20 + eor w20,w20,w7 + ushr v5.4s,v25.4s,#20 + eor w21,w21,w8 + ushr v9.4s,v26.4s,#20 + ror w17,w17,#24 + ushr v13.4s,v27.4s,#20 + ror w19,w19,#24 + ushr v17.4s,v28.4s,#20 + ror w20,w20,#24 + ushr v21.4s,v29.4s,#20 + ror w21,w21,#24 + sli v1.4s,v24.4s,#12 + add w13,w13,w17 + sli v5.4s,v25.4s,#12 + add w14,w14,w19 + sli v9.4s,v26.4s,#12 + add w15,w15,w20 + sli v13.4s,v27.4s,#12 + add w16,w16,w21 + sli v17.4s,v28.4s,#12 + eor w9,w9,w13 + sli v21.4s,v29.4s,#12 + eor w10,w10,w14 + add v0.4s,v0.4s,v1.4s + eor w11,w11,w15 + add v4.4s,v4.4s,v5.4s + eor w12,w12,w16 + add v8.4s,v8.4s,v9.4s + ror w9,w9,#25 + add v12.4s,v12.4s,v13.4s + ror w10,w10,#25 + add v16.4s,v16.4s,v17.4s + ror w11,w11,#25 + add v20.4s,v20.4s,v21.4s + ror w12,w12,#25 + eor v24.16b,v3.16b,v0.16b + add w5,w5,w10 + eor v25.16b,v7.16b,v4.16b + add w6,w6,w11 + eor v26.16b,v11.16b,v8.16b + add w7,w7,w12 + eor v27.16b,v15.16b,v12.16b + add w8,w8,w9 + eor v28.16b,v19.16b,v16.16b + eor w21,w21,w5 + eor v29.16b,v23.16b,v20.16b + eor w17,w17,w6 + ushr v3.4s,v24.4s,#24 + eor w19,w19,w7 + ushr v7.4s,v25.4s,#24 + eor w20,w20,w8 + ushr v11.4s,v26.4s,#24 + ror w21,w21,#16 + ushr v15.4s,v27.4s,#24 + ror w17,w17,#16 + ushr v19.4s,v28.4s,#24 + ror w19,w19,#16 + ushr v23.4s,v29.4s,#24 + ror w20,w20,#16 + sli v3.4s,v24.4s,#8 + add w15,w15,w21 + sli v7.4s,v25.4s,#8 + add w16,w16,w17 + sli v11.4s,v26.4s,#8 + add w13,w13,w19 + sli v15.4s,v27.4s,#8 + add w14,w14,w20 + sli v19.4s,v28.4s,#8 + eor w10,w10,w15 + sli v23.4s,v29.4s,#8 + eor w11,w11,w16 + add v2.4s,v2.4s,v3.4s + eor w12,w12,w13 + add v6.4s,v6.4s,v7.4s + 
eor w9,w9,w14 + add v10.4s,v10.4s,v11.4s + ror w10,w10,#20 + add v14.4s,v14.4s,v15.4s + ror w11,w11,#20 + add v18.4s,v18.4s,v19.4s + ror w12,w12,#20 + add v22.4s,v22.4s,v23.4s + ror w9,w9,#20 + eor v24.16b,v1.16b,v2.16b + add w5,w5,w10 + eor v25.16b,v5.16b,v6.16b + add w6,w6,w11 + eor v26.16b,v9.16b,v10.16b + add w7,w7,w12 + eor v27.16b,v13.16b,v14.16b + add w8,w8,w9 + eor v28.16b,v17.16b,v18.16b + eor w21,w21,w5 + eor v29.16b,v21.16b,v22.16b + eor w17,w17,w6 + ushr v1.4s,v24.4s,#25 + eor w19,w19,w7 + ushr v5.4s,v25.4s,#25 + eor w20,w20,w8 + ushr v9.4s,v26.4s,#25 + ror w21,w21,#24 + ushr v13.4s,v27.4s,#25 + ror w17,w17,#24 + ushr v17.4s,v28.4s,#25 + ror w19,w19,#24 + ushr v21.4s,v29.4s,#25 + ror w20,w20,#24 + sli v1.4s,v24.4s,#7 + add w15,w15,w21 + sli v5.4s,v25.4s,#7 + add w16,w16,w17 + sli v9.4s,v26.4s,#7 + add w13,w13,w19 + sli v13.4s,v27.4s,#7 + add w14,w14,w20 + sli v17.4s,v28.4s,#7 + eor w10,w10,w15 + sli v21.4s,v29.4s,#7 + eor w11,w11,w16 + ext v2.16b,v2.16b,v2.16b,#8 + eor w12,w12,w13 + ext v6.16b,v6.16b,v6.16b,#8 + eor w9,w9,w14 + ext v10.16b,v10.16b,v10.16b,#8 + ror w10,w10,#25 + ext v14.16b,v14.16b,v14.16b,#8 + ror w11,w11,#25 + ext v18.16b,v18.16b,v18.16b,#8 + ror w12,w12,#25 + ext v22.16b,v22.16b,v22.16b,#8 + ror w9,w9,#25 + ext v3.16b,v3.16b,v3.16b,#4 + ext v7.16b,v7.16b,v7.16b,#4 + ext v11.16b,v11.16b,v11.16b,#4 + ext v15.16b,v15.16b,v15.16b,#4 + ext v19.16b,v19.16b,v19.16b,#4 + ext v23.16b,v23.16b,v23.16b,#4 + ext v1.16b,v1.16b,v1.16b,#12 + ext v5.16b,v5.16b,v5.16b,#12 + ext v9.16b,v9.16b,v9.16b,#12 + ext v13.16b,v13.16b,v13.16b,#12 + ext v17.16b,v17.16b,v17.16b,#12 + ext v21.16b,v21.16b,v21.16b,#12 + cbnz x4,Loop_lower_neon + + add w5,w5,w22 // accumulate key block + ldp q24,q25,[sp,#0] + add x6,x6,x22,lsr#32 + ldp q26,q27,[sp,#32] + add w7,w7,w23 + ldp q28,q29,[sp,#64] + add x8,x8,x23,lsr#32 + add v0.4s,v0.4s,v24.4s + add w9,w9,w24 + add v4.4s,v4.4s,v24.4s + add x10,x10,x24,lsr#32 + add v8.4s,v8.4s,v24.4s + add w11,w11,w25 + add 
v12.4s,v12.4s,v24.4s + add x12,x12,x25,lsr#32 + add v16.4s,v16.4s,v24.4s + add w13,w13,w26 + add v20.4s,v20.4s,v24.4s + add x14,x14,x26,lsr#32 + add v2.4s,v2.4s,v26.4s + add w15,w15,w27 + add v6.4s,v6.4s,v26.4s + add x16,x16,x27,lsr#32 + add v10.4s,v10.4s,v26.4s + add w17,w17,w28 + add v14.4s,v14.4s,v26.4s + add x19,x19,x28,lsr#32 + add v18.4s,v18.4s,v26.4s + add w20,w20,w30 + add v22.4s,v22.4s,v26.4s + add x21,x21,x30,lsr#32 + add v19.4s,v19.4s,v31.4s // +4 + add x5,x5,x6,lsl#32 // pack + add v23.4s,v23.4s,v31.4s // +4 + add x7,x7,x8,lsl#32 + add v3.4s,v3.4s,v27.4s + ldp x6,x8,[x1,#0] // load input + add v7.4s,v7.4s,v28.4s + add x9,x9,x10,lsl#32 + add v11.4s,v11.4s,v29.4s + add x11,x11,x12,lsl#32 + add v15.4s,v15.4s,v30.4s + ldp x10,x12,[x1,#16] + add v19.4s,v19.4s,v27.4s + add x13,x13,x14,lsl#32 + add v23.4s,v23.4s,v28.4s + add x15,x15,x16,lsl#32 + add v1.4s,v1.4s,v25.4s + ldp x14,x16,[x1,#32] + add v5.4s,v5.4s,v25.4s + add x17,x17,x19,lsl#32 + add v9.4s,v9.4s,v25.4s + add x20,x20,x21,lsl#32 + add v13.4s,v13.4s,v25.4s + ldp x19,x21,[x1,#48] + add v17.4s,v17.4s,v25.4s + add x1,x1,#64 + add v21.4s,v21.4s,v25.4s + +#ifdef __AARCH64EB__ + rev x5,x5 + rev x7,x7 + rev x9,x9 + rev x11,x11 + rev x13,x13 + rev x15,x15 + rev x17,x17 + rev x20,x20 +#endif + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 + eor x5,x5,x6 + eor x7,x7,x8 + eor x9,x9,x10 + eor x11,x11,x12 + eor x13,x13,x14 + eor v0.16b,v0.16b,v24.16b + eor x15,x15,x16 + eor v1.16b,v1.16b,v25.16b + eor x17,x17,x19 + eor v2.16b,v2.16b,v26.16b + eor x20,x20,x21 + eor v3.16b,v3.16b,v27.16b + ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 + + stp x5,x7,[x0,#0] // store output + add x28,x28,#7 // increment counter + stp x9,x11,[x0,#16] + stp x13,x15,[x0,#32] + stp x17,x20,[x0,#48] + add x0,x0,#64 + st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 + + ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 + eor v4.16b,v4.16b,v24.16b + eor v5.16b,v5.16b,v25.16b + eor v6.16b,v6.16b,v26.16b + eor v7.16b,v7.16b,v27.16b + st1 
{v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 + + ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 + eor v8.16b,v8.16b,v0.16b + ldp q24,q25,[sp,#0] + eor v9.16b,v9.16b,v1.16b + ldp q26,q27,[sp,#32] + eor v10.16b,v10.16b,v2.16b + eor v11.16b,v11.16b,v3.16b + st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 + + ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 + eor v12.16b,v12.16b,v4.16b + eor v13.16b,v13.16b,v5.16b + eor v14.16b,v14.16b,v6.16b + eor v15.16b,v15.16b,v7.16b + st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 + + ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 + eor v16.16b,v16.16b,v8.16b + eor v17.16b,v17.16b,v9.16b + eor v18.16b,v18.16b,v10.16b + eor v19.16b,v19.16b,v11.16b + st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 + + shl v0.4s,v31.4s,#1 // 4 -> 8 + eor v20.16b,v20.16b,v12.16b + eor v21.16b,v21.16b,v13.16b + eor v22.16b,v22.16b,v14.16b + eor v23.16b,v23.16b,v15.16b + st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 + + add v27.4s,v27.4s,v0.4s // += 8 + add v28.4s,v28.4s,v0.4s + add v29.4s,v29.4s,v0.4s + add v30.4s,v30.4s,v0.4s + + b.hs Loop_outer_512_neon + + adds x2,x2,#512 + ushr v0.4s,v31.4s,#2 // 4 -> 1 + + ldp d8,d9,[sp,#128+0] // meet ABI requirements + ldp d10,d11,[sp,#128+16] + ldp d12,d13,[sp,#128+32] + ldp d14,d15,[sp,#128+48] + + stp q24,q31,[sp,#0] // wipe off-load area + stp q24,q31,[sp,#32] + stp q24,q31,[sp,#64] + + b.eq Ldone_512_neon + + cmp x2,#192 + sub v27.4s,v27.4s,v0.4s // -= 1 + sub v28.4s,v28.4s,v0.4s + sub v29.4s,v29.4s,v0.4s + add sp,sp,#128 + b.hs Loop_outer_neon + + eor v25.16b,v25.16b,v25.16b + eor v26.16b,v26.16b,v26.16b + eor v27.16b,v27.16b,v27.16b + eor v28.16b,v28.16b,v28.16b + eor v29.16b,v29.16b,v29.16b + eor v30.16b,v30.16b,v30.16b + b Loop_outer + +Ldone_512_neon: + ldp x19,x20,[x29,#16] + add sp,sp,#128+64 + ldp x21,x22,[x29,#32] + ldp x23,x24,[x29,#48] + ldp x25,x26,[x29,#64] + ldp x27,x28,[x29,#80] + ldp x29,x30,[sp],#96 + AARCH64_VALIDATE_LINK_REGISTER + ret + +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && 
defined(_WIN32) diff --git a/third_party/boringssl/gen/crypto/chacha-x86-apple.S b/third_party/boringssl/gen/crypto/chacha-x86-apple.S new file mode 100644 index 00000000..c03fb5bd --- /dev/null +++ b/third_party/boringssl/gen/crypto/chacha-x86-apple.S @@ -0,0 +1,957 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +.text +.globl _ChaCha20_ctr32_nohw +.private_extern _ChaCha20_ctr32_nohw +.align 4 +_ChaCha20_ctr32_nohw: +L_ChaCha20_ctr32_nohw_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 32(%esp),%esi + movl 36(%esp),%edi + subl $132,%esp + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + movl %eax,80(%esp) + movl %ebx,84(%esp) + movl %ecx,88(%esp) + movl %edx,92(%esp) + movl 16(%esi),%eax + movl 20(%esi),%ebx + movl 24(%esi),%ecx + movl 28(%esi),%edx + movl %eax,96(%esp) + movl %ebx,100(%esp) + movl %ecx,104(%esp) + movl %edx,108(%esp) + movl (%edi),%eax + movl 4(%edi),%ebx + movl 8(%edi),%ecx + movl 12(%edi),%edx + subl $1,%eax + movl %eax,112(%esp) + movl %ebx,116(%esp) + movl %ecx,120(%esp) + movl %edx,124(%esp) + jmp L000entry +.align 4,0x90 +L001outer_loop: + movl %ebx,156(%esp) + movl %eax,152(%esp) + movl %ecx,160(%esp) +L000entry: + movl $1634760805,%eax + movl $857760878,4(%esp) + movl $2036477234,8(%esp) + movl $1797285236,12(%esp) + movl 84(%esp),%ebx + movl 88(%esp),%ebp + movl 104(%esp),%ecx + movl 108(%esp),%esi + movl 116(%esp),%edx + movl 120(%esp),%edi + movl %ebx,20(%esp) + movl %ebp,24(%esp) + movl %ecx,40(%esp) + movl %esi,44(%esp) + movl %edx,52(%esp) + movl %edi,56(%esp) + movl 92(%esp),%ebx + movl 124(%esp),%edi + movl 112(%esp),%edx + movl 80(%esp),%ebp + movl 96(%esp),%ecx + movl 100(%esp),%esi + addl $1,%edx + movl %ebx,28(%esp) + movl %edi,60(%esp) + movl %edx,112(%esp) + movl $10,%ebx + jmp L002loop +.align 4,0x90 +L002loop: + 
addl %ebp,%eax + movl %ebx,128(%esp) + movl %ebp,%ebx + xorl %eax,%edx + roll $16,%edx + addl %edx,%ecx + xorl %ecx,%ebx + movl 52(%esp),%edi + roll $12,%ebx + movl 20(%esp),%ebp + addl %ebx,%eax + xorl %eax,%edx + movl %eax,(%esp) + roll $8,%edx + movl 4(%esp),%eax + addl %edx,%ecx + movl %edx,48(%esp) + xorl %ecx,%ebx + addl %ebp,%eax + roll $7,%ebx + xorl %eax,%edi + movl %ecx,32(%esp) + roll $16,%edi + movl %ebx,16(%esp) + addl %edi,%esi + movl 40(%esp),%ecx + xorl %esi,%ebp + movl 56(%esp),%edx + roll $12,%ebp + movl 24(%esp),%ebx + addl %ebp,%eax + xorl %eax,%edi + movl %eax,4(%esp) + roll $8,%edi + movl 8(%esp),%eax + addl %edi,%esi + movl %edi,52(%esp) + xorl %esi,%ebp + addl %ebx,%eax + roll $7,%ebp + xorl %eax,%edx + movl %esi,36(%esp) + roll $16,%edx + movl %ebp,20(%esp) + addl %edx,%ecx + movl 44(%esp),%esi + xorl %ecx,%ebx + movl 60(%esp),%edi + roll $12,%ebx + movl 28(%esp),%ebp + addl %ebx,%eax + xorl %eax,%edx + movl %eax,8(%esp) + roll $8,%edx + movl 12(%esp),%eax + addl %edx,%ecx + movl %edx,56(%esp) + xorl %ecx,%ebx + addl %ebp,%eax + roll $7,%ebx + xorl %eax,%edi + roll $16,%edi + movl %ebx,24(%esp) + addl %edi,%esi + xorl %esi,%ebp + roll $12,%ebp + movl 20(%esp),%ebx + addl %ebp,%eax + xorl %eax,%edi + movl %eax,12(%esp) + roll $8,%edi + movl (%esp),%eax + addl %edi,%esi + movl %edi,%edx + xorl %esi,%ebp + addl %ebx,%eax + roll $7,%ebp + xorl %eax,%edx + roll $16,%edx + movl %ebp,28(%esp) + addl %edx,%ecx + xorl %ecx,%ebx + movl 48(%esp),%edi + roll $12,%ebx + movl 24(%esp),%ebp + addl %ebx,%eax + xorl %eax,%edx + movl %eax,(%esp) + roll $8,%edx + movl 4(%esp),%eax + addl %edx,%ecx + movl %edx,60(%esp) + xorl %ecx,%ebx + addl %ebp,%eax + roll $7,%ebx + xorl %eax,%edi + movl %ecx,40(%esp) + roll $16,%edi + movl %ebx,20(%esp) + addl %edi,%esi + movl 32(%esp),%ecx + xorl %esi,%ebp + movl 52(%esp),%edx + roll $12,%ebp + movl 28(%esp),%ebx + addl %ebp,%eax + xorl %eax,%edi + movl %eax,4(%esp) + roll $8,%edi + movl 8(%esp),%eax + addl %edi,%esi + 
movl %edi,48(%esp) + xorl %esi,%ebp + addl %ebx,%eax + roll $7,%ebp + xorl %eax,%edx + movl %esi,44(%esp) + roll $16,%edx + movl %ebp,24(%esp) + addl %edx,%ecx + movl 36(%esp),%esi + xorl %ecx,%ebx + movl 56(%esp),%edi + roll $12,%ebx + movl 16(%esp),%ebp + addl %ebx,%eax + xorl %eax,%edx + movl %eax,8(%esp) + roll $8,%edx + movl 12(%esp),%eax + addl %edx,%ecx + movl %edx,52(%esp) + xorl %ecx,%ebx + addl %ebp,%eax + roll $7,%ebx + xorl %eax,%edi + roll $16,%edi + movl %ebx,28(%esp) + addl %edi,%esi + xorl %esi,%ebp + movl 48(%esp),%edx + roll $12,%ebp + movl 128(%esp),%ebx + addl %ebp,%eax + xorl %eax,%edi + movl %eax,12(%esp) + roll $8,%edi + movl (%esp),%eax + addl %edi,%esi + movl %edi,56(%esp) + xorl %esi,%ebp + roll $7,%ebp + decl %ebx + jnz L002loop + movl 160(%esp),%ebx + addl $1634760805,%eax + addl 80(%esp),%ebp + addl 96(%esp),%ecx + addl 100(%esp),%esi + cmpl $64,%ebx + jb L003tail + movl 156(%esp),%ebx + addl 112(%esp),%edx + addl 120(%esp),%edi + xorl (%ebx),%eax + xorl 16(%ebx),%ebp + movl %eax,(%esp) + movl 152(%esp),%eax + xorl 32(%ebx),%ecx + xorl 36(%ebx),%esi + xorl 48(%ebx),%edx + xorl 56(%ebx),%edi + movl %ebp,16(%eax) + movl %ecx,32(%eax) + movl %esi,36(%eax) + movl %edx,48(%eax) + movl %edi,56(%eax) + movl 4(%esp),%ebp + movl 8(%esp),%ecx + movl 12(%esp),%esi + movl 20(%esp),%edx + movl 24(%esp),%edi + addl $857760878,%ebp + addl $2036477234,%ecx + addl $1797285236,%esi + addl 84(%esp),%edx + addl 88(%esp),%edi + xorl 4(%ebx),%ebp + xorl 8(%ebx),%ecx + xorl 12(%ebx),%esi + xorl 20(%ebx),%edx + xorl 24(%ebx),%edi + movl %ebp,4(%eax) + movl %ecx,8(%eax) + movl %esi,12(%eax) + movl %edx,20(%eax) + movl %edi,24(%eax) + movl 28(%esp),%ebp + movl 40(%esp),%ecx + movl 44(%esp),%esi + movl 52(%esp),%edx + movl 60(%esp),%edi + addl 92(%esp),%ebp + addl 104(%esp),%ecx + addl 108(%esp),%esi + addl 116(%esp),%edx + addl 124(%esp),%edi + xorl 28(%ebx),%ebp + xorl 40(%ebx),%ecx + xorl 44(%ebx),%esi + xorl 52(%ebx),%edx + xorl 60(%ebx),%edi + leal 
64(%ebx),%ebx + movl %ebp,28(%eax) + movl (%esp),%ebp + movl %ecx,40(%eax) + movl 160(%esp),%ecx + movl %esi,44(%eax) + movl %edx,52(%eax) + movl %edi,60(%eax) + movl %ebp,(%eax) + leal 64(%eax),%eax + subl $64,%ecx + jnz L001outer_loop + jmp L004done +L003tail: + addl 112(%esp),%edx + addl 120(%esp),%edi + movl %eax,(%esp) + movl %ebp,16(%esp) + movl %ecx,32(%esp) + movl %esi,36(%esp) + movl %edx,48(%esp) + movl %edi,56(%esp) + movl 4(%esp),%ebp + movl 8(%esp),%ecx + movl 12(%esp),%esi + movl 20(%esp),%edx + movl 24(%esp),%edi + addl $857760878,%ebp + addl $2036477234,%ecx + addl $1797285236,%esi + addl 84(%esp),%edx + addl 88(%esp),%edi + movl %ebp,4(%esp) + movl %ecx,8(%esp) + movl %esi,12(%esp) + movl %edx,20(%esp) + movl %edi,24(%esp) + movl 28(%esp),%ebp + movl 40(%esp),%ecx + movl 44(%esp),%esi + movl 52(%esp),%edx + movl 60(%esp),%edi + addl 92(%esp),%ebp + addl 104(%esp),%ecx + addl 108(%esp),%esi + addl 116(%esp),%edx + addl 124(%esp),%edi + movl %ebp,28(%esp) + movl 156(%esp),%ebp + movl %ecx,40(%esp) + movl 152(%esp),%ecx + movl %esi,44(%esp) + xorl %esi,%esi + movl %edx,52(%esp) + movl %edi,60(%esp) + xorl %eax,%eax + xorl %edx,%edx +L005tail_loop: + movb (%esi,%ebp,1),%al + movb (%esp,%esi,1),%dl + leal 1(%esi),%esi + xorb %dl,%al + movb %al,-1(%ecx,%esi,1) + decl %ebx + jnz L005tail_loop +L004done: + addl $132,%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _ChaCha20_ctr32_ssse3 +.private_extern _ChaCha20_ctr32_ssse3 +.align 4 +_ChaCha20_ctr32_ssse3: +L_ChaCha20_ctr32_ssse3_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + call Lpic_point +Lpic_point: + popl %eax + movl 20(%esp),%edi + movl 24(%esp),%esi + movl 28(%esp),%ecx + movl 32(%esp),%edx + movl 36(%esp),%ebx + movl %esp,%ebp + subl $524,%esp + andl $-64,%esp + movl %ebp,512(%esp) + leal Lssse3_data-Lpic_point(%eax),%eax + movdqu (%ebx),%xmm3 + cmpl $256,%ecx + jb L0061x + movl %edx,516(%esp) + movl %ebx,520(%esp) + subl $256,%ecx + leal 384(%esp),%ebp + movdqu 
(%edx),%xmm7 + pshufd $0,%xmm3,%xmm0 + pshufd $85,%xmm3,%xmm1 + pshufd $170,%xmm3,%xmm2 + pshufd $255,%xmm3,%xmm3 + paddd 48(%eax),%xmm0 + pshufd $0,%xmm7,%xmm4 + pshufd $85,%xmm7,%xmm5 + psubd 64(%eax),%xmm0 + pshufd $170,%xmm7,%xmm6 + pshufd $255,%xmm7,%xmm7 + movdqa %xmm0,64(%ebp) + movdqa %xmm1,80(%ebp) + movdqa %xmm2,96(%ebp) + movdqa %xmm3,112(%ebp) + movdqu 16(%edx),%xmm3 + movdqa %xmm4,-64(%ebp) + movdqa %xmm5,-48(%ebp) + movdqa %xmm6,-32(%ebp) + movdqa %xmm7,-16(%ebp) + movdqa 32(%eax),%xmm7 + leal 128(%esp),%ebx + pshufd $0,%xmm3,%xmm0 + pshufd $85,%xmm3,%xmm1 + pshufd $170,%xmm3,%xmm2 + pshufd $255,%xmm3,%xmm3 + pshufd $0,%xmm7,%xmm4 + pshufd $85,%xmm7,%xmm5 + pshufd $170,%xmm7,%xmm6 + pshufd $255,%xmm7,%xmm7 + movdqa %xmm0,(%ebp) + movdqa %xmm1,16(%ebp) + movdqa %xmm2,32(%ebp) + movdqa %xmm3,48(%ebp) + movdqa %xmm4,-128(%ebp) + movdqa %xmm5,-112(%ebp) + movdqa %xmm6,-96(%ebp) + movdqa %xmm7,-80(%ebp) + leal 128(%esi),%esi + leal 128(%edi),%edi + jmp L007outer_loop +.align 4,0x90 +L007outer_loop: + movdqa -112(%ebp),%xmm1 + movdqa -96(%ebp),%xmm2 + movdqa -80(%ebp),%xmm3 + movdqa -48(%ebp),%xmm5 + movdqa -32(%ebp),%xmm6 + movdqa -16(%ebp),%xmm7 + movdqa %xmm1,-112(%ebx) + movdqa %xmm2,-96(%ebx) + movdqa %xmm3,-80(%ebx) + movdqa %xmm5,-48(%ebx) + movdqa %xmm6,-32(%ebx) + movdqa %xmm7,-16(%ebx) + movdqa 32(%ebp),%xmm2 + movdqa 48(%ebp),%xmm3 + movdqa 64(%ebp),%xmm4 + movdqa 80(%ebp),%xmm5 + movdqa 96(%ebp),%xmm6 + movdqa 112(%ebp),%xmm7 + paddd 64(%eax),%xmm4 + movdqa %xmm2,32(%ebx) + movdqa %xmm3,48(%ebx) + movdqa %xmm4,64(%ebx) + movdqa %xmm5,80(%ebx) + movdqa %xmm6,96(%ebx) + movdqa %xmm7,112(%ebx) + movdqa %xmm4,64(%ebp) + movdqa -128(%ebp),%xmm0 + movdqa %xmm4,%xmm6 + movdqa -64(%ebp),%xmm3 + movdqa (%ebp),%xmm4 + movdqa 16(%ebp),%xmm5 + movl $10,%edx + nop +.align 4,0x90 +L008loop: + paddd %xmm3,%xmm0 + movdqa %xmm3,%xmm2 + pxor %xmm0,%xmm6 + pshufb (%eax),%xmm6 + paddd %xmm6,%xmm4 + pxor %xmm4,%xmm2 + movdqa -48(%ebx),%xmm3 + movdqa %xmm2,%xmm1 + 
pslld $12,%xmm2 + psrld $20,%xmm1 + por %xmm1,%xmm2 + movdqa -112(%ebx),%xmm1 + paddd %xmm2,%xmm0 + movdqa 80(%ebx),%xmm7 + pxor %xmm0,%xmm6 + movdqa %xmm0,-128(%ebx) + pshufb 16(%eax),%xmm6 + paddd %xmm6,%xmm4 + movdqa %xmm6,64(%ebx) + pxor %xmm4,%xmm2 + paddd %xmm3,%xmm1 + movdqa %xmm2,%xmm0 + pslld $7,%xmm2 + psrld $25,%xmm0 + pxor %xmm1,%xmm7 + por %xmm0,%xmm2 + movdqa %xmm4,(%ebx) + pshufb (%eax),%xmm7 + movdqa %xmm2,-64(%ebx) + paddd %xmm7,%xmm5 + movdqa 32(%ebx),%xmm4 + pxor %xmm5,%xmm3 + movdqa -32(%ebx),%xmm2 + movdqa %xmm3,%xmm0 + pslld $12,%xmm3 + psrld $20,%xmm0 + por %xmm0,%xmm3 + movdqa -96(%ebx),%xmm0 + paddd %xmm3,%xmm1 + movdqa 96(%ebx),%xmm6 + pxor %xmm1,%xmm7 + movdqa %xmm1,-112(%ebx) + pshufb 16(%eax),%xmm7 + paddd %xmm7,%xmm5 + movdqa %xmm7,80(%ebx) + pxor %xmm5,%xmm3 + paddd %xmm2,%xmm0 + movdqa %xmm3,%xmm1 + pslld $7,%xmm3 + psrld $25,%xmm1 + pxor %xmm0,%xmm6 + por %xmm1,%xmm3 + movdqa %xmm5,16(%ebx) + pshufb (%eax),%xmm6 + movdqa %xmm3,-48(%ebx) + paddd %xmm6,%xmm4 + movdqa 48(%ebx),%xmm5 + pxor %xmm4,%xmm2 + movdqa -16(%ebx),%xmm3 + movdqa %xmm2,%xmm1 + pslld $12,%xmm2 + psrld $20,%xmm1 + por %xmm1,%xmm2 + movdqa -80(%ebx),%xmm1 + paddd %xmm2,%xmm0 + movdqa 112(%ebx),%xmm7 + pxor %xmm0,%xmm6 + movdqa %xmm0,-96(%ebx) + pshufb 16(%eax),%xmm6 + paddd %xmm6,%xmm4 + movdqa %xmm6,96(%ebx) + pxor %xmm4,%xmm2 + paddd %xmm3,%xmm1 + movdqa %xmm2,%xmm0 + pslld $7,%xmm2 + psrld $25,%xmm0 + pxor %xmm1,%xmm7 + por %xmm0,%xmm2 + pshufb (%eax),%xmm7 + movdqa %xmm2,-32(%ebx) + paddd %xmm7,%xmm5 + pxor %xmm5,%xmm3 + movdqa -48(%ebx),%xmm2 + movdqa %xmm3,%xmm0 + pslld $12,%xmm3 + psrld $20,%xmm0 + por %xmm0,%xmm3 + movdqa -128(%ebx),%xmm0 + paddd %xmm3,%xmm1 + pxor %xmm1,%xmm7 + movdqa %xmm1,-80(%ebx) + pshufb 16(%eax),%xmm7 + paddd %xmm7,%xmm5 + movdqa %xmm7,%xmm6 + pxor %xmm5,%xmm3 + paddd %xmm2,%xmm0 + movdqa %xmm3,%xmm1 + pslld $7,%xmm3 + psrld $25,%xmm1 + pxor %xmm0,%xmm6 + por %xmm1,%xmm3 + pshufb (%eax),%xmm6 + movdqa %xmm3,-16(%ebx) + paddd 
%xmm6,%xmm4 + pxor %xmm4,%xmm2 + movdqa -32(%ebx),%xmm3 + movdqa %xmm2,%xmm1 + pslld $12,%xmm2 + psrld $20,%xmm1 + por %xmm1,%xmm2 + movdqa -112(%ebx),%xmm1 + paddd %xmm2,%xmm0 + movdqa 64(%ebx),%xmm7 + pxor %xmm0,%xmm6 + movdqa %xmm0,-128(%ebx) + pshufb 16(%eax),%xmm6 + paddd %xmm6,%xmm4 + movdqa %xmm6,112(%ebx) + pxor %xmm4,%xmm2 + paddd %xmm3,%xmm1 + movdqa %xmm2,%xmm0 + pslld $7,%xmm2 + psrld $25,%xmm0 + pxor %xmm1,%xmm7 + por %xmm0,%xmm2 + movdqa %xmm4,32(%ebx) + pshufb (%eax),%xmm7 + movdqa %xmm2,-48(%ebx) + paddd %xmm7,%xmm5 + movdqa (%ebx),%xmm4 + pxor %xmm5,%xmm3 + movdqa -16(%ebx),%xmm2 + movdqa %xmm3,%xmm0 + pslld $12,%xmm3 + psrld $20,%xmm0 + por %xmm0,%xmm3 + movdqa -96(%ebx),%xmm0 + paddd %xmm3,%xmm1 + movdqa 80(%ebx),%xmm6 + pxor %xmm1,%xmm7 + movdqa %xmm1,-112(%ebx) + pshufb 16(%eax),%xmm7 + paddd %xmm7,%xmm5 + movdqa %xmm7,64(%ebx) + pxor %xmm5,%xmm3 + paddd %xmm2,%xmm0 + movdqa %xmm3,%xmm1 + pslld $7,%xmm3 + psrld $25,%xmm1 + pxor %xmm0,%xmm6 + por %xmm1,%xmm3 + movdqa %xmm5,48(%ebx) + pshufb (%eax),%xmm6 + movdqa %xmm3,-32(%ebx) + paddd %xmm6,%xmm4 + movdqa 16(%ebx),%xmm5 + pxor %xmm4,%xmm2 + movdqa -64(%ebx),%xmm3 + movdqa %xmm2,%xmm1 + pslld $12,%xmm2 + psrld $20,%xmm1 + por %xmm1,%xmm2 + movdqa -80(%ebx),%xmm1 + paddd %xmm2,%xmm0 + movdqa 96(%ebx),%xmm7 + pxor %xmm0,%xmm6 + movdqa %xmm0,-96(%ebx) + pshufb 16(%eax),%xmm6 + paddd %xmm6,%xmm4 + movdqa %xmm6,80(%ebx) + pxor %xmm4,%xmm2 + paddd %xmm3,%xmm1 + movdqa %xmm2,%xmm0 + pslld $7,%xmm2 + psrld $25,%xmm0 + pxor %xmm1,%xmm7 + por %xmm0,%xmm2 + pshufb (%eax),%xmm7 + movdqa %xmm2,-16(%ebx) + paddd %xmm7,%xmm5 + pxor %xmm5,%xmm3 + movdqa %xmm3,%xmm0 + pslld $12,%xmm3 + psrld $20,%xmm0 + por %xmm0,%xmm3 + movdqa -128(%ebx),%xmm0 + paddd %xmm3,%xmm1 + movdqa 64(%ebx),%xmm6 + pxor %xmm1,%xmm7 + movdqa %xmm1,-80(%ebx) + pshufb 16(%eax),%xmm7 + paddd %xmm7,%xmm5 + movdqa %xmm7,96(%ebx) + pxor %xmm5,%xmm3 + movdqa %xmm3,%xmm1 + pslld $7,%xmm3 + psrld $25,%xmm1 + por %xmm1,%xmm3 + decl %edx + jnz 
L008loop + movdqa %xmm3,-64(%ebx) + movdqa %xmm4,(%ebx) + movdqa %xmm5,16(%ebx) + movdqa %xmm6,64(%ebx) + movdqa %xmm7,96(%ebx) + movdqa -112(%ebx),%xmm1 + movdqa -96(%ebx),%xmm2 + movdqa -80(%ebx),%xmm3 + paddd -128(%ebp),%xmm0 + paddd -112(%ebp),%xmm1 + paddd -96(%ebp),%xmm2 + paddd -80(%ebp),%xmm3 + movdqa %xmm0,%xmm6 + punpckldq %xmm1,%xmm0 + movdqa %xmm2,%xmm7 + punpckldq %xmm3,%xmm2 + punpckhdq %xmm1,%xmm6 + punpckhdq %xmm3,%xmm7 + movdqa %xmm0,%xmm1 + punpcklqdq %xmm2,%xmm0 + movdqa %xmm6,%xmm3 + punpcklqdq %xmm7,%xmm6 + punpckhqdq %xmm2,%xmm1 + punpckhqdq %xmm7,%xmm3 + movdqu -128(%esi),%xmm4 + movdqu -64(%esi),%xmm5 + movdqu (%esi),%xmm2 + movdqu 64(%esi),%xmm7 + leal 16(%esi),%esi + pxor %xmm0,%xmm4 + movdqa -64(%ebx),%xmm0 + pxor %xmm1,%xmm5 + movdqa -48(%ebx),%xmm1 + pxor %xmm2,%xmm6 + movdqa -32(%ebx),%xmm2 + pxor %xmm3,%xmm7 + movdqa -16(%ebx),%xmm3 + movdqu %xmm4,-128(%edi) + movdqu %xmm5,-64(%edi) + movdqu %xmm6,(%edi) + movdqu %xmm7,64(%edi) + leal 16(%edi),%edi + paddd -64(%ebp),%xmm0 + paddd -48(%ebp),%xmm1 + paddd -32(%ebp),%xmm2 + paddd -16(%ebp),%xmm3 + movdqa %xmm0,%xmm6 + punpckldq %xmm1,%xmm0 + movdqa %xmm2,%xmm7 + punpckldq %xmm3,%xmm2 + punpckhdq %xmm1,%xmm6 + punpckhdq %xmm3,%xmm7 + movdqa %xmm0,%xmm1 + punpcklqdq %xmm2,%xmm0 + movdqa %xmm6,%xmm3 + punpcklqdq %xmm7,%xmm6 + punpckhqdq %xmm2,%xmm1 + punpckhqdq %xmm7,%xmm3 + movdqu -128(%esi),%xmm4 + movdqu -64(%esi),%xmm5 + movdqu (%esi),%xmm2 + movdqu 64(%esi),%xmm7 + leal 16(%esi),%esi + pxor %xmm0,%xmm4 + movdqa (%ebx),%xmm0 + pxor %xmm1,%xmm5 + movdqa 16(%ebx),%xmm1 + pxor %xmm2,%xmm6 + movdqa 32(%ebx),%xmm2 + pxor %xmm3,%xmm7 + movdqa 48(%ebx),%xmm3 + movdqu %xmm4,-128(%edi) + movdqu %xmm5,-64(%edi) + movdqu %xmm6,(%edi) + movdqu %xmm7,64(%edi) + leal 16(%edi),%edi + paddd (%ebp),%xmm0 + paddd 16(%ebp),%xmm1 + paddd 32(%ebp),%xmm2 + paddd 48(%ebp),%xmm3 + movdqa %xmm0,%xmm6 + punpckldq %xmm1,%xmm0 + movdqa %xmm2,%xmm7 + punpckldq %xmm3,%xmm2 + punpckhdq %xmm1,%xmm6 + punpckhdq 
%xmm3,%xmm7 + movdqa %xmm0,%xmm1 + punpcklqdq %xmm2,%xmm0 + movdqa %xmm6,%xmm3 + punpcklqdq %xmm7,%xmm6 + punpckhqdq %xmm2,%xmm1 + punpckhqdq %xmm7,%xmm3 + movdqu -128(%esi),%xmm4 + movdqu -64(%esi),%xmm5 + movdqu (%esi),%xmm2 + movdqu 64(%esi),%xmm7 + leal 16(%esi),%esi + pxor %xmm0,%xmm4 + movdqa 64(%ebx),%xmm0 + pxor %xmm1,%xmm5 + movdqa 80(%ebx),%xmm1 + pxor %xmm2,%xmm6 + movdqa 96(%ebx),%xmm2 + pxor %xmm3,%xmm7 + movdqa 112(%ebx),%xmm3 + movdqu %xmm4,-128(%edi) + movdqu %xmm5,-64(%edi) + movdqu %xmm6,(%edi) + movdqu %xmm7,64(%edi) + leal 16(%edi),%edi + paddd 64(%ebp),%xmm0 + paddd 80(%ebp),%xmm1 + paddd 96(%ebp),%xmm2 + paddd 112(%ebp),%xmm3 + movdqa %xmm0,%xmm6 + punpckldq %xmm1,%xmm0 + movdqa %xmm2,%xmm7 + punpckldq %xmm3,%xmm2 + punpckhdq %xmm1,%xmm6 + punpckhdq %xmm3,%xmm7 + movdqa %xmm0,%xmm1 + punpcklqdq %xmm2,%xmm0 + movdqa %xmm6,%xmm3 + punpcklqdq %xmm7,%xmm6 + punpckhqdq %xmm2,%xmm1 + punpckhqdq %xmm7,%xmm3 + movdqu -128(%esi),%xmm4 + movdqu -64(%esi),%xmm5 + movdqu (%esi),%xmm2 + movdqu 64(%esi),%xmm7 + leal 208(%esi),%esi + pxor %xmm0,%xmm4 + pxor %xmm1,%xmm5 + pxor %xmm2,%xmm6 + pxor %xmm3,%xmm7 + movdqu %xmm4,-128(%edi) + movdqu %xmm5,-64(%edi) + movdqu %xmm6,(%edi) + movdqu %xmm7,64(%edi) + leal 208(%edi),%edi + subl $256,%ecx + jnc L007outer_loop + addl $256,%ecx + jz L009done + movl 520(%esp),%ebx + leal -128(%esi),%esi + movl 516(%esp),%edx + leal -128(%edi),%edi + movd 64(%ebp),%xmm2 + movdqu (%ebx),%xmm3 + paddd 96(%eax),%xmm2 + pand 112(%eax),%xmm3 + por %xmm2,%xmm3 +L0061x: + movdqa 32(%eax),%xmm0 + movdqu (%edx),%xmm1 + movdqu 16(%edx),%xmm2 + movdqa (%eax),%xmm6 + movdqa 16(%eax),%xmm7 + movl %ebp,48(%esp) + movdqa %xmm0,(%esp) + movdqa %xmm1,16(%esp) + movdqa %xmm2,32(%esp) + movdqa %xmm3,48(%esp) + movl $10,%edx + jmp L010loop1x +.align 4,0x90 +L011outer1x: + movdqa 80(%eax),%xmm3 + movdqa (%esp),%xmm0 + movdqa 16(%esp),%xmm1 + movdqa 32(%esp),%xmm2 + paddd 48(%esp),%xmm3 + movl $10,%edx + movdqa %xmm3,48(%esp) + jmp L010loop1x 
+.align 4,0x90 +L010loop1x: + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm6,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $20,%xmm1 + pslld $12,%xmm4 + por %xmm4,%xmm1 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm7,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $25,%xmm1 + pslld $7,%xmm4 + por %xmm4,%xmm1 + pshufd $78,%xmm2,%xmm2 + pshufd $57,%xmm1,%xmm1 + pshufd $147,%xmm3,%xmm3 + nop + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm6,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $20,%xmm1 + pslld $12,%xmm4 + por %xmm4,%xmm1 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm7,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $25,%xmm1 + pslld $7,%xmm4 + por %xmm4,%xmm1 + pshufd $78,%xmm2,%xmm2 + pshufd $147,%xmm1,%xmm1 + pshufd $57,%xmm3,%xmm3 + decl %edx + jnz L010loop1x + paddd (%esp),%xmm0 + paddd 16(%esp),%xmm1 + paddd 32(%esp),%xmm2 + paddd 48(%esp),%xmm3 + cmpl $64,%ecx + jb L012tail + movdqu (%esi),%xmm4 + movdqu 16(%esi),%xmm5 + pxor %xmm4,%xmm0 + movdqu 32(%esi),%xmm4 + pxor %xmm5,%xmm1 + movdqu 48(%esi),%xmm5 + pxor %xmm4,%xmm2 + pxor %xmm5,%xmm3 + leal 64(%esi),%esi + movdqu %xmm0,(%edi) + movdqu %xmm1,16(%edi) + movdqu %xmm2,32(%edi) + movdqu %xmm3,48(%edi) + leal 64(%edi),%edi + subl $64,%ecx + jnz L011outer1x + jmp L009done +L012tail: + movdqa %xmm0,(%esp) + movdqa %xmm1,16(%esp) + movdqa %xmm2,32(%esp) + movdqa %xmm3,48(%esp) + xorl %eax,%eax + xorl %edx,%edx + xorl %ebp,%ebp +L013tail_loop: + movb (%esp,%ebp,1),%al + movb (%esi,%ebp,1),%dl + leal 1(%ebp),%ebp + xorb %dl,%al + movb %al,-1(%edi,%ebp,1) + decl %ecx + jnz L013tail_loop +L009done: + movl 512(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.align 6,0x90 +Lssse3_data: +.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 +.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 +.long 1634760805,857760878,2036477234,1797285236 +.long 0,1,2,3 +.long 4,4,4,4 +.long 
1,0,0,0 +.long 4,0,0,0 +.long 0,-1,-1,-1 +.align 6,0x90 +.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54 +.byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32 +.byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 +.byte 114,103,62,0 +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) diff --git a/third_party/boringssl/gen/crypto/chacha-x86-linux.S b/third_party/boringssl/gen/crypto/chacha-x86-linux.S new file mode 100644 index 00000000..9ad20a07 --- /dev/null +++ b/third_party/boringssl/gen/crypto/chacha-x86-linux.S @@ -0,0 +1,961 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) +.text +.globl ChaCha20_ctr32_nohw +.hidden ChaCha20_ctr32_nohw +.type ChaCha20_ctr32_nohw,@function +.align 16 +ChaCha20_ctr32_nohw: +.L_ChaCha20_ctr32_nohw_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 32(%esp),%esi + movl 36(%esp),%edi + subl $132,%esp + movl (%esi),%eax + movl 4(%esi),%ebx + movl 8(%esi),%ecx + movl 12(%esi),%edx + movl %eax,80(%esp) + movl %ebx,84(%esp) + movl %ecx,88(%esp) + movl %edx,92(%esp) + movl 16(%esi),%eax + movl 20(%esi),%ebx + movl 24(%esi),%ecx + movl 28(%esi),%edx + movl %eax,96(%esp) + movl %ebx,100(%esp) + movl %ecx,104(%esp) + movl %edx,108(%esp) + movl (%edi),%eax + movl 4(%edi),%ebx + movl 8(%edi),%ecx + movl 12(%edi),%edx + subl $1,%eax + movl %eax,112(%esp) + movl %ebx,116(%esp) + movl %ecx,120(%esp) + movl %edx,124(%esp) + jmp .L000entry +.align 16 +.L001outer_loop: + movl %ebx,156(%esp) + movl %eax,152(%esp) + movl %ecx,160(%esp) +.L000entry: + movl $1634760805,%eax + movl $857760878,4(%esp) + movl $2036477234,8(%esp) + movl $1797285236,12(%esp) + movl 84(%esp),%ebx + movl 88(%esp),%ebp + movl 104(%esp),%ecx + movl 108(%esp),%esi + movl 116(%esp),%edx + movl 120(%esp),%edi + movl %ebx,20(%esp) + movl %ebp,24(%esp) + 
movl %ecx,40(%esp) + movl %esi,44(%esp) + movl %edx,52(%esp) + movl %edi,56(%esp) + movl 92(%esp),%ebx + movl 124(%esp),%edi + movl 112(%esp),%edx + movl 80(%esp),%ebp + movl 96(%esp),%ecx + movl 100(%esp),%esi + addl $1,%edx + movl %ebx,28(%esp) + movl %edi,60(%esp) + movl %edx,112(%esp) + movl $10,%ebx + jmp .L002loop +.align 16 +.L002loop: + addl %ebp,%eax + movl %ebx,128(%esp) + movl %ebp,%ebx + xorl %eax,%edx + roll $16,%edx + addl %edx,%ecx + xorl %ecx,%ebx + movl 52(%esp),%edi + roll $12,%ebx + movl 20(%esp),%ebp + addl %ebx,%eax + xorl %eax,%edx + movl %eax,(%esp) + roll $8,%edx + movl 4(%esp),%eax + addl %edx,%ecx + movl %edx,48(%esp) + xorl %ecx,%ebx + addl %ebp,%eax + roll $7,%ebx + xorl %eax,%edi + movl %ecx,32(%esp) + roll $16,%edi + movl %ebx,16(%esp) + addl %edi,%esi + movl 40(%esp),%ecx + xorl %esi,%ebp + movl 56(%esp),%edx + roll $12,%ebp + movl 24(%esp),%ebx + addl %ebp,%eax + xorl %eax,%edi + movl %eax,4(%esp) + roll $8,%edi + movl 8(%esp),%eax + addl %edi,%esi + movl %edi,52(%esp) + xorl %esi,%ebp + addl %ebx,%eax + roll $7,%ebp + xorl %eax,%edx + movl %esi,36(%esp) + roll $16,%edx + movl %ebp,20(%esp) + addl %edx,%ecx + movl 44(%esp),%esi + xorl %ecx,%ebx + movl 60(%esp),%edi + roll $12,%ebx + movl 28(%esp),%ebp + addl %ebx,%eax + xorl %eax,%edx + movl %eax,8(%esp) + roll $8,%edx + movl 12(%esp),%eax + addl %edx,%ecx + movl %edx,56(%esp) + xorl %ecx,%ebx + addl %ebp,%eax + roll $7,%ebx + xorl %eax,%edi + roll $16,%edi + movl %ebx,24(%esp) + addl %edi,%esi + xorl %esi,%ebp + roll $12,%ebp + movl 20(%esp),%ebx + addl %ebp,%eax + xorl %eax,%edi + movl %eax,12(%esp) + roll $8,%edi + movl (%esp),%eax + addl %edi,%esi + movl %edi,%edx + xorl %esi,%ebp + addl %ebx,%eax + roll $7,%ebp + xorl %eax,%edx + roll $16,%edx + movl %ebp,28(%esp) + addl %edx,%ecx + xorl %ecx,%ebx + movl 48(%esp),%edi + roll $12,%ebx + movl 24(%esp),%ebp + addl %ebx,%eax + xorl %eax,%edx + movl %eax,(%esp) + roll $8,%edx + movl 4(%esp),%eax + addl %edx,%ecx + movl %edx,60(%esp) 
+ xorl %ecx,%ebx + addl %ebp,%eax + roll $7,%ebx + xorl %eax,%edi + movl %ecx,40(%esp) + roll $16,%edi + movl %ebx,20(%esp) + addl %edi,%esi + movl 32(%esp),%ecx + xorl %esi,%ebp + movl 52(%esp),%edx + roll $12,%ebp + movl 28(%esp),%ebx + addl %ebp,%eax + xorl %eax,%edi + movl %eax,4(%esp) + roll $8,%edi + movl 8(%esp),%eax + addl %edi,%esi + movl %edi,48(%esp) + xorl %esi,%ebp + addl %ebx,%eax + roll $7,%ebp + xorl %eax,%edx + movl %esi,44(%esp) + roll $16,%edx + movl %ebp,24(%esp) + addl %edx,%ecx + movl 36(%esp),%esi + xorl %ecx,%ebx + movl 56(%esp),%edi + roll $12,%ebx + movl 16(%esp),%ebp + addl %ebx,%eax + xorl %eax,%edx + movl %eax,8(%esp) + roll $8,%edx + movl 12(%esp),%eax + addl %edx,%ecx + movl %edx,52(%esp) + xorl %ecx,%ebx + addl %ebp,%eax + roll $7,%ebx + xorl %eax,%edi + roll $16,%edi + movl %ebx,28(%esp) + addl %edi,%esi + xorl %esi,%ebp + movl 48(%esp),%edx + roll $12,%ebp + movl 128(%esp),%ebx + addl %ebp,%eax + xorl %eax,%edi + movl %eax,12(%esp) + roll $8,%edi + movl (%esp),%eax + addl %edi,%esi + movl %edi,56(%esp) + xorl %esi,%ebp + roll $7,%ebp + decl %ebx + jnz .L002loop + movl 160(%esp),%ebx + addl $1634760805,%eax + addl 80(%esp),%ebp + addl 96(%esp),%ecx + addl 100(%esp),%esi + cmpl $64,%ebx + jb .L003tail + movl 156(%esp),%ebx + addl 112(%esp),%edx + addl 120(%esp),%edi + xorl (%ebx),%eax + xorl 16(%ebx),%ebp + movl %eax,(%esp) + movl 152(%esp),%eax + xorl 32(%ebx),%ecx + xorl 36(%ebx),%esi + xorl 48(%ebx),%edx + xorl 56(%ebx),%edi + movl %ebp,16(%eax) + movl %ecx,32(%eax) + movl %esi,36(%eax) + movl %edx,48(%eax) + movl %edi,56(%eax) + movl 4(%esp),%ebp + movl 8(%esp),%ecx + movl 12(%esp),%esi + movl 20(%esp),%edx + movl 24(%esp),%edi + addl $857760878,%ebp + addl $2036477234,%ecx + addl $1797285236,%esi + addl 84(%esp),%edx + addl 88(%esp),%edi + xorl 4(%ebx),%ebp + xorl 8(%ebx),%ecx + xorl 12(%ebx),%esi + xorl 20(%ebx),%edx + xorl 24(%ebx),%edi + movl %ebp,4(%eax) + movl %ecx,8(%eax) + movl %esi,12(%eax) + movl %edx,20(%eax) + movl 
%edi,24(%eax) + movl 28(%esp),%ebp + movl 40(%esp),%ecx + movl 44(%esp),%esi + movl 52(%esp),%edx + movl 60(%esp),%edi + addl 92(%esp),%ebp + addl 104(%esp),%ecx + addl 108(%esp),%esi + addl 116(%esp),%edx + addl 124(%esp),%edi + xorl 28(%ebx),%ebp + xorl 40(%ebx),%ecx + xorl 44(%ebx),%esi + xorl 52(%ebx),%edx + xorl 60(%ebx),%edi + leal 64(%ebx),%ebx + movl %ebp,28(%eax) + movl (%esp),%ebp + movl %ecx,40(%eax) + movl 160(%esp),%ecx + movl %esi,44(%eax) + movl %edx,52(%eax) + movl %edi,60(%eax) + movl %ebp,(%eax) + leal 64(%eax),%eax + subl $64,%ecx + jnz .L001outer_loop + jmp .L004done +.L003tail: + addl 112(%esp),%edx + addl 120(%esp),%edi + movl %eax,(%esp) + movl %ebp,16(%esp) + movl %ecx,32(%esp) + movl %esi,36(%esp) + movl %edx,48(%esp) + movl %edi,56(%esp) + movl 4(%esp),%ebp + movl 8(%esp),%ecx + movl 12(%esp),%esi + movl 20(%esp),%edx + movl 24(%esp),%edi + addl $857760878,%ebp + addl $2036477234,%ecx + addl $1797285236,%esi + addl 84(%esp),%edx + addl 88(%esp),%edi + movl %ebp,4(%esp) + movl %ecx,8(%esp) + movl %esi,12(%esp) + movl %edx,20(%esp) + movl %edi,24(%esp) + movl 28(%esp),%ebp + movl 40(%esp),%ecx + movl 44(%esp),%esi + movl 52(%esp),%edx + movl 60(%esp),%edi + addl 92(%esp),%ebp + addl 104(%esp),%ecx + addl 108(%esp),%esi + addl 116(%esp),%edx + addl 124(%esp),%edi + movl %ebp,28(%esp) + movl 156(%esp),%ebp + movl %ecx,40(%esp) + movl 152(%esp),%ecx + movl %esi,44(%esp) + xorl %esi,%esi + movl %edx,52(%esp) + movl %edi,60(%esp) + xorl %eax,%eax + xorl %edx,%edx +.L005tail_loop: + movb (%esi,%ebp,1),%al + movb (%esp,%esi,1),%dl + leal 1(%esi),%esi + xorb %dl,%al + movb %al,-1(%ecx,%esi,1) + decl %ebx + jnz .L005tail_loop +.L004done: + addl $132,%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size ChaCha20_ctr32_nohw,.-.L_ChaCha20_ctr32_nohw_begin +.globl ChaCha20_ctr32_ssse3 +.hidden ChaCha20_ctr32_ssse3 +.type ChaCha20_ctr32_ssse3,@function +.align 16 +ChaCha20_ctr32_ssse3: +.L_ChaCha20_ctr32_ssse3_begin: + pushl %ebp + pushl %ebx 
+ pushl %esi + pushl %edi + call .Lpic_point +.Lpic_point: + popl %eax + movl 20(%esp),%edi + movl 24(%esp),%esi + movl 28(%esp),%ecx + movl 32(%esp),%edx + movl 36(%esp),%ebx + movl %esp,%ebp + subl $524,%esp + andl $-64,%esp + movl %ebp,512(%esp) + leal .Lssse3_data-.Lpic_point(%eax),%eax + movdqu (%ebx),%xmm3 + cmpl $256,%ecx + jb .L0061x + movl %edx,516(%esp) + movl %ebx,520(%esp) + subl $256,%ecx + leal 384(%esp),%ebp + movdqu (%edx),%xmm7 + pshufd $0,%xmm3,%xmm0 + pshufd $85,%xmm3,%xmm1 + pshufd $170,%xmm3,%xmm2 + pshufd $255,%xmm3,%xmm3 + paddd 48(%eax),%xmm0 + pshufd $0,%xmm7,%xmm4 + pshufd $85,%xmm7,%xmm5 + psubd 64(%eax),%xmm0 + pshufd $170,%xmm7,%xmm6 + pshufd $255,%xmm7,%xmm7 + movdqa %xmm0,64(%ebp) + movdqa %xmm1,80(%ebp) + movdqa %xmm2,96(%ebp) + movdqa %xmm3,112(%ebp) + movdqu 16(%edx),%xmm3 + movdqa %xmm4,-64(%ebp) + movdqa %xmm5,-48(%ebp) + movdqa %xmm6,-32(%ebp) + movdqa %xmm7,-16(%ebp) + movdqa 32(%eax),%xmm7 + leal 128(%esp),%ebx + pshufd $0,%xmm3,%xmm0 + pshufd $85,%xmm3,%xmm1 + pshufd $170,%xmm3,%xmm2 + pshufd $255,%xmm3,%xmm3 + pshufd $0,%xmm7,%xmm4 + pshufd $85,%xmm7,%xmm5 + pshufd $170,%xmm7,%xmm6 + pshufd $255,%xmm7,%xmm7 + movdqa %xmm0,(%ebp) + movdqa %xmm1,16(%ebp) + movdqa %xmm2,32(%ebp) + movdqa %xmm3,48(%ebp) + movdqa %xmm4,-128(%ebp) + movdqa %xmm5,-112(%ebp) + movdqa %xmm6,-96(%ebp) + movdqa %xmm7,-80(%ebp) + leal 128(%esi),%esi + leal 128(%edi),%edi + jmp .L007outer_loop +.align 16 +.L007outer_loop: + movdqa -112(%ebp),%xmm1 + movdqa -96(%ebp),%xmm2 + movdqa -80(%ebp),%xmm3 + movdqa -48(%ebp),%xmm5 + movdqa -32(%ebp),%xmm6 + movdqa -16(%ebp),%xmm7 + movdqa %xmm1,-112(%ebx) + movdqa %xmm2,-96(%ebx) + movdqa %xmm3,-80(%ebx) + movdqa %xmm5,-48(%ebx) + movdqa %xmm6,-32(%ebx) + movdqa %xmm7,-16(%ebx) + movdqa 32(%ebp),%xmm2 + movdqa 48(%ebp),%xmm3 + movdqa 64(%ebp),%xmm4 + movdqa 80(%ebp),%xmm5 + movdqa 96(%ebp),%xmm6 + movdqa 112(%ebp),%xmm7 + paddd 64(%eax),%xmm4 + movdqa %xmm2,32(%ebx) + movdqa %xmm3,48(%ebx) + movdqa %xmm4,64(%ebx) 
+ movdqa %xmm5,80(%ebx) + movdqa %xmm6,96(%ebx) + movdqa %xmm7,112(%ebx) + movdqa %xmm4,64(%ebp) + movdqa -128(%ebp),%xmm0 + movdqa %xmm4,%xmm6 + movdqa -64(%ebp),%xmm3 + movdqa (%ebp),%xmm4 + movdqa 16(%ebp),%xmm5 + movl $10,%edx + nop +.align 16 +.L008loop: + paddd %xmm3,%xmm0 + movdqa %xmm3,%xmm2 + pxor %xmm0,%xmm6 + pshufb (%eax),%xmm6 + paddd %xmm6,%xmm4 + pxor %xmm4,%xmm2 + movdqa -48(%ebx),%xmm3 + movdqa %xmm2,%xmm1 + pslld $12,%xmm2 + psrld $20,%xmm1 + por %xmm1,%xmm2 + movdqa -112(%ebx),%xmm1 + paddd %xmm2,%xmm0 + movdqa 80(%ebx),%xmm7 + pxor %xmm0,%xmm6 + movdqa %xmm0,-128(%ebx) + pshufb 16(%eax),%xmm6 + paddd %xmm6,%xmm4 + movdqa %xmm6,64(%ebx) + pxor %xmm4,%xmm2 + paddd %xmm3,%xmm1 + movdqa %xmm2,%xmm0 + pslld $7,%xmm2 + psrld $25,%xmm0 + pxor %xmm1,%xmm7 + por %xmm0,%xmm2 + movdqa %xmm4,(%ebx) + pshufb (%eax),%xmm7 + movdqa %xmm2,-64(%ebx) + paddd %xmm7,%xmm5 + movdqa 32(%ebx),%xmm4 + pxor %xmm5,%xmm3 + movdqa -32(%ebx),%xmm2 + movdqa %xmm3,%xmm0 + pslld $12,%xmm3 + psrld $20,%xmm0 + por %xmm0,%xmm3 + movdqa -96(%ebx),%xmm0 + paddd %xmm3,%xmm1 + movdqa 96(%ebx),%xmm6 + pxor %xmm1,%xmm7 + movdqa %xmm1,-112(%ebx) + pshufb 16(%eax),%xmm7 + paddd %xmm7,%xmm5 + movdqa %xmm7,80(%ebx) + pxor %xmm5,%xmm3 + paddd %xmm2,%xmm0 + movdqa %xmm3,%xmm1 + pslld $7,%xmm3 + psrld $25,%xmm1 + pxor %xmm0,%xmm6 + por %xmm1,%xmm3 + movdqa %xmm5,16(%ebx) + pshufb (%eax),%xmm6 + movdqa %xmm3,-48(%ebx) + paddd %xmm6,%xmm4 + movdqa 48(%ebx),%xmm5 + pxor %xmm4,%xmm2 + movdqa -16(%ebx),%xmm3 + movdqa %xmm2,%xmm1 + pslld $12,%xmm2 + psrld $20,%xmm1 + por %xmm1,%xmm2 + movdqa -80(%ebx),%xmm1 + paddd %xmm2,%xmm0 + movdqa 112(%ebx),%xmm7 + pxor %xmm0,%xmm6 + movdqa %xmm0,-96(%ebx) + pshufb 16(%eax),%xmm6 + paddd %xmm6,%xmm4 + movdqa %xmm6,96(%ebx) + pxor %xmm4,%xmm2 + paddd %xmm3,%xmm1 + movdqa %xmm2,%xmm0 + pslld $7,%xmm2 + psrld $25,%xmm0 + pxor %xmm1,%xmm7 + por %xmm0,%xmm2 + pshufb (%eax),%xmm7 + movdqa %xmm2,-32(%ebx) + paddd %xmm7,%xmm5 + pxor %xmm5,%xmm3 + movdqa 
-48(%ebx),%xmm2 + movdqa %xmm3,%xmm0 + pslld $12,%xmm3 + psrld $20,%xmm0 + por %xmm0,%xmm3 + movdqa -128(%ebx),%xmm0 + paddd %xmm3,%xmm1 + pxor %xmm1,%xmm7 + movdqa %xmm1,-80(%ebx) + pshufb 16(%eax),%xmm7 + paddd %xmm7,%xmm5 + movdqa %xmm7,%xmm6 + pxor %xmm5,%xmm3 + paddd %xmm2,%xmm0 + movdqa %xmm3,%xmm1 + pslld $7,%xmm3 + psrld $25,%xmm1 + pxor %xmm0,%xmm6 + por %xmm1,%xmm3 + pshufb (%eax),%xmm6 + movdqa %xmm3,-16(%ebx) + paddd %xmm6,%xmm4 + pxor %xmm4,%xmm2 + movdqa -32(%ebx),%xmm3 + movdqa %xmm2,%xmm1 + pslld $12,%xmm2 + psrld $20,%xmm1 + por %xmm1,%xmm2 + movdqa -112(%ebx),%xmm1 + paddd %xmm2,%xmm0 + movdqa 64(%ebx),%xmm7 + pxor %xmm0,%xmm6 + movdqa %xmm0,-128(%ebx) + pshufb 16(%eax),%xmm6 + paddd %xmm6,%xmm4 + movdqa %xmm6,112(%ebx) + pxor %xmm4,%xmm2 + paddd %xmm3,%xmm1 + movdqa %xmm2,%xmm0 + pslld $7,%xmm2 + psrld $25,%xmm0 + pxor %xmm1,%xmm7 + por %xmm0,%xmm2 + movdqa %xmm4,32(%ebx) + pshufb (%eax),%xmm7 + movdqa %xmm2,-48(%ebx) + paddd %xmm7,%xmm5 + movdqa (%ebx),%xmm4 + pxor %xmm5,%xmm3 + movdqa -16(%ebx),%xmm2 + movdqa %xmm3,%xmm0 + pslld $12,%xmm3 + psrld $20,%xmm0 + por %xmm0,%xmm3 + movdqa -96(%ebx),%xmm0 + paddd %xmm3,%xmm1 + movdqa 80(%ebx),%xmm6 + pxor %xmm1,%xmm7 + movdqa %xmm1,-112(%ebx) + pshufb 16(%eax),%xmm7 + paddd %xmm7,%xmm5 + movdqa %xmm7,64(%ebx) + pxor %xmm5,%xmm3 + paddd %xmm2,%xmm0 + movdqa %xmm3,%xmm1 + pslld $7,%xmm3 + psrld $25,%xmm1 + pxor %xmm0,%xmm6 + por %xmm1,%xmm3 + movdqa %xmm5,48(%ebx) + pshufb (%eax),%xmm6 + movdqa %xmm3,-32(%ebx) + paddd %xmm6,%xmm4 + movdqa 16(%ebx),%xmm5 + pxor %xmm4,%xmm2 + movdqa -64(%ebx),%xmm3 + movdqa %xmm2,%xmm1 + pslld $12,%xmm2 + psrld $20,%xmm1 + por %xmm1,%xmm2 + movdqa -80(%ebx),%xmm1 + paddd %xmm2,%xmm0 + movdqa 96(%ebx),%xmm7 + pxor %xmm0,%xmm6 + movdqa %xmm0,-96(%ebx) + pshufb 16(%eax),%xmm6 + paddd %xmm6,%xmm4 + movdqa %xmm6,80(%ebx) + pxor %xmm4,%xmm2 + paddd %xmm3,%xmm1 + movdqa %xmm2,%xmm0 + pslld $7,%xmm2 + psrld $25,%xmm0 + pxor %xmm1,%xmm7 + por %xmm0,%xmm2 + pshufb (%eax),%xmm7 + 
movdqa %xmm2,-16(%ebx) + paddd %xmm7,%xmm5 + pxor %xmm5,%xmm3 + movdqa %xmm3,%xmm0 + pslld $12,%xmm3 + psrld $20,%xmm0 + por %xmm0,%xmm3 + movdqa -128(%ebx),%xmm0 + paddd %xmm3,%xmm1 + movdqa 64(%ebx),%xmm6 + pxor %xmm1,%xmm7 + movdqa %xmm1,-80(%ebx) + pshufb 16(%eax),%xmm7 + paddd %xmm7,%xmm5 + movdqa %xmm7,96(%ebx) + pxor %xmm5,%xmm3 + movdqa %xmm3,%xmm1 + pslld $7,%xmm3 + psrld $25,%xmm1 + por %xmm1,%xmm3 + decl %edx + jnz .L008loop + movdqa %xmm3,-64(%ebx) + movdqa %xmm4,(%ebx) + movdqa %xmm5,16(%ebx) + movdqa %xmm6,64(%ebx) + movdqa %xmm7,96(%ebx) + movdqa -112(%ebx),%xmm1 + movdqa -96(%ebx),%xmm2 + movdqa -80(%ebx),%xmm3 + paddd -128(%ebp),%xmm0 + paddd -112(%ebp),%xmm1 + paddd -96(%ebp),%xmm2 + paddd -80(%ebp),%xmm3 + movdqa %xmm0,%xmm6 + punpckldq %xmm1,%xmm0 + movdqa %xmm2,%xmm7 + punpckldq %xmm3,%xmm2 + punpckhdq %xmm1,%xmm6 + punpckhdq %xmm3,%xmm7 + movdqa %xmm0,%xmm1 + punpcklqdq %xmm2,%xmm0 + movdqa %xmm6,%xmm3 + punpcklqdq %xmm7,%xmm6 + punpckhqdq %xmm2,%xmm1 + punpckhqdq %xmm7,%xmm3 + movdqu -128(%esi),%xmm4 + movdqu -64(%esi),%xmm5 + movdqu (%esi),%xmm2 + movdqu 64(%esi),%xmm7 + leal 16(%esi),%esi + pxor %xmm0,%xmm4 + movdqa -64(%ebx),%xmm0 + pxor %xmm1,%xmm5 + movdqa -48(%ebx),%xmm1 + pxor %xmm2,%xmm6 + movdqa -32(%ebx),%xmm2 + pxor %xmm3,%xmm7 + movdqa -16(%ebx),%xmm3 + movdqu %xmm4,-128(%edi) + movdqu %xmm5,-64(%edi) + movdqu %xmm6,(%edi) + movdqu %xmm7,64(%edi) + leal 16(%edi),%edi + paddd -64(%ebp),%xmm0 + paddd -48(%ebp),%xmm1 + paddd -32(%ebp),%xmm2 + paddd -16(%ebp),%xmm3 + movdqa %xmm0,%xmm6 + punpckldq %xmm1,%xmm0 + movdqa %xmm2,%xmm7 + punpckldq %xmm3,%xmm2 + punpckhdq %xmm1,%xmm6 + punpckhdq %xmm3,%xmm7 + movdqa %xmm0,%xmm1 + punpcklqdq %xmm2,%xmm0 + movdqa %xmm6,%xmm3 + punpcklqdq %xmm7,%xmm6 + punpckhqdq %xmm2,%xmm1 + punpckhqdq %xmm7,%xmm3 + movdqu -128(%esi),%xmm4 + movdqu -64(%esi),%xmm5 + movdqu (%esi),%xmm2 + movdqu 64(%esi),%xmm7 + leal 16(%esi),%esi + pxor %xmm0,%xmm4 + movdqa (%ebx),%xmm0 + pxor %xmm1,%xmm5 + movdqa 
16(%ebx),%xmm1 + pxor %xmm2,%xmm6 + movdqa 32(%ebx),%xmm2 + pxor %xmm3,%xmm7 + movdqa 48(%ebx),%xmm3 + movdqu %xmm4,-128(%edi) + movdqu %xmm5,-64(%edi) + movdqu %xmm6,(%edi) + movdqu %xmm7,64(%edi) + leal 16(%edi),%edi + paddd (%ebp),%xmm0 + paddd 16(%ebp),%xmm1 + paddd 32(%ebp),%xmm2 + paddd 48(%ebp),%xmm3 + movdqa %xmm0,%xmm6 + punpckldq %xmm1,%xmm0 + movdqa %xmm2,%xmm7 + punpckldq %xmm3,%xmm2 + punpckhdq %xmm1,%xmm6 + punpckhdq %xmm3,%xmm7 + movdqa %xmm0,%xmm1 + punpcklqdq %xmm2,%xmm0 + movdqa %xmm6,%xmm3 + punpcklqdq %xmm7,%xmm6 + punpckhqdq %xmm2,%xmm1 + punpckhqdq %xmm7,%xmm3 + movdqu -128(%esi),%xmm4 + movdqu -64(%esi),%xmm5 + movdqu (%esi),%xmm2 + movdqu 64(%esi),%xmm7 + leal 16(%esi),%esi + pxor %xmm0,%xmm4 + movdqa 64(%ebx),%xmm0 + pxor %xmm1,%xmm5 + movdqa 80(%ebx),%xmm1 + pxor %xmm2,%xmm6 + movdqa 96(%ebx),%xmm2 + pxor %xmm3,%xmm7 + movdqa 112(%ebx),%xmm3 + movdqu %xmm4,-128(%edi) + movdqu %xmm5,-64(%edi) + movdqu %xmm6,(%edi) + movdqu %xmm7,64(%edi) + leal 16(%edi),%edi + paddd 64(%ebp),%xmm0 + paddd 80(%ebp),%xmm1 + paddd 96(%ebp),%xmm2 + paddd 112(%ebp),%xmm3 + movdqa %xmm0,%xmm6 + punpckldq %xmm1,%xmm0 + movdqa %xmm2,%xmm7 + punpckldq %xmm3,%xmm2 + punpckhdq %xmm1,%xmm6 + punpckhdq %xmm3,%xmm7 + movdqa %xmm0,%xmm1 + punpcklqdq %xmm2,%xmm0 + movdqa %xmm6,%xmm3 + punpcklqdq %xmm7,%xmm6 + punpckhqdq %xmm2,%xmm1 + punpckhqdq %xmm7,%xmm3 + movdqu -128(%esi),%xmm4 + movdqu -64(%esi),%xmm5 + movdqu (%esi),%xmm2 + movdqu 64(%esi),%xmm7 + leal 208(%esi),%esi + pxor %xmm0,%xmm4 + pxor %xmm1,%xmm5 + pxor %xmm2,%xmm6 + pxor %xmm3,%xmm7 + movdqu %xmm4,-128(%edi) + movdqu %xmm5,-64(%edi) + movdqu %xmm6,(%edi) + movdqu %xmm7,64(%edi) + leal 208(%edi),%edi + subl $256,%ecx + jnc .L007outer_loop + addl $256,%ecx + jz .L009done + movl 520(%esp),%ebx + leal -128(%esi),%esi + movl 516(%esp),%edx + leal -128(%edi),%edi + movd 64(%ebp),%xmm2 + movdqu (%ebx),%xmm3 + paddd 96(%eax),%xmm2 + pand 112(%eax),%xmm3 + por %xmm2,%xmm3 +.L0061x: + movdqa 32(%eax),%xmm0 + movdqu 
(%edx),%xmm1 + movdqu 16(%edx),%xmm2 + movdqa (%eax),%xmm6 + movdqa 16(%eax),%xmm7 + movl %ebp,48(%esp) + movdqa %xmm0,(%esp) + movdqa %xmm1,16(%esp) + movdqa %xmm2,32(%esp) + movdqa %xmm3,48(%esp) + movl $10,%edx + jmp .L010loop1x +.align 16 +.L011outer1x: + movdqa 80(%eax),%xmm3 + movdqa (%esp),%xmm0 + movdqa 16(%esp),%xmm1 + movdqa 32(%esp),%xmm2 + paddd 48(%esp),%xmm3 + movl $10,%edx + movdqa %xmm3,48(%esp) + jmp .L010loop1x +.align 16 +.L010loop1x: + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm6,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $20,%xmm1 + pslld $12,%xmm4 + por %xmm4,%xmm1 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm7,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $25,%xmm1 + pslld $7,%xmm4 + por %xmm4,%xmm1 + pshufd $78,%xmm2,%xmm2 + pshufd $57,%xmm1,%xmm1 + pshufd $147,%xmm3,%xmm3 + nop + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm6,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $20,%xmm1 + pslld $12,%xmm4 + por %xmm4,%xmm1 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm7,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $25,%xmm1 + pslld $7,%xmm4 + por %xmm4,%xmm1 + pshufd $78,%xmm2,%xmm2 + pshufd $147,%xmm1,%xmm1 + pshufd $57,%xmm3,%xmm3 + decl %edx + jnz .L010loop1x + paddd (%esp),%xmm0 + paddd 16(%esp),%xmm1 + paddd 32(%esp),%xmm2 + paddd 48(%esp),%xmm3 + cmpl $64,%ecx + jb .L012tail + movdqu (%esi),%xmm4 + movdqu 16(%esi),%xmm5 + pxor %xmm4,%xmm0 + movdqu 32(%esi),%xmm4 + pxor %xmm5,%xmm1 + movdqu 48(%esi),%xmm5 + pxor %xmm4,%xmm2 + pxor %xmm5,%xmm3 + leal 64(%esi),%esi + movdqu %xmm0,(%edi) + movdqu %xmm1,16(%edi) + movdqu %xmm2,32(%edi) + movdqu %xmm3,48(%edi) + leal 64(%edi),%edi + subl $64,%ecx + jnz .L011outer1x + jmp .L009done +.L012tail: + movdqa %xmm0,(%esp) + movdqa %xmm1,16(%esp) + movdqa %xmm2,32(%esp) + movdqa %xmm3,48(%esp) + xorl %eax,%eax + xorl %edx,%edx + xorl %ebp,%ebp +.L013tail_loop: + movb 
(%esp,%ebp,1),%al + movb (%esi,%ebp,1),%dl + leal 1(%ebp),%ebp + xorb %dl,%al + movb %al,-1(%edi,%ebp,1) + decl %ecx + jnz .L013tail_loop +.L009done: + movl 512(%esp),%esp + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size ChaCha20_ctr32_ssse3,.-.L_ChaCha20_ctr32_ssse3_begin +.align 64 +.Lssse3_data: +.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 +.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 +.long 1634760805,857760878,2036477234,1797285236 +.long 0,1,2,3 +.long 4,4,4,4 +.long 1,0,0,0 +.long 4,0,0,0 +.long 0,-1,-1,-1 +.align 64 +.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54 +.byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32 +.byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 +.byte 114,103,62,0 +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) diff --git a/third_party/boringssl/gen/crypto/chacha-x86-win.asm b/third_party/boringssl/gen/crypto/chacha-x86-win.asm new file mode 100644 index 00000000..799a6aa2 --- /dev/null +++ b/third_party/boringssl/gen/crypto/chacha-x86-win.asm @@ -0,0 +1,966 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_internal_x86_win_asm.inc" +%endif +%ifidn __OUTPUT_FORMAT__, win32 +%ifidn __OUTPUT_FORMAT__,obj +section code use32 class=code align=64 +%elifidn __OUTPUT_FORMAT__,win32 +$@feat.00 equ 1 +section .text code align=64 +%else +section .text code +%endif +global _ChaCha20_ctr32_nohw +align 16 +_ChaCha20_ctr32_nohw: +L$_ChaCha20_ctr32_nohw_begin: + push ebp + push ebx + push esi + push edi + mov esi,DWORD [32+esp] + mov edi,DWORD [36+esp] + sub esp,132 + mov eax,DWORD [esi] + mov ebx,DWORD [4+esi] + mov ecx,DWORD [8+esi] + mov edx,DWORD [12+esi] + mov DWORD [80+esp],eax + mov DWORD [84+esp],ebx + mov DWORD [88+esp],ecx + mov DWORD [92+esp],edx + mov eax,DWORD [16+esi] + mov ebx,DWORD [20+esi] + mov ecx,DWORD [24+esi] + mov edx,DWORD [28+esi] + mov DWORD [96+esp],eax + mov DWORD [100+esp],ebx + mov DWORD [104+esp],ecx + mov DWORD [108+esp],edx + mov eax,DWORD [edi] + mov ebx,DWORD [4+edi] + mov ecx,DWORD [8+edi] + mov edx,DWORD [12+edi] + sub eax,1 + mov DWORD [112+esp],eax + mov DWORD [116+esp],ebx + mov DWORD [120+esp],ecx + mov DWORD [124+esp],edx + jmp NEAR L$000entry +align 16 +L$001outer_loop: + mov DWORD [156+esp],ebx + mov DWORD [152+esp],eax + mov DWORD [160+esp],ecx +L$000entry: + mov eax,1634760805 + mov DWORD [4+esp],857760878 + mov DWORD [8+esp],2036477234 + mov DWORD [12+esp],1797285236 + mov ebx,DWORD [84+esp] + mov ebp,DWORD [88+esp] + mov ecx,DWORD [104+esp] + mov esi,DWORD [108+esp] + mov edx,DWORD [116+esp] + mov edi,DWORD [120+esp] + mov DWORD [20+esp],ebx + mov DWORD [24+esp],ebp + mov DWORD [40+esp],ecx + mov DWORD [44+esp],esi + mov DWORD [52+esp],edx + mov DWORD [56+esp],edi + mov ebx,DWORD [92+esp] + mov edi,DWORD [124+esp] + mov edx,DWORD [112+esp] + mov ebp,DWORD [80+esp] + mov ecx,DWORD [96+esp] + mov esi,DWORD [100+esp] + add edx,1 + mov DWORD [28+esp],ebx + mov DWORD [60+esp],edi + mov DWORD [112+esp],edx + mov ebx,10 + jmp NEAR L$002loop +align 16 +L$002loop: + add eax,ebp + 
mov DWORD [128+esp],ebx + mov ebx,ebp + xor edx,eax + rol edx,16 + add ecx,edx + xor ebx,ecx + mov edi,DWORD [52+esp] + rol ebx,12 + mov ebp,DWORD [20+esp] + add eax,ebx + xor edx,eax + mov DWORD [esp],eax + rol edx,8 + mov eax,DWORD [4+esp] + add ecx,edx + mov DWORD [48+esp],edx + xor ebx,ecx + add eax,ebp + rol ebx,7 + xor edi,eax + mov DWORD [32+esp],ecx + rol edi,16 + mov DWORD [16+esp],ebx + add esi,edi + mov ecx,DWORD [40+esp] + xor ebp,esi + mov edx,DWORD [56+esp] + rol ebp,12 + mov ebx,DWORD [24+esp] + add eax,ebp + xor edi,eax + mov DWORD [4+esp],eax + rol edi,8 + mov eax,DWORD [8+esp] + add esi,edi + mov DWORD [52+esp],edi + xor ebp,esi + add eax,ebx + rol ebp,7 + xor edx,eax + mov DWORD [36+esp],esi + rol edx,16 + mov DWORD [20+esp],ebp + add ecx,edx + mov esi,DWORD [44+esp] + xor ebx,ecx + mov edi,DWORD [60+esp] + rol ebx,12 + mov ebp,DWORD [28+esp] + add eax,ebx + xor edx,eax + mov DWORD [8+esp],eax + rol edx,8 + mov eax,DWORD [12+esp] + add ecx,edx + mov DWORD [56+esp],edx + xor ebx,ecx + add eax,ebp + rol ebx,7 + xor edi,eax + rol edi,16 + mov DWORD [24+esp],ebx + add esi,edi + xor ebp,esi + rol ebp,12 + mov ebx,DWORD [20+esp] + add eax,ebp + xor edi,eax + mov DWORD [12+esp],eax + rol edi,8 + mov eax,DWORD [esp] + add esi,edi + mov edx,edi + xor ebp,esi + add eax,ebx + rol ebp,7 + xor edx,eax + rol edx,16 + mov DWORD [28+esp],ebp + add ecx,edx + xor ebx,ecx + mov edi,DWORD [48+esp] + rol ebx,12 + mov ebp,DWORD [24+esp] + add eax,ebx + xor edx,eax + mov DWORD [esp],eax + rol edx,8 + mov eax,DWORD [4+esp] + add ecx,edx + mov DWORD [60+esp],edx + xor ebx,ecx + add eax,ebp + rol ebx,7 + xor edi,eax + mov DWORD [40+esp],ecx + rol edi,16 + mov DWORD [20+esp],ebx + add esi,edi + mov ecx,DWORD [32+esp] + xor ebp,esi + mov edx,DWORD [52+esp] + rol ebp,12 + mov ebx,DWORD [28+esp] + add eax,ebp + xor edi,eax + mov DWORD [4+esp],eax + rol edi,8 + mov eax,DWORD [8+esp] + add esi,edi + mov DWORD [48+esp],edi + xor ebp,esi + add eax,ebx + rol ebp,7 + xor edx,eax + 
mov DWORD [44+esp],esi + rol edx,16 + mov DWORD [24+esp],ebp + add ecx,edx + mov esi,DWORD [36+esp] + xor ebx,ecx + mov edi,DWORD [56+esp] + rol ebx,12 + mov ebp,DWORD [16+esp] + add eax,ebx + xor edx,eax + mov DWORD [8+esp],eax + rol edx,8 + mov eax,DWORD [12+esp] + add ecx,edx + mov DWORD [52+esp],edx + xor ebx,ecx + add eax,ebp + rol ebx,7 + xor edi,eax + rol edi,16 + mov DWORD [28+esp],ebx + add esi,edi + xor ebp,esi + mov edx,DWORD [48+esp] + rol ebp,12 + mov ebx,DWORD [128+esp] + add eax,ebp + xor edi,eax + mov DWORD [12+esp],eax + rol edi,8 + mov eax,DWORD [esp] + add esi,edi + mov DWORD [56+esp],edi + xor ebp,esi + rol ebp,7 + dec ebx + jnz NEAR L$002loop + mov ebx,DWORD [160+esp] + add eax,1634760805 + add ebp,DWORD [80+esp] + add ecx,DWORD [96+esp] + add esi,DWORD [100+esp] + cmp ebx,64 + jb NEAR L$003tail + mov ebx,DWORD [156+esp] + add edx,DWORD [112+esp] + add edi,DWORD [120+esp] + xor eax,DWORD [ebx] + xor ebp,DWORD [16+ebx] + mov DWORD [esp],eax + mov eax,DWORD [152+esp] + xor ecx,DWORD [32+ebx] + xor esi,DWORD [36+ebx] + xor edx,DWORD [48+ebx] + xor edi,DWORD [56+ebx] + mov DWORD [16+eax],ebp + mov DWORD [32+eax],ecx + mov DWORD [36+eax],esi + mov DWORD [48+eax],edx + mov DWORD [56+eax],edi + mov ebp,DWORD [4+esp] + mov ecx,DWORD [8+esp] + mov esi,DWORD [12+esp] + mov edx,DWORD [20+esp] + mov edi,DWORD [24+esp] + add ebp,857760878 + add ecx,2036477234 + add esi,1797285236 + add edx,DWORD [84+esp] + add edi,DWORD [88+esp] + xor ebp,DWORD [4+ebx] + xor ecx,DWORD [8+ebx] + xor esi,DWORD [12+ebx] + xor edx,DWORD [20+ebx] + xor edi,DWORD [24+ebx] + mov DWORD [4+eax],ebp + mov DWORD [8+eax],ecx + mov DWORD [12+eax],esi + mov DWORD [20+eax],edx + mov DWORD [24+eax],edi + mov ebp,DWORD [28+esp] + mov ecx,DWORD [40+esp] + mov esi,DWORD [44+esp] + mov edx,DWORD [52+esp] + mov edi,DWORD [60+esp] + add ebp,DWORD [92+esp] + add ecx,DWORD [104+esp] + add esi,DWORD [108+esp] + add edx,DWORD [116+esp] + add edi,DWORD [124+esp] + xor ebp,DWORD [28+ebx] + xor 
ecx,DWORD [40+ebx] + xor esi,DWORD [44+ebx] + xor edx,DWORD [52+ebx] + xor edi,DWORD [60+ebx] + lea ebx,[64+ebx] + mov DWORD [28+eax],ebp + mov ebp,DWORD [esp] + mov DWORD [40+eax],ecx + mov ecx,DWORD [160+esp] + mov DWORD [44+eax],esi + mov DWORD [52+eax],edx + mov DWORD [60+eax],edi + mov DWORD [eax],ebp + lea eax,[64+eax] + sub ecx,64 + jnz NEAR L$001outer_loop + jmp NEAR L$004done +L$003tail: + add edx,DWORD [112+esp] + add edi,DWORD [120+esp] + mov DWORD [esp],eax + mov DWORD [16+esp],ebp + mov DWORD [32+esp],ecx + mov DWORD [36+esp],esi + mov DWORD [48+esp],edx + mov DWORD [56+esp],edi + mov ebp,DWORD [4+esp] + mov ecx,DWORD [8+esp] + mov esi,DWORD [12+esp] + mov edx,DWORD [20+esp] + mov edi,DWORD [24+esp] + add ebp,857760878 + add ecx,2036477234 + add esi,1797285236 + add edx,DWORD [84+esp] + add edi,DWORD [88+esp] + mov DWORD [4+esp],ebp + mov DWORD [8+esp],ecx + mov DWORD [12+esp],esi + mov DWORD [20+esp],edx + mov DWORD [24+esp],edi + mov ebp,DWORD [28+esp] + mov ecx,DWORD [40+esp] + mov esi,DWORD [44+esp] + mov edx,DWORD [52+esp] + mov edi,DWORD [60+esp] + add ebp,DWORD [92+esp] + add ecx,DWORD [104+esp] + add esi,DWORD [108+esp] + add edx,DWORD [116+esp] + add edi,DWORD [124+esp] + mov DWORD [28+esp],ebp + mov ebp,DWORD [156+esp] + mov DWORD [40+esp],ecx + mov ecx,DWORD [152+esp] + mov DWORD [44+esp],esi + xor esi,esi + mov DWORD [52+esp],edx + mov DWORD [60+esp],edi + xor eax,eax + xor edx,edx +L$005tail_loop: + mov al,BYTE [ebp*1+esi] + mov dl,BYTE [esi*1+esp] + lea esi,[1+esi] + xor al,dl + mov BYTE [esi*1+ecx-1],al + dec ebx + jnz NEAR L$005tail_loop +L$004done: + add esp,132 + pop edi + pop esi + pop ebx + pop ebp + ret +global _ChaCha20_ctr32_ssse3 +align 16 +_ChaCha20_ctr32_ssse3: +L$_ChaCha20_ctr32_ssse3_begin: + push ebp + push ebx + push esi + push edi + call L$pic_point +L$pic_point: + pop eax + mov edi,DWORD [20+esp] + mov esi,DWORD [24+esp] + mov ecx,DWORD [28+esp] + mov edx,DWORD [32+esp] + mov ebx,DWORD [36+esp] + mov ebp,esp + sub 
esp,524 + and esp,-64 + mov DWORD [512+esp],ebp + lea eax,[(L$ssse3_data-L$pic_point)+eax] + movdqu xmm3,[ebx] + cmp ecx,256 + jb NEAR L$0061x + mov DWORD [516+esp],edx + mov DWORD [520+esp],ebx + sub ecx,256 + lea ebp,[384+esp] + movdqu xmm7,[edx] + pshufd xmm0,xmm3,0 + pshufd xmm1,xmm3,85 + pshufd xmm2,xmm3,170 + pshufd xmm3,xmm3,255 + paddd xmm0,[48+eax] + pshufd xmm4,xmm7,0 + pshufd xmm5,xmm7,85 + psubd xmm0,[64+eax] + pshufd xmm6,xmm7,170 + pshufd xmm7,xmm7,255 + movdqa [64+ebp],xmm0 + movdqa [80+ebp],xmm1 + movdqa [96+ebp],xmm2 + movdqa [112+ebp],xmm3 + movdqu xmm3,[16+edx] + movdqa [ebp-64],xmm4 + movdqa [ebp-48],xmm5 + movdqa [ebp-32],xmm6 + movdqa [ebp-16],xmm7 + movdqa xmm7,[32+eax] + lea ebx,[128+esp] + pshufd xmm0,xmm3,0 + pshufd xmm1,xmm3,85 + pshufd xmm2,xmm3,170 + pshufd xmm3,xmm3,255 + pshufd xmm4,xmm7,0 + pshufd xmm5,xmm7,85 + pshufd xmm6,xmm7,170 + pshufd xmm7,xmm7,255 + movdqa [ebp],xmm0 + movdqa [16+ebp],xmm1 + movdqa [32+ebp],xmm2 + movdqa [48+ebp],xmm3 + movdqa [ebp-128],xmm4 + movdqa [ebp-112],xmm5 + movdqa [ebp-96],xmm6 + movdqa [ebp-80],xmm7 + lea esi,[128+esi] + lea edi,[128+edi] + jmp NEAR L$007outer_loop +align 16 +L$007outer_loop: + movdqa xmm1,[ebp-112] + movdqa xmm2,[ebp-96] + movdqa xmm3,[ebp-80] + movdqa xmm5,[ebp-48] + movdqa xmm6,[ebp-32] + movdqa xmm7,[ebp-16] + movdqa [ebx-112],xmm1 + movdqa [ebx-96],xmm2 + movdqa [ebx-80],xmm3 + movdqa [ebx-48],xmm5 + movdqa [ebx-32],xmm6 + movdqa [ebx-16],xmm7 + movdqa xmm2,[32+ebp] + movdqa xmm3,[48+ebp] + movdqa xmm4,[64+ebp] + movdqa xmm5,[80+ebp] + movdqa xmm6,[96+ebp] + movdqa xmm7,[112+ebp] + paddd xmm4,[64+eax] + movdqa [32+ebx],xmm2 + movdqa [48+ebx],xmm3 + movdqa [64+ebx],xmm4 + movdqa [80+ebx],xmm5 + movdqa [96+ebx],xmm6 + movdqa [112+ebx],xmm7 + movdqa [64+ebp],xmm4 + movdqa xmm0,[ebp-128] + movdqa xmm6,xmm4 + movdqa xmm3,[ebp-64] + movdqa xmm4,[ebp] + movdqa xmm5,[16+ebp] + mov edx,10 + nop +align 16 +L$008loop: + paddd xmm0,xmm3 + movdqa xmm2,xmm3 + pxor xmm6,xmm0 + pshufb 
xmm6,[eax] + paddd xmm4,xmm6 + pxor xmm2,xmm4 + movdqa xmm3,[ebx-48] + movdqa xmm1,xmm2 + pslld xmm2,12 + psrld xmm1,20 + por xmm2,xmm1 + movdqa xmm1,[ebx-112] + paddd xmm0,xmm2 + movdqa xmm7,[80+ebx] + pxor xmm6,xmm0 + movdqa [ebx-128],xmm0 + pshufb xmm6,[16+eax] + paddd xmm4,xmm6 + movdqa [64+ebx],xmm6 + pxor xmm2,xmm4 + paddd xmm1,xmm3 + movdqa xmm0,xmm2 + pslld xmm2,7 + psrld xmm0,25 + pxor xmm7,xmm1 + por xmm2,xmm0 + movdqa [ebx],xmm4 + pshufb xmm7,[eax] + movdqa [ebx-64],xmm2 + paddd xmm5,xmm7 + movdqa xmm4,[32+ebx] + pxor xmm3,xmm5 + movdqa xmm2,[ebx-32] + movdqa xmm0,xmm3 + pslld xmm3,12 + psrld xmm0,20 + por xmm3,xmm0 + movdqa xmm0,[ebx-96] + paddd xmm1,xmm3 + movdqa xmm6,[96+ebx] + pxor xmm7,xmm1 + movdqa [ebx-112],xmm1 + pshufb xmm7,[16+eax] + paddd xmm5,xmm7 + movdqa [80+ebx],xmm7 + pxor xmm3,xmm5 + paddd xmm0,xmm2 + movdqa xmm1,xmm3 + pslld xmm3,7 + psrld xmm1,25 + pxor xmm6,xmm0 + por xmm3,xmm1 + movdqa [16+ebx],xmm5 + pshufb xmm6,[eax] + movdqa [ebx-48],xmm3 + paddd xmm4,xmm6 + movdqa xmm5,[48+ebx] + pxor xmm2,xmm4 + movdqa xmm3,[ebx-16] + movdqa xmm1,xmm2 + pslld xmm2,12 + psrld xmm1,20 + por xmm2,xmm1 + movdqa xmm1,[ebx-80] + paddd xmm0,xmm2 + movdqa xmm7,[112+ebx] + pxor xmm6,xmm0 + movdqa [ebx-96],xmm0 + pshufb xmm6,[16+eax] + paddd xmm4,xmm6 + movdqa [96+ebx],xmm6 + pxor xmm2,xmm4 + paddd xmm1,xmm3 + movdqa xmm0,xmm2 + pslld xmm2,7 + psrld xmm0,25 + pxor xmm7,xmm1 + por xmm2,xmm0 + pshufb xmm7,[eax] + movdqa [ebx-32],xmm2 + paddd xmm5,xmm7 + pxor xmm3,xmm5 + movdqa xmm2,[ebx-48] + movdqa xmm0,xmm3 + pslld xmm3,12 + psrld xmm0,20 + por xmm3,xmm0 + movdqa xmm0,[ebx-128] + paddd xmm1,xmm3 + pxor xmm7,xmm1 + movdqa [ebx-80],xmm1 + pshufb xmm7,[16+eax] + paddd xmm5,xmm7 + movdqa xmm6,xmm7 + pxor xmm3,xmm5 + paddd xmm0,xmm2 + movdqa xmm1,xmm3 + pslld xmm3,7 + psrld xmm1,25 + pxor xmm6,xmm0 + por xmm3,xmm1 + pshufb xmm6,[eax] + movdqa [ebx-16],xmm3 + paddd xmm4,xmm6 + pxor xmm2,xmm4 + movdqa xmm3,[ebx-32] + movdqa xmm1,xmm2 + pslld xmm2,12 + psrld 
xmm1,20 + por xmm2,xmm1 + movdqa xmm1,[ebx-112] + paddd xmm0,xmm2 + movdqa xmm7,[64+ebx] + pxor xmm6,xmm0 + movdqa [ebx-128],xmm0 + pshufb xmm6,[16+eax] + paddd xmm4,xmm6 + movdqa [112+ebx],xmm6 + pxor xmm2,xmm4 + paddd xmm1,xmm3 + movdqa xmm0,xmm2 + pslld xmm2,7 + psrld xmm0,25 + pxor xmm7,xmm1 + por xmm2,xmm0 + movdqa [32+ebx],xmm4 + pshufb xmm7,[eax] + movdqa [ebx-48],xmm2 + paddd xmm5,xmm7 + movdqa xmm4,[ebx] + pxor xmm3,xmm5 + movdqa xmm2,[ebx-16] + movdqa xmm0,xmm3 + pslld xmm3,12 + psrld xmm0,20 + por xmm3,xmm0 + movdqa xmm0,[ebx-96] + paddd xmm1,xmm3 + movdqa xmm6,[80+ebx] + pxor xmm7,xmm1 + movdqa [ebx-112],xmm1 + pshufb xmm7,[16+eax] + paddd xmm5,xmm7 + movdqa [64+ebx],xmm7 + pxor xmm3,xmm5 + paddd xmm0,xmm2 + movdqa xmm1,xmm3 + pslld xmm3,7 + psrld xmm1,25 + pxor xmm6,xmm0 + por xmm3,xmm1 + movdqa [48+ebx],xmm5 + pshufb xmm6,[eax] + movdqa [ebx-32],xmm3 + paddd xmm4,xmm6 + movdqa xmm5,[16+ebx] + pxor xmm2,xmm4 + movdqa xmm3,[ebx-64] + movdqa xmm1,xmm2 + pslld xmm2,12 + psrld xmm1,20 + por xmm2,xmm1 + movdqa xmm1,[ebx-80] + paddd xmm0,xmm2 + movdqa xmm7,[96+ebx] + pxor xmm6,xmm0 + movdqa [ebx-96],xmm0 + pshufb xmm6,[16+eax] + paddd xmm4,xmm6 + movdqa [80+ebx],xmm6 + pxor xmm2,xmm4 + paddd xmm1,xmm3 + movdqa xmm0,xmm2 + pslld xmm2,7 + psrld xmm0,25 + pxor xmm7,xmm1 + por xmm2,xmm0 + pshufb xmm7,[eax] + movdqa [ebx-16],xmm2 + paddd xmm5,xmm7 + pxor xmm3,xmm5 + movdqa xmm0,xmm3 + pslld xmm3,12 + psrld xmm0,20 + por xmm3,xmm0 + movdqa xmm0,[ebx-128] + paddd xmm1,xmm3 + movdqa xmm6,[64+ebx] + pxor xmm7,xmm1 + movdqa [ebx-80],xmm1 + pshufb xmm7,[16+eax] + paddd xmm5,xmm7 + movdqa [96+ebx],xmm7 + pxor xmm3,xmm5 + movdqa xmm1,xmm3 + pslld xmm3,7 + psrld xmm1,25 + por xmm3,xmm1 + dec edx + jnz NEAR L$008loop + movdqa [ebx-64],xmm3 + movdqa [ebx],xmm4 + movdqa [16+ebx],xmm5 + movdqa [64+ebx],xmm6 + movdqa [96+ebx],xmm7 + movdqa xmm1,[ebx-112] + movdqa xmm2,[ebx-96] + movdqa xmm3,[ebx-80] + paddd xmm0,[ebp-128] + paddd xmm1,[ebp-112] + paddd xmm2,[ebp-96] + paddd 
xmm3,[ebp-80] + movdqa xmm6,xmm0 + punpckldq xmm0,xmm1 + movdqa xmm7,xmm2 + punpckldq xmm2,xmm3 + punpckhdq xmm6,xmm1 + punpckhdq xmm7,xmm3 + movdqa xmm1,xmm0 + punpcklqdq xmm0,xmm2 + movdqa xmm3,xmm6 + punpcklqdq xmm6,xmm7 + punpckhqdq xmm1,xmm2 + punpckhqdq xmm3,xmm7 + movdqu xmm4,[esi-128] + movdqu xmm5,[esi-64] + movdqu xmm2,[esi] + movdqu xmm7,[64+esi] + lea esi,[16+esi] + pxor xmm4,xmm0 + movdqa xmm0,[ebx-64] + pxor xmm5,xmm1 + movdqa xmm1,[ebx-48] + pxor xmm6,xmm2 + movdqa xmm2,[ebx-32] + pxor xmm7,xmm3 + movdqa xmm3,[ebx-16] + movdqu [edi-128],xmm4 + movdqu [edi-64],xmm5 + movdqu [edi],xmm6 + movdqu [64+edi],xmm7 + lea edi,[16+edi] + paddd xmm0,[ebp-64] + paddd xmm1,[ebp-48] + paddd xmm2,[ebp-32] + paddd xmm3,[ebp-16] + movdqa xmm6,xmm0 + punpckldq xmm0,xmm1 + movdqa xmm7,xmm2 + punpckldq xmm2,xmm3 + punpckhdq xmm6,xmm1 + punpckhdq xmm7,xmm3 + movdqa xmm1,xmm0 + punpcklqdq xmm0,xmm2 + movdqa xmm3,xmm6 + punpcklqdq xmm6,xmm7 + punpckhqdq xmm1,xmm2 + punpckhqdq xmm3,xmm7 + movdqu xmm4,[esi-128] + movdqu xmm5,[esi-64] + movdqu xmm2,[esi] + movdqu xmm7,[64+esi] + lea esi,[16+esi] + pxor xmm4,xmm0 + movdqa xmm0,[ebx] + pxor xmm5,xmm1 + movdqa xmm1,[16+ebx] + pxor xmm6,xmm2 + movdqa xmm2,[32+ebx] + pxor xmm7,xmm3 + movdqa xmm3,[48+ebx] + movdqu [edi-128],xmm4 + movdqu [edi-64],xmm5 + movdqu [edi],xmm6 + movdqu [64+edi],xmm7 + lea edi,[16+edi] + paddd xmm0,[ebp] + paddd xmm1,[16+ebp] + paddd xmm2,[32+ebp] + paddd xmm3,[48+ebp] + movdqa xmm6,xmm0 + punpckldq xmm0,xmm1 + movdqa xmm7,xmm2 + punpckldq xmm2,xmm3 + punpckhdq xmm6,xmm1 + punpckhdq xmm7,xmm3 + movdqa xmm1,xmm0 + punpcklqdq xmm0,xmm2 + movdqa xmm3,xmm6 + punpcklqdq xmm6,xmm7 + punpckhqdq xmm1,xmm2 + punpckhqdq xmm3,xmm7 + movdqu xmm4,[esi-128] + movdqu xmm5,[esi-64] + movdqu xmm2,[esi] + movdqu xmm7,[64+esi] + lea esi,[16+esi] + pxor xmm4,xmm0 + movdqa xmm0,[64+ebx] + pxor xmm5,xmm1 + movdqa xmm1,[80+ebx] + pxor xmm6,xmm2 + movdqa xmm2,[96+ebx] + pxor xmm7,xmm3 + movdqa xmm3,[112+ebx] + movdqu 
[edi-128],xmm4 + movdqu [edi-64],xmm5 + movdqu [edi],xmm6 + movdqu [64+edi],xmm7 + lea edi,[16+edi] + paddd xmm0,[64+ebp] + paddd xmm1,[80+ebp] + paddd xmm2,[96+ebp] + paddd xmm3,[112+ebp] + movdqa xmm6,xmm0 + punpckldq xmm0,xmm1 + movdqa xmm7,xmm2 + punpckldq xmm2,xmm3 + punpckhdq xmm6,xmm1 + punpckhdq xmm7,xmm3 + movdqa xmm1,xmm0 + punpcklqdq xmm0,xmm2 + movdqa xmm3,xmm6 + punpcklqdq xmm6,xmm7 + punpckhqdq xmm1,xmm2 + punpckhqdq xmm3,xmm7 + movdqu xmm4,[esi-128] + movdqu xmm5,[esi-64] + movdqu xmm2,[esi] + movdqu xmm7,[64+esi] + lea esi,[208+esi] + pxor xmm4,xmm0 + pxor xmm5,xmm1 + pxor xmm6,xmm2 + pxor xmm7,xmm3 + movdqu [edi-128],xmm4 + movdqu [edi-64],xmm5 + movdqu [edi],xmm6 + movdqu [64+edi],xmm7 + lea edi,[208+edi] + sub ecx,256 + jnc NEAR L$007outer_loop + add ecx,256 + jz NEAR L$009done + mov ebx,DWORD [520+esp] + lea esi,[esi-128] + mov edx,DWORD [516+esp] + lea edi,[edi-128] + movd xmm2,DWORD [64+ebp] + movdqu xmm3,[ebx] + paddd xmm2,[96+eax] + pand xmm3,[112+eax] + por xmm3,xmm2 +L$0061x: + movdqa xmm0,[32+eax] + movdqu xmm1,[edx] + movdqu xmm2,[16+edx] + movdqa xmm6,[eax] + movdqa xmm7,[16+eax] + mov DWORD [48+esp],ebp + movdqa [esp],xmm0 + movdqa [16+esp],xmm1 + movdqa [32+esp],xmm2 + movdqa [48+esp],xmm3 + mov edx,10 + jmp NEAR L$010loop1x +align 16 +L$011outer1x: + movdqa xmm3,[80+eax] + movdqa xmm0,[esp] + movdqa xmm1,[16+esp] + movdqa xmm2,[32+esp] + paddd xmm3,[48+esp] + mov edx,10 + movdqa [48+esp],xmm3 + jmp NEAR L$010loop1x +align 16 +L$010loop1x: + paddd xmm0,xmm1 + pxor xmm3,xmm0 + pshufb xmm3,xmm6 + paddd xmm2,xmm3 + pxor xmm1,xmm2 + movdqa xmm4,xmm1 + psrld xmm1,20 + pslld xmm4,12 + por xmm1,xmm4 + paddd xmm0,xmm1 + pxor xmm3,xmm0 + pshufb xmm3,xmm7 + paddd xmm2,xmm3 + pxor xmm1,xmm2 + movdqa xmm4,xmm1 + psrld xmm1,25 + pslld xmm4,7 + por xmm1,xmm4 + pshufd xmm2,xmm2,78 + pshufd xmm1,xmm1,57 + pshufd xmm3,xmm3,147 + nop + paddd xmm0,xmm1 + pxor xmm3,xmm0 + pshufb xmm3,xmm6 + paddd xmm2,xmm3 + pxor xmm1,xmm2 + movdqa xmm4,xmm1 + psrld 
xmm1,20 + pslld xmm4,12 + por xmm1,xmm4 + paddd xmm0,xmm1 + pxor xmm3,xmm0 + pshufb xmm3,xmm7 + paddd xmm2,xmm3 + pxor xmm1,xmm2 + movdqa xmm4,xmm1 + psrld xmm1,25 + pslld xmm4,7 + por xmm1,xmm4 + pshufd xmm2,xmm2,78 + pshufd xmm1,xmm1,147 + pshufd xmm3,xmm3,57 + dec edx + jnz NEAR L$010loop1x + paddd xmm0,[esp] + paddd xmm1,[16+esp] + paddd xmm2,[32+esp] + paddd xmm3,[48+esp] + cmp ecx,64 + jb NEAR L$012tail + movdqu xmm4,[esi] + movdqu xmm5,[16+esi] + pxor xmm0,xmm4 + movdqu xmm4,[32+esi] + pxor xmm1,xmm5 + movdqu xmm5,[48+esi] + pxor xmm2,xmm4 + pxor xmm3,xmm5 + lea esi,[64+esi] + movdqu [edi],xmm0 + movdqu [16+edi],xmm1 + movdqu [32+edi],xmm2 + movdqu [48+edi],xmm3 + lea edi,[64+edi] + sub ecx,64 + jnz NEAR L$011outer1x + jmp NEAR L$009done +L$012tail: + movdqa [esp],xmm0 + movdqa [16+esp],xmm1 + movdqa [32+esp],xmm2 + movdqa [48+esp],xmm3 + xor eax,eax + xor edx,edx + xor ebp,ebp +L$013tail_loop: + mov al,BYTE [ebp*1+esp] + mov dl,BYTE [ebp*1+esi] + lea ebp,[1+ebp] + xor al,dl + mov BYTE [ebp*1+edi-1],al + dec ecx + jnz NEAR L$013tail_loop +L$009done: + mov esp,DWORD [512+esp] + pop edi + pop esi + pop ebx + pop ebp + ret +align 64 +L$ssse3_data: +db 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 +db 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 +dd 1634760805,857760878,2036477234,1797285236 +dd 0,1,2,3 +dd 4,4,4,4 +dd 1,0,0,0 +dd 4,0,0,0 +dd 0,-1,-1,-1 +align 64 +db 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54 +db 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32 +db 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 +db 114,103,62,0 +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/third_party/boringssl/gen/crypto/chacha-x86_64-apple.S b/third_party/boringssl/gen/crypto/chacha-x86_64-apple.S new file mode 100644 index 00000000..09481e37 --- /dev/null +++ b/third_party/boringssl/gen/crypto/chacha-x86_64-apple.S @@ -0,0 +1,1603 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// 
source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.text + +.section __DATA,__const +.p2align 6 +L$zero: +.long 0,0,0,0 +L$one: +.long 1,0,0,0 +L$inc: +.long 0,1,2,3 +L$four: +.long 4,4,4,4 +L$incy: +.long 0,2,4,6,1,3,5,7 +L$eight: +.long 8,8,8,8,8,8,8,8 +L$rot16: +.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd +L$rot24: +.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe +L$sigma: +.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0 +.p2align 6 +L$zeroz: +.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0 +L$fourz: +.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0 +L$incz: +.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 +L$sixteen: +.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 +.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.text +.globl _ChaCha20_ctr32_nohw +.private_extern _ChaCha20_ctr32_nohw + +.p2align 6 +_ChaCha20_ctr32_nohw: + +_CET_ENDBR + pushq %rbx + + pushq %rbp + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + subq $64+24,%rsp + +L$ctr32_body: + + + movdqu (%rcx),%xmm1 + movdqu 16(%rcx),%xmm2 + movdqu (%r8),%xmm3 + movdqa L$one(%rip),%xmm4 + + + movdqa %xmm1,16(%rsp) + movdqa %xmm2,32(%rsp) + movdqa %xmm3,48(%rsp) + movq %rdx,%rbp + jmp L$oop_outer + +.p2align 5 +L$oop_outer: + movl $0x61707865,%eax + movl $0x3320646e,%ebx + movl $0x79622d32,%ecx + movl $0x6b206574,%edx + movl 16(%rsp),%r8d + movl 20(%rsp),%r9d + movl 24(%rsp),%r10d + movl 28(%rsp),%r11d + movd %xmm3,%r12d + movl 52(%rsp),%r13d + movl 56(%rsp),%r14d + movl 60(%rsp),%r15d + + movq %rbp,64+0(%rsp) + movl $10,%ebp + movq %rsi,64+8(%rsp) + movq %xmm2,%rsi + movq %rdi,64+16(%rsp) + movq %rsi,%rdi + shrq $32,%rdi + jmp L$oop + +.p2align 5 +L$oop: + addl %r8d,%eax + xorl %eax,%r12d + roll $16,%r12d + addl 
%r9d,%ebx + xorl %ebx,%r13d + roll $16,%r13d + addl %r12d,%esi + xorl %esi,%r8d + roll $12,%r8d + addl %r13d,%edi + xorl %edi,%r9d + roll $12,%r9d + addl %r8d,%eax + xorl %eax,%r12d + roll $8,%r12d + addl %r9d,%ebx + xorl %ebx,%r13d + roll $8,%r13d + addl %r12d,%esi + xorl %esi,%r8d + roll $7,%r8d + addl %r13d,%edi + xorl %edi,%r9d + roll $7,%r9d + movl %esi,32(%rsp) + movl %edi,36(%rsp) + movl 40(%rsp),%esi + movl 44(%rsp),%edi + addl %r10d,%ecx + xorl %ecx,%r14d + roll $16,%r14d + addl %r11d,%edx + xorl %edx,%r15d + roll $16,%r15d + addl %r14d,%esi + xorl %esi,%r10d + roll $12,%r10d + addl %r15d,%edi + xorl %edi,%r11d + roll $12,%r11d + addl %r10d,%ecx + xorl %ecx,%r14d + roll $8,%r14d + addl %r11d,%edx + xorl %edx,%r15d + roll $8,%r15d + addl %r14d,%esi + xorl %esi,%r10d + roll $7,%r10d + addl %r15d,%edi + xorl %edi,%r11d + roll $7,%r11d + addl %r9d,%eax + xorl %eax,%r15d + roll $16,%r15d + addl %r10d,%ebx + xorl %ebx,%r12d + roll $16,%r12d + addl %r15d,%esi + xorl %esi,%r9d + roll $12,%r9d + addl %r12d,%edi + xorl %edi,%r10d + roll $12,%r10d + addl %r9d,%eax + xorl %eax,%r15d + roll $8,%r15d + addl %r10d,%ebx + xorl %ebx,%r12d + roll $8,%r12d + addl %r15d,%esi + xorl %esi,%r9d + roll $7,%r9d + addl %r12d,%edi + xorl %edi,%r10d + roll $7,%r10d + movl %esi,40(%rsp) + movl %edi,44(%rsp) + movl 32(%rsp),%esi + movl 36(%rsp),%edi + addl %r11d,%ecx + xorl %ecx,%r13d + roll $16,%r13d + addl %r8d,%edx + xorl %edx,%r14d + roll $16,%r14d + addl %r13d,%esi + xorl %esi,%r11d + roll $12,%r11d + addl %r14d,%edi + xorl %edi,%r8d + roll $12,%r8d + addl %r11d,%ecx + xorl %ecx,%r13d + roll $8,%r13d + addl %r8d,%edx + xorl %edx,%r14d + roll $8,%r14d + addl %r13d,%esi + xorl %esi,%r11d + roll $7,%r11d + addl %r14d,%edi + xorl %edi,%r8d + roll $7,%r8d + decl %ebp + jnz L$oop + movl %edi,36(%rsp) + movl %esi,32(%rsp) + movq 64(%rsp),%rbp + movdqa %xmm2,%xmm1 + movq 64+8(%rsp),%rsi + paddd %xmm4,%xmm3 + movq 64+16(%rsp),%rdi + + addl $0x61707865,%eax + addl $0x3320646e,%ebx + addl 
$0x79622d32,%ecx + addl $0x6b206574,%edx + addl 16(%rsp),%r8d + addl 20(%rsp),%r9d + addl 24(%rsp),%r10d + addl 28(%rsp),%r11d + addl 48(%rsp),%r12d + addl 52(%rsp),%r13d + addl 56(%rsp),%r14d + addl 60(%rsp),%r15d + paddd 32(%rsp),%xmm1 + + cmpq $64,%rbp + jb L$tail + + xorl 0(%rsi),%eax + xorl 4(%rsi),%ebx + xorl 8(%rsi),%ecx + xorl 12(%rsi),%edx + xorl 16(%rsi),%r8d + xorl 20(%rsi),%r9d + xorl 24(%rsi),%r10d + xorl 28(%rsi),%r11d + movdqu 32(%rsi),%xmm0 + xorl 48(%rsi),%r12d + xorl 52(%rsi),%r13d + xorl 56(%rsi),%r14d + xorl 60(%rsi),%r15d + leaq 64(%rsi),%rsi + pxor %xmm1,%xmm0 + + movdqa %xmm2,32(%rsp) + movd %xmm3,48(%rsp) + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + movdqu %xmm0,32(%rdi) + movl %r12d,48(%rdi) + movl %r13d,52(%rdi) + movl %r14d,56(%rdi) + movl %r15d,60(%rdi) + leaq 64(%rdi),%rdi + + subq $64,%rbp + jnz L$oop_outer + + jmp L$done + +.p2align 4 +L$tail: + movl %eax,0(%rsp) + movl %ebx,4(%rsp) + xorq %rbx,%rbx + movl %ecx,8(%rsp) + movl %edx,12(%rsp) + movl %r8d,16(%rsp) + movl %r9d,20(%rsp) + movl %r10d,24(%rsp) + movl %r11d,28(%rsp) + movdqa %xmm1,32(%rsp) + movl %r12d,48(%rsp) + movl %r13d,52(%rsp) + movl %r14d,56(%rsp) + movl %r15d,60(%rsp) + +L$oop_tail: + movzbl (%rsi,%rbx,1),%eax + movzbl (%rsp,%rbx,1),%edx + leaq 1(%rbx),%rbx + xorl %edx,%eax + movb %al,-1(%rdi,%rbx,1) + decq %rbp + jnz L$oop_tail + +L$done: + leaq 64+24+48(%rsp),%rsi + movq -48(%rsi),%r15 + + movq -40(%rsi),%r14 + + movq -32(%rsi),%r13 + + movq -24(%rsi),%r12 + + movq -16(%rsi),%rbp + + movq -8(%rsi),%rbx + + leaq (%rsi),%rsp + +L$no_data: + ret + + +.globl _ChaCha20_ctr32_ssse3 +.private_extern _ChaCha20_ctr32_ssse3 + +.p2align 5 +_ChaCha20_ctr32_ssse3: + +_CET_ENDBR + movq %rsp,%r9 + + subq $64+8,%rsp + movdqa L$sigma(%rip),%xmm0 + movdqu (%rcx),%xmm1 + movdqu 16(%rcx),%xmm2 + movdqu (%r8),%xmm3 + movdqa L$rot16(%rip),%xmm6 + movdqa 
L$rot24(%rip),%xmm7 + + movdqa %xmm0,0(%rsp) + movdqa %xmm1,16(%rsp) + movdqa %xmm2,32(%rsp) + movdqa %xmm3,48(%rsp) + movq $10,%r8 + jmp L$oop_ssse3 + +.p2align 5 +L$oop_outer_ssse3: + movdqa L$one(%rip),%xmm3 + movdqa 0(%rsp),%xmm0 + movdqa 16(%rsp),%xmm1 + movdqa 32(%rsp),%xmm2 + paddd 48(%rsp),%xmm3 + movq $10,%r8 + movdqa %xmm3,48(%rsp) + jmp L$oop_ssse3 + +.p2align 5 +L$oop_ssse3: + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm6,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $20,%xmm1 + pslld $12,%xmm4 + por %xmm4,%xmm1 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm7,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $25,%xmm1 + pslld $7,%xmm4 + por %xmm4,%xmm1 + pshufd $78,%xmm2,%xmm2 + pshufd $57,%xmm1,%xmm1 + pshufd $147,%xmm3,%xmm3 + nop + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm6,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $20,%xmm1 + pslld $12,%xmm4 + por %xmm4,%xmm1 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm7,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $25,%xmm1 + pslld $7,%xmm4 + por %xmm4,%xmm1 + pshufd $78,%xmm2,%xmm2 + pshufd $147,%xmm1,%xmm1 + pshufd $57,%xmm3,%xmm3 + decq %r8 + jnz L$oop_ssse3 + paddd 0(%rsp),%xmm0 + paddd 16(%rsp),%xmm1 + paddd 32(%rsp),%xmm2 + paddd 48(%rsp),%xmm3 + + cmpq $64,%rdx + jb L$tail_ssse3 + + movdqu 0(%rsi),%xmm4 + movdqu 16(%rsi),%xmm5 + pxor %xmm4,%xmm0 + movdqu 32(%rsi),%xmm4 + pxor %xmm5,%xmm1 + movdqu 48(%rsi),%xmm5 + leaq 64(%rsi),%rsi + pxor %xmm4,%xmm2 + pxor %xmm5,%xmm3 + + movdqu %xmm0,0(%rdi) + movdqu %xmm1,16(%rdi) + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + leaq 64(%rdi),%rdi + + subq $64,%rdx + jnz L$oop_outer_ssse3 + + jmp L$done_ssse3 + +.p2align 4 +L$tail_ssse3: + movdqa %xmm0,0(%rsp) + movdqa %xmm1,16(%rsp) + movdqa %xmm2,32(%rsp) + movdqa %xmm3,48(%rsp) + xorq %r8,%r8 + +L$oop_tail_ssse3: + movzbl (%rsi,%r8,1),%eax + movzbl (%rsp,%r8,1),%ecx + leaq 
1(%r8),%r8 + xorl %ecx,%eax + movb %al,-1(%rdi,%r8,1) + decq %rdx + jnz L$oop_tail_ssse3 + +L$done_ssse3: + leaq (%r9),%rsp + +L$ssse3_epilogue: + ret + + +.globl _ChaCha20_ctr32_ssse3_4x +.private_extern _ChaCha20_ctr32_ssse3_4x + +.p2align 5 +_ChaCha20_ctr32_ssse3_4x: + +_CET_ENDBR + movq %rsp,%r9 + + subq $0x140+8,%rsp + movdqa L$sigma(%rip),%xmm11 + movdqu (%rcx),%xmm15 + movdqu 16(%rcx),%xmm7 + movdqu (%r8),%xmm3 + leaq 256(%rsp),%rcx + leaq L$rot16(%rip),%r10 + leaq L$rot24(%rip),%r11 + + pshufd $0x00,%xmm11,%xmm8 + pshufd $0x55,%xmm11,%xmm9 + movdqa %xmm8,64(%rsp) + pshufd $0xaa,%xmm11,%xmm10 + movdqa %xmm9,80(%rsp) + pshufd $0xff,%xmm11,%xmm11 + movdqa %xmm10,96(%rsp) + movdqa %xmm11,112(%rsp) + + pshufd $0x00,%xmm15,%xmm12 + pshufd $0x55,%xmm15,%xmm13 + movdqa %xmm12,128-256(%rcx) + pshufd $0xaa,%xmm15,%xmm14 + movdqa %xmm13,144-256(%rcx) + pshufd $0xff,%xmm15,%xmm15 + movdqa %xmm14,160-256(%rcx) + movdqa %xmm15,176-256(%rcx) + + pshufd $0x00,%xmm7,%xmm4 + pshufd $0x55,%xmm7,%xmm5 + movdqa %xmm4,192-256(%rcx) + pshufd $0xaa,%xmm7,%xmm6 + movdqa %xmm5,208-256(%rcx) + pshufd $0xff,%xmm7,%xmm7 + movdqa %xmm6,224-256(%rcx) + movdqa %xmm7,240-256(%rcx) + + pshufd $0x00,%xmm3,%xmm0 + pshufd $0x55,%xmm3,%xmm1 + paddd L$inc(%rip),%xmm0 + pshufd $0xaa,%xmm3,%xmm2 + movdqa %xmm1,272-256(%rcx) + pshufd $0xff,%xmm3,%xmm3 + movdqa %xmm2,288-256(%rcx) + movdqa %xmm3,304-256(%rcx) + + jmp L$oop_enter4x + +.p2align 5 +L$oop_outer4x: + movdqa 64(%rsp),%xmm8 + movdqa 80(%rsp),%xmm9 + movdqa 96(%rsp),%xmm10 + movdqa 112(%rsp),%xmm11 + movdqa 128-256(%rcx),%xmm12 + movdqa 144-256(%rcx),%xmm13 + movdqa 160-256(%rcx),%xmm14 + movdqa 176-256(%rcx),%xmm15 + movdqa 192-256(%rcx),%xmm4 + movdqa 208-256(%rcx),%xmm5 + movdqa 224-256(%rcx),%xmm6 + movdqa 240-256(%rcx),%xmm7 + movdqa 256-256(%rcx),%xmm0 + movdqa 272-256(%rcx),%xmm1 + movdqa 288-256(%rcx),%xmm2 + movdqa 304-256(%rcx),%xmm3 + paddd L$four(%rip),%xmm0 + +L$oop_enter4x: + movdqa %xmm6,32(%rsp) + movdqa %xmm7,48(%rsp) + 
movdqa (%r10),%xmm7 + movl $10,%eax + movdqa %xmm0,256-256(%rcx) + jmp L$oop4x + +.p2align 5 +L$oop4x: + paddd %xmm12,%xmm8 + paddd %xmm13,%xmm9 + pxor %xmm8,%xmm0 + pxor %xmm9,%xmm1 + pshufb %xmm7,%xmm0 + pshufb %xmm7,%xmm1 + paddd %xmm0,%xmm4 + paddd %xmm1,%xmm5 + pxor %xmm4,%xmm12 + pxor %xmm5,%xmm13 + movdqa %xmm12,%xmm6 + pslld $12,%xmm12 + psrld $20,%xmm6 + movdqa %xmm13,%xmm7 + pslld $12,%xmm13 + por %xmm6,%xmm12 + psrld $20,%xmm7 + movdqa (%r11),%xmm6 + por %xmm7,%xmm13 + paddd %xmm12,%xmm8 + paddd %xmm13,%xmm9 + pxor %xmm8,%xmm0 + pxor %xmm9,%xmm1 + pshufb %xmm6,%xmm0 + pshufb %xmm6,%xmm1 + paddd %xmm0,%xmm4 + paddd %xmm1,%xmm5 + pxor %xmm4,%xmm12 + pxor %xmm5,%xmm13 + movdqa %xmm12,%xmm7 + pslld $7,%xmm12 + psrld $25,%xmm7 + movdqa %xmm13,%xmm6 + pslld $7,%xmm13 + por %xmm7,%xmm12 + psrld $25,%xmm6 + movdqa (%r10),%xmm7 + por %xmm6,%xmm13 + movdqa %xmm4,0(%rsp) + movdqa %xmm5,16(%rsp) + movdqa 32(%rsp),%xmm4 + movdqa 48(%rsp),%xmm5 + paddd %xmm14,%xmm10 + paddd %xmm15,%xmm11 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm3 + pshufb %xmm7,%xmm2 + pshufb %xmm7,%xmm3 + paddd %xmm2,%xmm4 + paddd %xmm3,%xmm5 + pxor %xmm4,%xmm14 + pxor %xmm5,%xmm15 + movdqa %xmm14,%xmm6 + pslld $12,%xmm14 + psrld $20,%xmm6 + movdqa %xmm15,%xmm7 + pslld $12,%xmm15 + por %xmm6,%xmm14 + psrld $20,%xmm7 + movdqa (%r11),%xmm6 + por %xmm7,%xmm15 + paddd %xmm14,%xmm10 + paddd %xmm15,%xmm11 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm3 + pshufb %xmm6,%xmm2 + pshufb %xmm6,%xmm3 + paddd %xmm2,%xmm4 + paddd %xmm3,%xmm5 + pxor %xmm4,%xmm14 + pxor %xmm5,%xmm15 + movdqa %xmm14,%xmm7 + pslld $7,%xmm14 + psrld $25,%xmm7 + movdqa %xmm15,%xmm6 + pslld $7,%xmm15 + por %xmm7,%xmm14 + psrld $25,%xmm6 + movdqa (%r10),%xmm7 + por %xmm6,%xmm15 + paddd %xmm13,%xmm8 + paddd %xmm14,%xmm9 + pxor %xmm8,%xmm3 + pxor %xmm9,%xmm0 + pshufb %xmm7,%xmm3 + pshufb %xmm7,%xmm0 + paddd %xmm3,%xmm4 + paddd %xmm0,%xmm5 + pxor %xmm4,%xmm13 + pxor %xmm5,%xmm14 + movdqa %xmm13,%xmm6 + pslld $12,%xmm13 + psrld $20,%xmm6 + movdqa 
%xmm14,%xmm7 + pslld $12,%xmm14 + por %xmm6,%xmm13 + psrld $20,%xmm7 + movdqa (%r11),%xmm6 + por %xmm7,%xmm14 + paddd %xmm13,%xmm8 + paddd %xmm14,%xmm9 + pxor %xmm8,%xmm3 + pxor %xmm9,%xmm0 + pshufb %xmm6,%xmm3 + pshufb %xmm6,%xmm0 + paddd %xmm3,%xmm4 + paddd %xmm0,%xmm5 + pxor %xmm4,%xmm13 + pxor %xmm5,%xmm14 + movdqa %xmm13,%xmm7 + pslld $7,%xmm13 + psrld $25,%xmm7 + movdqa %xmm14,%xmm6 + pslld $7,%xmm14 + por %xmm7,%xmm13 + psrld $25,%xmm6 + movdqa (%r10),%xmm7 + por %xmm6,%xmm14 + movdqa %xmm4,32(%rsp) + movdqa %xmm5,48(%rsp) + movdqa 0(%rsp),%xmm4 + movdqa 16(%rsp),%xmm5 + paddd %xmm15,%xmm10 + paddd %xmm12,%xmm11 + pxor %xmm10,%xmm1 + pxor %xmm11,%xmm2 + pshufb %xmm7,%xmm1 + pshufb %xmm7,%xmm2 + paddd %xmm1,%xmm4 + paddd %xmm2,%xmm5 + pxor %xmm4,%xmm15 + pxor %xmm5,%xmm12 + movdqa %xmm15,%xmm6 + pslld $12,%xmm15 + psrld $20,%xmm6 + movdqa %xmm12,%xmm7 + pslld $12,%xmm12 + por %xmm6,%xmm15 + psrld $20,%xmm7 + movdqa (%r11),%xmm6 + por %xmm7,%xmm12 + paddd %xmm15,%xmm10 + paddd %xmm12,%xmm11 + pxor %xmm10,%xmm1 + pxor %xmm11,%xmm2 + pshufb %xmm6,%xmm1 + pshufb %xmm6,%xmm2 + paddd %xmm1,%xmm4 + paddd %xmm2,%xmm5 + pxor %xmm4,%xmm15 + pxor %xmm5,%xmm12 + movdqa %xmm15,%xmm7 + pslld $7,%xmm15 + psrld $25,%xmm7 + movdqa %xmm12,%xmm6 + pslld $7,%xmm12 + por %xmm7,%xmm15 + psrld $25,%xmm6 + movdqa (%r10),%xmm7 + por %xmm6,%xmm12 + decl %eax + jnz L$oop4x + + paddd 64(%rsp),%xmm8 + paddd 80(%rsp),%xmm9 + paddd 96(%rsp),%xmm10 + paddd 112(%rsp),%xmm11 + + movdqa %xmm8,%xmm6 + punpckldq %xmm9,%xmm8 + movdqa %xmm10,%xmm7 + punpckldq %xmm11,%xmm10 + punpckhdq %xmm9,%xmm6 + punpckhdq %xmm11,%xmm7 + movdqa %xmm8,%xmm9 + punpcklqdq %xmm10,%xmm8 + movdqa %xmm6,%xmm11 + punpcklqdq %xmm7,%xmm6 + punpckhqdq %xmm10,%xmm9 + punpckhqdq %xmm7,%xmm11 + paddd 128-256(%rcx),%xmm12 + paddd 144-256(%rcx),%xmm13 + paddd 160-256(%rcx),%xmm14 + paddd 176-256(%rcx),%xmm15 + + movdqa %xmm8,0(%rsp) + movdqa %xmm9,16(%rsp) + movdqa 32(%rsp),%xmm8 + movdqa 48(%rsp),%xmm9 + + movdqa %xmm12,%xmm10 
+ punpckldq %xmm13,%xmm12 + movdqa %xmm14,%xmm7 + punpckldq %xmm15,%xmm14 + punpckhdq %xmm13,%xmm10 + punpckhdq %xmm15,%xmm7 + movdqa %xmm12,%xmm13 + punpcklqdq %xmm14,%xmm12 + movdqa %xmm10,%xmm15 + punpcklqdq %xmm7,%xmm10 + punpckhqdq %xmm14,%xmm13 + punpckhqdq %xmm7,%xmm15 + paddd 192-256(%rcx),%xmm4 + paddd 208-256(%rcx),%xmm5 + paddd 224-256(%rcx),%xmm8 + paddd 240-256(%rcx),%xmm9 + + movdqa %xmm6,32(%rsp) + movdqa %xmm11,48(%rsp) + + movdqa %xmm4,%xmm14 + punpckldq %xmm5,%xmm4 + movdqa %xmm8,%xmm7 + punpckldq %xmm9,%xmm8 + punpckhdq %xmm5,%xmm14 + punpckhdq %xmm9,%xmm7 + movdqa %xmm4,%xmm5 + punpcklqdq %xmm8,%xmm4 + movdqa %xmm14,%xmm9 + punpcklqdq %xmm7,%xmm14 + punpckhqdq %xmm8,%xmm5 + punpckhqdq %xmm7,%xmm9 + paddd 256-256(%rcx),%xmm0 + paddd 272-256(%rcx),%xmm1 + paddd 288-256(%rcx),%xmm2 + paddd 304-256(%rcx),%xmm3 + + movdqa %xmm0,%xmm8 + punpckldq %xmm1,%xmm0 + movdqa %xmm2,%xmm7 + punpckldq %xmm3,%xmm2 + punpckhdq %xmm1,%xmm8 + punpckhdq %xmm3,%xmm7 + movdqa %xmm0,%xmm1 + punpcklqdq %xmm2,%xmm0 + movdqa %xmm8,%xmm3 + punpcklqdq %xmm7,%xmm8 + punpckhqdq %xmm2,%xmm1 + punpckhqdq %xmm7,%xmm3 + cmpq $256,%rdx + jb L$tail4x + + movdqu 0(%rsi),%xmm6 + movdqu 16(%rsi),%xmm11 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm7 + pxor 0(%rsp),%xmm6 + pxor %xmm12,%xmm11 + pxor %xmm4,%xmm2 + pxor %xmm0,%xmm7 + + movdqu %xmm6,0(%rdi) + movdqu 64(%rsi),%xmm6 + movdqu %xmm11,16(%rdi) + movdqu 80(%rsi),%xmm11 + movdqu %xmm2,32(%rdi) + movdqu 96(%rsi),%xmm2 + movdqu %xmm7,48(%rdi) + movdqu 112(%rsi),%xmm7 + leaq 128(%rsi),%rsi + pxor 16(%rsp),%xmm6 + pxor %xmm13,%xmm11 + pxor %xmm5,%xmm2 + pxor %xmm1,%xmm7 + + movdqu %xmm6,64(%rdi) + movdqu 0(%rsi),%xmm6 + movdqu %xmm11,80(%rdi) + movdqu 16(%rsi),%xmm11 + movdqu %xmm2,96(%rdi) + movdqu 32(%rsi),%xmm2 + movdqu %xmm7,112(%rdi) + leaq 128(%rdi),%rdi + movdqu 48(%rsi),%xmm7 + pxor 32(%rsp),%xmm6 + pxor %xmm10,%xmm11 + pxor %xmm14,%xmm2 + pxor %xmm8,%xmm7 + + movdqu %xmm6,0(%rdi) + movdqu 64(%rsi),%xmm6 + movdqu 
%xmm11,16(%rdi) + movdqu 80(%rsi),%xmm11 + movdqu %xmm2,32(%rdi) + movdqu 96(%rsi),%xmm2 + movdqu %xmm7,48(%rdi) + movdqu 112(%rsi),%xmm7 + leaq 128(%rsi),%rsi + pxor 48(%rsp),%xmm6 + pxor %xmm15,%xmm11 + pxor %xmm9,%xmm2 + pxor %xmm3,%xmm7 + movdqu %xmm6,64(%rdi) + movdqu %xmm11,80(%rdi) + movdqu %xmm2,96(%rdi) + movdqu %xmm7,112(%rdi) + leaq 128(%rdi),%rdi + + subq $256,%rdx + jnz L$oop_outer4x + + jmp L$done4x + +L$tail4x: + cmpq $192,%rdx + jae L$192_or_more4x + cmpq $128,%rdx + jae L$128_or_more4x + cmpq $64,%rdx + jae L$64_or_more4x + + + xorq %r10,%r10 + + movdqa %xmm12,16(%rsp) + movdqa %xmm4,32(%rsp) + movdqa %xmm0,48(%rsp) + jmp L$oop_tail4x + +.p2align 5 +L$64_or_more4x: + movdqu 0(%rsi),%xmm6 + movdqu 16(%rsi),%xmm11 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm7 + pxor 0(%rsp),%xmm6 + pxor %xmm12,%xmm11 + pxor %xmm4,%xmm2 + pxor %xmm0,%xmm7 + movdqu %xmm6,0(%rdi) + movdqu %xmm11,16(%rdi) + movdqu %xmm2,32(%rdi) + movdqu %xmm7,48(%rdi) + je L$done4x + + movdqa 16(%rsp),%xmm6 + leaq 64(%rsi),%rsi + xorq %r10,%r10 + movdqa %xmm6,0(%rsp) + movdqa %xmm13,16(%rsp) + leaq 64(%rdi),%rdi + movdqa %xmm5,32(%rsp) + subq $64,%rdx + movdqa %xmm1,48(%rsp) + jmp L$oop_tail4x + +.p2align 5 +L$128_or_more4x: + movdqu 0(%rsi),%xmm6 + movdqu 16(%rsi),%xmm11 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm7 + pxor 0(%rsp),%xmm6 + pxor %xmm12,%xmm11 + pxor %xmm4,%xmm2 + pxor %xmm0,%xmm7 + + movdqu %xmm6,0(%rdi) + movdqu 64(%rsi),%xmm6 + movdqu %xmm11,16(%rdi) + movdqu 80(%rsi),%xmm11 + movdqu %xmm2,32(%rdi) + movdqu 96(%rsi),%xmm2 + movdqu %xmm7,48(%rdi) + movdqu 112(%rsi),%xmm7 + pxor 16(%rsp),%xmm6 + pxor %xmm13,%xmm11 + pxor %xmm5,%xmm2 + pxor %xmm1,%xmm7 + movdqu %xmm6,64(%rdi) + movdqu %xmm11,80(%rdi) + movdqu %xmm2,96(%rdi) + movdqu %xmm7,112(%rdi) + je L$done4x + + movdqa 32(%rsp),%xmm6 + leaq 128(%rsi),%rsi + xorq %r10,%r10 + movdqa %xmm6,0(%rsp) + movdqa %xmm10,16(%rsp) + leaq 128(%rdi),%rdi + movdqa %xmm14,32(%rsp) + subq $128,%rdx + movdqa %xmm8,48(%rsp) + jmp 
L$oop_tail4x + +.p2align 5 +L$192_or_more4x: + movdqu 0(%rsi),%xmm6 + movdqu 16(%rsi),%xmm11 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm7 + pxor 0(%rsp),%xmm6 + pxor %xmm12,%xmm11 + pxor %xmm4,%xmm2 + pxor %xmm0,%xmm7 + + movdqu %xmm6,0(%rdi) + movdqu 64(%rsi),%xmm6 + movdqu %xmm11,16(%rdi) + movdqu 80(%rsi),%xmm11 + movdqu %xmm2,32(%rdi) + movdqu 96(%rsi),%xmm2 + movdqu %xmm7,48(%rdi) + movdqu 112(%rsi),%xmm7 + leaq 128(%rsi),%rsi + pxor 16(%rsp),%xmm6 + pxor %xmm13,%xmm11 + pxor %xmm5,%xmm2 + pxor %xmm1,%xmm7 + + movdqu %xmm6,64(%rdi) + movdqu 0(%rsi),%xmm6 + movdqu %xmm11,80(%rdi) + movdqu 16(%rsi),%xmm11 + movdqu %xmm2,96(%rdi) + movdqu 32(%rsi),%xmm2 + movdqu %xmm7,112(%rdi) + leaq 128(%rdi),%rdi + movdqu 48(%rsi),%xmm7 + pxor 32(%rsp),%xmm6 + pxor %xmm10,%xmm11 + pxor %xmm14,%xmm2 + pxor %xmm8,%xmm7 + movdqu %xmm6,0(%rdi) + movdqu %xmm11,16(%rdi) + movdqu %xmm2,32(%rdi) + movdqu %xmm7,48(%rdi) + je L$done4x + + movdqa 48(%rsp),%xmm6 + leaq 64(%rsi),%rsi + xorq %r10,%r10 + movdqa %xmm6,0(%rsp) + movdqa %xmm15,16(%rsp) + leaq 64(%rdi),%rdi + movdqa %xmm9,32(%rsp) + subq $192,%rdx + movdqa %xmm3,48(%rsp) + +L$oop_tail4x: + movzbl (%rsi,%r10,1),%eax + movzbl (%rsp,%r10,1),%ecx + leaq 1(%r10),%r10 + xorl %ecx,%eax + movb %al,-1(%rdi,%r10,1) + decq %rdx + jnz L$oop_tail4x + +L$done4x: + leaq (%r9),%rsp + +L$4x_epilogue: + ret + + +.globl _ChaCha20_ctr32_avx2 +.private_extern _ChaCha20_ctr32_avx2 + +.p2align 5 +_ChaCha20_ctr32_avx2: + +_CET_ENDBR + movq %rsp,%r9 + + subq $0x280+8,%rsp + andq $-32,%rsp + vzeroupper + + + + + + + + + + + vbroadcasti128 L$sigma(%rip),%ymm11 + vbroadcasti128 (%rcx),%ymm3 + vbroadcasti128 16(%rcx),%ymm15 + vbroadcasti128 (%r8),%ymm7 + leaq 256(%rsp),%rcx + leaq 512(%rsp),%rax + leaq L$rot16(%rip),%r10 + leaq L$rot24(%rip),%r11 + + vpshufd $0x00,%ymm11,%ymm8 + vpshufd $0x55,%ymm11,%ymm9 + vmovdqa %ymm8,128-256(%rcx) + vpshufd $0xaa,%ymm11,%ymm10 + vmovdqa %ymm9,160-256(%rcx) + vpshufd $0xff,%ymm11,%ymm11 + vmovdqa %ymm10,192-256(%rcx) 
+ vmovdqa %ymm11,224-256(%rcx) + + vpshufd $0x00,%ymm3,%ymm0 + vpshufd $0x55,%ymm3,%ymm1 + vmovdqa %ymm0,256-256(%rcx) + vpshufd $0xaa,%ymm3,%ymm2 + vmovdqa %ymm1,288-256(%rcx) + vpshufd $0xff,%ymm3,%ymm3 + vmovdqa %ymm2,320-256(%rcx) + vmovdqa %ymm3,352-256(%rcx) + + vpshufd $0x00,%ymm15,%ymm12 + vpshufd $0x55,%ymm15,%ymm13 + vmovdqa %ymm12,384-512(%rax) + vpshufd $0xaa,%ymm15,%ymm14 + vmovdqa %ymm13,416-512(%rax) + vpshufd $0xff,%ymm15,%ymm15 + vmovdqa %ymm14,448-512(%rax) + vmovdqa %ymm15,480-512(%rax) + + vpshufd $0x00,%ymm7,%ymm4 + vpshufd $0x55,%ymm7,%ymm5 + vpaddd L$incy(%rip),%ymm4,%ymm4 + vpshufd $0xaa,%ymm7,%ymm6 + vmovdqa %ymm5,544-512(%rax) + vpshufd $0xff,%ymm7,%ymm7 + vmovdqa %ymm6,576-512(%rax) + vmovdqa %ymm7,608-512(%rax) + + jmp L$oop_enter8x + +.p2align 5 +L$oop_outer8x: + vmovdqa 128-256(%rcx),%ymm8 + vmovdqa 160-256(%rcx),%ymm9 + vmovdqa 192-256(%rcx),%ymm10 + vmovdqa 224-256(%rcx),%ymm11 + vmovdqa 256-256(%rcx),%ymm0 + vmovdqa 288-256(%rcx),%ymm1 + vmovdqa 320-256(%rcx),%ymm2 + vmovdqa 352-256(%rcx),%ymm3 + vmovdqa 384-512(%rax),%ymm12 + vmovdqa 416-512(%rax),%ymm13 + vmovdqa 448-512(%rax),%ymm14 + vmovdqa 480-512(%rax),%ymm15 + vmovdqa 512-512(%rax),%ymm4 + vmovdqa 544-512(%rax),%ymm5 + vmovdqa 576-512(%rax),%ymm6 + vmovdqa 608-512(%rax),%ymm7 + vpaddd L$eight(%rip),%ymm4,%ymm4 + +L$oop_enter8x: + vmovdqa %ymm14,64(%rsp) + vmovdqa %ymm15,96(%rsp) + vbroadcasti128 (%r10),%ymm15 + vmovdqa %ymm4,512-512(%rax) + movl $10,%eax + jmp L$oop8x + +.p2align 5 +L$oop8x: + vpaddd %ymm0,%ymm8,%ymm8 + vpxor %ymm4,%ymm8,%ymm4 + vpshufb %ymm15,%ymm4,%ymm4 + vpaddd %ymm1,%ymm9,%ymm9 + vpxor %ymm5,%ymm9,%ymm5 + vpshufb %ymm15,%ymm5,%ymm5 + vpaddd %ymm4,%ymm12,%ymm12 + vpxor %ymm0,%ymm12,%ymm0 + vpslld $12,%ymm0,%ymm14 + vpsrld $20,%ymm0,%ymm0 + vpor %ymm0,%ymm14,%ymm0 + vbroadcasti128 (%r11),%ymm14 + vpaddd %ymm5,%ymm13,%ymm13 + vpxor %ymm1,%ymm13,%ymm1 + vpslld $12,%ymm1,%ymm15 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm1,%ymm15,%ymm1 + vpaddd %ymm0,%ymm8,%ymm8 + 
vpxor %ymm4,%ymm8,%ymm4 + vpshufb %ymm14,%ymm4,%ymm4 + vpaddd %ymm1,%ymm9,%ymm9 + vpxor %ymm5,%ymm9,%ymm5 + vpshufb %ymm14,%ymm5,%ymm5 + vpaddd %ymm4,%ymm12,%ymm12 + vpxor %ymm0,%ymm12,%ymm0 + vpslld $7,%ymm0,%ymm15 + vpsrld $25,%ymm0,%ymm0 + vpor %ymm0,%ymm15,%ymm0 + vbroadcasti128 (%r10),%ymm15 + vpaddd %ymm5,%ymm13,%ymm13 + vpxor %ymm1,%ymm13,%ymm1 + vpslld $7,%ymm1,%ymm14 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm1,%ymm14,%ymm1 + vmovdqa %ymm12,0(%rsp) + vmovdqa %ymm13,32(%rsp) + vmovdqa 64(%rsp),%ymm12 + vmovdqa 96(%rsp),%ymm13 + vpaddd %ymm2,%ymm10,%ymm10 + vpxor %ymm6,%ymm10,%ymm6 + vpshufb %ymm15,%ymm6,%ymm6 + vpaddd %ymm3,%ymm11,%ymm11 + vpxor %ymm7,%ymm11,%ymm7 + vpshufb %ymm15,%ymm7,%ymm7 + vpaddd %ymm6,%ymm12,%ymm12 + vpxor %ymm2,%ymm12,%ymm2 + vpslld $12,%ymm2,%ymm14 + vpsrld $20,%ymm2,%ymm2 + vpor %ymm2,%ymm14,%ymm2 + vbroadcasti128 (%r11),%ymm14 + vpaddd %ymm7,%ymm13,%ymm13 + vpxor %ymm3,%ymm13,%ymm3 + vpslld $12,%ymm3,%ymm15 + vpsrld $20,%ymm3,%ymm3 + vpor %ymm3,%ymm15,%ymm3 + vpaddd %ymm2,%ymm10,%ymm10 + vpxor %ymm6,%ymm10,%ymm6 + vpshufb %ymm14,%ymm6,%ymm6 + vpaddd %ymm3,%ymm11,%ymm11 + vpxor %ymm7,%ymm11,%ymm7 + vpshufb %ymm14,%ymm7,%ymm7 + vpaddd %ymm6,%ymm12,%ymm12 + vpxor %ymm2,%ymm12,%ymm2 + vpslld $7,%ymm2,%ymm15 + vpsrld $25,%ymm2,%ymm2 + vpor %ymm2,%ymm15,%ymm2 + vbroadcasti128 (%r10),%ymm15 + vpaddd %ymm7,%ymm13,%ymm13 + vpxor %ymm3,%ymm13,%ymm3 + vpslld $7,%ymm3,%ymm14 + vpsrld $25,%ymm3,%ymm3 + vpor %ymm3,%ymm14,%ymm3 + vpaddd %ymm1,%ymm8,%ymm8 + vpxor %ymm7,%ymm8,%ymm7 + vpshufb %ymm15,%ymm7,%ymm7 + vpaddd %ymm2,%ymm9,%ymm9 + vpxor %ymm4,%ymm9,%ymm4 + vpshufb %ymm15,%ymm4,%ymm4 + vpaddd %ymm7,%ymm12,%ymm12 + vpxor %ymm1,%ymm12,%ymm1 + vpslld $12,%ymm1,%ymm14 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm1,%ymm14,%ymm1 + vbroadcasti128 (%r11),%ymm14 + vpaddd %ymm4,%ymm13,%ymm13 + vpxor %ymm2,%ymm13,%ymm2 + vpslld $12,%ymm2,%ymm15 + vpsrld $20,%ymm2,%ymm2 + vpor %ymm2,%ymm15,%ymm2 + vpaddd %ymm1,%ymm8,%ymm8 + vpxor %ymm7,%ymm8,%ymm7 + vpshufb 
%ymm14,%ymm7,%ymm7 + vpaddd %ymm2,%ymm9,%ymm9 + vpxor %ymm4,%ymm9,%ymm4 + vpshufb %ymm14,%ymm4,%ymm4 + vpaddd %ymm7,%ymm12,%ymm12 + vpxor %ymm1,%ymm12,%ymm1 + vpslld $7,%ymm1,%ymm15 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm1,%ymm15,%ymm1 + vbroadcasti128 (%r10),%ymm15 + vpaddd %ymm4,%ymm13,%ymm13 + vpxor %ymm2,%ymm13,%ymm2 + vpslld $7,%ymm2,%ymm14 + vpsrld $25,%ymm2,%ymm2 + vpor %ymm2,%ymm14,%ymm2 + vmovdqa %ymm12,64(%rsp) + vmovdqa %ymm13,96(%rsp) + vmovdqa 0(%rsp),%ymm12 + vmovdqa 32(%rsp),%ymm13 + vpaddd %ymm3,%ymm10,%ymm10 + vpxor %ymm5,%ymm10,%ymm5 + vpshufb %ymm15,%ymm5,%ymm5 + vpaddd %ymm0,%ymm11,%ymm11 + vpxor %ymm6,%ymm11,%ymm6 + vpshufb %ymm15,%ymm6,%ymm6 + vpaddd %ymm5,%ymm12,%ymm12 + vpxor %ymm3,%ymm12,%ymm3 + vpslld $12,%ymm3,%ymm14 + vpsrld $20,%ymm3,%ymm3 + vpor %ymm3,%ymm14,%ymm3 + vbroadcasti128 (%r11),%ymm14 + vpaddd %ymm6,%ymm13,%ymm13 + vpxor %ymm0,%ymm13,%ymm0 + vpslld $12,%ymm0,%ymm15 + vpsrld $20,%ymm0,%ymm0 + vpor %ymm0,%ymm15,%ymm0 + vpaddd %ymm3,%ymm10,%ymm10 + vpxor %ymm5,%ymm10,%ymm5 + vpshufb %ymm14,%ymm5,%ymm5 + vpaddd %ymm0,%ymm11,%ymm11 + vpxor %ymm6,%ymm11,%ymm6 + vpshufb %ymm14,%ymm6,%ymm6 + vpaddd %ymm5,%ymm12,%ymm12 + vpxor %ymm3,%ymm12,%ymm3 + vpslld $7,%ymm3,%ymm15 + vpsrld $25,%ymm3,%ymm3 + vpor %ymm3,%ymm15,%ymm3 + vbroadcasti128 (%r10),%ymm15 + vpaddd %ymm6,%ymm13,%ymm13 + vpxor %ymm0,%ymm13,%ymm0 + vpslld $7,%ymm0,%ymm14 + vpsrld $25,%ymm0,%ymm0 + vpor %ymm0,%ymm14,%ymm0 + decl %eax + jnz L$oop8x + + leaq 512(%rsp),%rax + vpaddd 128-256(%rcx),%ymm8,%ymm8 + vpaddd 160-256(%rcx),%ymm9,%ymm9 + vpaddd 192-256(%rcx),%ymm10,%ymm10 + vpaddd 224-256(%rcx),%ymm11,%ymm11 + + vpunpckldq %ymm9,%ymm8,%ymm14 + vpunpckldq %ymm11,%ymm10,%ymm15 + vpunpckhdq %ymm9,%ymm8,%ymm8 + vpunpckhdq %ymm11,%ymm10,%ymm10 + vpunpcklqdq %ymm15,%ymm14,%ymm9 + vpunpckhqdq %ymm15,%ymm14,%ymm14 + vpunpcklqdq %ymm10,%ymm8,%ymm11 + vpunpckhqdq %ymm10,%ymm8,%ymm8 + vpaddd 256-256(%rcx),%ymm0,%ymm0 + vpaddd 288-256(%rcx),%ymm1,%ymm1 + vpaddd 320-256(%rcx),%ymm2,%ymm2 
+ vpaddd 352-256(%rcx),%ymm3,%ymm3 + + vpunpckldq %ymm1,%ymm0,%ymm10 + vpunpckldq %ymm3,%ymm2,%ymm15 + vpunpckhdq %ymm1,%ymm0,%ymm0 + vpunpckhdq %ymm3,%ymm2,%ymm2 + vpunpcklqdq %ymm15,%ymm10,%ymm1 + vpunpckhqdq %ymm15,%ymm10,%ymm10 + vpunpcklqdq %ymm2,%ymm0,%ymm3 + vpunpckhqdq %ymm2,%ymm0,%ymm0 + vperm2i128 $0x20,%ymm1,%ymm9,%ymm15 + vperm2i128 $0x31,%ymm1,%ymm9,%ymm1 + vperm2i128 $0x20,%ymm10,%ymm14,%ymm9 + vperm2i128 $0x31,%ymm10,%ymm14,%ymm10 + vperm2i128 $0x20,%ymm3,%ymm11,%ymm14 + vperm2i128 $0x31,%ymm3,%ymm11,%ymm3 + vperm2i128 $0x20,%ymm0,%ymm8,%ymm11 + vperm2i128 $0x31,%ymm0,%ymm8,%ymm0 + vmovdqa %ymm15,0(%rsp) + vmovdqa %ymm9,32(%rsp) + vmovdqa 64(%rsp),%ymm15 + vmovdqa 96(%rsp),%ymm9 + + vpaddd 384-512(%rax),%ymm12,%ymm12 + vpaddd 416-512(%rax),%ymm13,%ymm13 + vpaddd 448-512(%rax),%ymm15,%ymm15 + vpaddd 480-512(%rax),%ymm9,%ymm9 + + vpunpckldq %ymm13,%ymm12,%ymm2 + vpunpckldq %ymm9,%ymm15,%ymm8 + vpunpckhdq %ymm13,%ymm12,%ymm12 + vpunpckhdq %ymm9,%ymm15,%ymm15 + vpunpcklqdq %ymm8,%ymm2,%ymm13 + vpunpckhqdq %ymm8,%ymm2,%ymm2 + vpunpcklqdq %ymm15,%ymm12,%ymm9 + vpunpckhqdq %ymm15,%ymm12,%ymm12 + vpaddd 512-512(%rax),%ymm4,%ymm4 + vpaddd 544-512(%rax),%ymm5,%ymm5 + vpaddd 576-512(%rax),%ymm6,%ymm6 + vpaddd 608-512(%rax),%ymm7,%ymm7 + + vpunpckldq %ymm5,%ymm4,%ymm15 + vpunpckldq %ymm7,%ymm6,%ymm8 + vpunpckhdq %ymm5,%ymm4,%ymm4 + vpunpckhdq %ymm7,%ymm6,%ymm6 + vpunpcklqdq %ymm8,%ymm15,%ymm5 + vpunpckhqdq %ymm8,%ymm15,%ymm15 + vpunpcklqdq %ymm6,%ymm4,%ymm7 + vpunpckhqdq %ymm6,%ymm4,%ymm4 + vperm2i128 $0x20,%ymm5,%ymm13,%ymm8 + vperm2i128 $0x31,%ymm5,%ymm13,%ymm5 + vperm2i128 $0x20,%ymm15,%ymm2,%ymm13 + vperm2i128 $0x31,%ymm15,%ymm2,%ymm15 + vperm2i128 $0x20,%ymm7,%ymm9,%ymm2 + vperm2i128 $0x31,%ymm7,%ymm9,%ymm7 + vperm2i128 $0x20,%ymm4,%ymm12,%ymm9 + vperm2i128 $0x31,%ymm4,%ymm12,%ymm4 + vmovdqa 0(%rsp),%ymm6 + vmovdqa 32(%rsp),%ymm12 + + cmpq $512,%rdx + jb L$tail8x + + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 
96(%rsi),%ymm5,%ymm5 + leaq 128(%rsi),%rsi + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + leaq 128(%rdi),%rdi + + vpxor 0(%rsi),%ymm12,%ymm12 + vpxor 32(%rsi),%ymm13,%ymm13 + vpxor 64(%rsi),%ymm10,%ymm10 + vpxor 96(%rsi),%ymm15,%ymm15 + leaq 128(%rsi),%rsi + vmovdqu %ymm12,0(%rdi) + vmovdqu %ymm13,32(%rdi) + vmovdqu %ymm10,64(%rdi) + vmovdqu %ymm15,96(%rdi) + leaq 128(%rdi),%rdi + + vpxor 0(%rsi),%ymm14,%ymm14 + vpxor 32(%rsi),%ymm2,%ymm2 + vpxor 64(%rsi),%ymm3,%ymm3 + vpxor 96(%rsi),%ymm7,%ymm7 + leaq 128(%rsi),%rsi + vmovdqu %ymm14,0(%rdi) + vmovdqu %ymm2,32(%rdi) + vmovdqu %ymm3,64(%rdi) + vmovdqu %ymm7,96(%rdi) + leaq 128(%rdi),%rdi + + vpxor 0(%rsi),%ymm11,%ymm11 + vpxor 32(%rsi),%ymm9,%ymm9 + vpxor 64(%rsi),%ymm0,%ymm0 + vpxor 96(%rsi),%ymm4,%ymm4 + leaq 128(%rsi),%rsi + vmovdqu %ymm11,0(%rdi) + vmovdqu %ymm9,32(%rdi) + vmovdqu %ymm0,64(%rdi) + vmovdqu %ymm4,96(%rdi) + leaq 128(%rdi),%rdi + + subq $512,%rdx + jnz L$oop_outer8x + + jmp L$done8x + +L$tail8x: + cmpq $448,%rdx + jae L$448_or_more8x + cmpq $384,%rdx + jae L$384_or_more8x + cmpq $320,%rdx + jae L$320_or_more8x + cmpq $256,%rdx + jae L$256_or_more8x + cmpq $192,%rdx + jae L$192_or_more8x + cmpq $128,%rdx + jae L$128_or_more8x + cmpq $64,%rdx + jae L$64_or_more8x + + xorq %r10,%r10 + vmovdqa %ymm6,0(%rsp) + vmovdqa %ymm8,32(%rsp) + jmp L$oop_tail8x + +.p2align 5 +L$64_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + je L$done8x + + leaq 64(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm1,0(%rsp) + leaq 64(%rdi),%rdi + subq $64,%rdx + vmovdqa %ymm5,32(%rsp) + jmp L$oop_tail8x + +.p2align 5 +L$128_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + je L$done8x + + leaq 128(%rsi),%rsi + xorq %r10,%r10 + vmovdqa 
%ymm12,0(%rsp) + leaq 128(%rdi),%rdi + subq $128,%rdx + vmovdqa %ymm13,32(%rsp) + jmp L$oop_tail8x + +.p2align 5 +L$192_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + je L$done8x + + leaq 192(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm10,0(%rsp) + leaq 192(%rdi),%rdi + subq $192,%rdx + vmovdqa %ymm15,32(%rsp) + jmp L$oop_tail8x + +.p2align 5 +L$256_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vpxor 192(%rsi),%ymm10,%ymm10 + vpxor 224(%rsi),%ymm15,%ymm15 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + vmovdqu %ymm10,192(%rdi) + vmovdqu %ymm15,224(%rdi) + je L$done8x + + leaq 256(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm14,0(%rsp) + leaq 256(%rdi),%rdi + subq $256,%rdx + vmovdqa %ymm2,32(%rsp) + jmp L$oop_tail8x + +.p2align 5 +L$320_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vpxor 192(%rsi),%ymm10,%ymm10 + vpxor 224(%rsi),%ymm15,%ymm15 + vpxor 256(%rsi),%ymm14,%ymm14 + vpxor 288(%rsi),%ymm2,%ymm2 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + vmovdqu %ymm10,192(%rdi) + vmovdqu %ymm15,224(%rdi) + vmovdqu %ymm14,256(%rdi) + vmovdqu %ymm2,288(%rdi) + je L$done8x + + leaq 320(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm3,0(%rsp) + leaq 320(%rdi),%rdi + subq $320,%rdx + vmovdqa 
%ymm7,32(%rsp) + jmp L$oop_tail8x + +.p2align 5 +L$384_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vpxor 192(%rsi),%ymm10,%ymm10 + vpxor 224(%rsi),%ymm15,%ymm15 + vpxor 256(%rsi),%ymm14,%ymm14 + vpxor 288(%rsi),%ymm2,%ymm2 + vpxor 320(%rsi),%ymm3,%ymm3 + vpxor 352(%rsi),%ymm7,%ymm7 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + vmovdqu %ymm10,192(%rdi) + vmovdqu %ymm15,224(%rdi) + vmovdqu %ymm14,256(%rdi) + vmovdqu %ymm2,288(%rdi) + vmovdqu %ymm3,320(%rdi) + vmovdqu %ymm7,352(%rdi) + je L$done8x + + leaq 384(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm11,0(%rsp) + leaq 384(%rdi),%rdi + subq $384,%rdx + vmovdqa %ymm9,32(%rsp) + jmp L$oop_tail8x + +.p2align 5 +L$448_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vpxor 192(%rsi),%ymm10,%ymm10 + vpxor 224(%rsi),%ymm15,%ymm15 + vpxor 256(%rsi),%ymm14,%ymm14 + vpxor 288(%rsi),%ymm2,%ymm2 + vpxor 320(%rsi),%ymm3,%ymm3 + vpxor 352(%rsi),%ymm7,%ymm7 + vpxor 384(%rsi),%ymm11,%ymm11 + vpxor 416(%rsi),%ymm9,%ymm9 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + vmovdqu %ymm10,192(%rdi) + vmovdqu %ymm15,224(%rdi) + vmovdqu %ymm14,256(%rdi) + vmovdqu %ymm2,288(%rdi) + vmovdqu %ymm3,320(%rdi) + vmovdqu %ymm7,352(%rdi) + vmovdqu %ymm11,384(%rdi) + vmovdqu %ymm9,416(%rdi) + je L$done8x + + leaq 448(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm0,0(%rsp) + leaq 448(%rdi),%rdi + subq $448,%rdx + vmovdqa %ymm4,32(%rsp) + +L$oop_tail8x: + movzbl (%rsi,%r10,1),%eax + movzbl (%rsp,%r10,1),%ecx + leaq 1(%r10),%r10 + xorl %ecx,%eax + movb 
%al,-1(%rdi,%r10,1) + decq %rdx + jnz L$oop_tail8x + +L$done8x: + vzeroall + leaq (%r9),%rsp + +L$8x_epilogue: + ret + + +#endif diff --git a/third_party/boringssl/gen/crypto/chacha-x86_64-linux.S b/third_party/boringssl/gen/crypto/chacha-x86_64-linux.S new file mode 100644 index 00000000..8ea190d1 --- /dev/null +++ b/third_party/boringssl/gen/crypto/chacha-x86_64-linux.S @@ -0,0 +1,1609 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.text + +.section .rodata +.align 64 +.Lzero: +.long 0,0,0,0 +.Lone: +.long 1,0,0,0 +.Linc: +.long 0,1,2,3 +.Lfour: +.long 4,4,4,4 +.Lincy: +.long 0,2,4,6,1,3,5,7 +.Leight: +.long 8,8,8,8,8,8,8,8 +.Lrot16: +.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd +.Lrot24: +.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe +.Lsigma: +.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0 +.align 64 +.Lzeroz: +.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0 +.Lfourz: +.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0 +.Lincz: +.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 +.Lsixteen: +.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 +.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 +.text +.globl ChaCha20_ctr32_nohw +.hidden ChaCha20_ctr32_nohw +.type ChaCha20_ctr32_nohw,@function +.align 64 +ChaCha20_ctr32_nohw: +.cfi_startproc +_CET_ENDBR + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset rbx,-16 + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset rbp,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset r15,-56 + subq 
$64+24,%rsp +.cfi_adjust_cfa_offset 88 +.Lctr32_body: + + + movdqu (%rcx),%xmm1 + movdqu 16(%rcx),%xmm2 + movdqu (%r8),%xmm3 + movdqa .Lone(%rip),%xmm4 + + + movdqa %xmm1,16(%rsp) + movdqa %xmm2,32(%rsp) + movdqa %xmm3,48(%rsp) + movq %rdx,%rbp + jmp .Loop_outer + +.align 32 +.Loop_outer: + movl $0x61707865,%eax + movl $0x3320646e,%ebx + movl $0x79622d32,%ecx + movl $0x6b206574,%edx + movl 16(%rsp),%r8d + movl 20(%rsp),%r9d + movl 24(%rsp),%r10d + movl 28(%rsp),%r11d + movd %xmm3,%r12d + movl 52(%rsp),%r13d + movl 56(%rsp),%r14d + movl 60(%rsp),%r15d + + movq %rbp,64+0(%rsp) + movl $10,%ebp + movq %rsi,64+8(%rsp) + movq %xmm2,%rsi + movq %rdi,64+16(%rsp) + movq %rsi,%rdi + shrq $32,%rdi + jmp .Loop + +.align 32 +.Loop: + addl %r8d,%eax + xorl %eax,%r12d + roll $16,%r12d + addl %r9d,%ebx + xorl %ebx,%r13d + roll $16,%r13d + addl %r12d,%esi + xorl %esi,%r8d + roll $12,%r8d + addl %r13d,%edi + xorl %edi,%r9d + roll $12,%r9d + addl %r8d,%eax + xorl %eax,%r12d + roll $8,%r12d + addl %r9d,%ebx + xorl %ebx,%r13d + roll $8,%r13d + addl %r12d,%esi + xorl %esi,%r8d + roll $7,%r8d + addl %r13d,%edi + xorl %edi,%r9d + roll $7,%r9d + movl %esi,32(%rsp) + movl %edi,36(%rsp) + movl 40(%rsp),%esi + movl 44(%rsp),%edi + addl %r10d,%ecx + xorl %ecx,%r14d + roll $16,%r14d + addl %r11d,%edx + xorl %edx,%r15d + roll $16,%r15d + addl %r14d,%esi + xorl %esi,%r10d + roll $12,%r10d + addl %r15d,%edi + xorl %edi,%r11d + roll $12,%r11d + addl %r10d,%ecx + xorl %ecx,%r14d + roll $8,%r14d + addl %r11d,%edx + xorl %edx,%r15d + roll $8,%r15d + addl %r14d,%esi + xorl %esi,%r10d + roll $7,%r10d + addl %r15d,%edi + xorl %edi,%r11d + roll $7,%r11d + addl %r9d,%eax + xorl %eax,%r15d + roll $16,%r15d + addl %r10d,%ebx + xorl %ebx,%r12d + roll $16,%r12d + addl %r15d,%esi + xorl %esi,%r9d + roll $12,%r9d + addl %r12d,%edi + xorl %edi,%r10d + roll $12,%r10d + addl %r9d,%eax + xorl %eax,%r15d + roll $8,%r15d + addl %r10d,%ebx + xorl %ebx,%r12d + roll $8,%r12d + addl %r15d,%esi + xorl %esi,%r9d + roll 
$7,%r9d + addl %r12d,%edi + xorl %edi,%r10d + roll $7,%r10d + movl %esi,40(%rsp) + movl %edi,44(%rsp) + movl 32(%rsp),%esi + movl 36(%rsp),%edi + addl %r11d,%ecx + xorl %ecx,%r13d + roll $16,%r13d + addl %r8d,%edx + xorl %edx,%r14d + roll $16,%r14d + addl %r13d,%esi + xorl %esi,%r11d + roll $12,%r11d + addl %r14d,%edi + xorl %edi,%r8d + roll $12,%r8d + addl %r11d,%ecx + xorl %ecx,%r13d + roll $8,%r13d + addl %r8d,%edx + xorl %edx,%r14d + roll $8,%r14d + addl %r13d,%esi + xorl %esi,%r11d + roll $7,%r11d + addl %r14d,%edi + xorl %edi,%r8d + roll $7,%r8d + decl %ebp + jnz .Loop + movl %edi,36(%rsp) + movl %esi,32(%rsp) + movq 64(%rsp),%rbp + movdqa %xmm2,%xmm1 + movq 64+8(%rsp),%rsi + paddd %xmm4,%xmm3 + movq 64+16(%rsp),%rdi + + addl $0x61707865,%eax + addl $0x3320646e,%ebx + addl $0x79622d32,%ecx + addl $0x6b206574,%edx + addl 16(%rsp),%r8d + addl 20(%rsp),%r9d + addl 24(%rsp),%r10d + addl 28(%rsp),%r11d + addl 48(%rsp),%r12d + addl 52(%rsp),%r13d + addl 56(%rsp),%r14d + addl 60(%rsp),%r15d + paddd 32(%rsp),%xmm1 + + cmpq $64,%rbp + jb .Ltail + + xorl 0(%rsi),%eax + xorl 4(%rsi),%ebx + xorl 8(%rsi),%ecx + xorl 12(%rsi),%edx + xorl 16(%rsi),%r8d + xorl 20(%rsi),%r9d + xorl 24(%rsi),%r10d + xorl 28(%rsi),%r11d + movdqu 32(%rsi),%xmm0 + xorl 48(%rsi),%r12d + xorl 52(%rsi),%r13d + xorl 56(%rsi),%r14d + xorl 60(%rsi),%r15d + leaq 64(%rsi),%rsi + pxor %xmm1,%xmm0 + + movdqa %xmm2,32(%rsp) + movd %xmm3,48(%rsp) + + movl %eax,0(%rdi) + movl %ebx,4(%rdi) + movl %ecx,8(%rdi) + movl %edx,12(%rdi) + movl %r8d,16(%rdi) + movl %r9d,20(%rdi) + movl %r10d,24(%rdi) + movl %r11d,28(%rdi) + movdqu %xmm0,32(%rdi) + movl %r12d,48(%rdi) + movl %r13d,52(%rdi) + movl %r14d,56(%rdi) + movl %r15d,60(%rdi) + leaq 64(%rdi),%rdi + + subq $64,%rbp + jnz .Loop_outer + + jmp .Ldone + +.align 16 +.Ltail: + movl %eax,0(%rsp) + movl %ebx,4(%rsp) + xorq %rbx,%rbx + movl %ecx,8(%rsp) + movl %edx,12(%rsp) + movl %r8d,16(%rsp) + movl %r9d,20(%rsp) + movl %r10d,24(%rsp) + movl %r11d,28(%rsp) + movdqa 
%xmm1,32(%rsp) + movl %r12d,48(%rsp) + movl %r13d,52(%rsp) + movl %r14d,56(%rsp) + movl %r15d,60(%rsp) + +.Loop_tail: + movzbl (%rsi,%rbx,1),%eax + movzbl (%rsp,%rbx,1),%edx + leaq 1(%rbx),%rbx + xorl %edx,%eax + movb %al,-1(%rdi,%rbx,1) + decq %rbp + jnz .Loop_tail + +.Ldone: + leaq 64+24+48(%rsp),%rsi + movq -48(%rsi),%r15 +.cfi_restore r15 + movq -40(%rsi),%r14 +.cfi_restore r14 + movq -32(%rsi),%r13 +.cfi_restore r13 + movq -24(%rsi),%r12 +.cfi_restore r12 + movq -16(%rsi),%rbp +.cfi_restore rbp + movq -8(%rsi),%rbx +.cfi_restore rbx + leaq (%rsi),%rsp +.cfi_adjust_cfa_offset -136 +.Lno_data: + ret +.cfi_endproc +.size ChaCha20_ctr32_nohw,.-ChaCha20_ctr32_nohw +.globl ChaCha20_ctr32_ssse3 +.hidden ChaCha20_ctr32_ssse3 +.type ChaCha20_ctr32_ssse3,@function +.align 32 +ChaCha20_ctr32_ssse3: +.cfi_startproc +_CET_ENDBR + movq %rsp,%r9 +.cfi_def_cfa_register r9 + subq $64+8,%rsp + movdqa .Lsigma(%rip),%xmm0 + movdqu (%rcx),%xmm1 + movdqu 16(%rcx),%xmm2 + movdqu (%r8),%xmm3 + movdqa .Lrot16(%rip),%xmm6 + movdqa .Lrot24(%rip),%xmm7 + + movdqa %xmm0,0(%rsp) + movdqa %xmm1,16(%rsp) + movdqa %xmm2,32(%rsp) + movdqa %xmm3,48(%rsp) + movq $10,%r8 + jmp .Loop_ssse3 + +.align 32 +.Loop_outer_ssse3: + movdqa .Lone(%rip),%xmm3 + movdqa 0(%rsp),%xmm0 + movdqa 16(%rsp),%xmm1 + movdqa 32(%rsp),%xmm2 + paddd 48(%rsp),%xmm3 + movq $10,%r8 + movdqa %xmm3,48(%rsp) + jmp .Loop_ssse3 + +.align 32 +.Loop_ssse3: + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm6,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $20,%xmm1 + pslld $12,%xmm4 + por %xmm4,%xmm1 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm7,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $25,%xmm1 + pslld $7,%xmm4 + por %xmm4,%xmm1 + pshufd $78,%xmm2,%xmm2 + pshufd $57,%xmm1,%xmm1 + pshufd $147,%xmm3,%xmm3 + nop + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm6,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $20,%xmm1 + pslld $12,%xmm4 
+ por %xmm4,%xmm1 + paddd %xmm1,%xmm0 + pxor %xmm0,%xmm3 + pshufb %xmm7,%xmm3 + paddd %xmm3,%xmm2 + pxor %xmm2,%xmm1 + movdqa %xmm1,%xmm4 + psrld $25,%xmm1 + pslld $7,%xmm4 + por %xmm4,%xmm1 + pshufd $78,%xmm2,%xmm2 + pshufd $147,%xmm1,%xmm1 + pshufd $57,%xmm3,%xmm3 + decq %r8 + jnz .Loop_ssse3 + paddd 0(%rsp),%xmm0 + paddd 16(%rsp),%xmm1 + paddd 32(%rsp),%xmm2 + paddd 48(%rsp),%xmm3 + + cmpq $64,%rdx + jb .Ltail_ssse3 + + movdqu 0(%rsi),%xmm4 + movdqu 16(%rsi),%xmm5 + pxor %xmm4,%xmm0 + movdqu 32(%rsi),%xmm4 + pxor %xmm5,%xmm1 + movdqu 48(%rsi),%xmm5 + leaq 64(%rsi),%rsi + pxor %xmm4,%xmm2 + pxor %xmm5,%xmm3 + + movdqu %xmm0,0(%rdi) + movdqu %xmm1,16(%rdi) + movdqu %xmm2,32(%rdi) + movdqu %xmm3,48(%rdi) + leaq 64(%rdi),%rdi + + subq $64,%rdx + jnz .Loop_outer_ssse3 + + jmp .Ldone_ssse3 + +.align 16 +.Ltail_ssse3: + movdqa %xmm0,0(%rsp) + movdqa %xmm1,16(%rsp) + movdqa %xmm2,32(%rsp) + movdqa %xmm3,48(%rsp) + xorq %r8,%r8 + +.Loop_tail_ssse3: + movzbl (%rsi,%r8,1),%eax + movzbl (%rsp,%r8,1),%ecx + leaq 1(%r8),%r8 + xorl %ecx,%eax + movb %al,-1(%rdi,%r8,1) + decq %rdx + jnz .Loop_tail_ssse3 + +.Ldone_ssse3: + leaq (%r9),%rsp +.cfi_def_cfa_register rsp +.Lssse3_epilogue: + ret +.cfi_endproc +.size ChaCha20_ctr32_ssse3,.-ChaCha20_ctr32_ssse3 +.globl ChaCha20_ctr32_ssse3_4x +.hidden ChaCha20_ctr32_ssse3_4x +.type ChaCha20_ctr32_ssse3_4x,@function +.align 32 +ChaCha20_ctr32_ssse3_4x: +.cfi_startproc +_CET_ENDBR + movq %rsp,%r9 +.cfi_def_cfa_register r9 + subq $0x140+8,%rsp + movdqa .Lsigma(%rip),%xmm11 + movdqu (%rcx),%xmm15 + movdqu 16(%rcx),%xmm7 + movdqu (%r8),%xmm3 + leaq 256(%rsp),%rcx + leaq .Lrot16(%rip),%r10 + leaq .Lrot24(%rip),%r11 + + pshufd $0x00,%xmm11,%xmm8 + pshufd $0x55,%xmm11,%xmm9 + movdqa %xmm8,64(%rsp) + pshufd $0xaa,%xmm11,%xmm10 + movdqa %xmm9,80(%rsp) + pshufd $0xff,%xmm11,%xmm11 + movdqa %xmm10,96(%rsp) + movdqa %xmm11,112(%rsp) + + pshufd $0x00,%xmm15,%xmm12 + pshufd $0x55,%xmm15,%xmm13 + movdqa %xmm12,128-256(%rcx) + pshufd $0xaa,%xmm15,%xmm14 
+ movdqa %xmm13,144-256(%rcx) + pshufd $0xff,%xmm15,%xmm15 + movdqa %xmm14,160-256(%rcx) + movdqa %xmm15,176-256(%rcx) + + pshufd $0x00,%xmm7,%xmm4 + pshufd $0x55,%xmm7,%xmm5 + movdqa %xmm4,192-256(%rcx) + pshufd $0xaa,%xmm7,%xmm6 + movdqa %xmm5,208-256(%rcx) + pshufd $0xff,%xmm7,%xmm7 + movdqa %xmm6,224-256(%rcx) + movdqa %xmm7,240-256(%rcx) + + pshufd $0x00,%xmm3,%xmm0 + pshufd $0x55,%xmm3,%xmm1 + paddd .Linc(%rip),%xmm0 + pshufd $0xaa,%xmm3,%xmm2 + movdqa %xmm1,272-256(%rcx) + pshufd $0xff,%xmm3,%xmm3 + movdqa %xmm2,288-256(%rcx) + movdqa %xmm3,304-256(%rcx) + + jmp .Loop_enter4x + +.align 32 +.Loop_outer4x: + movdqa 64(%rsp),%xmm8 + movdqa 80(%rsp),%xmm9 + movdqa 96(%rsp),%xmm10 + movdqa 112(%rsp),%xmm11 + movdqa 128-256(%rcx),%xmm12 + movdqa 144-256(%rcx),%xmm13 + movdqa 160-256(%rcx),%xmm14 + movdqa 176-256(%rcx),%xmm15 + movdqa 192-256(%rcx),%xmm4 + movdqa 208-256(%rcx),%xmm5 + movdqa 224-256(%rcx),%xmm6 + movdqa 240-256(%rcx),%xmm7 + movdqa 256-256(%rcx),%xmm0 + movdqa 272-256(%rcx),%xmm1 + movdqa 288-256(%rcx),%xmm2 + movdqa 304-256(%rcx),%xmm3 + paddd .Lfour(%rip),%xmm0 + +.Loop_enter4x: + movdqa %xmm6,32(%rsp) + movdqa %xmm7,48(%rsp) + movdqa (%r10),%xmm7 + movl $10,%eax + movdqa %xmm0,256-256(%rcx) + jmp .Loop4x + +.align 32 +.Loop4x: + paddd %xmm12,%xmm8 + paddd %xmm13,%xmm9 + pxor %xmm8,%xmm0 + pxor %xmm9,%xmm1 + pshufb %xmm7,%xmm0 + pshufb %xmm7,%xmm1 + paddd %xmm0,%xmm4 + paddd %xmm1,%xmm5 + pxor %xmm4,%xmm12 + pxor %xmm5,%xmm13 + movdqa %xmm12,%xmm6 + pslld $12,%xmm12 + psrld $20,%xmm6 + movdqa %xmm13,%xmm7 + pslld $12,%xmm13 + por %xmm6,%xmm12 + psrld $20,%xmm7 + movdqa (%r11),%xmm6 + por %xmm7,%xmm13 + paddd %xmm12,%xmm8 + paddd %xmm13,%xmm9 + pxor %xmm8,%xmm0 + pxor %xmm9,%xmm1 + pshufb %xmm6,%xmm0 + pshufb %xmm6,%xmm1 + paddd %xmm0,%xmm4 + paddd %xmm1,%xmm5 + pxor %xmm4,%xmm12 + pxor %xmm5,%xmm13 + movdqa %xmm12,%xmm7 + pslld $7,%xmm12 + psrld $25,%xmm7 + movdqa %xmm13,%xmm6 + pslld $7,%xmm13 + por %xmm7,%xmm12 + psrld $25,%xmm6 + movdqa 
(%r10),%xmm7 + por %xmm6,%xmm13 + movdqa %xmm4,0(%rsp) + movdqa %xmm5,16(%rsp) + movdqa 32(%rsp),%xmm4 + movdqa 48(%rsp),%xmm5 + paddd %xmm14,%xmm10 + paddd %xmm15,%xmm11 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm3 + pshufb %xmm7,%xmm2 + pshufb %xmm7,%xmm3 + paddd %xmm2,%xmm4 + paddd %xmm3,%xmm5 + pxor %xmm4,%xmm14 + pxor %xmm5,%xmm15 + movdqa %xmm14,%xmm6 + pslld $12,%xmm14 + psrld $20,%xmm6 + movdqa %xmm15,%xmm7 + pslld $12,%xmm15 + por %xmm6,%xmm14 + psrld $20,%xmm7 + movdqa (%r11),%xmm6 + por %xmm7,%xmm15 + paddd %xmm14,%xmm10 + paddd %xmm15,%xmm11 + pxor %xmm10,%xmm2 + pxor %xmm11,%xmm3 + pshufb %xmm6,%xmm2 + pshufb %xmm6,%xmm3 + paddd %xmm2,%xmm4 + paddd %xmm3,%xmm5 + pxor %xmm4,%xmm14 + pxor %xmm5,%xmm15 + movdqa %xmm14,%xmm7 + pslld $7,%xmm14 + psrld $25,%xmm7 + movdqa %xmm15,%xmm6 + pslld $7,%xmm15 + por %xmm7,%xmm14 + psrld $25,%xmm6 + movdqa (%r10),%xmm7 + por %xmm6,%xmm15 + paddd %xmm13,%xmm8 + paddd %xmm14,%xmm9 + pxor %xmm8,%xmm3 + pxor %xmm9,%xmm0 + pshufb %xmm7,%xmm3 + pshufb %xmm7,%xmm0 + paddd %xmm3,%xmm4 + paddd %xmm0,%xmm5 + pxor %xmm4,%xmm13 + pxor %xmm5,%xmm14 + movdqa %xmm13,%xmm6 + pslld $12,%xmm13 + psrld $20,%xmm6 + movdqa %xmm14,%xmm7 + pslld $12,%xmm14 + por %xmm6,%xmm13 + psrld $20,%xmm7 + movdqa (%r11),%xmm6 + por %xmm7,%xmm14 + paddd %xmm13,%xmm8 + paddd %xmm14,%xmm9 + pxor %xmm8,%xmm3 + pxor %xmm9,%xmm0 + pshufb %xmm6,%xmm3 + pshufb %xmm6,%xmm0 + paddd %xmm3,%xmm4 + paddd %xmm0,%xmm5 + pxor %xmm4,%xmm13 + pxor %xmm5,%xmm14 + movdqa %xmm13,%xmm7 + pslld $7,%xmm13 + psrld $25,%xmm7 + movdqa %xmm14,%xmm6 + pslld $7,%xmm14 + por %xmm7,%xmm13 + psrld $25,%xmm6 + movdqa (%r10),%xmm7 + por %xmm6,%xmm14 + movdqa %xmm4,32(%rsp) + movdqa %xmm5,48(%rsp) + movdqa 0(%rsp),%xmm4 + movdqa 16(%rsp),%xmm5 + paddd %xmm15,%xmm10 + paddd %xmm12,%xmm11 + pxor %xmm10,%xmm1 + pxor %xmm11,%xmm2 + pshufb %xmm7,%xmm1 + pshufb %xmm7,%xmm2 + paddd %xmm1,%xmm4 + paddd %xmm2,%xmm5 + pxor %xmm4,%xmm15 + pxor %xmm5,%xmm12 + movdqa %xmm15,%xmm6 + pslld $12,%xmm15 + 
psrld $20,%xmm6 + movdqa %xmm12,%xmm7 + pslld $12,%xmm12 + por %xmm6,%xmm15 + psrld $20,%xmm7 + movdqa (%r11),%xmm6 + por %xmm7,%xmm12 + paddd %xmm15,%xmm10 + paddd %xmm12,%xmm11 + pxor %xmm10,%xmm1 + pxor %xmm11,%xmm2 + pshufb %xmm6,%xmm1 + pshufb %xmm6,%xmm2 + paddd %xmm1,%xmm4 + paddd %xmm2,%xmm5 + pxor %xmm4,%xmm15 + pxor %xmm5,%xmm12 + movdqa %xmm15,%xmm7 + pslld $7,%xmm15 + psrld $25,%xmm7 + movdqa %xmm12,%xmm6 + pslld $7,%xmm12 + por %xmm7,%xmm15 + psrld $25,%xmm6 + movdqa (%r10),%xmm7 + por %xmm6,%xmm12 + decl %eax + jnz .Loop4x + + paddd 64(%rsp),%xmm8 + paddd 80(%rsp),%xmm9 + paddd 96(%rsp),%xmm10 + paddd 112(%rsp),%xmm11 + + movdqa %xmm8,%xmm6 + punpckldq %xmm9,%xmm8 + movdqa %xmm10,%xmm7 + punpckldq %xmm11,%xmm10 + punpckhdq %xmm9,%xmm6 + punpckhdq %xmm11,%xmm7 + movdqa %xmm8,%xmm9 + punpcklqdq %xmm10,%xmm8 + movdqa %xmm6,%xmm11 + punpcklqdq %xmm7,%xmm6 + punpckhqdq %xmm10,%xmm9 + punpckhqdq %xmm7,%xmm11 + paddd 128-256(%rcx),%xmm12 + paddd 144-256(%rcx),%xmm13 + paddd 160-256(%rcx),%xmm14 + paddd 176-256(%rcx),%xmm15 + + movdqa %xmm8,0(%rsp) + movdqa %xmm9,16(%rsp) + movdqa 32(%rsp),%xmm8 + movdqa 48(%rsp),%xmm9 + + movdqa %xmm12,%xmm10 + punpckldq %xmm13,%xmm12 + movdqa %xmm14,%xmm7 + punpckldq %xmm15,%xmm14 + punpckhdq %xmm13,%xmm10 + punpckhdq %xmm15,%xmm7 + movdqa %xmm12,%xmm13 + punpcklqdq %xmm14,%xmm12 + movdqa %xmm10,%xmm15 + punpcklqdq %xmm7,%xmm10 + punpckhqdq %xmm14,%xmm13 + punpckhqdq %xmm7,%xmm15 + paddd 192-256(%rcx),%xmm4 + paddd 208-256(%rcx),%xmm5 + paddd 224-256(%rcx),%xmm8 + paddd 240-256(%rcx),%xmm9 + + movdqa %xmm6,32(%rsp) + movdqa %xmm11,48(%rsp) + + movdqa %xmm4,%xmm14 + punpckldq %xmm5,%xmm4 + movdqa %xmm8,%xmm7 + punpckldq %xmm9,%xmm8 + punpckhdq %xmm5,%xmm14 + punpckhdq %xmm9,%xmm7 + movdqa %xmm4,%xmm5 + punpcklqdq %xmm8,%xmm4 + movdqa %xmm14,%xmm9 + punpcklqdq %xmm7,%xmm14 + punpckhqdq %xmm8,%xmm5 + punpckhqdq %xmm7,%xmm9 + paddd 256-256(%rcx),%xmm0 + paddd 272-256(%rcx),%xmm1 + paddd 288-256(%rcx),%xmm2 + paddd 
304-256(%rcx),%xmm3 + + movdqa %xmm0,%xmm8 + punpckldq %xmm1,%xmm0 + movdqa %xmm2,%xmm7 + punpckldq %xmm3,%xmm2 + punpckhdq %xmm1,%xmm8 + punpckhdq %xmm3,%xmm7 + movdqa %xmm0,%xmm1 + punpcklqdq %xmm2,%xmm0 + movdqa %xmm8,%xmm3 + punpcklqdq %xmm7,%xmm8 + punpckhqdq %xmm2,%xmm1 + punpckhqdq %xmm7,%xmm3 + cmpq $256,%rdx + jb .Ltail4x + + movdqu 0(%rsi),%xmm6 + movdqu 16(%rsi),%xmm11 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm7 + pxor 0(%rsp),%xmm6 + pxor %xmm12,%xmm11 + pxor %xmm4,%xmm2 + pxor %xmm0,%xmm7 + + movdqu %xmm6,0(%rdi) + movdqu 64(%rsi),%xmm6 + movdqu %xmm11,16(%rdi) + movdqu 80(%rsi),%xmm11 + movdqu %xmm2,32(%rdi) + movdqu 96(%rsi),%xmm2 + movdqu %xmm7,48(%rdi) + movdqu 112(%rsi),%xmm7 + leaq 128(%rsi),%rsi + pxor 16(%rsp),%xmm6 + pxor %xmm13,%xmm11 + pxor %xmm5,%xmm2 + pxor %xmm1,%xmm7 + + movdqu %xmm6,64(%rdi) + movdqu 0(%rsi),%xmm6 + movdqu %xmm11,80(%rdi) + movdqu 16(%rsi),%xmm11 + movdqu %xmm2,96(%rdi) + movdqu 32(%rsi),%xmm2 + movdqu %xmm7,112(%rdi) + leaq 128(%rdi),%rdi + movdqu 48(%rsi),%xmm7 + pxor 32(%rsp),%xmm6 + pxor %xmm10,%xmm11 + pxor %xmm14,%xmm2 + pxor %xmm8,%xmm7 + + movdqu %xmm6,0(%rdi) + movdqu 64(%rsi),%xmm6 + movdqu %xmm11,16(%rdi) + movdqu 80(%rsi),%xmm11 + movdqu %xmm2,32(%rdi) + movdqu 96(%rsi),%xmm2 + movdqu %xmm7,48(%rdi) + movdqu 112(%rsi),%xmm7 + leaq 128(%rsi),%rsi + pxor 48(%rsp),%xmm6 + pxor %xmm15,%xmm11 + pxor %xmm9,%xmm2 + pxor %xmm3,%xmm7 + movdqu %xmm6,64(%rdi) + movdqu %xmm11,80(%rdi) + movdqu %xmm2,96(%rdi) + movdqu %xmm7,112(%rdi) + leaq 128(%rdi),%rdi + + subq $256,%rdx + jnz .Loop_outer4x + + jmp .Ldone4x + +.Ltail4x: + cmpq $192,%rdx + jae .L192_or_more4x + cmpq $128,%rdx + jae .L128_or_more4x + cmpq $64,%rdx + jae .L64_or_more4x + + + xorq %r10,%r10 + + movdqa %xmm12,16(%rsp) + movdqa %xmm4,32(%rsp) + movdqa %xmm0,48(%rsp) + jmp .Loop_tail4x + +.align 32 +.L64_or_more4x: + movdqu 0(%rsi),%xmm6 + movdqu 16(%rsi),%xmm11 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm7 + pxor 0(%rsp),%xmm6 + pxor %xmm12,%xmm11 + 
pxor %xmm4,%xmm2 + pxor %xmm0,%xmm7 + movdqu %xmm6,0(%rdi) + movdqu %xmm11,16(%rdi) + movdqu %xmm2,32(%rdi) + movdqu %xmm7,48(%rdi) + je .Ldone4x + + movdqa 16(%rsp),%xmm6 + leaq 64(%rsi),%rsi + xorq %r10,%r10 + movdqa %xmm6,0(%rsp) + movdqa %xmm13,16(%rsp) + leaq 64(%rdi),%rdi + movdqa %xmm5,32(%rsp) + subq $64,%rdx + movdqa %xmm1,48(%rsp) + jmp .Loop_tail4x + +.align 32 +.L128_or_more4x: + movdqu 0(%rsi),%xmm6 + movdqu 16(%rsi),%xmm11 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm7 + pxor 0(%rsp),%xmm6 + pxor %xmm12,%xmm11 + pxor %xmm4,%xmm2 + pxor %xmm0,%xmm7 + + movdqu %xmm6,0(%rdi) + movdqu 64(%rsi),%xmm6 + movdqu %xmm11,16(%rdi) + movdqu 80(%rsi),%xmm11 + movdqu %xmm2,32(%rdi) + movdqu 96(%rsi),%xmm2 + movdqu %xmm7,48(%rdi) + movdqu 112(%rsi),%xmm7 + pxor 16(%rsp),%xmm6 + pxor %xmm13,%xmm11 + pxor %xmm5,%xmm2 + pxor %xmm1,%xmm7 + movdqu %xmm6,64(%rdi) + movdqu %xmm11,80(%rdi) + movdqu %xmm2,96(%rdi) + movdqu %xmm7,112(%rdi) + je .Ldone4x + + movdqa 32(%rsp),%xmm6 + leaq 128(%rsi),%rsi + xorq %r10,%r10 + movdqa %xmm6,0(%rsp) + movdqa %xmm10,16(%rsp) + leaq 128(%rdi),%rdi + movdqa %xmm14,32(%rsp) + subq $128,%rdx + movdqa %xmm8,48(%rsp) + jmp .Loop_tail4x + +.align 32 +.L192_or_more4x: + movdqu 0(%rsi),%xmm6 + movdqu 16(%rsi),%xmm11 + movdqu 32(%rsi),%xmm2 + movdqu 48(%rsi),%xmm7 + pxor 0(%rsp),%xmm6 + pxor %xmm12,%xmm11 + pxor %xmm4,%xmm2 + pxor %xmm0,%xmm7 + + movdqu %xmm6,0(%rdi) + movdqu 64(%rsi),%xmm6 + movdqu %xmm11,16(%rdi) + movdqu 80(%rsi),%xmm11 + movdqu %xmm2,32(%rdi) + movdqu 96(%rsi),%xmm2 + movdqu %xmm7,48(%rdi) + movdqu 112(%rsi),%xmm7 + leaq 128(%rsi),%rsi + pxor 16(%rsp),%xmm6 + pxor %xmm13,%xmm11 + pxor %xmm5,%xmm2 + pxor %xmm1,%xmm7 + + movdqu %xmm6,64(%rdi) + movdqu 0(%rsi),%xmm6 + movdqu %xmm11,80(%rdi) + movdqu 16(%rsi),%xmm11 + movdqu %xmm2,96(%rdi) + movdqu 32(%rsi),%xmm2 + movdqu %xmm7,112(%rdi) + leaq 128(%rdi),%rdi + movdqu 48(%rsi),%xmm7 + pxor 32(%rsp),%xmm6 + pxor %xmm10,%xmm11 + pxor %xmm14,%xmm2 + pxor %xmm8,%xmm7 + movdqu 
%xmm6,0(%rdi) + movdqu %xmm11,16(%rdi) + movdqu %xmm2,32(%rdi) + movdqu %xmm7,48(%rdi) + je .Ldone4x + + movdqa 48(%rsp),%xmm6 + leaq 64(%rsi),%rsi + xorq %r10,%r10 + movdqa %xmm6,0(%rsp) + movdqa %xmm15,16(%rsp) + leaq 64(%rdi),%rdi + movdqa %xmm9,32(%rsp) + subq $192,%rdx + movdqa %xmm3,48(%rsp) + +.Loop_tail4x: + movzbl (%rsi,%r10,1),%eax + movzbl (%rsp,%r10,1),%ecx + leaq 1(%r10),%r10 + xorl %ecx,%eax + movb %al,-1(%rdi,%r10,1) + decq %rdx + jnz .Loop_tail4x + +.Ldone4x: + leaq (%r9),%rsp +.cfi_def_cfa_register rsp +.L4x_epilogue: + ret +.cfi_endproc +.size ChaCha20_ctr32_ssse3_4x,.-ChaCha20_ctr32_ssse3_4x +.globl ChaCha20_ctr32_avx2 +.hidden ChaCha20_ctr32_avx2 +.type ChaCha20_ctr32_avx2,@function +.align 32 +ChaCha20_ctr32_avx2: +.cfi_startproc +_CET_ENDBR + movq %rsp,%r9 +.cfi_def_cfa_register r9 + subq $0x280+8,%rsp + andq $-32,%rsp + vzeroupper + + + + + + + + + + + vbroadcasti128 .Lsigma(%rip),%ymm11 + vbroadcasti128 (%rcx),%ymm3 + vbroadcasti128 16(%rcx),%ymm15 + vbroadcasti128 (%r8),%ymm7 + leaq 256(%rsp),%rcx + leaq 512(%rsp),%rax + leaq .Lrot16(%rip),%r10 + leaq .Lrot24(%rip),%r11 + + vpshufd $0x00,%ymm11,%ymm8 + vpshufd $0x55,%ymm11,%ymm9 + vmovdqa %ymm8,128-256(%rcx) + vpshufd $0xaa,%ymm11,%ymm10 + vmovdqa %ymm9,160-256(%rcx) + vpshufd $0xff,%ymm11,%ymm11 + vmovdqa %ymm10,192-256(%rcx) + vmovdqa %ymm11,224-256(%rcx) + + vpshufd $0x00,%ymm3,%ymm0 + vpshufd $0x55,%ymm3,%ymm1 + vmovdqa %ymm0,256-256(%rcx) + vpshufd $0xaa,%ymm3,%ymm2 + vmovdqa %ymm1,288-256(%rcx) + vpshufd $0xff,%ymm3,%ymm3 + vmovdqa %ymm2,320-256(%rcx) + vmovdqa %ymm3,352-256(%rcx) + + vpshufd $0x00,%ymm15,%ymm12 + vpshufd $0x55,%ymm15,%ymm13 + vmovdqa %ymm12,384-512(%rax) + vpshufd $0xaa,%ymm15,%ymm14 + vmovdqa %ymm13,416-512(%rax) + vpshufd $0xff,%ymm15,%ymm15 + vmovdqa %ymm14,448-512(%rax) + vmovdqa %ymm15,480-512(%rax) + + vpshufd $0x00,%ymm7,%ymm4 + vpshufd $0x55,%ymm7,%ymm5 + vpaddd .Lincy(%rip),%ymm4,%ymm4 + vpshufd $0xaa,%ymm7,%ymm6 + vmovdqa %ymm5,544-512(%rax) + vpshufd 
$0xff,%ymm7,%ymm7 + vmovdqa %ymm6,576-512(%rax) + vmovdqa %ymm7,608-512(%rax) + + jmp .Loop_enter8x + +.align 32 +.Loop_outer8x: + vmovdqa 128-256(%rcx),%ymm8 + vmovdqa 160-256(%rcx),%ymm9 + vmovdqa 192-256(%rcx),%ymm10 + vmovdqa 224-256(%rcx),%ymm11 + vmovdqa 256-256(%rcx),%ymm0 + vmovdqa 288-256(%rcx),%ymm1 + vmovdqa 320-256(%rcx),%ymm2 + vmovdqa 352-256(%rcx),%ymm3 + vmovdqa 384-512(%rax),%ymm12 + vmovdqa 416-512(%rax),%ymm13 + vmovdqa 448-512(%rax),%ymm14 + vmovdqa 480-512(%rax),%ymm15 + vmovdqa 512-512(%rax),%ymm4 + vmovdqa 544-512(%rax),%ymm5 + vmovdqa 576-512(%rax),%ymm6 + vmovdqa 608-512(%rax),%ymm7 + vpaddd .Leight(%rip),%ymm4,%ymm4 + +.Loop_enter8x: + vmovdqa %ymm14,64(%rsp) + vmovdqa %ymm15,96(%rsp) + vbroadcasti128 (%r10),%ymm15 + vmovdqa %ymm4,512-512(%rax) + movl $10,%eax + jmp .Loop8x + +.align 32 +.Loop8x: + vpaddd %ymm0,%ymm8,%ymm8 + vpxor %ymm4,%ymm8,%ymm4 + vpshufb %ymm15,%ymm4,%ymm4 + vpaddd %ymm1,%ymm9,%ymm9 + vpxor %ymm5,%ymm9,%ymm5 + vpshufb %ymm15,%ymm5,%ymm5 + vpaddd %ymm4,%ymm12,%ymm12 + vpxor %ymm0,%ymm12,%ymm0 + vpslld $12,%ymm0,%ymm14 + vpsrld $20,%ymm0,%ymm0 + vpor %ymm0,%ymm14,%ymm0 + vbroadcasti128 (%r11),%ymm14 + vpaddd %ymm5,%ymm13,%ymm13 + vpxor %ymm1,%ymm13,%ymm1 + vpslld $12,%ymm1,%ymm15 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm1,%ymm15,%ymm1 + vpaddd %ymm0,%ymm8,%ymm8 + vpxor %ymm4,%ymm8,%ymm4 + vpshufb %ymm14,%ymm4,%ymm4 + vpaddd %ymm1,%ymm9,%ymm9 + vpxor %ymm5,%ymm9,%ymm5 + vpshufb %ymm14,%ymm5,%ymm5 + vpaddd %ymm4,%ymm12,%ymm12 + vpxor %ymm0,%ymm12,%ymm0 + vpslld $7,%ymm0,%ymm15 + vpsrld $25,%ymm0,%ymm0 + vpor %ymm0,%ymm15,%ymm0 + vbroadcasti128 (%r10),%ymm15 + vpaddd %ymm5,%ymm13,%ymm13 + vpxor %ymm1,%ymm13,%ymm1 + vpslld $7,%ymm1,%ymm14 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm1,%ymm14,%ymm1 + vmovdqa %ymm12,0(%rsp) + vmovdqa %ymm13,32(%rsp) + vmovdqa 64(%rsp),%ymm12 + vmovdqa 96(%rsp),%ymm13 + vpaddd %ymm2,%ymm10,%ymm10 + vpxor %ymm6,%ymm10,%ymm6 + vpshufb %ymm15,%ymm6,%ymm6 + vpaddd %ymm3,%ymm11,%ymm11 + vpxor %ymm7,%ymm11,%ymm7 + 
vpshufb %ymm15,%ymm7,%ymm7 + vpaddd %ymm6,%ymm12,%ymm12 + vpxor %ymm2,%ymm12,%ymm2 + vpslld $12,%ymm2,%ymm14 + vpsrld $20,%ymm2,%ymm2 + vpor %ymm2,%ymm14,%ymm2 + vbroadcasti128 (%r11),%ymm14 + vpaddd %ymm7,%ymm13,%ymm13 + vpxor %ymm3,%ymm13,%ymm3 + vpslld $12,%ymm3,%ymm15 + vpsrld $20,%ymm3,%ymm3 + vpor %ymm3,%ymm15,%ymm3 + vpaddd %ymm2,%ymm10,%ymm10 + vpxor %ymm6,%ymm10,%ymm6 + vpshufb %ymm14,%ymm6,%ymm6 + vpaddd %ymm3,%ymm11,%ymm11 + vpxor %ymm7,%ymm11,%ymm7 + vpshufb %ymm14,%ymm7,%ymm7 + vpaddd %ymm6,%ymm12,%ymm12 + vpxor %ymm2,%ymm12,%ymm2 + vpslld $7,%ymm2,%ymm15 + vpsrld $25,%ymm2,%ymm2 + vpor %ymm2,%ymm15,%ymm2 + vbroadcasti128 (%r10),%ymm15 + vpaddd %ymm7,%ymm13,%ymm13 + vpxor %ymm3,%ymm13,%ymm3 + vpslld $7,%ymm3,%ymm14 + vpsrld $25,%ymm3,%ymm3 + vpor %ymm3,%ymm14,%ymm3 + vpaddd %ymm1,%ymm8,%ymm8 + vpxor %ymm7,%ymm8,%ymm7 + vpshufb %ymm15,%ymm7,%ymm7 + vpaddd %ymm2,%ymm9,%ymm9 + vpxor %ymm4,%ymm9,%ymm4 + vpshufb %ymm15,%ymm4,%ymm4 + vpaddd %ymm7,%ymm12,%ymm12 + vpxor %ymm1,%ymm12,%ymm1 + vpslld $12,%ymm1,%ymm14 + vpsrld $20,%ymm1,%ymm1 + vpor %ymm1,%ymm14,%ymm1 + vbroadcasti128 (%r11),%ymm14 + vpaddd %ymm4,%ymm13,%ymm13 + vpxor %ymm2,%ymm13,%ymm2 + vpslld $12,%ymm2,%ymm15 + vpsrld $20,%ymm2,%ymm2 + vpor %ymm2,%ymm15,%ymm2 + vpaddd %ymm1,%ymm8,%ymm8 + vpxor %ymm7,%ymm8,%ymm7 + vpshufb %ymm14,%ymm7,%ymm7 + vpaddd %ymm2,%ymm9,%ymm9 + vpxor %ymm4,%ymm9,%ymm4 + vpshufb %ymm14,%ymm4,%ymm4 + vpaddd %ymm7,%ymm12,%ymm12 + vpxor %ymm1,%ymm12,%ymm1 + vpslld $7,%ymm1,%ymm15 + vpsrld $25,%ymm1,%ymm1 + vpor %ymm1,%ymm15,%ymm1 + vbroadcasti128 (%r10),%ymm15 + vpaddd %ymm4,%ymm13,%ymm13 + vpxor %ymm2,%ymm13,%ymm2 + vpslld $7,%ymm2,%ymm14 + vpsrld $25,%ymm2,%ymm2 + vpor %ymm2,%ymm14,%ymm2 + vmovdqa %ymm12,64(%rsp) + vmovdqa %ymm13,96(%rsp) + vmovdqa 0(%rsp),%ymm12 + vmovdqa 32(%rsp),%ymm13 + vpaddd %ymm3,%ymm10,%ymm10 + vpxor %ymm5,%ymm10,%ymm5 + vpshufb %ymm15,%ymm5,%ymm5 + vpaddd %ymm0,%ymm11,%ymm11 + vpxor %ymm6,%ymm11,%ymm6 + vpshufb %ymm15,%ymm6,%ymm6 + vpaddd 
%ymm5,%ymm12,%ymm12 + vpxor %ymm3,%ymm12,%ymm3 + vpslld $12,%ymm3,%ymm14 + vpsrld $20,%ymm3,%ymm3 + vpor %ymm3,%ymm14,%ymm3 + vbroadcasti128 (%r11),%ymm14 + vpaddd %ymm6,%ymm13,%ymm13 + vpxor %ymm0,%ymm13,%ymm0 + vpslld $12,%ymm0,%ymm15 + vpsrld $20,%ymm0,%ymm0 + vpor %ymm0,%ymm15,%ymm0 + vpaddd %ymm3,%ymm10,%ymm10 + vpxor %ymm5,%ymm10,%ymm5 + vpshufb %ymm14,%ymm5,%ymm5 + vpaddd %ymm0,%ymm11,%ymm11 + vpxor %ymm6,%ymm11,%ymm6 + vpshufb %ymm14,%ymm6,%ymm6 + vpaddd %ymm5,%ymm12,%ymm12 + vpxor %ymm3,%ymm12,%ymm3 + vpslld $7,%ymm3,%ymm15 + vpsrld $25,%ymm3,%ymm3 + vpor %ymm3,%ymm15,%ymm3 + vbroadcasti128 (%r10),%ymm15 + vpaddd %ymm6,%ymm13,%ymm13 + vpxor %ymm0,%ymm13,%ymm0 + vpslld $7,%ymm0,%ymm14 + vpsrld $25,%ymm0,%ymm0 + vpor %ymm0,%ymm14,%ymm0 + decl %eax + jnz .Loop8x + + leaq 512(%rsp),%rax + vpaddd 128-256(%rcx),%ymm8,%ymm8 + vpaddd 160-256(%rcx),%ymm9,%ymm9 + vpaddd 192-256(%rcx),%ymm10,%ymm10 + vpaddd 224-256(%rcx),%ymm11,%ymm11 + + vpunpckldq %ymm9,%ymm8,%ymm14 + vpunpckldq %ymm11,%ymm10,%ymm15 + vpunpckhdq %ymm9,%ymm8,%ymm8 + vpunpckhdq %ymm11,%ymm10,%ymm10 + vpunpcklqdq %ymm15,%ymm14,%ymm9 + vpunpckhqdq %ymm15,%ymm14,%ymm14 + vpunpcklqdq %ymm10,%ymm8,%ymm11 + vpunpckhqdq %ymm10,%ymm8,%ymm8 + vpaddd 256-256(%rcx),%ymm0,%ymm0 + vpaddd 288-256(%rcx),%ymm1,%ymm1 + vpaddd 320-256(%rcx),%ymm2,%ymm2 + vpaddd 352-256(%rcx),%ymm3,%ymm3 + + vpunpckldq %ymm1,%ymm0,%ymm10 + vpunpckldq %ymm3,%ymm2,%ymm15 + vpunpckhdq %ymm1,%ymm0,%ymm0 + vpunpckhdq %ymm3,%ymm2,%ymm2 + vpunpcklqdq %ymm15,%ymm10,%ymm1 + vpunpckhqdq %ymm15,%ymm10,%ymm10 + vpunpcklqdq %ymm2,%ymm0,%ymm3 + vpunpckhqdq %ymm2,%ymm0,%ymm0 + vperm2i128 $0x20,%ymm1,%ymm9,%ymm15 + vperm2i128 $0x31,%ymm1,%ymm9,%ymm1 + vperm2i128 $0x20,%ymm10,%ymm14,%ymm9 + vperm2i128 $0x31,%ymm10,%ymm14,%ymm10 + vperm2i128 $0x20,%ymm3,%ymm11,%ymm14 + vperm2i128 $0x31,%ymm3,%ymm11,%ymm3 + vperm2i128 $0x20,%ymm0,%ymm8,%ymm11 + vperm2i128 $0x31,%ymm0,%ymm8,%ymm0 + vmovdqa %ymm15,0(%rsp) + vmovdqa %ymm9,32(%rsp) + vmovdqa 64(%rsp),%ymm15 
+ vmovdqa 96(%rsp),%ymm9 + + vpaddd 384-512(%rax),%ymm12,%ymm12 + vpaddd 416-512(%rax),%ymm13,%ymm13 + vpaddd 448-512(%rax),%ymm15,%ymm15 + vpaddd 480-512(%rax),%ymm9,%ymm9 + + vpunpckldq %ymm13,%ymm12,%ymm2 + vpunpckldq %ymm9,%ymm15,%ymm8 + vpunpckhdq %ymm13,%ymm12,%ymm12 + vpunpckhdq %ymm9,%ymm15,%ymm15 + vpunpcklqdq %ymm8,%ymm2,%ymm13 + vpunpckhqdq %ymm8,%ymm2,%ymm2 + vpunpcklqdq %ymm15,%ymm12,%ymm9 + vpunpckhqdq %ymm15,%ymm12,%ymm12 + vpaddd 512-512(%rax),%ymm4,%ymm4 + vpaddd 544-512(%rax),%ymm5,%ymm5 + vpaddd 576-512(%rax),%ymm6,%ymm6 + vpaddd 608-512(%rax),%ymm7,%ymm7 + + vpunpckldq %ymm5,%ymm4,%ymm15 + vpunpckldq %ymm7,%ymm6,%ymm8 + vpunpckhdq %ymm5,%ymm4,%ymm4 + vpunpckhdq %ymm7,%ymm6,%ymm6 + vpunpcklqdq %ymm8,%ymm15,%ymm5 + vpunpckhqdq %ymm8,%ymm15,%ymm15 + vpunpcklqdq %ymm6,%ymm4,%ymm7 + vpunpckhqdq %ymm6,%ymm4,%ymm4 + vperm2i128 $0x20,%ymm5,%ymm13,%ymm8 + vperm2i128 $0x31,%ymm5,%ymm13,%ymm5 + vperm2i128 $0x20,%ymm15,%ymm2,%ymm13 + vperm2i128 $0x31,%ymm15,%ymm2,%ymm15 + vperm2i128 $0x20,%ymm7,%ymm9,%ymm2 + vperm2i128 $0x31,%ymm7,%ymm9,%ymm7 + vperm2i128 $0x20,%ymm4,%ymm12,%ymm9 + vperm2i128 $0x31,%ymm4,%ymm12,%ymm4 + vmovdqa 0(%rsp),%ymm6 + vmovdqa 32(%rsp),%ymm12 + + cmpq $512,%rdx + jb .Ltail8x + + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + leaq 128(%rsi),%rsi + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + leaq 128(%rdi),%rdi + + vpxor 0(%rsi),%ymm12,%ymm12 + vpxor 32(%rsi),%ymm13,%ymm13 + vpxor 64(%rsi),%ymm10,%ymm10 + vpxor 96(%rsi),%ymm15,%ymm15 + leaq 128(%rsi),%rsi + vmovdqu %ymm12,0(%rdi) + vmovdqu %ymm13,32(%rdi) + vmovdqu %ymm10,64(%rdi) + vmovdqu %ymm15,96(%rdi) + leaq 128(%rdi),%rdi + + vpxor 0(%rsi),%ymm14,%ymm14 + vpxor 32(%rsi),%ymm2,%ymm2 + vpxor 64(%rsi),%ymm3,%ymm3 + vpxor 96(%rsi),%ymm7,%ymm7 + leaq 128(%rsi),%rsi + vmovdqu %ymm14,0(%rdi) + vmovdqu %ymm2,32(%rdi) + vmovdqu %ymm3,64(%rdi) + vmovdqu %ymm7,96(%rdi) 
+ leaq 128(%rdi),%rdi + + vpxor 0(%rsi),%ymm11,%ymm11 + vpxor 32(%rsi),%ymm9,%ymm9 + vpxor 64(%rsi),%ymm0,%ymm0 + vpxor 96(%rsi),%ymm4,%ymm4 + leaq 128(%rsi),%rsi + vmovdqu %ymm11,0(%rdi) + vmovdqu %ymm9,32(%rdi) + vmovdqu %ymm0,64(%rdi) + vmovdqu %ymm4,96(%rdi) + leaq 128(%rdi),%rdi + + subq $512,%rdx + jnz .Loop_outer8x + + jmp .Ldone8x + +.Ltail8x: + cmpq $448,%rdx + jae .L448_or_more8x + cmpq $384,%rdx + jae .L384_or_more8x + cmpq $320,%rdx + jae .L320_or_more8x + cmpq $256,%rdx + jae .L256_or_more8x + cmpq $192,%rdx + jae .L192_or_more8x + cmpq $128,%rdx + jae .L128_or_more8x + cmpq $64,%rdx + jae .L64_or_more8x + + xorq %r10,%r10 + vmovdqa %ymm6,0(%rsp) + vmovdqa %ymm8,32(%rsp) + jmp .Loop_tail8x + +.align 32 +.L64_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + je .Ldone8x + + leaq 64(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm1,0(%rsp) + leaq 64(%rdi),%rdi + subq $64,%rdx + vmovdqa %ymm5,32(%rsp) + jmp .Loop_tail8x + +.align 32 +.L128_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + je .Ldone8x + + leaq 128(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm12,0(%rsp) + leaq 128(%rdi),%rdi + subq $128,%rdx + vmovdqa %ymm13,32(%rsp) + jmp .Loop_tail8x + +.align 32 +.L192_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + je .Ldone8x + + leaq 192(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm10,0(%rsp) + leaq 192(%rdi),%rdi + subq $192,%rdx + vmovdqa %ymm15,32(%rsp) + jmp .Loop_tail8x + +.align 32 +.L256_or_more8x: + vpxor 
0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vpxor 192(%rsi),%ymm10,%ymm10 + vpxor 224(%rsi),%ymm15,%ymm15 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + vmovdqu %ymm10,192(%rdi) + vmovdqu %ymm15,224(%rdi) + je .Ldone8x + + leaq 256(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm14,0(%rsp) + leaq 256(%rdi),%rdi + subq $256,%rdx + vmovdqa %ymm2,32(%rsp) + jmp .Loop_tail8x + +.align 32 +.L320_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vpxor 192(%rsi),%ymm10,%ymm10 + vpxor 224(%rsi),%ymm15,%ymm15 + vpxor 256(%rsi),%ymm14,%ymm14 + vpxor 288(%rsi),%ymm2,%ymm2 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + vmovdqu %ymm10,192(%rdi) + vmovdqu %ymm15,224(%rdi) + vmovdqu %ymm14,256(%rdi) + vmovdqu %ymm2,288(%rdi) + je .Ldone8x + + leaq 320(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm3,0(%rsp) + leaq 320(%rdi),%rdi + subq $320,%rdx + vmovdqa %ymm7,32(%rsp) + jmp .Loop_tail8x + +.align 32 +.L384_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vpxor 192(%rsi),%ymm10,%ymm10 + vpxor 224(%rsi),%ymm15,%ymm15 + vpxor 256(%rsi),%ymm14,%ymm14 + vpxor 288(%rsi),%ymm2,%ymm2 + vpxor 320(%rsi),%ymm3,%ymm3 + vpxor 352(%rsi),%ymm7,%ymm7 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + vmovdqu %ymm10,192(%rdi) + vmovdqu %ymm15,224(%rdi) + vmovdqu %ymm14,256(%rdi) + 
vmovdqu %ymm2,288(%rdi) + vmovdqu %ymm3,320(%rdi) + vmovdqu %ymm7,352(%rdi) + je .Ldone8x + + leaq 384(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm11,0(%rsp) + leaq 384(%rdi),%rdi + subq $384,%rdx + vmovdqa %ymm9,32(%rsp) + jmp .Loop_tail8x + +.align 32 +.L448_or_more8x: + vpxor 0(%rsi),%ymm6,%ymm6 + vpxor 32(%rsi),%ymm8,%ymm8 + vpxor 64(%rsi),%ymm1,%ymm1 + vpxor 96(%rsi),%ymm5,%ymm5 + vpxor 128(%rsi),%ymm12,%ymm12 + vpxor 160(%rsi),%ymm13,%ymm13 + vpxor 192(%rsi),%ymm10,%ymm10 + vpxor 224(%rsi),%ymm15,%ymm15 + vpxor 256(%rsi),%ymm14,%ymm14 + vpxor 288(%rsi),%ymm2,%ymm2 + vpxor 320(%rsi),%ymm3,%ymm3 + vpxor 352(%rsi),%ymm7,%ymm7 + vpxor 384(%rsi),%ymm11,%ymm11 + vpxor 416(%rsi),%ymm9,%ymm9 + vmovdqu %ymm6,0(%rdi) + vmovdqu %ymm8,32(%rdi) + vmovdqu %ymm1,64(%rdi) + vmovdqu %ymm5,96(%rdi) + vmovdqu %ymm12,128(%rdi) + vmovdqu %ymm13,160(%rdi) + vmovdqu %ymm10,192(%rdi) + vmovdqu %ymm15,224(%rdi) + vmovdqu %ymm14,256(%rdi) + vmovdqu %ymm2,288(%rdi) + vmovdqu %ymm3,320(%rdi) + vmovdqu %ymm7,352(%rdi) + vmovdqu %ymm11,384(%rdi) + vmovdqu %ymm9,416(%rdi) + je .Ldone8x + + leaq 448(%rsi),%rsi + xorq %r10,%r10 + vmovdqa %ymm0,0(%rsp) + leaq 448(%rdi),%rdi + subq $448,%rdx + vmovdqa %ymm4,32(%rsp) + +.Loop_tail8x: + movzbl (%rsi,%r10,1),%eax + movzbl (%rsp,%r10,1),%ecx + leaq 1(%r10),%r10 + xorl %ecx,%eax + movb %al,-1(%rdi,%r10,1) + decq %rdx + jnz .Loop_tail8x + +.Ldone8x: + vzeroall + leaq (%r9),%rsp +.cfi_def_cfa_register rsp +.L8x_epilogue: + ret +.cfi_endproc +.size ChaCha20_ctr32_avx2,.-ChaCha20_ctr32_avx2 +#endif diff --git a/third_party/boringssl/gen/crypto/chacha-x86_64-win.asm b/third_party/boringssl/gen/crypto/chacha-x86_64-win.asm new file mode 100644 index 00000000..ca5b73c5 --- /dev/null +++ b/third_party/boringssl/gen/crypto/chacha-x86_64-win.asm @@ -0,0 +1,1915 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_internal_x86_64_win_asm.inc" +%endif +section .text code align=64 + + +section .rdata rdata align=8 +ALIGN 64 +$L$zero: + DD 0,0,0,0 +$L$one: + DD 1,0,0,0 +$L$inc: + DD 0,1,2,3 +$L$four: + DD 4,4,4,4 +$L$incy: + DD 0,2,4,6,1,3,5,7 +$L$eight: + DD 8,8,8,8,8,8,8,8 +$L$rot16: + DB 0x2,0x3,0x0,0x1,0x6,0x7,0x4,0x5,0xa,0xb,0x8,0x9,0xe,0xf,0xc,0xd +$L$rot24: + DB 0x3,0x0,0x1,0x2,0x7,0x4,0x5,0x6,0xb,0x8,0x9,0xa,0xf,0xc,0xd,0xe +$L$sigma: + DB 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107 + DB 0 +ALIGN 64 +$L$zeroz: + DD 0,0,0,0,1,0,0,0,2,0,0,0,3,0,0,0 +$L$fourz: + DD 4,0,0,0,4,0,0,0,4,0,0,0,4,0,0,0 +$L$incz: + DD 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 +$L$sixteen: + DD 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 + DB 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54 + DB 95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32 + DB 98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115 + DB 108,46,111,114,103,62,0 +section .text + +global ChaCha20_ctr32_nohw + +ALIGN 64 +ChaCha20_ctr32_nohw: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ChaCha20_ctr32_nohw: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + +_CET_ENDBR + push rbx + + push rbp + + push r12 + + push r13 + + push r14 + + push r15 + + sub rsp,64+24 + +$L$ctr32_body: + + + movdqu xmm1,XMMWORD[rcx] + movdqu xmm2,XMMWORD[16+rcx] + movdqu xmm3,XMMWORD[r8] + movdqa xmm4,XMMWORD[$L$one] + + + movdqa XMMWORD[16+rsp],xmm1 + movdqa XMMWORD[32+rsp],xmm2 + movdqa XMMWORD[48+rsp],xmm3 + mov rbp,rdx + jmp NEAR $L$oop_outer + +ALIGN 32 +$L$oop_outer: + mov eax,0x61707865 + mov ebx,0x3320646e + mov ecx,0x79622d32 + mov edx,0x6b206574 + mov r8d,DWORD[16+rsp] + mov r9d,DWORD[20+rsp] + mov r10d,DWORD[24+rsp] + mov r11d,DWORD[28+rsp] + movd r12d,xmm3 + mov 
r13d,DWORD[52+rsp] + mov r14d,DWORD[56+rsp] + mov r15d,DWORD[60+rsp] + + mov QWORD[((64+0))+rsp],rbp + mov ebp,10 + mov QWORD[((64+8))+rsp],rsi + movq rsi,xmm2 + mov QWORD[((64+16))+rsp],rdi + mov rdi,rsi + shr rdi,32 + jmp NEAR $L$oop + +ALIGN 32 +$L$oop: + add eax,r8d + xor r12d,eax + rol r12d,16 + add ebx,r9d + xor r13d,ebx + rol r13d,16 + add esi,r12d + xor r8d,esi + rol r8d,12 + add edi,r13d + xor r9d,edi + rol r9d,12 + add eax,r8d + xor r12d,eax + rol r12d,8 + add ebx,r9d + xor r13d,ebx + rol r13d,8 + add esi,r12d + xor r8d,esi + rol r8d,7 + add edi,r13d + xor r9d,edi + rol r9d,7 + mov DWORD[32+rsp],esi + mov DWORD[36+rsp],edi + mov esi,DWORD[40+rsp] + mov edi,DWORD[44+rsp] + add ecx,r10d + xor r14d,ecx + rol r14d,16 + add edx,r11d + xor r15d,edx + rol r15d,16 + add esi,r14d + xor r10d,esi + rol r10d,12 + add edi,r15d + xor r11d,edi + rol r11d,12 + add ecx,r10d + xor r14d,ecx + rol r14d,8 + add edx,r11d + xor r15d,edx + rol r15d,8 + add esi,r14d + xor r10d,esi + rol r10d,7 + add edi,r15d + xor r11d,edi + rol r11d,7 + add eax,r9d + xor r15d,eax + rol r15d,16 + add ebx,r10d + xor r12d,ebx + rol r12d,16 + add esi,r15d + xor r9d,esi + rol r9d,12 + add edi,r12d + xor r10d,edi + rol r10d,12 + add eax,r9d + xor r15d,eax + rol r15d,8 + add ebx,r10d + xor r12d,ebx + rol r12d,8 + add esi,r15d + xor r9d,esi + rol r9d,7 + add edi,r12d + xor r10d,edi + rol r10d,7 + mov DWORD[40+rsp],esi + mov DWORD[44+rsp],edi + mov esi,DWORD[32+rsp] + mov edi,DWORD[36+rsp] + add ecx,r11d + xor r13d,ecx + rol r13d,16 + add edx,r8d + xor r14d,edx + rol r14d,16 + add esi,r13d + xor r11d,esi + rol r11d,12 + add edi,r14d + xor r8d,edi + rol r8d,12 + add ecx,r11d + xor r13d,ecx + rol r13d,8 + add edx,r8d + xor r14d,edx + rol r14d,8 + add esi,r13d + xor r11d,esi + rol r11d,7 + add edi,r14d + xor r8d,edi + rol r8d,7 + dec ebp + jnz NEAR $L$oop + mov DWORD[36+rsp],edi + mov DWORD[32+rsp],esi + mov rbp,QWORD[64+rsp] + movdqa xmm1,xmm2 + mov rsi,QWORD[((64+8))+rsp] + paddd xmm3,xmm4 + mov 
rdi,QWORD[((64+16))+rsp] + + add eax,0x61707865 + add ebx,0x3320646e + add ecx,0x79622d32 + add edx,0x6b206574 + add r8d,DWORD[16+rsp] + add r9d,DWORD[20+rsp] + add r10d,DWORD[24+rsp] + add r11d,DWORD[28+rsp] + add r12d,DWORD[48+rsp] + add r13d,DWORD[52+rsp] + add r14d,DWORD[56+rsp] + add r15d,DWORD[60+rsp] + paddd xmm1,XMMWORD[32+rsp] + + cmp rbp,64 + jb NEAR $L$tail + + xor eax,DWORD[rsi] + xor ebx,DWORD[4+rsi] + xor ecx,DWORD[8+rsi] + xor edx,DWORD[12+rsi] + xor r8d,DWORD[16+rsi] + xor r9d,DWORD[20+rsi] + xor r10d,DWORD[24+rsi] + xor r11d,DWORD[28+rsi] + movdqu xmm0,XMMWORD[32+rsi] + xor r12d,DWORD[48+rsi] + xor r13d,DWORD[52+rsi] + xor r14d,DWORD[56+rsi] + xor r15d,DWORD[60+rsi] + lea rsi,[64+rsi] + pxor xmm0,xmm1 + + movdqa XMMWORD[32+rsp],xmm2 + movd DWORD[48+rsp],xmm3 + + mov DWORD[rdi],eax + mov DWORD[4+rdi],ebx + mov DWORD[8+rdi],ecx + mov DWORD[12+rdi],edx + mov DWORD[16+rdi],r8d + mov DWORD[20+rdi],r9d + mov DWORD[24+rdi],r10d + mov DWORD[28+rdi],r11d + movdqu XMMWORD[32+rdi],xmm0 + mov DWORD[48+rdi],r12d + mov DWORD[52+rdi],r13d + mov DWORD[56+rdi],r14d + mov DWORD[60+rdi],r15d + lea rdi,[64+rdi] + + sub rbp,64 + jnz NEAR $L$oop_outer + + jmp NEAR $L$done + +ALIGN 16 +$L$tail: + mov DWORD[rsp],eax + mov DWORD[4+rsp],ebx + xor rbx,rbx + mov DWORD[8+rsp],ecx + mov DWORD[12+rsp],edx + mov DWORD[16+rsp],r8d + mov DWORD[20+rsp],r9d + mov DWORD[24+rsp],r10d + mov DWORD[28+rsp],r11d + movdqa XMMWORD[32+rsp],xmm1 + mov DWORD[48+rsp],r12d + mov DWORD[52+rsp],r13d + mov DWORD[56+rsp],r14d + mov DWORD[60+rsp],r15d + +$L$oop_tail: + movzx eax,BYTE[rbx*1+rsi] + movzx edx,BYTE[rbx*1+rsp] + lea rbx,[1+rbx] + xor eax,edx + mov BYTE[((-1))+rbx*1+rdi],al + dec rbp + jnz NEAR $L$oop_tail + +$L$done: + lea rsi,[((64+24+48))+rsp] + mov r15,QWORD[((-48))+rsi] + + mov r14,QWORD[((-40))+rsi] + + mov r13,QWORD[((-32))+rsi] + + mov r12,QWORD[((-24))+rsi] + + mov rbp,QWORD[((-16))+rsi] + + mov rbx,QWORD[((-8))+rsi] + + lea rsp,[rsi] + +$L$no_data: + mov rdi,QWORD[8+rsp] ;WIN64 
epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ChaCha20_ctr32_nohw: +global ChaCha20_ctr32_ssse3 + +ALIGN 32 +ChaCha20_ctr32_ssse3: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ChaCha20_ctr32_ssse3: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + +_CET_ENDBR + mov r9,rsp + + sub rsp,64+40 + movaps XMMWORD[(-40)+r9],xmm6 + movaps XMMWORD[(-24)+r9],xmm7 +$L$ssse3_body: + movdqa xmm0,XMMWORD[$L$sigma] + movdqu xmm1,XMMWORD[rcx] + movdqu xmm2,XMMWORD[16+rcx] + movdqu xmm3,XMMWORD[r8] + movdqa xmm6,XMMWORD[$L$rot16] + movdqa xmm7,XMMWORD[$L$rot24] + + movdqa XMMWORD[rsp],xmm0 + movdqa XMMWORD[16+rsp],xmm1 + movdqa XMMWORD[32+rsp],xmm2 + movdqa XMMWORD[48+rsp],xmm3 + mov r8,10 + jmp NEAR $L$oop_ssse3 + +ALIGN 32 +$L$oop_outer_ssse3: + movdqa xmm3,XMMWORD[$L$one] + movdqa xmm0,XMMWORD[rsp] + movdqa xmm1,XMMWORD[16+rsp] + movdqa xmm2,XMMWORD[32+rsp] + paddd xmm3,XMMWORD[48+rsp] + mov r8,10 + movdqa XMMWORD[48+rsp],xmm3 + jmp NEAR $L$oop_ssse3 + +ALIGN 32 +$L$oop_ssse3: + paddd xmm0,xmm1 + pxor xmm3,xmm0 + pshufb xmm3,xmm6 + paddd xmm2,xmm3 + pxor xmm1,xmm2 + movdqa xmm4,xmm1 + psrld xmm1,20 + pslld xmm4,12 + por xmm1,xmm4 + paddd xmm0,xmm1 + pxor xmm3,xmm0 + pshufb xmm3,xmm7 + paddd xmm2,xmm3 + pxor xmm1,xmm2 + movdqa xmm4,xmm1 + psrld xmm1,25 + pslld xmm4,7 + por xmm1,xmm4 + pshufd xmm2,xmm2,78 + pshufd xmm1,xmm1,57 + pshufd xmm3,xmm3,147 + nop + paddd xmm0,xmm1 + pxor xmm3,xmm0 + pshufb xmm3,xmm6 + paddd xmm2,xmm3 + pxor xmm1,xmm2 + movdqa xmm4,xmm1 + psrld xmm1,20 + pslld xmm4,12 + por xmm1,xmm4 + paddd xmm0,xmm1 + pxor xmm3,xmm0 + pshufb xmm3,xmm7 + paddd xmm2,xmm3 + pxor xmm1,xmm2 + movdqa xmm4,xmm1 + psrld xmm1,25 + pslld xmm4,7 + por xmm1,xmm4 + pshufd xmm2,xmm2,78 + pshufd xmm1,xmm1,147 + pshufd xmm3,xmm3,57 + dec r8 + jnz NEAR $L$oop_ssse3 + paddd xmm0,XMMWORD[rsp] + paddd xmm1,XMMWORD[16+rsp] + paddd xmm2,XMMWORD[32+rsp] + paddd xmm3,XMMWORD[48+rsp] + + cmp rdx,64 + jb NEAR 
$L$tail_ssse3 + + movdqu xmm4,XMMWORD[rsi] + movdqu xmm5,XMMWORD[16+rsi] + pxor xmm0,xmm4 + movdqu xmm4,XMMWORD[32+rsi] + pxor xmm1,xmm5 + movdqu xmm5,XMMWORD[48+rsi] + lea rsi,[64+rsi] + pxor xmm2,xmm4 + pxor xmm3,xmm5 + + movdqu XMMWORD[rdi],xmm0 + movdqu XMMWORD[16+rdi],xmm1 + movdqu XMMWORD[32+rdi],xmm2 + movdqu XMMWORD[48+rdi],xmm3 + lea rdi,[64+rdi] + + sub rdx,64 + jnz NEAR $L$oop_outer_ssse3 + + jmp NEAR $L$done_ssse3 + +ALIGN 16 +$L$tail_ssse3: + movdqa XMMWORD[rsp],xmm0 + movdqa XMMWORD[16+rsp],xmm1 + movdqa XMMWORD[32+rsp],xmm2 + movdqa XMMWORD[48+rsp],xmm3 + xor r8,r8 + +$L$oop_tail_ssse3: + movzx eax,BYTE[r8*1+rsi] + movzx ecx,BYTE[r8*1+rsp] + lea r8,[1+r8] + xor eax,ecx + mov BYTE[((-1))+r8*1+rdi],al + dec rdx + jnz NEAR $L$oop_tail_ssse3 + +$L$done_ssse3: + movaps xmm6,XMMWORD[((-40))+r9] + movaps xmm7,XMMWORD[((-24))+r9] + lea rsp,[r9] + +$L$ssse3_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ChaCha20_ctr32_ssse3: +global ChaCha20_ctr32_ssse3_4x + +ALIGN 32 +ChaCha20_ctr32_ssse3_4x: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ChaCha20_ctr32_ssse3_4x: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + +_CET_ENDBR + mov r9,rsp + + sub rsp,0x140+168 + movaps XMMWORD[(-168)+r9],xmm6 + movaps XMMWORD[(-152)+r9],xmm7 + movaps XMMWORD[(-136)+r9],xmm8 + movaps XMMWORD[(-120)+r9],xmm9 + movaps XMMWORD[(-104)+r9],xmm10 + movaps XMMWORD[(-88)+r9],xmm11 + movaps XMMWORD[(-72)+r9],xmm12 + movaps XMMWORD[(-56)+r9],xmm13 + movaps XMMWORD[(-40)+r9],xmm14 + movaps XMMWORD[(-24)+r9],xmm15 +$L$4x_body: + movdqa xmm11,XMMWORD[$L$sigma] + movdqu xmm15,XMMWORD[rcx] + movdqu xmm7,XMMWORD[16+rcx] + movdqu xmm3,XMMWORD[r8] + lea rcx,[256+rsp] + lea r10,[$L$rot16] + lea r11,[$L$rot24] + + pshufd xmm8,xmm11,0x00 + pshufd xmm9,xmm11,0x55 + movdqa XMMWORD[64+rsp],xmm8 + pshufd xmm10,xmm11,0xaa + movdqa XMMWORD[80+rsp],xmm9 + pshufd xmm11,xmm11,0xff + 
movdqa XMMWORD[96+rsp],xmm10 + movdqa XMMWORD[112+rsp],xmm11 + + pshufd xmm12,xmm15,0x00 + pshufd xmm13,xmm15,0x55 + movdqa XMMWORD[(128-256)+rcx],xmm12 + pshufd xmm14,xmm15,0xaa + movdqa XMMWORD[(144-256)+rcx],xmm13 + pshufd xmm15,xmm15,0xff + movdqa XMMWORD[(160-256)+rcx],xmm14 + movdqa XMMWORD[(176-256)+rcx],xmm15 + + pshufd xmm4,xmm7,0x00 + pshufd xmm5,xmm7,0x55 + movdqa XMMWORD[(192-256)+rcx],xmm4 + pshufd xmm6,xmm7,0xaa + movdqa XMMWORD[(208-256)+rcx],xmm5 + pshufd xmm7,xmm7,0xff + movdqa XMMWORD[(224-256)+rcx],xmm6 + movdqa XMMWORD[(240-256)+rcx],xmm7 + + pshufd xmm0,xmm3,0x00 + pshufd xmm1,xmm3,0x55 + paddd xmm0,XMMWORD[$L$inc] + pshufd xmm2,xmm3,0xaa + movdqa XMMWORD[(272-256)+rcx],xmm1 + pshufd xmm3,xmm3,0xff + movdqa XMMWORD[(288-256)+rcx],xmm2 + movdqa XMMWORD[(304-256)+rcx],xmm3 + + jmp NEAR $L$oop_enter4x + +ALIGN 32 +$L$oop_outer4x: + movdqa xmm8,XMMWORD[64+rsp] + movdqa xmm9,XMMWORD[80+rsp] + movdqa xmm10,XMMWORD[96+rsp] + movdqa xmm11,XMMWORD[112+rsp] + movdqa xmm12,XMMWORD[((128-256))+rcx] + movdqa xmm13,XMMWORD[((144-256))+rcx] + movdqa xmm14,XMMWORD[((160-256))+rcx] + movdqa xmm15,XMMWORD[((176-256))+rcx] + movdqa xmm4,XMMWORD[((192-256))+rcx] + movdqa xmm5,XMMWORD[((208-256))+rcx] + movdqa xmm6,XMMWORD[((224-256))+rcx] + movdqa xmm7,XMMWORD[((240-256))+rcx] + movdqa xmm0,XMMWORD[((256-256))+rcx] + movdqa xmm1,XMMWORD[((272-256))+rcx] + movdqa xmm2,XMMWORD[((288-256))+rcx] + movdqa xmm3,XMMWORD[((304-256))+rcx] + paddd xmm0,XMMWORD[$L$four] + +$L$oop_enter4x: + movdqa XMMWORD[32+rsp],xmm6 + movdqa XMMWORD[48+rsp],xmm7 + movdqa xmm7,XMMWORD[r10] + mov eax,10 + movdqa XMMWORD[(256-256)+rcx],xmm0 + jmp NEAR $L$oop4x + +ALIGN 32 +$L$oop4x: + paddd xmm8,xmm12 + paddd xmm9,xmm13 + pxor xmm0,xmm8 + pxor xmm1,xmm9 + pshufb xmm0,xmm7 + pshufb xmm1,xmm7 + paddd xmm4,xmm0 + paddd xmm5,xmm1 + pxor xmm12,xmm4 + pxor xmm13,xmm5 + movdqa xmm6,xmm12 + pslld xmm12,12 + psrld xmm6,20 + movdqa xmm7,xmm13 + pslld xmm13,12 + por xmm12,xmm6 + psrld xmm7,20 + movdqa 
xmm6,XMMWORD[r11] + por xmm13,xmm7 + paddd xmm8,xmm12 + paddd xmm9,xmm13 + pxor xmm0,xmm8 + pxor xmm1,xmm9 + pshufb xmm0,xmm6 + pshufb xmm1,xmm6 + paddd xmm4,xmm0 + paddd xmm5,xmm1 + pxor xmm12,xmm4 + pxor xmm13,xmm5 + movdqa xmm7,xmm12 + pslld xmm12,7 + psrld xmm7,25 + movdqa xmm6,xmm13 + pslld xmm13,7 + por xmm12,xmm7 + psrld xmm6,25 + movdqa xmm7,XMMWORD[r10] + por xmm13,xmm6 + movdqa XMMWORD[rsp],xmm4 + movdqa XMMWORD[16+rsp],xmm5 + movdqa xmm4,XMMWORD[32+rsp] + movdqa xmm5,XMMWORD[48+rsp] + paddd xmm10,xmm14 + paddd xmm11,xmm15 + pxor xmm2,xmm10 + pxor xmm3,xmm11 + pshufb xmm2,xmm7 + pshufb xmm3,xmm7 + paddd xmm4,xmm2 + paddd xmm5,xmm3 + pxor xmm14,xmm4 + pxor xmm15,xmm5 + movdqa xmm6,xmm14 + pslld xmm14,12 + psrld xmm6,20 + movdqa xmm7,xmm15 + pslld xmm15,12 + por xmm14,xmm6 + psrld xmm7,20 + movdqa xmm6,XMMWORD[r11] + por xmm15,xmm7 + paddd xmm10,xmm14 + paddd xmm11,xmm15 + pxor xmm2,xmm10 + pxor xmm3,xmm11 + pshufb xmm2,xmm6 + pshufb xmm3,xmm6 + paddd xmm4,xmm2 + paddd xmm5,xmm3 + pxor xmm14,xmm4 + pxor xmm15,xmm5 + movdqa xmm7,xmm14 + pslld xmm14,7 + psrld xmm7,25 + movdqa xmm6,xmm15 + pslld xmm15,7 + por xmm14,xmm7 + psrld xmm6,25 + movdqa xmm7,XMMWORD[r10] + por xmm15,xmm6 + paddd xmm8,xmm13 + paddd xmm9,xmm14 + pxor xmm3,xmm8 + pxor xmm0,xmm9 + pshufb xmm3,xmm7 + pshufb xmm0,xmm7 + paddd xmm4,xmm3 + paddd xmm5,xmm0 + pxor xmm13,xmm4 + pxor xmm14,xmm5 + movdqa xmm6,xmm13 + pslld xmm13,12 + psrld xmm6,20 + movdqa xmm7,xmm14 + pslld xmm14,12 + por xmm13,xmm6 + psrld xmm7,20 + movdqa xmm6,XMMWORD[r11] + por xmm14,xmm7 + paddd xmm8,xmm13 + paddd xmm9,xmm14 + pxor xmm3,xmm8 + pxor xmm0,xmm9 + pshufb xmm3,xmm6 + pshufb xmm0,xmm6 + paddd xmm4,xmm3 + paddd xmm5,xmm0 + pxor xmm13,xmm4 + pxor xmm14,xmm5 + movdqa xmm7,xmm13 + pslld xmm13,7 + psrld xmm7,25 + movdqa xmm6,xmm14 + pslld xmm14,7 + por xmm13,xmm7 + psrld xmm6,25 + movdqa xmm7,XMMWORD[r10] + por xmm14,xmm6 + movdqa XMMWORD[32+rsp],xmm4 + movdqa XMMWORD[48+rsp],xmm5 + movdqa xmm4,XMMWORD[rsp] + movdqa 
xmm5,XMMWORD[16+rsp] + paddd xmm10,xmm15 + paddd xmm11,xmm12 + pxor xmm1,xmm10 + pxor xmm2,xmm11 + pshufb xmm1,xmm7 + pshufb xmm2,xmm7 + paddd xmm4,xmm1 + paddd xmm5,xmm2 + pxor xmm15,xmm4 + pxor xmm12,xmm5 + movdqa xmm6,xmm15 + pslld xmm15,12 + psrld xmm6,20 + movdqa xmm7,xmm12 + pslld xmm12,12 + por xmm15,xmm6 + psrld xmm7,20 + movdqa xmm6,XMMWORD[r11] + por xmm12,xmm7 + paddd xmm10,xmm15 + paddd xmm11,xmm12 + pxor xmm1,xmm10 + pxor xmm2,xmm11 + pshufb xmm1,xmm6 + pshufb xmm2,xmm6 + paddd xmm4,xmm1 + paddd xmm5,xmm2 + pxor xmm15,xmm4 + pxor xmm12,xmm5 + movdqa xmm7,xmm15 + pslld xmm15,7 + psrld xmm7,25 + movdqa xmm6,xmm12 + pslld xmm12,7 + por xmm15,xmm7 + psrld xmm6,25 + movdqa xmm7,XMMWORD[r10] + por xmm12,xmm6 + dec eax + jnz NEAR $L$oop4x + + paddd xmm8,XMMWORD[64+rsp] + paddd xmm9,XMMWORD[80+rsp] + paddd xmm10,XMMWORD[96+rsp] + paddd xmm11,XMMWORD[112+rsp] + + movdqa xmm6,xmm8 + punpckldq xmm8,xmm9 + movdqa xmm7,xmm10 + punpckldq xmm10,xmm11 + punpckhdq xmm6,xmm9 + punpckhdq xmm7,xmm11 + movdqa xmm9,xmm8 + punpcklqdq xmm8,xmm10 + movdqa xmm11,xmm6 + punpcklqdq xmm6,xmm7 + punpckhqdq xmm9,xmm10 + punpckhqdq xmm11,xmm7 + paddd xmm12,XMMWORD[((128-256))+rcx] + paddd xmm13,XMMWORD[((144-256))+rcx] + paddd xmm14,XMMWORD[((160-256))+rcx] + paddd xmm15,XMMWORD[((176-256))+rcx] + + movdqa XMMWORD[rsp],xmm8 + movdqa XMMWORD[16+rsp],xmm9 + movdqa xmm8,XMMWORD[32+rsp] + movdqa xmm9,XMMWORD[48+rsp] + + movdqa xmm10,xmm12 + punpckldq xmm12,xmm13 + movdqa xmm7,xmm14 + punpckldq xmm14,xmm15 + punpckhdq xmm10,xmm13 + punpckhdq xmm7,xmm15 + movdqa xmm13,xmm12 + punpcklqdq xmm12,xmm14 + movdqa xmm15,xmm10 + punpcklqdq xmm10,xmm7 + punpckhqdq xmm13,xmm14 + punpckhqdq xmm15,xmm7 + paddd xmm4,XMMWORD[((192-256))+rcx] + paddd xmm5,XMMWORD[((208-256))+rcx] + paddd xmm8,XMMWORD[((224-256))+rcx] + paddd xmm9,XMMWORD[((240-256))+rcx] + + movdqa XMMWORD[32+rsp],xmm6 + movdqa XMMWORD[48+rsp],xmm11 + + movdqa xmm14,xmm4 + punpckldq xmm4,xmm5 + movdqa xmm7,xmm8 + punpckldq xmm8,xmm9 + 
punpckhdq xmm14,xmm5 + punpckhdq xmm7,xmm9 + movdqa xmm5,xmm4 + punpcklqdq xmm4,xmm8 + movdqa xmm9,xmm14 + punpcklqdq xmm14,xmm7 + punpckhqdq xmm5,xmm8 + punpckhqdq xmm9,xmm7 + paddd xmm0,XMMWORD[((256-256))+rcx] + paddd xmm1,XMMWORD[((272-256))+rcx] + paddd xmm2,XMMWORD[((288-256))+rcx] + paddd xmm3,XMMWORD[((304-256))+rcx] + + movdqa xmm8,xmm0 + punpckldq xmm0,xmm1 + movdqa xmm7,xmm2 + punpckldq xmm2,xmm3 + punpckhdq xmm8,xmm1 + punpckhdq xmm7,xmm3 + movdqa xmm1,xmm0 + punpcklqdq xmm0,xmm2 + movdqa xmm3,xmm8 + punpcklqdq xmm8,xmm7 + punpckhqdq xmm1,xmm2 + punpckhqdq xmm3,xmm7 + cmp rdx,64*4 + jb NEAR $L$tail4x + + movdqu xmm6,XMMWORD[rsi] + movdqu xmm11,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm7,XMMWORD[48+rsi] + pxor xmm6,XMMWORD[rsp] + pxor xmm11,xmm12 + pxor xmm2,xmm4 + pxor xmm7,xmm0 + + movdqu XMMWORD[rdi],xmm6 + movdqu xmm6,XMMWORD[64+rsi] + movdqu XMMWORD[16+rdi],xmm11 + movdqu xmm11,XMMWORD[80+rsi] + movdqu XMMWORD[32+rdi],xmm2 + movdqu xmm2,XMMWORD[96+rsi] + movdqu XMMWORD[48+rdi],xmm7 + movdqu xmm7,XMMWORD[112+rsi] + lea rsi,[128+rsi] + pxor xmm6,XMMWORD[16+rsp] + pxor xmm11,xmm13 + pxor xmm2,xmm5 + pxor xmm7,xmm1 + + movdqu XMMWORD[64+rdi],xmm6 + movdqu xmm6,XMMWORD[rsi] + movdqu XMMWORD[80+rdi],xmm11 + movdqu xmm11,XMMWORD[16+rsi] + movdqu XMMWORD[96+rdi],xmm2 + movdqu xmm2,XMMWORD[32+rsi] + movdqu XMMWORD[112+rdi],xmm7 + lea rdi,[128+rdi] + movdqu xmm7,XMMWORD[48+rsi] + pxor xmm6,XMMWORD[32+rsp] + pxor xmm11,xmm10 + pxor xmm2,xmm14 + pxor xmm7,xmm8 + + movdqu XMMWORD[rdi],xmm6 + movdqu xmm6,XMMWORD[64+rsi] + movdqu XMMWORD[16+rdi],xmm11 + movdqu xmm11,XMMWORD[80+rsi] + movdqu XMMWORD[32+rdi],xmm2 + movdqu xmm2,XMMWORD[96+rsi] + movdqu XMMWORD[48+rdi],xmm7 + movdqu xmm7,XMMWORD[112+rsi] + lea rsi,[128+rsi] + pxor xmm6,XMMWORD[48+rsp] + pxor xmm11,xmm15 + pxor xmm2,xmm9 + pxor xmm7,xmm3 + movdqu XMMWORD[64+rdi],xmm6 + movdqu XMMWORD[80+rdi],xmm11 + movdqu XMMWORD[96+rdi],xmm2 + movdqu XMMWORD[112+rdi],xmm7 + lea rdi,[128+rdi] + + sub 
rdx,64*4 + jnz NEAR $L$oop_outer4x + + jmp NEAR $L$done4x + +$L$tail4x: + cmp rdx,192 + jae NEAR $L$192_or_more4x + cmp rdx,128 + jae NEAR $L$128_or_more4x + cmp rdx,64 + jae NEAR $L$64_or_more4x + + + xor r10,r10 + + movdqa XMMWORD[16+rsp],xmm12 + movdqa XMMWORD[32+rsp],xmm4 + movdqa XMMWORD[48+rsp],xmm0 + jmp NEAR $L$oop_tail4x + +ALIGN 32 +$L$64_or_more4x: + movdqu xmm6,XMMWORD[rsi] + movdqu xmm11,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm7,XMMWORD[48+rsi] + pxor xmm6,XMMWORD[rsp] + pxor xmm11,xmm12 + pxor xmm2,xmm4 + pxor xmm7,xmm0 + movdqu XMMWORD[rdi],xmm6 + movdqu XMMWORD[16+rdi],xmm11 + movdqu XMMWORD[32+rdi],xmm2 + movdqu XMMWORD[48+rdi],xmm7 + je NEAR $L$done4x + + movdqa xmm6,XMMWORD[16+rsp] + lea rsi,[64+rsi] + xor r10,r10 + movdqa XMMWORD[rsp],xmm6 + movdqa XMMWORD[16+rsp],xmm13 + lea rdi,[64+rdi] + movdqa XMMWORD[32+rsp],xmm5 + sub rdx,64 + movdqa XMMWORD[48+rsp],xmm1 + jmp NEAR $L$oop_tail4x + +ALIGN 32 +$L$128_or_more4x: + movdqu xmm6,XMMWORD[rsi] + movdqu xmm11,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu xmm7,XMMWORD[48+rsi] + pxor xmm6,XMMWORD[rsp] + pxor xmm11,xmm12 + pxor xmm2,xmm4 + pxor xmm7,xmm0 + + movdqu XMMWORD[rdi],xmm6 + movdqu xmm6,XMMWORD[64+rsi] + movdqu XMMWORD[16+rdi],xmm11 + movdqu xmm11,XMMWORD[80+rsi] + movdqu XMMWORD[32+rdi],xmm2 + movdqu xmm2,XMMWORD[96+rsi] + movdqu XMMWORD[48+rdi],xmm7 + movdqu xmm7,XMMWORD[112+rsi] + pxor xmm6,XMMWORD[16+rsp] + pxor xmm11,xmm13 + pxor xmm2,xmm5 + pxor xmm7,xmm1 + movdqu XMMWORD[64+rdi],xmm6 + movdqu XMMWORD[80+rdi],xmm11 + movdqu XMMWORD[96+rdi],xmm2 + movdqu XMMWORD[112+rdi],xmm7 + je NEAR $L$done4x + + movdqa xmm6,XMMWORD[32+rsp] + lea rsi,[128+rsi] + xor r10,r10 + movdqa XMMWORD[rsp],xmm6 + movdqa XMMWORD[16+rsp],xmm10 + lea rdi,[128+rdi] + movdqa XMMWORD[32+rsp],xmm14 + sub rdx,128 + movdqa XMMWORD[48+rsp],xmm8 + jmp NEAR $L$oop_tail4x + +ALIGN 32 +$L$192_or_more4x: + movdqu xmm6,XMMWORD[rsi] + movdqu xmm11,XMMWORD[16+rsi] + movdqu xmm2,XMMWORD[32+rsi] + movdqu 
xmm7,XMMWORD[48+rsi] + pxor xmm6,XMMWORD[rsp] + pxor xmm11,xmm12 + pxor xmm2,xmm4 + pxor xmm7,xmm0 + + movdqu XMMWORD[rdi],xmm6 + movdqu xmm6,XMMWORD[64+rsi] + movdqu XMMWORD[16+rdi],xmm11 + movdqu xmm11,XMMWORD[80+rsi] + movdqu XMMWORD[32+rdi],xmm2 + movdqu xmm2,XMMWORD[96+rsi] + movdqu XMMWORD[48+rdi],xmm7 + movdqu xmm7,XMMWORD[112+rsi] + lea rsi,[128+rsi] + pxor xmm6,XMMWORD[16+rsp] + pxor xmm11,xmm13 + pxor xmm2,xmm5 + pxor xmm7,xmm1 + + movdqu XMMWORD[64+rdi],xmm6 + movdqu xmm6,XMMWORD[rsi] + movdqu XMMWORD[80+rdi],xmm11 + movdqu xmm11,XMMWORD[16+rsi] + movdqu XMMWORD[96+rdi],xmm2 + movdqu xmm2,XMMWORD[32+rsi] + movdqu XMMWORD[112+rdi],xmm7 + lea rdi,[128+rdi] + movdqu xmm7,XMMWORD[48+rsi] + pxor xmm6,XMMWORD[32+rsp] + pxor xmm11,xmm10 + pxor xmm2,xmm14 + pxor xmm7,xmm8 + movdqu XMMWORD[rdi],xmm6 + movdqu XMMWORD[16+rdi],xmm11 + movdqu XMMWORD[32+rdi],xmm2 + movdqu XMMWORD[48+rdi],xmm7 + je NEAR $L$done4x + + movdqa xmm6,XMMWORD[48+rsp] + lea rsi,[64+rsi] + xor r10,r10 + movdqa XMMWORD[rsp],xmm6 + movdqa XMMWORD[16+rsp],xmm15 + lea rdi,[64+rdi] + movdqa XMMWORD[32+rsp],xmm9 + sub rdx,192 + movdqa XMMWORD[48+rsp],xmm3 + +$L$oop_tail4x: + movzx eax,BYTE[r10*1+rsi] + movzx ecx,BYTE[r10*1+rsp] + lea r10,[1+r10] + xor eax,ecx + mov BYTE[((-1))+r10*1+rdi],al + dec rdx + jnz NEAR $L$oop_tail4x + +$L$done4x: + movaps xmm6,XMMWORD[((-168))+r9] + movaps xmm7,XMMWORD[((-152))+r9] + movaps xmm8,XMMWORD[((-136))+r9] + movaps xmm9,XMMWORD[((-120))+r9] + movaps xmm10,XMMWORD[((-104))+r9] + movaps xmm11,XMMWORD[((-88))+r9] + movaps xmm12,XMMWORD[((-72))+r9] + movaps xmm13,XMMWORD[((-56))+r9] + movaps xmm14,XMMWORD[((-40))+r9] + movaps xmm15,XMMWORD[((-24))+r9] + lea rsp,[r9] + +$L$4x_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ChaCha20_ctr32_ssse3_4x: +global ChaCha20_ctr32_avx2 + +ALIGN 32 +ChaCha20_ctr32_avx2: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_ChaCha20_ctr32_avx2: + mov 
rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + + + +_CET_ENDBR + mov r9,rsp + + sub rsp,0x280+168 + and rsp,-32 + movaps XMMWORD[(-168)+r9],xmm6 + movaps XMMWORD[(-152)+r9],xmm7 + movaps XMMWORD[(-136)+r9],xmm8 + movaps XMMWORD[(-120)+r9],xmm9 + movaps XMMWORD[(-104)+r9],xmm10 + movaps XMMWORD[(-88)+r9],xmm11 + movaps XMMWORD[(-72)+r9],xmm12 + movaps XMMWORD[(-56)+r9],xmm13 + movaps XMMWORD[(-40)+r9],xmm14 + movaps XMMWORD[(-24)+r9],xmm15 +$L$8x_body: + vzeroupper + + + + + + + + + + + vbroadcasti128 ymm11,XMMWORD[$L$sigma] + vbroadcasti128 ymm3,XMMWORD[rcx] + vbroadcasti128 ymm15,XMMWORD[16+rcx] + vbroadcasti128 ymm7,XMMWORD[r8] + lea rcx,[256+rsp] + lea rax,[512+rsp] + lea r10,[$L$rot16] + lea r11,[$L$rot24] + + vpshufd ymm8,ymm11,0x00 + vpshufd ymm9,ymm11,0x55 + vmovdqa YMMWORD[(128-256)+rcx],ymm8 + vpshufd ymm10,ymm11,0xaa + vmovdqa YMMWORD[(160-256)+rcx],ymm9 + vpshufd ymm11,ymm11,0xff + vmovdqa YMMWORD[(192-256)+rcx],ymm10 + vmovdqa YMMWORD[(224-256)+rcx],ymm11 + + vpshufd ymm0,ymm3,0x00 + vpshufd ymm1,ymm3,0x55 + vmovdqa YMMWORD[(256-256)+rcx],ymm0 + vpshufd ymm2,ymm3,0xaa + vmovdqa YMMWORD[(288-256)+rcx],ymm1 + vpshufd ymm3,ymm3,0xff + vmovdqa YMMWORD[(320-256)+rcx],ymm2 + vmovdqa YMMWORD[(352-256)+rcx],ymm3 + + vpshufd ymm12,ymm15,0x00 + vpshufd ymm13,ymm15,0x55 + vmovdqa YMMWORD[(384-512)+rax],ymm12 + vpshufd ymm14,ymm15,0xaa + vmovdqa YMMWORD[(416-512)+rax],ymm13 + vpshufd ymm15,ymm15,0xff + vmovdqa YMMWORD[(448-512)+rax],ymm14 + vmovdqa YMMWORD[(480-512)+rax],ymm15 + + vpshufd ymm4,ymm7,0x00 + vpshufd ymm5,ymm7,0x55 + vpaddd ymm4,ymm4,YMMWORD[$L$incy] + vpshufd ymm6,ymm7,0xaa + vmovdqa YMMWORD[(544-512)+rax],ymm5 + vpshufd ymm7,ymm7,0xff + vmovdqa YMMWORD[(576-512)+rax],ymm6 + vmovdqa YMMWORD[(608-512)+rax],ymm7 + + jmp NEAR $L$oop_enter8x + +ALIGN 32 +$L$oop_outer8x: + vmovdqa ymm8,YMMWORD[((128-256))+rcx] + vmovdqa ymm9,YMMWORD[((160-256))+rcx] + vmovdqa ymm10,YMMWORD[((192-256))+rcx] + vmovdqa ymm11,YMMWORD[((224-256))+rcx] 
+ vmovdqa ymm0,YMMWORD[((256-256))+rcx] + vmovdqa ymm1,YMMWORD[((288-256))+rcx] + vmovdqa ymm2,YMMWORD[((320-256))+rcx] + vmovdqa ymm3,YMMWORD[((352-256))+rcx] + vmovdqa ymm12,YMMWORD[((384-512))+rax] + vmovdqa ymm13,YMMWORD[((416-512))+rax] + vmovdqa ymm14,YMMWORD[((448-512))+rax] + vmovdqa ymm15,YMMWORD[((480-512))+rax] + vmovdqa ymm4,YMMWORD[((512-512))+rax] + vmovdqa ymm5,YMMWORD[((544-512))+rax] + vmovdqa ymm6,YMMWORD[((576-512))+rax] + vmovdqa ymm7,YMMWORD[((608-512))+rax] + vpaddd ymm4,ymm4,YMMWORD[$L$eight] + +$L$oop_enter8x: + vmovdqa YMMWORD[64+rsp],ymm14 + vmovdqa YMMWORD[96+rsp],ymm15 + vbroadcasti128 ymm15,XMMWORD[r10] + vmovdqa YMMWORD[(512-512)+rax],ymm4 + mov eax,10 + jmp NEAR $L$oop8x + +ALIGN 32 +$L$oop8x: + vpaddd ymm8,ymm8,ymm0 + vpxor ymm4,ymm8,ymm4 + vpshufb ymm4,ymm4,ymm15 + vpaddd ymm9,ymm9,ymm1 + vpxor ymm5,ymm9,ymm5 + vpshufb ymm5,ymm5,ymm15 + vpaddd ymm12,ymm12,ymm4 + vpxor ymm0,ymm12,ymm0 + vpslld ymm14,ymm0,12 + vpsrld ymm0,ymm0,20 + vpor ymm0,ymm14,ymm0 + vbroadcasti128 ymm14,XMMWORD[r11] + vpaddd ymm13,ymm13,ymm5 + vpxor ymm1,ymm13,ymm1 + vpslld ymm15,ymm1,12 + vpsrld ymm1,ymm1,20 + vpor ymm1,ymm15,ymm1 + vpaddd ymm8,ymm8,ymm0 + vpxor ymm4,ymm8,ymm4 + vpshufb ymm4,ymm4,ymm14 + vpaddd ymm9,ymm9,ymm1 + vpxor ymm5,ymm9,ymm5 + vpshufb ymm5,ymm5,ymm14 + vpaddd ymm12,ymm12,ymm4 + vpxor ymm0,ymm12,ymm0 + vpslld ymm15,ymm0,7 + vpsrld ymm0,ymm0,25 + vpor ymm0,ymm15,ymm0 + vbroadcasti128 ymm15,XMMWORD[r10] + vpaddd ymm13,ymm13,ymm5 + vpxor ymm1,ymm13,ymm1 + vpslld ymm14,ymm1,7 + vpsrld ymm1,ymm1,25 + vpor ymm1,ymm14,ymm1 + vmovdqa YMMWORD[rsp],ymm12 + vmovdqa YMMWORD[32+rsp],ymm13 + vmovdqa ymm12,YMMWORD[64+rsp] + vmovdqa ymm13,YMMWORD[96+rsp] + vpaddd ymm10,ymm10,ymm2 + vpxor ymm6,ymm10,ymm6 + vpshufb ymm6,ymm6,ymm15 + vpaddd ymm11,ymm11,ymm3 + vpxor ymm7,ymm11,ymm7 + vpshufb ymm7,ymm7,ymm15 + vpaddd ymm12,ymm12,ymm6 + vpxor ymm2,ymm12,ymm2 + vpslld ymm14,ymm2,12 + vpsrld ymm2,ymm2,20 + vpor ymm2,ymm14,ymm2 + vbroadcasti128 ymm14,XMMWORD[r11] 
+ vpaddd ymm13,ymm13,ymm7 + vpxor ymm3,ymm13,ymm3 + vpslld ymm15,ymm3,12 + vpsrld ymm3,ymm3,20 + vpor ymm3,ymm15,ymm3 + vpaddd ymm10,ymm10,ymm2 + vpxor ymm6,ymm10,ymm6 + vpshufb ymm6,ymm6,ymm14 + vpaddd ymm11,ymm11,ymm3 + vpxor ymm7,ymm11,ymm7 + vpshufb ymm7,ymm7,ymm14 + vpaddd ymm12,ymm12,ymm6 + vpxor ymm2,ymm12,ymm2 + vpslld ymm15,ymm2,7 + vpsrld ymm2,ymm2,25 + vpor ymm2,ymm15,ymm2 + vbroadcasti128 ymm15,XMMWORD[r10] + vpaddd ymm13,ymm13,ymm7 + vpxor ymm3,ymm13,ymm3 + vpslld ymm14,ymm3,7 + vpsrld ymm3,ymm3,25 + vpor ymm3,ymm14,ymm3 + vpaddd ymm8,ymm8,ymm1 + vpxor ymm7,ymm8,ymm7 + vpshufb ymm7,ymm7,ymm15 + vpaddd ymm9,ymm9,ymm2 + vpxor ymm4,ymm9,ymm4 + vpshufb ymm4,ymm4,ymm15 + vpaddd ymm12,ymm12,ymm7 + vpxor ymm1,ymm12,ymm1 + vpslld ymm14,ymm1,12 + vpsrld ymm1,ymm1,20 + vpor ymm1,ymm14,ymm1 + vbroadcasti128 ymm14,XMMWORD[r11] + vpaddd ymm13,ymm13,ymm4 + vpxor ymm2,ymm13,ymm2 + vpslld ymm15,ymm2,12 + vpsrld ymm2,ymm2,20 + vpor ymm2,ymm15,ymm2 + vpaddd ymm8,ymm8,ymm1 + vpxor ymm7,ymm8,ymm7 + vpshufb ymm7,ymm7,ymm14 + vpaddd ymm9,ymm9,ymm2 + vpxor ymm4,ymm9,ymm4 + vpshufb ymm4,ymm4,ymm14 + vpaddd ymm12,ymm12,ymm7 + vpxor ymm1,ymm12,ymm1 + vpslld ymm15,ymm1,7 + vpsrld ymm1,ymm1,25 + vpor ymm1,ymm15,ymm1 + vbroadcasti128 ymm15,XMMWORD[r10] + vpaddd ymm13,ymm13,ymm4 + vpxor ymm2,ymm13,ymm2 + vpslld ymm14,ymm2,7 + vpsrld ymm2,ymm2,25 + vpor ymm2,ymm14,ymm2 + vmovdqa YMMWORD[64+rsp],ymm12 + vmovdqa YMMWORD[96+rsp],ymm13 + vmovdqa ymm12,YMMWORD[rsp] + vmovdqa ymm13,YMMWORD[32+rsp] + vpaddd ymm10,ymm10,ymm3 + vpxor ymm5,ymm10,ymm5 + vpshufb ymm5,ymm5,ymm15 + vpaddd ymm11,ymm11,ymm0 + vpxor ymm6,ymm11,ymm6 + vpshufb ymm6,ymm6,ymm15 + vpaddd ymm12,ymm12,ymm5 + vpxor ymm3,ymm12,ymm3 + vpslld ymm14,ymm3,12 + vpsrld ymm3,ymm3,20 + vpor ymm3,ymm14,ymm3 + vbroadcasti128 ymm14,XMMWORD[r11] + vpaddd ymm13,ymm13,ymm6 + vpxor ymm0,ymm13,ymm0 + vpslld ymm15,ymm0,12 + vpsrld ymm0,ymm0,20 + vpor ymm0,ymm15,ymm0 + vpaddd ymm10,ymm10,ymm3 + vpxor ymm5,ymm10,ymm5 + vpshufb ymm5,ymm5,ymm14 
+ vpaddd ymm11,ymm11,ymm0 + vpxor ymm6,ymm11,ymm6 + vpshufb ymm6,ymm6,ymm14 + vpaddd ymm12,ymm12,ymm5 + vpxor ymm3,ymm12,ymm3 + vpslld ymm15,ymm3,7 + vpsrld ymm3,ymm3,25 + vpor ymm3,ymm15,ymm3 + vbroadcasti128 ymm15,XMMWORD[r10] + vpaddd ymm13,ymm13,ymm6 + vpxor ymm0,ymm13,ymm0 + vpslld ymm14,ymm0,7 + vpsrld ymm0,ymm0,25 + vpor ymm0,ymm14,ymm0 + dec eax + jnz NEAR $L$oop8x + + lea rax,[512+rsp] + vpaddd ymm8,ymm8,YMMWORD[((128-256))+rcx] + vpaddd ymm9,ymm9,YMMWORD[((160-256))+rcx] + vpaddd ymm10,ymm10,YMMWORD[((192-256))+rcx] + vpaddd ymm11,ymm11,YMMWORD[((224-256))+rcx] + + vpunpckldq ymm14,ymm8,ymm9 + vpunpckldq ymm15,ymm10,ymm11 + vpunpckhdq ymm8,ymm8,ymm9 + vpunpckhdq ymm10,ymm10,ymm11 + vpunpcklqdq ymm9,ymm14,ymm15 + vpunpckhqdq ymm14,ymm14,ymm15 + vpunpcklqdq ymm11,ymm8,ymm10 + vpunpckhqdq ymm8,ymm8,ymm10 + vpaddd ymm0,ymm0,YMMWORD[((256-256))+rcx] + vpaddd ymm1,ymm1,YMMWORD[((288-256))+rcx] + vpaddd ymm2,ymm2,YMMWORD[((320-256))+rcx] + vpaddd ymm3,ymm3,YMMWORD[((352-256))+rcx] + + vpunpckldq ymm10,ymm0,ymm1 + vpunpckldq ymm15,ymm2,ymm3 + vpunpckhdq ymm0,ymm0,ymm1 + vpunpckhdq ymm2,ymm2,ymm3 + vpunpcklqdq ymm1,ymm10,ymm15 + vpunpckhqdq ymm10,ymm10,ymm15 + vpunpcklqdq ymm3,ymm0,ymm2 + vpunpckhqdq ymm0,ymm0,ymm2 + vperm2i128 ymm15,ymm9,ymm1,0x20 + vperm2i128 ymm1,ymm9,ymm1,0x31 + vperm2i128 ymm9,ymm14,ymm10,0x20 + vperm2i128 ymm10,ymm14,ymm10,0x31 + vperm2i128 ymm14,ymm11,ymm3,0x20 + vperm2i128 ymm3,ymm11,ymm3,0x31 + vperm2i128 ymm11,ymm8,ymm0,0x20 + vperm2i128 ymm0,ymm8,ymm0,0x31 + vmovdqa YMMWORD[rsp],ymm15 + vmovdqa YMMWORD[32+rsp],ymm9 + vmovdqa ymm15,YMMWORD[64+rsp] + vmovdqa ymm9,YMMWORD[96+rsp] + + vpaddd ymm12,ymm12,YMMWORD[((384-512))+rax] + vpaddd ymm13,ymm13,YMMWORD[((416-512))+rax] + vpaddd ymm15,ymm15,YMMWORD[((448-512))+rax] + vpaddd ymm9,ymm9,YMMWORD[((480-512))+rax] + + vpunpckldq ymm2,ymm12,ymm13 + vpunpckldq ymm8,ymm15,ymm9 + vpunpckhdq ymm12,ymm12,ymm13 + vpunpckhdq ymm15,ymm15,ymm9 + vpunpcklqdq ymm13,ymm2,ymm8 + vpunpckhqdq ymm2,ymm2,ymm8 + 
vpunpcklqdq ymm9,ymm12,ymm15 + vpunpckhqdq ymm12,ymm12,ymm15 + vpaddd ymm4,ymm4,YMMWORD[((512-512))+rax] + vpaddd ymm5,ymm5,YMMWORD[((544-512))+rax] + vpaddd ymm6,ymm6,YMMWORD[((576-512))+rax] + vpaddd ymm7,ymm7,YMMWORD[((608-512))+rax] + + vpunpckldq ymm15,ymm4,ymm5 + vpunpckldq ymm8,ymm6,ymm7 + vpunpckhdq ymm4,ymm4,ymm5 + vpunpckhdq ymm6,ymm6,ymm7 + vpunpcklqdq ymm5,ymm15,ymm8 + vpunpckhqdq ymm15,ymm15,ymm8 + vpunpcklqdq ymm7,ymm4,ymm6 + vpunpckhqdq ymm4,ymm4,ymm6 + vperm2i128 ymm8,ymm13,ymm5,0x20 + vperm2i128 ymm5,ymm13,ymm5,0x31 + vperm2i128 ymm13,ymm2,ymm15,0x20 + vperm2i128 ymm15,ymm2,ymm15,0x31 + vperm2i128 ymm2,ymm9,ymm7,0x20 + vperm2i128 ymm7,ymm9,ymm7,0x31 + vperm2i128 ymm9,ymm12,ymm4,0x20 + vperm2i128 ymm4,ymm12,ymm4,0x31 + vmovdqa ymm6,YMMWORD[rsp] + vmovdqa ymm12,YMMWORD[32+rsp] + + cmp rdx,64*8 + jb NEAR $L$tail8x + + vpxor ymm6,ymm6,YMMWORD[rsi] + vpxor ymm8,ymm8,YMMWORD[32+rsi] + vpxor ymm1,ymm1,YMMWORD[64+rsi] + vpxor ymm5,ymm5,YMMWORD[96+rsi] + lea rsi,[128+rsi] + vmovdqu YMMWORD[rdi],ymm6 + vmovdqu YMMWORD[32+rdi],ymm8 + vmovdqu YMMWORD[64+rdi],ymm1 + vmovdqu YMMWORD[96+rdi],ymm5 + lea rdi,[128+rdi] + + vpxor ymm12,ymm12,YMMWORD[rsi] + vpxor ymm13,ymm13,YMMWORD[32+rsi] + vpxor ymm10,ymm10,YMMWORD[64+rsi] + vpxor ymm15,ymm15,YMMWORD[96+rsi] + lea rsi,[128+rsi] + vmovdqu YMMWORD[rdi],ymm12 + vmovdqu YMMWORD[32+rdi],ymm13 + vmovdqu YMMWORD[64+rdi],ymm10 + vmovdqu YMMWORD[96+rdi],ymm15 + lea rdi,[128+rdi] + + vpxor ymm14,ymm14,YMMWORD[rsi] + vpxor ymm2,ymm2,YMMWORD[32+rsi] + vpxor ymm3,ymm3,YMMWORD[64+rsi] + vpxor ymm7,ymm7,YMMWORD[96+rsi] + lea rsi,[128+rsi] + vmovdqu YMMWORD[rdi],ymm14 + vmovdqu YMMWORD[32+rdi],ymm2 + vmovdqu YMMWORD[64+rdi],ymm3 + vmovdqu YMMWORD[96+rdi],ymm7 + lea rdi,[128+rdi] + + vpxor ymm11,ymm11,YMMWORD[rsi] + vpxor ymm9,ymm9,YMMWORD[32+rsi] + vpxor ymm0,ymm0,YMMWORD[64+rsi] + vpxor ymm4,ymm4,YMMWORD[96+rsi] + lea rsi,[128+rsi] + vmovdqu YMMWORD[rdi],ymm11 + vmovdqu YMMWORD[32+rdi],ymm9 + vmovdqu YMMWORD[64+rdi],ymm0 + 
vmovdqu YMMWORD[96+rdi],ymm4 + lea rdi,[128+rdi] + + sub rdx,64*8 + jnz NEAR $L$oop_outer8x + + jmp NEAR $L$done8x + +$L$tail8x: + cmp rdx,448 + jae NEAR $L$448_or_more8x + cmp rdx,384 + jae NEAR $L$384_or_more8x + cmp rdx,320 + jae NEAR $L$320_or_more8x + cmp rdx,256 + jae NEAR $L$256_or_more8x + cmp rdx,192 + jae NEAR $L$192_or_more8x + cmp rdx,128 + jae NEAR $L$128_or_more8x + cmp rdx,64 + jae NEAR $L$64_or_more8x + + xor r10,r10 + vmovdqa YMMWORD[rsp],ymm6 + vmovdqa YMMWORD[32+rsp],ymm8 + jmp NEAR $L$oop_tail8x + +ALIGN 32 +$L$64_or_more8x: + vpxor ymm6,ymm6,YMMWORD[rsi] + vpxor ymm8,ymm8,YMMWORD[32+rsi] + vmovdqu YMMWORD[rdi],ymm6 + vmovdqu YMMWORD[32+rdi],ymm8 + je NEAR $L$done8x + + lea rsi,[64+rsi] + xor r10,r10 + vmovdqa YMMWORD[rsp],ymm1 + lea rdi,[64+rdi] + sub rdx,64 + vmovdqa YMMWORD[32+rsp],ymm5 + jmp NEAR $L$oop_tail8x + +ALIGN 32 +$L$128_or_more8x: + vpxor ymm6,ymm6,YMMWORD[rsi] + vpxor ymm8,ymm8,YMMWORD[32+rsi] + vpxor ymm1,ymm1,YMMWORD[64+rsi] + vpxor ymm5,ymm5,YMMWORD[96+rsi] + vmovdqu YMMWORD[rdi],ymm6 + vmovdqu YMMWORD[32+rdi],ymm8 + vmovdqu YMMWORD[64+rdi],ymm1 + vmovdqu YMMWORD[96+rdi],ymm5 + je NEAR $L$done8x + + lea rsi,[128+rsi] + xor r10,r10 + vmovdqa YMMWORD[rsp],ymm12 + lea rdi,[128+rdi] + sub rdx,128 + vmovdqa YMMWORD[32+rsp],ymm13 + jmp NEAR $L$oop_tail8x + +ALIGN 32 +$L$192_or_more8x: + vpxor ymm6,ymm6,YMMWORD[rsi] + vpxor ymm8,ymm8,YMMWORD[32+rsi] + vpxor ymm1,ymm1,YMMWORD[64+rsi] + vpxor ymm5,ymm5,YMMWORD[96+rsi] + vpxor ymm12,ymm12,YMMWORD[128+rsi] + vpxor ymm13,ymm13,YMMWORD[160+rsi] + vmovdqu YMMWORD[rdi],ymm6 + vmovdqu YMMWORD[32+rdi],ymm8 + vmovdqu YMMWORD[64+rdi],ymm1 + vmovdqu YMMWORD[96+rdi],ymm5 + vmovdqu YMMWORD[128+rdi],ymm12 + vmovdqu YMMWORD[160+rdi],ymm13 + je NEAR $L$done8x + + lea rsi,[192+rsi] + xor r10,r10 + vmovdqa YMMWORD[rsp],ymm10 + lea rdi,[192+rdi] + sub rdx,192 + vmovdqa YMMWORD[32+rsp],ymm15 + jmp NEAR $L$oop_tail8x + +ALIGN 32 +$L$256_or_more8x: + vpxor ymm6,ymm6,YMMWORD[rsi] + vpxor 
ymm8,ymm8,YMMWORD[32+rsi] + vpxor ymm1,ymm1,YMMWORD[64+rsi] + vpxor ymm5,ymm5,YMMWORD[96+rsi] + vpxor ymm12,ymm12,YMMWORD[128+rsi] + vpxor ymm13,ymm13,YMMWORD[160+rsi] + vpxor ymm10,ymm10,YMMWORD[192+rsi] + vpxor ymm15,ymm15,YMMWORD[224+rsi] + vmovdqu YMMWORD[rdi],ymm6 + vmovdqu YMMWORD[32+rdi],ymm8 + vmovdqu YMMWORD[64+rdi],ymm1 + vmovdqu YMMWORD[96+rdi],ymm5 + vmovdqu YMMWORD[128+rdi],ymm12 + vmovdqu YMMWORD[160+rdi],ymm13 + vmovdqu YMMWORD[192+rdi],ymm10 + vmovdqu YMMWORD[224+rdi],ymm15 + je NEAR $L$done8x + + lea rsi,[256+rsi] + xor r10,r10 + vmovdqa YMMWORD[rsp],ymm14 + lea rdi,[256+rdi] + sub rdx,256 + vmovdqa YMMWORD[32+rsp],ymm2 + jmp NEAR $L$oop_tail8x + +ALIGN 32 +$L$320_or_more8x: + vpxor ymm6,ymm6,YMMWORD[rsi] + vpxor ymm8,ymm8,YMMWORD[32+rsi] + vpxor ymm1,ymm1,YMMWORD[64+rsi] + vpxor ymm5,ymm5,YMMWORD[96+rsi] + vpxor ymm12,ymm12,YMMWORD[128+rsi] + vpxor ymm13,ymm13,YMMWORD[160+rsi] + vpxor ymm10,ymm10,YMMWORD[192+rsi] + vpxor ymm15,ymm15,YMMWORD[224+rsi] + vpxor ymm14,ymm14,YMMWORD[256+rsi] + vpxor ymm2,ymm2,YMMWORD[288+rsi] + vmovdqu YMMWORD[rdi],ymm6 + vmovdqu YMMWORD[32+rdi],ymm8 + vmovdqu YMMWORD[64+rdi],ymm1 + vmovdqu YMMWORD[96+rdi],ymm5 + vmovdqu YMMWORD[128+rdi],ymm12 + vmovdqu YMMWORD[160+rdi],ymm13 + vmovdqu YMMWORD[192+rdi],ymm10 + vmovdqu YMMWORD[224+rdi],ymm15 + vmovdqu YMMWORD[256+rdi],ymm14 + vmovdqu YMMWORD[288+rdi],ymm2 + je NEAR $L$done8x + + lea rsi,[320+rsi] + xor r10,r10 + vmovdqa YMMWORD[rsp],ymm3 + lea rdi,[320+rdi] + sub rdx,320 + vmovdqa YMMWORD[32+rsp],ymm7 + jmp NEAR $L$oop_tail8x + +ALIGN 32 +$L$384_or_more8x: + vpxor ymm6,ymm6,YMMWORD[rsi] + vpxor ymm8,ymm8,YMMWORD[32+rsi] + vpxor ymm1,ymm1,YMMWORD[64+rsi] + vpxor ymm5,ymm5,YMMWORD[96+rsi] + vpxor ymm12,ymm12,YMMWORD[128+rsi] + vpxor ymm13,ymm13,YMMWORD[160+rsi] + vpxor ymm10,ymm10,YMMWORD[192+rsi] + vpxor ymm15,ymm15,YMMWORD[224+rsi] + vpxor ymm14,ymm14,YMMWORD[256+rsi] + vpxor ymm2,ymm2,YMMWORD[288+rsi] + vpxor ymm3,ymm3,YMMWORD[320+rsi] + vpxor ymm7,ymm7,YMMWORD[352+rsi] 
+ vmovdqu YMMWORD[rdi],ymm6 + vmovdqu YMMWORD[32+rdi],ymm8 + vmovdqu YMMWORD[64+rdi],ymm1 + vmovdqu YMMWORD[96+rdi],ymm5 + vmovdqu YMMWORD[128+rdi],ymm12 + vmovdqu YMMWORD[160+rdi],ymm13 + vmovdqu YMMWORD[192+rdi],ymm10 + vmovdqu YMMWORD[224+rdi],ymm15 + vmovdqu YMMWORD[256+rdi],ymm14 + vmovdqu YMMWORD[288+rdi],ymm2 + vmovdqu YMMWORD[320+rdi],ymm3 + vmovdqu YMMWORD[352+rdi],ymm7 + je NEAR $L$done8x + + lea rsi,[384+rsi] + xor r10,r10 + vmovdqa YMMWORD[rsp],ymm11 + lea rdi,[384+rdi] + sub rdx,384 + vmovdqa YMMWORD[32+rsp],ymm9 + jmp NEAR $L$oop_tail8x + +ALIGN 32 +$L$448_or_more8x: + vpxor ymm6,ymm6,YMMWORD[rsi] + vpxor ymm8,ymm8,YMMWORD[32+rsi] + vpxor ymm1,ymm1,YMMWORD[64+rsi] + vpxor ymm5,ymm5,YMMWORD[96+rsi] + vpxor ymm12,ymm12,YMMWORD[128+rsi] + vpxor ymm13,ymm13,YMMWORD[160+rsi] + vpxor ymm10,ymm10,YMMWORD[192+rsi] + vpxor ymm15,ymm15,YMMWORD[224+rsi] + vpxor ymm14,ymm14,YMMWORD[256+rsi] + vpxor ymm2,ymm2,YMMWORD[288+rsi] + vpxor ymm3,ymm3,YMMWORD[320+rsi] + vpxor ymm7,ymm7,YMMWORD[352+rsi] + vpxor ymm11,ymm11,YMMWORD[384+rsi] + vpxor ymm9,ymm9,YMMWORD[416+rsi] + vmovdqu YMMWORD[rdi],ymm6 + vmovdqu YMMWORD[32+rdi],ymm8 + vmovdqu YMMWORD[64+rdi],ymm1 + vmovdqu YMMWORD[96+rdi],ymm5 + vmovdqu YMMWORD[128+rdi],ymm12 + vmovdqu YMMWORD[160+rdi],ymm13 + vmovdqu YMMWORD[192+rdi],ymm10 + vmovdqu YMMWORD[224+rdi],ymm15 + vmovdqu YMMWORD[256+rdi],ymm14 + vmovdqu YMMWORD[288+rdi],ymm2 + vmovdqu YMMWORD[320+rdi],ymm3 + vmovdqu YMMWORD[352+rdi],ymm7 + vmovdqu YMMWORD[384+rdi],ymm11 + vmovdqu YMMWORD[416+rdi],ymm9 + je NEAR $L$done8x + + lea rsi,[448+rsi] + xor r10,r10 + vmovdqa YMMWORD[rsp],ymm0 + lea rdi,[448+rdi] + sub rdx,448 + vmovdqa YMMWORD[32+rsp],ymm4 + +$L$oop_tail8x: + movzx eax,BYTE[r10*1+rsi] + movzx ecx,BYTE[r10*1+rsp] + lea r10,[1+r10] + xor eax,ecx + mov BYTE[((-1))+r10*1+rdi],al + dec rdx + jnz NEAR $L$oop_tail8x + +$L$done8x: + vzeroall + movaps xmm6,XMMWORD[((-168))+r9] + movaps xmm7,XMMWORD[((-152))+r9] + movaps xmm8,XMMWORD[((-136))+r9] + movaps 
xmm9,XMMWORD[((-120))+r9] + movaps xmm10,XMMWORD[((-104))+r9] + movaps xmm11,XMMWORD[((-88))+r9] + movaps xmm12,XMMWORD[((-72))+r9] + movaps xmm13,XMMWORD[((-56))+r9] + movaps xmm14,XMMWORD[((-40))+r9] + movaps xmm15,XMMWORD[((-24))+r9] + lea rsp,[r9] + +$L$8x_epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_ChaCha20_ctr32_avx2: +EXTERN __imp_RtlVirtualUnwind + +ALIGN 16 +se_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + lea r10,[$L$ctr32_body] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[152+r8] + + lea r10,[$L$no_data] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + lea rax,[((64+24+48))+rax] + + mov rbx,QWORD[((-8))+rax] + mov rbp,QWORD[((-16))+rax] + mov r12,QWORD[((-24))+rax] + mov r13,QWORD[((-32))+rax] + mov r14,QWORD[((-40))+rax] + mov r15,QWORD[((-48))+rax] + mov QWORD[144+r8],rbx + mov QWORD[160+r8],rbp + mov QWORD[216+r8],r12 + mov QWORD[224+r8],r13 + mov QWORD[232+r8],r14 + mov QWORD[240+r8],r15 + +$L$common_seh_tail: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + ret + + + +ALIGN 16 +ssse3_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov 
r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[192+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + lea rsi,[((-40))+rax] + lea rdi,[512+r8] + mov ecx,4 + DD 0xa548f3fc + + jmp NEAR $L$common_seh_tail + + + +ALIGN 16 +full_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + mov rsi,QWORD[8+r9] + mov r11,QWORD[56+r9] + + mov r10d,DWORD[r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jb NEAR $L$common_seh_tail + + mov rax,QWORD[192+r8] + + mov r10d,DWORD[4+r11] + lea r10,[r10*1+rsi] + cmp rbx,r10 + jae NEAR $L$common_seh_tail + + lea rsi,[((-168))+rax] + lea rdi,[512+r8] + mov ecx,20 + DD 0xa548f3fc + + jmp NEAR $L$common_seh_tail + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_ChaCha20_ctr32_nohw wrt ..imagebase + DD $L$SEH_end_ChaCha20_ctr32_nohw wrt ..imagebase + DD $L$SEH_info_ChaCha20_ctr32_nohw wrt ..imagebase + + DD $L$SEH_begin_ChaCha20_ctr32_ssse3 wrt ..imagebase + DD $L$SEH_end_ChaCha20_ctr32_ssse3 wrt ..imagebase + DD $L$SEH_info_ChaCha20_ctr32_ssse3 wrt ..imagebase + + DD $L$SEH_begin_ChaCha20_ctr32_ssse3_4x wrt ..imagebase + DD $L$SEH_end_ChaCha20_ctr32_ssse3_4x wrt ..imagebase + DD $L$SEH_info_ChaCha20_ctr32_ssse3_4x wrt ..imagebase + DD $L$SEH_begin_ChaCha20_ctr32_avx2 wrt ..imagebase + DD $L$SEH_end_ChaCha20_ctr32_avx2 wrt ..imagebase + DD $L$SEH_info_ChaCha20_ctr32_avx2 wrt ..imagebase +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_ChaCha20_ctr32_nohw: + DB 9,0,0,0 + DD se_handler wrt ..imagebase + +$L$SEH_info_ChaCha20_ctr32_ssse3: + DB 9,0,0,0 + DD ssse3_handler wrt ..imagebase + DD $L$ssse3_body wrt ..imagebase,$L$ssse3_epilogue wrt ..imagebase + +$L$SEH_info_ChaCha20_ctr32_ssse3_4x: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$4x_body wrt ..imagebase,$L$4x_epilogue wrt 
..imagebase +$L$SEH_info_ChaCha20_ctr32_avx2: + DB 9,0,0,0 + DD full_handler wrt ..imagebase + DD $L$8x_body wrt ..imagebase,$L$8x_epilogue wrt ..imagebase +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/third_party/boringssl/gen/crypto/chacha20_poly1305_armv8-apple.S b/third_party/boringssl/gen/crypto/chacha20_poly1305_armv8-apple.S new file mode 100644 index 00000000..d3a8c95b --- /dev/null +++ b/third_party/boringssl/gen/crypto/chacha20_poly1305_armv8-apple.S @@ -0,0 +1,3006 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) +.section __TEXT,__const + +.align 7 +Lchacha20_consts: +.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' +Linc: +.long 1,2,3,4 +Lrol8: +.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 +Lclamp: +.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC + +.text + + +.align 6 +Lpoly_hash_ad_internal: +.cfi_startproc + cbnz x4, Lpoly_hash_intro + ret + +Lpoly_hash_intro: + cmp x4, #16 + b.lt Lpoly_hash_ad_tail + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 
at most + sub x4, x4, #16 + b Lpoly_hash_ad_internal + +Lpoly_hash_ad_tail: + cbz x4, Lpoly_hash_ad_ret + + eor v20.16b, v20.16b, v20.16b // Use T0 to load the AAD + sub x4, x4, #1 + +Lpoly_hash_tail_16_compose: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x3, x4] + mov v20.b[0], w11 + subs x4, x4, #1 + b.ge Lpoly_hash_tail_16_compose + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + +Lpoly_hash_ad_ret: + ret +.cfi_endproc + + +///////////////////////////////// +// +// void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *seal_data); +// +.globl _chacha20_poly1305_seal +.private_extern _chacha20_poly1305_seal + +.align 6 +_chacha20_poly1305_seal: + AARCH64_SIGN_LINK_REGISTER +.cfi_startproc + stp x29, x30, [sp, #-80]! +.cfi_def_cfa_offset 80 +.cfi_offset w30, -72 +.cfi_offset w29, -80 + mov x29, sp + // We probably could do .cfi_def_cfa w29, 80 at this point, but since + // we don't actually use the frame pointer like that, it's probably not + // worth bothering. 
+ stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] +.cfi_offset b15, -8 +.cfi_offset b14, -16 +.cfi_offset b13, -24 +.cfi_offset b12, -32 +.cfi_offset b11, -40 +.cfi_offset b10, -48 +.cfi_offset b9, -56 +.cfi_offset b8, -64 + + adrp x11, Lchacha20_consts@PAGE + add x11, x11, Lchacha20_consts@PAGEOFF + + ld1 {v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values + ld1 {v28.16b - v30.16b}, [x5] + + mov x15, #1 // Prepare the Poly1305 state + mov x8, #0 + mov x9, #0 + mov x10, #0 + + ldr x12, [x5, #56] // The total cipher text length includes extra_in_len + add x12, x12, x2 + mov v31.d[0], x4 // Store the input and aad lengths + mov v31.d[1], x12 + + cmp x2, #128 + b.le Lseal_128 // Optimization for smaller buffers + + // Initially we prepare 5 ChaCha20 blocks. Four to encrypt up to 4 blocks (256 bytes) of plaintext, + // and one for the Poly1305 R and S keys. The first four blocks (A0-A3..D0-D3) are computed vertically, + // the fifth block (A4-D4) horizontally. 
+ ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] + mov v4.16b, v24.16b + + ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 + mov v9.16b, v28.16b + + ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 + mov v14.16b, v29.16b + + ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] + add v15.4s, v15.4s, v25.4s + mov v19.16b, v30.16b + + sub x5, x5, #32 + + mov x6, #10 + +.align 5 +Lseal_init_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v9.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v18.8h, v18.8h + rev32 v19.8h, v19.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + eor v8.16b, v8.16b, v13.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v9.4s, #20 + sli v8.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + add v4.4s, v4.4s, v8.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v18.16b, {v18.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor 
v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v14.16b + + ushr v9.4s, v8.4s, #25 + sli v9.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #4 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #12 + add v0.4s, v0.4s, v6.4s + add v1.4s, v1.4s, v7.4s + add v2.4s, v2.4s, v8.4s + add v3.4s, v3.4s, v5.4s + add v4.4s, v4.4s, v9.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v18.8h, v18.8h + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v19.8h, v19.8h + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v5.4s, #20 + sli v8.4s, v5.4s, #12 + ushr v5.4s, v9.4s, #20 + sli v5.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v5.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v18.16b, {v18.16b}, v26.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + 
+ eor v20.16b, v20.16b, v12.16b + eor v6.16b, v6.16b, v13.16b + eor v7.16b, v7.16b, v10.16b + eor v8.16b, v8.16b, v11.16b + eor v5.16b, v5.16b, v14.16b + + ushr v9.4s, v5.4s, #25 + sli v9.4s, v5.4s, #7 + ushr v5.4s, v8.4s, #25 + sli v5.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #12 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #4 + subs x6, x6, #1 + b.hi Lseal_init_rounds + + add v15.4s, v15.4s, v25.4s + mov x11, #4 + dup v20.4s, w11 + add v25.4s, v25.4s, v20.4s + + zip1 v20.4s, v0.4s, v1.4s + zip2 v21.4s, v0.4s, v1.4s + zip1 v22.4s, v2.4s, v3.4s + zip2 v23.4s, v2.4s, v3.4s + + zip1 v0.2d, v20.2d, v22.2d + zip2 v1.2d, v20.2d, v22.2d + zip1 v2.2d, v21.2d, v23.2d + zip2 v3.2d, v21.2d, v23.2d + + zip1 v20.4s, v5.4s, v6.4s + zip2 v21.4s, v5.4s, v6.4s + zip1 v22.4s, v7.4s, v8.4s + zip2 v23.4s, v7.4s, v8.4s + + zip1 v5.2d, v20.2d, v22.2d + zip2 v6.2d, v20.2d, v22.2d + zip1 v7.2d, v21.2d, v23.2d + zip2 v8.2d, v21.2d, v23.2d + + zip1 v20.4s, v10.4s, v11.4s + zip2 v21.4s, v10.4s, v11.4s + zip1 v22.4s, v12.4s, v13.4s + zip2 v23.4s, v12.4s, v13.4s + + zip1 v10.2d, v20.2d, v22.2d + zip2 v11.2d, v20.2d, v22.2d + zip1 v12.2d, v21.2d, v23.2d + zip2 v13.2d, v21.2d, v23.2d + + zip1 v20.4s, v15.4s, v16.4s + zip2 v21.4s, v15.4s, v16.4s + zip1 v22.4s, v17.4s, v18.4s + zip2 v23.4s, v17.4s, v18.4s + + zip1 v15.2d, v20.2d, v22.2d + zip2 v16.2d, v20.2d, v22.2d + zip1 v17.2d, v21.2d, v23.2d + zip2 v18.2d, v21.2d, v23.2d + + add v4.4s, v4.4s, v24.4s + add v9.4s, v9.4s, v28.4s + and v4.16b, v4.16b, v27.16b + + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + + add v1.4s, v1.4s, v24.4s + add v6.4s, v6.4s, v28.4s + add v11.4s, v11.4s, v29.4s + add v16.4s, v16.4s, v30.4s + + add v2.4s, v2.4s, v24.4s + add v7.4s, v7.4s, v28.4s + add v12.4s, v12.4s, v29.4s + add 
v17.4s, v17.4s, v30.4s + + add v3.4s, v3.4s, v24.4s + add v8.4s, v8.4s, v28.4s + add v13.4s, v13.4s, v29.4s + add v18.4s, v18.4s, v30.4s + + mov x16, v4.d[0] // Move the R key to GPRs + mov x17, v4.d[1] + mov v27.16b, v9.16b // Store the S key + + bl Lpoly_hash_ad_internal + + mov x3, x0 + cmp x2, #256 + b.le Lseal_tail + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v3.16b + eor v21.16b, v21.16b, v8.16b + eor v22.16b, v22.16b, v13.16b + eor v23.16b, v23.16b, v18.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #256 + + mov x6, #4 // In the first run of the loop we need to hash 256 bytes, therefore we hash one block for the first 4 rounds + mov x7, #6 // and two blocks for the remaining 6, for a total of (1 * 4 + 2 * 6) * 16 = 256 + +Lseal_main_loop: + adrp x11, Lchacha20_consts@PAGE + add x11, x11, Lchacha20_consts@PAGEOFF + + ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] + mov v4.16b, v24.16b + + ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 + mov v9.16b, v28.16b + + ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 + mov v14.16b, v29.16b + + ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] + add v15.4s, v15.4s, v25.4s + mov v19.16b, v30.16b + + eor v20.16b, v20.16b, v20.16b //zero + not v21.16b, v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + sub x5, x5, #32 
+.align 5 +Lseal_main_loop_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v9.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v18.8h, v18.8h + rev32 v19.8h, v19.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + eor v8.16b, v8.16b, v13.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v9.4s, #20 + sli v8.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + add v4.4s, v4.4s, v8.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v18.16b, {v18.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v14.16b + + ushr v9.4s, v8.4s, #25 + sli v9.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + 
ext v9.16b, v9.16b, v9.16b, #4 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #12 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + add v0.4s, v0.4s, v6.4s + add v1.4s, v1.4s, v7.4s + add v2.4s, v2.4s, v8.4s + add v3.4s, v3.4s, v5.4s + add v4.4s, v4.4s, v9.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v18.8h, v18.8h + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v19.8h, v19.8h + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v5.4s, #20 + sli v8.4s, v5.4s, #12 + ushr v5.4s, v9.4s, #20 + sli v5.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, 
v8.4s + add v4.4s, v4.4s, v5.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v18.16b, {v18.16b}, v26.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v12.16b + eor v6.16b, v6.16b, v13.16b + eor v7.16b, v7.16b, v10.16b + eor v8.16b, v8.16b, v11.16b + eor v5.16b, v5.16b, v14.16b + + ushr v9.4s, v5.4s, #25 + sli v9.4s, v5.4s, #7 + ushr v5.4s, v8.4s, #25 + sli v5.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #12 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #4 + subs x6, x6, #1 + b.ge Lseal_main_loop_rounds + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + subs x7, x7, #1 + b.gt Lseal_main_loop_rounds + + eor v20.16b, v20.16b, v20.16b //zero + not 
v21.16b, v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + add v15.4s, v15.4s, v25.4s + mov x11, #5 + dup v20.4s, w11 + add v25.4s, v25.4s, v20.4s + + zip1 v20.4s, v0.4s, v1.4s + zip2 v21.4s, v0.4s, v1.4s + zip1 v22.4s, v2.4s, v3.4s + zip2 v23.4s, v2.4s, v3.4s + + zip1 v0.2d, v20.2d, v22.2d + zip2 v1.2d, v20.2d, v22.2d + zip1 v2.2d, v21.2d, v23.2d + zip2 v3.2d, v21.2d, v23.2d + + zip1 v20.4s, v5.4s, v6.4s + zip2 v21.4s, v5.4s, v6.4s + zip1 v22.4s, v7.4s, v8.4s + zip2 v23.4s, v7.4s, v8.4s + + zip1 v5.2d, v20.2d, v22.2d + zip2 v6.2d, v20.2d, v22.2d + zip1 v7.2d, v21.2d, v23.2d + zip2 v8.2d, v21.2d, v23.2d + + zip1 v20.4s, v10.4s, v11.4s + zip2 v21.4s, v10.4s, v11.4s + zip1 v22.4s, v12.4s, v13.4s + zip2 v23.4s, v12.4s, v13.4s + + zip1 v10.2d, v20.2d, v22.2d + zip2 v11.2d, v20.2d, v22.2d + zip1 v12.2d, v21.2d, v23.2d + zip2 v13.2d, v21.2d, v23.2d + + zip1 v20.4s, v15.4s, v16.4s + zip2 v21.4s, v15.4s, v16.4s + zip1 v22.4s, v17.4s, v18.4s + zip2 v23.4s, v17.4s, v18.4s + + zip1 v15.2d, v20.2d, v22.2d + zip2 v16.2d, v20.2d, v22.2d + zip1 v17.2d, v21.2d, v23.2d + zip2 v18.2d, v21.2d, v23.2d + + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + + add v1.4s, v1.4s, v24.4s + add v6.4s, v6.4s, v28.4s + add v11.4s, v11.4s, v29.4s + add v16.4s, v16.4s, v30.4s + + add v2.4s, v2.4s, v24.4s + add v7.4s, v7.4s, v28.4s + add v12.4s, v12.4s, v29.4s + add v17.4s, v17.4s, v30.4s + + add v3.4s, v3.4s, v24.4s + add v8.4s, v8.4s, v28.4s + add v13.4s, v13.4s, v29.4s + add v18.4s, v18.4s, v30.4s + + add v4.4s, v4.4s, v24.4s + add v9.4s, v9.4s, v28.4s + add v14.4s, v14.4s, v29.4s + add v19.4s, v19.4s, v30.4s + + cmp x2, #320 + b.le Lseal_tail + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + st1 {v20.16b - 
v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v3.16b + eor v21.16b, v21.16b, v8.16b + eor v22.16b, v22.16b, v13.16b + eor v23.16b, v23.16b, v18.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v4.16b + eor v21.16b, v21.16b, v9.16b + eor v22.16b, v22.16b, v14.16b + eor v23.16b, v23.16b, v19.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #320 + + mov x6, #0 + mov x7, #10 // For the remainder of the loop we always hash and encrypt 320 bytes per iteration + + b Lseal_main_loop + +Lseal_tail: + // This part of the function handles the storage and authentication of the last [0,320) bytes + // We assume A0-A4 ... D0-D4 hold at least inl (320 max) bytes of the stream data. 
+ cmp x2, #64 + b.lt Lseal_tail_64 + + // Store and authenticate 64B blocks per iteration + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v21.d[0] + mov x12, v21.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v22.d[0] + mov x12, 
v22.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v23.d[0] + mov x12, v23.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + st1 {v20.16b - v23.16b}, [x0], #64 + sub x2, x2, #64 + + // Shift the state left by 64 bytes for the next iteration of the loop + mov v0.16b, v1.16b + mov v5.16b, v6.16b + mov v10.16b, v11.16b + mov v15.16b, v16.16b + + mov v1.16b, v2.16b + mov v6.16b, v7.16b + mov v11.16b, v12.16b + mov v16.16b, v17.16b + + mov 
v2.16b, v3.16b + mov v7.16b, v8.16b + mov v12.16b, v13.16b + mov v17.16b, v18.16b + + mov v3.16b, v4.16b + mov v8.16b, v9.16b + mov v13.16b, v14.16b + mov v18.16b, v19.16b + + b Lseal_tail + +Lseal_tail_64: + ldp x3, x4, [x5, #48] // extra_in_len and extra_in_ptr + + // Here we handle the last [0,64) bytes of plaintext + cmp x2, #16 + b.lt Lseal_tail_16 + // Each iteration encrypt and authenticate a 16B block + ld1 {v20.16b}, [x1], #16 + eor v20.16b, v20.16b, v0.16b + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + st1 {v20.16b}, [x0], #16 + + sub x2, x2, #16 + + // Shift the state left by 16 bytes for the next iteration of the loop + mov v0.16b, v5.16b + mov v5.16b, v10.16b + mov v10.16b, v15.16b + + b Lseal_tail_64 + +Lseal_tail_16: + // Here we handle the last [0,16) bytes of ciphertext that require a padded block + cbz x2, Lseal_hash_extra + + eor v20.16b, v20.16b, v20.16b // Use T0 to load the plaintext/extra in + eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask that will only mask the ciphertext bytes + not v22.16b, v20.16b + + mov x6, x2 + add x1, x1, x2 + + cbz x4, Lseal_tail_16_compose // No extra data to pad with, zero padding + + mov x7, #16 // We 
need to load some extra_in first for padding + sub x7, x7, x2 + cmp x4, x7 + csel x7, x4, x7, lt // Load the minimum of extra_in_len and the amount needed to fill the register + mov x12, x7 + add x3, x3, x7 + sub x4, x4, x7 + +Lseal_tail16_compose_extra_in: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x3, #-1]! + mov v20.b[0], w11 + subs x7, x7, #1 + b.gt Lseal_tail16_compose_extra_in + + add x3, x3, x12 + +Lseal_tail_16_compose: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x1, #-1]! + mov v20.b[0], w11 + ext v21.16b, v22.16b, v21.16b, #15 + subs x2, x2, #1 + b.gt Lseal_tail_16_compose + + and v0.16b, v0.16b, v21.16b + eor v20.16b, v20.16b, v0.16b + mov v21.16b, v20.16b + +Lseal_tail_16_store: + umov w11, v20.b[0] + strb w11, [x0], #1 + ext v20.16b, v20.16b, v20.16b, #1 + subs x6, x6, #1 + b.gt Lseal_tail_16_store + + // Hash in the final ct block concatenated with extra_in + mov x11, v21.d[0] + mov x12, v21.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + +Lseal_hash_extra: + cbz x4, Lseal_finalize + +Lseal_hash_extra_loop: + cmp x4, #16 + b.lt Lseal_hash_extra_tail + ld1 {v20.16b}, [x3], #16 + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul 
x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + sub x4, x4, #16 + b Lseal_hash_extra_loop + +Lseal_hash_extra_tail: + cbz x4, Lseal_finalize + eor v20.16b, v20.16b, v20.16b // Use T0 to load the remaining extra ciphertext + add x3, x3, x4 + +Lseal_hash_extra_load: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x3, #-1]! 
+ mov v20.b[0], w11 + subs x4, x4, #1 + b.gt Lseal_hash_extra_load + + // Hash in the final padded extra_in block + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + +Lseal_finalize: + mov x11, v31.d[0] + mov x12, v31.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + // Final reduction step + sub x12, xzr, x15 + orr x13, xzr, #3 + subs x11, x8, #-5 + sbcs x12, x9, x12 + sbcs x13, x10, x13 + csel x8, x11, x8, cs + 
csel x9, x12, x9, cs + csel x10, x13, x10, cs + mov x11, v27.d[0] + mov x12, v27.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + + stp x8, x9, [x5] + + ldp d8, d9, [sp, #16] + ldp d10, d11, [sp, #32] + ldp d12, d13, [sp, #48] + ldp d14, d15, [sp, #64] +.cfi_restore b15 +.cfi_restore b14 +.cfi_restore b13 +.cfi_restore b12 +.cfi_restore b11 +.cfi_restore b10 +.cfi_restore b9 +.cfi_restore b8 + ldp x29, x30, [sp], 80 +.cfi_restore w29 +.cfi_restore w30 +.cfi_def_cfa_offset 0 + AARCH64_VALIDATE_LINK_REGISTER + ret + +Lseal_128: + // On some architectures preparing 5 blocks for small buffers is wasteful + eor v25.16b, v25.16b, v25.16b + mov x11, #1 + mov v25.s[0], w11 + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v2.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v7.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v12.16b, v29.16b + mov v17.16b, v30.16b + add v15.4s, v17.4s, v25.4s + add v16.4s, v15.4s, v25.4s + + mov x6, #10 + +Lseal_128_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, 
v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #4 + ext v6.16b, v6.16b, v6.16b, #4 + ext v7.16b, v7.16b, v7.16b, #4 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #12 + ext v16.16b, v16.16b, v16.16b, #12 + ext v17.16b, v17.16b, v17.16b, #12 + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #12 + ext v6.16b, v6.16b, v6.16b, #12 + ext v7.16b, v7.16b, v7.16b, #12 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #4 + ext v16.16b, v16.16b, v16.16b, #4 + ext v17.16b, v17.16b, v17.16b, 
#4 + subs x6, x6, #1 + b.hi Lseal_128_rounds + + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v2.4s, v2.4s, v24.4s + + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v7.4s, v7.4s, v28.4s + + // Only the first 32 bytes of the third block (counter = 0) are needed, + // so skip updating v12 and v17. + add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + + add v30.4s, v30.4s, v25.4s + add v15.4s, v15.4s, v30.4s + add v30.4s, v30.4s, v25.4s + add v16.4s, v16.4s, v30.4s + + and v2.16b, v2.16b, v27.16b + mov x16, v2.d[0] // Move the R key to GPRs + mov x17, v2.d[1] + mov v27.16b, v7.16b // Store the S key + + bl Lpoly_hash_ad_internal + b Lseal_tail +.cfi_endproc + + +///////////////////////////////// +// +// void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *aead_data); +// +.globl _chacha20_poly1305_open +.private_extern _chacha20_poly1305_open + +.align 6 +_chacha20_poly1305_open: + AARCH64_SIGN_LINK_REGISTER +.cfi_startproc + stp x29, x30, [sp, #-80]! +.cfi_def_cfa_offset 80 +.cfi_offset w30, -72 +.cfi_offset w29, -80 + mov x29, sp + // We probably could do .cfi_def_cfa w29, 80 at this point, but since + // we don't actually use the frame pointer like that, it's probably not + // worth bothering. 
+ stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] +.cfi_offset b15, -8 +.cfi_offset b14, -16 +.cfi_offset b13, -24 +.cfi_offset b12, -32 +.cfi_offset b11, -40 +.cfi_offset b10, -48 +.cfi_offset b9, -56 +.cfi_offset b8, -64 + + adrp x11, Lchacha20_consts@PAGE + add x11, x11, Lchacha20_consts@PAGEOFF + + ld1 {v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values + ld1 {v28.16b - v30.16b}, [x5] + + mov x15, #1 // Prepare the Poly1305 state + mov x8, #0 + mov x9, #0 + mov x10, #0 + + mov v31.d[0], x4 // Store the input and aad lengths + mov v31.d[1], x2 + + cmp x2, #128 + b.le Lopen_128 // Optimization for smaller buffers + + // Initially we prepare a single ChaCha20 block for the Poly1305 R and S keys + mov v0.16b, v24.16b + mov v5.16b, v28.16b + mov v10.16b, v29.16b + mov v15.16b, v30.16b + + mov x6, #10 + +.align 5 +Lopen_init_rounds: + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #4 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #12 + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #12 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #4 + subs x6, x6, #1 + b.hi Lopen_init_rounds + + add v0.4s, v0.4s, v24.4s + 
add v5.4s, v5.4s, v28.4s + + and v0.16b, v0.16b, v27.16b + mov x16, v0.d[0] // Move the R key to GPRs + mov x17, v0.d[1] + mov v27.16b, v5.16b // Store the S key + + bl Lpoly_hash_ad_internal + + mov x3, x1 + +// Each iteration of the loop hash 320 bytes, and prepare stream for 320 bytes +Lopen_main_loop: + + cmp x2, #192 + b.lt Lopen_tail + + adrp x11, Lchacha20_consts@PAGE + add x11, x11, Lchacha20_consts@PAGEOFF + + ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] + mov v4.16b, v24.16b + + ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 + mov v9.16b, v28.16b + + ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 + mov v14.16b, v29.16b + + ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] + sub x5, x5, #32 + add v15.4s, v15.4s, v25.4s + mov v19.16b, v30.16b + + eor v20.16b, v20.16b, v20.16b //zero + not v21.16b, v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + lsr x4, x2, #4 // How many whole blocks we have to hash, will always be at least 12 + sub x4, x4, #10 + + mov x7, #10 + subs x6, x7, x4 // itr1 can be negative if we have more than 320 bytes to hash + csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are full + + cbz x7, Lopen_main_loop_rounds_short + +.align 5 +Lopen_main_loop_rounds: + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc 
x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most +Lopen_main_loop_rounds_short: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v9.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v18.8h, v18.8h + rev32 v19.8h, v19.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + eor v8.16b, v8.16b, v13.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v9.4s, #20 + sli v8.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + add v4.4s, v4.4s, v8.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v18.16b, {v18.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v14.16b + + ushr v9.4s, v8.4s, #25 + sli v9.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + 
sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #4 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #12 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + add v0.4s, v0.4s, v6.4s + add v1.4s, v1.4s, v7.4s + add v2.4s, v2.4s, v8.4s + add v3.4s, v3.4s, v5.4s + add v4.4s, v4.4s, v9.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v18.8h, v18.8h + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v19.8h, v19.8h + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v5.4s, #20 + 
sli v8.4s, v5.4s, #12 + ushr v5.4s, v9.4s, #20 + sli v5.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v5.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v18.16b, {v18.16b}, v26.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v12.16b + eor v6.16b, v6.16b, v13.16b + eor v7.16b, v7.16b, v10.16b + eor v8.16b, v8.16b, v11.16b + eor v5.16b, v5.16b, v14.16b + + ushr v9.4s, v5.4s, #25 + sli v9.4s, v5.4s, #7 + ushr v5.4s, v8.4s, #25 + sli v5.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #12 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #4 + subs x7, x7, #1 + b.gt Lopen_main_loop_rounds + subs x6, x6, #1 + b.ge Lopen_main_loop_rounds_short + + eor v20.16b, v20.16b, v20.16b //zero + not v21.16b, v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + add v15.4s, v15.4s, v25.4s + mov x11, #5 + dup v20.4s, w11 + add v25.4s, v25.4s, v20.4s + + zip1 v20.4s, v0.4s, v1.4s + zip2 v21.4s, v0.4s, v1.4s + zip1 v22.4s, v2.4s, v3.4s + zip2 v23.4s, v2.4s, v3.4s + + zip1 v0.2d, v20.2d, v22.2d + zip2 v1.2d, v20.2d, v22.2d + zip1 v2.2d, v21.2d, v23.2d + zip2 v3.2d, v21.2d, v23.2d + + zip1 v20.4s, v5.4s, v6.4s + zip2 v21.4s, v5.4s, v6.4s + zip1 v22.4s, v7.4s, v8.4s + zip2 v23.4s, v7.4s, v8.4s + + zip1 v5.2d, v20.2d, v22.2d + zip2 v6.2d, v20.2d, v22.2d + 
zip1 v7.2d, v21.2d, v23.2d + zip2 v8.2d, v21.2d, v23.2d + + zip1 v20.4s, v10.4s, v11.4s + zip2 v21.4s, v10.4s, v11.4s + zip1 v22.4s, v12.4s, v13.4s + zip2 v23.4s, v12.4s, v13.4s + + zip1 v10.2d, v20.2d, v22.2d + zip2 v11.2d, v20.2d, v22.2d + zip1 v12.2d, v21.2d, v23.2d + zip2 v13.2d, v21.2d, v23.2d + + zip1 v20.4s, v15.4s, v16.4s + zip2 v21.4s, v15.4s, v16.4s + zip1 v22.4s, v17.4s, v18.4s + zip2 v23.4s, v17.4s, v18.4s + + zip1 v15.2d, v20.2d, v22.2d + zip2 v16.2d, v20.2d, v22.2d + zip1 v17.2d, v21.2d, v23.2d + zip2 v18.2d, v21.2d, v23.2d + + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + + add v1.4s, v1.4s, v24.4s + add v6.4s, v6.4s, v28.4s + add v11.4s, v11.4s, v29.4s + add v16.4s, v16.4s, v30.4s + + add v2.4s, v2.4s, v24.4s + add v7.4s, v7.4s, v28.4s + add v12.4s, v12.4s, v29.4s + add v17.4s, v17.4s, v30.4s + + add v3.4s, v3.4s, v24.4s + add v8.4s, v8.4s, v28.4s + add v13.4s, v13.4s, v29.4s + add v18.4s, v18.4s, v30.4s + + add v4.4s, v4.4s, v24.4s + add v9.4s, v9.4s, v28.4s + add v14.4s, v14.4s, v29.4s + add v19.4s, v19.4s, v30.4s + + // We can always safely store 192 bytes + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #192 + + mov v0.16b, v3.16b + mov v5.16b, v8.16b + mov v10.16b, v13.16b + mov v15.16b, v18.16b + + cmp x2, #64 + b.lt Lopen_tail_64_store + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v3.16b + eor 
v21.16b, v21.16b, v8.16b + eor v22.16b, v22.16b, v13.16b + eor v23.16b, v23.16b, v18.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #64 + + mov v0.16b, v4.16b + mov v5.16b, v9.16b + mov v10.16b, v14.16b + mov v15.16b, v19.16b + + cmp x2, #64 + b.lt Lopen_tail_64_store + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v4.16b + eor v21.16b, v21.16b, v9.16b + eor v22.16b, v22.16b, v14.16b + eor v23.16b, v23.16b, v19.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #64 + b Lopen_main_loop + +Lopen_tail: + + cbz x2, Lopen_finalize + + lsr x4, x2, #4 // How many whole blocks we have to hash + + cmp x2, #64 + b.le Lopen_tail_64 + cmp x2, #128 + b.le Lopen_tail_128 + +Lopen_tail_192: + // We need three more blocks + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v2.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v7.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v12.16b, v29.16b + mov v15.16b, v30.16b + mov v16.16b, v30.16b + mov v17.16b, v30.16b + eor v23.16b, v23.16b, v23.16b + eor v21.16b, v21.16b, v21.16b + ins v23.s[0], v25.s[0] + ins v21.d[0], x15 + + add v22.4s, v23.4s, v21.4s + add v21.4s, v22.4s, v21.4s + + add v15.4s, v15.4s, v21.4s + add v16.4s, v16.4s, v23.4s + add v17.4s, v17.4s, v22.4s + + mov x7, #10 + subs x6, x7, x4 // itr1 can be negative if we have more than 160 bytes to hash + csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are hashing + sub x4, x4, x7 + + cbz x7, Lopen_tail_192_rounds_no_hash + +Lopen_tail_192_rounds: + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul 
x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most +Lopen_tail_192_rounds_no_hash: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #4 + ext v6.16b, v6.16b, v6.16b, #4 + ext v7.16b, v7.16b, v7.16b, #4 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #12 + ext v16.16b, v16.16b, v16.16b, #12 + ext v17.16b, v17.16b, v17.16b, #12 + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, 
v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #12 + ext v6.16b, v6.16b, v6.16b, #12 + ext v7.16b, v7.16b, v7.16b, #12 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #4 + ext v16.16b, v16.16b, v16.16b, #4 + ext v17.16b, v17.16b, v17.16b, #4 + subs x7, x7, #1 + b.gt Lopen_tail_192_rounds + subs x6, x6, #1 + b.ge Lopen_tail_192_rounds_no_hash + + // We hashed 160 bytes at most, may still have 32 bytes left +Lopen_tail_192_hash: + cbz x4, Lopen_tail_192_hash_done + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, 
x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + sub x4, x4, #1 + b Lopen_tail_192_hash + +Lopen_tail_192_hash_done: + + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v2.4s, v2.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v7.4s, v7.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + add v12.4s, v12.4s, v29.4s + add v15.4s, v15.4s, v30.4s + add v16.4s, v16.4s, v30.4s + add v17.4s, v17.4s, v30.4s + + add v15.4s, v15.4s, v21.4s + add v16.4s, v16.4s, v23.4s + add v17.4s, v17.4s, v22.4s + + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #128 + b Lopen_tail_64_store + +Lopen_tail_128: + // We need two more blocks + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v15.16b, v30.16b + mov v16.16b, v30.16b + eor v23.16b, v23.16b, v23.16b + eor v22.16b, v22.16b, v22.16b + ins v23.s[0], v25.s[0] + ins v22.d[0], x15 + add v22.4s, v22.4s, v23.4s + + add v15.4s, v15.4s, v22.4s + add v16.4s, v16.4s, v23.4s + + mov x6, #10 + sub x6, x6, x4 + +Lopen_tail_128_rounds: + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + 
rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #4 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #12 + add v1.4s, v1.4s, v6.4s + eor v16.16b, v16.16b, v1.16b + rev32 v16.8h, v16.8h + + add v11.4s, v11.4s, v16.4s + eor v6.16b, v6.16b, v11.16b + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + add v1.4s, v1.4s, v20.4s + eor v16.16b, v16.16b, v1.16b + tbl v16.16b, {v16.16b}, v26.16b + + add v11.4s, v11.4s, v16.4s + eor v20.16b, v20.16b, v11.16b + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + ext v6.16b, v6.16b, v6.16b, #4 + ext v11.16b, v11.16b, v11.16b, #8 + ext v16.16b, v16.16b, v16.16b, #12 + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #12 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #4 + add v1.4s, v1.4s, v6.4s + eor v16.16b, v16.16b, v1.16b + rev32 v16.8h, v16.8h + + add v11.4s, v11.4s, v16.4s + eor v6.16b, v6.16b, v11.16b + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + add v1.4s, v1.4s, v20.4s + eor v16.16b, v16.16b, v1.16b + tbl v16.16b, {v16.16b}, v26.16b + + add v11.4s, v11.4s, v16.4s + eor v20.16b, v20.16b, v11.16b + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + ext v6.16b, v6.16b, v6.16b, #12 + ext v11.16b, v11.16b, v11.16b, #8 + ext v16.16b, v16.16b, v16.16b, #4 + subs x6, x6, #1 + b.gt Lopen_tail_128_rounds 
+ cbz x4, Lopen_tail_128_rounds_done + subs x4, x4, #1 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + b Lopen_tail_128_rounds + +Lopen_tail_128_rounds_done: + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + add v15.4s, v15.4s, v30.4s + add v16.4s, v16.4s, v30.4s + add v15.4s, v15.4s, v22.4s + add v16.4s, v16.4s, v23.4s + + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + sub x2, x2, #64 + + b Lopen_tail_64_store + +Lopen_tail_64: + // We just need a single block + mov v0.16b, v24.16b + mov v5.16b, v28.16b + mov v10.16b, v29.16b + mov v15.16b, v30.16b + eor v23.16b, v23.16b, v23.16b + ins v23.s[0], v25.s[0] + add v15.4s, v15.4s, v23.4s + + mov x6, #10 + sub x6, x6, x4 + +Lopen_tail_64_rounds: + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor 
v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #4 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #12 + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #12 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #4 + subs x6, x6, #1 + b.gt Lopen_tail_64_rounds + cbz x4, Lopen_tail_64_rounds_done + subs x4, x4, #1 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + b Lopen_tail_64_rounds + +Lopen_tail_64_rounds_done: + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + add v15.4s, v15.4s, v23.4s + +Lopen_tail_64_store: + cmp x2, #16 + b.lt Lopen_tail_16 + + ld1 {v20.16b}, 
[x1], #16 + eor v20.16b, v20.16b, v0.16b + st1 {v20.16b}, [x0], #16 + mov v0.16b, v5.16b + mov v5.16b, v10.16b + mov v10.16b, v15.16b + sub x2, x2, #16 + b Lopen_tail_64_store + +Lopen_tail_16: + // Here we handle the last [0,16) bytes that require a padded block + cbz x2, Lopen_finalize + + eor v20.16b, v20.16b, v20.16b // Use T0 to load the ciphertext + eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask + not v22.16b, v20.16b + + add x7, x1, x2 + mov x6, x2 + +Lopen_tail_16_compose: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x7, #-1]! + mov v20.b[0], w11 + ext v21.16b, v22.16b, v21.16b, #15 + subs x2, x2, #1 + b.gt Lopen_tail_16_compose + + and v20.16b, v20.16b, v21.16b + // Hash in the final padded block + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + eor v20.16b, v20.16b, v0.16b + +Lopen_tail_16_store: + umov w11, v20.b[0] + strb w11, [x0], #1 + ext v20.16b, v20.16b, v20.16b, #1 + subs x6, x6, #1 + b.gt Lopen_tail_16_store + +Lopen_finalize: + mov x11, v31.d[0] + mov x12, v31.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, 
x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + // Final reduction step + sub x12, xzr, x15 + orr x13, xzr, #3 + subs x11, x8, #-5 + sbcs x12, x9, x12 + sbcs x13, x10, x13 + csel x8, x11, x8, cs + csel x9, x12, x9, cs + csel x10, x13, x10, cs + mov x11, v27.d[0] + mov x12, v27.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + + stp x8, x9, [x5] + + ldp d8, d9, [sp, #16] + ldp d10, d11, [sp, #32] + ldp d12, d13, [sp, #48] + ldp d14, d15, [sp, #64] +.cfi_restore b15 +.cfi_restore b14 +.cfi_restore b13 +.cfi_restore b12 +.cfi_restore b11 +.cfi_restore b10 +.cfi_restore b9 +.cfi_restore b8 + ldp x29, x30, [sp], 80 +.cfi_restore w29 +.cfi_restore w30 +.cfi_def_cfa_offset 0 + AARCH64_VALIDATE_LINK_REGISTER + ret + +Lopen_128: + // On some architectures preparing 5 blocks for small buffers is wasteful + eor v25.16b, v25.16b, v25.16b + mov x11, #1 + mov v25.s[0], w11 + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v2.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v7.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v12.16b, v29.16b + mov v17.16b, v30.16b + add v15.4s, v17.4s, v25.4s + add v16.4s, v15.4s, v25.4s + + mov x6, #10 + +Lopen_128_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, 
v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #4 + ext v6.16b, v6.16b, v6.16b, #4 + ext v7.16b, v7.16b, v7.16b, #4 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #12 + ext v16.16b, v16.16b, v16.16b, #12 + ext v17.16b, v17.16b, v17.16b, #12 + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor 
v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #12 + ext v6.16b, v6.16b, v6.16b, #12 + ext v7.16b, v7.16b, v7.16b, #12 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #4 + ext v16.16b, v16.16b, v16.16b, #4 + ext v17.16b, v17.16b, v17.16b, #4 + subs x6, x6, #1 + b.hi Lopen_128_rounds + + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v2.4s, v2.4s, v24.4s + + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v7.4s, v7.4s, v28.4s + + add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + + add v30.4s, v30.4s, v25.4s + add v15.4s, v15.4s, v30.4s + add v30.4s, v30.4s, v25.4s + add v16.4s, v16.4s, v30.4s + + and v2.16b, v2.16b, v27.16b + mov x16, v2.d[0] // Move the R key to GPRs + mov x17, v2.d[1] + mov v27.16b, v7.16b // Store the S key + + bl Lpoly_hash_ad_internal + +Lopen_128_store: + cmp x2, #64 + b.lt Lopen_128_store_64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, 
x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v21.d[0] + mov x12, v21.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v22.d[0] + mov x12, v22.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this 
point acc2 has the value of 4 at most + mov x11, v23.d[0] + mov x12, v23.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #64 + + mov v0.16b, v1.16b + mov v5.16b, v6.16b + mov v10.16b, v11.16b + mov v15.16b, v16.16b + +Lopen_128_store_64: + + lsr x4, x2, #4 + mov x3, x1 + +Lopen_128_hash_64: + cbz x4, Lopen_tail_64_store + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and 
t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + sub x4, x4, #1 + b Lopen_128_hash_64 +.cfi_endproc + +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) diff --git a/third_party/boringssl/gen/crypto/chacha20_poly1305_armv8-linux.S b/third_party/boringssl/gen/crypto/chacha20_poly1305_armv8-linux.S new file mode 100644 index 00000000..0ab662fb --- /dev/null +++ b/third_party/boringssl/gen/crypto/chacha20_poly1305_armv8-linux.S @@ -0,0 +1,3006 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) +.section .rodata + +.align 7 +.Lchacha20_consts: // ASCII for: expand 32-byte k (loaded as CONSTS) +.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' +.Linc: // Per-lane counter increments (loaded as INC) +.long 1,2,3,4 +.Lrol8: // Byte-shuffle indices used with tbl to rotate each 32-bit lane left by 8 (loaded as ROL8) +.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 +.Lclamp: // Loaded as CLAMP; presumably the AND mask that clamps the Poly1305 R key - TODO confirm +.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC + +.text + +.type .Lpoly_hash_ad_internal,%function +.align 6 +.Lpoly_hash_ad_internal: // Absorbs x4 bytes of AAD at x3 into the Poly1305 accumulator [x10:x9:x8] with r = [x17:x16]; a trailing partial block is zero-padded. Scratch: x11-x14, v20 +.cfi_startproc + cbnz x4, .Lpoly_hash_intro // Keep going while bytes remain + ret // x4 == 0: nothing left to hash + +.Lpoly_hash_intro: + cmp x4, #16 + b.lt .Lpoly_hash_ad_tail // Fewer than 16 bytes left: take the padded-tail path + ldp x11, x12, [x3], 16 // Load one full 16-byte block and advance x3 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 // acc2 += x15 + carry; x15 is presumably the constant 1 pad bit set up by the caller - TODO confirm + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, 
x10, xzr // At this point acc2 has the value of 4 at most + sub x4, x4, #16 // 16 bytes consumed + b .Lpoly_hash_ad_internal // Back to the top, which re-tests x4 against zero + +.Lpoly_hash_ad_tail: + cbz x4, .Lpoly_hash_ad_ret // NOTE(review): looks redundant when arriving from .Lpoly_hash_intro, which is only entered with x4 != 0 + + eor v20.16b, v20.16b, v20.16b // Use T0 to load the AAD + sub x4, x4, #1 // x4 = offset of the last remaining byte + +.Lpoly_hash_tail_16_compose: + ext v20.16b, v20.16b, v20.16b, #15 // Rotate by one byte so earlier bytes move up a lane and lane 0 can be overwritten + ldrb w11, [x3, x4] // Bytes are fetched from last to first + mov v20.b[0], w11 + subs x4, x4, #1 + b.ge .Lpoly_hash_tail_16_compose // Loop until offset 0 has been inserted; x4 ends at -1 + mov x11, v20.d[0] // Move the zero-padded block into GPRs + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + +.Lpoly_hash_ad_ret: + ret +.cfi_endproc +.size .Lpoly_hash_ad_internal, .-.Lpoly_hash_ad_internal + +///////////////////////////////// +// +// void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *seal_data); +// +.globl chacha20_poly1305_seal +.hidden chacha20_poly1305_seal +.type chacha20_poly1305_seal,%function +.align 6 +chacha20_poly1305_seal: + AARCH64_SIGN_LINK_REGISTER +.cfi_startproc + stp x29, x30, [sp, #-80]! +.cfi_def_cfa_offset 80 +.cfi_offset w30, -72 +.cfi_offset w29, -80 + mov x29, sp + // We probably could do .cfi_def_cfa w29, 80 at this point, but since + // we don't actually use the frame pointer like that, it's probably not + // worth bothering. 
+ stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] +.cfi_offset b15, -8 +.cfi_offset b14, -16 +.cfi_offset b13, -24 +.cfi_offset b12, -32 +.cfi_offset b11, -40 +.cfi_offset b10, -48 +.cfi_offset b9, -56 +.cfi_offset b8, -64 + + adrp x11, .Lchacha20_consts + add x11, x11, :lo12:.Lchacha20_consts + + ld1 {v24.16b - v27.16b}, [x11] // .Load the CONSTS, INC, ROL8 and CLAMP values + ld1 {v28.16b - v30.16b}, [x5] + + mov x15, #1 // Prepare the Poly1305 state + mov x8, #0 + mov x9, #0 + mov x10, #0 + + ldr x12, [x5, #56] // The total cipher text length includes extra_in_len + add x12, x12, x2 + mov v31.d[0], x4 // Store the input and aad lengths + mov v31.d[1], x12 + + cmp x2, #128 + b.le .Lseal_128 // Optimization for smaller buffers + + // Initially we prepare 5 ChaCha20 blocks. Four to encrypt up to 4 blocks (256 bytes) of plaintext, + // and one for the Poly1305 R and S keys. The first four blocks (A0-A3..D0-D3) are computed vertically, + // the fifth block (A4-D4) horizontally. 
+ ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] + mov v4.16b, v24.16b + + ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 + mov v9.16b, v28.16b + + ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 + mov v14.16b, v29.16b + + ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] + add v15.4s, v15.4s, v25.4s + mov v19.16b, v30.16b + + sub x5, x5, #32 + + mov x6, #10 + +.align 5 +.Lseal_init_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v9.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v18.8h, v18.8h + rev32 v19.8h, v19.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + eor v8.16b, v8.16b, v13.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v9.4s, #20 + sli v8.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + add v4.4s, v4.4s, v8.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v18.16b, {v18.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor 
v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v14.16b + + ushr v9.4s, v8.4s, #25 + sli v9.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #4 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #12 + add v0.4s, v0.4s, v6.4s + add v1.4s, v1.4s, v7.4s + add v2.4s, v2.4s, v8.4s + add v3.4s, v3.4s, v5.4s + add v4.4s, v4.4s, v9.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v18.8h, v18.8h + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v19.8h, v19.8h + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v5.4s, #20 + sli v8.4s, v5.4s, #12 + ushr v5.4s, v9.4s, #20 + sli v5.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v5.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v18.16b, {v18.16b}, v26.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + 
+ eor v20.16b, v20.16b, v12.16b + eor v6.16b, v6.16b, v13.16b + eor v7.16b, v7.16b, v10.16b + eor v8.16b, v8.16b, v11.16b + eor v5.16b, v5.16b, v14.16b + + ushr v9.4s, v5.4s, #25 + sli v9.4s, v5.4s, #7 + ushr v5.4s, v8.4s, #25 + sli v5.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #12 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #4 + subs x6, x6, #1 + b.hi .Lseal_init_rounds + + add v15.4s, v15.4s, v25.4s + mov x11, #4 + dup v20.4s, w11 + add v25.4s, v25.4s, v20.4s + + zip1 v20.4s, v0.4s, v1.4s + zip2 v21.4s, v0.4s, v1.4s + zip1 v22.4s, v2.4s, v3.4s + zip2 v23.4s, v2.4s, v3.4s + + zip1 v0.2d, v20.2d, v22.2d + zip2 v1.2d, v20.2d, v22.2d + zip1 v2.2d, v21.2d, v23.2d + zip2 v3.2d, v21.2d, v23.2d + + zip1 v20.4s, v5.4s, v6.4s + zip2 v21.4s, v5.4s, v6.4s + zip1 v22.4s, v7.4s, v8.4s + zip2 v23.4s, v7.4s, v8.4s + + zip1 v5.2d, v20.2d, v22.2d + zip2 v6.2d, v20.2d, v22.2d + zip1 v7.2d, v21.2d, v23.2d + zip2 v8.2d, v21.2d, v23.2d + + zip1 v20.4s, v10.4s, v11.4s + zip2 v21.4s, v10.4s, v11.4s + zip1 v22.4s, v12.4s, v13.4s + zip2 v23.4s, v12.4s, v13.4s + + zip1 v10.2d, v20.2d, v22.2d + zip2 v11.2d, v20.2d, v22.2d + zip1 v12.2d, v21.2d, v23.2d + zip2 v13.2d, v21.2d, v23.2d + + zip1 v20.4s, v15.4s, v16.4s + zip2 v21.4s, v15.4s, v16.4s + zip1 v22.4s, v17.4s, v18.4s + zip2 v23.4s, v17.4s, v18.4s + + zip1 v15.2d, v20.2d, v22.2d + zip2 v16.2d, v20.2d, v22.2d + zip1 v17.2d, v21.2d, v23.2d + zip2 v18.2d, v21.2d, v23.2d + + add v4.4s, v4.4s, v24.4s + add v9.4s, v9.4s, v28.4s + and v4.16b, v4.16b, v27.16b + + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + + add v1.4s, v1.4s, v24.4s + add v6.4s, v6.4s, v28.4s + add v11.4s, v11.4s, v29.4s + add v16.4s, v16.4s, v30.4s + + add v2.4s, v2.4s, v24.4s + add v7.4s, v7.4s, v28.4s + add v12.4s, v12.4s, v29.4s + add 
v17.4s, v17.4s, v30.4s + + add v3.4s, v3.4s, v24.4s + add v8.4s, v8.4s, v28.4s + add v13.4s, v13.4s, v29.4s + add v18.4s, v18.4s, v30.4s + + mov x16, v4.d[0] // Move the R key to GPRs + mov x17, v4.d[1] + mov v27.16b, v9.16b // Store the S key + + bl .Lpoly_hash_ad_internal + + mov x3, x0 + cmp x2, #256 + b.le .Lseal_tail + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v3.16b + eor v21.16b, v21.16b, v8.16b + eor v22.16b, v22.16b, v13.16b + eor v23.16b, v23.16b, v18.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #256 + + mov x6, #4 // In the first run of the loop we need to hash 256 bytes, therefore we hash one block for the first 4 rounds + mov x7, #6 // and two blocks for the remaining 6, for a total of (1 * 4 + 2 * 6) * 16 = 256 + +.Lseal_main_loop: + adrp x11, .Lchacha20_consts + add x11, x11, :lo12:.Lchacha20_consts + + ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] + mov v4.16b, v24.16b + + ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 + mov v9.16b, v28.16b + + ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 + mov v14.16b, v29.16b + + ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] + add v15.4s, v15.4s, v25.4s + mov v19.16b, v30.16b + + eor v20.16b, v20.16b, v20.16b //zero + not v21.16b, v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + sub x5, x5, #32 
+.align 5 +.Lseal_main_loop_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v9.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v18.8h, v18.8h + rev32 v19.8h, v19.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + eor v8.16b, v8.16b, v13.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v9.4s, #20 + sli v8.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + add v4.4s, v4.4s, v8.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v18.16b, {v18.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v14.16b + + ushr v9.4s, v8.4s, #25 + sli v9.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + 
ext v9.16b, v9.16b, v9.16b, #4 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #12 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + add v0.4s, v0.4s, v6.4s + add v1.4s, v1.4s, v7.4s + add v2.4s, v2.4s, v8.4s + add v3.4s, v3.4s, v5.4s + add v4.4s, v4.4s, v9.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v18.8h, v18.8h + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v19.8h, v19.8h + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v5.4s, #20 + sli v8.4s, v5.4s, #12 + ushr v5.4s, v9.4s, #20 + sli v5.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, 
v8.4s + add v4.4s, v4.4s, v5.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v18.16b, {v18.16b}, v26.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v12.16b + eor v6.16b, v6.16b, v13.16b + eor v7.16b, v7.16b, v10.16b + eor v8.16b, v8.16b, v11.16b + eor v5.16b, v5.16b, v14.16b + + ushr v9.4s, v5.4s, #25 + sli v9.4s, v5.4s, #7 + ushr v5.4s, v8.4s, #25 + sli v5.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #12 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #4 + subs x6, x6, #1 + b.ge .Lseal_main_loop_rounds + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + subs x7, x7, #1 + b.gt .Lseal_main_loop_rounds + + eor v20.16b, v20.16b, v20.16b //zero + not 
v21.16b, v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + add v15.4s, v15.4s, v25.4s + mov x11, #5 + dup v20.4s, w11 + add v25.4s, v25.4s, v20.4s + + zip1 v20.4s, v0.4s, v1.4s + zip2 v21.4s, v0.4s, v1.4s + zip1 v22.4s, v2.4s, v3.4s + zip2 v23.4s, v2.4s, v3.4s + + zip1 v0.2d, v20.2d, v22.2d + zip2 v1.2d, v20.2d, v22.2d + zip1 v2.2d, v21.2d, v23.2d + zip2 v3.2d, v21.2d, v23.2d + + zip1 v20.4s, v5.4s, v6.4s + zip2 v21.4s, v5.4s, v6.4s + zip1 v22.4s, v7.4s, v8.4s + zip2 v23.4s, v7.4s, v8.4s + + zip1 v5.2d, v20.2d, v22.2d + zip2 v6.2d, v20.2d, v22.2d + zip1 v7.2d, v21.2d, v23.2d + zip2 v8.2d, v21.2d, v23.2d + + zip1 v20.4s, v10.4s, v11.4s + zip2 v21.4s, v10.4s, v11.4s + zip1 v22.4s, v12.4s, v13.4s + zip2 v23.4s, v12.4s, v13.4s + + zip1 v10.2d, v20.2d, v22.2d + zip2 v11.2d, v20.2d, v22.2d + zip1 v12.2d, v21.2d, v23.2d + zip2 v13.2d, v21.2d, v23.2d + + zip1 v20.4s, v15.4s, v16.4s + zip2 v21.4s, v15.4s, v16.4s + zip1 v22.4s, v17.4s, v18.4s + zip2 v23.4s, v17.4s, v18.4s + + zip1 v15.2d, v20.2d, v22.2d + zip2 v16.2d, v20.2d, v22.2d + zip1 v17.2d, v21.2d, v23.2d + zip2 v18.2d, v21.2d, v23.2d + + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + + add v1.4s, v1.4s, v24.4s + add v6.4s, v6.4s, v28.4s + add v11.4s, v11.4s, v29.4s + add v16.4s, v16.4s, v30.4s + + add v2.4s, v2.4s, v24.4s + add v7.4s, v7.4s, v28.4s + add v12.4s, v12.4s, v29.4s + add v17.4s, v17.4s, v30.4s + + add v3.4s, v3.4s, v24.4s + add v8.4s, v8.4s, v28.4s + add v13.4s, v13.4s, v29.4s + add v18.4s, v18.4s, v30.4s + + add v4.4s, v4.4s, v24.4s + add v9.4s, v9.4s, v28.4s + add v14.4s, v14.4s, v29.4s + add v19.4s, v19.4s, v30.4s + + cmp x2, #320 + b.le .Lseal_tail + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + st1 {v20.16b - 
v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v3.16b + eor v21.16b, v21.16b, v8.16b + eor v22.16b, v22.16b, v13.16b + eor v23.16b, v23.16b, v18.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v4.16b + eor v21.16b, v21.16b, v9.16b + eor v22.16b, v22.16b, v14.16b + eor v23.16b, v23.16b, v19.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #320 + + mov x6, #0 + mov x7, #10 // For the remainder of the loop we always hash and encrypt 320 bytes per iteration + + b .Lseal_main_loop + +.Lseal_tail: + // This part of the function handles the storage and authentication of the last [0,320) bytes + // We assume A0-A4 ... D0-D4 hold at least inl (320 max) bytes of the stream data. 
+ cmp x2, #64 + b.lt .Lseal_tail_64 + + // Store and authenticate 64B blocks per iteration + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v21.d[0] + mov x12, v21.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v22.d[0] + mov x12, 
v22.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v23.d[0] + mov x12, v23.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + st1 {v20.16b - v23.16b}, [x0], #64 + sub x2, x2, #64 + + // Shift the state left by 64 bytes for the next iteration of the loop + mov v0.16b, v1.16b + mov v5.16b, v6.16b + mov v10.16b, v11.16b + mov v15.16b, v16.16b + + mov v1.16b, v2.16b + mov v6.16b, v7.16b + mov v11.16b, v12.16b + mov v16.16b, v17.16b + + mov 
v2.16b, v3.16b + mov v7.16b, v8.16b + mov v12.16b, v13.16b + mov v17.16b, v18.16b + + mov v3.16b, v4.16b + mov v8.16b, v9.16b + mov v13.16b, v14.16b + mov v18.16b, v19.16b + + b .Lseal_tail + +.Lseal_tail_64: + ldp x3, x4, [x5, #48] // extra_in_len and extra_in_ptr + + // Here we handle the last [0,64) bytes of plaintext + cmp x2, #16 + b.lt .Lseal_tail_16 + // Each iteration encrypt and authenticate a 16B block + ld1 {v20.16b}, [x1], #16 + eor v20.16b, v20.16b, v0.16b + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + st1 {v20.16b}, [x0], #16 + + sub x2, x2, #16 + + // Shift the state left by 16 bytes for the next iteration of the loop + mov v0.16b, v5.16b + mov v5.16b, v10.16b + mov v10.16b, v15.16b + + b .Lseal_tail_64 + +.Lseal_tail_16: + // Here we handle the last [0,16) bytes of ciphertext that require a padded block + cbz x2, .Lseal_hash_extra + + eor v20.16b, v20.16b, v20.16b // Use T0 to load the plaintext/extra in + eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask that will only mask the ciphertext bytes + not v22.16b, v20.16b + + mov x6, x2 + add x1, x1, x2 + + cbz x4, .Lseal_tail_16_compose // No extra data to pad with, zero padding + + mov x7, 
#16 // We need to load some extra_in first for padding + sub x7, x7, x2 + cmp x4, x7 + csel x7, x4, x7, lt // .Load the minimum of extra_in_len and the amount needed to fill the register + mov x12, x7 + add x3, x3, x7 + sub x4, x4, x7 + +.Lseal_tail16_compose_extra_in: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x3, #-1]! + mov v20.b[0], w11 + subs x7, x7, #1 + b.gt .Lseal_tail16_compose_extra_in + + add x3, x3, x12 + +.Lseal_tail_16_compose: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x1, #-1]! + mov v20.b[0], w11 + ext v21.16b, v22.16b, v21.16b, #15 + subs x2, x2, #1 + b.gt .Lseal_tail_16_compose + + and v0.16b, v0.16b, v21.16b + eor v20.16b, v20.16b, v0.16b + mov v21.16b, v20.16b + +.Lseal_tail_16_store: + umov w11, v20.b[0] + strb w11, [x0], #1 + ext v20.16b, v20.16b, v20.16b, #1 + subs x6, x6, #1 + b.gt .Lseal_tail_16_store + + // Hash in the final ct block concatenated with extra_in + mov x11, v21.d[0] + mov x12, v21.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + +.Lseal_hash_extra: + cbz x4, .Lseal_finalize + +.Lseal_hash_extra_loop: + cmp x4, #16 + b.lt .Lseal_hash_extra_tail + ld1 {v20.16b}, [x3], #16 + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc 
x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + sub x4, x4, #16 + b .Lseal_hash_extra_loop + +.Lseal_hash_extra_tail: + cbz x4, .Lseal_finalize + eor v20.16b, v20.16b, v20.16b // Use T0 to load the remaining extra ciphertext + add x3, x3, x4 + +.Lseal_hash_extra_load: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x3, #-1]! 
+ mov v20.b[0], w11 + subs x4, x4, #1 + b.gt .Lseal_hash_extra_load + + // Hash in the final padded extra_in block + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + +.Lseal_finalize: + mov x11, v31.d[0] + mov x12, v31.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + // Final reduction step + sub x12, xzr, x15 + orr x13, xzr, #3 + subs x11, x8, #-5 + sbcs x12, x9, x12 + sbcs x13, x10, x13 + csel x8, x11, x8, cs + 
csel x9, x12, x9, cs + csel x10, x13, x10, cs + mov x11, v27.d[0] + mov x12, v27.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + + stp x8, x9, [x5] + + ldp d8, d9, [sp, #16] + ldp d10, d11, [sp, #32] + ldp d12, d13, [sp, #48] + ldp d14, d15, [sp, #64] +.cfi_restore b15 +.cfi_restore b14 +.cfi_restore b13 +.cfi_restore b12 +.cfi_restore b11 +.cfi_restore b10 +.cfi_restore b9 +.cfi_restore b8 + ldp x29, x30, [sp], 80 +.cfi_restore w29 +.cfi_restore w30 +.cfi_def_cfa_offset 0 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.Lseal_128: + // On some architectures preparing 5 blocks for small buffers is wasteful + eor v25.16b, v25.16b, v25.16b + mov x11, #1 + mov v25.s[0], w11 + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v2.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v7.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v12.16b, v29.16b + mov v17.16b, v30.16b + add v15.4s, v17.4s, v25.4s + add v16.4s, v15.4s, v25.4s + + mov x6, #10 + +.Lseal_128_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, 
v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #4 + ext v6.16b, v6.16b, v6.16b, #4 + ext v7.16b, v7.16b, v7.16b, #4 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #12 + ext v16.16b, v16.16b, v16.16b, #12 + ext v17.16b, v17.16b, v17.16b, #12 + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #12 + ext v6.16b, v6.16b, v6.16b, #12 + ext v7.16b, v7.16b, v7.16b, #12 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #4 + ext v16.16b, v16.16b, v16.16b, #4 + ext v17.16b, v17.16b, v17.16b, 
#4 + subs x6, x6, #1 + b.hi .Lseal_128_rounds + + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v2.4s, v2.4s, v24.4s + + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v7.4s, v7.4s, v28.4s + + // Only the first 32 bytes of the third block (counter = 0) are needed, + // so skip updating v12 and v17. + add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + + add v30.4s, v30.4s, v25.4s + add v15.4s, v15.4s, v30.4s + add v30.4s, v30.4s, v25.4s + add v16.4s, v16.4s, v30.4s + + and v2.16b, v2.16b, v27.16b + mov x16, v2.d[0] // Move the R key to GPRs + mov x17, v2.d[1] + mov v27.16b, v7.16b // Store the S key + + bl .Lpoly_hash_ad_internal + b .Lseal_tail +.cfi_endproc +.size chacha20_poly1305_seal,.-chacha20_poly1305_seal + +///////////////////////////////// +// +// void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *aead_data); +// +.globl chacha20_poly1305_open +.hidden chacha20_poly1305_open +.type chacha20_poly1305_open,%function +.align 6 +chacha20_poly1305_open: + AARCH64_SIGN_LINK_REGISTER +.cfi_startproc + stp x29, x30, [sp, #-80]! +.cfi_def_cfa_offset 80 +.cfi_offset w30, -72 +.cfi_offset w29, -80 + mov x29, sp + // We probably could do .cfi_def_cfa w29, 80 at this point, but since + // we don't actually use the frame pointer like that, it's probably not + // worth bothering. 
+ stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] +.cfi_offset b15, -8 +.cfi_offset b14, -16 +.cfi_offset b13, -24 +.cfi_offset b12, -32 +.cfi_offset b11, -40 +.cfi_offset b10, -48 +.cfi_offset b9, -56 +.cfi_offset b8, -64 + + adrp x11, .Lchacha20_consts + add x11, x11, :lo12:.Lchacha20_consts + + ld1 {v24.16b - v27.16b}, [x11] // .Load the CONSTS, INC, ROL8 and CLAMP values + ld1 {v28.16b - v30.16b}, [x5] + + mov x15, #1 // Prepare the Poly1305 state + mov x8, #0 + mov x9, #0 + mov x10, #0 + + mov v31.d[0], x4 // Store the input and aad lengths + mov v31.d[1], x2 + + cmp x2, #128 + b.le .Lopen_128 // Optimization for smaller buffers + + // Initially we prepare a single ChaCha20 block for the Poly1305 R and S keys + mov v0.16b, v24.16b + mov v5.16b, v28.16b + mov v10.16b, v29.16b + mov v15.16b, v30.16b + + mov x6, #10 + +.align 5 +.Lopen_init_rounds: + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #4 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #12 + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #12 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #4 + subs x6, x6, #1 + b.hi .Lopen_init_rounds + + add v0.4s, v0.4s, v24.4s + add 
v5.4s, v5.4s, v28.4s + + and v0.16b, v0.16b, v27.16b + mov x16, v0.d[0] // Move the R key to GPRs + mov x17, v0.d[1] + mov v27.16b, v5.16b // Store the S key + + bl .Lpoly_hash_ad_internal + + mov x3, x1 + +// Each iteration of the loop hash 320 bytes, and prepare stream for 320 bytes +.Lopen_main_loop: + + cmp x2, #192 + b.lt .Lopen_tail + + adrp x11, .Lchacha20_consts + add x11, x11, :lo12:.Lchacha20_consts + + ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] + mov v4.16b, v24.16b + + ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 + mov v9.16b, v28.16b + + ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 + mov v14.16b, v29.16b + + ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] + sub x5, x5, #32 + add v15.4s, v15.4s, v25.4s + mov v19.16b, v30.16b + + eor v20.16b, v20.16b, v20.16b //zero + not v21.16b, v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + lsr x4, x2, #4 // How many whole blocks we have to hash, will always be at least 12 + sub x4, x4, #10 + + mov x7, #10 + subs x6, x7, x4 // itr1 can be negative if we have more than 320 bytes to hash + csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are full + + cbz x7, .Lopen_main_loop_rounds_short + +.align 5 +.Lopen_main_loop_rounds: + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, 
x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most +.Lopen_main_loop_rounds_short: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v9.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v18.8h, v18.8h + rev32 v19.8h, v19.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + eor v8.16b, v8.16b, v13.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v9.4s, #20 + sli v8.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + add v4.4s, v4.4s, v8.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v18.16b, {v18.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v14.16b + + ushr v9.4s, v8.4s, #25 + sli v9.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli 
v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #4 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #12 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + add v0.4s, v0.4s, v6.4s + add v1.4s, v1.4s, v7.4s + add v2.4s, v2.4s, v8.4s + add v3.4s, v3.4s, v5.4s + add v4.4s, v4.4s, v9.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v18.8h, v18.8h + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v19.8h, v19.8h + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v5.4s, #20 + sli 
v8.4s, v5.4s, #12 + ushr v5.4s, v9.4s, #20 + sli v5.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v5.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v18.16b, {v18.16b}, v26.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v12.16b + eor v6.16b, v6.16b, v13.16b + eor v7.16b, v7.16b, v10.16b + eor v8.16b, v8.16b, v11.16b + eor v5.16b, v5.16b, v14.16b + + ushr v9.4s, v5.4s, #25 + sli v9.4s, v5.4s, #7 + ushr v5.4s, v8.4s, #25 + sli v5.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #12 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #4 + subs x7, x7, #1 + b.gt .Lopen_main_loop_rounds + subs x6, x6, #1 + b.ge .Lopen_main_loop_rounds_short + + eor v20.16b, v20.16b, v20.16b //zero + not v21.16b, v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + add v15.4s, v15.4s, v25.4s + mov x11, #5 + dup v20.4s, w11 + add v25.4s, v25.4s, v20.4s + + zip1 v20.4s, v0.4s, v1.4s + zip2 v21.4s, v0.4s, v1.4s + zip1 v22.4s, v2.4s, v3.4s + zip2 v23.4s, v2.4s, v3.4s + + zip1 v0.2d, v20.2d, v22.2d + zip2 v1.2d, v20.2d, v22.2d + zip1 v2.2d, v21.2d, v23.2d + zip2 v3.2d, v21.2d, v23.2d + + zip1 v20.4s, v5.4s, v6.4s + zip2 v21.4s, v5.4s, v6.4s + zip1 v22.4s, v7.4s, v8.4s + zip2 v23.4s, v7.4s, v8.4s + + zip1 v5.2d, v20.2d, v22.2d + zip2 v6.2d, v20.2d, v22.2d + zip1 
v7.2d, v21.2d, v23.2d + zip2 v8.2d, v21.2d, v23.2d + + zip1 v20.4s, v10.4s, v11.4s + zip2 v21.4s, v10.4s, v11.4s + zip1 v22.4s, v12.4s, v13.4s + zip2 v23.4s, v12.4s, v13.4s + + zip1 v10.2d, v20.2d, v22.2d + zip2 v11.2d, v20.2d, v22.2d + zip1 v12.2d, v21.2d, v23.2d + zip2 v13.2d, v21.2d, v23.2d + + zip1 v20.4s, v15.4s, v16.4s + zip2 v21.4s, v15.4s, v16.4s + zip1 v22.4s, v17.4s, v18.4s + zip2 v23.4s, v17.4s, v18.4s + + zip1 v15.2d, v20.2d, v22.2d + zip2 v16.2d, v20.2d, v22.2d + zip1 v17.2d, v21.2d, v23.2d + zip2 v18.2d, v21.2d, v23.2d + + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + + add v1.4s, v1.4s, v24.4s + add v6.4s, v6.4s, v28.4s + add v11.4s, v11.4s, v29.4s + add v16.4s, v16.4s, v30.4s + + add v2.4s, v2.4s, v24.4s + add v7.4s, v7.4s, v28.4s + add v12.4s, v12.4s, v29.4s + add v17.4s, v17.4s, v30.4s + + add v3.4s, v3.4s, v24.4s + add v8.4s, v8.4s, v28.4s + add v13.4s, v13.4s, v29.4s + add v18.4s, v18.4s, v30.4s + + add v4.4s, v4.4s, v24.4s + add v9.4s, v9.4s, v28.4s + add v14.4s, v14.4s, v29.4s + add v19.4s, v19.4s, v30.4s + + // We can always safely store 192 bytes + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #192 + + mov v0.16b, v3.16b + mov v5.16b, v8.16b + mov v10.16b, v13.16b + mov v15.16b, v18.16b + + cmp x2, #64 + b.lt .Lopen_tail_64_store + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v3.16b + eor v21.16b, 
v21.16b, v8.16b + eor v22.16b, v22.16b, v13.16b + eor v23.16b, v23.16b, v18.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #64 + + mov v0.16b, v4.16b + mov v5.16b, v9.16b + mov v10.16b, v14.16b + mov v15.16b, v19.16b + + cmp x2, #64 + b.lt .Lopen_tail_64_store + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v4.16b + eor v21.16b, v21.16b, v9.16b + eor v22.16b, v22.16b, v14.16b + eor v23.16b, v23.16b, v19.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #64 + b .Lopen_main_loop + +.Lopen_tail: + + cbz x2, .Lopen_finalize + + lsr x4, x2, #4 // How many whole blocks we have to hash + + cmp x2, #64 + b.le .Lopen_tail_64 + cmp x2, #128 + b.le .Lopen_tail_128 + +.Lopen_tail_192: + // We need three more blocks + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v2.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v7.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v12.16b, v29.16b + mov v15.16b, v30.16b + mov v16.16b, v30.16b + mov v17.16b, v30.16b + eor v23.16b, v23.16b, v23.16b + eor v21.16b, v21.16b, v21.16b + ins v23.s[0], v25.s[0] + ins v21.d[0], x15 + + add v22.4s, v23.4s, v21.4s + add v21.4s, v22.4s, v21.4s + + add v15.4s, v15.4s, v21.4s + add v16.4s, v16.4s, v23.4s + add v17.4s, v17.4s, v22.4s + + mov x7, #10 + subs x6, x7, x4 // itr1 can be negative if we have more than 160 bytes to hash + csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are hashing + sub x4, x4, x7 + + cbz x7, .Lopen_tail_192_rounds_no_hash + +.Lopen_tail_192_rounds: + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul 
x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most +.Lopen_tail_192_rounds_no_hash: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #4 + ext v6.16b, v6.16b, v6.16b, #4 + ext v7.16b, v7.16b, v7.16b, #4 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #12 + ext v16.16b, v16.16b, v16.16b, #12 + ext v17.16b, v17.16b, v17.16b, #12 + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add 
v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #12 + ext v6.16b, v6.16b, v6.16b, #12 + ext v7.16b, v7.16b, v7.16b, #12 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #4 + ext v16.16b, v16.16b, v16.16b, #4 + ext v17.16b, v17.16b, v17.16b, #4 + subs x7, x7, #1 + b.gt .Lopen_tail_192_rounds + subs x6, x6, #1 + b.ge .Lopen_tail_192_rounds_no_hash + + // We hashed 160 bytes at most, may still have 32 bytes left +.Lopen_tail_192_hash: + cbz x4, .Lopen_tail_192_hash_done + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh 
x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + sub x4, x4, #1 + b .Lopen_tail_192_hash + +.Lopen_tail_192_hash_done: + + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v2.4s, v2.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v7.4s, v7.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + add v12.4s, v12.4s, v29.4s + add v15.4s, v15.4s, v30.4s + add v16.4s, v16.4s, v30.4s + add v17.4s, v17.4s, v30.4s + + add v15.4s, v15.4s, v21.4s + add v16.4s, v16.4s, v23.4s + add v17.4s, v17.4s, v22.4s + + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #128 + b .Lopen_tail_64_store + +.Lopen_tail_128: + // We need two more blocks + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v15.16b, v30.16b + mov v16.16b, v30.16b + eor v23.16b, v23.16b, v23.16b + eor v22.16b, v22.16b, v22.16b + ins v23.s[0], v25.s[0] + ins v22.d[0], x15 + add v22.4s, v22.4s, v23.4s + + add v15.4s, v15.4s, v22.4s + add v16.4s, v16.4s, v23.4s + + mov x6, #10 + sub x6, x6, x4 + +.Lopen_tail_128_rounds: + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, 
v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #4 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #12 + add v1.4s, v1.4s, v6.4s + eor v16.16b, v16.16b, v1.16b + rev32 v16.8h, v16.8h + + add v11.4s, v11.4s, v16.4s + eor v6.16b, v6.16b, v11.16b + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + add v1.4s, v1.4s, v20.4s + eor v16.16b, v16.16b, v1.16b + tbl v16.16b, {v16.16b}, v26.16b + + add v11.4s, v11.4s, v16.4s + eor v20.16b, v20.16b, v11.16b + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + ext v6.16b, v6.16b, v6.16b, #4 + ext v11.16b, v11.16b, v11.16b, #8 + ext v16.16b, v16.16b, v16.16b, #12 + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #12 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #4 + add v1.4s, v1.4s, v6.4s + eor v16.16b, v16.16b, v1.16b + rev32 v16.8h, v16.8h + + add v11.4s, v11.4s, v16.4s + eor v6.16b, v6.16b, v11.16b + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + add v1.4s, v1.4s, v20.4s + eor v16.16b, v16.16b, v1.16b + tbl v16.16b, {v16.16b}, v26.16b + + add v11.4s, v11.4s, v16.4s + eor v20.16b, v20.16b, v11.16b + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + ext v6.16b, v6.16b, v6.16b, #12 + ext v11.16b, v11.16b, v11.16b, #8 + ext v16.16b, v16.16b, v16.16b, #4 + subs x6, x6, #1 + b.gt 
.Lopen_tail_128_rounds + cbz x4, .Lopen_tail_128_rounds_done + subs x4, x4, #1 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + b .Lopen_tail_128_rounds + +.Lopen_tail_128_rounds_done: + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + add v15.4s, v15.4s, v30.4s + add v16.4s, v16.4s, v30.4s + add v15.4s, v15.4s, v22.4s + add v16.4s, v16.4s, v23.4s + + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + sub x2, x2, #64 + + b .Lopen_tail_64_store + +.Lopen_tail_64: + // We just need a single block + mov v0.16b, v24.16b + mov v5.16b, v28.16b + mov v10.16b, v29.16b + mov v15.16b, v30.16b + eor v23.16b, v23.16b, v23.16b + ins v23.s[0], v25.s[0] + add v15.4s, v15.4s, v23.4s + + mov x6, #10 + sub x6, x6, x4 + +.Lopen_tail_64_rounds: + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add 
v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #4 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #12 + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #12 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #4 + subs x6, x6, #1 + b.gt .Lopen_tail_64_rounds + cbz x4, .Lopen_tail_64_rounds_done + subs x4, x4, #1 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + b .Lopen_tail_64_rounds + +.Lopen_tail_64_rounds_done: + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + add v15.4s, v15.4s, v23.4s + +.Lopen_tail_64_store: + cmp x2, #16 + b.lt 
.Lopen_tail_16 + + ld1 {v20.16b}, [x1], #16 + eor v20.16b, v20.16b, v0.16b + st1 {v20.16b}, [x0], #16 + mov v0.16b, v5.16b + mov v5.16b, v10.16b + mov v10.16b, v15.16b + sub x2, x2, #16 + b .Lopen_tail_64_store + +.Lopen_tail_16: + // Here we handle the last [0,16) bytes that require a padded block + cbz x2, .Lopen_finalize + + eor v20.16b, v20.16b, v20.16b // Use T0 to load the ciphertext + eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask + not v22.16b, v20.16b + + add x7, x1, x2 + mov x6, x2 + +.Lopen_tail_16_compose: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x7, #-1]! + mov v20.b[0], w11 + ext v21.16b, v22.16b, v21.16b, #15 + subs x2, x2, #1 + b.gt .Lopen_tail_16_compose + + and v20.16b, v20.16b, v21.16b + // Hash in the final padded block + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + eor v20.16b, v20.16b, v0.16b + +.Lopen_tail_16_store: + umov w11, v20.b[0] + strb w11, [x0], #1 + ext v20.16b, v20.16b, v20.16b, #1 + subs x6, x6, #1 + b.gt .Lopen_tail_16_store + +.Lopen_finalize: + mov x11, v31.d[0] + mov x12, v31.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, 
x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + // Final reduction step + sub x12, xzr, x15 + orr x13, xzr, #3 + subs x11, x8, #-5 + sbcs x12, x9, x12 + sbcs x13, x10, x13 + csel x8, x11, x8, cs + csel x9, x12, x9, cs + csel x10, x13, x10, cs + mov x11, v27.d[0] + mov x12, v27.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + + stp x8, x9, [x5] + + ldp d8, d9, [sp, #16] + ldp d10, d11, [sp, #32] + ldp d12, d13, [sp, #48] + ldp d14, d15, [sp, #64] +.cfi_restore b15 +.cfi_restore b14 +.cfi_restore b13 +.cfi_restore b12 +.cfi_restore b11 +.cfi_restore b10 +.cfi_restore b9 +.cfi_restore b8 + ldp x29, x30, [sp], 80 +.cfi_restore w29 +.cfi_restore w30 +.cfi_def_cfa_offset 0 + AARCH64_VALIDATE_LINK_REGISTER + ret + +.Lopen_128: + // On some architectures preparing 5 blocks for small buffers is wasteful + eor v25.16b, v25.16b, v25.16b + mov x11, #1 + mov v25.s[0], w11 + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v2.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v7.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v12.16b, v29.16b + mov v17.16b, v30.16b + add v15.4s, v17.4s, v25.4s + add v16.4s, v15.4s, v25.4s + + mov x6, #10 + +.Lopen_128_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor 
v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #4 + ext v6.16b, v6.16b, v6.16b, #4 + ext v7.16b, v7.16b, v7.16b, #4 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #12 + ext v16.16b, v16.16b, v16.16b, #12 + ext v17.16b, v17.16b, v17.16b, #12 + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + 
eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #12 + ext v6.16b, v6.16b, v6.16b, #12 + ext v7.16b, v7.16b, v7.16b, #12 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #4 + ext v16.16b, v16.16b, v16.16b, #4 + ext v17.16b, v17.16b, v17.16b, #4 + subs x6, x6, #1 + b.hi .Lopen_128_rounds + + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v2.4s, v2.4s, v24.4s + + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v7.4s, v7.4s, v28.4s + + add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + + add v30.4s, v30.4s, v25.4s + add v15.4s, v15.4s, v30.4s + add v30.4s, v30.4s, v25.4s + add v16.4s, v16.4s, v30.4s + + and v2.16b, v2.16b, v27.16b + mov x16, v2.d[0] // Move the R key to GPRs + mov x17, v2.d[1] + mov v27.16b, v7.16b // Store the S key + + bl .Lpoly_hash_ad_internal + +.Lopen_128_store: + cmp x2, #64 + b.lt .Lopen_128_store_64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, 
x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v21.d[0] + mov x12, v21.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v22.d[0] + mov x12, v22.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, 
x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v23.d[0] + mov x12, v23.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #64 + + mov v0.16b, v1.16b + mov v5.16b, v6.16b + mov v10.16b, v11.16b + mov v15.16b, v16.16b + +.Lopen_128_store_64: + + lsr x4, x2, #4 + mov x3, x1 + +.Lopen_128_hash_64: + cbz x4, .Lopen_tail_64_store + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // 
No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + sub x4, x4, #1 + b .Lopen_128_hash_64 +.cfi_endproc +.size chacha20_poly1305_open,.-chacha20_poly1305_open +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) diff --git a/third_party/boringssl/gen/crypto/chacha20_poly1305_armv8-win.S b/third_party/boringssl/gen/crypto/chacha20_poly1305_armv8-win.S new file mode 100644 index 00000000..fa86675f --- /dev/null +++ b/third_party/boringssl/gen/crypto/chacha20_poly1305_armv8-win.S @@ -0,0 +1,3012 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +.section .rodata + +.align 7 +Lchacha20_consts: +.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' +Linc: +.long 1,2,3,4 +Lrol8: +.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 +Lclamp: +.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC + +.text + +.def Lpoly_hash_ad_internal + .type 32 +.endef +.align 6 +Lpoly_hash_ad_internal: +.cfi_startproc + cbnz x4, Lpoly_hash_intro + ret + +Lpoly_hash_intro: + cmp x4, #16 + b.lt Lpoly_hash_ad_tail + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since 
t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + sub x4, x4, #16 + b Lpoly_hash_ad_internal + +Lpoly_hash_ad_tail: + cbz x4, Lpoly_hash_ad_ret + + eor v20.16b, v20.16b, v20.16b // Use T0 to load the AAD + sub x4, x4, #1 + +Lpoly_hash_tail_16_compose: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x3, x4] + mov v20.b[0], w11 + subs x4, x4, #1 + b.ge Lpoly_hash_tail_16_compose + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + +Lpoly_hash_ad_ret: + ret +.cfi_endproc + + +///////////////////////////////// +// +// void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *seal_data); +// +.globl chacha20_poly1305_seal + +.def chacha20_poly1305_seal + .type 32 +.endef +.align 6 +chacha20_poly1305_seal: + AARCH64_SIGN_LINK_REGISTER +.cfi_startproc + stp x29, x30, [sp, #-80]! +.cfi_def_cfa_offset 80 +.cfi_offset w30, -72 +.cfi_offset w29, -80 + mov x29, sp + // We probably could do .cfi_def_cfa w29, 80 at this point, but since + // we don't actually use the frame pointer like that, it's probably not + // worth bothering. 
+ stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] +.cfi_offset b15, -8 +.cfi_offset b14, -16 +.cfi_offset b13, -24 +.cfi_offset b12, -32 +.cfi_offset b11, -40 +.cfi_offset b10, -48 +.cfi_offset b9, -56 +.cfi_offset b8, -64 + + adrp x11, Lchacha20_consts + add x11, x11, :lo12:Lchacha20_consts + + ld1 {v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values + ld1 {v28.16b - v30.16b}, [x5] + + mov x15, #1 // Prepare the Poly1305 state + mov x8, #0 + mov x9, #0 + mov x10, #0 + + ldr x12, [x5, #56] // The total cipher text length includes extra_in_len + add x12, x12, x2 + mov v31.d[0], x4 // Store the input and aad lengths + mov v31.d[1], x12 + + cmp x2, #128 + b.le Lseal_128 // Optimization for smaller buffers + + // Initially we prepare 5 ChaCha20 blocks. Four to encrypt up to 4 blocks (256 bytes) of plaintext, + // and one for the Poly1305 R and S keys. The first four blocks (A0-A3..D0-D3) are computed vertically, + // the fifth block (A4-D4) horizontally. 
+ ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] + mov v4.16b, v24.16b + + ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 + mov v9.16b, v28.16b + + ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 + mov v14.16b, v29.16b + + ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] + add v15.4s, v15.4s, v25.4s + mov v19.16b, v30.16b + + sub x5, x5, #32 + + mov x6, #10 + +.align 5 +Lseal_init_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v9.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v18.8h, v18.8h + rev32 v19.8h, v19.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + eor v8.16b, v8.16b, v13.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v9.4s, #20 + sli v8.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + add v4.4s, v4.4s, v8.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v18.16b, {v18.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor 
v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v14.16b + + ushr v9.4s, v8.4s, #25 + sli v9.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #4 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #12 + add v0.4s, v0.4s, v6.4s + add v1.4s, v1.4s, v7.4s + add v2.4s, v2.4s, v8.4s + add v3.4s, v3.4s, v5.4s + add v4.4s, v4.4s, v9.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v18.8h, v18.8h + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v19.8h, v19.8h + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v5.4s, #20 + sli v8.4s, v5.4s, #12 + ushr v5.4s, v9.4s, #20 + sli v5.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v5.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v18.16b, {v18.16b}, v26.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + 
+ eor v20.16b, v20.16b, v12.16b + eor v6.16b, v6.16b, v13.16b + eor v7.16b, v7.16b, v10.16b + eor v8.16b, v8.16b, v11.16b + eor v5.16b, v5.16b, v14.16b + + ushr v9.4s, v5.4s, #25 + sli v9.4s, v5.4s, #7 + ushr v5.4s, v8.4s, #25 + sli v5.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #12 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #4 + subs x6, x6, #1 + b.hi Lseal_init_rounds + + add v15.4s, v15.4s, v25.4s + mov x11, #4 + dup v20.4s, w11 + add v25.4s, v25.4s, v20.4s + + zip1 v20.4s, v0.4s, v1.4s + zip2 v21.4s, v0.4s, v1.4s + zip1 v22.4s, v2.4s, v3.4s + zip2 v23.4s, v2.4s, v3.4s + + zip1 v0.2d, v20.2d, v22.2d + zip2 v1.2d, v20.2d, v22.2d + zip1 v2.2d, v21.2d, v23.2d + zip2 v3.2d, v21.2d, v23.2d + + zip1 v20.4s, v5.4s, v6.4s + zip2 v21.4s, v5.4s, v6.4s + zip1 v22.4s, v7.4s, v8.4s + zip2 v23.4s, v7.4s, v8.4s + + zip1 v5.2d, v20.2d, v22.2d + zip2 v6.2d, v20.2d, v22.2d + zip1 v7.2d, v21.2d, v23.2d + zip2 v8.2d, v21.2d, v23.2d + + zip1 v20.4s, v10.4s, v11.4s + zip2 v21.4s, v10.4s, v11.4s + zip1 v22.4s, v12.4s, v13.4s + zip2 v23.4s, v12.4s, v13.4s + + zip1 v10.2d, v20.2d, v22.2d + zip2 v11.2d, v20.2d, v22.2d + zip1 v12.2d, v21.2d, v23.2d + zip2 v13.2d, v21.2d, v23.2d + + zip1 v20.4s, v15.4s, v16.4s + zip2 v21.4s, v15.4s, v16.4s + zip1 v22.4s, v17.4s, v18.4s + zip2 v23.4s, v17.4s, v18.4s + + zip1 v15.2d, v20.2d, v22.2d + zip2 v16.2d, v20.2d, v22.2d + zip1 v17.2d, v21.2d, v23.2d + zip2 v18.2d, v21.2d, v23.2d + + add v4.4s, v4.4s, v24.4s + add v9.4s, v9.4s, v28.4s + and v4.16b, v4.16b, v27.16b + + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + + add v1.4s, v1.4s, v24.4s + add v6.4s, v6.4s, v28.4s + add v11.4s, v11.4s, v29.4s + add v16.4s, v16.4s, v30.4s + + add v2.4s, v2.4s, v24.4s + add v7.4s, v7.4s, v28.4s + add v12.4s, v12.4s, v29.4s + add 
v17.4s, v17.4s, v30.4s + + add v3.4s, v3.4s, v24.4s + add v8.4s, v8.4s, v28.4s + add v13.4s, v13.4s, v29.4s + add v18.4s, v18.4s, v30.4s + + mov x16, v4.d[0] // Move the R key to GPRs + mov x17, v4.d[1] + mov v27.16b, v9.16b // Store the S key + + bl Lpoly_hash_ad_internal + + mov x3, x0 + cmp x2, #256 + b.le Lseal_tail + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v3.16b + eor v21.16b, v21.16b, v8.16b + eor v22.16b, v22.16b, v13.16b + eor v23.16b, v23.16b, v18.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #256 + + mov x6, #4 // In the first run of the loop we need to hash 256 bytes, therefore we hash one block for the first 4 rounds + mov x7, #6 // and two blocks for the remaining 6, for a total of (1 * 4 + 2 * 6) * 16 = 256 + +Lseal_main_loop: + adrp x11, Lchacha20_consts + add x11, x11, :lo12:Lchacha20_consts + + ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] + mov v4.16b, v24.16b + + ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 + mov v9.16b, v28.16b + + ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 + mov v14.16b, v29.16b + + ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] + add v15.4s, v15.4s, v25.4s + mov v19.16b, v30.16b + + eor v20.16b, v20.16b, v20.16b //zero + not v21.16b, v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + sub x5, x5, #32 +.align 5 
+Lseal_main_loop_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v9.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v18.8h, v18.8h + rev32 v19.8h, v19.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + eor v8.16b, v8.16b, v13.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v9.4s, #20 + sli v8.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + add v4.4s, v4.4s, v8.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v18.16b, {v18.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v14.16b + + ushr v9.4s, v8.4s, #25 + sli v9.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v9.16b, 
v9.16b, v9.16b, #4 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #12 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + add v0.4s, v0.4s, v6.4s + add v1.4s, v1.4s, v7.4s + add v2.4s, v2.4s, v8.4s + add v3.4s, v3.4s, v5.4s + add v4.4s, v4.4s, v9.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v18.8h, v18.8h + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v19.8h, v19.8h + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v5.4s, #20 + sli v8.4s, v5.4s, #12 + ushr v5.4s, v9.4s, #20 + sli v5.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add 
v4.4s, v4.4s, v5.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v18.16b, {v18.16b}, v26.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v12.16b + eor v6.16b, v6.16b, v13.16b + eor v7.16b, v7.16b, v10.16b + eor v8.16b, v8.16b, v11.16b + eor v5.16b, v5.16b, v14.16b + + ushr v9.4s, v5.4s, #25 + sli v9.4s, v5.4s, #7 + ushr v5.4s, v8.4s, #25 + sli v5.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #12 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #4 + subs x6, x6, #1 + b.ge Lseal_main_loop_rounds + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + subs x7, x7, #1 + b.gt Lseal_main_loop_rounds + + eor v20.16b, v20.16b, v20.16b //zero + not v21.16b, 
v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + add v15.4s, v15.4s, v25.4s + mov x11, #5 + dup v20.4s, w11 + add v25.4s, v25.4s, v20.4s + + zip1 v20.4s, v0.4s, v1.4s + zip2 v21.4s, v0.4s, v1.4s + zip1 v22.4s, v2.4s, v3.4s + zip2 v23.4s, v2.4s, v3.4s + + zip1 v0.2d, v20.2d, v22.2d + zip2 v1.2d, v20.2d, v22.2d + zip1 v2.2d, v21.2d, v23.2d + zip2 v3.2d, v21.2d, v23.2d + + zip1 v20.4s, v5.4s, v6.4s + zip2 v21.4s, v5.4s, v6.4s + zip1 v22.4s, v7.4s, v8.4s + zip2 v23.4s, v7.4s, v8.4s + + zip1 v5.2d, v20.2d, v22.2d + zip2 v6.2d, v20.2d, v22.2d + zip1 v7.2d, v21.2d, v23.2d + zip2 v8.2d, v21.2d, v23.2d + + zip1 v20.4s, v10.4s, v11.4s + zip2 v21.4s, v10.4s, v11.4s + zip1 v22.4s, v12.4s, v13.4s + zip2 v23.4s, v12.4s, v13.4s + + zip1 v10.2d, v20.2d, v22.2d + zip2 v11.2d, v20.2d, v22.2d + zip1 v12.2d, v21.2d, v23.2d + zip2 v13.2d, v21.2d, v23.2d + + zip1 v20.4s, v15.4s, v16.4s + zip2 v21.4s, v15.4s, v16.4s + zip1 v22.4s, v17.4s, v18.4s + zip2 v23.4s, v17.4s, v18.4s + + zip1 v15.2d, v20.2d, v22.2d + zip2 v16.2d, v20.2d, v22.2d + zip1 v17.2d, v21.2d, v23.2d + zip2 v18.2d, v21.2d, v23.2d + + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + + add v1.4s, v1.4s, v24.4s + add v6.4s, v6.4s, v28.4s + add v11.4s, v11.4s, v29.4s + add v16.4s, v16.4s, v30.4s + + add v2.4s, v2.4s, v24.4s + add v7.4s, v7.4s, v28.4s + add v12.4s, v12.4s, v29.4s + add v17.4s, v17.4s, v30.4s + + add v3.4s, v3.4s, v24.4s + add v8.4s, v8.4s, v28.4s + add v13.4s, v13.4s, v29.4s + add v18.4s, v18.4s, v30.4s + + add v4.4s, v4.4s, v24.4s + add v9.4s, v9.4s, v28.4s + add v14.4s, v14.4s, v29.4s + add v19.4s, v19.4s, v30.4s + + cmp x2, #320 + b.le Lseal_tail + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + st1 {v20.16b - v23.16b}, [x0], 
#64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v3.16b + eor v21.16b, v21.16b, v8.16b + eor v22.16b, v22.16b, v13.16b + eor v23.16b, v23.16b, v18.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v4.16b + eor v21.16b, v21.16b, v9.16b + eor v22.16b, v22.16b, v14.16b + eor v23.16b, v23.16b, v19.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #320 + + mov x6, #0 + mov x7, #10 // For the remainder of the loop we always hash and encrypt 320 bytes per iteration + + b Lseal_main_loop + +Lseal_tail: + // This part of the function handles the storage and authentication of the last [0,320) bytes + // We assume A0-A4 ... D0-D4 hold at least inl (320 max) bytes of the stream data. 
+ cmp x2, #64 + b.lt Lseal_tail_64 + + // Store and authenticate 64B blocks per iteration + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v21.d[0] + mov x12, v21.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v22.d[0] + mov x12, 
v22.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v23.d[0] + mov x12, v23.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + st1 {v20.16b - v23.16b}, [x0], #64 + sub x2, x2, #64 + + // Shift the state left by 64 bytes for the next iteration of the loop + mov v0.16b, v1.16b + mov v5.16b, v6.16b + mov v10.16b, v11.16b + mov v15.16b, v16.16b + + mov v1.16b, v2.16b + mov v6.16b, v7.16b + mov v11.16b, v12.16b + mov v16.16b, v17.16b + + mov 
v2.16b, v3.16b + mov v7.16b, v8.16b + mov v12.16b, v13.16b + mov v17.16b, v18.16b + + mov v3.16b, v4.16b + mov v8.16b, v9.16b + mov v13.16b, v14.16b + mov v18.16b, v19.16b + + b Lseal_tail + +Lseal_tail_64: + ldp x3, x4, [x5, #48] // extra_in_len and extra_in_ptr + + // Here we handle the last [0,64) bytes of plaintext + cmp x2, #16 + b.lt Lseal_tail_16 + // Each iteration encrypt and authenticate a 16B block + ld1 {v20.16b}, [x1], #16 + eor v20.16b, v20.16b, v0.16b + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + st1 {v20.16b}, [x0], #16 + + sub x2, x2, #16 + + // Shift the state left by 16 bytes for the next iteration of the loop + mov v0.16b, v5.16b + mov v5.16b, v10.16b + mov v10.16b, v15.16b + + b Lseal_tail_64 + +Lseal_tail_16: + // Here we handle the last [0,16) bytes of ciphertext that require a padded block + cbz x2, Lseal_hash_extra + + eor v20.16b, v20.16b, v20.16b // Use T0 to load the plaintext/extra in + eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask that will only mask the ciphertext bytes + not v22.16b, v20.16b + + mov x6, x2 + add x1, x1, x2 + + cbz x4, Lseal_tail_16_compose // No extra data to pad with, zero padding + + mov x7, #16 // We 
need to load some extra_in first for padding + sub x7, x7, x2 + cmp x4, x7 + csel x7, x4, x7, lt // Load the minimum of extra_in_len and the amount needed to fill the register + mov x12, x7 + add x3, x3, x7 + sub x4, x4, x7 + +Lseal_tail16_compose_extra_in: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x3, #-1]! + mov v20.b[0], w11 + subs x7, x7, #1 + b.gt Lseal_tail16_compose_extra_in + + add x3, x3, x12 + +Lseal_tail_16_compose: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x1, #-1]! + mov v20.b[0], w11 + ext v21.16b, v22.16b, v21.16b, #15 + subs x2, x2, #1 + b.gt Lseal_tail_16_compose + + and v0.16b, v0.16b, v21.16b + eor v20.16b, v20.16b, v0.16b + mov v21.16b, v20.16b + +Lseal_tail_16_store: + umov w11, v20.b[0] + strb w11, [x0], #1 + ext v20.16b, v20.16b, v20.16b, #1 + subs x6, x6, #1 + b.gt Lseal_tail_16_store + + // Hash in the final ct block concatenated with extra_in + mov x11, v21.d[0] + mov x12, v21.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + +Lseal_hash_extra: + cbz x4, Lseal_finalize + +Lseal_hash_extra_loop: + cmp x4, #16 + b.lt Lseal_hash_extra_tail + ld1 {v20.16b}, [x3], #16 + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul 
x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + sub x4, x4, #16 + b Lseal_hash_extra_loop + +Lseal_hash_extra_tail: + cbz x4, Lseal_finalize + eor v20.16b, v20.16b, v20.16b // Use T0 to load the remaining extra ciphertext + add x3, x3, x4 + +Lseal_hash_extra_load: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x3, #-1]! 
+ mov v20.b[0], w11 + subs x4, x4, #1 + b.gt Lseal_hash_extra_load + + // Hash in the final padded extra_in block + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + +Lseal_finalize: + mov x11, v31.d[0] + mov x12, v31.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + // Final reduction step + sub x12, xzr, x15 + orr x13, xzr, #3 + subs x11, x8, #-5 + sbcs x12, x9, x12 + sbcs x13, x10, x13 + csel x8, x11, x8, cs + 
csel x9, x12, x9, cs + csel x10, x13, x10, cs + mov x11, v27.d[0] + mov x12, v27.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + + stp x8, x9, [x5] + + ldp d8, d9, [sp, #16] + ldp d10, d11, [sp, #32] + ldp d12, d13, [sp, #48] + ldp d14, d15, [sp, #64] +.cfi_restore b15 +.cfi_restore b14 +.cfi_restore b13 +.cfi_restore b12 +.cfi_restore b11 +.cfi_restore b10 +.cfi_restore b9 +.cfi_restore b8 + ldp x29, x30, [sp], 80 +.cfi_restore w29 +.cfi_restore w30 +.cfi_def_cfa_offset 0 + AARCH64_VALIDATE_LINK_REGISTER + ret + +Lseal_128: + // On some architectures preparing 5 blocks for small buffers is wasteful + eor v25.16b, v25.16b, v25.16b + mov x11, #1 + mov v25.s[0], w11 + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v2.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v7.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v12.16b, v29.16b + mov v17.16b, v30.16b + add v15.4s, v17.4s, v25.4s + add v16.4s, v15.4s, v25.4s + + mov x6, #10 + +Lseal_128_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, 
v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #4 + ext v6.16b, v6.16b, v6.16b, #4 + ext v7.16b, v7.16b, v7.16b, #4 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #12 + ext v16.16b, v16.16b, v16.16b, #12 + ext v17.16b, v17.16b, v17.16b, #12 + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #12 + ext v6.16b, v6.16b, v6.16b, #12 + ext v7.16b, v7.16b, v7.16b, #12 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #4 + ext v16.16b, v16.16b, v16.16b, #4 + ext v17.16b, v17.16b, v17.16b, 
#4 + subs x6, x6, #1 + b.hi Lseal_128_rounds + + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v2.4s, v2.4s, v24.4s + + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v7.4s, v7.4s, v28.4s + + // Only the first 32 bytes of the third block (counter = 0) are needed, + // so skip updating v12 and v17. + add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + + add v30.4s, v30.4s, v25.4s + add v15.4s, v15.4s, v30.4s + add v30.4s, v30.4s, v25.4s + add v16.4s, v16.4s, v30.4s + + and v2.16b, v2.16b, v27.16b + mov x16, v2.d[0] // Move the R key to GPRs + mov x17, v2.d[1] + mov v27.16b, v7.16b // Store the S key + + bl Lpoly_hash_ad_internal + b Lseal_tail +.cfi_endproc + + +///////////////////////////////// +// +// void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *aead_data); +// +.globl chacha20_poly1305_open + +.def chacha20_poly1305_open + .type 32 +.endef +.align 6 +chacha20_poly1305_open: + AARCH64_SIGN_LINK_REGISTER +.cfi_startproc + stp x29, x30, [sp, #-80]! +.cfi_def_cfa_offset 80 +.cfi_offset w30, -72 +.cfi_offset w29, -80 + mov x29, sp + // We probably could do .cfi_def_cfa w29, 80 at this point, but since + // we don't actually use the frame pointer like that, it's probably not + // worth bothering. 
+ stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] +.cfi_offset b15, -8 +.cfi_offset b14, -16 +.cfi_offset b13, -24 +.cfi_offset b12, -32 +.cfi_offset b11, -40 +.cfi_offset b10, -48 +.cfi_offset b9, -56 +.cfi_offset b8, -64 + + adrp x11, Lchacha20_consts + add x11, x11, :lo12:Lchacha20_consts + + ld1 {v24.16b - v27.16b}, [x11] // Load the CONSTS, INC, ROL8 and CLAMP values + ld1 {v28.16b - v30.16b}, [x5] + + mov x15, #1 // Prepare the Poly1305 state + mov x8, #0 + mov x9, #0 + mov x10, #0 + + mov v31.d[0], x4 // Store the input and aad lengths + mov v31.d[1], x2 + + cmp x2, #128 + b.le Lopen_128 // Optimization for smaller buffers + + // Initially we prepare a single ChaCha20 block for the Poly1305 R and S keys + mov v0.16b, v24.16b + mov v5.16b, v28.16b + mov v10.16b, v29.16b + mov v15.16b, v30.16b + + mov x6, #10 + +.align 5 +Lopen_init_rounds: + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #4 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #12 + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #12 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #4 + subs x6, x6, #1 + b.hi Lopen_init_rounds + + add v0.4s, v0.4s, v24.4s + add 
v5.4s, v5.4s, v28.4s + + and v0.16b, v0.16b, v27.16b + mov x16, v0.d[0] // Move the R key to GPRs + mov x17, v0.d[1] + mov v27.16b, v5.16b // Store the S key + + bl Lpoly_hash_ad_internal + + mov x3, x1 + +// Each iteration of the loop hash 320 bytes, and prepare stream for 320 bytes +Lopen_main_loop: + + cmp x2, #192 + b.lt Lopen_tail + + adrp x11, Lchacha20_consts + add x11, x11, :lo12:Lchacha20_consts + + ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] + mov v4.16b, v24.16b + + ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 + mov v9.16b, v28.16b + + ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 + mov v14.16b, v29.16b + + ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] + sub x5, x5, #32 + add v15.4s, v15.4s, v25.4s + mov v19.16b, v30.16b + + eor v20.16b, v20.16b, v20.16b //zero + not v21.16b, v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + lsr x4, x2, #4 // How many whole blocks we have to hash, will always be at least 12 + sub x4, x4, #10 + + mov x7, #10 + subs x6, x7, x4 // itr1 can be negative if we have more than 320 bytes to hash + csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are full + + cbz x7, Lopen_main_loop_rounds_short + +.align 5 +Lopen_main_loop_rounds: + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, 
x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most +Lopen_main_loop_rounds_short: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v9.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v18.8h, v18.8h + rev32 v19.8h, v19.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + eor v8.16b, v8.16b, v13.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v9.4s, #20 + sli v8.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + add v3.4s, v3.4s, v7.4s + add v4.4s, v4.4s, v8.4s + + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + eor v18.16b, v18.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v18.16b, {v18.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + add v13.4s, v13.4s, v18.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v14.16b + + ushr v9.4s, v8.4s, #25 + sli v9.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, 
v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #4 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #12 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + add v0.4s, v0.4s, v6.4s + add v1.4s, v1.4s, v7.4s + add v2.4s, v2.4s, v8.4s + add v3.4s, v3.4s, v5.4s + add v4.4s, v4.4s, v9.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + rev32 v18.8h, v18.8h + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + rev32 v19.8h, v19.8h + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v6.16b, v6.16b, v12.16b + eor v7.16b, v7.16b, v13.16b + eor v8.16b, v8.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v9.16b, v9.16b, v14.16b + + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + ushr v7.4s, v8.4s, #20 + sli v7.4s, v8.4s, #12 + ushr v8.4s, v5.4s, #20 + sli v8.4s, 
v5.4s, #12 + ushr v5.4s, v9.4s, #20 + sli v5.4s, v9.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + add v3.4s, v3.4s, v8.4s + add v4.4s, v4.4s, v5.4s + + eor v18.16b, v18.16b, v0.16b + eor v15.16b, v15.16b, v1.16b + eor v16.16b, v16.16b, v2.16b + eor v17.16b, v17.16b, v3.16b + eor v19.16b, v19.16b, v4.16b + + tbl v18.16b, {v18.16b}, v26.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + tbl v19.16b, {v19.16b}, v26.16b + + add v12.4s, v12.4s, v18.4s + add v13.4s, v13.4s, v15.4s + add v10.4s, v10.4s, v16.4s + add v11.4s, v11.4s, v17.4s + add v14.4s, v14.4s, v19.4s + + eor v20.16b, v20.16b, v12.16b + eor v6.16b, v6.16b, v13.16b + eor v7.16b, v7.16b, v10.16b + eor v8.16b, v8.16b, v11.16b + eor v5.16b, v5.16b, v14.16b + + ushr v9.4s, v5.4s, #25 + sli v9.4s, v5.4s, #7 + ushr v5.4s, v8.4s, #25 + sli v5.4s, v8.4s, #7 + ushr v8.4s, v7.4s, #25 + sli v8.4s, v7.4s, #7 + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + + ext v9.16b, v9.16b, v9.16b, #12 + ext v14.16b, v14.16b, v14.16b, #8 + ext v19.16b, v19.16b, v19.16b, #4 + subs x7, x7, #1 + b.gt Lopen_main_loop_rounds + subs x6, x6, #1 + b.ge Lopen_main_loop_rounds_short + + eor v20.16b, v20.16b, v20.16b //zero + not v21.16b, v20.16b // -1 + sub v21.4s, v25.4s, v21.4s // Add +1 + ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) + add v19.4s, v19.4s, v20.4s + + add v15.4s, v15.4s, v25.4s + mov x11, #5 + dup v20.4s, w11 + add v25.4s, v25.4s, v20.4s + + zip1 v20.4s, v0.4s, v1.4s + zip2 v21.4s, v0.4s, v1.4s + zip1 v22.4s, v2.4s, v3.4s + zip2 v23.4s, v2.4s, v3.4s + + zip1 v0.2d, v20.2d, v22.2d + zip2 v1.2d, v20.2d, v22.2d + zip1 v2.2d, v21.2d, v23.2d + zip2 v3.2d, v21.2d, v23.2d + + zip1 v20.4s, v5.4s, v6.4s + zip2 v21.4s, v5.4s, v6.4s + zip1 v22.4s, v7.4s, v8.4s + zip2 v23.4s, v7.4s, v8.4s + + zip1 v5.2d, v20.2d, v22.2d + zip2 v6.2d, v20.2d, v22.2d + zip1 v7.2d, 
v21.2d, v23.2d + zip2 v8.2d, v21.2d, v23.2d + + zip1 v20.4s, v10.4s, v11.4s + zip2 v21.4s, v10.4s, v11.4s + zip1 v22.4s, v12.4s, v13.4s + zip2 v23.4s, v12.4s, v13.4s + + zip1 v10.2d, v20.2d, v22.2d + zip2 v11.2d, v20.2d, v22.2d + zip1 v12.2d, v21.2d, v23.2d + zip2 v13.2d, v21.2d, v23.2d + + zip1 v20.4s, v15.4s, v16.4s + zip2 v21.4s, v15.4s, v16.4s + zip1 v22.4s, v17.4s, v18.4s + zip2 v23.4s, v17.4s, v18.4s + + zip1 v15.2d, v20.2d, v22.2d + zip2 v16.2d, v20.2d, v22.2d + zip1 v17.2d, v21.2d, v23.2d + zip2 v18.2d, v21.2d, v23.2d + + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + + add v1.4s, v1.4s, v24.4s + add v6.4s, v6.4s, v28.4s + add v11.4s, v11.4s, v29.4s + add v16.4s, v16.4s, v30.4s + + add v2.4s, v2.4s, v24.4s + add v7.4s, v7.4s, v28.4s + add v12.4s, v12.4s, v29.4s + add v17.4s, v17.4s, v30.4s + + add v3.4s, v3.4s, v24.4s + add v8.4s, v8.4s, v28.4s + add v13.4s, v13.4s, v29.4s + add v18.4s, v18.4s, v30.4s + + add v4.4s, v4.4s, v24.4s + add v9.4s, v9.4s, v28.4s + add v14.4s, v14.4s, v29.4s + add v19.4s, v19.4s, v30.4s + + // We can always safely store 192 bytes + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #192 + + mov v0.16b, v3.16b + mov v5.16b, v8.16b + mov v10.16b, v13.16b + mov v15.16b, v18.16b + + cmp x2, #64 + b.lt Lopen_tail_64_store + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v3.16b + eor v21.16b, v21.16b, 
v8.16b + eor v22.16b, v22.16b, v13.16b + eor v23.16b, v23.16b, v18.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #64 + + mov v0.16b, v4.16b + mov v5.16b, v9.16b + mov v10.16b, v14.16b + mov v15.16b, v19.16b + + cmp x2, #64 + b.lt Lopen_tail_64_store + + ld1 {v20.16b - v23.16b}, [x1], #64 + eor v20.16b, v20.16b, v4.16b + eor v21.16b, v21.16b, v9.16b + eor v22.16b, v22.16b, v14.16b + eor v23.16b, v23.16b, v19.16b + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #64 + b Lopen_main_loop + +Lopen_tail: + + cbz x2, Lopen_finalize + + lsr x4, x2, #4 // How many whole blocks we have to hash + + cmp x2, #64 + b.le Lopen_tail_64 + cmp x2, #128 + b.le Lopen_tail_128 + +Lopen_tail_192: + // We need three more blocks + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v2.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v7.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v12.16b, v29.16b + mov v15.16b, v30.16b + mov v16.16b, v30.16b + mov v17.16b, v30.16b + eor v23.16b, v23.16b, v23.16b + eor v21.16b, v21.16b, v21.16b + ins v23.s[0], v25.s[0] + ins v21.d[0], x15 + + add v22.4s, v23.4s, v21.4s + add v21.4s, v22.4s, v21.4s + + add v15.4s, v15.4s, v21.4s + add v16.4s, v16.4s, v23.4s + add v17.4s, v17.4s, v22.4s + + mov x7, #10 + subs x6, x7, x4 // itr1 can be negative if we have more than 160 bytes to hash + csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are hashing + sub x4, x4, x7 + + cbz x7, Lopen_tail_192_rounds_no_hash + +Lopen_tail_192_rounds: + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc 
x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most +Lopen_tail_192_rounds_no_hash: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #4 + ext v6.16b, v6.16b, v6.16b, #4 + ext v7.16b, v7.16b, v7.16b, #4 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #12 + ext v16.16b, v16.16b, v16.16b, #12 + ext v17.16b, v17.16b, v17.16b, #12 + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor 
v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #12 + ext v6.16b, v6.16b, v6.16b, #12 + ext v7.16b, v7.16b, v7.16b, #12 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #4 + ext v16.16b, v16.16b, v16.16b, #4 + ext v17.16b, v17.16b, v17.16b, #4 + subs x7, x7, #1 + b.gt Lopen_tail_192_rounds + subs x6, x6, #1 + b.ge Lopen_tail_192_rounds_no_hash + + // We hashed 160 bytes at most, may still have 32 bytes left +Lopen_tail_192_hash: + cbz x4, Lopen_tail_192_hash_done + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, 
x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + sub x4, x4, #1 + b Lopen_tail_192_hash + +Lopen_tail_192_hash_done: + + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v2.4s, v2.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v7.4s, v7.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + add v12.4s, v12.4s, v29.4s + add v15.4s, v15.4s, v30.4s + add v16.4s, v16.4s, v30.4s + add v17.4s, v17.4s, v30.4s + + add v15.4s, v15.4s, v21.4s + add v16.4s, v16.4s, v23.4s + add v17.4s, v17.4s, v22.4s + + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v2.16b + eor v21.16b, v21.16b, v7.16b + eor v22.16b, v22.16b, v12.16b + eor v23.16b, v23.16b, v17.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #128 + b Lopen_tail_64_store + +Lopen_tail_128: + // We need two more blocks + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v15.16b, v30.16b + mov v16.16b, v30.16b + eor v23.16b, v23.16b, v23.16b + eor v22.16b, v22.16b, v22.16b + ins v23.s[0], v25.s[0] + ins v22.d[0], x15 + add v22.4s, v22.4s, v23.4s + + add v15.4s, v15.4s, v22.4s + add v16.4s, v16.4s, v23.4s + + mov x6, #10 + sub x6, x6, x4 + +Lopen_tail_128_rounds: + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + 
add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #4 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #12 + add v1.4s, v1.4s, v6.4s + eor v16.16b, v16.16b, v1.16b + rev32 v16.8h, v16.8h + + add v11.4s, v11.4s, v16.4s + eor v6.16b, v6.16b, v11.16b + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + add v1.4s, v1.4s, v20.4s + eor v16.16b, v16.16b, v1.16b + tbl v16.16b, {v16.16b}, v26.16b + + add v11.4s, v11.4s, v16.4s + eor v20.16b, v20.16b, v11.16b + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + ext v6.16b, v6.16b, v6.16b, #4 + ext v11.16b, v11.16b, v11.16b, #8 + ext v16.16b, v16.16b, v16.16b, #12 + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #12 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #4 + add v1.4s, v1.4s, v6.4s + eor v16.16b, v16.16b, v1.16b + rev32 v16.8h, v16.8h + + add v11.4s, v11.4s, v16.4s + eor v6.16b, v6.16b, v11.16b + ushr v20.4s, v6.4s, #20 + sli v20.4s, v6.4s, #12 + add v1.4s, v1.4s, v20.4s + eor v16.16b, v16.16b, v1.16b + tbl v16.16b, {v16.16b}, v26.16b + + add v11.4s, v11.4s, v16.4s + eor v20.16b, v20.16b, v11.16b + ushr v6.4s, v20.4s, #25 + sli v6.4s, v20.4s, #7 + ext v6.16b, v6.16b, v6.16b, #12 + ext v11.16b, v11.16b, v11.16b, #8 + ext v16.16b, v16.16b, v16.16b, #4 + subs x6, x6, #1 + b.gt Lopen_tail_128_rounds + cbz x4, 
Lopen_tail_128_rounds_done + subs x4, x4, #1 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + b Lopen_tail_128_rounds + +Lopen_tail_128_rounds_done: + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + add v15.4s, v15.4s, v30.4s + add v16.4s, v16.4s, v30.4s + add v15.4s, v15.4s, v22.4s + add v16.4s, v16.4s, v23.4s + + ld1 {v20.16b - v23.16b}, [x1], #64 + + eor v20.16b, v20.16b, v1.16b + eor v21.16b, v21.16b, v6.16b + eor v22.16b, v22.16b, v11.16b + eor v23.16b, v23.16b, v16.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + sub x2, x2, #64 + + b Lopen_tail_64_store + +Lopen_tail_64: + // We just need a single block + mov v0.16b, v24.16b + mov v5.16b, v28.16b + mov v10.16b, v29.16b + mov v15.16b, v30.16b + eor v23.16b, v23.16b, v23.16b + ins v23.s[0], v25.s[0] + add v15.4s, v15.4s, v23.4s + + mov x6, #10 + sub x6, x6, x4 + +Lopen_tail_64_rounds: + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, 
v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #4 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #12 + add v0.4s, v0.4s, v5.4s + eor v15.16b, v15.16b, v0.16b + rev32 v15.8h, v15.8h + + add v10.4s, v10.4s, v15.4s + eor v5.16b, v5.16b, v10.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + add v0.4s, v0.4s, v20.4s + eor v15.16b, v15.16b, v0.16b + tbl v15.16b, {v15.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + eor v20.16b, v20.16b, v10.16b + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + ext v5.16b, v5.16b, v5.16b, #12 + ext v10.16b, v10.16b, v10.16b, #8 + ext v15.16b, v15.16b, v15.16b, #4 + subs x6, x6, #1 + b.gt Lopen_tail_64_rounds + cbz x4, Lopen_tail_64_rounds_done + subs x4, x4, #1 + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + b Lopen_tail_64_rounds + +Lopen_tail_64_rounds_done: + add v0.4s, v0.4s, v24.4s + add v5.4s, v5.4s, v28.4s + add v10.4s, v10.4s, v29.4s + add v15.4s, v15.4s, v30.4s + add v15.4s, v15.4s, v23.4s + +Lopen_tail_64_store: + cmp x2, #16 + b.lt Lopen_tail_16 + + ld1 {v20.16b}, [x1], #16 + 
eor v20.16b, v20.16b, v0.16b + st1 {v20.16b}, [x0], #16 + mov v0.16b, v5.16b + mov v5.16b, v10.16b + mov v10.16b, v15.16b + sub x2, x2, #16 + b Lopen_tail_64_store + +Lopen_tail_16: + // Here we handle the last [0,16) bytes that require a padded block + cbz x2, Lopen_finalize + + eor v20.16b, v20.16b, v20.16b // Use T0 to load the ciphertext + eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask + not v22.16b, v20.16b + + add x7, x1, x2 + mov x6, x2 + +Lopen_tail_16_compose: + ext v20.16b, v20.16b, v20.16b, #15 + ldrb w11, [x7, #-1]! + mov v20.b[0], w11 + ext v21.16b, v22.16b, v21.16b, #15 + subs x2, x2, #1 + b.gt Lopen_tail_16_compose + + and v20.16b, v20.16b, v21.16b + // Hash in the final padded block + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + eor v20.16b, v20.16b, v0.16b + +Lopen_tail_16_store: + umov w11, v20.b[0] + strb w11, [x0], #1 + ext v20.16b, v20.16b, v20.16b, #1 + subs x6, x6, #1 + b.gt Lopen_tail_16_store + +Lopen_finalize: + mov x11, v31.d[0] + mov x12, v31.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + 
adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + // Final reduction step + sub x12, xzr, x15 + orr x13, xzr, #3 + subs x11, x8, #-5 + sbcs x12, x9, x12 + sbcs x13, x10, x13 + csel x8, x11, x8, cs + csel x9, x12, x9, cs + csel x10, x13, x10, cs + mov x11, v27.d[0] + mov x12, v27.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + + stp x8, x9, [x5] + + ldp d8, d9, [sp, #16] + ldp d10, d11, [sp, #32] + ldp d12, d13, [sp, #48] + ldp d14, d15, [sp, #64] +.cfi_restore b15 +.cfi_restore b14 +.cfi_restore b13 +.cfi_restore b12 +.cfi_restore b11 +.cfi_restore b10 +.cfi_restore b9 +.cfi_restore b8 + ldp x29, x30, [sp], 80 +.cfi_restore w29 +.cfi_restore w30 +.cfi_def_cfa_offset 0 + AARCH64_VALIDATE_LINK_REGISTER + ret + +Lopen_128: + // On some architectures preparing 5 blocks for small buffers is wasteful + eor v25.16b, v25.16b, v25.16b + mov x11, #1 + mov v25.s[0], w11 + mov v0.16b, v24.16b + mov v1.16b, v24.16b + mov v2.16b, v24.16b + mov v5.16b, v28.16b + mov v6.16b, v28.16b + mov v7.16b, v28.16b + mov v10.16b, v29.16b + mov v11.16b, v29.16b + mov v12.16b, v29.16b + mov v17.16b, v30.16b + add v15.4s, v17.4s, v25.4s + add v16.4s, v15.4s, v25.4s + + mov x6, #10 + +Lopen_128_rounds: + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 
v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #4 + ext v6.16b, v6.16b, v6.16b, #4 + ext v7.16b, v7.16b, v7.16b, #4 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #12 + ext v16.16b, v16.16b, v16.16b, #12 + ext v17.16b, v17.16b, v17.16b, #12 + add v0.4s, v0.4s, v5.4s + add v1.4s, v1.4s, v6.4s + add v2.4s, v2.4s, v7.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, v1.16b + eor v17.16b, v17.16b, v2.16b + rev32 v15.8h, v15.8h + rev32 v16.8h, v16.8h + rev32 v17.8h, v17.8h + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v5.16b, v5.16b, v10.16b + eor v6.16b, v6.16b, v11.16b + eor v7.16b, v7.16b, v12.16b + ushr v20.4s, v5.4s, #20 + sli v20.4s, v5.4s, #12 + ushr v5.4s, v6.4s, #20 + sli v5.4s, v6.4s, #12 + ushr v6.4s, v7.4s, #20 + sli v6.4s, v7.4s, #12 + + add v0.4s, v0.4s, v20.4s + add v1.4s, v1.4s, v5.4s + add v2.4s, v2.4s, v6.4s + eor v15.16b, v15.16b, v0.16b + eor v16.16b, v16.16b, 
v1.16b + eor v17.16b, v17.16b, v2.16b + tbl v15.16b, {v15.16b}, v26.16b + tbl v16.16b, {v16.16b}, v26.16b + tbl v17.16b, {v17.16b}, v26.16b + + add v10.4s, v10.4s, v15.4s + add v11.4s, v11.4s, v16.4s + add v12.4s, v12.4s, v17.4s + eor v20.16b, v20.16b, v10.16b + eor v5.16b, v5.16b, v11.16b + eor v6.16b, v6.16b, v12.16b + ushr v7.4s, v6.4s, #25 + sli v7.4s, v6.4s, #7 + ushr v6.4s, v5.4s, #25 + sli v6.4s, v5.4s, #7 + ushr v5.4s, v20.4s, #25 + sli v5.4s, v20.4s, #7 + + ext v5.16b, v5.16b, v5.16b, #12 + ext v6.16b, v6.16b, v6.16b, #12 + ext v7.16b, v7.16b, v7.16b, #12 + + ext v10.16b, v10.16b, v10.16b, #8 + ext v11.16b, v11.16b, v11.16b, #8 + ext v12.16b, v12.16b, v12.16b, #8 + + ext v15.16b, v15.16b, v15.16b, #4 + ext v16.16b, v16.16b, v16.16b, #4 + ext v17.16b, v17.16b, v17.16b, #4 + subs x6, x6, #1 + b.hi Lopen_128_rounds + + add v0.4s, v0.4s, v24.4s + add v1.4s, v1.4s, v24.4s + add v2.4s, v2.4s, v24.4s + + add v5.4s, v5.4s, v28.4s + add v6.4s, v6.4s, v28.4s + add v7.4s, v7.4s, v28.4s + + add v10.4s, v10.4s, v29.4s + add v11.4s, v11.4s, v29.4s + + add v30.4s, v30.4s, v25.4s + add v15.4s, v15.4s, v30.4s + add v30.4s, v30.4s, v25.4s + add v16.4s, v16.4s, v30.4s + + and v2.16b, v2.16b, v27.16b + mov x16, v2.d[0] // Move the R key to GPRs + mov x17, v2.d[1] + mov v27.16b, v7.16b // Store the S key + + bl Lpoly_hash_ad_internal + +Lopen_128_store: + cmp x2, #64 + b.lt Lopen_128_store_64 + + ld1 {v20.16b - v23.16b}, [x1], #64 + + mov x11, v20.d[0] + mov x12, v20.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this 
point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v21.d[0] + mov x12, v21.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + mov x11, v22.d[0] + mov x12, v22.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the 
value of 4 at most + mov x11, v23.d[0] + mov x12, v23.d[1] + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + + eor v20.16b, v20.16b, v0.16b + eor v21.16b, v21.16b, v5.16b + eor v22.16b, v22.16b, v10.16b + eor v23.16b, v23.16b, v15.16b + + st1 {v20.16b - v23.16b}, [x0], #64 + + sub x2, x2, #64 + + mov v0.16b, v1.16b + mov v5.16b, v6.16b + mov v10.16b, v11.16b + mov v15.16b, v16.16b + +Lopen_128_store_64: + + lsr x4, x2, #4 + mov x3, x1 + +Lopen_128_hash_64: + cbz x4, Lopen_tail_64_store + ldp x11, x12, [x3], 16 + adds x8, x8, x11 + adcs x9, x9, x12 + adc x10, x10, x15 + mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 + umulh x12, x8, x16 + mul x13, x9, x16 + umulh x14, x9, x16 + adds x12, x12, x13 + mul x13, x10, x16 + adc x13, x13, x14 + mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] + umulh x8, x8, x17 + adds x12, x12, x14 + mul x14, x9, x17 + umulh x9, x9, x17 + adcs x14, x14, x8 + mul x10, x10, x17 + adc x10, x10, x9 + adds x13, x13, x14 + adc x14, x10, xzr + and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) + and x8, x13, #-4 + extr x13, x14, x13, #2 + adds x8, x8, x11 + lsr x11, x14, #2 + adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits + adds 
x8, x8, x13 + adcs x9, x9, x12 + adc x10, x10, xzr // At this point acc2 has the value of 4 at most + sub x4, x4, #1 + b Lopen_128_hash_64 +.cfi_endproc + +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) diff --git a/third_party/boringssl/gen/crypto/chacha20_poly1305_x86_64-apple.S b/third_party/boringssl/gen/crypto/chacha20_poly1305_x86_64-apple.S new file mode 100644 index 00000000..4044212e --- /dev/null +++ b/third_party/boringssl/gen/crypto/chacha20_poly1305_x86_64-apple.S @@ -0,0 +1,8898 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.section __DATA,__const +.p2align 6 +chacha20_poly1305_constants: +L$chacha20_consts: +.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' +.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' +L$rol8: +.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 +.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 +L$rol16: +.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 +.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 +L$avx2_init: +.long 0,0,0,0 +L$sse_inc: +.long 1,0,0,0 +L$avx2_inc: +.long 2,0,0,0,2,0,0,0 +L$clamp: +.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC +.quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF +.p2align 4 +L$and_masks: +.byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff +.text + + +.p2align 6 +poly_hash_ad_internal: + + + xorq %r10,%r10 + xorq %r11,%r11 + xorq %r12,%r12 + cmpq $13,%r8 + jne L$hash_ad_loop +L$poly_fast_tls_ad: + + movq (%rcx),%r10 + movq 5(%rcx),%r11 + shrq $24,%r11 + movq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + ret +L$hash_ad_loop: + + cmpq $16,%r8 + jb L$hash_ad_tail + addq 0+0(%rcx),%r10 + adcq 8+0(%rcx),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + 
adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rcx),%rcx + subq $16,%r8 + jmp L$hash_ad_loop +L$hash_ad_tail: + cmpq $0,%r8 + je L$hash_ad_done + + xorq %r13,%r13 + xorq %r14,%r14 + xorq %r15,%r15 + addq %r8,%rcx +L$hash_ad_tail_loop: + shldq $8,%r13,%r14 + shlq $8,%r13 + movzbq -1(%rcx),%r15 + xorq %r15,%r13 + decq %rcx + decq %r8 + jne L$hash_ad_tail_loop + + addq %r13,%r10 + adcq %r14,%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + +L$hash_ad_done: + ret + + + +.globl _chacha20_poly1305_open_sse41 +.private_extern _chacha20_poly1305_open_sse41 + +.p2align 6 +_chacha20_poly1305_open_sse41: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + + + pushq %r9 + + subq $288 + 0 + 32,%rsp + + + leaq 32(%rsp),%rbp + andq $-32,%rbp + + movq %rdx,%rbx + movq %r8,0+0+32(%rbp) + movq %rbx,8+0+32(%rbp) + + cmpq $128,%rbx + jbe L$open_sse_128 + + movdqa L$chacha20_consts(%rip),%xmm0 + movdqu 0(%r9),%xmm4 + movdqu 
16(%r9),%xmm8 + movdqu 32(%r9),%xmm12 + + movdqa %xmm12,%xmm7 + + movdqa %xmm4,0+48(%rbp) + movdqa %xmm8,0+64(%rbp) + movdqa %xmm12,0+96(%rbp) + movq $10,%r10 +L$open_sse_init_rounds: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 + palignr $4,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $12,%xmm12,%xmm12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 + palignr $12,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $4,%xmm12,%xmm12 + + decq %r10 + jne L$open_sse_init_rounds + + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + + pand L$clamp(%rip),%xmm0 + movdqa %xmm0,0+0(%rbp) + movdqa %xmm4,0+16(%rbp) + + movq %r8,%r8 + call poly_hash_ad_internal +L$open_sse_main_loop: + cmpq $256,%rbx + jb L$open_sse_tail + + movdqa L$chacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa %xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa %xmm0,%xmm3 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa 0+96(%rbp),%xmm15 + paddd L$sse_inc(%rip),%xmm15 + movdqa %xmm15,%xmm14 + paddd L$sse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd L$sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + movdqa %xmm14,0+128(%rbp) + movdqa %xmm15,0+144(%rbp) + + + + 
movq $4,%rcx + movq %rsi,%r8 +L$open_sse_main_loop_rounds: + movdqa %xmm8,0+80(%rbp) + movdqa L$rol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 + pshufb %xmm8,%xmm15 + pshufb %xmm8,%xmm14 + pshufb %xmm8,%xmm13 + pshufb %xmm8,%xmm12 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + addq 0+0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + + leaq 16(%r8),%r8 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movdqa L$rol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 + pshufb %xmm8,%xmm15 + pshufb %xmm8,%xmm14 + pshufb %xmm8,%xmm13 + pshufb %xmm8,%xmm12 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld 
$25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 0+80(%rbp),%xmm8 + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + palignr $4,%xmm7,%xmm7 + palignr $8,%xmm11,%xmm11 + palignr $12,%xmm15,%xmm15 + palignr $4,%xmm6,%xmm6 + palignr $8,%xmm10,%xmm10 + palignr $12,%xmm14,%xmm14 + palignr $4,%xmm5,%xmm5 + palignr $8,%xmm9,%xmm9 + palignr $12,%xmm13,%xmm13 + palignr $4,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $12,%xmm12,%xmm12 + movdqa %xmm8,0+80(%rbp) + movdqa L$rol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 + pshufb %xmm8,%xmm15 + pshufb %xmm8,%xmm14 + pshufb %xmm8,%xmm13 + pshufb %xmm8,%xmm12 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movdqa L$rol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 + pshufb %xmm8,%xmm15 + pshufb %xmm8,%xmm14 + pshufb %xmm8,%xmm13 + pshufb %xmm8,%xmm12 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + 
pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 0+80(%rbp),%xmm8 + palignr $12,%xmm7,%xmm7 + palignr $8,%xmm11,%xmm11 + palignr $4,%xmm15,%xmm15 + palignr $12,%xmm6,%xmm6 + palignr $8,%xmm10,%xmm10 + palignr $4,%xmm14,%xmm14 + palignr $12,%xmm5,%xmm5 + palignr $8,%xmm9,%xmm9 + palignr $4,%xmm13,%xmm13 + palignr $12,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $4,%xmm12,%xmm12 + + decq %rcx + jge L$open_sse_main_loop_rounds + addq 0+0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 + cmpq $-6,%rcx + jg L$open_sse_main_loop_rounds + paddd L$chacha20_consts(%rip),%xmm3 + paddd 0+48(%rbp),%xmm7 + paddd 0+64(%rbp),%xmm11 + paddd 0+144(%rbp),%xmm15 + paddd L$chacha20_consts(%rip),%xmm2 + paddd 0+48(%rbp),%xmm6 + paddd 0+64(%rbp),%xmm10 + paddd 0+128(%rbp),%xmm14 + paddd L$chacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 
+ movdqa %xmm12,0+80(%rbp) + movdqu 0 + 0(%rsi),%xmm12 + pxor %xmm3,%xmm12 + movdqu %xmm12,0 + 0(%rdi) + movdqu 16 + 0(%rsi),%xmm12 + pxor %xmm7,%xmm12 + movdqu %xmm12,16 + 0(%rdi) + movdqu 32 + 0(%rsi),%xmm12 + pxor %xmm11,%xmm12 + movdqu %xmm12,32 + 0(%rdi) + movdqu 48 + 0(%rsi),%xmm12 + pxor %xmm15,%xmm12 + movdqu %xmm12,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 64(%rdi) + movdqu %xmm6,16 + 64(%rdi) + movdqu %xmm10,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + movdqu 0 + 128(%rsi),%xmm3 + movdqu 16 + 128(%rsi),%xmm7 + movdqu 32 + 128(%rsi),%xmm11 + movdqu 48 + 128(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 128(%rdi) + movdqu %xmm5,16 + 128(%rdi) + movdqu %xmm9,32 + 128(%rdi) + movdqu %xmm15,48 + 128(%rdi) + movdqu 0 + 192(%rsi),%xmm3 + movdqu 16 + 192(%rsi),%xmm7 + movdqu 32 + 192(%rsi),%xmm11 + movdqu 48 + 192(%rsi),%xmm15 + pxor %xmm3,%xmm0 + pxor %xmm7,%xmm4 + pxor %xmm11,%xmm8 + pxor 0+80(%rbp),%xmm15 + movdqu %xmm0,0 + 192(%rdi) + movdqu %xmm4,16 + 192(%rdi) + movdqu %xmm8,32 + 192(%rdi) + movdqu %xmm15,48 + 192(%rdi) + + leaq 256(%rsi),%rsi + leaq 256(%rdi),%rdi + subq $256,%rbx + jmp L$open_sse_main_loop +L$open_sse_tail: + + testq %rbx,%rbx + jz L$open_sse_finalize + cmpq $192,%rbx + ja L$open_sse_tail_256 + cmpq $128,%rbx + ja L$open_sse_tail_192 + cmpq $64,%rbx + ja L$open_sse_tail_128 + movdqa L$chacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa 0+96(%rbp),%xmm12 + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + + xorq %r8,%r8 + movq %rbx,%rcx + cmpq $16,%rcx + jb L$open_sse_tail_64_rounds +L$open_sse_tail_64_rounds_and_x1hash: + addq 0+0(%rsi,%r8,1),%r10 + adcq 8+0(%rsi,%r8,1),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + 
movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + subq $16,%rcx +L$open_sse_tail_64_rounds: + addq $16,%r8 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 + palignr $4,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $12,%xmm12,%xmm12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 + palignr $12,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $4,%xmm12,%xmm12 + + cmpq $16,%rcx + jae L$open_sse_tail_64_rounds_and_x1hash + cmpq $160,%r8 + jne L$open_sse_tail_64_rounds + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + + jmp L$open_sse_tail_64_dec_loop + +L$open_sse_tail_128: + movdqa L$chacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa 0+96(%rbp),%xmm13 + paddd 
L$sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + + movq %rbx,%rcx + andq $-16,%rcx + xorq %r8,%r8 +L$open_sse_tail_128_rounds_and_x1hash: + addq 0+0(%rsi,%r8,1),%r10 + adcq 8+0(%rsi,%r8,1),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +L$open_sse_tail_128_rounds: + addq $16,%r8 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 + palignr $4,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $12,%xmm12,%xmm12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 + palignr $4,%xmm5,%xmm5 + palignr $8,%xmm9,%xmm9 + palignr $12,%xmm13,%xmm13 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld 
$12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 + palignr $12,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $4,%xmm12,%xmm12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 + palignr $12,%xmm5,%xmm5 + palignr $8,%xmm9,%xmm9 + palignr $4,%xmm13,%xmm13 + + cmpq %rcx,%r8 + jb L$open_sse_tail_128_rounds_and_x1hash + cmpq $160,%r8 + jne L$open_sse_tail_128_rounds + paddd L$chacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 0(%rdi) + movdqu %xmm5,16 + 0(%rdi) + movdqu %xmm9,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + + subq $64,%rbx + leaq 64(%rsi),%rsi + leaq 64(%rdi),%rdi + jmp L$open_sse_tail_64_dec_loop + +L$open_sse_tail_192: + movdqa L$chacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa %xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa 0+96(%rbp),%xmm14 + paddd L$sse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd L$sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + movdqa %xmm14,0+128(%rbp) + + movq 
%rbx,%rcx + movq $160,%r8 + cmpq $160,%rcx + cmovgq %r8,%rcx + andq $-16,%rcx + xorq %r8,%r8 +L$open_sse_tail_192_rounds_and_x1hash: + addq 0+0(%rsi,%r8,1),%r10 + adcq 8+0(%rsi,%r8,1),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +L$open_sse_tail_192_rounds: + addq $16,%r8 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 + palignr $4,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $12,%xmm12,%xmm12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 + palignr $4,%xmm5,%xmm5 + palignr $8,%xmm9,%xmm9 + palignr $12,%xmm13,%xmm13 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + 
pshufb L$rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 + palignr $4,%xmm6,%xmm6 + palignr $8,%xmm10,%xmm10 + palignr $12,%xmm14,%xmm14 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 + palignr $12,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $4,%xmm12,%xmm12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 + palignr $12,%xmm5,%xmm5 + palignr $8,%xmm9,%xmm9 + palignr $4,%xmm13,%xmm13 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 + palignr $12,%xmm6,%xmm6 + palignr $8,%xmm10,%xmm10 + palignr $4,%xmm14,%xmm14 + + cmpq %rcx,%r8 + jb L$open_sse_tail_192_rounds_and_x1hash + cmpq $160,%r8 + jne L$open_sse_tail_192_rounds + cmpq $176,%rbx + jb L$open_sse_tail_192_finish + addq 0+160(%rsi),%r10 + adcq 8+160(%rsi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + 
movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + cmpq $192,%rbx + jb L$open_sse_tail_192_finish + addq 0+176(%rsi),%r10 + adcq 8+176(%rsi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +L$open_sse_tail_192_finish: + paddd L$chacha20_consts(%rip),%xmm2 + paddd 0+48(%rbp),%xmm6 + paddd 0+64(%rbp),%xmm10 + paddd 0+128(%rbp),%xmm14 + paddd L$chacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 0(%rdi) + movdqu %xmm6,16 + 0(%rdi) + movdqu %xmm10,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + 
pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 64(%rdi) + movdqu %xmm5,16 + 64(%rdi) + movdqu %xmm9,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + + subq $128,%rbx + leaq 128(%rsi),%rsi + leaq 128(%rdi),%rdi + jmp L$open_sse_tail_64_dec_loop + +L$open_sse_tail_256: + movdqa L$chacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa %xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa %xmm0,%xmm3 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa 0+96(%rbp),%xmm15 + paddd L$sse_inc(%rip),%xmm15 + movdqa %xmm15,%xmm14 + paddd L$sse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd L$sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + movdqa %xmm14,0+128(%rbp) + movdqa %xmm15,0+144(%rbp) + + xorq %r8,%r8 +L$open_sse_tail_256_rounds_and_x1hash: + addq 0+0(%rsi,%r8,1),%r10 + adcq 8+0(%rsi,%r8,1),%r11 + adcq $1,%r12 + movdqa %xmm11,0+80(%rbp) + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm4 + pxor %xmm11,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm4 + pxor %xmm11,%xmm4 + palignr $4,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $12,%xmm12,%xmm12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm5 + pxor %xmm11,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm5 + pxor %xmm11,%xmm5 + palignr $4,%xmm5,%xmm5 + palignr $8,%xmm9,%xmm9 + palignr $12,%xmm13,%xmm13 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + 
pshufb L$rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm6 + pxor %xmm11,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm6 + pxor %xmm11,%xmm6 + palignr $4,%xmm6,%xmm6 + palignr $8,%xmm10,%xmm10 + palignr $12,%xmm14,%xmm14 + movdqa 0+80(%rbp),%xmm11 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movdqa %xmm9,0+80(%rbp) + paddd %xmm7,%xmm3 + pxor %xmm3,%xmm15 + pshufb L$rol16(%rip),%xmm15 + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm9 + pslld $12,%xmm9 + psrld $20,%xmm7 + pxor %xmm9,%xmm7 + paddd %xmm7,%xmm3 + pxor %xmm3,%xmm15 + pshufb L$rol8(%rip),%xmm15 + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm9 + pslld $7,%xmm9 + psrld $25,%xmm7 + pxor %xmm9,%xmm7 + palignr $4,%xmm7,%xmm7 + palignr $8,%xmm11,%xmm11 + palignr $12,%xmm15,%xmm15 + movdqa 0+80(%rbp),%xmm9 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + movdqa %xmm11,0+80(%rbp) + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm4 + pxor %xmm11,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm4 + pxor %xmm11,%xmm4 + palignr $12,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $4,%xmm12,%xmm12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm5 + pxor %xmm11,%xmm5 + paddd %xmm5,%xmm1 + pxor 
%xmm1,%xmm13 + pshufb L$rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm5 + pxor %xmm11,%xmm5 + palignr $12,%xmm5,%xmm5 + palignr $8,%xmm9,%xmm9 + palignr $4,%xmm13,%xmm13 + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm6 + pxor %xmm11,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm6 + pxor %xmm11,%xmm6 + palignr $12,%xmm6,%xmm6 + palignr $8,%xmm10,%xmm10 + palignr $4,%xmm14,%xmm14 + movdqa 0+80(%rbp),%xmm11 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + movdqa %xmm9,0+80(%rbp) + paddd %xmm7,%xmm3 + pxor %xmm3,%xmm15 + pshufb L$rol16(%rip),%xmm15 + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm9 + pslld $12,%xmm9 + psrld $20,%xmm7 + pxor %xmm9,%xmm7 + paddd %xmm7,%xmm3 + pxor %xmm3,%xmm15 + pshufb L$rol8(%rip),%xmm15 + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm9 + pslld $7,%xmm9 + psrld $25,%xmm7 + pxor %xmm9,%xmm7 + palignr $12,%xmm7,%xmm7 + palignr $8,%xmm11,%xmm11 + palignr $4,%xmm15,%xmm15 + movdqa 0+80(%rbp),%xmm9 + + addq $16,%r8 + cmpq $160,%r8 + jb L$open_sse_tail_256_rounds_and_x1hash + + movq %rbx,%rcx + andq $-16,%rcx +L$open_sse_tail_256_hash: + addq 0+0(%rsi,%r8,1),%r10 + adcq 8+0(%rsi,%r8,1),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + 
movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + addq $16,%r8 + cmpq %rcx,%r8 + jb L$open_sse_tail_256_hash + paddd L$chacha20_consts(%rip),%xmm3 + paddd 0+48(%rbp),%xmm7 + paddd 0+64(%rbp),%xmm11 + paddd 0+144(%rbp),%xmm15 + paddd L$chacha20_consts(%rip),%xmm2 + paddd 0+48(%rbp),%xmm6 + paddd 0+64(%rbp),%xmm10 + paddd 0+128(%rbp),%xmm14 + paddd L$chacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + movdqa %xmm12,0+80(%rbp) + movdqu 0 + 0(%rsi),%xmm12 + pxor %xmm3,%xmm12 + movdqu %xmm12,0 + 0(%rdi) + movdqu 16 + 0(%rsi),%xmm12 + pxor %xmm7,%xmm12 + movdqu %xmm12,16 + 0(%rdi) + movdqu 32 + 0(%rsi),%xmm12 + pxor %xmm11,%xmm12 + movdqu %xmm12,32 + 0(%rdi) + movdqu 48 + 0(%rsi),%xmm12 + pxor %xmm15,%xmm12 + movdqu %xmm12,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 64(%rdi) + movdqu %xmm6,16 + 64(%rdi) + movdqu %xmm10,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + movdqu 0 + 128(%rsi),%xmm3 + movdqu 16 + 128(%rsi),%xmm7 + movdqu 32 + 128(%rsi),%xmm11 + movdqu 48 + 128(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 128(%rdi) + movdqu %xmm5,16 + 128(%rdi) + movdqu %xmm9,32 + 128(%rdi) + movdqu %xmm15,48 + 128(%rdi) + + movdqa 0+80(%rbp),%xmm12 + subq $192,%rbx + leaq 192(%rsi),%rsi + leaq 192(%rdi),%rdi + + +L$open_sse_tail_64_dec_loop: + cmpq $16,%rbx + jb 
L$open_sse_tail_16_init + subq $16,%rbx + movdqu (%rsi),%xmm3 + pxor %xmm3,%xmm0 + movdqu %xmm0,(%rdi) + leaq 16(%rsi),%rsi + leaq 16(%rdi),%rdi + movdqa %xmm4,%xmm0 + movdqa %xmm8,%xmm4 + movdqa %xmm12,%xmm8 + jmp L$open_sse_tail_64_dec_loop +L$open_sse_tail_16_init: + movdqa %xmm0,%xmm1 + + +L$open_sse_tail_16: + testq %rbx,%rbx + jz L$open_sse_finalize + + + + pxor %xmm3,%xmm3 + leaq -1(%rsi,%rbx,1),%rsi + movq %rbx,%r8 +L$open_sse_tail_16_compose: + pslldq $1,%xmm3 + pinsrb $0,(%rsi),%xmm3 + subq $1,%rsi + subq $1,%r8 + jnz L$open_sse_tail_16_compose + + movq %xmm3,%r13 + pextrq $1,%xmm3,%r14 + + pxor %xmm1,%xmm3 + + +L$open_sse_tail_16_extract: + pextrb $0,%xmm3,(%rdi) + psrldq $1,%xmm3 + addq $1,%rdi + subq $1,%rbx + jne L$open_sse_tail_16_extract + + addq %r13,%r10 + adcq %r14,%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + +L$open_sse_finalize: + addq 0+0+32(%rbp),%r10 + adcq 8+0+32(%rbp),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq 
%r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + movq %r10,%r13 + movq %r11,%r14 + movq %r12,%r15 + subq $-5,%r10 + sbbq $-1,%r11 + sbbq $3,%r12 + cmovcq %r13,%r10 + cmovcq %r14,%r11 + cmovcq %r15,%r12 + + addq 0+0+16(%rbp),%r10 + adcq 8+0+16(%rbp),%r11 + + + addq $288 + 0 + 32,%rsp + + + popq %r9 + + movq %r10,(%r9) + movq %r11,8(%r9) + popq %r15 + + popq %r14 + + popq %r13 + + popq %r12 + + popq %rbx + + popq %rbp + + ret + +L$open_sse_128: + + movdqu L$chacha20_consts(%rip),%xmm0 + movdqa %xmm0,%xmm1 + movdqa %xmm0,%xmm2 + movdqu 0(%r9),%xmm4 + movdqa %xmm4,%xmm5 + movdqa %xmm4,%xmm6 + movdqu 16(%r9),%xmm8 + movdqa %xmm8,%xmm9 + movdqa %xmm8,%xmm10 + movdqu 32(%r9),%xmm12 + movdqa %xmm12,%xmm13 + paddd L$sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm14 + paddd L$sse_inc(%rip),%xmm14 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa %xmm13,%xmm15 + movq $10,%r10 + +L$open_sse_128_rounds: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 + palignr $4,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $12,%xmm12,%xmm12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 + palignr $4,%xmm5,%xmm5 + palignr $8,%xmm9,%xmm9 + palignr $12,%xmm13,%xmm13 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol16(%rip),%xmm14 + paddd 
%xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 + palignr $4,%xmm6,%xmm6 + palignr $8,%xmm10,%xmm10 + palignr $12,%xmm14,%xmm14 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 + palignr $12,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $4,%xmm12,%xmm12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 + palignr $12,%xmm5,%xmm5 + palignr $8,%xmm9,%xmm9 + palignr $4,%xmm13,%xmm13 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 + palignr $12,%xmm6,%xmm6 + palignr $8,%xmm10,%xmm10 + palignr $4,%xmm14,%xmm14 + + decq %r10 + jnz L$open_sse_128_rounds + paddd L$chacha20_consts(%rip),%xmm0 + paddd L$chacha20_consts(%rip),%xmm1 + paddd L$chacha20_consts(%rip),%xmm2 + paddd %xmm7,%xmm4 + paddd %xmm7,%xmm5 + paddd %xmm7,%xmm6 + paddd %xmm11,%xmm9 + paddd %xmm11,%xmm10 + paddd 
%xmm15,%xmm13 + paddd L$sse_inc(%rip),%xmm15 + paddd %xmm15,%xmm14 + + pand L$clamp(%rip),%xmm0 + movdqa %xmm0,0+0(%rbp) + movdqa %xmm4,0+16(%rbp) + + movq %r8,%r8 + call poly_hash_ad_internal +L$open_sse_128_xor_hash: + cmpq $16,%rbx + jb L$open_sse_tail_16 + subq $16,%rbx + addq 0+0(%rsi),%r10 + adcq 8+0(%rsi),%r11 + adcq $1,%r12 + + + movdqu 0(%rsi),%xmm3 + pxor %xmm3,%xmm1 + movdqu %xmm1,0(%rdi) + leaq 16(%rsi),%rsi + leaq 16(%rdi),%rdi + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + movdqa %xmm5,%xmm1 + movdqa %xmm9,%xmm5 + movdqa %xmm13,%xmm9 + movdqa %xmm2,%xmm13 + movdqa %xmm6,%xmm2 + movdqa %xmm10,%xmm6 + movdqa %xmm14,%xmm10 + jmp L$open_sse_128_xor_hash + + + + + + + + + +.globl _chacha20_poly1305_seal_sse41 +.private_extern _chacha20_poly1305_seal_sse41 + +.p2align 6 +_chacha20_poly1305_seal_sse41: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + + + pushq %r9 + + subq $288 + 0 + 32,%rsp + + leaq 32(%rsp),%rbp + andq $-32,%rbp + + movq 56(%r9),%rbx + addq %rdx,%rbx + movq %r8,0+0+32(%rbp) + movq %rbx,8+0+32(%rbp) + movq %rdx,%rbx + + cmpq $128,%rbx + jbe L$seal_sse_128 + + movdqa L$chacha20_consts(%rip),%xmm0 + movdqu 0(%r9),%xmm4 + movdqu 16(%r9),%xmm8 + movdqu 32(%r9),%xmm12 + + movdqa %xmm0,%xmm1 + movdqa %xmm0,%xmm2 + movdqa %xmm0,%xmm3 + movdqa %xmm4,%xmm5 + movdqa %xmm4,%xmm6 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm9 + 
movdqa %xmm8,%xmm10 + movdqa %xmm8,%xmm11 + movdqa %xmm12,%xmm15 + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,%xmm14 + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,%xmm13 + paddd L$sse_inc(%rip),%xmm12 + + movdqa %xmm4,0+48(%rbp) + movdqa %xmm8,0+64(%rbp) + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + movdqa %xmm14,0+128(%rbp) + movdqa %xmm15,0+144(%rbp) + movq $10,%r10 +L$seal_sse_init_rounds: + movdqa %xmm8,0+80(%rbp) + movdqa L$rol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 + pshufb %xmm8,%xmm15 + pshufb %xmm8,%xmm14 + pshufb %xmm8,%xmm13 + pshufb %xmm8,%xmm12 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movdqa L$rol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 + pshufb %xmm8,%xmm15 + pshufb %xmm8,%xmm14 + pshufb %xmm8,%xmm13 + pshufb %xmm8,%xmm12 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + 
psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 0+80(%rbp),%xmm8 + palignr $4,%xmm7,%xmm7 + palignr $8,%xmm11,%xmm11 + palignr $12,%xmm15,%xmm15 + palignr $4,%xmm6,%xmm6 + palignr $8,%xmm10,%xmm10 + palignr $12,%xmm14,%xmm14 + palignr $4,%xmm5,%xmm5 + palignr $8,%xmm9,%xmm9 + palignr $12,%xmm13,%xmm13 + palignr $4,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $12,%xmm12,%xmm12 + movdqa %xmm8,0+80(%rbp) + movdqa L$rol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 + pshufb %xmm8,%xmm15 + pshufb %xmm8,%xmm14 + pshufb %xmm8,%xmm13 + pshufb %xmm8,%xmm12 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movdqa L$rol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 + pshufb %xmm8,%xmm15 + pshufb %xmm8,%xmm14 + pshufb %xmm8,%xmm13 + pshufb %xmm8,%xmm12 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld 
$25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 0+80(%rbp),%xmm8 + palignr $12,%xmm7,%xmm7 + palignr $8,%xmm11,%xmm11 + palignr $4,%xmm15,%xmm15 + palignr $12,%xmm6,%xmm6 + palignr $8,%xmm10,%xmm10 + palignr $4,%xmm14,%xmm14 + palignr $12,%xmm5,%xmm5 + palignr $8,%xmm9,%xmm9 + palignr $4,%xmm13,%xmm13 + palignr $12,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $4,%xmm12,%xmm12 + + decq %r10 + jnz L$seal_sse_init_rounds + paddd L$chacha20_consts(%rip),%xmm3 + paddd 0+48(%rbp),%xmm7 + paddd 0+64(%rbp),%xmm11 + paddd 0+144(%rbp),%xmm15 + paddd L$chacha20_consts(%rip),%xmm2 + paddd 0+48(%rbp),%xmm6 + paddd 0+64(%rbp),%xmm10 + paddd 0+128(%rbp),%xmm14 + paddd L$chacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + + + pand L$clamp(%rip),%xmm3 + movdqa %xmm3,0+0(%rbp) + movdqa %xmm7,0+16(%rbp) + + movq %r8,%r8 + call poly_hash_ad_internal + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 0(%rdi) + movdqu %xmm6,16 + 0(%rdi) + movdqu %xmm10,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 64(%rdi) + movdqu %xmm5,16 + 64(%rdi) + movdqu %xmm9,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + + cmpq $192,%rbx + ja L$seal_sse_main_init + movq $128,%rcx + subq $128,%rbx + leaq 128(%rsi),%rsi + jmp L$seal_sse_128_tail_hash +L$seal_sse_main_init: + movdqu 0 + 128(%rsi),%xmm3 + movdqu 16 + 128(%rsi),%xmm7 + movdqu 32 + 128(%rsi),%xmm11 + movdqu 48 + 128(%rsi),%xmm15 + pxor %xmm3,%xmm0 + pxor %xmm7,%xmm4 + pxor %xmm11,%xmm8 + pxor %xmm12,%xmm15 + 
movdqu %xmm0,0 + 128(%rdi) + movdqu %xmm4,16 + 128(%rdi) + movdqu %xmm8,32 + 128(%rdi) + movdqu %xmm15,48 + 128(%rdi) + + movq $192,%rcx + subq $192,%rbx + leaq 192(%rsi),%rsi + movq $2,%rcx + movq $8,%r8 + cmpq $64,%rbx + jbe L$seal_sse_tail_64 + cmpq $128,%rbx + jbe L$seal_sse_tail_128 + cmpq $192,%rbx + jbe L$seal_sse_tail_192 + +L$seal_sse_main_loop: + movdqa L$chacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa %xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa %xmm0,%xmm3 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa 0+96(%rbp),%xmm15 + paddd L$sse_inc(%rip),%xmm15 + movdqa %xmm15,%xmm14 + paddd L$sse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd L$sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + movdqa %xmm14,0+128(%rbp) + movdqa %xmm15,0+144(%rbp) + +.p2align 5 +L$seal_sse_main_rounds: + movdqa %xmm8,0+80(%rbp) + movdqa L$rol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 + pshufb %xmm8,%xmm15 + pshufb %xmm8,%xmm14 + pshufb %xmm8,%xmm13 + pshufb %xmm8,%xmm12 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 
0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movdqa L$rol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 + pshufb %xmm8,%xmm15 + pshufb %xmm8,%xmm14 + pshufb %xmm8,%xmm13 + pshufb %xmm8,%xmm12 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 0+80(%rbp),%xmm8 + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + palignr $4,%xmm7,%xmm7 + palignr $8,%xmm11,%xmm11 + palignr $12,%xmm15,%xmm15 + palignr $4,%xmm6,%xmm6 + palignr $8,%xmm10,%xmm10 + palignr $12,%xmm14,%xmm14 + palignr $4,%xmm5,%xmm5 + palignr $8,%xmm9,%xmm9 + palignr $12,%xmm13,%xmm13 + palignr $4,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $12,%xmm12,%xmm12 + movdqa %xmm8,0+80(%rbp) + movdqa L$rol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 + pshufb %xmm8,%xmm15 + pshufb %xmm8,%xmm14 + pshufb %xmm8,%xmm13 + pshufb %xmm8,%xmm12 + movdqa 
0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movdqa L$rol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 + pshufb %xmm8,%xmm15 + pshufb %xmm8,%xmm14 + pshufb %xmm8,%xmm13 + pshufb %xmm8,%xmm12 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 0+80(%rbp),%xmm8 + palignr $12,%xmm7,%xmm7 + palignr $8,%xmm11,%xmm11 + palignr $4,%xmm15,%xmm15 + palignr $12,%xmm6,%xmm6 + palignr $8,%xmm10,%xmm10 + palignr $4,%xmm14,%xmm14 + palignr $12,%xmm5,%xmm5 + palignr $8,%xmm9,%xmm9 + palignr $4,%xmm13,%xmm13 + palignr $12,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $4,%xmm12,%xmm12 + + leaq 16(%rdi),%rdi + decq %r8 + jge L$seal_sse_main_rounds + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + 
movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi + decq %rcx + jg L$seal_sse_main_rounds + paddd L$chacha20_consts(%rip),%xmm3 + paddd 0+48(%rbp),%xmm7 + paddd 0+64(%rbp),%xmm11 + paddd 0+144(%rbp),%xmm15 + paddd L$chacha20_consts(%rip),%xmm2 + paddd 0+48(%rbp),%xmm6 + paddd 0+64(%rbp),%xmm10 + paddd 0+128(%rbp),%xmm14 + paddd L$chacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + + movdqa %xmm14,0+80(%rbp) + movdqa %xmm14,0+80(%rbp) + movdqu 0 + 0(%rsi),%xmm14 + pxor %xmm3,%xmm14 + movdqu %xmm14,0 + 0(%rdi) + movdqu 16 + 0(%rsi),%xmm14 + pxor %xmm7,%xmm14 + movdqu %xmm14,16 + 0(%rdi) + movdqu 32 + 0(%rsi),%xmm14 + pxor %xmm11,%xmm14 + movdqu %xmm14,32 + 0(%rdi) + movdqu 48 + 0(%rsi),%xmm14 + pxor %xmm15,%xmm14 + movdqu %xmm14,48 + 0(%rdi) + + movdqa 0+80(%rbp),%xmm14 + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 64(%rdi) + movdqu %xmm6,16 + 64(%rdi) + movdqu %xmm10,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + movdqu 0 + 128(%rsi),%xmm3 + movdqu 16 + 128(%rsi),%xmm7 + movdqu 32 + 128(%rsi),%xmm11 + movdqu 48 + 128(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 128(%rdi) + movdqu %xmm5,16 + 128(%rdi) + movdqu %xmm9,32 + 128(%rdi) + movdqu %xmm15,48 + 128(%rdi) + + cmpq $256,%rbx + 
ja L$seal_sse_main_loop_xor + + movq $192,%rcx + subq $192,%rbx + leaq 192(%rsi),%rsi + jmp L$seal_sse_128_tail_hash +L$seal_sse_main_loop_xor: + movdqu 0 + 192(%rsi),%xmm3 + movdqu 16 + 192(%rsi),%xmm7 + movdqu 32 + 192(%rsi),%xmm11 + movdqu 48 + 192(%rsi),%xmm15 + pxor %xmm3,%xmm0 + pxor %xmm7,%xmm4 + pxor %xmm11,%xmm8 + pxor %xmm12,%xmm15 + movdqu %xmm0,0 + 192(%rdi) + movdqu %xmm4,16 + 192(%rdi) + movdqu %xmm8,32 + 192(%rdi) + movdqu %xmm15,48 + 192(%rdi) + + leaq 256(%rsi),%rsi + subq $256,%rbx + movq $6,%rcx + movq $4,%r8 + cmpq $192,%rbx + jg L$seal_sse_main_loop + movq %rbx,%rcx + testq %rbx,%rbx + je L$seal_sse_128_tail_hash + movq $6,%rcx + cmpq $128,%rbx + ja L$seal_sse_tail_192 + cmpq $64,%rbx + ja L$seal_sse_tail_128 + +L$seal_sse_tail_64: + movdqa L$chacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa 0+96(%rbp),%xmm12 + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + +L$seal_sse_tail_64_rounds_and_x2hash: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +L$seal_sse_tail_64_rounds_and_x1hash: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd 
%xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 + palignr $4,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $12,%xmm12,%xmm12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 + palignr $12,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $4,%xmm12,%xmm12 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi + decq %rcx + jg L$seal_sse_tail_64_rounds_and_x2hash + decq %r8 + jge L$seal_sse_tail_64_rounds_and_x1hash + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + + jmp L$seal_sse_128_tail_xor + +L$seal_sse_tail_128: + movdqa L$chacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa 0+96(%rbp),%xmm13 + paddd L$sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + +L$seal_sse_tail_128_rounds_and_x2hash: + addq 
0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +L$seal_sse_tail_128_rounds_and_x1hash: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 + palignr $4,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $12,%xmm12,%xmm12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 + palignr $4,%xmm5,%xmm5 + palignr $8,%xmm9,%xmm9 + palignr $12,%xmm13,%xmm13 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + 
mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 + palignr $12,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $4,%xmm12,%xmm12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 + palignr $12,%xmm5,%xmm5 + palignr $8,%xmm9,%xmm9 + palignr $4,%xmm13,%xmm13 + + leaq 16(%rdi),%rdi + decq %rcx + jg L$seal_sse_tail_128_rounds_and_x2hash + decq %r8 + jge L$seal_sse_tail_128_rounds_and_x1hash + paddd L$chacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 0(%rdi) + movdqu %xmm5,16 + 0(%rdi) + movdqu %xmm9,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + + movq $64,%rcx + subq $64,%rbx + leaq 64(%rsi),%rsi + jmp L$seal_sse_128_tail_hash + +L$seal_sse_tail_192: + 
movdqa L$chacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa %xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa 0+96(%rbp),%xmm14 + paddd L$sse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd L$sse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + movdqa %xmm14,0+128(%rbp) + +L$seal_sse_tail_192_rounds_and_x2hash: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +L$seal_sse_tail_192_rounds_and_x1hash: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 + palignr $4,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $12,%xmm12,%xmm12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + 
movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 + palignr $4,%xmm5,%xmm5 + palignr $8,%xmm9,%xmm9 + palignr $12,%xmm13,%xmm13 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 + palignr $4,%xmm6,%xmm6 + palignr $8,%xmm10,%xmm10 + palignr $12,%xmm14,%xmm14 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 + palignr $12,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $4,%xmm12,%xmm12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol8(%rip),%xmm13 + paddd 
%xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 + palignr $12,%xmm5,%xmm5 + palignr $8,%xmm9,%xmm9 + palignr $4,%xmm13,%xmm13 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 + palignr $12,%xmm6,%xmm6 + palignr $8,%xmm10,%xmm10 + palignr $4,%xmm14,%xmm14 + + leaq 16(%rdi),%rdi + decq %rcx + jg L$seal_sse_tail_192_rounds_and_x2hash + decq %r8 + jge L$seal_sse_tail_192_rounds_and_x1hash + paddd L$chacha20_consts(%rip),%xmm2 + paddd 0+48(%rbp),%xmm6 + paddd 0+64(%rbp),%xmm10 + paddd 0+128(%rbp),%xmm14 + paddd L$chacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd L$chacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 0(%rdi) + movdqu %xmm6,16 + 0(%rdi) + movdqu %xmm10,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 64(%rdi) + movdqu %xmm5,16 + 64(%rdi) + movdqu %xmm9,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + + movq $128,%rcx + subq $128,%rbx + leaq 128(%rsi),%rsi + +L$seal_sse_128_tail_hash: + cmpq $16,%rcx + jb L$seal_sse_128_tail_xor + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + 
movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + subq $16,%rcx + leaq 16(%rdi),%rdi + jmp L$seal_sse_128_tail_hash + +L$seal_sse_128_tail_xor: + cmpq $16,%rbx + jb L$seal_sse_tail_16 + subq $16,%rbx + + movdqu 0(%rsi),%xmm3 + pxor %xmm3,%xmm0 + movdqu %xmm0,0(%rdi) + + addq 0(%rdi),%r10 + adcq 8(%rdi),%r11 + adcq $1,%r12 + leaq 16(%rsi),%rsi + leaq 16(%rdi),%rdi + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + movdqa %xmm4,%xmm0 + movdqa %xmm8,%xmm4 + movdqa %xmm12,%xmm8 + movdqa %xmm1,%xmm12 + movdqa %xmm5,%xmm1 + movdqa %xmm9,%xmm5 + movdqa %xmm13,%xmm9 + jmp L$seal_sse_128_tail_xor + +L$seal_sse_tail_16: + testq %rbx,%rbx + jz L$process_blocks_of_extra_in + + movq %rbx,%r8 + movq %rbx,%rcx + leaq -1(%rsi,%rbx,1),%rsi + pxor %xmm15,%xmm15 +L$seal_sse_tail_16_compose: + pslldq $1,%xmm15 + pinsrb $0,(%rsi),%xmm15 + leaq -1(%rsi),%rsi + decq %rcx + jne L$seal_sse_tail_16_compose + + + pxor 
%xmm0,%xmm15 + + + movq %rbx,%rcx + movdqu %xmm15,%xmm0 +L$seal_sse_tail_16_extract: + pextrb $0,%xmm0,(%rdi) + psrldq $1,%xmm0 + addq $1,%rdi + subq $1,%rcx + jnz L$seal_sse_tail_16_extract + + + + + + + + + movq 288 + 0 + 32(%rsp),%r9 + movq 56(%r9),%r14 + movq 48(%r9),%r13 + testq %r14,%r14 + jz L$process_partial_block + + movq $16,%r15 + subq %rbx,%r15 + cmpq %r15,%r14 + + jge L$load_extra_in + movq %r14,%r15 + +L$load_extra_in: + + + leaq -1(%r13,%r15,1),%rsi + + + addq %r15,%r13 + subq %r15,%r14 + movq %r13,48(%r9) + movq %r14,56(%r9) + + + + addq %r15,%r8 + + + pxor %xmm11,%xmm11 +L$load_extra_load_loop: + pslldq $1,%xmm11 + pinsrb $0,(%rsi),%xmm11 + leaq -1(%rsi),%rsi + subq $1,%r15 + jnz L$load_extra_load_loop + + + + + movq %rbx,%r15 + +L$load_extra_shift_loop: + pslldq $1,%xmm11 + subq $1,%r15 + jnz L$load_extra_shift_loop + + + + + leaq L$and_masks(%rip),%r15 + shlq $4,%rbx + pand -16(%r15,%rbx,1),%xmm15 + + + por %xmm11,%xmm15 + + + + movq %xmm15,%r13 + pextrq $1,%xmm15,%r14 + addq %r13,%r10 + adcq %r14,%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + +L$process_blocks_of_extra_in: + + movq 288+32+0 (%rsp),%r9 + movq 48(%r9),%rsi + movq 56(%r9),%r8 + movq %r8,%rcx + shrq $4,%r8 + +L$process_extra_hash_loop: + jz process_extra_in_trailer + addq 0+0(%rsi),%r10 + adcq 8+0(%rsi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + 
movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rsi),%rsi + subq $1,%r8 + jmp L$process_extra_hash_loop +process_extra_in_trailer: + andq $15,%rcx + movq %rcx,%rbx + jz L$do_length_block + leaq -1(%rsi,%rcx,1),%rsi + +L$process_extra_in_trailer_load: + pslldq $1,%xmm15 + pinsrb $0,(%rsi),%xmm15 + leaq -1(%rsi),%rsi + subq $1,%rcx + jnz L$process_extra_in_trailer_load + +L$process_partial_block: + + leaq L$and_masks(%rip),%r15 + shlq $4,%rbx + pand -16(%r15,%rbx,1),%xmm15 + movq %xmm15,%r13 + pextrq $1,%xmm15,%r14 + addq %r13,%r10 + adcq %r14,%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + +L$do_length_block: + addq 0+0+32(%rbp),%r10 + adcq 8+0+32(%rbp),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + 
movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + movq %r10,%r13 + movq %r11,%r14 + movq %r12,%r15 + subq $-5,%r10 + sbbq $-1,%r11 + sbbq $3,%r12 + cmovcq %r13,%r10 + cmovcq %r14,%r11 + cmovcq %r15,%r12 + + addq 0+0+16(%rbp),%r10 + adcq 8+0+16(%rbp),%r11 + + + addq $288 + 0 + 32,%rsp + + + popq %r9 + + movq %r10,(%r9) + movq %r11,8(%r9) + popq %r15 + + popq %r14 + + popq %r13 + + popq %r12 + + popq %rbx + + popq %rbp + + ret + +L$seal_sse_128: + + movdqu L$chacha20_consts(%rip),%xmm0 + movdqa %xmm0,%xmm1 + movdqa %xmm0,%xmm2 + movdqu 0(%r9),%xmm4 + movdqa %xmm4,%xmm5 + movdqa %xmm4,%xmm6 + movdqu 16(%r9),%xmm8 + movdqa %xmm8,%xmm9 + movdqa %xmm8,%xmm10 + movdqu 32(%r9),%xmm14 + movdqa %xmm14,%xmm12 + paddd L$sse_inc(%rip),%xmm12 + movdqa %xmm12,%xmm13 + paddd L$sse_inc(%rip),%xmm13 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa %xmm12,%xmm15 + movq $10,%r10 + +L$seal_sse_128_rounds: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 + palignr $4,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $12,%xmm12,%xmm12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol8(%rip),%xmm13 + paddd 
%xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 + palignr $4,%xmm5,%xmm5 + palignr $8,%xmm9,%xmm9 + palignr $12,%xmm13,%xmm13 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 + palignr $4,%xmm6,%xmm6 + palignr $8,%xmm10,%xmm10 + palignr $12,%xmm14,%xmm14 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb L$rol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 + palignr $12,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $4,%xmm12,%xmm12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb L$rol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 + palignr $12,%xmm5,%xmm5 + palignr $8,%xmm9,%xmm9 + palignr $4,%xmm13,%xmm13 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb L$rol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 + palignr $12,%xmm6,%xmm6 + palignr $8,%xmm10,%xmm10 + palignr $4,%xmm14,%xmm14 + + decq 
%r10 + jnz L$seal_sse_128_rounds + paddd L$chacha20_consts(%rip),%xmm0 + paddd L$chacha20_consts(%rip),%xmm1 + paddd L$chacha20_consts(%rip),%xmm2 + paddd %xmm7,%xmm4 + paddd %xmm7,%xmm5 + paddd %xmm7,%xmm6 + paddd %xmm11,%xmm8 + paddd %xmm11,%xmm9 + paddd %xmm15,%xmm12 + paddd L$sse_inc(%rip),%xmm15 + paddd %xmm15,%xmm13 + + pand L$clamp(%rip),%xmm2 + movdqa %xmm2,0+0(%rbp) + movdqa %xmm6,0+16(%rbp) + + movq %r8,%r8 + call poly_hash_ad_internal + jmp L$seal_sse_128_tail_xor + + + + +.globl _chacha20_poly1305_open_avx2 +.private_extern _chacha20_poly1305_open_avx2 + +.p2align 6 +_chacha20_poly1305_open_avx2: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + + + pushq %r9 + + subq $288 + 0 + 32,%rsp + + + leaq 32(%rsp),%rbp + andq $-32,%rbp + + movq %rdx,%rbx + movq %r8,0+0+32(%rbp) + movq %rbx,8+0+32(%rbp) + + vzeroupper + vmovdqa L$chacha20_consts(%rip),%ymm0 + vbroadcasti128 0(%r9),%ymm4 + vbroadcasti128 16(%r9),%ymm8 + vbroadcasti128 32(%r9),%ymm12 + vpaddd L$avx2_init(%rip),%ymm12,%ymm12 + cmpq $192,%rbx + jbe L$open_avx2_192 + cmpq $320,%rbx + jbe L$open_avx2_320 + + vmovdqa %ymm4,0+64(%rbp) + vmovdqa %ymm8,0+96(%rbp) + vmovdqa %ymm12,0+160(%rbp) + movq $10,%r10 +L$open_avx2_init_rounds: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld 
$12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + + decq %r10 + jne L$open_avx2_init_rounds + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + + vpand L$clamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0+0(%rbp) + + vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 + + movq %r8,%r8 + call poly_hash_ad_internal + + xorq %rcx,%rcx +L$open_avx2_init_hash: + addq 0+0(%rsi,%rcx,1),%r10 + adcq 8+0(%rsi,%rcx,1),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + addq $16,%rcx + cmpq $64,%rcx + jne L$open_avx2_init_hash + + vpxor 0(%rsi),%ymm0,%ymm0 + vpxor 32(%rsi),%ymm4,%ymm4 + + vmovdqu %ymm0,0(%rdi) + vmovdqu %ymm4,32(%rdi) + leaq 64(%rsi),%rsi + leaq 64(%rdi),%rdi + subq $64,%rbx +L$open_avx2_main_loop: + + cmpq $512,%rbx + jb L$open_avx2_main_loop_done + vmovdqa L$chacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa 
%ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,0+256(%rbp) + vmovdqa %ymm14,0+224(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm12,0+160(%rbp) + + xorq %rcx,%rcx +L$open_avx2_main_loop_rounds: + addq 0+0(%rsi,%rcx,1),%r10 + adcq 8+0(%rsi,%rcx,1),%r11 + adcq $1,%r12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + addq %rax,%r15 + adcq %rdx,%r9 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa L$rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + movq %r13,%r10 + movq %r14,%r11 + 
movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + addq 0+16(%rsi,%rcx,1),%r10 + adcq 8+16(%rsi,%rcx,1),%r11 + adcq $1,%r12 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor 
%ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + addq %rax,%r15 + adcq %rdx,%r9 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + addq 0+32(%rsi,%rcx,1),%r10 + adcq 8+32(%rsi,%rcx,1),%r11 + adcq $1,%r12 + + leaq 48(%rcx),%rcx + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa L$rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) 
+ vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + addq %rax,%r15 + adcq %rdx,%r9 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + cmpq $60*8,%rcx + jne L$open_avx2_main_loop_rounds + vpaddd L$chacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 0+64(%rbp),%ymm7,%ymm7 + vpaddd 0+96(%rbp),%ymm11,%ymm11 + vpaddd 0+256(%rbp),%ymm15,%ymm15 + vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + + vmovdqa %ymm0,0+128(%rbp) + addq 0+60*8(%rsi),%r10 + adcq 8+60*8(%rsi),%r11 + adcq $1,%r12 + vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 + vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 + vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vpxor 0+0(%rsi),%ymm0,%ymm0 + vpxor 32+0(%rsi),%ymm3,%ymm3 + vpxor 64+0(%rsi),%ymm7,%ymm7 + vpxor 
96+0(%rsi),%ymm11,%ymm11 + vmovdqu %ymm0,0+0(%rdi) + vmovdqu %ymm3,32+0(%rdi) + vmovdqu %ymm7,64+0(%rdi) + vmovdqu %ymm11,96+0(%rdi) + + vmovdqa 0+128(%rbp),%ymm0 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm2,%ymm2 + vpxor 64+128(%rsi),%ymm6,%ymm6 + vpxor 96+128(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm2,32+128(%rdi) + vmovdqu %ymm6,64+128(%rdi) + vmovdqu %ymm10,96+128(%rdi) + addq 0+60*8+16(%rsi),%r10 + adcq 8+60*8+16(%rsi),%r11 + adcq $1,%r12 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+256(%rsi),%ymm3,%ymm3 + vpxor 32+256(%rsi),%ymm1,%ymm1 + vpxor 64+256(%rsi),%ymm5,%ymm5 + vpxor 96+256(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+256(%rdi) + vmovdqu %ymm1,32+256(%rdi) + vmovdqu %ymm5,64+256(%rdi) + vmovdqu %ymm9,96+256(%rdi) + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx 
+ imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x13,%ymm0,%ymm4,%ymm4 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm8 + vpxor 0+384(%rsi),%ymm3,%ymm3 + vpxor 32+384(%rsi),%ymm0,%ymm0 + vpxor 64+384(%rsi),%ymm4,%ymm4 + vpxor 96+384(%rsi),%ymm8,%ymm8 + vmovdqu %ymm3,0+384(%rdi) + vmovdqu %ymm0,32+384(%rdi) + vmovdqu %ymm4,64+384(%rdi) + vmovdqu %ymm8,96+384(%rdi) + + leaq 512(%rsi),%rsi + leaq 512(%rdi),%rdi + subq $512,%rbx + jmp L$open_avx2_main_loop +L$open_avx2_main_loop_done: + testq %rbx,%rbx + vzeroupper + je L$open_sse_finalize + + cmpq $384,%rbx + ja L$open_avx2_tail_512 + cmpq $256,%rbx + ja L$open_avx2_tail_384 + cmpq $128,%rbx + ja L$open_avx2_tail_256 + vmovdqa L$chacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + + xorq %r8,%r8 + movq %rbx,%rcx + andq $-16,%rcx + testq %rcx,%rcx + je L$open_avx2_tail_128_rounds +L$open_avx2_tail_128_rounds_and_x1hash: + addq 0+0(%rsi,%r8,1),%r10 + adcq 8+0(%rsi,%r8,1),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + 
adcq $0,%r12 + +L$open_avx2_tail_128_rounds: + addq $16,%r8 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + + cmpq %rcx,%r8 + jb L$open_avx2_tail_128_rounds_and_x1hash + cmpq $160,%r8 + jne L$open_avx2_tail_128_rounds + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + jmp L$open_avx2_tail_128_xor + +L$open_avx2_tail_256: + vmovdqa L$chacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + + movq %rbx,0+128(%rbp) + movq %rbx,%rcx + 
subq $128,%rcx + shrq $4,%rcx + movq $10,%r8 + cmpq $10,%rcx + cmovgq %r8,%rcx + movq %rsi,%rbx + xorq %r8,%r8 +L$open_avx2_tail_256_rounds_and_x1hash: + addq 0+0(%rbx),%r10 + adcq 8+0(%rbx),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rbx),%rbx +L$open_avx2_tail_256_rounds: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + + incq %r8 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + 
vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + + cmpq %rcx,%r8 + jb L$open_avx2_tail_256_rounds_and_x1hash + cmpq $10,%r8 + jne L$open_avx2_tail_256_rounds + movq %rbx,%r8 + subq %rsi,%rbx + movq %rbx,%rcx + movq 0+128(%rbp),%rbx +L$open_avx2_tail_256_hash: + addq $16,%rcx + cmpq %rbx,%rcx + jg L$open_avx2_tail_256_done + addq 0+0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq 
%rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 + jmp L$open_avx2_tail_256_hash +L$open_avx2_tail_256_done: + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+0(%rsi),%ymm3,%ymm3 + vpxor 32+0(%rsi),%ymm1,%ymm1 + vpxor 64+0(%rsi),%ymm5,%ymm5 + vpxor 96+0(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+0(%rdi) + vmovdqu %ymm1,32+0(%rdi) + vmovdqu %ymm5,64+0(%rdi) + vmovdqu %ymm9,96+0(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + leaq 128(%rsi),%rsi + leaq 128(%rdi),%rdi + subq $128,%rbx + jmp L$open_avx2_tail_128_xor + +L$open_avx2_tail_384: + vmovdqa L$chacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm14,0+224(%rbp) + + movq %rbx,0+128(%rbp) + movq %rbx,%rcx + subq $256,%rcx + shrq $4,%rcx + addq $6,%rcx 
+ movq $10,%r8 + cmpq $10,%rcx + cmovgq %r8,%rcx + movq %rsi,%rbx + xorq %r8,%r8 +L$open_avx2_tail_384_rounds_and_x2hash: + addq 0+0(%rbx),%r10 + adcq 8+0(%rbx),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rbx),%rbx +L$open_avx2_tail_384_rounds_and_x1hash: + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 
+ vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + addq 0+0(%rbx),%r10 + adcq 8+0(%rbx),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rbx),%rbx + incq %r8 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb 
L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + + cmpq %rcx,%r8 + jb L$open_avx2_tail_384_rounds_and_x2hash + cmpq $10,%r8 + jne L$open_avx2_tail_384_rounds_and_x1hash + movq %rbx,%r8 + subq %rsi,%rbx + movq %rbx,%rcx + movq 0+128(%rbp),%rbx +L$open_avx2_384_tail_hash: + addq $16,%rcx + cmpq %rbx,%rcx + jg L$open_avx2_384_tail_done + addq 0+0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 + jmp L$open_avx2_384_tail_hash +L$open_avx2_384_tail_done: + vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 
0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+0(%rsi),%ymm3,%ymm3 + vpxor 32+0(%rsi),%ymm2,%ymm2 + vpxor 64+0(%rsi),%ymm6,%ymm6 + vpxor 96+0(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+0(%rdi) + vmovdqu %ymm2,32+0(%rdi) + vmovdqu %ymm6,64+0(%rdi) + vmovdqu %ymm10,96+0(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm1,%ymm1 + vpxor 64+128(%rsi),%ymm5,%ymm5 + vpxor 96+128(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm1,32+128(%rdi) + vmovdqu %ymm5,64+128(%rdi) + vmovdqu %ymm9,96+128(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + leaq 256(%rsi),%rsi + leaq 256(%rdi),%rdi + subq $256,%rbx + jmp L$open_avx2_tail_128_xor + +L$open_avx2_tail_512: + vmovdqa L$chacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,0+256(%rbp) + vmovdqa %ymm14,0+224(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm12,0+160(%rbp) + + xorq %rcx,%rcx + movq %rsi,%r8 +L$open_avx2_tail_512_rounds_and_x2hash: + addq 0+0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 
0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 +L$open_avx2_tail_512_rounds_and_x1hash: + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa L$rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + addq 0+0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq 
%r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + addq 0+16(%r8),%r10 + adcq 8+16(%r8),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq 
%r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%r8),%r8 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa L$rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + 
vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + incq %rcx + cmpq $4,%rcx + jl L$open_avx2_tail_512_rounds_and_x2hash + cmpq $10,%rcx + jne L$open_avx2_tail_512_rounds_and_x1hash + movq %rbx,%rcx + subq $384,%rcx + andq $-16,%rcx +L$open_avx2_tail_512_hash: + testq %rcx,%rcx + je L$open_avx2_tail_512_done + addq 0+0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 + subq $16,%rcx + jmp L$open_avx2_tail_512_hash +L$open_avx2_tail_512_done: + vpaddd L$chacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 0+64(%rbp),%ymm7,%ymm7 + vpaddd 0+96(%rbp),%ymm11,%ymm11 + vpaddd 0+256(%rbp),%ymm15,%ymm15 + vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd 
L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + + vmovdqa %ymm0,0+128(%rbp) + vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 + vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 + vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vpxor 0+0(%rsi),%ymm0,%ymm0 + vpxor 32+0(%rsi),%ymm3,%ymm3 + vpxor 64+0(%rsi),%ymm7,%ymm7 + vpxor 96+0(%rsi),%ymm11,%ymm11 + vmovdqu %ymm0,0+0(%rdi) + vmovdqu %ymm3,32+0(%rdi) + vmovdqu %ymm7,64+0(%rdi) + vmovdqu %ymm11,96+0(%rdi) + + vmovdqa 0+128(%rbp),%ymm0 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm2,%ymm2 + vpxor 64+128(%rsi),%ymm6,%ymm6 + vpxor 96+128(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm2,32+128(%rdi) + vmovdqu %ymm6,64+128(%rdi) + vmovdqu %ymm10,96+128(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+256(%rsi),%ymm3,%ymm3 + vpxor 32+256(%rsi),%ymm1,%ymm1 + vpxor 64+256(%rsi),%ymm5,%ymm5 + vpxor 96+256(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+256(%rdi) + vmovdqu %ymm1,32+256(%rdi) + vmovdqu %ymm5,64+256(%rdi) + vmovdqu %ymm9,96+256(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + leaq 384(%rsi),%rsi + leaq 384(%rdi),%rdi + subq $384,%rbx +L$open_avx2_tail_128_xor: + cmpq $32,%rbx + jb L$open_avx2_tail_32_xor + subq $32,%rbx + vpxor (%rsi),%ymm0,%ymm0 + vmovdqu %ymm0,(%rdi) + leaq 32(%rsi),%rsi + leaq 32(%rdi),%rdi + vmovdqa %ymm4,%ymm0 + vmovdqa %ymm8,%ymm4 + vmovdqa %ymm12,%ymm8 + jmp L$open_avx2_tail_128_xor +L$open_avx2_tail_32_xor: + cmpq $16,%rbx + vmovdqa %xmm0,%xmm1 + jb L$open_avx2_exit + subq $16,%rbx + + 
vpxor (%rsi),%xmm0,%xmm1 + vmovdqu %xmm1,(%rdi) + leaq 16(%rsi),%rsi + leaq 16(%rdi),%rdi + vperm2i128 $0x11,%ymm0,%ymm0,%ymm0 + vmovdqa %xmm0,%xmm1 +L$open_avx2_exit: + vzeroupper + jmp L$open_sse_tail_16 + +L$open_avx2_192: + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm8,%ymm10 + vpaddd L$avx2_inc(%rip),%ymm12,%ymm13 + vmovdqa %ymm12,%ymm11 + vmovdqa %ymm13,%ymm15 + movq $10,%r10 +L$open_avx2_192_rounds: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + 
vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + + decq %r10 + jne L$open_avx2_192_rounds + vpaddd %ymm2,%ymm0,%ymm0 + vpaddd %ymm2,%ymm1,%ymm1 + vpaddd %ymm6,%ymm4,%ymm4 + vpaddd %ymm6,%ymm5,%ymm5 + vpaddd %ymm10,%ymm8,%ymm8 + vpaddd %ymm10,%ymm9,%ymm9 + vpaddd %ymm11,%ymm12,%ymm12 + vpaddd %ymm15,%ymm13,%ymm13 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + + vpand L$clamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0+0(%rbp) + + vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 +L$open_avx2_short: + movq %r8,%r8 + call poly_hash_ad_internal +L$open_avx2_short_hash_and_xor_loop: + cmpq $32,%rbx + jb L$open_avx2_short_tail_32 + subq $32,%rbx + addq 0+0(%rsi),%r10 + adcq 8+0(%rsi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + 
shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + addq 0+16(%rsi),%r10 + adcq 8+16(%rsi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + vpxor (%rsi),%ymm0,%ymm0 + vmovdqu %ymm0,(%rdi) + leaq 32(%rsi),%rsi + leaq 32(%rdi),%rdi + + vmovdqa %ymm4,%ymm0 + vmovdqa %ymm8,%ymm4 + vmovdqa %ymm12,%ymm8 + vmovdqa %ymm1,%ymm12 + vmovdqa %ymm5,%ymm1 + vmovdqa %ymm9,%ymm5 + vmovdqa %ymm13,%ymm9 + vmovdqa %ymm2,%ymm13 + vmovdqa %ymm6,%ymm2 + jmp L$open_avx2_short_hash_and_xor_loop +L$open_avx2_short_tail_32: + cmpq $16,%rbx + vmovdqa %xmm0,%xmm1 + jb L$open_avx2_short_tail_32_exit + subq $16,%rbx + addq 0+0(%rsi),%r10 + adcq 8+0(%rsi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + vpxor (%rsi),%xmm0,%xmm3 + vmovdqu %xmm3,(%rdi) + leaq 
16(%rsi),%rsi + leaq 16(%rdi),%rdi + vextracti128 $1,%ymm0,%xmm1 +L$open_avx2_short_tail_32_exit: + vzeroupper + jmp L$open_sse_tail_16 + +L$open_avx2_320: + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm8,%ymm10 + vpaddd L$avx2_inc(%rip),%ymm12,%ymm13 + vpaddd L$avx2_inc(%rip),%ymm13,%ymm14 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm14,0+224(%rbp) + movq $10,%r10 +L$open_avx2_320_rounds: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor 
%ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + + decq %r10 + jne L$open_avx2_320_rounds + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 
L$chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd %ymm7,%ymm4,%ymm4 + vpaddd %ymm7,%ymm5,%ymm5 + vpaddd %ymm7,%ymm6,%ymm6 + vpaddd %ymm11,%ymm8,%ymm8 + vpaddd %ymm11,%ymm9,%ymm9 + vpaddd %ymm11,%ymm10,%ymm10 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + + vpand L$clamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0+0(%rbp) + + vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm9 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm13 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm6 + jmp L$open_avx2_short + + + + +.globl _chacha20_poly1305_seal_avx2 +.private_extern _chacha20_poly1305_seal_avx2 + +.p2align 6 +_chacha20_poly1305_seal_avx2: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r13 + + pushq %r14 + + pushq %r15 + + + + pushq %r9 + + subq $288 + 0 + 32,%rsp + + leaq 32(%rsp),%rbp + andq $-32,%rbp + + movq 56(%r9),%rbx + addq %rdx,%rbx + movq %r8,0+0+32(%rbp) + movq %rbx,8+0+32(%rbp) + movq %rdx,%rbx + + vzeroupper + vmovdqa L$chacha20_consts(%rip),%ymm0 + vbroadcasti128 0(%r9),%ymm4 + vbroadcasti128 16(%r9),%ymm8 + vbroadcasti128 32(%r9),%ymm12 + vpaddd L$avx2_init(%rip),%ymm12,%ymm12 + cmpq $192,%rbx + jbe L$seal_avx2_192 + cmpq $320,%rbx + jbe L$seal_avx2_320 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm4,0+64(%rbp) + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm8,%ymm11 + vmovdqa %ymm8,0+96(%rbp) + vmovdqa %ymm12,%ymm15 + vpaddd L$avx2_inc(%rip),%ymm15,%ymm14 + vpaddd L$avx2_inc(%rip),%ymm14,%ymm13 + vpaddd L$avx2_inc(%rip),%ymm13,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm14,0+224(%rbp) + 
vmovdqa %ymm15,0+256(%rbp) + movq $10,%r10 +L$seal_avx2_init_rounds: + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa L$rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 
0+128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa L$rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + 
vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + decq %r10 + jnz L$seal_avx2_init_rounds + vpaddd L$chacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 0+64(%rbp),%ymm7,%ymm7 + vpaddd 0+96(%rbp),%ymm11,%ymm11 + vpaddd 0+256(%rbp),%ymm15,%ymm15 + vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vperm2i128 $0x02,%ymm3,%ymm7,%ymm15 + vperm2i128 $0x13,%ymm3,%ymm7,%ymm3 + vpand L$clamp(%rip),%ymm15,%ymm15 + vmovdqa %ymm15,0+0(%rbp) + movq %r8,%r8 + call poly_hash_ad_internal + + vpxor 0(%rsi),%ymm3,%ymm3 + vpxor 32(%rsi),%ymm11,%ymm11 + vmovdqu %ymm3,0(%rdi) + vmovdqu %ymm11,32(%rdi) + vperm2i128 $0x02,%ymm2,%ymm6,%ymm15 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+64(%rsi),%ymm15,%ymm15 + vpxor 32+64(%rsi),%ymm2,%ymm2 + vpxor 64+64(%rsi),%ymm6,%ymm6 + vpxor 
96+64(%rsi),%ymm10,%ymm10 + vmovdqu %ymm15,0+64(%rdi) + vmovdqu %ymm2,32+64(%rdi) + vmovdqu %ymm6,64+64(%rdi) + vmovdqu %ymm10,96+64(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm15 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+192(%rsi),%ymm15,%ymm15 + vpxor 32+192(%rsi),%ymm1,%ymm1 + vpxor 64+192(%rsi),%ymm5,%ymm5 + vpxor 96+192(%rsi),%ymm9,%ymm9 + vmovdqu %ymm15,0+192(%rdi) + vmovdqu %ymm1,32+192(%rdi) + vmovdqu %ymm5,64+192(%rdi) + vmovdqu %ymm9,96+192(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm15 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm15,%ymm8 + + leaq 320(%rsi),%rsi + subq $320,%rbx + movq $320,%rcx + cmpq $128,%rbx + jbe L$seal_avx2_short_hash_remainder + vpxor 0(%rsi),%ymm0,%ymm0 + vpxor 32(%rsi),%ymm4,%ymm4 + vpxor 64(%rsi),%ymm8,%ymm8 + vpxor 96(%rsi),%ymm12,%ymm12 + vmovdqu %ymm0,320(%rdi) + vmovdqu %ymm4,352(%rdi) + vmovdqu %ymm8,384(%rdi) + vmovdqu %ymm12,416(%rdi) + leaq 128(%rsi),%rsi + subq $128,%rbx + movq $8,%rcx + movq $2,%r8 + cmpq $128,%rbx + jbe L$seal_avx2_tail_128 + cmpq $256,%rbx + jbe L$seal_avx2_tail_256 + cmpq $384,%rbx + jbe L$seal_avx2_tail_384 + cmpq $512,%rbx + jbe L$seal_avx2_tail_512 + vmovdqa L$chacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,0+256(%rbp) + vmovdqa %ymm14,0+224(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd 
%ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa L$rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr 
$12,%ymm14,%ymm14,%ymm14 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa L$rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld 
$25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa L$rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + + subq $16,%rdi + movq $9,%rcx + jmp L$seal_avx2_main_loop_rounds_entry +.p2align 5 +L$seal_avx2_main_loop: + vmovdqa L$chacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa 
%ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,0+256(%rbp) + vmovdqa %ymm14,0+224(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm12,0+160(%rbp) + + movq $10,%rcx +.p2align 5 +L$seal_avx2_main_loop_rounds: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + addq %rax,%r15 + adcq %rdx,%r9 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa L$rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + movq %r13,%r10 + movq %r14,%r11 + 
movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +L$seal_avx2_main_loop_rounds_entry: + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 
+ vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + addq %rax,%r15 + adcq %rdx,%r9 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + addq 0+32(%rdi),%r10 + adcq 8+32(%rdi),%r11 + adcq $1,%r12 + + leaq 48(%rdi),%rdi + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa L$rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + 
vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + addq %rax,%r15 + adcq %rdx,%r9 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + decq %rcx + jne L$seal_avx2_main_loop_rounds + vpaddd L$chacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 0+64(%rbp),%ymm7,%ymm7 + vpaddd 0+96(%rbp),%ymm11,%ymm11 + vpaddd 0+256(%rbp),%ymm15,%ymm15 + vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + + vmovdqa %ymm0,0+128(%rbp) + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq 
%r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 + vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 + vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vpxor 0+0(%rsi),%ymm0,%ymm0 + vpxor 32+0(%rsi),%ymm3,%ymm3 + vpxor 64+0(%rsi),%ymm7,%ymm7 + vpxor 96+0(%rsi),%ymm11,%ymm11 + vmovdqu %ymm0,0+0(%rdi) + vmovdqu %ymm3,32+0(%rdi) + vmovdqu %ymm7,64+0(%rdi) + vmovdqu %ymm11,96+0(%rdi) + + vmovdqa 0+128(%rbp),%ymm0 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm2,%ymm2 + vpxor 64+128(%rsi),%ymm6,%ymm6 + vpxor 96+128(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm2,32+128(%rdi) + vmovdqu %ymm6,64+128(%rdi) + vmovdqu %ymm10,96+128(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+256(%rsi),%ymm3,%ymm3 + vpxor 32+256(%rsi),%ymm1,%ymm1 + vpxor 64+256(%rsi),%ymm5,%ymm5 + vpxor 96+256(%rsi),%ymm9,%ymm9 + 
vmovdqu %ymm3,0+256(%rdi) + vmovdqu %ymm1,32+256(%rdi) + vmovdqu %ymm5,64+256(%rdi) + vmovdqu %ymm9,96+256(%rdi) + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x13,%ymm0,%ymm4,%ymm4 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm8 + vpxor 0+384(%rsi),%ymm3,%ymm3 + vpxor 32+384(%rsi),%ymm0,%ymm0 + vpxor 64+384(%rsi),%ymm4,%ymm4 + vpxor 96+384(%rsi),%ymm8,%ymm8 + vmovdqu %ymm3,0+384(%rdi) + vmovdqu %ymm0,32+384(%rdi) + vmovdqu %ymm4,64+384(%rdi) + vmovdqu %ymm8,96+384(%rdi) + + leaq 512(%rsi),%rsi + subq $512,%rbx + cmpq $512,%rbx + jg L$seal_avx2_main_loop + + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + movq $10,%rcx + xorq %r8,%r8 + + cmpq $384,%rbx + ja L$seal_avx2_tail_512 + cmpq $256,%rbx + ja L$seal_avx2_tail_384 + cmpq $128,%rbx + ja L$seal_avx2_tail_256 + +L$seal_avx2_tail_128: 
+ vmovdqa L$chacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + +L$seal_avx2_tail_128_rounds_and_3xhash: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +L$seal_avx2_tail_128_rounds_and_2xhash: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq 
%r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + decq %rcx + jg L$seal_avx2_tail_128_rounds_and_3xhash + decq %r8 + jge L$seal_avx2_tail_128_rounds_and_2xhash + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + jmp L$seal_avx2_short_loop + +L$seal_avx2_tail_256: + vmovdqa L$chacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + vmovdqa 
%ymm13,0+192(%rbp) + +L$seal_avx2_tail_256_rounds_and_3xhash: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +L$seal_avx2_tail_256_rounds_and_2xhash: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + 
mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq 
%rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + decq %rcx + jg L$seal_avx2_tail_256_rounds_and_3xhash + decq %r8 + jge L$seal_avx2_tail_256_rounds_and_2xhash + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+0(%rsi),%ymm3,%ymm3 + vpxor 32+0(%rsi),%ymm1,%ymm1 + vpxor 64+0(%rsi),%ymm5,%ymm5 + vpxor 96+0(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+0(%rdi) + vmovdqu %ymm1,32+0(%rdi) + vmovdqu %ymm5,64+0(%rdi) + vmovdqu %ymm9,96+0(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + movq $128,%rcx + leaq 128(%rsi),%rsi + subq $128,%rbx + jmp L$seal_avx2_short_hash_remainder + +L$seal_avx2_tail_384: + vmovdqa L$chacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm14,0+224(%rbp) + +L$seal_avx2_tail_384_rounds_and_3xhash: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq 
$1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +L$seal_avx2_tail_384_rounds_and_2xhash: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq 
%rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + 
addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + + leaq 32(%rdi),%rdi + decq %rcx + jg L$seal_avx2_tail_384_rounds_and_3xhash + decq %r8 + jge L$seal_avx2_tail_384_rounds_and_2xhash + vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 
$0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+0(%rsi),%ymm3,%ymm3 + vpxor 32+0(%rsi),%ymm2,%ymm2 + vpxor 64+0(%rsi),%ymm6,%ymm6 + vpxor 96+0(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+0(%rdi) + vmovdqu %ymm2,32+0(%rdi) + vmovdqu %ymm6,64+0(%rdi) + vmovdqu %ymm10,96+0(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm1,%ymm1 + vpxor 64+128(%rsi),%ymm5,%ymm5 + vpxor 96+128(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm1,32+128(%rdi) + vmovdqu %ymm5,64+128(%rdi) + vmovdqu %ymm9,96+128(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + movq $256,%rcx + leaq 256(%rsi),%rsi + subq $256,%rbx + jmp L$seal_avx2_short_hash_remainder + +L$seal_avx2_tail_512: + vmovdqa L$chacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa L$avx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,0+256(%rbp) + vmovdqa %ymm14,0+224(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm12,0+160(%rbp) + +L$seal_avx2_tail_512_rounds_and_3xhash: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq 
%rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +L$seal_avx2_tail_512_rounds_and_2xhash: + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa L$rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor 
%ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + addq %rax,%r15 + adcq %rdx,%r9 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa L$rol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld 
$20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa L$rol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + + + + + + + + + + + + + + + + addq %rax,%r15 + adcq 
%rdx,%r9 + + + + + + + + + + + + + + + + + + + + + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + decq %rcx + jg L$seal_avx2_tail_512_rounds_and_3xhash + decq %r8 + jge L$seal_avx2_tail_512_rounds_and_2xhash + vpaddd L$chacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 0+64(%rbp),%ymm7,%ymm7 + vpaddd 0+96(%rbp),%ymm11,%ymm11 + vpaddd 0+256(%rbp),%ymm15,%ymm15 + vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + + vmovdqa %ymm0,0+128(%rbp) + vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 + vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 + vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vpxor 0+0(%rsi),%ymm0,%ymm0 + vpxor 32+0(%rsi),%ymm3,%ymm3 + vpxor 64+0(%rsi),%ymm7,%ymm7 + vpxor 96+0(%rsi),%ymm11,%ymm11 + vmovdqu %ymm0,0+0(%rdi) + vmovdqu %ymm3,32+0(%rdi) + vmovdqu %ymm7,64+0(%rdi) + vmovdqu %ymm11,96+0(%rdi) + + vmovdqa 0+128(%rbp),%ymm0 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm2,%ymm2 + vpxor 64+128(%rsi),%ymm6,%ymm6 + vpxor 96+128(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm2,32+128(%rdi) + vmovdqu %ymm6,64+128(%rdi) + vmovdqu %ymm10,96+128(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 
0+256(%rsi),%ymm3,%ymm3 + vpxor 32+256(%rsi),%ymm1,%ymm1 + vpxor 64+256(%rsi),%ymm5,%ymm5 + vpxor 96+256(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+256(%rdi) + vmovdqu %ymm1,32+256(%rdi) + vmovdqu %ymm5,64+256(%rdi) + vmovdqu %ymm9,96+256(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + movq $384,%rcx + leaq 384(%rsi),%rsi + subq $384,%rbx + jmp L$seal_avx2_short_hash_remainder + +L$seal_avx2_320: + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm8,%ymm10 + vpaddd L$avx2_inc(%rip),%ymm12,%ymm13 + vpaddd L$avx2_inc(%rip),%ymm13,%ymm14 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm14,0+224(%rbp) + movq $10,%r10 +L$seal_avx2_320_rounds: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr 
$4,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb L$rol8(%rip),%ymm14,%ymm14 + vpaddd 
%ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + + decq %r10 + jne L$seal_avx2_320_rounds + vpaddd L$chacha20_consts(%rip),%ymm0,%ymm0 + vpaddd L$chacha20_consts(%rip),%ymm1,%ymm1 + vpaddd L$chacha20_consts(%rip),%ymm2,%ymm2 + vpaddd %ymm7,%ymm4,%ymm4 + vpaddd %ymm7,%ymm5,%ymm5 + vpaddd %ymm7,%ymm6,%ymm6 + vpaddd %ymm11,%ymm8,%ymm8 + vpaddd %ymm11,%ymm9,%ymm9 + vpaddd %ymm11,%ymm10,%ymm10 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + + vpand L$clamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0+0(%rbp) + + vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm9 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm13 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm6 + jmp L$seal_avx2_short + +L$seal_avx2_192: + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm8,%ymm10 + vpaddd L$avx2_inc(%rip),%ymm12,%ymm13 + vmovdqa %ymm12,%ymm11 + vmovdqa %ymm13,%ymm15 + movq $10,%r10 +L$seal_avx2_192_rounds: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr 
$4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb L$rol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb L$rol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + + decq %r10 + jne L$seal_avx2_192_rounds + vpaddd %ymm2,%ymm0,%ymm0 + vpaddd %ymm2,%ymm1,%ymm1 + vpaddd %ymm6,%ymm4,%ymm4 + vpaddd %ymm6,%ymm5,%ymm5 + vpaddd %ymm10,%ymm8,%ymm8 + vpaddd %ymm10,%ymm9,%ymm9 + vpaddd %ymm11,%ymm12,%ymm12 + vpaddd %ymm15,%ymm13,%ymm13 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + + vpand L$clamp(%rip),%ymm3,%ymm3 + 
vmovdqa %ymm3,0+0(%rbp) + + vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 +L$seal_avx2_short: + movq %r8,%r8 + call poly_hash_ad_internal + xorq %rcx,%rcx +L$seal_avx2_short_hash_remainder: + cmpq $16,%rcx + jb L$seal_avx2_short_loop + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + subq $16,%rcx + addq $16,%rdi + jmp L$seal_avx2_short_hash_remainder +L$seal_avx2_short_loop: + cmpq $32,%rbx + jb L$seal_avx2_short_tail + subq $32,%rbx + + vpxor (%rsi),%ymm0,%ymm0 + vmovdqu %ymm0,(%rdi) + leaq 32(%rsi),%rsi + + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + 
adcq $0,%r12 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + + vmovdqa %ymm4,%ymm0 + vmovdqa %ymm8,%ymm4 + vmovdqa %ymm12,%ymm8 + vmovdqa %ymm1,%ymm12 + vmovdqa %ymm5,%ymm1 + vmovdqa %ymm9,%ymm5 + vmovdqa %ymm13,%ymm9 + vmovdqa %ymm2,%ymm13 + vmovdqa %ymm6,%ymm2 + jmp L$seal_avx2_short_loop +L$seal_avx2_short_tail: + cmpq $16,%rbx + jb L$seal_avx2_exit + subq $16,%rbx + vpxor (%rsi),%xmm0,%xmm3 + vmovdqu %xmm3,(%rdi) + leaq 16(%rsi),%rsi + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi + vextracti128 $1,%ymm0,%xmm0 +L$seal_avx2_exit: + vzeroupper + jmp L$seal_sse_tail_16 + + +#endif diff --git a/third_party/boringssl/gen/crypto/chacha20_poly1305_x86_64-linux.S 
b/third_party/boringssl/gen/crypto/chacha20_poly1305_x86_64-linux.S new file mode 100644 index 00000000..6fd94c84 --- /dev/null +++ b/third_party/boringssl/gen/crypto/chacha20_poly1305_x86_64-linux.S @@ -0,0 +1,8940 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.section .rodata +.align 64 +chacha20_poly1305_constants: +.Lchacha20_consts: +.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' +.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' +.Lrol8: +.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 +.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 +.Lrol16: +.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 +.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 +.Lavx2_init: +.long 0,0,0,0 +.Lsse_inc: +.long 1,0,0,0 +.Lavx2_inc: +.long 2,0,0,0,2,0,0,0 +.Lclamp: +.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC +.quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF +.align 16 +.Land_masks: +.byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00 +.byte 
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00 +.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff +.text + +.type poly_hash_ad_internal,@function +.align 64 +poly_hash_ad_internal: +.cfi_startproc +.cfi_def_cfa rsp, 8 + xorq %r10,%r10 + xorq %r11,%r11 + xorq %r12,%r12 + cmpq $13,%r8 + jne .Lhash_ad_loop +.Lpoly_fast_tls_ad: + + movq (%rcx),%r10 + movq 5(%rcx),%r11 + shrq $24,%r11 + movq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + ret +.Lhash_ad_loop: + + cmpq $16,%r8 + jb .Lhash_ad_tail + addq 0+0(%rcx),%r10 + adcq 8+0(%rcx),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq 
%r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rcx),%rcx + subq $16,%r8 + jmp .Lhash_ad_loop +.Lhash_ad_tail: + cmpq $0,%r8 + je .Lhash_ad_done + + xorq %r13,%r13 + xorq %r14,%r14 + xorq %r15,%r15 + addq %r8,%rcx +.Lhash_ad_tail_loop: + shldq $8,%r13,%r14 + shlq $8,%r13 + movzbq -1(%rcx),%r15 + xorq %r15,%r13 + decq %rcx + decq %r8 + jne .Lhash_ad_tail_loop + + addq %r13,%r10 + adcq %r14,%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + +.Lhash_ad_done: + ret +.cfi_endproc +.size poly_hash_ad_internal, .-poly_hash_ad_internal + +.globl chacha20_poly1305_open_sse41 +.hidden chacha20_poly1305_open_sse41 +.type chacha20_poly1305_open_sse41,@function +.align 64 +chacha20_poly1305_open_sse41: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + + + pushq %r9 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r9,-64 + subq $288 + 0 + 32,%rsp +.cfi_adjust_cfa_offset 288 + 32 + + leaq 32(%rsp),%rbp + andq 
$-32,%rbp + + movq %rdx,%rbx + movq %r8,0+0+32(%rbp) + movq %rbx,8+0+32(%rbp) + + cmpq $128,%rbx + jbe .Lopen_sse_128 + + movdqa .Lchacha20_consts(%rip),%xmm0 + movdqu 0(%r9),%xmm4 + movdqu 16(%r9),%xmm8 + movdqu 32(%r9),%xmm12 + + movdqa %xmm12,%xmm7 + + movdqa %xmm4,0+48(%rbp) + movdqa %xmm8,0+64(%rbp) + movdqa %xmm12,0+96(%rbp) + movq $10,%r10 +.Lopen_sse_init_rounds: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 + palignr $4,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $12,%xmm12,%xmm12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 + palignr $12,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $4,%xmm12,%xmm12 + + decq %r10 + jne .Lopen_sse_init_rounds + + paddd .Lchacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + + pand .Lclamp(%rip),%xmm0 + movdqa %xmm0,0+0(%rbp) + movdqa %xmm4,0+16(%rbp) + + movq %r8,%r8 + call poly_hash_ad_internal +.Lopen_sse_main_loop: + cmpq $256,%rbx + jb .Lopen_sse_tail + + movdqa .Lchacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa %xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa %xmm0,%xmm3 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa 0+96(%rbp),%xmm15 + paddd .Lsse_inc(%rip),%xmm15 + movdqa %xmm15,%xmm14 + paddd .Lsse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd 
.Lsse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd .Lsse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + movdqa %xmm14,0+128(%rbp) + movdqa %xmm15,0+144(%rbp) + + + + movq $4,%rcx + movq %rsi,%r8 +.Lopen_sse_main_loop_rounds: + movdqa %xmm8,0+80(%rbp) + movdqa .Lrol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 + pshufb %xmm8,%xmm15 + pshufb %xmm8,%xmm14 + pshufb %xmm8,%xmm13 + pshufb %xmm8,%xmm12 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + addq 0+0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + + leaq 16(%r8),%r8 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movdqa .Lrol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 + pshufb %xmm8,%xmm15 + pshufb %xmm8,%xmm14 + pshufb %xmm8,%xmm13 + pshufb %xmm8,%xmm12 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa 
%xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 0+80(%rbp),%xmm8 + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + palignr $4,%xmm7,%xmm7 + palignr $8,%xmm11,%xmm11 + palignr $12,%xmm15,%xmm15 + palignr $4,%xmm6,%xmm6 + palignr $8,%xmm10,%xmm10 + palignr $12,%xmm14,%xmm14 + palignr $4,%xmm5,%xmm5 + palignr $8,%xmm9,%xmm9 + palignr $12,%xmm13,%xmm13 + palignr $4,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $12,%xmm12,%xmm12 + movdqa %xmm8,0+80(%rbp) + movdqa .Lrol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 + pshufb %xmm8,%xmm15 + pshufb %xmm8,%xmm14 + pshufb %xmm8,%xmm13 + pshufb %xmm8,%xmm12 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movdqa .Lrol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 + pshufb %xmm8,%xmm15 
+ pshufb %xmm8,%xmm14 + pshufb %xmm8,%xmm13 + pshufb %xmm8,%xmm12 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 0+80(%rbp),%xmm8 + palignr $12,%xmm7,%xmm7 + palignr $8,%xmm11,%xmm11 + palignr $4,%xmm15,%xmm15 + palignr $12,%xmm6,%xmm6 + palignr $8,%xmm10,%xmm10 + palignr $4,%xmm14,%xmm14 + palignr $12,%xmm5,%xmm5 + palignr $8,%xmm9,%xmm9 + palignr $4,%xmm13,%xmm13 + palignr $12,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $4,%xmm12,%xmm12 + + decq %rcx + jge .Lopen_sse_main_loop_rounds + addq 0+0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 + cmpq $-6,%rcx + jg .Lopen_sse_main_loop_rounds + paddd .Lchacha20_consts(%rip),%xmm3 + paddd 0+48(%rbp),%xmm7 + paddd 0+64(%rbp),%xmm11 + paddd 0+144(%rbp),%xmm15 + paddd .Lchacha20_consts(%rip),%xmm2 + paddd 0+48(%rbp),%xmm6 + paddd 0+64(%rbp),%xmm10 + paddd 0+128(%rbp),%xmm14 + paddd 
.Lchacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd .Lchacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + movdqa %xmm12,0+80(%rbp) + movdqu 0 + 0(%rsi),%xmm12 + pxor %xmm3,%xmm12 + movdqu %xmm12,0 + 0(%rdi) + movdqu 16 + 0(%rsi),%xmm12 + pxor %xmm7,%xmm12 + movdqu %xmm12,16 + 0(%rdi) + movdqu 32 + 0(%rsi),%xmm12 + pxor %xmm11,%xmm12 + movdqu %xmm12,32 + 0(%rdi) + movdqu 48 + 0(%rsi),%xmm12 + pxor %xmm15,%xmm12 + movdqu %xmm12,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 64(%rdi) + movdqu %xmm6,16 + 64(%rdi) + movdqu %xmm10,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + movdqu 0 + 128(%rsi),%xmm3 + movdqu 16 + 128(%rsi),%xmm7 + movdqu 32 + 128(%rsi),%xmm11 + movdqu 48 + 128(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 128(%rdi) + movdqu %xmm5,16 + 128(%rdi) + movdqu %xmm9,32 + 128(%rdi) + movdqu %xmm15,48 + 128(%rdi) + movdqu 0 + 192(%rsi),%xmm3 + movdqu 16 + 192(%rsi),%xmm7 + movdqu 32 + 192(%rsi),%xmm11 + movdqu 48 + 192(%rsi),%xmm15 + pxor %xmm3,%xmm0 + pxor %xmm7,%xmm4 + pxor %xmm11,%xmm8 + pxor 0+80(%rbp),%xmm15 + movdqu %xmm0,0 + 192(%rdi) + movdqu %xmm4,16 + 192(%rdi) + movdqu %xmm8,32 + 192(%rdi) + movdqu %xmm15,48 + 192(%rdi) + + leaq 256(%rsi),%rsi + leaq 256(%rdi),%rdi + subq $256,%rbx + jmp .Lopen_sse_main_loop +.Lopen_sse_tail: + + testq %rbx,%rbx + jz .Lopen_sse_finalize + cmpq $192,%rbx + ja .Lopen_sse_tail_256 + cmpq $128,%rbx + ja .Lopen_sse_tail_192 + cmpq $64,%rbx + ja .Lopen_sse_tail_128 + movdqa .Lchacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa 0+96(%rbp),%xmm12 + paddd .Lsse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + + xorq %r8,%r8 + movq 
%rbx,%rcx + cmpq $16,%rcx + jb .Lopen_sse_tail_64_rounds +.Lopen_sse_tail_64_rounds_and_x1hash: + addq 0+0(%rsi,%r8,1),%r10 + adcq 8+0(%rsi,%r8,1),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + subq $16,%rcx +.Lopen_sse_tail_64_rounds: + addq $16,%r8 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 + palignr $4,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $12,%xmm12,%xmm12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 + palignr $12,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $4,%xmm12,%xmm12 + + cmpq $16,%rcx + jae .Lopen_sse_tail_64_rounds_and_x1hash + cmpq $160,%r8 + jne .Lopen_sse_tail_64_rounds + paddd .Lchacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + + jmp 
.Lopen_sse_tail_64_dec_loop + +.Lopen_sse_tail_128: + movdqa .Lchacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa 0+96(%rbp),%xmm13 + paddd .Lsse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd .Lsse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + + movq %rbx,%rcx + andq $-16,%rcx + xorq %r8,%r8 +.Lopen_sse_tail_128_rounds_and_x1hash: + addq 0+0(%rsi,%r8,1),%r10 + adcq 8+0(%rsi,%r8,1),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +.Lopen_sse_tail_128_rounds: + addq $16,%r8 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 + palignr $4,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $12,%xmm12,%xmm12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld 
$25,%xmm5 + pxor %xmm3,%xmm5 + palignr $4,%xmm5,%xmm5 + palignr $8,%xmm9,%xmm9 + palignr $12,%xmm13,%xmm13 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 + palignr $12,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $4,%xmm12,%xmm12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 + palignr $12,%xmm5,%xmm5 + palignr $8,%xmm9,%xmm9 + palignr $4,%xmm13,%xmm13 + + cmpq %rcx,%r8 + jb .Lopen_sse_tail_128_rounds_and_x1hash + cmpq $160,%r8 + jne .Lopen_sse_tail_128_rounds + paddd .Lchacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd .Lchacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 0(%rdi) + movdqu %xmm5,16 + 0(%rdi) + movdqu %xmm9,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + + subq $64,%rbx + leaq 64(%rsi),%rsi + leaq 64(%rdi),%rdi + jmp .Lopen_sse_tail_64_dec_loop + +.Lopen_sse_tail_192: + movdqa .Lchacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa %xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa 
0+96(%rbp),%xmm14 + paddd .Lsse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd .Lsse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd .Lsse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + movdqa %xmm14,0+128(%rbp) + + movq %rbx,%rcx + movq $160,%r8 + cmpq $160,%rcx + cmovgq %r8,%rcx + andq $-16,%rcx + xorq %r8,%r8 +.Lopen_sse_tail_192_rounds_and_x1hash: + addq 0+0(%rsi,%r8,1),%r10 + adcq 8+0(%rsi,%r8,1),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +.Lopen_sse_tail_192_rounds: + addq $16,%r8 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 + palignr $4,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $12,%xmm12,%xmm12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 + palignr $4,%xmm5,%xmm5 + palignr $8,%xmm9,%xmm9 + 
palignr $12,%xmm13,%xmm13 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 + palignr $4,%xmm6,%xmm6 + palignr $8,%xmm10,%xmm10 + palignr $12,%xmm14,%xmm14 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 + palignr $12,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $4,%xmm12,%xmm12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 + palignr $12,%xmm5,%xmm5 + palignr $8,%xmm9,%xmm9 + palignr $4,%xmm13,%xmm13 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 + palignr $12,%xmm6,%xmm6 + palignr $8,%xmm10,%xmm10 + palignr $4,%xmm14,%xmm14 + + cmpq %rcx,%r8 + jb .Lopen_sse_tail_192_rounds_and_x1hash + cmpq $160,%r8 + jne .Lopen_sse_tail_192_rounds + cmpq $176,%rbx + jb .Lopen_sse_tail_192_finish + addq 
0+160(%rsi),%r10 + adcq 8+160(%rsi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + cmpq $192,%rbx + jb .Lopen_sse_tail_192_finish + addq 0+176(%rsi),%r10 + adcq 8+176(%rsi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +.Lopen_sse_tail_192_finish: + paddd .Lchacha20_consts(%rip),%xmm2 + paddd 0+48(%rbp),%xmm6 + paddd 0+64(%rbp),%xmm10 + paddd 0+128(%rbp),%xmm14 + paddd .Lchacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd .Lchacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu 
%xmm2,0 + 0(%rdi) + movdqu %xmm6,16 + 0(%rdi) + movdqu %xmm10,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 64(%rdi) + movdqu %xmm5,16 + 64(%rdi) + movdqu %xmm9,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + + subq $128,%rbx + leaq 128(%rsi),%rsi + leaq 128(%rdi),%rdi + jmp .Lopen_sse_tail_64_dec_loop + +.Lopen_sse_tail_256: + movdqa .Lchacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa %xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa %xmm0,%xmm3 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa 0+96(%rbp),%xmm15 + paddd .Lsse_inc(%rip),%xmm15 + movdqa %xmm15,%xmm14 + paddd .Lsse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd .Lsse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd .Lsse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + movdqa %xmm14,0+128(%rbp) + movdqa %xmm15,0+144(%rbp) + + xorq %r8,%r8 +.Lopen_sse_tail_256_rounds_and_x1hash: + addq 0+0(%rsi,%r8,1),%r10 + adcq 8+0(%rsi,%r8,1),%r11 + adcq $1,%r12 + movdqa %xmm11,0+80(%rbp) + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm4 + pxor %xmm11,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm4 + pxor %xmm11,%xmm4 + palignr $4,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $12,%xmm12,%xmm12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm5 + pxor %xmm11,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb 
.Lrol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm5 + pxor %xmm11,%xmm5 + palignr $4,%xmm5,%xmm5 + palignr $8,%xmm9,%xmm9 + palignr $12,%xmm13,%xmm13 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm6 + pxor %xmm11,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm6 + pxor %xmm11,%xmm6 + palignr $4,%xmm6,%xmm6 + palignr $8,%xmm10,%xmm10 + palignr $12,%xmm14,%xmm14 + movdqa 0+80(%rbp),%xmm11 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movdqa %xmm9,0+80(%rbp) + paddd %xmm7,%xmm3 + pxor %xmm3,%xmm15 + pshufb .Lrol16(%rip),%xmm15 + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm9 + pslld $12,%xmm9 + psrld $20,%xmm7 + pxor %xmm9,%xmm7 + paddd %xmm7,%xmm3 + pxor %xmm3,%xmm15 + pshufb .Lrol8(%rip),%xmm15 + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm9 + pslld $7,%xmm9 + psrld $25,%xmm7 + pxor %xmm9,%xmm7 + palignr $4,%xmm7,%xmm7 + palignr $8,%xmm11,%xmm11 + palignr $12,%xmm15,%xmm15 + movdqa 0+80(%rbp),%xmm9 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + movdqa %xmm11,0+80(%rbp) + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm4 + pxor %xmm11,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm4 + pxor %xmm11,%xmm4 + palignr $12,%xmm4,%xmm4 + palignr 
$8,%xmm8,%xmm8 + palignr $4,%xmm12,%xmm12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm5 + pxor %xmm11,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm5 + pxor %xmm11,%xmm5 + palignr $12,%xmm5,%xmm5 + palignr $8,%xmm9,%xmm9 + palignr $4,%xmm13,%xmm13 + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm11 + pslld $12,%xmm11 + psrld $20,%xmm6 + pxor %xmm11,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm11 + pslld $7,%xmm11 + psrld $25,%xmm6 + pxor %xmm11,%xmm6 + palignr $12,%xmm6,%xmm6 + palignr $8,%xmm10,%xmm10 + palignr $4,%xmm14,%xmm14 + movdqa 0+80(%rbp),%xmm11 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + movdqa %xmm9,0+80(%rbp) + paddd %xmm7,%xmm3 + pxor %xmm3,%xmm15 + pshufb .Lrol16(%rip),%xmm15 + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm9 + pslld $12,%xmm9 + psrld $20,%xmm7 + pxor %xmm9,%xmm7 + paddd %xmm7,%xmm3 + pxor %xmm3,%xmm15 + pshufb .Lrol8(%rip),%xmm15 + paddd %xmm15,%xmm11 + pxor %xmm11,%xmm7 + movdqa %xmm7,%xmm9 + pslld $7,%xmm9 + psrld $25,%xmm7 + pxor %xmm9,%xmm7 + palignr $12,%xmm7,%xmm7 + palignr $8,%xmm11,%xmm11 + palignr $4,%xmm15,%xmm15 + movdqa 0+80(%rbp),%xmm9 + + addq $16,%r8 + cmpq $160,%r8 + jb .Lopen_sse_tail_256_rounds_and_x1hash + + movq %rbx,%rcx + andq $-16,%rcx +.Lopen_sse_tail_256_hash: + addq 0+0(%rsi,%r8,1),%r10 + adcq 8+0(%rsi,%r8,1),%r11 + adcq $1,%r12 + movq 
0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + addq $16,%r8 + cmpq %rcx,%r8 + jb .Lopen_sse_tail_256_hash + paddd .Lchacha20_consts(%rip),%xmm3 + paddd 0+48(%rbp),%xmm7 + paddd 0+64(%rbp),%xmm11 + paddd 0+144(%rbp),%xmm15 + paddd .Lchacha20_consts(%rip),%xmm2 + paddd 0+48(%rbp),%xmm6 + paddd 0+64(%rbp),%xmm10 + paddd 0+128(%rbp),%xmm14 + paddd .Lchacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd .Lchacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + movdqa %xmm12,0+80(%rbp) + movdqu 0 + 0(%rsi),%xmm12 + pxor %xmm3,%xmm12 + movdqu %xmm12,0 + 0(%rdi) + movdqu 16 + 0(%rsi),%xmm12 + pxor %xmm7,%xmm12 + movdqu %xmm12,16 + 0(%rdi) + movdqu 32 + 0(%rsi),%xmm12 + pxor %xmm11,%xmm12 + movdqu %xmm12,32 + 0(%rdi) + movdqu 48 + 0(%rsi),%xmm12 + pxor %xmm15,%xmm12 + movdqu %xmm12,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 64(%rdi) + movdqu %xmm6,16 + 64(%rdi) + movdqu %xmm10,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + movdqu 0 + 128(%rsi),%xmm3 + movdqu 16 + 128(%rsi),%xmm7 + movdqu 32 + 128(%rsi),%xmm11 + movdqu 48 + 128(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + 
movdqu %xmm1,0 + 128(%rdi) + movdqu %xmm5,16 + 128(%rdi) + movdqu %xmm9,32 + 128(%rdi) + movdqu %xmm15,48 + 128(%rdi) + + movdqa 0+80(%rbp),%xmm12 + subq $192,%rbx + leaq 192(%rsi),%rsi + leaq 192(%rdi),%rdi + + +.Lopen_sse_tail_64_dec_loop: + cmpq $16,%rbx + jb .Lopen_sse_tail_16_init + subq $16,%rbx + movdqu (%rsi),%xmm3 + pxor %xmm3,%xmm0 + movdqu %xmm0,(%rdi) + leaq 16(%rsi),%rsi + leaq 16(%rdi),%rdi + movdqa %xmm4,%xmm0 + movdqa %xmm8,%xmm4 + movdqa %xmm12,%xmm8 + jmp .Lopen_sse_tail_64_dec_loop +.Lopen_sse_tail_16_init: + movdqa %xmm0,%xmm1 + + +.Lopen_sse_tail_16: + testq %rbx,%rbx + jz .Lopen_sse_finalize + + + + pxor %xmm3,%xmm3 + leaq -1(%rsi,%rbx,1),%rsi + movq %rbx,%r8 +.Lopen_sse_tail_16_compose: + pslldq $1,%xmm3 + pinsrb $0,(%rsi),%xmm3 + subq $1,%rsi + subq $1,%r8 + jnz .Lopen_sse_tail_16_compose + + movq %xmm3,%r13 + pextrq $1,%xmm3,%r14 + + pxor %xmm1,%xmm3 + + +.Lopen_sse_tail_16_extract: + pextrb $0,%xmm3,(%rdi) + psrldq $1,%xmm3 + addq $1,%rdi + subq $1,%rbx + jne .Lopen_sse_tail_16_extract + + addq %r13,%r10 + adcq %r14,%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + +.Lopen_sse_finalize: + addq 0+0+32(%rbp),%r10 + adcq 8+0+32(%rbp),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq 
%rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + movq %r10,%r13 + movq %r11,%r14 + movq %r12,%r15 + subq $-5,%r10 + sbbq $-1,%r11 + sbbq $3,%r12 + cmovcq %r13,%r10 + cmovcq %r14,%r11 + cmovcq %r15,%r12 + + addq 0+0+16(%rbp),%r10 + adcq 8+0+16(%rbp),%r11 + +.cfi_remember_state + addq $288 + 0 + 32,%rsp +.cfi_adjust_cfa_offset -(288 + 32) + + popq %r9 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r9 + movq %r10,(%r9) + movq %r11,8(%r9) + popq %r15 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r15 + popq %r14 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r14 + popq %r13 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r13 + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + popq %rbx +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbx + popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbp + ret + +.Lopen_sse_128: +.cfi_restore_state + movdqu .Lchacha20_consts(%rip),%xmm0 + movdqa %xmm0,%xmm1 + movdqa %xmm0,%xmm2 + movdqu 0(%r9),%xmm4 + movdqa %xmm4,%xmm5 + movdqa %xmm4,%xmm6 + movdqu 16(%r9),%xmm8 + movdqa %xmm8,%xmm9 + movdqa %xmm8,%xmm10 + movdqu 32(%r9),%xmm12 + movdqa %xmm12,%xmm13 + paddd .Lsse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm14 + paddd .Lsse_inc(%rip),%xmm14 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa %xmm13,%xmm15 + movq $10,%r10 + +.Lopen_sse_128_rounds: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld 
$25,%xmm4 + pxor %xmm3,%xmm4 + palignr $4,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $12,%xmm12,%xmm12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 + palignr $4,%xmm5,%xmm5 + palignr $8,%xmm9,%xmm9 + palignr $12,%xmm13,%xmm13 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 + palignr $4,%xmm6,%xmm6 + palignr $8,%xmm10,%xmm10 + palignr $12,%xmm14,%xmm14 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 + palignr $12,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $4,%xmm12,%xmm12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 + palignr $12,%xmm5,%xmm5 + palignr $8,%xmm9,%xmm9 + palignr $4,%xmm13,%xmm13 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + 
pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 + palignr $12,%xmm6,%xmm6 + palignr $8,%xmm10,%xmm10 + palignr $4,%xmm14,%xmm14 + + decq %r10 + jnz .Lopen_sse_128_rounds + paddd .Lchacha20_consts(%rip),%xmm0 + paddd .Lchacha20_consts(%rip),%xmm1 + paddd .Lchacha20_consts(%rip),%xmm2 + paddd %xmm7,%xmm4 + paddd %xmm7,%xmm5 + paddd %xmm7,%xmm6 + paddd %xmm11,%xmm9 + paddd %xmm11,%xmm10 + paddd %xmm15,%xmm13 + paddd .Lsse_inc(%rip),%xmm15 + paddd %xmm15,%xmm14 + + pand .Lclamp(%rip),%xmm0 + movdqa %xmm0,0+0(%rbp) + movdqa %xmm4,0+16(%rbp) + + movq %r8,%r8 + call poly_hash_ad_internal +.Lopen_sse_128_xor_hash: + cmpq $16,%rbx + jb .Lopen_sse_tail_16 + subq $16,%rbx + addq 0+0(%rsi),%r10 + adcq 8+0(%rsi),%r11 + adcq $1,%r12 + + + movdqu 0(%rsi),%xmm3 + pxor %xmm3,%xmm1 + movdqu %xmm1,0(%rdi) + leaq 16(%rsi),%rsi + leaq 16(%rdi),%rdi + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + movdqa %xmm5,%xmm1 + movdqa %xmm9,%xmm5 + movdqa %xmm13,%xmm9 + movdqa %xmm2,%xmm13 + movdqa %xmm6,%xmm2 + movdqa %xmm10,%xmm6 + movdqa %xmm14,%xmm10 + jmp .Lopen_sse_128_xor_hash +.size chacha20_poly1305_open_sse41, .-chacha20_poly1305_open_sse41 +.cfi_endproc + + + + + + + +.globl 
chacha20_poly1305_seal_sse41 +.hidden chacha20_poly1305_seal_sse41 +.type chacha20_poly1305_seal_sse41,@function +.align 64 +chacha20_poly1305_seal_sse41: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + + + pushq %r9 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r9,-64 + subq $288 + 0 + 32,%rsp +.cfi_adjust_cfa_offset 288 + 32 + leaq 32(%rsp),%rbp + andq $-32,%rbp + + movq 56(%r9),%rbx + addq %rdx,%rbx + movq %r8,0+0+32(%rbp) + movq %rbx,8+0+32(%rbp) + movq %rdx,%rbx + + cmpq $128,%rbx + jbe .Lseal_sse_128 + + movdqa .Lchacha20_consts(%rip),%xmm0 + movdqu 0(%r9),%xmm4 + movdqu 16(%r9),%xmm8 + movdqu 32(%r9),%xmm12 + + movdqa %xmm0,%xmm1 + movdqa %xmm0,%xmm2 + movdqa %xmm0,%xmm3 + movdqa %xmm4,%xmm5 + movdqa %xmm4,%xmm6 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm9 + movdqa %xmm8,%xmm10 + movdqa %xmm8,%xmm11 + movdqa %xmm12,%xmm15 + paddd .Lsse_inc(%rip),%xmm12 + movdqa %xmm12,%xmm14 + paddd .Lsse_inc(%rip),%xmm12 + movdqa %xmm12,%xmm13 + paddd .Lsse_inc(%rip),%xmm12 + + movdqa %xmm4,0+48(%rbp) + movdqa %xmm8,0+64(%rbp) + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + movdqa %xmm14,0+128(%rbp) + movdqa %xmm15,0+144(%rbp) + movq $10,%r10 +.Lseal_sse_init_rounds: + movdqa %xmm8,0+80(%rbp) + movdqa .Lrol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 + pshufb %xmm8,%xmm15 + pshufb %xmm8,%xmm14 + pshufb %xmm8,%xmm13 + pshufb %xmm8,%xmm12 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 
+ pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movdqa .Lrol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 + pshufb %xmm8,%xmm15 + pshufb %xmm8,%xmm14 + pshufb %xmm8,%xmm13 + pshufb %xmm8,%xmm12 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 0+80(%rbp),%xmm8 + palignr $4,%xmm7,%xmm7 + palignr $8,%xmm11,%xmm11 + palignr $12,%xmm15,%xmm15 + palignr $4,%xmm6,%xmm6 + palignr $8,%xmm10,%xmm10 + palignr $12,%xmm14,%xmm14 + palignr $4,%xmm5,%xmm5 + palignr $8,%xmm9,%xmm9 + palignr $12,%xmm13,%xmm13 + palignr $4,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $12,%xmm12,%xmm12 + movdqa %xmm8,0+80(%rbp) + movdqa .Lrol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 + pshufb %xmm8,%xmm15 + pshufb %xmm8,%xmm14 + pshufb %xmm8,%xmm13 + pshufb %xmm8,%xmm12 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor 
%xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movdqa .Lrol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 + pshufb %xmm8,%xmm15 + pshufb %xmm8,%xmm14 + pshufb %xmm8,%xmm13 + pshufb %xmm8,%xmm12 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 0+80(%rbp),%xmm8 + palignr $12,%xmm7,%xmm7 + palignr $8,%xmm11,%xmm11 + palignr $4,%xmm15,%xmm15 + palignr $12,%xmm6,%xmm6 + palignr $8,%xmm10,%xmm10 + palignr $4,%xmm14,%xmm14 + palignr $12,%xmm5,%xmm5 + palignr $8,%xmm9,%xmm9 + palignr $4,%xmm13,%xmm13 + palignr $12,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $4,%xmm12,%xmm12 + + decq %r10 + jnz .Lseal_sse_init_rounds + paddd .Lchacha20_consts(%rip),%xmm3 + paddd 0+48(%rbp),%xmm7 + paddd 0+64(%rbp),%xmm11 + paddd 0+144(%rbp),%xmm15 + paddd .Lchacha20_consts(%rip),%xmm2 + paddd 0+48(%rbp),%xmm6 + paddd 0+64(%rbp),%xmm10 + paddd 0+128(%rbp),%xmm14 + paddd .Lchacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd .Lchacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 
0+96(%rbp),%xmm12 + + + pand .Lclamp(%rip),%xmm3 + movdqa %xmm3,0+0(%rbp) + movdqa %xmm7,0+16(%rbp) + + movq %r8,%r8 + call poly_hash_ad_internal + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 0(%rdi) + movdqu %xmm6,16 + 0(%rdi) + movdqu %xmm10,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 64(%rdi) + movdqu %xmm5,16 + 64(%rdi) + movdqu %xmm9,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + + cmpq $192,%rbx + ja .Lseal_sse_main_init + movq $128,%rcx + subq $128,%rbx + leaq 128(%rsi),%rsi + jmp .Lseal_sse_128_tail_hash +.Lseal_sse_main_init: + movdqu 0 + 128(%rsi),%xmm3 + movdqu 16 + 128(%rsi),%xmm7 + movdqu 32 + 128(%rsi),%xmm11 + movdqu 48 + 128(%rsi),%xmm15 + pxor %xmm3,%xmm0 + pxor %xmm7,%xmm4 + pxor %xmm11,%xmm8 + pxor %xmm12,%xmm15 + movdqu %xmm0,0 + 128(%rdi) + movdqu %xmm4,16 + 128(%rdi) + movdqu %xmm8,32 + 128(%rdi) + movdqu %xmm15,48 + 128(%rdi) + + movq $192,%rcx + subq $192,%rbx + leaq 192(%rsi),%rsi + movq $2,%rcx + movq $8,%r8 + cmpq $64,%rbx + jbe .Lseal_sse_tail_64 + cmpq $128,%rbx + jbe .Lseal_sse_tail_128 + cmpq $192,%rbx + jbe .Lseal_sse_tail_192 + +.Lseal_sse_main_loop: + movdqa .Lchacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa %xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa %xmm0,%xmm3 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa 0+96(%rbp),%xmm15 + paddd .Lsse_inc(%rip),%xmm15 + movdqa %xmm15,%xmm14 + paddd .Lsse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd .Lsse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd .Lsse_inc(%rip),%xmm12 + movdqa 
%xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + movdqa %xmm14,0+128(%rbp) + movdqa %xmm15,0+144(%rbp) + +.align 32 +.Lseal_sse_main_rounds: + movdqa %xmm8,0+80(%rbp) + movdqa .Lrol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 + pshufb %xmm8,%xmm15 + pshufb %xmm8,%xmm14 + pshufb %xmm8,%xmm13 + pshufb %xmm8,%xmm12 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movdqa .Lrol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 + pshufb %xmm8,%xmm15 + pshufb %xmm8,%xmm14 + pshufb %xmm8,%xmm13 + pshufb %xmm8,%xmm12 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld 
$32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 0+80(%rbp),%xmm8 + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + palignr $4,%xmm7,%xmm7 + palignr $8,%xmm11,%xmm11 + palignr $12,%xmm15,%xmm15 + palignr $4,%xmm6,%xmm6 + palignr $8,%xmm10,%xmm10 + palignr $12,%xmm14,%xmm14 + palignr $4,%xmm5,%xmm5 + palignr $8,%xmm9,%xmm9 + palignr $12,%xmm13,%xmm13 + palignr $4,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $12,%xmm12,%xmm12 + movdqa %xmm8,0+80(%rbp) + movdqa .Lrol16(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 + pshufb %xmm8,%xmm15 + pshufb %xmm8,%xmm14 + pshufb %xmm8,%xmm13 + pshufb %xmm8,%xmm12 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $20,%xmm8 + pslld $32-20,%xmm4 + pxor %xmm8,%xmm4 + movdqa .Lrol8(%rip),%xmm8 + paddd %xmm7,%xmm3 + paddd %xmm6,%xmm2 + paddd %xmm5,%xmm1 + paddd %xmm4,%xmm0 + pxor %xmm3,%xmm15 + pxor %xmm2,%xmm14 + pxor %xmm1,%xmm13 + pxor %xmm0,%xmm12 + pshufb %xmm8,%xmm15 + pshufb %xmm8,%xmm14 + pshufb %xmm8,%xmm13 + pshufb %xmm8,%xmm12 + movdqa 0+80(%rbp),%xmm8 + paddd %xmm15,%xmm11 + paddd %xmm14,%xmm10 + 
paddd %xmm13,%xmm9 + paddd %xmm12,%xmm8 + pxor %xmm11,%xmm7 + pxor %xmm10,%xmm6 + pxor %xmm9,%xmm5 + pxor %xmm8,%xmm4 + movdqa %xmm8,0+80(%rbp) + movdqa %xmm7,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm7 + pxor %xmm8,%xmm7 + movdqa %xmm6,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm6 + pxor %xmm8,%xmm6 + movdqa %xmm5,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm5 + pxor %xmm8,%xmm5 + movdqa %xmm4,%xmm8 + psrld $25,%xmm8 + pslld $32-25,%xmm4 + pxor %xmm8,%xmm4 + movdqa 0+80(%rbp),%xmm8 + palignr $12,%xmm7,%xmm7 + palignr $8,%xmm11,%xmm11 + palignr $4,%xmm15,%xmm15 + palignr $12,%xmm6,%xmm6 + palignr $8,%xmm10,%xmm10 + palignr $4,%xmm14,%xmm14 + palignr $12,%xmm5,%xmm5 + palignr $8,%xmm9,%xmm9 + palignr $4,%xmm13,%xmm13 + palignr $12,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $4,%xmm12,%xmm12 + + leaq 16(%rdi),%rdi + decq %r8 + jge .Lseal_sse_main_rounds + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi + decq %rcx + jg .Lseal_sse_main_rounds + paddd .Lchacha20_consts(%rip),%xmm3 + paddd 0+48(%rbp),%xmm7 + paddd 0+64(%rbp),%xmm11 + paddd 0+144(%rbp),%xmm15 + paddd .Lchacha20_consts(%rip),%xmm2 + paddd 0+48(%rbp),%xmm6 + paddd 0+64(%rbp),%xmm10 + paddd 0+128(%rbp),%xmm14 + paddd .Lchacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd .Lchacha20_consts(%rip),%xmm0 + 
paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + + movdqa %xmm14,0+80(%rbp) + movdqa %xmm14,0+80(%rbp) + movdqu 0 + 0(%rsi),%xmm14 + pxor %xmm3,%xmm14 + movdqu %xmm14,0 + 0(%rdi) + movdqu 16 + 0(%rsi),%xmm14 + pxor %xmm7,%xmm14 + movdqu %xmm14,16 + 0(%rdi) + movdqu 32 + 0(%rsi),%xmm14 + pxor %xmm11,%xmm14 + movdqu %xmm14,32 + 0(%rdi) + movdqu 48 + 0(%rsi),%xmm14 + pxor %xmm15,%xmm14 + movdqu %xmm14,48 + 0(%rdi) + + movdqa 0+80(%rbp),%xmm14 + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 64(%rdi) + movdqu %xmm6,16 + 64(%rdi) + movdqu %xmm10,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + movdqu 0 + 128(%rsi),%xmm3 + movdqu 16 + 128(%rsi),%xmm7 + movdqu 32 + 128(%rsi),%xmm11 + movdqu 48 + 128(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 128(%rdi) + movdqu %xmm5,16 + 128(%rdi) + movdqu %xmm9,32 + 128(%rdi) + movdqu %xmm15,48 + 128(%rdi) + + cmpq $256,%rbx + ja .Lseal_sse_main_loop_xor + + movq $192,%rcx + subq $192,%rbx + leaq 192(%rsi),%rsi + jmp .Lseal_sse_128_tail_hash +.Lseal_sse_main_loop_xor: + movdqu 0 + 192(%rsi),%xmm3 + movdqu 16 + 192(%rsi),%xmm7 + movdqu 32 + 192(%rsi),%xmm11 + movdqu 48 + 192(%rsi),%xmm15 + pxor %xmm3,%xmm0 + pxor %xmm7,%xmm4 + pxor %xmm11,%xmm8 + pxor %xmm12,%xmm15 + movdqu %xmm0,0 + 192(%rdi) + movdqu %xmm4,16 + 192(%rdi) + movdqu %xmm8,32 + 192(%rdi) + movdqu %xmm15,48 + 192(%rdi) + + leaq 256(%rsi),%rsi + subq $256,%rbx + movq $6,%rcx + movq $4,%r8 + cmpq $192,%rbx + jg .Lseal_sse_main_loop + movq %rbx,%rcx + testq %rbx,%rbx + je .Lseal_sse_128_tail_hash + movq $6,%rcx + cmpq $128,%rbx + ja .Lseal_sse_tail_192 + cmpq $64,%rbx + ja .Lseal_sse_tail_128 + +.Lseal_sse_tail_64: + movdqa .Lchacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa 
0+96(%rbp),%xmm12 + paddd .Lsse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + +.Lseal_sse_tail_64_rounds_and_x2hash: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +.Lseal_sse_tail_64_rounds_and_x1hash: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 + palignr $4,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $12,%xmm12,%xmm12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 + palignr $12,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $4,%xmm12,%xmm12 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 
8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi + decq %rcx + jg .Lseal_sse_tail_64_rounds_and_x2hash + decq %r8 + jge .Lseal_sse_tail_64_rounds_and_x1hash + paddd .Lchacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + + jmp .Lseal_sse_128_tail_xor + +.Lseal_sse_tail_128: + movdqa .Lchacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa 0+96(%rbp),%xmm13 + paddd .Lsse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd .Lsse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + +.Lseal_sse_tail_128_rounds_and_x2hash: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +.Lseal_sse_tail_128_rounds_and_x1hash: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld 
$12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 + palignr $4,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $12,%xmm12,%xmm12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 + palignr $4,%xmm5,%xmm5 + palignr $8,%xmm9,%xmm9 + palignr $12,%xmm13,%xmm13 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 + palignr $12,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $4,%xmm12,%xmm12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + 
movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 + palignr $12,%xmm5,%xmm5 + palignr $8,%xmm9,%xmm9 + palignr $4,%xmm13,%xmm13 + + leaq 16(%rdi),%rdi + decq %rcx + jg .Lseal_sse_tail_128_rounds_and_x2hash + decq %r8 + jge .Lseal_sse_tail_128_rounds_and_x1hash + paddd .Lchacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd .Lchacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 0(%rdi) + movdqu %xmm5,16 + 0(%rdi) + movdqu %xmm9,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + + movq $64,%rcx + subq $64,%rbx + leaq 64(%rsi),%rsi + jmp .Lseal_sse_128_tail_hash + +.Lseal_sse_tail_192: + movdqa .Lchacha20_consts(%rip),%xmm0 + movdqa 0+48(%rbp),%xmm4 + movdqa 0+64(%rbp),%xmm8 + movdqa %xmm0,%xmm1 + movdqa %xmm4,%xmm5 + movdqa %xmm8,%xmm9 + movdqa %xmm0,%xmm2 + movdqa %xmm4,%xmm6 + movdqa %xmm8,%xmm10 + movdqa 0+96(%rbp),%xmm14 + paddd .Lsse_inc(%rip),%xmm14 + movdqa %xmm14,%xmm13 + paddd .Lsse_inc(%rip),%xmm13 + movdqa %xmm13,%xmm12 + paddd .Lsse_inc(%rip),%xmm12 + movdqa %xmm12,0+96(%rbp) + movdqa %xmm13,0+112(%rbp) + movdqa %xmm14,0+128(%rbp) + +.Lseal_sse_tail_192_rounds_and_x2hash: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq 
%rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +.Lseal_sse_tail_192_rounds_and_x1hash: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 + palignr $4,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $12,%xmm12,%xmm12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 + palignr $4,%xmm5,%xmm5 + palignr $8,%xmm9,%xmm9 + palignr $12,%xmm13,%xmm13 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 + palignr $4,%xmm6,%xmm6 + palignr $8,%xmm10,%xmm10 + palignr $12,%xmm14,%xmm14 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq 
%r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 + palignr $12,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $4,%xmm12,%xmm12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 + palignr $12,%xmm5,%xmm5 + palignr $8,%xmm9,%xmm9 + palignr $4,%xmm13,%xmm13 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 + palignr $12,%xmm6,%xmm6 + palignr $8,%xmm10,%xmm10 + palignr $4,%xmm14,%xmm14 + + leaq 16(%rdi),%rdi + decq %rcx + jg .Lseal_sse_tail_192_rounds_and_x2hash + decq %r8 + jge .Lseal_sse_tail_192_rounds_and_x1hash + paddd .Lchacha20_consts(%rip),%xmm2 + paddd 0+48(%rbp),%xmm6 + paddd 0+64(%rbp),%xmm10 + paddd 
0+128(%rbp),%xmm14 + paddd .Lchacha20_consts(%rip),%xmm1 + paddd 0+48(%rbp),%xmm5 + paddd 0+64(%rbp),%xmm9 + paddd 0+112(%rbp),%xmm13 + paddd .Lchacha20_consts(%rip),%xmm0 + paddd 0+48(%rbp),%xmm4 + paddd 0+64(%rbp),%xmm8 + paddd 0+96(%rbp),%xmm12 + movdqu 0 + 0(%rsi),%xmm3 + movdqu 16 + 0(%rsi),%xmm7 + movdqu 32 + 0(%rsi),%xmm11 + movdqu 48 + 0(%rsi),%xmm15 + pxor %xmm3,%xmm2 + pxor %xmm7,%xmm6 + pxor %xmm11,%xmm10 + pxor %xmm14,%xmm15 + movdqu %xmm2,0 + 0(%rdi) + movdqu %xmm6,16 + 0(%rdi) + movdqu %xmm10,32 + 0(%rdi) + movdqu %xmm15,48 + 0(%rdi) + movdqu 0 + 64(%rsi),%xmm3 + movdqu 16 + 64(%rsi),%xmm7 + movdqu 32 + 64(%rsi),%xmm11 + movdqu 48 + 64(%rsi),%xmm15 + pxor %xmm3,%xmm1 + pxor %xmm7,%xmm5 + pxor %xmm11,%xmm9 + pxor %xmm13,%xmm15 + movdqu %xmm1,0 + 64(%rdi) + movdqu %xmm5,16 + 64(%rdi) + movdqu %xmm9,32 + 64(%rdi) + movdqu %xmm15,48 + 64(%rdi) + + movq $128,%rcx + subq $128,%rbx + leaq 128(%rsi),%rsi + +.Lseal_sse_128_tail_hash: + cmpq $16,%rcx + jb .Lseal_sse_128_tail_xor + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + subq $16,%rcx + leaq 16(%rdi),%rdi + jmp .Lseal_sse_128_tail_hash + +.Lseal_sse_128_tail_xor: + cmpq $16,%rbx + jb .Lseal_sse_tail_16 + subq $16,%rbx + + movdqu 0(%rsi),%xmm3 + pxor %xmm3,%xmm0 + movdqu %xmm0,0(%rdi) + + addq 0(%rdi),%r10 + adcq 8(%rdi),%r11 + adcq $1,%r12 + leaq 16(%rsi),%rsi + leaq 
16(%rdi),%rdi + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + movdqa %xmm4,%xmm0 + movdqa %xmm8,%xmm4 + movdqa %xmm12,%xmm8 + movdqa %xmm1,%xmm12 + movdqa %xmm5,%xmm1 + movdqa %xmm9,%xmm5 + movdqa %xmm13,%xmm9 + jmp .Lseal_sse_128_tail_xor + +.Lseal_sse_tail_16: + testq %rbx,%rbx + jz .Lprocess_blocks_of_extra_in + + movq %rbx,%r8 + movq %rbx,%rcx + leaq -1(%rsi,%rbx,1),%rsi + pxor %xmm15,%xmm15 +.Lseal_sse_tail_16_compose: + pslldq $1,%xmm15 + pinsrb $0,(%rsi),%xmm15 + leaq -1(%rsi),%rsi + decq %rcx + jne .Lseal_sse_tail_16_compose + + + pxor %xmm0,%xmm15 + + + movq %rbx,%rcx + movdqu %xmm15,%xmm0 +.Lseal_sse_tail_16_extract: + pextrb $0,%xmm0,(%rdi) + psrldq $1,%xmm0 + addq $1,%rdi + subq $1,%rcx + jnz .Lseal_sse_tail_16_extract + + + + + + + + + movq 288 + 0 + 32(%rsp),%r9 + movq 56(%r9),%r14 + movq 48(%r9),%r13 + testq %r14,%r14 + jz .Lprocess_partial_block + + movq $16,%r15 + subq %rbx,%r15 + cmpq %r15,%r14 + + jge .Lload_extra_in + movq %r14,%r15 + +.Lload_extra_in: + + + leaq -1(%r13,%r15,1),%rsi + + + addq %r15,%r13 + subq %r15,%r14 + movq %r13,48(%r9) + movq %r14,56(%r9) + + + + addq %r15,%r8 + + + pxor %xmm11,%xmm11 +.Lload_extra_load_loop: + pslldq $1,%xmm11 + pinsrb $0,(%rsi),%xmm11 + leaq -1(%rsi),%rsi + subq $1,%r15 + jnz .Lload_extra_load_loop + + + + + movq %rbx,%r15 + +.Lload_extra_shift_loop: + pslldq $1,%xmm11 + subq $1,%r15 + jnz .Lload_extra_shift_loop + + + + + leaq 
.Land_masks(%rip),%r15 + shlq $4,%rbx + pand -16(%r15,%rbx,1),%xmm15 + + + por %xmm11,%xmm15 + + + + movq %xmm15,%r13 + pextrq $1,%xmm15,%r14 + addq %r13,%r10 + adcq %r14,%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + +.Lprocess_blocks_of_extra_in: + + movq 288+32+0 (%rsp),%r9 + movq 48(%r9),%rsi + movq 56(%r9),%r8 + movq %r8,%rcx + shrq $4,%r8 + +.Lprocess_extra_hash_loop: + jz process_extra_in_trailer + addq 0+0(%rsi),%r10 + adcq 8+0(%rsi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rsi),%rsi + subq $1,%r8 + jmp .Lprocess_extra_hash_loop +process_extra_in_trailer: + andq $15,%rcx + movq %rcx,%rbx + jz .Ldo_length_block + leaq -1(%rsi,%rcx,1),%rsi + +.Lprocess_extra_in_trailer_load: + pslldq $1,%xmm15 + pinsrb $0,(%rsi),%xmm15 + leaq -1(%rsi),%rsi + subq $1,%rcx + jnz 
.Lprocess_extra_in_trailer_load + +.Lprocess_partial_block: + + leaq .Land_masks(%rip),%r15 + shlq $4,%rbx + pand -16(%r15,%rbx,1),%xmm15 + movq %xmm15,%r13 + pextrq $1,%xmm15,%r14 + addq %r13,%r10 + adcq %r14,%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + +.Ldo_length_block: + addq 0+0+32(%rbp),%r10 + adcq 8+0+32(%rbp),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + movq %r10,%r13 + movq %r11,%r14 + movq %r12,%r15 + subq $-5,%r10 + sbbq $-1,%r11 + sbbq $3,%r12 + cmovcq %r13,%r10 + cmovcq %r14,%r11 + cmovcq %r15,%r12 + + addq 0+0+16(%rbp),%r10 + adcq 8+0+16(%rbp),%r11 + +.cfi_remember_state + addq $288 + 0 + 32,%rsp +.cfi_adjust_cfa_offset -(288 + 32) + + popq %r9 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r9 + movq %r10,(%r9) + movq %r11,8(%r9) + popq %r15 +.cfi_adjust_cfa_offset -8 
+.cfi_restore %r15 + popq %r14 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r14 + popq %r13 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r13 + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + popq %rbx +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbx + popq %rbp +.cfi_adjust_cfa_offset -8 +.cfi_restore %rbp + ret + +.Lseal_sse_128: +.cfi_restore_state + movdqu .Lchacha20_consts(%rip),%xmm0 + movdqa %xmm0,%xmm1 + movdqa %xmm0,%xmm2 + movdqu 0(%r9),%xmm4 + movdqa %xmm4,%xmm5 + movdqa %xmm4,%xmm6 + movdqu 16(%r9),%xmm8 + movdqa %xmm8,%xmm9 + movdqa %xmm8,%xmm10 + movdqu 32(%r9),%xmm14 + movdqa %xmm14,%xmm12 + paddd .Lsse_inc(%rip),%xmm12 + movdqa %xmm12,%xmm13 + paddd .Lsse_inc(%rip),%xmm13 + movdqa %xmm4,%xmm7 + movdqa %xmm8,%xmm11 + movdqa %xmm12,%xmm15 + movq $10,%r10 + +.Lseal_sse_128_rounds: + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 + palignr $4,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $12,%xmm12,%xmm12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 + palignr $4,%xmm5,%xmm5 + palignr $8,%xmm9,%xmm9 + palignr $12,%xmm13,%xmm13 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 
+ movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 + palignr $4,%xmm6,%xmm6 + palignr $8,%xmm10,%xmm10 + palignr $12,%xmm14,%xmm14 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol16(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm4 + pxor %xmm3,%xmm4 + paddd %xmm4,%xmm0 + pxor %xmm0,%xmm12 + pshufb .Lrol8(%rip),%xmm12 + paddd %xmm12,%xmm8 + pxor %xmm8,%xmm4 + movdqa %xmm4,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm4 + pxor %xmm3,%xmm4 + palignr $12,%xmm4,%xmm4 + palignr $8,%xmm8,%xmm8 + palignr $4,%xmm12,%xmm12 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol16(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm5 + pxor %xmm3,%xmm5 + paddd %xmm5,%xmm1 + pxor %xmm1,%xmm13 + pshufb .Lrol8(%rip),%xmm13 + paddd %xmm13,%xmm9 + pxor %xmm9,%xmm5 + movdqa %xmm5,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm5 + pxor %xmm3,%xmm5 + palignr $12,%xmm5,%xmm5 + palignr $8,%xmm9,%xmm9 + palignr $4,%xmm13,%xmm13 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol16(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $12,%xmm3 + psrld $20,%xmm6 + pxor %xmm3,%xmm6 + paddd %xmm6,%xmm2 + pxor %xmm2,%xmm14 + pshufb .Lrol8(%rip),%xmm14 + paddd %xmm14,%xmm10 + pxor %xmm10,%xmm6 + movdqa %xmm6,%xmm3 + pslld $7,%xmm3 + psrld $25,%xmm6 + pxor %xmm3,%xmm6 + palignr $12,%xmm6,%xmm6 + palignr $8,%xmm10,%xmm10 + palignr $4,%xmm14,%xmm14 + + decq %r10 + jnz .Lseal_sse_128_rounds + paddd .Lchacha20_consts(%rip),%xmm0 + paddd .Lchacha20_consts(%rip),%xmm1 + paddd .Lchacha20_consts(%rip),%xmm2 + paddd %xmm7,%xmm4 + paddd %xmm7,%xmm5 + paddd %xmm7,%xmm6 + paddd %xmm11,%xmm8 + paddd %xmm11,%xmm9 + paddd %xmm15,%xmm12 + paddd .Lsse_inc(%rip),%xmm15 + paddd %xmm15,%xmm13 + + pand .Lclamp(%rip),%xmm2 + movdqa %xmm2,0+0(%rbp) + movdqa %xmm6,0+16(%rbp) + + movq %r8,%r8 + call poly_hash_ad_internal + jmp .Lseal_sse_128_tail_xor 
+.size chacha20_poly1305_seal_sse41, .-chacha20_poly1305_seal_sse41 +.cfi_endproc + + +.globl chacha20_poly1305_open_avx2 +.hidden chacha20_poly1305_open_avx2 +.type chacha20_poly1305_open_avx2,@function +.align 64 +chacha20_poly1305_open_avx2: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + + + pushq %r9 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r9,-64 + subq $288 + 0 + 32,%rsp +.cfi_adjust_cfa_offset 288 + 32 + + leaq 32(%rsp),%rbp + andq $-32,%rbp + + movq %rdx,%rbx + movq %r8,0+0+32(%rbp) + movq %rbx,8+0+32(%rbp) + + vzeroupper + vmovdqa .Lchacha20_consts(%rip),%ymm0 + vbroadcasti128 0(%r9),%ymm4 + vbroadcasti128 16(%r9),%ymm8 + vbroadcasti128 32(%r9),%ymm12 + vpaddd .Lavx2_init(%rip),%ymm12,%ymm12 + cmpq $192,%rbx + jbe .Lopen_avx2_192 + cmpq $320,%rbx + jbe .Lopen_avx2_320 + + vmovdqa %ymm4,0+64(%rbp) + vmovdqa %ymm8,0+96(%rbp) + vmovdqa %ymm12,0+160(%rbp) + movq $10,%r10 +.Lopen_avx2_init_rounds: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld 
$12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + + decq %r10 + jne .Lopen_avx2_init_rounds + vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + + vpand .Lclamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0+0(%rbp) + + vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 + + movq %r8,%r8 + call poly_hash_ad_internal + + xorq %rcx,%rcx +.Lopen_avx2_init_hash: + addq 0+0(%rsi,%rcx,1),%r10 + adcq 8+0(%rsi,%rcx,1),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + addq $16,%rcx + cmpq $64,%rcx + jne .Lopen_avx2_init_hash + + vpxor 0(%rsi),%ymm0,%ymm0 + vpxor 32(%rsi),%ymm4,%ymm4 + + vmovdqu %ymm0,0(%rdi) + vmovdqu %ymm4,32(%rdi) + leaq 64(%rsi),%rsi + leaq 64(%rdi),%rdi + subq $64,%rbx +.Lopen_avx2_main_loop: + + cmpq $512,%rbx + jb .Lopen_avx2_main_loop_done + vmovdqa .Lchacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa 
%ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa .Lavx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,0+256(%rbp) + vmovdqa %ymm14,0+224(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm12,0+160(%rbp) + + xorq %rcx,%rcx +.Lopen_avx2_main_loop_rounds: + addq 0+0(%rsi,%rcx,1),%r10 + adcq 8+0(%rsi,%rcx,1),%r11 + adcq $1,%r12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa .Lrol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + addq %rax,%r15 + adcq %rdx,%r9 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .Lrol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + movq %r13,%r10 + movq %r14,%r11 + 
movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + addq 0+16(%rsi,%rcx,1),%r10 + adcq 8+16(%rsi,%rcx,1),%r11 + adcq $1,%r12 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa .Lrol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor 
%ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + addq %rax,%r15 + adcq %rdx,%r9 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + addq 0+32(%rsi,%rcx,1),%r10 + adcq 8+32(%rsi,%rcx,1),%r11 + adcq $1,%r12 + + leaq 48(%rcx),%rcx + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .Lrol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) 
+ vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + addq %rax,%r15 + adcq %rdx,%r9 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + cmpq $60*8,%rcx + jne .Lopen_avx2_main_loop_rounds + vpaddd .Lchacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 0+64(%rbp),%ymm7,%ymm7 + vpaddd 0+96(%rbp),%ymm11,%ymm11 + vpaddd 0+256(%rbp),%ymm15,%ymm15 + vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + + vmovdqa %ymm0,0+128(%rbp) + addq 0+60*8(%rsi),%r10 + adcq 8+60*8(%rsi),%r11 + adcq $1,%r12 + vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 + vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 + vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vpxor 0+0(%rsi),%ymm0,%ymm0 + vpxor 32+0(%rsi),%ymm3,%ymm3 + vpxor 64+0(%rsi),%ymm7,%ymm7 + vpxor 
96+0(%rsi),%ymm11,%ymm11 + vmovdqu %ymm0,0+0(%rdi) + vmovdqu %ymm3,32+0(%rdi) + vmovdqu %ymm7,64+0(%rdi) + vmovdqu %ymm11,96+0(%rdi) + + vmovdqa 0+128(%rbp),%ymm0 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm2,%ymm2 + vpxor 64+128(%rsi),%ymm6,%ymm6 + vpxor 96+128(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm2,32+128(%rdi) + vmovdqu %ymm6,64+128(%rdi) + vmovdqu %ymm10,96+128(%rdi) + addq 0+60*8+16(%rsi),%r10 + adcq 8+60*8+16(%rsi),%r11 + adcq $1,%r12 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+256(%rsi),%ymm3,%ymm3 + vpxor 32+256(%rsi),%ymm1,%ymm1 + vpxor 64+256(%rsi),%ymm5,%ymm5 + vpxor 96+256(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+256(%rdi) + vmovdqu %ymm1,32+256(%rdi) + vmovdqu %ymm5,64+256(%rdi) + vmovdqu %ymm9,96+256(%rdi) + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx 
+ imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x13,%ymm0,%ymm4,%ymm4 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm8 + vpxor 0+384(%rsi),%ymm3,%ymm3 + vpxor 32+384(%rsi),%ymm0,%ymm0 + vpxor 64+384(%rsi),%ymm4,%ymm4 + vpxor 96+384(%rsi),%ymm8,%ymm8 + vmovdqu %ymm3,0+384(%rdi) + vmovdqu %ymm0,32+384(%rdi) + vmovdqu %ymm4,64+384(%rdi) + vmovdqu %ymm8,96+384(%rdi) + + leaq 512(%rsi),%rsi + leaq 512(%rdi),%rdi + subq $512,%rbx + jmp .Lopen_avx2_main_loop +.Lopen_avx2_main_loop_done: + testq %rbx,%rbx + vzeroupper + je .Lopen_sse_finalize + + cmpq $384,%rbx + ja .Lopen_avx2_tail_512 + cmpq $256,%rbx + ja .Lopen_avx2_tail_384 + cmpq $128,%rbx + ja .Lopen_avx2_tail_256 + vmovdqa .Lchacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa .Lavx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + + xorq %r8,%r8 + movq %rbx,%rcx + andq $-16,%rcx + testq %rcx,%rcx + je .Lopen_avx2_tail_128_rounds +.Lopen_avx2_tail_128_rounds_and_x1hash: + addq 0+0(%rsi,%r8,1),%r10 + adcq 8+0(%rsi,%r8,1),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + 
adcq $0,%r12 + +.Lopen_avx2_tail_128_rounds: + addq $16,%r8 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + + cmpq %rcx,%r8 + jb .Lopen_avx2_tail_128_rounds_and_x1hash + cmpq $160,%r8 + jne .Lopen_avx2_tail_128_rounds + vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + jmp .Lopen_avx2_tail_128_xor + +.Lopen_avx2_tail_256: + vmovdqa .Lchacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa .Lavx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + + movq %rbx,0+128(%rbp) + movq %rbx,%rcx + 
subq $128,%rcx + shrq $4,%rcx + movq $10,%r8 + cmpq $10,%rcx + cmovgq %r8,%rcx + movq %rsi,%rbx + xorq %r8,%r8 +.Lopen_avx2_tail_256_rounds_and_x1hash: + addq 0+0(%rbx),%r10 + adcq 8+0(%rbx),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rbx),%rbx +.Lopen_avx2_tail_256_rounds: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + + incq %r8 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + 
vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + + cmpq %rcx,%r8 + jb .Lopen_avx2_tail_256_rounds_and_x1hash + cmpq $10,%r8 + jne .Lopen_avx2_tail_256_rounds + movq %rbx,%r8 + subq %rsi,%rbx + movq %rbx,%rcx + movq 0+128(%rbp),%rbx +.Lopen_avx2_tail_256_hash: + addq $16,%rcx + cmpq %rbx,%rcx + jg .Lopen_avx2_tail_256_done + addq 0+0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq 
%rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 + jmp .Lopen_avx2_tail_256_hash +.Lopen_avx2_tail_256_done: + vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+0(%rsi),%ymm3,%ymm3 + vpxor 32+0(%rsi),%ymm1,%ymm1 + vpxor 64+0(%rsi),%ymm5,%ymm5 + vpxor 96+0(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+0(%rdi) + vmovdqu %ymm1,32+0(%rdi) + vmovdqu %ymm5,64+0(%rdi) + vmovdqu %ymm9,96+0(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + leaq 128(%rsi),%rsi + leaq 128(%rdi),%rdi + subq $128,%rbx + jmp .Lopen_avx2_tail_128_xor + +.Lopen_avx2_tail_384: + vmovdqa .Lchacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa .Lavx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm14,0+224(%rbp) + + movq %rbx,0+128(%rbp) + movq %rbx,%rcx + subq $256,%rcx + shrq $4,%rcx + addq $6,%rcx 
+ movq $10,%r8 + cmpq $10,%rcx + cmovgq %r8,%rcx + movq %rsi,%rbx + xorq %r8,%r8 +.Lopen_avx2_tail_384_rounds_and_x2hash: + addq 0+0(%rbx),%r10 + adcq 8+0(%rbx),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rbx),%rbx +.Lopen_avx2_tail_384_rounds_and_x1hash: + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 
+ vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + addq 0+0(%rbx),%r10 + adcq 8+0(%rbx),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rbx),%rbx + incq %r8 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb 
.Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + + cmpq %rcx,%r8 + jb .Lopen_avx2_tail_384_rounds_and_x2hash + cmpq $10,%r8 + jne .Lopen_avx2_tail_384_rounds_and_x1hash + movq %rbx,%r8 + subq %rsi,%rbx + movq %rbx,%rcx + movq 0+128(%rbp),%rbx +.Lopen_avx2_384_tail_hash: + addq $16,%rcx + cmpq %rbx,%rcx + jg .Lopen_avx2_384_tail_done + addq 0+0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 + jmp .Lopen_avx2_384_tail_hash +.Lopen_avx2_384_tail_done: + vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 
0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+0(%rsi),%ymm3,%ymm3 + vpxor 32+0(%rsi),%ymm2,%ymm2 + vpxor 64+0(%rsi),%ymm6,%ymm6 + vpxor 96+0(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+0(%rdi) + vmovdqu %ymm2,32+0(%rdi) + vmovdqu %ymm6,64+0(%rdi) + vmovdqu %ymm10,96+0(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm1,%ymm1 + vpxor 64+128(%rsi),%ymm5,%ymm5 + vpxor 96+128(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm1,32+128(%rdi) + vmovdqu %ymm5,64+128(%rdi) + vmovdqu %ymm9,96+128(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + leaq 256(%rsi),%rsi + leaq 256(%rdi),%rdi + subq $256,%rbx + jmp .Lopen_avx2_tail_128_xor + +.Lopen_avx2_tail_512: + vmovdqa .Lchacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa .Lavx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,0+256(%rbp) + vmovdqa %ymm14,0+224(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm12,0+160(%rbp) + + xorq %rcx,%rcx + movq %rsi,%r8 +.Lopen_avx2_tail_512_rounds_and_x2hash: + addq 0+0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 
0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 +.Lopen_avx2_tail_512_rounds_and_x1hash: + vmovdqa %ymm8,0+128(%rbp) + vmovdqa .Lrol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .Lrol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + addq 0+0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq 
%r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa .Lrol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + addq 0+16(%r8),%r10 + adcq 8+16(%r8),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq 
%r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%r8),%r8 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .Lrol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + 
vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + incq %rcx + cmpq $4,%rcx + jl .Lopen_avx2_tail_512_rounds_and_x2hash + cmpq $10,%rcx + jne .Lopen_avx2_tail_512_rounds_and_x1hash + movq %rbx,%rcx + subq $384,%rcx + andq $-16,%rcx +.Lopen_avx2_tail_512_hash: + testq %rcx,%rcx + je .Lopen_avx2_tail_512_done + addq 0+0(%r8),%r10 + adcq 8+0(%r8),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%r8),%r8 + subq $16,%rcx + jmp .Lopen_avx2_tail_512_hash +.Lopen_avx2_tail_512_done: + vpaddd .Lchacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 0+64(%rbp),%ymm7,%ymm7 + vpaddd 0+96(%rbp),%ymm11,%ymm11 + vpaddd 0+256(%rbp),%ymm15,%ymm15 + vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd 
.Lchacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + + vmovdqa %ymm0,0+128(%rbp) + vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 + vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 + vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vpxor 0+0(%rsi),%ymm0,%ymm0 + vpxor 32+0(%rsi),%ymm3,%ymm3 + vpxor 64+0(%rsi),%ymm7,%ymm7 + vpxor 96+0(%rsi),%ymm11,%ymm11 + vmovdqu %ymm0,0+0(%rdi) + vmovdqu %ymm3,32+0(%rdi) + vmovdqu %ymm7,64+0(%rdi) + vmovdqu %ymm11,96+0(%rdi) + + vmovdqa 0+128(%rbp),%ymm0 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm2,%ymm2 + vpxor 64+128(%rsi),%ymm6,%ymm6 + vpxor 96+128(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm2,32+128(%rdi) + vmovdqu %ymm6,64+128(%rdi) + vmovdqu %ymm10,96+128(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+256(%rsi),%ymm3,%ymm3 + vpxor 32+256(%rsi),%ymm1,%ymm1 + vpxor 64+256(%rsi),%ymm5,%ymm5 + vpxor 96+256(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+256(%rdi) + vmovdqu %ymm1,32+256(%rdi) + vmovdqu %ymm5,64+256(%rdi) + vmovdqu %ymm9,96+256(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + leaq 384(%rsi),%rsi + leaq 384(%rdi),%rdi + subq $384,%rbx +.Lopen_avx2_tail_128_xor: + cmpq $32,%rbx + jb .Lopen_avx2_tail_32_xor + subq $32,%rbx + vpxor (%rsi),%ymm0,%ymm0 + vmovdqu %ymm0,(%rdi) + leaq 32(%rsi),%rsi + leaq 32(%rdi),%rdi + vmovdqa %ymm4,%ymm0 + vmovdqa %ymm8,%ymm4 + vmovdqa %ymm12,%ymm8 + jmp .Lopen_avx2_tail_128_xor +.Lopen_avx2_tail_32_xor: + cmpq $16,%rbx + vmovdqa %xmm0,%xmm1 + jb .Lopen_avx2_exit + subq $16,%rbx + + 
vpxor (%rsi),%xmm0,%xmm1 + vmovdqu %xmm1,(%rdi) + leaq 16(%rsi),%rsi + leaq 16(%rdi),%rdi + vperm2i128 $0x11,%ymm0,%ymm0,%ymm0 + vmovdqa %xmm0,%xmm1 +.Lopen_avx2_exit: + vzeroupper + jmp .Lopen_sse_tail_16 + +.Lopen_avx2_192: + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm8,%ymm10 + vpaddd .Lavx2_inc(%rip),%ymm12,%ymm13 + vmovdqa %ymm12,%ymm11 + vmovdqa %ymm13,%ymm15 + movq $10,%r10 +.Lopen_avx2_192_rounds: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + 
vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + + decq %r10 + jne .Lopen_avx2_192_rounds + vpaddd %ymm2,%ymm0,%ymm0 + vpaddd %ymm2,%ymm1,%ymm1 + vpaddd %ymm6,%ymm4,%ymm4 + vpaddd %ymm6,%ymm5,%ymm5 + vpaddd %ymm10,%ymm8,%ymm8 + vpaddd %ymm10,%ymm9,%ymm9 + vpaddd %ymm11,%ymm12,%ymm12 + vpaddd %ymm15,%ymm13,%ymm13 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + + vpand .Lclamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0+0(%rbp) + + vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 +.Lopen_avx2_short: + movq %r8,%r8 + call poly_hash_ad_internal +.Lopen_avx2_short_hash_and_xor_loop: + cmpq $32,%rbx + jb .Lopen_avx2_short_tail_32 + subq $32,%rbx + addq 0+0(%rsi),%r10 + adcq 8+0(%rsi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + 
shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + addq 0+16(%rsi),%r10 + adcq 8+16(%rsi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + + vpxor (%rsi),%ymm0,%ymm0 + vmovdqu %ymm0,(%rdi) + leaq 32(%rsi),%rsi + leaq 32(%rdi),%rdi + + vmovdqa %ymm4,%ymm0 + vmovdqa %ymm8,%ymm4 + vmovdqa %ymm12,%ymm8 + vmovdqa %ymm1,%ymm12 + vmovdqa %ymm5,%ymm1 + vmovdqa %ymm9,%ymm5 + vmovdqa %ymm13,%ymm9 + vmovdqa %ymm2,%ymm13 + vmovdqa %ymm6,%ymm2 + jmp .Lopen_avx2_short_hash_and_xor_loop +.Lopen_avx2_short_tail_32: + cmpq $16,%rbx + vmovdqa %xmm0,%xmm1 + jb .Lopen_avx2_short_tail_32_exit + subq $16,%rbx + addq 0+0(%rsi),%r10 + adcq 8+0(%rsi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + vpxor (%rsi),%xmm0,%xmm3 + vmovdqu %xmm3,(%rdi) + leaq 
16(%rsi),%rsi + leaq 16(%rdi),%rdi + vextracti128 $1,%ymm0,%xmm1 +.Lopen_avx2_short_tail_32_exit: + vzeroupper + jmp .Lopen_sse_tail_16 + +.Lopen_avx2_320: + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm8,%ymm10 + vpaddd .Lavx2_inc(%rip),%ymm12,%ymm13 + vpaddd .Lavx2_inc(%rip),%ymm13,%ymm14 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm14,0+224(%rbp) + movq $10,%r10 +.Lopen_avx2_320_rounds: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor 
%ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + + decq %r10 + jne .Lopen_avx2_320_rounds + vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 + vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 
.Lchacha20_consts(%rip),%ymm2,%ymm2 + vpaddd %ymm7,%ymm4,%ymm4 + vpaddd %ymm7,%ymm5,%ymm5 + vpaddd %ymm7,%ymm6,%ymm6 + vpaddd %ymm11,%ymm8,%ymm8 + vpaddd %ymm11,%ymm9,%ymm9 + vpaddd %ymm11,%ymm10,%ymm10 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + + vpand .Lclamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0+0(%rbp) + + vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm9 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm13 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm6 + jmp .Lopen_avx2_short +.size chacha20_poly1305_open_avx2, .-chacha20_poly1305_open_avx2 +.cfi_endproc + + +.globl chacha20_poly1305_seal_avx2 +.hidden chacha20_poly1305_seal_avx2 +.type chacha20_poly1305_seal_avx2,@function +.align 64 +chacha20_poly1305_seal_avx2: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset %rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-32 + pushq %r13 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-40 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r14,-48 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r15,-56 + + + pushq %r9 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r9,-64 + subq $288 + 0 + 32,%rsp +.cfi_adjust_cfa_offset 288 + 32 + leaq 32(%rsp),%rbp + andq $-32,%rbp + + movq 56(%r9),%rbx + addq %rdx,%rbx + movq %r8,0+0+32(%rbp) + movq %rbx,8+0+32(%rbp) + movq %rdx,%rbx + + vzeroupper + vmovdqa .Lchacha20_consts(%rip),%ymm0 + vbroadcasti128 0(%r9),%ymm4 + vbroadcasti128 16(%r9),%ymm8 + vbroadcasti128 32(%r9),%ymm12 + vpaddd .Lavx2_init(%rip),%ymm12,%ymm12 + cmpq $192,%rbx + jbe .Lseal_avx2_192 + cmpq $320,%rbx + jbe .Lseal_avx2_320 + vmovdqa 
%ymm0,%ymm1 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm4,0+64(%rbp) + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm8,%ymm11 + vmovdqa %ymm8,0+96(%rbp) + vmovdqa %ymm12,%ymm15 + vpaddd .Lavx2_inc(%rip),%ymm15,%ymm14 + vpaddd .Lavx2_inc(%rip),%ymm14,%ymm13 + vpaddd .Lavx2_inc(%rip),%ymm13,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm14,0+224(%rbp) + vmovdqa %ymm15,0+256(%rbp) + movq $10,%r10 +.Lseal_avx2_init_rounds: + vmovdqa %ymm8,0+128(%rbp) + vmovdqa .Lrol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .Lrol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 
0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa .Lrol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .Lrol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd 
%ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + decq %r10 + jnz .Lseal_avx2_init_rounds + vpaddd .Lchacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 0+64(%rbp),%ymm7,%ymm7 + vpaddd 0+96(%rbp),%ymm11,%ymm11 + vpaddd 0+256(%rbp),%ymm15,%ymm15 + vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vperm2i128 $0x02,%ymm3,%ymm7,%ymm15 + vperm2i128 $0x13,%ymm3,%ymm7,%ymm3 + 
vpand .Lclamp(%rip),%ymm15,%ymm15 + vmovdqa %ymm15,0+0(%rbp) + movq %r8,%r8 + call poly_hash_ad_internal + + vpxor 0(%rsi),%ymm3,%ymm3 + vpxor 32(%rsi),%ymm11,%ymm11 + vmovdqu %ymm3,0(%rdi) + vmovdqu %ymm11,32(%rdi) + vperm2i128 $0x02,%ymm2,%ymm6,%ymm15 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+64(%rsi),%ymm15,%ymm15 + vpxor 32+64(%rsi),%ymm2,%ymm2 + vpxor 64+64(%rsi),%ymm6,%ymm6 + vpxor 96+64(%rsi),%ymm10,%ymm10 + vmovdqu %ymm15,0+64(%rdi) + vmovdqu %ymm2,32+64(%rdi) + vmovdqu %ymm6,64+64(%rdi) + vmovdqu %ymm10,96+64(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm15 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+192(%rsi),%ymm15,%ymm15 + vpxor 32+192(%rsi),%ymm1,%ymm1 + vpxor 64+192(%rsi),%ymm5,%ymm5 + vpxor 96+192(%rsi),%ymm9,%ymm9 + vmovdqu %ymm15,0+192(%rdi) + vmovdqu %ymm1,32+192(%rdi) + vmovdqu %ymm5,64+192(%rdi) + vmovdqu %ymm9,96+192(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm15 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm15,%ymm8 + + leaq 320(%rsi),%rsi + subq $320,%rbx + movq $320,%rcx + cmpq $128,%rbx + jbe .Lseal_avx2_short_hash_remainder + vpxor 0(%rsi),%ymm0,%ymm0 + vpxor 32(%rsi),%ymm4,%ymm4 + vpxor 64(%rsi),%ymm8,%ymm8 + vpxor 96(%rsi),%ymm12,%ymm12 + vmovdqu %ymm0,320(%rdi) + vmovdqu %ymm4,352(%rdi) + vmovdqu %ymm8,384(%rdi) + vmovdqu %ymm12,416(%rdi) + leaq 128(%rsi),%rsi + subq $128,%rbx + movq $8,%rcx + movq $2,%r8 + cmpq $128,%rbx + jbe .Lseal_avx2_tail_128 + cmpq $256,%rbx + jbe .Lseal_avx2_tail_256 + cmpq $384,%rbx + jbe .Lseal_avx2_tail_384 + cmpq $512,%rbx + jbe .Lseal_avx2_tail_512 + vmovdqa .Lchacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + 
vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa .Lavx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,0+256(%rbp) + vmovdqa %ymm14,0+224(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm8,0+128(%rbp) + vmovdqa .Lrol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .Lrol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor 
%ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa .Lrol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .Lrol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + 
vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa .Lrol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor 
%ymm8,%ymm4,%ymm4 + vmovdqa .Lrol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + + subq $16,%rdi + movq $9,%rcx + jmp .Lseal_avx2_main_loop_rounds_entry +.align 32 +.Lseal_avx2_main_loop: + vmovdqa .Lchacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa .Lavx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,0+256(%rbp) + vmovdqa %ymm14,0+224(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm12,0+160(%rbp) + + movq $10,%rcx +.align 32 +.Lseal_avx2_main_loop_rounds: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa .Lrol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld 
$20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + addq %rax,%r15 + adcq %rdx,%r9 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .Lrol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + +.Lseal_avx2_main_loop_rounds_entry: + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq 
%r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa .Lrol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + addq %rax,%r15 + adcq %rdx,%r9 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + addq 0+32(%rdi),%r10 + adcq 8+32(%rdi),%r11 + adcq $1,%r12 + + leaq 48(%rdi),%rdi + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .Lrol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq 
%r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + addq %rax,%r15 + adcq %rdx,%r9 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + decq %rcx + jne .Lseal_avx2_main_loop_rounds + vpaddd .Lchacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 0+64(%rbp),%ymm7,%ymm7 + vpaddd 0+96(%rbp),%ymm11,%ymm11 + vpaddd 0+256(%rbp),%ymm15,%ymm15 + vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd 
.Lchacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + + vmovdqa %ymm0,0+128(%rbp) + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 + vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 + vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vpxor 0+0(%rsi),%ymm0,%ymm0 + vpxor 32+0(%rsi),%ymm3,%ymm3 + vpxor 64+0(%rsi),%ymm7,%ymm7 + vpxor 96+0(%rsi),%ymm11,%ymm11 + vmovdqu %ymm0,0+0(%rdi) + vmovdqu %ymm3,32+0(%rdi) + vmovdqu %ymm7,64+0(%rdi) + vmovdqu %ymm11,96+0(%rdi) + + vmovdqa 0+128(%rbp),%ymm0 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 
32+128(%rsi),%ymm2,%ymm2 + vpxor 64+128(%rsi),%ymm6,%ymm6 + vpxor 96+128(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm2,32+128(%rdi) + vmovdqu %ymm6,64+128(%rdi) + vmovdqu %ymm10,96+128(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+256(%rsi),%ymm3,%ymm3 + vpxor 32+256(%rsi),%ymm1,%ymm1 + vpxor 64+256(%rsi),%ymm5,%ymm5 + vpxor 96+256(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+256(%rdi) + vmovdqu %ymm1,32+256(%rdi) + vmovdqu %ymm5,64+256(%rdi) + vmovdqu %ymm9,96+256(%rdi) + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x13,%ymm0,%ymm4,%ymm4 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm8 + vpxor 0+384(%rsi),%ymm3,%ymm3 + vpxor 32+384(%rsi),%ymm0,%ymm0 + vpxor 64+384(%rsi),%ymm4,%ymm4 + vpxor 96+384(%rsi),%ymm8,%ymm8 + vmovdqu %ymm3,0+384(%rdi) + vmovdqu %ymm0,32+384(%rdi) + vmovdqu %ymm4,64+384(%rdi) + vmovdqu %ymm8,96+384(%rdi) + + leaq 512(%rsi),%rsi + subq $512,%rbx + cmpq $512,%rbx + jg .Lseal_avx2_main_loop + + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq 
$0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + movq $10,%rcx + xorq %r8,%r8 + + cmpq $384,%rbx + ja .Lseal_avx2_tail_512 + cmpq $256,%rbx + ja .Lseal_avx2_tail_384 + cmpq $128,%rbx + ja .Lseal_avx2_tail_256 + +.Lseal_avx2_tail_128: + vmovdqa .Lchacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa .Lavx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + +.Lseal_avx2_tail_128_rounds_and_3xhash: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +.Lseal_avx2_tail_128_rounds_and_2xhash: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + 
movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + decq %rcx + jg .Lseal_avx2_tail_128_rounds_and_3xhash + decq %r8 + jge .Lseal_avx2_tail_128_rounds_and_2xhash + vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + 
vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + jmp .Lseal_avx2_short_loop + +.Lseal_avx2_tail_256: + vmovdqa .Lchacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa .Lavx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + +.Lseal_avx2_tail_256_rounds_and_3xhash: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +.Lseal_avx2_tail_256_rounds_and_2xhash: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld 
$20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld 
$7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + decq %rcx + jg .Lseal_avx2_tail_256_rounds_and_3xhash + decq %r8 + jge .Lseal_avx2_tail_256_rounds_and_2xhash + vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+0(%rsi),%ymm3,%ymm3 + vpxor 32+0(%rsi),%ymm1,%ymm1 + vpxor 64+0(%rsi),%ymm5,%ymm5 + vpxor 96+0(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+0(%rdi) + vmovdqu %ymm1,32+0(%rdi) + vmovdqu %ymm5,64+0(%rdi) + vmovdqu %ymm9,96+0(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + movq $128,%rcx + leaq 128(%rsi),%rsi + subq $128,%rbx + jmp .Lseal_avx2_short_hash_remainder + +.Lseal_avx2_tail_384: + vmovdqa .Lchacha20_consts(%rip),%ymm0 + vmovdqa 
0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa .Lavx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd %ymm13,%ymm12,%ymm12 + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm14,0+224(%rbp) + +.Lseal_avx2_tail_384_rounds_and_3xhash: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +.Lseal_avx2_tail_384_rounds_and_2xhash: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor 
%ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr 
$4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + + leaq 32(%rdi),%rdi + decq %rcx + jg .Lseal_avx2_tail_384_rounds_and_3xhash + decq %r8 + jge 
.Lseal_avx2_tail_384_rounds_and_2xhash + vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+0(%rsi),%ymm3,%ymm3 + vpxor 32+0(%rsi),%ymm2,%ymm2 + vpxor 64+0(%rsi),%ymm6,%ymm6 + vpxor 96+0(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+0(%rdi) + vmovdqu %ymm2,32+0(%rdi) + vmovdqu %ymm6,64+0(%rdi) + vmovdqu %ymm10,96+0(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm1,%ymm1 + vpxor 64+128(%rsi),%ymm5,%ymm5 + vpxor 96+128(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm1,32+128(%rdi) + vmovdqu %ymm5,64+128(%rdi) + vmovdqu %ymm9,96+128(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + movq $256,%rcx + leaq 256(%rsi),%rsi + subq $256,%rbx + jmp .Lseal_avx2_short_hash_remainder + +.Lseal_avx2_tail_512: + vmovdqa .Lchacha20_consts(%rip),%ymm0 + vmovdqa 0+64(%rbp),%ymm4 + vmovdqa 0+96(%rbp),%ymm8 + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm10 + vmovdqa %ymm0,%ymm3 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa .Lavx2_inc(%rip),%ymm12 + vpaddd 0+160(%rbp),%ymm12,%ymm15 + vpaddd %ymm15,%ymm12,%ymm14 + vpaddd %ymm14,%ymm12,%ymm13 + vpaddd 
%ymm13,%ymm12,%ymm12 + vmovdqa %ymm15,0+256(%rbp) + vmovdqa %ymm14,0+224(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm12,0+160(%rbp) + +.Lseal_avx2_tail_512_rounds_and_3xhash: + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + addq %rax,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi +.Lseal_avx2_tail_512_rounds_and_2xhash: + vmovdqa %ymm8,0+128(%rbp) + vmovdqa .Lrol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .Lrol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd 
%ymm4,%ymm0,%ymm0 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $4,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 + vpalignr $12,%ymm15,%ymm15,%ymm15 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $4,%ymm4,%ymm4,%ymm4 + addq %rax,%r15 + adcq %rdx,%r9 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vmovdqa %ymm8,0+128(%rbp) + vmovdqa .Lrol16(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd 
%ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $20,%ymm7,%ymm8 + vpslld $32-20,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $20,%ymm6,%ymm8 + vpslld $32-20,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $20,%ymm5,%ymm8 + vpslld $32-20,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $20,%ymm4,%ymm8 + vpslld $32-20,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa .Lrol8(%rip),%ymm8 + vpaddd %ymm7,%ymm3,%ymm3 + vpaddd %ymm6,%ymm2,%ymm2 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + vpaddd %ymm5,%ymm1,%ymm1 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm3,%ymm15,%ymm15 + vpxor %ymm2,%ymm14,%ymm14 + vpxor %ymm1,%ymm13,%ymm13 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb %ymm8,%ymm15,%ymm15 + vpshufb %ymm8,%ymm14,%ymm14 + vpshufb %ymm8,%ymm13,%ymm13 + vpshufb %ymm8,%ymm12,%ymm12 + vpaddd %ymm15,%ymm11,%ymm11 + vpaddd %ymm14,%ymm10,%ymm10 + vpaddd %ymm13,%ymm9,%ymm9 + vpaddd 0+128(%rbp),%ymm12,%ymm8 + vpxor %ymm11,%ymm7,%ymm7 + vpxor %ymm10,%ymm6,%ymm6 + vpxor %ymm9,%ymm5,%ymm5 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa %ymm8,0+128(%rbp) + vpsrld $25,%ymm7,%ymm8 + movq 0+0+0(%rbp),%rdx + movq %rdx,%r15 + mulxq %r10,%r13,%r14 + mulxq %r11,%rax,%rdx + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + vpslld $32-25,%ymm7,%ymm7 + vpxor %ymm8,%ymm7,%ymm7 + vpsrld $25,%ymm6,%ymm8 + vpslld $32-25,%ymm6,%ymm6 + vpxor %ymm8,%ymm6,%ymm6 + vpsrld $25,%ymm5,%ymm8 + vpslld $32-25,%ymm5,%ymm5 + vpxor %ymm8,%ymm5,%ymm5 + vpsrld $25,%ymm4,%ymm8 + vpslld $32-25,%ymm4,%ymm4 + vpxor %ymm8,%ymm4,%ymm4 + vmovdqa 0+128(%rbp),%ymm8 + vpalignr $12,%ymm7,%ymm7,%ymm7 + vpalignr $8,%ymm11,%ymm11,%ymm11 
+ vpalignr $4,%ymm15,%ymm15,%ymm15 + vpalignr $12,%ymm6,%ymm6,%ymm6 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpalignr $8,%ymm9,%ymm9,%ymm9 + movq 8+0+0(%rbp),%rdx + mulxq %r10,%r10,%rax + addq %r10,%r14 + mulxq %r11,%r11,%r9 + adcq %r11,%r15 + adcq $0,%r9 + imulq %r12,%rdx + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm12,%ymm12,%ymm12 + + + + + + + + + + + + + + + + + addq %rax,%r15 + adcq %rdx,%r9 + + + + + + + + + + + + + + + + + + + + + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + decq %rcx + jg .Lseal_avx2_tail_512_rounds_and_3xhash + decq %r8 + jge .Lseal_avx2_tail_512_rounds_and_2xhash + vpaddd .Lchacha20_consts(%rip),%ymm3,%ymm3 + vpaddd 0+64(%rbp),%ymm7,%ymm7 + vpaddd 0+96(%rbp),%ymm11,%ymm11 + vpaddd 0+256(%rbp),%ymm15,%ymm15 + vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 + vpaddd 0+64(%rbp),%ymm6,%ymm6 + vpaddd 0+96(%rbp),%ymm10,%ymm10 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 + vpaddd 0+64(%rbp),%ymm5,%ymm5 + vpaddd 0+96(%rbp),%ymm9,%ymm9 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 + vpaddd 0+64(%rbp),%ymm4,%ymm4 + vpaddd 0+96(%rbp),%ymm8,%ymm8 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + + vmovdqa %ymm0,0+128(%rbp) + vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 + vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 + vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 + vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 + vpxor 0+0(%rsi),%ymm0,%ymm0 + vpxor 32+0(%rsi),%ymm3,%ymm3 + vpxor 64+0(%rsi),%ymm7,%ymm7 + vpxor 96+0(%rsi),%ymm11,%ymm11 + vmovdqu %ymm0,0+0(%rdi) + vmovdqu %ymm3,32+0(%rdi) + vmovdqu %ymm7,64+0(%rdi) + vmovdqu %ymm11,96+0(%rdi) + + vmovdqa 0+128(%rbp),%ymm0 + vperm2i128 
$0x02,%ymm2,%ymm6,%ymm3 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 + vpxor 0+128(%rsi),%ymm3,%ymm3 + vpxor 32+128(%rsi),%ymm2,%ymm2 + vpxor 64+128(%rsi),%ymm6,%ymm6 + vpxor 96+128(%rsi),%ymm10,%ymm10 + vmovdqu %ymm3,0+128(%rdi) + vmovdqu %ymm2,32+128(%rdi) + vmovdqu %ymm6,64+128(%rdi) + vmovdqu %ymm10,96+128(%rdi) + vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 + vpxor 0+256(%rsi),%ymm3,%ymm3 + vpxor 32+256(%rsi),%ymm1,%ymm1 + vpxor 64+256(%rsi),%ymm5,%ymm5 + vpxor 96+256(%rsi),%ymm9,%ymm9 + vmovdqu %ymm3,0+256(%rdi) + vmovdqu %ymm1,32+256(%rdi) + vmovdqu %ymm5,64+256(%rdi) + vmovdqu %ymm9,96+256(%rdi) + vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 + vmovdqa %ymm3,%ymm8 + + movq $384,%rcx + leaq 384(%rsi),%rsi + subq $384,%rbx + jmp .Lseal_avx2_short_hash_remainder + +.Lseal_avx2_320: + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm8,%ymm10 + vpaddd .Lavx2_inc(%rip),%ymm12,%ymm13 + vpaddd .Lavx2_inc(%rip),%ymm13,%ymm14 + vmovdqa %ymm4,%ymm7 + vmovdqa %ymm8,%ymm11 + vmovdqa %ymm12,0+160(%rbp) + vmovdqa %ymm13,0+192(%rbp) + vmovdqa %ymm14,0+224(%rbp) + movq $10,%r10 +.Lseal_avx2_320_rounds: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr 
$4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $12,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $4,%ymm6,%ymm6,%ymm6 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd 
%ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol16(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpsrld $20,%ymm6,%ymm3 + vpslld $12,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpaddd %ymm6,%ymm2,%ymm2 + vpxor %ymm2,%ymm14,%ymm14 + vpshufb .Lrol8(%rip),%ymm14,%ymm14 + vpaddd %ymm14,%ymm10,%ymm10 + vpxor %ymm10,%ymm6,%ymm6 + vpslld $7,%ymm6,%ymm3 + vpsrld $25,%ymm6,%ymm6 + vpxor %ymm3,%ymm6,%ymm6 + vpalignr $4,%ymm14,%ymm14,%ymm14 + vpalignr $8,%ymm10,%ymm10,%ymm10 + vpalignr $12,%ymm6,%ymm6,%ymm6 + + decq %r10 + jne .Lseal_avx2_320_rounds + vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 + vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 + vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 + vpaddd %ymm7,%ymm4,%ymm4 + vpaddd %ymm7,%ymm5,%ymm5 + vpaddd %ymm7,%ymm6,%ymm6 + vpaddd %ymm11,%ymm8,%ymm8 + vpaddd %ymm11,%ymm9,%ymm9 + vpaddd %ymm11,%ymm10,%ymm10 + vpaddd 0+160(%rbp),%ymm12,%ymm12 + vpaddd 0+192(%rbp),%ymm13,%ymm13 + vpaddd 0+224(%rbp),%ymm14,%ymm14 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + + vpand .Lclamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0+0(%rbp) + + vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 + vperm2i128 $0x02,%ymm2,%ymm6,%ymm9 + vperm2i128 $0x02,%ymm10,%ymm14,%ymm13 + vperm2i128 $0x13,%ymm2,%ymm6,%ymm2 + vperm2i128 $0x13,%ymm10,%ymm14,%ymm6 + jmp .Lseal_avx2_short + +.Lseal_avx2_192: + vmovdqa %ymm0,%ymm1 + vmovdqa %ymm0,%ymm2 + vmovdqa %ymm4,%ymm5 + vmovdqa %ymm4,%ymm6 + vmovdqa %ymm8,%ymm9 + vmovdqa %ymm8,%ymm10 + vpaddd .Lavx2_inc(%rip),%ymm12,%ymm13 + vmovdqa %ymm12,%ymm11 + vmovdqa %ymm13,%ymm15 + movq $10,%r10 
+.Lseal_avx2_192_rounds: + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $12,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $4,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $12,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $4,%ymm5,%ymm5,%ymm5 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol16(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpsrld $20,%ymm4,%ymm3 + vpslld $12,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpaddd %ymm4,%ymm0,%ymm0 + vpxor %ymm0,%ymm12,%ymm12 + vpshufb .Lrol8(%rip),%ymm12,%ymm12 + vpaddd %ymm12,%ymm8,%ymm8 + vpxor %ymm8,%ymm4,%ymm4 + vpslld $7,%ymm4,%ymm3 + vpsrld $25,%ymm4,%ymm4 + vpxor %ymm3,%ymm4,%ymm4 + vpalignr $4,%ymm12,%ymm12,%ymm12 + vpalignr $8,%ymm8,%ymm8,%ymm8 + vpalignr $12,%ymm4,%ymm4,%ymm4 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol16(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 + vpxor %ymm9,%ymm5,%ymm5 + vpsrld $20,%ymm5,%ymm3 + vpslld $12,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpaddd %ymm5,%ymm1,%ymm1 + vpxor %ymm1,%ymm13,%ymm13 + vpshufb .Lrol8(%rip),%ymm13,%ymm13 + vpaddd %ymm13,%ymm9,%ymm9 
+ vpxor %ymm9,%ymm5,%ymm5 + vpslld $7,%ymm5,%ymm3 + vpsrld $25,%ymm5,%ymm5 + vpxor %ymm3,%ymm5,%ymm5 + vpalignr $4,%ymm13,%ymm13,%ymm13 + vpalignr $8,%ymm9,%ymm9,%ymm9 + vpalignr $12,%ymm5,%ymm5,%ymm5 + + decq %r10 + jne .Lseal_avx2_192_rounds + vpaddd %ymm2,%ymm0,%ymm0 + vpaddd %ymm2,%ymm1,%ymm1 + vpaddd %ymm6,%ymm4,%ymm4 + vpaddd %ymm6,%ymm5,%ymm5 + vpaddd %ymm10,%ymm8,%ymm8 + vpaddd %ymm10,%ymm9,%ymm9 + vpaddd %ymm11,%ymm12,%ymm12 + vpaddd %ymm15,%ymm13,%ymm13 + vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 + + vpand .Lclamp(%rip),%ymm3,%ymm3 + vmovdqa %ymm3,0+0(%rbp) + + vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 + vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 + vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 + vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 + vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 + vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 +.Lseal_avx2_short: + movq %r8,%r8 + call poly_hash_ad_internal + xorq %rcx,%rcx +.Lseal_avx2_short_hash_remainder: + cmpq $16,%rcx + jb .Lseal_avx2_short_loop + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + subq $16,%rcx + addq $16,%rdi + jmp .Lseal_avx2_short_hash_remainder +.Lseal_avx2_short_loop: + cmpq $32,%rbx + jb .Lseal_avx2_short_tail + subq $32,%rbx + + vpxor (%rsi),%ymm0,%ymm0 + vmovdqu %ymm0,(%rdi) + leaq 32(%rsi),%rsi + + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 
+ movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + addq 0+16(%rdi),%r10 + adcq 8+16(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 32(%rdi),%rdi + + vmovdqa %ymm4,%ymm0 + vmovdqa %ymm8,%ymm4 + vmovdqa %ymm12,%ymm8 + vmovdqa %ymm1,%ymm12 + vmovdqa %ymm5,%ymm1 + vmovdqa %ymm9,%ymm5 + vmovdqa %ymm13,%ymm9 + vmovdqa %ymm2,%ymm13 + vmovdqa %ymm6,%ymm2 + jmp .Lseal_avx2_short_loop +.Lseal_avx2_short_tail: + cmpq $16,%rbx + jb .Lseal_avx2_exit + subq $16,%rbx + vpxor (%rsi),%xmm0,%xmm3 + vmovdqu %xmm3,(%rdi) + leaq 16(%rsi),%rsi + addq 0+0(%rdi),%r10 + adcq 8+0(%rdi),%r11 + adcq $1,%r12 + movq 0+0+0(%rbp),%rax + movq %rax,%r15 + mulq %r10 + movq %rax,%r13 + movq %rdx,%r14 + movq 0+0+0(%rbp),%rax + mulq %r11 + imulq %r12,%r15 + addq %rax,%r14 + adcq %rdx,%r15 + movq 8+0+0(%rbp),%rax + movq %rax,%r9 + mulq %r10 + addq %rax,%r14 + adcq $0,%rdx + movq %rdx,%r10 + movq 
8+0+0(%rbp),%rax + mulq %r11 + addq %rax,%r15 + adcq $0,%rdx + imulq %r12,%r9 + addq %r10,%r15 + adcq %rdx,%r9 + movq %r13,%r10 + movq %r14,%r11 + movq %r15,%r12 + andq $3,%r12 + movq %r15,%r13 + andq $-4,%r13 + movq %r9,%r14 + shrdq $2,%r9,%r15 + shrq $2,%r9 + addq %r13,%r15 + adcq %r14,%r9 + addq %r15,%r10 + adcq %r9,%r11 + adcq $0,%r12 + + leaq 16(%rdi),%rdi + vextracti128 $1,%ymm0,%xmm0 +.Lseal_avx2_exit: + vzeroupper + jmp .Lseal_sse_tail_16 +.cfi_endproc +.size chacha20_poly1305_seal_avx2, .-chacha20_poly1305_seal_avx2 +#endif diff --git a/third_party/boringssl/gen/crypto/chacha20_poly1305_x86_64-win.asm b/third_party/boringssl/gen/crypto/chacha20_poly1305_x86_64-win.asm new file mode 100644 index 00000000..48916779 --- /dev/null +++ b/third_party/boringssl/gen/crypto/chacha20_poly1305_x86_64-win.asm @@ -0,0 +1,9023 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. + +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_internal_x86_64_win_asm.inc" +%endif +section .rdata rdata align=8 +ALIGN 64 +chacha20_poly1305_constants: +$L$chacha20_consts: + DB 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' + DB 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' +$L$rol8: + DB 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 + DB 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 +$L$rol16: + DB 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 + DB 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 +$L$avx2_init: + DD 0,0,0,0 +$L$sse_inc: + DD 1,0,0,0 +$L$avx2_inc: + DD 2,0,0,0,2,0,0,0 +$L$clamp: + DQ 0x0FFFFFFC0FFFFFFF,0x0FFFFFFC0FFFFFFC + DQ 0xFFFFFFFFFFFFFFFF,0xFFFFFFFFFFFFFFFF +ALIGN 16 +$L$and_masks: + DB 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + DB 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + DB 
0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + DB 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + DB 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + DB 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00 + DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00 + DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00 + DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00 + DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00 + DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00 + DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00 + DB 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff +section .text code align=64 + + + +ALIGN 64 +poly_hash_ad_internal: + + + xor r10,r10 + xor r11,r11 + xor r12,r12 + cmp r8,13 + jne NEAR $L$hash_ad_loop +$L$poly_fast_tls_ad: + + mov r10,QWORD[rcx] + mov r11,QWORD[5+rcx] + shr r11,24 + mov r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + 
ret +$L$hash_ad_loop: + + cmp r8,16 + jb NEAR $L$hash_ad_tail + add r10,QWORD[((0+0))+rcx] + adc r11,QWORD[((8+0))+rcx] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rcx,[16+rcx] + sub r8,16 + jmp NEAR $L$hash_ad_loop +$L$hash_ad_tail: + cmp r8,0 + je NEAR $L$hash_ad_done + + xor r13,r13 + xor r14,r14 + xor r15,r15 + add rcx,r8 +$L$hash_ad_tail_loop: + shld r14,r13,8 + shl r13,8 + movzx r15,BYTE[((-1))+rcx] + xor r13,r15 + dec rcx + dec r8 + jne NEAR $L$hash_ad_tail_loop + + add r10,r13 + adc r11,r14 + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + +$L$hash_ad_done: + ret + + + +global chacha20_poly1305_open_sse41 + +ALIGN 64 +chacha20_poly1305_open_sse41: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_chacha20_poly1305_open_sse41: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +_CET_ENDBR + push rbp + + push rbx + 
+ push r12 + + push r13 + + push r14 + + push r15 + + + + push r9 + + sub rsp,288 + 160 + 32 + + + lea rbp,[32+rsp] + and rbp,-32 + + movaps XMMWORD[(0+0)+rbp],xmm6 + movaps XMMWORD[(16+0)+rbp],xmm7 + movaps XMMWORD[(32+0)+rbp],xmm8 + movaps XMMWORD[(48+0)+rbp],xmm9 + movaps XMMWORD[(64+0)+rbp],xmm10 + movaps XMMWORD[(80+0)+rbp],xmm11 + movaps XMMWORD[(96+0)+rbp],xmm12 + movaps XMMWORD[(112+0)+rbp],xmm13 + movaps XMMWORD[(128+0)+rbp],xmm14 + movaps XMMWORD[(144+0)+rbp],xmm15 + + mov rbx,rdx + mov QWORD[((0+160+32))+rbp],r8 + mov QWORD[((8+160+32))+rbp],rbx + + cmp rbx,128 + jbe NEAR $L$open_sse_128 + + movdqa xmm0,XMMWORD[$L$chacha20_consts] + movdqu xmm4,XMMWORD[r9] + movdqu xmm8,XMMWORD[16+r9] + movdqu xmm12,XMMWORD[32+r9] + + movdqa xmm7,xmm12 + + movdqa XMMWORD[(160+48)+rbp],xmm4 + movdqa XMMWORD[(160+64)+rbp],xmm8 + movdqa XMMWORD[(160+96)+rbp],xmm12 + mov r10,10 +$L$open_sse_init_rounds: + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 + palignr xmm4,xmm4,4 + palignr xmm8,xmm8,8 + palignr xmm12,xmm12,12 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 + palignr xmm4,xmm4,12 + palignr xmm8,xmm8,8 + palignr xmm12,xmm12,4 + + dec r10 + jne NEAR $L$open_sse_init_rounds + + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm4,XMMWORD[((160+48))+rbp] + + pand xmm0,XMMWORD[$L$clamp] + movdqa XMMWORD[(160+0)+rbp],xmm0 + movdqa XMMWORD[(160+16)+rbp],xmm4 + + mov r8,r8 + call 
poly_hash_ad_internal +$L$open_sse_main_loop: + cmp rbx,16*16 + jb NEAR $L$open_sse_tail + + movdqa xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm4,XMMWORD[((160+48))+rbp] + movdqa xmm8,XMMWORD[((160+64))+rbp] + movdqa xmm1,xmm0 + movdqa xmm5,xmm4 + movdqa xmm9,xmm8 + movdqa xmm2,xmm0 + movdqa xmm6,xmm4 + movdqa xmm10,xmm8 + movdqa xmm3,xmm0 + movdqa xmm7,xmm4 + movdqa xmm11,xmm8 + movdqa xmm15,XMMWORD[((160+96))+rbp] + paddd xmm15,XMMWORD[$L$sse_inc] + movdqa xmm14,xmm15 + paddd xmm14,XMMWORD[$L$sse_inc] + movdqa xmm13,xmm14 + paddd xmm13,XMMWORD[$L$sse_inc] + movdqa xmm12,xmm13 + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa XMMWORD[(160+96)+rbp],xmm12 + movdqa XMMWORD[(160+112)+rbp],xmm13 + movdqa XMMWORD[(160+128)+rbp],xmm14 + movdqa XMMWORD[(160+144)+rbp],xmm15 + + + + mov rcx,4 + mov r8,rsi +$L$open_sse_main_loop_rounds: + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,XMMWORD[$L$rol16] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + pxor xmm13,xmm1 + pxor xmm12,xmm0 + pshufb xmm15,xmm8 + pshufb xmm14,xmm8 + pshufb xmm13,xmm8 + pshufb xmm12,xmm8 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + add r10,QWORD[((0+0))+r8] + adc r11,QWORD[((8+0))+r8] + adc r12,1 + + lea r8,[16+r8] + pxor xmm6,xmm10 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,20 + pslld xmm7,32-20 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,20 + pslld xmm6,32-20 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,20 + pslld xmm5,32-20 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,20 + pslld xmm4,32-20 + pxor xmm4,xmm8 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + movdqa xmm8,XMMWORD[$L$rol8] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor 
xmm15,xmm3 + pxor xmm14,xmm2 + pxor xmm13,xmm1 + pxor xmm12,xmm0 + pshufb xmm15,xmm8 + pshufb xmm14,xmm8 + pshufb xmm13,xmm8 + pshufb xmm12,xmm8 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + pxor xmm6,xmm10 + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,25 + pslld xmm7,32-25 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,25 + pslld xmm6,32-25 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,25 + pslld xmm5,32-25 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,25 + pslld xmm4,32-25 + pxor xmm4,xmm8 + movdqa xmm8,XMMWORD[((160+80))+rbp] + imul r9,r12 + add r15,r10 + adc r9,rdx + palignr xmm7,xmm7,4 + palignr xmm11,xmm11,8 + palignr xmm15,xmm15,12 + palignr xmm6,xmm6,4 + palignr xmm10,xmm10,8 + palignr xmm14,xmm14,12 + palignr xmm5,xmm5,4 + palignr xmm9,xmm9,8 + palignr xmm13,xmm13,12 + palignr xmm4,xmm4,4 + palignr xmm8,xmm8,8 + palignr xmm12,xmm12,12 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,XMMWORD[$L$rol16] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + pxor xmm13,xmm1 + pxor xmm12,xmm0 + pshufb xmm15,xmm8 + pshufb xmm14,xmm8 + pshufb xmm13,xmm8 + pshufb xmm12,xmm8 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + pxor xmm6,xmm10 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,20 + pslld xmm7,32-20 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,20 + pslld 
xmm6,32-20 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,20 + pslld xmm5,32-20 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,20 + pslld xmm4,32-20 + pxor xmm4,xmm8 + movdqa xmm8,XMMWORD[$L$rol8] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + pxor xmm13,xmm1 + pxor xmm12,xmm0 + pshufb xmm15,xmm8 + pshufb xmm14,xmm8 + pshufb xmm13,xmm8 + pshufb xmm12,xmm8 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + pxor xmm6,xmm10 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,25 + pslld xmm7,32-25 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,25 + pslld xmm6,32-25 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,25 + pslld xmm5,32-25 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,25 + pslld xmm4,32-25 + pxor xmm4,xmm8 + movdqa xmm8,XMMWORD[((160+80))+rbp] + palignr xmm7,xmm7,12 + palignr xmm11,xmm11,8 + palignr xmm15,xmm15,4 + palignr xmm6,xmm6,12 + palignr xmm10,xmm10,8 + palignr xmm14,xmm14,4 + palignr xmm5,xmm5,12 + palignr xmm9,xmm9,8 + palignr xmm13,xmm13,4 + palignr xmm4,xmm4,12 + palignr xmm8,xmm8,8 + palignr xmm12,xmm12,4 + + dec rcx + jge NEAR $L$open_sse_main_loop_rounds + add r10,QWORD[((0+0))+r8] + adc r11,QWORD[((8+0))+r8] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea r8,[16+r8] + cmp rcx,-6 + jg NEAR 
$L$open_sse_main_loop_rounds + paddd xmm3,XMMWORD[$L$chacha20_consts] + paddd xmm7,XMMWORD[((160+48))+rbp] + paddd xmm11,XMMWORD[((160+64))+rbp] + paddd xmm15,XMMWORD[((160+144))+rbp] + paddd xmm2,XMMWORD[$L$chacha20_consts] + paddd xmm6,XMMWORD[((160+48))+rbp] + paddd xmm10,XMMWORD[((160+64))+rbp] + paddd xmm14,XMMWORD[((160+128))+rbp] + paddd xmm1,XMMWORD[$L$chacha20_consts] + paddd xmm5,XMMWORD[((160+48))+rbp] + paddd xmm9,XMMWORD[((160+64))+rbp] + paddd xmm13,XMMWORD[((160+112))+rbp] + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm4,XMMWORD[((160+48))+rbp] + paddd xmm8,XMMWORD[((160+64))+rbp] + paddd xmm12,XMMWORD[((160+96))+rbp] + movdqa XMMWORD[(160+80)+rbp],xmm12 + movdqu xmm12,XMMWORD[((0 + 0))+rsi] + pxor xmm12,xmm3 + movdqu XMMWORD[(0 + 0)+rdi],xmm12 + movdqu xmm12,XMMWORD[((16 + 0))+rsi] + pxor xmm12,xmm7 + movdqu XMMWORD[(16 + 0)+rdi],xmm12 + movdqu xmm12,XMMWORD[((32 + 0))+rsi] + pxor xmm12,xmm11 + movdqu XMMWORD[(32 + 0)+rdi],xmm12 + movdqu xmm12,XMMWORD[((48 + 0))+rsi] + pxor xmm12,xmm15 + movdqu XMMWORD[(48 + 0)+rdi],xmm12 + movdqu xmm3,XMMWORD[((0 + 64))+rsi] + movdqu xmm7,XMMWORD[((16 + 64))+rsi] + movdqu xmm11,XMMWORD[((32 + 64))+rsi] + movdqu xmm15,XMMWORD[((48 + 64))+rsi] + pxor xmm2,xmm3 + pxor xmm6,xmm7 + pxor xmm10,xmm11 + pxor xmm15,xmm14 + movdqu XMMWORD[(0 + 64)+rdi],xmm2 + movdqu XMMWORD[(16 + 64)+rdi],xmm6 + movdqu XMMWORD[(32 + 64)+rdi],xmm10 + movdqu XMMWORD[(48 + 64)+rdi],xmm15 + movdqu xmm3,XMMWORD[((0 + 128))+rsi] + movdqu xmm7,XMMWORD[((16 + 128))+rsi] + movdqu xmm11,XMMWORD[((32 + 128))+rsi] + movdqu xmm15,XMMWORD[((48 + 128))+rsi] + pxor xmm1,xmm3 + pxor xmm5,xmm7 + pxor xmm9,xmm11 + pxor xmm15,xmm13 + movdqu XMMWORD[(0 + 128)+rdi],xmm1 + movdqu XMMWORD[(16 + 128)+rdi],xmm5 + movdqu XMMWORD[(32 + 128)+rdi],xmm9 + movdqu XMMWORD[(48 + 128)+rdi],xmm15 + movdqu xmm3,XMMWORD[((0 + 192))+rsi] + movdqu xmm7,XMMWORD[((16 + 192))+rsi] + movdqu xmm11,XMMWORD[((32 + 192))+rsi] + movdqu xmm15,XMMWORD[((48 + 192))+rsi] + pxor xmm0,xmm3 
+ pxor xmm4,xmm7 + pxor xmm8,xmm11 + pxor xmm15,XMMWORD[((160+80))+rbp] + movdqu XMMWORD[(0 + 192)+rdi],xmm0 + movdqu XMMWORD[(16 + 192)+rdi],xmm4 + movdqu XMMWORD[(32 + 192)+rdi],xmm8 + movdqu XMMWORD[(48 + 192)+rdi],xmm15 + + lea rsi,[256+rsi] + lea rdi,[256+rdi] + sub rbx,16*16 + jmp NEAR $L$open_sse_main_loop +$L$open_sse_tail: + + test rbx,rbx + jz NEAR $L$open_sse_finalize + cmp rbx,12*16 + ja NEAR $L$open_sse_tail_256 + cmp rbx,8*16 + ja NEAR $L$open_sse_tail_192 + cmp rbx,4*16 + ja NEAR $L$open_sse_tail_128 + movdqa xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm4,XMMWORD[((160+48))+rbp] + movdqa xmm8,XMMWORD[((160+64))+rbp] + movdqa xmm12,XMMWORD[((160+96))+rbp] + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa XMMWORD[(160+96)+rbp],xmm12 + + xor r8,r8 + mov rcx,rbx + cmp rcx,16 + jb NEAR $L$open_sse_tail_64_rounds +$L$open_sse_tail_64_rounds_and_x1hash: + add r10,QWORD[((0+0))+r8*1+rsi] + adc r11,QWORD[((8+0))+r8*1+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + sub rcx,16 +$L$open_sse_tail_64_rounds: + add r8,16 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 + palignr xmm4,xmm4,4 + palignr xmm8,xmm8,8 + palignr xmm12,xmm12,12 + paddd xmm0,xmm4 + 
pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 + palignr xmm4,xmm4,12 + palignr xmm8,xmm8,8 + palignr xmm12,xmm12,4 + + cmp rcx,16 + jae NEAR $L$open_sse_tail_64_rounds_and_x1hash + cmp r8,10*16 + jne NEAR $L$open_sse_tail_64_rounds + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm4,XMMWORD[((160+48))+rbp] + paddd xmm8,XMMWORD[((160+64))+rbp] + paddd xmm12,XMMWORD[((160+96))+rbp] + + jmp NEAR $L$open_sse_tail_64_dec_loop + +$L$open_sse_tail_128: + movdqa xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm4,XMMWORD[((160+48))+rbp] + movdqa xmm8,XMMWORD[((160+64))+rbp] + movdqa xmm1,xmm0 + movdqa xmm5,xmm4 + movdqa xmm9,xmm8 + movdqa xmm13,XMMWORD[((160+96))+rbp] + paddd xmm13,XMMWORD[$L$sse_inc] + movdqa xmm12,xmm13 + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa XMMWORD[(160+96)+rbp],xmm12 + movdqa XMMWORD[(160+112)+rbp],xmm13 + + mov rcx,rbx + and rcx,-16 + xor r8,r8 +$L$open_sse_tail_128_rounds_and_x1hash: + add r10,QWORD[((0+0))+r8*1+rsi] + adc r11,QWORD[((8+0))+r8*1+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + +$L$open_sse_tail_128_rounds: + add r8,16 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa 
xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 + palignr xmm4,xmm4,4 + palignr xmm8,xmm8,8 + palignr xmm12,xmm12,12 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 + palignr xmm5,xmm5,4 + palignr xmm9,xmm9,8 + palignr xmm13,xmm13,12 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 + palignr xmm4,xmm4,12 + palignr xmm8,xmm8,8 + palignr xmm12,xmm12,4 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 + palignr xmm5,xmm5,12 + palignr xmm9,xmm9,8 + palignr xmm13,xmm13,4 + + cmp r8,rcx + jb NEAR $L$open_sse_tail_128_rounds_and_x1hash + cmp r8,10*16 + jne NEAR $L$open_sse_tail_128_rounds + paddd xmm1,XMMWORD[$L$chacha20_consts] + paddd xmm5,XMMWORD[((160+48))+rbp] + paddd xmm9,XMMWORD[((160+64))+rbp] + paddd xmm13,XMMWORD[((160+112))+rbp] + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm4,XMMWORD[((160+48))+rbp] + paddd xmm8,XMMWORD[((160+64))+rbp] + paddd xmm12,XMMWORD[((160+96))+rbp] + movdqu xmm3,XMMWORD[((0 + 0))+rsi] 
+ movdqu xmm7,XMMWORD[((16 + 0))+rsi] + movdqu xmm11,XMMWORD[((32 + 0))+rsi] + movdqu xmm15,XMMWORD[((48 + 0))+rsi] + pxor xmm1,xmm3 + pxor xmm5,xmm7 + pxor xmm9,xmm11 + pxor xmm15,xmm13 + movdqu XMMWORD[(0 + 0)+rdi],xmm1 + movdqu XMMWORD[(16 + 0)+rdi],xmm5 + movdqu XMMWORD[(32 + 0)+rdi],xmm9 + movdqu XMMWORD[(48 + 0)+rdi],xmm15 + + sub rbx,4*16 + lea rsi,[64+rsi] + lea rdi,[64+rdi] + jmp NEAR $L$open_sse_tail_64_dec_loop + +$L$open_sse_tail_192: + movdqa xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm4,XMMWORD[((160+48))+rbp] + movdqa xmm8,XMMWORD[((160+64))+rbp] + movdqa xmm1,xmm0 + movdqa xmm5,xmm4 + movdqa xmm9,xmm8 + movdqa xmm2,xmm0 + movdqa xmm6,xmm4 + movdqa xmm10,xmm8 + movdqa xmm14,XMMWORD[((160+96))+rbp] + paddd xmm14,XMMWORD[$L$sse_inc] + movdqa xmm13,xmm14 + paddd xmm13,XMMWORD[$L$sse_inc] + movdqa xmm12,xmm13 + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa XMMWORD[(160+96)+rbp],xmm12 + movdqa XMMWORD[(160+112)+rbp],xmm13 + movdqa XMMWORD[(160+128)+rbp],xmm14 + + mov rcx,rbx + mov r8,10*16 + cmp rcx,10*16 + cmovg rcx,r8 + and rcx,-16 + xor r8,r8 +$L$open_sse_tail_192_rounds_and_x1hash: + add r10,QWORD[((0+0))+r8*1+rsi] + adc r11,QWORD[((8+0))+r8*1+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + +$L$open_sse_tail_192_rounds: + add r8,16 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor 
xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 + palignr xmm4,xmm4,4 + palignr xmm8,xmm8,8 + palignr xmm12,xmm12,12 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 + palignr xmm5,xmm5,4 + palignr xmm9,xmm9,8 + palignr xmm13,xmm13,12 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol16] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,12 + psrld xmm6,20 + pxor xmm6,xmm3 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol8] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,7 + psrld xmm6,25 + pxor xmm6,xmm3 + palignr xmm6,xmm6,4 + palignr xmm10,xmm10,8 + palignr xmm14,xmm14,12 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 + palignr xmm4,xmm4,12 + palignr xmm8,xmm8,8 + palignr xmm12,xmm12,4 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 + palignr xmm5,xmm5,12 + palignr xmm9,xmm9,8 + palignr xmm13,xmm13,4 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol16] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 
+ pslld xmm3,12 + psrld xmm6,20 + pxor xmm6,xmm3 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol8] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,7 + psrld xmm6,25 + pxor xmm6,xmm3 + palignr xmm6,xmm6,12 + palignr xmm10,xmm10,8 + palignr xmm14,xmm14,4 + + cmp r8,rcx + jb NEAR $L$open_sse_tail_192_rounds_and_x1hash + cmp r8,10*16 + jne NEAR $L$open_sse_tail_192_rounds + cmp rbx,11*16 + jb NEAR $L$open_sse_tail_192_finish + add r10,QWORD[((0+160))+rsi] + adc r11,QWORD[((8+160))+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + cmp rbx,12*16 + jb NEAR $L$open_sse_tail_192_finish + add r10,QWORD[((0+176))+rsi] + adc r11,QWORD[((8+176))+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + +$L$open_sse_tail_192_finish: + paddd xmm2,XMMWORD[$L$chacha20_consts] + paddd xmm6,XMMWORD[((160+48))+rbp] + paddd xmm10,XMMWORD[((160+64))+rbp] + paddd xmm14,XMMWORD[((160+128))+rbp] + paddd 
xmm1,XMMWORD[$L$chacha20_consts] + paddd xmm5,XMMWORD[((160+48))+rbp] + paddd xmm9,XMMWORD[((160+64))+rbp] + paddd xmm13,XMMWORD[((160+112))+rbp] + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm4,XMMWORD[((160+48))+rbp] + paddd xmm8,XMMWORD[((160+64))+rbp] + paddd xmm12,XMMWORD[((160+96))+rbp] + movdqu xmm3,XMMWORD[((0 + 0))+rsi] + movdqu xmm7,XMMWORD[((16 + 0))+rsi] + movdqu xmm11,XMMWORD[((32 + 0))+rsi] + movdqu xmm15,XMMWORD[((48 + 0))+rsi] + pxor xmm2,xmm3 + pxor xmm6,xmm7 + pxor xmm10,xmm11 + pxor xmm15,xmm14 + movdqu XMMWORD[(0 + 0)+rdi],xmm2 + movdqu XMMWORD[(16 + 0)+rdi],xmm6 + movdqu XMMWORD[(32 + 0)+rdi],xmm10 + movdqu XMMWORD[(48 + 0)+rdi],xmm15 + movdqu xmm3,XMMWORD[((0 + 64))+rsi] + movdqu xmm7,XMMWORD[((16 + 64))+rsi] + movdqu xmm11,XMMWORD[((32 + 64))+rsi] + movdqu xmm15,XMMWORD[((48 + 64))+rsi] + pxor xmm1,xmm3 + pxor xmm5,xmm7 + pxor xmm9,xmm11 + pxor xmm15,xmm13 + movdqu XMMWORD[(0 + 64)+rdi],xmm1 + movdqu XMMWORD[(16 + 64)+rdi],xmm5 + movdqu XMMWORD[(32 + 64)+rdi],xmm9 + movdqu XMMWORD[(48 + 64)+rdi],xmm15 + + sub rbx,8*16 + lea rsi,[128+rsi] + lea rdi,[128+rdi] + jmp NEAR $L$open_sse_tail_64_dec_loop + +$L$open_sse_tail_256: + movdqa xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm4,XMMWORD[((160+48))+rbp] + movdqa xmm8,XMMWORD[((160+64))+rbp] + movdqa xmm1,xmm0 + movdqa xmm5,xmm4 + movdqa xmm9,xmm8 + movdqa xmm2,xmm0 + movdqa xmm6,xmm4 + movdqa xmm10,xmm8 + movdqa xmm3,xmm0 + movdqa xmm7,xmm4 + movdqa xmm11,xmm8 + movdqa xmm15,XMMWORD[((160+96))+rbp] + paddd xmm15,XMMWORD[$L$sse_inc] + movdqa xmm14,xmm15 + paddd xmm14,XMMWORD[$L$sse_inc] + movdqa xmm13,xmm14 + paddd xmm13,XMMWORD[$L$sse_inc] + movdqa xmm12,xmm13 + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa XMMWORD[(160+96)+rbp],xmm12 + movdqa XMMWORD[(160+112)+rbp],xmm13 + movdqa XMMWORD[(160+128)+rbp],xmm14 + movdqa XMMWORD[(160+144)+rbp],xmm15 + + xor r8,r8 +$L$open_sse_tail_256_rounds_and_x1hash: + add r10,QWORD[((0+0))+r8*1+rsi] + adc r11,QWORD[((8+0))+r8*1+rsi] + adc r12,1 + movdqa 
XMMWORD[(160+80)+rbp],xmm11 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm11,xmm4 + pslld xmm11,12 + psrld xmm4,20 + pxor xmm4,xmm11 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm11,xmm4 + pslld xmm11,7 + psrld xmm4,25 + pxor xmm4,xmm11 + palignr xmm4,xmm4,4 + palignr xmm8,xmm8,8 + palignr xmm12,xmm12,12 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm11,xmm5 + pslld xmm11,12 + psrld xmm5,20 + pxor xmm5,xmm11 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm11,xmm5 + pslld xmm11,7 + psrld xmm5,25 + pxor xmm5,xmm11 + palignr xmm5,xmm5,4 + palignr xmm9,xmm9,8 + palignr xmm13,xmm13,12 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol16] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm11,xmm6 + pslld xmm11,12 + psrld xmm6,20 + pxor xmm6,xmm11 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol8] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm11,xmm6 + pslld xmm11,7 + psrld xmm6,25 + pxor xmm6,xmm11 + palignr xmm6,xmm6,4 + palignr xmm10,xmm10,8 + palignr xmm14,xmm14,12 + movdqa xmm11,XMMWORD[((160+80))+rbp] + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + movdqa XMMWORD[(160+80)+rbp],xmm9 + paddd xmm3,xmm7 + pxor xmm15,xmm3 + pshufb xmm15,XMMWORD[$L$rol16] + paddd xmm11,xmm15 + pxor xmm7,xmm11 + movdqa xmm9,xmm7 + pslld xmm9,12 + psrld xmm7,20 + pxor xmm7,xmm9 + paddd xmm3,xmm7 + pxor xmm15,xmm3 + pshufb xmm15,XMMWORD[$L$rol8] + paddd xmm11,xmm15 + pxor xmm7,xmm11 + movdqa xmm9,xmm7 + pslld xmm9,7 + psrld xmm7,25 + pxor xmm7,xmm9 + palignr xmm7,xmm7,4 + palignr xmm11,xmm11,8 + palignr xmm15,xmm15,12 + movdqa xmm9,XMMWORD[((160+80))+rbp] + mov 
rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + movdqa XMMWORD[(160+80)+rbp],xmm11 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm11,xmm4 + pslld xmm11,12 + psrld xmm4,20 + pxor xmm4,xmm11 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm11,xmm4 + pslld xmm11,7 + psrld xmm4,25 + pxor xmm4,xmm11 + palignr xmm4,xmm4,12 + palignr xmm8,xmm8,8 + palignr xmm12,xmm12,4 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm11,xmm5 + pslld xmm11,12 + psrld xmm5,20 + pxor xmm5,xmm11 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm11,xmm5 + pslld xmm11,7 + psrld xmm5,25 + pxor xmm5,xmm11 + palignr xmm5,xmm5,12 + palignr xmm9,xmm9,8 + palignr xmm13,xmm13,4 + imul r9,r12 + add r15,r10 + adc r9,rdx + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol16] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm11,xmm6 + pslld xmm11,12 + psrld xmm6,20 + pxor xmm6,xmm11 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol8] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm11,xmm6 + pslld xmm11,7 + psrld xmm6,25 + pxor xmm6,xmm11 + palignr xmm6,xmm6,12 + palignr xmm10,xmm10,8 + palignr xmm14,xmm14,4 + movdqa xmm11,XMMWORD[((160+80))+rbp] + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + movdqa XMMWORD[(160+80)+rbp],xmm9 + paddd xmm3,xmm7 + pxor xmm15,xmm3 + pshufb xmm15,XMMWORD[$L$rol16] + paddd xmm11,xmm15 + pxor xmm7,xmm11 + movdqa xmm9,xmm7 + pslld xmm9,12 + psrld xmm7,20 + pxor xmm7,xmm9 + paddd xmm3,xmm7 + pxor xmm15,xmm3 + pshufb 
xmm15,XMMWORD[$L$rol8] + paddd xmm11,xmm15 + pxor xmm7,xmm11 + movdqa xmm9,xmm7 + pslld xmm9,7 + psrld xmm7,25 + pxor xmm7,xmm9 + palignr xmm7,xmm7,12 + palignr xmm11,xmm11,8 + palignr xmm15,xmm15,4 + movdqa xmm9,XMMWORD[((160+80))+rbp] + + add r8,16 + cmp r8,10*16 + jb NEAR $L$open_sse_tail_256_rounds_and_x1hash + + mov rcx,rbx + and rcx,-16 +$L$open_sse_tail_256_hash: + add r10,QWORD[((0+0))+r8*1+rsi] + adc r11,QWORD[((8+0))+r8*1+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + add r8,16 + cmp r8,rcx + jb NEAR $L$open_sse_tail_256_hash + paddd xmm3,XMMWORD[$L$chacha20_consts] + paddd xmm7,XMMWORD[((160+48))+rbp] + paddd xmm11,XMMWORD[((160+64))+rbp] + paddd xmm15,XMMWORD[((160+144))+rbp] + paddd xmm2,XMMWORD[$L$chacha20_consts] + paddd xmm6,XMMWORD[((160+48))+rbp] + paddd xmm10,XMMWORD[((160+64))+rbp] + paddd xmm14,XMMWORD[((160+128))+rbp] + paddd xmm1,XMMWORD[$L$chacha20_consts] + paddd xmm5,XMMWORD[((160+48))+rbp] + paddd xmm9,XMMWORD[((160+64))+rbp] + paddd xmm13,XMMWORD[((160+112))+rbp] + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm4,XMMWORD[((160+48))+rbp] + paddd xmm8,XMMWORD[((160+64))+rbp] + paddd xmm12,XMMWORD[((160+96))+rbp] + movdqa XMMWORD[(160+80)+rbp],xmm12 + movdqu xmm12,XMMWORD[((0 + 0))+rsi] + pxor xmm12,xmm3 + movdqu XMMWORD[(0 + 0)+rdi],xmm12 + movdqu xmm12,XMMWORD[((16 + 0))+rsi] + pxor xmm12,xmm7 + movdqu XMMWORD[(16 + 0)+rdi],xmm12 + movdqu xmm12,XMMWORD[((32 + 0))+rsi] + pxor xmm12,xmm11 + movdqu XMMWORD[(32 
+ 0)+rdi],xmm12 + movdqu xmm12,XMMWORD[((48 + 0))+rsi] + pxor xmm12,xmm15 + movdqu XMMWORD[(48 + 0)+rdi],xmm12 + movdqu xmm3,XMMWORD[((0 + 64))+rsi] + movdqu xmm7,XMMWORD[((16 + 64))+rsi] + movdqu xmm11,XMMWORD[((32 + 64))+rsi] + movdqu xmm15,XMMWORD[((48 + 64))+rsi] + pxor xmm2,xmm3 + pxor xmm6,xmm7 + pxor xmm10,xmm11 + pxor xmm15,xmm14 + movdqu XMMWORD[(0 + 64)+rdi],xmm2 + movdqu XMMWORD[(16 + 64)+rdi],xmm6 + movdqu XMMWORD[(32 + 64)+rdi],xmm10 + movdqu XMMWORD[(48 + 64)+rdi],xmm15 + movdqu xmm3,XMMWORD[((0 + 128))+rsi] + movdqu xmm7,XMMWORD[((16 + 128))+rsi] + movdqu xmm11,XMMWORD[((32 + 128))+rsi] + movdqu xmm15,XMMWORD[((48 + 128))+rsi] + pxor xmm1,xmm3 + pxor xmm5,xmm7 + pxor xmm9,xmm11 + pxor xmm15,xmm13 + movdqu XMMWORD[(0 + 128)+rdi],xmm1 + movdqu XMMWORD[(16 + 128)+rdi],xmm5 + movdqu XMMWORD[(32 + 128)+rdi],xmm9 + movdqu XMMWORD[(48 + 128)+rdi],xmm15 + + movdqa xmm12,XMMWORD[((160+80))+rbp] + sub rbx,12*16 + lea rsi,[192+rsi] + lea rdi,[192+rdi] + + +$L$open_sse_tail_64_dec_loop: + cmp rbx,16 + jb NEAR $L$open_sse_tail_16_init + sub rbx,16 + movdqu xmm3,XMMWORD[rsi] + pxor xmm0,xmm3 + movdqu XMMWORD[rdi],xmm0 + lea rsi,[16+rsi] + lea rdi,[16+rdi] + movdqa xmm0,xmm4 + movdqa xmm4,xmm8 + movdqa xmm8,xmm12 + jmp NEAR $L$open_sse_tail_64_dec_loop +$L$open_sse_tail_16_init: + movdqa xmm1,xmm0 + + +$L$open_sse_tail_16: + test rbx,rbx + jz NEAR $L$open_sse_finalize + + + + pxor xmm3,xmm3 + lea rsi,[((-1))+rbx*1+rsi] + mov r8,rbx +$L$open_sse_tail_16_compose: + pslldq xmm3,1 + pinsrb xmm3,BYTE[rsi],0 + sub rsi,1 + sub r8,1 + jnz NEAR $L$open_sse_tail_16_compose + + movq r13,xmm3 + pextrq r14,xmm3,1 + + pxor xmm3,xmm1 + + +$L$open_sse_tail_16_extract: + pextrb XMMWORD[rdi],xmm3,0 + psrldq xmm3,1 + add rdi,1 + sub rbx,1 + jne NEAR $L$open_sse_tail_16_extract + + add r10,r13 + adc r11,r14 + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc 
r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + +$L$open_sse_finalize: + add r10,QWORD[((0+160+32))+rbp] + adc r11,QWORD[((8+160+32))+rbp] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + + mov r13,r10 + mov r14,r11 + mov r15,r12 + sub r10,-5 + sbb r11,-1 + sbb r12,3 + cmovc r10,r13 + cmovc r11,r14 + cmovc r12,r15 + + add r10,QWORD[((0+160+16))+rbp] + adc r11,QWORD[((8+160+16))+rbp] + + movaps xmm6,XMMWORD[((0+0))+rbp] + movaps xmm7,XMMWORD[((16+0))+rbp] + movaps xmm8,XMMWORD[((32+0))+rbp] + movaps xmm9,XMMWORD[((48+0))+rbp] + movaps xmm10,XMMWORD[((64+0))+rbp] + movaps xmm11,XMMWORD[((80+0))+rbp] + movaps xmm12,XMMWORD[((96+0))+rbp] + movaps xmm13,XMMWORD[((112+0))+rbp] + movaps xmm14,XMMWORD[((128+0))+rbp] + movaps xmm15,XMMWORD[((144+0))+rbp] + + + add rsp,288 + 160 + 32 + + + pop r9 + + mov QWORD[r9],r10 + mov QWORD[8+r9],r11 + pop r15 + + pop r14 + + pop r13 + + pop r12 + + pop rbx + + pop rbp + + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$open_sse_128: + + movdqu xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm1,xmm0 + movdqa xmm2,xmm0 + movdqu xmm4,XMMWORD[r9] 
+ movdqa xmm5,xmm4 + movdqa xmm6,xmm4 + movdqu xmm8,XMMWORD[16+r9] + movdqa xmm9,xmm8 + movdqa xmm10,xmm8 + movdqu xmm12,XMMWORD[32+r9] + movdqa xmm13,xmm12 + paddd xmm13,XMMWORD[$L$sse_inc] + movdqa xmm14,xmm13 + paddd xmm14,XMMWORD[$L$sse_inc] + movdqa xmm7,xmm4 + movdqa xmm11,xmm8 + movdqa xmm15,xmm13 + mov r10,10 + +$L$open_sse_128_rounds: + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 + palignr xmm4,xmm4,4 + palignr xmm8,xmm8,8 + palignr xmm12,xmm12,12 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 + palignr xmm5,xmm5,4 + palignr xmm9,xmm9,8 + palignr xmm13,xmm13,12 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol16] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,12 + psrld xmm6,20 + pxor xmm6,xmm3 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol8] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,7 + psrld xmm6,25 + pxor xmm6,xmm3 + palignr xmm6,xmm6,4 + palignr xmm10,xmm10,8 + palignr xmm14,xmm14,12 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 + palignr xmm4,xmm4,12 + palignr xmm8,xmm8,8 + palignr 
xmm12,xmm12,4 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 + palignr xmm5,xmm5,12 + palignr xmm9,xmm9,8 + palignr xmm13,xmm13,4 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol16] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,12 + psrld xmm6,20 + pxor xmm6,xmm3 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol8] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,7 + psrld xmm6,25 + pxor xmm6,xmm3 + palignr xmm6,xmm6,12 + palignr xmm10,xmm10,8 + palignr xmm14,xmm14,4 + + dec r10 + jnz NEAR $L$open_sse_128_rounds + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm1,XMMWORD[$L$chacha20_consts] + paddd xmm2,XMMWORD[$L$chacha20_consts] + paddd xmm4,xmm7 + paddd xmm5,xmm7 + paddd xmm6,xmm7 + paddd xmm9,xmm11 + paddd xmm10,xmm11 + paddd xmm13,xmm15 + paddd xmm15,XMMWORD[$L$sse_inc] + paddd xmm14,xmm15 + + pand xmm0,XMMWORD[$L$clamp] + movdqa XMMWORD[(160+0)+rbp],xmm0 + movdqa XMMWORD[(160+16)+rbp],xmm4 + + mov r8,r8 + call poly_hash_ad_internal +$L$open_sse_128_xor_hash: + cmp rbx,16 + jb NEAR $L$open_sse_tail_16 + sub rbx,16 + add r10,QWORD[((0+0))+rsi] + adc r11,QWORD[((8+0))+rsi] + adc r12,1 + + + movdqu xmm3,XMMWORD[rsi] + pxor xmm1,xmm3 + movdqu XMMWORD[rdi],xmm1 + lea rsi,[16+rsi] + lea rdi,[16+rdi] + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 
+ mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + + movdqa xmm1,xmm5 + movdqa xmm5,xmm9 + movdqa xmm9,xmm13 + movdqa xmm13,xmm2 + movdqa xmm2,xmm6 + movdqa xmm6,xmm10 + movdqa xmm10,xmm14 + jmp NEAR $L$open_sse_128_xor_hash +$L$SEH_end_chacha20_poly1305_open_sse41: + + + + + + + + +global chacha20_poly1305_seal_sse41 + +ALIGN 64 +chacha20_poly1305_seal_sse41: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_chacha20_poly1305_seal_sse41: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + + + push r9 + + sub rsp,288 + 160 + 32 + + lea rbp,[32+rsp] + and rbp,-32 + + movaps XMMWORD[(0+0)+rbp],xmm6 + movaps XMMWORD[(16+0)+rbp],xmm7 + movaps XMMWORD[(32+0)+rbp],xmm8 + movaps XMMWORD[(48+0)+rbp],xmm9 + movaps XMMWORD[(64+0)+rbp],xmm10 + movaps XMMWORD[(80+0)+rbp],xmm11 + movaps XMMWORD[(96+0)+rbp],xmm12 + movaps XMMWORD[(112+0)+rbp],xmm13 + movaps XMMWORD[(128+0)+rbp],xmm14 + movaps XMMWORD[(144+0)+rbp],xmm15 + + mov rbx,QWORD[56+r9] + add rbx,rdx + mov QWORD[((0+160+32))+rbp],r8 + mov QWORD[((8+160+32))+rbp],rbx + mov rbx,rdx + + cmp rbx,128 + jbe NEAR $L$seal_sse_128 + + movdqa xmm0,XMMWORD[$L$chacha20_consts] + movdqu xmm4,XMMWORD[r9] + movdqu xmm8,XMMWORD[16+r9] + movdqu xmm12,XMMWORD[32+r9] + + movdqa xmm1,xmm0 + movdqa xmm2,xmm0 + movdqa xmm3,xmm0 + movdqa xmm5,xmm4 + movdqa xmm6,xmm4 + movdqa xmm7,xmm4 + movdqa xmm9,xmm8 + movdqa xmm10,xmm8 + movdqa xmm11,xmm8 + movdqa xmm15,xmm12 + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa xmm14,xmm12 + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa xmm13,xmm12 + paddd xmm12,XMMWORD[$L$sse_inc] + + movdqa XMMWORD[(160+48)+rbp],xmm4 + movdqa XMMWORD[(160+64)+rbp],xmm8 + movdqa XMMWORD[(160+96)+rbp],xmm12 + movdqa XMMWORD[(160+112)+rbp],xmm13 
+ movdqa XMMWORD[(160+128)+rbp],xmm14 + movdqa XMMWORD[(160+144)+rbp],xmm15 + mov r10,10 +$L$seal_sse_init_rounds: + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,XMMWORD[$L$rol16] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + pxor xmm13,xmm1 + pxor xmm12,xmm0 + pshufb xmm15,xmm8 + pshufb xmm14,xmm8 + pshufb xmm13,xmm8 + pshufb xmm12,xmm8 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + pxor xmm6,xmm10 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,20 + pslld xmm7,32-20 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,20 + pslld xmm6,32-20 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,20 + pslld xmm5,32-20 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,20 + pslld xmm4,32-20 + pxor xmm4,xmm8 + movdqa xmm8,XMMWORD[$L$rol8] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + pxor xmm13,xmm1 + pxor xmm12,xmm0 + pshufb xmm15,xmm8 + pshufb xmm14,xmm8 + pshufb xmm13,xmm8 + pshufb xmm12,xmm8 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + pxor xmm6,xmm10 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,25 + pslld xmm7,32-25 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,25 + pslld xmm6,32-25 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,25 + pslld xmm5,32-25 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,25 + pslld xmm4,32-25 + pxor xmm4,xmm8 + movdqa xmm8,XMMWORD[((160+80))+rbp] + palignr xmm7,xmm7,4 + palignr xmm11,xmm11,8 + palignr xmm15,xmm15,12 + palignr xmm6,xmm6,4 + palignr xmm10,xmm10,8 + palignr xmm14,xmm14,12 + palignr xmm5,xmm5,4 + palignr xmm9,xmm9,8 + palignr xmm13,xmm13,12 + palignr xmm4,xmm4,4 + palignr xmm8,xmm8,8 + palignr xmm12,xmm12,12 + 
movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,XMMWORD[$L$rol16] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + pxor xmm13,xmm1 + pxor xmm12,xmm0 + pshufb xmm15,xmm8 + pshufb xmm14,xmm8 + pshufb xmm13,xmm8 + pshufb xmm12,xmm8 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + pxor xmm6,xmm10 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,20 + pslld xmm7,32-20 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,20 + pslld xmm6,32-20 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,20 + pslld xmm5,32-20 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,20 + pslld xmm4,32-20 + pxor xmm4,xmm8 + movdqa xmm8,XMMWORD[$L$rol8] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + pxor xmm13,xmm1 + pxor xmm12,xmm0 + pshufb xmm15,xmm8 + pshufb xmm14,xmm8 + pshufb xmm13,xmm8 + pshufb xmm12,xmm8 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + pxor xmm6,xmm10 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,25 + pslld xmm7,32-25 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,25 + pslld xmm6,32-25 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,25 + pslld xmm5,32-25 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,25 + pslld xmm4,32-25 + pxor xmm4,xmm8 + movdqa xmm8,XMMWORD[((160+80))+rbp] + palignr xmm7,xmm7,12 + palignr xmm11,xmm11,8 + palignr xmm15,xmm15,4 + palignr xmm6,xmm6,12 + palignr xmm10,xmm10,8 + palignr xmm14,xmm14,4 + palignr xmm5,xmm5,12 + palignr xmm9,xmm9,8 + palignr xmm13,xmm13,4 + palignr xmm4,xmm4,12 + palignr xmm8,xmm8,8 + palignr xmm12,xmm12,4 + + dec r10 + jnz NEAR $L$seal_sse_init_rounds + paddd xmm3,XMMWORD[$L$chacha20_consts] + paddd xmm7,XMMWORD[((160+48))+rbp] 
+ paddd xmm11,XMMWORD[((160+64))+rbp] + paddd xmm15,XMMWORD[((160+144))+rbp] + paddd xmm2,XMMWORD[$L$chacha20_consts] + paddd xmm6,XMMWORD[((160+48))+rbp] + paddd xmm10,XMMWORD[((160+64))+rbp] + paddd xmm14,XMMWORD[((160+128))+rbp] + paddd xmm1,XMMWORD[$L$chacha20_consts] + paddd xmm5,XMMWORD[((160+48))+rbp] + paddd xmm9,XMMWORD[((160+64))+rbp] + paddd xmm13,XMMWORD[((160+112))+rbp] + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm4,XMMWORD[((160+48))+rbp] + paddd xmm8,XMMWORD[((160+64))+rbp] + paddd xmm12,XMMWORD[((160+96))+rbp] + + + pand xmm3,XMMWORD[$L$clamp] + movdqa XMMWORD[(160+0)+rbp],xmm3 + movdqa XMMWORD[(160+16)+rbp],xmm7 + + mov r8,r8 + call poly_hash_ad_internal + movdqu xmm3,XMMWORD[((0 + 0))+rsi] + movdqu xmm7,XMMWORD[((16 + 0))+rsi] + movdqu xmm11,XMMWORD[((32 + 0))+rsi] + movdqu xmm15,XMMWORD[((48 + 0))+rsi] + pxor xmm2,xmm3 + pxor xmm6,xmm7 + pxor xmm10,xmm11 + pxor xmm15,xmm14 + movdqu XMMWORD[(0 + 0)+rdi],xmm2 + movdqu XMMWORD[(16 + 0)+rdi],xmm6 + movdqu XMMWORD[(32 + 0)+rdi],xmm10 + movdqu XMMWORD[(48 + 0)+rdi],xmm15 + movdqu xmm3,XMMWORD[((0 + 64))+rsi] + movdqu xmm7,XMMWORD[((16 + 64))+rsi] + movdqu xmm11,XMMWORD[((32 + 64))+rsi] + movdqu xmm15,XMMWORD[((48 + 64))+rsi] + pxor xmm1,xmm3 + pxor xmm5,xmm7 + pxor xmm9,xmm11 + pxor xmm15,xmm13 + movdqu XMMWORD[(0 + 64)+rdi],xmm1 + movdqu XMMWORD[(16 + 64)+rdi],xmm5 + movdqu XMMWORD[(32 + 64)+rdi],xmm9 + movdqu XMMWORD[(48 + 64)+rdi],xmm15 + + cmp rbx,12*16 + ja NEAR $L$seal_sse_main_init + mov rcx,8*16 + sub rbx,8*16 + lea rsi,[128+rsi] + jmp NEAR $L$seal_sse_128_tail_hash +$L$seal_sse_main_init: + movdqu xmm3,XMMWORD[((0 + 128))+rsi] + movdqu xmm7,XMMWORD[((16 + 128))+rsi] + movdqu xmm11,XMMWORD[((32 + 128))+rsi] + movdqu xmm15,XMMWORD[((48 + 128))+rsi] + pxor xmm0,xmm3 + pxor xmm4,xmm7 + pxor xmm8,xmm11 + pxor xmm15,xmm12 + movdqu XMMWORD[(0 + 128)+rdi],xmm0 + movdqu XMMWORD[(16 + 128)+rdi],xmm4 + movdqu XMMWORD[(32 + 128)+rdi],xmm8 + movdqu XMMWORD[(48 + 128)+rdi],xmm15 + + mov rcx,12*16 + 
sub rbx,12*16 + lea rsi,[192+rsi] + mov rcx,2 + mov r8,8 + cmp rbx,4*16 + jbe NEAR $L$seal_sse_tail_64 + cmp rbx,8*16 + jbe NEAR $L$seal_sse_tail_128 + cmp rbx,12*16 + jbe NEAR $L$seal_sse_tail_192 + +$L$seal_sse_main_loop: + movdqa xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm4,XMMWORD[((160+48))+rbp] + movdqa xmm8,XMMWORD[((160+64))+rbp] + movdqa xmm1,xmm0 + movdqa xmm5,xmm4 + movdqa xmm9,xmm8 + movdqa xmm2,xmm0 + movdqa xmm6,xmm4 + movdqa xmm10,xmm8 + movdqa xmm3,xmm0 + movdqa xmm7,xmm4 + movdqa xmm11,xmm8 + movdqa xmm15,XMMWORD[((160+96))+rbp] + paddd xmm15,XMMWORD[$L$sse_inc] + movdqa xmm14,xmm15 + paddd xmm14,XMMWORD[$L$sse_inc] + movdqa xmm13,xmm14 + paddd xmm13,XMMWORD[$L$sse_inc] + movdqa xmm12,xmm13 + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa XMMWORD[(160+96)+rbp],xmm12 + movdqa XMMWORD[(160+112)+rbp],xmm13 + movdqa XMMWORD[(160+128)+rbp],xmm14 + movdqa XMMWORD[(160+144)+rbp],xmm15 + +ALIGN 32 +$L$seal_sse_main_rounds: + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,XMMWORD[$L$rol16] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + pxor xmm13,xmm1 + pxor xmm12,xmm0 + pshufb xmm15,xmm8 + pshufb xmm14,xmm8 + pshufb xmm13,xmm8 + pshufb xmm12,xmm8 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + pxor xmm6,xmm10 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,20 + pslld xmm7,32-20 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,20 + pslld xmm6,32-20 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,20 + pslld xmm5,32-20 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,20 + pslld xmm4,32-20 + pxor xmm4,xmm8 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + movdqa 
xmm8,XMMWORD[$L$rol8] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + pxor xmm13,xmm1 + pxor xmm12,xmm0 + pshufb xmm15,xmm8 + pshufb xmm14,xmm8 + pshufb xmm13,xmm8 + pshufb xmm12,xmm8 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + pxor xmm6,xmm10 + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,25 + pslld xmm7,32-25 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,25 + pslld xmm6,32-25 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,25 + pslld xmm5,32-25 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,25 + pslld xmm4,32-25 + pxor xmm4,xmm8 + movdqa xmm8,XMMWORD[((160+80))+rbp] + imul r9,r12 + add r15,r10 + adc r9,rdx + palignr xmm7,xmm7,4 + palignr xmm11,xmm11,8 + palignr xmm15,xmm15,12 + palignr xmm6,xmm6,4 + palignr xmm10,xmm10,8 + palignr xmm14,xmm14,12 + palignr xmm5,xmm5,4 + palignr xmm9,xmm9,8 + palignr xmm13,xmm13,12 + palignr xmm4,xmm4,4 + palignr xmm8,xmm8,8 + palignr xmm12,xmm12,12 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,XMMWORD[$L$rol16] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + pxor xmm13,xmm1 + pxor xmm12,xmm0 + pshufb xmm15,xmm8 + pshufb xmm14,xmm8 + pshufb xmm13,xmm8 + pshufb xmm12,xmm8 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + pxor xmm6,xmm10 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + 
psrld xmm8,20 + pslld xmm7,32-20 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,20 + pslld xmm6,32-20 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,20 + pslld xmm5,32-20 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,20 + pslld xmm4,32-20 + pxor xmm4,xmm8 + movdqa xmm8,XMMWORD[$L$rol8] + paddd xmm3,xmm7 + paddd xmm2,xmm6 + paddd xmm1,xmm5 + paddd xmm0,xmm4 + pxor xmm15,xmm3 + pxor xmm14,xmm2 + pxor xmm13,xmm1 + pxor xmm12,xmm0 + pshufb xmm15,xmm8 + pshufb xmm14,xmm8 + pshufb xmm13,xmm8 + pshufb xmm12,xmm8 + movdqa xmm8,XMMWORD[((160+80))+rbp] + paddd xmm11,xmm15 + paddd xmm10,xmm14 + paddd xmm9,xmm13 + paddd xmm8,xmm12 + pxor xmm7,xmm11 + pxor xmm6,xmm10 + pxor xmm5,xmm9 + pxor xmm4,xmm8 + movdqa XMMWORD[(160+80)+rbp],xmm8 + movdqa xmm8,xmm7 + psrld xmm8,25 + pslld xmm7,32-25 + pxor xmm7,xmm8 + movdqa xmm8,xmm6 + psrld xmm8,25 + pslld xmm6,32-25 + pxor xmm6,xmm8 + movdqa xmm8,xmm5 + psrld xmm8,25 + pslld xmm5,32-25 + pxor xmm5,xmm8 + movdqa xmm8,xmm4 + psrld xmm8,25 + pslld xmm4,32-25 + pxor xmm4,xmm8 + movdqa xmm8,XMMWORD[((160+80))+rbp] + palignr xmm7,xmm7,12 + palignr xmm11,xmm11,8 + palignr xmm15,xmm15,4 + palignr xmm6,xmm6,12 + palignr xmm10,xmm10,8 + palignr xmm14,xmm14,4 + palignr xmm5,xmm5,12 + palignr xmm9,xmm9,8 + palignr xmm13,xmm13,4 + palignr xmm4,xmm4,12 + palignr xmm8,xmm8,8 + palignr xmm12,xmm12,4 + + lea rdi,[16+rdi] + dec r8 + jge NEAR $L$seal_sse_main_rounds + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc 
r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[16+rdi] + dec rcx + jg NEAR $L$seal_sse_main_rounds + paddd xmm3,XMMWORD[$L$chacha20_consts] + paddd xmm7,XMMWORD[((160+48))+rbp] + paddd xmm11,XMMWORD[((160+64))+rbp] + paddd xmm15,XMMWORD[((160+144))+rbp] + paddd xmm2,XMMWORD[$L$chacha20_consts] + paddd xmm6,XMMWORD[((160+48))+rbp] + paddd xmm10,XMMWORD[((160+64))+rbp] + paddd xmm14,XMMWORD[((160+128))+rbp] + paddd xmm1,XMMWORD[$L$chacha20_consts] + paddd xmm5,XMMWORD[((160+48))+rbp] + paddd xmm9,XMMWORD[((160+64))+rbp] + paddd xmm13,XMMWORD[((160+112))+rbp] + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm4,XMMWORD[((160+48))+rbp] + paddd xmm8,XMMWORD[((160+64))+rbp] + paddd xmm12,XMMWORD[((160+96))+rbp] + + movdqa XMMWORD[(160+80)+rbp],xmm14 + movdqa XMMWORD[(160+80)+rbp],xmm14 + movdqu xmm14,XMMWORD[((0 + 0))+rsi] + pxor xmm14,xmm3 + movdqu XMMWORD[(0 + 0)+rdi],xmm14 + movdqu xmm14,XMMWORD[((16 + 0))+rsi] + pxor xmm14,xmm7 + movdqu XMMWORD[(16 + 0)+rdi],xmm14 + movdqu xmm14,XMMWORD[((32 + 0))+rsi] + pxor xmm14,xmm11 + movdqu XMMWORD[(32 + 0)+rdi],xmm14 + movdqu xmm14,XMMWORD[((48 + 0))+rsi] + pxor xmm14,xmm15 + movdqu XMMWORD[(48 + 0)+rdi],xmm14 + + movdqa xmm14,XMMWORD[((160+80))+rbp] + movdqu xmm3,XMMWORD[((0 + 64))+rsi] + movdqu xmm7,XMMWORD[((16 + 64))+rsi] + movdqu xmm11,XMMWORD[((32 + 64))+rsi] + movdqu xmm15,XMMWORD[((48 + 64))+rsi] + pxor xmm2,xmm3 + pxor xmm6,xmm7 + pxor xmm10,xmm11 + pxor xmm15,xmm14 + movdqu XMMWORD[(0 + 64)+rdi],xmm2 + movdqu XMMWORD[(16 + 64)+rdi],xmm6 + movdqu XMMWORD[(32 + 64)+rdi],xmm10 + movdqu XMMWORD[(48 + 64)+rdi],xmm15 + movdqu xmm3,XMMWORD[((0 + 128))+rsi] + movdqu xmm7,XMMWORD[((16 + 128))+rsi] + movdqu xmm11,XMMWORD[((32 + 128))+rsi] + movdqu xmm15,XMMWORD[((48 + 128))+rsi] + pxor xmm1,xmm3 + pxor xmm5,xmm7 + pxor xmm9,xmm11 + pxor xmm15,xmm13 + movdqu XMMWORD[(0 + 128)+rdi],xmm1 + movdqu XMMWORD[(16 + 128)+rdi],xmm5 + movdqu XMMWORD[(32 + 128)+rdi],xmm9 + movdqu XMMWORD[(48 + 128)+rdi],xmm15 + + cmp 
rbx,16*16 + ja NEAR $L$seal_sse_main_loop_xor + + mov rcx,12*16 + sub rbx,12*16 + lea rsi,[192+rsi] + jmp NEAR $L$seal_sse_128_tail_hash +$L$seal_sse_main_loop_xor: + movdqu xmm3,XMMWORD[((0 + 192))+rsi] + movdqu xmm7,XMMWORD[((16 + 192))+rsi] + movdqu xmm11,XMMWORD[((32 + 192))+rsi] + movdqu xmm15,XMMWORD[((48 + 192))+rsi] + pxor xmm0,xmm3 + pxor xmm4,xmm7 + pxor xmm8,xmm11 + pxor xmm15,xmm12 + movdqu XMMWORD[(0 + 192)+rdi],xmm0 + movdqu XMMWORD[(16 + 192)+rdi],xmm4 + movdqu XMMWORD[(32 + 192)+rdi],xmm8 + movdqu XMMWORD[(48 + 192)+rdi],xmm15 + + lea rsi,[256+rsi] + sub rbx,16*16 + mov rcx,6 + mov r8,4 + cmp rbx,12*16 + jg NEAR $L$seal_sse_main_loop + mov rcx,rbx + test rbx,rbx + je NEAR $L$seal_sse_128_tail_hash + mov rcx,6 + cmp rbx,8*16 + ja NEAR $L$seal_sse_tail_192 + cmp rbx,4*16 + ja NEAR $L$seal_sse_tail_128 + +$L$seal_sse_tail_64: + movdqa xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm4,XMMWORD[((160+48))+rbp] + movdqa xmm8,XMMWORD[((160+64))+rbp] + movdqa xmm12,XMMWORD[((160+96))+rbp] + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa XMMWORD[(160+96)+rbp],xmm12 + +$L$seal_sse_tail_64_rounds_and_x2hash: + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[16+rdi] +$L$seal_sse_tail_64_rounds_and_x1hash: + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd 
xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 + palignr xmm4,xmm4,4 + palignr xmm8,xmm8,8 + palignr xmm12,xmm12,12 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 + palignr xmm4,xmm4,12 + palignr xmm8,xmm8,8 + palignr xmm12,xmm12,4 + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[16+rdi] + dec rcx + jg NEAR $L$seal_sse_tail_64_rounds_and_x2hash + dec r8 + jge NEAR $L$seal_sse_tail_64_rounds_and_x1hash + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm4,XMMWORD[((160+48))+rbp] + paddd xmm8,XMMWORD[((160+64))+rbp] + paddd xmm12,XMMWORD[((160+96))+rbp] + + jmp NEAR $L$seal_sse_128_tail_xor + +$L$seal_sse_tail_128: + movdqa xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm4,XMMWORD[((160+48))+rbp] + movdqa xmm8,XMMWORD[((160+64))+rbp] + movdqa xmm1,xmm0 + movdqa xmm5,xmm4 + movdqa xmm9,xmm8 + movdqa xmm13,XMMWORD[((160+96))+rbp] + paddd xmm13,XMMWORD[$L$sse_inc] + movdqa xmm12,xmm13 + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa XMMWORD[(160+96)+rbp],xmm12 + movdqa 
XMMWORD[(160+112)+rbp],xmm13 + +$L$seal_sse_tail_128_rounds_and_x2hash: + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[16+rdi] +$L$seal_sse_tail_128_rounds_and_x1hash: + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 + palignr xmm4,xmm4,4 + palignr xmm8,xmm8,8 + palignr xmm12,xmm12,12 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 + palignr xmm5,xmm5,4 + palignr xmm9,xmm9,8 + palignr xmm13,xmm13,12 + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 
+ imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 + palignr xmm4,xmm4,12 + palignr xmm8,xmm8,8 + palignr xmm12,xmm12,4 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 + palignr xmm5,xmm5,12 + palignr xmm9,xmm9,8 + palignr xmm13,xmm13,4 + + lea rdi,[16+rdi] + dec rcx + jg NEAR $L$seal_sse_tail_128_rounds_and_x2hash + dec r8 + jge NEAR $L$seal_sse_tail_128_rounds_and_x1hash + paddd xmm1,XMMWORD[$L$chacha20_consts] + paddd xmm5,XMMWORD[((160+48))+rbp] + paddd xmm9,XMMWORD[((160+64))+rbp] + paddd xmm13,XMMWORD[((160+112))+rbp] + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm4,XMMWORD[((160+48))+rbp] + paddd xmm8,XMMWORD[((160+64))+rbp] + paddd xmm12,XMMWORD[((160+96))+rbp] + movdqu xmm3,XMMWORD[((0 + 0))+rsi] + movdqu xmm7,XMMWORD[((16 + 0))+rsi] + movdqu xmm11,XMMWORD[((32 + 0))+rsi] + movdqu xmm15,XMMWORD[((48 + 0))+rsi] + pxor xmm1,xmm3 + pxor xmm5,xmm7 + pxor xmm9,xmm11 + pxor xmm15,xmm13 + movdqu XMMWORD[(0 + 0)+rdi],xmm1 + movdqu XMMWORD[(16 + 0)+rdi],xmm5 + movdqu XMMWORD[(32 + 0)+rdi],xmm9 + movdqu XMMWORD[(48 + 0)+rdi],xmm15 + + mov rcx,4*16 + sub rbx,4*16 + lea rsi,[64+rsi] + jmp NEAR $L$seal_sse_128_tail_hash + +$L$seal_sse_tail_192: + movdqa 
xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm4,XMMWORD[((160+48))+rbp] + movdqa xmm8,XMMWORD[((160+64))+rbp] + movdqa xmm1,xmm0 + movdqa xmm5,xmm4 + movdqa xmm9,xmm8 + movdqa xmm2,xmm0 + movdqa xmm6,xmm4 + movdqa xmm10,xmm8 + movdqa xmm14,XMMWORD[((160+96))+rbp] + paddd xmm14,XMMWORD[$L$sse_inc] + movdqa xmm13,xmm14 + paddd xmm13,XMMWORD[$L$sse_inc] + movdqa xmm12,xmm13 + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa XMMWORD[(160+96)+rbp],xmm12 + movdqa XMMWORD[(160+112)+rbp],xmm13 + movdqa XMMWORD[(160+128)+rbp],xmm14 + +$L$seal_sse_tail_192_rounds_and_x2hash: + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[16+rdi] +$L$seal_sse_tail_192_rounds_and_x1hash: + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 + palignr xmm4,xmm4,4 + palignr xmm8,xmm8,8 + palignr xmm12,xmm12,12 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 
+ pxor xmm5,xmm3 + palignr xmm5,xmm5,4 + palignr xmm9,xmm9,8 + palignr xmm13,xmm13,12 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol16] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,12 + psrld xmm6,20 + pxor xmm6,xmm3 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol8] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,7 + psrld xmm6,25 + pxor xmm6,xmm3 + palignr xmm6,xmm6,4 + palignr xmm10,xmm10,8 + palignr xmm14,xmm14,12 + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 + palignr xmm4,xmm4,12 + palignr xmm8,xmm8,8 + palignr xmm12,xmm12,4 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 + palignr xmm5,xmm5,12 + palignr xmm9,xmm9,8 + palignr xmm13,xmm13,4 + paddd xmm2,xmm6 + pxor 
xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol16] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,12 + psrld xmm6,20 + pxor xmm6,xmm3 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol8] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,7 + psrld xmm6,25 + pxor xmm6,xmm3 + palignr xmm6,xmm6,12 + palignr xmm10,xmm10,8 + palignr xmm14,xmm14,4 + + lea rdi,[16+rdi] + dec rcx + jg NEAR $L$seal_sse_tail_192_rounds_and_x2hash + dec r8 + jge NEAR $L$seal_sse_tail_192_rounds_and_x1hash + paddd xmm2,XMMWORD[$L$chacha20_consts] + paddd xmm6,XMMWORD[((160+48))+rbp] + paddd xmm10,XMMWORD[((160+64))+rbp] + paddd xmm14,XMMWORD[((160+128))+rbp] + paddd xmm1,XMMWORD[$L$chacha20_consts] + paddd xmm5,XMMWORD[((160+48))+rbp] + paddd xmm9,XMMWORD[((160+64))+rbp] + paddd xmm13,XMMWORD[((160+112))+rbp] + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm4,XMMWORD[((160+48))+rbp] + paddd xmm8,XMMWORD[((160+64))+rbp] + paddd xmm12,XMMWORD[((160+96))+rbp] + movdqu xmm3,XMMWORD[((0 + 0))+rsi] + movdqu xmm7,XMMWORD[((16 + 0))+rsi] + movdqu xmm11,XMMWORD[((32 + 0))+rsi] + movdqu xmm15,XMMWORD[((48 + 0))+rsi] + pxor xmm2,xmm3 + pxor xmm6,xmm7 + pxor xmm10,xmm11 + pxor xmm15,xmm14 + movdqu XMMWORD[(0 + 0)+rdi],xmm2 + movdqu XMMWORD[(16 + 0)+rdi],xmm6 + movdqu XMMWORD[(32 + 0)+rdi],xmm10 + movdqu XMMWORD[(48 + 0)+rdi],xmm15 + movdqu xmm3,XMMWORD[((0 + 64))+rsi] + movdqu xmm7,XMMWORD[((16 + 64))+rsi] + movdqu xmm11,XMMWORD[((32 + 64))+rsi] + movdqu xmm15,XMMWORD[((48 + 64))+rsi] + pxor xmm1,xmm3 + pxor xmm5,xmm7 + pxor xmm9,xmm11 + pxor xmm15,xmm13 + movdqu XMMWORD[(0 + 64)+rdi],xmm1 + movdqu XMMWORD[(16 + 64)+rdi],xmm5 + movdqu XMMWORD[(32 + 64)+rdi],xmm9 + movdqu XMMWORD[(48 + 64)+rdi],xmm15 + + mov rcx,8*16 + sub rbx,8*16 + lea rsi,[128+rsi] + +$L$seal_sse_128_tail_hash: + cmp rcx,16 + jb NEAR $L$seal_sse_128_tail_xor + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + 
mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + sub rcx,16 + lea rdi,[16+rdi] + jmp NEAR $L$seal_sse_128_tail_hash + +$L$seal_sse_128_tail_xor: + cmp rbx,16 + jb NEAR $L$seal_sse_tail_16 + sub rbx,16 + + movdqu xmm3,XMMWORD[rsi] + pxor xmm0,xmm3 + movdqu XMMWORD[rdi],xmm0 + + add r10,QWORD[rdi] + adc r11,QWORD[8+rdi] + adc r12,1 + lea rsi,[16+rsi] + lea rdi,[16+rdi] + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + + movdqa xmm0,xmm4 + movdqa xmm4,xmm8 + movdqa xmm8,xmm12 + movdqa xmm12,xmm1 + movdqa xmm1,xmm5 + movdqa xmm5,xmm9 + movdqa xmm9,xmm13 + jmp NEAR $L$seal_sse_128_tail_xor + +$L$seal_sse_tail_16: + test rbx,rbx + jz NEAR $L$process_blocks_of_extra_in + + mov r8,rbx + mov rcx,rbx + lea rsi,[((-1))+rbx*1+rsi] + pxor xmm15,xmm15 +$L$seal_sse_tail_16_compose: + pslldq xmm15,1 + pinsrb xmm15,BYTE[rsi],0 + lea rsi,[((-1))+rsi] + dec rcx + jne NEAR $L$seal_sse_tail_16_compose + + + pxor xmm15,xmm0 + + + mov rcx,rbx + movdqu xmm0,xmm15 +$L$seal_sse_tail_16_extract: + pextrb XMMWORD[rdi],xmm0,0 + psrldq 
xmm0,1 + add rdi,1 + sub rcx,1 + jnz NEAR $L$seal_sse_tail_16_extract + + + + + + + + + mov r9,QWORD[((288 + 160 + 32))+rsp] + mov r14,QWORD[56+r9] + mov r13,QWORD[48+r9] + test r14,r14 + jz NEAR $L$process_partial_block + + mov r15,16 + sub r15,rbx + cmp r14,r15 + + jge NEAR $L$load_extra_in + mov r15,r14 + +$L$load_extra_in: + + + lea rsi,[((-1))+r15*1+r13] + + + add r13,r15 + sub r14,r15 + mov QWORD[48+r9],r13 + mov QWORD[56+r9],r14 + + + + add r8,r15 + + + pxor xmm11,xmm11 +$L$load_extra_load_loop: + pslldq xmm11,1 + pinsrb xmm11,BYTE[rsi],0 + lea rsi,[((-1))+rsi] + sub r15,1 + jnz NEAR $L$load_extra_load_loop + + + + + mov r15,rbx + +$L$load_extra_shift_loop: + pslldq xmm11,1 + sub r15,1 + jnz NEAR $L$load_extra_shift_loop + + + + + lea r15,[$L$and_masks] + shl rbx,4 + pand xmm15,XMMWORD[((-16))+rbx*1+r15] + + + por xmm15,xmm11 + + + + movq r13,xmm15 + pextrq r14,xmm15,1 + add r10,r13 + adc r11,r14 + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + +$L$process_blocks_of_extra_in: + + mov r9,QWORD[((288+32+160 ))+rsp] + mov rsi,QWORD[48+r9] + mov r8,QWORD[56+r9] + mov rcx,r8 + shr r8,4 + +$L$process_extra_hash_loop: + jz NEAR process_extra_in_trailer + add r10,QWORD[((0+0))+rsi] + adc r11,QWORD[((8+0))+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 
+ add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rsi,[16+rsi] + sub r8,1 + jmp NEAR $L$process_extra_hash_loop +process_extra_in_trailer: + and rcx,15 + mov rbx,rcx + jz NEAR $L$do_length_block + lea rsi,[((-1))+rcx*1+rsi] + +$L$process_extra_in_trailer_load: + pslldq xmm15,1 + pinsrb xmm15,BYTE[rsi],0 + lea rsi,[((-1))+rsi] + sub rcx,1 + jnz NEAR $L$process_extra_in_trailer_load + +$L$process_partial_block: + + lea r15,[$L$and_masks] + shl rbx,4 + pand xmm15,XMMWORD[((-16))+rbx*1+r15] + movq r13,xmm15 + pextrq r14,xmm15,1 + add r10,r13 + adc r11,r14 + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + +$L$do_length_block: + add r10,QWORD[((0+160+32))+rbp] + adc r11,QWORD[((8+160+32))+rbp] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov 
r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + + mov r13,r10 + mov r14,r11 + mov r15,r12 + sub r10,-5 + sbb r11,-1 + sbb r12,3 + cmovc r10,r13 + cmovc r11,r14 + cmovc r12,r15 + + add r10,QWORD[((0+160+16))+rbp] + adc r11,QWORD[((8+160+16))+rbp] + + movaps xmm6,XMMWORD[((0+0))+rbp] + movaps xmm7,XMMWORD[((16+0))+rbp] + movaps xmm8,XMMWORD[((32+0))+rbp] + movaps xmm9,XMMWORD[((48+0))+rbp] + movaps xmm10,XMMWORD[((64+0))+rbp] + movaps xmm11,XMMWORD[((80+0))+rbp] + movaps xmm12,XMMWORD[((96+0))+rbp] + movaps xmm13,XMMWORD[((112+0))+rbp] + movaps xmm14,XMMWORD[((128+0))+rbp] + movaps xmm15,XMMWORD[((144+0))+rbp] + + + add rsp,288 + 160 + 32 + + + pop r9 + + mov QWORD[r9],r10 + mov QWORD[8+r9],r11 + pop r15 + + pop r14 + + pop r13 + + pop r12 + + pop rbx + + pop rbp + + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$seal_sse_128: + + movdqu xmm0,XMMWORD[$L$chacha20_consts] + movdqa xmm1,xmm0 + movdqa xmm2,xmm0 + movdqu xmm4,XMMWORD[r9] + movdqa xmm5,xmm4 + movdqa xmm6,xmm4 + movdqu xmm8,XMMWORD[16+r9] + movdqa xmm9,xmm8 + movdqa xmm10,xmm8 + movdqu xmm14,XMMWORD[32+r9] + movdqa xmm12,xmm14 + paddd xmm12,XMMWORD[$L$sse_inc] + movdqa xmm13,xmm12 + paddd xmm13,XMMWORD[$L$sse_inc] + movdqa xmm7,xmm4 + movdqa xmm11,xmm8 + movdqa xmm15,xmm12 + mov r10,10 + +$L$seal_sse_128_rounds: + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 + palignr xmm4,xmm4,4 + palignr xmm8,xmm8,8 + palignr xmm12,xmm12,12 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + 
pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 + palignr xmm5,xmm5,4 + palignr xmm9,xmm9,8 + palignr xmm13,xmm13,12 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol16] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,12 + psrld xmm6,20 + pxor xmm6,xmm3 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol8] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,7 + psrld xmm6,25 + pxor xmm6,xmm3 + palignr xmm6,xmm6,4 + palignr xmm10,xmm10,8 + palignr xmm14,xmm14,12 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol16] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,12 + psrld xmm4,20 + pxor xmm4,xmm3 + paddd xmm0,xmm4 + pxor xmm12,xmm0 + pshufb xmm12,XMMWORD[$L$rol8] + paddd xmm8,xmm12 + pxor xmm4,xmm8 + movdqa xmm3,xmm4 + pslld xmm3,7 + psrld xmm4,25 + pxor xmm4,xmm3 + palignr xmm4,xmm4,12 + palignr xmm8,xmm8,8 + palignr xmm12,xmm12,4 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol16] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,12 + psrld xmm5,20 + pxor xmm5,xmm3 + paddd xmm1,xmm5 + pxor xmm13,xmm1 + pshufb xmm13,XMMWORD[$L$rol8] + paddd xmm9,xmm13 + pxor xmm5,xmm9 + movdqa xmm3,xmm5 + pslld xmm3,7 + psrld xmm5,25 + pxor xmm5,xmm3 + palignr xmm5,xmm5,12 + palignr xmm9,xmm9,8 + palignr xmm13,xmm13,4 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol16] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,12 + psrld xmm6,20 + pxor xmm6,xmm3 + paddd xmm2,xmm6 + pxor xmm14,xmm2 + pshufb xmm14,XMMWORD[$L$rol8] + paddd xmm10,xmm14 + pxor xmm6,xmm10 + movdqa xmm3,xmm6 + pslld xmm3,7 + psrld xmm6,25 + pxor xmm6,xmm3 + palignr xmm6,xmm6,12 + palignr xmm10,xmm10,8 + palignr xmm14,xmm14,4 + + dec r10 + jnz NEAR $L$seal_sse_128_rounds + paddd xmm0,XMMWORD[$L$chacha20_consts] + paddd xmm1,XMMWORD[$L$chacha20_consts] + 
paddd xmm2,XMMWORD[$L$chacha20_consts] + paddd xmm4,xmm7 + paddd xmm5,xmm7 + paddd xmm6,xmm7 + paddd xmm8,xmm11 + paddd xmm9,xmm11 + paddd xmm12,xmm15 + paddd xmm15,XMMWORD[$L$sse_inc] + paddd xmm13,xmm15 + + pand xmm2,XMMWORD[$L$clamp] + movdqa XMMWORD[(160+0)+rbp],xmm2 + movdqa XMMWORD[(160+16)+rbp],xmm6 + + mov r8,r8 + call poly_hash_ad_internal + jmp NEAR $L$seal_sse_128_tail_xor +$L$SEH_end_chacha20_poly1305_seal_sse41: + + + +global chacha20_poly1305_open_avx2 + +ALIGN 64 +chacha20_poly1305_open_avx2: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_chacha20_poly1305_open_avx2: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + + + push r9 + + sub rsp,288 + 160 + 32 + + + lea rbp,[32+rsp] + and rbp,-32 + + movaps XMMWORD[(0+0)+rbp],xmm6 + movaps XMMWORD[(16+0)+rbp],xmm7 + movaps XMMWORD[(32+0)+rbp],xmm8 + movaps XMMWORD[(48+0)+rbp],xmm9 + movaps XMMWORD[(64+0)+rbp],xmm10 + movaps XMMWORD[(80+0)+rbp],xmm11 + movaps XMMWORD[(96+0)+rbp],xmm12 + movaps XMMWORD[(112+0)+rbp],xmm13 + movaps XMMWORD[(128+0)+rbp],xmm14 + movaps XMMWORD[(144+0)+rbp],xmm15 + + mov rbx,rdx + mov QWORD[((0+160+32))+rbp],r8 + mov QWORD[((8+160+32))+rbp],rbx + + vzeroupper + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vbroadcasti128 ymm4,XMMWORD[r9] + vbroadcasti128 ymm8,XMMWORD[16+r9] + vbroadcasti128 ymm12,XMMWORD[32+r9] + vpaddd ymm12,ymm12,YMMWORD[$L$avx2_init] + cmp rbx,6*32 + jbe NEAR $L$open_avx2_192 + cmp rbx,10*32 + jbe NEAR $L$open_avx2_320 + + vmovdqa YMMWORD[(160+64)+rbp],ymm4 + vmovdqa YMMWORD[(160+96)+rbp],ymm8 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + mov r10,10 +$L$open_avx2_init_rounds: + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + 
vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,12 + + dec r10 + jne NEAR $L$open_avx2_init_rounds + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + + vperm2i128 ymm3,ymm4,ymm0,0x02 + + vpand ymm3,ymm3,YMMWORD[$L$clamp] + vmovdqa YMMWORD[(160+0)+rbp],ymm3 + + vperm2i128 ymm0,ymm4,ymm0,0x13 + vperm2i128 ymm4,ymm12,ymm8,0x13 + + mov r8,r8 + call poly_hash_ad_internal + + xor rcx,rcx +$L$open_avx2_init_hash: + add r10,QWORD[((0+0))+rcx*1+rsi] + adc r11,QWORD[((8+0))+rcx*1+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + add rcx,16 + cmp rcx,2*32 + jne NEAR $L$open_avx2_init_hash + + vpxor 
ymm0,ymm0,YMMWORD[rsi] + vpxor ymm4,ymm4,YMMWORD[32+rsi] + + vmovdqu YMMWORD[rdi],ymm0 + vmovdqu YMMWORD[32+rdi],ymm4 + lea rsi,[64+rsi] + lea rdi,[64+rdi] + sub rbx,2*32 +$L$open_avx2_main_loop: + + cmp rbx,16*32 + jb NEAR $L$open_avx2_main_loop_done + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm1,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm2,ymm0 + vmovdqa ymm6,ymm4 + vmovdqa ymm10,ymm8 + vmovdqa ymm3,ymm0 + vmovdqa ymm7,ymm4 + vmovdqa ymm11,ymm8 + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm15,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm14,ymm12,ymm15 + vpaddd ymm13,ymm12,ymm14 + vpaddd ymm12,ymm12,ymm13 + vmovdqa YMMWORD[(160+256)+rbp],ymm15 + vmovdqa YMMWORD[(160+224)+rbp],ymm14 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + + xor rcx,rcx +$L$open_avx2_main_loop_rounds: + add r10,QWORD[((0+0))+rcx*1+rsi] + adc r11,QWORD[((8+0))+rcx*1+rsi] + adc r12,1 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld 
ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + add r15,rax + adc r9,rdx + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + add r10,QWORD[((0+16))+rcx*1+rsi] + adc r11,QWORD[((8+16))+rcx*1+rsi] + adc r12,1 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,4 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,12 + vpalignr ymm6,ymm6,ymm6,4 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,12 + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + vpalignr ymm5,ymm5,ymm5,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm4,ymm4,ymm4,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm12,ymm12,ymm12,12 
+ vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + add r15,rax + adc r9,rdx + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + add r10,QWORD[((0+32))+rcx*1+rsi] + adc r11,QWORD[((8+32))+rcx*1+rsi] + adc r12,1 + + lea rcx,[48+rcx] + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + 
imul rdx,r12 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + add r15,rax + adc r9,rdx + vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,12 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,4 + vpalignr ymm6,ymm6,ymm6,12 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,4 + vpalignr ymm5,ymm5,ymm5,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm4,ymm4,ymm4,12 + vpalignr ymm8,ymm8,ymm8,8 + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpalignr ymm12,ymm12,ymm12,4 + + cmp rcx,10*6*8 + jne NEAR $L$open_avx2_main_loop_rounds + vpaddd ymm3,ymm3,YMMWORD[$L$chacha20_consts] + vpaddd ymm7,ymm7,YMMWORD[((160+64))+rbp] + vpaddd ymm11,ymm11,YMMWORD[((160+96))+rbp] + vpaddd ymm15,ymm15,YMMWORD[((160+256))+rbp] + vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] + vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp] + vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp] + vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] + vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + + vmovdqa YMMWORD[(160+128)+rbp],ymm0 + add r10,QWORD[((0+480))+rsi] + adc r11,QWORD[((8+480))+rsi] + adc r12,1 + vperm2i128 ymm0,ymm7,ymm3,0x02 + vperm2i128 ymm7,ymm7,ymm3,0x13 + vperm2i128 ymm3,ymm15,ymm11,0x02 + vperm2i128 ymm11,ymm15,ymm11,0x13 + 
vpxor ymm0,ymm0,YMMWORD[((0+0))+rsi] + vpxor ymm3,ymm3,YMMWORD[((32+0))+rsi] + vpxor ymm7,ymm7,YMMWORD[((64+0))+rsi] + vpxor ymm11,ymm11,YMMWORD[((96+0))+rsi] + vmovdqu YMMWORD[(0+0)+rdi],ymm0 + vmovdqu YMMWORD[(32+0)+rdi],ymm3 + vmovdqu YMMWORD[(64+0)+rdi],ymm7 + vmovdqu YMMWORD[(96+0)+rdi],ymm11 + + vmovdqa ymm0,YMMWORD[((160+128))+rbp] + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vperm2i128 ymm3,ymm6,ymm2,0x02 + vperm2i128 ymm6,ymm6,ymm2,0x13 + vperm2i128 ymm2,ymm14,ymm10,0x02 + vperm2i128 ymm10,ymm14,ymm10,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+128))+rsi] + vpxor ymm2,ymm2,YMMWORD[((32+128))+rsi] + vpxor ymm6,ymm6,YMMWORD[((64+128))+rsi] + vpxor ymm10,ymm10,YMMWORD[((96+128))+rsi] + vmovdqu YMMWORD[(0+128)+rdi],ymm3 + vmovdqu YMMWORD[(32+128)+rdi],ymm2 + vmovdqu YMMWORD[(64+128)+rdi],ymm6 + vmovdqu YMMWORD[(96+128)+rdi],ymm10 + add r10,QWORD[((0+480+16))+rsi] + adc r11,QWORD[((8+480+16))+rsi] + adc r12,1 + vperm2i128 ymm3,ymm5,ymm1,0x02 + vperm2i128 ymm5,ymm5,ymm1,0x13 + vperm2i128 ymm1,ymm13,ymm9,0x02 + vperm2i128 ymm9,ymm13,ymm9,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+256))+rsi] + vpxor ymm1,ymm1,YMMWORD[((32+256))+rsi] + vpxor ymm5,ymm5,YMMWORD[((64+256))+rsi] + vpxor ymm9,ymm9,YMMWORD[((96+256))+rsi] + vmovdqu YMMWORD[(0+256)+rdi],ymm3 + vmovdqu YMMWORD[(32+256)+rdi],ymm1 + vmovdqu YMMWORD[(64+256)+rdi],ymm5 + vmovdqu YMMWORD[(96+256)+rdi],ymm9 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov 
rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vperm2i128 ymm3,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm12,ymm8,0x02 + vperm2i128 ymm8,ymm12,ymm8,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+384))+rsi] + vpxor ymm0,ymm0,YMMWORD[((32+384))+rsi] + vpxor ymm4,ymm4,YMMWORD[((64+384))+rsi] + vpxor ymm8,ymm8,YMMWORD[((96+384))+rsi] + vmovdqu YMMWORD[(0+384)+rdi],ymm3 + vmovdqu YMMWORD[(32+384)+rdi],ymm0 + vmovdqu YMMWORD[(64+384)+rdi],ymm4 + vmovdqu YMMWORD[(96+384)+rdi],ymm8 + + lea rsi,[512+rsi] + lea rdi,[512+rdi] + sub rbx,16*32 + jmp NEAR $L$open_avx2_main_loop +$L$open_avx2_main_loop_done: + test rbx,rbx + vzeroupper + je NEAR $L$open_sse_finalize + + cmp rbx,12*32 + ja NEAR $L$open_avx2_tail_512 + cmp rbx,8*32 + ja NEAR $L$open_avx2_tail_384 + cmp rbx,4*32 + ja NEAR $L$open_avx2_tail_256 + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + + xor r8,r8 + mov rcx,rbx + and rcx,-16 + test rcx,rcx + je NEAR $L$open_avx2_tail_128_rounds +$L$open_avx2_tail_128_rounds_and_x1hash: + add r10,QWORD[((0+0))+r8*1+rsi] + adc r11,QWORD[((8+0))+r8*1+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov 
rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + +$L$open_avx2_tail_128_rounds: + add r8,16 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,12 + + cmp r8,rcx + jb NEAR $L$open_avx2_tail_128_rounds_and_x1hash + cmp r8,160 + jne NEAR $L$open_avx2_tail_128_rounds + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + vperm2i128 ymm3,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm12,ymm8,0x02 + vperm2i128 ymm12,ymm12,ymm8,0x13 + vmovdqa ymm8,ymm3 + + jmp NEAR $L$open_avx2_tail_128_xor + +$L$open_avx2_tail_256: + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm1,ymm0 + vmovdqa 
ymm5,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm13,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm12,ymm12,ymm13 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + + mov QWORD[((160+128))+rbp],rbx + mov rcx,rbx + sub rcx,4*32 + shr rcx,4 + mov r8,10 + cmp rcx,10 + cmovg rcx,r8 + mov rbx,rsi + xor r8,r8 +$L$open_avx2_tail_256_rounds_and_x1hash: + add r10,QWORD[((0+0))+rbx] + adc r11,QWORD[((8+0))+rbx] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rbx,[16+rbx] +$L$open_avx2_tail_256_rounds: + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,4 + + inc r8 + vpaddd 
ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,12 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,12 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol16] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpsrld ymm3,ymm6,20 + vpslld ymm6,ymm6,12 + vpxor ymm6,ymm6,ymm3 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol8] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpslld ymm3,ymm6,7 + vpsrld ymm6,ymm6,25 + vpxor ymm6,ymm6,ymm3 + vpalignr ymm14,ymm14,ymm14,4 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm6,ymm6,ymm6,12 + + cmp r8,rcx + jb NEAR $L$open_avx2_tail_256_rounds_and_x1hash + cmp r8,10 + jne NEAR $L$open_avx2_tail_256_rounds + mov r8,rbx + sub rbx,rsi + mov rcx,rbx + mov rbx,QWORD[((160+128))+rbp] +$L$open_avx2_tail_256_hash: + add rcx,16 + cmp rcx,rbx + jg NEAR $L$open_avx2_tail_256_done + add r10,QWORD[((0+0))+r8] + adc r11,QWORD[((8+0))+r8] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + 
mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea r8,[16+r8] + jmp NEAR $L$open_avx2_tail_256_hash +$L$open_avx2_tail_256_done: + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] + vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + vperm2i128 ymm3,ymm5,ymm1,0x02 + vperm2i128 ymm5,ymm5,ymm1,0x13 + vperm2i128 ymm1,ymm13,ymm9,0x02 + vperm2i128 ymm9,ymm13,ymm9,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+0))+rsi] + vpxor ymm1,ymm1,YMMWORD[((32+0))+rsi] + vpxor ymm5,ymm5,YMMWORD[((64+0))+rsi] + vpxor ymm9,ymm9,YMMWORD[((96+0))+rsi] + vmovdqu YMMWORD[(0+0)+rdi],ymm3 + vmovdqu YMMWORD[(32+0)+rdi],ymm1 + vmovdqu YMMWORD[(64+0)+rdi],ymm5 + vmovdqu YMMWORD[(96+0)+rdi],ymm9 + vperm2i128 ymm3,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm12,ymm8,0x02 + vperm2i128 ymm12,ymm12,ymm8,0x13 + vmovdqa ymm8,ymm3 + + lea rsi,[128+rsi] + lea rdi,[128+rdi] + sub rbx,4*32 + jmp NEAR $L$open_avx2_tail_128_xor + +$L$open_avx2_tail_384: + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm1,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm2,ymm0 + vmovdqa ymm6,ymm4 + vmovdqa ymm10,ymm8 + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm14,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm13,ymm12,ymm14 + vpaddd ymm12,ymm12,ymm13 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + vmovdqa YMMWORD[(160+224)+rbp],ymm14 + + mov QWORD[((160+128))+rbp],rbx + mov rcx,rbx 
+ sub rcx,8*32 + shr rcx,4 + add rcx,6 + mov r8,10 + cmp rcx,10 + cmovg rcx,r8 + mov rbx,rsi + xor r8,r8 +$L$open_avx2_tail_384_rounds_and_x2hash: + add r10,QWORD[((0+0))+rbx] + adc r11,QWORD[((8+0))+rbx] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rbx,[16+rbx] +$L$open_avx2_tail_384_rounds_and_x1hash: + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol16] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpsrld ymm3,ymm6,20 + vpslld ymm6,ymm6,12 + vpxor ymm6,ymm6,ymm3 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol8] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpslld ymm3,ymm6,7 + vpsrld ymm6,ymm6,25 + vpxor ymm6,ymm6,ymm3 + vpalignr ymm14,ymm14,ymm14,12 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm6,ymm6,ymm6,4 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,4 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + 
vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + add r10,QWORD[((0+0))+rbx] + adc r11,QWORD[((8+0))+rbx] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rbx,[16+rbx] + inc r8 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol16] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpsrld ymm3,ymm6,20 + vpslld ymm6,ymm6,12 + vpxor ymm6,ymm6,ymm3 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol8] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpslld ymm3,ymm6,7 + vpsrld ymm6,ymm6,25 + vpxor ymm6,ymm6,ymm3 + vpalignr ymm14,ymm14,ymm14,4 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm6,ymm6,ymm6,12 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,12 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd 
ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,12 + + cmp r8,rcx + jb NEAR $L$open_avx2_tail_384_rounds_and_x2hash + cmp r8,10 + jne NEAR $L$open_avx2_tail_384_rounds_and_x1hash + mov r8,rbx + sub rbx,rsi + mov rcx,rbx + mov rbx,QWORD[((160+128))+rbp] +$L$open_avx2_384_tail_hash: + add rcx,16 + cmp rcx,rbx + jg NEAR $L$open_avx2_384_tail_done + add r10,QWORD[((0+0))+r8] + adc r11,QWORD[((8+0))+r8] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea r8,[16+r8] + jmp NEAR $L$open_avx2_384_tail_hash +$L$open_avx2_384_tail_done: + vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] + vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp] + vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp] + vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] + vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + vperm2i128 ymm3,ymm6,ymm2,0x02 + vperm2i128 ymm6,ymm6,ymm2,0x13 + vperm2i128 ymm2,ymm14,ymm10,0x02 + vperm2i128 ymm10,ymm14,ymm10,0x13 + vpxor 
ymm3,ymm3,YMMWORD[((0+0))+rsi] + vpxor ymm2,ymm2,YMMWORD[((32+0))+rsi] + vpxor ymm6,ymm6,YMMWORD[((64+0))+rsi] + vpxor ymm10,ymm10,YMMWORD[((96+0))+rsi] + vmovdqu YMMWORD[(0+0)+rdi],ymm3 + vmovdqu YMMWORD[(32+0)+rdi],ymm2 + vmovdqu YMMWORD[(64+0)+rdi],ymm6 + vmovdqu YMMWORD[(96+0)+rdi],ymm10 + vperm2i128 ymm3,ymm5,ymm1,0x02 + vperm2i128 ymm5,ymm5,ymm1,0x13 + vperm2i128 ymm1,ymm13,ymm9,0x02 + vperm2i128 ymm9,ymm13,ymm9,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+128))+rsi] + vpxor ymm1,ymm1,YMMWORD[((32+128))+rsi] + vpxor ymm5,ymm5,YMMWORD[((64+128))+rsi] + vpxor ymm9,ymm9,YMMWORD[((96+128))+rsi] + vmovdqu YMMWORD[(0+128)+rdi],ymm3 + vmovdqu YMMWORD[(32+128)+rdi],ymm1 + vmovdqu YMMWORD[(64+128)+rdi],ymm5 + vmovdqu YMMWORD[(96+128)+rdi],ymm9 + vperm2i128 ymm3,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm12,ymm8,0x02 + vperm2i128 ymm12,ymm12,ymm8,0x13 + vmovdqa ymm8,ymm3 + + lea rsi,[256+rsi] + lea rdi,[256+rdi] + sub rbx,8*32 + jmp NEAR $L$open_avx2_tail_128_xor + +$L$open_avx2_tail_512: + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm1,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm2,ymm0 + vmovdqa ymm6,ymm4 + vmovdqa ymm10,ymm8 + vmovdqa ymm3,ymm0 + vmovdqa ymm7,ymm4 + vmovdqa ymm11,ymm8 + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm15,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm14,ymm12,ymm15 + vpaddd ymm13,ymm12,ymm14 + vpaddd ymm12,ymm12,ymm13 + vmovdqa YMMWORD[(160+256)+rbp],ymm15 + vmovdqa YMMWORD[(160+224)+rbp],ymm14 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + + xor rcx,rcx + mov r8,rsi +$L$open_avx2_tail_512_rounds_and_x2hash: + add r10,QWORD[((0+0))+r8] + adc r11,QWORD[((8+0))+r8] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + 
mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea r8,[16+r8] +$L$open_avx2_tail_512_rounds_and_x1hash: + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + add r10,QWORD[((0+0))+r8] + adc r11,QWORD[((8+0))+r8] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor 
ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,4 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,12 + vpalignr ymm6,ymm6,ymm6,4 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,12 + vpalignr ymm5,ymm5,ymm5,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm4,ymm4,ymm4,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm12,ymm12,ymm12,12 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + add r10,QWORD[((0+16))+r8] + adc r11,QWORD[((8+16))+r8] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea r8,[32+r8] + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb 
ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,12 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,4 + vpalignr ymm6,ymm6,ymm6,12 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,4 + vpalignr ymm5,ymm5,ymm5,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm4,ymm4,ymm4,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm12,ymm12,ymm12,4 + + inc rcx + cmp rcx,4 + jl NEAR $L$open_avx2_tail_512_rounds_and_x2hash + cmp rcx,10 + jne NEAR 
$L$open_avx2_tail_512_rounds_and_x1hash + mov rcx,rbx + sub rcx,12*32 + and rcx,-16 +$L$open_avx2_tail_512_hash: + test rcx,rcx + je NEAR $L$open_avx2_tail_512_done + add r10,QWORD[((0+0))+r8] + adc r11,QWORD[((8+0))+r8] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea r8,[16+r8] + sub rcx,2*8 + jmp NEAR $L$open_avx2_tail_512_hash +$L$open_avx2_tail_512_done: + vpaddd ymm3,ymm3,YMMWORD[$L$chacha20_consts] + vpaddd ymm7,ymm7,YMMWORD[((160+64))+rbp] + vpaddd ymm11,ymm11,YMMWORD[((160+96))+rbp] + vpaddd ymm15,ymm15,YMMWORD[((160+256))+rbp] + vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] + vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp] + vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp] + vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] + vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + + vmovdqa YMMWORD[(160+128)+rbp],ymm0 + vperm2i128 ymm0,ymm7,ymm3,0x02 + vperm2i128 ymm7,ymm7,ymm3,0x13 + vperm2i128 ymm3,ymm15,ymm11,0x02 + vperm2i128 ymm11,ymm15,ymm11,0x13 + vpxor ymm0,ymm0,YMMWORD[((0+0))+rsi] + vpxor ymm3,ymm3,YMMWORD[((32+0))+rsi] + vpxor ymm7,ymm7,YMMWORD[((64+0))+rsi] + vpxor ymm11,ymm11,YMMWORD[((96+0))+rsi] + vmovdqu YMMWORD[(0+0)+rdi],ymm0 + vmovdqu YMMWORD[(32+0)+rdi],ymm3 + vmovdqu YMMWORD[(64+0)+rdi],ymm7 + vmovdqu YMMWORD[(96+0)+rdi],ymm11 + 
+ vmovdqa ymm0,YMMWORD[((160+128))+rbp] + vperm2i128 ymm3,ymm6,ymm2,0x02 + vperm2i128 ymm6,ymm6,ymm2,0x13 + vperm2i128 ymm2,ymm14,ymm10,0x02 + vperm2i128 ymm10,ymm14,ymm10,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+128))+rsi] + vpxor ymm2,ymm2,YMMWORD[((32+128))+rsi] + vpxor ymm6,ymm6,YMMWORD[((64+128))+rsi] + vpxor ymm10,ymm10,YMMWORD[((96+128))+rsi] + vmovdqu YMMWORD[(0+128)+rdi],ymm3 + vmovdqu YMMWORD[(32+128)+rdi],ymm2 + vmovdqu YMMWORD[(64+128)+rdi],ymm6 + vmovdqu YMMWORD[(96+128)+rdi],ymm10 + vperm2i128 ymm3,ymm5,ymm1,0x02 + vperm2i128 ymm5,ymm5,ymm1,0x13 + vperm2i128 ymm1,ymm13,ymm9,0x02 + vperm2i128 ymm9,ymm13,ymm9,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+256))+rsi] + vpxor ymm1,ymm1,YMMWORD[((32+256))+rsi] + vpxor ymm5,ymm5,YMMWORD[((64+256))+rsi] + vpxor ymm9,ymm9,YMMWORD[((96+256))+rsi] + vmovdqu YMMWORD[(0+256)+rdi],ymm3 + vmovdqu YMMWORD[(32+256)+rdi],ymm1 + vmovdqu YMMWORD[(64+256)+rdi],ymm5 + vmovdqu YMMWORD[(96+256)+rdi],ymm9 + vperm2i128 ymm3,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm12,ymm8,0x02 + vperm2i128 ymm12,ymm12,ymm8,0x13 + vmovdqa ymm8,ymm3 + + lea rsi,[384+rsi] + lea rdi,[384+rdi] + sub rbx,12*32 +$L$open_avx2_tail_128_xor: + cmp rbx,32 + jb NEAR $L$open_avx2_tail_32_xor + sub rbx,32 + vpxor ymm0,ymm0,YMMWORD[rsi] + vmovdqu YMMWORD[rdi],ymm0 + lea rsi,[32+rsi] + lea rdi,[32+rdi] + vmovdqa ymm0,ymm4 + vmovdqa ymm4,ymm8 + vmovdqa ymm8,ymm12 + jmp NEAR $L$open_avx2_tail_128_xor +$L$open_avx2_tail_32_xor: + cmp rbx,16 + vmovdqa xmm1,xmm0 + jb NEAR $L$open_avx2_exit + sub rbx,16 + + vpxor xmm1,xmm0,XMMWORD[rsi] + vmovdqu XMMWORD[rdi],xmm1 + lea rsi,[16+rsi] + lea rdi,[16+rdi] + vperm2i128 ymm0,ymm0,ymm0,0x11 + vmovdqa xmm1,xmm0 +$L$open_avx2_exit: + vzeroupper + jmp NEAR $L$open_sse_tail_16 + +$L$open_avx2_192: + vmovdqa ymm1,ymm0 + vmovdqa ymm2,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm6,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm10,ymm8 + vpaddd ymm13,ymm12,YMMWORD[$L$avx2_inc] + vmovdqa ymm11,ymm12 + vmovdqa ymm15,ymm13 + mov r10,10 
+$L$open_avx2_192_rounds: + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,4 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,12 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr 
ymm5,ymm5,ymm5,12 + + dec r10 + jne NEAR $L$open_avx2_192_rounds + vpaddd ymm0,ymm0,ymm2 + vpaddd ymm1,ymm1,ymm2 + vpaddd ymm4,ymm4,ymm6 + vpaddd ymm5,ymm5,ymm6 + vpaddd ymm8,ymm8,ymm10 + vpaddd ymm9,ymm9,ymm10 + vpaddd ymm12,ymm12,ymm11 + vpaddd ymm13,ymm13,ymm15 + vperm2i128 ymm3,ymm4,ymm0,0x02 + + vpand ymm3,ymm3,YMMWORD[$L$clamp] + vmovdqa YMMWORD[(160+0)+rbp],ymm3 + + vperm2i128 ymm0,ymm4,ymm0,0x13 + vperm2i128 ymm4,ymm12,ymm8,0x13 + vperm2i128 ymm8,ymm5,ymm1,0x02 + vperm2i128 ymm12,ymm13,ymm9,0x02 + vperm2i128 ymm1,ymm5,ymm1,0x13 + vperm2i128 ymm5,ymm13,ymm9,0x13 +$L$open_avx2_short: + mov r8,r8 + call poly_hash_ad_internal +$L$open_avx2_short_hash_and_xor_loop: + cmp rbx,32 + jb NEAR $L$open_avx2_short_tail_32 + sub rbx,32 + add r10,QWORD[((0+0))+rsi] + adc r11,QWORD[((8+0))+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + add r10,QWORD[((0+16))+rsi] + adc r11,QWORD[((8+16))+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + 
adc r12,0 + + + vpxor ymm0,ymm0,YMMWORD[rsi] + vmovdqu YMMWORD[rdi],ymm0 + lea rsi,[32+rsi] + lea rdi,[32+rdi] + + vmovdqa ymm0,ymm4 + vmovdqa ymm4,ymm8 + vmovdqa ymm8,ymm12 + vmovdqa ymm12,ymm1 + vmovdqa ymm1,ymm5 + vmovdqa ymm5,ymm9 + vmovdqa ymm9,ymm13 + vmovdqa ymm13,ymm2 + vmovdqa ymm2,ymm6 + jmp NEAR $L$open_avx2_short_hash_and_xor_loop +$L$open_avx2_short_tail_32: + cmp rbx,16 + vmovdqa xmm1,xmm0 + jb NEAR $L$open_avx2_short_tail_32_exit + sub rbx,16 + add r10,QWORD[((0+0))+rsi] + adc r11,QWORD[((8+0))+rsi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + vpxor xmm3,xmm0,XMMWORD[rsi] + vmovdqu XMMWORD[rdi],xmm3 + lea rsi,[16+rsi] + lea rdi,[16+rdi] + vextracti128 xmm1,ymm0,1 +$L$open_avx2_short_tail_32_exit: + vzeroupper + jmp NEAR $L$open_sse_tail_16 + +$L$open_avx2_320: + vmovdqa ymm1,ymm0 + vmovdqa ymm2,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm6,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm10,ymm8 + vpaddd ymm13,ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm14,ymm13,YMMWORD[$L$avx2_inc] + vmovdqa ymm7,ymm4 + vmovdqa ymm11,ymm8 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + vmovdqa YMMWORD[(160+224)+rbp],ymm14 + mov r10,10 +$L$open_avx2_320_rounds: + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb 
ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,4 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol16] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpsrld ymm3,ymm6,20 + vpslld ymm6,ymm6,12 + vpxor ymm6,ymm6,ymm3 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol8] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpslld ymm3,ymm6,7 + vpsrld ymm6,ymm6,25 + vpxor ymm6,ymm6,ymm3 + vpalignr ymm14,ymm14,ymm14,12 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm6,ymm6,ymm6,4 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,12 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb 
ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,12 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol16] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpsrld ymm3,ymm6,20 + vpslld ymm6,ymm6,12 + vpxor ymm6,ymm6,ymm3 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol8] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpslld ymm3,ymm6,7 + vpsrld ymm6,ymm6,25 + vpxor ymm6,ymm6,ymm3 + vpalignr ymm14,ymm14,ymm14,4 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm6,ymm6,ymm6,12 + + dec r10 + jne NEAR $L$open_avx2_320_rounds + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,ymm7 + vpaddd ymm5,ymm5,ymm7 + vpaddd ymm6,ymm6,ymm7 + vpaddd ymm8,ymm8,ymm11 + vpaddd ymm9,ymm9,ymm11 + vpaddd ymm10,ymm10,ymm11 + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] + vperm2i128 ymm3,ymm4,ymm0,0x02 + + vpand ymm3,ymm3,YMMWORD[$L$clamp] + vmovdqa YMMWORD[(160+0)+rbp],ymm3 + + vperm2i128 ymm0,ymm4,ymm0,0x13 + vperm2i128 ymm4,ymm12,ymm8,0x13 + vperm2i128 ymm8,ymm5,ymm1,0x02 + vperm2i128 ymm12,ymm13,ymm9,0x02 + vperm2i128 ymm1,ymm5,ymm1,0x13 + vperm2i128 ymm5,ymm13,ymm9,0x13 + vperm2i128 ymm9,ymm6,ymm2,0x02 + vperm2i128 ymm13,ymm14,ymm10,0x02 + vperm2i128 ymm2,ymm6,ymm2,0x13 + vperm2i128 ymm6,ymm14,ymm10,0x13 + jmp NEAR $L$open_avx2_short +$L$SEH_end_chacha20_poly1305_open_avx2: + + + +global chacha20_poly1305_seal_avx2 + +ALIGN 64 +chacha20_poly1305_seal_avx2: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_chacha20_poly1305_seal_avx2: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + mov rcx,r9 + mov 
r8,QWORD[40+rsp] + mov r9,QWORD[48+rsp] + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r13 + + push r14 + + push r15 + + + + push r9 + + sub rsp,288 + 160 + 32 + + lea rbp,[32+rsp] + and rbp,-32 + + movaps XMMWORD[(0+0)+rbp],xmm6 + movaps XMMWORD[(16+0)+rbp],xmm7 + movaps XMMWORD[(32+0)+rbp],xmm8 + movaps XMMWORD[(48+0)+rbp],xmm9 + movaps XMMWORD[(64+0)+rbp],xmm10 + movaps XMMWORD[(80+0)+rbp],xmm11 + movaps XMMWORD[(96+0)+rbp],xmm12 + movaps XMMWORD[(112+0)+rbp],xmm13 + movaps XMMWORD[(128+0)+rbp],xmm14 + movaps XMMWORD[(144+0)+rbp],xmm15 + + mov rbx,QWORD[56+r9] + add rbx,rdx + mov QWORD[((0+160+32))+rbp],r8 + mov QWORD[((8+160+32))+rbp],rbx + mov rbx,rdx + + vzeroupper + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vbroadcasti128 ymm4,XMMWORD[r9] + vbroadcasti128 ymm8,XMMWORD[16+r9] + vbroadcasti128 ymm12,XMMWORD[32+r9] + vpaddd ymm12,ymm12,YMMWORD[$L$avx2_init] + cmp rbx,6*32 + jbe NEAR $L$seal_avx2_192 + cmp rbx,10*32 + jbe NEAR $L$seal_avx2_320 + vmovdqa ymm1,ymm0 + vmovdqa ymm2,ymm0 + vmovdqa ymm3,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm6,ymm4 + vmovdqa ymm7,ymm4 + vmovdqa YMMWORD[(160+64)+rbp],ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm10,ymm8 + vmovdqa ymm11,ymm8 + vmovdqa YMMWORD[(160+96)+rbp],ymm8 + vmovdqa ymm15,ymm12 + vpaddd ymm14,ymm15,YMMWORD[$L$avx2_inc] + vpaddd ymm13,ymm14,YMMWORD[$L$avx2_inc] + vpaddd ymm12,ymm13,YMMWORD[$L$avx2_inc] + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + vmovdqa YMMWORD[(160+224)+rbp],ymm14 + vmovdqa YMMWORD[(160+256)+rbp],ymm15 + mov r10,10 +$L$seal_avx2_init_rounds: + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd 
ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,4 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,12 + vpalignr ymm6,ymm6,ymm6,4 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,12 + vpalignr ymm5,ymm5,ymm5,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm4,ymm4,ymm4,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm12,ymm12,ymm12,12 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor 
ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,12 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,4 + vpalignr ymm6,ymm6,ymm6,12 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,4 + vpalignr ymm5,ymm5,ymm5,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm4,ymm4,ymm4,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm12,ymm12,ymm12,4 + + dec r10 + jnz NEAR 
$L$seal_avx2_init_rounds + vpaddd ymm3,ymm3,YMMWORD[$L$chacha20_consts] + vpaddd ymm7,ymm7,YMMWORD[((160+64))+rbp] + vpaddd ymm11,ymm11,YMMWORD[((160+96))+rbp] + vpaddd ymm15,ymm15,YMMWORD[((160+256))+rbp] + vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] + vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp] + vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp] + vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] + vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + + vperm2i128 ymm11,ymm15,ymm11,0x13 + vperm2i128 ymm15,ymm7,ymm3,0x02 + vperm2i128 ymm3,ymm7,ymm3,0x13 + vpand ymm15,ymm15,YMMWORD[$L$clamp] + vmovdqa YMMWORD[(160+0)+rbp],ymm15 + mov r8,r8 + call poly_hash_ad_internal + + vpxor ymm3,ymm3,YMMWORD[rsi] + vpxor ymm11,ymm11,YMMWORD[32+rsi] + vmovdqu YMMWORD[rdi],ymm3 + vmovdqu YMMWORD[32+rdi],ymm11 + vperm2i128 ymm15,ymm6,ymm2,0x02 + vperm2i128 ymm6,ymm6,ymm2,0x13 + vperm2i128 ymm2,ymm14,ymm10,0x02 + vperm2i128 ymm10,ymm14,ymm10,0x13 + vpxor ymm15,ymm15,YMMWORD[((0+64))+rsi] + vpxor ymm2,ymm2,YMMWORD[((32+64))+rsi] + vpxor ymm6,ymm6,YMMWORD[((64+64))+rsi] + vpxor ymm10,ymm10,YMMWORD[((96+64))+rsi] + vmovdqu YMMWORD[(0+64)+rdi],ymm15 + vmovdqu YMMWORD[(32+64)+rdi],ymm2 + vmovdqu YMMWORD[(64+64)+rdi],ymm6 + vmovdqu YMMWORD[(96+64)+rdi],ymm10 + vperm2i128 ymm15,ymm5,ymm1,0x02 + vperm2i128 ymm5,ymm5,ymm1,0x13 + vperm2i128 ymm1,ymm13,ymm9,0x02 + vperm2i128 ymm9,ymm13,ymm9,0x13 + vpxor ymm15,ymm15,YMMWORD[((0+192))+rsi] + vpxor ymm1,ymm1,YMMWORD[((32+192))+rsi] + vpxor ymm5,ymm5,YMMWORD[((64+192))+rsi] + vpxor ymm9,ymm9,YMMWORD[((96+192))+rsi] + vmovdqu YMMWORD[(0+192)+rdi],ymm15 + vmovdqu YMMWORD[(32+192)+rdi],ymm1 + vmovdqu YMMWORD[(64+192)+rdi],ymm5 + vmovdqu YMMWORD[(96+192)+rdi],ymm9 + 
vperm2i128 ymm15,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm12,ymm8,0x02 + vperm2i128 ymm12,ymm12,ymm8,0x13 + vmovdqa ymm8,ymm15 + + lea rsi,[320+rsi] + sub rbx,10*32 + mov rcx,10*32 + cmp rbx,4*32 + jbe NEAR $L$seal_avx2_short_hash_remainder + vpxor ymm0,ymm0,YMMWORD[rsi] + vpxor ymm4,ymm4,YMMWORD[32+rsi] + vpxor ymm8,ymm8,YMMWORD[64+rsi] + vpxor ymm12,ymm12,YMMWORD[96+rsi] + vmovdqu YMMWORD[320+rdi],ymm0 + vmovdqu YMMWORD[352+rdi],ymm4 + vmovdqu YMMWORD[384+rdi],ymm8 + vmovdqu YMMWORD[416+rdi],ymm12 + lea rsi,[128+rsi] + sub rbx,4*32 + mov rcx,8 + mov r8,2 + cmp rbx,4*32 + jbe NEAR $L$seal_avx2_tail_128 + cmp rbx,8*32 + jbe NEAR $L$seal_avx2_tail_256 + cmp rbx,12*32 + jbe NEAR $L$seal_avx2_tail_384 + cmp rbx,16*32 + jbe NEAR $L$seal_avx2_tail_512 + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm1,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm2,ymm0 + vmovdqa ymm6,ymm4 + vmovdqa ymm10,ymm8 + vmovdqa ymm3,ymm0 + vmovdqa ymm7,ymm4 + vmovdqa ymm11,ymm8 + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm15,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm14,ymm12,ymm15 + vpaddd ymm13,ymm12,ymm14 + vpaddd ymm12,ymm12,ymm13 + vmovdqa YMMWORD[(160+256)+rbp],ymm15 + vmovdqa YMMWORD[(160+224)+rbp],ymm14 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor 
ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,4 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,12 + vpalignr ymm6,ymm6,ymm6,4 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,12 + vpalignr ymm5,ymm5,ymm5,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm4,ymm4,ymm4,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm12,ymm12,ymm12,12 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd 
ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,12 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,4 + vpalignr ymm6,ymm6,ymm6,12 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,4 + vpalignr ymm5,ymm5,ymm5,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm4,ymm4,ymm4,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm12,ymm12,ymm12,4 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor 
ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + + sub rdi,16 + mov rcx,9 + jmp NEAR $L$seal_avx2_main_loop_rounds_entry +ALIGN 32 +$L$seal_avx2_main_loop: + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm1,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm2,ymm0 + vmovdqa ymm6,ymm4 + vmovdqa ymm10,ymm8 + vmovdqa ymm3,ymm0 + vmovdqa ymm7,ymm4 + vmovdqa ymm11,ymm8 + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm15,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm14,ymm12,ymm15 + vpaddd ymm13,ymm12,ymm14 + vpaddd ymm12,ymm12,ymm13 + vmovdqa YMMWORD[(160+256)+rbp],ymm15 + vmovdqa YMMWORD[(160+224)+rbp],ymm14 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + + mov rcx,10 +ALIGN 32 +$L$seal_avx2_main_loop_rounds: + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + mov 
rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + add r15,rax + adc r9,rdx + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + +$L$seal_avx2_main_loop_rounds_entry: + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + add r10,QWORD[((0+16))+rdi] + adc r11,QWORD[((8+16))+rdi] + adc r12,1 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + vpslld ymm7,ymm7,32-25 + vpxor 
ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,4 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,12 + vpalignr ymm6,ymm6,ymm6,4 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,12 + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + vpalignr ymm5,ymm5,ymm5,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm4,ymm4,ymm4,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm12,ymm12,ymm12,12 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + add r15,rax + adc r9,rdx + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + add r10,QWORD[((0+32))+rdi] + adc r11,QWORD[((8+32))+rdi] + adc r12,1 + + lea rdi,[48+rdi] + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + 
vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + add r15,rax + adc r9,rdx + vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,12 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,4 + vpalignr ymm6,ymm6,ymm6,12 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,4 + vpalignr ymm5,ymm5,ymm5,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm4,ymm4,ymm4,12 + vpalignr ymm8,ymm8,ymm8,8 + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpalignr ymm12,ymm12,ymm12,4 + + dec rcx + jne NEAR $L$seal_avx2_main_loop_rounds + vpaddd ymm3,ymm3,YMMWORD[$L$chacha20_consts] + vpaddd ymm7,ymm7,YMMWORD[((160+64))+rbp] + vpaddd ymm11,ymm11,YMMWORD[((160+96))+rbp] + vpaddd ymm15,ymm15,YMMWORD[((160+256))+rbp] + vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] + vpaddd 
ymm6,ymm6,YMMWORD[((160+64))+rbp] + vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp] + vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] + vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + + vmovdqa YMMWORD[(160+128)+rbp],ymm0 + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + add r10,QWORD[((0+16))+rdi] + adc r11,QWORD[((8+16))+rdi] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[32+rdi] + vperm2i128 ymm0,ymm7,ymm3,0x02 + vperm2i128 ymm7,ymm7,ymm3,0x13 + vperm2i128 ymm3,ymm15,ymm11,0x02 + vperm2i128 ymm11,ymm15,ymm11,0x13 + vpxor ymm0,ymm0,YMMWORD[((0+0))+rsi] + vpxor ymm3,ymm3,YMMWORD[((32+0))+rsi] + vpxor ymm7,ymm7,YMMWORD[((64+0))+rsi] + vpxor ymm11,ymm11,YMMWORD[((96+0))+rsi] + vmovdqu YMMWORD[(0+0)+rdi],ymm0 + vmovdqu YMMWORD[(32+0)+rdi],ymm3 + vmovdqu 
YMMWORD[(64+0)+rdi],ymm7 + vmovdqu YMMWORD[(96+0)+rdi],ymm11 + + vmovdqa ymm0,YMMWORD[((160+128))+rbp] + vperm2i128 ymm3,ymm6,ymm2,0x02 + vperm2i128 ymm6,ymm6,ymm2,0x13 + vperm2i128 ymm2,ymm14,ymm10,0x02 + vperm2i128 ymm10,ymm14,ymm10,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+128))+rsi] + vpxor ymm2,ymm2,YMMWORD[((32+128))+rsi] + vpxor ymm6,ymm6,YMMWORD[((64+128))+rsi] + vpxor ymm10,ymm10,YMMWORD[((96+128))+rsi] + vmovdqu YMMWORD[(0+128)+rdi],ymm3 + vmovdqu YMMWORD[(32+128)+rdi],ymm2 + vmovdqu YMMWORD[(64+128)+rdi],ymm6 + vmovdqu YMMWORD[(96+128)+rdi],ymm10 + vperm2i128 ymm3,ymm5,ymm1,0x02 + vperm2i128 ymm5,ymm5,ymm1,0x13 + vperm2i128 ymm1,ymm13,ymm9,0x02 + vperm2i128 ymm9,ymm13,ymm9,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+256))+rsi] + vpxor ymm1,ymm1,YMMWORD[((32+256))+rsi] + vpxor ymm5,ymm5,YMMWORD[((64+256))+rsi] + vpxor ymm9,ymm9,YMMWORD[((96+256))+rsi] + vmovdqu YMMWORD[(0+256)+rdi],ymm3 + vmovdqu YMMWORD[(32+256)+rdi],ymm1 + vmovdqu YMMWORD[(64+256)+rdi],ymm5 + vmovdqu YMMWORD[(96+256)+rdi],ymm9 + vperm2i128 ymm3,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm12,ymm8,0x02 + vperm2i128 ymm8,ymm12,ymm8,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+384))+rsi] + vpxor ymm0,ymm0,YMMWORD[((32+384))+rsi] + vpxor ymm4,ymm4,YMMWORD[((64+384))+rsi] + vpxor ymm8,ymm8,YMMWORD[((96+384))+rsi] + vmovdqu YMMWORD[(0+384)+rdi],ymm3 + vmovdqu YMMWORD[(32+384)+rdi],ymm0 + vmovdqu YMMWORD[(64+384)+rdi],ymm4 + vmovdqu YMMWORD[(96+384)+rdi],ymm8 + + lea rsi,[512+rsi] + sub rbx,16*32 + cmp rbx,16*32 + jg NEAR $L$seal_avx2_main_loop + + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd 
r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + add r10,QWORD[((0+16))+rdi] + adc r11,QWORD[((8+16))+rdi] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[32+rdi] + mov rcx,10 + xor r8,r8 + + cmp rbx,12*32 + ja NEAR $L$seal_avx2_tail_512 + cmp rbx,8*32 + ja NEAR $L$seal_avx2_tail_384 + cmp rbx,4*32 + ja NEAR $L$seal_avx2_tail_256 + +$L$seal_avx2_tail_128: + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + +$L$seal_avx2_tail_128_rounds_and_3xhash: + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[16+rdi] +$L$seal_avx2_tail_128_rounds_and_2xhash: + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb 
ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,12 + add r10,QWORD[((0+16))+rdi] + adc r11,QWORD[((8+16))+rdi] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[32+rdi] + dec rcx + jg NEAR $L$seal_avx2_tail_128_rounds_and_3xhash + dec r8 + jge NEAR $L$seal_avx2_tail_128_rounds_and_2xhash + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd 
ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + vperm2i128 ymm3,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm12,ymm8,0x02 + vperm2i128 ymm12,ymm12,ymm8,0x13 + vmovdqa ymm8,ymm3 + + jmp NEAR $L$seal_avx2_short_loop + +$L$seal_avx2_tail_256: + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm1,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm13,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm12,ymm12,ymm13 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + +$L$seal_avx2_tail_256_rounds_and_3xhash: + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[16+rdi] +$L$seal_avx2_tail_256_rounds_and_2xhash: + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + 
vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,4 + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,12 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr 
ymm13,ymm13,ymm13,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,12 + add r10,QWORD[((0+16))+rdi] + adc r11,QWORD[((8+16))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[32+rdi] + dec rcx + jg NEAR $L$seal_avx2_tail_256_rounds_and_3xhash + dec r8 + jge NEAR $L$seal_avx2_tail_256_rounds_and_2xhash + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] + vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + vperm2i128 ymm3,ymm5,ymm1,0x02 + vperm2i128 ymm5,ymm5,ymm1,0x13 + vperm2i128 ymm1,ymm13,ymm9,0x02 + vperm2i128 ymm9,ymm13,ymm9,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+0))+rsi] + vpxor ymm1,ymm1,YMMWORD[((32+0))+rsi] + vpxor ymm5,ymm5,YMMWORD[((64+0))+rsi] + vpxor ymm9,ymm9,YMMWORD[((96+0))+rsi] + vmovdqu YMMWORD[(0+0)+rdi],ymm3 + vmovdqu YMMWORD[(32+0)+rdi],ymm1 + vmovdqu YMMWORD[(64+0)+rdi],ymm5 + vmovdqu YMMWORD[(96+0)+rdi],ymm9 + vperm2i128 ymm3,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm12,ymm8,0x02 + vperm2i128 ymm12,ymm12,ymm8,0x13 + vmovdqa ymm8,ymm3 + + mov rcx,4*32 + lea rsi,[128+rsi] + sub rbx,4*32 + jmp NEAR $L$seal_avx2_short_hash_remainder + +$L$seal_avx2_tail_384: + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa 
ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm1,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm2,ymm0 + vmovdqa ymm6,ymm4 + vmovdqa ymm10,ymm8 + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm14,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm13,ymm12,ymm14 + vpaddd ymm12,ymm12,ymm13 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + vmovdqa YMMWORD[(160+224)+rbp],ymm14 + +$L$seal_avx2_tail_384_rounds_and_3xhash: + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[16+rdi] +$L$seal_avx2_tail_384_rounds_and_2xhash: + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + 
vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,4 + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol16] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpsrld ymm3,ymm6,20 + vpslld ymm6,ymm6,12 + vpxor ymm6,ymm6,ymm3 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol8] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpslld ymm3,ymm6,7 + vpsrld ymm6,ymm6,25 + vpxor ymm6,ymm6,ymm3 + vpalignr ymm14,ymm14,ymm14,12 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm6,ymm6,ymm6,4 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,12 + add r10,QWORD[((0+16))+rdi] + adc r11,QWORD[((8+16))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + 
mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,12 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol16] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpsrld ymm3,ymm6,20 + vpslld ymm6,ymm6,12 + vpxor ymm6,ymm6,ymm3 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol8] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpslld ymm3,ymm6,7 + vpsrld ymm6,ymm6,25 + vpxor ymm6,ymm6,ymm3 + vpalignr ymm14,ymm14,ymm14,4 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm6,ymm6,ymm6,12 + + lea rdi,[32+rdi] + dec rcx + jg NEAR $L$seal_avx2_tail_384_rounds_and_3xhash + dec r8 + jge NEAR $L$seal_avx2_tail_384_rounds_and_2xhash + vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] + vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp] + vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp] + vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] + vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd 
ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + vperm2i128 ymm3,ymm6,ymm2,0x02 + vperm2i128 ymm6,ymm6,ymm2,0x13 + vperm2i128 ymm2,ymm14,ymm10,0x02 + vperm2i128 ymm10,ymm14,ymm10,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+0))+rsi] + vpxor ymm2,ymm2,YMMWORD[((32+0))+rsi] + vpxor ymm6,ymm6,YMMWORD[((64+0))+rsi] + vpxor ymm10,ymm10,YMMWORD[((96+0))+rsi] + vmovdqu YMMWORD[(0+0)+rdi],ymm3 + vmovdqu YMMWORD[(32+0)+rdi],ymm2 + vmovdqu YMMWORD[(64+0)+rdi],ymm6 + vmovdqu YMMWORD[(96+0)+rdi],ymm10 + vperm2i128 ymm3,ymm5,ymm1,0x02 + vperm2i128 ymm5,ymm5,ymm1,0x13 + vperm2i128 ymm1,ymm13,ymm9,0x02 + vperm2i128 ymm9,ymm13,ymm9,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+128))+rsi] + vpxor ymm1,ymm1,YMMWORD[((32+128))+rsi] + vpxor ymm5,ymm5,YMMWORD[((64+128))+rsi] + vpxor ymm9,ymm9,YMMWORD[((96+128))+rsi] + vmovdqu YMMWORD[(0+128)+rdi],ymm3 + vmovdqu YMMWORD[(32+128)+rdi],ymm1 + vmovdqu YMMWORD[(64+128)+rdi],ymm5 + vmovdqu YMMWORD[(96+128)+rdi],ymm9 + vperm2i128 ymm3,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm12,ymm8,0x02 + vperm2i128 ymm12,ymm12,ymm8,0x13 + vmovdqa ymm8,ymm3 + + mov rcx,8*32 + lea rsi,[256+rsi] + sub rbx,8*32 + jmp NEAR $L$seal_avx2_short_hash_remainder + +$L$seal_avx2_tail_512: + vmovdqa ymm0,YMMWORD[$L$chacha20_consts] + vmovdqa ymm4,YMMWORD[((160+64))+rbp] + vmovdqa ymm8,YMMWORD[((160+96))+rbp] + vmovdqa ymm1,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm2,ymm0 + vmovdqa ymm6,ymm4 + vmovdqa ymm10,ymm8 + vmovdqa ymm3,ymm0 + vmovdqa ymm7,ymm4 + vmovdqa ymm11,ymm8 + vmovdqa ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm15,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm14,ymm12,ymm15 + vpaddd ymm13,ymm12,ymm14 + vpaddd ymm12,ymm12,ymm13 + vmovdqa YMMWORD[(160+256)+rbp],ymm15 + vmovdqa YMMWORD[(160+224)+rbp],ymm14 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + +$L$seal_avx2_tail_512_rounds_and_3xhash: + add r10,QWORD[((0+0))+rdi] + adc 
r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + add r15,rax + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[16+rdi] +$L$seal_avx2_tail_512_rounds_and_2xhash: + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb 
ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,4 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,12 + vpalignr ymm6,ymm6,ymm6,4 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,12 + vpalignr ymm5,ymm5,ymm5,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm4,ymm4,ymm4,4 + add r15,rax + adc r9,rdx + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm12,ymm12,ymm12,12 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vmovdqa ymm8,YMMWORD[$L$rol16] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,20 + vpslld ymm7,ymm7,32-20 + vpxor ymm7,ymm7,ymm8 + vpsrld 
ymm8,ymm6,20 + vpslld ymm6,ymm6,32-20 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,20 + vpslld ymm5,ymm5,32-20 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,20 + vpslld ymm4,ymm4,32-20 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[$L$rol8] + vpaddd ymm3,ymm3,ymm7 + vpaddd ymm2,ymm2,ymm6 + add r10,QWORD[((0+16))+rdi] + adc r11,QWORD[((8+16))+rdi] + adc r12,1 + vpaddd ymm1,ymm1,ymm5 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm15,ymm15,ymm3 + vpxor ymm14,ymm14,ymm2 + vpxor ymm13,ymm13,ymm1 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm15,ymm15,ymm8 + vpshufb ymm14,ymm14,ymm8 + vpshufb ymm13,ymm13,ymm8 + vpshufb ymm12,ymm12,ymm8 + vpaddd ymm11,ymm11,ymm15 + vpaddd ymm10,ymm10,ymm14 + vpaddd ymm9,ymm9,ymm13 + vpaddd ymm8,ymm12,YMMWORD[((160+128))+rbp] + vpxor ymm7,ymm7,ymm11 + vpxor ymm6,ymm6,ymm10 + vpxor ymm5,ymm5,ymm9 + vpxor ymm4,ymm4,ymm8 + vmovdqa YMMWORD[(160+128)+rbp],ymm8 + vpsrld ymm8,ymm7,25 + mov rdx,QWORD[((0+160+0))+rbp] + mov r15,rdx + mulx r14,r13,r10 + mulx rdx,rax,r11 + imul r15,r12 + add r14,rax + adc r15,rdx + vpslld ymm7,ymm7,32-25 + vpxor ymm7,ymm7,ymm8 + vpsrld ymm8,ymm6,25 + vpslld ymm6,ymm6,32-25 + vpxor ymm6,ymm6,ymm8 + vpsrld ymm8,ymm5,25 + vpslld ymm5,ymm5,32-25 + vpxor ymm5,ymm5,ymm8 + vpsrld ymm8,ymm4,25 + vpslld ymm4,ymm4,32-25 + vpxor ymm4,ymm4,ymm8 + vmovdqa ymm8,YMMWORD[((160+128))+rbp] + vpalignr ymm7,ymm7,ymm7,12 + vpalignr ymm11,ymm11,ymm11,8 + vpalignr ymm15,ymm15,ymm15,4 + vpalignr ymm6,ymm6,ymm6,12 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm14,ymm14,ymm14,4 + vpalignr ymm5,ymm5,ymm5,12 + vpalignr ymm9,ymm9,ymm9,8 + mov rdx,QWORD[((8+160+0))+rbp] + mulx rax,r10,r10 + add r14,r10 + mulx r9,r11,r11 + adc r15,r11 + adc r9,0 + imul rdx,r12 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm4,ymm4,ymm4,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm12,ymm12,ymm12,4 + + + + + + + + + + + + + + + + + add r15,rax + adc r9,rdx + + + + + + + + + + + + + + + + + + + + + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd 
r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[32+rdi] + dec rcx + jg NEAR $L$seal_avx2_tail_512_rounds_and_3xhash + dec r8 + jge NEAR $L$seal_avx2_tail_512_rounds_and_2xhash + vpaddd ymm3,ymm3,YMMWORD[$L$chacha20_consts] + vpaddd ymm7,ymm7,YMMWORD[((160+64))+rbp] + vpaddd ymm11,ymm11,YMMWORD[((160+96))+rbp] + vpaddd ymm15,ymm15,YMMWORD[((160+256))+rbp] + vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] + vpaddd ymm6,ymm6,YMMWORD[((160+64))+rbp] + vpaddd ymm10,ymm10,YMMWORD[((160+96))+rbp] + vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm5,ymm5,YMMWORD[((160+64))+rbp] + vpaddd ymm9,ymm9,YMMWORD[((160+96))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,YMMWORD[((160+64))+rbp] + vpaddd ymm8,ymm8,YMMWORD[((160+96))+rbp] + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + + vmovdqa YMMWORD[(160+128)+rbp],ymm0 + vperm2i128 ymm0,ymm7,ymm3,0x02 + vperm2i128 ymm7,ymm7,ymm3,0x13 + vperm2i128 ymm3,ymm15,ymm11,0x02 + vperm2i128 ymm11,ymm15,ymm11,0x13 + vpxor ymm0,ymm0,YMMWORD[((0+0))+rsi] + vpxor ymm3,ymm3,YMMWORD[((32+0))+rsi] + vpxor ymm7,ymm7,YMMWORD[((64+0))+rsi] + vpxor ymm11,ymm11,YMMWORD[((96+0))+rsi] + vmovdqu YMMWORD[(0+0)+rdi],ymm0 + vmovdqu YMMWORD[(32+0)+rdi],ymm3 + vmovdqu YMMWORD[(64+0)+rdi],ymm7 + vmovdqu YMMWORD[(96+0)+rdi],ymm11 + + vmovdqa ymm0,YMMWORD[((160+128))+rbp] + vperm2i128 ymm3,ymm6,ymm2,0x02 + vperm2i128 ymm6,ymm6,ymm2,0x13 + vperm2i128 ymm2,ymm14,ymm10,0x02 + vperm2i128 ymm10,ymm14,ymm10,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+128))+rsi] + vpxor ymm2,ymm2,YMMWORD[((32+128))+rsi] + vpxor ymm6,ymm6,YMMWORD[((64+128))+rsi] + vpxor ymm10,ymm10,YMMWORD[((96+128))+rsi] + vmovdqu YMMWORD[(0+128)+rdi],ymm3 + vmovdqu YMMWORD[(32+128)+rdi],ymm2 + vmovdqu YMMWORD[(64+128)+rdi],ymm6 + vmovdqu YMMWORD[(96+128)+rdi],ymm10 + vperm2i128 ymm3,ymm5,ymm1,0x02 + vperm2i128 ymm5,ymm5,ymm1,0x13 + vperm2i128 
ymm1,ymm13,ymm9,0x02 + vperm2i128 ymm9,ymm13,ymm9,0x13 + vpxor ymm3,ymm3,YMMWORD[((0+256))+rsi] + vpxor ymm1,ymm1,YMMWORD[((32+256))+rsi] + vpxor ymm5,ymm5,YMMWORD[((64+256))+rsi] + vpxor ymm9,ymm9,YMMWORD[((96+256))+rsi] + vmovdqu YMMWORD[(0+256)+rdi],ymm3 + vmovdqu YMMWORD[(32+256)+rdi],ymm1 + vmovdqu YMMWORD[(64+256)+rdi],ymm5 + vmovdqu YMMWORD[(96+256)+rdi],ymm9 + vperm2i128 ymm3,ymm4,ymm0,0x13 + vperm2i128 ymm0,ymm4,ymm0,0x02 + vperm2i128 ymm4,ymm12,ymm8,0x02 + vperm2i128 ymm12,ymm12,ymm8,0x13 + vmovdqa ymm8,ymm3 + + mov rcx,12*32 + lea rsi,[384+rsi] + sub rbx,12*32 + jmp NEAR $L$seal_avx2_short_hash_remainder + +$L$seal_avx2_320: + vmovdqa ymm1,ymm0 + vmovdqa ymm2,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm6,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm10,ymm8 + vpaddd ymm13,ymm12,YMMWORD[$L$avx2_inc] + vpaddd ymm14,ymm13,YMMWORD[$L$avx2_inc] + vmovdqa ymm7,ymm4 + vmovdqa ymm11,ymm8 + vmovdqa YMMWORD[(160+160)+rbp],ymm12 + vmovdqa YMMWORD[(160+192)+rbp],ymm13 + vmovdqa YMMWORD[(160+224)+rbp],ymm14 + mov r10,10 +$L$seal_avx2_320_rounds: + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr 
ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,4 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol16] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpsrld ymm3,ymm6,20 + vpslld ymm6,ymm6,12 + vpxor ymm6,ymm6,ymm3 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol8] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpslld ymm3,ymm6,7 + vpsrld ymm6,ymm6,25 + vpxor ymm6,ymm6,ymm3 + vpalignr ymm14,ymm14,ymm14,12 + vpalignr ymm10,ymm10,ymm10,8 + vpalignr ymm6,ymm6,ymm6,4 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,12 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,12 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol16] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpsrld ymm3,ymm6,20 + vpslld ymm6,ymm6,12 + vpxor ymm6,ymm6,ymm3 + vpaddd ymm2,ymm2,ymm6 + vpxor ymm14,ymm14,ymm2 + vpshufb ymm14,ymm14,YMMWORD[$L$rol8] + vpaddd ymm10,ymm10,ymm14 + vpxor ymm6,ymm6,ymm10 + vpslld ymm3,ymm6,7 + vpsrld ymm6,ymm6,25 + vpxor ymm6,ymm6,ymm3 + vpalignr ymm14,ymm14,ymm14,4 + vpalignr 
ymm10,ymm10,ymm10,8 + vpalignr ymm6,ymm6,ymm6,12 + + dec r10 + jne NEAR $L$seal_avx2_320_rounds + vpaddd ymm0,ymm0,YMMWORD[$L$chacha20_consts] + vpaddd ymm1,ymm1,YMMWORD[$L$chacha20_consts] + vpaddd ymm2,ymm2,YMMWORD[$L$chacha20_consts] + vpaddd ymm4,ymm4,ymm7 + vpaddd ymm5,ymm5,ymm7 + vpaddd ymm6,ymm6,ymm7 + vpaddd ymm8,ymm8,ymm11 + vpaddd ymm9,ymm9,ymm11 + vpaddd ymm10,ymm10,ymm11 + vpaddd ymm12,ymm12,YMMWORD[((160+160))+rbp] + vpaddd ymm13,ymm13,YMMWORD[((160+192))+rbp] + vpaddd ymm14,ymm14,YMMWORD[((160+224))+rbp] + vperm2i128 ymm3,ymm4,ymm0,0x02 + + vpand ymm3,ymm3,YMMWORD[$L$clamp] + vmovdqa YMMWORD[(160+0)+rbp],ymm3 + + vperm2i128 ymm0,ymm4,ymm0,0x13 + vperm2i128 ymm4,ymm12,ymm8,0x13 + vperm2i128 ymm8,ymm5,ymm1,0x02 + vperm2i128 ymm12,ymm13,ymm9,0x02 + vperm2i128 ymm1,ymm5,ymm1,0x13 + vperm2i128 ymm5,ymm13,ymm9,0x13 + vperm2i128 ymm9,ymm6,ymm2,0x02 + vperm2i128 ymm13,ymm14,ymm10,0x02 + vperm2i128 ymm2,ymm6,ymm2,0x13 + vperm2i128 ymm6,ymm14,ymm10,0x13 + jmp NEAR $L$seal_avx2_short + +$L$seal_avx2_192: + vmovdqa ymm1,ymm0 + vmovdqa ymm2,ymm0 + vmovdqa ymm5,ymm4 + vmovdqa ymm6,ymm4 + vmovdqa ymm9,ymm8 + vmovdqa ymm10,ymm8 + vpaddd ymm13,ymm12,YMMWORD[$L$avx2_inc] + vmovdqa ymm11,ymm12 + vmovdqa ymm15,ymm13 + mov r10,10 +$L$seal_avx2_192_rounds: + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,12 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,4 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor 
ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,12 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,4 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol16] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpsrld ymm3,ymm4,20 + vpslld ymm4,ymm4,12 + vpxor ymm4,ymm4,ymm3 + vpaddd ymm0,ymm0,ymm4 + vpxor ymm12,ymm12,ymm0 + vpshufb ymm12,ymm12,YMMWORD[$L$rol8] + vpaddd ymm8,ymm8,ymm12 + vpxor ymm4,ymm4,ymm8 + vpslld ymm3,ymm4,7 + vpsrld ymm4,ymm4,25 + vpxor ymm4,ymm4,ymm3 + vpalignr ymm12,ymm12,ymm12,4 + vpalignr ymm8,ymm8,ymm8,8 + vpalignr ymm4,ymm4,ymm4,12 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol16] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpsrld ymm3,ymm5,20 + vpslld ymm5,ymm5,12 + vpxor ymm5,ymm5,ymm3 + vpaddd ymm1,ymm1,ymm5 + vpxor ymm13,ymm13,ymm1 + vpshufb ymm13,ymm13,YMMWORD[$L$rol8] + vpaddd ymm9,ymm9,ymm13 + vpxor ymm5,ymm5,ymm9 + vpslld ymm3,ymm5,7 + vpsrld ymm5,ymm5,25 + vpxor ymm5,ymm5,ymm3 + vpalignr ymm13,ymm13,ymm13,4 + vpalignr ymm9,ymm9,ymm9,8 + vpalignr ymm5,ymm5,ymm5,12 + + dec r10 + jne NEAR $L$seal_avx2_192_rounds + vpaddd ymm0,ymm0,ymm2 + vpaddd ymm1,ymm1,ymm2 + vpaddd ymm4,ymm4,ymm6 + vpaddd ymm5,ymm5,ymm6 + vpaddd ymm8,ymm8,ymm10 + vpaddd ymm9,ymm9,ymm10 + vpaddd ymm12,ymm12,ymm11 + vpaddd ymm13,ymm13,ymm15 + vperm2i128 ymm3,ymm4,ymm0,0x02 + + vpand ymm3,ymm3,YMMWORD[$L$clamp] + vmovdqa YMMWORD[(160+0)+rbp],ymm3 + + vperm2i128 ymm0,ymm4,ymm0,0x13 + vperm2i128 ymm4,ymm12,ymm8,0x13 + vperm2i128 ymm8,ymm5,ymm1,0x02 + vperm2i128 ymm12,ymm13,ymm9,0x02 + vperm2i128 ymm1,ymm5,ymm1,0x13 + vperm2i128 ymm5,ymm13,ymm9,0x13 +$L$seal_avx2_short: + mov r8,r8 + call poly_hash_ad_internal + xor rcx,rcx +$L$seal_avx2_short_hash_remainder: + cmp rcx,16 + jb NEAR $L$seal_avx2_short_loop + add r10,QWORD[((0+0))+rdi] + adc 
r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + sub rcx,16 + add rdi,16 + jmp NEAR $L$seal_avx2_short_hash_remainder +$L$seal_avx2_short_loop: + cmp rbx,32 + jb NEAR $L$seal_avx2_short_tail + sub rbx,32 + + vpxor ymm0,ymm0,YMMWORD[rsi] + vmovdqu YMMWORD[rdi],ymm0 + lea rsi,[32+rsi] + + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + add r10,QWORD[((0+16))+rdi] + adc r11,QWORD[((8+16))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and 
r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[32+rdi] + + vmovdqa ymm0,ymm4 + vmovdqa ymm4,ymm8 + vmovdqa ymm8,ymm12 + vmovdqa ymm12,ymm1 + vmovdqa ymm1,ymm5 + vmovdqa ymm5,ymm9 + vmovdqa ymm9,ymm13 + vmovdqa ymm13,ymm2 + vmovdqa ymm2,ymm6 + jmp NEAR $L$seal_avx2_short_loop +$L$seal_avx2_short_tail: + cmp rbx,16 + jb NEAR $L$seal_avx2_exit + sub rbx,16 + vpxor xmm3,xmm0,XMMWORD[rsi] + vmovdqu XMMWORD[rdi],xmm3 + lea rsi,[16+rsi] + add r10,QWORD[((0+0))+rdi] + adc r11,QWORD[((8+0))+rdi] + adc r12,1 + mov rax,QWORD[((0+160+0))+rbp] + mov r15,rax + mul r10 + mov r13,rax + mov r14,rdx + mov rax,QWORD[((0+160+0))+rbp] + mul r11 + imul r15,r12 + add r14,rax + adc r15,rdx + mov rax,QWORD[((8+160+0))+rbp] + mov r9,rax + mul r10 + add r14,rax + adc rdx,0 + mov r10,rdx + mov rax,QWORD[((8+160+0))+rbp] + mul r11 + add r15,rax + adc rdx,0 + imul r9,r12 + add r15,r10 + adc r9,rdx + mov r10,r13 + mov r11,r14 + mov r12,r15 + and r12,3 + mov r13,r15 + and r13,-4 + mov r14,r9 + shrd r15,r9,2 + shr r9,2 + add r15,r13 + adc r9,r14 + add r10,r15 + adc r11,r9 + adc r12,0 + + lea rdi,[16+rdi] + vextracti128 xmm0,ymm0,1 +$L$seal_avx2_exit: + vzeroupper + jmp NEAR $L$seal_sse_tail_16 + +$L$SEH_end_chacha20_poly1305_seal_avx2: +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/third_party/boringssl/gen/crypto/md5-586-apple.S b/third_party/boringssl/gen/crypto/md5-586-apple.S new file mode 100644 index 00000000..986d5900 --- /dev/null +++ b/third_party/boringssl/gen/crypto/md5-586-apple.S @@ -0,0 +1,684 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +.text +.globl _md5_block_asm_data_order +.private_extern _md5_block_asm_data_order +.align 4 +_md5_block_asm_data_order: +L_md5_block_asm_data_order_begin: + pushl %esi + pushl %edi + movl 12(%esp),%edi + movl 16(%esp),%esi + movl 20(%esp),%ecx + pushl %ebp + shll $6,%ecx + pushl %ebx + addl %esi,%ecx + subl $64,%ecx + movl (%edi),%eax + pushl %ecx + movl 4(%edi),%ebx + movl 8(%edi),%ecx + movl 12(%edi),%edx +L000start: + + # R0 section + movl %ecx,%edi + movl (%esi),%ebp + # R0 0 + xorl %edx,%edi + andl %ebx,%edi + leal 3614090360(%eax,%ebp,1),%eax + xorl %edx,%edi + addl %edi,%eax + movl %ebx,%edi + roll $7,%eax + movl 4(%esi),%ebp + addl %ebx,%eax + # R0 1 + xorl %ecx,%edi + andl %eax,%edi + leal 3905402710(%edx,%ebp,1),%edx + xorl %ecx,%edi + addl %edi,%edx + movl %eax,%edi + roll $12,%edx + movl 8(%esi),%ebp + addl %eax,%edx + # R0 2 + xorl %ebx,%edi + andl %edx,%edi + leal 606105819(%ecx,%ebp,1),%ecx + xorl %ebx,%edi + addl %edi,%ecx + movl %edx,%edi + roll $17,%ecx + movl 12(%esi),%ebp + addl %edx,%ecx + # R0 3 + xorl %eax,%edi + andl %ecx,%edi + leal 3250441966(%ebx,%ebp,1),%ebx + xorl %eax,%edi + addl %edi,%ebx + movl %ecx,%edi + roll $22,%ebx + movl 16(%esi),%ebp + addl %ecx,%ebx + # R0 4 + xorl %edx,%edi + andl %ebx,%edi + leal 4118548399(%eax,%ebp,1),%eax + xorl %edx,%edi + addl %edi,%eax + movl %ebx,%edi + roll $7,%eax + movl 20(%esi),%ebp + addl %ebx,%eax + # R0 5 + xorl %ecx,%edi + andl %eax,%edi + leal 1200080426(%edx,%ebp,1),%edx + xorl %ecx,%edi + addl %edi,%edx + movl %eax,%edi + roll $12,%edx + movl 24(%esi),%ebp + addl %eax,%edx + # R0 6 + xorl %ebx,%edi + andl %edx,%edi + leal 2821735955(%ecx,%ebp,1),%ecx + xorl %ebx,%edi + addl %edi,%ecx + movl %edx,%edi + roll $17,%ecx + movl 28(%esi),%ebp + addl %edx,%ecx + # R0 7 + xorl %eax,%edi + andl %ecx,%edi + leal 4249261313(%ebx,%ebp,1),%ebx + xorl %eax,%edi + addl %edi,%ebx + movl %ecx,%edi + roll $22,%ebx + 
movl 32(%esi),%ebp + addl %ecx,%ebx + # R0 8 + xorl %edx,%edi + andl %ebx,%edi + leal 1770035416(%eax,%ebp,1),%eax + xorl %edx,%edi + addl %edi,%eax + movl %ebx,%edi + roll $7,%eax + movl 36(%esi),%ebp + addl %ebx,%eax + # R0 9 + xorl %ecx,%edi + andl %eax,%edi + leal 2336552879(%edx,%ebp,1),%edx + xorl %ecx,%edi + addl %edi,%edx + movl %eax,%edi + roll $12,%edx + movl 40(%esi),%ebp + addl %eax,%edx + # R0 10 + xorl %ebx,%edi + andl %edx,%edi + leal 4294925233(%ecx,%ebp,1),%ecx + xorl %ebx,%edi + addl %edi,%ecx + movl %edx,%edi + roll $17,%ecx + movl 44(%esi),%ebp + addl %edx,%ecx + # R0 11 + xorl %eax,%edi + andl %ecx,%edi + leal 2304563134(%ebx,%ebp,1),%ebx + xorl %eax,%edi + addl %edi,%ebx + movl %ecx,%edi + roll $22,%ebx + movl 48(%esi),%ebp + addl %ecx,%ebx + # R0 12 + xorl %edx,%edi + andl %ebx,%edi + leal 1804603682(%eax,%ebp,1),%eax + xorl %edx,%edi + addl %edi,%eax + movl %ebx,%edi + roll $7,%eax + movl 52(%esi),%ebp + addl %ebx,%eax + # R0 13 + xorl %ecx,%edi + andl %eax,%edi + leal 4254626195(%edx,%ebp,1),%edx + xorl %ecx,%edi + addl %edi,%edx + movl %eax,%edi + roll $12,%edx + movl 56(%esi),%ebp + addl %eax,%edx + # R0 14 + xorl %ebx,%edi + andl %edx,%edi + leal 2792965006(%ecx,%ebp,1),%ecx + xorl %ebx,%edi + addl %edi,%ecx + movl %edx,%edi + roll $17,%ecx + movl 60(%esi),%ebp + addl %edx,%ecx + # R0 15 + xorl %eax,%edi + andl %ecx,%edi + leal 1236535329(%ebx,%ebp,1),%ebx + xorl %eax,%edi + addl %edi,%ebx + movl %ecx,%edi + roll $22,%ebx + movl 4(%esi),%ebp + addl %ecx,%ebx + + # R1 section + # R1 16 + leal 4129170786(%eax,%ebp,1),%eax + xorl %ebx,%edi + andl %edx,%edi + movl 24(%esi),%ebp + xorl %ecx,%edi + addl %edi,%eax + movl %ebx,%edi + roll $5,%eax + addl %ebx,%eax + # R1 17 + leal 3225465664(%edx,%ebp,1),%edx + xorl %eax,%edi + andl %ecx,%edi + movl 44(%esi),%ebp + xorl %ebx,%edi + addl %edi,%edx + movl %eax,%edi + roll $9,%edx + addl %eax,%edx + # R1 18 + leal 643717713(%ecx,%ebp,1),%ecx + xorl %edx,%edi + andl %ebx,%edi + movl (%esi),%ebp + 
xorl %eax,%edi + addl %edi,%ecx + movl %edx,%edi + roll $14,%ecx + addl %edx,%ecx + # R1 19 + leal 3921069994(%ebx,%ebp,1),%ebx + xorl %ecx,%edi + andl %eax,%edi + movl 20(%esi),%ebp + xorl %edx,%edi + addl %edi,%ebx + movl %ecx,%edi + roll $20,%ebx + addl %ecx,%ebx + # R1 20 + leal 3593408605(%eax,%ebp,1),%eax + xorl %ebx,%edi + andl %edx,%edi + movl 40(%esi),%ebp + xorl %ecx,%edi + addl %edi,%eax + movl %ebx,%edi + roll $5,%eax + addl %ebx,%eax + # R1 21 + leal 38016083(%edx,%ebp,1),%edx + xorl %eax,%edi + andl %ecx,%edi + movl 60(%esi),%ebp + xorl %ebx,%edi + addl %edi,%edx + movl %eax,%edi + roll $9,%edx + addl %eax,%edx + # R1 22 + leal 3634488961(%ecx,%ebp,1),%ecx + xorl %edx,%edi + andl %ebx,%edi + movl 16(%esi),%ebp + xorl %eax,%edi + addl %edi,%ecx + movl %edx,%edi + roll $14,%ecx + addl %edx,%ecx + # R1 23 + leal 3889429448(%ebx,%ebp,1),%ebx + xorl %ecx,%edi + andl %eax,%edi + movl 36(%esi),%ebp + xorl %edx,%edi + addl %edi,%ebx + movl %ecx,%edi + roll $20,%ebx + addl %ecx,%ebx + # R1 24 + leal 568446438(%eax,%ebp,1),%eax + xorl %ebx,%edi + andl %edx,%edi + movl 56(%esi),%ebp + xorl %ecx,%edi + addl %edi,%eax + movl %ebx,%edi + roll $5,%eax + addl %ebx,%eax + # R1 25 + leal 3275163606(%edx,%ebp,1),%edx + xorl %eax,%edi + andl %ecx,%edi + movl 12(%esi),%ebp + xorl %ebx,%edi + addl %edi,%edx + movl %eax,%edi + roll $9,%edx + addl %eax,%edx + # R1 26 + leal 4107603335(%ecx,%ebp,1),%ecx + xorl %edx,%edi + andl %ebx,%edi + movl 32(%esi),%ebp + xorl %eax,%edi + addl %edi,%ecx + movl %edx,%edi + roll $14,%ecx + addl %edx,%ecx + # R1 27 + leal 1163531501(%ebx,%ebp,1),%ebx + xorl %ecx,%edi + andl %eax,%edi + movl 52(%esi),%ebp + xorl %edx,%edi + addl %edi,%ebx + movl %ecx,%edi + roll $20,%ebx + addl %ecx,%ebx + # R1 28 + leal 2850285829(%eax,%ebp,1),%eax + xorl %ebx,%edi + andl %edx,%edi + movl 8(%esi),%ebp + xorl %ecx,%edi + addl %edi,%eax + movl %ebx,%edi + roll $5,%eax + addl %ebx,%eax + # R1 29 + leal 4243563512(%edx,%ebp,1),%edx + xorl %eax,%edi + andl 
%ecx,%edi + movl 28(%esi),%ebp + xorl %ebx,%edi + addl %edi,%edx + movl %eax,%edi + roll $9,%edx + addl %eax,%edx + # R1 30 + leal 1735328473(%ecx,%ebp,1),%ecx + xorl %edx,%edi + andl %ebx,%edi + movl 48(%esi),%ebp + xorl %eax,%edi + addl %edi,%ecx + movl %edx,%edi + roll $14,%ecx + addl %edx,%ecx + # R1 31 + leal 2368359562(%ebx,%ebp,1),%ebx + xorl %ecx,%edi + andl %eax,%edi + movl 20(%esi),%ebp + xorl %edx,%edi + addl %edi,%ebx + movl %ecx,%edi + roll $20,%ebx + addl %ecx,%ebx + + # R2 section + # R2 32 + xorl %edx,%edi + xorl %ebx,%edi + leal 4294588738(%eax,%ebp,1),%eax + addl %edi,%eax + roll $4,%eax + movl 32(%esi),%ebp + movl %ebx,%edi + # R2 33 + leal 2272392833(%edx,%ebp,1),%edx + addl %ebx,%eax + xorl %ecx,%edi + xorl %eax,%edi + movl 44(%esi),%ebp + addl %edi,%edx + movl %eax,%edi + roll $11,%edx + addl %eax,%edx + # R2 34 + xorl %ebx,%edi + xorl %edx,%edi + leal 1839030562(%ecx,%ebp,1),%ecx + addl %edi,%ecx + roll $16,%ecx + movl 56(%esi),%ebp + movl %edx,%edi + # R2 35 + leal 4259657740(%ebx,%ebp,1),%ebx + addl %edx,%ecx + xorl %eax,%edi + xorl %ecx,%edi + movl 4(%esi),%ebp + addl %edi,%ebx + movl %ecx,%edi + roll $23,%ebx + addl %ecx,%ebx + # R2 36 + xorl %edx,%edi + xorl %ebx,%edi + leal 2763975236(%eax,%ebp,1),%eax + addl %edi,%eax + roll $4,%eax + movl 16(%esi),%ebp + movl %ebx,%edi + # R2 37 + leal 1272893353(%edx,%ebp,1),%edx + addl %ebx,%eax + xorl %ecx,%edi + xorl %eax,%edi + movl 28(%esi),%ebp + addl %edi,%edx + movl %eax,%edi + roll $11,%edx + addl %eax,%edx + # R2 38 + xorl %ebx,%edi + xorl %edx,%edi + leal 4139469664(%ecx,%ebp,1),%ecx + addl %edi,%ecx + roll $16,%ecx + movl 40(%esi),%ebp + movl %edx,%edi + # R2 39 + leal 3200236656(%ebx,%ebp,1),%ebx + addl %edx,%ecx + xorl %eax,%edi + xorl %ecx,%edi + movl 52(%esi),%ebp + addl %edi,%ebx + movl %ecx,%edi + roll $23,%ebx + addl %ecx,%ebx + # R2 40 + xorl %edx,%edi + xorl %ebx,%edi + leal 681279174(%eax,%ebp,1),%eax + addl %edi,%eax + roll $4,%eax + movl (%esi),%ebp + movl %ebx,%edi + # R2 41 
+ leal 3936430074(%edx,%ebp,1),%edx + addl %ebx,%eax + xorl %ecx,%edi + xorl %eax,%edi + movl 12(%esi),%ebp + addl %edi,%edx + movl %eax,%edi + roll $11,%edx + addl %eax,%edx + # R2 42 + xorl %ebx,%edi + xorl %edx,%edi + leal 3572445317(%ecx,%ebp,1),%ecx + addl %edi,%ecx + roll $16,%ecx + movl 24(%esi),%ebp + movl %edx,%edi + # R2 43 + leal 76029189(%ebx,%ebp,1),%ebx + addl %edx,%ecx + xorl %eax,%edi + xorl %ecx,%edi + movl 36(%esi),%ebp + addl %edi,%ebx + movl %ecx,%edi + roll $23,%ebx + addl %ecx,%ebx + # R2 44 + xorl %edx,%edi + xorl %ebx,%edi + leal 3654602809(%eax,%ebp,1),%eax + addl %edi,%eax + roll $4,%eax + movl 48(%esi),%ebp + movl %ebx,%edi + # R2 45 + leal 3873151461(%edx,%ebp,1),%edx + addl %ebx,%eax + xorl %ecx,%edi + xorl %eax,%edi + movl 60(%esi),%ebp + addl %edi,%edx + movl %eax,%edi + roll $11,%edx + addl %eax,%edx + # R2 46 + xorl %ebx,%edi + xorl %edx,%edi + leal 530742520(%ecx,%ebp,1),%ecx + addl %edi,%ecx + roll $16,%ecx + movl 8(%esi),%ebp + movl %edx,%edi + # R2 47 + leal 3299628645(%ebx,%ebp,1),%ebx + addl %edx,%ecx + xorl %eax,%edi + xorl %ecx,%edi + movl (%esi),%ebp + addl %edi,%ebx + movl $-1,%edi + roll $23,%ebx + addl %ecx,%ebx + + # R3 section + # R3 48 + xorl %edx,%edi + orl %ebx,%edi + leal 4096336452(%eax,%ebp,1),%eax + xorl %ecx,%edi + movl 28(%esi),%ebp + addl %edi,%eax + movl $-1,%edi + roll $6,%eax + xorl %ecx,%edi + addl %ebx,%eax + # R3 49 + orl %eax,%edi + leal 1126891415(%edx,%ebp,1),%edx + xorl %ebx,%edi + movl 56(%esi),%ebp + addl %edi,%edx + movl $-1,%edi + roll $10,%edx + xorl %ebx,%edi + addl %eax,%edx + # R3 50 + orl %edx,%edi + leal 2878612391(%ecx,%ebp,1),%ecx + xorl %eax,%edi + movl 20(%esi),%ebp + addl %edi,%ecx + movl $-1,%edi + roll $15,%ecx + xorl %eax,%edi + addl %edx,%ecx + # R3 51 + orl %ecx,%edi + leal 4237533241(%ebx,%ebp,1),%ebx + xorl %edx,%edi + movl 48(%esi),%ebp + addl %edi,%ebx + movl $-1,%edi + roll $21,%ebx + xorl %edx,%edi + addl %ecx,%ebx + # R3 52 + orl %ebx,%edi + leal 
1700485571(%eax,%ebp,1),%eax + xorl %ecx,%edi + movl 12(%esi),%ebp + addl %edi,%eax + movl $-1,%edi + roll $6,%eax + xorl %ecx,%edi + addl %ebx,%eax + # R3 53 + orl %eax,%edi + leal 2399980690(%edx,%ebp,1),%edx + xorl %ebx,%edi + movl 40(%esi),%ebp + addl %edi,%edx + movl $-1,%edi + roll $10,%edx + xorl %ebx,%edi + addl %eax,%edx + # R3 54 + orl %edx,%edi + leal 4293915773(%ecx,%ebp,1),%ecx + xorl %eax,%edi + movl 4(%esi),%ebp + addl %edi,%ecx + movl $-1,%edi + roll $15,%ecx + xorl %eax,%edi + addl %edx,%ecx + # R3 55 + orl %ecx,%edi + leal 2240044497(%ebx,%ebp,1),%ebx + xorl %edx,%edi + movl 32(%esi),%ebp + addl %edi,%ebx + movl $-1,%edi + roll $21,%ebx + xorl %edx,%edi + addl %ecx,%ebx + # R3 56 + orl %ebx,%edi + leal 1873313359(%eax,%ebp,1),%eax + xorl %ecx,%edi + movl 60(%esi),%ebp + addl %edi,%eax + movl $-1,%edi + roll $6,%eax + xorl %ecx,%edi + addl %ebx,%eax + # R3 57 + orl %eax,%edi + leal 4264355552(%edx,%ebp,1),%edx + xorl %ebx,%edi + movl 24(%esi),%ebp + addl %edi,%edx + movl $-1,%edi + roll $10,%edx + xorl %ebx,%edi + addl %eax,%edx + # R3 58 + orl %edx,%edi + leal 2734768916(%ecx,%ebp,1),%ecx + xorl %eax,%edi + movl 52(%esi),%ebp + addl %edi,%ecx + movl $-1,%edi + roll $15,%ecx + xorl %eax,%edi + addl %edx,%ecx + # R3 59 + orl %ecx,%edi + leal 1309151649(%ebx,%ebp,1),%ebx + xorl %edx,%edi + movl 16(%esi),%ebp + addl %edi,%ebx + movl $-1,%edi + roll $21,%ebx + xorl %edx,%edi + addl %ecx,%ebx + # R3 60 + orl %ebx,%edi + leal 4149444226(%eax,%ebp,1),%eax + xorl %ecx,%edi + movl 44(%esi),%ebp + addl %edi,%eax + movl $-1,%edi + roll $6,%eax + xorl %ecx,%edi + addl %ebx,%eax + # R3 61 + orl %eax,%edi + leal 3174756917(%edx,%ebp,1),%edx + xorl %ebx,%edi + movl 8(%esi),%ebp + addl %edi,%edx + movl $-1,%edi + roll $10,%edx + xorl %ebx,%edi + addl %eax,%edx + # R3 62 + orl %edx,%edi + leal 718787259(%ecx,%ebp,1),%ecx + xorl %eax,%edi + movl 36(%esi),%ebp + addl %edi,%ecx + movl $-1,%edi + roll $15,%ecx + xorl %eax,%edi + addl %edx,%ecx + # R3 63 + orl %ecx,%edi 
+ leal 3951481745(%ebx,%ebp,1),%ebx + xorl %edx,%edi + movl 24(%esp),%ebp + addl %edi,%ebx + addl $64,%esi + roll $21,%ebx + movl (%ebp),%edi + addl %ecx,%ebx + addl %edi,%eax + movl 4(%ebp),%edi + addl %edi,%ebx + movl 8(%ebp),%edi + addl %edi,%ecx + movl 12(%ebp),%edi + addl %edi,%edx + movl %eax,(%ebp) + movl %ebx,4(%ebp) + movl (%esp),%edi + movl %ecx,8(%ebp) + movl %edx,12(%ebp) + cmpl %esi,%edi + jae L000start + popl %eax + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) diff --git a/third_party/boringssl/gen/crypto/md5-586-linux.S b/third_party/boringssl/gen/crypto/md5-586-linux.S new file mode 100644 index 00000000..a297f2bc --- /dev/null +++ b/third_party/boringssl/gen/crypto/md5-586-linux.S @@ -0,0 +1,686 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) +.text +.globl md5_block_asm_data_order +.hidden md5_block_asm_data_order +.type md5_block_asm_data_order,@function +.align 16 +md5_block_asm_data_order: +.L_md5_block_asm_data_order_begin: + pushl %esi + pushl %edi + movl 12(%esp),%edi + movl 16(%esp),%esi + movl 20(%esp),%ecx + pushl %ebp + shll $6,%ecx + pushl %ebx + addl %esi,%ecx + subl $64,%ecx + movl (%edi),%eax + pushl %ecx + movl 4(%edi),%ebx + movl 8(%edi),%ecx + movl 12(%edi),%edx +.L000start: + + + movl %ecx,%edi + movl (%esi),%ebp + + xorl %edx,%edi + andl %ebx,%edi + leal 3614090360(%eax,%ebp,1),%eax + xorl %edx,%edi + addl %edi,%eax + movl %ebx,%edi + roll $7,%eax + movl 4(%esi),%ebp + addl %ebx,%eax + + xorl %ecx,%edi + andl %eax,%edi + leal 3905402710(%edx,%ebp,1),%edx + xorl %ecx,%edi + addl %edi,%edx + movl %eax,%edi + roll $12,%edx + movl 8(%esi),%ebp + addl %eax,%edx + + xorl %ebx,%edi + andl %edx,%edi + leal 606105819(%ecx,%ebp,1),%ecx + xorl %ebx,%edi + addl %edi,%ecx + movl %edx,%edi + roll 
$17,%ecx + movl 12(%esi),%ebp + addl %edx,%ecx + + xorl %eax,%edi + andl %ecx,%edi + leal 3250441966(%ebx,%ebp,1),%ebx + xorl %eax,%edi + addl %edi,%ebx + movl %ecx,%edi + roll $22,%ebx + movl 16(%esi),%ebp + addl %ecx,%ebx + + xorl %edx,%edi + andl %ebx,%edi + leal 4118548399(%eax,%ebp,1),%eax + xorl %edx,%edi + addl %edi,%eax + movl %ebx,%edi + roll $7,%eax + movl 20(%esi),%ebp + addl %ebx,%eax + + xorl %ecx,%edi + andl %eax,%edi + leal 1200080426(%edx,%ebp,1),%edx + xorl %ecx,%edi + addl %edi,%edx + movl %eax,%edi + roll $12,%edx + movl 24(%esi),%ebp + addl %eax,%edx + + xorl %ebx,%edi + andl %edx,%edi + leal 2821735955(%ecx,%ebp,1),%ecx + xorl %ebx,%edi + addl %edi,%ecx + movl %edx,%edi + roll $17,%ecx + movl 28(%esi),%ebp + addl %edx,%ecx + + xorl %eax,%edi + andl %ecx,%edi + leal 4249261313(%ebx,%ebp,1),%ebx + xorl %eax,%edi + addl %edi,%ebx + movl %ecx,%edi + roll $22,%ebx + movl 32(%esi),%ebp + addl %ecx,%ebx + + xorl %edx,%edi + andl %ebx,%edi + leal 1770035416(%eax,%ebp,1),%eax + xorl %edx,%edi + addl %edi,%eax + movl %ebx,%edi + roll $7,%eax + movl 36(%esi),%ebp + addl %ebx,%eax + + xorl %ecx,%edi + andl %eax,%edi + leal 2336552879(%edx,%ebp,1),%edx + xorl %ecx,%edi + addl %edi,%edx + movl %eax,%edi + roll $12,%edx + movl 40(%esi),%ebp + addl %eax,%edx + + xorl %ebx,%edi + andl %edx,%edi + leal 4294925233(%ecx,%ebp,1),%ecx + xorl %ebx,%edi + addl %edi,%ecx + movl %edx,%edi + roll $17,%ecx + movl 44(%esi),%ebp + addl %edx,%ecx + + xorl %eax,%edi + andl %ecx,%edi + leal 2304563134(%ebx,%ebp,1),%ebx + xorl %eax,%edi + addl %edi,%ebx + movl %ecx,%edi + roll $22,%ebx + movl 48(%esi),%ebp + addl %ecx,%ebx + + xorl %edx,%edi + andl %ebx,%edi + leal 1804603682(%eax,%ebp,1),%eax + xorl %edx,%edi + addl %edi,%eax + movl %ebx,%edi + roll $7,%eax + movl 52(%esi),%ebp + addl %ebx,%eax + + xorl %ecx,%edi + andl %eax,%edi + leal 4254626195(%edx,%ebp,1),%edx + xorl %ecx,%edi + addl %edi,%edx + movl %eax,%edi + roll $12,%edx + movl 56(%esi),%ebp + addl %eax,%edx + + xorl 
%ebx,%edi + andl %edx,%edi + leal 2792965006(%ecx,%ebp,1),%ecx + xorl %ebx,%edi + addl %edi,%ecx + movl %edx,%edi + roll $17,%ecx + movl 60(%esi),%ebp + addl %edx,%ecx + + xorl %eax,%edi + andl %ecx,%edi + leal 1236535329(%ebx,%ebp,1),%ebx + xorl %eax,%edi + addl %edi,%ebx + movl %ecx,%edi + roll $22,%ebx + movl 4(%esi),%ebp + addl %ecx,%ebx + + + + leal 4129170786(%eax,%ebp,1),%eax + xorl %ebx,%edi + andl %edx,%edi + movl 24(%esi),%ebp + xorl %ecx,%edi + addl %edi,%eax + movl %ebx,%edi + roll $5,%eax + addl %ebx,%eax + + leal 3225465664(%edx,%ebp,1),%edx + xorl %eax,%edi + andl %ecx,%edi + movl 44(%esi),%ebp + xorl %ebx,%edi + addl %edi,%edx + movl %eax,%edi + roll $9,%edx + addl %eax,%edx + + leal 643717713(%ecx,%ebp,1),%ecx + xorl %edx,%edi + andl %ebx,%edi + movl (%esi),%ebp + xorl %eax,%edi + addl %edi,%ecx + movl %edx,%edi + roll $14,%ecx + addl %edx,%ecx + + leal 3921069994(%ebx,%ebp,1),%ebx + xorl %ecx,%edi + andl %eax,%edi + movl 20(%esi),%ebp + xorl %edx,%edi + addl %edi,%ebx + movl %ecx,%edi + roll $20,%ebx + addl %ecx,%ebx + + leal 3593408605(%eax,%ebp,1),%eax + xorl %ebx,%edi + andl %edx,%edi + movl 40(%esi),%ebp + xorl %ecx,%edi + addl %edi,%eax + movl %ebx,%edi + roll $5,%eax + addl %ebx,%eax + + leal 38016083(%edx,%ebp,1),%edx + xorl %eax,%edi + andl %ecx,%edi + movl 60(%esi),%ebp + xorl %ebx,%edi + addl %edi,%edx + movl %eax,%edi + roll $9,%edx + addl %eax,%edx + + leal 3634488961(%ecx,%ebp,1),%ecx + xorl %edx,%edi + andl %ebx,%edi + movl 16(%esi),%ebp + xorl %eax,%edi + addl %edi,%ecx + movl %edx,%edi + roll $14,%ecx + addl %edx,%ecx + + leal 3889429448(%ebx,%ebp,1),%ebx + xorl %ecx,%edi + andl %eax,%edi + movl 36(%esi),%ebp + xorl %edx,%edi + addl %edi,%ebx + movl %ecx,%edi + roll $20,%ebx + addl %ecx,%ebx + + leal 568446438(%eax,%ebp,1),%eax + xorl %ebx,%edi + andl %edx,%edi + movl 56(%esi),%ebp + xorl %ecx,%edi + addl %edi,%eax + movl %ebx,%edi + roll $5,%eax + addl %ebx,%eax + + leal 3275163606(%edx,%ebp,1),%edx + xorl %eax,%edi + andl 
%ecx,%edi + movl 12(%esi),%ebp + xorl %ebx,%edi + addl %edi,%edx + movl %eax,%edi + roll $9,%edx + addl %eax,%edx + + leal 4107603335(%ecx,%ebp,1),%ecx + xorl %edx,%edi + andl %ebx,%edi + movl 32(%esi),%ebp + xorl %eax,%edi + addl %edi,%ecx + movl %edx,%edi + roll $14,%ecx + addl %edx,%ecx + + leal 1163531501(%ebx,%ebp,1),%ebx + xorl %ecx,%edi + andl %eax,%edi + movl 52(%esi),%ebp + xorl %edx,%edi + addl %edi,%ebx + movl %ecx,%edi + roll $20,%ebx + addl %ecx,%ebx + + leal 2850285829(%eax,%ebp,1),%eax + xorl %ebx,%edi + andl %edx,%edi + movl 8(%esi),%ebp + xorl %ecx,%edi + addl %edi,%eax + movl %ebx,%edi + roll $5,%eax + addl %ebx,%eax + + leal 4243563512(%edx,%ebp,1),%edx + xorl %eax,%edi + andl %ecx,%edi + movl 28(%esi),%ebp + xorl %ebx,%edi + addl %edi,%edx + movl %eax,%edi + roll $9,%edx + addl %eax,%edx + + leal 1735328473(%ecx,%ebp,1),%ecx + xorl %edx,%edi + andl %ebx,%edi + movl 48(%esi),%ebp + xorl %eax,%edi + addl %edi,%ecx + movl %edx,%edi + roll $14,%ecx + addl %edx,%ecx + + leal 2368359562(%ebx,%ebp,1),%ebx + xorl %ecx,%edi + andl %eax,%edi + movl 20(%esi),%ebp + xorl %edx,%edi + addl %edi,%ebx + movl %ecx,%edi + roll $20,%ebx + addl %ecx,%ebx + + + + xorl %edx,%edi + xorl %ebx,%edi + leal 4294588738(%eax,%ebp,1),%eax + addl %edi,%eax + roll $4,%eax + movl 32(%esi),%ebp + movl %ebx,%edi + + leal 2272392833(%edx,%ebp,1),%edx + addl %ebx,%eax + xorl %ecx,%edi + xorl %eax,%edi + movl 44(%esi),%ebp + addl %edi,%edx + movl %eax,%edi + roll $11,%edx + addl %eax,%edx + + xorl %ebx,%edi + xorl %edx,%edi + leal 1839030562(%ecx,%ebp,1),%ecx + addl %edi,%ecx + roll $16,%ecx + movl 56(%esi),%ebp + movl %edx,%edi + + leal 4259657740(%ebx,%ebp,1),%ebx + addl %edx,%ecx + xorl %eax,%edi + xorl %ecx,%edi + movl 4(%esi),%ebp + addl %edi,%ebx + movl %ecx,%edi + roll $23,%ebx + addl %ecx,%ebx + + xorl %edx,%edi + xorl %ebx,%edi + leal 2763975236(%eax,%ebp,1),%eax + addl %edi,%eax + roll $4,%eax + movl 16(%esi),%ebp + movl %ebx,%edi + + leal 1272893353(%edx,%ebp,1),%edx + 
addl %ebx,%eax + xorl %ecx,%edi + xorl %eax,%edi + movl 28(%esi),%ebp + addl %edi,%edx + movl %eax,%edi + roll $11,%edx + addl %eax,%edx + + xorl %ebx,%edi + xorl %edx,%edi + leal 4139469664(%ecx,%ebp,1),%ecx + addl %edi,%ecx + roll $16,%ecx + movl 40(%esi),%ebp + movl %edx,%edi + + leal 3200236656(%ebx,%ebp,1),%ebx + addl %edx,%ecx + xorl %eax,%edi + xorl %ecx,%edi + movl 52(%esi),%ebp + addl %edi,%ebx + movl %ecx,%edi + roll $23,%ebx + addl %ecx,%ebx + + xorl %edx,%edi + xorl %ebx,%edi + leal 681279174(%eax,%ebp,1),%eax + addl %edi,%eax + roll $4,%eax + movl (%esi),%ebp + movl %ebx,%edi + + leal 3936430074(%edx,%ebp,1),%edx + addl %ebx,%eax + xorl %ecx,%edi + xorl %eax,%edi + movl 12(%esi),%ebp + addl %edi,%edx + movl %eax,%edi + roll $11,%edx + addl %eax,%edx + + xorl %ebx,%edi + xorl %edx,%edi + leal 3572445317(%ecx,%ebp,1),%ecx + addl %edi,%ecx + roll $16,%ecx + movl 24(%esi),%ebp + movl %edx,%edi + + leal 76029189(%ebx,%ebp,1),%ebx + addl %edx,%ecx + xorl %eax,%edi + xorl %ecx,%edi + movl 36(%esi),%ebp + addl %edi,%ebx + movl %ecx,%edi + roll $23,%ebx + addl %ecx,%ebx + + xorl %edx,%edi + xorl %ebx,%edi + leal 3654602809(%eax,%ebp,1),%eax + addl %edi,%eax + roll $4,%eax + movl 48(%esi),%ebp + movl %ebx,%edi + + leal 3873151461(%edx,%ebp,1),%edx + addl %ebx,%eax + xorl %ecx,%edi + xorl %eax,%edi + movl 60(%esi),%ebp + addl %edi,%edx + movl %eax,%edi + roll $11,%edx + addl %eax,%edx + + xorl %ebx,%edi + xorl %edx,%edi + leal 530742520(%ecx,%ebp,1),%ecx + addl %edi,%ecx + roll $16,%ecx + movl 8(%esi),%ebp + movl %edx,%edi + + leal 3299628645(%ebx,%ebp,1),%ebx + addl %edx,%ecx + xorl %eax,%edi + xorl %ecx,%edi + movl (%esi),%ebp + addl %edi,%ebx + movl $-1,%edi + roll $23,%ebx + addl %ecx,%ebx + + + + xorl %edx,%edi + orl %ebx,%edi + leal 4096336452(%eax,%ebp,1),%eax + xorl %ecx,%edi + movl 28(%esi),%ebp + addl %edi,%eax + movl $-1,%edi + roll $6,%eax + xorl %ecx,%edi + addl %ebx,%eax + + orl %eax,%edi + leal 1126891415(%edx,%ebp,1),%edx + xorl %ebx,%edi + movl 
56(%esi),%ebp + addl %edi,%edx + movl $-1,%edi + roll $10,%edx + xorl %ebx,%edi + addl %eax,%edx + + orl %edx,%edi + leal 2878612391(%ecx,%ebp,1),%ecx + xorl %eax,%edi + movl 20(%esi),%ebp + addl %edi,%ecx + movl $-1,%edi + roll $15,%ecx + xorl %eax,%edi + addl %edx,%ecx + + orl %ecx,%edi + leal 4237533241(%ebx,%ebp,1),%ebx + xorl %edx,%edi + movl 48(%esi),%ebp + addl %edi,%ebx + movl $-1,%edi + roll $21,%ebx + xorl %edx,%edi + addl %ecx,%ebx + + orl %ebx,%edi + leal 1700485571(%eax,%ebp,1),%eax + xorl %ecx,%edi + movl 12(%esi),%ebp + addl %edi,%eax + movl $-1,%edi + roll $6,%eax + xorl %ecx,%edi + addl %ebx,%eax + + orl %eax,%edi + leal 2399980690(%edx,%ebp,1),%edx + xorl %ebx,%edi + movl 40(%esi),%ebp + addl %edi,%edx + movl $-1,%edi + roll $10,%edx + xorl %ebx,%edi + addl %eax,%edx + + orl %edx,%edi + leal 4293915773(%ecx,%ebp,1),%ecx + xorl %eax,%edi + movl 4(%esi),%ebp + addl %edi,%ecx + movl $-1,%edi + roll $15,%ecx + xorl %eax,%edi + addl %edx,%ecx + + orl %ecx,%edi + leal 2240044497(%ebx,%ebp,1),%ebx + xorl %edx,%edi + movl 32(%esi),%ebp + addl %edi,%ebx + movl $-1,%edi + roll $21,%ebx + xorl %edx,%edi + addl %ecx,%ebx + + orl %ebx,%edi + leal 1873313359(%eax,%ebp,1),%eax + xorl %ecx,%edi + movl 60(%esi),%ebp + addl %edi,%eax + movl $-1,%edi + roll $6,%eax + xorl %ecx,%edi + addl %ebx,%eax + + orl %eax,%edi + leal 4264355552(%edx,%ebp,1),%edx + xorl %ebx,%edi + movl 24(%esi),%ebp + addl %edi,%edx + movl $-1,%edi + roll $10,%edx + xorl %ebx,%edi + addl %eax,%edx + + orl %edx,%edi + leal 2734768916(%ecx,%ebp,1),%ecx + xorl %eax,%edi + movl 52(%esi),%ebp + addl %edi,%ecx + movl $-1,%edi + roll $15,%ecx + xorl %eax,%edi + addl %edx,%ecx + + orl %ecx,%edi + leal 1309151649(%ebx,%ebp,1),%ebx + xorl %edx,%edi + movl 16(%esi),%ebp + addl %edi,%ebx + movl $-1,%edi + roll $21,%ebx + xorl %edx,%edi + addl %ecx,%ebx + + orl %ebx,%edi + leal 4149444226(%eax,%ebp,1),%eax + xorl %ecx,%edi + movl 44(%esi),%ebp + addl %edi,%eax + movl $-1,%edi + roll $6,%eax + xorl 
%ecx,%edi + addl %ebx,%eax + + orl %eax,%edi + leal 3174756917(%edx,%ebp,1),%edx + xorl %ebx,%edi + movl 8(%esi),%ebp + addl %edi,%edx + movl $-1,%edi + roll $10,%edx + xorl %ebx,%edi + addl %eax,%edx + + orl %edx,%edi + leal 718787259(%ecx,%ebp,1),%ecx + xorl %eax,%edi + movl 36(%esi),%ebp + addl %edi,%ecx + movl $-1,%edi + roll $15,%ecx + xorl %eax,%edi + addl %edx,%ecx + + orl %ecx,%edi + leal 3951481745(%ebx,%ebp,1),%ebx + xorl %edx,%edi + movl 24(%esp),%ebp + addl %edi,%ebx + addl $64,%esi + roll $21,%ebx + movl (%ebp),%edi + addl %ecx,%ebx + addl %edi,%eax + movl 4(%ebp),%edi + addl %edi,%ebx + movl 8(%ebp),%edi + addl %edi,%ecx + movl 12(%ebp),%edi + addl %edi,%edx + movl %eax,(%ebp) + movl %ebx,4(%ebp) + movl (%esp),%edi + movl %ecx,8(%ebp) + movl %edx,12(%ebp) + cmpl %esi,%edi + jae .L000start + popl %eax + popl %ebx + popl %ebp + popl %edi + popl %esi + ret +.size md5_block_asm_data_order,.-.L_md5_block_asm_data_order_begin +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) diff --git a/third_party/boringssl/gen/crypto/md5-586-win.asm b/third_party/boringssl/gen/crypto/md5-586-win.asm new file mode 100644 index 00000000..c160d78e --- /dev/null +++ b/third_party/boringssl/gen/crypto/md5-586-win.asm @@ -0,0 +1,694 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_internal_x86_win_asm.inc" +%endif +%ifidn __OUTPUT_FORMAT__, win32 +%ifidn __OUTPUT_FORMAT__,obj +section code use32 class=code align=64 +%elifidn __OUTPUT_FORMAT__,win32 +$@feat.00 equ 1 +section .text code align=64 +%else +section .text code +%endif +global _md5_block_asm_data_order +align 16 +_md5_block_asm_data_order: +L$_md5_block_asm_data_order_begin: + push esi + push edi + mov edi,DWORD [12+esp] + mov esi,DWORD [16+esp] + mov ecx,DWORD [20+esp] + push ebp + shl ecx,6 + push ebx + add ecx,esi + sub ecx,64 + mov eax,DWORD [edi] + push ecx + mov ebx,DWORD [4+edi] + mov ecx,DWORD [8+edi] + mov edx,DWORD [12+edi] +L$000start: + ; + ; R0 section + mov edi,ecx + mov ebp,DWORD [esi] + ; R0 0 + xor edi,edx + and edi,ebx + lea eax,[3614090360+ebp*1+eax] + xor edi,edx + add eax,edi + mov edi,ebx + rol eax,7 + mov ebp,DWORD [4+esi] + add eax,ebx + ; R0 1 + xor edi,ecx + and edi,eax + lea edx,[3905402710+ebp*1+edx] + xor edi,ecx + add edx,edi + mov edi,eax + rol edx,12 + mov ebp,DWORD [8+esi] + add edx,eax + ; R0 2 + xor edi,ebx + and edi,edx + lea ecx,[606105819+ebp*1+ecx] + xor edi,ebx + add ecx,edi + mov edi,edx + rol ecx,17 + mov ebp,DWORD [12+esi] + add ecx,edx + ; R0 3 + xor edi,eax + and edi,ecx + lea ebx,[3250441966+ebp*1+ebx] + xor edi,eax + add ebx,edi + mov edi,ecx + rol ebx,22 + mov ebp,DWORD [16+esi] + add ebx,ecx + ; R0 4 + xor edi,edx + and edi,ebx + lea eax,[4118548399+ebp*1+eax] + xor edi,edx + add eax,edi + mov edi,ebx + rol eax,7 + mov ebp,DWORD [20+esi] + add eax,ebx + ; R0 5 + xor edi,ecx + and edi,eax + lea edx,[1200080426+ebp*1+edx] + xor edi,ecx + add edx,edi + mov edi,eax + rol edx,12 + mov ebp,DWORD [24+esi] + add edx,eax + ; R0 6 + xor edi,ebx + and edi,edx + lea ecx,[2821735955+ebp*1+ecx] + xor edi,ebx + add ecx,edi + mov edi,edx + rol ecx,17 + mov ebp,DWORD [28+esi] + add ecx,edx + ; R0 7 + xor edi,eax + and edi,ecx + lea ebx,[4249261313+ebp*1+ebx] + xor edi,eax + add ebx,edi + 
mov edi,ecx + rol ebx,22 + mov ebp,DWORD [32+esi] + add ebx,ecx + ; R0 8 + xor edi,edx + and edi,ebx + lea eax,[1770035416+ebp*1+eax] + xor edi,edx + add eax,edi + mov edi,ebx + rol eax,7 + mov ebp,DWORD [36+esi] + add eax,ebx + ; R0 9 + xor edi,ecx + and edi,eax + lea edx,[2336552879+ebp*1+edx] + xor edi,ecx + add edx,edi + mov edi,eax + rol edx,12 + mov ebp,DWORD [40+esi] + add edx,eax + ; R0 10 + xor edi,ebx + and edi,edx + lea ecx,[4294925233+ebp*1+ecx] + xor edi,ebx + add ecx,edi + mov edi,edx + rol ecx,17 + mov ebp,DWORD [44+esi] + add ecx,edx + ; R0 11 + xor edi,eax + and edi,ecx + lea ebx,[2304563134+ebp*1+ebx] + xor edi,eax + add ebx,edi + mov edi,ecx + rol ebx,22 + mov ebp,DWORD [48+esi] + add ebx,ecx + ; R0 12 + xor edi,edx + and edi,ebx + lea eax,[1804603682+ebp*1+eax] + xor edi,edx + add eax,edi + mov edi,ebx + rol eax,7 + mov ebp,DWORD [52+esi] + add eax,ebx + ; R0 13 + xor edi,ecx + and edi,eax + lea edx,[4254626195+ebp*1+edx] + xor edi,ecx + add edx,edi + mov edi,eax + rol edx,12 + mov ebp,DWORD [56+esi] + add edx,eax + ; R0 14 + xor edi,ebx + and edi,edx + lea ecx,[2792965006+ebp*1+ecx] + xor edi,ebx + add ecx,edi + mov edi,edx + rol ecx,17 + mov ebp,DWORD [60+esi] + add ecx,edx + ; R0 15 + xor edi,eax + and edi,ecx + lea ebx,[1236535329+ebp*1+ebx] + xor edi,eax + add ebx,edi + mov edi,ecx + rol ebx,22 + mov ebp,DWORD [4+esi] + add ebx,ecx + ; + ; R1 section + ; R1 16 + lea eax,[4129170786+ebp*1+eax] + xor edi,ebx + and edi,edx + mov ebp,DWORD [24+esi] + xor edi,ecx + add eax,edi + mov edi,ebx + rol eax,5 + add eax,ebx + ; R1 17 + lea edx,[3225465664+ebp*1+edx] + xor edi,eax + and edi,ecx + mov ebp,DWORD [44+esi] + xor edi,ebx + add edx,edi + mov edi,eax + rol edx,9 + add edx,eax + ; R1 18 + lea ecx,[643717713+ebp*1+ecx] + xor edi,edx + and edi,ebx + mov ebp,DWORD [esi] + xor edi,eax + add ecx,edi + mov edi,edx + rol ecx,14 + add ecx,edx + ; R1 19 + lea ebx,[3921069994+ebp*1+ebx] + xor edi,ecx + and edi,eax + mov ebp,DWORD [20+esi] + xor edi,edx + 
add ebx,edi + mov edi,ecx + rol ebx,20 + add ebx,ecx + ; R1 20 + lea eax,[3593408605+ebp*1+eax] + xor edi,ebx + and edi,edx + mov ebp,DWORD [40+esi] + xor edi,ecx + add eax,edi + mov edi,ebx + rol eax,5 + add eax,ebx + ; R1 21 + lea edx,[38016083+ebp*1+edx] + xor edi,eax + and edi,ecx + mov ebp,DWORD [60+esi] + xor edi,ebx + add edx,edi + mov edi,eax + rol edx,9 + add edx,eax + ; R1 22 + lea ecx,[3634488961+ebp*1+ecx] + xor edi,edx + and edi,ebx + mov ebp,DWORD [16+esi] + xor edi,eax + add ecx,edi + mov edi,edx + rol ecx,14 + add ecx,edx + ; R1 23 + lea ebx,[3889429448+ebp*1+ebx] + xor edi,ecx + and edi,eax + mov ebp,DWORD [36+esi] + xor edi,edx + add ebx,edi + mov edi,ecx + rol ebx,20 + add ebx,ecx + ; R1 24 + lea eax,[568446438+ebp*1+eax] + xor edi,ebx + and edi,edx + mov ebp,DWORD [56+esi] + xor edi,ecx + add eax,edi + mov edi,ebx + rol eax,5 + add eax,ebx + ; R1 25 + lea edx,[3275163606+ebp*1+edx] + xor edi,eax + and edi,ecx + mov ebp,DWORD [12+esi] + xor edi,ebx + add edx,edi + mov edi,eax + rol edx,9 + add edx,eax + ; R1 26 + lea ecx,[4107603335+ebp*1+ecx] + xor edi,edx + and edi,ebx + mov ebp,DWORD [32+esi] + xor edi,eax + add ecx,edi + mov edi,edx + rol ecx,14 + add ecx,edx + ; R1 27 + lea ebx,[1163531501+ebp*1+ebx] + xor edi,ecx + and edi,eax + mov ebp,DWORD [52+esi] + xor edi,edx + add ebx,edi + mov edi,ecx + rol ebx,20 + add ebx,ecx + ; R1 28 + lea eax,[2850285829+ebp*1+eax] + xor edi,ebx + and edi,edx + mov ebp,DWORD [8+esi] + xor edi,ecx + add eax,edi + mov edi,ebx + rol eax,5 + add eax,ebx + ; R1 29 + lea edx,[4243563512+ebp*1+edx] + xor edi,eax + and edi,ecx + mov ebp,DWORD [28+esi] + xor edi,ebx + add edx,edi + mov edi,eax + rol edx,9 + add edx,eax + ; R1 30 + lea ecx,[1735328473+ebp*1+ecx] + xor edi,edx + and edi,ebx + mov ebp,DWORD [48+esi] + xor edi,eax + add ecx,edi + mov edi,edx + rol ecx,14 + add ecx,edx + ; R1 31 + lea ebx,[2368359562+ebp*1+ebx] + xor edi,ecx + and edi,eax + mov ebp,DWORD [20+esi] + xor edi,edx + add ebx,edi + mov edi,ecx + 
rol ebx,20 + add ebx,ecx + ; + ; R2 section + ; R2 32 + xor edi,edx + xor edi,ebx + lea eax,[4294588738+ebp*1+eax] + add eax,edi + rol eax,4 + mov ebp,DWORD [32+esi] + mov edi,ebx + ; R2 33 + lea edx,[2272392833+ebp*1+edx] + add eax,ebx + xor edi,ecx + xor edi,eax + mov ebp,DWORD [44+esi] + add edx,edi + mov edi,eax + rol edx,11 + add edx,eax + ; R2 34 + xor edi,ebx + xor edi,edx + lea ecx,[1839030562+ebp*1+ecx] + add ecx,edi + rol ecx,16 + mov ebp,DWORD [56+esi] + mov edi,edx + ; R2 35 + lea ebx,[4259657740+ebp*1+ebx] + add ecx,edx + xor edi,eax + xor edi,ecx + mov ebp,DWORD [4+esi] + add ebx,edi + mov edi,ecx + rol ebx,23 + add ebx,ecx + ; R2 36 + xor edi,edx + xor edi,ebx + lea eax,[2763975236+ebp*1+eax] + add eax,edi + rol eax,4 + mov ebp,DWORD [16+esi] + mov edi,ebx + ; R2 37 + lea edx,[1272893353+ebp*1+edx] + add eax,ebx + xor edi,ecx + xor edi,eax + mov ebp,DWORD [28+esi] + add edx,edi + mov edi,eax + rol edx,11 + add edx,eax + ; R2 38 + xor edi,ebx + xor edi,edx + lea ecx,[4139469664+ebp*1+ecx] + add ecx,edi + rol ecx,16 + mov ebp,DWORD [40+esi] + mov edi,edx + ; R2 39 + lea ebx,[3200236656+ebp*1+ebx] + add ecx,edx + xor edi,eax + xor edi,ecx + mov ebp,DWORD [52+esi] + add ebx,edi + mov edi,ecx + rol ebx,23 + add ebx,ecx + ; R2 40 + xor edi,edx + xor edi,ebx + lea eax,[681279174+ebp*1+eax] + add eax,edi + rol eax,4 + mov ebp,DWORD [esi] + mov edi,ebx + ; R2 41 + lea edx,[3936430074+ebp*1+edx] + add eax,ebx + xor edi,ecx + xor edi,eax + mov ebp,DWORD [12+esi] + add edx,edi + mov edi,eax + rol edx,11 + add edx,eax + ; R2 42 + xor edi,ebx + xor edi,edx + lea ecx,[3572445317+ebp*1+ecx] + add ecx,edi + rol ecx,16 + mov ebp,DWORD [24+esi] + mov edi,edx + ; R2 43 + lea ebx,[76029189+ebp*1+ebx] + add ecx,edx + xor edi,eax + xor edi,ecx + mov ebp,DWORD [36+esi] + add ebx,edi + mov edi,ecx + rol ebx,23 + add ebx,ecx + ; R2 44 + xor edi,edx + xor edi,ebx + lea eax,[3654602809+ebp*1+eax] + add eax,edi + rol eax,4 + mov ebp,DWORD [48+esi] + mov edi,ebx + ; R2 45 + lea 
edx,[3873151461+ebp*1+edx] + add eax,ebx + xor edi,ecx + xor edi,eax + mov ebp,DWORD [60+esi] + add edx,edi + mov edi,eax + rol edx,11 + add edx,eax + ; R2 46 + xor edi,ebx + xor edi,edx + lea ecx,[530742520+ebp*1+ecx] + add ecx,edi + rol ecx,16 + mov ebp,DWORD [8+esi] + mov edi,edx + ; R2 47 + lea ebx,[3299628645+ebp*1+ebx] + add ecx,edx + xor edi,eax + xor edi,ecx + mov ebp,DWORD [esi] + add ebx,edi + mov edi,-1 + rol ebx,23 + add ebx,ecx + ; + ; R3 section + ; R3 48 + xor edi,edx + or edi,ebx + lea eax,[4096336452+ebp*1+eax] + xor edi,ecx + mov ebp,DWORD [28+esi] + add eax,edi + mov edi,-1 + rol eax,6 + xor edi,ecx + add eax,ebx + ; R3 49 + or edi,eax + lea edx,[1126891415+ebp*1+edx] + xor edi,ebx + mov ebp,DWORD [56+esi] + add edx,edi + mov edi,-1 + rol edx,10 + xor edi,ebx + add edx,eax + ; R3 50 + or edi,edx + lea ecx,[2878612391+ebp*1+ecx] + xor edi,eax + mov ebp,DWORD [20+esi] + add ecx,edi + mov edi,-1 + rol ecx,15 + xor edi,eax + add ecx,edx + ; R3 51 + or edi,ecx + lea ebx,[4237533241+ebp*1+ebx] + xor edi,edx + mov ebp,DWORD [48+esi] + add ebx,edi + mov edi,-1 + rol ebx,21 + xor edi,edx + add ebx,ecx + ; R3 52 + or edi,ebx + lea eax,[1700485571+ebp*1+eax] + xor edi,ecx + mov ebp,DWORD [12+esi] + add eax,edi + mov edi,-1 + rol eax,6 + xor edi,ecx + add eax,ebx + ; R3 53 + or edi,eax + lea edx,[2399980690+ebp*1+edx] + xor edi,ebx + mov ebp,DWORD [40+esi] + add edx,edi + mov edi,-1 + rol edx,10 + xor edi,ebx + add edx,eax + ; R3 54 + or edi,edx + lea ecx,[4293915773+ebp*1+ecx] + xor edi,eax + mov ebp,DWORD [4+esi] + add ecx,edi + mov edi,-1 + rol ecx,15 + xor edi,eax + add ecx,edx + ; R3 55 + or edi,ecx + lea ebx,[2240044497+ebp*1+ebx] + xor edi,edx + mov ebp,DWORD [32+esi] + add ebx,edi + mov edi,-1 + rol ebx,21 + xor edi,edx + add ebx,ecx + ; R3 56 + or edi,ebx + lea eax,[1873313359+ebp*1+eax] + xor edi,ecx + mov ebp,DWORD [60+esi] + add eax,edi + mov edi,-1 + rol eax,6 + xor edi,ecx + add eax,ebx + ; R3 57 + or edi,eax + lea edx,[4264355552+ebp*1+edx] + 
xor edi,ebx + mov ebp,DWORD [24+esi] + add edx,edi + mov edi,-1 + rol edx,10 + xor edi,ebx + add edx,eax + ; R3 58 + or edi,edx + lea ecx,[2734768916+ebp*1+ecx] + xor edi,eax + mov ebp,DWORD [52+esi] + add ecx,edi + mov edi,-1 + rol ecx,15 + xor edi,eax + add ecx,edx + ; R3 59 + or edi,ecx + lea ebx,[1309151649+ebp*1+ebx] + xor edi,edx + mov ebp,DWORD [16+esi] + add ebx,edi + mov edi,-1 + rol ebx,21 + xor edi,edx + add ebx,ecx + ; R3 60 + or edi,ebx + lea eax,[4149444226+ebp*1+eax] + xor edi,ecx + mov ebp,DWORD [44+esi] + add eax,edi + mov edi,-1 + rol eax,6 + xor edi,ecx + add eax,ebx + ; R3 61 + or edi,eax + lea edx,[3174756917+ebp*1+edx] + xor edi,ebx + mov ebp,DWORD [8+esi] + add edx,edi + mov edi,-1 + rol edx,10 + xor edi,ebx + add edx,eax + ; R3 62 + or edi,edx + lea ecx,[718787259+ebp*1+ecx] + xor edi,eax + mov ebp,DWORD [36+esi] + add ecx,edi + mov edi,-1 + rol ecx,15 + xor edi,eax + add ecx,edx + ; R3 63 + or edi,ecx + lea ebx,[3951481745+ebp*1+ebx] + xor edi,edx + mov ebp,DWORD [24+esp] + add ebx,edi + add esi,64 + rol ebx,21 + mov edi,DWORD [ebp] + add ebx,ecx + add eax,edi + mov edi,DWORD [4+ebp] + add ebx,edi + mov edi,DWORD [8+ebp] + add ecx,edi + mov edi,DWORD [12+ebp] + add edx,edi + mov DWORD [ebp],eax + mov DWORD [4+ebp],ebx + mov edi,DWORD [esp] + mov DWORD [8+ebp],ecx + mov DWORD [12+ebp],edx + cmp edi,esi + jae NEAR L$000start + pop eax + pop ebx + pop ebp + pop edi + pop esi + ret +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/third_party/boringssl/gen/crypto/md5-x86_64-apple.S b/third_party/boringssl/gen/crypto/md5-x86_64-apple.S new file mode 100644 index 00000000..e4c02415 --- /dev/null +++ b/third_party/boringssl/gen/crypto/md5-x86_64-apple.S @@ -0,0 +1,690 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.text +.p2align 4 + +.globl _md5_block_asm_data_order +.private_extern _md5_block_asm_data_order + +_md5_block_asm_data_order: + +_CET_ENDBR + pushq %rbp + + pushq %rbx + + pushq %r12 + + pushq %r14 + + pushq %r15 + +L$prologue: + + + + + movq %rdi,%rbp + shlq $6,%rdx + leaq (%rsi,%rdx,1),%rdi + movl 0(%rbp),%eax + movl 4(%rbp),%ebx + movl 8(%rbp),%ecx + movl 12(%rbp),%edx + + + + + + + + cmpq %rdi,%rsi + je L$end + + +L$loop: + movl %eax,%r8d + movl %ebx,%r9d + movl %ecx,%r14d + movl %edx,%r15d + movl 0(%rsi),%r10d + movl %edx,%r11d + xorl %ecx,%r11d + leal -680876936(%rax,%r10,1),%eax + andl %ebx,%r11d + xorl %edx,%r11d + movl 4(%rsi),%r10d + addl %r11d,%eax + roll $7,%eax + movl %ecx,%r11d + addl %ebx,%eax + xorl %ebx,%r11d + leal -389564586(%rdx,%r10,1),%edx + andl %eax,%r11d + xorl %ecx,%r11d + movl 8(%rsi),%r10d + addl %r11d,%edx + roll $12,%edx + movl %ebx,%r11d + addl %eax,%edx + xorl %eax,%r11d + leal 606105819(%rcx,%r10,1),%ecx + andl %edx,%r11d + xorl %ebx,%r11d + movl 12(%rsi),%r10d + addl %r11d,%ecx + roll $17,%ecx + movl %eax,%r11d + addl %edx,%ecx + xorl %edx,%r11d + leal -1044525330(%rbx,%r10,1),%ebx + andl %ecx,%r11d + xorl %eax,%r11d + movl 16(%rsi),%r10d + addl %r11d,%ebx + roll $22,%ebx + movl %edx,%r11d + addl %ecx,%ebx + xorl %ecx,%r11d + leal -176418897(%rax,%r10,1),%eax + andl %ebx,%r11d + xorl %edx,%r11d + movl 20(%rsi),%r10d + addl %r11d,%eax + roll $7,%eax + movl %ecx,%r11d + addl %ebx,%eax + xorl %ebx,%r11d + leal 1200080426(%rdx,%r10,1),%edx + andl %eax,%r11d + xorl %ecx,%r11d + movl 24(%rsi),%r10d + addl %r11d,%edx + roll $12,%edx + movl %ebx,%r11d + addl %eax,%edx + xorl %eax,%r11d + leal -1473231341(%rcx,%r10,1),%ecx + andl %edx,%r11d + xorl %ebx,%r11d + movl 28(%rsi),%r10d + addl %r11d,%ecx + roll $17,%ecx + movl %eax,%r11d + addl %edx,%ecx + xorl %edx,%r11d + leal -45705983(%rbx,%r10,1),%ebx + andl %ecx,%r11d + xorl %eax,%r11d + movl 
32(%rsi),%r10d + addl %r11d,%ebx + roll $22,%ebx + movl %edx,%r11d + addl %ecx,%ebx + xorl %ecx,%r11d + leal 1770035416(%rax,%r10,1),%eax + andl %ebx,%r11d + xorl %edx,%r11d + movl 36(%rsi),%r10d + addl %r11d,%eax + roll $7,%eax + movl %ecx,%r11d + addl %ebx,%eax + xorl %ebx,%r11d + leal -1958414417(%rdx,%r10,1),%edx + andl %eax,%r11d + xorl %ecx,%r11d + movl 40(%rsi),%r10d + addl %r11d,%edx + roll $12,%edx + movl %ebx,%r11d + addl %eax,%edx + xorl %eax,%r11d + leal -42063(%rcx,%r10,1),%ecx + andl %edx,%r11d + xorl %ebx,%r11d + movl 44(%rsi),%r10d + addl %r11d,%ecx + roll $17,%ecx + movl %eax,%r11d + addl %edx,%ecx + xorl %edx,%r11d + leal -1990404162(%rbx,%r10,1),%ebx + andl %ecx,%r11d + xorl %eax,%r11d + movl 48(%rsi),%r10d + addl %r11d,%ebx + roll $22,%ebx + movl %edx,%r11d + addl %ecx,%ebx + xorl %ecx,%r11d + leal 1804603682(%rax,%r10,1),%eax + andl %ebx,%r11d + xorl %edx,%r11d + movl 52(%rsi),%r10d + addl %r11d,%eax + roll $7,%eax + movl %ecx,%r11d + addl %ebx,%eax + xorl %ebx,%r11d + leal -40341101(%rdx,%r10,1),%edx + andl %eax,%r11d + xorl %ecx,%r11d + movl 56(%rsi),%r10d + addl %r11d,%edx + roll $12,%edx + movl %ebx,%r11d + addl %eax,%edx + xorl %eax,%r11d + leal -1502002290(%rcx,%r10,1),%ecx + andl %edx,%r11d + xorl %ebx,%r11d + movl 60(%rsi),%r10d + addl %r11d,%ecx + roll $17,%ecx + movl %eax,%r11d + addl %edx,%ecx + xorl %edx,%r11d + leal 1236535329(%rbx,%r10,1),%ebx + andl %ecx,%r11d + xorl %eax,%r11d + movl 0(%rsi),%r10d + addl %r11d,%ebx + roll $22,%ebx + movl %edx,%r11d + addl %ecx,%ebx + movl 4(%rsi),%r10d + movl %edx,%r11d + movl %edx,%r12d + notl %r11d + leal -165796510(%rax,%r10,1),%eax + andl %ebx,%r12d + andl %ecx,%r11d + movl 24(%rsi),%r10d + orl %r11d,%r12d + movl %ecx,%r11d + addl %r12d,%eax + movl %ecx,%r12d + roll $5,%eax + addl %ebx,%eax + notl %r11d + leal -1069501632(%rdx,%r10,1),%edx + andl %eax,%r12d + andl %ebx,%r11d + movl 44(%rsi),%r10d + orl %r11d,%r12d + movl %ebx,%r11d + addl %r12d,%edx + movl %ebx,%r12d + roll $9,%edx + addl 
%eax,%edx + notl %r11d + leal 643717713(%rcx,%r10,1),%ecx + andl %edx,%r12d + andl %eax,%r11d + movl 0(%rsi),%r10d + orl %r11d,%r12d + movl %eax,%r11d + addl %r12d,%ecx + movl %eax,%r12d + roll $14,%ecx + addl %edx,%ecx + notl %r11d + leal -373897302(%rbx,%r10,1),%ebx + andl %ecx,%r12d + andl %edx,%r11d + movl 20(%rsi),%r10d + orl %r11d,%r12d + movl %edx,%r11d + addl %r12d,%ebx + movl %edx,%r12d + roll $20,%ebx + addl %ecx,%ebx + notl %r11d + leal -701558691(%rax,%r10,1),%eax + andl %ebx,%r12d + andl %ecx,%r11d + movl 40(%rsi),%r10d + orl %r11d,%r12d + movl %ecx,%r11d + addl %r12d,%eax + movl %ecx,%r12d + roll $5,%eax + addl %ebx,%eax + notl %r11d + leal 38016083(%rdx,%r10,1),%edx + andl %eax,%r12d + andl %ebx,%r11d + movl 60(%rsi),%r10d + orl %r11d,%r12d + movl %ebx,%r11d + addl %r12d,%edx + movl %ebx,%r12d + roll $9,%edx + addl %eax,%edx + notl %r11d + leal -660478335(%rcx,%r10,1),%ecx + andl %edx,%r12d + andl %eax,%r11d + movl 16(%rsi),%r10d + orl %r11d,%r12d + movl %eax,%r11d + addl %r12d,%ecx + movl %eax,%r12d + roll $14,%ecx + addl %edx,%ecx + notl %r11d + leal -405537848(%rbx,%r10,1),%ebx + andl %ecx,%r12d + andl %edx,%r11d + movl 36(%rsi),%r10d + orl %r11d,%r12d + movl %edx,%r11d + addl %r12d,%ebx + movl %edx,%r12d + roll $20,%ebx + addl %ecx,%ebx + notl %r11d + leal 568446438(%rax,%r10,1),%eax + andl %ebx,%r12d + andl %ecx,%r11d + movl 56(%rsi),%r10d + orl %r11d,%r12d + movl %ecx,%r11d + addl %r12d,%eax + movl %ecx,%r12d + roll $5,%eax + addl %ebx,%eax + notl %r11d + leal -1019803690(%rdx,%r10,1),%edx + andl %eax,%r12d + andl %ebx,%r11d + movl 12(%rsi),%r10d + orl %r11d,%r12d + movl %ebx,%r11d + addl %r12d,%edx + movl %ebx,%r12d + roll $9,%edx + addl %eax,%edx + notl %r11d + leal -187363961(%rcx,%r10,1),%ecx + andl %edx,%r12d + andl %eax,%r11d + movl 32(%rsi),%r10d + orl %r11d,%r12d + movl %eax,%r11d + addl %r12d,%ecx + movl %eax,%r12d + roll $14,%ecx + addl %edx,%ecx + notl %r11d + leal 1163531501(%rbx,%r10,1),%ebx + andl %ecx,%r12d + andl %edx,%r11d + 
movl 52(%rsi),%r10d + orl %r11d,%r12d + movl %edx,%r11d + addl %r12d,%ebx + movl %edx,%r12d + roll $20,%ebx + addl %ecx,%ebx + notl %r11d + leal -1444681467(%rax,%r10,1),%eax + andl %ebx,%r12d + andl %ecx,%r11d + movl 8(%rsi),%r10d + orl %r11d,%r12d + movl %ecx,%r11d + addl %r12d,%eax + movl %ecx,%r12d + roll $5,%eax + addl %ebx,%eax + notl %r11d + leal -51403784(%rdx,%r10,1),%edx + andl %eax,%r12d + andl %ebx,%r11d + movl 28(%rsi),%r10d + orl %r11d,%r12d + movl %ebx,%r11d + addl %r12d,%edx + movl %ebx,%r12d + roll $9,%edx + addl %eax,%edx + notl %r11d + leal 1735328473(%rcx,%r10,1),%ecx + andl %edx,%r12d + andl %eax,%r11d + movl 48(%rsi),%r10d + orl %r11d,%r12d + movl %eax,%r11d + addl %r12d,%ecx + movl %eax,%r12d + roll $14,%ecx + addl %edx,%ecx + notl %r11d + leal -1926607734(%rbx,%r10,1),%ebx + andl %ecx,%r12d + andl %edx,%r11d + movl 0(%rsi),%r10d + orl %r11d,%r12d + movl %edx,%r11d + addl %r12d,%ebx + movl %edx,%r12d + roll $20,%ebx + addl %ecx,%ebx + movl 20(%rsi),%r10d + movl %ecx,%r11d + leal -378558(%rax,%r10,1),%eax + movl 32(%rsi),%r10d + xorl %edx,%r11d + xorl %ebx,%r11d + addl %r11d,%eax + roll $4,%eax + movl %ebx,%r11d + addl %ebx,%eax + leal -2022574463(%rdx,%r10,1),%edx + movl 44(%rsi),%r10d + xorl %ecx,%r11d + xorl %eax,%r11d + addl %r11d,%edx + roll $11,%edx + movl %eax,%r11d + addl %eax,%edx + leal 1839030562(%rcx,%r10,1),%ecx + movl 56(%rsi),%r10d + xorl %ebx,%r11d + xorl %edx,%r11d + addl %r11d,%ecx + roll $16,%ecx + movl %edx,%r11d + addl %edx,%ecx + leal -35309556(%rbx,%r10,1),%ebx + movl 4(%rsi),%r10d + xorl %eax,%r11d + xorl %ecx,%r11d + addl %r11d,%ebx + roll $23,%ebx + movl %ecx,%r11d + addl %ecx,%ebx + leal -1530992060(%rax,%r10,1),%eax + movl 16(%rsi),%r10d + xorl %edx,%r11d + xorl %ebx,%r11d + addl %r11d,%eax + roll $4,%eax + movl %ebx,%r11d + addl %ebx,%eax + leal 1272893353(%rdx,%r10,1),%edx + movl 28(%rsi),%r10d + xorl %ecx,%r11d + xorl %eax,%r11d + addl %r11d,%edx + roll $11,%edx + movl %eax,%r11d + addl %eax,%edx + leal 
-155497632(%rcx,%r10,1),%ecx + movl 40(%rsi),%r10d + xorl %ebx,%r11d + xorl %edx,%r11d + addl %r11d,%ecx + roll $16,%ecx + movl %edx,%r11d + addl %edx,%ecx + leal -1094730640(%rbx,%r10,1),%ebx + movl 52(%rsi),%r10d + xorl %eax,%r11d + xorl %ecx,%r11d + addl %r11d,%ebx + roll $23,%ebx + movl %ecx,%r11d + addl %ecx,%ebx + leal 681279174(%rax,%r10,1),%eax + movl 0(%rsi),%r10d + xorl %edx,%r11d + xorl %ebx,%r11d + addl %r11d,%eax + roll $4,%eax + movl %ebx,%r11d + addl %ebx,%eax + leal -358537222(%rdx,%r10,1),%edx + movl 12(%rsi),%r10d + xorl %ecx,%r11d + xorl %eax,%r11d + addl %r11d,%edx + roll $11,%edx + movl %eax,%r11d + addl %eax,%edx + leal -722521979(%rcx,%r10,1),%ecx + movl 24(%rsi),%r10d + xorl %ebx,%r11d + xorl %edx,%r11d + addl %r11d,%ecx + roll $16,%ecx + movl %edx,%r11d + addl %edx,%ecx + leal 76029189(%rbx,%r10,1),%ebx + movl 36(%rsi),%r10d + xorl %eax,%r11d + xorl %ecx,%r11d + addl %r11d,%ebx + roll $23,%ebx + movl %ecx,%r11d + addl %ecx,%ebx + leal -640364487(%rax,%r10,1),%eax + movl 48(%rsi),%r10d + xorl %edx,%r11d + xorl %ebx,%r11d + addl %r11d,%eax + roll $4,%eax + movl %ebx,%r11d + addl %ebx,%eax + leal -421815835(%rdx,%r10,1),%edx + movl 60(%rsi),%r10d + xorl %ecx,%r11d + xorl %eax,%r11d + addl %r11d,%edx + roll $11,%edx + movl %eax,%r11d + addl %eax,%edx + leal 530742520(%rcx,%r10,1),%ecx + movl 8(%rsi),%r10d + xorl %ebx,%r11d + xorl %edx,%r11d + addl %r11d,%ecx + roll $16,%ecx + movl %edx,%r11d + addl %edx,%ecx + leal -995338651(%rbx,%r10,1),%ebx + movl 0(%rsi),%r10d + xorl %eax,%r11d + xorl %ecx,%r11d + addl %r11d,%ebx + roll $23,%ebx + movl %ecx,%r11d + addl %ecx,%ebx + movl 0(%rsi),%r10d + movl $0xffffffff,%r11d + xorl %edx,%r11d + leal -198630844(%rax,%r10,1),%eax + orl %ebx,%r11d + xorl %ecx,%r11d + addl %r11d,%eax + movl 28(%rsi),%r10d + movl $0xffffffff,%r11d + roll $6,%eax + xorl %ecx,%r11d + addl %ebx,%eax + leal 1126891415(%rdx,%r10,1),%edx + orl %eax,%r11d + xorl %ebx,%r11d + addl %r11d,%edx + movl 56(%rsi),%r10d + movl 
$0xffffffff,%r11d + roll $10,%edx + xorl %ebx,%r11d + addl %eax,%edx + leal -1416354905(%rcx,%r10,1),%ecx + orl %edx,%r11d + xorl %eax,%r11d + addl %r11d,%ecx + movl 20(%rsi),%r10d + movl $0xffffffff,%r11d + roll $15,%ecx + xorl %eax,%r11d + addl %edx,%ecx + leal -57434055(%rbx,%r10,1),%ebx + orl %ecx,%r11d + xorl %edx,%r11d + addl %r11d,%ebx + movl 48(%rsi),%r10d + movl $0xffffffff,%r11d + roll $21,%ebx + xorl %edx,%r11d + addl %ecx,%ebx + leal 1700485571(%rax,%r10,1),%eax + orl %ebx,%r11d + xorl %ecx,%r11d + addl %r11d,%eax + movl 12(%rsi),%r10d + movl $0xffffffff,%r11d + roll $6,%eax + xorl %ecx,%r11d + addl %ebx,%eax + leal -1894986606(%rdx,%r10,1),%edx + orl %eax,%r11d + xorl %ebx,%r11d + addl %r11d,%edx + movl 40(%rsi),%r10d + movl $0xffffffff,%r11d + roll $10,%edx + xorl %ebx,%r11d + addl %eax,%edx + leal -1051523(%rcx,%r10,1),%ecx + orl %edx,%r11d + xorl %eax,%r11d + addl %r11d,%ecx + movl 4(%rsi),%r10d + movl $0xffffffff,%r11d + roll $15,%ecx + xorl %eax,%r11d + addl %edx,%ecx + leal -2054922799(%rbx,%r10,1),%ebx + orl %ecx,%r11d + xorl %edx,%r11d + addl %r11d,%ebx + movl 32(%rsi),%r10d + movl $0xffffffff,%r11d + roll $21,%ebx + xorl %edx,%r11d + addl %ecx,%ebx + leal 1873313359(%rax,%r10,1),%eax + orl %ebx,%r11d + xorl %ecx,%r11d + addl %r11d,%eax + movl 60(%rsi),%r10d + movl $0xffffffff,%r11d + roll $6,%eax + xorl %ecx,%r11d + addl %ebx,%eax + leal -30611744(%rdx,%r10,1),%edx + orl %eax,%r11d + xorl %ebx,%r11d + addl %r11d,%edx + movl 24(%rsi),%r10d + movl $0xffffffff,%r11d + roll $10,%edx + xorl %ebx,%r11d + addl %eax,%edx + leal -1560198380(%rcx,%r10,1),%ecx + orl %edx,%r11d + xorl %eax,%r11d + addl %r11d,%ecx + movl 52(%rsi),%r10d + movl $0xffffffff,%r11d + roll $15,%ecx + xorl %eax,%r11d + addl %edx,%ecx + leal 1309151649(%rbx,%r10,1),%ebx + orl %ecx,%r11d + xorl %edx,%r11d + addl %r11d,%ebx + movl 16(%rsi),%r10d + movl $0xffffffff,%r11d + roll $21,%ebx + xorl %edx,%r11d + addl %ecx,%ebx + leal -145523070(%rax,%r10,1),%eax + orl %ebx,%r11d + xorl 
%ecx,%r11d + addl %r11d,%eax + movl 44(%rsi),%r10d + movl $0xffffffff,%r11d + roll $6,%eax + xorl %ecx,%r11d + addl %ebx,%eax + leal -1120210379(%rdx,%r10,1),%edx + orl %eax,%r11d + xorl %ebx,%r11d + addl %r11d,%edx + movl 8(%rsi),%r10d + movl $0xffffffff,%r11d + roll $10,%edx + xorl %ebx,%r11d + addl %eax,%edx + leal 718787259(%rcx,%r10,1),%ecx + orl %edx,%r11d + xorl %eax,%r11d + addl %r11d,%ecx + movl 36(%rsi),%r10d + movl $0xffffffff,%r11d + roll $15,%ecx + xorl %eax,%r11d + addl %edx,%ecx + leal -343485551(%rbx,%r10,1),%ebx + orl %ecx,%r11d + xorl %edx,%r11d + addl %r11d,%ebx + movl 0(%rsi),%r10d + movl $0xffffffff,%r11d + roll $21,%ebx + xorl %edx,%r11d + addl %ecx,%ebx + + addl %r8d,%eax + addl %r9d,%ebx + addl %r14d,%ecx + addl %r15d,%edx + + + addq $64,%rsi + cmpq %rdi,%rsi + jb L$loop + + +L$end: + movl %eax,0(%rbp) + movl %ebx,4(%rbp) + movl %ecx,8(%rbp) + movl %edx,12(%rbp) + + movq (%rsp),%r15 + + movq 8(%rsp),%r14 + + movq 16(%rsp),%r12 + + movq 24(%rsp),%rbx + + movq 32(%rsp),%rbp + + addq $40,%rsp + +L$epilogue: + ret + + +#endif diff --git a/third_party/boringssl/gen/crypto/md5-x86_64-linux.S b/third_party/boringssl/gen/crypto/md5-x86_64-linux.S new file mode 100644 index 00000000..7b93662a --- /dev/null +++ b/third_party/boringssl/gen/crypto/md5-x86_64-linux.S @@ -0,0 +1,695 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.text +.align 16 + +.globl md5_block_asm_data_order +.hidden md5_block_asm_data_order +.type md5_block_asm_data_order,@function +md5_block_asm_data_order: +.cfi_startproc +_CET_ENDBR + pushq %rbp +.cfi_adjust_cfa_offset 8 +.cfi_offset rbp,-16 + pushq %rbx +.cfi_adjust_cfa_offset 8 +.cfi_offset rbx,-24 + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset r12,-32 + pushq %r14 +.cfi_adjust_cfa_offset 8 +.cfi_offset r14,-40 + pushq %r15 +.cfi_adjust_cfa_offset 8 +.cfi_offset r15,-48 +.Lprologue: + + + + + movq %rdi,%rbp + shlq $6,%rdx + leaq (%rsi,%rdx,1),%rdi + movl 0(%rbp),%eax + movl 4(%rbp),%ebx + movl 8(%rbp),%ecx + movl 12(%rbp),%edx + + + + + + + + cmpq %rdi,%rsi + je .Lend + + +.Lloop: + movl %eax,%r8d + movl %ebx,%r9d + movl %ecx,%r14d + movl %edx,%r15d + movl 0(%rsi),%r10d + movl %edx,%r11d + xorl %ecx,%r11d + leal -680876936(%rax,%r10,1),%eax + andl %ebx,%r11d + xorl %edx,%r11d + movl 4(%rsi),%r10d + addl %r11d,%eax + roll $7,%eax + movl %ecx,%r11d + addl %ebx,%eax + xorl %ebx,%r11d + leal -389564586(%rdx,%r10,1),%edx + andl %eax,%r11d + xorl %ecx,%r11d + movl 8(%rsi),%r10d + addl %r11d,%edx + roll $12,%edx + movl %ebx,%r11d + addl %eax,%edx + xorl %eax,%r11d + leal 606105819(%rcx,%r10,1),%ecx + andl %edx,%r11d + xorl %ebx,%r11d + movl 12(%rsi),%r10d + addl %r11d,%ecx + roll $17,%ecx + movl %eax,%r11d + addl %edx,%ecx + xorl %edx,%r11d + leal -1044525330(%rbx,%r10,1),%ebx + andl %ecx,%r11d + xorl %eax,%r11d + movl 16(%rsi),%r10d + addl %r11d,%ebx + roll $22,%ebx + movl %edx,%r11d + addl %ecx,%ebx + xorl %ecx,%r11d + leal -176418897(%rax,%r10,1),%eax + andl %ebx,%r11d + xorl %edx,%r11d + movl 20(%rsi),%r10d + addl %r11d,%eax + roll $7,%eax + movl %ecx,%r11d + addl %ebx,%eax + xorl %ebx,%r11d + leal 1200080426(%rdx,%r10,1),%edx + andl %eax,%r11d + xorl %ecx,%r11d + movl 24(%rsi),%r10d + addl %r11d,%edx + roll $12,%edx + movl %ebx,%r11d + addl %eax,%edx + xorl %eax,%r11d 
+ leal -1473231341(%rcx,%r10,1),%ecx + andl %edx,%r11d + xorl %ebx,%r11d + movl 28(%rsi),%r10d + addl %r11d,%ecx + roll $17,%ecx + movl %eax,%r11d + addl %edx,%ecx + xorl %edx,%r11d + leal -45705983(%rbx,%r10,1),%ebx + andl %ecx,%r11d + xorl %eax,%r11d + movl 32(%rsi),%r10d + addl %r11d,%ebx + roll $22,%ebx + movl %edx,%r11d + addl %ecx,%ebx + xorl %ecx,%r11d + leal 1770035416(%rax,%r10,1),%eax + andl %ebx,%r11d + xorl %edx,%r11d + movl 36(%rsi),%r10d + addl %r11d,%eax + roll $7,%eax + movl %ecx,%r11d + addl %ebx,%eax + xorl %ebx,%r11d + leal -1958414417(%rdx,%r10,1),%edx + andl %eax,%r11d + xorl %ecx,%r11d + movl 40(%rsi),%r10d + addl %r11d,%edx + roll $12,%edx + movl %ebx,%r11d + addl %eax,%edx + xorl %eax,%r11d + leal -42063(%rcx,%r10,1),%ecx + andl %edx,%r11d + xorl %ebx,%r11d + movl 44(%rsi),%r10d + addl %r11d,%ecx + roll $17,%ecx + movl %eax,%r11d + addl %edx,%ecx + xorl %edx,%r11d + leal -1990404162(%rbx,%r10,1),%ebx + andl %ecx,%r11d + xorl %eax,%r11d + movl 48(%rsi),%r10d + addl %r11d,%ebx + roll $22,%ebx + movl %edx,%r11d + addl %ecx,%ebx + xorl %ecx,%r11d + leal 1804603682(%rax,%r10,1),%eax + andl %ebx,%r11d + xorl %edx,%r11d + movl 52(%rsi),%r10d + addl %r11d,%eax + roll $7,%eax + movl %ecx,%r11d + addl %ebx,%eax + xorl %ebx,%r11d + leal -40341101(%rdx,%r10,1),%edx + andl %eax,%r11d + xorl %ecx,%r11d + movl 56(%rsi),%r10d + addl %r11d,%edx + roll $12,%edx + movl %ebx,%r11d + addl %eax,%edx + xorl %eax,%r11d + leal -1502002290(%rcx,%r10,1),%ecx + andl %edx,%r11d + xorl %ebx,%r11d + movl 60(%rsi),%r10d + addl %r11d,%ecx + roll $17,%ecx + movl %eax,%r11d + addl %edx,%ecx + xorl %edx,%r11d + leal 1236535329(%rbx,%r10,1),%ebx + andl %ecx,%r11d + xorl %eax,%r11d + movl 0(%rsi),%r10d + addl %r11d,%ebx + roll $22,%ebx + movl %edx,%r11d + addl %ecx,%ebx + movl 4(%rsi),%r10d + movl %edx,%r11d + movl %edx,%r12d + notl %r11d + leal -165796510(%rax,%r10,1),%eax + andl %ebx,%r12d + andl %ecx,%r11d + movl 24(%rsi),%r10d + orl %r11d,%r12d + movl %ecx,%r11d + addl 
%r12d,%eax + movl %ecx,%r12d + roll $5,%eax + addl %ebx,%eax + notl %r11d + leal -1069501632(%rdx,%r10,1),%edx + andl %eax,%r12d + andl %ebx,%r11d + movl 44(%rsi),%r10d + orl %r11d,%r12d + movl %ebx,%r11d + addl %r12d,%edx + movl %ebx,%r12d + roll $9,%edx + addl %eax,%edx + notl %r11d + leal 643717713(%rcx,%r10,1),%ecx + andl %edx,%r12d + andl %eax,%r11d + movl 0(%rsi),%r10d + orl %r11d,%r12d + movl %eax,%r11d + addl %r12d,%ecx + movl %eax,%r12d + roll $14,%ecx + addl %edx,%ecx + notl %r11d + leal -373897302(%rbx,%r10,1),%ebx + andl %ecx,%r12d + andl %edx,%r11d + movl 20(%rsi),%r10d + orl %r11d,%r12d + movl %edx,%r11d + addl %r12d,%ebx + movl %edx,%r12d + roll $20,%ebx + addl %ecx,%ebx + notl %r11d + leal -701558691(%rax,%r10,1),%eax + andl %ebx,%r12d + andl %ecx,%r11d + movl 40(%rsi),%r10d + orl %r11d,%r12d + movl %ecx,%r11d + addl %r12d,%eax + movl %ecx,%r12d + roll $5,%eax + addl %ebx,%eax + notl %r11d + leal 38016083(%rdx,%r10,1),%edx + andl %eax,%r12d + andl %ebx,%r11d + movl 60(%rsi),%r10d + orl %r11d,%r12d + movl %ebx,%r11d + addl %r12d,%edx + movl %ebx,%r12d + roll $9,%edx + addl %eax,%edx + notl %r11d + leal -660478335(%rcx,%r10,1),%ecx + andl %edx,%r12d + andl %eax,%r11d + movl 16(%rsi),%r10d + orl %r11d,%r12d + movl %eax,%r11d + addl %r12d,%ecx + movl %eax,%r12d + roll $14,%ecx + addl %edx,%ecx + notl %r11d + leal -405537848(%rbx,%r10,1),%ebx + andl %ecx,%r12d + andl %edx,%r11d + movl 36(%rsi),%r10d + orl %r11d,%r12d + movl %edx,%r11d + addl %r12d,%ebx + movl %edx,%r12d + roll $20,%ebx + addl %ecx,%ebx + notl %r11d + leal 568446438(%rax,%r10,1),%eax + andl %ebx,%r12d + andl %ecx,%r11d + movl 56(%rsi),%r10d + orl %r11d,%r12d + movl %ecx,%r11d + addl %r12d,%eax + movl %ecx,%r12d + roll $5,%eax + addl %ebx,%eax + notl %r11d + leal -1019803690(%rdx,%r10,1),%edx + andl %eax,%r12d + andl %ebx,%r11d + movl 12(%rsi),%r10d + orl %r11d,%r12d + movl %ebx,%r11d + addl %r12d,%edx + movl %ebx,%r12d + roll $9,%edx + addl %eax,%edx + notl %r11d + leal 
-187363961(%rcx,%r10,1),%ecx + andl %edx,%r12d + andl %eax,%r11d + movl 32(%rsi),%r10d + orl %r11d,%r12d + movl %eax,%r11d + addl %r12d,%ecx + movl %eax,%r12d + roll $14,%ecx + addl %edx,%ecx + notl %r11d + leal 1163531501(%rbx,%r10,1),%ebx + andl %ecx,%r12d + andl %edx,%r11d + movl 52(%rsi),%r10d + orl %r11d,%r12d + movl %edx,%r11d + addl %r12d,%ebx + movl %edx,%r12d + roll $20,%ebx + addl %ecx,%ebx + notl %r11d + leal -1444681467(%rax,%r10,1),%eax + andl %ebx,%r12d + andl %ecx,%r11d + movl 8(%rsi),%r10d + orl %r11d,%r12d + movl %ecx,%r11d + addl %r12d,%eax + movl %ecx,%r12d + roll $5,%eax + addl %ebx,%eax + notl %r11d + leal -51403784(%rdx,%r10,1),%edx + andl %eax,%r12d + andl %ebx,%r11d + movl 28(%rsi),%r10d + orl %r11d,%r12d + movl %ebx,%r11d + addl %r12d,%edx + movl %ebx,%r12d + roll $9,%edx + addl %eax,%edx + notl %r11d + leal 1735328473(%rcx,%r10,1),%ecx + andl %edx,%r12d + andl %eax,%r11d + movl 48(%rsi),%r10d + orl %r11d,%r12d + movl %eax,%r11d + addl %r12d,%ecx + movl %eax,%r12d + roll $14,%ecx + addl %edx,%ecx + notl %r11d + leal -1926607734(%rbx,%r10,1),%ebx + andl %ecx,%r12d + andl %edx,%r11d + movl 0(%rsi),%r10d + orl %r11d,%r12d + movl %edx,%r11d + addl %r12d,%ebx + movl %edx,%r12d + roll $20,%ebx + addl %ecx,%ebx + movl 20(%rsi),%r10d + movl %ecx,%r11d + leal -378558(%rax,%r10,1),%eax + movl 32(%rsi),%r10d + xorl %edx,%r11d + xorl %ebx,%r11d + addl %r11d,%eax + roll $4,%eax + movl %ebx,%r11d + addl %ebx,%eax + leal -2022574463(%rdx,%r10,1),%edx + movl 44(%rsi),%r10d + xorl %ecx,%r11d + xorl %eax,%r11d + addl %r11d,%edx + roll $11,%edx + movl %eax,%r11d + addl %eax,%edx + leal 1839030562(%rcx,%r10,1),%ecx + movl 56(%rsi),%r10d + xorl %ebx,%r11d + xorl %edx,%r11d + addl %r11d,%ecx + roll $16,%ecx + movl %edx,%r11d + addl %edx,%ecx + leal -35309556(%rbx,%r10,1),%ebx + movl 4(%rsi),%r10d + xorl %eax,%r11d + xorl %ecx,%r11d + addl %r11d,%ebx + roll $23,%ebx + movl %ecx,%r11d + addl %ecx,%ebx + leal -1530992060(%rax,%r10,1),%eax + movl 16(%rsi),%r10d + 
xorl %edx,%r11d + xorl %ebx,%r11d + addl %r11d,%eax + roll $4,%eax + movl %ebx,%r11d + addl %ebx,%eax + leal 1272893353(%rdx,%r10,1),%edx + movl 28(%rsi),%r10d + xorl %ecx,%r11d + xorl %eax,%r11d + addl %r11d,%edx + roll $11,%edx + movl %eax,%r11d + addl %eax,%edx + leal -155497632(%rcx,%r10,1),%ecx + movl 40(%rsi),%r10d + xorl %ebx,%r11d + xorl %edx,%r11d + addl %r11d,%ecx + roll $16,%ecx + movl %edx,%r11d + addl %edx,%ecx + leal -1094730640(%rbx,%r10,1),%ebx + movl 52(%rsi),%r10d + xorl %eax,%r11d + xorl %ecx,%r11d + addl %r11d,%ebx + roll $23,%ebx + movl %ecx,%r11d + addl %ecx,%ebx + leal 681279174(%rax,%r10,1),%eax + movl 0(%rsi),%r10d + xorl %edx,%r11d + xorl %ebx,%r11d + addl %r11d,%eax + roll $4,%eax + movl %ebx,%r11d + addl %ebx,%eax + leal -358537222(%rdx,%r10,1),%edx + movl 12(%rsi),%r10d + xorl %ecx,%r11d + xorl %eax,%r11d + addl %r11d,%edx + roll $11,%edx + movl %eax,%r11d + addl %eax,%edx + leal -722521979(%rcx,%r10,1),%ecx + movl 24(%rsi),%r10d + xorl %ebx,%r11d + xorl %edx,%r11d + addl %r11d,%ecx + roll $16,%ecx + movl %edx,%r11d + addl %edx,%ecx + leal 76029189(%rbx,%r10,1),%ebx + movl 36(%rsi),%r10d + xorl %eax,%r11d + xorl %ecx,%r11d + addl %r11d,%ebx + roll $23,%ebx + movl %ecx,%r11d + addl %ecx,%ebx + leal -640364487(%rax,%r10,1),%eax + movl 48(%rsi),%r10d + xorl %edx,%r11d + xorl %ebx,%r11d + addl %r11d,%eax + roll $4,%eax + movl %ebx,%r11d + addl %ebx,%eax + leal -421815835(%rdx,%r10,1),%edx + movl 60(%rsi),%r10d + xorl %ecx,%r11d + xorl %eax,%r11d + addl %r11d,%edx + roll $11,%edx + movl %eax,%r11d + addl %eax,%edx + leal 530742520(%rcx,%r10,1),%ecx + movl 8(%rsi),%r10d + xorl %ebx,%r11d + xorl %edx,%r11d + addl %r11d,%ecx + roll $16,%ecx + movl %edx,%r11d + addl %edx,%ecx + leal -995338651(%rbx,%r10,1),%ebx + movl 0(%rsi),%r10d + xorl %eax,%r11d + xorl %ecx,%r11d + addl %r11d,%ebx + roll $23,%ebx + movl %ecx,%r11d + addl %ecx,%ebx + movl 0(%rsi),%r10d + movl $0xffffffff,%r11d + xorl %edx,%r11d + leal -198630844(%rax,%r10,1),%eax + orl 
%ebx,%r11d + xorl %ecx,%r11d + addl %r11d,%eax + movl 28(%rsi),%r10d + movl $0xffffffff,%r11d + roll $6,%eax + xorl %ecx,%r11d + addl %ebx,%eax + leal 1126891415(%rdx,%r10,1),%edx + orl %eax,%r11d + xorl %ebx,%r11d + addl %r11d,%edx + movl 56(%rsi),%r10d + movl $0xffffffff,%r11d + roll $10,%edx + xorl %ebx,%r11d + addl %eax,%edx + leal -1416354905(%rcx,%r10,1),%ecx + orl %edx,%r11d + xorl %eax,%r11d + addl %r11d,%ecx + movl 20(%rsi),%r10d + movl $0xffffffff,%r11d + roll $15,%ecx + xorl %eax,%r11d + addl %edx,%ecx + leal -57434055(%rbx,%r10,1),%ebx + orl %ecx,%r11d + xorl %edx,%r11d + addl %r11d,%ebx + movl 48(%rsi),%r10d + movl $0xffffffff,%r11d + roll $21,%ebx + xorl %edx,%r11d + addl %ecx,%ebx + leal 1700485571(%rax,%r10,1),%eax + orl %ebx,%r11d + xorl %ecx,%r11d + addl %r11d,%eax + movl 12(%rsi),%r10d + movl $0xffffffff,%r11d + roll $6,%eax + xorl %ecx,%r11d + addl %ebx,%eax + leal -1894986606(%rdx,%r10,1),%edx + orl %eax,%r11d + xorl %ebx,%r11d + addl %r11d,%edx + movl 40(%rsi),%r10d + movl $0xffffffff,%r11d + roll $10,%edx + xorl %ebx,%r11d + addl %eax,%edx + leal -1051523(%rcx,%r10,1),%ecx + orl %edx,%r11d + xorl %eax,%r11d + addl %r11d,%ecx + movl 4(%rsi),%r10d + movl $0xffffffff,%r11d + roll $15,%ecx + xorl %eax,%r11d + addl %edx,%ecx + leal -2054922799(%rbx,%r10,1),%ebx + orl %ecx,%r11d + xorl %edx,%r11d + addl %r11d,%ebx + movl 32(%rsi),%r10d + movl $0xffffffff,%r11d + roll $21,%ebx + xorl %edx,%r11d + addl %ecx,%ebx + leal 1873313359(%rax,%r10,1),%eax + orl %ebx,%r11d + xorl %ecx,%r11d + addl %r11d,%eax + movl 60(%rsi),%r10d + movl $0xffffffff,%r11d + roll $6,%eax + xorl %ecx,%r11d + addl %ebx,%eax + leal -30611744(%rdx,%r10,1),%edx + orl %eax,%r11d + xorl %ebx,%r11d + addl %r11d,%edx + movl 24(%rsi),%r10d + movl $0xffffffff,%r11d + roll $10,%edx + xorl %ebx,%r11d + addl %eax,%edx + leal -1560198380(%rcx,%r10,1),%ecx + orl %edx,%r11d + xorl %eax,%r11d + addl %r11d,%ecx + movl 52(%rsi),%r10d + movl $0xffffffff,%r11d + roll $15,%ecx + xorl %eax,%r11d + 
addl %edx,%ecx + leal 1309151649(%rbx,%r10,1),%ebx + orl %ecx,%r11d + xorl %edx,%r11d + addl %r11d,%ebx + movl 16(%rsi),%r10d + movl $0xffffffff,%r11d + roll $21,%ebx + xorl %edx,%r11d + addl %ecx,%ebx + leal -145523070(%rax,%r10,1),%eax + orl %ebx,%r11d + xorl %ecx,%r11d + addl %r11d,%eax + movl 44(%rsi),%r10d + movl $0xffffffff,%r11d + roll $6,%eax + xorl %ecx,%r11d + addl %ebx,%eax + leal -1120210379(%rdx,%r10,1),%edx + orl %eax,%r11d + xorl %ebx,%r11d + addl %r11d,%edx + movl 8(%rsi),%r10d + movl $0xffffffff,%r11d + roll $10,%edx + xorl %ebx,%r11d + addl %eax,%edx + leal 718787259(%rcx,%r10,1),%ecx + orl %edx,%r11d + xorl %eax,%r11d + addl %r11d,%ecx + movl 36(%rsi),%r10d + movl $0xffffffff,%r11d + roll $15,%ecx + xorl %eax,%r11d + addl %edx,%ecx + leal -343485551(%rbx,%r10,1),%ebx + orl %ecx,%r11d + xorl %edx,%r11d + addl %r11d,%ebx + movl 0(%rsi),%r10d + movl $0xffffffff,%r11d + roll $21,%ebx + xorl %edx,%r11d + addl %ecx,%ebx + + addl %r8d,%eax + addl %r9d,%ebx + addl %r14d,%ecx + addl %r15d,%edx + + + addq $64,%rsi + cmpq %rdi,%rsi + jb .Lloop + + +.Lend: + movl %eax,0(%rbp) + movl %ebx,4(%rbp) + movl %ecx,8(%rbp) + movl %edx,12(%rbp) + + movq (%rsp),%r15 +.cfi_restore r15 + movq 8(%rsp),%r14 +.cfi_restore r14 + movq 16(%rsp),%r12 +.cfi_restore r12 + movq 24(%rsp),%rbx +.cfi_restore rbx + movq 32(%rsp),%rbp +.cfi_restore rbp + addq $40,%rsp +.cfi_adjust_cfa_offset -40 +.Lepilogue: + ret +.cfi_endproc +.size md5_block_asm_data_order,.-md5_block_asm_data_order +#endif diff --git a/third_party/boringssl/gen/crypto/md5-x86_64-win.asm b/third_party/boringssl/gen/crypto/md5-x86_64-win.asm new file mode 100644 index 00000000..5cfac974 --- /dev/null +++ b/third_party/boringssl/gen/crypto/md5-x86_64-win.asm @@ -0,0 +1,803 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_internal_x86_64_win_asm.inc" +%endif +section .text code align=64 + +ALIGN 16 + +global md5_block_asm_data_order + +md5_block_asm_data_order: + mov QWORD[8+rsp],rdi ;WIN64 prologue + mov QWORD[16+rsp],rsi + mov rax,rsp +$L$SEH_begin_md5_block_asm_data_order: + mov rdi,rcx + mov rsi,rdx + mov rdx,r8 + + + +_CET_ENDBR + push rbp + + push rbx + + push r12 + + push r14 + + push r15 + +$L$prologue: + + + + + mov rbp,rdi + shl rdx,6 + lea rdi,[rdx*1+rsi] + mov eax,DWORD[rbp] + mov ebx,DWORD[4+rbp] + mov ecx,DWORD[8+rbp] + mov edx,DWORD[12+rbp] + + + + + + + + cmp rsi,rdi + je NEAR $L$end + + +$L$loop: + mov r8d,eax + mov r9d,ebx + mov r14d,ecx + mov r15d,edx + mov r10d,DWORD[rsi] + mov r11d,edx + xor r11d,ecx + lea eax,[((-680876936))+r10*1+rax] + and r11d,ebx + xor r11d,edx + mov r10d,DWORD[4+rsi] + add eax,r11d + rol eax,7 + mov r11d,ecx + add eax,ebx + xor r11d,ebx + lea edx,[((-389564586))+r10*1+rdx] + and r11d,eax + xor r11d,ecx + mov r10d,DWORD[8+rsi] + add edx,r11d + rol edx,12 + mov r11d,ebx + add edx,eax + xor r11d,eax + lea ecx,[606105819+r10*1+rcx] + and r11d,edx + xor r11d,ebx + mov r10d,DWORD[12+rsi] + add ecx,r11d + rol ecx,17 + mov r11d,eax + add ecx,edx + xor r11d,edx + lea ebx,[((-1044525330))+r10*1+rbx] + and r11d,ecx + xor r11d,eax + mov r10d,DWORD[16+rsi] + add ebx,r11d + rol ebx,22 + mov r11d,edx + add ebx,ecx + xor r11d,ecx + lea eax,[((-176418897))+r10*1+rax] + and r11d,ebx + xor r11d,edx + mov r10d,DWORD[20+rsi] + add eax,r11d + rol eax,7 + mov r11d,ecx + add eax,ebx + xor r11d,ebx + lea edx,[1200080426+r10*1+rdx] + and r11d,eax + xor r11d,ecx + mov r10d,DWORD[24+rsi] + add edx,r11d + rol edx,12 + mov r11d,ebx + add edx,eax + xor r11d,eax + lea ecx,[((-1473231341))+r10*1+rcx] + and r11d,edx + xor r11d,ebx + mov r10d,DWORD[28+rsi] + add ecx,r11d + rol ecx,17 + mov 
r11d,eax + add ecx,edx + xor r11d,edx + lea ebx,[((-45705983))+r10*1+rbx] + and r11d,ecx + xor r11d,eax + mov r10d,DWORD[32+rsi] + add ebx,r11d + rol ebx,22 + mov r11d,edx + add ebx,ecx + xor r11d,ecx + lea eax,[1770035416+r10*1+rax] + and r11d,ebx + xor r11d,edx + mov r10d,DWORD[36+rsi] + add eax,r11d + rol eax,7 + mov r11d,ecx + add eax,ebx + xor r11d,ebx + lea edx,[((-1958414417))+r10*1+rdx] + and r11d,eax + xor r11d,ecx + mov r10d,DWORD[40+rsi] + add edx,r11d + rol edx,12 + mov r11d,ebx + add edx,eax + xor r11d,eax + lea ecx,[((-42063))+r10*1+rcx] + and r11d,edx + xor r11d,ebx + mov r10d,DWORD[44+rsi] + add ecx,r11d + rol ecx,17 + mov r11d,eax + add ecx,edx + xor r11d,edx + lea ebx,[((-1990404162))+r10*1+rbx] + and r11d,ecx + xor r11d,eax + mov r10d,DWORD[48+rsi] + add ebx,r11d + rol ebx,22 + mov r11d,edx + add ebx,ecx + xor r11d,ecx + lea eax,[1804603682+r10*1+rax] + and r11d,ebx + xor r11d,edx + mov r10d,DWORD[52+rsi] + add eax,r11d + rol eax,7 + mov r11d,ecx + add eax,ebx + xor r11d,ebx + lea edx,[((-40341101))+r10*1+rdx] + and r11d,eax + xor r11d,ecx + mov r10d,DWORD[56+rsi] + add edx,r11d + rol edx,12 + mov r11d,ebx + add edx,eax + xor r11d,eax + lea ecx,[((-1502002290))+r10*1+rcx] + and r11d,edx + xor r11d,ebx + mov r10d,DWORD[60+rsi] + add ecx,r11d + rol ecx,17 + mov r11d,eax + add ecx,edx + xor r11d,edx + lea ebx,[1236535329+r10*1+rbx] + and r11d,ecx + xor r11d,eax + mov r10d,DWORD[rsi] + add ebx,r11d + rol ebx,22 + mov r11d,edx + add ebx,ecx + mov r10d,DWORD[4+rsi] + mov r11d,edx + mov r12d,edx + not r11d + lea eax,[((-165796510))+r10*1+rax] + and r12d,ebx + and r11d,ecx + mov r10d,DWORD[24+rsi] + or r12d,r11d + mov r11d,ecx + add eax,r12d + mov r12d,ecx + rol eax,5 + add eax,ebx + not r11d + lea edx,[((-1069501632))+r10*1+rdx] + and r12d,eax + and r11d,ebx + mov r10d,DWORD[44+rsi] + or r12d,r11d + mov r11d,ebx + add edx,r12d + mov r12d,ebx + rol edx,9 + add edx,eax + not r11d + lea ecx,[643717713+r10*1+rcx] + and r12d,edx + and r11d,eax + mov 
r10d,DWORD[rsi] + or r12d,r11d + mov r11d,eax + add ecx,r12d + mov r12d,eax + rol ecx,14 + add ecx,edx + not r11d + lea ebx,[((-373897302))+r10*1+rbx] + and r12d,ecx + and r11d,edx + mov r10d,DWORD[20+rsi] + or r12d,r11d + mov r11d,edx + add ebx,r12d + mov r12d,edx + rol ebx,20 + add ebx,ecx + not r11d + lea eax,[((-701558691))+r10*1+rax] + and r12d,ebx + and r11d,ecx + mov r10d,DWORD[40+rsi] + or r12d,r11d + mov r11d,ecx + add eax,r12d + mov r12d,ecx + rol eax,5 + add eax,ebx + not r11d + lea edx,[38016083+r10*1+rdx] + and r12d,eax + and r11d,ebx + mov r10d,DWORD[60+rsi] + or r12d,r11d + mov r11d,ebx + add edx,r12d + mov r12d,ebx + rol edx,9 + add edx,eax + not r11d + lea ecx,[((-660478335))+r10*1+rcx] + and r12d,edx + and r11d,eax + mov r10d,DWORD[16+rsi] + or r12d,r11d + mov r11d,eax + add ecx,r12d + mov r12d,eax + rol ecx,14 + add ecx,edx + not r11d + lea ebx,[((-405537848))+r10*1+rbx] + and r12d,ecx + and r11d,edx + mov r10d,DWORD[36+rsi] + or r12d,r11d + mov r11d,edx + add ebx,r12d + mov r12d,edx + rol ebx,20 + add ebx,ecx + not r11d + lea eax,[568446438+r10*1+rax] + and r12d,ebx + and r11d,ecx + mov r10d,DWORD[56+rsi] + or r12d,r11d + mov r11d,ecx + add eax,r12d + mov r12d,ecx + rol eax,5 + add eax,ebx + not r11d + lea edx,[((-1019803690))+r10*1+rdx] + and r12d,eax + and r11d,ebx + mov r10d,DWORD[12+rsi] + or r12d,r11d + mov r11d,ebx + add edx,r12d + mov r12d,ebx + rol edx,9 + add edx,eax + not r11d + lea ecx,[((-187363961))+r10*1+rcx] + and r12d,edx + and r11d,eax + mov r10d,DWORD[32+rsi] + or r12d,r11d + mov r11d,eax + add ecx,r12d + mov r12d,eax + rol ecx,14 + add ecx,edx + not r11d + lea ebx,[1163531501+r10*1+rbx] + and r12d,ecx + and r11d,edx + mov r10d,DWORD[52+rsi] + or r12d,r11d + mov r11d,edx + add ebx,r12d + mov r12d,edx + rol ebx,20 + add ebx,ecx + not r11d + lea eax,[((-1444681467))+r10*1+rax] + and r12d,ebx + and r11d,ecx + mov r10d,DWORD[8+rsi] + or r12d,r11d + mov r11d,ecx + add eax,r12d + mov r12d,ecx + rol eax,5 + add eax,ebx + not r11d + 
lea edx,[((-51403784))+r10*1+rdx] + and r12d,eax + and r11d,ebx + mov r10d,DWORD[28+rsi] + or r12d,r11d + mov r11d,ebx + add edx,r12d + mov r12d,ebx + rol edx,9 + add edx,eax + not r11d + lea ecx,[1735328473+r10*1+rcx] + and r12d,edx + and r11d,eax + mov r10d,DWORD[48+rsi] + or r12d,r11d + mov r11d,eax + add ecx,r12d + mov r12d,eax + rol ecx,14 + add ecx,edx + not r11d + lea ebx,[((-1926607734))+r10*1+rbx] + and r12d,ecx + and r11d,edx + mov r10d,DWORD[rsi] + or r12d,r11d + mov r11d,edx + add ebx,r12d + mov r12d,edx + rol ebx,20 + add ebx,ecx + mov r10d,DWORD[20+rsi] + mov r11d,ecx + lea eax,[((-378558))+r10*1+rax] + mov r10d,DWORD[32+rsi] + xor r11d,edx + xor r11d,ebx + add eax,r11d + rol eax,4 + mov r11d,ebx + add eax,ebx + lea edx,[((-2022574463))+r10*1+rdx] + mov r10d,DWORD[44+rsi] + xor r11d,ecx + xor r11d,eax + add edx,r11d + rol edx,11 + mov r11d,eax + add edx,eax + lea ecx,[1839030562+r10*1+rcx] + mov r10d,DWORD[56+rsi] + xor r11d,ebx + xor r11d,edx + add ecx,r11d + rol ecx,16 + mov r11d,edx + add ecx,edx + lea ebx,[((-35309556))+r10*1+rbx] + mov r10d,DWORD[4+rsi] + xor r11d,eax + xor r11d,ecx + add ebx,r11d + rol ebx,23 + mov r11d,ecx + add ebx,ecx + lea eax,[((-1530992060))+r10*1+rax] + mov r10d,DWORD[16+rsi] + xor r11d,edx + xor r11d,ebx + add eax,r11d + rol eax,4 + mov r11d,ebx + add eax,ebx + lea edx,[1272893353+r10*1+rdx] + mov r10d,DWORD[28+rsi] + xor r11d,ecx + xor r11d,eax + add edx,r11d + rol edx,11 + mov r11d,eax + add edx,eax + lea ecx,[((-155497632))+r10*1+rcx] + mov r10d,DWORD[40+rsi] + xor r11d,ebx + xor r11d,edx + add ecx,r11d + rol ecx,16 + mov r11d,edx + add ecx,edx + lea ebx,[((-1094730640))+r10*1+rbx] + mov r10d,DWORD[52+rsi] + xor r11d,eax + xor r11d,ecx + add ebx,r11d + rol ebx,23 + mov r11d,ecx + add ebx,ecx + lea eax,[681279174+r10*1+rax] + mov r10d,DWORD[rsi] + xor r11d,edx + xor r11d,ebx + add eax,r11d + rol eax,4 + mov r11d,ebx + add eax,ebx + lea edx,[((-358537222))+r10*1+rdx] + mov r10d,DWORD[12+rsi] + xor r11d,ecx + xor 
r11d,eax + add edx,r11d + rol edx,11 + mov r11d,eax + add edx,eax + lea ecx,[((-722521979))+r10*1+rcx] + mov r10d,DWORD[24+rsi] + xor r11d,ebx + xor r11d,edx + add ecx,r11d + rol ecx,16 + mov r11d,edx + add ecx,edx + lea ebx,[76029189+r10*1+rbx] + mov r10d,DWORD[36+rsi] + xor r11d,eax + xor r11d,ecx + add ebx,r11d + rol ebx,23 + mov r11d,ecx + add ebx,ecx + lea eax,[((-640364487))+r10*1+rax] + mov r10d,DWORD[48+rsi] + xor r11d,edx + xor r11d,ebx + add eax,r11d + rol eax,4 + mov r11d,ebx + add eax,ebx + lea edx,[((-421815835))+r10*1+rdx] + mov r10d,DWORD[60+rsi] + xor r11d,ecx + xor r11d,eax + add edx,r11d + rol edx,11 + mov r11d,eax + add edx,eax + lea ecx,[530742520+r10*1+rcx] + mov r10d,DWORD[8+rsi] + xor r11d,ebx + xor r11d,edx + add ecx,r11d + rol ecx,16 + mov r11d,edx + add ecx,edx + lea ebx,[((-995338651))+r10*1+rbx] + mov r10d,DWORD[rsi] + xor r11d,eax + xor r11d,ecx + add ebx,r11d + rol ebx,23 + mov r11d,ecx + add ebx,ecx + mov r10d,DWORD[rsi] + mov r11d,0xffffffff + xor r11d,edx + lea eax,[((-198630844))+r10*1+rax] + or r11d,ebx + xor r11d,ecx + add eax,r11d + mov r10d,DWORD[28+rsi] + mov r11d,0xffffffff + rol eax,6 + xor r11d,ecx + add eax,ebx + lea edx,[1126891415+r10*1+rdx] + or r11d,eax + xor r11d,ebx + add edx,r11d + mov r10d,DWORD[56+rsi] + mov r11d,0xffffffff + rol edx,10 + xor r11d,ebx + add edx,eax + lea ecx,[((-1416354905))+r10*1+rcx] + or r11d,edx + xor r11d,eax + add ecx,r11d + mov r10d,DWORD[20+rsi] + mov r11d,0xffffffff + rol ecx,15 + xor r11d,eax + add ecx,edx + lea ebx,[((-57434055))+r10*1+rbx] + or r11d,ecx + xor r11d,edx + add ebx,r11d + mov r10d,DWORD[48+rsi] + mov r11d,0xffffffff + rol ebx,21 + xor r11d,edx + add ebx,ecx + lea eax,[1700485571+r10*1+rax] + or r11d,ebx + xor r11d,ecx + add eax,r11d + mov r10d,DWORD[12+rsi] + mov r11d,0xffffffff + rol eax,6 + xor r11d,ecx + add eax,ebx + lea edx,[((-1894986606))+r10*1+rdx] + or r11d,eax + xor r11d,ebx + add edx,r11d + mov r10d,DWORD[40+rsi] + mov r11d,0xffffffff + rol edx,10 + xor r11d,ebx 
+ add edx,eax + lea ecx,[((-1051523))+r10*1+rcx] + or r11d,edx + xor r11d,eax + add ecx,r11d + mov r10d,DWORD[4+rsi] + mov r11d,0xffffffff + rol ecx,15 + xor r11d,eax + add ecx,edx + lea ebx,[((-2054922799))+r10*1+rbx] + or r11d,ecx + xor r11d,edx + add ebx,r11d + mov r10d,DWORD[32+rsi] + mov r11d,0xffffffff + rol ebx,21 + xor r11d,edx + add ebx,ecx + lea eax,[1873313359+r10*1+rax] + or r11d,ebx + xor r11d,ecx + add eax,r11d + mov r10d,DWORD[60+rsi] + mov r11d,0xffffffff + rol eax,6 + xor r11d,ecx + add eax,ebx + lea edx,[((-30611744))+r10*1+rdx] + or r11d,eax + xor r11d,ebx + add edx,r11d + mov r10d,DWORD[24+rsi] + mov r11d,0xffffffff + rol edx,10 + xor r11d,ebx + add edx,eax + lea ecx,[((-1560198380))+r10*1+rcx] + or r11d,edx + xor r11d,eax + add ecx,r11d + mov r10d,DWORD[52+rsi] + mov r11d,0xffffffff + rol ecx,15 + xor r11d,eax + add ecx,edx + lea ebx,[1309151649+r10*1+rbx] + or r11d,ecx + xor r11d,edx + add ebx,r11d + mov r10d,DWORD[16+rsi] + mov r11d,0xffffffff + rol ebx,21 + xor r11d,edx + add ebx,ecx + lea eax,[((-145523070))+r10*1+rax] + or r11d,ebx + xor r11d,ecx + add eax,r11d + mov r10d,DWORD[44+rsi] + mov r11d,0xffffffff + rol eax,6 + xor r11d,ecx + add eax,ebx + lea edx,[((-1120210379))+r10*1+rdx] + or r11d,eax + xor r11d,ebx + add edx,r11d + mov r10d,DWORD[8+rsi] + mov r11d,0xffffffff + rol edx,10 + xor r11d,ebx + add edx,eax + lea ecx,[718787259+r10*1+rcx] + or r11d,edx + xor r11d,eax + add ecx,r11d + mov r10d,DWORD[36+rsi] + mov r11d,0xffffffff + rol ecx,15 + xor r11d,eax + add ecx,edx + lea ebx,[((-343485551))+r10*1+rbx] + or r11d,ecx + xor r11d,edx + add ebx,r11d + mov r10d,DWORD[rsi] + mov r11d,0xffffffff + rol ebx,21 + xor r11d,edx + add ebx,ecx + + add eax,r8d + add ebx,r9d + add ecx,r14d + add edx,r15d + + + add rsi,64 + cmp rsi,rdi + jb NEAR $L$loop + + +$L$end: + mov DWORD[rbp],eax + mov DWORD[4+rbp],ebx + mov DWORD[8+rbp],ecx + mov DWORD[12+rbp],edx + + mov r15,QWORD[rsp] + + mov r14,QWORD[8+rsp] + + mov r12,QWORD[16+rsp] + + mov 
rbx,QWORD[24+rsp] + + mov rbp,QWORD[32+rsp] + + add rsp,40 + +$L$epilogue: + mov rdi,QWORD[8+rsp] ;WIN64 epilogue + mov rsi,QWORD[16+rsp] + ret + +$L$SEH_end_md5_block_asm_data_order: +EXTERN __imp_RtlVirtualUnwind + +ALIGN 16 +se_handler: + push rsi + push rdi + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + pushfq + sub rsp,64 + + mov rax,QWORD[120+r8] + mov rbx,QWORD[248+r8] + + lea r10,[$L$prologue] + cmp rbx,r10 + jb NEAR $L$in_prologue + + mov rax,QWORD[152+r8] + + lea r10,[$L$epilogue] + cmp rbx,r10 + jae NEAR $L$in_prologue + + lea rax,[40+rax] + + mov rbp,QWORD[((-8))+rax] + mov rbx,QWORD[((-16))+rax] + mov r12,QWORD[((-24))+rax] + mov r14,QWORD[((-32))+rax] + mov r15,QWORD[((-40))+rax] + mov QWORD[144+r8],rbx + mov QWORD[160+r8],rbp + mov QWORD[216+r8],r12 + mov QWORD[232+r8],r14 + mov QWORD[240+r8],r15 + +$L$in_prologue: + mov rdi,QWORD[8+rax] + mov rsi,QWORD[16+rax] + mov QWORD[152+r8],rax + mov QWORD[168+r8],rsi + mov QWORD[176+r8],rdi + + mov rdi,QWORD[40+r9] + mov rsi,r8 + mov ecx,154 + DD 0xa548f3fc + + mov rsi,r9 + xor rcx,rcx + mov rdx,QWORD[8+rsi] + mov r8,QWORD[rsi] + mov r9,QWORD[16+rsi] + mov r10,QWORD[40+rsi] + lea r11,[56+rsi] + lea r12,[24+rsi] + mov QWORD[32+rsp],r10 + mov QWORD[40+rsp],r11 + mov QWORD[48+rsp],r12 + mov QWORD[56+rsp],rcx + call QWORD[__imp_RtlVirtualUnwind] + + mov eax,1 + add rsp,64 + popfq + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + pop rdi + pop rsi + ret + + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_md5_block_asm_data_order wrt ..imagebase + DD $L$SEH_end_md5_block_asm_data_order wrt ..imagebase + DD $L$SEH_info_md5_block_asm_data_order wrt ..imagebase + +section .xdata rdata align=8 +ALIGN 8 +$L$SEH_info_md5_block_asm_data_order: + DB 9,0,0,0 + DD se_handler wrt ..imagebase +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/third_party/boringssl/gen/test_support/trampoline-armv4-linux.S 
b/third_party/boringssl/gen/test_support/trampoline-armv4-linux.S new file mode 100644 index 00000000..34a2819d --- /dev/null +++ b/third_party/boringssl/gen/test_support/trampoline-armv4-linux.S @@ -0,0 +1,368 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && defined(__ELF__) +.syntax unified + +.arch armv7-a +.fpu vfp + +.text + +@ abi_test_trampoline loads callee-saved registers from |state|, calls |func| +@ with |argv|, then saves the callee-saved registers into |state|. It returns +@ the result of |func|. The |unwind| argument is unused. +@ uint32_t abi_test_trampoline(void (*func)(...), CallerState *state, +@ const uint32_t *argv, size_t argc, +@ int unwind); +.type abi_test_trampoline, %function +.globl abi_test_trampoline +.hidden abi_test_trampoline +.align 4 +abi_test_trampoline: + @ Save parameters and all callee-saved registers. For convenience, we + @ save r9 on iOS even though it's volatile. + vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + stmdb sp!, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,lr} + + @ Reserve stack space for six (10-4) stack parameters, plus an extra 4 + @ bytes to keep it 8-byte-aligned (see AAPCS, section 5.3). + sub sp, sp, #28 + + @ Every register in AAPCS is either non-volatile or a parameter (except + @ r9 on iOS), so this code, by the actual call, loses all its scratch + @ registers. First fill in stack parameters while there are registers + @ to spare. + cmp r3, #4 + bls .Lstack_args_done + mov r4, sp @ r4 is the output pointer. + add r5, r2, r3, lsl #2 @ Set r5 to the end of argv. + add r2, r2, #16 @ Skip four arguments. +.Lstack_args_loop: + ldr r6, [r2], #4 + cmp r2, r5 + str r6, [r4], #4 + bne .Lstack_args_loop + +.Lstack_args_done: + @ Load registers from |r1|. + vldmia r1!, {d8,d9,d10,d11,d12,d13,d14,d15} +#if defined(__APPLE__) + @ r9 is not volatile on iOS. 
+ ldmia r1!, {r4,r5,r6,r7,r8,r10-r11} +#else + ldmia r1!, {r4,r5,r6,r7,r8,r9,r10,r11} +#endif + + @ Load register parameters. This uses up our remaining registers, so we + @ repurpose lr as scratch space. + ldr r3, [sp, #40] @ Reload argc. + ldr lr, [sp, #36] @ .Load argv into lr. + cmp r3, #3 + bhi .Larg_r3 + beq .Larg_r2 + cmp r3, #1 + bhi .Larg_r1 + beq .Larg_r0 + b .Largs_done + +.Larg_r3: + ldr r3, [lr, #12] @ argv[3] +.Larg_r2: + ldr r2, [lr, #8] @ argv[2] +.Larg_r1: + ldr r1, [lr, #4] @ argv[1] +.Larg_r0: + ldr r0, [lr] @ argv[0] +.Largs_done: + + @ With every other register in use, load the function pointer into lr + @ and call the function. + ldr lr, [sp, #28] + blx lr + + @ r1-r3 are free for use again. The trampoline only supports + @ single-return functions. Pass r4-r11 to the caller. + ldr r1, [sp, #32] + vstmia r1!, {d8,d9,d10,d11,d12,d13,d14,d15} +#if defined(__APPLE__) + @ r9 is not volatile on iOS. + stmia r1!, {r4,r5,r6,r7,r8,r10-r11} +#else + stmia r1!, {r4,r5,r6,r7,r8,r9,r10,r11} +#endif + + @ Unwind the stack and restore registers. + add sp, sp, #44 @ 44 = 28+16 + ldmia sp!, {r4,r5,r6,r7,r8,r9,r10,r11,lr} @ Skip r0-r3 (see +16 above). 
+ vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15} + + bx lr +.size abi_test_trampoline,.-abi_test_trampoline +.type abi_test_clobber_r0, %function +.globl abi_test_clobber_r0 +.hidden abi_test_clobber_r0 +.align 4 +abi_test_clobber_r0: + mov r0, #0 + bx lr +.size abi_test_clobber_r0,.-abi_test_clobber_r0 +.type abi_test_clobber_r1, %function +.globl abi_test_clobber_r1 +.hidden abi_test_clobber_r1 +.align 4 +abi_test_clobber_r1: + mov r1, #0 + bx lr +.size abi_test_clobber_r1,.-abi_test_clobber_r1 +.type abi_test_clobber_r2, %function +.globl abi_test_clobber_r2 +.hidden abi_test_clobber_r2 +.align 4 +abi_test_clobber_r2: + mov r2, #0 + bx lr +.size abi_test_clobber_r2,.-abi_test_clobber_r2 +.type abi_test_clobber_r3, %function +.globl abi_test_clobber_r3 +.hidden abi_test_clobber_r3 +.align 4 +abi_test_clobber_r3: + mov r3, #0 + bx lr +.size abi_test_clobber_r3,.-abi_test_clobber_r3 +.type abi_test_clobber_r4, %function +.globl abi_test_clobber_r4 +.hidden abi_test_clobber_r4 +.align 4 +abi_test_clobber_r4: + mov r4, #0 + bx lr +.size abi_test_clobber_r4,.-abi_test_clobber_r4 +.type abi_test_clobber_r5, %function +.globl abi_test_clobber_r5 +.hidden abi_test_clobber_r5 +.align 4 +abi_test_clobber_r5: + mov r5, #0 + bx lr +.size abi_test_clobber_r5,.-abi_test_clobber_r5 +.type abi_test_clobber_r6, %function +.globl abi_test_clobber_r6 +.hidden abi_test_clobber_r6 +.align 4 +abi_test_clobber_r6: + mov r6, #0 + bx lr +.size abi_test_clobber_r6,.-abi_test_clobber_r6 +.type abi_test_clobber_r7, %function +.globl abi_test_clobber_r7 +.hidden abi_test_clobber_r7 +.align 4 +abi_test_clobber_r7: + mov r7, #0 + bx lr +.size abi_test_clobber_r7,.-abi_test_clobber_r7 +.type abi_test_clobber_r8, %function +.globl abi_test_clobber_r8 +.hidden abi_test_clobber_r8 +.align 4 +abi_test_clobber_r8: + mov r8, #0 + bx lr +.size abi_test_clobber_r8,.-abi_test_clobber_r8 +.type abi_test_clobber_r9, %function +.globl abi_test_clobber_r9 +.hidden abi_test_clobber_r9 +.align 4 
+abi_test_clobber_r9: + mov r9, #0 + bx lr +.size abi_test_clobber_r9,.-abi_test_clobber_r9 +.type abi_test_clobber_r10, %function +.globl abi_test_clobber_r10 +.hidden abi_test_clobber_r10 +.align 4 +abi_test_clobber_r10: + mov r10, #0 + bx lr +.size abi_test_clobber_r10,.-abi_test_clobber_r10 +.type abi_test_clobber_r11, %function +.globl abi_test_clobber_r11 +.hidden abi_test_clobber_r11 +.align 4 +abi_test_clobber_r11: + mov r11, #0 + bx lr +.size abi_test_clobber_r11,.-abi_test_clobber_r11 +.type abi_test_clobber_r12, %function +.globl abi_test_clobber_r12 +.hidden abi_test_clobber_r12 +.align 4 +abi_test_clobber_r12: + mov r12, #0 + bx lr +.size abi_test_clobber_r12,.-abi_test_clobber_r12 +.type abi_test_clobber_d0, %function +.globl abi_test_clobber_d0 +.hidden abi_test_clobber_d0 +.align 4 +abi_test_clobber_d0: + mov r0, #0 + vmov s0, r0 + vmov s1, r0 + bx lr +.size abi_test_clobber_d0,.-abi_test_clobber_d0 +.type abi_test_clobber_d1, %function +.globl abi_test_clobber_d1 +.hidden abi_test_clobber_d1 +.align 4 +abi_test_clobber_d1: + mov r0, #0 + vmov s2, r0 + vmov s3, r0 + bx lr +.size abi_test_clobber_d1,.-abi_test_clobber_d1 +.type abi_test_clobber_d2, %function +.globl abi_test_clobber_d2 +.hidden abi_test_clobber_d2 +.align 4 +abi_test_clobber_d2: + mov r0, #0 + vmov s4, r0 + vmov s5, r0 + bx lr +.size abi_test_clobber_d2,.-abi_test_clobber_d2 +.type abi_test_clobber_d3, %function +.globl abi_test_clobber_d3 +.hidden abi_test_clobber_d3 +.align 4 +abi_test_clobber_d3: + mov r0, #0 + vmov s6, r0 + vmov s7, r0 + bx lr +.size abi_test_clobber_d3,.-abi_test_clobber_d3 +.type abi_test_clobber_d4, %function +.globl abi_test_clobber_d4 +.hidden abi_test_clobber_d4 +.align 4 +abi_test_clobber_d4: + mov r0, #0 + vmov s8, r0 + vmov s9, r0 + bx lr +.size abi_test_clobber_d4,.-abi_test_clobber_d4 +.type abi_test_clobber_d5, %function +.globl abi_test_clobber_d5 +.hidden abi_test_clobber_d5 +.align 4 +abi_test_clobber_d5: + mov r0, #0 + vmov s10, r0 + vmov s11, r0 
+ bx lr +.size abi_test_clobber_d5,.-abi_test_clobber_d5 +.type abi_test_clobber_d6, %function +.globl abi_test_clobber_d6 +.hidden abi_test_clobber_d6 +.align 4 +abi_test_clobber_d6: + mov r0, #0 + vmov s12, r0 + vmov s13, r0 + bx lr +.size abi_test_clobber_d6,.-abi_test_clobber_d6 +.type abi_test_clobber_d7, %function +.globl abi_test_clobber_d7 +.hidden abi_test_clobber_d7 +.align 4 +abi_test_clobber_d7: + mov r0, #0 + vmov s14, r0 + vmov s15, r0 + bx lr +.size abi_test_clobber_d7,.-abi_test_clobber_d7 +.type abi_test_clobber_d8, %function +.globl abi_test_clobber_d8 +.hidden abi_test_clobber_d8 +.align 4 +abi_test_clobber_d8: + mov r0, #0 + vmov s16, r0 + vmov s17, r0 + bx lr +.size abi_test_clobber_d8,.-abi_test_clobber_d8 +.type abi_test_clobber_d9, %function +.globl abi_test_clobber_d9 +.hidden abi_test_clobber_d9 +.align 4 +abi_test_clobber_d9: + mov r0, #0 + vmov s18, r0 + vmov s19, r0 + bx lr +.size abi_test_clobber_d9,.-abi_test_clobber_d9 +.type abi_test_clobber_d10, %function +.globl abi_test_clobber_d10 +.hidden abi_test_clobber_d10 +.align 4 +abi_test_clobber_d10: + mov r0, #0 + vmov s20, r0 + vmov s21, r0 + bx lr +.size abi_test_clobber_d10,.-abi_test_clobber_d10 +.type abi_test_clobber_d11, %function +.globl abi_test_clobber_d11 +.hidden abi_test_clobber_d11 +.align 4 +abi_test_clobber_d11: + mov r0, #0 + vmov s22, r0 + vmov s23, r0 + bx lr +.size abi_test_clobber_d11,.-abi_test_clobber_d11 +.type abi_test_clobber_d12, %function +.globl abi_test_clobber_d12 +.hidden abi_test_clobber_d12 +.align 4 +abi_test_clobber_d12: + mov r0, #0 + vmov s24, r0 + vmov s25, r0 + bx lr +.size abi_test_clobber_d12,.-abi_test_clobber_d12 +.type abi_test_clobber_d13, %function +.globl abi_test_clobber_d13 +.hidden abi_test_clobber_d13 +.align 4 +abi_test_clobber_d13: + mov r0, #0 + vmov s26, r0 + vmov s27, r0 + bx lr +.size abi_test_clobber_d13,.-abi_test_clobber_d13 +.type abi_test_clobber_d14, %function +.globl abi_test_clobber_d14 +.hidden abi_test_clobber_d14 
+.align 4 +abi_test_clobber_d14: + mov r0, #0 + vmov s28, r0 + vmov s29, r0 + bx lr +.size abi_test_clobber_d14,.-abi_test_clobber_d14 +.type abi_test_clobber_d15, %function +.globl abi_test_clobber_d15 +.hidden abi_test_clobber_d15 +.align 4 +abi_test_clobber_d15: + mov r0, #0 + vmov s30, r0 + vmov s31, r0 + bx lr +.size abi_test_clobber_d15,.-abi_test_clobber_d15 +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_ARM) && defined(__ELF__) diff --git a/third_party/boringssl/gen/test_support/trampoline-armv8-apple.S b/third_party/boringssl/gen/test_support/trampoline-armv8-apple.S new file mode 100644 index 00000000..b32c707e --- /dev/null +++ b/third_party/boringssl/gen/test_support/trampoline-armv8-apple.S @@ -0,0 +1,748 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__APPLE__) +.text + +// abi_test_trampoline loads callee-saved registers from |state|, calls |func| +// with |argv|, then saves the callee-saved registers into |state|. It returns +// the result of |func|. The |unwind| argument is unused. +// uint64_t abi_test_trampoline(void (*func)(...), CallerState *state, +// const uint64_t *argv, size_t argc, +// uint64_t unwind); + +.globl _abi_test_trampoline +.private_extern _abi_test_trampoline +.align 4 +_abi_test_trampoline: +Labi_test_trampoline_begin: + AARCH64_SIGN_LINK_REGISTER + // Stack layout (low to high addresses) + // x29,x30 (16 bytes) + // d8-d15 (64 bytes) + // x19-x28 (80 bytes) + // x1 (8 bytes) + // padding (8 bytes) + stp x29, x30, [sp, #-176]! + mov x29, sp + + // Saved callee-saved registers and |state|. 
+ stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] + stp x19, x20, [sp, #80] + stp x21, x22, [sp, #96] + stp x23, x24, [sp, #112] + stp x25, x26, [sp, #128] + stp x27, x28, [sp, #144] + str x1, [sp, #160] + + // Load registers from |state|, with the exception of x29. x29 is the + // frame pointer and also callee-saved, but AAPCS64 allows platforms to + // mandate that x29 always point to a frame. iOS64 does so, which means + // we cannot fill x29 with entropy without violating ABI rules + // ourselves. x29 is tested separately below. + ldp d8, d9, [x1], #16 + ldp d10, d11, [x1], #16 + ldp d12, d13, [x1], #16 + ldp d14, d15, [x1], #16 + ldp x19, x20, [x1], #16 + ldp x21, x22, [x1], #16 + ldp x23, x24, [x1], #16 + ldp x25, x26, [x1], #16 + ldp x27, x28, [x1], #16 + + // Move parameters into temporary registers. + mov x9, x0 + mov x10, x2 + mov x11, x3 + + // Load parameters into registers. + cbz x11, Largs_done + ldr x0, [x10], #8 + subs x11, x11, #1 + b.eq Largs_done + ldr x1, [x10], #8 + subs x11, x11, #1 + b.eq Largs_done + ldr x2, [x10], #8 + subs x11, x11, #1 + b.eq Largs_done + ldr x3, [x10], #8 + subs x11, x11, #1 + b.eq Largs_done + ldr x4, [x10], #8 + subs x11, x11, #1 + b.eq Largs_done + ldr x5, [x10], #8 + subs x11, x11, #1 + b.eq Largs_done + ldr x6, [x10], #8 + subs x11, x11, #1 + b.eq Largs_done + ldr x7, [x10], #8 + +Largs_done: + blr x9 + + // Reload |state| and store registers. + ldr x1, [sp, #160] + stp d8, d9, [x1], #16 + stp d10, d11, [x1], #16 + stp d12, d13, [x1], #16 + stp d14, d15, [x1], #16 + stp x19, x20, [x1], #16 + stp x21, x22, [x1], #16 + stp x23, x24, [x1], #16 + stp x25, x26, [x1], #16 + stp x27, x28, [x1], #16 + + // |func| is required to preserve x29, the frame pointer. We cannot load + // random values into x29 (see comment above), so compare it against the + // expected value and zero the field of |state| if corrupted. 
+ mov x9, sp + cmp x29, x9 + b.eq Lx29_ok + str xzr, [x1] + +Lx29_ok: + // Restore callee-saved registers. + ldp d8, d9, [sp, #16] + ldp d10, d11, [sp, #32] + ldp d12, d13, [sp, #48] + ldp d14, d15, [sp, #64] + ldp x19, x20, [sp, #80] + ldp x21, x22, [sp, #96] + ldp x23, x24, [sp, #112] + ldp x25, x26, [sp, #128] + ldp x27, x28, [sp, #144] + + ldp x29, x30, [sp], #176 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +.globl _abi_test_clobber_x0 +.private_extern _abi_test_clobber_x0 +.align 4 +_abi_test_clobber_x0: + AARCH64_VALID_CALL_TARGET + mov x0, xzr + ret + + +.globl _abi_test_clobber_x1 +.private_extern _abi_test_clobber_x1 +.align 4 +_abi_test_clobber_x1: + AARCH64_VALID_CALL_TARGET + mov x1, xzr + ret + + +.globl _abi_test_clobber_x2 +.private_extern _abi_test_clobber_x2 +.align 4 +_abi_test_clobber_x2: + AARCH64_VALID_CALL_TARGET + mov x2, xzr + ret + + +.globl _abi_test_clobber_x3 +.private_extern _abi_test_clobber_x3 +.align 4 +_abi_test_clobber_x3: + AARCH64_VALID_CALL_TARGET + mov x3, xzr + ret + + +.globl _abi_test_clobber_x4 +.private_extern _abi_test_clobber_x4 +.align 4 +_abi_test_clobber_x4: + AARCH64_VALID_CALL_TARGET + mov x4, xzr + ret + + +.globl _abi_test_clobber_x5 +.private_extern _abi_test_clobber_x5 +.align 4 +_abi_test_clobber_x5: + AARCH64_VALID_CALL_TARGET + mov x5, xzr + ret + + +.globl _abi_test_clobber_x6 +.private_extern _abi_test_clobber_x6 +.align 4 +_abi_test_clobber_x6: + AARCH64_VALID_CALL_TARGET + mov x6, xzr + ret + + +.globl _abi_test_clobber_x7 +.private_extern _abi_test_clobber_x7 +.align 4 +_abi_test_clobber_x7: + AARCH64_VALID_CALL_TARGET + mov x7, xzr + ret + + +.globl _abi_test_clobber_x8 +.private_extern _abi_test_clobber_x8 +.align 4 +_abi_test_clobber_x8: + AARCH64_VALID_CALL_TARGET + mov x8, xzr + ret + + +.globl _abi_test_clobber_x9 +.private_extern _abi_test_clobber_x9 +.align 4 +_abi_test_clobber_x9: + AARCH64_VALID_CALL_TARGET + mov x9, xzr + ret + + +.globl _abi_test_clobber_x10 +.private_extern 
_abi_test_clobber_x10 +.align 4 +_abi_test_clobber_x10: + AARCH64_VALID_CALL_TARGET + mov x10, xzr + ret + + +.globl _abi_test_clobber_x11 +.private_extern _abi_test_clobber_x11 +.align 4 +_abi_test_clobber_x11: + AARCH64_VALID_CALL_TARGET + mov x11, xzr + ret + + +.globl _abi_test_clobber_x12 +.private_extern _abi_test_clobber_x12 +.align 4 +_abi_test_clobber_x12: + AARCH64_VALID_CALL_TARGET + mov x12, xzr + ret + + +.globl _abi_test_clobber_x13 +.private_extern _abi_test_clobber_x13 +.align 4 +_abi_test_clobber_x13: + AARCH64_VALID_CALL_TARGET + mov x13, xzr + ret + + +.globl _abi_test_clobber_x14 +.private_extern _abi_test_clobber_x14 +.align 4 +_abi_test_clobber_x14: + AARCH64_VALID_CALL_TARGET + mov x14, xzr + ret + + +.globl _abi_test_clobber_x15 +.private_extern _abi_test_clobber_x15 +.align 4 +_abi_test_clobber_x15: + AARCH64_VALID_CALL_TARGET + mov x15, xzr + ret + + +.globl _abi_test_clobber_x16 +.private_extern _abi_test_clobber_x16 +.align 4 +_abi_test_clobber_x16: + AARCH64_VALID_CALL_TARGET + mov x16, xzr + ret + + +.globl _abi_test_clobber_x17 +.private_extern _abi_test_clobber_x17 +.align 4 +_abi_test_clobber_x17: + AARCH64_VALID_CALL_TARGET + mov x17, xzr + ret + + +.globl _abi_test_clobber_x19 +.private_extern _abi_test_clobber_x19 +.align 4 +_abi_test_clobber_x19: + AARCH64_VALID_CALL_TARGET + mov x19, xzr + ret + + +.globl _abi_test_clobber_x20 +.private_extern _abi_test_clobber_x20 +.align 4 +_abi_test_clobber_x20: + AARCH64_VALID_CALL_TARGET + mov x20, xzr + ret + + +.globl _abi_test_clobber_x21 +.private_extern _abi_test_clobber_x21 +.align 4 +_abi_test_clobber_x21: + AARCH64_VALID_CALL_TARGET + mov x21, xzr + ret + + +.globl _abi_test_clobber_x22 +.private_extern _abi_test_clobber_x22 +.align 4 +_abi_test_clobber_x22: + AARCH64_VALID_CALL_TARGET + mov x22, xzr + ret + + +.globl _abi_test_clobber_x23 +.private_extern _abi_test_clobber_x23 +.align 4 +_abi_test_clobber_x23: + AARCH64_VALID_CALL_TARGET + mov x23, xzr + ret + + +.globl 
_abi_test_clobber_x24 +.private_extern _abi_test_clobber_x24 +.align 4 +_abi_test_clobber_x24: + AARCH64_VALID_CALL_TARGET + mov x24, xzr + ret + + +.globl _abi_test_clobber_x25 +.private_extern _abi_test_clobber_x25 +.align 4 +_abi_test_clobber_x25: + AARCH64_VALID_CALL_TARGET + mov x25, xzr + ret + + +.globl _abi_test_clobber_x26 +.private_extern _abi_test_clobber_x26 +.align 4 +_abi_test_clobber_x26: + AARCH64_VALID_CALL_TARGET + mov x26, xzr + ret + + +.globl _abi_test_clobber_x27 +.private_extern _abi_test_clobber_x27 +.align 4 +_abi_test_clobber_x27: + AARCH64_VALID_CALL_TARGET + mov x27, xzr + ret + + +.globl _abi_test_clobber_x28 +.private_extern _abi_test_clobber_x28 +.align 4 +_abi_test_clobber_x28: + AARCH64_VALID_CALL_TARGET + mov x28, xzr + ret + + +.globl _abi_test_clobber_x29 +.private_extern _abi_test_clobber_x29 +.align 4 +_abi_test_clobber_x29: + AARCH64_VALID_CALL_TARGET + mov x29, xzr + ret + + +.globl _abi_test_clobber_d0 +.private_extern _abi_test_clobber_d0 +.align 4 +_abi_test_clobber_d0: + AARCH64_VALID_CALL_TARGET + fmov d0, xzr + ret + + +.globl _abi_test_clobber_d1 +.private_extern _abi_test_clobber_d1 +.align 4 +_abi_test_clobber_d1: + AARCH64_VALID_CALL_TARGET + fmov d1, xzr + ret + + +.globl _abi_test_clobber_d2 +.private_extern _abi_test_clobber_d2 +.align 4 +_abi_test_clobber_d2: + AARCH64_VALID_CALL_TARGET + fmov d2, xzr + ret + + +.globl _abi_test_clobber_d3 +.private_extern _abi_test_clobber_d3 +.align 4 +_abi_test_clobber_d3: + AARCH64_VALID_CALL_TARGET + fmov d3, xzr + ret + + +.globl _abi_test_clobber_d4 +.private_extern _abi_test_clobber_d4 +.align 4 +_abi_test_clobber_d4: + AARCH64_VALID_CALL_TARGET + fmov d4, xzr + ret + + +.globl _abi_test_clobber_d5 +.private_extern _abi_test_clobber_d5 +.align 4 +_abi_test_clobber_d5: + AARCH64_VALID_CALL_TARGET + fmov d5, xzr + ret + + +.globl _abi_test_clobber_d6 +.private_extern _abi_test_clobber_d6 +.align 4 +_abi_test_clobber_d6: + AARCH64_VALID_CALL_TARGET + fmov d6, xzr + ret + + 
+.globl _abi_test_clobber_d7 +.private_extern _abi_test_clobber_d7 +.align 4 +_abi_test_clobber_d7: + AARCH64_VALID_CALL_TARGET + fmov d7, xzr + ret + + +.globl _abi_test_clobber_d8 +.private_extern _abi_test_clobber_d8 +.align 4 +_abi_test_clobber_d8: + AARCH64_VALID_CALL_TARGET + fmov d8, xzr + ret + + +.globl _abi_test_clobber_d9 +.private_extern _abi_test_clobber_d9 +.align 4 +_abi_test_clobber_d9: + AARCH64_VALID_CALL_TARGET + fmov d9, xzr + ret + + +.globl _abi_test_clobber_d10 +.private_extern _abi_test_clobber_d10 +.align 4 +_abi_test_clobber_d10: + AARCH64_VALID_CALL_TARGET + fmov d10, xzr + ret + + +.globl _abi_test_clobber_d11 +.private_extern _abi_test_clobber_d11 +.align 4 +_abi_test_clobber_d11: + AARCH64_VALID_CALL_TARGET + fmov d11, xzr + ret + + +.globl _abi_test_clobber_d12 +.private_extern _abi_test_clobber_d12 +.align 4 +_abi_test_clobber_d12: + AARCH64_VALID_CALL_TARGET + fmov d12, xzr + ret + + +.globl _abi_test_clobber_d13 +.private_extern _abi_test_clobber_d13 +.align 4 +_abi_test_clobber_d13: + AARCH64_VALID_CALL_TARGET + fmov d13, xzr + ret + + +.globl _abi_test_clobber_d14 +.private_extern _abi_test_clobber_d14 +.align 4 +_abi_test_clobber_d14: + AARCH64_VALID_CALL_TARGET + fmov d14, xzr + ret + + +.globl _abi_test_clobber_d15 +.private_extern _abi_test_clobber_d15 +.align 4 +_abi_test_clobber_d15: + AARCH64_VALID_CALL_TARGET + fmov d15, xzr + ret + + +.globl _abi_test_clobber_d16 +.private_extern _abi_test_clobber_d16 +.align 4 +_abi_test_clobber_d16: + AARCH64_VALID_CALL_TARGET + fmov d16, xzr + ret + + +.globl _abi_test_clobber_d17 +.private_extern _abi_test_clobber_d17 +.align 4 +_abi_test_clobber_d17: + AARCH64_VALID_CALL_TARGET + fmov d17, xzr + ret + + +.globl _abi_test_clobber_d18 +.private_extern _abi_test_clobber_d18 +.align 4 +_abi_test_clobber_d18: + AARCH64_VALID_CALL_TARGET + fmov d18, xzr + ret + + +.globl _abi_test_clobber_d19 +.private_extern _abi_test_clobber_d19 +.align 4 +_abi_test_clobber_d19: + 
AARCH64_VALID_CALL_TARGET + fmov d19, xzr + ret + + +.globl _abi_test_clobber_d20 +.private_extern _abi_test_clobber_d20 +.align 4 +_abi_test_clobber_d20: + AARCH64_VALID_CALL_TARGET + fmov d20, xzr + ret + + +.globl _abi_test_clobber_d21 +.private_extern _abi_test_clobber_d21 +.align 4 +_abi_test_clobber_d21: + AARCH64_VALID_CALL_TARGET + fmov d21, xzr + ret + + +.globl _abi_test_clobber_d22 +.private_extern _abi_test_clobber_d22 +.align 4 +_abi_test_clobber_d22: + AARCH64_VALID_CALL_TARGET + fmov d22, xzr + ret + + +.globl _abi_test_clobber_d23 +.private_extern _abi_test_clobber_d23 +.align 4 +_abi_test_clobber_d23: + AARCH64_VALID_CALL_TARGET + fmov d23, xzr + ret + + +.globl _abi_test_clobber_d24 +.private_extern _abi_test_clobber_d24 +.align 4 +_abi_test_clobber_d24: + AARCH64_VALID_CALL_TARGET + fmov d24, xzr + ret + + +.globl _abi_test_clobber_d25 +.private_extern _abi_test_clobber_d25 +.align 4 +_abi_test_clobber_d25: + AARCH64_VALID_CALL_TARGET + fmov d25, xzr + ret + + +.globl _abi_test_clobber_d26 +.private_extern _abi_test_clobber_d26 +.align 4 +_abi_test_clobber_d26: + AARCH64_VALID_CALL_TARGET + fmov d26, xzr + ret + + +.globl _abi_test_clobber_d27 +.private_extern _abi_test_clobber_d27 +.align 4 +_abi_test_clobber_d27: + AARCH64_VALID_CALL_TARGET + fmov d27, xzr + ret + + +.globl _abi_test_clobber_d28 +.private_extern _abi_test_clobber_d28 +.align 4 +_abi_test_clobber_d28: + AARCH64_VALID_CALL_TARGET + fmov d28, xzr + ret + + +.globl _abi_test_clobber_d29 +.private_extern _abi_test_clobber_d29 +.align 4 +_abi_test_clobber_d29: + AARCH64_VALID_CALL_TARGET + fmov d29, xzr + ret + + +.globl _abi_test_clobber_d30 +.private_extern _abi_test_clobber_d30 +.align 4 +_abi_test_clobber_d30: + AARCH64_VALID_CALL_TARGET + fmov d30, xzr + ret + + +.globl _abi_test_clobber_d31 +.private_extern _abi_test_clobber_d31 +.align 4 +_abi_test_clobber_d31: + AARCH64_VALID_CALL_TARGET + fmov d31, xzr + ret + + +.globl _abi_test_clobber_v8_upper +.private_extern 
_abi_test_clobber_v8_upper +.align 4 +_abi_test_clobber_v8_upper: + AARCH64_VALID_CALL_TARGET + fmov v8.d[1], xzr + ret + + +.globl _abi_test_clobber_v9_upper +.private_extern _abi_test_clobber_v9_upper +.align 4 +_abi_test_clobber_v9_upper: + AARCH64_VALID_CALL_TARGET + fmov v9.d[1], xzr + ret + + +.globl _abi_test_clobber_v10_upper +.private_extern _abi_test_clobber_v10_upper +.align 4 +_abi_test_clobber_v10_upper: + AARCH64_VALID_CALL_TARGET + fmov v10.d[1], xzr + ret + + +.globl _abi_test_clobber_v11_upper +.private_extern _abi_test_clobber_v11_upper +.align 4 +_abi_test_clobber_v11_upper: + AARCH64_VALID_CALL_TARGET + fmov v11.d[1], xzr + ret + + +.globl _abi_test_clobber_v12_upper +.private_extern _abi_test_clobber_v12_upper +.align 4 +_abi_test_clobber_v12_upper: + AARCH64_VALID_CALL_TARGET + fmov v12.d[1], xzr + ret + + +.globl _abi_test_clobber_v13_upper +.private_extern _abi_test_clobber_v13_upper +.align 4 +_abi_test_clobber_v13_upper: + AARCH64_VALID_CALL_TARGET + fmov v13.d[1], xzr + ret + + +.globl _abi_test_clobber_v14_upper +.private_extern _abi_test_clobber_v14_upper +.align 4 +_abi_test_clobber_v14_upper: + AARCH64_VALID_CALL_TARGET + fmov v14.d[1], xzr + ret + + +.globl _abi_test_clobber_v15_upper +.private_extern _abi_test_clobber_v15_upper +.align 4 +_abi_test_clobber_v15_upper: + AARCH64_VALID_CALL_TARGET + fmov v15.d[1], xzr + ret + +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__APPLE__) diff --git a/third_party/boringssl/gen/test_support/trampoline-armv8-linux.S b/third_party/boringssl/gen/test_support/trampoline-armv8-linux.S new file mode 100644 index 00000000..7bfb9062 --- /dev/null +++ b/third_party/boringssl/gen/test_support/trampoline-armv8-linux.S @@ -0,0 +1,748 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(__ELF__) +.text + +// abi_test_trampoline loads callee-saved registers from |state|, calls |func| +// with |argv|, then saves the callee-saved registers into |state|. It returns +// the result of |func|. The |unwind| argument is unused. +// uint64_t abi_test_trampoline(void (*func)(...), CallerState *state, +// const uint64_t *argv, size_t argc, +// uint64_t unwind); +.type abi_test_trampoline, %function +.globl abi_test_trampoline +.hidden abi_test_trampoline +.align 4 +abi_test_trampoline: +.Labi_test_trampoline_begin: + AARCH64_SIGN_LINK_REGISTER + // Stack layout (low to high addresses) + // x29,x30 (16 bytes) + // d8-d15 (64 bytes) + // x19-x28 (80 bytes) + // x1 (8 bytes) + // padding (8 bytes) + stp x29, x30, [sp, #-176]! + mov x29, sp + + // Saved callee-saved registers and |state|. + stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] + stp x19, x20, [sp, #80] + stp x21, x22, [sp, #96] + stp x23, x24, [sp, #112] + stp x25, x26, [sp, #128] + stp x27, x28, [sp, #144] + str x1, [sp, #160] + + // Load registers from |state|, with the exception of x29. x29 is the + // frame pointer and also callee-saved, but AAPCS64 allows platforms to + // mandate that x29 always point to a frame. iOS64 does so, which means + // we cannot fill x29 with entropy without violating ABI rules + // ourselves. x29 is tested separately below. + ldp d8, d9, [x1], #16 + ldp d10, d11, [x1], #16 + ldp d12, d13, [x1], #16 + ldp d14, d15, [x1], #16 + ldp x19, x20, [x1], #16 + ldp x21, x22, [x1], #16 + ldp x23, x24, [x1], #16 + ldp x25, x26, [x1], #16 + ldp x27, x28, [x1], #16 + + // Move parameters into temporary registers. + mov x9, x0 + mov x10, x2 + mov x11, x3 + + // Load parameters into registers. 
+ cbz x11, .Largs_done + ldr x0, [x10], #8 + subs x11, x11, #1 + b.eq .Largs_done + ldr x1, [x10], #8 + subs x11, x11, #1 + b.eq .Largs_done + ldr x2, [x10], #8 + subs x11, x11, #1 + b.eq .Largs_done + ldr x3, [x10], #8 + subs x11, x11, #1 + b.eq .Largs_done + ldr x4, [x10], #8 + subs x11, x11, #1 + b.eq .Largs_done + ldr x5, [x10], #8 + subs x11, x11, #1 + b.eq .Largs_done + ldr x6, [x10], #8 + subs x11, x11, #1 + b.eq .Largs_done + ldr x7, [x10], #8 + +.Largs_done: + blr x9 + + // Reload |state| and store registers. + ldr x1, [sp, #160] + stp d8, d9, [x1], #16 + stp d10, d11, [x1], #16 + stp d12, d13, [x1], #16 + stp d14, d15, [x1], #16 + stp x19, x20, [x1], #16 + stp x21, x22, [x1], #16 + stp x23, x24, [x1], #16 + stp x25, x26, [x1], #16 + stp x27, x28, [x1], #16 + + // |func| is required to preserve x29, the frame pointer. We cannot load + // random values into x29 (see comment above), so compare it against the + // expected value and zero the field of |state| if corrupted. + mov x9, sp + cmp x29, x9 + b.eq .Lx29_ok + str xzr, [x1] + +.Lx29_ok: + // Restore callee-saved registers. 
+ ldp d8, d9, [sp, #16] + ldp d10, d11, [sp, #32] + ldp d12, d13, [sp, #48] + ldp d14, d15, [sp, #64] + ldp x19, x20, [sp, #80] + ldp x21, x22, [sp, #96] + ldp x23, x24, [sp, #112] + ldp x25, x26, [sp, #128] + ldp x27, x28, [sp, #144] + + ldp x29, x30, [sp], #176 + AARCH64_VALIDATE_LINK_REGISTER + ret +.size abi_test_trampoline,.-abi_test_trampoline +.type abi_test_clobber_x0, %function +.globl abi_test_clobber_x0 +.hidden abi_test_clobber_x0 +.align 4 +abi_test_clobber_x0: + AARCH64_VALID_CALL_TARGET + mov x0, xzr + ret +.size abi_test_clobber_x0,.-abi_test_clobber_x0 +.type abi_test_clobber_x1, %function +.globl abi_test_clobber_x1 +.hidden abi_test_clobber_x1 +.align 4 +abi_test_clobber_x1: + AARCH64_VALID_CALL_TARGET + mov x1, xzr + ret +.size abi_test_clobber_x1,.-abi_test_clobber_x1 +.type abi_test_clobber_x2, %function +.globl abi_test_clobber_x2 +.hidden abi_test_clobber_x2 +.align 4 +abi_test_clobber_x2: + AARCH64_VALID_CALL_TARGET + mov x2, xzr + ret +.size abi_test_clobber_x2,.-abi_test_clobber_x2 +.type abi_test_clobber_x3, %function +.globl abi_test_clobber_x3 +.hidden abi_test_clobber_x3 +.align 4 +abi_test_clobber_x3: + AARCH64_VALID_CALL_TARGET + mov x3, xzr + ret +.size abi_test_clobber_x3,.-abi_test_clobber_x3 +.type abi_test_clobber_x4, %function +.globl abi_test_clobber_x4 +.hidden abi_test_clobber_x4 +.align 4 +abi_test_clobber_x4: + AARCH64_VALID_CALL_TARGET + mov x4, xzr + ret +.size abi_test_clobber_x4,.-abi_test_clobber_x4 +.type abi_test_clobber_x5, %function +.globl abi_test_clobber_x5 +.hidden abi_test_clobber_x5 +.align 4 +abi_test_clobber_x5: + AARCH64_VALID_CALL_TARGET + mov x5, xzr + ret +.size abi_test_clobber_x5,.-abi_test_clobber_x5 +.type abi_test_clobber_x6, %function +.globl abi_test_clobber_x6 +.hidden abi_test_clobber_x6 +.align 4 +abi_test_clobber_x6: + AARCH64_VALID_CALL_TARGET + mov x6, xzr + ret +.size abi_test_clobber_x6,.-abi_test_clobber_x6 +.type abi_test_clobber_x7, %function +.globl abi_test_clobber_x7 +.hidden 
abi_test_clobber_x7 +.align 4 +abi_test_clobber_x7: + AARCH64_VALID_CALL_TARGET + mov x7, xzr + ret +.size abi_test_clobber_x7,.-abi_test_clobber_x7 +.type abi_test_clobber_x8, %function +.globl abi_test_clobber_x8 +.hidden abi_test_clobber_x8 +.align 4 +abi_test_clobber_x8: + AARCH64_VALID_CALL_TARGET + mov x8, xzr + ret +.size abi_test_clobber_x8,.-abi_test_clobber_x8 +.type abi_test_clobber_x9, %function +.globl abi_test_clobber_x9 +.hidden abi_test_clobber_x9 +.align 4 +abi_test_clobber_x9: + AARCH64_VALID_CALL_TARGET + mov x9, xzr + ret +.size abi_test_clobber_x9,.-abi_test_clobber_x9 +.type abi_test_clobber_x10, %function +.globl abi_test_clobber_x10 +.hidden abi_test_clobber_x10 +.align 4 +abi_test_clobber_x10: + AARCH64_VALID_CALL_TARGET + mov x10, xzr + ret +.size abi_test_clobber_x10,.-abi_test_clobber_x10 +.type abi_test_clobber_x11, %function +.globl abi_test_clobber_x11 +.hidden abi_test_clobber_x11 +.align 4 +abi_test_clobber_x11: + AARCH64_VALID_CALL_TARGET + mov x11, xzr + ret +.size abi_test_clobber_x11,.-abi_test_clobber_x11 +.type abi_test_clobber_x12, %function +.globl abi_test_clobber_x12 +.hidden abi_test_clobber_x12 +.align 4 +abi_test_clobber_x12: + AARCH64_VALID_CALL_TARGET + mov x12, xzr + ret +.size abi_test_clobber_x12,.-abi_test_clobber_x12 +.type abi_test_clobber_x13, %function +.globl abi_test_clobber_x13 +.hidden abi_test_clobber_x13 +.align 4 +abi_test_clobber_x13: + AARCH64_VALID_CALL_TARGET + mov x13, xzr + ret +.size abi_test_clobber_x13,.-abi_test_clobber_x13 +.type abi_test_clobber_x14, %function +.globl abi_test_clobber_x14 +.hidden abi_test_clobber_x14 +.align 4 +abi_test_clobber_x14: + AARCH64_VALID_CALL_TARGET + mov x14, xzr + ret +.size abi_test_clobber_x14,.-abi_test_clobber_x14 +.type abi_test_clobber_x15, %function +.globl abi_test_clobber_x15 +.hidden abi_test_clobber_x15 +.align 4 +abi_test_clobber_x15: + AARCH64_VALID_CALL_TARGET + mov x15, xzr + ret +.size abi_test_clobber_x15,.-abi_test_clobber_x15 +.type 
abi_test_clobber_x16, %function +.globl abi_test_clobber_x16 +.hidden abi_test_clobber_x16 +.align 4 +abi_test_clobber_x16: + AARCH64_VALID_CALL_TARGET + mov x16, xzr + ret +.size abi_test_clobber_x16,.-abi_test_clobber_x16 +.type abi_test_clobber_x17, %function +.globl abi_test_clobber_x17 +.hidden abi_test_clobber_x17 +.align 4 +abi_test_clobber_x17: + AARCH64_VALID_CALL_TARGET + mov x17, xzr + ret +.size abi_test_clobber_x17,.-abi_test_clobber_x17 +.type abi_test_clobber_x19, %function +.globl abi_test_clobber_x19 +.hidden abi_test_clobber_x19 +.align 4 +abi_test_clobber_x19: + AARCH64_VALID_CALL_TARGET + mov x19, xzr + ret +.size abi_test_clobber_x19,.-abi_test_clobber_x19 +.type abi_test_clobber_x20, %function +.globl abi_test_clobber_x20 +.hidden abi_test_clobber_x20 +.align 4 +abi_test_clobber_x20: + AARCH64_VALID_CALL_TARGET + mov x20, xzr + ret +.size abi_test_clobber_x20,.-abi_test_clobber_x20 +.type abi_test_clobber_x21, %function +.globl abi_test_clobber_x21 +.hidden abi_test_clobber_x21 +.align 4 +abi_test_clobber_x21: + AARCH64_VALID_CALL_TARGET + mov x21, xzr + ret +.size abi_test_clobber_x21,.-abi_test_clobber_x21 +.type abi_test_clobber_x22, %function +.globl abi_test_clobber_x22 +.hidden abi_test_clobber_x22 +.align 4 +abi_test_clobber_x22: + AARCH64_VALID_CALL_TARGET + mov x22, xzr + ret +.size abi_test_clobber_x22,.-abi_test_clobber_x22 +.type abi_test_clobber_x23, %function +.globl abi_test_clobber_x23 +.hidden abi_test_clobber_x23 +.align 4 +abi_test_clobber_x23: + AARCH64_VALID_CALL_TARGET + mov x23, xzr + ret +.size abi_test_clobber_x23,.-abi_test_clobber_x23 +.type abi_test_clobber_x24, %function +.globl abi_test_clobber_x24 +.hidden abi_test_clobber_x24 +.align 4 +abi_test_clobber_x24: + AARCH64_VALID_CALL_TARGET + mov x24, xzr + ret +.size abi_test_clobber_x24,.-abi_test_clobber_x24 +.type abi_test_clobber_x25, %function +.globl abi_test_clobber_x25 +.hidden abi_test_clobber_x25 +.align 4 +abi_test_clobber_x25: + AARCH64_VALID_CALL_TARGET 
+ mov x25, xzr + ret +.size abi_test_clobber_x25,.-abi_test_clobber_x25 +.type abi_test_clobber_x26, %function +.globl abi_test_clobber_x26 +.hidden abi_test_clobber_x26 +.align 4 +abi_test_clobber_x26: + AARCH64_VALID_CALL_TARGET + mov x26, xzr + ret +.size abi_test_clobber_x26,.-abi_test_clobber_x26 +.type abi_test_clobber_x27, %function +.globl abi_test_clobber_x27 +.hidden abi_test_clobber_x27 +.align 4 +abi_test_clobber_x27: + AARCH64_VALID_CALL_TARGET + mov x27, xzr + ret +.size abi_test_clobber_x27,.-abi_test_clobber_x27 +.type abi_test_clobber_x28, %function +.globl abi_test_clobber_x28 +.hidden abi_test_clobber_x28 +.align 4 +abi_test_clobber_x28: + AARCH64_VALID_CALL_TARGET + mov x28, xzr + ret +.size abi_test_clobber_x28,.-abi_test_clobber_x28 +.type abi_test_clobber_x29, %function +.globl abi_test_clobber_x29 +.hidden abi_test_clobber_x29 +.align 4 +abi_test_clobber_x29: + AARCH64_VALID_CALL_TARGET + mov x29, xzr + ret +.size abi_test_clobber_x29,.-abi_test_clobber_x29 +.type abi_test_clobber_d0, %function +.globl abi_test_clobber_d0 +.hidden abi_test_clobber_d0 +.align 4 +abi_test_clobber_d0: + AARCH64_VALID_CALL_TARGET + fmov d0, xzr + ret +.size abi_test_clobber_d0,.-abi_test_clobber_d0 +.type abi_test_clobber_d1, %function +.globl abi_test_clobber_d1 +.hidden abi_test_clobber_d1 +.align 4 +abi_test_clobber_d1: + AARCH64_VALID_CALL_TARGET + fmov d1, xzr + ret +.size abi_test_clobber_d1,.-abi_test_clobber_d1 +.type abi_test_clobber_d2, %function +.globl abi_test_clobber_d2 +.hidden abi_test_clobber_d2 +.align 4 +abi_test_clobber_d2: + AARCH64_VALID_CALL_TARGET + fmov d2, xzr + ret +.size abi_test_clobber_d2,.-abi_test_clobber_d2 +.type abi_test_clobber_d3, %function +.globl abi_test_clobber_d3 +.hidden abi_test_clobber_d3 +.align 4 +abi_test_clobber_d3: + AARCH64_VALID_CALL_TARGET + fmov d3, xzr + ret +.size abi_test_clobber_d3,.-abi_test_clobber_d3 +.type abi_test_clobber_d4, %function +.globl abi_test_clobber_d4 +.hidden abi_test_clobber_d4 +.align 
4 +abi_test_clobber_d4: + AARCH64_VALID_CALL_TARGET + fmov d4, xzr + ret +.size abi_test_clobber_d4,.-abi_test_clobber_d4 +.type abi_test_clobber_d5, %function +.globl abi_test_clobber_d5 +.hidden abi_test_clobber_d5 +.align 4 +abi_test_clobber_d5: + AARCH64_VALID_CALL_TARGET + fmov d5, xzr + ret +.size abi_test_clobber_d5,.-abi_test_clobber_d5 +.type abi_test_clobber_d6, %function +.globl abi_test_clobber_d6 +.hidden abi_test_clobber_d6 +.align 4 +abi_test_clobber_d6: + AARCH64_VALID_CALL_TARGET + fmov d6, xzr + ret +.size abi_test_clobber_d6,.-abi_test_clobber_d6 +.type abi_test_clobber_d7, %function +.globl abi_test_clobber_d7 +.hidden abi_test_clobber_d7 +.align 4 +abi_test_clobber_d7: + AARCH64_VALID_CALL_TARGET + fmov d7, xzr + ret +.size abi_test_clobber_d7,.-abi_test_clobber_d7 +.type abi_test_clobber_d8, %function +.globl abi_test_clobber_d8 +.hidden abi_test_clobber_d8 +.align 4 +abi_test_clobber_d8: + AARCH64_VALID_CALL_TARGET + fmov d8, xzr + ret +.size abi_test_clobber_d8,.-abi_test_clobber_d8 +.type abi_test_clobber_d9, %function +.globl abi_test_clobber_d9 +.hidden abi_test_clobber_d9 +.align 4 +abi_test_clobber_d9: + AARCH64_VALID_CALL_TARGET + fmov d9, xzr + ret +.size abi_test_clobber_d9,.-abi_test_clobber_d9 +.type abi_test_clobber_d10, %function +.globl abi_test_clobber_d10 +.hidden abi_test_clobber_d10 +.align 4 +abi_test_clobber_d10: + AARCH64_VALID_CALL_TARGET + fmov d10, xzr + ret +.size abi_test_clobber_d10,.-abi_test_clobber_d10 +.type abi_test_clobber_d11, %function +.globl abi_test_clobber_d11 +.hidden abi_test_clobber_d11 +.align 4 +abi_test_clobber_d11: + AARCH64_VALID_CALL_TARGET + fmov d11, xzr + ret +.size abi_test_clobber_d11,.-abi_test_clobber_d11 +.type abi_test_clobber_d12, %function +.globl abi_test_clobber_d12 +.hidden abi_test_clobber_d12 +.align 4 +abi_test_clobber_d12: + AARCH64_VALID_CALL_TARGET + fmov d12, xzr + ret +.size abi_test_clobber_d12,.-abi_test_clobber_d12 +.type abi_test_clobber_d13, %function +.globl 
abi_test_clobber_d13 +.hidden abi_test_clobber_d13 +.align 4 +abi_test_clobber_d13: + AARCH64_VALID_CALL_TARGET + fmov d13, xzr + ret +.size abi_test_clobber_d13,.-abi_test_clobber_d13 +.type abi_test_clobber_d14, %function +.globl abi_test_clobber_d14 +.hidden abi_test_clobber_d14 +.align 4 +abi_test_clobber_d14: + AARCH64_VALID_CALL_TARGET + fmov d14, xzr + ret +.size abi_test_clobber_d14,.-abi_test_clobber_d14 +.type abi_test_clobber_d15, %function +.globl abi_test_clobber_d15 +.hidden abi_test_clobber_d15 +.align 4 +abi_test_clobber_d15: + AARCH64_VALID_CALL_TARGET + fmov d15, xzr + ret +.size abi_test_clobber_d15,.-abi_test_clobber_d15 +.type abi_test_clobber_d16, %function +.globl abi_test_clobber_d16 +.hidden abi_test_clobber_d16 +.align 4 +abi_test_clobber_d16: + AARCH64_VALID_CALL_TARGET + fmov d16, xzr + ret +.size abi_test_clobber_d16,.-abi_test_clobber_d16 +.type abi_test_clobber_d17, %function +.globl abi_test_clobber_d17 +.hidden abi_test_clobber_d17 +.align 4 +abi_test_clobber_d17: + AARCH64_VALID_CALL_TARGET + fmov d17, xzr + ret +.size abi_test_clobber_d17,.-abi_test_clobber_d17 +.type abi_test_clobber_d18, %function +.globl abi_test_clobber_d18 +.hidden abi_test_clobber_d18 +.align 4 +abi_test_clobber_d18: + AARCH64_VALID_CALL_TARGET + fmov d18, xzr + ret +.size abi_test_clobber_d18,.-abi_test_clobber_d18 +.type abi_test_clobber_d19, %function +.globl abi_test_clobber_d19 +.hidden abi_test_clobber_d19 +.align 4 +abi_test_clobber_d19: + AARCH64_VALID_CALL_TARGET + fmov d19, xzr + ret +.size abi_test_clobber_d19,.-abi_test_clobber_d19 +.type abi_test_clobber_d20, %function +.globl abi_test_clobber_d20 +.hidden abi_test_clobber_d20 +.align 4 +abi_test_clobber_d20: + AARCH64_VALID_CALL_TARGET + fmov d20, xzr + ret +.size abi_test_clobber_d20,.-abi_test_clobber_d20 +.type abi_test_clobber_d21, %function +.globl abi_test_clobber_d21 +.hidden abi_test_clobber_d21 +.align 4 +abi_test_clobber_d21: + AARCH64_VALID_CALL_TARGET + fmov d21, xzr + ret +.size 
abi_test_clobber_d21,.-abi_test_clobber_d21 +.type abi_test_clobber_d22, %function +.globl abi_test_clobber_d22 +.hidden abi_test_clobber_d22 +.align 4 +abi_test_clobber_d22: + AARCH64_VALID_CALL_TARGET + fmov d22, xzr + ret +.size abi_test_clobber_d22,.-abi_test_clobber_d22 +.type abi_test_clobber_d23, %function +.globl abi_test_clobber_d23 +.hidden abi_test_clobber_d23 +.align 4 +abi_test_clobber_d23: + AARCH64_VALID_CALL_TARGET + fmov d23, xzr + ret +.size abi_test_clobber_d23,.-abi_test_clobber_d23 +.type abi_test_clobber_d24, %function +.globl abi_test_clobber_d24 +.hidden abi_test_clobber_d24 +.align 4 +abi_test_clobber_d24: + AARCH64_VALID_CALL_TARGET + fmov d24, xzr + ret +.size abi_test_clobber_d24,.-abi_test_clobber_d24 +.type abi_test_clobber_d25, %function +.globl abi_test_clobber_d25 +.hidden abi_test_clobber_d25 +.align 4 +abi_test_clobber_d25: + AARCH64_VALID_CALL_TARGET + fmov d25, xzr + ret +.size abi_test_clobber_d25,.-abi_test_clobber_d25 +.type abi_test_clobber_d26, %function +.globl abi_test_clobber_d26 +.hidden abi_test_clobber_d26 +.align 4 +abi_test_clobber_d26: + AARCH64_VALID_CALL_TARGET + fmov d26, xzr + ret +.size abi_test_clobber_d26,.-abi_test_clobber_d26 +.type abi_test_clobber_d27, %function +.globl abi_test_clobber_d27 +.hidden abi_test_clobber_d27 +.align 4 +abi_test_clobber_d27: + AARCH64_VALID_CALL_TARGET + fmov d27, xzr + ret +.size abi_test_clobber_d27,.-abi_test_clobber_d27 +.type abi_test_clobber_d28, %function +.globl abi_test_clobber_d28 +.hidden abi_test_clobber_d28 +.align 4 +abi_test_clobber_d28: + AARCH64_VALID_CALL_TARGET + fmov d28, xzr + ret +.size abi_test_clobber_d28,.-abi_test_clobber_d28 +.type abi_test_clobber_d29, %function +.globl abi_test_clobber_d29 +.hidden abi_test_clobber_d29 +.align 4 +abi_test_clobber_d29: + AARCH64_VALID_CALL_TARGET + fmov d29, xzr + ret +.size abi_test_clobber_d29,.-abi_test_clobber_d29 +.type abi_test_clobber_d30, %function +.globl abi_test_clobber_d30 +.hidden abi_test_clobber_d30 
+.align 4 +abi_test_clobber_d30: + AARCH64_VALID_CALL_TARGET + fmov d30, xzr + ret +.size abi_test_clobber_d30,.-abi_test_clobber_d30 +.type abi_test_clobber_d31, %function +.globl abi_test_clobber_d31 +.hidden abi_test_clobber_d31 +.align 4 +abi_test_clobber_d31: + AARCH64_VALID_CALL_TARGET + fmov d31, xzr + ret +.size abi_test_clobber_d31,.-abi_test_clobber_d31 +.type abi_test_clobber_v8_upper, %function +.globl abi_test_clobber_v8_upper +.hidden abi_test_clobber_v8_upper +.align 4 +abi_test_clobber_v8_upper: + AARCH64_VALID_CALL_TARGET + fmov v8.d[1], xzr + ret +.size abi_test_clobber_v8_upper,.-abi_test_clobber_v8_upper +.type abi_test_clobber_v9_upper, %function +.globl abi_test_clobber_v9_upper +.hidden abi_test_clobber_v9_upper +.align 4 +abi_test_clobber_v9_upper: + AARCH64_VALID_CALL_TARGET + fmov v9.d[1], xzr + ret +.size abi_test_clobber_v9_upper,.-abi_test_clobber_v9_upper +.type abi_test_clobber_v10_upper, %function +.globl abi_test_clobber_v10_upper +.hidden abi_test_clobber_v10_upper +.align 4 +abi_test_clobber_v10_upper: + AARCH64_VALID_CALL_TARGET + fmov v10.d[1], xzr + ret +.size abi_test_clobber_v10_upper,.-abi_test_clobber_v10_upper +.type abi_test_clobber_v11_upper, %function +.globl abi_test_clobber_v11_upper +.hidden abi_test_clobber_v11_upper +.align 4 +abi_test_clobber_v11_upper: + AARCH64_VALID_CALL_TARGET + fmov v11.d[1], xzr + ret +.size abi_test_clobber_v11_upper,.-abi_test_clobber_v11_upper +.type abi_test_clobber_v12_upper, %function +.globl abi_test_clobber_v12_upper +.hidden abi_test_clobber_v12_upper +.align 4 +abi_test_clobber_v12_upper: + AARCH64_VALID_CALL_TARGET + fmov v12.d[1], xzr + ret +.size abi_test_clobber_v12_upper,.-abi_test_clobber_v12_upper +.type abi_test_clobber_v13_upper, %function +.globl abi_test_clobber_v13_upper +.hidden abi_test_clobber_v13_upper +.align 4 +abi_test_clobber_v13_upper: + AARCH64_VALID_CALL_TARGET + fmov v13.d[1], xzr + ret +.size abi_test_clobber_v13_upper,.-abi_test_clobber_v13_upper +.type 
abi_test_clobber_v14_upper, %function +.globl abi_test_clobber_v14_upper +.hidden abi_test_clobber_v14_upper +.align 4 +abi_test_clobber_v14_upper: + AARCH64_VALID_CALL_TARGET + fmov v14.d[1], xzr + ret +.size abi_test_clobber_v14_upper,.-abi_test_clobber_v14_upper +.type abi_test_clobber_v15_upper, %function +.globl abi_test_clobber_v15_upper +.hidden abi_test_clobber_v15_upper +.align 4 +abi_test_clobber_v15_upper: + AARCH64_VALID_CALL_TARGET + fmov v15.d[1], xzr + ret +.size abi_test_clobber_v15_upper,.-abi_test_clobber_v15_upper +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(__ELF__) diff --git a/third_party/boringssl/gen/test_support/trampoline-armv8-win.S b/third_party/boringssl/gen/test_support/trampoline-armv8-win.S new file mode 100644 index 00000000..8b82440e --- /dev/null +++ b/third_party/boringssl/gen/test_support/trampoline-armv8-win.S @@ -0,0 +1,748 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) && defined(_WIN32) +.text + +// abi_test_trampoline loads callee-saved registers from |state|, calls |func| +// with |argv|, then saves the callee-saved registers into |state|. It returns +// the result of |func|. The |unwind| argument is unused. +// uint64_t abi_test_trampoline(void (*func)(...), CallerState *state, +// const uint64_t *argv, size_t argc, +// uint64_t unwind); + +.globl abi_test_trampoline + +.align 4 +abi_test_trampoline: +Labi_test_trampoline_begin: + AARCH64_SIGN_LINK_REGISTER + // Stack layout (low to high addresses) + // x29,x30 (16 bytes) + // d8-d15 (64 bytes) + // x19-x28 (80 bytes) + // x1 (8 bytes) + // padding (8 bytes) + stp x29, x30, [sp, #-176]! + mov x29, sp + + // Saved callee-saved registers and |state|. 
+ stp d8, d9, [sp, #16] + stp d10, d11, [sp, #32] + stp d12, d13, [sp, #48] + stp d14, d15, [sp, #64] + stp x19, x20, [sp, #80] + stp x21, x22, [sp, #96] + stp x23, x24, [sp, #112] + stp x25, x26, [sp, #128] + stp x27, x28, [sp, #144] + str x1, [sp, #160] + + // Load registers from |state|, with the exception of x29. x29 is the + // frame pointer and also callee-saved, but AAPCS64 allows platforms to + // mandate that x29 always point to a frame. iOS64 does so, which means + // we cannot fill x29 with entropy without violating ABI rules + // ourselves. x29 is tested separately below. + ldp d8, d9, [x1], #16 + ldp d10, d11, [x1], #16 + ldp d12, d13, [x1], #16 + ldp d14, d15, [x1], #16 + ldp x19, x20, [x1], #16 + ldp x21, x22, [x1], #16 + ldp x23, x24, [x1], #16 + ldp x25, x26, [x1], #16 + ldp x27, x28, [x1], #16 + + // Move parameters into temporary registers. + mov x9, x0 + mov x10, x2 + mov x11, x3 + + // Load parameters into registers. + cbz x11, Largs_done + ldr x0, [x10], #8 + subs x11, x11, #1 + b.eq Largs_done + ldr x1, [x10], #8 + subs x11, x11, #1 + b.eq Largs_done + ldr x2, [x10], #8 + subs x11, x11, #1 + b.eq Largs_done + ldr x3, [x10], #8 + subs x11, x11, #1 + b.eq Largs_done + ldr x4, [x10], #8 + subs x11, x11, #1 + b.eq Largs_done + ldr x5, [x10], #8 + subs x11, x11, #1 + b.eq Largs_done + ldr x6, [x10], #8 + subs x11, x11, #1 + b.eq Largs_done + ldr x7, [x10], #8 + +Largs_done: + blr x9 + + // Reload |state| and store registers. + ldr x1, [sp, #160] + stp d8, d9, [x1], #16 + stp d10, d11, [x1], #16 + stp d12, d13, [x1], #16 + stp d14, d15, [x1], #16 + stp x19, x20, [x1], #16 + stp x21, x22, [x1], #16 + stp x23, x24, [x1], #16 + stp x25, x26, [x1], #16 + stp x27, x28, [x1], #16 + + // |func| is required to preserve x29, the frame pointer. We cannot load + // random values into x29 (see comment above), so compare it against the + // expected value and zero the field of |state| if corrupted. 
+ mov x9, sp + cmp x29, x9 + b.eq Lx29_ok + str xzr, [x1] + +Lx29_ok: + // Restore callee-saved registers. + ldp d8, d9, [sp, #16] + ldp d10, d11, [sp, #32] + ldp d12, d13, [sp, #48] + ldp d14, d15, [sp, #64] + ldp x19, x20, [sp, #80] + ldp x21, x22, [sp, #96] + ldp x23, x24, [sp, #112] + ldp x25, x26, [sp, #128] + ldp x27, x28, [sp, #144] + + ldp x29, x30, [sp], #176 + AARCH64_VALIDATE_LINK_REGISTER + ret + + +.globl abi_test_clobber_x0 + +.align 4 +abi_test_clobber_x0: + AARCH64_VALID_CALL_TARGET + mov x0, xzr + ret + + +.globl abi_test_clobber_x1 + +.align 4 +abi_test_clobber_x1: + AARCH64_VALID_CALL_TARGET + mov x1, xzr + ret + + +.globl abi_test_clobber_x2 + +.align 4 +abi_test_clobber_x2: + AARCH64_VALID_CALL_TARGET + mov x2, xzr + ret + + +.globl abi_test_clobber_x3 + +.align 4 +abi_test_clobber_x3: + AARCH64_VALID_CALL_TARGET + mov x3, xzr + ret + + +.globl abi_test_clobber_x4 + +.align 4 +abi_test_clobber_x4: + AARCH64_VALID_CALL_TARGET + mov x4, xzr + ret + + +.globl abi_test_clobber_x5 + +.align 4 +abi_test_clobber_x5: + AARCH64_VALID_CALL_TARGET + mov x5, xzr + ret + + +.globl abi_test_clobber_x6 + +.align 4 +abi_test_clobber_x6: + AARCH64_VALID_CALL_TARGET + mov x6, xzr + ret + + +.globl abi_test_clobber_x7 + +.align 4 +abi_test_clobber_x7: + AARCH64_VALID_CALL_TARGET + mov x7, xzr + ret + + +.globl abi_test_clobber_x8 + +.align 4 +abi_test_clobber_x8: + AARCH64_VALID_CALL_TARGET + mov x8, xzr + ret + + +.globl abi_test_clobber_x9 + +.align 4 +abi_test_clobber_x9: + AARCH64_VALID_CALL_TARGET + mov x9, xzr + ret + + +.globl abi_test_clobber_x10 + +.align 4 +abi_test_clobber_x10: + AARCH64_VALID_CALL_TARGET + mov x10, xzr + ret + + +.globl abi_test_clobber_x11 + +.align 4 +abi_test_clobber_x11: + AARCH64_VALID_CALL_TARGET + mov x11, xzr + ret + + +.globl abi_test_clobber_x12 + +.align 4 +abi_test_clobber_x12: + AARCH64_VALID_CALL_TARGET + mov x12, xzr + ret + + +.globl abi_test_clobber_x13 + +.align 4 +abi_test_clobber_x13: + AARCH64_VALID_CALL_TARGET + 
mov x13, xzr + ret + + +.globl abi_test_clobber_x14 + +.align 4 +abi_test_clobber_x14: + AARCH64_VALID_CALL_TARGET + mov x14, xzr + ret + + +.globl abi_test_clobber_x15 + +.align 4 +abi_test_clobber_x15: + AARCH64_VALID_CALL_TARGET + mov x15, xzr + ret + + +.globl abi_test_clobber_x16 + +.align 4 +abi_test_clobber_x16: + AARCH64_VALID_CALL_TARGET + mov x16, xzr + ret + + +.globl abi_test_clobber_x17 + +.align 4 +abi_test_clobber_x17: + AARCH64_VALID_CALL_TARGET + mov x17, xzr + ret + + +.globl abi_test_clobber_x19 + +.align 4 +abi_test_clobber_x19: + AARCH64_VALID_CALL_TARGET + mov x19, xzr + ret + + +.globl abi_test_clobber_x20 + +.align 4 +abi_test_clobber_x20: + AARCH64_VALID_CALL_TARGET + mov x20, xzr + ret + + +.globl abi_test_clobber_x21 + +.align 4 +abi_test_clobber_x21: + AARCH64_VALID_CALL_TARGET + mov x21, xzr + ret + + +.globl abi_test_clobber_x22 + +.align 4 +abi_test_clobber_x22: + AARCH64_VALID_CALL_TARGET + mov x22, xzr + ret + + +.globl abi_test_clobber_x23 + +.align 4 +abi_test_clobber_x23: + AARCH64_VALID_CALL_TARGET + mov x23, xzr + ret + + +.globl abi_test_clobber_x24 + +.align 4 +abi_test_clobber_x24: + AARCH64_VALID_CALL_TARGET + mov x24, xzr + ret + + +.globl abi_test_clobber_x25 + +.align 4 +abi_test_clobber_x25: + AARCH64_VALID_CALL_TARGET + mov x25, xzr + ret + + +.globl abi_test_clobber_x26 + +.align 4 +abi_test_clobber_x26: + AARCH64_VALID_CALL_TARGET + mov x26, xzr + ret + + +.globl abi_test_clobber_x27 + +.align 4 +abi_test_clobber_x27: + AARCH64_VALID_CALL_TARGET + mov x27, xzr + ret + + +.globl abi_test_clobber_x28 + +.align 4 +abi_test_clobber_x28: + AARCH64_VALID_CALL_TARGET + mov x28, xzr + ret + + +.globl abi_test_clobber_x29 + +.align 4 +abi_test_clobber_x29: + AARCH64_VALID_CALL_TARGET + mov x29, xzr + ret + + +.globl abi_test_clobber_d0 + +.align 4 +abi_test_clobber_d0: + AARCH64_VALID_CALL_TARGET + fmov d0, xzr + ret + + +.globl abi_test_clobber_d1 + +.align 4 +abi_test_clobber_d1: + AARCH64_VALID_CALL_TARGET + fmov d1, xzr + 
ret + + +.globl abi_test_clobber_d2 + +.align 4 +abi_test_clobber_d2: + AARCH64_VALID_CALL_TARGET + fmov d2, xzr + ret + + +.globl abi_test_clobber_d3 + +.align 4 +abi_test_clobber_d3: + AARCH64_VALID_CALL_TARGET + fmov d3, xzr + ret + + +.globl abi_test_clobber_d4 + +.align 4 +abi_test_clobber_d4: + AARCH64_VALID_CALL_TARGET + fmov d4, xzr + ret + + +.globl abi_test_clobber_d5 + +.align 4 +abi_test_clobber_d5: + AARCH64_VALID_CALL_TARGET + fmov d5, xzr + ret + + +.globl abi_test_clobber_d6 + +.align 4 +abi_test_clobber_d6: + AARCH64_VALID_CALL_TARGET + fmov d6, xzr + ret + + +.globl abi_test_clobber_d7 + +.align 4 +abi_test_clobber_d7: + AARCH64_VALID_CALL_TARGET + fmov d7, xzr + ret + + +.globl abi_test_clobber_d8 + +.align 4 +abi_test_clobber_d8: + AARCH64_VALID_CALL_TARGET + fmov d8, xzr + ret + + +.globl abi_test_clobber_d9 + +.align 4 +abi_test_clobber_d9: + AARCH64_VALID_CALL_TARGET + fmov d9, xzr + ret + + +.globl abi_test_clobber_d10 + +.align 4 +abi_test_clobber_d10: + AARCH64_VALID_CALL_TARGET + fmov d10, xzr + ret + + +.globl abi_test_clobber_d11 + +.align 4 +abi_test_clobber_d11: + AARCH64_VALID_CALL_TARGET + fmov d11, xzr + ret + + +.globl abi_test_clobber_d12 + +.align 4 +abi_test_clobber_d12: + AARCH64_VALID_CALL_TARGET + fmov d12, xzr + ret + + +.globl abi_test_clobber_d13 + +.align 4 +abi_test_clobber_d13: + AARCH64_VALID_CALL_TARGET + fmov d13, xzr + ret + + +.globl abi_test_clobber_d14 + +.align 4 +abi_test_clobber_d14: + AARCH64_VALID_CALL_TARGET + fmov d14, xzr + ret + + +.globl abi_test_clobber_d15 + +.align 4 +abi_test_clobber_d15: + AARCH64_VALID_CALL_TARGET + fmov d15, xzr + ret + + +.globl abi_test_clobber_d16 + +.align 4 +abi_test_clobber_d16: + AARCH64_VALID_CALL_TARGET + fmov d16, xzr + ret + + +.globl abi_test_clobber_d17 + +.align 4 +abi_test_clobber_d17: + AARCH64_VALID_CALL_TARGET + fmov d17, xzr + ret + + +.globl abi_test_clobber_d18 + +.align 4 +abi_test_clobber_d18: + AARCH64_VALID_CALL_TARGET + fmov d18, xzr + ret + + +.globl 
abi_test_clobber_d19 + +.align 4 +abi_test_clobber_d19: + AARCH64_VALID_CALL_TARGET + fmov d19, xzr + ret + + +.globl abi_test_clobber_d20 + +.align 4 +abi_test_clobber_d20: + AARCH64_VALID_CALL_TARGET + fmov d20, xzr + ret + + +.globl abi_test_clobber_d21 + +.align 4 +abi_test_clobber_d21: + AARCH64_VALID_CALL_TARGET + fmov d21, xzr + ret + + +.globl abi_test_clobber_d22 + +.align 4 +abi_test_clobber_d22: + AARCH64_VALID_CALL_TARGET + fmov d22, xzr + ret + + +.globl abi_test_clobber_d23 + +.align 4 +abi_test_clobber_d23: + AARCH64_VALID_CALL_TARGET + fmov d23, xzr + ret + + +.globl abi_test_clobber_d24 + +.align 4 +abi_test_clobber_d24: + AARCH64_VALID_CALL_TARGET + fmov d24, xzr + ret + + +.globl abi_test_clobber_d25 + +.align 4 +abi_test_clobber_d25: + AARCH64_VALID_CALL_TARGET + fmov d25, xzr + ret + + +.globl abi_test_clobber_d26 + +.align 4 +abi_test_clobber_d26: + AARCH64_VALID_CALL_TARGET + fmov d26, xzr + ret + + +.globl abi_test_clobber_d27 + +.align 4 +abi_test_clobber_d27: + AARCH64_VALID_CALL_TARGET + fmov d27, xzr + ret + + +.globl abi_test_clobber_d28 + +.align 4 +abi_test_clobber_d28: + AARCH64_VALID_CALL_TARGET + fmov d28, xzr + ret + + +.globl abi_test_clobber_d29 + +.align 4 +abi_test_clobber_d29: + AARCH64_VALID_CALL_TARGET + fmov d29, xzr + ret + + +.globl abi_test_clobber_d30 + +.align 4 +abi_test_clobber_d30: + AARCH64_VALID_CALL_TARGET + fmov d30, xzr + ret + + +.globl abi_test_clobber_d31 + +.align 4 +abi_test_clobber_d31: + AARCH64_VALID_CALL_TARGET + fmov d31, xzr + ret + + +.globl abi_test_clobber_v8_upper + +.align 4 +abi_test_clobber_v8_upper: + AARCH64_VALID_CALL_TARGET + fmov v8.d[1], xzr + ret + + +.globl abi_test_clobber_v9_upper + +.align 4 +abi_test_clobber_v9_upper: + AARCH64_VALID_CALL_TARGET + fmov v9.d[1], xzr + ret + + +.globl abi_test_clobber_v10_upper + +.align 4 +abi_test_clobber_v10_upper: + AARCH64_VALID_CALL_TARGET + fmov v10.d[1], xzr + ret + + +.globl abi_test_clobber_v11_upper + +.align 4 
+abi_test_clobber_v11_upper: + AARCH64_VALID_CALL_TARGET + fmov v11.d[1], xzr + ret + + +.globl abi_test_clobber_v12_upper + +.align 4 +abi_test_clobber_v12_upper: + AARCH64_VALID_CALL_TARGET + fmov v12.d[1], xzr + ret + + +.globl abi_test_clobber_v13_upper + +.align 4 +abi_test_clobber_v13_upper: + AARCH64_VALID_CALL_TARGET + fmov v13.d[1], xzr + ret + + +.globl abi_test_clobber_v14_upper + +.align 4 +abi_test_clobber_v14_upper: + AARCH64_VALID_CALL_TARGET + fmov v14.d[1], xzr + ret + + +.globl abi_test_clobber_v15_upper + +.align 4 +abi_test_clobber_v15_upper: + AARCH64_VALID_CALL_TARGET + fmov v15.d[1], xzr + ret + +#endif // !OPENSSL_NO_ASM && defined(OPENSSL_AARCH64) && defined(_WIN32) diff --git a/third_party/boringssl/gen/test_support/trampoline-x86-apple.S b/third_party/boringssl/gen/test_support/trampoline-x86-apple.S new file mode 100644 index 00000000..4065b9ad --- /dev/null +++ b/third_party/boringssl/gen/test_support/trampoline-x86-apple.S @@ -0,0 +1,168 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) +.text +.globl _abi_test_trampoline +.private_extern _abi_test_trampoline +.align 4 +_abi_test_trampoline: +L_abi_test_trampoline_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 24(%esp),%ecx + movl (%ecx),%esi + movl 4(%ecx),%edi + movl 8(%ecx),%ebx + movl 12(%ecx),%ebp + subl $44,%esp + movl 72(%esp),%eax + xorl %ecx,%ecx +L000loop: + cmpl 76(%esp),%ecx + jae L001loop_done + movl (%eax,%ecx,4),%edx + movl %edx,(%esp,%ecx,4) + addl $1,%ecx + jmp L000loop +L001loop_done: + call *64(%esp) + addl $44,%esp + movl 24(%esp),%ecx + movl %esi,(%ecx) + movl %edi,4(%ecx) + movl %ebx,8(%ecx) + movl %ebp,12(%ecx) + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.globl _abi_test_get_and_clear_direction_flag +.private_extern _abi_test_get_and_clear_direction_flag +.align 4 +_abi_test_get_and_clear_direction_flag: +L_abi_test_get_and_clear_direction_flag_begin: + pushfl + popl %eax + andl $1024,%eax + shrl $10,%eax + cld + ret +.globl _abi_test_set_direction_flag +.private_extern _abi_test_set_direction_flag +.align 4 +_abi_test_set_direction_flag: +L_abi_test_set_direction_flag_begin: + std + ret +.globl _abi_test_clobber_eax +.private_extern _abi_test_clobber_eax +.align 4 +_abi_test_clobber_eax: +L_abi_test_clobber_eax_begin: + xorl %eax,%eax + ret +.globl _abi_test_clobber_ebx +.private_extern _abi_test_clobber_ebx +.align 4 +_abi_test_clobber_ebx: +L_abi_test_clobber_ebx_begin: + xorl %ebx,%ebx + ret +.globl _abi_test_clobber_ecx +.private_extern _abi_test_clobber_ecx +.align 4 +_abi_test_clobber_ecx: +L_abi_test_clobber_ecx_begin: + xorl %ecx,%ecx + ret +.globl _abi_test_clobber_edx +.private_extern _abi_test_clobber_edx +.align 4 +_abi_test_clobber_edx: +L_abi_test_clobber_edx_begin: + xorl %edx,%edx + ret +.globl _abi_test_clobber_edi +.private_extern _abi_test_clobber_edi +.align 4 +_abi_test_clobber_edi: +L_abi_test_clobber_edi_begin: + xorl %edi,%edi + ret 
+.globl _abi_test_clobber_esi +.private_extern _abi_test_clobber_esi +.align 4 +_abi_test_clobber_esi: +L_abi_test_clobber_esi_begin: + xorl %esi,%esi + ret +.globl _abi_test_clobber_ebp +.private_extern _abi_test_clobber_ebp +.align 4 +_abi_test_clobber_ebp: +L_abi_test_clobber_ebp_begin: + xorl %ebp,%ebp + ret +.globl _abi_test_clobber_xmm0 +.private_extern _abi_test_clobber_xmm0 +.align 4 +_abi_test_clobber_xmm0: +L_abi_test_clobber_xmm0_begin: + pxor %xmm0,%xmm0 + ret +.globl _abi_test_clobber_xmm1 +.private_extern _abi_test_clobber_xmm1 +.align 4 +_abi_test_clobber_xmm1: +L_abi_test_clobber_xmm1_begin: + pxor %xmm1,%xmm1 + ret +.globl _abi_test_clobber_xmm2 +.private_extern _abi_test_clobber_xmm2 +.align 4 +_abi_test_clobber_xmm2: +L_abi_test_clobber_xmm2_begin: + pxor %xmm2,%xmm2 + ret +.globl _abi_test_clobber_xmm3 +.private_extern _abi_test_clobber_xmm3 +.align 4 +_abi_test_clobber_xmm3: +L_abi_test_clobber_xmm3_begin: + pxor %xmm3,%xmm3 + ret +.globl _abi_test_clobber_xmm4 +.private_extern _abi_test_clobber_xmm4 +.align 4 +_abi_test_clobber_xmm4: +L_abi_test_clobber_xmm4_begin: + pxor %xmm4,%xmm4 + ret +.globl _abi_test_clobber_xmm5 +.private_extern _abi_test_clobber_xmm5 +.align 4 +_abi_test_clobber_xmm5: +L_abi_test_clobber_xmm5_begin: + pxor %xmm5,%xmm5 + ret +.globl _abi_test_clobber_xmm6 +.private_extern _abi_test_clobber_xmm6 +.align 4 +_abi_test_clobber_xmm6: +L_abi_test_clobber_xmm6_begin: + pxor %xmm6,%xmm6 + ret +.globl _abi_test_clobber_xmm7 +.private_extern _abi_test_clobber_xmm7 +.align 4 +_abi_test_clobber_xmm7: +L_abi_test_clobber_xmm7_begin: + pxor %xmm7,%xmm7 + ret +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__APPLE__) diff --git a/third_party/boringssl/gen/test_support/trampoline-x86-linux.S b/third_party/boringssl/gen/test_support/trampoline-x86-linux.S new file mode 100644 index 00000000..3452c634 --- /dev/null +++ b/third_party/boringssl/gen/test_support/trampoline-x86-linux.S @@ -0,0 +1,204 @@ +// This file 
is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) +.text +.globl abi_test_trampoline +.hidden abi_test_trampoline +.type abi_test_trampoline,@function +.align 16 +abi_test_trampoline: +.L_abi_test_trampoline_begin: + pushl %ebp + pushl %ebx + pushl %esi + pushl %edi + movl 24(%esp),%ecx + movl (%ecx),%esi + movl 4(%ecx),%edi + movl 8(%ecx),%ebx + movl 12(%ecx),%ebp + subl $44,%esp + movl 72(%esp),%eax + xorl %ecx,%ecx +.L000loop: + cmpl 76(%esp),%ecx + jae .L001loop_done + movl (%eax,%ecx,4),%edx + movl %edx,(%esp,%ecx,4) + addl $1,%ecx + jmp .L000loop +.L001loop_done: + call *64(%esp) + addl $44,%esp + movl 24(%esp),%ecx + movl %esi,(%ecx) + movl %edi,4(%ecx) + movl %ebx,8(%ecx) + movl %ebp,12(%ecx) + popl %edi + popl %esi + popl %ebx + popl %ebp + ret +.size abi_test_trampoline,.-.L_abi_test_trampoline_begin +.globl abi_test_get_and_clear_direction_flag +.hidden abi_test_get_and_clear_direction_flag +.type abi_test_get_and_clear_direction_flag,@function +.align 16 +abi_test_get_and_clear_direction_flag: +.L_abi_test_get_and_clear_direction_flag_begin: + pushfl + popl %eax + andl $1024,%eax + shrl $10,%eax + cld + ret +.size abi_test_get_and_clear_direction_flag,.-.L_abi_test_get_and_clear_direction_flag_begin +.globl abi_test_set_direction_flag +.hidden abi_test_set_direction_flag +.type abi_test_set_direction_flag,@function +.align 16 +abi_test_set_direction_flag: +.L_abi_test_set_direction_flag_begin: + std + ret +.size abi_test_set_direction_flag,.-.L_abi_test_set_direction_flag_begin +.globl abi_test_clobber_eax +.hidden abi_test_clobber_eax +.type abi_test_clobber_eax,@function +.align 16 +abi_test_clobber_eax: +.L_abi_test_clobber_eax_begin: + xorl %eax,%eax + ret +.size abi_test_clobber_eax,.-.L_abi_test_clobber_eax_begin +.globl abi_test_clobber_ebx +.hidden abi_test_clobber_ebx +.type abi_test_clobber_ebx,@function 
+.align 16 +abi_test_clobber_ebx: +.L_abi_test_clobber_ebx_begin: + xorl %ebx,%ebx + ret +.size abi_test_clobber_ebx,.-.L_abi_test_clobber_ebx_begin +.globl abi_test_clobber_ecx +.hidden abi_test_clobber_ecx +.type abi_test_clobber_ecx,@function +.align 16 +abi_test_clobber_ecx: +.L_abi_test_clobber_ecx_begin: + xorl %ecx,%ecx + ret +.size abi_test_clobber_ecx,.-.L_abi_test_clobber_ecx_begin +.globl abi_test_clobber_edx +.hidden abi_test_clobber_edx +.type abi_test_clobber_edx,@function +.align 16 +abi_test_clobber_edx: +.L_abi_test_clobber_edx_begin: + xorl %edx,%edx + ret +.size abi_test_clobber_edx,.-.L_abi_test_clobber_edx_begin +.globl abi_test_clobber_edi +.hidden abi_test_clobber_edi +.type abi_test_clobber_edi,@function +.align 16 +abi_test_clobber_edi: +.L_abi_test_clobber_edi_begin: + xorl %edi,%edi + ret +.size abi_test_clobber_edi,.-.L_abi_test_clobber_edi_begin +.globl abi_test_clobber_esi +.hidden abi_test_clobber_esi +.type abi_test_clobber_esi,@function +.align 16 +abi_test_clobber_esi: +.L_abi_test_clobber_esi_begin: + xorl %esi,%esi + ret +.size abi_test_clobber_esi,.-.L_abi_test_clobber_esi_begin +.globl abi_test_clobber_ebp +.hidden abi_test_clobber_ebp +.type abi_test_clobber_ebp,@function +.align 16 +abi_test_clobber_ebp: +.L_abi_test_clobber_ebp_begin: + xorl %ebp,%ebp + ret +.size abi_test_clobber_ebp,.-.L_abi_test_clobber_ebp_begin +.globl abi_test_clobber_xmm0 +.hidden abi_test_clobber_xmm0 +.type abi_test_clobber_xmm0,@function +.align 16 +abi_test_clobber_xmm0: +.L_abi_test_clobber_xmm0_begin: + pxor %xmm0,%xmm0 + ret +.size abi_test_clobber_xmm0,.-.L_abi_test_clobber_xmm0_begin +.globl abi_test_clobber_xmm1 +.hidden abi_test_clobber_xmm1 +.type abi_test_clobber_xmm1,@function +.align 16 +abi_test_clobber_xmm1: +.L_abi_test_clobber_xmm1_begin: + pxor %xmm1,%xmm1 + ret +.size abi_test_clobber_xmm1,.-.L_abi_test_clobber_xmm1_begin +.globl abi_test_clobber_xmm2 +.hidden abi_test_clobber_xmm2 +.type abi_test_clobber_xmm2,@function +.align 16 
+abi_test_clobber_xmm2: +.L_abi_test_clobber_xmm2_begin: + pxor %xmm2,%xmm2 + ret +.size abi_test_clobber_xmm2,.-.L_abi_test_clobber_xmm2_begin +.globl abi_test_clobber_xmm3 +.hidden abi_test_clobber_xmm3 +.type abi_test_clobber_xmm3,@function +.align 16 +abi_test_clobber_xmm3: +.L_abi_test_clobber_xmm3_begin: + pxor %xmm3,%xmm3 + ret +.size abi_test_clobber_xmm3,.-.L_abi_test_clobber_xmm3_begin +.globl abi_test_clobber_xmm4 +.hidden abi_test_clobber_xmm4 +.type abi_test_clobber_xmm4,@function +.align 16 +abi_test_clobber_xmm4: +.L_abi_test_clobber_xmm4_begin: + pxor %xmm4,%xmm4 + ret +.size abi_test_clobber_xmm4,.-.L_abi_test_clobber_xmm4_begin +.globl abi_test_clobber_xmm5 +.hidden abi_test_clobber_xmm5 +.type abi_test_clobber_xmm5,@function +.align 16 +abi_test_clobber_xmm5: +.L_abi_test_clobber_xmm5_begin: + pxor %xmm5,%xmm5 + ret +.size abi_test_clobber_xmm5,.-.L_abi_test_clobber_xmm5_begin +.globl abi_test_clobber_xmm6 +.hidden abi_test_clobber_xmm6 +.type abi_test_clobber_xmm6,@function +.align 16 +abi_test_clobber_xmm6: +.L_abi_test_clobber_xmm6_begin: + pxor %xmm6,%xmm6 + ret +.size abi_test_clobber_xmm6,.-.L_abi_test_clobber_xmm6_begin +.globl abi_test_clobber_xmm7 +.hidden abi_test_clobber_xmm7 +.type abi_test_clobber_xmm7,@function +.align 16 +abi_test_clobber_xmm7: +.L_abi_test_clobber_xmm7_begin: + pxor %xmm7,%xmm7 + ret +.size abi_test_clobber_xmm7,.-.L_abi_test_clobber_xmm7_begin +#endif // !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) && defined(__ELF__) diff --git a/third_party/boringssl/gen/test_support/trampoline-x86-win.asm b/third_party/boringssl/gen/test_support/trampoline-x86-win.asm new file mode 100644 index 00000000..e8c85bfc --- /dev/null +++ b/third_party/boringssl/gen/test_support/trampoline-x86-win.asm @@ -0,0 +1,161 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_internal_x86_win_asm.inc" +%endif +%ifidn __OUTPUT_FORMAT__, win32 +%ifidn __OUTPUT_FORMAT__,obj +section code use32 class=code align=64 +%elifidn __OUTPUT_FORMAT__,win32 +$@feat.00 equ 1 +section .text code align=64 +%else +section .text code +%endif +global _abi_test_trampoline +align 16 +_abi_test_trampoline: +L$_abi_test_trampoline_begin: + push ebp + push ebx + push esi + push edi + mov ecx,DWORD [24+esp] + mov esi,DWORD [ecx] + mov edi,DWORD [4+ecx] + mov ebx,DWORD [8+ecx] + mov ebp,DWORD [12+ecx] + sub esp,44 + mov eax,DWORD [72+esp] + xor ecx,ecx +L$000loop: + cmp ecx,DWORD [76+esp] + jae NEAR L$001loop_done + mov edx,DWORD [ecx*4+eax] + mov DWORD [ecx*4+esp],edx + add ecx,1 + jmp NEAR L$000loop +L$001loop_done: + call DWORD [64+esp] + add esp,44 + mov ecx,DWORD [24+esp] + mov DWORD [ecx],esi + mov DWORD [4+ecx],edi + mov DWORD [8+ecx],ebx + mov DWORD [12+ecx],ebp + pop edi + pop esi + pop ebx + pop ebp + ret +global _abi_test_get_and_clear_direction_flag +align 16 +_abi_test_get_and_clear_direction_flag: +L$_abi_test_get_and_clear_direction_flag_begin: + pushfd + pop eax + and eax,1024 + shr eax,10 + cld + ret +global _abi_test_set_direction_flag +align 16 +_abi_test_set_direction_flag: +L$_abi_test_set_direction_flag_begin: + std + ret +global _abi_test_clobber_eax +align 16 +_abi_test_clobber_eax: +L$_abi_test_clobber_eax_begin: + xor eax,eax + ret +global _abi_test_clobber_ebx +align 16 +_abi_test_clobber_ebx: +L$_abi_test_clobber_ebx_begin: + xor ebx,ebx + ret +global _abi_test_clobber_ecx +align 16 +_abi_test_clobber_ecx: +L$_abi_test_clobber_ecx_begin: + xor ecx,ecx + ret +global _abi_test_clobber_edx +align 16 +_abi_test_clobber_edx: +L$_abi_test_clobber_edx_begin: + xor edx,edx + ret +global _abi_test_clobber_edi +align 16 +_abi_test_clobber_edi: +L$_abi_test_clobber_edi_begin: + xor edi,edi + ret +global _abi_test_clobber_esi +align 16 +_abi_test_clobber_esi: 
+L$_abi_test_clobber_esi_begin: + xor esi,esi + ret +global _abi_test_clobber_ebp +align 16 +_abi_test_clobber_ebp: +L$_abi_test_clobber_ebp_begin: + xor ebp,ebp + ret +global _abi_test_clobber_xmm0 +align 16 +_abi_test_clobber_xmm0: +L$_abi_test_clobber_xmm0_begin: + pxor xmm0,xmm0 + ret +global _abi_test_clobber_xmm1 +align 16 +_abi_test_clobber_xmm1: +L$_abi_test_clobber_xmm1_begin: + pxor xmm1,xmm1 + ret +global _abi_test_clobber_xmm2 +align 16 +_abi_test_clobber_xmm2: +L$_abi_test_clobber_xmm2_begin: + pxor xmm2,xmm2 + ret +global _abi_test_clobber_xmm3 +align 16 +_abi_test_clobber_xmm3: +L$_abi_test_clobber_xmm3_begin: + pxor xmm3,xmm3 + ret +global _abi_test_clobber_xmm4 +align 16 +_abi_test_clobber_xmm4: +L$_abi_test_clobber_xmm4_begin: + pxor xmm4,xmm4 + ret +global _abi_test_clobber_xmm5 +align 16 +_abi_test_clobber_xmm5: +L$_abi_test_clobber_xmm5_begin: + pxor xmm5,xmm5 + ret +global _abi_test_clobber_xmm6 +align 16 +_abi_test_clobber_xmm6: +L$_abi_test_clobber_xmm6_begin: + pxor xmm6,xmm6 + ret +global _abi_test_clobber_xmm7 +align 16 +_abi_test_clobber_xmm7: +L$_abi_test_clobber_xmm7_begin: + pxor xmm7,xmm7 + ret +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/third_party/boringssl/gen/test_support/trampoline-x86_64-apple.S b/third_party/boringssl/gen/test_support/trampoline-x86_64-apple.S new file mode 100644 index 00000000..5d60e67a --- /dev/null +++ b/third_party/boringssl/gen/test_support/trampoline-x86_64-apple.S @@ -0,0 +1,544 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. Do not edit by hand. 
+ +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__APPLE__) +.text + + + + + + + + +.globl _abi_test_trampoline +.private_extern _abi_test_trampoline + +.p2align 4 +_abi_test_trampoline: + + +_CET_ENDBR + + + + + + + + + + subq $120,%rsp + + + movq %r8,48(%rsp) + movq %rbx,64(%rsp) + + + movq %rbp,72(%rsp) + + + movq %r12,80(%rsp) + + + movq %r13,88(%rsp) + + + movq %r14,96(%rsp) + + + movq %r15,104(%rsp) + + + + movq 0(%rsi),%rbx + movq 8(%rsi),%rbp + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + movq 32(%rsi),%r14 + movq 40(%rsi),%r15 + + movq %rdi,32(%rsp) + movq %rsi,40(%rsp) + + + + + movq %rdx,%r10 + movq %rcx,%r11 + decq %r11 + js L$args_done + movq (%r10),%rdi + addq $8,%r10 + decq %r11 + js L$args_done + movq (%r10),%rsi + addq $8,%r10 + decq %r11 + js L$args_done + movq (%r10),%rdx + addq $8,%r10 + decq %r11 + js L$args_done + movq (%r10),%rcx + addq $8,%r10 + decq %r11 + js L$args_done + movq (%r10),%r8 + addq $8,%r10 + decq %r11 + js L$args_done + movq (%r10),%r9 + addq $8,%r10 + leaq 0(%rsp),%rax +L$args_loop: + decq %r11 + js L$args_done + + + + + + + movq %r11,56(%rsp) + movq (%r10),%r11 + movq %r11,(%rax) + movq 56(%rsp),%r11 + + addq $8,%r10 + addq $8,%rax + jmp L$args_loop + +L$args_done: + movq 32(%rsp),%rax + movq 48(%rsp),%r10 + testq %r10,%r10 + jz L$no_unwind + + + pushfq + orq $0x100,0(%rsp) + popfq + + + + nop +.globl _abi_test_unwind_start +.private_extern _abi_test_unwind_start +_abi_test_unwind_start: + + call *%rax +.globl _abi_test_unwind_return +.private_extern _abi_test_unwind_return +_abi_test_unwind_return: + + + + + pushfq + andq $-0x101,0(%rsp) + popfq +.globl _abi_test_unwind_stop +.private_extern _abi_test_unwind_stop +_abi_test_unwind_stop: + + jmp L$call_done + +L$no_unwind: + call *%rax + +L$call_done: + + movq 40(%rsp),%rsi + movq %rbx,0(%rsi) + movq %rbp,8(%rsi) + movq %r12,16(%rsi) + movq %r13,24(%rsi) + movq %r14,32(%rsi) + movq %r15,40(%rsi) + movq 64(%rsp),%rbx + + movq 72(%rsp),%rbp + + 
movq 80(%rsp),%r12 + + movq 88(%rsp),%r13 + + movq 96(%rsp),%r14 + + movq 104(%rsp),%r15 + + addq $120,%rsp + + + + ret + + + +.globl _abi_test_clobber_rax +.private_extern _abi_test_clobber_rax + +.p2align 4 +_abi_test_clobber_rax: +_CET_ENDBR + xorq %rax,%rax + ret + +.globl _abi_test_clobber_rbx +.private_extern _abi_test_clobber_rbx + +.p2align 4 +_abi_test_clobber_rbx: +_CET_ENDBR + xorq %rbx,%rbx + ret + +.globl _abi_test_clobber_rcx +.private_extern _abi_test_clobber_rcx + +.p2align 4 +_abi_test_clobber_rcx: +_CET_ENDBR + xorq %rcx,%rcx + ret + +.globl _abi_test_clobber_rdx +.private_extern _abi_test_clobber_rdx + +.p2align 4 +_abi_test_clobber_rdx: +_CET_ENDBR + xorq %rdx,%rdx + ret + +.globl _abi_test_clobber_rdi +.private_extern _abi_test_clobber_rdi + +.p2align 4 +_abi_test_clobber_rdi: +_CET_ENDBR + xorq %rdi,%rdi + ret + +.globl _abi_test_clobber_rsi +.private_extern _abi_test_clobber_rsi + +.p2align 4 +_abi_test_clobber_rsi: +_CET_ENDBR + xorq %rsi,%rsi + ret + +.globl _abi_test_clobber_rbp +.private_extern _abi_test_clobber_rbp + +.p2align 4 +_abi_test_clobber_rbp: +_CET_ENDBR + xorq %rbp,%rbp + ret + +.globl _abi_test_clobber_r8 +.private_extern _abi_test_clobber_r8 + +.p2align 4 +_abi_test_clobber_r8: +_CET_ENDBR + xorq %r8,%r8 + ret + +.globl _abi_test_clobber_r9 +.private_extern _abi_test_clobber_r9 + +.p2align 4 +_abi_test_clobber_r9: +_CET_ENDBR + xorq %r9,%r9 + ret + +.globl _abi_test_clobber_r10 +.private_extern _abi_test_clobber_r10 + +.p2align 4 +_abi_test_clobber_r10: +_CET_ENDBR + xorq %r10,%r10 + ret + +.globl _abi_test_clobber_r11 +.private_extern _abi_test_clobber_r11 + +.p2align 4 +_abi_test_clobber_r11: +_CET_ENDBR + xorq %r11,%r11 + ret + +.globl _abi_test_clobber_r12 +.private_extern _abi_test_clobber_r12 + +.p2align 4 +_abi_test_clobber_r12: +_CET_ENDBR + xorq %r12,%r12 + ret + +.globl _abi_test_clobber_r13 +.private_extern _abi_test_clobber_r13 + +.p2align 4 +_abi_test_clobber_r13: +_CET_ENDBR + xorq %r13,%r13 + ret + +.globl 
_abi_test_clobber_r14 +.private_extern _abi_test_clobber_r14 + +.p2align 4 +_abi_test_clobber_r14: +_CET_ENDBR + xorq %r14,%r14 + ret + +.globl _abi_test_clobber_r15 +.private_extern _abi_test_clobber_r15 + +.p2align 4 +_abi_test_clobber_r15: +_CET_ENDBR + xorq %r15,%r15 + ret + +.globl _abi_test_clobber_xmm0 +.private_extern _abi_test_clobber_xmm0 + +.p2align 4 +_abi_test_clobber_xmm0: +_CET_ENDBR + pxor %xmm0,%xmm0 + ret + +.globl _abi_test_clobber_xmm1 +.private_extern _abi_test_clobber_xmm1 + +.p2align 4 +_abi_test_clobber_xmm1: +_CET_ENDBR + pxor %xmm1,%xmm1 + ret + +.globl _abi_test_clobber_xmm2 +.private_extern _abi_test_clobber_xmm2 + +.p2align 4 +_abi_test_clobber_xmm2: +_CET_ENDBR + pxor %xmm2,%xmm2 + ret + +.globl _abi_test_clobber_xmm3 +.private_extern _abi_test_clobber_xmm3 + +.p2align 4 +_abi_test_clobber_xmm3: +_CET_ENDBR + pxor %xmm3,%xmm3 + ret + +.globl _abi_test_clobber_xmm4 +.private_extern _abi_test_clobber_xmm4 + +.p2align 4 +_abi_test_clobber_xmm4: +_CET_ENDBR + pxor %xmm4,%xmm4 + ret + +.globl _abi_test_clobber_xmm5 +.private_extern _abi_test_clobber_xmm5 + +.p2align 4 +_abi_test_clobber_xmm5: +_CET_ENDBR + pxor %xmm5,%xmm5 + ret + +.globl _abi_test_clobber_xmm6 +.private_extern _abi_test_clobber_xmm6 + +.p2align 4 +_abi_test_clobber_xmm6: +_CET_ENDBR + pxor %xmm6,%xmm6 + ret + +.globl _abi_test_clobber_xmm7 +.private_extern _abi_test_clobber_xmm7 + +.p2align 4 +_abi_test_clobber_xmm7: +_CET_ENDBR + pxor %xmm7,%xmm7 + ret + +.globl _abi_test_clobber_xmm8 +.private_extern _abi_test_clobber_xmm8 + +.p2align 4 +_abi_test_clobber_xmm8: +_CET_ENDBR + pxor %xmm8,%xmm8 + ret + +.globl _abi_test_clobber_xmm9 +.private_extern _abi_test_clobber_xmm9 + +.p2align 4 +_abi_test_clobber_xmm9: +_CET_ENDBR + pxor %xmm9,%xmm9 + ret + +.globl _abi_test_clobber_xmm10 +.private_extern _abi_test_clobber_xmm10 + +.p2align 4 +_abi_test_clobber_xmm10: +_CET_ENDBR + pxor %xmm10,%xmm10 + ret + +.globl _abi_test_clobber_xmm11 +.private_extern _abi_test_clobber_xmm11 + 
+.p2align 4 +_abi_test_clobber_xmm11: +_CET_ENDBR + pxor %xmm11,%xmm11 + ret + +.globl _abi_test_clobber_xmm12 +.private_extern _abi_test_clobber_xmm12 + +.p2align 4 +_abi_test_clobber_xmm12: +_CET_ENDBR + pxor %xmm12,%xmm12 + ret + +.globl _abi_test_clobber_xmm13 +.private_extern _abi_test_clobber_xmm13 + +.p2align 4 +_abi_test_clobber_xmm13: +_CET_ENDBR + pxor %xmm13,%xmm13 + ret + +.globl _abi_test_clobber_xmm14 +.private_extern _abi_test_clobber_xmm14 + +.p2align 4 +_abi_test_clobber_xmm14: +_CET_ENDBR + pxor %xmm14,%xmm14 + ret + +.globl _abi_test_clobber_xmm15 +.private_extern _abi_test_clobber_xmm15 + +.p2align 4 +_abi_test_clobber_xmm15: +_CET_ENDBR + pxor %xmm15,%xmm15 + ret + + + + +.globl _abi_test_bad_unwind_wrong_register +.private_extern _abi_test_bad_unwind_wrong_register + +.p2align 4 +_abi_test_bad_unwind_wrong_register: + + +_CET_ENDBR + pushq %r12 + + + + + + + nop + popq %r12 + + ret + + + + + + + +.globl _abi_test_bad_unwind_temporary +.private_extern _abi_test_bad_unwind_temporary + +.p2align 4 +_abi_test_bad_unwind_temporary: + + +_CET_ENDBR + pushq %r12 + + + + + movq %r12,%rax + incq %rax + movq %rax,(%rsp) + + + + movq %r12,(%rsp) + + + popq %r12 + + ret + + + + + + + +.globl _abi_test_get_and_clear_direction_flag +.private_extern _abi_test_get_and_clear_direction_flag + +_abi_test_get_and_clear_direction_flag: +_CET_ENDBR + pushfq + popq %rax + andq $0x400,%rax + shrq $10,%rax + cld + ret + + + + +.globl _abi_test_set_direction_flag +.private_extern _abi_test_set_direction_flag + +_abi_test_set_direction_flag: +_CET_ENDBR + std + ret + +#endif diff --git a/third_party/boringssl/gen/test_support/trampoline-x86_64-linux.S b/third_party/boringssl/gen/test_support/trampoline-x86_64-linux.S new file mode 100644 index 00000000..13d08a30 --- /dev/null +++ b/third_party/boringssl/gen/test_support/trampoline-x86_64-linux.S @@ -0,0 +1,548 @@ +// This file is generated from a similarly-named Perl script in the BoringSSL +// source tree. 
Do not edit by hand. + +#include + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && defined(__ELF__) +.text + + + + + + + + +.globl abi_test_trampoline +.hidden abi_test_trampoline +.type abi_test_trampoline, @function +.align 16 +abi_test_trampoline: +.cfi_startproc + +_CET_ENDBR + + + + + + + + + + subq $120,%rsp +.cfi_adjust_cfa_offset 120 + + movq %r8,48(%rsp) + movq %rbx,64(%rsp) +.cfi_offset rbx, -64 + + movq %rbp,72(%rsp) +.cfi_offset rbp, -56 + + movq %r12,80(%rsp) +.cfi_offset r12, -48 + + movq %r13,88(%rsp) +.cfi_offset r13, -40 + + movq %r14,96(%rsp) +.cfi_offset r14, -32 + + movq %r15,104(%rsp) +.cfi_offset r15, -24 + + + movq 0(%rsi),%rbx + movq 8(%rsi),%rbp + movq 16(%rsi),%r12 + movq 24(%rsi),%r13 + movq 32(%rsi),%r14 + movq 40(%rsi),%r15 + + movq %rdi,32(%rsp) + movq %rsi,40(%rsp) + + + + + movq %rdx,%r10 + movq %rcx,%r11 + decq %r11 + js .Largs_done + movq (%r10),%rdi + addq $8,%r10 + decq %r11 + js .Largs_done + movq (%r10),%rsi + addq $8,%r10 + decq %r11 + js .Largs_done + movq (%r10),%rdx + addq $8,%r10 + decq %r11 + js .Largs_done + movq (%r10),%rcx + addq $8,%r10 + decq %r11 + js .Largs_done + movq (%r10),%r8 + addq $8,%r10 + decq %r11 + js .Largs_done + movq (%r10),%r9 + addq $8,%r10 + leaq 0(%rsp),%rax +.Largs_loop: + decq %r11 + js .Largs_done + + + + + + + movq %r11,56(%rsp) + movq (%r10),%r11 + movq %r11,(%rax) + movq 56(%rsp),%r11 + + addq $8,%r10 + addq $8,%rax + jmp .Largs_loop + +.Largs_done: + movq 32(%rsp),%rax + movq 48(%rsp),%r10 + testq %r10,%r10 + jz .Lno_unwind + + + pushfq + orq $0x100,0(%rsp) + popfq + + + + nop +.globl abi_test_unwind_start +.hidden abi_test_unwind_start +abi_test_unwind_start: + + call *%rax +.globl abi_test_unwind_return +.hidden abi_test_unwind_return +abi_test_unwind_return: + + + + + pushfq + andq $-0x101,0(%rsp) + popfq +.globl abi_test_unwind_stop +.hidden abi_test_unwind_stop +abi_test_unwind_stop: + + jmp .Lcall_done + +.Lno_unwind: + call *%rax + +.Lcall_done: + + movq 40(%rsp),%rsi + 
movq %rbx,0(%rsi) + movq %rbp,8(%rsi) + movq %r12,16(%rsi) + movq %r13,24(%rsi) + movq %r14,32(%rsi) + movq %r15,40(%rsi) + movq 64(%rsp),%rbx +.cfi_restore rbx + movq 72(%rsp),%rbp +.cfi_restore rbp + movq 80(%rsp),%r12 +.cfi_restore r12 + movq 88(%rsp),%r13 +.cfi_restore r13 + movq 96(%rsp),%r14 +.cfi_restore r14 + movq 104(%rsp),%r15 +.cfi_restore r15 + addq $120,%rsp +.cfi_adjust_cfa_offset -120 + + + ret +.cfi_endproc + +.size abi_test_trampoline,.-abi_test_trampoline +.globl abi_test_clobber_rax +.hidden abi_test_clobber_rax +.type abi_test_clobber_rax, @function +.align 16 +abi_test_clobber_rax: +_CET_ENDBR + xorq %rax,%rax + ret +.size abi_test_clobber_rax,.-abi_test_clobber_rax +.globl abi_test_clobber_rbx +.hidden abi_test_clobber_rbx +.type abi_test_clobber_rbx, @function +.align 16 +abi_test_clobber_rbx: +_CET_ENDBR + xorq %rbx,%rbx + ret +.size abi_test_clobber_rbx,.-abi_test_clobber_rbx +.globl abi_test_clobber_rcx +.hidden abi_test_clobber_rcx +.type abi_test_clobber_rcx, @function +.align 16 +abi_test_clobber_rcx: +_CET_ENDBR + xorq %rcx,%rcx + ret +.size abi_test_clobber_rcx,.-abi_test_clobber_rcx +.globl abi_test_clobber_rdx +.hidden abi_test_clobber_rdx +.type abi_test_clobber_rdx, @function +.align 16 +abi_test_clobber_rdx: +_CET_ENDBR + xorq %rdx,%rdx + ret +.size abi_test_clobber_rdx,.-abi_test_clobber_rdx +.globl abi_test_clobber_rdi +.hidden abi_test_clobber_rdi +.type abi_test_clobber_rdi, @function +.align 16 +abi_test_clobber_rdi: +_CET_ENDBR + xorq %rdi,%rdi + ret +.size abi_test_clobber_rdi,.-abi_test_clobber_rdi +.globl abi_test_clobber_rsi +.hidden abi_test_clobber_rsi +.type abi_test_clobber_rsi, @function +.align 16 +abi_test_clobber_rsi: +_CET_ENDBR + xorq %rsi,%rsi + ret +.size abi_test_clobber_rsi,.-abi_test_clobber_rsi +.globl abi_test_clobber_rbp +.hidden abi_test_clobber_rbp +.type abi_test_clobber_rbp, @function +.align 16 +abi_test_clobber_rbp: +_CET_ENDBR + xorq %rbp,%rbp + ret +.size 
abi_test_clobber_rbp,.-abi_test_clobber_rbp +.globl abi_test_clobber_r8 +.hidden abi_test_clobber_r8 +.type abi_test_clobber_r8, @function +.align 16 +abi_test_clobber_r8: +_CET_ENDBR + xorq %r8,%r8 + ret +.size abi_test_clobber_r8,.-abi_test_clobber_r8 +.globl abi_test_clobber_r9 +.hidden abi_test_clobber_r9 +.type abi_test_clobber_r9, @function +.align 16 +abi_test_clobber_r9: +_CET_ENDBR + xorq %r9,%r9 + ret +.size abi_test_clobber_r9,.-abi_test_clobber_r9 +.globl abi_test_clobber_r10 +.hidden abi_test_clobber_r10 +.type abi_test_clobber_r10, @function +.align 16 +abi_test_clobber_r10: +_CET_ENDBR + xorq %r10,%r10 + ret +.size abi_test_clobber_r10,.-abi_test_clobber_r10 +.globl abi_test_clobber_r11 +.hidden abi_test_clobber_r11 +.type abi_test_clobber_r11, @function +.align 16 +abi_test_clobber_r11: +_CET_ENDBR + xorq %r11,%r11 + ret +.size abi_test_clobber_r11,.-abi_test_clobber_r11 +.globl abi_test_clobber_r12 +.hidden abi_test_clobber_r12 +.type abi_test_clobber_r12, @function +.align 16 +abi_test_clobber_r12: +_CET_ENDBR + xorq %r12,%r12 + ret +.size abi_test_clobber_r12,.-abi_test_clobber_r12 +.globl abi_test_clobber_r13 +.hidden abi_test_clobber_r13 +.type abi_test_clobber_r13, @function +.align 16 +abi_test_clobber_r13: +_CET_ENDBR + xorq %r13,%r13 + ret +.size abi_test_clobber_r13,.-abi_test_clobber_r13 +.globl abi_test_clobber_r14 +.hidden abi_test_clobber_r14 +.type abi_test_clobber_r14, @function +.align 16 +abi_test_clobber_r14: +_CET_ENDBR + xorq %r14,%r14 + ret +.size abi_test_clobber_r14,.-abi_test_clobber_r14 +.globl abi_test_clobber_r15 +.hidden abi_test_clobber_r15 +.type abi_test_clobber_r15, @function +.align 16 +abi_test_clobber_r15: +_CET_ENDBR + xorq %r15,%r15 + ret +.size abi_test_clobber_r15,.-abi_test_clobber_r15 +.globl abi_test_clobber_xmm0 +.hidden abi_test_clobber_xmm0 +.type abi_test_clobber_xmm0, @function +.align 16 +abi_test_clobber_xmm0: +_CET_ENDBR + pxor %xmm0,%xmm0 + ret +.size abi_test_clobber_xmm0,.-abi_test_clobber_xmm0 
+.globl abi_test_clobber_xmm1 +.hidden abi_test_clobber_xmm1 +.type abi_test_clobber_xmm1, @function +.align 16 +abi_test_clobber_xmm1: +_CET_ENDBR + pxor %xmm1,%xmm1 + ret +.size abi_test_clobber_xmm1,.-abi_test_clobber_xmm1 +.globl abi_test_clobber_xmm2 +.hidden abi_test_clobber_xmm2 +.type abi_test_clobber_xmm2, @function +.align 16 +abi_test_clobber_xmm2: +_CET_ENDBR + pxor %xmm2,%xmm2 + ret +.size abi_test_clobber_xmm2,.-abi_test_clobber_xmm2 +.globl abi_test_clobber_xmm3 +.hidden abi_test_clobber_xmm3 +.type abi_test_clobber_xmm3, @function +.align 16 +abi_test_clobber_xmm3: +_CET_ENDBR + pxor %xmm3,%xmm3 + ret +.size abi_test_clobber_xmm3,.-abi_test_clobber_xmm3 +.globl abi_test_clobber_xmm4 +.hidden abi_test_clobber_xmm4 +.type abi_test_clobber_xmm4, @function +.align 16 +abi_test_clobber_xmm4: +_CET_ENDBR + pxor %xmm4,%xmm4 + ret +.size abi_test_clobber_xmm4,.-abi_test_clobber_xmm4 +.globl abi_test_clobber_xmm5 +.hidden abi_test_clobber_xmm5 +.type abi_test_clobber_xmm5, @function +.align 16 +abi_test_clobber_xmm5: +_CET_ENDBR + pxor %xmm5,%xmm5 + ret +.size abi_test_clobber_xmm5,.-abi_test_clobber_xmm5 +.globl abi_test_clobber_xmm6 +.hidden abi_test_clobber_xmm6 +.type abi_test_clobber_xmm6, @function +.align 16 +abi_test_clobber_xmm6: +_CET_ENDBR + pxor %xmm6,%xmm6 + ret +.size abi_test_clobber_xmm6,.-abi_test_clobber_xmm6 +.globl abi_test_clobber_xmm7 +.hidden abi_test_clobber_xmm7 +.type abi_test_clobber_xmm7, @function +.align 16 +abi_test_clobber_xmm7: +_CET_ENDBR + pxor %xmm7,%xmm7 + ret +.size abi_test_clobber_xmm7,.-abi_test_clobber_xmm7 +.globl abi_test_clobber_xmm8 +.hidden abi_test_clobber_xmm8 +.type abi_test_clobber_xmm8, @function +.align 16 +abi_test_clobber_xmm8: +_CET_ENDBR + pxor %xmm8,%xmm8 + ret +.size abi_test_clobber_xmm8,.-abi_test_clobber_xmm8 +.globl abi_test_clobber_xmm9 +.hidden abi_test_clobber_xmm9 +.type abi_test_clobber_xmm9, @function +.align 16 +abi_test_clobber_xmm9: +_CET_ENDBR + pxor %xmm9,%xmm9 + ret +.size 
abi_test_clobber_xmm9,.-abi_test_clobber_xmm9 +.globl abi_test_clobber_xmm10 +.hidden abi_test_clobber_xmm10 +.type abi_test_clobber_xmm10, @function +.align 16 +abi_test_clobber_xmm10: +_CET_ENDBR + pxor %xmm10,%xmm10 + ret +.size abi_test_clobber_xmm10,.-abi_test_clobber_xmm10 +.globl abi_test_clobber_xmm11 +.hidden abi_test_clobber_xmm11 +.type abi_test_clobber_xmm11, @function +.align 16 +abi_test_clobber_xmm11: +_CET_ENDBR + pxor %xmm11,%xmm11 + ret +.size abi_test_clobber_xmm11,.-abi_test_clobber_xmm11 +.globl abi_test_clobber_xmm12 +.hidden abi_test_clobber_xmm12 +.type abi_test_clobber_xmm12, @function +.align 16 +abi_test_clobber_xmm12: +_CET_ENDBR + pxor %xmm12,%xmm12 + ret +.size abi_test_clobber_xmm12,.-abi_test_clobber_xmm12 +.globl abi_test_clobber_xmm13 +.hidden abi_test_clobber_xmm13 +.type abi_test_clobber_xmm13, @function +.align 16 +abi_test_clobber_xmm13: +_CET_ENDBR + pxor %xmm13,%xmm13 + ret +.size abi_test_clobber_xmm13,.-abi_test_clobber_xmm13 +.globl abi_test_clobber_xmm14 +.hidden abi_test_clobber_xmm14 +.type abi_test_clobber_xmm14, @function +.align 16 +abi_test_clobber_xmm14: +_CET_ENDBR + pxor %xmm14,%xmm14 + ret +.size abi_test_clobber_xmm14,.-abi_test_clobber_xmm14 +.globl abi_test_clobber_xmm15 +.hidden abi_test_clobber_xmm15 +.type abi_test_clobber_xmm15, @function +.align 16 +abi_test_clobber_xmm15: +_CET_ENDBR + pxor %xmm15,%xmm15 + ret +.size abi_test_clobber_xmm15,.-abi_test_clobber_xmm15 + + + +.globl abi_test_bad_unwind_wrong_register +.hidden abi_test_bad_unwind_wrong_register +.type abi_test_bad_unwind_wrong_register, @function +.align 16 +abi_test_bad_unwind_wrong_register: +.cfi_startproc + +_CET_ENDBR + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r13,-16 + + + + + + nop + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + ret + +.cfi_endproc +.size abi_test_bad_unwind_wrong_register,.-abi_test_bad_unwind_wrong_register + + + + +.globl abi_test_bad_unwind_temporary +.hidden abi_test_bad_unwind_temporary 
+.type abi_test_bad_unwind_temporary, @function +.align 16 +abi_test_bad_unwind_temporary: +.cfi_startproc + +_CET_ENDBR + pushq %r12 +.cfi_adjust_cfa_offset 8 +.cfi_offset %r12,-16 + + + + movq %r12,%rax + incq %rax + movq %rax,(%rsp) + + + + movq %r12,(%rsp) + + + popq %r12 +.cfi_adjust_cfa_offset -8 +.cfi_restore %r12 + ret +.cfi_endproc + +.size abi_test_bad_unwind_temporary,.-abi_test_bad_unwind_temporary + + + + +.globl abi_test_get_and_clear_direction_flag +.hidden abi_test_get_and_clear_direction_flag +.type abi_test_get_and_clear_direction_flag, @function +abi_test_get_and_clear_direction_flag: +_CET_ENDBR + pushfq + popq %rax + andq $0x400,%rax + shrq $10,%rax + cld + ret +.size abi_test_get_and_clear_direction_flag,.-abi_test_get_and_clear_direction_flag + + + +.globl abi_test_set_direction_flag +.hidden abi_test_set_direction_flag +.type abi_test_set_direction_flag, @function +abi_test_set_direction_flag: +_CET_ENDBR + std + ret +.size abi_test_set_direction_flag,.-abi_test_set_direction_flag +#endif diff --git a/third_party/boringssl/gen/test_support/trampoline-x86_64-win.asm b/third_party/boringssl/gen/test_support/trampoline-x86_64-win.asm new file mode 100644 index 00000000..ca2fb87b --- /dev/null +++ b/third_party/boringssl/gen/test_support/trampoline-x86_64-win.asm @@ -0,0 +1,723 @@ +; This file is generated from a similarly-named Perl script in the BoringSSL +; source tree. Do not edit by hand. 
+ +%ifidn __OUTPUT_FORMAT__, win64 +default rel +%define XMMWORD +%define YMMWORD +%define ZMMWORD +%define _CET_ENDBR + +%ifdef BORINGSSL_PREFIX +%include "boringssl_prefix_symbols_internal_x86_64_win_asm.inc" +%endif +section .text code align=64 + + + + + + + + + +global abi_test_trampoline + +ALIGN 16 +abi_test_trampoline: + +$L$SEH_begin_abi_test_trampoline_1: +_CET_ENDBR + + + + + + + + + + sub rsp,344 + +$L$SEH_prologue_abi_test_trampoline_2: + mov QWORD[112+rsp],rbx + +$L$SEH_prologue_abi_test_trampoline_3: + mov QWORD[120+rsp],rbp + +$L$SEH_prologue_abi_test_trampoline_4: + mov QWORD[128+rsp],rdi + +$L$SEH_prologue_abi_test_trampoline_5: + mov QWORD[136+rsp],rsi + +$L$SEH_prologue_abi_test_trampoline_6: + mov QWORD[144+rsp],r12 + +$L$SEH_prologue_abi_test_trampoline_7: + mov QWORD[152+rsp],r13 + +$L$SEH_prologue_abi_test_trampoline_8: + mov QWORD[160+rsp],r14 + +$L$SEH_prologue_abi_test_trampoline_9: + mov QWORD[168+rsp],r15 + +$L$SEH_prologue_abi_test_trampoline_10: + movdqa XMMWORD[176+rsp],xmm6 + +$L$SEH_prologue_abi_test_trampoline_11: + movdqa XMMWORD[192+rsp],xmm7 + +$L$SEH_prologue_abi_test_trampoline_12: + movdqa XMMWORD[208+rsp],xmm8 + +$L$SEH_prologue_abi_test_trampoline_13: + movdqa XMMWORD[224+rsp],xmm9 + +$L$SEH_prologue_abi_test_trampoline_14: + movdqa XMMWORD[240+rsp],xmm10 + +$L$SEH_prologue_abi_test_trampoline_15: + movdqa XMMWORD[256+rsp],xmm11 + +$L$SEH_prologue_abi_test_trampoline_16: + movdqa XMMWORD[272+rsp],xmm12 + +$L$SEH_prologue_abi_test_trampoline_17: + movdqa XMMWORD[288+rsp],xmm13 + +$L$SEH_prologue_abi_test_trampoline_18: + movdqa XMMWORD[304+rsp],xmm14 + +$L$SEH_prologue_abi_test_trampoline_19: + movdqa XMMWORD[320+rsp],xmm15 + +$L$SEH_prologue_abi_test_trampoline_20: +$L$SEH_endprologue_abi_test_trampoline_21: + mov rbx,QWORD[rdx] + mov rbp,QWORD[8+rdx] + mov rdi,QWORD[16+rdx] + mov rsi,QWORD[24+rdx] + mov r12,QWORD[32+rdx] + mov r13,QWORD[40+rdx] + mov r14,QWORD[48+rdx] + mov r15,QWORD[56+rdx] + movdqa xmm6,XMMWORD[64+rdx] + 
movdqa xmm7,XMMWORD[80+rdx] + movdqa xmm8,XMMWORD[96+rdx] + movdqa xmm9,XMMWORD[112+rdx] + movdqa xmm10,XMMWORD[128+rdx] + movdqa xmm11,XMMWORD[144+rdx] + movdqa xmm12,XMMWORD[160+rdx] + movdqa xmm13,XMMWORD[176+rdx] + movdqa xmm14,XMMWORD[192+rdx] + movdqa xmm15,XMMWORD[208+rdx] + + mov QWORD[88+rsp],rcx + mov QWORD[96+rsp],rdx + + + + + mov r10,r8 + mov r11,r9 + dec r11 + js NEAR $L$args_done + mov rcx,QWORD[r10] + add r10,8 + dec r11 + js NEAR $L$args_done + mov rdx,QWORD[r10] + add r10,8 + dec r11 + js NEAR $L$args_done + mov r8,QWORD[r10] + add r10,8 + dec r11 + js NEAR $L$args_done + mov r9,QWORD[r10] + add r10,8 + lea rax,[32+rsp] +$L$args_loop: + dec r11 + js NEAR $L$args_done + + + + + + + mov QWORD[104+rsp],r11 + mov r11,QWORD[r10] + mov QWORD[rax],r11 + mov r11,QWORD[104+rsp] + + add r10,8 + add rax,8 + jmp NEAR $L$args_loop + +$L$args_done: + mov rax,QWORD[88+rsp] + mov r10,QWORD[384+rsp] + test r10,r10 + jz NEAR $L$no_unwind + + + pushfq + or QWORD[rsp],0x100 + popfq + + + + nop +global abi_test_unwind_start +abi_test_unwind_start: + + call rax +global abi_test_unwind_return +abi_test_unwind_return: + + + + + pushfq + and QWORD[rsp],-0x101 + popfq +global abi_test_unwind_stop +abi_test_unwind_stop: + + jmp NEAR $L$call_done + +$L$no_unwind: + call rax + +$L$call_done: + + mov rdx,QWORD[96+rsp] + mov QWORD[rdx],rbx + mov QWORD[8+rdx],rbp + mov QWORD[16+rdx],rdi + mov QWORD[24+rdx],rsi + mov QWORD[32+rdx],r12 + mov QWORD[40+rdx],r13 + mov QWORD[48+rdx],r14 + mov QWORD[56+rdx],r15 + movdqa XMMWORD[64+rdx],xmm6 + movdqa XMMWORD[80+rdx],xmm7 + movdqa XMMWORD[96+rdx],xmm8 + movdqa XMMWORD[112+rdx],xmm9 + movdqa XMMWORD[128+rdx],xmm10 + movdqa XMMWORD[144+rdx],xmm11 + movdqa XMMWORD[160+rdx],xmm12 + movdqa XMMWORD[176+rdx],xmm13 + movdqa XMMWORD[192+rdx],xmm14 + movdqa XMMWORD[208+rdx],xmm15 + mov rbx,QWORD[112+rsp] + + mov rbp,QWORD[120+rsp] + + mov rdi,QWORD[128+rsp] + + mov rsi,QWORD[136+rsp] + + mov r12,QWORD[144+rsp] + + mov r13,QWORD[152+rsp] + + mov 
r14,QWORD[160+rsp] + + mov r15,QWORD[168+rsp] + + movdqa xmm6,XMMWORD[176+rsp] + + movdqa xmm7,XMMWORD[192+rsp] + + movdqa xmm8,XMMWORD[208+rsp] + + movdqa xmm9,XMMWORD[224+rsp] + + movdqa xmm10,XMMWORD[240+rsp] + + movdqa xmm11,XMMWORD[256+rsp] + + movdqa xmm12,XMMWORD[272+rsp] + + movdqa xmm13,XMMWORD[288+rsp] + + movdqa xmm14,XMMWORD[304+rsp] + + movdqa xmm15,XMMWORD[320+rsp] + + add rsp,344 + + + + ret + +$L$SEH_end_abi_test_trampoline_22: + +global abi_test_clobber_rax + +ALIGN 16 +abi_test_clobber_rax: +_CET_ENDBR + xor rax,rax + ret + +global abi_test_clobber_rbx + +ALIGN 16 +abi_test_clobber_rbx: +_CET_ENDBR + xor rbx,rbx + ret + +global abi_test_clobber_rcx + +ALIGN 16 +abi_test_clobber_rcx: +_CET_ENDBR + xor rcx,rcx + ret + +global abi_test_clobber_rdx + +ALIGN 16 +abi_test_clobber_rdx: +_CET_ENDBR + xor rdx,rdx + ret + +global abi_test_clobber_rdi + +ALIGN 16 +abi_test_clobber_rdi: +_CET_ENDBR + xor rdi,rdi + ret + +global abi_test_clobber_rsi + +ALIGN 16 +abi_test_clobber_rsi: +_CET_ENDBR + xor rsi,rsi + ret + +global abi_test_clobber_rbp + +ALIGN 16 +abi_test_clobber_rbp: +_CET_ENDBR + xor rbp,rbp + ret + +global abi_test_clobber_r8 + +ALIGN 16 +abi_test_clobber_r8: +_CET_ENDBR + xor r8,r8 + ret + +global abi_test_clobber_r9 + +ALIGN 16 +abi_test_clobber_r9: +_CET_ENDBR + xor r9,r9 + ret + +global abi_test_clobber_r10 + +ALIGN 16 +abi_test_clobber_r10: +_CET_ENDBR + xor r10,r10 + ret + +global abi_test_clobber_r11 + +ALIGN 16 +abi_test_clobber_r11: +_CET_ENDBR + xor r11,r11 + ret + +global abi_test_clobber_r12 + +ALIGN 16 +abi_test_clobber_r12: +_CET_ENDBR + xor r12,r12 + ret + +global abi_test_clobber_r13 + +ALIGN 16 +abi_test_clobber_r13: +_CET_ENDBR + xor r13,r13 + ret + +global abi_test_clobber_r14 + +ALIGN 16 +abi_test_clobber_r14: +_CET_ENDBR + xor r14,r14 + ret + +global abi_test_clobber_r15 + +ALIGN 16 +abi_test_clobber_r15: +_CET_ENDBR + xor r15,r15 + ret + +global abi_test_clobber_xmm0 + +ALIGN 16 +abi_test_clobber_xmm0: +_CET_ENDBR + pxor 
xmm0,xmm0 + ret + +global abi_test_clobber_xmm1 + +ALIGN 16 +abi_test_clobber_xmm1: +_CET_ENDBR + pxor xmm1,xmm1 + ret + +global abi_test_clobber_xmm2 + +ALIGN 16 +abi_test_clobber_xmm2: +_CET_ENDBR + pxor xmm2,xmm2 + ret + +global abi_test_clobber_xmm3 + +ALIGN 16 +abi_test_clobber_xmm3: +_CET_ENDBR + pxor xmm3,xmm3 + ret + +global abi_test_clobber_xmm4 + +ALIGN 16 +abi_test_clobber_xmm4: +_CET_ENDBR + pxor xmm4,xmm4 + ret + +global abi_test_clobber_xmm5 + +ALIGN 16 +abi_test_clobber_xmm5: +_CET_ENDBR + pxor xmm5,xmm5 + ret + +global abi_test_clobber_xmm6 + +ALIGN 16 +abi_test_clobber_xmm6: +_CET_ENDBR + pxor xmm6,xmm6 + ret + +global abi_test_clobber_xmm7 + +ALIGN 16 +abi_test_clobber_xmm7: +_CET_ENDBR + pxor xmm7,xmm7 + ret + +global abi_test_clobber_xmm8 + +ALIGN 16 +abi_test_clobber_xmm8: +_CET_ENDBR + pxor xmm8,xmm8 + ret + +global abi_test_clobber_xmm9 + +ALIGN 16 +abi_test_clobber_xmm9: +_CET_ENDBR + pxor xmm9,xmm9 + ret + +global abi_test_clobber_xmm10 + +ALIGN 16 +abi_test_clobber_xmm10: +_CET_ENDBR + pxor xmm10,xmm10 + ret + +global abi_test_clobber_xmm11 + +ALIGN 16 +abi_test_clobber_xmm11: +_CET_ENDBR + pxor xmm11,xmm11 + ret + +global abi_test_clobber_xmm12 + +ALIGN 16 +abi_test_clobber_xmm12: +_CET_ENDBR + pxor xmm12,xmm12 + ret + +global abi_test_clobber_xmm13 + +ALIGN 16 +abi_test_clobber_xmm13: +_CET_ENDBR + pxor xmm13,xmm13 + ret + +global abi_test_clobber_xmm14 + +ALIGN 16 +abi_test_clobber_xmm14: +_CET_ENDBR + pxor xmm14,xmm14 + ret + +global abi_test_clobber_xmm15 + +ALIGN 16 +abi_test_clobber_xmm15: +_CET_ENDBR + pxor xmm15,xmm15 + ret + + + + +global abi_test_bad_unwind_wrong_register + +ALIGN 16 +abi_test_bad_unwind_wrong_register: + +$L$SEH_begin_abi_test_bad_unwind_wrong_register_1: +_CET_ENDBR + push r12 + +$L$SEH_prologue_abi_test_bad_unwind_wrong_register_2: +$L$SEH_endprologue_abi_test_bad_unwind_wrong_register_3: + + + + nop + pop r12 + + ret +$L$SEH_end_abi_test_bad_unwind_wrong_register_4: + + + + + + +global 
abi_test_bad_unwind_temporary + +ALIGN 16 +abi_test_bad_unwind_temporary: + +$L$SEH_begin_abi_test_bad_unwind_temporary_1: +_CET_ENDBR + push r12 + +$L$SEH_prologue_abi_test_bad_unwind_temporary_2: +$L$SEH_endprologue_abi_test_bad_unwind_temporary_3: + + mov rax,r12 + inc rax + mov QWORD[rsp],rax + + + + mov QWORD[rsp],r12 + + + pop r12 + + ret + +$L$SEH_end_abi_test_bad_unwind_temporary_4: + + + + + +global abi_test_get_and_clear_direction_flag + +abi_test_get_and_clear_direction_flag: +_CET_ENDBR + pushfq + pop rax + and rax,0x400 + shr rax,10 + cld + ret + + + + +global abi_test_set_direction_flag + +abi_test_set_direction_flag: +_CET_ENDBR + std + ret + + + + + +global abi_test_bad_unwind_epilog + +ALIGN 16 +abi_test_bad_unwind_epilog: +$L$SEH_begin_abi_test_bad_unwind_epilog_1: + push r12 +$L$SEH_prologue_abi_test_bad_unwind_epilog_2: +$L$SEH_endprologue_abi_test_bad_unwind_epilog_3: + + nop + + + pop r12 + nop + ret +$L$SEH_end_abi_test_bad_unwind_epilog_4: + +section .pdata rdata align=4 +ALIGN 4 + DD $L$SEH_begin_abi_test_trampoline_1 wrt ..imagebase + DD $L$SEH_end_abi_test_trampoline_22 wrt ..imagebase + DD $L$SEH_info_abi_test_trampoline_0 wrt ..imagebase + + DD $L$SEH_begin_abi_test_bad_unwind_wrong_register_1 wrt ..imagebase + DD $L$SEH_end_abi_test_bad_unwind_wrong_register_4 wrt ..imagebase + DD $L$SEH_info_abi_test_bad_unwind_wrong_register_0 wrt ..imagebase + + DD $L$SEH_begin_abi_test_bad_unwind_temporary_1 wrt ..imagebase + DD $L$SEH_end_abi_test_bad_unwind_temporary_4 wrt ..imagebase + DD $L$SEH_info_abi_test_bad_unwind_temporary_0 wrt ..imagebase + + DD $L$SEH_begin_abi_test_bad_unwind_epilog_1 wrt ..imagebase + DD $L$SEH_end_abi_test_bad_unwind_epilog_4 wrt ..imagebase + DD $L$SEH_info_abi_test_bad_unwind_epilog_0 wrt ..imagebase + + +section .xdata rdata align=8 +ALIGN 4 +$L$SEH_info_abi_test_trampoline_0: + DB 1 + DB $L$SEH_endprologue_abi_test_trampoline_21-$L$SEH_begin_abi_test_trampoline_1 + DB 38 + DB 0 + DB 
$L$SEH_prologue_abi_test_trampoline_20-$L$SEH_begin_abi_test_trampoline_1 + DB 248 + DW 20 + DB $L$SEH_prologue_abi_test_trampoline_19-$L$SEH_begin_abi_test_trampoline_1 + DB 232 + DW 19 + DB $L$SEH_prologue_abi_test_trampoline_18-$L$SEH_begin_abi_test_trampoline_1 + DB 216 + DW 18 + DB $L$SEH_prologue_abi_test_trampoline_17-$L$SEH_begin_abi_test_trampoline_1 + DB 200 + DW 17 + DB $L$SEH_prologue_abi_test_trampoline_16-$L$SEH_begin_abi_test_trampoline_1 + DB 184 + DW 16 + DB $L$SEH_prologue_abi_test_trampoline_15-$L$SEH_begin_abi_test_trampoline_1 + DB 168 + DW 15 + DB $L$SEH_prologue_abi_test_trampoline_14-$L$SEH_begin_abi_test_trampoline_1 + DB 152 + DW 14 + DB $L$SEH_prologue_abi_test_trampoline_13-$L$SEH_begin_abi_test_trampoline_1 + DB 136 + DW 13 + DB $L$SEH_prologue_abi_test_trampoline_12-$L$SEH_begin_abi_test_trampoline_1 + DB 120 + DW 12 + DB $L$SEH_prologue_abi_test_trampoline_11-$L$SEH_begin_abi_test_trampoline_1 + DB 104 + DW 11 + DB $L$SEH_prologue_abi_test_trampoline_10-$L$SEH_begin_abi_test_trampoline_1 + DB 244 + DW 21 + DB $L$SEH_prologue_abi_test_trampoline_9-$L$SEH_begin_abi_test_trampoline_1 + DB 228 + DW 20 + DB $L$SEH_prologue_abi_test_trampoline_8-$L$SEH_begin_abi_test_trampoline_1 + DB 212 + DW 19 + DB $L$SEH_prologue_abi_test_trampoline_7-$L$SEH_begin_abi_test_trampoline_1 + DB 196 + DW 18 + DB $L$SEH_prologue_abi_test_trampoline_6-$L$SEH_begin_abi_test_trampoline_1 + DB 100 + DW 17 + DB $L$SEH_prologue_abi_test_trampoline_5-$L$SEH_begin_abi_test_trampoline_1 + DB 116 + DW 16 + DB $L$SEH_prologue_abi_test_trampoline_4-$L$SEH_begin_abi_test_trampoline_1 + DB 84 + DW 15 + DB $L$SEH_prologue_abi_test_trampoline_3-$L$SEH_begin_abi_test_trampoline_1 + DB 52 + DW 14 + DB $L$SEH_prologue_abi_test_trampoline_2-$L$SEH_begin_abi_test_trampoline_1 + DB 1 + DW 43 + +$L$SEH_info_abi_test_bad_unwind_wrong_register_0: + DB 1 + DB $L$SEH_endprologue_abi_test_bad_unwind_wrong_register_3-$L$SEH_begin_abi_test_bad_unwind_wrong_register_1 + DB 1 + DB 0 + DB 
$L$SEH_prologue_abi_test_bad_unwind_wrong_register_2-$L$SEH_begin_abi_test_bad_unwind_wrong_register_1 + DB 208 + + DW 0 +$L$SEH_info_abi_test_bad_unwind_temporary_0: + DB 1 + DB $L$SEH_endprologue_abi_test_bad_unwind_temporary_3-$L$SEH_begin_abi_test_bad_unwind_temporary_1 + DB 1 + DB 0 + DB $L$SEH_prologue_abi_test_bad_unwind_temporary_2-$L$SEH_begin_abi_test_bad_unwind_temporary_1 + DB 192 + + DW 0 +$L$SEH_info_abi_test_bad_unwind_epilog_0: + DB 1 + DB $L$SEH_endprologue_abi_test_bad_unwind_epilog_3-$L$SEH_begin_abi_test_bad_unwind_epilog_1 + DB 1 + DB 0 + DB $L$SEH_prologue_abi_test_bad_unwind_epilog_2-$L$SEH_begin_abi_test_bad_unwind_epilog_1 + DB 192 + + DW 0 +%else +; Work around https://bugzilla.nasm.us/show_bug.cgi?id=3392738 +ret +%endif diff --git a/third_party/boringssl/linux-aarch64/crypto/chacha/chacha-armv8.S b/third_party/boringssl/linux-aarch64/crypto/chacha/chacha-armv8.S deleted file mode 100644 index 43921111..00000000 --- a/third_party/boringssl/linux-aarch64/crypto/chacha/chacha-armv8.S +++ /dev/null @@ -1,1995 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(__aarch64__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -#include - - -.hidden OPENSSL_armcap_P - -.section .rodata - -.align 5 -.Lsigma: -.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral -.Lone: -.long 1,0,0,0 -.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 - -.text - -.globl ChaCha20_ctr32 -.hidden ChaCha20_ctr32 -.type ChaCha20_ctr32,%function -.align 5 -ChaCha20_ctr32: - AARCH64_VALID_CALL_TARGET - cbz x2,.Labort -#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10 - adrp x5,:pg_hi21_nc:OPENSSL_armcap_P -#else - adrp x5,OPENSSL_armcap_P -#endif - cmp x2,#192 - b.lo .Lshort - ldr w17,[x5,:lo12:OPENSSL_armcap_P] - tst w17,#ARMV7_NEON - b.ne ChaCha20_neon - -.Lshort: - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-96]! 
- add x29,sp,#0 - - adrp x5,.Lsigma - add x5,x5,:lo12:.Lsigma - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - sub sp,sp,#64 - - ldp x22,x23,[x5] // load sigma - ldp x24,x25,[x3] // load key - ldp x26,x27,[x3,#16] - ldp x28,x30,[x4] // load counter -#ifdef __AARCH64EB__ - ror x24,x24,#32 - ror x25,x25,#32 - ror x26,x26,#32 - ror x27,x27,#32 - ror x28,x28,#32 - ror x30,x30,#32 -#endif - -.Loop_outer: - mov w5,w22 // unpack key block - lsr x6,x22,#32 - mov w7,w23 - lsr x8,x23,#32 - mov w9,w24 - lsr x10,x24,#32 - mov w11,w25 - lsr x12,x25,#32 - mov w13,w26 - lsr x14,x26,#32 - mov w15,w27 - lsr x16,x27,#32 - mov w17,w28 - lsr x19,x28,#32 - mov w20,w30 - lsr x21,x30,#32 - - mov x4,#10 - subs x2,x2,#64 -.Loop: - sub x4,x4,#1 - add w5,w5,w9 - add w6,w6,w10 - add w7,w7,w11 - add w8,w8,w12 - eor w17,w17,w5 - eor w19,w19,w6 - eor w20,w20,w7 - eor w21,w21,w8 - ror w17,w17,#16 - ror w19,w19,#16 - ror w20,w20,#16 - ror w21,w21,#16 - add w13,w13,w17 - add w14,w14,w19 - add w15,w15,w20 - add w16,w16,w21 - eor w9,w9,w13 - eor w10,w10,w14 - eor w11,w11,w15 - eor w12,w12,w16 - ror w9,w9,#20 - ror w10,w10,#20 - ror w11,w11,#20 - ror w12,w12,#20 - add w5,w5,w9 - add w6,w6,w10 - add w7,w7,w11 - add w8,w8,w12 - eor w17,w17,w5 - eor w19,w19,w6 - eor w20,w20,w7 - eor w21,w21,w8 - ror w17,w17,#24 - ror w19,w19,#24 - ror w20,w20,#24 - ror w21,w21,#24 - add w13,w13,w17 - add w14,w14,w19 - add w15,w15,w20 - add w16,w16,w21 - eor w9,w9,w13 - eor w10,w10,w14 - eor w11,w11,w15 - eor w12,w12,w16 - ror w9,w9,#25 - ror w10,w10,#25 - ror w11,w11,#25 - ror w12,w12,#25 - add w5,w5,w10 - add w6,w6,w11 - add w7,w7,w12 - add w8,w8,w9 - eor w21,w21,w5 - eor w17,w17,w6 - eor w19,w19,w7 - eor w20,w20,w8 - ror w21,w21,#16 - ror w17,w17,#16 - ror w19,w19,#16 - ror w20,w20,#16 - add w15,w15,w21 - add w16,w16,w17 - add w13,w13,w19 - add w14,w14,w20 - eor w10,w10,w15 - eor w11,w11,w16 - eor w12,w12,w13 - eor w9,w9,w14 - ror w10,w10,#20 - ror 
w11,w11,#20 - ror w12,w12,#20 - ror w9,w9,#20 - add w5,w5,w10 - add w6,w6,w11 - add w7,w7,w12 - add w8,w8,w9 - eor w21,w21,w5 - eor w17,w17,w6 - eor w19,w19,w7 - eor w20,w20,w8 - ror w21,w21,#24 - ror w17,w17,#24 - ror w19,w19,#24 - ror w20,w20,#24 - add w15,w15,w21 - add w16,w16,w17 - add w13,w13,w19 - add w14,w14,w20 - eor w10,w10,w15 - eor w11,w11,w16 - eor w12,w12,w13 - eor w9,w9,w14 - ror w10,w10,#25 - ror w11,w11,#25 - ror w12,w12,#25 - ror w9,w9,#25 - cbnz x4,.Loop - - add w5,w5,w22 // accumulate key block - add x6,x6,x22,lsr#32 - add w7,w7,w23 - add x8,x8,x23,lsr#32 - add w9,w9,w24 - add x10,x10,x24,lsr#32 - add w11,w11,w25 - add x12,x12,x25,lsr#32 - add w13,w13,w26 - add x14,x14,x26,lsr#32 - add w15,w15,w27 - add x16,x16,x27,lsr#32 - add w17,w17,w28 - add x19,x19,x28,lsr#32 - add w20,w20,w30 - add x21,x21,x30,lsr#32 - - b.lo .Ltail - - add x5,x5,x6,lsl#32 // pack - add x7,x7,x8,lsl#32 - ldp x6,x8,[x1,#0] // load input - add x9,x9,x10,lsl#32 - add x11,x11,x12,lsl#32 - ldp x10,x12,[x1,#16] - add x13,x13,x14,lsl#32 - add x15,x15,x16,lsl#32 - ldp x14,x16,[x1,#32] - add x17,x17,x19,lsl#32 - add x20,x20,x21,lsl#32 - ldp x19,x21,[x1,#48] - add x1,x1,#64 -#ifdef __AARCH64EB__ - rev x5,x5 - rev x7,x7 - rev x9,x9 - rev x11,x11 - rev x13,x13 - rev x15,x15 - rev x17,x17 - rev x20,x20 -#endif - eor x5,x5,x6 - eor x7,x7,x8 - eor x9,x9,x10 - eor x11,x11,x12 - eor x13,x13,x14 - eor x15,x15,x16 - eor x17,x17,x19 - eor x20,x20,x21 - - stp x5,x7,[x0,#0] // store output - add x28,x28,#1 // increment counter - stp x9,x11,[x0,#16] - stp x13,x15,[x0,#32] - stp x17,x20,[x0,#48] - add x0,x0,#64 - - b.hi .Loop_outer - - ldp x19,x20,[x29,#16] - add sp,sp,#64 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 - AARCH64_VALIDATE_LINK_REGISTER -.Labort: - ret - -.align 4 -.Ltail: - add x2,x2,#64 -.Less_than_64: - sub x0,x0,#1 - add x1,x1,x2 - add x0,x0,x2 - add x4,sp,x2 - neg x2,x2 - - add x5,x5,x6,lsl#32 // pack - add 
x7,x7,x8,lsl#32 - add x9,x9,x10,lsl#32 - add x11,x11,x12,lsl#32 - add x13,x13,x14,lsl#32 - add x15,x15,x16,lsl#32 - add x17,x17,x19,lsl#32 - add x20,x20,x21,lsl#32 -#ifdef __AARCH64EB__ - rev x5,x5 - rev x7,x7 - rev x9,x9 - rev x11,x11 - rev x13,x13 - rev x15,x15 - rev x17,x17 - rev x20,x20 -#endif - stp x5,x7,[sp,#0] - stp x9,x11,[sp,#16] - stp x13,x15,[sp,#32] - stp x17,x20,[sp,#48] - -.Loop_tail: - ldrb w10,[x1,x2] - ldrb w11,[x4,x2] - add x2,x2,#1 - eor w10,w10,w11 - strb w10,[x0,x2] - cbnz x2,.Loop_tail - - stp xzr,xzr,[sp,#0] - stp xzr,xzr,[sp,#16] - stp xzr,xzr,[sp,#32] - stp xzr,xzr,[sp,#48] - - ldp x19,x20,[x29,#16] - add sp,sp,#64 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 - AARCH64_VALIDATE_LINK_REGISTER - ret -.size ChaCha20_ctr32,.-ChaCha20_ctr32 - -.type ChaCha20_neon,%function -.align 5 -ChaCha20_neon: - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-96]! - add x29,sp,#0 - - adrp x5,.Lsigma - add x5,x5,:lo12:.Lsigma - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - cmp x2,#512 - b.hs .L512_or_more_neon - - sub sp,sp,#64 - - ldp x22,x23,[x5] // load sigma - ld1 {v24.4s},[x5],#16 - ldp x24,x25,[x3] // load key - ldp x26,x27,[x3,#16] - ld1 {v25.4s,v26.4s},[x3] - ldp x28,x30,[x4] // load counter - ld1 {v27.4s},[x4] - ld1 {v31.4s},[x5] -#ifdef __AARCH64EB__ - rev64 v24.4s,v24.4s - ror x24,x24,#32 - ror x25,x25,#32 - ror x26,x26,#32 - ror x27,x27,#32 - ror x28,x28,#32 - ror x30,x30,#32 -#endif - add v27.4s,v27.4s,v31.4s // += 1 - add v28.4s,v27.4s,v31.4s - add v29.4s,v28.4s,v31.4s - shl v31.4s,v31.4s,#2 // 1 -> 4 - -.Loop_outer_neon: - mov w5,w22 // unpack key block - lsr x6,x22,#32 - mov v0.16b,v24.16b - mov w7,w23 - lsr x8,x23,#32 - mov v4.16b,v24.16b - mov w9,w24 - lsr x10,x24,#32 - mov v16.16b,v24.16b - mov w11,w25 - mov v1.16b,v25.16b - lsr x12,x25,#32 - mov v5.16b,v25.16b - mov w13,w26 - mov v17.16b,v25.16b - 
lsr x14,x26,#32 - mov v3.16b,v27.16b - mov w15,w27 - mov v7.16b,v28.16b - lsr x16,x27,#32 - mov v19.16b,v29.16b - mov w17,w28 - mov v2.16b,v26.16b - lsr x19,x28,#32 - mov v6.16b,v26.16b - mov w20,w30 - mov v18.16b,v26.16b - lsr x21,x30,#32 - - mov x4,#10 - subs x2,x2,#256 -.Loop_neon: - sub x4,x4,#1 - add v0.4s,v0.4s,v1.4s - add w5,w5,w9 - add v4.4s,v4.4s,v5.4s - add w6,w6,w10 - add v16.4s,v16.4s,v17.4s - add w7,w7,w11 - eor v3.16b,v3.16b,v0.16b - add w8,w8,w12 - eor v7.16b,v7.16b,v4.16b - eor w17,w17,w5 - eor v19.16b,v19.16b,v16.16b - eor w19,w19,w6 - rev32 v3.8h,v3.8h - eor w20,w20,w7 - rev32 v7.8h,v7.8h - eor w21,w21,w8 - rev32 v19.8h,v19.8h - ror w17,w17,#16 - add v2.4s,v2.4s,v3.4s - ror w19,w19,#16 - add v6.4s,v6.4s,v7.4s - ror w20,w20,#16 - add v18.4s,v18.4s,v19.4s - ror w21,w21,#16 - eor v20.16b,v1.16b,v2.16b - add w13,w13,w17 - eor v21.16b,v5.16b,v6.16b - add w14,w14,w19 - eor v22.16b,v17.16b,v18.16b - add w15,w15,w20 - ushr v1.4s,v20.4s,#20 - add w16,w16,w21 - ushr v5.4s,v21.4s,#20 - eor w9,w9,w13 - ushr v17.4s,v22.4s,#20 - eor w10,w10,w14 - sli v1.4s,v20.4s,#12 - eor w11,w11,w15 - sli v5.4s,v21.4s,#12 - eor w12,w12,w16 - sli v17.4s,v22.4s,#12 - ror w9,w9,#20 - add v0.4s,v0.4s,v1.4s - ror w10,w10,#20 - add v4.4s,v4.4s,v5.4s - ror w11,w11,#20 - add v16.4s,v16.4s,v17.4s - ror w12,w12,#20 - eor v20.16b,v3.16b,v0.16b - add w5,w5,w9 - eor v21.16b,v7.16b,v4.16b - add w6,w6,w10 - eor v22.16b,v19.16b,v16.16b - add w7,w7,w11 - ushr v3.4s,v20.4s,#24 - add w8,w8,w12 - ushr v7.4s,v21.4s,#24 - eor w17,w17,w5 - ushr v19.4s,v22.4s,#24 - eor w19,w19,w6 - sli v3.4s,v20.4s,#8 - eor w20,w20,w7 - sli v7.4s,v21.4s,#8 - eor w21,w21,w8 - sli v19.4s,v22.4s,#8 - ror w17,w17,#24 - add v2.4s,v2.4s,v3.4s - ror w19,w19,#24 - add v6.4s,v6.4s,v7.4s - ror w20,w20,#24 - add v18.4s,v18.4s,v19.4s - ror w21,w21,#24 - eor v20.16b,v1.16b,v2.16b - add w13,w13,w17 - eor v21.16b,v5.16b,v6.16b - add w14,w14,w19 - eor v22.16b,v17.16b,v18.16b - add w15,w15,w20 - ushr v1.4s,v20.4s,#25 - add 
w16,w16,w21 - ushr v5.4s,v21.4s,#25 - eor w9,w9,w13 - ushr v17.4s,v22.4s,#25 - eor w10,w10,w14 - sli v1.4s,v20.4s,#7 - eor w11,w11,w15 - sli v5.4s,v21.4s,#7 - eor w12,w12,w16 - sli v17.4s,v22.4s,#7 - ror w9,w9,#25 - ext v2.16b,v2.16b,v2.16b,#8 - ror w10,w10,#25 - ext v6.16b,v6.16b,v6.16b,#8 - ror w11,w11,#25 - ext v18.16b,v18.16b,v18.16b,#8 - ror w12,w12,#25 - ext v3.16b,v3.16b,v3.16b,#12 - ext v7.16b,v7.16b,v7.16b,#12 - ext v19.16b,v19.16b,v19.16b,#12 - ext v1.16b,v1.16b,v1.16b,#4 - ext v5.16b,v5.16b,v5.16b,#4 - ext v17.16b,v17.16b,v17.16b,#4 - add v0.4s,v0.4s,v1.4s - add w5,w5,w10 - add v4.4s,v4.4s,v5.4s - add w6,w6,w11 - add v16.4s,v16.4s,v17.4s - add w7,w7,w12 - eor v3.16b,v3.16b,v0.16b - add w8,w8,w9 - eor v7.16b,v7.16b,v4.16b - eor w21,w21,w5 - eor v19.16b,v19.16b,v16.16b - eor w17,w17,w6 - rev32 v3.8h,v3.8h - eor w19,w19,w7 - rev32 v7.8h,v7.8h - eor w20,w20,w8 - rev32 v19.8h,v19.8h - ror w21,w21,#16 - add v2.4s,v2.4s,v3.4s - ror w17,w17,#16 - add v6.4s,v6.4s,v7.4s - ror w19,w19,#16 - add v18.4s,v18.4s,v19.4s - ror w20,w20,#16 - eor v20.16b,v1.16b,v2.16b - add w15,w15,w21 - eor v21.16b,v5.16b,v6.16b - add w16,w16,w17 - eor v22.16b,v17.16b,v18.16b - add w13,w13,w19 - ushr v1.4s,v20.4s,#20 - add w14,w14,w20 - ushr v5.4s,v21.4s,#20 - eor w10,w10,w15 - ushr v17.4s,v22.4s,#20 - eor w11,w11,w16 - sli v1.4s,v20.4s,#12 - eor w12,w12,w13 - sli v5.4s,v21.4s,#12 - eor w9,w9,w14 - sli v17.4s,v22.4s,#12 - ror w10,w10,#20 - add v0.4s,v0.4s,v1.4s - ror w11,w11,#20 - add v4.4s,v4.4s,v5.4s - ror w12,w12,#20 - add v16.4s,v16.4s,v17.4s - ror w9,w9,#20 - eor v20.16b,v3.16b,v0.16b - add w5,w5,w10 - eor v21.16b,v7.16b,v4.16b - add w6,w6,w11 - eor v22.16b,v19.16b,v16.16b - add w7,w7,w12 - ushr v3.4s,v20.4s,#24 - add w8,w8,w9 - ushr v7.4s,v21.4s,#24 - eor w21,w21,w5 - ushr v19.4s,v22.4s,#24 - eor w17,w17,w6 - sli v3.4s,v20.4s,#8 - eor w19,w19,w7 - sli v7.4s,v21.4s,#8 - eor w20,w20,w8 - sli v19.4s,v22.4s,#8 - ror w21,w21,#24 - add v2.4s,v2.4s,v3.4s - ror w17,w17,#24 - add 
v6.4s,v6.4s,v7.4s - ror w19,w19,#24 - add v18.4s,v18.4s,v19.4s - ror w20,w20,#24 - eor v20.16b,v1.16b,v2.16b - add w15,w15,w21 - eor v21.16b,v5.16b,v6.16b - add w16,w16,w17 - eor v22.16b,v17.16b,v18.16b - add w13,w13,w19 - ushr v1.4s,v20.4s,#25 - add w14,w14,w20 - ushr v5.4s,v21.4s,#25 - eor w10,w10,w15 - ushr v17.4s,v22.4s,#25 - eor w11,w11,w16 - sli v1.4s,v20.4s,#7 - eor w12,w12,w13 - sli v5.4s,v21.4s,#7 - eor w9,w9,w14 - sli v17.4s,v22.4s,#7 - ror w10,w10,#25 - ext v2.16b,v2.16b,v2.16b,#8 - ror w11,w11,#25 - ext v6.16b,v6.16b,v6.16b,#8 - ror w12,w12,#25 - ext v18.16b,v18.16b,v18.16b,#8 - ror w9,w9,#25 - ext v3.16b,v3.16b,v3.16b,#4 - ext v7.16b,v7.16b,v7.16b,#4 - ext v19.16b,v19.16b,v19.16b,#4 - ext v1.16b,v1.16b,v1.16b,#12 - ext v5.16b,v5.16b,v5.16b,#12 - ext v17.16b,v17.16b,v17.16b,#12 - cbnz x4,.Loop_neon - - add w5,w5,w22 // accumulate key block - add v0.4s,v0.4s,v24.4s - add x6,x6,x22,lsr#32 - add v4.4s,v4.4s,v24.4s - add w7,w7,w23 - add v16.4s,v16.4s,v24.4s - add x8,x8,x23,lsr#32 - add v2.4s,v2.4s,v26.4s - add w9,w9,w24 - add v6.4s,v6.4s,v26.4s - add x10,x10,x24,lsr#32 - add v18.4s,v18.4s,v26.4s - add w11,w11,w25 - add v3.4s,v3.4s,v27.4s - add x12,x12,x25,lsr#32 - add w13,w13,w26 - add v7.4s,v7.4s,v28.4s - add x14,x14,x26,lsr#32 - add w15,w15,w27 - add v19.4s,v19.4s,v29.4s - add x16,x16,x27,lsr#32 - add w17,w17,w28 - add v1.4s,v1.4s,v25.4s - add x19,x19,x28,lsr#32 - add w20,w20,w30 - add v5.4s,v5.4s,v25.4s - add x21,x21,x30,lsr#32 - add v17.4s,v17.4s,v25.4s - - b.lo .Ltail_neon - - add x5,x5,x6,lsl#32 // pack - add x7,x7,x8,lsl#32 - ldp x6,x8,[x1,#0] // load input - add x9,x9,x10,lsl#32 - add x11,x11,x12,lsl#32 - ldp x10,x12,[x1,#16] - add x13,x13,x14,lsl#32 - add x15,x15,x16,lsl#32 - ldp x14,x16,[x1,#32] - add x17,x17,x19,lsl#32 - add x20,x20,x21,lsl#32 - ldp x19,x21,[x1,#48] - add x1,x1,#64 -#ifdef __AARCH64EB__ - rev x5,x5 - rev x7,x7 - rev x9,x9 - rev x11,x11 - rev x13,x13 - rev x15,x15 - rev x17,x17 - rev x20,x20 -#endif - ld1 
{v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 - eor x5,x5,x6 - eor x7,x7,x8 - eor x9,x9,x10 - eor x11,x11,x12 - eor x13,x13,x14 - eor v0.16b,v0.16b,v20.16b - eor x15,x15,x16 - eor v1.16b,v1.16b,v21.16b - eor x17,x17,x19 - eor v2.16b,v2.16b,v22.16b - eor x20,x20,x21 - eor v3.16b,v3.16b,v23.16b - ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 - - stp x5,x7,[x0,#0] // store output - add x28,x28,#4 // increment counter - stp x9,x11,[x0,#16] - add v27.4s,v27.4s,v31.4s // += 4 - stp x13,x15,[x0,#32] - add v28.4s,v28.4s,v31.4s - stp x17,x20,[x0,#48] - add v29.4s,v29.4s,v31.4s - add x0,x0,#64 - - st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 - ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 - - eor v4.16b,v4.16b,v20.16b - eor v5.16b,v5.16b,v21.16b - eor v6.16b,v6.16b,v22.16b - eor v7.16b,v7.16b,v23.16b - st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 - - eor v16.16b,v16.16b,v0.16b - eor v17.16b,v17.16b,v1.16b - eor v18.16b,v18.16b,v2.16b - eor v19.16b,v19.16b,v3.16b - st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 - - b.hi .Loop_outer_neon - - ldp x19,x20,[x29,#16] - add sp,sp,#64 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 - AARCH64_VALIDATE_LINK_REGISTER - ret - -.Ltail_neon: - add x2,x2,#256 - cmp x2,#64 - b.lo .Less_than_64 - - add x5,x5,x6,lsl#32 // pack - add x7,x7,x8,lsl#32 - ldp x6,x8,[x1,#0] // load input - add x9,x9,x10,lsl#32 - add x11,x11,x12,lsl#32 - ldp x10,x12,[x1,#16] - add x13,x13,x14,lsl#32 - add x15,x15,x16,lsl#32 - ldp x14,x16,[x1,#32] - add x17,x17,x19,lsl#32 - add x20,x20,x21,lsl#32 - ldp x19,x21,[x1,#48] - add x1,x1,#64 -#ifdef __AARCH64EB__ - rev x5,x5 - rev x7,x7 - rev x9,x9 - rev x11,x11 - rev x13,x13 - rev x15,x15 - rev x17,x17 - rev x20,x20 -#endif - eor x5,x5,x6 - eor x7,x7,x8 - eor x9,x9,x10 - eor x11,x11,x12 - eor x13,x13,x14 - eor x15,x15,x16 - eor x17,x17,x19 - eor x20,x20,x21 - - stp x5,x7,[x0,#0] // store output - add x28,x28,#4 // increment counter - stp x9,x11,[x0,#16] - stp 
x13,x15,[x0,#32] - stp x17,x20,[x0,#48] - add x0,x0,#64 - b.eq .Ldone_neon - sub x2,x2,#64 - cmp x2,#64 - b.lo .Less_than_128 - - ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 - eor v0.16b,v0.16b,v20.16b - eor v1.16b,v1.16b,v21.16b - eor v2.16b,v2.16b,v22.16b - eor v3.16b,v3.16b,v23.16b - st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 - b.eq .Ldone_neon - sub x2,x2,#64 - cmp x2,#64 - b.lo .Less_than_192 - - ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 - eor v4.16b,v4.16b,v20.16b - eor v5.16b,v5.16b,v21.16b - eor v6.16b,v6.16b,v22.16b - eor v7.16b,v7.16b,v23.16b - st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 - b.eq .Ldone_neon - sub x2,x2,#64 - - st1 {v16.16b,v17.16b,v18.16b,v19.16b},[sp] - b .Last_neon - -.Less_than_128: - st1 {v0.16b,v1.16b,v2.16b,v3.16b},[sp] - b .Last_neon -.Less_than_192: - st1 {v4.16b,v5.16b,v6.16b,v7.16b},[sp] - b .Last_neon - -.align 4 -.Last_neon: - sub x0,x0,#1 - add x1,x1,x2 - add x0,x0,x2 - add x4,sp,x2 - neg x2,x2 - -.Loop_tail_neon: - ldrb w10,[x1,x2] - ldrb w11,[x4,x2] - add x2,x2,#1 - eor w10,w10,w11 - strb w10,[x0,x2] - cbnz x2,.Loop_tail_neon - - stp xzr,xzr,[sp,#0] - stp xzr,xzr,[sp,#16] - stp xzr,xzr,[sp,#32] - stp xzr,xzr,[sp,#48] - -.Ldone_neon: - ldp x19,x20,[x29,#16] - add sp,sp,#64 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 - AARCH64_VALIDATE_LINK_REGISTER - ret -.size ChaCha20_neon,.-ChaCha20_neon -.type ChaCha20_512_neon,%function -.align 5 -ChaCha20_512_neon: - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-96]! 
- add x29,sp,#0 - - adrp x5,.Lsigma - add x5,x5,:lo12:.Lsigma - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - -.L512_or_more_neon: - sub sp,sp,#128+64 - - ldp x22,x23,[x5] // load sigma - ld1 {v24.4s},[x5],#16 - ldp x24,x25,[x3] // load key - ldp x26,x27,[x3,#16] - ld1 {v25.4s,v26.4s},[x3] - ldp x28,x30,[x4] // load counter - ld1 {v27.4s},[x4] - ld1 {v31.4s},[x5] -#ifdef __AARCH64EB__ - rev64 v24.4s,v24.4s - ror x24,x24,#32 - ror x25,x25,#32 - ror x26,x26,#32 - ror x27,x27,#32 - ror x28,x28,#32 - ror x30,x30,#32 -#endif - add v27.4s,v27.4s,v31.4s // += 1 - stp q24,q25,[sp,#0] // off-load key block, invariant part - add v27.4s,v27.4s,v31.4s // not typo - str q26,[sp,#32] - add v28.4s,v27.4s,v31.4s - add v29.4s,v28.4s,v31.4s - add v30.4s,v29.4s,v31.4s - shl v31.4s,v31.4s,#2 // 1 -> 4 - - stp d8,d9,[sp,#128+0] // meet ABI requirements - stp d10,d11,[sp,#128+16] - stp d12,d13,[sp,#128+32] - stp d14,d15,[sp,#128+48] - - sub x2,x2,#512 // not typo - -.Loop_outer_512_neon: - mov v0.16b,v24.16b - mov v4.16b,v24.16b - mov v8.16b,v24.16b - mov v12.16b,v24.16b - mov v16.16b,v24.16b - mov v20.16b,v24.16b - mov v1.16b,v25.16b - mov w5,w22 // unpack key block - mov v5.16b,v25.16b - lsr x6,x22,#32 - mov v9.16b,v25.16b - mov w7,w23 - mov v13.16b,v25.16b - lsr x8,x23,#32 - mov v17.16b,v25.16b - mov w9,w24 - mov v21.16b,v25.16b - lsr x10,x24,#32 - mov v3.16b,v27.16b - mov w11,w25 - mov v7.16b,v28.16b - lsr x12,x25,#32 - mov v11.16b,v29.16b - mov w13,w26 - mov v15.16b,v30.16b - lsr x14,x26,#32 - mov v2.16b,v26.16b - mov w15,w27 - mov v6.16b,v26.16b - lsr x16,x27,#32 - add v19.4s,v3.4s,v31.4s // +4 - mov w17,w28 - add v23.4s,v7.4s,v31.4s // +4 - lsr x19,x28,#32 - mov v10.16b,v26.16b - mov w20,w30 - mov v14.16b,v26.16b - lsr x21,x30,#32 - mov v18.16b,v26.16b - stp q27,q28,[sp,#48] // off-load key block, variable part - mov v22.16b,v26.16b - str q29,[sp,#80] - - mov x4,#5 - subs x2,x2,#512 -.Loop_upper_neon: - sub 
x4,x4,#1 - add v0.4s,v0.4s,v1.4s - add w5,w5,w9 - add v4.4s,v4.4s,v5.4s - add w6,w6,w10 - add v8.4s,v8.4s,v9.4s - add w7,w7,w11 - add v12.4s,v12.4s,v13.4s - add w8,w8,w12 - add v16.4s,v16.4s,v17.4s - eor w17,w17,w5 - add v20.4s,v20.4s,v21.4s - eor w19,w19,w6 - eor v3.16b,v3.16b,v0.16b - eor w20,w20,w7 - eor v7.16b,v7.16b,v4.16b - eor w21,w21,w8 - eor v11.16b,v11.16b,v8.16b - ror w17,w17,#16 - eor v15.16b,v15.16b,v12.16b - ror w19,w19,#16 - eor v19.16b,v19.16b,v16.16b - ror w20,w20,#16 - eor v23.16b,v23.16b,v20.16b - ror w21,w21,#16 - rev32 v3.8h,v3.8h - add w13,w13,w17 - rev32 v7.8h,v7.8h - add w14,w14,w19 - rev32 v11.8h,v11.8h - add w15,w15,w20 - rev32 v15.8h,v15.8h - add w16,w16,w21 - rev32 v19.8h,v19.8h - eor w9,w9,w13 - rev32 v23.8h,v23.8h - eor w10,w10,w14 - add v2.4s,v2.4s,v3.4s - eor w11,w11,w15 - add v6.4s,v6.4s,v7.4s - eor w12,w12,w16 - add v10.4s,v10.4s,v11.4s - ror w9,w9,#20 - add v14.4s,v14.4s,v15.4s - ror w10,w10,#20 - add v18.4s,v18.4s,v19.4s - ror w11,w11,#20 - add v22.4s,v22.4s,v23.4s - ror w12,w12,#20 - eor v24.16b,v1.16b,v2.16b - add w5,w5,w9 - eor v25.16b,v5.16b,v6.16b - add w6,w6,w10 - eor v26.16b,v9.16b,v10.16b - add w7,w7,w11 - eor v27.16b,v13.16b,v14.16b - add w8,w8,w12 - eor v28.16b,v17.16b,v18.16b - eor w17,w17,w5 - eor v29.16b,v21.16b,v22.16b - eor w19,w19,w6 - ushr v1.4s,v24.4s,#20 - eor w20,w20,w7 - ushr v5.4s,v25.4s,#20 - eor w21,w21,w8 - ushr v9.4s,v26.4s,#20 - ror w17,w17,#24 - ushr v13.4s,v27.4s,#20 - ror w19,w19,#24 - ushr v17.4s,v28.4s,#20 - ror w20,w20,#24 - ushr v21.4s,v29.4s,#20 - ror w21,w21,#24 - sli v1.4s,v24.4s,#12 - add w13,w13,w17 - sli v5.4s,v25.4s,#12 - add w14,w14,w19 - sli v9.4s,v26.4s,#12 - add w15,w15,w20 - sli v13.4s,v27.4s,#12 - add w16,w16,w21 - sli v17.4s,v28.4s,#12 - eor w9,w9,w13 - sli v21.4s,v29.4s,#12 - eor w10,w10,w14 - add v0.4s,v0.4s,v1.4s - eor w11,w11,w15 - add v4.4s,v4.4s,v5.4s - eor w12,w12,w16 - add v8.4s,v8.4s,v9.4s - ror w9,w9,#25 - add v12.4s,v12.4s,v13.4s - ror w10,w10,#25 - add 
v16.4s,v16.4s,v17.4s - ror w11,w11,#25 - add v20.4s,v20.4s,v21.4s - ror w12,w12,#25 - eor v24.16b,v3.16b,v0.16b - add w5,w5,w10 - eor v25.16b,v7.16b,v4.16b - add w6,w6,w11 - eor v26.16b,v11.16b,v8.16b - add w7,w7,w12 - eor v27.16b,v15.16b,v12.16b - add w8,w8,w9 - eor v28.16b,v19.16b,v16.16b - eor w21,w21,w5 - eor v29.16b,v23.16b,v20.16b - eor w17,w17,w6 - ushr v3.4s,v24.4s,#24 - eor w19,w19,w7 - ushr v7.4s,v25.4s,#24 - eor w20,w20,w8 - ushr v11.4s,v26.4s,#24 - ror w21,w21,#16 - ushr v15.4s,v27.4s,#24 - ror w17,w17,#16 - ushr v19.4s,v28.4s,#24 - ror w19,w19,#16 - ushr v23.4s,v29.4s,#24 - ror w20,w20,#16 - sli v3.4s,v24.4s,#8 - add w15,w15,w21 - sli v7.4s,v25.4s,#8 - add w16,w16,w17 - sli v11.4s,v26.4s,#8 - add w13,w13,w19 - sli v15.4s,v27.4s,#8 - add w14,w14,w20 - sli v19.4s,v28.4s,#8 - eor w10,w10,w15 - sli v23.4s,v29.4s,#8 - eor w11,w11,w16 - add v2.4s,v2.4s,v3.4s - eor w12,w12,w13 - add v6.4s,v6.4s,v7.4s - eor w9,w9,w14 - add v10.4s,v10.4s,v11.4s - ror w10,w10,#20 - add v14.4s,v14.4s,v15.4s - ror w11,w11,#20 - add v18.4s,v18.4s,v19.4s - ror w12,w12,#20 - add v22.4s,v22.4s,v23.4s - ror w9,w9,#20 - eor v24.16b,v1.16b,v2.16b - add w5,w5,w10 - eor v25.16b,v5.16b,v6.16b - add w6,w6,w11 - eor v26.16b,v9.16b,v10.16b - add w7,w7,w12 - eor v27.16b,v13.16b,v14.16b - add w8,w8,w9 - eor v28.16b,v17.16b,v18.16b - eor w21,w21,w5 - eor v29.16b,v21.16b,v22.16b - eor w17,w17,w6 - ushr v1.4s,v24.4s,#25 - eor w19,w19,w7 - ushr v5.4s,v25.4s,#25 - eor w20,w20,w8 - ushr v9.4s,v26.4s,#25 - ror w21,w21,#24 - ushr v13.4s,v27.4s,#25 - ror w17,w17,#24 - ushr v17.4s,v28.4s,#25 - ror w19,w19,#24 - ushr v21.4s,v29.4s,#25 - ror w20,w20,#24 - sli v1.4s,v24.4s,#7 - add w15,w15,w21 - sli v5.4s,v25.4s,#7 - add w16,w16,w17 - sli v9.4s,v26.4s,#7 - add w13,w13,w19 - sli v13.4s,v27.4s,#7 - add w14,w14,w20 - sli v17.4s,v28.4s,#7 - eor w10,w10,w15 - sli v21.4s,v29.4s,#7 - eor w11,w11,w16 - ext v2.16b,v2.16b,v2.16b,#8 - eor w12,w12,w13 - ext v6.16b,v6.16b,v6.16b,#8 - eor w9,w9,w14 - ext 
v10.16b,v10.16b,v10.16b,#8 - ror w10,w10,#25 - ext v14.16b,v14.16b,v14.16b,#8 - ror w11,w11,#25 - ext v18.16b,v18.16b,v18.16b,#8 - ror w12,w12,#25 - ext v22.16b,v22.16b,v22.16b,#8 - ror w9,w9,#25 - ext v3.16b,v3.16b,v3.16b,#12 - ext v7.16b,v7.16b,v7.16b,#12 - ext v11.16b,v11.16b,v11.16b,#12 - ext v15.16b,v15.16b,v15.16b,#12 - ext v19.16b,v19.16b,v19.16b,#12 - ext v23.16b,v23.16b,v23.16b,#12 - ext v1.16b,v1.16b,v1.16b,#4 - ext v5.16b,v5.16b,v5.16b,#4 - ext v9.16b,v9.16b,v9.16b,#4 - ext v13.16b,v13.16b,v13.16b,#4 - ext v17.16b,v17.16b,v17.16b,#4 - ext v21.16b,v21.16b,v21.16b,#4 - add v0.4s,v0.4s,v1.4s - add w5,w5,w9 - add v4.4s,v4.4s,v5.4s - add w6,w6,w10 - add v8.4s,v8.4s,v9.4s - add w7,w7,w11 - add v12.4s,v12.4s,v13.4s - add w8,w8,w12 - add v16.4s,v16.4s,v17.4s - eor w17,w17,w5 - add v20.4s,v20.4s,v21.4s - eor w19,w19,w6 - eor v3.16b,v3.16b,v0.16b - eor w20,w20,w7 - eor v7.16b,v7.16b,v4.16b - eor w21,w21,w8 - eor v11.16b,v11.16b,v8.16b - ror w17,w17,#16 - eor v15.16b,v15.16b,v12.16b - ror w19,w19,#16 - eor v19.16b,v19.16b,v16.16b - ror w20,w20,#16 - eor v23.16b,v23.16b,v20.16b - ror w21,w21,#16 - rev32 v3.8h,v3.8h - add w13,w13,w17 - rev32 v7.8h,v7.8h - add w14,w14,w19 - rev32 v11.8h,v11.8h - add w15,w15,w20 - rev32 v15.8h,v15.8h - add w16,w16,w21 - rev32 v19.8h,v19.8h - eor w9,w9,w13 - rev32 v23.8h,v23.8h - eor w10,w10,w14 - add v2.4s,v2.4s,v3.4s - eor w11,w11,w15 - add v6.4s,v6.4s,v7.4s - eor w12,w12,w16 - add v10.4s,v10.4s,v11.4s - ror w9,w9,#20 - add v14.4s,v14.4s,v15.4s - ror w10,w10,#20 - add v18.4s,v18.4s,v19.4s - ror w11,w11,#20 - add v22.4s,v22.4s,v23.4s - ror w12,w12,#20 - eor v24.16b,v1.16b,v2.16b - add w5,w5,w9 - eor v25.16b,v5.16b,v6.16b - add w6,w6,w10 - eor v26.16b,v9.16b,v10.16b - add w7,w7,w11 - eor v27.16b,v13.16b,v14.16b - add w8,w8,w12 - eor v28.16b,v17.16b,v18.16b - eor w17,w17,w5 - eor v29.16b,v21.16b,v22.16b - eor w19,w19,w6 - ushr v1.4s,v24.4s,#20 - eor w20,w20,w7 - ushr v5.4s,v25.4s,#20 - eor w21,w21,w8 - ushr v9.4s,v26.4s,#20 - ror 
w17,w17,#24 - ushr v13.4s,v27.4s,#20 - ror w19,w19,#24 - ushr v17.4s,v28.4s,#20 - ror w20,w20,#24 - ushr v21.4s,v29.4s,#20 - ror w21,w21,#24 - sli v1.4s,v24.4s,#12 - add w13,w13,w17 - sli v5.4s,v25.4s,#12 - add w14,w14,w19 - sli v9.4s,v26.4s,#12 - add w15,w15,w20 - sli v13.4s,v27.4s,#12 - add w16,w16,w21 - sli v17.4s,v28.4s,#12 - eor w9,w9,w13 - sli v21.4s,v29.4s,#12 - eor w10,w10,w14 - add v0.4s,v0.4s,v1.4s - eor w11,w11,w15 - add v4.4s,v4.4s,v5.4s - eor w12,w12,w16 - add v8.4s,v8.4s,v9.4s - ror w9,w9,#25 - add v12.4s,v12.4s,v13.4s - ror w10,w10,#25 - add v16.4s,v16.4s,v17.4s - ror w11,w11,#25 - add v20.4s,v20.4s,v21.4s - ror w12,w12,#25 - eor v24.16b,v3.16b,v0.16b - add w5,w5,w10 - eor v25.16b,v7.16b,v4.16b - add w6,w6,w11 - eor v26.16b,v11.16b,v8.16b - add w7,w7,w12 - eor v27.16b,v15.16b,v12.16b - add w8,w8,w9 - eor v28.16b,v19.16b,v16.16b - eor w21,w21,w5 - eor v29.16b,v23.16b,v20.16b - eor w17,w17,w6 - ushr v3.4s,v24.4s,#24 - eor w19,w19,w7 - ushr v7.4s,v25.4s,#24 - eor w20,w20,w8 - ushr v11.4s,v26.4s,#24 - ror w21,w21,#16 - ushr v15.4s,v27.4s,#24 - ror w17,w17,#16 - ushr v19.4s,v28.4s,#24 - ror w19,w19,#16 - ushr v23.4s,v29.4s,#24 - ror w20,w20,#16 - sli v3.4s,v24.4s,#8 - add w15,w15,w21 - sli v7.4s,v25.4s,#8 - add w16,w16,w17 - sli v11.4s,v26.4s,#8 - add w13,w13,w19 - sli v15.4s,v27.4s,#8 - add w14,w14,w20 - sli v19.4s,v28.4s,#8 - eor w10,w10,w15 - sli v23.4s,v29.4s,#8 - eor w11,w11,w16 - add v2.4s,v2.4s,v3.4s - eor w12,w12,w13 - add v6.4s,v6.4s,v7.4s - eor w9,w9,w14 - add v10.4s,v10.4s,v11.4s - ror w10,w10,#20 - add v14.4s,v14.4s,v15.4s - ror w11,w11,#20 - add v18.4s,v18.4s,v19.4s - ror w12,w12,#20 - add v22.4s,v22.4s,v23.4s - ror w9,w9,#20 - eor v24.16b,v1.16b,v2.16b - add w5,w5,w10 - eor v25.16b,v5.16b,v6.16b - add w6,w6,w11 - eor v26.16b,v9.16b,v10.16b - add w7,w7,w12 - eor v27.16b,v13.16b,v14.16b - add w8,w8,w9 - eor v28.16b,v17.16b,v18.16b - eor w21,w21,w5 - eor v29.16b,v21.16b,v22.16b - eor w17,w17,w6 - ushr v1.4s,v24.4s,#25 - eor w19,w19,w7 - ushr 
v5.4s,v25.4s,#25 - eor w20,w20,w8 - ushr v9.4s,v26.4s,#25 - ror w21,w21,#24 - ushr v13.4s,v27.4s,#25 - ror w17,w17,#24 - ushr v17.4s,v28.4s,#25 - ror w19,w19,#24 - ushr v21.4s,v29.4s,#25 - ror w20,w20,#24 - sli v1.4s,v24.4s,#7 - add w15,w15,w21 - sli v5.4s,v25.4s,#7 - add w16,w16,w17 - sli v9.4s,v26.4s,#7 - add w13,w13,w19 - sli v13.4s,v27.4s,#7 - add w14,w14,w20 - sli v17.4s,v28.4s,#7 - eor w10,w10,w15 - sli v21.4s,v29.4s,#7 - eor w11,w11,w16 - ext v2.16b,v2.16b,v2.16b,#8 - eor w12,w12,w13 - ext v6.16b,v6.16b,v6.16b,#8 - eor w9,w9,w14 - ext v10.16b,v10.16b,v10.16b,#8 - ror w10,w10,#25 - ext v14.16b,v14.16b,v14.16b,#8 - ror w11,w11,#25 - ext v18.16b,v18.16b,v18.16b,#8 - ror w12,w12,#25 - ext v22.16b,v22.16b,v22.16b,#8 - ror w9,w9,#25 - ext v3.16b,v3.16b,v3.16b,#4 - ext v7.16b,v7.16b,v7.16b,#4 - ext v11.16b,v11.16b,v11.16b,#4 - ext v15.16b,v15.16b,v15.16b,#4 - ext v19.16b,v19.16b,v19.16b,#4 - ext v23.16b,v23.16b,v23.16b,#4 - ext v1.16b,v1.16b,v1.16b,#12 - ext v5.16b,v5.16b,v5.16b,#12 - ext v9.16b,v9.16b,v9.16b,#12 - ext v13.16b,v13.16b,v13.16b,#12 - ext v17.16b,v17.16b,v17.16b,#12 - ext v21.16b,v21.16b,v21.16b,#12 - cbnz x4,.Loop_upper_neon - - add w5,w5,w22 // accumulate key block - add x6,x6,x22,lsr#32 - add w7,w7,w23 - add x8,x8,x23,lsr#32 - add w9,w9,w24 - add x10,x10,x24,lsr#32 - add w11,w11,w25 - add x12,x12,x25,lsr#32 - add w13,w13,w26 - add x14,x14,x26,lsr#32 - add w15,w15,w27 - add x16,x16,x27,lsr#32 - add w17,w17,w28 - add x19,x19,x28,lsr#32 - add w20,w20,w30 - add x21,x21,x30,lsr#32 - - add x5,x5,x6,lsl#32 // pack - add x7,x7,x8,lsl#32 - ldp x6,x8,[x1,#0] // load input - add x9,x9,x10,lsl#32 - add x11,x11,x12,lsl#32 - ldp x10,x12,[x1,#16] - add x13,x13,x14,lsl#32 - add x15,x15,x16,lsl#32 - ldp x14,x16,[x1,#32] - add x17,x17,x19,lsl#32 - add x20,x20,x21,lsl#32 - ldp x19,x21,[x1,#48] - add x1,x1,#64 -#ifdef __AARCH64EB__ - rev x5,x5 - rev x7,x7 - rev x9,x9 - rev x11,x11 - rev x13,x13 - rev x15,x15 - rev x17,x17 - rev x20,x20 -#endif - eor x5,x5,x6 - eor 
x7,x7,x8 - eor x9,x9,x10 - eor x11,x11,x12 - eor x13,x13,x14 - eor x15,x15,x16 - eor x17,x17,x19 - eor x20,x20,x21 - - stp x5,x7,[x0,#0] // store output - add x28,x28,#1 // increment counter - mov w5,w22 // unpack key block - lsr x6,x22,#32 - stp x9,x11,[x0,#16] - mov w7,w23 - lsr x8,x23,#32 - stp x13,x15,[x0,#32] - mov w9,w24 - lsr x10,x24,#32 - stp x17,x20,[x0,#48] - add x0,x0,#64 - mov w11,w25 - lsr x12,x25,#32 - mov w13,w26 - lsr x14,x26,#32 - mov w15,w27 - lsr x16,x27,#32 - mov w17,w28 - lsr x19,x28,#32 - mov w20,w30 - lsr x21,x30,#32 - - mov x4,#5 -.Loop_lower_neon: - sub x4,x4,#1 - add v0.4s,v0.4s,v1.4s - add w5,w5,w9 - add v4.4s,v4.4s,v5.4s - add w6,w6,w10 - add v8.4s,v8.4s,v9.4s - add w7,w7,w11 - add v12.4s,v12.4s,v13.4s - add w8,w8,w12 - add v16.4s,v16.4s,v17.4s - eor w17,w17,w5 - add v20.4s,v20.4s,v21.4s - eor w19,w19,w6 - eor v3.16b,v3.16b,v0.16b - eor w20,w20,w7 - eor v7.16b,v7.16b,v4.16b - eor w21,w21,w8 - eor v11.16b,v11.16b,v8.16b - ror w17,w17,#16 - eor v15.16b,v15.16b,v12.16b - ror w19,w19,#16 - eor v19.16b,v19.16b,v16.16b - ror w20,w20,#16 - eor v23.16b,v23.16b,v20.16b - ror w21,w21,#16 - rev32 v3.8h,v3.8h - add w13,w13,w17 - rev32 v7.8h,v7.8h - add w14,w14,w19 - rev32 v11.8h,v11.8h - add w15,w15,w20 - rev32 v15.8h,v15.8h - add w16,w16,w21 - rev32 v19.8h,v19.8h - eor w9,w9,w13 - rev32 v23.8h,v23.8h - eor w10,w10,w14 - add v2.4s,v2.4s,v3.4s - eor w11,w11,w15 - add v6.4s,v6.4s,v7.4s - eor w12,w12,w16 - add v10.4s,v10.4s,v11.4s - ror w9,w9,#20 - add v14.4s,v14.4s,v15.4s - ror w10,w10,#20 - add v18.4s,v18.4s,v19.4s - ror w11,w11,#20 - add v22.4s,v22.4s,v23.4s - ror w12,w12,#20 - eor v24.16b,v1.16b,v2.16b - add w5,w5,w9 - eor v25.16b,v5.16b,v6.16b - add w6,w6,w10 - eor v26.16b,v9.16b,v10.16b - add w7,w7,w11 - eor v27.16b,v13.16b,v14.16b - add w8,w8,w12 - eor v28.16b,v17.16b,v18.16b - eor w17,w17,w5 - eor v29.16b,v21.16b,v22.16b - eor w19,w19,w6 - ushr v1.4s,v24.4s,#20 - eor w20,w20,w7 - ushr v5.4s,v25.4s,#20 - eor w21,w21,w8 - ushr v9.4s,v26.4s,#20 - 
ror w17,w17,#24 - ushr v13.4s,v27.4s,#20 - ror w19,w19,#24 - ushr v17.4s,v28.4s,#20 - ror w20,w20,#24 - ushr v21.4s,v29.4s,#20 - ror w21,w21,#24 - sli v1.4s,v24.4s,#12 - add w13,w13,w17 - sli v5.4s,v25.4s,#12 - add w14,w14,w19 - sli v9.4s,v26.4s,#12 - add w15,w15,w20 - sli v13.4s,v27.4s,#12 - add w16,w16,w21 - sli v17.4s,v28.4s,#12 - eor w9,w9,w13 - sli v21.4s,v29.4s,#12 - eor w10,w10,w14 - add v0.4s,v0.4s,v1.4s - eor w11,w11,w15 - add v4.4s,v4.4s,v5.4s - eor w12,w12,w16 - add v8.4s,v8.4s,v9.4s - ror w9,w9,#25 - add v12.4s,v12.4s,v13.4s - ror w10,w10,#25 - add v16.4s,v16.4s,v17.4s - ror w11,w11,#25 - add v20.4s,v20.4s,v21.4s - ror w12,w12,#25 - eor v24.16b,v3.16b,v0.16b - add w5,w5,w10 - eor v25.16b,v7.16b,v4.16b - add w6,w6,w11 - eor v26.16b,v11.16b,v8.16b - add w7,w7,w12 - eor v27.16b,v15.16b,v12.16b - add w8,w8,w9 - eor v28.16b,v19.16b,v16.16b - eor w21,w21,w5 - eor v29.16b,v23.16b,v20.16b - eor w17,w17,w6 - ushr v3.4s,v24.4s,#24 - eor w19,w19,w7 - ushr v7.4s,v25.4s,#24 - eor w20,w20,w8 - ushr v11.4s,v26.4s,#24 - ror w21,w21,#16 - ushr v15.4s,v27.4s,#24 - ror w17,w17,#16 - ushr v19.4s,v28.4s,#24 - ror w19,w19,#16 - ushr v23.4s,v29.4s,#24 - ror w20,w20,#16 - sli v3.4s,v24.4s,#8 - add w15,w15,w21 - sli v7.4s,v25.4s,#8 - add w16,w16,w17 - sli v11.4s,v26.4s,#8 - add w13,w13,w19 - sli v15.4s,v27.4s,#8 - add w14,w14,w20 - sli v19.4s,v28.4s,#8 - eor w10,w10,w15 - sli v23.4s,v29.4s,#8 - eor w11,w11,w16 - add v2.4s,v2.4s,v3.4s - eor w12,w12,w13 - add v6.4s,v6.4s,v7.4s - eor w9,w9,w14 - add v10.4s,v10.4s,v11.4s - ror w10,w10,#20 - add v14.4s,v14.4s,v15.4s - ror w11,w11,#20 - add v18.4s,v18.4s,v19.4s - ror w12,w12,#20 - add v22.4s,v22.4s,v23.4s - ror w9,w9,#20 - eor v24.16b,v1.16b,v2.16b - add w5,w5,w10 - eor v25.16b,v5.16b,v6.16b - add w6,w6,w11 - eor v26.16b,v9.16b,v10.16b - add w7,w7,w12 - eor v27.16b,v13.16b,v14.16b - add w8,w8,w9 - eor v28.16b,v17.16b,v18.16b - eor w21,w21,w5 - eor v29.16b,v21.16b,v22.16b - eor w17,w17,w6 - ushr v1.4s,v24.4s,#25 - eor w19,w19,w7 - 
ushr v5.4s,v25.4s,#25 - eor w20,w20,w8 - ushr v9.4s,v26.4s,#25 - ror w21,w21,#24 - ushr v13.4s,v27.4s,#25 - ror w17,w17,#24 - ushr v17.4s,v28.4s,#25 - ror w19,w19,#24 - ushr v21.4s,v29.4s,#25 - ror w20,w20,#24 - sli v1.4s,v24.4s,#7 - add w15,w15,w21 - sli v5.4s,v25.4s,#7 - add w16,w16,w17 - sli v9.4s,v26.4s,#7 - add w13,w13,w19 - sli v13.4s,v27.4s,#7 - add w14,w14,w20 - sli v17.4s,v28.4s,#7 - eor w10,w10,w15 - sli v21.4s,v29.4s,#7 - eor w11,w11,w16 - ext v2.16b,v2.16b,v2.16b,#8 - eor w12,w12,w13 - ext v6.16b,v6.16b,v6.16b,#8 - eor w9,w9,w14 - ext v10.16b,v10.16b,v10.16b,#8 - ror w10,w10,#25 - ext v14.16b,v14.16b,v14.16b,#8 - ror w11,w11,#25 - ext v18.16b,v18.16b,v18.16b,#8 - ror w12,w12,#25 - ext v22.16b,v22.16b,v22.16b,#8 - ror w9,w9,#25 - ext v3.16b,v3.16b,v3.16b,#12 - ext v7.16b,v7.16b,v7.16b,#12 - ext v11.16b,v11.16b,v11.16b,#12 - ext v15.16b,v15.16b,v15.16b,#12 - ext v19.16b,v19.16b,v19.16b,#12 - ext v23.16b,v23.16b,v23.16b,#12 - ext v1.16b,v1.16b,v1.16b,#4 - ext v5.16b,v5.16b,v5.16b,#4 - ext v9.16b,v9.16b,v9.16b,#4 - ext v13.16b,v13.16b,v13.16b,#4 - ext v17.16b,v17.16b,v17.16b,#4 - ext v21.16b,v21.16b,v21.16b,#4 - add v0.4s,v0.4s,v1.4s - add w5,w5,w9 - add v4.4s,v4.4s,v5.4s - add w6,w6,w10 - add v8.4s,v8.4s,v9.4s - add w7,w7,w11 - add v12.4s,v12.4s,v13.4s - add w8,w8,w12 - add v16.4s,v16.4s,v17.4s - eor w17,w17,w5 - add v20.4s,v20.4s,v21.4s - eor w19,w19,w6 - eor v3.16b,v3.16b,v0.16b - eor w20,w20,w7 - eor v7.16b,v7.16b,v4.16b - eor w21,w21,w8 - eor v11.16b,v11.16b,v8.16b - ror w17,w17,#16 - eor v15.16b,v15.16b,v12.16b - ror w19,w19,#16 - eor v19.16b,v19.16b,v16.16b - ror w20,w20,#16 - eor v23.16b,v23.16b,v20.16b - ror w21,w21,#16 - rev32 v3.8h,v3.8h - add w13,w13,w17 - rev32 v7.8h,v7.8h - add w14,w14,w19 - rev32 v11.8h,v11.8h - add w15,w15,w20 - rev32 v15.8h,v15.8h - add w16,w16,w21 - rev32 v19.8h,v19.8h - eor w9,w9,w13 - rev32 v23.8h,v23.8h - eor w10,w10,w14 - add v2.4s,v2.4s,v3.4s - eor w11,w11,w15 - add v6.4s,v6.4s,v7.4s - eor w12,w12,w16 - add 
v10.4s,v10.4s,v11.4s - ror w9,w9,#20 - add v14.4s,v14.4s,v15.4s - ror w10,w10,#20 - add v18.4s,v18.4s,v19.4s - ror w11,w11,#20 - add v22.4s,v22.4s,v23.4s - ror w12,w12,#20 - eor v24.16b,v1.16b,v2.16b - add w5,w5,w9 - eor v25.16b,v5.16b,v6.16b - add w6,w6,w10 - eor v26.16b,v9.16b,v10.16b - add w7,w7,w11 - eor v27.16b,v13.16b,v14.16b - add w8,w8,w12 - eor v28.16b,v17.16b,v18.16b - eor w17,w17,w5 - eor v29.16b,v21.16b,v22.16b - eor w19,w19,w6 - ushr v1.4s,v24.4s,#20 - eor w20,w20,w7 - ushr v5.4s,v25.4s,#20 - eor w21,w21,w8 - ushr v9.4s,v26.4s,#20 - ror w17,w17,#24 - ushr v13.4s,v27.4s,#20 - ror w19,w19,#24 - ushr v17.4s,v28.4s,#20 - ror w20,w20,#24 - ushr v21.4s,v29.4s,#20 - ror w21,w21,#24 - sli v1.4s,v24.4s,#12 - add w13,w13,w17 - sli v5.4s,v25.4s,#12 - add w14,w14,w19 - sli v9.4s,v26.4s,#12 - add w15,w15,w20 - sli v13.4s,v27.4s,#12 - add w16,w16,w21 - sli v17.4s,v28.4s,#12 - eor w9,w9,w13 - sli v21.4s,v29.4s,#12 - eor w10,w10,w14 - add v0.4s,v0.4s,v1.4s - eor w11,w11,w15 - add v4.4s,v4.4s,v5.4s - eor w12,w12,w16 - add v8.4s,v8.4s,v9.4s - ror w9,w9,#25 - add v12.4s,v12.4s,v13.4s - ror w10,w10,#25 - add v16.4s,v16.4s,v17.4s - ror w11,w11,#25 - add v20.4s,v20.4s,v21.4s - ror w12,w12,#25 - eor v24.16b,v3.16b,v0.16b - add w5,w5,w10 - eor v25.16b,v7.16b,v4.16b - add w6,w6,w11 - eor v26.16b,v11.16b,v8.16b - add w7,w7,w12 - eor v27.16b,v15.16b,v12.16b - add w8,w8,w9 - eor v28.16b,v19.16b,v16.16b - eor w21,w21,w5 - eor v29.16b,v23.16b,v20.16b - eor w17,w17,w6 - ushr v3.4s,v24.4s,#24 - eor w19,w19,w7 - ushr v7.4s,v25.4s,#24 - eor w20,w20,w8 - ushr v11.4s,v26.4s,#24 - ror w21,w21,#16 - ushr v15.4s,v27.4s,#24 - ror w17,w17,#16 - ushr v19.4s,v28.4s,#24 - ror w19,w19,#16 - ushr v23.4s,v29.4s,#24 - ror w20,w20,#16 - sli v3.4s,v24.4s,#8 - add w15,w15,w21 - sli v7.4s,v25.4s,#8 - add w16,w16,w17 - sli v11.4s,v26.4s,#8 - add w13,w13,w19 - sli v15.4s,v27.4s,#8 - add w14,w14,w20 - sli v19.4s,v28.4s,#8 - eor w10,w10,w15 - sli v23.4s,v29.4s,#8 - eor w11,w11,w16 - add v2.4s,v2.4s,v3.4s - 
eor w12,w12,w13 - add v6.4s,v6.4s,v7.4s - eor w9,w9,w14 - add v10.4s,v10.4s,v11.4s - ror w10,w10,#20 - add v14.4s,v14.4s,v15.4s - ror w11,w11,#20 - add v18.4s,v18.4s,v19.4s - ror w12,w12,#20 - add v22.4s,v22.4s,v23.4s - ror w9,w9,#20 - eor v24.16b,v1.16b,v2.16b - add w5,w5,w10 - eor v25.16b,v5.16b,v6.16b - add w6,w6,w11 - eor v26.16b,v9.16b,v10.16b - add w7,w7,w12 - eor v27.16b,v13.16b,v14.16b - add w8,w8,w9 - eor v28.16b,v17.16b,v18.16b - eor w21,w21,w5 - eor v29.16b,v21.16b,v22.16b - eor w17,w17,w6 - ushr v1.4s,v24.4s,#25 - eor w19,w19,w7 - ushr v5.4s,v25.4s,#25 - eor w20,w20,w8 - ushr v9.4s,v26.4s,#25 - ror w21,w21,#24 - ushr v13.4s,v27.4s,#25 - ror w17,w17,#24 - ushr v17.4s,v28.4s,#25 - ror w19,w19,#24 - ushr v21.4s,v29.4s,#25 - ror w20,w20,#24 - sli v1.4s,v24.4s,#7 - add w15,w15,w21 - sli v5.4s,v25.4s,#7 - add w16,w16,w17 - sli v9.4s,v26.4s,#7 - add w13,w13,w19 - sli v13.4s,v27.4s,#7 - add w14,w14,w20 - sli v17.4s,v28.4s,#7 - eor w10,w10,w15 - sli v21.4s,v29.4s,#7 - eor w11,w11,w16 - ext v2.16b,v2.16b,v2.16b,#8 - eor w12,w12,w13 - ext v6.16b,v6.16b,v6.16b,#8 - eor w9,w9,w14 - ext v10.16b,v10.16b,v10.16b,#8 - ror w10,w10,#25 - ext v14.16b,v14.16b,v14.16b,#8 - ror w11,w11,#25 - ext v18.16b,v18.16b,v18.16b,#8 - ror w12,w12,#25 - ext v22.16b,v22.16b,v22.16b,#8 - ror w9,w9,#25 - ext v3.16b,v3.16b,v3.16b,#4 - ext v7.16b,v7.16b,v7.16b,#4 - ext v11.16b,v11.16b,v11.16b,#4 - ext v15.16b,v15.16b,v15.16b,#4 - ext v19.16b,v19.16b,v19.16b,#4 - ext v23.16b,v23.16b,v23.16b,#4 - ext v1.16b,v1.16b,v1.16b,#12 - ext v5.16b,v5.16b,v5.16b,#12 - ext v9.16b,v9.16b,v9.16b,#12 - ext v13.16b,v13.16b,v13.16b,#12 - ext v17.16b,v17.16b,v17.16b,#12 - ext v21.16b,v21.16b,v21.16b,#12 - cbnz x4,.Loop_lower_neon - - add w5,w5,w22 // accumulate key block - ldp q24,q25,[sp,#0] - add x6,x6,x22,lsr#32 - ldp q26,q27,[sp,#32] - add w7,w7,w23 - ldp q28,q29,[sp,#64] - add x8,x8,x23,lsr#32 - add v0.4s,v0.4s,v24.4s - add w9,w9,w24 - add v4.4s,v4.4s,v24.4s - add x10,x10,x24,lsr#32 - add v8.4s,v8.4s,v24.4s 
- add w11,w11,w25 - add v12.4s,v12.4s,v24.4s - add x12,x12,x25,lsr#32 - add v16.4s,v16.4s,v24.4s - add w13,w13,w26 - add v20.4s,v20.4s,v24.4s - add x14,x14,x26,lsr#32 - add v2.4s,v2.4s,v26.4s - add w15,w15,w27 - add v6.4s,v6.4s,v26.4s - add x16,x16,x27,lsr#32 - add v10.4s,v10.4s,v26.4s - add w17,w17,w28 - add v14.4s,v14.4s,v26.4s - add x19,x19,x28,lsr#32 - add v18.4s,v18.4s,v26.4s - add w20,w20,w30 - add v22.4s,v22.4s,v26.4s - add x21,x21,x30,lsr#32 - add v19.4s,v19.4s,v31.4s // +4 - add x5,x5,x6,lsl#32 // pack - add v23.4s,v23.4s,v31.4s // +4 - add x7,x7,x8,lsl#32 - add v3.4s,v3.4s,v27.4s - ldp x6,x8,[x1,#0] // load input - add v7.4s,v7.4s,v28.4s - add x9,x9,x10,lsl#32 - add v11.4s,v11.4s,v29.4s - add x11,x11,x12,lsl#32 - add v15.4s,v15.4s,v30.4s - ldp x10,x12,[x1,#16] - add v19.4s,v19.4s,v27.4s - add x13,x13,x14,lsl#32 - add v23.4s,v23.4s,v28.4s - add x15,x15,x16,lsl#32 - add v1.4s,v1.4s,v25.4s - ldp x14,x16,[x1,#32] - add v5.4s,v5.4s,v25.4s - add x17,x17,x19,lsl#32 - add v9.4s,v9.4s,v25.4s - add x20,x20,x21,lsl#32 - add v13.4s,v13.4s,v25.4s - ldp x19,x21,[x1,#48] - add v17.4s,v17.4s,v25.4s - add x1,x1,#64 - add v21.4s,v21.4s,v25.4s - -#ifdef __AARCH64EB__ - rev x5,x5 - rev x7,x7 - rev x9,x9 - rev x11,x11 - rev x13,x13 - rev x15,x15 - rev x17,x17 - rev x20,x20 -#endif - ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 - eor x5,x5,x6 - eor x7,x7,x8 - eor x9,x9,x10 - eor x11,x11,x12 - eor x13,x13,x14 - eor v0.16b,v0.16b,v24.16b - eor x15,x15,x16 - eor v1.16b,v1.16b,v25.16b - eor x17,x17,x19 - eor v2.16b,v2.16b,v26.16b - eor x20,x20,x21 - eor v3.16b,v3.16b,v27.16b - ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x1],#64 - - stp x5,x7,[x0,#0] // store output - add x28,x28,#7 // increment counter - stp x9,x11,[x0,#16] - stp x13,x15,[x0,#32] - stp x17,x20,[x0,#48] - add x0,x0,#64 - st1 {v0.16b,v1.16b,v2.16b,v3.16b},[x0],#64 - - ld1 {v0.16b,v1.16b,v2.16b,v3.16b},[x1],#64 - eor v4.16b,v4.16b,v24.16b - eor v5.16b,v5.16b,v25.16b - eor v6.16b,v6.16b,v26.16b - eor 
v7.16b,v7.16b,v27.16b - st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64 - - ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 - eor v8.16b,v8.16b,v0.16b - ldp q24,q25,[sp,#0] - eor v9.16b,v9.16b,v1.16b - ldp q26,q27,[sp,#32] - eor v10.16b,v10.16b,v2.16b - eor v11.16b,v11.16b,v3.16b - st1 {v8.16b,v9.16b,v10.16b,v11.16b},[x0],#64 - - ld1 {v8.16b,v9.16b,v10.16b,v11.16b},[x1],#64 - eor v12.16b,v12.16b,v4.16b - eor v13.16b,v13.16b,v5.16b - eor v14.16b,v14.16b,v6.16b - eor v15.16b,v15.16b,v7.16b - st1 {v12.16b,v13.16b,v14.16b,v15.16b},[x0],#64 - - ld1 {v12.16b,v13.16b,v14.16b,v15.16b},[x1],#64 - eor v16.16b,v16.16b,v8.16b - eor v17.16b,v17.16b,v9.16b - eor v18.16b,v18.16b,v10.16b - eor v19.16b,v19.16b,v11.16b - st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64 - - shl v0.4s,v31.4s,#1 // 4 -> 8 - eor v20.16b,v20.16b,v12.16b - eor v21.16b,v21.16b,v13.16b - eor v22.16b,v22.16b,v14.16b - eor v23.16b,v23.16b,v15.16b - st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64 - - add v27.4s,v27.4s,v0.4s // += 8 - add v28.4s,v28.4s,v0.4s - add v29.4s,v29.4s,v0.4s - add v30.4s,v30.4s,v0.4s - - b.hs .Loop_outer_512_neon - - adds x2,x2,#512 - ushr v0.4s,v31.4s,#2 // 4 -> 1 - - ldp d8,d9,[sp,#128+0] // meet ABI requirements - ldp d10,d11,[sp,#128+16] - ldp d12,d13,[sp,#128+32] - ldp d14,d15,[sp,#128+48] - - stp q24,q31,[sp,#0] // wipe off-load area - stp q24,q31,[sp,#32] - stp q24,q31,[sp,#64] - - b.eq .Ldone_512_neon - - cmp x2,#192 - sub v27.4s,v27.4s,v0.4s // -= 1 - sub v28.4s,v28.4s,v0.4s - sub v29.4s,v29.4s,v0.4s - add sp,sp,#128 - b.hs .Loop_outer_neon - - eor v25.16b,v25.16b,v25.16b - eor v26.16b,v26.16b,v26.16b - eor v27.16b,v27.16b,v27.16b - eor v28.16b,v28.16b,v28.16b - eor v29.16b,v29.16b,v29.16b - eor v30.16b,v30.16b,v30.16b - b .Loop_outer - -.Ldone_512_neon: - ldp x19,x20,[x29,#16] - add sp,sp,#128+64 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 - AARCH64_VALIDATE_LINK_REGISTER - ret -.size 
ChaCha20_512_neon,.-ChaCha20_512_neon -#endif -#endif // !OPENSSL_NO_ASM -.section .note.GNU-stack,"",%progbits diff --git a/third_party/boringssl/linux-aarch64/crypto/cipher_extra/chacha20_poly1305_armv8.S b/third_party/boringssl/linux-aarch64/crypto/cipher_extra/chacha20_poly1305_armv8.S deleted file mode 100644 index 69e1296b..00000000 --- a/third_party/boringssl/linux-aarch64/crypto/cipher_extra/chacha20_poly1305_armv8.S +++ /dev/null @@ -1,3020 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(__aarch64__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -#include -.section .rodata - -.align 7 -.Lchacha20_consts: -.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' -.Linc: -.long 1,2,3,4 -.Lrol8: -.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 -.Lclamp: -.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC - -.text - -.type .Lpoly_hash_ad_internal,%function -.align 6 -.Lpoly_hash_ad_internal: -.cfi_startproc - cbnz x4, .Lpoly_hash_intro - ret - -.Lpoly_hash_intro: - cmp x4, #16 - b.lt .Lpoly_hash_ad_tail - ldp x11, x12, [x3], 16 - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 
// No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - sub x4, x4, #16 - b .Lpoly_hash_ad_internal - -.Lpoly_hash_ad_tail: - cbz x4, .Lpoly_hash_ad_ret - - eor v20.16b, v20.16b, v20.16b // Use T0 to load the AAD - sub x4, x4, #1 - -.Lpoly_hash_tail_16_compose: - ext v20.16b, v20.16b, v20.16b, #15 - ldrb w11, [x3, x4] - mov v20.b[0], w11 - subs x4, x4, #1 - b.ge .Lpoly_hash_tail_16_compose - mov x11, v20.d[0] - mov x12, v20.d[1] - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - -.Lpoly_hash_ad_ret: - ret -.cfi_endproc -.size .Lpoly_hash_ad_internal, .-.Lpoly_hash_ad_internal - -///////////////////////////////// -// -// void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *seal_data); -// -.globl chacha20_poly1305_seal -.hidden chacha20_poly1305_seal -.type chacha20_poly1305_seal,%function -.align 6 -chacha20_poly1305_seal: - AARCH64_SIGN_LINK_REGISTER -.cfi_startproc - stp x29, x30, [sp, #-80]! 
-.cfi_def_cfa_offset 80 -.cfi_offset w30, -72 -.cfi_offset w29, -80 - mov x29, sp - // We probably could do .cfi_def_cfa w29, 80 at this point, but since - // we don't actually use the frame pointer like that, it's probably not - // worth bothering. - stp d8, d9, [sp, #16] - stp d10, d11, [sp, #32] - stp d12, d13, [sp, #48] - stp d14, d15, [sp, #64] -.cfi_offset b15, -8 -.cfi_offset b14, -16 -.cfi_offset b13, -24 -.cfi_offset b12, -32 -.cfi_offset b11, -40 -.cfi_offset b10, -48 -.cfi_offset b9, -56 -.cfi_offset b8, -64 - - adrp x11, .Lchacha20_consts - add x11, x11, :lo12:.Lchacha20_consts - - ld1 {v24.16b - v27.16b}, [x11] // .Load the CONSTS, INC, ROL8 and CLAMP values - ld1 {v28.16b - v30.16b}, [x5] - - mov x15, #1 // Prepare the Poly1305 state - mov x8, #0 - mov x9, #0 - mov x10, #0 - - ldr x12, [x5, #56] // The total cipher text length includes extra_in_len - add x12, x12, x2 - mov v31.d[0], x4 // Store the input and aad lengths - mov v31.d[1], x12 - - cmp x2, #128 - b.le .Lseal_128 // Optimization for smaller buffers - - // Initially we prepare 5 ChaCha20 blocks. Four to encrypt up to 4 blocks (256 bytes) of plaintext, - // and one for the Poly1305 R and S keys. The first four blocks (A0-A3..D0-D3) are computed vertically, - // the fifth block (A4-D4) horizontally. 
- ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] - mov v4.16b, v24.16b - - ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 - mov v9.16b, v28.16b - - ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 - mov v14.16b, v29.16b - - ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] - add v15.4s, v15.4s, v25.4s - mov v19.16b, v30.16b - - sub x5, x5, #32 - - mov x6, #10 - -.align 5 -.Lseal_init_rounds: - add v0.4s, v0.4s, v5.4s - add v1.4s, v1.4s, v6.4s - add v2.4s, v2.4s, v7.4s - add v3.4s, v3.4s, v8.4s - add v4.4s, v4.4s, v9.4s - - eor v15.16b, v15.16b, v0.16b - eor v16.16b, v16.16b, v1.16b - eor v17.16b, v17.16b, v2.16b - eor v18.16b, v18.16b, v3.16b - eor v19.16b, v19.16b, v4.16b - - rev32 v15.8h, v15.8h - rev32 v16.8h, v16.8h - rev32 v17.8h, v17.8h - rev32 v18.8h, v18.8h - rev32 v19.8h, v19.8h - - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v16.4s - add v12.4s, v12.4s, v17.4s - add v13.4s, v13.4s, v18.4s - add v14.4s, v14.4s, v19.4s - - eor v5.16b, v5.16b, v10.16b - eor v6.16b, v6.16b, v11.16b - eor v7.16b, v7.16b, v12.16b - eor v8.16b, v8.16b, v13.16b - eor v9.16b, v9.16b, v14.16b - - ushr v20.4s, v5.4s, #20 - sli v20.4s, v5.4s, #12 - ushr v5.4s, v6.4s, #20 - sli v5.4s, v6.4s, #12 - ushr v6.4s, v7.4s, #20 - sli v6.4s, v7.4s, #12 - ushr v7.4s, v8.4s, #20 - sli v7.4s, v8.4s, #12 - ushr v8.4s, v9.4s, #20 - sli v8.4s, v9.4s, #12 - - add v0.4s, v0.4s, v20.4s - add v1.4s, v1.4s, v5.4s - add v2.4s, v2.4s, v6.4s - add v3.4s, v3.4s, v7.4s - add v4.4s, v4.4s, v8.4s - - eor v15.16b, v15.16b, v0.16b - eor v16.16b, v16.16b, v1.16b - eor v17.16b, v17.16b, v2.16b - eor v18.16b, v18.16b, v3.16b - eor v19.16b, v19.16b, v4.16b - - tbl v15.16b, {v15.16b}, v26.16b - tbl v16.16b, {v16.16b}, v26.16b - tbl v17.16b, {v17.16b}, v26.16b - tbl v18.16b, {v18.16b}, v26.16b - tbl v19.16b, {v19.16b}, v26.16b - - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v16.4s - add v12.4s, v12.4s, v17.4s - add v13.4s, v13.4s, v18.4s - add v14.4s, v14.4s, v19.4s - - eor v20.16b, v20.16b, v10.16b - eor v5.16b, v5.16b, v11.16b - eor 
v6.16b, v6.16b, v12.16b - eor v7.16b, v7.16b, v13.16b - eor v8.16b, v8.16b, v14.16b - - ushr v9.4s, v8.4s, #25 - sli v9.4s, v8.4s, #7 - ushr v8.4s, v7.4s, #25 - sli v8.4s, v7.4s, #7 - ushr v7.4s, v6.4s, #25 - sli v7.4s, v6.4s, #7 - ushr v6.4s, v5.4s, #25 - sli v6.4s, v5.4s, #7 - ushr v5.4s, v20.4s, #25 - sli v5.4s, v20.4s, #7 - - ext v9.16b, v9.16b, v9.16b, #4 - ext v14.16b, v14.16b, v14.16b, #8 - ext v19.16b, v19.16b, v19.16b, #12 - add v0.4s, v0.4s, v6.4s - add v1.4s, v1.4s, v7.4s - add v2.4s, v2.4s, v8.4s - add v3.4s, v3.4s, v5.4s - add v4.4s, v4.4s, v9.4s - - eor v18.16b, v18.16b, v0.16b - eor v15.16b, v15.16b, v1.16b - eor v16.16b, v16.16b, v2.16b - eor v17.16b, v17.16b, v3.16b - eor v19.16b, v19.16b, v4.16b - - rev32 v18.8h, v18.8h - rev32 v15.8h, v15.8h - rev32 v16.8h, v16.8h - rev32 v17.8h, v17.8h - rev32 v19.8h, v19.8h - - add v12.4s, v12.4s, v18.4s - add v13.4s, v13.4s, v15.4s - add v10.4s, v10.4s, v16.4s - add v11.4s, v11.4s, v17.4s - add v14.4s, v14.4s, v19.4s - - eor v6.16b, v6.16b, v12.16b - eor v7.16b, v7.16b, v13.16b - eor v8.16b, v8.16b, v10.16b - eor v5.16b, v5.16b, v11.16b - eor v9.16b, v9.16b, v14.16b - - ushr v20.4s, v6.4s, #20 - sli v20.4s, v6.4s, #12 - ushr v6.4s, v7.4s, #20 - sli v6.4s, v7.4s, #12 - ushr v7.4s, v8.4s, #20 - sli v7.4s, v8.4s, #12 - ushr v8.4s, v5.4s, #20 - sli v8.4s, v5.4s, #12 - ushr v5.4s, v9.4s, #20 - sli v5.4s, v9.4s, #12 - - add v0.4s, v0.4s, v20.4s - add v1.4s, v1.4s, v6.4s - add v2.4s, v2.4s, v7.4s - add v3.4s, v3.4s, v8.4s - add v4.4s, v4.4s, v5.4s - - eor v18.16b, v18.16b, v0.16b - eor v15.16b, v15.16b, v1.16b - eor v16.16b, v16.16b, v2.16b - eor v17.16b, v17.16b, v3.16b - eor v19.16b, v19.16b, v4.16b - - tbl v18.16b, {v18.16b}, v26.16b - tbl v15.16b, {v15.16b}, v26.16b - tbl v16.16b, {v16.16b}, v26.16b - tbl v17.16b, {v17.16b}, v26.16b - tbl v19.16b, {v19.16b}, v26.16b - - add v12.4s, v12.4s, v18.4s - add v13.4s, v13.4s, v15.4s - add v10.4s, v10.4s, v16.4s - add v11.4s, v11.4s, v17.4s - add v14.4s, v14.4s, v19.4s - 
- eor v20.16b, v20.16b, v12.16b - eor v6.16b, v6.16b, v13.16b - eor v7.16b, v7.16b, v10.16b - eor v8.16b, v8.16b, v11.16b - eor v5.16b, v5.16b, v14.16b - - ushr v9.4s, v5.4s, #25 - sli v9.4s, v5.4s, #7 - ushr v5.4s, v8.4s, #25 - sli v5.4s, v8.4s, #7 - ushr v8.4s, v7.4s, #25 - sli v8.4s, v7.4s, #7 - ushr v7.4s, v6.4s, #25 - sli v7.4s, v6.4s, #7 - ushr v6.4s, v20.4s, #25 - sli v6.4s, v20.4s, #7 - - ext v9.16b, v9.16b, v9.16b, #12 - ext v14.16b, v14.16b, v14.16b, #8 - ext v19.16b, v19.16b, v19.16b, #4 - subs x6, x6, #1 - b.hi .Lseal_init_rounds - - add v15.4s, v15.4s, v25.4s - mov x11, #4 - dup v20.4s, w11 - add v25.4s, v25.4s, v20.4s - - zip1 v20.4s, v0.4s, v1.4s - zip2 v21.4s, v0.4s, v1.4s - zip1 v22.4s, v2.4s, v3.4s - zip2 v23.4s, v2.4s, v3.4s - - zip1 v0.2d, v20.2d, v22.2d - zip2 v1.2d, v20.2d, v22.2d - zip1 v2.2d, v21.2d, v23.2d - zip2 v3.2d, v21.2d, v23.2d - - zip1 v20.4s, v5.4s, v6.4s - zip2 v21.4s, v5.4s, v6.4s - zip1 v22.4s, v7.4s, v8.4s - zip2 v23.4s, v7.4s, v8.4s - - zip1 v5.2d, v20.2d, v22.2d - zip2 v6.2d, v20.2d, v22.2d - zip1 v7.2d, v21.2d, v23.2d - zip2 v8.2d, v21.2d, v23.2d - - zip1 v20.4s, v10.4s, v11.4s - zip2 v21.4s, v10.4s, v11.4s - zip1 v22.4s, v12.4s, v13.4s - zip2 v23.4s, v12.4s, v13.4s - - zip1 v10.2d, v20.2d, v22.2d - zip2 v11.2d, v20.2d, v22.2d - zip1 v12.2d, v21.2d, v23.2d - zip2 v13.2d, v21.2d, v23.2d - - zip1 v20.4s, v15.4s, v16.4s - zip2 v21.4s, v15.4s, v16.4s - zip1 v22.4s, v17.4s, v18.4s - zip2 v23.4s, v17.4s, v18.4s - - zip1 v15.2d, v20.2d, v22.2d - zip2 v16.2d, v20.2d, v22.2d - zip1 v17.2d, v21.2d, v23.2d - zip2 v18.2d, v21.2d, v23.2d - - add v4.4s, v4.4s, v24.4s - add v9.4s, v9.4s, v28.4s - and v4.16b, v4.16b, v27.16b - - add v0.4s, v0.4s, v24.4s - add v5.4s, v5.4s, v28.4s - add v10.4s, v10.4s, v29.4s - add v15.4s, v15.4s, v30.4s - - add v1.4s, v1.4s, v24.4s - add v6.4s, v6.4s, v28.4s - add v11.4s, v11.4s, v29.4s - add v16.4s, v16.4s, v30.4s - - add v2.4s, v2.4s, v24.4s - add v7.4s, v7.4s, v28.4s - add v12.4s, v12.4s, v29.4s - add 
v17.4s, v17.4s, v30.4s - - add v3.4s, v3.4s, v24.4s - add v8.4s, v8.4s, v28.4s - add v13.4s, v13.4s, v29.4s - add v18.4s, v18.4s, v30.4s - - mov x16, v4.d[0] // Move the R key to GPRs - mov x17, v4.d[1] - mov v27.16b, v9.16b // Store the S key - - bl .Lpoly_hash_ad_internal - - mov x3, x0 - cmp x2, #256 - b.le .Lseal_tail - - ld1 {v20.16b - v23.16b}, [x1], #64 - eor v20.16b, v20.16b, v0.16b - eor v21.16b, v21.16b, v5.16b - eor v22.16b, v22.16b, v10.16b - eor v23.16b, v23.16b, v15.16b - st1 {v20.16b - v23.16b}, [x0], #64 - - ld1 {v20.16b - v23.16b}, [x1], #64 - eor v20.16b, v20.16b, v1.16b - eor v21.16b, v21.16b, v6.16b - eor v22.16b, v22.16b, v11.16b - eor v23.16b, v23.16b, v16.16b - st1 {v20.16b - v23.16b}, [x0], #64 - - ld1 {v20.16b - v23.16b}, [x1], #64 - eor v20.16b, v20.16b, v2.16b - eor v21.16b, v21.16b, v7.16b - eor v22.16b, v22.16b, v12.16b - eor v23.16b, v23.16b, v17.16b - st1 {v20.16b - v23.16b}, [x0], #64 - - ld1 {v20.16b - v23.16b}, [x1], #64 - eor v20.16b, v20.16b, v3.16b - eor v21.16b, v21.16b, v8.16b - eor v22.16b, v22.16b, v13.16b - eor v23.16b, v23.16b, v18.16b - st1 {v20.16b - v23.16b}, [x0], #64 - - sub x2, x2, #256 - - mov x6, #4 // In the first run of the loop we need to hash 256 bytes, therefore we hash one block for the first 4 rounds - mov x7, #6 // and two blocks for the remaining 6, for a total of (1 * 4 + 2 * 6) * 16 = 256 - -.Lseal_main_loop: - adrp x11, .Lchacha20_consts - add x11, x11, :lo12:.Lchacha20_consts - - ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] - mov v4.16b, v24.16b - - ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 - mov v9.16b, v28.16b - - ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 - mov v14.16b, v29.16b - - ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] - add v15.4s, v15.4s, v25.4s - mov v19.16b, v30.16b - - eor v20.16b, v20.16b, v20.16b //zero - not v21.16b, v20.16b // -1 - sub v21.4s, v25.4s, v21.4s // Add +1 - ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) - add v19.4s, v19.4s, v20.4s - - sub x5, x5, #32 
-.align 5 -.Lseal_main_loop_rounds: - add v0.4s, v0.4s, v5.4s - add v1.4s, v1.4s, v6.4s - add v2.4s, v2.4s, v7.4s - add v3.4s, v3.4s, v8.4s - add v4.4s, v4.4s, v9.4s - - eor v15.16b, v15.16b, v0.16b - eor v16.16b, v16.16b, v1.16b - eor v17.16b, v17.16b, v2.16b - eor v18.16b, v18.16b, v3.16b - eor v19.16b, v19.16b, v4.16b - - rev32 v15.8h, v15.8h - rev32 v16.8h, v16.8h - rev32 v17.8h, v17.8h - rev32 v18.8h, v18.8h - rev32 v19.8h, v19.8h - - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v16.4s - add v12.4s, v12.4s, v17.4s - add v13.4s, v13.4s, v18.4s - add v14.4s, v14.4s, v19.4s - - eor v5.16b, v5.16b, v10.16b - eor v6.16b, v6.16b, v11.16b - eor v7.16b, v7.16b, v12.16b - eor v8.16b, v8.16b, v13.16b - eor v9.16b, v9.16b, v14.16b - - ushr v20.4s, v5.4s, #20 - sli v20.4s, v5.4s, #12 - ushr v5.4s, v6.4s, #20 - sli v5.4s, v6.4s, #12 - ushr v6.4s, v7.4s, #20 - sli v6.4s, v7.4s, #12 - ushr v7.4s, v8.4s, #20 - sli v7.4s, v8.4s, #12 - ushr v8.4s, v9.4s, #20 - sli v8.4s, v9.4s, #12 - - add v0.4s, v0.4s, v20.4s - add v1.4s, v1.4s, v5.4s - add v2.4s, v2.4s, v6.4s - add v3.4s, v3.4s, v7.4s - add v4.4s, v4.4s, v8.4s - - eor v15.16b, v15.16b, v0.16b - eor v16.16b, v16.16b, v1.16b - eor v17.16b, v17.16b, v2.16b - eor v18.16b, v18.16b, v3.16b - eor v19.16b, v19.16b, v4.16b - - tbl v15.16b, {v15.16b}, v26.16b - tbl v16.16b, {v16.16b}, v26.16b - tbl v17.16b, {v17.16b}, v26.16b - tbl v18.16b, {v18.16b}, v26.16b - tbl v19.16b, {v19.16b}, v26.16b - - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v16.4s - add v12.4s, v12.4s, v17.4s - add v13.4s, v13.4s, v18.4s - add v14.4s, v14.4s, v19.4s - - eor v20.16b, v20.16b, v10.16b - eor v5.16b, v5.16b, v11.16b - eor v6.16b, v6.16b, v12.16b - eor v7.16b, v7.16b, v13.16b - eor v8.16b, v8.16b, v14.16b - - ushr v9.4s, v8.4s, #25 - sli v9.4s, v8.4s, #7 - ushr v8.4s, v7.4s, #25 - sli v8.4s, v7.4s, #7 - ushr v7.4s, v6.4s, #25 - sli v7.4s, v6.4s, #7 - ushr v6.4s, v5.4s, #25 - sli v6.4s, v5.4s, #7 - ushr v5.4s, v20.4s, #25 - sli v5.4s, v20.4s, #7 - - 
ext v9.16b, v9.16b, v9.16b, #4 - ext v14.16b, v14.16b, v14.16b, #8 - ext v19.16b, v19.16b, v19.16b, #12 - ldp x11, x12, [x3], 16 - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - add v0.4s, v0.4s, v6.4s - add v1.4s, v1.4s, v7.4s - add v2.4s, v2.4s, v8.4s - add v3.4s, v3.4s, v5.4s - add v4.4s, v4.4s, v9.4s - - eor v18.16b, v18.16b, v0.16b - eor v15.16b, v15.16b, v1.16b - eor v16.16b, v16.16b, v2.16b - eor v17.16b, v17.16b, v3.16b - eor v19.16b, v19.16b, v4.16b - - rev32 v18.8h, v18.8h - rev32 v15.8h, v15.8h - rev32 v16.8h, v16.8h - rev32 v17.8h, v17.8h - rev32 v19.8h, v19.8h - - add v12.4s, v12.4s, v18.4s - add v13.4s, v13.4s, v15.4s - add v10.4s, v10.4s, v16.4s - add v11.4s, v11.4s, v17.4s - add v14.4s, v14.4s, v19.4s - - eor v6.16b, v6.16b, v12.16b - eor v7.16b, v7.16b, v13.16b - eor v8.16b, v8.16b, v10.16b - eor v5.16b, v5.16b, v11.16b - eor v9.16b, v9.16b, v14.16b - - ushr v20.4s, v6.4s, #20 - sli v20.4s, v6.4s, #12 - ushr v6.4s, v7.4s, #20 - sli v6.4s, v7.4s, #12 - ushr v7.4s, v8.4s, #20 - sli v7.4s, v8.4s, #12 - ushr v8.4s, v5.4s, #20 - sli v8.4s, v5.4s, #12 - ushr v5.4s, v9.4s, #20 - sli v5.4s, v9.4s, #12 - - add v0.4s, v0.4s, v20.4s - add v1.4s, v1.4s, v6.4s - add v2.4s, v2.4s, v7.4s - add v3.4s, v3.4s, 
v8.4s - add v4.4s, v4.4s, v5.4s - - eor v18.16b, v18.16b, v0.16b - eor v15.16b, v15.16b, v1.16b - eor v16.16b, v16.16b, v2.16b - eor v17.16b, v17.16b, v3.16b - eor v19.16b, v19.16b, v4.16b - - tbl v18.16b, {v18.16b}, v26.16b - tbl v15.16b, {v15.16b}, v26.16b - tbl v16.16b, {v16.16b}, v26.16b - tbl v17.16b, {v17.16b}, v26.16b - tbl v19.16b, {v19.16b}, v26.16b - - add v12.4s, v12.4s, v18.4s - add v13.4s, v13.4s, v15.4s - add v10.4s, v10.4s, v16.4s - add v11.4s, v11.4s, v17.4s - add v14.4s, v14.4s, v19.4s - - eor v20.16b, v20.16b, v12.16b - eor v6.16b, v6.16b, v13.16b - eor v7.16b, v7.16b, v10.16b - eor v8.16b, v8.16b, v11.16b - eor v5.16b, v5.16b, v14.16b - - ushr v9.4s, v5.4s, #25 - sli v9.4s, v5.4s, #7 - ushr v5.4s, v8.4s, #25 - sli v5.4s, v8.4s, #7 - ushr v8.4s, v7.4s, #25 - sli v8.4s, v7.4s, #7 - ushr v7.4s, v6.4s, #25 - sli v7.4s, v6.4s, #7 - ushr v6.4s, v20.4s, #25 - sli v6.4s, v20.4s, #7 - - ext v9.16b, v9.16b, v9.16b, #12 - ext v14.16b, v14.16b, v14.16b, #8 - ext v19.16b, v19.16b, v19.16b, #4 - subs x6, x6, #1 - b.ge .Lseal_main_loop_rounds - ldp x11, x12, [x3], 16 - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - subs x7, x7, #1 - b.gt .Lseal_main_loop_rounds - - eor v20.16b, v20.16b, v20.16b //zero - not 
v21.16b, v20.16b // -1 - sub v21.4s, v25.4s, v21.4s // Add +1 - ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) - add v19.4s, v19.4s, v20.4s - - add v15.4s, v15.4s, v25.4s - mov x11, #5 - dup v20.4s, w11 - add v25.4s, v25.4s, v20.4s - - zip1 v20.4s, v0.4s, v1.4s - zip2 v21.4s, v0.4s, v1.4s - zip1 v22.4s, v2.4s, v3.4s - zip2 v23.4s, v2.4s, v3.4s - - zip1 v0.2d, v20.2d, v22.2d - zip2 v1.2d, v20.2d, v22.2d - zip1 v2.2d, v21.2d, v23.2d - zip2 v3.2d, v21.2d, v23.2d - - zip1 v20.4s, v5.4s, v6.4s - zip2 v21.4s, v5.4s, v6.4s - zip1 v22.4s, v7.4s, v8.4s - zip2 v23.4s, v7.4s, v8.4s - - zip1 v5.2d, v20.2d, v22.2d - zip2 v6.2d, v20.2d, v22.2d - zip1 v7.2d, v21.2d, v23.2d - zip2 v8.2d, v21.2d, v23.2d - - zip1 v20.4s, v10.4s, v11.4s - zip2 v21.4s, v10.4s, v11.4s - zip1 v22.4s, v12.4s, v13.4s - zip2 v23.4s, v12.4s, v13.4s - - zip1 v10.2d, v20.2d, v22.2d - zip2 v11.2d, v20.2d, v22.2d - zip1 v12.2d, v21.2d, v23.2d - zip2 v13.2d, v21.2d, v23.2d - - zip1 v20.4s, v15.4s, v16.4s - zip2 v21.4s, v15.4s, v16.4s - zip1 v22.4s, v17.4s, v18.4s - zip2 v23.4s, v17.4s, v18.4s - - zip1 v15.2d, v20.2d, v22.2d - zip2 v16.2d, v20.2d, v22.2d - zip1 v17.2d, v21.2d, v23.2d - zip2 v18.2d, v21.2d, v23.2d - - add v0.4s, v0.4s, v24.4s - add v5.4s, v5.4s, v28.4s - add v10.4s, v10.4s, v29.4s - add v15.4s, v15.4s, v30.4s - - add v1.4s, v1.4s, v24.4s - add v6.4s, v6.4s, v28.4s - add v11.4s, v11.4s, v29.4s - add v16.4s, v16.4s, v30.4s - - add v2.4s, v2.4s, v24.4s - add v7.4s, v7.4s, v28.4s - add v12.4s, v12.4s, v29.4s - add v17.4s, v17.4s, v30.4s - - add v3.4s, v3.4s, v24.4s - add v8.4s, v8.4s, v28.4s - add v13.4s, v13.4s, v29.4s - add v18.4s, v18.4s, v30.4s - - add v4.4s, v4.4s, v24.4s - add v9.4s, v9.4s, v28.4s - add v14.4s, v14.4s, v29.4s - add v19.4s, v19.4s, v30.4s - - cmp x2, #320 - b.le .Lseal_tail - - ld1 {v20.16b - v23.16b}, [x1], #64 - eor v20.16b, v20.16b, v0.16b - eor v21.16b, v21.16b, v5.16b - eor v22.16b, v22.16b, v10.16b - eor v23.16b, v23.16b, v15.16b - st1 {v20.16b - 
v23.16b}, [x0], #64 - - ld1 {v20.16b - v23.16b}, [x1], #64 - eor v20.16b, v20.16b, v1.16b - eor v21.16b, v21.16b, v6.16b - eor v22.16b, v22.16b, v11.16b - eor v23.16b, v23.16b, v16.16b - st1 {v20.16b - v23.16b}, [x0], #64 - - ld1 {v20.16b - v23.16b}, [x1], #64 - eor v20.16b, v20.16b, v2.16b - eor v21.16b, v21.16b, v7.16b - eor v22.16b, v22.16b, v12.16b - eor v23.16b, v23.16b, v17.16b - st1 {v20.16b - v23.16b}, [x0], #64 - - ld1 {v20.16b - v23.16b}, [x1], #64 - eor v20.16b, v20.16b, v3.16b - eor v21.16b, v21.16b, v8.16b - eor v22.16b, v22.16b, v13.16b - eor v23.16b, v23.16b, v18.16b - st1 {v20.16b - v23.16b}, [x0], #64 - - ld1 {v20.16b - v23.16b}, [x1], #64 - eor v20.16b, v20.16b, v4.16b - eor v21.16b, v21.16b, v9.16b - eor v22.16b, v22.16b, v14.16b - eor v23.16b, v23.16b, v19.16b - st1 {v20.16b - v23.16b}, [x0], #64 - - sub x2, x2, #320 - - mov x6, #0 - mov x7, #10 // For the remainder of the loop we always hash and encrypt 320 bytes per iteration - - b .Lseal_main_loop - -.Lseal_tail: - // This part of the function handles the storage and authentication of the last [0,320) bytes - // We assume A0-A4 ... D0-D4 hold at least inl (320 max) bytes of the stream data. 
- cmp x2, #64 - b.lt .Lseal_tail_64 - - // Store and authenticate 64B blocks per iteration - ld1 {v20.16b - v23.16b}, [x1], #64 - - eor v20.16b, v20.16b, v0.16b - eor v21.16b, v21.16b, v5.16b - eor v22.16b, v22.16b, v10.16b - eor v23.16b, v23.16b, v15.16b - mov x11, v20.d[0] - mov x12, v20.d[1] - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - mov x11, v21.d[0] - mov x12, v21.d[1] - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - mov x11, v22.d[0] - mov x12, 
v22.d[1] - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - mov x11, v23.d[0] - mov x12, v23.d[1] - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - st1 {v20.16b - v23.16b}, [x0], #64 - sub x2, x2, #64 - - // Shift the state left by 64 bytes for the next iteration of the loop - mov v0.16b, v1.16b - mov v5.16b, v6.16b - mov v10.16b, v11.16b - mov v15.16b, v16.16b - - mov v1.16b, v2.16b - mov v6.16b, v7.16b - mov v11.16b, v12.16b - mov v16.16b, v17.16b - - mov 
v2.16b, v3.16b - mov v7.16b, v8.16b - mov v12.16b, v13.16b - mov v17.16b, v18.16b - - mov v3.16b, v4.16b - mov v8.16b, v9.16b - mov v13.16b, v14.16b - mov v18.16b, v19.16b - - b .Lseal_tail - -.Lseal_tail_64: - ldp x3, x4, [x5, #48] // extra_in_len and extra_in_ptr - - // Here we handle the last [0,64) bytes of plaintext - cmp x2, #16 - b.lt .Lseal_tail_16 - // Each iteration encrypt and authenticate a 16B block - ld1 {v20.16b}, [x1], #16 - eor v20.16b, v20.16b, v0.16b - mov x11, v20.d[0] - mov x12, v20.d[1] - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - st1 {v20.16b}, [x0], #16 - - sub x2, x2, #16 - - // Shift the state left by 16 bytes for the next iteration of the loop - mov v0.16b, v5.16b - mov v5.16b, v10.16b - mov v10.16b, v15.16b - - b .Lseal_tail_64 - -.Lseal_tail_16: - // Here we handle the last [0,16) bytes of ciphertext that require a padded block - cbz x2, .Lseal_hash_extra - - eor v20.16b, v20.16b, v20.16b // Use T0 to load the plaintext/extra in - eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask that will only mask the ciphertext bytes - not v22.16b, v20.16b - - mov x6, x2 - add x1, x1, x2 - - cbz x4, .Lseal_tail_16_compose // No extra data to pad with, zero padding - - mov x7, 
#16 // We need to load some extra_in first for padding - sub x7, x7, x2 - cmp x4, x7 - csel x7, x4, x7, lt // .Load the minimum of extra_in_len and the amount needed to fill the register - mov x12, x7 - add x3, x3, x7 - sub x4, x4, x7 - -.Lseal_tail16_compose_extra_in: - ext v20.16b, v20.16b, v20.16b, #15 - ldrb w11, [x3, #-1]! - mov v20.b[0], w11 - subs x7, x7, #1 - b.gt .Lseal_tail16_compose_extra_in - - add x3, x3, x12 - -.Lseal_tail_16_compose: - ext v20.16b, v20.16b, v20.16b, #15 - ldrb w11, [x1, #-1]! - mov v20.b[0], w11 - ext v21.16b, v22.16b, v21.16b, #15 - subs x2, x2, #1 - b.gt .Lseal_tail_16_compose - - and v0.16b, v0.16b, v21.16b - eor v20.16b, v20.16b, v0.16b - mov v21.16b, v20.16b - -.Lseal_tail_16_store: - umov w11, v20.b[0] - strb w11, [x0], #1 - ext v20.16b, v20.16b, v20.16b, #1 - subs x6, x6, #1 - b.gt .Lseal_tail_16_store - - // Hash in the final ct block concatenated with extra_in - mov x11, v21.d[0] - mov x12, v21.d[1] - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - -.Lseal_hash_extra: - cbz x4, .Lseal_finalize - -.Lseal_hash_extra_loop: - cmp x4, #16 - b.lt .Lseal_hash_extra_tail - ld1 {v20.16b}, [x3], #16 - mov x11, v20.d[0] - mov x12, v20.d[1] - adds x8, x8, x11 - adcs x9, x9, x12 - adc 
x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - sub x4, x4, #16 - b .Lseal_hash_extra_loop - -.Lseal_hash_extra_tail: - cbz x4, .Lseal_finalize - eor v20.16b, v20.16b, v20.16b // Use T0 to load the remaining extra ciphertext - add x3, x3, x4 - -.Lseal_hash_extra_load: - ext v20.16b, v20.16b, v20.16b, #15 - ldrb w11, [x3, #-1]! 
- mov v20.b[0], w11 - subs x4, x4, #1 - b.gt .Lseal_hash_extra_load - - // Hash in the final padded extra_in blcok - mov x11, v20.d[0] - mov x12, v20.d[1] - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - -.Lseal_finalize: - mov x11, v31.d[0] - mov x12, v31.d[1] - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - // Final reduction step - sub x12, xzr, x15 - orr x13, xzr, #3 - subs x11, x8, #-5 - sbcs x12, x9, x12 - sbcs x13, x10, x13 - csel x8, x11, x8, cs - 
csel x9, x12, x9, cs - csel x10, x13, x10, cs - mov x11, v27.d[0] - mov x12, v27.d[1] - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - - stp x8, x9, [x5] - - ldp d8, d9, [sp, #16] - ldp d10, d11, [sp, #32] - ldp d12, d13, [sp, #48] - ldp d14, d15, [sp, #64] -.cfi_restore b15 -.cfi_restore b14 -.cfi_restore b13 -.cfi_restore b12 -.cfi_restore b11 -.cfi_restore b10 -.cfi_restore b9 -.cfi_restore b8 - ldp x29, x30, [sp], 80 -.cfi_restore w29 -.cfi_restore w30 -.cfi_def_cfa_offset 0 - AARCH64_VALIDATE_LINK_REGISTER - ret - -.Lseal_128: - // On some architectures preparing 5 blocks for small buffers is wasteful - eor v25.16b, v25.16b, v25.16b - mov x11, #1 - mov v25.s[0], w11 - mov v0.16b, v24.16b - mov v1.16b, v24.16b - mov v2.16b, v24.16b - mov v5.16b, v28.16b - mov v6.16b, v28.16b - mov v7.16b, v28.16b - mov v10.16b, v29.16b - mov v11.16b, v29.16b - mov v12.16b, v29.16b - mov v17.16b, v30.16b - add v15.4s, v17.4s, v25.4s - add v16.4s, v15.4s, v25.4s - - mov x6, #10 - -.Lseal_128_rounds: - add v0.4s, v0.4s, v5.4s - add v1.4s, v1.4s, v6.4s - add v2.4s, v2.4s, v7.4s - eor v15.16b, v15.16b, v0.16b - eor v16.16b, v16.16b, v1.16b - eor v17.16b, v17.16b, v2.16b - rev32 v15.8h, v15.8h - rev32 v16.8h, v16.8h - rev32 v17.8h, v17.8h - - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v16.4s - add v12.4s, v12.4s, v17.4s - eor v5.16b, v5.16b, v10.16b - eor v6.16b, v6.16b, v11.16b - eor v7.16b, v7.16b, v12.16b - ushr v20.4s, v5.4s, #20 - sli v20.4s, v5.4s, #12 - ushr v5.4s, v6.4s, #20 - sli v5.4s, v6.4s, #12 - ushr v6.4s, v7.4s, #20 - sli v6.4s, v7.4s, #12 - - add v0.4s, v0.4s, v20.4s - add v1.4s, v1.4s, v5.4s - add v2.4s, v2.4s, v6.4s - eor v15.16b, v15.16b, v0.16b - eor v16.16b, v16.16b, v1.16b - eor v17.16b, v17.16b, v2.16b - tbl v15.16b, {v15.16b}, v26.16b - tbl v16.16b, {v16.16b}, v26.16b - tbl v17.16b, {v17.16b}, v26.16b - - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v16.4s - add v12.4s, v12.4s, v17.4s - eor v20.16b, v20.16b, v10.16b - eor v5.16b, v5.16b, 
v11.16b - eor v6.16b, v6.16b, v12.16b - ushr v7.4s, v6.4s, #25 - sli v7.4s, v6.4s, #7 - ushr v6.4s, v5.4s, #25 - sli v6.4s, v5.4s, #7 - ushr v5.4s, v20.4s, #25 - sli v5.4s, v20.4s, #7 - - ext v5.16b, v5.16b, v5.16b, #4 - ext v6.16b, v6.16b, v6.16b, #4 - ext v7.16b, v7.16b, v7.16b, #4 - - ext v10.16b, v10.16b, v10.16b, #8 - ext v11.16b, v11.16b, v11.16b, #8 - ext v12.16b, v12.16b, v12.16b, #8 - - ext v15.16b, v15.16b, v15.16b, #12 - ext v16.16b, v16.16b, v16.16b, #12 - ext v17.16b, v17.16b, v17.16b, #12 - add v0.4s, v0.4s, v5.4s - add v1.4s, v1.4s, v6.4s - add v2.4s, v2.4s, v7.4s - eor v15.16b, v15.16b, v0.16b - eor v16.16b, v16.16b, v1.16b - eor v17.16b, v17.16b, v2.16b - rev32 v15.8h, v15.8h - rev32 v16.8h, v16.8h - rev32 v17.8h, v17.8h - - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v16.4s - add v12.4s, v12.4s, v17.4s - eor v5.16b, v5.16b, v10.16b - eor v6.16b, v6.16b, v11.16b - eor v7.16b, v7.16b, v12.16b - ushr v20.4s, v5.4s, #20 - sli v20.4s, v5.4s, #12 - ushr v5.4s, v6.4s, #20 - sli v5.4s, v6.4s, #12 - ushr v6.4s, v7.4s, #20 - sli v6.4s, v7.4s, #12 - - add v0.4s, v0.4s, v20.4s - add v1.4s, v1.4s, v5.4s - add v2.4s, v2.4s, v6.4s - eor v15.16b, v15.16b, v0.16b - eor v16.16b, v16.16b, v1.16b - eor v17.16b, v17.16b, v2.16b - tbl v15.16b, {v15.16b}, v26.16b - tbl v16.16b, {v16.16b}, v26.16b - tbl v17.16b, {v17.16b}, v26.16b - - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v16.4s - add v12.4s, v12.4s, v17.4s - eor v20.16b, v20.16b, v10.16b - eor v5.16b, v5.16b, v11.16b - eor v6.16b, v6.16b, v12.16b - ushr v7.4s, v6.4s, #25 - sli v7.4s, v6.4s, #7 - ushr v6.4s, v5.4s, #25 - sli v6.4s, v5.4s, #7 - ushr v5.4s, v20.4s, #25 - sli v5.4s, v20.4s, #7 - - ext v5.16b, v5.16b, v5.16b, #12 - ext v6.16b, v6.16b, v6.16b, #12 - ext v7.16b, v7.16b, v7.16b, #12 - - ext v10.16b, v10.16b, v10.16b, #8 - ext v11.16b, v11.16b, v11.16b, #8 - ext v12.16b, v12.16b, v12.16b, #8 - - ext v15.16b, v15.16b, v15.16b, #4 - ext v16.16b, v16.16b, v16.16b, #4 - ext v17.16b, v17.16b, v17.16b, 
#4 - subs x6, x6, #1 - b.hi .Lseal_128_rounds - - add v0.4s, v0.4s, v24.4s - add v1.4s, v1.4s, v24.4s - add v2.4s, v2.4s, v24.4s - - add v5.4s, v5.4s, v28.4s - add v6.4s, v6.4s, v28.4s - add v7.4s, v7.4s, v28.4s - - // Only the first 32 bytes of the third block (counter = 0) are needed, - // so skip updating v12 and v17. - add v10.4s, v10.4s, v29.4s - add v11.4s, v11.4s, v29.4s - - add v30.4s, v30.4s, v25.4s - add v15.4s, v15.4s, v30.4s - add v30.4s, v30.4s, v25.4s - add v16.4s, v16.4s, v30.4s - - and v2.16b, v2.16b, v27.16b - mov x16, v2.d[0] // Move the R key to GPRs - mov x17, v2.d[1] - mov v27.16b, v7.16b // Store the S key - - bl .Lpoly_hash_ad_internal - b .Lseal_tail -.cfi_endproc -.size chacha20_poly1305_seal,.-chacha20_poly1305_seal - -///////////////////////////////// -// -// void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, union open_data *aead_data); -// -.globl chacha20_poly1305_open -.hidden chacha20_poly1305_open -.type chacha20_poly1305_open,%function -.align 6 -chacha20_poly1305_open: - AARCH64_SIGN_LINK_REGISTER -.cfi_startproc - stp x29, x30, [sp, #-80]! -.cfi_def_cfa_offset 80 -.cfi_offset w30, -72 -.cfi_offset w29, -80 - mov x29, sp - // We probably could do .cfi_def_cfa w29, 80 at this point, but since - // we don't actually use the frame pointer like that, it's probably not - // worth bothering. 
- stp d8, d9, [sp, #16] - stp d10, d11, [sp, #32] - stp d12, d13, [sp, #48] - stp d14, d15, [sp, #64] -.cfi_offset b15, -8 -.cfi_offset b14, -16 -.cfi_offset b13, -24 -.cfi_offset b12, -32 -.cfi_offset b11, -40 -.cfi_offset b10, -48 -.cfi_offset b9, -56 -.cfi_offset b8, -64 - - adrp x11, .Lchacha20_consts - add x11, x11, :lo12:.Lchacha20_consts - - ld1 {v24.16b - v27.16b}, [x11] // .Load the CONSTS, INC, ROL8 and CLAMP values - ld1 {v28.16b - v30.16b}, [x5] - - mov x15, #1 // Prepare the Poly1305 state - mov x8, #0 - mov x9, #0 - mov x10, #0 - - mov v31.d[0], x4 // Store the input and aad lengths - mov v31.d[1], x2 - - cmp x2, #128 - b.le .Lopen_128 // Optimization for smaller buffers - - // Initially we prepare a single ChaCha20 block for the Poly1305 R and S keys - mov v0.16b, v24.16b - mov v5.16b, v28.16b - mov v10.16b, v29.16b - mov v15.16b, v30.16b - - mov x6, #10 - -.align 5 -.Lopen_init_rounds: - add v0.4s, v0.4s, v5.4s - eor v15.16b, v15.16b, v0.16b - rev32 v15.8h, v15.8h - - add v10.4s, v10.4s, v15.4s - eor v5.16b, v5.16b, v10.16b - ushr v20.4s, v5.4s, #20 - sli v20.4s, v5.4s, #12 - add v0.4s, v0.4s, v20.4s - eor v15.16b, v15.16b, v0.16b - tbl v15.16b, {v15.16b}, v26.16b - - add v10.4s, v10.4s, v15.4s - eor v20.16b, v20.16b, v10.16b - ushr v5.4s, v20.4s, #25 - sli v5.4s, v20.4s, #7 - ext v5.16b, v5.16b, v5.16b, #4 - ext v10.16b, v10.16b, v10.16b, #8 - ext v15.16b, v15.16b, v15.16b, #12 - add v0.4s, v0.4s, v5.4s - eor v15.16b, v15.16b, v0.16b - rev32 v15.8h, v15.8h - - add v10.4s, v10.4s, v15.4s - eor v5.16b, v5.16b, v10.16b - ushr v20.4s, v5.4s, #20 - sli v20.4s, v5.4s, #12 - add v0.4s, v0.4s, v20.4s - eor v15.16b, v15.16b, v0.16b - tbl v15.16b, {v15.16b}, v26.16b - - add v10.4s, v10.4s, v15.4s - eor v20.16b, v20.16b, v10.16b - ushr v5.4s, v20.4s, #25 - sli v5.4s, v20.4s, #7 - ext v5.16b, v5.16b, v5.16b, #12 - ext v10.16b, v10.16b, v10.16b, #8 - ext v15.16b, v15.16b, v15.16b, #4 - subs x6, x6, #1 - b.hi .Lopen_init_rounds - - add v0.4s, v0.4s, v24.4s - add 
v5.4s, v5.4s, v28.4s - - and v0.16b, v0.16b, v27.16b - mov x16, v0.d[0] // Move the R key to GPRs - mov x17, v0.d[1] - mov v27.16b, v5.16b // Store the S key - - bl .Lpoly_hash_ad_internal - -.Lopen_ad_done: - mov x3, x1 - -// Each iteration of the loop hash 320 bytes, and prepare stream for 320 bytes -.Lopen_main_loop: - - cmp x2, #192 - b.lt .Lopen_tail - - adrp x11, .Lchacha20_consts - add x11, x11, :lo12:.Lchacha20_consts - - ld4r {v0.4s,v1.4s,v2.4s,v3.4s}, [x11] - mov v4.16b, v24.16b - - ld4r {v5.4s,v6.4s,v7.4s,v8.4s}, [x5], #16 - mov v9.16b, v28.16b - - ld4r {v10.4s,v11.4s,v12.4s,v13.4s}, [x5], #16 - mov v14.16b, v29.16b - - ld4r {v15.4s,v16.4s,v17.4s,v18.4s}, [x5] - sub x5, x5, #32 - add v15.4s, v15.4s, v25.4s - mov v19.16b, v30.16b - - eor v20.16b, v20.16b, v20.16b //zero - not v21.16b, v20.16b // -1 - sub v21.4s, v25.4s, v21.4s // Add +1 - ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) - add v19.4s, v19.4s, v20.4s - - lsr x4, x2, #4 // How many whole blocks we have to hash, will always be at least 12 - sub x4, x4, #10 - - mov x7, #10 - subs x6, x7, x4 - subs x6, x7, x4 // itr1 can be negative if we have more than 320 bytes to hash - csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are full - - cbz x7, .Lopen_main_loop_rounds_short - -.align 5 -.Lopen_main_loop_rounds: - ldp x11, x12, [x3], 16 - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, 
x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most -.Lopen_main_loop_rounds_short: - add v0.4s, v0.4s, v5.4s - add v1.4s, v1.4s, v6.4s - add v2.4s, v2.4s, v7.4s - add v3.4s, v3.4s, v8.4s - add v4.4s, v4.4s, v9.4s - - eor v15.16b, v15.16b, v0.16b - eor v16.16b, v16.16b, v1.16b - eor v17.16b, v17.16b, v2.16b - eor v18.16b, v18.16b, v3.16b - eor v19.16b, v19.16b, v4.16b - - rev32 v15.8h, v15.8h - rev32 v16.8h, v16.8h - rev32 v17.8h, v17.8h - rev32 v18.8h, v18.8h - rev32 v19.8h, v19.8h - - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v16.4s - add v12.4s, v12.4s, v17.4s - add v13.4s, v13.4s, v18.4s - add v14.4s, v14.4s, v19.4s - - eor v5.16b, v5.16b, v10.16b - eor v6.16b, v6.16b, v11.16b - eor v7.16b, v7.16b, v12.16b - eor v8.16b, v8.16b, v13.16b - eor v9.16b, v9.16b, v14.16b - - ushr v20.4s, v5.4s, #20 - sli v20.4s, v5.4s, #12 - ushr v5.4s, v6.4s, #20 - sli v5.4s, v6.4s, #12 - ushr v6.4s, v7.4s, #20 - sli v6.4s, v7.4s, #12 - ushr v7.4s, v8.4s, #20 - sli v7.4s, v8.4s, #12 - ushr v8.4s, v9.4s, #20 - sli v8.4s, v9.4s, #12 - - add v0.4s, v0.4s, v20.4s - add v1.4s, v1.4s, v5.4s - add v2.4s, v2.4s, v6.4s - add v3.4s, v3.4s, v7.4s - add v4.4s, v4.4s, v8.4s - - eor v15.16b, v15.16b, v0.16b - eor v16.16b, v16.16b, v1.16b - eor v17.16b, v17.16b, v2.16b - eor v18.16b, v18.16b, v3.16b - eor v19.16b, v19.16b, v4.16b - - tbl v15.16b, {v15.16b}, v26.16b - tbl v16.16b, {v16.16b}, v26.16b - tbl v17.16b, {v17.16b}, v26.16b - tbl v18.16b, {v18.16b}, v26.16b - tbl v19.16b, {v19.16b}, v26.16b - - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v16.4s - add v12.4s, v12.4s, v17.4s - add v13.4s, v13.4s, v18.4s - add v14.4s, v14.4s, v19.4s - - eor v20.16b, v20.16b, v10.16b - eor v5.16b, v5.16b, v11.16b - eor v6.16b, v6.16b, v12.16b - eor v7.16b, v7.16b, v13.16b - eor v8.16b, v8.16b, v14.16b - - ushr v9.4s, v8.4s, #25 - sli v9.4s, 
v8.4s, #7 - ushr v8.4s, v7.4s, #25 - sli v8.4s, v7.4s, #7 - ushr v7.4s, v6.4s, #25 - sli v7.4s, v6.4s, #7 - ushr v6.4s, v5.4s, #25 - sli v6.4s, v5.4s, #7 - ushr v5.4s, v20.4s, #25 - sli v5.4s, v20.4s, #7 - - ext v9.16b, v9.16b, v9.16b, #4 - ext v14.16b, v14.16b, v14.16b, #8 - ext v19.16b, v19.16b, v19.16b, #12 - ldp x11, x12, [x3], 16 - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - add v0.4s, v0.4s, v6.4s - add v1.4s, v1.4s, v7.4s - add v2.4s, v2.4s, v8.4s - add v3.4s, v3.4s, v5.4s - add v4.4s, v4.4s, v9.4s - - eor v18.16b, v18.16b, v0.16b - eor v15.16b, v15.16b, v1.16b - eor v16.16b, v16.16b, v2.16b - eor v17.16b, v17.16b, v3.16b - eor v19.16b, v19.16b, v4.16b - - rev32 v18.8h, v18.8h - rev32 v15.8h, v15.8h - rev32 v16.8h, v16.8h - rev32 v17.8h, v17.8h - rev32 v19.8h, v19.8h - - add v12.4s, v12.4s, v18.4s - add v13.4s, v13.4s, v15.4s - add v10.4s, v10.4s, v16.4s - add v11.4s, v11.4s, v17.4s - add v14.4s, v14.4s, v19.4s - - eor v6.16b, v6.16b, v12.16b - eor v7.16b, v7.16b, v13.16b - eor v8.16b, v8.16b, v10.16b - eor v5.16b, v5.16b, v11.16b - eor v9.16b, v9.16b, v14.16b - - ushr v20.4s, v6.4s, #20 - sli v20.4s, v6.4s, #12 - ushr v6.4s, v7.4s, #20 - sli v6.4s, v7.4s, #12 - ushr v7.4s, v8.4s, #20 - sli v7.4s, 
v8.4s, #12 - ushr v8.4s, v5.4s, #20 - sli v8.4s, v5.4s, #12 - ushr v5.4s, v9.4s, #20 - sli v5.4s, v9.4s, #12 - - add v0.4s, v0.4s, v20.4s - add v1.4s, v1.4s, v6.4s - add v2.4s, v2.4s, v7.4s - add v3.4s, v3.4s, v8.4s - add v4.4s, v4.4s, v5.4s - - eor v18.16b, v18.16b, v0.16b - eor v15.16b, v15.16b, v1.16b - eor v16.16b, v16.16b, v2.16b - eor v17.16b, v17.16b, v3.16b - eor v19.16b, v19.16b, v4.16b - - tbl v18.16b, {v18.16b}, v26.16b - tbl v15.16b, {v15.16b}, v26.16b - tbl v16.16b, {v16.16b}, v26.16b - tbl v17.16b, {v17.16b}, v26.16b - tbl v19.16b, {v19.16b}, v26.16b - - add v12.4s, v12.4s, v18.4s - add v13.4s, v13.4s, v15.4s - add v10.4s, v10.4s, v16.4s - add v11.4s, v11.4s, v17.4s - add v14.4s, v14.4s, v19.4s - - eor v20.16b, v20.16b, v12.16b - eor v6.16b, v6.16b, v13.16b - eor v7.16b, v7.16b, v10.16b - eor v8.16b, v8.16b, v11.16b - eor v5.16b, v5.16b, v14.16b - - ushr v9.4s, v5.4s, #25 - sli v9.4s, v5.4s, #7 - ushr v5.4s, v8.4s, #25 - sli v5.4s, v8.4s, #7 - ushr v8.4s, v7.4s, #25 - sli v8.4s, v7.4s, #7 - ushr v7.4s, v6.4s, #25 - sli v7.4s, v6.4s, #7 - ushr v6.4s, v20.4s, #25 - sli v6.4s, v20.4s, #7 - - ext v9.16b, v9.16b, v9.16b, #12 - ext v14.16b, v14.16b, v14.16b, #8 - ext v19.16b, v19.16b, v19.16b, #4 - subs x7, x7, #1 - b.gt .Lopen_main_loop_rounds - subs x6, x6, #1 - b.ge .Lopen_main_loop_rounds_short - - eor v20.16b, v20.16b, v20.16b //zero - not v21.16b, v20.16b // -1 - sub v21.4s, v25.4s, v21.4s // Add +1 - ext v20.16b, v21.16b, v20.16b, #12 // Get the last element (counter) - add v19.4s, v19.4s, v20.4s - - add v15.4s, v15.4s, v25.4s - mov x11, #5 - dup v20.4s, w11 - add v25.4s, v25.4s, v20.4s - - zip1 v20.4s, v0.4s, v1.4s - zip2 v21.4s, v0.4s, v1.4s - zip1 v22.4s, v2.4s, v3.4s - zip2 v23.4s, v2.4s, v3.4s - - zip1 v0.2d, v20.2d, v22.2d - zip2 v1.2d, v20.2d, v22.2d - zip1 v2.2d, v21.2d, v23.2d - zip2 v3.2d, v21.2d, v23.2d - - zip1 v20.4s, v5.4s, v6.4s - zip2 v21.4s, v5.4s, v6.4s - zip1 v22.4s, v7.4s, v8.4s - zip2 v23.4s, v7.4s, v8.4s - - zip1 v5.2d, v20.2d, 
v22.2d - zip2 v6.2d, v20.2d, v22.2d - zip1 v7.2d, v21.2d, v23.2d - zip2 v8.2d, v21.2d, v23.2d - - zip1 v20.4s, v10.4s, v11.4s - zip2 v21.4s, v10.4s, v11.4s - zip1 v22.4s, v12.4s, v13.4s - zip2 v23.4s, v12.4s, v13.4s - - zip1 v10.2d, v20.2d, v22.2d - zip2 v11.2d, v20.2d, v22.2d - zip1 v12.2d, v21.2d, v23.2d - zip2 v13.2d, v21.2d, v23.2d - - zip1 v20.4s, v15.4s, v16.4s - zip2 v21.4s, v15.4s, v16.4s - zip1 v22.4s, v17.4s, v18.4s - zip2 v23.4s, v17.4s, v18.4s - - zip1 v15.2d, v20.2d, v22.2d - zip2 v16.2d, v20.2d, v22.2d - zip1 v17.2d, v21.2d, v23.2d - zip2 v18.2d, v21.2d, v23.2d - - add v0.4s, v0.4s, v24.4s - add v5.4s, v5.4s, v28.4s - add v10.4s, v10.4s, v29.4s - add v15.4s, v15.4s, v30.4s - - add v1.4s, v1.4s, v24.4s - add v6.4s, v6.4s, v28.4s - add v11.4s, v11.4s, v29.4s - add v16.4s, v16.4s, v30.4s - - add v2.4s, v2.4s, v24.4s - add v7.4s, v7.4s, v28.4s - add v12.4s, v12.4s, v29.4s - add v17.4s, v17.4s, v30.4s - - add v3.4s, v3.4s, v24.4s - add v8.4s, v8.4s, v28.4s - add v13.4s, v13.4s, v29.4s - add v18.4s, v18.4s, v30.4s - - add v4.4s, v4.4s, v24.4s - add v9.4s, v9.4s, v28.4s - add v14.4s, v14.4s, v29.4s - add v19.4s, v19.4s, v30.4s - - // We can always safely store 192 bytes - ld1 {v20.16b - v23.16b}, [x1], #64 - eor v20.16b, v20.16b, v0.16b - eor v21.16b, v21.16b, v5.16b - eor v22.16b, v22.16b, v10.16b - eor v23.16b, v23.16b, v15.16b - st1 {v20.16b - v23.16b}, [x0], #64 - - ld1 {v20.16b - v23.16b}, [x1], #64 - eor v20.16b, v20.16b, v1.16b - eor v21.16b, v21.16b, v6.16b - eor v22.16b, v22.16b, v11.16b - eor v23.16b, v23.16b, v16.16b - st1 {v20.16b - v23.16b}, [x0], #64 - - ld1 {v20.16b - v23.16b}, [x1], #64 - eor v20.16b, v20.16b, v2.16b - eor v21.16b, v21.16b, v7.16b - eor v22.16b, v22.16b, v12.16b - eor v23.16b, v23.16b, v17.16b - st1 {v20.16b - v23.16b}, [x0], #64 - - sub x2, x2, #192 - - mov v0.16b, v3.16b - mov v5.16b, v8.16b - mov v10.16b, v13.16b - mov v15.16b, v18.16b - - cmp x2, #64 - b.lt .Lopen_tail_64_store - - ld1 {v20.16b - v23.16b}, [x1], #64 - eor 
v20.16b, v20.16b, v3.16b - eor v21.16b, v21.16b, v8.16b - eor v22.16b, v22.16b, v13.16b - eor v23.16b, v23.16b, v18.16b - st1 {v20.16b - v23.16b}, [x0], #64 - - sub x2, x2, #64 - - mov v0.16b, v4.16b - mov v5.16b, v9.16b - mov v10.16b, v14.16b - mov v15.16b, v19.16b - - cmp x2, #64 - b.lt .Lopen_tail_64_store - - ld1 {v20.16b - v23.16b}, [x1], #64 - eor v20.16b, v20.16b, v4.16b - eor v21.16b, v21.16b, v9.16b - eor v22.16b, v22.16b, v14.16b - eor v23.16b, v23.16b, v19.16b - st1 {v20.16b - v23.16b}, [x0], #64 - - sub x2, x2, #64 - b .Lopen_main_loop - -.Lopen_tail: - - cbz x2, .Lopen_finalize - - lsr x4, x2, #4 // How many whole blocks we have to hash - - cmp x2, #64 - b.le .Lopen_tail_64 - cmp x2, #128 - b.le .Lopen_tail_128 - -.Lopen_tail_192: - // We need three more blocks - mov v0.16b, v24.16b - mov v1.16b, v24.16b - mov v2.16b, v24.16b - mov v5.16b, v28.16b - mov v6.16b, v28.16b - mov v7.16b, v28.16b - mov v10.16b, v29.16b - mov v11.16b, v29.16b - mov v12.16b, v29.16b - mov v15.16b, v30.16b - mov v16.16b, v30.16b - mov v17.16b, v30.16b - eor v23.16b, v23.16b, v23.16b - eor v21.16b, v21.16b, v21.16b - ins v23.s[0], v25.s[0] - ins v21.d[0], x15 - - add v22.4s, v23.4s, v21.4s - add v21.4s, v22.4s, v21.4s - - add v15.4s, v15.4s, v21.4s - add v16.4s, v16.4s, v23.4s - add v17.4s, v17.4s, v22.4s - - mov x7, #10 - subs x6, x7, x4 // itr1 can be negative if we have more than 160 bytes to hash - csel x7, x7, x4, le // if itr1 is zero or less, itr2 should be 10 to indicate all 10 rounds are hashing - sub x4, x4, x7 - - cbz x7, .Lopen_tail_192_rounds_no_hash - -.Lopen_tail_192_rounds: - ldp x11, x12, [x3], 16 - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh 
x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most -.Lopen_tail_192_rounds_no_hash: - add v0.4s, v0.4s, v5.4s - add v1.4s, v1.4s, v6.4s - add v2.4s, v2.4s, v7.4s - eor v15.16b, v15.16b, v0.16b - eor v16.16b, v16.16b, v1.16b - eor v17.16b, v17.16b, v2.16b - rev32 v15.8h, v15.8h - rev32 v16.8h, v16.8h - rev32 v17.8h, v17.8h - - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v16.4s - add v12.4s, v12.4s, v17.4s - eor v5.16b, v5.16b, v10.16b - eor v6.16b, v6.16b, v11.16b - eor v7.16b, v7.16b, v12.16b - ushr v20.4s, v5.4s, #20 - sli v20.4s, v5.4s, #12 - ushr v5.4s, v6.4s, #20 - sli v5.4s, v6.4s, #12 - ushr v6.4s, v7.4s, #20 - sli v6.4s, v7.4s, #12 - - add v0.4s, v0.4s, v20.4s - add v1.4s, v1.4s, v5.4s - add v2.4s, v2.4s, v6.4s - eor v15.16b, v15.16b, v0.16b - eor v16.16b, v16.16b, v1.16b - eor v17.16b, v17.16b, v2.16b - tbl v15.16b, {v15.16b}, v26.16b - tbl v16.16b, {v16.16b}, v26.16b - tbl v17.16b, {v17.16b}, v26.16b - - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v16.4s - add v12.4s, v12.4s, v17.4s - eor v20.16b, v20.16b, v10.16b - eor v5.16b, v5.16b, v11.16b - eor v6.16b, v6.16b, v12.16b - ushr v7.4s, v6.4s, #25 - sli v7.4s, v6.4s, #7 - ushr v6.4s, v5.4s, #25 - sli v6.4s, v5.4s, #7 - ushr v5.4s, v20.4s, #25 - sli v5.4s, v20.4s, #7 - - ext v5.16b, v5.16b, v5.16b, #4 - ext v6.16b, v6.16b, v6.16b, #4 - ext v7.16b, v7.16b, v7.16b, #4 - - ext v10.16b, v10.16b, v10.16b, #8 - ext v11.16b, v11.16b, v11.16b, #8 - ext v12.16b, v12.16b, v12.16b, #8 - - ext v15.16b, v15.16b, v15.16b, #12 - ext v16.16b, v16.16b, v16.16b, #12 - ext v17.16b, v17.16b, v17.16b, #12 - add v0.4s, v0.4s, v5.4s 
- add v1.4s, v1.4s, v6.4s - add v2.4s, v2.4s, v7.4s - eor v15.16b, v15.16b, v0.16b - eor v16.16b, v16.16b, v1.16b - eor v17.16b, v17.16b, v2.16b - rev32 v15.8h, v15.8h - rev32 v16.8h, v16.8h - rev32 v17.8h, v17.8h - - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v16.4s - add v12.4s, v12.4s, v17.4s - eor v5.16b, v5.16b, v10.16b - eor v6.16b, v6.16b, v11.16b - eor v7.16b, v7.16b, v12.16b - ushr v20.4s, v5.4s, #20 - sli v20.4s, v5.4s, #12 - ushr v5.4s, v6.4s, #20 - sli v5.4s, v6.4s, #12 - ushr v6.4s, v7.4s, #20 - sli v6.4s, v7.4s, #12 - - add v0.4s, v0.4s, v20.4s - add v1.4s, v1.4s, v5.4s - add v2.4s, v2.4s, v6.4s - eor v15.16b, v15.16b, v0.16b - eor v16.16b, v16.16b, v1.16b - eor v17.16b, v17.16b, v2.16b - tbl v15.16b, {v15.16b}, v26.16b - tbl v16.16b, {v16.16b}, v26.16b - tbl v17.16b, {v17.16b}, v26.16b - - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v16.4s - add v12.4s, v12.4s, v17.4s - eor v20.16b, v20.16b, v10.16b - eor v5.16b, v5.16b, v11.16b - eor v6.16b, v6.16b, v12.16b - ushr v7.4s, v6.4s, #25 - sli v7.4s, v6.4s, #7 - ushr v6.4s, v5.4s, #25 - sli v6.4s, v5.4s, #7 - ushr v5.4s, v20.4s, #25 - sli v5.4s, v20.4s, #7 - - ext v5.16b, v5.16b, v5.16b, #12 - ext v6.16b, v6.16b, v6.16b, #12 - ext v7.16b, v7.16b, v7.16b, #12 - - ext v10.16b, v10.16b, v10.16b, #8 - ext v11.16b, v11.16b, v11.16b, #8 - ext v12.16b, v12.16b, v12.16b, #8 - - ext v15.16b, v15.16b, v15.16b, #4 - ext v16.16b, v16.16b, v16.16b, #4 - ext v17.16b, v17.16b, v17.16b, #4 - subs x7, x7, #1 - b.gt .Lopen_tail_192_rounds - subs x6, x6, #1 - b.ge .Lopen_tail_192_rounds_no_hash - - // We hashed 160 bytes at most, may still have 32 bytes left -.Lopen_tail_192_hash: - cbz x4, .Lopen_tail_192_hash_done - ldp x11, x12, [x3], 16 - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = 
[acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - sub x4, x4, #1 - b .Lopen_tail_192_hash - -.Lopen_tail_192_hash_done: - - add v0.4s, v0.4s, v24.4s - add v1.4s, v1.4s, v24.4s - add v2.4s, v2.4s, v24.4s - add v5.4s, v5.4s, v28.4s - add v6.4s, v6.4s, v28.4s - add v7.4s, v7.4s, v28.4s - add v10.4s, v10.4s, v29.4s - add v11.4s, v11.4s, v29.4s - add v12.4s, v12.4s, v29.4s - add v15.4s, v15.4s, v30.4s - add v16.4s, v16.4s, v30.4s - add v17.4s, v17.4s, v30.4s - - add v15.4s, v15.4s, v21.4s - add v16.4s, v16.4s, v23.4s - add v17.4s, v17.4s, v22.4s - - ld1 {v20.16b - v23.16b}, [x1], #64 - - eor v20.16b, v20.16b, v1.16b - eor v21.16b, v21.16b, v6.16b - eor v22.16b, v22.16b, v11.16b - eor v23.16b, v23.16b, v16.16b - - st1 {v20.16b - v23.16b}, [x0], #64 - - ld1 {v20.16b - v23.16b}, [x1], #64 - - eor v20.16b, v20.16b, v2.16b - eor v21.16b, v21.16b, v7.16b - eor v22.16b, v22.16b, v12.16b - eor v23.16b, v23.16b, v17.16b - - st1 {v20.16b - v23.16b}, [x0], #64 - - sub x2, x2, #128 - b .Lopen_tail_64_store - -.Lopen_tail_128: - // We need two more blocks - mov v0.16b, v24.16b - mov v1.16b, v24.16b - mov v5.16b, v28.16b - mov v6.16b, v28.16b - mov v10.16b, v29.16b - mov v11.16b, v29.16b - mov v15.16b, v30.16b - mov v16.16b, v30.16b - eor v23.16b, v23.16b, v23.16b - eor v22.16b, v22.16b, v22.16b - ins v23.s[0], v25.s[0] - ins v22.d[0], x15 - add v22.4s, v22.4s, v23.4s - - add v15.4s, v15.4s, v22.4s - add v16.4s, v16.4s, v23.4s - - mov x6, #10 - sub x6, x6, x4 - -.Lopen_tail_128_rounds: - add v0.4s, 
v0.4s, v5.4s - eor v15.16b, v15.16b, v0.16b - rev32 v15.8h, v15.8h - - add v10.4s, v10.4s, v15.4s - eor v5.16b, v5.16b, v10.16b - ushr v20.4s, v5.4s, #20 - sli v20.4s, v5.4s, #12 - add v0.4s, v0.4s, v20.4s - eor v15.16b, v15.16b, v0.16b - tbl v15.16b, {v15.16b}, v26.16b - - add v10.4s, v10.4s, v15.4s - eor v20.16b, v20.16b, v10.16b - ushr v5.4s, v20.4s, #25 - sli v5.4s, v20.4s, #7 - ext v5.16b, v5.16b, v5.16b, #4 - ext v10.16b, v10.16b, v10.16b, #8 - ext v15.16b, v15.16b, v15.16b, #12 - add v1.4s, v1.4s, v6.4s - eor v16.16b, v16.16b, v1.16b - rev32 v16.8h, v16.8h - - add v11.4s, v11.4s, v16.4s - eor v6.16b, v6.16b, v11.16b - ushr v20.4s, v6.4s, #20 - sli v20.4s, v6.4s, #12 - add v1.4s, v1.4s, v20.4s - eor v16.16b, v16.16b, v1.16b - tbl v16.16b, {v16.16b}, v26.16b - - add v11.4s, v11.4s, v16.4s - eor v20.16b, v20.16b, v11.16b - ushr v6.4s, v20.4s, #25 - sli v6.4s, v20.4s, #7 - ext v6.16b, v6.16b, v6.16b, #4 - ext v11.16b, v11.16b, v11.16b, #8 - ext v16.16b, v16.16b, v16.16b, #12 - add v0.4s, v0.4s, v5.4s - eor v15.16b, v15.16b, v0.16b - rev32 v15.8h, v15.8h - - add v10.4s, v10.4s, v15.4s - eor v5.16b, v5.16b, v10.16b - ushr v20.4s, v5.4s, #20 - sli v20.4s, v5.4s, #12 - add v0.4s, v0.4s, v20.4s - eor v15.16b, v15.16b, v0.16b - tbl v15.16b, {v15.16b}, v26.16b - - add v10.4s, v10.4s, v15.4s - eor v20.16b, v20.16b, v10.16b - ushr v5.4s, v20.4s, #25 - sli v5.4s, v20.4s, #7 - ext v5.16b, v5.16b, v5.16b, #12 - ext v10.16b, v10.16b, v10.16b, #8 - ext v15.16b, v15.16b, v15.16b, #4 - add v1.4s, v1.4s, v6.4s - eor v16.16b, v16.16b, v1.16b - rev32 v16.8h, v16.8h - - add v11.4s, v11.4s, v16.4s - eor v6.16b, v6.16b, v11.16b - ushr v20.4s, v6.4s, #20 - sli v20.4s, v6.4s, #12 - add v1.4s, v1.4s, v20.4s - eor v16.16b, v16.16b, v1.16b - tbl v16.16b, {v16.16b}, v26.16b - - add v11.4s, v11.4s, v16.4s - eor v20.16b, v20.16b, v11.16b - ushr v6.4s, v20.4s, #25 - sli v6.4s, v20.4s, #7 - ext v6.16b, v6.16b, v6.16b, #12 - ext v11.16b, v11.16b, v11.16b, #8 - ext v16.16b, v16.16b, v16.16b, #4 
- subs x6, x6, #1 - b.gt .Lopen_tail_128_rounds - cbz x4, .Lopen_tail_128_rounds_done - subs x4, x4, #1 - ldp x11, x12, [x3], 16 - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - b .Lopen_tail_128_rounds - -.Lopen_tail_128_rounds_done: - add v0.4s, v0.4s, v24.4s - add v1.4s, v1.4s, v24.4s - add v5.4s, v5.4s, v28.4s - add v6.4s, v6.4s, v28.4s - add v10.4s, v10.4s, v29.4s - add v11.4s, v11.4s, v29.4s - add v15.4s, v15.4s, v30.4s - add v16.4s, v16.4s, v30.4s - add v15.4s, v15.4s, v22.4s - add v16.4s, v16.4s, v23.4s - - ld1 {v20.16b - v23.16b}, [x1], #64 - - eor v20.16b, v20.16b, v1.16b - eor v21.16b, v21.16b, v6.16b - eor v22.16b, v22.16b, v11.16b - eor v23.16b, v23.16b, v16.16b - - st1 {v20.16b - v23.16b}, [x0], #64 - sub x2, x2, #64 - - b .Lopen_tail_64_store - -.Lopen_tail_64: - // We just need a single block - mov v0.16b, v24.16b - mov v5.16b, v28.16b - mov v10.16b, v29.16b - mov v15.16b, v30.16b - eor v23.16b, v23.16b, v23.16b - ins v23.s[0], v25.s[0] - add v15.4s, v15.4s, v23.4s - - mov x6, #10 - sub x6, x6, x4 - -.Lopen_tail_64_rounds: - add v0.4s, v0.4s, v5.4s - eor v15.16b, v15.16b, v0.16b - rev32 v15.8h, v15.8h - - add v10.4s, v10.4s, v15.4s - eor v5.16b, v5.16b, v10.16b - ushr v20.4s, v5.4s, #20 - sli 
v20.4s, v5.4s, #12 - add v0.4s, v0.4s, v20.4s - eor v15.16b, v15.16b, v0.16b - tbl v15.16b, {v15.16b}, v26.16b - - add v10.4s, v10.4s, v15.4s - eor v20.16b, v20.16b, v10.16b - ushr v5.4s, v20.4s, #25 - sli v5.4s, v20.4s, #7 - ext v5.16b, v5.16b, v5.16b, #4 - ext v10.16b, v10.16b, v10.16b, #8 - ext v15.16b, v15.16b, v15.16b, #12 - add v0.4s, v0.4s, v5.4s - eor v15.16b, v15.16b, v0.16b - rev32 v15.8h, v15.8h - - add v10.4s, v10.4s, v15.4s - eor v5.16b, v5.16b, v10.16b - ushr v20.4s, v5.4s, #20 - sli v20.4s, v5.4s, #12 - add v0.4s, v0.4s, v20.4s - eor v15.16b, v15.16b, v0.16b - tbl v15.16b, {v15.16b}, v26.16b - - add v10.4s, v10.4s, v15.4s - eor v20.16b, v20.16b, v10.16b - ushr v5.4s, v20.4s, #25 - sli v5.4s, v20.4s, #7 - ext v5.16b, v5.16b, v5.16b, #12 - ext v10.16b, v10.16b, v10.16b, #8 - ext v15.16b, v15.16b, v15.16b, #4 - subs x6, x6, #1 - b.gt .Lopen_tail_64_rounds - cbz x4, .Lopen_tail_64_rounds_done - subs x4, x4, #1 - ldp x11, x12, [x3], 16 - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - b .Lopen_tail_64_rounds - -.Lopen_tail_64_rounds_done: - add v0.4s, v0.4s, v24.4s - add v5.4s, v5.4s, v28.4s - add v10.4s, v10.4s, v29.4s - add v15.4s, v15.4s, v30.4s - add v15.4s, v15.4s, v23.4s - -.Lopen_tail_64_store: 
- cmp x2, #16 - b.lt .Lopen_tail_16 - - ld1 {v20.16b}, [x1], #16 - eor v20.16b, v20.16b, v0.16b - st1 {v20.16b}, [x0], #16 - mov v0.16b, v5.16b - mov v5.16b, v10.16b - mov v10.16b, v15.16b - sub x2, x2, #16 - b .Lopen_tail_64_store - -.Lopen_tail_16: - // Here we handle the last [0,16) bytes that require a padded block - cbz x2, .Lopen_finalize - - eor v20.16b, v20.16b, v20.16b // Use T0 to load the ciphertext - eor v21.16b, v21.16b, v21.16b // Use T1 to generate an AND mask - not v22.16b, v20.16b - - add x7, x1, x2 - mov x6, x2 - -.Lopen_tail_16_compose: - ext v20.16b, v20.16b, v20.16b, #15 - ldrb w11, [x7, #-1]! - mov v20.b[0], w11 - ext v21.16b, v22.16b, v21.16b, #15 - subs x2, x2, #1 - b.gt .Lopen_tail_16_compose - - and v20.16b, v20.16b, v21.16b - // Hash in the final padded block - mov x11, v20.d[0] - mov x12, v20.d[1] - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - eor v20.16b, v20.16b, v0.16b - -.Lopen_tail_16_store: - umov w11, v20.b[0] - strb w11, [x0], #1 - ext v20.16b, v20.16b, v20.16b, #1 - subs x6, x6, #1 - b.gt .Lopen_tail_16_store - -.Lopen_finalize: - mov x11, v31.d[0] - mov x12, v31.d[1] - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = 
[acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - // Final reduction step - sub x12, xzr, x15 - orr x13, xzr, #3 - subs x11, x8, #-5 - sbcs x12, x9, x12 - sbcs x13, x10, x13 - csel x8, x11, x8, cs - csel x9, x12, x9, cs - csel x10, x13, x10, cs - mov x11, v27.d[0] - mov x12, v27.d[1] - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - - stp x8, x9, [x5] - - ldp d8, d9, [sp, #16] - ldp d10, d11, [sp, #32] - ldp d12, d13, [sp, #48] - ldp d14, d15, [sp, #64] -.cfi_restore b15 -.cfi_restore b14 -.cfi_restore b13 -.cfi_restore b12 -.cfi_restore b11 -.cfi_restore b10 -.cfi_restore b9 -.cfi_restore b8 - ldp x29, x30, [sp], 80 -.cfi_restore w29 -.cfi_restore w30 -.cfi_def_cfa_offset 0 - AARCH64_VALIDATE_LINK_REGISTER - ret - -.Lopen_128: - // On some architectures preparing 5 blocks for small buffers is wasteful - eor v25.16b, v25.16b, v25.16b - mov x11, #1 - mov v25.s[0], w11 - mov v0.16b, v24.16b - mov v1.16b, v24.16b - mov v2.16b, v24.16b - mov v5.16b, v28.16b - mov v6.16b, v28.16b - mov v7.16b, v28.16b - mov v10.16b, v29.16b - mov v11.16b, v29.16b - mov v12.16b, v29.16b - mov v17.16b, v30.16b - add v15.4s, v17.4s, v25.4s - add v16.4s, v15.4s, v25.4s - - mov x6, #10 - -.Lopen_128_rounds: - add v0.4s, v0.4s, v5.4s - add v1.4s, v1.4s, v6.4s - add v2.4s, v2.4s, v7.4s - eor v15.16b, v15.16b, v0.16b - eor 
v16.16b, v16.16b, v1.16b - eor v17.16b, v17.16b, v2.16b - rev32 v15.8h, v15.8h - rev32 v16.8h, v16.8h - rev32 v17.8h, v17.8h - - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v16.4s - add v12.4s, v12.4s, v17.4s - eor v5.16b, v5.16b, v10.16b - eor v6.16b, v6.16b, v11.16b - eor v7.16b, v7.16b, v12.16b - ushr v20.4s, v5.4s, #20 - sli v20.4s, v5.4s, #12 - ushr v5.4s, v6.4s, #20 - sli v5.4s, v6.4s, #12 - ushr v6.4s, v7.4s, #20 - sli v6.4s, v7.4s, #12 - - add v0.4s, v0.4s, v20.4s - add v1.4s, v1.4s, v5.4s - add v2.4s, v2.4s, v6.4s - eor v15.16b, v15.16b, v0.16b - eor v16.16b, v16.16b, v1.16b - eor v17.16b, v17.16b, v2.16b - tbl v15.16b, {v15.16b}, v26.16b - tbl v16.16b, {v16.16b}, v26.16b - tbl v17.16b, {v17.16b}, v26.16b - - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v16.4s - add v12.4s, v12.4s, v17.4s - eor v20.16b, v20.16b, v10.16b - eor v5.16b, v5.16b, v11.16b - eor v6.16b, v6.16b, v12.16b - ushr v7.4s, v6.4s, #25 - sli v7.4s, v6.4s, #7 - ushr v6.4s, v5.4s, #25 - sli v6.4s, v5.4s, #7 - ushr v5.4s, v20.4s, #25 - sli v5.4s, v20.4s, #7 - - ext v5.16b, v5.16b, v5.16b, #4 - ext v6.16b, v6.16b, v6.16b, #4 - ext v7.16b, v7.16b, v7.16b, #4 - - ext v10.16b, v10.16b, v10.16b, #8 - ext v11.16b, v11.16b, v11.16b, #8 - ext v12.16b, v12.16b, v12.16b, #8 - - ext v15.16b, v15.16b, v15.16b, #12 - ext v16.16b, v16.16b, v16.16b, #12 - ext v17.16b, v17.16b, v17.16b, #12 - add v0.4s, v0.4s, v5.4s - add v1.4s, v1.4s, v6.4s - add v2.4s, v2.4s, v7.4s - eor v15.16b, v15.16b, v0.16b - eor v16.16b, v16.16b, v1.16b - eor v17.16b, v17.16b, v2.16b - rev32 v15.8h, v15.8h - rev32 v16.8h, v16.8h - rev32 v17.8h, v17.8h - - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v16.4s - add v12.4s, v12.4s, v17.4s - eor v5.16b, v5.16b, v10.16b - eor v6.16b, v6.16b, v11.16b - eor v7.16b, v7.16b, v12.16b - ushr v20.4s, v5.4s, #20 - sli v20.4s, v5.4s, #12 - ushr v5.4s, v6.4s, #20 - sli v5.4s, v6.4s, #12 - ushr v6.4s, v7.4s, #20 - sli v6.4s, v7.4s, #12 - - add v0.4s, v0.4s, v20.4s - add v1.4s, v1.4s, 
v5.4s - add v2.4s, v2.4s, v6.4s - eor v15.16b, v15.16b, v0.16b - eor v16.16b, v16.16b, v1.16b - eor v17.16b, v17.16b, v2.16b - tbl v15.16b, {v15.16b}, v26.16b - tbl v16.16b, {v16.16b}, v26.16b - tbl v17.16b, {v17.16b}, v26.16b - - add v10.4s, v10.4s, v15.4s - add v11.4s, v11.4s, v16.4s - add v12.4s, v12.4s, v17.4s - eor v20.16b, v20.16b, v10.16b - eor v5.16b, v5.16b, v11.16b - eor v6.16b, v6.16b, v12.16b - ushr v7.4s, v6.4s, #25 - sli v7.4s, v6.4s, #7 - ushr v6.4s, v5.4s, #25 - sli v6.4s, v5.4s, #7 - ushr v5.4s, v20.4s, #25 - sli v5.4s, v20.4s, #7 - - ext v5.16b, v5.16b, v5.16b, #12 - ext v6.16b, v6.16b, v6.16b, #12 - ext v7.16b, v7.16b, v7.16b, #12 - - ext v10.16b, v10.16b, v10.16b, #8 - ext v11.16b, v11.16b, v11.16b, #8 - ext v12.16b, v12.16b, v12.16b, #8 - - ext v15.16b, v15.16b, v15.16b, #4 - ext v16.16b, v16.16b, v16.16b, #4 - ext v17.16b, v17.16b, v17.16b, #4 - subs x6, x6, #1 - b.hi .Lopen_128_rounds - - add v0.4s, v0.4s, v24.4s - add v1.4s, v1.4s, v24.4s - add v2.4s, v2.4s, v24.4s - - add v5.4s, v5.4s, v28.4s - add v6.4s, v6.4s, v28.4s - add v7.4s, v7.4s, v28.4s - - add v10.4s, v10.4s, v29.4s - add v11.4s, v11.4s, v29.4s - - add v30.4s, v30.4s, v25.4s - add v15.4s, v15.4s, v30.4s - add v30.4s, v30.4s, v25.4s - add v16.4s, v16.4s, v30.4s - - and v2.16b, v2.16b, v27.16b - mov x16, v2.d[0] // Move the R key to GPRs - mov x17, v2.d[1] - mov v27.16b, v7.16b // Store the S key - - bl .Lpoly_hash_ad_internal - -.Lopen_128_store: - cmp x2, #64 - b.lt .Lopen_128_store_64 - - ld1 {v20.16b - v23.16b}, [x1], #64 - - mov x11, v20.d[0] - mov x12, v20.d[1] - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - 
adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - mov x11, v21.d[0] - mov x12, v21.d[1] - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - mov x11, v22.d[0] - mov x12, v22.d[1] - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - 
adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - mov x11, v23.d[0] - mov x12, v23.d[1] - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - - eor v20.16b, v20.16b, v0.16b - eor v21.16b, v21.16b, v5.16b - eor v22.16b, v22.16b, v10.16b - eor v23.16b, v23.16b, v15.16b - - st1 {v20.16b - v23.16b}, [x0], #64 - - sub x2, x2, #64 - - mov v0.16b, v1.16b - mov v5.16b, v6.16b - mov v10.16b, v11.16b - mov v15.16b, v16.16b - -.Lopen_128_store_64: - - lsr x4, x2, #4 - mov x3, x1 - -.Lopen_128_hash_64: - cbz x4, .Lopen_tail_64_store - ldp x11, x12, [x3], 16 - adds x8, x8, x11 - adcs x9, x9, x12 - adc x10, x10, x15 - mul x11, x8, x16 // [t2:t1:t0] = [acc2:acc1:acc0] * r0 - umulh x12, x8, x16 - mul x13, x9, x16 - umulh x14, x9, x16 - adds x12, x12, x13 - mul x13, x10, x16 - adc x13, x13, x14 - mul x14, x8, x17 // [t3:t2:t1:t0] = [acc2:acc1:acc0] * [r1:r0] - umulh x8, x8, x17 - adds x12, x12, x14 - mul x14, x9, x17 - umulh x9, x9, x17 - adcs x14, x14, x8 - mul x10, x10, x17 - adc x10, x10, x9 - adds x13, x13, x14 - adc x14, x10, xzr - and x10, x13, #3 // At this point acc2 is 2 bits at most (value of 3) - and x8, x13, #-4 - extr x13, x14, x13, #2 - adds x8, x8, x11 - lsr 
x11, x14, #2 - adc x9, x14, x11 // No carry out since t0 is 61 bits and t3 is 63 bits - adds x8, x8, x13 - adcs x9, x9, x12 - adc x10, x10, xzr // At this point acc2 has the value of 4 at most - sub x4, x4, #1 - b .Lopen_128_hash_64 -.cfi_endproc -.size chacha20_poly1305_open,.-chacha20_poly1305_open -#endif -#endif // !OPENSSL_NO_ASM -.section .note.GNU-stack,"",%progbits diff --git a/third_party/boringssl/linux-aarch64/crypto/fipsmodule/aesv8-armx64.S b/third_party/boringssl/linux-aarch64/crypto/fipsmodule/aesv8-armx64.S deleted file mode 100644 index 4949ba30..00000000 --- a/third_party/boringssl/linux-aarch64/crypto/fipsmodule/aesv8-armx64.S +++ /dev/null @@ -1,802 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(__aarch64__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -#include - -#if __ARM_MAX_ARCH__>=7 -.text -.arch armv8-a+crypto -.section .rodata -.align 5 -.Lrcon: -.long 0x01,0x01,0x01,0x01 -.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat -.long 0x1b,0x1b,0x1b,0x1b - -.text - -.globl aes_hw_set_encrypt_key -.hidden aes_hw_set_encrypt_key -.type aes_hw_set_encrypt_key,%function -.align 5 -aes_hw_set_encrypt_key: -.Lenc_key: - // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. - AARCH64_VALID_CALL_TARGET - stp x29,x30,[sp,#-16]! 
- add x29,sp,#0 - mov x3,#-1 - cmp x0,#0 - b.eq .Lenc_key_abort - cmp x2,#0 - b.eq .Lenc_key_abort - mov x3,#-2 - cmp w1,#128 - b.lt .Lenc_key_abort - cmp w1,#256 - b.gt .Lenc_key_abort - tst w1,#0x3f - b.ne .Lenc_key_abort - - adrp x3,.Lrcon - add x3,x3,:lo12:.Lrcon - cmp w1,#192 - - eor v0.16b,v0.16b,v0.16b - ld1 {v3.16b},[x0],#16 - mov w1,#8 // reuse w1 - ld1 {v1.4s,v2.4s},[x3],#32 - - b.lt .Loop128 - b.eq .L192 - b .L256 - -.align 4 -.Loop128: - tbl v6.16b,{v3.16b},v2.16b - ext v5.16b,v0.16b,v3.16b,#12 - st1 {v3.4s},[x2],#16 - aese v6.16b,v0.16b - subs w1,w1,#1 - - eor v3.16b,v3.16b,v5.16b - ext v5.16b,v0.16b,v5.16b,#12 - eor v3.16b,v3.16b,v5.16b - ext v5.16b,v0.16b,v5.16b,#12 - eor v6.16b,v6.16b,v1.16b - eor v3.16b,v3.16b,v5.16b - shl v1.16b,v1.16b,#1 - eor v3.16b,v3.16b,v6.16b - b.ne .Loop128 - - ld1 {v1.4s},[x3] - - tbl v6.16b,{v3.16b},v2.16b - ext v5.16b,v0.16b,v3.16b,#12 - st1 {v3.4s},[x2],#16 - aese v6.16b,v0.16b - - eor v3.16b,v3.16b,v5.16b - ext v5.16b,v0.16b,v5.16b,#12 - eor v3.16b,v3.16b,v5.16b - ext v5.16b,v0.16b,v5.16b,#12 - eor v6.16b,v6.16b,v1.16b - eor v3.16b,v3.16b,v5.16b - shl v1.16b,v1.16b,#1 - eor v3.16b,v3.16b,v6.16b - - tbl v6.16b,{v3.16b},v2.16b - ext v5.16b,v0.16b,v3.16b,#12 - st1 {v3.4s},[x2],#16 - aese v6.16b,v0.16b - - eor v3.16b,v3.16b,v5.16b - ext v5.16b,v0.16b,v5.16b,#12 - eor v3.16b,v3.16b,v5.16b - ext v5.16b,v0.16b,v5.16b,#12 - eor v6.16b,v6.16b,v1.16b - eor v3.16b,v3.16b,v5.16b - eor v3.16b,v3.16b,v6.16b - st1 {v3.4s},[x2] - add x2,x2,#0x50 - - mov w12,#10 - b .Ldone - -.align 4 -.L192: - ld1 {v4.8b},[x0],#8 - movi v6.16b,#8 // borrow v6.16b - st1 {v3.4s},[x2],#16 - sub v2.16b,v2.16b,v6.16b // adjust the mask - -.Loop192: - tbl v6.16b,{v4.16b},v2.16b - ext v5.16b,v0.16b,v3.16b,#12 - st1 {v4.8b},[x2],#8 - aese v6.16b,v0.16b - subs w1,w1,#1 - - eor v3.16b,v3.16b,v5.16b - ext v5.16b,v0.16b,v5.16b,#12 - eor v3.16b,v3.16b,v5.16b - ext v5.16b,v0.16b,v5.16b,#12 - eor v3.16b,v3.16b,v5.16b - - dup v5.4s,v3.s[3] - eor v5.16b,v5.16b,v4.16b 
- eor v6.16b,v6.16b,v1.16b - ext v4.16b,v0.16b,v4.16b,#12 - shl v1.16b,v1.16b,#1 - eor v4.16b,v4.16b,v5.16b - eor v3.16b,v3.16b,v6.16b - eor v4.16b,v4.16b,v6.16b - st1 {v3.4s},[x2],#16 - b.ne .Loop192 - - mov w12,#12 - add x2,x2,#0x20 - b .Ldone - -.align 4 -.L256: - ld1 {v4.16b},[x0] - mov w1,#7 - mov w12,#14 - st1 {v3.4s},[x2],#16 - -.Loop256: - tbl v6.16b,{v4.16b},v2.16b - ext v5.16b,v0.16b,v3.16b,#12 - st1 {v4.4s},[x2],#16 - aese v6.16b,v0.16b - subs w1,w1,#1 - - eor v3.16b,v3.16b,v5.16b - ext v5.16b,v0.16b,v5.16b,#12 - eor v3.16b,v3.16b,v5.16b - ext v5.16b,v0.16b,v5.16b,#12 - eor v6.16b,v6.16b,v1.16b - eor v3.16b,v3.16b,v5.16b - shl v1.16b,v1.16b,#1 - eor v3.16b,v3.16b,v6.16b - st1 {v3.4s},[x2],#16 - b.eq .Ldone - - dup v6.4s,v3.s[3] // just splat - ext v5.16b,v0.16b,v4.16b,#12 - aese v6.16b,v0.16b - - eor v4.16b,v4.16b,v5.16b - ext v5.16b,v0.16b,v5.16b,#12 - eor v4.16b,v4.16b,v5.16b - ext v5.16b,v0.16b,v5.16b,#12 - eor v4.16b,v4.16b,v5.16b - - eor v4.16b,v4.16b,v6.16b - b .Loop256 - -.Ldone: - str w12,[x2] - mov x3,#0 - -.Lenc_key_abort: - mov x0,x3 // return value - ldr x29,[sp],#16 - ret -.size aes_hw_set_encrypt_key,.-aes_hw_set_encrypt_key - -.globl aes_hw_set_decrypt_key -.hidden aes_hw_set_decrypt_key -.type aes_hw_set_decrypt_key,%function -.align 5 -aes_hw_set_decrypt_key: - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! 
- add x29,sp,#0 - bl .Lenc_key - - cmp x0,#0 - b.ne .Ldec_key_abort - - sub x2,x2,#240 // restore original x2 - mov x4,#-16 - add x0,x2,x12,lsl#4 // end of key schedule - - ld1 {v0.4s},[x2] - ld1 {v1.4s},[x0] - st1 {v0.4s},[x0],x4 - st1 {v1.4s},[x2],#16 - -.Loop_imc: - ld1 {v0.4s},[x2] - ld1 {v1.4s},[x0] - aesimc v0.16b,v0.16b - aesimc v1.16b,v1.16b - st1 {v0.4s},[x0],x4 - st1 {v1.4s},[x2],#16 - cmp x0,x2 - b.hi .Loop_imc - - ld1 {v0.4s},[x2] - aesimc v0.16b,v0.16b - st1 {v0.4s},[x0] - - eor x0,x0,x0 // return value -.Ldec_key_abort: - ldp x29,x30,[sp],#16 - AARCH64_VALIDATE_LINK_REGISTER - ret -.size aes_hw_set_decrypt_key,.-aes_hw_set_decrypt_key -.globl aes_hw_encrypt -.hidden aes_hw_encrypt -.type aes_hw_encrypt,%function -.align 5 -aes_hw_encrypt: - AARCH64_VALID_CALL_TARGET - ldr w3,[x2,#240] - ld1 {v0.4s},[x2],#16 - ld1 {v2.16b},[x0] - sub w3,w3,#2 - ld1 {v1.4s},[x2],#16 - -.Loop_enc: - aese v2.16b,v0.16b - aesmc v2.16b,v2.16b - ld1 {v0.4s},[x2],#16 - subs w3,w3,#2 - aese v2.16b,v1.16b - aesmc v2.16b,v2.16b - ld1 {v1.4s},[x2],#16 - b.gt .Loop_enc - - aese v2.16b,v0.16b - aesmc v2.16b,v2.16b - ld1 {v0.4s},[x2] - aese v2.16b,v1.16b - eor v2.16b,v2.16b,v0.16b - - st1 {v2.16b},[x1] - ret -.size aes_hw_encrypt,.-aes_hw_encrypt -.globl aes_hw_decrypt -.hidden aes_hw_decrypt -.type aes_hw_decrypt,%function -.align 5 -aes_hw_decrypt: - AARCH64_VALID_CALL_TARGET - ldr w3,[x2,#240] - ld1 {v0.4s},[x2],#16 - ld1 {v2.16b},[x0] - sub w3,w3,#2 - ld1 {v1.4s},[x2],#16 - -.Loop_dec: - aesd v2.16b,v0.16b - aesimc v2.16b,v2.16b - ld1 {v0.4s},[x2],#16 - subs w3,w3,#2 - aesd v2.16b,v1.16b - aesimc v2.16b,v2.16b - ld1 {v1.4s},[x2],#16 - b.gt .Loop_dec - - aesd v2.16b,v0.16b - aesimc v2.16b,v2.16b - ld1 {v0.4s},[x2] - aesd v2.16b,v1.16b - eor v2.16b,v2.16b,v0.16b - - st1 {v2.16b},[x1] - ret -.size aes_hw_decrypt,.-aes_hw_decrypt -.globl aes_hw_cbc_encrypt -.hidden aes_hw_cbc_encrypt -.type aes_hw_cbc_encrypt,%function -.align 5 -aes_hw_cbc_encrypt: - // Armv8.3-A PAuth: even though 
x30 is pushed to stack it is not popped later. - AARCH64_VALID_CALL_TARGET - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - subs x2,x2,#16 - mov x8,#16 - b.lo .Lcbc_abort - csel x8,xzr,x8,eq - - cmp w5,#0 // en- or decrypting? - ldr w5,[x3,#240] - and x2,x2,#-16 - ld1 {v6.16b},[x4] - ld1 {v0.16b},[x0],x8 - - ld1 {v16.4s,v17.4s},[x3] // load key schedule... - sub w5,w5,#6 - add x7,x3,x5,lsl#4 // pointer to last 7 round keys - sub w5,w5,#2 - ld1 {v18.4s,v19.4s},[x7],#32 - ld1 {v20.4s,v21.4s},[x7],#32 - ld1 {v22.4s,v23.4s},[x7],#32 - ld1 {v7.4s},[x7] - - add x7,x3,#32 - mov w6,w5 - b.eq .Lcbc_dec - - cmp w5,#2 - eor v0.16b,v0.16b,v6.16b - eor v5.16b,v16.16b,v7.16b - b.eq .Lcbc_enc128 - - ld1 {v2.4s,v3.4s},[x7] - add x7,x3,#16 - add x6,x3,#16*4 - add x12,x3,#16*5 - aese v0.16b,v16.16b - aesmc v0.16b,v0.16b - add x14,x3,#16*6 - add x3,x3,#16*7 - b .Lenter_cbc_enc - -.align 4 -.Loop_cbc_enc: - aese v0.16b,v16.16b - aesmc v0.16b,v0.16b - st1 {v6.16b},[x1],#16 -.Lenter_cbc_enc: - aese v0.16b,v17.16b - aesmc v0.16b,v0.16b - aese v0.16b,v2.16b - aesmc v0.16b,v0.16b - ld1 {v16.4s},[x6] - cmp w5,#4 - aese v0.16b,v3.16b - aesmc v0.16b,v0.16b - ld1 {v17.4s},[x12] - b.eq .Lcbc_enc192 - - aese v0.16b,v16.16b - aesmc v0.16b,v0.16b - ld1 {v16.4s},[x14] - aese v0.16b,v17.16b - aesmc v0.16b,v0.16b - ld1 {v17.4s},[x3] - nop - -.Lcbc_enc192: - aese v0.16b,v16.16b - aesmc v0.16b,v0.16b - subs x2,x2,#16 - aese v0.16b,v17.16b - aesmc v0.16b,v0.16b - csel x8,xzr,x8,eq - aese v0.16b,v18.16b - aesmc v0.16b,v0.16b - aese v0.16b,v19.16b - aesmc v0.16b,v0.16b - ld1 {v16.16b},[x0],x8 - aese v0.16b,v20.16b - aesmc v0.16b,v0.16b - eor v16.16b,v16.16b,v5.16b - aese v0.16b,v21.16b - aesmc v0.16b,v0.16b - ld1 {v17.4s},[x7] // re-pre-load rndkey[1] - aese v0.16b,v22.16b - aesmc v0.16b,v0.16b - aese v0.16b,v23.16b - eor v6.16b,v0.16b,v7.16b - b.hs .Loop_cbc_enc - - st1 {v6.16b},[x1],#16 - b .Lcbc_done - -.align 5 -.Lcbc_enc128: - ld1 {v2.4s,v3.4s},[x7] - aese v0.16b,v16.16b - aesmc v0.16b,v0.16b - b 
.Lenter_cbc_enc128 -.Loop_cbc_enc128: - aese v0.16b,v16.16b - aesmc v0.16b,v0.16b - st1 {v6.16b},[x1],#16 -.Lenter_cbc_enc128: - aese v0.16b,v17.16b - aesmc v0.16b,v0.16b - subs x2,x2,#16 - aese v0.16b,v2.16b - aesmc v0.16b,v0.16b - csel x8,xzr,x8,eq - aese v0.16b,v3.16b - aesmc v0.16b,v0.16b - aese v0.16b,v18.16b - aesmc v0.16b,v0.16b - aese v0.16b,v19.16b - aesmc v0.16b,v0.16b - ld1 {v16.16b},[x0],x8 - aese v0.16b,v20.16b - aesmc v0.16b,v0.16b - aese v0.16b,v21.16b - aesmc v0.16b,v0.16b - aese v0.16b,v22.16b - aesmc v0.16b,v0.16b - eor v16.16b,v16.16b,v5.16b - aese v0.16b,v23.16b - eor v6.16b,v0.16b,v7.16b - b.hs .Loop_cbc_enc128 - - st1 {v6.16b},[x1],#16 - b .Lcbc_done -.align 5 -.Lcbc_dec: - ld1 {v18.16b},[x0],#16 - subs x2,x2,#32 // bias - add w6,w5,#2 - orr v3.16b,v0.16b,v0.16b - orr v1.16b,v0.16b,v0.16b - orr v19.16b,v18.16b,v18.16b - b.lo .Lcbc_dec_tail - - orr v1.16b,v18.16b,v18.16b - ld1 {v18.16b},[x0],#16 - orr v2.16b,v0.16b,v0.16b - orr v3.16b,v1.16b,v1.16b - orr v19.16b,v18.16b,v18.16b - -.Loop3x_cbc_dec: - aesd v0.16b,v16.16b - aesimc v0.16b,v0.16b - aesd v1.16b,v16.16b - aesimc v1.16b,v1.16b - aesd v18.16b,v16.16b - aesimc v18.16b,v18.16b - ld1 {v16.4s},[x7],#16 - subs w6,w6,#2 - aesd v0.16b,v17.16b - aesimc v0.16b,v0.16b - aesd v1.16b,v17.16b - aesimc v1.16b,v1.16b - aesd v18.16b,v17.16b - aesimc v18.16b,v18.16b - ld1 {v17.4s},[x7],#16 - b.gt .Loop3x_cbc_dec - - aesd v0.16b,v16.16b - aesimc v0.16b,v0.16b - aesd v1.16b,v16.16b - aesimc v1.16b,v1.16b - aesd v18.16b,v16.16b - aesimc v18.16b,v18.16b - eor v4.16b,v6.16b,v7.16b - subs x2,x2,#0x30 - eor v5.16b,v2.16b,v7.16b - csel x6,x2,x6,lo // x6, w6, is zero at this point - aesd v0.16b,v17.16b - aesimc v0.16b,v0.16b - aesd v1.16b,v17.16b - aesimc v1.16b,v1.16b - aesd v18.16b,v17.16b - aesimc v18.16b,v18.16b - eor v17.16b,v3.16b,v7.16b - add x0,x0,x6 // x0 is adjusted in such way that - // at exit from the loop v1.16b-v18.16b - // are loaded with last "words" - orr v6.16b,v19.16b,v19.16b - mov x7,x3 - 
aesd v0.16b,v20.16b - aesimc v0.16b,v0.16b - aesd v1.16b,v20.16b - aesimc v1.16b,v1.16b - aesd v18.16b,v20.16b - aesimc v18.16b,v18.16b - ld1 {v2.16b},[x0],#16 - aesd v0.16b,v21.16b - aesimc v0.16b,v0.16b - aesd v1.16b,v21.16b - aesimc v1.16b,v1.16b - aesd v18.16b,v21.16b - aesimc v18.16b,v18.16b - ld1 {v3.16b},[x0],#16 - aesd v0.16b,v22.16b - aesimc v0.16b,v0.16b - aesd v1.16b,v22.16b - aesimc v1.16b,v1.16b - aesd v18.16b,v22.16b - aesimc v18.16b,v18.16b - ld1 {v19.16b},[x0],#16 - aesd v0.16b,v23.16b - aesd v1.16b,v23.16b - aesd v18.16b,v23.16b - ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] - add w6,w5,#2 - eor v4.16b,v4.16b,v0.16b - eor v5.16b,v5.16b,v1.16b - eor v18.16b,v18.16b,v17.16b - ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] - st1 {v4.16b},[x1],#16 - orr v0.16b,v2.16b,v2.16b - st1 {v5.16b},[x1],#16 - orr v1.16b,v3.16b,v3.16b - st1 {v18.16b},[x1],#16 - orr v18.16b,v19.16b,v19.16b - b.hs .Loop3x_cbc_dec - - cmn x2,#0x30 - b.eq .Lcbc_done - nop - -.Lcbc_dec_tail: - aesd v1.16b,v16.16b - aesimc v1.16b,v1.16b - aesd v18.16b,v16.16b - aesimc v18.16b,v18.16b - ld1 {v16.4s},[x7],#16 - subs w6,w6,#2 - aesd v1.16b,v17.16b - aesimc v1.16b,v1.16b - aesd v18.16b,v17.16b - aesimc v18.16b,v18.16b - ld1 {v17.4s},[x7],#16 - b.gt .Lcbc_dec_tail - - aesd v1.16b,v16.16b - aesimc v1.16b,v1.16b - aesd v18.16b,v16.16b - aesimc v18.16b,v18.16b - aesd v1.16b,v17.16b - aesimc v1.16b,v1.16b - aesd v18.16b,v17.16b - aesimc v18.16b,v18.16b - aesd v1.16b,v20.16b - aesimc v1.16b,v1.16b - aesd v18.16b,v20.16b - aesimc v18.16b,v18.16b - cmn x2,#0x20 - aesd v1.16b,v21.16b - aesimc v1.16b,v1.16b - aesd v18.16b,v21.16b - aesimc v18.16b,v18.16b - eor v5.16b,v6.16b,v7.16b - aesd v1.16b,v22.16b - aesimc v1.16b,v1.16b - aesd v18.16b,v22.16b - aesimc v18.16b,v18.16b - eor v17.16b,v3.16b,v7.16b - aesd v1.16b,v23.16b - aesd v18.16b,v23.16b - b.eq .Lcbc_dec_one - eor v5.16b,v5.16b,v1.16b - eor v17.16b,v17.16b,v18.16b - orr v6.16b,v19.16b,v19.16b - st1 {v5.16b},[x1],#16 - st1 {v17.16b},[x1],#16 
- b .Lcbc_done - -.Lcbc_dec_one: - eor v5.16b,v5.16b,v18.16b - orr v6.16b,v19.16b,v19.16b - st1 {v5.16b},[x1],#16 - -.Lcbc_done: - st1 {v6.16b},[x4] -.Lcbc_abort: - ldr x29,[sp],#16 - ret -.size aes_hw_cbc_encrypt,.-aes_hw_cbc_encrypt -.globl aes_hw_ctr32_encrypt_blocks -.hidden aes_hw_ctr32_encrypt_blocks -.type aes_hw_ctr32_encrypt_blocks,%function -.align 5 -aes_hw_ctr32_encrypt_blocks: - // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. - AARCH64_VALID_CALL_TARGET - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - ldr w5,[x3,#240] - - ldr w8, [x4, #12] - ld1 {v0.4s},[x4] - - ld1 {v16.4s,v17.4s},[x3] // load key schedule... - sub w5,w5,#4 - mov x12,#16 - cmp x2,#2 - add x7,x3,x5,lsl#4 // pointer to last 5 round keys - sub w5,w5,#2 - ld1 {v20.4s,v21.4s},[x7],#32 - ld1 {v22.4s,v23.4s},[x7],#32 - ld1 {v7.4s},[x7] - add x7,x3,#32 - mov w6,w5 - csel x12,xzr,x12,lo - - // ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are - // affected by silicon errata #1742098 [0] and #1655431 [1], - // respectively, where the second instruction of an aese/aesmc - // instruction pair may execute twice if an interrupt is taken right - // after the first instruction consumes an input register of which a - // single 32-bit lane has been updated the last time it was modified. - // - // This function uses a counter in one 32-bit lane. The vmov lines - // could write to v1.16b and v18.16b directly, but that trips this bugs. - // We write to v6.16b and copy to the final register as a workaround. 
- // - // [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice - // [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice -#ifndef __AARCH64EB__ - rev w8, w8 -#endif - add w10, w8, #1 - orr v6.16b,v0.16b,v0.16b - rev w10, w10 - mov v6.s[3],w10 - add w8, w8, #2 - orr v1.16b,v6.16b,v6.16b - b.ls .Lctr32_tail - rev w12, w8 - mov v6.s[3],w12 - sub x2,x2,#3 // bias - orr v18.16b,v6.16b,v6.16b - b .Loop3x_ctr32 - -.align 4 -.Loop3x_ctr32: - aese v0.16b,v16.16b - aesmc v0.16b,v0.16b - aese v1.16b,v16.16b - aesmc v1.16b,v1.16b - aese v18.16b,v16.16b - aesmc v18.16b,v18.16b - ld1 {v16.4s},[x7],#16 - subs w6,w6,#2 - aese v0.16b,v17.16b - aesmc v0.16b,v0.16b - aese v1.16b,v17.16b - aesmc v1.16b,v1.16b - aese v18.16b,v17.16b - aesmc v18.16b,v18.16b - ld1 {v17.4s},[x7],#16 - b.gt .Loop3x_ctr32 - - aese v0.16b,v16.16b - aesmc v4.16b,v0.16b - aese v1.16b,v16.16b - aesmc v5.16b,v1.16b - ld1 {v2.16b},[x0],#16 - add w9,w8,#1 - aese v18.16b,v16.16b - aesmc v18.16b,v18.16b - ld1 {v3.16b},[x0],#16 - rev w9,w9 - aese v4.16b,v17.16b - aesmc v4.16b,v4.16b - aese v5.16b,v17.16b - aesmc v5.16b,v5.16b - ld1 {v19.16b},[x0],#16 - mov x7,x3 - aese v18.16b,v17.16b - aesmc v17.16b,v18.16b - aese v4.16b,v20.16b - aesmc v4.16b,v4.16b - aese v5.16b,v20.16b - aesmc v5.16b,v5.16b - eor v2.16b,v2.16b,v7.16b - add w10,w8,#2 - aese v17.16b,v20.16b - aesmc v17.16b,v17.16b - eor v3.16b,v3.16b,v7.16b - add w8,w8,#3 - aese v4.16b,v21.16b - aesmc v4.16b,v4.16b - aese v5.16b,v21.16b - aesmc v5.16b,v5.16b - // Note the logic to update v0.16b, v1.16b, and v1.16b is written to work - // around a bug in ARM Cortex-A57 and Cortex-A72 cores running in - // 32-bit mode. See the comment above. 
- eor v19.16b,v19.16b,v7.16b - mov v6.s[3], w9 - aese v17.16b,v21.16b - aesmc v17.16b,v17.16b - orr v0.16b,v6.16b,v6.16b - rev w10,w10 - aese v4.16b,v22.16b - aesmc v4.16b,v4.16b - mov v6.s[3], w10 - rev w12,w8 - aese v5.16b,v22.16b - aesmc v5.16b,v5.16b - orr v1.16b,v6.16b,v6.16b - mov v6.s[3], w12 - aese v17.16b,v22.16b - aesmc v17.16b,v17.16b - orr v18.16b,v6.16b,v6.16b - subs x2,x2,#3 - aese v4.16b,v23.16b - aese v5.16b,v23.16b - aese v17.16b,v23.16b - - eor v2.16b,v2.16b,v4.16b - ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] - st1 {v2.16b},[x1],#16 - eor v3.16b,v3.16b,v5.16b - mov w6,w5 - st1 {v3.16b},[x1],#16 - eor v19.16b,v19.16b,v17.16b - ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] - st1 {v19.16b},[x1],#16 - b.hs .Loop3x_ctr32 - - adds x2,x2,#3 - b.eq .Lctr32_done - cmp x2,#1 - mov x12,#16 - csel x12,xzr,x12,eq - -.Lctr32_tail: - aese v0.16b,v16.16b - aesmc v0.16b,v0.16b - aese v1.16b,v16.16b - aesmc v1.16b,v1.16b - ld1 {v16.4s},[x7],#16 - subs w6,w6,#2 - aese v0.16b,v17.16b - aesmc v0.16b,v0.16b - aese v1.16b,v17.16b - aesmc v1.16b,v1.16b - ld1 {v17.4s},[x7],#16 - b.gt .Lctr32_tail - - aese v0.16b,v16.16b - aesmc v0.16b,v0.16b - aese v1.16b,v16.16b - aesmc v1.16b,v1.16b - aese v0.16b,v17.16b - aesmc v0.16b,v0.16b - aese v1.16b,v17.16b - aesmc v1.16b,v1.16b - ld1 {v2.16b},[x0],x12 - aese v0.16b,v20.16b - aesmc v0.16b,v0.16b - aese v1.16b,v20.16b - aesmc v1.16b,v1.16b - ld1 {v3.16b},[x0] - aese v0.16b,v21.16b - aesmc v0.16b,v0.16b - aese v1.16b,v21.16b - aesmc v1.16b,v1.16b - eor v2.16b,v2.16b,v7.16b - aese v0.16b,v22.16b - aesmc v0.16b,v0.16b - aese v1.16b,v22.16b - aesmc v1.16b,v1.16b - eor v3.16b,v3.16b,v7.16b - aese v0.16b,v23.16b - aese v1.16b,v23.16b - - cmp x2,#1 - eor v2.16b,v2.16b,v0.16b - eor v3.16b,v3.16b,v1.16b - st1 {v2.16b},[x1],#16 - b.eq .Lctr32_done - st1 {v3.16b},[x1] - -.Lctr32_done: - ldr x29,[sp],#16 - ret -.size aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks -#endif -#endif -#endif // !OPENSSL_NO_ASM -.section 
.note.GNU-stack,"",%progbits diff --git a/third_party/boringssl/linux-aarch64/crypto/fipsmodule/armv8-mont.S b/third_party/boringssl/linux-aarch64/crypto/fipsmodule/armv8-mont.S deleted file mode 100644 index db89859a..00000000 --- a/third_party/boringssl/linux-aarch64/crypto/fipsmodule/armv8-mont.S +++ /dev/null @@ -1,1436 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(__aarch64__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -#include - -.text - -.globl bn_mul_mont -.hidden bn_mul_mont -.type bn_mul_mont,%function -.align 5 -bn_mul_mont: - AARCH64_SIGN_LINK_REGISTER - tst x5,#7 - b.eq __bn_sqr8x_mont - tst x5,#3 - b.eq __bn_mul4x_mont -.Lmul_mont: - stp x29,x30,[sp,#-64]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - - ldr x9,[x2],#8 // bp[0] - sub x22,sp,x5,lsl#3 - ldp x7,x8,[x1],#16 // ap[0..1] - lsl x5,x5,#3 - ldr x4,[x4] // *n0 - and x22,x22,#-16 // ABI says so - ldp x13,x14,[x3],#16 // np[0..1] - - mul x6,x7,x9 // ap[0]*bp[0] - sub x21,x5,#16 // j=num-2 - umulh x7,x7,x9 - mul x10,x8,x9 // ap[1]*bp[0] - umulh x11,x8,x9 - - mul x15,x6,x4 // "tp[0]"*n0 - mov sp,x22 // alloca - - // (*) mul x12,x13,x15 // np[0]*m1 - umulh x13,x13,x15 - mul x16,x14,x15 // np[1]*m1 - // (*) adds x12,x12,x6 // discarded - // (*) As for removal of first multiplication and addition - // instructions. The outcome of first addition is - // guaranteed to be zero, which leaves two computationally - // significant outcomes: it either carries or not. Then - // question is when does it carry? Is there alternative - // way to deduce it? If you follow operations, you can - // observe that condition for carry is quite simple: - // x6 being non-zero. 
So that carry can be calculated - // by adding -1 to x6. That's what next instruction does. - subs xzr,x6,#1 // (*) - umulh x17,x14,x15 - adc x13,x13,xzr - cbz x21,.L1st_skip - -.L1st: - ldr x8,[x1],#8 - adds x6,x10,x7 - sub x21,x21,#8 // j-- - adc x7,x11,xzr - - ldr x14,[x3],#8 - adds x12,x16,x13 - mul x10,x8,x9 // ap[j]*bp[0] - adc x13,x17,xzr - umulh x11,x8,x9 - - adds x12,x12,x6 - mul x16,x14,x15 // np[j]*m1 - adc x13,x13,xzr - umulh x17,x14,x15 - str x12,[x22],#8 // tp[j-1] - cbnz x21,.L1st - -.L1st_skip: - adds x6,x10,x7 - sub x1,x1,x5 // rewind x1 - adc x7,x11,xzr - - adds x12,x16,x13 - sub x3,x3,x5 // rewind x3 - adc x13,x17,xzr - - adds x12,x12,x6 - sub x20,x5,#8 // i=num-1 - adcs x13,x13,x7 - - adc x19,xzr,xzr // upmost overflow bit - stp x12,x13,[x22] - -.Louter: - ldr x9,[x2],#8 // bp[i] - ldp x7,x8,[x1],#16 - ldr x23,[sp] // tp[0] - add x22,sp,#8 - - mul x6,x7,x9 // ap[0]*bp[i] - sub x21,x5,#16 // j=num-2 - umulh x7,x7,x9 - ldp x13,x14,[x3],#16 - mul x10,x8,x9 // ap[1]*bp[i] - adds x6,x6,x23 - umulh x11,x8,x9 - adc x7,x7,xzr - - mul x15,x6,x4 - sub x20,x20,#8 // i-- - - // (*) mul x12,x13,x15 // np[0]*m1 - umulh x13,x13,x15 - mul x16,x14,x15 // np[1]*m1 - // (*) adds x12,x12,x6 - subs xzr,x6,#1 // (*) - umulh x17,x14,x15 - cbz x21,.Linner_skip - -.Linner: - ldr x8,[x1],#8 - adc x13,x13,xzr - ldr x23,[x22],#8 // tp[j] - adds x6,x10,x7 - sub x21,x21,#8 // j-- - adc x7,x11,xzr - - adds x12,x16,x13 - ldr x14,[x3],#8 - adc x13,x17,xzr - - mul x10,x8,x9 // ap[j]*bp[i] - adds x6,x6,x23 - umulh x11,x8,x9 - adc x7,x7,xzr - - mul x16,x14,x15 // np[j]*m1 - adds x12,x12,x6 - umulh x17,x14,x15 - str x12,[x22,#-16] // tp[j-1] - cbnz x21,.Linner - -.Linner_skip: - ldr x23,[x22],#8 // tp[j] - adc x13,x13,xzr - adds x6,x10,x7 - sub x1,x1,x5 // rewind x1 - adc x7,x11,xzr - - adds x12,x16,x13 - sub x3,x3,x5 // rewind x3 - adcs x13,x17,x19 - adc x19,xzr,xzr - - adds x6,x6,x23 - adc x7,x7,xzr - - adds x12,x12,x6 - adcs x13,x13,x7 - adc x19,x19,xzr // upmost overflow bit - 
stp x12,x13,[x22,#-16] - - cbnz x20,.Louter - - // Final step. We see if result is larger than modulus, and - // if it is, subtract the modulus. But comparison implies - // subtraction. So we subtract modulus, see if it borrowed, - // and conditionally copy original value. - ldr x23,[sp] // tp[0] - add x22,sp,#8 - ldr x14,[x3],#8 // np[0] - subs x21,x5,#8 // j=num-1 and clear borrow - mov x1,x0 -.Lsub: - sbcs x8,x23,x14 // tp[j]-np[j] - ldr x23,[x22],#8 - sub x21,x21,#8 // j-- - ldr x14,[x3],#8 - str x8,[x1],#8 // rp[j]=tp[j]-np[j] - cbnz x21,.Lsub - - sbcs x8,x23,x14 - sbcs x19,x19,xzr // did it borrow? - str x8,[x1],#8 // rp[num-1] - - ldr x23,[sp] // tp[0] - add x22,sp,#8 - ldr x8,[x0],#8 // rp[0] - sub x5,x5,#8 // num-- - nop -.Lcond_copy: - sub x5,x5,#8 // num-- - csel x14,x23,x8,lo // did it borrow? - ldr x23,[x22],#8 - ldr x8,[x0],#8 - str xzr,[x22,#-16] // wipe tp - str x14,[x0,#-16] - cbnz x5,.Lcond_copy - - csel x14,x23,x8,lo - str xzr,[x22,#-8] // wipe tp - str x14,[x0,#-8] - - ldp x19,x20,[x29,#16] - mov sp,x29 - ldp x21,x22,[x29,#32] - mov x0,#1 - ldp x23,x24,[x29,#48] - ldr x29,[sp],#64 - AARCH64_VALIDATE_LINK_REGISTER - ret -.size bn_mul_mont,.-bn_mul_mont -.type __bn_sqr8x_mont,%function -.align 5 -__bn_sqr8x_mont: - // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_sqr8x_mont is jumped to - // only from bn_mul_mont which has already signed the return address. - cmp x1,x2 - b.ne __bn_mul4x_mont -.Lsqr8x_mont: - stp x29,x30,[sp,#-128]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - stp x0,x3,[sp,#96] // offload rp and np - - ldp x6,x7,[x1,#8*0] - ldp x8,x9,[x1,#8*2] - ldp x10,x11,[x1,#8*4] - ldp x12,x13,[x1,#8*6] - - sub x2,sp,x5,lsl#4 - lsl x5,x5,#3 - ldr x4,[x4] // *n0 - mov sp,x2 // alloca - sub x27,x5,#8*8 - b .Lsqr8x_zero_start - -.Lsqr8x_zero: - sub x27,x27,#8*8 - stp xzr,xzr,[x2,#8*0] - stp xzr,xzr,[x2,#8*2] - stp xzr,xzr,[x2,#8*4] - stp xzr,xzr,[x2,#8*6] -.Lsqr8x_zero_start: - stp xzr,xzr,[x2,#8*8] - stp xzr,xzr,[x2,#8*10] - stp xzr,xzr,[x2,#8*12] - stp xzr,xzr,[x2,#8*14] - add x2,x2,#8*16 - cbnz x27,.Lsqr8x_zero - - add x3,x1,x5 - add x1,x1,#8*8 - mov x19,xzr - mov x20,xzr - mov x21,xzr - mov x22,xzr - mov x23,xzr - mov x24,xzr - mov x25,xzr - mov x26,xzr - mov x2,sp - str x4,[x29,#112] // offload n0 - - // Multiply everything but a[i]*a[i] -.align 4 -.Lsqr8x_outer_loop: - // a[1]a[0] (i) - // a[2]a[0] - // a[3]a[0] - // a[4]a[0] - // a[5]a[0] - // a[6]a[0] - // a[7]a[0] - // a[2]a[1] (ii) - // a[3]a[1] - // a[4]a[1] - // a[5]a[1] - // a[6]a[1] - // a[7]a[1] - // a[3]a[2] (iii) - // a[4]a[2] - // a[5]a[2] - // a[6]a[2] - // a[7]a[2] - // a[4]a[3] (iv) - // a[5]a[3] - // a[6]a[3] - // a[7]a[3] - // a[5]a[4] (v) - // a[6]a[4] - // a[7]a[4] - // a[6]a[5] (vi) - // a[7]a[5] - // a[7]a[6] (vii) - - mul x14,x7,x6 // lo(a[1..7]*a[0]) (i) - mul x15,x8,x6 - mul x16,x9,x6 - mul x17,x10,x6 - adds x20,x20,x14 // t[1]+lo(a[1]*a[0]) - mul x14,x11,x6 - adcs x21,x21,x15 - mul x15,x12,x6 - adcs x22,x22,x16 - mul x16,x13,x6 - adcs x23,x23,x17 - umulh x17,x7,x6 // hi(a[1..7]*a[0]) - adcs x24,x24,x14 - umulh x14,x8,x6 - adcs x25,x25,x15 - umulh x15,x9,x6 - adcs x26,x26,x16 - umulh x16,x10,x6 - stp x19,x20,[x2],#8*2 // t[0..1] - adc x19,xzr,xzr // t[8] - adds x21,x21,x17 // t[2]+lo(a[1]*a[0]) - umulh x17,x11,x6 - adcs x22,x22,x14 - umulh x14,x12,x6 - adcs x23,x23,x15 - umulh x15,x13,x6 - adcs x24,x24,x16 - mul x16,x8,x7 // 
lo(a[2..7]*a[1]) (ii) - adcs x25,x25,x17 - mul x17,x9,x7 - adcs x26,x26,x14 - mul x14,x10,x7 - adc x19,x19,x15 - - mul x15,x11,x7 - adds x22,x22,x16 - mul x16,x12,x7 - adcs x23,x23,x17 - mul x17,x13,x7 - adcs x24,x24,x14 - umulh x14,x8,x7 // hi(a[2..7]*a[1]) - adcs x25,x25,x15 - umulh x15,x9,x7 - adcs x26,x26,x16 - umulh x16,x10,x7 - adcs x19,x19,x17 - umulh x17,x11,x7 - stp x21,x22,[x2],#8*2 // t[2..3] - adc x20,xzr,xzr // t[9] - adds x23,x23,x14 - umulh x14,x12,x7 - adcs x24,x24,x15 - umulh x15,x13,x7 - adcs x25,x25,x16 - mul x16,x9,x8 // lo(a[3..7]*a[2]) (iii) - adcs x26,x26,x17 - mul x17,x10,x8 - adcs x19,x19,x14 - mul x14,x11,x8 - adc x20,x20,x15 - - mul x15,x12,x8 - adds x24,x24,x16 - mul x16,x13,x8 - adcs x25,x25,x17 - umulh x17,x9,x8 // hi(a[3..7]*a[2]) - adcs x26,x26,x14 - umulh x14,x10,x8 - adcs x19,x19,x15 - umulh x15,x11,x8 - adcs x20,x20,x16 - umulh x16,x12,x8 - stp x23,x24,[x2],#8*2 // t[4..5] - adc x21,xzr,xzr // t[10] - adds x25,x25,x17 - umulh x17,x13,x8 - adcs x26,x26,x14 - mul x14,x10,x9 // lo(a[4..7]*a[3]) (iv) - adcs x19,x19,x15 - mul x15,x11,x9 - adcs x20,x20,x16 - mul x16,x12,x9 - adc x21,x21,x17 - - mul x17,x13,x9 - adds x26,x26,x14 - umulh x14,x10,x9 // hi(a[4..7]*a[3]) - adcs x19,x19,x15 - umulh x15,x11,x9 - adcs x20,x20,x16 - umulh x16,x12,x9 - adcs x21,x21,x17 - umulh x17,x13,x9 - stp x25,x26,[x2],#8*2 // t[6..7] - adc x22,xzr,xzr // t[11] - adds x19,x19,x14 - mul x14,x11,x10 // lo(a[5..7]*a[4]) (v) - adcs x20,x20,x15 - mul x15,x12,x10 - adcs x21,x21,x16 - mul x16,x13,x10 - adc x22,x22,x17 - - umulh x17,x11,x10 // hi(a[5..7]*a[4]) - adds x20,x20,x14 - umulh x14,x12,x10 - adcs x21,x21,x15 - umulh x15,x13,x10 - adcs x22,x22,x16 - mul x16,x12,x11 // lo(a[6..7]*a[5]) (vi) - adc x23,xzr,xzr // t[12] - adds x21,x21,x17 - mul x17,x13,x11 - adcs x22,x22,x14 - umulh x14,x12,x11 // hi(a[6..7]*a[5]) - adc x23,x23,x15 - - umulh x15,x13,x11 - adds x22,x22,x16 - mul x16,x13,x12 // lo(a[7]*a[6]) (vii) - adcs x23,x23,x17 - umulh x17,x13,x12 // 
hi(a[7]*a[6]) - adc x24,xzr,xzr // t[13] - adds x23,x23,x14 - sub x27,x3,x1 // done yet? - adc x24,x24,x15 - - adds x24,x24,x16 - sub x14,x3,x5 // rewinded ap - adc x25,xzr,xzr // t[14] - add x25,x25,x17 - - cbz x27,.Lsqr8x_outer_break - - mov x4,x6 - ldp x6,x7,[x2,#8*0] - ldp x8,x9,[x2,#8*2] - ldp x10,x11,[x2,#8*4] - ldp x12,x13,[x2,#8*6] - adds x19,x19,x6 - adcs x20,x20,x7 - ldp x6,x7,[x1,#8*0] - adcs x21,x21,x8 - adcs x22,x22,x9 - ldp x8,x9,[x1,#8*2] - adcs x23,x23,x10 - adcs x24,x24,x11 - ldp x10,x11,[x1,#8*4] - adcs x25,x25,x12 - mov x0,x1 - adcs x26,xzr,x13 - ldp x12,x13,[x1,#8*6] - add x1,x1,#8*8 - //adc x28,xzr,xzr // moved below - mov x27,#-8*8 - - // a[8]a[0] - // a[9]a[0] - // a[a]a[0] - // a[b]a[0] - // a[c]a[0] - // a[d]a[0] - // a[e]a[0] - // a[f]a[0] - // a[8]a[1] - // a[f]a[1]........................ - // a[8]a[2] - // a[f]a[2]........................ - // a[8]a[3] - // a[f]a[3]........................ - // a[8]a[4] - // a[f]a[4]........................ - // a[8]a[5] - // a[f]a[5]........................ - // a[8]a[6] - // a[f]a[6]........................ - // a[8]a[7] - // a[f]a[7]........................ 
-.Lsqr8x_mul: - mul x14,x6,x4 - adc x28,xzr,xzr // carry bit, modulo-scheduled - mul x15,x7,x4 - add x27,x27,#8 - mul x16,x8,x4 - mul x17,x9,x4 - adds x19,x19,x14 - mul x14,x10,x4 - adcs x20,x20,x15 - mul x15,x11,x4 - adcs x21,x21,x16 - mul x16,x12,x4 - adcs x22,x22,x17 - mul x17,x13,x4 - adcs x23,x23,x14 - umulh x14,x6,x4 - adcs x24,x24,x15 - umulh x15,x7,x4 - adcs x25,x25,x16 - umulh x16,x8,x4 - adcs x26,x26,x17 - umulh x17,x9,x4 - adc x28,x28,xzr - str x19,[x2],#8 - adds x19,x20,x14 - umulh x14,x10,x4 - adcs x20,x21,x15 - umulh x15,x11,x4 - adcs x21,x22,x16 - umulh x16,x12,x4 - adcs x22,x23,x17 - umulh x17,x13,x4 - ldr x4,[x0,x27] - adcs x23,x24,x14 - adcs x24,x25,x15 - adcs x25,x26,x16 - adcs x26,x28,x17 - //adc x28,xzr,xzr // moved above - cbnz x27,.Lsqr8x_mul - // note that carry flag is guaranteed - // to be zero at this point - cmp x1,x3 // done yet? - b.eq .Lsqr8x_break - - ldp x6,x7,[x2,#8*0] - ldp x8,x9,[x2,#8*2] - ldp x10,x11,[x2,#8*4] - ldp x12,x13,[x2,#8*6] - adds x19,x19,x6 - ldr x4,[x0,#-8*8] - adcs x20,x20,x7 - ldp x6,x7,[x1,#8*0] - adcs x21,x21,x8 - adcs x22,x22,x9 - ldp x8,x9,[x1,#8*2] - adcs x23,x23,x10 - adcs x24,x24,x11 - ldp x10,x11,[x1,#8*4] - adcs x25,x25,x12 - mov x27,#-8*8 - adcs x26,x26,x13 - ldp x12,x13,[x1,#8*6] - add x1,x1,#8*8 - //adc x28,xzr,xzr // moved above - b .Lsqr8x_mul - -.align 4 -.Lsqr8x_break: - ldp x6,x7,[x0,#8*0] - add x1,x0,#8*8 - ldp x8,x9,[x0,#8*2] - sub x14,x3,x1 // is it last iteration? 
- ldp x10,x11,[x0,#8*4] - sub x15,x2,x14 - ldp x12,x13,[x0,#8*6] - cbz x14,.Lsqr8x_outer_loop - - stp x19,x20,[x2,#8*0] - ldp x19,x20,[x15,#8*0] - stp x21,x22,[x2,#8*2] - ldp x21,x22,[x15,#8*2] - stp x23,x24,[x2,#8*4] - ldp x23,x24,[x15,#8*4] - stp x25,x26,[x2,#8*6] - mov x2,x15 - ldp x25,x26,[x15,#8*6] - b .Lsqr8x_outer_loop - -.align 4 -.Lsqr8x_outer_break: - // Now multiply above result by 2 and add a[n-1]*a[n-1]|...|a[0]*a[0] - ldp x7,x9,[x14,#8*0] // recall that x14 is &a[0] - ldp x15,x16,[sp,#8*1] - ldp x11,x13,[x14,#8*2] - add x1,x14,#8*4 - ldp x17,x14,[sp,#8*3] - - stp x19,x20,[x2,#8*0] - mul x19,x7,x7 - stp x21,x22,[x2,#8*2] - umulh x7,x7,x7 - stp x23,x24,[x2,#8*4] - mul x8,x9,x9 - stp x25,x26,[x2,#8*6] - mov x2,sp - umulh x9,x9,x9 - adds x20,x7,x15,lsl#1 - extr x15,x16,x15,#63 - sub x27,x5,#8*4 - -.Lsqr4x_shift_n_add: - adcs x21,x8,x15 - extr x16,x17,x16,#63 - sub x27,x27,#8*4 - adcs x22,x9,x16 - ldp x15,x16,[x2,#8*5] - mul x10,x11,x11 - ldp x7,x9,[x1],#8*2 - umulh x11,x11,x11 - mul x12,x13,x13 - umulh x13,x13,x13 - extr x17,x14,x17,#63 - stp x19,x20,[x2,#8*0] - adcs x23,x10,x17 - extr x14,x15,x14,#63 - stp x21,x22,[x2,#8*2] - adcs x24,x11,x14 - ldp x17,x14,[x2,#8*7] - extr x15,x16,x15,#63 - adcs x25,x12,x15 - extr x16,x17,x16,#63 - adcs x26,x13,x16 - ldp x15,x16,[x2,#8*9] - mul x6,x7,x7 - ldp x11,x13,[x1],#8*2 - umulh x7,x7,x7 - mul x8,x9,x9 - umulh x9,x9,x9 - stp x23,x24,[x2,#8*4] - extr x17,x14,x17,#63 - stp x25,x26,[x2,#8*6] - add x2,x2,#8*8 - adcs x19,x6,x17 - extr x14,x15,x14,#63 - adcs x20,x7,x14 - ldp x17,x14,[x2,#8*3] - extr x15,x16,x15,#63 - cbnz x27,.Lsqr4x_shift_n_add - ldp x1,x4,[x29,#104] // pull np and n0 - - adcs x21,x8,x15 - extr x16,x17,x16,#63 - adcs x22,x9,x16 - ldp x15,x16,[x2,#8*5] - mul x10,x11,x11 - umulh x11,x11,x11 - stp x19,x20,[x2,#8*0] - mul x12,x13,x13 - umulh x13,x13,x13 - stp x21,x22,[x2,#8*2] - extr x17,x14,x17,#63 - adcs x23,x10,x17 - extr x14,x15,x14,#63 - ldp x19,x20,[sp,#8*0] - adcs x24,x11,x14 - extr x15,x16,x15,#63 - 
ldp x6,x7,[x1,#8*0] - adcs x25,x12,x15 - extr x16,xzr,x16,#63 - ldp x8,x9,[x1,#8*2] - adc x26,x13,x16 - ldp x10,x11,[x1,#8*4] - - // Reduce by 512 bits per iteration - mul x28,x4,x19 // t[0]*n0 - ldp x12,x13,[x1,#8*6] - add x3,x1,x5 - ldp x21,x22,[sp,#8*2] - stp x23,x24,[x2,#8*4] - ldp x23,x24,[sp,#8*4] - stp x25,x26,[x2,#8*6] - ldp x25,x26,[sp,#8*6] - add x1,x1,#8*8 - mov x30,xzr // initial top-most carry - mov x2,sp - mov x27,#8 - -.Lsqr8x_reduction: - // (*) mul x14,x6,x28 // lo(n[0-7])*lo(t[0]*n0) - mul x15,x7,x28 - sub x27,x27,#1 - mul x16,x8,x28 - str x28,[x2],#8 // put aside t[0]*n0 for tail processing - mul x17,x9,x28 - // (*) adds xzr,x19,x14 - subs xzr,x19,#1 // (*) - mul x14,x10,x28 - adcs x19,x20,x15 - mul x15,x11,x28 - adcs x20,x21,x16 - mul x16,x12,x28 - adcs x21,x22,x17 - mul x17,x13,x28 - adcs x22,x23,x14 - umulh x14,x6,x28 // hi(n[0-7])*lo(t[0]*n0) - adcs x23,x24,x15 - umulh x15,x7,x28 - adcs x24,x25,x16 - umulh x16,x8,x28 - adcs x25,x26,x17 - umulh x17,x9,x28 - adc x26,xzr,xzr - adds x19,x19,x14 - umulh x14,x10,x28 - adcs x20,x20,x15 - umulh x15,x11,x28 - adcs x21,x21,x16 - umulh x16,x12,x28 - adcs x22,x22,x17 - umulh x17,x13,x28 - mul x28,x4,x19 // next t[0]*n0 - adcs x23,x23,x14 - adcs x24,x24,x15 - adcs x25,x25,x16 - adc x26,x26,x17 - cbnz x27,.Lsqr8x_reduction - - ldp x14,x15,[x2,#8*0] - ldp x16,x17,[x2,#8*2] - mov x0,x2 - sub x27,x3,x1 // done yet? 
- adds x19,x19,x14 - adcs x20,x20,x15 - ldp x14,x15,[x2,#8*4] - adcs x21,x21,x16 - adcs x22,x22,x17 - ldp x16,x17,[x2,#8*6] - adcs x23,x23,x14 - adcs x24,x24,x15 - adcs x25,x25,x16 - adcs x26,x26,x17 - //adc x28,xzr,xzr // moved below - cbz x27,.Lsqr8x8_post_condition - - ldr x4,[x2,#-8*8] - ldp x6,x7,[x1,#8*0] - ldp x8,x9,[x1,#8*2] - ldp x10,x11,[x1,#8*4] - mov x27,#-8*8 - ldp x12,x13,[x1,#8*6] - add x1,x1,#8*8 - -.Lsqr8x_tail: - mul x14,x6,x4 - adc x28,xzr,xzr // carry bit, modulo-scheduled - mul x15,x7,x4 - add x27,x27,#8 - mul x16,x8,x4 - mul x17,x9,x4 - adds x19,x19,x14 - mul x14,x10,x4 - adcs x20,x20,x15 - mul x15,x11,x4 - adcs x21,x21,x16 - mul x16,x12,x4 - adcs x22,x22,x17 - mul x17,x13,x4 - adcs x23,x23,x14 - umulh x14,x6,x4 - adcs x24,x24,x15 - umulh x15,x7,x4 - adcs x25,x25,x16 - umulh x16,x8,x4 - adcs x26,x26,x17 - umulh x17,x9,x4 - adc x28,x28,xzr - str x19,[x2],#8 - adds x19,x20,x14 - umulh x14,x10,x4 - adcs x20,x21,x15 - umulh x15,x11,x4 - adcs x21,x22,x16 - umulh x16,x12,x4 - adcs x22,x23,x17 - umulh x17,x13,x4 - ldr x4,[x0,x27] - adcs x23,x24,x14 - adcs x24,x25,x15 - adcs x25,x26,x16 - adcs x26,x28,x17 - //adc x28,xzr,xzr // moved above - cbnz x27,.Lsqr8x_tail - // note that carry flag is guaranteed - // to be zero at this point - ldp x6,x7,[x2,#8*0] - sub x27,x3,x1 // done yet? 
- sub x16,x3,x5 // rewinded np - ldp x8,x9,[x2,#8*2] - ldp x10,x11,[x2,#8*4] - ldp x12,x13,[x2,#8*6] - cbz x27,.Lsqr8x_tail_break - - ldr x4,[x0,#-8*8] - adds x19,x19,x6 - adcs x20,x20,x7 - ldp x6,x7,[x1,#8*0] - adcs x21,x21,x8 - adcs x22,x22,x9 - ldp x8,x9,[x1,#8*2] - adcs x23,x23,x10 - adcs x24,x24,x11 - ldp x10,x11,[x1,#8*4] - adcs x25,x25,x12 - mov x27,#-8*8 - adcs x26,x26,x13 - ldp x12,x13,[x1,#8*6] - add x1,x1,#8*8 - //adc x28,xzr,xzr // moved above - b .Lsqr8x_tail - -.align 4 -.Lsqr8x_tail_break: - ldr x4,[x29,#112] // pull n0 - add x27,x2,#8*8 // end of current t[num] window - - subs xzr,x30,#1 // "move" top-most carry to carry bit - adcs x14,x19,x6 - adcs x15,x20,x7 - ldp x19,x20,[x0,#8*0] - adcs x21,x21,x8 - ldp x6,x7,[x16,#8*0] // recall that x16 is &n[0] - adcs x22,x22,x9 - ldp x8,x9,[x16,#8*2] - adcs x23,x23,x10 - adcs x24,x24,x11 - ldp x10,x11,[x16,#8*4] - adcs x25,x25,x12 - adcs x26,x26,x13 - ldp x12,x13,[x16,#8*6] - add x1,x16,#8*8 - adc x30,xzr,xzr // top-most carry - mul x28,x4,x19 - stp x14,x15,[x2,#8*0] - stp x21,x22,[x2,#8*2] - ldp x21,x22,[x0,#8*2] - stp x23,x24,[x2,#8*4] - ldp x23,x24,[x0,#8*4] - cmp x27,x29 // did we hit the bottom? - stp x25,x26,[x2,#8*6] - mov x2,x0 // slide the window - ldp x25,x26,[x0,#8*6] - mov x27,#8 - b.ne .Lsqr8x_reduction - - // Final step. We see if result is larger than modulus, and - // if it is, subtract the modulus. But comparison implies - // subtraction. So we subtract modulus, see if it borrowed, - // and conditionally copy original value. 
- ldr x0,[x29,#96] // pull rp - add x2,x2,#8*8 - subs x14,x19,x6 - sbcs x15,x20,x7 - sub x27,x5,#8*8 - mov x3,x0 // x0 copy - -.Lsqr8x_sub: - sbcs x16,x21,x8 - ldp x6,x7,[x1,#8*0] - sbcs x17,x22,x9 - stp x14,x15,[x0,#8*0] - sbcs x14,x23,x10 - ldp x8,x9,[x1,#8*2] - sbcs x15,x24,x11 - stp x16,x17,[x0,#8*2] - sbcs x16,x25,x12 - ldp x10,x11,[x1,#8*4] - sbcs x17,x26,x13 - ldp x12,x13,[x1,#8*6] - add x1,x1,#8*8 - ldp x19,x20,[x2,#8*0] - sub x27,x27,#8*8 - ldp x21,x22,[x2,#8*2] - ldp x23,x24,[x2,#8*4] - ldp x25,x26,[x2,#8*6] - add x2,x2,#8*8 - stp x14,x15,[x0,#8*4] - sbcs x14,x19,x6 - stp x16,x17,[x0,#8*6] - add x0,x0,#8*8 - sbcs x15,x20,x7 - cbnz x27,.Lsqr8x_sub - - sbcs x16,x21,x8 - mov x2,sp - add x1,sp,x5 - ldp x6,x7,[x3,#8*0] - sbcs x17,x22,x9 - stp x14,x15,[x0,#8*0] - sbcs x14,x23,x10 - ldp x8,x9,[x3,#8*2] - sbcs x15,x24,x11 - stp x16,x17,[x0,#8*2] - sbcs x16,x25,x12 - ldp x19,x20,[x1,#8*0] - sbcs x17,x26,x13 - ldp x21,x22,[x1,#8*2] - sbcs xzr,x30,xzr // did it borrow? - ldr x30,[x29,#8] // pull return address - stp x14,x15,[x0,#8*4] - stp x16,x17,[x0,#8*6] - - sub x27,x5,#8*4 -.Lsqr4x_cond_copy: - sub x27,x27,#8*4 - csel x14,x19,x6,lo - stp xzr,xzr,[x2,#8*0] - csel x15,x20,x7,lo - ldp x6,x7,[x3,#8*4] - ldp x19,x20,[x1,#8*4] - csel x16,x21,x8,lo - stp xzr,xzr,[x2,#8*2] - add x2,x2,#8*4 - csel x17,x22,x9,lo - ldp x8,x9,[x3,#8*6] - ldp x21,x22,[x1,#8*6] - add x1,x1,#8*4 - stp x14,x15,[x3,#8*0] - stp x16,x17,[x3,#8*2] - add x3,x3,#8*4 - stp xzr,xzr,[x1,#8*0] - stp xzr,xzr,[x1,#8*2] - cbnz x27,.Lsqr4x_cond_copy - - csel x14,x19,x6,lo - stp xzr,xzr,[x2,#8*0] - csel x15,x20,x7,lo - stp xzr,xzr,[x2,#8*2] - csel x16,x21,x8,lo - csel x17,x22,x9,lo - stp x14,x15,[x3,#8*0] - stp x16,x17,[x3,#8*2] - - b .Lsqr8x_done - -.align 4 -.Lsqr8x8_post_condition: - adc x28,xzr,xzr - ldr x30,[x29,#8] // pull return address - // x19-7,x28 hold result, x6-7 hold modulus - subs x6,x19,x6 - ldr x1,[x29,#96] // pull rp - sbcs x7,x20,x7 - stp xzr,xzr,[sp,#8*0] - sbcs x8,x21,x8 - stp 
xzr,xzr,[sp,#8*2] - sbcs x9,x22,x9 - stp xzr,xzr,[sp,#8*4] - sbcs x10,x23,x10 - stp xzr,xzr,[sp,#8*6] - sbcs x11,x24,x11 - stp xzr,xzr,[sp,#8*8] - sbcs x12,x25,x12 - stp xzr,xzr,[sp,#8*10] - sbcs x13,x26,x13 - stp xzr,xzr,[sp,#8*12] - sbcs x28,x28,xzr // did it borrow? - stp xzr,xzr,[sp,#8*14] - - // x6-7 hold result-modulus - csel x6,x19,x6,lo - csel x7,x20,x7,lo - csel x8,x21,x8,lo - csel x9,x22,x9,lo - stp x6,x7,[x1,#8*0] - csel x10,x23,x10,lo - csel x11,x24,x11,lo - stp x8,x9,[x1,#8*2] - csel x12,x25,x12,lo - csel x13,x26,x13,lo - stp x10,x11,[x1,#8*4] - stp x12,x13,[x1,#8*6] - -.Lsqr8x_done: - ldp x19,x20,[x29,#16] - mov sp,x29 - ldp x21,x22,[x29,#32] - mov x0,#1 - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - // x30 is popped earlier - AARCH64_VALIDATE_LINK_REGISTER - ret -.size __bn_sqr8x_mont,.-__bn_sqr8x_mont -.type __bn_mul4x_mont,%function -.align 5 -__bn_mul4x_mont: - // Not adding AARCH64_SIGN_LINK_REGISTER here because __bn_mul4x_mont is jumped to - // only from bn_mul_mont or __bn_mul8x_mont which have already signed the - // return address. - stp x29,x30,[sp,#-128]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - - sub x26,sp,x5,lsl#3 - lsl x5,x5,#3 - ldr x4,[x4] // *n0 - sub sp,x26,#8*4 // alloca - - add x10,x2,x5 - add x27,x1,x5 - stp x0,x10,[x29,#96] // offload rp and &b[num] - - ldr x24,[x2,#8*0] // b[0] - ldp x6,x7,[x1,#8*0] // a[0..3] - ldp x8,x9,[x1,#8*2] - add x1,x1,#8*4 - mov x19,xzr - mov x20,xzr - mov x21,xzr - mov x22,xzr - ldp x14,x15,[x3,#8*0] // n[0..3] - ldp x16,x17,[x3,#8*2] - adds x3,x3,#8*4 // clear carry bit - mov x0,xzr - mov x28,#0 - mov x26,sp - -.Loop_mul4x_1st_reduction: - mul x10,x6,x24 // lo(a[0..3]*b[0]) - adc x0,x0,xzr // modulo-scheduled - mul x11,x7,x24 - add x28,x28,#8 - mul x12,x8,x24 - and x28,x28,#31 - mul x13,x9,x24 - adds x19,x19,x10 - umulh x10,x6,x24 // hi(a[0..3]*b[0]) - adcs x20,x20,x11 - mul x25,x19,x4 // t[0]*n0 - adcs x21,x21,x12 - umulh x11,x7,x24 - adcs x22,x22,x13 - umulh x12,x8,x24 - adc x23,xzr,xzr - umulh x13,x9,x24 - ldr x24,[x2,x28] // next b[i] (or b[0]) - adds x20,x20,x10 - // (*) mul x10,x14,x25 // lo(n[0..3]*t[0]*n0) - str x25,[x26],#8 // put aside t[0]*n0 for tail processing - adcs x21,x21,x11 - mul x11,x15,x25 - adcs x22,x22,x12 - mul x12,x16,x25 - adc x23,x23,x13 // can't overflow - mul x13,x17,x25 - // (*) adds xzr,x19,x10 - subs xzr,x19,#1 // (*) - umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0) - adcs x19,x20,x11 - umulh x11,x15,x25 - adcs x20,x21,x12 - umulh x12,x16,x25 - adcs x21,x22,x13 - umulh x13,x17,x25 - adcs x22,x23,x0 - adc x0,xzr,xzr - adds x19,x19,x10 - sub x10,x27,x1 - adcs x20,x20,x11 - adcs x21,x21,x12 - adcs x22,x22,x13 - //adc x0,x0,xzr - cbnz x28,.Loop_mul4x_1st_reduction - - cbz x10,.Lmul4x4_post_condition - - ldp x6,x7,[x1,#8*0] // a[4..7] - ldp x8,x9,[x1,#8*2] - add x1,x1,#8*4 - ldr x25,[sp] // a[0]*n0 - ldp x14,x15,[x3,#8*0] // n[4..7] - ldp x16,x17,[x3,#8*2] - add x3,x3,#8*4 - -.Loop_mul4x_1st_tail: - mul x10,x6,x24 // lo(a[4..7]*b[i]) - adc x0,x0,xzr // 
modulo-scheduled - mul x11,x7,x24 - add x28,x28,#8 - mul x12,x8,x24 - and x28,x28,#31 - mul x13,x9,x24 - adds x19,x19,x10 - umulh x10,x6,x24 // hi(a[4..7]*b[i]) - adcs x20,x20,x11 - umulh x11,x7,x24 - adcs x21,x21,x12 - umulh x12,x8,x24 - adcs x22,x22,x13 - umulh x13,x9,x24 - adc x23,xzr,xzr - ldr x24,[x2,x28] // next b[i] (or b[0]) - adds x20,x20,x10 - mul x10,x14,x25 // lo(n[4..7]*a[0]*n0) - adcs x21,x21,x11 - mul x11,x15,x25 - adcs x22,x22,x12 - mul x12,x16,x25 - adc x23,x23,x13 // can't overflow - mul x13,x17,x25 - adds x19,x19,x10 - umulh x10,x14,x25 // hi(n[4..7]*a[0]*n0) - adcs x20,x20,x11 - umulh x11,x15,x25 - adcs x21,x21,x12 - umulh x12,x16,x25 - adcs x22,x22,x13 - adcs x23,x23,x0 - umulh x13,x17,x25 - adc x0,xzr,xzr - ldr x25,[sp,x28] // next t[0]*n0 - str x19,[x26],#8 // result!!! - adds x19,x20,x10 - sub x10,x27,x1 // done yet? - adcs x20,x21,x11 - adcs x21,x22,x12 - adcs x22,x23,x13 - //adc x0,x0,xzr - cbnz x28,.Loop_mul4x_1st_tail - - sub x11,x27,x5 // rewinded x1 - cbz x10,.Lmul4x_proceed - - ldp x6,x7,[x1,#8*0] - ldp x8,x9,[x1,#8*2] - add x1,x1,#8*4 - ldp x14,x15,[x3,#8*0] - ldp x16,x17,[x3,#8*2] - add x3,x3,#8*4 - b .Loop_mul4x_1st_tail - -.align 5 -.Lmul4x_proceed: - ldr x24,[x2,#8*4]! // *++b - adc x30,x0,xzr - ldp x6,x7,[x11,#8*0] // a[0..3] - sub x3,x3,x5 // rewind np - ldp x8,x9,[x11,#8*2] - add x1,x11,#8*4 - - stp x19,x20,[x26,#8*0] // result!!! - ldp x19,x20,[sp,#8*4] // t[0..3] - stp x21,x22,[x26,#8*2] // result!!! 
- ldp x21,x22,[sp,#8*6] - - ldp x14,x15,[x3,#8*0] // n[0..3] - mov x26,sp - ldp x16,x17,[x3,#8*2] - adds x3,x3,#8*4 // clear carry bit - mov x0,xzr - -.align 4 -.Loop_mul4x_reduction: - mul x10,x6,x24 // lo(a[0..3]*b[4]) - adc x0,x0,xzr // modulo-scheduled - mul x11,x7,x24 - add x28,x28,#8 - mul x12,x8,x24 - and x28,x28,#31 - mul x13,x9,x24 - adds x19,x19,x10 - umulh x10,x6,x24 // hi(a[0..3]*b[4]) - adcs x20,x20,x11 - mul x25,x19,x4 // t[0]*n0 - adcs x21,x21,x12 - umulh x11,x7,x24 - adcs x22,x22,x13 - umulh x12,x8,x24 - adc x23,xzr,xzr - umulh x13,x9,x24 - ldr x24,[x2,x28] // next b[i] - adds x20,x20,x10 - // (*) mul x10,x14,x25 - str x25,[x26],#8 // put aside t[0]*n0 for tail processing - adcs x21,x21,x11 - mul x11,x15,x25 // lo(n[0..3]*t[0]*n0 - adcs x22,x22,x12 - mul x12,x16,x25 - adc x23,x23,x13 // can't overflow - mul x13,x17,x25 - // (*) adds xzr,x19,x10 - subs xzr,x19,#1 // (*) - umulh x10,x14,x25 // hi(n[0..3]*t[0]*n0 - adcs x19,x20,x11 - umulh x11,x15,x25 - adcs x20,x21,x12 - umulh x12,x16,x25 - adcs x21,x22,x13 - umulh x13,x17,x25 - adcs x22,x23,x0 - adc x0,xzr,xzr - adds x19,x19,x10 - adcs x20,x20,x11 - adcs x21,x21,x12 - adcs x22,x22,x13 - //adc x0,x0,xzr - cbnz x28,.Loop_mul4x_reduction - - adc x0,x0,xzr - ldp x10,x11,[x26,#8*4] // t[4..7] - ldp x12,x13,[x26,#8*6] - ldp x6,x7,[x1,#8*0] // a[4..7] - ldp x8,x9,[x1,#8*2] - add x1,x1,#8*4 - adds x19,x19,x10 - adcs x20,x20,x11 - adcs x21,x21,x12 - adcs x22,x22,x13 - //adc x0,x0,xzr - - ldr x25,[sp] // t[0]*n0 - ldp x14,x15,[x3,#8*0] // n[4..7] - ldp x16,x17,[x3,#8*2] - add x3,x3,#8*4 - -.align 4 -.Loop_mul4x_tail: - mul x10,x6,x24 // lo(a[4..7]*b[4]) - adc x0,x0,xzr // modulo-scheduled - mul x11,x7,x24 - add x28,x28,#8 - mul x12,x8,x24 - and x28,x28,#31 - mul x13,x9,x24 - adds x19,x19,x10 - umulh x10,x6,x24 // hi(a[4..7]*b[4]) - adcs x20,x20,x11 - umulh x11,x7,x24 - adcs x21,x21,x12 - umulh x12,x8,x24 - adcs x22,x22,x13 - umulh x13,x9,x24 - adc x23,xzr,xzr - ldr x24,[x2,x28] // next b[i] - adds x20,x20,x10 
- mul x10,x14,x25 // lo(n[4..7]*t[0]*n0) - adcs x21,x21,x11 - mul x11,x15,x25 - adcs x22,x22,x12 - mul x12,x16,x25 - adc x23,x23,x13 // can't overflow - mul x13,x17,x25 - adds x19,x19,x10 - umulh x10,x14,x25 // hi(n[4..7]*t[0]*n0) - adcs x20,x20,x11 - umulh x11,x15,x25 - adcs x21,x21,x12 - umulh x12,x16,x25 - adcs x22,x22,x13 - umulh x13,x17,x25 - adcs x23,x23,x0 - ldr x25,[sp,x28] // next a[0]*n0 - adc x0,xzr,xzr - str x19,[x26],#8 // result!!! - adds x19,x20,x10 - sub x10,x27,x1 // done yet? - adcs x20,x21,x11 - adcs x21,x22,x12 - adcs x22,x23,x13 - //adc x0,x0,xzr - cbnz x28,.Loop_mul4x_tail - - sub x11,x3,x5 // rewinded np? - adc x0,x0,xzr - cbz x10,.Loop_mul4x_break - - ldp x10,x11,[x26,#8*4] - ldp x12,x13,[x26,#8*6] - ldp x6,x7,[x1,#8*0] - ldp x8,x9,[x1,#8*2] - add x1,x1,#8*4 - adds x19,x19,x10 - adcs x20,x20,x11 - adcs x21,x21,x12 - adcs x22,x22,x13 - //adc x0,x0,xzr - ldp x14,x15,[x3,#8*0] - ldp x16,x17,[x3,#8*2] - add x3,x3,#8*4 - b .Loop_mul4x_tail - -.align 4 -.Loop_mul4x_break: - ldp x12,x13,[x29,#96] // pull rp and &b[num] - adds x19,x19,x30 - add x2,x2,#8*4 // bp++ - adcs x20,x20,xzr - sub x1,x1,x5 // rewind ap - adcs x21,x21,xzr - stp x19,x20,[x26,#8*0] // result!!! - adcs x22,x22,xzr - ldp x19,x20,[sp,#8*4] // t[0..3] - adc x30,x0,xzr - stp x21,x22,[x26,#8*2] // result!!! - cmp x2,x13 // done yet? - ldp x21,x22,[sp,#8*6] - ldp x14,x15,[x11,#8*0] // n[0..3] - ldp x16,x17,[x11,#8*2] - add x3,x11,#8*4 - b.eq .Lmul4x_post - - ldr x24,[x2] - ldp x6,x7,[x1,#8*0] // a[0..3] - ldp x8,x9,[x1,#8*2] - adds x1,x1,#8*4 // clear carry bit - mov x0,xzr - mov x26,sp - b .Loop_mul4x_reduction - -.align 4 -.Lmul4x_post: - // Final step. We see if result is larger than modulus, and - // if it is, subtract the modulus. But comparison implies - // subtraction. So we subtract modulus, see if it borrowed, - // and conditionally copy original value. 
- mov x0,x12 - mov x27,x12 // x0 copy - subs x10,x19,x14 - add x26,sp,#8*8 - sbcs x11,x20,x15 - sub x28,x5,#8*4 - -.Lmul4x_sub: - sbcs x12,x21,x16 - ldp x14,x15,[x3,#8*0] - sub x28,x28,#8*4 - ldp x19,x20,[x26,#8*0] - sbcs x13,x22,x17 - ldp x16,x17,[x3,#8*2] - add x3,x3,#8*4 - ldp x21,x22,[x26,#8*2] - add x26,x26,#8*4 - stp x10,x11,[x0,#8*0] - sbcs x10,x19,x14 - stp x12,x13,[x0,#8*2] - add x0,x0,#8*4 - sbcs x11,x20,x15 - cbnz x28,.Lmul4x_sub - - sbcs x12,x21,x16 - mov x26,sp - add x1,sp,#8*4 - ldp x6,x7,[x27,#8*0] - sbcs x13,x22,x17 - stp x10,x11,[x0,#8*0] - ldp x8,x9,[x27,#8*2] - stp x12,x13,[x0,#8*2] - ldp x19,x20,[x1,#8*0] - ldp x21,x22,[x1,#8*2] - sbcs xzr,x30,xzr // did it borrow? - ldr x30,[x29,#8] // pull return address - - sub x28,x5,#8*4 -.Lmul4x_cond_copy: - sub x28,x28,#8*4 - csel x10,x19,x6,lo - stp xzr,xzr,[x26,#8*0] - csel x11,x20,x7,lo - ldp x6,x7,[x27,#8*4] - ldp x19,x20,[x1,#8*4] - csel x12,x21,x8,lo - stp xzr,xzr,[x26,#8*2] - add x26,x26,#8*4 - csel x13,x22,x9,lo - ldp x8,x9,[x27,#8*6] - ldp x21,x22,[x1,#8*6] - add x1,x1,#8*4 - stp x10,x11,[x27,#8*0] - stp x12,x13,[x27,#8*2] - add x27,x27,#8*4 - cbnz x28,.Lmul4x_cond_copy - - csel x10,x19,x6,lo - stp xzr,xzr,[x26,#8*0] - csel x11,x20,x7,lo - stp xzr,xzr,[x26,#8*2] - csel x12,x21,x8,lo - stp xzr,xzr,[x26,#8*3] - csel x13,x22,x9,lo - stp xzr,xzr,[x26,#8*4] - stp x10,x11,[x27,#8*0] - stp x12,x13,[x27,#8*2] - - b .Lmul4x_done - -.align 4 -.Lmul4x4_post_condition: - adc x0,x0,xzr - ldr x1,[x29,#96] // pull rp - // x19-3,x0 hold result, x14-7 hold modulus - subs x6,x19,x14 - ldr x30,[x29,#8] // pull return address - sbcs x7,x20,x15 - stp xzr,xzr,[sp,#8*0] - sbcs x8,x21,x16 - stp xzr,xzr,[sp,#8*2] - sbcs x9,x22,x17 - stp xzr,xzr,[sp,#8*4] - sbcs xzr,x0,xzr // did it borrow? 
- stp xzr,xzr,[sp,#8*6] - - // x6-3 hold result-modulus - csel x6,x19,x6,lo - csel x7,x20,x7,lo - csel x8,x21,x8,lo - csel x9,x22,x9,lo - stp x6,x7,[x1,#8*0] - stp x8,x9,[x1,#8*2] - -.Lmul4x_done: - ldp x19,x20,[x29,#16] - mov sp,x29 - ldp x21,x22,[x29,#32] - mov x0,#1 - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldr x29,[sp],#128 - // x30 is popped earlier - AARCH64_VALIDATE_LINK_REGISTER - ret -.size __bn_mul4x_mont,.-__bn_mul4x_mont -.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 -.align 4 -#endif -#endif // !OPENSSL_NO_ASM -.section .note.GNU-stack,"",%progbits diff --git a/third_party/boringssl/linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S b/third_party/boringssl/linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S deleted file mode 100644 index 098967b5..00000000 --- a/third_party/boringssl/linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S +++ /dev/null @@ -1,346 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(__aarch64__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -#include - -.text - -.globl gcm_init_neon -.hidden gcm_init_neon -.type gcm_init_neon,%function -.align 4 -gcm_init_neon: - AARCH64_VALID_CALL_TARGET - // This function is adapted from gcm_init_v8. xC2 is t3. 
- ld1 {v17.2d}, [x1] // load H - movi v19.16b, #0xe1 - shl v19.2d, v19.2d, #57 // 0xc2.0 - ext v3.16b, v17.16b, v17.16b, #8 - ushr v18.2d, v19.2d, #63 - dup v17.4s, v17.s[1] - ext v16.16b, v18.16b, v19.16b, #8 // t0=0xc2....01 - ushr v18.2d, v3.2d, #63 - sshr v17.4s, v17.4s, #31 // broadcast carry bit - and v18.16b, v18.16b, v16.16b - shl v3.2d, v3.2d, #1 - ext v18.16b, v18.16b, v18.16b, #8 - and v16.16b, v16.16b, v17.16b - orr v3.16b, v3.16b, v18.16b // H<<<=1 - eor v5.16b, v3.16b, v16.16b // twisted H - st1 {v5.2d}, [x0] // store Htable[0] - ret -.size gcm_init_neon,.-gcm_init_neon - -.globl gcm_gmult_neon -.hidden gcm_gmult_neon -.type gcm_gmult_neon,%function -.align 4 -gcm_gmult_neon: - AARCH64_VALID_CALL_TARGET - ld1 {v3.16b}, [x0] // load Xi - ld1 {v5.1d}, [x1], #8 // load twisted H - ld1 {v6.1d}, [x1] - adrp x9, .Lmasks // load constants - add x9, x9, :lo12:.Lmasks - ld1 {v24.2d, v25.2d}, [x9] - rev64 v3.16b, v3.16b // byteswap Xi - ext v3.16b, v3.16b, v3.16b, #8 - eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing - - mov x3, #16 - b .Lgmult_neon -.size gcm_gmult_neon,.-gcm_gmult_neon - -.globl gcm_ghash_neon -.hidden gcm_ghash_neon -.type gcm_ghash_neon,%function -.align 4 -gcm_ghash_neon: - AARCH64_VALID_CALL_TARGET - ld1 {v0.16b}, [x0] // load Xi - ld1 {v5.1d}, [x1], #8 // load twisted H - ld1 {v6.1d}, [x1] - adrp x9, .Lmasks // load constants - add x9, x9, :lo12:.Lmasks - ld1 {v24.2d, v25.2d}, [x9] - rev64 v0.16b, v0.16b // byteswap Xi - ext v0.16b, v0.16b, v0.16b, #8 - eor v7.8b, v5.8b, v6.8b // Karatsuba pre-processing - -.Loop_neon: - ld1 {v3.16b}, [x2], #16 // load inp - rev64 v3.16b, v3.16b // byteswap inp - ext v3.16b, v3.16b, v3.16b, #8 - eor v3.16b, v3.16b, v0.16b // inp ^= Xi - -.Lgmult_neon: - // Split the input into v3 and v4. (The upper halves are unused, - // so it is okay to leave them alone.) 
- ins v4.d[0], v3.d[1] - ext v16.8b, v5.8b, v5.8b, #1 // A1 - pmull v16.8h, v16.8b, v3.8b // F = A1*B - ext v0.8b, v3.8b, v3.8b, #1 // B1 - pmull v0.8h, v5.8b, v0.8b // E = A*B1 - ext v17.8b, v5.8b, v5.8b, #2 // A2 - pmull v17.8h, v17.8b, v3.8b // H = A2*B - ext v19.8b, v3.8b, v3.8b, #2 // B2 - pmull v19.8h, v5.8b, v19.8b // G = A*B2 - ext v18.8b, v5.8b, v5.8b, #3 // A3 - eor v16.16b, v16.16b, v0.16b // L = E + F - pmull v18.8h, v18.8b, v3.8b // J = A3*B - ext v0.8b, v3.8b, v3.8b, #3 // B3 - eor v17.16b, v17.16b, v19.16b // M = G + H - pmull v0.8h, v5.8b, v0.8b // I = A*B3 - - // Here we diverge from the 32-bit version. It computes the following - // (instructions reordered for clarity): - // - // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) - // vand $t0#hi, $t0#hi, $k48 - // veor $t0#lo, $t0#lo, $t0#hi - // - // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) - // vand $t1#hi, $t1#hi, $k32 - // veor $t1#lo, $t1#lo, $t1#hi - // - // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) - // vand $t2#hi, $t2#hi, $k16 - // veor $t2#lo, $t2#lo, $t2#hi - // - // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) - // vmov.i64 $t3#hi, #0 - // - // $kN is a mask with the bottom N bits set. AArch64 cannot compute on - // upper halves of SIMD registers, so we must split each half into - // separate registers. To compensate, we pair computations up and - // parallelize. - - ext v19.8b, v3.8b, v3.8b, #4 // B4 - eor v18.16b, v18.16b, v0.16b // N = I + J - pmull v19.8h, v5.8b, v19.8b // K = A*B4 - - // This can probably be scheduled more efficiently. For now, we just - // pair up independent instructions. 
- zip1 v20.2d, v16.2d, v17.2d - zip1 v22.2d, v18.2d, v19.2d - zip2 v21.2d, v16.2d, v17.2d - zip2 v23.2d, v18.2d, v19.2d - eor v20.16b, v20.16b, v21.16b - eor v22.16b, v22.16b, v23.16b - and v21.16b, v21.16b, v24.16b - and v23.16b, v23.16b, v25.16b - eor v20.16b, v20.16b, v21.16b - eor v22.16b, v22.16b, v23.16b - zip1 v16.2d, v20.2d, v21.2d - zip1 v18.2d, v22.2d, v23.2d - zip2 v17.2d, v20.2d, v21.2d - zip2 v19.2d, v22.2d, v23.2d - - ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 - ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 - pmull v0.8h, v5.8b, v3.8b // D = A*B - ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 - ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 - eor v16.16b, v16.16b, v17.16b - eor v18.16b, v18.16b, v19.16b - eor v0.16b, v0.16b, v16.16b - eor v0.16b, v0.16b, v18.16b - eor v3.8b, v3.8b, v4.8b // Karatsuba pre-processing - ext v16.8b, v7.8b, v7.8b, #1 // A1 - pmull v16.8h, v16.8b, v3.8b // F = A1*B - ext v1.8b, v3.8b, v3.8b, #1 // B1 - pmull v1.8h, v7.8b, v1.8b // E = A*B1 - ext v17.8b, v7.8b, v7.8b, #2 // A2 - pmull v17.8h, v17.8b, v3.8b // H = A2*B - ext v19.8b, v3.8b, v3.8b, #2 // B2 - pmull v19.8h, v7.8b, v19.8b // G = A*B2 - ext v18.8b, v7.8b, v7.8b, #3 // A3 - eor v16.16b, v16.16b, v1.16b // L = E + F - pmull v18.8h, v18.8b, v3.8b // J = A3*B - ext v1.8b, v3.8b, v3.8b, #3 // B3 - eor v17.16b, v17.16b, v19.16b // M = G + H - pmull v1.8h, v7.8b, v1.8b // I = A*B3 - - // Here we diverge from the 32-bit version. 
It computes the following - // (instructions reordered for clarity): - // - // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) - // vand $t0#hi, $t0#hi, $k48 - // veor $t0#lo, $t0#lo, $t0#hi - // - // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) - // vand $t1#hi, $t1#hi, $k32 - // veor $t1#lo, $t1#lo, $t1#hi - // - // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) - // vand $t2#hi, $t2#hi, $k16 - // veor $t2#lo, $t2#lo, $t2#hi - // - // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) - // vmov.i64 $t3#hi, #0 - // - // $kN is a mask with the bottom N bits set. AArch64 cannot compute on - // upper halves of SIMD registers, so we must split each half into - // separate registers. To compensate, we pair computations up and - // parallelize. - - ext v19.8b, v3.8b, v3.8b, #4 // B4 - eor v18.16b, v18.16b, v1.16b // N = I + J - pmull v19.8h, v7.8b, v19.8b // K = A*B4 - - // This can probably be scheduled more efficiently. For now, we just - // pair up independent instructions. - zip1 v20.2d, v16.2d, v17.2d - zip1 v22.2d, v18.2d, v19.2d - zip2 v21.2d, v16.2d, v17.2d - zip2 v23.2d, v18.2d, v19.2d - eor v20.16b, v20.16b, v21.16b - eor v22.16b, v22.16b, v23.16b - and v21.16b, v21.16b, v24.16b - and v23.16b, v23.16b, v25.16b - eor v20.16b, v20.16b, v21.16b - eor v22.16b, v22.16b, v23.16b - zip1 v16.2d, v20.2d, v21.2d - zip1 v18.2d, v22.2d, v23.2d - zip2 v17.2d, v20.2d, v21.2d - zip2 v19.2d, v22.2d, v23.2d - - ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 - ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 - pmull v1.8h, v7.8b, v3.8b // D = A*B - ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 - ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 - eor v16.16b, v16.16b, v17.16b - eor v18.16b, v18.16b, v19.16b - eor v1.16b, v1.16b, v16.16b - eor v1.16b, v1.16b, v18.16b - ext v16.8b, v6.8b, v6.8b, #1 // A1 - pmull v16.8h, v16.8b, v4.8b // F = A1*B - ext v2.8b, v4.8b, v4.8b, #1 // B1 - pmull v2.8h, v6.8b, v2.8b // E = A*B1 - ext v17.8b, v6.8b, v6.8b, #2 // A2 - pmull 
v17.8h, v17.8b, v4.8b // H = A2*B - ext v19.8b, v4.8b, v4.8b, #2 // B2 - pmull v19.8h, v6.8b, v19.8b // G = A*B2 - ext v18.8b, v6.8b, v6.8b, #3 // A3 - eor v16.16b, v16.16b, v2.16b // L = E + F - pmull v18.8h, v18.8b, v4.8b // J = A3*B - ext v2.8b, v4.8b, v4.8b, #3 // B3 - eor v17.16b, v17.16b, v19.16b // M = G + H - pmull v2.8h, v6.8b, v2.8b // I = A*B3 - - // Here we diverge from the 32-bit version. It computes the following - // (instructions reordered for clarity): - // - // veor $t0#lo, $t0#lo, $t0#hi @ t0 = P0 + P1 (L) - // vand $t0#hi, $t0#hi, $k48 - // veor $t0#lo, $t0#lo, $t0#hi - // - // veor $t1#lo, $t1#lo, $t1#hi @ t1 = P2 + P3 (M) - // vand $t1#hi, $t1#hi, $k32 - // veor $t1#lo, $t1#lo, $t1#hi - // - // veor $t2#lo, $t2#lo, $t2#hi @ t2 = P4 + P5 (N) - // vand $t2#hi, $t2#hi, $k16 - // veor $t2#lo, $t2#lo, $t2#hi - // - // veor $t3#lo, $t3#lo, $t3#hi @ t3 = P6 + P7 (K) - // vmov.i64 $t3#hi, #0 - // - // $kN is a mask with the bottom N bits set. AArch64 cannot compute on - // upper halves of SIMD registers, so we must split each half into - // separate registers. To compensate, we pair computations up and - // parallelize. - - ext v19.8b, v4.8b, v4.8b, #4 // B4 - eor v18.16b, v18.16b, v2.16b // N = I + J - pmull v19.8h, v6.8b, v19.8b // K = A*B4 - - // This can probably be scheduled more efficiently. For now, we just - // pair up independent instructions. 
- zip1 v20.2d, v16.2d, v17.2d - zip1 v22.2d, v18.2d, v19.2d - zip2 v21.2d, v16.2d, v17.2d - zip2 v23.2d, v18.2d, v19.2d - eor v20.16b, v20.16b, v21.16b - eor v22.16b, v22.16b, v23.16b - and v21.16b, v21.16b, v24.16b - and v23.16b, v23.16b, v25.16b - eor v20.16b, v20.16b, v21.16b - eor v22.16b, v22.16b, v23.16b - zip1 v16.2d, v20.2d, v21.2d - zip1 v18.2d, v22.2d, v23.2d - zip2 v17.2d, v20.2d, v21.2d - zip2 v19.2d, v22.2d, v23.2d - - ext v16.16b, v16.16b, v16.16b, #15 // t0 = t0 << 8 - ext v17.16b, v17.16b, v17.16b, #14 // t1 = t1 << 16 - pmull v2.8h, v6.8b, v4.8b // D = A*B - ext v19.16b, v19.16b, v19.16b, #12 // t3 = t3 << 32 - ext v18.16b, v18.16b, v18.16b, #13 // t2 = t2 << 24 - eor v16.16b, v16.16b, v17.16b - eor v18.16b, v18.16b, v19.16b - eor v2.16b, v2.16b, v16.16b - eor v2.16b, v2.16b, v18.16b - ext v16.16b, v0.16b, v2.16b, #8 - eor v1.16b, v1.16b, v0.16b // Karatsuba post-processing - eor v1.16b, v1.16b, v2.16b - eor v1.16b, v1.16b, v16.16b // Xm overlaps Xh.lo and Xl.hi - ins v0.d[1], v1.d[0] // Xh|Xl - 256-bit result - // This is a no-op due to the ins instruction below. - // ins v2.d[0], v1.d[1] - - // equivalent of reduction_avx from ghash-x86_64.pl - shl v17.2d, v0.2d, #57 // 1st phase - shl v18.2d, v0.2d, #62 - eor v18.16b, v18.16b, v17.16b // - shl v17.2d, v0.2d, #63 - eor v18.16b, v18.16b, v17.16b // - // Note Xm contains {Xl.d[1], Xh.d[0]}. 
- eor v18.16b, v18.16b, v1.16b - ins v0.d[1], v18.d[0] // Xl.d[1] ^= t2.d[0] - ins v2.d[0], v18.d[1] // Xh.d[0] ^= t2.d[1] - - ushr v18.2d, v0.2d, #1 // 2nd phase - eor v2.16b, v2.16b,v0.16b - eor v0.16b, v0.16b,v18.16b // - ushr v18.2d, v18.2d, #6 - ushr v0.2d, v0.2d, #1 // - eor v0.16b, v0.16b, v2.16b // - eor v0.16b, v0.16b, v18.16b // - - subs x3, x3, #16 - bne .Loop_neon - - rev64 v0.16b, v0.16b // byteswap Xi and write - ext v0.16b, v0.16b, v0.16b, #8 - st1 {v0.16b}, [x0] - - ret -.size gcm_ghash_neon,.-gcm_ghash_neon - -.section .rodata -.align 4 -.Lmasks: -.quad 0x0000ffffffffffff // k48 -.quad 0x00000000ffffffff // k32 -.quad 0x000000000000ffff // k16 -.quad 0x0000000000000000 // k0 -.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,100,101,114,105,118,101,100,32,102,114,111,109,32,65,82,77,118,52,32,118,101,114,115,105,111,110,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 -.align 2 -#endif -#endif // !OPENSSL_NO_ASM -.section .note.GNU-stack,"",%progbits diff --git a/third_party/boringssl/linux-aarch64/crypto/fipsmodule/ghashv8-armx64.S b/third_party/boringssl/linux-aarch64/crypto/fipsmodule/ghashv8-armx64.S deleted file mode 100644 index 4544cee0..00000000 --- a/third_party/boringssl/linux-aarch64/crypto/fipsmodule/ghashv8-armx64.S +++ /dev/null @@ -1,576 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(__aarch64__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -#include - -#if __ARM_MAX_ARCH__>=7 -.text -.arch armv8-a+crypto -.globl gcm_init_v8 -.hidden gcm_init_v8 -.type gcm_init_v8,%function -.align 4 -gcm_init_v8: - AARCH64_VALID_CALL_TARGET - ld1 {v17.2d},[x1] //load input H - movi v19.16b,#0xe1 - shl v19.2d,v19.2d,#57 //0xc2.0 - ext v3.16b,v17.16b,v17.16b,#8 - ushr v18.2d,v19.2d,#63 - dup v17.4s,v17.s[1] - ext v16.16b,v18.16b,v19.16b,#8 //t0=0xc2....01 - ushr v18.2d,v3.2d,#63 - sshr v17.4s,v17.4s,#31 //broadcast carry bit - and v18.16b,v18.16b,v16.16b - shl v3.2d,v3.2d,#1 - ext v18.16b,v18.16b,v18.16b,#8 - and v16.16b,v16.16b,v17.16b - orr v3.16b,v3.16b,v18.16b //H<<<=1 - eor v20.16b,v3.16b,v16.16b //twisted H - st1 {v20.2d},[x0],#16 //store Htable[0] - - //calculate H^2 - ext v16.16b,v20.16b,v20.16b,#8 //Karatsuba pre-processing - pmull v0.1q,v20.1d,v20.1d - eor v16.16b,v16.16b,v20.16b - pmull2 v2.1q,v20.2d,v20.2d - pmull v1.1q,v16.1d,v16.1d - - ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing - eor v18.16b,v0.16b,v2.16b - eor v1.16b,v1.16b,v17.16b - eor v1.16b,v1.16b,v18.16b - pmull v18.1q,v0.1d,v19.1d //1st phase - - ins v2.d[0],v1.d[1] - ins v1.d[1],v0.d[0] - eor v0.16b,v1.16b,v18.16b - - ext v18.16b,v0.16b,v0.16b,#8 //2nd phase - pmull v0.1q,v0.1d,v19.1d - eor v18.16b,v18.16b,v2.16b - eor v22.16b,v0.16b,v18.16b - - ext v17.16b,v22.16b,v22.16b,#8 //Karatsuba pre-processing - eor v17.16b,v17.16b,v22.16b - ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed - st1 {v21.2d,v22.2d},[x0],#32 //store Htable[1..2] - //calculate H^3 and H^4 - pmull v0.1q,v20.1d, v22.1d - pmull v5.1q,v22.1d,v22.1d - pmull2 v2.1q,v20.2d, v22.2d - pmull2 v7.1q,v22.2d,v22.2d - pmull v1.1q,v16.1d,v17.1d - pmull v6.1q,v17.1d,v17.1d - - ext 
v16.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing - ext v17.16b,v5.16b,v7.16b,#8 - eor v18.16b,v0.16b,v2.16b - eor v1.16b,v1.16b,v16.16b - eor v4.16b,v5.16b,v7.16b - eor v6.16b,v6.16b,v17.16b - eor v1.16b,v1.16b,v18.16b - pmull v18.1q,v0.1d,v19.1d //1st phase - eor v6.16b,v6.16b,v4.16b - pmull v4.1q,v5.1d,v19.1d - - ins v2.d[0],v1.d[1] - ins v7.d[0],v6.d[1] - ins v1.d[1],v0.d[0] - ins v6.d[1],v5.d[0] - eor v0.16b,v1.16b,v18.16b - eor v5.16b,v6.16b,v4.16b - - ext v18.16b,v0.16b,v0.16b,#8 //2nd phase - ext v4.16b,v5.16b,v5.16b,#8 - pmull v0.1q,v0.1d,v19.1d - pmull v5.1q,v5.1d,v19.1d - eor v18.16b,v18.16b,v2.16b - eor v4.16b,v4.16b,v7.16b - eor v20.16b, v0.16b,v18.16b //H^3 - eor v22.16b,v5.16b,v4.16b //H^4 - - ext v16.16b,v20.16b, v20.16b,#8 //Karatsuba pre-processing - ext v17.16b,v22.16b,v22.16b,#8 - eor v16.16b,v16.16b,v20.16b - eor v17.16b,v17.16b,v22.16b - ext v21.16b,v16.16b,v17.16b,#8 //pack Karatsuba pre-processed - st1 {v20.2d,v21.2d,v22.2d},[x0] //store Htable[3..5] - ret -.size gcm_init_v8,.-gcm_init_v8 -.globl gcm_gmult_v8 -.hidden gcm_gmult_v8 -.type gcm_gmult_v8,%function -.align 4 -gcm_gmult_v8: - AARCH64_VALID_CALL_TARGET - ld1 {v17.2d},[x0] //load Xi - movi v19.16b,#0xe1 - ld1 {v20.2d,v21.2d},[x1] //load twisted H, ... 
- shl v19.2d,v19.2d,#57 -#ifndef __AARCH64EB__ - rev64 v17.16b,v17.16b -#endif - ext v3.16b,v17.16b,v17.16b,#8 - - pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo - eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing - pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi - pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) - - ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing - eor v18.16b,v0.16b,v2.16b - eor v1.16b,v1.16b,v17.16b - eor v1.16b,v1.16b,v18.16b - pmull v18.1q,v0.1d,v19.1d //1st phase of reduction - - ins v2.d[0],v1.d[1] - ins v1.d[1],v0.d[0] - eor v0.16b,v1.16b,v18.16b - - ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction - pmull v0.1q,v0.1d,v19.1d - eor v18.16b,v18.16b,v2.16b - eor v0.16b,v0.16b,v18.16b - -#ifndef __AARCH64EB__ - rev64 v0.16b,v0.16b -#endif - ext v0.16b,v0.16b,v0.16b,#8 - st1 {v0.2d},[x0] //write out Xi - - ret -.size gcm_gmult_v8,.-gcm_gmult_v8 -.globl gcm_ghash_v8 -.hidden gcm_ghash_v8 -.type gcm_ghash_v8,%function -.align 4 -gcm_ghash_v8: - AARCH64_VALID_CALL_TARGET - cmp x3,#64 - b.hs .Lgcm_ghash_v8_4x - ld1 {v0.2d},[x0] //load [rotated] Xi - //"[rotated]" means that - //loaded value would have - //to be rotated in order to - //make it appear as in - //algorithm specification - subs x3,x3,#32 //see if x3 is 32 or larger - mov x12,#16 //x12 is used as post- - //increment for input pointer; - //as loop is modulo-scheduled - //x12 is zeroed just in time - //to preclude overstepping - //inp[len], which means that - //last block[s] are actually - //loaded twice, but last - //copy is not processed - ld1 {v20.2d,v21.2d},[x1],#32 //load twisted H, ..., H^2 - movi v19.16b,#0xe1 - ld1 {v22.2d},[x1] - csel x12,xzr,x12,eq //is it time to zero x12? 
- ext v0.16b,v0.16b,v0.16b,#8 //rotate Xi - ld1 {v16.2d},[x2],#16 //load [rotated] I[0] - shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant -#ifndef __AARCH64EB__ - rev64 v16.16b,v16.16b - rev64 v0.16b,v0.16b -#endif - ext v3.16b,v16.16b,v16.16b,#8 //rotate I[0] - b.lo .Lodd_tail_v8 //x3 was less than 32 - ld1 {v17.2d},[x2],x12 //load [rotated] I[1] -#ifndef __AARCH64EB__ - rev64 v17.16b,v17.16b -#endif - ext v7.16b,v17.16b,v17.16b,#8 - eor v3.16b,v3.16b,v0.16b //I[i]^=Xi - pmull v4.1q,v20.1d,v7.1d //H·Ii+1 - eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing - pmull2 v6.1q,v20.2d,v7.2d - b .Loop_mod2x_v8 - -.align 4 -.Loop_mod2x_v8: - ext v18.16b,v3.16b,v3.16b,#8 - subs x3,x3,#32 //is there more data? - pmull v0.1q,v22.1d,v3.1d //H^2.lo·Xi.lo - csel x12,xzr,x12,lo //is it time to zero x12? - - pmull v5.1q,v21.1d,v17.1d - eor v18.16b,v18.16b,v3.16b //Karatsuba pre-processing - pmull2 v2.1q,v22.2d,v3.2d //H^2.hi·Xi.hi - eor v0.16b,v0.16b,v4.16b //accumulate - pmull2 v1.1q,v21.2d,v18.2d //(H^2.lo+H^2.hi)·(Xi.lo+Xi.hi) - ld1 {v16.2d},[x2],x12 //load [rotated] I[i+2] - - eor v2.16b,v2.16b,v6.16b - csel x12,xzr,x12,eq //is it time to zero x12? 
- eor v1.16b,v1.16b,v5.16b - - ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing - eor v18.16b,v0.16b,v2.16b - eor v1.16b,v1.16b,v17.16b - ld1 {v17.2d},[x2],x12 //load [rotated] I[i+3] -#ifndef __AARCH64EB__ - rev64 v16.16b,v16.16b -#endif - eor v1.16b,v1.16b,v18.16b - pmull v18.1q,v0.1d,v19.1d //1st phase of reduction - -#ifndef __AARCH64EB__ - rev64 v17.16b,v17.16b -#endif - ins v2.d[0],v1.d[1] - ins v1.d[1],v0.d[0] - ext v7.16b,v17.16b,v17.16b,#8 - ext v3.16b,v16.16b,v16.16b,#8 - eor v0.16b,v1.16b,v18.16b - pmull v4.1q,v20.1d,v7.1d //H·Ii+1 - eor v3.16b,v3.16b,v2.16b //accumulate v3.16b early - - ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction - pmull v0.1q,v0.1d,v19.1d - eor v3.16b,v3.16b,v18.16b - eor v17.16b,v17.16b,v7.16b //Karatsuba pre-processing - eor v3.16b,v3.16b,v0.16b - pmull2 v6.1q,v20.2d,v7.2d - b.hs .Loop_mod2x_v8 //there was at least 32 more bytes - - eor v2.16b,v2.16b,v18.16b - ext v3.16b,v16.16b,v16.16b,#8 //re-construct v3.16b - adds x3,x3,#32 //re-construct x3 - eor v0.16b,v0.16b,v2.16b //re-construct v0.16b - b.eq .Ldone_v8 //is x3 zero? 
-.Lodd_tail_v8: - ext v18.16b,v0.16b,v0.16b,#8 - eor v3.16b,v3.16b,v0.16b //inp^=Xi - eor v17.16b,v16.16b,v18.16b //v17.16b is rotated inp^Xi - - pmull v0.1q,v20.1d,v3.1d //H.lo·Xi.lo - eor v17.16b,v17.16b,v3.16b //Karatsuba pre-processing - pmull2 v2.1q,v20.2d,v3.2d //H.hi·Xi.hi - pmull v1.1q,v21.1d,v17.1d //(H.lo+H.hi)·(Xi.lo+Xi.hi) - - ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing - eor v18.16b,v0.16b,v2.16b - eor v1.16b,v1.16b,v17.16b - eor v1.16b,v1.16b,v18.16b - pmull v18.1q,v0.1d,v19.1d //1st phase of reduction - - ins v2.d[0],v1.d[1] - ins v1.d[1],v0.d[0] - eor v0.16b,v1.16b,v18.16b - - ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction - pmull v0.1q,v0.1d,v19.1d - eor v18.16b,v18.16b,v2.16b - eor v0.16b,v0.16b,v18.16b - -.Ldone_v8: -#ifndef __AARCH64EB__ - rev64 v0.16b,v0.16b -#endif - ext v0.16b,v0.16b,v0.16b,#8 - st1 {v0.2d},[x0] //write out Xi - - ret -.size gcm_ghash_v8,.-gcm_ghash_v8 -.type gcm_ghash_v8_4x,%function -.align 4 -gcm_ghash_v8_4x: -.Lgcm_ghash_v8_4x: - ld1 {v0.2d},[x0] //load [rotated] Xi - ld1 {v20.2d,v21.2d,v22.2d},[x1],#48 //load twisted H, ..., H^2 - movi v19.16b,#0xe1 - ld1 {v26.2d,v27.2d,v28.2d},[x1] //load twisted H^3, ..., H^4 - shl v19.2d,v19.2d,#57 //compose 0xc2.0 constant - - ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64 -#ifndef __AARCH64EB__ - rev64 v0.16b,v0.16b - rev64 v5.16b,v5.16b - rev64 v6.16b,v6.16b - rev64 v7.16b,v7.16b - rev64 v4.16b,v4.16b -#endif - ext v25.16b,v7.16b,v7.16b,#8 - ext v24.16b,v6.16b,v6.16b,#8 - ext v23.16b,v5.16b,v5.16b,#8 - - pmull v29.1q,v20.1d,v25.1d //H·Ii+3 - eor v7.16b,v7.16b,v25.16b - pmull2 v31.1q,v20.2d,v25.2d - pmull v30.1q,v21.1d,v7.1d - - pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2 - eor v6.16b,v6.16b,v24.16b - pmull2 v24.1q,v22.2d,v24.2d - pmull2 v6.1q,v21.2d,v6.2d - - eor v29.16b,v29.16b,v16.16b - eor v31.16b,v31.16b,v24.16b - eor v30.16b,v30.16b,v6.16b - - pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1 - eor v5.16b,v5.16b,v23.16b - pmull2 v23.1q,v26.2d,v23.2d - pmull v5.1q,v27.1d,v5.1d - - eor 
v29.16b,v29.16b,v7.16b - eor v31.16b,v31.16b,v23.16b - eor v30.16b,v30.16b,v5.16b - - subs x3,x3,#128 - b.lo .Ltail4x - - b .Loop4x - -.align 4 -.Loop4x: - eor v16.16b,v4.16b,v0.16b - ld1 {v4.2d,v5.2d,v6.2d,v7.2d},[x2],#64 - ext v3.16b,v16.16b,v16.16b,#8 -#ifndef __AARCH64EB__ - rev64 v5.16b,v5.16b - rev64 v6.16b,v6.16b - rev64 v7.16b,v7.16b - rev64 v4.16b,v4.16b -#endif - - pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii) - eor v16.16b,v16.16b,v3.16b - pmull2 v2.1q,v28.2d,v3.2d - ext v25.16b,v7.16b,v7.16b,#8 - pmull2 v1.1q,v27.2d,v16.2d - - eor v0.16b,v0.16b,v29.16b - eor v2.16b,v2.16b,v31.16b - ext v24.16b,v6.16b,v6.16b,#8 - eor v1.16b,v1.16b,v30.16b - ext v23.16b,v5.16b,v5.16b,#8 - - ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing - eor v18.16b,v0.16b,v2.16b - pmull v29.1q,v20.1d,v25.1d //H·Ii+3 - eor v7.16b,v7.16b,v25.16b - eor v1.16b,v1.16b,v17.16b - pmull2 v31.1q,v20.2d,v25.2d - eor v1.16b,v1.16b,v18.16b - pmull v30.1q,v21.1d,v7.1d - - pmull v18.1q,v0.1d,v19.1d //1st phase of reduction - ins v2.d[0],v1.d[1] - ins v1.d[1],v0.d[0] - pmull v16.1q,v22.1d,v24.1d //H^2·Ii+2 - eor v6.16b,v6.16b,v24.16b - pmull2 v24.1q,v22.2d,v24.2d - eor v0.16b,v1.16b,v18.16b - pmull2 v6.1q,v21.2d,v6.2d - - eor v29.16b,v29.16b,v16.16b - eor v31.16b,v31.16b,v24.16b - eor v30.16b,v30.16b,v6.16b - - ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction - pmull v0.1q,v0.1d,v19.1d - pmull v7.1q,v26.1d,v23.1d //H^3·Ii+1 - eor v5.16b,v5.16b,v23.16b - eor v18.16b,v18.16b,v2.16b - pmull2 v23.1q,v26.2d,v23.2d - pmull v5.1q,v27.1d,v5.1d - - eor v0.16b,v0.16b,v18.16b - eor v29.16b,v29.16b,v7.16b - eor v31.16b,v31.16b,v23.16b - ext v0.16b,v0.16b,v0.16b,#8 - eor v30.16b,v30.16b,v5.16b - - subs x3,x3,#64 - b.hs .Loop4x - -.Ltail4x: - eor v16.16b,v4.16b,v0.16b - ext v3.16b,v16.16b,v16.16b,#8 - - pmull v0.1q,v28.1d,v3.1d //H^4·(Xi+Ii) - eor v16.16b,v16.16b,v3.16b - pmull2 v2.1q,v28.2d,v3.2d - pmull2 v1.1q,v27.2d,v16.2d - - eor v0.16b,v0.16b,v29.16b - eor v2.16b,v2.16b,v31.16b - eor 
v1.16b,v1.16b,v30.16b - - adds x3,x3,#64 - b.eq .Ldone4x - - cmp x3,#32 - b.lo .Lone - b.eq .Ltwo -.Lthree: - ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing - eor v18.16b,v0.16b,v2.16b - eor v1.16b,v1.16b,v17.16b - ld1 {v4.2d,v5.2d,v6.2d},[x2] - eor v1.16b,v1.16b,v18.16b -#ifndef __AARCH64EB__ - rev64 v5.16b,v5.16b - rev64 v6.16b,v6.16b - rev64 v4.16b,v4.16b -#endif - - pmull v18.1q,v0.1d,v19.1d //1st phase of reduction - ins v2.d[0],v1.d[1] - ins v1.d[1],v0.d[0] - ext v24.16b,v6.16b,v6.16b,#8 - ext v23.16b,v5.16b,v5.16b,#8 - eor v0.16b,v1.16b,v18.16b - - pmull v29.1q,v20.1d,v24.1d //H·Ii+2 - eor v6.16b,v6.16b,v24.16b - - ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction - pmull v0.1q,v0.1d,v19.1d - eor v18.16b,v18.16b,v2.16b - pmull2 v31.1q,v20.2d,v24.2d - pmull v30.1q,v21.1d,v6.1d - eor v0.16b,v0.16b,v18.16b - pmull v7.1q,v22.1d,v23.1d //H^2·Ii+1 - eor v5.16b,v5.16b,v23.16b - ext v0.16b,v0.16b,v0.16b,#8 - - pmull2 v23.1q,v22.2d,v23.2d - eor v16.16b,v4.16b,v0.16b - pmull2 v5.1q,v21.2d,v5.2d - ext v3.16b,v16.16b,v16.16b,#8 - - eor v29.16b,v29.16b,v7.16b - eor v31.16b,v31.16b,v23.16b - eor v30.16b,v30.16b,v5.16b - - pmull v0.1q,v26.1d,v3.1d //H^3·(Xi+Ii) - eor v16.16b,v16.16b,v3.16b - pmull2 v2.1q,v26.2d,v3.2d - pmull v1.1q,v27.1d,v16.1d - - eor v0.16b,v0.16b,v29.16b - eor v2.16b,v2.16b,v31.16b - eor v1.16b,v1.16b,v30.16b - b .Ldone4x - -.align 4 -.Ltwo: - ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing - eor v18.16b,v0.16b,v2.16b - eor v1.16b,v1.16b,v17.16b - ld1 {v4.2d,v5.2d},[x2] - eor v1.16b,v1.16b,v18.16b -#ifndef __AARCH64EB__ - rev64 v5.16b,v5.16b - rev64 v4.16b,v4.16b -#endif - - pmull v18.1q,v0.1d,v19.1d //1st phase of reduction - ins v2.d[0],v1.d[1] - ins v1.d[1],v0.d[0] - ext v23.16b,v5.16b,v5.16b,#8 - eor v0.16b,v1.16b,v18.16b - - ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction - pmull v0.1q,v0.1d,v19.1d - eor v18.16b,v18.16b,v2.16b - eor v0.16b,v0.16b,v18.16b - ext v0.16b,v0.16b,v0.16b,#8 - - pmull v29.1q,v20.1d,v23.1d 
//H·Ii+1 - eor v5.16b,v5.16b,v23.16b - - eor v16.16b,v4.16b,v0.16b - ext v3.16b,v16.16b,v16.16b,#8 - - pmull2 v31.1q,v20.2d,v23.2d - pmull v30.1q,v21.1d,v5.1d - - pmull v0.1q,v22.1d,v3.1d //H^2·(Xi+Ii) - eor v16.16b,v16.16b,v3.16b - pmull2 v2.1q,v22.2d,v3.2d - pmull2 v1.1q,v21.2d,v16.2d - - eor v0.16b,v0.16b,v29.16b - eor v2.16b,v2.16b,v31.16b - eor v1.16b,v1.16b,v30.16b - b .Ldone4x - -.align 4 -.Lone: - ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing - eor v18.16b,v0.16b,v2.16b - eor v1.16b,v1.16b,v17.16b - ld1 {v4.2d},[x2] - eor v1.16b,v1.16b,v18.16b -#ifndef __AARCH64EB__ - rev64 v4.16b,v4.16b -#endif - - pmull v18.1q,v0.1d,v19.1d //1st phase of reduction - ins v2.d[0],v1.d[1] - ins v1.d[1],v0.d[0] - eor v0.16b,v1.16b,v18.16b - - ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction - pmull v0.1q,v0.1d,v19.1d - eor v18.16b,v18.16b,v2.16b - eor v0.16b,v0.16b,v18.16b - ext v0.16b,v0.16b,v0.16b,#8 - - eor v16.16b,v4.16b,v0.16b - ext v3.16b,v16.16b,v16.16b,#8 - - pmull v0.1q,v20.1d,v3.1d - eor v16.16b,v16.16b,v3.16b - pmull2 v2.1q,v20.2d,v3.2d - pmull v1.1q,v21.1d,v16.1d - -.Ldone4x: - ext v17.16b,v0.16b,v2.16b,#8 //Karatsuba post-processing - eor v18.16b,v0.16b,v2.16b - eor v1.16b,v1.16b,v17.16b - eor v1.16b,v1.16b,v18.16b - - pmull v18.1q,v0.1d,v19.1d //1st phase of reduction - ins v2.d[0],v1.d[1] - ins v1.d[1],v0.d[0] - eor v0.16b,v1.16b,v18.16b - - ext v18.16b,v0.16b,v0.16b,#8 //2nd phase of reduction - pmull v0.1q,v0.1d,v19.1d - eor v18.16b,v18.16b,v2.16b - eor v0.16b,v0.16b,v18.16b - ext v0.16b,v0.16b,v0.16b,#8 - -#ifndef __AARCH64EB__ - rev64 v0.16b,v0.16b -#endif - st1 {v0.2d},[x0] //write out Xi - - ret -.size gcm_ghash_v8_4x,.-gcm_ghash_v8_4x -.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 -.align 2 -#endif -#endif -#endif // !OPENSSL_NO_ASM -.section .note.GNU-stack,"",%progbits diff --git 
a/third_party/boringssl/linux-aarch64/crypto/fipsmodule/p256-armv8-asm.S b/third_party/boringssl/linux-aarch64/crypto/fipsmodule/p256-armv8-asm.S deleted file mode 100644 index 3efcccb6..00000000 --- a/third_party/boringssl/linux-aarch64/crypto/fipsmodule/p256-armv8-asm.S +++ /dev/null @@ -1,1713 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(__aarch64__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -#include "openssl/arm_arch.h" - -.text -.align 5 -.Lpoly: -.quad 0xffffffffffffffff,0x00000000ffffffff,0x0000000000000000,0xffffffff00000001 -.LRR: // 2^512 mod P precomputed for NIST P256 polynomial -.quad 0x0000000000000003,0xfffffffbffffffff,0xfffffffffffffffe,0x00000004fffffffd -.Lone_mont: -.quad 0x0000000000000001,0xffffffff00000000,0xffffffffffffffff,0x00000000fffffffe -.Lone: -.quad 1,0,0,0 -.Lord: -.quad 0xf3b9cac2fc632551,0xbce6faada7179e84,0xffffffffffffffff,0xffffffff00000000 -.LordK: -.quad 0xccd1c8aaee00bc4f -.byte 69,67,80,95,78,73,83,84,90,50,53,54,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 - -// void ecp_nistz256_mul_mont(BN_ULONG x0[4],const BN_ULONG x1[4], -// const BN_ULONG x2[4]); -.globl ecp_nistz256_mul_mont -.hidden ecp_nistz256_mul_mont -.type ecp_nistz256_mul_mont,%function -.align 4 -ecp_nistz256_mul_mont: - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-32]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - - ldr x3,[x2] // bp[0] - ldp x4,x5,[x1] - ldp x6,x7,[x1,#16] - ldr x12,.Lpoly+8 - ldr x13,.Lpoly+24 - - bl __ecp_nistz256_mul_mont - - ldp x19,x20,[sp,#16] - ldp x29,x30,[sp],#32 - AARCH64_VALIDATE_LINK_REGISTER - ret -.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont - -// void ecp_nistz256_sqr_mont(BN_ULONG x0[4],const BN_ULONG x1[4]); -.globl ecp_nistz256_sqr_mont -.hidden ecp_nistz256_sqr_mont -.type ecp_nistz256_sqr_mont,%function -.align 4 -ecp_nistz256_sqr_mont: - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-32]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - - ldp x4,x5,[x1] - ldp x6,x7,[x1,#16] - ldr x12,.Lpoly+8 - ldr x13,.Lpoly+24 - - bl __ecp_nistz256_sqr_mont - - ldp x19,x20,[sp,#16] - ldp x29,x30,[sp],#32 - AARCH64_VALIDATE_LINK_REGISTER - ret -.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont - -// void ecp_nistz256_div_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]); -.globl ecp_nistz256_div_by_2 -.hidden ecp_nistz256_div_by_2 -.type ecp_nistz256_div_by_2,%function -.align 4 -ecp_nistz256_div_by_2: - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - - ldp x14,x15,[x1] - ldp x16,x17,[x1,#16] - ldr x12,.Lpoly+8 - ldr x13,.Lpoly+24 - - bl __ecp_nistz256_div_by_2 - - ldp x29,x30,[sp],#16 - AARCH64_VALIDATE_LINK_REGISTER - ret -.size ecp_nistz256_div_by_2,.-ecp_nistz256_div_by_2 - -// void ecp_nistz256_mul_by_2(BN_ULONG x0[4],const BN_ULONG x1[4]); -.globl ecp_nistz256_mul_by_2 -.hidden ecp_nistz256_mul_by_2 -.type ecp_nistz256_mul_by_2,%function -.align 4 -ecp_nistz256_mul_by_2: - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! 
- add x29,sp,#0 - - ldp x14,x15,[x1] - ldp x16,x17,[x1,#16] - ldr x12,.Lpoly+8 - ldr x13,.Lpoly+24 - mov x8,x14 - mov x9,x15 - mov x10,x16 - mov x11,x17 - - bl __ecp_nistz256_add_to // ret = a+a // 2*a - - ldp x29,x30,[sp],#16 - AARCH64_VALIDATE_LINK_REGISTER - ret -.size ecp_nistz256_mul_by_2,.-ecp_nistz256_mul_by_2 - -// void ecp_nistz256_mul_by_3(BN_ULONG x0[4],const BN_ULONG x1[4]); -.globl ecp_nistz256_mul_by_3 -.hidden ecp_nistz256_mul_by_3 -.type ecp_nistz256_mul_by_3,%function -.align 4 -ecp_nistz256_mul_by_3: - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - - ldp x14,x15,[x1] - ldp x16,x17,[x1,#16] - ldr x12,.Lpoly+8 - ldr x13,.Lpoly+24 - mov x8,x14 - mov x9,x15 - mov x10,x16 - mov x11,x17 - mov x4,x14 - mov x5,x15 - mov x6,x16 - mov x7,x17 - - bl __ecp_nistz256_add_to // ret = a+a // 2*a - - mov x8,x4 - mov x9,x5 - mov x10,x6 - mov x11,x7 - - bl __ecp_nistz256_add_to // ret += a // 2*a+a=3*a - - ldp x29,x30,[sp],#16 - AARCH64_VALIDATE_LINK_REGISTER - ret -.size ecp_nistz256_mul_by_3,.-ecp_nistz256_mul_by_3 - -// void ecp_nistz256_sub(BN_ULONG x0[4],const BN_ULONG x1[4], -// const BN_ULONG x2[4]); -.globl ecp_nistz256_sub -.hidden ecp_nistz256_sub -.type ecp_nistz256_sub,%function -.align 4 -ecp_nistz256_sub: - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - - ldp x14,x15,[x1] - ldp x16,x17,[x1,#16] - ldr x12,.Lpoly+8 - ldr x13,.Lpoly+24 - - bl __ecp_nistz256_sub_from - - ldp x29,x30,[sp],#16 - AARCH64_VALIDATE_LINK_REGISTER - ret -.size ecp_nistz256_sub,.-ecp_nistz256_sub - -// void ecp_nistz256_neg(BN_ULONG x0[4],const BN_ULONG x1[4]); -.globl ecp_nistz256_neg -.hidden ecp_nistz256_neg -.type ecp_nistz256_neg,%function -.align 4 -ecp_nistz256_neg: - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! 
- add x29,sp,#0 - - mov x2,x1 - mov x14,xzr // a = 0 - mov x15,xzr - mov x16,xzr - mov x17,xzr - ldr x12,.Lpoly+8 - ldr x13,.Lpoly+24 - - bl __ecp_nistz256_sub_from - - ldp x29,x30,[sp],#16 - AARCH64_VALIDATE_LINK_REGISTER - ret -.size ecp_nistz256_neg,.-ecp_nistz256_neg - -// note that __ecp_nistz256_mul_mont expects a[0-3] input pre-loaded -// to x4-x7 and b[0] - to x3 -.type __ecp_nistz256_mul_mont,%function -.align 4 -__ecp_nistz256_mul_mont: - mul x14,x4,x3 // a[0]*b[0] - umulh x8,x4,x3 - - mul x15,x5,x3 // a[1]*b[0] - umulh x9,x5,x3 - - mul x16,x6,x3 // a[2]*b[0] - umulh x10,x6,x3 - - mul x17,x7,x3 // a[3]*b[0] - umulh x11,x7,x3 - ldr x3,[x2,#8] // b[1] - - adds x15,x15,x8 // accumulate high parts of multiplication - lsl x8,x14,#32 - adcs x16,x16,x9 - lsr x9,x14,#32 - adcs x17,x17,x10 - adc x19,xzr,x11 - mov x20,xzr - subs x10,x14,x8 // "*0xffff0001" - sbc x11,x14,x9 - adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] - mul x8,x4,x3 // lo(a[0]*b[i]) - adcs x15,x16,x9 - mul x9,x5,x3 // lo(a[1]*b[i]) - adcs x16,x17,x10 // +=acc[0]*0xffff0001 - mul x10,x6,x3 // lo(a[2]*b[i]) - adcs x17,x19,x11 - mul x11,x7,x3 // lo(a[3]*b[i]) - adc x19,x20,xzr - - adds x14,x14,x8 // accumulate low parts of multiplication - umulh x8,x4,x3 // hi(a[0]*b[i]) - adcs x15,x15,x9 - umulh x9,x5,x3 // hi(a[1]*b[i]) - adcs x16,x16,x10 - umulh x10,x6,x3 // hi(a[2]*b[i]) - adcs x17,x17,x11 - umulh x11,x7,x3 // hi(a[3]*b[i]) - adc x19,x19,xzr - ldr x3,[x2,#8*(1+1)] // b[1+1] - adds x15,x15,x8 // accumulate high parts of multiplication - lsl x8,x14,#32 - adcs x16,x16,x9 - lsr x9,x14,#32 - adcs x17,x17,x10 - adcs x19,x19,x11 - adc x20,xzr,xzr - subs x10,x14,x8 // "*0xffff0001" - sbc x11,x14,x9 - adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] - mul x8,x4,x3 // lo(a[0]*b[i]) - adcs x15,x16,x9 - mul x9,x5,x3 // lo(a[1]*b[i]) - adcs x16,x17,x10 // +=acc[0]*0xffff0001 - mul x10,x6,x3 // lo(a[2]*b[i]) - adcs x17,x19,x11 - mul x11,x7,x3 // lo(a[3]*b[i]) - adc x19,x20,xzr - - adds x14,x14,x8 // 
accumulate low parts of multiplication - umulh x8,x4,x3 // hi(a[0]*b[i]) - adcs x15,x15,x9 - umulh x9,x5,x3 // hi(a[1]*b[i]) - adcs x16,x16,x10 - umulh x10,x6,x3 // hi(a[2]*b[i]) - adcs x17,x17,x11 - umulh x11,x7,x3 // hi(a[3]*b[i]) - adc x19,x19,xzr - ldr x3,[x2,#8*(2+1)] // b[2+1] - adds x15,x15,x8 // accumulate high parts of multiplication - lsl x8,x14,#32 - adcs x16,x16,x9 - lsr x9,x14,#32 - adcs x17,x17,x10 - adcs x19,x19,x11 - adc x20,xzr,xzr - subs x10,x14,x8 // "*0xffff0001" - sbc x11,x14,x9 - adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] - mul x8,x4,x3 // lo(a[0]*b[i]) - adcs x15,x16,x9 - mul x9,x5,x3 // lo(a[1]*b[i]) - adcs x16,x17,x10 // +=acc[0]*0xffff0001 - mul x10,x6,x3 // lo(a[2]*b[i]) - adcs x17,x19,x11 - mul x11,x7,x3 // lo(a[3]*b[i]) - adc x19,x20,xzr - - adds x14,x14,x8 // accumulate low parts of multiplication - umulh x8,x4,x3 // hi(a[0]*b[i]) - adcs x15,x15,x9 - umulh x9,x5,x3 // hi(a[1]*b[i]) - adcs x16,x16,x10 - umulh x10,x6,x3 // hi(a[2]*b[i]) - adcs x17,x17,x11 - umulh x11,x7,x3 // hi(a[3]*b[i]) - adc x19,x19,xzr - adds x15,x15,x8 // accumulate high parts of multiplication - lsl x8,x14,#32 - adcs x16,x16,x9 - lsr x9,x14,#32 - adcs x17,x17,x10 - adcs x19,x19,x11 - adc x20,xzr,xzr - // last reduction - subs x10,x14,x8 // "*0xffff0001" - sbc x11,x14,x9 - adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] - adcs x15,x16,x9 - adcs x16,x17,x10 // +=acc[0]*0xffff0001 - adcs x17,x19,x11 - adc x19,x20,xzr - - adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus - sbcs x9,x15,x12 - sbcs x10,x16,xzr - sbcs x11,x17,x13 - sbcs xzr,x19,xzr // did it borrow? - - csel x14,x14,x8,lo // ret = borrow ? 
ret : ret-modulus - csel x15,x15,x9,lo - csel x16,x16,x10,lo - stp x14,x15,[x0] - csel x17,x17,x11,lo - stp x16,x17,[x0,#16] - - ret -.size __ecp_nistz256_mul_mont,.-__ecp_nistz256_mul_mont - -// note that __ecp_nistz256_sqr_mont expects a[0-3] input pre-loaded -// to x4-x7 -.type __ecp_nistz256_sqr_mont,%function -.align 4 -__ecp_nistz256_sqr_mont: - // | | | | | |a1*a0| | - // | | | | |a2*a0| | | - // | |a3*a2|a3*a0| | | | - // | | | |a2*a1| | | | - // | | |a3*a1| | | | | - // *| | | | | | | | 2| - // +|a3*a3|a2*a2|a1*a1|a0*a0| - // |--+--+--+--+--+--+--+--| - // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is , i.e. follow - // - // "can't overflow" below mark carrying into high part of - // multiplication result, which can't overflow, because it - // can never be all ones. - - mul x15,x5,x4 // a[1]*a[0] - umulh x9,x5,x4 - mul x16,x6,x4 // a[2]*a[0] - umulh x10,x6,x4 - mul x17,x7,x4 // a[3]*a[0] - umulh x19,x7,x4 - - adds x16,x16,x9 // accumulate high parts of multiplication - mul x8,x6,x5 // a[2]*a[1] - umulh x9,x6,x5 - adcs x17,x17,x10 - mul x10,x7,x5 // a[3]*a[1] - umulh x11,x7,x5 - adc x19,x19,xzr // can't overflow - - mul x20,x7,x6 // a[3]*a[2] - umulh x1,x7,x6 - - adds x9,x9,x10 // accumulate high parts of multiplication - mul x14,x4,x4 // a[0]*a[0] - adc x10,x11,xzr // can't overflow - - adds x17,x17,x8 // accumulate low parts of multiplication - umulh x4,x4,x4 - adcs x19,x19,x9 - mul x9,x5,x5 // a[1]*a[1] - adcs x20,x20,x10 - umulh x5,x5,x5 - adc x1,x1,xzr // can't overflow - - adds x15,x15,x15 // acc[1-6]*=2 - mul x10,x6,x6 // a[2]*a[2] - adcs x16,x16,x16 - umulh x6,x6,x6 - adcs x17,x17,x17 - mul x11,x7,x7 // a[3]*a[3] - adcs x19,x19,x19 - umulh x7,x7,x7 - adcs x20,x20,x20 - adcs x1,x1,x1 - adc x2,xzr,xzr - - adds x15,x15,x4 // +a[i]*a[i] - adcs x16,x16,x9 - adcs x17,x17,x5 - adcs x19,x19,x10 - adcs x20,x20,x6 - lsl x8,x14,#32 - adcs x1,x1,x11 - lsr x9,x14,#32 - adc x2,x2,x7 - subs x10,x14,x8 // "*0xffff0001" - sbc x11,x14,x9 - adds x14,x15,x8 // +=acc[0]<<96 
and omit acc[0] - adcs x15,x16,x9 - lsl x8,x14,#32 - adcs x16,x17,x10 // +=acc[0]*0xffff0001 - lsr x9,x14,#32 - adc x17,x11,xzr // can't overflow - subs x10,x14,x8 // "*0xffff0001" - sbc x11,x14,x9 - adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] - adcs x15,x16,x9 - lsl x8,x14,#32 - adcs x16,x17,x10 // +=acc[0]*0xffff0001 - lsr x9,x14,#32 - adc x17,x11,xzr // can't overflow - subs x10,x14,x8 // "*0xffff0001" - sbc x11,x14,x9 - adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] - adcs x15,x16,x9 - lsl x8,x14,#32 - adcs x16,x17,x10 // +=acc[0]*0xffff0001 - lsr x9,x14,#32 - adc x17,x11,xzr // can't overflow - subs x10,x14,x8 // "*0xffff0001" - sbc x11,x14,x9 - adds x14,x15,x8 // +=acc[0]<<96 and omit acc[0] - adcs x15,x16,x9 - adcs x16,x17,x10 // +=acc[0]*0xffff0001 - adc x17,x11,xzr // can't overflow - - adds x14,x14,x19 // accumulate upper half - adcs x15,x15,x20 - adcs x16,x16,x1 - adcs x17,x17,x2 - adc x19,xzr,xzr - - adds x8,x14,#1 // subs x8,x14,#-1 // tmp = ret-modulus - sbcs x9,x15,x12 - sbcs x10,x16,xzr - sbcs x11,x17,x13 - sbcs xzr,x19,xzr // did it borrow? - - csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus - csel x15,x15,x9,lo - csel x16,x16,x10,lo - stp x14,x15,[x0] - csel x17,x17,x11,lo - stp x16,x17,[x0,#16] - - ret -.size __ecp_nistz256_sqr_mont,.-__ecp_nistz256_sqr_mont - -// Note that __ecp_nistz256_add_to expects both input vectors pre-loaded to -// x4-x7 and x8-x11. This is done because it's used in multiple -// contexts, e.g. in multiplication by 2 and 3... -.type __ecp_nistz256_add_to,%function -.align 4 -__ecp_nistz256_add_to: - adds x14,x14,x8 // ret = a+b - adcs x15,x15,x9 - adcs x16,x16,x10 - adcs x17,x17,x11 - adc x1,xzr,xzr // zap x1 - - adds x8,x14,#1 // subs x8,x4,#-1 // tmp = ret-modulus - sbcs x9,x15,x12 - sbcs x10,x16,xzr - sbcs x11,x17,x13 - sbcs xzr,x1,xzr // did subtraction borrow? - - csel x14,x14,x8,lo // ret = borrow ? 
ret : ret-modulus - csel x15,x15,x9,lo - csel x16,x16,x10,lo - stp x14,x15,[x0] - csel x17,x17,x11,lo - stp x16,x17,[x0,#16] - - ret -.size __ecp_nistz256_add_to,.-__ecp_nistz256_add_to - -.type __ecp_nistz256_sub_from,%function -.align 4 -__ecp_nistz256_sub_from: - ldp x8,x9,[x2] - ldp x10,x11,[x2,#16] - subs x14,x14,x8 // ret = a-b - sbcs x15,x15,x9 - sbcs x16,x16,x10 - sbcs x17,x17,x11 - sbc x1,xzr,xzr // zap x1 - - subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus - adcs x9,x15,x12 - adcs x10,x16,xzr - adc x11,x17,x13 - cmp x1,xzr // did subtraction borrow? - - csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret - csel x15,x15,x9,eq - csel x16,x16,x10,eq - stp x14,x15,[x0] - csel x17,x17,x11,eq - stp x16,x17,[x0,#16] - - ret -.size __ecp_nistz256_sub_from,.-__ecp_nistz256_sub_from - -.type __ecp_nistz256_sub_morf,%function -.align 4 -__ecp_nistz256_sub_morf: - ldp x8,x9,[x2] - ldp x10,x11,[x2,#16] - subs x14,x8,x14 // ret = b-a - sbcs x15,x9,x15 - sbcs x16,x10,x16 - sbcs x17,x11,x17 - sbc x1,xzr,xzr // zap x1 - - subs x8,x14,#1 // adds x8,x4,#-1 // tmp = ret+modulus - adcs x9,x15,x12 - adcs x10,x16,xzr - adc x11,x17,x13 - cmp x1,xzr // did subtraction borrow? - - csel x14,x14,x8,eq // ret = borrow ? ret+modulus : ret - csel x15,x15,x9,eq - csel x16,x16,x10,eq - stp x14,x15,[x0] - csel x17,x17,x11,eq - stp x16,x17,[x0,#16] - - ret -.size __ecp_nistz256_sub_morf,.-__ecp_nistz256_sub_morf - -.type __ecp_nistz256_div_by_2,%function -.align 4 -__ecp_nistz256_div_by_2: - subs x8,x14,#1 // adds x8,x4,#-1 // tmp = a+modulus - adcs x9,x15,x12 - adcs x10,x16,xzr - adcs x11,x17,x13 - adc x1,xzr,xzr // zap x1 - tst x14,#1 // is a even? - - csel x14,x14,x8,eq // ret = even ? 
a : a+modulus - csel x15,x15,x9,eq - csel x16,x16,x10,eq - csel x17,x17,x11,eq - csel x1,xzr,x1,eq - - lsr x14,x14,#1 // ret >>= 1 - orr x14,x14,x15,lsl#63 - lsr x15,x15,#1 - orr x15,x15,x16,lsl#63 - lsr x16,x16,#1 - orr x16,x16,x17,lsl#63 - lsr x17,x17,#1 - stp x14,x15,[x0] - orr x17,x17,x1,lsl#63 - stp x16,x17,[x0,#16] - - ret -.size __ecp_nistz256_div_by_2,.-__ecp_nistz256_div_by_2 -.globl ecp_nistz256_point_double -.hidden ecp_nistz256_point_double -.type ecp_nistz256_point_double,%function -.align 5 -ecp_nistz256_point_double: - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-96]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - sub sp,sp,#32*4 - -.Ldouble_shortcut: - ldp x14,x15,[x1,#32] - mov x21,x0 - ldp x16,x17,[x1,#48] - mov x22,x1 - ldr x12,.Lpoly+8 - mov x8,x14 - ldr x13,.Lpoly+24 - mov x9,x15 - ldp x4,x5,[x22,#64] // forward load for p256_sqr_mont - mov x10,x16 - mov x11,x17 - ldp x6,x7,[x22,#64+16] - add x0,sp,#0 - bl __ecp_nistz256_add_to // p256_mul_by_2(S, in_y); - - add x0,sp,#64 - bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Zsqr, in_z); - - ldp x8,x9,[x22] - ldp x10,x11,[x22,#16] - mov x4,x14 // put Zsqr aside for p256_sub - mov x5,x15 - mov x6,x16 - mov x7,x17 - add x0,sp,#32 - bl __ecp_nistz256_add_to // p256_add(M, Zsqr, in_x); - - add x2,x22,#0 - mov x14,x4 // restore Zsqr - mov x15,x5 - ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont - mov x16,x6 - mov x17,x7 - ldp x6,x7,[sp,#0+16] - add x0,sp,#64 - bl __ecp_nistz256_sub_morf // p256_sub(Zsqr, in_x, Zsqr); - - add x0,sp,#0 - bl __ecp_nistz256_sqr_mont // p256_sqr_mont(S, S); - - ldr x3,[x22,#32] - ldp x4,x5,[x22,#64] - ldp x6,x7,[x22,#64+16] - add x2,x22,#32 - add x0,sp,#96 - bl __ecp_nistz256_mul_mont // p256_mul_mont(tmp0, in_z, in_y); - - mov x8,x14 - mov x9,x15 - ldp x4,x5,[sp,#0] // forward load for p256_sqr_mont - mov x10,x16 - mov x11,x17 - ldp x6,x7,[sp,#0+16] - add x0,x21,#64 - bl __ecp_nistz256_add_to // p256_mul_by_2(res_z, tmp0); - - add x0,sp,#96 - bl 
__ecp_nistz256_sqr_mont // p256_sqr_mont(tmp0, S); - - ldr x3,[sp,#64] // forward load for p256_mul_mont - ldp x4,x5,[sp,#32] - ldp x6,x7,[sp,#32+16] - add x0,x21,#32 - bl __ecp_nistz256_div_by_2 // p256_div_by_2(res_y, tmp0); - - add x2,sp,#64 - add x0,sp,#32 - bl __ecp_nistz256_mul_mont // p256_mul_mont(M, M, Zsqr); - - mov x8,x14 // duplicate M - mov x9,x15 - mov x10,x16 - mov x11,x17 - mov x4,x14 // put M aside - mov x5,x15 - mov x6,x16 - mov x7,x17 - add x0,sp,#32 - bl __ecp_nistz256_add_to - mov x8,x4 // restore M - mov x9,x5 - ldr x3,[x22] // forward load for p256_mul_mont - mov x10,x6 - ldp x4,x5,[sp,#0] - mov x11,x7 - ldp x6,x7,[sp,#0+16] - bl __ecp_nistz256_add_to // p256_mul_by_3(M, M); - - add x2,x22,#0 - add x0,sp,#0 - bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, in_x); - - mov x8,x14 - mov x9,x15 - ldp x4,x5,[sp,#32] // forward load for p256_sqr_mont - mov x10,x16 - mov x11,x17 - ldp x6,x7,[sp,#32+16] - add x0,sp,#96 - bl __ecp_nistz256_add_to // p256_mul_by_2(tmp0, S); - - add x0,x21,#0 - bl __ecp_nistz256_sqr_mont // p256_sqr_mont(res_x, M); - - add x2,sp,#96 - bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, tmp0); - - add x2,sp,#0 - add x0,sp,#0 - bl __ecp_nistz256_sub_morf // p256_sub(S, S, res_x); - - ldr x3,[sp,#32] - mov x4,x14 // copy S - mov x5,x15 - mov x6,x16 - mov x7,x17 - add x2,sp,#32 - bl __ecp_nistz256_mul_mont // p256_mul_mont(S, S, M); - - add x2,x21,#32 - add x0,x21,#32 - bl __ecp_nistz256_sub_from // p256_sub(res_y, S, res_y); - - add sp,x29,#0 // destroy frame - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x29,x30,[sp],#96 - AARCH64_VALIDATE_LINK_REGISTER - ret -.size ecp_nistz256_point_double,.-ecp_nistz256_point_double -.globl ecp_nistz256_point_add -.hidden ecp_nistz256_point_add -.type ecp_nistz256_point_add,%function -.align 5 -ecp_nistz256_point_add: - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-96]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - sub sp,sp,#32*12 - - ldp x4,x5,[x2,#64] // in2_z - ldp x6,x7,[x2,#64+16] - mov x21,x0 - mov x22,x1 - mov x23,x2 - ldr x12,.Lpoly+8 - ldr x13,.Lpoly+24 - orr x8,x4,x5 - orr x10,x6,x7 - orr x25,x8,x10 - cmp x25,#0 - csetm x25,ne // ~in2infty - add x0,sp,#192 - bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z2sqr, in2_z); - - ldp x4,x5,[x22,#64] // in1_z - ldp x6,x7,[x22,#64+16] - orr x8,x4,x5 - orr x10,x6,x7 - orr x24,x8,x10 - cmp x24,#0 - csetm x24,ne // ~in1infty - add x0,sp,#128 - bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); - - ldr x3,[x23,#64] - ldp x4,x5,[sp,#192] - ldp x6,x7,[sp,#192+16] - add x2,x23,#64 - add x0,sp,#320 - bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, Z2sqr, in2_z); - - ldr x3,[x22,#64] - ldp x4,x5,[sp,#128] - ldp x6,x7,[sp,#128+16] - add x2,x22,#64 - add x0,sp,#352 - bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); - - ldr x3,[x22,#32] - ldp x4,x5,[sp,#320] - ldp x6,x7,[sp,#320+16] - add x2,x22,#32 - add x0,sp,#320 - bl __ecp_nistz256_mul_mont // p256_mul_mont(S1, S1, in1_y); - - ldr x3,[x23,#32] - ldp x4,x5,[sp,#352] - ldp x6,x7,[sp,#352+16] - add x2,x23,#32 - add x0,sp,#352 - bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); - - add x2,sp,#320 - ldr x3,[sp,#192] // forward load for p256_mul_mont - ldp x4,x5,[x22] - ldp x6,x7,[x22,#16] - add x0,sp,#160 - bl __ecp_nistz256_sub_from // p256_sub(R, S2, S1); - - orr x14,x14,x15 // see if result is zero - orr x16,x16,x17 - orr x26,x14,x16 // ~is_equal(S1,S2) - - add x2,sp,#192 - add x0,sp,#256 - bl __ecp_nistz256_mul_mont // p256_mul_mont(U1, in1_x, Z2sqr); - - ldr x3,[sp,#128] - ldp x4,x5,[x23] - ldp x6,x7,[x23,#16] - add x2,sp,#128 - add x0,sp,#288 - bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in2_x, Z1sqr); - - add x2,sp,#256 - ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont - ldp x6,x7,[sp,#160+16] - add x0,sp,#96 - 
bl __ecp_nistz256_sub_from // p256_sub(H, U2, U1); - - orr x14,x14,x15 // see if result is zero - orr x16,x16,x17 - orr x14,x14,x16 // ~is_equal(U1,U2) - - mvn x27,x24 // -1/0 -> 0/-1 - mvn x28,x25 // -1/0 -> 0/-1 - orr x14,x14,x27 - orr x14,x14,x28 - orr x14,x14,x26 - cbnz x14,.Ladd_proceed // if(~is_equal(U1,U2) | in1infty | in2infty | ~is_equal(S1,S2)) - -.Ladd_double: - mov x1,x22 - mov x0,x21 - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - add sp,sp,#256 // #256 is from #32*(12-4). difference in stack frames - b .Ldouble_shortcut - -.align 4 -.Ladd_proceed: - add x0,sp,#192 - bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); - - ldr x3,[x22,#64] - ldp x4,x5,[sp,#96] - ldp x6,x7,[sp,#96+16] - add x2,x22,#64 - add x0,sp,#64 - bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); - - ldp x4,x5,[sp,#96] - ldp x6,x7,[sp,#96+16] - add x0,sp,#128 - bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); - - ldr x3,[x23,#64] - ldp x4,x5,[sp,#64] - ldp x6,x7,[sp,#64+16] - add x2,x23,#64 - add x0,sp,#64 - bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, res_z, in2_z); - - ldr x3,[sp,#96] - ldp x4,x5,[sp,#128] - ldp x6,x7,[sp,#128+16] - add x2,sp,#96 - add x0,sp,#224 - bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); - - ldr x3,[sp,#128] - ldp x4,x5,[sp,#256] - ldp x6,x7,[sp,#256+16] - add x2,sp,#128 - add x0,sp,#288 - bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, U1, Hsqr); - - mov x8,x14 - mov x9,x15 - mov x10,x16 - mov x11,x17 - add x0,sp,#128 - bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2); - - add x2,sp,#192 - add x0,sp,#0 - bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); - - add x2,sp,#224 - bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); - - add x2,sp,#288 - ldr x3,[sp,#224] // forward load for p256_mul_mont - ldp x4,x5,[sp,#320] - ldp x6,x7,[sp,#320+16] - add x0,sp,#32 - bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); - - add x2,sp,#224 - add x0,sp,#352 - bl 
__ecp_nistz256_mul_mont // p256_mul_mont(S2, S1, Hcub); - - ldr x3,[sp,#160] - ldp x4,x5,[sp,#32] - ldp x6,x7,[sp,#32+16] - add x2,sp,#160 - add x0,sp,#32 - bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); - - add x2,sp,#352 - bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); - - ldp x4,x5,[sp,#0] // res - ldp x6,x7,[sp,#0+16] - ldp x8,x9,[x23] // in2 - ldp x10,x11,[x23,#16] - ldp x14,x15,[x22,#0] // in1 - cmp x24,#0 // ~, remember? - ldp x16,x17,[x22,#0+16] - csel x8,x4,x8,ne - csel x9,x5,x9,ne - ldp x4,x5,[sp,#0+0+32] // res - csel x10,x6,x10,ne - csel x11,x7,x11,ne - cmp x25,#0 // ~, remember? - ldp x6,x7,[sp,#0+0+48] - csel x14,x8,x14,ne - csel x15,x9,x15,ne - ldp x8,x9,[x23,#0+32] // in2 - csel x16,x10,x16,ne - csel x17,x11,x17,ne - ldp x10,x11,[x23,#0+48] - stp x14,x15,[x21,#0] - stp x16,x17,[x21,#0+16] - ldp x14,x15,[x22,#32] // in1 - cmp x24,#0 // ~, remember? - ldp x16,x17,[x22,#32+16] - csel x8,x4,x8,ne - csel x9,x5,x9,ne - ldp x4,x5,[sp,#0+32+32] // res - csel x10,x6,x10,ne - csel x11,x7,x11,ne - cmp x25,#0 // ~, remember? - ldp x6,x7,[sp,#0+32+48] - csel x14,x8,x14,ne - csel x15,x9,x15,ne - ldp x8,x9,[x23,#32+32] // in2 - csel x16,x10,x16,ne - csel x17,x11,x17,ne - ldp x10,x11,[x23,#32+48] - stp x14,x15,[x21,#32] - stp x16,x17,[x21,#32+16] - ldp x14,x15,[x22,#64] // in1 - cmp x24,#0 // ~, remember? - ldp x16,x17,[x22,#64+16] - csel x8,x4,x8,ne - csel x9,x5,x9,ne - csel x10,x6,x10,ne - csel x11,x7,x11,ne - cmp x25,#0 // ~, remember? 
- csel x14,x8,x14,ne - csel x15,x9,x15,ne - csel x16,x10,x16,ne - csel x17,x11,x17,ne - stp x14,x15,[x21,#64] - stp x16,x17,[x21,#64+16] - -.Ladd_done: - add sp,x29,#0 // destroy frame - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#96 - AARCH64_VALIDATE_LINK_REGISTER - ret -.size ecp_nistz256_point_add,.-ecp_nistz256_point_add -.globl ecp_nistz256_point_add_affine -.hidden ecp_nistz256_point_add_affine -.type ecp_nistz256_point_add_affine,%function -.align 5 -ecp_nistz256_point_add_affine: - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-80]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - sub sp,sp,#32*10 - - mov x21,x0 - mov x22,x1 - mov x23,x2 - ldr x12,.Lpoly+8 - ldr x13,.Lpoly+24 - - ldp x4,x5,[x1,#64] // in1_z - ldp x6,x7,[x1,#64+16] - orr x8,x4,x5 - orr x10,x6,x7 - orr x24,x8,x10 - cmp x24,#0 - csetm x24,ne // ~in1infty - - ldp x14,x15,[x2] // in2_x - ldp x16,x17,[x2,#16] - ldp x8,x9,[x2,#32] // in2_y - ldp x10,x11,[x2,#48] - orr x14,x14,x15 - orr x16,x16,x17 - orr x8,x8,x9 - orr x10,x10,x11 - orr x14,x14,x16 - orr x8,x8,x10 - orr x25,x14,x8 - cmp x25,#0 - csetm x25,ne // ~in2infty - - add x0,sp,#128 - bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Z1sqr, in1_z); - - mov x4,x14 - mov x5,x15 - mov x6,x16 - mov x7,x17 - ldr x3,[x23] - add x2,x23,#0 - add x0,sp,#96 - bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, Z1sqr, in2_x); - - add x2,x22,#0 - ldr x3,[x22,#64] // forward load for p256_mul_mont - ldp x4,x5,[sp,#128] - ldp x6,x7,[sp,#128+16] - add x0,sp,#160 - bl __ecp_nistz256_sub_from // p256_sub(H, U2, in1_x); - - add x2,x22,#64 - add x0,sp,#128 - bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, Z1sqr, in1_z); - - ldr x3,[x22,#64] - ldp x4,x5,[sp,#160] - ldp x6,x7,[sp,#160+16] - add x2,x22,#64 - add x0,sp,#64 - bl __ecp_nistz256_mul_mont // p256_mul_mont(res_z, H, in1_z); - - ldr x3,[x23,#32] - ldp 
x4,x5,[sp,#128] - ldp x6,x7,[sp,#128+16] - add x2,x23,#32 - add x0,sp,#128 - bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, S2, in2_y); - - add x2,x22,#32 - ldp x4,x5,[sp,#160] // forward load for p256_sqr_mont - ldp x6,x7,[sp,#160+16] - add x0,sp,#192 - bl __ecp_nistz256_sub_from // p256_sub(R, S2, in1_y); - - add x0,sp,#224 - bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Hsqr, H); - - ldp x4,x5,[sp,#192] - ldp x6,x7,[sp,#192+16] - add x0,sp,#288 - bl __ecp_nistz256_sqr_mont // p256_sqr_mont(Rsqr, R); - - ldr x3,[sp,#160] - ldp x4,x5,[sp,#224] - ldp x6,x7,[sp,#224+16] - add x2,sp,#160 - add x0,sp,#256 - bl __ecp_nistz256_mul_mont // p256_mul_mont(Hcub, Hsqr, H); - - ldr x3,[x22] - ldp x4,x5,[sp,#224] - ldp x6,x7,[sp,#224+16] - add x2,x22,#0 - add x0,sp,#96 - bl __ecp_nistz256_mul_mont // p256_mul_mont(U2, in1_x, Hsqr); - - mov x8,x14 - mov x9,x15 - mov x10,x16 - mov x11,x17 - add x0,sp,#224 - bl __ecp_nistz256_add_to // p256_mul_by_2(Hsqr, U2); - - add x2,sp,#288 - add x0,sp,#0 - bl __ecp_nistz256_sub_morf // p256_sub(res_x, Rsqr, Hsqr); - - add x2,sp,#256 - bl __ecp_nistz256_sub_from // p256_sub(res_x, res_x, Hcub); - - add x2,sp,#96 - ldr x3,[x22,#32] // forward load for p256_mul_mont - ldp x4,x5,[sp,#256] - ldp x6,x7,[sp,#256+16] - add x0,sp,#32 - bl __ecp_nistz256_sub_morf // p256_sub(res_y, U2, res_x); - - add x2,x22,#32 - add x0,sp,#128 - bl __ecp_nistz256_mul_mont // p256_mul_mont(S2, in1_y, Hcub); - - ldr x3,[sp,#192] - ldp x4,x5,[sp,#32] - ldp x6,x7,[sp,#32+16] - add x2,sp,#192 - add x0,sp,#32 - bl __ecp_nistz256_mul_mont // p256_mul_mont(res_y, res_y, R); - - add x2,sp,#128 - bl __ecp_nistz256_sub_from // p256_sub(res_y, res_y, S2); - - ldp x4,x5,[sp,#0] // res - ldp x6,x7,[sp,#0+16] - ldp x8,x9,[x23] // in2 - ldp x10,x11,[x23,#16] - ldp x14,x15,[x22,#0] // in1 - cmp x24,#0 // ~, remember? - ldp x16,x17,[x22,#0+16] - csel x8,x4,x8,ne - csel x9,x5,x9,ne - ldp x4,x5,[sp,#0+0+32] // res - csel x10,x6,x10,ne - csel x11,x7,x11,ne - cmp x25,#0 // ~, remember? 
- ldp x6,x7,[sp,#0+0+48] - csel x14,x8,x14,ne - csel x15,x9,x15,ne - ldp x8,x9,[x23,#0+32] // in2 - csel x16,x10,x16,ne - csel x17,x11,x17,ne - ldp x10,x11,[x23,#0+48] - stp x14,x15,[x21,#0] - stp x16,x17,[x21,#0+16] - adr x23,.Lone_mont-64 - ldp x14,x15,[x22,#32] // in1 - cmp x24,#0 // ~, remember? - ldp x16,x17,[x22,#32+16] - csel x8,x4,x8,ne - csel x9,x5,x9,ne - ldp x4,x5,[sp,#0+32+32] // res - csel x10,x6,x10,ne - csel x11,x7,x11,ne - cmp x25,#0 // ~, remember? - ldp x6,x7,[sp,#0+32+48] - csel x14,x8,x14,ne - csel x15,x9,x15,ne - ldp x8,x9,[x23,#32+32] // in2 - csel x16,x10,x16,ne - csel x17,x11,x17,ne - ldp x10,x11,[x23,#32+48] - stp x14,x15,[x21,#32] - stp x16,x17,[x21,#32+16] - ldp x14,x15,[x22,#64] // in1 - cmp x24,#0 // ~, remember? - ldp x16,x17,[x22,#64+16] - csel x8,x4,x8,ne - csel x9,x5,x9,ne - csel x10,x6,x10,ne - csel x11,x7,x11,ne - cmp x25,#0 // ~, remember? - csel x14,x8,x14,ne - csel x15,x9,x15,ne - csel x16,x10,x16,ne - csel x17,x11,x17,ne - stp x14,x15,[x21,#64] - stp x16,x17,[x21,#64+16] - - add sp,x29,#0 // destroy frame - ldp x19,x20,[x29,#16] - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x29,x30,[sp],#80 - AARCH64_VALIDATE_LINK_REGISTER - ret -.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine -//////////////////////////////////////////////////////////////////////// -// void ecp_nistz256_ord_mul_mont(uint64_t res[4], uint64_t a[4], -// uint64_t b[4]); -.globl ecp_nistz256_ord_mul_mont -.hidden ecp_nistz256_ord_mul_mont -.type ecp_nistz256_ord_mul_mont,%function -.align 4 -ecp_nistz256_ord_mul_mont: - AARCH64_VALID_CALL_TARGET - // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. - stp x29,x30,[sp,#-64]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - - adr x23,.Lord - ldr x3,[x2] // bp[0] - ldp x4,x5,[x1] - ldp x6,x7,[x1,#16] - - ldp x12,x13,[x23,#0] - ldp x21,x22,[x23,#16] - ldr x23,[x23,#32] - - mul x14,x4,x3 // a[0]*b[0] - umulh x8,x4,x3 - - mul x15,x5,x3 // a[1]*b[0] - umulh x9,x5,x3 - - mul x16,x6,x3 // a[2]*b[0] - umulh x10,x6,x3 - - mul x17,x7,x3 // a[3]*b[0] - umulh x19,x7,x3 - - mul x24,x14,x23 - - adds x15,x15,x8 // accumulate high parts of multiplication - adcs x16,x16,x9 - adcs x17,x17,x10 - adc x19,x19,xzr - mov x20,xzr - ldr x3,[x2,#8*1] // b[i] - - lsl x8,x24,#32 - subs x16,x16,x24 - lsr x9,x24,#32 - sbcs x17,x17,x8 - sbcs x19,x19,x9 - sbc x20,x20,xzr - - subs xzr,x14,#1 - umulh x9,x12,x24 - mul x10,x13,x24 - umulh x11,x13,x24 - - adcs x10,x10,x9 - mul x8,x4,x3 - adc x11,x11,xzr - mul x9,x5,x3 - - adds x14,x15,x10 - mul x10,x6,x3 - adcs x15,x16,x11 - mul x11,x7,x3 - adcs x16,x17,x24 - adcs x17,x19,x24 - adc x19,x20,xzr - - adds x14,x14,x8 // accumulate low parts - umulh x8,x4,x3 - adcs x15,x15,x9 - umulh x9,x5,x3 - adcs x16,x16,x10 - umulh x10,x6,x3 - adcs x17,x17,x11 - umulh x11,x7,x3 - adc x19,x19,xzr - mul x24,x14,x23 - adds x15,x15,x8 // accumulate high parts - adcs x16,x16,x9 - adcs x17,x17,x10 - adcs x19,x19,x11 - adc x20,xzr,xzr - ldr x3,[x2,#8*2] // b[i] - - lsl x8,x24,#32 - subs x16,x16,x24 - lsr x9,x24,#32 - sbcs x17,x17,x8 - sbcs x19,x19,x9 - sbc x20,x20,xzr - - subs xzr,x14,#1 - umulh x9,x12,x24 - mul x10,x13,x24 - umulh x11,x13,x24 - - adcs x10,x10,x9 - mul x8,x4,x3 - adc x11,x11,xzr - mul x9,x5,x3 - - adds x14,x15,x10 - mul x10,x6,x3 - adcs x15,x16,x11 - mul x11,x7,x3 - adcs x16,x17,x24 - adcs x17,x19,x24 - adc x19,x20,xzr - - adds x14,x14,x8 // accumulate low parts - umulh x8,x4,x3 - adcs x15,x15,x9 - umulh x9,x5,x3 - adcs x16,x16,x10 - umulh x10,x6,x3 - adcs x17,x17,x11 - umulh x11,x7,x3 - adc x19,x19,xzr - mul x24,x14,x23 - adds x15,x15,x8 // accumulate high parts - adcs x16,x16,x9 - adcs 
x17,x17,x10 - adcs x19,x19,x11 - adc x20,xzr,xzr - ldr x3,[x2,#8*3] // b[i] - - lsl x8,x24,#32 - subs x16,x16,x24 - lsr x9,x24,#32 - sbcs x17,x17,x8 - sbcs x19,x19,x9 - sbc x20,x20,xzr - - subs xzr,x14,#1 - umulh x9,x12,x24 - mul x10,x13,x24 - umulh x11,x13,x24 - - adcs x10,x10,x9 - mul x8,x4,x3 - adc x11,x11,xzr - mul x9,x5,x3 - - adds x14,x15,x10 - mul x10,x6,x3 - adcs x15,x16,x11 - mul x11,x7,x3 - adcs x16,x17,x24 - adcs x17,x19,x24 - adc x19,x20,xzr - - adds x14,x14,x8 // accumulate low parts - umulh x8,x4,x3 - adcs x15,x15,x9 - umulh x9,x5,x3 - adcs x16,x16,x10 - umulh x10,x6,x3 - adcs x17,x17,x11 - umulh x11,x7,x3 - adc x19,x19,xzr - mul x24,x14,x23 - adds x15,x15,x8 // accumulate high parts - adcs x16,x16,x9 - adcs x17,x17,x10 - adcs x19,x19,x11 - adc x20,xzr,xzr - lsl x8,x24,#32 // last reduction - subs x16,x16,x24 - lsr x9,x24,#32 - sbcs x17,x17,x8 - sbcs x19,x19,x9 - sbc x20,x20,xzr - - subs xzr,x14,#1 - umulh x9,x12,x24 - mul x10,x13,x24 - umulh x11,x13,x24 - - adcs x10,x10,x9 - adc x11,x11,xzr - - adds x14,x15,x10 - adcs x15,x16,x11 - adcs x16,x17,x24 - adcs x17,x19,x24 - adc x19,x20,xzr - - subs x8,x14,x12 // ret -= modulus - sbcs x9,x15,x13 - sbcs x10,x16,x21 - sbcs x11,x17,x22 - sbcs xzr,x19,xzr - - csel x14,x14,x8,lo // ret = borrow ? ret : ret-modulus - csel x15,x15,x9,lo - csel x16,x16,x10,lo - stp x14,x15,[x0] - csel x17,x17,x11,lo - stp x16,x17,[x0,#16] - - ldp x19,x20,[sp,#16] - ldp x21,x22,[sp,#32] - ldp x23,x24,[sp,#48] - ldr x29,[sp],#64 - ret -.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont - -//////////////////////////////////////////////////////////////////////// -// void ecp_nistz256_ord_sqr_mont(uint64_t res[4], uint64_t a[4], -// int rep); -.globl ecp_nistz256_ord_sqr_mont -.hidden ecp_nistz256_ord_sqr_mont -.type ecp_nistz256_ord_sqr_mont,%function -.align 4 -ecp_nistz256_ord_sqr_mont: - AARCH64_VALID_CALL_TARGET - // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. - stp x29,x30,[sp,#-64]! 
- add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - - adr x23,.Lord - ldp x4,x5,[x1] - ldp x6,x7,[x1,#16] - - ldp x12,x13,[x23,#0] - ldp x21,x22,[x23,#16] - ldr x23,[x23,#32] - b .Loop_ord_sqr - -.align 4 -.Loop_ord_sqr: - sub x2,x2,#1 - //////////////////////////////////////////////////////////////// - // | | | | | |a1*a0| | - // | | | | |a2*a0| | | - // | |a3*a2|a3*a0| | | | - // | | | |a2*a1| | | | - // | | |a3*a1| | | | | - // *| | | | | | | | 2| - // +|a3*a3|a2*a2|a1*a1|a0*a0| - // |--+--+--+--+--+--+--+--| - // |A7|A6|A5|A4|A3|A2|A1|A0|, where Ax is , i.e. follow - // - // "can't overflow" below mark carrying into high part of - // multiplication result, which can't overflow, because it - // can never be all ones. - - mul x15,x5,x4 // a[1]*a[0] - umulh x9,x5,x4 - mul x16,x6,x4 // a[2]*a[0] - umulh x10,x6,x4 - mul x17,x7,x4 // a[3]*a[0] - umulh x19,x7,x4 - - adds x16,x16,x9 // accumulate high parts of multiplication - mul x8,x6,x5 // a[2]*a[1] - umulh x9,x6,x5 - adcs x17,x17,x10 - mul x10,x7,x5 // a[3]*a[1] - umulh x11,x7,x5 - adc x19,x19,xzr // can't overflow - - mul x20,x7,x6 // a[3]*a[2] - umulh x1,x7,x6 - - adds x9,x9,x10 // accumulate high parts of multiplication - mul x14,x4,x4 // a[0]*a[0] - adc x10,x11,xzr // can't overflow - - adds x17,x17,x8 // accumulate low parts of multiplication - umulh x4,x4,x4 - adcs x19,x19,x9 - mul x9,x5,x5 // a[1]*a[1] - adcs x20,x20,x10 - umulh x5,x5,x5 - adc x1,x1,xzr // can't overflow - - adds x15,x15,x15 // acc[1-6]*=2 - mul x10,x6,x6 // a[2]*a[2] - adcs x16,x16,x16 - umulh x6,x6,x6 - adcs x17,x17,x17 - mul x11,x7,x7 // a[3]*a[3] - adcs x19,x19,x19 - umulh x7,x7,x7 - adcs x20,x20,x20 - adcs x1,x1,x1 - adc x3,xzr,xzr - - adds x15,x15,x4 // +a[i]*a[i] - mul x24,x14,x23 - adcs x16,x16,x9 - adcs x17,x17,x5 - adcs x19,x19,x10 - adcs x20,x20,x6 - adcs x1,x1,x11 - adc x3,x3,x7 - subs xzr,x14,#1 - umulh x9,x12,x24 - mul x10,x13,x24 - umulh x11,x13,x24 - - adcs x10,x10,x9 - adc x11,x11,xzr - - 
adds x14,x15,x10 - adcs x15,x16,x11 - adcs x16,x17,x24 - adc x17,xzr,x24 // can't overflow - mul x11,x14,x23 - lsl x8,x24,#32 - subs x15,x15,x24 - lsr x9,x24,#32 - sbcs x16,x16,x8 - sbc x17,x17,x9 // can't borrow - subs xzr,x14,#1 - umulh x9,x12,x11 - mul x10,x13,x11 - umulh x24,x13,x11 - - adcs x10,x10,x9 - adc x24,x24,xzr - - adds x14,x15,x10 - adcs x15,x16,x24 - adcs x16,x17,x11 - adc x17,xzr,x11 // can't overflow - mul x24,x14,x23 - lsl x8,x11,#32 - subs x15,x15,x11 - lsr x9,x11,#32 - sbcs x16,x16,x8 - sbc x17,x17,x9 // can't borrow - subs xzr,x14,#1 - umulh x9,x12,x24 - mul x10,x13,x24 - umulh x11,x13,x24 - - adcs x10,x10,x9 - adc x11,x11,xzr - - adds x14,x15,x10 - adcs x15,x16,x11 - adcs x16,x17,x24 - adc x17,xzr,x24 // can't overflow - mul x11,x14,x23 - lsl x8,x24,#32 - subs x15,x15,x24 - lsr x9,x24,#32 - sbcs x16,x16,x8 - sbc x17,x17,x9 // can't borrow - subs xzr,x14,#1 - umulh x9,x12,x11 - mul x10,x13,x11 - umulh x24,x13,x11 - - adcs x10,x10,x9 - adc x24,x24,xzr - - adds x14,x15,x10 - adcs x15,x16,x24 - adcs x16,x17,x11 - adc x17,xzr,x11 // can't overflow - lsl x8,x11,#32 - subs x15,x15,x11 - lsr x9,x11,#32 - sbcs x16,x16,x8 - sbc x17,x17,x9 // can't borrow - adds x14,x14,x19 // accumulate upper half - adcs x15,x15,x20 - adcs x16,x16,x1 - adcs x17,x17,x3 - adc x19,xzr,xzr - - subs x8,x14,x12 // ret -= modulus - sbcs x9,x15,x13 - sbcs x10,x16,x21 - sbcs x11,x17,x22 - sbcs xzr,x19,xzr - - csel x4,x14,x8,lo // ret = borrow ? 
ret : ret-modulus - csel x5,x15,x9,lo - csel x6,x16,x10,lo - csel x7,x17,x11,lo - - cbnz x2,.Loop_ord_sqr - - stp x4,x5,[x0] - stp x6,x7,[x0,#16] - - ldp x19,x20,[sp,#16] - ldp x21,x22,[sp,#32] - ldp x23,x24,[sp,#48] - ldr x29,[sp],#64 - ret -.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont -//////////////////////////////////////////////////////////////////////// -// void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index); -.globl ecp_nistz256_select_w5 -.hidden ecp_nistz256_select_w5 -.type ecp_nistz256_select_w5,%function -.align 4 -ecp_nistz256_select_w5: - AARCH64_VALID_CALL_TARGET - - // x10 := x0 - // w9 := 0; loop counter and incremented internal index - mov x10, x0 - mov w9, #0 - - // [v16-v21] := 0 - movi v16.16b, #0 - movi v17.16b, #0 - movi v18.16b, #0 - movi v19.16b, #0 - movi v20.16b, #0 - movi v21.16b, #0 - -.Lselect_w5_loop: - // Loop 16 times. - - // Increment index (loop counter); tested at the end of the loop - add w9, w9, #1 - - // [v22-v27] := Load a (3*256-bit = 6*128-bit) table entry starting at x1 - // and advance x1 to point to the next entry - ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64 - - // x11 := (w9 == w2)? All 1s : All 0s - cmp w9, w2 - csetm x11, eq - - // continue loading ... - ld1 {v26.2d, v27.2d}, [x1],#32 - - // duplicate mask_64 into Mask (all 0s or all 1s) - dup v3.2d, x11 - - // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19] - // i.e., values in output registers will remain the same if w9 != w2 - bit v16.16b, v22.16b, v3.16b - bit v17.16b, v23.16b, v3.16b - - bit v18.16b, v24.16b, v3.16b - bit v19.16b, v25.16b, v3.16b - - bit v20.16b, v26.16b, v3.16b - bit v21.16b, v27.16b, v3.16b - - // If bit #4 is not 0 (i.e. 
idx_ctr < 16) loop back - tbz w9, #4, .Lselect_w5_loop - - // Write [v16-v21] to memory at the output pointer - st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x10],#64 - st1 {v20.2d, v21.2d}, [x10] - - ret -.size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5 - - -//////////////////////////////////////////////////////////////////////// -// void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index); -.globl ecp_nistz256_select_w7 -.hidden ecp_nistz256_select_w7 -.type ecp_nistz256_select_w7,%function -.align 4 -ecp_nistz256_select_w7: - AARCH64_VALID_CALL_TARGET - - // w9 := 0; loop counter and incremented internal index - mov w9, #0 - - // [v16-v21] := 0 - movi v16.16b, #0 - movi v17.16b, #0 - movi v18.16b, #0 - movi v19.16b, #0 - -.Lselect_w7_loop: - // Loop 64 times. - - // Increment index (loop counter); tested at the end of the loop - add w9, w9, #1 - - // [v22-v25] := Load a (2*256-bit = 4*128-bit) table entry starting at x1 - // and advance x1 to point to the next entry - ld1 {v22.2d, v23.2d, v24.2d, v25.2d}, [x1],#64 - - // x11 := (w9 == w2)? All 1s : All 0s - cmp w9, w2 - csetm x11, eq - - // duplicate mask_64 into Mask (all 0s or all 1s) - dup v3.2d, x11 - - // [v16-v19] := (Mask == all 1s)? [v22-v25] : [v16-v19] - // i.e., values in output registers will remain the same if w9 != w2 - bit v16.16b, v22.16b, v3.16b - bit v17.16b, v23.16b, v3.16b - - bit v18.16b, v24.16b, v3.16b - bit v19.16b, v25.16b, v3.16b - - // If bit #6 is not 0 (i.e. 
idx_ctr < 64) loop back - tbz w9, #6, .Lselect_w7_loop - - // Write [v16-v19] to memory at the output pointer - st1 {v16.2d, v17.2d, v18.2d, v19.2d}, [x0] - - ret -.size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7 -#endif -#endif // !OPENSSL_NO_ASM -.section .note.GNU-stack,"",%progbits diff --git a/third_party/boringssl/linux-aarch64/crypto/fipsmodule/p256_beeu-armv8-asm.S b/third_party/boringssl/linux-aarch64/crypto/fipsmodule/p256_beeu-armv8-asm.S deleted file mode 100644 index 9243b8b3..00000000 --- a/third_party/boringssl/linux-aarch64/crypto/fipsmodule/p256_beeu-armv8-asm.S +++ /dev/null @@ -1,320 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(__aarch64__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -#include "openssl/arm_arch.h" - -.text -.globl beeu_mod_inverse_vartime -.hidden beeu_mod_inverse_vartime -.type beeu_mod_inverse_vartime, %function -.align 4 -beeu_mod_inverse_vartime: - // Reserve enough space for 14 8-byte registers on the stack - // in the first stp call for x29, x30. - // Then store the remaining callee-saved registers. - // - // | x29 | x30 | x19 | x20 | ... | x27 | x28 | x0 | x2 | - // ^ ^ - // sp <------------------- 112 bytes ----------------> old sp - // x29 (FP) - // - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-112]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - stp x0,x2,[sp,#96] - - // B = b3..b0 := a - ldp x25,x26,[x1] - ldp x27,x28,[x1,#16] - - // n3..n0 := n - // Note: the value of input params are changed in the following. 
- ldp x0,x1,[x2] - ldp x2,x30,[x2,#16] - - // A = a3..a0 := n - mov x21, x0 - mov x22, x1 - mov x23, x2 - mov x24, x30 - - // X = x4..x0 := 1 - mov x3, #1 - eor x4, x4, x4 - eor x5, x5, x5 - eor x6, x6, x6 - eor x7, x7, x7 - - // Y = y4..y0 := 0 - eor x8, x8, x8 - eor x9, x9, x9 - eor x10, x10, x10 - eor x11, x11, x11 - eor x12, x12, x12 - -.Lbeeu_loop: - // if B == 0, jump to .Lbeeu_loop_end - orr x14, x25, x26 - orr x14, x14, x27 - - // reverse the bit order of x25. This is needed for clz after this macro - rbit x15, x25 - - orr x14, x14, x28 - cbz x14,.Lbeeu_loop_end - - - // 0 < B < |n|, - // 0 < A <= |n|, - // (1) X*a == B (mod |n|), - // (2) (-1)*Y*a == A (mod |n|) - - // Now divide B by the maximum possible power of two in the - // integers, and divide X by the same value mod |n|. - // When we're done, (1) still holds. - - // shift := number of trailing 0s in x25 - // ( = number of leading 0s in x15; see the "rbit" instruction in TEST_B_ZERO) - clz x13, x15 - - // If there is no shift, goto shift_A_Y - cbz x13, .Lbeeu_shift_A_Y - - // Shift B right by "x13" bits - neg x14, x13 - lsr x25, x25, x13 - lsl x15, x26, x14 - - lsr x26, x26, x13 - lsl x19, x27, x14 - - orr x25, x25, x15 - - lsr x27, x27, x13 - lsl x20, x28, x14 - - orr x26, x26, x19 - - lsr x28, x28, x13 - - orr x27, x27, x20 - - - // Shift X right by "x13" bits, adding n whenever X becomes odd. - // x13--; - // x14 := 0; needed in the addition to the most significant word in SHIFT1 - eor x14, x14, x14 -.Lbeeu_shift_loop_X: - tbz x3, #0, .Lshift1_0 - adds x3, x3, x0 - adcs x4, x4, x1 - adcs x5, x5, x2 - adcs x6, x6, x30 - adc x7, x7, x14 -.Lshift1_0: - // var0 := [var1|var0]<64..1>; - // i.e. 
concatenate var1 and var0, - // extract bits <64..1> from the resulting 128-bit value - // and put them in var0 - extr x3, x4, x3, #1 - extr x4, x5, x4, #1 - extr x5, x6, x5, #1 - extr x6, x7, x6, #1 - lsr x7, x7, #1 - - subs x13, x13, #1 - bne .Lbeeu_shift_loop_X - - // Note: the steps above perform the same sequence as in p256_beeu-x86_64-asm.pl - // with the following differences: - // - "x13" is set directly to the number of trailing 0s in B - // (using rbit and clz instructions) - // - The loop is only used to call SHIFT1(X) - // and x13 is decreased while executing the X loop. - // - SHIFT256(B, x13) is performed before right-shifting X; they are independent - -.Lbeeu_shift_A_Y: - // Same for A and Y. - // Afterwards, (2) still holds. - // Reverse the bit order of x21 - // x13 := number of trailing 0s in x21 (= number of leading 0s in x15) - rbit x15, x21 - clz x13, x15 - - // If there is no shift, goto |B-A|, X+Y update - cbz x13, .Lbeeu_update_B_X_or_A_Y - - // Shift A right by "x13" bits - neg x14, x13 - lsr x21, x21, x13 - lsl x15, x22, x14 - - lsr x22, x22, x13 - lsl x19, x23, x14 - - orr x21, x21, x15 - - lsr x23, x23, x13 - lsl x20, x24, x14 - - orr x22, x22, x19 - - lsr x24, x24, x13 - - orr x23, x23, x20 - - - // Shift Y right by "x13" bits, adding n whenever Y becomes odd. - // x13--; - // x14 := 0; needed in the addition to the most significant word in SHIFT1 - eor x14, x14, x14 -.Lbeeu_shift_loop_Y: - tbz x8, #0, .Lshift1_1 - adds x8, x8, x0 - adcs x9, x9, x1 - adcs x10, x10, x2 - adcs x11, x11, x30 - adc x12, x12, x14 -.Lshift1_1: - // var0 := [var1|var0]<64..1>; - // i.e. 
concatenate var1 and var0, - // extract bits <64..1> from the resulting 128-bit value - // and put them in var0 - extr x8, x9, x8, #1 - extr x9, x10, x9, #1 - extr x10, x11, x10, #1 - extr x11, x12, x11, #1 - lsr x12, x12, #1 - - subs x13, x13, #1 - bne .Lbeeu_shift_loop_Y - -.Lbeeu_update_B_X_or_A_Y: - // Try T := B - A; if cs, continue with B > A (cs: carry set = no borrow) - // Note: this is a case of unsigned arithmetic, where T fits in 4 64-bit words - // without taking a sign bit if generated. The lack of a carry would - // indicate a negative result. See, for example, - // https://community.arm.com/developer/ip-products/processors/b/processors-ip-blog/posts/condition-codes-1-condition-flags-and-codes - subs x14, x25, x21 - sbcs x15, x26, x22 - sbcs x19, x27, x23 - sbcs x20, x28, x24 - bcs .Lbeeu_B_greater_than_A - - // Else A > B => - // A := A - B; Y := Y + X; goto beginning of the loop - subs x21, x21, x25 - sbcs x22, x22, x26 - sbcs x23, x23, x27 - sbcs x24, x24, x28 - - adds x8, x8, x3 - adcs x9, x9, x4 - adcs x10, x10, x5 - adcs x11, x11, x6 - adc x12, x12, x7 - b .Lbeeu_loop - -.Lbeeu_B_greater_than_A: - // Continue with B > A => - // B := B - A; X := X + Y; goto beginning of the loop - mov x25, x14 - mov x26, x15 - mov x27, x19 - mov x28, x20 - - adds x3, x3, x8 - adcs x4, x4, x9 - adcs x5, x5, x10 - adcs x6, x6, x11 - adc x7, x7, x12 - b .Lbeeu_loop - -.Lbeeu_loop_end: - // The Euclid's algorithm loop ends when A == gcd(a,n); - // this would be 1, when a and n are co-prime (i.e. do not have a common factor). - // Since (-1)*Y*a == A (mod |n|), Y>0 - // then out = -Y mod n - - // Verify that A = 1 ==> (-1)*Y*a = A = 1 (mod |n|) - // Is A-1 == 0? - // If not, fail. 
- sub x14, x21, #1 - orr x14, x14, x22 - orr x14, x14, x23 - orr x14, x14, x24 - cbnz x14, .Lbeeu_err - - // If Y>n ==> Y:=Y-n -.Lbeeu_reduction_loop: - // x_i := y_i - n_i (X is no longer needed, use it as temp) - // (x14 = 0 from above) - subs x3, x8, x0 - sbcs x4, x9, x1 - sbcs x5, x10, x2 - sbcs x6, x11, x30 - sbcs x7, x12, x14 - - // If result is non-negative (i.e., cs = carry set = no borrow), - // y_i := x_i; goto reduce again - // else - // y_i := y_i; continue - csel x8, x3, x8, cs - csel x9, x4, x9, cs - csel x10, x5, x10, cs - csel x11, x6, x11, cs - csel x12, x7, x12, cs - bcs .Lbeeu_reduction_loop - - // Now Y < n (Y cannot be equal to n, since the inverse cannot be 0) - // out = -Y = n-Y - subs x8, x0, x8 - sbcs x9, x1, x9 - sbcs x10, x2, x10 - sbcs x11, x30, x11 - - // Save Y in output (out (x0) was saved on the stack) - ldr x3, [sp,#96] - stp x8, x9, [x3] - stp x10, x11, [x3,#16] - // return 1 (success) - mov x0, #1 - b .Lbeeu_finish - -.Lbeeu_err: - // return 0 (error) - eor x0, x0, x0 - -.Lbeeu_finish: - // Restore callee-saved registers, except x0, x2 - add sp,x29,#0 - ldp x19,x20,[sp,#16] - ldp x21,x22,[sp,#32] - ldp x23,x24,[sp,#48] - ldp x25,x26,[sp,#64] - ldp x27,x28,[sp,#80] - ldp x29,x30,[sp],#112 - - AARCH64_VALIDATE_LINK_REGISTER - ret -.size beeu_mod_inverse_vartime,.-beeu_mod_inverse_vartime -#endif -#endif // !OPENSSL_NO_ASM -.section .note.GNU-stack,"",%progbits diff --git a/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha1-armv8.S b/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha1-armv8.S deleted file mode 100644 index d7a87958..00000000 --- a/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha1-armv8.S +++ /dev/null @@ -1,1238 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(__aarch64__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -#include - -.text - - -.hidden OPENSSL_armcap_P -.globl sha1_block_data_order -.hidden sha1_block_data_order -.type sha1_block_data_order,%function -.align 6 -sha1_block_data_order: - // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. - AARCH64_VALID_CALL_TARGET -#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10 - adrp x16,:pg_hi21_nc:OPENSSL_armcap_P -#else - adrp x16,OPENSSL_armcap_P -#endif - ldr w16,[x16,:lo12:OPENSSL_armcap_P] - tst w16,#ARMV8_SHA1 - b.ne .Lv8_entry - - stp x29,x30,[sp,#-96]! - add x29,sp,#0 - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - - ldp w20,w21,[x0] - ldp w22,w23,[x0,#8] - ldr w24,[x0,#16] - -.Loop: - ldr x3,[x1],#64 - movz w28,#0x7999 - sub x2,x2,#1 - movk w28,#0x5a82,lsl#16 -#ifdef __AARCH64EB__ - ror x3,x3,#32 -#else - rev32 x3,x3 -#endif - add w24,w24,w28 // warm it up - add w24,w24,w3 - lsr x4,x3,#32 - ldr x5,[x1,#-56] - bic w25,w23,w21 - and w26,w22,w21 - ror w27,w20,#27 - add w23,w23,w28 // future e+=K - orr w25,w25,w26 - add w24,w24,w27 // e+=rot(a,5) - ror w21,w21,#2 - add w23,w23,w4 // future e+=X[i] - add w24,w24,w25 // e+=F(b,c,d) -#ifdef __AARCH64EB__ - ror x5,x5,#32 -#else - rev32 x5,x5 -#endif - bic w25,w22,w20 - and w26,w21,w20 - ror w27,w24,#27 - add w22,w22,w28 // future e+=K - orr w25,w25,w26 - add w23,w23,w27 // e+=rot(a,5) - ror w20,w20,#2 - add w22,w22,w5 // future e+=X[i] - add w23,w23,w25 // e+=F(b,c,d) - lsr x6,x5,#32 - ldr x7,[x1,#-48] - bic w25,w21,w24 - and w26,w20,w24 - ror w27,w23,#27 - add w21,w21,w28 // future e+=K - orr w25,w25,w26 - add w22,w22,w27 // e+=rot(a,5) - ror w24,w24,#2 - add w21,w21,w6 // future e+=X[i] - add 
w22,w22,w25 // e+=F(b,c,d) -#ifdef __AARCH64EB__ - ror x7,x7,#32 -#else - rev32 x7,x7 -#endif - bic w25,w20,w23 - and w26,w24,w23 - ror w27,w22,#27 - add w20,w20,w28 // future e+=K - orr w25,w25,w26 - add w21,w21,w27 // e+=rot(a,5) - ror w23,w23,#2 - add w20,w20,w7 // future e+=X[i] - add w21,w21,w25 // e+=F(b,c,d) - lsr x8,x7,#32 - ldr x9,[x1,#-40] - bic w25,w24,w22 - and w26,w23,w22 - ror w27,w21,#27 - add w24,w24,w28 // future e+=K - orr w25,w25,w26 - add w20,w20,w27 // e+=rot(a,5) - ror w22,w22,#2 - add w24,w24,w8 // future e+=X[i] - add w20,w20,w25 // e+=F(b,c,d) -#ifdef __AARCH64EB__ - ror x9,x9,#32 -#else - rev32 x9,x9 -#endif - bic w25,w23,w21 - and w26,w22,w21 - ror w27,w20,#27 - add w23,w23,w28 // future e+=K - orr w25,w25,w26 - add w24,w24,w27 // e+=rot(a,5) - ror w21,w21,#2 - add w23,w23,w9 // future e+=X[i] - add w24,w24,w25 // e+=F(b,c,d) - lsr x10,x9,#32 - ldr x11,[x1,#-32] - bic w25,w22,w20 - and w26,w21,w20 - ror w27,w24,#27 - add w22,w22,w28 // future e+=K - orr w25,w25,w26 - add w23,w23,w27 // e+=rot(a,5) - ror w20,w20,#2 - add w22,w22,w10 // future e+=X[i] - add w23,w23,w25 // e+=F(b,c,d) -#ifdef __AARCH64EB__ - ror x11,x11,#32 -#else - rev32 x11,x11 -#endif - bic w25,w21,w24 - and w26,w20,w24 - ror w27,w23,#27 - add w21,w21,w28 // future e+=K - orr w25,w25,w26 - add w22,w22,w27 // e+=rot(a,5) - ror w24,w24,#2 - add w21,w21,w11 // future e+=X[i] - add w22,w22,w25 // e+=F(b,c,d) - lsr x12,x11,#32 - ldr x13,[x1,#-24] - bic w25,w20,w23 - and w26,w24,w23 - ror w27,w22,#27 - add w20,w20,w28 // future e+=K - orr w25,w25,w26 - add w21,w21,w27 // e+=rot(a,5) - ror w23,w23,#2 - add w20,w20,w12 // future e+=X[i] - add w21,w21,w25 // e+=F(b,c,d) -#ifdef __AARCH64EB__ - ror x13,x13,#32 -#else - rev32 x13,x13 -#endif - bic w25,w24,w22 - and w26,w23,w22 - ror w27,w21,#27 - add w24,w24,w28 // future e+=K - orr w25,w25,w26 - add w20,w20,w27 // e+=rot(a,5) - ror w22,w22,#2 - add w24,w24,w13 // future e+=X[i] - add w20,w20,w25 // e+=F(b,c,d) - lsr x14,x13,#32 - 
ldr x15,[x1,#-16] - bic w25,w23,w21 - and w26,w22,w21 - ror w27,w20,#27 - add w23,w23,w28 // future e+=K - orr w25,w25,w26 - add w24,w24,w27 // e+=rot(a,5) - ror w21,w21,#2 - add w23,w23,w14 // future e+=X[i] - add w24,w24,w25 // e+=F(b,c,d) -#ifdef __AARCH64EB__ - ror x15,x15,#32 -#else - rev32 x15,x15 -#endif - bic w25,w22,w20 - and w26,w21,w20 - ror w27,w24,#27 - add w22,w22,w28 // future e+=K - orr w25,w25,w26 - add w23,w23,w27 // e+=rot(a,5) - ror w20,w20,#2 - add w22,w22,w15 // future e+=X[i] - add w23,w23,w25 // e+=F(b,c,d) - lsr x16,x15,#32 - ldr x17,[x1,#-8] - bic w25,w21,w24 - and w26,w20,w24 - ror w27,w23,#27 - add w21,w21,w28 // future e+=K - orr w25,w25,w26 - add w22,w22,w27 // e+=rot(a,5) - ror w24,w24,#2 - add w21,w21,w16 // future e+=X[i] - add w22,w22,w25 // e+=F(b,c,d) -#ifdef __AARCH64EB__ - ror x17,x17,#32 -#else - rev32 x17,x17 -#endif - bic w25,w20,w23 - and w26,w24,w23 - ror w27,w22,#27 - add w20,w20,w28 // future e+=K - orr w25,w25,w26 - add w21,w21,w27 // e+=rot(a,5) - ror w23,w23,#2 - add w20,w20,w17 // future e+=X[i] - add w21,w21,w25 // e+=F(b,c,d) - lsr x19,x17,#32 - eor w3,w3,w5 - bic w25,w24,w22 - and w26,w23,w22 - ror w27,w21,#27 - eor w3,w3,w11 - add w24,w24,w28 // future e+=K - orr w25,w25,w26 - add w20,w20,w27 // e+=rot(a,5) - eor w3,w3,w16 - ror w22,w22,#2 - add w24,w24,w19 // future e+=X[i] - add w20,w20,w25 // e+=F(b,c,d) - ror w3,w3,#31 - eor w4,w4,w6 - bic w25,w23,w21 - and w26,w22,w21 - ror w27,w20,#27 - eor w4,w4,w12 - add w23,w23,w28 // future e+=K - orr w25,w25,w26 - add w24,w24,w27 // e+=rot(a,5) - eor w4,w4,w17 - ror w21,w21,#2 - add w23,w23,w3 // future e+=X[i] - add w24,w24,w25 // e+=F(b,c,d) - ror w4,w4,#31 - eor w5,w5,w7 - bic w25,w22,w20 - and w26,w21,w20 - ror w27,w24,#27 - eor w5,w5,w13 - add w22,w22,w28 // future e+=K - orr w25,w25,w26 - add w23,w23,w27 // e+=rot(a,5) - eor w5,w5,w19 - ror w20,w20,#2 - add w22,w22,w4 // future e+=X[i] - add w23,w23,w25 // e+=F(b,c,d) - ror w5,w5,#31 - eor w6,w6,w8 - bic 
w25,w21,w24 - and w26,w20,w24 - ror w27,w23,#27 - eor w6,w6,w14 - add w21,w21,w28 // future e+=K - orr w25,w25,w26 - add w22,w22,w27 // e+=rot(a,5) - eor w6,w6,w3 - ror w24,w24,#2 - add w21,w21,w5 // future e+=X[i] - add w22,w22,w25 // e+=F(b,c,d) - ror w6,w6,#31 - eor w7,w7,w9 - bic w25,w20,w23 - and w26,w24,w23 - ror w27,w22,#27 - eor w7,w7,w15 - add w20,w20,w28 // future e+=K - orr w25,w25,w26 - add w21,w21,w27 // e+=rot(a,5) - eor w7,w7,w4 - ror w23,w23,#2 - add w20,w20,w6 // future e+=X[i] - add w21,w21,w25 // e+=F(b,c,d) - ror w7,w7,#31 - movz w28,#0xeba1 - movk w28,#0x6ed9,lsl#16 - eor w8,w8,w10 - bic w25,w24,w22 - and w26,w23,w22 - ror w27,w21,#27 - eor w8,w8,w16 - add w24,w24,w28 // future e+=K - orr w25,w25,w26 - add w20,w20,w27 // e+=rot(a,5) - eor w8,w8,w5 - ror w22,w22,#2 - add w24,w24,w7 // future e+=X[i] - add w20,w20,w25 // e+=F(b,c,d) - ror w8,w8,#31 - eor w9,w9,w11 - eor w25,w23,w21 - ror w27,w20,#27 - add w23,w23,w28 // future e+=K - eor w9,w9,w17 - eor w25,w25,w22 - add w24,w24,w27 // e+=rot(a,5) - ror w21,w21,#2 - eor w9,w9,w6 - add w23,w23,w8 // future e+=X[i] - add w24,w24,w25 // e+=F(b,c,d) - ror w9,w9,#31 - eor w10,w10,w12 - eor w25,w22,w20 - ror w27,w24,#27 - add w22,w22,w28 // future e+=K - eor w10,w10,w19 - eor w25,w25,w21 - add w23,w23,w27 // e+=rot(a,5) - ror w20,w20,#2 - eor w10,w10,w7 - add w22,w22,w9 // future e+=X[i] - add w23,w23,w25 // e+=F(b,c,d) - ror w10,w10,#31 - eor w11,w11,w13 - eor w25,w21,w24 - ror w27,w23,#27 - add w21,w21,w28 // future e+=K - eor w11,w11,w3 - eor w25,w25,w20 - add w22,w22,w27 // e+=rot(a,5) - ror w24,w24,#2 - eor w11,w11,w8 - add w21,w21,w10 // future e+=X[i] - add w22,w22,w25 // e+=F(b,c,d) - ror w11,w11,#31 - eor w12,w12,w14 - eor w25,w20,w23 - ror w27,w22,#27 - add w20,w20,w28 // future e+=K - eor w12,w12,w4 - eor w25,w25,w24 - add w21,w21,w27 // e+=rot(a,5) - ror w23,w23,#2 - eor w12,w12,w9 - add w20,w20,w11 // future e+=X[i] - add w21,w21,w25 // e+=F(b,c,d) - ror w12,w12,#31 - eor w13,w13,w15 - eor 
w25,w24,w22 - ror w27,w21,#27 - add w24,w24,w28 // future e+=K - eor w13,w13,w5 - eor w25,w25,w23 - add w20,w20,w27 // e+=rot(a,5) - ror w22,w22,#2 - eor w13,w13,w10 - add w24,w24,w12 // future e+=X[i] - add w20,w20,w25 // e+=F(b,c,d) - ror w13,w13,#31 - eor w14,w14,w16 - eor w25,w23,w21 - ror w27,w20,#27 - add w23,w23,w28 // future e+=K - eor w14,w14,w6 - eor w25,w25,w22 - add w24,w24,w27 // e+=rot(a,5) - ror w21,w21,#2 - eor w14,w14,w11 - add w23,w23,w13 // future e+=X[i] - add w24,w24,w25 // e+=F(b,c,d) - ror w14,w14,#31 - eor w15,w15,w17 - eor w25,w22,w20 - ror w27,w24,#27 - add w22,w22,w28 // future e+=K - eor w15,w15,w7 - eor w25,w25,w21 - add w23,w23,w27 // e+=rot(a,5) - ror w20,w20,#2 - eor w15,w15,w12 - add w22,w22,w14 // future e+=X[i] - add w23,w23,w25 // e+=F(b,c,d) - ror w15,w15,#31 - eor w16,w16,w19 - eor w25,w21,w24 - ror w27,w23,#27 - add w21,w21,w28 // future e+=K - eor w16,w16,w8 - eor w25,w25,w20 - add w22,w22,w27 // e+=rot(a,5) - ror w24,w24,#2 - eor w16,w16,w13 - add w21,w21,w15 // future e+=X[i] - add w22,w22,w25 // e+=F(b,c,d) - ror w16,w16,#31 - eor w17,w17,w3 - eor w25,w20,w23 - ror w27,w22,#27 - add w20,w20,w28 // future e+=K - eor w17,w17,w9 - eor w25,w25,w24 - add w21,w21,w27 // e+=rot(a,5) - ror w23,w23,#2 - eor w17,w17,w14 - add w20,w20,w16 // future e+=X[i] - add w21,w21,w25 // e+=F(b,c,d) - ror w17,w17,#31 - eor w19,w19,w4 - eor w25,w24,w22 - ror w27,w21,#27 - add w24,w24,w28 // future e+=K - eor w19,w19,w10 - eor w25,w25,w23 - add w20,w20,w27 // e+=rot(a,5) - ror w22,w22,#2 - eor w19,w19,w15 - add w24,w24,w17 // future e+=X[i] - add w20,w20,w25 // e+=F(b,c,d) - ror w19,w19,#31 - eor w3,w3,w5 - eor w25,w23,w21 - ror w27,w20,#27 - add w23,w23,w28 // future e+=K - eor w3,w3,w11 - eor w25,w25,w22 - add w24,w24,w27 // e+=rot(a,5) - ror w21,w21,#2 - eor w3,w3,w16 - add w23,w23,w19 // future e+=X[i] - add w24,w24,w25 // e+=F(b,c,d) - ror w3,w3,#31 - eor w4,w4,w6 - eor w25,w22,w20 - ror w27,w24,#27 - add w22,w22,w28 // future e+=K - eor 
w4,w4,w12 - eor w25,w25,w21 - add w23,w23,w27 // e+=rot(a,5) - ror w20,w20,#2 - eor w4,w4,w17 - add w22,w22,w3 // future e+=X[i] - add w23,w23,w25 // e+=F(b,c,d) - ror w4,w4,#31 - eor w5,w5,w7 - eor w25,w21,w24 - ror w27,w23,#27 - add w21,w21,w28 // future e+=K - eor w5,w5,w13 - eor w25,w25,w20 - add w22,w22,w27 // e+=rot(a,5) - ror w24,w24,#2 - eor w5,w5,w19 - add w21,w21,w4 // future e+=X[i] - add w22,w22,w25 // e+=F(b,c,d) - ror w5,w5,#31 - eor w6,w6,w8 - eor w25,w20,w23 - ror w27,w22,#27 - add w20,w20,w28 // future e+=K - eor w6,w6,w14 - eor w25,w25,w24 - add w21,w21,w27 // e+=rot(a,5) - ror w23,w23,#2 - eor w6,w6,w3 - add w20,w20,w5 // future e+=X[i] - add w21,w21,w25 // e+=F(b,c,d) - ror w6,w6,#31 - eor w7,w7,w9 - eor w25,w24,w22 - ror w27,w21,#27 - add w24,w24,w28 // future e+=K - eor w7,w7,w15 - eor w25,w25,w23 - add w20,w20,w27 // e+=rot(a,5) - ror w22,w22,#2 - eor w7,w7,w4 - add w24,w24,w6 // future e+=X[i] - add w20,w20,w25 // e+=F(b,c,d) - ror w7,w7,#31 - eor w8,w8,w10 - eor w25,w23,w21 - ror w27,w20,#27 - add w23,w23,w28 // future e+=K - eor w8,w8,w16 - eor w25,w25,w22 - add w24,w24,w27 // e+=rot(a,5) - ror w21,w21,#2 - eor w8,w8,w5 - add w23,w23,w7 // future e+=X[i] - add w24,w24,w25 // e+=F(b,c,d) - ror w8,w8,#31 - eor w9,w9,w11 - eor w25,w22,w20 - ror w27,w24,#27 - add w22,w22,w28 // future e+=K - eor w9,w9,w17 - eor w25,w25,w21 - add w23,w23,w27 // e+=rot(a,5) - ror w20,w20,#2 - eor w9,w9,w6 - add w22,w22,w8 // future e+=X[i] - add w23,w23,w25 // e+=F(b,c,d) - ror w9,w9,#31 - eor w10,w10,w12 - eor w25,w21,w24 - ror w27,w23,#27 - add w21,w21,w28 // future e+=K - eor w10,w10,w19 - eor w25,w25,w20 - add w22,w22,w27 // e+=rot(a,5) - ror w24,w24,#2 - eor w10,w10,w7 - add w21,w21,w9 // future e+=X[i] - add w22,w22,w25 // e+=F(b,c,d) - ror w10,w10,#31 - eor w11,w11,w13 - eor w25,w20,w23 - ror w27,w22,#27 - add w20,w20,w28 // future e+=K - eor w11,w11,w3 - eor w25,w25,w24 - add w21,w21,w27 // e+=rot(a,5) - ror w23,w23,#2 - eor w11,w11,w8 - add w20,w20,w10 
// future e+=X[i] - add w21,w21,w25 // e+=F(b,c,d) - ror w11,w11,#31 - movz w28,#0xbcdc - movk w28,#0x8f1b,lsl#16 - eor w12,w12,w14 - eor w25,w24,w22 - ror w27,w21,#27 - add w24,w24,w28 // future e+=K - eor w12,w12,w4 - eor w25,w25,w23 - add w20,w20,w27 // e+=rot(a,5) - ror w22,w22,#2 - eor w12,w12,w9 - add w24,w24,w11 // future e+=X[i] - add w20,w20,w25 // e+=F(b,c,d) - ror w12,w12,#31 - orr w25,w21,w22 - and w26,w21,w22 - eor w13,w13,w15 - ror w27,w20,#27 - and w25,w25,w23 - add w23,w23,w28 // future e+=K - eor w13,w13,w5 - add w24,w24,w27 // e+=rot(a,5) - orr w25,w25,w26 - ror w21,w21,#2 - eor w13,w13,w10 - add w23,w23,w12 // future e+=X[i] - add w24,w24,w25 // e+=F(b,c,d) - ror w13,w13,#31 - orr w25,w20,w21 - and w26,w20,w21 - eor w14,w14,w16 - ror w27,w24,#27 - and w25,w25,w22 - add w22,w22,w28 // future e+=K - eor w14,w14,w6 - add w23,w23,w27 // e+=rot(a,5) - orr w25,w25,w26 - ror w20,w20,#2 - eor w14,w14,w11 - add w22,w22,w13 // future e+=X[i] - add w23,w23,w25 // e+=F(b,c,d) - ror w14,w14,#31 - orr w25,w24,w20 - and w26,w24,w20 - eor w15,w15,w17 - ror w27,w23,#27 - and w25,w25,w21 - add w21,w21,w28 // future e+=K - eor w15,w15,w7 - add w22,w22,w27 // e+=rot(a,5) - orr w25,w25,w26 - ror w24,w24,#2 - eor w15,w15,w12 - add w21,w21,w14 // future e+=X[i] - add w22,w22,w25 // e+=F(b,c,d) - ror w15,w15,#31 - orr w25,w23,w24 - and w26,w23,w24 - eor w16,w16,w19 - ror w27,w22,#27 - and w25,w25,w20 - add w20,w20,w28 // future e+=K - eor w16,w16,w8 - add w21,w21,w27 // e+=rot(a,5) - orr w25,w25,w26 - ror w23,w23,#2 - eor w16,w16,w13 - add w20,w20,w15 // future e+=X[i] - add w21,w21,w25 // e+=F(b,c,d) - ror w16,w16,#31 - orr w25,w22,w23 - and w26,w22,w23 - eor w17,w17,w3 - ror w27,w21,#27 - and w25,w25,w24 - add w24,w24,w28 // future e+=K - eor w17,w17,w9 - add w20,w20,w27 // e+=rot(a,5) - orr w25,w25,w26 - ror w22,w22,#2 - eor w17,w17,w14 - add w24,w24,w16 // future e+=X[i] - add w20,w20,w25 // e+=F(b,c,d) - ror w17,w17,#31 - orr w25,w21,w22 - and w26,w21,w22 - eor 
w19,w19,w4 - ror w27,w20,#27 - and w25,w25,w23 - add w23,w23,w28 // future e+=K - eor w19,w19,w10 - add w24,w24,w27 // e+=rot(a,5) - orr w25,w25,w26 - ror w21,w21,#2 - eor w19,w19,w15 - add w23,w23,w17 // future e+=X[i] - add w24,w24,w25 // e+=F(b,c,d) - ror w19,w19,#31 - orr w25,w20,w21 - and w26,w20,w21 - eor w3,w3,w5 - ror w27,w24,#27 - and w25,w25,w22 - add w22,w22,w28 // future e+=K - eor w3,w3,w11 - add w23,w23,w27 // e+=rot(a,5) - orr w25,w25,w26 - ror w20,w20,#2 - eor w3,w3,w16 - add w22,w22,w19 // future e+=X[i] - add w23,w23,w25 // e+=F(b,c,d) - ror w3,w3,#31 - orr w25,w24,w20 - and w26,w24,w20 - eor w4,w4,w6 - ror w27,w23,#27 - and w25,w25,w21 - add w21,w21,w28 // future e+=K - eor w4,w4,w12 - add w22,w22,w27 // e+=rot(a,5) - orr w25,w25,w26 - ror w24,w24,#2 - eor w4,w4,w17 - add w21,w21,w3 // future e+=X[i] - add w22,w22,w25 // e+=F(b,c,d) - ror w4,w4,#31 - orr w25,w23,w24 - and w26,w23,w24 - eor w5,w5,w7 - ror w27,w22,#27 - and w25,w25,w20 - add w20,w20,w28 // future e+=K - eor w5,w5,w13 - add w21,w21,w27 // e+=rot(a,5) - orr w25,w25,w26 - ror w23,w23,#2 - eor w5,w5,w19 - add w20,w20,w4 // future e+=X[i] - add w21,w21,w25 // e+=F(b,c,d) - ror w5,w5,#31 - orr w25,w22,w23 - and w26,w22,w23 - eor w6,w6,w8 - ror w27,w21,#27 - and w25,w25,w24 - add w24,w24,w28 // future e+=K - eor w6,w6,w14 - add w20,w20,w27 // e+=rot(a,5) - orr w25,w25,w26 - ror w22,w22,#2 - eor w6,w6,w3 - add w24,w24,w5 // future e+=X[i] - add w20,w20,w25 // e+=F(b,c,d) - ror w6,w6,#31 - orr w25,w21,w22 - and w26,w21,w22 - eor w7,w7,w9 - ror w27,w20,#27 - and w25,w25,w23 - add w23,w23,w28 // future e+=K - eor w7,w7,w15 - add w24,w24,w27 // e+=rot(a,5) - orr w25,w25,w26 - ror w21,w21,#2 - eor w7,w7,w4 - add w23,w23,w6 // future e+=X[i] - add w24,w24,w25 // e+=F(b,c,d) - ror w7,w7,#31 - orr w25,w20,w21 - and w26,w20,w21 - eor w8,w8,w10 - ror w27,w24,#27 - and w25,w25,w22 - add w22,w22,w28 // future e+=K - eor w8,w8,w16 - add w23,w23,w27 // e+=rot(a,5) - orr w25,w25,w26 - ror w20,w20,#2 - 
eor w8,w8,w5 - add w22,w22,w7 // future e+=X[i] - add w23,w23,w25 // e+=F(b,c,d) - ror w8,w8,#31 - orr w25,w24,w20 - and w26,w24,w20 - eor w9,w9,w11 - ror w27,w23,#27 - and w25,w25,w21 - add w21,w21,w28 // future e+=K - eor w9,w9,w17 - add w22,w22,w27 // e+=rot(a,5) - orr w25,w25,w26 - ror w24,w24,#2 - eor w9,w9,w6 - add w21,w21,w8 // future e+=X[i] - add w22,w22,w25 // e+=F(b,c,d) - ror w9,w9,#31 - orr w25,w23,w24 - and w26,w23,w24 - eor w10,w10,w12 - ror w27,w22,#27 - and w25,w25,w20 - add w20,w20,w28 // future e+=K - eor w10,w10,w19 - add w21,w21,w27 // e+=rot(a,5) - orr w25,w25,w26 - ror w23,w23,#2 - eor w10,w10,w7 - add w20,w20,w9 // future e+=X[i] - add w21,w21,w25 // e+=F(b,c,d) - ror w10,w10,#31 - orr w25,w22,w23 - and w26,w22,w23 - eor w11,w11,w13 - ror w27,w21,#27 - and w25,w25,w24 - add w24,w24,w28 // future e+=K - eor w11,w11,w3 - add w20,w20,w27 // e+=rot(a,5) - orr w25,w25,w26 - ror w22,w22,#2 - eor w11,w11,w8 - add w24,w24,w10 // future e+=X[i] - add w20,w20,w25 // e+=F(b,c,d) - ror w11,w11,#31 - orr w25,w21,w22 - and w26,w21,w22 - eor w12,w12,w14 - ror w27,w20,#27 - and w25,w25,w23 - add w23,w23,w28 // future e+=K - eor w12,w12,w4 - add w24,w24,w27 // e+=rot(a,5) - orr w25,w25,w26 - ror w21,w21,#2 - eor w12,w12,w9 - add w23,w23,w11 // future e+=X[i] - add w24,w24,w25 // e+=F(b,c,d) - ror w12,w12,#31 - orr w25,w20,w21 - and w26,w20,w21 - eor w13,w13,w15 - ror w27,w24,#27 - and w25,w25,w22 - add w22,w22,w28 // future e+=K - eor w13,w13,w5 - add w23,w23,w27 // e+=rot(a,5) - orr w25,w25,w26 - ror w20,w20,#2 - eor w13,w13,w10 - add w22,w22,w12 // future e+=X[i] - add w23,w23,w25 // e+=F(b,c,d) - ror w13,w13,#31 - orr w25,w24,w20 - and w26,w24,w20 - eor w14,w14,w16 - ror w27,w23,#27 - and w25,w25,w21 - add w21,w21,w28 // future e+=K - eor w14,w14,w6 - add w22,w22,w27 // e+=rot(a,5) - orr w25,w25,w26 - ror w24,w24,#2 - eor w14,w14,w11 - add w21,w21,w13 // future e+=X[i] - add w22,w22,w25 // e+=F(b,c,d) - ror w14,w14,#31 - orr w25,w23,w24 - and w26,w23,w24 
- eor w15,w15,w17 - ror w27,w22,#27 - and w25,w25,w20 - add w20,w20,w28 // future e+=K - eor w15,w15,w7 - add w21,w21,w27 // e+=rot(a,5) - orr w25,w25,w26 - ror w23,w23,#2 - eor w15,w15,w12 - add w20,w20,w14 // future e+=X[i] - add w21,w21,w25 // e+=F(b,c,d) - ror w15,w15,#31 - movz w28,#0xc1d6 - movk w28,#0xca62,lsl#16 - orr w25,w22,w23 - and w26,w22,w23 - eor w16,w16,w19 - ror w27,w21,#27 - and w25,w25,w24 - add w24,w24,w28 // future e+=K - eor w16,w16,w8 - add w20,w20,w27 // e+=rot(a,5) - orr w25,w25,w26 - ror w22,w22,#2 - eor w16,w16,w13 - add w24,w24,w15 // future e+=X[i] - add w20,w20,w25 // e+=F(b,c,d) - ror w16,w16,#31 - eor w17,w17,w3 - eor w25,w23,w21 - ror w27,w20,#27 - add w23,w23,w28 // future e+=K - eor w17,w17,w9 - eor w25,w25,w22 - add w24,w24,w27 // e+=rot(a,5) - ror w21,w21,#2 - eor w17,w17,w14 - add w23,w23,w16 // future e+=X[i] - add w24,w24,w25 // e+=F(b,c,d) - ror w17,w17,#31 - eor w19,w19,w4 - eor w25,w22,w20 - ror w27,w24,#27 - add w22,w22,w28 // future e+=K - eor w19,w19,w10 - eor w25,w25,w21 - add w23,w23,w27 // e+=rot(a,5) - ror w20,w20,#2 - eor w19,w19,w15 - add w22,w22,w17 // future e+=X[i] - add w23,w23,w25 // e+=F(b,c,d) - ror w19,w19,#31 - eor w3,w3,w5 - eor w25,w21,w24 - ror w27,w23,#27 - add w21,w21,w28 // future e+=K - eor w3,w3,w11 - eor w25,w25,w20 - add w22,w22,w27 // e+=rot(a,5) - ror w24,w24,#2 - eor w3,w3,w16 - add w21,w21,w19 // future e+=X[i] - add w22,w22,w25 // e+=F(b,c,d) - ror w3,w3,#31 - eor w4,w4,w6 - eor w25,w20,w23 - ror w27,w22,#27 - add w20,w20,w28 // future e+=K - eor w4,w4,w12 - eor w25,w25,w24 - add w21,w21,w27 // e+=rot(a,5) - ror w23,w23,#2 - eor w4,w4,w17 - add w20,w20,w3 // future e+=X[i] - add w21,w21,w25 // e+=F(b,c,d) - ror w4,w4,#31 - eor w5,w5,w7 - eor w25,w24,w22 - ror w27,w21,#27 - add w24,w24,w28 // future e+=K - eor w5,w5,w13 - eor w25,w25,w23 - add w20,w20,w27 // e+=rot(a,5) - ror w22,w22,#2 - eor w5,w5,w19 - add w24,w24,w4 // future e+=X[i] - add w20,w20,w25 // e+=F(b,c,d) - ror w5,w5,#31 - eor 
w6,w6,w8 - eor w25,w23,w21 - ror w27,w20,#27 - add w23,w23,w28 // future e+=K - eor w6,w6,w14 - eor w25,w25,w22 - add w24,w24,w27 // e+=rot(a,5) - ror w21,w21,#2 - eor w6,w6,w3 - add w23,w23,w5 // future e+=X[i] - add w24,w24,w25 // e+=F(b,c,d) - ror w6,w6,#31 - eor w7,w7,w9 - eor w25,w22,w20 - ror w27,w24,#27 - add w22,w22,w28 // future e+=K - eor w7,w7,w15 - eor w25,w25,w21 - add w23,w23,w27 // e+=rot(a,5) - ror w20,w20,#2 - eor w7,w7,w4 - add w22,w22,w6 // future e+=X[i] - add w23,w23,w25 // e+=F(b,c,d) - ror w7,w7,#31 - eor w8,w8,w10 - eor w25,w21,w24 - ror w27,w23,#27 - add w21,w21,w28 // future e+=K - eor w8,w8,w16 - eor w25,w25,w20 - add w22,w22,w27 // e+=rot(a,5) - ror w24,w24,#2 - eor w8,w8,w5 - add w21,w21,w7 // future e+=X[i] - add w22,w22,w25 // e+=F(b,c,d) - ror w8,w8,#31 - eor w9,w9,w11 - eor w25,w20,w23 - ror w27,w22,#27 - add w20,w20,w28 // future e+=K - eor w9,w9,w17 - eor w25,w25,w24 - add w21,w21,w27 // e+=rot(a,5) - ror w23,w23,#2 - eor w9,w9,w6 - add w20,w20,w8 // future e+=X[i] - add w21,w21,w25 // e+=F(b,c,d) - ror w9,w9,#31 - eor w10,w10,w12 - eor w25,w24,w22 - ror w27,w21,#27 - add w24,w24,w28 // future e+=K - eor w10,w10,w19 - eor w25,w25,w23 - add w20,w20,w27 // e+=rot(a,5) - ror w22,w22,#2 - eor w10,w10,w7 - add w24,w24,w9 // future e+=X[i] - add w20,w20,w25 // e+=F(b,c,d) - ror w10,w10,#31 - eor w11,w11,w13 - eor w25,w23,w21 - ror w27,w20,#27 - add w23,w23,w28 // future e+=K - eor w11,w11,w3 - eor w25,w25,w22 - add w24,w24,w27 // e+=rot(a,5) - ror w21,w21,#2 - eor w11,w11,w8 - add w23,w23,w10 // future e+=X[i] - add w24,w24,w25 // e+=F(b,c,d) - ror w11,w11,#31 - eor w12,w12,w14 - eor w25,w22,w20 - ror w27,w24,#27 - add w22,w22,w28 // future e+=K - eor w12,w12,w4 - eor w25,w25,w21 - add w23,w23,w27 // e+=rot(a,5) - ror w20,w20,#2 - eor w12,w12,w9 - add w22,w22,w11 // future e+=X[i] - add w23,w23,w25 // e+=F(b,c,d) - ror w12,w12,#31 - eor w13,w13,w15 - eor w25,w21,w24 - ror w27,w23,#27 - add w21,w21,w28 // future e+=K - eor w13,w13,w5 - 
eor w25,w25,w20 - add w22,w22,w27 // e+=rot(a,5) - ror w24,w24,#2 - eor w13,w13,w10 - add w21,w21,w12 // future e+=X[i] - add w22,w22,w25 // e+=F(b,c,d) - ror w13,w13,#31 - eor w14,w14,w16 - eor w25,w20,w23 - ror w27,w22,#27 - add w20,w20,w28 // future e+=K - eor w14,w14,w6 - eor w25,w25,w24 - add w21,w21,w27 // e+=rot(a,5) - ror w23,w23,#2 - eor w14,w14,w11 - add w20,w20,w13 // future e+=X[i] - add w21,w21,w25 // e+=F(b,c,d) - ror w14,w14,#31 - eor w15,w15,w17 - eor w25,w24,w22 - ror w27,w21,#27 - add w24,w24,w28 // future e+=K - eor w15,w15,w7 - eor w25,w25,w23 - add w20,w20,w27 // e+=rot(a,5) - ror w22,w22,#2 - eor w15,w15,w12 - add w24,w24,w14 // future e+=X[i] - add w20,w20,w25 // e+=F(b,c,d) - ror w15,w15,#31 - eor w16,w16,w19 - eor w25,w23,w21 - ror w27,w20,#27 - add w23,w23,w28 // future e+=K - eor w16,w16,w8 - eor w25,w25,w22 - add w24,w24,w27 // e+=rot(a,5) - ror w21,w21,#2 - eor w16,w16,w13 - add w23,w23,w15 // future e+=X[i] - add w24,w24,w25 // e+=F(b,c,d) - ror w16,w16,#31 - eor w17,w17,w3 - eor w25,w22,w20 - ror w27,w24,#27 - add w22,w22,w28 // future e+=K - eor w17,w17,w9 - eor w25,w25,w21 - add w23,w23,w27 // e+=rot(a,5) - ror w20,w20,#2 - eor w17,w17,w14 - add w22,w22,w16 // future e+=X[i] - add w23,w23,w25 // e+=F(b,c,d) - ror w17,w17,#31 - eor w19,w19,w4 - eor w25,w21,w24 - ror w27,w23,#27 - add w21,w21,w28 // future e+=K - eor w19,w19,w10 - eor w25,w25,w20 - add w22,w22,w27 // e+=rot(a,5) - ror w24,w24,#2 - eor w19,w19,w15 - add w21,w21,w17 // future e+=X[i] - add w22,w22,w25 // e+=F(b,c,d) - ror w19,w19,#31 - ldp w4,w5,[x0] - eor w25,w20,w23 - ror w27,w22,#27 - add w20,w20,w28 // future e+=K - eor w25,w25,w24 - add w21,w21,w27 // e+=rot(a,5) - ror w23,w23,#2 - add w20,w20,w19 // future e+=X[i] - add w21,w21,w25 // e+=F(b,c,d) - ldp w6,w7,[x0,#8] - eor w25,w24,w22 - ror w27,w21,#27 - eor w25,w25,w23 - add w20,w20,w27 // e+=rot(a,5) - ror w22,w22,#2 - ldr w8,[x0,#16] - add w20,w20,w25 // e+=F(b,c,d) - add w21,w21,w5 - add w22,w22,w6 - add 
w20,w20,w4 - add w23,w23,w7 - add w24,w24,w8 - stp w20,w21,[x0] - stp w22,w23,[x0,#8] - str w24,[x0,#16] - cbnz x2,.Loop - - ldp x19,x20,[sp,#16] - ldp x21,x22,[sp,#32] - ldp x23,x24,[sp,#48] - ldp x25,x26,[sp,#64] - ldp x27,x28,[sp,#80] - ldr x29,[sp],#96 - ret -.size sha1_block_data_order,.-sha1_block_data_order -.type sha1_block_armv8,%function -.align 6 -sha1_block_armv8: - // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. - AARCH64_VALID_CALL_TARGET -.Lv8_entry: - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - - adrp x4,.Lconst - add x4,x4,:lo12:.Lconst - eor v1.16b,v1.16b,v1.16b - ld1 {v0.4s},[x0],#16 - ld1 {v1.s}[0],[x0] - sub x0,x0,#16 - ld1 {v16.4s,v17.4s,v18.4s,v19.4s},[x4] - -.Loop_hw: - ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 - sub x2,x2,#1 - rev32 v4.16b,v4.16b - rev32 v5.16b,v5.16b - - add v20.4s,v16.4s,v4.4s - rev32 v6.16b,v6.16b - orr v22.16b,v0.16b,v0.16b // offload - - add v21.4s,v16.4s,v5.4s - rev32 v7.16b,v7.16b -.inst 0x5e280803 //sha1h v3.16b,v0.16b -.inst 0x5e140020 //sha1c v0.16b,v1.16b,v20.4s // 0 - add v20.4s,v16.4s,v6.4s -.inst 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b -.inst 0x5e280802 //sha1h v2.16b,v0.16b // 1 -.inst 0x5e150060 //sha1c v0.16b,v3.16b,v21.4s - add v21.4s,v16.4s,v7.4s -.inst 0x5e2818e4 //sha1su1 v4.16b,v7.16b -.inst 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b -.inst 0x5e280803 //sha1h v3.16b,v0.16b // 2 -.inst 0x5e140040 //sha1c v0.16b,v2.16b,v20.4s - add v20.4s,v16.4s,v4.4s -.inst 0x5e281885 //sha1su1 v5.16b,v4.16b -.inst 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b -.inst 0x5e280802 //sha1h v2.16b,v0.16b // 3 -.inst 0x5e150060 //sha1c v0.16b,v3.16b,v21.4s - add v21.4s,v17.4s,v5.4s -.inst 0x5e2818a6 //sha1su1 v6.16b,v5.16b -.inst 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b -.inst 0x5e280803 //sha1h v3.16b,v0.16b // 4 -.inst 0x5e140040 //sha1c v0.16b,v2.16b,v20.4s - add v20.4s,v17.4s,v6.4s -.inst 0x5e2818c7 //sha1su1 v7.16b,v6.16b -.inst 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b -.inst 0x5e280802 
//sha1h v2.16b,v0.16b // 5 -.inst 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s - add v21.4s,v17.4s,v7.4s -.inst 0x5e2818e4 //sha1su1 v4.16b,v7.16b -.inst 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b -.inst 0x5e280803 //sha1h v3.16b,v0.16b // 6 -.inst 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s - add v20.4s,v17.4s,v4.4s -.inst 0x5e281885 //sha1su1 v5.16b,v4.16b -.inst 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b -.inst 0x5e280802 //sha1h v2.16b,v0.16b // 7 -.inst 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s - add v21.4s,v17.4s,v5.4s -.inst 0x5e2818a6 //sha1su1 v6.16b,v5.16b -.inst 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b -.inst 0x5e280803 //sha1h v3.16b,v0.16b // 8 -.inst 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s - add v20.4s,v18.4s,v6.4s -.inst 0x5e2818c7 //sha1su1 v7.16b,v6.16b -.inst 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b -.inst 0x5e280802 //sha1h v2.16b,v0.16b // 9 -.inst 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s - add v21.4s,v18.4s,v7.4s -.inst 0x5e2818e4 //sha1su1 v4.16b,v7.16b -.inst 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b -.inst 0x5e280803 //sha1h v3.16b,v0.16b // 10 -.inst 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s - add v20.4s,v18.4s,v4.4s -.inst 0x5e281885 //sha1su1 v5.16b,v4.16b -.inst 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b -.inst 0x5e280802 //sha1h v2.16b,v0.16b // 11 -.inst 0x5e152060 //sha1m v0.16b,v3.16b,v21.4s - add v21.4s,v18.4s,v5.4s -.inst 0x5e2818a6 //sha1su1 v6.16b,v5.16b -.inst 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b -.inst 0x5e280803 //sha1h v3.16b,v0.16b // 12 -.inst 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s - add v20.4s,v18.4s,v6.4s -.inst 0x5e2818c7 //sha1su1 v7.16b,v6.16b -.inst 0x5e0630a4 //sha1su0 v4.16b,v5.16b,v6.16b -.inst 0x5e280802 //sha1h v2.16b,v0.16b // 13 -.inst 0x5e152060 //sha1m v0.16b,v3.16b,v21.4s - add v21.4s,v19.4s,v7.4s -.inst 0x5e2818e4 //sha1su1 v4.16b,v7.16b -.inst 0x5e0730c5 //sha1su0 v5.16b,v6.16b,v7.16b -.inst 0x5e280803 //sha1h v3.16b,v0.16b // 14 -.inst 0x5e142040 //sha1m v0.16b,v2.16b,v20.4s - add v20.4s,v19.4s,v4.4s -.inst 
0x5e281885 //sha1su1 v5.16b,v4.16b -.inst 0x5e0430e6 //sha1su0 v6.16b,v7.16b,v4.16b -.inst 0x5e280802 //sha1h v2.16b,v0.16b // 15 -.inst 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s - add v21.4s,v19.4s,v5.4s -.inst 0x5e2818a6 //sha1su1 v6.16b,v5.16b -.inst 0x5e053087 //sha1su0 v7.16b,v4.16b,v5.16b -.inst 0x5e280803 //sha1h v3.16b,v0.16b // 16 -.inst 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s - add v20.4s,v19.4s,v6.4s -.inst 0x5e2818c7 //sha1su1 v7.16b,v6.16b -.inst 0x5e280802 //sha1h v2.16b,v0.16b // 17 -.inst 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s - add v21.4s,v19.4s,v7.4s - -.inst 0x5e280803 //sha1h v3.16b,v0.16b // 18 -.inst 0x5e141040 //sha1p v0.16b,v2.16b,v20.4s - -.inst 0x5e280802 //sha1h v2.16b,v0.16b // 19 -.inst 0x5e151060 //sha1p v0.16b,v3.16b,v21.4s - - add v1.4s,v1.4s,v2.4s - add v0.4s,v0.4s,v22.4s - - cbnz x2,.Loop_hw - - st1 {v0.4s},[x0],#16 - st1 {v1.s}[0],[x0] - - ldr x29,[sp],#16 - ret -.size sha1_block_armv8,.-sha1_block_armv8 -.section .rodata -.align 6 -.Lconst: -.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 //K_00_19 -.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 //K_20_39 -.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc //K_40_59 -.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 //K_60_79 -.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 -.align 2 -#endif -#endif // !OPENSSL_NO_ASM -.section .note.GNU-stack,"",%progbits diff --git a/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha256-armv8.S b/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha256-armv8.S deleted file mode 100644 index c777ec82..00000000 --- a/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha256-armv8.S +++ /dev/null @@ -1,1215 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(__aarch64__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. -// -// Licensed under the OpenSSL license (the "License"). You may not use -// this file except in compliance with the License. You can obtain a copy -// in the file LICENSE in the source distribution or at -// https://www.openssl.org/source/license.html - -// ==================================================================== -// Written by Andy Polyakov for the OpenSSL -// project. The module is, however, dual licensed under OpenSSL and -// CRYPTOGAMS licenses depending on where you obtain it. For further -// details see http://www.openssl.org/~appro/cryptogams/. -// -// Permission to use under GPLv2 terms is granted. -// ==================================================================== -// -// SHA256/512 for ARMv8. -// -// Performance in cycles per processed byte and improvement coefficient -// over code generated with "default" compiler: -// -// SHA256-hw SHA256(*) SHA512 -// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) -// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) -// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) -// Denver 2.01 10.5 (+26%) 6.70 (+8%) -// X-Gene 20.0 (+100%) 12.8 (+300%(***)) -// Mongoose 2.36 13.0 (+50%) 8.36 (+33%) -// Kryo 1.92 17.4 (+30%) 11.2 (+8%) -// -// (*) Software SHA256 results are of lesser relevance, presented -// mostly for informational purposes. -// (**) The result is a trade-off: it's possible to improve it by -// 10% (or by 1 cycle per round), but at the cost of 20% loss -// on Cortex-A53 (or by 4 cycles per round). 
-// (***) Super-impressive coefficients over gcc-generated code are -// indication of some compiler "pathology", most notably code -// generated with -mgeneral-regs-only is significantly faster -// and the gap is only 40-90%. - -#ifndef __KERNEL__ -# include -#endif - -.text - - -.hidden OPENSSL_armcap_P -.globl sha256_block_data_order -.hidden sha256_block_data_order -.type sha256_block_data_order,%function -.align 6 -sha256_block_data_order: - AARCH64_VALID_CALL_TARGET -#ifndef __KERNEL__ -#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10 - adrp x16,:pg_hi21_nc:OPENSSL_armcap_P -#else - adrp x16,OPENSSL_armcap_P -#endif - ldr w16,[x16,:lo12:OPENSSL_armcap_P] - tst w16,#ARMV8_SHA256 - b.ne .Lv8_entry -#endif - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - sub sp,sp,#4*4 - - ldp w20,w21,[x0] // load context - ldp w22,w23,[x0,#2*4] - ldp w24,w25,[x0,#4*4] - add x2,x1,x2,lsl#6 // end of input - ldp w26,w27,[x0,#6*4] - adrp x30,.LK256 - add x30,x30,:lo12:.LK256 - stp x0,x2,[x29,#96] - -.Loop: - ldp w3,w4,[x1],#2*4 - ldr w19,[x30],#4 // *K++ - eor w28,w21,w22 // magic seed - str x1,[x29,#112] -#ifndef __AARCH64EB__ - rev w3,w3 // 0 -#endif - ror w16,w24,#6 - add w27,w27,w19 // h+=K[i] - eor w6,w24,w24,ror#14 - and w17,w25,w24 - bic w19,w26,w24 - add w27,w27,w3 // h+=X[i] - orr w17,w17,w19 // Ch(e,f,g) - eor w19,w20,w21 // a^b, b^c in next round - eor w16,w16,w6,ror#11 // Sigma1(e) - ror w6,w20,#2 - add w27,w27,w17 // h+=Ch(e,f,g) - eor w17,w20,w20,ror#9 - add w27,w27,w16 // h+=Sigma1(e) - and w28,w28,w19 // (b^c)&=(a^b) - add w23,w23,w27 // d+=h - eor w28,w28,w21 // Maj(a,b,c) - eor w17,w6,w17,ror#13 // Sigma0(a) - add w27,w27,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round - //add w27,w27,w17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev w4,w4 // 1 -#endif - ldp w5,w6,[x1],#2*4 - add w27,w27,w17 
// h+=Sigma0(a) - ror w16,w23,#6 - add w26,w26,w28 // h+=K[i] - eor w7,w23,w23,ror#14 - and w17,w24,w23 - bic w28,w25,w23 - add w26,w26,w4 // h+=X[i] - orr w17,w17,w28 // Ch(e,f,g) - eor w28,w27,w20 // a^b, b^c in next round - eor w16,w16,w7,ror#11 // Sigma1(e) - ror w7,w27,#2 - add w26,w26,w17 // h+=Ch(e,f,g) - eor w17,w27,w27,ror#9 - add w26,w26,w16 // h+=Sigma1(e) - and w19,w19,w28 // (b^c)&=(a^b) - add w22,w22,w26 // d+=h - eor w19,w19,w20 // Maj(a,b,c) - eor w17,w7,w17,ror#13 // Sigma0(a) - add w26,w26,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round - //add w26,w26,w17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev w5,w5 // 2 -#endif - add w26,w26,w17 // h+=Sigma0(a) - ror w16,w22,#6 - add w25,w25,w19 // h+=K[i] - eor w8,w22,w22,ror#14 - and w17,w23,w22 - bic w19,w24,w22 - add w25,w25,w5 // h+=X[i] - orr w17,w17,w19 // Ch(e,f,g) - eor w19,w26,w27 // a^b, b^c in next round - eor w16,w16,w8,ror#11 // Sigma1(e) - ror w8,w26,#2 - add w25,w25,w17 // h+=Ch(e,f,g) - eor w17,w26,w26,ror#9 - add w25,w25,w16 // h+=Sigma1(e) - and w28,w28,w19 // (b^c)&=(a^b) - add w21,w21,w25 // d+=h - eor w28,w28,w27 // Maj(a,b,c) - eor w17,w8,w17,ror#13 // Sigma0(a) - add w25,w25,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round - //add w25,w25,w17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev w6,w6 // 3 -#endif - ldp w7,w8,[x1],#2*4 - add w25,w25,w17 // h+=Sigma0(a) - ror w16,w21,#6 - add w24,w24,w28 // h+=K[i] - eor w9,w21,w21,ror#14 - and w17,w22,w21 - bic w28,w23,w21 - add w24,w24,w6 // h+=X[i] - orr w17,w17,w28 // Ch(e,f,g) - eor w28,w25,w26 // a^b, b^c in next round - eor w16,w16,w9,ror#11 // Sigma1(e) - ror w9,w25,#2 - add w24,w24,w17 // h+=Ch(e,f,g) - eor w17,w25,w25,ror#9 - add w24,w24,w16 // h+=Sigma1(e) - and w19,w19,w28 // (b^c)&=(a^b) - add w20,w20,w24 // d+=h - eor w19,w19,w26 // Maj(a,b,c) - eor w17,w9,w17,ror#13 // Sigma0(a) - add w24,w24,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round - //add w24,w24,w17 // 
h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev w7,w7 // 4 -#endif - add w24,w24,w17 // h+=Sigma0(a) - ror w16,w20,#6 - add w23,w23,w19 // h+=K[i] - eor w10,w20,w20,ror#14 - and w17,w21,w20 - bic w19,w22,w20 - add w23,w23,w7 // h+=X[i] - orr w17,w17,w19 // Ch(e,f,g) - eor w19,w24,w25 // a^b, b^c in next round - eor w16,w16,w10,ror#11 // Sigma1(e) - ror w10,w24,#2 - add w23,w23,w17 // h+=Ch(e,f,g) - eor w17,w24,w24,ror#9 - add w23,w23,w16 // h+=Sigma1(e) - and w28,w28,w19 // (b^c)&=(a^b) - add w27,w27,w23 // d+=h - eor w28,w28,w25 // Maj(a,b,c) - eor w17,w10,w17,ror#13 // Sigma0(a) - add w23,w23,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round - //add w23,w23,w17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev w8,w8 // 5 -#endif - ldp w9,w10,[x1],#2*4 - add w23,w23,w17 // h+=Sigma0(a) - ror w16,w27,#6 - add w22,w22,w28 // h+=K[i] - eor w11,w27,w27,ror#14 - and w17,w20,w27 - bic w28,w21,w27 - add w22,w22,w8 // h+=X[i] - orr w17,w17,w28 // Ch(e,f,g) - eor w28,w23,w24 // a^b, b^c in next round - eor w16,w16,w11,ror#11 // Sigma1(e) - ror w11,w23,#2 - add w22,w22,w17 // h+=Ch(e,f,g) - eor w17,w23,w23,ror#9 - add w22,w22,w16 // h+=Sigma1(e) - and w19,w19,w28 // (b^c)&=(a^b) - add w26,w26,w22 // d+=h - eor w19,w19,w24 // Maj(a,b,c) - eor w17,w11,w17,ror#13 // Sigma0(a) - add w22,w22,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round - //add w22,w22,w17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev w9,w9 // 6 -#endif - add w22,w22,w17 // h+=Sigma0(a) - ror w16,w26,#6 - add w21,w21,w19 // h+=K[i] - eor w12,w26,w26,ror#14 - and w17,w27,w26 - bic w19,w20,w26 - add w21,w21,w9 // h+=X[i] - orr w17,w17,w19 // Ch(e,f,g) - eor w19,w22,w23 // a^b, b^c in next round - eor w16,w16,w12,ror#11 // Sigma1(e) - ror w12,w22,#2 - add w21,w21,w17 // h+=Ch(e,f,g) - eor w17,w22,w22,ror#9 - add w21,w21,w16 // h+=Sigma1(e) - and w28,w28,w19 // (b^c)&=(a^b) - add w25,w25,w21 // d+=h - eor w28,w28,w23 // Maj(a,b,c) - eor w17,w12,w17,ror#13 // Sigma0(a) - add w21,w21,w28 // 
h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round - //add w21,w21,w17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev w10,w10 // 7 -#endif - ldp w11,w12,[x1],#2*4 - add w21,w21,w17 // h+=Sigma0(a) - ror w16,w25,#6 - add w20,w20,w28 // h+=K[i] - eor w13,w25,w25,ror#14 - and w17,w26,w25 - bic w28,w27,w25 - add w20,w20,w10 // h+=X[i] - orr w17,w17,w28 // Ch(e,f,g) - eor w28,w21,w22 // a^b, b^c in next round - eor w16,w16,w13,ror#11 // Sigma1(e) - ror w13,w21,#2 - add w20,w20,w17 // h+=Ch(e,f,g) - eor w17,w21,w21,ror#9 - add w20,w20,w16 // h+=Sigma1(e) - and w19,w19,w28 // (b^c)&=(a^b) - add w24,w24,w20 // d+=h - eor w19,w19,w22 // Maj(a,b,c) - eor w17,w13,w17,ror#13 // Sigma0(a) - add w20,w20,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round - //add w20,w20,w17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev w11,w11 // 8 -#endif - add w20,w20,w17 // h+=Sigma0(a) - ror w16,w24,#6 - add w27,w27,w19 // h+=K[i] - eor w14,w24,w24,ror#14 - and w17,w25,w24 - bic w19,w26,w24 - add w27,w27,w11 // h+=X[i] - orr w17,w17,w19 // Ch(e,f,g) - eor w19,w20,w21 // a^b, b^c in next round - eor w16,w16,w14,ror#11 // Sigma1(e) - ror w14,w20,#2 - add w27,w27,w17 // h+=Ch(e,f,g) - eor w17,w20,w20,ror#9 - add w27,w27,w16 // h+=Sigma1(e) - and w28,w28,w19 // (b^c)&=(a^b) - add w23,w23,w27 // d+=h - eor w28,w28,w21 // Maj(a,b,c) - eor w17,w14,w17,ror#13 // Sigma0(a) - add w27,w27,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round - //add w27,w27,w17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev w12,w12 // 9 -#endif - ldp w13,w14,[x1],#2*4 - add w27,w27,w17 // h+=Sigma0(a) - ror w16,w23,#6 - add w26,w26,w28 // h+=K[i] - eor w15,w23,w23,ror#14 - and w17,w24,w23 - bic w28,w25,w23 - add w26,w26,w12 // h+=X[i] - orr w17,w17,w28 // Ch(e,f,g) - eor w28,w27,w20 // a^b, b^c in next round - eor w16,w16,w15,ror#11 // Sigma1(e) - ror w15,w27,#2 - add w26,w26,w17 // h+=Ch(e,f,g) - eor w17,w27,w27,ror#9 - add w26,w26,w16 // h+=Sigma1(e) - and w19,w19,w28 // (b^c)&=(a^b) - 
add w22,w22,w26 // d+=h - eor w19,w19,w20 // Maj(a,b,c) - eor w17,w15,w17,ror#13 // Sigma0(a) - add w26,w26,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round - //add w26,w26,w17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev w13,w13 // 10 -#endif - add w26,w26,w17 // h+=Sigma0(a) - ror w16,w22,#6 - add w25,w25,w19 // h+=K[i] - eor w0,w22,w22,ror#14 - and w17,w23,w22 - bic w19,w24,w22 - add w25,w25,w13 // h+=X[i] - orr w17,w17,w19 // Ch(e,f,g) - eor w19,w26,w27 // a^b, b^c in next round - eor w16,w16,w0,ror#11 // Sigma1(e) - ror w0,w26,#2 - add w25,w25,w17 // h+=Ch(e,f,g) - eor w17,w26,w26,ror#9 - add w25,w25,w16 // h+=Sigma1(e) - and w28,w28,w19 // (b^c)&=(a^b) - add w21,w21,w25 // d+=h - eor w28,w28,w27 // Maj(a,b,c) - eor w17,w0,w17,ror#13 // Sigma0(a) - add w25,w25,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round - //add w25,w25,w17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev w14,w14 // 11 -#endif - ldp w15,w0,[x1],#2*4 - add w25,w25,w17 // h+=Sigma0(a) - str w6,[sp,#12] - ror w16,w21,#6 - add w24,w24,w28 // h+=K[i] - eor w6,w21,w21,ror#14 - and w17,w22,w21 - bic w28,w23,w21 - add w24,w24,w14 // h+=X[i] - orr w17,w17,w28 // Ch(e,f,g) - eor w28,w25,w26 // a^b, b^c in next round - eor w16,w16,w6,ror#11 // Sigma1(e) - ror w6,w25,#2 - add w24,w24,w17 // h+=Ch(e,f,g) - eor w17,w25,w25,ror#9 - add w24,w24,w16 // h+=Sigma1(e) - and w19,w19,w28 // (b^c)&=(a^b) - add w20,w20,w24 // d+=h - eor w19,w19,w26 // Maj(a,b,c) - eor w17,w6,w17,ror#13 // Sigma0(a) - add w24,w24,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round - //add w24,w24,w17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev w15,w15 // 12 -#endif - add w24,w24,w17 // h+=Sigma0(a) - str w7,[sp,#0] - ror w16,w20,#6 - add w23,w23,w19 // h+=K[i] - eor w7,w20,w20,ror#14 - and w17,w21,w20 - bic w19,w22,w20 - add w23,w23,w15 // h+=X[i] - orr w17,w17,w19 // Ch(e,f,g) - eor w19,w24,w25 // a^b, b^c in next round - eor w16,w16,w7,ror#11 // Sigma1(e) - ror w7,w24,#2 - add 
w23,w23,w17 // h+=Ch(e,f,g) - eor w17,w24,w24,ror#9 - add w23,w23,w16 // h+=Sigma1(e) - and w28,w28,w19 // (b^c)&=(a^b) - add w27,w27,w23 // d+=h - eor w28,w28,w25 // Maj(a,b,c) - eor w17,w7,w17,ror#13 // Sigma0(a) - add w23,w23,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round - //add w23,w23,w17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev w0,w0 // 13 -#endif - ldp w1,w2,[x1] - add w23,w23,w17 // h+=Sigma0(a) - str w8,[sp,#4] - ror w16,w27,#6 - add w22,w22,w28 // h+=K[i] - eor w8,w27,w27,ror#14 - and w17,w20,w27 - bic w28,w21,w27 - add w22,w22,w0 // h+=X[i] - orr w17,w17,w28 // Ch(e,f,g) - eor w28,w23,w24 // a^b, b^c in next round - eor w16,w16,w8,ror#11 // Sigma1(e) - ror w8,w23,#2 - add w22,w22,w17 // h+=Ch(e,f,g) - eor w17,w23,w23,ror#9 - add w22,w22,w16 // h+=Sigma1(e) - and w19,w19,w28 // (b^c)&=(a^b) - add w26,w26,w22 // d+=h - eor w19,w19,w24 // Maj(a,b,c) - eor w17,w8,w17,ror#13 // Sigma0(a) - add w22,w22,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round - //add w22,w22,w17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev w1,w1 // 14 -#endif - ldr w6,[sp,#12] - add w22,w22,w17 // h+=Sigma0(a) - str w9,[sp,#8] - ror w16,w26,#6 - add w21,w21,w19 // h+=K[i] - eor w9,w26,w26,ror#14 - and w17,w27,w26 - bic w19,w20,w26 - add w21,w21,w1 // h+=X[i] - orr w17,w17,w19 // Ch(e,f,g) - eor w19,w22,w23 // a^b, b^c in next round - eor w16,w16,w9,ror#11 // Sigma1(e) - ror w9,w22,#2 - add w21,w21,w17 // h+=Ch(e,f,g) - eor w17,w22,w22,ror#9 - add w21,w21,w16 // h+=Sigma1(e) - and w28,w28,w19 // (b^c)&=(a^b) - add w25,w25,w21 // d+=h - eor w28,w28,w23 // Maj(a,b,c) - eor w17,w9,w17,ror#13 // Sigma0(a) - add w21,w21,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round - //add w21,w21,w17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev w2,w2 // 15 -#endif - ldr w7,[sp,#0] - add w21,w21,w17 // h+=Sigma0(a) - str w10,[sp,#12] - ror w16,w25,#6 - add w20,w20,w28 // h+=K[i] - ror w9,w4,#7 - and w17,w26,w25 - ror w8,w1,#17 - bic w28,w27,w25 
- ror w10,w21,#2 - add w20,w20,w2 // h+=X[i] - eor w16,w16,w25,ror#11 - eor w9,w9,w4,ror#18 - orr w17,w17,w28 // Ch(e,f,g) - eor w28,w21,w22 // a^b, b^c in next round - eor w16,w16,w25,ror#25 // Sigma1(e) - eor w10,w10,w21,ror#13 - add w20,w20,w17 // h+=Ch(e,f,g) - and w19,w19,w28 // (b^c)&=(a^b) - eor w8,w8,w1,ror#19 - eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) - add w20,w20,w16 // h+=Sigma1(e) - eor w19,w19,w22 // Maj(a,b,c) - eor w17,w10,w21,ror#22 // Sigma0(a) - eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) - add w3,w3,w12 - add w24,w24,w20 // d+=h - add w20,w20,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round - add w3,w3,w9 - add w20,w20,w17 // h+=Sigma0(a) - add w3,w3,w8 -.Loop_16_xx: - ldr w8,[sp,#4] - str w11,[sp,#0] - ror w16,w24,#6 - add w27,w27,w19 // h+=K[i] - ror w10,w5,#7 - and w17,w25,w24 - ror w9,w2,#17 - bic w19,w26,w24 - ror w11,w20,#2 - add w27,w27,w3 // h+=X[i] - eor w16,w16,w24,ror#11 - eor w10,w10,w5,ror#18 - orr w17,w17,w19 // Ch(e,f,g) - eor w19,w20,w21 // a^b, b^c in next round - eor w16,w16,w24,ror#25 // Sigma1(e) - eor w11,w11,w20,ror#13 - add w27,w27,w17 // h+=Ch(e,f,g) - and w28,w28,w19 // (b^c)&=(a^b) - eor w9,w9,w2,ror#19 - eor w10,w10,w5,lsr#3 // sigma0(X[i+1]) - add w27,w27,w16 // h+=Sigma1(e) - eor w28,w28,w21 // Maj(a,b,c) - eor w17,w11,w20,ror#22 // Sigma0(a) - eor w9,w9,w2,lsr#10 // sigma1(X[i+14]) - add w4,w4,w13 - add w23,w23,w27 // d+=h - add w27,w27,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round - add w4,w4,w10 - add w27,w27,w17 // h+=Sigma0(a) - add w4,w4,w9 - ldr w9,[sp,#8] - str w12,[sp,#4] - ror w16,w23,#6 - add w26,w26,w28 // h+=K[i] - ror w11,w6,#7 - and w17,w24,w23 - ror w10,w3,#17 - bic w28,w25,w23 - ror w12,w27,#2 - add w26,w26,w4 // h+=X[i] - eor w16,w16,w23,ror#11 - eor w11,w11,w6,ror#18 - orr w17,w17,w28 // Ch(e,f,g) - eor w28,w27,w20 // a^b, b^c in next round - eor w16,w16,w23,ror#25 // Sigma1(e) - eor w12,w12,w27,ror#13 - add w26,w26,w17 // h+=Ch(e,f,g) - and w19,w19,w28 // (b^c)&=(a^b) 
- eor w10,w10,w3,ror#19 - eor w11,w11,w6,lsr#3 // sigma0(X[i+1]) - add w26,w26,w16 // h+=Sigma1(e) - eor w19,w19,w20 // Maj(a,b,c) - eor w17,w12,w27,ror#22 // Sigma0(a) - eor w10,w10,w3,lsr#10 // sigma1(X[i+14]) - add w5,w5,w14 - add w22,w22,w26 // d+=h - add w26,w26,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round - add w5,w5,w11 - add w26,w26,w17 // h+=Sigma0(a) - add w5,w5,w10 - ldr w10,[sp,#12] - str w13,[sp,#8] - ror w16,w22,#6 - add w25,w25,w19 // h+=K[i] - ror w12,w7,#7 - and w17,w23,w22 - ror w11,w4,#17 - bic w19,w24,w22 - ror w13,w26,#2 - add w25,w25,w5 // h+=X[i] - eor w16,w16,w22,ror#11 - eor w12,w12,w7,ror#18 - orr w17,w17,w19 // Ch(e,f,g) - eor w19,w26,w27 // a^b, b^c in next round - eor w16,w16,w22,ror#25 // Sigma1(e) - eor w13,w13,w26,ror#13 - add w25,w25,w17 // h+=Ch(e,f,g) - and w28,w28,w19 // (b^c)&=(a^b) - eor w11,w11,w4,ror#19 - eor w12,w12,w7,lsr#3 // sigma0(X[i+1]) - add w25,w25,w16 // h+=Sigma1(e) - eor w28,w28,w27 // Maj(a,b,c) - eor w17,w13,w26,ror#22 // Sigma0(a) - eor w11,w11,w4,lsr#10 // sigma1(X[i+14]) - add w6,w6,w15 - add w21,w21,w25 // d+=h - add w25,w25,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round - add w6,w6,w12 - add w25,w25,w17 // h+=Sigma0(a) - add w6,w6,w11 - ldr w11,[sp,#0] - str w14,[sp,#12] - ror w16,w21,#6 - add w24,w24,w28 // h+=K[i] - ror w13,w8,#7 - and w17,w22,w21 - ror w12,w5,#17 - bic w28,w23,w21 - ror w14,w25,#2 - add w24,w24,w6 // h+=X[i] - eor w16,w16,w21,ror#11 - eor w13,w13,w8,ror#18 - orr w17,w17,w28 // Ch(e,f,g) - eor w28,w25,w26 // a^b, b^c in next round - eor w16,w16,w21,ror#25 // Sigma1(e) - eor w14,w14,w25,ror#13 - add w24,w24,w17 // h+=Ch(e,f,g) - and w19,w19,w28 // (b^c)&=(a^b) - eor w12,w12,w5,ror#19 - eor w13,w13,w8,lsr#3 // sigma0(X[i+1]) - add w24,w24,w16 // h+=Sigma1(e) - eor w19,w19,w26 // Maj(a,b,c) - eor w17,w14,w25,ror#22 // Sigma0(a) - eor w12,w12,w5,lsr#10 // sigma1(X[i+14]) - add w7,w7,w0 - add w20,w20,w24 // d+=h - add w24,w24,w19 // h+=Maj(a,b,c) - ldr 
w19,[x30],#4 // *K++, w28 in next round - add w7,w7,w13 - add w24,w24,w17 // h+=Sigma0(a) - add w7,w7,w12 - ldr w12,[sp,#4] - str w15,[sp,#0] - ror w16,w20,#6 - add w23,w23,w19 // h+=K[i] - ror w14,w9,#7 - and w17,w21,w20 - ror w13,w6,#17 - bic w19,w22,w20 - ror w15,w24,#2 - add w23,w23,w7 // h+=X[i] - eor w16,w16,w20,ror#11 - eor w14,w14,w9,ror#18 - orr w17,w17,w19 // Ch(e,f,g) - eor w19,w24,w25 // a^b, b^c in next round - eor w16,w16,w20,ror#25 // Sigma1(e) - eor w15,w15,w24,ror#13 - add w23,w23,w17 // h+=Ch(e,f,g) - and w28,w28,w19 // (b^c)&=(a^b) - eor w13,w13,w6,ror#19 - eor w14,w14,w9,lsr#3 // sigma0(X[i+1]) - add w23,w23,w16 // h+=Sigma1(e) - eor w28,w28,w25 // Maj(a,b,c) - eor w17,w15,w24,ror#22 // Sigma0(a) - eor w13,w13,w6,lsr#10 // sigma1(X[i+14]) - add w8,w8,w1 - add w27,w27,w23 // d+=h - add w23,w23,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round - add w8,w8,w14 - add w23,w23,w17 // h+=Sigma0(a) - add w8,w8,w13 - ldr w13,[sp,#8] - str w0,[sp,#4] - ror w16,w27,#6 - add w22,w22,w28 // h+=K[i] - ror w15,w10,#7 - and w17,w20,w27 - ror w14,w7,#17 - bic w28,w21,w27 - ror w0,w23,#2 - add w22,w22,w8 // h+=X[i] - eor w16,w16,w27,ror#11 - eor w15,w15,w10,ror#18 - orr w17,w17,w28 // Ch(e,f,g) - eor w28,w23,w24 // a^b, b^c in next round - eor w16,w16,w27,ror#25 // Sigma1(e) - eor w0,w0,w23,ror#13 - add w22,w22,w17 // h+=Ch(e,f,g) - and w19,w19,w28 // (b^c)&=(a^b) - eor w14,w14,w7,ror#19 - eor w15,w15,w10,lsr#3 // sigma0(X[i+1]) - add w22,w22,w16 // h+=Sigma1(e) - eor w19,w19,w24 // Maj(a,b,c) - eor w17,w0,w23,ror#22 // Sigma0(a) - eor w14,w14,w7,lsr#10 // sigma1(X[i+14]) - add w9,w9,w2 - add w26,w26,w22 // d+=h - add w22,w22,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round - add w9,w9,w15 - add w22,w22,w17 // h+=Sigma0(a) - add w9,w9,w14 - ldr w14,[sp,#12] - str w1,[sp,#8] - ror w16,w26,#6 - add w21,w21,w19 // h+=K[i] - ror w0,w11,#7 - and w17,w27,w26 - ror w15,w8,#17 - bic w19,w20,w26 - ror w1,w22,#2 - add w21,w21,w9 // 
h+=X[i] - eor w16,w16,w26,ror#11 - eor w0,w0,w11,ror#18 - orr w17,w17,w19 // Ch(e,f,g) - eor w19,w22,w23 // a^b, b^c in next round - eor w16,w16,w26,ror#25 // Sigma1(e) - eor w1,w1,w22,ror#13 - add w21,w21,w17 // h+=Ch(e,f,g) - and w28,w28,w19 // (b^c)&=(a^b) - eor w15,w15,w8,ror#19 - eor w0,w0,w11,lsr#3 // sigma0(X[i+1]) - add w21,w21,w16 // h+=Sigma1(e) - eor w28,w28,w23 // Maj(a,b,c) - eor w17,w1,w22,ror#22 // Sigma0(a) - eor w15,w15,w8,lsr#10 // sigma1(X[i+14]) - add w10,w10,w3 - add w25,w25,w21 // d+=h - add w21,w21,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round - add w10,w10,w0 - add w21,w21,w17 // h+=Sigma0(a) - add w10,w10,w15 - ldr w15,[sp,#0] - str w2,[sp,#12] - ror w16,w25,#6 - add w20,w20,w28 // h+=K[i] - ror w1,w12,#7 - and w17,w26,w25 - ror w0,w9,#17 - bic w28,w27,w25 - ror w2,w21,#2 - add w20,w20,w10 // h+=X[i] - eor w16,w16,w25,ror#11 - eor w1,w1,w12,ror#18 - orr w17,w17,w28 // Ch(e,f,g) - eor w28,w21,w22 // a^b, b^c in next round - eor w16,w16,w25,ror#25 // Sigma1(e) - eor w2,w2,w21,ror#13 - add w20,w20,w17 // h+=Ch(e,f,g) - and w19,w19,w28 // (b^c)&=(a^b) - eor w0,w0,w9,ror#19 - eor w1,w1,w12,lsr#3 // sigma0(X[i+1]) - add w20,w20,w16 // h+=Sigma1(e) - eor w19,w19,w22 // Maj(a,b,c) - eor w17,w2,w21,ror#22 // Sigma0(a) - eor w0,w0,w9,lsr#10 // sigma1(X[i+14]) - add w11,w11,w4 - add w24,w24,w20 // d+=h - add w20,w20,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round - add w11,w11,w1 - add w20,w20,w17 // h+=Sigma0(a) - add w11,w11,w0 - ldr w0,[sp,#4] - str w3,[sp,#0] - ror w16,w24,#6 - add w27,w27,w19 // h+=K[i] - ror w2,w13,#7 - and w17,w25,w24 - ror w1,w10,#17 - bic w19,w26,w24 - ror w3,w20,#2 - add w27,w27,w11 // h+=X[i] - eor w16,w16,w24,ror#11 - eor w2,w2,w13,ror#18 - orr w17,w17,w19 // Ch(e,f,g) - eor w19,w20,w21 // a^b, b^c in next round - eor w16,w16,w24,ror#25 // Sigma1(e) - eor w3,w3,w20,ror#13 - add w27,w27,w17 // h+=Ch(e,f,g) - and w28,w28,w19 // (b^c)&=(a^b) - eor w1,w1,w10,ror#19 - eor w2,w2,w13,lsr#3 
// sigma0(X[i+1]) - add w27,w27,w16 // h+=Sigma1(e) - eor w28,w28,w21 // Maj(a,b,c) - eor w17,w3,w20,ror#22 // Sigma0(a) - eor w1,w1,w10,lsr#10 // sigma1(X[i+14]) - add w12,w12,w5 - add w23,w23,w27 // d+=h - add w27,w27,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round - add w12,w12,w2 - add w27,w27,w17 // h+=Sigma0(a) - add w12,w12,w1 - ldr w1,[sp,#8] - str w4,[sp,#4] - ror w16,w23,#6 - add w26,w26,w28 // h+=K[i] - ror w3,w14,#7 - and w17,w24,w23 - ror w2,w11,#17 - bic w28,w25,w23 - ror w4,w27,#2 - add w26,w26,w12 // h+=X[i] - eor w16,w16,w23,ror#11 - eor w3,w3,w14,ror#18 - orr w17,w17,w28 // Ch(e,f,g) - eor w28,w27,w20 // a^b, b^c in next round - eor w16,w16,w23,ror#25 // Sigma1(e) - eor w4,w4,w27,ror#13 - add w26,w26,w17 // h+=Ch(e,f,g) - and w19,w19,w28 // (b^c)&=(a^b) - eor w2,w2,w11,ror#19 - eor w3,w3,w14,lsr#3 // sigma0(X[i+1]) - add w26,w26,w16 // h+=Sigma1(e) - eor w19,w19,w20 // Maj(a,b,c) - eor w17,w4,w27,ror#22 // Sigma0(a) - eor w2,w2,w11,lsr#10 // sigma1(X[i+14]) - add w13,w13,w6 - add w22,w22,w26 // d+=h - add w26,w26,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round - add w13,w13,w3 - add w26,w26,w17 // h+=Sigma0(a) - add w13,w13,w2 - ldr w2,[sp,#12] - str w5,[sp,#8] - ror w16,w22,#6 - add w25,w25,w19 // h+=K[i] - ror w4,w15,#7 - and w17,w23,w22 - ror w3,w12,#17 - bic w19,w24,w22 - ror w5,w26,#2 - add w25,w25,w13 // h+=X[i] - eor w16,w16,w22,ror#11 - eor w4,w4,w15,ror#18 - orr w17,w17,w19 // Ch(e,f,g) - eor w19,w26,w27 // a^b, b^c in next round - eor w16,w16,w22,ror#25 // Sigma1(e) - eor w5,w5,w26,ror#13 - add w25,w25,w17 // h+=Ch(e,f,g) - and w28,w28,w19 // (b^c)&=(a^b) - eor w3,w3,w12,ror#19 - eor w4,w4,w15,lsr#3 // sigma0(X[i+1]) - add w25,w25,w16 // h+=Sigma1(e) - eor w28,w28,w27 // Maj(a,b,c) - eor w17,w5,w26,ror#22 // Sigma0(a) - eor w3,w3,w12,lsr#10 // sigma1(X[i+14]) - add w14,w14,w7 - add w21,w21,w25 // d+=h - add w25,w25,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round - add w14,w14,w4 - 
add w25,w25,w17 // h+=Sigma0(a) - add w14,w14,w3 - ldr w3,[sp,#0] - str w6,[sp,#12] - ror w16,w21,#6 - add w24,w24,w28 // h+=K[i] - ror w5,w0,#7 - and w17,w22,w21 - ror w4,w13,#17 - bic w28,w23,w21 - ror w6,w25,#2 - add w24,w24,w14 // h+=X[i] - eor w16,w16,w21,ror#11 - eor w5,w5,w0,ror#18 - orr w17,w17,w28 // Ch(e,f,g) - eor w28,w25,w26 // a^b, b^c in next round - eor w16,w16,w21,ror#25 // Sigma1(e) - eor w6,w6,w25,ror#13 - add w24,w24,w17 // h+=Ch(e,f,g) - and w19,w19,w28 // (b^c)&=(a^b) - eor w4,w4,w13,ror#19 - eor w5,w5,w0,lsr#3 // sigma0(X[i+1]) - add w24,w24,w16 // h+=Sigma1(e) - eor w19,w19,w26 // Maj(a,b,c) - eor w17,w6,w25,ror#22 // Sigma0(a) - eor w4,w4,w13,lsr#10 // sigma1(X[i+14]) - add w15,w15,w8 - add w20,w20,w24 // d+=h - add w24,w24,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round - add w15,w15,w5 - add w24,w24,w17 // h+=Sigma0(a) - add w15,w15,w4 - ldr w4,[sp,#4] - str w7,[sp,#0] - ror w16,w20,#6 - add w23,w23,w19 // h+=K[i] - ror w6,w1,#7 - and w17,w21,w20 - ror w5,w14,#17 - bic w19,w22,w20 - ror w7,w24,#2 - add w23,w23,w15 // h+=X[i] - eor w16,w16,w20,ror#11 - eor w6,w6,w1,ror#18 - orr w17,w17,w19 // Ch(e,f,g) - eor w19,w24,w25 // a^b, b^c in next round - eor w16,w16,w20,ror#25 // Sigma1(e) - eor w7,w7,w24,ror#13 - add w23,w23,w17 // h+=Ch(e,f,g) - and w28,w28,w19 // (b^c)&=(a^b) - eor w5,w5,w14,ror#19 - eor w6,w6,w1,lsr#3 // sigma0(X[i+1]) - add w23,w23,w16 // h+=Sigma1(e) - eor w28,w28,w25 // Maj(a,b,c) - eor w17,w7,w24,ror#22 // Sigma0(a) - eor w5,w5,w14,lsr#10 // sigma1(X[i+14]) - add w0,w0,w9 - add w27,w27,w23 // d+=h - add w23,w23,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round - add w0,w0,w6 - add w23,w23,w17 // h+=Sigma0(a) - add w0,w0,w5 - ldr w5,[sp,#8] - str w8,[sp,#4] - ror w16,w27,#6 - add w22,w22,w28 // h+=K[i] - ror w7,w2,#7 - and w17,w20,w27 - ror w6,w15,#17 - bic w28,w21,w27 - ror w8,w23,#2 - add w22,w22,w0 // h+=X[i] - eor w16,w16,w27,ror#11 - eor w7,w7,w2,ror#18 - orr w17,w17,w28 // Ch(e,f,g) 
- eor w28,w23,w24 // a^b, b^c in next round - eor w16,w16,w27,ror#25 // Sigma1(e) - eor w8,w8,w23,ror#13 - add w22,w22,w17 // h+=Ch(e,f,g) - and w19,w19,w28 // (b^c)&=(a^b) - eor w6,w6,w15,ror#19 - eor w7,w7,w2,lsr#3 // sigma0(X[i+1]) - add w22,w22,w16 // h+=Sigma1(e) - eor w19,w19,w24 // Maj(a,b,c) - eor w17,w8,w23,ror#22 // Sigma0(a) - eor w6,w6,w15,lsr#10 // sigma1(X[i+14]) - add w1,w1,w10 - add w26,w26,w22 // d+=h - add w22,w22,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round - add w1,w1,w7 - add w22,w22,w17 // h+=Sigma0(a) - add w1,w1,w6 - ldr w6,[sp,#12] - str w9,[sp,#8] - ror w16,w26,#6 - add w21,w21,w19 // h+=K[i] - ror w8,w3,#7 - and w17,w27,w26 - ror w7,w0,#17 - bic w19,w20,w26 - ror w9,w22,#2 - add w21,w21,w1 // h+=X[i] - eor w16,w16,w26,ror#11 - eor w8,w8,w3,ror#18 - orr w17,w17,w19 // Ch(e,f,g) - eor w19,w22,w23 // a^b, b^c in next round - eor w16,w16,w26,ror#25 // Sigma1(e) - eor w9,w9,w22,ror#13 - add w21,w21,w17 // h+=Ch(e,f,g) - and w28,w28,w19 // (b^c)&=(a^b) - eor w7,w7,w0,ror#19 - eor w8,w8,w3,lsr#3 // sigma0(X[i+1]) - add w21,w21,w16 // h+=Sigma1(e) - eor w28,w28,w23 // Maj(a,b,c) - eor w17,w9,w22,ror#22 // Sigma0(a) - eor w7,w7,w0,lsr#10 // sigma1(X[i+14]) - add w2,w2,w11 - add w25,w25,w21 // d+=h - add w21,w21,w28 // h+=Maj(a,b,c) - ldr w28,[x30],#4 // *K++, w19 in next round - add w2,w2,w8 - add w21,w21,w17 // h+=Sigma0(a) - add w2,w2,w7 - ldr w7,[sp,#0] - str w10,[sp,#12] - ror w16,w25,#6 - add w20,w20,w28 // h+=K[i] - ror w9,w4,#7 - and w17,w26,w25 - ror w8,w1,#17 - bic w28,w27,w25 - ror w10,w21,#2 - add w20,w20,w2 // h+=X[i] - eor w16,w16,w25,ror#11 - eor w9,w9,w4,ror#18 - orr w17,w17,w28 // Ch(e,f,g) - eor w28,w21,w22 // a^b, b^c in next round - eor w16,w16,w25,ror#25 // Sigma1(e) - eor w10,w10,w21,ror#13 - add w20,w20,w17 // h+=Ch(e,f,g) - and w19,w19,w28 // (b^c)&=(a^b) - eor w8,w8,w1,ror#19 - eor w9,w9,w4,lsr#3 // sigma0(X[i+1]) - add w20,w20,w16 // h+=Sigma1(e) - eor w19,w19,w22 // Maj(a,b,c) - eor 
w17,w10,w21,ror#22 // Sigma0(a) - eor w8,w8,w1,lsr#10 // sigma1(X[i+14]) - add w3,w3,w12 - add w24,w24,w20 // d+=h - add w20,w20,w19 // h+=Maj(a,b,c) - ldr w19,[x30],#4 // *K++, w28 in next round - add w3,w3,w9 - add w20,w20,w17 // h+=Sigma0(a) - add w3,w3,w8 - cbnz w19,.Loop_16_xx - - ldp x0,x2,[x29,#96] - ldr x1,[x29,#112] - sub x30,x30,#260 // rewind - - ldp w3,w4,[x0] - ldp w5,w6,[x0,#2*4] - add x1,x1,#14*4 // advance input pointer - ldp w7,w8,[x0,#4*4] - add w20,w20,w3 - ldp w9,w10,[x0,#6*4] - add w21,w21,w4 - add w22,w22,w5 - add w23,w23,w6 - stp w20,w21,[x0] - add w24,w24,w7 - add w25,w25,w8 - stp w22,w23,[x0,#2*4] - add w26,w26,w9 - add w27,w27,w10 - cmp x1,x2 - stp w24,w25,[x0,#4*4] - stp w26,w27,[x0,#6*4] - b.ne .Loop - - ldp x19,x20,[x29,#16] - add sp,sp,#4*4 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#128 - AARCH64_VALIDATE_LINK_REGISTER - ret -.size sha256_block_data_order,.-sha256_block_data_order - -.section .rodata -.align 6 -.type .LK256,%object -.LK256: -.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 -.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 -.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 -.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 -.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc -.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da -.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 -.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 -.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 -.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 -.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 -.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 -.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 -.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 -.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 -.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 -.long 0 //terminator -.size .LK256,.-.LK256 -.byte 
83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 -.align 2 -.text -#ifndef __KERNEL__ -.type sha256_block_armv8,%function -.align 6 -sha256_block_armv8: -.Lv8_entry: - // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - - ld1 {v0.4s,v1.4s},[x0] - adrp x3,.LK256 - add x3,x3,:lo12:.LK256 - -.Loop_hw: - ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64 - sub x2,x2,#1 - ld1 {v16.4s},[x3],#16 - rev32 v4.16b,v4.16b - rev32 v5.16b,v5.16b - rev32 v6.16b,v6.16b - rev32 v7.16b,v7.16b - orr v18.16b,v0.16b,v0.16b // offload - orr v19.16b,v1.16b,v1.16b - ld1 {v17.4s},[x3],#16 - add v16.4s,v16.4s,v4.4s -.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b - orr v2.16b,v0.16b,v0.16b -.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s -.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s -.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b - ld1 {v16.4s},[x3],#16 - add v17.4s,v17.4s,v5.4s -.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b - orr v2.16b,v0.16b,v0.16b -.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s -.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s -.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b - ld1 {v17.4s},[x3],#16 - add v16.4s,v16.4s,v6.4s -.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b - orr v2.16b,v0.16b,v0.16b -.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s -.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s -.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b - ld1 {v16.4s},[x3],#16 - add v17.4s,v17.4s,v7.4s -.inst 0x5e282887 //sha256su0 v7.16b,v4.16b - orr v2.16b,v0.16b,v0.16b -.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s -.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s -.inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b - ld1 {v17.4s},[x3],#16 - add v16.4s,v16.4s,v4.4s -.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b - orr 
v2.16b,v0.16b,v0.16b -.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s -.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s -.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b - ld1 {v16.4s},[x3],#16 - add v17.4s,v17.4s,v5.4s -.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b - orr v2.16b,v0.16b,v0.16b -.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s -.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s -.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b - ld1 {v17.4s},[x3],#16 - add v16.4s,v16.4s,v6.4s -.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b - orr v2.16b,v0.16b,v0.16b -.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s -.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s -.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b - ld1 {v16.4s},[x3],#16 - add v17.4s,v17.4s,v7.4s -.inst 0x5e282887 //sha256su0 v7.16b,v4.16b - orr v2.16b,v0.16b,v0.16b -.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s -.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s -.inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b - ld1 {v17.4s},[x3],#16 - add v16.4s,v16.4s,v4.4s -.inst 0x5e2828a4 //sha256su0 v4.16b,v5.16b - orr v2.16b,v0.16b,v0.16b -.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s -.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s -.inst 0x5e0760c4 //sha256su1 v4.16b,v6.16b,v7.16b - ld1 {v16.4s},[x3],#16 - add v17.4s,v17.4s,v5.4s -.inst 0x5e2828c5 //sha256su0 v5.16b,v6.16b - orr v2.16b,v0.16b,v0.16b -.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s -.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s -.inst 0x5e0460e5 //sha256su1 v5.16b,v7.16b,v4.16b - ld1 {v17.4s},[x3],#16 - add v16.4s,v16.4s,v6.4s -.inst 0x5e2828e6 //sha256su0 v6.16b,v7.16b - orr v2.16b,v0.16b,v0.16b -.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s -.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s -.inst 0x5e056086 //sha256su1 v6.16b,v4.16b,v5.16b - ld1 {v16.4s},[x3],#16 - add v17.4s,v17.4s,v7.4s -.inst 0x5e282887 //sha256su0 v7.16b,v4.16b - orr v2.16b,v0.16b,v0.16b -.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s -.inst 0x5e115041 //sha256h2 
v1.16b,v2.16b,v17.4s -.inst 0x5e0660a7 //sha256su1 v7.16b,v5.16b,v6.16b - ld1 {v17.4s},[x3],#16 - add v16.4s,v16.4s,v4.4s - orr v2.16b,v0.16b,v0.16b -.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s -.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s - - ld1 {v16.4s},[x3],#16 - add v17.4s,v17.4s,v5.4s - orr v2.16b,v0.16b,v0.16b -.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s -.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s - - ld1 {v17.4s},[x3] - add v16.4s,v16.4s,v6.4s - sub x3,x3,#64*4-16 // rewind - orr v2.16b,v0.16b,v0.16b -.inst 0x5e104020 //sha256h v0.16b,v1.16b,v16.4s -.inst 0x5e105041 //sha256h2 v1.16b,v2.16b,v16.4s - - add v17.4s,v17.4s,v7.4s - orr v2.16b,v0.16b,v0.16b -.inst 0x5e114020 //sha256h v0.16b,v1.16b,v17.4s -.inst 0x5e115041 //sha256h2 v1.16b,v2.16b,v17.4s - - add v0.4s,v0.4s,v18.4s - add v1.4s,v1.4s,v19.4s - - cbnz x2,.Loop_hw - - st1 {v0.4s,v1.4s},[x0] - - ldr x29,[sp],#16 - ret -.size sha256_block_armv8,.-sha256_block_armv8 -#endif -#endif -#endif // !OPENSSL_NO_ASM -.section .note.GNU-stack,"",%progbits diff --git a/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha512-armv8.S b/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha512-armv8.S deleted file mode 100644 index a3b458a2..00000000 --- a/third_party/boringssl/linux-aarch64/crypto/fipsmodule/sha512-armv8.S +++ /dev/null @@ -1,1617 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(__aarch64__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -// Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. -// -// Licensed under the OpenSSL license (the "License"). You may not use -// this file except in compliance with the License. 
You can obtain a copy -// in the file LICENSE in the source distribution or at -// https://www.openssl.org/source/license.html - -// ==================================================================== -// Written by Andy Polyakov for the OpenSSL -// project. The module is, however, dual licensed under OpenSSL and -// CRYPTOGAMS licenses depending on where you obtain it. For further -// details see http://www.openssl.org/~appro/cryptogams/. -// -// Permission to use under GPLv2 terms is granted. -// ==================================================================== -// -// SHA256/512 for ARMv8. -// -// Performance in cycles per processed byte and improvement coefficient -// over code generated with "default" compiler: -// -// SHA256-hw SHA256(*) SHA512 -// Apple A7 1.97 10.5 (+33%) 6.73 (-1%(**)) -// Cortex-A53 2.38 15.5 (+115%) 10.0 (+150%(***)) -// Cortex-A57 2.31 11.6 (+86%) 7.51 (+260%(***)) -// Denver 2.01 10.5 (+26%) 6.70 (+8%) -// X-Gene 20.0 (+100%) 12.8 (+300%(***)) -// Mongoose 2.36 13.0 (+50%) 8.36 (+33%) -// Kryo 1.92 17.4 (+30%) 11.2 (+8%) -// -// (*) Software SHA256 results are of lesser relevance, presented -// mostly for informational purposes. -// (**) The result is a trade-off: it's possible to improve it by -// 10% (or by 1 cycle per round), but at the cost of 20% loss -// on Cortex-A53 (or by 4 cycles per round). -// (***) Super-impressive coefficients over gcc-generated code are -// indication of some compiler "pathology", most notably code -// generated with -mgeneral-regs-only is significantly faster -// and the gap is only 40-90%. 
- -#ifndef __KERNEL__ -# include -#endif - -.text - - -.hidden OPENSSL_armcap_P -.globl sha512_block_data_order -.hidden sha512_block_data_order -.type sha512_block_data_order,%function -.align 6 -sha512_block_data_order: - AARCH64_VALID_CALL_TARGET -#ifndef __KERNEL__ -#if __has_feature(hwaddress_sanitizer) && __clang_major__ >= 10 - adrp x16,:pg_hi21_nc:OPENSSL_armcap_P -#else - adrp x16,OPENSSL_armcap_P -#endif - ldr w16,[x16,:lo12:OPENSSL_armcap_P] - tst w16,#ARMV8_SHA512 - b.ne .Lv8_entry -#endif - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-128]! - add x29,sp,#0 - - stp x19,x20,[sp,#16] - stp x21,x22,[sp,#32] - stp x23,x24,[sp,#48] - stp x25,x26,[sp,#64] - stp x27,x28,[sp,#80] - sub sp,sp,#4*8 - - ldp x20,x21,[x0] // load context - ldp x22,x23,[x0,#2*8] - ldp x24,x25,[x0,#4*8] - add x2,x1,x2,lsl#7 // end of input - ldp x26,x27,[x0,#6*8] - adrp x30,.LK512 - add x30,x30,:lo12:.LK512 - stp x0,x2,[x29,#96] - -.Loop: - ldp x3,x4,[x1],#2*8 - ldr x19,[x30],#8 // *K++ - eor x28,x21,x22 // magic seed - str x1,[x29,#112] -#ifndef __AARCH64EB__ - rev x3,x3 // 0 -#endif - ror x16,x24,#14 - add x27,x27,x19 // h+=K[i] - eor x6,x24,x24,ror#23 - and x17,x25,x24 - bic x19,x26,x24 - add x27,x27,x3 // h+=X[i] - orr x17,x17,x19 // Ch(e,f,g) - eor x19,x20,x21 // a^b, b^c in next round - eor x16,x16,x6,ror#18 // Sigma1(e) - ror x6,x20,#28 - add x27,x27,x17 // h+=Ch(e,f,g) - eor x17,x20,x20,ror#5 - add x27,x27,x16 // h+=Sigma1(e) - and x28,x28,x19 // (b^c)&=(a^b) - add x23,x23,x27 // d+=h - eor x28,x28,x21 // Maj(a,b,c) - eor x17,x6,x17,ror#34 // Sigma0(a) - add x27,x27,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round - //add x27,x27,x17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev x4,x4 // 1 -#endif - ldp x5,x6,[x1],#2*8 - add x27,x27,x17 // h+=Sigma0(a) - ror x16,x23,#14 - add x26,x26,x28 // h+=K[i] - eor x7,x23,x23,ror#23 - and x17,x24,x23 - bic x28,x25,x23 - add x26,x26,x4 // h+=X[i] - orr x17,x17,x28 // Ch(e,f,g) - eor x28,x27,x20 // a^b, b^c in next round 
- eor x16,x16,x7,ror#18 // Sigma1(e) - ror x7,x27,#28 - add x26,x26,x17 // h+=Ch(e,f,g) - eor x17,x27,x27,ror#5 - add x26,x26,x16 // h+=Sigma1(e) - and x19,x19,x28 // (b^c)&=(a^b) - add x22,x22,x26 // d+=h - eor x19,x19,x20 // Maj(a,b,c) - eor x17,x7,x17,ror#34 // Sigma0(a) - add x26,x26,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round - //add x26,x26,x17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev x5,x5 // 2 -#endif - add x26,x26,x17 // h+=Sigma0(a) - ror x16,x22,#14 - add x25,x25,x19 // h+=K[i] - eor x8,x22,x22,ror#23 - and x17,x23,x22 - bic x19,x24,x22 - add x25,x25,x5 // h+=X[i] - orr x17,x17,x19 // Ch(e,f,g) - eor x19,x26,x27 // a^b, b^c in next round - eor x16,x16,x8,ror#18 // Sigma1(e) - ror x8,x26,#28 - add x25,x25,x17 // h+=Ch(e,f,g) - eor x17,x26,x26,ror#5 - add x25,x25,x16 // h+=Sigma1(e) - and x28,x28,x19 // (b^c)&=(a^b) - add x21,x21,x25 // d+=h - eor x28,x28,x27 // Maj(a,b,c) - eor x17,x8,x17,ror#34 // Sigma0(a) - add x25,x25,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round - //add x25,x25,x17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev x6,x6 // 3 -#endif - ldp x7,x8,[x1],#2*8 - add x25,x25,x17 // h+=Sigma0(a) - ror x16,x21,#14 - add x24,x24,x28 // h+=K[i] - eor x9,x21,x21,ror#23 - and x17,x22,x21 - bic x28,x23,x21 - add x24,x24,x6 // h+=X[i] - orr x17,x17,x28 // Ch(e,f,g) - eor x28,x25,x26 // a^b, b^c in next round - eor x16,x16,x9,ror#18 // Sigma1(e) - ror x9,x25,#28 - add x24,x24,x17 // h+=Ch(e,f,g) - eor x17,x25,x25,ror#5 - add x24,x24,x16 // h+=Sigma1(e) - and x19,x19,x28 // (b^c)&=(a^b) - add x20,x20,x24 // d+=h - eor x19,x19,x26 // Maj(a,b,c) - eor x17,x9,x17,ror#34 // Sigma0(a) - add x24,x24,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round - //add x24,x24,x17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev x7,x7 // 4 -#endif - add x24,x24,x17 // h+=Sigma0(a) - ror x16,x20,#14 - add x23,x23,x19 // h+=K[i] - eor x10,x20,x20,ror#23 - and x17,x21,x20 - bic x19,x22,x20 - add x23,x23,x7 // h+=X[i] 
- orr x17,x17,x19 // Ch(e,f,g) - eor x19,x24,x25 // a^b, b^c in next round - eor x16,x16,x10,ror#18 // Sigma1(e) - ror x10,x24,#28 - add x23,x23,x17 // h+=Ch(e,f,g) - eor x17,x24,x24,ror#5 - add x23,x23,x16 // h+=Sigma1(e) - and x28,x28,x19 // (b^c)&=(a^b) - add x27,x27,x23 // d+=h - eor x28,x28,x25 // Maj(a,b,c) - eor x17,x10,x17,ror#34 // Sigma0(a) - add x23,x23,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round - //add x23,x23,x17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev x8,x8 // 5 -#endif - ldp x9,x10,[x1],#2*8 - add x23,x23,x17 // h+=Sigma0(a) - ror x16,x27,#14 - add x22,x22,x28 // h+=K[i] - eor x11,x27,x27,ror#23 - and x17,x20,x27 - bic x28,x21,x27 - add x22,x22,x8 // h+=X[i] - orr x17,x17,x28 // Ch(e,f,g) - eor x28,x23,x24 // a^b, b^c in next round - eor x16,x16,x11,ror#18 // Sigma1(e) - ror x11,x23,#28 - add x22,x22,x17 // h+=Ch(e,f,g) - eor x17,x23,x23,ror#5 - add x22,x22,x16 // h+=Sigma1(e) - and x19,x19,x28 // (b^c)&=(a^b) - add x26,x26,x22 // d+=h - eor x19,x19,x24 // Maj(a,b,c) - eor x17,x11,x17,ror#34 // Sigma0(a) - add x22,x22,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round - //add x22,x22,x17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev x9,x9 // 6 -#endif - add x22,x22,x17 // h+=Sigma0(a) - ror x16,x26,#14 - add x21,x21,x19 // h+=K[i] - eor x12,x26,x26,ror#23 - and x17,x27,x26 - bic x19,x20,x26 - add x21,x21,x9 // h+=X[i] - orr x17,x17,x19 // Ch(e,f,g) - eor x19,x22,x23 // a^b, b^c in next round - eor x16,x16,x12,ror#18 // Sigma1(e) - ror x12,x22,#28 - add x21,x21,x17 // h+=Ch(e,f,g) - eor x17,x22,x22,ror#5 - add x21,x21,x16 // h+=Sigma1(e) - and x28,x28,x19 // (b^c)&=(a^b) - add x25,x25,x21 // d+=h - eor x28,x28,x23 // Maj(a,b,c) - eor x17,x12,x17,ror#34 // Sigma0(a) - add x21,x21,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round - //add x21,x21,x17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev x10,x10 // 7 -#endif - ldp x11,x12,[x1],#2*8 - add x21,x21,x17 // h+=Sigma0(a) - ror x16,x25,#14 - 
add x20,x20,x28 // h+=K[i] - eor x13,x25,x25,ror#23 - and x17,x26,x25 - bic x28,x27,x25 - add x20,x20,x10 // h+=X[i] - orr x17,x17,x28 // Ch(e,f,g) - eor x28,x21,x22 // a^b, b^c in next round - eor x16,x16,x13,ror#18 // Sigma1(e) - ror x13,x21,#28 - add x20,x20,x17 // h+=Ch(e,f,g) - eor x17,x21,x21,ror#5 - add x20,x20,x16 // h+=Sigma1(e) - and x19,x19,x28 // (b^c)&=(a^b) - add x24,x24,x20 // d+=h - eor x19,x19,x22 // Maj(a,b,c) - eor x17,x13,x17,ror#34 // Sigma0(a) - add x20,x20,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round - //add x20,x20,x17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev x11,x11 // 8 -#endif - add x20,x20,x17 // h+=Sigma0(a) - ror x16,x24,#14 - add x27,x27,x19 // h+=K[i] - eor x14,x24,x24,ror#23 - and x17,x25,x24 - bic x19,x26,x24 - add x27,x27,x11 // h+=X[i] - orr x17,x17,x19 // Ch(e,f,g) - eor x19,x20,x21 // a^b, b^c in next round - eor x16,x16,x14,ror#18 // Sigma1(e) - ror x14,x20,#28 - add x27,x27,x17 // h+=Ch(e,f,g) - eor x17,x20,x20,ror#5 - add x27,x27,x16 // h+=Sigma1(e) - and x28,x28,x19 // (b^c)&=(a^b) - add x23,x23,x27 // d+=h - eor x28,x28,x21 // Maj(a,b,c) - eor x17,x14,x17,ror#34 // Sigma0(a) - add x27,x27,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round - //add x27,x27,x17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev x12,x12 // 9 -#endif - ldp x13,x14,[x1],#2*8 - add x27,x27,x17 // h+=Sigma0(a) - ror x16,x23,#14 - add x26,x26,x28 // h+=K[i] - eor x15,x23,x23,ror#23 - and x17,x24,x23 - bic x28,x25,x23 - add x26,x26,x12 // h+=X[i] - orr x17,x17,x28 // Ch(e,f,g) - eor x28,x27,x20 // a^b, b^c in next round - eor x16,x16,x15,ror#18 // Sigma1(e) - ror x15,x27,#28 - add x26,x26,x17 // h+=Ch(e,f,g) - eor x17,x27,x27,ror#5 - add x26,x26,x16 // h+=Sigma1(e) - and x19,x19,x28 // (b^c)&=(a^b) - add x22,x22,x26 // d+=h - eor x19,x19,x20 // Maj(a,b,c) - eor x17,x15,x17,ror#34 // Sigma0(a) - add x26,x26,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round - //add x26,x26,x17 // h+=Sigma0(a) 
-#ifndef __AARCH64EB__ - rev x13,x13 // 10 -#endif - add x26,x26,x17 // h+=Sigma0(a) - ror x16,x22,#14 - add x25,x25,x19 // h+=K[i] - eor x0,x22,x22,ror#23 - and x17,x23,x22 - bic x19,x24,x22 - add x25,x25,x13 // h+=X[i] - orr x17,x17,x19 // Ch(e,f,g) - eor x19,x26,x27 // a^b, b^c in next round - eor x16,x16,x0,ror#18 // Sigma1(e) - ror x0,x26,#28 - add x25,x25,x17 // h+=Ch(e,f,g) - eor x17,x26,x26,ror#5 - add x25,x25,x16 // h+=Sigma1(e) - and x28,x28,x19 // (b^c)&=(a^b) - add x21,x21,x25 // d+=h - eor x28,x28,x27 // Maj(a,b,c) - eor x17,x0,x17,ror#34 // Sigma0(a) - add x25,x25,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round - //add x25,x25,x17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev x14,x14 // 11 -#endif - ldp x15,x0,[x1],#2*8 - add x25,x25,x17 // h+=Sigma0(a) - str x6,[sp,#24] - ror x16,x21,#14 - add x24,x24,x28 // h+=K[i] - eor x6,x21,x21,ror#23 - and x17,x22,x21 - bic x28,x23,x21 - add x24,x24,x14 // h+=X[i] - orr x17,x17,x28 // Ch(e,f,g) - eor x28,x25,x26 // a^b, b^c in next round - eor x16,x16,x6,ror#18 // Sigma1(e) - ror x6,x25,#28 - add x24,x24,x17 // h+=Ch(e,f,g) - eor x17,x25,x25,ror#5 - add x24,x24,x16 // h+=Sigma1(e) - and x19,x19,x28 // (b^c)&=(a^b) - add x20,x20,x24 // d+=h - eor x19,x19,x26 // Maj(a,b,c) - eor x17,x6,x17,ror#34 // Sigma0(a) - add x24,x24,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round - //add x24,x24,x17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev x15,x15 // 12 -#endif - add x24,x24,x17 // h+=Sigma0(a) - str x7,[sp,#0] - ror x16,x20,#14 - add x23,x23,x19 // h+=K[i] - eor x7,x20,x20,ror#23 - and x17,x21,x20 - bic x19,x22,x20 - add x23,x23,x15 // h+=X[i] - orr x17,x17,x19 // Ch(e,f,g) - eor x19,x24,x25 // a^b, b^c in next round - eor x16,x16,x7,ror#18 // Sigma1(e) - ror x7,x24,#28 - add x23,x23,x17 // h+=Ch(e,f,g) - eor x17,x24,x24,ror#5 - add x23,x23,x16 // h+=Sigma1(e) - and x28,x28,x19 // (b^c)&=(a^b) - add x27,x27,x23 // d+=h - eor x28,x28,x25 // Maj(a,b,c) - eor x17,x7,x17,ror#34 // 
Sigma0(a) - add x23,x23,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round - //add x23,x23,x17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev x0,x0 // 13 -#endif - ldp x1,x2,[x1] - add x23,x23,x17 // h+=Sigma0(a) - str x8,[sp,#8] - ror x16,x27,#14 - add x22,x22,x28 // h+=K[i] - eor x8,x27,x27,ror#23 - and x17,x20,x27 - bic x28,x21,x27 - add x22,x22,x0 // h+=X[i] - orr x17,x17,x28 // Ch(e,f,g) - eor x28,x23,x24 // a^b, b^c in next round - eor x16,x16,x8,ror#18 // Sigma1(e) - ror x8,x23,#28 - add x22,x22,x17 // h+=Ch(e,f,g) - eor x17,x23,x23,ror#5 - add x22,x22,x16 // h+=Sigma1(e) - and x19,x19,x28 // (b^c)&=(a^b) - add x26,x26,x22 // d+=h - eor x19,x19,x24 // Maj(a,b,c) - eor x17,x8,x17,ror#34 // Sigma0(a) - add x22,x22,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round - //add x22,x22,x17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev x1,x1 // 14 -#endif - ldr x6,[sp,#24] - add x22,x22,x17 // h+=Sigma0(a) - str x9,[sp,#16] - ror x16,x26,#14 - add x21,x21,x19 // h+=K[i] - eor x9,x26,x26,ror#23 - and x17,x27,x26 - bic x19,x20,x26 - add x21,x21,x1 // h+=X[i] - orr x17,x17,x19 // Ch(e,f,g) - eor x19,x22,x23 // a^b, b^c in next round - eor x16,x16,x9,ror#18 // Sigma1(e) - ror x9,x22,#28 - add x21,x21,x17 // h+=Ch(e,f,g) - eor x17,x22,x22,ror#5 - add x21,x21,x16 // h+=Sigma1(e) - and x28,x28,x19 // (b^c)&=(a^b) - add x25,x25,x21 // d+=h - eor x28,x28,x23 // Maj(a,b,c) - eor x17,x9,x17,ror#34 // Sigma0(a) - add x21,x21,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round - //add x21,x21,x17 // h+=Sigma0(a) -#ifndef __AARCH64EB__ - rev x2,x2 // 15 -#endif - ldr x7,[sp,#0] - add x21,x21,x17 // h+=Sigma0(a) - str x10,[sp,#24] - ror x16,x25,#14 - add x20,x20,x28 // h+=K[i] - ror x9,x4,#1 - and x17,x26,x25 - ror x8,x1,#19 - bic x28,x27,x25 - ror x10,x21,#28 - add x20,x20,x2 // h+=X[i] - eor x16,x16,x25,ror#18 - eor x9,x9,x4,ror#8 - orr x17,x17,x28 // Ch(e,f,g) - eor x28,x21,x22 // a^b, b^c in next round - eor x16,x16,x25,ror#41 // 
Sigma1(e) - eor x10,x10,x21,ror#34 - add x20,x20,x17 // h+=Ch(e,f,g) - and x19,x19,x28 // (b^c)&=(a^b) - eor x8,x8,x1,ror#61 - eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) - add x20,x20,x16 // h+=Sigma1(e) - eor x19,x19,x22 // Maj(a,b,c) - eor x17,x10,x21,ror#39 // Sigma0(a) - eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) - add x3,x3,x12 - add x24,x24,x20 // d+=h - add x20,x20,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round - add x3,x3,x9 - add x20,x20,x17 // h+=Sigma0(a) - add x3,x3,x8 -.Loop_16_xx: - ldr x8,[sp,#8] - str x11,[sp,#0] - ror x16,x24,#14 - add x27,x27,x19 // h+=K[i] - ror x10,x5,#1 - and x17,x25,x24 - ror x9,x2,#19 - bic x19,x26,x24 - ror x11,x20,#28 - add x27,x27,x3 // h+=X[i] - eor x16,x16,x24,ror#18 - eor x10,x10,x5,ror#8 - orr x17,x17,x19 // Ch(e,f,g) - eor x19,x20,x21 // a^b, b^c in next round - eor x16,x16,x24,ror#41 // Sigma1(e) - eor x11,x11,x20,ror#34 - add x27,x27,x17 // h+=Ch(e,f,g) - and x28,x28,x19 // (b^c)&=(a^b) - eor x9,x9,x2,ror#61 - eor x10,x10,x5,lsr#7 // sigma0(X[i+1]) - add x27,x27,x16 // h+=Sigma1(e) - eor x28,x28,x21 // Maj(a,b,c) - eor x17,x11,x20,ror#39 // Sigma0(a) - eor x9,x9,x2,lsr#6 // sigma1(X[i+14]) - add x4,x4,x13 - add x23,x23,x27 // d+=h - add x27,x27,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round - add x4,x4,x10 - add x27,x27,x17 // h+=Sigma0(a) - add x4,x4,x9 - ldr x9,[sp,#16] - str x12,[sp,#8] - ror x16,x23,#14 - add x26,x26,x28 // h+=K[i] - ror x11,x6,#1 - and x17,x24,x23 - ror x10,x3,#19 - bic x28,x25,x23 - ror x12,x27,#28 - add x26,x26,x4 // h+=X[i] - eor x16,x16,x23,ror#18 - eor x11,x11,x6,ror#8 - orr x17,x17,x28 // Ch(e,f,g) - eor x28,x27,x20 // a^b, b^c in next round - eor x16,x16,x23,ror#41 // Sigma1(e) - eor x12,x12,x27,ror#34 - add x26,x26,x17 // h+=Ch(e,f,g) - and x19,x19,x28 // (b^c)&=(a^b) - eor x10,x10,x3,ror#61 - eor x11,x11,x6,lsr#7 // sigma0(X[i+1]) - add x26,x26,x16 // h+=Sigma1(e) - eor x19,x19,x20 // Maj(a,b,c) - eor x17,x12,x27,ror#39 // Sigma0(a) - eor x10,x10,x3,lsr#6 
// sigma1(X[i+14]) - add x5,x5,x14 - add x22,x22,x26 // d+=h - add x26,x26,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round - add x5,x5,x11 - add x26,x26,x17 // h+=Sigma0(a) - add x5,x5,x10 - ldr x10,[sp,#24] - str x13,[sp,#16] - ror x16,x22,#14 - add x25,x25,x19 // h+=K[i] - ror x12,x7,#1 - and x17,x23,x22 - ror x11,x4,#19 - bic x19,x24,x22 - ror x13,x26,#28 - add x25,x25,x5 // h+=X[i] - eor x16,x16,x22,ror#18 - eor x12,x12,x7,ror#8 - orr x17,x17,x19 // Ch(e,f,g) - eor x19,x26,x27 // a^b, b^c in next round - eor x16,x16,x22,ror#41 // Sigma1(e) - eor x13,x13,x26,ror#34 - add x25,x25,x17 // h+=Ch(e,f,g) - and x28,x28,x19 // (b^c)&=(a^b) - eor x11,x11,x4,ror#61 - eor x12,x12,x7,lsr#7 // sigma0(X[i+1]) - add x25,x25,x16 // h+=Sigma1(e) - eor x28,x28,x27 // Maj(a,b,c) - eor x17,x13,x26,ror#39 // Sigma0(a) - eor x11,x11,x4,lsr#6 // sigma1(X[i+14]) - add x6,x6,x15 - add x21,x21,x25 // d+=h - add x25,x25,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round - add x6,x6,x12 - add x25,x25,x17 // h+=Sigma0(a) - add x6,x6,x11 - ldr x11,[sp,#0] - str x14,[sp,#24] - ror x16,x21,#14 - add x24,x24,x28 // h+=K[i] - ror x13,x8,#1 - and x17,x22,x21 - ror x12,x5,#19 - bic x28,x23,x21 - ror x14,x25,#28 - add x24,x24,x6 // h+=X[i] - eor x16,x16,x21,ror#18 - eor x13,x13,x8,ror#8 - orr x17,x17,x28 // Ch(e,f,g) - eor x28,x25,x26 // a^b, b^c in next round - eor x16,x16,x21,ror#41 // Sigma1(e) - eor x14,x14,x25,ror#34 - add x24,x24,x17 // h+=Ch(e,f,g) - and x19,x19,x28 // (b^c)&=(a^b) - eor x12,x12,x5,ror#61 - eor x13,x13,x8,lsr#7 // sigma0(X[i+1]) - add x24,x24,x16 // h+=Sigma1(e) - eor x19,x19,x26 // Maj(a,b,c) - eor x17,x14,x25,ror#39 // Sigma0(a) - eor x12,x12,x5,lsr#6 // sigma1(X[i+14]) - add x7,x7,x0 - add x20,x20,x24 // d+=h - add x24,x24,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round - add x7,x7,x13 - add x24,x24,x17 // h+=Sigma0(a) - add x7,x7,x12 - ldr x12,[sp,#8] - str x15,[sp,#0] - ror x16,x20,#14 - add x23,x23,x19 // h+=K[i] - 
ror x14,x9,#1 - and x17,x21,x20 - ror x13,x6,#19 - bic x19,x22,x20 - ror x15,x24,#28 - add x23,x23,x7 // h+=X[i] - eor x16,x16,x20,ror#18 - eor x14,x14,x9,ror#8 - orr x17,x17,x19 // Ch(e,f,g) - eor x19,x24,x25 // a^b, b^c in next round - eor x16,x16,x20,ror#41 // Sigma1(e) - eor x15,x15,x24,ror#34 - add x23,x23,x17 // h+=Ch(e,f,g) - and x28,x28,x19 // (b^c)&=(a^b) - eor x13,x13,x6,ror#61 - eor x14,x14,x9,lsr#7 // sigma0(X[i+1]) - add x23,x23,x16 // h+=Sigma1(e) - eor x28,x28,x25 // Maj(a,b,c) - eor x17,x15,x24,ror#39 // Sigma0(a) - eor x13,x13,x6,lsr#6 // sigma1(X[i+14]) - add x8,x8,x1 - add x27,x27,x23 // d+=h - add x23,x23,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round - add x8,x8,x14 - add x23,x23,x17 // h+=Sigma0(a) - add x8,x8,x13 - ldr x13,[sp,#16] - str x0,[sp,#8] - ror x16,x27,#14 - add x22,x22,x28 // h+=K[i] - ror x15,x10,#1 - and x17,x20,x27 - ror x14,x7,#19 - bic x28,x21,x27 - ror x0,x23,#28 - add x22,x22,x8 // h+=X[i] - eor x16,x16,x27,ror#18 - eor x15,x15,x10,ror#8 - orr x17,x17,x28 // Ch(e,f,g) - eor x28,x23,x24 // a^b, b^c in next round - eor x16,x16,x27,ror#41 // Sigma1(e) - eor x0,x0,x23,ror#34 - add x22,x22,x17 // h+=Ch(e,f,g) - and x19,x19,x28 // (b^c)&=(a^b) - eor x14,x14,x7,ror#61 - eor x15,x15,x10,lsr#7 // sigma0(X[i+1]) - add x22,x22,x16 // h+=Sigma1(e) - eor x19,x19,x24 // Maj(a,b,c) - eor x17,x0,x23,ror#39 // Sigma0(a) - eor x14,x14,x7,lsr#6 // sigma1(X[i+14]) - add x9,x9,x2 - add x26,x26,x22 // d+=h - add x22,x22,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round - add x9,x9,x15 - add x22,x22,x17 // h+=Sigma0(a) - add x9,x9,x14 - ldr x14,[sp,#24] - str x1,[sp,#16] - ror x16,x26,#14 - add x21,x21,x19 // h+=K[i] - ror x0,x11,#1 - and x17,x27,x26 - ror x15,x8,#19 - bic x19,x20,x26 - ror x1,x22,#28 - add x21,x21,x9 // h+=X[i] - eor x16,x16,x26,ror#18 - eor x0,x0,x11,ror#8 - orr x17,x17,x19 // Ch(e,f,g) - eor x19,x22,x23 // a^b, b^c in next round - eor x16,x16,x26,ror#41 // Sigma1(e) - eor x1,x1,x22,ror#34 - 
add x21,x21,x17 // h+=Ch(e,f,g) - and x28,x28,x19 // (b^c)&=(a^b) - eor x15,x15,x8,ror#61 - eor x0,x0,x11,lsr#7 // sigma0(X[i+1]) - add x21,x21,x16 // h+=Sigma1(e) - eor x28,x28,x23 // Maj(a,b,c) - eor x17,x1,x22,ror#39 // Sigma0(a) - eor x15,x15,x8,lsr#6 // sigma1(X[i+14]) - add x10,x10,x3 - add x25,x25,x21 // d+=h - add x21,x21,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round - add x10,x10,x0 - add x21,x21,x17 // h+=Sigma0(a) - add x10,x10,x15 - ldr x15,[sp,#0] - str x2,[sp,#24] - ror x16,x25,#14 - add x20,x20,x28 // h+=K[i] - ror x1,x12,#1 - and x17,x26,x25 - ror x0,x9,#19 - bic x28,x27,x25 - ror x2,x21,#28 - add x20,x20,x10 // h+=X[i] - eor x16,x16,x25,ror#18 - eor x1,x1,x12,ror#8 - orr x17,x17,x28 // Ch(e,f,g) - eor x28,x21,x22 // a^b, b^c in next round - eor x16,x16,x25,ror#41 // Sigma1(e) - eor x2,x2,x21,ror#34 - add x20,x20,x17 // h+=Ch(e,f,g) - and x19,x19,x28 // (b^c)&=(a^b) - eor x0,x0,x9,ror#61 - eor x1,x1,x12,lsr#7 // sigma0(X[i+1]) - add x20,x20,x16 // h+=Sigma1(e) - eor x19,x19,x22 // Maj(a,b,c) - eor x17,x2,x21,ror#39 // Sigma0(a) - eor x0,x0,x9,lsr#6 // sigma1(X[i+14]) - add x11,x11,x4 - add x24,x24,x20 // d+=h - add x20,x20,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round - add x11,x11,x1 - add x20,x20,x17 // h+=Sigma0(a) - add x11,x11,x0 - ldr x0,[sp,#8] - str x3,[sp,#0] - ror x16,x24,#14 - add x27,x27,x19 // h+=K[i] - ror x2,x13,#1 - and x17,x25,x24 - ror x1,x10,#19 - bic x19,x26,x24 - ror x3,x20,#28 - add x27,x27,x11 // h+=X[i] - eor x16,x16,x24,ror#18 - eor x2,x2,x13,ror#8 - orr x17,x17,x19 // Ch(e,f,g) - eor x19,x20,x21 // a^b, b^c in next round - eor x16,x16,x24,ror#41 // Sigma1(e) - eor x3,x3,x20,ror#34 - add x27,x27,x17 // h+=Ch(e,f,g) - and x28,x28,x19 // (b^c)&=(a^b) - eor x1,x1,x10,ror#61 - eor x2,x2,x13,lsr#7 // sigma0(X[i+1]) - add x27,x27,x16 // h+=Sigma1(e) - eor x28,x28,x21 // Maj(a,b,c) - eor x17,x3,x20,ror#39 // Sigma0(a) - eor x1,x1,x10,lsr#6 // sigma1(X[i+14]) - add x12,x12,x5 - add 
x23,x23,x27 // d+=h - add x27,x27,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round - add x12,x12,x2 - add x27,x27,x17 // h+=Sigma0(a) - add x12,x12,x1 - ldr x1,[sp,#16] - str x4,[sp,#8] - ror x16,x23,#14 - add x26,x26,x28 // h+=K[i] - ror x3,x14,#1 - and x17,x24,x23 - ror x2,x11,#19 - bic x28,x25,x23 - ror x4,x27,#28 - add x26,x26,x12 // h+=X[i] - eor x16,x16,x23,ror#18 - eor x3,x3,x14,ror#8 - orr x17,x17,x28 // Ch(e,f,g) - eor x28,x27,x20 // a^b, b^c in next round - eor x16,x16,x23,ror#41 // Sigma1(e) - eor x4,x4,x27,ror#34 - add x26,x26,x17 // h+=Ch(e,f,g) - and x19,x19,x28 // (b^c)&=(a^b) - eor x2,x2,x11,ror#61 - eor x3,x3,x14,lsr#7 // sigma0(X[i+1]) - add x26,x26,x16 // h+=Sigma1(e) - eor x19,x19,x20 // Maj(a,b,c) - eor x17,x4,x27,ror#39 // Sigma0(a) - eor x2,x2,x11,lsr#6 // sigma1(X[i+14]) - add x13,x13,x6 - add x22,x22,x26 // d+=h - add x26,x26,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round - add x13,x13,x3 - add x26,x26,x17 // h+=Sigma0(a) - add x13,x13,x2 - ldr x2,[sp,#24] - str x5,[sp,#16] - ror x16,x22,#14 - add x25,x25,x19 // h+=K[i] - ror x4,x15,#1 - and x17,x23,x22 - ror x3,x12,#19 - bic x19,x24,x22 - ror x5,x26,#28 - add x25,x25,x13 // h+=X[i] - eor x16,x16,x22,ror#18 - eor x4,x4,x15,ror#8 - orr x17,x17,x19 // Ch(e,f,g) - eor x19,x26,x27 // a^b, b^c in next round - eor x16,x16,x22,ror#41 // Sigma1(e) - eor x5,x5,x26,ror#34 - add x25,x25,x17 // h+=Ch(e,f,g) - and x28,x28,x19 // (b^c)&=(a^b) - eor x3,x3,x12,ror#61 - eor x4,x4,x15,lsr#7 // sigma0(X[i+1]) - add x25,x25,x16 // h+=Sigma1(e) - eor x28,x28,x27 // Maj(a,b,c) - eor x17,x5,x26,ror#39 // Sigma0(a) - eor x3,x3,x12,lsr#6 // sigma1(X[i+14]) - add x14,x14,x7 - add x21,x21,x25 // d+=h - add x25,x25,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round - add x14,x14,x4 - add x25,x25,x17 // h+=Sigma0(a) - add x14,x14,x3 - ldr x3,[sp,#0] - str x6,[sp,#24] - ror x16,x21,#14 - add x24,x24,x28 // h+=K[i] - ror x5,x0,#1 - and x17,x22,x21 - ror x4,x13,#19 - 
bic x28,x23,x21 - ror x6,x25,#28 - add x24,x24,x14 // h+=X[i] - eor x16,x16,x21,ror#18 - eor x5,x5,x0,ror#8 - orr x17,x17,x28 // Ch(e,f,g) - eor x28,x25,x26 // a^b, b^c in next round - eor x16,x16,x21,ror#41 // Sigma1(e) - eor x6,x6,x25,ror#34 - add x24,x24,x17 // h+=Ch(e,f,g) - and x19,x19,x28 // (b^c)&=(a^b) - eor x4,x4,x13,ror#61 - eor x5,x5,x0,lsr#7 // sigma0(X[i+1]) - add x24,x24,x16 // h+=Sigma1(e) - eor x19,x19,x26 // Maj(a,b,c) - eor x17,x6,x25,ror#39 // Sigma0(a) - eor x4,x4,x13,lsr#6 // sigma1(X[i+14]) - add x15,x15,x8 - add x20,x20,x24 // d+=h - add x24,x24,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round - add x15,x15,x5 - add x24,x24,x17 // h+=Sigma0(a) - add x15,x15,x4 - ldr x4,[sp,#8] - str x7,[sp,#0] - ror x16,x20,#14 - add x23,x23,x19 // h+=K[i] - ror x6,x1,#1 - and x17,x21,x20 - ror x5,x14,#19 - bic x19,x22,x20 - ror x7,x24,#28 - add x23,x23,x15 // h+=X[i] - eor x16,x16,x20,ror#18 - eor x6,x6,x1,ror#8 - orr x17,x17,x19 // Ch(e,f,g) - eor x19,x24,x25 // a^b, b^c in next round - eor x16,x16,x20,ror#41 // Sigma1(e) - eor x7,x7,x24,ror#34 - add x23,x23,x17 // h+=Ch(e,f,g) - and x28,x28,x19 // (b^c)&=(a^b) - eor x5,x5,x14,ror#61 - eor x6,x6,x1,lsr#7 // sigma0(X[i+1]) - add x23,x23,x16 // h+=Sigma1(e) - eor x28,x28,x25 // Maj(a,b,c) - eor x17,x7,x24,ror#39 // Sigma0(a) - eor x5,x5,x14,lsr#6 // sigma1(X[i+14]) - add x0,x0,x9 - add x27,x27,x23 // d+=h - add x23,x23,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round - add x0,x0,x6 - add x23,x23,x17 // h+=Sigma0(a) - add x0,x0,x5 - ldr x5,[sp,#16] - str x8,[sp,#8] - ror x16,x27,#14 - add x22,x22,x28 // h+=K[i] - ror x7,x2,#1 - and x17,x20,x27 - ror x6,x15,#19 - bic x28,x21,x27 - ror x8,x23,#28 - add x22,x22,x0 // h+=X[i] - eor x16,x16,x27,ror#18 - eor x7,x7,x2,ror#8 - orr x17,x17,x28 // Ch(e,f,g) - eor x28,x23,x24 // a^b, b^c in next round - eor x16,x16,x27,ror#41 // Sigma1(e) - eor x8,x8,x23,ror#34 - add x22,x22,x17 // h+=Ch(e,f,g) - and x19,x19,x28 // (b^c)&=(a^b) - eor 
x6,x6,x15,ror#61 - eor x7,x7,x2,lsr#7 // sigma0(X[i+1]) - add x22,x22,x16 // h+=Sigma1(e) - eor x19,x19,x24 // Maj(a,b,c) - eor x17,x8,x23,ror#39 // Sigma0(a) - eor x6,x6,x15,lsr#6 // sigma1(X[i+14]) - add x1,x1,x10 - add x26,x26,x22 // d+=h - add x22,x22,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round - add x1,x1,x7 - add x22,x22,x17 // h+=Sigma0(a) - add x1,x1,x6 - ldr x6,[sp,#24] - str x9,[sp,#16] - ror x16,x26,#14 - add x21,x21,x19 // h+=K[i] - ror x8,x3,#1 - and x17,x27,x26 - ror x7,x0,#19 - bic x19,x20,x26 - ror x9,x22,#28 - add x21,x21,x1 // h+=X[i] - eor x16,x16,x26,ror#18 - eor x8,x8,x3,ror#8 - orr x17,x17,x19 // Ch(e,f,g) - eor x19,x22,x23 // a^b, b^c in next round - eor x16,x16,x26,ror#41 // Sigma1(e) - eor x9,x9,x22,ror#34 - add x21,x21,x17 // h+=Ch(e,f,g) - and x28,x28,x19 // (b^c)&=(a^b) - eor x7,x7,x0,ror#61 - eor x8,x8,x3,lsr#7 // sigma0(X[i+1]) - add x21,x21,x16 // h+=Sigma1(e) - eor x28,x28,x23 // Maj(a,b,c) - eor x17,x9,x22,ror#39 // Sigma0(a) - eor x7,x7,x0,lsr#6 // sigma1(X[i+14]) - add x2,x2,x11 - add x25,x25,x21 // d+=h - add x21,x21,x28 // h+=Maj(a,b,c) - ldr x28,[x30],#8 // *K++, x19 in next round - add x2,x2,x8 - add x21,x21,x17 // h+=Sigma0(a) - add x2,x2,x7 - ldr x7,[sp,#0] - str x10,[sp,#24] - ror x16,x25,#14 - add x20,x20,x28 // h+=K[i] - ror x9,x4,#1 - and x17,x26,x25 - ror x8,x1,#19 - bic x28,x27,x25 - ror x10,x21,#28 - add x20,x20,x2 // h+=X[i] - eor x16,x16,x25,ror#18 - eor x9,x9,x4,ror#8 - orr x17,x17,x28 // Ch(e,f,g) - eor x28,x21,x22 // a^b, b^c in next round - eor x16,x16,x25,ror#41 // Sigma1(e) - eor x10,x10,x21,ror#34 - add x20,x20,x17 // h+=Ch(e,f,g) - and x19,x19,x28 // (b^c)&=(a^b) - eor x8,x8,x1,ror#61 - eor x9,x9,x4,lsr#7 // sigma0(X[i+1]) - add x20,x20,x16 // h+=Sigma1(e) - eor x19,x19,x22 // Maj(a,b,c) - eor x17,x10,x21,ror#39 // Sigma0(a) - eor x8,x8,x1,lsr#6 // sigma1(X[i+14]) - add x3,x3,x12 - add x24,x24,x20 // d+=h - add x20,x20,x19 // h+=Maj(a,b,c) - ldr x19,[x30],#8 // *K++, x28 in next round 
- add x3,x3,x9 - add x20,x20,x17 // h+=Sigma0(a) - add x3,x3,x8 - cbnz x19,.Loop_16_xx - - ldp x0,x2,[x29,#96] - ldr x1,[x29,#112] - sub x30,x30,#648 // rewind - - ldp x3,x4,[x0] - ldp x5,x6,[x0,#2*8] - add x1,x1,#14*8 // advance input pointer - ldp x7,x8,[x0,#4*8] - add x20,x20,x3 - ldp x9,x10,[x0,#6*8] - add x21,x21,x4 - add x22,x22,x5 - add x23,x23,x6 - stp x20,x21,[x0] - add x24,x24,x7 - add x25,x25,x8 - stp x22,x23,[x0,#2*8] - add x26,x26,x9 - add x27,x27,x10 - cmp x1,x2 - stp x24,x25,[x0,#4*8] - stp x26,x27,[x0,#6*8] - b.ne .Loop - - ldp x19,x20,[x29,#16] - add sp,sp,#4*8 - ldp x21,x22,[x29,#32] - ldp x23,x24,[x29,#48] - ldp x25,x26,[x29,#64] - ldp x27,x28,[x29,#80] - ldp x29,x30,[sp],#128 - AARCH64_VALIDATE_LINK_REGISTER - ret -.size sha512_block_data_order,.-sha512_block_data_order - -.section .rodata -.align 6 -.type .LK512,%object -.LK512: -.quad 0x428a2f98d728ae22,0x7137449123ef65cd -.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc -.quad 0x3956c25bf348b538,0x59f111f1b605d019 -.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 -.quad 0xd807aa98a3030242,0x12835b0145706fbe -.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 -.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 -.quad 0x9bdc06a725c71235,0xc19bf174cf692694 -.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 -.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 -.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 -.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 -.quad 0x983e5152ee66dfab,0xa831c66d2db43210 -.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 -.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 -.quad 0x06ca6351e003826f,0x142929670a0e6e70 -.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 -.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df -.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 -.quad 0x81c2c92e47edaee6,0x92722c851482353b -.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 -.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 -.quad 0xd192e819d6ef5218,0xd69906245565a910 -.quad 0xf40e35855771202a,0x106aa07032bbd1b8 -.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 -.quad 
0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 -.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb -.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 -.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 -.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec -.quad 0x90befffa23631e28,0xa4506cebde82bde9 -.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b -.quad 0xca273eceea26619c,0xd186b8c721c0c207 -.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 -.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 -.quad 0x113f9804bef90dae,0x1b710b35131c471b -.quad 0x28db77f523047d84,0x32caab7b40c72493 -.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c -.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a -.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 -.quad 0 // terminator -.size .LK512,.-.LK512 -.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 -.align 2 -.text -#ifndef __KERNEL__ -.type sha512_block_armv8,%function -.align 6 -sha512_block_armv8: -.Lv8_entry: - stp x29,x30,[sp,#-16]! 
- add x29,sp,#0 - - ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64 // load input - ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64 - - ld1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // load context - adrp x3,.LK512 - add x3,x3,:lo12:.LK512 - - rev64 v16.16b,v16.16b - rev64 v17.16b,v17.16b - rev64 v18.16b,v18.16b - rev64 v19.16b,v19.16b - rev64 v20.16b,v20.16b - rev64 v21.16b,v21.16b - rev64 v22.16b,v22.16b - rev64 v23.16b,v23.16b - b .Loop_hw - -.align 4 -.Loop_hw: - ld1 {v24.2d},[x3],#16 - subs x2,x2,#1 - sub x4,x1,#128 - orr v26.16b,v0.16b,v0.16b // offload - orr v27.16b,v1.16b,v1.16b - orr v28.16b,v2.16b,v2.16b - orr v29.16b,v3.16b,v3.16b - csel x1,x1,x4,ne // conditional rewind - add v24.2d,v24.2d,v16.2d - ld1 {v25.2d},[x3],#16 - ext v24.16b,v24.16b,v24.16b,#8 - ext v5.16b,v2.16b,v3.16b,#8 - ext v6.16b,v1.16b,v2.16b,#8 - add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" -.inst 0xcec08230 //sha512su0 v16.16b,v17.16b - ext v7.16b,v20.16b,v21.16b,#8 -.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b -.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b - add v4.2d,v1.2d,v3.2d // "D + T1" -.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b - add v25.2d,v25.2d,v17.2d - ld1 {v24.2d},[x3],#16 - ext v25.16b,v25.16b,v25.16b,#8 - ext v5.16b,v4.16b,v2.16b,#8 - ext v6.16b,v0.16b,v4.16b,#8 - add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" -.inst 0xcec08251 //sha512su0 v17.16b,v18.16b - ext v7.16b,v21.16b,v22.16b,#8 -.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b -.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b - add v1.2d,v0.2d,v2.2d // "D + T1" -.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b - add v24.2d,v24.2d,v18.2d - ld1 {v25.2d},[x3],#16 - ext v24.16b,v24.16b,v24.16b,#8 - ext v5.16b,v1.16b,v4.16b,#8 - ext v6.16b,v3.16b,v1.16b,#8 - add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" -.inst 0xcec08272 //sha512su0 v18.16b,v19.16b - ext v7.16b,v22.16b,v23.16b,#8 -.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b -.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b - add v0.2d,v3.2d,v4.2d // "D + T1" 
-.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b - add v25.2d,v25.2d,v19.2d - ld1 {v24.2d},[x3],#16 - ext v25.16b,v25.16b,v25.16b,#8 - ext v5.16b,v0.16b,v1.16b,#8 - ext v6.16b,v2.16b,v0.16b,#8 - add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" -.inst 0xcec08293 //sha512su0 v19.16b,v20.16b - ext v7.16b,v23.16b,v16.16b,#8 -.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b -.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b - add v3.2d,v2.2d,v1.2d // "D + T1" -.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b - add v24.2d,v24.2d,v20.2d - ld1 {v25.2d},[x3],#16 - ext v24.16b,v24.16b,v24.16b,#8 - ext v5.16b,v3.16b,v0.16b,#8 - ext v6.16b,v4.16b,v3.16b,#8 - add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" -.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b - ext v7.16b,v16.16b,v17.16b,#8 -.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b -.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b - add v2.2d,v4.2d,v0.2d // "D + T1" -.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b - add v25.2d,v25.2d,v21.2d - ld1 {v24.2d},[x3],#16 - ext v25.16b,v25.16b,v25.16b,#8 - ext v5.16b,v2.16b,v3.16b,#8 - ext v6.16b,v1.16b,v2.16b,#8 - add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" -.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b - ext v7.16b,v17.16b,v18.16b,#8 -.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b -.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b - add v4.2d,v1.2d,v3.2d // "D + T1" -.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b - add v24.2d,v24.2d,v22.2d - ld1 {v25.2d},[x3],#16 - ext v24.16b,v24.16b,v24.16b,#8 - ext v5.16b,v4.16b,v2.16b,#8 - ext v6.16b,v0.16b,v4.16b,#8 - add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" -.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b - ext v7.16b,v18.16b,v19.16b,#8 -.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b -.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b - add v1.2d,v0.2d,v2.2d // "D + T1" -.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b - add v25.2d,v25.2d,v23.2d - ld1 {v24.2d},[x3],#16 - ext v25.16b,v25.16b,v25.16b,#8 - ext v5.16b,v1.16b,v4.16b,#8 - ext 
v6.16b,v3.16b,v1.16b,#8 - add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" -.inst 0xcec08217 //sha512su0 v23.16b,v16.16b - ext v7.16b,v19.16b,v20.16b,#8 -.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b -.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b - add v0.2d,v3.2d,v4.2d // "D + T1" -.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b - add v24.2d,v24.2d,v16.2d - ld1 {v25.2d},[x3],#16 - ext v24.16b,v24.16b,v24.16b,#8 - ext v5.16b,v0.16b,v1.16b,#8 - ext v6.16b,v2.16b,v0.16b,#8 - add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" -.inst 0xcec08230 //sha512su0 v16.16b,v17.16b - ext v7.16b,v20.16b,v21.16b,#8 -.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b -.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b - add v3.2d,v2.2d,v1.2d // "D + T1" -.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b - add v25.2d,v25.2d,v17.2d - ld1 {v24.2d},[x3],#16 - ext v25.16b,v25.16b,v25.16b,#8 - ext v5.16b,v3.16b,v0.16b,#8 - ext v6.16b,v4.16b,v3.16b,#8 - add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" -.inst 0xcec08251 //sha512su0 v17.16b,v18.16b - ext v7.16b,v21.16b,v22.16b,#8 -.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b -.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b - add v2.2d,v4.2d,v0.2d // "D + T1" -.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b - add v24.2d,v24.2d,v18.2d - ld1 {v25.2d},[x3],#16 - ext v24.16b,v24.16b,v24.16b,#8 - ext v5.16b,v2.16b,v3.16b,#8 - ext v6.16b,v1.16b,v2.16b,#8 - add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" -.inst 0xcec08272 //sha512su0 v18.16b,v19.16b - ext v7.16b,v22.16b,v23.16b,#8 -.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b -.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b - add v4.2d,v1.2d,v3.2d // "D + T1" -.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b - add v25.2d,v25.2d,v19.2d - ld1 {v24.2d},[x3],#16 - ext v25.16b,v25.16b,v25.16b,#8 - ext v5.16b,v4.16b,v2.16b,#8 - ext v6.16b,v0.16b,v4.16b,#8 - add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" -.inst 0xcec08293 //sha512su0 v19.16b,v20.16b - ext v7.16b,v23.16b,v16.16b,#8 -.inst 0xce6680a2 
//sha512h v2.16b,v5.16b,v6.16b -.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b - add v1.2d,v0.2d,v2.2d // "D + T1" -.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b - add v24.2d,v24.2d,v20.2d - ld1 {v25.2d},[x3],#16 - ext v24.16b,v24.16b,v24.16b,#8 - ext v5.16b,v1.16b,v4.16b,#8 - ext v6.16b,v3.16b,v1.16b,#8 - add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" -.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b - ext v7.16b,v16.16b,v17.16b,#8 -.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b -.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b - add v0.2d,v3.2d,v4.2d // "D + T1" -.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b - add v25.2d,v25.2d,v21.2d - ld1 {v24.2d},[x3],#16 - ext v25.16b,v25.16b,v25.16b,#8 - ext v5.16b,v0.16b,v1.16b,#8 - ext v6.16b,v2.16b,v0.16b,#8 - add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" -.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b - ext v7.16b,v17.16b,v18.16b,#8 -.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b -.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b - add v3.2d,v2.2d,v1.2d // "D + T1" -.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b - add v24.2d,v24.2d,v22.2d - ld1 {v25.2d},[x3],#16 - ext v24.16b,v24.16b,v24.16b,#8 - ext v5.16b,v3.16b,v0.16b,#8 - ext v6.16b,v4.16b,v3.16b,#8 - add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" -.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b - ext v7.16b,v18.16b,v19.16b,#8 -.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b -.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b - add v2.2d,v4.2d,v0.2d // "D + T1" -.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b - add v25.2d,v25.2d,v23.2d - ld1 {v24.2d},[x3],#16 - ext v25.16b,v25.16b,v25.16b,#8 - ext v5.16b,v2.16b,v3.16b,#8 - ext v6.16b,v1.16b,v2.16b,#8 - add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" -.inst 0xcec08217 //sha512su0 v23.16b,v16.16b - ext v7.16b,v19.16b,v20.16b,#8 -.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b -.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b - add v4.2d,v1.2d,v3.2d // "D + T1" -.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b - 
add v24.2d,v24.2d,v16.2d - ld1 {v25.2d},[x3],#16 - ext v24.16b,v24.16b,v24.16b,#8 - ext v5.16b,v4.16b,v2.16b,#8 - ext v6.16b,v0.16b,v4.16b,#8 - add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" -.inst 0xcec08230 //sha512su0 v16.16b,v17.16b - ext v7.16b,v20.16b,v21.16b,#8 -.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b -.inst 0xce678af0 //sha512su1 v16.16b,v23.16b,v7.16b - add v1.2d,v0.2d,v2.2d // "D + T1" -.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b - add v25.2d,v25.2d,v17.2d - ld1 {v24.2d},[x3],#16 - ext v25.16b,v25.16b,v25.16b,#8 - ext v5.16b,v1.16b,v4.16b,#8 - ext v6.16b,v3.16b,v1.16b,#8 - add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" -.inst 0xcec08251 //sha512su0 v17.16b,v18.16b - ext v7.16b,v21.16b,v22.16b,#8 -.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b -.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b - add v0.2d,v3.2d,v4.2d // "D + T1" -.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b - add v24.2d,v24.2d,v18.2d - ld1 {v25.2d},[x3],#16 - ext v24.16b,v24.16b,v24.16b,#8 - ext v5.16b,v0.16b,v1.16b,#8 - ext v6.16b,v2.16b,v0.16b,#8 - add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" -.inst 0xcec08272 //sha512su0 v18.16b,v19.16b - ext v7.16b,v22.16b,v23.16b,#8 -.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b -.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b - add v3.2d,v2.2d,v1.2d // "D + T1" -.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b - add v25.2d,v25.2d,v19.2d - ld1 {v24.2d},[x3],#16 - ext v25.16b,v25.16b,v25.16b,#8 - ext v5.16b,v3.16b,v0.16b,#8 - ext v6.16b,v4.16b,v3.16b,#8 - add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" -.inst 0xcec08293 //sha512su0 v19.16b,v20.16b - ext v7.16b,v23.16b,v16.16b,#8 -.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b -.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b - add v2.2d,v4.2d,v0.2d // "D + T1" -.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b - add v24.2d,v24.2d,v20.2d - ld1 {v25.2d},[x3],#16 - ext v24.16b,v24.16b,v24.16b,#8 - ext v5.16b,v2.16b,v3.16b,#8 - ext v6.16b,v1.16b,v2.16b,#8 - add v3.2d,v3.2d,v24.2d // 
"T1 + H + K512[i]" -.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b - ext v7.16b,v16.16b,v17.16b,#8 -.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b -.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b - add v4.2d,v1.2d,v3.2d // "D + T1" -.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b - add v25.2d,v25.2d,v21.2d - ld1 {v24.2d},[x3],#16 - ext v25.16b,v25.16b,v25.16b,#8 - ext v5.16b,v4.16b,v2.16b,#8 - ext v6.16b,v0.16b,v4.16b,#8 - add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" -.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b - ext v7.16b,v17.16b,v18.16b,#8 -.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b -.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b - add v1.2d,v0.2d,v2.2d // "D + T1" -.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b - add v24.2d,v24.2d,v22.2d - ld1 {v25.2d},[x3],#16 - ext v24.16b,v24.16b,v24.16b,#8 - ext v5.16b,v1.16b,v4.16b,#8 - ext v6.16b,v3.16b,v1.16b,#8 - add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" -.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b - ext v7.16b,v18.16b,v19.16b,#8 -.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b -.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b - add v0.2d,v3.2d,v4.2d // "D + T1" -.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b - add v25.2d,v25.2d,v23.2d - ld1 {v24.2d},[x3],#16 - ext v25.16b,v25.16b,v25.16b,#8 - ext v5.16b,v0.16b,v1.16b,#8 - ext v6.16b,v2.16b,v0.16b,#8 - add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" -.inst 0xcec08217 //sha512su0 v23.16b,v16.16b - ext v7.16b,v19.16b,v20.16b,#8 -.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b -.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b - add v3.2d,v2.2d,v1.2d // "D + T1" -.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b - add v24.2d,v24.2d,v16.2d - ld1 {v25.2d},[x3],#16 - ext v24.16b,v24.16b,v24.16b,#8 - ext v5.16b,v3.16b,v0.16b,#8 - ext v6.16b,v4.16b,v3.16b,#8 - add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" -.inst 0xcec08230 //sha512su0 v16.16b,v17.16b - ext v7.16b,v20.16b,v21.16b,#8 -.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b -.inst 0xce678af0 
//sha512su1 v16.16b,v23.16b,v7.16b - add v2.2d,v4.2d,v0.2d // "D + T1" -.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b - add v25.2d,v25.2d,v17.2d - ld1 {v24.2d},[x3],#16 - ext v25.16b,v25.16b,v25.16b,#8 - ext v5.16b,v2.16b,v3.16b,#8 - ext v6.16b,v1.16b,v2.16b,#8 - add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" -.inst 0xcec08251 //sha512su0 v17.16b,v18.16b - ext v7.16b,v21.16b,v22.16b,#8 -.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b -.inst 0xce678a11 //sha512su1 v17.16b,v16.16b,v7.16b - add v4.2d,v1.2d,v3.2d // "D + T1" -.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b - add v24.2d,v24.2d,v18.2d - ld1 {v25.2d},[x3],#16 - ext v24.16b,v24.16b,v24.16b,#8 - ext v5.16b,v4.16b,v2.16b,#8 - ext v6.16b,v0.16b,v4.16b,#8 - add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" -.inst 0xcec08272 //sha512su0 v18.16b,v19.16b - ext v7.16b,v22.16b,v23.16b,#8 -.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b -.inst 0xce678a32 //sha512su1 v18.16b,v17.16b,v7.16b - add v1.2d,v0.2d,v2.2d // "D + T1" -.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b - add v25.2d,v25.2d,v19.2d - ld1 {v24.2d},[x3],#16 - ext v25.16b,v25.16b,v25.16b,#8 - ext v5.16b,v1.16b,v4.16b,#8 - ext v6.16b,v3.16b,v1.16b,#8 - add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" -.inst 0xcec08293 //sha512su0 v19.16b,v20.16b - ext v7.16b,v23.16b,v16.16b,#8 -.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b -.inst 0xce678a53 //sha512su1 v19.16b,v18.16b,v7.16b - add v0.2d,v3.2d,v4.2d // "D + T1" -.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b - add v24.2d,v24.2d,v20.2d - ld1 {v25.2d},[x3],#16 - ext v24.16b,v24.16b,v24.16b,#8 - ext v5.16b,v0.16b,v1.16b,#8 - ext v6.16b,v2.16b,v0.16b,#8 - add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" -.inst 0xcec082b4 //sha512su0 v20.16b,v21.16b - ext v7.16b,v16.16b,v17.16b,#8 -.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b -.inst 0xce678a74 //sha512su1 v20.16b,v19.16b,v7.16b - add v3.2d,v2.2d,v1.2d // "D + T1" -.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b - add v25.2d,v25.2d,v21.2d - ld1 {v24.2d},[x3],#16 
- ext v25.16b,v25.16b,v25.16b,#8 - ext v5.16b,v3.16b,v0.16b,#8 - ext v6.16b,v4.16b,v3.16b,#8 - add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" -.inst 0xcec082d5 //sha512su0 v21.16b,v22.16b - ext v7.16b,v17.16b,v18.16b,#8 -.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b -.inst 0xce678a95 //sha512su1 v21.16b,v20.16b,v7.16b - add v2.2d,v4.2d,v0.2d // "D + T1" -.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b - add v24.2d,v24.2d,v22.2d - ld1 {v25.2d},[x3],#16 - ext v24.16b,v24.16b,v24.16b,#8 - ext v5.16b,v2.16b,v3.16b,#8 - ext v6.16b,v1.16b,v2.16b,#8 - add v3.2d,v3.2d,v24.2d // "T1 + H + K512[i]" -.inst 0xcec082f6 //sha512su0 v22.16b,v23.16b - ext v7.16b,v18.16b,v19.16b,#8 -.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b -.inst 0xce678ab6 //sha512su1 v22.16b,v21.16b,v7.16b - add v4.2d,v1.2d,v3.2d // "D + T1" -.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b - add v25.2d,v25.2d,v23.2d - ld1 {v24.2d},[x3],#16 - ext v25.16b,v25.16b,v25.16b,#8 - ext v5.16b,v4.16b,v2.16b,#8 - ext v6.16b,v0.16b,v4.16b,#8 - add v2.2d,v2.2d,v25.2d // "T1 + H + K512[i]" -.inst 0xcec08217 //sha512su0 v23.16b,v16.16b - ext v7.16b,v19.16b,v20.16b,#8 -.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b -.inst 0xce678ad7 //sha512su1 v23.16b,v22.16b,v7.16b - add v1.2d,v0.2d,v2.2d // "D + T1" -.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b - ld1 {v25.2d},[x3],#16 - add v24.2d,v24.2d,v16.2d - ld1 {v16.16b},[x1],#16 // load next input - ext v24.16b,v24.16b,v24.16b,#8 - ext v5.16b,v1.16b,v4.16b,#8 - ext v6.16b,v3.16b,v1.16b,#8 - add v4.2d,v4.2d,v24.2d // "T1 + H + K512[i]" -.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b - rev64 v16.16b,v16.16b - add v0.2d,v3.2d,v4.2d // "D + T1" -.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b - ld1 {v24.2d},[x3],#16 - add v25.2d,v25.2d,v17.2d - ld1 {v17.16b},[x1],#16 // load next input - ext v25.16b,v25.16b,v25.16b,#8 - ext v5.16b,v0.16b,v1.16b,#8 - ext v6.16b,v2.16b,v0.16b,#8 - add v1.2d,v1.2d,v25.2d // "T1 + H + K512[i]" -.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b - 
rev64 v17.16b,v17.16b - add v3.2d,v2.2d,v1.2d // "D + T1" -.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b - ld1 {v25.2d},[x3],#16 - add v24.2d,v24.2d,v18.2d - ld1 {v18.16b},[x1],#16 // load next input - ext v24.16b,v24.16b,v24.16b,#8 - ext v5.16b,v3.16b,v0.16b,#8 - ext v6.16b,v4.16b,v3.16b,#8 - add v0.2d,v0.2d,v24.2d // "T1 + H + K512[i]" -.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b - rev64 v18.16b,v18.16b - add v2.2d,v4.2d,v0.2d // "D + T1" -.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b - ld1 {v24.2d},[x3],#16 - add v25.2d,v25.2d,v19.2d - ld1 {v19.16b},[x1],#16 // load next input - ext v25.16b,v25.16b,v25.16b,#8 - ext v5.16b,v2.16b,v3.16b,#8 - ext v6.16b,v1.16b,v2.16b,#8 - add v3.2d,v3.2d,v25.2d // "T1 + H + K512[i]" -.inst 0xce6680a3 //sha512h v3.16b,v5.16b,v6.16b - rev64 v19.16b,v19.16b - add v4.2d,v1.2d,v3.2d // "D + T1" -.inst 0xce608423 //sha512h2 v3.16b,v1.16b,v0.16b - ld1 {v25.2d},[x3],#16 - add v24.2d,v24.2d,v20.2d - ld1 {v20.16b},[x1],#16 // load next input - ext v24.16b,v24.16b,v24.16b,#8 - ext v5.16b,v4.16b,v2.16b,#8 - ext v6.16b,v0.16b,v4.16b,#8 - add v2.2d,v2.2d,v24.2d // "T1 + H + K512[i]" -.inst 0xce6680a2 //sha512h v2.16b,v5.16b,v6.16b - rev64 v20.16b,v20.16b - add v1.2d,v0.2d,v2.2d // "D + T1" -.inst 0xce638402 //sha512h2 v2.16b,v0.16b,v3.16b - ld1 {v24.2d},[x3],#16 - add v25.2d,v25.2d,v21.2d - ld1 {v21.16b},[x1],#16 // load next input - ext v25.16b,v25.16b,v25.16b,#8 - ext v5.16b,v1.16b,v4.16b,#8 - ext v6.16b,v3.16b,v1.16b,#8 - add v4.2d,v4.2d,v25.2d // "T1 + H + K512[i]" -.inst 0xce6680a4 //sha512h v4.16b,v5.16b,v6.16b - rev64 v21.16b,v21.16b - add v0.2d,v3.2d,v4.2d // "D + T1" -.inst 0xce628464 //sha512h2 v4.16b,v3.16b,v2.16b - ld1 {v25.2d},[x3],#16 - add v24.2d,v24.2d,v22.2d - ld1 {v22.16b},[x1],#16 // load next input - ext v24.16b,v24.16b,v24.16b,#8 - ext v5.16b,v0.16b,v1.16b,#8 - ext v6.16b,v2.16b,v0.16b,#8 - add v1.2d,v1.2d,v24.2d // "T1 + H + K512[i]" -.inst 0xce6680a1 //sha512h v1.16b,v5.16b,v6.16b - rev64 v22.16b,v22.16b - add 
v3.2d,v2.2d,v1.2d // "D + T1" -.inst 0xce648441 //sha512h2 v1.16b,v2.16b,v4.16b - sub x3,x3,#80*8 // rewind - add v25.2d,v25.2d,v23.2d - ld1 {v23.16b},[x1],#16 // load next input - ext v25.16b,v25.16b,v25.16b,#8 - ext v5.16b,v3.16b,v0.16b,#8 - ext v6.16b,v4.16b,v3.16b,#8 - add v0.2d,v0.2d,v25.2d // "T1 + H + K512[i]" -.inst 0xce6680a0 //sha512h v0.16b,v5.16b,v6.16b - rev64 v23.16b,v23.16b - add v2.2d,v4.2d,v0.2d // "D + T1" -.inst 0xce618480 //sha512h2 v0.16b,v4.16b,v1.16b - add v0.2d,v0.2d,v26.2d // accumulate - add v1.2d,v1.2d,v27.2d - add v2.2d,v2.2d,v28.2d - add v3.2d,v3.2d,v29.2d - - cbnz x2,.Loop_hw - - st1 {v0.2d,v1.2d,v2.2d,v3.2d},[x0] // store context - - ldr x29,[sp],#16 - ret -.size sha512_block_armv8,.-sha512_block_armv8 -#endif -#endif -#endif // !OPENSSL_NO_ASM -.section .note.GNU-stack,"",%progbits diff --git a/third_party/boringssl/linux-aarch64/crypto/fipsmodule/vpaes-armv8.S b/third_party/boringssl/linux-aarch64/crypto/fipsmodule/vpaes-armv8.S deleted file mode 100644 index 59b1d31d..00000000 --- a/third_party/boringssl/linux-aarch64/crypto/fipsmodule/vpaes-armv8.S +++ /dev/null @@ -1,1235 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(__aarch64__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -#include - -.section .rodata - -.type _vpaes_consts,%object -.align 7 // totally strategic alignment -_vpaes_consts: -.Lk_mc_forward: // mc_forward -.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 -.quad 0x080B0A0904070605, 0x000302010C0F0E0D -.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 -.quad 0x000302010C0F0E0D, 0x080B0A0904070605 -.Lk_mc_backward: // mc_backward -.quad 0x0605040702010003, 0x0E0D0C0F0A09080B -.quad 0x020100030E0D0C0F, 0x0A09080B06050407 -.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 -.quad 0x0A09080B06050407, 0x020100030E0D0C0F -.Lk_sr: // sr -.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 -.quad 0x030E09040F0A0500, 0x0B06010C07020D08 -.quad 0x0F060D040B020900, 0x070E050C030A0108 -.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 - -// -// "Hot" constants -// -.Lk_inv: // inv, inva -.quad 0x0E05060F0D080180, 0x040703090A0B0C02 -.quad 0x01040A060F0B0780, 0x030D0E0C02050809 -.Lk_ipt: // input transform (lo, hi) -.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 -.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 -.Lk_sbo: // sbou, sbot -.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 -.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA -.Lk_sb1: // sb1u, sb1t -.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF -.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 -.Lk_sb2: // sb2u, sb2t -.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A -.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD - -// -// Decryption stuff -// -.Lk_dipt: // decryption input transform -.quad 0x0F505B040B545F00, 0x154A411E114E451A -.quad 0x86E383E660056500, 0x12771772F491F194 -.Lk_dsbo: // decryption sbox final output -.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D -.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C -.Lk_dsb9: // decryption sbox output *9*u, *9*t -.quad 
0x851C03539A86D600, 0xCAD51F504F994CC9 -.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 -.Lk_dsbd: // decryption sbox output *D*u, *D*t -.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 -.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 -.Lk_dsbb: // decryption sbox output *B*u, *B*t -.quad 0xD022649296B44200, 0x602646F6B0F2D404 -.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B -.Lk_dsbe: // decryption sbox output *E*u, *E*t -.quad 0x46F2929626D4D000, 0x2242600464B4F6B0 -.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 - -// -// Key schedule constants -// -.Lk_dksd: // decryption key schedule: invskew x*D -.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 -.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E -.Lk_dksb: // decryption key schedule: invskew x*B -.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 -.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 -.Lk_dkse: // decryption key schedule: invskew x*E + 0x63 -.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 -.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 -.Lk_dks9: // decryption key schedule: invskew x*9 -.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC -.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE - -.Lk_rcon: // rcon -.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 - -.Lk_opt: // output transform -.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 -.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 -.Lk_deskew: // deskew tables: inverts the sbox's "skew" -.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A -.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 - -.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,56,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 -.align 2 -.size _vpaes_consts,.-_vpaes_consts -.align 6 - -.text -## -## _aes_preheat -## -## Fills register %r10 -> .aes_consts (so you can -fPIC) -## and %xmm9-%xmm15 as specified below. 
-## -.type _vpaes_encrypt_preheat,%function -.align 4 -_vpaes_encrypt_preheat: - adrp x10, .Lk_inv - add x10, x10, :lo12:.Lk_inv - movi v17.16b, #0x0f - ld1 {v18.2d,v19.2d}, [x10],#32 // .Lk_inv - ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x10],#64 // .Lk_ipt, .Lk_sbo - ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10] // .Lk_sb1, .Lk_sb2 - ret -.size _vpaes_encrypt_preheat,.-_vpaes_encrypt_preheat - -## -## _aes_encrypt_core -## -## AES-encrypt %xmm0. -## -## Inputs: -## %xmm0 = input -## %xmm9-%xmm15 as in _vpaes_preheat -## (%rdx) = scheduled keys -## -## Output in %xmm0 -## Clobbers %xmm1-%xmm5, %r9, %r10, %r11, %rax -## Preserves %xmm6 - %xmm8 so you get some local vectors -## -## -.type _vpaes_encrypt_core,%function -.align 4 -_vpaes_encrypt_core: - mov x9, x2 - ldr w8, [x2,#240] // pull rounds - adrp x11, .Lk_mc_forward+16 - add x11, x11, :lo12:.Lk_mc_forward+16 - // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo - ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key - and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 - ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0 - tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 - // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi - tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 - eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 - eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 - b .Lenc_entry - -.align 4 -.Lenc_loop: - // middle of middle round - add x10, x11, #0x40 - tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u - ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] - tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t - eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k - tbl v5.16b, {v27.16b}, v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u - eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A - tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t - ld1 {v4.2d}, 
[x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] - tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B - eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A - tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D - eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B - tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C - eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D - and x11, x11, #~(1<<6) // and $0x30, %r11 # ... mod 4 - eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D - sub w8, w8, #1 // nr-- - -.Lenc_entry: - // top of round - and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k - ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i - tbl v5.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k - eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j - tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i - tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j - eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k - eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k - tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak - tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak - eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io - eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo - ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 - cbnz w8, .Lenc_loop - - // middle of last round - add x10, x11, #0x80 - // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo - // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 - tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou - ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] - tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t - eor v4.16b, v4.16b, v16.16b // 
vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k - eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A - tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 - ret -.size _vpaes_encrypt_core,.-_vpaes_encrypt_core - -.globl vpaes_encrypt -.hidden vpaes_encrypt -.type vpaes_encrypt,%function -.align 4 -vpaes_encrypt: - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - - ld1 {v7.16b}, [x0] - bl _vpaes_encrypt_preheat - bl _vpaes_encrypt_core - st1 {v0.16b}, [x1] - - ldp x29,x30,[sp],#16 - AARCH64_VALIDATE_LINK_REGISTER - ret -.size vpaes_encrypt,.-vpaes_encrypt - -.type _vpaes_encrypt_2x,%function -.align 4 -_vpaes_encrypt_2x: - mov x9, x2 - ldr w8, [x2,#240] // pull rounds - adrp x11, .Lk_mc_forward+16 - add x11, x11, :lo12:.Lk_mc_forward+16 - // vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo - ld1 {v16.2d}, [x9], #16 // vmovdqu (%r9), %xmm5 # round0 key - and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 - ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 - and v9.16b, v15.16b, v17.16b - ushr v8.16b, v15.16b, #4 - tbl v1.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm1 - tbl v9.16b, {v20.16b}, v9.16b - // vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi - tbl v2.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm3, %xmm2 - tbl v10.16b, {v21.16b}, v8.16b - eor v0.16b, v1.16b, v16.16b // vpxor %xmm5, %xmm1, %xmm0 - eor v8.16b, v9.16b, v16.16b - eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 - eor v8.16b, v8.16b, v10.16b - b .Lenc_2x_entry - -.align 4 -.Lenc_2x_loop: - // middle of middle round - add x10, x11, #0x40 - tbl v4.16b, {v25.16b}, v2.16b // vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u - tbl v12.16b, {v25.16b}, v10.16b - ld1 {v1.2d}, [x11], #16 // vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] - tbl v0.16b, {v24.16b}, v3.16b // vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t - tbl v8.16b, {v24.16b}, v11.16b - eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k - eor v12.16b, v12.16b, v16.16b - tbl v5.16b, {v27.16b}, 
v2.16b // vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u - tbl v13.16b, {v27.16b}, v10.16b - eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A - eor v8.16b, v8.16b, v12.16b - tbl v2.16b, {v26.16b}, v3.16b // vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t - tbl v10.16b, {v26.16b}, v11.16b - ld1 {v4.2d}, [x10] // vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] - tbl v3.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm3 # 0 = B - tbl v11.16b, {v8.16b}, v1.16b - eor v2.16b, v2.16b, v5.16b // vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A - eor v10.16b, v10.16b, v13.16b - tbl v0.16b, {v0.16b}, v4.16b // vpshufb %xmm4, %xmm0, %xmm0 # 3 = D - tbl v8.16b, {v8.16b}, v4.16b - eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B - eor v11.16b, v11.16b, v10.16b - tbl v4.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C - tbl v12.16b, {v11.16b},v1.16b - eor v0.16b, v0.16b, v3.16b // vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D - eor v8.16b, v8.16b, v11.16b - and x11, x11, #~(1<<6) // and $0x30, %r11 # ... 
mod 4 - eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D - eor v8.16b, v8.16b, v12.16b - sub w8, w8, #1 // nr-- - -.Lenc_2x_entry: - // top of round - and v1.16b, v0.16b, v17.16b // vpand %xmm0, %xmm9, %xmm1 # 0 = k - ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i - and v9.16b, v8.16b, v17.16b - ushr v8.16b, v8.16b, #4 - tbl v5.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k - tbl v13.16b, {v19.16b},v9.16b - eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j - eor v9.16b, v9.16b, v8.16b - tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i - tbl v11.16b, {v18.16b},v8.16b - tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j - tbl v12.16b, {v18.16b},v9.16b - eor v3.16b, v3.16b, v5.16b // vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k - eor v11.16b, v11.16b, v13.16b - eor v4.16b, v4.16b, v5.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k - eor v12.16b, v12.16b, v13.16b - tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak - tbl v10.16b, {v18.16b},v11.16b - tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak - tbl v11.16b, {v18.16b},v12.16b - eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io - eor v10.16b, v10.16b, v9.16b - eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo - eor v11.16b, v11.16b, v8.16b - ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm5 - cbnz w8, .Lenc_2x_loop - - // middle of last round - add x10, x11, #0x80 - // vmovdqa -0x60(%r10), %xmm4 # 3 : sbou .Lk_sbo - // vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 - tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou - tbl v12.16b, {v22.16b}, v10.16b - ld1 {v1.2d}, [x10] // vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] - tbl v0.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t - tbl v8.16b, {v23.16b}, v11.16b - eor v4.16b, v4.16b, v16.16b // vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k - eor 
v12.16b, v12.16b, v16.16b - eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 0 = A - eor v8.16b, v8.16b, v12.16b - tbl v0.16b, {v0.16b},v1.16b // vpshufb %xmm1, %xmm0, %xmm0 - tbl v1.16b, {v8.16b},v1.16b - ret -.size _vpaes_encrypt_2x,.-_vpaes_encrypt_2x - -.type _vpaes_decrypt_preheat,%function -.align 4 -_vpaes_decrypt_preheat: - adrp x10, .Lk_inv - add x10, x10, :lo12:.Lk_inv - movi v17.16b, #0x0f - adrp x11, .Lk_dipt - add x11, x11, :lo12:.Lk_dipt - ld1 {v18.2d,v19.2d}, [x10],#32 // .Lk_inv - ld1 {v20.2d,v21.2d,v22.2d,v23.2d}, [x11],#64 // .Lk_dipt, .Lk_dsbo - ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x11],#64 // .Lk_dsb9, .Lk_dsbd - ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x11] // .Lk_dsbb, .Lk_dsbe - ret -.size _vpaes_decrypt_preheat,.-_vpaes_decrypt_preheat - -## -## Decryption core -## -## Same API as encryption core. -## -.type _vpaes_decrypt_core,%function -.align 4 -_vpaes_decrypt_core: - mov x9, x2 - ldr w8, [x2,#240] // pull rounds - - // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo - lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11 - eor x11, x11, #0x30 // xor $0x30, %r11 - adrp x10, .Lk_sr - add x10, x10, :lo12:.Lk_sr - and x11, x11, #0x30 // and $0x30, %r11 - add x11, x11, x10 - adrp x10, .Lk_mc_forward+48 - add x10, x10, :lo12:.Lk_mc_forward+48 - - ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key - and v1.16b, v7.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 - ushr v0.16b, v7.16b, #4 // vpsrlb $4, %xmm0, %xmm0 - tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 - ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5 - // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi - tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 - eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 - eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 - b .Ldec_entry - -.align 4 -.Ldec_loop: -// -// Inverse mix columns -// - // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u - // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t - tbl v4.16b, {v24.16b}, v2.16b // vpshufb 
%xmm2, %xmm4, %xmm4 # 4 = sb9u - tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t - eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 - // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu - eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch - // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt - - tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu - tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch - tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt - eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch - // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu - eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch - // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt - - tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu - tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch - tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt - eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch - // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu - eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch - // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet - - tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu - tbl v0.16b, {v0.16b}, v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch - tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet - eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch - ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5 - eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch - sub w8, w8, #1 // sub $1,%rax # nr-- - -.Ldec_entry: - // top of round - and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k - ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i - tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k - eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j - tbl v3.16b, {v18.16b}, v0.16b // vpshufb 
%xmm0, %xmm10, %xmm3 # 3 = 1/i - tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j - eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k - eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k - tbl v2.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak - tbl v3.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak - eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io - eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo - ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 - cbnz w8, .Ldec_loop - - // middle of last round - // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou - tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou - // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot - ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 - tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t - eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k - eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A - tbl v0.16b, {v0.16b}, v2.16b // vpshufb %xmm2, %xmm0, %xmm0 - ret -.size _vpaes_decrypt_core,.-_vpaes_decrypt_core - -.globl vpaes_decrypt -.hidden vpaes_decrypt -.type vpaes_decrypt,%function -.align 4 -vpaes_decrypt: - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! 
- add x29,sp,#0 - - ld1 {v7.16b}, [x0] - bl _vpaes_decrypt_preheat - bl _vpaes_decrypt_core - st1 {v0.16b}, [x1] - - ldp x29,x30,[sp],#16 - AARCH64_VALIDATE_LINK_REGISTER - ret -.size vpaes_decrypt,.-vpaes_decrypt - -// v14-v15 input, v0-v1 output -.type _vpaes_decrypt_2x,%function -.align 4 -_vpaes_decrypt_2x: - mov x9, x2 - ldr w8, [x2,#240] // pull rounds - - // vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo - lsl x11, x8, #4 // mov %rax, %r11; shl $4, %r11 - eor x11, x11, #0x30 // xor $0x30, %r11 - adrp x10, .Lk_sr - add x10, x10, :lo12:.Lk_sr - and x11, x11, #0x30 // and $0x30, %r11 - add x11, x11, x10 - adrp x10, .Lk_mc_forward+48 - add x10, x10, :lo12:.Lk_mc_forward+48 - - ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm4 # round0 key - and v1.16b, v14.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 - ushr v0.16b, v14.16b, #4 // vpsrlb $4, %xmm0, %xmm0 - and v9.16b, v15.16b, v17.16b - ushr v8.16b, v15.16b, #4 - tbl v2.16b, {v20.16b},v1.16b // vpshufb %xmm1, %xmm2, %xmm2 - tbl v10.16b, {v20.16b},v9.16b - ld1 {v5.2d}, [x10] // vmovdqa .Lk_mc_forward+48(%rip), %xmm5 - // vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi - tbl v0.16b, {v21.16b},v0.16b // vpshufb %xmm0, %xmm1, %xmm0 - tbl v8.16b, {v21.16b},v8.16b - eor v2.16b, v2.16b, v16.16b // vpxor %xmm4, %xmm2, %xmm2 - eor v10.16b, v10.16b, v16.16b - eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 - eor v8.16b, v8.16b, v10.16b - b .Ldec_2x_entry - -.align 4 -.Ldec_2x_loop: -// -// Inverse mix columns -// - // vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u - // vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t - tbl v4.16b, {v24.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u - tbl v12.16b, {v24.16b}, v10.16b - tbl v1.16b, {v25.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t - tbl v9.16b, {v25.16b}, v11.16b - eor v0.16b, v4.16b, v16.16b // vpxor %xmm4, %xmm0, %xmm0 - eor v8.16b, v12.16b, v16.16b - // vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu - eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch - eor v8.16b, v8.16b, v9.16b // 
vpxor %xmm1, %xmm0, %xmm0 # 0 = ch - // vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt - - tbl v4.16b, {v26.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu - tbl v12.16b, {v26.16b}, v10.16b - tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch - tbl v8.16b, {v8.16b},v5.16b - tbl v1.16b, {v27.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt - tbl v9.16b, {v27.16b}, v11.16b - eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch - eor v8.16b, v8.16b, v12.16b - // vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu - eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch - eor v8.16b, v8.16b, v9.16b - // vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt - - tbl v4.16b, {v28.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu - tbl v12.16b, {v28.16b}, v10.16b - tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch - tbl v8.16b, {v8.16b},v5.16b - tbl v1.16b, {v29.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt - tbl v9.16b, {v29.16b}, v11.16b - eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch - eor v8.16b, v8.16b, v12.16b - // vmovdqa 0x40(%r10), %xmm4 # 4 : sbeu - eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch - eor v8.16b, v8.16b, v9.16b - // vmovdqa 0x50(%r10), %xmm1 # 0 : sbet - - tbl v4.16b, {v30.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu - tbl v12.16b, {v30.16b}, v10.16b - tbl v0.16b, {v0.16b},v5.16b // vpshufb %xmm5, %xmm0, %xmm0 # MC ch - tbl v8.16b, {v8.16b},v5.16b - tbl v1.16b, {v31.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet - tbl v9.16b, {v31.16b}, v11.16b - eor v0.16b, v0.16b, v4.16b // vpxor %xmm4, %xmm0, %xmm0 # 4 = ch - eor v8.16b, v8.16b, v12.16b - ext v5.16b, v5.16b, v5.16b, #12 // vpalignr $12, %xmm5, %xmm5, %xmm5 - eor v0.16b, v0.16b, v1.16b // vpxor %xmm1, %xmm0, %xmm0 # 0 = ch - eor v8.16b, v8.16b, v9.16b - sub w8, w8, #1 // sub $1,%rax # nr-- - -.Ldec_2x_entry: - // top of round - and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k - ushr v0.16b, 
v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i - and v9.16b, v8.16b, v17.16b - ushr v8.16b, v8.16b, #4 - tbl v2.16b, {v19.16b},v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k - tbl v10.16b, {v19.16b},v9.16b - eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j - eor v9.16b, v9.16b, v8.16b - tbl v3.16b, {v18.16b},v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i - tbl v11.16b, {v18.16b},v8.16b - tbl v4.16b, {v18.16b},v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j - tbl v12.16b, {v18.16b},v9.16b - eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k - eor v11.16b, v11.16b, v10.16b - eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k - eor v12.16b, v12.16b, v10.16b - tbl v2.16b, {v18.16b},v3.16b // vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak - tbl v10.16b, {v18.16b},v11.16b - tbl v3.16b, {v18.16b},v4.16b // vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak - tbl v11.16b, {v18.16b},v12.16b - eor v2.16b, v2.16b, v1.16b // vpxor %xmm1, %xmm2, %xmm2 # 2 = io - eor v10.16b, v10.16b, v9.16b - eor v3.16b, v3.16b, v0.16b // vpxor %xmm0, %xmm3, %xmm3 # 3 = jo - eor v11.16b, v11.16b, v8.16b - ld1 {v16.2d}, [x9],#16 // vmovdqu (%r9), %xmm0 - cbnz w8, .Ldec_2x_loop - - // middle of last round - // vmovdqa 0x60(%r10), %xmm4 # 3 : sbou - tbl v4.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou - tbl v12.16b, {v22.16b}, v10.16b - // vmovdqa 0x70(%r10), %xmm1 # 0 : sbot - tbl v1.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t - tbl v9.16b, {v23.16b}, v11.16b - ld1 {v2.2d}, [x11] // vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 - eor v4.16b, v4.16b, v16.16b // vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k - eor v12.16b, v12.16b, v16.16b - eor v0.16b, v1.16b, v4.16b // vpxor %xmm4, %xmm1, %xmm0 # 0 = A - eor v8.16b, v9.16b, v12.16b - tbl v0.16b, {v0.16b},v2.16b // vpshufb %xmm2, %xmm0, %xmm0 - tbl v1.16b, {v8.16b},v2.16b - ret -.size _vpaes_decrypt_2x,.-_vpaes_decrypt_2x 
-######################################################## -## ## -## AES key schedule ## -## ## -######################################################## -.type _vpaes_key_preheat,%function -.align 4 -_vpaes_key_preheat: - adrp x10, .Lk_inv - add x10, x10, :lo12:.Lk_inv - movi v16.16b, #0x5b // .Lk_s63 - adrp x11, .Lk_sb1 - add x11, x11, :lo12:.Lk_sb1 - movi v17.16b, #0x0f // .Lk_s0F - ld1 {v18.2d,v19.2d,v20.2d,v21.2d}, [x10] // .Lk_inv, .Lk_ipt - adrp x10, .Lk_dksd - add x10, x10, :lo12:.Lk_dksd - ld1 {v22.2d,v23.2d}, [x11] // .Lk_sb1 - adrp x11, .Lk_mc_forward - add x11, x11, :lo12:.Lk_mc_forward - ld1 {v24.2d,v25.2d,v26.2d,v27.2d}, [x10],#64 // .Lk_dksd, .Lk_dksb - ld1 {v28.2d,v29.2d,v30.2d,v31.2d}, [x10],#64 // .Lk_dkse, .Lk_dks9 - ld1 {v8.2d}, [x10] // .Lk_rcon - ld1 {v9.2d}, [x11] // .Lk_mc_forward[0] - ret -.size _vpaes_key_preheat,.-_vpaes_key_preheat - -.type _vpaes_schedule_core,%function -.align 4 -_vpaes_schedule_core: - AARCH64_SIGN_LINK_REGISTER - stp x29, x30, [sp,#-16]! - add x29,sp,#0 - - bl _vpaes_key_preheat // load the tables - - ld1 {v0.16b}, [x0],#16 // vmovdqu (%rdi), %xmm0 # load key (unaligned) - - // input transform - mov v3.16b, v0.16b // vmovdqa %xmm0, %xmm3 - bl _vpaes_schedule_transform - mov v7.16b, v0.16b // vmovdqa %xmm0, %xmm7 - - adrp x10, .Lk_sr // lea .Lk_sr(%rip),%r10 - add x10, x10, :lo12:.Lk_sr - - add x8, x8, x10 - cbnz w3, .Lschedule_am_decrypting - - // encrypting, output zeroth round key after transform - st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) - b .Lschedule_go - -.Lschedule_am_decrypting: - // decrypting, output zeroth round key after shiftrows - ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 - tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 - st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx) - eor x8, x8, #0x30 // xor $0x30, %r8 - -.Lschedule_go: - cmp w1, #192 // cmp $192, %esi - b.hi .Lschedule_256 - b.eq .Lschedule_192 - // 128: fall though - -## -## .schedule_128 -## -## 128-bit specific part of key 
schedule. -## -## This schedule is really simple, because all its parts -## are accomplished by the subroutines. -## -.Lschedule_128: - mov x0, #10 // mov $10, %esi - -.Loop_schedule_128: - sub x0, x0, #1 // dec %esi - bl _vpaes_schedule_round - cbz x0, .Lschedule_mangle_last - bl _vpaes_schedule_mangle // write output - b .Loop_schedule_128 - -## -## .aes_schedule_192 -## -## 192-bit specific part of key schedule. -## -## The main body of this schedule is the same as the 128-bit -## schedule, but with more smearing. The long, high side is -## stored in %xmm7 as before, and the short, low side is in -## the high bits of %xmm6. -## -## This schedule is somewhat nastier, however, because each -## round produces 192 bits of key material, or 1.5 round keys. -## Therefore, on each cycle we do 2 rounds and produce 3 round -## keys. -## -.align 4 -.Lschedule_192: - sub x0, x0, #8 - ld1 {v0.16b}, [x0] // vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) - bl _vpaes_schedule_transform // input transform - mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save short part - eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 # clear 4 - ins v6.d[0], v4.d[0] // vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros - mov x0, #4 // mov $4, %esi - -.Loop_schedule_192: - sub x0, x0, #1 // dec %esi - bl _vpaes_schedule_round - ext v0.16b, v6.16b, v0.16b, #8 // vpalignr $8,%xmm6,%xmm0,%xmm0 - bl _vpaes_schedule_mangle // save key n - bl _vpaes_schedule_192_smear - bl _vpaes_schedule_mangle // save key n+1 - bl _vpaes_schedule_round - cbz x0, .Lschedule_mangle_last - bl _vpaes_schedule_mangle // save key n+2 - bl _vpaes_schedule_192_smear - b .Loop_schedule_192 - -## -## .aes_schedule_256 -## -## 256-bit specific part of key schedule. -## -## The structure here is very similar to the 128-bit -## schedule, but with an additional "low side" in -## %xmm6. The low side's rounds are the same as the -## high side's, except no rcon and no rotation. 
-## -.align 4 -.Lschedule_256: - ld1 {v0.16b}, [x0] // vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) - bl _vpaes_schedule_transform // input transform - mov x0, #7 // mov $7, %esi - -.Loop_schedule_256: - sub x0, x0, #1 // dec %esi - bl _vpaes_schedule_mangle // output low result - mov v6.16b, v0.16b // vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 - - // high round - bl _vpaes_schedule_round - cbz x0, .Lschedule_mangle_last - bl _vpaes_schedule_mangle - - // low round. swap xmm7 and xmm6 - dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 - movi v4.16b, #0 - mov v5.16b, v7.16b // vmovdqa %xmm7, %xmm5 - mov v7.16b, v6.16b // vmovdqa %xmm6, %xmm7 - bl _vpaes_schedule_low_round - mov v7.16b, v5.16b // vmovdqa %xmm5, %xmm7 - - b .Loop_schedule_256 - -## -## .aes_schedule_mangle_last -## -## Mangler for last round of key schedule -## Mangles %xmm0 -## when encrypting, outputs out(%xmm0) ^ 63 -## when decrypting, outputs unskew(%xmm0) -## -## Always called right before return... jumps to cleanup and exits -## -.align 4 -.Lschedule_mangle_last: - // schedule last round key from xmm0 - adrp x11, .Lk_deskew // lea .Lk_deskew(%rip),%r11 # prepare to deskew - add x11, x11, :lo12:.Lk_deskew - - cbnz w3, .Lschedule_mangle_last_dec - - // encrypting - ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10),%xmm1 - adrp x11, .Lk_opt // lea .Lk_opt(%rip), %r11 # prepare to output transform - add x11, x11, :lo12:.Lk_opt - add x2, x2, #32 // add $32, %rdx - tbl v0.16b, {v0.16b}, v1.16b // vpshufb %xmm1, %xmm0, %xmm0 # output permute - -.Lschedule_mangle_last_dec: - ld1 {v20.2d,v21.2d}, [x11] // reload constants - sub x2, x2, #16 // add $-16, %rdx - eor v0.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm0 - bl _vpaes_schedule_transform // output transform - st1 {v0.2d}, [x2] // vmovdqu %xmm0, (%rdx) # save last key - - // cleanup - eor v0.16b, v0.16b, v0.16b // vpxor %xmm0, %xmm0, %xmm0 - eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 - eor v2.16b, v2.16b, v2.16b // vpxor 
%xmm2, %xmm2, %xmm2 - eor v3.16b, v3.16b, v3.16b // vpxor %xmm3, %xmm3, %xmm3 - eor v4.16b, v4.16b, v4.16b // vpxor %xmm4, %xmm4, %xmm4 - eor v5.16b, v5.16b, v5.16b // vpxor %xmm5, %xmm5, %xmm5 - eor v6.16b, v6.16b, v6.16b // vpxor %xmm6, %xmm6, %xmm6 - eor v7.16b, v7.16b, v7.16b // vpxor %xmm7, %xmm7, %xmm7 - ldp x29, x30, [sp],#16 - AARCH64_VALIDATE_LINK_REGISTER - ret -.size _vpaes_schedule_core,.-_vpaes_schedule_core - -## -## .aes_schedule_192_smear -## -## Smear the short, low side in the 192-bit key schedule. -## -## Inputs: -## %xmm7: high side, b a x y -## %xmm6: low side, d c 0 0 -## %xmm13: 0 -## -## Outputs: -## %xmm6: b+c+d b+c 0 0 -## %xmm0: b+c+d b+c b a -## -.type _vpaes_schedule_192_smear,%function -.align 4 -_vpaes_schedule_192_smear: - movi v1.16b, #0 - dup v0.4s, v7.s[3] - ins v1.s[3], v6.s[2] // vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 - ins v0.s[0], v7.s[2] // vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a - eor v6.16b, v6.16b, v1.16b // vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 - eor v1.16b, v1.16b, v1.16b // vpxor %xmm1, %xmm1, %xmm1 - eor v6.16b, v6.16b, v0.16b // vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a - mov v0.16b, v6.16b // vmovdqa %xmm6, %xmm0 - ins v6.d[0], v1.d[0] // vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros - ret -.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear - -## -## .aes_schedule_round -## -## Runs one main round of the key schedule on %xmm0, %xmm7 -## -## Specifically, runs subbytes on the high dword of %xmm0 -## then rotates it by one byte and xors into the low dword of -## %xmm7. -## -## Adds rcon from low byte of %xmm8, then rotates %xmm8 for -## next rcon. -## -## Smears the dwords of %xmm7 by xoring the low into the -## second low, result into third, result into highest. -## -## Returns results in %xmm7 = %xmm0. -## Clobbers %xmm1-%xmm4, %r11. 
-## -.type _vpaes_schedule_round,%function -.align 4 -_vpaes_schedule_round: - // extract rcon from xmm8 - movi v4.16b, #0 // vpxor %xmm4, %xmm4, %xmm4 - ext v1.16b, v8.16b, v4.16b, #15 // vpalignr $15, %xmm8, %xmm4, %xmm1 - ext v8.16b, v8.16b, v8.16b, #15 // vpalignr $15, %xmm8, %xmm8, %xmm8 - eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 - - // rotate - dup v0.4s, v0.s[3] // vpshufd $0xFF, %xmm0, %xmm0 - ext v0.16b, v0.16b, v0.16b, #1 // vpalignr $1, %xmm0, %xmm0, %xmm0 - - // fall through... - - // low round: same as high round, but no rotation and no rcon. -_vpaes_schedule_low_round: - // smear xmm7 - ext v1.16b, v4.16b, v7.16b, #12 // vpslldq $4, %xmm7, %xmm1 - eor v7.16b, v7.16b, v1.16b // vpxor %xmm1, %xmm7, %xmm7 - ext v4.16b, v4.16b, v7.16b, #8 // vpslldq $8, %xmm7, %xmm4 - - // subbytes - and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 # 0 = k - ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 # 1 = i - eor v7.16b, v7.16b, v4.16b // vpxor %xmm4, %xmm7, %xmm7 - tbl v2.16b, {v19.16b}, v1.16b // vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k - eor v1.16b, v1.16b, v0.16b // vpxor %xmm0, %xmm1, %xmm1 # 0 = j - tbl v3.16b, {v18.16b}, v0.16b // vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i - eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k - tbl v4.16b, {v18.16b}, v1.16b // vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j - eor v7.16b, v7.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm7, %xmm7 - tbl v3.16b, {v18.16b}, v3.16b // vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak - eor v4.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k - tbl v2.16b, {v18.16b}, v4.16b // vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak - eor v3.16b, v3.16b, v1.16b // vpxor %xmm1, %xmm3, %xmm3 # 2 = io - eor v2.16b, v2.16b, v0.16b // vpxor %xmm0, %xmm2, %xmm2 # 3 = jo - tbl v4.16b, {v23.16b}, v3.16b // vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou - tbl v1.16b, {v22.16b}, v2.16b // vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t - eor v1.16b, v1.16b, v4.16b // vpxor %xmm4, 
%xmm1, %xmm1 # 0 = sbox output - - // add in smeared stuff - eor v0.16b, v1.16b, v7.16b // vpxor %xmm7, %xmm1, %xmm0 - eor v7.16b, v1.16b, v7.16b // vmovdqa %xmm0, %xmm7 - ret -.size _vpaes_schedule_round,.-_vpaes_schedule_round - -## -## .aes_schedule_transform -## -## Linear-transform %xmm0 according to tables at (%r11) -## -## Requires that %xmm9 = 0x0F0F... as in preheat -## Output in %xmm0 -## Clobbers %xmm1, %xmm2 -## -.type _vpaes_schedule_transform,%function -.align 4 -_vpaes_schedule_transform: - and v1.16b, v0.16b, v17.16b // vpand %xmm9, %xmm0, %xmm1 - ushr v0.16b, v0.16b, #4 // vpsrlb $4, %xmm0, %xmm0 - // vmovdqa (%r11), %xmm2 # lo - tbl v2.16b, {v20.16b}, v1.16b // vpshufb %xmm1, %xmm2, %xmm2 - // vmovdqa 16(%r11), %xmm1 # hi - tbl v0.16b, {v21.16b}, v0.16b // vpshufb %xmm0, %xmm1, %xmm0 - eor v0.16b, v0.16b, v2.16b // vpxor %xmm2, %xmm0, %xmm0 - ret -.size _vpaes_schedule_transform,.-_vpaes_schedule_transform - -## -## .aes_schedule_mangle -## -## Mangle xmm0 from (basis-transformed) standard version -## to our version. 
-## -## On encrypt, -## xor with 0x63 -## multiply by circulant 0,1,1,1 -## apply shiftrows transform -## -## On decrypt, -## xor with 0x63 -## multiply by "inverse mixcolumns" circulant E,B,D,9 -## deskew -## apply shiftrows transform -## -## -## Writes out to (%rdx), and increments or decrements it -## Keeps track of round number mod 4 in %r8 -## Preserves xmm0 -## Clobbers xmm1-xmm5 -## -.type _vpaes_schedule_mangle,%function -.align 4 -_vpaes_schedule_mangle: - mov v4.16b, v0.16b // vmovdqa %xmm0, %xmm4 # save xmm0 for later - // vmovdqa .Lk_mc_forward(%rip),%xmm5 - cbnz w3, .Lschedule_mangle_dec - - // encrypting - eor v4.16b, v0.16b, v16.16b // vpxor .Lk_s63(%rip), %xmm0, %xmm4 - add x2, x2, #16 // add $16, %rdx - tbl v4.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm4 - tbl v1.16b, {v4.16b}, v9.16b // vpshufb %xmm5, %xmm4, %xmm1 - tbl v3.16b, {v1.16b}, v9.16b // vpshufb %xmm5, %xmm1, %xmm3 - eor v4.16b, v4.16b, v1.16b // vpxor %xmm1, %xmm4, %xmm4 - ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 - eor v3.16b, v3.16b, v4.16b // vpxor %xmm4, %xmm3, %xmm3 - - b .Lschedule_mangle_both -.align 4 -.Lschedule_mangle_dec: - // inverse mix columns - // lea .Lk_dksd(%rip),%r11 - ushr v1.16b, v4.16b, #4 // vpsrlb $4, %xmm4, %xmm1 # 1 = hi - and v4.16b, v4.16b, v17.16b // vpand %xmm9, %xmm4, %xmm4 # 4 = lo - - // vmovdqa 0x00(%r11), %xmm2 - tbl v2.16b, {v24.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 - // vmovdqa 0x10(%r11), %xmm3 - tbl v3.16b, {v25.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 - eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 - tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 - - // vmovdqa 0x20(%r11), %xmm2 - tbl v2.16b, {v26.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 - eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 - // vmovdqa 0x30(%r11), %xmm3 - tbl v3.16b, {v27.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 - eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 - tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, 
%xmm3 - - // vmovdqa 0x40(%r11), %xmm2 - tbl v2.16b, {v28.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 - eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 - // vmovdqa 0x50(%r11), %xmm3 - tbl v3.16b, {v29.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 - eor v3.16b, v3.16b, v2.16b // vpxor %xmm2, %xmm3, %xmm3 - - // vmovdqa 0x60(%r11), %xmm2 - tbl v2.16b, {v30.16b}, v4.16b // vpshufb %xmm4, %xmm2, %xmm2 - tbl v3.16b, {v3.16b}, v9.16b // vpshufb %xmm5, %xmm3, %xmm3 - // vmovdqa 0x70(%r11), %xmm4 - tbl v4.16b, {v31.16b}, v1.16b // vpshufb %xmm1, %xmm4, %xmm4 - ld1 {v1.2d}, [x8] // vmovdqa (%r8,%r10), %xmm1 - eor v2.16b, v2.16b, v3.16b // vpxor %xmm3, %xmm2, %xmm2 - eor v3.16b, v4.16b, v2.16b // vpxor %xmm2, %xmm4, %xmm3 - - sub x2, x2, #16 // add $-16, %rdx - -.Lschedule_mangle_both: - tbl v3.16b, {v3.16b}, v1.16b // vpshufb %xmm1, %xmm3, %xmm3 - add x8, x8, #48 // add $-16, %r8 - and x8, x8, #~(1<<6) // and $0x30, %r8 - st1 {v3.2d}, [x2] // vmovdqu %xmm3, (%rdx) - ret -.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle - -.globl vpaes_set_encrypt_key -.hidden vpaes_set_encrypt_key -.type vpaes_set_encrypt_key,%function -.align 4 -vpaes_set_encrypt_key: - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - stp d8,d9,[sp,#-16]! // ABI spec says so - - lsr w9, w1, #5 // shr $5,%eax - add w9, w9, #5 // $5,%eax - str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; - - mov w3, #0 // mov $0,%ecx - mov x8, #0x30 // mov $0x30,%r8d - bl _vpaes_schedule_core - eor x0, x0, x0 - - ldp d8,d9,[sp],#16 - ldp x29,x30,[sp],#16 - AARCH64_VALIDATE_LINK_REGISTER - ret -.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key - -.globl vpaes_set_decrypt_key -.hidden vpaes_set_decrypt_key -.type vpaes_set_decrypt_key,%function -.align 4 -vpaes_set_decrypt_key: - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - stp d8,d9,[sp,#-16]! 
// ABI spec says so - - lsr w9, w1, #5 // shr $5,%eax - add w9, w9, #5 // $5,%eax - str w9, [x2,#240] // mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; - lsl w9, w9, #4 // shl $4,%eax - add x2, x2, #16 // lea 16(%rdx,%rax),%rdx - add x2, x2, x9 - - mov w3, #1 // mov $1,%ecx - lsr w8, w1, #1 // shr $1,%r8d - and x8, x8, #32 // and $32,%r8d - eor x8, x8, #32 // xor $32,%r8d # nbits==192?0:32 - bl _vpaes_schedule_core - - ldp d8,d9,[sp],#16 - ldp x29,x30,[sp],#16 - AARCH64_VALIDATE_LINK_REGISTER - ret -.size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key -.globl vpaes_cbc_encrypt -.hidden vpaes_cbc_encrypt -.type vpaes_cbc_encrypt,%function -.align 4 -vpaes_cbc_encrypt: - AARCH64_SIGN_LINK_REGISTER - cbz x2, .Lcbc_abort - cmp w5, #0 // check direction - b.eq vpaes_cbc_decrypt - - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - - mov x17, x2 // reassign - mov x2, x3 // reassign - - ld1 {v0.16b}, [x4] // load ivec - bl _vpaes_encrypt_preheat - b .Lcbc_enc_loop - -.align 4 -.Lcbc_enc_loop: - ld1 {v7.16b}, [x0],#16 // load input - eor v7.16b, v7.16b, v0.16b // xor with ivec - bl _vpaes_encrypt_core - st1 {v0.16b}, [x1],#16 // save output - subs x17, x17, #16 - b.hi .Lcbc_enc_loop - - st1 {v0.16b}, [x4] // write ivec - - ldp x29,x30,[sp],#16 -.Lcbc_abort: - AARCH64_VALIDATE_LINK_REGISTER - ret -.size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt - -.type vpaes_cbc_decrypt,%function -.align 4 -vpaes_cbc_decrypt: - // Not adding AARCH64_SIGN_LINK_REGISTER here because vpaes_cbc_decrypt is jumped to - // only from vpaes_cbc_encrypt which has already signed the return address. - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - stp d8,d9,[sp,#-16]! // ABI spec says so - stp d10,d11,[sp,#-16]! - stp d12,d13,[sp,#-16]! - stp d14,d15,[sp,#-16]! 
- - mov x17, x2 // reassign - mov x2, x3 // reassign - ld1 {v6.16b}, [x4] // load ivec - bl _vpaes_decrypt_preheat - tst x17, #16 - b.eq .Lcbc_dec_loop2x - - ld1 {v7.16b}, [x0], #16 // load input - bl _vpaes_decrypt_core - eor v0.16b, v0.16b, v6.16b // xor with ivec - orr v6.16b, v7.16b, v7.16b // next ivec value - st1 {v0.16b}, [x1], #16 - subs x17, x17, #16 - b.ls .Lcbc_dec_done - -.align 4 -.Lcbc_dec_loop2x: - ld1 {v14.16b,v15.16b}, [x0], #32 - bl _vpaes_decrypt_2x - eor v0.16b, v0.16b, v6.16b // xor with ivec - eor v1.16b, v1.16b, v14.16b - orr v6.16b, v15.16b, v15.16b - st1 {v0.16b,v1.16b}, [x1], #32 - subs x17, x17, #32 - b.hi .Lcbc_dec_loop2x - -.Lcbc_dec_done: - st1 {v6.16b}, [x4] - - ldp d14,d15,[sp],#16 - ldp d12,d13,[sp],#16 - ldp d10,d11,[sp],#16 - ldp d8,d9,[sp],#16 - ldp x29,x30,[sp],#16 - AARCH64_VALIDATE_LINK_REGISTER - ret -.size vpaes_cbc_decrypt,.-vpaes_cbc_decrypt -.globl vpaes_ctr32_encrypt_blocks -.hidden vpaes_ctr32_encrypt_blocks -.type vpaes_ctr32_encrypt_blocks,%function -.align 4 -vpaes_ctr32_encrypt_blocks: - AARCH64_SIGN_LINK_REGISTER - stp x29,x30,[sp,#-16]! - add x29,sp,#0 - stp d8,d9,[sp,#-16]! // ABI spec says so - stp d10,d11,[sp,#-16]! - stp d12,d13,[sp,#-16]! - stp d14,d15,[sp,#-16]! - - cbz x2, .Lctr32_done - - // Note, unlike the other functions, x2 here is measured in blocks, - // not bytes. - mov x17, x2 - mov x2, x3 - - // Load the IV and counter portion. - ldr w6, [x4, #12] - ld1 {v7.16b}, [x4] - - bl _vpaes_encrypt_preheat - tst x17, #1 - rev w6, w6 // The counter is big-endian. - b.eq .Lctr32_prep_loop - - // Handle one block so the remaining block count is even for - // _vpaes_encrypt_2x. - ld1 {v6.16b}, [x0], #16 // .Load input ahead of time - bl _vpaes_encrypt_core - eor v0.16b, v0.16b, v6.16b // XOR input and result - st1 {v0.16b}, [x1], #16 - subs x17, x17, #1 - // Update the counter. 
- add w6, w6, #1 - rev w7, w6 - mov v7.s[3], w7 - b.ls .Lctr32_done - -.Lctr32_prep_loop: - // _vpaes_encrypt_core takes its input from v7, while _vpaes_encrypt_2x - // uses v14 and v15. - mov v15.16b, v7.16b - mov v14.16b, v7.16b - add w6, w6, #1 - rev w7, w6 - mov v15.s[3], w7 - -.Lctr32_loop: - ld1 {v6.16b,v7.16b}, [x0], #32 // .Load input ahead of time - bl _vpaes_encrypt_2x - eor v0.16b, v0.16b, v6.16b // XOR input and result - eor v1.16b, v1.16b, v7.16b // XOR input and result (#2) - st1 {v0.16b,v1.16b}, [x1], #32 - subs x17, x17, #2 - // Update the counter. - add w7, w6, #1 - add w6, w6, #2 - rev w7, w7 - mov v14.s[3], w7 - rev w7, w6 - mov v15.s[3], w7 - b.hi .Lctr32_loop - -.Lctr32_done: - ldp d14,d15,[sp],#16 - ldp d12,d13,[sp],#16 - ldp d10,d11,[sp],#16 - ldp d8,d9,[sp],#16 - ldp x29,x30,[sp],#16 - AARCH64_VALIDATE_LINK_REGISTER - ret -.size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks -#endif -#endif // !OPENSSL_NO_ASM -.section .note.GNU-stack,"",%progbits diff --git a/third_party/boringssl/linux-aarch64/crypto/test/trampoline-armv8.S b/third_party/boringssl/linux-aarch64/crypto/test/trampoline-armv8.S deleted file mode 100644 index 8928d7f5..00000000 --- a/third_party/boringssl/linux-aarch64/crypto/test/trampoline-armv8.S +++ /dev/null @@ -1,761 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(__aarch64__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -#include - -.text - -// abi_test_trampoline loads callee-saved registers from |state|, calls |func| -// with |argv|, then saves the callee-saved registers into |state|. It returns -// the result of |func|. The |unwind| argument is unused. 
-// uint64_t abi_test_trampoline(void (*func)(...), CallerState *state, -// const uint64_t *argv, size_t argc, -// uint64_t unwind); -.type abi_test_trampoline, %function -.globl abi_test_trampoline -.hidden abi_test_trampoline -.align 4 -abi_test_trampoline: -.Labi_test_trampoline_begin: - AARCH64_SIGN_LINK_REGISTER - // Stack layout (low to high addresses) - // x29,x30 (16 bytes) - // d8-d15 (64 bytes) - // x19-x28 (80 bytes) - // x1 (8 bytes) - // padding (8 bytes) - stp x29, x30, [sp, #-176]! - mov x29, sp - - // Saved callee-saved registers and |state|. - stp d8, d9, [sp, #16] - stp d10, d11, [sp, #32] - stp d12, d13, [sp, #48] - stp d14, d15, [sp, #64] - stp x19, x20, [sp, #80] - stp x21, x22, [sp, #96] - stp x23, x24, [sp, #112] - stp x25, x26, [sp, #128] - stp x27, x28, [sp, #144] - str x1, [sp, #160] - - // Load registers from |state|, with the exception of x29. x29 is the - // frame pointer and also callee-saved, but AAPCS64 allows platforms to - // mandate that x29 always point to a frame. iOS64 does so, which means - // we cannot fill x29 with entropy without violating ABI rules - // ourselves. x29 is tested separately below. - ldp d8, d9, [x1], #16 - ldp d10, d11, [x1], #16 - ldp d12, d13, [x1], #16 - ldp d14, d15, [x1], #16 - ldp x19, x20, [x1], #16 - ldp x21, x22, [x1], #16 - ldp x23, x24, [x1], #16 - ldp x25, x26, [x1], #16 - ldp x27, x28, [x1], #16 - - // Move parameters into temporary registers. - mov x9, x0 - mov x10, x2 - mov x11, x3 - - // Load parameters into registers. 
- cbz x11, .Largs_done - ldr x0, [x10], #8 - subs x11, x11, #1 - b.eq .Largs_done - ldr x1, [x10], #8 - subs x11, x11, #1 - b.eq .Largs_done - ldr x2, [x10], #8 - subs x11, x11, #1 - b.eq .Largs_done - ldr x3, [x10], #8 - subs x11, x11, #1 - b.eq .Largs_done - ldr x4, [x10], #8 - subs x11, x11, #1 - b.eq .Largs_done - ldr x5, [x10], #8 - subs x11, x11, #1 - b.eq .Largs_done - ldr x6, [x10], #8 - subs x11, x11, #1 - b.eq .Largs_done - ldr x7, [x10], #8 - -.Largs_done: - blr x9 - - // Reload |state| and store registers. - ldr x1, [sp, #160] - stp d8, d9, [x1], #16 - stp d10, d11, [x1], #16 - stp d12, d13, [x1], #16 - stp d14, d15, [x1], #16 - stp x19, x20, [x1], #16 - stp x21, x22, [x1], #16 - stp x23, x24, [x1], #16 - stp x25, x26, [x1], #16 - stp x27, x28, [x1], #16 - - // |func| is required to preserve x29, the frame pointer. We cannot load - // random values into x29 (see comment above), so compare it against the - // expected value and zero the field of |state| if corrupted. - mov x9, sp - cmp x29, x9 - b.eq .Lx29_ok - str xzr, [x1] - -.Lx29_ok: - // Restore callee-saved registers. 
- ldp d8, d9, [sp, #16] - ldp d10, d11, [sp, #32] - ldp d12, d13, [sp, #48] - ldp d14, d15, [sp, #64] - ldp x19, x20, [sp, #80] - ldp x21, x22, [sp, #96] - ldp x23, x24, [sp, #112] - ldp x25, x26, [sp, #128] - ldp x27, x28, [sp, #144] - - ldp x29, x30, [sp], #176 - AARCH64_VALIDATE_LINK_REGISTER - ret -.size abi_test_trampoline,.-abi_test_trampoline -.type abi_test_clobber_x0, %function -.globl abi_test_clobber_x0 -.hidden abi_test_clobber_x0 -.align 4 -abi_test_clobber_x0: - AARCH64_VALID_CALL_TARGET - mov x0, xzr - ret -.size abi_test_clobber_x0,.-abi_test_clobber_x0 -.type abi_test_clobber_x1, %function -.globl abi_test_clobber_x1 -.hidden abi_test_clobber_x1 -.align 4 -abi_test_clobber_x1: - AARCH64_VALID_CALL_TARGET - mov x1, xzr - ret -.size abi_test_clobber_x1,.-abi_test_clobber_x1 -.type abi_test_clobber_x2, %function -.globl abi_test_clobber_x2 -.hidden abi_test_clobber_x2 -.align 4 -abi_test_clobber_x2: - AARCH64_VALID_CALL_TARGET - mov x2, xzr - ret -.size abi_test_clobber_x2,.-abi_test_clobber_x2 -.type abi_test_clobber_x3, %function -.globl abi_test_clobber_x3 -.hidden abi_test_clobber_x3 -.align 4 -abi_test_clobber_x3: - AARCH64_VALID_CALL_TARGET - mov x3, xzr - ret -.size abi_test_clobber_x3,.-abi_test_clobber_x3 -.type abi_test_clobber_x4, %function -.globl abi_test_clobber_x4 -.hidden abi_test_clobber_x4 -.align 4 -abi_test_clobber_x4: - AARCH64_VALID_CALL_TARGET - mov x4, xzr - ret -.size abi_test_clobber_x4,.-abi_test_clobber_x4 -.type abi_test_clobber_x5, %function -.globl abi_test_clobber_x5 -.hidden abi_test_clobber_x5 -.align 4 -abi_test_clobber_x5: - AARCH64_VALID_CALL_TARGET - mov x5, xzr - ret -.size abi_test_clobber_x5,.-abi_test_clobber_x5 -.type abi_test_clobber_x6, %function -.globl abi_test_clobber_x6 -.hidden abi_test_clobber_x6 -.align 4 -abi_test_clobber_x6: - AARCH64_VALID_CALL_TARGET - mov x6, xzr - ret -.size abi_test_clobber_x6,.-abi_test_clobber_x6 -.type abi_test_clobber_x7, %function -.globl abi_test_clobber_x7 -.hidden 
abi_test_clobber_x7 -.align 4 -abi_test_clobber_x7: - AARCH64_VALID_CALL_TARGET - mov x7, xzr - ret -.size abi_test_clobber_x7,.-abi_test_clobber_x7 -.type abi_test_clobber_x8, %function -.globl abi_test_clobber_x8 -.hidden abi_test_clobber_x8 -.align 4 -abi_test_clobber_x8: - AARCH64_VALID_CALL_TARGET - mov x8, xzr - ret -.size abi_test_clobber_x8,.-abi_test_clobber_x8 -.type abi_test_clobber_x9, %function -.globl abi_test_clobber_x9 -.hidden abi_test_clobber_x9 -.align 4 -abi_test_clobber_x9: - AARCH64_VALID_CALL_TARGET - mov x9, xzr - ret -.size abi_test_clobber_x9,.-abi_test_clobber_x9 -.type abi_test_clobber_x10, %function -.globl abi_test_clobber_x10 -.hidden abi_test_clobber_x10 -.align 4 -abi_test_clobber_x10: - AARCH64_VALID_CALL_TARGET - mov x10, xzr - ret -.size abi_test_clobber_x10,.-abi_test_clobber_x10 -.type abi_test_clobber_x11, %function -.globl abi_test_clobber_x11 -.hidden abi_test_clobber_x11 -.align 4 -abi_test_clobber_x11: - AARCH64_VALID_CALL_TARGET - mov x11, xzr - ret -.size abi_test_clobber_x11,.-abi_test_clobber_x11 -.type abi_test_clobber_x12, %function -.globl abi_test_clobber_x12 -.hidden abi_test_clobber_x12 -.align 4 -abi_test_clobber_x12: - AARCH64_VALID_CALL_TARGET - mov x12, xzr - ret -.size abi_test_clobber_x12,.-abi_test_clobber_x12 -.type abi_test_clobber_x13, %function -.globl abi_test_clobber_x13 -.hidden abi_test_clobber_x13 -.align 4 -abi_test_clobber_x13: - AARCH64_VALID_CALL_TARGET - mov x13, xzr - ret -.size abi_test_clobber_x13,.-abi_test_clobber_x13 -.type abi_test_clobber_x14, %function -.globl abi_test_clobber_x14 -.hidden abi_test_clobber_x14 -.align 4 -abi_test_clobber_x14: - AARCH64_VALID_CALL_TARGET - mov x14, xzr - ret -.size abi_test_clobber_x14,.-abi_test_clobber_x14 -.type abi_test_clobber_x15, %function -.globl abi_test_clobber_x15 -.hidden abi_test_clobber_x15 -.align 4 -abi_test_clobber_x15: - AARCH64_VALID_CALL_TARGET - mov x15, xzr - ret -.size abi_test_clobber_x15,.-abi_test_clobber_x15 -.type 
abi_test_clobber_x16, %function -.globl abi_test_clobber_x16 -.hidden abi_test_clobber_x16 -.align 4 -abi_test_clobber_x16: - AARCH64_VALID_CALL_TARGET - mov x16, xzr - ret -.size abi_test_clobber_x16,.-abi_test_clobber_x16 -.type abi_test_clobber_x17, %function -.globl abi_test_clobber_x17 -.hidden abi_test_clobber_x17 -.align 4 -abi_test_clobber_x17: - AARCH64_VALID_CALL_TARGET - mov x17, xzr - ret -.size abi_test_clobber_x17,.-abi_test_clobber_x17 -.type abi_test_clobber_x19, %function -.globl abi_test_clobber_x19 -.hidden abi_test_clobber_x19 -.align 4 -abi_test_clobber_x19: - AARCH64_VALID_CALL_TARGET - mov x19, xzr - ret -.size abi_test_clobber_x19,.-abi_test_clobber_x19 -.type abi_test_clobber_x20, %function -.globl abi_test_clobber_x20 -.hidden abi_test_clobber_x20 -.align 4 -abi_test_clobber_x20: - AARCH64_VALID_CALL_TARGET - mov x20, xzr - ret -.size abi_test_clobber_x20,.-abi_test_clobber_x20 -.type abi_test_clobber_x21, %function -.globl abi_test_clobber_x21 -.hidden abi_test_clobber_x21 -.align 4 -abi_test_clobber_x21: - AARCH64_VALID_CALL_TARGET - mov x21, xzr - ret -.size abi_test_clobber_x21,.-abi_test_clobber_x21 -.type abi_test_clobber_x22, %function -.globl abi_test_clobber_x22 -.hidden abi_test_clobber_x22 -.align 4 -abi_test_clobber_x22: - AARCH64_VALID_CALL_TARGET - mov x22, xzr - ret -.size abi_test_clobber_x22,.-abi_test_clobber_x22 -.type abi_test_clobber_x23, %function -.globl abi_test_clobber_x23 -.hidden abi_test_clobber_x23 -.align 4 -abi_test_clobber_x23: - AARCH64_VALID_CALL_TARGET - mov x23, xzr - ret -.size abi_test_clobber_x23,.-abi_test_clobber_x23 -.type abi_test_clobber_x24, %function -.globl abi_test_clobber_x24 -.hidden abi_test_clobber_x24 -.align 4 -abi_test_clobber_x24: - AARCH64_VALID_CALL_TARGET - mov x24, xzr - ret -.size abi_test_clobber_x24,.-abi_test_clobber_x24 -.type abi_test_clobber_x25, %function -.globl abi_test_clobber_x25 -.hidden abi_test_clobber_x25 -.align 4 -abi_test_clobber_x25: - AARCH64_VALID_CALL_TARGET 
- mov x25, xzr - ret -.size abi_test_clobber_x25,.-abi_test_clobber_x25 -.type abi_test_clobber_x26, %function -.globl abi_test_clobber_x26 -.hidden abi_test_clobber_x26 -.align 4 -abi_test_clobber_x26: - AARCH64_VALID_CALL_TARGET - mov x26, xzr - ret -.size abi_test_clobber_x26,.-abi_test_clobber_x26 -.type abi_test_clobber_x27, %function -.globl abi_test_clobber_x27 -.hidden abi_test_clobber_x27 -.align 4 -abi_test_clobber_x27: - AARCH64_VALID_CALL_TARGET - mov x27, xzr - ret -.size abi_test_clobber_x27,.-abi_test_clobber_x27 -.type abi_test_clobber_x28, %function -.globl abi_test_clobber_x28 -.hidden abi_test_clobber_x28 -.align 4 -abi_test_clobber_x28: - AARCH64_VALID_CALL_TARGET - mov x28, xzr - ret -.size abi_test_clobber_x28,.-abi_test_clobber_x28 -.type abi_test_clobber_x29, %function -.globl abi_test_clobber_x29 -.hidden abi_test_clobber_x29 -.align 4 -abi_test_clobber_x29: - AARCH64_VALID_CALL_TARGET - mov x29, xzr - ret -.size abi_test_clobber_x29,.-abi_test_clobber_x29 -.type abi_test_clobber_d0, %function -.globl abi_test_clobber_d0 -.hidden abi_test_clobber_d0 -.align 4 -abi_test_clobber_d0: - AARCH64_VALID_CALL_TARGET - fmov d0, xzr - ret -.size abi_test_clobber_d0,.-abi_test_clobber_d0 -.type abi_test_clobber_d1, %function -.globl abi_test_clobber_d1 -.hidden abi_test_clobber_d1 -.align 4 -abi_test_clobber_d1: - AARCH64_VALID_CALL_TARGET - fmov d1, xzr - ret -.size abi_test_clobber_d1,.-abi_test_clobber_d1 -.type abi_test_clobber_d2, %function -.globl abi_test_clobber_d2 -.hidden abi_test_clobber_d2 -.align 4 -abi_test_clobber_d2: - AARCH64_VALID_CALL_TARGET - fmov d2, xzr - ret -.size abi_test_clobber_d2,.-abi_test_clobber_d2 -.type abi_test_clobber_d3, %function -.globl abi_test_clobber_d3 -.hidden abi_test_clobber_d3 -.align 4 -abi_test_clobber_d3: - AARCH64_VALID_CALL_TARGET - fmov d3, xzr - ret -.size abi_test_clobber_d3,.-abi_test_clobber_d3 -.type abi_test_clobber_d4, %function -.globl abi_test_clobber_d4 -.hidden abi_test_clobber_d4 -.align 
4 -abi_test_clobber_d4: - AARCH64_VALID_CALL_TARGET - fmov d4, xzr - ret -.size abi_test_clobber_d4,.-abi_test_clobber_d4 -.type abi_test_clobber_d5, %function -.globl abi_test_clobber_d5 -.hidden abi_test_clobber_d5 -.align 4 -abi_test_clobber_d5: - AARCH64_VALID_CALL_TARGET - fmov d5, xzr - ret -.size abi_test_clobber_d5,.-abi_test_clobber_d5 -.type abi_test_clobber_d6, %function -.globl abi_test_clobber_d6 -.hidden abi_test_clobber_d6 -.align 4 -abi_test_clobber_d6: - AARCH64_VALID_CALL_TARGET - fmov d6, xzr - ret -.size abi_test_clobber_d6,.-abi_test_clobber_d6 -.type abi_test_clobber_d7, %function -.globl abi_test_clobber_d7 -.hidden abi_test_clobber_d7 -.align 4 -abi_test_clobber_d7: - AARCH64_VALID_CALL_TARGET - fmov d7, xzr - ret -.size abi_test_clobber_d7,.-abi_test_clobber_d7 -.type abi_test_clobber_d8, %function -.globl abi_test_clobber_d8 -.hidden abi_test_clobber_d8 -.align 4 -abi_test_clobber_d8: - AARCH64_VALID_CALL_TARGET - fmov d8, xzr - ret -.size abi_test_clobber_d8,.-abi_test_clobber_d8 -.type abi_test_clobber_d9, %function -.globl abi_test_clobber_d9 -.hidden abi_test_clobber_d9 -.align 4 -abi_test_clobber_d9: - AARCH64_VALID_CALL_TARGET - fmov d9, xzr - ret -.size abi_test_clobber_d9,.-abi_test_clobber_d9 -.type abi_test_clobber_d10, %function -.globl abi_test_clobber_d10 -.hidden abi_test_clobber_d10 -.align 4 -abi_test_clobber_d10: - AARCH64_VALID_CALL_TARGET - fmov d10, xzr - ret -.size abi_test_clobber_d10,.-abi_test_clobber_d10 -.type abi_test_clobber_d11, %function -.globl abi_test_clobber_d11 -.hidden abi_test_clobber_d11 -.align 4 -abi_test_clobber_d11: - AARCH64_VALID_CALL_TARGET - fmov d11, xzr - ret -.size abi_test_clobber_d11,.-abi_test_clobber_d11 -.type abi_test_clobber_d12, %function -.globl abi_test_clobber_d12 -.hidden abi_test_clobber_d12 -.align 4 -abi_test_clobber_d12: - AARCH64_VALID_CALL_TARGET - fmov d12, xzr - ret -.size abi_test_clobber_d12,.-abi_test_clobber_d12 -.type abi_test_clobber_d13, %function -.globl 
abi_test_clobber_d13 -.hidden abi_test_clobber_d13 -.align 4 -abi_test_clobber_d13: - AARCH64_VALID_CALL_TARGET - fmov d13, xzr - ret -.size abi_test_clobber_d13,.-abi_test_clobber_d13 -.type abi_test_clobber_d14, %function -.globl abi_test_clobber_d14 -.hidden abi_test_clobber_d14 -.align 4 -abi_test_clobber_d14: - AARCH64_VALID_CALL_TARGET - fmov d14, xzr - ret -.size abi_test_clobber_d14,.-abi_test_clobber_d14 -.type abi_test_clobber_d15, %function -.globl abi_test_clobber_d15 -.hidden abi_test_clobber_d15 -.align 4 -abi_test_clobber_d15: - AARCH64_VALID_CALL_TARGET - fmov d15, xzr - ret -.size abi_test_clobber_d15,.-abi_test_clobber_d15 -.type abi_test_clobber_d16, %function -.globl abi_test_clobber_d16 -.hidden abi_test_clobber_d16 -.align 4 -abi_test_clobber_d16: - AARCH64_VALID_CALL_TARGET - fmov d16, xzr - ret -.size abi_test_clobber_d16,.-abi_test_clobber_d16 -.type abi_test_clobber_d17, %function -.globl abi_test_clobber_d17 -.hidden abi_test_clobber_d17 -.align 4 -abi_test_clobber_d17: - AARCH64_VALID_CALL_TARGET - fmov d17, xzr - ret -.size abi_test_clobber_d17,.-abi_test_clobber_d17 -.type abi_test_clobber_d18, %function -.globl abi_test_clobber_d18 -.hidden abi_test_clobber_d18 -.align 4 -abi_test_clobber_d18: - AARCH64_VALID_CALL_TARGET - fmov d18, xzr - ret -.size abi_test_clobber_d18,.-abi_test_clobber_d18 -.type abi_test_clobber_d19, %function -.globl abi_test_clobber_d19 -.hidden abi_test_clobber_d19 -.align 4 -abi_test_clobber_d19: - AARCH64_VALID_CALL_TARGET - fmov d19, xzr - ret -.size abi_test_clobber_d19,.-abi_test_clobber_d19 -.type abi_test_clobber_d20, %function -.globl abi_test_clobber_d20 -.hidden abi_test_clobber_d20 -.align 4 -abi_test_clobber_d20: - AARCH64_VALID_CALL_TARGET - fmov d20, xzr - ret -.size abi_test_clobber_d20,.-abi_test_clobber_d20 -.type abi_test_clobber_d21, %function -.globl abi_test_clobber_d21 -.hidden abi_test_clobber_d21 -.align 4 -abi_test_clobber_d21: - AARCH64_VALID_CALL_TARGET - fmov d21, xzr - ret -.size 
abi_test_clobber_d21,.-abi_test_clobber_d21 -.type abi_test_clobber_d22, %function -.globl abi_test_clobber_d22 -.hidden abi_test_clobber_d22 -.align 4 -abi_test_clobber_d22: - AARCH64_VALID_CALL_TARGET - fmov d22, xzr - ret -.size abi_test_clobber_d22,.-abi_test_clobber_d22 -.type abi_test_clobber_d23, %function -.globl abi_test_clobber_d23 -.hidden abi_test_clobber_d23 -.align 4 -abi_test_clobber_d23: - AARCH64_VALID_CALL_TARGET - fmov d23, xzr - ret -.size abi_test_clobber_d23,.-abi_test_clobber_d23 -.type abi_test_clobber_d24, %function -.globl abi_test_clobber_d24 -.hidden abi_test_clobber_d24 -.align 4 -abi_test_clobber_d24: - AARCH64_VALID_CALL_TARGET - fmov d24, xzr - ret -.size abi_test_clobber_d24,.-abi_test_clobber_d24 -.type abi_test_clobber_d25, %function -.globl abi_test_clobber_d25 -.hidden abi_test_clobber_d25 -.align 4 -abi_test_clobber_d25: - AARCH64_VALID_CALL_TARGET - fmov d25, xzr - ret -.size abi_test_clobber_d25,.-abi_test_clobber_d25 -.type abi_test_clobber_d26, %function -.globl abi_test_clobber_d26 -.hidden abi_test_clobber_d26 -.align 4 -abi_test_clobber_d26: - AARCH64_VALID_CALL_TARGET - fmov d26, xzr - ret -.size abi_test_clobber_d26,.-abi_test_clobber_d26 -.type abi_test_clobber_d27, %function -.globl abi_test_clobber_d27 -.hidden abi_test_clobber_d27 -.align 4 -abi_test_clobber_d27: - AARCH64_VALID_CALL_TARGET - fmov d27, xzr - ret -.size abi_test_clobber_d27,.-abi_test_clobber_d27 -.type abi_test_clobber_d28, %function -.globl abi_test_clobber_d28 -.hidden abi_test_clobber_d28 -.align 4 -abi_test_clobber_d28: - AARCH64_VALID_CALL_TARGET - fmov d28, xzr - ret -.size abi_test_clobber_d28,.-abi_test_clobber_d28 -.type abi_test_clobber_d29, %function -.globl abi_test_clobber_d29 -.hidden abi_test_clobber_d29 -.align 4 -abi_test_clobber_d29: - AARCH64_VALID_CALL_TARGET - fmov d29, xzr - ret -.size abi_test_clobber_d29,.-abi_test_clobber_d29 -.type abi_test_clobber_d30, %function -.globl abi_test_clobber_d30 -.hidden abi_test_clobber_d30 
-.align 4 -abi_test_clobber_d30: - AARCH64_VALID_CALL_TARGET - fmov d30, xzr - ret -.size abi_test_clobber_d30,.-abi_test_clobber_d30 -.type abi_test_clobber_d31, %function -.globl abi_test_clobber_d31 -.hidden abi_test_clobber_d31 -.align 4 -abi_test_clobber_d31: - AARCH64_VALID_CALL_TARGET - fmov d31, xzr - ret -.size abi_test_clobber_d31,.-abi_test_clobber_d31 -.type abi_test_clobber_v8_upper, %function -.globl abi_test_clobber_v8_upper -.hidden abi_test_clobber_v8_upper -.align 4 -abi_test_clobber_v8_upper: - AARCH64_VALID_CALL_TARGET - fmov v8.d[1], xzr - ret -.size abi_test_clobber_v8_upper,.-abi_test_clobber_v8_upper -.type abi_test_clobber_v9_upper, %function -.globl abi_test_clobber_v9_upper -.hidden abi_test_clobber_v9_upper -.align 4 -abi_test_clobber_v9_upper: - AARCH64_VALID_CALL_TARGET - fmov v9.d[1], xzr - ret -.size abi_test_clobber_v9_upper,.-abi_test_clobber_v9_upper -.type abi_test_clobber_v10_upper, %function -.globl abi_test_clobber_v10_upper -.hidden abi_test_clobber_v10_upper -.align 4 -abi_test_clobber_v10_upper: - AARCH64_VALID_CALL_TARGET - fmov v10.d[1], xzr - ret -.size abi_test_clobber_v10_upper,.-abi_test_clobber_v10_upper -.type abi_test_clobber_v11_upper, %function -.globl abi_test_clobber_v11_upper -.hidden abi_test_clobber_v11_upper -.align 4 -abi_test_clobber_v11_upper: - AARCH64_VALID_CALL_TARGET - fmov v11.d[1], xzr - ret -.size abi_test_clobber_v11_upper,.-abi_test_clobber_v11_upper -.type abi_test_clobber_v12_upper, %function -.globl abi_test_clobber_v12_upper -.hidden abi_test_clobber_v12_upper -.align 4 -abi_test_clobber_v12_upper: - AARCH64_VALID_CALL_TARGET - fmov v12.d[1], xzr - ret -.size abi_test_clobber_v12_upper,.-abi_test_clobber_v12_upper -.type abi_test_clobber_v13_upper, %function -.globl abi_test_clobber_v13_upper -.hidden abi_test_clobber_v13_upper -.align 4 -abi_test_clobber_v13_upper: - AARCH64_VALID_CALL_TARGET - fmov v13.d[1], xzr - ret -.size abi_test_clobber_v13_upper,.-abi_test_clobber_v13_upper -.type 
abi_test_clobber_v14_upper, %function -.globl abi_test_clobber_v14_upper -.hidden abi_test_clobber_v14_upper -.align 4 -abi_test_clobber_v14_upper: - AARCH64_VALID_CALL_TARGET - fmov v14.d[1], xzr - ret -.size abi_test_clobber_v14_upper,.-abi_test_clobber_v14_upper -.type abi_test_clobber_v15_upper, %function -.globl abi_test_clobber_v15_upper -.hidden abi_test_clobber_v15_upper -.align 4 -abi_test_clobber_v15_upper: - AARCH64_VALID_CALL_TARGET - fmov v15.d[1], xzr - ret -.size abi_test_clobber_v15_upper,.-abi_test_clobber_v15_upper -#endif -#endif // !OPENSSL_NO_ASM -.section .note.GNU-stack,"",%progbits diff --git a/third_party/boringssl/linux-arm/crypto/chacha/chacha-armv4.S b/third_party/boringssl/linux-arm/crypto/chacha/chacha-armv4.S deleted file mode 100644 index 363aeee5..00000000 --- a/third_party/boringssl/linux-arm/crypto/chacha/chacha-armv4.S +++ /dev/null @@ -1,1493 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(__arm__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -#include - -@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both -@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. 
-.arch armv7-a - -.text -#if defined(__thumb2__) || defined(__clang__) -.syntax unified -#endif -#if defined(__thumb2__) -.thumb -#else -.code 32 -#endif - -#if defined(__thumb2__) || defined(__clang__) -#define ldrhsb ldrbhs -#endif - -.align 5 -.Lsigma: -.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 @ endian-neutral -.Lone: -.long 1,0,0,0 -#if __ARM_MAX_ARCH__>=7 -.LOPENSSL_armcap: -.word OPENSSL_armcap_P-.LChaCha20_ctr32 -#else -.word -1 -#endif - -.globl ChaCha20_ctr32 -.hidden ChaCha20_ctr32 -.type ChaCha20_ctr32,%function -.align 5 -ChaCha20_ctr32: -.LChaCha20_ctr32: - ldr r12,[sp,#0] @ pull pointer to counter and nonce - stmdb sp!,{r0,r1,r2,r4-r11,lr} -#if __ARM_ARCH__<7 && !defined(__thumb2__) - sub r14,pc,#16 @ ChaCha20_ctr32 -#else - adr r14,.LChaCha20_ctr32 -#endif - cmp r2,#0 @ len==0? -#ifdef __thumb2__ - itt eq -#endif - addeq sp,sp,#4*3 - beq .Lno_data -#if __ARM_MAX_ARCH__>=7 - cmp r2,#192 @ test len - bls .Lshort - ldr r4,[r14,#-32] - ldr r4,[r14,r4] -# ifdef __APPLE__ - ldr r4,[r4] -# endif - tst r4,#ARMV7_NEON - bne .LChaCha20_neon -.Lshort: -#endif - ldmia r12,{r4,r5,r6,r7} @ load counter and nonce - sub sp,sp,#4*(16) @ off-load area - sub r14,r14,#64 @ .Lsigma - stmdb sp!,{r4,r5,r6,r7} @ copy counter and nonce - ldmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} @ load key - ldmia r14,{r0,r1,r2,r3} @ load sigma - stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11} @ copy key - stmdb sp!,{r0,r1,r2,r3} @ copy sigma - str r10,[sp,#4*(16+10)] @ off-load "rx" - str r11,[sp,#4*(16+11)] @ off-load "rx" - b .Loop_outer_enter - -.align 4 -.Loop_outer: - ldmia sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} @ load key material - str r11,[sp,#4*(32+2)] @ save len - str r12, [sp,#4*(32+1)] @ save inp - str r14, [sp,#4*(32+0)] @ save out -.Loop_outer_enter: - ldr r11, [sp,#4*(15)] - ldr r12,[sp,#4*(12)] @ modulo-scheduled load - ldr r10, [sp,#4*(13)] - ldr r14,[sp,#4*(14)] - str r11, [sp,#4*(16+15)] - mov r11,#10 - b .Loop - -.align 4 -.Loop: - subs r11,r11,#1 - add r0,r0,r4 - mov 
r12,r12,ror#16 - add r1,r1,r5 - mov r10,r10,ror#16 - eor r12,r12,r0,ror#16 - eor r10,r10,r1,ror#16 - add r8,r8,r12 - mov r4,r4,ror#20 - add r9,r9,r10 - mov r5,r5,ror#20 - eor r4,r4,r8,ror#20 - eor r5,r5,r9,ror#20 - add r0,r0,r4 - mov r12,r12,ror#24 - add r1,r1,r5 - mov r10,r10,ror#24 - eor r12,r12,r0,ror#24 - eor r10,r10,r1,ror#24 - add r8,r8,r12 - mov r4,r4,ror#25 - add r9,r9,r10 - mov r5,r5,ror#25 - str r10,[sp,#4*(16+13)] - ldr r10,[sp,#4*(16+15)] - eor r4,r4,r8,ror#25 - eor r5,r5,r9,ror#25 - str r8,[sp,#4*(16+8)] - ldr r8,[sp,#4*(16+10)] - add r2,r2,r6 - mov r14,r14,ror#16 - str r9,[sp,#4*(16+9)] - ldr r9,[sp,#4*(16+11)] - add r3,r3,r7 - mov r10,r10,ror#16 - eor r14,r14,r2,ror#16 - eor r10,r10,r3,ror#16 - add r8,r8,r14 - mov r6,r6,ror#20 - add r9,r9,r10 - mov r7,r7,ror#20 - eor r6,r6,r8,ror#20 - eor r7,r7,r9,ror#20 - add r2,r2,r6 - mov r14,r14,ror#24 - add r3,r3,r7 - mov r10,r10,ror#24 - eor r14,r14,r2,ror#24 - eor r10,r10,r3,ror#24 - add r8,r8,r14 - mov r6,r6,ror#25 - add r9,r9,r10 - mov r7,r7,ror#25 - eor r6,r6,r8,ror#25 - eor r7,r7,r9,ror#25 - add r0,r0,r5 - mov r10,r10,ror#16 - add r1,r1,r6 - mov r12,r12,ror#16 - eor r10,r10,r0,ror#16 - eor r12,r12,r1,ror#16 - add r8,r8,r10 - mov r5,r5,ror#20 - add r9,r9,r12 - mov r6,r6,ror#20 - eor r5,r5,r8,ror#20 - eor r6,r6,r9,ror#20 - add r0,r0,r5 - mov r10,r10,ror#24 - add r1,r1,r6 - mov r12,r12,ror#24 - eor r10,r10,r0,ror#24 - eor r12,r12,r1,ror#24 - add r8,r8,r10 - mov r5,r5,ror#25 - str r10,[sp,#4*(16+15)] - ldr r10,[sp,#4*(16+13)] - add r9,r9,r12 - mov r6,r6,ror#25 - eor r5,r5,r8,ror#25 - eor r6,r6,r9,ror#25 - str r8,[sp,#4*(16+10)] - ldr r8,[sp,#4*(16+8)] - add r2,r2,r7 - mov r10,r10,ror#16 - str r9,[sp,#4*(16+11)] - ldr r9,[sp,#4*(16+9)] - add r3,r3,r4 - mov r14,r14,ror#16 - eor r10,r10,r2,ror#16 - eor r14,r14,r3,ror#16 - add r8,r8,r10 - mov r7,r7,ror#20 - add r9,r9,r14 - mov r4,r4,ror#20 - eor r7,r7,r8,ror#20 - eor r4,r4,r9,ror#20 - add r2,r2,r7 - mov r10,r10,ror#24 - add r3,r3,r4 - mov r14,r14,ror#24 - eor 
r10,r10,r2,ror#24 - eor r14,r14,r3,ror#24 - add r8,r8,r10 - mov r7,r7,ror#25 - add r9,r9,r14 - mov r4,r4,ror#25 - eor r7,r7,r8,ror#25 - eor r4,r4,r9,ror#25 - bne .Loop - - ldr r11,[sp,#4*(32+2)] @ load len - - str r8, [sp,#4*(16+8)] @ modulo-scheduled store - str r9, [sp,#4*(16+9)] - str r12,[sp,#4*(16+12)] - str r10, [sp,#4*(16+13)] - str r14,[sp,#4*(16+14)] - - @ at this point we have first half of 512-bit result in - @ rx and second half at sp+4*(16+8) - - cmp r11,#64 @ done yet? -#ifdef __thumb2__ - itete lo -#endif - addlo r12,sp,#4*(0) @ shortcut or ... - ldrhs r12,[sp,#4*(32+1)] @ ... load inp - addlo r14,sp,#4*(0) @ shortcut or ... - ldrhs r14,[sp,#4*(32+0)] @ ... load out - - ldr r8,[sp,#4*(0)] @ load key material - ldr r9,[sp,#4*(1)] - -#if __ARM_ARCH__>=6 || !defined(__ARMEB__) -# if __ARM_ARCH__<7 - orr r10,r12,r14 - tst r10,#3 @ are input and output aligned? - ldr r10,[sp,#4*(2)] - bne .Lunaligned - cmp r11,#64 @ restore flags -# else - ldr r10,[sp,#4*(2)] -# endif - ldr r11,[sp,#4*(3)] - - add r0,r0,r8 @ accumulate key material - add r1,r1,r9 -# ifdef __thumb2__ - itt hs -# endif - ldrhs r8,[r12],#16 @ load input - ldrhs r9,[r12,#-12] - - add r2,r2,r10 - add r3,r3,r11 -# ifdef __thumb2__ - itt hs -# endif - ldrhs r10,[r12,#-8] - ldrhs r11,[r12,#-4] -# if __ARM_ARCH__>=6 && defined(__ARMEB__) - rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 -# endif -# ifdef __thumb2__ - itt hs -# endif - eorhs r0,r0,r8 @ xor with input - eorhs r1,r1,r9 - add r8,sp,#4*(4) - str r0,[r14],#16 @ store output -# ifdef __thumb2__ - itt hs -# endif - eorhs r2,r2,r10 - eorhs r3,r3,r11 - ldmia r8,{r8,r9,r10,r11} @ load key material - str r1,[r14,#-12] - str r2,[r14,#-8] - str r3,[r14,#-4] - - add r4,r4,r8 @ accumulate key material - add r5,r5,r9 -# ifdef __thumb2__ - itt hs -# endif - ldrhs r8,[r12],#16 @ load input - ldrhs r9,[r12,#-12] - add r6,r6,r10 - add r7,r7,r11 -# ifdef __thumb2__ - itt hs -# endif - ldrhs r10,[r12,#-8] - ldrhs r11,[r12,#-4] -# if __ARM_ARCH__>=6 && 
defined(__ARMEB__) - rev r4,r4 - rev r5,r5 - rev r6,r6 - rev r7,r7 -# endif -# ifdef __thumb2__ - itt hs -# endif - eorhs r4,r4,r8 - eorhs r5,r5,r9 - add r8,sp,#4*(8) - str r4,[r14],#16 @ store output -# ifdef __thumb2__ - itt hs -# endif - eorhs r6,r6,r10 - eorhs r7,r7,r11 - str r5,[r14,#-12] - ldmia r8,{r8,r9,r10,r11} @ load key material - str r6,[r14,#-8] - add r0,sp,#4*(16+8) - str r7,[r14,#-4] - - ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half - - add r0,r0,r8 @ accumulate key material - add r1,r1,r9 -# ifdef __thumb2__ - itt hs -# endif - ldrhs r8,[r12],#16 @ load input - ldrhs r9,[r12,#-12] -# ifdef __thumb2__ - itt hi -# endif - strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it - strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it - add r2,r2,r10 - add r3,r3,r11 -# ifdef __thumb2__ - itt hs -# endif - ldrhs r10,[r12,#-8] - ldrhs r11,[r12,#-4] -# if __ARM_ARCH__>=6 && defined(__ARMEB__) - rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 -# endif -# ifdef __thumb2__ - itt hs -# endif - eorhs r0,r0,r8 - eorhs r1,r1,r9 - add r8,sp,#4*(12) - str r0,[r14],#16 @ store output -# ifdef __thumb2__ - itt hs -# endif - eorhs r2,r2,r10 - eorhs r3,r3,r11 - str r1,[r14,#-12] - ldmia r8,{r8,r9,r10,r11} @ load key material - str r2,[r14,#-8] - str r3,[r14,#-4] - - add r4,r4,r8 @ accumulate key material - add r5,r5,r9 -# ifdef __thumb2__ - itt hi -# endif - addhi r8,r8,#1 @ next counter value - strhi r8,[sp,#4*(12)] @ save next counter value -# ifdef __thumb2__ - itt hs -# endif - ldrhs r8,[r12],#16 @ load input - ldrhs r9,[r12,#-12] - add r6,r6,r10 - add r7,r7,r11 -# ifdef __thumb2__ - itt hs -# endif - ldrhs r10,[r12,#-8] - ldrhs r11,[r12,#-4] -# if __ARM_ARCH__>=6 && defined(__ARMEB__) - rev r4,r4 - rev r5,r5 - rev r6,r6 - rev r7,r7 -# endif -# ifdef __thumb2__ - itt hs -# endif - eorhs r4,r4,r8 - eorhs r5,r5,r9 -# ifdef __thumb2__ - it ne -# endif - ldrne r8,[sp,#4*(32+2)] @ re-load len -# ifdef __thumb2__ - itt hs -# endif - eorhs r6,r6,r10 - eorhs r7,r7,r11 - str 
r4,[r14],#16 @ store output - str r5,[r14,#-12] -# ifdef __thumb2__ - it hs -# endif - subhs r11,r8,#64 @ len-=64 - str r6,[r14,#-8] - str r7,[r14,#-4] - bhi .Loop_outer - - beq .Ldone -# if __ARM_ARCH__<7 - b .Ltail - -.align 4 -.Lunaligned:@ unaligned endian-neutral path - cmp r11,#64 @ restore flags -# endif -#endif -#if __ARM_ARCH__<7 - ldr r11,[sp,#4*(3)] - add r0,r0,r8 @ accumulate key material - add r1,r1,r9 - add r2,r2,r10 -# ifdef __thumb2__ - itete lo -# endif - eorlo r8,r8,r8 @ zero or ... - ldrhsb r8,[r12],#16 @ ... load input - eorlo r9,r9,r9 - ldrhsb r9,[r12,#-12] - - add r3,r3,r11 -# ifdef __thumb2__ - itete lo -# endif - eorlo r10,r10,r10 - ldrhsb r10,[r12,#-8] - eorlo r11,r11,r11 - ldrhsb r11,[r12,#-4] - - eor r0,r8,r0 @ xor with input (or zero) - eor r1,r9,r1 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-15] @ load more input - ldrhsb r9,[r12,#-11] - eor r2,r10,r2 - strb r0,[r14],#16 @ store output - eor r3,r11,r3 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-7] - ldrhsb r11,[r12,#-3] - strb r1,[r14,#-12] - eor r0,r8,r0,lsr#8 - strb r2,[r14,#-8] - eor r1,r9,r1,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-14] @ load more input - ldrhsb r9,[r12,#-10] - strb r3,[r14,#-4] - eor r2,r10,r2,lsr#8 - strb r0,[r14,#-15] - eor r3,r11,r3,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-6] - ldrhsb r11,[r12,#-2] - strb r1,[r14,#-11] - eor r0,r8,r0,lsr#8 - strb r2,[r14,#-7] - eor r1,r9,r1,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-13] @ load more input - ldrhsb r9,[r12,#-9] - strb r3,[r14,#-3] - eor r2,r10,r2,lsr#8 - strb r0,[r14,#-14] - eor r3,r11,r3,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-5] - ldrhsb r11,[r12,#-1] - strb r1,[r14,#-10] - strb r2,[r14,#-6] - eor r0,r8,r0,lsr#8 - strb r3,[r14,#-2] - eor r1,r9,r1,lsr#8 - strb r0,[r14,#-13] - eor r2,r10,r2,lsr#8 - strb r1,[r14,#-9] - eor r3,r11,r3,lsr#8 - strb r2,[r14,#-5] - strb r3,[r14,#-1] - add r8,sp,#4*(4+0) - 
ldmia r8,{r8,r9,r10,r11} @ load key material - add r0,sp,#4*(16+8) - add r4,r4,r8 @ accumulate key material - add r5,r5,r9 - add r6,r6,r10 -# ifdef __thumb2__ - itete lo -# endif - eorlo r8,r8,r8 @ zero or ... - ldrhsb r8,[r12],#16 @ ... load input - eorlo r9,r9,r9 - ldrhsb r9,[r12,#-12] - - add r7,r7,r11 -# ifdef __thumb2__ - itete lo -# endif - eorlo r10,r10,r10 - ldrhsb r10,[r12,#-8] - eorlo r11,r11,r11 - ldrhsb r11,[r12,#-4] - - eor r4,r8,r4 @ xor with input (or zero) - eor r5,r9,r5 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-15] @ load more input - ldrhsb r9,[r12,#-11] - eor r6,r10,r6 - strb r4,[r14],#16 @ store output - eor r7,r11,r7 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-7] - ldrhsb r11,[r12,#-3] - strb r5,[r14,#-12] - eor r4,r8,r4,lsr#8 - strb r6,[r14,#-8] - eor r5,r9,r5,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-14] @ load more input - ldrhsb r9,[r12,#-10] - strb r7,[r14,#-4] - eor r6,r10,r6,lsr#8 - strb r4,[r14,#-15] - eor r7,r11,r7,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-6] - ldrhsb r11,[r12,#-2] - strb r5,[r14,#-11] - eor r4,r8,r4,lsr#8 - strb r6,[r14,#-7] - eor r5,r9,r5,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-13] @ load more input - ldrhsb r9,[r12,#-9] - strb r7,[r14,#-3] - eor r6,r10,r6,lsr#8 - strb r4,[r14,#-14] - eor r7,r11,r7,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-5] - ldrhsb r11,[r12,#-1] - strb r5,[r14,#-10] - strb r6,[r14,#-6] - eor r4,r8,r4,lsr#8 - strb r7,[r14,#-2] - eor r5,r9,r5,lsr#8 - strb r4,[r14,#-13] - eor r6,r10,r6,lsr#8 - strb r5,[r14,#-9] - eor r7,r11,r7,lsr#8 - strb r6,[r14,#-5] - strb r7,[r14,#-1] - add r8,sp,#4*(4+4) - ldmia r8,{r8,r9,r10,r11} @ load key material - ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half -# ifdef __thumb2__ - itt hi -# endif - strhi r10,[sp,#4*(16+10)] @ copy "rx" - strhi r11,[sp,#4*(16+11)] @ copy "rx" - add r0,r0,r8 @ accumulate key material - add r1,r1,r9 - add r2,r2,r10 -# 
ifdef __thumb2__ - itete lo -# endif - eorlo r8,r8,r8 @ zero or ... - ldrhsb r8,[r12],#16 @ ... load input - eorlo r9,r9,r9 - ldrhsb r9,[r12,#-12] - - add r3,r3,r11 -# ifdef __thumb2__ - itete lo -# endif - eorlo r10,r10,r10 - ldrhsb r10,[r12,#-8] - eorlo r11,r11,r11 - ldrhsb r11,[r12,#-4] - - eor r0,r8,r0 @ xor with input (or zero) - eor r1,r9,r1 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-15] @ load more input - ldrhsb r9,[r12,#-11] - eor r2,r10,r2 - strb r0,[r14],#16 @ store output - eor r3,r11,r3 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-7] - ldrhsb r11,[r12,#-3] - strb r1,[r14,#-12] - eor r0,r8,r0,lsr#8 - strb r2,[r14,#-8] - eor r1,r9,r1,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-14] @ load more input - ldrhsb r9,[r12,#-10] - strb r3,[r14,#-4] - eor r2,r10,r2,lsr#8 - strb r0,[r14,#-15] - eor r3,r11,r3,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-6] - ldrhsb r11,[r12,#-2] - strb r1,[r14,#-11] - eor r0,r8,r0,lsr#8 - strb r2,[r14,#-7] - eor r1,r9,r1,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-13] @ load more input - ldrhsb r9,[r12,#-9] - strb r3,[r14,#-3] - eor r2,r10,r2,lsr#8 - strb r0,[r14,#-14] - eor r3,r11,r3,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-5] - ldrhsb r11,[r12,#-1] - strb r1,[r14,#-10] - strb r2,[r14,#-6] - eor r0,r8,r0,lsr#8 - strb r3,[r14,#-2] - eor r1,r9,r1,lsr#8 - strb r0,[r14,#-13] - eor r2,r10,r2,lsr#8 - strb r1,[r14,#-9] - eor r3,r11,r3,lsr#8 - strb r2,[r14,#-5] - strb r3,[r14,#-1] - add r8,sp,#4*(4+8) - ldmia r8,{r8,r9,r10,r11} @ load key material - add r4,r4,r8 @ accumulate key material -# ifdef __thumb2__ - itt hi -# endif - addhi r8,r8,#1 @ next counter value - strhi r8,[sp,#4*(12)] @ save next counter value - add r5,r5,r9 - add r6,r6,r10 -# ifdef __thumb2__ - itete lo -# endif - eorlo r8,r8,r8 @ zero or ... - ldrhsb r8,[r12],#16 @ ... 
load input - eorlo r9,r9,r9 - ldrhsb r9,[r12,#-12] - - add r7,r7,r11 -# ifdef __thumb2__ - itete lo -# endif - eorlo r10,r10,r10 - ldrhsb r10,[r12,#-8] - eorlo r11,r11,r11 - ldrhsb r11,[r12,#-4] - - eor r4,r8,r4 @ xor with input (or zero) - eor r5,r9,r5 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-15] @ load more input - ldrhsb r9,[r12,#-11] - eor r6,r10,r6 - strb r4,[r14],#16 @ store output - eor r7,r11,r7 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-7] - ldrhsb r11,[r12,#-3] - strb r5,[r14,#-12] - eor r4,r8,r4,lsr#8 - strb r6,[r14,#-8] - eor r5,r9,r5,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-14] @ load more input - ldrhsb r9,[r12,#-10] - strb r7,[r14,#-4] - eor r6,r10,r6,lsr#8 - strb r4,[r14,#-15] - eor r7,r11,r7,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-6] - ldrhsb r11,[r12,#-2] - strb r5,[r14,#-11] - eor r4,r8,r4,lsr#8 - strb r6,[r14,#-7] - eor r5,r9,r5,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r8,[r12,#-13] @ load more input - ldrhsb r9,[r12,#-9] - strb r7,[r14,#-3] - eor r6,r10,r6,lsr#8 - strb r4,[r14,#-14] - eor r7,r11,r7,lsr#8 -# ifdef __thumb2__ - itt hs -# endif - ldrhsb r10,[r12,#-5] - ldrhsb r11,[r12,#-1] - strb r5,[r14,#-10] - strb r6,[r14,#-6] - eor r4,r8,r4,lsr#8 - strb r7,[r14,#-2] - eor r5,r9,r5,lsr#8 - strb r4,[r14,#-13] - eor r6,r10,r6,lsr#8 - strb r5,[r14,#-9] - eor r7,r11,r7,lsr#8 - strb r6,[r14,#-5] - strb r7,[r14,#-1] -# ifdef __thumb2__ - it ne -# endif - ldrne r8,[sp,#4*(32+2)] @ re-load len -# ifdef __thumb2__ - it hs -# endif - subhs r11,r8,#64 @ len-=64 - bhi .Loop_outer - - beq .Ldone -#endif - -.Ltail: - ldr r12,[sp,#4*(32+1)] @ load inp - add r9,sp,#4*(0) - ldr r14,[sp,#4*(32+0)] @ load out - -.Loop_tail: - ldrb r10,[r9],#1 @ read buffer on stack - ldrb r11,[r12],#1 @ read input - subs r8,r8,#1 - eor r11,r11,r10 - strb r11,[r14],#1 @ store output - bne .Loop_tail - -.Ldone: - add sp,sp,#4*(32+3) -.Lno_data: - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc} -.size 
ChaCha20_ctr32,.-ChaCha20_ctr32 -#if __ARM_MAX_ARCH__>=7 -.arch armv7-a -.fpu neon - -.type ChaCha20_neon,%function -.align 5 -ChaCha20_neon: - ldr r12,[sp,#0] @ pull pointer to counter and nonce - stmdb sp!,{r0,r1,r2,r4-r11,lr} -.LChaCha20_neon: - adr r14,.Lsigma - vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI spec says so - stmdb sp!,{r0,r1,r2,r3} - - vld1.32 {q1,q2},[r3] @ load key - ldmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} @ load key - - sub sp,sp,#4*(16+16) - vld1.32 {q3},[r12] @ load counter and nonce - add r12,sp,#4*8 - ldmia r14,{r0,r1,r2,r3} @ load sigma - vld1.32 {q0},[r14]! @ load sigma - vld1.32 {q12},[r14] @ one - vst1.32 {q2,q3},[r12] @ copy 1/2key|counter|nonce - vst1.32 {q0,q1},[sp] @ copy sigma|1/2key - - str r10,[sp,#4*(16+10)] @ off-load "rx" - str r11,[sp,#4*(16+11)] @ off-load "rx" - vshl.i32 d26,d24,#1 @ two - vstr d24,[sp,#4*(16+0)] - vshl.i32 d28,d24,#2 @ four - vstr d26,[sp,#4*(16+2)] - vmov q4,q0 - vstr d28,[sp,#4*(16+4)] - vmov q8,q0 - vmov q5,q1 - vmov q9,q1 - b .Loop_neon_enter - -.align 4 -.Loop_neon_outer: - ldmia sp,{r0,r1,r2,r3,r4,r5,r6,r7,r8,r9} @ load key material - cmp r11,#64*2 @ if len<=64*2 - bls .Lbreak_neon @ switch to integer-only - vmov q4,q0 - str r11,[sp,#4*(32+2)] @ save len - vmov q8,q0 - str r12, [sp,#4*(32+1)] @ save inp - vmov q5,q1 - str r14, [sp,#4*(32+0)] @ save out - vmov q9,q1 -.Loop_neon_enter: - ldr r11, [sp,#4*(15)] - vadd.i32 q7,q3,q12 @ counter+1 - ldr r12,[sp,#4*(12)] @ modulo-scheduled load - vmov q6,q2 - ldr r10, [sp,#4*(13)] - vmov q10,q2 - ldr r14,[sp,#4*(14)] - vadd.i32 q11,q7,q12 @ counter+2 - str r11, [sp,#4*(16+15)] - mov r11,#10 - add r12,r12,#3 @ counter+3 - b .Loop_neon - -.align 4 -.Loop_neon: - subs r11,r11,#1 - vadd.i32 q0,q0,q1 - add r0,r0,r4 - vadd.i32 q4,q4,q5 - mov r12,r12,ror#16 - vadd.i32 q8,q8,q9 - add r1,r1,r5 - veor q3,q3,q0 - mov r10,r10,ror#16 - veor q7,q7,q4 - eor r12,r12,r0,ror#16 - veor q11,q11,q8 - eor r10,r10,r1,ror#16 - vrev32.16 q3,q3 - add r8,r8,r12 - vrev32.16 q7,q7 - mov 
r4,r4,ror#20 - vrev32.16 q11,q11 - add r9,r9,r10 - vadd.i32 q2,q2,q3 - mov r5,r5,ror#20 - vadd.i32 q6,q6,q7 - eor r4,r4,r8,ror#20 - vadd.i32 q10,q10,q11 - eor r5,r5,r9,ror#20 - veor q12,q1,q2 - add r0,r0,r4 - veor q13,q5,q6 - mov r12,r12,ror#24 - veor q14,q9,q10 - add r1,r1,r5 - vshr.u32 q1,q12,#20 - mov r10,r10,ror#24 - vshr.u32 q5,q13,#20 - eor r12,r12,r0,ror#24 - vshr.u32 q9,q14,#20 - eor r10,r10,r1,ror#24 - vsli.32 q1,q12,#12 - add r8,r8,r12 - vsli.32 q5,q13,#12 - mov r4,r4,ror#25 - vsli.32 q9,q14,#12 - add r9,r9,r10 - vadd.i32 q0,q0,q1 - mov r5,r5,ror#25 - vadd.i32 q4,q4,q5 - str r10,[sp,#4*(16+13)] - vadd.i32 q8,q8,q9 - ldr r10,[sp,#4*(16+15)] - veor q12,q3,q0 - eor r4,r4,r8,ror#25 - veor q13,q7,q4 - eor r5,r5,r9,ror#25 - veor q14,q11,q8 - str r8,[sp,#4*(16+8)] - vshr.u32 q3,q12,#24 - ldr r8,[sp,#4*(16+10)] - vshr.u32 q7,q13,#24 - add r2,r2,r6 - vshr.u32 q11,q14,#24 - mov r14,r14,ror#16 - vsli.32 q3,q12,#8 - str r9,[sp,#4*(16+9)] - vsli.32 q7,q13,#8 - ldr r9,[sp,#4*(16+11)] - vsli.32 q11,q14,#8 - add r3,r3,r7 - vadd.i32 q2,q2,q3 - mov r10,r10,ror#16 - vadd.i32 q6,q6,q7 - eor r14,r14,r2,ror#16 - vadd.i32 q10,q10,q11 - eor r10,r10,r3,ror#16 - veor q12,q1,q2 - add r8,r8,r14 - veor q13,q5,q6 - mov r6,r6,ror#20 - veor q14,q9,q10 - add r9,r9,r10 - vshr.u32 q1,q12,#25 - mov r7,r7,ror#20 - vshr.u32 q5,q13,#25 - eor r6,r6,r8,ror#20 - vshr.u32 q9,q14,#25 - eor r7,r7,r9,ror#20 - vsli.32 q1,q12,#7 - add r2,r2,r6 - vsli.32 q5,q13,#7 - mov r14,r14,ror#24 - vsli.32 q9,q14,#7 - add r3,r3,r7 - vext.8 q2,q2,q2,#8 - mov r10,r10,ror#24 - vext.8 q6,q6,q6,#8 - eor r14,r14,r2,ror#24 - vext.8 q10,q10,q10,#8 - eor r10,r10,r3,ror#24 - vext.8 q1,q1,q1,#4 - add r8,r8,r14 - vext.8 q5,q5,q5,#4 - mov r6,r6,ror#25 - vext.8 q9,q9,q9,#4 - add r9,r9,r10 - vext.8 q3,q3,q3,#12 - mov r7,r7,ror#25 - vext.8 q7,q7,q7,#12 - eor r6,r6,r8,ror#25 - vext.8 q11,q11,q11,#12 - eor r7,r7,r9,ror#25 - vadd.i32 q0,q0,q1 - add r0,r0,r5 - vadd.i32 q4,q4,q5 - mov r10,r10,ror#16 - vadd.i32 q8,q8,q9 - add r1,r1,r6 - 
veor q3,q3,q0 - mov r12,r12,ror#16 - veor q7,q7,q4 - eor r10,r10,r0,ror#16 - veor q11,q11,q8 - eor r12,r12,r1,ror#16 - vrev32.16 q3,q3 - add r8,r8,r10 - vrev32.16 q7,q7 - mov r5,r5,ror#20 - vrev32.16 q11,q11 - add r9,r9,r12 - vadd.i32 q2,q2,q3 - mov r6,r6,ror#20 - vadd.i32 q6,q6,q7 - eor r5,r5,r8,ror#20 - vadd.i32 q10,q10,q11 - eor r6,r6,r9,ror#20 - veor q12,q1,q2 - add r0,r0,r5 - veor q13,q5,q6 - mov r10,r10,ror#24 - veor q14,q9,q10 - add r1,r1,r6 - vshr.u32 q1,q12,#20 - mov r12,r12,ror#24 - vshr.u32 q5,q13,#20 - eor r10,r10,r0,ror#24 - vshr.u32 q9,q14,#20 - eor r12,r12,r1,ror#24 - vsli.32 q1,q12,#12 - add r8,r8,r10 - vsli.32 q5,q13,#12 - mov r5,r5,ror#25 - vsli.32 q9,q14,#12 - str r10,[sp,#4*(16+15)] - vadd.i32 q0,q0,q1 - ldr r10,[sp,#4*(16+13)] - vadd.i32 q4,q4,q5 - add r9,r9,r12 - vadd.i32 q8,q8,q9 - mov r6,r6,ror#25 - veor q12,q3,q0 - eor r5,r5,r8,ror#25 - veor q13,q7,q4 - eor r6,r6,r9,ror#25 - veor q14,q11,q8 - str r8,[sp,#4*(16+10)] - vshr.u32 q3,q12,#24 - ldr r8,[sp,#4*(16+8)] - vshr.u32 q7,q13,#24 - add r2,r2,r7 - vshr.u32 q11,q14,#24 - mov r10,r10,ror#16 - vsli.32 q3,q12,#8 - str r9,[sp,#4*(16+11)] - vsli.32 q7,q13,#8 - ldr r9,[sp,#4*(16+9)] - vsli.32 q11,q14,#8 - add r3,r3,r4 - vadd.i32 q2,q2,q3 - mov r14,r14,ror#16 - vadd.i32 q6,q6,q7 - eor r10,r10,r2,ror#16 - vadd.i32 q10,q10,q11 - eor r14,r14,r3,ror#16 - veor q12,q1,q2 - add r8,r8,r10 - veor q13,q5,q6 - mov r7,r7,ror#20 - veor q14,q9,q10 - add r9,r9,r14 - vshr.u32 q1,q12,#25 - mov r4,r4,ror#20 - vshr.u32 q5,q13,#25 - eor r7,r7,r8,ror#20 - vshr.u32 q9,q14,#25 - eor r4,r4,r9,ror#20 - vsli.32 q1,q12,#7 - add r2,r2,r7 - vsli.32 q5,q13,#7 - mov r10,r10,ror#24 - vsli.32 q9,q14,#7 - add r3,r3,r4 - vext.8 q2,q2,q2,#8 - mov r14,r14,ror#24 - vext.8 q6,q6,q6,#8 - eor r10,r10,r2,ror#24 - vext.8 q10,q10,q10,#8 - eor r14,r14,r3,ror#24 - vext.8 q1,q1,q1,#12 - add r8,r8,r10 - vext.8 q5,q5,q5,#12 - mov r7,r7,ror#25 - vext.8 q9,q9,q9,#12 - add r9,r9,r14 - vext.8 q3,q3,q3,#4 - mov r4,r4,ror#25 - vext.8 q7,q7,q7,#4 - eor 
r7,r7,r8,ror#25 - vext.8 q11,q11,q11,#4 - eor r4,r4,r9,ror#25 - bne .Loop_neon - - add r11,sp,#32 - vld1.32 {q12,q13},[sp] @ load key material - vld1.32 {q14,q15},[r11] - - ldr r11,[sp,#4*(32+2)] @ load len - - str r8, [sp,#4*(16+8)] @ modulo-scheduled store - str r9, [sp,#4*(16+9)] - str r12,[sp,#4*(16+12)] - str r10, [sp,#4*(16+13)] - str r14,[sp,#4*(16+14)] - - @ at this point we have first half of 512-bit result in - @ rx and second half at sp+4*(16+8) - - ldr r12,[sp,#4*(32+1)] @ load inp - ldr r14,[sp,#4*(32+0)] @ load out - - vadd.i32 q0,q0,q12 @ accumulate key material - vadd.i32 q4,q4,q12 - vadd.i32 q8,q8,q12 - vldr d24,[sp,#4*(16+0)] @ one - - vadd.i32 q1,q1,q13 - vadd.i32 q5,q5,q13 - vadd.i32 q9,q9,q13 - vldr d26,[sp,#4*(16+2)] @ two - - vadd.i32 q2,q2,q14 - vadd.i32 q6,q6,q14 - vadd.i32 q10,q10,q14 - vadd.i32 d14,d14,d24 @ counter+1 - vadd.i32 d22,d22,d26 @ counter+2 - - vadd.i32 q3,q3,q15 - vadd.i32 q7,q7,q15 - vadd.i32 q11,q11,q15 - - cmp r11,#64*4 - blo .Ltail_neon - - vld1.8 {q12,q13},[r12]! @ load input - mov r11,sp - vld1.8 {q14,q15},[r12]! - veor q0,q0,q12 @ xor with input - veor q1,q1,q13 - vld1.8 {q12,q13},[r12]! - veor q2,q2,q14 - veor q3,q3,q15 - vld1.8 {q14,q15},[r12]! - - veor q4,q4,q12 - vst1.8 {q0,q1},[r14]! @ store output - veor q5,q5,q13 - vld1.8 {q12,q13},[r12]! - veor q6,q6,q14 - vst1.8 {q2,q3},[r14]! - veor q7,q7,q15 - vld1.8 {q14,q15},[r12]! - - veor q8,q8,q12 - vld1.32 {q0,q1},[r11]! @ load for next iteration - veor d25,d25,d25 - vldr d24,[sp,#4*(16+4)] @ four - veor q9,q9,q13 - vld1.32 {q2,q3},[r11] - veor q10,q10,q14 - vst1.8 {q4,q5},[r14]! - veor q11,q11,q15 - vst1.8 {q6,q7},[r14]! - - vadd.i32 d6,d6,d24 @ next counter value - vldr d24,[sp,#4*(16+0)] @ one - - ldmia sp,{r8,r9,r10,r11} @ load key material - add r0,r0,r8 @ accumulate key material - ldr r8,[r12],#16 @ load input - vst1.8 {q8,q9},[r14]! - add r1,r1,r9 - ldr r9,[r12,#-12] - vst1.8 {q10,q11},[r14]! 
- add r2,r2,r10 - ldr r10,[r12,#-8] - add r3,r3,r11 - ldr r11,[r12,#-4] -# ifdef __ARMEB__ - rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 -# endif - eor r0,r0,r8 @ xor with input - add r8,sp,#4*(4) - eor r1,r1,r9 - str r0,[r14],#16 @ store output - eor r2,r2,r10 - str r1,[r14,#-12] - eor r3,r3,r11 - ldmia r8,{r8,r9,r10,r11} @ load key material - str r2,[r14,#-8] - str r3,[r14,#-4] - - add r4,r4,r8 @ accumulate key material - ldr r8,[r12],#16 @ load input - add r5,r5,r9 - ldr r9,[r12,#-12] - add r6,r6,r10 - ldr r10,[r12,#-8] - add r7,r7,r11 - ldr r11,[r12,#-4] -# ifdef __ARMEB__ - rev r4,r4 - rev r5,r5 - rev r6,r6 - rev r7,r7 -# endif - eor r4,r4,r8 - add r8,sp,#4*(8) - eor r5,r5,r9 - str r4,[r14],#16 @ store output - eor r6,r6,r10 - str r5,[r14,#-12] - eor r7,r7,r11 - ldmia r8,{r8,r9,r10,r11} @ load key material - str r6,[r14,#-8] - add r0,sp,#4*(16+8) - str r7,[r14,#-4] - - ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half - - add r0,r0,r8 @ accumulate key material - ldr r8,[r12],#16 @ load input - add r1,r1,r9 - ldr r9,[r12,#-12] -# ifdef __thumb2__ - it hi -# endif - strhi r10,[sp,#4*(16+10)] @ copy "rx" while at it - add r2,r2,r10 - ldr r10,[r12,#-8] -# ifdef __thumb2__ - it hi -# endif - strhi r11,[sp,#4*(16+11)] @ copy "rx" while at it - add r3,r3,r11 - ldr r11,[r12,#-4] -# ifdef __ARMEB__ - rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 -# endif - eor r0,r0,r8 - add r8,sp,#4*(12) - eor r1,r1,r9 - str r0,[r14],#16 @ store output - eor r2,r2,r10 - str r1,[r14,#-12] - eor r3,r3,r11 - ldmia r8,{r8,r9,r10,r11} @ load key material - str r2,[r14,#-8] - str r3,[r14,#-4] - - add r4,r4,r8 @ accumulate key material - add r8,r8,#4 @ next counter value - add r5,r5,r9 - str r8,[sp,#4*(12)] @ save next counter value - ldr r8,[r12],#16 @ load input - add r6,r6,r10 - add r4,r4,#3 @ counter+3 - ldr r9,[r12,#-12] - add r7,r7,r11 - ldr r10,[r12,#-8] - ldr r11,[r12,#-4] -# ifdef __ARMEB__ - rev r4,r4 - rev r5,r5 - rev r6,r6 - rev r7,r7 -# endif - eor r4,r4,r8 -# ifdef 
__thumb2__ - it hi -# endif - ldrhi r8,[sp,#4*(32+2)] @ re-load len - eor r5,r5,r9 - eor r6,r6,r10 - str r4,[r14],#16 @ store output - eor r7,r7,r11 - str r5,[r14,#-12] - sub r11,r8,#64*4 @ len-=64*4 - str r6,[r14,#-8] - str r7,[r14,#-4] - bhi .Loop_neon_outer - - b .Ldone_neon - -.align 4 -.Lbreak_neon: - @ harmonize NEON and integer-only stack frames: load data - @ from NEON frame, but save to integer-only one; distance - @ between the two is 4*(32+4+16-32)=4*(20). - - str r11, [sp,#4*(20+32+2)] @ save len - add r11,sp,#4*(32+4) - str r12, [sp,#4*(20+32+1)] @ save inp - str r14, [sp,#4*(20+32+0)] @ save out - - ldr r12,[sp,#4*(16+10)] - ldr r14,[sp,#4*(16+11)] - vldmia r11,{d8,d9,d10,d11,d12,d13,d14,d15} @ fulfill ABI requirement - str r12,[sp,#4*(20+16+10)] @ copy "rx" - str r14,[sp,#4*(20+16+11)] @ copy "rx" - - ldr r11, [sp,#4*(15)] - ldr r12,[sp,#4*(12)] @ modulo-scheduled load - ldr r10, [sp,#4*(13)] - ldr r14,[sp,#4*(14)] - str r11, [sp,#4*(20+16+15)] - add r11,sp,#4*(20) - vst1.32 {q0,q1},[r11]! @ copy key - add sp,sp,#4*(20) @ switch frame - vst1.32 {q2,q3},[r11] - mov r11,#10 - b .Loop @ go integer-only - -.align 4 -.Ltail_neon: - cmp r11,#64*3 - bhs .L192_or_more_neon - cmp r11,#64*2 - bhs .L128_or_more_neon - cmp r11,#64*1 - bhs .L64_or_more_neon - - add r8,sp,#4*(8) - vst1.8 {q0,q1},[sp] - add r10,sp,#4*(0) - vst1.8 {q2,q3},[r8] - b .Loop_tail_neon - -.align 4 -.L64_or_more_neon: - vld1.8 {q12,q13},[r12]! - vld1.8 {q14,q15},[r12]! - veor q0,q0,q12 - veor q1,q1,q13 - veor q2,q2,q14 - veor q3,q3,q15 - vst1.8 {q0,q1},[r14]! - vst1.8 {q2,q3},[r14]! - - beq .Ldone_neon - - add r8,sp,#4*(8) - vst1.8 {q4,q5},[sp] - add r10,sp,#4*(0) - vst1.8 {q6,q7},[r8] - sub r11,r11,#64*1 @ len-=64*1 - b .Loop_tail_neon - -.align 4 -.L128_or_more_neon: - vld1.8 {q12,q13},[r12]! - vld1.8 {q14,q15},[r12]! - veor q0,q0,q12 - veor q1,q1,q13 - vld1.8 {q12,q13},[r12]! - veor q2,q2,q14 - veor q3,q3,q15 - vld1.8 {q14,q15},[r12]! 
- - veor q4,q4,q12 - veor q5,q5,q13 - vst1.8 {q0,q1},[r14]! - veor q6,q6,q14 - vst1.8 {q2,q3},[r14]! - veor q7,q7,q15 - vst1.8 {q4,q5},[r14]! - vst1.8 {q6,q7},[r14]! - - beq .Ldone_neon - - add r8,sp,#4*(8) - vst1.8 {q8,q9},[sp] - add r10,sp,#4*(0) - vst1.8 {q10,q11},[r8] - sub r11,r11,#64*2 @ len-=64*2 - b .Loop_tail_neon - -.align 4 -.L192_or_more_neon: - vld1.8 {q12,q13},[r12]! - vld1.8 {q14,q15},[r12]! - veor q0,q0,q12 - veor q1,q1,q13 - vld1.8 {q12,q13},[r12]! - veor q2,q2,q14 - veor q3,q3,q15 - vld1.8 {q14,q15},[r12]! - - veor q4,q4,q12 - veor q5,q5,q13 - vld1.8 {q12,q13},[r12]! - veor q6,q6,q14 - vst1.8 {q0,q1},[r14]! - veor q7,q7,q15 - vld1.8 {q14,q15},[r12]! - - veor q8,q8,q12 - vst1.8 {q2,q3},[r14]! - veor q9,q9,q13 - vst1.8 {q4,q5},[r14]! - veor q10,q10,q14 - vst1.8 {q6,q7},[r14]! - veor q11,q11,q15 - vst1.8 {q8,q9},[r14]! - vst1.8 {q10,q11},[r14]! - - beq .Ldone_neon - - ldmia sp,{r8,r9,r10,r11} @ load key material - add r0,r0,r8 @ accumulate key material - add r8,sp,#4*(4) - add r1,r1,r9 - add r2,r2,r10 - add r3,r3,r11 - ldmia r8,{r8,r9,r10,r11} @ load key material - - add r4,r4,r8 @ accumulate key material - add r8,sp,#4*(8) - add r5,r5,r9 - add r6,r6,r10 - add r7,r7,r11 - ldmia r8,{r8,r9,r10,r11} @ load key material -# ifdef __ARMEB__ - rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 - rev r4,r4 - rev r5,r5 - rev r6,r6 - rev r7,r7 -# endif - stmia sp,{r0,r1,r2,r3,r4,r5,r6,r7} - add r0,sp,#4*(16+8) - - ldmia r0,{r0,r1,r2,r3,r4,r5,r6,r7} @ load second half - - add r0,r0,r8 @ accumulate key material - add r8,sp,#4*(12) - add r1,r1,r9 - add r2,r2,r10 - add r3,r3,r11 - ldmia r8,{r8,r9,r10,r11} @ load key material - - add r4,r4,r8 @ accumulate key material - add r8,sp,#4*(8) - add r5,r5,r9 - add r4,r4,#3 @ counter+3 - add r6,r6,r10 - add r7,r7,r11 - ldr r11,[sp,#4*(32+2)] @ re-load len -# ifdef __ARMEB__ - rev r0,r0 - rev r1,r1 - rev r2,r2 - rev r3,r3 - rev r4,r4 - rev r5,r5 - rev r6,r6 - rev r7,r7 -# endif - stmia r8,{r0,r1,r2,r3,r4,r5,r6,r7} - add 
r10,sp,#4*(0) - sub r11,r11,#64*3 @ len-=64*3 - -.Loop_tail_neon: - ldrb r8,[r10],#1 @ read buffer on stack - ldrb r9,[r12],#1 @ read input - subs r11,r11,#1 - eor r8,r8,r9 - strb r8,[r14],#1 @ store output - bne .Loop_tail_neon - -.Ldone_neon: - add sp,sp,#4*(32+4) - vldmia sp,{d8,d9,d10,d11,d12,d13,d14,d15} - add sp,sp,#4*(16+3) - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc} -.size ChaCha20_neon,.-ChaCha20_neon -.comm OPENSSL_armcap_P,4,4 -#endif -#endif -#endif // !OPENSSL_NO_ASM -.section .note.GNU-stack,"",%progbits diff --git a/third_party/boringssl/linux-arm/crypto/fipsmodule/aesv8-armx32.S b/third_party/boringssl/linux-arm/crypto/fipsmodule/aesv8-armx32.S deleted file mode 100644 index 30c6525d..00000000 --- a/third_party/boringssl/linux-arm/crypto/fipsmodule/aesv8-armx32.S +++ /dev/null @@ -1,800 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(__arm__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -#include - -#if __ARM_MAX_ARCH__>=7 -.text -.arch armv7-a @ don't confuse not-so-latest binutils with argv8 :-) -.fpu neon -.code 32 -#undef __thumb2__ -.align 5 -.Lrcon: -.long 0x01,0x01,0x01,0x01 -.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d @ rotate-n-splat -.long 0x1b,0x1b,0x1b,0x1b - -.text - -.globl aes_hw_set_encrypt_key -.hidden aes_hw_set_encrypt_key -.type aes_hw_set_encrypt_key,%function -.align 5 -aes_hw_set_encrypt_key: -.Lenc_key: - mov r3,#-1 - cmp r0,#0 - beq .Lenc_key_abort - cmp r2,#0 - beq .Lenc_key_abort - mov r3,#-2 - cmp r1,#128 - blt .Lenc_key_abort - cmp r1,#256 - bgt .Lenc_key_abort - tst r1,#0x3f - bne .Lenc_key_abort - - adr r3,.Lrcon - cmp r1,#192 - - veor q0,q0,q0 - vld1.8 {q3},[r0]! - mov r1,#8 @ reuse r1 - vld1.32 {q1,q2},[r3]! 
- - blt .Loop128 - beq .L192 - b .L256 - -.align 4 -.Loop128: - vtbl.8 d20,{q3},d4 - vtbl.8 d21,{q3},d5 - vext.8 q9,q0,q3,#12 - vst1.32 {q3},[r2]! -.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 - subs r1,r1,#1 - - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q10,q10,q1 - veor q3,q3,q9 - vshl.u8 q1,q1,#1 - veor q3,q3,q10 - bne .Loop128 - - vld1.32 {q1},[r3] - - vtbl.8 d20,{q3},d4 - vtbl.8 d21,{q3},d5 - vext.8 q9,q0,q3,#12 - vst1.32 {q3},[r2]! -.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 - - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q10,q10,q1 - veor q3,q3,q9 - vshl.u8 q1,q1,#1 - veor q3,q3,q10 - - vtbl.8 d20,{q3},d4 - vtbl.8 d21,{q3},d5 - vext.8 q9,q0,q3,#12 - vst1.32 {q3},[r2]! -.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 - - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q10,q10,q1 - veor q3,q3,q9 - veor q3,q3,q10 - vst1.32 {q3},[r2] - add r2,r2,#0x50 - - mov r12,#10 - b .Ldone - -.align 4 -.L192: - vld1.8 {d16},[r0]! - vmov.i8 q10,#8 @ borrow q10 - vst1.32 {q3},[r2]! - vsub.i8 q2,q2,q10 @ adjust the mask - -.Loop192: - vtbl.8 d20,{q8},d4 - vtbl.8 d21,{q8},d5 - vext.8 q9,q0,q3,#12 - vst1.32 {d16},[r2]! -.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 - subs r1,r1,#1 - - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q3,q3,q9 - - vdup.32 q9,d7[1] - veor q9,q9,q8 - veor q10,q10,q1 - vext.8 q8,q0,q8,#12 - vshl.u8 q1,q1,#1 - veor q8,q8,q9 - veor q3,q3,q10 - veor q8,q8,q10 - vst1.32 {q3},[r2]! - bne .Loop192 - - mov r12,#12 - add r2,r2,#0x20 - b .Ldone - -.align 4 -.L256: - vld1.8 {q8},[r0] - mov r1,#7 - mov r12,#14 - vst1.32 {q3},[r2]! - -.Loop256: - vtbl.8 d20,{q8},d4 - vtbl.8 d21,{q8},d5 - vext.8 q9,q0,q3,#12 - vst1.32 {q8},[r2]! 
-.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 - subs r1,r1,#1 - - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q3,q3,q9 - vext.8 q9,q0,q9,#12 - veor q10,q10,q1 - veor q3,q3,q9 - vshl.u8 q1,q1,#1 - veor q3,q3,q10 - vst1.32 {q3},[r2]! - beq .Ldone - - vdup.32 q10,d7[1] - vext.8 q9,q0,q8,#12 -.byte 0x00,0x43,0xf0,0xf3 @ aese q10,q0 - - veor q8,q8,q9 - vext.8 q9,q0,q9,#12 - veor q8,q8,q9 - vext.8 q9,q0,q9,#12 - veor q8,q8,q9 - - veor q8,q8,q10 - b .Loop256 - -.Ldone: - str r12,[r2] - mov r3,#0 - -.Lenc_key_abort: - mov r0,r3 @ return value - - bx lr -.size aes_hw_set_encrypt_key,.-aes_hw_set_encrypt_key - -.globl aes_hw_set_decrypt_key -.hidden aes_hw_set_decrypt_key -.type aes_hw_set_decrypt_key,%function -.align 5 -aes_hw_set_decrypt_key: - stmdb sp!,{r4,lr} - bl .Lenc_key - - cmp r0,#0 - bne .Ldec_key_abort - - sub r2,r2,#240 @ restore original r2 - mov r4,#-16 - add r0,r2,r12,lsl#4 @ end of key schedule - - vld1.32 {q0},[r2] - vld1.32 {q1},[r0] - vst1.32 {q0},[r0],r4 - vst1.32 {q1},[r2]! - -.Loop_imc: - vld1.32 {q0},[r2] - vld1.32 {q1},[r0] -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 - vst1.32 {q0},[r0],r4 - vst1.32 {q1},[r2]! - cmp r0,r2 - bhi .Loop_imc - - vld1.32 {q0},[r2] -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 - vst1.32 {q0},[r0] - - eor r0,r0,r0 @ return value -.Ldec_key_abort: - ldmia sp!,{r4,pc} -.size aes_hw_set_decrypt_key,.-aes_hw_set_decrypt_key -.globl aes_hw_encrypt -.hidden aes_hw_encrypt -.type aes_hw_encrypt,%function -.align 5 -aes_hw_encrypt: - AARCH64_VALID_CALL_TARGET - ldr r3,[r2,#240] - vld1.32 {q0},[r2]! - vld1.8 {q2},[r0] - sub r3,r3,#2 - vld1.32 {q1},[r2]! - -.Loop_enc: -.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0 -.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 - vld1.32 {q0},[r2]! - subs r3,r3,#2 -.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1 -.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 - vld1.32 {q1},[r2]! 
- bgt .Loop_enc - -.byte 0x00,0x43,0xb0,0xf3 @ aese q2,q0 -.byte 0x84,0x43,0xb0,0xf3 @ aesmc q2,q2 - vld1.32 {q0},[r2] -.byte 0x02,0x43,0xb0,0xf3 @ aese q2,q1 - veor q2,q2,q0 - - vst1.8 {q2},[r1] - bx lr -.size aes_hw_encrypt,.-aes_hw_encrypt -.globl aes_hw_decrypt -.hidden aes_hw_decrypt -.type aes_hw_decrypt,%function -.align 5 -aes_hw_decrypt: - AARCH64_VALID_CALL_TARGET - ldr r3,[r2,#240] - vld1.32 {q0},[r2]! - vld1.8 {q2},[r0] - sub r3,r3,#2 - vld1.32 {q1},[r2]! - -.Loop_dec: -.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0 -.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 - vld1.32 {q0},[r2]! - subs r3,r3,#2 -.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1 -.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 - vld1.32 {q1},[r2]! - bgt .Loop_dec - -.byte 0x40,0x43,0xb0,0xf3 @ aesd q2,q0 -.byte 0xc4,0x43,0xb0,0xf3 @ aesimc q2,q2 - vld1.32 {q0},[r2] -.byte 0x42,0x43,0xb0,0xf3 @ aesd q2,q1 - veor q2,q2,q0 - - vst1.8 {q2},[r1] - bx lr -.size aes_hw_decrypt,.-aes_hw_decrypt -.globl aes_hw_cbc_encrypt -.hidden aes_hw_cbc_encrypt -.type aes_hw_cbc_encrypt,%function -.align 5 -aes_hw_cbc_encrypt: - mov ip,sp - stmdb sp!,{r4,r5,r6,r7,r8,lr} - vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so - ldmia ip,{r4,r5} @ load remaining args - subs r2,r2,#16 - mov r8,#16 - blo .Lcbc_abort - moveq r8,#0 - - cmp r5,#0 @ en- or decrypting? - ldr r5,[r3,#240] - and r2,r2,#-16 - vld1.8 {q6},[r4] - vld1.8 {q0},[r0],r8 - - vld1.32 {q8,q9},[r3] @ load key schedule... - sub r5,r5,#6 - add r7,r3,r5,lsl#4 @ pointer to last 7 round keys - sub r5,r5,#2 - vld1.32 {q10,q11},[r7]! - vld1.32 {q12,q13},[r7]! - vld1.32 {q14,q15},[r7]! 
- vld1.32 {q7},[r7] - - add r7,r3,#32 - mov r6,r5 - beq .Lcbc_dec - - cmp r5,#2 - veor q0,q0,q6 - veor q5,q8,q7 - beq .Lcbc_enc128 - - vld1.32 {q2,q3},[r7] - add r7,r3,#16 - add r6,r3,#16*4 - add r12,r3,#16*5 -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - add r14,r3,#16*6 - add r3,r3,#16*7 - b .Lenter_cbc_enc - -.align 4 -.Loop_cbc_enc: -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vst1.8 {q6},[r1]! -.Lenter_cbc_enc: -.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vld1.32 {q8},[r6] - cmp r5,#4 -.byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vld1.32 {q9},[r12] - beq .Lcbc_enc192 - -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vld1.32 {q8},[r14] -.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vld1.32 {q9},[r3] - nop - -.Lcbc_enc192: -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - subs r2,r2,#16 -.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - moveq r8,#0 -.byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vld1.8 {q8},[r0],r8 -.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - veor q8,q8,q5 -.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vld1.32 {q9},[r7] @ re-pre-load rndkey[1] -.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 - veor q6,q0,q7 - bhs .Loop_cbc_enc - - vst1.8 {q6},[r1]! 
- b .Lcbc_done - -.align 5 -.Lcbc_enc128: - vld1.32 {q2,q3},[r7] -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - b .Lenter_cbc_enc128 -.Loop_cbc_enc128: -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vst1.8 {q6},[r1]! -.Lenter_cbc_enc128: -.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - subs r2,r2,#16 -.byte 0x04,0x03,0xb0,0xf3 @ aese q0,q2 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - moveq r8,#0 -.byte 0x06,0x03,0xb0,0xf3 @ aese q0,q3 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x24,0x03,0xb0,0xf3 @ aese q0,q10 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x26,0x03,0xb0,0xf3 @ aese q0,q11 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - vld1.8 {q8},[r0],r8 -.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 - veor q8,q8,q5 -.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 - veor q6,q0,q7 - bhs .Loop_cbc_enc128 - - vst1.8 {q6},[r1]! - b .Lcbc_done -.align 5 -.Lcbc_dec: - vld1.8 {q10},[r0]! - subs r2,r2,#32 @ bias - add r6,r5,#2 - vorr q3,q0,q0 - vorr q1,q0,q0 - vorr q11,q10,q10 - blo .Lcbc_dec_tail - - vorr q1,q10,q10 - vld1.8 {q10},[r0]! - vorr q2,q0,q0 - vorr q3,q1,q1 - vorr q11,q10,q10 - -.Loop3x_cbc_dec: -.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8 -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - vld1.32 {q8},[r7]! - subs r6,r6,#2 -.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9 -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - vld1.32 {q9},[r7]! 
- bgt .Loop3x_cbc_dec - -.byte 0x60,0x03,0xb0,0xf3 @ aesd q0,q8 -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - veor q4,q6,q7 - subs r2,r2,#0x30 - veor q5,q2,q7 - movlo r6,r2 @ r6, r6, is zero at this point -.byte 0x62,0x03,0xb0,0xf3 @ aesd q0,q9 -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - veor q9,q3,q7 - add r0,r0,r6 @ r0 is adjusted in such way that - @ at exit from the loop q1-q10 - @ are loaded with last "words" - vorr q6,q11,q11 - mov r7,r3 -.byte 0x68,0x03,0xb0,0xf3 @ aesd q0,q12 -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - vld1.8 {q2},[r0]! -.byte 0x6a,0x03,0xb0,0xf3 @ aesd q0,q13 -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - vld1.8 {q3},[r0]! -.byte 0x6c,0x03,0xb0,0xf3 @ aesd q0,q14 -.byte 0xc0,0x03,0xb0,0xf3 @ aesimc q0,q0 -.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - vld1.8 {q11},[r0]! -.byte 0x6e,0x03,0xb0,0xf3 @ aesd q0,q15 -.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15 -.byte 0x6e,0x43,0xf0,0xf3 @ aesd q10,q15 - vld1.32 {q8},[r7]! @ re-pre-load rndkey[0] - add r6,r5,#2 - veor q4,q4,q0 - veor q5,q5,q1 - veor q10,q10,q9 - vld1.32 {q9},[r7]! @ re-pre-load rndkey[1] - vst1.8 {q4},[r1]! - vorr q0,q2,q2 - vst1.8 {q5},[r1]! - vorr q1,q3,q3 - vst1.8 {q10},[r1]! 
- vorr q10,q11,q11 - bhs .Loop3x_cbc_dec - - cmn r2,#0x30 - beq .Lcbc_done - nop - -.Lcbc_dec_tail: -.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - vld1.32 {q8},[r7]! - subs r6,r6,#2 -.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - vld1.32 {q9},[r7]! - bgt .Lcbc_dec_tail - -.byte 0x60,0x23,0xb0,0xf3 @ aesd q1,q8 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x60,0x43,0xf0,0xf3 @ aesd q10,q8 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 -.byte 0x62,0x23,0xb0,0xf3 @ aesd q1,q9 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x62,0x43,0xf0,0xf3 @ aesd q10,q9 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 -.byte 0x68,0x23,0xb0,0xf3 @ aesd q1,q12 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x68,0x43,0xf0,0xf3 @ aesd q10,q12 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - cmn r2,#0x20 -.byte 0x6a,0x23,0xb0,0xf3 @ aesd q1,q13 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x6a,0x43,0xf0,0xf3 @ aesd q10,q13 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - veor q5,q6,q7 -.byte 0x6c,0x23,0xb0,0xf3 @ aesd q1,q14 -.byte 0xc2,0x23,0xb0,0xf3 @ aesimc q1,q1 -.byte 0x6c,0x43,0xf0,0xf3 @ aesd q10,q14 -.byte 0xe4,0x43,0xf0,0xf3 @ aesimc q10,q10 - veor q9,q3,q7 -.byte 0x6e,0x23,0xb0,0xf3 @ aesd q1,q15 -.byte 0x6e,0x43,0xf0,0xf3 @ aesd q10,q15 - beq .Lcbc_dec_one - veor q5,q5,q1 - veor q9,q9,q10 - vorr q6,q11,q11 - vst1.8 {q5},[r1]! - vst1.8 {q9},[r1]! - b .Lcbc_done - -.Lcbc_dec_one: - veor q5,q5,q10 - vorr q6,q11,q11 - vst1.8 {q5},[r1]! 
- -.Lcbc_done: - vst1.8 {q6},[r4] -.Lcbc_abort: - vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} - ldmia sp!,{r4,r5,r6,r7,r8,pc} -.size aes_hw_cbc_encrypt,.-aes_hw_cbc_encrypt -.globl aes_hw_ctr32_encrypt_blocks -.hidden aes_hw_ctr32_encrypt_blocks -.type aes_hw_ctr32_encrypt_blocks,%function -.align 5 -aes_hw_ctr32_encrypt_blocks: - mov ip,sp - stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,lr} - vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so - ldr r4, [ip] @ load remaining arg - ldr r5,[r3,#240] - - ldr r8, [r4, #12] - vld1.32 {q0},[r4] - - vld1.32 {q8,q9},[r3] @ load key schedule... - sub r5,r5,#4 - mov r12,#16 - cmp r2,#2 - add r7,r3,r5,lsl#4 @ pointer to last 5 round keys - sub r5,r5,#2 - vld1.32 {q12,q13},[r7]! - vld1.32 {q14,q15},[r7]! - vld1.32 {q7},[r7] - add r7,r3,#32 - mov r6,r5 - movlo r12,#0 - - @ ARM Cortex-A57 and Cortex-A72 cores running in 32-bit mode are - @ affected by silicon errata #1742098 [0] and #1655431 [1], - @ respectively, where the second instruction of an aese/aesmc - @ instruction pair may execute twice if an interrupt is taken right - @ after the first instruction consumes an input register of which a - @ single 32-bit lane has been updated the last time it was modified. - @ - @ This function uses a counter in one 32-bit lane. The - @ could write to q1 and q10 directly, but that trips this bugs. - @ We write to q6 and copy to the final register as a workaround. 
- @ - @ [0] ARM-EPM-049219 v23 Cortex-A57 MPCore Software Developers Errata Notice - @ [1] ARM-EPM-012079 v11.0 Cortex-A72 MPCore Software Developers Errata Notice -#ifndef __ARMEB__ - rev r8, r8 -#endif - add r10, r8, #1 - vorr q6,q0,q0 - rev r10, r10 - vmov.32 d13[1],r10 - add r8, r8, #2 - vorr q1,q6,q6 - bls .Lctr32_tail - rev r12, r8 - vmov.32 d13[1],r12 - sub r2,r2,#3 @ bias - vorr q10,q6,q6 - b .Loop3x_ctr32 - -.align 4 -.Loop3x_ctr32: -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 -.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8 -.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10 - vld1.32 {q8},[r7]! - subs r6,r6,#2 -.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 -.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9 -.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10 - vld1.32 {q9},[r7]! - bgt .Loop3x_ctr32 - -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x83,0xb0,0xf3 @ aesmc q4,q0 -.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 -.byte 0x82,0xa3,0xb0,0xf3 @ aesmc q5,q1 - vld1.8 {q2},[r0]! - add r9,r8,#1 -.byte 0x20,0x43,0xf0,0xf3 @ aese q10,q8 -.byte 0xa4,0x43,0xf0,0xf3 @ aesmc q10,q10 - vld1.8 {q3},[r0]! - rev r9,r9 -.byte 0x22,0x83,0xb0,0xf3 @ aese q4,q9 -.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 -.byte 0x22,0xa3,0xb0,0xf3 @ aese q5,q9 -.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 - vld1.8 {q11},[r0]! 
- mov r7,r3 -.byte 0x22,0x43,0xf0,0xf3 @ aese q10,q9 -.byte 0xa4,0x23,0xf0,0xf3 @ aesmc q9,q10 -.byte 0x28,0x83,0xb0,0xf3 @ aese q4,q12 -.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 -.byte 0x28,0xa3,0xb0,0xf3 @ aese q5,q12 -.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 - veor q2,q2,q7 - add r10,r8,#2 -.byte 0x28,0x23,0xf0,0xf3 @ aese q9,q12 -.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9 - veor q3,q3,q7 - add r8,r8,#3 -.byte 0x2a,0x83,0xb0,0xf3 @ aese q4,q13 -.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 -.byte 0x2a,0xa3,0xb0,0xf3 @ aese q5,q13 -.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 - @ Note the logic to update q0, q1, and q1 is written to work - @ around a bug in ARM Cortex-A57 and Cortex-A72 cores running in - @ 32-bit mode. See the comment above. - veor q11,q11,q7 - vmov.32 d13[1], r9 -.byte 0x2a,0x23,0xf0,0xf3 @ aese q9,q13 -.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9 - vorr q0,q6,q6 - rev r10,r10 -.byte 0x2c,0x83,0xb0,0xf3 @ aese q4,q14 -.byte 0x88,0x83,0xb0,0xf3 @ aesmc q4,q4 - vmov.32 d13[1], r10 - rev r12,r8 -.byte 0x2c,0xa3,0xb0,0xf3 @ aese q5,q14 -.byte 0x8a,0xa3,0xb0,0xf3 @ aesmc q5,q5 - vorr q1,q6,q6 - vmov.32 d13[1], r12 -.byte 0x2c,0x23,0xf0,0xf3 @ aese q9,q14 -.byte 0xa2,0x23,0xf0,0xf3 @ aesmc q9,q9 - vorr q10,q6,q6 - subs r2,r2,#3 -.byte 0x2e,0x83,0xb0,0xf3 @ aese q4,q15 -.byte 0x2e,0xa3,0xb0,0xf3 @ aese q5,q15 -.byte 0x2e,0x23,0xf0,0xf3 @ aese q9,q15 - - veor q2,q2,q4 - vld1.32 {q8},[r7]! @ re-pre-load rndkey[0] - vst1.8 {q2},[r1]! - veor q3,q3,q5 - mov r6,r5 - vst1.8 {q3},[r1]! - veor q11,q11,q9 - vld1.32 {q9},[r7]! @ re-pre-load rndkey[1] - vst1.8 {q11},[r1]! - bhs .Loop3x_ctr32 - - adds r2,r2,#3 - beq .Lctr32_done - cmp r2,#1 - mov r12,#16 - moveq r12,#0 - -.Lctr32_tail: -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 - vld1.32 {q8},[r7]! 
- subs r6,r6,#2 -.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 - vld1.32 {q9},[r7]! - bgt .Lctr32_tail - -.byte 0x20,0x03,0xb0,0xf3 @ aese q0,q8 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x20,0x23,0xb0,0xf3 @ aese q1,q8 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 -.byte 0x22,0x03,0xb0,0xf3 @ aese q0,q9 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x22,0x23,0xb0,0xf3 @ aese q1,q9 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 - vld1.8 {q2},[r0],r12 -.byte 0x28,0x03,0xb0,0xf3 @ aese q0,q12 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x28,0x23,0xb0,0xf3 @ aese q1,q12 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 - vld1.8 {q3},[r0] -.byte 0x2a,0x03,0xb0,0xf3 @ aese q0,q13 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x2a,0x23,0xb0,0xf3 @ aese q1,q13 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 - veor q2,q2,q7 -.byte 0x2c,0x03,0xb0,0xf3 @ aese q0,q14 -.byte 0x80,0x03,0xb0,0xf3 @ aesmc q0,q0 -.byte 0x2c,0x23,0xb0,0xf3 @ aese q1,q14 -.byte 0x82,0x23,0xb0,0xf3 @ aesmc q1,q1 - veor q3,q3,q7 -.byte 0x2e,0x03,0xb0,0xf3 @ aese q0,q15 -.byte 0x2e,0x23,0xb0,0xf3 @ aese q1,q15 - - cmp r2,#1 - veor q2,q2,q0 - veor q3,q3,q1 - vst1.8 {q2},[r1]! - beq .Lctr32_done - vst1.8 {q3},[r1] - -.Lctr32_done: - vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,pc} -.size aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks -#endif -#endif -#endif // !OPENSSL_NO_ASM -.section .note.GNU-stack,"",%progbits diff --git a/third_party/boringssl/linux-arm/crypto/fipsmodule/armv4-mont.S b/third_party/boringssl/linux-arm/crypto/fipsmodule/armv4-mont.S deleted file mode 100644 index 02968947..00000000 --- a/third_party/boringssl/linux-arm/crypto/fipsmodule/armv4-mont.S +++ /dev/null @@ -1,977 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(__arm__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -#include - -@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both -@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. -.arch armv7-a - -.text -#if defined(__thumb2__) -.syntax unified -.thumb -#else -.code 32 -#endif - -#if __ARM_MAX_ARCH__>=7 -.align 5 -.LOPENSSL_armcap: -.word OPENSSL_armcap_P-.Lbn_mul_mont -#endif - -.globl bn_mul_mont -.hidden bn_mul_mont -.type bn_mul_mont,%function - -.align 5 -bn_mul_mont: -.Lbn_mul_mont: - ldr ip,[sp,#4] @ load num - stmdb sp!,{r0,r2} @ sp points at argument block -#if __ARM_MAX_ARCH__>=7 - tst ip,#7 - bne .Lialu - adr r0,.Lbn_mul_mont - ldr r2,.LOPENSSL_armcap - ldr r0,[r0,r2] -#ifdef __APPLE__ - ldr r0,[r0] -#endif - tst r0,#ARMV7_NEON @ NEON available? 
- ldmia sp, {r0,r2} - beq .Lialu - add sp,sp,#8 - b bn_mul8x_mont_neon -.align 4 -.Lialu: -#endif - cmp ip,#2 - mov r0,ip @ load num -#ifdef __thumb2__ - ittt lt -#endif - movlt r0,#0 - addlt sp,sp,#2*4 - blt .Labrt - - stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} @ save 10 registers - - mov r0,r0,lsl#2 @ rescale r0 for byte count - sub sp,sp,r0 @ alloca(4*num) - sub sp,sp,#4 @ +extra dword - sub r0,r0,#4 @ "num=num-1" - add r4,r2,r0 @ &bp[num-1] - - add r0,sp,r0 @ r0 to point at &tp[num-1] - ldr r8,[r0,#14*4] @ &n0 - ldr r2,[r2] @ bp[0] - ldr r5,[r1],#4 @ ap[0],ap++ - ldr r6,[r3],#4 @ np[0],np++ - ldr r8,[r8] @ *n0 - str r4,[r0,#15*4] @ save &bp[num] - - umull r10,r11,r5,r2 @ ap[0]*bp[0] - str r8,[r0,#14*4] @ save n0 value - mul r8,r10,r8 @ "tp[0]"*n0 - mov r12,#0 - umlal r10,r12,r6,r8 @ np[0]*n0+"t[0]" - mov r4,sp - -.L1st: - ldr r5,[r1],#4 @ ap[j],ap++ - mov r10,r11 - ldr r6,[r3],#4 @ np[j],np++ - mov r11,#0 - umlal r10,r11,r5,r2 @ ap[j]*bp[0] - mov r14,#0 - umlal r12,r14,r6,r8 @ np[j]*n0 - adds r12,r12,r10 - str r12,[r4],#4 @ tp[j-1]=,tp++ - adc r12,r14,#0 - cmp r4,r0 - bne .L1st - - adds r12,r12,r11 - ldr r4,[r0,#13*4] @ restore bp - mov r14,#0 - ldr r8,[r0,#14*4] @ restore n0 - adc r14,r14,#0 - str r12,[r0] @ tp[num-1]= - mov r7,sp - str r14,[r0,#4] @ tp[num]= - -.Louter: - sub r7,r0,r7 @ "original" r0-1 value - sub r1,r1,r7 @ "rewind" ap to &ap[1] - ldr r2,[r4,#4]! 
@ *(++bp) - sub r3,r3,r7 @ "rewind" np to &np[1] - ldr r5,[r1,#-4] @ ap[0] - ldr r10,[sp] @ tp[0] - ldr r6,[r3,#-4] @ np[0] - ldr r7,[sp,#4] @ tp[1] - - mov r11,#0 - umlal r10,r11,r5,r2 @ ap[0]*bp[i]+tp[0] - str r4,[r0,#13*4] @ save bp - mul r8,r10,r8 - mov r12,#0 - umlal r10,r12,r6,r8 @ np[0]*n0+"tp[0]" - mov r4,sp - -.Linner: - ldr r5,[r1],#4 @ ap[j],ap++ - adds r10,r11,r7 @ +=tp[j] - ldr r6,[r3],#4 @ np[j],np++ - mov r11,#0 - umlal r10,r11,r5,r2 @ ap[j]*bp[i] - mov r14,#0 - umlal r12,r14,r6,r8 @ np[j]*n0 - adc r11,r11,#0 - ldr r7,[r4,#8] @ tp[j+1] - adds r12,r12,r10 - str r12,[r4],#4 @ tp[j-1]=,tp++ - adc r12,r14,#0 - cmp r4,r0 - bne .Linner - - adds r12,r12,r11 - mov r14,#0 - ldr r4,[r0,#13*4] @ restore bp - adc r14,r14,#0 - ldr r8,[r0,#14*4] @ restore n0 - adds r12,r12,r7 - ldr r7,[r0,#15*4] @ restore &bp[num] - adc r14,r14,#0 - str r12,[r0] @ tp[num-1]= - str r14,[r0,#4] @ tp[num]= - - cmp r4,r7 -#ifdef __thumb2__ - itt ne -#endif - movne r7,sp - bne .Louter - - ldr r2,[r0,#12*4] @ pull rp - mov r5,sp - add r0,r0,#4 @ r0 to point at &tp[num] - sub r5,r0,r5 @ "original" num value - mov r4,sp @ "rewind" r4 - mov r1,r4 @ "borrow" r1 - sub r3,r3,r5 @ "rewind" r3 to &np[0] - - subs r7,r7,r7 @ "clear" carry flag -.Lsub: ldr r7,[r4],#4 - ldr r6,[r3],#4 - sbcs r7,r7,r6 @ tp[j]-np[j] - str r7,[r2],#4 @ rp[j]= - teq r4,r0 @ preserve carry - bne .Lsub - sbcs r14,r14,#0 @ upmost carry - mov r4,sp @ "rewind" r4 - sub r2,r2,r5 @ "rewind" r2 - -.Lcopy: ldr r7,[r4] @ conditional copy - ldr r5,[r2] - str sp,[r4],#4 @ zap tp -#ifdef __thumb2__ - it cc -#endif - movcc r5,r7 - str r5,[r2],#4 - teq r4,r0 @ preserve carry - bne .Lcopy - - mov sp,r0 - add sp,sp,#4 @ skip over tp[num+1] - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} @ restore registers - add sp,sp,#2*4 @ skip over {r0,r2} - mov r0,#1 -.Labrt: -#if __ARM_ARCH__>=5 - bx lr @ bx lr -#else - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet -.word 0xe12fff1e @ interoperable with Thumb ISA:-) -#endif -.size 
bn_mul_mont,.-bn_mul_mont -#if __ARM_MAX_ARCH__>=7 -.arch armv7-a -.fpu neon - -.type bn_mul8x_mont_neon,%function -.align 5 -bn_mul8x_mont_neon: - mov ip,sp - stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11} - vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so - ldmia ip,{r4,r5} @ load rest of parameter block - mov ip,sp - - cmp r5,#8 - bhi .LNEON_8n - - @ special case for r5==8, everything is in register bank... - - vld1.32 {d28[0]}, [r2,:32]! - veor d8,d8,d8 - sub r7,sp,r5,lsl#4 - vld1.32 {d0,d1,d2,d3}, [r1]! @ can't specify :32 :-( - and r7,r7,#-64 - vld1.32 {d30[0]}, [r4,:32] - mov sp,r7 @ alloca - vzip.16 d28,d8 - - vmull.u32 q6,d28,d0[0] - vmull.u32 q7,d28,d0[1] - vmull.u32 q8,d28,d1[0] - vshl.i64 d29,d13,#16 - vmull.u32 q9,d28,d1[1] - - vadd.u64 d29,d29,d12 - veor d8,d8,d8 - vmul.u32 d29,d29,d30 - - vmull.u32 q10,d28,d2[0] - vld1.32 {d4,d5,d6,d7}, [r3]! - vmull.u32 q11,d28,d2[1] - vmull.u32 q12,d28,d3[0] - vzip.16 d29,d8 - vmull.u32 q13,d28,d3[1] - - vmlal.u32 q6,d29,d4[0] - sub r9,r5,#1 - vmlal.u32 q7,d29,d4[1] - vmlal.u32 q8,d29,d5[0] - vmlal.u32 q9,d29,d5[1] - - vmlal.u32 q10,d29,d6[0] - vmov q5,q6 - vmlal.u32 q11,d29,d6[1] - vmov q6,q7 - vmlal.u32 q12,d29,d7[0] - vmov q7,q8 - vmlal.u32 q13,d29,d7[1] - vmov q8,q9 - vmov q9,q10 - vshr.u64 d10,d10,#16 - vmov q10,q11 - vmov q11,q12 - vadd.u64 d10,d10,d11 - vmov q12,q13 - veor q13,q13 - vshr.u64 d10,d10,#16 - - b .LNEON_outer8 - -.align 4 -.LNEON_outer8: - vld1.32 {d28[0]}, [r2,:32]! 
- veor d8,d8,d8 - vzip.16 d28,d8 - vadd.u64 d12,d12,d10 - - vmlal.u32 q6,d28,d0[0] - vmlal.u32 q7,d28,d0[1] - vmlal.u32 q8,d28,d1[0] - vshl.i64 d29,d13,#16 - vmlal.u32 q9,d28,d1[1] - - vadd.u64 d29,d29,d12 - veor d8,d8,d8 - subs r9,r9,#1 - vmul.u32 d29,d29,d30 - - vmlal.u32 q10,d28,d2[0] - vmlal.u32 q11,d28,d2[1] - vmlal.u32 q12,d28,d3[0] - vzip.16 d29,d8 - vmlal.u32 q13,d28,d3[1] - - vmlal.u32 q6,d29,d4[0] - vmlal.u32 q7,d29,d4[1] - vmlal.u32 q8,d29,d5[0] - vmlal.u32 q9,d29,d5[1] - - vmlal.u32 q10,d29,d6[0] - vmov q5,q6 - vmlal.u32 q11,d29,d6[1] - vmov q6,q7 - vmlal.u32 q12,d29,d7[0] - vmov q7,q8 - vmlal.u32 q13,d29,d7[1] - vmov q8,q9 - vmov q9,q10 - vshr.u64 d10,d10,#16 - vmov q10,q11 - vmov q11,q12 - vadd.u64 d10,d10,d11 - vmov q12,q13 - veor q13,q13 - vshr.u64 d10,d10,#16 - - bne .LNEON_outer8 - - vadd.u64 d12,d12,d10 - mov r7,sp - vshr.u64 d10,d12,#16 - mov r8,r5 - vadd.u64 d13,d13,d10 - add r6,sp,#96 - vshr.u64 d10,d13,#16 - vzip.16 d12,d13 - - b .LNEON_tail_entry - -.align 4 -.LNEON_8n: - veor q6,q6,q6 - sub r7,sp,#128 - veor q7,q7,q7 - sub r7,r7,r5,lsl#4 - veor q8,q8,q8 - and r7,r7,#-64 - veor q9,q9,q9 - mov sp,r7 @ alloca - veor q10,q10,q10 - add r7,r7,#256 - veor q11,q11,q11 - sub r8,r5,#8 - veor q12,q12,q12 - veor q13,q13,q13 - -.LNEON_8n_init: - vst1.64 {q6,q7},[r7,:256]! - subs r8,r8,#8 - vst1.64 {q8,q9},[r7,:256]! - vst1.64 {q10,q11},[r7,:256]! - vst1.64 {q12,q13},[r7,:256]! - bne .LNEON_8n_init - - add r6,sp,#256 - vld1.32 {d0,d1,d2,d3},[r1]! - add r10,sp,#8 - vld1.32 {d30[0]},[r4,:32] - mov r9,r5 - b .LNEON_8n_outer - -.align 4 -.LNEON_8n_outer: - vld1.32 {d28[0]},[r2,:32]! @ *b++ - veor d8,d8,d8 - vzip.16 d28,d8 - add r7,sp,#128 - vld1.32 {d4,d5,d6,d7},[r3]! 
- - vmlal.u32 q6,d28,d0[0] - vmlal.u32 q7,d28,d0[1] - veor d8,d8,d8 - vmlal.u32 q8,d28,d1[0] - vshl.i64 d29,d13,#16 - vmlal.u32 q9,d28,d1[1] - vadd.u64 d29,d29,d12 - vmlal.u32 q10,d28,d2[0] - vmul.u32 d29,d29,d30 - vmlal.u32 q11,d28,d2[1] - vst1.32 {d28},[sp,:64] @ put aside smashed b[8*i+0] - vmlal.u32 q12,d28,d3[0] - vzip.16 d29,d8 - vmlal.u32 q13,d28,d3[1] - vld1.32 {d28[0]},[r2,:32]! @ *b++ - vmlal.u32 q6,d29,d4[0] - veor d10,d10,d10 - vmlal.u32 q7,d29,d4[1] - vzip.16 d28,d10 - vmlal.u32 q8,d29,d5[0] - vshr.u64 d12,d12,#16 - vmlal.u32 q9,d29,d5[1] - vmlal.u32 q10,d29,d6[0] - vadd.u64 d12,d12,d13 - vmlal.u32 q11,d29,d6[1] - vshr.u64 d12,d12,#16 - vmlal.u32 q12,d29,d7[0] - vmlal.u32 q13,d29,d7[1] - vadd.u64 d14,d14,d12 - vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+0] - vmlal.u32 q7,d28,d0[0] - vld1.64 {q6},[r6,:128]! - vmlal.u32 q8,d28,d0[1] - veor d8,d8,d8 - vmlal.u32 q9,d28,d1[0] - vshl.i64 d29,d15,#16 - vmlal.u32 q10,d28,d1[1] - vadd.u64 d29,d29,d14 - vmlal.u32 q11,d28,d2[0] - vmul.u32 d29,d29,d30 - vmlal.u32 q12,d28,d2[1] - vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+1] - vmlal.u32 q13,d28,d3[0] - vzip.16 d29,d8 - vmlal.u32 q6,d28,d3[1] - vld1.32 {d28[0]},[r2,:32]! @ *b++ - vmlal.u32 q7,d29,d4[0] - veor d10,d10,d10 - vmlal.u32 q8,d29,d4[1] - vzip.16 d28,d10 - vmlal.u32 q9,d29,d5[0] - vshr.u64 d14,d14,#16 - vmlal.u32 q10,d29,d5[1] - vmlal.u32 q11,d29,d6[0] - vadd.u64 d14,d14,d15 - vmlal.u32 q12,d29,d6[1] - vshr.u64 d14,d14,#16 - vmlal.u32 q13,d29,d7[0] - vmlal.u32 q6,d29,d7[1] - vadd.u64 d16,d16,d14 - vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+1] - vmlal.u32 q8,d28,d0[0] - vld1.64 {q7},[r6,:128]! - vmlal.u32 q9,d28,d0[1] - veor d8,d8,d8 - vmlal.u32 q10,d28,d1[0] - vshl.i64 d29,d17,#16 - vmlal.u32 q11,d28,d1[1] - vadd.u64 d29,d29,d16 - vmlal.u32 q12,d28,d2[0] - vmul.u32 d29,d29,d30 - vmlal.u32 q13,d28,d2[1] - vst1.32 {d28},[r10,:64]! 
@ put aside smashed b[8*i+2] - vmlal.u32 q6,d28,d3[0] - vzip.16 d29,d8 - vmlal.u32 q7,d28,d3[1] - vld1.32 {d28[0]},[r2,:32]! @ *b++ - vmlal.u32 q8,d29,d4[0] - veor d10,d10,d10 - vmlal.u32 q9,d29,d4[1] - vzip.16 d28,d10 - vmlal.u32 q10,d29,d5[0] - vshr.u64 d16,d16,#16 - vmlal.u32 q11,d29,d5[1] - vmlal.u32 q12,d29,d6[0] - vadd.u64 d16,d16,d17 - vmlal.u32 q13,d29,d6[1] - vshr.u64 d16,d16,#16 - vmlal.u32 q6,d29,d7[0] - vmlal.u32 q7,d29,d7[1] - vadd.u64 d18,d18,d16 - vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+2] - vmlal.u32 q9,d28,d0[0] - vld1.64 {q8},[r6,:128]! - vmlal.u32 q10,d28,d0[1] - veor d8,d8,d8 - vmlal.u32 q11,d28,d1[0] - vshl.i64 d29,d19,#16 - vmlal.u32 q12,d28,d1[1] - vadd.u64 d29,d29,d18 - vmlal.u32 q13,d28,d2[0] - vmul.u32 d29,d29,d30 - vmlal.u32 q6,d28,d2[1] - vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+3] - vmlal.u32 q7,d28,d3[0] - vzip.16 d29,d8 - vmlal.u32 q8,d28,d3[1] - vld1.32 {d28[0]},[r2,:32]! @ *b++ - vmlal.u32 q9,d29,d4[0] - veor d10,d10,d10 - vmlal.u32 q10,d29,d4[1] - vzip.16 d28,d10 - vmlal.u32 q11,d29,d5[0] - vshr.u64 d18,d18,#16 - vmlal.u32 q12,d29,d5[1] - vmlal.u32 q13,d29,d6[0] - vadd.u64 d18,d18,d19 - vmlal.u32 q6,d29,d6[1] - vshr.u64 d18,d18,#16 - vmlal.u32 q7,d29,d7[0] - vmlal.u32 q8,d29,d7[1] - vadd.u64 d20,d20,d18 - vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+3] - vmlal.u32 q10,d28,d0[0] - vld1.64 {q9},[r6,:128]! - vmlal.u32 q11,d28,d0[1] - veor d8,d8,d8 - vmlal.u32 q12,d28,d1[0] - vshl.i64 d29,d21,#16 - vmlal.u32 q13,d28,d1[1] - vadd.u64 d29,d29,d20 - vmlal.u32 q6,d28,d2[0] - vmul.u32 d29,d29,d30 - vmlal.u32 q7,d28,d2[1] - vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+4] - vmlal.u32 q8,d28,d3[0] - vzip.16 d29,d8 - vmlal.u32 q9,d28,d3[1] - vld1.32 {d28[0]},[r2,:32]! 
@ *b++ - vmlal.u32 q10,d29,d4[0] - veor d10,d10,d10 - vmlal.u32 q11,d29,d4[1] - vzip.16 d28,d10 - vmlal.u32 q12,d29,d5[0] - vshr.u64 d20,d20,#16 - vmlal.u32 q13,d29,d5[1] - vmlal.u32 q6,d29,d6[0] - vadd.u64 d20,d20,d21 - vmlal.u32 q7,d29,d6[1] - vshr.u64 d20,d20,#16 - vmlal.u32 q8,d29,d7[0] - vmlal.u32 q9,d29,d7[1] - vadd.u64 d22,d22,d20 - vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+4] - vmlal.u32 q11,d28,d0[0] - vld1.64 {q10},[r6,:128]! - vmlal.u32 q12,d28,d0[1] - veor d8,d8,d8 - vmlal.u32 q13,d28,d1[0] - vshl.i64 d29,d23,#16 - vmlal.u32 q6,d28,d1[1] - vadd.u64 d29,d29,d22 - vmlal.u32 q7,d28,d2[0] - vmul.u32 d29,d29,d30 - vmlal.u32 q8,d28,d2[1] - vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+5] - vmlal.u32 q9,d28,d3[0] - vzip.16 d29,d8 - vmlal.u32 q10,d28,d3[1] - vld1.32 {d28[0]},[r2,:32]! @ *b++ - vmlal.u32 q11,d29,d4[0] - veor d10,d10,d10 - vmlal.u32 q12,d29,d4[1] - vzip.16 d28,d10 - vmlal.u32 q13,d29,d5[0] - vshr.u64 d22,d22,#16 - vmlal.u32 q6,d29,d5[1] - vmlal.u32 q7,d29,d6[0] - vadd.u64 d22,d22,d23 - vmlal.u32 q8,d29,d6[1] - vshr.u64 d22,d22,#16 - vmlal.u32 q9,d29,d7[0] - vmlal.u32 q10,d29,d7[1] - vadd.u64 d24,d24,d22 - vst1.32 {d29},[r10,:64]! @ put aside smashed m[8*i+5] - vmlal.u32 q12,d28,d0[0] - vld1.64 {q11},[r6,:128]! - vmlal.u32 q13,d28,d0[1] - veor d8,d8,d8 - vmlal.u32 q6,d28,d1[0] - vshl.i64 d29,d25,#16 - vmlal.u32 q7,d28,d1[1] - vadd.u64 d29,d29,d24 - vmlal.u32 q8,d28,d2[0] - vmul.u32 d29,d29,d30 - vmlal.u32 q9,d28,d2[1] - vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+6] - vmlal.u32 q10,d28,d3[0] - vzip.16 d29,d8 - vmlal.u32 q11,d28,d3[1] - vld1.32 {d28[0]},[r2,:32]! @ *b++ - vmlal.u32 q12,d29,d4[0] - veor d10,d10,d10 - vmlal.u32 q13,d29,d4[1] - vzip.16 d28,d10 - vmlal.u32 q6,d29,d5[0] - vshr.u64 d24,d24,#16 - vmlal.u32 q7,d29,d5[1] - vmlal.u32 q8,d29,d6[0] - vadd.u64 d24,d24,d25 - vmlal.u32 q9,d29,d6[1] - vshr.u64 d24,d24,#16 - vmlal.u32 q10,d29,d7[0] - vmlal.u32 q11,d29,d7[1] - vadd.u64 d26,d26,d24 - vst1.32 {d29},[r10,:64]! 
@ put aside smashed m[8*i+6] - vmlal.u32 q13,d28,d0[0] - vld1.64 {q12},[r6,:128]! - vmlal.u32 q6,d28,d0[1] - veor d8,d8,d8 - vmlal.u32 q7,d28,d1[0] - vshl.i64 d29,d27,#16 - vmlal.u32 q8,d28,d1[1] - vadd.u64 d29,d29,d26 - vmlal.u32 q9,d28,d2[0] - vmul.u32 d29,d29,d30 - vmlal.u32 q10,d28,d2[1] - vst1.32 {d28},[r10,:64]! @ put aside smashed b[8*i+7] - vmlal.u32 q11,d28,d3[0] - vzip.16 d29,d8 - vmlal.u32 q12,d28,d3[1] - vld1.32 {d28},[sp,:64] @ pull smashed b[8*i+0] - vmlal.u32 q13,d29,d4[0] - vld1.32 {d0,d1,d2,d3},[r1]! - vmlal.u32 q6,d29,d4[1] - vmlal.u32 q7,d29,d5[0] - vshr.u64 d26,d26,#16 - vmlal.u32 q8,d29,d5[1] - vmlal.u32 q9,d29,d6[0] - vadd.u64 d26,d26,d27 - vmlal.u32 q10,d29,d6[1] - vshr.u64 d26,d26,#16 - vmlal.u32 q11,d29,d7[0] - vmlal.u32 q12,d29,d7[1] - vadd.u64 d12,d12,d26 - vst1.32 {d29},[r10,:64] @ put aside smashed m[8*i+7] - add r10,sp,#8 @ rewind - sub r8,r5,#8 - b .LNEON_8n_inner - -.align 4 -.LNEON_8n_inner: - subs r8,r8,#8 - vmlal.u32 q6,d28,d0[0] - vld1.64 {q13},[r6,:128] - vmlal.u32 q7,d28,d0[1] - vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+0] - vmlal.u32 q8,d28,d1[0] - vld1.32 {d4,d5,d6,d7},[r3]! - vmlal.u32 q9,d28,d1[1] - it ne - addne r6,r6,#16 @ don't advance in last iteration - vmlal.u32 q10,d28,d2[0] - vmlal.u32 q11,d28,d2[1] - vmlal.u32 q12,d28,d3[0] - vmlal.u32 q13,d28,d3[1] - vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+1] - vmlal.u32 q6,d29,d4[0] - vmlal.u32 q7,d29,d4[1] - vmlal.u32 q8,d29,d5[0] - vmlal.u32 q9,d29,d5[1] - vmlal.u32 q10,d29,d6[0] - vmlal.u32 q11,d29,d6[1] - vmlal.u32 q12,d29,d7[0] - vmlal.u32 q13,d29,d7[1] - vst1.64 {q6},[r7,:128]! - vmlal.u32 q7,d28,d0[0] - vld1.64 {q6},[r6,:128] - vmlal.u32 q8,d28,d0[1] - vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+1] - vmlal.u32 q9,d28,d1[0] - it ne - addne r6,r6,#16 @ don't advance in last iteration - vmlal.u32 q10,d28,d1[1] - vmlal.u32 q11,d28,d2[0] - vmlal.u32 q12,d28,d2[1] - vmlal.u32 q13,d28,d3[0] - vmlal.u32 q6,d28,d3[1] - vld1.32 {d28},[r10,:64]! 
@ pull smashed b[8*i+2] - vmlal.u32 q7,d29,d4[0] - vmlal.u32 q8,d29,d4[1] - vmlal.u32 q9,d29,d5[0] - vmlal.u32 q10,d29,d5[1] - vmlal.u32 q11,d29,d6[0] - vmlal.u32 q12,d29,d6[1] - vmlal.u32 q13,d29,d7[0] - vmlal.u32 q6,d29,d7[1] - vst1.64 {q7},[r7,:128]! - vmlal.u32 q8,d28,d0[0] - vld1.64 {q7},[r6,:128] - vmlal.u32 q9,d28,d0[1] - vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+2] - vmlal.u32 q10,d28,d1[0] - it ne - addne r6,r6,#16 @ don't advance in last iteration - vmlal.u32 q11,d28,d1[1] - vmlal.u32 q12,d28,d2[0] - vmlal.u32 q13,d28,d2[1] - vmlal.u32 q6,d28,d3[0] - vmlal.u32 q7,d28,d3[1] - vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+3] - vmlal.u32 q8,d29,d4[0] - vmlal.u32 q9,d29,d4[1] - vmlal.u32 q10,d29,d5[0] - vmlal.u32 q11,d29,d5[1] - vmlal.u32 q12,d29,d6[0] - vmlal.u32 q13,d29,d6[1] - vmlal.u32 q6,d29,d7[0] - vmlal.u32 q7,d29,d7[1] - vst1.64 {q8},[r7,:128]! - vmlal.u32 q9,d28,d0[0] - vld1.64 {q8},[r6,:128] - vmlal.u32 q10,d28,d0[1] - vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+3] - vmlal.u32 q11,d28,d1[0] - it ne - addne r6,r6,#16 @ don't advance in last iteration - vmlal.u32 q12,d28,d1[1] - vmlal.u32 q13,d28,d2[0] - vmlal.u32 q6,d28,d2[1] - vmlal.u32 q7,d28,d3[0] - vmlal.u32 q8,d28,d3[1] - vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+4] - vmlal.u32 q9,d29,d4[0] - vmlal.u32 q10,d29,d4[1] - vmlal.u32 q11,d29,d5[0] - vmlal.u32 q12,d29,d5[1] - vmlal.u32 q13,d29,d6[0] - vmlal.u32 q6,d29,d6[1] - vmlal.u32 q7,d29,d7[0] - vmlal.u32 q8,d29,d7[1] - vst1.64 {q9},[r7,:128]! - vmlal.u32 q10,d28,d0[0] - vld1.64 {q9},[r6,:128] - vmlal.u32 q11,d28,d0[1] - vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+4] - vmlal.u32 q12,d28,d1[0] - it ne - addne r6,r6,#16 @ don't advance in last iteration - vmlal.u32 q13,d28,d1[1] - vmlal.u32 q6,d28,d2[0] - vmlal.u32 q7,d28,d2[1] - vmlal.u32 q8,d28,d3[0] - vmlal.u32 q9,d28,d3[1] - vld1.32 {d28},[r10,:64]! 
@ pull smashed b[8*i+5] - vmlal.u32 q10,d29,d4[0] - vmlal.u32 q11,d29,d4[1] - vmlal.u32 q12,d29,d5[0] - vmlal.u32 q13,d29,d5[1] - vmlal.u32 q6,d29,d6[0] - vmlal.u32 q7,d29,d6[1] - vmlal.u32 q8,d29,d7[0] - vmlal.u32 q9,d29,d7[1] - vst1.64 {q10},[r7,:128]! - vmlal.u32 q11,d28,d0[0] - vld1.64 {q10},[r6,:128] - vmlal.u32 q12,d28,d0[1] - vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+5] - vmlal.u32 q13,d28,d1[0] - it ne - addne r6,r6,#16 @ don't advance in last iteration - vmlal.u32 q6,d28,d1[1] - vmlal.u32 q7,d28,d2[0] - vmlal.u32 q8,d28,d2[1] - vmlal.u32 q9,d28,d3[0] - vmlal.u32 q10,d28,d3[1] - vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+6] - vmlal.u32 q11,d29,d4[0] - vmlal.u32 q12,d29,d4[1] - vmlal.u32 q13,d29,d5[0] - vmlal.u32 q6,d29,d5[1] - vmlal.u32 q7,d29,d6[0] - vmlal.u32 q8,d29,d6[1] - vmlal.u32 q9,d29,d7[0] - vmlal.u32 q10,d29,d7[1] - vst1.64 {q11},[r7,:128]! - vmlal.u32 q12,d28,d0[0] - vld1.64 {q11},[r6,:128] - vmlal.u32 q13,d28,d0[1] - vld1.32 {d29},[r10,:64]! @ pull smashed m[8*i+6] - vmlal.u32 q6,d28,d1[0] - it ne - addne r6,r6,#16 @ don't advance in last iteration - vmlal.u32 q7,d28,d1[1] - vmlal.u32 q8,d28,d2[0] - vmlal.u32 q9,d28,d2[1] - vmlal.u32 q10,d28,d3[0] - vmlal.u32 q11,d28,d3[1] - vld1.32 {d28},[r10,:64]! @ pull smashed b[8*i+7] - vmlal.u32 q12,d29,d4[0] - vmlal.u32 q13,d29,d4[1] - vmlal.u32 q6,d29,d5[0] - vmlal.u32 q7,d29,d5[1] - vmlal.u32 q8,d29,d6[0] - vmlal.u32 q9,d29,d6[1] - vmlal.u32 q10,d29,d7[0] - vmlal.u32 q11,d29,d7[1] - vst1.64 {q12},[r7,:128]! - vmlal.u32 q13,d28,d0[0] - vld1.64 {q12},[r6,:128] - vmlal.u32 q6,d28,d0[1] - vld1.32 {d29},[r10,:64]! 
@ pull smashed m[8*i+7] - vmlal.u32 q7,d28,d1[0] - it ne - addne r6,r6,#16 @ don't advance in last iteration - vmlal.u32 q8,d28,d1[1] - vmlal.u32 q9,d28,d2[0] - vmlal.u32 q10,d28,d2[1] - vmlal.u32 q11,d28,d3[0] - vmlal.u32 q12,d28,d3[1] - it eq - subeq r1,r1,r5,lsl#2 @ rewind - vmlal.u32 q13,d29,d4[0] - vld1.32 {d28},[sp,:64] @ pull smashed b[8*i+0] - vmlal.u32 q6,d29,d4[1] - vld1.32 {d0,d1,d2,d3},[r1]! - vmlal.u32 q7,d29,d5[0] - add r10,sp,#8 @ rewind - vmlal.u32 q8,d29,d5[1] - vmlal.u32 q9,d29,d6[0] - vmlal.u32 q10,d29,d6[1] - vmlal.u32 q11,d29,d7[0] - vst1.64 {q13},[r7,:128]! - vmlal.u32 q12,d29,d7[1] - - bne .LNEON_8n_inner - add r6,sp,#128 - vst1.64 {q6,q7},[r7,:256]! - veor q2,q2,q2 @ d4-d5 - vst1.64 {q8,q9},[r7,:256]! - veor q3,q3,q3 @ d6-d7 - vst1.64 {q10,q11},[r7,:256]! - vst1.64 {q12},[r7,:128] - - subs r9,r9,#8 - vld1.64 {q6,q7},[r6,:256]! - vld1.64 {q8,q9},[r6,:256]! - vld1.64 {q10,q11},[r6,:256]! - vld1.64 {q12,q13},[r6,:256]! - - itt ne - subne r3,r3,r5,lsl#2 @ rewind - bne .LNEON_8n_outer - - add r7,sp,#128 - vst1.64 {q2,q3}, [sp,:256]! @ start wiping stack frame - vshr.u64 d10,d12,#16 - vst1.64 {q2,q3},[sp,:256]! - vadd.u64 d13,d13,d10 - vst1.64 {q2,q3}, [sp,:256]! - vshr.u64 d10,d13,#16 - vst1.64 {q2,q3}, [sp,:256]! - vzip.16 d12,d13 - - mov r8,r5 - b .LNEON_tail_entry - -.align 4 -.LNEON_tail: - vadd.u64 d12,d12,d10 - vshr.u64 d10,d12,#16 - vld1.64 {q8,q9}, [r6, :256]! - vadd.u64 d13,d13,d10 - vld1.64 {q10,q11}, [r6, :256]! - vshr.u64 d10,d13,#16 - vld1.64 {q12,q13}, [r6, :256]! - vzip.16 d12,d13 - -.LNEON_tail_entry: - vadd.u64 d14,d14,d10 - vst1.32 {d12[0]}, [r7, :32]! - vshr.u64 d10,d14,#16 - vadd.u64 d15,d15,d10 - vshr.u64 d10,d15,#16 - vzip.16 d14,d15 - vadd.u64 d16,d16,d10 - vst1.32 {d14[0]}, [r7, :32]! - vshr.u64 d10,d16,#16 - vadd.u64 d17,d17,d10 - vshr.u64 d10,d17,#16 - vzip.16 d16,d17 - vadd.u64 d18,d18,d10 - vst1.32 {d16[0]}, [r7, :32]! 
- vshr.u64 d10,d18,#16 - vadd.u64 d19,d19,d10 - vshr.u64 d10,d19,#16 - vzip.16 d18,d19 - vadd.u64 d20,d20,d10 - vst1.32 {d18[0]}, [r7, :32]! - vshr.u64 d10,d20,#16 - vadd.u64 d21,d21,d10 - vshr.u64 d10,d21,#16 - vzip.16 d20,d21 - vadd.u64 d22,d22,d10 - vst1.32 {d20[0]}, [r7, :32]! - vshr.u64 d10,d22,#16 - vadd.u64 d23,d23,d10 - vshr.u64 d10,d23,#16 - vzip.16 d22,d23 - vadd.u64 d24,d24,d10 - vst1.32 {d22[0]}, [r7, :32]! - vshr.u64 d10,d24,#16 - vadd.u64 d25,d25,d10 - vshr.u64 d10,d25,#16 - vzip.16 d24,d25 - vadd.u64 d26,d26,d10 - vst1.32 {d24[0]}, [r7, :32]! - vshr.u64 d10,d26,#16 - vadd.u64 d27,d27,d10 - vshr.u64 d10,d27,#16 - vzip.16 d26,d27 - vld1.64 {q6,q7}, [r6, :256]! - subs r8,r8,#8 - vst1.32 {d26[0]}, [r7, :32]! - bne .LNEON_tail - - vst1.32 {d10[0]}, [r7, :32] @ top-most bit - sub r3,r3,r5,lsl#2 @ rewind r3 - subs r1,sp,#0 @ clear carry flag - add r2,sp,r5,lsl#2 - -.LNEON_sub: - ldmia r1!, {r4,r5,r6,r7} - ldmia r3!, {r8,r9,r10,r11} - sbcs r8, r4,r8 - sbcs r9, r5,r9 - sbcs r10,r6,r10 - sbcs r11,r7,r11 - teq r1,r2 @ preserves carry - stmia r0!, {r8,r9,r10,r11} - bne .LNEON_sub - - ldr r10, [r1] @ load top-most bit - mov r11,sp - veor q0,q0,q0 - sub r11,r2,r11 @ this is num*4 - veor q1,q1,q1 - mov r1,sp - sub r0,r0,r11 @ rewind r0 - mov r3,r2 @ second 3/4th of frame - sbcs r10,r10,#0 @ result is carry flag - -.LNEON_copy_n_zap: - ldmia r1!, {r4,r5,r6,r7} - ldmia r0, {r8,r9,r10,r11} - it cc - movcc r8, r4 - vst1.64 {q0,q1}, [r3,:256]! @ wipe - itt cc - movcc r9, r5 - movcc r10,r6 - vst1.64 {q0,q1}, [r3,:256]! @ wipe - it cc - movcc r11,r7 - ldmia r1, {r4,r5,r6,r7} - stmia r0!, {r8,r9,r10,r11} - sub r1,r1,#16 - ldmia r0, {r8,r9,r10,r11} - it cc - movcc r8, r4 - vst1.64 {q0,q1}, [r1,:256]! @ wipe - itt cc - movcc r9, r5 - movcc r10,r6 - vst1.64 {q0,q1}, [r3,:256]! 
@ wipe - it cc - movcc r11,r7 - teq r1,r2 @ preserves carry - stmia r0!, {r8,r9,r10,r11} - bne .LNEON_copy_n_zap - - mov sp,ip - vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11} - bx lr @ bx lr -.size bn_mul8x_mont_neon,.-bn_mul8x_mont_neon -#endif -.byte 77,111,110,116,103,111,109,101,114,121,32,109,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 -.align 2 -#if __ARM_MAX_ARCH__>=7 -.comm OPENSSL_armcap_P,4,4 -.hidden OPENSSL_armcap_P -#endif -#endif -#endif // !OPENSSL_NO_ASM -.section .note.GNU-stack,"",%progbits diff --git a/third_party/boringssl/linux-arm/crypto/fipsmodule/bsaes-armv7.S b/third_party/boringssl/linux-arm/crypto/fipsmodule/bsaes-armv7.S deleted file mode 100644 index 69a8fcac..00000000 --- a/third_party/boringssl/linux-arm/crypto/fipsmodule/bsaes-armv7.S +++ /dev/null @@ -1,1529 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(__arm__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -@ Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved. -@ -@ Licensed under the OpenSSL license (the "License"). You may not use -@ this file except in compliance with the License. You can obtain a copy -@ in the file LICENSE in the source distribution or at -@ https://www.openssl.org/source/license.html - - -@ ==================================================================== -@ Written by Andy Polyakov for the OpenSSL -@ project. The module is, however, dual licensed under OpenSSL and -@ CRYPTOGAMS licenses depending on where you obtain it. 
For further -@ details see http://www.openssl.org/~appro/cryptogams/. -@ -@ Specific modes and adaptation for Linux kernel by Ard Biesheuvel -@ of Linaro. Permission to use under GPL terms is granted. -@ ==================================================================== - -@ Bit-sliced AES for ARM NEON -@ -@ February 2012. -@ -@ This implementation is direct adaptation of bsaes-x86_64 module for -@ ARM NEON. Except that this module is endian-neutral [in sense that -@ it can be compiled for either endianness] by courtesy of vld1.8's -@ neutrality. Initial version doesn't implement interface to OpenSSL, -@ only low-level primitives and unsupported entry points, just enough -@ to collect performance results, which for Cortex-A8 core are: -@ -@ encrypt 19.5 cycles per byte processed with 128-bit key -@ decrypt 22.1 cycles per byte processed with 128-bit key -@ key conv. 440 cycles per 128-bit key/0.18 of 8x block -@ -@ Snapdragon S4 encrypts byte in 17.6 cycles and decrypts in 19.7, -@ which is [much] worse than anticipated (for further details see -@ http://www.openssl.org/~appro/Snapdragon-S4.html). -@ -@ Cortex-A15 manages in 14.2/16.1 cycles [when integer-only code -@ manages in 20.0 cycles]. -@ -@ When comparing to x86_64 results keep in mind that NEON unit is -@ [mostly] single-issue and thus can't [fully] benefit from -@ instruction-level parallelism. And when comparing to aes-armv4 -@ results keep in mind key schedule conversion overhead (see -@ bsaes-x86_64.pl for further details)... -@ -@ - -@ April-August 2013 -@ Add CBC, CTR and XTS subroutines and adapt for kernel use; courtesy of Ard. 
- -#ifndef __KERNEL__ -# include - -# define VFP_ABI_PUSH vstmdb sp!,{d8-d15} -# define VFP_ABI_POP vldmia sp!,{d8-d15} -# define VFP_ABI_FRAME 0x40 -#else -# define VFP_ABI_PUSH -# define VFP_ABI_POP -# define VFP_ABI_FRAME 0 -# define BSAES_ASM_EXTENDED_KEY -# define XTS_CHAIN_TWEAK -# define __ARM_ARCH__ __LINUX_ARM_ARCH__ -# define __ARM_MAX_ARCH__ 7 -#endif - -#ifdef __thumb__ -# define adrl adr -#endif - -#if __ARM_MAX_ARCH__>=7 -.arch armv7-a -.fpu neon - -.text -.syntax unified @ ARMv7-capable assembler is expected to handle this -#if defined(__thumb2__) && !defined(__APPLE__) -.thumb -#else -.code 32 -# undef __thumb2__ -#endif - -.type _bsaes_decrypt8,%function -.align 4 -_bsaes_decrypt8: - adr r6,. - vldmia r4!, {q9} @ round 0 key -#if defined(__thumb2__) || defined(__APPLE__) - adr r6,.LM0ISR -#else - add r6,r6,#.LM0ISR-_bsaes_decrypt8 -#endif - - vldmia r6!, {q8} @ .LM0ISR - veor q10, q0, q9 @ xor with round0 key - veor q11, q1, q9 - vtbl.8 d0, {q10}, d16 - vtbl.8 d1, {q10}, d17 - veor q12, q2, q9 - vtbl.8 d2, {q11}, d16 - vtbl.8 d3, {q11}, d17 - veor q13, q3, q9 - vtbl.8 d4, {q12}, d16 - vtbl.8 d5, {q12}, d17 - veor q14, q4, q9 - vtbl.8 d6, {q13}, d16 - vtbl.8 d7, {q13}, d17 - veor q15, q5, q9 - vtbl.8 d8, {q14}, d16 - vtbl.8 d9, {q14}, d17 - veor q10, q6, q9 - vtbl.8 d10, {q15}, d16 - vtbl.8 d11, {q15}, d17 - veor q11, q7, q9 - vtbl.8 d12, {q10}, d16 - vtbl.8 d13, {q10}, d17 - vtbl.8 d14, {q11}, d16 - vtbl.8 d15, {q11}, d17 - vmov.i8 q8,#0x55 @ compose .LBS0 - vmov.i8 q9,#0x33 @ compose .LBS1 - vshr.u64 q10, q6, #1 - vshr.u64 q11, q4, #1 - veor q10, q10, q7 - veor q11, q11, q5 - vand q10, q10, q8 - vand q11, q11, q8 - veor q7, q7, q10 - vshl.u64 q10, q10, #1 - veor q5, q5, q11 - vshl.u64 q11, q11, #1 - veor q6, q6, q10 - veor q4, q4, q11 - vshr.u64 q10, q2, #1 - vshr.u64 q11, q0, #1 - veor q10, q10, q3 - veor q11, q11, q1 - vand q10, q10, q8 - vand q11, q11, q8 - veor q3, q3, q10 - vshl.u64 q10, q10, #1 - veor q1, q1, q11 - vshl.u64 q11, q11, #1 - 
veor q2, q2, q10 - veor q0, q0, q11 - vmov.i8 q8,#0x0f @ compose .LBS2 - vshr.u64 q10, q5, #2 - vshr.u64 q11, q4, #2 - veor q10, q10, q7 - veor q11, q11, q6 - vand q10, q10, q9 - vand q11, q11, q9 - veor q7, q7, q10 - vshl.u64 q10, q10, #2 - veor q6, q6, q11 - vshl.u64 q11, q11, #2 - veor q5, q5, q10 - veor q4, q4, q11 - vshr.u64 q10, q1, #2 - vshr.u64 q11, q0, #2 - veor q10, q10, q3 - veor q11, q11, q2 - vand q10, q10, q9 - vand q11, q11, q9 - veor q3, q3, q10 - vshl.u64 q10, q10, #2 - veor q2, q2, q11 - vshl.u64 q11, q11, #2 - veor q1, q1, q10 - veor q0, q0, q11 - vshr.u64 q10, q3, #4 - vshr.u64 q11, q2, #4 - veor q10, q10, q7 - veor q11, q11, q6 - vand q10, q10, q8 - vand q11, q11, q8 - veor q7, q7, q10 - vshl.u64 q10, q10, #4 - veor q6, q6, q11 - vshl.u64 q11, q11, #4 - veor q3, q3, q10 - veor q2, q2, q11 - vshr.u64 q10, q1, #4 - vshr.u64 q11, q0, #4 - veor q10, q10, q5 - veor q11, q11, q4 - vand q10, q10, q8 - vand q11, q11, q8 - veor q5, q5, q10 - vshl.u64 q10, q10, #4 - veor q4, q4, q11 - vshl.u64 q11, q11, #4 - veor q1, q1, q10 - veor q0, q0, q11 - sub r5,r5,#1 - b .Ldec_sbox -.align 4 -.Ldec_loop: - vldmia r4!, {q8,q9,q10,q11} - veor q8, q8, q0 - veor q9, q9, q1 - vtbl.8 d0, {q8}, d24 - vtbl.8 d1, {q8}, d25 - vldmia r4!, {q8} - veor q10, q10, q2 - vtbl.8 d2, {q9}, d24 - vtbl.8 d3, {q9}, d25 - vldmia r4!, {q9} - veor q11, q11, q3 - vtbl.8 d4, {q10}, d24 - vtbl.8 d5, {q10}, d25 - vldmia r4!, {q10} - vtbl.8 d6, {q11}, d24 - vtbl.8 d7, {q11}, d25 - vldmia r4!, {q11} - veor q8, q8, q4 - veor q9, q9, q5 - vtbl.8 d8, {q8}, d24 - vtbl.8 d9, {q8}, d25 - veor q10, q10, q6 - vtbl.8 d10, {q9}, d24 - vtbl.8 d11, {q9}, d25 - veor q11, q11, q7 - vtbl.8 d12, {q10}, d24 - vtbl.8 d13, {q10}, d25 - vtbl.8 d14, {q11}, d24 - vtbl.8 d15, {q11}, d25 -.Ldec_sbox: - veor q1, q1, q4 - veor q3, q3, q4 - - veor q4, q4, q7 - veor q1, q1, q6 - veor q2, q2, q7 - veor q6, q6, q4 - - veor q0, q0, q1 - veor q2, q2, q5 - veor q7, q7, q6 - veor q3, q3, q0 - veor q5, q5, q0 - veor q1, q1, q3 
- veor q11, q3, q0 - veor q10, q7, q4 - veor q9, q1, q6 - veor q13, q4, q0 - vmov q8, q10 - veor q12, q5, q2 - - vorr q10, q10, q9 - veor q15, q11, q8 - vand q14, q11, q12 - vorr q11, q11, q12 - veor q12, q12, q9 - vand q8, q8, q9 - veor q9, q6, q2 - vand q15, q15, q12 - vand q13, q13, q9 - veor q9, q3, q7 - veor q12, q1, q5 - veor q11, q11, q13 - veor q10, q10, q13 - vand q13, q9, q12 - vorr q9, q9, q12 - veor q11, q11, q15 - veor q8, q8, q13 - veor q10, q10, q14 - veor q9, q9, q15 - veor q8, q8, q14 - vand q12, q4, q6 - veor q9, q9, q14 - vand q13, q0, q2 - vand q14, q7, q1 - vorr q15, q3, q5 - veor q11, q11, q12 - veor q9, q9, q14 - veor q8, q8, q15 - veor q10, q10, q13 - - @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3 - - @ new smaller inversion - - vand q14, q11, q9 - vmov q12, q8 - - veor q13, q10, q14 - veor q15, q8, q14 - veor q14, q8, q14 @ q14=q15 - - vbsl q13, q9, q8 - vbsl q15, q11, q10 - veor q11, q11, q10 - - vbsl q12, q13, q14 - vbsl q8, q14, q13 - - vand q14, q12, q15 - veor q9, q9, q8 - - veor q14, q14, q11 - veor q12, q5, q2 - veor q8, q1, q6 - veor q10, q15, q14 - vand q10, q10, q5 - veor q5, q5, q1 - vand q11, q1, q15 - vand q5, q5, q14 - veor q1, q11, q10 - veor q5, q5, q11 - veor q15, q15, q13 - veor q14, q14, q9 - veor q11, q15, q14 - veor q10, q13, q9 - vand q11, q11, q12 - vand q10, q10, q2 - veor q12, q12, q8 - veor q2, q2, q6 - vand q8, q8, q15 - vand q6, q6, q13 - vand q12, q12, q14 - vand q2, q2, q9 - veor q8, q8, q12 - veor q2, q2, q6 - veor q12, q12, q11 - veor q6, q6, q10 - veor q5, q5, q12 - veor q2, q2, q12 - veor q1, q1, q8 - veor q6, q6, q8 - - veor q12, q3, q0 - veor q8, q7, q4 - veor q11, q15, q14 - veor q10, q13, q9 - vand q11, q11, q12 - vand q10, q10, q0 - veor q12, q12, q8 - veor q0, q0, q4 - vand q8, q8, q15 - vand q4, q4, q13 - vand q12, q12, q14 - vand q0, q0, q9 - veor q8, q8, q12 - veor q0, q0, q4 - veor q12, q12, q11 - veor q4, q4, q10 - veor q15, q15, q13 - veor q14, q14, q9 - veor q10, q15, q14 - vand q10, q10, q3 - veor 
q3, q3, q7 - vand q11, q7, q15 - vand q3, q3, q14 - veor q7, q11, q10 - veor q3, q3, q11 - veor q3, q3, q12 - veor q0, q0, q12 - veor q7, q7, q8 - veor q4, q4, q8 - veor q1, q1, q7 - veor q6, q6, q5 - - veor q4, q4, q1 - veor q2, q2, q7 - veor q5, q5, q7 - veor q4, q4, q2 - veor q7, q7, q0 - veor q4, q4, q5 - veor q3, q3, q6 - veor q6, q6, q1 - veor q3, q3, q4 - - veor q4, q4, q0 - veor q7, q7, q3 - subs r5,r5,#1 - bcc .Ldec_done - @ multiplication by 0x05-0x00-0x04-0x00 - vext.8 q8, q0, q0, #8 - vext.8 q14, q3, q3, #8 - vext.8 q15, q5, q5, #8 - veor q8, q8, q0 - vext.8 q9, q1, q1, #8 - veor q14, q14, q3 - vext.8 q10, q6, q6, #8 - veor q15, q15, q5 - vext.8 q11, q4, q4, #8 - veor q9, q9, q1 - vext.8 q12, q2, q2, #8 - veor q10, q10, q6 - vext.8 q13, q7, q7, #8 - veor q11, q11, q4 - veor q12, q12, q2 - veor q13, q13, q7 - - veor q0, q0, q14 - veor q1, q1, q14 - veor q6, q6, q8 - veor q2, q2, q10 - veor q4, q4, q9 - veor q1, q1, q15 - veor q6, q6, q15 - veor q2, q2, q14 - veor q7, q7, q11 - veor q4, q4, q14 - veor q3, q3, q12 - veor q2, q2, q15 - veor q7, q7, q15 - veor q5, q5, q13 - vext.8 q8, q0, q0, #12 @ x0 <<< 32 - vext.8 q9, q1, q1, #12 - veor q0, q0, q8 @ x0 ^ (x0 <<< 32) - vext.8 q10, q6, q6, #12 - veor q1, q1, q9 - vext.8 q11, q4, q4, #12 - veor q6, q6, q10 - vext.8 q12, q2, q2, #12 - veor q4, q4, q11 - vext.8 q13, q7, q7, #12 - veor q2, q2, q12 - vext.8 q14, q3, q3, #12 - veor q7, q7, q13 - vext.8 q15, q5, q5, #12 - veor q3, q3, q14 - - veor q9, q9, q0 - veor q5, q5, q15 - vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64) - veor q10, q10, q1 - veor q8, q8, q5 - veor q9, q9, q5 - vext.8 q1, q1, q1, #8 - veor q13, q13, q2 - veor q0, q0, q8 - veor q14, q14, q7 - veor q1, q1, q9 - vext.8 q8, q2, q2, #8 - veor q12, q12, q4 - vext.8 q9, q7, q7, #8 - veor q15, q15, q3 - vext.8 q2, q4, q4, #8 - veor q11, q11, q6 - vext.8 q7, q5, q5, #8 - veor q12, q12, q5 - vext.8 q4, q3, q3, #8 - veor q11, q11, q5 - vext.8 q3, q6, q6, #8 - veor q5, q9, q13 - veor q11, q11, q2 - 
veor q7, q7, q15 - veor q6, q4, q14 - veor q4, q8, q12 - veor q2, q3, q10 - vmov q3, q11 - @ vmov q5, q9 - vldmia r6, {q12} @ .LISR - ite eq @ Thumb2 thing, sanity check in ARM - addeq r6,r6,#0x10 - bne .Ldec_loop - vldmia r6, {q12} @ .LISRM0 - b .Ldec_loop -.align 4 -.Ldec_done: - vmov.i8 q8,#0x55 @ compose .LBS0 - vmov.i8 q9,#0x33 @ compose .LBS1 - vshr.u64 q10, q3, #1 - vshr.u64 q11, q2, #1 - veor q10, q10, q5 - veor q11, q11, q7 - vand q10, q10, q8 - vand q11, q11, q8 - veor q5, q5, q10 - vshl.u64 q10, q10, #1 - veor q7, q7, q11 - vshl.u64 q11, q11, #1 - veor q3, q3, q10 - veor q2, q2, q11 - vshr.u64 q10, q6, #1 - vshr.u64 q11, q0, #1 - veor q10, q10, q4 - veor q11, q11, q1 - vand q10, q10, q8 - vand q11, q11, q8 - veor q4, q4, q10 - vshl.u64 q10, q10, #1 - veor q1, q1, q11 - vshl.u64 q11, q11, #1 - veor q6, q6, q10 - veor q0, q0, q11 - vmov.i8 q8,#0x0f @ compose .LBS2 - vshr.u64 q10, q7, #2 - vshr.u64 q11, q2, #2 - veor q10, q10, q5 - veor q11, q11, q3 - vand q10, q10, q9 - vand q11, q11, q9 - veor q5, q5, q10 - vshl.u64 q10, q10, #2 - veor q3, q3, q11 - vshl.u64 q11, q11, #2 - veor q7, q7, q10 - veor q2, q2, q11 - vshr.u64 q10, q1, #2 - vshr.u64 q11, q0, #2 - veor q10, q10, q4 - veor q11, q11, q6 - vand q10, q10, q9 - vand q11, q11, q9 - veor q4, q4, q10 - vshl.u64 q10, q10, #2 - veor q6, q6, q11 - vshl.u64 q11, q11, #2 - veor q1, q1, q10 - veor q0, q0, q11 - vshr.u64 q10, q4, #4 - vshr.u64 q11, q6, #4 - veor q10, q10, q5 - veor q11, q11, q3 - vand q10, q10, q8 - vand q11, q11, q8 - veor q5, q5, q10 - vshl.u64 q10, q10, #4 - veor q3, q3, q11 - vshl.u64 q11, q11, #4 - veor q4, q4, q10 - veor q6, q6, q11 - vshr.u64 q10, q1, #4 - vshr.u64 q11, q0, #4 - veor q10, q10, q7 - veor q11, q11, q2 - vand q10, q10, q8 - vand q11, q11, q8 - veor q7, q7, q10 - vshl.u64 q10, q10, #4 - veor q2, q2, q11 - vshl.u64 q11, q11, #4 - veor q1, q1, q10 - veor q0, q0, q11 - vldmia r4, {q8} @ last round key - veor q6, q6, q8 - veor q4, q4, q8 - veor q2, q2, q8 - veor q7, q7, q8 - veor 
q3, q3, q8 - veor q5, q5, q8 - veor q0, q0, q8 - veor q1, q1, q8 - bx lr -.size _bsaes_decrypt8,.-_bsaes_decrypt8 - -.type _bsaes_const,%object -.align 6 -_bsaes_const: -.LM0ISR:@ InvShiftRows constants -.quad 0x0a0e0206070b0f03, 0x0004080c0d010509 -.LISR: -.quad 0x0504070602010003, 0x0f0e0d0c080b0a09 -.LISRM0: -.quad 0x01040b0e0205080f, 0x0306090c00070a0d -.LM0SR:@ ShiftRows constants -.quad 0x0a0e02060f03070b, 0x0004080c05090d01 -.LSR: -.quad 0x0504070600030201, 0x0f0e0d0c0a09080b -.LSRM0: -.quad 0x0304090e00050a0f, 0x01060b0c0207080d -.LM0: -.quad 0x02060a0e03070b0f, 0x0004080c0105090d -.LREVM0SR: -.quad 0x090d01050c000408, 0x03070b0f060a0e02 -.byte 66,105,116,45,115,108,105,99,101,100,32,65,69,83,32,102,111,114,32,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 -.align 6 -.size _bsaes_const,.-_bsaes_const - -.type _bsaes_encrypt8,%function -.align 4 -_bsaes_encrypt8: - adr r6,. - vldmia r4!, {q9} @ round 0 key -#if defined(__thumb2__) || defined(__APPLE__) - adr r6,.LM0SR -#else - sub r6,r6,#_bsaes_encrypt8-.LM0SR -#endif - - vldmia r6!, {q8} @ .LM0SR -_bsaes_encrypt8_alt: - veor q10, q0, q9 @ xor with round0 key - veor q11, q1, q9 - vtbl.8 d0, {q10}, d16 - vtbl.8 d1, {q10}, d17 - veor q12, q2, q9 - vtbl.8 d2, {q11}, d16 - vtbl.8 d3, {q11}, d17 - veor q13, q3, q9 - vtbl.8 d4, {q12}, d16 - vtbl.8 d5, {q12}, d17 - veor q14, q4, q9 - vtbl.8 d6, {q13}, d16 - vtbl.8 d7, {q13}, d17 - veor q15, q5, q9 - vtbl.8 d8, {q14}, d16 - vtbl.8 d9, {q14}, d17 - veor q10, q6, q9 - vtbl.8 d10, {q15}, d16 - vtbl.8 d11, {q15}, d17 - veor q11, q7, q9 - vtbl.8 d12, {q10}, d16 - vtbl.8 d13, {q10}, d17 - vtbl.8 d14, {q11}, d16 - vtbl.8 d15, {q11}, d17 -_bsaes_encrypt8_bitslice: - vmov.i8 q8,#0x55 @ compose .LBS0 - vmov.i8 q9,#0x33 @ compose .LBS1 - vshr.u64 q10, q6, #1 - vshr.u64 q11, q4, #1 - veor q10, q10, q7 - veor q11, q11, q5 - vand q10, q10, q8 - vand q11, q11, q8 - veor q7, q7, q10 - 
vshl.u64 q10, q10, #1 - veor q5, q5, q11 - vshl.u64 q11, q11, #1 - veor q6, q6, q10 - veor q4, q4, q11 - vshr.u64 q10, q2, #1 - vshr.u64 q11, q0, #1 - veor q10, q10, q3 - veor q11, q11, q1 - vand q10, q10, q8 - vand q11, q11, q8 - veor q3, q3, q10 - vshl.u64 q10, q10, #1 - veor q1, q1, q11 - vshl.u64 q11, q11, #1 - veor q2, q2, q10 - veor q0, q0, q11 - vmov.i8 q8,#0x0f @ compose .LBS2 - vshr.u64 q10, q5, #2 - vshr.u64 q11, q4, #2 - veor q10, q10, q7 - veor q11, q11, q6 - vand q10, q10, q9 - vand q11, q11, q9 - veor q7, q7, q10 - vshl.u64 q10, q10, #2 - veor q6, q6, q11 - vshl.u64 q11, q11, #2 - veor q5, q5, q10 - veor q4, q4, q11 - vshr.u64 q10, q1, #2 - vshr.u64 q11, q0, #2 - veor q10, q10, q3 - veor q11, q11, q2 - vand q10, q10, q9 - vand q11, q11, q9 - veor q3, q3, q10 - vshl.u64 q10, q10, #2 - veor q2, q2, q11 - vshl.u64 q11, q11, #2 - veor q1, q1, q10 - veor q0, q0, q11 - vshr.u64 q10, q3, #4 - vshr.u64 q11, q2, #4 - veor q10, q10, q7 - veor q11, q11, q6 - vand q10, q10, q8 - vand q11, q11, q8 - veor q7, q7, q10 - vshl.u64 q10, q10, #4 - veor q6, q6, q11 - vshl.u64 q11, q11, #4 - veor q3, q3, q10 - veor q2, q2, q11 - vshr.u64 q10, q1, #4 - vshr.u64 q11, q0, #4 - veor q10, q10, q5 - veor q11, q11, q4 - vand q10, q10, q8 - vand q11, q11, q8 - veor q5, q5, q10 - vshl.u64 q10, q10, #4 - veor q4, q4, q11 - vshl.u64 q11, q11, #4 - veor q1, q1, q10 - veor q0, q0, q11 - sub r5,r5,#1 - b .Lenc_sbox -.align 4 -.Lenc_loop: - vldmia r4!, {q8,q9,q10,q11} - veor q8, q8, q0 - veor q9, q9, q1 - vtbl.8 d0, {q8}, d24 - vtbl.8 d1, {q8}, d25 - vldmia r4!, {q8} - veor q10, q10, q2 - vtbl.8 d2, {q9}, d24 - vtbl.8 d3, {q9}, d25 - vldmia r4!, {q9} - veor q11, q11, q3 - vtbl.8 d4, {q10}, d24 - vtbl.8 d5, {q10}, d25 - vldmia r4!, {q10} - vtbl.8 d6, {q11}, d24 - vtbl.8 d7, {q11}, d25 - vldmia r4!, {q11} - veor q8, q8, q4 - veor q9, q9, q5 - vtbl.8 d8, {q8}, d24 - vtbl.8 d9, {q8}, d25 - veor q10, q10, q6 - vtbl.8 d10, {q9}, d24 - vtbl.8 d11, {q9}, d25 - veor q11, q11, q7 - vtbl.8 d12, 
{q10}, d24 - vtbl.8 d13, {q10}, d25 - vtbl.8 d14, {q11}, d24 - vtbl.8 d15, {q11}, d25 -.Lenc_sbox: - veor q2, q2, q1 - veor q5, q5, q6 - veor q3, q3, q0 - veor q6, q6, q2 - veor q5, q5, q0 - - veor q6, q6, q3 - veor q3, q3, q7 - veor q7, q7, q5 - veor q3, q3, q4 - veor q4, q4, q5 - - veor q2, q2, q7 - veor q3, q3, q1 - veor q1, q1, q5 - veor q11, q7, q4 - veor q10, q1, q2 - veor q9, q5, q3 - veor q13, q2, q4 - vmov q8, q10 - veor q12, q6, q0 - - vorr q10, q10, q9 - veor q15, q11, q8 - vand q14, q11, q12 - vorr q11, q11, q12 - veor q12, q12, q9 - vand q8, q8, q9 - veor q9, q3, q0 - vand q15, q15, q12 - vand q13, q13, q9 - veor q9, q7, q1 - veor q12, q5, q6 - veor q11, q11, q13 - veor q10, q10, q13 - vand q13, q9, q12 - vorr q9, q9, q12 - veor q11, q11, q15 - veor q8, q8, q13 - veor q10, q10, q14 - veor q9, q9, q15 - veor q8, q8, q14 - vand q12, q2, q3 - veor q9, q9, q14 - vand q13, q4, q0 - vand q14, q1, q5 - vorr q15, q7, q6 - veor q11, q11, q12 - veor q9, q9, q14 - veor q8, q8, q15 - veor q10, q10, q13 - - @ Inv_GF16 0, 1, 2, 3, s0, s1, s2, s3 - - @ new smaller inversion - - vand q14, q11, q9 - vmov q12, q8 - - veor q13, q10, q14 - veor q15, q8, q14 - veor q14, q8, q14 @ q14=q15 - - vbsl q13, q9, q8 - vbsl q15, q11, q10 - veor q11, q11, q10 - - vbsl q12, q13, q14 - vbsl q8, q14, q13 - - vand q14, q12, q15 - veor q9, q9, q8 - - veor q14, q14, q11 - veor q12, q6, q0 - veor q8, q5, q3 - veor q10, q15, q14 - vand q10, q10, q6 - veor q6, q6, q5 - vand q11, q5, q15 - vand q6, q6, q14 - veor q5, q11, q10 - veor q6, q6, q11 - veor q15, q15, q13 - veor q14, q14, q9 - veor q11, q15, q14 - veor q10, q13, q9 - vand q11, q11, q12 - vand q10, q10, q0 - veor q12, q12, q8 - veor q0, q0, q3 - vand q8, q8, q15 - vand q3, q3, q13 - vand q12, q12, q14 - vand q0, q0, q9 - veor q8, q8, q12 - veor q0, q0, q3 - veor q12, q12, q11 - veor q3, q3, q10 - veor q6, q6, q12 - veor q0, q0, q12 - veor q5, q5, q8 - veor q3, q3, q8 - - veor q12, q7, q4 - veor q8, q1, q2 - veor q11, q15, q14 - veor 
q10, q13, q9 - vand q11, q11, q12 - vand q10, q10, q4 - veor q12, q12, q8 - veor q4, q4, q2 - vand q8, q8, q15 - vand q2, q2, q13 - vand q12, q12, q14 - vand q4, q4, q9 - veor q8, q8, q12 - veor q4, q4, q2 - veor q12, q12, q11 - veor q2, q2, q10 - veor q15, q15, q13 - veor q14, q14, q9 - veor q10, q15, q14 - vand q10, q10, q7 - veor q7, q7, q1 - vand q11, q1, q15 - vand q7, q7, q14 - veor q1, q11, q10 - veor q7, q7, q11 - veor q7, q7, q12 - veor q4, q4, q12 - veor q1, q1, q8 - veor q2, q2, q8 - veor q7, q7, q0 - veor q1, q1, q6 - veor q6, q6, q0 - veor q4, q4, q7 - veor q0, q0, q1 - - veor q1, q1, q5 - veor q5, q5, q2 - veor q2, q2, q3 - veor q3, q3, q5 - veor q4, q4, q5 - - veor q6, q6, q3 - subs r5,r5,#1 - bcc .Lenc_done - vext.8 q8, q0, q0, #12 @ x0 <<< 32 - vext.8 q9, q1, q1, #12 - veor q0, q0, q8 @ x0 ^ (x0 <<< 32) - vext.8 q10, q4, q4, #12 - veor q1, q1, q9 - vext.8 q11, q6, q6, #12 - veor q4, q4, q10 - vext.8 q12, q3, q3, #12 - veor q6, q6, q11 - vext.8 q13, q7, q7, #12 - veor q3, q3, q12 - vext.8 q14, q2, q2, #12 - veor q7, q7, q13 - vext.8 q15, q5, q5, #12 - veor q2, q2, q14 - - veor q9, q9, q0 - veor q5, q5, q15 - vext.8 q0, q0, q0, #8 @ (x0 ^ (x0 <<< 32)) <<< 64) - veor q10, q10, q1 - veor q8, q8, q5 - veor q9, q9, q5 - vext.8 q1, q1, q1, #8 - veor q13, q13, q3 - veor q0, q0, q8 - veor q14, q14, q7 - veor q1, q1, q9 - vext.8 q8, q3, q3, #8 - veor q12, q12, q6 - vext.8 q9, q7, q7, #8 - veor q15, q15, q2 - vext.8 q3, q6, q6, #8 - veor q11, q11, q4 - vext.8 q7, q5, q5, #8 - veor q12, q12, q5 - vext.8 q6, q2, q2, #8 - veor q11, q11, q5 - vext.8 q2, q4, q4, #8 - veor q5, q9, q13 - veor q4, q8, q12 - veor q3, q3, q11 - veor q7, q7, q15 - veor q6, q6, q14 - @ vmov q4, q8 - veor q2, q2, q10 - @ vmov q5, q9 - vldmia r6, {q12} @ .LSR - ite eq @ Thumb2 thing, samity check in ARM - addeq r6,r6,#0x10 - bne .Lenc_loop - vldmia r6, {q12} @ .LSRM0 - b .Lenc_loop -.align 4 -.Lenc_done: - vmov.i8 q8,#0x55 @ compose .LBS0 - vmov.i8 q9,#0x33 @ compose .LBS1 - vshr.u64 q10, 
q2, #1 - vshr.u64 q11, q3, #1 - veor q10, q10, q5 - veor q11, q11, q7 - vand q10, q10, q8 - vand q11, q11, q8 - veor q5, q5, q10 - vshl.u64 q10, q10, #1 - veor q7, q7, q11 - vshl.u64 q11, q11, #1 - veor q2, q2, q10 - veor q3, q3, q11 - vshr.u64 q10, q4, #1 - vshr.u64 q11, q0, #1 - veor q10, q10, q6 - veor q11, q11, q1 - vand q10, q10, q8 - vand q11, q11, q8 - veor q6, q6, q10 - vshl.u64 q10, q10, #1 - veor q1, q1, q11 - vshl.u64 q11, q11, #1 - veor q4, q4, q10 - veor q0, q0, q11 - vmov.i8 q8,#0x0f @ compose .LBS2 - vshr.u64 q10, q7, #2 - vshr.u64 q11, q3, #2 - veor q10, q10, q5 - veor q11, q11, q2 - vand q10, q10, q9 - vand q11, q11, q9 - veor q5, q5, q10 - vshl.u64 q10, q10, #2 - veor q2, q2, q11 - vshl.u64 q11, q11, #2 - veor q7, q7, q10 - veor q3, q3, q11 - vshr.u64 q10, q1, #2 - vshr.u64 q11, q0, #2 - veor q10, q10, q6 - veor q11, q11, q4 - vand q10, q10, q9 - vand q11, q11, q9 - veor q6, q6, q10 - vshl.u64 q10, q10, #2 - veor q4, q4, q11 - vshl.u64 q11, q11, #2 - veor q1, q1, q10 - veor q0, q0, q11 - vshr.u64 q10, q6, #4 - vshr.u64 q11, q4, #4 - veor q10, q10, q5 - veor q11, q11, q2 - vand q10, q10, q8 - vand q11, q11, q8 - veor q5, q5, q10 - vshl.u64 q10, q10, #4 - veor q2, q2, q11 - vshl.u64 q11, q11, #4 - veor q6, q6, q10 - veor q4, q4, q11 - vshr.u64 q10, q1, #4 - vshr.u64 q11, q0, #4 - veor q10, q10, q7 - veor q11, q11, q3 - vand q10, q10, q8 - vand q11, q11, q8 - veor q7, q7, q10 - vshl.u64 q10, q10, #4 - veor q3, q3, q11 - vshl.u64 q11, q11, #4 - veor q1, q1, q10 - veor q0, q0, q11 - vldmia r4, {q8} @ last round key - veor q4, q4, q8 - veor q6, q6, q8 - veor q3, q3, q8 - veor q7, q7, q8 - veor q2, q2, q8 - veor q5, q5, q8 - veor q0, q0, q8 - veor q1, q1, q8 - bx lr -.size _bsaes_encrypt8,.-_bsaes_encrypt8 -.type _bsaes_key_convert,%function -.align 4 -_bsaes_key_convert: - adr r6,. - vld1.8 {q7}, [r4]! @ load round 0 key -#if defined(__thumb2__) || defined(__APPLE__) - adr r6,.LM0 -#else - sub r6,r6,#_bsaes_key_convert-.LM0 -#endif - vld1.8 {q15}, [r4]! 
@ load round 1 key - - vmov.i8 q8, #0x01 @ bit masks - vmov.i8 q9, #0x02 - vmov.i8 q10, #0x04 - vmov.i8 q11, #0x08 - vmov.i8 q12, #0x10 - vmov.i8 q13, #0x20 - vldmia r6, {q14} @ .LM0 - -#ifdef __ARMEL__ - vrev32.8 q7, q7 - vrev32.8 q15, q15 -#endif - sub r5,r5,#1 - vstmia r12!, {q7} @ save round 0 key - b .Lkey_loop - -.align 4 -.Lkey_loop: - vtbl.8 d14,{q15},d28 - vtbl.8 d15,{q15},d29 - vmov.i8 q6, #0x40 - vmov.i8 q15, #0x80 - - vtst.8 q0, q7, q8 - vtst.8 q1, q7, q9 - vtst.8 q2, q7, q10 - vtst.8 q3, q7, q11 - vtst.8 q4, q7, q12 - vtst.8 q5, q7, q13 - vtst.8 q6, q7, q6 - vtst.8 q7, q7, q15 - vld1.8 {q15}, [r4]! @ load next round key - vmvn q0, q0 @ "pnot" - vmvn q1, q1 - vmvn q5, q5 - vmvn q6, q6 -#ifdef __ARMEL__ - vrev32.8 q15, q15 -#endif - subs r5,r5,#1 - vstmia r12!,{q0,q1,q2,q3,q4,q5,q6,q7} @ write bit-sliced round key - bne .Lkey_loop - - vmov.i8 q7,#0x63 @ compose .L63 - @ don't save last round key - bx lr -.size _bsaes_key_convert,.-_bsaes_key_convert -.globl bsaes_cbc_encrypt -.hidden bsaes_cbc_encrypt -.type bsaes_cbc_encrypt,%function -.align 5 -bsaes_cbc_encrypt: - @ In OpenSSL, this function had a fallback to aes_nohw_cbc_encrypt for - @ short inputs. We patch this out, using bsaes for all input sizes. 
- - @ it is up to the caller to make sure we are called with enc == 0 - - mov ip, sp - stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr} - VFP_ABI_PUSH - ldr r8, [ip] @ IV is 1st arg on the stack - mov r2, r2, lsr#4 @ len in 16 byte blocks - sub sp, #0x10 @ scratch space to carry over the IV - mov r9, sp @ save sp - - ldr r10, [r3, #240] @ get # of rounds -#ifndef BSAES_ASM_EXTENDED_KEY - @ allocate the key schedule on the stack - sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key - add r12, #96 @ sifze of bit-slices key schedule - - @ populate the key schedule - mov r4, r3 @ pass key - mov r5, r10 @ pass # of rounds - mov sp, r12 @ sp is sp - bl _bsaes_key_convert - vldmia sp, {q6} - vstmia r12, {q15} @ save last round key - veor q7, q7, q6 @ fix up round 0 key - vstmia sp, {q7} -#else - ldr r12, [r3, #244] - eors r12, #1 - beq 0f - - @ populate the key schedule - str r12, [r3, #244] - mov r4, r3 @ pass key - mov r5, r10 @ pass # of rounds - add r12, r3, #248 @ pass key schedule - bl _bsaes_key_convert - add r4, r3, #248 - vldmia r4, {q6} - vstmia r12, {q15} @ save last round key - veor q7, q7, q6 @ fix up round 0 key - vstmia r4, {q7} - -.align 2 - -#endif - - vld1.8 {q15}, [r8] @ load IV - b .Lcbc_dec_loop - -.align 4 -.Lcbc_dec_loop: - subs r2, r2, #0x8 - bmi .Lcbc_dec_loop_finish - - vld1.8 {q0,q1}, [r0]! @ load input - vld1.8 {q2,q3}, [r0]! -#ifndef BSAES_ASM_EXTENDED_KEY - mov r4, sp @ pass the key -#else - add r4, r3, #248 -#endif - vld1.8 {q4,q5}, [r0]! - mov r5, r10 - vld1.8 {q6,q7}, [r0] - sub r0, r0, #0x60 - vstmia r9, {q15} @ put aside IV - - bl _bsaes_decrypt8 - - vldmia r9, {q14} @ reload IV - vld1.8 {q8,q9}, [r0]! @ reload input - veor q0, q0, q14 @ ^= IV - vld1.8 {q10,q11}, [r0]! - veor q1, q1, q8 - veor q6, q6, q9 - vld1.8 {q12,q13}, [r0]! - veor q4, q4, q10 - veor q2, q2, q11 - vld1.8 {q14,q15}, [r0]! - veor q7, q7, q12 - vst1.8 {q0,q1}, [r1]! @ write output - veor q3, q3, q13 - vst1.8 {q6}, [r1]! - veor q5, q5, q14 - vst1.8 {q4}, [r1]! 
- vst1.8 {q2}, [r1]! - vst1.8 {q7}, [r1]! - vst1.8 {q3}, [r1]! - vst1.8 {q5}, [r1]! - - b .Lcbc_dec_loop - -.Lcbc_dec_loop_finish: - adds r2, r2, #8 - beq .Lcbc_dec_done - - @ Set up most parameters for the _bsaes_decrypt8 call. -#ifndef BSAES_ASM_EXTENDED_KEY - mov r4, sp @ pass the key -#else - add r4, r3, #248 -#endif - mov r5, r10 - vstmia r9, {q15} @ put aside IV - - vld1.8 {q0}, [r0]! @ load input - cmp r2, #2 - blo .Lcbc_dec_one - vld1.8 {q1}, [r0]! - beq .Lcbc_dec_two - vld1.8 {q2}, [r0]! - cmp r2, #4 - blo .Lcbc_dec_three - vld1.8 {q3}, [r0]! - beq .Lcbc_dec_four - vld1.8 {q4}, [r0]! - cmp r2, #6 - blo .Lcbc_dec_five - vld1.8 {q5}, [r0]! - beq .Lcbc_dec_six - vld1.8 {q6}, [r0]! - sub r0, r0, #0x70 - - bl _bsaes_decrypt8 - - vldmia r9, {q14} @ reload IV - vld1.8 {q8,q9}, [r0]! @ reload input - veor q0, q0, q14 @ ^= IV - vld1.8 {q10,q11}, [r0]! - veor q1, q1, q8 - veor q6, q6, q9 - vld1.8 {q12,q13}, [r0]! - veor q4, q4, q10 - veor q2, q2, q11 - vld1.8 {q15}, [r0]! - veor q7, q7, q12 - vst1.8 {q0,q1}, [r1]! @ write output - veor q3, q3, q13 - vst1.8 {q6}, [r1]! - vst1.8 {q4}, [r1]! - vst1.8 {q2}, [r1]! - vst1.8 {q7}, [r1]! - vst1.8 {q3}, [r1]! - b .Lcbc_dec_done -.align 4 -.Lcbc_dec_six: - sub r0, r0, #0x60 - bl _bsaes_decrypt8 - vldmia r9,{q14} @ reload IV - vld1.8 {q8,q9}, [r0]! @ reload input - veor q0, q0, q14 @ ^= IV - vld1.8 {q10,q11}, [r0]! - veor q1, q1, q8 - veor q6, q6, q9 - vld1.8 {q12}, [r0]! - veor q4, q4, q10 - veor q2, q2, q11 - vld1.8 {q15}, [r0]! - veor q7, q7, q12 - vst1.8 {q0,q1}, [r1]! @ write output - vst1.8 {q6}, [r1]! - vst1.8 {q4}, [r1]! - vst1.8 {q2}, [r1]! - vst1.8 {q7}, [r1]! - b .Lcbc_dec_done -.align 4 -.Lcbc_dec_five: - sub r0, r0, #0x50 - bl _bsaes_decrypt8 - vldmia r9, {q14} @ reload IV - vld1.8 {q8,q9}, [r0]! @ reload input - veor q0, q0, q14 @ ^= IV - vld1.8 {q10,q11}, [r0]! - veor q1, q1, q8 - veor q6, q6, q9 - vld1.8 {q15}, [r0]! - veor q4, q4, q10 - vst1.8 {q0,q1}, [r1]! 
@ write output - veor q2, q2, q11 - vst1.8 {q6}, [r1]! - vst1.8 {q4}, [r1]! - vst1.8 {q2}, [r1]! - b .Lcbc_dec_done -.align 4 -.Lcbc_dec_four: - sub r0, r0, #0x40 - bl _bsaes_decrypt8 - vldmia r9, {q14} @ reload IV - vld1.8 {q8,q9}, [r0]! @ reload input - veor q0, q0, q14 @ ^= IV - vld1.8 {q10}, [r0]! - veor q1, q1, q8 - veor q6, q6, q9 - vld1.8 {q15}, [r0]! - veor q4, q4, q10 - vst1.8 {q0,q1}, [r1]! @ write output - vst1.8 {q6}, [r1]! - vst1.8 {q4}, [r1]! - b .Lcbc_dec_done -.align 4 -.Lcbc_dec_three: - sub r0, r0, #0x30 - bl _bsaes_decrypt8 - vldmia r9, {q14} @ reload IV - vld1.8 {q8,q9}, [r0]! @ reload input - veor q0, q0, q14 @ ^= IV - vld1.8 {q15}, [r0]! - veor q1, q1, q8 - veor q6, q6, q9 - vst1.8 {q0,q1}, [r1]! @ write output - vst1.8 {q6}, [r1]! - b .Lcbc_dec_done -.align 4 -.Lcbc_dec_two: - sub r0, r0, #0x20 - bl _bsaes_decrypt8 - vldmia r9, {q14} @ reload IV - vld1.8 {q8}, [r0]! @ reload input - veor q0, q0, q14 @ ^= IV - vld1.8 {q15}, [r0]! @ reload input - veor q1, q1, q8 - vst1.8 {q0,q1}, [r1]! @ write output - b .Lcbc_dec_done -.align 4 -.Lcbc_dec_one: - sub r0, r0, #0x10 - bl _bsaes_decrypt8 - vldmia r9, {q14} @ reload IV - vld1.8 {q15}, [r0]! @ reload input - veor q0, q0, q14 @ ^= IV - vst1.8 {q0}, [r1]! @ write output - -.Lcbc_dec_done: -#ifndef BSAES_ASM_EXTENDED_KEY - vmov.i32 q0, #0 - vmov.i32 q1, #0 -.Lcbc_dec_bzero:@ wipe key schedule [if any] - vstmia sp!, {q0,q1} - cmp sp, r9 - bne .Lcbc_dec_bzero -#endif - - mov sp, r9 - add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb - vst1.8 {q15}, [r8] @ return IV - VFP_ABI_POP - ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} -.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt -.globl bsaes_ctr32_encrypt_blocks -.hidden bsaes_ctr32_encrypt_blocks -.type bsaes_ctr32_encrypt_blocks,%function -.align 5 -bsaes_ctr32_encrypt_blocks: - @ In OpenSSL, short inputs fall back to aes_nohw_* here. We patch this - @ out to retain a constant-time implementation. 
- mov ip, sp - stmdb sp!, {r4,r5,r6,r7,r8,r9,r10, lr} - VFP_ABI_PUSH - ldr r8, [ip] @ ctr is 1st arg on the stack - sub sp, sp, #0x10 @ scratch space to carry over the ctr - mov r9, sp @ save sp - - ldr r10, [r3, #240] @ get # of rounds -#ifndef BSAES_ASM_EXTENDED_KEY - @ allocate the key schedule on the stack - sub r12, sp, r10, lsl#7 @ 128 bytes per inner round key - add r12, #96 @ size of bit-sliced key schedule - - @ populate the key schedule - mov r4, r3 @ pass key - mov r5, r10 @ pass # of rounds - mov sp, r12 @ sp is sp - bl _bsaes_key_convert - veor q7,q7,q15 @ fix up last round key - vstmia r12, {q7} @ save last round key - - vld1.8 {q0}, [r8] @ load counter -#ifdef __APPLE__ - mov r8, #:lower16:(.LREVM0SR-.LM0) - add r8, r6, r8 -#else - add r8, r6, #.LREVM0SR-.LM0 @ borrow r8 -#endif - vldmia sp, {q4} @ load round0 key -#else - ldr r12, [r3, #244] - eors r12, #1 - beq 0f - - @ populate the key schedule - str r12, [r3, #244] - mov r4, r3 @ pass key - mov r5, r10 @ pass # of rounds - add r12, r3, #248 @ pass key schedule - bl _bsaes_key_convert - veor q7,q7,q15 @ fix up last round key - vstmia r12, {q7} @ save last round key - -.align 2 - add r12, r3, #248 - vld1.8 {q0}, [r8] @ load counter - adrl r8, .LREVM0SR @ borrow r8 - vldmia r12, {q4} @ load round0 key - sub sp, #0x10 @ place for adjusted round0 key -#endif - - vmov.i32 q8,#1 @ compose 1<<96 - veor q9,q9,q9 - vrev32.8 q0,q0 - vext.8 q8,q9,q8,#4 - vrev32.8 q4,q4 - vadd.u32 q9,q8,q8 @ compose 2<<96 - vstmia sp, {q4} @ save adjusted round0 key - b .Lctr_enc_loop - -.align 4 -.Lctr_enc_loop: - vadd.u32 q10, q8, q9 @ compose 3<<96 - vadd.u32 q1, q0, q8 @ +1 - vadd.u32 q2, q0, q9 @ +2 - vadd.u32 q3, q0, q10 @ +3 - vadd.u32 q4, q1, q10 - vadd.u32 q5, q2, q10 - vadd.u32 q6, q3, q10 - vadd.u32 q7, q4, q10 - vadd.u32 q10, q5, q10 @ next counter - - @ Borrow prologue from _bsaes_encrypt8 to use the opportunity - @ to flip byte order in 32-bit counter - - vldmia sp, {q9} @ load round0 key -#ifndef 
BSAES_ASM_EXTENDED_KEY - add r4, sp, #0x10 @ pass next round key -#else - add r4, r3, #264 -#endif - vldmia r8, {q8} @ .LREVM0SR - mov r5, r10 @ pass rounds - vstmia r9, {q10} @ save next counter -#ifdef __APPLE__ - mov r6, #:lower16:(.LREVM0SR-.LSR) - sub r6, r8, r6 -#else - sub r6, r8, #.LREVM0SR-.LSR @ pass constants -#endif - - bl _bsaes_encrypt8_alt - - subs r2, r2, #8 - blo .Lctr_enc_loop_done - - vld1.8 {q8,q9}, [r0]! @ load input - vld1.8 {q10,q11}, [r0]! - veor q0, q8 - veor q1, q9 - vld1.8 {q12,q13}, [r0]! - veor q4, q10 - veor q6, q11 - vld1.8 {q14,q15}, [r0]! - veor q3, q12 - vst1.8 {q0,q1}, [r1]! @ write output - veor q7, q13 - veor q2, q14 - vst1.8 {q4}, [r1]! - veor q5, q15 - vst1.8 {q6}, [r1]! - vmov.i32 q8, #1 @ compose 1<<96 - vst1.8 {q3}, [r1]! - veor q9, q9, q9 - vst1.8 {q7}, [r1]! - vext.8 q8, q9, q8, #4 - vst1.8 {q2}, [r1]! - vadd.u32 q9,q8,q8 @ compose 2<<96 - vst1.8 {q5}, [r1]! - vldmia r9, {q0} @ load counter - - bne .Lctr_enc_loop - b .Lctr_enc_done - -.align 4 -.Lctr_enc_loop_done: - add r2, r2, #8 - vld1.8 {q8}, [r0]! @ load input - veor q0, q8 - vst1.8 {q0}, [r1]! @ write output - cmp r2, #2 - blo .Lctr_enc_done - vld1.8 {q9}, [r0]! - veor q1, q9 - vst1.8 {q1}, [r1]! - beq .Lctr_enc_done - vld1.8 {q10}, [r0]! - veor q4, q10 - vst1.8 {q4}, [r1]! - cmp r2, #4 - blo .Lctr_enc_done - vld1.8 {q11}, [r0]! - veor q6, q11 - vst1.8 {q6}, [r1]! - beq .Lctr_enc_done - vld1.8 {q12}, [r0]! - veor q3, q12 - vst1.8 {q3}, [r1]! - cmp r2, #6 - blo .Lctr_enc_done - vld1.8 {q13}, [r0]! - veor q7, q13 - vst1.8 {q7}, [r1]! - beq .Lctr_enc_done - vld1.8 {q14}, [r0] - veor q2, q14 - vst1.8 {q2}, [r1]! 
- -.Lctr_enc_done: - vmov.i32 q0, #0 - vmov.i32 q1, #0 -#ifndef BSAES_ASM_EXTENDED_KEY -.Lctr_enc_bzero:@ wipe key schedule [if any] - vstmia sp!, {q0,q1} - cmp sp, r9 - bne .Lctr_enc_bzero -#else - vstmia sp, {q0,q1} -#endif - - mov sp, r9 - add sp, #0x10 @ add sp,r9,#0x10 is no good for thumb - VFP_ABI_POP - ldmia sp!, {r4,r5,r6,r7,r8,r9,r10, pc} @ return - - @ OpenSSL contains aes_nohw_* fallback code here. We patch this - @ out to retain a constant-time implementation. -.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks -#endif -#endif -#endif // !OPENSSL_NO_ASM -.section .note.GNU-stack,"",%progbits diff --git a/third_party/boringssl/linux-arm/crypto/fipsmodule/ghash-armv4.S b/third_party/boringssl/linux-arm/crypto/fipsmodule/ghash-armv4.S deleted file mode 100644 index 0532695a..00000000 --- a/third_party/boringssl/linux-arm/crypto/fipsmodule/ghash-armv4.S +++ /dev/null @@ -1,255 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(__arm__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -#include - -@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both -@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. (ARMv8 PMULL -@ instructions are in aesv8-armx.pl.) -.arch armv7-a - -.text -#if defined(__thumb2__) || defined(__clang__) -.syntax unified -#define ldrplb ldrbpl -#define ldrneb ldrbne -#endif -#if defined(__thumb2__) -.thumb -#else -.code 32 -#endif -#if __ARM_MAX_ARCH__>=7 -.arch armv7-a -.fpu neon - -.globl gcm_init_neon -.hidden gcm_init_neon -.type gcm_init_neon,%function -.align 4 -gcm_init_neon: - vld1.64 d7,[r1]! 
@ load H - vmov.i8 q8,#0xe1 - vld1.64 d6,[r1] - vshl.i64 d17,#57 - vshr.u64 d16,#63 @ t0=0xc2....01 - vdup.8 q9,d7[7] - vshr.u64 d26,d6,#63 - vshr.s8 q9,#7 @ broadcast carry bit - vshl.i64 q3,q3,#1 - vand q8,q8,q9 - vorr d7,d26 @ H<<<=1 - veor q3,q3,q8 @ twisted H - vstmia r0,{q3} - - bx lr @ bx lr -.size gcm_init_neon,.-gcm_init_neon - -.globl gcm_gmult_neon -.hidden gcm_gmult_neon -.type gcm_gmult_neon,%function -.align 4 -gcm_gmult_neon: - vld1.64 d7,[r0]! @ load Xi - vld1.64 d6,[r0]! - vmov.i64 d29,#0x0000ffffffffffff - vldmia r1,{d26,d27} @ load twisted H - vmov.i64 d30,#0x00000000ffffffff -#ifdef __ARMEL__ - vrev64.8 q3,q3 -#endif - vmov.i64 d31,#0x000000000000ffff - veor d28,d26,d27 @ Karatsuba pre-processing - mov r3,#16 - b .Lgmult_neon -.size gcm_gmult_neon,.-gcm_gmult_neon - -.globl gcm_ghash_neon -.hidden gcm_ghash_neon -.type gcm_ghash_neon,%function -.align 4 -gcm_ghash_neon: - vld1.64 d1,[r0]! @ load Xi - vld1.64 d0,[r0]! - vmov.i64 d29,#0x0000ffffffffffff - vldmia r1,{d26,d27} @ load twisted H - vmov.i64 d30,#0x00000000ffffffff -#ifdef __ARMEL__ - vrev64.8 q0,q0 -#endif - vmov.i64 d31,#0x000000000000ffff - veor d28,d26,d27 @ Karatsuba pre-processing - -.Loop_neon: - vld1.64 d7,[r2]! @ load inp - vld1.64 d6,[r2]! 
-#ifdef __ARMEL__ - vrev64.8 q3,q3 -#endif - veor q3,q0 @ inp^=Xi -.Lgmult_neon: - vext.8 d16, d26, d26, #1 @ A1 - vmull.p8 q8, d16, d6 @ F = A1*B - vext.8 d0, d6, d6, #1 @ B1 - vmull.p8 q0, d26, d0 @ E = A*B1 - vext.8 d18, d26, d26, #2 @ A2 - vmull.p8 q9, d18, d6 @ H = A2*B - vext.8 d22, d6, d6, #2 @ B2 - vmull.p8 q11, d26, d22 @ G = A*B2 - vext.8 d20, d26, d26, #3 @ A3 - veor q8, q8, q0 @ L = E + F - vmull.p8 q10, d20, d6 @ J = A3*B - vext.8 d0, d6, d6, #3 @ B3 - veor q9, q9, q11 @ M = G + H - vmull.p8 q0, d26, d0 @ I = A*B3 - veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8 - vand d17, d17, d29 - vext.8 d22, d6, d6, #4 @ B4 - veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16 - vand d19, d19, d30 - vmull.p8 q11, d26, d22 @ K = A*B4 - veor q10, q10, q0 @ N = I + J - veor d16, d16, d17 - veor d18, d18, d19 - veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24 - vand d21, d21, d31 - vext.8 q8, q8, q8, #15 - veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32 - vmov.i64 d23, #0 - vext.8 q9, q9, q9, #14 - veor d20, d20, d21 - vmull.p8 q0, d26, d6 @ D = A*B - vext.8 q11, q11, q11, #12 - vext.8 q10, q10, q10, #13 - veor q8, q8, q9 - veor q10, q10, q11 - veor q0, q0, q8 - veor q0, q0, q10 - veor d6,d6,d7 @ Karatsuba pre-processing - vext.8 d16, d28, d28, #1 @ A1 - vmull.p8 q8, d16, d6 @ F = A1*B - vext.8 d2, d6, d6, #1 @ B1 - vmull.p8 q1, d28, d2 @ E = A*B1 - vext.8 d18, d28, d28, #2 @ A2 - vmull.p8 q9, d18, d6 @ H = A2*B - vext.8 d22, d6, d6, #2 @ B2 - vmull.p8 q11, d28, d22 @ G = A*B2 - vext.8 d20, d28, d28, #3 @ A3 - veor q8, q8, q1 @ L = E + F - vmull.p8 q10, d20, d6 @ J = A3*B - vext.8 d2, d6, d6, #3 @ B3 - veor q9, q9, q11 @ M = G + H - vmull.p8 q1, d28, d2 @ I = A*B3 - veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8 - vand d17, d17, d29 - vext.8 d22, d6, d6, #4 @ B4 - veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16 - vand d19, d19, d30 - vmull.p8 q11, d28, d22 @ K = A*B4 - veor q10, q10, q1 @ N = I + J - veor d16, d16, d17 - veor d18, d18, d19 - veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 
24 - vand d21, d21, d31 - vext.8 q8, q8, q8, #15 - veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32 - vmov.i64 d23, #0 - vext.8 q9, q9, q9, #14 - veor d20, d20, d21 - vmull.p8 q1, d28, d6 @ D = A*B - vext.8 q11, q11, q11, #12 - vext.8 q10, q10, q10, #13 - veor q8, q8, q9 - veor q10, q10, q11 - veor q1, q1, q8 - veor q1, q1, q10 - vext.8 d16, d27, d27, #1 @ A1 - vmull.p8 q8, d16, d7 @ F = A1*B - vext.8 d4, d7, d7, #1 @ B1 - vmull.p8 q2, d27, d4 @ E = A*B1 - vext.8 d18, d27, d27, #2 @ A2 - vmull.p8 q9, d18, d7 @ H = A2*B - vext.8 d22, d7, d7, #2 @ B2 - vmull.p8 q11, d27, d22 @ G = A*B2 - vext.8 d20, d27, d27, #3 @ A3 - veor q8, q8, q2 @ L = E + F - vmull.p8 q10, d20, d7 @ J = A3*B - vext.8 d4, d7, d7, #3 @ B3 - veor q9, q9, q11 @ M = G + H - vmull.p8 q2, d27, d4 @ I = A*B3 - veor d16, d16, d17 @ t0 = (L) (P0 + P1) << 8 - vand d17, d17, d29 - vext.8 d22, d7, d7, #4 @ B4 - veor d18, d18, d19 @ t1 = (M) (P2 + P3) << 16 - vand d19, d19, d30 - vmull.p8 q11, d27, d22 @ K = A*B4 - veor q10, q10, q2 @ N = I + J - veor d16, d16, d17 - veor d18, d18, d19 - veor d20, d20, d21 @ t2 = (N) (P4 + P5) << 24 - vand d21, d21, d31 - vext.8 q8, q8, q8, #15 - veor d22, d22, d23 @ t3 = (K) (P6 + P7) << 32 - vmov.i64 d23, #0 - vext.8 q9, q9, q9, #14 - veor d20, d20, d21 - vmull.p8 q2, d27, d7 @ D = A*B - vext.8 q11, q11, q11, #12 - vext.8 q10, q10, q10, #13 - veor q8, q8, q9 - veor q10, q10, q11 - veor q2, q2, q8 - veor q2, q2, q10 - veor q1,q1,q0 @ Karatsuba post-processing - veor q1,q1,q2 - veor d1,d1,d2 - veor d4,d4,d3 @ Xh|Xl - 256-bit result - - @ equivalent of reduction_avx from ghash-x86_64.pl - vshl.i64 q9,q0,#57 @ 1st phase - vshl.i64 q10,q0,#62 - veor q10,q10,q9 @ - vshl.i64 q9,q0,#63 - veor q10, q10, q9 @ - veor d1,d1,d20 @ - veor d4,d4,d21 - - vshr.u64 q10,q0,#1 @ 2nd phase - veor q2,q2,q0 - veor q0,q0,q10 @ - vshr.u64 q10,q10,#6 - vshr.u64 q0,q0,#1 @ - veor q0,q0,q2 @ - veor q0,q0,q10 @ - - subs r3,#16 - bne .Loop_neon - -#ifdef __ARMEL__ - vrev64.8 q0,q0 -#endif - sub r0,#16 - 
vst1.64 d1,[r0]! @ write out Xi - vst1.64 d0,[r0] - - bx lr @ bx lr -.size gcm_ghash_neon,.-gcm_ghash_neon -#endif -.byte 71,72,65,83,72,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 -.align 2 -#endif -#endif // !OPENSSL_NO_ASM -.section .note.GNU-stack,"",%progbits diff --git a/third_party/boringssl/linux-arm/crypto/fipsmodule/ghashv8-armx32.S b/third_party/boringssl/linux-arm/crypto/fipsmodule/ghashv8-armx32.S deleted file mode 100644 index 096dfb74..00000000 --- a/third_party/boringssl/linux-arm/crypto/fipsmodule/ghashv8-armx32.S +++ /dev/null @@ -1,257 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(__arm__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -#include - -#if __ARM_MAX_ARCH__>=7 -.text -.fpu neon -.code 32 -#undef __thumb2__ -.globl gcm_init_v8 -.hidden gcm_init_v8 -.type gcm_init_v8,%function -.align 4 -gcm_init_v8: - AARCH64_VALID_CALL_TARGET - vld1.64 {q9},[r1] @ load input H - vmov.i8 q11,#0xe1 - vshl.i64 q11,q11,#57 @ 0xc2.0 - vext.8 q3,q9,q9,#8 - vshr.u64 q10,q11,#63 - vdup.32 q9,d18[1] - vext.8 q8,q10,q11,#8 @ t0=0xc2....01 - vshr.u64 q10,q3,#63 - vshr.s32 q9,q9,#31 @ broadcast carry bit - vand q10,q10,q8 - vshl.i64 q3,q3,#1 - vext.8 q10,q10,q10,#8 - vand q8,q8,q9 - vorr q3,q3,q10 @ H<<<=1 - veor q12,q3,q8 @ twisted H - vst1.64 {q12},[r0]! 
@ store Htable[0] - - @ calculate H^2 - vext.8 q8,q12,q12,#8 @ Karatsuba pre-processing -.byte 0xa8,0x0e,0xa8,0xf2 @ pmull q0,q12,q12 - veor q8,q8,q12 -.byte 0xa9,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q12 -.byte 0xa0,0x2e,0xa0,0xf2 @ pmull q1,q8,q8 - - vext.8 q9,q0,q2,#8 @ Karatsuba post-processing - veor q10,q0,q2 - veor q1,q1,q9 - veor q1,q1,q10 -.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase - - vmov d4,d3 @ Xh|Xm - 256-bit result - vmov d3,d0 @ Xm is rotated Xl - veor q0,q1,q10 - - vext.8 q10,q0,q0,#8 @ 2nd phase -.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 - veor q10,q10,q2 - veor q14,q0,q10 - - vext.8 q9,q14,q14,#8 @ Karatsuba pre-processing - veor q9,q9,q14 - vext.8 q13,q8,q9,#8 @ pack Karatsuba pre-processed - vst1.64 {q13,q14},[r0]! @ store Htable[1..2] - bx lr -.size gcm_init_v8,.-gcm_init_v8 -.globl gcm_gmult_v8 -.hidden gcm_gmult_v8 -.type gcm_gmult_v8,%function -.align 4 -gcm_gmult_v8: - AARCH64_VALID_CALL_TARGET - vld1.64 {q9},[r0] @ load Xi - vmov.i8 q11,#0xe1 - vld1.64 {q12,q13},[r1] @ load twisted H, ... 
- vshl.u64 q11,q11,#57 -#ifndef __ARMEB__ - vrev64.8 q9,q9 -#endif - vext.8 q3,q9,q9,#8 - -.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo - veor q9,q9,q3 @ Karatsuba pre-processing -.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi -.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) - - vext.8 q9,q0,q2,#8 @ Karatsuba post-processing - veor q10,q0,q2 - veor q1,q1,q9 - veor q1,q1,q10 -.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction - - vmov d4,d3 @ Xh|Xm - 256-bit result - vmov d3,d0 @ Xm is rotated Xl - veor q0,q1,q10 - - vext.8 q10,q0,q0,#8 @ 2nd phase of reduction -.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 - veor q10,q10,q2 - veor q0,q0,q10 - -#ifndef __ARMEB__ - vrev64.8 q0,q0 -#endif - vext.8 q0,q0,q0,#8 - vst1.64 {q0},[r0] @ write out Xi - - bx lr -.size gcm_gmult_v8,.-gcm_gmult_v8 -.globl gcm_ghash_v8 -.hidden gcm_ghash_v8 -.type gcm_ghash_v8,%function -.align 4 -gcm_ghash_v8: - AARCH64_VALID_CALL_TARGET - vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ 32-bit ABI says so - vld1.64 {q0},[r0] @ load [rotated] Xi - @ "[rotated]" means that - @ loaded value would have - @ to be rotated in order to - @ make it appear as in - @ algorithm specification - subs r3,r3,#32 @ see if r3 is 32 or larger - mov r12,#16 @ r12 is used as post- - @ increment for input pointer; - @ as loop is modulo-scheduled - @ r12 is zeroed just in time - @ to preclude overstepping - @ inp[len], which means that - @ last block[s] are actually - @ loaded twice, but last - @ copy is not processed - vld1.64 {q12,q13},[r1]! @ load twisted H, ..., H^2 - vmov.i8 q11,#0xe1 - vld1.64 {q14},[r1] - moveq r12,#0 @ is it time to zero r12? - vext.8 q0,q0,q0,#8 @ rotate Xi - vld1.64 {q8},[r2]! 
@ load [rotated] I[0] - vshl.u64 q11,q11,#57 @ compose 0xc2.0 constant -#ifndef __ARMEB__ - vrev64.8 q8,q8 - vrev64.8 q0,q0 -#endif - vext.8 q3,q8,q8,#8 @ rotate I[0] - blo .Lodd_tail_v8 @ r3 was less than 32 - vld1.64 {q9},[r2],r12 @ load [rotated] I[1] -#ifndef __ARMEB__ - vrev64.8 q9,q9 -#endif - vext.8 q7,q9,q9,#8 - veor q3,q3,q0 @ I[i]^=Xi -.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1 - veor q9,q9,q7 @ Karatsuba pre-processing -.byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7 - b .Loop_mod2x_v8 - -.align 4 -.Loop_mod2x_v8: - vext.8 q10,q3,q3,#8 - subs r3,r3,#32 @ is there more data? -.byte 0x86,0x0e,0xac,0xf2 @ pmull q0,q14,q3 @ H^2.lo·Xi.lo - movlo r12,#0 @ is it time to zero r12? - -.byte 0xa2,0xae,0xaa,0xf2 @ pmull q5,q13,q9 - veor q10,q10,q3 @ Karatsuba pre-processing -.byte 0x87,0x4e,0xad,0xf2 @ pmull2 q2,q14,q3 @ H^2.hi·Xi.hi - veor q0,q0,q4 @ accumulate -.byte 0xa5,0x2e,0xab,0xf2 @ pmull2 q1,q13,q10 @ (H^2.lo+H^2.hi)·(Xi.lo+Xi.hi) - vld1.64 {q8},[r2],r12 @ load [rotated] I[i+2] - - veor q2,q2,q6 - moveq r12,#0 @ is it time to zero r12? 
- veor q1,q1,q5 - - vext.8 q9,q0,q2,#8 @ Karatsuba post-processing - veor q10,q0,q2 - veor q1,q1,q9 - vld1.64 {q9},[r2],r12 @ load [rotated] I[i+3] -#ifndef __ARMEB__ - vrev64.8 q8,q8 -#endif - veor q1,q1,q10 -.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction - -#ifndef __ARMEB__ - vrev64.8 q9,q9 -#endif - vmov d4,d3 @ Xh|Xm - 256-bit result - vmov d3,d0 @ Xm is rotated Xl - vext.8 q7,q9,q9,#8 - vext.8 q3,q8,q8,#8 - veor q0,q1,q10 -.byte 0x8e,0x8e,0xa8,0xf2 @ pmull q4,q12,q7 @ H·Ii+1 - veor q3,q3,q2 @ accumulate q3 early - - vext.8 q10,q0,q0,#8 @ 2nd phase of reduction -.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 - veor q3,q3,q10 - veor q9,q9,q7 @ Karatsuba pre-processing - veor q3,q3,q0 -.byte 0x8f,0xce,0xa9,0xf2 @ pmull2 q6,q12,q7 - bhs .Loop_mod2x_v8 @ there was at least 32 more bytes - - veor q2,q2,q10 - vext.8 q3,q8,q8,#8 @ re-construct q3 - adds r3,r3,#32 @ re-construct r3 - veor q0,q0,q2 @ re-construct q0 - beq .Ldone_v8 @ is r3 zero? -.Lodd_tail_v8: - vext.8 q10,q0,q0,#8 - veor q3,q3,q0 @ inp^=Xi - veor q9,q8,q10 @ q9 is rotated inp^Xi - -.byte 0x86,0x0e,0xa8,0xf2 @ pmull q0,q12,q3 @ H.lo·Xi.lo - veor q9,q9,q3 @ Karatsuba pre-processing -.byte 0x87,0x4e,0xa9,0xf2 @ pmull2 q2,q12,q3 @ H.hi·Xi.hi -.byte 0xa2,0x2e,0xaa,0xf2 @ pmull q1,q13,q9 @ (H.lo+H.hi)·(Xi.lo+Xi.hi) - - vext.8 q9,q0,q2,#8 @ Karatsuba post-processing - veor q10,q0,q2 - veor q1,q1,q9 - veor q1,q1,q10 -.byte 0x26,0x4e,0xe0,0xf2 @ pmull q10,q0,q11 @ 1st phase of reduction - - vmov d4,d3 @ Xh|Xm - 256-bit result - vmov d3,d0 @ Xm is rotated Xl - veor q0,q1,q10 - - vext.8 q10,q0,q0,#8 @ 2nd phase of reduction -.byte 0x26,0x0e,0xa0,0xf2 @ pmull q0,q0,q11 - veor q10,q10,q2 - veor q0,q0,q10 - -.Ldone_v8: -#ifndef __ARMEB__ - vrev64.8 q0,q0 -#endif - vext.8 q0,q0,q0,#8 - vst1.64 {q0},[r0] @ write out Xi - - vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ 32-bit ABI says so - bx lr -.size gcm_ghash_v8,.-gcm_ghash_v8 -.byte 
71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 -.align 2 -#endif -#endif -#endif // !OPENSSL_NO_ASM -.section .note.GNU-stack,"",%progbits diff --git a/third_party/boringssl/linux-arm/crypto/fipsmodule/sha1-armv4-large.S b/third_party/boringssl/linux-arm/crypto/fipsmodule/sha1-armv4-large.S deleted file mode 100644 index 61deddf8..00000000 --- a/third_party/boringssl/linux-arm/crypto/fipsmodule/sha1-armv4-large.S +++ /dev/null @@ -1,1511 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(__arm__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -#include - -.text -#if defined(__thumb2__) -.syntax unified -.thumb -#else -.code 32 -#endif - -.globl sha1_block_data_order -.hidden sha1_block_data_order -.type sha1_block_data_order,%function - -.align 5 -sha1_block_data_order: -#if __ARM_MAX_ARCH__>=7 -.Lsha1_block: - adr r3,.Lsha1_block - ldr r12,.LOPENSSL_armcap - ldr r12,[r3,r12] @ OPENSSL_armcap_P -#ifdef __APPLE__ - ldr r12,[r12] -#endif - tst r12,#ARMV8_SHA1 - bne .LARMv8 - tst r12,#ARMV7_NEON - bne .LNEON -#endif - stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} - add r2,r1,r2,lsl#6 @ r2 to point at the end of r1 - ldmia r0,{r3,r4,r5,r6,r7} -.Lloop: - ldr r8,.LK_00_19 - mov r14,sp - sub sp,sp,#15*4 - mov r5,r5,ror#30 - mov r6,r6,ror#30 - mov r7,r7,ror#30 @ [6] -.L_00_15: -#if __ARM_ARCH__<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r7,r8,r7,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r5,r6 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r7,r7,r3,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ 
handles unaligned - add r7,r8,r7,ror#2 @ E+=K_00_19 - eor r10,r5,r6 @ F_xx_xx - add r7,r7,r3,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r4,r10,ror#2 - add r7,r7,r9 @ E+=X[i] - eor r10,r10,r6,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! - add r7,r7,r10 @ E+=F_00_19(B,C,D) -#if __ARM_ARCH__<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r6,r8,r6,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r4,r5 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r6,r6,r7,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r6,r8,r6,ror#2 @ E+=K_00_19 - eor r10,r4,r5 @ F_xx_xx - add r6,r6,r7,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r3,r10,ror#2 - add r6,r6,r9 @ E+=X[i] - eor r10,r10,r5,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! - add r6,r6,r10 @ E+=F_00_19(B,C,D) -#if __ARM_ARCH__<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r5,r8,r5,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r3,r4 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r5,r5,r6,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r5,r8,r5,ror#2 @ E+=K_00_19 - eor r10,r3,r4 @ F_xx_xx - add r5,r5,r6,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r7,r10,ror#2 - add r5,r5,r9 @ E+=X[i] - eor r10,r10,r4,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! 
- add r5,r5,r10 @ E+=F_00_19(B,C,D) -#if __ARM_ARCH__<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r4,r8,r4,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r7,r3 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r4,r4,r5,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r4,r8,r4,ror#2 @ E+=K_00_19 - eor r10,r7,r3 @ F_xx_xx - add r4,r4,r5,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r6,r10,ror#2 - add r4,r4,r9 @ E+=X[i] - eor r10,r10,r3,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! - add r4,r4,r10 @ E+=F_00_19(B,C,D) -#if __ARM_ARCH__<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r3,r8,r3,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r6,r7 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r3,r3,r4,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r3,r8,r3,ror#2 @ E+=K_00_19 - eor r10,r6,r7 @ F_xx_xx - add r3,r3,r4,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r5,r10,ror#2 - add r3,r3,r9 @ E+=X[i] - eor r10,r10,r7,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! - add r3,r3,r10 @ E+=F_00_19(B,C,D) -#if defined(__thumb2__) - mov r12,sp - teq r14,r12 -#else - teq r14,sp -#endif - bne .L_00_15 @ [((11+4)*5+2)*3] - sub sp,sp,#25*4 -#if __ARM_ARCH__<7 - ldrb r10,[r1,#2] - ldrb r9,[r1,#3] - ldrb r11,[r1,#1] - add r7,r8,r7,ror#2 @ E+=K_00_19 - ldrb r12,[r1],#4 - orr r9,r9,r10,lsl#8 - eor r10,r5,r6 @ F_xx_xx - orr r9,r9,r11,lsl#16 - add r7,r7,r3,ror#27 @ E+=ROR(A,27) - orr r9,r9,r12,lsl#24 -#else - ldr r9,[r1],#4 @ handles unaligned - add r7,r8,r7,ror#2 @ E+=K_00_19 - eor r10,r5,r6 @ F_xx_xx - add r7,r7,r3,ror#27 @ E+=ROR(A,27) -#ifdef __ARMEL__ - rev r9,r9 @ byte swap -#endif -#endif - and r10,r4,r10,ror#2 - add r7,r7,r9 @ E+=X[i] - eor r10,r10,r6,ror#2 @ F_00_19(B,C,D) - str r9,[r14,#-4]! 
- add r7,r7,r10 @ E+=F_00_19(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r6,r8,r6,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r4,r5 @ F_xx_xx - mov r9,r9,ror#31 - add r6,r6,r7,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r3,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r6,r6,r9 @ E+=X[i] - eor r10,r10,r5,ror#2 @ F_00_19(B,C,D) - add r6,r6,r10 @ E+=F_00_19(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r5,r8,r5,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r3,r4 @ F_xx_xx - mov r9,r9,ror#31 - add r5,r5,r6,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r7,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r5,r5,r9 @ E+=X[i] - eor r10,r10,r4,ror#2 @ F_00_19(B,C,D) - add r5,r5,r10 @ E+=F_00_19(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r4,r8,r4,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r7,r3 @ F_xx_xx - mov r9,r9,ror#31 - add r4,r4,r5,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r6,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r4,r4,r9 @ E+=X[i] - eor r10,r10,r3,ror#2 @ F_00_19(B,C,D) - add r4,r4,r10 @ E+=F_00_19(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r3,r8,r3,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r6,r7 @ F_xx_xx - mov r9,r9,ror#31 - add r3,r3,r4,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! 
- and r10,r5,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r3,r3,r9 @ E+=X[i] - eor r10,r10,r7,ror#2 @ F_00_19(B,C,D) - add r3,r3,r10 @ E+=F_00_19(B,C,D) - - ldr r8,.LK_20_39 @ [+15+16*4] - cmn sp,#0 @ [+3], clear carry to denote 20_39 -.L_20_39_or_60_79: - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r7,r8,r7,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r5,r6 @ F_xx_xx - mov r9,r9,ror#31 - add r7,r7,r3,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - eor r10,r4,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r7,r7,r9 @ E+=X[i] - add r7,r7,r10 @ E+=F_20_39(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r6,r8,r6,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r4,r5 @ F_xx_xx - mov r9,r9,ror#31 - add r6,r6,r7,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - eor r10,r3,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r6,r6,r9 @ E+=X[i] - add r6,r6,r10 @ E+=F_20_39(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r5,r8,r5,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r3,r4 @ F_xx_xx - mov r9,r9,ror#31 - add r5,r5,r6,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - eor r10,r7,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r5,r5,r9 @ E+=X[i] - add r5,r5,r10 @ E+=F_20_39(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r4,r8,r4,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r7,r3 @ F_xx_xx - mov r9,r9,ror#31 - add r4,r4,r5,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! 
- eor r10,r6,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r4,r4,r9 @ E+=X[i] - add r4,r4,r10 @ E+=F_20_39(B,C,D) - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r3,r8,r3,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r6,r7 @ F_xx_xx - mov r9,r9,ror#31 - add r3,r3,r4,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - eor r10,r5,r10,ror#2 @ F_xx_xx - @ F_xx_xx - add r3,r3,r9 @ E+=X[i] - add r3,r3,r10 @ E+=F_20_39(B,C,D) -#if defined(__thumb2__) - mov r12,sp - teq r14,r12 -#else - teq r14,sp @ preserve carry -#endif - bne .L_20_39_or_60_79 @ [+((12+3)*5+2)*4] - bcs .L_done @ [+((12+3)*5+2)*4], spare 300 bytes - - ldr r8,.LK_40_59 - sub sp,sp,#20*4 @ [+2] -.L_40_59: - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r7,r8,r7,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r5,r6 @ F_xx_xx - mov r9,r9,ror#31 - add r7,r7,r3,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r4,r10,ror#2 @ F_xx_xx - and r11,r5,r6 @ F_xx_xx - add r7,r7,r9 @ E+=X[i] - add r7,r7,r10 @ E+=F_40_59(B,C,D) - add r7,r7,r11,ror#2 - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r6,r8,r6,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r4,r5 @ F_xx_xx - mov r9,r9,ror#31 - add r6,r6,r7,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r3,r10,ror#2 @ F_xx_xx - and r11,r4,r5 @ F_xx_xx - add r6,r6,r9 @ E+=X[i] - add r6,r6,r10 @ E+=F_40_59(B,C,D) - add r6,r6,r11,ror#2 - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r5,r8,r5,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r3,r4 @ F_xx_xx - mov r9,r9,ror#31 - add r5,r5,r6,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! 
- and r10,r7,r10,ror#2 @ F_xx_xx - and r11,r3,r4 @ F_xx_xx - add r5,r5,r9 @ E+=X[i] - add r5,r5,r10 @ E+=F_40_59(B,C,D) - add r5,r5,r11,ror#2 - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r4,r8,r4,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r7,r3 @ F_xx_xx - mov r9,r9,ror#31 - add r4,r4,r5,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r6,r10,ror#2 @ F_xx_xx - and r11,r7,r3 @ F_xx_xx - add r4,r4,r9 @ E+=X[i] - add r4,r4,r10 @ E+=F_40_59(B,C,D) - add r4,r4,r11,ror#2 - ldr r9,[r14,#15*4] - ldr r10,[r14,#13*4] - ldr r11,[r14,#7*4] - add r3,r8,r3,ror#2 @ E+=K_xx_xx - ldr r12,[r14,#2*4] - eor r9,r9,r10 - eor r11,r11,r12 @ 1 cycle stall - eor r10,r6,r7 @ F_xx_xx - mov r9,r9,ror#31 - add r3,r3,r4,ror#27 @ E+=ROR(A,27) - eor r9,r9,r11,ror#31 - str r9,[r14,#-4]! - and r10,r5,r10,ror#2 @ F_xx_xx - and r11,r6,r7 @ F_xx_xx - add r3,r3,r9 @ E+=X[i] - add r3,r3,r10 @ E+=F_40_59(B,C,D) - add r3,r3,r11,ror#2 -#if defined(__thumb2__) - mov r12,sp - teq r14,r12 -#else - teq r14,sp -#endif - bne .L_40_59 @ [+((12+5)*5+2)*4] - - ldr r8,.LK_60_79 - sub sp,sp,#20*4 - cmp sp,#0 @ set carry to denote 60_79 - b .L_20_39_or_60_79 @ [+4], spare 300 bytes -.L_done: - add sp,sp,#80*4 @ "deallocate" stack frame - ldmia r0,{r8,r9,r10,r11,r12} - add r3,r8,r3 - add r4,r9,r4 - add r5,r10,r5,ror#2 - add r6,r11,r6,ror#2 - add r7,r12,r7,ror#2 - stmia r0,{r3,r4,r5,r6,r7} - teq r1,r2 - bne .Lloop @ [+18], total 1307 - -#if __ARM_ARCH__>=5 - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} -#else - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet -.word 0xe12fff1e @ interoperable with Thumb ISA:-) -#endif -.size sha1_block_data_order,.-sha1_block_data_order - -.align 5 -.LK_00_19:.word 0x5a827999 -.LK_20_39:.word 0x6ed9eba1 -.LK_40_59:.word 0x8f1bbcdc -.LK_60_79:.word 0xca62c1d6 -#if __ARM_MAX_ARCH__>=7 -.LOPENSSL_armcap: -.word 
OPENSSL_armcap_P-.Lsha1_block -#endif -.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,47,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 -.align 5 -#if __ARM_MAX_ARCH__>=7 -.arch armv7-a -.fpu neon - -.type sha1_block_data_order_neon,%function -.align 4 -sha1_block_data_order_neon: -.LNEON: - stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} - add r2,r1,r2,lsl#6 @ r2 to point at the end of r1 - @ dmb @ errata #451034 on early Cortex A8 - @ vstmdb sp!,{d8-d15} @ ABI specification says so - mov r14,sp - sub r12,sp,#64 - adr r8,.LK_00_19 - bic r12,r12,#15 @ align for 128-bit stores - - ldmia r0,{r3,r4,r5,r6,r7} @ load context - mov sp,r12 @ alloca - - vld1.8 {q0,q1},[r1]! @ handles unaligned - veor q15,q15,q15 - vld1.8 {q2,q3},[r1]! - vld1.32 {d28[],d29[]},[r8,:32]! @ load K_00_19 - vrev32.8 q0,q0 @ yes, even on - vrev32.8 q1,q1 @ big-endian... - vrev32.8 q2,q2 - vadd.i32 q8,q0,q14 - vrev32.8 q3,q3 - vadd.i32 q9,q1,q14 - vst1.32 {q8},[r12,:128]! - vadd.i32 q10,q2,q14 - vst1.32 {q9},[r12,:128]! - vst1.32 {q10},[r12,:128]! - ldr r9,[sp] @ big RAW stall - -.Loop_neon: - vext.8 q8,q0,q1,#8 - bic r10,r6,r4 - add r7,r7,r9 - and r11,r5,r4 - vadd.i32 q13,q3,q14 - ldr r9,[sp,#4] - add r7,r7,r3,ror#27 - vext.8 q12,q3,q15,#4 - eor r11,r11,r10 - mov r4,r4,ror#2 - add r7,r7,r11 - veor q8,q8,q0 - bic r10,r5,r3 - add r6,r6,r9 - veor q12,q12,q2 - and r11,r4,r3 - ldr r9,[sp,#8] - veor q12,q12,q8 - add r6,r6,r7,ror#27 - eor r11,r11,r10 - vst1.32 {q13},[r12,:128]! 
- sub r12,r12,#64 - mov r3,r3,ror#2 - add r6,r6,r11 - vext.8 q13,q15,q12,#4 - bic r10,r4,r7 - add r5,r5,r9 - vadd.i32 q8,q12,q12 - and r11,r3,r7 - ldr r9,[sp,#12] - vsri.32 q8,q12,#31 - add r5,r5,r6,ror#27 - eor r11,r11,r10 - mov r7,r7,ror#2 - vshr.u32 q12,q13,#30 - add r5,r5,r11 - bic r10,r3,r6 - vshl.u32 q13,q13,#2 - add r4,r4,r9 - and r11,r7,r6 - veor q8,q8,q12 - ldr r9,[sp,#16] - add r4,r4,r5,ror#27 - veor q8,q8,q13 - eor r11,r11,r10 - mov r6,r6,ror#2 - add r4,r4,r11 - vext.8 q9,q1,q2,#8 - bic r10,r7,r5 - add r3,r3,r9 - and r11,r6,r5 - vadd.i32 q13,q8,q14 - ldr r9,[sp,#20] - vld1.32 {d28[],d29[]},[r8,:32]! - add r3,r3,r4,ror#27 - vext.8 q12,q8,q15,#4 - eor r11,r11,r10 - mov r5,r5,ror#2 - add r3,r3,r11 - veor q9,q9,q1 - bic r10,r6,r4 - add r7,r7,r9 - veor q12,q12,q3 - and r11,r5,r4 - ldr r9,[sp,#24] - veor q12,q12,q9 - add r7,r7,r3,ror#27 - eor r11,r11,r10 - vst1.32 {q13},[r12,:128]! - mov r4,r4,ror#2 - add r7,r7,r11 - vext.8 q13,q15,q12,#4 - bic r10,r5,r3 - add r6,r6,r9 - vadd.i32 q9,q12,q12 - and r11,r4,r3 - ldr r9,[sp,#28] - vsri.32 q9,q12,#31 - add r6,r6,r7,ror#27 - eor r11,r11,r10 - mov r3,r3,ror#2 - vshr.u32 q12,q13,#30 - add r6,r6,r11 - bic r10,r4,r7 - vshl.u32 q13,q13,#2 - add r5,r5,r9 - and r11,r3,r7 - veor q9,q9,q12 - ldr r9,[sp,#32] - add r5,r5,r6,ror#27 - veor q9,q9,q13 - eor r11,r11,r10 - mov r7,r7,ror#2 - add r5,r5,r11 - vext.8 q10,q2,q3,#8 - bic r10,r3,r6 - add r4,r4,r9 - and r11,r7,r6 - vadd.i32 q13,q9,q14 - ldr r9,[sp,#36] - add r4,r4,r5,ror#27 - vext.8 q12,q9,q15,#4 - eor r11,r11,r10 - mov r6,r6,ror#2 - add r4,r4,r11 - veor q10,q10,q2 - bic r10,r7,r5 - add r3,r3,r9 - veor q12,q12,q8 - and r11,r6,r5 - ldr r9,[sp,#40] - veor q12,q12,q10 - add r3,r3,r4,ror#27 - eor r11,r11,r10 - vst1.32 {q13},[r12,:128]! 
- mov r5,r5,ror#2 - add r3,r3,r11 - vext.8 q13,q15,q12,#4 - bic r10,r6,r4 - add r7,r7,r9 - vadd.i32 q10,q12,q12 - and r11,r5,r4 - ldr r9,[sp,#44] - vsri.32 q10,q12,#31 - add r7,r7,r3,ror#27 - eor r11,r11,r10 - mov r4,r4,ror#2 - vshr.u32 q12,q13,#30 - add r7,r7,r11 - bic r10,r5,r3 - vshl.u32 q13,q13,#2 - add r6,r6,r9 - and r11,r4,r3 - veor q10,q10,q12 - ldr r9,[sp,#48] - add r6,r6,r7,ror#27 - veor q10,q10,q13 - eor r11,r11,r10 - mov r3,r3,ror#2 - add r6,r6,r11 - vext.8 q11,q3,q8,#8 - bic r10,r4,r7 - add r5,r5,r9 - and r11,r3,r7 - vadd.i32 q13,q10,q14 - ldr r9,[sp,#52] - add r5,r5,r6,ror#27 - vext.8 q12,q10,q15,#4 - eor r11,r11,r10 - mov r7,r7,ror#2 - add r5,r5,r11 - veor q11,q11,q3 - bic r10,r3,r6 - add r4,r4,r9 - veor q12,q12,q9 - and r11,r7,r6 - ldr r9,[sp,#56] - veor q12,q12,q11 - add r4,r4,r5,ror#27 - eor r11,r11,r10 - vst1.32 {q13},[r12,:128]! - mov r6,r6,ror#2 - add r4,r4,r11 - vext.8 q13,q15,q12,#4 - bic r10,r7,r5 - add r3,r3,r9 - vadd.i32 q11,q12,q12 - and r11,r6,r5 - ldr r9,[sp,#60] - vsri.32 q11,q12,#31 - add r3,r3,r4,ror#27 - eor r11,r11,r10 - mov r5,r5,ror#2 - vshr.u32 q12,q13,#30 - add r3,r3,r11 - bic r10,r6,r4 - vshl.u32 q13,q13,#2 - add r7,r7,r9 - and r11,r5,r4 - veor q11,q11,q12 - ldr r9,[sp,#0] - add r7,r7,r3,ror#27 - veor q11,q11,q13 - eor r11,r11,r10 - mov r4,r4,ror#2 - add r7,r7,r11 - vext.8 q12,q10,q11,#8 - bic r10,r5,r3 - add r6,r6,r9 - and r11,r4,r3 - veor q0,q0,q8 - ldr r9,[sp,#4] - add r6,r6,r7,ror#27 - veor q0,q0,q1 - eor r11,r11,r10 - mov r3,r3,ror#2 - vadd.i32 q13,q11,q14 - add r6,r6,r11 - bic r10,r4,r7 - veor q12,q12,q0 - add r5,r5,r9 - and r11,r3,r7 - vshr.u32 q0,q12,#30 - ldr r9,[sp,#8] - add r5,r5,r6,ror#27 - vst1.32 {q13},[r12,:128]! 
- sub r12,r12,#64 - eor r11,r11,r10 - mov r7,r7,ror#2 - vsli.32 q0,q12,#2 - add r5,r5,r11 - bic r10,r3,r6 - add r4,r4,r9 - and r11,r7,r6 - ldr r9,[sp,#12] - add r4,r4,r5,ror#27 - eor r11,r11,r10 - mov r6,r6,ror#2 - add r4,r4,r11 - bic r10,r7,r5 - add r3,r3,r9 - and r11,r6,r5 - ldr r9,[sp,#16] - add r3,r3,r4,ror#27 - eor r11,r11,r10 - mov r5,r5,ror#2 - add r3,r3,r11 - vext.8 q12,q11,q0,#8 - eor r10,r4,r6 - add r7,r7,r9 - ldr r9,[sp,#20] - veor q1,q1,q9 - eor r11,r10,r5 - add r7,r7,r3,ror#27 - veor q1,q1,q2 - mov r4,r4,ror#2 - add r7,r7,r11 - vadd.i32 q13,q0,q14 - eor r10,r3,r5 - add r6,r6,r9 - veor q12,q12,q1 - ldr r9,[sp,#24] - eor r11,r10,r4 - vshr.u32 q1,q12,#30 - add r6,r6,r7,ror#27 - mov r3,r3,ror#2 - vst1.32 {q13},[r12,:128]! - add r6,r6,r11 - eor r10,r7,r4 - vsli.32 q1,q12,#2 - add r5,r5,r9 - ldr r9,[sp,#28] - eor r11,r10,r3 - add r5,r5,r6,ror#27 - mov r7,r7,ror#2 - add r5,r5,r11 - eor r10,r6,r3 - add r4,r4,r9 - ldr r9,[sp,#32] - eor r11,r10,r7 - add r4,r4,r5,ror#27 - mov r6,r6,ror#2 - add r4,r4,r11 - vext.8 q12,q0,q1,#8 - eor r10,r5,r7 - add r3,r3,r9 - ldr r9,[sp,#36] - veor q2,q2,q10 - eor r11,r10,r6 - add r3,r3,r4,ror#27 - veor q2,q2,q3 - mov r5,r5,ror#2 - add r3,r3,r11 - vadd.i32 q13,q1,q14 - eor r10,r4,r6 - vld1.32 {d28[],d29[]},[r8,:32]! - add r7,r7,r9 - veor q12,q12,q2 - ldr r9,[sp,#40] - eor r11,r10,r5 - vshr.u32 q2,q12,#30 - add r7,r7,r3,ror#27 - mov r4,r4,ror#2 - vst1.32 {q13},[r12,:128]! 
- add r7,r7,r11 - eor r10,r3,r5 - vsli.32 q2,q12,#2 - add r6,r6,r9 - ldr r9,[sp,#44] - eor r11,r10,r4 - add r6,r6,r7,ror#27 - mov r3,r3,ror#2 - add r6,r6,r11 - eor r10,r7,r4 - add r5,r5,r9 - ldr r9,[sp,#48] - eor r11,r10,r3 - add r5,r5,r6,ror#27 - mov r7,r7,ror#2 - add r5,r5,r11 - vext.8 q12,q1,q2,#8 - eor r10,r6,r3 - add r4,r4,r9 - ldr r9,[sp,#52] - veor q3,q3,q11 - eor r11,r10,r7 - add r4,r4,r5,ror#27 - veor q3,q3,q8 - mov r6,r6,ror#2 - add r4,r4,r11 - vadd.i32 q13,q2,q14 - eor r10,r5,r7 - add r3,r3,r9 - veor q12,q12,q3 - ldr r9,[sp,#56] - eor r11,r10,r6 - vshr.u32 q3,q12,#30 - add r3,r3,r4,ror#27 - mov r5,r5,ror#2 - vst1.32 {q13},[r12,:128]! - add r3,r3,r11 - eor r10,r4,r6 - vsli.32 q3,q12,#2 - add r7,r7,r9 - ldr r9,[sp,#60] - eor r11,r10,r5 - add r7,r7,r3,ror#27 - mov r4,r4,ror#2 - add r7,r7,r11 - eor r10,r3,r5 - add r6,r6,r9 - ldr r9,[sp,#0] - eor r11,r10,r4 - add r6,r6,r7,ror#27 - mov r3,r3,ror#2 - add r6,r6,r11 - vext.8 q12,q2,q3,#8 - eor r10,r7,r4 - add r5,r5,r9 - ldr r9,[sp,#4] - veor q8,q8,q0 - eor r11,r10,r3 - add r5,r5,r6,ror#27 - veor q8,q8,q9 - mov r7,r7,ror#2 - add r5,r5,r11 - vadd.i32 q13,q3,q14 - eor r10,r6,r3 - add r4,r4,r9 - veor q12,q12,q8 - ldr r9,[sp,#8] - eor r11,r10,r7 - vshr.u32 q8,q12,#30 - add r4,r4,r5,ror#27 - mov r6,r6,ror#2 - vst1.32 {q13},[r12,:128]! - sub r12,r12,#64 - add r4,r4,r11 - eor r10,r5,r7 - vsli.32 q8,q12,#2 - add r3,r3,r9 - ldr r9,[sp,#12] - eor r11,r10,r6 - add r3,r3,r4,ror#27 - mov r5,r5,ror#2 - add r3,r3,r11 - eor r10,r4,r6 - add r7,r7,r9 - ldr r9,[sp,#16] - eor r11,r10,r5 - add r7,r7,r3,ror#27 - mov r4,r4,ror#2 - add r7,r7,r11 - vext.8 q12,q3,q8,#8 - eor r10,r3,r5 - add r6,r6,r9 - ldr r9,[sp,#20] - veor q9,q9,q1 - eor r11,r10,r4 - add r6,r6,r7,ror#27 - veor q9,q9,q10 - mov r3,r3,ror#2 - add r6,r6,r11 - vadd.i32 q13,q8,q14 - eor r10,r7,r4 - add r5,r5,r9 - veor q12,q12,q9 - ldr r9,[sp,#24] - eor r11,r10,r3 - vshr.u32 q9,q12,#30 - add r5,r5,r6,ror#27 - mov r7,r7,ror#2 - vst1.32 {q13},[r12,:128]! 
- add r5,r5,r11 - eor r10,r6,r3 - vsli.32 q9,q12,#2 - add r4,r4,r9 - ldr r9,[sp,#28] - eor r11,r10,r7 - add r4,r4,r5,ror#27 - mov r6,r6,ror#2 - add r4,r4,r11 - eor r10,r5,r7 - add r3,r3,r9 - ldr r9,[sp,#32] - eor r11,r10,r6 - add r3,r3,r4,ror#27 - mov r5,r5,ror#2 - add r3,r3,r11 - vext.8 q12,q8,q9,#8 - add r7,r7,r9 - and r10,r5,r6 - ldr r9,[sp,#36] - veor q10,q10,q2 - add r7,r7,r3,ror#27 - eor r11,r5,r6 - veor q10,q10,q11 - add r7,r7,r10 - and r11,r11,r4 - vadd.i32 q13,q9,q14 - mov r4,r4,ror#2 - add r7,r7,r11 - veor q12,q12,q10 - add r6,r6,r9 - and r10,r4,r5 - vshr.u32 q10,q12,#30 - ldr r9,[sp,#40] - add r6,r6,r7,ror#27 - vst1.32 {q13},[r12,:128]! - eor r11,r4,r5 - add r6,r6,r10 - vsli.32 q10,q12,#2 - and r11,r11,r3 - mov r3,r3,ror#2 - add r6,r6,r11 - add r5,r5,r9 - and r10,r3,r4 - ldr r9,[sp,#44] - add r5,r5,r6,ror#27 - eor r11,r3,r4 - add r5,r5,r10 - and r11,r11,r7 - mov r7,r7,ror#2 - add r5,r5,r11 - add r4,r4,r9 - and r10,r7,r3 - ldr r9,[sp,#48] - add r4,r4,r5,ror#27 - eor r11,r7,r3 - add r4,r4,r10 - and r11,r11,r6 - mov r6,r6,ror#2 - add r4,r4,r11 - vext.8 q12,q9,q10,#8 - add r3,r3,r9 - and r10,r6,r7 - ldr r9,[sp,#52] - veor q11,q11,q3 - add r3,r3,r4,ror#27 - eor r11,r6,r7 - veor q11,q11,q0 - add r3,r3,r10 - and r11,r11,r5 - vadd.i32 q13,q10,q14 - mov r5,r5,ror#2 - vld1.32 {d28[],d29[]},[r8,:32]! - add r3,r3,r11 - veor q12,q12,q11 - add r7,r7,r9 - and r10,r5,r6 - vshr.u32 q11,q12,#30 - ldr r9,[sp,#56] - add r7,r7,r3,ror#27 - vst1.32 {q13},[r12,:128]! 
- eor r11,r5,r6 - add r7,r7,r10 - vsli.32 q11,q12,#2 - and r11,r11,r4 - mov r4,r4,ror#2 - add r7,r7,r11 - add r6,r6,r9 - and r10,r4,r5 - ldr r9,[sp,#60] - add r6,r6,r7,ror#27 - eor r11,r4,r5 - add r6,r6,r10 - and r11,r11,r3 - mov r3,r3,ror#2 - add r6,r6,r11 - add r5,r5,r9 - and r10,r3,r4 - ldr r9,[sp,#0] - add r5,r5,r6,ror#27 - eor r11,r3,r4 - add r5,r5,r10 - and r11,r11,r7 - mov r7,r7,ror#2 - add r5,r5,r11 - vext.8 q12,q10,q11,#8 - add r4,r4,r9 - and r10,r7,r3 - ldr r9,[sp,#4] - veor q0,q0,q8 - add r4,r4,r5,ror#27 - eor r11,r7,r3 - veor q0,q0,q1 - add r4,r4,r10 - and r11,r11,r6 - vadd.i32 q13,q11,q14 - mov r6,r6,ror#2 - add r4,r4,r11 - veor q12,q12,q0 - add r3,r3,r9 - and r10,r6,r7 - vshr.u32 q0,q12,#30 - ldr r9,[sp,#8] - add r3,r3,r4,ror#27 - vst1.32 {q13},[r12,:128]! - sub r12,r12,#64 - eor r11,r6,r7 - add r3,r3,r10 - vsli.32 q0,q12,#2 - and r11,r11,r5 - mov r5,r5,ror#2 - add r3,r3,r11 - add r7,r7,r9 - and r10,r5,r6 - ldr r9,[sp,#12] - add r7,r7,r3,ror#27 - eor r11,r5,r6 - add r7,r7,r10 - and r11,r11,r4 - mov r4,r4,ror#2 - add r7,r7,r11 - add r6,r6,r9 - and r10,r4,r5 - ldr r9,[sp,#16] - add r6,r6,r7,ror#27 - eor r11,r4,r5 - add r6,r6,r10 - and r11,r11,r3 - mov r3,r3,ror#2 - add r6,r6,r11 - vext.8 q12,q11,q0,#8 - add r5,r5,r9 - and r10,r3,r4 - ldr r9,[sp,#20] - veor q1,q1,q9 - add r5,r5,r6,ror#27 - eor r11,r3,r4 - veor q1,q1,q2 - add r5,r5,r10 - and r11,r11,r7 - vadd.i32 q13,q0,q14 - mov r7,r7,ror#2 - add r5,r5,r11 - veor q12,q12,q1 - add r4,r4,r9 - and r10,r7,r3 - vshr.u32 q1,q12,#30 - ldr r9,[sp,#24] - add r4,r4,r5,ror#27 - vst1.32 {q13},[r12,:128]! 
- eor r11,r7,r3 - add r4,r4,r10 - vsli.32 q1,q12,#2 - and r11,r11,r6 - mov r6,r6,ror#2 - add r4,r4,r11 - add r3,r3,r9 - and r10,r6,r7 - ldr r9,[sp,#28] - add r3,r3,r4,ror#27 - eor r11,r6,r7 - add r3,r3,r10 - and r11,r11,r5 - mov r5,r5,ror#2 - add r3,r3,r11 - add r7,r7,r9 - and r10,r5,r6 - ldr r9,[sp,#32] - add r7,r7,r3,ror#27 - eor r11,r5,r6 - add r7,r7,r10 - and r11,r11,r4 - mov r4,r4,ror#2 - add r7,r7,r11 - vext.8 q12,q0,q1,#8 - add r6,r6,r9 - and r10,r4,r5 - ldr r9,[sp,#36] - veor q2,q2,q10 - add r6,r6,r7,ror#27 - eor r11,r4,r5 - veor q2,q2,q3 - add r6,r6,r10 - and r11,r11,r3 - vadd.i32 q13,q1,q14 - mov r3,r3,ror#2 - add r6,r6,r11 - veor q12,q12,q2 - add r5,r5,r9 - and r10,r3,r4 - vshr.u32 q2,q12,#30 - ldr r9,[sp,#40] - add r5,r5,r6,ror#27 - vst1.32 {q13},[r12,:128]! - eor r11,r3,r4 - add r5,r5,r10 - vsli.32 q2,q12,#2 - and r11,r11,r7 - mov r7,r7,ror#2 - add r5,r5,r11 - add r4,r4,r9 - and r10,r7,r3 - ldr r9,[sp,#44] - add r4,r4,r5,ror#27 - eor r11,r7,r3 - add r4,r4,r10 - and r11,r11,r6 - mov r6,r6,ror#2 - add r4,r4,r11 - add r3,r3,r9 - and r10,r6,r7 - ldr r9,[sp,#48] - add r3,r3,r4,ror#27 - eor r11,r6,r7 - add r3,r3,r10 - and r11,r11,r5 - mov r5,r5,ror#2 - add r3,r3,r11 - vext.8 q12,q1,q2,#8 - eor r10,r4,r6 - add r7,r7,r9 - ldr r9,[sp,#52] - veor q3,q3,q11 - eor r11,r10,r5 - add r7,r7,r3,ror#27 - veor q3,q3,q8 - mov r4,r4,ror#2 - add r7,r7,r11 - vadd.i32 q13,q2,q14 - eor r10,r3,r5 - add r6,r6,r9 - veor q12,q12,q3 - ldr r9,[sp,#56] - eor r11,r10,r4 - vshr.u32 q3,q12,#30 - add r6,r6,r7,ror#27 - mov r3,r3,ror#2 - vst1.32 {q13},[r12,:128]! - add r6,r6,r11 - eor r10,r7,r4 - vsli.32 q3,q12,#2 - add r5,r5,r9 - ldr r9,[sp,#60] - eor r11,r10,r3 - add r5,r5,r6,ror#27 - mov r7,r7,ror#2 - add r5,r5,r11 - eor r10,r6,r3 - add r4,r4,r9 - ldr r9,[sp,#0] - eor r11,r10,r7 - add r4,r4,r5,ror#27 - mov r6,r6,ror#2 - add r4,r4,r11 - vadd.i32 q13,q3,q14 - eor r10,r5,r7 - add r3,r3,r9 - vst1.32 {q13},[r12,:128]! 
- sub r12,r12,#64 - teq r1,r2 - sub r8,r8,#16 - it eq - subeq r1,r1,#64 - vld1.8 {q0,q1},[r1]! - ldr r9,[sp,#4] - eor r11,r10,r6 - vld1.8 {q2,q3},[r1]! - add r3,r3,r4,ror#27 - mov r5,r5,ror#2 - vld1.32 {d28[],d29[]},[r8,:32]! - add r3,r3,r11 - eor r10,r4,r6 - vrev32.8 q0,q0 - add r7,r7,r9 - ldr r9,[sp,#8] - eor r11,r10,r5 - add r7,r7,r3,ror#27 - mov r4,r4,ror#2 - add r7,r7,r11 - eor r10,r3,r5 - add r6,r6,r9 - ldr r9,[sp,#12] - eor r11,r10,r4 - add r6,r6,r7,ror#27 - mov r3,r3,ror#2 - add r6,r6,r11 - eor r10,r7,r4 - add r5,r5,r9 - ldr r9,[sp,#16] - eor r11,r10,r3 - add r5,r5,r6,ror#27 - mov r7,r7,ror#2 - add r5,r5,r11 - vrev32.8 q1,q1 - eor r10,r6,r3 - add r4,r4,r9 - vadd.i32 q8,q0,q14 - ldr r9,[sp,#20] - eor r11,r10,r7 - vst1.32 {q8},[r12,:128]! - add r4,r4,r5,ror#27 - mov r6,r6,ror#2 - add r4,r4,r11 - eor r10,r5,r7 - add r3,r3,r9 - ldr r9,[sp,#24] - eor r11,r10,r6 - add r3,r3,r4,ror#27 - mov r5,r5,ror#2 - add r3,r3,r11 - eor r10,r4,r6 - add r7,r7,r9 - ldr r9,[sp,#28] - eor r11,r10,r5 - add r7,r7,r3,ror#27 - mov r4,r4,ror#2 - add r7,r7,r11 - eor r10,r3,r5 - add r6,r6,r9 - ldr r9,[sp,#32] - eor r11,r10,r4 - add r6,r6,r7,ror#27 - mov r3,r3,ror#2 - add r6,r6,r11 - vrev32.8 q2,q2 - eor r10,r7,r4 - add r5,r5,r9 - vadd.i32 q9,q1,q14 - ldr r9,[sp,#36] - eor r11,r10,r3 - vst1.32 {q9},[r12,:128]! - add r5,r5,r6,ror#27 - mov r7,r7,ror#2 - add r5,r5,r11 - eor r10,r6,r3 - add r4,r4,r9 - ldr r9,[sp,#40] - eor r11,r10,r7 - add r4,r4,r5,ror#27 - mov r6,r6,ror#2 - add r4,r4,r11 - eor r10,r5,r7 - add r3,r3,r9 - ldr r9,[sp,#44] - eor r11,r10,r6 - add r3,r3,r4,ror#27 - mov r5,r5,ror#2 - add r3,r3,r11 - eor r10,r4,r6 - add r7,r7,r9 - ldr r9,[sp,#48] - eor r11,r10,r5 - add r7,r7,r3,ror#27 - mov r4,r4,ror#2 - add r7,r7,r11 - vrev32.8 q3,q3 - eor r10,r3,r5 - add r6,r6,r9 - vadd.i32 q10,q2,q14 - ldr r9,[sp,#52] - eor r11,r10,r4 - vst1.32 {q10},[r12,:128]! 
- add r6,r6,r7,ror#27 - mov r3,r3,ror#2 - add r6,r6,r11 - eor r10,r7,r4 - add r5,r5,r9 - ldr r9,[sp,#56] - eor r11,r10,r3 - add r5,r5,r6,ror#27 - mov r7,r7,ror#2 - add r5,r5,r11 - eor r10,r6,r3 - add r4,r4,r9 - ldr r9,[sp,#60] - eor r11,r10,r7 - add r4,r4,r5,ror#27 - mov r6,r6,ror#2 - add r4,r4,r11 - eor r10,r5,r7 - add r3,r3,r9 - eor r11,r10,r6 - add r3,r3,r4,ror#27 - mov r5,r5,ror#2 - add r3,r3,r11 - ldmia r0,{r9,r10,r11,r12} @ accumulate context - add r3,r3,r9 - ldr r9,[r0,#16] - add r4,r4,r10 - add r5,r5,r11 - add r6,r6,r12 - it eq - moveq sp,r14 - add r7,r7,r9 - it ne - ldrne r9,[sp] - stmia r0,{r3,r4,r5,r6,r7} - itt ne - addne r12,sp,#3*16 - bne .Loop_neon - - @ vldmia sp!,{d8-d15} - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} -.size sha1_block_data_order_neon,.-sha1_block_data_order_neon -#endif -#if __ARM_MAX_ARCH__>=7 - -# if defined(__thumb2__) -# define INST(a,b,c,d) .byte c,d|0xf,a,b -# else -# define INST(a,b,c,d) .byte a,b,c,d|0x10 -# endif - -.type sha1_block_data_order_armv8,%function -.align 5 -sha1_block_data_order_armv8: -.LARMv8: - vstmdb sp!,{d8,d9,d10,d11,d12,d13,d14,d15} @ ABI specification says so - - veor q1,q1,q1 - adr r3,.LK_00_19 - vld1.32 {q0},[r0]! - vld1.32 {d2[0]},[r0] - sub r0,r0,#16 - vld1.32 {d16[],d17[]},[r3,:32]! - vld1.32 {d18[],d19[]},[r3,:32]! - vld1.32 {d20[],d21[]},[r3,:32]! - vld1.32 {d22[],d23[]},[r3,:32] - -.Loop_v8: - vld1.8 {q4,q5},[r1]! - vld1.8 {q6,q7},[r1]! 
- vrev32.8 q4,q4 - vrev32.8 q5,q5 - - vadd.i32 q12,q8,q4 - vrev32.8 q6,q6 - vmov q14,q0 @ offload - subs r2,r2,#1 - - vadd.i32 q13,q8,q5 - vrev32.8 q7,q7 - INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 0 - INST(0x68,0x0c,0x02,0xe2) @ sha1c q0,q1,q12 - vadd.i32 q12,q8,q6 - INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6 - INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 1 - INST(0x6a,0x0c,0x06,0xe2) @ sha1c q0,q3,q13 - vadd.i32 q13,q8,q7 - INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7 - INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7 - INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 2 - INST(0x68,0x0c,0x04,0xe2) @ sha1c q0,q2,q12 - vadd.i32 q12,q8,q4 - INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4 - INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4 - INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 3 - INST(0x6a,0x0c,0x06,0xe2) @ sha1c q0,q3,q13 - vadd.i32 q13,q9,q5 - INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5 - INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5 - INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 4 - INST(0x68,0x0c,0x04,0xe2) @ sha1c q0,q2,q12 - vadd.i32 q12,q9,q6 - INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6 - INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6 - INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 5 - INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 - vadd.i32 q13,q9,q7 - INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7 - INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7 - INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 6 - INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12 - vadd.i32 q12,q9,q4 - INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4 - INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4 - INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 7 - INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 - vadd.i32 q13,q9,q5 - INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5 - INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5 - INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 8 - INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12 - vadd.i32 q12,q10,q6 - INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6 - INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6 - INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 9 - 
INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 - vadd.i32 q13,q10,q7 - INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7 - INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7 - INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 10 - INST(0x68,0x0c,0x24,0xe2) @ sha1m q0,q2,q12 - vadd.i32 q12,q10,q4 - INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4 - INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4 - INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 11 - INST(0x6a,0x0c,0x26,0xe2) @ sha1m q0,q3,q13 - vadd.i32 q13,q10,q5 - INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5 - INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5 - INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 12 - INST(0x68,0x0c,0x24,0xe2) @ sha1m q0,q2,q12 - vadd.i32 q12,q10,q6 - INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6 - INST(0x4c,0x8c,0x3a,0xe2) @ sha1su0 q4,q5,q6 - INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 13 - INST(0x6a,0x0c,0x26,0xe2) @ sha1m q0,q3,q13 - vadd.i32 q13,q11,q7 - INST(0x8e,0x83,0xba,0xf3) @ sha1su1 q4,q7 - INST(0x4e,0xac,0x3c,0xe2) @ sha1su0 q5,q6,q7 - INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 14 - INST(0x68,0x0c,0x24,0xe2) @ sha1m q0,q2,q12 - vadd.i32 q12,q11,q4 - INST(0x88,0xa3,0xba,0xf3) @ sha1su1 q5,q4 - INST(0x48,0xcc,0x3e,0xe2) @ sha1su0 q6,q7,q4 - INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 15 - INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 - vadd.i32 q13,q11,q5 - INST(0x8a,0xc3,0xba,0xf3) @ sha1su1 q6,q5 - INST(0x4a,0xec,0x38,0xe2) @ sha1su0 q7,q4,q5 - INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 16 - INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12 - vadd.i32 q12,q11,q6 - INST(0x8c,0xe3,0xba,0xf3) @ sha1su1 q7,q6 - INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 17 - INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 - vadd.i32 q13,q11,q7 - - INST(0xc0,0x62,0xb9,0xf3) @ sha1h q3,q0 @ 18 - INST(0x68,0x0c,0x14,0xe2) @ sha1p q0,q2,q12 - - INST(0xc0,0x42,0xb9,0xf3) @ sha1h q2,q0 @ 19 - INST(0x6a,0x0c,0x16,0xe2) @ sha1p q0,q3,q13 - - vadd.i32 q1,q1,q2 - vadd.i32 q0,q0,q14 - bne .Loop_v8 - - vst1.32 {q0},[r0]! 
- vst1.32 {d2[0]},[r0] - - vldmia sp!,{d8,d9,d10,d11,d12,d13,d14,d15} - bx lr @ bx lr -.size sha1_block_data_order_armv8,.-sha1_block_data_order_armv8 -#endif -#if __ARM_MAX_ARCH__>=7 -.comm OPENSSL_armcap_P,4,4 -.hidden OPENSSL_armcap_P -#endif -#endif -#endif // !OPENSSL_NO_ASM -.section .note.GNU-stack,"",%progbits diff --git a/third_party/boringssl/linux-arm/crypto/fipsmodule/sha256-armv4.S b/third_party/boringssl/linux-arm/crypto/fipsmodule/sha256-armv4.S deleted file mode 100644 index aee04785..00000000 --- a/third_party/boringssl/linux-arm/crypto/fipsmodule/sha256-armv4.S +++ /dev/null @@ -1,2839 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(__arm__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -@ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. -@ -@ Licensed under the OpenSSL license (the "License"). You may not use -@ this file except in compliance with the License. You can obtain a copy -@ in the file LICENSE in the source distribution or at -@ https://www.openssl.org/source/license.html - - -@ ==================================================================== -@ Written by Andy Polyakov for the OpenSSL -@ project. The module is, however, dual licensed under OpenSSL and -@ CRYPTOGAMS licenses depending on where you obtain it. For further -@ details see http://www.openssl.org/~appro/cryptogams/. -@ -@ Permission to use under GPL terms is granted. -@ ==================================================================== - -@ SHA256 block procedure for ARMv4. May 2007. 
- -@ Performance is ~2x better than gcc 3.4 generated code and in "abso- -@ lute" terms is ~2250 cycles per 64-byte block or ~35 cycles per -@ byte [on single-issue Xscale PXA250 core]. - -@ July 2010. -@ -@ Rescheduling for dual-issue pipeline resulted in 22% improvement on -@ Cortex A8 core and ~20 cycles per processed byte. - -@ February 2011. -@ -@ Profiler-assisted and platform-specific optimization resulted in 16% -@ improvement on Cortex A8 core and ~15.4 cycles per processed byte. - -@ September 2013. -@ -@ Add NEON implementation. On Cortex A8 it was measured to process one -@ byte in 12.5 cycles or 23% faster than integer-only code. Snapdragon -@ S4 does it in 12.5 cycles too, but it's 50% faster than integer-only -@ code (meaning that latter performs sub-optimally, nothing was done -@ about it). - -@ May 2014. -@ -@ Add ARMv8 code path performing at 2.0 cpb on Apple A7. - -#ifndef __KERNEL__ -# include -#else -# define __ARM_ARCH__ __LINUX_ARM_ARCH__ -# define __ARM_MAX_ARCH__ 7 -#endif - -@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both -@ ARMv7 and ARMv8 processors. It does have ARMv8-only code, but those -@ instructions are manually-encoded. (See unsha256.) 
-.arch armv7-a - -.text -#if defined(__thumb2__) -.syntax unified -.thumb -#else -.code 32 -#endif - -.type K256,%object -.align 5 -K256: -.word 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 -.word 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 -.word 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 -.word 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 -.word 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc -.word 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da -.word 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 -.word 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 -.word 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 -.word 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 -.word 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 -.word 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 -.word 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 -.word 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 -.word 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 -.word 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 -.size K256,.-K256 -.word 0 @ terminator -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) -.LOPENSSL_armcap: -.word OPENSSL_armcap_P-.Lsha256_block_data_order -#endif -.align 5 - -.globl sha256_block_data_order -.hidden sha256_block_data_order -.type sha256_block_data_order,%function -sha256_block_data_order: -.Lsha256_block_data_order: -#if __ARM_ARCH__<7 && !defined(__thumb2__) - sub r3,pc,#8 @ sha256_block_data_order -#else - adr r3,.Lsha256_block_data_order -#endif -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) - ldr r12,.LOPENSSL_armcap - ldr r12,[r3,r12] @ OPENSSL_armcap_P -#ifdef __APPLE__ - ldr r12,[r12] -#endif - tst r12,#ARMV8_SHA256 - bne .LARMv8 - tst r12,#ARMV7_NEON - bne .LNEON -#endif - add r2,r1,r2,lsl#6 @ len to point at the end of inp - stmdb sp!,{r0,r1,r2,r4-r11,lr} - ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11} - sub r14,r3,#256+32 @ K256 - sub sp,sp,#16*4 @ alloca(X[16]) -.Loop: -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 -# else - ldrb r2,[r1,#3] -# endif - eor r3,r5,r6 @ magic - eor r12,r12,r12 -#if __ARM_ARCH__>=7 - 
@ ldr r2,[r1],#4 @ 0 -# if 0==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r8,r8,ror#5 - add r4,r4,r12 @ h+=Maj(a,b,c) from the past - eor r0,r0,r8,ror#19 @ Sigma1(e) -# ifndef __ARMEB__ - rev r2,r2 -# endif -#else - @ ldrb r2,[r1,#3] @ 0 - add r4,r4,r12 @ h+=Maj(a,b,c) from the past - ldrb r12,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r12,lsl#8 - ldrb r12,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 0==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r8,r8,ror#5 - orr r2,r2,r12,lsl#24 - eor r0,r0,r8,ror#19 @ Sigma1(e) -#endif - ldr r12,[r14],#4 @ *K256++ - add r11,r11,r2 @ h+=X[i] - str r2,[sp,#0*4] - eor r2,r9,r10 - add r11,r11,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r8 - add r11,r11,r12 @ h+=K256[i] - eor r2,r2,r10 @ Ch(e,f,g) - eor r0,r4,r4,ror#11 - add r11,r11,r2 @ h+=Ch(e,f,g) -#if 0==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? -#endif -#if 0<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r4,r5 @ a^b, b^c in next round -#else - ldr r2,[sp,#2*4] @ from future BODY_16_xx - eor r12,r4,r5 @ a^b, b^c in next round - ldr r1,[sp,#15*4] @ from future BODY_16_xx -#endif - eor r0,r0,r4,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r7,r7,r11 @ d+=h - eor r3,r3,r5 @ Maj(a,b,c) - add r11,r11,r0,ror#2 @ h+=Sigma0(a) - @ add r11,r11,r3 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 1 -# if 1==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r7,r7,ror#5 - add r11,r11,r3 @ h+=Maj(a,b,c) from the past - eor r0,r0,r7,ror#19 @ Sigma1(e) -# ifndef __ARMEB__ - rev r2,r2 -# endif -#else - @ ldrb r2,[r1,#3] @ 1 - add r11,r11,r3 @ h+=Maj(a,b,c) from the past - ldrb r3,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r3,lsl#8 - ldrb r3,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 1==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r7,r7,ror#5 - orr r2,r2,r3,lsl#24 - eor r0,r0,r7,ror#19 @ Sigma1(e) -#endif - ldr r3,[r14],#4 @ *K256++ - add r10,r10,r2 @ h+=X[i] - str r2,[sp,#1*4] - eor r2,r8,r9 - add 
r10,r10,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r7 - add r10,r10,r3 @ h+=K256[i] - eor r2,r2,r9 @ Ch(e,f,g) - eor r0,r11,r11,ror#11 - add r10,r10,r2 @ h+=Ch(e,f,g) -#if 1==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? -#endif -#if 1<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r11,r4 @ a^b, b^c in next round -#else - ldr r2,[sp,#3*4] @ from future BODY_16_xx - eor r3,r11,r4 @ a^b, b^c in next round - ldr r1,[sp,#0*4] @ from future BODY_16_xx -#endif - eor r0,r0,r11,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r6,r6,r10 @ d+=h - eor r12,r12,r4 @ Maj(a,b,c) - add r10,r10,r0,ror#2 @ h+=Sigma0(a) - @ add r10,r10,r12 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 2 -# if 2==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r6,r6,ror#5 - add r10,r10,r12 @ h+=Maj(a,b,c) from the past - eor r0,r0,r6,ror#19 @ Sigma1(e) -# ifndef __ARMEB__ - rev r2,r2 -# endif -#else - @ ldrb r2,[r1,#3] @ 2 - add r10,r10,r12 @ h+=Maj(a,b,c) from the past - ldrb r12,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r12,lsl#8 - ldrb r12,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 2==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r6,r6,ror#5 - orr r2,r2,r12,lsl#24 - eor r0,r0,r6,ror#19 @ Sigma1(e) -#endif - ldr r12,[r14],#4 @ *K256++ - add r9,r9,r2 @ h+=X[i] - str r2,[sp,#2*4] - eor r2,r7,r8 - add r9,r9,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r6 - add r9,r9,r12 @ h+=K256[i] - eor r2,r2,r8 @ Ch(e,f,g) - eor r0,r10,r10,ror#11 - add r9,r9,r2 @ h+=Ch(e,f,g) -#if 2==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 2<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r10,r11 @ a^b, b^c in next round -#else - ldr r2,[sp,#4*4] @ from future BODY_16_xx - eor r12,r10,r11 @ a^b, b^c in next round - ldr r1,[sp,#1*4] @ from future BODY_16_xx -#endif - eor r0,r0,r10,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r5,r5,r9 @ d+=h - eor r3,r3,r11 @ Maj(a,b,c) - add r9,r9,r0,ror#2 @ h+=Sigma0(a) - @ add r9,r9,r3 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 3 -# if 3==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r5,r5,ror#5 - add r9,r9,r3 @ h+=Maj(a,b,c) from the past - eor r0,r0,r5,ror#19 @ Sigma1(e) -# ifndef __ARMEB__ - rev r2,r2 -# endif -#else - @ ldrb r2,[r1,#3] @ 3 - add r9,r9,r3 @ h+=Maj(a,b,c) from the past - ldrb r3,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r3,lsl#8 - ldrb r3,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 3==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r5,r5,ror#5 - orr r2,r2,r3,lsl#24 - eor r0,r0,r5,ror#19 @ Sigma1(e) -#endif - ldr r3,[r14],#4 @ *K256++ - add r8,r8,r2 @ h+=X[i] - str r2,[sp,#3*4] - eor r2,r6,r7 - add r8,r8,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r5 - add r8,r8,r3 @ h+=K256[i] - eor r2,r2,r7 @ Ch(e,f,g) - eor r0,r9,r9,ror#11 - add r8,r8,r2 @ h+=Ch(e,f,g) -#if 3==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? 
-#endif -#if 3<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r9,r10 @ a^b, b^c in next round -#else - ldr r2,[sp,#5*4] @ from future BODY_16_xx - eor r3,r9,r10 @ a^b, b^c in next round - ldr r1,[sp,#2*4] @ from future BODY_16_xx -#endif - eor r0,r0,r9,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r4,r4,r8 @ d+=h - eor r12,r12,r10 @ Maj(a,b,c) - add r8,r8,r0,ror#2 @ h+=Sigma0(a) - @ add r8,r8,r12 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 4 -# if 4==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r4,r4,ror#5 - add r8,r8,r12 @ h+=Maj(a,b,c) from the past - eor r0,r0,r4,ror#19 @ Sigma1(e) -# ifndef __ARMEB__ - rev r2,r2 -# endif -#else - @ ldrb r2,[r1,#3] @ 4 - add r8,r8,r12 @ h+=Maj(a,b,c) from the past - ldrb r12,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r12,lsl#8 - ldrb r12,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 4==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r4,r4,ror#5 - orr r2,r2,r12,lsl#24 - eor r0,r0,r4,ror#19 @ Sigma1(e) -#endif - ldr r12,[r14],#4 @ *K256++ - add r7,r7,r2 @ h+=X[i] - str r2,[sp,#4*4] - eor r2,r5,r6 - add r7,r7,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r4 - add r7,r7,r12 @ h+=K256[i] - eor r2,r2,r6 @ Ch(e,f,g) - eor r0,r8,r8,ror#11 - add r7,r7,r2 @ h+=Ch(e,f,g) -#if 4==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 4<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r8,r9 @ a^b, b^c in next round -#else - ldr r2,[sp,#6*4] @ from future BODY_16_xx - eor r12,r8,r9 @ a^b, b^c in next round - ldr r1,[sp,#3*4] @ from future BODY_16_xx -#endif - eor r0,r0,r8,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r11,r11,r7 @ d+=h - eor r3,r3,r9 @ Maj(a,b,c) - add r7,r7,r0,ror#2 @ h+=Sigma0(a) - @ add r7,r7,r3 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 5 -# if 5==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r11,r11,ror#5 - add r7,r7,r3 @ h+=Maj(a,b,c) from the past - eor r0,r0,r11,ror#19 @ Sigma1(e) -# ifndef __ARMEB__ - rev r2,r2 -# endif -#else - @ ldrb r2,[r1,#3] @ 5 - add r7,r7,r3 @ h+=Maj(a,b,c) from the past - ldrb r3,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r3,lsl#8 - ldrb r3,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 5==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r11,r11,ror#5 - orr r2,r2,r3,lsl#24 - eor r0,r0,r11,ror#19 @ Sigma1(e) -#endif - ldr r3,[r14],#4 @ *K256++ - add r6,r6,r2 @ h+=X[i] - str r2,[sp,#5*4] - eor r2,r4,r5 - add r6,r6,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r11 - add r6,r6,r3 @ h+=K256[i] - eor r2,r2,r5 @ Ch(e,f,g) - eor r0,r7,r7,ror#11 - add r6,r6,r2 @ h+=Ch(e,f,g) -#if 5==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? 
-#endif -#if 5<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r7,r8 @ a^b, b^c in next round -#else - ldr r2,[sp,#7*4] @ from future BODY_16_xx - eor r3,r7,r8 @ a^b, b^c in next round - ldr r1,[sp,#4*4] @ from future BODY_16_xx -#endif - eor r0,r0,r7,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r10,r10,r6 @ d+=h - eor r12,r12,r8 @ Maj(a,b,c) - add r6,r6,r0,ror#2 @ h+=Sigma0(a) - @ add r6,r6,r12 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 6 -# if 6==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r10,r10,ror#5 - add r6,r6,r12 @ h+=Maj(a,b,c) from the past - eor r0,r0,r10,ror#19 @ Sigma1(e) -# ifndef __ARMEB__ - rev r2,r2 -# endif -#else - @ ldrb r2,[r1,#3] @ 6 - add r6,r6,r12 @ h+=Maj(a,b,c) from the past - ldrb r12,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r12,lsl#8 - ldrb r12,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 6==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r10,r10,ror#5 - orr r2,r2,r12,lsl#24 - eor r0,r0,r10,ror#19 @ Sigma1(e) -#endif - ldr r12,[r14],#4 @ *K256++ - add r5,r5,r2 @ h+=X[i] - str r2,[sp,#6*4] - eor r2,r11,r4 - add r5,r5,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r10 - add r5,r5,r12 @ h+=K256[i] - eor r2,r2,r4 @ Ch(e,f,g) - eor r0,r6,r6,ror#11 - add r5,r5,r2 @ h+=Ch(e,f,g) -#if 6==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 6<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r6,r7 @ a^b, b^c in next round -#else - ldr r2,[sp,#8*4] @ from future BODY_16_xx - eor r12,r6,r7 @ a^b, b^c in next round - ldr r1,[sp,#5*4] @ from future BODY_16_xx -#endif - eor r0,r0,r6,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r9,r9,r5 @ d+=h - eor r3,r3,r7 @ Maj(a,b,c) - add r5,r5,r0,ror#2 @ h+=Sigma0(a) - @ add r5,r5,r3 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 7 -# if 7==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r9,r9,ror#5 - add r5,r5,r3 @ h+=Maj(a,b,c) from the past - eor r0,r0,r9,ror#19 @ Sigma1(e) -# ifndef __ARMEB__ - rev r2,r2 -# endif -#else - @ ldrb r2,[r1,#3] @ 7 - add r5,r5,r3 @ h+=Maj(a,b,c) from the past - ldrb r3,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r3,lsl#8 - ldrb r3,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 7==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r9,r9,ror#5 - orr r2,r2,r3,lsl#24 - eor r0,r0,r9,ror#19 @ Sigma1(e) -#endif - ldr r3,[r14],#4 @ *K256++ - add r4,r4,r2 @ h+=X[i] - str r2,[sp,#7*4] - eor r2,r10,r11 - add r4,r4,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r9 - add r4,r4,r3 @ h+=K256[i] - eor r2,r2,r11 @ Ch(e,f,g) - eor r0,r5,r5,ror#11 - add r4,r4,r2 @ h+=Ch(e,f,g) -#if 7==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? 
-#endif -#if 7<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r5,r6 @ a^b, b^c in next round -#else - ldr r2,[sp,#9*4] @ from future BODY_16_xx - eor r3,r5,r6 @ a^b, b^c in next round - ldr r1,[sp,#6*4] @ from future BODY_16_xx -#endif - eor r0,r0,r5,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r8,r8,r4 @ d+=h - eor r12,r12,r6 @ Maj(a,b,c) - add r4,r4,r0,ror#2 @ h+=Sigma0(a) - @ add r4,r4,r12 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 8 -# if 8==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r8,r8,ror#5 - add r4,r4,r12 @ h+=Maj(a,b,c) from the past - eor r0,r0,r8,ror#19 @ Sigma1(e) -# ifndef __ARMEB__ - rev r2,r2 -# endif -#else - @ ldrb r2,[r1,#3] @ 8 - add r4,r4,r12 @ h+=Maj(a,b,c) from the past - ldrb r12,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r12,lsl#8 - ldrb r12,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 8==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r8,r8,ror#5 - orr r2,r2,r12,lsl#24 - eor r0,r0,r8,ror#19 @ Sigma1(e) -#endif - ldr r12,[r14],#4 @ *K256++ - add r11,r11,r2 @ h+=X[i] - str r2,[sp,#8*4] - eor r2,r9,r10 - add r11,r11,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r8 - add r11,r11,r12 @ h+=K256[i] - eor r2,r2,r10 @ Ch(e,f,g) - eor r0,r4,r4,ror#11 - add r11,r11,r2 @ h+=Ch(e,f,g) -#if 8==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 8<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r4,r5 @ a^b, b^c in next round -#else - ldr r2,[sp,#10*4] @ from future BODY_16_xx - eor r12,r4,r5 @ a^b, b^c in next round - ldr r1,[sp,#7*4] @ from future BODY_16_xx -#endif - eor r0,r0,r4,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r7,r7,r11 @ d+=h - eor r3,r3,r5 @ Maj(a,b,c) - add r11,r11,r0,ror#2 @ h+=Sigma0(a) - @ add r11,r11,r3 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 9 -# if 9==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r7,r7,ror#5 - add r11,r11,r3 @ h+=Maj(a,b,c) from the past - eor r0,r0,r7,ror#19 @ Sigma1(e) -# ifndef __ARMEB__ - rev r2,r2 -# endif -#else - @ ldrb r2,[r1,#3] @ 9 - add r11,r11,r3 @ h+=Maj(a,b,c) from the past - ldrb r3,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r3,lsl#8 - ldrb r3,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 9==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r7,r7,ror#5 - orr r2,r2,r3,lsl#24 - eor r0,r0,r7,ror#19 @ Sigma1(e) -#endif - ldr r3,[r14],#4 @ *K256++ - add r10,r10,r2 @ h+=X[i] - str r2,[sp,#9*4] - eor r2,r8,r9 - add r10,r10,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r7 - add r10,r10,r3 @ h+=K256[i] - eor r2,r2,r9 @ Ch(e,f,g) - eor r0,r11,r11,ror#11 - add r10,r10,r2 @ h+=Ch(e,f,g) -#if 9==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? 
-#endif -#if 9<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r11,r4 @ a^b, b^c in next round -#else - ldr r2,[sp,#11*4] @ from future BODY_16_xx - eor r3,r11,r4 @ a^b, b^c in next round - ldr r1,[sp,#8*4] @ from future BODY_16_xx -#endif - eor r0,r0,r11,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r6,r6,r10 @ d+=h - eor r12,r12,r4 @ Maj(a,b,c) - add r10,r10,r0,ror#2 @ h+=Sigma0(a) - @ add r10,r10,r12 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 10 -# if 10==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r6,r6,ror#5 - add r10,r10,r12 @ h+=Maj(a,b,c) from the past - eor r0,r0,r6,ror#19 @ Sigma1(e) -# ifndef __ARMEB__ - rev r2,r2 -# endif -#else - @ ldrb r2,[r1,#3] @ 10 - add r10,r10,r12 @ h+=Maj(a,b,c) from the past - ldrb r12,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r12,lsl#8 - ldrb r12,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 10==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r6,r6,ror#5 - orr r2,r2,r12,lsl#24 - eor r0,r0,r6,ror#19 @ Sigma1(e) -#endif - ldr r12,[r14],#4 @ *K256++ - add r9,r9,r2 @ h+=X[i] - str r2,[sp,#10*4] - eor r2,r7,r8 - add r9,r9,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r6 - add r9,r9,r12 @ h+=K256[i] - eor r2,r2,r8 @ Ch(e,f,g) - eor r0,r10,r10,ror#11 - add r9,r9,r2 @ h+=Ch(e,f,g) -#if 10==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 10<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r10,r11 @ a^b, b^c in next round -#else - ldr r2,[sp,#12*4] @ from future BODY_16_xx - eor r12,r10,r11 @ a^b, b^c in next round - ldr r1,[sp,#9*4] @ from future BODY_16_xx -#endif - eor r0,r0,r10,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r5,r5,r9 @ d+=h - eor r3,r3,r11 @ Maj(a,b,c) - add r9,r9,r0,ror#2 @ h+=Sigma0(a) - @ add r9,r9,r3 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 11 -# if 11==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r5,r5,ror#5 - add r9,r9,r3 @ h+=Maj(a,b,c) from the past - eor r0,r0,r5,ror#19 @ Sigma1(e) -# ifndef __ARMEB__ - rev r2,r2 -# endif -#else - @ ldrb r2,[r1,#3] @ 11 - add r9,r9,r3 @ h+=Maj(a,b,c) from the past - ldrb r3,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r3,lsl#8 - ldrb r3,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 11==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r5,r5,ror#5 - orr r2,r2,r3,lsl#24 - eor r0,r0,r5,ror#19 @ Sigma1(e) -#endif - ldr r3,[r14],#4 @ *K256++ - add r8,r8,r2 @ h+=X[i] - str r2,[sp,#11*4] - eor r2,r6,r7 - add r8,r8,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r5 - add r8,r8,r3 @ h+=K256[i] - eor r2,r2,r7 @ Ch(e,f,g) - eor r0,r9,r9,ror#11 - add r8,r8,r2 @ h+=Ch(e,f,g) -#if 11==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? 
-#endif -#if 11<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r9,r10 @ a^b, b^c in next round -#else - ldr r2,[sp,#13*4] @ from future BODY_16_xx - eor r3,r9,r10 @ a^b, b^c in next round - ldr r1,[sp,#10*4] @ from future BODY_16_xx -#endif - eor r0,r0,r9,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r4,r4,r8 @ d+=h - eor r12,r12,r10 @ Maj(a,b,c) - add r8,r8,r0,ror#2 @ h+=Sigma0(a) - @ add r8,r8,r12 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 12 -# if 12==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r4,r4,ror#5 - add r8,r8,r12 @ h+=Maj(a,b,c) from the past - eor r0,r0,r4,ror#19 @ Sigma1(e) -# ifndef __ARMEB__ - rev r2,r2 -# endif -#else - @ ldrb r2,[r1,#3] @ 12 - add r8,r8,r12 @ h+=Maj(a,b,c) from the past - ldrb r12,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r12,lsl#8 - ldrb r12,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 12==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r4,r4,ror#5 - orr r2,r2,r12,lsl#24 - eor r0,r0,r4,ror#19 @ Sigma1(e) -#endif - ldr r12,[r14],#4 @ *K256++ - add r7,r7,r2 @ h+=X[i] - str r2,[sp,#12*4] - eor r2,r5,r6 - add r7,r7,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r4 - add r7,r7,r12 @ h+=K256[i] - eor r2,r2,r6 @ Ch(e,f,g) - eor r0,r8,r8,ror#11 - add r7,r7,r2 @ h+=Ch(e,f,g) -#if 12==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 12<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r8,r9 @ a^b, b^c in next round -#else - ldr r2,[sp,#14*4] @ from future BODY_16_xx - eor r12,r8,r9 @ a^b, b^c in next round - ldr r1,[sp,#11*4] @ from future BODY_16_xx -#endif - eor r0,r0,r8,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r11,r11,r7 @ d+=h - eor r3,r3,r9 @ Maj(a,b,c) - add r7,r7,r0,ror#2 @ h+=Sigma0(a) - @ add r7,r7,r3 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 13 -# if 13==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r11,r11,ror#5 - add r7,r7,r3 @ h+=Maj(a,b,c) from the past - eor r0,r0,r11,ror#19 @ Sigma1(e) -# ifndef __ARMEB__ - rev r2,r2 -# endif -#else - @ ldrb r2,[r1,#3] @ 13 - add r7,r7,r3 @ h+=Maj(a,b,c) from the past - ldrb r3,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r3,lsl#8 - ldrb r3,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 13==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r11,r11,ror#5 - orr r2,r2,r3,lsl#24 - eor r0,r0,r11,ror#19 @ Sigma1(e) -#endif - ldr r3,[r14],#4 @ *K256++ - add r6,r6,r2 @ h+=X[i] - str r2,[sp,#13*4] - eor r2,r4,r5 - add r6,r6,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r11 - add r6,r6,r3 @ h+=K256[i] - eor r2,r2,r5 @ Ch(e,f,g) - eor r0,r7,r7,ror#11 - add r6,r6,r2 @ h+=Ch(e,f,g) -#if 13==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? 
-#endif -#if 13<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r7,r8 @ a^b, b^c in next round -#else - ldr r2,[sp,#15*4] @ from future BODY_16_xx - eor r3,r7,r8 @ a^b, b^c in next round - ldr r1,[sp,#12*4] @ from future BODY_16_xx -#endif - eor r0,r0,r7,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r10,r10,r6 @ d+=h - eor r12,r12,r8 @ Maj(a,b,c) - add r6,r6,r0,ror#2 @ h+=Sigma0(a) - @ add r6,r6,r12 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 14 -# if 14==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r10,r10,ror#5 - add r6,r6,r12 @ h+=Maj(a,b,c) from the past - eor r0,r0,r10,ror#19 @ Sigma1(e) -# ifndef __ARMEB__ - rev r2,r2 -# endif -#else - @ ldrb r2,[r1,#3] @ 14 - add r6,r6,r12 @ h+=Maj(a,b,c) from the past - ldrb r12,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r12,lsl#8 - ldrb r12,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 14==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r10,r10,ror#5 - orr r2,r2,r12,lsl#24 - eor r0,r0,r10,ror#19 @ Sigma1(e) -#endif - ldr r12,[r14],#4 @ *K256++ - add r5,r5,r2 @ h+=X[i] - str r2,[sp,#14*4] - eor r2,r11,r4 - add r5,r5,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r10 - add r5,r5,r12 @ h+=K256[i] - eor r2,r2,r4 @ Ch(e,f,g) - eor r0,r6,r6,ror#11 - add r5,r5,r2 @ h+=Ch(e,f,g) -#if 14==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 14<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r6,r7 @ a^b, b^c in next round -#else - ldr r2,[sp,#0*4] @ from future BODY_16_xx - eor r12,r6,r7 @ a^b, b^c in next round - ldr r1,[sp,#13*4] @ from future BODY_16_xx -#endif - eor r0,r0,r6,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r9,r9,r5 @ d+=h - eor r3,r3,r7 @ Maj(a,b,c) - add r5,r5,r0,ror#2 @ h+=Sigma0(a) - @ add r5,r5,r3 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - @ ldr r2,[r1],#4 @ 15 -# if 15==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r9,r9,ror#5 - add r5,r5,r3 @ h+=Maj(a,b,c) from the past - eor r0,r0,r9,ror#19 @ Sigma1(e) -# ifndef __ARMEB__ - rev r2,r2 -# endif -#else - @ ldrb r2,[r1,#3] @ 15 - add r5,r5,r3 @ h+=Maj(a,b,c) from the past - ldrb r3,[r1,#2] - ldrb r0,[r1,#1] - orr r2,r2,r3,lsl#8 - ldrb r3,[r1],#4 - orr r2,r2,r0,lsl#16 -# if 15==15 - str r1,[sp,#17*4] @ make room for r1 -# endif - eor r0,r9,r9,ror#5 - orr r2,r2,r3,lsl#24 - eor r0,r0,r9,ror#19 @ Sigma1(e) -#endif - ldr r3,[r14],#4 @ *K256++ - add r4,r4,r2 @ h+=X[i] - str r2,[sp,#15*4] - eor r2,r10,r11 - add r4,r4,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r9 - add r4,r4,r3 @ h+=K256[i] - eor r2,r2,r11 @ Ch(e,f,g) - eor r0,r5,r5,ror#11 - add r4,r4,r2 @ h+=Ch(e,f,g) -#if 15==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? 
-#endif -#if 15<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r5,r6 @ a^b, b^c in next round -#else - ldr r2,[sp,#1*4] @ from future BODY_16_xx - eor r3,r5,r6 @ a^b, b^c in next round - ldr r1,[sp,#14*4] @ from future BODY_16_xx -#endif - eor r0,r0,r5,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r8,r8,r4 @ d+=h - eor r12,r12,r6 @ Maj(a,b,c) - add r4,r4,r0,ror#2 @ h+=Sigma0(a) - @ add r4,r4,r12 @ h+=Maj(a,b,c) -.Lrounds_16_xx: - @ ldr r2,[sp,#1*4] @ 16 - @ ldr r1,[sp,#14*4] - mov r0,r2,ror#7 - add r4,r4,r12 @ h+=Maj(a,b,c) from the past - mov r12,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r12,r12,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#0*4] - eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#9*4] - - add r12,r12,r0 - eor r0,r8,r8,ror#5 @ from BODY_00_15 - add r2,r2,r12 - eor r0,r0,r8,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r12,[r14],#4 @ *K256++ - add r11,r11,r2 @ h+=X[i] - str r2,[sp,#0*4] - eor r2,r9,r10 - add r11,r11,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r8 - add r11,r11,r12 @ h+=K256[i] - eor r2,r2,r10 @ Ch(e,f,g) - eor r0,r4,r4,ror#11 - add r11,r11,r2 @ h+=Ch(e,f,g) -#if 16==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 16<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r4,r5 @ a^b, b^c in next round -#else - ldr r2,[sp,#2*4] @ from future BODY_16_xx - eor r12,r4,r5 @ a^b, b^c in next round - ldr r1,[sp,#15*4] @ from future BODY_16_xx -#endif - eor r0,r0,r4,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r7,r7,r11 @ d+=h - eor r3,r3,r5 @ Maj(a,b,c) - add r11,r11,r0,ror#2 @ h+=Sigma0(a) - @ add r11,r11,r3 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#2*4] @ 17 - @ ldr r1,[sp,#15*4] - mov r0,r2,ror#7 - add r11,r11,r3 @ h+=Maj(a,b,c) from the past - mov r3,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r3,r3,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#1*4] - eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#10*4] - - add r3,r3,r0 - eor r0,r7,r7,ror#5 @ from BODY_00_15 - add r2,r2,r3 - eor r0,r0,r7,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r3,[r14],#4 @ *K256++ - add r10,r10,r2 @ h+=X[i] - str r2,[sp,#1*4] - eor r2,r8,r9 - add r10,r10,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r7 - add r10,r10,r3 @ h+=K256[i] - eor r2,r2,r9 @ Ch(e,f,g) - eor r0,r11,r11,ror#11 - add r10,r10,r2 @ h+=Ch(e,f,g) -#if 17==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? 
-#endif -#if 17<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r11,r4 @ a^b, b^c in next round -#else - ldr r2,[sp,#3*4] @ from future BODY_16_xx - eor r3,r11,r4 @ a^b, b^c in next round - ldr r1,[sp,#0*4] @ from future BODY_16_xx -#endif - eor r0,r0,r11,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r6,r6,r10 @ d+=h - eor r12,r12,r4 @ Maj(a,b,c) - add r10,r10,r0,ror#2 @ h+=Sigma0(a) - @ add r10,r10,r12 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#3*4] @ 18 - @ ldr r1,[sp,#0*4] - mov r0,r2,ror#7 - add r10,r10,r12 @ h+=Maj(a,b,c) from the past - mov r12,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r12,r12,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#2*4] - eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#11*4] - - add r12,r12,r0 - eor r0,r6,r6,ror#5 @ from BODY_00_15 - add r2,r2,r12 - eor r0,r0,r6,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r12,[r14],#4 @ *K256++ - add r9,r9,r2 @ h+=X[i] - str r2,[sp,#2*4] - eor r2,r7,r8 - add r9,r9,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r6 - add r9,r9,r12 @ h+=K256[i] - eor r2,r2,r8 @ Ch(e,f,g) - eor r0,r10,r10,ror#11 - add r9,r9,r2 @ h+=Ch(e,f,g) -#if 18==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 18<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r10,r11 @ a^b, b^c in next round -#else - ldr r2,[sp,#4*4] @ from future BODY_16_xx - eor r12,r10,r11 @ a^b, b^c in next round - ldr r1,[sp,#1*4] @ from future BODY_16_xx -#endif - eor r0,r0,r10,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r5,r5,r9 @ d+=h - eor r3,r3,r11 @ Maj(a,b,c) - add r9,r9,r0,ror#2 @ h+=Sigma0(a) - @ add r9,r9,r3 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#4*4] @ 19 - @ ldr r1,[sp,#1*4] - mov r0,r2,ror#7 - add r9,r9,r3 @ h+=Maj(a,b,c) from the past - mov r3,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r3,r3,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#3*4] - eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#12*4] - - add r3,r3,r0 - eor r0,r5,r5,ror#5 @ from BODY_00_15 - add r2,r2,r3 - eor r0,r0,r5,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r3,[r14],#4 @ *K256++ - add r8,r8,r2 @ h+=X[i] - str r2,[sp,#3*4] - eor r2,r6,r7 - add r8,r8,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r5 - add r8,r8,r3 @ h+=K256[i] - eor r2,r2,r7 @ Ch(e,f,g) - eor r0,r9,r9,ror#11 - add r8,r8,r2 @ h+=Ch(e,f,g) -#if 19==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? 
-#endif -#if 19<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r9,r10 @ a^b, b^c in next round -#else - ldr r2,[sp,#5*4] @ from future BODY_16_xx - eor r3,r9,r10 @ a^b, b^c in next round - ldr r1,[sp,#2*4] @ from future BODY_16_xx -#endif - eor r0,r0,r9,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r4,r4,r8 @ d+=h - eor r12,r12,r10 @ Maj(a,b,c) - add r8,r8,r0,ror#2 @ h+=Sigma0(a) - @ add r8,r8,r12 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#5*4] @ 20 - @ ldr r1,[sp,#2*4] - mov r0,r2,ror#7 - add r8,r8,r12 @ h+=Maj(a,b,c) from the past - mov r12,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r12,r12,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#4*4] - eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#13*4] - - add r12,r12,r0 - eor r0,r4,r4,ror#5 @ from BODY_00_15 - add r2,r2,r12 - eor r0,r0,r4,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r12,[r14],#4 @ *K256++ - add r7,r7,r2 @ h+=X[i] - str r2,[sp,#4*4] - eor r2,r5,r6 - add r7,r7,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r4 - add r7,r7,r12 @ h+=K256[i] - eor r2,r2,r6 @ Ch(e,f,g) - eor r0,r8,r8,ror#11 - add r7,r7,r2 @ h+=Ch(e,f,g) -#if 20==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 20<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r8,r9 @ a^b, b^c in next round -#else - ldr r2,[sp,#6*4] @ from future BODY_16_xx - eor r12,r8,r9 @ a^b, b^c in next round - ldr r1,[sp,#3*4] @ from future BODY_16_xx -#endif - eor r0,r0,r8,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r11,r11,r7 @ d+=h - eor r3,r3,r9 @ Maj(a,b,c) - add r7,r7,r0,ror#2 @ h+=Sigma0(a) - @ add r7,r7,r3 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#6*4] @ 21 - @ ldr r1,[sp,#3*4] - mov r0,r2,ror#7 - add r7,r7,r3 @ h+=Maj(a,b,c) from the past - mov r3,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r3,r3,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#5*4] - eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#14*4] - - add r3,r3,r0 - eor r0,r11,r11,ror#5 @ from BODY_00_15 - add r2,r2,r3 - eor r0,r0,r11,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r3,[r14],#4 @ *K256++ - add r6,r6,r2 @ h+=X[i] - str r2,[sp,#5*4] - eor r2,r4,r5 - add r6,r6,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r11 - add r6,r6,r3 @ h+=K256[i] - eor r2,r2,r5 @ Ch(e,f,g) - eor r0,r7,r7,ror#11 - add r6,r6,r2 @ h+=Ch(e,f,g) -#if 21==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? 
-#endif -#if 21<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r7,r8 @ a^b, b^c in next round -#else - ldr r2,[sp,#7*4] @ from future BODY_16_xx - eor r3,r7,r8 @ a^b, b^c in next round - ldr r1,[sp,#4*4] @ from future BODY_16_xx -#endif - eor r0,r0,r7,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r10,r10,r6 @ d+=h - eor r12,r12,r8 @ Maj(a,b,c) - add r6,r6,r0,ror#2 @ h+=Sigma0(a) - @ add r6,r6,r12 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#7*4] @ 22 - @ ldr r1,[sp,#4*4] - mov r0,r2,ror#7 - add r6,r6,r12 @ h+=Maj(a,b,c) from the past - mov r12,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r12,r12,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#6*4] - eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#15*4] - - add r12,r12,r0 - eor r0,r10,r10,ror#5 @ from BODY_00_15 - add r2,r2,r12 - eor r0,r0,r10,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r12,[r14],#4 @ *K256++ - add r5,r5,r2 @ h+=X[i] - str r2,[sp,#6*4] - eor r2,r11,r4 - add r5,r5,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r10 - add r5,r5,r12 @ h+=K256[i] - eor r2,r2,r4 @ Ch(e,f,g) - eor r0,r6,r6,ror#11 - add r5,r5,r2 @ h+=Ch(e,f,g) -#if 22==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 22<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r6,r7 @ a^b, b^c in next round -#else - ldr r2,[sp,#8*4] @ from future BODY_16_xx - eor r12,r6,r7 @ a^b, b^c in next round - ldr r1,[sp,#5*4] @ from future BODY_16_xx -#endif - eor r0,r0,r6,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r9,r9,r5 @ d+=h - eor r3,r3,r7 @ Maj(a,b,c) - add r5,r5,r0,ror#2 @ h+=Sigma0(a) - @ add r5,r5,r3 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#8*4] @ 23 - @ ldr r1,[sp,#5*4] - mov r0,r2,ror#7 - add r5,r5,r3 @ h+=Maj(a,b,c) from the past - mov r3,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r3,r3,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#7*4] - eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#0*4] - - add r3,r3,r0 - eor r0,r9,r9,ror#5 @ from BODY_00_15 - add r2,r2,r3 - eor r0,r0,r9,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r3,[r14],#4 @ *K256++ - add r4,r4,r2 @ h+=X[i] - str r2,[sp,#7*4] - eor r2,r10,r11 - add r4,r4,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r9 - add r4,r4,r3 @ h+=K256[i] - eor r2,r2,r11 @ Ch(e,f,g) - eor r0,r5,r5,ror#11 - add r4,r4,r2 @ h+=Ch(e,f,g) -#if 23==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? 
-#endif -#if 23<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r5,r6 @ a^b, b^c in next round -#else - ldr r2,[sp,#9*4] @ from future BODY_16_xx - eor r3,r5,r6 @ a^b, b^c in next round - ldr r1,[sp,#6*4] @ from future BODY_16_xx -#endif - eor r0,r0,r5,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r8,r8,r4 @ d+=h - eor r12,r12,r6 @ Maj(a,b,c) - add r4,r4,r0,ror#2 @ h+=Sigma0(a) - @ add r4,r4,r12 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#9*4] @ 24 - @ ldr r1,[sp,#6*4] - mov r0,r2,ror#7 - add r4,r4,r12 @ h+=Maj(a,b,c) from the past - mov r12,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r12,r12,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#8*4] - eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#1*4] - - add r12,r12,r0 - eor r0,r8,r8,ror#5 @ from BODY_00_15 - add r2,r2,r12 - eor r0,r0,r8,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r12,[r14],#4 @ *K256++ - add r11,r11,r2 @ h+=X[i] - str r2,[sp,#8*4] - eor r2,r9,r10 - add r11,r11,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r8 - add r11,r11,r12 @ h+=K256[i] - eor r2,r2,r10 @ Ch(e,f,g) - eor r0,r4,r4,ror#11 - add r11,r11,r2 @ h+=Ch(e,f,g) -#if 24==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 24<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r4,r5 @ a^b, b^c in next round -#else - ldr r2,[sp,#10*4] @ from future BODY_16_xx - eor r12,r4,r5 @ a^b, b^c in next round - ldr r1,[sp,#7*4] @ from future BODY_16_xx -#endif - eor r0,r0,r4,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r7,r7,r11 @ d+=h - eor r3,r3,r5 @ Maj(a,b,c) - add r11,r11,r0,ror#2 @ h+=Sigma0(a) - @ add r11,r11,r3 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#10*4] @ 25 - @ ldr r1,[sp,#7*4] - mov r0,r2,ror#7 - add r11,r11,r3 @ h+=Maj(a,b,c) from the past - mov r3,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r3,r3,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#9*4] - eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#2*4] - - add r3,r3,r0 - eor r0,r7,r7,ror#5 @ from BODY_00_15 - add r2,r2,r3 - eor r0,r0,r7,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r3,[r14],#4 @ *K256++ - add r10,r10,r2 @ h+=X[i] - str r2,[sp,#9*4] - eor r2,r8,r9 - add r10,r10,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r7 - add r10,r10,r3 @ h+=K256[i] - eor r2,r2,r9 @ Ch(e,f,g) - eor r0,r11,r11,ror#11 - add r10,r10,r2 @ h+=Ch(e,f,g) -#if 25==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? 
-#endif -#if 25<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r11,r4 @ a^b, b^c in next round -#else - ldr r2,[sp,#11*4] @ from future BODY_16_xx - eor r3,r11,r4 @ a^b, b^c in next round - ldr r1,[sp,#8*4] @ from future BODY_16_xx -#endif - eor r0,r0,r11,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r6,r6,r10 @ d+=h - eor r12,r12,r4 @ Maj(a,b,c) - add r10,r10,r0,ror#2 @ h+=Sigma0(a) - @ add r10,r10,r12 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#11*4] @ 26 - @ ldr r1,[sp,#8*4] - mov r0,r2,ror#7 - add r10,r10,r12 @ h+=Maj(a,b,c) from the past - mov r12,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r12,r12,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#10*4] - eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#3*4] - - add r12,r12,r0 - eor r0,r6,r6,ror#5 @ from BODY_00_15 - add r2,r2,r12 - eor r0,r0,r6,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r12,[r14],#4 @ *K256++ - add r9,r9,r2 @ h+=X[i] - str r2,[sp,#10*4] - eor r2,r7,r8 - add r9,r9,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r6 - add r9,r9,r12 @ h+=K256[i] - eor r2,r2,r8 @ Ch(e,f,g) - eor r0,r10,r10,ror#11 - add r9,r9,r2 @ h+=Ch(e,f,g) -#if 26==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 26<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r10,r11 @ a^b, b^c in next round -#else - ldr r2,[sp,#12*4] @ from future BODY_16_xx - eor r12,r10,r11 @ a^b, b^c in next round - ldr r1,[sp,#9*4] @ from future BODY_16_xx -#endif - eor r0,r0,r10,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r5,r5,r9 @ d+=h - eor r3,r3,r11 @ Maj(a,b,c) - add r9,r9,r0,ror#2 @ h+=Sigma0(a) - @ add r9,r9,r3 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#12*4] @ 27 - @ ldr r1,[sp,#9*4] - mov r0,r2,ror#7 - add r9,r9,r3 @ h+=Maj(a,b,c) from the past - mov r3,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r3,r3,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#11*4] - eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#4*4] - - add r3,r3,r0 - eor r0,r5,r5,ror#5 @ from BODY_00_15 - add r2,r2,r3 - eor r0,r0,r5,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r3,[r14],#4 @ *K256++ - add r8,r8,r2 @ h+=X[i] - str r2,[sp,#11*4] - eor r2,r6,r7 - add r8,r8,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r5 - add r8,r8,r3 @ h+=K256[i] - eor r2,r2,r7 @ Ch(e,f,g) - eor r0,r9,r9,ror#11 - add r8,r8,r2 @ h+=Ch(e,f,g) -#if 27==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? 
-#endif -#if 27<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r9,r10 @ a^b, b^c in next round -#else - ldr r2,[sp,#13*4] @ from future BODY_16_xx - eor r3,r9,r10 @ a^b, b^c in next round - ldr r1,[sp,#10*4] @ from future BODY_16_xx -#endif - eor r0,r0,r9,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r4,r4,r8 @ d+=h - eor r12,r12,r10 @ Maj(a,b,c) - add r8,r8,r0,ror#2 @ h+=Sigma0(a) - @ add r8,r8,r12 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#13*4] @ 28 - @ ldr r1,[sp,#10*4] - mov r0,r2,ror#7 - add r8,r8,r12 @ h+=Maj(a,b,c) from the past - mov r12,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r12,r12,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#12*4] - eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#5*4] - - add r12,r12,r0 - eor r0,r4,r4,ror#5 @ from BODY_00_15 - add r2,r2,r12 - eor r0,r0,r4,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r12,[r14],#4 @ *K256++ - add r7,r7,r2 @ h+=X[i] - str r2,[sp,#12*4] - eor r2,r5,r6 - add r7,r7,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r4 - add r7,r7,r12 @ h+=K256[i] - eor r2,r2,r6 @ Ch(e,f,g) - eor r0,r8,r8,ror#11 - add r7,r7,r2 @ h+=Ch(e,f,g) -#if 28==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 28<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r8,r9 @ a^b, b^c in next round -#else - ldr r2,[sp,#14*4] @ from future BODY_16_xx - eor r12,r8,r9 @ a^b, b^c in next round - ldr r1,[sp,#11*4] @ from future BODY_16_xx -#endif - eor r0,r0,r8,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r11,r11,r7 @ d+=h - eor r3,r3,r9 @ Maj(a,b,c) - add r7,r7,r0,ror#2 @ h+=Sigma0(a) - @ add r7,r7,r3 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#14*4] @ 29 - @ ldr r1,[sp,#11*4] - mov r0,r2,ror#7 - add r7,r7,r3 @ h+=Maj(a,b,c) from the past - mov r3,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r3,r3,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#13*4] - eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#6*4] - - add r3,r3,r0 - eor r0,r11,r11,ror#5 @ from BODY_00_15 - add r2,r2,r3 - eor r0,r0,r11,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r3,[r14],#4 @ *K256++ - add r6,r6,r2 @ h+=X[i] - str r2,[sp,#13*4] - eor r2,r4,r5 - add r6,r6,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r11 - add r6,r6,r3 @ h+=K256[i] - eor r2,r2,r5 @ Ch(e,f,g) - eor r0,r7,r7,ror#11 - add r6,r6,r2 @ h+=Ch(e,f,g) -#if 29==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? 
-#endif -#if 29<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r7,r8 @ a^b, b^c in next round -#else - ldr r2,[sp,#15*4] @ from future BODY_16_xx - eor r3,r7,r8 @ a^b, b^c in next round - ldr r1,[sp,#12*4] @ from future BODY_16_xx -#endif - eor r0,r0,r7,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r10,r10,r6 @ d+=h - eor r12,r12,r8 @ Maj(a,b,c) - add r6,r6,r0,ror#2 @ h+=Sigma0(a) - @ add r6,r6,r12 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#15*4] @ 30 - @ ldr r1,[sp,#12*4] - mov r0,r2,ror#7 - add r6,r6,r12 @ h+=Maj(a,b,c) from the past - mov r12,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r12,r12,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#14*4] - eor r12,r12,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#7*4] - - add r12,r12,r0 - eor r0,r10,r10,ror#5 @ from BODY_00_15 - add r2,r2,r12 - eor r0,r0,r10,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r12,[r14],#4 @ *K256++ - add r5,r5,r2 @ h+=X[i] - str r2,[sp,#14*4] - eor r2,r11,r4 - add r5,r5,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r10 - add r5,r5,r12 @ h+=K256[i] - eor r2,r2,r4 @ Ch(e,f,g) - eor r0,r6,r6,ror#11 - add r5,r5,r2 @ h+=Ch(e,f,g) -#if 30==31 - and r12,r12,#0xff - cmp r12,#0xf2 @ done? 
-#endif -#if 30<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r12,r6,r7 @ a^b, b^c in next round -#else - ldr r2,[sp,#0*4] @ from future BODY_16_xx - eor r12,r6,r7 @ a^b, b^c in next round - ldr r1,[sp,#13*4] @ from future BODY_16_xx -#endif - eor r0,r0,r6,ror#20 @ Sigma0(a) - and r3,r3,r12 @ (b^c)&=(a^b) - add r9,r9,r5 @ d+=h - eor r3,r3,r7 @ Maj(a,b,c) - add r5,r5,r0,ror#2 @ h+=Sigma0(a) - @ add r5,r5,r3 @ h+=Maj(a,b,c) - @ ldr r2,[sp,#0*4] @ 31 - @ ldr r1,[sp,#13*4] - mov r0,r2,ror#7 - add r5,r5,r3 @ h+=Maj(a,b,c) from the past - mov r3,r1,ror#17 - eor r0,r0,r2,ror#18 - eor r3,r3,r1,ror#19 - eor r0,r0,r2,lsr#3 @ sigma0(X[i+1]) - ldr r2,[sp,#15*4] - eor r3,r3,r1,lsr#10 @ sigma1(X[i+14]) - ldr r1,[sp,#8*4] - - add r3,r3,r0 - eor r0,r9,r9,ror#5 @ from BODY_00_15 - add r2,r2,r3 - eor r0,r0,r9,ror#19 @ Sigma1(e) - add r2,r2,r1 @ X[i] - ldr r3,[r14],#4 @ *K256++ - add r4,r4,r2 @ h+=X[i] - str r2,[sp,#15*4] - eor r2,r10,r11 - add r4,r4,r0,ror#6 @ h+=Sigma1(e) - and r2,r2,r9 - add r4,r4,r3 @ h+=K256[i] - eor r2,r2,r11 @ Ch(e,f,g) - eor r0,r5,r5,ror#11 - add r4,r4,r2 @ h+=Ch(e,f,g) -#if 31==31 - and r3,r3,#0xff - cmp r3,#0xf2 @ done? 
-#endif -#if 31<15 -# if __ARM_ARCH__>=7 - ldr r2,[r1],#4 @ prefetch -# else - ldrb r2,[r1,#3] -# endif - eor r3,r5,r6 @ a^b, b^c in next round -#else - ldr r2,[sp,#1*4] @ from future BODY_16_xx - eor r3,r5,r6 @ a^b, b^c in next round - ldr r1,[sp,#14*4] @ from future BODY_16_xx -#endif - eor r0,r0,r5,ror#20 @ Sigma0(a) - and r12,r12,r3 @ (b^c)&=(a^b) - add r8,r8,r4 @ d+=h - eor r12,r12,r6 @ Maj(a,b,c) - add r4,r4,r0,ror#2 @ h+=Sigma0(a) - @ add r4,r4,r12 @ h+=Maj(a,b,c) -#if __ARM_ARCH__>=7 - ite eq @ Thumb2 thing, sanity check in ARM -#endif - ldreq r3,[sp,#16*4] @ pull ctx - bne .Lrounds_16_xx - - add r4,r4,r12 @ h+=Maj(a,b,c) from the past - ldr r0,[r3,#0] - ldr r2,[r3,#4] - ldr r12,[r3,#8] - add r4,r4,r0 - ldr r0,[r3,#12] - add r5,r5,r2 - ldr r2,[r3,#16] - add r6,r6,r12 - ldr r12,[r3,#20] - add r7,r7,r0 - ldr r0,[r3,#24] - add r8,r8,r2 - ldr r2,[r3,#28] - add r9,r9,r12 - ldr r1,[sp,#17*4] @ pull inp - ldr r12,[sp,#18*4] @ pull inp+len - add r10,r10,r0 - add r11,r11,r2 - stmia r3,{r4,r5,r6,r7,r8,r9,r10,r11} - cmp r1,r12 - sub r14,r14,#256 @ rewind Ktbl - bne .Loop - - add sp,sp,#19*4 @ destroy frame -#if __ARM_ARCH__>=5 - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,pc} -#else - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet -.word 0xe12fff1e @ interoperable with Thumb ISA:-) -#endif -.size sha256_block_data_order,.-sha256_block_data_order -#if __ARM_MAX_ARCH__>=7 -.arch armv7-a -.fpu neon - -.globl sha256_block_data_order_neon -.hidden sha256_block_data_order_neon -.type sha256_block_data_order_neon,%function -.align 5 -.skip 16 -sha256_block_data_order_neon: -.LNEON: - stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} - - sub r11,sp,#16*4+16 - adr r14,K256 - bic r11,r11,#15 @ align for 128-bit stores - mov r12,sp - mov sp,r11 @ alloca - add r2,r1,r2,lsl#6 @ len to point at the end of inp - - vld1.8 {q0},[r1]! - vld1.8 {q1},[r1]! - vld1.8 {q2},[r1]! - vld1.8 {q3},[r1]! - vld1.32 {q8},[r14,:128]! 
- vld1.32 {q9},[r14,:128]! - vld1.32 {q10},[r14,:128]! - vld1.32 {q11},[r14,:128]! - vrev32.8 q0,q0 @ yes, even on - str r0,[sp,#64] - vrev32.8 q1,q1 @ big-endian - str r1,[sp,#68] - mov r1,sp - vrev32.8 q2,q2 - str r2,[sp,#72] - vrev32.8 q3,q3 - str r12,[sp,#76] @ save original sp - vadd.i32 q8,q8,q0 - vadd.i32 q9,q9,q1 - vst1.32 {q8},[r1,:128]! - vadd.i32 q10,q10,q2 - vst1.32 {q9},[r1,:128]! - vadd.i32 q11,q11,q3 - vst1.32 {q10},[r1,:128]! - vst1.32 {q11},[r1,:128]! - - ldmia r0,{r4,r5,r6,r7,r8,r9,r10,r11} - sub r1,r1,#64 - ldr r2,[sp,#0] - eor r12,r12,r12 - eor r3,r5,r6 - b .L_00_48 - -.align 4 -.L_00_48: - vext.8 q8,q0,q1,#4 - add r11,r11,r2 - eor r2,r9,r10 - eor r0,r8,r8,ror#5 - vext.8 q9,q2,q3,#4 - add r4,r4,r12 - and r2,r2,r8 - eor r12,r0,r8,ror#19 - vshr.u32 q10,q8,#7 - eor r0,r4,r4,ror#11 - eor r2,r2,r10 - vadd.i32 q0,q0,q9 - add r11,r11,r12,ror#6 - eor r12,r4,r5 - vshr.u32 q9,q8,#3 - eor r0,r0,r4,ror#20 - add r11,r11,r2 - vsli.32 q10,q8,#25 - ldr r2,[sp,#4] - and r3,r3,r12 - vshr.u32 q11,q8,#18 - add r7,r7,r11 - add r11,r11,r0,ror#2 - eor r3,r3,r5 - veor q9,q9,q10 - add r10,r10,r2 - vsli.32 q11,q8,#14 - eor r2,r8,r9 - eor r0,r7,r7,ror#5 - vshr.u32 d24,d7,#17 - add r11,r11,r3 - and r2,r2,r7 - veor q9,q9,q11 - eor r3,r0,r7,ror#19 - eor r0,r11,r11,ror#11 - vsli.32 d24,d7,#15 - eor r2,r2,r9 - add r10,r10,r3,ror#6 - vshr.u32 d25,d7,#10 - eor r3,r11,r4 - eor r0,r0,r11,ror#20 - vadd.i32 q0,q0,q9 - add r10,r10,r2 - ldr r2,[sp,#8] - veor d25,d25,d24 - and r12,r12,r3 - add r6,r6,r10 - vshr.u32 d24,d7,#19 - add r10,r10,r0,ror#2 - eor r12,r12,r4 - vsli.32 d24,d7,#13 - add r9,r9,r2 - eor r2,r7,r8 - veor d25,d25,d24 - eor r0,r6,r6,ror#5 - add r10,r10,r12 - vadd.i32 d0,d0,d25 - and r2,r2,r6 - eor r12,r0,r6,ror#19 - vshr.u32 d24,d0,#17 - eor r0,r10,r10,ror#11 - eor r2,r2,r8 - vsli.32 d24,d0,#15 - add r9,r9,r12,ror#6 - eor r12,r10,r11 - vshr.u32 d25,d0,#10 - eor r0,r0,r10,ror#20 - add r9,r9,r2 - veor d25,d25,d24 - ldr r2,[sp,#12] - and r3,r3,r12 - vshr.u32 d24,d0,#19 - 
add r5,r5,r9 - add r9,r9,r0,ror#2 - eor r3,r3,r11 - vld1.32 {q8},[r14,:128]! - add r8,r8,r2 - vsli.32 d24,d0,#13 - eor r2,r6,r7 - eor r0,r5,r5,ror#5 - veor d25,d25,d24 - add r9,r9,r3 - and r2,r2,r5 - vadd.i32 d1,d1,d25 - eor r3,r0,r5,ror#19 - eor r0,r9,r9,ror#11 - vadd.i32 q8,q8,q0 - eor r2,r2,r7 - add r8,r8,r3,ror#6 - eor r3,r9,r10 - eor r0,r0,r9,ror#20 - add r8,r8,r2 - ldr r2,[sp,#16] - and r12,r12,r3 - add r4,r4,r8 - vst1.32 {q8},[r1,:128]! - add r8,r8,r0,ror#2 - eor r12,r12,r10 - vext.8 q8,q1,q2,#4 - add r7,r7,r2 - eor r2,r5,r6 - eor r0,r4,r4,ror#5 - vext.8 q9,q3,q0,#4 - add r8,r8,r12 - and r2,r2,r4 - eor r12,r0,r4,ror#19 - vshr.u32 q10,q8,#7 - eor r0,r8,r8,ror#11 - eor r2,r2,r6 - vadd.i32 q1,q1,q9 - add r7,r7,r12,ror#6 - eor r12,r8,r9 - vshr.u32 q9,q8,#3 - eor r0,r0,r8,ror#20 - add r7,r7,r2 - vsli.32 q10,q8,#25 - ldr r2,[sp,#20] - and r3,r3,r12 - vshr.u32 q11,q8,#18 - add r11,r11,r7 - add r7,r7,r0,ror#2 - eor r3,r3,r9 - veor q9,q9,q10 - add r6,r6,r2 - vsli.32 q11,q8,#14 - eor r2,r4,r5 - eor r0,r11,r11,ror#5 - vshr.u32 d24,d1,#17 - add r7,r7,r3 - and r2,r2,r11 - veor q9,q9,q11 - eor r3,r0,r11,ror#19 - eor r0,r7,r7,ror#11 - vsli.32 d24,d1,#15 - eor r2,r2,r5 - add r6,r6,r3,ror#6 - vshr.u32 d25,d1,#10 - eor r3,r7,r8 - eor r0,r0,r7,ror#20 - vadd.i32 q1,q1,q9 - add r6,r6,r2 - ldr r2,[sp,#24] - veor d25,d25,d24 - and r12,r12,r3 - add r10,r10,r6 - vshr.u32 d24,d1,#19 - add r6,r6,r0,ror#2 - eor r12,r12,r8 - vsli.32 d24,d1,#13 - add r5,r5,r2 - eor r2,r11,r4 - veor d25,d25,d24 - eor r0,r10,r10,ror#5 - add r6,r6,r12 - vadd.i32 d2,d2,d25 - and r2,r2,r10 - eor r12,r0,r10,ror#19 - vshr.u32 d24,d2,#17 - eor r0,r6,r6,ror#11 - eor r2,r2,r4 - vsli.32 d24,d2,#15 - add r5,r5,r12,ror#6 - eor r12,r6,r7 - vshr.u32 d25,d2,#10 - eor r0,r0,r6,ror#20 - add r5,r5,r2 - veor d25,d25,d24 - ldr r2,[sp,#28] - and r3,r3,r12 - vshr.u32 d24,d2,#19 - add r9,r9,r5 - add r5,r5,r0,ror#2 - eor r3,r3,r7 - vld1.32 {q8},[r14,:128]! 
- add r4,r4,r2 - vsli.32 d24,d2,#13 - eor r2,r10,r11 - eor r0,r9,r9,ror#5 - veor d25,d25,d24 - add r5,r5,r3 - and r2,r2,r9 - vadd.i32 d3,d3,d25 - eor r3,r0,r9,ror#19 - eor r0,r5,r5,ror#11 - vadd.i32 q8,q8,q1 - eor r2,r2,r11 - add r4,r4,r3,ror#6 - eor r3,r5,r6 - eor r0,r0,r5,ror#20 - add r4,r4,r2 - ldr r2,[sp,#32] - and r12,r12,r3 - add r8,r8,r4 - vst1.32 {q8},[r1,:128]! - add r4,r4,r0,ror#2 - eor r12,r12,r6 - vext.8 q8,q2,q3,#4 - add r11,r11,r2 - eor r2,r9,r10 - eor r0,r8,r8,ror#5 - vext.8 q9,q0,q1,#4 - add r4,r4,r12 - and r2,r2,r8 - eor r12,r0,r8,ror#19 - vshr.u32 q10,q8,#7 - eor r0,r4,r4,ror#11 - eor r2,r2,r10 - vadd.i32 q2,q2,q9 - add r11,r11,r12,ror#6 - eor r12,r4,r5 - vshr.u32 q9,q8,#3 - eor r0,r0,r4,ror#20 - add r11,r11,r2 - vsli.32 q10,q8,#25 - ldr r2,[sp,#36] - and r3,r3,r12 - vshr.u32 q11,q8,#18 - add r7,r7,r11 - add r11,r11,r0,ror#2 - eor r3,r3,r5 - veor q9,q9,q10 - add r10,r10,r2 - vsli.32 q11,q8,#14 - eor r2,r8,r9 - eor r0,r7,r7,ror#5 - vshr.u32 d24,d3,#17 - add r11,r11,r3 - and r2,r2,r7 - veor q9,q9,q11 - eor r3,r0,r7,ror#19 - eor r0,r11,r11,ror#11 - vsli.32 d24,d3,#15 - eor r2,r2,r9 - add r10,r10,r3,ror#6 - vshr.u32 d25,d3,#10 - eor r3,r11,r4 - eor r0,r0,r11,ror#20 - vadd.i32 q2,q2,q9 - add r10,r10,r2 - ldr r2,[sp,#40] - veor d25,d25,d24 - and r12,r12,r3 - add r6,r6,r10 - vshr.u32 d24,d3,#19 - add r10,r10,r0,ror#2 - eor r12,r12,r4 - vsli.32 d24,d3,#13 - add r9,r9,r2 - eor r2,r7,r8 - veor d25,d25,d24 - eor r0,r6,r6,ror#5 - add r10,r10,r12 - vadd.i32 d4,d4,d25 - and r2,r2,r6 - eor r12,r0,r6,ror#19 - vshr.u32 d24,d4,#17 - eor r0,r10,r10,ror#11 - eor r2,r2,r8 - vsli.32 d24,d4,#15 - add r9,r9,r12,ror#6 - eor r12,r10,r11 - vshr.u32 d25,d4,#10 - eor r0,r0,r10,ror#20 - add r9,r9,r2 - veor d25,d25,d24 - ldr r2,[sp,#44] - and r3,r3,r12 - vshr.u32 d24,d4,#19 - add r5,r5,r9 - add r9,r9,r0,ror#2 - eor r3,r3,r11 - vld1.32 {q8},[r14,:128]! 
- add r8,r8,r2 - vsli.32 d24,d4,#13 - eor r2,r6,r7 - eor r0,r5,r5,ror#5 - veor d25,d25,d24 - add r9,r9,r3 - and r2,r2,r5 - vadd.i32 d5,d5,d25 - eor r3,r0,r5,ror#19 - eor r0,r9,r9,ror#11 - vadd.i32 q8,q8,q2 - eor r2,r2,r7 - add r8,r8,r3,ror#6 - eor r3,r9,r10 - eor r0,r0,r9,ror#20 - add r8,r8,r2 - ldr r2,[sp,#48] - and r12,r12,r3 - add r4,r4,r8 - vst1.32 {q8},[r1,:128]! - add r8,r8,r0,ror#2 - eor r12,r12,r10 - vext.8 q8,q3,q0,#4 - add r7,r7,r2 - eor r2,r5,r6 - eor r0,r4,r4,ror#5 - vext.8 q9,q1,q2,#4 - add r8,r8,r12 - and r2,r2,r4 - eor r12,r0,r4,ror#19 - vshr.u32 q10,q8,#7 - eor r0,r8,r8,ror#11 - eor r2,r2,r6 - vadd.i32 q3,q3,q9 - add r7,r7,r12,ror#6 - eor r12,r8,r9 - vshr.u32 q9,q8,#3 - eor r0,r0,r8,ror#20 - add r7,r7,r2 - vsli.32 q10,q8,#25 - ldr r2,[sp,#52] - and r3,r3,r12 - vshr.u32 q11,q8,#18 - add r11,r11,r7 - add r7,r7,r0,ror#2 - eor r3,r3,r9 - veor q9,q9,q10 - add r6,r6,r2 - vsli.32 q11,q8,#14 - eor r2,r4,r5 - eor r0,r11,r11,ror#5 - vshr.u32 d24,d5,#17 - add r7,r7,r3 - and r2,r2,r11 - veor q9,q9,q11 - eor r3,r0,r11,ror#19 - eor r0,r7,r7,ror#11 - vsli.32 d24,d5,#15 - eor r2,r2,r5 - add r6,r6,r3,ror#6 - vshr.u32 d25,d5,#10 - eor r3,r7,r8 - eor r0,r0,r7,ror#20 - vadd.i32 q3,q3,q9 - add r6,r6,r2 - ldr r2,[sp,#56] - veor d25,d25,d24 - and r12,r12,r3 - add r10,r10,r6 - vshr.u32 d24,d5,#19 - add r6,r6,r0,ror#2 - eor r12,r12,r8 - vsli.32 d24,d5,#13 - add r5,r5,r2 - eor r2,r11,r4 - veor d25,d25,d24 - eor r0,r10,r10,ror#5 - add r6,r6,r12 - vadd.i32 d6,d6,d25 - and r2,r2,r10 - eor r12,r0,r10,ror#19 - vshr.u32 d24,d6,#17 - eor r0,r6,r6,ror#11 - eor r2,r2,r4 - vsli.32 d24,d6,#15 - add r5,r5,r12,ror#6 - eor r12,r6,r7 - vshr.u32 d25,d6,#10 - eor r0,r0,r6,ror#20 - add r5,r5,r2 - veor d25,d25,d24 - ldr r2,[sp,#60] - and r3,r3,r12 - vshr.u32 d24,d6,#19 - add r9,r9,r5 - add r5,r5,r0,ror#2 - eor r3,r3,r7 - vld1.32 {q8},[r14,:128]! 
- add r4,r4,r2 - vsli.32 d24,d6,#13 - eor r2,r10,r11 - eor r0,r9,r9,ror#5 - veor d25,d25,d24 - add r5,r5,r3 - and r2,r2,r9 - vadd.i32 d7,d7,d25 - eor r3,r0,r9,ror#19 - eor r0,r5,r5,ror#11 - vadd.i32 q8,q8,q3 - eor r2,r2,r11 - add r4,r4,r3,ror#6 - eor r3,r5,r6 - eor r0,r0,r5,ror#20 - add r4,r4,r2 - ldr r2,[r14] - and r12,r12,r3 - add r8,r8,r4 - vst1.32 {q8},[r1,:128]! - add r4,r4,r0,ror#2 - eor r12,r12,r6 - teq r2,#0 @ check for K256 terminator - ldr r2,[sp,#0] - sub r1,r1,#64 - bne .L_00_48 - - ldr r1,[sp,#68] - ldr r0,[sp,#72] - sub r14,r14,#256 @ rewind r14 - teq r1,r0 - it eq - subeq r1,r1,#64 @ avoid SEGV - vld1.8 {q0},[r1]! @ load next input block - vld1.8 {q1},[r1]! - vld1.8 {q2},[r1]! - vld1.8 {q3},[r1]! - it ne - strne r1,[sp,#68] - mov r1,sp - add r11,r11,r2 - eor r2,r9,r10 - eor r0,r8,r8,ror#5 - add r4,r4,r12 - vld1.32 {q8},[r14,:128]! - and r2,r2,r8 - eor r12,r0,r8,ror#19 - eor r0,r4,r4,ror#11 - eor r2,r2,r10 - vrev32.8 q0,q0 - add r11,r11,r12,ror#6 - eor r12,r4,r5 - eor r0,r0,r4,ror#20 - add r11,r11,r2 - vadd.i32 q8,q8,q0 - ldr r2,[sp,#4] - and r3,r3,r12 - add r7,r7,r11 - add r11,r11,r0,ror#2 - eor r3,r3,r5 - add r10,r10,r2 - eor r2,r8,r9 - eor r0,r7,r7,ror#5 - add r11,r11,r3 - and r2,r2,r7 - eor r3,r0,r7,ror#19 - eor r0,r11,r11,ror#11 - eor r2,r2,r9 - add r10,r10,r3,ror#6 - eor r3,r11,r4 - eor r0,r0,r11,ror#20 - add r10,r10,r2 - ldr r2,[sp,#8] - and r12,r12,r3 - add r6,r6,r10 - add r10,r10,r0,ror#2 - eor r12,r12,r4 - add r9,r9,r2 - eor r2,r7,r8 - eor r0,r6,r6,ror#5 - add r10,r10,r12 - and r2,r2,r6 - eor r12,r0,r6,ror#19 - eor r0,r10,r10,ror#11 - eor r2,r2,r8 - add r9,r9,r12,ror#6 - eor r12,r10,r11 - eor r0,r0,r10,ror#20 - add r9,r9,r2 - ldr r2,[sp,#12] - and r3,r3,r12 - add r5,r5,r9 - add r9,r9,r0,ror#2 - eor r3,r3,r11 - add r8,r8,r2 - eor r2,r6,r7 - eor r0,r5,r5,ror#5 - add r9,r9,r3 - and r2,r2,r5 - eor r3,r0,r5,ror#19 - eor r0,r9,r9,ror#11 - eor r2,r2,r7 - add r8,r8,r3,ror#6 - eor r3,r9,r10 - eor r0,r0,r9,ror#20 - add r8,r8,r2 - ldr r2,[sp,#16] - and 
r12,r12,r3 - add r4,r4,r8 - add r8,r8,r0,ror#2 - eor r12,r12,r10 - vst1.32 {q8},[r1,:128]! - add r7,r7,r2 - eor r2,r5,r6 - eor r0,r4,r4,ror#5 - add r8,r8,r12 - vld1.32 {q8},[r14,:128]! - and r2,r2,r4 - eor r12,r0,r4,ror#19 - eor r0,r8,r8,ror#11 - eor r2,r2,r6 - vrev32.8 q1,q1 - add r7,r7,r12,ror#6 - eor r12,r8,r9 - eor r0,r0,r8,ror#20 - add r7,r7,r2 - vadd.i32 q8,q8,q1 - ldr r2,[sp,#20] - and r3,r3,r12 - add r11,r11,r7 - add r7,r7,r0,ror#2 - eor r3,r3,r9 - add r6,r6,r2 - eor r2,r4,r5 - eor r0,r11,r11,ror#5 - add r7,r7,r3 - and r2,r2,r11 - eor r3,r0,r11,ror#19 - eor r0,r7,r7,ror#11 - eor r2,r2,r5 - add r6,r6,r3,ror#6 - eor r3,r7,r8 - eor r0,r0,r7,ror#20 - add r6,r6,r2 - ldr r2,[sp,#24] - and r12,r12,r3 - add r10,r10,r6 - add r6,r6,r0,ror#2 - eor r12,r12,r8 - add r5,r5,r2 - eor r2,r11,r4 - eor r0,r10,r10,ror#5 - add r6,r6,r12 - and r2,r2,r10 - eor r12,r0,r10,ror#19 - eor r0,r6,r6,ror#11 - eor r2,r2,r4 - add r5,r5,r12,ror#6 - eor r12,r6,r7 - eor r0,r0,r6,ror#20 - add r5,r5,r2 - ldr r2,[sp,#28] - and r3,r3,r12 - add r9,r9,r5 - add r5,r5,r0,ror#2 - eor r3,r3,r7 - add r4,r4,r2 - eor r2,r10,r11 - eor r0,r9,r9,ror#5 - add r5,r5,r3 - and r2,r2,r9 - eor r3,r0,r9,ror#19 - eor r0,r5,r5,ror#11 - eor r2,r2,r11 - add r4,r4,r3,ror#6 - eor r3,r5,r6 - eor r0,r0,r5,ror#20 - add r4,r4,r2 - ldr r2,[sp,#32] - and r12,r12,r3 - add r8,r8,r4 - add r4,r4,r0,ror#2 - eor r12,r12,r6 - vst1.32 {q8},[r1,:128]! - add r11,r11,r2 - eor r2,r9,r10 - eor r0,r8,r8,ror#5 - add r4,r4,r12 - vld1.32 {q8},[r14,:128]! 
- and r2,r2,r8 - eor r12,r0,r8,ror#19 - eor r0,r4,r4,ror#11 - eor r2,r2,r10 - vrev32.8 q2,q2 - add r11,r11,r12,ror#6 - eor r12,r4,r5 - eor r0,r0,r4,ror#20 - add r11,r11,r2 - vadd.i32 q8,q8,q2 - ldr r2,[sp,#36] - and r3,r3,r12 - add r7,r7,r11 - add r11,r11,r0,ror#2 - eor r3,r3,r5 - add r10,r10,r2 - eor r2,r8,r9 - eor r0,r7,r7,ror#5 - add r11,r11,r3 - and r2,r2,r7 - eor r3,r0,r7,ror#19 - eor r0,r11,r11,ror#11 - eor r2,r2,r9 - add r10,r10,r3,ror#6 - eor r3,r11,r4 - eor r0,r0,r11,ror#20 - add r10,r10,r2 - ldr r2,[sp,#40] - and r12,r12,r3 - add r6,r6,r10 - add r10,r10,r0,ror#2 - eor r12,r12,r4 - add r9,r9,r2 - eor r2,r7,r8 - eor r0,r6,r6,ror#5 - add r10,r10,r12 - and r2,r2,r6 - eor r12,r0,r6,ror#19 - eor r0,r10,r10,ror#11 - eor r2,r2,r8 - add r9,r9,r12,ror#6 - eor r12,r10,r11 - eor r0,r0,r10,ror#20 - add r9,r9,r2 - ldr r2,[sp,#44] - and r3,r3,r12 - add r5,r5,r9 - add r9,r9,r0,ror#2 - eor r3,r3,r11 - add r8,r8,r2 - eor r2,r6,r7 - eor r0,r5,r5,ror#5 - add r9,r9,r3 - and r2,r2,r5 - eor r3,r0,r5,ror#19 - eor r0,r9,r9,ror#11 - eor r2,r2,r7 - add r8,r8,r3,ror#6 - eor r3,r9,r10 - eor r0,r0,r9,ror#20 - add r8,r8,r2 - ldr r2,[sp,#48] - and r12,r12,r3 - add r4,r4,r8 - add r8,r8,r0,ror#2 - eor r12,r12,r10 - vst1.32 {q8},[r1,:128]! - add r7,r7,r2 - eor r2,r5,r6 - eor r0,r4,r4,ror#5 - add r8,r8,r12 - vld1.32 {q8},[r14,:128]! 
- and r2,r2,r4 - eor r12,r0,r4,ror#19 - eor r0,r8,r8,ror#11 - eor r2,r2,r6 - vrev32.8 q3,q3 - add r7,r7,r12,ror#6 - eor r12,r8,r9 - eor r0,r0,r8,ror#20 - add r7,r7,r2 - vadd.i32 q8,q8,q3 - ldr r2,[sp,#52] - and r3,r3,r12 - add r11,r11,r7 - add r7,r7,r0,ror#2 - eor r3,r3,r9 - add r6,r6,r2 - eor r2,r4,r5 - eor r0,r11,r11,ror#5 - add r7,r7,r3 - and r2,r2,r11 - eor r3,r0,r11,ror#19 - eor r0,r7,r7,ror#11 - eor r2,r2,r5 - add r6,r6,r3,ror#6 - eor r3,r7,r8 - eor r0,r0,r7,ror#20 - add r6,r6,r2 - ldr r2,[sp,#56] - and r12,r12,r3 - add r10,r10,r6 - add r6,r6,r0,ror#2 - eor r12,r12,r8 - add r5,r5,r2 - eor r2,r11,r4 - eor r0,r10,r10,ror#5 - add r6,r6,r12 - and r2,r2,r10 - eor r12,r0,r10,ror#19 - eor r0,r6,r6,ror#11 - eor r2,r2,r4 - add r5,r5,r12,ror#6 - eor r12,r6,r7 - eor r0,r0,r6,ror#20 - add r5,r5,r2 - ldr r2,[sp,#60] - and r3,r3,r12 - add r9,r9,r5 - add r5,r5,r0,ror#2 - eor r3,r3,r7 - add r4,r4,r2 - eor r2,r10,r11 - eor r0,r9,r9,ror#5 - add r5,r5,r3 - and r2,r2,r9 - eor r3,r0,r9,ror#19 - eor r0,r5,r5,ror#11 - eor r2,r2,r11 - add r4,r4,r3,ror#6 - eor r3,r5,r6 - eor r0,r0,r5,ror#20 - add r4,r4,r2 - ldr r2,[sp,#64] - and r12,r12,r3 - add r8,r8,r4 - add r4,r4,r0,ror#2 - eor r12,r12,r6 - vst1.32 {q8},[r1,:128]! 
- ldr r0,[r2,#0] - add r4,r4,r12 @ h+=Maj(a,b,c) from the past - ldr r12,[r2,#4] - ldr r3,[r2,#8] - ldr r1,[r2,#12] - add r4,r4,r0 @ accumulate - ldr r0,[r2,#16] - add r5,r5,r12 - ldr r12,[r2,#20] - add r6,r6,r3 - ldr r3,[r2,#24] - add r7,r7,r1 - ldr r1,[r2,#28] - add r8,r8,r0 - str r4,[r2],#4 - add r9,r9,r12 - str r5,[r2],#4 - add r10,r10,r3 - str r6,[r2],#4 - add r11,r11,r1 - str r7,[r2],#4 - stmia r2,{r8,r9,r10,r11} - - ittte ne - movne r1,sp - ldrne r2,[sp,#0] - eorne r12,r12,r12 - ldreq sp,[sp,#76] @ restore original sp - itt ne - eorne r3,r5,r6 - bne .L_00_48 - - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} -.size sha256_block_data_order_neon,.-sha256_block_data_order_neon -#endif -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) - -# if defined(__thumb2__) -# define INST(a,b,c,d) .byte c,d|0xc,a,b -# else -# define INST(a,b,c,d) .byte a,b,c,d -# endif - -.type sha256_block_data_order_armv8,%function -.align 5 -sha256_block_data_order_armv8: -.LARMv8: - vld1.32 {q0,q1},[r0] - sub r3,r3,#256+32 - add r2,r1,r2,lsl#6 @ len to point at the end of inp - b .Loop_v8 - -.align 4 -.Loop_v8: - vld1.8 {q8,q9},[r1]! - vld1.8 {q10,q11},[r1]! - vld1.32 {q12},[r3]! - vrev32.8 q8,q8 - vrev32.8 q9,q9 - vrev32.8 q10,q10 - vrev32.8 q11,q11 - vmov q14,q0 @ offload - vmov q15,q1 - teq r1,r2 - vld1.32 {q13},[r3]! - vadd.i32 q12,q12,q8 - INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 - vmov q2,q0 - INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 - INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 - INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 - vld1.32 {q12},[r3]! - vadd.i32 q13,q13,q9 - INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 - vmov q2,q0 - INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 - INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 - INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 - vld1.32 {q13},[r3]! 
- vadd.i32 q12,q12,q10 - INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 - vmov q2,q0 - INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 - INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 - INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 - vld1.32 {q12},[r3]! - vadd.i32 q13,q13,q11 - INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 - vmov q2,q0 - INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 - INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 - INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 - vld1.32 {q13},[r3]! - vadd.i32 q12,q12,q8 - INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 - vmov q2,q0 - INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 - INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 - INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 - vld1.32 {q12},[r3]! - vadd.i32 q13,q13,q9 - INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 - vmov q2,q0 - INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 - INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 - INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 - vld1.32 {q13},[r3]! - vadd.i32 q12,q12,q10 - INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 - vmov q2,q0 - INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 - INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 - INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 - vld1.32 {q12},[r3]! - vadd.i32 q13,q13,q11 - INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 - vmov q2,q0 - INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 - INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 - INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 - vld1.32 {q13},[r3]! - vadd.i32 q12,q12,q8 - INST(0xe2,0x03,0xfa,0xf3) @ sha256su0 q8,q9 - vmov q2,q0 - INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 - INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 - INST(0xe6,0x0c,0x64,0xf3) @ sha256su1 q8,q10,q11 - vld1.32 {q12},[r3]! 
- vadd.i32 q13,q13,q9 - INST(0xe4,0x23,0xfa,0xf3) @ sha256su0 q9,q10 - vmov q2,q0 - INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 - INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 - INST(0xe0,0x2c,0x66,0xf3) @ sha256su1 q9,q11,q8 - vld1.32 {q13},[r3]! - vadd.i32 q12,q12,q10 - INST(0xe6,0x43,0xfa,0xf3) @ sha256su0 q10,q11 - vmov q2,q0 - INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 - INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 - INST(0xe2,0x4c,0x60,0xf3) @ sha256su1 q10,q8,q9 - vld1.32 {q12},[r3]! - vadd.i32 q13,q13,q11 - INST(0xe0,0x63,0xfa,0xf3) @ sha256su0 q11,q8 - vmov q2,q0 - INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 - INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 - INST(0xe4,0x6c,0x62,0xf3) @ sha256su1 q11,q9,q10 - vld1.32 {q13},[r3]! - vadd.i32 q12,q12,q8 - vmov q2,q0 - INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 - INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 - - vld1.32 {q12},[r3]! - vadd.i32 q13,q13,q9 - vmov q2,q0 - INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 - INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 - - vld1.32 {q13},[r3] - vadd.i32 q12,q12,q10 - sub r3,r3,#256-16 @ rewind - vmov q2,q0 - INST(0x68,0x0c,0x02,0xf3) @ sha256h q0,q1,q12 - INST(0x68,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q12 - - vadd.i32 q13,q13,q11 - vmov q2,q0 - INST(0x6a,0x0c,0x02,0xf3) @ sha256h q0,q1,q13 - INST(0x6a,0x2c,0x14,0xf3) @ sha256h2 q1,q2,q13 - - vadd.i32 q0,q0,q14 - vadd.i32 q1,q1,q15 - it ne - bne .Loop_v8 - - vst1.32 {q0,q1},[r0] - - bx lr @ bx lr -.size sha256_block_data_order_armv8,.-sha256_block_data_order_armv8 -#endif -.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,47,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 -.align 2 -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) -.comm OPENSSL_armcap_P,4,4 -.hidden OPENSSL_armcap_P -#endif -#endif -#endif // !OPENSSL_NO_ASM 
-.section .note.GNU-stack,"",%progbits diff --git a/third_party/boringssl/linux-arm/crypto/fipsmodule/sha512-armv4.S b/third_party/boringssl/linux-arm/crypto/fipsmodule/sha512-armv4.S deleted file mode 100644 index a06d41fe..00000000 --- a/third_party/boringssl/linux-arm/crypto/fipsmodule/sha512-armv4.S +++ /dev/null @@ -1,1894 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(__arm__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -@ Copyright 2007-2016 The OpenSSL Project Authors. All Rights Reserved. -@ -@ Licensed under the OpenSSL license (the "License"). You may not use -@ this file except in compliance with the License. You can obtain a copy -@ in the file LICENSE in the source distribution or at -@ https://www.openssl.org/source/license.html - - -@ ==================================================================== -@ Written by Andy Polyakov for the OpenSSL -@ project. The module is, however, dual licensed under OpenSSL and -@ CRYPTOGAMS licenses depending on where you obtain it. For further -@ details see http://www.openssl.org/~appro/cryptogams/. -@ -@ Permission to use under GPL terms is granted. -@ ==================================================================== - -@ SHA512 block procedure for ARMv4. September 2007. - -@ This code is ~4.5 (four and a half) times faster than code generated -@ by gcc 3.4 and it spends ~72 clock cycles per byte [on single-issue -@ Xscale PXA250 core]. -@ -@ July 2010. -@ -@ Rescheduling for dual-issue pipeline resulted in 6% improvement on -@ Cortex A8 core and ~40 cycles per processed byte. - -@ February 2011. 
-@ -@ Profiler-assisted and platform-specific optimization resulted in 7% -@ improvement on Coxtex A8 core and ~38 cycles per byte. - -@ March 2011. -@ -@ Add NEON implementation. On Cortex A8 it was measured to process -@ one byte in 23.3 cycles or ~60% faster than integer-only code. - -@ August 2012. -@ -@ Improve NEON performance by 12% on Snapdragon S4. In absolute -@ terms it's 22.6 cycles per byte, which is disappointing result. -@ Technical writers asserted that 3-way S4 pipeline can sustain -@ multiple NEON instructions per cycle, but dual NEON issue could -@ not be observed, see http://www.openssl.org/~appro/Snapdragon-S4.html -@ for further details. On side note Cortex-A15 processes one byte in -@ 16 cycles. - -@ Byte order [in]dependence. ========================================= -@ -@ Originally caller was expected to maintain specific *dword* order in -@ h[0-7], namely with most significant dword at *lower* address, which -@ was reflected in below two parameters as 0 and 4. Now caller is -@ expected to maintain native byte order for whole 64-bit values. -#ifndef __KERNEL__ -# include -# define VFP_ABI_PUSH vstmdb sp!,{d8-d15} -# define VFP_ABI_POP vldmia sp!,{d8-d15} -#else -# define __ARM_ARCH__ __LINUX_ARM_ARCH__ -# define __ARM_MAX_ARCH__ 7 -# define VFP_ABI_PUSH -# define VFP_ABI_POP -#endif - -@ Silence ARMv8 deprecated IT instruction warnings. This file is used by both -@ ARMv7 and ARMv8 processors and does not use ARMv8 instructions. 
-.arch armv7-a - -#ifdef __ARMEL__ -# define LO 0 -# define HI 4 -# define WORD64(hi0,lo0,hi1,lo1) .word lo0,hi0, lo1,hi1 -#else -# define HI 0 -# define LO 4 -# define WORD64(hi0,lo0,hi1,lo1) .word hi0,lo0, hi1,lo1 -#endif - -.text -#if defined(__thumb2__) -.syntax unified -.thumb -# define adrl adr -#else -.code 32 -#endif - -.type K512,%object -.align 5 -K512: - WORD64(0x428a2f98,0xd728ae22, 0x71374491,0x23ef65cd) - WORD64(0xb5c0fbcf,0xec4d3b2f, 0xe9b5dba5,0x8189dbbc) - WORD64(0x3956c25b,0xf348b538, 0x59f111f1,0xb605d019) - WORD64(0x923f82a4,0xaf194f9b, 0xab1c5ed5,0xda6d8118) - WORD64(0xd807aa98,0xa3030242, 0x12835b01,0x45706fbe) - WORD64(0x243185be,0x4ee4b28c, 0x550c7dc3,0xd5ffb4e2) - WORD64(0x72be5d74,0xf27b896f, 0x80deb1fe,0x3b1696b1) - WORD64(0x9bdc06a7,0x25c71235, 0xc19bf174,0xcf692694) - WORD64(0xe49b69c1,0x9ef14ad2, 0xefbe4786,0x384f25e3) - WORD64(0x0fc19dc6,0x8b8cd5b5, 0x240ca1cc,0x77ac9c65) - WORD64(0x2de92c6f,0x592b0275, 0x4a7484aa,0x6ea6e483) - WORD64(0x5cb0a9dc,0xbd41fbd4, 0x76f988da,0x831153b5) - WORD64(0x983e5152,0xee66dfab, 0xa831c66d,0x2db43210) - WORD64(0xb00327c8,0x98fb213f, 0xbf597fc7,0xbeef0ee4) - WORD64(0xc6e00bf3,0x3da88fc2, 0xd5a79147,0x930aa725) - WORD64(0x06ca6351,0xe003826f, 0x14292967,0x0a0e6e70) - WORD64(0x27b70a85,0x46d22ffc, 0x2e1b2138,0x5c26c926) - WORD64(0x4d2c6dfc,0x5ac42aed, 0x53380d13,0x9d95b3df) - WORD64(0x650a7354,0x8baf63de, 0x766a0abb,0x3c77b2a8) - WORD64(0x81c2c92e,0x47edaee6, 0x92722c85,0x1482353b) - WORD64(0xa2bfe8a1,0x4cf10364, 0xa81a664b,0xbc423001) - WORD64(0xc24b8b70,0xd0f89791, 0xc76c51a3,0x0654be30) - WORD64(0xd192e819,0xd6ef5218, 0xd6990624,0x5565a910) - WORD64(0xf40e3585,0x5771202a, 0x106aa070,0x32bbd1b8) - WORD64(0x19a4c116,0xb8d2d0c8, 0x1e376c08,0x5141ab53) - WORD64(0x2748774c,0xdf8eeb99, 0x34b0bcb5,0xe19b48a8) - WORD64(0x391c0cb3,0xc5c95a63, 0x4ed8aa4a,0xe3418acb) - WORD64(0x5b9cca4f,0x7763e373, 0x682e6ff3,0xd6b2b8a3) - WORD64(0x748f82ee,0x5defb2fc, 0x78a5636f,0x43172f60) - WORD64(0x84c87814,0xa1f0ab72, 
0x8cc70208,0x1a6439ec) - WORD64(0x90befffa,0x23631e28, 0xa4506ceb,0xde82bde9) - WORD64(0xbef9a3f7,0xb2c67915, 0xc67178f2,0xe372532b) - WORD64(0xca273ece,0xea26619c, 0xd186b8c7,0x21c0c207) - WORD64(0xeada7dd6,0xcde0eb1e, 0xf57d4f7f,0xee6ed178) - WORD64(0x06f067aa,0x72176fba, 0x0a637dc5,0xa2c898a6) - WORD64(0x113f9804,0xbef90dae, 0x1b710b35,0x131c471b) - WORD64(0x28db77f5,0x23047d84, 0x32caab7b,0x40c72493) - WORD64(0x3c9ebe0a,0x15c9bebc, 0x431d67c4,0x9c100d4c) - WORD64(0x4cc5d4be,0xcb3e42b6, 0x597f299c,0xfc657e2a) - WORD64(0x5fcb6fab,0x3ad6faec, 0x6c44198c,0x4a475817) -.size K512,.-K512 -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) -.LOPENSSL_armcap: -.word OPENSSL_armcap_P-.Lsha512_block_data_order -.skip 32-4 -#else -.skip 32 -#endif - -.globl sha512_block_data_order -.hidden sha512_block_data_order -.type sha512_block_data_order,%function -sha512_block_data_order: -.Lsha512_block_data_order: -#if __ARM_ARCH__<7 && !defined(__thumb2__) - sub r3,pc,#8 @ sha512_block_data_order -#else - adr r3,.Lsha512_block_data_order -#endif -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) - ldr r12,.LOPENSSL_armcap - ldr r12,[r3,r12] @ OPENSSL_armcap_P -#ifdef __APPLE__ - ldr r12,[r12] -#endif - tst r12,#ARMV7_NEON - bne .LNEON -#endif - add r2,r1,r2,lsl#7 @ len to point at the end of inp - stmdb sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} - sub r14,r3,#672 @ K512 - sub sp,sp,#9*8 - - ldr r7,[r0,#32+LO] - ldr r8,[r0,#32+HI] - ldr r9, [r0,#48+LO] - ldr r10, [r0,#48+HI] - ldr r11, [r0,#56+LO] - ldr r12, [r0,#56+HI] -.Loop: - str r9, [sp,#48+0] - str r10, [sp,#48+4] - str r11, [sp,#56+0] - str r12, [sp,#56+4] - ldr r5,[r0,#0+LO] - ldr r6,[r0,#0+HI] - ldr r3,[r0,#8+LO] - ldr r4,[r0,#8+HI] - ldr r9, [r0,#16+LO] - ldr r10, [r0,#16+HI] - ldr r11, [r0,#24+LO] - ldr r12, [r0,#24+HI] - str r3,[sp,#8+0] - str r4,[sp,#8+4] - str r9, [sp,#16+0] - str r10, [sp,#16+4] - str r11, [sp,#24+0] - str r12, [sp,#24+4] - ldr r3,[r0,#40+LO] - ldr r4,[r0,#40+HI] - str r3,[sp,#40+0] - str r4,[sp,#40+4] - 
-.L00_15: -#if __ARM_ARCH__<7 - ldrb r3,[r1,#7] - ldrb r9, [r1,#6] - ldrb r10, [r1,#5] - ldrb r11, [r1,#4] - ldrb r4,[r1,#3] - ldrb r12, [r1,#2] - orr r3,r3,r9,lsl#8 - ldrb r9, [r1,#1] - orr r3,r3,r10,lsl#16 - ldrb r10, [r1],#8 - orr r3,r3,r11,lsl#24 - orr r4,r4,r12,lsl#8 - orr r4,r4,r9,lsl#16 - orr r4,r4,r10,lsl#24 -#else - ldr r3,[r1,#4] - ldr r4,[r1],#8 -#ifdef __ARMEL__ - rev r3,r3 - rev r4,r4 -#endif -#endif - @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) - @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 - @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 - mov r9,r7,lsr#14 - str r3,[sp,#64+0] - mov r10,r8,lsr#14 - str r4,[sp,#64+4] - eor r9,r9,r8,lsl#18 - ldr r11,[sp,#56+0] @ h.lo - eor r10,r10,r7,lsl#18 - ldr r12,[sp,#56+4] @ h.hi - eor r9,r9,r7,lsr#18 - eor r10,r10,r8,lsr#18 - eor r9,r9,r8,lsl#14 - eor r10,r10,r7,lsl#14 - eor r9,r9,r8,lsr#9 - eor r10,r10,r7,lsr#9 - eor r9,r9,r7,lsl#23 - eor r10,r10,r8,lsl#23 @ Sigma1(e) - adds r3,r3,r9 - ldr r9,[sp,#40+0] @ f.lo - adc r4,r4,r10 @ T += Sigma1(e) - ldr r10,[sp,#40+4] @ f.hi - adds r3,r3,r11 - ldr r11,[sp,#48+0] @ g.lo - adc r4,r4,r12 @ T += h - ldr r12,[sp,#48+4] @ g.hi - - eor r9,r9,r11 - str r7,[sp,#32+0] - eor r10,r10,r12 - str r8,[sp,#32+4] - and r9,r9,r7 - str r5,[sp,#0+0] - and r10,r10,r8 - str r6,[sp,#0+4] - eor r9,r9,r11 - ldr r11,[r14,#LO] @ K[i].lo - eor r10,r10,r12 @ Ch(e,f,g) - ldr r12,[r14,#HI] @ K[i].hi - - adds r3,r3,r9 - ldr r7,[sp,#24+0] @ d.lo - adc r4,r4,r10 @ T += Ch(e,f,g) - ldr r8,[sp,#24+4] @ d.hi - adds r3,r3,r11 - and r9,r11,#0xff - adc r4,r4,r12 @ T += K[i] - adds r7,r7,r3 - ldr r11,[sp,#8+0] @ b.lo - adc r8,r8,r4 @ d += T - teq r9,#148 - - ldr r12,[sp,#16+0] @ c.lo -#if __ARM_ARCH__>=7 - it eq @ Thumb2 thing, sanity check in ARM -#endif - orreq r14,r14,#1 - @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) - @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 - @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 - mov r9,r5,lsr#28 - mov r10,r6,lsr#28 - eor r9,r9,r6,lsl#4 
- eor r10,r10,r5,lsl#4 - eor r9,r9,r6,lsr#2 - eor r10,r10,r5,lsr#2 - eor r9,r9,r5,lsl#30 - eor r10,r10,r6,lsl#30 - eor r9,r9,r6,lsr#7 - eor r10,r10,r5,lsr#7 - eor r9,r9,r5,lsl#25 - eor r10,r10,r6,lsl#25 @ Sigma0(a) - adds r3,r3,r9 - and r9,r5,r11 - adc r4,r4,r10 @ T += Sigma0(a) - - ldr r10,[sp,#8+4] @ b.hi - orr r5,r5,r11 - ldr r11,[sp,#16+4] @ c.hi - and r5,r5,r12 - and r12,r6,r10 - orr r6,r6,r10 - orr r5,r5,r9 @ Maj(a,b,c).lo - and r6,r6,r11 - adds r5,r5,r3 - orr r6,r6,r12 @ Maj(a,b,c).hi - sub sp,sp,#8 - adc r6,r6,r4 @ h += T - tst r14,#1 - add r14,r14,#8 - tst r14,#1 - beq .L00_15 - ldr r9,[sp,#184+0] - ldr r10,[sp,#184+4] - bic r14,r14,#1 -.L16_79: - @ sigma0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7)) - @ LO lo>>1^hi<<31 ^ lo>>8^hi<<24 ^ lo>>7^hi<<25 - @ HI hi>>1^lo<<31 ^ hi>>8^lo<<24 ^ hi>>7 - mov r3,r9,lsr#1 - ldr r11,[sp,#80+0] - mov r4,r10,lsr#1 - ldr r12,[sp,#80+4] - eor r3,r3,r10,lsl#31 - eor r4,r4,r9,lsl#31 - eor r3,r3,r9,lsr#8 - eor r4,r4,r10,lsr#8 - eor r3,r3,r10,lsl#24 - eor r4,r4,r9,lsl#24 - eor r3,r3,r9,lsr#7 - eor r4,r4,r10,lsr#7 - eor r3,r3,r10,lsl#25 - - @ sigma1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6)) - @ LO lo>>19^hi<<13 ^ hi>>29^lo<<3 ^ lo>>6^hi<<26 - @ HI hi>>19^lo<<13 ^ lo>>29^hi<<3 ^ hi>>6 - mov r9,r11,lsr#19 - mov r10,r12,lsr#19 - eor r9,r9,r12,lsl#13 - eor r10,r10,r11,lsl#13 - eor r9,r9,r12,lsr#29 - eor r10,r10,r11,lsr#29 - eor r9,r9,r11,lsl#3 - eor r10,r10,r12,lsl#3 - eor r9,r9,r11,lsr#6 - eor r10,r10,r12,lsr#6 - ldr r11,[sp,#120+0] - eor r9,r9,r12,lsl#26 - - ldr r12,[sp,#120+4] - adds r3,r3,r9 - ldr r9,[sp,#192+0] - adc r4,r4,r10 - - ldr r10,[sp,#192+4] - adds r3,r3,r11 - adc r4,r4,r12 - adds r3,r3,r9 - adc r4,r4,r10 - @ Sigma1(x) (ROTR((x),14) ^ ROTR((x),18) ^ ROTR((x),41)) - @ LO lo>>14^hi<<18 ^ lo>>18^hi<<14 ^ hi>>9^lo<<23 - @ HI hi>>14^lo<<18 ^ hi>>18^lo<<14 ^ lo>>9^hi<<23 - mov r9,r7,lsr#14 - str r3,[sp,#64+0] - mov r10,r8,lsr#14 - str r4,[sp,#64+4] - eor r9,r9,r8,lsl#18 - ldr r11,[sp,#56+0] @ h.lo - eor r10,r10,r7,lsl#18 - 
ldr r12,[sp,#56+4] @ h.hi - eor r9,r9,r7,lsr#18 - eor r10,r10,r8,lsr#18 - eor r9,r9,r8,lsl#14 - eor r10,r10,r7,lsl#14 - eor r9,r9,r8,lsr#9 - eor r10,r10,r7,lsr#9 - eor r9,r9,r7,lsl#23 - eor r10,r10,r8,lsl#23 @ Sigma1(e) - adds r3,r3,r9 - ldr r9,[sp,#40+0] @ f.lo - adc r4,r4,r10 @ T += Sigma1(e) - ldr r10,[sp,#40+4] @ f.hi - adds r3,r3,r11 - ldr r11,[sp,#48+0] @ g.lo - adc r4,r4,r12 @ T += h - ldr r12,[sp,#48+4] @ g.hi - - eor r9,r9,r11 - str r7,[sp,#32+0] - eor r10,r10,r12 - str r8,[sp,#32+4] - and r9,r9,r7 - str r5,[sp,#0+0] - and r10,r10,r8 - str r6,[sp,#0+4] - eor r9,r9,r11 - ldr r11,[r14,#LO] @ K[i].lo - eor r10,r10,r12 @ Ch(e,f,g) - ldr r12,[r14,#HI] @ K[i].hi - - adds r3,r3,r9 - ldr r7,[sp,#24+0] @ d.lo - adc r4,r4,r10 @ T += Ch(e,f,g) - ldr r8,[sp,#24+4] @ d.hi - adds r3,r3,r11 - and r9,r11,#0xff - adc r4,r4,r12 @ T += K[i] - adds r7,r7,r3 - ldr r11,[sp,#8+0] @ b.lo - adc r8,r8,r4 @ d += T - teq r9,#23 - - ldr r12,[sp,#16+0] @ c.lo -#if __ARM_ARCH__>=7 - it eq @ Thumb2 thing, sanity check in ARM -#endif - orreq r14,r14,#1 - @ Sigma0(x) (ROTR((x),28) ^ ROTR((x),34) ^ ROTR((x),39)) - @ LO lo>>28^hi<<4 ^ hi>>2^lo<<30 ^ hi>>7^lo<<25 - @ HI hi>>28^lo<<4 ^ lo>>2^hi<<30 ^ lo>>7^hi<<25 - mov r9,r5,lsr#28 - mov r10,r6,lsr#28 - eor r9,r9,r6,lsl#4 - eor r10,r10,r5,lsl#4 - eor r9,r9,r6,lsr#2 - eor r10,r10,r5,lsr#2 - eor r9,r9,r5,lsl#30 - eor r10,r10,r6,lsl#30 - eor r9,r9,r6,lsr#7 - eor r10,r10,r5,lsr#7 - eor r9,r9,r5,lsl#25 - eor r10,r10,r6,lsl#25 @ Sigma0(a) - adds r3,r3,r9 - and r9,r5,r11 - adc r4,r4,r10 @ T += Sigma0(a) - - ldr r10,[sp,#8+4] @ b.hi - orr r5,r5,r11 - ldr r11,[sp,#16+4] @ c.hi - and r5,r5,r12 - and r12,r6,r10 - orr r6,r6,r10 - orr r5,r5,r9 @ Maj(a,b,c).lo - and r6,r6,r11 - adds r5,r5,r3 - orr r6,r6,r12 @ Maj(a,b,c).hi - sub sp,sp,#8 - adc r6,r6,r4 @ h += T - tst r14,#1 - add r14,r14,#8 -#if __ARM_ARCH__>=7 - ittt eq @ Thumb2 thing, sanity check in ARM -#endif - ldreq r9,[sp,#184+0] - ldreq r10,[sp,#184+4] - beq .L16_79 - bic r14,r14,#1 - - ldr 
r3,[sp,#8+0] - ldr r4,[sp,#8+4] - ldr r9, [r0,#0+LO] - ldr r10, [r0,#0+HI] - ldr r11, [r0,#8+LO] - ldr r12, [r0,#8+HI] - adds r9,r5,r9 - str r9, [r0,#0+LO] - adc r10,r6,r10 - str r10, [r0,#0+HI] - adds r11,r3,r11 - str r11, [r0,#8+LO] - adc r12,r4,r12 - str r12, [r0,#8+HI] - - ldr r5,[sp,#16+0] - ldr r6,[sp,#16+4] - ldr r3,[sp,#24+0] - ldr r4,[sp,#24+4] - ldr r9, [r0,#16+LO] - ldr r10, [r0,#16+HI] - ldr r11, [r0,#24+LO] - ldr r12, [r0,#24+HI] - adds r9,r5,r9 - str r9, [r0,#16+LO] - adc r10,r6,r10 - str r10, [r0,#16+HI] - adds r11,r3,r11 - str r11, [r0,#24+LO] - adc r12,r4,r12 - str r12, [r0,#24+HI] - - ldr r3,[sp,#40+0] - ldr r4,[sp,#40+4] - ldr r9, [r0,#32+LO] - ldr r10, [r0,#32+HI] - ldr r11, [r0,#40+LO] - ldr r12, [r0,#40+HI] - adds r7,r7,r9 - str r7,[r0,#32+LO] - adc r8,r8,r10 - str r8,[r0,#32+HI] - adds r11,r3,r11 - str r11, [r0,#40+LO] - adc r12,r4,r12 - str r12, [r0,#40+HI] - - ldr r5,[sp,#48+0] - ldr r6,[sp,#48+4] - ldr r3,[sp,#56+0] - ldr r4,[sp,#56+4] - ldr r9, [r0,#48+LO] - ldr r10, [r0,#48+HI] - ldr r11, [r0,#56+LO] - ldr r12, [r0,#56+HI] - adds r9,r5,r9 - str r9, [r0,#48+LO] - adc r10,r6,r10 - str r10, [r0,#48+HI] - adds r11,r3,r11 - str r11, [r0,#56+LO] - adc r12,r4,r12 - str r12, [r0,#56+HI] - - add sp,sp,#640 - sub r14,r14,#640 - - teq r1,r2 - bne .Loop - - add sp,sp,#8*9 @ destroy frame -#if __ARM_ARCH__>=5 - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,pc} -#else - ldmia sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr} - tst lr,#1 - moveq pc,lr @ be binary compatible with V4, yet -.word 0xe12fff1e @ interoperable with Thumb ISA:-) -#endif -.size sha512_block_data_order,.-sha512_block_data_order -#if __ARM_MAX_ARCH__>=7 -.arch armv7-a -.fpu neon - -.globl sha512_block_data_order_neon -.hidden sha512_block_data_order_neon -.type sha512_block_data_order_neon,%function -.align 4 -sha512_block_data_order_neon: -.LNEON: - dmb @ errata #451034 on early Cortex A8 - add r2,r1,r2,lsl#7 @ len to point at the end of inp - adr r3,K512 - VFP_ABI_PUSH - vldmia 
r0,{d16,d17,d18,d19,d20,d21,d22,d23} @ load context -.Loop_neon: - vshr.u64 d24,d20,#14 @ 0 -#if 0<16 - vld1.64 {d0},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d20,#18 -#if 0>0 - vadd.i64 d16,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d20,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d20,#50 - vsli.64 d25,d20,#46 - vmov d29,d20 - vsli.64 d26,d20,#23 -#if 0<16 && defined(__ARMEL__) - vrev64.8 d0,d0 -#endif - veor d25,d24 - vbsl d29,d21,d22 @ Ch(e,f,g) - vshr.u64 d24,d16,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d23 - vshr.u64 d25,d16,#34 - vsli.64 d24,d16,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d16,#39 - vadd.i64 d28,d0 - vsli.64 d25,d16,#30 - veor d30,d16,d17 - vsli.64 d26,d16,#25 - veor d23,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d18,d17 @ Maj(a,b,c) - veor d23,d26 @ Sigma0(a) - vadd.i64 d19,d27 - vadd.i64 d30,d27 - @ vadd.i64 d23,d30 - vshr.u64 d24,d19,#14 @ 1 -#if 1<16 - vld1.64 {d1},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d19,#18 -#if 1>0 - vadd.i64 d23,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d19,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d19,#50 - vsli.64 d25,d19,#46 - vmov d29,d19 - vsli.64 d26,d19,#23 -#if 1<16 && defined(__ARMEL__) - vrev64.8 d1,d1 -#endif - veor d25,d24 - vbsl d29,d20,d21 @ Ch(e,f,g) - vshr.u64 d24,d23,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d22 - vshr.u64 d25,d23,#34 - vsli.64 d24,d23,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d23,#39 - vadd.i64 d28,d1 - vsli.64 d25,d23,#30 - veor d30,d23,d16 - vsli.64 d26,d23,#25 - veor d22,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d17,d16 @ Maj(a,b,c) - veor d22,d26 @ Sigma0(a) - vadd.i64 d18,d27 - vadd.i64 d30,d27 - @ vadd.i64 d22,d30 - vshr.u64 d24,d18,#14 @ 2 -#if 2<16 - vld1.64 {d2},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d18,#18 -#if 2>0 - vadd.i64 d22,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d18,#41 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d18,#50 - vsli.64 d25,d18,#46 - vmov d29,d18 - vsli.64 d26,d18,#23 -#if 2<16 && defined(__ARMEL__) - vrev64.8 d2,d2 -#endif - veor d25,d24 - vbsl d29,d19,d20 @ Ch(e,f,g) - vshr.u64 d24,d22,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d21 - vshr.u64 d25,d22,#34 - vsli.64 d24,d22,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d22,#39 - vadd.i64 d28,d2 - vsli.64 d25,d22,#30 - veor d30,d22,d23 - vsli.64 d26,d22,#25 - veor d21,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d16,d23 @ Maj(a,b,c) - veor d21,d26 @ Sigma0(a) - vadd.i64 d17,d27 - vadd.i64 d30,d27 - @ vadd.i64 d21,d30 - vshr.u64 d24,d17,#14 @ 3 -#if 3<16 - vld1.64 {d3},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d17,#18 -#if 3>0 - vadd.i64 d21,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d17,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d17,#50 - vsli.64 d25,d17,#46 - vmov d29,d17 - vsli.64 d26,d17,#23 -#if 3<16 && defined(__ARMEL__) - vrev64.8 d3,d3 -#endif - veor d25,d24 - vbsl d29,d18,d19 @ Ch(e,f,g) - vshr.u64 d24,d21,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d20 - vshr.u64 d25,d21,#34 - vsli.64 d24,d21,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d21,#39 - vadd.i64 d28,d3 - vsli.64 d25,d21,#30 - veor d30,d21,d22 - vsli.64 d26,d21,#25 - veor d20,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d23,d22 @ Maj(a,b,c) - veor d20,d26 @ Sigma0(a) - vadd.i64 d16,d27 - vadd.i64 d30,d27 - @ vadd.i64 d20,d30 - vshr.u64 d24,d16,#14 @ 4 -#if 4<16 - vld1.64 {d4},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d16,#18 -#if 4>0 - vadd.i64 d20,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d16,#41 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d16,#50 - vsli.64 d25,d16,#46 - vmov d29,d16 - vsli.64 d26,d16,#23 -#if 4<16 && defined(__ARMEL__) - vrev64.8 d4,d4 -#endif - veor d25,d24 - vbsl d29,d17,d18 @ Ch(e,f,g) - vshr.u64 d24,d20,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d19 - vshr.u64 d25,d20,#34 - vsli.64 d24,d20,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d20,#39 - vadd.i64 d28,d4 - vsli.64 d25,d20,#30 - veor d30,d20,d21 - vsli.64 d26,d20,#25 - veor d19,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d22,d21 @ Maj(a,b,c) - veor d19,d26 @ Sigma0(a) - vadd.i64 d23,d27 - vadd.i64 d30,d27 - @ vadd.i64 d19,d30 - vshr.u64 d24,d23,#14 @ 5 -#if 5<16 - vld1.64 {d5},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d23,#18 -#if 5>0 - vadd.i64 d19,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d23,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d23,#50 - vsli.64 d25,d23,#46 - vmov d29,d23 - vsli.64 d26,d23,#23 -#if 5<16 && defined(__ARMEL__) - vrev64.8 d5,d5 -#endif - veor d25,d24 - vbsl d29,d16,d17 @ Ch(e,f,g) - vshr.u64 d24,d19,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d18 - vshr.u64 d25,d19,#34 - vsli.64 d24,d19,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d19,#39 - vadd.i64 d28,d5 - vsli.64 d25,d19,#30 - veor d30,d19,d20 - vsli.64 d26,d19,#25 - veor d18,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d21,d20 @ Maj(a,b,c) - veor d18,d26 @ Sigma0(a) - vadd.i64 d22,d27 - vadd.i64 d30,d27 - @ vadd.i64 d18,d30 - vshr.u64 d24,d22,#14 @ 6 -#if 6<16 - vld1.64 {d6},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d22,#18 -#if 6>0 - vadd.i64 d18,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d22,#41 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d22,#50 - vsli.64 d25,d22,#46 - vmov d29,d22 - vsli.64 d26,d22,#23 -#if 6<16 && defined(__ARMEL__) - vrev64.8 d6,d6 -#endif - veor d25,d24 - vbsl d29,d23,d16 @ Ch(e,f,g) - vshr.u64 d24,d18,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d17 - vshr.u64 d25,d18,#34 - vsli.64 d24,d18,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d18,#39 - vadd.i64 d28,d6 - vsli.64 d25,d18,#30 - veor d30,d18,d19 - vsli.64 d26,d18,#25 - veor d17,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d20,d19 @ Maj(a,b,c) - veor d17,d26 @ Sigma0(a) - vadd.i64 d21,d27 - vadd.i64 d30,d27 - @ vadd.i64 d17,d30 - vshr.u64 d24,d21,#14 @ 7 -#if 7<16 - vld1.64 {d7},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d21,#18 -#if 7>0 - vadd.i64 d17,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d21,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d21,#50 - vsli.64 d25,d21,#46 - vmov d29,d21 - vsli.64 d26,d21,#23 -#if 7<16 && defined(__ARMEL__) - vrev64.8 d7,d7 -#endif - veor d25,d24 - vbsl d29,d22,d23 @ Ch(e,f,g) - vshr.u64 d24,d17,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d16 - vshr.u64 d25,d17,#34 - vsli.64 d24,d17,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d17,#39 - vadd.i64 d28,d7 - vsli.64 d25,d17,#30 - veor d30,d17,d18 - vsli.64 d26,d17,#25 - veor d16,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d19,d18 @ Maj(a,b,c) - veor d16,d26 @ Sigma0(a) - vadd.i64 d20,d27 - vadd.i64 d30,d27 - @ vadd.i64 d16,d30 - vshr.u64 d24,d20,#14 @ 8 -#if 8<16 - vld1.64 {d8},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d20,#18 -#if 8>0 - vadd.i64 d16,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d20,#41 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d20,#50 - vsli.64 d25,d20,#46 - vmov d29,d20 - vsli.64 d26,d20,#23 -#if 8<16 && defined(__ARMEL__) - vrev64.8 d8,d8 -#endif - veor d25,d24 - vbsl d29,d21,d22 @ Ch(e,f,g) - vshr.u64 d24,d16,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d23 - vshr.u64 d25,d16,#34 - vsli.64 d24,d16,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d16,#39 - vadd.i64 d28,d8 - vsli.64 d25,d16,#30 - veor d30,d16,d17 - vsli.64 d26,d16,#25 - veor d23,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d18,d17 @ Maj(a,b,c) - veor d23,d26 @ Sigma0(a) - vadd.i64 d19,d27 - vadd.i64 d30,d27 - @ vadd.i64 d23,d30 - vshr.u64 d24,d19,#14 @ 9 -#if 9<16 - vld1.64 {d9},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d19,#18 -#if 9>0 - vadd.i64 d23,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d19,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d19,#50 - vsli.64 d25,d19,#46 - vmov d29,d19 - vsli.64 d26,d19,#23 -#if 9<16 && defined(__ARMEL__) - vrev64.8 d9,d9 -#endif - veor d25,d24 - vbsl d29,d20,d21 @ Ch(e,f,g) - vshr.u64 d24,d23,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d22 - vshr.u64 d25,d23,#34 - vsli.64 d24,d23,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d23,#39 - vadd.i64 d28,d9 - vsli.64 d25,d23,#30 - veor d30,d23,d16 - vsli.64 d26,d23,#25 - veor d22,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d17,d16 @ Maj(a,b,c) - veor d22,d26 @ Sigma0(a) - vadd.i64 d18,d27 - vadd.i64 d30,d27 - @ vadd.i64 d22,d30 - vshr.u64 d24,d18,#14 @ 10 -#if 10<16 - vld1.64 {d10},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d18,#18 -#if 10>0 - vadd.i64 d22,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d18,#41 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d18,#50 - vsli.64 d25,d18,#46 - vmov d29,d18 - vsli.64 d26,d18,#23 -#if 10<16 && defined(__ARMEL__) - vrev64.8 d10,d10 -#endif - veor d25,d24 - vbsl d29,d19,d20 @ Ch(e,f,g) - vshr.u64 d24,d22,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d21 - vshr.u64 d25,d22,#34 - vsli.64 d24,d22,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d22,#39 - vadd.i64 d28,d10 - vsli.64 d25,d22,#30 - veor d30,d22,d23 - vsli.64 d26,d22,#25 - veor d21,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d16,d23 @ Maj(a,b,c) - veor d21,d26 @ Sigma0(a) - vadd.i64 d17,d27 - vadd.i64 d30,d27 - @ vadd.i64 d21,d30 - vshr.u64 d24,d17,#14 @ 11 -#if 11<16 - vld1.64 {d11},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d17,#18 -#if 11>0 - vadd.i64 d21,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d17,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d17,#50 - vsli.64 d25,d17,#46 - vmov d29,d17 - vsli.64 d26,d17,#23 -#if 11<16 && defined(__ARMEL__) - vrev64.8 d11,d11 -#endif - veor d25,d24 - vbsl d29,d18,d19 @ Ch(e,f,g) - vshr.u64 d24,d21,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d20 - vshr.u64 d25,d21,#34 - vsli.64 d24,d21,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d21,#39 - vadd.i64 d28,d11 - vsli.64 d25,d21,#30 - veor d30,d21,d22 - vsli.64 d26,d21,#25 - veor d20,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d23,d22 @ Maj(a,b,c) - veor d20,d26 @ Sigma0(a) - vadd.i64 d16,d27 - vadd.i64 d30,d27 - @ vadd.i64 d20,d30 - vshr.u64 d24,d16,#14 @ 12 -#if 12<16 - vld1.64 {d12},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d16,#18 -#if 12>0 - vadd.i64 d20,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d16,#41 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d16,#50 - vsli.64 d25,d16,#46 - vmov d29,d16 - vsli.64 d26,d16,#23 -#if 12<16 && defined(__ARMEL__) - vrev64.8 d12,d12 -#endif - veor d25,d24 - vbsl d29,d17,d18 @ Ch(e,f,g) - vshr.u64 d24,d20,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d19 - vshr.u64 d25,d20,#34 - vsli.64 d24,d20,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d20,#39 - vadd.i64 d28,d12 - vsli.64 d25,d20,#30 - veor d30,d20,d21 - vsli.64 d26,d20,#25 - veor d19,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d22,d21 @ Maj(a,b,c) - veor d19,d26 @ Sigma0(a) - vadd.i64 d23,d27 - vadd.i64 d30,d27 - @ vadd.i64 d19,d30 - vshr.u64 d24,d23,#14 @ 13 -#if 13<16 - vld1.64 {d13},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d23,#18 -#if 13>0 - vadd.i64 d19,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d23,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d23,#50 - vsli.64 d25,d23,#46 - vmov d29,d23 - vsli.64 d26,d23,#23 -#if 13<16 && defined(__ARMEL__) - vrev64.8 d13,d13 -#endif - veor d25,d24 - vbsl d29,d16,d17 @ Ch(e,f,g) - vshr.u64 d24,d19,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d18 - vshr.u64 d25,d19,#34 - vsli.64 d24,d19,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d19,#39 - vadd.i64 d28,d13 - vsli.64 d25,d19,#30 - veor d30,d19,d20 - vsli.64 d26,d19,#25 - veor d18,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d21,d20 @ Maj(a,b,c) - veor d18,d26 @ Sigma0(a) - vadd.i64 d22,d27 - vadd.i64 d30,d27 - @ vadd.i64 d18,d30 - vshr.u64 d24,d22,#14 @ 14 -#if 14<16 - vld1.64 {d14},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d22,#18 -#if 14>0 - vadd.i64 d18,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d22,#41 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d22,#50 - vsli.64 d25,d22,#46 - vmov d29,d22 - vsli.64 d26,d22,#23 -#if 14<16 && defined(__ARMEL__) - vrev64.8 d14,d14 -#endif - veor d25,d24 - vbsl d29,d23,d16 @ Ch(e,f,g) - vshr.u64 d24,d18,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d17 - vshr.u64 d25,d18,#34 - vsli.64 d24,d18,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d18,#39 - vadd.i64 d28,d14 - vsli.64 d25,d18,#30 - veor d30,d18,d19 - vsli.64 d26,d18,#25 - veor d17,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d20,d19 @ Maj(a,b,c) - veor d17,d26 @ Sigma0(a) - vadd.i64 d21,d27 - vadd.i64 d30,d27 - @ vadd.i64 d17,d30 - vshr.u64 d24,d21,#14 @ 15 -#if 15<16 - vld1.64 {d15},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d21,#18 -#if 15>0 - vadd.i64 d17,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d21,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d21,#50 - vsli.64 d25,d21,#46 - vmov d29,d21 - vsli.64 d26,d21,#23 -#if 15<16 && defined(__ARMEL__) - vrev64.8 d15,d15 -#endif - veor d25,d24 - vbsl d29,d22,d23 @ Ch(e,f,g) - vshr.u64 d24,d17,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d16 - vshr.u64 d25,d17,#34 - vsli.64 d24,d17,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d17,#39 - vadd.i64 d28,d15 - vsli.64 d25,d17,#30 - veor d30,d17,d18 - vsli.64 d26,d17,#25 - veor d16,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d19,d18 @ Maj(a,b,c) - veor d16,d26 @ Sigma0(a) - vadd.i64 d20,d27 - vadd.i64 d30,d27 - @ vadd.i64 d16,d30 - mov r12,#4 -.L16_79_neon: - subs r12,#1 - vshr.u64 q12,q7,#19 - vshr.u64 q13,q7,#61 - vadd.i64 d16,d30 @ h+=Maj from the past - vshr.u64 q15,q7,#6 - vsli.64 q12,q7,#45 - vext.8 q14,q0,q1,#8 @ X[i+1] - vsli.64 q13,q7,#3 - veor q15,q12 - vshr.u64 q12,q14,#1 - veor q15,q13 @ sigma1(X[i+14]) - vshr.u64 q13,q14,#8 - vadd.i64 q0,q15 - vshr.u64 q15,q14,#7 - vsli.64 q12,q14,#63 - vsli.64 q13,q14,#56 - vext.8 q14,q4,q5,#8 @ X[i+9] - veor q15,q12 - vshr.u64 d24,d20,#14 @ from NEON_00_15 - vadd.i64 q0,q14 - vshr.u64 d25,d20,#18 @ from NEON_00_15 - veor q15,q13 @ sigma0(X[i+1]) - vshr.u64 
d26,d20,#41 @ from NEON_00_15 - vadd.i64 q0,q15 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d20,#50 - vsli.64 d25,d20,#46 - vmov d29,d20 - vsli.64 d26,d20,#23 -#if 16<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d21,d22 @ Ch(e,f,g) - vshr.u64 d24,d16,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d23 - vshr.u64 d25,d16,#34 - vsli.64 d24,d16,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d16,#39 - vadd.i64 d28,d0 - vsli.64 d25,d16,#30 - veor d30,d16,d17 - vsli.64 d26,d16,#25 - veor d23,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d18,d17 @ Maj(a,b,c) - veor d23,d26 @ Sigma0(a) - vadd.i64 d19,d27 - vadd.i64 d30,d27 - @ vadd.i64 d23,d30 - vshr.u64 d24,d19,#14 @ 17 -#if 17<16 - vld1.64 {d1},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d19,#18 -#if 17>0 - vadd.i64 d23,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d19,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d19,#50 - vsli.64 d25,d19,#46 - vmov d29,d19 - vsli.64 d26,d19,#23 -#if 17<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d20,d21 @ Ch(e,f,g) - vshr.u64 d24,d23,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d22 - vshr.u64 d25,d23,#34 - vsli.64 d24,d23,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d23,#39 - vadd.i64 d28,d1 - vsli.64 d25,d23,#30 - veor d30,d23,d16 - vsli.64 d26,d23,#25 - veor d22,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d17,d16 @ Maj(a,b,c) - veor d22,d26 @ Sigma0(a) - vadd.i64 d18,d27 - vadd.i64 d30,d27 - @ vadd.i64 d22,d30 - vshr.u64 q12,q0,#19 - vshr.u64 q13,q0,#61 - vadd.i64 d22,d30 @ h+=Maj from the past - vshr.u64 q15,q0,#6 - vsli.64 q12,q0,#45 - vext.8 q14,q1,q2,#8 @ X[i+1] - vsli.64 q13,q0,#3 - veor q15,q12 - vshr.u64 q12,q14,#1 - veor q15,q13 @ sigma1(X[i+14]) - vshr.u64 q13,q14,#8 - vadd.i64 q1,q15 - vshr.u64 q15,q14,#7 - vsli.64 q12,q14,#63 - vsli.64 q13,q14,#56 - vext.8 q14,q5,q6,#8 @ X[i+9] - veor q15,q12 - vshr.u64 d24,d18,#14 @ from NEON_00_15 - vadd.i64 q1,q14 - vshr.u64 d25,d18,#18 @ from NEON_00_15 - veor q15,q13 @ 
sigma0(X[i+1]) - vshr.u64 d26,d18,#41 @ from NEON_00_15 - vadd.i64 q1,q15 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d18,#50 - vsli.64 d25,d18,#46 - vmov d29,d18 - vsli.64 d26,d18,#23 -#if 18<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d19,d20 @ Ch(e,f,g) - vshr.u64 d24,d22,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d21 - vshr.u64 d25,d22,#34 - vsli.64 d24,d22,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d22,#39 - vadd.i64 d28,d2 - vsli.64 d25,d22,#30 - veor d30,d22,d23 - vsli.64 d26,d22,#25 - veor d21,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d16,d23 @ Maj(a,b,c) - veor d21,d26 @ Sigma0(a) - vadd.i64 d17,d27 - vadd.i64 d30,d27 - @ vadd.i64 d21,d30 - vshr.u64 d24,d17,#14 @ 19 -#if 19<16 - vld1.64 {d3},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d17,#18 -#if 19>0 - vadd.i64 d21,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d17,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d17,#50 - vsli.64 d25,d17,#46 - vmov d29,d17 - vsli.64 d26,d17,#23 -#if 19<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d18,d19 @ Ch(e,f,g) - vshr.u64 d24,d21,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d20 - vshr.u64 d25,d21,#34 - vsli.64 d24,d21,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d21,#39 - vadd.i64 d28,d3 - vsli.64 d25,d21,#30 - veor d30,d21,d22 - vsli.64 d26,d21,#25 - veor d20,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d23,d22 @ Maj(a,b,c) - veor d20,d26 @ Sigma0(a) - vadd.i64 d16,d27 - vadd.i64 d30,d27 - @ vadd.i64 d20,d30 - vshr.u64 q12,q1,#19 - vshr.u64 q13,q1,#61 - vadd.i64 d20,d30 @ h+=Maj from the past - vshr.u64 q15,q1,#6 - vsli.64 q12,q1,#45 - vext.8 q14,q2,q3,#8 @ X[i+1] - vsli.64 q13,q1,#3 - veor q15,q12 - vshr.u64 q12,q14,#1 - veor q15,q13 @ sigma1(X[i+14]) - vshr.u64 q13,q14,#8 - vadd.i64 q2,q15 - vshr.u64 q15,q14,#7 - vsli.64 q12,q14,#63 - vsli.64 q13,q14,#56 - vext.8 q14,q6,q7,#8 @ X[i+9] - veor q15,q12 - vshr.u64 d24,d16,#14 @ from NEON_00_15 - vadd.i64 q2,q14 - vshr.u64 d25,d16,#18 @ from NEON_00_15 - 
veor q15,q13 @ sigma0(X[i+1]) - vshr.u64 d26,d16,#41 @ from NEON_00_15 - vadd.i64 q2,q15 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d16,#50 - vsli.64 d25,d16,#46 - vmov d29,d16 - vsli.64 d26,d16,#23 -#if 20<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d17,d18 @ Ch(e,f,g) - vshr.u64 d24,d20,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d19 - vshr.u64 d25,d20,#34 - vsli.64 d24,d20,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d20,#39 - vadd.i64 d28,d4 - vsli.64 d25,d20,#30 - veor d30,d20,d21 - vsli.64 d26,d20,#25 - veor d19,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d22,d21 @ Maj(a,b,c) - veor d19,d26 @ Sigma0(a) - vadd.i64 d23,d27 - vadd.i64 d30,d27 - @ vadd.i64 d19,d30 - vshr.u64 d24,d23,#14 @ 21 -#if 21<16 - vld1.64 {d5},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d23,#18 -#if 21>0 - vadd.i64 d19,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d23,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d23,#50 - vsli.64 d25,d23,#46 - vmov d29,d23 - vsli.64 d26,d23,#23 -#if 21<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d16,d17 @ Ch(e,f,g) - vshr.u64 d24,d19,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d18 - vshr.u64 d25,d19,#34 - vsli.64 d24,d19,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d19,#39 - vadd.i64 d28,d5 - vsli.64 d25,d19,#30 - veor d30,d19,d20 - vsli.64 d26,d19,#25 - veor d18,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d21,d20 @ Maj(a,b,c) - veor d18,d26 @ Sigma0(a) - vadd.i64 d22,d27 - vadd.i64 d30,d27 - @ vadd.i64 d18,d30 - vshr.u64 q12,q2,#19 - vshr.u64 q13,q2,#61 - vadd.i64 d18,d30 @ h+=Maj from the past - vshr.u64 q15,q2,#6 - vsli.64 q12,q2,#45 - vext.8 q14,q3,q4,#8 @ X[i+1] - vsli.64 q13,q2,#3 - veor q15,q12 - vshr.u64 q12,q14,#1 - veor q15,q13 @ sigma1(X[i+14]) - vshr.u64 q13,q14,#8 - vadd.i64 q3,q15 - vshr.u64 q15,q14,#7 - vsli.64 q12,q14,#63 - vsli.64 q13,q14,#56 - vext.8 q14,q7,q0,#8 @ X[i+9] - veor q15,q12 - vshr.u64 d24,d22,#14 @ from NEON_00_15 - vadd.i64 q3,q14 - vshr.u64 d25,d22,#18 @ from 
NEON_00_15 - veor q15,q13 @ sigma0(X[i+1]) - vshr.u64 d26,d22,#41 @ from NEON_00_15 - vadd.i64 q3,q15 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d22,#50 - vsli.64 d25,d22,#46 - vmov d29,d22 - vsli.64 d26,d22,#23 -#if 22<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d23,d16 @ Ch(e,f,g) - vshr.u64 d24,d18,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d17 - vshr.u64 d25,d18,#34 - vsli.64 d24,d18,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d18,#39 - vadd.i64 d28,d6 - vsli.64 d25,d18,#30 - veor d30,d18,d19 - vsli.64 d26,d18,#25 - veor d17,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d20,d19 @ Maj(a,b,c) - veor d17,d26 @ Sigma0(a) - vadd.i64 d21,d27 - vadd.i64 d30,d27 - @ vadd.i64 d17,d30 - vshr.u64 d24,d21,#14 @ 23 -#if 23<16 - vld1.64 {d7},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d21,#18 -#if 23>0 - vadd.i64 d17,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d21,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d21,#50 - vsli.64 d25,d21,#46 - vmov d29,d21 - vsli.64 d26,d21,#23 -#if 23<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d22,d23 @ Ch(e,f,g) - vshr.u64 d24,d17,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d16 - vshr.u64 d25,d17,#34 - vsli.64 d24,d17,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d17,#39 - vadd.i64 d28,d7 - vsli.64 d25,d17,#30 - veor d30,d17,d18 - vsli.64 d26,d17,#25 - veor d16,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d19,d18 @ Maj(a,b,c) - veor d16,d26 @ Sigma0(a) - vadd.i64 d20,d27 - vadd.i64 d30,d27 - @ vadd.i64 d16,d30 - vshr.u64 q12,q3,#19 - vshr.u64 q13,q3,#61 - vadd.i64 d16,d30 @ h+=Maj from the past - vshr.u64 q15,q3,#6 - vsli.64 q12,q3,#45 - vext.8 q14,q4,q5,#8 @ X[i+1] - vsli.64 q13,q3,#3 - veor q15,q12 - vshr.u64 q12,q14,#1 - veor q15,q13 @ sigma1(X[i+14]) - vshr.u64 q13,q14,#8 - vadd.i64 q4,q15 - vshr.u64 q15,q14,#7 - vsli.64 q12,q14,#63 - vsli.64 q13,q14,#56 - vext.8 q14,q0,q1,#8 @ X[i+9] - veor q15,q12 - vshr.u64 d24,d20,#14 @ from NEON_00_15 - vadd.i64 q4,q14 - vshr.u64 
d25,d20,#18 @ from NEON_00_15 - veor q15,q13 @ sigma0(X[i+1]) - vshr.u64 d26,d20,#41 @ from NEON_00_15 - vadd.i64 q4,q15 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d20,#50 - vsli.64 d25,d20,#46 - vmov d29,d20 - vsli.64 d26,d20,#23 -#if 24<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d21,d22 @ Ch(e,f,g) - vshr.u64 d24,d16,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d23 - vshr.u64 d25,d16,#34 - vsli.64 d24,d16,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d16,#39 - vadd.i64 d28,d8 - vsli.64 d25,d16,#30 - veor d30,d16,d17 - vsli.64 d26,d16,#25 - veor d23,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d18,d17 @ Maj(a,b,c) - veor d23,d26 @ Sigma0(a) - vadd.i64 d19,d27 - vadd.i64 d30,d27 - @ vadd.i64 d23,d30 - vshr.u64 d24,d19,#14 @ 25 -#if 25<16 - vld1.64 {d9},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d19,#18 -#if 25>0 - vadd.i64 d23,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d19,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d19,#50 - vsli.64 d25,d19,#46 - vmov d29,d19 - vsli.64 d26,d19,#23 -#if 25<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d20,d21 @ Ch(e,f,g) - vshr.u64 d24,d23,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d22 - vshr.u64 d25,d23,#34 - vsli.64 d24,d23,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d23,#39 - vadd.i64 d28,d9 - vsli.64 d25,d23,#30 - veor d30,d23,d16 - vsli.64 d26,d23,#25 - veor d22,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d17,d16 @ Maj(a,b,c) - veor d22,d26 @ Sigma0(a) - vadd.i64 d18,d27 - vadd.i64 d30,d27 - @ vadd.i64 d22,d30 - vshr.u64 q12,q4,#19 - vshr.u64 q13,q4,#61 - vadd.i64 d22,d30 @ h+=Maj from the past - vshr.u64 q15,q4,#6 - vsli.64 q12,q4,#45 - vext.8 q14,q5,q6,#8 @ X[i+1] - vsli.64 q13,q4,#3 - veor q15,q12 - vshr.u64 q12,q14,#1 - veor q15,q13 @ sigma1(X[i+14]) - vshr.u64 q13,q14,#8 - vadd.i64 q5,q15 - vshr.u64 q15,q14,#7 - vsli.64 q12,q14,#63 - vsli.64 q13,q14,#56 - vext.8 q14,q1,q2,#8 @ X[i+9] - veor q15,q12 - vshr.u64 d24,d18,#14 @ from NEON_00_15 - vadd.i64 
q5,q14 - vshr.u64 d25,d18,#18 @ from NEON_00_15 - veor q15,q13 @ sigma0(X[i+1]) - vshr.u64 d26,d18,#41 @ from NEON_00_15 - vadd.i64 q5,q15 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d18,#50 - vsli.64 d25,d18,#46 - vmov d29,d18 - vsli.64 d26,d18,#23 -#if 26<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d19,d20 @ Ch(e,f,g) - vshr.u64 d24,d22,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d21 - vshr.u64 d25,d22,#34 - vsli.64 d24,d22,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d22,#39 - vadd.i64 d28,d10 - vsli.64 d25,d22,#30 - veor d30,d22,d23 - vsli.64 d26,d22,#25 - veor d21,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d16,d23 @ Maj(a,b,c) - veor d21,d26 @ Sigma0(a) - vadd.i64 d17,d27 - vadd.i64 d30,d27 - @ vadd.i64 d21,d30 - vshr.u64 d24,d17,#14 @ 27 -#if 27<16 - vld1.64 {d11},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d17,#18 -#if 27>0 - vadd.i64 d21,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d17,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d17,#50 - vsli.64 d25,d17,#46 - vmov d29,d17 - vsli.64 d26,d17,#23 -#if 27<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d18,d19 @ Ch(e,f,g) - vshr.u64 d24,d21,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d20 - vshr.u64 d25,d21,#34 - vsli.64 d24,d21,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d21,#39 - vadd.i64 d28,d11 - vsli.64 d25,d21,#30 - veor d30,d21,d22 - vsli.64 d26,d21,#25 - veor d20,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d23,d22 @ Maj(a,b,c) - veor d20,d26 @ Sigma0(a) - vadd.i64 d16,d27 - vadd.i64 d30,d27 - @ vadd.i64 d20,d30 - vshr.u64 q12,q5,#19 - vshr.u64 q13,q5,#61 - vadd.i64 d20,d30 @ h+=Maj from the past - vshr.u64 q15,q5,#6 - vsli.64 q12,q5,#45 - vext.8 q14,q6,q7,#8 @ X[i+1] - vsli.64 q13,q5,#3 - veor q15,q12 - vshr.u64 q12,q14,#1 - veor q15,q13 @ sigma1(X[i+14]) - vshr.u64 q13,q14,#8 - vadd.i64 q6,q15 - vshr.u64 q15,q14,#7 - vsli.64 q12,q14,#63 - vsli.64 q13,q14,#56 - vext.8 q14,q2,q3,#8 @ X[i+9] - veor q15,q12 - vshr.u64 d24,d16,#14 @ from 
NEON_00_15 - vadd.i64 q6,q14 - vshr.u64 d25,d16,#18 @ from NEON_00_15 - veor q15,q13 @ sigma0(X[i+1]) - vshr.u64 d26,d16,#41 @ from NEON_00_15 - vadd.i64 q6,q15 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d16,#50 - vsli.64 d25,d16,#46 - vmov d29,d16 - vsli.64 d26,d16,#23 -#if 28<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d17,d18 @ Ch(e,f,g) - vshr.u64 d24,d20,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d19 - vshr.u64 d25,d20,#34 - vsli.64 d24,d20,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d20,#39 - vadd.i64 d28,d12 - vsli.64 d25,d20,#30 - veor d30,d20,d21 - vsli.64 d26,d20,#25 - veor d19,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d22,d21 @ Maj(a,b,c) - veor d19,d26 @ Sigma0(a) - vadd.i64 d23,d27 - vadd.i64 d30,d27 - @ vadd.i64 d19,d30 - vshr.u64 d24,d23,#14 @ 29 -#if 29<16 - vld1.64 {d13},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d23,#18 -#if 29>0 - vadd.i64 d19,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d23,#41 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d23,#50 - vsli.64 d25,d23,#46 - vmov d29,d23 - vsli.64 d26,d23,#23 -#if 29<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d16,d17 @ Ch(e,f,g) - vshr.u64 d24,d19,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d18 - vshr.u64 d25,d19,#34 - vsli.64 d24,d19,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d19,#39 - vadd.i64 d28,d13 - vsli.64 d25,d19,#30 - veor d30,d19,d20 - vsli.64 d26,d19,#25 - veor d18,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d21,d20 @ Maj(a,b,c) - veor d18,d26 @ Sigma0(a) - vadd.i64 d22,d27 - vadd.i64 d30,d27 - @ vadd.i64 d18,d30 - vshr.u64 q12,q6,#19 - vshr.u64 q13,q6,#61 - vadd.i64 d18,d30 @ h+=Maj from the past - vshr.u64 q15,q6,#6 - vsli.64 q12,q6,#45 - vext.8 q14,q7,q0,#8 @ X[i+1] - vsli.64 q13,q6,#3 - veor q15,q12 - vshr.u64 q12,q14,#1 - veor q15,q13 @ sigma1(X[i+14]) - vshr.u64 q13,q14,#8 - vadd.i64 q7,q15 - vshr.u64 q15,q14,#7 - vsli.64 q12,q14,#63 - vsli.64 q13,q14,#56 - vext.8 q14,q3,q4,#8 @ X[i+9] - veor q15,q12 - vshr.u64 
d24,d22,#14 @ from NEON_00_15 - vadd.i64 q7,q14 - vshr.u64 d25,d22,#18 @ from NEON_00_15 - veor q15,q13 @ sigma0(X[i+1]) - vshr.u64 d26,d22,#41 @ from NEON_00_15 - vadd.i64 q7,q15 - vld1.64 {d28},[r3,:64]! @ K[i++] - vsli.64 d24,d22,#50 - vsli.64 d25,d22,#46 - vmov d29,d22 - vsli.64 d26,d22,#23 -#if 30<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d23,d16 @ Ch(e,f,g) - vshr.u64 d24,d18,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d17 - vshr.u64 d25,d18,#34 - vsli.64 d24,d18,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d18,#39 - vadd.i64 d28,d14 - vsli.64 d25,d18,#30 - veor d30,d18,d19 - vsli.64 d26,d18,#25 - veor d17,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d20,d19 @ Maj(a,b,c) - veor d17,d26 @ Sigma0(a) - vadd.i64 d21,d27 - vadd.i64 d30,d27 - @ vadd.i64 d17,d30 - vshr.u64 d24,d21,#14 @ 31 -#if 31<16 - vld1.64 {d15},[r1]! @ handles unaligned -#endif - vshr.u64 d25,d21,#18 -#if 31>0 - vadd.i64 d17,d30 @ h+=Maj from the past -#endif - vshr.u64 d26,d21,#41 - vld1.64 {d28},[r3,:64]! 
@ K[i++] - vsli.64 d24,d21,#50 - vsli.64 d25,d21,#46 - vmov d29,d21 - vsli.64 d26,d21,#23 -#if 31<16 && defined(__ARMEL__) - vrev64.8 , -#endif - veor d25,d24 - vbsl d29,d22,d23 @ Ch(e,f,g) - vshr.u64 d24,d17,#28 - veor d26,d25 @ Sigma1(e) - vadd.i64 d27,d29,d16 - vshr.u64 d25,d17,#34 - vsli.64 d24,d17,#36 - vadd.i64 d27,d26 - vshr.u64 d26,d17,#39 - vadd.i64 d28,d15 - vsli.64 d25,d17,#30 - veor d30,d17,d18 - vsli.64 d26,d17,#25 - veor d16,d24,d25 - vadd.i64 d27,d28 - vbsl d30,d19,d18 @ Maj(a,b,c) - veor d16,d26 @ Sigma0(a) - vadd.i64 d20,d27 - vadd.i64 d30,d27 - @ vadd.i64 d16,d30 - bne .L16_79_neon - - vadd.i64 d16,d30 @ h+=Maj from the past - vldmia r0,{d24,d25,d26,d27,d28,d29,d30,d31} @ load context to temp - vadd.i64 q8,q12 @ vectorized accumulate - vadd.i64 q9,q13 - vadd.i64 q10,q14 - vadd.i64 q11,q15 - vstmia r0,{d16,d17,d18,d19,d20,d21,d22,d23} @ save context - teq r1,r2 - sub r3,#640 @ rewind K512 - bne .Loop_neon - - VFP_ABI_POP - bx lr @ .word 0xe12fff1e -.size sha512_block_data_order_neon,.-sha512_block_data_order_neon -#endif -.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,65,82,77,118,52,47,78,69,79,78,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 -.align 2 -#if __ARM_MAX_ARCH__>=7 && !defined(__KERNEL__) -.comm OPENSSL_armcap_P,4,4 -.hidden OPENSSL_armcap_P -#endif -#endif -#endif // !OPENSSL_NO_ASM -.section .note.GNU-stack,"",%progbits diff --git a/third_party/boringssl/linux-arm/crypto/fipsmodule/vpaes-armv7.S b/third_party/boringssl/linux-arm/crypto/fipsmodule/vpaes-armv7.S deleted file mode 100644 index e5ad6ed9..00000000 --- a/third_party/boringssl/linux-arm/crypto/fipsmodule/vpaes-armv7.S +++ /dev/null @@ -1,1236 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(__arm__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.syntax unified - -.arch armv7-a -.fpu neon - -#if defined(__thumb2__) -.thumb -#else -.code 32 -#endif - -.text - -.type _vpaes_consts,%object -.align 7 @ totally strategic alignment -_vpaes_consts: -.Lk_mc_forward:@ mc_forward -.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 -.quad 0x080B0A0904070605, 0x000302010C0F0E0D -.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 -.quad 0x000302010C0F0E0D, 0x080B0A0904070605 -.Lk_mc_backward:@ mc_backward -.quad 0x0605040702010003, 0x0E0D0C0F0A09080B -.quad 0x020100030E0D0C0F, 0x0A09080B06050407 -.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 -.quad 0x0A09080B06050407, 0x020100030E0D0C0F -.Lk_sr:@ sr -.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 -.quad 0x030E09040F0A0500, 0x0B06010C07020D08 -.quad 0x0F060D040B020900, 0x070E050C030A0108 -.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 - -@ -@ "Hot" constants -@ -.Lk_inv:@ inv, inva -.quad 0x0E05060F0D080180, 0x040703090A0B0C02 -.quad 0x01040A060F0B0780, 0x030D0E0C02050809 -.Lk_ipt:@ input transform (lo, hi) -.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 -.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 -.Lk_sbo:@ sbou, sbot -.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 -.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA -.Lk_sb1:@ sb1u, sb1t -.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF -.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 -.Lk_sb2:@ sb2u, sb2t -.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A -.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD - -.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,65,82,77,118,55,32,78,69,79,78,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 -.align 2 -.size 
_vpaes_consts,.-_vpaes_consts -.align 6 -@@ -@@ _aes_preheat -@@ -@@ Fills q9-q15 as specified below. -@@ -.type _vpaes_preheat,%function -.align 4 -_vpaes_preheat: - adr r10, .Lk_inv - vmov.i8 q9, #0x0f @ .Lk_s0F - vld1.64 {q10,q11}, [r10]! @ .Lk_inv - add r10, r10, #64 @ Skip .Lk_ipt, .Lk_sbo - vld1.64 {q12,q13}, [r10]! @ .Lk_sb1 - vld1.64 {q14,q15}, [r10] @ .Lk_sb2 - bx lr - -@@ -@@ _aes_encrypt_core -@@ -@@ AES-encrypt q0. -@@ -@@ Inputs: -@@ q0 = input -@@ q9-q15 as in _vpaes_preheat -@@ [r2] = scheduled keys -@@ -@@ Output in q0 -@@ Clobbers q1-q5, r8-r11 -@@ Preserves q6-q8 so you get some local vectors -@@ -@@ -.type _vpaes_encrypt_core,%function -.align 4 -_vpaes_encrypt_core: - mov r9, r2 - ldr r8, [r2,#240] @ pull rounds - adr r11, .Lk_ipt - @ vmovdqa .Lk_ipt(%rip), %xmm2 # iptlo - @ vmovdqa .Lk_ipt+16(%rip), %xmm3 # ipthi - vld1.64 {q2, q3}, [r11] - adr r11, .Lk_mc_forward+16 - vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5 # round0 key - vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 - vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 - vtbl.8 d2, {q2}, d2 @ vpshufb %xmm1, %xmm2, %xmm1 - vtbl.8 d3, {q2}, d3 - vtbl.8 d4, {q3}, d0 @ vpshufb %xmm0, %xmm3, %xmm2 - vtbl.8 d5, {q3}, d1 - veor q0, q1, q5 @ vpxor %xmm5, %xmm1, %xmm0 - veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0 - - @ .Lenc_entry ends with a bnz instruction which is normally paired with - @ subs in .Lenc_loop. - tst r8, r8 - b .Lenc_entry - -.align 4 -.Lenc_loop: - @ middle of middle round - add r10, r11, #0x40 - vtbl.8 d8, {q13}, d4 @ vpshufb %xmm2, %xmm13, %xmm4 # 4 = sb1u - vtbl.8 d9, {q13}, d5 - vld1.64 {q1}, [r11]! 
@ vmovdqa -0x40(%r11,%r10), %xmm1 # .Lk_mc_forward[] - vtbl.8 d0, {q12}, d6 @ vpshufb %xmm3, %xmm12, %xmm0 # 0 = sb1t - vtbl.8 d1, {q12}, d7 - veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k - vtbl.8 d10, {q15}, d4 @ vpshufb %xmm2, %xmm15, %xmm5 # 4 = sb2u - vtbl.8 d11, {q15}, d5 - veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A - vtbl.8 d4, {q14}, d6 @ vpshufb %xmm3, %xmm14, %xmm2 # 2 = sb2t - vtbl.8 d5, {q14}, d7 - vld1.64 {q4}, [r10] @ vmovdqa (%r11,%r10), %xmm4 # .Lk_mc_backward[] - vtbl.8 d6, {q0}, d2 @ vpshufb %xmm1, %xmm0, %xmm3 # 0 = B - vtbl.8 d7, {q0}, d3 - veor q2, q2, q5 @ vpxor %xmm5, %xmm2, %xmm2 # 2 = 2A - @ Write to q5 instead of q0, so the table and destination registers do - @ not overlap. - vtbl.8 d10, {q0}, d8 @ vpshufb %xmm4, %xmm0, %xmm0 # 3 = D - vtbl.8 d11, {q0}, d9 - veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 0 = 2A+B - vtbl.8 d8, {q3}, d2 @ vpshufb %xmm1, %xmm3, %xmm4 # 0 = 2B+C - vtbl.8 d9, {q3}, d3 - @ Here we restore the original q0/q5 usage. - veor q0, q5, q3 @ vpxor %xmm3, %xmm0, %xmm0 # 3 = 2A+B+D - and r11, r11, #~(1<<6) @ and $0x30, %r11 # ... 
mod 4 - veor q0, q0, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = 2A+3B+C+D - subs r8, r8, #1 @ nr-- - -.Lenc_entry: - @ top of round - vand q1, q0, q9 @ vpand %xmm0, %xmm9, %xmm1 # 0 = k - vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i - vtbl.8 d10, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm5 # 2 = a/k - vtbl.8 d11, {q11}, d3 - veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j - vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i - vtbl.8 d7, {q10}, d1 - vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j - vtbl.8 d9, {q10}, d3 - veor q3, q3, q5 @ vpxor %xmm5, %xmm3, %xmm3 # 3 = iak = 1/i + a/k - veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = jak = 1/j + a/k - vtbl.8 d4, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak - vtbl.8 d5, {q10}, d7 - vtbl.8 d6, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak - vtbl.8 d7, {q10}, d9 - veor q2, q2, q1 @ vpxor %xmm1, %xmm2, %xmm2 # 2 = io - veor q3, q3, q0 @ vpxor %xmm0, %xmm3, %xmm3 # 3 = jo - vld1.64 {q5}, [r9]! @ vmovdqu (%r9), %xmm5 - bne .Lenc_loop - - @ middle of last round - add r10, r11, #0x80 - - adr r11, .Lk_sbo - @ Read to q1 instead of q4, so the vtbl.8 instruction below does not - @ overlap table and destination registers. - vld1.64 {q1}, [r11]! @ vmovdqa -0x60(%r10), %xmm4 # 3 : sbou - vld1.64 {q0}, [r11] @ vmovdqa -0x50(%r10), %xmm0 # 0 : sbot .Lk_sbo+16 - vtbl.8 d8, {q1}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou - vtbl.8 d9, {q1}, d5 - vld1.64 {q1}, [r10] @ vmovdqa 0x40(%r11,%r10), %xmm1 # .Lk_sr[] - @ Write to q2 instead of q0 below, to avoid overlapping table and - @ destination registers. - vtbl.8 d4, {q0}, d6 @ vpshufb %xmm3, %xmm0, %xmm0 # 0 = sb1t - vtbl.8 d5, {q0}, d7 - veor q4, q4, q5 @ vpxor %xmm5, %xmm4, %xmm4 # 4 = sb1u + k - veor q2, q2, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 0 = A - @ Here we restore the original q0/q2 usage. 
- vtbl.8 d0, {q2}, d2 @ vpshufb %xmm1, %xmm0, %xmm0 - vtbl.8 d1, {q2}, d3 - bx lr -.size _vpaes_encrypt_core,.-_vpaes_encrypt_core - -.globl vpaes_encrypt -.hidden vpaes_encrypt -.type vpaes_encrypt,%function -.align 4 -vpaes_encrypt: - @ _vpaes_encrypt_core uses r8-r11. Round up to r7-r11 to maintain stack - @ alignment. - stmdb sp!, {r7,r8,r9,r10,r11,lr} - @ _vpaes_encrypt_core uses q4-q5 (d8-d11), which are callee-saved. - vstmdb sp!, {d8,d9,d10,d11} - - vld1.64 {q0}, [r0] - bl _vpaes_preheat - bl _vpaes_encrypt_core - vst1.64 {q0}, [r1] - - vldmia sp!, {d8,d9,d10,d11} - ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return -.size vpaes_encrypt,.-vpaes_encrypt - -@ -@ Decryption stuff -@ -.type _vpaes_decrypt_consts,%object -.align 4 -_vpaes_decrypt_consts: -.Lk_dipt:@ decryption input transform -.quad 0x0F505B040B545F00, 0x154A411E114E451A -.quad 0x86E383E660056500, 0x12771772F491F194 -.Lk_dsbo:@ decryption sbox final output -.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D -.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C -.Lk_dsb9:@ decryption sbox output *9*u, *9*t -.quad 0x851C03539A86D600, 0xCAD51F504F994CC9 -.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 -.Lk_dsbd:@ decryption sbox output *D*u, *D*t -.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 -.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 -.Lk_dsbb:@ decryption sbox output *B*u, *B*t -.quad 0xD022649296B44200, 0x602646F6B0F2D404 -.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B -.Lk_dsbe:@ decryption sbox output *E*u, *E*t -.quad 0x46F2929626D4D000, 0x2242600464B4F6B0 -.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 -.size _vpaes_decrypt_consts,.-_vpaes_decrypt_consts - -@@ -@@ Decryption core -@@ -@@ Same API as encryption core, except it clobbers q12-q15 rather than using -@@ the values from _vpaes_preheat. q9-q11 must still be set from -@@ _vpaes_preheat. 
-@@ -.type _vpaes_decrypt_core,%function -.align 4 -_vpaes_decrypt_core: - mov r9, r2 - ldr r8, [r2,#240] @ pull rounds - - @ This function performs shuffles with various constants. The x86_64 - @ version loads them on-demand into %xmm0-%xmm5. This does not work well - @ for ARMv7 because those registers are shuffle destinations. The ARMv8 - @ version preloads those constants into registers, but ARMv7 has half - @ the registers to work with. Instead, we load them on-demand into - @ q12-q15, registers normally use for preloaded constants. This is fine - @ because decryption doesn't use those constants. The values are - @ constant, so this does not interfere with potential 2x optimizations. - adr r7, .Lk_dipt - - vld1.64 {q12,q13}, [r7] @ vmovdqa .Lk_dipt(%rip), %xmm2 # iptlo - lsl r11, r8, #4 @ mov %rax, %r11; shl $4, %r11 - eor r11, r11, #0x30 @ xor $0x30, %r11 - adr r10, .Lk_sr - and r11, r11, #0x30 @ and $0x30, %r11 - add r11, r11, r10 - adr r10, .Lk_mc_forward+48 - - vld1.64 {q4}, [r9]! @ vmovdqu (%r9), %xmm4 # round0 key - vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 - vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 - vtbl.8 d4, {q12}, d2 @ vpshufb %xmm1, %xmm2, %xmm2 - vtbl.8 d5, {q12}, d3 - vld1.64 {q5}, [r10] @ vmovdqa .Lk_mc_forward+48(%rip), %xmm5 - @ vmovdqa .Lk_dipt+16(%rip), %xmm1 # ipthi - vtbl.8 d0, {q13}, d0 @ vpshufb %xmm0, %xmm1, %xmm0 - vtbl.8 d1, {q13}, d1 - veor q2, q2, q4 @ vpxor %xmm4, %xmm2, %xmm2 - veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0 - - @ .Ldec_entry ends with a bnz instruction which is normally paired with - @ subs in .Ldec_loop. - tst r8, r8 - b .Ldec_entry - -.align 4 -.Ldec_loop: -@ -@ Inverse mix columns -@ - - @ We load .Lk_dsb* into q12-q15 on-demand. See the comment at the top of - @ the function. - adr r10, .Lk_dsb9 - vld1.64 {q12,q13}, [r10]! @ vmovdqa -0x20(%r10),%xmm4 # 4 : sb9u - @ vmovdqa -0x10(%r10),%xmm1 # 0 : sb9t - @ Load sbd* ahead of time. - vld1.64 {q14,q15}, [r10]! 
@ vmovdqa 0x00(%r10),%xmm4 # 4 : sbdu - @ vmovdqa 0x10(%r10),%xmm1 # 0 : sbdt - vtbl.8 d8, {q12}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sb9u - vtbl.8 d9, {q12}, d5 - vtbl.8 d2, {q13}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb9t - vtbl.8 d3, {q13}, d7 - veor q0, q4, q0 @ vpxor %xmm4, %xmm0, %xmm0 - - veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch - - @ Load sbb* ahead of time. - vld1.64 {q12,q13}, [r10]! @ vmovdqa 0x20(%r10),%xmm4 # 4 : sbbu - @ vmovdqa 0x30(%r10),%xmm1 # 0 : sbbt - - vtbl.8 d8, {q14}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbdu - vtbl.8 d9, {q14}, d5 - @ Write to q1 instead of q0, so the table and destination registers do - @ not overlap. - vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch - vtbl.8 d3, {q0}, d11 - @ Here we restore the original q0/q1 usage. This instruction is - @ reordered from the ARMv8 version so we do not clobber the vtbl.8 - @ below. - veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch - vtbl.8 d2, {q15}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbdt - vtbl.8 d3, {q15}, d7 - @ vmovdqa 0x20(%r10), %xmm4 # 4 : sbbu - veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch - @ vmovdqa 0x30(%r10), %xmm1 # 0 : sbbt - - @ Load sbd* ahead of time. - vld1.64 {q14,q15}, [r10]! @ vmovdqa 0x40(%r10),%xmm4 # 4 : sbeu - @ vmovdqa 0x50(%r10),%xmm1 # 0 : sbet - - vtbl.8 d8, {q12}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbbu - vtbl.8 d9, {q12}, d5 - @ Write to q1 instead of q0, so the table and destination registers do - @ not overlap. - vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch - vtbl.8 d3, {q0}, d11 - @ Here we restore the original q0/q1 usage. This instruction is - @ reordered from the ARMv8 version so we do not clobber the vtbl.8 - @ below. 
- veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch - vtbl.8 d2, {q13}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbbt - vtbl.8 d3, {q13}, d7 - veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch - - vtbl.8 d8, {q14}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbeu - vtbl.8 d9, {q14}, d5 - @ Write to q1 instead of q0, so the table and destination registers do - @ not overlap. - vtbl.8 d2, {q0}, d10 @ vpshufb %xmm5, %xmm0, %xmm0 # MC ch - vtbl.8 d3, {q0}, d11 - @ Here we restore the original q0/q1 usage. This instruction is - @ reordered from the ARMv8 version so we do not clobber the vtbl.8 - @ below. - veor q0, q1, q4 @ vpxor %xmm4, %xmm0, %xmm0 # 4 = ch - vtbl.8 d2, {q15}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sbet - vtbl.8 d3, {q15}, d7 - vext.8 q5, q5, q5, #12 @ vpalignr $12, %xmm5, %xmm5, %xmm5 - veor q0, q0, q1 @ vpxor %xmm1, %xmm0, %xmm0 # 0 = ch - subs r8, r8, #1 @ sub $1,%rax # nr-- - -.Ldec_entry: - @ top of round - vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 # 0 = k - vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i - vtbl.8 d4, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k - vtbl.8 d5, {q11}, d3 - veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j - vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i - vtbl.8 d7, {q10}, d1 - vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j - vtbl.8 d9, {q10}, d3 - veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k - veor q4, q4, q2 @ vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k - vtbl.8 d4, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm2 # 2 = 1/iak - vtbl.8 d5, {q10}, d7 - vtbl.8 d6, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm3 # 3 = 1/jak - vtbl.8 d7, {q10}, d9 - veor q2, q2, q1 @ vpxor %xmm1, %xmm2, %xmm2 # 2 = io - veor q3, q3, q0 @ vpxor %xmm0, %xmm3, %xmm3 # 3 = jo - vld1.64 {q0}, [r9]! @ vmovdqu (%r9), %xmm0 - bne .Ldec_loop - - @ middle of last round - - adr r10, .Lk_dsbo - - @ Write to q1 rather than q4 to avoid overlapping table and destination. - vld1.64 {q1}, [r10]! 
@ vmovdqa 0x60(%r10), %xmm4 # 3 : sbou - vtbl.8 d8, {q1}, d4 @ vpshufb %xmm2, %xmm4, %xmm4 # 4 = sbou - vtbl.8 d9, {q1}, d5 - @ Write to q2 rather than q1 to avoid overlapping table and destination. - vld1.64 {q2}, [r10] @ vmovdqa 0x70(%r10), %xmm1 # 0 : sbot - vtbl.8 d2, {q2}, d6 @ vpshufb %xmm3, %xmm1, %xmm1 # 0 = sb1t - vtbl.8 d3, {q2}, d7 - vld1.64 {q2}, [r11] @ vmovdqa -0x160(%r11), %xmm2 # .Lk_sr-.Lk_dsbd=-0x160 - veor q4, q4, q0 @ vpxor %xmm0, %xmm4, %xmm4 # 4 = sb1u + k - @ Write to q1 rather than q0 so the table and destination registers - @ below do not overlap. - veor q1, q1, q4 @ vpxor %xmm4, %xmm1, %xmm0 # 0 = A - vtbl.8 d0, {q1}, d4 @ vpshufb %xmm2, %xmm0, %xmm0 - vtbl.8 d1, {q1}, d5 - bx lr -.size _vpaes_decrypt_core,.-_vpaes_decrypt_core - -.globl vpaes_decrypt -.hidden vpaes_decrypt -.type vpaes_decrypt,%function -.align 4 -vpaes_decrypt: - @ _vpaes_decrypt_core uses r7-r11. - stmdb sp!, {r7,r8,r9,r10,r11,lr} - @ _vpaes_decrypt_core uses q4-q5 (d8-d11), which are callee-saved. - vstmdb sp!, {d8,d9,d10,d11} - - vld1.64 {q0}, [r0] - bl _vpaes_preheat - bl _vpaes_decrypt_core - vst1.64 {q0}, [r1] - - vldmia sp!, {d8,d9,d10,d11} - ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return -.size vpaes_decrypt,.-vpaes_decrypt -@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ -@@ @@ -@@ AES key schedule @@ -@@ @@ -@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ - -@ This function diverges from both x86_64 and armv7 in which constants are -@ pinned. x86_64 has a common preheat function for all operations. aarch64 -@ separates them because it has enough registers to pin nearly all constants. -@ armv7 does not have enough registers, but needing explicit loads and stores -@ also complicates using x86_64's register allocation directly. -@ -@ We pin some constants for convenience and leave q14 and q15 free to load -@ others on demand. 
- -@ -@ Key schedule constants -@ -.type _vpaes_key_consts,%object -.align 4 -_vpaes_key_consts: -.Lk_dksd:@ decryption key schedule: invskew x*D -.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 -.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E -.Lk_dksb:@ decryption key schedule: invskew x*B -.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 -.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 -.Lk_dkse:@ decryption key schedule: invskew x*E + 0x63 -.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 -.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 -.Lk_dks9:@ decryption key schedule: invskew x*9 -.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC -.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE - -.Lk_rcon:@ rcon -.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 - -.Lk_opt:@ output transform -.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 -.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 -.Lk_deskew:@ deskew tables: inverts the sbox's "skew" -.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A -.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 -.size _vpaes_key_consts,.-_vpaes_key_consts - -.type _vpaes_key_preheat,%function -.align 4 -_vpaes_key_preheat: - adr r11, .Lk_rcon - vmov.i8 q12, #0x5b @ .Lk_s63 - adr r10, .Lk_inv @ Must be aligned to 8 mod 16. - vmov.i8 q9, #0x0f @ .Lk_s0F - vld1.64 {q10,q11}, [r10] @ .Lk_inv - vld1.64 {q8}, [r11] @ .Lk_rcon - bx lr -.size _vpaes_key_preheat,.-_vpaes_key_preheat - -.type _vpaes_schedule_core,%function -.align 4 -_vpaes_schedule_core: - @ We only need to save lr, but ARM requires an 8-byte stack alignment, - @ so save an extra register. - stmdb sp!, {r3,lr} - - bl _vpaes_key_preheat @ load the tables - - adr r11, .Lk_ipt @ Must be aligned to 8 mod 16. - vld1.64 {q0}, [r0]! @ vmovdqu (%rdi), %xmm0 # load key (unaligned) - - @ input transform - @ Use q4 here rather than q3 so .Lschedule_am_decrypting does not - @ overlap table and destination. - vmov q4, q0 @ vmovdqa %xmm0, %xmm3 - bl _vpaes_schedule_transform - adr r10, .Lk_sr @ Must be aligned to 8 mod 16. 
- vmov q7, q0 @ vmovdqa %xmm0, %xmm7 - - add r8, r8, r10 - tst r3, r3 - bne .Lschedule_am_decrypting - - @ encrypting, output zeroth round key after transform - vst1.64 {q0}, [r2] @ vmovdqu %xmm0, (%rdx) - b .Lschedule_go - -.Lschedule_am_decrypting: - @ decrypting, output zeroth round key after shiftrows - vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1 - vtbl.8 d6, {q4}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 - vtbl.8 d7, {q4}, d3 - vst1.64 {q3}, [r2] @ vmovdqu %xmm3, (%rdx) - eor r8, r8, #0x30 @ xor $0x30, %r8 - -.Lschedule_go: - cmp r1, #192 @ cmp $192, %esi - bhi .Lschedule_256 - beq .Lschedule_192 - @ 128: fall though - -@@ -@@ .schedule_128 -@@ -@@ 128-bit specific part of key schedule. -@@ -@@ This schedule is really simple, because all its parts -@@ are accomplished by the subroutines. -@@ -.Lschedule_128: - mov r0, #10 @ mov $10, %esi - -.Loop_schedule_128: - bl _vpaes_schedule_round - subs r0, r0, #1 @ dec %esi - beq .Lschedule_mangle_last - bl _vpaes_schedule_mangle @ write output - b .Loop_schedule_128 - -@@ -@@ .aes_schedule_192 -@@ -@@ 192-bit specific part of key schedule. -@@ -@@ The main body of this schedule is the same as the 128-bit -@@ schedule, but with more smearing. The long, high side is -@@ stored in q7 as before, and the short, low side is in -@@ the high bits of q6. -@@ -@@ This schedule is somewhat nastier, however, because each -@@ round produces 192 bits of key material, or 1.5 round keys. -@@ Therefore, on each cycle we do 2 rounds and produce 3 round -@@ keys. 
-@@ -.align 4 -.Lschedule_192: - sub r0, r0, #8 - vld1.64 {q0}, [r0] @ vmovdqu 8(%rdi),%xmm0 # load key part 2 (very unaligned) - bl _vpaes_schedule_transform @ input transform - vmov q6, q0 @ vmovdqa %xmm0, %xmm6 # save short part - vmov.i8 d12, #0 @ vpxor %xmm4, %xmm4, %xmm4 # clear 4 - @ vmovhlps %xmm4, %xmm6, %xmm6 # clobber low side with zeros - mov r0, #4 @ mov $4, %esi - -.Loop_schedule_192: - bl _vpaes_schedule_round - vext.8 q0, q6, q0, #8 @ vpalignr $8,%xmm6,%xmm0,%xmm0 - bl _vpaes_schedule_mangle @ save key n - bl _vpaes_schedule_192_smear - bl _vpaes_schedule_mangle @ save key n+1 - bl _vpaes_schedule_round - subs r0, r0, #1 @ dec %esi - beq .Lschedule_mangle_last - bl _vpaes_schedule_mangle @ save key n+2 - bl _vpaes_schedule_192_smear - b .Loop_schedule_192 - -@@ -@@ .aes_schedule_256 -@@ -@@ 256-bit specific part of key schedule. -@@ -@@ The structure here is very similar to the 128-bit -@@ schedule, but with an additional "low side" in -@@ q6. The low side's rounds are the same as the -@@ high side's, except no rcon and no rotation. -@@ -.align 4 -.Lschedule_256: - vld1.64 {q0}, [r0] @ vmovdqu 16(%rdi),%xmm0 # load key part 2 (unaligned) - bl _vpaes_schedule_transform @ input transform - mov r0, #7 @ mov $7, %esi - -.Loop_schedule_256: - bl _vpaes_schedule_mangle @ output low result - vmov q6, q0 @ vmovdqa %xmm0, %xmm6 # save cur_lo in xmm6 - - @ high round - bl _vpaes_schedule_round - subs r0, r0, #1 @ dec %esi - beq .Lschedule_mangle_last - bl _vpaes_schedule_mangle - - @ low round. 
swap xmm7 and xmm6 - vdup.32 q0, d1[1] @ vpshufd $0xFF, %xmm0, %xmm0 - vmov.i8 q4, #0 - vmov q5, q7 @ vmovdqa %xmm7, %xmm5 - vmov q7, q6 @ vmovdqa %xmm6, %xmm7 - bl _vpaes_schedule_low_round - vmov q7, q5 @ vmovdqa %xmm5, %xmm7 - - b .Loop_schedule_256 - -@@ -@@ .aes_schedule_mangle_last -@@ -@@ Mangler for last round of key schedule -@@ Mangles q0 -@@ when encrypting, outputs out(q0) ^ 63 -@@ when decrypting, outputs unskew(q0) -@@ -@@ Always called right before return... jumps to cleanup and exits -@@ -.align 4 -.Lschedule_mangle_last: - @ schedule last round key from xmm0 - adr r11, .Lk_deskew @ lea .Lk_deskew(%rip),%r11 # prepare to deskew - tst r3, r3 - bne .Lschedule_mangle_last_dec - - @ encrypting - vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10),%xmm1 - adr r11, .Lk_opt @ lea .Lk_opt(%rip), %r11 # prepare to output transform - add r2, r2, #32 @ add $32, %rdx - vmov q2, q0 - vtbl.8 d0, {q2}, d2 @ vpshufb %xmm1, %xmm0, %xmm0 # output permute - vtbl.8 d1, {q2}, d3 - -.Lschedule_mangle_last_dec: - sub r2, r2, #16 @ add $-16, %rdx - veor q0, q0, q12 @ vpxor .Lk_s63(%rip), %xmm0, %xmm0 - bl _vpaes_schedule_transform @ output transform - vst1.64 {q0}, [r2] @ vmovdqu %xmm0, (%rdx) # save last key - - @ cleanup - veor q0, q0, q0 @ vpxor %xmm0, %xmm0, %xmm0 - veor q1, q1, q1 @ vpxor %xmm1, %xmm1, %xmm1 - veor q2, q2, q2 @ vpxor %xmm2, %xmm2, %xmm2 - veor q3, q3, q3 @ vpxor %xmm3, %xmm3, %xmm3 - veor q4, q4, q4 @ vpxor %xmm4, %xmm4, %xmm4 - veor q5, q5, q5 @ vpxor %xmm5, %xmm5, %xmm5 - veor q6, q6, q6 @ vpxor %xmm6, %xmm6, %xmm6 - veor q7, q7, q7 @ vpxor %xmm7, %xmm7, %xmm7 - ldmia sp!, {r3,pc} @ return -.size _vpaes_schedule_core,.-_vpaes_schedule_core - -@@ -@@ .aes_schedule_192_smear -@@ -@@ Smear the short, low side in the 192-bit key schedule. 
-@@ -@@ Inputs: -@@ q7: high side, b a x y -@@ q6: low side, d c 0 0 -@@ -@@ Outputs: -@@ q6: b+c+d b+c 0 0 -@@ q0: b+c+d b+c b a -@@ -.type _vpaes_schedule_192_smear,%function -.align 4 -_vpaes_schedule_192_smear: - vmov.i8 q1, #0 - vdup.32 q0, d15[1] - vshl.i64 q1, q6, #32 @ vpshufd $0x80, %xmm6, %xmm1 # d c 0 0 -> c 0 0 0 - vmov d0, d15 @ vpshufd $0xFE, %xmm7, %xmm0 # b a _ _ -> b b b a - veor q6, q6, q1 @ vpxor %xmm1, %xmm6, %xmm6 # -> c+d c 0 0 - veor q1, q1, q1 @ vpxor %xmm1, %xmm1, %xmm1 - veor q6, q6, q0 @ vpxor %xmm0, %xmm6, %xmm6 # -> b+c+d b+c b a - vmov q0, q6 @ vmovdqa %xmm6, %xmm0 - vmov d12, d2 @ vmovhlps %xmm1, %xmm6, %xmm6 # clobber low side with zeros - bx lr -.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear - -@@ -@@ .aes_schedule_round -@@ -@@ Runs one main round of the key schedule on q0, q7 -@@ -@@ Specifically, runs subbytes on the high dword of q0 -@@ then rotates it by one byte and xors into the low dword of -@@ q7. -@@ -@@ Adds rcon from low byte of q8, then rotates q8 for -@@ next rcon. -@@ -@@ Smears the dwords of q7 by xoring the low into the -@@ second low, result into third, result into highest. -@@ -@@ Returns results in q7 = q0. -@@ Clobbers q1-q4, r11. -@@ -.type _vpaes_schedule_round,%function -.align 4 -_vpaes_schedule_round: - @ extract rcon from xmm8 - vmov.i8 q4, #0 @ vpxor %xmm4, %xmm4, %xmm4 - vext.8 q1, q8, q4, #15 @ vpalignr $15, %xmm8, %xmm4, %xmm1 - vext.8 q8, q8, q8, #15 @ vpalignr $15, %xmm8, %xmm8, %xmm8 - veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7 - - @ rotate - vdup.32 q0, d1[1] @ vpshufd $0xFF, %xmm0, %xmm0 - vext.8 q0, q0, q0, #1 @ vpalignr $1, %xmm0, %xmm0, %xmm0 - - @ fall through... - - @ low round: same as high round, but no rotation and no rcon. -_vpaes_schedule_low_round: - @ The x86_64 version pins .Lk_sb1 in %xmm13 and .Lk_sb1+16 in %xmm12. - @ We pin other values in _vpaes_key_preheat, so load them now. 
- adr r11, .Lk_sb1 - vld1.64 {q14,q15}, [r11] - - @ smear xmm7 - vext.8 q1, q4, q7, #12 @ vpslldq $4, %xmm7, %xmm1 - veor q7, q7, q1 @ vpxor %xmm1, %xmm7, %xmm7 - vext.8 q4, q4, q7, #8 @ vpslldq $8, %xmm7, %xmm4 - - @ subbytes - vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 # 0 = k - vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 # 1 = i - veor q7, q7, q4 @ vpxor %xmm4, %xmm7, %xmm7 - vtbl.8 d4, {q11}, d2 @ vpshufb %xmm1, %xmm11, %xmm2 # 2 = a/k - vtbl.8 d5, {q11}, d3 - veor q1, q1, q0 @ vpxor %xmm0, %xmm1, %xmm1 # 0 = j - vtbl.8 d6, {q10}, d0 @ vpshufb %xmm0, %xmm10, %xmm3 # 3 = 1/i - vtbl.8 d7, {q10}, d1 - veor q3, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 # 3 = iak = 1/i + a/k - vtbl.8 d8, {q10}, d2 @ vpshufb %xmm1, %xmm10, %xmm4 # 4 = 1/j - vtbl.8 d9, {q10}, d3 - veor q7, q7, q12 @ vpxor .Lk_s63(%rip), %xmm7, %xmm7 - vtbl.8 d6, {q10}, d6 @ vpshufb %xmm3, %xmm10, %xmm3 # 2 = 1/iak - vtbl.8 d7, {q10}, d7 - veor q4, q4, q2 @ vpxor %xmm2, %xmm4, %xmm4 # 4 = jak = 1/j + a/k - vtbl.8 d4, {q10}, d8 @ vpshufb %xmm4, %xmm10, %xmm2 # 3 = 1/jak - vtbl.8 d5, {q10}, d9 - veor q3, q3, q1 @ vpxor %xmm1, %xmm3, %xmm3 # 2 = io - veor q2, q2, q0 @ vpxor %xmm0, %xmm2, %xmm2 # 3 = jo - vtbl.8 d8, {q15}, d6 @ vpshufb %xmm3, %xmm13, %xmm4 # 4 = sbou - vtbl.8 d9, {q15}, d7 - vtbl.8 d2, {q14}, d4 @ vpshufb %xmm2, %xmm12, %xmm1 # 0 = sb1t - vtbl.8 d3, {q14}, d5 - veor q1, q1, q4 @ vpxor %xmm4, %xmm1, %xmm1 # 0 = sbox output - - @ add in smeared stuff - veor q0, q1, q7 @ vpxor %xmm7, %xmm1, %xmm0 - veor q7, q1, q7 @ vmovdqa %xmm0, %xmm7 - bx lr -.size _vpaes_schedule_round,.-_vpaes_schedule_round - -@@ -@@ .aes_schedule_transform -@@ -@@ Linear-transform q0 according to tables at [r11] -@@ -@@ Requires that q9 = 0x0F0F... 
as in preheat -@@ Output in q0 -@@ Clobbers q1, q2, q14, q15 -@@ -.type _vpaes_schedule_transform,%function -.align 4 -_vpaes_schedule_transform: - vld1.64 {q14,q15}, [r11] @ vmovdqa (%r11), %xmm2 # lo - @ vmovdqa 16(%r11), %xmm1 # hi - vand q1, q0, q9 @ vpand %xmm9, %xmm0, %xmm1 - vshr.u8 q0, q0, #4 @ vpsrlb $4, %xmm0, %xmm0 - vtbl.8 d4, {q14}, d2 @ vpshufb %xmm1, %xmm2, %xmm2 - vtbl.8 d5, {q14}, d3 - vtbl.8 d0, {q15}, d0 @ vpshufb %xmm0, %xmm1, %xmm0 - vtbl.8 d1, {q15}, d1 - veor q0, q0, q2 @ vpxor %xmm2, %xmm0, %xmm0 - bx lr -.size _vpaes_schedule_transform,.-_vpaes_schedule_transform - -@@ -@@ .aes_schedule_mangle -@@ -@@ Mangles q0 from (basis-transformed) standard version -@@ to our version. -@@ -@@ On encrypt, -@@ xor with 0x63 -@@ multiply by circulant 0,1,1,1 -@@ apply shiftrows transform -@@ -@@ On decrypt, -@@ xor with 0x63 -@@ multiply by "inverse mixcolumns" circulant E,B,D,9 -@@ deskew -@@ apply shiftrows transform -@@ -@@ -@@ Writes out to [r2], and increments or decrements it -@@ Keeps track of round number mod 4 in r8 -@@ Preserves q0 -@@ Clobbers q1-q5 -@@ -.type _vpaes_schedule_mangle,%function -.align 4 -_vpaes_schedule_mangle: - tst r3, r3 - vmov q4, q0 @ vmovdqa %xmm0, %xmm4 # save xmm0 for later - adr r11, .Lk_mc_forward @ Must be aligned to 8 mod 16. - vld1.64 {q5}, [r11] @ vmovdqa .Lk_mc_forward(%rip),%xmm5 - bne .Lschedule_mangle_dec - - @ encrypting - @ Write to q2 so we do not overlap table and destination below. 
- veor q2, q0, q12 @ vpxor .Lk_s63(%rip), %xmm0, %xmm4 - add r2, r2, #16 @ add $16, %rdx - vtbl.8 d8, {q2}, d10 @ vpshufb %xmm5, %xmm4, %xmm4 - vtbl.8 d9, {q2}, d11 - vtbl.8 d2, {q4}, d10 @ vpshufb %xmm5, %xmm4, %xmm1 - vtbl.8 d3, {q4}, d11 - vtbl.8 d6, {q1}, d10 @ vpshufb %xmm5, %xmm1, %xmm3 - vtbl.8 d7, {q1}, d11 - veor q4, q4, q1 @ vpxor %xmm1, %xmm4, %xmm4 - vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1 - veor q3, q3, q4 @ vpxor %xmm4, %xmm3, %xmm3 - - b .Lschedule_mangle_both -.align 4 -.Lschedule_mangle_dec: - @ inverse mix columns - adr r11, .Lk_dksd @ lea .Lk_dksd(%rip),%r11 - vshr.u8 q1, q4, #4 @ vpsrlb $4, %xmm4, %xmm1 # 1 = hi - vand q4, q4, q9 @ vpand %xmm9, %xmm4, %xmm4 # 4 = lo - - vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x00(%r11), %xmm2 - @ vmovdqa 0x10(%r11), %xmm3 - vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2 - vtbl.8 d5, {q14}, d9 - vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 - vtbl.8 d7, {q15}, d3 - @ Load .Lk_dksb ahead of time. - vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x20(%r11), %xmm2 - @ vmovdqa 0x30(%r11), %xmm3 - @ Write to q13 so we do not overlap table and destination. - veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 - vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3 - vtbl.8 d7, {q13}, d11 - - vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2 - vtbl.8 d5, {q14}, d9 - veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2 - vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 - vtbl.8 d7, {q15}, d3 - @ Load .Lk_dkse ahead of time. - vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x40(%r11), %xmm2 - @ vmovdqa 0x50(%r11), %xmm3 - @ Write to q13 so we do not overlap table and destination. - veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 - vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3 - vtbl.8 d7, {q13}, d11 - - vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2 - vtbl.8 d5, {q14}, d9 - veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2 - vtbl.8 d6, {q15}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 - vtbl.8 d7, {q15}, d3 - @ Load .Lk_dkse ahead of time. 
- vld1.64 {q14,q15}, [r11]! @ vmovdqa 0x60(%r11), %xmm2 - @ vmovdqa 0x70(%r11), %xmm4 - @ Write to q13 so we do not overlap table and destination. - veor q13, q3, q2 @ vpxor %xmm2, %xmm3, %xmm3 - - vtbl.8 d4, {q14}, d8 @ vpshufb %xmm4, %xmm2, %xmm2 - vtbl.8 d5, {q14}, d9 - vtbl.8 d6, {q13}, d10 @ vpshufb %xmm5, %xmm3, %xmm3 - vtbl.8 d7, {q13}, d11 - vtbl.8 d8, {q15}, d2 @ vpshufb %xmm1, %xmm4, %xmm4 - vtbl.8 d9, {q15}, d3 - vld1.64 {q1}, [r8] @ vmovdqa (%r8,%r10), %xmm1 - veor q2, q2, q3 @ vpxor %xmm3, %xmm2, %xmm2 - veor q3, q4, q2 @ vpxor %xmm2, %xmm4, %xmm3 - - sub r2, r2, #16 @ add $-16, %rdx - -.Lschedule_mangle_both: - @ Write to q2 so table and destination do not overlap. - vtbl.8 d4, {q3}, d2 @ vpshufb %xmm1, %xmm3, %xmm3 - vtbl.8 d5, {q3}, d3 - add r8, r8, #64-16 @ add $-16, %r8 - and r8, r8, #~(1<<6) @ and $0x30, %r8 - vst1.64 {q2}, [r2] @ vmovdqu %xmm3, (%rdx) - bx lr -.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle - -.globl vpaes_set_encrypt_key -.hidden vpaes_set_encrypt_key -.type vpaes_set_encrypt_key,%function -.align 4 -vpaes_set_encrypt_key: - stmdb sp!, {r7,r8,r9,r10,r11, lr} - vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15} - - lsr r9, r1, #5 @ shr $5,%eax - add r9, r9, #5 @ $5,%eax - str r9, [r2,#240] @ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; - - mov r3, #0 @ mov $0,%ecx - mov r8, #0x30 @ mov $0x30,%r8d - bl _vpaes_schedule_core - eor r0, r0, r0 - - vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15} - ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return -.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key - -.globl vpaes_set_decrypt_key -.hidden vpaes_set_decrypt_key -.type vpaes_set_decrypt_key,%function -.align 4 -vpaes_set_decrypt_key: - stmdb sp!, {r7,r8,r9,r10,r11, lr} - vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15} - - lsr r9, r1, #5 @ shr $5,%eax - add r9, r9, #5 @ $5,%eax - str r9, [r2,#240] @ mov %eax,240(%rdx) # AES_KEY->rounds = nbits/32+5; - lsl r9, r9, #4 @ shl $4,%eax - add r2, r2, #16 @ lea 16(%rdx,%rax),%rdx - add r2, r2, r9 - - mov 
r3, #1 @ mov $1,%ecx - lsr r8, r1, #1 @ shr $1,%r8d - and r8, r8, #32 @ and $32,%r8d - eor r8, r8, #32 @ xor $32,%r8d # nbits==192?0:32 - bl _vpaes_schedule_core - - vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15} - ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return -.size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key - -@ Additional constants for converting to bsaes. -.type _vpaes_convert_consts,%object -.align 4 -_vpaes_convert_consts: -@ .Lk_opt_then_skew applies skew(opt(x)) XOR 0x63, where skew is the linear -@ transform in the AES S-box. 0x63 is incorporated into the low half of the -@ table. This was computed with the following script: -@ -@ def u64s_to_u128(x, y): -@ return x | (y << 64) -@ def u128_to_u64s(w): -@ return w & ((1<<64)-1), w >> 64 -@ def get_byte(w, i): -@ return (w >> (i*8)) & 0xff -@ def apply_table(table, b): -@ lo = b & 0xf -@ hi = b >> 4 -@ return get_byte(table[0], lo) ^ get_byte(table[1], hi) -@ def opt(b): -@ table = [ -@ u64s_to_u128(0xFF9F4929D6B66000, 0xF7974121DEBE6808), -@ u64s_to_u128(0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0), -@ ] -@ return apply_table(table, b) -@ def rot_byte(b, n): -@ return 0xff & ((b << n) | (b >> (8-n))) -@ def skew(x): -@ return (x ^ rot_byte(x, 1) ^ rot_byte(x, 2) ^ rot_byte(x, 3) ^ -@ rot_byte(x, 4)) -@ table = [0, 0] -@ for i in range(16): -@ table[0] |= (skew(opt(i)) ^ 0x63) << (i*8) -@ table[1] |= skew(opt(i<<4)) << (i*8) -@ print(" .quad 0x%016x, 0x%016x" % u128_to_u64s(table[0])) -@ print(" .quad 0x%016x, 0x%016x" % u128_to_u64s(table[1])) -.Lk_opt_then_skew: -.quad 0x9cb8436798bc4763, 0x6440bb9f6044bf9b -.quad 0x1f30062936192f00, 0xb49bad829db284ab - -@ .Lk_decrypt_transform is a permutation which performs an 8-bit left-rotation -@ followed by a byte-swap on each 32-bit word of a vector. E.g., 0x11223344 -@ becomes 0x22334411 and then 0x11443322. 
-.Lk_decrypt_transform: -.quad 0x0704050603000102, 0x0f0c0d0e0b08090a -.size _vpaes_convert_consts,.-_vpaes_convert_consts - -@ void vpaes_encrypt_key_to_bsaes(AES_KEY *bsaes, const AES_KEY *vpaes); -.globl vpaes_encrypt_key_to_bsaes -.hidden vpaes_encrypt_key_to_bsaes -.type vpaes_encrypt_key_to_bsaes,%function -.align 4 -vpaes_encrypt_key_to_bsaes: - stmdb sp!, {r11, lr} - - @ See _vpaes_schedule_core for the key schedule logic. In particular, - @ _vpaes_schedule_transform(.Lk_ipt) (section 2.2 of the paper), - @ _vpaes_schedule_mangle (section 4.3), and .Lschedule_mangle_last - @ contain the transformations not in the bsaes representation. This - @ function inverts those transforms. - @ - @ Note also that bsaes-armv7.pl expects aes-armv4.pl's key - @ representation, which does not match the other aes_nohw_* - @ implementations. The ARM aes_nohw_* stores each 32-bit word - @ byteswapped, as a convenience for (unsupported) big-endian ARM, at the - @ cost of extra REV and VREV32 operations in little-endian ARM. - - vmov.i8 q9, #0x0f @ Required by _vpaes_schedule_transform - adr r2, .Lk_mc_forward @ Must be aligned to 8 mod 16. - add r3, r2, 0x90 @ .Lk_sr+0x10-.Lk_mc_forward = 0x90 (Apple's toolchain doesn't support the expression) - - vld1.64 {q12}, [r2] - vmov.i8 q10, #0x5b @ .Lk_s63 from vpaes-x86_64 - adr r11, .Lk_opt @ Must be aligned to 8 mod 16. - vmov.i8 q11, #0x63 @ .LK_s63 without .Lk_ipt applied - - @ vpaes stores one fewer round count than bsaes, but the number of keys - @ is the same. - ldr r2, [r1,#240] - add r2, r2, #1 - str r2, [r0,#240] - - @ The first key is transformed with _vpaes_schedule_transform(.Lk_ipt). - @ Invert this with .Lk_opt. - vld1.64 {q0}, [r1]! - bl _vpaes_schedule_transform - vrev32.8 q0, q0 - vst1.64 {q0}, [r0]! - - @ The middle keys have _vpaes_schedule_transform(.Lk_ipt) applied, - @ followed by _vpaes_schedule_mangle. _vpaes_schedule_mangle XORs 0x63, - @ multiplies by the circulant 0,1,1,1, then applies ShiftRows. 
-.Loop_enc_key_to_bsaes: - vld1.64 {q0}, [r1]! - - @ Invert the ShiftRows step (see .Lschedule_mangle_both). Note we cycle - @ r3 in the opposite direction and start at .Lk_sr+0x10 instead of 0x30. - @ We use r3 rather than r8 to avoid a callee-saved register. - vld1.64 {q1}, [r3] - vtbl.8 d4, {q0}, d2 - vtbl.8 d5, {q0}, d3 - add r3, r3, #16 - and r3, r3, #~(1<<6) - vmov q0, q2 - - @ Handle the last key differently. - subs r2, r2, #1 - beq .Loop_enc_key_to_bsaes_last - - @ Multiply by the circulant. This is its own inverse. - vtbl.8 d2, {q0}, d24 - vtbl.8 d3, {q0}, d25 - vmov q0, q1 - vtbl.8 d4, {q1}, d24 - vtbl.8 d5, {q1}, d25 - veor q0, q0, q2 - vtbl.8 d2, {q2}, d24 - vtbl.8 d3, {q2}, d25 - veor q0, q0, q1 - - @ XOR and finish. - veor q0, q0, q10 - bl _vpaes_schedule_transform - vrev32.8 q0, q0 - vst1.64 {q0}, [r0]! - b .Loop_enc_key_to_bsaes - -.Loop_enc_key_to_bsaes_last: - @ The final key does not have a basis transform (note - @ .Lschedule_mangle_last inverts the original transform). It only XORs - @ 0x63 and applies ShiftRows. The latter was already inverted in the - @ loop. Note that, because we act on the original representation, we use - @ q11, not q10. - veor q0, q0, q11 - vrev32.8 q0, q0 - vst1.64 {q0}, [r0] - - @ Wipe registers which contained key material. - veor q0, q0, q0 - veor q1, q1, q1 - veor q2, q2, q2 - - ldmia sp!, {r11, pc} @ return -.size vpaes_encrypt_key_to_bsaes,.-vpaes_encrypt_key_to_bsaes - -@ void vpaes_decrypt_key_to_bsaes(AES_KEY *vpaes, const AES_KEY *bsaes); -.globl vpaes_decrypt_key_to_bsaes -.hidden vpaes_decrypt_key_to_bsaes -.type vpaes_decrypt_key_to_bsaes,%function -.align 4 -vpaes_decrypt_key_to_bsaes: - stmdb sp!, {r11, lr} - - @ See _vpaes_schedule_core for the key schedule logic. Note vpaes - @ computes the decryption key schedule in reverse. Additionally, - @ aes-x86_64.pl shares some transformations, so we must only partially - @ invert vpaes's transformations. 
In general, vpaes computes in a - @ different basis (.Lk_ipt and .Lk_opt) and applies the inverses of - @ MixColumns, ShiftRows, and the affine part of the AES S-box (which is - @ split into a linear skew and XOR of 0x63). We undo all but MixColumns. - @ - @ Note also that bsaes-armv7.pl expects aes-armv4.pl's key - @ representation, which does not match the other aes_nohw_* - @ implementations. The ARM aes_nohw_* stores each 32-bit word - @ byteswapped, as a convenience for (unsupported) big-endian ARM, at the - @ cost of extra REV and VREV32 operations in little-endian ARM. - - adr r2, .Lk_decrypt_transform - adr r3, .Lk_sr+0x30 - adr r11, .Lk_opt_then_skew @ Input to _vpaes_schedule_transform. - vld1.64 {q12}, [r2] @ Reuse q12 from encryption. - vmov.i8 q9, #0x0f @ Required by _vpaes_schedule_transform - - @ vpaes stores one fewer round count than bsaes, but the number of keys - @ is the same. - ldr r2, [r1,#240] - add r2, r2, #1 - str r2, [r0,#240] - - @ Undo the basis change and reapply the S-box affine transform. See - @ .Lschedule_mangle_last. - vld1.64 {q0}, [r1]! - bl _vpaes_schedule_transform - vrev32.8 q0, q0 - vst1.64 {q0}, [r0]! - - @ See _vpaes_schedule_mangle for the transform on the middle keys. Note - @ it simultaneously inverts MixColumns and the S-box affine transform. - @ See .Lk_dksd through .Lk_dks9. -.Loop_dec_key_to_bsaes: - vld1.64 {q0}, [r1]! - - @ Invert the ShiftRows step (see .Lschedule_mangle_both). Note going - @ forwards cancels inverting for which direction we cycle r3. We use r3 - @ rather than r8 to avoid a callee-saved register. - vld1.64 {q1}, [r3] - vtbl.8 d4, {q0}, d2 - vtbl.8 d5, {q0}, d3 - add r3, r3, #64-16 - and r3, r3, #~(1<<6) - vmov q0, q2 - - @ Handle the last key differently. - subs r2, r2, #1 - beq .Loop_dec_key_to_bsaes_last - - @ Undo the basis change and reapply the S-box affine transform. - bl _vpaes_schedule_transform - - @ Rotate each word by 8 bytes (cycle the rows) and then byte-swap. 
We - @ combine the two operations in .Lk_decrypt_transform. - @ - @ TODO(davidben): Where does the rotation come from? - vtbl.8 d2, {q0}, d24 - vtbl.8 d3, {q0}, d25 - - vst1.64 {q1}, [r0]! - b .Loop_dec_key_to_bsaes - -.Loop_dec_key_to_bsaes_last: - @ The final key only inverts ShiftRows (already done in the loop). See - @ .Lschedule_am_decrypting. Its basis is not transformed. - vrev32.8 q0, q0 - vst1.64 {q0}, [r0]! - - @ Wipe registers which contained key material. - veor q0, q0, q0 - veor q1, q1, q1 - veor q2, q2, q2 - - ldmia sp!, {r11, pc} @ return -.size vpaes_decrypt_key_to_bsaes,.-vpaes_decrypt_key_to_bsaes -.globl vpaes_ctr32_encrypt_blocks -.hidden vpaes_ctr32_encrypt_blocks -.type vpaes_ctr32_encrypt_blocks,%function -.align 4 -vpaes_ctr32_encrypt_blocks: - mov ip, sp - stmdb sp!, {r7,r8,r9,r10,r11, lr} - @ This function uses q4-q7 (d8-d15), which are callee-saved. - vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15} - - cmp r2, #0 - @ r8 is passed on the stack. - ldr r8, [ip] - beq .Lctr32_done - - @ _vpaes_encrypt_core expects the key in r2, so swap r2 and r3. - mov r9, r3 - mov r3, r2 - mov r2, r9 - - @ Load the IV and counter portion. - ldr r7, [r8, #12] - vld1.8 {q7}, [r8] - - bl _vpaes_preheat - rev r7, r7 @ The counter is big-endian. - -.Lctr32_loop: - vmov q0, q7 - vld1.8 {q6}, [r0]! @ .Load input ahead of time - bl _vpaes_encrypt_core - veor q0, q0, q6 @ XOR input and result - vst1.8 {q0}, [r1]! - subs r3, r3, #1 - @ Update the counter. 
- add r7, r7, #1 - rev r9, r7 - vmov.32 d15[1], r9 - bne .Lctr32_loop - -.Lctr32_done: - vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15} - ldmia sp!, {r7,r8,r9,r10,r11, pc} @ return -.size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks -#endif -#endif // !OPENSSL_NO_ASM -.section .note.GNU-stack,"",%progbits diff --git a/third_party/boringssl/linux-arm/crypto/test/trampoline-armv4.S b/third_party/boringssl/linux-arm/crypto/test/trampoline-armv4.S deleted file mode 100644 index 9a73ba82..00000000 --- a/third_party/boringssl/linux-arm/crypto/test/trampoline-armv4.S +++ /dev/null @@ -1,379 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if !defined(__has_feature) -#define __has_feature(x) 0 -#endif -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif - -#if !defined(OPENSSL_NO_ASM) -#if defined(__arm__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.syntax unified - -.arch armv7-a -.fpu vfp - -.text - -@ abi_test_trampoline loads callee-saved registers from |state|, calls |func| -@ with |argv|, then saves the callee-saved registers into |state|. It returns -@ the result of |func|. The |unwind| argument is unused. -@ uint32_t abi_test_trampoline(void (*func)(...), CallerState *state, -@ const uint32_t *argv, size_t argc, -@ int unwind); -.type abi_test_trampoline, %function -.globl abi_test_trampoline -.hidden abi_test_trampoline -.align 4 -abi_test_trampoline: - @ Save parameters and all callee-saved registers. For convenience, we - @ save r9 on iOS even though it's volatile. - vstmdb sp!, {d8,d9,d10,d11,d12,d13,d14,d15} - stmdb sp!, {r0,r1,r2,r3,r4,r5,r6,r7,r8,r9,r10,r11,lr} - - @ Reserve stack space for six (10-4) stack parameters, plus an extra 4 - @ bytes to keep it 8-byte-aligned (see AAPCS, section 5.3). 
- sub sp, sp, #28 - - @ Every register in AAPCS is either non-volatile or a parameter (except - @ r9 on iOS), so this code, by the actual call, loses all its scratch - @ registers. First fill in stack parameters while there are registers - @ to spare. - cmp r3, #4 - bls .Lstack_args_done - mov r4, sp @ r4 is the output pointer. - add r5, r2, r3, lsl #2 @ Set r5 to the end of argv. - add r2, r2, #16 @ Skip four arguments. -.Lstack_args_loop: - ldr r6, [r2], #4 - cmp r2, r5 - str r6, [r4], #4 - bne .Lstack_args_loop - -.Lstack_args_done: - @ Load registers from |r1|. - vldmia r1!, {d8,d9,d10,d11,d12,d13,d14,d15} -#if defined(__APPLE__) - @ r9 is not volatile on iOS. - ldmia r1!, {r4,r5,r6,r7,r8,r10-r11} -#else - ldmia r1!, {r4,r5,r6,r7,r8,r9,r10,r11} -#endif - - @ Load register parameters. This uses up our remaining registers, so we - @ repurpose lr as scratch space. - ldr r3, [sp, #40] @ Reload argc. - ldr lr, [sp, #36] @ .Load argv into lr. - cmp r3, #3 - bhi .Larg_r3 - beq .Larg_r2 - cmp r3, #1 - bhi .Larg_r1 - beq .Larg_r0 - b .Largs_done - -.Larg_r3: - ldr r3, [lr, #12] @ argv[3] -.Larg_r2: - ldr r2, [lr, #8] @ argv[2] -.Larg_r1: - ldr r1, [lr, #4] @ argv[1] -.Larg_r0: - ldr r0, [lr] @ argv[0] -.Largs_done: - - @ With every other register in use, load the function pointer into lr - @ and call the function. - ldr lr, [sp, #28] - blx lr - - @ r1-r3 are free for use again. The trampoline only supports - @ single-return functions. Pass r4-r11 to the caller. - ldr r1, [sp, #32] - vstmia r1!, {d8,d9,d10,d11,d12,d13,d14,d15} -#if defined(__APPLE__) - @ r9 is not volatile on iOS. - stmia r1!, {r4,r5,r6,r7,r8,r10-r11} -#else - stmia r1!, {r4,r5,r6,r7,r8,r9,r10,r11} -#endif - - @ Unwind the stack and restore registers. - add sp, sp, #44 @ 44 = 28+16 - ldmia sp!, {r4,r5,r6,r7,r8,r9,r10,r11,lr} @ Skip r0-r3 (see +16 above). 
- vldmia sp!, {d8,d9,d10,d11,d12,d13,d14,d15} - - bx lr -.size abi_test_trampoline,.-abi_test_trampoline -.type abi_test_clobber_r0, %function -.globl abi_test_clobber_r0 -.hidden abi_test_clobber_r0 -.align 4 -abi_test_clobber_r0: - mov r0, #0 - bx lr -.size abi_test_clobber_r0,.-abi_test_clobber_r0 -.type abi_test_clobber_r1, %function -.globl abi_test_clobber_r1 -.hidden abi_test_clobber_r1 -.align 4 -abi_test_clobber_r1: - mov r1, #0 - bx lr -.size abi_test_clobber_r1,.-abi_test_clobber_r1 -.type abi_test_clobber_r2, %function -.globl abi_test_clobber_r2 -.hidden abi_test_clobber_r2 -.align 4 -abi_test_clobber_r2: - mov r2, #0 - bx lr -.size abi_test_clobber_r2,.-abi_test_clobber_r2 -.type abi_test_clobber_r3, %function -.globl abi_test_clobber_r3 -.hidden abi_test_clobber_r3 -.align 4 -abi_test_clobber_r3: - mov r3, #0 - bx lr -.size abi_test_clobber_r3,.-abi_test_clobber_r3 -.type abi_test_clobber_r4, %function -.globl abi_test_clobber_r4 -.hidden abi_test_clobber_r4 -.align 4 -abi_test_clobber_r4: - mov r4, #0 - bx lr -.size abi_test_clobber_r4,.-abi_test_clobber_r4 -.type abi_test_clobber_r5, %function -.globl abi_test_clobber_r5 -.hidden abi_test_clobber_r5 -.align 4 -abi_test_clobber_r5: - mov r5, #0 - bx lr -.size abi_test_clobber_r5,.-abi_test_clobber_r5 -.type abi_test_clobber_r6, %function -.globl abi_test_clobber_r6 -.hidden abi_test_clobber_r6 -.align 4 -abi_test_clobber_r6: - mov r6, #0 - bx lr -.size abi_test_clobber_r6,.-abi_test_clobber_r6 -.type abi_test_clobber_r7, %function -.globl abi_test_clobber_r7 -.hidden abi_test_clobber_r7 -.align 4 -abi_test_clobber_r7: - mov r7, #0 - bx lr -.size abi_test_clobber_r7,.-abi_test_clobber_r7 -.type abi_test_clobber_r8, %function -.globl abi_test_clobber_r8 -.hidden abi_test_clobber_r8 -.align 4 -abi_test_clobber_r8: - mov r8, #0 - bx lr -.size abi_test_clobber_r8,.-abi_test_clobber_r8 -.type abi_test_clobber_r9, %function -.globl abi_test_clobber_r9 -.hidden abi_test_clobber_r9 -.align 4 
-abi_test_clobber_r9: - mov r9, #0 - bx lr -.size abi_test_clobber_r9,.-abi_test_clobber_r9 -.type abi_test_clobber_r10, %function -.globl abi_test_clobber_r10 -.hidden abi_test_clobber_r10 -.align 4 -abi_test_clobber_r10: - mov r10, #0 - bx lr -.size abi_test_clobber_r10,.-abi_test_clobber_r10 -.type abi_test_clobber_r11, %function -.globl abi_test_clobber_r11 -.hidden abi_test_clobber_r11 -.align 4 -abi_test_clobber_r11: - mov r11, #0 - bx lr -.size abi_test_clobber_r11,.-abi_test_clobber_r11 -.type abi_test_clobber_r12, %function -.globl abi_test_clobber_r12 -.hidden abi_test_clobber_r12 -.align 4 -abi_test_clobber_r12: - mov r12, #0 - bx lr -.size abi_test_clobber_r12,.-abi_test_clobber_r12 -.type abi_test_clobber_d0, %function -.globl abi_test_clobber_d0 -.hidden abi_test_clobber_d0 -.align 4 -abi_test_clobber_d0: - mov r0, #0 - vmov s0, r0 - vmov s1, r0 - bx lr -.size abi_test_clobber_d0,.-abi_test_clobber_d0 -.type abi_test_clobber_d1, %function -.globl abi_test_clobber_d1 -.hidden abi_test_clobber_d1 -.align 4 -abi_test_clobber_d1: - mov r0, #0 - vmov s2, r0 - vmov s3, r0 - bx lr -.size abi_test_clobber_d1,.-abi_test_clobber_d1 -.type abi_test_clobber_d2, %function -.globl abi_test_clobber_d2 -.hidden abi_test_clobber_d2 -.align 4 -abi_test_clobber_d2: - mov r0, #0 - vmov s4, r0 - vmov s5, r0 - bx lr -.size abi_test_clobber_d2,.-abi_test_clobber_d2 -.type abi_test_clobber_d3, %function -.globl abi_test_clobber_d3 -.hidden abi_test_clobber_d3 -.align 4 -abi_test_clobber_d3: - mov r0, #0 - vmov s6, r0 - vmov s7, r0 - bx lr -.size abi_test_clobber_d3,.-abi_test_clobber_d3 -.type abi_test_clobber_d4, %function -.globl abi_test_clobber_d4 -.hidden abi_test_clobber_d4 -.align 4 -abi_test_clobber_d4: - mov r0, #0 - vmov s8, r0 - vmov s9, r0 - bx lr -.size abi_test_clobber_d4,.-abi_test_clobber_d4 -.type abi_test_clobber_d5, %function -.globl abi_test_clobber_d5 -.hidden abi_test_clobber_d5 -.align 4 -abi_test_clobber_d5: - mov r0, #0 - vmov s10, r0 - vmov s11, r0 
- bx lr -.size abi_test_clobber_d5,.-abi_test_clobber_d5 -.type abi_test_clobber_d6, %function -.globl abi_test_clobber_d6 -.hidden abi_test_clobber_d6 -.align 4 -abi_test_clobber_d6: - mov r0, #0 - vmov s12, r0 - vmov s13, r0 - bx lr -.size abi_test_clobber_d6,.-abi_test_clobber_d6 -.type abi_test_clobber_d7, %function -.globl abi_test_clobber_d7 -.hidden abi_test_clobber_d7 -.align 4 -abi_test_clobber_d7: - mov r0, #0 - vmov s14, r0 - vmov s15, r0 - bx lr -.size abi_test_clobber_d7,.-abi_test_clobber_d7 -.type abi_test_clobber_d8, %function -.globl abi_test_clobber_d8 -.hidden abi_test_clobber_d8 -.align 4 -abi_test_clobber_d8: - mov r0, #0 - vmov s16, r0 - vmov s17, r0 - bx lr -.size abi_test_clobber_d8,.-abi_test_clobber_d8 -.type abi_test_clobber_d9, %function -.globl abi_test_clobber_d9 -.hidden abi_test_clobber_d9 -.align 4 -abi_test_clobber_d9: - mov r0, #0 - vmov s18, r0 - vmov s19, r0 - bx lr -.size abi_test_clobber_d9,.-abi_test_clobber_d9 -.type abi_test_clobber_d10, %function -.globl abi_test_clobber_d10 -.hidden abi_test_clobber_d10 -.align 4 -abi_test_clobber_d10: - mov r0, #0 - vmov s20, r0 - vmov s21, r0 - bx lr -.size abi_test_clobber_d10,.-abi_test_clobber_d10 -.type abi_test_clobber_d11, %function -.globl abi_test_clobber_d11 -.hidden abi_test_clobber_d11 -.align 4 -abi_test_clobber_d11: - mov r0, #0 - vmov s22, r0 - vmov s23, r0 - bx lr -.size abi_test_clobber_d11,.-abi_test_clobber_d11 -.type abi_test_clobber_d12, %function -.globl abi_test_clobber_d12 -.hidden abi_test_clobber_d12 -.align 4 -abi_test_clobber_d12: - mov r0, #0 - vmov s24, r0 - vmov s25, r0 - bx lr -.size abi_test_clobber_d12,.-abi_test_clobber_d12 -.type abi_test_clobber_d13, %function -.globl abi_test_clobber_d13 -.hidden abi_test_clobber_d13 -.align 4 -abi_test_clobber_d13: - mov r0, #0 - vmov s26, r0 - vmov s27, r0 - bx lr -.size abi_test_clobber_d13,.-abi_test_clobber_d13 -.type abi_test_clobber_d14, %function -.globl abi_test_clobber_d14 -.hidden abi_test_clobber_d14 
-.align 4 -abi_test_clobber_d14: - mov r0, #0 - vmov s28, r0 - vmov s29, r0 - bx lr -.size abi_test_clobber_d14,.-abi_test_clobber_d14 -.type abi_test_clobber_d15, %function -.globl abi_test_clobber_d15 -.hidden abi_test_clobber_d15 -.align 4 -abi_test_clobber_d15: - mov r0, #0 - vmov s30, r0 - vmov s31, r0 - bx lr -.size abi_test_clobber_d15,.-abi_test_clobber_d15 -#endif -#endif // !OPENSSL_NO_ASM -.section .note.GNU-stack,"",%progbits diff --git a/third_party/boringssl/linux-ppc64le/crypto/fipsmodule/aesp8-ppc.S b/third_party/boringssl/linux-ppc64le/crypto/fipsmodule/aesp8-ppc.S deleted file mode 100644 index ea2a7f68..00000000 --- a/third_party/boringssl/linux-ppc64le/crypto/fipsmodule/aesp8-ppc.S +++ /dev/null @@ -1,3670 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if !defined(OPENSSL_NO_ASM) && defined(__powerpc64__) -.machine "any" - -.abiversion 2 -.text - -.align 7 -.Lrcon: -.byte 0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01,0x00,0x00,0x00,0x01 -.byte 0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x1b,0x00,0x00,0x00,0x1b -.byte 0x0c,0x0f,0x0e,0x0d,0x0c,0x0f,0x0e,0x0d,0x0c,0x0f,0x0e,0x0d,0x0c,0x0f,0x0e,0x0d -.byte 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 -.Lconsts: - mflr 0 - bcl 20,31,$+4 - mflr 6 - addi 6,6,-0x48 - mtlr 0 - blr -.long 0 -.byte 0,12,0x14,0,0,0,0,0 -.byte 65,69,83,32,102,111,114,32,80,111,119,101,114,73,83,65,32,50,46,48,55,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 - -.globl aes_hw_set_encrypt_key -.type aes_hw_set_encrypt_key,@function -.align 5 -aes_hw_set_encrypt_key: -.localentry aes_hw_set_encrypt_key,0 - -.Lset_encrypt_key: - mflr 11 - std 11,16(1) - - li 6,-1 - cmpldi 
3,0 - beq- .Lenc_key_abort - cmpldi 5,0 - beq- .Lenc_key_abort - li 6,-2 - cmpwi 4,128 - blt- .Lenc_key_abort - cmpwi 4,256 - bgt- .Lenc_key_abort - andi. 0,4,0x3f - bne- .Lenc_key_abort - - lis 0,0xfff0 - li 12,-1 - or 0,0,0 - - bl .Lconsts - mtlr 11 - - neg 9,3 - lvx 1,0,3 - addi 3,3,15 - lvsr 3,0,9 - li 8,0x20 - cmpwi 4,192 - lvx 2,0,3 - vspltisb 5,0x0f - lvx 4,0,6 - vxor 3,3,5 - lvx 5,8,6 - addi 6,6,0x10 - vperm 1,1,2,3 - li 7,8 - vxor 0,0,0 - mtctr 7 - - lvsl 8,0,5 - vspltisb 9,-1 - lvx 10,0,5 - vperm 9,9,0,8 - - blt .Loop128 - addi 3,3,8 - beq .L192 - addi 3,3,8 - b .L256 - -.align 4 -.Loop128: - vperm 3,1,1,5 - vsldoi 6,0,1,12 - vperm 11,1,1,8 - vsel 7,10,11,9 - vor 10,11,11 - .long 0x10632509 - stvx 7,0,5 - addi 5,5,16 - - vxor 1,1,6 - vsldoi 6,0,6,12 - vxor 1,1,6 - vsldoi 6,0,6,12 - vxor 1,1,6 - vadduwm 4,4,4 - vxor 1,1,3 - bdnz .Loop128 - - lvx 4,0,6 - - vperm 3,1,1,5 - vsldoi 6,0,1,12 - vperm 11,1,1,8 - vsel 7,10,11,9 - vor 10,11,11 - .long 0x10632509 - stvx 7,0,5 - addi 5,5,16 - - vxor 1,1,6 - vsldoi 6,0,6,12 - vxor 1,1,6 - vsldoi 6,0,6,12 - vxor 1,1,6 - vadduwm 4,4,4 - vxor 1,1,3 - - vperm 3,1,1,5 - vsldoi 6,0,1,12 - vperm 11,1,1,8 - vsel 7,10,11,9 - vor 10,11,11 - .long 0x10632509 - stvx 7,0,5 - addi 5,5,16 - - vxor 1,1,6 - vsldoi 6,0,6,12 - vxor 1,1,6 - vsldoi 6,0,6,12 - vxor 1,1,6 - vxor 1,1,3 - vperm 11,1,1,8 - vsel 7,10,11,9 - vor 10,11,11 - stvx 7,0,5 - - addi 3,5,15 - addi 5,5,0x50 - - li 8,10 - b .Ldone - -.align 4 -.L192: - lvx 6,0,3 - li 7,4 - vperm 11,1,1,8 - vsel 7,10,11,9 - vor 10,11,11 - stvx 7,0,5 - addi 5,5,16 - vperm 2,2,6,3 - vspltisb 3,8 - mtctr 7 - vsububm 5,5,3 - -.Loop192: - vperm 3,2,2,5 - vsldoi 6,0,1,12 - .long 0x10632509 - - vxor 1,1,6 - vsldoi 6,0,6,12 - vxor 1,1,6 - vsldoi 6,0,6,12 - vxor 1,1,6 - - vsldoi 7,0,2,8 - vspltw 6,1,3 - vxor 6,6,2 - vsldoi 2,0,2,12 - vadduwm 4,4,4 - vxor 2,2,6 - vxor 1,1,3 - vxor 2,2,3 - vsldoi 7,7,1,8 - - vperm 3,2,2,5 - vsldoi 6,0,1,12 - vperm 11,7,7,8 - vsel 7,10,11,9 - vor 10,11,11 - .long 
0x10632509 - stvx 7,0,5 - addi 5,5,16 - - vsldoi 7,1,2,8 - vxor 1,1,6 - vsldoi 6,0,6,12 - vperm 11,7,7,8 - vsel 7,10,11,9 - vor 10,11,11 - vxor 1,1,6 - vsldoi 6,0,6,12 - vxor 1,1,6 - stvx 7,0,5 - addi 5,5,16 - - vspltw 6,1,3 - vxor 6,6,2 - vsldoi 2,0,2,12 - vadduwm 4,4,4 - vxor 2,2,6 - vxor 1,1,3 - vxor 2,2,3 - vperm 11,1,1,8 - vsel 7,10,11,9 - vor 10,11,11 - stvx 7,0,5 - addi 3,5,15 - addi 5,5,16 - bdnz .Loop192 - - li 8,12 - addi 5,5,0x20 - b .Ldone - -.align 4 -.L256: - lvx 6,0,3 - li 7,7 - li 8,14 - vperm 11,1,1,8 - vsel 7,10,11,9 - vor 10,11,11 - stvx 7,0,5 - addi 5,5,16 - vperm 2,2,6,3 - mtctr 7 - -.Loop256: - vperm 3,2,2,5 - vsldoi 6,0,1,12 - vperm 11,2,2,8 - vsel 7,10,11,9 - vor 10,11,11 - .long 0x10632509 - stvx 7,0,5 - addi 5,5,16 - - vxor 1,1,6 - vsldoi 6,0,6,12 - vxor 1,1,6 - vsldoi 6,0,6,12 - vxor 1,1,6 - vadduwm 4,4,4 - vxor 1,1,3 - vperm 11,1,1,8 - vsel 7,10,11,9 - vor 10,11,11 - stvx 7,0,5 - addi 3,5,15 - addi 5,5,16 - bdz .Ldone - - vspltw 3,1,3 - vsldoi 6,0,2,12 - .long 0x106305C8 - - vxor 2,2,6 - vsldoi 6,0,6,12 - vxor 2,2,6 - vsldoi 6,0,6,12 - vxor 2,2,6 - - vxor 2,2,3 - b .Loop256 - -.align 4 -.Ldone: - lvx 2,0,3 - vsel 2,10,2,9 - stvx 2,0,3 - li 6,0 - or 12,12,12 - stw 8,0(5) - -.Lenc_key_abort: - mr 3,6 - blr -.long 0 -.byte 0,12,0x14,1,0,0,3,0 -.long 0 -.size aes_hw_set_encrypt_key,.-aes_hw_set_encrypt_key - -.globl aes_hw_set_decrypt_key -.type aes_hw_set_decrypt_key,@function -.align 5 -aes_hw_set_decrypt_key: -.localentry aes_hw_set_decrypt_key,0 - - stdu 1,-64(1) - mflr 10 - std 10,80(1) - bl .Lset_encrypt_key - mtlr 10 - - cmpwi 3,0 - bne- .Ldec_key_abort - - slwi 7,8,4 - subi 3,5,240 - srwi 8,8,1 - add 5,3,7 - mtctr 8 - -.Ldeckey: - lwz 0, 0(3) - lwz 6, 4(3) - lwz 7, 8(3) - lwz 8, 12(3) - addi 3,3,16 - lwz 9, 0(5) - lwz 10,4(5) - lwz 11,8(5) - lwz 12,12(5) - stw 0, 0(5) - stw 6, 4(5) - stw 7, 8(5) - stw 8, 12(5) - subi 5,5,16 - stw 9, -16(3) - stw 10,-12(3) - stw 11,-8(3) - stw 12,-4(3) - bdnz .Ldeckey - - xor 3,3,3 -.Ldec_key_abort: - 
addi 1,1,64 - blr -.long 0 -.byte 0,12,4,1,0x80,0,3,0 -.long 0 -.size aes_hw_set_decrypt_key,.-aes_hw_set_decrypt_key -.globl aes_hw_encrypt -.type aes_hw_encrypt,@function -.align 5 -aes_hw_encrypt: -.localentry aes_hw_encrypt,0 - - lwz 6,240(5) - lis 0,0xfc00 - li 12,-1 - li 7,15 - or 0,0,0 - - lvx 0,0,3 - neg 11,4 - lvx 1,7,3 - lvsl 2,0,3 - vspltisb 4,0x0f - lvsr 3,0,11 - vxor 2,2,4 - li 7,16 - vperm 0,0,1,2 - lvx 1,0,5 - lvsr 5,0,5 - srwi 6,6,1 - lvx 2,7,5 - addi 7,7,16 - subi 6,6,1 - vperm 1,2,1,5 - - vxor 0,0,1 - lvx 1,7,5 - addi 7,7,16 - mtctr 6 - -.Loop_enc: - vperm 2,1,2,5 - .long 0x10001508 - lvx 2,7,5 - addi 7,7,16 - vperm 1,2,1,5 - .long 0x10000D08 - lvx 1,7,5 - addi 7,7,16 - bdnz .Loop_enc - - vperm 2,1,2,5 - .long 0x10001508 - lvx 2,7,5 - vperm 1,2,1,5 - .long 0x10000D09 - - vspltisb 2,-1 - vxor 1,1,1 - li 7,15 - vperm 2,2,1,3 - vxor 3,3,4 - lvx 1,0,4 - vperm 0,0,0,3 - vsel 1,1,0,2 - lvx 4,7,4 - stvx 1,0,4 - vsel 0,0,4,2 - stvx 0,7,4 - - or 12,12,12 - blr -.long 0 -.byte 0,12,0x14,0,0,0,3,0 -.long 0 -.size aes_hw_encrypt,.-aes_hw_encrypt -.globl aes_hw_decrypt -.type aes_hw_decrypt,@function -.align 5 -aes_hw_decrypt: -.localentry aes_hw_decrypt,0 - - lwz 6,240(5) - lis 0,0xfc00 - li 12,-1 - li 7,15 - or 0,0,0 - - lvx 0,0,3 - neg 11,4 - lvx 1,7,3 - lvsl 2,0,3 - vspltisb 4,0x0f - lvsr 3,0,11 - vxor 2,2,4 - li 7,16 - vperm 0,0,1,2 - lvx 1,0,5 - lvsr 5,0,5 - srwi 6,6,1 - lvx 2,7,5 - addi 7,7,16 - subi 6,6,1 - vperm 1,2,1,5 - - vxor 0,0,1 - lvx 1,7,5 - addi 7,7,16 - mtctr 6 - -.Loop_dec: - vperm 2,1,2,5 - .long 0x10001548 - lvx 2,7,5 - addi 7,7,16 - vperm 1,2,1,5 - .long 0x10000D48 - lvx 1,7,5 - addi 7,7,16 - bdnz .Loop_dec - - vperm 2,1,2,5 - .long 0x10001548 - lvx 2,7,5 - vperm 1,2,1,5 - .long 0x10000D49 - - vspltisb 2,-1 - vxor 1,1,1 - li 7,15 - vperm 2,2,1,3 - vxor 3,3,4 - lvx 1,0,4 - vperm 0,0,0,3 - vsel 1,1,0,2 - lvx 4,7,4 - stvx 1,0,4 - vsel 0,0,4,2 - stvx 0,7,4 - - or 12,12,12 - blr -.long 0 -.byte 0,12,0x14,0,0,0,3,0 -.long 0 -.size 
aes_hw_decrypt,.-aes_hw_decrypt -.globl aes_hw_cbc_encrypt -.type aes_hw_cbc_encrypt,@function -.align 5 -aes_hw_cbc_encrypt: -.localentry aes_hw_cbc_encrypt,0 - - cmpldi 5,16 - .long 0x4dc00020 - - cmpwi 8,0 - lis 0,0xffe0 - li 12,-1 - or 0,0,0 - - li 10,15 - vxor 0,0,0 - vspltisb 3,0x0f - - lvx 4,0,7 - lvsl 6,0,7 - lvx 5,10,7 - vxor 6,6,3 - vperm 4,4,5,6 - - neg 11,3 - lvsr 10,0,6 - lwz 9,240(6) - - lvsr 6,0,11 - lvx 5,0,3 - addi 3,3,15 - vxor 6,6,3 - - lvsl 8,0,4 - vspltisb 9,-1 - lvx 7,0,4 - vperm 9,9,0,8 - vxor 8,8,3 - - srwi 9,9,1 - li 10,16 - subi 9,9,1 - beq .Lcbc_dec - -.Lcbc_enc: - vor 2,5,5 - lvx 5,0,3 - addi 3,3,16 - mtctr 9 - subi 5,5,16 - - lvx 0,0,6 - vperm 2,2,5,6 - lvx 1,10,6 - addi 10,10,16 - vperm 0,1,0,10 - vxor 2,2,0 - lvx 0,10,6 - addi 10,10,16 - vxor 2,2,4 - -.Loop_cbc_enc: - vperm 1,0,1,10 - .long 0x10420D08 - lvx 1,10,6 - addi 10,10,16 - vperm 0,1,0,10 - .long 0x10420508 - lvx 0,10,6 - addi 10,10,16 - bdnz .Loop_cbc_enc - - vperm 1,0,1,10 - .long 0x10420D08 - lvx 1,10,6 - li 10,16 - vperm 0,1,0,10 - .long 0x10820509 - cmpldi 5,16 - - vperm 3,4,4,8 - vsel 2,7,3,9 - vor 7,3,3 - stvx 2,0,4 - addi 4,4,16 - bge .Lcbc_enc - - b .Lcbc_done - -.align 4 -.Lcbc_dec: - cmpldi 5,128 - bge _aesp8_cbc_decrypt8x - vor 3,5,5 - lvx 5,0,3 - addi 3,3,16 - mtctr 9 - subi 5,5,16 - - lvx 0,0,6 - vperm 3,3,5,6 - lvx 1,10,6 - addi 10,10,16 - vperm 0,1,0,10 - vxor 2,3,0 - lvx 0,10,6 - addi 10,10,16 - -.Loop_cbc_dec: - vperm 1,0,1,10 - .long 0x10420D48 - lvx 1,10,6 - addi 10,10,16 - vperm 0,1,0,10 - .long 0x10420548 - lvx 0,10,6 - addi 10,10,16 - bdnz .Loop_cbc_dec - - vperm 1,0,1,10 - .long 0x10420D48 - lvx 1,10,6 - li 10,16 - vperm 0,1,0,10 - .long 0x10420549 - cmpldi 5,16 - - vxor 2,2,4 - vor 4,3,3 - vperm 3,2,2,8 - vsel 2,7,3,9 - vor 7,3,3 - stvx 2,0,4 - addi 4,4,16 - bge .Lcbc_dec - -.Lcbc_done: - addi 4,4,-1 - lvx 2,0,4 - vsel 2,7,2,9 - stvx 2,0,4 - - neg 8,7 - li 10,15 - vxor 0,0,0 - vspltisb 9,-1 - vspltisb 3,0x0f - lvsr 8,0,8 - vperm 9,9,0,8 - vxor 8,8,3 - 
lvx 7,0,7 - vperm 4,4,4,8 - vsel 2,7,4,9 - lvx 5,10,7 - stvx 2,0,7 - vsel 2,4,5,9 - stvx 2,10,7 - - or 12,12,12 - blr -.long 0 -.byte 0,12,0x14,0,0,0,6,0 -.long 0 -.align 5 -_aesp8_cbc_decrypt8x: - stdu 1,-448(1) - li 10,207 - li 11,223 - stvx 20,10,1 - addi 10,10,32 - stvx 21,11,1 - addi 11,11,32 - stvx 22,10,1 - addi 10,10,32 - stvx 23,11,1 - addi 11,11,32 - stvx 24,10,1 - addi 10,10,32 - stvx 25,11,1 - addi 11,11,32 - stvx 26,10,1 - addi 10,10,32 - stvx 27,11,1 - addi 11,11,32 - stvx 28,10,1 - addi 10,10,32 - stvx 29,11,1 - addi 11,11,32 - stvx 30,10,1 - stvx 31,11,1 - li 0,-1 - stw 12,396(1) - li 8,0x10 - std 26,400(1) - li 26,0x20 - std 27,408(1) - li 27,0x30 - std 28,416(1) - li 28,0x40 - std 29,424(1) - li 29,0x50 - std 30,432(1) - li 30,0x60 - std 31,440(1) - li 31,0x70 - or 0,0,0 - - subi 9,9,3 - subi 5,5,128 - - lvx 23,0,6 - lvx 30,8,6 - addi 6,6,0x20 - lvx 31,0,6 - vperm 23,30,23,10 - addi 11,1,79 - mtctr 9 - -.Load_cbc_dec_key: - vperm 24,31,30,10 - lvx 30,8,6 - addi 6,6,0x20 - stvx 24,0,11 - vperm 25,30,31,10 - lvx 31,0,6 - stvx 25,8,11 - addi 11,11,0x20 - bdnz .Load_cbc_dec_key - - lvx 26,8,6 - vperm 24,31,30,10 - lvx 27,26,6 - stvx 24,0,11 - vperm 25,26,31,10 - lvx 28,27,6 - stvx 25,8,11 - addi 11,1,79 - vperm 26,27,26,10 - lvx 29,28,6 - vperm 27,28,27,10 - lvx 30,29,6 - vperm 28,29,28,10 - lvx 31,30,6 - vperm 29,30,29,10 - lvx 14,31,6 - vperm 30,31,30,10 - lvx 24,0,11 - vperm 31,14,31,10 - lvx 25,8,11 - - - - subi 3,3,15 - - li 10,8 - .long 0x7C001E99 - lvsl 6,0,10 - vspltisb 3,0x0f - .long 0x7C281E99 - vxor 6,6,3 - .long 0x7C5A1E99 - vperm 0,0,0,6 - .long 0x7C7B1E99 - vperm 1,1,1,6 - .long 0x7D5C1E99 - vperm 2,2,2,6 - vxor 14,0,23 - .long 0x7D7D1E99 - vperm 3,3,3,6 - vxor 15,1,23 - .long 0x7D9E1E99 - vperm 10,10,10,6 - vxor 16,2,23 - .long 0x7DBF1E99 - addi 3,3,0x80 - vperm 11,11,11,6 - vxor 17,3,23 - vperm 12,12,12,6 - vxor 18,10,23 - vperm 13,13,13,6 - vxor 19,11,23 - vxor 20,12,23 - vxor 21,13,23 - - mtctr 9 - b .Loop_cbc_dec8x -.align 5 
-.Loop_cbc_dec8x: - .long 0x11CEC548 - .long 0x11EFC548 - .long 0x1210C548 - .long 0x1231C548 - .long 0x1252C548 - .long 0x1273C548 - .long 0x1294C548 - .long 0x12B5C548 - lvx 24,26,11 - addi 11,11,0x20 - - .long 0x11CECD48 - .long 0x11EFCD48 - .long 0x1210CD48 - .long 0x1231CD48 - .long 0x1252CD48 - .long 0x1273CD48 - .long 0x1294CD48 - .long 0x12B5CD48 - lvx 25,8,11 - bdnz .Loop_cbc_dec8x - - subic 5,5,128 - .long 0x11CEC548 - .long 0x11EFC548 - .long 0x1210C548 - .long 0x1231C548 - .long 0x1252C548 - .long 0x1273C548 - .long 0x1294C548 - .long 0x12B5C548 - - subfe. 0,0,0 - .long 0x11CECD48 - .long 0x11EFCD48 - .long 0x1210CD48 - .long 0x1231CD48 - .long 0x1252CD48 - .long 0x1273CD48 - .long 0x1294CD48 - .long 0x12B5CD48 - - and 0,0,5 - .long 0x11CED548 - .long 0x11EFD548 - .long 0x1210D548 - .long 0x1231D548 - .long 0x1252D548 - .long 0x1273D548 - .long 0x1294D548 - .long 0x12B5D548 - - add 3,3,0 - - - - .long 0x11CEDD48 - .long 0x11EFDD48 - .long 0x1210DD48 - .long 0x1231DD48 - .long 0x1252DD48 - .long 0x1273DD48 - .long 0x1294DD48 - .long 0x12B5DD48 - - addi 11,1,79 - .long 0x11CEE548 - .long 0x11EFE548 - .long 0x1210E548 - .long 0x1231E548 - .long 0x1252E548 - .long 0x1273E548 - .long 0x1294E548 - .long 0x12B5E548 - lvx 24,0,11 - - .long 0x11CEED48 - .long 0x11EFED48 - .long 0x1210ED48 - .long 0x1231ED48 - .long 0x1252ED48 - .long 0x1273ED48 - .long 0x1294ED48 - .long 0x12B5ED48 - lvx 25,8,11 - - .long 0x11CEF548 - vxor 4,4,31 - .long 0x11EFF548 - vxor 0,0,31 - .long 0x1210F548 - vxor 1,1,31 - .long 0x1231F548 - vxor 2,2,31 - .long 0x1252F548 - vxor 3,3,31 - .long 0x1273F548 - vxor 10,10,31 - .long 0x1294F548 - vxor 11,11,31 - .long 0x12B5F548 - vxor 12,12,31 - - .long 0x11CE2549 - .long 0x11EF0549 - .long 0x7C001E99 - .long 0x12100D49 - .long 0x7C281E99 - .long 0x12311549 - vperm 0,0,0,6 - .long 0x7C5A1E99 - .long 0x12521D49 - vperm 1,1,1,6 - .long 0x7C7B1E99 - .long 0x12735549 - vperm 2,2,2,6 - .long 0x7D5C1E99 - .long 0x12945D49 - vperm 3,3,3,6 - .long 
0x7D7D1E99 - .long 0x12B56549 - vperm 10,10,10,6 - .long 0x7D9E1E99 - vor 4,13,13 - vperm 11,11,11,6 - .long 0x7DBF1E99 - addi 3,3,0x80 - - vperm 14,14,14,6 - vperm 15,15,15,6 - .long 0x7DC02799 - vperm 12,12,12,6 - vxor 14,0,23 - vperm 16,16,16,6 - .long 0x7DE82799 - vperm 13,13,13,6 - vxor 15,1,23 - vperm 17,17,17,6 - .long 0x7E1A2799 - vxor 16,2,23 - vperm 18,18,18,6 - .long 0x7E3B2799 - vxor 17,3,23 - vperm 19,19,19,6 - .long 0x7E5C2799 - vxor 18,10,23 - vperm 20,20,20,6 - .long 0x7E7D2799 - vxor 19,11,23 - vperm 21,21,21,6 - .long 0x7E9E2799 - vxor 20,12,23 - .long 0x7EBF2799 - addi 4,4,0x80 - vxor 21,13,23 - - mtctr 9 - beq .Loop_cbc_dec8x - - addic. 5,5,128 - beq .Lcbc_dec8x_done - nop - nop - -.Loop_cbc_dec8x_tail: - .long 0x11EFC548 - .long 0x1210C548 - .long 0x1231C548 - .long 0x1252C548 - .long 0x1273C548 - .long 0x1294C548 - .long 0x12B5C548 - lvx 24,26,11 - addi 11,11,0x20 - - .long 0x11EFCD48 - .long 0x1210CD48 - .long 0x1231CD48 - .long 0x1252CD48 - .long 0x1273CD48 - .long 0x1294CD48 - .long 0x12B5CD48 - lvx 25,8,11 - bdnz .Loop_cbc_dec8x_tail - - .long 0x11EFC548 - .long 0x1210C548 - .long 0x1231C548 - .long 0x1252C548 - .long 0x1273C548 - .long 0x1294C548 - .long 0x12B5C548 - - .long 0x11EFCD48 - .long 0x1210CD48 - .long 0x1231CD48 - .long 0x1252CD48 - .long 0x1273CD48 - .long 0x1294CD48 - .long 0x12B5CD48 - - .long 0x11EFD548 - .long 0x1210D548 - .long 0x1231D548 - .long 0x1252D548 - .long 0x1273D548 - .long 0x1294D548 - .long 0x12B5D548 - - .long 0x11EFDD48 - .long 0x1210DD48 - .long 0x1231DD48 - .long 0x1252DD48 - .long 0x1273DD48 - .long 0x1294DD48 - .long 0x12B5DD48 - - .long 0x11EFE548 - .long 0x1210E548 - .long 0x1231E548 - .long 0x1252E548 - .long 0x1273E548 - .long 0x1294E548 - .long 0x12B5E548 - - .long 0x11EFED48 - .long 0x1210ED48 - .long 0x1231ED48 - .long 0x1252ED48 - .long 0x1273ED48 - .long 0x1294ED48 - .long 0x12B5ED48 - - .long 0x11EFF548 - vxor 4,4,31 - .long 0x1210F548 - vxor 1,1,31 - .long 0x1231F548 - vxor 2,2,31 - .long 
0x1252F548 - vxor 3,3,31 - .long 0x1273F548 - vxor 10,10,31 - .long 0x1294F548 - vxor 11,11,31 - .long 0x12B5F548 - vxor 12,12,31 - - cmplwi 5,32 - blt .Lcbc_dec8x_one - nop - beq .Lcbc_dec8x_two - cmplwi 5,64 - blt .Lcbc_dec8x_three - nop - beq .Lcbc_dec8x_four - cmplwi 5,96 - blt .Lcbc_dec8x_five - nop - beq .Lcbc_dec8x_six - -.Lcbc_dec8x_seven: - .long 0x11EF2549 - .long 0x12100D49 - .long 0x12311549 - .long 0x12521D49 - .long 0x12735549 - .long 0x12945D49 - .long 0x12B56549 - vor 4,13,13 - - vperm 15,15,15,6 - vperm 16,16,16,6 - .long 0x7DE02799 - vperm 17,17,17,6 - .long 0x7E082799 - vperm 18,18,18,6 - .long 0x7E3A2799 - vperm 19,19,19,6 - .long 0x7E5B2799 - vperm 20,20,20,6 - .long 0x7E7C2799 - vperm 21,21,21,6 - .long 0x7E9D2799 - .long 0x7EBE2799 - addi 4,4,0x70 - b .Lcbc_dec8x_done - -.align 5 -.Lcbc_dec8x_six: - .long 0x12102549 - .long 0x12311549 - .long 0x12521D49 - .long 0x12735549 - .long 0x12945D49 - .long 0x12B56549 - vor 4,13,13 - - vperm 16,16,16,6 - vperm 17,17,17,6 - .long 0x7E002799 - vperm 18,18,18,6 - .long 0x7E282799 - vperm 19,19,19,6 - .long 0x7E5A2799 - vperm 20,20,20,6 - .long 0x7E7B2799 - vperm 21,21,21,6 - .long 0x7E9C2799 - .long 0x7EBD2799 - addi 4,4,0x60 - b .Lcbc_dec8x_done - -.align 5 -.Lcbc_dec8x_five: - .long 0x12312549 - .long 0x12521D49 - .long 0x12735549 - .long 0x12945D49 - .long 0x12B56549 - vor 4,13,13 - - vperm 17,17,17,6 - vperm 18,18,18,6 - .long 0x7E202799 - vperm 19,19,19,6 - .long 0x7E482799 - vperm 20,20,20,6 - .long 0x7E7A2799 - vperm 21,21,21,6 - .long 0x7E9B2799 - .long 0x7EBC2799 - addi 4,4,0x50 - b .Lcbc_dec8x_done - -.align 5 -.Lcbc_dec8x_four: - .long 0x12522549 - .long 0x12735549 - .long 0x12945D49 - .long 0x12B56549 - vor 4,13,13 - - vperm 18,18,18,6 - vperm 19,19,19,6 - .long 0x7E402799 - vperm 20,20,20,6 - .long 0x7E682799 - vperm 21,21,21,6 - .long 0x7E9A2799 - .long 0x7EBB2799 - addi 4,4,0x40 - b .Lcbc_dec8x_done - -.align 5 -.Lcbc_dec8x_three: - .long 0x12732549 - .long 0x12945D49 - .long 0x12B56549 - 
vor 4,13,13 - - vperm 19,19,19,6 - vperm 20,20,20,6 - .long 0x7E602799 - vperm 21,21,21,6 - .long 0x7E882799 - .long 0x7EBA2799 - addi 4,4,0x30 - b .Lcbc_dec8x_done - -.align 5 -.Lcbc_dec8x_two: - .long 0x12942549 - .long 0x12B56549 - vor 4,13,13 - - vperm 20,20,20,6 - vperm 21,21,21,6 - .long 0x7E802799 - .long 0x7EA82799 - addi 4,4,0x20 - b .Lcbc_dec8x_done - -.align 5 -.Lcbc_dec8x_one: - .long 0x12B52549 - vor 4,13,13 - - vperm 21,21,21,6 - .long 0x7EA02799 - addi 4,4,0x10 - -.Lcbc_dec8x_done: - vperm 4,4,4,6 - .long 0x7C803F99 - - li 10,79 - li 11,95 - stvx 6,10,1 - addi 10,10,32 - stvx 6,11,1 - addi 11,11,32 - stvx 6,10,1 - addi 10,10,32 - stvx 6,11,1 - addi 11,11,32 - stvx 6,10,1 - addi 10,10,32 - stvx 6,11,1 - addi 11,11,32 - stvx 6,10,1 - addi 10,10,32 - stvx 6,11,1 - addi 11,11,32 - - or 12,12,12 - lvx 20,10,1 - addi 10,10,32 - lvx 21,11,1 - addi 11,11,32 - lvx 22,10,1 - addi 10,10,32 - lvx 23,11,1 - addi 11,11,32 - lvx 24,10,1 - addi 10,10,32 - lvx 25,11,1 - addi 11,11,32 - lvx 26,10,1 - addi 10,10,32 - lvx 27,11,1 - addi 11,11,32 - lvx 28,10,1 - addi 10,10,32 - lvx 29,11,1 - addi 11,11,32 - lvx 30,10,1 - lvx 31,11,1 - ld 26,400(1) - ld 27,408(1) - ld 28,416(1) - ld 29,424(1) - ld 30,432(1) - ld 31,440(1) - addi 1,1,448 - blr -.long 0 -.byte 0,12,0x04,0,0x80,6,6,0 -.long 0 -.size aes_hw_cbc_encrypt,.-aes_hw_cbc_encrypt -.globl aes_hw_ctr32_encrypt_blocks -.type aes_hw_ctr32_encrypt_blocks,@function -.align 5 -aes_hw_ctr32_encrypt_blocks: -.localentry aes_hw_ctr32_encrypt_blocks,0 - - cmpldi 5,1 - .long 0x4dc00020 - - lis 0,0xfff0 - li 12,-1 - or 0,0,0 - - li 10,15 - vxor 0,0,0 - vspltisb 3,0x0f - - lvx 4,0,7 - lvsl 6,0,7 - lvx 5,10,7 - vspltisb 11,1 - vxor 6,6,3 - vperm 4,4,5,6 - vsldoi 11,0,11,1 - - neg 11,3 - lvsr 10,0,6 - lwz 9,240(6) - - lvsr 6,0,11 - lvx 5,0,3 - addi 3,3,15 - vxor 6,6,3 - - srwi 9,9,1 - li 10,16 - subi 9,9,1 - - cmpldi 5,8 - bge _aesp8_ctr32_encrypt8x - - lvsl 8,0,4 - vspltisb 9,-1 - lvx 7,0,4 - vperm 9,9,0,8 - vxor 8,8,3 - - lvx 
0,0,6 - mtctr 9 - lvx 1,10,6 - addi 10,10,16 - vperm 0,1,0,10 - vxor 2,4,0 - lvx 0,10,6 - addi 10,10,16 - b .Loop_ctr32_enc - -.align 5 -.Loop_ctr32_enc: - vperm 1,0,1,10 - .long 0x10420D08 - lvx 1,10,6 - addi 10,10,16 - vperm 0,1,0,10 - .long 0x10420508 - lvx 0,10,6 - addi 10,10,16 - bdnz .Loop_ctr32_enc - - vadduwm 4,4,11 - vor 3,5,5 - lvx 5,0,3 - addi 3,3,16 - subic. 5,5,1 - - vperm 1,0,1,10 - .long 0x10420D08 - lvx 1,10,6 - vperm 3,3,5,6 - li 10,16 - vperm 1,1,0,10 - lvx 0,0,6 - vxor 3,3,1 - .long 0x10421D09 - - lvx 1,10,6 - addi 10,10,16 - vperm 2,2,2,8 - vsel 3,7,2,9 - mtctr 9 - vperm 0,1,0,10 - vor 7,2,2 - vxor 2,4,0 - lvx 0,10,6 - addi 10,10,16 - stvx 3,0,4 - addi 4,4,16 - bne .Loop_ctr32_enc - - addi 4,4,-1 - lvx 2,0,4 - vsel 2,7,2,9 - stvx 2,0,4 - - or 12,12,12 - blr -.long 0 -.byte 0,12,0x14,0,0,0,6,0 -.long 0 -.align 5 -_aesp8_ctr32_encrypt8x: - stdu 1,-448(1) - li 10,207 - li 11,223 - stvx 20,10,1 - addi 10,10,32 - stvx 21,11,1 - addi 11,11,32 - stvx 22,10,1 - addi 10,10,32 - stvx 23,11,1 - addi 11,11,32 - stvx 24,10,1 - addi 10,10,32 - stvx 25,11,1 - addi 11,11,32 - stvx 26,10,1 - addi 10,10,32 - stvx 27,11,1 - addi 11,11,32 - stvx 28,10,1 - addi 10,10,32 - stvx 29,11,1 - addi 11,11,32 - stvx 30,10,1 - stvx 31,11,1 - li 0,-1 - stw 12,396(1) - li 8,0x10 - std 26,400(1) - li 26,0x20 - std 27,408(1) - li 27,0x30 - std 28,416(1) - li 28,0x40 - std 29,424(1) - li 29,0x50 - std 30,432(1) - li 30,0x60 - std 31,440(1) - li 31,0x70 - or 0,0,0 - - subi 9,9,3 - - lvx 23,0,6 - lvx 30,8,6 - addi 6,6,0x20 - lvx 31,0,6 - vperm 23,30,23,10 - addi 11,1,79 - mtctr 9 - -.Load_ctr32_enc_key: - vperm 24,31,30,10 - lvx 30,8,6 - addi 6,6,0x20 - stvx 24,0,11 - vperm 25,30,31,10 - lvx 31,0,6 - stvx 25,8,11 - addi 11,11,0x20 - bdnz .Load_ctr32_enc_key - - lvx 26,8,6 - vperm 24,31,30,10 - lvx 27,26,6 - stvx 24,0,11 - vperm 25,26,31,10 - lvx 28,27,6 - stvx 25,8,11 - addi 11,1,79 - vperm 26,27,26,10 - lvx 29,28,6 - vperm 27,28,27,10 - lvx 30,29,6 - vperm 28,29,28,10 - lvx 31,30,6 
- vperm 29,30,29,10 - lvx 15,31,6 - vperm 30,31,30,10 - lvx 24,0,11 - vperm 31,15,31,10 - lvx 25,8,11 - - vadduwm 7,11,11 - subi 3,3,15 - sldi 5,5,4 - - vadduwm 16,4,11 - vadduwm 17,4,7 - vxor 15,4,23 - li 10,8 - vadduwm 18,16,7 - vxor 16,16,23 - lvsl 6,0,10 - vadduwm 19,17,7 - vxor 17,17,23 - vspltisb 3,0x0f - vadduwm 20,18,7 - vxor 18,18,23 - vxor 6,6,3 - vadduwm 21,19,7 - vxor 19,19,23 - vadduwm 22,20,7 - vxor 20,20,23 - vadduwm 4,21,7 - vxor 21,21,23 - vxor 22,22,23 - - mtctr 9 - b .Loop_ctr32_enc8x -.align 5 -.Loop_ctr32_enc8x: - .long 0x11EFC508 - .long 0x1210C508 - .long 0x1231C508 - .long 0x1252C508 - .long 0x1273C508 - .long 0x1294C508 - .long 0x12B5C508 - .long 0x12D6C508 -.Loop_ctr32_enc8x_middle: - lvx 24,26,11 - addi 11,11,0x20 - - .long 0x11EFCD08 - .long 0x1210CD08 - .long 0x1231CD08 - .long 0x1252CD08 - .long 0x1273CD08 - .long 0x1294CD08 - .long 0x12B5CD08 - .long 0x12D6CD08 - lvx 25,8,11 - bdnz .Loop_ctr32_enc8x - - subic 11,5,256 - .long 0x11EFC508 - .long 0x1210C508 - .long 0x1231C508 - .long 0x1252C508 - .long 0x1273C508 - .long 0x1294C508 - .long 0x12B5C508 - .long 0x12D6C508 - - subfe 0,0,0 - .long 0x11EFCD08 - .long 0x1210CD08 - .long 0x1231CD08 - .long 0x1252CD08 - .long 0x1273CD08 - .long 0x1294CD08 - .long 0x12B5CD08 - .long 0x12D6CD08 - - and 0,0,11 - addi 11,1,79 - .long 0x11EFD508 - .long 0x1210D508 - .long 0x1231D508 - .long 0x1252D508 - .long 0x1273D508 - .long 0x1294D508 - .long 0x12B5D508 - .long 0x12D6D508 - lvx 24,0,11 - - subic 5,5,129 - .long 0x11EFDD08 - addi 5,5,1 - .long 0x1210DD08 - .long 0x1231DD08 - .long 0x1252DD08 - .long 0x1273DD08 - .long 0x1294DD08 - .long 0x12B5DD08 - .long 0x12D6DD08 - lvx 25,8,11 - - .long 0x11EFE508 - .long 0x7C001E99 - .long 0x1210E508 - .long 0x7C281E99 - .long 0x1231E508 - .long 0x7C5A1E99 - .long 0x1252E508 - .long 0x7C7B1E99 - .long 0x1273E508 - .long 0x7D5C1E99 - .long 0x1294E508 - .long 0x7D9D1E99 - .long 0x12B5E508 - .long 0x7DBE1E99 - .long 0x12D6E508 - .long 0x7DDF1E99 - addi 3,3,0x80 - 
- .long 0x11EFED08 - vperm 0,0,0,6 - .long 0x1210ED08 - vperm 1,1,1,6 - .long 0x1231ED08 - vperm 2,2,2,6 - .long 0x1252ED08 - vperm 3,3,3,6 - .long 0x1273ED08 - vperm 10,10,10,6 - .long 0x1294ED08 - vperm 12,12,12,6 - .long 0x12B5ED08 - vperm 13,13,13,6 - .long 0x12D6ED08 - vperm 14,14,14,6 - - add 3,3,0 - - - - subfe. 0,0,0 - .long 0x11EFF508 - vxor 0,0,31 - .long 0x1210F508 - vxor 1,1,31 - .long 0x1231F508 - vxor 2,2,31 - .long 0x1252F508 - vxor 3,3,31 - .long 0x1273F508 - vxor 10,10,31 - .long 0x1294F508 - vxor 12,12,31 - .long 0x12B5F508 - vxor 13,13,31 - .long 0x12D6F508 - vxor 14,14,31 - - bne .Lctr32_enc8x_break - - .long 0x100F0509 - .long 0x10300D09 - vadduwm 16,4,11 - .long 0x10511509 - vadduwm 17,4,7 - vxor 15,4,23 - .long 0x10721D09 - vadduwm 18,16,7 - vxor 16,16,23 - .long 0x11535509 - vadduwm 19,17,7 - vxor 17,17,23 - .long 0x11946509 - vadduwm 20,18,7 - vxor 18,18,23 - .long 0x11B56D09 - vadduwm 21,19,7 - vxor 19,19,23 - .long 0x11D67509 - vadduwm 22,20,7 - vxor 20,20,23 - vperm 0,0,0,6 - vadduwm 4,21,7 - vxor 21,21,23 - vperm 1,1,1,6 - vxor 22,22,23 - mtctr 9 - - .long 0x11EFC508 - .long 0x7C002799 - vperm 2,2,2,6 - .long 0x1210C508 - .long 0x7C282799 - vperm 3,3,3,6 - .long 0x1231C508 - .long 0x7C5A2799 - vperm 10,10,10,6 - .long 0x1252C508 - .long 0x7C7B2799 - vperm 12,12,12,6 - .long 0x1273C508 - .long 0x7D5C2799 - vperm 13,13,13,6 - .long 0x1294C508 - .long 0x7D9D2799 - vperm 14,14,14,6 - .long 0x12B5C508 - .long 0x7DBE2799 - .long 0x12D6C508 - .long 0x7DDF2799 - addi 4,4,0x80 - - b .Loop_ctr32_enc8x_middle - -.align 5 -.Lctr32_enc8x_break: - cmpwi 5,-0x60 - blt .Lctr32_enc8x_one - nop - beq .Lctr32_enc8x_two - cmpwi 5,-0x40 - blt .Lctr32_enc8x_three - nop - beq .Lctr32_enc8x_four - cmpwi 5,-0x20 - blt .Lctr32_enc8x_five - nop - beq .Lctr32_enc8x_six - cmpwi 5,0x00 - blt .Lctr32_enc8x_seven - -.Lctr32_enc8x_eight: - .long 0x11EF0509 - .long 0x12100D09 - .long 0x12311509 - .long 0x12521D09 - .long 0x12735509 - .long 0x12946509 - .long 0x12B56D09 
- .long 0x12D67509 - - vperm 15,15,15,6 - vperm 16,16,16,6 - .long 0x7DE02799 - vperm 17,17,17,6 - .long 0x7E082799 - vperm 18,18,18,6 - .long 0x7E3A2799 - vperm 19,19,19,6 - .long 0x7E5B2799 - vperm 20,20,20,6 - .long 0x7E7C2799 - vperm 21,21,21,6 - .long 0x7E9D2799 - vperm 22,22,22,6 - .long 0x7EBE2799 - .long 0x7EDF2799 - addi 4,4,0x80 - b .Lctr32_enc8x_done - -.align 5 -.Lctr32_enc8x_seven: - .long 0x11EF0D09 - .long 0x12101509 - .long 0x12311D09 - .long 0x12525509 - .long 0x12736509 - .long 0x12946D09 - .long 0x12B57509 - - vperm 15,15,15,6 - vperm 16,16,16,6 - .long 0x7DE02799 - vperm 17,17,17,6 - .long 0x7E082799 - vperm 18,18,18,6 - .long 0x7E3A2799 - vperm 19,19,19,6 - .long 0x7E5B2799 - vperm 20,20,20,6 - .long 0x7E7C2799 - vperm 21,21,21,6 - .long 0x7E9D2799 - .long 0x7EBE2799 - addi 4,4,0x70 - b .Lctr32_enc8x_done - -.align 5 -.Lctr32_enc8x_six: - .long 0x11EF1509 - .long 0x12101D09 - .long 0x12315509 - .long 0x12526509 - .long 0x12736D09 - .long 0x12947509 - - vperm 15,15,15,6 - vperm 16,16,16,6 - .long 0x7DE02799 - vperm 17,17,17,6 - .long 0x7E082799 - vperm 18,18,18,6 - .long 0x7E3A2799 - vperm 19,19,19,6 - .long 0x7E5B2799 - vperm 20,20,20,6 - .long 0x7E7C2799 - .long 0x7E9D2799 - addi 4,4,0x60 - b .Lctr32_enc8x_done - -.align 5 -.Lctr32_enc8x_five: - .long 0x11EF1D09 - .long 0x12105509 - .long 0x12316509 - .long 0x12526D09 - .long 0x12737509 - - vperm 15,15,15,6 - vperm 16,16,16,6 - .long 0x7DE02799 - vperm 17,17,17,6 - .long 0x7E082799 - vperm 18,18,18,6 - .long 0x7E3A2799 - vperm 19,19,19,6 - .long 0x7E5B2799 - .long 0x7E7C2799 - addi 4,4,0x50 - b .Lctr32_enc8x_done - -.align 5 -.Lctr32_enc8x_four: - .long 0x11EF5509 - .long 0x12106509 - .long 0x12316D09 - .long 0x12527509 - - vperm 15,15,15,6 - vperm 16,16,16,6 - .long 0x7DE02799 - vperm 17,17,17,6 - .long 0x7E082799 - vperm 18,18,18,6 - .long 0x7E3A2799 - .long 0x7E5B2799 - addi 4,4,0x40 - b .Lctr32_enc8x_done - -.align 5 -.Lctr32_enc8x_three: - .long 0x11EF6509 - .long 0x12106D09 - .long 
0x12317509 - - vperm 15,15,15,6 - vperm 16,16,16,6 - .long 0x7DE02799 - vperm 17,17,17,6 - .long 0x7E082799 - .long 0x7E3A2799 - addi 4,4,0x30 - b .Lctr32_enc8x_done - -.align 5 -.Lctr32_enc8x_two: - .long 0x11EF6D09 - .long 0x12107509 - - vperm 15,15,15,6 - vperm 16,16,16,6 - .long 0x7DE02799 - .long 0x7E082799 - addi 4,4,0x20 - b .Lctr32_enc8x_done - -.align 5 -.Lctr32_enc8x_one: - .long 0x11EF7509 - - vperm 15,15,15,6 - .long 0x7DE02799 - addi 4,4,0x10 - -.Lctr32_enc8x_done: - li 10,79 - li 11,95 - stvx 6,10,1 - addi 10,10,32 - stvx 6,11,1 - addi 11,11,32 - stvx 6,10,1 - addi 10,10,32 - stvx 6,11,1 - addi 11,11,32 - stvx 6,10,1 - addi 10,10,32 - stvx 6,11,1 - addi 11,11,32 - stvx 6,10,1 - addi 10,10,32 - stvx 6,11,1 - addi 11,11,32 - - or 12,12,12 - lvx 20,10,1 - addi 10,10,32 - lvx 21,11,1 - addi 11,11,32 - lvx 22,10,1 - addi 10,10,32 - lvx 23,11,1 - addi 11,11,32 - lvx 24,10,1 - addi 10,10,32 - lvx 25,11,1 - addi 11,11,32 - lvx 26,10,1 - addi 10,10,32 - lvx 27,11,1 - addi 11,11,32 - lvx 28,10,1 - addi 10,10,32 - lvx 29,11,1 - addi 11,11,32 - lvx 30,10,1 - lvx 31,11,1 - ld 26,400(1) - ld 27,408(1) - ld 28,416(1) - ld 29,424(1) - ld 30,432(1) - ld 31,440(1) - addi 1,1,448 - blr -.long 0 -.byte 0,12,0x04,0,0x80,6,6,0 -.long 0 -.size aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks -.globl aes_hw_xts_encrypt -.type aes_hw_xts_encrypt,@function -.align 5 -aes_hw_xts_encrypt: -.localentry aes_hw_xts_encrypt,0 - - mr 10,3 - li 3,-1 - cmpldi 5,16 - .long 0x4dc00020 - - lis 0,0xfff0 - li 12,-1 - li 11,0 - or 0,0,0 - - vspltisb 9,0x07 - lvsl 6,11,11 - vspltisb 11,0x0f - vxor 6,6,9 - - li 3,15 - lvx 8,0,8 - lvsl 5,0,8 - lvx 4,3,8 - vxor 5,5,11 - vperm 8,8,4,5 - - neg 11,10 - lvsr 5,0,11 - lvx 2,0,10 - addi 10,10,15 - vxor 5,5,11 - - cmpldi 7,0 - beq .Lxts_enc_no_key2 - - lvsr 7,0,7 - lwz 9,240(7) - srwi 9,9,1 - subi 9,9,1 - li 3,16 - - lvx 0,0,7 - lvx 1,3,7 - addi 3,3,16 - vperm 0,1,0,7 - vxor 8,8,0 - lvx 0,3,7 - addi 3,3,16 - mtctr 9 - -.Ltweak_xts_enc: - vperm 
1,0,1,7 - .long 0x11080D08 - lvx 1,3,7 - addi 3,3,16 - vperm 0,1,0,7 - .long 0x11080508 - lvx 0,3,7 - addi 3,3,16 - bdnz .Ltweak_xts_enc - - vperm 1,0,1,7 - .long 0x11080D08 - lvx 1,3,7 - vperm 0,1,0,7 - .long 0x11080509 - - li 8,0 - b .Lxts_enc - -.Lxts_enc_no_key2: - li 3,-16 - and 5,5,3 - - -.Lxts_enc: - lvx 4,0,10 - addi 10,10,16 - - lvsr 7,0,6 - lwz 9,240(6) - srwi 9,9,1 - subi 9,9,1 - li 3,16 - - vslb 10,9,9 - vor 10,10,9 - vspltisb 11,1 - vsldoi 10,10,11,15 - - cmpldi 5,96 - bge _aesp8_xts_encrypt6x - - andi. 7,5,15 - subic 0,5,32 - subi 7,7,16 - subfe 0,0,0 - and 0,0,7 - add 10,10,0 - - lvx 0,0,6 - lvx 1,3,6 - addi 3,3,16 - vperm 2,2,4,5 - vperm 0,1,0,7 - vxor 2,2,8 - vxor 2,2,0 - lvx 0,3,6 - addi 3,3,16 - mtctr 9 - b .Loop_xts_enc - -.align 5 -.Loop_xts_enc: - vperm 1,0,1,7 - .long 0x10420D08 - lvx 1,3,6 - addi 3,3,16 - vperm 0,1,0,7 - .long 0x10420508 - lvx 0,3,6 - addi 3,3,16 - bdnz .Loop_xts_enc - - vperm 1,0,1,7 - .long 0x10420D08 - lvx 1,3,6 - li 3,16 - vperm 0,1,0,7 - vxor 0,0,8 - .long 0x10620509 - - vperm 11,3,3,6 - - .long 0x7D602799 - - addi 4,4,16 - - subic. 
5,5,16 - beq .Lxts_enc_done - - vor 2,4,4 - lvx 4,0,10 - addi 10,10,16 - lvx 0,0,6 - lvx 1,3,6 - addi 3,3,16 - - subic 0,5,32 - subfe 0,0,0 - and 0,0,7 - add 10,10,0 - - vsrab 11,8,9 - vaddubm 8,8,8 - vsldoi 11,11,11,15 - vand 11,11,10 - vxor 8,8,11 - - vperm 2,2,4,5 - vperm 0,1,0,7 - vxor 2,2,8 - vxor 3,3,0 - vxor 2,2,0 - lvx 0,3,6 - addi 3,3,16 - - mtctr 9 - cmpldi 5,16 - bge .Loop_xts_enc - - vxor 3,3,8 - lvsr 5,0,5 - vxor 4,4,4 - vspltisb 11,-1 - vperm 4,4,11,5 - vsel 2,2,3,4 - - subi 11,4,17 - subi 4,4,16 - mtctr 5 - li 5,16 -.Loop_xts_enc_steal: - lbzu 0,1(11) - stb 0,16(11) - bdnz .Loop_xts_enc_steal - - mtctr 9 - b .Loop_xts_enc - -.Lxts_enc_done: - cmpldi 8,0 - beq .Lxts_enc_ret - - vsrab 11,8,9 - vaddubm 8,8,8 - vsldoi 11,11,11,15 - vand 11,11,10 - vxor 8,8,11 - - vperm 8,8,8,6 - .long 0x7D004799 - -.Lxts_enc_ret: - or 12,12,12 - li 3,0 - blr -.long 0 -.byte 0,12,0x04,0,0x80,6,6,0 -.long 0 -.size aes_hw_xts_encrypt,.-aes_hw_xts_encrypt - -.globl aes_hw_xts_decrypt -.type aes_hw_xts_decrypt,@function -.align 5 -aes_hw_xts_decrypt: -.localentry aes_hw_xts_decrypt,0 - - mr 10,3 - li 3,-1 - cmpldi 5,16 - .long 0x4dc00020 - - lis 0,0xfff8 - li 12,-1 - li 11,0 - or 0,0,0 - - andi. 0,5,15 - neg 0,0 - andi. 0,0,16 - sub 5,5,0 - - vspltisb 9,0x07 - lvsl 6,11,11 - vspltisb 11,0x0f - vxor 6,6,9 - - li 3,15 - lvx 8,0,8 - lvsl 5,0,8 - lvx 4,3,8 - vxor 5,5,11 - vperm 8,8,4,5 - - neg 11,10 - lvsr 5,0,11 - lvx 2,0,10 - addi 10,10,15 - vxor 5,5,11 - - cmpldi 7,0 - beq .Lxts_dec_no_key2 - - lvsr 7,0,7 - lwz 9,240(7) - srwi 9,9,1 - subi 9,9,1 - li 3,16 - - lvx 0,0,7 - lvx 1,3,7 - addi 3,3,16 - vperm 0,1,0,7 - vxor 8,8,0 - lvx 0,3,7 - addi 3,3,16 - mtctr 9 - -.Ltweak_xts_dec: - vperm 1,0,1,7 - .long 0x11080D08 - lvx 1,3,7 - addi 3,3,16 - vperm 0,1,0,7 - .long 0x11080508 - lvx 0,3,7 - addi 3,3,16 - bdnz .Ltweak_xts_dec - - vperm 1,0,1,7 - .long 0x11080D08 - lvx 1,3,7 - vperm 0,1,0,7 - .long 0x11080509 - - li 8,0 - b .Lxts_dec - -.Lxts_dec_no_key2: - neg 3,5 - andi. 
3,3,15 - add 5,5,3 - - -.Lxts_dec: - lvx 4,0,10 - addi 10,10,16 - - lvsr 7,0,6 - lwz 9,240(6) - srwi 9,9,1 - subi 9,9,1 - li 3,16 - - vslb 10,9,9 - vor 10,10,9 - vspltisb 11,1 - vsldoi 10,10,11,15 - - cmpldi 5,96 - bge _aesp8_xts_decrypt6x - - lvx 0,0,6 - lvx 1,3,6 - addi 3,3,16 - vperm 2,2,4,5 - vperm 0,1,0,7 - vxor 2,2,8 - vxor 2,2,0 - lvx 0,3,6 - addi 3,3,16 - mtctr 9 - - cmpldi 5,16 - blt .Ltail_xts_dec - - -.align 5 -.Loop_xts_dec: - vperm 1,0,1,7 - .long 0x10420D48 - lvx 1,3,6 - addi 3,3,16 - vperm 0,1,0,7 - .long 0x10420548 - lvx 0,3,6 - addi 3,3,16 - bdnz .Loop_xts_dec - - vperm 1,0,1,7 - .long 0x10420D48 - lvx 1,3,6 - li 3,16 - vperm 0,1,0,7 - vxor 0,0,8 - .long 0x10620549 - - vperm 11,3,3,6 - - .long 0x7D602799 - - addi 4,4,16 - - subic. 5,5,16 - beq .Lxts_dec_done - - vor 2,4,4 - lvx 4,0,10 - addi 10,10,16 - lvx 0,0,6 - lvx 1,3,6 - addi 3,3,16 - - vsrab 11,8,9 - vaddubm 8,8,8 - vsldoi 11,11,11,15 - vand 11,11,10 - vxor 8,8,11 - - vperm 2,2,4,5 - vperm 0,1,0,7 - vxor 2,2,8 - vxor 2,2,0 - lvx 0,3,6 - addi 3,3,16 - - mtctr 9 - cmpldi 5,16 - bge .Loop_xts_dec - -.Ltail_xts_dec: - vsrab 11,8,9 - vaddubm 12,8,8 - vsldoi 11,11,11,15 - vand 11,11,10 - vxor 12,12,11 - - subi 10,10,16 - add 10,10,5 - - vxor 2,2,8 - vxor 2,2,12 - -.Loop_xts_dec_short: - vperm 1,0,1,7 - .long 0x10420D48 - lvx 1,3,6 - addi 3,3,16 - vperm 0,1,0,7 - .long 0x10420548 - lvx 0,3,6 - addi 3,3,16 - bdnz .Loop_xts_dec_short - - vperm 1,0,1,7 - .long 0x10420D48 - lvx 1,3,6 - li 3,16 - vperm 0,1,0,7 - vxor 0,0,12 - .long 0x10620549 - - vperm 11,3,3,6 - - .long 0x7D602799 - - - vor 2,4,4 - lvx 4,0,10 - - lvx 0,0,6 - lvx 1,3,6 - addi 3,3,16 - vperm 2,2,4,5 - vperm 0,1,0,7 - - lvsr 5,0,5 - vxor 4,4,4 - vspltisb 11,-1 - vperm 4,4,11,5 - vsel 2,2,3,4 - - vxor 0,0,8 - vxor 2,2,0 - lvx 0,3,6 - addi 3,3,16 - - subi 11,4,1 - mtctr 5 - li 5,16 -.Loop_xts_dec_steal: - lbzu 0,1(11) - stb 0,16(11) - bdnz .Loop_xts_dec_steal - - mtctr 9 - b .Loop_xts_dec - -.Lxts_dec_done: - cmpldi 8,0 - beq .Lxts_dec_ret - 
- vsrab 11,8,9 - vaddubm 8,8,8 - vsldoi 11,11,11,15 - vand 11,11,10 - vxor 8,8,11 - - vperm 8,8,8,6 - .long 0x7D004799 - -.Lxts_dec_ret: - or 12,12,12 - li 3,0 - blr -.long 0 -.byte 0,12,0x04,0,0x80,6,6,0 -.long 0 -.size aes_hw_xts_decrypt,.-aes_hw_xts_decrypt -.align 5 -_aesp8_xts_encrypt6x: - stdu 1,-448(1) - mflr 11 - li 7,207 - li 3,223 - std 11,464(1) - stvx 20,7,1 - addi 7,7,32 - stvx 21,3,1 - addi 3,3,32 - stvx 22,7,1 - addi 7,7,32 - stvx 23,3,1 - addi 3,3,32 - stvx 24,7,1 - addi 7,7,32 - stvx 25,3,1 - addi 3,3,32 - stvx 26,7,1 - addi 7,7,32 - stvx 27,3,1 - addi 3,3,32 - stvx 28,7,1 - addi 7,7,32 - stvx 29,3,1 - addi 3,3,32 - stvx 30,7,1 - stvx 31,3,1 - li 0,-1 - stw 12,396(1) - li 3,0x10 - std 26,400(1) - li 26,0x20 - std 27,408(1) - li 27,0x30 - std 28,416(1) - li 28,0x40 - std 29,424(1) - li 29,0x50 - std 30,432(1) - li 30,0x60 - std 31,440(1) - li 31,0x70 - or 0,0,0 - - subi 9,9,3 - - lvx 23,0,6 - lvx 30,3,6 - addi 6,6,0x20 - lvx 31,0,6 - vperm 23,30,23,7 - addi 7,1,79 - mtctr 9 - -.Load_xts_enc_key: - vperm 24,31,30,7 - lvx 30,3,6 - addi 6,6,0x20 - stvx 24,0,7 - vperm 25,30,31,7 - lvx 31,0,6 - stvx 25,3,7 - addi 7,7,0x20 - bdnz .Load_xts_enc_key - - lvx 26,3,6 - vperm 24,31,30,7 - lvx 27,26,6 - stvx 24,0,7 - vperm 25,26,31,7 - lvx 28,27,6 - stvx 25,3,7 - addi 7,1,79 - vperm 26,27,26,7 - lvx 29,28,6 - vperm 27,28,27,7 - lvx 30,29,6 - vperm 28,29,28,7 - lvx 31,30,6 - vperm 29,30,29,7 - lvx 22,31,6 - vperm 30,31,30,7 - lvx 24,0,7 - vperm 31,22,31,7 - lvx 25,3,7 - - vperm 0,2,4,5 - subi 10,10,31 - vxor 17,8,23 - vsrab 11,8,9 - vaddubm 8,8,8 - vsldoi 11,11,11,15 - vand 11,11,10 - vxor 7,0,17 - vxor 8,8,11 - - .long 0x7C235699 - vxor 18,8,23 - vsrab 11,8,9 - vaddubm 8,8,8 - vsldoi 11,11,11,15 - vperm 1,1,1,6 - vand 11,11,10 - vxor 12,1,18 - vxor 8,8,11 - - .long 0x7C5A5699 - andi. 
31,5,15 - vxor 19,8,23 - vsrab 11,8,9 - vaddubm 8,8,8 - vsldoi 11,11,11,15 - vperm 2,2,2,6 - vand 11,11,10 - vxor 13,2,19 - vxor 8,8,11 - - .long 0x7C7B5699 - sub 5,5,31 - vxor 20,8,23 - vsrab 11,8,9 - vaddubm 8,8,8 - vsldoi 11,11,11,15 - vperm 3,3,3,6 - vand 11,11,10 - vxor 14,3,20 - vxor 8,8,11 - - .long 0x7C9C5699 - subi 5,5,0x60 - vxor 21,8,23 - vsrab 11,8,9 - vaddubm 8,8,8 - vsldoi 11,11,11,15 - vperm 4,4,4,6 - vand 11,11,10 - vxor 15,4,21 - vxor 8,8,11 - - .long 0x7CBD5699 - addi 10,10,0x60 - vxor 22,8,23 - vsrab 11,8,9 - vaddubm 8,8,8 - vsldoi 11,11,11,15 - vperm 5,5,5,6 - vand 11,11,10 - vxor 16,5,22 - vxor 8,8,11 - - vxor 31,31,23 - mtctr 9 - b .Loop_xts_enc6x - -.align 5 -.Loop_xts_enc6x: - .long 0x10E7C508 - .long 0x118CC508 - .long 0x11ADC508 - .long 0x11CEC508 - .long 0x11EFC508 - .long 0x1210C508 - lvx 24,26,7 - addi 7,7,0x20 - - .long 0x10E7CD08 - .long 0x118CCD08 - .long 0x11ADCD08 - .long 0x11CECD08 - .long 0x11EFCD08 - .long 0x1210CD08 - lvx 25,3,7 - bdnz .Loop_xts_enc6x - - subic 5,5,96 - vxor 0,17,31 - .long 0x10E7C508 - .long 0x118CC508 - vsrab 11,8,9 - vxor 17,8,23 - vaddubm 8,8,8 - .long 0x11ADC508 - .long 0x11CEC508 - vsldoi 11,11,11,15 - .long 0x11EFC508 - .long 0x1210C508 - - subfe. 
0,0,0 - vand 11,11,10 - .long 0x10E7CD08 - .long 0x118CCD08 - vxor 8,8,11 - .long 0x11ADCD08 - .long 0x11CECD08 - vxor 1,18,31 - vsrab 11,8,9 - vxor 18,8,23 - .long 0x11EFCD08 - .long 0x1210CD08 - - and 0,0,5 - vaddubm 8,8,8 - vsldoi 11,11,11,15 - .long 0x10E7D508 - .long 0x118CD508 - vand 11,11,10 - .long 0x11ADD508 - .long 0x11CED508 - vxor 8,8,11 - .long 0x11EFD508 - .long 0x1210D508 - - add 10,10,0 - - - - vxor 2,19,31 - vsrab 11,8,9 - vxor 19,8,23 - vaddubm 8,8,8 - .long 0x10E7DD08 - .long 0x118CDD08 - vsldoi 11,11,11,15 - .long 0x11ADDD08 - .long 0x11CEDD08 - vand 11,11,10 - .long 0x11EFDD08 - .long 0x1210DD08 - - addi 7,1,79 - vxor 8,8,11 - .long 0x10E7E508 - .long 0x118CE508 - vxor 3,20,31 - vsrab 11,8,9 - vxor 20,8,23 - .long 0x11ADE508 - .long 0x11CEE508 - vaddubm 8,8,8 - vsldoi 11,11,11,15 - .long 0x11EFE508 - .long 0x1210E508 - lvx 24,0,7 - vand 11,11,10 - - .long 0x10E7ED08 - .long 0x118CED08 - vxor 8,8,11 - .long 0x11ADED08 - .long 0x11CEED08 - vxor 4,21,31 - vsrab 11,8,9 - vxor 21,8,23 - .long 0x11EFED08 - .long 0x1210ED08 - lvx 25,3,7 - vaddubm 8,8,8 - vsldoi 11,11,11,15 - - .long 0x10E7F508 - .long 0x118CF508 - vand 11,11,10 - .long 0x11ADF508 - .long 0x11CEF508 - vxor 8,8,11 - .long 0x11EFF508 - .long 0x1210F508 - vxor 5,22,31 - vsrab 11,8,9 - vxor 22,8,23 - - .long 0x10E70509 - .long 0x7C005699 - vaddubm 8,8,8 - vsldoi 11,11,11,15 - .long 0x118C0D09 - .long 0x7C235699 - .long 0x11AD1509 - vperm 0,0,0,6 - .long 0x7C5A5699 - vand 11,11,10 - .long 0x11CE1D09 - vperm 1,1,1,6 - .long 0x7C7B5699 - .long 0x11EF2509 - vperm 2,2,2,6 - .long 0x7C9C5699 - vxor 8,8,11 - .long 0x11702D09 - - vperm 3,3,3,6 - .long 0x7CBD5699 - addi 10,10,0x60 - vperm 4,4,4,6 - vperm 5,5,5,6 - - vperm 7,7,7,6 - vperm 12,12,12,6 - .long 0x7CE02799 - vxor 7,0,17 - vperm 13,13,13,6 - .long 0x7D832799 - vxor 12,1,18 - vperm 14,14,14,6 - .long 0x7DBA2799 - vxor 13,2,19 - vperm 15,15,15,6 - .long 0x7DDB2799 - vxor 14,3,20 - vperm 16,11,11,6 - .long 0x7DFC2799 - vxor 15,4,21 - .long 
0x7E1D2799 - - vxor 16,5,22 - addi 4,4,0x60 - - mtctr 9 - beq .Loop_xts_enc6x - - addic. 5,5,0x60 - beq .Lxts_enc6x_zero - cmpwi 5,0x20 - blt .Lxts_enc6x_one - nop - beq .Lxts_enc6x_two - cmpwi 5,0x40 - blt .Lxts_enc6x_three - nop - beq .Lxts_enc6x_four - -.Lxts_enc6x_five: - vxor 7,1,17 - vxor 12,2,18 - vxor 13,3,19 - vxor 14,4,20 - vxor 15,5,21 - - bl _aesp8_xts_enc5x - - vperm 7,7,7,6 - vor 17,22,22 - vperm 12,12,12,6 - .long 0x7CE02799 - vperm 13,13,13,6 - .long 0x7D832799 - vperm 14,14,14,6 - .long 0x7DBA2799 - vxor 11,15,22 - vperm 15,15,15,6 - .long 0x7DDB2799 - .long 0x7DFC2799 - addi 4,4,0x50 - bne .Lxts_enc6x_steal - b .Lxts_enc6x_done - -.align 4 -.Lxts_enc6x_four: - vxor 7,2,17 - vxor 12,3,18 - vxor 13,4,19 - vxor 14,5,20 - vxor 15,15,15 - - bl _aesp8_xts_enc5x - - vperm 7,7,7,6 - vor 17,21,21 - vperm 12,12,12,6 - .long 0x7CE02799 - vperm 13,13,13,6 - .long 0x7D832799 - vxor 11,14,21 - vperm 14,14,14,6 - .long 0x7DBA2799 - .long 0x7DDB2799 - addi 4,4,0x40 - bne .Lxts_enc6x_steal - b .Lxts_enc6x_done - -.align 4 -.Lxts_enc6x_three: - vxor 7,3,17 - vxor 12,4,18 - vxor 13,5,19 - vxor 14,14,14 - vxor 15,15,15 - - bl _aesp8_xts_enc5x - - vperm 7,7,7,6 - vor 17,20,20 - vperm 12,12,12,6 - .long 0x7CE02799 - vxor 11,13,20 - vperm 13,13,13,6 - .long 0x7D832799 - .long 0x7DBA2799 - addi 4,4,0x30 - bne .Lxts_enc6x_steal - b .Lxts_enc6x_done - -.align 4 -.Lxts_enc6x_two: - vxor 7,4,17 - vxor 12,5,18 - vxor 13,13,13 - vxor 14,14,14 - vxor 15,15,15 - - bl _aesp8_xts_enc5x - - vperm 7,7,7,6 - vor 17,19,19 - vxor 11,12,19 - vperm 12,12,12,6 - .long 0x7CE02799 - .long 0x7D832799 - addi 4,4,0x20 - bne .Lxts_enc6x_steal - b .Lxts_enc6x_done - -.align 4 -.Lxts_enc6x_one: - vxor 7,5,17 - nop -.Loop_xts_enc1x: - .long 0x10E7C508 - lvx 24,26,7 - addi 7,7,0x20 - - .long 0x10E7CD08 - lvx 25,3,7 - bdnz .Loop_xts_enc1x - - add 10,10,31 - cmpwi 31,0 - .long 0x10E7C508 - - subi 10,10,16 - .long 0x10E7CD08 - - lvsr 5,0,31 - .long 0x10E7D508 - - .long 0x7C005699 - .long 0x10E7DD08 - 
- addi 7,1,79 - .long 0x10E7E508 - lvx 24,0,7 - - .long 0x10E7ED08 - lvx 25,3,7 - vxor 17,17,31 - - vperm 0,0,0,6 - .long 0x10E7F508 - - vperm 0,0,0,5 - .long 0x10E78D09 - - vor 17,18,18 - vxor 11,7,18 - vperm 7,7,7,6 - .long 0x7CE02799 - addi 4,4,0x10 - bne .Lxts_enc6x_steal - b .Lxts_enc6x_done - -.align 4 -.Lxts_enc6x_zero: - cmpwi 31,0 - beq .Lxts_enc6x_done - - add 10,10,31 - subi 10,10,16 - .long 0x7C005699 - lvsr 5,0,31 - vperm 0,0,0,6 - vperm 0,0,0,5 - vxor 11,11,17 -.Lxts_enc6x_steal: - vxor 0,0,17 - vxor 7,7,7 - vspltisb 12,-1 - vperm 7,7,12,5 - vsel 7,0,11,7 - - subi 30,4,17 - subi 4,4,16 - mtctr 31 -.Loop_xts_enc6x_steal: - lbzu 0,1(30) - stb 0,16(30) - bdnz .Loop_xts_enc6x_steal - - li 31,0 - mtctr 9 - b .Loop_xts_enc1x - -.align 4 -.Lxts_enc6x_done: - cmpldi 8,0 - beq .Lxts_enc6x_ret - - vxor 8,17,23 - vperm 8,8,8,6 - .long 0x7D004799 - -.Lxts_enc6x_ret: - mtlr 11 - li 10,79 - li 11,95 - stvx 9,10,1 - addi 10,10,32 - stvx 9,11,1 - addi 11,11,32 - stvx 9,10,1 - addi 10,10,32 - stvx 9,11,1 - addi 11,11,32 - stvx 9,10,1 - addi 10,10,32 - stvx 9,11,1 - addi 11,11,32 - stvx 9,10,1 - addi 10,10,32 - stvx 9,11,1 - addi 11,11,32 - - or 12,12,12 - lvx 20,10,1 - addi 10,10,32 - lvx 21,11,1 - addi 11,11,32 - lvx 22,10,1 - addi 10,10,32 - lvx 23,11,1 - addi 11,11,32 - lvx 24,10,1 - addi 10,10,32 - lvx 25,11,1 - addi 11,11,32 - lvx 26,10,1 - addi 10,10,32 - lvx 27,11,1 - addi 11,11,32 - lvx 28,10,1 - addi 10,10,32 - lvx 29,11,1 - addi 11,11,32 - lvx 30,10,1 - lvx 31,11,1 - ld 26,400(1) - ld 27,408(1) - ld 28,416(1) - ld 29,424(1) - ld 30,432(1) - ld 31,440(1) - addi 1,1,448 - blr -.long 0 -.byte 0,12,0x04,1,0x80,6,6,0 -.long 0 - -.align 5 -_aesp8_xts_enc5x: - .long 0x10E7C508 - .long 0x118CC508 - .long 0x11ADC508 - .long 0x11CEC508 - .long 0x11EFC508 - lvx 24,26,7 - addi 7,7,0x20 - - .long 0x10E7CD08 - .long 0x118CCD08 - .long 0x11ADCD08 - .long 0x11CECD08 - .long 0x11EFCD08 - lvx 25,3,7 - bdnz _aesp8_xts_enc5x - - add 10,10,31 - cmpwi 31,0 - .long 0x10E7C508 - 
.long 0x118CC508 - .long 0x11ADC508 - .long 0x11CEC508 - .long 0x11EFC508 - - subi 10,10,16 - .long 0x10E7CD08 - .long 0x118CCD08 - .long 0x11ADCD08 - .long 0x11CECD08 - .long 0x11EFCD08 - vxor 17,17,31 - - .long 0x10E7D508 - lvsr 5,0,31 - .long 0x118CD508 - .long 0x11ADD508 - .long 0x11CED508 - .long 0x11EFD508 - vxor 1,18,31 - - .long 0x10E7DD08 - .long 0x7C005699 - .long 0x118CDD08 - .long 0x11ADDD08 - .long 0x11CEDD08 - .long 0x11EFDD08 - vxor 2,19,31 - - addi 7,1,79 - .long 0x10E7E508 - .long 0x118CE508 - .long 0x11ADE508 - .long 0x11CEE508 - .long 0x11EFE508 - lvx 24,0,7 - vxor 3,20,31 - - .long 0x10E7ED08 - vperm 0,0,0,6 - .long 0x118CED08 - .long 0x11ADED08 - .long 0x11CEED08 - .long 0x11EFED08 - lvx 25,3,7 - vxor 4,21,31 - - .long 0x10E7F508 - vperm 0,0,0,5 - .long 0x118CF508 - .long 0x11ADF508 - .long 0x11CEF508 - .long 0x11EFF508 - - .long 0x10E78D09 - .long 0x118C0D09 - .long 0x11AD1509 - .long 0x11CE1D09 - .long 0x11EF2509 - blr -.long 0 -.byte 0,12,0x14,0,0,0,0,0 - -.align 5 -_aesp8_xts_decrypt6x: - stdu 1,-448(1) - mflr 11 - li 7,207 - li 3,223 - std 11,464(1) - stvx 20,7,1 - addi 7,7,32 - stvx 21,3,1 - addi 3,3,32 - stvx 22,7,1 - addi 7,7,32 - stvx 23,3,1 - addi 3,3,32 - stvx 24,7,1 - addi 7,7,32 - stvx 25,3,1 - addi 3,3,32 - stvx 26,7,1 - addi 7,7,32 - stvx 27,3,1 - addi 3,3,32 - stvx 28,7,1 - addi 7,7,32 - stvx 29,3,1 - addi 3,3,32 - stvx 30,7,1 - stvx 31,3,1 - li 0,-1 - stw 12,396(1) - li 3,0x10 - std 26,400(1) - li 26,0x20 - std 27,408(1) - li 27,0x30 - std 28,416(1) - li 28,0x40 - std 29,424(1) - li 29,0x50 - std 30,432(1) - li 30,0x60 - std 31,440(1) - li 31,0x70 - or 0,0,0 - - subi 9,9,3 - - lvx 23,0,6 - lvx 30,3,6 - addi 6,6,0x20 - lvx 31,0,6 - vperm 23,30,23,7 - addi 7,1,79 - mtctr 9 - -.Load_xts_dec_key: - vperm 24,31,30,7 - lvx 30,3,6 - addi 6,6,0x20 - stvx 24,0,7 - vperm 25,30,31,7 - lvx 31,0,6 - stvx 25,3,7 - addi 7,7,0x20 - bdnz .Load_xts_dec_key - - lvx 26,3,6 - vperm 24,31,30,7 - lvx 27,26,6 - stvx 24,0,7 - vperm 25,26,31,7 - lvx 
28,27,6 - stvx 25,3,7 - addi 7,1,79 - vperm 26,27,26,7 - lvx 29,28,6 - vperm 27,28,27,7 - lvx 30,29,6 - vperm 28,29,28,7 - lvx 31,30,6 - vperm 29,30,29,7 - lvx 22,31,6 - vperm 30,31,30,7 - lvx 24,0,7 - vperm 31,22,31,7 - lvx 25,3,7 - - vperm 0,2,4,5 - subi 10,10,31 - vxor 17,8,23 - vsrab 11,8,9 - vaddubm 8,8,8 - vsldoi 11,11,11,15 - vand 11,11,10 - vxor 7,0,17 - vxor 8,8,11 - - .long 0x7C235699 - vxor 18,8,23 - vsrab 11,8,9 - vaddubm 8,8,8 - vsldoi 11,11,11,15 - vperm 1,1,1,6 - vand 11,11,10 - vxor 12,1,18 - vxor 8,8,11 - - .long 0x7C5A5699 - andi. 31,5,15 - vxor 19,8,23 - vsrab 11,8,9 - vaddubm 8,8,8 - vsldoi 11,11,11,15 - vperm 2,2,2,6 - vand 11,11,10 - vxor 13,2,19 - vxor 8,8,11 - - .long 0x7C7B5699 - sub 5,5,31 - vxor 20,8,23 - vsrab 11,8,9 - vaddubm 8,8,8 - vsldoi 11,11,11,15 - vperm 3,3,3,6 - vand 11,11,10 - vxor 14,3,20 - vxor 8,8,11 - - .long 0x7C9C5699 - subi 5,5,0x60 - vxor 21,8,23 - vsrab 11,8,9 - vaddubm 8,8,8 - vsldoi 11,11,11,15 - vperm 4,4,4,6 - vand 11,11,10 - vxor 15,4,21 - vxor 8,8,11 - - .long 0x7CBD5699 - addi 10,10,0x60 - vxor 22,8,23 - vsrab 11,8,9 - vaddubm 8,8,8 - vsldoi 11,11,11,15 - vperm 5,5,5,6 - vand 11,11,10 - vxor 16,5,22 - vxor 8,8,11 - - vxor 31,31,23 - mtctr 9 - b .Loop_xts_dec6x - -.align 5 -.Loop_xts_dec6x: - .long 0x10E7C548 - .long 0x118CC548 - .long 0x11ADC548 - .long 0x11CEC548 - .long 0x11EFC548 - .long 0x1210C548 - lvx 24,26,7 - addi 7,7,0x20 - - .long 0x10E7CD48 - .long 0x118CCD48 - .long 0x11ADCD48 - .long 0x11CECD48 - .long 0x11EFCD48 - .long 0x1210CD48 - lvx 25,3,7 - bdnz .Loop_xts_dec6x - - subic 5,5,96 - vxor 0,17,31 - .long 0x10E7C548 - .long 0x118CC548 - vsrab 11,8,9 - vxor 17,8,23 - vaddubm 8,8,8 - .long 0x11ADC548 - .long 0x11CEC548 - vsldoi 11,11,11,15 - .long 0x11EFC548 - .long 0x1210C548 - - subfe. 
0,0,0 - vand 11,11,10 - .long 0x10E7CD48 - .long 0x118CCD48 - vxor 8,8,11 - .long 0x11ADCD48 - .long 0x11CECD48 - vxor 1,18,31 - vsrab 11,8,9 - vxor 18,8,23 - .long 0x11EFCD48 - .long 0x1210CD48 - - and 0,0,5 - vaddubm 8,8,8 - vsldoi 11,11,11,15 - .long 0x10E7D548 - .long 0x118CD548 - vand 11,11,10 - .long 0x11ADD548 - .long 0x11CED548 - vxor 8,8,11 - .long 0x11EFD548 - .long 0x1210D548 - - add 10,10,0 - - - - vxor 2,19,31 - vsrab 11,8,9 - vxor 19,8,23 - vaddubm 8,8,8 - .long 0x10E7DD48 - .long 0x118CDD48 - vsldoi 11,11,11,15 - .long 0x11ADDD48 - .long 0x11CEDD48 - vand 11,11,10 - .long 0x11EFDD48 - .long 0x1210DD48 - - addi 7,1,79 - vxor 8,8,11 - .long 0x10E7E548 - .long 0x118CE548 - vxor 3,20,31 - vsrab 11,8,9 - vxor 20,8,23 - .long 0x11ADE548 - .long 0x11CEE548 - vaddubm 8,8,8 - vsldoi 11,11,11,15 - .long 0x11EFE548 - .long 0x1210E548 - lvx 24,0,7 - vand 11,11,10 - - .long 0x10E7ED48 - .long 0x118CED48 - vxor 8,8,11 - .long 0x11ADED48 - .long 0x11CEED48 - vxor 4,21,31 - vsrab 11,8,9 - vxor 21,8,23 - .long 0x11EFED48 - .long 0x1210ED48 - lvx 25,3,7 - vaddubm 8,8,8 - vsldoi 11,11,11,15 - - .long 0x10E7F548 - .long 0x118CF548 - vand 11,11,10 - .long 0x11ADF548 - .long 0x11CEF548 - vxor 8,8,11 - .long 0x11EFF548 - .long 0x1210F548 - vxor 5,22,31 - vsrab 11,8,9 - vxor 22,8,23 - - .long 0x10E70549 - .long 0x7C005699 - vaddubm 8,8,8 - vsldoi 11,11,11,15 - .long 0x118C0D49 - .long 0x7C235699 - .long 0x11AD1549 - vperm 0,0,0,6 - .long 0x7C5A5699 - vand 11,11,10 - .long 0x11CE1D49 - vperm 1,1,1,6 - .long 0x7C7B5699 - .long 0x11EF2549 - vperm 2,2,2,6 - .long 0x7C9C5699 - vxor 8,8,11 - .long 0x12102D49 - vperm 3,3,3,6 - .long 0x7CBD5699 - addi 10,10,0x60 - vperm 4,4,4,6 - vperm 5,5,5,6 - - vperm 7,7,7,6 - vperm 12,12,12,6 - .long 0x7CE02799 - vxor 7,0,17 - vperm 13,13,13,6 - .long 0x7D832799 - vxor 12,1,18 - vperm 14,14,14,6 - .long 0x7DBA2799 - vxor 13,2,19 - vperm 15,15,15,6 - .long 0x7DDB2799 - vxor 14,3,20 - vperm 16,16,16,6 - .long 0x7DFC2799 - vxor 15,4,21 - .long 
0x7E1D2799 - vxor 16,5,22 - addi 4,4,0x60 - - mtctr 9 - beq .Loop_xts_dec6x - - addic. 5,5,0x60 - beq .Lxts_dec6x_zero - cmpwi 5,0x20 - blt .Lxts_dec6x_one - nop - beq .Lxts_dec6x_two - cmpwi 5,0x40 - blt .Lxts_dec6x_three - nop - beq .Lxts_dec6x_four - -.Lxts_dec6x_five: - vxor 7,1,17 - vxor 12,2,18 - vxor 13,3,19 - vxor 14,4,20 - vxor 15,5,21 - - bl _aesp8_xts_dec5x - - vperm 7,7,7,6 - vor 17,22,22 - vxor 18,8,23 - vperm 12,12,12,6 - .long 0x7CE02799 - vxor 7,0,18 - vperm 13,13,13,6 - .long 0x7D832799 - vperm 14,14,14,6 - .long 0x7DBA2799 - vperm 15,15,15,6 - .long 0x7DDB2799 - .long 0x7DFC2799 - addi 4,4,0x50 - bne .Lxts_dec6x_steal - b .Lxts_dec6x_done - -.align 4 -.Lxts_dec6x_four: - vxor 7,2,17 - vxor 12,3,18 - vxor 13,4,19 - vxor 14,5,20 - vxor 15,15,15 - - bl _aesp8_xts_dec5x - - vperm 7,7,7,6 - vor 17,21,21 - vor 18,22,22 - vperm 12,12,12,6 - .long 0x7CE02799 - vxor 7,0,22 - vperm 13,13,13,6 - .long 0x7D832799 - vperm 14,14,14,6 - .long 0x7DBA2799 - .long 0x7DDB2799 - addi 4,4,0x40 - bne .Lxts_dec6x_steal - b .Lxts_dec6x_done - -.align 4 -.Lxts_dec6x_three: - vxor 7,3,17 - vxor 12,4,18 - vxor 13,5,19 - vxor 14,14,14 - vxor 15,15,15 - - bl _aesp8_xts_dec5x - - vperm 7,7,7,6 - vor 17,20,20 - vor 18,21,21 - vperm 12,12,12,6 - .long 0x7CE02799 - vxor 7,0,21 - vperm 13,13,13,6 - .long 0x7D832799 - .long 0x7DBA2799 - addi 4,4,0x30 - bne .Lxts_dec6x_steal - b .Lxts_dec6x_done - -.align 4 -.Lxts_dec6x_two: - vxor 7,4,17 - vxor 12,5,18 - vxor 13,13,13 - vxor 14,14,14 - vxor 15,15,15 - - bl _aesp8_xts_dec5x - - vperm 7,7,7,6 - vor 17,19,19 - vor 18,20,20 - vperm 12,12,12,6 - .long 0x7CE02799 - vxor 7,0,20 - .long 0x7D832799 - addi 4,4,0x20 - bne .Lxts_dec6x_steal - b .Lxts_dec6x_done - -.align 4 -.Lxts_dec6x_one: - vxor 7,5,17 - nop -.Loop_xts_dec1x: - .long 0x10E7C548 - lvx 24,26,7 - addi 7,7,0x20 - - .long 0x10E7CD48 - lvx 25,3,7 - bdnz .Loop_xts_dec1x - - subi 0,31,1 - .long 0x10E7C548 - - andi. 
0,0,16 - cmpwi 31,0 - .long 0x10E7CD48 - - sub 10,10,0 - .long 0x10E7D548 - - .long 0x7C005699 - .long 0x10E7DD48 - - addi 7,1,79 - .long 0x10E7E548 - lvx 24,0,7 - - .long 0x10E7ED48 - lvx 25,3,7 - vxor 17,17,31 - - vperm 0,0,0,6 - .long 0x10E7F548 - - mtctr 9 - .long 0x10E78D49 - - vor 17,18,18 - vor 18,19,19 - vperm 7,7,7,6 - .long 0x7CE02799 - addi 4,4,0x10 - vxor 7,0,19 - bne .Lxts_dec6x_steal - b .Lxts_dec6x_done - -.align 4 -.Lxts_dec6x_zero: - cmpwi 31,0 - beq .Lxts_dec6x_done - - .long 0x7C005699 - vperm 0,0,0,6 - vxor 7,0,18 -.Lxts_dec6x_steal: - .long 0x10E7C548 - lvx 24,26,7 - addi 7,7,0x20 - - .long 0x10E7CD48 - lvx 25,3,7 - bdnz .Lxts_dec6x_steal - - add 10,10,31 - .long 0x10E7C548 - - cmpwi 31,0 - .long 0x10E7CD48 - - .long 0x7C005699 - .long 0x10E7D548 - - lvsr 5,0,31 - .long 0x10E7DD48 - - addi 7,1,79 - .long 0x10E7E548 - lvx 24,0,7 - - .long 0x10E7ED48 - lvx 25,3,7 - vxor 18,18,31 - - vperm 0,0,0,6 - .long 0x10E7F548 - - vperm 0,0,0,5 - .long 0x11679549 - - vperm 7,11,11,6 - .long 0x7CE02799 - - - vxor 7,7,7 - vspltisb 12,-1 - vperm 7,7,12,5 - vsel 7,0,11,7 - vxor 7,7,17 - - subi 30,4,1 - mtctr 31 -.Loop_xts_dec6x_steal: - lbzu 0,1(30) - stb 0,16(30) - bdnz .Loop_xts_dec6x_steal - - li 31,0 - mtctr 9 - b .Loop_xts_dec1x - -.align 4 -.Lxts_dec6x_done: - cmpldi 8,0 - beq .Lxts_dec6x_ret - - vxor 8,17,23 - vperm 8,8,8,6 - .long 0x7D004799 - -.Lxts_dec6x_ret: - mtlr 11 - li 10,79 - li 11,95 - stvx 9,10,1 - addi 10,10,32 - stvx 9,11,1 - addi 11,11,32 - stvx 9,10,1 - addi 10,10,32 - stvx 9,11,1 - addi 11,11,32 - stvx 9,10,1 - addi 10,10,32 - stvx 9,11,1 - addi 11,11,32 - stvx 9,10,1 - addi 10,10,32 - stvx 9,11,1 - addi 11,11,32 - - or 12,12,12 - lvx 20,10,1 - addi 10,10,32 - lvx 21,11,1 - addi 11,11,32 - lvx 22,10,1 - addi 10,10,32 - lvx 23,11,1 - addi 11,11,32 - lvx 24,10,1 - addi 10,10,32 - lvx 25,11,1 - addi 11,11,32 - lvx 26,10,1 - addi 10,10,32 - lvx 27,11,1 - addi 11,11,32 - lvx 28,10,1 - addi 10,10,32 - lvx 29,11,1 - addi 11,11,32 - lvx 30,10,1 - 
lvx 31,11,1 - ld 26,400(1) - ld 27,408(1) - ld 28,416(1) - ld 29,424(1) - ld 30,432(1) - ld 31,440(1) - addi 1,1,448 - blr -.long 0 -.byte 0,12,0x04,1,0x80,6,6,0 -.long 0 - -.align 5 -_aesp8_xts_dec5x: - .long 0x10E7C548 - .long 0x118CC548 - .long 0x11ADC548 - .long 0x11CEC548 - .long 0x11EFC548 - lvx 24,26,7 - addi 7,7,0x20 - - .long 0x10E7CD48 - .long 0x118CCD48 - .long 0x11ADCD48 - .long 0x11CECD48 - .long 0x11EFCD48 - lvx 25,3,7 - bdnz _aesp8_xts_dec5x - - subi 0,31,1 - .long 0x10E7C548 - .long 0x118CC548 - .long 0x11ADC548 - .long 0x11CEC548 - .long 0x11EFC548 - - andi. 0,0,16 - cmpwi 31,0 - .long 0x10E7CD48 - .long 0x118CCD48 - .long 0x11ADCD48 - .long 0x11CECD48 - .long 0x11EFCD48 - vxor 17,17,31 - - sub 10,10,0 - .long 0x10E7D548 - .long 0x118CD548 - .long 0x11ADD548 - .long 0x11CED548 - .long 0x11EFD548 - vxor 1,18,31 - - .long 0x10E7DD48 - .long 0x7C005699 - .long 0x118CDD48 - .long 0x11ADDD48 - .long 0x11CEDD48 - .long 0x11EFDD48 - vxor 2,19,31 - - addi 7,1,79 - .long 0x10E7E548 - .long 0x118CE548 - .long 0x11ADE548 - .long 0x11CEE548 - .long 0x11EFE548 - lvx 24,0,7 - vxor 3,20,31 - - .long 0x10E7ED48 - vperm 0,0,0,6 - .long 0x118CED48 - .long 0x11ADED48 - .long 0x11CEED48 - .long 0x11EFED48 - lvx 25,3,7 - vxor 4,21,31 - - .long 0x10E7F548 - .long 0x118CF548 - .long 0x11ADF548 - .long 0x11CEF548 - .long 0x11EFF548 - - .long 0x10E78D49 - .long 0x118C0D49 - .long 0x11AD1549 - .long 0x11CE1D49 - .long 0x11EF2549 - mtctr 9 - blr -.long 0 -.byte 0,12,0x14,0,0,0,0,0 -#endif // !OPENSSL_NO_ASM && __powerpc64__ -.section .note.GNU-stack,"",@progbits diff --git a/third_party/boringssl/linux-ppc64le/crypto/fipsmodule/ghashp8-ppc.S b/third_party/boringssl/linux-ppc64le/crypto/fipsmodule/ghashp8-ppc.S deleted file mode 100644 index 76b4e739..00000000 --- a/third_party/boringssl/linux-ppc64le/crypto/fipsmodule/ghashp8-ppc.S +++ /dev/null @@ -1,587 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. 
Do not edit by hand. - -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if !defined(OPENSSL_NO_ASM) && defined(__powerpc64__) -.machine "any" - -.abiversion 2 -.text - -.globl gcm_init_p8 -.type gcm_init_p8,@function -.align 5 -gcm_init_p8: -.localentry gcm_init_p8,0 - - li 0,-4096 - li 8,0x10 - li 12,-1 - li 9,0x20 - or 0,0,0 - li 10,0x30 - .long 0x7D202699 - - vspltisb 8,-16 - vspltisb 5,1 - vaddubm 8,8,8 - vxor 4,4,4 - vor 8,8,5 - vsldoi 8,8,4,15 - vsldoi 6,4,5,1 - vaddubm 8,8,8 - vspltisb 7,7 - vor 8,8,6 - vspltb 6,9,0 - vsl 9,9,5 - vsrab 6,6,7 - vand 6,6,8 - vxor 3,9,6 - - vsldoi 9,3,3,8 - vsldoi 8,4,8,8 - vsldoi 11,4,9,8 - vsldoi 10,9,4,8 - - .long 0x7D001F99 - .long 0x7D681F99 - li 8,0x40 - .long 0x7D291F99 - li 9,0x50 - .long 0x7D4A1F99 - li 10,0x60 - - .long 0x10035CC8 - .long 0x10234CC8 - .long 0x104354C8 - - .long 0x10E044C8 - - vsldoi 5,1,4,8 - vsldoi 6,4,1,8 - vxor 0,0,5 - vxor 2,2,6 - - vsldoi 0,0,0,8 - vxor 0,0,7 - - vsldoi 6,0,0,8 - .long 0x100044C8 - vxor 6,6,2 - vxor 16,0,6 - - vsldoi 17,16,16,8 - vsldoi 19,4,17,8 - vsldoi 18,17,4,8 - - .long 0x7E681F99 - li 8,0x70 - .long 0x7E291F99 - li 9,0x80 - .long 0x7E4A1F99 - li 10,0x90 - .long 0x10039CC8 - .long 0x11B09CC8 - .long 0x10238CC8 - .long 0x11D08CC8 - .long 0x104394C8 - .long 0x11F094C8 - - .long 0x10E044C8 - .long 0x114D44C8 - - vsldoi 5,1,4,8 - vsldoi 6,4,1,8 - vsldoi 11,14,4,8 - vsldoi 9,4,14,8 - vxor 0,0,5 - vxor 2,2,6 - vxor 13,13,11 - vxor 15,15,9 - - vsldoi 0,0,0,8 - vsldoi 13,13,13,8 - vxor 0,0,7 - vxor 13,13,10 - - vsldoi 6,0,0,8 - vsldoi 9,13,13,8 - .long 0x100044C8 - .long 0x11AD44C8 - vxor 6,6,2 - vxor 9,9,15 - vxor 0,0,6 - vxor 13,13,9 - - vsldoi 9,0,0,8 - vsldoi 17,13,13,8 - vsldoi 11,4,9,8 - vsldoi 10,9,4,8 - vsldoi 19,4,17,8 - vsldoi 18,17,4,8 - - .long 0x7D681F99 - li 8,0xa0 - .long 0x7D291F99 - li 9,0xb0 - .long 0x7D4A1F99 - li 10,0xc0 - .long 0x7E681F99 - .long 0x7E291F99 - .long 0x7E4A1F99 
- - or 12,12,12 - blr -.long 0 -.byte 0,12,0x14,0,0,0,2,0 -.long 0 -.size gcm_init_p8,.-gcm_init_p8 -.globl gcm_gmult_p8 -.type gcm_gmult_p8,@function -.align 5 -gcm_gmult_p8: -.localentry gcm_gmult_p8,0 - - lis 0,0xfff8 - li 8,0x10 - li 12,-1 - li 9,0x20 - or 0,0,0 - li 10,0x30 - .long 0x7C601E99 - - .long 0x7D682699 - lvsl 12,0,0 - .long 0x7D292699 - vspltisb 5,0x07 - .long 0x7D4A2699 - vxor 12,12,5 - .long 0x7D002699 - vperm 3,3,3,12 - vxor 4,4,4 - - .long 0x10035CC8 - .long 0x10234CC8 - .long 0x104354C8 - - .long 0x10E044C8 - - vsldoi 5,1,4,8 - vsldoi 6,4,1,8 - vxor 0,0,5 - vxor 2,2,6 - - vsldoi 0,0,0,8 - vxor 0,0,7 - - vsldoi 6,0,0,8 - .long 0x100044C8 - vxor 6,6,2 - vxor 0,0,6 - - vperm 0,0,0,12 - .long 0x7C001F99 - - or 12,12,12 - blr -.long 0 -.byte 0,12,0x14,0,0,0,2,0 -.long 0 -.size gcm_gmult_p8,.-gcm_gmult_p8 - -.globl gcm_ghash_p8 -.type gcm_ghash_p8,@function -.align 5 -gcm_ghash_p8: -.localentry gcm_ghash_p8,0 - - li 0,-4096 - li 8,0x10 - li 12,-1 - li 9,0x20 - or 0,0,0 - li 10,0x30 - .long 0x7C001E99 - - .long 0x7D682699 - li 8,0x40 - lvsl 12,0,0 - .long 0x7D292699 - li 9,0x50 - vspltisb 5,0x07 - .long 0x7D4A2699 - li 10,0x60 - vxor 12,12,5 - .long 0x7D002699 - vperm 0,0,0,12 - vxor 4,4,4 - - cmpldi 6,64 - bge .Lgcm_ghash_p8_4x - - .long 0x7C602E99 - addi 5,5,16 - subic. 
6,6,16 - vperm 3,3,3,12 - vxor 3,3,0 - beq .Lshort - - .long 0x7E682699 - li 8,16 - .long 0x7E292699 - add 9,5,6 - .long 0x7E4A2699 - - -.align 5 -.Loop_2x: - .long 0x7E002E99 - vperm 16,16,16,12 - - subic 6,6,32 - .long 0x10039CC8 - .long 0x11B05CC8 - subfe 0,0,0 - .long 0x10238CC8 - .long 0x11D04CC8 - and 0,0,6 - .long 0x104394C8 - .long 0x11F054C8 - add 5,5,0 - - vxor 0,0,13 - vxor 1,1,14 - - .long 0x10E044C8 - - vsldoi 5,1,4,8 - vsldoi 6,4,1,8 - vxor 2,2,15 - vxor 0,0,5 - vxor 2,2,6 - - vsldoi 0,0,0,8 - vxor 0,0,7 - .long 0x7C682E99 - addi 5,5,32 - - vsldoi 6,0,0,8 - .long 0x100044C8 - vperm 3,3,3,12 - vxor 6,6,2 - vxor 3,3,6 - vxor 3,3,0 - cmpld 9,5 - bgt .Loop_2x - - cmplwi 6,0 - bne .Leven - -.Lshort: - .long 0x10035CC8 - .long 0x10234CC8 - .long 0x104354C8 - - .long 0x10E044C8 - - vsldoi 5,1,4,8 - vsldoi 6,4,1,8 - vxor 0,0,5 - vxor 2,2,6 - - vsldoi 0,0,0,8 - vxor 0,0,7 - - vsldoi 6,0,0,8 - .long 0x100044C8 - vxor 6,6,2 - -.Leven: - vxor 0,0,6 - vperm 0,0,0,12 - .long 0x7C001F99 - - or 12,12,12 - blr -.long 0 -.byte 0,12,0x14,0,0,0,4,0 -.long 0 -.align 5 -.gcm_ghash_p8_4x: -.Lgcm_ghash_p8_4x: - stdu 1,-256(1) - li 10,63 - li 11,79 - stvx 20,10,1 - addi 10,10,32 - stvx 21,11,1 - addi 11,11,32 - stvx 22,10,1 - addi 10,10,32 - stvx 23,11,1 - addi 11,11,32 - stvx 24,10,1 - addi 10,10,32 - stvx 25,11,1 - addi 11,11,32 - stvx 26,10,1 - addi 10,10,32 - stvx 27,11,1 - addi 11,11,32 - stvx 28,10,1 - addi 10,10,32 - stvx 29,11,1 - addi 11,11,32 - stvx 30,10,1 - li 10,0x60 - stvx 31,11,1 - li 0,-1 - stw 12,252(1) - or 0,0,0 - - lvsl 5,0,8 - - li 8,0x70 - .long 0x7E292699 - li 9,0x80 - vspltisb 6,8 - - li 10,0x90 - .long 0x7EE82699 - li 8,0xa0 - .long 0x7F092699 - li 9,0xb0 - .long 0x7F2A2699 - li 10,0xc0 - .long 0x7FA82699 - li 8,0x10 - .long 0x7FC92699 - li 9,0x20 - .long 0x7FEA2699 - li 10,0x30 - - vsldoi 7,4,6,8 - vaddubm 18,5,7 - vaddubm 19,6,18 - - srdi 6,6,4 - - .long 0x7C602E99 - .long 0x7E082E99 - subic. 
6,6,8 - .long 0x7EC92E99 - .long 0x7F8A2E99 - addi 5,5,0x40 - vperm 3,3,3,12 - vperm 16,16,16,12 - vperm 22,22,22,12 - vperm 28,28,28,12 - - vxor 2,3,0 - - .long 0x11B0BCC8 - .long 0x11D0C4C8 - .long 0x11F0CCC8 - - vperm 11,17,9,18 - vperm 5,22,28,19 - vperm 10,17,9,19 - vperm 6,22,28,18 - .long 0x12B68CC8 - .long 0x12855CC8 - .long 0x137C4CC8 - .long 0x134654C8 - - vxor 21,21,14 - vxor 20,20,13 - vxor 27,27,21 - vxor 26,26,15 - - blt .Ltail_4x - -.Loop_4x: - .long 0x7C602E99 - .long 0x7E082E99 - subic. 6,6,4 - .long 0x7EC92E99 - .long 0x7F8A2E99 - addi 5,5,0x40 - vperm 16,16,16,12 - vperm 22,22,22,12 - vperm 28,28,28,12 - vperm 3,3,3,12 - - .long 0x1002ECC8 - .long 0x1022F4C8 - .long 0x1042FCC8 - .long 0x11B0BCC8 - .long 0x11D0C4C8 - .long 0x11F0CCC8 - - vxor 0,0,20 - vxor 1,1,27 - vxor 2,2,26 - vperm 5,22,28,19 - vperm 6,22,28,18 - - .long 0x10E044C8 - .long 0x12855CC8 - .long 0x134654C8 - - vsldoi 5,1,4,8 - vsldoi 6,4,1,8 - vxor 0,0,5 - vxor 2,2,6 - - vsldoi 0,0,0,8 - vxor 0,0,7 - - vsldoi 6,0,0,8 - .long 0x12B68CC8 - .long 0x137C4CC8 - .long 0x100044C8 - - vxor 20,20,13 - vxor 26,26,15 - vxor 2,2,3 - vxor 21,21,14 - vxor 2,2,6 - vxor 27,27,21 - vxor 2,2,0 - bge .Loop_4x - -.Ltail_4x: - .long 0x1002ECC8 - .long 0x1022F4C8 - .long 0x1042FCC8 - - vxor 0,0,20 - vxor 1,1,27 - - .long 0x10E044C8 - - vsldoi 5,1,4,8 - vsldoi 6,4,1,8 - vxor 2,2,26 - vxor 0,0,5 - vxor 2,2,6 - - vsldoi 0,0,0,8 - vxor 0,0,7 - - vsldoi 6,0,0,8 - .long 0x100044C8 - vxor 6,6,2 - vxor 0,0,6 - - addic. 
6,6,4 - beq .Ldone_4x - - .long 0x7C602E99 - cmpldi 6,2 - li 6,-4 - blt .Lone - .long 0x7E082E99 - beq .Ltwo - -.Lthree: - .long 0x7EC92E99 - vperm 3,3,3,12 - vperm 16,16,16,12 - vperm 22,22,22,12 - - vxor 2,3,0 - vor 29,23,23 - vor 30, 24, 24 - vor 31,25,25 - - vperm 5,16,22,19 - vperm 6,16,22,18 - .long 0x12B08CC8 - .long 0x13764CC8 - .long 0x12855CC8 - .long 0x134654C8 - - vxor 27,27,21 - b .Ltail_4x - -.align 4 -.Ltwo: - vperm 3,3,3,12 - vperm 16,16,16,12 - - vxor 2,3,0 - vperm 5,4,16,19 - vperm 6,4,16,18 - - vsldoi 29,4,17,8 - vor 30, 17, 17 - vsldoi 31,17,4,8 - - .long 0x12855CC8 - .long 0x13704CC8 - .long 0x134654C8 - - b .Ltail_4x - -.align 4 -.Lone: - vperm 3,3,3,12 - - vsldoi 29,4,9,8 - vor 30, 9, 9 - vsldoi 31,9,4,8 - - vxor 2,3,0 - vxor 20,20,20 - vxor 27,27,27 - vxor 26,26,26 - - b .Ltail_4x - -.Ldone_4x: - vperm 0,0,0,12 - .long 0x7C001F99 - - li 10,63 - li 11,79 - or 12,12,12 - lvx 20,10,1 - addi 10,10,32 - lvx 21,11,1 - addi 11,11,32 - lvx 22,10,1 - addi 10,10,32 - lvx 23,11,1 - addi 11,11,32 - lvx 24,10,1 - addi 10,10,32 - lvx 25,11,1 - addi 11,11,32 - lvx 26,10,1 - addi 10,10,32 - lvx 27,11,1 - addi 11,11,32 - lvx 28,10,1 - addi 10,10,32 - lvx 29,11,1 - addi 11,11,32 - lvx 30,10,1 - lvx 31,11,1 - addi 1,1,256 - blr -.long 0 -.byte 0,12,0x04,0,0x80,0,4,0 -.long 0 -.size gcm_ghash_p8,.-gcm_ghash_p8 - -.byte 71,72,65,83,72,32,102,111,114,32,80,111,119,101,114,73,83,65,32,50,46,48,55,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 2 -.align 2 -#endif // !OPENSSL_NO_ASM && __powerpc64__ -.section .note.GNU-stack,"",@progbits diff --git a/third_party/boringssl/linux-ppc64le/crypto/test/trampoline-ppc.S b/third_party/boringssl/linux-ppc64le/crypto/test/trampoline-ppc.S deleted file mode 100644 index 8166d31a..00000000 --- a/third_party/boringssl/linux-ppc64le/crypto/test/trampoline-ppc.S +++ /dev/null @@ -1,1410 +0,0 @@ -// This file is generated from a similarly-named Perl 
script in the BoringSSL -// source tree. Do not edit by hand. - -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if !defined(OPENSSL_NO_ASM) && defined(__powerpc64__) -.machine "any" -.abiversion 2 -.text - - - - - - - -.globl abi_test_trampoline -.type abi_test_trampoline,@function -.align 5 -abi_test_trampoline: -.localentry abi_test_trampoline,0 - - - mflr 0 - std 0, 16(1) - - - - - - - - - - - - - - - - - - - - stdu 1, -528(1) - - mfcr 0 - std 0, 8(1) - std 2, 24(1) - std 4, 32(1) - li 11, 48 - stvx 20, 11, 1 - li 11, 64 - stvx 21, 11, 1 - li 11, 80 - stvx 22, 11, 1 - li 11, 96 - stvx 23, 11, 1 - li 11, 112 - stvx 24, 11, 1 - li 11, 128 - stvx 25, 11, 1 - li 11, 144 - stvx 26, 11, 1 - li 11, 160 - stvx 27, 11, 1 - li 11, 176 - stvx 28, 11, 1 - li 11, 192 - stvx 29, 11, 1 - li 11, 208 - stvx 30, 11, 1 - li 11, 224 - stvx 31, 11, 1 - std 14, 240(1) - std 15, 248(1) - std 16, 256(1) - std 17, 264(1) - std 18, 272(1) - std 19, 280(1) - std 20, 288(1) - std 21, 296(1) - std 22, 304(1) - std 23, 312(1) - std 24, 320(1) - std 25, 328(1) - std 26, 336(1) - std 27, 344(1) - std 28, 352(1) - std 29, 360(1) - std 30, 368(1) - std 31, 376(1) - stfd 14, 384(1) - stfd 15, 392(1) - stfd 16, 400(1) - stfd 17, 408(1) - stfd 18, 416(1) - stfd 19, 424(1) - stfd 20, 432(1) - stfd 21, 440(1) - stfd 22, 448(1) - stfd 23, 456(1) - stfd 24, 464(1) - stfd 25, 472(1) - stfd 26, 480(1) - stfd 27, 488(1) - stfd 28, 496(1) - stfd 29, 504(1) - stfd 30, 512(1) - stfd 31, 520(1) - li 11, 0 - lvx 20, 11, 4 - li 11, 16 - lvx 21, 11, 4 - li 11, 32 - lvx 22, 11, 4 - li 11, 48 - lvx 23, 11, 4 - li 11, 64 - lvx 24, 11, 4 - li 11, 80 - lvx 25, 11, 4 - li 11, 96 - lvx 26, 11, 4 - li 11, 112 - lvx 27, 11, 4 - li 11, 128 - lvx 28, 11, 4 - li 11, 144 - lvx 29, 11, 4 - li 11, 160 - lvx 30, 11, 4 - li 11, 176 - lvx 31, 11, 4 - ld 14, 192(4) - ld 15, 200(4) - ld 16, 208(4) - ld 17, 216(4) - ld 18, 224(4) - ld 19, 232(4) 
- ld 20, 240(4) - ld 21, 248(4) - ld 22, 256(4) - ld 23, 264(4) - ld 24, 272(4) - ld 25, 280(4) - ld 26, 288(4) - ld 27, 296(4) - ld 28, 304(4) - ld 29, 312(4) - ld 30, 320(4) - ld 31, 328(4) - lfd 14, 336(4) - lfd 15, 344(4) - lfd 16, 352(4) - lfd 17, 360(4) - lfd 18, 368(4) - lfd 19, 376(4) - lfd 20, 384(4) - lfd 21, 392(4) - lfd 22, 400(4) - lfd 23, 408(4) - lfd 24, 416(4) - lfd 25, 424(4) - lfd 26, 432(4) - lfd 27, 440(4) - lfd 28, 448(4) - lfd 29, 456(4) - lfd 30, 464(4) - lfd 31, 472(4) - - ld 0, 480(4) - mtcr 0 - - - addi 11, 5, -8 - mr 12, 3 - - - cmpdi 6, 0 - beq .Largs_done - mtctr 6 - ldu 3, 8(11) - bdz .Largs_done - ldu 4, 8(11) - bdz .Largs_done - ldu 5, 8(11) - bdz .Largs_done - ldu 6, 8(11) - bdz .Largs_done - ldu 7, 8(11) - bdz .Largs_done - ldu 8, 8(11) - bdz .Largs_done - ldu 9, 8(11) - bdz .Largs_done - ldu 10, 8(11) - -.Largs_done: - li 2, 0 - mtctr 12 - bctrl - ld 2, 24(1) - - ld 4, 32(1) - li 11, 0 - stvx 20, 11, 4 - li 11, 16 - stvx 21, 11, 4 - li 11, 32 - stvx 22, 11, 4 - li 11, 48 - stvx 23, 11, 4 - li 11, 64 - stvx 24, 11, 4 - li 11, 80 - stvx 25, 11, 4 - li 11, 96 - stvx 26, 11, 4 - li 11, 112 - stvx 27, 11, 4 - li 11, 128 - stvx 28, 11, 4 - li 11, 144 - stvx 29, 11, 4 - li 11, 160 - stvx 30, 11, 4 - li 11, 176 - stvx 31, 11, 4 - std 14, 192(4) - std 15, 200(4) - std 16, 208(4) - std 17, 216(4) - std 18, 224(4) - std 19, 232(4) - std 20, 240(4) - std 21, 248(4) - std 22, 256(4) - std 23, 264(4) - std 24, 272(4) - std 25, 280(4) - std 26, 288(4) - std 27, 296(4) - std 28, 304(4) - std 29, 312(4) - std 30, 320(4) - std 31, 328(4) - stfd 14, 336(4) - stfd 15, 344(4) - stfd 16, 352(4) - stfd 17, 360(4) - stfd 18, 368(4) - stfd 19, 376(4) - stfd 20, 384(4) - stfd 21, 392(4) - stfd 22, 400(4) - stfd 23, 408(4) - stfd 24, 416(4) - stfd 25, 424(4) - stfd 26, 432(4) - stfd 27, 440(4) - stfd 28, 448(4) - stfd 29, 456(4) - stfd 30, 464(4) - stfd 31, 472(4) - li 11, 48 - lvx 20, 11, 1 - li 11, 64 - lvx 21, 11, 1 - li 11, 80 - lvx 22, 11, 1 - li 11, 
96 - lvx 23, 11, 1 - li 11, 112 - lvx 24, 11, 1 - li 11, 128 - lvx 25, 11, 1 - li 11, 144 - lvx 26, 11, 1 - li 11, 160 - lvx 27, 11, 1 - li 11, 176 - lvx 28, 11, 1 - li 11, 192 - lvx 29, 11, 1 - li 11, 208 - lvx 30, 11, 1 - li 11, 224 - lvx 31, 11, 1 - ld 14, 240(1) - ld 15, 248(1) - ld 16, 256(1) - ld 17, 264(1) - ld 18, 272(1) - ld 19, 280(1) - ld 20, 288(1) - ld 21, 296(1) - ld 22, 304(1) - ld 23, 312(1) - ld 24, 320(1) - ld 25, 328(1) - ld 26, 336(1) - ld 27, 344(1) - ld 28, 352(1) - ld 29, 360(1) - ld 30, 368(1) - ld 31, 376(1) - lfd 14, 384(1) - lfd 15, 392(1) - lfd 16, 400(1) - lfd 17, 408(1) - lfd 18, 416(1) - lfd 19, 424(1) - lfd 20, 432(1) - lfd 21, 440(1) - lfd 22, 448(1) - lfd 23, 456(1) - lfd 24, 464(1) - lfd 25, 472(1) - lfd 26, 480(1) - lfd 27, 488(1) - lfd 28, 496(1) - lfd 29, 504(1) - lfd 30, 512(1) - lfd 31, 520(1) - mfcr 0 - std 0, 480(4) - ld 0, 8(1) - mtcrf 0b00111000, 0 - addi 1, 1, 528 - ld 0, 16(1) - mtlr 0 - blr -.size abi_test_trampoline,.-abi_test_trampoline -.globl abi_test_clobber_r0 -.type abi_test_clobber_r0,@function -.align 5 -abi_test_clobber_r0: -.localentry abi_test_clobber_r0,0 - - li 0, 0 - blr -.size abi_test_clobber_r0,.-abi_test_clobber_r0 -.globl abi_test_clobber_r2 -.type abi_test_clobber_r2,@function -.align 5 -abi_test_clobber_r2: -.localentry abi_test_clobber_r2,0 - - li 2, 0 - blr -.size abi_test_clobber_r2,.-abi_test_clobber_r2 -.globl abi_test_clobber_r3 -.type abi_test_clobber_r3,@function -.align 5 -abi_test_clobber_r3: -.localentry abi_test_clobber_r3,0 - - li 3, 0 - blr -.size abi_test_clobber_r3,.-abi_test_clobber_r3 -.globl abi_test_clobber_r4 -.type abi_test_clobber_r4,@function -.align 5 -abi_test_clobber_r4: -.localentry abi_test_clobber_r4,0 - - li 4, 0 - blr -.size abi_test_clobber_r4,.-abi_test_clobber_r4 -.globl abi_test_clobber_r5 -.type abi_test_clobber_r5,@function -.align 5 -abi_test_clobber_r5: -.localentry abi_test_clobber_r5,0 - - li 5, 0 - blr -.size abi_test_clobber_r5,.-abi_test_clobber_r5 
-.globl abi_test_clobber_r6 -.type abi_test_clobber_r6,@function -.align 5 -abi_test_clobber_r6: -.localentry abi_test_clobber_r6,0 - - li 6, 0 - blr -.size abi_test_clobber_r6,.-abi_test_clobber_r6 -.globl abi_test_clobber_r7 -.type abi_test_clobber_r7,@function -.align 5 -abi_test_clobber_r7: -.localentry abi_test_clobber_r7,0 - - li 7, 0 - blr -.size abi_test_clobber_r7,.-abi_test_clobber_r7 -.globl abi_test_clobber_r8 -.type abi_test_clobber_r8,@function -.align 5 -abi_test_clobber_r8: -.localentry abi_test_clobber_r8,0 - - li 8, 0 - blr -.size abi_test_clobber_r8,.-abi_test_clobber_r8 -.globl abi_test_clobber_r9 -.type abi_test_clobber_r9,@function -.align 5 -abi_test_clobber_r9: -.localentry abi_test_clobber_r9,0 - - li 9, 0 - blr -.size abi_test_clobber_r9,.-abi_test_clobber_r9 -.globl abi_test_clobber_r10 -.type abi_test_clobber_r10,@function -.align 5 -abi_test_clobber_r10: -.localentry abi_test_clobber_r10,0 - - li 10, 0 - blr -.size abi_test_clobber_r10,.-abi_test_clobber_r10 -.globl abi_test_clobber_r11 -.type abi_test_clobber_r11,@function -.align 5 -abi_test_clobber_r11: -.localentry abi_test_clobber_r11,0 - - li 11, 0 - blr -.size abi_test_clobber_r11,.-abi_test_clobber_r11 -.globl abi_test_clobber_r12 -.type abi_test_clobber_r12,@function -.align 5 -abi_test_clobber_r12: -.localentry abi_test_clobber_r12,0 - - li 12, 0 - blr -.size abi_test_clobber_r12,.-abi_test_clobber_r12 -.globl abi_test_clobber_r14 -.type abi_test_clobber_r14,@function -.align 5 -abi_test_clobber_r14: -.localentry abi_test_clobber_r14,0 - - li 14, 0 - blr -.size abi_test_clobber_r14,.-abi_test_clobber_r14 -.globl abi_test_clobber_r15 -.type abi_test_clobber_r15,@function -.align 5 -abi_test_clobber_r15: -.localentry abi_test_clobber_r15,0 - - li 15, 0 - blr -.size abi_test_clobber_r15,.-abi_test_clobber_r15 -.globl abi_test_clobber_r16 -.type abi_test_clobber_r16,@function -.align 5 -abi_test_clobber_r16: -.localentry abi_test_clobber_r16,0 - - li 16, 0 - blr -.size 
abi_test_clobber_r16,.-abi_test_clobber_r16 -.globl abi_test_clobber_r17 -.type abi_test_clobber_r17,@function -.align 5 -abi_test_clobber_r17: -.localentry abi_test_clobber_r17,0 - - li 17, 0 - blr -.size abi_test_clobber_r17,.-abi_test_clobber_r17 -.globl abi_test_clobber_r18 -.type abi_test_clobber_r18,@function -.align 5 -abi_test_clobber_r18: -.localentry abi_test_clobber_r18,0 - - li 18, 0 - blr -.size abi_test_clobber_r18,.-abi_test_clobber_r18 -.globl abi_test_clobber_r19 -.type abi_test_clobber_r19,@function -.align 5 -abi_test_clobber_r19: -.localentry abi_test_clobber_r19,0 - - li 19, 0 - blr -.size abi_test_clobber_r19,.-abi_test_clobber_r19 -.globl abi_test_clobber_r20 -.type abi_test_clobber_r20,@function -.align 5 -abi_test_clobber_r20: -.localentry abi_test_clobber_r20,0 - - li 20, 0 - blr -.size abi_test_clobber_r20,.-abi_test_clobber_r20 -.globl abi_test_clobber_r21 -.type abi_test_clobber_r21,@function -.align 5 -abi_test_clobber_r21: -.localentry abi_test_clobber_r21,0 - - li 21, 0 - blr -.size abi_test_clobber_r21,.-abi_test_clobber_r21 -.globl abi_test_clobber_r22 -.type abi_test_clobber_r22,@function -.align 5 -abi_test_clobber_r22: -.localentry abi_test_clobber_r22,0 - - li 22, 0 - blr -.size abi_test_clobber_r22,.-abi_test_clobber_r22 -.globl abi_test_clobber_r23 -.type abi_test_clobber_r23,@function -.align 5 -abi_test_clobber_r23: -.localentry abi_test_clobber_r23,0 - - li 23, 0 - blr -.size abi_test_clobber_r23,.-abi_test_clobber_r23 -.globl abi_test_clobber_r24 -.type abi_test_clobber_r24,@function -.align 5 -abi_test_clobber_r24: -.localentry abi_test_clobber_r24,0 - - li 24, 0 - blr -.size abi_test_clobber_r24,.-abi_test_clobber_r24 -.globl abi_test_clobber_r25 -.type abi_test_clobber_r25,@function -.align 5 -abi_test_clobber_r25: -.localentry abi_test_clobber_r25,0 - - li 25, 0 - blr -.size abi_test_clobber_r25,.-abi_test_clobber_r25 -.globl abi_test_clobber_r26 -.type abi_test_clobber_r26,@function -.align 5 -abi_test_clobber_r26: 
-.localentry abi_test_clobber_r26,0 - - li 26, 0 - blr -.size abi_test_clobber_r26,.-abi_test_clobber_r26 -.globl abi_test_clobber_r27 -.type abi_test_clobber_r27,@function -.align 5 -abi_test_clobber_r27: -.localentry abi_test_clobber_r27,0 - - li 27, 0 - blr -.size abi_test_clobber_r27,.-abi_test_clobber_r27 -.globl abi_test_clobber_r28 -.type abi_test_clobber_r28,@function -.align 5 -abi_test_clobber_r28: -.localentry abi_test_clobber_r28,0 - - li 28, 0 - blr -.size abi_test_clobber_r28,.-abi_test_clobber_r28 -.globl abi_test_clobber_r29 -.type abi_test_clobber_r29,@function -.align 5 -abi_test_clobber_r29: -.localentry abi_test_clobber_r29,0 - - li 29, 0 - blr -.size abi_test_clobber_r29,.-abi_test_clobber_r29 -.globl abi_test_clobber_r30 -.type abi_test_clobber_r30,@function -.align 5 -abi_test_clobber_r30: -.localentry abi_test_clobber_r30,0 - - li 30, 0 - blr -.size abi_test_clobber_r30,.-abi_test_clobber_r30 -.globl abi_test_clobber_r31 -.type abi_test_clobber_r31,@function -.align 5 -abi_test_clobber_r31: -.localentry abi_test_clobber_r31,0 - - li 31, 0 - blr -.size abi_test_clobber_r31,.-abi_test_clobber_r31 -.globl abi_test_clobber_f0 -.type abi_test_clobber_f0,@function -.align 4 -abi_test_clobber_f0: -.localentry abi_test_clobber_f0,0 - - li 0, 0 - - std 0, -8(1) - lfd 0, -8(1) - blr -.size abi_test_clobber_f0,.-abi_test_clobber_f0 -.globl abi_test_clobber_f1 -.type abi_test_clobber_f1,@function -.align 4 -abi_test_clobber_f1: -.localentry abi_test_clobber_f1,0 - - li 0, 0 - - std 0, -8(1) - lfd 1, -8(1) - blr -.size abi_test_clobber_f1,.-abi_test_clobber_f1 -.globl abi_test_clobber_f2 -.type abi_test_clobber_f2,@function -.align 4 -abi_test_clobber_f2: -.localentry abi_test_clobber_f2,0 - - li 0, 0 - - std 0, -8(1) - lfd 2, -8(1) - blr -.size abi_test_clobber_f2,.-abi_test_clobber_f2 -.globl abi_test_clobber_f3 -.type abi_test_clobber_f3,@function -.align 4 -abi_test_clobber_f3: -.localentry abi_test_clobber_f3,0 - - li 0, 0 - - std 0, -8(1) - lfd 3, 
-8(1) - blr -.size abi_test_clobber_f3,.-abi_test_clobber_f3 -.globl abi_test_clobber_f4 -.type abi_test_clobber_f4,@function -.align 4 -abi_test_clobber_f4: -.localentry abi_test_clobber_f4,0 - - li 0, 0 - - std 0, -8(1) - lfd 4, -8(1) - blr -.size abi_test_clobber_f4,.-abi_test_clobber_f4 -.globl abi_test_clobber_f5 -.type abi_test_clobber_f5,@function -.align 4 -abi_test_clobber_f5: -.localentry abi_test_clobber_f5,0 - - li 0, 0 - - std 0, -8(1) - lfd 5, -8(1) - blr -.size abi_test_clobber_f5,.-abi_test_clobber_f5 -.globl abi_test_clobber_f6 -.type abi_test_clobber_f6,@function -.align 4 -abi_test_clobber_f6: -.localentry abi_test_clobber_f6,0 - - li 0, 0 - - std 0, -8(1) - lfd 6, -8(1) - blr -.size abi_test_clobber_f6,.-abi_test_clobber_f6 -.globl abi_test_clobber_f7 -.type abi_test_clobber_f7,@function -.align 4 -abi_test_clobber_f7: -.localentry abi_test_clobber_f7,0 - - li 0, 0 - - std 0, -8(1) - lfd 7, -8(1) - blr -.size abi_test_clobber_f7,.-abi_test_clobber_f7 -.globl abi_test_clobber_f8 -.type abi_test_clobber_f8,@function -.align 4 -abi_test_clobber_f8: -.localentry abi_test_clobber_f8,0 - - li 0, 0 - - std 0, -8(1) - lfd 8, -8(1) - blr -.size abi_test_clobber_f8,.-abi_test_clobber_f8 -.globl abi_test_clobber_f9 -.type abi_test_clobber_f9,@function -.align 4 -abi_test_clobber_f9: -.localentry abi_test_clobber_f9,0 - - li 0, 0 - - std 0, -8(1) - lfd 9, -8(1) - blr -.size abi_test_clobber_f9,.-abi_test_clobber_f9 -.globl abi_test_clobber_f10 -.type abi_test_clobber_f10,@function -.align 4 -abi_test_clobber_f10: -.localentry abi_test_clobber_f10,0 - - li 0, 0 - - std 0, -8(1) - lfd 10, -8(1) - blr -.size abi_test_clobber_f10,.-abi_test_clobber_f10 -.globl abi_test_clobber_f11 -.type abi_test_clobber_f11,@function -.align 4 -abi_test_clobber_f11: -.localentry abi_test_clobber_f11,0 - - li 0, 0 - - std 0, -8(1) - lfd 11, -8(1) - blr -.size abi_test_clobber_f11,.-abi_test_clobber_f11 -.globl abi_test_clobber_f12 -.type abi_test_clobber_f12,@function -.align 4 
-abi_test_clobber_f12: -.localentry abi_test_clobber_f12,0 - - li 0, 0 - - std 0, -8(1) - lfd 12, -8(1) - blr -.size abi_test_clobber_f12,.-abi_test_clobber_f12 -.globl abi_test_clobber_f13 -.type abi_test_clobber_f13,@function -.align 4 -abi_test_clobber_f13: -.localentry abi_test_clobber_f13,0 - - li 0, 0 - - std 0, -8(1) - lfd 13, -8(1) - blr -.size abi_test_clobber_f13,.-abi_test_clobber_f13 -.globl abi_test_clobber_f14 -.type abi_test_clobber_f14,@function -.align 4 -abi_test_clobber_f14: -.localentry abi_test_clobber_f14,0 - - li 0, 0 - - std 0, -8(1) - lfd 14, -8(1) - blr -.size abi_test_clobber_f14,.-abi_test_clobber_f14 -.globl abi_test_clobber_f15 -.type abi_test_clobber_f15,@function -.align 4 -abi_test_clobber_f15: -.localentry abi_test_clobber_f15,0 - - li 0, 0 - - std 0, -8(1) - lfd 15, -8(1) - blr -.size abi_test_clobber_f15,.-abi_test_clobber_f15 -.globl abi_test_clobber_f16 -.type abi_test_clobber_f16,@function -.align 4 -abi_test_clobber_f16: -.localentry abi_test_clobber_f16,0 - - li 0, 0 - - std 0, -8(1) - lfd 16, -8(1) - blr -.size abi_test_clobber_f16,.-abi_test_clobber_f16 -.globl abi_test_clobber_f17 -.type abi_test_clobber_f17,@function -.align 4 -abi_test_clobber_f17: -.localentry abi_test_clobber_f17,0 - - li 0, 0 - - std 0, -8(1) - lfd 17, -8(1) - blr -.size abi_test_clobber_f17,.-abi_test_clobber_f17 -.globl abi_test_clobber_f18 -.type abi_test_clobber_f18,@function -.align 4 -abi_test_clobber_f18: -.localentry abi_test_clobber_f18,0 - - li 0, 0 - - std 0, -8(1) - lfd 18, -8(1) - blr -.size abi_test_clobber_f18,.-abi_test_clobber_f18 -.globl abi_test_clobber_f19 -.type abi_test_clobber_f19,@function -.align 4 -abi_test_clobber_f19: -.localentry abi_test_clobber_f19,0 - - li 0, 0 - - std 0, -8(1) - lfd 19, -8(1) - blr -.size abi_test_clobber_f19,.-abi_test_clobber_f19 -.globl abi_test_clobber_f20 -.type abi_test_clobber_f20,@function -.align 4 -abi_test_clobber_f20: -.localentry abi_test_clobber_f20,0 - - li 0, 0 - - std 0, -8(1) - lfd 
20, -8(1) - blr -.size abi_test_clobber_f20,.-abi_test_clobber_f20 -.globl abi_test_clobber_f21 -.type abi_test_clobber_f21,@function -.align 4 -abi_test_clobber_f21: -.localentry abi_test_clobber_f21,0 - - li 0, 0 - - std 0, -8(1) - lfd 21, -8(1) - blr -.size abi_test_clobber_f21,.-abi_test_clobber_f21 -.globl abi_test_clobber_f22 -.type abi_test_clobber_f22,@function -.align 4 -abi_test_clobber_f22: -.localentry abi_test_clobber_f22,0 - - li 0, 0 - - std 0, -8(1) - lfd 22, -8(1) - blr -.size abi_test_clobber_f22,.-abi_test_clobber_f22 -.globl abi_test_clobber_f23 -.type abi_test_clobber_f23,@function -.align 4 -abi_test_clobber_f23: -.localentry abi_test_clobber_f23,0 - - li 0, 0 - - std 0, -8(1) - lfd 23, -8(1) - blr -.size abi_test_clobber_f23,.-abi_test_clobber_f23 -.globl abi_test_clobber_f24 -.type abi_test_clobber_f24,@function -.align 4 -abi_test_clobber_f24: -.localentry abi_test_clobber_f24,0 - - li 0, 0 - - std 0, -8(1) - lfd 24, -8(1) - blr -.size abi_test_clobber_f24,.-abi_test_clobber_f24 -.globl abi_test_clobber_f25 -.type abi_test_clobber_f25,@function -.align 4 -abi_test_clobber_f25: -.localentry abi_test_clobber_f25,0 - - li 0, 0 - - std 0, -8(1) - lfd 25, -8(1) - blr -.size abi_test_clobber_f25,.-abi_test_clobber_f25 -.globl abi_test_clobber_f26 -.type abi_test_clobber_f26,@function -.align 4 -abi_test_clobber_f26: -.localentry abi_test_clobber_f26,0 - - li 0, 0 - - std 0, -8(1) - lfd 26, -8(1) - blr -.size abi_test_clobber_f26,.-abi_test_clobber_f26 -.globl abi_test_clobber_f27 -.type abi_test_clobber_f27,@function -.align 4 -abi_test_clobber_f27: -.localentry abi_test_clobber_f27,0 - - li 0, 0 - - std 0, -8(1) - lfd 27, -8(1) - blr -.size abi_test_clobber_f27,.-abi_test_clobber_f27 -.globl abi_test_clobber_f28 -.type abi_test_clobber_f28,@function -.align 4 -abi_test_clobber_f28: -.localentry abi_test_clobber_f28,0 - - li 0, 0 - - std 0, -8(1) - lfd 28, -8(1) - blr -.size abi_test_clobber_f28,.-abi_test_clobber_f28 -.globl abi_test_clobber_f29 
-.type abi_test_clobber_f29,@function -.align 4 -abi_test_clobber_f29: -.localentry abi_test_clobber_f29,0 - - li 0, 0 - - std 0, -8(1) - lfd 29, -8(1) - blr -.size abi_test_clobber_f29,.-abi_test_clobber_f29 -.globl abi_test_clobber_f30 -.type abi_test_clobber_f30,@function -.align 4 -abi_test_clobber_f30: -.localentry abi_test_clobber_f30,0 - - li 0, 0 - - std 0, -8(1) - lfd 30, -8(1) - blr -.size abi_test_clobber_f30,.-abi_test_clobber_f30 -.globl abi_test_clobber_f31 -.type abi_test_clobber_f31,@function -.align 4 -abi_test_clobber_f31: -.localentry abi_test_clobber_f31,0 - - li 0, 0 - - std 0, -8(1) - lfd 31, -8(1) - blr -.size abi_test_clobber_f31,.-abi_test_clobber_f31 -.globl abi_test_clobber_v0 -.type abi_test_clobber_v0,@function -.align 4 -abi_test_clobber_v0: -.localentry abi_test_clobber_v0,0 - - vxor 0, 0, 0 - blr -.size abi_test_clobber_v0,.-abi_test_clobber_v0 -.globl abi_test_clobber_v1 -.type abi_test_clobber_v1,@function -.align 4 -abi_test_clobber_v1: -.localentry abi_test_clobber_v1,0 - - vxor 1, 1, 1 - blr -.size abi_test_clobber_v1,.-abi_test_clobber_v1 -.globl abi_test_clobber_v2 -.type abi_test_clobber_v2,@function -.align 4 -abi_test_clobber_v2: -.localentry abi_test_clobber_v2,0 - - vxor 2, 2, 2 - blr -.size abi_test_clobber_v2,.-abi_test_clobber_v2 -.globl abi_test_clobber_v3 -.type abi_test_clobber_v3,@function -.align 4 -abi_test_clobber_v3: -.localentry abi_test_clobber_v3,0 - - vxor 3, 3, 3 - blr -.size abi_test_clobber_v3,.-abi_test_clobber_v3 -.globl abi_test_clobber_v4 -.type abi_test_clobber_v4,@function -.align 4 -abi_test_clobber_v4: -.localentry abi_test_clobber_v4,0 - - vxor 4, 4, 4 - blr -.size abi_test_clobber_v4,.-abi_test_clobber_v4 -.globl abi_test_clobber_v5 -.type abi_test_clobber_v5,@function -.align 4 -abi_test_clobber_v5: -.localentry abi_test_clobber_v5,0 - - vxor 5, 5, 5 - blr -.size abi_test_clobber_v5,.-abi_test_clobber_v5 -.globl abi_test_clobber_v6 -.type abi_test_clobber_v6,@function -.align 4 
-abi_test_clobber_v6: -.localentry abi_test_clobber_v6,0 - - vxor 6, 6, 6 - blr -.size abi_test_clobber_v6,.-abi_test_clobber_v6 -.globl abi_test_clobber_v7 -.type abi_test_clobber_v7,@function -.align 4 -abi_test_clobber_v7: -.localentry abi_test_clobber_v7,0 - - vxor 7, 7, 7 - blr -.size abi_test_clobber_v7,.-abi_test_clobber_v7 -.globl abi_test_clobber_v8 -.type abi_test_clobber_v8,@function -.align 4 -abi_test_clobber_v8: -.localentry abi_test_clobber_v8,0 - - vxor 8, 8, 8 - blr -.size abi_test_clobber_v8,.-abi_test_clobber_v8 -.globl abi_test_clobber_v9 -.type abi_test_clobber_v9,@function -.align 4 -abi_test_clobber_v9: -.localentry abi_test_clobber_v9,0 - - vxor 9, 9, 9 - blr -.size abi_test_clobber_v9,.-abi_test_clobber_v9 -.globl abi_test_clobber_v10 -.type abi_test_clobber_v10,@function -.align 4 -abi_test_clobber_v10: -.localentry abi_test_clobber_v10,0 - - vxor 10, 10, 10 - blr -.size abi_test_clobber_v10,.-abi_test_clobber_v10 -.globl abi_test_clobber_v11 -.type abi_test_clobber_v11,@function -.align 4 -abi_test_clobber_v11: -.localentry abi_test_clobber_v11,0 - - vxor 11, 11, 11 - blr -.size abi_test_clobber_v11,.-abi_test_clobber_v11 -.globl abi_test_clobber_v12 -.type abi_test_clobber_v12,@function -.align 4 -abi_test_clobber_v12: -.localentry abi_test_clobber_v12,0 - - vxor 12, 12, 12 - blr -.size abi_test_clobber_v12,.-abi_test_clobber_v12 -.globl abi_test_clobber_v13 -.type abi_test_clobber_v13,@function -.align 4 -abi_test_clobber_v13: -.localentry abi_test_clobber_v13,0 - - vxor 13, 13, 13 - blr -.size abi_test_clobber_v13,.-abi_test_clobber_v13 -.globl abi_test_clobber_v14 -.type abi_test_clobber_v14,@function -.align 4 -abi_test_clobber_v14: -.localentry abi_test_clobber_v14,0 - - vxor 14, 14, 14 - blr -.size abi_test_clobber_v14,.-abi_test_clobber_v14 -.globl abi_test_clobber_v15 -.type abi_test_clobber_v15,@function -.align 4 -abi_test_clobber_v15: -.localentry abi_test_clobber_v15,0 - - vxor 15, 15, 15 - blr -.size 
abi_test_clobber_v15,.-abi_test_clobber_v15 -.globl abi_test_clobber_v16 -.type abi_test_clobber_v16,@function -.align 4 -abi_test_clobber_v16: -.localentry abi_test_clobber_v16,0 - - vxor 16, 16, 16 - blr -.size abi_test_clobber_v16,.-abi_test_clobber_v16 -.globl abi_test_clobber_v17 -.type abi_test_clobber_v17,@function -.align 4 -abi_test_clobber_v17: -.localentry abi_test_clobber_v17,0 - - vxor 17, 17, 17 - blr -.size abi_test_clobber_v17,.-abi_test_clobber_v17 -.globl abi_test_clobber_v18 -.type abi_test_clobber_v18,@function -.align 4 -abi_test_clobber_v18: -.localentry abi_test_clobber_v18,0 - - vxor 18, 18, 18 - blr -.size abi_test_clobber_v18,.-abi_test_clobber_v18 -.globl abi_test_clobber_v19 -.type abi_test_clobber_v19,@function -.align 4 -abi_test_clobber_v19: -.localentry abi_test_clobber_v19,0 - - vxor 19, 19, 19 - blr -.size abi_test_clobber_v19,.-abi_test_clobber_v19 -.globl abi_test_clobber_v20 -.type abi_test_clobber_v20,@function -.align 4 -abi_test_clobber_v20: -.localentry abi_test_clobber_v20,0 - - vxor 20, 20, 20 - blr -.size abi_test_clobber_v20,.-abi_test_clobber_v20 -.globl abi_test_clobber_v21 -.type abi_test_clobber_v21,@function -.align 4 -abi_test_clobber_v21: -.localentry abi_test_clobber_v21,0 - - vxor 21, 21, 21 - blr -.size abi_test_clobber_v21,.-abi_test_clobber_v21 -.globl abi_test_clobber_v22 -.type abi_test_clobber_v22,@function -.align 4 -abi_test_clobber_v22: -.localentry abi_test_clobber_v22,0 - - vxor 22, 22, 22 - blr -.size abi_test_clobber_v22,.-abi_test_clobber_v22 -.globl abi_test_clobber_v23 -.type abi_test_clobber_v23,@function -.align 4 -abi_test_clobber_v23: -.localentry abi_test_clobber_v23,0 - - vxor 23, 23, 23 - blr -.size abi_test_clobber_v23,.-abi_test_clobber_v23 -.globl abi_test_clobber_v24 -.type abi_test_clobber_v24,@function -.align 4 -abi_test_clobber_v24: -.localentry abi_test_clobber_v24,0 - - vxor 24, 24, 24 - blr -.size abi_test_clobber_v24,.-abi_test_clobber_v24 -.globl abi_test_clobber_v25 -.type 
abi_test_clobber_v25,@function -.align 4 -abi_test_clobber_v25: -.localentry abi_test_clobber_v25,0 - - vxor 25, 25, 25 - blr -.size abi_test_clobber_v25,.-abi_test_clobber_v25 -.globl abi_test_clobber_v26 -.type abi_test_clobber_v26,@function -.align 4 -abi_test_clobber_v26: -.localentry abi_test_clobber_v26,0 - - vxor 26, 26, 26 - blr -.size abi_test_clobber_v26,.-abi_test_clobber_v26 -.globl abi_test_clobber_v27 -.type abi_test_clobber_v27,@function -.align 4 -abi_test_clobber_v27: -.localentry abi_test_clobber_v27,0 - - vxor 27, 27, 27 - blr -.size abi_test_clobber_v27,.-abi_test_clobber_v27 -.globl abi_test_clobber_v28 -.type abi_test_clobber_v28,@function -.align 4 -abi_test_clobber_v28: -.localentry abi_test_clobber_v28,0 - - vxor 28, 28, 28 - blr -.size abi_test_clobber_v28,.-abi_test_clobber_v28 -.globl abi_test_clobber_v29 -.type abi_test_clobber_v29,@function -.align 4 -abi_test_clobber_v29: -.localentry abi_test_clobber_v29,0 - - vxor 29, 29, 29 - blr -.size abi_test_clobber_v29,.-abi_test_clobber_v29 -.globl abi_test_clobber_v30 -.type abi_test_clobber_v30,@function -.align 4 -abi_test_clobber_v30: -.localentry abi_test_clobber_v30,0 - - vxor 30, 30, 30 - blr -.size abi_test_clobber_v30,.-abi_test_clobber_v30 -.globl abi_test_clobber_v31 -.type abi_test_clobber_v31,@function -.align 4 -abi_test_clobber_v31: -.localentry abi_test_clobber_v31,0 - - vxor 31, 31, 31 - blr -.size abi_test_clobber_v31,.-abi_test_clobber_v31 -.globl abi_test_clobber_cr0 -.type abi_test_clobber_cr0,@function -.align 4 -abi_test_clobber_cr0: -.localentry abi_test_clobber_cr0,0 - - - - mfcr 0 - not 0, 0 - mtcrf 128, 0 - blr -.size abi_test_clobber_cr0,.-abi_test_clobber_cr0 -.globl abi_test_clobber_cr1 -.type abi_test_clobber_cr1,@function -.align 4 -abi_test_clobber_cr1: -.localentry abi_test_clobber_cr1,0 - - - - mfcr 0 - not 0, 0 - mtcrf 64, 0 - blr -.size abi_test_clobber_cr1,.-abi_test_clobber_cr1 -.globl abi_test_clobber_cr2 -.type abi_test_clobber_cr2,@function -.align 4 
-abi_test_clobber_cr2: -.localentry abi_test_clobber_cr2,0 - - - - mfcr 0 - not 0, 0 - mtcrf 32, 0 - blr -.size abi_test_clobber_cr2,.-abi_test_clobber_cr2 -.globl abi_test_clobber_cr3 -.type abi_test_clobber_cr3,@function -.align 4 -abi_test_clobber_cr3: -.localentry abi_test_clobber_cr3,0 - - - - mfcr 0 - not 0, 0 - mtcrf 16, 0 - blr -.size abi_test_clobber_cr3,.-abi_test_clobber_cr3 -.globl abi_test_clobber_cr4 -.type abi_test_clobber_cr4,@function -.align 4 -abi_test_clobber_cr4: -.localentry abi_test_clobber_cr4,0 - - - - mfcr 0 - not 0, 0 - mtcrf 8, 0 - blr -.size abi_test_clobber_cr4,.-abi_test_clobber_cr4 -.globl abi_test_clobber_cr5 -.type abi_test_clobber_cr5,@function -.align 4 -abi_test_clobber_cr5: -.localentry abi_test_clobber_cr5,0 - - - - mfcr 0 - not 0, 0 - mtcrf 4, 0 - blr -.size abi_test_clobber_cr5,.-abi_test_clobber_cr5 -.globl abi_test_clobber_cr6 -.type abi_test_clobber_cr6,@function -.align 4 -abi_test_clobber_cr6: -.localentry abi_test_clobber_cr6,0 - - - - mfcr 0 - not 0, 0 - mtcrf 2, 0 - blr -.size abi_test_clobber_cr6,.-abi_test_clobber_cr6 -.globl abi_test_clobber_cr7 -.type abi_test_clobber_cr7,@function -.align 4 -abi_test_clobber_cr7: -.localentry abi_test_clobber_cr7,0 - - - - mfcr 0 - not 0, 0 - mtcrf 1, 0 - blr -.size abi_test_clobber_cr7,.-abi_test_clobber_cr7 -.globl abi_test_clobber_ctr -.type abi_test_clobber_ctr,@function -.align 4 -abi_test_clobber_ctr: -.localentry abi_test_clobber_ctr,0 - - li 0, 0 - mtctr 0 - blr -.size abi_test_clobber_ctr,.-abi_test_clobber_ctr - -.globl abi_test_clobber_lr -.type abi_test_clobber_lr,@function -.align 4 -abi_test_clobber_lr: -.localentry abi_test_clobber_lr,0 - - mflr 0 - mtctr 0 - li 0, 0 - mtlr 0 - bctr -.size abi_test_clobber_lr,.-abi_test_clobber_lr - -#endif // !OPENSSL_NO_ASM && __powerpc64__ -.section .note.GNU-stack,"",@progbits diff --git a/third_party/boringssl/linux-x86/crypto/chacha/chacha-x86.S b/third_party/boringssl/linux-x86/crypto/chacha/chacha-x86.S deleted file mode 
100644 index 0ae7a4bb..00000000 --- a/third_party/boringssl/linux-x86/crypto/chacha/chacha-x86.S +++ /dev/null @@ -1,975 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if defined(__i386__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text -.globl ChaCha20_ctr32 -.hidden ChaCha20_ctr32 -.type ChaCha20_ctr32,@function -.align 16 -ChaCha20_ctr32: -.L_ChaCha20_ctr32_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - xorl %eax,%eax - cmpl 28(%esp),%eax - je .L000no_data - call .Lpic_point -.Lpic_point: - popl %eax - leal OPENSSL_ia32cap_P-.Lpic_point(%eax),%ebp - testl $16777216,(%ebp) - jz .L001x86 - testl $512,4(%ebp) - jz .L001x86 - jmp .Lssse3_shortcut -.L001x86: - movl 32(%esp),%esi - movl 36(%esp),%edi - subl $132,%esp - movl (%esi),%eax - movl 4(%esi),%ebx - movl 8(%esi),%ecx - movl 12(%esi),%edx - movl %eax,80(%esp) - movl %ebx,84(%esp) - movl %ecx,88(%esp) - movl %edx,92(%esp) - movl 16(%esi),%eax - movl 20(%esi),%ebx - movl 24(%esi),%ecx - movl 28(%esi),%edx - movl %eax,96(%esp) - movl %ebx,100(%esp) - movl %ecx,104(%esp) - movl %edx,108(%esp) - movl (%edi),%eax - movl 4(%edi),%ebx - movl 8(%edi),%ecx - movl 12(%edi),%edx - subl $1,%eax - movl %eax,112(%esp) - movl %ebx,116(%esp) - movl %ecx,120(%esp) - movl %edx,124(%esp) - jmp .L002entry -.align 16 -.L003outer_loop: - movl %ebx,156(%esp) - movl %eax,152(%esp) - movl %ecx,160(%esp) -.L002entry: - movl $1634760805,%eax - movl $857760878,4(%esp) - movl $2036477234,8(%esp) - movl $1797285236,12(%esp) - movl 84(%esp),%ebx - movl 88(%esp),%ebp - movl 104(%esp),%ecx - movl 108(%esp),%esi - movl 116(%esp),%edx - movl 120(%esp),%edi - movl %ebx,20(%esp) - movl %ebp,24(%esp) - movl %ecx,40(%esp) - movl %esi,44(%esp) - movl %edx,52(%esp) - movl %edi,56(%esp) - movl 92(%esp),%ebx - movl 124(%esp),%edi - movl 112(%esp),%edx - movl 80(%esp),%ebp - movl 96(%esp),%ecx - movl 100(%esp),%esi - addl $1,%edx - movl %ebx,28(%esp) - 
movl %edi,60(%esp) - movl %edx,112(%esp) - movl $10,%ebx - jmp .L004loop -.align 16 -.L004loop: - addl %ebp,%eax - movl %ebx,128(%esp) - movl %ebp,%ebx - xorl %eax,%edx - roll $16,%edx - addl %edx,%ecx - xorl %ecx,%ebx - movl 52(%esp),%edi - roll $12,%ebx - movl 20(%esp),%ebp - addl %ebx,%eax - xorl %eax,%edx - movl %eax,(%esp) - roll $8,%edx - movl 4(%esp),%eax - addl %edx,%ecx - movl %edx,48(%esp) - xorl %ecx,%ebx - addl %ebp,%eax - roll $7,%ebx - xorl %eax,%edi - movl %ecx,32(%esp) - roll $16,%edi - movl %ebx,16(%esp) - addl %edi,%esi - movl 40(%esp),%ecx - xorl %esi,%ebp - movl 56(%esp),%edx - roll $12,%ebp - movl 24(%esp),%ebx - addl %ebp,%eax - xorl %eax,%edi - movl %eax,4(%esp) - roll $8,%edi - movl 8(%esp),%eax - addl %edi,%esi - movl %edi,52(%esp) - xorl %esi,%ebp - addl %ebx,%eax - roll $7,%ebp - xorl %eax,%edx - movl %esi,36(%esp) - roll $16,%edx - movl %ebp,20(%esp) - addl %edx,%ecx - movl 44(%esp),%esi - xorl %ecx,%ebx - movl 60(%esp),%edi - roll $12,%ebx - movl 28(%esp),%ebp - addl %ebx,%eax - xorl %eax,%edx - movl %eax,8(%esp) - roll $8,%edx - movl 12(%esp),%eax - addl %edx,%ecx - movl %edx,56(%esp) - xorl %ecx,%ebx - addl %ebp,%eax - roll $7,%ebx - xorl %eax,%edi - roll $16,%edi - movl %ebx,24(%esp) - addl %edi,%esi - xorl %esi,%ebp - roll $12,%ebp - movl 20(%esp),%ebx - addl %ebp,%eax - xorl %eax,%edi - movl %eax,12(%esp) - roll $8,%edi - movl (%esp),%eax - addl %edi,%esi - movl %edi,%edx - xorl %esi,%ebp - addl %ebx,%eax - roll $7,%ebp - xorl %eax,%edx - roll $16,%edx - movl %ebp,28(%esp) - addl %edx,%ecx - xorl %ecx,%ebx - movl 48(%esp),%edi - roll $12,%ebx - movl 24(%esp),%ebp - addl %ebx,%eax - xorl %eax,%edx - movl %eax,(%esp) - roll $8,%edx - movl 4(%esp),%eax - addl %edx,%ecx - movl %edx,60(%esp) - xorl %ecx,%ebx - addl %ebp,%eax - roll $7,%ebx - xorl %eax,%edi - movl %ecx,40(%esp) - roll $16,%edi - movl %ebx,20(%esp) - addl %edi,%esi - movl 32(%esp),%ecx - xorl %esi,%ebp - movl 52(%esp),%edx - roll $12,%ebp - movl 28(%esp),%ebx - addl 
%ebp,%eax - xorl %eax,%edi - movl %eax,4(%esp) - roll $8,%edi - movl 8(%esp),%eax - addl %edi,%esi - movl %edi,48(%esp) - xorl %esi,%ebp - addl %ebx,%eax - roll $7,%ebp - xorl %eax,%edx - movl %esi,44(%esp) - roll $16,%edx - movl %ebp,24(%esp) - addl %edx,%ecx - movl 36(%esp),%esi - xorl %ecx,%ebx - movl 56(%esp),%edi - roll $12,%ebx - movl 16(%esp),%ebp - addl %ebx,%eax - xorl %eax,%edx - movl %eax,8(%esp) - roll $8,%edx - movl 12(%esp),%eax - addl %edx,%ecx - movl %edx,52(%esp) - xorl %ecx,%ebx - addl %ebp,%eax - roll $7,%ebx - xorl %eax,%edi - roll $16,%edi - movl %ebx,28(%esp) - addl %edi,%esi - xorl %esi,%ebp - movl 48(%esp),%edx - roll $12,%ebp - movl 128(%esp),%ebx - addl %ebp,%eax - xorl %eax,%edi - movl %eax,12(%esp) - roll $8,%edi - movl (%esp),%eax - addl %edi,%esi - movl %edi,56(%esp) - xorl %esi,%ebp - roll $7,%ebp - decl %ebx - jnz .L004loop - movl 160(%esp),%ebx - addl $1634760805,%eax - addl 80(%esp),%ebp - addl 96(%esp),%ecx - addl 100(%esp),%esi - cmpl $64,%ebx - jb .L005tail - movl 156(%esp),%ebx - addl 112(%esp),%edx - addl 120(%esp),%edi - xorl (%ebx),%eax - xorl 16(%ebx),%ebp - movl %eax,(%esp) - movl 152(%esp),%eax - xorl 32(%ebx),%ecx - xorl 36(%ebx),%esi - xorl 48(%ebx),%edx - xorl 56(%ebx),%edi - movl %ebp,16(%eax) - movl %ecx,32(%eax) - movl %esi,36(%eax) - movl %edx,48(%eax) - movl %edi,56(%eax) - movl 4(%esp),%ebp - movl 8(%esp),%ecx - movl 12(%esp),%esi - movl 20(%esp),%edx - movl 24(%esp),%edi - addl $857760878,%ebp - addl $2036477234,%ecx - addl $1797285236,%esi - addl 84(%esp),%edx - addl 88(%esp),%edi - xorl 4(%ebx),%ebp - xorl 8(%ebx),%ecx - xorl 12(%ebx),%esi - xorl 20(%ebx),%edx - xorl 24(%ebx),%edi - movl %ebp,4(%eax) - movl %ecx,8(%eax) - movl %esi,12(%eax) - movl %edx,20(%eax) - movl %edi,24(%eax) - movl 28(%esp),%ebp - movl 40(%esp),%ecx - movl 44(%esp),%esi - movl 52(%esp),%edx - movl 60(%esp),%edi - addl 92(%esp),%ebp - addl 104(%esp),%ecx - addl 108(%esp),%esi - addl 116(%esp),%edx - addl 124(%esp),%edi - xorl 
28(%ebx),%ebp - xorl 40(%ebx),%ecx - xorl 44(%ebx),%esi - xorl 52(%ebx),%edx - xorl 60(%ebx),%edi - leal 64(%ebx),%ebx - movl %ebp,28(%eax) - movl (%esp),%ebp - movl %ecx,40(%eax) - movl 160(%esp),%ecx - movl %esi,44(%eax) - movl %edx,52(%eax) - movl %edi,60(%eax) - movl %ebp,(%eax) - leal 64(%eax),%eax - subl $64,%ecx - jnz .L003outer_loop - jmp .L006done -.L005tail: - addl 112(%esp),%edx - addl 120(%esp),%edi - movl %eax,(%esp) - movl %ebp,16(%esp) - movl %ecx,32(%esp) - movl %esi,36(%esp) - movl %edx,48(%esp) - movl %edi,56(%esp) - movl 4(%esp),%ebp - movl 8(%esp),%ecx - movl 12(%esp),%esi - movl 20(%esp),%edx - movl 24(%esp),%edi - addl $857760878,%ebp - addl $2036477234,%ecx - addl $1797285236,%esi - addl 84(%esp),%edx - addl 88(%esp),%edi - movl %ebp,4(%esp) - movl %ecx,8(%esp) - movl %esi,12(%esp) - movl %edx,20(%esp) - movl %edi,24(%esp) - movl 28(%esp),%ebp - movl 40(%esp),%ecx - movl 44(%esp),%esi - movl 52(%esp),%edx - movl 60(%esp),%edi - addl 92(%esp),%ebp - addl 104(%esp),%ecx - addl 108(%esp),%esi - addl 116(%esp),%edx - addl 124(%esp),%edi - movl %ebp,28(%esp) - movl 156(%esp),%ebp - movl %ecx,40(%esp) - movl 152(%esp),%ecx - movl %esi,44(%esp) - xorl %esi,%esi - movl %edx,52(%esp) - movl %edi,60(%esp) - xorl %eax,%eax - xorl %edx,%edx -.L007tail_loop: - movb (%esi,%ebp,1),%al - movb (%esp,%esi,1),%dl - leal 1(%esi),%esi - xorb %dl,%al - movb %al,-1(%ecx,%esi,1) - decl %ebx - jnz .L007tail_loop -.L006done: - addl $132,%esp -.L000no_data: - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.size ChaCha20_ctr32,.-.L_ChaCha20_ctr32_begin -.globl ChaCha20_ssse3 -.hidden ChaCha20_ssse3 -.type ChaCha20_ssse3,@function -.align 16 -ChaCha20_ssse3: -.L_ChaCha20_ssse3_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi -.Lssse3_shortcut: - movl 20(%esp),%edi - movl 24(%esp),%esi - movl 28(%esp),%ecx - movl 32(%esp),%edx - movl 36(%esp),%ebx - movl %esp,%ebp - subl $524,%esp - andl $-64,%esp - movl %ebp,512(%esp) - leal 
.Lssse3_data-.Lpic_point(%eax),%eax - movdqu (%ebx),%xmm3 - cmpl $256,%ecx - jb .L0081x - movl %edx,516(%esp) - movl %ebx,520(%esp) - subl $256,%ecx - leal 384(%esp),%ebp - movdqu (%edx),%xmm7 - pshufd $0,%xmm3,%xmm0 - pshufd $85,%xmm3,%xmm1 - pshufd $170,%xmm3,%xmm2 - pshufd $255,%xmm3,%xmm3 - paddd 48(%eax),%xmm0 - pshufd $0,%xmm7,%xmm4 - pshufd $85,%xmm7,%xmm5 - psubd 64(%eax),%xmm0 - pshufd $170,%xmm7,%xmm6 - pshufd $255,%xmm7,%xmm7 - movdqa %xmm0,64(%ebp) - movdqa %xmm1,80(%ebp) - movdqa %xmm2,96(%ebp) - movdqa %xmm3,112(%ebp) - movdqu 16(%edx),%xmm3 - movdqa %xmm4,-64(%ebp) - movdqa %xmm5,-48(%ebp) - movdqa %xmm6,-32(%ebp) - movdqa %xmm7,-16(%ebp) - movdqa 32(%eax),%xmm7 - leal 128(%esp),%ebx - pshufd $0,%xmm3,%xmm0 - pshufd $85,%xmm3,%xmm1 - pshufd $170,%xmm3,%xmm2 - pshufd $255,%xmm3,%xmm3 - pshufd $0,%xmm7,%xmm4 - pshufd $85,%xmm7,%xmm5 - pshufd $170,%xmm7,%xmm6 - pshufd $255,%xmm7,%xmm7 - movdqa %xmm0,(%ebp) - movdqa %xmm1,16(%ebp) - movdqa %xmm2,32(%ebp) - movdqa %xmm3,48(%ebp) - movdqa %xmm4,-128(%ebp) - movdqa %xmm5,-112(%ebp) - movdqa %xmm6,-96(%ebp) - movdqa %xmm7,-80(%ebp) - leal 128(%esi),%esi - leal 128(%edi),%edi - jmp .L009outer_loop -.align 16 -.L009outer_loop: - movdqa -112(%ebp),%xmm1 - movdqa -96(%ebp),%xmm2 - movdqa -80(%ebp),%xmm3 - movdqa -48(%ebp),%xmm5 - movdqa -32(%ebp),%xmm6 - movdqa -16(%ebp),%xmm7 - movdqa %xmm1,-112(%ebx) - movdqa %xmm2,-96(%ebx) - movdqa %xmm3,-80(%ebx) - movdqa %xmm5,-48(%ebx) - movdqa %xmm6,-32(%ebx) - movdqa %xmm7,-16(%ebx) - movdqa 32(%ebp),%xmm2 - movdqa 48(%ebp),%xmm3 - movdqa 64(%ebp),%xmm4 - movdqa 80(%ebp),%xmm5 - movdqa 96(%ebp),%xmm6 - movdqa 112(%ebp),%xmm7 - paddd 64(%eax),%xmm4 - movdqa %xmm2,32(%ebx) - movdqa %xmm3,48(%ebx) - movdqa %xmm4,64(%ebx) - movdqa %xmm5,80(%ebx) - movdqa %xmm6,96(%ebx) - movdqa %xmm7,112(%ebx) - movdqa %xmm4,64(%ebp) - movdqa -128(%ebp),%xmm0 - movdqa %xmm4,%xmm6 - movdqa -64(%ebp),%xmm3 - movdqa (%ebp),%xmm4 - movdqa 16(%ebp),%xmm5 - movl $10,%edx - nop -.align 16 
-.L010loop: - paddd %xmm3,%xmm0 - movdqa %xmm3,%xmm2 - pxor %xmm0,%xmm6 - pshufb (%eax),%xmm6 - paddd %xmm6,%xmm4 - pxor %xmm4,%xmm2 - movdqa -48(%ebx),%xmm3 - movdqa %xmm2,%xmm1 - pslld $12,%xmm2 - psrld $20,%xmm1 - por %xmm1,%xmm2 - movdqa -112(%ebx),%xmm1 - paddd %xmm2,%xmm0 - movdqa 80(%ebx),%xmm7 - pxor %xmm0,%xmm6 - movdqa %xmm0,-128(%ebx) - pshufb 16(%eax),%xmm6 - paddd %xmm6,%xmm4 - movdqa %xmm6,64(%ebx) - pxor %xmm4,%xmm2 - paddd %xmm3,%xmm1 - movdqa %xmm2,%xmm0 - pslld $7,%xmm2 - psrld $25,%xmm0 - pxor %xmm1,%xmm7 - por %xmm0,%xmm2 - movdqa %xmm4,(%ebx) - pshufb (%eax),%xmm7 - movdqa %xmm2,-64(%ebx) - paddd %xmm7,%xmm5 - movdqa 32(%ebx),%xmm4 - pxor %xmm5,%xmm3 - movdqa -32(%ebx),%xmm2 - movdqa %xmm3,%xmm0 - pslld $12,%xmm3 - psrld $20,%xmm0 - por %xmm0,%xmm3 - movdqa -96(%ebx),%xmm0 - paddd %xmm3,%xmm1 - movdqa 96(%ebx),%xmm6 - pxor %xmm1,%xmm7 - movdqa %xmm1,-112(%ebx) - pshufb 16(%eax),%xmm7 - paddd %xmm7,%xmm5 - movdqa %xmm7,80(%ebx) - pxor %xmm5,%xmm3 - paddd %xmm2,%xmm0 - movdqa %xmm3,%xmm1 - pslld $7,%xmm3 - psrld $25,%xmm1 - pxor %xmm0,%xmm6 - por %xmm1,%xmm3 - movdqa %xmm5,16(%ebx) - pshufb (%eax),%xmm6 - movdqa %xmm3,-48(%ebx) - paddd %xmm6,%xmm4 - movdqa 48(%ebx),%xmm5 - pxor %xmm4,%xmm2 - movdqa -16(%ebx),%xmm3 - movdqa %xmm2,%xmm1 - pslld $12,%xmm2 - psrld $20,%xmm1 - por %xmm1,%xmm2 - movdqa -80(%ebx),%xmm1 - paddd %xmm2,%xmm0 - movdqa 112(%ebx),%xmm7 - pxor %xmm0,%xmm6 - movdqa %xmm0,-96(%ebx) - pshufb 16(%eax),%xmm6 - paddd %xmm6,%xmm4 - movdqa %xmm6,96(%ebx) - pxor %xmm4,%xmm2 - paddd %xmm3,%xmm1 - movdqa %xmm2,%xmm0 - pslld $7,%xmm2 - psrld $25,%xmm0 - pxor %xmm1,%xmm7 - por %xmm0,%xmm2 - pshufb (%eax),%xmm7 - movdqa %xmm2,-32(%ebx) - paddd %xmm7,%xmm5 - pxor %xmm5,%xmm3 - movdqa -48(%ebx),%xmm2 - movdqa %xmm3,%xmm0 - pslld $12,%xmm3 - psrld $20,%xmm0 - por %xmm0,%xmm3 - movdqa -128(%ebx),%xmm0 - paddd %xmm3,%xmm1 - pxor %xmm1,%xmm7 - movdqa %xmm1,-80(%ebx) - pshufb 16(%eax),%xmm7 - paddd %xmm7,%xmm5 - movdqa %xmm7,%xmm6 - pxor 
%xmm5,%xmm3 - paddd %xmm2,%xmm0 - movdqa %xmm3,%xmm1 - pslld $7,%xmm3 - psrld $25,%xmm1 - pxor %xmm0,%xmm6 - por %xmm1,%xmm3 - pshufb (%eax),%xmm6 - movdqa %xmm3,-16(%ebx) - paddd %xmm6,%xmm4 - pxor %xmm4,%xmm2 - movdqa -32(%ebx),%xmm3 - movdqa %xmm2,%xmm1 - pslld $12,%xmm2 - psrld $20,%xmm1 - por %xmm1,%xmm2 - movdqa -112(%ebx),%xmm1 - paddd %xmm2,%xmm0 - movdqa 64(%ebx),%xmm7 - pxor %xmm0,%xmm6 - movdqa %xmm0,-128(%ebx) - pshufb 16(%eax),%xmm6 - paddd %xmm6,%xmm4 - movdqa %xmm6,112(%ebx) - pxor %xmm4,%xmm2 - paddd %xmm3,%xmm1 - movdqa %xmm2,%xmm0 - pslld $7,%xmm2 - psrld $25,%xmm0 - pxor %xmm1,%xmm7 - por %xmm0,%xmm2 - movdqa %xmm4,32(%ebx) - pshufb (%eax),%xmm7 - movdqa %xmm2,-48(%ebx) - paddd %xmm7,%xmm5 - movdqa (%ebx),%xmm4 - pxor %xmm5,%xmm3 - movdqa -16(%ebx),%xmm2 - movdqa %xmm3,%xmm0 - pslld $12,%xmm3 - psrld $20,%xmm0 - por %xmm0,%xmm3 - movdqa -96(%ebx),%xmm0 - paddd %xmm3,%xmm1 - movdqa 80(%ebx),%xmm6 - pxor %xmm1,%xmm7 - movdqa %xmm1,-112(%ebx) - pshufb 16(%eax),%xmm7 - paddd %xmm7,%xmm5 - movdqa %xmm7,64(%ebx) - pxor %xmm5,%xmm3 - paddd %xmm2,%xmm0 - movdqa %xmm3,%xmm1 - pslld $7,%xmm3 - psrld $25,%xmm1 - pxor %xmm0,%xmm6 - por %xmm1,%xmm3 - movdqa %xmm5,48(%ebx) - pshufb (%eax),%xmm6 - movdqa %xmm3,-32(%ebx) - paddd %xmm6,%xmm4 - movdqa 16(%ebx),%xmm5 - pxor %xmm4,%xmm2 - movdqa -64(%ebx),%xmm3 - movdqa %xmm2,%xmm1 - pslld $12,%xmm2 - psrld $20,%xmm1 - por %xmm1,%xmm2 - movdqa -80(%ebx),%xmm1 - paddd %xmm2,%xmm0 - movdqa 96(%ebx),%xmm7 - pxor %xmm0,%xmm6 - movdqa %xmm0,-96(%ebx) - pshufb 16(%eax),%xmm6 - paddd %xmm6,%xmm4 - movdqa %xmm6,80(%ebx) - pxor %xmm4,%xmm2 - paddd %xmm3,%xmm1 - movdqa %xmm2,%xmm0 - pslld $7,%xmm2 - psrld $25,%xmm0 - pxor %xmm1,%xmm7 - por %xmm0,%xmm2 - pshufb (%eax),%xmm7 - movdqa %xmm2,-16(%ebx) - paddd %xmm7,%xmm5 - pxor %xmm5,%xmm3 - movdqa %xmm3,%xmm0 - pslld $12,%xmm3 - psrld $20,%xmm0 - por %xmm0,%xmm3 - movdqa -128(%ebx),%xmm0 - paddd %xmm3,%xmm1 - movdqa 64(%ebx),%xmm6 - pxor %xmm1,%xmm7 - movdqa %xmm1,-80(%ebx) - 
pshufb 16(%eax),%xmm7 - paddd %xmm7,%xmm5 - movdqa %xmm7,96(%ebx) - pxor %xmm5,%xmm3 - movdqa %xmm3,%xmm1 - pslld $7,%xmm3 - psrld $25,%xmm1 - por %xmm1,%xmm3 - decl %edx - jnz .L010loop - movdqa %xmm3,-64(%ebx) - movdqa %xmm4,(%ebx) - movdqa %xmm5,16(%ebx) - movdqa %xmm6,64(%ebx) - movdqa %xmm7,96(%ebx) - movdqa -112(%ebx),%xmm1 - movdqa -96(%ebx),%xmm2 - movdqa -80(%ebx),%xmm3 - paddd -128(%ebp),%xmm0 - paddd -112(%ebp),%xmm1 - paddd -96(%ebp),%xmm2 - paddd -80(%ebp),%xmm3 - movdqa %xmm0,%xmm6 - punpckldq %xmm1,%xmm0 - movdqa %xmm2,%xmm7 - punpckldq %xmm3,%xmm2 - punpckhdq %xmm1,%xmm6 - punpckhdq %xmm3,%xmm7 - movdqa %xmm0,%xmm1 - punpcklqdq %xmm2,%xmm0 - movdqa %xmm6,%xmm3 - punpcklqdq %xmm7,%xmm6 - punpckhqdq %xmm2,%xmm1 - punpckhqdq %xmm7,%xmm3 - movdqu -128(%esi),%xmm4 - movdqu -64(%esi),%xmm5 - movdqu (%esi),%xmm2 - movdqu 64(%esi),%xmm7 - leal 16(%esi),%esi - pxor %xmm0,%xmm4 - movdqa -64(%ebx),%xmm0 - pxor %xmm1,%xmm5 - movdqa -48(%ebx),%xmm1 - pxor %xmm2,%xmm6 - movdqa -32(%ebx),%xmm2 - pxor %xmm3,%xmm7 - movdqa -16(%ebx),%xmm3 - movdqu %xmm4,-128(%edi) - movdqu %xmm5,-64(%edi) - movdqu %xmm6,(%edi) - movdqu %xmm7,64(%edi) - leal 16(%edi),%edi - paddd -64(%ebp),%xmm0 - paddd -48(%ebp),%xmm1 - paddd -32(%ebp),%xmm2 - paddd -16(%ebp),%xmm3 - movdqa %xmm0,%xmm6 - punpckldq %xmm1,%xmm0 - movdqa %xmm2,%xmm7 - punpckldq %xmm3,%xmm2 - punpckhdq %xmm1,%xmm6 - punpckhdq %xmm3,%xmm7 - movdqa %xmm0,%xmm1 - punpcklqdq %xmm2,%xmm0 - movdqa %xmm6,%xmm3 - punpcklqdq %xmm7,%xmm6 - punpckhqdq %xmm2,%xmm1 - punpckhqdq %xmm7,%xmm3 - movdqu -128(%esi),%xmm4 - movdqu -64(%esi),%xmm5 - movdqu (%esi),%xmm2 - movdqu 64(%esi),%xmm7 - leal 16(%esi),%esi - pxor %xmm0,%xmm4 - movdqa (%ebx),%xmm0 - pxor %xmm1,%xmm5 - movdqa 16(%ebx),%xmm1 - pxor %xmm2,%xmm6 - movdqa 32(%ebx),%xmm2 - pxor %xmm3,%xmm7 - movdqa 48(%ebx),%xmm3 - movdqu %xmm4,-128(%edi) - movdqu %xmm5,-64(%edi) - movdqu %xmm6,(%edi) - movdqu %xmm7,64(%edi) - leal 16(%edi),%edi - paddd (%ebp),%xmm0 - paddd 16(%ebp),%xmm1 - 
paddd 32(%ebp),%xmm2 - paddd 48(%ebp),%xmm3 - movdqa %xmm0,%xmm6 - punpckldq %xmm1,%xmm0 - movdqa %xmm2,%xmm7 - punpckldq %xmm3,%xmm2 - punpckhdq %xmm1,%xmm6 - punpckhdq %xmm3,%xmm7 - movdqa %xmm0,%xmm1 - punpcklqdq %xmm2,%xmm0 - movdqa %xmm6,%xmm3 - punpcklqdq %xmm7,%xmm6 - punpckhqdq %xmm2,%xmm1 - punpckhqdq %xmm7,%xmm3 - movdqu -128(%esi),%xmm4 - movdqu -64(%esi),%xmm5 - movdqu (%esi),%xmm2 - movdqu 64(%esi),%xmm7 - leal 16(%esi),%esi - pxor %xmm0,%xmm4 - movdqa 64(%ebx),%xmm0 - pxor %xmm1,%xmm5 - movdqa 80(%ebx),%xmm1 - pxor %xmm2,%xmm6 - movdqa 96(%ebx),%xmm2 - pxor %xmm3,%xmm7 - movdqa 112(%ebx),%xmm3 - movdqu %xmm4,-128(%edi) - movdqu %xmm5,-64(%edi) - movdqu %xmm6,(%edi) - movdqu %xmm7,64(%edi) - leal 16(%edi),%edi - paddd 64(%ebp),%xmm0 - paddd 80(%ebp),%xmm1 - paddd 96(%ebp),%xmm2 - paddd 112(%ebp),%xmm3 - movdqa %xmm0,%xmm6 - punpckldq %xmm1,%xmm0 - movdqa %xmm2,%xmm7 - punpckldq %xmm3,%xmm2 - punpckhdq %xmm1,%xmm6 - punpckhdq %xmm3,%xmm7 - movdqa %xmm0,%xmm1 - punpcklqdq %xmm2,%xmm0 - movdqa %xmm6,%xmm3 - punpcklqdq %xmm7,%xmm6 - punpckhqdq %xmm2,%xmm1 - punpckhqdq %xmm7,%xmm3 - movdqu -128(%esi),%xmm4 - movdqu -64(%esi),%xmm5 - movdqu (%esi),%xmm2 - movdqu 64(%esi),%xmm7 - leal 208(%esi),%esi - pxor %xmm0,%xmm4 - pxor %xmm1,%xmm5 - pxor %xmm2,%xmm6 - pxor %xmm3,%xmm7 - movdqu %xmm4,-128(%edi) - movdqu %xmm5,-64(%edi) - movdqu %xmm6,(%edi) - movdqu %xmm7,64(%edi) - leal 208(%edi),%edi - subl $256,%ecx - jnc .L009outer_loop - addl $256,%ecx - jz .L011done - movl 520(%esp),%ebx - leal -128(%esi),%esi - movl 516(%esp),%edx - leal -128(%edi),%edi - movd 64(%ebp),%xmm2 - movdqu (%ebx),%xmm3 - paddd 96(%eax),%xmm2 - pand 112(%eax),%xmm3 - por %xmm2,%xmm3 -.L0081x: - movdqa 32(%eax),%xmm0 - movdqu (%edx),%xmm1 - movdqu 16(%edx),%xmm2 - movdqa (%eax),%xmm6 - movdqa 16(%eax),%xmm7 - movl %ebp,48(%esp) - movdqa %xmm0,(%esp) - movdqa %xmm1,16(%esp) - movdqa %xmm2,32(%esp) - movdqa %xmm3,48(%esp) - movl $10,%edx - jmp .L012loop1x -.align 16 -.L013outer1x: - movdqa 
80(%eax),%xmm3 - movdqa (%esp),%xmm0 - movdqa 16(%esp),%xmm1 - movdqa 32(%esp),%xmm2 - paddd 48(%esp),%xmm3 - movl $10,%edx - movdqa %xmm3,48(%esp) - jmp .L012loop1x -.align 16 -.L012loop1x: - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 -.byte 102,15,56,0,222 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm4 - psrld $20,%xmm1 - pslld $12,%xmm4 - por %xmm4,%xmm1 - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 -.byte 102,15,56,0,223 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm4 - psrld $25,%xmm1 - pslld $7,%xmm4 - por %xmm4,%xmm1 - pshufd $78,%xmm2,%xmm2 - pshufd $57,%xmm1,%xmm1 - pshufd $147,%xmm3,%xmm3 - nop - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 -.byte 102,15,56,0,222 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm4 - psrld $20,%xmm1 - pslld $12,%xmm4 - por %xmm4,%xmm1 - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 -.byte 102,15,56,0,223 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm4 - psrld $25,%xmm1 - pslld $7,%xmm4 - por %xmm4,%xmm1 - pshufd $78,%xmm2,%xmm2 - pshufd $147,%xmm1,%xmm1 - pshufd $57,%xmm3,%xmm3 - decl %edx - jnz .L012loop1x - paddd (%esp),%xmm0 - paddd 16(%esp),%xmm1 - paddd 32(%esp),%xmm2 - paddd 48(%esp),%xmm3 - cmpl $64,%ecx - jb .L014tail - movdqu (%esi),%xmm4 - movdqu 16(%esi),%xmm5 - pxor %xmm4,%xmm0 - movdqu 32(%esi),%xmm4 - pxor %xmm5,%xmm1 - movdqu 48(%esi),%xmm5 - pxor %xmm4,%xmm2 - pxor %xmm5,%xmm3 - leal 64(%esi),%esi - movdqu %xmm0,(%edi) - movdqu %xmm1,16(%edi) - movdqu %xmm2,32(%edi) - movdqu %xmm3,48(%edi) - leal 64(%edi),%edi - subl $64,%ecx - jnz .L013outer1x - jmp .L011done -.L014tail: - movdqa %xmm0,(%esp) - movdqa %xmm1,16(%esp) - movdqa %xmm2,32(%esp) - movdqa %xmm3,48(%esp) - xorl %eax,%eax - xorl %edx,%edx - xorl %ebp,%ebp -.L015tail_loop: - movb (%esp,%ebp,1),%al - movb (%esi,%ebp,1),%dl - leal 1(%ebp),%ebp - xorb %dl,%al - movb %al,-1(%edi,%ebp,1) - decl %ecx - jnz .L015tail_loop -.L011done: - movl 512(%esp),%esp - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.size 
ChaCha20_ssse3,.-.L_ChaCha20_ssse3_begin -.align 64 -.Lssse3_data: -.byte 2,3,0,1,6,7,4,5,10,11,8,9,14,15,12,13 -.byte 3,0,1,2,7,4,5,6,11,8,9,10,15,12,13,14 -.long 1634760805,857760878,2036477234,1797285236 -.long 0,1,2,3 -.long 4,4,4,4 -.long 1,0,0,0 -.long 4,0,0,0 -.long 0,-1,-1,-1 -.align 64 -.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54 -.byte 44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32 -.byte 60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111 -.byte 114,103,62,0 -#endif -.section .note.GNU-stack,"",@progbits diff --git a/third_party/boringssl/linux-x86/crypto/fipsmodule/aesni-x86.S b/third_party/boringssl/linux-x86/crypto/fipsmodule/aesni-x86.S deleted file mode 100644 index 00a6ec21..00000000 --- a/third_party/boringssl/linux-x86/crypto/fipsmodule/aesni-x86.S +++ /dev/null @@ -1,2513 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if defined(__i386__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text -#ifdef BORINGSSL_DISPATCH_TEST -#endif -.globl aes_hw_encrypt -.hidden aes_hw_encrypt -.type aes_hw_encrypt,@function -.align 16 -aes_hw_encrypt: -.L_aes_hw_encrypt_begin: -#ifdef BORINGSSL_DISPATCH_TEST - pushl %ebx - pushl %edx - call .L000pic -.L000pic: - popl %ebx - leal BORINGSSL_function_hit+1-.L000pic(%ebx),%ebx - movl $1,%edx - movb %dl,(%ebx) - popl %edx - popl %ebx -#endif - movl 4(%esp),%eax - movl 12(%esp),%edx - movups (%eax),%xmm2 - movl 240(%edx),%ecx - movl 8(%esp),%eax - movups (%edx),%xmm0 - movups 16(%edx),%xmm1 - leal 32(%edx),%edx - xorps %xmm0,%xmm2 -.L001enc1_loop_1: -.byte 102,15,56,220,209 - decl %ecx - movups (%edx),%xmm1 - leal 16(%edx),%edx - jnz .L001enc1_loop_1 -.byte 102,15,56,221,209 - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - movups %xmm2,(%eax) - pxor %xmm2,%xmm2 - ret -.size aes_hw_encrypt,.-.L_aes_hw_encrypt_begin -.globl aes_hw_decrypt -.hidden aes_hw_decrypt -.type aes_hw_decrypt,@function -.align 16 
-aes_hw_decrypt: -.L_aes_hw_decrypt_begin: - movl 4(%esp),%eax - movl 12(%esp),%edx - movups (%eax),%xmm2 - movl 240(%edx),%ecx - movl 8(%esp),%eax - movups (%edx),%xmm0 - movups 16(%edx),%xmm1 - leal 32(%edx),%edx - xorps %xmm0,%xmm2 -.L002dec1_loop_2: -.byte 102,15,56,222,209 - decl %ecx - movups (%edx),%xmm1 - leal 16(%edx),%edx - jnz .L002dec1_loop_2 -.byte 102,15,56,223,209 - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - movups %xmm2,(%eax) - pxor %xmm2,%xmm2 - ret -.size aes_hw_decrypt,.-.L_aes_hw_decrypt_begin -.hidden _aesni_encrypt2 -.type _aesni_encrypt2,@function -.align 16 -_aesni_encrypt2: - movups (%edx),%xmm0 - shll $4,%ecx - movups 16(%edx),%xmm1 - xorps %xmm0,%xmm2 - pxor %xmm0,%xmm3 - movups 32(%edx),%xmm0 - leal 32(%edx,%ecx,1),%edx - negl %ecx - addl $16,%ecx -.L003enc2_loop: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 - movups (%edx,%ecx,1),%xmm1 - addl $32,%ecx -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 - movups -16(%edx,%ecx,1),%xmm0 - jnz .L003enc2_loop -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,221,208 -.byte 102,15,56,221,216 - ret -.size _aesni_encrypt2,.-_aesni_encrypt2 -.hidden _aesni_decrypt2 -.type _aesni_decrypt2,@function -.align 16 -_aesni_decrypt2: - movups (%edx),%xmm0 - shll $4,%ecx - movups 16(%edx),%xmm1 - xorps %xmm0,%xmm2 - pxor %xmm0,%xmm3 - movups 32(%edx),%xmm0 - leal 32(%edx,%ecx,1),%edx - negl %ecx - addl $16,%ecx -.L004dec2_loop: -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 - movups (%edx,%ecx,1),%xmm1 - addl $32,%ecx -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 - movups -16(%edx,%ecx,1),%xmm0 - jnz .L004dec2_loop -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,223,208 -.byte 102,15,56,223,216 - ret -.size _aesni_decrypt2,.-_aesni_decrypt2 -.hidden _aesni_encrypt3 -.type _aesni_encrypt3,@function -.align 16 -_aesni_encrypt3: - movups (%edx),%xmm0 - shll $4,%ecx - movups 16(%edx),%xmm1 - xorps %xmm0,%xmm2 - pxor %xmm0,%xmm3 - pxor %xmm0,%xmm4 - movups 
32(%edx),%xmm0 - leal 32(%edx,%ecx,1),%edx - negl %ecx - addl $16,%ecx -.L005enc3_loop: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 - movups (%edx,%ecx,1),%xmm1 - addl $32,%ecx -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 - movups -16(%edx,%ecx,1),%xmm0 - jnz .L005enc3_loop -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,221,208 -.byte 102,15,56,221,216 -.byte 102,15,56,221,224 - ret -.size _aesni_encrypt3,.-_aesni_encrypt3 -.hidden _aesni_decrypt3 -.type _aesni_decrypt3,@function -.align 16 -_aesni_decrypt3: - movups (%edx),%xmm0 - shll $4,%ecx - movups 16(%edx),%xmm1 - xorps %xmm0,%xmm2 - pxor %xmm0,%xmm3 - pxor %xmm0,%xmm4 - movups 32(%edx),%xmm0 - leal 32(%edx,%ecx,1),%edx - negl %ecx - addl $16,%ecx -.L006dec3_loop: -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 - movups (%edx,%ecx,1),%xmm1 - addl $32,%ecx -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 - movups -16(%edx,%ecx,1),%xmm0 - jnz .L006dec3_loop -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,223,208 -.byte 102,15,56,223,216 -.byte 102,15,56,223,224 - ret -.size _aesni_decrypt3,.-_aesni_decrypt3 -.hidden _aesni_encrypt4 -.type _aesni_encrypt4,@function -.align 16 -_aesni_encrypt4: - movups (%edx),%xmm0 - movups 16(%edx),%xmm1 - shll $4,%ecx - xorps %xmm0,%xmm2 - pxor %xmm0,%xmm3 - pxor %xmm0,%xmm4 - pxor %xmm0,%xmm5 - movups 32(%edx),%xmm0 - leal 32(%edx,%ecx,1),%edx - negl %ecx -.byte 15,31,64,0 - addl $16,%ecx -.L007enc4_loop: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - movups (%edx,%ecx,1),%xmm1 - addl $32,%ecx -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 - movups -16(%edx,%ecx,1),%xmm0 - jnz .L007enc4_loop -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 
-.byte 102,15,56,220,233 -.byte 102,15,56,221,208 -.byte 102,15,56,221,216 -.byte 102,15,56,221,224 -.byte 102,15,56,221,232 - ret -.size _aesni_encrypt4,.-_aesni_encrypt4 -.hidden _aesni_decrypt4 -.type _aesni_decrypt4,@function -.align 16 -_aesni_decrypt4: - movups (%edx),%xmm0 - movups 16(%edx),%xmm1 - shll $4,%ecx - xorps %xmm0,%xmm2 - pxor %xmm0,%xmm3 - pxor %xmm0,%xmm4 - pxor %xmm0,%xmm5 - movups 32(%edx),%xmm0 - leal 32(%edx,%ecx,1),%edx - negl %ecx -.byte 15,31,64,0 - addl $16,%ecx -.L008dec4_loop: -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 - movups (%edx,%ecx,1),%xmm1 - addl $32,%ecx -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 - movups -16(%edx,%ecx,1),%xmm0 - jnz .L008dec4_loop -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,223,208 -.byte 102,15,56,223,216 -.byte 102,15,56,223,224 -.byte 102,15,56,223,232 - ret -.size _aesni_decrypt4,.-_aesni_decrypt4 -.hidden _aesni_encrypt6 -.type _aesni_encrypt6,@function -.align 16 -_aesni_encrypt6: - movups (%edx),%xmm0 - shll $4,%ecx - movups 16(%edx),%xmm1 - xorps %xmm0,%xmm2 - pxor %xmm0,%xmm3 - pxor %xmm0,%xmm4 -.byte 102,15,56,220,209 - pxor %xmm0,%xmm5 - pxor %xmm0,%xmm6 -.byte 102,15,56,220,217 - leal 32(%edx,%ecx,1),%edx - negl %ecx -.byte 102,15,56,220,225 - pxor %xmm0,%xmm7 - movups (%edx,%ecx,1),%xmm0 - addl $16,%ecx - jmp .L009_aesni_encrypt6_inner -.align 16 -.L010enc6_loop: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.L009_aesni_encrypt6_inner: -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 -.L_aesni_encrypt6_enter: - movups (%edx,%ecx,1),%xmm1 - addl $32,%ecx -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 -.byte 102,15,56,220,240 -.byte 102,15,56,220,248 - movups -16(%edx,%ecx,1),%xmm0 - jnz .L010enc6_loop 
-.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 -.byte 102,15,56,221,208 -.byte 102,15,56,221,216 -.byte 102,15,56,221,224 -.byte 102,15,56,221,232 -.byte 102,15,56,221,240 -.byte 102,15,56,221,248 - ret -.size _aesni_encrypt6,.-_aesni_encrypt6 -.hidden _aesni_decrypt6 -.type _aesni_decrypt6,@function -.align 16 -_aesni_decrypt6: - movups (%edx),%xmm0 - shll $4,%ecx - movups 16(%edx),%xmm1 - xorps %xmm0,%xmm2 - pxor %xmm0,%xmm3 - pxor %xmm0,%xmm4 -.byte 102,15,56,222,209 - pxor %xmm0,%xmm5 - pxor %xmm0,%xmm6 -.byte 102,15,56,222,217 - leal 32(%edx,%ecx,1),%edx - negl %ecx -.byte 102,15,56,222,225 - pxor %xmm0,%xmm7 - movups (%edx,%ecx,1),%xmm0 - addl $16,%ecx - jmp .L011_aesni_decrypt6_inner -.align 16 -.L012dec6_loop: -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.L011_aesni_decrypt6_inner: -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 -.L_aesni_decrypt6_enter: - movups (%edx,%ecx,1),%xmm1 - addl $32,%ecx -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 - movups -16(%edx,%ecx,1),%xmm0 - jnz .L012dec6_loop -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 -.byte 102,15,56,223,208 -.byte 102,15,56,223,216 -.byte 102,15,56,223,224 -.byte 102,15,56,223,232 -.byte 102,15,56,223,240 -.byte 102,15,56,223,248 - ret -.size _aesni_decrypt6,.-_aesni_decrypt6 -.globl aes_hw_ecb_encrypt -.hidden aes_hw_ecb_encrypt -.type aes_hw_ecb_encrypt,@function -.align 16 -aes_hw_ecb_encrypt: -.L_aes_hw_ecb_encrypt_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - movl 20(%esp),%esi - movl 24(%esp),%edi - movl 28(%esp),%eax - movl 32(%esp),%edx - movl 36(%esp),%ebx - andl $-16,%eax - jz .L013ecb_ret - movl 240(%edx),%ecx - 
testl %ebx,%ebx - jz .L014ecb_decrypt - movl %edx,%ebp - movl %ecx,%ebx - cmpl $96,%eax - jb .L015ecb_enc_tail - movdqu (%esi),%xmm2 - movdqu 16(%esi),%xmm3 - movdqu 32(%esi),%xmm4 - movdqu 48(%esi),%xmm5 - movdqu 64(%esi),%xmm6 - movdqu 80(%esi),%xmm7 - leal 96(%esi),%esi - subl $96,%eax - jmp .L016ecb_enc_loop6_enter -.align 16 -.L017ecb_enc_loop6: - movups %xmm2,(%edi) - movdqu (%esi),%xmm2 - movups %xmm3,16(%edi) - movdqu 16(%esi),%xmm3 - movups %xmm4,32(%edi) - movdqu 32(%esi),%xmm4 - movups %xmm5,48(%edi) - movdqu 48(%esi),%xmm5 - movups %xmm6,64(%edi) - movdqu 64(%esi),%xmm6 - movups %xmm7,80(%edi) - leal 96(%edi),%edi - movdqu 80(%esi),%xmm7 - leal 96(%esi),%esi -.L016ecb_enc_loop6_enter: - call _aesni_encrypt6 - movl %ebp,%edx - movl %ebx,%ecx - subl $96,%eax - jnc .L017ecb_enc_loop6 - movups %xmm2,(%edi) - movups %xmm3,16(%edi) - movups %xmm4,32(%edi) - movups %xmm5,48(%edi) - movups %xmm6,64(%edi) - movups %xmm7,80(%edi) - leal 96(%edi),%edi - addl $96,%eax - jz .L013ecb_ret -.L015ecb_enc_tail: - movups (%esi),%xmm2 - cmpl $32,%eax - jb .L018ecb_enc_one - movups 16(%esi),%xmm3 - je .L019ecb_enc_two - movups 32(%esi),%xmm4 - cmpl $64,%eax - jb .L020ecb_enc_three - movups 48(%esi),%xmm5 - je .L021ecb_enc_four - movups 64(%esi),%xmm6 - xorps %xmm7,%xmm7 - call _aesni_encrypt6 - movups %xmm2,(%edi) - movups %xmm3,16(%edi) - movups %xmm4,32(%edi) - movups %xmm5,48(%edi) - movups %xmm6,64(%edi) - jmp .L013ecb_ret -.align 16 -.L018ecb_enc_one: - movups (%edx),%xmm0 - movups 16(%edx),%xmm1 - leal 32(%edx),%edx - xorps %xmm0,%xmm2 -.L022enc1_loop_3: -.byte 102,15,56,220,209 - decl %ecx - movups (%edx),%xmm1 - leal 16(%edx),%edx - jnz .L022enc1_loop_3 -.byte 102,15,56,221,209 - movups %xmm2,(%edi) - jmp .L013ecb_ret -.align 16 -.L019ecb_enc_two: - call _aesni_encrypt2 - movups %xmm2,(%edi) - movups %xmm3,16(%edi) - jmp .L013ecb_ret -.align 16 -.L020ecb_enc_three: - call _aesni_encrypt3 - movups %xmm2,(%edi) - movups %xmm3,16(%edi) - movups %xmm4,32(%edi) - jmp 
.L013ecb_ret -.align 16 -.L021ecb_enc_four: - call _aesni_encrypt4 - movups %xmm2,(%edi) - movups %xmm3,16(%edi) - movups %xmm4,32(%edi) - movups %xmm5,48(%edi) - jmp .L013ecb_ret -.align 16 -.L014ecb_decrypt: - movl %edx,%ebp - movl %ecx,%ebx - cmpl $96,%eax - jb .L023ecb_dec_tail - movdqu (%esi),%xmm2 - movdqu 16(%esi),%xmm3 - movdqu 32(%esi),%xmm4 - movdqu 48(%esi),%xmm5 - movdqu 64(%esi),%xmm6 - movdqu 80(%esi),%xmm7 - leal 96(%esi),%esi - subl $96,%eax - jmp .L024ecb_dec_loop6_enter -.align 16 -.L025ecb_dec_loop6: - movups %xmm2,(%edi) - movdqu (%esi),%xmm2 - movups %xmm3,16(%edi) - movdqu 16(%esi),%xmm3 - movups %xmm4,32(%edi) - movdqu 32(%esi),%xmm4 - movups %xmm5,48(%edi) - movdqu 48(%esi),%xmm5 - movups %xmm6,64(%edi) - movdqu 64(%esi),%xmm6 - movups %xmm7,80(%edi) - leal 96(%edi),%edi - movdqu 80(%esi),%xmm7 - leal 96(%esi),%esi -.L024ecb_dec_loop6_enter: - call _aesni_decrypt6 - movl %ebp,%edx - movl %ebx,%ecx - subl $96,%eax - jnc .L025ecb_dec_loop6 - movups %xmm2,(%edi) - movups %xmm3,16(%edi) - movups %xmm4,32(%edi) - movups %xmm5,48(%edi) - movups %xmm6,64(%edi) - movups %xmm7,80(%edi) - leal 96(%edi),%edi - addl $96,%eax - jz .L013ecb_ret -.L023ecb_dec_tail: - movups (%esi),%xmm2 - cmpl $32,%eax - jb .L026ecb_dec_one - movups 16(%esi),%xmm3 - je .L027ecb_dec_two - movups 32(%esi),%xmm4 - cmpl $64,%eax - jb .L028ecb_dec_three - movups 48(%esi),%xmm5 - je .L029ecb_dec_four - movups 64(%esi),%xmm6 - xorps %xmm7,%xmm7 - call _aesni_decrypt6 - movups %xmm2,(%edi) - movups %xmm3,16(%edi) - movups %xmm4,32(%edi) - movups %xmm5,48(%edi) - movups %xmm6,64(%edi) - jmp .L013ecb_ret -.align 16 -.L026ecb_dec_one: - movups (%edx),%xmm0 - movups 16(%edx),%xmm1 - leal 32(%edx),%edx - xorps %xmm0,%xmm2 -.L030dec1_loop_4: -.byte 102,15,56,222,209 - decl %ecx - movups (%edx),%xmm1 - leal 16(%edx),%edx - jnz .L030dec1_loop_4 -.byte 102,15,56,223,209 - movups %xmm2,(%edi) - jmp .L013ecb_ret -.align 16 -.L027ecb_dec_two: - call _aesni_decrypt2 - movups %xmm2,(%edi) - 
movups %xmm3,16(%edi) - jmp .L013ecb_ret -.align 16 -.L028ecb_dec_three: - call _aesni_decrypt3 - movups %xmm2,(%edi) - movups %xmm3,16(%edi) - movups %xmm4,32(%edi) - jmp .L013ecb_ret -.align 16 -.L029ecb_dec_four: - call _aesni_decrypt4 - movups %xmm2,(%edi) - movups %xmm3,16(%edi) - movups %xmm4,32(%edi) - movups %xmm5,48(%edi) -.L013ecb_ret: - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.size aes_hw_ecb_encrypt,.-.L_aes_hw_ecb_encrypt_begin -.globl aes_hw_ccm64_encrypt_blocks -.hidden aes_hw_ccm64_encrypt_blocks -.type aes_hw_ccm64_encrypt_blocks,@function -.align 16 -aes_hw_ccm64_encrypt_blocks: -.L_aes_hw_ccm64_encrypt_blocks_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - movl 20(%esp),%esi - movl 24(%esp),%edi - movl 28(%esp),%eax - movl 32(%esp),%edx - movl 36(%esp),%ebx - movl 40(%esp),%ecx - movl %esp,%ebp - subl $60,%esp - andl $-16,%esp - movl %ebp,48(%esp) - movdqu (%ebx),%xmm7 - movdqu (%ecx),%xmm3 - movl 240(%edx),%ecx - movl $202182159,(%esp) - movl $134810123,4(%esp) - movl $67438087,8(%esp) - movl $66051,12(%esp) - movl $1,%ebx - xorl %ebp,%ebp - movl %ebx,16(%esp) - movl %ebp,20(%esp) - movl %ebp,24(%esp) - movl %ebp,28(%esp) - shll $4,%ecx - movl $16,%ebx - leal (%edx),%ebp - movdqa (%esp),%xmm5 - movdqa %xmm7,%xmm2 - leal 32(%edx,%ecx,1),%edx - subl %ecx,%ebx -.byte 102,15,56,0,253 -.L031ccm64_enc_outer: - movups (%ebp),%xmm0 - movl %ebx,%ecx - movups (%esi),%xmm6 - xorps %xmm0,%xmm2 - movups 16(%ebp),%xmm1 - xorps %xmm6,%xmm0 - xorps %xmm0,%xmm3 - movups 32(%ebp),%xmm0 -.L032ccm64_enc2_loop: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 - movups (%edx,%ecx,1),%xmm1 - addl $32,%ecx -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 - movups -16(%edx,%ecx,1),%xmm0 - jnz .L032ccm64_enc2_loop -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 - paddq 16(%esp),%xmm7 - decl 
%eax -.byte 102,15,56,221,208 -.byte 102,15,56,221,216 - leal 16(%esi),%esi - xorps %xmm2,%xmm6 - movdqa %xmm7,%xmm2 - movups %xmm6,(%edi) -.byte 102,15,56,0,213 - leal 16(%edi),%edi - jnz .L031ccm64_enc_outer - movl 48(%esp),%esp - movl 40(%esp),%edi - movups %xmm3,(%edi) - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.size aes_hw_ccm64_encrypt_blocks,.-.L_aes_hw_ccm64_encrypt_blocks_begin -.globl aes_hw_ccm64_decrypt_blocks -.hidden aes_hw_ccm64_decrypt_blocks -.type aes_hw_ccm64_decrypt_blocks,@function -.align 16 -aes_hw_ccm64_decrypt_blocks: -.L_aes_hw_ccm64_decrypt_blocks_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - movl 20(%esp),%esi - movl 24(%esp),%edi - movl 28(%esp),%eax - movl 32(%esp),%edx - movl 36(%esp),%ebx - movl 40(%esp),%ecx - movl %esp,%ebp - subl $60,%esp - andl $-16,%esp - movl %ebp,48(%esp) - movdqu (%ebx),%xmm7 - movdqu (%ecx),%xmm3 - movl 240(%edx),%ecx - movl $202182159,(%esp) - movl $134810123,4(%esp) - movl $67438087,8(%esp) - movl $66051,12(%esp) - movl $1,%ebx - xorl %ebp,%ebp - movl %ebx,16(%esp) - movl %ebp,20(%esp) - movl %ebp,24(%esp) - movl %ebp,28(%esp) - movdqa (%esp),%xmm5 - movdqa %xmm7,%xmm2 - movl %edx,%ebp - movl %ecx,%ebx -.byte 102,15,56,0,253 - movups (%edx),%xmm0 - movups 16(%edx),%xmm1 - leal 32(%edx),%edx - xorps %xmm0,%xmm2 -.L033enc1_loop_5: -.byte 102,15,56,220,209 - decl %ecx - movups (%edx),%xmm1 - leal 16(%edx),%edx - jnz .L033enc1_loop_5 -.byte 102,15,56,221,209 - shll $4,%ebx - movl $16,%ecx - movups (%esi),%xmm6 - paddq 16(%esp),%xmm7 - leal 16(%esi),%esi - subl %ebx,%ecx - leal 32(%ebp,%ebx,1),%edx - movl %ecx,%ebx - jmp .L034ccm64_dec_outer -.align 16 -.L034ccm64_dec_outer: - xorps %xmm2,%xmm6 - movdqa %xmm7,%xmm2 - movups %xmm6,(%edi) - leal 16(%edi),%edi -.byte 102,15,56,0,213 - subl $1,%eax - jz .L035ccm64_dec_break - movups 
(%ebp),%xmm0 - movl %ebx,%ecx - movups 16(%ebp),%xmm1 - xorps %xmm0,%xmm6 - xorps %xmm0,%xmm2 - xorps %xmm6,%xmm3 - movups 32(%ebp),%xmm0 -.L036ccm64_dec2_loop: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 - movups (%edx,%ecx,1),%xmm1 - addl $32,%ecx -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 - movups -16(%edx,%ecx,1),%xmm0 - jnz .L036ccm64_dec2_loop - movups (%esi),%xmm6 - paddq 16(%esp),%xmm7 -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,221,208 -.byte 102,15,56,221,216 - leal 16(%esi),%esi - jmp .L034ccm64_dec_outer -.align 16 -.L035ccm64_dec_break: - movl 240(%ebp),%ecx - movl %ebp,%edx - movups (%edx),%xmm0 - movups 16(%edx),%xmm1 - xorps %xmm0,%xmm6 - leal 32(%edx),%edx - xorps %xmm6,%xmm3 -.L037enc1_loop_6: -.byte 102,15,56,220,217 - decl %ecx - movups (%edx),%xmm1 - leal 16(%edx),%edx - jnz .L037enc1_loop_6 -.byte 102,15,56,221,217 - movl 48(%esp),%esp - movl 40(%esp),%edi - movups %xmm3,(%edi) - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.size aes_hw_ccm64_decrypt_blocks,.-.L_aes_hw_ccm64_decrypt_blocks_begin -.globl aes_hw_ctr32_encrypt_blocks -.hidden aes_hw_ctr32_encrypt_blocks -.type aes_hw_ctr32_encrypt_blocks,@function -.align 16 -aes_hw_ctr32_encrypt_blocks: -.L_aes_hw_ctr32_encrypt_blocks_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi -#ifdef BORINGSSL_DISPATCH_TEST - pushl %ebx - pushl %edx - call .L038pic -.L038pic: - popl %ebx - leal BORINGSSL_function_hit+0-.L038pic(%ebx),%ebx - movl $1,%edx - movb %dl,(%ebx) - popl %edx - popl %ebx -#endif - movl 20(%esp),%esi - movl 24(%esp),%edi - movl 28(%esp),%eax - movl 32(%esp),%edx - movl 36(%esp),%ebx - movl %esp,%ebp - subl $88,%esp - andl $-16,%esp - movl %ebp,80(%esp) - cmpl $1,%eax - je .L039ctr32_one_shortcut - movdqu (%ebx),%xmm7 - movl $202182159,(%esp) - movl $134810123,4(%esp) - movl 
$67438087,8(%esp) - movl $66051,12(%esp) - movl $6,%ecx - xorl %ebp,%ebp - movl %ecx,16(%esp) - movl %ecx,20(%esp) - movl %ecx,24(%esp) - movl %ebp,28(%esp) -.byte 102,15,58,22,251,3 -.byte 102,15,58,34,253,3 - movl 240(%edx),%ecx - bswap %ebx - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - movdqa (%esp),%xmm2 -.byte 102,15,58,34,195,0 - leal 3(%ebx),%ebp -.byte 102,15,58,34,205,0 - incl %ebx -.byte 102,15,58,34,195,1 - incl %ebp -.byte 102,15,58,34,205,1 - incl %ebx -.byte 102,15,58,34,195,2 - incl %ebp -.byte 102,15,58,34,205,2 - movdqa %xmm0,48(%esp) -.byte 102,15,56,0,194 - movdqu (%edx),%xmm6 - movdqa %xmm1,64(%esp) -.byte 102,15,56,0,202 - pshufd $192,%xmm0,%xmm2 - pshufd $128,%xmm0,%xmm3 - cmpl $6,%eax - jb .L040ctr32_tail - pxor %xmm6,%xmm7 - shll $4,%ecx - movl $16,%ebx - movdqa %xmm7,32(%esp) - movl %edx,%ebp - subl %ecx,%ebx - leal 32(%edx,%ecx,1),%edx - subl $6,%eax - jmp .L041ctr32_loop6 -.align 16 -.L041ctr32_loop6: - pshufd $64,%xmm0,%xmm4 - movdqa 32(%esp),%xmm0 - pshufd $192,%xmm1,%xmm5 - pxor %xmm0,%xmm2 - pshufd $128,%xmm1,%xmm6 - pxor %xmm0,%xmm3 - pshufd $64,%xmm1,%xmm7 - movups 16(%ebp),%xmm1 - pxor %xmm0,%xmm4 - pxor %xmm0,%xmm5 -.byte 102,15,56,220,209 - pxor %xmm0,%xmm6 - pxor %xmm0,%xmm7 -.byte 102,15,56,220,217 - movups 32(%ebp),%xmm0 - movl %ebx,%ecx -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 - call .L_aesni_encrypt6_enter - movups (%esi),%xmm1 - movups 16(%esi),%xmm0 - xorps %xmm1,%xmm2 - movups 32(%esi),%xmm1 - xorps %xmm0,%xmm3 - movups %xmm2,(%edi) - movdqa 16(%esp),%xmm0 - xorps %xmm1,%xmm4 - movdqa 64(%esp),%xmm1 - movups %xmm3,16(%edi) - movups %xmm4,32(%edi) - paddd %xmm0,%xmm1 - paddd 48(%esp),%xmm0 - movdqa (%esp),%xmm2 - movups 48(%esi),%xmm3 - movups 64(%esi),%xmm4 - xorps %xmm3,%xmm5 - movups 80(%esi),%xmm3 - leal 96(%esi),%esi - movdqa %xmm0,48(%esp) -.byte 102,15,56,0,194 - xorps %xmm4,%xmm6 - movups %xmm5,48(%edi) - xorps %xmm3,%xmm7 - movdqa %xmm1,64(%esp) -.byte 
102,15,56,0,202 - movups %xmm6,64(%edi) - pshufd $192,%xmm0,%xmm2 - movups %xmm7,80(%edi) - leal 96(%edi),%edi - pshufd $128,%xmm0,%xmm3 - subl $6,%eax - jnc .L041ctr32_loop6 - addl $6,%eax - jz .L042ctr32_ret - movdqu (%ebp),%xmm7 - movl %ebp,%edx - pxor 32(%esp),%xmm7 - movl 240(%ebp),%ecx -.L040ctr32_tail: - por %xmm7,%xmm2 - cmpl $2,%eax - jb .L043ctr32_one - pshufd $64,%xmm0,%xmm4 - por %xmm7,%xmm3 - je .L044ctr32_two - pshufd $192,%xmm1,%xmm5 - por %xmm7,%xmm4 - cmpl $4,%eax - jb .L045ctr32_three - pshufd $128,%xmm1,%xmm6 - por %xmm7,%xmm5 - je .L046ctr32_four - por %xmm7,%xmm6 - call _aesni_encrypt6 - movups (%esi),%xmm1 - movups 16(%esi),%xmm0 - xorps %xmm1,%xmm2 - movups 32(%esi),%xmm1 - xorps %xmm0,%xmm3 - movups 48(%esi),%xmm0 - xorps %xmm1,%xmm4 - movups 64(%esi),%xmm1 - xorps %xmm0,%xmm5 - movups %xmm2,(%edi) - xorps %xmm1,%xmm6 - movups %xmm3,16(%edi) - movups %xmm4,32(%edi) - movups %xmm5,48(%edi) - movups %xmm6,64(%edi) - jmp .L042ctr32_ret -.align 16 -.L039ctr32_one_shortcut: - movups (%ebx),%xmm2 - movl 240(%edx),%ecx -.L043ctr32_one: - movups (%edx),%xmm0 - movups 16(%edx),%xmm1 - leal 32(%edx),%edx - xorps %xmm0,%xmm2 -.L047enc1_loop_7: -.byte 102,15,56,220,209 - decl %ecx - movups (%edx),%xmm1 - leal 16(%edx),%edx - jnz .L047enc1_loop_7 -.byte 102,15,56,221,209 - movups (%esi),%xmm6 - xorps %xmm2,%xmm6 - movups %xmm6,(%edi) - jmp .L042ctr32_ret -.align 16 -.L044ctr32_two: - call _aesni_encrypt2 - movups (%esi),%xmm5 - movups 16(%esi),%xmm6 - xorps %xmm5,%xmm2 - xorps %xmm6,%xmm3 - movups %xmm2,(%edi) - movups %xmm3,16(%edi) - jmp .L042ctr32_ret -.align 16 -.L045ctr32_three: - call _aesni_encrypt3 - movups (%esi),%xmm5 - movups 16(%esi),%xmm6 - xorps %xmm5,%xmm2 - movups 32(%esi),%xmm7 - xorps %xmm6,%xmm3 - movups %xmm2,(%edi) - xorps %xmm7,%xmm4 - movups %xmm3,16(%edi) - movups %xmm4,32(%edi) - jmp .L042ctr32_ret -.align 16 -.L046ctr32_four: - call _aesni_encrypt4 - movups (%esi),%xmm6 - movups 16(%esi),%xmm7 - movups 32(%esi),%xmm1 - xorps 
%xmm6,%xmm2 - movups 48(%esi),%xmm0 - xorps %xmm7,%xmm3 - movups %xmm2,(%edi) - xorps %xmm1,%xmm4 - movups %xmm3,16(%edi) - xorps %xmm0,%xmm5 - movups %xmm4,32(%edi) - movups %xmm5,48(%edi) -.L042ctr32_ret: - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - movdqa %xmm0,32(%esp) - pxor %xmm5,%xmm5 - movdqa %xmm0,48(%esp) - pxor %xmm6,%xmm6 - movdqa %xmm0,64(%esp) - pxor %xmm7,%xmm7 - movl 80(%esp),%esp - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.size aes_hw_ctr32_encrypt_blocks,.-.L_aes_hw_ctr32_encrypt_blocks_begin -.globl aes_hw_xts_encrypt -.hidden aes_hw_xts_encrypt -.type aes_hw_xts_encrypt,@function -.align 16 -aes_hw_xts_encrypt: -.L_aes_hw_xts_encrypt_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - movl 36(%esp),%edx - movl 40(%esp),%esi - movl 240(%edx),%ecx - movups (%esi),%xmm2 - movups (%edx),%xmm0 - movups 16(%edx),%xmm1 - leal 32(%edx),%edx - xorps %xmm0,%xmm2 -.L048enc1_loop_8: -.byte 102,15,56,220,209 - decl %ecx - movups (%edx),%xmm1 - leal 16(%edx),%edx - jnz .L048enc1_loop_8 -.byte 102,15,56,221,209 - movl 20(%esp),%esi - movl 24(%esp),%edi - movl 28(%esp),%eax - movl 32(%esp),%edx - movl %esp,%ebp - subl $120,%esp - movl 240(%edx),%ecx - andl $-16,%esp - movl $135,96(%esp) - movl $0,100(%esp) - movl $1,104(%esp) - movl $0,108(%esp) - movl %eax,112(%esp) - movl %ebp,116(%esp) - movdqa %xmm2,%xmm1 - pxor %xmm0,%xmm0 - movdqa 96(%esp),%xmm3 - pcmpgtd %xmm1,%xmm0 - andl $-16,%eax - movl %edx,%ebp - movl %ecx,%ebx - subl $96,%eax - jc .L049xts_enc_short - shll $4,%ecx - movl $16,%ebx - subl %ecx,%ebx - leal 32(%edx,%ecx,1),%edx - jmp .L050xts_enc_loop6 -.align 16 -.L050xts_enc_loop6: - pshufd $19,%xmm0,%xmm2 - pxor %xmm0,%xmm0 - movdqa %xmm1,(%esp) - paddq %xmm1,%xmm1 - pand %xmm3,%xmm2 - pcmpgtd %xmm1,%xmm0 - pxor %xmm2,%xmm1 - pshufd $19,%xmm0,%xmm2 - pxor %xmm0,%xmm0 - movdqa %xmm1,16(%esp) - paddq %xmm1,%xmm1 - pand %xmm3,%xmm2 - pcmpgtd %xmm1,%xmm0 - pxor %xmm2,%xmm1 - pshufd 
$19,%xmm0,%xmm2 - pxor %xmm0,%xmm0 - movdqa %xmm1,32(%esp) - paddq %xmm1,%xmm1 - pand %xmm3,%xmm2 - pcmpgtd %xmm1,%xmm0 - pxor %xmm2,%xmm1 - pshufd $19,%xmm0,%xmm2 - pxor %xmm0,%xmm0 - movdqa %xmm1,48(%esp) - paddq %xmm1,%xmm1 - pand %xmm3,%xmm2 - pcmpgtd %xmm1,%xmm0 - pxor %xmm2,%xmm1 - pshufd $19,%xmm0,%xmm7 - movdqa %xmm1,64(%esp) - paddq %xmm1,%xmm1 - movups (%ebp),%xmm0 - pand %xmm3,%xmm7 - movups (%esi),%xmm2 - pxor %xmm1,%xmm7 - movl %ebx,%ecx - movdqu 16(%esi),%xmm3 - xorps %xmm0,%xmm2 - movdqu 32(%esi),%xmm4 - pxor %xmm0,%xmm3 - movdqu 48(%esi),%xmm5 - pxor %xmm0,%xmm4 - movdqu 64(%esi),%xmm6 - pxor %xmm0,%xmm5 - movdqu 80(%esi),%xmm1 - pxor %xmm0,%xmm6 - leal 96(%esi),%esi - pxor (%esp),%xmm2 - movdqa %xmm7,80(%esp) - pxor %xmm1,%xmm7 - movups 16(%ebp),%xmm1 - pxor 16(%esp),%xmm3 - pxor 32(%esp),%xmm4 -.byte 102,15,56,220,209 - pxor 48(%esp),%xmm5 - pxor 64(%esp),%xmm6 -.byte 102,15,56,220,217 - pxor %xmm0,%xmm7 - movups 32(%ebp),%xmm0 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 - call .L_aesni_encrypt6_enter - movdqa 80(%esp),%xmm1 - pxor %xmm0,%xmm0 - xorps (%esp),%xmm2 - pcmpgtd %xmm1,%xmm0 - xorps 16(%esp),%xmm3 - movups %xmm2,(%edi) - xorps 32(%esp),%xmm4 - movups %xmm3,16(%edi) - xorps 48(%esp),%xmm5 - movups %xmm4,32(%edi) - xorps 64(%esp),%xmm6 - movups %xmm5,48(%edi) - xorps %xmm1,%xmm7 - movups %xmm6,64(%edi) - pshufd $19,%xmm0,%xmm2 - movups %xmm7,80(%edi) - leal 96(%edi),%edi - movdqa 96(%esp),%xmm3 - pxor %xmm0,%xmm0 - paddq %xmm1,%xmm1 - pand %xmm3,%xmm2 - pcmpgtd %xmm1,%xmm0 - pxor %xmm2,%xmm1 - subl $96,%eax - jnc .L050xts_enc_loop6 - movl 240(%ebp),%ecx - movl %ebp,%edx - movl %ecx,%ebx -.L049xts_enc_short: - addl $96,%eax - jz .L051xts_enc_done6x - movdqa %xmm1,%xmm5 - cmpl $32,%eax - jb .L052xts_enc_one - pshufd $19,%xmm0,%xmm2 - pxor %xmm0,%xmm0 - paddq %xmm1,%xmm1 - pand %xmm3,%xmm2 - pcmpgtd %xmm1,%xmm0 - pxor %xmm2,%xmm1 - je .L053xts_enc_two - pshufd $19,%xmm0,%xmm2 - pxor 
%xmm0,%xmm0 - movdqa %xmm1,%xmm6 - paddq %xmm1,%xmm1 - pand %xmm3,%xmm2 - pcmpgtd %xmm1,%xmm0 - pxor %xmm2,%xmm1 - cmpl $64,%eax - jb .L054xts_enc_three - pshufd $19,%xmm0,%xmm2 - pxor %xmm0,%xmm0 - movdqa %xmm1,%xmm7 - paddq %xmm1,%xmm1 - pand %xmm3,%xmm2 - pcmpgtd %xmm1,%xmm0 - pxor %xmm2,%xmm1 - movdqa %xmm5,(%esp) - movdqa %xmm6,16(%esp) - je .L055xts_enc_four - movdqa %xmm7,32(%esp) - pshufd $19,%xmm0,%xmm7 - movdqa %xmm1,48(%esp) - paddq %xmm1,%xmm1 - pand %xmm3,%xmm7 - pxor %xmm1,%xmm7 - movdqu (%esi),%xmm2 - movdqu 16(%esi),%xmm3 - movdqu 32(%esi),%xmm4 - pxor (%esp),%xmm2 - movdqu 48(%esi),%xmm5 - pxor 16(%esp),%xmm3 - movdqu 64(%esi),%xmm6 - pxor 32(%esp),%xmm4 - leal 80(%esi),%esi - pxor 48(%esp),%xmm5 - movdqa %xmm7,64(%esp) - pxor %xmm7,%xmm6 - call _aesni_encrypt6 - movaps 64(%esp),%xmm1 - xorps (%esp),%xmm2 - xorps 16(%esp),%xmm3 - xorps 32(%esp),%xmm4 - movups %xmm2,(%edi) - xorps 48(%esp),%xmm5 - movups %xmm3,16(%edi) - xorps %xmm1,%xmm6 - movups %xmm4,32(%edi) - movups %xmm5,48(%edi) - movups %xmm6,64(%edi) - leal 80(%edi),%edi - jmp .L056xts_enc_done -.align 16 -.L052xts_enc_one: - movups (%esi),%xmm2 - leal 16(%esi),%esi - xorps %xmm5,%xmm2 - movups (%edx),%xmm0 - movups 16(%edx),%xmm1 - leal 32(%edx),%edx - xorps %xmm0,%xmm2 -.L057enc1_loop_9: -.byte 102,15,56,220,209 - decl %ecx - movups (%edx),%xmm1 - leal 16(%edx),%edx - jnz .L057enc1_loop_9 -.byte 102,15,56,221,209 - xorps %xmm5,%xmm2 - movups %xmm2,(%edi) - leal 16(%edi),%edi - movdqa %xmm5,%xmm1 - jmp .L056xts_enc_done -.align 16 -.L053xts_enc_two: - movaps %xmm1,%xmm6 - movups (%esi),%xmm2 - movups 16(%esi),%xmm3 - leal 32(%esi),%esi - xorps %xmm5,%xmm2 - xorps %xmm6,%xmm3 - call _aesni_encrypt2 - xorps %xmm5,%xmm2 - xorps %xmm6,%xmm3 - movups %xmm2,(%edi) - movups %xmm3,16(%edi) - leal 32(%edi),%edi - movdqa %xmm6,%xmm1 - jmp .L056xts_enc_done -.align 16 -.L054xts_enc_three: - movaps %xmm1,%xmm7 - movups (%esi),%xmm2 - movups 16(%esi),%xmm3 - movups 32(%esi),%xmm4 - leal 48(%esi),%esi - 
xorps %xmm5,%xmm2 - xorps %xmm6,%xmm3 - xorps %xmm7,%xmm4 - call _aesni_encrypt3 - xorps %xmm5,%xmm2 - xorps %xmm6,%xmm3 - xorps %xmm7,%xmm4 - movups %xmm2,(%edi) - movups %xmm3,16(%edi) - movups %xmm4,32(%edi) - leal 48(%edi),%edi - movdqa %xmm7,%xmm1 - jmp .L056xts_enc_done -.align 16 -.L055xts_enc_four: - movaps %xmm1,%xmm6 - movups (%esi),%xmm2 - movups 16(%esi),%xmm3 - movups 32(%esi),%xmm4 - xorps (%esp),%xmm2 - movups 48(%esi),%xmm5 - leal 64(%esi),%esi - xorps 16(%esp),%xmm3 - xorps %xmm7,%xmm4 - xorps %xmm6,%xmm5 - call _aesni_encrypt4 - xorps (%esp),%xmm2 - xorps 16(%esp),%xmm3 - xorps %xmm7,%xmm4 - movups %xmm2,(%edi) - xorps %xmm6,%xmm5 - movups %xmm3,16(%edi) - movups %xmm4,32(%edi) - movups %xmm5,48(%edi) - leal 64(%edi),%edi - movdqa %xmm6,%xmm1 - jmp .L056xts_enc_done -.align 16 -.L051xts_enc_done6x: - movl 112(%esp),%eax - andl $15,%eax - jz .L058xts_enc_ret - movdqa %xmm1,%xmm5 - movl %eax,112(%esp) - jmp .L059xts_enc_steal -.align 16 -.L056xts_enc_done: - movl 112(%esp),%eax - pxor %xmm0,%xmm0 - andl $15,%eax - jz .L058xts_enc_ret - pcmpgtd %xmm1,%xmm0 - movl %eax,112(%esp) - pshufd $19,%xmm0,%xmm5 - paddq %xmm1,%xmm1 - pand 96(%esp),%xmm5 - pxor %xmm1,%xmm5 -.L059xts_enc_steal: - movzbl (%esi),%ecx - movzbl -16(%edi),%edx - leal 1(%esi),%esi - movb %cl,-16(%edi) - movb %dl,(%edi) - leal 1(%edi),%edi - subl $1,%eax - jnz .L059xts_enc_steal - subl 112(%esp),%edi - movl %ebp,%edx - movl %ebx,%ecx - movups -16(%edi),%xmm2 - xorps %xmm5,%xmm2 - movups (%edx),%xmm0 - movups 16(%edx),%xmm1 - leal 32(%edx),%edx - xorps %xmm0,%xmm2 -.L060enc1_loop_10: -.byte 102,15,56,220,209 - decl %ecx - movups (%edx),%xmm1 - leal 16(%edx),%edx - jnz .L060enc1_loop_10 -.byte 102,15,56,221,209 - xorps %xmm5,%xmm2 - movups %xmm2,-16(%edi) -.L058xts_enc_ret: - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - movdqa %xmm0,(%esp) - pxor %xmm3,%xmm3 - movdqa %xmm0,16(%esp) - pxor %xmm4,%xmm4 - movdqa %xmm0,32(%esp) - pxor %xmm5,%xmm5 - movdqa %xmm0,48(%esp) - pxor 
%xmm6,%xmm6 - movdqa %xmm0,64(%esp) - pxor %xmm7,%xmm7 - movdqa %xmm0,80(%esp) - movl 116(%esp),%esp - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.size aes_hw_xts_encrypt,.-.L_aes_hw_xts_encrypt_begin -.globl aes_hw_xts_decrypt -.hidden aes_hw_xts_decrypt -.type aes_hw_xts_decrypt,@function -.align 16 -aes_hw_xts_decrypt: -.L_aes_hw_xts_decrypt_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - movl 36(%esp),%edx - movl 40(%esp),%esi - movl 240(%edx),%ecx - movups (%esi),%xmm2 - movups (%edx),%xmm0 - movups 16(%edx),%xmm1 - leal 32(%edx),%edx - xorps %xmm0,%xmm2 -.L061enc1_loop_11: -.byte 102,15,56,220,209 - decl %ecx - movups (%edx),%xmm1 - leal 16(%edx),%edx - jnz .L061enc1_loop_11 -.byte 102,15,56,221,209 - movl 20(%esp),%esi - movl 24(%esp),%edi - movl 28(%esp),%eax - movl 32(%esp),%edx - movl %esp,%ebp - subl $120,%esp - andl $-16,%esp - xorl %ebx,%ebx - testl $15,%eax - setnz %bl - shll $4,%ebx - subl %ebx,%eax - movl $135,96(%esp) - movl $0,100(%esp) - movl $1,104(%esp) - movl $0,108(%esp) - movl %eax,112(%esp) - movl %ebp,116(%esp) - movl 240(%edx),%ecx - movl %edx,%ebp - movl %ecx,%ebx - movdqa %xmm2,%xmm1 - pxor %xmm0,%xmm0 - movdqa 96(%esp),%xmm3 - pcmpgtd %xmm1,%xmm0 - andl $-16,%eax - subl $96,%eax - jc .L062xts_dec_short - shll $4,%ecx - movl $16,%ebx - subl %ecx,%ebx - leal 32(%edx,%ecx,1),%edx - jmp .L063xts_dec_loop6 -.align 16 -.L063xts_dec_loop6: - pshufd $19,%xmm0,%xmm2 - pxor %xmm0,%xmm0 - movdqa %xmm1,(%esp) - paddq %xmm1,%xmm1 - pand %xmm3,%xmm2 - pcmpgtd %xmm1,%xmm0 - pxor %xmm2,%xmm1 - pshufd $19,%xmm0,%xmm2 - pxor %xmm0,%xmm0 - movdqa %xmm1,16(%esp) - paddq %xmm1,%xmm1 - pand %xmm3,%xmm2 - pcmpgtd %xmm1,%xmm0 - pxor %xmm2,%xmm1 - pshufd $19,%xmm0,%xmm2 - pxor %xmm0,%xmm0 - movdqa %xmm1,32(%esp) - paddq %xmm1,%xmm1 - pand %xmm3,%xmm2 - pcmpgtd %xmm1,%xmm0 - pxor %xmm2,%xmm1 - pshufd $19,%xmm0,%xmm2 - pxor %xmm0,%xmm0 - movdqa %xmm1,48(%esp) - paddq %xmm1,%xmm1 - pand %xmm3,%xmm2 - pcmpgtd %xmm1,%xmm0 - pxor %xmm2,%xmm1 - 
pshufd $19,%xmm0,%xmm7 - movdqa %xmm1,64(%esp) - paddq %xmm1,%xmm1 - movups (%ebp),%xmm0 - pand %xmm3,%xmm7 - movups (%esi),%xmm2 - pxor %xmm1,%xmm7 - movl %ebx,%ecx - movdqu 16(%esi),%xmm3 - xorps %xmm0,%xmm2 - movdqu 32(%esi),%xmm4 - pxor %xmm0,%xmm3 - movdqu 48(%esi),%xmm5 - pxor %xmm0,%xmm4 - movdqu 64(%esi),%xmm6 - pxor %xmm0,%xmm5 - movdqu 80(%esi),%xmm1 - pxor %xmm0,%xmm6 - leal 96(%esi),%esi - pxor (%esp),%xmm2 - movdqa %xmm7,80(%esp) - pxor %xmm1,%xmm7 - movups 16(%ebp),%xmm1 - pxor 16(%esp),%xmm3 - pxor 32(%esp),%xmm4 -.byte 102,15,56,222,209 - pxor 48(%esp),%xmm5 - pxor 64(%esp),%xmm6 -.byte 102,15,56,222,217 - pxor %xmm0,%xmm7 - movups 32(%ebp),%xmm0 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 - call .L_aesni_decrypt6_enter - movdqa 80(%esp),%xmm1 - pxor %xmm0,%xmm0 - xorps (%esp),%xmm2 - pcmpgtd %xmm1,%xmm0 - xorps 16(%esp),%xmm3 - movups %xmm2,(%edi) - xorps 32(%esp),%xmm4 - movups %xmm3,16(%edi) - xorps 48(%esp),%xmm5 - movups %xmm4,32(%edi) - xorps 64(%esp),%xmm6 - movups %xmm5,48(%edi) - xorps %xmm1,%xmm7 - movups %xmm6,64(%edi) - pshufd $19,%xmm0,%xmm2 - movups %xmm7,80(%edi) - leal 96(%edi),%edi - movdqa 96(%esp),%xmm3 - pxor %xmm0,%xmm0 - paddq %xmm1,%xmm1 - pand %xmm3,%xmm2 - pcmpgtd %xmm1,%xmm0 - pxor %xmm2,%xmm1 - subl $96,%eax - jnc .L063xts_dec_loop6 - movl 240(%ebp),%ecx - movl %ebp,%edx - movl %ecx,%ebx -.L062xts_dec_short: - addl $96,%eax - jz .L064xts_dec_done6x - movdqa %xmm1,%xmm5 - cmpl $32,%eax - jb .L065xts_dec_one - pshufd $19,%xmm0,%xmm2 - pxor %xmm0,%xmm0 - paddq %xmm1,%xmm1 - pand %xmm3,%xmm2 - pcmpgtd %xmm1,%xmm0 - pxor %xmm2,%xmm1 - je .L066xts_dec_two - pshufd $19,%xmm0,%xmm2 - pxor %xmm0,%xmm0 - movdqa %xmm1,%xmm6 - paddq %xmm1,%xmm1 - pand %xmm3,%xmm2 - pcmpgtd %xmm1,%xmm0 - pxor %xmm2,%xmm1 - cmpl $64,%eax - jb .L067xts_dec_three - pshufd $19,%xmm0,%xmm2 - pxor %xmm0,%xmm0 - movdqa %xmm1,%xmm7 - paddq %xmm1,%xmm1 - pand %xmm3,%xmm2 - pcmpgtd %xmm1,%xmm0 - pxor 
%xmm2,%xmm1 - movdqa %xmm5,(%esp) - movdqa %xmm6,16(%esp) - je .L068xts_dec_four - movdqa %xmm7,32(%esp) - pshufd $19,%xmm0,%xmm7 - movdqa %xmm1,48(%esp) - paddq %xmm1,%xmm1 - pand %xmm3,%xmm7 - pxor %xmm1,%xmm7 - movdqu (%esi),%xmm2 - movdqu 16(%esi),%xmm3 - movdqu 32(%esi),%xmm4 - pxor (%esp),%xmm2 - movdqu 48(%esi),%xmm5 - pxor 16(%esp),%xmm3 - movdqu 64(%esi),%xmm6 - pxor 32(%esp),%xmm4 - leal 80(%esi),%esi - pxor 48(%esp),%xmm5 - movdqa %xmm7,64(%esp) - pxor %xmm7,%xmm6 - call _aesni_decrypt6 - movaps 64(%esp),%xmm1 - xorps (%esp),%xmm2 - xorps 16(%esp),%xmm3 - xorps 32(%esp),%xmm4 - movups %xmm2,(%edi) - xorps 48(%esp),%xmm5 - movups %xmm3,16(%edi) - xorps %xmm1,%xmm6 - movups %xmm4,32(%edi) - movups %xmm5,48(%edi) - movups %xmm6,64(%edi) - leal 80(%edi),%edi - jmp .L069xts_dec_done -.align 16 -.L065xts_dec_one: - movups (%esi),%xmm2 - leal 16(%esi),%esi - xorps %xmm5,%xmm2 - movups (%edx),%xmm0 - movups 16(%edx),%xmm1 - leal 32(%edx),%edx - xorps %xmm0,%xmm2 -.L070dec1_loop_12: -.byte 102,15,56,222,209 - decl %ecx - movups (%edx),%xmm1 - leal 16(%edx),%edx - jnz .L070dec1_loop_12 -.byte 102,15,56,223,209 - xorps %xmm5,%xmm2 - movups %xmm2,(%edi) - leal 16(%edi),%edi - movdqa %xmm5,%xmm1 - jmp .L069xts_dec_done -.align 16 -.L066xts_dec_two: - movaps %xmm1,%xmm6 - movups (%esi),%xmm2 - movups 16(%esi),%xmm3 - leal 32(%esi),%esi - xorps %xmm5,%xmm2 - xorps %xmm6,%xmm3 - call _aesni_decrypt2 - xorps %xmm5,%xmm2 - xorps %xmm6,%xmm3 - movups %xmm2,(%edi) - movups %xmm3,16(%edi) - leal 32(%edi),%edi - movdqa %xmm6,%xmm1 - jmp .L069xts_dec_done -.align 16 -.L067xts_dec_three: - movaps %xmm1,%xmm7 - movups (%esi),%xmm2 - movups 16(%esi),%xmm3 - movups 32(%esi),%xmm4 - leal 48(%esi),%esi - xorps %xmm5,%xmm2 - xorps %xmm6,%xmm3 - xorps %xmm7,%xmm4 - call _aesni_decrypt3 - xorps %xmm5,%xmm2 - xorps %xmm6,%xmm3 - xorps %xmm7,%xmm4 - movups %xmm2,(%edi) - movups %xmm3,16(%edi) - movups %xmm4,32(%edi) - leal 48(%edi),%edi - movdqa %xmm7,%xmm1 - jmp .L069xts_dec_done 
-.align 16 -.L068xts_dec_four: - movaps %xmm1,%xmm6 - movups (%esi),%xmm2 - movups 16(%esi),%xmm3 - movups 32(%esi),%xmm4 - xorps (%esp),%xmm2 - movups 48(%esi),%xmm5 - leal 64(%esi),%esi - xorps 16(%esp),%xmm3 - xorps %xmm7,%xmm4 - xorps %xmm6,%xmm5 - call _aesni_decrypt4 - xorps (%esp),%xmm2 - xorps 16(%esp),%xmm3 - xorps %xmm7,%xmm4 - movups %xmm2,(%edi) - xorps %xmm6,%xmm5 - movups %xmm3,16(%edi) - movups %xmm4,32(%edi) - movups %xmm5,48(%edi) - leal 64(%edi),%edi - movdqa %xmm6,%xmm1 - jmp .L069xts_dec_done -.align 16 -.L064xts_dec_done6x: - movl 112(%esp),%eax - andl $15,%eax - jz .L071xts_dec_ret - movl %eax,112(%esp) - jmp .L072xts_dec_only_one_more -.align 16 -.L069xts_dec_done: - movl 112(%esp),%eax - pxor %xmm0,%xmm0 - andl $15,%eax - jz .L071xts_dec_ret - pcmpgtd %xmm1,%xmm0 - movl %eax,112(%esp) - pshufd $19,%xmm0,%xmm2 - pxor %xmm0,%xmm0 - movdqa 96(%esp),%xmm3 - paddq %xmm1,%xmm1 - pand %xmm3,%xmm2 - pcmpgtd %xmm1,%xmm0 - pxor %xmm2,%xmm1 -.L072xts_dec_only_one_more: - pshufd $19,%xmm0,%xmm5 - movdqa %xmm1,%xmm6 - paddq %xmm1,%xmm1 - pand %xmm3,%xmm5 - pxor %xmm1,%xmm5 - movl %ebp,%edx - movl %ebx,%ecx - movups (%esi),%xmm2 - xorps %xmm5,%xmm2 - movups (%edx),%xmm0 - movups 16(%edx),%xmm1 - leal 32(%edx),%edx - xorps %xmm0,%xmm2 -.L073dec1_loop_13: -.byte 102,15,56,222,209 - decl %ecx - movups (%edx),%xmm1 - leal 16(%edx),%edx - jnz .L073dec1_loop_13 -.byte 102,15,56,223,209 - xorps %xmm5,%xmm2 - movups %xmm2,(%edi) -.L074xts_dec_steal: - movzbl 16(%esi),%ecx - movzbl (%edi),%edx - leal 1(%esi),%esi - movb %cl,(%edi) - movb %dl,16(%edi) - leal 1(%edi),%edi - subl $1,%eax - jnz .L074xts_dec_steal - subl 112(%esp),%edi - movl %ebp,%edx - movl %ebx,%ecx - movups (%edi),%xmm2 - xorps %xmm6,%xmm2 - movups (%edx),%xmm0 - movups 16(%edx),%xmm1 - leal 32(%edx),%edx - xorps %xmm0,%xmm2 -.L075dec1_loop_14: -.byte 102,15,56,222,209 - decl %ecx - movups (%edx),%xmm1 - leal 16(%edx),%edx - jnz .L075dec1_loop_14 -.byte 102,15,56,223,209 - xorps %xmm6,%xmm2 - 
movups %xmm2,(%edi) -.L071xts_dec_ret: - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - movdqa %xmm0,(%esp) - pxor %xmm3,%xmm3 - movdqa %xmm0,16(%esp) - pxor %xmm4,%xmm4 - movdqa %xmm0,32(%esp) - pxor %xmm5,%xmm5 - movdqa %xmm0,48(%esp) - pxor %xmm6,%xmm6 - movdqa %xmm0,64(%esp) - pxor %xmm7,%xmm7 - movdqa %xmm0,80(%esp) - movl 116(%esp),%esp - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.size aes_hw_xts_decrypt,.-.L_aes_hw_xts_decrypt_begin -.globl aes_hw_cbc_encrypt -.hidden aes_hw_cbc_encrypt -.type aes_hw_cbc_encrypt,@function -.align 16 -aes_hw_cbc_encrypt: -.L_aes_hw_cbc_encrypt_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - movl 20(%esp),%esi - movl %esp,%ebx - movl 24(%esp),%edi - subl $24,%ebx - movl 28(%esp),%eax - andl $-16,%ebx - movl 32(%esp),%edx - movl 36(%esp),%ebp - testl %eax,%eax - jz .L076cbc_abort - cmpl $0,40(%esp) - xchgl %esp,%ebx - movups (%ebp),%xmm7 - movl 240(%edx),%ecx - movl %edx,%ebp - movl %ebx,16(%esp) - movl %ecx,%ebx - je .L077cbc_decrypt - movaps %xmm7,%xmm2 - cmpl $16,%eax - jb .L078cbc_enc_tail - subl $16,%eax - jmp .L079cbc_enc_loop -.align 16 -.L079cbc_enc_loop: - movups (%esi),%xmm7 - leal 16(%esi),%esi - movups (%edx),%xmm0 - movups 16(%edx),%xmm1 - xorps %xmm0,%xmm7 - leal 32(%edx),%edx - xorps %xmm7,%xmm2 -.L080enc1_loop_15: -.byte 102,15,56,220,209 - decl %ecx - movups (%edx),%xmm1 - leal 16(%edx),%edx - jnz .L080enc1_loop_15 -.byte 102,15,56,221,209 - movl %ebx,%ecx - movl %ebp,%edx - movups %xmm2,(%edi) - leal 16(%edi),%edi - subl $16,%eax - jnc .L079cbc_enc_loop - addl $16,%eax - jnz .L078cbc_enc_tail - movaps %xmm2,%xmm7 - pxor %xmm2,%xmm2 - jmp .L081cbc_ret -.L078cbc_enc_tail: - movl %eax,%ecx -.long 2767451785 - movl $16,%ecx - subl %eax,%ecx - xorl %eax,%eax -.long 2868115081 - leal -16(%edi),%edi - movl %ebx,%ecx - movl %edi,%esi - movl %ebp,%edx - jmp .L079cbc_enc_loop -.align 16 -.L077cbc_decrypt: - cmpl $80,%eax - jbe .L082cbc_dec_tail - movaps %xmm7,(%esp) - subl $80,%eax - 
jmp .L083cbc_dec_loop6_enter -.align 16 -.L084cbc_dec_loop6: - movaps %xmm0,(%esp) - movups %xmm7,(%edi) - leal 16(%edi),%edi -.L083cbc_dec_loop6_enter: - movdqu (%esi),%xmm2 - movdqu 16(%esi),%xmm3 - movdqu 32(%esi),%xmm4 - movdqu 48(%esi),%xmm5 - movdqu 64(%esi),%xmm6 - movdqu 80(%esi),%xmm7 - call _aesni_decrypt6 - movups (%esi),%xmm1 - movups 16(%esi),%xmm0 - xorps (%esp),%xmm2 - xorps %xmm1,%xmm3 - movups 32(%esi),%xmm1 - xorps %xmm0,%xmm4 - movups 48(%esi),%xmm0 - xorps %xmm1,%xmm5 - movups 64(%esi),%xmm1 - xorps %xmm0,%xmm6 - movups 80(%esi),%xmm0 - xorps %xmm1,%xmm7 - movups %xmm2,(%edi) - movups %xmm3,16(%edi) - leal 96(%esi),%esi - movups %xmm4,32(%edi) - movl %ebx,%ecx - movups %xmm5,48(%edi) - movl %ebp,%edx - movups %xmm6,64(%edi) - leal 80(%edi),%edi - subl $96,%eax - ja .L084cbc_dec_loop6 - movaps %xmm7,%xmm2 - movaps %xmm0,%xmm7 - addl $80,%eax - jle .L085cbc_dec_clear_tail_collected - movups %xmm2,(%edi) - leal 16(%edi),%edi -.L082cbc_dec_tail: - movups (%esi),%xmm2 - movaps %xmm2,%xmm6 - cmpl $16,%eax - jbe .L086cbc_dec_one - movups 16(%esi),%xmm3 - movaps %xmm3,%xmm5 - cmpl $32,%eax - jbe .L087cbc_dec_two - movups 32(%esi),%xmm4 - cmpl $48,%eax - jbe .L088cbc_dec_three - movups 48(%esi),%xmm5 - cmpl $64,%eax - jbe .L089cbc_dec_four - movups 64(%esi),%xmm6 - movaps %xmm7,(%esp) - movups (%esi),%xmm2 - xorps %xmm7,%xmm7 - call _aesni_decrypt6 - movups (%esi),%xmm1 - movups 16(%esi),%xmm0 - xorps (%esp),%xmm2 - xorps %xmm1,%xmm3 - movups 32(%esi),%xmm1 - xorps %xmm0,%xmm4 - movups 48(%esi),%xmm0 - xorps %xmm1,%xmm5 - movups 64(%esi),%xmm7 - xorps %xmm0,%xmm6 - movups %xmm2,(%edi) - movups %xmm3,16(%edi) - pxor %xmm3,%xmm3 - movups %xmm4,32(%edi) - pxor %xmm4,%xmm4 - movups %xmm5,48(%edi) - pxor %xmm5,%xmm5 - leal 64(%edi),%edi - movaps %xmm6,%xmm2 - pxor %xmm6,%xmm6 - subl $80,%eax - jmp .L090cbc_dec_tail_collected -.align 16 -.L086cbc_dec_one: - movups (%edx),%xmm0 - movups 16(%edx),%xmm1 - leal 32(%edx),%edx - xorps %xmm0,%xmm2 -.L091dec1_loop_16: 
-.byte 102,15,56,222,209 - decl %ecx - movups (%edx),%xmm1 - leal 16(%edx),%edx - jnz .L091dec1_loop_16 -.byte 102,15,56,223,209 - xorps %xmm7,%xmm2 - movaps %xmm6,%xmm7 - subl $16,%eax - jmp .L090cbc_dec_tail_collected -.align 16 -.L087cbc_dec_two: - call _aesni_decrypt2 - xorps %xmm7,%xmm2 - xorps %xmm6,%xmm3 - movups %xmm2,(%edi) - movaps %xmm3,%xmm2 - pxor %xmm3,%xmm3 - leal 16(%edi),%edi - movaps %xmm5,%xmm7 - subl $32,%eax - jmp .L090cbc_dec_tail_collected -.align 16 -.L088cbc_dec_three: - call _aesni_decrypt3 - xorps %xmm7,%xmm2 - xorps %xmm6,%xmm3 - xorps %xmm5,%xmm4 - movups %xmm2,(%edi) - movaps %xmm4,%xmm2 - pxor %xmm4,%xmm4 - movups %xmm3,16(%edi) - pxor %xmm3,%xmm3 - leal 32(%edi),%edi - movups 32(%esi),%xmm7 - subl $48,%eax - jmp .L090cbc_dec_tail_collected -.align 16 -.L089cbc_dec_four: - call _aesni_decrypt4 - movups 16(%esi),%xmm1 - movups 32(%esi),%xmm0 - xorps %xmm7,%xmm2 - movups 48(%esi),%xmm7 - xorps %xmm6,%xmm3 - movups %xmm2,(%edi) - xorps %xmm1,%xmm4 - movups %xmm3,16(%edi) - pxor %xmm3,%xmm3 - xorps %xmm0,%xmm5 - movups %xmm4,32(%edi) - pxor %xmm4,%xmm4 - leal 48(%edi),%edi - movaps %xmm5,%xmm2 - pxor %xmm5,%xmm5 - subl $64,%eax - jmp .L090cbc_dec_tail_collected -.align 16 -.L085cbc_dec_clear_tail_collected: - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 -.L090cbc_dec_tail_collected: - andl $15,%eax - jnz .L092cbc_dec_tail_partial - movups %xmm2,(%edi) - pxor %xmm0,%xmm0 - jmp .L081cbc_ret -.align 16 -.L092cbc_dec_tail_partial: - movaps %xmm2,(%esp) - pxor %xmm0,%xmm0 - movl $16,%ecx - movl %esp,%esi - subl %eax,%ecx -.long 2767451785 - movdqa %xmm2,(%esp) -.L081cbc_ret: - movl 16(%esp),%esp - movl 36(%esp),%ebp - pxor %xmm2,%xmm2 - pxor %xmm1,%xmm1 - movups %xmm7,(%ebp) - pxor %xmm7,%xmm7 -.L076cbc_abort: - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.size aes_hw_cbc_encrypt,.-.L_aes_hw_cbc_encrypt_begin -.hidden _aesni_set_encrypt_key -.type _aesni_set_encrypt_key,@function -.align 16 
-_aesni_set_encrypt_key: - pushl %ebp - pushl %ebx - testl %eax,%eax - jz .L093bad_pointer - testl %edx,%edx - jz .L093bad_pointer - call .L094pic -.L094pic: - popl %ebx - leal .Lkey_const-.L094pic(%ebx),%ebx - leal OPENSSL_ia32cap_P-.Lkey_const(%ebx),%ebp - movups (%eax),%xmm0 - xorps %xmm4,%xmm4 - movl 4(%ebp),%ebp - leal 16(%edx),%edx - andl $268437504,%ebp - cmpl $256,%ecx - je .L09514rounds - cmpl $192,%ecx - je .L09612rounds - cmpl $128,%ecx - jne .L097bad_keybits -.align 16 -.L09810rounds: - cmpl $268435456,%ebp - je .L09910rounds_alt - movl $9,%ecx - movups %xmm0,-16(%edx) -.byte 102,15,58,223,200,1 - call .L100key_128_cold -.byte 102,15,58,223,200,2 - call .L101key_128 -.byte 102,15,58,223,200,4 - call .L101key_128 -.byte 102,15,58,223,200,8 - call .L101key_128 -.byte 102,15,58,223,200,16 - call .L101key_128 -.byte 102,15,58,223,200,32 - call .L101key_128 -.byte 102,15,58,223,200,64 - call .L101key_128 -.byte 102,15,58,223,200,128 - call .L101key_128 -.byte 102,15,58,223,200,27 - call .L101key_128 -.byte 102,15,58,223,200,54 - call .L101key_128 - movups %xmm0,(%edx) - movl %ecx,80(%edx) - jmp .L102good_key -.align 16 -.L101key_128: - movups %xmm0,(%edx) - leal 16(%edx),%edx -.L100key_128_cold: - shufps $16,%xmm0,%xmm4 - xorps %xmm4,%xmm0 - shufps $140,%xmm0,%xmm4 - xorps %xmm4,%xmm0 - shufps $255,%xmm1,%xmm1 - xorps %xmm1,%xmm0 - ret -.align 16 -.L09910rounds_alt: - movdqa (%ebx),%xmm5 - movl $8,%ecx - movdqa 32(%ebx),%xmm4 - movdqa %xmm0,%xmm2 - movdqu %xmm0,-16(%edx) -.L103loop_key128: -.byte 102,15,56,0,197 -.byte 102,15,56,221,196 - pslld $1,%xmm4 - leal 16(%edx),%edx - movdqa %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm3,%xmm2 - pxor %xmm2,%xmm0 - movdqu %xmm0,-16(%edx) - movdqa %xmm0,%xmm2 - decl %ecx - jnz .L103loop_key128 - movdqa 48(%ebx),%xmm4 -.byte 102,15,56,0,197 -.byte 102,15,56,221,196 - pslld $1,%xmm4 - movdqa %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm2,%xmm3 - pslldq 
$4,%xmm2 - pxor %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm3,%xmm2 - pxor %xmm2,%xmm0 - movdqu %xmm0,(%edx) - movdqa %xmm0,%xmm2 -.byte 102,15,56,0,197 -.byte 102,15,56,221,196 - movdqa %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm3,%xmm2 - pxor %xmm2,%xmm0 - movdqu %xmm0,16(%edx) - movl $9,%ecx - movl %ecx,96(%edx) - jmp .L102good_key -.align 16 -.L09612rounds: - movq 16(%eax),%xmm2 - cmpl $268435456,%ebp - je .L10412rounds_alt - movl $11,%ecx - movups %xmm0,-16(%edx) -.byte 102,15,58,223,202,1 - call .L105key_192a_cold -.byte 102,15,58,223,202,2 - call .L106key_192b -.byte 102,15,58,223,202,4 - call .L107key_192a -.byte 102,15,58,223,202,8 - call .L106key_192b -.byte 102,15,58,223,202,16 - call .L107key_192a -.byte 102,15,58,223,202,32 - call .L106key_192b -.byte 102,15,58,223,202,64 - call .L107key_192a -.byte 102,15,58,223,202,128 - call .L106key_192b - movups %xmm0,(%edx) - movl %ecx,48(%edx) - jmp .L102good_key -.align 16 -.L107key_192a: - movups %xmm0,(%edx) - leal 16(%edx),%edx -.align 16 -.L105key_192a_cold: - movaps %xmm2,%xmm5 -.L108key_192b_warm: - shufps $16,%xmm0,%xmm4 - movdqa %xmm2,%xmm3 - xorps %xmm4,%xmm0 - shufps $140,%xmm0,%xmm4 - pslldq $4,%xmm3 - xorps %xmm4,%xmm0 - pshufd $85,%xmm1,%xmm1 - pxor %xmm3,%xmm2 - pxor %xmm1,%xmm0 - pshufd $255,%xmm0,%xmm3 - pxor %xmm3,%xmm2 - ret -.align 16 -.L106key_192b: - movaps %xmm0,%xmm3 - shufps $68,%xmm0,%xmm5 - movups %xmm5,(%edx) - shufps $78,%xmm2,%xmm3 - movups %xmm3,16(%edx) - leal 32(%edx),%edx - jmp .L108key_192b_warm -.align 16 -.L10412rounds_alt: - movdqa 16(%ebx),%xmm5 - movdqa 32(%ebx),%xmm4 - movl $8,%ecx - movdqu %xmm0,-16(%edx) -.L109loop_key192: - movq %xmm2,(%edx) - movdqa %xmm2,%xmm1 -.byte 102,15,56,0,213 -.byte 102,15,56,221,212 - pslld $1,%xmm4 - leal 24(%edx),%edx - movdqa %xmm0,%xmm3 - pslldq $4,%xmm0 - pxor %xmm0,%xmm3 - pslldq $4,%xmm0 - pxor %xmm0,%xmm3 - pslldq $4,%xmm0 - pxor %xmm3,%xmm0 - pshufd $255,%xmm0,%xmm3 
- pxor %xmm1,%xmm3 - pslldq $4,%xmm1 - pxor %xmm1,%xmm3 - pxor %xmm2,%xmm0 - pxor %xmm3,%xmm2 - movdqu %xmm0,-16(%edx) - decl %ecx - jnz .L109loop_key192 - movl $11,%ecx - movl %ecx,32(%edx) - jmp .L102good_key -.align 16 -.L09514rounds: - movups 16(%eax),%xmm2 - leal 16(%edx),%edx - cmpl $268435456,%ebp - je .L11014rounds_alt - movl $13,%ecx - movups %xmm0,-32(%edx) - movups %xmm2,-16(%edx) -.byte 102,15,58,223,202,1 - call .L111key_256a_cold -.byte 102,15,58,223,200,1 - call .L112key_256b -.byte 102,15,58,223,202,2 - call .L113key_256a -.byte 102,15,58,223,200,2 - call .L112key_256b -.byte 102,15,58,223,202,4 - call .L113key_256a -.byte 102,15,58,223,200,4 - call .L112key_256b -.byte 102,15,58,223,202,8 - call .L113key_256a -.byte 102,15,58,223,200,8 - call .L112key_256b -.byte 102,15,58,223,202,16 - call .L113key_256a -.byte 102,15,58,223,200,16 - call .L112key_256b -.byte 102,15,58,223,202,32 - call .L113key_256a -.byte 102,15,58,223,200,32 - call .L112key_256b -.byte 102,15,58,223,202,64 - call .L113key_256a - movups %xmm0,(%edx) - movl %ecx,16(%edx) - xorl %eax,%eax - jmp .L102good_key -.align 16 -.L113key_256a: - movups %xmm2,(%edx) - leal 16(%edx),%edx -.L111key_256a_cold: - shufps $16,%xmm0,%xmm4 - xorps %xmm4,%xmm0 - shufps $140,%xmm0,%xmm4 - xorps %xmm4,%xmm0 - shufps $255,%xmm1,%xmm1 - xorps %xmm1,%xmm0 - ret -.align 16 -.L112key_256b: - movups %xmm0,(%edx) - leal 16(%edx),%edx - shufps $16,%xmm2,%xmm4 - xorps %xmm4,%xmm2 - shufps $140,%xmm2,%xmm4 - xorps %xmm4,%xmm2 - shufps $170,%xmm1,%xmm1 - xorps %xmm1,%xmm2 - ret -.align 16 -.L11014rounds_alt: - movdqa (%ebx),%xmm5 - movdqa 32(%ebx),%xmm4 - movl $7,%ecx - movdqu %xmm0,-32(%edx) - movdqa %xmm2,%xmm1 - movdqu %xmm2,-16(%edx) -.L114loop_key256: -.byte 102,15,56,0,213 -.byte 102,15,56,221,212 - movdqa %xmm0,%xmm3 - pslldq $4,%xmm0 - pxor %xmm0,%xmm3 - pslldq $4,%xmm0 - pxor %xmm0,%xmm3 - pslldq $4,%xmm0 - pxor %xmm3,%xmm0 - pslld $1,%xmm4 - pxor %xmm2,%xmm0 - movdqu %xmm0,(%edx) - decl %ecx - jz 
.L115done_key256 - pshufd $255,%xmm0,%xmm2 - pxor %xmm3,%xmm3 -.byte 102,15,56,221,211 - movdqa %xmm1,%xmm3 - pslldq $4,%xmm1 - pxor %xmm1,%xmm3 - pslldq $4,%xmm1 - pxor %xmm1,%xmm3 - pslldq $4,%xmm1 - pxor %xmm3,%xmm1 - pxor %xmm1,%xmm2 - movdqu %xmm2,16(%edx) - leal 32(%edx),%edx - movdqa %xmm2,%xmm1 - jmp .L114loop_key256 -.L115done_key256: - movl $13,%ecx - movl %ecx,16(%edx) -.L102good_key: - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - xorl %eax,%eax - popl %ebx - popl %ebp - ret -.align 4 -.L093bad_pointer: - movl $-1,%eax - popl %ebx - popl %ebp - ret -.align 4 -.L097bad_keybits: - pxor %xmm0,%xmm0 - movl $-2,%eax - popl %ebx - popl %ebp - ret -.size _aesni_set_encrypt_key,.-_aesni_set_encrypt_key -.globl aes_hw_set_encrypt_key -.hidden aes_hw_set_encrypt_key -.type aes_hw_set_encrypt_key,@function -.align 16 -aes_hw_set_encrypt_key: -.L_aes_hw_set_encrypt_key_begin: -#ifdef BORINGSSL_DISPATCH_TEST - pushl %ebx - pushl %edx - call .L116pic -.L116pic: - popl %ebx - leal BORINGSSL_function_hit+3-.L116pic(%ebx),%ebx - movl $1,%edx - movb %dl,(%ebx) - popl %edx - popl %ebx -#endif - movl 4(%esp),%eax - movl 8(%esp),%ecx - movl 12(%esp),%edx - call _aesni_set_encrypt_key - ret -.size aes_hw_set_encrypt_key,.-.L_aes_hw_set_encrypt_key_begin -.globl aes_hw_set_decrypt_key -.hidden aes_hw_set_decrypt_key -.type aes_hw_set_decrypt_key,@function -.align 16 -aes_hw_set_decrypt_key: -.L_aes_hw_set_decrypt_key_begin: - movl 4(%esp),%eax - movl 8(%esp),%ecx - movl 12(%esp),%edx - call _aesni_set_encrypt_key - movl 12(%esp),%edx - shll $4,%ecx - testl %eax,%eax - jnz .L117dec_key_ret - leal 16(%edx,%ecx,1),%eax - movups (%edx),%xmm0 - movups (%eax),%xmm1 - movups %xmm0,(%eax) - movups %xmm1,(%edx) - leal 16(%edx),%edx - leal -16(%eax),%eax -.L118dec_key_inverse: - movups (%edx),%xmm0 - movups (%eax),%xmm1 -.byte 102,15,56,219,192 -.byte 102,15,56,219,201 - leal 16(%edx),%edx - leal -16(%eax),%eax - 
movups %xmm0,16(%eax) - movups %xmm1,-16(%edx) - cmpl %edx,%eax - ja .L118dec_key_inverse - movups (%edx),%xmm0 -.byte 102,15,56,219,192 - movups %xmm0,(%edx) - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - xorl %eax,%eax -.L117dec_key_ret: - ret -.size aes_hw_set_decrypt_key,.-.L_aes_hw_set_decrypt_key_begin -.align 64 -.Lkey_const: -.long 202313229,202313229,202313229,202313229 -.long 67569157,67569157,67569157,67569157 -.long 1,1,1,1 -.long 27,27,27,27 -.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69 -.byte 83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83 -.byte 32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115 -.byte 115,108,46,111,114,103,62,0 -#endif -.section .note.GNU-stack,"",@progbits diff --git a/third_party/boringssl/linux-x86/crypto/fipsmodule/bn-586.S b/third_party/boringssl/linux-x86/crypto/fipsmodule/bn-586.S deleted file mode 100644 index 638c0361..00000000 --- a/third_party/boringssl/linux-x86/crypto/fipsmodule/bn-586.S +++ /dev/null @@ -1,997 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#if defined(__i386__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text -.globl bn_mul_add_words -.hidden bn_mul_add_words -.type bn_mul_add_words,@function -.align 16 -bn_mul_add_words: -.L_bn_mul_add_words_begin: - call .L000PIC_me_up -.L000PIC_me_up: - popl %eax - leal OPENSSL_ia32cap_P-.L000PIC_me_up(%eax),%eax - btl $26,(%eax) - jnc .L001maw_non_sse2 - movl 4(%esp),%eax - movl 8(%esp),%edx - movl 12(%esp),%ecx - movd 16(%esp),%mm0 - pxor %mm1,%mm1 - jmp .L002maw_sse2_entry -.align 16 -.L003maw_sse2_unrolled: - movd (%eax),%mm3 - paddq %mm3,%mm1 - movd (%edx),%mm2 - pmuludq %mm0,%mm2 - movd 4(%edx),%mm4 - pmuludq %mm0,%mm4 - movd 8(%edx),%mm6 - pmuludq %mm0,%mm6 - movd 12(%edx),%mm7 - pmuludq %mm0,%mm7 - paddq %mm2,%mm1 - movd 4(%eax),%mm3 - paddq %mm4,%mm3 - movd 8(%eax),%mm5 - paddq %mm6,%mm5 - movd 12(%eax),%mm4 - paddq %mm4,%mm7 - movd %mm1,(%eax) - movd 16(%edx),%mm2 - pmuludq %mm0,%mm2 - psrlq $32,%mm1 - movd 20(%edx),%mm4 - pmuludq %mm0,%mm4 - paddq %mm3,%mm1 - movd 24(%edx),%mm6 - pmuludq %mm0,%mm6 - movd %mm1,4(%eax) - psrlq $32,%mm1 - movd 28(%edx),%mm3 - addl $32,%edx - pmuludq %mm0,%mm3 - paddq %mm5,%mm1 - movd 16(%eax),%mm5 - paddq %mm5,%mm2 - movd %mm1,8(%eax) - psrlq $32,%mm1 - paddq %mm7,%mm1 - movd 20(%eax),%mm5 - paddq %mm5,%mm4 - movd %mm1,12(%eax) - psrlq $32,%mm1 - paddq %mm2,%mm1 - movd 24(%eax),%mm5 - paddq %mm5,%mm6 - movd %mm1,16(%eax) - psrlq $32,%mm1 - paddq %mm4,%mm1 - movd 28(%eax),%mm5 - paddq %mm5,%mm3 - movd %mm1,20(%eax) - psrlq $32,%mm1 - paddq %mm6,%mm1 - movd %mm1,24(%eax) - psrlq $32,%mm1 - paddq %mm3,%mm1 - movd %mm1,28(%eax) - leal 32(%eax),%eax - psrlq $32,%mm1 - subl $8,%ecx - jz .L004maw_sse2_exit -.L002maw_sse2_entry: - testl $4294967288,%ecx - jnz .L003maw_sse2_unrolled -.align 4 -.L005maw_sse2_loop: - movd (%edx),%mm2 - movd (%eax),%mm3 - pmuludq %mm0,%mm2 - leal 4(%edx),%edx - paddq %mm3,%mm1 - paddq %mm2,%mm1 - movd %mm1,(%eax) - subl $1,%ecx - psrlq $32,%mm1 - leal 4(%eax),%eax - jnz .L005maw_sse2_loop 
-.L004maw_sse2_exit: - movd %mm1,%eax - emms - ret -.align 16 -.L001maw_non_sse2: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - - xorl %esi,%esi - movl 20(%esp),%edi - movl 28(%esp),%ecx - movl 24(%esp),%ebx - andl $4294967288,%ecx - movl 32(%esp),%ebp - pushl %ecx - jz .L006maw_finish -.align 16 -.L007maw_loop: - - movl (%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl (%edi),%eax - adcl $0,%edx - movl %eax,(%edi) - movl %edx,%esi - - movl 4(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 4(%edi),%eax - adcl $0,%edx - movl %eax,4(%edi) - movl %edx,%esi - - movl 8(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 8(%edi),%eax - adcl $0,%edx - movl %eax,8(%edi) - movl %edx,%esi - - movl 12(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 12(%edi),%eax - adcl $0,%edx - movl %eax,12(%edi) - movl %edx,%esi - - movl 16(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 16(%edi),%eax - adcl $0,%edx - movl %eax,16(%edi) - movl %edx,%esi - - movl 20(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 20(%edi),%eax - adcl $0,%edx - movl %eax,20(%edi) - movl %edx,%esi - - movl 24(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 24(%edi),%eax - adcl $0,%edx - movl %eax,24(%edi) - movl %edx,%esi - - movl 28(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 28(%edi),%eax - adcl $0,%edx - movl %eax,28(%edi) - movl %edx,%esi - - subl $8,%ecx - leal 32(%ebx),%ebx - leal 32(%edi),%edi - jnz .L007maw_loop -.L006maw_finish: - movl 32(%esp),%ecx - andl $7,%ecx - jnz .L008maw_finish2 - jmp .L009maw_end -.L008maw_finish2: - - movl (%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl (%edi),%eax - adcl $0,%edx - decl %ecx - movl %eax,(%edi) - movl %edx,%esi - jz .L009maw_end - - movl 4(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 4(%edi),%eax - adcl $0,%edx - decl %ecx - movl %eax,4(%edi) - movl %edx,%esi - jz .L009maw_end - - movl 8(%ebx),%eax - mull %ebp 
- addl %esi,%eax - adcl $0,%edx - addl 8(%edi),%eax - adcl $0,%edx - decl %ecx - movl %eax,8(%edi) - movl %edx,%esi - jz .L009maw_end - - movl 12(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 12(%edi),%eax - adcl $0,%edx - decl %ecx - movl %eax,12(%edi) - movl %edx,%esi - jz .L009maw_end - - movl 16(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 16(%edi),%eax - adcl $0,%edx - decl %ecx - movl %eax,16(%edi) - movl %edx,%esi - jz .L009maw_end - - movl 20(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 20(%edi),%eax - adcl $0,%edx - decl %ecx - movl %eax,20(%edi) - movl %edx,%esi - jz .L009maw_end - - movl 24(%ebx),%eax - mull %ebp - addl %esi,%eax - adcl $0,%edx - addl 24(%edi),%eax - adcl $0,%edx - movl %eax,24(%edi) - movl %edx,%esi -.L009maw_end: - movl %esi,%eax - popl %ecx - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.size bn_mul_add_words,.-.L_bn_mul_add_words_begin -.globl bn_mul_words -.hidden bn_mul_words -.type bn_mul_words,@function -.align 16 -bn_mul_words: -.L_bn_mul_words_begin: - call .L010PIC_me_up -.L010PIC_me_up: - popl %eax - leal OPENSSL_ia32cap_P-.L010PIC_me_up(%eax),%eax - btl $26,(%eax) - jnc .L011mw_non_sse2 - movl 4(%esp),%eax - movl 8(%esp),%edx - movl 12(%esp),%ecx - movd 16(%esp),%mm0 - pxor %mm1,%mm1 -.align 16 -.L012mw_sse2_loop: - movd (%edx),%mm2 - pmuludq %mm0,%mm2 - leal 4(%edx),%edx - paddq %mm2,%mm1 - movd %mm1,(%eax) - subl $1,%ecx - psrlq $32,%mm1 - leal 4(%eax),%eax - jnz .L012mw_sse2_loop - movd %mm1,%eax - emms - ret -.align 16 -.L011mw_non_sse2: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - - xorl %esi,%esi - movl 20(%esp),%edi - movl 24(%esp),%ebx - movl 28(%esp),%ebp - movl 32(%esp),%ecx - andl $4294967288,%ebp - jz .L013mw_finish -.L014mw_loop: - - movl (%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,(%edi) - movl %edx,%esi - - movl 4(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,4(%edi) - movl %edx,%esi - - movl 
8(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,8(%edi) - movl %edx,%esi - - movl 12(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,12(%edi) - movl %edx,%esi - - movl 16(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,16(%edi) - movl %edx,%esi - - movl 20(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,20(%edi) - movl %edx,%esi - - movl 24(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,24(%edi) - movl %edx,%esi - - movl 28(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,28(%edi) - movl %edx,%esi - - addl $32,%ebx - addl $32,%edi - subl $8,%ebp - jz .L013mw_finish - jmp .L014mw_loop -.L013mw_finish: - movl 28(%esp),%ebp - andl $7,%ebp - jnz .L015mw_finish2 - jmp .L016mw_end -.L015mw_finish2: - - movl (%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,(%edi) - movl %edx,%esi - decl %ebp - jz .L016mw_end - - movl 4(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,4(%edi) - movl %edx,%esi - decl %ebp - jz .L016mw_end - - movl 8(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,8(%edi) - movl %edx,%esi - decl %ebp - jz .L016mw_end - - movl 12(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,12(%edi) - movl %edx,%esi - decl %ebp - jz .L016mw_end - - movl 16(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,16(%edi) - movl %edx,%esi - decl %ebp - jz .L016mw_end - - movl 20(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,20(%edi) - movl %edx,%esi - decl %ebp - jz .L016mw_end - - movl 24(%ebx),%eax - mull %ecx - addl %esi,%eax - adcl $0,%edx - movl %eax,24(%edi) - movl %edx,%esi -.L016mw_end: - movl %esi,%eax - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.size bn_mul_words,.-.L_bn_mul_words_begin -.globl bn_sqr_words -.hidden bn_sqr_words -.type bn_sqr_words,@function -.align 16 -bn_sqr_words: -.L_bn_sqr_words_begin: - call .L017PIC_me_up 
-.L017PIC_me_up: - popl %eax - leal OPENSSL_ia32cap_P-.L017PIC_me_up(%eax),%eax - btl $26,(%eax) - jnc .L018sqr_non_sse2 - movl 4(%esp),%eax - movl 8(%esp),%edx - movl 12(%esp),%ecx -.align 16 -.L019sqr_sse2_loop: - movd (%edx),%mm0 - pmuludq %mm0,%mm0 - leal 4(%edx),%edx - movq %mm0,(%eax) - subl $1,%ecx - leal 8(%eax),%eax - jnz .L019sqr_sse2_loop - emms - ret -.align 16 -.L018sqr_non_sse2: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - - movl 20(%esp),%esi - movl 24(%esp),%edi - movl 28(%esp),%ebx - andl $4294967288,%ebx - jz .L020sw_finish -.L021sw_loop: - - movl (%edi),%eax - mull %eax - movl %eax,(%esi) - movl %edx,4(%esi) - - movl 4(%edi),%eax - mull %eax - movl %eax,8(%esi) - movl %edx,12(%esi) - - movl 8(%edi),%eax - mull %eax - movl %eax,16(%esi) - movl %edx,20(%esi) - - movl 12(%edi),%eax - mull %eax - movl %eax,24(%esi) - movl %edx,28(%esi) - - movl 16(%edi),%eax - mull %eax - movl %eax,32(%esi) - movl %edx,36(%esi) - - movl 20(%edi),%eax - mull %eax - movl %eax,40(%esi) - movl %edx,44(%esi) - - movl 24(%edi),%eax - mull %eax - movl %eax,48(%esi) - movl %edx,52(%esi) - - movl 28(%edi),%eax - mull %eax - movl %eax,56(%esi) - movl %edx,60(%esi) - - addl $32,%edi - addl $64,%esi - subl $8,%ebx - jnz .L021sw_loop -.L020sw_finish: - movl 28(%esp),%ebx - andl $7,%ebx - jz .L022sw_end - - movl (%edi),%eax - mull %eax - movl %eax,(%esi) - decl %ebx - movl %edx,4(%esi) - jz .L022sw_end - - movl 4(%edi),%eax - mull %eax - movl %eax,8(%esi) - decl %ebx - movl %edx,12(%esi) - jz .L022sw_end - - movl 8(%edi),%eax - mull %eax - movl %eax,16(%esi) - decl %ebx - movl %edx,20(%esi) - jz .L022sw_end - - movl 12(%edi),%eax - mull %eax - movl %eax,24(%esi) - decl %ebx - movl %edx,28(%esi) - jz .L022sw_end - - movl 16(%edi),%eax - mull %eax - movl %eax,32(%esi) - decl %ebx - movl %edx,36(%esi) - jz .L022sw_end - - movl 20(%edi),%eax - mull %eax - movl %eax,40(%esi) - decl %ebx - movl %edx,44(%esi) - jz .L022sw_end - - movl 24(%edi),%eax - mull %eax - movl 
%eax,48(%esi) - movl %edx,52(%esi) -.L022sw_end: - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.size bn_sqr_words,.-.L_bn_sqr_words_begin -.globl bn_div_words -.hidden bn_div_words -.type bn_div_words,@function -.align 16 -bn_div_words: -.L_bn_div_words_begin: - movl 4(%esp),%edx - movl 8(%esp),%eax - movl 12(%esp),%ecx - divl %ecx - ret -.size bn_div_words,.-.L_bn_div_words_begin -.globl bn_add_words -.hidden bn_add_words -.type bn_add_words,@function -.align 16 -bn_add_words: -.L_bn_add_words_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - - movl 20(%esp),%ebx - movl 24(%esp),%esi - movl 28(%esp),%edi - movl 32(%esp),%ebp - xorl %eax,%eax - andl $4294967288,%ebp - jz .L023aw_finish -.L024aw_loop: - - movl (%esi),%ecx - movl (%edi),%edx - addl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - addl %edx,%ecx - adcl $0,%eax - movl %ecx,(%ebx) - - movl 4(%esi),%ecx - movl 4(%edi),%edx - addl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - addl %edx,%ecx - adcl $0,%eax - movl %ecx,4(%ebx) - - movl 8(%esi),%ecx - movl 8(%edi),%edx - addl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - addl %edx,%ecx - adcl $0,%eax - movl %ecx,8(%ebx) - - movl 12(%esi),%ecx - movl 12(%edi),%edx - addl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - addl %edx,%ecx - adcl $0,%eax - movl %ecx,12(%ebx) - - movl 16(%esi),%ecx - movl 16(%edi),%edx - addl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - addl %edx,%ecx - adcl $0,%eax - movl %ecx,16(%ebx) - - movl 20(%esi),%ecx - movl 20(%edi),%edx - addl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - addl %edx,%ecx - adcl $0,%eax - movl %ecx,20(%ebx) - - movl 24(%esi),%ecx - movl 24(%edi),%edx - addl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - addl %edx,%ecx - adcl $0,%eax - movl %ecx,24(%ebx) - - movl 28(%esi),%ecx - movl 28(%edi),%edx - addl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - addl %edx,%ecx - adcl $0,%eax - movl %ecx,28(%ebx) - - addl $32,%esi - addl $32,%edi - addl $32,%ebx - subl $8,%ebp - jnz .L024aw_loop -.L023aw_finish: - movl 
32(%esp),%ebp - andl $7,%ebp - jz .L025aw_end - - movl (%esi),%ecx - movl (%edi),%edx - addl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - addl %edx,%ecx - adcl $0,%eax - decl %ebp - movl %ecx,(%ebx) - jz .L025aw_end - - movl 4(%esi),%ecx - movl 4(%edi),%edx - addl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - addl %edx,%ecx - adcl $0,%eax - decl %ebp - movl %ecx,4(%ebx) - jz .L025aw_end - - movl 8(%esi),%ecx - movl 8(%edi),%edx - addl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - addl %edx,%ecx - adcl $0,%eax - decl %ebp - movl %ecx,8(%ebx) - jz .L025aw_end - - movl 12(%esi),%ecx - movl 12(%edi),%edx - addl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - addl %edx,%ecx - adcl $0,%eax - decl %ebp - movl %ecx,12(%ebx) - jz .L025aw_end - - movl 16(%esi),%ecx - movl 16(%edi),%edx - addl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - addl %edx,%ecx - adcl $0,%eax - decl %ebp - movl %ecx,16(%ebx) - jz .L025aw_end - - movl 20(%esi),%ecx - movl 20(%edi),%edx - addl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - addl %edx,%ecx - adcl $0,%eax - decl %ebp - movl %ecx,20(%ebx) - jz .L025aw_end - - movl 24(%esi),%ecx - movl 24(%edi),%edx - addl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - addl %edx,%ecx - adcl $0,%eax - movl %ecx,24(%ebx) -.L025aw_end: - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.size bn_add_words,.-.L_bn_add_words_begin -.globl bn_sub_words -.hidden bn_sub_words -.type bn_sub_words,@function -.align 16 -bn_sub_words: -.L_bn_sub_words_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - - movl 20(%esp),%ebx - movl 24(%esp),%esi - movl 28(%esp),%edi - movl 32(%esp),%ebp - xorl %eax,%eax - andl $4294967288,%ebp - jz .L026aw_finish -.L027aw_loop: - - movl (%esi),%ecx - movl (%edi),%edx - subl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - subl %edx,%ecx - adcl $0,%eax - movl %ecx,(%ebx) - - movl 4(%esi),%ecx - movl 4(%edi),%edx - subl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - subl %edx,%ecx - adcl $0,%eax - movl %ecx,4(%ebx) - - movl 8(%esi),%ecx - movl 
8(%edi),%edx - subl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - subl %edx,%ecx - adcl $0,%eax - movl %ecx,8(%ebx) - - movl 12(%esi),%ecx - movl 12(%edi),%edx - subl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - subl %edx,%ecx - adcl $0,%eax - movl %ecx,12(%ebx) - - movl 16(%esi),%ecx - movl 16(%edi),%edx - subl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - subl %edx,%ecx - adcl $0,%eax - movl %ecx,16(%ebx) - - movl 20(%esi),%ecx - movl 20(%edi),%edx - subl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - subl %edx,%ecx - adcl $0,%eax - movl %ecx,20(%ebx) - - movl 24(%esi),%ecx - movl 24(%edi),%edx - subl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - subl %edx,%ecx - adcl $0,%eax - movl %ecx,24(%ebx) - - movl 28(%esi),%ecx - movl 28(%edi),%edx - subl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - subl %edx,%ecx - adcl $0,%eax - movl %ecx,28(%ebx) - - addl $32,%esi - addl $32,%edi - addl $32,%ebx - subl $8,%ebp - jnz .L027aw_loop -.L026aw_finish: - movl 32(%esp),%ebp - andl $7,%ebp - jz .L028aw_end - - movl (%esi),%ecx - movl (%edi),%edx - subl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - subl %edx,%ecx - adcl $0,%eax - decl %ebp - movl %ecx,(%ebx) - jz .L028aw_end - - movl 4(%esi),%ecx - movl 4(%edi),%edx - subl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - subl %edx,%ecx - adcl $0,%eax - decl %ebp - movl %ecx,4(%ebx) - jz .L028aw_end - - movl 8(%esi),%ecx - movl 8(%edi),%edx - subl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - subl %edx,%ecx - adcl $0,%eax - decl %ebp - movl %ecx,8(%ebx) - jz .L028aw_end - - movl 12(%esi),%ecx - movl 12(%edi),%edx - subl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - subl %edx,%ecx - adcl $0,%eax - decl %ebp - movl %ecx,12(%ebx) - jz .L028aw_end - - movl 16(%esi),%ecx - movl 16(%edi),%edx - subl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - subl %edx,%ecx - adcl $0,%eax - decl %ebp - movl %ecx,16(%ebx) - jz .L028aw_end - - movl 20(%esi),%ecx - movl 20(%edi),%edx - subl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - subl %edx,%ecx - adcl $0,%eax - decl %ebp - 
movl %ecx,20(%ebx) - jz .L028aw_end - - movl 24(%esi),%ecx - movl 24(%edi),%edx - subl %eax,%ecx - movl $0,%eax - adcl %eax,%eax - subl %edx,%ecx - adcl $0,%eax - movl %ecx,24(%ebx) -.L028aw_end: - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.size bn_sub_words,.-.L_bn_sub_words_begin -#endif -.section .note.GNU-stack,"",@progbits diff --git a/third_party/boringssl/linux-x86/crypto/fipsmodule/co-586.S b/third_party/boringssl/linux-x86/crypto/fipsmodule/co-586.S deleted file mode 100644 index f1e67caf..00000000 --- a/third_party/boringssl/linux-x86/crypto/fipsmodule/co-586.S +++ /dev/null @@ -1,1266 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if defined(__i386__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text -.globl bn_mul_comba8 -.hidden bn_mul_comba8 -.type bn_mul_comba8,@function -.align 16 -bn_mul_comba8: -.L_bn_mul_comba8_begin: - pushl %esi - movl 12(%esp),%esi - pushl %edi - movl 20(%esp),%edi - pushl %ebp - pushl %ebx - xorl %ebx,%ebx - movl (%esi),%eax - xorl %ecx,%ecx - movl (%edi),%edx - - xorl %ebp,%ebp - - mull %edx - addl %eax,%ebx - movl 20(%esp),%eax - adcl %edx,%ecx - movl (%edi),%edx - adcl $0,%ebp - movl %ebx,(%eax) - movl 4(%esi),%eax - - - xorl %ebx,%ebx - - mull %edx - addl %eax,%ecx - movl (%esi),%eax - adcl %edx,%ebp - movl 4(%edi),%edx - adcl $0,%ebx - - mull %edx - addl %eax,%ecx - movl 20(%esp),%eax - adcl %edx,%ebp - movl (%edi),%edx - adcl $0,%ebx - movl %ecx,4(%eax) - movl 8(%esi),%eax - - - xorl %ecx,%ecx - - mull %edx - addl %eax,%ebp - movl 4(%esi),%eax - adcl %edx,%ebx - movl 4(%edi),%edx - adcl $0,%ecx - - mull %edx - addl %eax,%ebp - movl (%esi),%eax - adcl %edx,%ebx - movl 8(%edi),%edx - adcl $0,%ecx - - mull %edx - addl %eax,%ebp - movl 20(%esp),%eax - adcl %edx,%ebx - movl (%edi),%edx - adcl $0,%ecx - movl %ebp,8(%eax) - movl 12(%esi),%eax - - - xorl %ebp,%ebp - - mull %edx - addl %eax,%ebx - movl 8(%esi),%eax - adcl %edx,%ecx 
- movl 4(%edi),%edx - adcl $0,%ebp - - mull %edx - addl %eax,%ebx - movl 4(%esi),%eax - adcl %edx,%ecx - movl 8(%edi),%edx - adcl $0,%ebp - - mull %edx - addl %eax,%ebx - movl (%esi),%eax - adcl %edx,%ecx - movl 12(%edi),%edx - adcl $0,%ebp - - mull %edx - addl %eax,%ebx - movl 20(%esp),%eax - adcl %edx,%ecx - movl (%edi),%edx - adcl $0,%ebp - movl %ebx,12(%eax) - movl 16(%esi),%eax - - - xorl %ebx,%ebx - - mull %edx - addl %eax,%ecx - movl 12(%esi),%eax - adcl %edx,%ebp - movl 4(%edi),%edx - adcl $0,%ebx - - mull %edx - addl %eax,%ecx - movl 8(%esi),%eax - adcl %edx,%ebp - movl 8(%edi),%edx - adcl $0,%ebx - - mull %edx - addl %eax,%ecx - movl 4(%esi),%eax - adcl %edx,%ebp - movl 12(%edi),%edx - adcl $0,%ebx - - mull %edx - addl %eax,%ecx - movl (%esi),%eax - adcl %edx,%ebp - movl 16(%edi),%edx - adcl $0,%ebx - - mull %edx - addl %eax,%ecx - movl 20(%esp),%eax - adcl %edx,%ebp - movl (%edi),%edx - adcl $0,%ebx - movl %ecx,16(%eax) - movl 20(%esi),%eax - - - xorl %ecx,%ecx - - mull %edx - addl %eax,%ebp - movl 16(%esi),%eax - adcl %edx,%ebx - movl 4(%edi),%edx - adcl $0,%ecx - - mull %edx - addl %eax,%ebp - movl 12(%esi),%eax - adcl %edx,%ebx - movl 8(%edi),%edx - adcl $0,%ecx - - mull %edx - addl %eax,%ebp - movl 8(%esi),%eax - adcl %edx,%ebx - movl 12(%edi),%edx - adcl $0,%ecx - - mull %edx - addl %eax,%ebp - movl 4(%esi),%eax - adcl %edx,%ebx - movl 16(%edi),%edx - adcl $0,%ecx - - mull %edx - addl %eax,%ebp - movl (%esi),%eax - adcl %edx,%ebx - movl 20(%edi),%edx - adcl $0,%ecx - - mull %edx - addl %eax,%ebp - movl 20(%esp),%eax - adcl %edx,%ebx - movl (%edi),%edx - adcl $0,%ecx - movl %ebp,20(%eax) - movl 24(%esi),%eax - - - xorl %ebp,%ebp - - mull %edx - addl %eax,%ebx - movl 20(%esi),%eax - adcl %edx,%ecx - movl 4(%edi),%edx - adcl $0,%ebp - - mull %edx - addl %eax,%ebx - movl 16(%esi),%eax - adcl %edx,%ecx - movl 8(%edi),%edx - adcl $0,%ebp - - mull %edx - addl %eax,%ebx - movl 12(%esi),%eax - adcl %edx,%ecx - movl 12(%edi),%edx - adcl $0,%ebp - - mull %edx 
- addl %eax,%ebx - movl 8(%esi),%eax - adcl %edx,%ecx - movl 16(%edi),%edx - adcl $0,%ebp - - mull %edx - addl %eax,%ebx - movl 4(%esi),%eax - adcl %edx,%ecx - movl 20(%edi),%edx - adcl $0,%ebp - - mull %edx - addl %eax,%ebx - movl (%esi),%eax - adcl %edx,%ecx - movl 24(%edi),%edx - adcl $0,%ebp - - mull %edx - addl %eax,%ebx - movl 20(%esp),%eax - adcl %edx,%ecx - movl (%edi),%edx - adcl $0,%ebp - movl %ebx,24(%eax) - movl 28(%esi),%eax - - - xorl %ebx,%ebx - - mull %edx - addl %eax,%ecx - movl 24(%esi),%eax - adcl %edx,%ebp - movl 4(%edi),%edx - adcl $0,%ebx - - mull %edx - addl %eax,%ecx - movl 20(%esi),%eax - adcl %edx,%ebp - movl 8(%edi),%edx - adcl $0,%ebx - - mull %edx - addl %eax,%ecx - movl 16(%esi),%eax - adcl %edx,%ebp - movl 12(%edi),%edx - adcl $0,%ebx - - mull %edx - addl %eax,%ecx - movl 12(%esi),%eax - adcl %edx,%ebp - movl 16(%edi),%edx - adcl $0,%ebx - - mull %edx - addl %eax,%ecx - movl 8(%esi),%eax - adcl %edx,%ebp - movl 20(%edi),%edx - adcl $0,%ebx - - mull %edx - addl %eax,%ecx - movl 4(%esi),%eax - adcl %edx,%ebp - movl 24(%edi),%edx - adcl $0,%ebx - - mull %edx - addl %eax,%ecx - movl (%esi),%eax - adcl %edx,%ebp - movl 28(%edi),%edx - adcl $0,%ebx - - mull %edx - addl %eax,%ecx - movl 20(%esp),%eax - adcl %edx,%ebp - movl 4(%edi),%edx - adcl $0,%ebx - movl %ecx,28(%eax) - movl 28(%esi),%eax - - - xorl %ecx,%ecx - - mull %edx - addl %eax,%ebp - movl 24(%esi),%eax - adcl %edx,%ebx - movl 8(%edi),%edx - adcl $0,%ecx - - mull %edx - addl %eax,%ebp - movl 20(%esi),%eax - adcl %edx,%ebx - movl 12(%edi),%edx - adcl $0,%ecx - - mull %edx - addl %eax,%ebp - movl 16(%esi),%eax - adcl %edx,%ebx - movl 16(%edi),%edx - adcl $0,%ecx - - mull %edx - addl %eax,%ebp - movl 12(%esi),%eax - adcl %edx,%ebx - movl 20(%edi),%edx - adcl $0,%ecx - - mull %edx - addl %eax,%ebp - movl 8(%esi),%eax - adcl %edx,%ebx - movl 24(%edi),%edx - adcl $0,%ecx - - mull %edx - addl %eax,%ebp - movl 4(%esi),%eax - adcl %edx,%ebx - movl 28(%edi),%edx - adcl $0,%ecx - - mull %edx 
- addl %eax,%ebp - movl 20(%esp),%eax - adcl %edx,%ebx - movl 8(%edi),%edx - adcl $0,%ecx - movl %ebp,32(%eax) - movl 28(%esi),%eax - - - xorl %ebp,%ebp - - mull %edx - addl %eax,%ebx - movl 24(%esi),%eax - adcl %edx,%ecx - movl 12(%edi),%edx - adcl $0,%ebp - - mull %edx - addl %eax,%ebx - movl 20(%esi),%eax - adcl %edx,%ecx - movl 16(%edi),%edx - adcl $0,%ebp - - mull %edx - addl %eax,%ebx - movl 16(%esi),%eax - adcl %edx,%ecx - movl 20(%edi),%edx - adcl $0,%ebp - - mull %edx - addl %eax,%ebx - movl 12(%esi),%eax - adcl %edx,%ecx - movl 24(%edi),%edx - adcl $0,%ebp - - mull %edx - addl %eax,%ebx - movl 8(%esi),%eax - adcl %edx,%ecx - movl 28(%edi),%edx - adcl $0,%ebp - - mull %edx - addl %eax,%ebx - movl 20(%esp),%eax - adcl %edx,%ecx - movl 12(%edi),%edx - adcl $0,%ebp - movl %ebx,36(%eax) - movl 28(%esi),%eax - - - xorl %ebx,%ebx - - mull %edx - addl %eax,%ecx - movl 24(%esi),%eax - adcl %edx,%ebp - movl 16(%edi),%edx - adcl $0,%ebx - - mull %edx - addl %eax,%ecx - movl 20(%esi),%eax - adcl %edx,%ebp - movl 20(%edi),%edx - adcl $0,%ebx - - mull %edx - addl %eax,%ecx - movl 16(%esi),%eax - adcl %edx,%ebp - movl 24(%edi),%edx - adcl $0,%ebx - - mull %edx - addl %eax,%ecx - movl 12(%esi),%eax - adcl %edx,%ebp - movl 28(%edi),%edx - adcl $0,%ebx - - mull %edx - addl %eax,%ecx - movl 20(%esp),%eax - adcl %edx,%ebp - movl 16(%edi),%edx - adcl $0,%ebx - movl %ecx,40(%eax) - movl 28(%esi),%eax - - - xorl %ecx,%ecx - - mull %edx - addl %eax,%ebp - movl 24(%esi),%eax - adcl %edx,%ebx - movl 20(%edi),%edx - adcl $0,%ecx - - mull %edx - addl %eax,%ebp - movl 20(%esi),%eax - adcl %edx,%ebx - movl 24(%edi),%edx - adcl $0,%ecx - - mull %edx - addl %eax,%ebp - movl 16(%esi),%eax - adcl %edx,%ebx - movl 28(%edi),%edx - adcl $0,%ecx - - mull %edx - addl %eax,%ebp - movl 20(%esp),%eax - adcl %edx,%ebx - movl 20(%edi),%edx - adcl $0,%ecx - movl %ebp,44(%eax) - movl 28(%esi),%eax - - - xorl %ebp,%ebp - - mull %edx - addl %eax,%ebx - movl 24(%esi),%eax - adcl %edx,%ecx - movl 
24(%edi),%edx - adcl $0,%ebp - - mull %edx - addl %eax,%ebx - movl 20(%esi),%eax - adcl %edx,%ecx - movl 28(%edi),%edx - adcl $0,%ebp - - mull %edx - addl %eax,%ebx - movl 20(%esp),%eax - adcl %edx,%ecx - movl 24(%edi),%edx - adcl $0,%ebp - movl %ebx,48(%eax) - movl 28(%esi),%eax - - - xorl %ebx,%ebx - - mull %edx - addl %eax,%ecx - movl 24(%esi),%eax - adcl %edx,%ebp - movl 28(%edi),%edx - adcl $0,%ebx - - mull %edx - addl %eax,%ecx - movl 20(%esp),%eax - adcl %edx,%ebp - movl 28(%edi),%edx - adcl $0,%ebx - movl %ecx,52(%eax) - movl 28(%esi),%eax - - - xorl %ecx,%ecx - - mull %edx - addl %eax,%ebp - movl 20(%esp),%eax - adcl %edx,%ebx - adcl $0,%ecx - movl %ebp,56(%eax) - - - movl %ebx,60(%eax) - popl %ebx - popl %ebp - popl %edi - popl %esi - ret -.size bn_mul_comba8,.-.L_bn_mul_comba8_begin -.globl bn_mul_comba4 -.hidden bn_mul_comba4 -.type bn_mul_comba4,@function -.align 16 -bn_mul_comba4: -.L_bn_mul_comba4_begin: - pushl %esi - movl 12(%esp),%esi - pushl %edi - movl 20(%esp),%edi - pushl %ebp - pushl %ebx - xorl %ebx,%ebx - movl (%esi),%eax - xorl %ecx,%ecx - movl (%edi),%edx - - xorl %ebp,%ebp - - mull %edx - addl %eax,%ebx - movl 20(%esp),%eax - adcl %edx,%ecx - movl (%edi),%edx - adcl $0,%ebp - movl %ebx,(%eax) - movl 4(%esi),%eax - - - xorl %ebx,%ebx - - mull %edx - addl %eax,%ecx - movl (%esi),%eax - adcl %edx,%ebp - movl 4(%edi),%edx - adcl $0,%ebx - - mull %edx - addl %eax,%ecx - movl 20(%esp),%eax - adcl %edx,%ebp - movl (%edi),%edx - adcl $0,%ebx - movl %ecx,4(%eax) - movl 8(%esi),%eax - - - xorl %ecx,%ecx - - mull %edx - addl %eax,%ebp - movl 4(%esi),%eax - adcl %edx,%ebx - movl 4(%edi),%edx - adcl $0,%ecx - - mull %edx - addl %eax,%ebp - movl (%esi),%eax - adcl %edx,%ebx - movl 8(%edi),%edx - adcl $0,%ecx - - mull %edx - addl %eax,%ebp - movl 20(%esp),%eax - adcl %edx,%ebx - movl (%edi),%edx - adcl $0,%ecx - movl %ebp,8(%eax) - movl 12(%esi),%eax - - - xorl %ebp,%ebp - - mull %edx - addl %eax,%ebx - movl 8(%esi),%eax - adcl %edx,%ecx - movl 
4(%edi),%edx - adcl $0,%ebp - - mull %edx - addl %eax,%ebx - movl 4(%esi),%eax - adcl %edx,%ecx - movl 8(%edi),%edx - adcl $0,%ebp - - mull %edx - addl %eax,%ebx - movl (%esi),%eax - adcl %edx,%ecx - movl 12(%edi),%edx - adcl $0,%ebp - - mull %edx - addl %eax,%ebx - movl 20(%esp),%eax - adcl %edx,%ecx - movl 4(%edi),%edx - adcl $0,%ebp - movl %ebx,12(%eax) - movl 12(%esi),%eax - - - xorl %ebx,%ebx - - mull %edx - addl %eax,%ecx - movl 8(%esi),%eax - adcl %edx,%ebp - movl 8(%edi),%edx - adcl $0,%ebx - - mull %edx - addl %eax,%ecx - movl 4(%esi),%eax - adcl %edx,%ebp - movl 12(%edi),%edx - adcl $0,%ebx - - mull %edx - addl %eax,%ecx - movl 20(%esp),%eax - adcl %edx,%ebp - movl 8(%edi),%edx - adcl $0,%ebx - movl %ecx,16(%eax) - movl 12(%esi),%eax - - - xorl %ecx,%ecx - - mull %edx - addl %eax,%ebp - movl 8(%esi),%eax - adcl %edx,%ebx - movl 12(%edi),%edx - adcl $0,%ecx - - mull %edx - addl %eax,%ebp - movl 20(%esp),%eax - adcl %edx,%ebx - movl 12(%edi),%edx - adcl $0,%ecx - movl %ebp,20(%eax) - movl 12(%esi),%eax - - - xorl %ebp,%ebp - - mull %edx - addl %eax,%ebx - movl 20(%esp),%eax - adcl %edx,%ecx - adcl $0,%ebp - movl %ebx,24(%eax) - - - movl %ecx,28(%eax) - popl %ebx - popl %ebp - popl %edi - popl %esi - ret -.size bn_mul_comba4,.-.L_bn_mul_comba4_begin -.globl bn_sqr_comba8 -.hidden bn_sqr_comba8 -.type bn_sqr_comba8,@function -.align 16 -bn_sqr_comba8: -.L_bn_sqr_comba8_begin: - pushl %esi - pushl %edi - pushl %ebp - pushl %ebx - movl 20(%esp),%edi - movl 24(%esp),%esi - xorl %ebx,%ebx - xorl %ecx,%ecx - movl (%esi),%eax - - xorl %ebp,%ebp - - mull %eax - addl %eax,%ebx - adcl %edx,%ecx - movl (%esi),%edx - adcl $0,%ebp - movl %ebx,(%edi) - movl 4(%esi),%eax - - - xorl %ebx,%ebx - - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebx - addl %eax,%ecx - adcl %edx,%ebp - movl 8(%esi),%eax - adcl $0,%ebx - movl %ecx,4(%edi) - movl (%esi),%edx - - - xorl %ecx,%ecx - - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ecx - addl %eax,%ebp - adcl 
%edx,%ebx - movl 4(%esi),%eax - adcl $0,%ecx - - mull %eax - addl %eax,%ebp - adcl %edx,%ebx - movl (%esi),%edx - adcl $0,%ecx - movl %ebp,8(%edi) - movl 12(%esi),%eax - - - xorl %ebp,%ebp - - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebp - addl %eax,%ebx - adcl %edx,%ecx - movl 8(%esi),%eax - adcl $0,%ebp - movl 4(%esi),%edx - - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebp - addl %eax,%ebx - adcl %edx,%ecx - movl 16(%esi),%eax - adcl $0,%ebp - movl %ebx,12(%edi) - movl (%esi),%edx - - - xorl %ebx,%ebx - - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebx - addl %eax,%ecx - adcl %edx,%ebp - movl 12(%esi),%eax - adcl $0,%ebx - movl 4(%esi),%edx - - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebx - addl %eax,%ecx - adcl %edx,%ebp - movl 8(%esi),%eax - adcl $0,%ebx - - mull %eax - addl %eax,%ecx - adcl %edx,%ebp - movl (%esi),%edx - adcl $0,%ebx - movl %ecx,16(%edi) - movl 20(%esi),%eax - - - xorl %ecx,%ecx - - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ecx - addl %eax,%ebp - adcl %edx,%ebx - movl 16(%esi),%eax - adcl $0,%ecx - movl 4(%esi),%edx - - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ecx - addl %eax,%ebp - adcl %edx,%ebx - movl 12(%esi),%eax - adcl $0,%ecx - movl 8(%esi),%edx - - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ecx - addl %eax,%ebp - adcl %edx,%ebx - movl 24(%esi),%eax - adcl $0,%ecx - movl %ebp,20(%edi) - movl (%esi),%edx - - - xorl %ebp,%ebp - - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebp - addl %eax,%ebx - adcl %edx,%ecx - movl 20(%esi),%eax - adcl $0,%ebp - movl 4(%esi),%edx - - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebp - addl %eax,%ebx - adcl %edx,%ecx - movl 16(%esi),%eax - adcl $0,%ebp - movl 8(%esi),%edx - - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebp - addl %eax,%ebx - adcl %edx,%ecx - movl 12(%esi),%eax - adcl $0,%ebp - - mull %eax - addl %eax,%ebx - adcl %edx,%ecx - movl (%esi),%edx - adcl $0,%ebp - movl 
%ebx,24(%edi) - movl 28(%esi),%eax - - - xorl %ebx,%ebx - - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebx - addl %eax,%ecx - adcl %edx,%ebp - movl 24(%esi),%eax - adcl $0,%ebx - movl 4(%esi),%edx - - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebx - addl %eax,%ecx - adcl %edx,%ebp - movl 20(%esi),%eax - adcl $0,%ebx - movl 8(%esi),%edx - - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebx - addl %eax,%ecx - adcl %edx,%ebp - movl 16(%esi),%eax - adcl $0,%ebx - movl 12(%esi),%edx - - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebx - addl %eax,%ecx - adcl %edx,%ebp - movl 28(%esi),%eax - adcl $0,%ebx - movl %ecx,28(%edi) - movl 4(%esi),%edx - - - xorl %ecx,%ecx - - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ecx - addl %eax,%ebp - adcl %edx,%ebx - movl 24(%esi),%eax - adcl $0,%ecx - movl 8(%esi),%edx - - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ecx - addl %eax,%ebp - adcl %edx,%ebx - movl 20(%esi),%eax - adcl $0,%ecx - movl 12(%esi),%edx - - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ecx - addl %eax,%ebp - adcl %edx,%ebx - movl 16(%esi),%eax - adcl $0,%ecx - - mull %eax - addl %eax,%ebp - adcl %edx,%ebx - movl 8(%esi),%edx - adcl $0,%ecx - movl %ebp,32(%edi) - movl 28(%esi),%eax - - - xorl %ebp,%ebp - - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebp - addl %eax,%ebx - adcl %edx,%ecx - movl 24(%esi),%eax - adcl $0,%ebp - movl 12(%esi),%edx - - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebp - addl %eax,%ebx - adcl %edx,%ecx - movl 20(%esi),%eax - adcl $0,%ebp - movl 16(%esi),%edx - - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebp - addl %eax,%ebx - adcl %edx,%ecx - movl 28(%esi),%eax - adcl $0,%ebp - movl %ebx,36(%edi) - movl 12(%esi),%edx - - - xorl %ebx,%ebx - - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebx - addl %eax,%ecx - adcl %edx,%ebp - movl 24(%esi),%eax - adcl $0,%ebx - movl 16(%esi),%edx - - mull %edx - addl %eax,%eax - adcl 
%edx,%edx - adcl $0,%ebx - addl %eax,%ecx - adcl %edx,%ebp - movl 20(%esi),%eax - adcl $0,%ebx - - mull %eax - addl %eax,%ecx - adcl %edx,%ebp - movl 16(%esi),%edx - adcl $0,%ebx - movl %ecx,40(%edi) - movl 28(%esi),%eax - - - xorl %ecx,%ecx - - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ecx - addl %eax,%ebp - adcl %edx,%ebx - movl 24(%esi),%eax - adcl $0,%ecx - movl 20(%esi),%edx - - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ecx - addl %eax,%ebp - adcl %edx,%ebx - movl 28(%esi),%eax - adcl $0,%ecx - movl %ebp,44(%edi) - movl 20(%esi),%edx - - - xorl %ebp,%ebp - - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebp - addl %eax,%ebx - adcl %edx,%ecx - movl 24(%esi),%eax - adcl $0,%ebp - - mull %eax - addl %eax,%ebx - adcl %edx,%ecx - movl 24(%esi),%edx - adcl $0,%ebp - movl %ebx,48(%edi) - movl 28(%esi),%eax - - - xorl %ebx,%ebx - - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebx - addl %eax,%ecx - adcl %edx,%ebp - movl 28(%esi),%eax - adcl $0,%ebx - movl %ecx,52(%edi) - - - xorl %ecx,%ecx - - mull %eax - addl %eax,%ebp - adcl %edx,%ebx - adcl $0,%ecx - movl %ebp,56(%edi) - - movl %ebx,60(%edi) - popl %ebx - popl %ebp - popl %edi - popl %esi - ret -.size bn_sqr_comba8,.-.L_bn_sqr_comba8_begin -.globl bn_sqr_comba4 -.hidden bn_sqr_comba4 -.type bn_sqr_comba4,@function -.align 16 -bn_sqr_comba4: -.L_bn_sqr_comba4_begin: - pushl %esi - pushl %edi - pushl %ebp - pushl %ebx - movl 20(%esp),%edi - movl 24(%esp),%esi - xorl %ebx,%ebx - xorl %ecx,%ecx - movl (%esi),%eax - - xorl %ebp,%ebp - - mull %eax - addl %eax,%ebx - adcl %edx,%ecx - movl (%esi),%edx - adcl $0,%ebp - movl %ebx,(%edi) - movl 4(%esi),%eax - - - xorl %ebx,%ebx - - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebx - addl %eax,%ecx - adcl %edx,%ebp - movl 8(%esi),%eax - adcl $0,%ebx - movl %ecx,4(%edi) - movl (%esi),%edx - - - xorl %ecx,%ecx - - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ecx - addl %eax,%ebp - adcl %edx,%ebx - movl 4(%esi),%eax 
- adcl $0,%ecx - - mull %eax - addl %eax,%ebp - adcl %edx,%ebx - movl (%esi),%edx - adcl $0,%ecx - movl %ebp,8(%edi) - movl 12(%esi),%eax - - - xorl %ebp,%ebp - - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebp - addl %eax,%ebx - adcl %edx,%ecx - movl 8(%esi),%eax - adcl $0,%ebp - movl 4(%esi),%edx - - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebp - addl %eax,%ebx - adcl %edx,%ecx - movl 12(%esi),%eax - adcl $0,%ebp - movl %ebx,12(%edi) - movl 4(%esi),%edx - - - xorl %ebx,%ebx - - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ebx - addl %eax,%ecx - adcl %edx,%ebp - movl 8(%esi),%eax - adcl $0,%ebx - - mull %eax - addl %eax,%ecx - adcl %edx,%ebp - movl 8(%esi),%edx - adcl $0,%ebx - movl %ecx,16(%edi) - movl 12(%esi),%eax - - - xorl %ecx,%ecx - - mull %edx - addl %eax,%eax - adcl %edx,%edx - adcl $0,%ecx - addl %eax,%ebp - adcl %edx,%ebx - movl 12(%esi),%eax - adcl $0,%ecx - movl %ebp,20(%edi) - - - xorl %ebp,%ebp - - mull %eax - addl %eax,%ebx - adcl %edx,%ecx - adcl $0,%ebp - movl %ebx,24(%edi) - - movl %ecx,28(%edi) - popl %ebx - popl %ebp - popl %edi - popl %esi - ret -.size bn_sqr_comba4,.-.L_bn_sqr_comba4_begin -#endif -.section .note.GNU-stack,"",@progbits diff --git a/third_party/boringssl/linux-x86/crypto/fipsmodule/ghash-ssse3-x86.S b/third_party/boringssl/linux-x86/crypto/fipsmodule/ghash-ssse3-x86.S deleted file mode 100644 index 840e4389..00000000 --- a/third_party/boringssl/linux-x86/crypto/fipsmodule/ghash-ssse3-x86.S +++ /dev/null @@ -1,294 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#if defined(__i386__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text -.globl gcm_gmult_ssse3 -.hidden gcm_gmult_ssse3 -.type gcm_gmult_ssse3,@function -.align 16 -gcm_gmult_ssse3: -.L_gcm_gmult_ssse3_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - movl 20(%esp),%edi - movl 24(%esp),%esi - movdqu (%edi),%xmm0 - call .L000pic_point -.L000pic_point: - popl %eax - movdqa .Lreverse_bytes-.L000pic_point(%eax),%xmm7 - movdqa .Llow4_mask-.L000pic_point(%eax),%xmm2 -.byte 102,15,56,0,199 - movdqa %xmm2,%xmm1 - pandn %xmm0,%xmm1 - psrld $4,%xmm1 - pand %xmm2,%xmm0 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - movl $5,%eax -.L001loop_row_1: - movdqa (%esi),%xmm4 - leal 16(%esi),%esi - movdqa %xmm2,%xmm6 -.byte 102,15,58,15,243,1 - movdqa %xmm6,%xmm3 - psrldq $1,%xmm2 - movdqa %xmm4,%xmm5 -.byte 102,15,56,0,224 -.byte 102,15,56,0,233 - pxor %xmm5,%xmm2 - movdqa %xmm4,%xmm5 - psllq $60,%xmm5 - movdqa %xmm5,%xmm6 - pslldq $8,%xmm6 - pxor %xmm6,%xmm3 - psrldq $8,%xmm5 - pxor %xmm5,%xmm2 - psrlq $4,%xmm4 - pxor %xmm4,%xmm2 - subl $1,%eax - jnz .L001loop_row_1 - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $5,%xmm3 - pxor %xmm3,%xmm2 - pxor %xmm3,%xmm3 - movl $5,%eax -.L002loop_row_2: - movdqa (%esi),%xmm4 - leal 16(%esi),%esi - movdqa %xmm2,%xmm6 -.byte 102,15,58,15,243,1 - movdqa %xmm6,%xmm3 - psrldq $1,%xmm2 - movdqa %xmm4,%xmm5 -.byte 102,15,56,0,224 -.byte 102,15,56,0,233 - pxor %xmm5,%xmm2 - movdqa %xmm4,%xmm5 - psllq $60,%xmm5 - movdqa %xmm5,%xmm6 - pslldq $8,%xmm6 - pxor %xmm6,%xmm3 - psrldq $8,%xmm5 - pxor %xmm5,%xmm2 - psrlq $4,%xmm4 - pxor %xmm4,%xmm2 - subl $1,%eax - jnz .L002loop_row_2 - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $5,%xmm3 - pxor %xmm3,%xmm2 - pxor %xmm3,%xmm3 - movl $6,%eax -.L003loop_row_3: - movdqa (%esi),%xmm4 - leal 16(%esi),%esi - movdqa %xmm2,%xmm6 -.byte 102,15,58,15,243,1 - movdqa %xmm6,%xmm3 - psrldq $1,%xmm2 - movdqa 
%xmm4,%xmm5 -.byte 102,15,56,0,224 -.byte 102,15,56,0,233 - pxor %xmm5,%xmm2 - movdqa %xmm4,%xmm5 - psllq $60,%xmm5 - movdqa %xmm5,%xmm6 - pslldq $8,%xmm6 - pxor %xmm6,%xmm3 - psrldq $8,%xmm5 - pxor %xmm5,%xmm2 - psrlq $4,%xmm4 - pxor %xmm4,%xmm2 - subl $1,%eax - jnz .L003loop_row_3 - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $5,%xmm3 - pxor %xmm3,%xmm2 - pxor %xmm3,%xmm3 -.byte 102,15,56,0,215 - movdqu %xmm2,(%edi) - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.size gcm_gmult_ssse3,.-.L_gcm_gmult_ssse3_begin -.globl gcm_ghash_ssse3 -.hidden gcm_ghash_ssse3 -.type gcm_ghash_ssse3,@function -.align 16 -gcm_ghash_ssse3: -.L_gcm_ghash_ssse3_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - movl 20(%esp),%edi - movl 24(%esp),%esi - movl 28(%esp),%edx - movl 32(%esp),%ecx - movdqu (%edi),%xmm0 - call .L004pic_point -.L004pic_point: - popl %ebx - movdqa .Lreverse_bytes-.L004pic_point(%ebx),%xmm7 - andl $-16,%ecx -.byte 102,15,56,0,199 - pxor %xmm3,%xmm3 -.L005loop_ghash: - movdqa .Llow4_mask-.L004pic_point(%ebx),%xmm2 - movdqu (%edx),%xmm1 -.byte 102,15,56,0,207 - pxor %xmm1,%xmm0 - movdqa %xmm2,%xmm1 - pandn %xmm0,%xmm1 - psrld $4,%xmm1 - pand %xmm2,%xmm0 - pxor %xmm2,%xmm2 - movl $5,%eax -.L006loop_row_4: - movdqa (%esi),%xmm4 - leal 16(%esi),%esi - movdqa %xmm2,%xmm6 -.byte 102,15,58,15,243,1 - movdqa %xmm6,%xmm3 - psrldq $1,%xmm2 - movdqa %xmm4,%xmm5 -.byte 102,15,56,0,224 -.byte 102,15,56,0,233 - pxor %xmm5,%xmm2 - movdqa %xmm4,%xmm5 - psllq $60,%xmm5 - movdqa %xmm5,%xmm6 - pslldq $8,%xmm6 - pxor %xmm6,%xmm3 - psrldq $8,%xmm5 - pxor %xmm5,%xmm2 - psrlq $4,%xmm4 - pxor %xmm4,%xmm2 - subl $1,%eax - jnz .L006loop_row_4 - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $5,%xmm3 - pxor %xmm3,%xmm2 - pxor %xmm3,%xmm3 - 
movl $5,%eax -.L007loop_row_5: - movdqa (%esi),%xmm4 - leal 16(%esi),%esi - movdqa %xmm2,%xmm6 -.byte 102,15,58,15,243,1 - movdqa %xmm6,%xmm3 - psrldq $1,%xmm2 - movdqa %xmm4,%xmm5 -.byte 102,15,56,0,224 -.byte 102,15,56,0,233 - pxor %xmm5,%xmm2 - movdqa %xmm4,%xmm5 - psllq $60,%xmm5 - movdqa %xmm5,%xmm6 - pslldq $8,%xmm6 - pxor %xmm6,%xmm3 - psrldq $8,%xmm5 - pxor %xmm5,%xmm2 - psrlq $4,%xmm4 - pxor %xmm4,%xmm2 - subl $1,%eax - jnz .L007loop_row_5 - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $5,%xmm3 - pxor %xmm3,%xmm2 - pxor %xmm3,%xmm3 - movl $6,%eax -.L008loop_row_6: - movdqa (%esi),%xmm4 - leal 16(%esi),%esi - movdqa %xmm2,%xmm6 -.byte 102,15,58,15,243,1 - movdqa %xmm6,%xmm3 - psrldq $1,%xmm2 - movdqa %xmm4,%xmm5 -.byte 102,15,56,0,224 -.byte 102,15,56,0,233 - pxor %xmm5,%xmm2 - movdqa %xmm4,%xmm5 - psllq $60,%xmm5 - movdqa %xmm5,%xmm6 - pslldq $8,%xmm6 - pxor %xmm6,%xmm3 - psrldq $8,%xmm5 - pxor %xmm5,%xmm2 - psrlq $4,%xmm4 - pxor %xmm4,%xmm2 - subl $1,%eax - jnz .L008loop_row_6 - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $5,%xmm3 - pxor %xmm3,%xmm2 - pxor %xmm3,%xmm3 - movdqa %xmm2,%xmm0 - leal -256(%esi),%esi - leal 16(%edx),%edx - subl $16,%ecx - jnz .L005loop_ghash -.byte 102,15,56,0,199 - movdqu %xmm0,(%edi) - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.size gcm_ghash_ssse3,.-.L_gcm_ghash_ssse3_begin -.align 16 -.Lreverse_bytes: -.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 -.align 16 -.Llow4_mask: -.long 252645135,252645135,252645135,252645135 -#endif -.section .note.GNU-stack,"",@progbits diff --git a/third_party/boringssl/linux-x86/crypto/fipsmodule/ghash-x86.S b/third_party/boringssl/linux-x86/crypto/fipsmodule/ghash-x86.S deleted file mode 100644 index 13b94457..00000000 --- 
a/third_party/boringssl/linux-x86/crypto/fipsmodule/ghash-x86.S +++ /dev/null @@ -1,330 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if defined(__i386__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text -.globl gcm_init_clmul -.hidden gcm_init_clmul -.type gcm_init_clmul,@function -.align 16 -gcm_init_clmul: -.L_gcm_init_clmul_begin: - movl 4(%esp),%edx - movl 8(%esp),%eax - call .L000pic -.L000pic: - popl %ecx - leal .Lbswap-.L000pic(%ecx),%ecx - movdqu (%eax),%xmm2 - pshufd $78,%xmm2,%xmm2 - pshufd $255,%xmm2,%xmm4 - movdqa %xmm2,%xmm3 - psllq $1,%xmm2 - pxor %xmm5,%xmm5 - psrlq $63,%xmm3 - pcmpgtd %xmm4,%xmm5 - pslldq $8,%xmm3 - por %xmm3,%xmm2 - pand 16(%ecx),%xmm5 - pxor %xmm5,%xmm2 - movdqa %xmm2,%xmm0 - movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 - pshufd $78,%xmm2,%xmm4 - pxor %xmm0,%xmm3 - pxor %xmm2,%xmm4 -.byte 102,15,58,68,194,0 -.byte 102,15,58,68,202,17 -.byte 102,15,58,68,220,0 - xorps %xmm0,%xmm3 - xorps %xmm1,%xmm3 - movdqa %xmm3,%xmm4 - psrldq $8,%xmm3 - pslldq $8,%xmm4 - pxor %xmm3,%xmm1 - pxor %xmm4,%xmm0 - movdqa %xmm0,%xmm4 - movdqa %xmm0,%xmm3 - psllq $5,%xmm0 - pxor %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 - psllq $57,%xmm0 - movdqa %xmm0,%xmm3 - pslldq $8,%xmm0 - psrldq $8,%xmm3 - pxor %xmm4,%xmm0 - pxor %xmm3,%xmm1 - movdqa %xmm0,%xmm4 - psrlq $1,%xmm0 - pxor %xmm4,%xmm1 - pxor %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 - psrlq $1,%xmm0 - pxor %xmm1,%xmm0 - pshufd $78,%xmm2,%xmm3 - pshufd $78,%xmm0,%xmm4 - pxor %xmm2,%xmm3 - movdqu %xmm2,(%edx) - pxor %xmm0,%xmm4 - movdqu %xmm0,16(%edx) -.byte 102,15,58,15,227,8 - movdqu %xmm4,32(%edx) - ret -.size gcm_init_clmul,.-.L_gcm_init_clmul_begin -.globl gcm_gmult_clmul -.hidden gcm_gmult_clmul -.type gcm_gmult_clmul,@function -.align 16 -gcm_gmult_clmul: -.L_gcm_gmult_clmul_begin: - movl 4(%esp),%eax - movl 8(%esp),%edx - call .L001pic -.L001pic: - popl %ecx - leal .Lbswap-.L001pic(%ecx),%ecx - 
movdqu (%eax),%xmm0 - movdqa (%ecx),%xmm5 - movups (%edx),%xmm2 -.byte 102,15,56,0,197 - movups 32(%edx),%xmm4 - movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 - pxor %xmm0,%xmm3 -.byte 102,15,58,68,194,0 -.byte 102,15,58,68,202,17 -.byte 102,15,58,68,220,0 - xorps %xmm0,%xmm3 - xorps %xmm1,%xmm3 - movdqa %xmm3,%xmm4 - psrldq $8,%xmm3 - pslldq $8,%xmm4 - pxor %xmm3,%xmm1 - pxor %xmm4,%xmm0 - movdqa %xmm0,%xmm4 - movdqa %xmm0,%xmm3 - psllq $5,%xmm0 - pxor %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 - psllq $57,%xmm0 - movdqa %xmm0,%xmm3 - pslldq $8,%xmm0 - psrldq $8,%xmm3 - pxor %xmm4,%xmm0 - pxor %xmm3,%xmm1 - movdqa %xmm0,%xmm4 - psrlq $1,%xmm0 - pxor %xmm4,%xmm1 - pxor %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 - psrlq $1,%xmm0 - pxor %xmm1,%xmm0 -.byte 102,15,56,0,197 - movdqu %xmm0,(%eax) - ret -.size gcm_gmult_clmul,.-.L_gcm_gmult_clmul_begin -.globl gcm_ghash_clmul -.hidden gcm_ghash_clmul -.type gcm_ghash_clmul,@function -.align 16 -gcm_ghash_clmul: -.L_gcm_ghash_clmul_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - movl 20(%esp),%eax - movl 24(%esp),%edx - movl 28(%esp),%esi - movl 32(%esp),%ebx - call .L002pic -.L002pic: - popl %ecx - leal .Lbswap-.L002pic(%ecx),%ecx - movdqu (%eax),%xmm0 - movdqa (%ecx),%xmm5 - movdqu (%edx),%xmm2 -.byte 102,15,56,0,197 - subl $16,%ebx - jz .L003odd_tail - movdqu (%esi),%xmm3 - movdqu 16(%esi),%xmm6 -.byte 102,15,56,0,221 -.byte 102,15,56,0,245 - movdqu 32(%edx),%xmm5 - pxor %xmm3,%xmm0 - pshufd $78,%xmm6,%xmm3 - movdqa %xmm6,%xmm7 - pxor %xmm6,%xmm3 - leal 32(%esi),%esi -.byte 102,15,58,68,242,0 -.byte 102,15,58,68,250,17 -.byte 102,15,58,68,221,0 - movups 16(%edx),%xmm2 - nop - subl $32,%ebx - jbe .L004even_tail - jmp .L005mod_loop -.align 32 -.L005mod_loop: - pshufd $78,%xmm0,%xmm4 - movdqa %xmm0,%xmm1 - pxor %xmm0,%xmm4 - nop -.byte 102,15,58,68,194,0 -.byte 102,15,58,68,202,17 -.byte 102,15,58,68,229,16 - movups (%edx),%xmm2 - xorps %xmm6,%xmm0 - movdqa (%ecx),%xmm5 - xorps %xmm7,%xmm1 - movdqu 
(%esi),%xmm7 - pxor %xmm0,%xmm3 - movdqu 16(%esi),%xmm6 - pxor %xmm1,%xmm3 -.byte 102,15,56,0,253 - pxor %xmm3,%xmm4 - movdqa %xmm4,%xmm3 - psrldq $8,%xmm4 - pslldq $8,%xmm3 - pxor %xmm4,%xmm1 - pxor %xmm3,%xmm0 -.byte 102,15,56,0,245 - pxor %xmm7,%xmm1 - movdqa %xmm6,%xmm7 - movdqa %xmm0,%xmm4 - movdqa %xmm0,%xmm3 - psllq $5,%xmm0 - pxor %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 -.byte 102,15,58,68,242,0 - movups 32(%edx),%xmm5 - psllq $57,%xmm0 - movdqa %xmm0,%xmm3 - pslldq $8,%xmm0 - psrldq $8,%xmm3 - pxor %xmm4,%xmm0 - pxor %xmm3,%xmm1 - pshufd $78,%xmm7,%xmm3 - movdqa %xmm0,%xmm4 - psrlq $1,%xmm0 - pxor %xmm7,%xmm3 - pxor %xmm4,%xmm1 -.byte 102,15,58,68,250,17 - movups 16(%edx),%xmm2 - pxor %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 - psrlq $1,%xmm0 - pxor %xmm1,%xmm0 -.byte 102,15,58,68,221,0 - leal 32(%esi),%esi - subl $32,%ebx - ja .L005mod_loop -.L004even_tail: - pshufd $78,%xmm0,%xmm4 - movdqa %xmm0,%xmm1 - pxor %xmm0,%xmm4 -.byte 102,15,58,68,194,0 -.byte 102,15,58,68,202,17 -.byte 102,15,58,68,229,16 - movdqa (%ecx),%xmm5 - xorps %xmm6,%xmm0 - xorps %xmm7,%xmm1 - pxor %xmm0,%xmm3 - pxor %xmm1,%xmm3 - pxor %xmm3,%xmm4 - movdqa %xmm4,%xmm3 - psrldq $8,%xmm4 - pslldq $8,%xmm3 - pxor %xmm4,%xmm1 - pxor %xmm3,%xmm0 - movdqa %xmm0,%xmm4 - movdqa %xmm0,%xmm3 - psllq $5,%xmm0 - pxor %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 - psllq $57,%xmm0 - movdqa %xmm0,%xmm3 - pslldq $8,%xmm0 - psrldq $8,%xmm3 - pxor %xmm4,%xmm0 - pxor %xmm3,%xmm1 - movdqa %xmm0,%xmm4 - psrlq $1,%xmm0 - pxor %xmm4,%xmm1 - pxor %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 - psrlq $1,%xmm0 - pxor %xmm1,%xmm0 - testl %ebx,%ebx - jnz .L006done - movups (%edx),%xmm2 -.L003odd_tail: - movdqu (%esi),%xmm3 -.byte 102,15,56,0,221 - pxor %xmm3,%xmm0 - movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 - pshufd $78,%xmm2,%xmm4 - pxor %xmm0,%xmm3 - pxor %xmm2,%xmm4 -.byte 102,15,58,68,194,0 -.byte 102,15,58,68,202,17 -.byte 102,15,58,68,220,0 - xorps %xmm0,%xmm3 - xorps %xmm1,%xmm3 - 
movdqa %xmm3,%xmm4 - psrldq $8,%xmm3 - pslldq $8,%xmm4 - pxor %xmm3,%xmm1 - pxor %xmm4,%xmm0 - movdqa %xmm0,%xmm4 - movdqa %xmm0,%xmm3 - psllq $5,%xmm0 - pxor %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 - psllq $57,%xmm0 - movdqa %xmm0,%xmm3 - pslldq $8,%xmm0 - psrldq $8,%xmm3 - pxor %xmm4,%xmm0 - pxor %xmm3,%xmm1 - movdqa %xmm0,%xmm4 - psrlq $1,%xmm0 - pxor %xmm4,%xmm1 - pxor %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 - psrlq $1,%xmm0 - pxor %xmm1,%xmm0 -.L006done: -.byte 102,15,56,0,197 - movdqu %xmm0,(%eax) - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.size gcm_ghash_clmul,.-.L_gcm_ghash_clmul_begin -.align 64 -.Lbswap: -.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 -.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,194 -.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,44,32,67 -.byte 82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112 -.byte 112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62 -.byte 0 -#endif -.section .note.GNU-stack,"",@progbits diff --git a/third_party/boringssl/linux-x86/crypto/fipsmodule/md5-586.S b/third_party/boringssl/linux-x86/crypto/fipsmodule/md5-586.S deleted file mode 100644 index 58872344..00000000 --- a/third_party/boringssl/linux-x86/crypto/fipsmodule/md5-586.S +++ /dev/null @@ -1,688 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#if defined(__i386__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text -.globl md5_block_asm_data_order -.hidden md5_block_asm_data_order -.type md5_block_asm_data_order,@function -.align 16 -md5_block_asm_data_order: -.L_md5_block_asm_data_order_begin: - pushl %esi - pushl %edi - movl 12(%esp),%edi - movl 16(%esp),%esi - movl 20(%esp),%ecx - pushl %ebp - shll $6,%ecx - pushl %ebx - addl %esi,%ecx - subl $64,%ecx - movl (%edi),%eax - pushl %ecx - movl 4(%edi),%ebx - movl 8(%edi),%ecx - movl 12(%edi),%edx -.L000start: - - - movl %ecx,%edi - movl (%esi),%ebp - - xorl %edx,%edi - andl %ebx,%edi - leal 3614090360(%eax,%ebp,1),%eax - xorl %edx,%edi - addl %edi,%eax - movl %ebx,%edi - roll $7,%eax - movl 4(%esi),%ebp - addl %ebx,%eax - - xorl %ecx,%edi - andl %eax,%edi - leal 3905402710(%edx,%ebp,1),%edx - xorl %ecx,%edi - addl %edi,%edx - movl %eax,%edi - roll $12,%edx - movl 8(%esi),%ebp - addl %eax,%edx - - xorl %ebx,%edi - andl %edx,%edi - leal 606105819(%ecx,%ebp,1),%ecx - xorl %ebx,%edi - addl %edi,%ecx - movl %edx,%edi - roll $17,%ecx - movl 12(%esi),%ebp - addl %edx,%ecx - - xorl %eax,%edi - andl %ecx,%edi - leal 3250441966(%ebx,%ebp,1),%ebx - xorl %eax,%edi - addl %edi,%ebx - movl %ecx,%edi - roll $22,%ebx - movl 16(%esi),%ebp - addl %ecx,%ebx - - xorl %edx,%edi - andl %ebx,%edi - leal 4118548399(%eax,%ebp,1),%eax - xorl %edx,%edi - addl %edi,%eax - movl %ebx,%edi - roll $7,%eax - movl 20(%esi),%ebp - addl %ebx,%eax - - xorl %ecx,%edi - andl %eax,%edi - leal 1200080426(%edx,%ebp,1),%edx - xorl %ecx,%edi - addl %edi,%edx - movl %eax,%edi - roll $12,%edx - movl 24(%esi),%ebp - addl %eax,%edx - - xorl %ebx,%edi - andl %edx,%edi - leal 2821735955(%ecx,%ebp,1),%ecx - xorl %ebx,%edi - addl %edi,%ecx - movl %edx,%edi - roll $17,%ecx - movl 28(%esi),%ebp - addl %edx,%ecx - - xorl %eax,%edi - andl %ecx,%edi - leal 4249261313(%ebx,%ebp,1),%ebx - xorl %eax,%edi - addl %edi,%ebx - movl %ecx,%edi - roll $22,%ebx - movl 32(%esi),%ebp - addl %ecx,%ebx - - xorl 
%edx,%edi - andl %ebx,%edi - leal 1770035416(%eax,%ebp,1),%eax - xorl %edx,%edi - addl %edi,%eax - movl %ebx,%edi - roll $7,%eax - movl 36(%esi),%ebp - addl %ebx,%eax - - xorl %ecx,%edi - andl %eax,%edi - leal 2336552879(%edx,%ebp,1),%edx - xorl %ecx,%edi - addl %edi,%edx - movl %eax,%edi - roll $12,%edx - movl 40(%esi),%ebp - addl %eax,%edx - - xorl %ebx,%edi - andl %edx,%edi - leal 4294925233(%ecx,%ebp,1),%ecx - xorl %ebx,%edi - addl %edi,%ecx - movl %edx,%edi - roll $17,%ecx - movl 44(%esi),%ebp - addl %edx,%ecx - - xorl %eax,%edi - andl %ecx,%edi - leal 2304563134(%ebx,%ebp,1),%ebx - xorl %eax,%edi - addl %edi,%ebx - movl %ecx,%edi - roll $22,%ebx - movl 48(%esi),%ebp - addl %ecx,%ebx - - xorl %edx,%edi - andl %ebx,%edi - leal 1804603682(%eax,%ebp,1),%eax - xorl %edx,%edi - addl %edi,%eax - movl %ebx,%edi - roll $7,%eax - movl 52(%esi),%ebp - addl %ebx,%eax - - xorl %ecx,%edi - andl %eax,%edi - leal 4254626195(%edx,%ebp,1),%edx - xorl %ecx,%edi - addl %edi,%edx - movl %eax,%edi - roll $12,%edx - movl 56(%esi),%ebp - addl %eax,%edx - - xorl %ebx,%edi - andl %edx,%edi - leal 2792965006(%ecx,%ebp,1),%ecx - xorl %ebx,%edi - addl %edi,%ecx - movl %edx,%edi - roll $17,%ecx - movl 60(%esi),%ebp - addl %edx,%ecx - - xorl %eax,%edi - andl %ecx,%edi - leal 1236535329(%ebx,%ebp,1),%ebx - xorl %eax,%edi - addl %edi,%ebx - movl %ecx,%edi - roll $22,%ebx - movl 4(%esi),%ebp - addl %ecx,%ebx - - - - leal 4129170786(%eax,%ebp,1),%eax - xorl %ebx,%edi - andl %edx,%edi - movl 24(%esi),%ebp - xorl %ecx,%edi - addl %edi,%eax - movl %ebx,%edi - roll $5,%eax - addl %ebx,%eax - - leal 3225465664(%edx,%ebp,1),%edx - xorl %eax,%edi - andl %ecx,%edi - movl 44(%esi),%ebp - xorl %ebx,%edi - addl %edi,%edx - movl %eax,%edi - roll $9,%edx - addl %eax,%edx - - leal 643717713(%ecx,%ebp,1),%ecx - xorl %edx,%edi - andl %ebx,%edi - movl (%esi),%ebp - xorl %eax,%edi - addl %edi,%ecx - movl %edx,%edi - roll $14,%ecx - addl %edx,%ecx - - leal 3921069994(%ebx,%ebp,1),%ebx - xorl %ecx,%edi - andl 
%eax,%edi - movl 20(%esi),%ebp - xorl %edx,%edi - addl %edi,%ebx - movl %ecx,%edi - roll $20,%ebx - addl %ecx,%ebx - - leal 3593408605(%eax,%ebp,1),%eax - xorl %ebx,%edi - andl %edx,%edi - movl 40(%esi),%ebp - xorl %ecx,%edi - addl %edi,%eax - movl %ebx,%edi - roll $5,%eax - addl %ebx,%eax - - leal 38016083(%edx,%ebp,1),%edx - xorl %eax,%edi - andl %ecx,%edi - movl 60(%esi),%ebp - xorl %ebx,%edi - addl %edi,%edx - movl %eax,%edi - roll $9,%edx - addl %eax,%edx - - leal 3634488961(%ecx,%ebp,1),%ecx - xorl %edx,%edi - andl %ebx,%edi - movl 16(%esi),%ebp - xorl %eax,%edi - addl %edi,%ecx - movl %edx,%edi - roll $14,%ecx - addl %edx,%ecx - - leal 3889429448(%ebx,%ebp,1),%ebx - xorl %ecx,%edi - andl %eax,%edi - movl 36(%esi),%ebp - xorl %edx,%edi - addl %edi,%ebx - movl %ecx,%edi - roll $20,%ebx - addl %ecx,%ebx - - leal 568446438(%eax,%ebp,1),%eax - xorl %ebx,%edi - andl %edx,%edi - movl 56(%esi),%ebp - xorl %ecx,%edi - addl %edi,%eax - movl %ebx,%edi - roll $5,%eax - addl %ebx,%eax - - leal 3275163606(%edx,%ebp,1),%edx - xorl %eax,%edi - andl %ecx,%edi - movl 12(%esi),%ebp - xorl %ebx,%edi - addl %edi,%edx - movl %eax,%edi - roll $9,%edx - addl %eax,%edx - - leal 4107603335(%ecx,%ebp,1),%ecx - xorl %edx,%edi - andl %ebx,%edi - movl 32(%esi),%ebp - xorl %eax,%edi - addl %edi,%ecx - movl %edx,%edi - roll $14,%ecx - addl %edx,%ecx - - leal 1163531501(%ebx,%ebp,1),%ebx - xorl %ecx,%edi - andl %eax,%edi - movl 52(%esi),%ebp - xorl %edx,%edi - addl %edi,%ebx - movl %ecx,%edi - roll $20,%ebx - addl %ecx,%ebx - - leal 2850285829(%eax,%ebp,1),%eax - xorl %ebx,%edi - andl %edx,%edi - movl 8(%esi),%ebp - xorl %ecx,%edi - addl %edi,%eax - movl %ebx,%edi - roll $5,%eax - addl %ebx,%eax - - leal 4243563512(%edx,%ebp,1),%edx - xorl %eax,%edi - andl %ecx,%edi - movl 28(%esi),%ebp - xorl %ebx,%edi - addl %edi,%edx - movl %eax,%edi - roll $9,%edx - addl %eax,%edx - - leal 1735328473(%ecx,%ebp,1),%ecx - xorl %edx,%edi - andl %ebx,%edi - movl 48(%esi),%ebp - xorl %eax,%edi - addl 
%edi,%ecx - movl %edx,%edi - roll $14,%ecx - addl %edx,%ecx - - leal 2368359562(%ebx,%ebp,1),%ebx - xorl %ecx,%edi - andl %eax,%edi - movl 20(%esi),%ebp - xorl %edx,%edi - addl %edi,%ebx - movl %ecx,%edi - roll $20,%ebx - addl %ecx,%ebx - - - - xorl %edx,%edi - xorl %ebx,%edi - leal 4294588738(%eax,%ebp,1),%eax - addl %edi,%eax - roll $4,%eax - movl 32(%esi),%ebp - movl %ebx,%edi - - leal 2272392833(%edx,%ebp,1),%edx - addl %ebx,%eax - xorl %ecx,%edi - xorl %eax,%edi - movl 44(%esi),%ebp - addl %edi,%edx - movl %eax,%edi - roll $11,%edx - addl %eax,%edx - - xorl %ebx,%edi - xorl %edx,%edi - leal 1839030562(%ecx,%ebp,1),%ecx - addl %edi,%ecx - roll $16,%ecx - movl 56(%esi),%ebp - movl %edx,%edi - - leal 4259657740(%ebx,%ebp,1),%ebx - addl %edx,%ecx - xorl %eax,%edi - xorl %ecx,%edi - movl 4(%esi),%ebp - addl %edi,%ebx - movl %ecx,%edi - roll $23,%ebx - addl %ecx,%ebx - - xorl %edx,%edi - xorl %ebx,%edi - leal 2763975236(%eax,%ebp,1),%eax - addl %edi,%eax - roll $4,%eax - movl 16(%esi),%ebp - movl %ebx,%edi - - leal 1272893353(%edx,%ebp,1),%edx - addl %ebx,%eax - xorl %ecx,%edi - xorl %eax,%edi - movl 28(%esi),%ebp - addl %edi,%edx - movl %eax,%edi - roll $11,%edx - addl %eax,%edx - - xorl %ebx,%edi - xorl %edx,%edi - leal 4139469664(%ecx,%ebp,1),%ecx - addl %edi,%ecx - roll $16,%ecx - movl 40(%esi),%ebp - movl %edx,%edi - - leal 3200236656(%ebx,%ebp,1),%ebx - addl %edx,%ecx - xorl %eax,%edi - xorl %ecx,%edi - movl 52(%esi),%ebp - addl %edi,%ebx - movl %ecx,%edi - roll $23,%ebx - addl %ecx,%ebx - - xorl %edx,%edi - xorl %ebx,%edi - leal 681279174(%eax,%ebp,1),%eax - addl %edi,%eax - roll $4,%eax - movl (%esi),%ebp - movl %ebx,%edi - - leal 3936430074(%edx,%ebp,1),%edx - addl %ebx,%eax - xorl %ecx,%edi - xorl %eax,%edi - movl 12(%esi),%ebp - addl %edi,%edx - movl %eax,%edi - roll $11,%edx - addl %eax,%edx - - xorl %ebx,%edi - xorl %edx,%edi - leal 3572445317(%ecx,%ebp,1),%ecx - addl %edi,%ecx - roll $16,%ecx - movl 24(%esi),%ebp - movl %edx,%edi - - leal 
76029189(%ebx,%ebp,1),%ebx - addl %edx,%ecx - xorl %eax,%edi - xorl %ecx,%edi - movl 36(%esi),%ebp - addl %edi,%ebx - movl %ecx,%edi - roll $23,%ebx - addl %ecx,%ebx - - xorl %edx,%edi - xorl %ebx,%edi - leal 3654602809(%eax,%ebp,1),%eax - addl %edi,%eax - roll $4,%eax - movl 48(%esi),%ebp - movl %ebx,%edi - - leal 3873151461(%edx,%ebp,1),%edx - addl %ebx,%eax - xorl %ecx,%edi - xorl %eax,%edi - movl 60(%esi),%ebp - addl %edi,%edx - movl %eax,%edi - roll $11,%edx - addl %eax,%edx - - xorl %ebx,%edi - xorl %edx,%edi - leal 530742520(%ecx,%ebp,1),%ecx - addl %edi,%ecx - roll $16,%ecx - movl 8(%esi),%ebp - movl %edx,%edi - - leal 3299628645(%ebx,%ebp,1),%ebx - addl %edx,%ecx - xorl %eax,%edi - xorl %ecx,%edi - movl (%esi),%ebp - addl %edi,%ebx - movl $-1,%edi - roll $23,%ebx - addl %ecx,%ebx - - - - xorl %edx,%edi - orl %ebx,%edi - leal 4096336452(%eax,%ebp,1),%eax - xorl %ecx,%edi - movl 28(%esi),%ebp - addl %edi,%eax - movl $-1,%edi - roll $6,%eax - xorl %ecx,%edi - addl %ebx,%eax - - orl %eax,%edi - leal 1126891415(%edx,%ebp,1),%edx - xorl %ebx,%edi - movl 56(%esi),%ebp - addl %edi,%edx - movl $-1,%edi - roll $10,%edx - xorl %ebx,%edi - addl %eax,%edx - - orl %edx,%edi - leal 2878612391(%ecx,%ebp,1),%ecx - xorl %eax,%edi - movl 20(%esi),%ebp - addl %edi,%ecx - movl $-1,%edi - roll $15,%ecx - xorl %eax,%edi - addl %edx,%ecx - - orl %ecx,%edi - leal 4237533241(%ebx,%ebp,1),%ebx - xorl %edx,%edi - movl 48(%esi),%ebp - addl %edi,%ebx - movl $-1,%edi - roll $21,%ebx - xorl %edx,%edi - addl %ecx,%ebx - - orl %ebx,%edi - leal 1700485571(%eax,%ebp,1),%eax - xorl %ecx,%edi - movl 12(%esi),%ebp - addl %edi,%eax - movl $-1,%edi - roll $6,%eax - xorl %ecx,%edi - addl %ebx,%eax - - orl %eax,%edi - leal 2399980690(%edx,%ebp,1),%edx - xorl %ebx,%edi - movl 40(%esi),%ebp - addl %edi,%edx - movl $-1,%edi - roll $10,%edx - xorl %ebx,%edi - addl %eax,%edx - - orl %edx,%edi - leal 4293915773(%ecx,%ebp,1),%ecx - xorl %eax,%edi - movl 4(%esi),%ebp - addl %edi,%ecx - movl $-1,%edi - roll 
$15,%ecx - xorl %eax,%edi - addl %edx,%ecx - - orl %ecx,%edi - leal 2240044497(%ebx,%ebp,1),%ebx - xorl %edx,%edi - movl 32(%esi),%ebp - addl %edi,%ebx - movl $-1,%edi - roll $21,%ebx - xorl %edx,%edi - addl %ecx,%ebx - - orl %ebx,%edi - leal 1873313359(%eax,%ebp,1),%eax - xorl %ecx,%edi - movl 60(%esi),%ebp - addl %edi,%eax - movl $-1,%edi - roll $6,%eax - xorl %ecx,%edi - addl %ebx,%eax - - orl %eax,%edi - leal 4264355552(%edx,%ebp,1),%edx - xorl %ebx,%edi - movl 24(%esi),%ebp - addl %edi,%edx - movl $-1,%edi - roll $10,%edx - xorl %ebx,%edi - addl %eax,%edx - - orl %edx,%edi - leal 2734768916(%ecx,%ebp,1),%ecx - xorl %eax,%edi - movl 52(%esi),%ebp - addl %edi,%ecx - movl $-1,%edi - roll $15,%ecx - xorl %eax,%edi - addl %edx,%ecx - - orl %ecx,%edi - leal 1309151649(%ebx,%ebp,1),%ebx - xorl %edx,%edi - movl 16(%esi),%ebp - addl %edi,%ebx - movl $-1,%edi - roll $21,%ebx - xorl %edx,%edi - addl %ecx,%ebx - - orl %ebx,%edi - leal 4149444226(%eax,%ebp,1),%eax - xorl %ecx,%edi - movl 44(%esi),%ebp - addl %edi,%eax - movl $-1,%edi - roll $6,%eax - xorl %ecx,%edi - addl %ebx,%eax - - orl %eax,%edi - leal 3174756917(%edx,%ebp,1),%edx - xorl %ebx,%edi - movl 8(%esi),%ebp - addl %edi,%edx - movl $-1,%edi - roll $10,%edx - xorl %ebx,%edi - addl %eax,%edx - - orl %edx,%edi - leal 718787259(%ecx,%ebp,1),%ecx - xorl %eax,%edi - movl 36(%esi),%ebp - addl %edi,%ecx - movl $-1,%edi - roll $15,%ecx - xorl %eax,%edi - addl %edx,%ecx - - orl %ecx,%edi - leal 3951481745(%ebx,%ebp,1),%ebx - xorl %edx,%edi - movl 24(%esp),%ebp - addl %edi,%ebx - addl $64,%esi - roll $21,%ebx - movl (%ebp),%edi - addl %ecx,%ebx - addl %edi,%eax - movl 4(%ebp),%edi - addl %edi,%ebx - movl 8(%ebp),%edi - addl %edi,%ecx - movl 12(%ebp),%edi - addl %edi,%edx - movl %eax,(%ebp) - movl %ebx,4(%ebp) - movl (%esp),%edi - movl %ecx,8(%ebp) - movl %edx,12(%ebp) - cmpl %esi,%edi - jae .L000start - popl %eax - popl %ebx - popl %ebp - popl %edi - popl %esi - ret -.size 
md5_block_asm_data_order,.-.L_md5_block_asm_data_order_begin -#endif -.section .note.GNU-stack,"",@progbits diff --git a/third_party/boringssl/linux-x86/crypto/fipsmodule/sha1-586.S b/third_party/boringssl/linux-x86/crypto/fipsmodule/sha1-586.S deleted file mode 100644 index e224da4d..00000000 --- a/third_party/boringssl/linux-x86/crypto/fipsmodule/sha1-586.S +++ /dev/null @@ -1,3808 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if defined(__i386__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text -.globl sha1_block_data_order -.hidden sha1_block_data_order -.type sha1_block_data_order,@function -.align 16 -sha1_block_data_order: -.L_sha1_block_data_order_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - call .L000pic_point -.L000pic_point: - popl %ebp - leal OPENSSL_ia32cap_P-.L000pic_point(%ebp),%esi - leal .LK_XX_XX-.L000pic_point(%ebp),%ebp - movl (%esi),%eax - movl 4(%esi),%edx - testl $512,%edx - jz .L001x86 - movl 8(%esi),%ecx - testl $16777216,%eax - jz .L001x86 - andl $268435456,%edx - andl $1073741824,%eax - orl %edx,%eax - cmpl $1342177280,%eax - je .Lavx_shortcut - jmp .Lssse3_shortcut -.align 16 -.L001x86: - movl 20(%esp),%ebp - movl 24(%esp),%esi - movl 28(%esp),%eax - subl $76,%esp - shll $6,%eax - addl %esi,%eax - movl %eax,104(%esp) - movl 16(%ebp),%edi - jmp .L002loop -.align 16 -.L002loop: - movl (%esi),%eax - movl 4(%esi),%ebx - movl 8(%esi),%ecx - movl 12(%esi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx - movl %eax,(%esp) - movl %ebx,4(%esp) - movl %ecx,8(%esp) - movl %edx,12(%esp) - movl 16(%esi),%eax - movl 20(%esi),%ebx - movl 24(%esi),%ecx - movl 28(%esi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx - movl %eax,16(%esp) - movl %ebx,20(%esp) - movl %ecx,24(%esp) - movl %edx,28(%esp) - movl 32(%esi),%eax - movl 36(%esi),%ebx - movl 40(%esi),%ecx - movl 44(%esi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx 
- movl %eax,32(%esp) - movl %ebx,36(%esp) - movl %ecx,40(%esp) - movl %edx,44(%esp) - movl 48(%esi),%eax - movl 52(%esi),%ebx - movl 56(%esi),%ecx - movl 60(%esi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx - movl %eax,48(%esp) - movl %ebx,52(%esp) - movl %ecx,56(%esp) - movl %edx,60(%esp) - movl %esi,100(%esp) - movl (%ebp),%eax - movl 4(%ebp),%ebx - movl 8(%ebp),%ecx - movl 12(%ebp),%edx - - movl %ecx,%esi - movl %eax,%ebp - roll $5,%ebp - xorl %edx,%esi - addl %edi,%ebp - movl (%esp),%edi - andl %ebx,%esi - rorl $2,%ebx - xorl %edx,%esi - leal 1518500249(%ebp,%edi,1),%ebp - addl %esi,%ebp - - movl %ebx,%edi - movl %ebp,%esi - roll $5,%ebp - xorl %ecx,%edi - addl %edx,%ebp - movl 4(%esp),%edx - andl %eax,%edi - rorl $2,%eax - xorl %ecx,%edi - leal 1518500249(%ebp,%edx,1),%ebp - addl %edi,%ebp - - movl %eax,%edx - movl %ebp,%edi - roll $5,%ebp - xorl %ebx,%edx - addl %ecx,%ebp - movl 8(%esp),%ecx - andl %esi,%edx - rorl $2,%esi - xorl %ebx,%edx - leal 1518500249(%ebp,%ecx,1),%ebp - addl %edx,%ebp - - movl %esi,%ecx - movl %ebp,%edx - roll $5,%ebp - xorl %eax,%ecx - addl %ebx,%ebp - movl 12(%esp),%ebx - andl %edi,%ecx - rorl $2,%edi - xorl %eax,%ecx - leal 1518500249(%ebp,%ebx,1),%ebp - addl %ecx,%ebp - - movl %edi,%ebx - movl %ebp,%ecx - roll $5,%ebp - xorl %esi,%ebx - addl %eax,%ebp - movl 16(%esp),%eax - andl %edx,%ebx - rorl $2,%edx - xorl %esi,%ebx - leal 1518500249(%ebp,%eax,1),%ebp - addl %ebx,%ebp - - movl %edx,%eax - movl %ebp,%ebx - roll $5,%ebp - xorl %edi,%eax - addl %esi,%ebp - movl 20(%esp),%esi - andl %ecx,%eax - rorl $2,%ecx - xorl %edi,%eax - leal 1518500249(%ebp,%esi,1),%ebp - addl %eax,%ebp - - movl %ecx,%esi - movl %ebp,%eax - roll $5,%ebp - xorl %edx,%esi - addl %edi,%ebp - movl 24(%esp),%edi - andl %ebx,%esi - rorl $2,%ebx - xorl %edx,%esi - leal 1518500249(%ebp,%edi,1),%ebp - addl %esi,%ebp - - movl %ebx,%edi - movl %ebp,%esi - roll $5,%ebp - xorl %ecx,%edi - addl %edx,%ebp - movl 28(%esp),%edx - andl %eax,%edi - rorl $2,%eax - 
xorl %ecx,%edi - leal 1518500249(%ebp,%edx,1),%ebp - addl %edi,%ebp - - movl %eax,%edx - movl %ebp,%edi - roll $5,%ebp - xorl %ebx,%edx - addl %ecx,%ebp - movl 32(%esp),%ecx - andl %esi,%edx - rorl $2,%esi - xorl %ebx,%edx - leal 1518500249(%ebp,%ecx,1),%ebp - addl %edx,%ebp - - movl %esi,%ecx - movl %ebp,%edx - roll $5,%ebp - xorl %eax,%ecx - addl %ebx,%ebp - movl 36(%esp),%ebx - andl %edi,%ecx - rorl $2,%edi - xorl %eax,%ecx - leal 1518500249(%ebp,%ebx,1),%ebp - addl %ecx,%ebp - - movl %edi,%ebx - movl %ebp,%ecx - roll $5,%ebp - xorl %esi,%ebx - addl %eax,%ebp - movl 40(%esp),%eax - andl %edx,%ebx - rorl $2,%edx - xorl %esi,%ebx - leal 1518500249(%ebp,%eax,1),%ebp - addl %ebx,%ebp - - movl %edx,%eax - movl %ebp,%ebx - roll $5,%ebp - xorl %edi,%eax - addl %esi,%ebp - movl 44(%esp),%esi - andl %ecx,%eax - rorl $2,%ecx - xorl %edi,%eax - leal 1518500249(%ebp,%esi,1),%ebp - addl %eax,%ebp - - movl %ecx,%esi - movl %ebp,%eax - roll $5,%ebp - xorl %edx,%esi - addl %edi,%ebp - movl 48(%esp),%edi - andl %ebx,%esi - rorl $2,%ebx - xorl %edx,%esi - leal 1518500249(%ebp,%edi,1),%ebp - addl %esi,%ebp - - movl %ebx,%edi - movl %ebp,%esi - roll $5,%ebp - xorl %ecx,%edi - addl %edx,%ebp - movl 52(%esp),%edx - andl %eax,%edi - rorl $2,%eax - xorl %ecx,%edi - leal 1518500249(%ebp,%edx,1),%ebp - addl %edi,%ebp - - movl %eax,%edx - movl %ebp,%edi - roll $5,%ebp - xorl %ebx,%edx - addl %ecx,%ebp - movl 56(%esp),%ecx - andl %esi,%edx - rorl $2,%esi - xorl %ebx,%edx - leal 1518500249(%ebp,%ecx,1),%ebp - addl %edx,%ebp - - movl %esi,%ecx - movl %ebp,%edx - roll $5,%ebp - xorl %eax,%ecx - addl %ebx,%ebp - movl 60(%esp),%ebx - andl %edi,%ecx - rorl $2,%edi - xorl %eax,%ecx - leal 1518500249(%ebp,%ebx,1),%ebp - movl (%esp),%ebx - addl %ebp,%ecx - - movl %edi,%ebp - xorl 8(%esp),%ebx - xorl %esi,%ebp - xorl 32(%esp),%ebx - andl %edx,%ebp - xorl 52(%esp),%ebx - roll $1,%ebx - xorl %esi,%ebp - addl %ebp,%eax - movl %ecx,%ebp - rorl $2,%edx - movl %ebx,(%esp) - roll $5,%ebp - leal 
1518500249(%ebx,%eax,1),%ebx - movl 4(%esp),%eax - addl %ebp,%ebx - - movl %edx,%ebp - xorl 12(%esp),%eax - xorl %edi,%ebp - xorl 36(%esp),%eax - andl %ecx,%ebp - xorl 56(%esp),%eax - roll $1,%eax - xorl %edi,%ebp - addl %ebp,%esi - movl %ebx,%ebp - rorl $2,%ecx - movl %eax,4(%esp) - roll $5,%ebp - leal 1518500249(%eax,%esi,1),%eax - movl 8(%esp),%esi - addl %ebp,%eax - - movl %ecx,%ebp - xorl 16(%esp),%esi - xorl %edx,%ebp - xorl 40(%esp),%esi - andl %ebx,%ebp - xorl 60(%esp),%esi - roll $1,%esi - xorl %edx,%ebp - addl %ebp,%edi - movl %eax,%ebp - rorl $2,%ebx - movl %esi,8(%esp) - roll $5,%ebp - leal 1518500249(%esi,%edi,1),%esi - movl 12(%esp),%edi - addl %ebp,%esi - - movl %ebx,%ebp - xorl 20(%esp),%edi - xorl %ecx,%ebp - xorl 44(%esp),%edi - andl %eax,%ebp - xorl (%esp),%edi - roll $1,%edi - xorl %ecx,%ebp - addl %ebp,%edx - movl %esi,%ebp - rorl $2,%eax - movl %edi,12(%esp) - roll $5,%ebp - leal 1518500249(%edi,%edx,1),%edi - movl 16(%esp),%edx - addl %ebp,%edi - - movl %esi,%ebp - xorl 24(%esp),%edx - xorl %eax,%ebp - xorl 48(%esp),%edx - xorl %ebx,%ebp - xorl 4(%esp),%edx - roll $1,%edx - addl %ebp,%ecx - rorl $2,%esi - movl %edi,%ebp - roll $5,%ebp - movl %edx,16(%esp) - leal 1859775393(%edx,%ecx,1),%edx - movl 20(%esp),%ecx - addl %ebp,%edx - - movl %edi,%ebp - xorl 28(%esp),%ecx - xorl %esi,%ebp - xorl 52(%esp),%ecx - xorl %eax,%ebp - xorl 8(%esp),%ecx - roll $1,%ecx - addl %ebp,%ebx - rorl $2,%edi - movl %edx,%ebp - roll $5,%ebp - movl %ecx,20(%esp) - leal 1859775393(%ecx,%ebx,1),%ecx - movl 24(%esp),%ebx - addl %ebp,%ecx - - movl %edx,%ebp - xorl 32(%esp),%ebx - xorl %edi,%ebp - xorl 56(%esp),%ebx - xorl %esi,%ebp - xorl 12(%esp),%ebx - roll $1,%ebx - addl %ebp,%eax - rorl $2,%edx - movl %ecx,%ebp - roll $5,%ebp - movl %ebx,24(%esp) - leal 1859775393(%ebx,%eax,1),%ebx - movl 28(%esp),%eax - addl %ebp,%ebx - - movl %ecx,%ebp - xorl 36(%esp),%eax - xorl %edx,%ebp - xorl 60(%esp),%eax - xorl %edi,%ebp - xorl 16(%esp),%eax - roll $1,%eax - addl %ebp,%esi - 
rorl $2,%ecx - movl %ebx,%ebp - roll $5,%ebp - movl %eax,28(%esp) - leal 1859775393(%eax,%esi,1),%eax - movl 32(%esp),%esi - addl %ebp,%eax - - movl %ebx,%ebp - xorl 40(%esp),%esi - xorl %ecx,%ebp - xorl (%esp),%esi - xorl %edx,%ebp - xorl 20(%esp),%esi - roll $1,%esi - addl %ebp,%edi - rorl $2,%ebx - movl %eax,%ebp - roll $5,%ebp - movl %esi,32(%esp) - leal 1859775393(%esi,%edi,1),%esi - movl 36(%esp),%edi - addl %ebp,%esi - - movl %eax,%ebp - xorl 44(%esp),%edi - xorl %ebx,%ebp - xorl 4(%esp),%edi - xorl %ecx,%ebp - xorl 24(%esp),%edi - roll $1,%edi - addl %ebp,%edx - rorl $2,%eax - movl %esi,%ebp - roll $5,%ebp - movl %edi,36(%esp) - leal 1859775393(%edi,%edx,1),%edi - movl 40(%esp),%edx - addl %ebp,%edi - - movl %esi,%ebp - xorl 48(%esp),%edx - xorl %eax,%ebp - xorl 8(%esp),%edx - xorl %ebx,%ebp - xorl 28(%esp),%edx - roll $1,%edx - addl %ebp,%ecx - rorl $2,%esi - movl %edi,%ebp - roll $5,%ebp - movl %edx,40(%esp) - leal 1859775393(%edx,%ecx,1),%edx - movl 44(%esp),%ecx - addl %ebp,%edx - - movl %edi,%ebp - xorl 52(%esp),%ecx - xorl %esi,%ebp - xorl 12(%esp),%ecx - xorl %eax,%ebp - xorl 32(%esp),%ecx - roll $1,%ecx - addl %ebp,%ebx - rorl $2,%edi - movl %edx,%ebp - roll $5,%ebp - movl %ecx,44(%esp) - leal 1859775393(%ecx,%ebx,1),%ecx - movl 48(%esp),%ebx - addl %ebp,%ecx - - movl %edx,%ebp - xorl 56(%esp),%ebx - xorl %edi,%ebp - xorl 16(%esp),%ebx - xorl %esi,%ebp - xorl 36(%esp),%ebx - roll $1,%ebx - addl %ebp,%eax - rorl $2,%edx - movl %ecx,%ebp - roll $5,%ebp - movl %ebx,48(%esp) - leal 1859775393(%ebx,%eax,1),%ebx - movl 52(%esp),%eax - addl %ebp,%ebx - - movl %ecx,%ebp - xorl 60(%esp),%eax - xorl %edx,%ebp - xorl 20(%esp),%eax - xorl %edi,%ebp - xorl 40(%esp),%eax - roll $1,%eax - addl %ebp,%esi - rorl $2,%ecx - movl %ebx,%ebp - roll $5,%ebp - movl %eax,52(%esp) - leal 1859775393(%eax,%esi,1),%eax - movl 56(%esp),%esi - addl %ebp,%eax - - movl %ebx,%ebp - xorl (%esp),%esi - xorl %ecx,%ebp - xorl 24(%esp),%esi - xorl %edx,%ebp - xorl 44(%esp),%esi - roll 
$1,%esi - addl %ebp,%edi - rorl $2,%ebx - movl %eax,%ebp - roll $5,%ebp - movl %esi,56(%esp) - leal 1859775393(%esi,%edi,1),%esi - movl 60(%esp),%edi - addl %ebp,%esi - - movl %eax,%ebp - xorl 4(%esp),%edi - xorl %ebx,%ebp - xorl 28(%esp),%edi - xorl %ecx,%ebp - xorl 48(%esp),%edi - roll $1,%edi - addl %ebp,%edx - rorl $2,%eax - movl %esi,%ebp - roll $5,%ebp - movl %edi,60(%esp) - leal 1859775393(%edi,%edx,1),%edi - movl (%esp),%edx - addl %ebp,%edi - - movl %esi,%ebp - xorl 8(%esp),%edx - xorl %eax,%ebp - xorl 32(%esp),%edx - xorl %ebx,%ebp - xorl 52(%esp),%edx - roll $1,%edx - addl %ebp,%ecx - rorl $2,%esi - movl %edi,%ebp - roll $5,%ebp - movl %edx,(%esp) - leal 1859775393(%edx,%ecx,1),%edx - movl 4(%esp),%ecx - addl %ebp,%edx - - movl %edi,%ebp - xorl 12(%esp),%ecx - xorl %esi,%ebp - xorl 36(%esp),%ecx - xorl %eax,%ebp - xorl 56(%esp),%ecx - roll $1,%ecx - addl %ebp,%ebx - rorl $2,%edi - movl %edx,%ebp - roll $5,%ebp - movl %ecx,4(%esp) - leal 1859775393(%ecx,%ebx,1),%ecx - movl 8(%esp),%ebx - addl %ebp,%ecx - - movl %edx,%ebp - xorl 16(%esp),%ebx - xorl %edi,%ebp - xorl 40(%esp),%ebx - xorl %esi,%ebp - xorl 60(%esp),%ebx - roll $1,%ebx - addl %ebp,%eax - rorl $2,%edx - movl %ecx,%ebp - roll $5,%ebp - movl %ebx,8(%esp) - leal 1859775393(%ebx,%eax,1),%ebx - movl 12(%esp),%eax - addl %ebp,%ebx - - movl %ecx,%ebp - xorl 20(%esp),%eax - xorl %edx,%ebp - xorl 44(%esp),%eax - xorl %edi,%ebp - xorl (%esp),%eax - roll $1,%eax - addl %ebp,%esi - rorl $2,%ecx - movl %ebx,%ebp - roll $5,%ebp - movl %eax,12(%esp) - leal 1859775393(%eax,%esi,1),%eax - movl 16(%esp),%esi - addl %ebp,%eax - - movl %ebx,%ebp - xorl 24(%esp),%esi - xorl %ecx,%ebp - xorl 48(%esp),%esi - xorl %edx,%ebp - xorl 4(%esp),%esi - roll $1,%esi - addl %ebp,%edi - rorl $2,%ebx - movl %eax,%ebp - roll $5,%ebp - movl %esi,16(%esp) - leal 1859775393(%esi,%edi,1),%esi - movl 20(%esp),%edi - addl %ebp,%esi - - movl %eax,%ebp - xorl 28(%esp),%edi - xorl %ebx,%ebp - xorl 52(%esp),%edi - xorl %ecx,%ebp - xorl 
8(%esp),%edi - roll $1,%edi - addl %ebp,%edx - rorl $2,%eax - movl %esi,%ebp - roll $5,%ebp - movl %edi,20(%esp) - leal 1859775393(%edi,%edx,1),%edi - movl 24(%esp),%edx - addl %ebp,%edi - - movl %esi,%ebp - xorl 32(%esp),%edx - xorl %eax,%ebp - xorl 56(%esp),%edx - xorl %ebx,%ebp - xorl 12(%esp),%edx - roll $1,%edx - addl %ebp,%ecx - rorl $2,%esi - movl %edi,%ebp - roll $5,%ebp - movl %edx,24(%esp) - leal 1859775393(%edx,%ecx,1),%edx - movl 28(%esp),%ecx - addl %ebp,%edx - - movl %edi,%ebp - xorl 36(%esp),%ecx - xorl %esi,%ebp - xorl 60(%esp),%ecx - xorl %eax,%ebp - xorl 16(%esp),%ecx - roll $1,%ecx - addl %ebp,%ebx - rorl $2,%edi - movl %edx,%ebp - roll $5,%ebp - movl %ecx,28(%esp) - leal 1859775393(%ecx,%ebx,1),%ecx - movl 32(%esp),%ebx - addl %ebp,%ecx - - movl %edi,%ebp - xorl 40(%esp),%ebx - xorl %esi,%ebp - xorl (%esp),%ebx - andl %edx,%ebp - xorl 20(%esp),%ebx - roll $1,%ebx - addl %eax,%ebp - rorl $2,%edx - movl %ecx,%eax - roll $5,%eax - movl %ebx,32(%esp) - leal 2400959708(%ebx,%ebp,1),%ebx - movl %edi,%ebp - addl %eax,%ebx - andl %esi,%ebp - movl 36(%esp),%eax - addl %ebp,%ebx - - movl %edx,%ebp - xorl 44(%esp),%eax - xorl %edi,%ebp - xorl 4(%esp),%eax - andl %ecx,%ebp - xorl 24(%esp),%eax - roll $1,%eax - addl %esi,%ebp - rorl $2,%ecx - movl %ebx,%esi - roll $5,%esi - movl %eax,36(%esp) - leal 2400959708(%eax,%ebp,1),%eax - movl %edx,%ebp - addl %esi,%eax - andl %edi,%ebp - movl 40(%esp),%esi - addl %ebp,%eax - - movl %ecx,%ebp - xorl 48(%esp),%esi - xorl %edx,%ebp - xorl 8(%esp),%esi - andl %ebx,%ebp - xorl 28(%esp),%esi - roll $1,%esi - addl %edi,%ebp - rorl $2,%ebx - movl %eax,%edi - roll $5,%edi - movl %esi,40(%esp) - leal 2400959708(%esi,%ebp,1),%esi - movl %ecx,%ebp - addl %edi,%esi - andl %edx,%ebp - movl 44(%esp),%edi - addl %ebp,%esi - - movl %ebx,%ebp - xorl 52(%esp),%edi - xorl %ecx,%ebp - xorl 12(%esp),%edi - andl %eax,%ebp - xorl 32(%esp),%edi - roll $1,%edi - addl %edx,%ebp - rorl $2,%eax - movl %esi,%edx - roll $5,%edx - movl 
%edi,44(%esp) - leal 2400959708(%edi,%ebp,1),%edi - movl %ebx,%ebp - addl %edx,%edi - andl %ecx,%ebp - movl 48(%esp),%edx - addl %ebp,%edi - - movl %eax,%ebp - xorl 56(%esp),%edx - xorl %ebx,%ebp - xorl 16(%esp),%edx - andl %esi,%ebp - xorl 36(%esp),%edx - roll $1,%edx - addl %ecx,%ebp - rorl $2,%esi - movl %edi,%ecx - roll $5,%ecx - movl %edx,48(%esp) - leal 2400959708(%edx,%ebp,1),%edx - movl %eax,%ebp - addl %ecx,%edx - andl %ebx,%ebp - movl 52(%esp),%ecx - addl %ebp,%edx - - movl %esi,%ebp - xorl 60(%esp),%ecx - xorl %eax,%ebp - xorl 20(%esp),%ecx - andl %edi,%ebp - xorl 40(%esp),%ecx - roll $1,%ecx - addl %ebx,%ebp - rorl $2,%edi - movl %edx,%ebx - roll $5,%ebx - movl %ecx,52(%esp) - leal 2400959708(%ecx,%ebp,1),%ecx - movl %esi,%ebp - addl %ebx,%ecx - andl %eax,%ebp - movl 56(%esp),%ebx - addl %ebp,%ecx - - movl %edi,%ebp - xorl (%esp),%ebx - xorl %esi,%ebp - xorl 24(%esp),%ebx - andl %edx,%ebp - xorl 44(%esp),%ebx - roll $1,%ebx - addl %eax,%ebp - rorl $2,%edx - movl %ecx,%eax - roll $5,%eax - movl %ebx,56(%esp) - leal 2400959708(%ebx,%ebp,1),%ebx - movl %edi,%ebp - addl %eax,%ebx - andl %esi,%ebp - movl 60(%esp),%eax - addl %ebp,%ebx - - movl %edx,%ebp - xorl 4(%esp),%eax - xorl %edi,%ebp - xorl 28(%esp),%eax - andl %ecx,%ebp - xorl 48(%esp),%eax - roll $1,%eax - addl %esi,%ebp - rorl $2,%ecx - movl %ebx,%esi - roll $5,%esi - movl %eax,60(%esp) - leal 2400959708(%eax,%ebp,1),%eax - movl %edx,%ebp - addl %esi,%eax - andl %edi,%ebp - movl (%esp),%esi - addl %ebp,%eax - - movl %ecx,%ebp - xorl 8(%esp),%esi - xorl %edx,%ebp - xorl 32(%esp),%esi - andl %ebx,%ebp - xorl 52(%esp),%esi - roll $1,%esi - addl %edi,%ebp - rorl $2,%ebx - movl %eax,%edi - roll $5,%edi - movl %esi,(%esp) - leal 2400959708(%esi,%ebp,1),%esi - movl %ecx,%ebp - addl %edi,%esi - andl %edx,%ebp - movl 4(%esp),%edi - addl %ebp,%esi - - movl %ebx,%ebp - xorl 12(%esp),%edi - xorl %ecx,%ebp - xorl 36(%esp),%edi - andl %eax,%ebp - xorl 56(%esp),%edi - roll $1,%edi - addl %edx,%ebp - rorl $2,%eax - 
movl %esi,%edx - roll $5,%edx - movl %edi,4(%esp) - leal 2400959708(%edi,%ebp,1),%edi - movl %ebx,%ebp - addl %edx,%edi - andl %ecx,%ebp - movl 8(%esp),%edx - addl %ebp,%edi - - movl %eax,%ebp - xorl 16(%esp),%edx - xorl %ebx,%ebp - xorl 40(%esp),%edx - andl %esi,%ebp - xorl 60(%esp),%edx - roll $1,%edx - addl %ecx,%ebp - rorl $2,%esi - movl %edi,%ecx - roll $5,%ecx - movl %edx,8(%esp) - leal 2400959708(%edx,%ebp,1),%edx - movl %eax,%ebp - addl %ecx,%edx - andl %ebx,%ebp - movl 12(%esp),%ecx - addl %ebp,%edx - - movl %esi,%ebp - xorl 20(%esp),%ecx - xorl %eax,%ebp - xorl 44(%esp),%ecx - andl %edi,%ebp - xorl (%esp),%ecx - roll $1,%ecx - addl %ebx,%ebp - rorl $2,%edi - movl %edx,%ebx - roll $5,%ebx - movl %ecx,12(%esp) - leal 2400959708(%ecx,%ebp,1),%ecx - movl %esi,%ebp - addl %ebx,%ecx - andl %eax,%ebp - movl 16(%esp),%ebx - addl %ebp,%ecx - - movl %edi,%ebp - xorl 24(%esp),%ebx - xorl %esi,%ebp - xorl 48(%esp),%ebx - andl %edx,%ebp - xorl 4(%esp),%ebx - roll $1,%ebx - addl %eax,%ebp - rorl $2,%edx - movl %ecx,%eax - roll $5,%eax - movl %ebx,16(%esp) - leal 2400959708(%ebx,%ebp,1),%ebx - movl %edi,%ebp - addl %eax,%ebx - andl %esi,%ebp - movl 20(%esp),%eax - addl %ebp,%ebx - - movl %edx,%ebp - xorl 28(%esp),%eax - xorl %edi,%ebp - xorl 52(%esp),%eax - andl %ecx,%ebp - xorl 8(%esp),%eax - roll $1,%eax - addl %esi,%ebp - rorl $2,%ecx - movl %ebx,%esi - roll $5,%esi - movl %eax,20(%esp) - leal 2400959708(%eax,%ebp,1),%eax - movl %edx,%ebp - addl %esi,%eax - andl %edi,%ebp - movl 24(%esp),%esi - addl %ebp,%eax - - movl %ecx,%ebp - xorl 32(%esp),%esi - xorl %edx,%ebp - xorl 56(%esp),%esi - andl %ebx,%ebp - xorl 12(%esp),%esi - roll $1,%esi - addl %edi,%ebp - rorl $2,%ebx - movl %eax,%edi - roll $5,%edi - movl %esi,24(%esp) - leal 2400959708(%esi,%ebp,1),%esi - movl %ecx,%ebp - addl %edi,%esi - andl %edx,%ebp - movl 28(%esp),%edi - addl %ebp,%esi - - movl %ebx,%ebp - xorl 36(%esp),%edi - xorl %ecx,%ebp - xorl 60(%esp),%edi - andl %eax,%ebp - xorl 16(%esp),%edi - roll 
$1,%edi - addl %edx,%ebp - rorl $2,%eax - movl %esi,%edx - roll $5,%edx - movl %edi,28(%esp) - leal 2400959708(%edi,%ebp,1),%edi - movl %ebx,%ebp - addl %edx,%edi - andl %ecx,%ebp - movl 32(%esp),%edx - addl %ebp,%edi - - movl %eax,%ebp - xorl 40(%esp),%edx - xorl %ebx,%ebp - xorl (%esp),%edx - andl %esi,%ebp - xorl 20(%esp),%edx - roll $1,%edx - addl %ecx,%ebp - rorl $2,%esi - movl %edi,%ecx - roll $5,%ecx - movl %edx,32(%esp) - leal 2400959708(%edx,%ebp,1),%edx - movl %eax,%ebp - addl %ecx,%edx - andl %ebx,%ebp - movl 36(%esp),%ecx - addl %ebp,%edx - - movl %esi,%ebp - xorl 44(%esp),%ecx - xorl %eax,%ebp - xorl 4(%esp),%ecx - andl %edi,%ebp - xorl 24(%esp),%ecx - roll $1,%ecx - addl %ebx,%ebp - rorl $2,%edi - movl %edx,%ebx - roll $5,%ebx - movl %ecx,36(%esp) - leal 2400959708(%ecx,%ebp,1),%ecx - movl %esi,%ebp - addl %ebx,%ecx - andl %eax,%ebp - movl 40(%esp),%ebx - addl %ebp,%ecx - - movl %edi,%ebp - xorl 48(%esp),%ebx - xorl %esi,%ebp - xorl 8(%esp),%ebx - andl %edx,%ebp - xorl 28(%esp),%ebx - roll $1,%ebx - addl %eax,%ebp - rorl $2,%edx - movl %ecx,%eax - roll $5,%eax - movl %ebx,40(%esp) - leal 2400959708(%ebx,%ebp,1),%ebx - movl %edi,%ebp - addl %eax,%ebx - andl %esi,%ebp - movl 44(%esp),%eax - addl %ebp,%ebx - - movl %edx,%ebp - xorl 52(%esp),%eax - xorl %edi,%ebp - xorl 12(%esp),%eax - andl %ecx,%ebp - xorl 32(%esp),%eax - roll $1,%eax - addl %esi,%ebp - rorl $2,%ecx - movl %ebx,%esi - roll $5,%esi - movl %eax,44(%esp) - leal 2400959708(%eax,%ebp,1),%eax - movl %edx,%ebp - addl %esi,%eax - andl %edi,%ebp - movl 48(%esp),%esi - addl %ebp,%eax - - movl %ebx,%ebp - xorl 56(%esp),%esi - xorl %ecx,%ebp - xorl 16(%esp),%esi - xorl %edx,%ebp - xorl 36(%esp),%esi - roll $1,%esi - addl %ebp,%edi - rorl $2,%ebx - movl %eax,%ebp - roll $5,%ebp - movl %esi,48(%esp) - leal 3395469782(%esi,%edi,1),%esi - movl 52(%esp),%edi - addl %ebp,%esi - - movl %eax,%ebp - xorl 60(%esp),%edi - xorl %ebx,%ebp - xorl 20(%esp),%edi - xorl %ecx,%ebp - xorl 40(%esp),%edi - roll $1,%edi 
- addl %ebp,%edx - rorl $2,%eax - movl %esi,%ebp - roll $5,%ebp - movl %edi,52(%esp) - leal 3395469782(%edi,%edx,1),%edi - movl 56(%esp),%edx - addl %ebp,%edi - - movl %esi,%ebp - xorl (%esp),%edx - xorl %eax,%ebp - xorl 24(%esp),%edx - xorl %ebx,%ebp - xorl 44(%esp),%edx - roll $1,%edx - addl %ebp,%ecx - rorl $2,%esi - movl %edi,%ebp - roll $5,%ebp - movl %edx,56(%esp) - leal 3395469782(%edx,%ecx,1),%edx - movl 60(%esp),%ecx - addl %ebp,%edx - - movl %edi,%ebp - xorl 4(%esp),%ecx - xorl %esi,%ebp - xorl 28(%esp),%ecx - xorl %eax,%ebp - xorl 48(%esp),%ecx - roll $1,%ecx - addl %ebp,%ebx - rorl $2,%edi - movl %edx,%ebp - roll $5,%ebp - movl %ecx,60(%esp) - leal 3395469782(%ecx,%ebx,1),%ecx - movl (%esp),%ebx - addl %ebp,%ecx - - movl %edx,%ebp - xorl 8(%esp),%ebx - xorl %edi,%ebp - xorl 32(%esp),%ebx - xorl %esi,%ebp - xorl 52(%esp),%ebx - roll $1,%ebx - addl %ebp,%eax - rorl $2,%edx - movl %ecx,%ebp - roll $5,%ebp - movl %ebx,(%esp) - leal 3395469782(%ebx,%eax,1),%ebx - movl 4(%esp),%eax - addl %ebp,%ebx - - movl %ecx,%ebp - xorl 12(%esp),%eax - xorl %edx,%ebp - xorl 36(%esp),%eax - xorl %edi,%ebp - xorl 56(%esp),%eax - roll $1,%eax - addl %ebp,%esi - rorl $2,%ecx - movl %ebx,%ebp - roll $5,%ebp - movl %eax,4(%esp) - leal 3395469782(%eax,%esi,1),%eax - movl 8(%esp),%esi - addl %ebp,%eax - - movl %ebx,%ebp - xorl 16(%esp),%esi - xorl %ecx,%ebp - xorl 40(%esp),%esi - xorl %edx,%ebp - xorl 60(%esp),%esi - roll $1,%esi - addl %ebp,%edi - rorl $2,%ebx - movl %eax,%ebp - roll $5,%ebp - movl %esi,8(%esp) - leal 3395469782(%esi,%edi,1),%esi - movl 12(%esp),%edi - addl %ebp,%esi - - movl %eax,%ebp - xorl 20(%esp),%edi - xorl %ebx,%ebp - xorl 44(%esp),%edi - xorl %ecx,%ebp - xorl (%esp),%edi - roll $1,%edi - addl %ebp,%edx - rorl $2,%eax - movl %esi,%ebp - roll $5,%ebp - movl %edi,12(%esp) - leal 3395469782(%edi,%edx,1),%edi - movl 16(%esp),%edx - addl %ebp,%edi - - movl %esi,%ebp - xorl 24(%esp),%edx - xorl %eax,%ebp - xorl 48(%esp),%edx - xorl %ebx,%ebp - xorl 4(%esp),%edx 
- roll $1,%edx - addl %ebp,%ecx - rorl $2,%esi - movl %edi,%ebp - roll $5,%ebp - movl %edx,16(%esp) - leal 3395469782(%edx,%ecx,1),%edx - movl 20(%esp),%ecx - addl %ebp,%edx - - movl %edi,%ebp - xorl 28(%esp),%ecx - xorl %esi,%ebp - xorl 52(%esp),%ecx - xorl %eax,%ebp - xorl 8(%esp),%ecx - roll $1,%ecx - addl %ebp,%ebx - rorl $2,%edi - movl %edx,%ebp - roll $5,%ebp - movl %ecx,20(%esp) - leal 3395469782(%ecx,%ebx,1),%ecx - movl 24(%esp),%ebx - addl %ebp,%ecx - - movl %edx,%ebp - xorl 32(%esp),%ebx - xorl %edi,%ebp - xorl 56(%esp),%ebx - xorl %esi,%ebp - xorl 12(%esp),%ebx - roll $1,%ebx - addl %ebp,%eax - rorl $2,%edx - movl %ecx,%ebp - roll $5,%ebp - movl %ebx,24(%esp) - leal 3395469782(%ebx,%eax,1),%ebx - movl 28(%esp),%eax - addl %ebp,%ebx - - movl %ecx,%ebp - xorl 36(%esp),%eax - xorl %edx,%ebp - xorl 60(%esp),%eax - xorl %edi,%ebp - xorl 16(%esp),%eax - roll $1,%eax - addl %ebp,%esi - rorl $2,%ecx - movl %ebx,%ebp - roll $5,%ebp - movl %eax,28(%esp) - leal 3395469782(%eax,%esi,1),%eax - movl 32(%esp),%esi - addl %ebp,%eax - - movl %ebx,%ebp - xorl 40(%esp),%esi - xorl %ecx,%ebp - xorl (%esp),%esi - xorl %edx,%ebp - xorl 20(%esp),%esi - roll $1,%esi - addl %ebp,%edi - rorl $2,%ebx - movl %eax,%ebp - roll $5,%ebp - movl %esi,32(%esp) - leal 3395469782(%esi,%edi,1),%esi - movl 36(%esp),%edi - addl %ebp,%esi - - movl %eax,%ebp - xorl 44(%esp),%edi - xorl %ebx,%ebp - xorl 4(%esp),%edi - xorl %ecx,%ebp - xorl 24(%esp),%edi - roll $1,%edi - addl %ebp,%edx - rorl $2,%eax - movl %esi,%ebp - roll $5,%ebp - movl %edi,36(%esp) - leal 3395469782(%edi,%edx,1),%edi - movl 40(%esp),%edx - addl %ebp,%edi - - movl %esi,%ebp - xorl 48(%esp),%edx - xorl %eax,%ebp - xorl 8(%esp),%edx - xorl %ebx,%ebp - xorl 28(%esp),%edx - roll $1,%edx - addl %ebp,%ecx - rorl $2,%esi - movl %edi,%ebp - roll $5,%ebp - movl %edx,40(%esp) - leal 3395469782(%edx,%ecx,1),%edx - movl 44(%esp),%ecx - addl %ebp,%edx - - movl %edi,%ebp - xorl 52(%esp),%ecx - xorl %esi,%ebp - xorl 12(%esp),%ecx - xorl 
%eax,%ebp - xorl 32(%esp),%ecx - roll $1,%ecx - addl %ebp,%ebx - rorl $2,%edi - movl %edx,%ebp - roll $5,%ebp - movl %ecx,44(%esp) - leal 3395469782(%ecx,%ebx,1),%ecx - movl 48(%esp),%ebx - addl %ebp,%ecx - - movl %edx,%ebp - xorl 56(%esp),%ebx - xorl %edi,%ebp - xorl 16(%esp),%ebx - xorl %esi,%ebp - xorl 36(%esp),%ebx - roll $1,%ebx - addl %ebp,%eax - rorl $2,%edx - movl %ecx,%ebp - roll $5,%ebp - movl %ebx,48(%esp) - leal 3395469782(%ebx,%eax,1),%ebx - movl 52(%esp),%eax - addl %ebp,%ebx - - movl %ecx,%ebp - xorl 60(%esp),%eax - xorl %edx,%ebp - xorl 20(%esp),%eax - xorl %edi,%ebp - xorl 40(%esp),%eax - roll $1,%eax - addl %ebp,%esi - rorl $2,%ecx - movl %ebx,%ebp - roll $5,%ebp - leal 3395469782(%eax,%esi,1),%eax - movl 56(%esp),%esi - addl %ebp,%eax - - movl %ebx,%ebp - xorl (%esp),%esi - xorl %ecx,%ebp - xorl 24(%esp),%esi - xorl %edx,%ebp - xorl 44(%esp),%esi - roll $1,%esi - addl %ebp,%edi - rorl $2,%ebx - movl %eax,%ebp - roll $5,%ebp - leal 3395469782(%esi,%edi,1),%esi - movl 60(%esp),%edi - addl %ebp,%esi - - movl %eax,%ebp - xorl 4(%esp),%edi - xorl %ebx,%ebp - xorl 28(%esp),%edi - xorl %ecx,%ebp - xorl 48(%esp),%edi - roll $1,%edi - addl %ebp,%edx - rorl $2,%eax - movl %esi,%ebp - roll $5,%ebp - leal 3395469782(%edi,%edx,1),%edi - addl %ebp,%edi - movl 96(%esp),%ebp - movl 100(%esp),%edx - addl (%ebp),%edi - addl 4(%ebp),%esi - addl 8(%ebp),%eax - addl 12(%ebp),%ebx - addl 16(%ebp),%ecx - movl %edi,(%ebp) - addl $64,%edx - movl %esi,4(%ebp) - cmpl 104(%esp),%edx - movl %eax,8(%ebp) - movl %ecx,%edi - movl %ebx,12(%ebp) - movl %edx,%esi - movl %ecx,16(%ebp) - jb .L002loop - addl $76,%esp - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.size sha1_block_data_order,.-.L_sha1_block_data_order_begin -.hidden _sha1_block_data_order_ssse3 -.type _sha1_block_data_order_ssse3,@function -.align 16 -_sha1_block_data_order_ssse3: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - call .L003pic_point -.L003pic_point: - popl %ebp - leal 
.LK_XX_XX-.L003pic_point(%ebp),%ebp -.Lssse3_shortcut: - movdqa (%ebp),%xmm7 - movdqa 16(%ebp),%xmm0 - movdqa 32(%ebp),%xmm1 - movdqa 48(%ebp),%xmm2 - movdqa 64(%ebp),%xmm6 - movl 20(%esp),%edi - movl 24(%esp),%ebp - movl 28(%esp),%edx - movl %esp,%esi - subl $208,%esp - andl $-64,%esp - movdqa %xmm0,112(%esp) - movdqa %xmm1,128(%esp) - movdqa %xmm2,144(%esp) - shll $6,%edx - movdqa %xmm7,160(%esp) - addl %ebp,%edx - movdqa %xmm6,176(%esp) - addl $64,%ebp - movl %edi,192(%esp) - movl %ebp,196(%esp) - movl %edx,200(%esp) - movl %esi,204(%esp) - movl (%edi),%eax - movl 4(%edi),%ebx - movl 8(%edi),%ecx - movl 12(%edi),%edx - movl 16(%edi),%edi - movl %ebx,%esi - movdqu -64(%ebp),%xmm0 - movdqu -48(%ebp),%xmm1 - movdqu -32(%ebp),%xmm2 - movdqu -16(%ebp),%xmm3 -.byte 102,15,56,0,198 -.byte 102,15,56,0,206 -.byte 102,15,56,0,214 - movdqa %xmm7,96(%esp) -.byte 102,15,56,0,222 - paddd %xmm7,%xmm0 - paddd %xmm7,%xmm1 - paddd %xmm7,%xmm2 - movdqa %xmm0,(%esp) - psubd %xmm7,%xmm0 - movdqa %xmm1,16(%esp) - psubd %xmm7,%xmm1 - movdqa %xmm2,32(%esp) - movl %ecx,%ebp - psubd %xmm7,%xmm2 - xorl %edx,%ebp - pshufd $238,%xmm0,%xmm4 - andl %ebp,%esi - jmp .L004loop -.align 16 -.L004loop: - rorl $2,%ebx - xorl %edx,%esi - movl %eax,%ebp - punpcklqdq %xmm1,%xmm4 - movdqa %xmm3,%xmm6 - addl (%esp),%edi - xorl %ecx,%ebx - paddd %xmm3,%xmm7 - movdqa %xmm0,64(%esp) - roll $5,%eax - addl %esi,%edi - psrldq $4,%xmm6 - andl %ebx,%ebp - xorl %ecx,%ebx - pxor %xmm0,%xmm4 - addl %eax,%edi - rorl $7,%eax - pxor %xmm2,%xmm6 - xorl %ecx,%ebp - movl %edi,%esi - addl 4(%esp),%edx - pxor %xmm6,%xmm4 - xorl %ebx,%eax - roll $5,%edi - movdqa %xmm7,48(%esp) - addl %ebp,%edx - andl %eax,%esi - movdqa %xmm4,%xmm0 - xorl %ebx,%eax - addl %edi,%edx - rorl $7,%edi - movdqa %xmm4,%xmm6 - xorl %ebx,%esi - pslldq $12,%xmm0 - paddd %xmm4,%xmm4 - movl %edx,%ebp - addl 8(%esp),%ecx - psrld $31,%xmm6 - xorl %eax,%edi - roll $5,%edx - movdqa %xmm0,%xmm7 - addl %esi,%ecx - andl %edi,%ebp - xorl %eax,%edi - psrld 
$30,%xmm0 - addl %edx,%ecx - rorl $7,%edx - por %xmm6,%xmm4 - xorl %eax,%ebp - movl %ecx,%esi - addl 12(%esp),%ebx - pslld $2,%xmm7 - xorl %edi,%edx - roll $5,%ecx - pxor %xmm0,%xmm4 - movdqa 96(%esp),%xmm0 - addl %ebp,%ebx - andl %edx,%esi - pxor %xmm7,%xmm4 - pshufd $238,%xmm1,%xmm5 - xorl %edi,%edx - addl %ecx,%ebx - rorl $7,%ecx - xorl %edi,%esi - movl %ebx,%ebp - punpcklqdq %xmm2,%xmm5 - movdqa %xmm4,%xmm7 - addl 16(%esp),%eax - xorl %edx,%ecx - paddd %xmm4,%xmm0 - movdqa %xmm1,80(%esp) - roll $5,%ebx - addl %esi,%eax - psrldq $4,%xmm7 - andl %ecx,%ebp - xorl %edx,%ecx - pxor %xmm1,%xmm5 - addl %ebx,%eax - rorl $7,%ebx - pxor %xmm3,%xmm7 - xorl %edx,%ebp - movl %eax,%esi - addl 20(%esp),%edi - pxor %xmm7,%xmm5 - xorl %ecx,%ebx - roll $5,%eax - movdqa %xmm0,(%esp) - addl %ebp,%edi - andl %ebx,%esi - movdqa %xmm5,%xmm1 - xorl %ecx,%ebx - addl %eax,%edi - rorl $7,%eax - movdqa %xmm5,%xmm7 - xorl %ecx,%esi - pslldq $12,%xmm1 - paddd %xmm5,%xmm5 - movl %edi,%ebp - addl 24(%esp),%edx - psrld $31,%xmm7 - xorl %ebx,%eax - roll $5,%edi - movdqa %xmm1,%xmm0 - addl %esi,%edx - andl %eax,%ebp - xorl %ebx,%eax - psrld $30,%xmm1 - addl %edi,%edx - rorl $7,%edi - por %xmm7,%xmm5 - xorl %ebx,%ebp - movl %edx,%esi - addl 28(%esp),%ecx - pslld $2,%xmm0 - xorl %eax,%edi - roll $5,%edx - pxor %xmm1,%xmm5 - movdqa 112(%esp),%xmm1 - addl %ebp,%ecx - andl %edi,%esi - pxor %xmm0,%xmm5 - pshufd $238,%xmm2,%xmm6 - xorl %eax,%edi - addl %edx,%ecx - rorl $7,%edx - xorl %eax,%esi - movl %ecx,%ebp - punpcklqdq %xmm3,%xmm6 - movdqa %xmm5,%xmm0 - addl 32(%esp),%ebx - xorl %edi,%edx - paddd %xmm5,%xmm1 - movdqa %xmm2,96(%esp) - roll $5,%ecx - addl %esi,%ebx - psrldq $4,%xmm0 - andl %edx,%ebp - xorl %edi,%edx - pxor %xmm2,%xmm6 - addl %ecx,%ebx - rorl $7,%ecx - pxor %xmm4,%xmm0 - xorl %edi,%ebp - movl %ebx,%esi - addl 36(%esp),%eax - pxor %xmm0,%xmm6 - xorl %edx,%ecx - roll $5,%ebx - movdqa %xmm1,16(%esp) - addl %ebp,%eax - andl %ecx,%esi - movdqa %xmm6,%xmm2 - xorl %edx,%ecx - addl %ebx,%eax 
- rorl $7,%ebx - movdqa %xmm6,%xmm0 - xorl %edx,%esi - pslldq $12,%xmm2 - paddd %xmm6,%xmm6 - movl %eax,%ebp - addl 40(%esp),%edi - psrld $31,%xmm0 - xorl %ecx,%ebx - roll $5,%eax - movdqa %xmm2,%xmm1 - addl %esi,%edi - andl %ebx,%ebp - xorl %ecx,%ebx - psrld $30,%xmm2 - addl %eax,%edi - rorl $7,%eax - por %xmm0,%xmm6 - xorl %ecx,%ebp - movdqa 64(%esp),%xmm0 - movl %edi,%esi - addl 44(%esp),%edx - pslld $2,%xmm1 - xorl %ebx,%eax - roll $5,%edi - pxor %xmm2,%xmm6 - movdqa 112(%esp),%xmm2 - addl %ebp,%edx - andl %eax,%esi - pxor %xmm1,%xmm6 - pshufd $238,%xmm3,%xmm7 - xorl %ebx,%eax - addl %edi,%edx - rorl $7,%edi - xorl %ebx,%esi - movl %edx,%ebp - punpcklqdq %xmm4,%xmm7 - movdqa %xmm6,%xmm1 - addl 48(%esp),%ecx - xorl %eax,%edi - paddd %xmm6,%xmm2 - movdqa %xmm3,64(%esp) - roll $5,%edx - addl %esi,%ecx - psrldq $4,%xmm1 - andl %edi,%ebp - xorl %eax,%edi - pxor %xmm3,%xmm7 - addl %edx,%ecx - rorl $7,%edx - pxor %xmm5,%xmm1 - xorl %eax,%ebp - movl %ecx,%esi - addl 52(%esp),%ebx - pxor %xmm1,%xmm7 - xorl %edi,%edx - roll $5,%ecx - movdqa %xmm2,32(%esp) - addl %ebp,%ebx - andl %edx,%esi - movdqa %xmm7,%xmm3 - xorl %edi,%edx - addl %ecx,%ebx - rorl $7,%ecx - movdqa %xmm7,%xmm1 - xorl %edi,%esi - pslldq $12,%xmm3 - paddd %xmm7,%xmm7 - movl %ebx,%ebp - addl 56(%esp),%eax - psrld $31,%xmm1 - xorl %edx,%ecx - roll $5,%ebx - movdqa %xmm3,%xmm2 - addl %esi,%eax - andl %ecx,%ebp - xorl %edx,%ecx - psrld $30,%xmm3 - addl %ebx,%eax - rorl $7,%ebx - por %xmm1,%xmm7 - xorl %edx,%ebp - movdqa 80(%esp),%xmm1 - movl %eax,%esi - addl 60(%esp),%edi - pslld $2,%xmm2 - xorl %ecx,%ebx - roll $5,%eax - pxor %xmm3,%xmm7 - movdqa 112(%esp),%xmm3 - addl %ebp,%edi - andl %ebx,%esi - pxor %xmm2,%xmm7 - pshufd $238,%xmm6,%xmm2 - xorl %ecx,%ebx - addl %eax,%edi - rorl $7,%eax - pxor %xmm4,%xmm0 - punpcklqdq %xmm7,%xmm2 - xorl %ecx,%esi - movl %edi,%ebp - addl (%esp),%edx - pxor %xmm1,%xmm0 - movdqa %xmm4,80(%esp) - xorl %ebx,%eax - roll $5,%edi - movdqa %xmm3,%xmm4 - addl %esi,%edx - paddd 
%xmm7,%xmm3 - andl %eax,%ebp - pxor %xmm2,%xmm0 - xorl %ebx,%eax - addl %edi,%edx - rorl $7,%edi - xorl %ebx,%ebp - movdqa %xmm0,%xmm2 - movdqa %xmm3,48(%esp) - movl %edx,%esi - addl 4(%esp),%ecx - xorl %eax,%edi - roll $5,%edx - pslld $2,%xmm0 - addl %ebp,%ecx - andl %edi,%esi - psrld $30,%xmm2 - xorl %eax,%edi - addl %edx,%ecx - rorl $7,%edx - xorl %eax,%esi - movl %ecx,%ebp - addl 8(%esp),%ebx - xorl %edi,%edx - roll $5,%ecx - por %xmm2,%xmm0 - addl %esi,%ebx - andl %edx,%ebp - movdqa 96(%esp),%xmm2 - xorl %edi,%edx - addl %ecx,%ebx - addl 12(%esp),%eax - xorl %edi,%ebp - movl %ebx,%esi - pshufd $238,%xmm7,%xmm3 - roll $5,%ebx - addl %ebp,%eax - xorl %edx,%esi - rorl $7,%ecx - addl %ebx,%eax - addl 16(%esp),%edi - pxor %xmm5,%xmm1 - punpcklqdq %xmm0,%xmm3 - xorl %ecx,%esi - movl %eax,%ebp - roll $5,%eax - pxor %xmm2,%xmm1 - movdqa %xmm5,96(%esp) - addl %esi,%edi - xorl %ecx,%ebp - movdqa %xmm4,%xmm5 - rorl $7,%ebx - paddd %xmm0,%xmm4 - addl %eax,%edi - pxor %xmm3,%xmm1 - addl 20(%esp),%edx - xorl %ebx,%ebp - movl %edi,%esi - roll $5,%edi - movdqa %xmm1,%xmm3 - movdqa %xmm4,(%esp) - addl %ebp,%edx - xorl %ebx,%esi - rorl $7,%eax - addl %edi,%edx - pslld $2,%xmm1 - addl 24(%esp),%ecx - xorl %eax,%esi - psrld $30,%xmm3 - movl %edx,%ebp - roll $5,%edx - addl %esi,%ecx - xorl %eax,%ebp - rorl $7,%edi - addl %edx,%ecx - por %xmm3,%xmm1 - addl 28(%esp),%ebx - xorl %edi,%ebp - movdqa 64(%esp),%xmm3 - movl %ecx,%esi - roll $5,%ecx - addl %ebp,%ebx - xorl %edi,%esi - rorl $7,%edx - pshufd $238,%xmm0,%xmm4 - addl %ecx,%ebx - addl 32(%esp),%eax - pxor %xmm6,%xmm2 - punpcklqdq %xmm1,%xmm4 - xorl %edx,%esi - movl %ebx,%ebp - roll $5,%ebx - pxor %xmm3,%xmm2 - movdqa %xmm6,64(%esp) - addl %esi,%eax - xorl %edx,%ebp - movdqa 128(%esp),%xmm6 - rorl $7,%ecx - paddd %xmm1,%xmm5 - addl %ebx,%eax - pxor %xmm4,%xmm2 - addl 36(%esp),%edi - xorl %ecx,%ebp - movl %eax,%esi - roll $5,%eax - movdqa %xmm2,%xmm4 - movdqa %xmm5,16(%esp) - addl %ebp,%edi - xorl %ecx,%esi - rorl $7,%ebx - addl 
%eax,%edi - pslld $2,%xmm2 - addl 40(%esp),%edx - xorl %ebx,%esi - psrld $30,%xmm4 - movl %edi,%ebp - roll $5,%edi - addl %esi,%edx - xorl %ebx,%ebp - rorl $7,%eax - addl %edi,%edx - por %xmm4,%xmm2 - addl 44(%esp),%ecx - xorl %eax,%ebp - movdqa 80(%esp),%xmm4 - movl %edx,%esi - roll $5,%edx - addl %ebp,%ecx - xorl %eax,%esi - rorl $7,%edi - pshufd $238,%xmm1,%xmm5 - addl %edx,%ecx - addl 48(%esp),%ebx - pxor %xmm7,%xmm3 - punpcklqdq %xmm2,%xmm5 - xorl %edi,%esi - movl %ecx,%ebp - roll $5,%ecx - pxor %xmm4,%xmm3 - movdqa %xmm7,80(%esp) - addl %esi,%ebx - xorl %edi,%ebp - movdqa %xmm6,%xmm7 - rorl $7,%edx - paddd %xmm2,%xmm6 - addl %ecx,%ebx - pxor %xmm5,%xmm3 - addl 52(%esp),%eax - xorl %edx,%ebp - movl %ebx,%esi - roll $5,%ebx - movdqa %xmm3,%xmm5 - movdqa %xmm6,32(%esp) - addl %ebp,%eax - xorl %edx,%esi - rorl $7,%ecx - addl %ebx,%eax - pslld $2,%xmm3 - addl 56(%esp),%edi - xorl %ecx,%esi - psrld $30,%xmm5 - movl %eax,%ebp - roll $5,%eax - addl %esi,%edi - xorl %ecx,%ebp - rorl $7,%ebx - addl %eax,%edi - por %xmm5,%xmm3 - addl 60(%esp),%edx - xorl %ebx,%ebp - movdqa 96(%esp),%xmm5 - movl %edi,%esi - roll $5,%edi - addl %ebp,%edx - xorl %ebx,%esi - rorl $7,%eax - pshufd $238,%xmm2,%xmm6 - addl %edi,%edx - addl (%esp),%ecx - pxor %xmm0,%xmm4 - punpcklqdq %xmm3,%xmm6 - xorl %eax,%esi - movl %edx,%ebp - roll $5,%edx - pxor %xmm5,%xmm4 - movdqa %xmm0,96(%esp) - addl %esi,%ecx - xorl %eax,%ebp - movdqa %xmm7,%xmm0 - rorl $7,%edi - paddd %xmm3,%xmm7 - addl %edx,%ecx - pxor %xmm6,%xmm4 - addl 4(%esp),%ebx - xorl %edi,%ebp - movl %ecx,%esi - roll $5,%ecx - movdqa %xmm4,%xmm6 - movdqa %xmm7,48(%esp) - addl %ebp,%ebx - xorl %edi,%esi - rorl $7,%edx - addl %ecx,%ebx - pslld $2,%xmm4 - addl 8(%esp),%eax - xorl %edx,%esi - psrld $30,%xmm6 - movl %ebx,%ebp - roll $5,%ebx - addl %esi,%eax - xorl %edx,%ebp - rorl $7,%ecx - addl %ebx,%eax - por %xmm6,%xmm4 - addl 12(%esp),%edi - xorl %ecx,%ebp - movdqa 64(%esp),%xmm6 - movl %eax,%esi - roll $5,%eax - addl %ebp,%edi - xorl 
%ecx,%esi - rorl $7,%ebx - pshufd $238,%xmm3,%xmm7 - addl %eax,%edi - addl 16(%esp),%edx - pxor %xmm1,%xmm5 - punpcklqdq %xmm4,%xmm7 - xorl %ebx,%esi - movl %edi,%ebp - roll $5,%edi - pxor %xmm6,%xmm5 - movdqa %xmm1,64(%esp) - addl %esi,%edx - xorl %ebx,%ebp - movdqa %xmm0,%xmm1 - rorl $7,%eax - paddd %xmm4,%xmm0 - addl %edi,%edx - pxor %xmm7,%xmm5 - addl 20(%esp),%ecx - xorl %eax,%ebp - movl %edx,%esi - roll $5,%edx - movdqa %xmm5,%xmm7 - movdqa %xmm0,(%esp) - addl %ebp,%ecx - xorl %eax,%esi - rorl $7,%edi - addl %edx,%ecx - pslld $2,%xmm5 - addl 24(%esp),%ebx - xorl %edi,%esi - psrld $30,%xmm7 - movl %ecx,%ebp - roll $5,%ecx - addl %esi,%ebx - xorl %edi,%ebp - rorl $7,%edx - addl %ecx,%ebx - por %xmm7,%xmm5 - addl 28(%esp),%eax - movdqa 80(%esp),%xmm7 - rorl $7,%ecx - movl %ebx,%esi - xorl %edx,%ebp - roll $5,%ebx - pshufd $238,%xmm4,%xmm0 - addl %ebp,%eax - xorl %ecx,%esi - xorl %edx,%ecx - addl %ebx,%eax - addl 32(%esp),%edi - pxor %xmm2,%xmm6 - punpcklqdq %xmm5,%xmm0 - andl %ecx,%esi - xorl %edx,%ecx - rorl $7,%ebx - pxor %xmm7,%xmm6 - movdqa %xmm2,80(%esp) - movl %eax,%ebp - xorl %ecx,%esi - roll $5,%eax - movdqa %xmm1,%xmm2 - addl %esi,%edi - paddd %xmm5,%xmm1 - xorl %ebx,%ebp - pxor %xmm0,%xmm6 - xorl %ecx,%ebx - addl %eax,%edi - addl 36(%esp),%edx - andl %ebx,%ebp - movdqa %xmm6,%xmm0 - movdqa %xmm1,16(%esp) - xorl %ecx,%ebx - rorl $7,%eax - movl %edi,%esi - xorl %ebx,%ebp - roll $5,%edi - pslld $2,%xmm6 - addl %ebp,%edx - xorl %eax,%esi - psrld $30,%xmm0 - xorl %ebx,%eax - addl %edi,%edx - addl 40(%esp),%ecx - andl %eax,%esi - xorl %ebx,%eax - rorl $7,%edi - por %xmm0,%xmm6 - movl %edx,%ebp - xorl %eax,%esi - movdqa 96(%esp),%xmm0 - roll $5,%edx - addl %esi,%ecx - xorl %edi,%ebp - xorl %eax,%edi - addl %edx,%ecx - pshufd $238,%xmm5,%xmm1 - addl 44(%esp),%ebx - andl %edi,%ebp - xorl %eax,%edi - rorl $7,%edx - movl %ecx,%esi - xorl %edi,%ebp - roll $5,%ecx - addl %ebp,%ebx - xorl %edx,%esi - xorl %edi,%edx - addl %ecx,%ebx - addl 48(%esp),%eax - pxor 
%xmm3,%xmm7 - punpcklqdq %xmm6,%xmm1 - andl %edx,%esi - xorl %edi,%edx - rorl $7,%ecx - pxor %xmm0,%xmm7 - movdqa %xmm3,96(%esp) - movl %ebx,%ebp - xorl %edx,%esi - roll $5,%ebx - movdqa 144(%esp),%xmm3 - addl %esi,%eax - paddd %xmm6,%xmm2 - xorl %ecx,%ebp - pxor %xmm1,%xmm7 - xorl %edx,%ecx - addl %ebx,%eax - addl 52(%esp),%edi - andl %ecx,%ebp - movdqa %xmm7,%xmm1 - movdqa %xmm2,32(%esp) - xorl %edx,%ecx - rorl $7,%ebx - movl %eax,%esi - xorl %ecx,%ebp - roll $5,%eax - pslld $2,%xmm7 - addl %ebp,%edi - xorl %ebx,%esi - psrld $30,%xmm1 - xorl %ecx,%ebx - addl %eax,%edi - addl 56(%esp),%edx - andl %ebx,%esi - xorl %ecx,%ebx - rorl $7,%eax - por %xmm1,%xmm7 - movl %edi,%ebp - xorl %ebx,%esi - movdqa 64(%esp),%xmm1 - roll $5,%edi - addl %esi,%edx - xorl %eax,%ebp - xorl %ebx,%eax - addl %edi,%edx - pshufd $238,%xmm6,%xmm2 - addl 60(%esp),%ecx - andl %eax,%ebp - xorl %ebx,%eax - rorl $7,%edi - movl %edx,%esi - xorl %eax,%ebp - roll $5,%edx - addl %ebp,%ecx - xorl %edi,%esi - xorl %eax,%edi - addl %edx,%ecx - addl (%esp),%ebx - pxor %xmm4,%xmm0 - punpcklqdq %xmm7,%xmm2 - andl %edi,%esi - xorl %eax,%edi - rorl $7,%edx - pxor %xmm1,%xmm0 - movdqa %xmm4,64(%esp) - movl %ecx,%ebp - xorl %edi,%esi - roll $5,%ecx - movdqa %xmm3,%xmm4 - addl %esi,%ebx - paddd %xmm7,%xmm3 - xorl %edx,%ebp - pxor %xmm2,%xmm0 - xorl %edi,%edx - addl %ecx,%ebx - addl 4(%esp),%eax - andl %edx,%ebp - movdqa %xmm0,%xmm2 - movdqa %xmm3,48(%esp) - xorl %edi,%edx - rorl $7,%ecx - movl %ebx,%esi - xorl %edx,%ebp - roll $5,%ebx - pslld $2,%xmm0 - addl %ebp,%eax - xorl %ecx,%esi - psrld $30,%xmm2 - xorl %edx,%ecx - addl %ebx,%eax - addl 8(%esp),%edi - andl %ecx,%esi - xorl %edx,%ecx - rorl $7,%ebx - por %xmm2,%xmm0 - movl %eax,%ebp - xorl %ecx,%esi - movdqa 80(%esp),%xmm2 - roll $5,%eax - addl %esi,%edi - xorl %ebx,%ebp - xorl %ecx,%ebx - addl %eax,%edi - pshufd $238,%xmm7,%xmm3 - addl 12(%esp),%edx - andl %ebx,%ebp - xorl %ecx,%ebx - rorl $7,%eax - movl %edi,%esi - xorl %ebx,%ebp - roll $5,%edi - addl 
%ebp,%edx - xorl %eax,%esi - xorl %ebx,%eax - addl %edi,%edx - addl 16(%esp),%ecx - pxor %xmm5,%xmm1 - punpcklqdq %xmm0,%xmm3 - andl %eax,%esi - xorl %ebx,%eax - rorl $7,%edi - pxor %xmm2,%xmm1 - movdqa %xmm5,80(%esp) - movl %edx,%ebp - xorl %eax,%esi - roll $5,%edx - movdqa %xmm4,%xmm5 - addl %esi,%ecx - paddd %xmm0,%xmm4 - xorl %edi,%ebp - pxor %xmm3,%xmm1 - xorl %eax,%edi - addl %edx,%ecx - addl 20(%esp),%ebx - andl %edi,%ebp - movdqa %xmm1,%xmm3 - movdqa %xmm4,(%esp) - xorl %eax,%edi - rorl $7,%edx - movl %ecx,%esi - xorl %edi,%ebp - roll $5,%ecx - pslld $2,%xmm1 - addl %ebp,%ebx - xorl %edx,%esi - psrld $30,%xmm3 - xorl %edi,%edx - addl %ecx,%ebx - addl 24(%esp),%eax - andl %edx,%esi - xorl %edi,%edx - rorl $7,%ecx - por %xmm3,%xmm1 - movl %ebx,%ebp - xorl %edx,%esi - movdqa 96(%esp),%xmm3 - roll $5,%ebx - addl %esi,%eax - xorl %ecx,%ebp - xorl %edx,%ecx - addl %ebx,%eax - pshufd $238,%xmm0,%xmm4 - addl 28(%esp),%edi - andl %ecx,%ebp - xorl %edx,%ecx - rorl $7,%ebx - movl %eax,%esi - xorl %ecx,%ebp - roll $5,%eax - addl %ebp,%edi - xorl %ebx,%esi - xorl %ecx,%ebx - addl %eax,%edi - addl 32(%esp),%edx - pxor %xmm6,%xmm2 - punpcklqdq %xmm1,%xmm4 - andl %ebx,%esi - xorl %ecx,%ebx - rorl $7,%eax - pxor %xmm3,%xmm2 - movdqa %xmm6,96(%esp) - movl %edi,%ebp - xorl %ebx,%esi - roll $5,%edi - movdqa %xmm5,%xmm6 - addl %esi,%edx - paddd %xmm1,%xmm5 - xorl %eax,%ebp - pxor %xmm4,%xmm2 - xorl %ebx,%eax - addl %edi,%edx - addl 36(%esp),%ecx - andl %eax,%ebp - movdqa %xmm2,%xmm4 - movdqa %xmm5,16(%esp) - xorl %ebx,%eax - rorl $7,%edi - movl %edx,%esi - xorl %eax,%ebp - roll $5,%edx - pslld $2,%xmm2 - addl %ebp,%ecx - xorl %edi,%esi - psrld $30,%xmm4 - xorl %eax,%edi - addl %edx,%ecx - addl 40(%esp),%ebx - andl %edi,%esi - xorl %eax,%edi - rorl $7,%edx - por %xmm4,%xmm2 - movl %ecx,%ebp - xorl %edi,%esi - movdqa 64(%esp),%xmm4 - roll $5,%ecx - addl %esi,%ebx - xorl %edx,%ebp - xorl %edi,%edx - addl %ecx,%ebx - pshufd $238,%xmm1,%xmm5 - addl 44(%esp),%eax - andl %edx,%ebp - 
xorl %edi,%edx - rorl $7,%ecx - movl %ebx,%esi - xorl %edx,%ebp - roll $5,%ebx - addl %ebp,%eax - xorl %edx,%esi - addl %ebx,%eax - addl 48(%esp),%edi - pxor %xmm7,%xmm3 - punpcklqdq %xmm2,%xmm5 - xorl %ecx,%esi - movl %eax,%ebp - roll $5,%eax - pxor %xmm4,%xmm3 - movdqa %xmm7,64(%esp) - addl %esi,%edi - xorl %ecx,%ebp - movdqa %xmm6,%xmm7 - rorl $7,%ebx - paddd %xmm2,%xmm6 - addl %eax,%edi - pxor %xmm5,%xmm3 - addl 52(%esp),%edx - xorl %ebx,%ebp - movl %edi,%esi - roll $5,%edi - movdqa %xmm3,%xmm5 - movdqa %xmm6,32(%esp) - addl %ebp,%edx - xorl %ebx,%esi - rorl $7,%eax - addl %edi,%edx - pslld $2,%xmm3 - addl 56(%esp),%ecx - xorl %eax,%esi - psrld $30,%xmm5 - movl %edx,%ebp - roll $5,%edx - addl %esi,%ecx - xorl %eax,%ebp - rorl $7,%edi - addl %edx,%ecx - por %xmm5,%xmm3 - addl 60(%esp),%ebx - xorl %edi,%ebp - movl %ecx,%esi - roll $5,%ecx - addl %ebp,%ebx - xorl %edi,%esi - rorl $7,%edx - addl %ecx,%ebx - addl (%esp),%eax - xorl %edx,%esi - movl %ebx,%ebp - roll $5,%ebx - addl %esi,%eax - xorl %edx,%ebp - rorl $7,%ecx - paddd %xmm3,%xmm7 - addl %ebx,%eax - addl 4(%esp),%edi - xorl %ecx,%ebp - movl %eax,%esi - movdqa %xmm7,48(%esp) - roll $5,%eax - addl %ebp,%edi - xorl %ecx,%esi - rorl $7,%ebx - addl %eax,%edi - addl 8(%esp),%edx - xorl %ebx,%esi - movl %edi,%ebp - roll $5,%edi - addl %esi,%edx - xorl %ebx,%ebp - rorl $7,%eax - addl %edi,%edx - addl 12(%esp),%ecx - xorl %eax,%ebp - movl %edx,%esi - roll $5,%edx - addl %ebp,%ecx - xorl %eax,%esi - rorl $7,%edi - addl %edx,%ecx - movl 196(%esp),%ebp - cmpl 200(%esp),%ebp - je .L005done - movdqa 160(%esp),%xmm7 - movdqa 176(%esp),%xmm6 - movdqu (%ebp),%xmm0 - movdqu 16(%ebp),%xmm1 - movdqu 32(%ebp),%xmm2 - movdqu 48(%ebp),%xmm3 - addl $64,%ebp -.byte 102,15,56,0,198 - movl %ebp,196(%esp) - movdqa %xmm7,96(%esp) - addl 16(%esp),%ebx - xorl %edi,%esi - movl %ecx,%ebp - roll $5,%ecx - addl %esi,%ebx - xorl %edi,%ebp - rorl $7,%edx -.byte 102,15,56,0,206 - addl %ecx,%ebx - addl 20(%esp),%eax - xorl %edx,%ebp - movl 
%ebx,%esi - paddd %xmm7,%xmm0 - roll $5,%ebx - addl %ebp,%eax - xorl %edx,%esi - rorl $7,%ecx - movdqa %xmm0,(%esp) - addl %ebx,%eax - addl 24(%esp),%edi - xorl %ecx,%esi - movl %eax,%ebp - psubd %xmm7,%xmm0 - roll $5,%eax - addl %esi,%edi - xorl %ecx,%ebp - rorl $7,%ebx - addl %eax,%edi - addl 28(%esp),%edx - xorl %ebx,%ebp - movl %edi,%esi - roll $5,%edi - addl %ebp,%edx - xorl %ebx,%esi - rorl $7,%eax - addl %edi,%edx - addl 32(%esp),%ecx - xorl %eax,%esi - movl %edx,%ebp - roll $5,%edx - addl %esi,%ecx - xorl %eax,%ebp - rorl $7,%edi -.byte 102,15,56,0,214 - addl %edx,%ecx - addl 36(%esp),%ebx - xorl %edi,%ebp - movl %ecx,%esi - paddd %xmm7,%xmm1 - roll $5,%ecx - addl %ebp,%ebx - xorl %edi,%esi - rorl $7,%edx - movdqa %xmm1,16(%esp) - addl %ecx,%ebx - addl 40(%esp),%eax - xorl %edx,%esi - movl %ebx,%ebp - psubd %xmm7,%xmm1 - roll $5,%ebx - addl %esi,%eax - xorl %edx,%ebp - rorl $7,%ecx - addl %ebx,%eax - addl 44(%esp),%edi - xorl %ecx,%ebp - movl %eax,%esi - roll $5,%eax - addl %ebp,%edi - xorl %ecx,%esi - rorl $7,%ebx - addl %eax,%edi - addl 48(%esp),%edx - xorl %ebx,%esi - movl %edi,%ebp - roll $5,%edi - addl %esi,%edx - xorl %ebx,%ebp - rorl $7,%eax -.byte 102,15,56,0,222 - addl %edi,%edx - addl 52(%esp),%ecx - xorl %eax,%ebp - movl %edx,%esi - paddd %xmm7,%xmm2 - roll $5,%edx - addl %ebp,%ecx - xorl %eax,%esi - rorl $7,%edi - movdqa %xmm2,32(%esp) - addl %edx,%ecx - addl 56(%esp),%ebx - xorl %edi,%esi - movl %ecx,%ebp - psubd %xmm7,%xmm2 - roll $5,%ecx - addl %esi,%ebx - xorl %edi,%ebp - rorl $7,%edx - addl %ecx,%ebx - addl 60(%esp),%eax - xorl %edx,%ebp - movl %ebx,%esi - roll $5,%ebx - addl %ebp,%eax - rorl $7,%ecx - addl %ebx,%eax - movl 192(%esp),%ebp - addl (%ebp),%eax - addl 4(%ebp),%esi - addl 8(%ebp),%ecx - movl %eax,(%ebp) - addl 12(%ebp),%edx - movl %esi,4(%ebp) - addl 16(%ebp),%edi - movl %ecx,8(%ebp) - movl %ecx,%ebx - movl %edx,12(%ebp) - xorl %edx,%ebx - movl %edi,16(%ebp) - movl %esi,%ebp - pshufd $238,%xmm0,%xmm4 - andl %ebx,%esi - movl 
%ebp,%ebx - jmp .L004loop -.align 16 -.L005done: - addl 16(%esp),%ebx - xorl %edi,%esi - movl %ecx,%ebp - roll $5,%ecx - addl %esi,%ebx - xorl %edi,%ebp - rorl $7,%edx - addl %ecx,%ebx - addl 20(%esp),%eax - xorl %edx,%ebp - movl %ebx,%esi - roll $5,%ebx - addl %ebp,%eax - xorl %edx,%esi - rorl $7,%ecx - addl %ebx,%eax - addl 24(%esp),%edi - xorl %ecx,%esi - movl %eax,%ebp - roll $5,%eax - addl %esi,%edi - xorl %ecx,%ebp - rorl $7,%ebx - addl %eax,%edi - addl 28(%esp),%edx - xorl %ebx,%ebp - movl %edi,%esi - roll $5,%edi - addl %ebp,%edx - xorl %ebx,%esi - rorl $7,%eax - addl %edi,%edx - addl 32(%esp),%ecx - xorl %eax,%esi - movl %edx,%ebp - roll $5,%edx - addl %esi,%ecx - xorl %eax,%ebp - rorl $7,%edi - addl %edx,%ecx - addl 36(%esp),%ebx - xorl %edi,%ebp - movl %ecx,%esi - roll $5,%ecx - addl %ebp,%ebx - xorl %edi,%esi - rorl $7,%edx - addl %ecx,%ebx - addl 40(%esp),%eax - xorl %edx,%esi - movl %ebx,%ebp - roll $5,%ebx - addl %esi,%eax - xorl %edx,%ebp - rorl $7,%ecx - addl %ebx,%eax - addl 44(%esp),%edi - xorl %ecx,%ebp - movl %eax,%esi - roll $5,%eax - addl %ebp,%edi - xorl %ecx,%esi - rorl $7,%ebx - addl %eax,%edi - addl 48(%esp),%edx - xorl %ebx,%esi - movl %edi,%ebp - roll $5,%edi - addl %esi,%edx - xorl %ebx,%ebp - rorl $7,%eax - addl %edi,%edx - addl 52(%esp),%ecx - xorl %eax,%ebp - movl %edx,%esi - roll $5,%edx - addl %ebp,%ecx - xorl %eax,%esi - rorl $7,%edi - addl %edx,%ecx - addl 56(%esp),%ebx - xorl %edi,%esi - movl %ecx,%ebp - roll $5,%ecx - addl %esi,%ebx - xorl %edi,%ebp - rorl $7,%edx - addl %ecx,%ebx - addl 60(%esp),%eax - xorl %edx,%ebp - movl %ebx,%esi - roll $5,%ebx - addl %ebp,%eax - rorl $7,%ecx - addl %ebx,%eax - movl 192(%esp),%ebp - addl (%ebp),%eax - movl 204(%esp),%esp - addl 4(%ebp),%esi - addl 8(%ebp),%ecx - movl %eax,(%ebp) - addl 12(%ebp),%edx - movl %esi,4(%ebp) - addl 16(%ebp),%edi - movl %ecx,8(%ebp) - movl %edx,12(%ebp) - movl %edi,16(%ebp) - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.size 
_sha1_block_data_order_ssse3,.-_sha1_block_data_order_ssse3 -.hidden _sha1_block_data_order_avx -.type _sha1_block_data_order_avx,@function -.align 16 -_sha1_block_data_order_avx: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - call .L006pic_point -.L006pic_point: - popl %ebp - leal .LK_XX_XX-.L006pic_point(%ebp),%ebp -.Lavx_shortcut: - vzeroall - vmovdqa (%ebp),%xmm7 - vmovdqa 16(%ebp),%xmm0 - vmovdqa 32(%ebp),%xmm1 - vmovdqa 48(%ebp),%xmm2 - vmovdqa 64(%ebp),%xmm6 - movl 20(%esp),%edi - movl 24(%esp),%ebp - movl 28(%esp),%edx - movl %esp,%esi - subl $208,%esp - andl $-64,%esp - vmovdqa %xmm0,112(%esp) - vmovdqa %xmm1,128(%esp) - vmovdqa %xmm2,144(%esp) - shll $6,%edx - vmovdqa %xmm7,160(%esp) - addl %ebp,%edx - vmovdqa %xmm6,176(%esp) - addl $64,%ebp - movl %edi,192(%esp) - movl %ebp,196(%esp) - movl %edx,200(%esp) - movl %esi,204(%esp) - movl (%edi),%eax - movl 4(%edi),%ebx - movl 8(%edi),%ecx - movl 12(%edi),%edx - movl 16(%edi),%edi - movl %ebx,%esi - vmovdqu -64(%ebp),%xmm0 - vmovdqu -48(%ebp),%xmm1 - vmovdqu -32(%ebp),%xmm2 - vmovdqu -16(%ebp),%xmm3 - vpshufb %xmm6,%xmm0,%xmm0 - vpshufb %xmm6,%xmm1,%xmm1 - vpshufb %xmm6,%xmm2,%xmm2 - vmovdqa %xmm7,96(%esp) - vpshufb %xmm6,%xmm3,%xmm3 - vpaddd %xmm7,%xmm0,%xmm4 - vpaddd %xmm7,%xmm1,%xmm5 - vpaddd %xmm7,%xmm2,%xmm6 - vmovdqa %xmm4,(%esp) - movl %ecx,%ebp - vmovdqa %xmm5,16(%esp) - xorl %edx,%ebp - vmovdqa %xmm6,32(%esp) - andl %ebp,%esi - jmp .L007loop -.align 16 -.L007loop: - shrdl $2,%ebx,%ebx - xorl %edx,%esi - vpalignr $8,%xmm0,%xmm1,%xmm4 - movl %eax,%ebp - addl (%esp),%edi - vpaddd %xmm3,%xmm7,%xmm7 - vmovdqa %xmm0,64(%esp) - xorl %ecx,%ebx - shldl $5,%eax,%eax - vpsrldq $4,%xmm3,%xmm6 - addl %esi,%edi - andl %ebx,%ebp - vpxor %xmm0,%xmm4,%xmm4 - xorl %ecx,%ebx - addl %eax,%edi - vpxor %xmm2,%xmm6,%xmm6 - shrdl $7,%eax,%eax - xorl %ecx,%ebp - vmovdqa %xmm7,48(%esp) - movl %edi,%esi - addl 4(%esp),%edx - vpxor %xmm6,%xmm4,%xmm4 - xorl %ebx,%eax - shldl $5,%edi,%edi - addl %ebp,%edx - andl %eax,%esi 
- vpsrld $31,%xmm4,%xmm6 - xorl %ebx,%eax - addl %edi,%edx - shrdl $7,%edi,%edi - xorl %ebx,%esi - vpslldq $12,%xmm4,%xmm0 - vpaddd %xmm4,%xmm4,%xmm4 - movl %edx,%ebp - addl 8(%esp),%ecx - xorl %eax,%edi - shldl $5,%edx,%edx - vpsrld $30,%xmm0,%xmm7 - vpor %xmm6,%xmm4,%xmm4 - addl %esi,%ecx - andl %edi,%ebp - xorl %eax,%edi - addl %edx,%ecx - vpslld $2,%xmm0,%xmm0 - shrdl $7,%edx,%edx - xorl %eax,%ebp - vpxor %xmm7,%xmm4,%xmm4 - movl %ecx,%esi - addl 12(%esp),%ebx - xorl %edi,%edx - shldl $5,%ecx,%ecx - vpxor %xmm0,%xmm4,%xmm4 - addl %ebp,%ebx - andl %edx,%esi - vmovdqa 96(%esp),%xmm0 - xorl %edi,%edx - addl %ecx,%ebx - shrdl $7,%ecx,%ecx - xorl %edi,%esi - vpalignr $8,%xmm1,%xmm2,%xmm5 - movl %ebx,%ebp - addl 16(%esp),%eax - vpaddd %xmm4,%xmm0,%xmm0 - vmovdqa %xmm1,80(%esp) - xorl %edx,%ecx - shldl $5,%ebx,%ebx - vpsrldq $4,%xmm4,%xmm7 - addl %esi,%eax - andl %ecx,%ebp - vpxor %xmm1,%xmm5,%xmm5 - xorl %edx,%ecx - addl %ebx,%eax - vpxor %xmm3,%xmm7,%xmm7 - shrdl $7,%ebx,%ebx - xorl %edx,%ebp - vmovdqa %xmm0,(%esp) - movl %eax,%esi - addl 20(%esp),%edi - vpxor %xmm7,%xmm5,%xmm5 - xorl %ecx,%ebx - shldl $5,%eax,%eax - addl %ebp,%edi - andl %ebx,%esi - vpsrld $31,%xmm5,%xmm7 - xorl %ecx,%ebx - addl %eax,%edi - shrdl $7,%eax,%eax - xorl %ecx,%esi - vpslldq $12,%xmm5,%xmm1 - vpaddd %xmm5,%xmm5,%xmm5 - movl %edi,%ebp - addl 24(%esp),%edx - xorl %ebx,%eax - shldl $5,%edi,%edi - vpsrld $30,%xmm1,%xmm0 - vpor %xmm7,%xmm5,%xmm5 - addl %esi,%edx - andl %eax,%ebp - xorl %ebx,%eax - addl %edi,%edx - vpslld $2,%xmm1,%xmm1 - shrdl $7,%edi,%edi - xorl %ebx,%ebp - vpxor %xmm0,%xmm5,%xmm5 - movl %edx,%esi - addl 28(%esp),%ecx - xorl %eax,%edi - shldl $5,%edx,%edx - vpxor %xmm1,%xmm5,%xmm5 - addl %ebp,%ecx - andl %edi,%esi - vmovdqa 112(%esp),%xmm1 - xorl %eax,%edi - addl %edx,%ecx - shrdl $7,%edx,%edx - xorl %eax,%esi - vpalignr $8,%xmm2,%xmm3,%xmm6 - movl %ecx,%ebp - addl 32(%esp),%ebx - vpaddd %xmm5,%xmm1,%xmm1 - vmovdqa %xmm2,96(%esp) - xorl %edi,%edx - shldl $5,%ecx,%ecx - 
vpsrldq $4,%xmm5,%xmm0 - addl %esi,%ebx - andl %edx,%ebp - vpxor %xmm2,%xmm6,%xmm6 - xorl %edi,%edx - addl %ecx,%ebx - vpxor %xmm4,%xmm0,%xmm0 - shrdl $7,%ecx,%ecx - xorl %edi,%ebp - vmovdqa %xmm1,16(%esp) - movl %ebx,%esi - addl 36(%esp),%eax - vpxor %xmm0,%xmm6,%xmm6 - xorl %edx,%ecx - shldl $5,%ebx,%ebx - addl %ebp,%eax - andl %ecx,%esi - vpsrld $31,%xmm6,%xmm0 - xorl %edx,%ecx - addl %ebx,%eax - shrdl $7,%ebx,%ebx - xorl %edx,%esi - vpslldq $12,%xmm6,%xmm2 - vpaddd %xmm6,%xmm6,%xmm6 - movl %eax,%ebp - addl 40(%esp),%edi - xorl %ecx,%ebx - shldl $5,%eax,%eax - vpsrld $30,%xmm2,%xmm1 - vpor %xmm0,%xmm6,%xmm6 - addl %esi,%edi - andl %ebx,%ebp - xorl %ecx,%ebx - addl %eax,%edi - vpslld $2,%xmm2,%xmm2 - vmovdqa 64(%esp),%xmm0 - shrdl $7,%eax,%eax - xorl %ecx,%ebp - vpxor %xmm1,%xmm6,%xmm6 - movl %edi,%esi - addl 44(%esp),%edx - xorl %ebx,%eax - shldl $5,%edi,%edi - vpxor %xmm2,%xmm6,%xmm6 - addl %ebp,%edx - andl %eax,%esi - vmovdqa 112(%esp),%xmm2 - xorl %ebx,%eax - addl %edi,%edx - shrdl $7,%edi,%edi - xorl %ebx,%esi - vpalignr $8,%xmm3,%xmm4,%xmm7 - movl %edx,%ebp - addl 48(%esp),%ecx - vpaddd %xmm6,%xmm2,%xmm2 - vmovdqa %xmm3,64(%esp) - xorl %eax,%edi - shldl $5,%edx,%edx - vpsrldq $4,%xmm6,%xmm1 - addl %esi,%ecx - andl %edi,%ebp - vpxor %xmm3,%xmm7,%xmm7 - xorl %eax,%edi - addl %edx,%ecx - vpxor %xmm5,%xmm1,%xmm1 - shrdl $7,%edx,%edx - xorl %eax,%ebp - vmovdqa %xmm2,32(%esp) - movl %ecx,%esi - addl 52(%esp),%ebx - vpxor %xmm1,%xmm7,%xmm7 - xorl %edi,%edx - shldl $5,%ecx,%ecx - addl %ebp,%ebx - andl %edx,%esi - vpsrld $31,%xmm7,%xmm1 - xorl %edi,%edx - addl %ecx,%ebx - shrdl $7,%ecx,%ecx - xorl %edi,%esi - vpslldq $12,%xmm7,%xmm3 - vpaddd %xmm7,%xmm7,%xmm7 - movl %ebx,%ebp - addl 56(%esp),%eax - xorl %edx,%ecx - shldl $5,%ebx,%ebx - vpsrld $30,%xmm3,%xmm2 - vpor %xmm1,%xmm7,%xmm7 - addl %esi,%eax - andl %ecx,%ebp - xorl %edx,%ecx - addl %ebx,%eax - vpslld $2,%xmm3,%xmm3 - vmovdqa 80(%esp),%xmm1 - shrdl $7,%ebx,%ebx - xorl %edx,%ebp - vpxor %xmm2,%xmm7,%xmm7 - 
movl %eax,%esi - addl 60(%esp),%edi - xorl %ecx,%ebx - shldl $5,%eax,%eax - vpxor %xmm3,%xmm7,%xmm7 - addl %ebp,%edi - andl %ebx,%esi - vmovdqa 112(%esp),%xmm3 - xorl %ecx,%ebx - addl %eax,%edi - vpalignr $8,%xmm6,%xmm7,%xmm2 - vpxor %xmm4,%xmm0,%xmm0 - shrdl $7,%eax,%eax - xorl %ecx,%esi - movl %edi,%ebp - addl (%esp),%edx - vpxor %xmm1,%xmm0,%xmm0 - vmovdqa %xmm4,80(%esp) - xorl %ebx,%eax - shldl $5,%edi,%edi - vmovdqa %xmm3,%xmm4 - vpaddd %xmm7,%xmm3,%xmm3 - addl %esi,%edx - andl %eax,%ebp - vpxor %xmm2,%xmm0,%xmm0 - xorl %ebx,%eax - addl %edi,%edx - shrdl $7,%edi,%edi - xorl %ebx,%ebp - vpsrld $30,%xmm0,%xmm2 - vmovdqa %xmm3,48(%esp) - movl %edx,%esi - addl 4(%esp),%ecx - xorl %eax,%edi - shldl $5,%edx,%edx - vpslld $2,%xmm0,%xmm0 - addl %ebp,%ecx - andl %edi,%esi - xorl %eax,%edi - addl %edx,%ecx - shrdl $7,%edx,%edx - xorl %eax,%esi - movl %ecx,%ebp - addl 8(%esp),%ebx - vpor %xmm2,%xmm0,%xmm0 - xorl %edi,%edx - shldl $5,%ecx,%ecx - vmovdqa 96(%esp),%xmm2 - addl %esi,%ebx - andl %edx,%ebp - xorl %edi,%edx - addl %ecx,%ebx - addl 12(%esp),%eax - xorl %edi,%ebp - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %ebp,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpalignr $8,%xmm7,%xmm0,%xmm3 - vpxor %xmm5,%xmm1,%xmm1 - addl 16(%esp),%edi - xorl %ecx,%esi - movl %eax,%ebp - shldl $5,%eax,%eax - vpxor %xmm2,%xmm1,%xmm1 - vmovdqa %xmm5,96(%esp) - addl %esi,%edi - xorl %ecx,%ebp - vmovdqa %xmm4,%xmm5 - vpaddd %xmm0,%xmm4,%xmm4 - shrdl $7,%ebx,%ebx - addl %eax,%edi - vpxor %xmm3,%xmm1,%xmm1 - addl 20(%esp),%edx - xorl %ebx,%ebp - movl %edi,%esi - shldl $5,%edi,%edi - vpsrld $30,%xmm1,%xmm3 - vmovdqa %xmm4,(%esp) - addl %ebp,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %edi,%edx - vpslld $2,%xmm1,%xmm1 - addl 24(%esp),%ecx - xorl %eax,%esi - movl %edx,%ebp - shldl $5,%edx,%edx - addl %esi,%ecx - xorl %eax,%ebp - shrdl $7,%edi,%edi - addl %edx,%ecx - vpor %xmm3,%xmm1,%xmm1 - addl 28(%esp),%ebx - xorl %edi,%ebp - vmovdqa 64(%esp),%xmm3 - movl %ecx,%esi - 
shldl $5,%ecx,%ecx - addl %ebp,%ebx - xorl %edi,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpalignr $8,%xmm0,%xmm1,%xmm4 - vpxor %xmm6,%xmm2,%xmm2 - addl 32(%esp),%eax - xorl %edx,%esi - movl %ebx,%ebp - shldl $5,%ebx,%ebx - vpxor %xmm3,%xmm2,%xmm2 - vmovdqa %xmm6,64(%esp) - addl %esi,%eax - xorl %edx,%ebp - vmovdqa 128(%esp),%xmm6 - vpaddd %xmm1,%xmm5,%xmm5 - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpxor %xmm4,%xmm2,%xmm2 - addl 36(%esp),%edi - xorl %ecx,%ebp - movl %eax,%esi - shldl $5,%eax,%eax - vpsrld $30,%xmm2,%xmm4 - vmovdqa %xmm5,16(%esp) - addl %ebp,%edi - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%edi - vpslld $2,%xmm2,%xmm2 - addl 40(%esp),%edx - xorl %ebx,%esi - movl %edi,%ebp - shldl $5,%edi,%edi - addl %esi,%edx - xorl %ebx,%ebp - shrdl $7,%eax,%eax - addl %edi,%edx - vpor %xmm4,%xmm2,%xmm2 - addl 44(%esp),%ecx - xorl %eax,%ebp - vmovdqa 80(%esp),%xmm4 - movl %edx,%esi - shldl $5,%edx,%edx - addl %ebp,%ecx - xorl %eax,%esi - shrdl $7,%edi,%edi - addl %edx,%ecx - vpalignr $8,%xmm1,%xmm2,%xmm5 - vpxor %xmm7,%xmm3,%xmm3 - addl 48(%esp),%ebx - xorl %edi,%esi - movl %ecx,%ebp - shldl $5,%ecx,%ecx - vpxor %xmm4,%xmm3,%xmm3 - vmovdqa %xmm7,80(%esp) - addl %esi,%ebx - xorl %edi,%ebp - vmovdqa %xmm6,%xmm7 - vpaddd %xmm2,%xmm6,%xmm6 - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpxor %xmm5,%xmm3,%xmm3 - addl 52(%esp),%eax - xorl %edx,%ebp - movl %ebx,%esi - shldl $5,%ebx,%ebx - vpsrld $30,%xmm3,%xmm5 - vmovdqa %xmm6,32(%esp) - addl %ebp,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpslld $2,%xmm3,%xmm3 - addl 56(%esp),%edi - xorl %ecx,%esi - movl %eax,%ebp - shldl $5,%eax,%eax - addl %esi,%edi - xorl %ecx,%ebp - shrdl $7,%ebx,%ebx - addl %eax,%edi - vpor %xmm5,%xmm3,%xmm3 - addl 60(%esp),%edx - xorl %ebx,%ebp - vmovdqa 96(%esp),%xmm5 - movl %edi,%esi - shldl $5,%edi,%edi - addl %ebp,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %edi,%edx - vpalignr $8,%xmm2,%xmm3,%xmm6 - vpxor %xmm0,%xmm4,%xmm4 - addl (%esp),%ecx - xorl %eax,%esi - movl 
%edx,%ebp - shldl $5,%edx,%edx - vpxor %xmm5,%xmm4,%xmm4 - vmovdqa %xmm0,96(%esp) - addl %esi,%ecx - xorl %eax,%ebp - vmovdqa %xmm7,%xmm0 - vpaddd %xmm3,%xmm7,%xmm7 - shrdl $7,%edi,%edi - addl %edx,%ecx - vpxor %xmm6,%xmm4,%xmm4 - addl 4(%esp),%ebx - xorl %edi,%ebp - movl %ecx,%esi - shldl $5,%ecx,%ecx - vpsrld $30,%xmm4,%xmm6 - vmovdqa %xmm7,48(%esp) - addl %ebp,%ebx - xorl %edi,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpslld $2,%xmm4,%xmm4 - addl 8(%esp),%eax - xorl %edx,%esi - movl %ebx,%ebp - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %edx,%ebp - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpor %xmm6,%xmm4,%xmm4 - addl 12(%esp),%edi - xorl %ecx,%ebp - vmovdqa 64(%esp),%xmm6 - movl %eax,%esi - shldl $5,%eax,%eax - addl %ebp,%edi - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%edi - vpalignr $8,%xmm3,%xmm4,%xmm7 - vpxor %xmm1,%xmm5,%xmm5 - addl 16(%esp),%edx - xorl %ebx,%esi - movl %edi,%ebp - shldl $5,%edi,%edi - vpxor %xmm6,%xmm5,%xmm5 - vmovdqa %xmm1,64(%esp) - addl %esi,%edx - xorl %ebx,%ebp - vmovdqa %xmm0,%xmm1 - vpaddd %xmm4,%xmm0,%xmm0 - shrdl $7,%eax,%eax - addl %edi,%edx - vpxor %xmm7,%xmm5,%xmm5 - addl 20(%esp),%ecx - xorl %eax,%ebp - movl %edx,%esi - shldl $5,%edx,%edx - vpsrld $30,%xmm5,%xmm7 - vmovdqa %xmm0,(%esp) - addl %ebp,%ecx - xorl %eax,%esi - shrdl $7,%edi,%edi - addl %edx,%ecx - vpslld $2,%xmm5,%xmm5 - addl 24(%esp),%ebx - xorl %edi,%esi - movl %ecx,%ebp - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %edi,%ebp - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpor %xmm7,%xmm5,%xmm5 - addl 28(%esp),%eax - vmovdqa 80(%esp),%xmm7 - shrdl $7,%ecx,%ecx - movl %ebx,%esi - xorl %edx,%ebp - shldl $5,%ebx,%ebx - addl %ebp,%eax - xorl %ecx,%esi - xorl %edx,%ecx - addl %ebx,%eax - vpalignr $8,%xmm4,%xmm5,%xmm0 - vpxor %xmm2,%xmm6,%xmm6 - addl 32(%esp),%edi - andl %ecx,%esi - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - vpxor %xmm7,%xmm6,%xmm6 - vmovdqa %xmm2,80(%esp) - movl %eax,%ebp - xorl %ecx,%esi - vmovdqa %xmm1,%xmm2 - vpaddd %xmm5,%xmm1,%xmm1 - shldl $5,%eax,%eax 
- addl %esi,%edi - vpxor %xmm0,%xmm6,%xmm6 - xorl %ebx,%ebp - xorl %ecx,%ebx - addl %eax,%edi - addl 36(%esp),%edx - vpsrld $30,%xmm6,%xmm0 - vmovdqa %xmm1,16(%esp) - andl %ebx,%ebp - xorl %ecx,%ebx - shrdl $7,%eax,%eax - movl %edi,%esi - vpslld $2,%xmm6,%xmm6 - xorl %ebx,%ebp - shldl $5,%edi,%edi - addl %ebp,%edx - xorl %eax,%esi - xorl %ebx,%eax - addl %edi,%edx - addl 40(%esp),%ecx - andl %eax,%esi - vpor %xmm0,%xmm6,%xmm6 - xorl %ebx,%eax - shrdl $7,%edi,%edi - vmovdqa 96(%esp),%xmm0 - movl %edx,%ebp - xorl %eax,%esi - shldl $5,%edx,%edx - addl %esi,%ecx - xorl %edi,%ebp - xorl %eax,%edi - addl %edx,%ecx - addl 44(%esp),%ebx - andl %edi,%ebp - xorl %eax,%edi - shrdl $7,%edx,%edx - movl %ecx,%esi - xorl %edi,%ebp - shldl $5,%ecx,%ecx - addl %ebp,%ebx - xorl %edx,%esi - xorl %edi,%edx - addl %ecx,%ebx - vpalignr $8,%xmm5,%xmm6,%xmm1 - vpxor %xmm3,%xmm7,%xmm7 - addl 48(%esp),%eax - andl %edx,%esi - xorl %edi,%edx - shrdl $7,%ecx,%ecx - vpxor %xmm0,%xmm7,%xmm7 - vmovdqa %xmm3,96(%esp) - movl %ebx,%ebp - xorl %edx,%esi - vmovdqa 144(%esp),%xmm3 - vpaddd %xmm6,%xmm2,%xmm2 - shldl $5,%ebx,%ebx - addl %esi,%eax - vpxor %xmm1,%xmm7,%xmm7 - xorl %ecx,%ebp - xorl %edx,%ecx - addl %ebx,%eax - addl 52(%esp),%edi - vpsrld $30,%xmm7,%xmm1 - vmovdqa %xmm2,32(%esp) - andl %ecx,%ebp - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - movl %eax,%esi - vpslld $2,%xmm7,%xmm7 - xorl %ecx,%ebp - shldl $5,%eax,%eax - addl %ebp,%edi - xorl %ebx,%esi - xorl %ecx,%ebx - addl %eax,%edi - addl 56(%esp),%edx - andl %ebx,%esi - vpor %xmm1,%xmm7,%xmm7 - xorl %ecx,%ebx - shrdl $7,%eax,%eax - vmovdqa 64(%esp),%xmm1 - movl %edi,%ebp - xorl %ebx,%esi - shldl $5,%edi,%edi - addl %esi,%edx - xorl %eax,%ebp - xorl %ebx,%eax - addl %edi,%edx - addl 60(%esp),%ecx - andl %eax,%ebp - xorl %ebx,%eax - shrdl $7,%edi,%edi - movl %edx,%esi - xorl %eax,%ebp - shldl $5,%edx,%edx - addl %ebp,%ecx - xorl %edi,%esi - xorl %eax,%edi - addl %edx,%ecx - vpalignr $8,%xmm6,%xmm7,%xmm2 - vpxor %xmm4,%xmm0,%xmm0 - addl (%esp),%ebx 
- andl %edi,%esi - xorl %eax,%edi - shrdl $7,%edx,%edx - vpxor %xmm1,%xmm0,%xmm0 - vmovdqa %xmm4,64(%esp) - movl %ecx,%ebp - xorl %edi,%esi - vmovdqa %xmm3,%xmm4 - vpaddd %xmm7,%xmm3,%xmm3 - shldl $5,%ecx,%ecx - addl %esi,%ebx - vpxor %xmm2,%xmm0,%xmm0 - xorl %edx,%ebp - xorl %edi,%edx - addl %ecx,%ebx - addl 4(%esp),%eax - vpsrld $30,%xmm0,%xmm2 - vmovdqa %xmm3,48(%esp) - andl %edx,%ebp - xorl %edi,%edx - shrdl $7,%ecx,%ecx - movl %ebx,%esi - vpslld $2,%xmm0,%xmm0 - xorl %edx,%ebp - shldl $5,%ebx,%ebx - addl %ebp,%eax - xorl %ecx,%esi - xorl %edx,%ecx - addl %ebx,%eax - addl 8(%esp),%edi - andl %ecx,%esi - vpor %xmm2,%xmm0,%xmm0 - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - vmovdqa 80(%esp),%xmm2 - movl %eax,%ebp - xorl %ecx,%esi - shldl $5,%eax,%eax - addl %esi,%edi - xorl %ebx,%ebp - xorl %ecx,%ebx - addl %eax,%edi - addl 12(%esp),%edx - andl %ebx,%ebp - xorl %ecx,%ebx - shrdl $7,%eax,%eax - movl %edi,%esi - xorl %ebx,%ebp - shldl $5,%edi,%edi - addl %ebp,%edx - xorl %eax,%esi - xorl %ebx,%eax - addl %edi,%edx - vpalignr $8,%xmm7,%xmm0,%xmm3 - vpxor %xmm5,%xmm1,%xmm1 - addl 16(%esp),%ecx - andl %eax,%esi - xorl %ebx,%eax - shrdl $7,%edi,%edi - vpxor %xmm2,%xmm1,%xmm1 - vmovdqa %xmm5,80(%esp) - movl %edx,%ebp - xorl %eax,%esi - vmovdqa %xmm4,%xmm5 - vpaddd %xmm0,%xmm4,%xmm4 - shldl $5,%edx,%edx - addl %esi,%ecx - vpxor %xmm3,%xmm1,%xmm1 - xorl %edi,%ebp - xorl %eax,%edi - addl %edx,%ecx - addl 20(%esp),%ebx - vpsrld $30,%xmm1,%xmm3 - vmovdqa %xmm4,(%esp) - andl %edi,%ebp - xorl %eax,%edi - shrdl $7,%edx,%edx - movl %ecx,%esi - vpslld $2,%xmm1,%xmm1 - xorl %edi,%ebp - shldl $5,%ecx,%ecx - addl %ebp,%ebx - xorl %edx,%esi - xorl %edi,%edx - addl %ecx,%ebx - addl 24(%esp),%eax - andl %edx,%esi - vpor %xmm3,%xmm1,%xmm1 - xorl %edi,%edx - shrdl $7,%ecx,%ecx - vmovdqa 96(%esp),%xmm3 - movl %ebx,%ebp - xorl %edx,%esi - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %ecx,%ebp - xorl %edx,%ecx - addl %ebx,%eax - addl 28(%esp),%edi - andl %ecx,%ebp - xorl %edx,%ecx - shrdl 
$7,%ebx,%ebx - movl %eax,%esi - xorl %ecx,%ebp - shldl $5,%eax,%eax - addl %ebp,%edi - xorl %ebx,%esi - xorl %ecx,%ebx - addl %eax,%edi - vpalignr $8,%xmm0,%xmm1,%xmm4 - vpxor %xmm6,%xmm2,%xmm2 - addl 32(%esp),%edx - andl %ebx,%esi - xorl %ecx,%ebx - shrdl $7,%eax,%eax - vpxor %xmm3,%xmm2,%xmm2 - vmovdqa %xmm6,96(%esp) - movl %edi,%ebp - xorl %ebx,%esi - vmovdqa %xmm5,%xmm6 - vpaddd %xmm1,%xmm5,%xmm5 - shldl $5,%edi,%edi - addl %esi,%edx - vpxor %xmm4,%xmm2,%xmm2 - xorl %eax,%ebp - xorl %ebx,%eax - addl %edi,%edx - addl 36(%esp),%ecx - vpsrld $30,%xmm2,%xmm4 - vmovdqa %xmm5,16(%esp) - andl %eax,%ebp - xorl %ebx,%eax - shrdl $7,%edi,%edi - movl %edx,%esi - vpslld $2,%xmm2,%xmm2 - xorl %eax,%ebp - shldl $5,%edx,%edx - addl %ebp,%ecx - xorl %edi,%esi - xorl %eax,%edi - addl %edx,%ecx - addl 40(%esp),%ebx - andl %edi,%esi - vpor %xmm4,%xmm2,%xmm2 - xorl %eax,%edi - shrdl $7,%edx,%edx - vmovdqa 64(%esp),%xmm4 - movl %ecx,%ebp - xorl %edi,%esi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %edx,%ebp - xorl %edi,%edx - addl %ecx,%ebx - addl 44(%esp),%eax - andl %edx,%ebp - xorl %edi,%edx - shrdl $7,%ecx,%ecx - movl %ebx,%esi - xorl %edx,%ebp - shldl $5,%ebx,%ebx - addl %ebp,%eax - xorl %edx,%esi - addl %ebx,%eax - vpalignr $8,%xmm1,%xmm2,%xmm5 - vpxor %xmm7,%xmm3,%xmm3 - addl 48(%esp),%edi - xorl %ecx,%esi - movl %eax,%ebp - shldl $5,%eax,%eax - vpxor %xmm4,%xmm3,%xmm3 - vmovdqa %xmm7,64(%esp) - addl %esi,%edi - xorl %ecx,%ebp - vmovdqa %xmm6,%xmm7 - vpaddd %xmm2,%xmm6,%xmm6 - shrdl $7,%ebx,%ebx - addl %eax,%edi - vpxor %xmm5,%xmm3,%xmm3 - addl 52(%esp),%edx - xorl %ebx,%ebp - movl %edi,%esi - shldl $5,%edi,%edi - vpsrld $30,%xmm3,%xmm5 - vmovdqa %xmm6,32(%esp) - addl %ebp,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %edi,%edx - vpslld $2,%xmm3,%xmm3 - addl 56(%esp),%ecx - xorl %eax,%esi - movl %edx,%ebp - shldl $5,%edx,%edx - addl %esi,%ecx - xorl %eax,%ebp - shrdl $7,%edi,%edi - addl %edx,%ecx - vpor %xmm5,%xmm3,%xmm3 - addl 60(%esp),%ebx - xorl %edi,%ebp - movl 
%ecx,%esi - shldl $5,%ecx,%ecx - addl %ebp,%ebx - xorl %edi,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl (%esp),%eax - vpaddd %xmm3,%xmm7,%xmm7 - xorl %edx,%esi - movl %ebx,%ebp - shldl $5,%ebx,%ebx - addl %esi,%eax - vmovdqa %xmm7,48(%esp) - xorl %edx,%ebp - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 4(%esp),%edi - xorl %ecx,%ebp - movl %eax,%esi - shldl $5,%eax,%eax - addl %ebp,%edi - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%edi - addl 8(%esp),%edx - xorl %ebx,%esi - movl %edi,%ebp - shldl $5,%edi,%edi - addl %esi,%edx - xorl %ebx,%ebp - shrdl $7,%eax,%eax - addl %edi,%edx - addl 12(%esp),%ecx - xorl %eax,%ebp - movl %edx,%esi - shldl $5,%edx,%edx - addl %ebp,%ecx - xorl %eax,%esi - shrdl $7,%edi,%edi - addl %edx,%ecx - movl 196(%esp),%ebp - cmpl 200(%esp),%ebp - je .L008done - vmovdqa 160(%esp),%xmm7 - vmovdqa 176(%esp),%xmm6 - vmovdqu (%ebp),%xmm0 - vmovdqu 16(%ebp),%xmm1 - vmovdqu 32(%ebp),%xmm2 - vmovdqu 48(%ebp),%xmm3 - addl $64,%ebp - vpshufb %xmm6,%xmm0,%xmm0 - movl %ebp,196(%esp) - vmovdqa %xmm7,96(%esp) - addl 16(%esp),%ebx - xorl %edi,%esi - vpshufb %xmm6,%xmm1,%xmm1 - movl %ecx,%ebp - shldl $5,%ecx,%ecx - vpaddd %xmm7,%xmm0,%xmm4 - addl %esi,%ebx - xorl %edi,%ebp - shrdl $7,%edx,%edx - addl %ecx,%ebx - vmovdqa %xmm4,(%esp) - addl 20(%esp),%eax - xorl %edx,%ebp - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %ebp,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 24(%esp),%edi - xorl %ecx,%esi - movl %eax,%ebp - shldl $5,%eax,%eax - addl %esi,%edi - xorl %ecx,%ebp - shrdl $7,%ebx,%ebx - addl %eax,%edi - addl 28(%esp),%edx - xorl %ebx,%ebp - movl %edi,%esi - shldl $5,%edi,%edi - addl %ebp,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %edi,%edx - addl 32(%esp),%ecx - xorl %eax,%esi - vpshufb %xmm6,%xmm2,%xmm2 - movl %edx,%ebp - shldl $5,%edx,%edx - vpaddd %xmm7,%xmm1,%xmm5 - addl %esi,%ecx - xorl %eax,%ebp - shrdl $7,%edi,%edi - addl %edx,%ecx - vmovdqa %xmm5,16(%esp) - addl 36(%esp),%ebx - xorl %edi,%ebp - movl %ecx,%esi - 
shldl $5,%ecx,%ecx - addl %ebp,%ebx - xorl %edi,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 40(%esp),%eax - xorl %edx,%esi - movl %ebx,%ebp - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %edx,%ebp - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 44(%esp),%edi - xorl %ecx,%ebp - movl %eax,%esi - shldl $5,%eax,%eax - addl %ebp,%edi - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%edi - addl 48(%esp),%edx - xorl %ebx,%esi - vpshufb %xmm6,%xmm3,%xmm3 - movl %edi,%ebp - shldl $5,%edi,%edi - vpaddd %xmm7,%xmm2,%xmm6 - addl %esi,%edx - xorl %ebx,%ebp - shrdl $7,%eax,%eax - addl %edi,%edx - vmovdqa %xmm6,32(%esp) - addl 52(%esp),%ecx - xorl %eax,%ebp - movl %edx,%esi - shldl $5,%edx,%edx - addl %ebp,%ecx - xorl %eax,%esi - shrdl $7,%edi,%edi - addl %edx,%ecx - addl 56(%esp),%ebx - xorl %edi,%esi - movl %ecx,%ebp - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %edi,%ebp - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 60(%esp),%eax - xorl %edx,%ebp - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %ebp,%eax - shrdl $7,%ecx,%ecx - addl %ebx,%eax - movl 192(%esp),%ebp - addl (%ebp),%eax - addl 4(%ebp),%esi - addl 8(%ebp),%ecx - movl %eax,(%ebp) - addl 12(%ebp),%edx - movl %esi,4(%ebp) - addl 16(%ebp),%edi - movl %ecx,%ebx - movl %ecx,8(%ebp) - xorl %edx,%ebx - movl %edx,12(%ebp) - movl %edi,16(%ebp) - movl %esi,%ebp - andl %ebx,%esi - movl %ebp,%ebx - jmp .L007loop -.align 16 -.L008done: - addl 16(%esp),%ebx - xorl %edi,%esi - movl %ecx,%ebp - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %edi,%ebp - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 20(%esp),%eax - xorl %edx,%ebp - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %ebp,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 24(%esp),%edi - xorl %ecx,%esi - movl %eax,%ebp - shldl $5,%eax,%eax - addl %esi,%edi - xorl %ecx,%ebp - shrdl $7,%ebx,%ebx - addl %eax,%edi - addl 28(%esp),%edx - xorl %ebx,%ebp - movl %edi,%esi - shldl $5,%edi,%edi - addl %ebp,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %edi,%edx - addl 
32(%esp),%ecx - xorl %eax,%esi - movl %edx,%ebp - shldl $5,%edx,%edx - addl %esi,%ecx - xorl %eax,%ebp - shrdl $7,%edi,%edi - addl %edx,%ecx - addl 36(%esp),%ebx - xorl %edi,%ebp - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %ebp,%ebx - xorl %edi,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 40(%esp),%eax - xorl %edx,%esi - movl %ebx,%ebp - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %edx,%ebp - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 44(%esp),%edi - xorl %ecx,%ebp - movl %eax,%esi - shldl $5,%eax,%eax - addl %ebp,%edi - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%edi - addl 48(%esp),%edx - xorl %ebx,%esi - movl %edi,%ebp - shldl $5,%edi,%edi - addl %esi,%edx - xorl %ebx,%ebp - shrdl $7,%eax,%eax - addl %edi,%edx - addl 52(%esp),%ecx - xorl %eax,%ebp - movl %edx,%esi - shldl $5,%edx,%edx - addl %ebp,%ecx - xorl %eax,%esi - shrdl $7,%edi,%edi - addl %edx,%ecx - addl 56(%esp),%ebx - xorl %edi,%esi - movl %ecx,%ebp - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %edi,%ebp - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 60(%esp),%eax - xorl %edx,%ebp - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %ebp,%eax - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vzeroall - movl 192(%esp),%ebp - addl (%ebp),%eax - movl 204(%esp),%esp - addl 4(%ebp),%esi - addl 8(%ebp),%ecx - movl %eax,(%ebp) - addl 12(%ebp),%edx - movl %esi,4(%ebp) - addl 16(%ebp),%edi - movl %ecx,8(%ebp) - movl %edx,12(%ebp) - movl %edi,16(%ebp) - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.size _sha1_block_data_order_avx,.-_sha1_block_data_order_avx -.align 64 -.LK_XX_XX: -.long 1518500249,1518500249,1518500249,1518500249 -.long 1859775393,1859775393,1859775393,1859775393 -.long 2400959708,2400959708,2400959708,2400959708 -.long 3395469782,3395469782,3395469782,3395469782 -.long 66051,67438087,134810123,202182159 -.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 -.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115 -.byte 102,111,114,109,32,102,111,114,32,120,56,54,44,32,67,82 -.byte 
89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112 -.byte 114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -#endif -.section .note.GNU-stack,"",@progbits diff --git a/third_party/boringssl/linux-x86/crypto/fipsmodule/sha256-586.S b/third_party/boringssl/linux-x86/crypto/fipsmodule/sha256-586.S deleted file mode 100644 index dcaf8755..00000000 --- a/third_party/boringssl/linux-x86/crypto/fipsmodule/sha256-586.S +++ /dev/null @@ -1,5567 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if defined(__i386__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text -.globl sha256_block_data_order -.hidden sha256_block_data_order -.type sha256_block_data_order,@function -.align 16 -sha256_block_data_order: -.L_sha256_block_data_order_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - movl 20(%esp),%esi - movl 24(%esp),%edi - movl 28(%esp),%eax - movl %esp,%ebx - call .L000pic_point -.L000pic_point: - popl %ebp - leal .L001K256-.L000pic_point(%ebp),%ebp - subl $16,%esp - andl $-64,%esp - shll $6,%eax - addl %edi,%eax - movl %esi,(%esp) - movl %edi,4(%esp) - movl %eax,8(%esp) - movl %ebx,12(%esp) - leal OPENSSL_ia32cap_P-.L001K256(%ebp),%edx - movl (%edx),%ecx - movl 4(%edx),%ebx - testl $1048576,%ecx - jnz .L002loop - movl 8(%edx),%edx - testl $16777216,%ecx - jz .L003no_xmm - andl $1073741824,%ecx - andl $268435968,%ebx - orl %ebx,%ecx - andl $1342177280,%ecx - cmpl $1342177280,%ecx - je .L004AVX - testl $512,%ebx - jnz .L005SSSE3 -.L003no_xmm: - subl %edi,%eax - cmpl $256,%eax - jae .L006unrolled - jmp .L002loop -.align 16 -.L002loop: - movl (%edi),%eax - movl 4(%edi),%ebx - movl 8(%edi),%ecx - bswap %eax - movl 12(%edi),%edx - bswap %ebx - pushl %eax - bswap %ecx - pushl %ebx - bswap %edx - pushl %ecx - pushl %edx - movl 16(%edi),%eax - movl 20(%edi),%ebx - movl 24(%edi),%ecx - bswap %eax - movl 28(%edi),%edx - bswap %ebx - pushl %eax - bswap %ecx - pushl %ebx - bswap %edx 
- pushl %ecx - pushl %edx - movl 32(%edi),%eax - movl 36(%edi),%ebx - movl 40(%edi),%ecx - bswap %eax - movl 44(%edi),%edx - bswap %ebx - pushl %eax - bswap %ecx - pushl %ebx - bswap %edx - pushl %ecx - pushl %edx - movl 48(%edi),%eax - movl 52(%edi),%ebx - movl 56(%edi),%ecx - bswap %eax - movl 60(%edi),%edx - bswap %ebx - pushl %eax - bswap %ecx - pushl %ebx - bswap %edx - pushl %ecx - pushl %edx - addl $64,%edi - leal -36(%esp),%esp - movl %edi,104(%esp) - movl (%esi),%eax - movl 4(%esi),%ebx - movl 8(%esi),%ecx - movl 12(%esi),%edi - movl %ebx,8(%esp) - xorl %ecx,%ebx - movl %ecx,12(%esp) - movl %edi,16(%esp) - movl %ebx,(%esp) - movl 16(%esi),%edx - movl 20(%esi),%ebx - movl 24(%esi),%ecx - movl 28(%esi),%edi - movl %ebx,24(%esp) - movl %ecx,28(%esp) - movl %edi,32(%esp) -.align 16 -.L00700_15: - movl %edx,%ecx - movl 24(%esp),%esi - rorl $14,%ecx - movl 28(%esp),%edi - xorl %edx,%ecx - xorl %edi,%esi - movl 96(%esp),%ebx - rorl $5,%ecx - andl %edx,%esi - movl %edx,20(%esp) - xorl %ecx,%edx - addl 32(%esp),%ebx - xorl %edi,%esi - rorl $6,%edx - movl %eax,%ecx - addl %esi,%ebx - rorl $9,%ecx - addl %edx,%ebx - movl 8(%esp),%edi - xorl %eax,%ecx - movl %eax,4(%esp) - leal -4(%esp),%esp - rorl $11,%ecx - movl (%ebp),%esi - xorl %eax,%ecx - movl 20(%esp),%edx - xorl %edi,%eax - rorl $2,%ecx - addl %esi,%ebx - movl %eax,(%esp) - addl %ebx,%edx - andl 4(%esp),%eax - addl %ecx,%ebx - xorl %edi,%eax - addl $4,%ebp - addl %ebx,%eax - cmpl $3248222580,%esi - jne .L00700_15 - movl 156(%esp),%ecx - jmp .L00816_63 -.align 16 -.L00816_63: - movl %ecx,%ebx - movl 104(%esp),%esi - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 160(%esp),%ebx - shrl $10,%edi - addl 124(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 24(%esp),%esi - rorl $14,%ecx - addl %edi,%ebx - movl 28(%esp),%edi - xorl %edx,%ecx - xorl %edi,%esi - movl %ebx,96(%esp) - rorl $5,%ecx - andl %edx,%esi - 
movl %edx,20(%esp) - xorl %ecx,%edx - addl 32(%esp),%ebx - xorl %edi,%esi - rorl $6,%edx - movl %eax,%ecx - addl %esi,%ebx - rorl $9,%ecx - addl %edx,%ebx - movl 8(%esp),%edi - xorl %eax,%ecx - movl %eax,4(%esp) - leal -4(%esp),%esp - rorl $11,%ecx - movl (%ebp),%esi - xorl %eax,%ecx - movl 20(%esp),%edx - xorl %edi,%eax - rorl $2,%ecx - addl %esi,%ebx - movl %eax,(%esp) - addl %ebx,%edx - andl 4(%esp),%eax - addl %ecx,%ebx - xorl %edi,%eax - movl 156(%esp),%ecx - addl $4,%ebp - addl %ebx,%eax - cmpl $3329325298,%esi - jne .L00816_63 - movl 356(%esp),%esi - movl 8(%esp),%ebx - movl 16(%esp),%ecx - addl (%esi),%eax - addl 4(%esi),%ebx - addl 8(%esi),%edi - addl 12(%esi),%ecx - movl %eax,(%esi) - movl %ebx,4(%esi) - movl %edi,8(%esi) - movl %ecx,12(%esi) - movl 24(%esp),%eax - movl 28(%esp),%ebx - movl 32(%esp),%ecx - movl 360(%esp),%edi - addl 16(%esi),%edx - addl 20(%esi),%eax - addl 24(%esi),%ebx - addl 28(%esi),%ecx - movl %edx,16(%esi) - movl %eax,20(%esi) - movl %ebx,24(%esi) - movl %ecx,28(%esi) - leal 356(%esp),%esp - subl $256,%ebp - cmpl 8(%esp),%edi - jb .L002loop - movl 12(%esp),%esp - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.align 64 -.L001K256: -.long 1116352408,1899447441,3049323471,3921009573,961987163,1508970993,2453635748,2870763221,3624381080,310598401,607225278,1426881987,1925078388,2162078206,2614888103,3248222580,3835390401,4022224774,264347078,604807628,770255983,1249150122,1555081692,1996064986,2554220882,2821834349,2952996808,3210313671,3336571891,3584528711,113926993,338241895,666307205,773529912,1294757372,1396182291,1695183700,1986661051,2177026350,2456956037,2730485921,2820302411,3259730800,3345764771,3516065817,3600352804,4094571909,275423344,430227734,506948616,659060556,883997877,958139571,1322822218,1537002063,1747873779,1955562222,2024104815,2227730452,2361852424,2428436474,2756734187,3204031479,3329325298 -.long 66051,67438087,134810123,202182159 -.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97 -.byte 
110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32 -.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97 -.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 -.byte 62,0 -.align 16 -.L006unrolled: - leal -96(%esp),%esp - movl (%esi),%eax - movl 4(%esi),%ebp - movl 8(%esi),%ecx - movl 12(%esi),%ebx - movl %ebp,4(%esp) - xorl %ecx,%ebp - movl %ecx,8(%esp) - movl %ebx,12(%esp) - movl 16(%esi),%edx - movl 20(%esi),%ebx - movl 24(%esi),%ecx - movl 28(%esi),%esi - movl %ebx,20(%esp) - movl %ecx,24(%esp) - movl %esi,28(%esp) - jmp .L009grand_loop -.align 16 -.L009grand_loop: - movl (%edi),%ebx - movl 4(%edi),%ecx - bswap %ebx - movl 8(%edi),%esi - bswap %ecx - movl %ebx,32(%esp) - bswap %esi - movl %ecx,36(%esp) - movl %esi,40(%esp) - movl 12(%edi),%ebx - movl 16(%edi),%ecx - bswap %ebx - movl 20(%edi),%esi - bswap %ecx - movl %ebx,44(%esp) - bswap %esi - movl %ecx,48(%esp) - movl %esi,52(%esp) - movl 24(%edi),%ebx - movl 28(%edi),%ecx - bswap %ebx - movl 32(%edi),%esi - bswap %ecx - movl %ebx,56(%esp) - bswap %esi - movl %ecx,60(%esp) - movl %esi,64(%esp) - movl 36(%edi),%ebx - movl 40(%edi),%ecx - bswap %ebx - movl 44(%edi),%esi - bswap %ecx - movl %ebx,68(%esp) - bswap %esi - movl %ecx,72(%esp) - movl %esi,76(%esp) - movl 48(%edi),%ebx - movl 52(%edi),%ecx - bswap %ebx - movl 56(%edi),%esi - bswap %ecx - movl %ebx,80(%esp) - bswap %esi - movl %ecx,84(%esp) - movl %esi,88(%esp) - movl 60(%edi),%ebx - addl $64,%edi - bswap %ebx - movl %edi,100(%esp) - movl %ebx,92(%esp) - movl %edx,%ecx - movl 20(%esp),%esi - rorl $14,%edx - movl 24(%esp),%edi - xorl %ecx,%edx - movl 32(%esp),%ebx - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - xorl %ecx,%edx - addl 28(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 4(%esp),%edi - xorl %eax,%ecx - movl %eax,(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 1116352408(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl 
%edi,%ebp - rorl $2,%ecx - addl %edx,%ebp - addl 12(%esp),%edx - addl %ecx,%ebp - movl %edx,%esi - movl 16(%esp),%ecx - rorl $14,%edx - movl 20(%esp),%edi - xorl %esi,%edx - movl 36(%esp),%ebx - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,12(%esp) - xorl %esi,%edx - addl 24(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl (%esp),%edi - xorl %ebp,%esi - movl %ebp,28(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 1899447441(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - rorl $2,%esi - addl %edx,%eax - addl 8(%esp),%edx - addl %esi,%eax - movl %edx,%ecx - movl 12(%esp),%esi - rorl $14,%edx - movl 16(%esp),%edi - xorl %ecx,%edx - movl 40(%esp),%ebx - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - xorl %ecx,%edx - addl 20(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 28(%esp),%edi - xorl %eax,%ecx - movl %eax,24(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 3049323471(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - rorl $2,%ecx - addl %edx,%ebp - addl 4(%esp),%edx - addl %ecx,%ebp - movl %edx,%esi - movl 8(%esp),%ecx - rorl $14,%edx - movl 12(%esp),%edi - xorl %esi,%edx - movl 44(%esp),%ebx - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,4(%esp) - xorl %esi,%edx - addl 16(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 24(%esp),%edi - xorl %ebp,%esi - movl %ebp,20(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 3921009573(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - rorl $2,%esi - addl %edx,%eax - addl (%esp),%edx - addl %esi,%eax - movl %edx,%ecx - movl 4(%esp),%esi - rorl $14,%edx - movl 8(%esp),%edi - xorl %ecx,%edx - movl 48(%esp),%ebx - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,(%esp) - xorl %ecx,%edx - addl 12(%esp),%ebx - xorl 
%esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 20(%esp),%edi - xorl %eax,%ecx - movl %eax,16(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 961987163(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - rorl $2,%ecx - addl %edx,%ebp - addl 28(%esp),%edx - addl %ecx,%ebp - movl %edx,%esi - movl (%esp),%ecx - rorl $14,%edx - movl 4(%esp),%edi - xorl %esi,%edx - movl 52(%esp),%ebx - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,28(%esp) - xorl %esi,%edx - addl 8(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 16(%esp),%edi - xorl %ebp,%esi - movl %ebp,12(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 1508970993(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - rorl $2,%esi - addl %edx,%eax - addl 24(%esp),%edx - addl %esi,%eax - movl %edx,%ecx - movl 28(%esp),%esi - rorl $14,%edx - movl (%esp),%edi - xorl %ecx,%edx - movl 56(%esp),%ebx - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - xorl %ecx,%edx - addl 4(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 12(%esp),%edi - xorl %eax,%ecx - movl %eax,8(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 2453635748(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - rorl $2,%ecx - addl %edx,%ebp - addl 20(%esp),%edx - addl %ecx,%ebp - movl %edx,%esi - movl 24(%esp),%ecx - rorl $14,%edx - movl 28(%esp),%edi - xorl %esi,%edx - movl 60(%esp),%ebx - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,20(%esp) - xorl %esi,%edx - addl (%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 8(%esp),%edi - xorl %ebp,%esi - movl %ebp,4(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 2870763221(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - rorl $2,%esi - addl %edx,%eax - addl 
16(%esp),%edx - addl %esi,%eax - movl %edx,%ecx - movl 20(%esp),%esi - rorl $14,%edx - movl 24(%esp),%edi - xorl %ecx,%edx - movl 64(%esp),%ebx - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - xorl %ecx,%edx - addl 28(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 4(%esp),%edi - xorl %eax,%ecx - movl %eax,(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 3624381080(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - rorl $2,%ecx - addl %edx,%ebp - addl 12(%esp),%edx - addl %ecx,%ebp - movl %edx,%esi - movl 16(%esp),%ecx - rorl $14,%edx - movl 20(%esp),%edi - xorl %esi,%edx - movl 68(%esp),%ebx - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,12(%esp) - xorl %esi,%edx - addl 24(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl (%esp),%edi - xorl %ebp,%esi - movl %ebp,28(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 310598401(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - rorl $2,%esi - addl %edx,%eax - addl 8(%esp),%edx - addl %esi,%eax - movl %edx,%ecx - movl 12(%esp),%esi - rorl $14,%edx - movl 16(%esp),%edi - xorl %ecx,%edx - movl 72(%esp),%ebx - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - xorl %ecx,%edx - addl 20(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 28(%esp),%edi - xorl %eax,%ecx - movl %eax,24(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 607225278(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - rorl $2,%ecx - addl %edx,%ebp - addl 4(%esp),%edx - addl %ecx,%ebp - movl %edx,%esi - movl 8(%esp),%ecx - rorl $14,%edx - movl 12(%esp),%edi - xorl %esi,%edx - movl 76(%esp),%ebx - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,4(%esp) - xorl %esi,%edx - addl 16(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl 
%edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 24(%esp),%edi - xorl %ebp,%esi - movl %ebp,20(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 1426881987(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - rorl $2,%esi - addl %edx,%eax - addl (%esp),%edx - addl %esi,%eax - movl %edx,%ecx - movl 4(%esp),%esi - rorl $14,%edx - movl 8(%esp),%edi - xorl %ecx,%edx - movl 80(%esp),%ebx - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,(%esp) - xorl %ecx,%edx - addl 12(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 20(%esp),%edi - xorl %eax,%ecx - movl %eax,16(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 1925078388(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - rorl $2,%ecx - addl %edx,%ebp - addl 28(%esp),%edx - addl %ecx,%ebp - movl %edx,%esi - movl (%esp),%ecx - rorl $14,%edx - movl 4(%esp),%edi - xorl %esi,%edx - movl 84(%esp),%ebx - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,28(%esp) - xorl %esi,%edx - addl 8(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 16(%esp),%edi - xorl %ebp,%esi - movl %ebp,12(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 2162078206(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - rorl $2,%esi - addl %edx,%eax - addl 24(%esp),%edx - addl %esi,%eax - movl %edx,%ecx - movl 28(%esp),%esi - rorl $14,%edx - movl (%esp),%edi - xorl %ecx,%edx - movl 88(%esp),%ebx - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - xorl %ecx,%edx - addl 4(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 12(%esp),%edi - xorl %eax,%ecx - movl %eax,8(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 2614888103(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - rorl $2,%ecx - addl %edx,%ebp - addl 20(%esp),%edx - addl %ecx,%ebp - movl %edx,%esi - 
movl 24(%esp),%ecx - rorl $14,%edx - movl 28(%esp),%edi - xorl %esi,%edx - movl 92(%esp),%ebx - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,20(%esp) - xorl %esi,%edx - addl (%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 8(%esp),%edi - xorl %ebp,%esi - movl %ebp,4(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 3248222580(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 36(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl 16(%esp),%edx - addl %esi,%eax - movl 88(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 32(%esp),%ebx - shrl $10,%edi - addl 68(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 20(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 24(%esp),%edi - xorl %ecx,%edx - movl %ebx,32(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - xorl %ecx,%edx - addl 28(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 4(%esp),%edi - xorl %eax,%ecx - movl %eax,(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 3835390401(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 40(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 12(%esp),%edx - addl %ecx,%ebp - movl 92(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 36(%esp),%ebx - shrl $10,%edi - addl 72(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 16(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 20(%esp),%edi - xorl %esi,%edx - movl %ebx,36(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,12(%esp) - xorl %esi,%edx - addl 24(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl 
$9,%esi - movl %ebp,%ecx - movl (%esp),%edi - xorl %ebp,%esi - movl %ebp,28(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 4022224774(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 44(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl 8(%esp),%edx - addl %esi,%eax - movl 32(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 40(%esp),%ebx - shrl $10,%edi - addl 76(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 12(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 16(%esp),%edi - xorl %ecx,%edx - movl %ebx,40(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - xorl %ecx,%edx - addl 20(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 28(%esp),%edi - xorl %eax,%ecx - movl %eax,24(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 264347078(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 48(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 4(%esp),%edx - addl %ecx,%ebp - movl 36(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 44(%esp),%ebx - shrl $10,%edi - addl 80(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 8(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 12(%esp),%edi - xorl %esi,%edx - movl %ebx,44(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,4(%esp) - xorl %esi,%edx - addl 16(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 24(%esp),%edi - xorl %ebp,%esi - movl %ebp,20(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 604807628(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 52(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl (%esp),%edx - 
addl %esi,%eax - movl 40(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 48(%esp),%ebx - shrl $10,%edi - addl 84(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 4(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 8(%esp),%edi - xorl %ecx,%edx - movl %ebx,48(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,(%esp) - xorl %ecx,%edx - addl 12(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 20(%esp),%edi - xorl %eax,%ecx - movl %eax,16(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 770255983(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 56(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 28(%esp),%edx - addl %ecx,%ebp - movl 44(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 52(%esp),%ebx - shrl $10,%edi - addl 88(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl (%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 4(%esp),%edi - xorl %esi,%edx - movl %ebx,52(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,28(%esp) - xorl %esi,%edx - addl 8(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 16(%esp),%edi - xorl %ebp,%esi - movl %ebp,12(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 1249150122(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 60(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl 24(%esp),%edx - addl %esi,%eax - movl 48(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 56(%esp),%ebx - shrl $10,%edi - addl 92(%esp),%ebx - movl %edx,%ecx - xorl 
%esi,%edi - movl 28(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl (%esp),%edi - xorl %ecx,%edx - movl %ebx,56(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - xorl %ecx,%edx - addl 4(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 12(%esp),%edi - xorl %eax,%ecx - movl %eax,8(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 1555081692(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 64(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 20(%esp),%edx - addl %ecx,%ebp - movl 52(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 60(%esp),%ebx - shrl $10,%edi - addl 32(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 24(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 28(%esp),%edi - xorl %esi,%edx - movl %ebx,60(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,20(%esp) - xorl %esi,%edx - addl (%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 8(%esp),%edi - xorl %ebp,%esi - movl %ebp,4(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 1996064986(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 68(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl 16(%esp),%edx - addl %esi,%eax - movl 56(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 64(%esp),%ebx - shrl $10,%edi - addl 36(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 20(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 24(%esp),%edi - xorl %ecx,%edx - movl %ebx,64(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - xorl %ecx,%edx - addl 28(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx 
- addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 4(%esp),%edi - xorl %eax,%ecx - movl %eax,(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 2554220882(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 72(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 12(%esp),%edx - addl %ecx,%ebp - movl 60(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 68(%esp),%ebx - shrl $10,%edi - addl 40(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 16(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 20(%esp),%edi - xorl %esi,%edx - movl %ebx,68(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,12(%esp) - xorl %esi,%edx - addl 24(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl (%esp),%edi - xorl %ebp,%esi - movl %ebp,28(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 2821834349(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 76(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl 8(%esp),%edx - addl %esi,%eax - movl 64(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 72(%esp),%ebx - shrl $10,%edi - addl 44(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 12(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 16(%esp),%edi - xorl %ecx,%edx - movl %ebx,72(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - xorl %ecx,%edx - addl 20(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 28(%esp),%edi - xorl %eax,%ecx - movl %eax,24(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 2952996808(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 80(%esp),%esi - rorl $2,%ecx - addl 
%edx,%ebp - addl 4(%esp),%edx - addl %ecx,%ebp - movl 68(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 76(%esp),%ebx - shrl $10,%edi - addl 48(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 8(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 12(%esp),%edi - xorl %esi,%edx - movl %ebx,76(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,4(%esp) - xorl %esi,%edx - addl 16(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 24(%esp),%edi - xorl %ebp,%esi - movl %ebp,20(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 3210313671(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 84(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl (%esp),%edx - addl %esi,%eax - movl 72(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 80(%esp),%ebx - shrl $10,%edi - addl 52(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 4(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 8(%esp),%edi - xorl %ecx,%edx - movl %ebx,80(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,(%esp) - xorl %ecx,%edx - addl 12(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 20(%esp),%edi - xorl %eax,%ecx - movl %eax,16(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 3336571891(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 88(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 28(%esp),%edx - addl %ecx,%ebp - movl 76(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 84(%esp),%ebx - shrl $10,%edi - addl 
56(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl (%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 4(%esp),%edi - xorl %esi,%edx - movl %ebx,84(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,28(%esp) - xorl %esi,%edx - addl 8(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 16(%esp),%edi - xorl %ebp,%esi - movl %ebp,12(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 3584528711(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 92(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl 24(%esp),%edx - addl %esi,%eax - movl 80(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 88(%esp),%ebx - shrl $10,%edi - addl 60(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 28(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl (%esp),%edi - xorl %ecx,%edx - movl %ebx,88(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - xorl %ecx,%edx - addl 4(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 12(%esp),%edi - xorl %eax,%ecx - movl %eax,8(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 113926993(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 32(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 20(%esp),%edx - addl %ecx,%ebp - movl 84(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 92(%esp),%ebx - shrl $10,%edi - addl 64(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 24(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 28(%esp),%edi - xorl %esi,%edx - movl %ebx,92(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,20(%esp) - xorl %esi,%edx - addl (%esp),%ebx - xorl 
%ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 8(%esp),%edi - xorl %ebp,%esi - movl %ebp,4(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 338241895(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 36(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl 16(%esp),%edx - addl %esi,%eax - movl 88(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 32(%esp),%ebx - shrl $10,%edi - addl 68(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 20(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 24(%esp),%edi - xorl %ecx,%edx - movl %ebx,32(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - xorl %ecx,%edx - addl 28(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 4(%esp),%edi - xorl %eax,%ecx - movl %eax,(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 666307205(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 40(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 12(%esp),%edx - addl %ecx,%ebp - movl 92(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 36(%esp),%ebx - shrl $10,%edi - addl 72(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 16(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 20(%esp),%edi - xorl %esi,%edx - movl %ebx,36(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,12(%esp) - xorl %esi,%edx - addl 24(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl (%esp),%edi - xorl %ebp,%esi - movl %ebp,28(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 773529912(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 
44(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl 8(%esp),%edx - addl %esi,%eax - movl 32(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 40(%esp),%ebx - shrl $10,%edi - addl 76(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 12(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 16(%esp),%edi - xorl %ecx,%edx - movl %ebx,40(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - xorl %ecx,%edx - addl 20(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 28(%esp),%edi - xorl %eax,%ecx - movl %eax,24(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 1294757372(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 48(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 4(%esp),%edx - addl %ecx,%ebp - movl 36(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 44(%esp),%ebx - shrl $10,%edi - addl 80(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 8(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 12(%esp),%edi - xorl %esi,%edx - movl %ebx,44(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,4(%esp) - xorl %esi,%edx - addl 16(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 24(%esp),%edi - xorl %ebp,%esi - movl %ebp,20(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 1396182291(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 52(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl (%esp),%edx - addl %esi,%eax - movl 40(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 
48(%esp),%ebx - shrl $10,%edi - addl 84(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 4(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 8(%esp),%edi - xorl %ecx,%edx - movl %ebx,48(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,(%esp) - xorl %ecx,%edx - addl 12(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 20(%esp),%edi - xorl %eax,%ecx - movl %eax,16(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 1695183700(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 56(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 28(%esp),%edx - addl %ecx,%ebp - movl 44(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 52(%esp),%ebx - shrl $10,%edi - addl 88(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl (%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 4(%esp),%edi - xorl %esi,%edx - movl %ebx,52(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,28(%esp) - xorl %esi,%edx - addl 8(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 16(%esp),%edi - xorl %ebp,%esi - movl %ebp,12(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 1986661051(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 60(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl 24(%esp),%edx - addl %esi,%eax - movl 48(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 56(%esp),%ebx - shrl $10,%edi - addl 92(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 28(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl (%esp),%edi - xorl %ecx,%edx - movl %ebx,56(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - xorl 
%ecx,%edx - addl 4(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 12(%esp),%edi - xorl %eax,%ecx - movl %eax,8(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 2177026350(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 64(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 20(%esp),%edx - addl %ecx,%ebp - movl 52(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 60(%esp),%ebx - shrl $10,%edi - addl 32(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 24(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 28(%esp),%edi - xorl %esi,%edx - movl %ebx,60(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,20(%esp) - xorl %esi,%edx - addl (%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 8(%esp),%edi - xorl %ebp,%esi - movl %ebp,4(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 2456956037(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 68(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl 16(%esp),%edx - addl %esi,%eax - movl 56(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 64(%esp),%ebx - shrl $10,%edi - addl 36(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 20(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 24(%esp),%edi - xorl %ecx,%edx - movl %ebx,64(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - xorl %ecx,%edx - addl 28(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 4(%esp),%edi - xorl %eax,%ecx - movl %eax,(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 2730485921(%ebx,%edx,1),%edx - xorl 
%esi,%ecx - xorl %edi,%ebp - movl 72(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 12(%esp),%edx - addl %ecx,%ebp - movl 60(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 68(%esp),%ebx - shrl $10,%edi - addl 40(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 16(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 20(%esp),%edi - xorl %esi,%edx - movl %ebx,68(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,12(%esp) - xorl %esi,%edx - addl 24(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl (%esp),%edi - xorl %ebp,%esi - movl %ebp,28(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 2820302411(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 76(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl 8(%esp),%edx - addl %esi,%eax - movl 64(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 72(%esp),%ebx - shrl $10,%edi - addl 44(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 12(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 16(%esp),%edi - xorl %ecx,%edx - movl %ebx,72(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - xorl %ecx,%edx - addl 20(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 28(%esp),%edi - xorl %eax,%ecx - movl %eax,24(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 3259730800(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 80(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 4(%esp),%edx - addl %ecx,%ebp - movl 68(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl 
%esi,%ebx - rorl $17,%ecx - addl 76(%esp),%ebx - shrl $10,%edi - addl 48(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 8(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 12(%esp),%edi - xorl %esi,%edx - movl %ebx,76(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,4(%esp) - xorl %esi,%edx - addl 16(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 24(%esp),%edi - xorl %ebp,%esi - movl %ebp,20(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 3345764771(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 84(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl (%esp),%edx - addl %esi,%eax - movl 72(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 80(%esp),%ebx - shrl $10,%edi - addl 52(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 4(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 8(%esp),%edi - xorl %ecx,%edx - movl %ebx,80(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,(%esp) - xorl %ecx,%edx - addl 12(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 20(%esp),%edi - xorl %eax,%ecx - movl %eax,16(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 3516065817(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 88(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 28(%esp),%edx - addl %ecx,%ebp - movl 76(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 84(%esp),%ebx - shrl $10,%edi - addl 56(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl (%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 4(%esp),%edi - xorl %esi,%edx - movl %ebx,84(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - 
movl %esi,28(%esp) - xorl %esi,%edx - addl 8(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 16(%esp),%edi - xorl %ebp,%esi - movl %ebp,12(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 3600352804(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 92(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl 24(%esp),%edx - addl %esi,%eax - movl 80(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 88(%esp),%ebx - shrl $10,%edi - addl 60(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 28(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl (%esp),%edi - xorl %ecx,%edx - movl %ebx,88(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - xorl %ecx,%edx - addl 4(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 12(%esp),%edi - xorl %eax,%ecx - movl %eax,8(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 4094571909(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 32(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 20(%esp),%edx - addl %ecx,%ebp - movl 84(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 92(%esp),%ebx - shrl $10,%edi - addl 64(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 24(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 28(%esp),%edi - xorl %esi,%edx - movl %ebx,92(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,20(%esp) - xorl %esi,%edx - addl (%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 8(%esp),%edi - xorl %ebp,%esi - movl %ebp,4(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 
275423344(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 36(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl 16(%esp),%edx - addl %esi,%eax - movl 88(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 32(%esp),%ebx - shrl $10,%edi - addl 68(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 20(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 24(%esp),%edi - xorl %ecx,%edx - movl %ebx,32(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - xorl %ecx,%edx - addl 28(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 4(%esp),%edi - xorl %eax,%ecx - movl %eax,(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 430227734(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 40(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 12(%esp),%edx - addl %ecx,%ebp - movl 92(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 36(%esp),%ebx - shrl $10,%edi - addl 72(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 16(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 20(%esp),%edi - xorl %esi,%edx - movl %ebx,36(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,12(%esp) - xorl %esi,%edx - addl 24(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl (%esp),%edi - xorl %ebp,%esi - movl %ebp,28(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 506948616(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 44(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl 8(%esp),%edx - addl %esi,%eax - movl 32(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl 
$7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 40(%esp),%ebx - shrl $10,%edi - addl 76(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 12(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 16(%esp),%edi - xorl %ecx,%edx - movl %ebx,40(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - xorl %ecx,%edx - addl 20(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 28(%esp),%edi - xorl %eax,%ecx - movl %eax,24(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 659060556(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 48(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 4(%esp),%edx - addl %ecx,%ebp - movl 36(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 44(%esp),%ebx - shrl $10,%edi - addl 80(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 8(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 12(%esp),%edi - xorl %esi,%edx - movl %ebx,44(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,4(%esp) - xorl %esi,%edx - addl 16(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 24(%esp),%edi - xorl %ebp,%esi - movl %ebp,20(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 883997877(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 52(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl (%esp),%edx - addl %esi,%eax - movl 40(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 48(%esp),%ebx - shrl $10,%edi - addl 84(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 4(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 8(%esp),%edi - xorl %ecx,%edx - movl %ebx,48(%esp) - xorl %edi,%esi - 
rorl $5,%edx - andl %ecx,%esi - movl %ecx,(%esp) - xorl %ecx,%edx - addl 12(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 20(%esp),%edi - xorl %eax,%ecx - movl %eax,16(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 958139571(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 56(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 28(%esp),%edx - addl %ecx,%ebp - movl 44(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 52(%esp),%ebx - shrl $10,%edi - addl 88(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl (%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 4(%esp),%edi - xorl %esi,%edx - movl %ebx,52(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,28(%esp) - xorl %esi,%edx - addl 8(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 16(%esp),%edi - xorl %ebp,%esi - movl %ebp,12(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 1322822218(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 60(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl 24(%esp),%edx - addl %esi,%eax - movl 48(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 56(%esp),%ebx - shrl $10,%edi - addl 92(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 28(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl (%esp),%edi - xorl %ecx,%edx - movl %ebx,56(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - xorl %ecx,%edx - addl 4(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 12(%esp),%edi - xorl %eax,%ecx - movl %eax,8(%esp) - xorl %edi,%eax - rorl $11,%ecx - 
andl %eax,%ebp - leal 1537002063(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 64(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 20(%esp),%edx - addl %ecx,%ebp - movl 52(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 60(%esp),%ebx - shrl $10,%edi - addl 32(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 24(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 28(%esp),%edi - xorl %esi,%edx - movl %ebx,60(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,20(%esp) - xorl %esi,%edx - addl (%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 8(%esp),%edi - xorl %ebp,%esi - movl %ebp,4(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 1747873779(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 68(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl 16(%esp),%edx - addl %esi,%eax - movl 56(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 64(%esp),%ebx - shrl $10,%edi - addl 36(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 20(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 24(%esp),%edi - xorl %ecx,%edx - movl %ebx,64(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - xorl %ecx,%edx - addl 28(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 4(%esp),%edi - xorl %eax,%ecx - movl %eax,(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 1955562222(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 72(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 12(%esp),%edx - addl %ecx,%ebp - movl 60(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi 
- shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 68(%esp),%ebx - shrl $10,%edi - addl 40(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 16(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 20(%esp),%edi - xorl %esi,%edx - movl %ebx,68(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,12(%esp) - xorl %esi,%edx - addl 24(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl (%esp),%edi - xorl %ebp,%esi - movl %ebp,28(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 2024104815(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 76(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl 8(%esp),%edx - addl %esi,%eax - movl 64(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 72(%esp),%ebx - shrl $10,%edi - addl 44(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 12(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 16(%esp),%edi - xorl %ecx,%edx - movl %ebx,72(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - xorl %ecx,%edx - addl 20(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 28(%esp),%edi - xorl %eax,%ecx - movl %eax,24(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 2227730452(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 80(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 4(%esp),%edx - addl %ecx,%ebp - movl 68(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 76(%esp),%ebx - shrl $10,%edi - addl 48(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 8(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 12(%esp),%edi - xorl %esi,%edx - movl 
%ebx,76(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,4(%esp) - xorl %esi,%edx - addl 16(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 24(%esp),%edi - xorl %ebp,%esi - movl %ebp,20(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 2361852424(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 84(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl (%esp),%edx - addl %esi,%eax - movl 72(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 80(%esp),%ebx - shrl $10,%edi - addl 52(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 4(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl 8(%esp),%edi - xorl %ecx,%edx - movl %ebx,80(%esp) - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,(%esp) - xorl %ecx,%edx - addl 12(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 20(%esp),%edi - xorl %eax,%ecx - movl %eax,16(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 2428436474(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 88(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 28(%esp),%edx - addl %ecx,%ebp - movl 76(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 84(%esp),%ebx - shrl $10,%edi - addl 56(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl (%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 4(%esp),%edi - xorl %esi,%edx - movl %ebx,84(%esp) - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,28(%esp) - xorl %esi,%edx - addl 8(%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 16(%esp),%edi - xorl %ebp,%esi - movl %ebp,12(%esp) - 
xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 2756734187(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - movl 92(%esp),%ecx - rorl $2,%esi - addl %edx,%eax - addl 24(%esp),%edx - addl %esi,%eax - movl 80(%esp),%esi - movl %ecx,%ebx - rorl $11,%ecx - movl %esi,%edi - rorl $2,%esi - xorl %ebx,%ecx - shrl $3,%ebx - rorl $7,%ecx - xorl %edi,%esi - xorl %ecx,%ebx - rorl $17,%esi - addl 88(%esp),%ebx - shrl $10,%edi - addl 60(%esp),%ebx - movl %edx,%ecx - xorl %esi,%edi - movl 28(%esp),%esi - rorl $14,%edx - addl %edi,%ebx - movl (%esp),%edi - xorl %ecx,%edx - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - xorl %ecx,%edx - addl 4(%esp),%ebx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%ebx - rorl $9,%ecx - movl %eax,%esi - movl 12(%esp),%edi - xorl %eax,%ecx - movl %eax,8(%esp) - xorl %edi,%eax - rorl $11,%ecx - andl %eax,%ebp - leal 3204031479(%ebx,%edx,1),%edx - xorl %esi,%ecx - xorl %edi,%ebp - movl 32(%esp),%esi - rorl $2,%ecx - addl %edx,%ebp - addl 20(%esp),%edx - addl %ecx,%ebp - movl 84(%esp),%ecx - movl %esi,%ebx - rorl $11,%esi - movl %ecx,%edi - rorl $2,%ecx - xorl %ebx,%esi - shrl $3,%ebx - rorl $7,%esi - xorl %edi,%ecx - xorl %esi,%ebx - rorl $17,%ecx - addl 92(%esp),%ebx - shrl $10,%edi - addl 64(%esp),%ebx - movl %edx,%esi - xorl %ecx,%edi - movl 24(%esp),%ecx - rorl $14,%edx - addl %edi,%ebx - movl 28(%esp),%edi - xorl %esi,%edx - xorl %edi,%ecx - rorl $5,%edx - andl %esi,%ecx - movl %esi,20(%esp) - xorl %esi,%edx - addl (%esp),%ebx - xorl %ecx,%edi - rorl $6,%edx - movl %ebp,%esi - addl %edi,%ebx - rorl $9,%esi - movl %ebp,%ecx - movl 8(%esp),%edi - xorl %ebp,%esi - movl %ebp,4(%esp) - xorl %edi,%ebp - rorl $11,%esi - andl %ebp,%eax - leal 3329325298(%ebx,%edx,1),%edx - xorl %ecx,%esi - xorl %edi,%eax - rorl $2,%esi - addl %edx,%eax - addl 16(%esp),%edx - addl %esi,%eax - movl 96(%esp),%esi - xorl %edi,%ebp - movl 12(%esp),%ecx - addl (%esi),%eax - addl 4(%esi),%ebp - addl 8(%esi),%edi - addl 
12(%esi),%ecx - movl %eax,(%esi) - movl %ebp,4(%esi) - movl %edi,8(%esi) - movl %ecx,12(%esi) - movl %ebp,4(%esp) - xorl %edi,%ebp - movl %edi,8(%esp) - movl %ecx,12(%esp) - movl 20(%esp),%edi - movl 24(%esp),%ebx - movl 28(%esp),%ecx - addl 16(%esi),%edx - addl 20(%esi),%edi - addl 24(%esi),%ebx - addl 28(%esi),%ecx - movl %edx,16(%esi) - movl %edi,20(%esi) - movl %ebx,24(%esi) - movl %ecx,28(%esi) - movl %edi,20(%esp) - movl 100(%esp),%edi - movl %ebx,24(%esp) - movl %ecx,28(%esp) - cmpl 104(%esp),%edi - jb .L009grand_loop - movl 108(%esp),%esp - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.align 32 -.L005SSSE3: - leal -96(%esp),%esp - movl (%esi),%eax - movl 4(%esi),%ebx - movl 8(%esi),%ecx - movl 12(%esi),%edi - movl %ebx,4(%esp) - xorl %ecx,%ebx - movl %ecx,8(%esp) - movl %edi,12(%esp) - movl 16(%esi),%edx - movl 20(%esi),%edi - movl 24(%esi),%ecx - movl 28(%esi),%esi - movl %edi,20(%esp) - movl 100(%esp),%edi - movl %ecx,24(%esp) - movl %esi,28(%esp) - movdqa 256(%ebp),%xmm7 - jmp .L010grand_ssse3 -.align 16 -.L010grand_ssse3: - movdqu (%edi),%xmm0 - movdqu 16(%edi),%xmm1 - movdqu 32(%edi),%xmm2 - movdqu 48(%edi),%xmm3 - addl $64,%edi -.byte 102,15,56,0,199 - movl %edi,100(%esp) -.byte 102,15,56,0,207 - movdqa (%ebp),%xmm4 -.byte 102,15,56,0,215 - movdqa 16(%ebp),%xmm5 - paddd %xmm0,%xmm4 -.byte 102,15,56,0,223 - movdqa 32(%ebp),%xmm6 - paddd %xmm1,%xmm5 - movdqa 48(%ebp),%xmm7 - movdqa %xmm4,32(%esp) - paddd %xmm2,%xmm6 - movdqa %xmm5,48(%esp) - paddd %xmm3,%xmm7 - movdqa %xmm6,64(%esp) - movdqa %xmm7,80(%esp) - jmp .L011ssse3_00_47 -.align 16 -.L011ssse3_00_47: - addl $64,%ebp - movl %edx,%ecx - movdqa %xmm1,%xmm4 - rorl $14,%edx - movl 20(%esp),%esi - movdqa %xmm3,%xmm7 - xorl %ecx,%edx - movl 24(%esp),%edi -.byte 102,15,58,15,224,4 - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi -.byte 102,15,58,15,250,4 - movl %ecx,16(%esp) - xorl %ecx,%edx - xorl %esi,%edi - movdqa %xmm4,%xmm5 - rorl $6,%edx - movl %eax,%ecx - movdqa %xmm4,%xmm6 - addl 
%edi,%edx - movl 4(%esp),%edi - psrld $3,%xmm4 - movl %eax,%esi - rorl $9,%ecx - paddd %xmm7,%xmm0 - movl %eax,(%esp) - xorl %eax,%ecx - psrld $7,%xmm6 - xorl %edi,%eax - addl 28(%esp),%edx - rorl $11,%ecx - andl %eax,%ebx - pshufd $250,%xmm3,%xmm7 - xorl %esi,%ecx - addl 32(%esp),%edx - pslld $14,%xmm5 - xorl %edi,%ebx - rorl $2,%ecx - pxor %xmm6,%xmm4 - addl %edx,%ebx - addl 12(%esp),%edx - psrld $11,%xmm6 - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - pxor %xmm5,%xmm4 - movl 16(%esp),%esi - xorl %ecx,%edx - pslld $11,%xmm5 - movl 20(%esp),%edi - xorl %edi,%esi - rorl $5,%edx - pxor %xmm6,%xmm4 - andl %ecx,%esi - movl %ecx,12(%esp) - movdqa %xmm7,%xmm6 - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - pxor %xmm5,%xmm4 - movl %ebx,%ecx - addl %edi,%edx - psrld $10,%xmm7 - movl (%esp),%edi - movl %ebx,%esi - rorl $9,%ecx - paddd %xmm4,%xmm0 - movl %ebx,28(%esp) - xorl %ebx,%ecx - psrlq $17,%xmm6 - xorl %edi,%ebx - addl 24(%esp),%edx - rorl $11,%ecx - pxor %xmm6,%xmm7 - andl %ebx,%eax - xorl %esi,%ecx - psrlq $2,%xmm6 - addl 36(%esp),%edx - xorl %edi,%eax - rorl $2,%ecx - pxor %xmm6,%xmm7 - addl %edx,%eax - addl 8(%esp),%edx - pshufd $128,%xmm7,%xmm7 - addl %ecx,%eax - movl %edx,%ecx - rorl $14,%edx - movl 12(%esp),%esi - xorl %ecx,%edx - movl 16(%esp),%edi - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - psrldq $8,%xmm7 - movl %ecx,8(%esp) - xorl %ecx,%edx - xorl %esi,%edi - paddd %xmm7,%xmm0 - rorl $6,%edx - movl %eax,%ecx - addl %edi,%edx - movl 28(%esp),%edi - movl %eax,%esi - rorl $9,%ecx - movl %eax,24(%esp) - pshufd $80,%xmm0,%xmm7 - xorl %eax,%ecx - xorl %edi,%eax - addl 20(%esp),%edx - movdqa %xmm7,%xmm6 - rorl $11,%ecx - psrld $10,%xmm7 - andl %eax,%ebx - psrlq $17,%xmm6 - xorl %esi,%ecx - addl 40(%esp),%edx - xorl %edi,%ebx - rorl $2,%ecx - pxor %xmm6,%xmm7 - addl %edx,%ebx - addl 4(%esp),%edx - psrlq $2,%xmm6 - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - pxor %xmm6,%xmm7 - movl 8(%esp),%esi - xorl %ecx,%edx - movl 12(%esp),%edi - pshufd 
$8,%xmm7,%xmm7 - xorl %edi,%esi - rorl $5,%edx - movdqa (%ebp),%xmm6 - andl %ecx,%esi - movl %ecx,4(%esp) - pslldq $8,%xmm7 - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 24(%esp),%edi - movl %ebx,%esi - rorl $9,%ecx - paddd %xmm7,%xmm0 - movl %ebx,20(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 16(%esp),%edx - paddd %xmm0,%xmm6 - rorl $11,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 44(%esp),%edx - xorl %edi,%eax - rorl $2,%ecx - addl %edx,%eax - addl (%esp),%edx - addl %ecx,%eax - movdqa %xmm6,32(%esp) - movl %edx,%ecx - movdqa %xmm2,%xmm4 - rorl $14,%edx - movl 4(%esp),%esi - movdqa %xmm0,%xmm7 - xorl %ecx,%edx - movl 8(%esp),%edi -.byte 102,15,58,15,225,4 - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi -.byte 102,15,58,15,251,4 - movl %ecx,(%esp) - xorl %ecx,%edx - xorl %esi,%edi - movdqa %xmm4,%xmm5 - rorl $6,%edx - movl %eax,%ecx - movdqa %xmm4,%xmm6 - addl %edi,%edx - movl 20(%esp),%edi - psrld $3,%xmm4 - movl %eax,%esi - rorl $9,%ecx - paddd %xmm7,%xmm1 - movl %eax,16(%esp) - xorl %eax,%ecx - psrld $7,%xmm6 - xorl %edi,%eax - addl 12(%esp),%edx - rorl $11,%ecx - andl %eax,%ebx - pshufd $250,%xmm0,%xmm7 - xorl %esi,%ecx - addl 48(%esp),%edx - pslld $14,%xmm5 - xorl %edi,%ebx - rorl $2,%ecx - pxor %xmm6,%xmm4 - addl %edx,%ebx - addl 28(%esp),%edx - psrld $11,%xmm6 - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - pxor %xmm5,%xmm4 - movl (%esp),%esi - xorl %ecx,%edx - pslld $11,%xmm5 - movl 4(%esp),%edi - xorl %edi,%esi - rorl $5,%edx - pxor %xmm6,%xmm4 - andl %ecx,%esi - movl %ecx,28(%esp) - movdqa %xmm7,%xmm6 - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - pxor %xmm5,%xmm4 - movl %ebx,%ecx - addl %edi,%edx - psrld $10,%xmm7 - movl 16(%esp),%edi - movl %ebx,%esi - rorl $9,%ecx - paddd %xmm4,%xmm1 - movl %ebx,12(%esp) - xorl %ebx,%ecx - psrlq $17,%xmm6 - xorl %edi,%ebx - addl 8(%esp),%edx - rorl $11,%ecx - pxor %xmm6,%xmm7 - andl %ebx,%eax - xorl %esi,%ecx - psrlq $2,%xmm6 - addl 52(%esp),%edx - xorl 
%edi,%eax - rorl $2,%ecx - pxor %xmm6,%xmm7 - addl %edx,%eax - addl 24(%esp),%edx - pshufd $128,%xmm7,%xmm7 - addl %ecx,%eax - movl %edx,%ecx - rorl $14,%edx - movl 28(%esp),%esi - xorl %ecx,%edx - movl (%esp),%edi - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - psrldq $8,%xmm7 - movl %ecx,24(%esp) - xorl %ecx,%edx - xorl %esi,%edi - paddd %xmm7,%xmm1 - rorl $6,%edx - movl %eax,%ecx - addl %edi,%edx - movl 12(%esp),%edi - movl %eax,%esi - rorl $9,%ecx - movl %eax,8(%esp) - pshufd $80,%xmm1,%xmm7 - xorl %eax,%ecx - xorl %edi,%eax - addl 4(%esp),%edx - movdqa %xmm7,%xmm6 - rorl $11,%ecx - psrld $10,%xmm7 - andl %eax,%ebx - psrlq $17,%xmm6 - xorl %esi,%ecx - addl 56(%esp),%edx - xorl %edi,%ebx - rorl $2,%ecx - pxor %xmm6,%xmm7 - addl %edx,%ebx - addl 20(%esp),%edx - psrlq $2,%xmm6 - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - pxor %xmm6,%xmm7 - movl 24(%esp),%esi - xorl %ecx,%edx - movl 28(%esp),%edi - pshufd $8,%xmm7,%xmm7 - xorl %edi,%esi - rorl $5,%edx - movdqa 16(%ebp),%xmm6 - andl %ecx,%esi - movl %ecx,20(%esp) - pslldq $8,%xmm7 - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 8(%esp),%edi - movl %ebx,%esi - rorl $9,%ecx - paddd %xmm7,%xmm1 - movl %ebx,4(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl (%esp),%edx - paddd %xmm1,%xmm6 - rorl $11,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 60(%esp),%edx - xorl %edi,%eax - rorl $2,%ecx - addl %edx,%eax - addl 16(%esp),%edx - addl %ecx,%eax - movdqa %xmm6,48(%esp) - movl %edx,%ecx - movdqa %xmm3,%xmm4 - rorl $14,%edx - movl 20(%esp),%esi - movdqa %xmm1,%xmm7 - xorl %ecx,%edx - movl 24(%esp),%edi -.byte 102,15,58,15,226,4 - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi -.byte 102,15,58,15,248,4 - movl %ecx,16(%esp) - xorl %ecx,%edx - xorl %esi,%edi - movdqa %xmm4,%xmm5 - rorl $6,%edx - movl %eax,%ecx - movdqa %xmm4,%xmm6 - addl %edi,%edx - movl 4(%esp),%edi - psrld $3,%xmm4 - movl %eax,%esi - rorl $9,%ecx - paddd %xmm7,%xmm2 - movl %eax,(%esp) - xorl %eax,%ecx - psrld 
$7,%xmm6 - xorl %edi,%eax - addl 28(%esp),%edx - rorl $11,%ecx - andl %eax,%ebx - pshufd $250,%xmm1,%xmm7 - xorl %esi,%ecx - addl 64(%esp),%edx - pslld $14,%xmm5 - xorl %edi,%ebx - rorl $2,%ecx - pxor %xmm6,%xmm4 - addl %edx,%ebx - addl 12(%esp),%edx - psrld $11,%xmm6 - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - pxor %xmm5,%xmm4 - movl 16(%esp),%esi - xorl %ecx,%edx - pslld $11,%xmm5 - movl 20(%esp),%edi - xorl %edi,%esi - rorl $5,%edx - pxor %xmm6,%xmm4 - andl %ecx,%esi - movl %ecx,12(%esp) - movdqa %xmm7,%xmm6 - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - pxor %xmm5,%xmm4 - movl %ebx,%ecx - addl %edi,%edx - psrld $10,%xmm7 - movl (%esp),%edi - movl %ebx,%esi - rorl $9,%ecx - paddd %xmm4,%xmm2 - movl %ebx,28(%esp) - xorl %ebx,%ecx - psrlq $17,%xmm6 - xorl %edi,%ebx - addl 24(%esp),%edx - rorl $11,%ecx - pxor %xmm6,%xmm7 - andl %ebx,%eax - xorl %esi,%ecx - psrlq $2,%xmm6 - addl 68(%esp),%edx - xorl %edi,%eax - rorl $2,%ecx - pxor %xmm6,%xmm7 - addl %edx,%eax - addl 8(%esp),%edx - pshufd $128,%xmm7,%xmm7 - addl %ecx,%eax - movl %edx,%ecx - rorl $14,%edx - movl 12(%esp),%esi - xorl %ecx,%edx - movl 16(%esp),%edi - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - psrldq $8,%xmm7 - movl %ecx,8(%esp) - xorl %ecx,%edx - xorl %esi,%edi - paddd %xmm7,%xmm2 - rorl $6,%edx - movl %eax,%ecx - addl %edi,%edx - movl 28(%esp),%edi - movl %eax,%esi - rorl $9,%ecx - movl %eax,24(%esp) - pshufd $80,%xmm2,%xmm7 - xorl %eax,%ecx - xorl %edi,%eax - addl 20(%esp),%edx - movdqa %xmm7,%xmm6 - rorl $11,%ecx - psrld $10,%xmm7 - andl %eax,%ebx - psrlq $17,%xmm6 - xorl %esi,%ecx - addl 72(%esp),%edx - xorl %edi,%ebx - rorl $2,%ecx - pxor %xmm6,%xmm7 - addl %edx,%ebx - addl 4(%esp),%edx - psrlq $2,%xmm6 - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - pxor %xmm6,%xmm7 - movl 8(%esp),%esi - xorl %ecx,%edx - movl 12(%esp),%edi - pshufd $8,%xmm7,%xmm7 - xorl %edi,%esi - rorl $5,%edx - movdqa 32(%ebp),%xmm6 - andl %ecx,%esi - movl %ecx,4(%esp) - pslldq $8,%xmm7 - xorl %ecx,%edx - 
xorl %esi,%edi - rorl $6,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 24(%esp),%edi - movl %ebx,%esi - rorl $9,%ecx - paddd %xmm7,%xmm2 - movl %ebx,20(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 16(%esp),%edx - paddd %xmm2,%xmm6 - rorl $11,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 76(%esp),%edx - xorl %edi,%eax - rorl $2,%ecx - addl %edx,%eax - addl (%esp),%edx - addl %ecx,%eax - movdqa %xmm6,64(%esp) - movl %edx,%ecx - movdqa %xmm0,%xmm4 - rorl $14,%edx - movl 4(%esp),%esi - movdqa %xmm2,%xmm7 - xorl %ecx,%edx - movl 8(%esp),%edi -.byte 102,15,58,15,227,4 - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi -.byte 102,15,58,15,249,4 - movl %ecx,(%esp) - xorl %ecx,%edx - xorl %esi,%edi - movdqa %xmm4,%xmm5 - rorl $6,%edx - movl %eax,%ecx - movdqa %xmm4,%xmm6 - addl %edi,%edx - movl 20(%esp),%edi - psrld $3,%xmm4 - movl %eax,%esi - rorl $9,%ecx - paddd %xmm7,%xmm3 - movl %eax,16(%esp) - xorl %eax,%ecx - psrld $7,%xmm6 - xorl %edi,%eax - addl 12(%esp),%edx - rorl $11,%ecx - andl %eax,%ebx - pshufd $250,%xmm2,%xmm7 - xorl %esi,%ecx - addl 80(%esp),%edx - pslld $14,%xmm5 - xorl %edi,%ebx - rorl $2,%ecx - pxor %xmm6,%xmm4 - addl %edx,%ebx - addl 28(%esp),%edx - psrld $11,%xmm6 - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - pxor %xmm5,%xmm4 - movl (%esp),%esi - xorl %ecx,%edx - pslld $11,%xmm5 - movl 4(%esp),%edi - xorl %edi,%esi - rorl $5,%edx - pxor %xmm6,%xmm4 - andl %ecx,%esi - movl %ecx,28(%esp) - movdqa %xmm7,%xmm6 - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - pxor %xmm5,%xmm4 - movl %ebx,%ecx - addl %edi,%edx - psrld $10,%xmm7 - movl 16(%esp),%edi - movl %ebx,%esi - rorl $9,%ecx - paddd %xmm4,%xmm3 - movl %ebx,12(%esp) - xorl %ebx,%ecx - psrlq $17,%xmm6 - xorl %edi,%ebx - addl 8(%esp),%edx - rorl $11,%ecx - pxor %xmm6,%xmm7 - andl %ebx,%eax - xorl %esi,%ecx - psrlq $2,%xmm6 - addl 84(%esp),%edx - xorl %edi,%eax - rorl $2,%ecx - pxor %xmm6,%xmm7 - addl %edx,%eax - addl 24(%esp),%edx - pshufd $128,%xmm7,%xmm7 - addl %ecx,%eax - movl %edx,%ecx - rorl 
$14,%edx - movl 28(%esp),%esi - xorl %ecx,%edx - movl (%esp),%edi - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - psrldq $8,%xmm7 - movl %ecx,24(%esp) - xorl %ecx,%edx - xorl %esi,%edi - paddd %xmm7,%xmm3 - rorl $6,%edx - movl %eax,%ecx - addl %edi,%edx - movl 12(%esp),%edi - movl %eax,%esi - rorl $9,%ecx - movl %eax,8(%esp) - pshufd $80,%xmm3,%xmm7 - xorl %eax,%ecx - xorl %edi,%eax - addl 4(%esp),%edx - movdqa %xmm7,%xmm6 - rorl $11,%ecx - psrld $10,%xmm7 - andl %eax,%ebx - psrlq $17,%xmm6 - xorl %esi,%ecx - addl 88(%esp),%edx - xorl %edi,%ebx - rorl $2,%ecx - pxor %xmm6,%xmm7 - addl %edx,%ebx - addl 20(%esp),%edx - psrlq $2,%xmm6 - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - pxor %xmm6,%xmm7 - movl 24(%esp),%esi - xorl %ecx,%edx - movl 28(%esp),%edi - pshufd $8,%xmm7,%xmm7 - xorl %edi,%esi - rorl $5,%edx - movdqa 48(%ebp),%xmm6 - andl %ecx,%esi - movl %ecx,20(%esp) - pslldq $8,%xmm7 - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 8(%esp),%edi - movl %ebx,%esi - rorl $9,%ecx - paddd %xmm7,%xmm3 - movl %ebx,4(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl (%esp),%edx - paddd %xmm3,%xmm6 - rorl $11,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 92(%esp),%edx - xorl %edi,%eax - rorl $2,%ecx - addl %edx,%eax - addl 16(%esp),%edx - addl %ecx,%eax - movdqa %xmm6,80(%esp) - cmpl $66051,64(%ebp) - jne .L011ssse3_00_47 - movl %edx,%ecx - rorl $14,%edx - movl 20(%esp),%esi - xorl %ecx,%edx - movl 24(%esp),%edi - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%edx - movl 4(%esp),%edi - movl %eax,%esi - rorl $9,%ecx - movl %eax,(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 28(%esp),%edx - rorl $11,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 32(%esp),%edx - xorl %edi,%ebx - rorl $2,%ecx - addl %edx,%ebx - addl 12(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - movl 16(%esp),%esi - xorl %ecx,%edx - movl 
20(%esp),%edi - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,12(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %ebx,%ecx - addl %edi,%edx - movl (%esp),%edi - movl %ebx,%esi - rorl $9,%ecx - movl %ebx,28(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 24(%esp),%edx - rorl $11,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 36(%esp),%edx - xorl %edi,%eax - rorl $2,%ecx - addl %edx,%eax - addl 8(%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - rorl $14,%edx - movl 12(%esp),%esi - xorl %ecx,%edx - movl 16(%esp),%edi - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%edx - movl 28(%esp),%edi - movl %eax,%esi - rorl $9,%ecx - movl %eax,24(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 20(%esp),%edx - rorl $11,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 40(%esp),%edx - xorl %edi,%ebx - rorl $2,%ecx - addl %edx,%ebx - addl 4(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - movl 8(%esp),%esi - xorl %ecx,%edx - movl 12(%esp),%edi - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,4(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 24(%esp),%edi - movl %ebx,%esi - rorl $9,%ecx - movl %ebx,20(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 16(%esp),%edx - rorl $11,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 44(%esp),%edx - xorl %edi,%eax - rorl $2,%ecx - addl %edx,%eax - addl (%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - rorl $14,%edx - movl 4(%esp),%esi - xorl %ecx,%edx - movl 8(%esp),%edi - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%edx - movl 20(%esp),%edi - movl %eax,%esi - rorl $9,%ecx - movl %eax,16(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 12(%esp),%edx - rorl $11,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 48(%esp),%edx - xorl %edi,%ebx - rorl $2,%ecx - addl %edx,%ebx 
- addl 28(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - movl (%esp),%esi - xorl %ecx,%edx - movl 4(%esp),%edi - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,28(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 16(%esp),%edi - movl %ebx,%esi - rorl $9,%ecx - movl %ebx,12(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 8(%esp),%edx - rorl $11,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 52(%esp),%edx - xorl %edi,%eax - rorl $2,%ecx - addl %edx,%eax - addl 24(%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - rorl $14,%edx - movl 28(%esp),%esi - xorl %ecx,%edx - movl (%esp),%edi - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%edx - movl 12(%esp),%edi - movl %eax,%esi - rorl $9,%ecx - movl %eax,8(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 4(%esp),%edx - rorl $11,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 56(%esp),%edx - xorl %edi,%ebx - rorl $2,%ecx - addl %edx,%ebx - addl 20(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - movl 24(%esp),%esi - xorl %ecx,%edx - movl 28(%esp),%edi - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,20(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 8(%esp),%edi - movl %ebx,%esi - rorl $9,%ecx - movl %ebx,4(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl (%esp),%edx - rorl $11,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 60(%esp),%edx - xorl %edi,%eax - rorl $2,%ecx - addl %edx,%eax - addl 16(%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - rorl $14,%edx - movl 20(%esp),%esi - xorl %ecx,%edx - movl 24(%esp),%edi - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%edx - movl 4(%esp),%edi - movl %eax,%esi - rorl $9,%ecx - movl %eax,(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 28(%esp),%edx - rorl 
$11,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 64(%esp),%edx - xorl %edi,%ebx - rorl $2,%ecx - addl %edx,%ebx - addl 12(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - movl 16(%esp),%esi - xorl %ecx,%edx - movl 20(%esp),%edi - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,12(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %ebx,%ecx - addl %edi,%edx - movl (%esp),%edi - movl %ebx,%esi - rorl $9,%ecx - movl %ebx,28(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 24(%esp),%edx - rorl $11,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 68(%esp),%edx - xorl %edi,%eax - rorl $2,%ecx - addl %edx,%eax - addl 8(%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - rorl $14,%edx - movl 12(%esp),%esi - xorl %ecx,%edx - movl 16(%esp),%edi - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%edx - movl 28(%esp),%edi - movl %eax,%esi - rorl $9,%ecx - movl %eax,24(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 20(%esp),%edx - rorl $11,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 72(%esp),%edx - xorl %edi,%ebx - rorl $2,%ecx - addl %edx,%ebx - addl 4(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - movl 8(%esp),%esi - xorl %ecx,%edx - movl 12(%esp),%edi - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,4(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 24(%esp),%edi - movl %ebx,%esi - rorl $9,%ecx - movl %ebx,20(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 16(%esp),%edx - rorl $11,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 76(%esp),%edx - xorl %edi,%eax - rorl $2,%ecx - addl %edx,%eax - addl (%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - rorl $14,%edx - movl 4(%esp),%esi - xorl %ecx,%edx - movl 8(%esp),%edi - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%edx - movl 20(%esp),%edi 
- movl %eax,%esi - rorl $9,%ecx - movl %eax,16(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 12(%esp),%edx - rorl $11,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 80(%esp),%edx - xorl %edi,%ebx - rorl $2,%ecx - addl %edx,%ebx - addl 28(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - movl (%esp),%esi - xorl %ecx,%edx - movl 4(%esp),%edi - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,28(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 16(%esp),%edi - movl %ebx,%esi - rorl $9,%ecx - movl %ebx,12(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 8(%esp),%edx - rorl $11,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 84(%esp),%edx - xorl %edi,%eax - rorl $2,%ecx - addl %edx,%eax - addl 24(%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - rorl $14,%edx - movl 28(%esp),%esi - xorl %ecx,%edx - movl (%esp),%edi - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %eax,%ecx - addl %edi,%edx - movl 12(%esp),%edi - movl %eax,%esi - rorl $9,%ecx - movl %eax,8(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 4(%esp),%edx - rorl $11,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 88(%esp),%edx - xorl %edi,%ebx - rorl $2,%ecx - addl %edx,%ebx - addl 20(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - rorl $14,%edx - movl 24(%esp),%esi - xorl %ecx,%edx - movl 28(%esp),%edi - xorl %edi,%esi - rorl $5,%edx - andl %ecx,%esi - movl %ecx,20(%esp) - xorl %ecx,%edx - xorl %esi,%edi - rorl $6,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 8(%esp),%edi - movl %ebx,%esi - rorl $9,%ecx - movl %ebx,4(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl (%esp),%edx - rorl $11,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 92(%esp),%edx - xorl %edi,%eax - rorl $2,%ecx - addl %edx,%eax - addl 16(%esp),%edx - addl %ecx,%eax - movl 96(%esp),%esi - xorl %edi,%ebx - movl 12(%esp),%ecx - addl (%esi),%eax - addl 4(%esi),%ebx - addl 8(%esi),%edi - addl 12(%esi),%ecx - movl 
%eax,(%esi) - movl %ebx,4(%esi) - movl %edi,8(%esi) - movl %ecx,12(%esi) - movl %ebx,4(%esp) - xorl %edi,%ebx - movl %edi,8(%esp) - movl %ecx,12(%esp) - movl 20(%esp),%edi - movl 24(%esp),%ecx - addl 16(%esi),%edx - addl 20(%esi),%edi - addl 24(%esi),%ecx - movl %edx,16(%esi) - movl %edi,20(%esi) - movl %edi,20(%esp) - movl 28(%esp),%edi - movl %ecx,24(%esi) - addl 28(%esi),%edi - movl %ecx,24(%esp) - movl %edi,28(%esi) - movl %edi,28(%esp) - movl 100(%esp),%edi - movdqa 64(%ebp),%xmm7 - subl $192,%ebp - cmpl 104(%esp),%edi - jb .L010grand_ssse3 - movl 108(%esp),%esp - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.align 32 -.L004AVX: - leal -96(%esp),%esp - vzeroall - movl (%esi),%eax - movl 4(%esi),%ebx - movl 8(%esi),%ecx - movl 12(%esi),%edi - movl %ebx,4(%esp) - xorl %ecx,%ebx - movl %ecx,8(%esp) - movl %edi,12(%esp) - movl 16(%esi),%edx - movl 20(%esi),%edi - movl 24(%esi),%ecx - movl 28(%esi),%esi - movl %edi,20(%esp) - movl 100(%esp),%edi - movl %ecx,24(%esp) - movl %esi,28(%esp) - vmovdqa 256(%ebp),%xmm7 - jmp .L012grand_avx -.align 32 -.L012grand_avx: - vmovdqu (%edi),%xmm0 - vmovdqu 16(%edi),%xmm1 - vmovdqu 32(%edi),%xmm2 - vmovdqu 48(%edi),%xmm3 - addl $64,%edi - vpshufb %xmm7,%xmm0,%xmm0 - movl %edi,100(%esp) - vpshufb %xmm7,%xmm1,%xmm1 - vpshufb %xmm7,%xmm2,%xmm2 - vpaddd (%ebp),%xmm0,%xmm4 - vpshufb %xmm7,%xmm3,%xmm3 - vpaddd 16(%ebp),%xmm1,%xmm5 - vpaddd 32(%ebp),%xmm2,%xmm6 - vpaddd 48(%ebp),%xmm3,%xmm7 - vmovdqa %xmm4,32(%esp) - vmovdqa %xmm5,48(%esp) - vmovdqa %xmm6,64(%esp) - vmovdqa %xmm7,80(%esp) - jmp .L013avx_00_47 -.align 16 -.L013avx_00_47: - addl $64,%ebp - vpalignr $4,%xmm0,%xmm1,%xmm4 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 20(%esp),%esi - vpalignr $4,%xmm2,%xmm3,%xmm7 - xorl %ecx,%edx - movl 24(%esp),%edi - xorl %edi,%esi - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - vpaddd %xmm7,%xmm0,%xmm0 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrld $3,%xmm4,%xmm7 - movl 
%eax,%ecx - addl %edi,%edx - movl 4(%esp),%edi - vpslld $14,%xmm4,%xmm5 - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,(%esp) - vpxor %xmm6,%xmm7,%xmm4 - xorl %eax,%ecx - xorl %edi,%eax - addl 28(%esp),%edx - vpshufd $250,%xmm3,%xmm7 - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - vpsrld $11,%xmm6,%xmm6 - addl 32(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - vpxor %xmm5,%xmm4,%xmm4 - addl %edx,%ebx - addl 12(%esp),%edx - addl %ecx,%ebx - vpslld $11,%xmm5,%xmm5 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 16(%esp),%esi - vpxor %xmm6,%xmm4,%xmm4 - xorl %ecx,%edx - movl 20(%esp),%edi - xorl %edi,%esi - vpsrld $10,%xmm7,%xmm6 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,12(%esp) - vpxor %xmm5,%xmm4,%xmm4 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrlq $17,%xmm7,%xmm5 - movl %ebx,%ecx - addl %edi,%edx - movl (%esp),%edi - vpaddd %xmm4,%xmm0,%xmm0 - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,28(%esp) - vpxor %xmm5,%xmm6,%xmm6 - xorl %ebx,%ecx - xorl %edi,%ebx - addl 24(%esp),%edx - vpsrlq $19,%xmm7,%xmm7 - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - vpxor %xmm7,%xmm6,%xmm6 - addl 36(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - vpshufd $132,%xmm6,%xmm7 - addl %edx,%eax - addl 8(%esp),%edx - addl %ecx,%eax - vpsrldq $8,%xmm7,%xmm7 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 12(%esp),%esi - vpaddd %xmm7,%xmm0,%xmm0 - xorl %ecx,%edx - movl 16(%esp),%edi - xorl %edi,%esi - vpshufd $80,%xmm0,%xmm7 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - vpsrld $10,%xmm7,%xmm6 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrlq $17,%xmm7,%xmm5 - movl %eax,%ecx - addl %edi,%edx - movl 28(%esp),%edi - vpxor %xmm5,%xmm6,%xmm6 - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,24(%esp) - vpsrlq $19,%xmm7,%xmm7 - xorl %eax,%ecx - xorl %edi,%eax - addl 20(%esp),%edx - vpxor %xmm7,%xmm6,%xmm6 - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - vpshufd $232,%xmm6,%xmm7 - addl 40(%esp),%edx - xorl 
%edi,%ebx - shrdl $2,%ecx,%ecx - vpslldq $8,%xmm7,%xmm7 - addl %edx,%ebx - addl 4(%esp),%edx - addl %ecx,%ebx - vpaddd %xmm7,%xmm0,%xmm0 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 8(%esp),%esi - vpaddd (%ebp),%xmm0,%xmm6 - xorl %ecx,%edx - movl 12(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,4(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 24(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,20(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 16(%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 44(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl (%esp),%edx - addl %ecx,%eax - vmovdqa %xmm6,32(%esp) - vpalignr $4,%xmm1,%xmm2,%xmm4 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 4(%esp),%esi - vpalignr $4,%xmm3,%xmm0,%xmm7 - xorl %ecx,%edx - movl 8(%esp),%edi - xorl %edi,%esi - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,(%esp) - vpaddd %xmm7,%xmm1,%xmm1 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrld $3,%xmm4,%xmm7 - movl %eax,%ecx - addl %edi,%edx - movl 20(%esp),%edi - vpslld $14,%xmm4,%xmm5 - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,16(%esp) - vpxor %xmm6,%xmm7,%xmm4 - xorl %eax,%ecx - xorl %edi,%eax - addl 12(%esp),%edx - vpshufd $250,%xmm0,%xmm7 - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - vpsrld $11,%xmm6,%xmm6 - addl 48(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - vpxor %xmm5,%xmm4,%xmm4 - addl %edx,%ebx - addl 28(%esp),%edx - addl %ecx,%ebx - vpslld $11,%xmm5,%xmm5 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl (%esp),%esi - vpxor %xmm6,%xmm4,%xmm4 - xorl %ecx,%edx - movl 4(%esp),%edi - xorl %edi,%esi - vpsrld $10,%xmm7,%xmm6 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,28(%esp) - vpxor %xmm5,%xmm4,%xmm4 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrlq $17,%xmm7,%xmm5 - movl %ebx,%ecx - addl %edi,%edx - movl 16(%esp),%edi - 
vpaddd %xmm4,%xmm1,%xmm1 - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,12(%esp) - vpxor %xmm5,%xmm6,%xmm6 - xorl %ebx,%ecx - xorl %edi,%ebx - addl 8(%esp),%edx - vpsrlq $19,%xmm7,%xmm7 - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - vpxor %xmm7,%xmm6,%xmm6 - addl 52(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - vpshufd $132,%xmm6,%xmm7 - addl %edx,%eax - addl 24(%esp),%edx - addl %ecx,%eax - vpsrldq $8,%xmm7,%xmm7 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 28(%esp),%esi - vpaddd %xmm7,%xmm1,%xmm1 - xorl %ecx,%edx - movl (%esp),%edi - xorl %edi,%esi - vpshufd $80,%xmm1,%xmm7 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - vpsrld $10,%xmm7,%xmm6 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrlq $17,%xmm7,%xmm5 - movl %eax,%ecx - addl %edi,%edx - movl 12(%esp),%edi - vpxor %xmm5,%xmm6,%xmm6 - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,8(%esp) - vpsrlq $19,%xmm7,%xmm7 - xorl %eax,%ecx - xorl %edi,%eax - addl 4(%esp),%edx - vpxor %xmm7,%xmm6,%xmm6 - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - vpshufd $232,%xmm6,%xmm7 - addl 56(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - vpslldq $8,%xmm7,%xmm7 - addl %edx,%ebx - addl 20(%esp),%edx - addl %ecx,%ebx - vpaddd %xmm7,%xmm1,%xmm1 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 24(%esp),%esi - vpaddd 16(%ebp),%xmm1,%xmm6 - xorl %ecx,%edx - movl 28(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,20(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 8(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,4(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl (%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 60(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl 16(%esp),%edx - addl %ecx,%eax - vmovdqa %xmm6,48(%esp) - vpalignr $4,%xmm2,%xmm3,%xmm4 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 20(%esp),%esi - vpalignr 
$4,%xmm0,%xmm1,%xmm7 - xorl %ecx,%edx - movl 24(%esp),%edi - xorl %edi,%esi - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - vpaddd %xmm7,%xmm2,%xmm2 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrld $3,%xmm4,%xmm7 - movl %eax,%ecx - addl %edi,%edx - movl 4(%esp),%edi - vpslld $14,%xmm4,%xmm5 - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,(%esp) - vpxor %xmm6,%xmm7,%xmm4 - xorl %eax,%ecx - xorl %edi,%eax - addl 28(%esp),%edx - vpshufd $250,%xmm1,%xmm7 - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - vpsrld $11,%xmm6,%xmm6 - addl 64(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - vpxor %xmm5,%xmm4,%xmm4 - addl %edx,%ebx - addl 12(%esp),%edx - addl %ecx,%ebx - vpslld $11,%xmm5,%xmm5 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 16(%esp),%esi - vpxor %xmm6,%xmm4,%xmm4 - xorl %ecx,%edx - movl 20(%esp),%edi - xorl %edi,%esi - vpsrld $10,%xmm7,%xmm6 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,12(%esp) - vpxor %xmm5,%xmm4,%xmm4 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrlq $17,%xmm7,%xmm5 - movl %ebx,%ecx - addl %edi,%edx - movl (%esp),%edi - vpaddd %xmm4,%xmm2,%xmm2 - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,28(%esp) - vpxor %xmm5,%xmm6,%xmm6 - xorl %ebx,%ecx - xorl %edi,%ebx - addl 24(%esp),%edx - vpsrlq $19,%xmm7,%xmm7 - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - vpxor %xmm7,%xmm6,%xmm6 - addl 68(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - vpshufd $132,%xmm6,%xmm7 - addl %edx,%eax - addl 8(%esp),%edx - addl %ecx,%eax - vpsrldq $8,%xmm7,%xmm7 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 12(%esp),%esi - vpaddd %xmm7,%xmm2,%xmm2 - xorl %ecx,%edx - movl 16(%esp),%edi - xorl %edi,%esi - vpshufd $80,%xmm2,%xmm7 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - vpsrld $10,%xmm7,%xmm6 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrlq $17,%xmm7,%xmm5 - movl %eax,%ecx - addl %edi,%edx - movl 28(%esp),%edi - vpxor %xmm5,%xmm6,%xmm6 - movl 
%eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,24(%esp) - vpsrlq $19,%xmm7,%xmm7 - xorl %eax,%ecx - xorl %edi,%eax - addl 20(%esp),%edx - vpxor %xmm7,%xmm6,%xmm6 - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - vpshufd $232,%xmm6,%xmm7 - addl 72(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - vpslldq $8,%xmm7,%xmm7 - addl %edx,%ebx - addl 4(%esp),%edx - addl %ecx,%ebx - vpaddd %xmm7,%xmm2,%xmm2 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 8(%esp),%esi - vpaddd 32(%ebp),%xmm2,%xmm6 - xorl %ecx,%edx - movl 12(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,4(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 24(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,20(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 16(%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 76(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl (%esp),%edx - addl %ecx,%eax - vmovdqa %xmm6,64(%esp) - vpalignr $4,%xmm3,%xmm0,%xmm4 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 4(%esp),%esi - vpalignr $4,%xmm1,%xmm2,%xmm7 - xorl %ecx,%edx - movl 8(%esp),%edi - xorl %edi,%esi - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,(%esp) - vpaddd %xmm7,%xmm3,%xmm3 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrld $3,%xmm4,%xmm7 - movl %eax,%ecx - addl %edi,%edx - movl 20(%esp),%edi - vpslld $14,%xmm4,%xmm5 - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,16(%esp) - vpxor %xmm6,%xmm7,%xmm4 - xorl %eax,%ecx - xorl %edi,%eax - addl 12(%esp),%edx - vpshufd $250,%xmm2,%xmm7 - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - vpsrld $11,%xmm6,%xmm6 - addl 80(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - vpxor %xmm5,%xmm4,%xmm4 - addl %edx,%ebx - addl 28(%esp),%edx - addl %ecx,%ebx - vpslld $11,%xmm5,%xmm5 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl (%esp),%esi - vpxor %xmm6,%xmm4,%xmm4 - xorl %ecx,%edx - movl 
4(%esp),%edi - xorl %edi,%esi - vpsrld $10,%xmm7,%xmm6 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,28(%esp) - vpxor %xmm5,%xmm4,%xmm4 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrlq $17,%xmm7,%xmm5 - movl %ebx,%ecx - addl %edi,%edx - movl 16(%esp),%edi - vpaddd %xmm4,%xmm3,%xmm3 - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,12(%esp) - vpxor %xmm5,%xmm6,%xmm6 - xorl %ebx,%ecx - xorl %edi,%ebx - addl 8(%esp),%edx - vpsrlq $19,%xmm7,%xmm7 - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - vpxor %xmm7,%xmm6,%xmm6 - addl 84(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - vpshufd $132,%xmm6,%xmm7 - addl %edx,%eax - addl 24(%esp),%edx - addl %ecx,%eax - vpsrldq $8,%xmm7,%xmm7 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 28(%esp),%esi - vpaddd %xmm7,%xmm3,%xmm3 - xorl %ecx,%edx - movl (%esp),%edi - xorl %edi,%esi - vpshufd $80,%xmm3,%xmm7 - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - vpsrld $10,%xmm7,%xmm6 - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - vpsrlq $17,%xmm7,%xmm5 - movl %eax,%ecx - addl %edi,%edx - movl 12(%esp),%edi - vpxor %xmm5,%xmm6,%xmm6 - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,8(%esp) - vpsrlq $19,%xmm7,%xmm7 - xorl %eax,%ecx - xorl %edi,%eax - addl 4(%esp),%edx - vpxor %xmm7,%xmm6,%xmm6 - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - vpshufd $232,%xmm6,%xmm7 - addl 88(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - vpslldq $8,%xmm7,%xmm7 - addl %edx,%ebx - addl 20(%esp),%edx - addl %ecx,%ebx - vpaddd %xmm7,%xmm3,%xmm3 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 24(%esp),%esi - vpaddd 48(%ebp),%xmm3,%xmm6 - xorl %ecx,%edx - movl 28(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,20(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 8(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,4(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl (%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax 
- xorl %esi,%ecx - addl 92(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl 16(%esp),%edx - addl %ecx,%eax - vmovdqa %xmm6,80(%esp) - cmpl $66051,64(%ebp) - jne .L013avx_00_47 - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 20(%esp),%esi - xorl %ecx,%edx - movl 24(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %eax,%ecx - addl %edi,%edx - movl 4(%esp),%edi - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 28(%esp),%edx - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 32(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - addl %edx,%ebx - addl 12(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 16(%esp),%esi - xorl %ecx,%edx - movl 20(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,12(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl (%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,28(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 24(%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 36(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl 8(%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 12(%esp),%esi - xorl %ecx,%edx - movl 16(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,8(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %eax,%ecx - addl %edi,%edx - movl 28(%esp),%edi - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,24(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 20(%esp),%edx - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 40(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - addl %edx,%ebx - addl 4(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 8(%esp),%esi - xorl %ecx,%edx - movl 12(%esp),%edi - xorl 
%edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,4(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 24(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,20(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 16(%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 44(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl (%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 4(%esp),%esi - xorl %ecx,%edx - movl 8(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %eax,%ecx - addl %edi,%edx - movl 20(%esp),%edi - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,16(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 12(%esp),%edx - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 48(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - addl %edx,%ebx - addl 28(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - shrdl $14,%edx,%edx - movl (%esp),%esi - xorl %ecx,%edx - movl 4(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,28(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 16(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,12(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 8(%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 52(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl 24(%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 28(%esp),%esi - xorl %ecx,%edx - movl (%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %eax,%ecx - addl %edi,%edx - movl 12(%esp),%edi - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,8(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 4(%esp),%edx - shrdl 
$11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 56(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - addl %edx,%ebx - addl 20(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 24(%esp),%esi - xorl %ecx,%edx - movl 28(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,20(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 8(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,4(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl (%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 60(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl 16(%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 20(%esp),%esi - xorl %ecx,%edx - movl 24(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,16(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %eax,%ecx - addl %edi,%edx - movl 4(%esp),%edi - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 28(%esp),%edx - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 64(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - addl %edx,%ebx - addl 12(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 16(%esp),%esi - xorl %ecx,%edx - movl 20(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,12(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl (%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,28(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 24(%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 68(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl 8(%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 12(%esp),%esi - xorl %ecx,%edx - movl 16(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl 
%ecx,%esi - movl %ecx,8(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %eax,%ecx - addl %edi,%edx - movl 28(%esp),%edi - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,24(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 20(%esp),%edx - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 72(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - addl %edx,%ebx - addl 4(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 8(%esp),%esi - xorl %ecx,%edx - movl 12(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,4(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 24(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,20(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 16(%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 76(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl (%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 4(%esp),%esi - xorl %ecx,%edx - movl 8(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %eax,%ecx - addl %edi,%edx - movl 20(%esp),%edi - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,16(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 12(%esp),%edx - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 80(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - addl %edx,%ebx - addl 28(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - shrdl $14,%edx,%edx - movl (%esp),%esi - xorl %ecx,%edx - movl 4(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,28(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 16(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,12(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl 8(%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl 
%esi,%ecx - addl 84(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl 24(%esp),%edx - addl %ecx,%eax - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 28(%esp),%esi - xorl %ecx,%edx - movl (%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,24(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %eax,%ecx - addl %edi,%edx - movl 12(%esp),%edi - movl %eax,%esi - shrdl $9,%ecx,%ecx - movl %eax,8(%esp) - xorl %eax,%ecx - xorl %edi,%eax - addl 4(%esp),%edx - shrdl $11,%ecx,%ecx - andl %eax,%ebx - xorl %esi,%ecx - addl 88(%esp),%edx - xorl %edi,%ebx - shrdl $2,%ecx,%ecx - addl %edx,%ebx - addl 20(%esp),%edx - addl %ecx,%ebx - movl %edx,%ecx - shrdl $14,%edx,%edx - movl 24(%esp),%esi - xorl %ecx,%edx - movl 28(%esp),%edi - xorl %edi,%esi - shrdl $5,%edx,%edx - andl %ecx,%esi - movl %ecx,20(%esp) - xorl %ecx,%edx - xorl %esi,%edi - shrdl $6,%edx,%edx - movl %ebx,%ecx - addl %edi,%edx - movl 8(%esp),%edi - movl %ebx,%esi - shrdl $9,%ecx,%ecx - movl %ebx,4(%esp) - xorl %ebx,%ecx - xorl %edi,%ebx - addl (%esp),%edx - shrdl $11,%ecx,%ecx - andl %ebx,%eax - xorl %esi,%ecx - addl 92(%esp),%edx - xorl %edi,%eax - shrdl $2,%ecx,%ecx - addl %edx,%eax - addl 16(%esp),%edx - addl %ecx,%eax - movl 96(%esp),%esi - xorl %edi,%ebx - movl 12(%esp),%ecx - addl (%esi),%eax - addl 4(%esi),%ebx - addl 8(%esi),%edi - addl 12(%esi),%ecx - movl %eax,(%esi) - movl %ebx,4(%esi) - movl %edi,8(%esi) - movl %ecx,12(%esi) - movl %ebx,4(%esp) - xorl %edi,%ebx - movl %edi,8(%esp) - movl %ecx,12(%esp) - movl 20(%esp),%edi - movl 24(%esp),%ecx - addl 16(%esi),%edx - addl 20(%esi),%edi - addl 24(%esi),%ecx - movl %edx,16(%esi) - movl %edi,20(%esi) - movl %edi,20(%esp) - movl 28(%esp),%edi - movl %ecx,24(%esi) - addl 28(%esi),%edi - movl %ecx,24(%esp) - movl %edi,28(%esi) - movl %edi,28(%esp) - movl 100(%esp),%edi - vmovdqa 64(%ebp),%xmm7 - subl $192,%ebp - cmpl 104(%esp),%edi - jb .L012grand_avx - movl 108(%esp),%esp - vzeroall - popl %edi - 
popl %esi - popl %ebx - popl %ebp - ret -.size sha256_block_data_order,.-.L_sha256_block_data_order_begin -#endif -.section .note.GNU-stack,"",@progbits diff --git a/third_party/boringssl/linux-x86/crypto/fipsmodule/sha512-586.S b/third_party/boringssl/linux-x86/crypto/fipsmodule/sha512-586.S deleted file mode 100644 index 282cddaa..00000000 --- a/third_party/boringssl/linux-x86/crypto/fipsmodule/sha512-586.S +++ /dev/null @@ -1,2837 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if defined(__i386__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text -.globl sha512_block_data_order -.hidden sha512_block_data_order -.type sha512_block_data_order,@function -.align 16 -sha512_block_data_order: -.L_sha512_block_data_order_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - movl 20(%esp),%esi - movl 24(%esp),%edi - movl 28(%esp),%eax - movl %esp,%ebx - call .L000pic_point -.L000pic_point: - popl %ebp - leal .L001K512-.L000pic_point(%ebp),%ebp - subl $16,%esp - andl $-64,%esp - shll $7,%eax - addl %edi,%eax - movl %esi,(%esp) - movl %edi,4(%esp) - movl %eax,8(%esp) - movl %ebx,12(%esp) - leal OPENSSL_ia32cap_P-.L001K512(%ebp),%edx - movl (%edx),%ecx - testl $67108864,%ecx - jz .L002loop_x86 - movl 4(%edx),%edx - movq (%esi),%mm0 - andl $16777216,%ecx - movq 8(%esi),%mm1 - andl $512,%edx - movq 16(%esi),%mm2 - orl %edx,%ecx - movq 24(%esi),%mm3 - movq 32(%esi),%mm4 - movq 40(%esi),%mm5 - movq 48(%esi),%mm6 - movq 56(%esi),%mm7 - cmpl $16777728,%ecx - je .L003SSSE3 - subl $80,%esp - jmp .L004loop_sse2 -.align 16 -.L004loop_sse2: - movq %mm1,8(%esp) - movq %mm2,16(%esp) - movq %mm3,24(%esp) - movq %mm5,40(%esp) - movq %mm6,48(%esp) - pxor %mm1,%mm2 - movq %mm7,56(%esp) - movq %mm0,%mm3 - movl (%edi),%eax - movl 4(%edi),%ebx - addl $8,%edi - movl $15,%edx - bswap %eax - bswap %ebx - jmp .L00500_14_sse2 -.align 16 -.L00500_14_sse2: - movd %eax,%mm1 - movl (%edi),%eax - movd 
%ebx,%mm7 - movl 4(%edi),%ebx - addl $8,%edi - bswap %eax - bswap %ebx - punpckldq %mm1,%mm7 - movq %mm4,%mm1 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,32(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - movq %mm3,%mm0 - movq %mm7,72(%esp) - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm0,(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 56(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - paddq (%ebp),%mm7 - pxor %mm4,%mm3 - movq 24(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm0,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm0,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 8(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - subl $8,%esp - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm0,%mm2 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - pxor %mm7,%mm6 - movq 40(%esp),%mm5 - paddq %mm2,%mm3 - movq %mm0,%mm2 - addl $8,%ebp - paddq %mm6,%mm3 - movq 48(%esp),%mm6 - decl %edx - jnz .L00500_14_sse2 - movd %eax,%mm1 - movd %ebx,%mm7 - punpckldq %mm1,%mm7 - movq %mm4,%mm1 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,32(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - movq %mm3,%mm0 - movq %mm7,72(%esp) - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm0,(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 56(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - paddq (%ebp),%mm7 - pxor %mm4,%mm3 - movq 24(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm0,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm0,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 8(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - subl $8,%esp - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm0,%mm2 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - pxor %mm7,%mm6 - movq 192(%esp),%mm7 - paddq %mm2,%mm3 - movq %mm0,%mm2 - addl $8,%ebp - paddq %mm6,%mm3 - pxor %mm0,%mm0 - movl $32,%edx - jmp 
.L00616_79_sse2 -.align 16 -.L00616_79_sse2: - movq 88(%esp),%mm5 - movq %mm7,%mm1 - psrlq $1,%mm7 - movq %mm5,%mm6 - psrlq $6,%mm5 - psllq $56,%mm1 - paddq %mm3,%mm0 - movq %mm7,%mm3 - psrlq $6,%mm7 - pxor %mm1,%mm3 - psllq $7,%mm1 - pxor %mm7,%mm3 - psrlq $1,%mm7 - pxor %mm1,%mm3 - movq %mm5,%mm1 - psrlq $13,%mm5 - pxor %mm3,%mm7 - psllq $3,%mm6 - pxor %mm5,%mm1 - paddq 200(%esp),%mm7 - pxor %mm6,%mm1 - psrlq $42,%mm5 - paddq 128(%esp),%mm7 - pxor %mm5,%mm1 - psllq $42,%mm6 - movq 40(%esp),%mm5 - pxor %mm6,%mm1 - movq 48(%esp),%mm6 - paddq %mm1,%mm7 - movq %mm4,%mm1 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,32(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - movq %mm7,72(%esp) - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm0,(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 56(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - paddq (%ebp),%mm7 - pxor %mm4,%mm3 - movq 24(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm0,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm0,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 8(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - subl $8,%esp - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm0,%mm2 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - pxor %mm7,%mm6 - movq 192(%esp),%mm7 - paddq %mm6,%mm2 - addl $8,%ebp - movq 88(%esp),%mm5 - movq %mm7,%mm1 - psrlq $1,%mm7 - movq %mm5,%mm6 - psrlq $6,%mm5 - psllq $56,%mm1 - paddq %mm3,%mm2 - movq %mm7,%mm3 - psrlq $6,%mm7 - pxor %mm1,%mm3 - psllq $7,%mm1 - pxor %mm7,%mm3 - psrlq $1,%mm7 - pxor %mm1,%mm3 - movq %mm5,%mm1 - psrlq $13,%mm5 - pxor %mm3,%mm7 - psllq $3,%mm6 - pxor %mm5,%mm1 - paddq 200(%esp),%mm7 - pxor %mm6,%mm1 - psrlq $42,%mm5 - paddq 128(%esp),%mm7 - pxor %mm5,%mm1 - psllq $42,%mm6 - movq 40(%esp),%mm5 - pxor %mm6,%mm1 - movq 48(%esp),%mm6 - paddq %mm1,%mm7 - movq %mm4,%mm1 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,32(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 
- movq %mm7,72(%esp) - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm2,(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 56(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - paddq (%ebp),%mm7 - pxor %mm4,%mm3 - movq 24(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm2,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm2,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 8(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - subl $8,%esp - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm2,%mm0 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - pxor %mm7,%mm6 - movq 192(%esp),%mm7 - paddq %mm6,%mm0 - addl $8,%ebp - decl %edx - jnz .L00616_79_sse2 - paddq %mm3,%mm0 - movq 8(%esp),%mm1 - movq 24(%esp),%mm3 - movq 40(%esp),%mm5 - movq 48(%esp),%mm6 - movq 56(%esp),%mm7 - pxor %mm1,%mm2 - paddq (%esi),%mm0 - paddq 8(%esi),%mm1 - paddq 16(%esi),%mm2 - paddq 24(%esi),%mm3 - paddq 32(%esi),%mm4 - paddq 40(%esi),%mm5 - paddq 48(%esi),%mm6 - paddq 56(%esi),%mm7 - movl $640,%eax - movq %mm0,(%esi) - movq %mm1,8(%esi) - movq %mm2,16(%esi) - movq %mm3,24(%esi) - movq %mm4,32(%esi) - movq %mm5,40(%esi) - movq %mm6,48(%esi) - movq %mm7,56(%esi) - leal (%esp,%eax,1),%esp - subl %eax,%ebp - cmpl 88(%esp),%edi - jb .L004loop_sse2 - movl 92(%esp),%esp - emms - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.align 32 -.L003SSSE3: - leal -64(%esp),%edx - subl $256,%esp - movdqa 640(%ebp),%xmm1 - movdqu (%edi),%xmm0 -.byte 102,15,56,0,193 - movdqa (%ebp),%xmm3 - movdqa %xmm1,%xmm2 - movdqu 16(%edi),%xmm1 - paddq %xmm0,%xmm3 -.byte 102,15,56,0,202 - movdqa %xmm3,-128(%edx) - movdqa 16(%ebp),%xmm4 - movdqa %xmm2,%xmm3 - movdqu 32(%edi),%xmm2 - paddq %xmm1,%xmm4 -.byte 102,15,56,0,211 - movdqa %xmm4,-112(%edx) - movdqa 32(%ebp),%xmm5 - movdqa %xmm3,%xmm4 - movdqu 48(%edi),%xmm3 - paddq %xmm2,%xmm5 -.byte 102,15,56,0,220 - movdqa %xmm5,-96(%edx) - movdqa 48(%ebp),%xmm6 - movdqa %xmm4,%xmm5 
- movdqu 64(%edi),%xmm4 - paddq %xmm3,%xmm6 -.byte 102,15,56,0,229 - movdqa %xmm6,-80(%edx) - movdqa 64(%ebp),%xmm7 - movdqa %xmm5,%xmm6 - movdqu 80(%edi),%xmm5 - paddq %xmm4,%xmm7 -.byte 102,15,56,0,238 - movdqa %xmm7,-64(%edx) - movdqa %xmm0,(%edx) - movdqa 80(%ebp),%xmm0 - movdqa %xmm6,%xmm7 - movdqu 96(%edi),%xmm6 - paddq %xmm5,%xmm0 -.byte 102,15,56,0,247 - movdqa %xmm0,-48(%edx) - movdqa %xmm1,16(%edx) - movdqa 96(%ebp),%xmm1 - movdqa %xmm7,%xmm0 - movdqu 112(%edi),%xmm7 - paddq %xmm6,%xmm1 -.byte 102,15,56,0,248 - movdqa %xmm1,-32(%edx) - movdqa %xmm2,32(%edx) - movdqa 112(%ebp),%xmm2 - movdqa (%edx),%xmm0 - paddq %xmm7,%xmm2 - movdqa %xmm2,-16(%edx) - nop -.align 32 -.L007loop_ssse3: - movdqa 16(%edx),%xmm2 - movdqa %xmm3,48(%edx) - leal 128(%ebp),%ebp - movq %mm1,8(%esp) - movl %edi,%ebx - movq %mm2,16(%esp) - leal 128(%edi),%edi - movq %mm3,24(%esp) - cmpl %eax,%edi - movq %mm5,40(%esp) - cmovbl %edi,%ebx - movq %mm6,48(%esp) - movl $4,%ecx - pxor %mm1,%mm2 - movq %mm7,56(%esp) - pxor %mm3,%mm3 - jmp .L00800_47_ssse3 -.align 32 -.L00800_47_ssse3: - movdqa %xmm5,%xmm3 - movdqa %xmm2,%xmm1 -.byte 102,15,58,15,208,8 - movdqa %xmm4,(%edx) -.byte 102,15,58,15,220,8 - movdqa %xmm2,%xmm4 - psrlq $7,%xmm2 - paddq %xmm3,%xmm0 - movdqa %xmm4,%xmm3 - psrlq $1,%xmm4 - psllq $56,%xmm3 - pxor %xmm4,%xmm2 - psrlq $7,%xmm4 - pxor %xmm3,%xmm2 - psllq $7,%xmm3 - pxor %xmm4,%xmm2 - movdqa %xmm7,%xmm4 - pxor %xmm3,%xmm2 - movdqa %xmm7,%xmm3 - psrlq $6,%xmm4 - paddq %xmm2,%xmm0 - movdqa %xmm7,%xmm2 - psrlq $19,%xmm3 - psllq $3,%xmm2 - pxor %xmm3,%xmm4 - psrlq $42,%xmm3 - pxor %xmm2,%xmm4 - psllq $42,%xmm2 - pxor %xmm3,%xmm4 - movdqa 32(%edx),%xmm3 - pxor %xmm2,%xmm4 - movdqa (%ebp),%xmm2 - movq %mm4,%mm1 - paddq %xmm4,%xmm0 - movq -128(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,32(%esp) - paddq %xmm0,%xmm2 - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm0 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - 
movq %mm0,(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 56(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 24(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm0,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm0,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 8(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm0,%mm2 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - pxor %mm7,%mm6 - movq 32(%esp),%mm5 - paddq %mm6,%mm2 - movq 40(%esp),%mm6 - movq %mm4,%mm1 - movq -120(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,24(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm2 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm2,56(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 48(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 16(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm2,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm2,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq (%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm2,%mm0 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - pxor %mm7,%mm6 - movq 24(%esp),%mm5 - paddq %mm6,%mm0 - movq 32(%esp),%mm6 - movdqa %xmm2,-128(%edx) - movdqa %xmm6,%xmm4 - movdqa %xmm3,%xmm2 -.byte 102,15,58,15,217,8 - movdqa %xmm5,16(%edx) -.byte 102,15,58,15,229,8 - movdqa %xmm3,%xmm5 - psrlq $7,%xmm3 - paddq %xmm4,%xmm1 - movdqa %xmm5,%xmm4 - psrlq $1,%xmm5 - psllq $56,%xmm4 - pxor %xmm5,%xmm3 - psrlq $7,%xmm5 - pxor %xmm4,%xmm3 - psllq $7,%xmm4 - pxor %xmm5,%xmm3 - movdqa %xmm0,%xmm5 - pxor %xmm4,%xmm3 - movdqa %xmm0,%xmm4 - psrlq $6,%xmm5 - paddq %xmm3,%xmm1 - movdqa %xmm0,%xmm3 - psrlq $19,%xmm4 - psllq $3,%xmm3 - pxor %xmm4,%xmm5 - psrlq $42,%xmm4 - pxor %xmm3,%xmm5 - psllq $42,%xmm3 - pxor %xmm4,%xmm5 - movdqa 48(%edx),%xmm4 - 
pxor %xmm3,%xmm5 - movdqa 16(%ebp),%xmm3 - movq %mm4,%mm1 - paddq %xmm5,%xmm1 - movq -112(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,16(%esp) - paddq %xmm1,%xmm3 - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm0 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm0,48(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 40(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 8(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm0,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm0,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 56(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm0,%mm2 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - pxor %mm7,%mm6 - movq 16(%esp),%mm5 - paddq %mm6,%mm2 - movq 24(%esp),%mm6 - movq %mm4,%mm1 - movq -104(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,8(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm2 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm2,40(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 32(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq (%esp),%mm4 - paddq %mm7,%mm3 - movq %mm2,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm2,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 48(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm2,%mm0 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - pxor %mm7,%mm6 - movq 8(%esp),%mm5 - paddq %mm6,%mm0 - movq 16(%esp),%mm6 - movdqa %xmm3,-112(%edx) - movdqa %xmm7,%xmm5 - movdqa %xmm4,%xmm3 -.byte 102,15,58,15,226,8 - movdqa %xmm6,32(%edx) -.byte 102,15,58,15,238,8 - movdqa %xmm4,%xmm6 - psrlq $7,%xmm4 - paddq %xmm5,%xmm2 - movdqa %xmm6,%xmm5 - psrlq $1,%xmm6 - psllq $56,%xmm5 - pxor %xmm6,%xmm4 - psrlq 
$7,%xmm6 - pxor %xmm5,%xmm4 - psllq $7,%xmm5 - pxor %xmm6,%xmm4 - movdqa %xmm1,%xmm6 - pxor %xmm5,%xmm4 - movdqa %xmm1,%xmm5 - psrlq $6,%xmm6 - paddq %xmm4,%xmm2 - movdqa %xmm1,%xmm4 - psrlq $19,%xmm5 - psllq $3,%xmm4 - pxor %xmm5,%xmm6 - psrlq $42,%xmm5 - pxor %xmm4,%xmm6 - psllq $42,%xmm4 - pxor %xmm5,%xmm6 - movdqa (%edx),%xmm5 - pxor %xmm4,%xmm6 - movdqa 32(%ebp),%xmm4 - movq %mm4,%mm1 - paddq %xmm6,%xmm2 - movq -96(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,(%esp) - paddq %xmm2,%xmm4 - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm0 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm0,32(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 24(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 56(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm0,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm0,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 40(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm0,%mm2 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - pxor %mm7,%mm6 - movq (%esp),%mm5 - paddq %mm6,%mm2 - movq 8(%esp),%mm6 - movq %mm4,%mm1 - movq -88(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,56(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm2 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm2,24(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 16(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 48(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm2,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm2,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 32(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm2,%mm0 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - pxor %mm7,%mm6 - movq 
56(%esp),%mm5 - paddq %mm6,%mm0 - movq (%esp),%mm6 - movdqa %xmm4,-96(%edx) - movdqa %xmm0,%xmm6 - movdqa %xmm5,%xmm4 -.byte 102,15,58,15,235,8 - movdqa %xmm7,48(%edx) -.byte 102,15,58,15,247,8 - movdqa %xmm5,%xmm7 - psrlq $7,%xmm5 - paddq %xmm6,%xmm3 - movdqa %xmm7,%xmm6 - psrlq $1,%xmm7 - psllq $56,%xmm6 - pxor %xmm7,%xmm5 - psrlq $7,%xmm7 - pxor %xmm6,%xmm5 - psllq $7,%xmm6 - pxor %xmm7,%xmm5 - movdqa %xmm2,%xmm7 - pxor %xmm6,%xmm5 - movdqa %xmm2,%xmm6 - psrlq $6,%xmm7 - paddq %xmm5,%xmm3 - movdqa %xmm2,%xmm5 - psrlq $19,%xmm6 - psllq $3,%xmm5 - pxor %xmm6,%xmm7 - psrlq $42,%xmm6 - pxor %xmm5,%xmm7 - psllq $42,%xmm5 - pxor %xmm6,%xmm7 - movdqa 16(%edx),%xmm6 - pxor %xmm5,%xmm7 - movdqa 48(%ebp),%xmm5 - movq %mm4,%mm1 - paddq %xmm7,%xmm3 - movq -80(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,48(%esp) - paddq %xmm3,%xmm5 - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm0 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm0,16(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 8(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 40(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm0,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm0,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 24(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm0,%mm2 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - pxor %mm7,%mm6 - movq 48(%esp),%mm5 - paddq %mm6,%mm2 - movq 56(%esp),%mm6 - movq %mm4,%mm1 - movq -72(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,40(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm2 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm2,8(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq (%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 32(%esp),%mm4 - paddq %mm7,%mm3 - 
movq %mm2,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm2,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 16(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm2,%mm0 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - pxor %mm7,%mm6 - movq 40(%esp),%mm5 - paddq %mm6,%mm0 - movq 48(%esp),%mm6 - movdqa %xmm5,-80(%edx) - movdqa %xmm1,%xmm7 - movdqa %xmm6,%xmm5 -.byte 102,15,58,15,244,8 - movdqa %xmm0,(%edx) -.byte 102,15,58,15,248,8 - movdqa %xmm6,%xmm0 - psrlq $7,%xmm6 - paddq %xmm7,%xmm4 - movdqa %xmm0,%xmm7 - psrlq $1,%xmm0 - psllq $56,%xmm7 - pxor %xmm0,%xmm6 - psrlq $7,%xmm0 - pxor %xmm7,%xmm6 - psllq $7,%xmm7 - pxor %xmm0,%xmm6 - movdqa %xmm3,%xmm0 - pxor %xmm7,%xmm6 - movdqa %xmm3,%xmm7 - psrlq $6,%xmm0 - paddq %xmm6,%xmm4 - movdqa %xmm3,%xmm6 - psrlq $19,%xmm7 - psllq $3,%xmm6 - pxor %xmm7,%xmm0 - psrlq $42,%xmm7 - pxor %xmm6,%xmm0 - psllq $42,%xmm6 - pxor %xmm7,%xmm0 - movdqa 32(%edx),%xmm7 - pxor %xmm6,%xmm0 - movdqa 64(%ebp),%xmm6 - movq %mm4,%mm1 - paddq %xmm0,%xmm4 - movq -64(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,32(%esp) - paddq %xmm4,%xmm6 - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm0 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm0,(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 56(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 24(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm0,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm0,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 8(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm0,%mm2 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - pxor %mm7,%mm6 - movq 32(%esp),%mm5 - paddq %mm6,%mm2 - movq 40(%esp),%mm6 - movq %mm4,%mm1 - movq -56(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,24(%esp) - pand 
%mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm2 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm2,56(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 48(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 16(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm2,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm2,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq (%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm2,%mm0 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - pxor %mm7,%mm6 - movq 24(%esp),%mm5 - paddq %mm6,%mm0 - movq 32(%esp),%mm6 - movdqa %xmm6,-64(%edx) - movdqa %xmm2,%xmm0 - movdqa %xmm7,%xmm6 -.byte 102,15,58,15,253,8 - movdqa %xmm1,16(%edx) -.byte 102,15,58,15,193,8 - movdqa %xmm7,%xmm1 - psrlq $7,%xmm7 - paddq %xmm0,%xmm5 - movdqa %xmm1,%xmm0 - psrlq $1,%xmm1 - psllq $56,%xmm0 - pxor %xmm1,%xmm7 - psrlq $7,%xmm1 - pxor %xmm0,%xmm7 - psllq $7,%xmm0 - pxor %xmm1,%xmm7 - movdqa %xmm4,%xmm1 - pxor %xmm0,%xmm7 - movdqa %xmm4,%xmm0 - psrlq $6,%xmm1 - paddq %xmm7,%xmm5 - movdqa %xmm4,%xmm7 - psrlq $19,%xmm0 - psllq $3,%xmm7 - pxor %xmm0,%xmm1 - psrlq $42,%xmm0 - pxor %xmm7,%xmm1 - psllq $42,%xmm7 - pxor %xmm0,%xmm1 - movdqa 48(%edx),%xmm0 - pxor %xmm7,%xmm1 - movdqa 80(%ebp),%xmm7 - movq %mm4,%mm1 - paddq %xmm1,%xmm5 - movq -48(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,16(%esp) - paddq %xmm5,%xmm7 - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm0 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm0,48(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 40(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 8(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm0,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm0,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 56(%esp),%mm1 - psrlq $6,%mm5 - pxor 
%mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm0,%mm2 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - pxor %mm7,%mm6 - movq 16(%esp),%mm5 - paddq %mm6,%mm2 - movq 24(%esp),%mm6 - movq %mm4,%mm1 - movq -40(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,8(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm2 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm2,40(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 32(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq (%esp),%mm4 - paddq %mm7,%mm3 - movq %mm2,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm2,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 48(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm2,%mm0 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - pxor %mm7,%mm6 - movq 8(%esp),%mm5 - paddq %mm6,%mm0 - movq 16(%esp),%mm6 - movdqa %xmm7,-48(%edx) - movdqa %xmm3,%xmm1 - movdqa %xmm0,%xmm7 -.byte 102,15,58,15,198,8 - movdqa %xmm2,32(%edx) -.byte 102,15,58,15,202,8 - movdqa %xmm0,%xmm2 - psrlq $7,%xmm0 - paddq %xmm1,%xmm6 - movdqa %xmm2,%xmm1 - psrlq $1,%xmm2 - psllq $56,%xmm1 - pxor %xmm2,%xmm0 - psrlq $7,%xmm2 - pxor %xmm1,%xmm0 - psllq $7,%xmm1 - pxor %xmm2,%xmm0 - movdqa %xmm5,%xmm2 - pxor %xmm1,%xmm0 - movdqa %xmm5,%xmm1 - psrlq $6,%xmm2 - paddq %xmm0,%xmm6 - movdqa %xmm5,%xmm0 - psrlq $19,%xmm1 - psllq $3,%xmm0 - pxor %xmm1,%xmm2 - psrlq $42,%xmm1 - pxor %xmm0,%xmm2 - psllq $42,%xmm0 - pxor %xmm1,%xmm2 - movdqa (%edx),%xmm1 - pxor %xmm0,%xmm2 - movdqa 96(%ebp),%xmm0 - movq %mm4,%mm1 - paddq %xmm2,%xmm6 - movq -32(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,(%esp) - paddq %xmm6,%xmm0 - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm0 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq 
%mm0,32(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 24(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 56(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm0,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm0,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 40(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm0,%mm2 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - pxor %mm7,%mm6 - movq (%esp),%mm5 - paddq %mm6,%mm2 - movq 8(%esp),%mm6 - movq %mm4,%mm1 - movq -24(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,56(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm2 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm2,24(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 16(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 48(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm2,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm2,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 32(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm2,%mm0 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - pxor %mm7,%mm6 - movq 56(%esp),%mm5 - paddq %mm6,%mm0 - movq (%esp),%mm6 - movdqa %xmm0,-32(%edx) - movdqa %xmm4,%xmm2 - movdqa %xmm1,%xmm0 -.byte 102,15,58,15,207,8 - movdqa %xmm3,48(%edx) -.byte 102,15,58,15,211,8 - movdqa %xmm1,%xmm3 - psrlq $7,%xmm1 - paddq %xmm2,%xmm7 - movdqa %xmm3,%xmm2 - psrlq $1,%xmm3 - psllq $56,%xmm2 - pxor %xmm3,%xmm1 - psrlq $7,%xmm3 - pxor %xmm2,%xmm1 - psllq $7,%xmm2 - pxor %xmm3,%xmm1 - movdqa %xmm6,%xmm3 - pxor %xmm2,%xmm1 - movdqa %xmm6,%xmm2 - psrlq $6,%xmm3 - paddq %xmm1,%xmm7 - movdqa %xmm6,%xmm1 - psrlq $19,%xmm2 - psllq $3,%xmm1 - pxor %xmm2,%xmm3 - psrlq $42,%xmm2 - pxor %xmm1,%xmm3 - psllq $42,%xmm1 - pxor %xmm2,%xmm3 - movdqa 16(%edx),%xmm2 - pxor 
%xmm1,%xmm3 - movdqa 112(%ebp),%xmm1 - movq %mm4,%mm1 - paddq %xmm3,%xmm7 - movq -16(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,48(%esp) - paddq %xmm7,%xmm1 - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm0 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm0,16(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 8(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 40(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm0,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm0,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 24(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm0,%mm2 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - pxor %mm7,%mm6 - movq 48(%esp),%mm5 - paddq %mm6,%mm2 - movq 56(%esp),%mm6 - movq %mm4,%mm1 - movq -8(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,40(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm2 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm2,8(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq (%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 32(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm2,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm2,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 16(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm2,%mm0 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - pxor %mm7,%mm6 - movq 40(%esp),%mm5 - paddq %mm6,%mm0 - movq 48(%esp),%mm6 - movdqa %xmm1,-16(%edx) - leal 128(%ebp),%ebp - decl %ecx - jnz .L00800_47_ssse3 - movdqa (%ebp),%xmm1 - leal -640(%ebp),%ebp - movdqu (%ebx),%xmm0 -.byte 102,15,56,0,193 - movdqa (%ebp),%xmm3 - movdqa %xmm1,%xmm2 - movdqu 16(%ebx),%xmm1 - paddq %xmm0,%xmm3 -.byte 102,15,56,0,202 - movq 
%mm4,%mm1 - movq -128(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,32(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm0 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm0,(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 56(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 24(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm0,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm0,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 8(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm0,%mm2 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - pxor %mm7,%mm6 - movq 32(%esp),%mm5 - paddq %mm6,%mm2 - movq 40(%esp),%mm6 - movq %mm4,%mm1 - movq -120(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,24(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm2 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm2,56(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 48(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 16(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm2,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm2,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq (%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm2,%mm0 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - pxor %mm7,%mm6 - movq 24(%esp),%mm5 - paddq %mm6,%mm0 - movq 32(%esp),%mm6 - movdqa %xmm3,-128(%edx) - movdqa 16(%ebp),%xmm4 - movdqa %xmm2,%xmm3 - movdqu 32(%ebx),%xmm2 - paddq %xmm1,%xmm4 -.byte 102,15,56,0,211 - movq %mm4,%mm1 - movq -112(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,16(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm0 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor 
%mm1,%mm3 - movq %mm0,48(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 40(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 8(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm0,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm0,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 56(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm0,%mm2 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - pxor %mm7,%mm6 - movq 16(%esp),%mm5 - paddq %mm6,%mm2 - movq 24(%esp),%mm6 - movq %mm4,%mm1 - movq -104(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,8(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm2 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm2,40(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 32(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq (%esp),%mm4 - paddq %mm7,%mm3 - movq %mm2,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm2,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 48(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm2,%mm0 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - pxor %mm7,%mm6 - movq 8(%esp),%mm5 - paddq %mm6,%mm0 - movq 16(%esp),%mm6 - movdqa %xmm4,-112(%edx) - movdqa 32(%ebp),%xmm5 - movdqa %xmm3,%xmm4 - movdqu 48(%ebx),%xmm3 - paddq %xmm2,%xmm5 -.byte 102,15,56,0,220 - movq %mm4,%mm1 - movq -96(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm0 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm0,32(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 24(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 56(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm0,%mm5 - psrlq $28,%mm5 - paddq 
%mm3,%mm4 - movq %mm0,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 40(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm0,%mm2 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - pxor %mm7,%mm6 - movq (%esp),%mm5 - paddq %mm6,%mm2 - movq 8(%esp),%mm6 - movq %mm4,%mm1 - movq -88(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,56(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm2 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm2,24(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 16(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 48(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm2,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm2,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 32(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm2,%mm0 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - pxor %mm7,%mm6 - movq 56(%esp),%mm5 - paddq %mm6,%mm0 - movq (%esp),%mm6 - movdqa %xmm5,-96(%edx) - movdqa 48(%ebp),%xmm6 - movdqa %xmm4,%xmm5 - movdqu 64(%ebx),%xmm4 - paddq %xmm3,%xmm6 -.byte 102,15,56,0,229 - movq %mm4,%mm1 - movq -80(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,48(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm0 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm0,16(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 8(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 40(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm0,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm0,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 24(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm0,%mm2 - psllq $6,%mm6 - pxor 
%mm5,%mm7 - pxor %mm1,%mm2 - pxor %mm7,%mm6 - movq 48(%esp),%mm5 - paddq %mm6,%mm2 - movq 56(%esp),%mm6 - movq %mm4,%mm1 - movq -72(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,40(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm2 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm2,8(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq (%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 32(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm2,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm2,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 16(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm2,%mm0 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - pxor %mm7,%mm6 - movq 40(%esp),%mm5 - paddq %mm6,%mm0 - movq 48(%esp),%mm6 - movdqa %xmm6,-80(%edx) - movdqa 64(%ebp),%xmm7 - movdqa %xmm5,%xmm6 - movdqu 80(%ebx),%xmm5 - paddq %xmm4,%xmm7 -.byte 102,15,56,0,238 - movq %mm4,%mm1 - movq -64(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,32(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm0 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm0,(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 56(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 24(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm0,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm0,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 8(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm0,%mm2 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - pxor %mm7,%mm6 - movq 32(%esp),%mm5 - paddq %mm6,%mm2 - movq 40(%esp),%mm6 - movq %mm4,%mm1 - movq -56(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,24(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - 
paddq %mm3,%mm2 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm2,56(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 48(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 16(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm2,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm2,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq (%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm2,%mm0 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - pxor %mm7,%mm6 - movq 24(%esp),%mm5 - paddq %mm6,%mm0 - movq 32(%esp),%mm6 - movdqa %xmm7,-64(%edx) - movdqa %xmm0,(%edx) - movdqa 80(%ebp),%xmm0 - movdqa %xmm6,%xmm7 - movdqu 96(%ebx),%xmm6 - paddq %xmm5,%xmm0 -.byte 102,15,56,0,247 - movq %mm4,%mm1 - movq -48(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,16(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm0 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm0,48(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 40(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 8(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm0,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm0,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 56(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm0,%mm2 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - pxor %mm7,%mm6 - movq 16(%esp),%mm5 - paddq %mm6,%mm2 - movq 24(%esp),%mm6 - movq %mm4,%mm1 - movq -40(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,8(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm2 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm2,40(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 32(%esp),%mm7 - 
pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq (%esp),%mm4 - paddq %mm7,%mm3 - movq %mm2,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm2,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 48(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm2,%mm0 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - pxor %mm7,%mm6 - movq 8(%esp),%mm5 - paddq %mm6,%mm0 - movq 16(%esp),%mm6 - movdqa %xmm0,-48(%edx) - movdqa %xmm1,16(%edx) - movdqa 96(%ebp),%xmm1 - movdqa %xmm7,%xmm0 - movdqu 112(%ebx),%xmm7 - paddq %xmm6,%xmm1 -.byte 102,15,56,0,248 - movq %mm4,%mm1 - movq -32(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm0 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm0,32(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 24(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 56(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm0,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm0,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 40(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm0,%mm2 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - pxor %mm7,%mm6 - movq (%esp),%mm5 - paddq %mm6,%mm2 - movq 8(%esp),%mm6 - movq %mm4,%mm1 - movq -24(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,56(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm2 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm2,24(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 16(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 48(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm2,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm2,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 32(%esp),%mm1 - 
psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm2,%mm0 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - pxor %mm7,%mm6 - movq 56(%esp),%mm5 - paddq %mm6,%mm0 - movq (%esp),%mm6 - movdqa %xmm1,-32(%edx) - movdqa %xmm2,32(%edx) - movdqa 112(%ebp),%xmm2 - movdqa (%edx),%xmm0 - paddq %xmm7,%xmm2 - movq %mm4,%mm1 - movq -16(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,48(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm0 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm0,16(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq 8(%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 40(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm0,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm0,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 24(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm0,%mm2 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - pxor %mm7,%mm6 - movq 48(%esp),%mm5 - paddq %mm6,%mm2 - movq 56(%esp),%mm6 - movq %mm4,%mm1 - movq -8(%edx),%mm7 - pxor %mm6,%mm5 - psrlq $14,%mm1 - movq %mm4,40(%esp) - pand %mm4,%mm5 - psllq $23,%mm4 - paddq %mm3,%mm2 - movq %mm1,%mm3 - psrlq $4,%mm1 - pxor %mm6,%mm5 - pxor %mm4,%mm3 - psllq $23,%mm4 - pxor %mm1,%mm3 - movq %mm2,8(%esp) - paddq %mm5,%mm7 - pxor %mm4,%mm3 - psrlq $23,%mm1 - paddq (%esp),%mm7 - pxor %mm1,%mm3 - psllq $4,%mm4 - pxor %mm4,%mm3 - movq 32(%esp),%mm4 - paddq %mm7,%mm3 - movq %mm2,%mm5 - psrlq $28,%mm5 - paddq %mm3,%mm4 - movq %mm2,%mm6 - movq %mm5,%mm7 - psllq $25,%mm6 - movq 16(%esp),%mm1 - psrlq $6,%mm5 - pxor %mm6,%mm7 - psllq $5,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm2 - psrlq $5,%mm5 - pxor %mm6,%mm7 - pand %mm2,%mm0 - psllq $6,%mm6 - pxor %mm5,%mm7 - pxor %mm1,%mm0 - pxor %mm7,%mm6 - movq 40(%esp),%mm5 - paddq %mm6,%mm0 - movq 48(%esp),%mm6 - 
movdqa %xmm2,-16(%edx) - movq 8(%esp),%mm1 - paddq %mm3,%mm0 - movq 24(%esp),%mm3 - movq 56(%esp),%mm7 - pxor %mm1,%mm2 - paddq (%esi),%mm0 - paddq 8(%esi),%mm1 - paddq 16(%esi),%mm2 - paddq 24(%esi),%mm3 - paddq 32(%esi),%mm4 - paddq 40(%esi),%mm5 - paddq 48(%esi),%mm6 - paddq 56(%esi),%mm7 - movq %mm0,(%esi) - movq %mm1,8(%esi) - movq %mm2,16(%esi) - movq %mm3,24(%esi) - movq %mm4,32(%esi) - movq %mm5,40(%esi) - movq %mm6,48(%esi) - movq %mm7,56(%esi) - cmpl %eax,%edi - jb .L007loop_ssse3 - movl 76(%edx),%esp - emms - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.align 16 -.L002loop_x86: - movl (%edi),%eax - movl 4(%edi),%ebx - movl 8(%edi),%ecx - movl 12(%edi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - movl 16(%edi),%eax - movl 20(%edi),%ebx - movl 24(%edi),%ecx - movl 28(%edi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - movl 32(%edi),%eax - movl 36(%edi),%ebx - movl 40(%edi),%ecx - movl 44(%edi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - movl 48(%edi),%eax - movl 52(%edi),%ebx - movl 56(%edi),%ecx - movl 60(%edi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - movl 64(%edi),%eax - movl 68(%edi),%ebx - movl 72(%edi),%ecx - movl 76(%edi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - movl 80(%edi),%eax - movl 84(%edi),%ebx - movl 88(%edi),%ecx - movl 92(%edi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - movl 96(%edi),%eax - movl 100(%edi),%ebx - movl 104(%edi),%ecx - movl 108(%edi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - movl 112(%edi),%eax - movl 116(%edi),%ebx - movl 120(%edi),%ecx - movl 
124(%edi),%edx - bswap %eax - bswap %ebx - bswap %ecx - bswap %edx - pushl %eax - pushl %ebx - pushl %ecx - pushl %edx - addl $128,%edi - subl $72,%esp - movl %edi,204(%esp) - leal 8(%esp),%edi - movl $16,%ecx -.long 2784229001 -.align 16 -.L00900_15_x86: - movl 40(%esp),%ecx - movl 44(%esp),%edx - movl %ecx,%esi - shrl $9,%ecx - movl %edx,%edi - shrl $9,%edx - movl %ecx,%ebx - shll $14,%esi - movl %edx,%eax - shll $14,%edi - xorl %esi,%ebx - shrl $5,%ecx - xorl %edi,%eax - shrl $5,%edx - xorl %ecx,%eax - shll $4,%esi - xorl %edx,%ebx - shll $4,%edi - xorl %esi,%ebx - shrl $4,%ecx - xorl %edi,%eax - shrl $4,%edx - xorl %ecx,%eax - shll $5,%esi - xorl %edx,%ebx - shll $5,%edi - xorl %esi,%eax - xorl %edi,%ebx - movl 48(%esp),%ecx - movl 52(%esp),%edx - movl 56(%esp),%esi - movl 60(%esp),%edi - addl 64(%esp),%eax - adcl 68(%esp),%ebx - xorl %esi,%ecx - xorl %edi,%edx - andl 40(%esp),%ecx - andl 44(%esp),%edx - addl 192(%esp),%eax - adcl 196(%esp),%ebx - xorl %esi,%ecx - xorl %edi,%edx - movl (%ebp),%esi - movl 4(%ebp),%edi - addl %ecx,%eax - adcl %edx,%ebx - movl 32(%esp),%ecx - movl 36(%esp),%edx - addl %esi,%eax - adcl %edi,%ebx - movl %eax,(%esp) - movl %ebx,4(%esp) - addl %ecx,%eax - adcl %edx,%ebx - movl 8(%esp),%ecx - movl 12(%esp),%edx - movl %eax,32(%esp) - movl %ebx,36(%esp) - movl %ecx,%esi - shrl $2,%ecx - movl %edx,%edi - shrl $2,%edx - movl %ecx,%ebx - shll $4,%esi - movl %edx,%eax - shll $4,%edi - xorl %esi,%ebx - shrl $5,%ecx - xorl %edi,%eax - shrl $5,%edx - xorl %ecx,%ebx - shll $21,%esi - xorl %edx,%eax - shll $21,%edi - xorl %esi,%eax - shrl $21,%ecx - xorl %edi,%ebx - shrl $21,%edx - xorl %ecx,%eax - shll $5,%esi - xorl %edx,%ebx - shll $5,%edi - xorl %esi,%eax - xorl %edi,%ebx - movl 8(%esp),%ecx - movl 12(%esp),%edx - movl 16(%esp),%esi - movl 20(%esp),%edi - addl (%esp),%eax - adcl 4(%esp),%ebx - orl %esi,%ecx - orl %edi,%edx - andl 24(%esp),%ecx - andl 28(%esp),%edx - andl 8(%esp),%esi - andl 12(%esp),%edi - orl %esi,%ecx - orl %edi,%edx - 
addl %ecx,%eax - adcl %edx,%ebx - movl %eax,(%esp) - movl %ebx,4(%esp) - movb (%ebp),%dl - subl $8,%esp - leal 8(%ebp),%ebp - cmpb $148,%dl - jne .L00900_15_x86 -.align 16 -.L01016_79_x86: - movl 312(%esp),%ecx - movl 316(%esp),%edx - movl %ecx,%esi - shrl $1,%ecx - movl %edx,%edi - shrl $1,%edx - movl %ecx,%eax - shll $24,%esi - movl %edx,%ebx - shll $24,%edi - xorl %esi,%ebx - shrl $6,%ecx - xorl %edi,%eax - shrl $6,%edx - xorl %ecx,%eax - shll $7,%esi - xorl %edx,%ebx - shll $1,%edi - xorl %esi,%ebx - shrl $1,%ecx - xorl %edi,%eax - shrl $1,%edx - xorl %ecx,%eax - shll $6,%edi - xorl %edx,%ebx - xorl %edi,%eax - movl %eax,(%esp) - movl %ebx,4(%esp) - movl 208(%esp),%ecx - movl 212(%esp),%edx - movl %ecx,%esi - shrl $6,%ecx - movl %edx,%edi - shrl $6,%edx - movl %ecx,%eax - shll $3,%esi - movl %edx,%ebx - shll $3,%edi - xorl %esi,%eax - shrl $13,%ecx - xorl %edi,%ebx - shrl $13,%edx - xorl %ecx,%eax - shll $10,%esi - xorl %edx,%ebx - shll $10,%edi - xorl %esi,%ebx - shrl $10,%ecx - xorl %edi,%eax - shrl $10,%edx - xorl %ecx,%ebx - shll $13,%edi - xorl %edx,%eax - xorl %edi,%eax - movl 320(%esp),%ecx - movl 324(%esp),%edx - addl (%esp),%eax - adcl 4(%esp),%ebx - movl 248(%esp),%esi - movl 252(%esp),%edi - addl %ecx,%eax - adcl %edx,%ebx - addl %esi,%eax - adcl %edi,%ebx - movl %eax,192(%esp) - movl %ebx,196(%esp) - movl 40(%esp),%ecx - movl 44(%esp),%edx - movl %ecx,%esi - shrl $9,%ecx - movl %edx,%edi - shrl $9,%edx - movl %ecx,%ebx - shll $14,%esi - movl %edx,%eax - shll $14,%edi - xorl %esi,%ebx - shrl $5,%ecx - xorl %edi,%eax - shrl $5,%edx - xorl %ecx,%eax - shll $4,%esi - xorl %edx,%ebx - shll $4,%edi - xorl %esi,%ebx - shrl $4,%ecx - xorl %edi,%eax - shrl $4,%edx - xorl %ecx,%eax - shll $5,%esi - xorl %edx,%ebx - shll $5,%edi - xorl %esi,%eax - xorl %edi,%ebx - movl 48(%esp),%ecx - movl 52(%esp),%edx - movl 56(%esp),%esi - movl 60(%esp),%edi - addl 64(%esp),%eax - adcl 68(%esp),%ebx - xorl %esi,%ecx - xorl %edi,%edx - andl 40(%esp),%ecx - andl 44(%esp),%edx 
- addl 192(%esp),%eax - adcl 196(%esp),%ebx - xorl %esi,%ecx - xorl %edi,%edx - movl (%ebp),%esi - movl 4(%ebp),%edi - addl %ecx,%eax - adcl %edx,%ebx - movl 32(%esp),%ecx - movl 36(%esp),%edx - addl %esi,%eax - adcl %edi,%ebx - movl %eax,(%esp) - movl %ebx,4(%esp) - addl %ecx,%eax - adcl %edx,%ebx - movl 8(%esp),%ecx - movl 12(%esp),%edx - movl %eax,32(%esp) - movl %ebx,36(%esp) - movl %ecx,%esi - shrl $2,%ecx - movl %edx,%edi - shrl $2,%edx - movl %ecx,%ebx - shll $4,%esi - movl %edx,%eax - shll $4,%edi - xorl %esi,%ebx - shrl $5,%ecx - xorl %edi,%eax - shrl $5,%edx - xorl %ecx,%ebx - shll $21,%esi - xorl %edx,%eax - shll $21,%edi - xorl %esi,%eax - shrl $21,%ecx - xorl %edi,%ebx - shrl $21,%edx - xorl %ecx,%eax - shll $5,%esi - xorl %edx,%ebx - shll $5,%edi - xorl %esi,%eax - xorl %edi,%ebx - movl 8(%esp),%ecx - movl 12(%esp),%edx - movl 16(%esp),%esi - movl 20(%esp),%edi - addl (%esp),%eax - adcl 4(%esp),%ebx - orl %esi,%ecx - orl %edi,%edx - andl 24(%esp),%ecx - andl 28(%esp),%edx - andl 8(%esp),%esi - andl 12(%esp),%edi - orl %esi,%ecx - orl %edi,%edx - addl %ecx,%eax - adcl %edx,%ebx - movl %eax,(%esp) - movl %ebx,4(%esp) - movb (%ebp),%dl - subl $8,%esp - leal 8(%ebp),%ebp - cmpb $23,%dl - jne .L01016_79_x86 - movl 840(%esp),%esi - movl 844(%esp),%edi - movl (%esi),%eax - movl 4(%esi),%ebx - movl 8(%esi),%ecx - movl 12(%esi),%edx - addl 8(%esp),%eax - adcl 12(%esp),%ebx - movl %eax,(%esi) - movl %ebx,4(%esi) - addl 16(%esp),%ecx - adcl 20(%esp),%edx - movl %ecx,8(%esi) - movl %edx,12(%esi) - movl 16(%esi),%eax - movl 20(%esi),%ebx - movl 24(%esi),%ecx - movl 28(%esi),%edx - addl 24(%esp),%eax - adcl 28(%esp),%ebx - movl %eax,16(%esi) - movl %ebx,20(%esi) - addl 32(%esp),%ecx - adcl 36(%esp),%edx - movl %ecx,24(%esi) - movl %edx,28(%esi) - movl 32(%esi),%eax - movl 36(%esi),%ebx - movl 40(%esi),%ecx - movl 44(%esi),%edx - addl 40(%esp),%eax - adcl 44(%esp),%ebx - movl %eax,32(%esi) - movl %ebx,36(%esi) - addl 48(%esp),%ecx - adcl 52(%esp),%edx - movl 
%ecx,40(%esi) - movl %edx,44(%esi) - movl 48(%esi),%eax - movl 52(%esi),%ebx - movl 56(%esi),%ecx - movl 60(%esi),%edx - addl 56(%esp),%eax - adcl 60(%esp),%ebx - movl %eax,48(%esi) - movl %ebx,52(%esi) - addl 64(%esp),%ecx - adcl 68(%esp),%edx - movl %ecx,56(%esi) - movl %edx,60(%esi) - addl $840,%esp - subl $640,%ebp - cmpl 8(%esp),%edi - jb .L002loop_x86 - movl 12(%esp),%esp - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.align 64 -.L001K512: -.long 3609767458,1116352408 -.long 602891725,1899447441 -.long 3964484399,3049323471 -.long 2173295548,3921009573 -.long 4081628472,961987163 -.long 3053834265,1508970993 -.long 2937671579,2453635748 -.long 3664609560,2870763221 -.long 2734883394,3624381080 -.long 1164996542,310598401 -.long 1323610764,607225278 -.long 3590304994,1426881987 -.long 4068182383,1925078388 -.long 991336113,2162078206 -.long 633803317,2614888103 -.long 3479774868,3248222580 -.long 2666613458,3835390401 -.long 944711139,4022224774 -.long 2341262773,264347078 -.long 2007800933,604807628 -.long 1495990901,770255983 -.long 1856431235,1249150122 -.long 3175218132,1555081692 -.long 2198950837,1996064986 -.long 3999719339,2554220882 -.long 766784016,2821834349 -.long 2566594879,2952996808 -.long 3203337956,3210313671 -.long 1034457026,3336571891 -.long 2466948901,3584528711 -.long 3758326383,113926993 -.long 168717936,338241895 -.long 1188179964,666307205 -.long 1546045734,773529912 -.long 1522805485,1294757372 -.long 2643833823,1396182291 -.long 2343527390,1695183700 -.long 1014477480,1986661051 -.long 1206759142,2177026350 -.long 344077627,2456956037 -.long 1290863460,2730485921 -.long 3158454273,2820302411 -.long 3505952657,3259730800 -.long 106217008,3345764771 -.long 3606008344,3516065817 -.long 1432725776,3600352804 -.long 1467031594,4094571909 -.long 851169720,275423344 -.long 3100823752,430227734 -.long 1363258195,506948616 -.long 3750685593,659060556 -.long 3785050280,883997877 -.long 3318307427,958139571 -.long 3812723403,1322822218 
-.long 2003034995,1537002063 -.long 3602036899,1747873779 -.long 1575990012,1955562222 -.long 1125592928,2024104815 -.long 2716904306,2227730452 -.long 442776044,2361852424 -.long 593698344,2428436474 -.long 3733110249,2756734187 -.long 2999351573,3204031479 -.long 3815920427,3329325298 -.long 3928383900,3391569614 -.long 566280711,3515267271 -.long 3454069534,3940187606 -.long 4000239992,4118630271 -.long 1914138554,116418474 -.long 2731055270,174292421 -.long 3203993006,289380356 -.long 320620315,460393269 -.long 587496836,685471733 -.long 1086792851,852142971 -.long 365543100,1017036298 -.long 2618297676,1126000580 -.long 3409855158,1288033470 -.long 4234509866,1501505948 -.long 987167468,1607167915 -.long 1246189591,1816402316 -.long 67438087,66051 -.long 202182159,134810123 -.size sha512_block_data_order,.-.L_sha512_block_data_order_begin -.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97 -.byte 110,115,102,111,114,109,32,102,111,114,32,120,56,54,44,32 -.byte 67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97 -.byte 112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103 -.byte 62,0 -#endif -.section .note.GNU-stack,"",@progbits diff --git a/third_party/boringssl/linux-x86/crypto/fipsmodule/vpaes-x86.S b/third_party/boringssl/linux-x86/crypto/fipsmodule/vpaes-x86.S deleted file mode 100644 index 66bd5f5e..00000000 --- a/third_party/boringssl/linux-x86/crypto/fipsmodule/vpaes-x86.S +++ /dev/null @@ -1,708 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#if defined(__i386__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text -#ifdef BORINGSSL_DISPATCH_TEST -#endif -.align 64 -.L_vpaes_consts: -.long 218628480,235210255,168496130,67568393 -.long 252381056,17041926,33884169,51187212 -.long 252645135,252645135,252645135,252645135 -.long 1512730624,3266504856,1377990664,3401244816 -.long 830229760,1275146365,2969422977,3447763452 -.long 3411033600,2979783055,338359620,2782886510 -.long 4209124096,907596821,221174255,1006095553 -.long 191964160,3799684038,3164090317,1589111125 -.long 182528256,1777043520,2877432650,3265356744 -.long 1874708224,3503451415,3305285752,363511674 -.long 1606117888,3487855781,1093350906,2384367825 -.long 197121,67569157,134941193,202313229 -.long 67569157,134941193,202313229,197121 -.long 134941193,202313229,197121,67569157 -.long 202313229,197121,67569157,134941193 -.long 33619971,100992007,168364043,235736079 -.long 235736079,33619971,100992007,168364043 -.long 168364043,235736079,33619971,100992007 -.long 100992007,168364043,235736079,33619971 -.long 50462976,117835012,185207048,252579084 -.long 252314880,51251460,117574920,184942860 -.long 184682752,252054788,50987272,118359308 -.long 118099200,185467140,251790600,50727180 -.long 2946363062,528716217,1300004225,1881839624 -.long 1532713819,1532713819,1532713819,1532713819 -.long 3602276352,4288629033,3737020424,4153884961 -.long 1354558464,32357713,2958822624,3775749553 -.long 1201988352,132424512,1572796698,503232858 -.long 2213177600,1597421020,4103937655,675398315 -.long 2749646592,4273543773,1511898873,121693092 -.long 3040248576,1103263732,2871565598,1608280554 -.long 2236667136,2588920351,482954393,64377734 -.long 3069987328,291237287,2117370568,3650299247 -.long 533321216,3573750986,2572112006,1401264716 -.long 1339849704,2721158661,548607111,3445553514 -.long 2128193280,3054596040,2183486460,1257083700 -.long 655635200,1165381986,3923443150,2344132524 -.long 190078720,256924420,290342170,357187870 -.long 
1610966272,2263057382,4103205268,309794674 -.long 2592527872,2233205587,1335446729,3402964816 -.long 3973531904,3225098121,3002836325,1918774430 -.long 3870401024,2102906079,2284471353,4117666579 -.long 617007872,1021508343,366931923,691083277 -.long 2528395776,3491914898,2968704004,1613121270 -.long 3445188352,3247741094,844474987,4093578302 -.long 651481088,1190302358,1689581232,574775300 -.long 4289380608,206939853,2555985458,2489840491 -.long 2130264064,327674451,3566485037,3349835193 -.long 2470714624,316102159,3636825756,3393945945 -.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105 -.byte 111,110,32,65,69,83,32,102,111,114,32,120,56,54,47,83 -.byte 83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117 -.byte 114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105 -.byte 118,101,114,115,105,116,121,41,0 -.align 64 -.hidden _vpaes_preheat -.type _vpaes_preheat,@function -.align 16 -_vpaes_preheat: - addl (%esp),%ebp - movdqa -48(%ebp),%xmm7 - movdqa -16(%ebp),%xmm6 - ret -.size _vpaes_preheat,.-_vpaes_preheat -.hidden _vpaes_encrypt_core -.type _vpaes_encrypt_core,@function -.align 16 -_vpaes_encrypt_core: - movl $16,%ecx - movl 240(%edx),%eax - movdqa %xmm6,%xmm1 - movdqa (%ebp),%xmm2 - pandn %xmm0,%xmm1 - pand %xmm6,%xmm0 - movdqu (%edx),%xmm5 -.byte 102,15,56,0,208 - movdqa 16(%ebp),%xmm0 - pxor %xmm5,%xmm2 - psrld $4,%xmm1 - addl $16,%edx -.byte 102,15,56,0,193 - leal 192(%ebp),%ebx - pxor %xmm2,%xmm0 - jmp .L000enc_entry -.align 16 -.L001enc_loop: - movdqa 32(%ebp),%xmm4 - movdqa 48(%ebp),%xmm0 -.byte 102,15,56,0,226 -.byte 102,15,56,0,195 - pxor %xmm5,%xmm4 - movdqa 64(%ebp),%xmm5 - pxor %xmm4,%xmm0 - movdqa -64(%ebx,%ecx,1),%xmm1 -.byte 102,15,56,0,234 - movdqa 80(%ebp),%xmm2 - movdqa (%ebx,%ecx,1),%xmm4 -.byte 102,15,56,0,211 - movdqa %xmm0,%xmm3 - pxor %xmm5,%xmm2 -.byte 102,15,56,0,193 - addl $16,%edx - pxor %xmm2,%xmm0 -.byte 102,15,56,0,220 - addl $16,%ecx - pxor %xmm0,%xmm3 -.byte 102,15,56,0,193 - andl $48,%ecx - subl $1,%eax - 
pxor %xmm3,%xmm0 -.L000enc_entry: - movdqa %xmm6,%xmm1 - movdqa -32(%ebp),%xmm5 - pandn %xmm0,%xmm1 - psrld $4,%xmm1 - pand %xmm6,%xmm0 -.byte 102,15,56,0,232 - movdqa %xmm7,%xmm3 - pxor %xmm1,%xmm0 -.byte 102,15,56,0,217 - movdqa %xmm7,%xmm4 - pxor %xmm5,%xmm3 -.byte 102,15,56,0,224 - movdqa %xmm7,%xmm2 - pxor %xmm5,%xmm4 -.byte 102,15,56,0,211 - movdqa %xmm7,%xmm3 - pxor %xmm0,%xmm2 -.byte 102,15,56,0,220 - movdqu (%edx),%xmm5 - pxor %xmm1,%xmm3 - jnz .L001enc_loop - movdqa 96(%ebp),%xmm4 - movdqa 112(%ebp),%xmm0 -.byte 102,15,56,0,226 - pxor %xmm5,%xmm4 -.byte 102,15,56,0,195 - movdqa 64(%ebx,%ecx,1),%xmm1 - pxor %xmm4,%xmm0 -.byte 102,15,56,0,193 - ret -.size _vpaes_encrypt_core,.-_vpaes_encrypt_core -.hidden _vpaes_decrypt_core -.type _vpaes_decrypt_core,@function -.align 16 -_vpaes_decrypt_core: - leal 608(%ebp),%ebx - movl 240(%edx),%eax - movdqa %xmm6,%xmm1 - movdqa -64(%ebx),%xmm2 - pandn %xmm0,%xmm1 - movl %eax,%ecx - psrld $4,%xmm1 - movdqu (%edx),%xmm5 - shll $4,%ecx - pand %xmm6,%xmm0 -.byte 102,15,56,0,208 - movdqa -48(%ebx),%xmm0 - xorl $48,%ecx -.byte 102,15,56,0,193 - andl $48,%ecx - pxor %xmm5,%xmm2 - movdqa 176(%ebp),%xmm5 - pxor %xmm2,%xmm0 - addl $16,%edx - leal -352(%ebx,%ecx,1),%ecx - jmp .L002dec_entry -.align 16 -.L003dec_loop: - movdqa -32(%ebx),%xmm4 - movdqa -16(%ebx),%xmm1 -.byte 102,15,56,0,226 -.byte 102,15,56,0,203 - pxor %xmm4,%xmm0 - movdqa (%ebx),%xmm4 - pxor %xmm1,%xmm0 - movdqa 16(%ebx),%xmm1 -.byte 102,15,56,0,226 -.byte 102,15,56,0,197 -.byte 102,15,56,0,203 - pxor %xmm4,%xmm0 - movdqa 32(%ebx),%xmm4 - pxor %xmm1,%xmm0 - movdqa 48(%ebx),%xmm1 -.byte 102,15,56,0,226 -.byte 102,15,56,0,197 -.byte 102,15,56,0,203 - pxor %xmm4,%xmm0 - movdqa 64(%ebx),%xmm4 - pxor %xmm1,%xmm0 - movdqa 80(%ebx),%xmm1 -.byte 102,15,56,0,226 -.byte 102,15,56,0,197 -.byte 102,15,56,0,203 - pxor %xmm4,%xmm0 - addl $16,%edx -.byte 102,15,58,15,237,12 - pxor %xmm1,%xmm0 - subl $1,%eax -.L002dec_entry: - movdqa %xmm6,%xmm1 - movdqa -32(%ebp),%xmm2 - pandn 
%xmm0,%xmm1 - pand %xmm6,%xmm0 - psrld $4,%xmm1 -.byte 102,15,56,0,208 - movdqa %xmm7,%xmm3 - pxor %xmm1,%xmm0 -.byte 102,15,56,0,217 - movdqa %xmm7,%xmm4 - pxor %xmm2,%xmm3 -.byte 102,15,56,0,224 - pxor %xmm2,%xmm4 - movdqa %xmm7,%xmm2 -.byte 102,15,56,0,211 - movdqa %xmm7,%xmm3 - pxor %xmm0,%xmm2 -.byte 102,15,56,0,220 - movdqu (%edx),%xmm0 - pxor %xmm1,%xmm3 - jnz .L003dec_loop - movdqa 96(%ebx),%xmm4 -.byte 102,15,56,0,226 - pxor %xmm0,%xmm4 - movdqa 112(%ebx),%xmm0 - movdqa (%ecx),%xmm2 -.byte 102,15,56,0,195 - pxor %xmm4,%xmm0 -.byte 102,15,56,0,194 - ret -.size _vpaes_decrypt_core,.-_vpaes_decrypt_core -.hidden _vpaes_schedule_core -.type _vpaes_schedule_core,@function -.align 16 -_vpaes_schedule_core: - addl (%esp),%ebp - movdqu (%esi),%xmm0 - movdqa 320(%ebp),%xmm2 - movdqa %xmm0,%xmm3 - leal (%ebp),%ebx - movdqa %xmm2,4(%esp) - call _vpaes_schedule_transform - movdqa %xmm0,%xmm7 - testl %edi,%edi - jnz .L004schedule_am_decrypting - movdqu %xmm0,(%edx) - jmp .L005schedule_go -.L004schedule_am_decrypting: - movdqa 256(%ebp,%ecx,1),%xmm1 -.byte 102,15,56,0,217 - movdqu %xmm3,(%edx) - xorl $48,%ecx -.L005schedule_go: - cmpl $192,%eax - ja .L006schedule_256 - je .L007schedule_192 -.L008schedule_128: - movl $10,%eax -.L009loop_schedule_128: - call _vpaes_schedule_round - decl %eax - jz .L010schedule_mangle_last - call _vpaes_schedule_mangle - jmp .L009loop_schedule_128 -.align 16 -.L007schedule_192: - movdqu 8(%esi),%xmm0 - call _vpaes_schedule_transform - movdqa %xmm0,%xmm6 - pxor %xmm4,%xmm4 - movhlps %xmm4,%xmm6 - movl $4,%eax -.L011loop_schedule_192: - call _vpaes_schedule_round -.byte 102,15,58,15,198,8 - call _vpaes_schedule_mangle - call _vpaes_schedule_192_smear - call _vpaes_schedule_mangle - call _vpaes_schedule_round - decl %eax - jz .L010schedule_mangle_last - call _vpaes_schedule_mangle - call _vpaes_schedule_192_smear - jmp .L011loop_schedule_192 -.align 16 -.L006schedule_256: - movdqu 16(%esi),%xmm0 - call _vpaes_schedule_transform - movl $7,%eax 
-.L012loop_schedule_256: - call _vpaes_schedule_mangle - movdqa %xmm0,%xmm6 - call _vpaes_schedule_round - decl %eax - jz .L010schedule_mangle_last - call _vpaes_schedule_mangle - pshufd $255,%xmm0,%xmm0 - movdqa %xmm7,20(%esp) - movdqa %xmm6,%xmm7 - call .L_vpaes_schedule_low_round - movdqa 20(%esp),%xmm7 - jmp .L012loop_schedule_256 -.align 16 -.L010schedule_mangle_last: - leal 384(%ebp),%ebx - testl %edi,%edi - jnz .L013schedule_mangle_last_dec - movdqa 256(%ebp,%ecx,1),%xmm1 -.byte 102,15,56,0,193 - leal 352(%ebp),%ebx - addl $32,%edx -.L013schedule_mangle_last_dec: - addl $-16,%edx - pxor 336(%ebp),%xmm0 - call _vpaes_schedule_transform - movdqu %xmm0,(%edx) - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - ret -.size _vpaes_schedule_core,.-_vpaes_schedule_core -.hidden _vpaes_schedule_192_smear -.type _vpaes_schedule_192_smear,@function -.align 16 -_vpaes_schedule_192_smear: - pshufd $128,%xmm6,%xmm1 - pshufd $254,%xmm7,%xmm0 - pxor %xmm1,%xmm6 - pxor %xmm1,%xmm1 - pxor %xmm0,%xmm6 - movdqa %xmm6,%xmm0 - movhlps %xmm1,%xmm6 - ret -.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear -.hidden _vpaes_schedule_round -.type _vpaes_schedule_round,@function -.align 16 -_vpaes_schedule_round: - movdqa 8(%esp),%xmm2 - pxor %xmm1,%xmm1 -.byte 102,15,58,15,202,15 -.byte 102,15,58,15,210,15 - pxor %xmm1,%xmm7 - pshufd $255,%xmm0,%xmm0 -.byte 102,15,58,15,192,1 - movdqa %xmm2,8(%esp) -.L_vpaes_schedule_low_round: - movdqa %xmm7,%xmm1 - pslldq $4,%xmm7 - pxor %xmm1,%xmm7 - movdqa %xmm7,%xmm1 - pslldq $8,%xmm7 - pxor %xmm1,%xmm7 - pxor 336(%ebp),%xmm7 - movdqa -16(%ebp),%xmm4 - movdqa -48(%ebp),%xmm5 - movdqa %xmm4,%xmm1 - pandn %xmm0,%xmm1 - psrld $4,%xmm1 - pand %xmm4,%xmm0 - movdqa -32(%ebp),%xmm2 -.byte 102,15,56,0,208 - pxor %xmm1,%xmm0 - movdqa %xmm5,%xmm3 -.byte 102,15,56,0,217 - pxor %xmm2,%xmm3 - movdqa %xmm5,%xmm4 -.byte 102,15,56,0,224 - pxor 
%xmm2,%xmm4 - movdqa %xmm5,%xmm2 -.byte 102,15,56,0,211 - pxor %xmm0,%xmm2 - movdqa %xmm5,%xmm3 -.byte 102,15,56,0,220 - pxor %xmm1,%xmm3 - movdqa 32(%ebp),%xmm4 -.byte 102,15,56,0,226 - movdqa 48(%ebp),%xmm0 -.byte 102,15,56,0,195 - pxor %xmm4,%xmm0 - pxor %xmm7,%xmm0 - movdqa %xmm0,%xmm7 - ret -.size _vpaes_schedule_round,.-_vpaes_schedule_round -.hidden _vpaes_schedule_transform -.type _vpaes_schedule_transform,@function -.align 16 -_vpaes_schedule_transform: - movdqa -16(%ebp),%xmm2 - movdqa %xmm2,%xmm1 - pandn %xmm0,%xmm1 - psrld $4,%xmm1 - pand %xmm2,%xmm0 - movdqa (%ebx),%xmm2 -.byte 102,15,56,0,208 - movdqa 16(%ebx),%xmm0 -.byte 102,15,56,0,193 - pxor %xmm2,%xmm0 - ret -.size _vpaes_schedule_transform,.-_vpaes_schedule_transform -.hidden _vpaes_schedule_mangle -.type _vpaes_schedule_mangle,@function -.align 16 -_vpaes_schedule_mangle: - movdqa %xmm0,%xmm4 - movdqa 128(%ebp),%xmm5 - testl %edi,%edi - jnz .L014schedule_mangle_dec - addl $16,%edx - pxor 336(%ebp),%xmm4 -.byte 102,15,56,0,229 - movdqa %xmm4,%xmm3 -.byte 102,15,56,0,229 - pxor %xmm4,%xmm3 -.byte 102,15,56,0,229 - pxor %xmm4,%xmm3 - jmp .L015schedule_mangle_both -.align 16 -.L014schedule_mangle_dec: - movdqa -16(%ebp),%xmm2 - leal 416(%ebp),%esi - movdqa %xmm2,%xmm1 - pandn %xmm4,%xmm1 - psrld $4,%xmm1 - pand %xmm2,%xmm4 - movdqa (%esi),%xmm2 -.byte 102,15,56,0,212 - movdqa 16(%esi),%xmm3 -.byte 102,15,56,0,217 - pxor %xmm2,%xmm3 -.byte 102,15,56,0,221 - movdqa 32(%esi),%xmm2 -.byte 102,15,56,0,212 - pxor %xmm3,%xmm2 - movdqa 48(%esi),%xmm3 -.byte 102,15,56,0,217 - pxor %xmm2,%xmm3 -.byte 102,15,56,0,221 - movdqa 64(%esi),%xmm2 -.byte 102,15,56,0,212 - pxor %xmm3,%xmm2 - movdqa 80(%esi),%xmm3 -.byte 102,15,56,0,217 - pxor %xmm2,%xmm3 -.byte 102,15,56,0,221 - movdqa 96(%esi),%xmm2 -.byte 102,15,56,0,212 - pxor %xmm3,%xmm2 - movdqa 112(%esi),%xmm3 -.byte 102,15,56,0,217 - pxor %xmm2,%xmm3 - addl $-16,%edx -.L015schedule_mangle_both: - movdqa 256(%ebp,%ecx,1),%xmm1 -.byte 102,15,56,0,217 - addl 
$-16,%ecx - andl $48,%ecx - movdqu %xmm3,(%edx) - ret -.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle -.globl vpaes_set_encrypt_key -.hidden vpaes_set_encrypt_key -.type vpaes_set_encrypt_key,@function -.align 16 -vpaes_set_encrypt_key: -.L_vpaes_set_encrypt_key_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi -#ifdef BORINGSSL_DISPATCH_TEST - pushl %ebx - pushl %edx - call .L016pic -.L016pic: - popl %ebx - leal BORINGSSL_function_hit+5-.L016pic(%ebx),%ebx - movl $1,%edx - movb %dl,(%ebx) - popl %edx - popl %ebx -#endif - movl 20(%esp),%esi - leal -56(%esp),%ebx - movl 24(%esp),%eax - andl $-16,%ebx - movl 28(%esp),%edx - xchgl %esp,%ebx - movl %ebx,48(%esp) - movl %eax,%ebx - shrl $5,%ebx - addl $5,%ebx - movl %ebx,240(%edx) - movl $48,%ecx - movl $0,%edi - leal .L_vpaes_consts+0x30-.L017pic_point,%ebp - call _vpaes_schedule_core -.L017pic_point: - movl 48(%esp),%esp - xorl %eax,%eax - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.size vpaes_set_encrypt_key,.-.L_vpaes_set_encrypt_key_begin -.globl vpaes_set_decrypt_key -.hidden vpaes_set_decrypt_key -.type vpaes_set_decrypt_key,@function -.align 16 -vpaes_set_decrypt_key: -.L_vpaes_set_decrypt_key_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - movl 20(%esp),%esi - leal -56(%esp),%ebx - movl 24(%esp),%eax - andl $-16,%ebx - movl 28(%esp),%edx - xchgl %esp,%ebx - movl %ebx,48(%esp) - movl %eax,%ebx - shrl $5,%ebx - addl $5,%ebx - movl %ebx,240(%edx) - shll $4,%ebx - leal 16(%edx,%ebx,1),%edx - movl $1,%edi - movl %eax,%ecx - shrl $1,%ecx - andl $32,%ecx - xorl $32,%ecx - leal .L_vpaes_consts+0x30-.L018pic_point,%ebp - call _vpaes_schedule_core -.L018pic_point: - movl 48(%esp),%esp - xorl %eax,%eax - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.size vpaes_set_decrypt_key,.-.L_vpaes_set_decrypt_key_begin -.globl vpaes_encrypt -.hidden vpaes_encrypt -.type vpaes_encrypt,@function -.align 16 -vpaes_encrypt: -.L_vpaes_encrypt_begin: - pushl %ebp - pushl %ebx - pushl %esi - 
pushl %edi -#ifdef BORINGSSL_DISPATCH_TEST - pushl %ebx - pushl %edx - call .L019pic -.L019pic: - popl %ebx - leal BORINGSSL_function_hit+4-.L019pic(%ebx),%ebx - movl $1,%edx - movb %dl,(%ebx) - popl %edx - popl %ebx -#endif - leal .L_vpaes_consts+0x30-.L020pic_point,%ebp - call _vpaes_preheat -.L020pic_point: - movl 20(%esp),%esi - leal -56(%esp),%ebx - movl 24(%esp),%edi - andl $-16,%ebx - movl 28(%esp),%edx - xchgl %esp,%ebx - movl %ebx,48(%esp) - movdqu (%esi),%xmm0 - call _vpaes_encrypt_core - movdqu %xmm0,(%edi) - movl 48(%esp),%esp - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.size vpaes_encrypt,.-.L_vpaes_encrypt_begin -.globl vpaes_decrypt -.hidden vpaes_decrypt -.type vpaes_decrypt,@function -.align 16 -vpaes_decrypt: -.L_vpaes_decrypt_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - leal .L_vpaes_consts+0x30-.L021pic_point,%ebp - call _vpaes_preheat -.L021pic_point: - movl 20(%esp),%esi - leal -56(%esp),%ebx - movl 24(%esp),%edi - andl $-16,%ebx - movl 28(%esp),%edx - xchgl %esp,%ebx - movl %ebx,48(%esp) - movdqu (%esi),%xmm0 - call _vpaes_decrypt_core - movdqu %xmm0,(%edi) - movl 48(%esp),%esp - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.size vpaes_decrypt,.-.L_vpaes_decrypt_begin -.globl vpaes_cbc_encrypt -.hidden vpaes_cbc_encrypt -.type vpaes_cbc_encrypt,@function -.align 16 -vpaes_cbc_encrypt: -.L_vpaes_cbc_encrypt_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - movl 20(%esp),%esi - movl 24(%esp),%edi - movl 28(%esp),%eax - movl 32(%esp),%edx - subl $16,%eax - jc .L022cbc_abort - leal -56(%esp),%ebx - movl 36(%esp),%ebp - andl $-16,%ebx - movl 40(%esp),%ecx - xchgl %esp,%ebx - movdqu (%ebp),%xmm1 - subl %esi,%edi - movl %ebx,48(%esp) - movl %edi,(%esp) - movl %edx,4(%esp) - movl %ebp,8(%esp) - movl %eax,%edi - leal .L_vpaes_consts+0x30-.L023pic_point,%ebp - call _vpaes_preheat -.L023pic_point: - cmpl $0,%ecx - je .L024cbc_dec_loop - jmp .L025cbc_enc_loop -.align 16 -.L025cbc_enc_loop: - movdqu 
(%esi),%xmm0 - pxor %xmm1,%xmm0 - call _vpaes_encrypt_core - movl (%esp),%ebx - movl 4(%esp),%edx - movdqa %xmm0,%xmm1 - movdqu %xmm0,(%ebx,%esi,1) - leal 16(%esi),%esi - subl $16,%edi - jnc .L025cbc_enc_loop - jmp .L026cbc_done -.align 16 -.L024cbc_dec_loop: - movdqu (%esi),%xmm0 - movdqa %xmm1,16(%esp) - movdqa %xmm0,32(%esp) - call _vpaes_decrypt_core - movl (%esp),%ebx - movl 4(%esp),%edx - pxor 16(%esp),%xmm0 - movdqa 32(%esp),%xmm1 - movdqu %xmm0,(%ebx,%esi,1) - leal 16(%esi),%esi - subl $16,%edi - jnc .L024cbc_dec_loop -.L026cbc_done: - movl 8(%esp),%ebx - movl 48(%esp),%esp - movdqu %xmm1,(%ebx) -.L022cbc_abort: - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.size vpaes_cbc_encrypt,.-.L_vpaes_cbc_encrypt_begin -#endif -.section .note.GNU-stack,"",@progbits diff --git a/third_party/boringssl/linux-x86/crypto/fipsmodule/x86-mont.S b/third_party/boringssl/linux-x86/crypto/fipsmodule/x86-mont.S deleted file mode 100644 index 5de32518..00000000 --- a/third_party/boringssl/linux-x86/crypto/fipsmodule/x86-mont.S +++ /dev/null @@ -1,484 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#if defined(__i386__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text -.globl bn_mul_mont -.hidden bn_mul_mont -.type bn_mul_mont,@function -.align 16 -bn_mul_mont: -.L_bn_mul_mont_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - xorl %eax,%eax - movl 40(%esp),%edi - cmpl $4,%edi - jl .L000just_leave - leal 20(%esp),%esi - leal 24(%esp),%edx - addl $2,%edi - negl %edi - leal -32(%esp,%edi,4),%ebp - negl %edi - movl %ebp,%eax - subl %edx,%eax - andl $2047,%eax - subl %eax,%ebp - xorl %ebp,%edx - andl $2048,%edx - xorl $2048,%edx - subl %edx,%ebp - andl $-64,%ebp - movl %esp,%eax - subl %ebp,%eax - andl $-4096,%eax - movl %esp,%edx - leal (%ebp,%eax,1),%esp - movl (%esp),%eax - cmpl %ebp,%esp - ja .L001page_walk - jmp .L002page_walk_done -.align 16 -.L001page_walk: - leal -4096(%esp),%esp - movl (%esp),%eax - cmpl %ebp,%esp - ja .L001page_walk -.L002page_walk_done: - movl (%esi),%eax - movl 4(%esi),%ebx - movl 8(%esi),%ecx - movl 12(%esi),%ebp - movl 16(%esi),%esi - movl (%esi),%esi - movl %eax,4(%esp) - movl %ebx,8(%esp) - movl %ecx,12(%esp) - movl %ebp,16(%esp) - movl %esi,20(%esp) - leal -3(%edi),%ebx - movl %edx,24(%esp) - call .L003PIC_me_up -.L003PIC_me_up: - popl %eax - leal OPENSSL_ia32cap_P-.L003PIC_me_up(%eax),%eax - btl $26,(%eax) - jnc .L004non_sse2 - movl $-1,%eax - movd %eax,%mm7 - movl 8(%esp),%esi - movl 12(%esp),%edi - movl 16(%esp),%ebp - xorl %edx,%edx - xorl %ecx,%ecx - movd (%edi),%mm4 - movd (%esi),%mm5 - movd (%ebp),%mm3 - pmuludq %mm4,%mm5 - movq %mm5,%mm2 - movq %mm5,%mm0 - pand %mm7,%mm0 - pmuludq 20(%esp),%mm5 - pmuludq %mm5,%mm3 - paddq %mm0,%mm3 - movd 4(%ebp),%mm1 - movd 4(%esi),%mm0 - psrlq $32,%mm2 - psrlq $32,%mm3 - incl %ecx -.align 16 -.L0051st: - pmuludq %mm4,%mm0 - pmuludq %mm5,%mm1 - paddq %mm0,%mm2 - paddq %mm1,%mm3 - movq %mm2,%mm0 - pand %mm7,%mm0 - movd 4(%ebp,%ecx,4),%mm1 - paddq %mm0,%mm3 - movd 4(%esi,%ecx,4),%mm0 - psrlq $32,%mm2 - movd %mm3,28(%esp,%ecx,4) - psrlq $32,%mm3 - leal 1(%ecx),%ecx - 
cmpl %ebx,%ecx - jl .L0051st - pmuludq %mm4,%mm0 - pmuludq %mm5,%mm1 - paddq %mm0,%mm2 - paddq %mm1,%mm3 - movq %mm2,%mm0 - pand %mm7,%mm0 - paddq %mm0,%mm3 - movd %mm3,28(%esp,%ecx,4) - psrlq $32,%mm2 - psrlq $32,%mm3 - paddq %mm2,%mm3 - movq %mm3,32(%esp,%ebx,4) - incl %edx -.L006outer: - xorl %ecx,%ecx - movd (%edi,%edx,4),%mm4 - movd (%esi),%mm5 - movd 32(%esp),%mm6 - movd (%ebp),%mm3 - pmuludq %mm4,%mm5 - paddq %mm6,%mm5 - movq %mm5,%mm0 - movq %mm5,%mm2 - pand %mm7,%mm0 - pmuludq 20(%esp),%mm5 - pmuludq %mm5,%mm3 - paddq %mm0,%mm3 - movd 36(%esp),%mm6 - movd 4(%ebp),%mm1 - movd 4(%esi),%mm0 - psrlq $32,%mm2 - psrlq $32,%mm3 - paddq %mm6,%mm2 - incl %ecx - decl %ebx -.L007inner: - pmuludq %mm4,%mm0 - pmuludq %mm5,%mm1 - paddq %mm0,%mm2 - paddq %mm1,%mm3 - movq %mm2,%mm0 - movd 36(%esp,%ecx,4),%mm6 - pand %mm7,%mm0 - movd 4(%ebp,%ecx,4),%mm1 - paddq %mm0,%mm3 - movd 4(%esi,%ecx,4),%mm0 - psrlq $32,%mm2 - movd %mm3,28(%esp,%ecx,4) - psrlq $32,%mm3 - paddq %mm6,%mm2 - decl %ebx - leal 1(%ecx),%ecx - jnz .L007inner - movl %ecx,%ebx - pmuludq %mm4,%mm0 - pmuludq %mm5,%mm1 - paddq %mm0,%mm2 - paddq %mm1,%mm3 - movq %mm2,%mm0 - pand %mm7,%mm0 - paddq %mm0,%mm3 - movd %mm3,28(%esp,%ecx,4) - psrlq $32,%mm2 - psrlq $32,%mm3 - movd 36(%esp,%ebx,4),%mm6 - paddq %mm2,%mm3 - paddq %mm6,%mm3 - movq %mm3,32(%esp,%ebx,4) - leal 1(%edx),%edx - cmpl %ebx,%edx - jle .L006outer - emms - jmp .L008common_tail -.align 16 -.L004non_sse2: - movl 8(%esp),%esi - leal 1(%ebx),%ebp - movl 12(%esp),%edi - xorl %ecx,%ecx - movl %esi,%edx - andl $1,%ebp - subl %edi,%edx - leal 4(%edi,%ebx,4),%eax - orl %edx,%ebp - movl (%edi),%edi - jz .L009bn_sqr_mont - movl %eax,28(%esp) - movl (%esi),%eax - xorl %edx,%edx -.align 16 -.L010mull: - movl %edx,%ebp - mull %edi - addl %eax,%ebp - leal 1(%ecx),%ecx - adcl $0,%edx - movl (%esi,%ecx,4),%eax - cmpl %ebx,%ecx - movl %ebp,28(%esp,%ecx,4) - jl .L010mull - movl %edx,%ebp - mull %edi - movl 20(%esp),%edi - addl %ebp,%eax - movl 16(%esp),%esi - adcl 
$0,%edx - imull 32(%esp),%edi - movl %eax,32(%esp,%ebx,4) - xorl %ecx,%ecx - movl %edx,36(%esp,%ebx,4) - movl %ecx,40(%esp,%ebx,4) - movl (%esi),%eax - mull %edi - addl 32(%esp),%eax - movl 4(%esi),%eax - adcl $0,%edx - incl %ecx - jmp .L0112ndmadd -.align 16 -.L0121stmadd: - movl %edx,%ebp - mull %edi - addl 32(%esp,%ecx,4),%ebp - leal 1(%ecx),%ecx - adcl $0,%edx - addl %eax,%ebp - movl (%esi,%ecx,4),%eax - adcl $0,%edx - cmpl %ebx,%ecx - movl %ebp,28(%esp,%ecx,4) - jl .L0121stmadd - movl %edx,%ebp - mull %edi - addl 32(%esp,%ebx,4),%eax - movl 20(%esp),%edi - adcl $0,%edx - movl 16(%esp),%esi - addl %eax,%ebp - adcl $0,%edx - imull 32(%esp),%edi - xorl %ecx,%ecx - addl 36(%esp,%ebx,4),%edx - movl %ebp,32(%esp,%ebx,4) - adcl $0,%ecx - movl (%esi),%eax - movl %edx,36(%esp,%ebx,4) - movl %ecx,40(%esp,%ebx,4) - mull %edi - addl 32(%esp),%eax - movl 4(%esi),%eax - adcl $0,%edx - movl $1,%ecx -.align 16 -.L0112ndmadd: - movl %edx,%ebp - mull %edi - addl 32(%esp,%ecx,4),%ebp - leal 1(%ecx),%ecx - adcl $0,%edx - addl %eax,%ebp - movl (%esi,%ecx,4),%eax - adcl $0,%edx - cmpl %ebx,%ecx - movl %ebp,24(%esp,%ecx,4) - jl .L0112ndmadd - movl %edx,%ebp - mull %edi - addl 32(%esp,%ebx,4),%ebp - adcl $0,%edx - addl %eax,%ebp - adcl $0,%edx - movl %ebp,28(%esp,%ebx,4) - xorl %eax,%eax - movl 12(%esp),%ecx - addl 36(%esp,%ebx,4),%edx - adcl 40(%esp,%ebx,4),%eax - leal 4(%ecx),%ecx - movl %edx,32(%esp,%ebx,4) - cmpl 28(%esp),%ecx - movl %eax,36(%esp,%ebx,4) - je .L008common_tail - movl (%ecx),%edi - movl 8(%esp),%esi - movl %ecx,12(%esp) - xorl %ecx,%ecx - xorl %edx,%edx - movl (%esi),%eax - jmp .L0121stmadd -.align 16 -.L009bn_sqr_mont: - movl %ebx,(%esp) - movl %ecx,12(%esp) - movl %edi,%eax - mull %edi - movl %eax,32(%esp) - movl %edx,%ebx - shrl $1,%edx - andl $1,%ebx - incl %ecx -.align 16 -.L013sqr: - movl (%esi,%ecx,4),%eax - movl %edx,%ebp - mull %edi - addl %ebp,%eax - leal 1(%ecx),%ecx - adcl $0,%edx - leal (%ebx,%eax,2),%ebp - shrl $31,%eax - cmpl (%esp),%ecx - movl 
%eax,%ebx - movl %ebp,28(%esp,%ecx,4) - jl .L013sqr - movl (%esi,%ecx,4),%eax - movl %edx,%ebp - mull %edi - addl %ebp,%eax - movl 20(%esp),%edi - adcl $0,%edx - movl 16(%esp),%esi - leal (%ebx,%eax,2),%ebp - imull 32(%esp),%edi - shrl $31,%eax - movl %ebp,32(%esp,%ecx,4) - leal (%eax,%edx,2),%ebp - movl (%esi),%eax - shrl $31,%edx - movl %ebp,36(%esp,%ecx,4) - movl %edx,40(%esp,%ecx,4) - mull %edi - addl 32(%esp),%eax - movl %ecx,%ebx - adcl $0,%edx - movl 4(%esi),%eax - movl $1,%ecx -.align 16 -.L0143rdmadd: - movl %edx,%ebp - mull %edi - addl 32(%esp,%ecx,4),%ebp - adcl $0,%edx - addl %eax,%ebp - movl 4(%esi,%ecx,4),%eax - adcl $0,%edx - movl %ebp,28(%esp,%ecx,4) - movl %edx,%ebp - mull %edi - addl 36(%esp,%ecx,4),%ebp - leal 2(%ecx),%ecx - adcl $0,%edx - addl %eax,%ebp - movl (%esi,%ecx,4),%eax - adcl $0,%edx - cmpl %ebx,%ecx - movl %ebp,24(%esp,%ecx,4) - jl .L0143rdmadd - movl %edx,%ebp - mull %edi - addl 32(%esp,%ebx,4),%ebp - adcl $0,%edx - addl %eax,%ebp - adcl $0,%edx - movl %ebp,28(%esp,%ebx,4) - movl 12(%esp),%ecx - xorl %eax,%eax - movl 8(%esp),%esi - addl 36(%esp,%ebx,4),%edx - adcl 40(%esp,%ebx,4),%eax - movl %edx,32(%esp,%ebx,4) - cmpl %ebx,%ecx - movl %eax,36(%esp,%ebx,4) - je .L008common_tail - movl 4(%esi,%ecx,4),%edi - leal 1(%ecx),%ecx - movl %edi,%eax - movl %ecx,12(%esp) - mull %edi - addl 32(%esp,%ecx,4),%eax - adcl $0,%edx - movl %eax,32(%esp,%ecx,4) - xorl %ebp,%ebp - cmpl %ebx,%ecx - leal 1(%ecx),%ecx - je .L015sqrlast - movl %edx,%ebx - shrl $1,%edx - andl $1,%ebx -.align 16 -.L016sqradd: - movl (%esi,%ecx,4),%eax - movl %edx,%ebp - mull %edi - addl %ebp,%eax - leal (%eax,%eax,1),%ebp - adcl $0,%edx - shrl $31,%eax - addl 32(%esp,%ecx,4),%ebp - leal 1(%ecx),%ecx - adcl $0,%eax - addl %ebx,%ebp - adcl $0,%eax - cmpl (%esp),%ecx - movl %ebp,28(%esp,%ecx,4) - movl %eax,%ebx - jle .L016sqradd - movl %edx,%ebp - addl %edx,%edx - shrl $31,%ebp - addl %ebx,%edx - adcl $0,%ebp -.L015sqrlast: - movl 20(%esp),%edi - movl 16(%esp),%esi - imull 
32(%esp),%edi - addl 32(%esp,%ecx,4),%edx - movl (%esi),%eax - adcl $0,%ebp - movl %edx,32(%esp,%ecx,4) - movl %ebp,36(%esp,%ecx,4) - mull %edi - addl 32(%esp),%eax - leal -1(%ecx),%ebx - adcl $0,%edx - movl $1,%ecx - movl 4(%esi),%eax - jmp .L0143rdmadd -.align 16 -.L008common_tail: - movl 16(%esp),%ebp - movl 4(%esp),%edi - leal 32(%esp),%esi - movl (%esi),%eax - movl %ebx,%ecx - xorl %edx,%edx -.align 16 -.L017sub: - sbbl (%ebp,%edx,4),%eax - movl %eax,(%edi,%edx,4) - decl %ecx - movl 4(%esi,%edx,4),%eax - leal 1(%edx),%edx - jge .L017sub - sbbl $0,%eax - movl $-1,%edx - xorl %eax,%edx - jmp .L018copy -.align 16 -.L018copy: - movl 32(%esp,%ebx,4),%esi - movl (%edi,%ebx,4),%ebp - movl %ecx,32(%esp,%ebx,4) - andl %eax,%esi - andl %edx,%ebp - orl %esi,%ebp - movl %ebp,(%edi,%ebx,4) - decl %ebx - jge .L018copy - movl 24(%esp),%esp - movl $1,%eax -.L000just_leave: - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.size bn_mul_mont,.-.L_bn_mul_mont_begin -.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105 -.byte 112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56 -.byte 54,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121 -.byte 32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46 -.byte 111,114,103,62,0 -#endif -.section .note.GNU-stack,"",@progbits diff --git a/third_party/boringssl/linux-x86/crypto/test/trampoline-x86.S b/third_party/boringssl/linux-x86/crypto/test/trampoline-x86.S deleted file mode 100644 index e7162dfa..00000000 --- a/third_party/boringssl/linux-x86/crypto/test/trampoline-x86.S +++ /dev/null @@ -1,206 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#if defined(__i386__) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text -.globl abi_test_trampoline -.hidden abi_test_trampoline -.type abi_test_trampoline,@function -.align 16 -abi_test_trampoline: -.L_abi_test_trampoline_begin: - pushl %ebp - pushl %ebx - pushl %esi - pushl %edi - movl 24(%esp),%ecx - movl (%ecx),%esi - movl 4(%ecx),%edi - movl 8(%ecx),%ebx - movl 12(%ecx),%ebp - subl $44,%esp - movl 72(%esp),%eax - xorl %ecx,%ecx -.L000loop: - cmpl 76(%esp),%ecx - jae .L001loop_done - movl (%eax,%ecx,4),%edx - movl %edx,(%esp,%ecx,4) - addl $1,%ecx - jmp .L000loop -.L001loop_done: - call *64(%esp) - addl $44,%esp - movl 24(%esp),%ecx - movl %esi,(%ecx) - movl %edi,4(%ecx) - movl %ebx,8(%ecx) - movl %ebp,12(%ecx) - popl %edi - popl %esi - popl %ebx - popl %ebp - ret -.size abi_test_trampoline,.-.L_abi_test_trampoline_begin -.globl abi_test_get_and_clear_direction_flag -.hidden abi_test_get_and_clear_direction_flag -.type abi_test_get_and_clear_direction_flag,@function -.align 16 -abi_test_get_and_clear_direction_flag: -.L_abi_test_get_and_clear_direction_flag_begin: - pushfl - popl %eax - andl $1024,%eax - shrl $10,%eax - cld - ret -.size abi_test_get_and_clear_direction_flag,.-.L_abi_test_get_and_clear_direction_flag_begin -.globl abi_test_set_direction_flag -.hidden abi_test_set_direction_flag -.type abi_test_set_direction_flag,@function -.align 16 -abi_test_set_direction_flag: -.L_abi_test_set_direction_flag_begin: - std - ret -.size abi_test_set_direction_flag,.-.L_abi_test_set_direction_flag_begin -.globl abi_test_clobber_eax -.hidden abi_test_clobber_eax -.type abi_test_clobber_eax,@function -.align 16 -abi_test_clobber_eax: -.L_abi_test_clobber_eax_begin: - xorl %eax,%eax - ret -.size abi_test_clobber_eax,.-.L_abi_test_clobber_eax_begin -.globl abi_test_clobber_ebx -.hidden abi_test_clobber_ebx -.type abi_test_clobber_ebx,@function -.align 16 -abi_test_clobber_ebx: -.L_abi_test_clobber_ebx_begin: - xorl %ebx,%ebx - ret -.size 
abi_test_clobber_ebx,.-.L_abi_test_clobber_ebx_begin -.globl abi_test_clobber_ecx -.hidden abi_test_clobber_ecx -.type abi_test_clobber_ecx,@function -.align 16 -abi_test_clobber_ecx: -.L_abi_test_clobber_ecx_begin: - xorl %ecx,%ecx - ret -.size abi_test_clobber_ecx,.-.L_abi_test_clobber_ecx_begin -.globl abi_test_clobber_edx -.hidden abi_test_clobber_edx -.type abi_test_clobber_edx,@function -.align 16 -abi_test_clobber_edx: -.L_abi_test_clobber_edx_begin: - xorl %edx,%edx - ret -.size abi_test_clobber_edx,.-.L_abi_test_clobber_edx_begin -.globl abi_test_clobber_edi -.hidden abi_test_clobber_edi -.type abi_test_clobber_edi,@function -.align 16 -abi_test_clobber_edi: -.L_abi_test_clobber_edi_begin: - xorl %edi,%edi - ret -.size abi_test_clobber_edi,.-.L_abi_test_clobber_edi_begin -.globl abi_test_clobber_esi -.hidden abi_test_clobber_esi -.type abi_test_clobber_esi,@function -.align 16 -abi_test_clobber_esi: -.L_abi_test_clobber_esi_begin: - xorl %esi,%esi - ret -.size abi_test_clobber_esi,.-.L_abi_test_clobber_esi_begin -.globl abi_test_clobber_ebp -.hidden abi_test_clobber_ebp -.type abi_test_clobber_ebp,@function -.align 16 -abi_test_clobber_ebp: -.L_abi_test_clobber_ebp_begin: - xorl %ebp,%ebp - ret -.size abi_test_clobber_ebp,.-.L_abi_test_clobber_ebp_begin -.globl abi_test_clobber_xmm0 -.hidden abi_test_clobber_xmm0 -.type abi_test_clobber_xmm0,@function -.align 16 -abi_test_clobber_xmm0: -.L_abi_test_clobber_xmm0_begin: - pxor %xmm0,%xmm0 - ret -.size abi_test_clobber_xmm0,.-.L_abi_test_clobber_xmm0_begin -.globl abi_test_clobber_xmm1 -.hidden abi_test_clobber_xmm1 -.type abi_test_clobber_xmm1,@function -.align 16 -abi_test_clobber_xmm1: -.L_abi_test_clobber_xmm1_begin: - pxor %xmm1,%xmm1 - ret -.size abi_test_clobber_xmm1,.-.L_abi_test_clobber_xmm1_begin -.globl abi_test_clobber_xmm2 -.hidden abi_test_clobber_xmm2 -.type abi_test_clobber_xmm2,@function -.align 16 -abi_test_clobber_xmm2: -.L_abi_test_clobber_xmm2_begin: - pxor %xmm2,%xmm2 - ret -.size 
abi_test_clobber_xmm2,.-.L_abi_test_clobber_xmm2_begin -.globl abi_test_clobber_xmm3 -.hidden abi_test_clobber_xmm3 -.type abi_test_clobber_xmm3,@function -.align 16 -abi_test_clobber_xmm3: -.L_abi_test_clobber_xmm3_begin: - pxor %xmm3,%xmm3 - ret -.size abi_test_clobber_xmm3,.-.L_abi_test_clobber_xmm3_begin -.globl abi_test_clobber_xmm4 -.hidden abi_test_clobber_xmm4 -.type abi_test_clobber_xmm4,@function -.align 16 -abi_test_clobber_xmm4: -.L_abi_test_clobber_xmm4_begin: - pxor %xmm4,%xmm4 - ret -.size abi_test_clobber_xmm4,.-.L_abi_test_clobber_xmm4_begin -.globl abi_test_clobber_xmm5 -.hidden abi_test_clobber_xmm5 -.type abi_test_clobber_xmm5,@function -.align 16 -abi_test_clobber_xmm5: -.L_abi_test_clobber_xmm5_begin: - pxor %xmm5,%xmm5 - ret -.size abi_test_clobber_xmm5,.-.L_abi_test_clobber_xmm5_begin -.globl abi_test_clobber_xmm6 -.hidden abi_test_clobber_xmm6 -.type abi_test_clobber_xmm6,@function -.align 16 -abi_test_clobber_xmm6: -.L_abi_test_clobber_xmm6_begin: - pxor %xmm6,%xmm6 - ret -.size abi_test_clobber_xmm6,.-.L_abi_test_clobber_xmm6_begin -.globl abi_test_clobber_xmm7 -.hidden abi_test_clobber_xmm7 -.type abi_test_clobber_xmm7,@function -.align 16 -abi_test_clobber_xmm7: -.L_abi_test_clobber_xmm7_begin: - pxor %xmm7,%xmm7 - ret -.size abi_test_clobber_xmm7,.-.L_abi_test_clobber_xmm7_begin -#endif -.section .note.GNU-stack,"",@progbits diff --git a/third_party/boringssl/linux-x86_64/crypto/chacha/chacha-x86_64.S b/third_party/boringssl/linux-x86_64/crypto/chacha/chacha-x86_64.S deleted file mode 100644 index b862f4e9..00000000 --- a/third_party/boringssl/linux-x86_64/crypto/chacha/chacha-x86_64.S +++ /dev/null @@ -1,1633 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text - -.extern OPENSSL_ia32cap_P -.hidden OPENSSL_ia32cap_P - -.align 64 -.Lzero: -.long 0,0,0,0 -.Lone: -.long 1,0,0,0 -.Linc: -.long 0,1,2,3 -.Lfour: -.long 4,4,4,4 -.Lincy: -.long 0,2,4,6,1,3,5,7 -.Leight: -.long 8,8,8,8,8,8,8,8 -.Lrot16: -.byte 0x2,0x3,0x0,0x1, 0x6,0x7,0x4,0x5, 0xa,0xb,0x8,0x9, 0xe,0xf,0xc,0xd -.Lrot24: -.byte 0x3,0x0,0x1,0x2, 0x7,0x4,0x5,0x6, 0xb,0x8,0x9,0xa, 0xf,0xc,0xd,0xe -.Lsigma: -.byte 101,120,112,97,110,100,32,51,50,45,98,121,116,101,32,107,0 -.align 64 -.Lzeroz: -.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0 -.Lfourz: -.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0 -.Lincz: -.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 -.Lsixteen: -.long 16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16 -.byte 67,104,97,67,104,97,50,48,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.globl ChaCha20_ctr32 -.hidden ChaCha20_ctr32 -.type ChaCha20_ctr32,@function -.align 64 -ChaCha20_ctr32: -.cfi_startproc - cmpq $0,%rdx - je .Lno_data - movq OPENSSL_ia32cap_P+4(%rip),%r10 - testl $512,%r10d - jnz .LChaCha20_ssse3 - - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset rbx,-16 - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset rbp,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset r15,-56 - subq $64+24,%rsp -.cfi_adjust_cfa_offset 88 -.Lctr32_body: - - - movdqu (%rcx),%xmm1 - movdqu 16(%rcx),%xmm2 - movdqu (%r8),%xmm3 - movdqa .Lone(%rip),%xmm4 - - - movdqa %xmm1,16(%rsp) - movdqa %xmm2,32(%rsp) - movdqa %xmm3,48(%rsp) - movq %rdx,%rbp - jmp 
.Loop_outer - -.align 32 -.Loop_outer: - movl $0x61707865,%eax - movl $0x3320646e,%ebx - movl $0x79622d32,%ecx - movl $0x6b206574,%edx - movl 16(%rsp),%r8d - movl 20(%rsp),%r9d - movl 24(%rsp),%r10d - movl 28(%rsp),%r11d - movd %xmm3,%r12d - movl 52(%rsp),%r13d - movl 56(%rsp),%r14d - movl 60(%rsp),%r15d - - movq %rbp,64+0(%rsp) - movl $10,%ebp - movq %rsi,64+8(%rsp) -.byte 102,72,15,126,214 - movq %rdi,64+16(%rsp) - movq %rsi,%rdi - shrq $32,%rdi - jmp .Loop - -.align 32 -.Loop: - addl %r8d,%eax - xorl %eax,%r12d - roll $16,%r12d - addl %r9d,%ebx - xorl %ebx,%r13d - roll $16,%r13d - addl %r12d,%esi - xorl %esi,%r8d - roll $12,%r8d - addl %r13d,%edi - xorl %edi,%r9d - roll $12,%r9d - addl %r8d,%eax - xorl %eax,%r12d - roll $8,%r12d - addl %r9d,%ebx - xorl %ebx,%r13d - roll $8,%r13d - addl %r12d,%esi - xorl %esi,%r8d - roll $7,%r8d - addl %r13d,%edi - xorl %edi,%r9d - roll $7,%r9d - movl %esi,32(%rsp) - movl %edi,36(%rsp) - movl 40(%rsp),%esi - movl 44(%rsp),%edi - addl %r10d,%ecx - xorl %ecx,%r14d - roll $16,%r14d - addl %r11d,%edx - xorl %edx,%r15d - roll $16,%r15d - addl %r14d,%esi - xorl %esi,%r10d - roll $12,%r10d - addl %r15d,%edi - xorl %edi,%r11d - roll $12,%r11d - addl %r10d,%ecx - xorl %ecx,%r14d - roll $8,%r14d - addl %r11d,%edx - xorl %edx,%r15d - roll $8,%r15d - addl %r14d,%esi - xorl %esi,%r10d - roll $7,%r10d - addl %r15d,%edi - xorl %edi,%r11d - roll $7,%r11d - addl %r9d,%eax - xorl %eax,%r15d - roll $16,%r15d - addl %r10d,%ebx - xorl %ebx,%r12d - roll $16,%r12d - addl %r15d,%esi - xorl %esi,%r9d - roll $12,%r9d - addl %r12d,%edi - xorl %edi,%r10d - roll $12,%r10d - addl %r9d,%eax - xorl %eax,%r15d - roll $8,%r15d - addl %r10d,%ebx - xorl %ebx,%r12d - roll $8,%r12d - addl %r15d,%esi - xorl %esi,%r9d - roll $7,%r9d - addl %r12d,%edi - xorl %edi,%r10d - roll $7,%r10d - movl %esi,40(%rsp) - movl %edi,44(%rsp) - movl 32(%rsp),%esi - movl 36(%rsp),%edi - addl %r11d,%ecx - xorl %ecx,%r13d - roll $16,%r13d - addl %r8d,%edx - xorl %edx,%r14d - roll $16,%r14d 
- addl %r13d,%esi - xorl %esi,%r11d - roll $12,%r11d - addl %r14d,%edi - xorl %edi,%r8d - roll $12,%r8d - addl %r11d,%ecx - xorl %ecx,%r13d - roll $8,%r13d - addl %r8d,%edx - xorl %edx,%r14d - roll $8,%r14d - addl %r13d,%esi - xorl %esi,%r11d - roll $7,%r11d - addl %r14d,%edi - xorl %edi,%r8d - roll $7,%r8d - decl %ebp - jnz .Loop - movl %edi,36(%rsp) - movl %esi,32(%rsp) - movq 64(%rsp),%rbp - movdqa %xmm2,%xmm1 - movq 64+8(%rsp),%rsi - paddd %xmm4,%xmm3 - movq 64+16(%rsp),%rdi - - addl $0x61707865,%eax - addl $0x3320646e,%ebx - addl $0x79622d32,%ecx - addl $0x6b206574,%edx - addl 16(%rsp),%r8d - addl 20(%rsp),%r9d - addl 24(%rsp),%r10d - addl 28(%rsp),%r11d - addl 48(%rsp),%r12d - addl 52(%rsp),%r13d - addl 56(%rsp),%r14d - addl 60(%rsp),%r15d - paddd 32(%rsp),%xmm1 - - cmpq $64,%rbp - jb .Ltail - - xorl 0(%rsi),%eax - xorl 4(%rsi),%ebx - xorl 8(%rsi),%ecx - xorl 12(%rsi),%edx - xorl 16(%rsi),%r8d - xorl 20(%rsi),%r9d - xorl 24(%rsi),%r10d - xorl 28(%rsi),%r11d - movdqu 32(%rsi),%xmm0 - xorl 48(%rsi),%r12d - xorl 52(%rsi),%r13d - xorl 56(%rsi),%r14d - xorl 60(%rsi),%r15d - leaq 64(%rsi),%rsi - pxor %xmm1,%xmm0 - - movdqa %xmm2,32(%rsp) - movd %xmm3,48(%rsp) - - movl %eax,0(%rdi) - movl %ebx,4(%rdi) - movl %ecx,8(%rdi) - movl %edx,12(%rdi) - movl %r8d,16(%rdi) - movl %r9d,20(%rdi) - movl %r10d,24(%rdi) - movl %r11d,28(%rdi) - movdqu %xmm0,32(%rdi) - movl %r12d,48(%rdi) - movl %r13d,52(%rdi) - movl %r14d,56(%rdi) - movl %r15d,60(%rdi) - leaq 64(%rdi),%rdi - - subq $64,%rbp - jnz .Loop_outer - - jmp .Ldone - -.align 16 -.Ltail: - movl %eax,0(%rsp) - movl %ebx,4(%rsp) - xorq %rbx,%rbx - movl %ecx,8(%rsp) - movl %edx,12(%rsp) - movl %r8d,16(%rsp) - movl %r9d,20(%rsp) - movl %r10d,24(%rsp) - movl %r11d,28(%rsp) - movdqa %xmm1,32(%rsp) - movl %r12d,48(%rsp) - movl %r13d,52(%rsp) - movl %r14d,56(%rsp) - movl %r15d,60(%rsp) - -.Loop_tail: - movzbl (%rsi,%rbx,1),%eax - movzbl (%rsp,%rbx,1),%edx - leaq 1(%rbx),%rbx - xorl %edx,%eax - movb %al,-1(%rdi,%rbx,1) - decq %rbp - 
jnz .Loop_tail - -.Ldone: - leaq 64+24+48(%rsp),%rsi - movq -48(%rsi),%r15 -.cfi_restore r15 - movq -40(%rsi),%r14 -.cfi_restore r14 - movq -32(%rsi),%r13 -.cfi_restore r13 - movq -24(%rsi),%r12 -.cfi_restore r12 - movq -16(%rsi),%rbp -.cfi_restore rbp - movq -8(%rsi),%rbx -.cfi_restore rbx - leaq (%rsi),%rsp -.cfi_adjust_cfa_offset -136 -.Lno_data: - .byte 0xf3,0xc3 -.cfi_endproc -.size ChaCha20_ctr32,.-ChaCha20_ctr32 -.type ChaCha20_ssse3,@function -.align 32 -ChaCha20_ssse3: -.LChaCha20_ssse3: -.cfi_startproc - movq %rsp,%r9 -.cfi_def_cfa_register r9 - cmpq $128,%rdx - ja .LChaCha20_4x - -.Ldo_sse3_after_all: - subq $64+8,%rsp - movdqa .Lsigma(%rip),%xmm0 - movdqu (%rcx),%xmm1 - movdqu 16(%rcx),%xmm2 - movdqu (%r8),%xmm3 - movdqa .Lrot16(%rip),%xmm6 - movdqa .Lrot24(%rip),%xmm7 - - movdqa %xmm0,0(%rsp) - movdqa %xmm1,16(%rsp) - movdqa %xmm2,32(%rsp) - movdqa %xmm3,48(%rsp) - movq $10,%r8 - jmp .Loop_ssse3 - -.align 32 -.Loop_outer_ssse3: - movdqa .Lone(%rip),%xmm3 - movdqa 0(%rsp),%xmm0 - movdqa 16(%rsp),%xmm1 - movdqa 32(%rsp),%xmm2 - paddd 48(%rsp),%xmm3 - movq $10,%r8 - movdqa %xmm3,48(%rsp) - jmp .Loop_ssse3 - -.align 32 -.Loop_ssse3: - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 -.byte 102,15,56,0,222 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm4 - psrld $20,%xmm1 - pslld $12,%xmm4 - por %xmm4,%xmm1 - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 -.byte 102,15,56,0,223 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm4 - psrld $25,%xmm1 - pslld $7,%xmm4 - por %xmm4,%xmm1 - pshufd $78,%xmm2,%xmm2 - pshufd $57,%xmm1,%xmm1 - pshufd $147,%xmm3,%xmm3 - nop - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 -.byte 102,15,56,0,222 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm4 - psrld $20,%xmm1 - pslld $12,%xmm4 - por %xmm4,%xmm1 - paddd %xmm1,%xmm0 - pxor %xmm0,%xmm3 -.byte 102,15,56,0,223 - paddd %xmm3,%xmm2 - pxor %xmm2,%xmm1 - movdqa %xmm1,%xmm4 - psrld $25,%xmm1 - pslld $7,%xmm4 - por %xmm4,%xmm1 - pshufd $78,%xmm2,%xmm2 - pshufd $147,%xmm1,%xmm1 - 
pshufd $57,%xmm3,%xmm3 - decq %r8 - jnz .Loop_ssse3 - paddd 0(%rsp),%xmm0 - paddd 16(%rsp),%xmm1 - paddd 32(%rsp),%xmm2 - paddd 48(%rsp),%xmm3 - - cmpq $64,%rdx - jb .Ltail_ssse3 - - movdqu 0(%rsi),%xmm4 - movdqu 16(%rsi),%xmm5 - pxor %xmm4,%xmm0 - movdqu 32(%rsi),%xmm4 - pxor %xmm5,%xmm1 - movdqu 48(%rsi),%xmm5 - leaq 64(%rsi),%rsi - pxor %xmm4,%xmm2 - pxor %xmm5,%xmm3 - - movdqu %xmm0,0(%rdi) - movdqu %xmm1,16(%rdi) - movdqu %xmm2,32(%rdi) - movdqu %xmm3,48(%rdi) - leaq 64(%rdi),%rdi - - subq $64,%rdx - jnz .Loop_outer_ssse3 - - jmp .Ldone_ssse3 - -.align 16 -.Ltail_ssse3: - movdqa %xmm0,0(%rsp) - movdqa %xmm1,16(%rsp) - movdqa %xmm2,32(%rsp) - movdqa %xmm3,48(%rsp) - xorq %r8,%r8 - -.Loop_tail_ssse3: - movzbl (%rsi,%r8,1),%eax - movzbl (%rsp,%r8,1),%ecx - leaq 1(%r8),%r8 - xorl %ecx,%eax - movb %al,-1(%rdi,%r8,1) - decq %rdx - jnz .Loop_tail_ssse3 - -.Ldone_ssse3: - leaq (%r9),%rsp -.cfi_def_cfa_register rsp -.Lssse3_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size ChaCha20_ssse3,.-ChaCha20_ssse3 -.type ChaCha20_4x,@function -.align 32 -ChaCha20_4x: -.LChaCha20_4x: -.cfi_startproc - movq %rsp,%r9 -.cfi_def_cfa_register r9 - movq %r10,%r11 - shrq $32,%r10 - testq $32,%r10 - jnz .LChaCha20_8x - cmpq $192,%rdx - ja .Lproceed4x - - andq $71303168,%r11 - cmpq $4194304,%r11 - je .Ldo_sse3_after_all - -.Lproceed4x: - subq $0x140+8,%rsp - movdqa .Lsigma(%rip),%xmm11 - movdqu (%rcx),%xmm15 - movdqu 16(%rcx),%xmm7 - movdqu (%r8),%xmm3 - leaq 256(%rsp),%rcx - leaq .Lrot16(%rip),%r10 - leaq .Lrot24(%rip),%r11 - - pshufd $0x00,%xmm11,%xmm8 - pshufd $0x55,%xmm11,%xmm9 - movdqa %xmm8,64(%rsp) - pshufd $0xaa,%xmm11,%xmm10 - movdqa %xmm9,80(%rsp) - pshufd $0xff,%xmm11,%xmm11 - movdqa %xmm10,96(%rsp) - movdqa %xmm11,112(%rsp) - - pshufd $0x00,%xmm15,%xmm12 - pshufd $0x55,%xmm15,%xmm13 - movdqa %xmm12,128-256(%rcx) - pshufd $0xaa,%xmm15,%xmm14 - movdqa %xmm13,144-256(%rcx) - pshufd $0xff,%xmm15,%xmm15 - movdqa %xmm14,160-256(%rcx) - movdqa %xmm15,176-256(%rcx) - - pshufd 
$0x00,%xmm7,%xmm4 - pshufd $0x55,%xmm7,%xmm5 - movdqa %xmm4,192-256(%rcx) - pshufd $0xaa,%xmm7,%xmm6 - movdqa %xmm5,208-256(%rcx) - pshufd $0xff,%xmm7,%xmm7 - movdqa %xmm6,224-256(%rcx) - movdqa %xmm7,240-256(%rcx) - - pshufd $0x00,%xmm3,%xmm0 - pshufd $0x55,%xmm3,%xmm1 - paddd .Linc(%rip),%xmm0 - pshufd $0xaa,%xmm3,%xmm2 - movdqa %xmm1,272-256(%rcx) - pshufd $0xff,%xmm3,%xmm3 - movdqa %xmm2,288-256(%rcx) - movdqa %xmm3,304-256(%rcx) - - jmp .Loop_enter4x - -.align 32 -.Loop_outer4x: - movdqa 64(%rsp),%xmm8 - movdqa 80(%rsp),%xmm9 - movdqa 96(%rsp),%xmm10 - movdqa 112(%rsp),%xmm11 - movdqa 128-256(%rcx),%xmm12 - movdqa 144-256(%rcx),%xmm13 - movdqa 160-256(%rcx),%xmm14 - movdqa 176-256(%rcx),%xmm15 - movdqa 192-256(%rcx),%xmm4 - movdqa 208-256(%rcx),%xmm5 - movdqa 224-256(%rcx),%xmm6 - movdqa 240-256(%rcx),%xmm7 - movdqa 256-256(%rcx),%xmm0 - movdqa 272-256(%rcx),%xmm1 - movdqa 288-256(%rcx),%xmm2 - movdqa 304-256(%rcx),%xmm3 - paddd .Lfour(%rip),%xmm0 - -.Loop_enter4x: - movdqa %xmm6,32(%rsp) - movdqa %xmm7,48(%rsp) - movdqa (%r10),%xmm7 - movl $10,%eax - movdqa %xmm0,256-256(%rcx) - jmp .Loop4x - -.align 32 -.Loop4x: - paddd %xmm12,%xmm8 - paddd %xmm13,%xmm9 - pxor %xmm8,%xmm0 - pxor %xmm9,%xmm1 -.byte 102,15,56,0,199 -.byte 102,15,56,0,207 - paddd %xmm0,%xmm4 - paddd %xmm1,%xmm5 - pxor %xmm4,%xmm12 - pxor %xmm5,%xmm13 - movdqa %xmm12,%xmm6 - pslld $12,%xmm12 - psrld $20,%xmm6 - movdqa %xmm13,%xmm7 - pslld $12,%xmm13 - por %xmm6,%xmm12 - psrld $20,%xmm7 - movdqa (%r11),%xmm6 - por %xmm7,%xmm13 - paddd %xmm12,%xmm8 - paddd %xmm13,%xmm9 - pxor %xmm8,%xmm0 - pxor %xmm9,%xmm1 -.byte 102,15,56,0,198 -.byte 102,15,56,0,206 - paddd %xmm0,%xmm4 - paddd %xmm1,%xmm5 - pxor %xmm4,%xmm12 - pxor %xmm5,%xmm13 - movdqa %xmm12,%xmm7 - pslld $7,%xmm12 - psrld $25,%xmm7 - movdqa %xmm13,%xmm6 - pslld $7,%xmm13 - por %xmm7,%xmm12 - psrld $25,%xmm6 - movdqa (%r10),%xmm7 - por %xmm6,%xmm13 - movdqa %xmm4,0(%rsp) - movdqa %xmm5,16(%rsp) - movdqa 32(%rsp),%xmm4 - movdqa 48(%rsp),%xmm5 - 
paddd %xmm14,%xmm10 - paddd %xmm15,%xmm11 - pxor %xmm10,%xmm2 - pxor %xmm11,%xmm3 -.byte 102,15,56,0,215 -.byte 102,15,56,0,223 - paddd %xmm2,%xmm4 - paddd %xmm3,%xmm5 - pxor %xmm4,%xmm14 - pxor %xmm5,%xmm15 - movdqa %xmm14,%xmm6 - pslld $12,%xmm14 - psrld $20,%xmm6 - movdqa %xmm15,%xmm7 - pslld $12,%xmm15 - por %xmm6,%xmm14 - psrld $20,%xmm7 - movdqa (%r11),%xmm6 - por %xmm7,%xmm15 - paddd %xmm14,%xmm10 - paddd %xmm15,%xmm11 - pxor %xmm10,%xmm2 - pxor %xmm11,%xmm3 -.byte 102,15,56,0,214 -.byte 102,15,56,0,222 - paddd %xmm2,%xmm4 - paddd %xmm3,%xmm5 - pxor %xmm4,%xmm14 - pxor %xmm5,%xmm15 - movdqa %xmm14,%xmm7 - pslld $7,%xmm14 - psrld $25,%xmm7 - movdqa %xmm15,%xmm6 - pslld $7,%xmm15 - por %xmm7,%xmm14 - psrld $25,%xmm6 - movdqa (%r10),%xmm7 - por %xmm6,%xmm15 - paddd %xmm13,%xmm8 - paddd %xmm14,%xmm9 - pxor %xmm8,%xmm3 - pxor %xmm9,%xmm0 -.byte 102,15,56,0,223 -.byte 102,15,56,0,199 - paddd %xmm3,%xmm4 - paddd %xmm0,%xmm5 - pxor %xmm4,%xmm13 - pxor %xmm5,%xmm14 - movdqa %xmm13,%xmm6 - pslld $12,%xmm13 - psrld $20,%xmm6 - movdqa %xmm14,%xmm7 - pslld $12,%xmm14 - por %xmm6,%xmm13 - psrld $20,%xmm7 - movdqa (%r11),%xmm6 - por %xmm7,%xmm14 - paddd %xmm13,%xmm8 - paddd %xmm14,%xmm9 - pxor %xmm8,%xmm3 - pxor %xmm9,%xmm0 -.byte 102,15,56,0,222 -.byte 102,15,56,0,198 - paddd %xmm3,%xmm4 - paddd %xmm0,%xmm5 - pxor %xmm4,%xmm13 - pxor %xmm5,%xmm14 - movdqa %xmm13,%xmm7 - pslld $7,%xmm13 - psrld $25,%xmm7 - movdqa %xmm14,%xmm6 - pslld $7,%xmm14 - por %xmm7,%xmm13 - psrld $25,%xmm6 - movdqa (%r10),%xmm7 - por %xmm6,%xmm14 - movdqa %xmm4,32(%rsp) - movdqa %xmm5,48(%rsp) - movdqa 0(%rsp),%xmm4 - movdqa 16(%rsp),%xmm5 - paddd %xmm15,%xmm10 - paddd %xmm12,%xmm11 - pxor %xmm10,%xmm1 - pxor %xmm11,%xmm2 -.byte 102,15,56,0,207 -.byte 102,15,56,0,215 - paddd %xmm1,%xmm4 - paddd %xmm2,%xmm5 - pxor %xmm4,%xmm15 - pxor %xmm5,%xmm12 - movdqa %xmm15,%xmm6 - pslld $12,%xmm15 - psrld $20,%xmm6 - movdqa %xmm12,%xmm7 - pslld $12,%xmm12 - por %xmm6,%xmm15 - psrld $20,%xmm7 - movdqa 
(%r11),%xmm6 - por %xmm7,%xmm12 - paddd %xmm15,%xmm10 - paddd %xmm12,%xmm11 - pxor %xmm10,%xmm1 - pxor %xmm11,%xmm2 -.byte 102,15,56,0,206 -.byte 102,15,56,0,214 - paddd %xmm1,%xmm4 - paddd %xmm2,%xmm5 - pxor %xmm4,%xmm15 - pxor %xmm5,%xmm12 - movdqa %xmm15,%xmm7 - pslld $7,%xmm15 - psrld $25,%xmm7 - movdqa %xmm12,%xmm6 - pslld $7,%xmm12 - por %xmm7,%xmm15 - psrld $25,%xmm6 - movdqa (%r10),%xmm7 - por %xmm6,%xmm12 - decl %eax - jnz .Loop4x - - paddd 64(%rsp),%xmm8 - paddd 80(%rsp),%xmm9 - paddd 96(%rsp),%xmm10 - paddd 112(%rsp),%xmm11 - - movdqa %xmm8,%xmm6 - punpckldq %xmm9,%xmm8 - movdqa %xmm10,%xmm7 - punpckldq %xmm11,%xmm10 - punpckhdq %xmm9,%xmm6 - punpckhdq %xmm11,%xmm7 - movdqa %xmm8,%xmm9 - punpcklqdq %xmm10,%xmm8 - movdqa %xmm6,%xmm11 - punpcklqdq %xmm7,%xmm6 - punpckhqdq %xmm10,%xmm9 - punpckhqdq %xmm7,%xmm11 - paddd 128-256(%rcx),%xmm12 - paddd 144-256(%rcx),%xmm13 - paddd 160-256(%rcx),%xmm14 - paddd 176-256(%rcx),%xmm15 - - movdqa %xmm8,0(%rsp) - movdqa %xmm9,16(%rsp) - movdqa 32(%rsp),%xmm8 - movdqa 48(%rsp),%xmm9 - - movdqa %xmm12,%xmm10 - punpckldq %xmm13,%xmm12 - movdqa %xmm14,%xmm7 - punpckldq %xmm15,%xmm14 - punpckhdq %xmm13,%xmm10 - punpckhdq %xmm15,%xmm7 - movdqa %xmm12,%xmm13 - punpcklqdq %xmm14,%xmm12 - movdqa %xmm10,%xmm15 - punpcklqdq %xmm7,%xmm10 - punpckhqdq %xmm14,%xmm13 - punpckhqdq %xmm7,%xmm15 - paddd 192-256(%rcx),%xmm4 - paddd 208-256(%rcx),%xmm5 - paddd 224-256(%rcx),%xmm8 - paddd 240-256(%rcx),%xmm9 - - movdqa %xmm6,32(%rsp) - movdqa %xmm11,48(%rsp) - - movdqa %xmm4,%xmm14 - punpckldq %xmm5,%xmm4 - movdqa %xmm8,%xmm7 - punpckldq %xmm9,%xmm8 - punpckhdq %xmm5,%xmm14 - punpckhdq %xmm9,%xmm7 - movdqa %xmm4,%xmm5 - punpcklqdq %xmm8,%xmm4 - movdqa %xmm14,%xmm9 - punpcklqdq %xmm7,%xmm14 - punpckhqdq %xmm8,%xmm5 - punpckhqdq %xmm7,%xmm9 - paddd 256-256(%rcx),%xmm0 - paddd 272-256(%rcx),%xmm1 - paddd 288-256(%rcx),%xmm2 - paddd 304-256(%rcx),%xmm3 - - movdqa %xmm0,%xmm8 - punpckldq %xmm1,%xmm0 - movdqa %xmm2,%xmm7 - punpckldq %xmm3,%xmm2 
- punpckhdq %xmm1,%xmm8 - punpckhdq %xmm3,%xmm7 - movdqa %xmm0,%xmm1 - punpcklqdq %xmm2,%xmm0 - movdqa %xmm8,%xmm3 - punpcklqdq %xmm7,%xmm8 - punpckhqdq %xmm2,%xmm1 - punpckhqdq %xmm7,%xmm3 - cmpq $256,%rdx - jb .Ltail4x - - movdqu 0(%rsi),%xmm6 - movdqu 16(%rsi),%xmm11 - movdqu 32(%rsi),%xmm2 - movdqu 48(%rsi),%xmm7 - pxor 0(%rsp),%xmm6 - pxor %xmm12,%xmm11 - pxor %xmm4,%xmm2 - pxor %xmm0,%xmm7 - - movdqu %xmm6,0(%rdi) - movdqu 64(%rsi),%xmm6 - movdqu %xmm11,16(%rdi) - movdqu 80(%rsi),%xmm11 - movdqu %xmm2,32(%rdi) - movdqu 96(%rsi),%xmm2 - movdqu %xmm7,48(%rdi) - movdqu 112(%rsi),%xmm7 - leaq 128(%rsi),%rsi - pxor 16(%rsp),%xmm6 - pxor %xmm13,%xmm11 - pxor %xmm5,%xmm2 - pxor %xmm1,%xmm7 - - movdqu %xmm6,64(%rdi) - movdqu 0(%rsi),%xmm6 - movdqu %xmm11,80(%rdi) - movdqu 16(%rsi),%xmm11 - movdqu %xmm2,96(%rdi) - movdqu 32(%rsi),%xmm2 - movdqu %xmm7,112(%rdi) - leaq 128(%rdi),%rdi - movdqu 48(%rsi),%xmm7 - pxor 32(%rsp),%xmm6 - pxor %xmm10,%xmm11 - pxor %xmm14,%xmm2 - pxor %xmm8,%xmm7 - - movdqu %xmm6,0(%rdi) - movdqu 64(%rsi),%xmm6 - movdqu %xmm11,16(%rdi) - movdqu 80(%rsi),%xmm11 - movdqu %xmm2,32(%rdi) - movdqu 96(%rsi),%xmm2 - movdqu %xmm7,48(%rdi) - movdqu 112(%rsi),%xmm7 - leaq 128(%rsi),%rsi - pxor 48(%rsp),%xmm6 - pxor %xmm15,%xmm11 - pxor %xmm9,%xmm2 - pxor %xmm3,%xmm7 - movdqu %xmm6,64(%rdi) - movdqu %xmm11,80(%rdi) - movdqu %xmm2,96(%rdi) - movdqu %xmm7,112(%rdi) - leaq 128(%rdi),%rdi - - subq $256,%rdx - jnz .Loop_outer4x - - jmp .Ldone4x - -.Ltail4x: - cmpq $192,%rdx - jae .L192_or_more4x - cmpq $128,%rdx - jae .L128_or_more4x - cmpq $64,%rdx - jae .L64_or_more4x - - - xorq %r10,%r10 - - movdqa %xmm12,16(%rsp) - movdqa %xmm4,32(%rsp) - movdqa %xmm0,48(%rsp) - jmp .Loop_tail4x - -.align 32 -.L64_or_more4x: - movdqu 0(%rsi),%xmm6 - movdqu 16(%rsi),%xmm11 - movdqu 32(%rsi),%xmm2 - movdqu 48(%rsi),%xmm7 - pxor 0(%rsp),%xmm6 - pxor %xmm12,%xmm11 - pxor %xmm4,%xmm2 - pxor %xmm0,%xmm7 - movdqu %xmm6,0(%rdi) - movdqu %xmm11,16(%rdi) - movdqu %xmm2,32(%rdi) - 
movdqu %xmm7,48(%rdi) - je .Ldone4x - - movdqa 16(%rsp),%xmm6 - leaq 64(%rsi),%rsi - xorq %r10,%r10 - movdqa %xmm6,0(%rsp) - movdqa %xmm13,16(%rsp) - leaq 64(%rdi),%rdi - movdqa %xmm5,32(%rsp) - subq $64,%rdx - movdqa %xmm1,48(%rsp) - jmp .Loop_tail4x - -.align 32 -.L128_or_more4x: - movdqu 0(%rsi),%xmm6 - movdqu 16(%rsi),%xmm11 - movdqu 32(%rsi),%xmm2 - movdqu 48(%rsi),%xmm7 - pxor 0(%rsp),%xmm6 - pxor %xmm12,%xmm11 - pxor %xmm4,%xmm2 - pxor %xmm0,%xmm7 - - movdqu %xmm6,0(%rdi) - movdqu 64(%rsi),%xmm6 - movdqu %xmm11,16(%rdi) - movdqu 80(%rsi),%xmm11 - movdqu %xmm2,32(%rdi) - movdqu 96(%rsi),%xmm2 - movdqu %xmm7,48(%rdi) - movdqu 112(%rsi),%xmm7 - pxor 16(%rsp),%xmm6 - pxor %xmm13,%xmm11 - pxor %xmm5,%xmm2 - pxor %xmm1,%xmm7 - movdqu %xmm6,64(%rdi) - movdqu %xmm11,80(%rdi) - movdqu %xmm2,96(%rdi) - movdqu %xmm7,112(%rdi) - je .Ldone4x - - movdqa 32(%rsp),%xmm6 - leaq 128(%rsi),%rsi - xorq %r10,%r10 - movdqa %xmm6,0(%rsp) - movdqa %xmm10,16(%rsp) - leaq 128(%rdi),%rdi - movdqa %xmm14,32(%rsp) - subq $128,%rdx - movdqa %xmm8,48(%rsp) - jmp .Loop_tail4x - -.align 32 -.L192_or_more4x: - movdqu 0(%rsi),%xmm6 - movdqu 16(%rsi),%xmm11 - movdqu 32(%rsi),%xmm2 - movdqu 48(%rsi),%xmm7 - pxor 0(%rsp),%xmm6 - pxor %xmm12,%xmm11 - pxor %xmm4,%xmm2 - pxor %xmm0,%xmm7 - - movdqu %xmm6,0(%rdi) - movdqu 64(%rsi),%xmm6 - movdqu %xmm11,16(%rdi) - movdqu 80(%rsi),%xmm11 - movdqu %xmm2,32(%rdi) - movdqu 96(%rsi),%xmm2 - movdqu %xmm7,48(%rdi) - movdqu 112(%rsi),%xmm7 - leaq 128(%rsi),%rsi - pxor 16(%rsp),%xmm6 - pxor %xmm13,%xmm11 - pxor %xmm5,%xmm2 - pxor %xmm1,%xmm7 - - movdqu %xmm6,64(%rdi) - movdqu 0(%rsi),%xmm6 - movdqu %xmm11,80(%rdi) - movdqu 16(%rsi),%xmm11 - movdqu %xmm2,96(%rdi) - movdqu 32(%rsi),%xmm2 - movdqu %xmm7,112(%rdi) - leaq 128(%rdi),%rdi - movdqu 48(%rsi),%xmm7 - pxor 32(%rsp),%xmm6 - pxor %xmm10,%xmm11 - pxor %xmm14,%xmm2 - pxor %xmm8,%xmm7 - movdqu %xmm6,0(%rdi) - movdqu %xmm11,16(%rdi) - movdqu %xmm2,32(%rdi) - movdqu %xmm7,48(%rdi) - je .Ldone4x - - movdqa 
48(%rsp),%xmm6 - leaq 64(%rsi),%rsi - xorq %r10,%r10 - movdqa %xmm6,0(%rsp) - movdqa %xmm15,16(%rsp) - leaq 64(%rdi),%rdi - movdqa %xmm9,32(%rsp) - subq $192,%rdx - movdqa %xmm3,48(%rsp) - -.Loop_tail4x: - movzbl (%rsi,%r10,1),%eax - movzbl (%rsp,%r10,1),%ecx - leaq 1(%r10),%r10 - xorl %ecx,%eax - movb %al,-1(%rdi,%r10,1) - decq %rdx - jnz .Loop_tail4x - -.Ldone4x: - leaq (%r9),%rsp -.cfi_def_cfa_register rsp -.L4x_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size ChaCha20_4x,.-ChaCha20_4x -.type ChaCha20_8x,@function -.align 32 -ChaCha20_8x: -.LChaCha20_8x: -.cfi_startproc - movq %rsp,%r9 -.cfi_def_cfa_register r9 - subq $0x280+8,%rsp - andq $-32,%rsp - vzeroupper - - - - - - - - - - - vbroadcasti128 .Lsigma(%rip),%ymm11 - vbroadcasti128 (%rcx),%ymm3 - vbroadcasti128 16(%rcx),%ymm15 - vbroadcasti128 (%r8),%ymm7 - leaq 256(%rsp),%rcx - leaq 512(%rsp),%rax - leaq .Lrot16(%rip),%r10 - leaq .Lrot24(%rip),%r11 - - vpshufd $0x00,%ymm11,%ymm8 - vpshufd $0x55,%ymm11,%ymm9 - vmovdqa %ymm8,128-256(%rcx) - vpshufd $0xaa,%ymm11,%ymm10 - vmovdqa %ymm9,160-256(%rcx) - vpshufd $0xff,%ymm11,%ymm11 - vmovdqa %ymm10,192-256(%rcx) - vmovdqa %ymm11,224-256(%rcx) - - vpshufd $0x00,%ymm3,%ymm0 - vpshufd $0x55,%ymm3,%ymm1 - vmovdqa %ymm0,256-256(%rcx) - vpshufd $0xaa,%ymm3,%ymm2 - vmovdqa %ymm1,288-256(%rcx) - vpshufd $0xff,%ymm3,%ymm3 - vmovdqa %ymm2,320-256(%rcx) - vmovdqa %ymm3,352-256(%rcx) - - vpshufd $0x00,%ymm15,%ymm12 - vpshufd $0x55,%ymm15,%ymm13 - vmovdqa %ymm12,384-512(%rax) - vpshufd $0xaa,%ymm15,%ymm14 - vmovdqa %ymm13,416-512(%rax) - vpshufd $0xff,%ymm15,%ymm15 - vmovdqa %ymm14,448-512(%rax) - vmovdqa %ymm15,480-512(%rax) - - vpshufd $0x00,%ymm7,%ymm4 - vpshufd $0x55,%ymm7,%ymm5 - vpaddd .Lincy(%rip),%ymm4,%ymm4 - vpshufd $0xaa,%ymm7,%ymm6 - vmovdqa %ymm5,544-512(%rax) - vpshufd $0xff,%ymm7,%ymm7 - vmovdqa %ymm6,576-512(%rax) - vmovdqa %ymm7,608-512(%rax) - - jmp .Loop_enter8x - -.align 32 -.Loop_outer8x: - vmovdqa 128-256(%rcx),%ymm8 - vmovdqa 160-256(%rcx),%ymm9 - vmovdqa 
192-256(%rcx),%ymm10 - vmovdqa 224-256(%rcx),%ymm11 - vmovdqa 256-256(%rcx),%ymm0 - vmovdqa 288-256(%rcx),%ymm1 - vmovdqa 320-256(%rcx),%ymm2 - vmovdqa 352-256(%rcx),%ymm3 - vmovdqa 384-512(%rax),%ymm12 - vmovdqa 416-512(%rax),%ymm13 - vmovdqa 448-512(%rax),%ymm14 - vmovdqa 480-512(%rax),%ymm15 - vmovdqa 512-512(%rax),%ymm4 - vmovdqa 544-512(%rax),%ymm5 - vmovdqa 576-512(%rax),%ymm6 - vmovdqa 608-512(%rax),%ymm7 - vpaddd .Leight(%rip),%ymm4,%ymm4 - -.Loop_enter8x: - vmovdqa %ymm14,64(%rsp) - vmovdqa %ymm15,96(%rsp) - vbroadcasti128 (%r10),%ymm15 - vmovdqa %ymm4,512-512(%rax) - movl $10,%eax - jmp .Loop8x - -.align 32 -.Loop8x: - vpaddd %ymm0,%ymm8,%ymm8 - vpxor %ymm4,%ymm8,%ymm4 - vpshufb %ymm15,%ymm4,%ymm4 - vpaddd %ymm1,%ymm9,%ymm9 - vpxor %ymm5,%ymm9,%ymm5 - vpshufb %ymm15,%ymm5,%ymm5 - vpaddd %ymm4,%ymm12,%ymm12 - vpxor %ymm0,%ymm12,%ymm0 - vpslld $12,%ymm0,%ymm14 - vpsrld $20,%ymm0,%ymm0 - vpor %ymm0,%ymm14,%ymm0 - vbroadcasti128 (%r11),%ymm14 - vpaddd %ymm5,%ymm13,%ymm13 - vpxor %ymm1,%ymm13,%ymm1 - vpslld $12,%ymm1,%ymm15 - vpsrld $20,%ymm1,%ymm1 - vpor %ymm1,%ymm15,%ymm1 - vpaddd %ymm0,%ymm8,%ymm8 - vpxor %ymm4,%ymm8,%ymm4 - vpshufb %ymm14,%ymm4,%ymm4 - vpaddd %ymm1,%ymm9,%ymm9 - vpxor %ymm5,%ymm9,%ymm5 - vpshufb %ymm14,%ymm5,%ymm5 - vpaddd %ymm4,%ymm12,%ymm12 - vpxor %ymm0,%ymm12,%ymm0 - vpslld $7,%ymm0,%ymm15 - vpsrld $25,%ymm0,%ymm0 - vpor %ymm0,%ymm15,%ymm0 - vbroadcasti128 (%r10),%ymm15 - vpaddd %ymm5,%ymm13,%ymm13 - vpxor %ymm1,%ymm13,%ymm1 - vpslld $7,%ymm1,%ymm14 - vpsrld $25,%ymm1,%ymm1 - vpor %ymm1,%ymm14,%ymm1 - vmovdqa %ymm12,0(%rsp) - vmovdqa %ymm13,32(%rsp) - vmovdqa 64(%rsp),%ymm12 - vmovdqa 96(%rsp),%ymm13 - vpaddd %ymm2,%ymm10,%ymm10 - vpxor %ymm6,%ymm10,%ymm6 - vpshufb %ymm15,%ymm6,%ymm6 - vpaddd %ymm3,%ymm11,%ymm11 - vpxor %ymm7,%ymm11,%ymm7 - vpshufb %ymm15,%ymm7,%ymm7 - vpaddd %ymm6,%ymm12,%ymm12 - vpxor %ymm2,%ymm12,%ymm2 - vpslld $12,%ymm2,%ymm14 - vpsrld $20,%ymm2,%ymm2 - vpor %ymm2,%ymm14,%ymm2 - vbroadcasti128 (%r11),%ymm14 - 
vpaddd %ymm7,%ymm13,%ymm13 - vpxor %ymm3,%ymm13,%ymm3 - vpslld $12,%ymm3,%ymm15 - vpsrld $20,%ymm3,%ymm3 - vpor %ymm3,%ymm15,%ymm3 - vpaddd %ymm2,%ymm10,%ymm10 - vpxor %ymm6,%ymm10,%ymm6 - vpshufb %ymm14,%ymm6,%ymm6 - vpaddd %ymm3,%ymm11,%ymm11 - vpxor %ymm7,%ymm11,%ymm7 - vpshufb %ymm14,%ymm7,%ymm7 - vpaddd %ymm6,%ymm12,%ymm12 - vpxor %ymm2,%ymm12,%ymm2 - vpslld $7,%ymm2,%ymm15 - vpsrld $25,%ymm2,%ymm2 - vpor %ymm2,%ymm15,%ymm2 - vbroadcasti128 (%r10),%ymm15 - vpaddd %ymm7,%ymm13,%ymm13 - vpxor %ymm3,%ymm13,%ymm3 - vpslld $7,%ymm3,%ymm14 - vpsrld $25,%ymm3,%ymm3 - vpor %ymm3,%ymm14,%ymm3 - vpaddd %ymm1,%ymm8,%ymm8 - vpxor %ymm7,%ymm8,%ymm7 - vpshufb %ymm15,%ymm7,%ymm7 - vpaddd %ymm2,%ymm9,%ymm9 - vpxor %ymm4,%ymm9,%ymm4 - vpshufb %ymm15,%ymm4,%ymm4 - vpaddd %ymm7,%ymm12,%ymm12 - vpxor %ymm1,%ymm12,%ymm1 - vpslld $12,%ymm1,%ymm14 - vpsrld $20,%ymm1,%ymm1 - vpor %ymm1,%ymm14,%ymm1 - vbroadcasti128 (%r11),%ymm14 - vpaddd %ymm4,%ymm13,%ymm13 - vpxor %ymm2,%ymm13,%ymm2 - vpslld $12,%ymm2,%ymm15 - vpsrld $20,%ymm2,%ymm2 - vpor %ymm2,%ymm15,%ymm2 - vpaddd %ymm1,%ymm8,%ymm8 - vpxor %ymm7,%ymm8,%ymm7 - vpshufb %ymm14,%ymm7,%ymm7 - vpaddd %ymm2,%ymm9,%ymm9 - vpxor %ymm4,%ymm9,%ymm4 - vpshufb %ymm14,%ymm4,%ymm4 - vpaddd %ymm7,%ymm12,%ymm12 - vpxor %ymm1,%ymm12,%ymm1 - vpslld $7,%ymm1,%ymm15 - vpsrld $25,%ymm1,%ymm1 - vpor %ymm1,%ymm15,%ymm1 - vbroadcasti128 (%r10),%ymm15 - vpaddd %ymm4,%ymm13,%ymm13 - vpxor %ymm2,%ymm13,%ymm2 - vpslld $7,%ymm2,%ymm14 - vpsrld $25,%ymm2,%ymm2 - vpor %ymm2,%ymm14,%ymm2 - vmovdqa %ymm12,64(%rsp) - vmovdqa %ymm13,96(%rsp) - vmovdqa 0(%rsp),%ymm12 - vmovdqa 32(%rsp),%ymm13 - vpaddd %ymm3,%ymm10,%ymm10 - vpxor %ymm5,%ymm10,%ymm5 - vpshufb %ymm15,%ymm5,%ymm5 - vpaddd %ymm0,%ymm11,%ymm11 - vpxor %ymm6,%ymm11,%ymm6 - vpshufb %ymm15,%ymm6,%ymm6 - vpaddd %ymm5,%ymm12,%ymm12 - vpxor %ymm3,%ymm12,%ymm3 - vpslld $12,%ymm3,%ymm14 - vpsrld $20,%ymm3,%ymm3 - vpor %ymm3,%ymm14,%ymm3 - vbroadcasti128 (%r11),%ymm14 - vpaddd %ymm6,%ymm13,%ymm13 - vpxor 
%ymm0,%ymm13,%ymm0 - vpslld $12,%ymm0,%ymm15 - vpsrld $20,%ymm0,%ymm0 - vpor %ymm0,%ymm15,%ymm0 - vpaddd %ymm3,%ymm10,%ymm10 - vpxor %ymm5,%ymm10,%ymm5 - vpshufb %ymm14,%ymm5,%ymm5 - vpaddd %ymm0,%ymm11,%ymm11 - vpxor %ymm6,%ymm11,%ymm6 - vpshufb %ymm14,%ymm6,%ymm6 - vpaddd %ymm5,%ymm12,%ymm12 - vpxor %ymm3,%ymm12,%ymm3 - vpslld $7,%ymm3,%ymm15 - vpsrld $25,%ymm3,%ymm3 - vpor %ymm3,%ymm15,%ymm3 - vbroadcasti128 (%r10),%ymm15 - vpaddd %ymm6,%ymm13,%ymm13 - vpxor %ymm0,%ymm13,%ymm0 - vpslld $7,%ymm0,%ymm14 - vpsrld $25,%ymm0,%ymm0 - vpor %ymm0,%ymm14,%ymm0 - decl %eax - jnz .Loop8x - - leaq 512(%rsp),%rax - vpaddd 128-256(%rcx),%ymm8,%ymm8 - vpaddd 160-256(%rcx),%ymm9,%ymm9 - vpaddd 192-256(%rcx),%ymm10,%ymm10 - vpaddd 224-256(%rcx),%ymm11,%ymm11 - - vpunpckldq %ymm9,%ymm8,%ymm14 - vpunpckldq %ymm11,%ymm10,%ymm15 - vpunpckhdq %ymm9,%ymm8,%ymm8 - vpunpckhdq %ymm11,%ymm10,%ymm10 - vpunpcklqdq %ymm15,%ymm14,%ymm9 - vpunpckhqdq %ymm15,%ymm14,%ymm14 - vpunpcklqdq %ymm10,%ymm8,%ymm11 - vpunpckhqdq %ymm10,%ymm8,%ymm8 - vpaddd 256-256(%rcx),%ymm0,%ymm0 - vpaddd 288-256(%rcx),%ymm1,%ymm1 - vpaddd 320-256(%rcx),%ymm2,%ymm2 - vpaddd 352-256(%rcx),%ymm3,%ymm3 - - vpunpckldq %ymm1,%ymm0,%ymm10 - vpunpckldq %ymm3,%ymm2,%ymm15 - vpunpckhdq %ymm1,%ymm0,%ymm0 - vpunpckhdq %ymm3,%ymm2,%ymm2 - vpunpcklqdq %ymm15,%ymm10,%ymm1 - vpunpckhqdq %ymm15,%ymm10,%ymm10 - vpunpcklqdq %ymm2,%ymm0,%ymm3 - vpunpckhqdq %ymm2,%ymm0,%ymm0 - vperm2i128 $0x20,%ymm1,%ymm9,%ymm15 - vperm2i128 $0x31,%ymm1,%ymm9,%ymm1 - vperm2i128 $0x20,%ymm10,%ymm14,%ymm9 - vperm2i128 $0x31,%ymm10,%ymm14,%ymm10 - vperm2i128 $0x20,%ymm3,%ymm11,%ymm14 - vperm2i128 $0x31,%ymm3,%ymm11,%ymm3 - vperm2i128 $0x20,%ymm0,%ymm8,%ymm11 - vperm2i128 $0x31,%ymm0,%ymm8,%ymm0 - vmovdqa %ymm15,0(%rsp) - vmovdqa %ymm9,32(%rsp) - vmovdqa 64(%rsp),%ymm15 - vmovdqa 96(%rsp),%ymm9 - - vpaddd 384-512(%rax),%ymm12,%ymm12 - vpaddd 416-512(%rax),%ymm13,%ymm13 - vpaddd 448-512(%rax),%ymm15,%ymm15 - vpaddd 480-512(%rax),%ymm9,%ymm9 - - vpunpckldq 
%ymm13,%ymm12,%ymm2 - vpunpckldq %ymm9,%ymm15,%ymm8 - vpunpckhdq %ymm13,%ymm12,%ymm12 - vpunpckhdq %ymm9,%ymm15,%ymm15 - vpunpcklqdq %ymm8,%ymm2,%ymm13 - vpunpckhqdq %ymm8,%ymm2,%ymm2 - vpunpcklqdq %ymm15,%ymm12,%ymm9 - vpunpckhqdq %ymm15,%ymm12,%ymm12 - vpaddd 512-512(%rax),%ymm4,%ymm4 - vpaddd 544-512(%rax),%ymm5,%ymm5 - vpaddd 576-512(%rax),%ymm6,%ymm6 - vpaddd 608-512(%rax),%ymm7,%ymm7 - - vpunpckldq %ymm5,%ymm4,%ymm15 - vpunpckldq %ymm7,%ymm6,%ymm8 - vpunpckhdq %ymm5,%ymm4,%ymm4 - vpunpckhdq %ymm7,%ymm6,%ymm6 - vpunpcklqdq %ymm8,%ymm15,%ymm5 - vpunpckhqdq %ymm8,%ymm15,%ymm15 - vpunpcklqdq %ymm6,%ymm4,%ymm7 - vpunpckhqdq %ymm6,%ymm4,%ymm4 - vperm2i128 $0x20,%ymm5,%ymm13,%ymm8 - vperm2i128 $0x31,%ymm5,%ymm13,%ymm5 - vperm2i128 $0x20,%ymm15,%ymm2,%ymm13 - vperm2i128 $0x31,%ymm15,%ymm2,%ymm15 - vperm2i128 $0x20,%ymm7,%ymm9,%ymm2 - vperm2i128 $0x31,%ymm7,%ymm9,%ymm7 - vperm2i128 $0x20,%ymm4,%ymm12,%ymm9 - vperm2i128 $0x31,%ymm4,%ymm12,%ymm4 - vmovdqa 0(%rsp),%ymm6 - vmovdqa 32(%rsp),%ymm12 - - cmpq $512,%rdx - jb .Ltail8x - - vpxor 0(%rsi),%ymm6,%ymm6 - vpxor 32(%rsi),%ymm8,%ymm8 - vpxor 64(%rsi),%ymm1,%ymm1 - vpxor 96(%rsi),%ymm5,%ymm5 - leaq 128(%rsi),%rsi - vmovdqu %ymm6,0(%rdi) - vmovdqu %ymm8,32(%rdi) - vmovdqu %ymm1,64(%rdi) - vmovdqu %ymm5,96(%rdi) - leaq 128(%rdi),%rdi - - vpxor 0(%rsi),%ymm12,%ymm12 - vpxor 32(%rsi),%ymm13,%ymm13 - vpxor 64(%rsi),%ymm10,%ymm10 - vpxor 96(%rsi),%ymm15,%ymm15 - leaq 128(%rsi),%rsi - vmovdqu %ymm12,0(%rdi) - vmovdqu %ymm13,32(%rdi) - vmovdqu %ymm10,64(%rdi) - vmovdqu %ymm15,96(%rdi) - leaq 128(%rdi),%rdi - - vpxor 0(%rsi),%ymm14,%ymm14 - vpxor 32(%rsi),%ymm2,%ymm2 - vpxor 64(%rsi),%ymm3,%ymm3 - vpxor 96(%rsi),%ymm7,%ymm7 - leaq 128(%rsi),%rsi - vmovdqu %ymm14,0(%rdi) - vmovdqu %ymm2,32(%rdi) - vmovdqu %ymm3,64(%rdi) - vmovdqu %ymm7,96(%rdi) - leaq 128(%rdi),%rdi - - vpxor 0(%rsi),%ymm11,%ymm11 - vpxor 32(%rsi),%ymm9,%ymm9 - vpxor 64(%rsi),%ymm0,%ymm0 - vpxor 96(%rsi),%ymm4,%ymm4 - leaq 128(%rsi),%rsi - vmovdqu %ymm11,0(%rdi) 
- vmovdqu %ymm9,32(%rdi) - vmovdqu %ymm0,64(%rdi) - vmovdqu %ymm4,96(%rdi) - leaq 128(%rdi),%rdi - - subq $512,%rdx - jnz .Loop_outer8x - - jmp .Ldone8x - -.Ltail8x: - cmpq $448,%rdx - jae .L448_or_more8x - cmpq $384,%rdx - jae .L384_or_more8x - cmpq $320,%rdx - jae .L320_or_more8x - cmpq $256,%rdx - jae .L256_or_more8x - cmpq $192,%rdx - jae .L192_or_more8x - cmpq $128,%rdx - jae .L128_or_more8x - cmpq $64,%rdx - jae .L64_or_more8x - - xorq %r10,%r10 - vmovdqa %ymm6,0(%rsp) - vmovdqa %ymm8,32(%rsp) - jmp .Loop_tail8x - -.align 32 -.L64_or_more8x: - vpxor 0(%rsi),%ymm6,%ymm6 - vpxor 32(%rsi),%ymm8,%ymm8 - vmovdqu %ymm6,0(%rdi) - vmovdqu %ymm8,32(%rdi) - je .Ldone8x - - leaq 64(%rsi),%rsi - xorq %r10,%r10 - vmovdqa %ymm1,0(%rsp) - leaq 64(%rdi),%rdi - subq $64,%rdx - vmovdqa %ymm5,32(%rsp) - jmp .Loop_tail8x - -.align 32 -.L128_or_more8x: - vpxor 0(%rsi),%ymm6,%ymm6 - vpxor 32(%rsi),%ymm8,%ymm8 - vpxor 64(%rsi),%ymm1,%ymm1 - vpxor 96(%rsi),%ymm5,%ymm5 - vmovdqu %ymm6,0(%rdi) - vmovdqu %ymm8,32(%rdi) - vmovdqu %ymm1,64(%rdi) - vmovdqu %ymm5,96(%rdi) - je .Ldone8x - - leaq 128(%rsi),%rsi - xorq %r10,%r10 - vmovdqa %ymm12,0(%rsp) - leaq 128(%rdi),%rdi - subq $128,%rdx - vmovdqa %ymm13,32(%rsp) - jmp .Loop_tail8x - -.align 32 -.L192_or_more8x: - vpxor 0(%rsi),%ymm6,%ymm6 - vpxor 32(%rsi),%ymm8,%ymm8 - vpxor 64(%rsi),%ymm1,%ymm1 - vpxor 96(%rsi),%ymm5,%ymm5 - vpxor 128(%rsi),%ymm12,%ymm12 - vpxor 160(%rsi),%ymm13,%ymm13 - vmovdqu %ymm6,0(%rdi) - vmovdqu %ymm8,32(%rdi) - vmovdqu %ymm1,64(%rdi) - vmovdqu %ymm5,96(%rdi) - vmovdqu %ymm12,128(%rdi) - vmovdqu %ymm13,160(%rdi) - je .Ldone8x - - leaq 192(%rsi),%rsi - xorq %r10,%r10 - vmovdqa %ymm10,0(%rsp) - leaq 192(%rdi),%rdi - subq $192,%rdx - vmovdqa %ymm15,32(%rsp) - jmp .Loop_tail8x - -.align 32 -.L256_or_more8x: - vpxor 0(%rsi),%ymm6,%ymm6 - vpxor 32(%rsi),%ymm8,%ymm8 - vpxor 64(%rsi),%ymm1,%ymm1 - vpxor 96(%rsi),%ymm5,%ymm5 - vpxor 128(%rsi),%ymm12,%ymm12 - vpxor 160(%rsi),%ymm13,%ymm13 - vpxor 192(%rsi),%ymm10,%ymm10 - 
vpxor 224(%rsi),%ymm15,%ymm15 - vmovdqu %ymm6,0(%rdi) - vmovdqu %ymm8,32(%rdi) - vmovdqu %ymm1,64(%rdi) - vmovdqu %ymm5,96(%rdi) - vmovdqu %ymm12,128(%rdi) - vmovdqu %ymm13,160(%rdi) - vmovdqu %ymm10,192(%rdi) - vmovdqu %ymm15,224(%rdi) - je .Ldone8x - - leaq 256(%rsi),%rsi - xorq %r10,%r10 - vmovdqa %ymm14,0(%rsp) - leaq 256(%rdi),%rdi - subq $256,%rdx - vmovdqa %ymm2,32(%rsp) - jmp .Loop_tail8x - -.align 32 -.L320_or_more8x: - vpxor 0(%rsi),%ymm6,%ymm6 - vpxor 32(%rsi),%ymm8,%ymm8 - vpxor 64(%rsi),%ymm1,%ymm1 - vpxor 96(%rsi),%ymm5,%ymm5 - vpxor 128(%rsi),%ymm12,%ymm12 - vpxor 160(%rsi),%ymm13,%ymm13 - vpxor 192(%rsi),%ymm10,%ymm10 - vpxor 224(%rsi),%ymm15,%ymm15 - vpxor 256(%rsi),%ymm14,%ymm14 - vpxor 288(%rsi),%ymm2,%ymm2 - vmovdqu %ymm6,0(%rdi) - vmovdqu %ymm8,32(%rdi) - vmovdqu %ymm1,64(%rdi) - vmovdqu %ymm5,96(%rdi) - vmovdqu %ymm12,128(%rdi) - vmovdqu %ymm13,160(%rdi) - vmovdqu %ymm10,192(%rdi) - vmovdqu %ymm15,224(%rdi) - vmovdqu %ymm14,256(%rdi) - vmovdqu %ymm2,288(%rdi) - je .Ldone8x - - leaq 320(%rsi),%rsi - xorq %r10,%r10 - vmovdqa %ymm3,0(%rsp) - leaq 320(%rdi),%rdi - subq $320,%rdx - vmovdqa %ymm7,32(%rsp) - jmp .Loop_tail8x - -.align 32 -.L384_or_more8x: - vpxor 0(%rsi),%ymm6,%ymm6 - vpxor 32(%rsi),%ymm8,%ymm8 - vpxor 64(%rsi),%ymm1,%ymm1 - vpxor 96(%rsi),%ymm5,%ymm5 - vpxor 128(%rsi),%ymm12,%ymm12 - vpxor 160(%rsi),%ymm13,%ymm13 - vpxor 192(%rsi),%ymm10,%ymm10 - vpxor 224(%rsi),%ymm15,%ymm15 - vpxor 256(%rsi),%ymm14,%ymm14 - vpxor 288(%rsi),%ymm2,%ymm2 - vpxor 320(%rsi),%ymm3,%ymm3 - vpxor 352(%rsi),%ymm7,%ymm7 - vmovdqu %ymm6,0(%rdi) - vmovdqu %ymm8,32(%rdi) - vmovdqu %ymm1,64(%rdi) - vmovdqu %ymm5,96(%rdi) - vmovdqu %ymm12,128(%rdi) - vmovdqu %ymm13,160(%rdi) - vmovdqu %ymm10,192(%rdi) - vmovdqu %ymm15,224(%rdi) - vmovdqu %ymm14,256(%rdi) - vmovdqu %ymm2,288(%rdi) - vmovdqu %ymm3,320(%rdi) - vmovdqu %ymm7,352(%rdi) - je .Ldone8x - - leaq 384(%rsi),%rsi - xorq %r10,%r10 - vmovdqa %ymm11,0(%rsp) - leaq 384(%rdi),%rdi - subq $384,%rdx - vmovdqa 
%ymm9,32(%rsp) - jmp .Loop_tail8x - -.align 32 -.L448_or_more8x: - vpxor 0(%rsi),%ymm6,%ymm6 - vpxor 32(%rsi),%ymm8,%ymm8 - vpxor 64(%rsi),%ymm1,%ymm1 - vpxor 96(%rsi),%ymm5,%ymm5 - vpxor 128(%rsi),%ymm12,%ymm12 - vpxor 160(%rsi),%ymm13,%ymm13 - vpxor 192(%rsi),%ymm10,%ymm10 - vpxor 224(%rsi),%ymm15,%ymm15 - vpxor 256(%rsi),%ymm14,%ymm14 - vpxor 288(%rsi),%ymm2,%ymm2 - vpxor 320(%rsi),%ymm3,%ymm3 - vpxor 352(%rsi),%ymm7,%ymm7 - vpxor 384(%rsi),%ymm11,%ymm11 - vpxor 416(%rsi),%ymm9,%ymm9 - vmovdqu %ymm6,0(%rdi) - vmovdqu %ymm8,32(%rdi) - vmovdqu %ymm1,64(%rdi) - vmovdqu %ymm5,96(%rdi) - vmovdqu %ymm12,128(%rdi) - vmovdqu %ymm13,160(%rdi) - vmovdqu %ymm10,192(%rdi) - vmovdqu %ymm15,224(%rdi) - vmovdqu %ymm14,256(%rdi) - vmovdqu %ymm2,288(%rdi) - vmovdqu %ymm3,320(%rdi) - vmovdqu %ymm7,352(%rdi) - vmovdqu %ymm11,384(%rdi) - vmovdqu %ymm9,416(%rdi) - je .Ldone8x - - leaq 448(%rsi),%rsi - xorq %r10,%r10 - vmovdqa %ymm0,0(%rsp) - leaq 448(%rdi),%rdi - subq $448,%rdx - vmovdqa %ymm4,32(%rsp) - -.Loop_tail8x: - movzbl (%rsi,%r10,1),%eax - movzbl (%rsp,%r10,1),%ecx - leaq 1(%r10),%r10 - xorl %ecx,%eax - movb %al,-1(%rdi,%r10,1) - decq %rdx - jnz .Loop_tail8x - -.Ldone8x: - vzeroall - leaq (%r9),%rsp -.cfi_def_cfa_register rsp -.L8x_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size ChaCha20_8x,.-ChaCha20_8x -#endif -.section .note.GNU-stack,"",@progbits diff --git a/third_party/boringssl/linux-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S b/third_party/boringssl/linux-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S deleted file mode 100644 index 2e41e91f..00000000 --- a/third_party/boringssl/linux-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S +++ /dev/null @@ -1,3079 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.data - -.align 16 -one: -.quad 1,0 -two: -.quad 2,0 -three: -.quad 3,0 -four: -.quad 4,0 -five: -.quad 5,0 -six: -.quad 6,0 -seven: -.quad 7,0 -eight: -.quad 8,0 - -OR_MASK: -.long 0x00000000,0x00000000,0x00000000,0x80000000 -poly: -.quad 0x1, 0xc200000000000000 -mask: -.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d -con1: -.long 1,1,1,1 -con2: -.long 0x1b,0x1b,0x1b,0x1b -con3: -.byte -1,-1,-1,-1,-1,-1,-1,-1,4,5,6,7,4,5,6,7 -and_mask: -.long 0,0xffffffff, 0xffffffff, 0xffffffff -.text -.type GFMUL,@function -.align 16 -GFMUL: -.cfi_startproc - vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2 - vpclmulqdq $0x11,%xmm1,%xmm0,%xmm5 - vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3 - vpclmulqdq $0x01,%xmm1,%xmm0,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpslldq $8,%xmm3,%xmm4 - vpsrldq $8,%xmm3,%xmm3 - vpxor %xmm4,%xmm2,%xmm2 - vpxor %xmm3,%xmm5,%xmm5 - - vpclmulqdq $0x10,poly(%rip),%xmm2,%xmm3 - vpshufd $78,%xmm2,%xmm4 - vpxor %xmm4,%xmm3,%xmm2 - - vpclmulqdq $0x10,poly(%rip),%xmm2,%xmm3 - vpshufd $78,%xmm2,%xmm4 - vpxor %xmm4,%xmm3,%xmm2 - - vpxor %xmm5,%xmm2,%xmm0 - .byte 0xf3,0xc3 -.cfi_endproc -.size GFMUL, .-GFMUL -.globl aesgcmsiv_htable_init -.hidden aesgcmsiv_htable_init -.type aesgcmsiv_htable_init,@function -.align 16 -aesgcmsiv_htable_init: -.cfi_startproc - vmovdqa (%rsi),%xmm0 - vmovdqa %xmm0,%xmm1 - vmovdqa %xmm0,(%rdi) - call GFMUL - vmovdqa %xmm0,16(%rdi) - call GFMUL - vmovdqa %xmm0,32(%rdi) - call GFMUL - vmovdqa %xmm0,48(%rdi) - call GFMUL - vmovdqa %xmm0,64(%rdi) - call GFMUL - vmovdqa %xmm0,80(%rdi) - call GFMUL - vmovdqa %xmm0,96(%rdi) - call GFMUL - vmovdqa %xmm0,112(%rdi) - .byte 0xf3,0xc3 -.cfi_endproc -.size aesgcmsiv_htable_init, .-aesgcmsiv_htable_init -.globl aesgcmsiv_htable6_init -.hidden aesgcmsiv_htable6_init -.type 
aesgcmsiv_htable6_init,@function -.align 16 -aesgcmsiv_htable6_init: -.cfi_startproc - vmovdqa (%rsi),%xmm0 - vmovdqa %xmm0,%xmm1 - vmovdqa %xmm0,(%rdi) - call GFMUL - vmovdqa %xmm0,16(%rdi) - call GFMUL - vmovdqa %xmm0,32(%rdi) - call GFMUL - vmovdqa %xmm0,48(%rdi) - call GFMUL - vmovdqa %xmm0,64(%rdi) - call GFMUL - vmovdqa %xmm0,80(%rdi) - .byte 0xf3,0xc3 -.cfi_endproc -.size aesgcmsiv_htable6_init, .-aesgcmsiv_htable6_init -.globl aesgcmsiv_htable_polyval -.hidden aesgcmsiv_htable_polyval -.type aesgcmsiv_htable_polyval,@function -.align 16 -aesgcmsiv_htable_polyval: -.cfi_startproc - testq %rdx,%rdx - jnz .Lhtable_polyval_start - .byte 0xf3,0xc3 - -.Lhtable_polyval_start: - vzeroall - - - - movq %rdx,%r11 - andq $127,%r11 - - jz .Lhtable_polyval_no_prefix - - vpxor %xmm9,%xmm9,%xmm9 - vmovdqa (%rcx),%xmm1 - subq %r11,%rdx - - subq $16,%r11 - - - vmovdqu (%rsi),%xmm0 - vpxor %xmm1,%xmm0,%xmm0 - - vpclmulqdq $0x01,(%rdi,%r11,1),%xmm0,%xmm5 - vpclmulqdq $0x00,(%rdi,%r11,1),%xmm0,%xmm3 - vpclmulqdq $0x11,(%rdi,%r11,1),%xmm0,%xmm4 - vpclmulqdq $0x10,(%rdi,%r11,1),%xmm0,%xmm6 - vpxor %xmm6,%xmm5,%xmm5 - - leaq 16(%rsi),%rsi - testq %r11,%r11 - jnz .Lhtable_polyval_prefix_loop - jmp .Lhtable_polyval_prefix_complete - - -.align 64 -.Lhtable_polyval_prefix_loop: - subq $16,%r11 - - vmovdqu (%rsi),%xmm0 - - vpclmulqdq $0x00,(%rdi,%r11,1),%xmm0,%xmm6 - vpxor %xmm6,%xmm3,%xmm3 - vpclmulqdq $0x11,(%rdi,%r11,1),%xmm0,%xmm6 - vpxor %xmm6,%xmm4,%xmm4 - vpclmulqdq $0x01,(%rdi,%r11,1),%xmm0,%xmm6 - vpxor %xmm6,%xmm5,%xmm5 - vpclmulqdq $0x10,(%rdi,%r11,1),%xmm0,%xmm6 - vpxor %xmm6,%xmm5,%xmm5 - - testq %r11,%r11 - - leaq 16(%rsi),%rsi - - jnz .Lhtable_polyval_prefix_loop - -.Lhtable_polyval_prefix_complete: - vpsrldq $8,%xmm5,%xmm6 - vpslldq $8,%xmm5,%xmm5 - - vpxor %xmm6,%xmm4,%xmm9 - vpxor %xmm5,%xmm3,%xmm1 - - jmp .Lhtable_polyval_main_loop - -.Lhtable_polyval_no_prefix: - - - - - vpxor %xmm1,%xmm1,%xmm1 - vmovdqa (%rcx),%xmm9 - -.align 64 -.Lhtable_polyval_main_loop: - subq 
$0x80,%rdx - jb .Lhtable_polyval_out - - vmovdqu 112(%rsi),%xmm0 - - vpclmulqdq $0x01,(%rdi),%xmm0,%xmm5 - vpclmulqdq $0x00,(%rdi),%xmm0,%xmm3 - vpclmulqdq $0x11,(%rdi),%xmm0,%xmm4 - vpclmulqdq $0x10,(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm5,%xmm5 - - - vmovdqu 96(%rsi),%xmm0 - vpclmulqdq $0x01,16(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm5,%xmm5 - vpclmulqdq $0x00,16(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm3,%xmm3 - vpclmulqdq $0x11,16(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm4,%xmm4 - vpclmulqdq $0x10,16(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm5,%xmm5 - - - - vmovdqu 80(%rsi),%xmm0 - - vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm7 - vpalignr $8,%xmm1,%xmm1,%xmm1 - - vpclmulqdq $0x01,32(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm5,%xmm5 - vpclmulqdq $0x00,32(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm3,%xmm3 - vpclmulqdq $0x11,32(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm4,%xmm4 - vpclmulqdq $0x10,32(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm5,%xmm5 - - - vpxor %xmm7,%xmm1,%xmm1 - - vmovdqu 64(%rsi),%xmm0 - - vpclmulqdq $0x01,48(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm5,%xmm5 - vpclmulqdq $0x00,48(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm3,%xmm3 - vpclmulqdq $0x11,48(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm4,%xmm4 - vpclmulqdq $0x10,48(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm5,%xmm5 - - - vmovdqu 48(%rsi),%xmm0 - - vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm7 - vpalignr $8,%xmm1,%xmm1,%xmm1 - - vpclmulqdq $0x01,64(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm5,%xmm5 - vpclmulqdq $0x00,64(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm3,%xmm3 - vpclmulqdq $0x11,64(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm4,%xmm4 - vpclmulqdq $0x10,64(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm5,%xmm5 - - - vpxor %xmm7,%xmm1,%xmm1 - - vmovdqu 32(%rsi),%xmm0 - - vpclmulqdq $0x01,80(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm5,%xmm5 - vpclmulqdq $0x00,80(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm3,%xmm3 - vpclmulqdq $0x11,80(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm4,%xmm4 - vpclmulqdq $0x10,80(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm5,%xmm5 - - - vpxor %xmm9,%xmm1,%xmm1 - - vmovdqu 16(%rsi),%xmm0 - - vpclmulqdq 
$0x01,96(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm5,%xmm5 - vpclmulqdq $0x00,96(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm3,%xmm3 - vpclmulqdq $0x11,96(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm4,%xmm4 - vpclmulqdq $0x10,96(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm5,%xmm5 - - - vmovdqu 0(%rsi),%xmm0 - vpxor %xmm1,%xmm0,%xmm0 - - vpclmulqdq $0x01,112(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm5,%xmm5 - vpclmulqdq $0x00,112(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm3,%xmm3 - vpclmulqdq $0x11,112(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm4,%xmm4 - vpclmulqdq $0x10,112(%rdi),%xmm0,%xmm6 - vpxor %xmm6,%xmm5,%xmm5 - - - vpsrldq $8,%xmm5,%xmm6 - vpslldq $8,%xmm5,%xmm5 - - vpxor %xmm6,%xmm4,%xmm9 - vpxor %xmm5,%xmm3,%xmm1 - - leaq 128(%rsi),%rsi - jmp .Lhtable_polyval_main_loop - - - -.Lhtable_polyval_out: - vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm6 - vpalignr $8,%xmm1,%xmm1,%xmm1 - vpxor %xmm6,%xmm1,%xmm1 - - vpclmulqdq $0x10,poly(%rip),%xmm1,%xmm6 - vpalignr $8,%xmm1,%xmm1,%xmm1 - vpxor %xmm6,%xmm1,%xmm1 - vpxor %xmm9,%xmm1,%xmm1 - - vmovdqu %xmm1,(%rcx) - vzeroupper - .byte 0xf3,0xc3 -.cfi_endproc -.size aesgcmsiv_htable_polyval,.-aesgcmsiv_htable_polyval -.globl aesgcmsiv_polyval_horner -.hidden aesgcmsiv_polyval_horner -.type aesgcmsiv_polyval_horner,@function -.align 16 -aesgcmsiv_polyval_horner: -.cfi_startproc - testq %rcx,%rcx - jnz .Lpolyval_horner_start - .byte 0xf3,0xc3 - -.Lpolyval_horner_start: - - - - xorq %r10,%r10 - shlq $4,%rcx - - vmovdqa (%rsi),%xmm1 - vmovdqa (%rdi),%xmm0 - -.Lpolyval_horner_loop: - vpxor (%rdx,%r10,1),%xmm0,%xmm0 - call GFMUL - - addq $16,%r10 - cmpq %r10,%rcx - jne .Lpolyval_horner_loop - - - vmovdqa %xmm0,(%rdi) - .byte 0xf3,0xc3 -.cfi_endproc -.size aesgcmsiv_polyval_horner,.-aesgcmsiv_polyval_horner -.globl aes128gcmsiv_aes_ks -.hidden aes128gcmsiv_aes_ks -.type aes128gcmsiv_aes_ks,@function -.align 16 -aes128gcmsiv_aes_ks: -.cfi_startproc - vmovdqu (%rdi),%xmm1 - vmovdqa %xmm1,(%rsi) - - vmovdqa con1(%rip),%xmm0 - vmovdqa mask(%rip),%xmm15 - - movq $8,%rax - 
-.Lks128_loop: - addq $16,%rsi - subq $1,%rax - vpshufb %xmm15,%xmm1,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - vpslld $1,%xmm0,%xmm0 - vpslldq $4,%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpslldq $4,%xmm3,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpslldq $4,%xmm3,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - vmovdqa %xmm1,(%rsi) - jne .Lks128_loop - - vmovdqa con2(%rip),%xmm0 - vpshufb %xmm15,%xmm1,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - vpslld $1,%xmm0,%xmm0 - vpslldq $4,%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpslldq $4,%xmm3,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpslldq $4,%xmm3,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - vmovdqa %xmm1,16(%rsi) - - vpshufb %xmm15,%xmm1,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - vpslldq $4,%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpslldq $4,%xmm3,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpslldq $4,%xmm3,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - vmovdqa %xmm1,32(%rsi) - .byte 0xf3,0xc3 -.cfi_endproc -.size aes128gcmsiv_aes_ks,.-aes128gcmsiv_aes_ks -.globl aes256gcmsiv_aes_ks -.hidden aes256gcmsiv_aes_ks -.type aes256gcmsiv_aes_ks,@function -.align 16 -aes256gcmsiv_aes_ks: -.cfi_startproc - vmovdqu (%rdi),%xmm1 - vmovdqu 16(%rdi),%xmm3 - vmovdqa %xmm1,(%rsi) - vmovdqa %xmm3,16(%rsi) - vmovdqa con1(%rip),%xmm0 - vmovdqa mask(%rip),%xmm15 - vpxor %xmm14,%xmm14,%xmm14 - movq $6,%rax - -.Lks256_loop: - addq $32,%rsi - subq $1,%rax - vpshufb %xmm15,%xmm3,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - vpslld $1,%xmm0,%xmm0 - vpsllq $32,%xmm1,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpshufb con3(%rip),%xmm1,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - vmovdqa %xmm1,(%rsi) - vpshufd $0xff,%xmm1,%xmm2 - vaesenclast %xmm14,%xmm2,%xmm2 - vpsllq $32,%xmm3,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpshufb con3(%rip),%xmm3,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpxor %xmm2,%xmm3,%xmm3 - vmovdqa %xmm3,16(%rsi) - jne .Lks256_loop - - vpshufb %xmm15,%xmm3,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - vpsllq $32,%xmm1,%xmm4 - vpxor 
%xmm4,%xmm1,%xmm1 - vpshufb con3(%rip),%xmm1,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - vmovdqa %xmm1,32(%rsi) - .byte 0xf3,0xc3 -.cfi_endproc -.globl aes128gcmsiv_aes_ks_enc_x1 -.hidden aes128gcmsiv_aes_ks_enc_x1 -.type aes128gcmsiv_aes_ks_enc_x1,@function -.align 16 -aes128gcmsiv_aes_ks_enc_x1: -.cfi_startproc - vmovdqa (%rcx),%xmm1 - vmovdqa 0(%rdi),%xmm4 - - vmovdqa %xmm1,(%rdx) - vpxor %xmm1,%xmm4,%xmm4 - - vmovdqa con1(%rip),%xmm0 - vmovdqa mask(%rip),%xmm15 - - vpshufb %xmm15,%xmm1,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - vpslld $1,%xmm0,%xmm0 - vpsllq $32,%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpshufb con3(%rip),%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - - vaesenc %xmm1,%xmm4,%xmm4 - vmovdqa %xmm1,16(%rdx) - - vpshufb %xmm15,%xmm1,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - vpslld $1,%xmm0,%xmm0 - vpsllq $32,%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpshufb con3(%rip),%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - - vaesenc %xmm1,%xmm4,%xmm4 - vmovdqa %xmm1,32(%rdx) - - vpshufb %xmm15,%xmm1,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - vpslld $1,%xmm0,%xmm0 - vpsllq $32,%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpshufb con3(%rip),%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - - vaesenc %xmm1,%xmm4,%xmm4 - vmovdqa %xmm1,48(%rdx) - - vpshufb %xmm15,%xmm1,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - vpslld $1,%xmm0,%xmm0 - vpsllq $32,%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpshufb con3(%rip),%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - - vaesenc %xmm1,%xmm4,%xmm4 - vmovdqa %xmm1,64(%rdx) - - vpshufb %xmm15,%xmm1,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - vpslld $1,%xmm0,%xmm0 - vpsllq $32,%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpshufb con3(%rip),%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - - vaesenc %xmm1,%xmm4,%xmm4 - vmovdqa %xmm1,80(%rdx) - - vpshufb %xmm15,%xmm1,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - vpslld $1,%xmm0,%xmm0 - vpsllq $32,%xmm1,%xmm3 - vpxor 
%xmm3,%xmm1,%xmm1 - vpshufb con3(%rip),%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - - vaesenc %xmm1,%xmm4,%xmm4 - vmovdqa %xmm1,96(%rdx) - - vpshufb %xmm15,%xmm1,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - vpslld $1,%xmm0,%xmm0 - vpsllq $32,%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpshufb con3(%rip),%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - - vaesenc %xmm1,%xmm4,%xmm4 - vmovdqa %xmm1,112(%rdx) - - vpshufb %xmm15,%xmm1,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - vpslld $1,%xmm0,%xmm0 - vpsllq $32,%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpshufb con3(%rip),%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - - vaesenc %xmm1,%xmm4,%xmm4 - vmovdqa %xmm1,128(%rdx) - - - vmovdqa con2(%rip),%xmm0 - - vpshufb %xmm15,%xmm1,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - vpslld $1,%xmm0,%xmm0 - vpsllq $32,%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpshufb con3(%rip),%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - - vaesenc %xmm1,%xmm4,%xmm4 - vmovdqa %xmm1,144(%rdx) - - vpshufb %xmm15,%xmm1,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - vpsllq $32,%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpshufb con3(%rip),%xmm1,%xmm3 - vpxor %xmm3,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - - vaesenclast %xmm1,%xmm4,%xmm4 - vmovdqa %xmm1,160(%rdx) - - - vmovdqa %xmm4,0(%rsi) - .byte 0xf3,0xc3 -.cfi_endproc -.size aes128gcmsiv_aes_ks_enc_x1,.-aes128gcmsiv_aes_ks_enc_x1 -.globl aes128gcmsiv_kdf -.hidden aes128gcmsiv_kdf -.type aes128gcmsiv_kdf,@function -.align 16 -aes128gcmsiv_kdf: -.cfi_startproc - - - - - vmovdqa (%rdx),%xmm1 - vmovdqa 0(%rdi),%xmm9 - vmovdqa and_mask(%rip),%xmm12 - vmovdqa one(%rip),%xmm13 - vpshufd $0x90,%xmm9,%xmm9 - vpand %xmm12,%xmm9,%xmm9 - vpaddd %xmm13,%xmm9,%xmm10 - vpaddd %xmm13,%xmm10,%xmm11 - vpaddd %xmm13,%xmm11,%xmm12 - - vpxor %xmm1,%xmm9,%xmm9 - vpxor %xmm1,%xmm10,%xmm10 - vpxor %xmm1,%xmm11,%xmm11 - vpxor %xmm1,%xmm12,%xmm12 - - vmovdqa 16(%rdx),%xmm1 - vaesenc %xmm1,%xmm9,%xmm9 - vaesenc %xmm1,%xmm10,%xmm10 - 
vaesenc %xmm1,%xmm11,%xmm11 - vaesenc %xmm1,%xmm12,%xmm12 - - vmovdqa 32(%rdx),%xmm2 - vaesenc %xmm2,%xmm9,%xmm9 - vaesenc %xmm2,%xmm10,%xmm10 - vaesenc %xmm2,%xmm11,%xmm11 - vaesenc %xmm2,%xmm12,%xmm12 - - vmovdqa 48(%rdx),%xmm1 - vaesenc %xmm1,%xmm9,%xmm9 - vaesenc %xmm1,%xmm10,%xmm10 - vaesenc %xmm1,%xmm11,%xmm11 - vaesenc %xmm1,%xmm12,%xmm12 - - vmovdqa 64(%rdx),%xmm2 - vaesenc %xmm2,%xmm9,%xmm9 - vaesenc %xmm2,%xmm10,%xmm10 - vaesenc %xmm2,%xmm11,%xmm11 - vaesenc %xmm2,%xmm12,%xmm12 - - vmovdqa 80(%rdx),%xmm1 - vaesenc %xmm1,%xmm9,%xmm9 - vaesenc %xmm1,%xmm10,%xmm10 - vaesenc %xmm1,%xmm11,%xmm11 - vaesenc %xmm1,%xmm12,%xmm12 - - vmovdqa 96(%rdx),%xmm2 - vaesenc %xmm2,%xmm9,%xmm9 - vaesenc %xmm2,%xmm10,%xmm10 - vaesenc %xmm2,%xmm11,%xmm11 - vaesenc %xmm2,%xmm12,%xmm12 - - vmovdqa 112(%rdx),%xmm1 - vaesenc %xmm1,%xmm9,%xmm9 - vaesenc %xmm1,%xmm10,%xmm10 - vaesenc %xmm1,%xmm11,%xmm11 - vaesenc %xmm1,%xmm12,%xmm12 - - vmovdqa 128(%rdx),%xmm2 - vaesenc %xmm2,%xmm9,%xmm9 - vaesenc %xmm2,%xmm10,%xmm10 - vaesenc %xmm2,%xmm11,%xmm11 - vaesenc %xmm2,%xmm12,%xmm12 - - vmovdqa 144(%rdx),%xmm1 - vaesenc %xmm1,%xmm9,%xmm9 - vaesenc %xmm1,%xmm10,%xmm10 - vaesenc %xmm1,%xmm11,%xmm11 - vaesenc %xmm1,%xmm12,%xmm12 - - vmovdqa 160(%rdx),%xmm2 - vaesenclast %xmm2,%xmm9,%xmm9 - vaesenclast %xmm2,%xmm10,%xmm10 - vaesenclast %xmm2,%xmm11,%xmm11 - vaesenclast %xmm2,%xmm12,%xmm12 - - - vmovdqa %xmm9,0(%rsi) - vmovdqa %xmm10,16(%rsi) - vmovdqa %xmm11,32(%rsi) - vmovdqa %xmm12,48(%rsi) - .byte 0xf3,0xc3 -.cfi_endproc -.size aes128gcmsiv_kdf,.-aes128gcmsiv_kdf -.globl aes128gcmsiv_enc_msg_x4 -.hidden aes128gcmsiv_enc_msg_x4 -.type aes128gcmsiv_enc_msg_x4,@function -.align 16 -aes128gcmsiv_enc_msg_x4: -.cfi_startproc - testq %r8,%r8 - jnz .L128_enc_msg_x4_start - .byte 0xf3,0xc3 - -.L128_enc_msg_x4_start: - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-16 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-24 - - shrq $4,%r8 - movq %r8,%r10 - shlq $62,%r10 - shrq $62,%r10 
- - - vmovdqa (%rdx),%xmm15 - vpor OR_MASK(%rip),%xmm15,%xmm15 - - vmovdqu four(%rip),%xmm4 - vmovdqa %xmm15,%xmm0 - vpaddd one(%rip),%xmm15,%xmm1 - vpaddd two(%rip),%xmm15,%xmm2 - vpaddd three(%rip),%xmm15,%xmm3 - - shrq $2,%r8 - je .L128_enc_msg_x4_check_remainder - - subq $64,%rsi - subq $64,%rdi - -.L128_enc_msg_x4_loop1: - addq $64,%rsi - addq $64,%rdi - - vmovdqa %xmm0,%xmm5 - vmovdqa %xmm1,%xmm6 - vmovdqa %xmm2,%xmm7 - vmovdqa %xmm3,%xmm8 - - vpxor (%rcx),%xmm5,%xmm5 - vpxor (%rcx),%xmm6,%xmm6 - vpxor (%rcx),%xmm7,%xmm7 - vpxor (%rcx),%xmm8,%xmm8 - - vmovdqu 16(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc %xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vpaddd %xmm4,%xmm0,%xmm0 - vmovdqu 32(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc %xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vpaddd %xmm4,%xmm1,%xmm1 - vmovdqu 48(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc %xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vpaddd %xmm4,%xmm2,%xmm2 - vmovdqu 64(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc %xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vpaddd %xmm4,%xmm3,%xmm3 - - vmovdqu 80(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc %xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vmovdqu 96(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc %xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vmovdqu 112(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc %xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vmovdqu 128(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc %xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vmovdqu 144(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc %xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vmovdqu 160(%rcx),%xmm12 - 
vaesenclast %xmm12,%xmm5,%xmm5 - vaesenclast %xmm12,%xmm6,%xmm6 - vaesenclast %xmm12,%xmm7,%xmm7 - vaesenclast %xmm12,%xmm8,%xmm8 - - - - vpxor 0(%rdi),%xmm5,%xmm5 - vpxor 16(%rdi),%xmm6,%xmm6 - vpxor 32(%rdi),%xmm7,%xmm7 - vpxor 48(%rdi),%xmm8,%xmm8 - - subq $1,%r8 - - vmovdqu %xmm5,0(%rsi) - vmovdqu %xmm6,16(%rsi) - vmovdqu %xmm7,32(%rsi) - vmovdqu %xmm8,48(%rsi) - - jne .L128_enc_msg_x4_loop1 - - addq $64,%rsi - addq $64,%rdi - -.L128_enc_msg_x4_check_remainder: - cmpq $0,%r10 - je .L128_enc_msg_x4_out - -.L128_enc_msg_x4_loop2: - - - vmovdqa %xmm0,%xmm5 - vpaddd one(%rip),%xmm0,%xmm0 - - vpxor (%rcx),%xmm5,%xmm5 - vaesenc 16(%rcx),%xmm5,%xmm5 - vaesenc 32(%rcx),%xmm5,%xmm5 - vaesenc 48(%rcx),%xmm5,%xmm5 - vaesenc 64(%rcx),%xmm5,%xmm5 - vaesenc 80(%rcx),%xmm5,%xmm5 - vaesenc 96(%rcx),%xmm5,%xmm5 - vaesenc 112(%rcx),%xmm5,%xmm5 - vaesenc 128(%rcx),%xmm5,%xmm5 - vaesenc 144(%rcx),%xmm5,%xmm5 - vaesenclast 160(%rcx),%xmm5,%xmm5 - - - vpxor (%rdi),%xmm5,%xmm5 - vmovdqu %xmm5,(%rsi) - - addq $16,%rdi - addq $16,%rsi - - subq $1,%r10 - jne .L128_enc_msg_x4_loop2 - -.L128_enc_msg_x4_out: - popq %r13 -.cfi_adjust_cfa_offset -8 -.cfi_restore %r13 - popq %r12 -.cfi_adjust_cfa_offset -8 -.cfi_restore %r12 - .byte 0xf3,0xc3 -.cfi_endproc -.size aes128gcmsiv_enc_msg_x4,.-aes128gcmsiv_enc_msg_x4 -.globl aes128gcmsiv_enc_msg_x8 -.hidden aes128gcmsiv_enc_msg_x8 -.type aes128gcmsiv_enc_msg_x8,@function -.align 16 -aes128gcmsiv_enc_msg_x8: -.cfi_startproc - testq %r8,%r8 - jnz .L128_enc_msg_x8_start - .byte 0xf3,0xc3 - -.L128_enc_msg_x8_start: - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-16 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-24 - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-32 - movq %rsp,%rbp -.cfi_def_cfa_register rbp - - - subq $128,%rsp - andq $-64,%rsp - - shrq $4,%r8 - movq %r8,%r10 - shlq $61,%r10 - shrq $61,%r10 - - - vmovdqu (%rdx),%xmm1 - vpor OR_MASK(%rip),%xmm1,%xmm1 - - - vpaddd seven(%rip),%xmm1,%xmm0 - vmovdqu 
%xmm0,(%rsp) - vpaddd one(%rip),%xmm1,%xmm9 - vpaddd two(%rip),%xmm1,%xmm10 - vpaddd three(%rip),%xmm1,%xmm11 - vpaddd four(%rip),%xmm1,%xmm12 - vpaddd five(%rip),%xmm1,%xmm13 - vpaddd six(%rip),%xmm1,%xmm14 - vmovdqa %xmm1,%xmm0 - - shrq $3,%r8 - je .L128_enc_msg_x8_check_remainder - - subq $128,%rsi - subq $128,%rdi - -.L128_enc_msg_x8_loop1: - addq $128,%rsi - addq $128,%rdi - - vmovdqa %xmm0,%xmm1 - vmovdqa %xmm9,%xmm2 - vmovdqa %xmm10,%xmm3 - vmovdqa %xmm11,%xmm4 - vmovdqa %xmm12,%xmm5 - vmovdqa %xmm13,%xmm6 - vmovdqa %xmm14,%xmm7 - - vmovdqu (%rsp),%xmm8 - - vpxor (%rcx),%xmm1,%xmm1 - vpxor (%rcx),%xmm2,%xmm2 - vpxor (%rcx),%xmm3,%xmm3 - vpxor (%rcx),%xmm4,%xmm4 - vpxor (%rcx),%xmm5,%xmm5 - vpxor (%rcx),%xmm6,%xmm6 - vpxor (%rcx),%xmm7,%xmm7 - vpxor (%rcx),%xmm8,%xmm8 - - vmovdqu 16(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc %xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 - vaesenc %xmm15,%xmm7,%xmm7 - vaesenc %xmm15,%xmm8,%xmm8 - - vmovdqu (%rsp),%xmm14 - vpaddd eight(%rip),%xmm14,%xmm14 - vmovdqu %xmm14,(%rsp) - vmovdqu 32(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc %xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 - vaesenc %xmm15,%xmm7,%xmm7 - vaesenc %xmm15,%xmm8,%xmm8 - - vpsubd one(%rip),%xmm14,%xmm14 - vmovdqu 48(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc %xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 - vaesenc %xmm15,%xmm7,%xmm7 - vaesenc %xmm15,%xmm8,%xmm8 - - vpaddd eight(%rip),%xmm0,%xmm0 - vmovdqu 64(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc %xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 - vaesenc %xmm15,%xmm7,%xmm7 - vaesenc %xmm15,%xmm8,%xmm8 - - vpaddd 
eight(%rip),%xmm9,%xmm9 - vmovdqu 80(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc %xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 - vaesenc %xmm15,%xmm7,%xmm7 - vaesenc %xmm15,%xmm8,%xmm8 - - vpaddd eight(%rip),%xmm10,%xmm10 - vmovdqu 96(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc %xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 - vaesenc %xmm15,%xmm7,%xmm7 - vaesenc %xmm15,%xmm8,%xmm8 - - vpaddd eight(%rip),%xmm11,%xmm11 - vmovdqu 112(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc %xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 - vaesenc %xmm15,%xmm7,%xmm7 - vaesenc %xmm15,%xmm8,%xmm8 - - vpaddd eight(%rip),%xmm12,%xmm12 - vmovdqu 128(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc %xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 - vaesenc %xmm15,%xmm7,%xmm7 - vaesenc %xmm15,%xmm8,%xmm8 - - vpaddd eight(%rip),%xmm13,%xmm13 - vmovdqu 144(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc %xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 - vaesenc %xmm15,%xmm7,%xmm7 - vaesenc %xmm15,%xmm8,%xmm8 - - vmovdqu 160(%rcx),%xmm15 - vaesenclast %xmm15,%xmm1,%xmm1 - vaesenclast %xmm15,%xmm2,%xmm2 - vaesenclast %xmm15,%xmm3,%xmm3 - vaesenclast %xmm15,%xmm4,%xmm4 - vaesenclast %xmm15,%xmm5,%xmm5 - vaesenclast %xmm15,%xmm6,%xmm6 - vaesenclast %xmm15,%xmm7,%xmm7 - vaesenclast %xmm15,%xmm8,%xmm8 - - - - vpxor 0(%rdi),%xmm1,%xmm1 - vpxor 16(%rdi),%xmm2,%xmm2 - vpxor 32(%rdi),%xmm3,%xmm3 - vpxor 48(%rdi),%xmm4,%xmm4 - vpxor 64(%rdi),%xmm5,%xmm5 - vpxor 80(%rdi),%xmm6,%xmm6 - vpxor 96(%rdi),%xmm7,%xmm7 - vpxor 112(%rdi),%xmm8,%xmm8 - - 
decq %r8 - - vmovdqu %xmm1,0(%rsi) - vmovdqu %xmm2,16(%rsi) - vmovdqu %xmm3,32(%rsi) - vmovdqu %xmm4,48(%rsi) - vmovdqu %xmm5,64(%rsi) - vmovdqu %xmm6,80(%rsi) - vmovdqu %xmm7,96(%rsi) - vmovdqu %xmm8,112(%rsi) - - jne .L128_enc_msg_x8_loop1 - - addq $128,%rsi - addq $128,%rdi - -.L128_enc_msg_x8_check_remainder: - cmpq $0,%r10 - je .L128_enc_msg_x8_out - -.L128_enc_msg_x8_loop2: - - - vmovdqa %xmm0,%xmm1 - vpaddd one(%rip),%xmm0,%xmm0 - - vpxor (%rcx),%xmm1,%xmm1 - vaesenc 16(%rcx),%xmm1,%xmm1 - vaesenc 32(%rcx),%xmm1,%xmm1 - vaesenc 48(%rcx),%xmm1,%xmm1 - vaesenc 64(%rcx),%xmm1,%xmm1 - vaesenc 80(%rcx),%xmm1,%xmm1 - vaesenc 96(%rcx),%xmm1,%xmm1 - vaesenc 112(%rcx),%xmm1,%xmm1 - vaesenc 128(%rcx),%xmm1,%xmm1 - vaesenc 144(%rcx),%xmm1,%xmm1 - vaesenclast 160(%rcx),%xmm1,%xmm1 - - - vpxor (%rdi),%xmm1,%xmm1 - - vmovdqu %xmm1,(%rsi) - - addq $16,%rdi - addq $16,%rsi - - decq %r10 - jne .L128_enc_msg_x8_loop2 - -.L128_enc_msg_x8_out: - movq %rbp,%rsp -.cfi_def_cfa_register %rsp - popq %rbp -.cfi_adjust_cfa_offset -8 -.cfi_restore %rbp - popq %r13 -.cfi_adjust_cfa_offset -8 -.cfi_restore %r13 - popq %r12 -.cfi_adjust_cfa_offset -8 -.cfi_restore %r12 - .byte 0xf3,0xc3 -.cfi_endproc -.size aes128gcmsiv_enc_msg_x8,.-aes128gcmsiv_enc_msg_x8 -.globl aes128gcmsiv_dec -.hidden aes128gcmsiv_dec -.type aes128gcmsiv_dec,@function -.align 16 -aes128gcmsiv_dec: -.cfi_startproc - testq $~15,%r9 - jnz .L128_dec_start - .byte 0xf3,0xc3 - -.L128_dec_start: - vzeroupper - vmovdqa (%rdx),%xmm0 - movq %rdx,%rax - - leaq 32(%rax),%rax - leaq 32(%rcx),%rcx - - - vmovdqu (%rdi,%r9,1),%xmm15 - vpor OR_MASK(%rip),%xmm15,%xmm15 - andq $~15,%r9 - - - cmpq $96,%r9 - jb .L128_dec_loop2 - - - subq $96,%r9 - vmovdqa %xmm15,%xmm7 - vpaddd one(%rip),%xmm7,%xmm8 - vpaddd two(%rip),%xmm7,%xmm9 - vpaddd one(%rip),%xmm9,%xmm10 - vpaddd two(%rip),%xmm9,%xmm11 - vpaddd one(%rip),%xmm11,%xmm12 - vpaddd two(%rip),%xmm11,%xmm15 - - vpxor (%r8),%xmm7,%xmm7 - vpxor (%r8),%xmm8,%xmm8 - vpxor (%r8),%xmm9,%xmm9 - 
vpxor (%r8),%xmm10,%xmm10 - vpxor (%r8),%xmm11,%xmm11 - vpxor (%r8),%xmm12,%xmm12 - - vmovdqu 16(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 32(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 48(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 64(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 80(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 96(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 112(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 128(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 144(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 160(%r8),%xmm4 - vaesenclast %xmm4,%xmm7,%xmm7 - vaesenclast %xmm4,%xmm8,%xmm8 - vaesenclast 
%xmm4,%xmm9,%xmm9 - vaesenclast %xmm4,%xmm10,%xmm10 - vaesenclast %xmm4,%xmm11,%xmm11 - vaesenclast %xmm4,%xmm12,%xmm12 - - - vpxor 0(%rdi),%xmm7,%xmm7 - vpxor 16(%rdi),%xmm8,%xmm8 - vpxor 32(%rdi),%xmm9,%xmm9 - vpxor 48(%rdi),%xmm10,%xmm10 - vpxor 64(%rdi),%xmm11,%xmm11 - vpxor 80(%rdi),%xmm12,%xmm12 - - vmovdqu %xmm7,0(%rsi) - vmovdqu %xmm8,16(%rsi) - vmovdqu %xmm9,32(%rsi) - vmovdqu %xmm10,48(%rsi) - vmovdqu %xmm11,64(%rsi) - vmovdqu %xmm12,80(%rsi) - - addq $96,%rdi - addq $96,%rsi - jmp .L128_dec_loop1 - - -.align 64 -.L128_dec_loop1: - cmpq $96,%r9 - jb .L128_dec_finish_96 - subq $96,%r9 - - vmovdqa %xmm12,%xmm6 - vmovdqa %xmm11,16-32(%rax) - vmovdqa %xmm10,32-32(%rax) - vmovdqa %xmm9,48-32(%rax) - vmovdqa %xmm8,64-32(%rax) - vmovdqa %xmm7,80-32(%rax) - - vmovdqa %xmm15,%xmm7 - vpaddd one(%rip),%xmm7,%xmm8 - vpaddd two(%rip),%xmm7,%xmm9 - vpaddd one(%rip),%xmm9,%xmm10 - vpaddd two(%rip),%xmm9,%xmm11 - vpaddd one(%rip),%xmm11,%xmm12 - vpaddd two(%rip),%xmm11,%xmm15 - - vmovdqa (%r8),%xmm4 - vpxor %xmm4,%xmm7,%xmm7 - vpxor %xmm4,%xmm8,%xmm8 - vpxor %xmm4,%xmm9,%xmm9 - vpxor %xmm4,%xmm10,%xmm10 - vpxor %xmm4,%xmm11,%xmm11 - vpxor %xmm4,%xmm12,%xmm12 - - vmovdqu 0-32(%rcx),%xmm4 - vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2 - vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3 - vpclmulqdq $0x01,%xmm4,%xmm6,%xmm1 - vpclmulqdq $0x10,%xmm4,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - vmovdqu 16(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu -16(%rax),%xmm6 - vmovdqu -16(%rcx),%xmm13 - - vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - - vmovdqu 32(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 
- vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 0(%rax),%xmm6 - vmovdqu 0(%rcx),%xmm13 - - vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - - vmovdqu 48(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 16(%rax),%xmm6 - vmovdqu 16(%rcx),%xmm13 - - vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - - vmovdqu 64(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 32(%rax),%xmm6 - vmovdqu 32(%rcx),%xmm13 - - vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - - vmovdqu 80(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 96(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 112(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - 
vaesenc %xmm4,%xmm12,%xmm12 - - - vmovdqa 80-32(%rax),%xmm6 - vpxor %xmm0,%xmm6,%xmm6 - vmovdqu 80-32(%rcx),%xmm5 - - vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4 - vpxor %xmm4,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - vmovdqu 128(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - - vpsrldq $8,%xmm1,%xmm4 - vpxor %xmm4,%xmm2,%xmm5 - vpslldq $8,%xmm1,%xmm4 - vpxor %xmm4,%xmm3,%xmm0 - - vmovdqa poly(%rip),%xmm3 - - vmovdqu 144(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 160(%r8),%xmm6 - vpalignr $8,%xmm0,%xmm0,%xmm2 - vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 - vpxor %xmm0,%xmm2,%xmm0 - - vpxor 0(%rdi),%xmm6,%xmm4 - vaesenclast %xmm4,%xmm7,%xmm7 - vpxor 16(%rdi),%xmm6,%xmm4 - vaesenclast %xmm4,%xmm8,%xmm8 - vpxor 32(%rdi),%xmm6,%xmm4 - vaesenclast %xmm4,%xmm9,%xmm9 - vpxor 48(%rdi),%xmm6,%xmm4 - vaesenclast %xmm4,%xmm10,%xmm10 - vpxor 64(%rdi),%xmm6,%xmm4 - vaesenclast %xmm4,%xmm11,%xmm11 - vpxor 80(%rdi),%xmm6,%xmm4 - vaesenclast %xmm4,%xmm12,%xmm12 - - vpalignr $8,%xmm0,%xmm0,%xmm2 - vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 - vpxor %xmm0,%xmm2,%xmm0 - - vmovdqu %xmm7,0(%rsi) - vmovdqu %xmm8,16(%rsi) - vmovdqu %xmm9,32(%rsi) - vmovdqu %xmm10,48(%rsi) - vmovdqu %xmm11,64(%rsi) - vmovdqu %xmm12,80(%rsi) - - vpxor %xmm5,%xmm0,%xmm0 - - leaq 96(%rdi),%rdi - leaq 96(%rsi),%rsi - jmp .L128_dec_loop1 - -.L128_dec_finish_96: - vmovdqa %xmm12,%xmm6 - vmovdqa %xmm11,16-32(%rax) - vmovdqa %xmm10,32-32(%rax) - vmovdqa %xmm9,48-32(%rax) - vmovdqa %xmm8,64-32(%rax) - vmovdqa %xmm7,80-32(%rax) - - vmovdqu 0-32(%rcx),%xmm4 - vpclmulqdq $0x10,%xmm4,%xmm6,%xmm1 
- vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2 - vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3 - vpclmulqdq $0x01,%xmm4,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - vmovdqu -16(%rax),%xmm6 - vmovdqu -16(%rcx),%xmm13 - - vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - vmovdqu 0(%rax),%xmm6 - vmovdqu 0(%rcx),%xmm13 - - vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - vmovdqu 16(%rax),%xmm6 - vmovdqu 16(%rcx),%xmm13 - - vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - vmovdqu 32(%rax),%xmm6 - vmovdqu 32(%rcx),%xmm13 - - vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - - vmovdqu 80-32(%rax),%xmm6 - vpxor %xmm0,%xmm6,%xmm6 - vmovdqu 80-32(%rcx),%xmm5 - vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4 - vpxor %xmm4,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - vpsrldq $8,%xmm1,%xmm4 - vpxor %xmm4,%xmm2,%xmm5 - vpslldq $8,%xmm1,%xmm4 - vpxor %xmm4,%xmm3,%xmm0 - - vmovdqa poly(%rip),%xmm3 - - vpalignr $8,%xmm0,%xmm0,%xmm2 - vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 - vpxor %xmm0,%xmm2,%xmm0 - - vpalignr $8,%xmm0,%xmm0,%xmm2 - vpclmulqdq 
$0x10,%xmm3,%xmm0,%xmm0 - vpxor %xmm0,%xmm2,%xmm0 - - vpxor %xmm5,%xmm0,%xmm0 - -.L128_dec_loop2: - - - - cmpq $16,%r9 - jb .L128_dec_out - subq $16,%r9 - - vmovdqa %xmm15,%xmm2 - vpaddd one(%rip),%xmm15,%xmm15 - - vpxor 0(%r8),%xmm2,%xmm2 - vaesenc 16(%r8),%xmm2,%xmm2 - vaesenc 32(%r8),%xmm2,%xmm2 - vaesenc 48(%r8),%xmm2,%xmm2 - vaesenc 64(%r8),%xmm2,%xmm2 - vaesenc 80(%r8),%xmm2,%xmm2 - vaesenc 96(%r8),%xmm2,%xmm2 - vaesenc 112(%r8),%xmm2,%xmm2 - vaesenc 128(%r8),%xmm2,%xmm2 - vaesenc 144(%r8),%xmm2,%xmm2 - vaesenclast 160(%r8),%xmm2,%xmm2 - vpxor (%rdi),%xmm2,%xmm2 - vmovdqu %xmm2,(%rsi) - addq $16,%rdi - addq $16,%rsi - - vpxor %xmm2,%xmm0,%xmm0 - vmovdqa -32(%rcx),%xmm1 - call GFMUL - - jmp .L128_dec_loop2 - -.L128_dec_out: - vmovdqu %xmm0,(%rdx) - .byte 0xf3,0xc3 -.cfi_endproc -.size aes128gcmsiv_dec, .-aes128gcmsiv_dec -.globl aes128gcmsiv_ecb_enc_block -.hidden aes128gcmsiv_ecb_enc_block -.type aes128gcmsiv_ecb_enc_block,@function -.align 16 -aes128gcmsiv_ecb_enc_block: -.cfi_startproc - vmovdqa (%rdi),%xmm1 - - vpxor (%rdx),%xmm1,%xmm1 - vaesenc 16(%rdx),%xmm1,%xmm1 - vaesenc 32(%rdx),%xmm1,%xmm1 - vaesenc 48(%rdx),%xmm1,%xmm1 - vaesenc 64(%rdx),%xmm1,%xmm1 - vaesenc 80(%rdx),%xmm1,%xmm1 - vaesenc 96(%rdx),%xmm1,%xmm1 - vaesenc 112(%rdx),%xmm1,%xmm1 - vaesenc 128(%rdx),%xmm1,%xmm1 - vaesenc 144(%rdx),%xmm1,%xmm1 - vaesenclast 160(%rdx),%xmm1,%xmm1 - - vmovdqa %xmm1,(%rsi) - - .byte 0xf3,0xc3 -.cfi_endproc -.size aes128gcmsiv_ecb_enc_block,.-aes128gcmsiv_ecb_enc_block -.globl aes256gcmsiv_aes_ks_enc_x1 -.hidden aes256gcmsiv_aes_ks_enc_x1 -.type aes256gcmsiv_aes_ks_enc_x1,@function -.align 16 -aes256gcmsiv_aes_ks_enc_x1: -.cfi_startproc - vmovdqa con1(%rip),%xmm0 - vmovdqa mask(%rip),%xmm15 - vmovdqa (%rdi),%xmm8 - vmovdqa (%rcx),%xmm1 - vmovdqa 16(%rcx),%xmm3 - vpxor %xmm1,%xmm8,%xmm8 - vaesenc %xmm3,%xmm8,%xmm8 - vmovdqu %xmm1,(%rdx) - vmovdqu %xmm3,16(%rdx) - vpxor %xmm14,%xmm14,%xmm14 - - vpshufb %xmm15,%xmm3,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - 
vpslld $1,%xmm0,%xmm0 - vpslldq $4,%xmm1,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - vaesenc %xmm1,%xmm8,%xmm8 - vmovdqu %xmm1,32(%rdx) - - vpshufd $0xff,%xmm1,%xmm2 - vaesenclast %xmm14,%xmm2,%xmm2 - vpslldq $4,%xmm3,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpxor %xmm2,%xmm3,%xmm3 - vaesenc %xmm3,%xmm8,%xmm8 - vmovdqu %xmm3,48(%rdx) - - vpshufb %xmm15,%xmm3,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - vpslld $1,%xmm0,%xmm0 - vpslldq $4,%xmm1,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - vaesenc %xmm1,%xmm8,%xmm8 - vmovdqu %xmm1,64(%rdx) - - vpshufd $0xff,%xmm1,%xmm2 - vaesenclast %xmm14,%xmm2,%xmm2 - vpslldq $4,%xmm3,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpxor %xmm2,%xmm3,%xmm3 - vaesenc %xmm3,%xmm8,%xmm8 - vmovdqu %xmm3,80(%rdx) - - vpshufb %xmm15,%xmm3,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - vpslld $1,%xmm0,%xmm0 - vpslldq $4,%xmm1,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - vaesenc %xmm1,%xmm8,%xmm8 - vmovdqu %xmm1,96(%rdx) - - vpshufd $0xff,%xmm1,%xmm2 - vaesenclast %xmm14,%xmm2,%xmm2 - vpslldq $4,%xmm3,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpxor %xmm2,%xmm3,%xmm3 - vaesenc %xmm3,%xmm8,%xmm8 - vmovdqu %xmm3,112(%rdx) - - vpshufb %xmm15,%xmm3,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - vpslld $1,%xmm0,%xmm0 - vpslldq $4,%xmm1,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpslldq $4,%xmm4,%xmm4 - vpxor 
%xmm4,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - vaesenc %xmm1,%xmm8,%xmm8 - vmovdqu %xmm1,128(%rdx) - - vpshufd $0xff,%xmm1,%xmm2 - vaesenclast %xmm14,%xmm2,%xmm2 - vpslldq $4,%xmm3,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpxor %xmm2,%xmm3,%xmm3 - vaesenc %xmm3,%xmm8,%xmm8 - vmovdqu %xmm3,144(%rdx) - - vpshufb %xmm15,%xmm3,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - vpslld $1,%xmm0,%xmm0 - vpslldq $4,%xmm1,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - vaesenc %xmm1,%xmm8,%xmm8 - vmovdqu %xmm1,160(%rdx) - - vpshufd $0xff,%xmm1,%xmm2 - vaesenclast %xmm14,%xmm2,%xmm2 - vpslldq $4,%xmm3,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpxor %xmm2,%xmm3,%xmm3 - vaesenc %xmm3,%xmm8,%xmm8 - vmovdqu %xmm3,176(%rdx) - - vpshufb %xmm15,%xmm3,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - vpslld $1,%xmm0,%xmm0 - vpslldq $4,%xmm1,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - vaesenc %xmm1,%xmm8,%xmm8 - vmovdqu %xmm1,192(%rdx) - - vpshufd $0xff,%xmm1,%xmm2 - vaesenclast %xmm14,%xmm2,%xmm2 - vpslldq $4,%xmm3,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpxor %xmm2,%xmm3,%xmm3 - vaesenc %xmm3,%xmm8,%xmm8 - vmovdqu %xmm3,208(%rdx) - - vpshufb %xmm15,%xmm3,%xmm2 - vaesenclast %xmm0,%xmm2,%xmm2 - vpslldq $4,%xmm1,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpslldq $4,%xmm4,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpxor %xmm2,%xmm1,%xmm1 - vaesenclast %xmm1,%xmm8,%xmm8 - vmovdqu %xmm1,224(%rdx) - - vmovdqa %xmm8,(%rsi) - .byte 0xf3,0xc3 -.cfi_endproc -.size 
aes256gcmsiv_aes_ks_enc_x1,.-aes256gcmsiv_aes_ks_enc_x1 -.globl aes256gcmsiv_ecb_enc_block -.hidden aes256gcmsiv_ecb_enc_block -.type aes256gcmsiv_ecb_enc_block,@function -.align 16 -aes256gcmsiv_ecb_enc_block: -.cfi_startproc - vmovdqa (%rdi),%xmm1 - vpxor (%rdx),%xmm1,%xmm1 - vaesenc 16(%rdx),%xmm1,%xmm1 - vaesenc 32(%rdx),%xmm1,%xmm1 - vaesenc 48(%rdx),%xmm1,%xmm1 - vaesenc 64(%rdx),%xmm1,%xmm1 - vaesenc 80(%rdx),%xmm1,%xmm1 - vaesenc 96(%rdx),%xmm1,%xmm1 - vaesenc 112(%rdx),%xmm1,%xmm1 - vaesenc 128(%rdx),%xmm1,%xmm1 - vaesenc 144(%rdx),%xmm1,%xmm1 - vaesenc 160(%rdx),%xmm1,%xmm1 - vaesenc 176(%rdx),%xmm1,%xmm1 - vaesenc 192(%rdx),%xmm1,%xmm1 - vaesenc 208(%rdx),%xmm1,%xmm1 - vaesenclast 224(%rdx),%xmm1,%xmm1 - vmovdqa %xmm1,(%rsi) - .byte 0xf3,0xc3 -.cfi_endproc -.size aes256gcmsiv_ecb_enc_block,.-aes256gcmsiv_ecb_enc_block -.globl aes256gcmsiv_enc_msg_x4 -.hidden aes256gcmsiv_enc_msg_x4 -.type aes256gcmsiv_enc_msg_x4,@function -.align 16 -aes256gcmsiv_enc_msg_x4: -.cfi_startproc - testq %r8,%r8 - jnz .L256_enc_msg_x4_start - .byte 0xf3,0xc3 - -.L256_enc_msg_x4_start: - movq %r8,%r10 - shrq $4,%r8 - shlq $60,%r10 - jz .L256_enc_msg_x4_start2 - addq $1,%r8 - -.L256_enc_msg_x4_start2: - movq %r8,%r10 - shlq $62,%r10 - shrq $62,%r10 - - - vmovdqa (%rdx),%xmm15 - vpor OR_MASK(%rip),%xmm15,%xmm15 - - vmovdqa four(%rip),%xmm4 - vmovdqa %xmm15,%xmm0 - vpaddd one(%rip),%xmm15,%xmm1 - vpaddd two(%rip),%xmm15,%xmm2 - vpaddd three(%rip),%xmm15,%xmm3 - - shrq $2,%r8 - je .L256_enc_msg_x4_check_remainder - - subq $64,%rsi - subq $64,%rdi - -.L256_enc_msg_x4_loop1: - addq $64,%rsi - addq $64,%rdi - - vmovdqa %xmm0,%xmm5 - vmovdqa %xmm1,%xmm6 - vmovdqa %xmm2,%xmm7 - vmovdqa %xmm3,%xmm8 - - vpxor (%rcx),%xmm5,%xmm5 - vpxor (%rcx),%xmm6,%xmm6 - vpxor (%rcx),%xmm7,%xmm7 - vpxor (%rcx),%xmm8,%xmm8 - - vmovdqu 16(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc %xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vpaddd %xmm4,%xmm0,%xmm0 - vmovdqu 
32(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc %xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vpaddd %xmm4,%xmm1,%xmm1 - vmovdqu 48(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc %xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vpaddd %xmm4,%xmm2,%xmm2 - vmovdqu 64(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc %xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vpaddd %xmm4,%xmm3,%xmm3 - - vmovdqu 80(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc %xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vmovdqu 96(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc %xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vmovdqu 112(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc %xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vmovdqu 128(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc %xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vmovdqu 144(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc %xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vmovdqu 160(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc %xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vmovdqu 176(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc %xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vmovdqu 192(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc %xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vmovdqu 208(%rcx),%xmm12 - vaesenc %xmm12,%xmm5,%xmm5 - vaesenc %xmm12,%xmm6,%xmm6 - vaesenc %xmm12,%xmm7,%xmm7 - vaesenc %xmm12,%xmm8,%xmm8 - - vmovdqu 224(%rcx),%xmm12 - vaesenclast %xmm12,%xmm5,%xmm5 - vaesenclast %xmm12,%xmm6,%xmm6 - vaesenclast %xmm12,%xmm7,%xmm7 - vaesenclast %xmm12,%xmm8,%xmm8 - - - - vpxor 
0(%rdi),%xmm5,%xmm5 - vpxor 16(%rdi),%xmm6,%xmm6 - vpxor 32(%rdi),%xmm7,%xmm7 - vpxor 48(%rdi),%xmm8,%xmm8 - - subq $1,%r8 - - vmovdqu %xmm5,0(%rsi) - vmovdqu %xmm6,16(%rsi) - vmovdqu %xmm7,32(%rsi) - vmovdqu %xmm8,48(%rsi) - - jne .L256_enc_msg_x4_loop1 - - addq $64,%rsi - addq $64,%rdi - -.L256_enc_msg_x4_check_remainder: - cmpq $0,%r10 - je .L256_enc_msg_x4_out - -.L256_enc_msg_x4_loop2: - - - - vmovdqa %xmm0,%xmm5 - vpaddd one(%rip),%xmm0,%xmm0 - vpxor (%rcx),%xmm5,%xmm5 - vaesenc 16(%rcx),%xmm5,%xmm5 - vaesenc 32(%rcx),%xmm5,%xmm5 - vaesenc 48(%rcx),%xmm5,%xmm5 - vaesenc 64(%rcx),%xmm5,%xmm5 - vaesenc 80(%rcx),%xmm5,%xmm5 - vaesenc 96(%rcx),%xmm5,%xmm5 - vaesenc 112(%rcx),%xmm5,%xmm5 - vaesenc 128(%rcx),%xmm5,%xmm5 - vaesenc 144(%rcx),%xmm5,%xmm5 - vaesenc 160(%rcx),%xmm5,%xmm5 - vaesenc 176(%rcx),%xmm5,%xmm5 - vaesenc 192(%rcx),%xmm5,%xmm5 - vaesenc 208(%rcx),%xmm5,%xmm5 - vaesenclast 224(%rcx),%xmm5,%xmm5 - - - vpxor (%rdi),%xmm5,%xmm5 - - vmovdqu %xmm5,(%rsi) - - addq $16,%rdi - addq $16,%rsi - - subq $1,%r10 - jne .L256_enc_msg_x4_loop2 - -.L256_enc_msg_x4_out: - .byte 0xf3,0xc3 -.cfi_endproc -.size aes256gcmsiv_enc_msg_x4,.-aes256gcmsiv_enc_msg_x4 -.globl aes256gcmsiv_enc_msg_x8 -.hidden aes256gcmsiv_enc_msg_x8 -.type aes256gcmsiv_enc_msg_x8,@function -.align 16 -aes256gcmsiv_enc_msg_x8: -.cfi_startproc - testq %r8,%r8 - jnz .L256_enc_msg_x8_start - .byte 0xf3,0xc3 - -.L256_enc_msg_x8_start: - - movq %rsp,%r11 - subq $16,%r11 - andq $-64,%r11 - - movq %r8,%r10 - shrq $4,%r8 - shlq $60,%r10 - jz .L256_enc_msg_x8_start2 - addq $1,%r8 - -.L256_enc_msg_x8_start2: - movq %r8,%r10 - shlq $61,%r10 - shrq $61,%r10 - - - vmovdqa (%rdx),%xmm1 - vpor OR_MASK(%rip),%xmm1,%xmm1 - - - vpaddd seven(%rip),%xmm1,%xmm0 - vmovdqa %xmm0,(%r11) - vpaddd one(%rip),%xmm1,%xmm9 - vpaddd two(%rip),%xmm1,%xmm10 - vpaddd three(%rip),%xmm1,%xmm11 - vpaddd four(%rip),%xmm1,%xmm12 - vpaddd five(%rip),%xmm1,%xmm13 - vpaddd six(%rip),%xmm1,%xmm14 - vmovdqa %xmm1,%xmm0 - - shrq $3,%r8 - 
jz .L256_enc_msg_x8_check_remainder - - subq $128,%rsi - subq $128,%rdi - -.L256_enc_msg_x8_loop1: - addq $128,%rsi - addq $128,%rdi - - vmovdqa %xmm0,%xmm1 - vmovdqa %xmm9,%xmm2 - vmovdqa %xmm10,%xmm3 - vmovdqa %xmm11,%xmm4 - vmovdqa %xmm12,%xmm5 - vmovdqa %xmm13,%xmm6 - vmovdqa %xmm14,%xmm7 - - vmovdqa (%r11),%xmm8 - - vpxor (%rcx),%xmm1,%xmm1 - vpxor (%rcx),%xmm2,%xmm2 - vpxor (%rcx),%xmm3,%xmm3 - vpxor (%rcx),%xmm4,%xmm4 - vpxor (%rcx),%xmm5,%xmm5 - vpxor (%rcx),%xmm6,%xmm6 - vpxor (%rcx),%xmm7,%xmm7 - vpxor (%rcx),%xmm8,%xmm8 - - vmovdqu 16(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc %xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 - vaesenc %xmm15,%xmm7,%xmm7 - vaesenc %xmm15,%xmm8,%xmm8 - - vmovdqa (%r11),%xmm14 - vpaddd eight(%rip),%xmm14,%xmm14 - vmovdqa %xmm14,(%r11) - vmovdqu 32(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc %xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 - vaesenc %xmm15,%xmm7,%xmm7 - vaesenc %xmm15,%xmm8,%xmm8 - - vpsubd one(%rip),%xmm14,%xmm14 - vmovdqu 48(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc %xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 - vaesenc %xmm15,%xmm7,%xmm7 - vaesenc %xmm15,%xmm8,%xmm8 - - vpaddd eight(%rip),%xmm0,%xmm0 - vmovdqu 64(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc %xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 - vaesenc %xmm15,%xmm7,%xmm7 - vaesenc %xmm15,%xmm8,%xmm8 - - vpaddd eight(%rip),%xmm9,%xmm9 - vmovdqu 80(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc %xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 - vaesenc %xmm15,%xmm7,%xmm7 - vaesenc 
%xmm15,%xmm8,%xmm8 - - vpaddd eight(%rip),%xmm10,%xmm10 - vmovdqu 96(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc %xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 - vaesenc %xmm15,%xmm7,%xmm7 - vaesenc %xmm15,%xmm8,%xmm8 - - vpaddd eight(%rip),%xmm11,%xmm11 - vmovdqu 112(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc %xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 - vaesenc %xmm15,%xmm7,%xmm7 - vaesenc %xmm15,%xmm8,%xmm8 - - vpaddd eight(%rip),%xmm12,%xmm12 - vmovdqu 128(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc %xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 - vaesenc %xmm15,%xmm7,%xmm7 - vaesenc %xmm15,%xmm8,%xmm8 - - vpaddd eight(%rip),%xmm13,%xmm13 - vmovdqu 144(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc %xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 - vaesenc %xmm15,%xmm7,%xmm7 - vaesenc %xmm15,%xmm8,%xmm8 - - vmovdqu 160(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc %xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 - vaesenc %xmm15,%xmm7,%xmm7 - vaesenc %xmm15,%xmm8,%xmm8 - - vmovdqu 176(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc %xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 - vaesenc %xmm15,%xmm7,%xmm7 - vaesenc %xmm15,%xmm8,%xmm8 - - vmovdqu 192(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc %xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 - vaesenc %xmm15,%xmm7,%xmm7 - vaesenc %xmm15,%xmm8,%xmm8 - - vmovdqu 
208(%rcx),%xmm15 - vaesenc %xmm15,%xmm1,%xmm1 - vaesenc %xmm15,%xmm2,%xmm2 - vaesenc %xmm15,%xmm3,%xmm3 - vaesenc %xmm15,%xmm4,%xmm4 - vaesenc %xmm15,%xmm5,%xmm5 - vaesenc %xmm15,%xmm6,%xmm6 - vaesenc %xmm15,%xmm7,%xmm7 - vaesenc %xmm15,%xmm8,%xmm8 - - vmovdqu 224(%rcx),%xmm15 - vaesenclast %xmm15,%xmm1,%xmm1 - vaesenclast %xmm15,%xmm2,%xmm2 - vaesenclast %xmm15,%xmm3,%xmm3 - vaesenclast %xmm15,%xmm4,%xmm4 - vaesenclast %xmm15,%xmm5,%xmm5 - vaesenclast %xmm15,%xmm6,%xmm6 - vaesenclast %xmm15,%xmm7,%xmm7 - vaesenclast %xmm15,%xmm8,%xmm8 - - - - vpxor 0(%rdi),%xmm1,%xmm1 - vpxor 16(%rdi),%xmm2,%xmm2 - vpxor 32(%rdi),%xmm3,%xmm3 - vpxor 48(%rdi),%xmm4,%xmm4 - vpxor 64(%rdi),%xmm5,%xmm5 - vpxor 80(%rdi),%xmm6,%xmm6 - vpxor 96(%rdi),%xmm7,%xmm7 - vpxor 112(%rdi),%xmm8,%xmm8 - - subq $1,%r8 - - vmovdqu %xmm1,0(%rsi) - vmovdqu %xmm2,16(%rsi) - vmovdqu %xmm3,32(%rsi) - vmovdqu %xmm4,48(%rsi) - vmovdqu %xmm5,64(%rsi) - vmovdqu %xmm6,80(%rsi) - vmovdqu %xmm7,96(%rsi) - vmovdqu %xmm8,112(%rsi) - - jne .L256_enc_msg_x8_loop1 - - addq $128,%rsi - addq $128,%rdi - -.L256_enc_msg_x8_check_remainder: - cmpq $0,%r10 - je .L256_enc_msg_x8_out - -.L256_enc_msg_x8_loop2: - - - vmovdqa %xmm0,%xmm1 - vpaddd one(%rip),%xmm0,%xmm0 - - vpxor (%rcx),%xmm1,%xmm1 - vaesenc 16(%rcx),%xmm1,%xmm1 - vaesenc 32(%rcx),%xmm1,%xmm1 - vaesenc 48(%rcx),%xmm1,%xmm1 - vaesenc 64(%rcx),%xmm1,%xmm1 - vaesenc 80(%rcx),%xmm1,%xmm1 - vaesenc 96(%rcx),%xmm1,%xmm1 - vaesenc 112(%rcx),%xmm1,%xmm1 - vaesenc 128(%rcx),%xmm1,%xmm1 - vaesenc 144(%rcx),%xmm1,%xmm1 - vaesenc 160(%rcx),%xmm1,%xmm1 - vaesenc 176(%rcx),%xmm1,%xmm1 - vaesenc 192(%rcx),%xmm1,%xmm1 - vaesenc 208(%rcx),%xmm1,%xmm1 - vaesenclast 224(%rcx),%xmm1,%xmm1 - - - vpxor (%rdi),%xmm1,%xmm1 - - vmovdqu %xmm1,(%rsi) - - addq $16,%rdi - addq $16,%rsi - subq $1,%r10 - jnz .L256_enc_msg_x8_loop2 - -.L256_enc_msg_x8_out: - .byte 0xf3,0xc3 - -.cfi_endproc -.size aes256gcmsiv_enc_msg_x8,.-aes256gcmsiv_enc_msg_x8 -.globl aes256gcmsiv_dec -.hidden 
aes256gcmsiv_dec -.type aes256gcmsiv_dec,@function -.align 16 -aes256gcmsiv_dec: -.cfi_startproc - testq $~15,%r9 - jnz .L256_dec_start - .byte 0xf3,0xc3 - -.L256_dec_start: - vzeroupper - vmovdqa (%rdx),%xmm0 - movq %rdx,%rax - - leaq 32(%rax),%rax - leaq 32(%rcx),%rcx - - - vmovdqu (%rdi,%r9,1),%xmm15 - vpor OR_MASK(%rip),%xmm15,%xmm15 - andq $~15,%r9 - - - cmpq $96,%r9 - jb .L256_dec_loop2 - - - subq $96,%r9 - vmovdqa %xmm15,%xmm7 - vpaddd one(%rip),%xmm7,%xmm8 - vpaddd two(%rip),%xmm7,%xmm9 - vpaddd one(%rip),%xmm9,%xmm10 - vpaddd two(%rip),%xmm9,%xmm11 - vpaddd one(%rip),%xmm11,%xmm12 - vpaddd two(%rip),%xmm11,%xmm15 - - vpxor (%r8),%xmm7,%xmm7 - vpxor (%r8),%xmm8,%xmm8 - vpxor (%r8),%xmm9,%xmm9 - vpxor (%r8),%xmm10,%xmm10 - vpxor (%r8),%xmm11,%xmm11 - vpxor (%r8),%xmm12,%xmm12 - - vmovdqu 16(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 32(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 48(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 64(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 80(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 96(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - 
vmovdqu 112(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 128(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 144(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 160(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 176(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 192(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 208(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 224(%r8),%xmm4 - vaesenclast %xmm4,%xmm7,%xmm7 - vaesenclast %xmm4,%xmm8,%xmm8 - vaesenclast %xmm4,%xmm9,%xmm9 - vaesenclast %xmm4,%xmm10,%xmm10 - vaesenclast %xmm4,%xmm11,%xmm11 - vaesenclast %xmm4,%xmm12,%xmm12 - - - vpxor 0(%rdi),%xmm7,%xmm7 - vpxor 16(%rdi),%xmm8,%xmm8 - vpxor 32(%rdi),%xmm9,%xmm9 - vpxor 48(%rdi),%xmm10,%xmm10 - vpxor 64(%rdi),%xmm11,%xmm11 - vpxor 80(%rdi),%xmm12,%xmm12 - - vmovdqu %xmm7,0(%rsi) - vmovdqu %xmm8,16(%rsi) - vmovdqu %xmm9,32(%rsi) - vmovdqu %xmm10,48(%rsi) - vmovdqu %xmm11,64(%rsi) - vmovdqu %xmm12,80(%rsi) - - addq $96,%rdi - addq $96,%rsi 
- jmp .L256_dec_loop1 - - -.align 64 -.L256_dec_loop1: - cmpq $96,%r9 - jb .L256_dec_finish_96 - subq $96,%r9 - - vmovdqa %xmm12,%xmm6 - vmovdqa %xmm11,16-32(%rax) - vmovdqa %xmm10,32-32(%rax) - vmovdqa %xmm9,48-32(%rax) - vmovdqa %xmm8,64-32(%rax) - vmovdqa %xmm7,80-32(%rax) - - vmovdqa %xmm15,%xmm7 - vpaddd one(%rip),%xmm7,%xmm8 - vpaddd two(%rip),%xmm7,%xmm9 - vpaddd one(%rip),%xmm9,%xmm10 - vpaddd two(%rip),%xmm9,%xmm11 - vpaddd one(%rip),%xmm11,%xmm12 - vpaddd two(%rip),%xmm11,%xmm15 - - vmovdqa (%r8),%xmm4 - vpxor %xmm4,%xmm7,%xmm7 - vpxor %xmm4,%xmm8,%xmm8 - vpxor %xmm4,%xmm9,%xmm9 - vpxor %xmm4,%xmm10,%xmm10 - vpxor %xmm4,%xmm11,%xmm11 - vpxor %xmm4,%xmm12,%xmm12 - - vmovdqu 0-32(%rcx),%xmm4 - vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2 - vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3 - vpclmulqdq $0x01,%xmm4,%xmm6,%xmm1 - vpclmulqdq $0x10,%xmm4,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - vmovdqu 16(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu -16(%rax),%xmm6 - vmovdqu -16(%rcx),%xmm13 - - vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - - vmovdqu 32(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 0(%rax),%xmm6 - vmovdqu 0(%rcx),%xmm13 - - vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - - vmovdqu 48(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - 
vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 16(%rax),%xmm6 - vmovdqu 16(%rcx),%xmm13 - - vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - - vmovdqu 64(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 32(%rax),%xmm6 - vmovdqu 32(%rcx),%xmm13 - - vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - - vmovdqu 80(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 96(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 112(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - - vmovdqa 80-32(%rax),%xmm6 - vpxor %xmm0,%xmm6,%xmm6 - vmovdqu 80-32(%rcx),%xmm5 - - vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4 - vpxor %xmm4,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - vmovdqu 128(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - 
vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - - vpsrldq $8,%xmm1,%xmm4 - vpxor %xmm4,%xmm2,%xmm5 - vpslldq $8,%xmm1,%xmm4 - vpxor %xmm4,%xmm3,%xmm0 - - vmovdqa poly(%rip),%xmm3 - - vmovdqu 144(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 160(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 176(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 192(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 208(%r8),%xmm4 - vaesenc %xmm4,%xmm7,%xmm7 - vaesenc %xmm4,%xmm8,%xmm8 - vaesenc %xmm4,%xmm9,%xmm9 - vaesenc %xmm4,%xmm10,%xmm10 - vaesenc %xmm4,%xmm11,%xmm11 - vaesenc %xmm4,%xmm12,%xmm12 - - vmovdqu 224(%r8),%xmm6 - vpalignr $8,%xmm0,%xmm0,%xmm2 - vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 - vpxor %xmm0,%xmm2,%xmm0 - - vpxor 0(%rdi),%xmm6,%xmm4 - vaesenclast %xmm4,%xmm7,%xmm7 - vpxor 16(%rdi),%xmm6,%xmm4 - vaesenclast %xmm4,%xmm8,%xmm8 - vpxor 32(%rdi),%xmm6,%xmm4 - vaesenclast %xmm4,%xmm9,%xmm9 - vpxor 48(%rdi),%xmm6,%xmm4 - vaesenclast %xmm4,%xmm10,%xmm10 - vpxor 64(%rdi),%xmm6,%xmm4 - vaesenclast %xmm4,%xmm11,%xmm11 - vpxor 80(%rdi),%xmm6,%xmm4 - vaesenclast %xmm4,%xmm12,%xmm12 - - vpalignr $8,%xmm0,%xmm0,%xmm2 - vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 - vpxor %xmm0,%xmm2,%xmm0 - - vmovdqu %xmm7,0(%rsi) - vmovdqu %xmm8,16(%rsi) - vmovdqu %xmm9,32(%rsi) - vmovdqu %xmm10,48(%rsi) - vmovdqu %xmm11,64(%rsi) - vmovdqu %xmm12,80(%rsi) - - vpxor 
%xmm5,%xmm0,%xmm0 - - leaq 96(%rdi),%rdi - leaq 96(%rsi),%rsi - jmp .L256_dec_loop1 - -.L256_dec_finish_96: - vmovdqa %xmm12,%xmm6 - vmovdqa %xmm11,16-32(%rax) - vmovdqa %xmm10,32-32(%rax) - vmovdqa %xmm9,48-32(%rax) - vmovdqa %xmm8,64-32(%rax) - vmovdqa %xmm7,80-32(%rax) - - vmovdqu 0-32(%rcx),%xmm4 - vpclmulqdq $0x10,%xmm4,%xmm6,%xmm1 - vpclmulqdq $0x11,%xmm4,%xmm6,%xmm2 - vpclmulqdq $0x00,%xmm4,%xmm6,%xmm3 - vpclmulqdq $0x01,%xmm4,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - vmovdqu -16(%rax),%xmm6 - vmovdqu -16(%rcx),%xmm13 - - vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - vmovdqu 0(%rax),%xmm6 - vmovdqu 0(%rcx),%xmm13 - - vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - vmovdqu 16(%rax),%xmm6 - vmovdqu 16(%rcx),%xmm13 - - vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - vmovdqu 32(%rax),%xmm6 - vmovdqu 32(%rcx),%xmm13 - - vpclmulqdq $0x10,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x11,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpclmulqdq $0x01,%xmm13,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - - vmovdqu 80-32(%rax),%xmm6 - vpxor %xmm0,%xmm6,%xmm6 - vmovdqu 80-32(%rcx),%xmm5 - vpclmulqdq $0x11,%xmm5,%xmm6,%xmm4 - vpxor %xmm4,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm5,%xmm6,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - vpclmulqdq $0x10,%xmm5,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - 
vpclmulqdq $0x01,%xmm5,%xmm6,%xmm4 - vpxor %xmm4,%xmm1,%xmm1 - - vpsrldq $8,%xmm1,%xmm4 - vpxor %xmm4,%xmm2,%xmm5 - vpslldq $8,%xmm1,%xmm4 - vpxor %xmm4,%xmm3,%xmm0 - - vmovdqa poly(%rip),%xmm3 - - vpalignr $8,%xmm0,%xmm0,%xmm2 - vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 - vpxor %xmm0,%xmm2,%xmm0 - - vpalignr $8,%xmm0,%xmm0,%xmm2 - vpclmulqdq $0x10,%xmm3,%xmm0,%xmm0 - vpxor %xmm0,%xmm2,%xmm0 - - vpxor %xmm5,%xmm0,%xmm0 - -.L256_dec_loop2: - - - - cmpq $16,%r9 - jb .L256_dec_out - subq $16,%r9 - - vmovdqa %xmm15,%xmm2 - vpaddd one(%rip),%xmm15,%xmm15 - - vpxor 0(%r8),%xmm2,%xmm2 - vaesenc 16(%r8),%xmm2,%xmm2 - vaesenc 32(%r8),%xmm2,%xmm2 - vaesenc 48(%r8),%xmm2,%xmm2 - vaesenc 64(%r8),%xmm2,%xmm2 - vaesenc 80(%r8),%xmm2,%xmm2 - vaesenc 96(%r8),%xmm2,%xmm2 - vaesenc 112(%r8),%xmm2,%xmm2 - vaesenc 128(%r8),%xmm2,%xmm2 - vaesenc 144(%r8),%xmm2,%xmm2 - vaesenc 160(%r8),%xmm2,%xmm2 - vaesenc 176(%r8),%xmm2,%xmm2 - vaesenc 192(%r8),%xmm2,%xmm2 - vaesenc 208(%r8),%xmm2,%xmm2 - vaesenclast 224(%r8),%xmm2,%xmm2 - vpxor (%rdi),%xmm2,%xmm2 - vmovdqu %xmm2,(%rsi) - addq $16,%rdi - addq $16,%rsi - - vpxor %xmm2,%xmm0,%xmm0 - vmovdqa -32(%rcx),%xmm1 - call GFMUL - - jmp .L256_dec_loop2 - -.L256_dec_out: - vmovdqu %xmm0,(%rdx) - .byte 0xf3,0xc3 -.cfi_endproc -.size aes256gcmsiv_dec, .-aes256gcmsiv_dec -.globl aes256gcmsiv_kdf -.hidden aes256gcmsiv_kdf -.type aes256gcmsiv_kdf,@function -.align 16 -aes256gcmsiv_kdf: -.cfi_startproc - - - - - vmovdqa (%rdx),%xmm1 - vmovdqa 0(%rdi),%xmm4 - vmovdqa and_mask(%rip),%xmm11 - vmovdqa one(%rip),%xmm8 - vpshufd $0x90,%xmm4,%xmm4 - vpand %xmm11,%xmm4,%xmm4 - vpaddd %xmm8,%xmm4,%xmm6 - vpaddd %xmm8,%xmm6,%xmm7 - vpaddd %xmm8,%xmm7,%xmm11 - vpaddd %xmm8,%xmm11,%xmm12 - vpaddd %xmm8,%xmm12,%xmm13 - - vpxor %xmm1,%xmm4,%xmm4 - vpxor %xmm1,%xmm6,%xmm6 - vpxor %xmm1,%xmm7,%xmm7 - vpxor %xmm1,%xmm11,%xmm11 - vpxor %xmm1,%xmm12,%xmm12 - vpxor %xmm1,%xmm13,%xmm13 - - vmovdqa 16(%rdx),%xmm1 - vaesenc %xmm1,%xmm4,%xmm4 - vaesenc %xmm1,%xmm6,%xmm6 - vaesenc 
%xmm1,%xmm7,%xmm7 - vaesenc %xmm1,%xmm11,%xmm11 - vaesenc %xmm1,%xmm12,%xmm12 - vaesenc %xmm1,%xmm13,%xmm13 - - vmovdqa 32(%rdx),%xmm2 - vaesenc %xmm2,%xmm4,%xmm4 - vaesenc %xmm2,%xmm6,%xmm6 - vaesenc %xmm2,%xmm7,%xmm7 - vaesenc %xmm2,%xmm11,%xmm11 - vaesenc %xmm2,%xmm12,%xmm12 - vaesenc %xmm2,%xmm13,%xmm13 - - vmovdqa 48(%rdx),%xmm1 - vaesenc %xmm1,%xmm4,%xmm4 - vaesenc %xmm1,%xmm6,%xmm6 - vaesenc %xmm1,%xmm7,%xmm7 - vaesenc %xmm1,%xmm11,%xmm11 - vaesenc %xmm1,%xmm12,%xmm12 - vaesenc %xmm1,%xmm13,%xmm13 - - vmovdqa 64(%rdx),%xmm2 - vaesenc %xmm2,%xmm4,%xmm4 - vaesenc %xmm2,%xmm6,%xmm6 - vaesenc %xmm2,%xmm7,%xmm7 - vaesenc %xmm2,%xmm11,%xmm11 - vaesenc %xmm2,%xmm12,%xmm12 - vaesenc %xmm2,%xmm13,%xmm13 - - vmovdqa 80(%rdx),%xmm1 - vaesenc %xmm1,%xmm4,%xmm4 - vaesenc %xmm1,%xmm6,%xmm6 - vaesenc %xmm1,%xmm7,%xmm7 - vaesenc %xmm1,%xmm11,%xmm11 - vaesenc %xmm1,%xmm12,%xmm12 - vaesenc %xmm1,%xmm13,%xmm13 - - vmovdqa 96(%rdx),%xmm2 - vaesenc %xmm2,%xmm4,%xmm4 - vaesenc %xmm2,%xmm6,%xmm6 - vaesenc %xmm2,%xmm7,%xmm7 - vaesenc %xmm2,%xmm11,%xmm11 - vaesenc %xmm2,%xmm12,%xmm12 - vaesenc %xmm2,%xmm13,%xmm13 - - vmovdqa 112(%rdx),%xmm1 - vaesenc %xmm1,%xmm4,%xmm4 - vaesenc %xmm1,%xmm6,%xmm6 - vaesenc %xmm1,%xmm7,%xmm7 - vaesenc %xmm1,%xmm11,%xmm11 - vaesenc %xmm1,%xmm12,%xmm12 - vaesenc %xmm1,%xmm13,%xmm13 - - vmovdqa 128(%rdx),%xmm2 - vaesenc %xmm2,%xmm4,%xmm4 - vaesenc %xmm2,%xmm6,%xmm6 - vaesenc %xmm2,%xmm7,%xmm7 - vaesenc %xmm2,%xmm11,%xmm11 - vaesenc %xmm2,%xmm12,%xmm12 - vaesenc %xmm2,%xmm13,%xmm13 - - vmovdqa 144(%rdx),%xmm1 - vaesenc %xmm1,%xmm4,%xmm4 - vaesenc %xmm1,%xmm6,%xmm6 - vaesenc %xmm1,%xmm7,%xmm7 - vaesenc %xmm1,%xmm11,%xmm11 - vaesenc %xmm1,%xmm12,%xmm12 - vaesenc %xmm1,%xmm13,%xmm13 - - vmovdqa 160(%rdx),%xmm2 - vaesenc %xmm2,%xmm4,%xmm4 - vaesenc %xmm2,%xmm6,%xmm6 - vaesenc %xmm2,%xmm7,%xmm7 - vaesenc %xmm2,%xmm11,%xmm11 - vaesenc %xmm2,%xmm12,%xmm12 - vaesenc %xmm2,%xmm13,%xmm13 - - vmovdqa 176(%rdx),%xmm1 - vaesenc %xmm1,%xmm4,%xmm4 - vaesenc 
%xmm1,%xmm6,%xmm6 - vaesenc %xmm1,%xmm7,%xmm7 - vaesenc %xmm1,%xmm11,%xmm11 - vaesenc %xmm1,%xmm12,%xmm12 - vaesenc %xmm1,%xmm13,%xmm13 - - vmovdqa 192(%rdx),%xmm2 - vaesenc %xmm2,%xmm4,%xmm4 - vaesenc %xmm2,%xmm6,%xmm6 - vaesenc %xmm2,%xmm7,%xmm7 - vaesenc %xmm2,%xmm11,%xmm11 - vaesenc %xmm2,%xmm12,%xmm12 - vaesenc %xmm2,%xmm13,%xmm13 - - vmovdqa 208(%rdx),%xmm1 - vaesenc %xmm1,%xmm4,%xmm4 - vaesenc %xmm1,%xmm6,%xmm6 - vaesenc %xmm1,%xmm7,%xmm7 - vaesenc %xmm1,%xmm11,%xmm11 - vaesenc %xmm1,%xmm12,%xmm12 - vaesenc %xmm1,%xmm13,%xmm13 - - vmovdqa 224(%rdx),%xmm2 - vaesenclast %xmm2,%xmm4,%xmm4 - vaesenclast %xmm2,%xmm6,%xmm6 - vaesenclast %xmm2,%xmm7,%xmm7 - vaesenclast %xmm2,%xmm11,%xmm11 - vaesenclast %xmm2,%xmm12,%xmm12 - vaesenclast %xmm2,%xmm13,%xmm13 - - - vmovdqa %xmm4,0(%rsi) - vmovdqa %xmm6,16(%rsi) - vmovdqa %xmm7,32(%rsi) - vmovdqa %xmm11,48(%rsi) - vmovdqa %xmm12,64(%rsi) - vmovdqa %xmm13,80(%rsi) - .byte 0xf3,0xc3 -.cfi_endproc -.size aes256gcmsiv_kdf, .-aes256gcmsiv_kdf -#endif -.section .note.GNU-stack,"",@progbits diff --git a/third_party/boringssl/linux-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S b/third_party/boringssl/linux-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S deleted file mode 100644 index 2f3f641a..00000000 --- a/third_party/boringssl/linux-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S +++ /dev/null @@ -1,8922 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text -.extern OPENSSL_ia32cap_P -.hidden OPENSSL_ia32cap_P - -chacha20_poly1305_constants: - -.align 64 -.Lchacha20_consts: -.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' -.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k' -.Lrol8: -.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 -.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14 -.Lrol16: -.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 -.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13 -.Lavx2_init: -.long 0,0,0,0 -.Lsse_inc: -.long 1,0,0,0 -.Lavx2_inc: -.long 2,0,0,0,2,0,0,0 -.Lclamp: -.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC -.quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF -.align 16 -.Land_masks: -.byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00 -.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00 -.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00 -.byte 
0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00 -.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00 -.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00 -.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff - -.type poly_hash_ad_internal,@function -.align 64 -poly_hash_ad_internal: -.cfi_startproc -.cfi_def_cfa rsp, 8 - xorq %r10,%r10 - xorq %r11,%r11 - xorq %r12,%r12 - cmpq $13,%r8 - jne .Lhash_ad_loop -.Lpoly_fast_tls_ad: - - movq (%rcx),%r10 - movq 5(%rcx),%r11 - shrq $24,%r11 - movq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - .byte 0xf3,0xc3 -.Lhash_ad_loop: - - cmpq $16,%r8 - jb .Lhash_ad_tail - addq 0+0(%rcx),%r10 - adcq 8+0(%rcx),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 
- adcq $0,%r12 - - leaq 16(%rcx),%rcx - subq $16,%r8 - jmp .Lhash_ad_loop -.Lhash_ad_tail: - cmpq $0,%r8 - je .Lhash_ad_done - - xorq %r13,%r13 - xorq %r14,%r14 - xorq %r15,%r15 - addq %r8,%rcx -.Lhash_ad_tail_loop: - shldq $8,%r13,%r14 - shlq $8,%r13 - movzbq -1(%rcx),%r15 - xorq %r15,%r13 - decq %rcx - decq %r8 - jne .Lhash_ad_tail_loop - - addq %r13,%r10 - adcq %r14,%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - -.Lhash_ad_done: - .byte 0xf3,0xc3 -.cfi_endproc -.size poly_hash_ad_internal, .-poly_hash_ad_internal - -.globl chacha20_poly1305_open -.hidden chacha20_poly1305_open -.type chacha20_poly1305_open,@function -.align 64 -chacha20_poly1305_open: -.cfi_startproc - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - - - pushq %r9 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r9,-64 - subq $288 + 0 + 32,%rsp -.cfi_adjust_cfa_offset 288 + 32 - - leaq 32(%rsp),%rbp - andq $-32,%rbp - - movq %rdx,%rbx - movq %r8,0+0+32(%rbp) - movq %rbx,8+0+32(%rbp) - - movl OPENSSL_ia32cap_P+8(%rip),%eax - andl $288,%eax - xorl $288,%eax - jz chacha20_poly1305_open_avx2 - - cmpq 
$128,%rbx - jbe .Lopen_sse_128 - - movdqa .Lchacha20_consts(%rip),%xmm0 - movdqu 0(%r9),%xmm4 - movdqu 16(%r9),%xmm8 - movdqu 32(%r9),%xmm12 - - movdqa %xmm12,%xmm7 - - movdqa %xmm4,0+48(%rbp) - movdqa %xmm8,0+64(%rbp) - movdqa %xmm12,0+96(%rbp) - movq $10,%r10 -.Lopen_sse_init_rounds: - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb .Lrol16(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm4 - pxor %xmm3,%xmm4 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb .Lrol8(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm4 - pxor %xmm3,%xmm4 -.byte 102,15,58,15,228,4 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,12 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb .Lrol16(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm4 - pxor %xmm3,%xmm4 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb .Lrol8(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm4 - pxor %xmm3,%xmm4 -.byte 102,15,58,15,228,12 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,4 - - decq %r10 - jne .Lopen_sse_init_rounds - - paddd .Lchacha20_consts(%rip),%xmm0 - paddd 0+48(%rbp),%xmm4 - - pand .Lclamp(%rip),%xmm0 - movdqa %xmm0,0+0(%rbp) - movdqa %xmm4,0+16(%rbp) - - movq %r8,%r8 - call poly_hash_ad_internal -.Lopen_sse_main_loop: - cmpq $256,%rbx - jb .Lopen_sse_tail - - movdqa .Lchacha20_consts(%rip),%xmm0 - movdqa 0+48(%rbp),%xmm4 - movdqa 0+64(%rbp),%xmm8 - movdqa %xmm0,%xmm1 - movdqa %xmm4,%xmm5 - movdqa %xmm8,%xmm9 - movdqa %xmm0,%xmm2 - movdqa %xmm4,%xmm6 - movdqa %xmm8,%xmm10 - movdqa %xmm0,%xmm3 - movdqa %xmm4,%xmm7 - movdqa %xmm8,%xmm11 - movdqa 0+96(%rbp),%xmm15 - paddd .Lsse_inc(%rip),%xmm15 - movdqa %xmm15,%xmm14 - paddd .Lsse_inc(%rip),%xmm14 - movdqa %xmm14,%xmm13 - paddd .Lsse_inc(%rip),%xmm13 - movdqa %xmm13,%xmm12 - paddd .Lsse_inc(%rip),%xmm12 - 
movdqa %xmm12,0+96(%rbp) - movdqa %xmm13,0+112(%rbp) - movdqa %xmm14,0+128(%rbp) - movdqa %xmm15,0+144(%rbp) - - - - movq $4,%rcx - movq %rsi,%r8 -.Lopen_sse_main_loop_rounds: - movdqa %xmm8,0+80(%rbp) - movdqa .Lrol16(%rip),%xmm8 - paddd %xmm7,%xmm3 - paddd %xmm6,%xmm2 - paddd %xmm5,%xmm1 - paddd %xmm4,%xmm0 - pxor %xmm3,%xmm15 - pxor %xmm2,%xmm14 - pxor %xmm1,%xmm13 - pxor %xmm0,%xmm12 -.byte 102,69,15,56,0,248 -.byte 102,69,15,56,0,240 -.byte 102,69,15,56,0,232 -.byte 102,69,15,56,0,224 - movdqa 0+80(%rbp),%xmm8 - paddd %xmm15,%xmm11 - paddd %xmm14,%xmm10 - paddd %xmm13,%xmm9 - paddd %xmm12,%xmm8 - pxor %xmm11,%xmm7 - addq 0+0(%r8),%r10 - adcq 8+0(%r8),%r11 - adcq $1,%r12 - - leaq 16(%r8),%r8 - pxor %xmm10,%xmm6 - pxor %xmm9,%xmm5 - pxor %xmm8,%xmm4 - movdqa %xmm8,0+80(%rbp) - movdqa %xmm7,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm7 - pxor %xmm8,%xmm7 - movdqa %xmm6,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm6 - pxor %xmm8,%xmm6 - movdqa %xmm5,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm5 - pxor %xmm8,%xmm5 - movdqa %xmm4,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm4 - pxor %xmm8,%xmm4 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movdqa .Lrol8(%rip),%xmm8 - paddd %xmm7,%xmm3 - paddd %xmm6,%xmm2 - paddd %xmm5,%xmm1 - paddd %xmm4,%xmm0 - pxor %xmm3,%xmm15 - pxor %xmm2,%xmm14 - pxor %xmm1,%xmm13 - pxor %xmm0,%xmm12 -.byte 102,69,15,56,0,248 -.byte 102,69,15,56,0,240 -.byte 102,69,15,56,0,232 -.byte 102,69,15,56,0,224 - movdqa 0+80(%rbp),%xmm8 - paddd %xmm15,%xmm11 - paddd %xmm14,%xmm10 - paddd %xmm13,%xmm9 - paddd %xmm12,%xmm8 - pxor %xmm11,%xmm7 - pxor %xmm10,%xmm6 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - pxor %xmm9,%xmm5 - pxor %xmm8,%xmm4 - movdqa %xmm8,0+80(%rbp) - movdqa %xmm7,%xmm8 - psrld $25,%xmm8 - 
pslld $32-25,%xmm7 - pxor %xmm8,%xmm7 - movdqa %xmm6,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm6 - pxor %xmm8,%xmm6 - movdqa %xmm5,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm5 - pxor %xmm8,%xmm5 - movdqa %xmm4,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm4 - pxor %xmm8,%xmm4 - movdqa 0+80(%rbp),%xmm8 - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 -.byte 102,15,58,15,255,4 -.byte 102,69,15,58,15,219,8 -.byte 102,69,15,58,15,255,12 -.byte 102,15,58,15,246,4 -.byte 102,69,15,58,15,210,8 -.byte 102,69,15,58,15,246,12 -.byte 102,15,58,15,237,4 -.byte 102,69,15,58,15,201,8 -.byte 102,69,15,58,15,237,12 -.byte 102,15,58,15,228,4 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,12 - movdqa %xmm8,0+80(%rbp) - movdqa .Lrol16(%rip),%xmm8 - paddd %xmm7,%xmm3 - paddd %xmm6,%xmm2 - paddd %xmm5,%xmm1 - paddd %xmm4,%xmm0 - pxor %xmm3,%xmm15 - pxor %xmm2,%xmm14 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - pxor %xmm1,%xmm13 - pxor %xmm0,%xmm12 -.byte 102,69,15,56,0,248 -.byte 102,69,15,56,0,240 -.byte 102,69,15,56,0,232 -.byte 102,69,15,56,0,224 - movdqa 0+80(%rbp),%xmm8 - paddd %xmm15,%xmm11 - paddd %xmm14,%xmm10 - paddd %xmm13,%xmm9 - paddd %xmm12,%xmm8 - pxor %xmm11,%xmm7 - pxor %xmm10,%xmm6 - pxor %xmm9,%xmm5 - pxor %xmm8,%xmm4 - movdqa %xmm8,0+80(%rbp) - movdqa %xmm7,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm7 - pxor %xmm8,%xmm7 - movdqa %xmm6,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm6 - pxor %xmm8,%xmm6 - movdqa %xmm5,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm5 - pxor %xmm8,%xmm5 - movdqa %xmm4,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm4 - pxor %xmm8,%xmm4 - movdqa .Lrol8(%rip),%xmm8 - paddd %xmm7,%xmm3 - paddd %xmm6,%xmm2 - paddd %xmm5,%xmm1 - paddd %xmm4,%xmm0 - pxor %xmm3,%xmm15 - pxor %xmm2,%xmm14 - pxor %xmm1,%xmm13 - pxor %xmm0,%xmm12 -.byte 102,69,15,56,0,248 -.byte 
102,69,15,56,0,240 -.byte 102,69,15,56,0,232 -.byte 102,69,15,56,0,224 - movdqa 0+80(%rbp),%xmm8 - paddd %xmm15,%xmm11 - paddd %xmm14,%xmm10 - paddd %xmm13,%xmm9 - paddd %xmm12,%xmm8 - pxor %xmm11,%xmm7 - pxor %xmm10,%xmm6 - pxor %xmm9,%xmm5 - pxor %xmm8,%xmm4 - movdqa %xmm8,0+80(%rbp) - movdqa %xmm7,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm7 - pxor %xmm8,%xmm7 - movdqa %xmm6,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm6 - pxor %xmm8,%xmm6 - movdqa %xmm5,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm5 - pxor %xmm8,%xmm5 - movdqa %xmm4,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm4 - pxor %xmm8,%xmm4 - movdqa 0+80(%rbp),%xmm8 -.byte 102,15,58,15,255,12 -.byte 102,69,15,58,15,219,8 -.byte 102,69,15,58,15,255,4 -.byte 102,15,58,15,246,12 -.byte 102,69,15,58,15,210,8 -.byte 102,69,15,58,15,246,4 -.byte 102,15,58,15,237,12 -.byte 102,69,15,58,15,201,8 -.byte 102,69,15,58,15,237,4 -.byte 102,15,58,15,228,12 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,4 - - decq %rcx - jge .Lopen_sse_main_loop_rounds - addq 0+0(%r8),%r10 - adcq 8+0(%r8),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 16(%r8),%r8 - cmpq $-6,%rcx - jg .Lopen_sse_main_loop_rounds - paddd .Lchacha20_consts(%rip),%xmm3 - paddd 0+48(%rbp),%xmm7 - paddd 0+64(%rbp),%xmm11 - paddd 0+144(%rbp),%xmm15 - paddd .Lchacha20_consts(%rip),%xmm2 - paddd 0+48(%rbp),%xmm6 - paddd 0+64(%rbp),%xmm10 - paddd 0+128(%rbp),%xmm14 
- paddd .Lchacha20_consts(%rip),%xmm1 - paddd 0+48(%rbp),%xmm5 - paddd 0+64(%rbp),%xmm9 - paddd 0+112(%rbp),%xmm13 - paddd .Lchacha20_consts(%rip),%xmm0 - paddd 0+48(%rbp),%xmm4 - paddd 0+64(%rbp),%xmm8 - paddd 0+96(%rbp),%xmm12 - movdqa %xmm12,0+80(%rbp) - movdqu 0 + 0(%rsi),%xmm12 - pxor %xmm3,%xmm12 - movdqu %xmm12,0 + 0(%rdi) - movdqu 16 + 0(%rsi),%xmm12 - pxor %xmm7,%xmm12 - movdqu %xmm12,16 + 0(%rdi) - movdqu 32 + 0(%rsi),%xmm12 - pxor %xmm11,%xmm12 - movdqu %xmm12,32 + 0(%rdi) - movdqu 48 + 0(%rsi),%xmm12 - pxor %xmm15,%xmm12 - movdqu %xmm12,48 + 0(%rdi) - movdqu 0 + 64(%rsi),%xmm3 - movdqu 16 + 64(%rsi),%xmm7 - movdqu 32 + 64(%rsi),%xmm11 - movdqu 48 + 64(%rsi),%xmm15 - pxor %xmm3,%xmm2 - pxor %xmm7,%xmm6 - pxor %xmm11,%xmm10 - pxor %xmm14,%xmm15 - movdqu %xmm2,0 + 64(%rdi) - movdqu %xmm6,16 + 64(%rdi) - movdqu %xmm10,32 + 64(%rdi) - movdqu %xmm15,48 + 64(%rdi) - movdqu 0 + 128(%rsi),%xmm3 - movdqu 16 + 128(%rsi),%xmm7 - movdqu 32 + 128(%rsi),%xmm11 - movdqu 48 + 128(%rsi),%xmm15 - pxor %xmm3,%xmm1 - pxor %xmm7,%xmm5 - pxor %xmm11,%xmm9 - pxor %xmm13,%xmm15 - movdqu %xmm1,0 + 128(%rdi) - movdqu %xmm5,16 + 128(%rdi) - movdqu %xmm9,32 + 128(%rdi) - movdqu %xmm15,48 + 128(%rdi) - movdqu 0 + 192(%rsi),%xmm3 - movdqu 16 + 192(%rsi),%xmm7 - movdqu 32 + 192(%rsi),%xmm11 - movdqu 48 + 192(%rsi),%xmm15 - pxor %xmm3,%xmm0 - pxor %xmm7,%xmm4 - pxor %xmm11,%xmm8 - pxor 0+80(%rbp),%xmm15 - movdqu %xmm0,0 + 192(%rdi) - movdqu %xmm4,16 + 192(%rdi) - movdqu %xmm8,32 + 192(%rdi) - movdqu %xmm15,48 + 192(%rdi) - - leaq 256(%rsi),%rsi - leaq 256(%rdi),%rdi - subq $256,%rbx - jmp .Lopen_sse_main_loop -.Lopen_sse_tail: - - testq %rbx,%rbx - jz .Lopen_sse_finalize - cmpq $192,%rbx - ja .Lopen_sse_tail_256 - cmpq $128,%rbx - ja .Lopen_sse_tail_192 - cmpq $64,%rbx - ja .Lopen_sse_tail_128 - movdqa .Lchacha20_consts(%rip),%xmm0 - movdqa 0+48(%rbp),%xmm4 - movdqa 0+64(%rbp),%xmm8 - movdqa 0+96(%rbp),%xmm12 - paddd .Lsse_inc(%rip),%xmm12 - movdqa %xmm12,0+96(%rbp) - - xorq %r8,%r8 - 
movq %rbx,%rcx - cmpq $16,%rcx - jb .Lopen_sse_tail_64_rounds -.Lopen_sse_tail_64_rounds_and_x1hash: - addq 0+0(%rsi,%r8,1),%r10 - adcq 8+0(%rsi,%r8,1),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - subq $16,%rcx -.Lopen_sse_tail_64_rounds: - addq $16,%r8 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb .Lrol16(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm4 - pxor %xmm3,%xmm4 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb .Lrol8(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm4 - pxor %xmm3,%xmm4 -.byte 102,15,58,15,228,4 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,12 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb .Lrol16(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm4 - pxor %xmm3,%xmm4 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb .Lrol8(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm4 - pxor %xmm3,%xmm4 -.byte 102,15,58,15,228,12 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,4 - - cmpq $16,%rcx - jae .Lopen_sse_tail_64_rounds_and_x1hash - cmpq $160,%r8 - jne .Lopen_sse_tail_64_rounds - paddd .Lchacha20_consts(%rip),%xmm0 - paddd 0+48(%rbp),%xmm4 - paddd 0+64(%rbp),%xmm8 - paddd 0+96(%rbp),%xmm12 - - jmp 
.Lopen_sse_tail_64_dec_loop - -.Lopen_sse_tail_128: - movdqa .Lchacha20_consts(%rip),%xmm0 - movdqa 0+48(%rbp),%xmm4 - movdqa 0+64(%rbp),%xmm8 - movdqa %xmm0,%xmm1 - movdqa %xmm4,%xmm5 - movdqa %xmm8,%xmm9 - movdqa 0+96(%rbp),%xmm13 - paddd .Lsse_inc(%rip),%xmm13 - movdqa %xmm13,%xmm12 - paddd .Lsse_inc(%rip),%xmm12 - movdqa %xmm12,0+96(%rbp) - movdqa %xmm13,0+112(%rbp) - - movq %rbx,%rcx - andq $-16,%rcx - xorq %r8,%r8 -.Lopen_sse_tail_128_rounds_and_x1hash: - addq 0+0(%rsi,%r8,1),%r10 - adcq 8+0(%rsi,%r8,1),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - -.Lopen_sse_tail_128_rounds: - addq $16,%r8 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb .Lrol16(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm4 - pxor %xmm3,%xmm4 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb .Lrol8(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm4 - pxor %xmm3,%xmm4 -.byte 102,15,58,15,228,4 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,12 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb .Lrol16(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm5 - pxor %xmm3,%xmm5 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb .Lrol8(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $7,%xmm3 - 
psrld $25,%xmm5 - pxor %xmm3,%xmm5 -.byte 102,15,58,15,237,4 -.byte 102,69,15,58,15,201,8 -.byte 102,69,15,58,15,237,12 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb .Lrol16(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm4 - pxor %xmm3,%xmm4 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb .Lrol8(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm4 - pxor %xmm3,%xmm4 -.byte 102,15,58,15,228,12 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,4 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb .Lrol16(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm5 - pxor %xmm3,%xmm5 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb .Lrol8(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm5 - pxor %xmm3,%xmm5 -.byte 102,15,58,15,237,12 -.byte 102,69,15,58,15,201,8 -.byte 102,69,15,58,15,237,4 - - cmpq %rcx,%r8 - jb .Lopen_sse_tail_128_rounds_and_x1hash - cmpq $160,%r8 - jne .Lopen_sse_tail_128_rounds - paddd .Lchacha20_consts(%rip),%xmm1 - paddd 0+48(%rbp),%xmm5 - paddd 0+64(%rbp),%xmm9 - paddd 0+112(%rbp),%xmm13 - paddd .Lchacha20_consts(%rip),%xmm0 - paddd 0+48(%rbp),%xmm4 - paddd 0+64(%rbp),%xmm8 - paddd 0+96(%rbp),%xmm12 - movdqu 0 + 0(%rsi),%xmm3 - movdqu 16 + 0(%rsi),%xmm7 - movdqu 32 + 0(%rsi),%xmm11 - movdqu 48 + 0(%rsi),%xmm15 - pxor %xmm3,%xmm1 - pxor %xmm7,%xmm5 - pxor %xmm11,%xmm9 - pxor %xmm13,%xmm15 - movdqu %xmm1,0 + 0(%rdi) - movdqu %xmm5,16 + 0(%rdi) - movdqu %xmm9,32 + 0(%rdi) - movdqu %xmm15,48 + 0(%rdi) - - subq $64,%rbx - leaq 64(%rsi),%rsi - leaq 64(%rdi),%rdi - jmp .Lopen_sse_tail_64_dec_loop - -.Lopen_sse_tail_192: - movdqa .Lchacha20_consts(%rip),%xmm0 - movdqa 0+48(%rbp),%xmm4 - movdqa 0+64(%rbp),%xmm8 - movdqa %xmm0,%xmm1 - movdqa %xmm4,%xmm5 - movdqa %xmm8,%xmm9 - movdqa %xmm0,%xmm2 - movdqa %xmm4,%xmm6 - movdqa 
%xmm8,%xmm10 - movdqa 0+96(%rbp),%xmm14 - paddd .Lsse_inc(%rip),%xmm14 - movdqa %xmm14,%xmm13 - paddd .Lsse_inc(%rip),%xmm13 - movdqa %xmm13,%xmm12 - paddd .Lsse_inc(%rip),%xmm12 - movdqa %xmm12,0+96(%rbp) - movdqa %xmm13,0+112(%rbp) - movdqa %xmm14,0+128(%rbp) - - movq %rbx,%rcx - movq $160,%r8 - cmpq $160,%rcx - cmovgq %r8,%rcx - andq $-16,%rcx - xorq %r8,%r8 -.Lopen_sse_tail_192_rounds_and_x1hash: - addq 0+0(%rsi,%r8,1),%r10 - adcq 8+0(%rsi,%r8,1),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - -.Lopen_sse_tail_192_rounds: - addq $16,%r8 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb .Lrol16(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm4 - pxor %xmm3,%xmm4 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb .Lrol8(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm4 - pxor %xmm3,%xmm4 -.byte 102,15,58,15,228,4 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,12 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb .Lrol16(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm5 - pxor %xmm3,%xmm5 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb .Lrol8(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm5 - pxor %xmm3,%xmm5 -.byte 102,15,58,15,237,4 
-.byte 102,69,15,58,15,201,8 -.byte 102,69,15,58,15,237,12 - paddd %xmm6,%xmm2 - pxor %xmm2,%xmm14 - pshufb .Lrol16(%rip),%xmm14 - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm6 - pxor %xmm3,%xmm6 - paddd %xmm6,%xmm2 - pxor %xmm2,%xmm14 - pshufb .Lrol8(%rip),%xmm14 - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm6 - pxor %xmm3,%xmm6 -.byte 102,15,58,15,246,4 -.byte 102,69,15,58,15,210,8 -.byte 102,69,15,58,15,246,12 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb .Lrol16(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm4 - pxor %xmm3,%xmm4 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb .Lrol8(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm4 - pxor %xmm3,%xmm4 -.byte 102,15,58,15,228,12 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,4 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb .Lrol16(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm5 - pxor %xmm3,%xmm5 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb .Lrol8(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm5 - pxor %xmm3,%xmm5 -.byte 102,15,58,15,237,12 -.byte 102,69,15,58,15,201,8 -.byte 102,69,15,58,15,237,4 - paddd %xmm6,%xmm2 - pxor %xmm2,%xmm14 - pshufb .Lrol16(%rip),%xmm14 - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm6 - pxor %xmm3,%xmm6 - paddd %xmm6,%xmm2 - pxor %xmm2,%xmm14 - pshufb .Lrol8(%rip),%xmm14 - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm6 - pxor %xmm3,%xmm6 -.byte 102,15,58,15,246,12 -.byte 102,69,15,58,15,210,8 -.byte 102,69,15,58,15,246,4 - - cmpq %rcx,%r8 - jb .Lopen_sse_tail_192_rounds_and_x1hash - cmpq $160,%r8 - jne .Lopen_sse_tail_192_rounds - 
cmpq $176,%rbx - jb .Lopen_sse_tail_192_finish - addq 0+160(%rsi),%r10 - adcq 8+160(%rsi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - cmpq $192,%rbx - jb .Lopen_sse_tail_192_finish - addq 0+176(%rsi),%r10 - adcq 8+176(%rsi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - -.Lopen_sse_tail_192_finish: - paddd .Lchacha20_consts(%rip),%xmm2 - paddd 0+48(%rbp),%xmm6 - paddd 0+64(%rbp),%xmm10 - paddd 0+128(%rbp),%xmm14 - paddd .Lchacha20_consts(%rip),%xmm1 - paddd 0+48(%rbp),%xmm5 - paddd 0+64(%rbp),%xmm9 - paddd 0+112(%rbp),%xmm13 - paddd .Lchacha20_consts(%rip),%xmm0 - paddd 0+48(%rbp),%xmm4 - paddd 0+64(%rbp),%xmm8 - paddd 0+96(%rbp),%xmm12 - movdqu 0 + 0(%rsi),%xmm3 - movdqu 16 + 0(%rsi),%xmm7 - movdqu 32 + 0(%rsi),%xmm11 - movdqu 48 + 0(%rsi),%xmm15 - pxor %xmm3,%xmm2 - pxor %xmm7,%xmm6 
- pxor %xmm11,%xmm10 - pxor %xmm14,%xmm15 - movdqu %xmm2,0 + 0(%rdi) - movdqu %xmm6,16 + 0(%rdi) - movdqu %xmm10,32 + 0(%rdi) - movdqu %xmm15,48 + 0(%rdi) - movdqu 0 + 64(%rsi),%xmm3 - movdqu 16 + 64(%rsi),%xmm7 - movdqu 32 + 64(%rsi),%xmm11 - movdqu 48 + 64(%rsi),%xmm15 - pxor %xmm3,%xmm1 - pxor %xmm7,%xmm5 - pxor %xmm11,%xmm9 - pxor %xmm13,%xmm15 - movdqu %xmm1,0 + 64(%rdi) - movdqu %xmm5,16 + 64(%rdi) - movdqu %xmm9,32 + 64(%rdi) - movdqu %xmm15,48 + 64(%rdi) - - subq $128,%rbx - leaq 128(%rsi),%rsi - leaq 128(%rdi),%rdi - jmp .Lopen_sse_tail_64_dec_loop - -.Lopen_sse_tail_256: - movdqa .Lchacha20_consts(%rip),%xmm0 - movdqa 0+48(%rbp),%xmm4 - movdqa 0+64(%rbp),%xmm8 - movdqa %xmm0,%xmm1 - movdqa %xmm4,%xmm5 - movdqa %xmm8,%xmm9 - movdqa %xmm0,%xmm2 - movdqa %xmm4,%xmm6 - movdqa %xmm8,%xmm10 - movdqa %xmm0,%xmm3 - movdqa %xmm4,%xmm7 - movdqa %xmm8,%xmm11 - movdqa 0+96(%rbp),%xmm15 - paddd .Lsse_inc(%rip),%xmm15 - movdqa %xmm15,%xmm14 - paddd .Lsse_inc(%rip),%xmm14 - movdqa %xmm14,%xmm13 - paddd .Lsse_inc(%rip),%xmm13 - movdqa %xmm13,%xmm12 - paddd .Lsse_inc(%rip),%xmm12 - movdqa %xmm12,0+96(%rbp) - movdqa %xmm13,0+112(%rbp) - movdqa %xmm14,0+128(%rbp) - movdqa %xmm15,0+144(%rbp) - - xorq %r8,%r8 -.Lopen_sse_tail_256_rounds_and_x1hash: - addq 0+0(%rsi,%r8,1),%r10 - adcq 8+0(%rsi,%r8,1),%r11 - adcq $1,%r12 - movdqa %xmm11,0+80(%rbp) - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb .Lrol16(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm11 - pslld $12,%xmm11 - psrld $20,%xmm4 - pxor %xmm11,%xmm4 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb .Lrol8(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm11 - pslld $7,%xmm11 - psrld $25,%xmm4 - pxor %xmm11,%xmm4 -.byte 102,15,58,15,228,4 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,12 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb .Lrol16(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm11 - pslld $12,%xmm11 - psrld $20,%xmm5 - pxor 
%xmm11,%xmm5 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb .Lrol8(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm11 - pslld $7,%xmm11 - psrld $25,%xmm5 - pxor %xmm11,%xmm5 -.byte 102,15,58,15,237,4 -.byte 102,69,15,58,15,201,8 -.byte 102,69,15,58,15,237,12 - paddd %xmm6,%xmm2 - pxor %xmm2,%xmm14 - pshufb .Lrol16(%rip),%xmm14 - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm11 - pslld $12,%xmm11 - psrld $20,%xmm6 - pxor %xmm11,%xmm6 - paddd %xmm6,%xmm2 - pxor %xmm2,%xmm14 - pshufb .Lrol8(%rip),%xmm14 - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm11 - pslld $7,%xmm11 - psrld $25,%xmm6 - pxor %xmm11,%xmm6 -.byte 102,15,58,15,246,4 -.byte 102,69,15,58,15,210,8 -.byte 102,69,15,58,15,246,12 - movdqa 0+80(%rbp),%xmm11 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movdqa %xmm9,0+80(%rbp) - paddd %xmm7,%xmm3 - pxor %xmm3,%xmm15 - pshufb .Lrol16(%rip),%xmm15 - paddd %xmm15,%xmm11 - pxor %xmm11,%xmm7 - movdqa %xmm7,%xmm9 - pslld $12,%xmm9 - psrld $20,%xmm7 - pxor %xmm9,%xmm7 - paddd %xmm7,%xmm3 - pxor %xmm3,%xmm15 - pshufb .Lrol8(%rip),%xmm15 - paddd %xmm15,%xmm11 - pxor %xmm11,%xmm7 - movdqa %xmm7,%xmm9 - pslld $7,%xmm9 - psrld $25,%xmm7 - pxor %xmm9,%xmm7 -.byte 102,15,58,15,255,4 -.byte 102,69,15,58,15,219,8 -.byte 102,69,15,58,15,255,12 - movdqa 0+80(%rbp),%xmm9 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - movdqa %xmm11,0+80(%rbp) - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb .Lrol16(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm11 - pslld $12,%xmm11 - psrld $20,%xmm4 - pxor %xmm11,%xmm4 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb .Lrol8(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm11 - pslld $7,%xmm11 - 
psrld $25,%xmm4 - pxor %xmm11,%xmm4 -.byte 102,15,58,15,228,12 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,4 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb .Lrol16(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm11 - pslld $12,%xmm11 - psrld $20,%xmm5 - pxor %xmm11,%xmm5 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb .Lrol8(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm11 - pslld $7,%xmm11 - psrld $25,%xmm5 - pxor %xmm11,%xmm5 -.byte 102,15,58,15,237,12 -.byte 102,69,15,58,15,201,8 -.byte 102,69,15,58,15,237,4 - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - paddd %xmm6,%xmm2 - pxor %xmm2,%xmm14 - pshufb .Lrol16(%rip),%xmm14 - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm11 - pslld $12,%xmm11 - psrld $20,%xmm6 - pxor %xmm11,%xmm6 - paddd %xmm6,%xmm2 - pxor %xmm2,%xmm14 - pshufb .Lrol8(%rip),%xmm14 - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm11 - pslld $7,%xmm11 - psrld $25,%xmm6 - pxor %xmm11,%xmm6 -.byte 102,15,58,15,246,12 -.byte 102,69,15,58,15,210,8 -.byte 102,69,15,58,15,246,4 - movdqa 0+80(%rbp),%xmm11 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - movdqa %xmm9,0+80(%rbp) - paddd %xmm7,%xmm3 - pxor %xmm3,%xmm15 - pshufb .Lrol16(%rip),%xmm15 - paddd %xmm15,%xmm11 - pxor %xmm11,%xmm7 - movdqa %xmm7,%xmm9 - pslld $12,%xmm9 - psrld $20,%xmm7 - pxor %xmm9,%xmm7 - paddd %xmm7,%xmm3 - pxor %xmm3,%xmm15 - pshufb .Lrol8(%rip),%xmm15 - paddd %xmm15,%xmm11 - pxor %xmm11,%xmm7 - movdqa %xmm7,%xmm9 - pslld $7,%xmm9 - psrld $25,%xmm7 - pxor %xmm9,%xmm7 -.byte 102,15,58,15,255,12 -.byte 102,69,15,58,15,219,8 -.byte 102,69,15,58,15,255,4 - movdqa 0+80(%rbp),%xmm9 - - addq $16,%r8 - cmpq $160,%r8 - jb .Lopen_sse_tail_256_rounds_and_x1hash - - movq %rbx,%rcx - andq $-16,%rcx 
-.Lopen_sse_tail_256_hash: - addq 0+0(%rsi,%r8,1),%r10 - adcq 8+0(%rsi,%r8,1),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - addq $16,%r8 - cmpq %rcx,%r8 - jb .Lopen_sse_tail_256_hash - paddd .Lchacha20_consts(%rip),%xmm3 - paddd 0+48(%rbp),%xmm7 - paddd 0+64(%rbp),%xmm11 - paddd 0+144(%rbp),%xmm15 - paddd .Lchacha20_consts(%rip),%xmm2 - paddd 0+48(%rbp),%xmm6 - paddd 0+64(%rbp),%xmm10 - paddd 0+128(%rbp),%xmm14 - paddd .Lchacha20_consts(%rip),%xmm1 - paddd 0+48(%rbp),%xmm5 - paddd 0+64(%rbp),%xmm9 - paddd 0+112(%rbp),%xmm13 - paddd .Lchacha20_consts(%rip),%xmm0 - paddd 0+48(%rbp),%xmm4 - paddd 0+64(%rbp),%xmm8 - paddd 0+96(%rbp),%xmm12 - movdqa %xmm12,0+80(%rbp) - movdqu 0 + 0(%rsi),%xmm12 - pxor %xmm3,%xmm12 - movdqu %xmm12,0 + 0(%rdi) - movdqu 16 + 0(%rsi),%xmm12 - pxor %xmm7,%xmm12 - movdqu %xmm12,16 + 0(%rdi) - movdqu 32 + 0(%rsi),%xmm12 - pxor %xmm11,%xmm12 - movdqu %xmm12,32 + 0(%rdi) - movdqu 48 + 0(%rsi),%xmm12 - pxor %xmm15,%xmm12 - movdqu %xmm12,48 + 0(%rdi) - movdqu 0 + 64(%rsi),%xmm3 - movdqu 16 + 64(%rsi),%xmm7 - movdqu 32 + 64(%rsi),%xmm11 - movdqu 48 + 64(%rsi),%xmm15 - pxor %xmm3,%xmm2 - pxor %xmm7,%xmm6 - pxor %xmm11,%xmm10 - pxor %xmm14,%xmm15 - movdqu %xmm2,0 + 64(%rdi) - movdqu %xmm6,16 + 64(%rdi) - movdqu %xmm10,32 + 64(%rdi) - movdqu %xmm15,48 + 64(%rdi) - movdqu 0 + 128(%rsi),%xmm3 - movdqu 16 + 128(%rsi),%xmm7 - movdqu 32 + 128(%rsi),%xmm11 - movdqu 
48 + 128(%rsi),%xmm15 - pxor %xmm3,%xmm1 - pxor %xmm7,%xmm5 - pxor %xmm11,%xmm9 - pxor %xmm13,%xmm15 - movdqu %xmm1,0 + 128(%rdi) - movdqu %xmm5,16 + 128(%rdi) - movdqu %xmm9,32 + 128(%rdi) - movdqu %xmm15,48 + 128(%rdi) - - movdqa 0+80(%rbp),%xmm12 - subq $192,%rbx - leaq 192(%rsi),%rsi - leaq 192(%rdi),%rdi - - -.Lopen_sse_tail_64_dec_loop: - cmpq $16,%rbx - jb .Lopen_sse_tail_16_init - subq $16,%rbx - movdqu (%rsi),%xmm3 - pxor %xmm3,%xmm0 - movdqu %xmm0,(%rdi) - leaq 16(%rsi),%rsi - leaq 16(%rdi),%rdi - movdqa %xmm4,%xmm0 - movdqa %xmm8,%xmm4 - movdqa %xmm12,%xmm8 - jmp .Lopen_sse_tail_64_dec_loop -.Lopen_sse_tail_16_init: - movdqa %xmm0,%xmm1 - - -.Lopen_sse_tail_16: - testq %rbx,%rbx - jz .Lopen_sse_finalize - - - - pxor %xmm3,%xmm3 - leaq -1(%rsi,%rbx,1),%rsi - movq %rbx,%r8 -.Lopen_sse_tail_16_compose: - pslldq $1,%xmm3 - pinsrb $0,(%rsi),%xmm3 - subq $1,%rsi - subq $1,%r8 - jnz .Lopen_sse_tail_16_compose - -.byte 102,73,15,126,221 - pextrq $1,%xmm3,%r14 - - pxor %xmm1,%xmm3 - - -.Lopen_sse_tail_16_extract: - pextrb $0,%xmm3,(%rdi) - psrldq $1,%xmm3 - addq $1,%rdi - subq $1,%rbx - jne .Lopen_sse_tail_16_extract - - addq %r13,%r10 - adcq %r14,%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - -.Lopen_sse_finalize: - addq 0+0+32(%rbp),%r10 - adcq 8+0+32(%rbp),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 
0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - - movq %r10,%r13 - movq %r11,%r14 - movq %r12,%r15 - subq $-5,%r10 - sbbq $-1,%r11 - sbbq $3,%r12 - cmovcq %r13,%r10 - cmovcq %r14,%r11 - cmovcq %r15,%r12 - - addq 0+0+16(%rbp),%r10 - adcq 8+0+16(%rbp),%r11 - -.cfi_remember_state - addq $288 + 0 + 32,%rsp -.cfi_adjust_cfa_offset -(288 + 32) - - popq %r9 -.cfi_adjust_cfa_offset -8 -.cfi_restore %r9 - movq %r10,(%r9) - movq %r11,8(%r9) - popq %r15 -.cfi_adjust_cfa_offset -8 -.cfi_restore %r15 - popq %r14 -.cfi_adjust_cfa_offset -8 -.cfi_restore %r14 - popq %r13 -.cfi_adjust_cfa_offset -8 -.cfi_restore %r13 - popq %r12 -.cfi_adjust_cfa_offset -8 -.cfi_restore %r12 - popq %rbx -.cfi_adjust_cfa_offset -8 -.cfi_restore %rbx - popq %rbp -.cfi_adjust_cfa_offset -8 -.cfi_restore %rbp - .byte 0xf3,0xc3 - -.Lopen_sse_128: -.cfi_restore_state - movdqu .Lchacha20_consts(%rip),%xmm0 - movdqa %xmm0,%xmm1 - movdqa %xmm0,%xmm2 - movdqu 0(%r9),%xmm4 - movdqa %xmm4,%xmm5 - movdqa %xmm4,%xmm6 - movdqu 16(%r9),%xmm8 - movdqa %xmm8,%xmm9 - movdqa %xmm8,%xmm10 - movdqu 32(%r9),%xmm12 - movdqa %xmm12,%xmm13 - paddd .Lsse_inc(%rip),%xmm13 - movdqa %xmm13,%xmm14 - paddd .Lsse_inc(%rip),%xmm14 - movdqa %xmm4,%xmm7 - movdqa %xmm8,%xmm11 - movdqa %xmm13,%xmm15 - movq $10,%r10 - -.Lopen_sse_128_rounds: - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb .Lrol16(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm4 - pxor %xmm3,%xmm4 - paddd %xmm4,%xmm0 - pxor 
%xmm0,%xmm12 - pshufb .Lrol8(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm4 - pxor %xmm3,%xmm4 -.byte 102,15,58,15,228,4 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,12 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb .Lrol16(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm5 - pxor %xmm3,%xmm5 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb .Lrol8(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm5 - pxor %xmm3,%xmm5 -.byte 102,15,58,15,237,4 -.byte 102,69,15,58,15,201,8 -.byte 102,69,15,58,15,237,12 - paddd %xmm6,%xmm2 - pxor %xmm2,%xmm14 - pshufb .Lrol16(%rip),%xmm14 - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm6 - pxor %xmm3,%xmm6 - paddd %xmm6,%xmm2 - pxor %xmm2,%xmm14 - pshufb .Lrol8(%rip),%xmm14 - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm6 - pxor %xmm3,%xmm6 -.byte 102,15,58,15,246,4 -.byte 102,69,15,58,15,210,8 -.byte 102,69,15,58,15,246,12 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb .Lrol16(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm4 - pxor %xmm3,%xmm4 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb .Lrol8(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm4 - pxor %xmm3,%xmm4 -.byte 102,15,58,15,228,12 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,4 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb .Lrol16(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm5 - pxor %xmm3,%xmm5 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb .Lrol8(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm5 - pxor %xmm3,%xmm5 -.byte 
102,15,58,15,237,12 -.byte 102,69,15,58,15,201,8 -.byte 102,69,15,58,15,237,4 - paddd %xmm6,%xmm2 - pxor %xmm2,%xmm14 - pshufb .Lrol16(%rip),%xmm14 - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm6 - pxor %xmm3,%xmm6 - paddd %xmm6,%xmm2 - pxor %xmm2,%xmm14 - pshufb .Lrol8(%rip),%xmm14 - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm6 - pxor %xmm3,%xmm6 -.byte 102,15,58,15,246,12 -.byte 102,69,15,58,15,210,8 -.byte 102,69,15,58,15,246,4 - - decq %r10 - jnz .Lopen_sse_128_rounds - paddd .Lchacha20_consts(%rip),%xmm0 - paddd .Lchacha20_consts(%rip),%xmm1 - paddd .Lchacha20_consts(%rip),%xmm2 - paddd %xmm7,%xmm4 - paddd %xmm7,%xmm5 - paddd %xmm7,%xmm6 - paddd %xmm11,%xmm9 - paddd %xmm11,%xmm10 - paddd %xmm15,%xmm13 - paddd .Lsse_inc(%rip),%xmm15 - paddd %xmm15,%xmm14 - - pand .Lclamp(%rip),%xmm0 - movdqa %xmm0,0+0(%rbp) - movdqa %xmm4,0+16(%rbp) - - movq %r8,%r8 - call poly_hash_ad_internal -.Lopen_sse_128_xor_hash: - cmpq $16,%rbx - jb .Lopen_sse_tail_16 - subq $16,%rbx - addq 0+0(%rsi),%r10 - adcq 8+0(%rsi),%r11 - adcq $1,%r12 - - - movdqu 0(%rsi),%xmm3 - pxor %xmm3,%xmm1 - movdqu %xmm1,0(%rdi) - leaq 16(%rsi),%rsi - leaq 16(%rdi),%rdi - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - - movdqa %xmm5,%xmm1 - movdqa %xmm9,%xmm5 - movdqa %xmm13,%xmm9 - movdqa %xmm2,%xmm13 - movdqa %xmm6,%xmm2 - movdqa 
%xmm10,%xmm6 - movdqa %xmm14,%xmm10 - jmp .Lopen_sse_128_xor_hash -.size chacha20_poly1305_open, .-chacha20_poly1305_open -.cfi_endproc - - - - - - - -.globl chacha20_poly1305_seal -.hidden chacha20_poly1305_seal -.type chacha20_poly1305_seal,@function -.align 64 -chacha20_poly1305_seal: -.cfi_startproc - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - - - pushq %r9 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r9,-64 - subq $288 + 0 + 32,%rsp -.cfi_adjust_cfa_offset 288 + 32 - leaq 32(%rsp),%rbp - andq $-32,%rbp - - movq 56(%r9),%rbx - addq %rdx,%rbx - movq %r8,0+0+32(%rbp) - movq %rbx,8+0+32(%rbp) - movq %rdx,%rbx - - movl OPENSSL_ia32cap_P+8(%rip),%eax - andl $288,%eax - xorl $288,%eax - jz chacha20_poly1305_seal_avx2 - - cmpq $128,%rbx - jbe .Lseal_sse_128 - - movdqa .Lchacha20_consts(%rip),%xmm0 - movdqu 0(%r9),%xmm4 - movdqu 16(%r9),%xmm8 - movdqu 32(%r9),%xmm12 - - movdqa %xmm0,%xmm1 - movdqa %xmm0,%xmm2 - movdqa %xmm0,%xmm3 - movdqa %xmm4,%xmm5 - movdqa %xmm4,%xmm6 - movdqa %xmm4,%xmm7 - movdqa %xmm8,%xmm9 - movdqa %xmm8,%xmm10 - movdqa %xmm8,%xmm11 - movdqa %xmm12,%xmm15 - paddd .Lsse_inc(%rip),%xmm12 - movdqa %xmm12,%xmm14 - paddd .Lsse_inc(%rip),%xmm12 - movdqa %xmm12,%xmm13 - paddd .Lsse_inc(%rip),%xmm12 - - movdqa %xmm4,0+48(%rbp) - movdqa %xmm8,0+64(%rbp) - movdqa %xmm12,0+96(%rbp) - movdqa %xmm13,0+112(%rbp) - movdqa %xmm14,0+128(%rbp) - movdqa %xmm15,0+144(%rbp) - movq $10,%r10 -.Lseal_sse_init_rounds: - movdqa %xmm8,0+80(%rbp) - movdqa .Lrol16(%rip),%xmm8 - paddd %xmm7,%xmm3 - paddd %xmm6,%xmm2 - paddd %xmm5,%xmm1 - paddd %xmm4,%xmm0 - pxor %xmm3,%xmm15 - pxor %xmm2,%xmm14 - pxor %xmm1,%xmm13 - pxor %xmm0,%xmm12 -.byte 102,69,15,56,0,248 
-.byte 102,69,15,56,0,240 -.byte 102,69,15,56,0,232 -.byte 102,69,15,56,0,224 - movdqa 0+80(%rbp),%xmm8 - paddd %xmm15,%xmm11 - paddd %xmm14,%xmm10 - paddd %xmm13,%xmm9 - paddd %xmm12,%xmm8 - pxor %xmm11,%xmm7 - pxor %xmm10,%xmm6 - pxor %xmm9,%xmm5 - pxor %xmm8,%xmm4 - movdqa %xmm8,0+80(%rbp) - movdqa %xmm7,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm7 - pxor %xmm8,%xmm7 - movdqa %xmm6,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm6 - pxor %xmm8,%xmm6 - movdqa %xmm5,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm5 - pxor %xmm8,%xmm5 - movdqa %xmm4,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm4 - pxor %xmm8,%xmm4 - movdqa .Lrol8(%rip),%xmm8 - paddd %xmm7,%xmm3 - paddd %xmm6,%xmm2 - paddd %xmm5,%xmm1 - paddd %xmm4,%xmm0 - pxor %xmm3,%xmm15 - pxor %xmm2,%xmm14 - pxor %xmm1,%xmm13 - pxor %xmm0,%xmm12 -.byte 102,69,15,56,0,248 -.byte 102,69,15,56,0,240 -.byte 102,69,15,56,0,232 -.byte 102,69,15,56,0,224 - movdqa 0+80(%rbp),%xmm8 - paddd %xmm15,%xmm11 - paddd %xmm14,%xmm10 - paddd %xmm13,%xmm9 - paddd %xmm12,%xmm8 - pxor %xmm11,%xmm7 - pxor %xmm10,%xmm6 - pxor %xmm9,%xmm5 - pxor %xmm8,%xmm4 - movdqa %xmm8,0+80(%rbp) - movdqa %xmm7,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm7 - pxor %xmm8,%xmm7 - movdqa %xmm6,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm6 - pxor %xmm8,%xmm6 - movdqa %xmm5,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm5 - pxor %xmm8,%xmm5 - movdqa %xmm4,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm4 - pxor %xmm8,%xmm4 - movdqa 0+80(%rbp),%xmm8 -.byte 102,15,58,15,255,4 -.byte 102,69,15,58,15,219,8 -.byte 102,69,15,58,15,255,12 -.byte 102,15,58,15,246,4 -.byte 102,69,15,58,15,210,8 -.byte 102,69,15,58,15,246,12 -.byte 102,15,58,15,237,4 -.byte 102,69,15,58,15,201,8 -.byte 102,69,15,58,15,237,12 -.byte 102,15,58,15,228,4 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,12 - movdqa %xmm8,0+80(%rbp) - movdqa .Lrol16(%rip),%xmm8 - paddd %xmm7,%xmm3 - paddd %xmm6,%xmm2 - paddd %xmm5,%xmm1 - paddd %xmm4,%xmm0 - pxor %xmm3,%xmm15 - pxor %xmm2,%xmm14 - pxor %xmm1,%xmm13 - 
pxor %xmm0,%xmm12 -.byte 102,69,15,56,0,248 -.byte 102,69,15,56,0,240 -.byte 102,69,15,56,0,232 -.byte 102,69,15,56,0,224 - movdqa 0+80(%rbp),%xmm8 - paddd %xmm15,%xmm11 - paddd %xmm14,%xmm10 - paddd %xmm13,%xmm9 - paddd %xmm12,%xmm8 - pxor %xmm11,%xmm7 - pxor %xmm10,%xmm6 - pxor %xmm9,%xmm5 - pxor %xmm8,%xmm4 - movdqa %xmm8,0+80(%rbp) - movdqa %xmm7,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm7 - pxor %xmm8,%xmm7 - movdqa %xmm6,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm6 - pxor %xmm8,%xmm6 - movdqa %xmm5,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm5 - pxor %xmm8,%xmm5 - movdqa %xmm4,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm4 - pxor %xmm8,%xmm4 - movdqa .Lrol8(%rip),%xmm8 - paddd %xmm7,%xmm3 - paddd %xmm6,%xmm2 - paddd %xmm5,%xmm1 - paddd %xmm4,%xmm0 - pxor %xmm3,%xmm15 - pxor %xmm2,%xmm14 - pxor %xmm1,%xmm13 - pxor %xmm0,%xmm12 -.byte 102,69,15,56,0,248 -.byte 102,69,15,56,0,240 -.byte 102,69,15,56,0,232 -.byte 102,69,15,56,0,224 - movdqa 0+80(%rbp),%xmm8 - paddd %xmm15,%xmm11 - paddd %xmm14,%xmm10 - paddd %xmm13,%xmm9 - paddd %xmm12,%xmm8 - pxor %xmm11,%xmm7 - pxor %xmm10,%xmm6 - pxor %xmm9,%xmm5 - pxor %xmm8,%xmm4 - movdqa %xmm8,0+80(%rbp) - movdqa %xmm7,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm7 - pxor %xmm8,%xmm7 - movdqa %xmm6,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm6 - pxor %xmm8,%xmm6 - movdqa %xmm5,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm5 - pxor %xmm8,%xmm5 - movdqa %xmm4,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm4 - pxor %xmm8,%xmm4 - movdqa 0+80(%rbp),%xmm8 -.byte 102,15,58,15,255,12 -.byte 102,69,15,58,15,219,8 -.byte 102,69,15,58,15,255,4 -.byte 102,15,58,15,246,12 -.byte 102,69,15,58,15,210,8 -.byte 102,69,15,58,15,246,4 -.byte 102,15,58,15,237,12 -.byte 102,69,15,58,15,201,8 -.byte 102,69,15,58,15,237,4 -.byte 102,15,58,15,228,12 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,4 - - decq %r10 - jnz .Lseal_sse_init_rounds - paddd .Lchacha20_consts(%rip),%xmm3 - paddd 0+48(%rbp),%xmm7 - paddd 0+64(%rbp),%xmm11 - paddd 
0+144(%rbp),%xmm15 - paddd .Lchacha20_consts(%rip),%xmm2 - paddd 0+48(%rbp),%xmm6 - paddd 0+64(%rbp),%xmm10 - paddd 0+128(%rbp),%xmm14 - paddd .Lchacha20_consts(%rip),%xmm1 - paddd 0+48(%rbp),%xmm5 - paddd 0+64(%rbp),%xmm9 - paddd 0+112(%rbp),%xmm13 - paddd .Lchacha20_consts(%rip),%xmm0 - paddd 0+48(%rbp),%xmm4 - paddd 0+64(%rbp),%xmm8 - paddd 0+96(%rbp),%xmm12 - - - pand .Lclamp(%rip),%xmm3 - movdqa %xmm3,0+0(%rbp) - movdqa %xmm7,0+16(%rbp) - - movq %r8,%r8 - call poly_hash_ad_internal - movdqu 0 + 0(%rsi),%xmm3 - movdqu 16 + 0(%rsi),%xmm7 - movdqu 32 + 0(%rsi),%xmm11 - movdqu 48 + 0(%rsi),%xmm15 - pxor %xmm3,%xmm2 - pxor %xmm7,%xmm6 - pxor %xmm11,%xmm10 - pxor %xmm14,%xmm15 - movdqu %xmm2,0 + 0(%rdi) - movdqu %xmm6,16 + 0(%rdi) - movdqu %xmm10,32 + 0(%rdi) - movdqu %xmm15,48 + 0(%rdi) - movdqu 0 + 64(%rsi),%xmm3 - movdqu 16 + 64(%rsi),%xmm7 - movdqu 32 + 64(%rsi),%xmm11 - movdqu 48 + 64(%rsi),%xmm15 - pxor %xmm3,%xmm1 - pxor %xmm7,%xmm5 - pxor %xmm11,%xmm9 - pxor %xmm13,%xmm15 - movdqu %xmm1,0 + 64(%rdi) - movdqu %xmm5,16 + 64(%rdi) - movdqu %xmm9,32 + 64(%rdi) - movdqu %xmm15,48 + 64(%rdi) - - cmpq $192,%rbx - ja .Lseal_sse_main_init - movq $128,%rcx - subq $128,%rbx - leaq 128(%rsi),%rsi - jmp .Lseal_sse_128_tail_hash -.Lseal_sse_main_init: - movdqu 0 + 128(%rsi),%xmm3 - movdqu 16 + 128(%rsi),%xmm7 - movdqu 32 + 128(%rsi),%xmm11 - movdqu 48 + 128(%rsi),%xmm15 - pxor %xmm3,%xmm0 - pxor %xmm7,%xmm4 - pxor %xmm11,%xmm8 - pxor %xmm12,%xmm15 - movdqu %xmm0,0 + 128(%rdi) - movdqu %xmm4,16 + 128(%rdi) - movdqu %xmm8,32 + 128(%rdi) - movdqu %xmm15,48 + 128(%rdi) - - movq $192,%rcx - subq $192,%rbx - leaq 192(%rsi),%rsi - movq $2,%rcx - movq $8,%r8 - cmpq $64,%rbx - jbe .Lseal_sse_tail_64 - cmpq $128,%rbx - jbe .Lseal_sse_tail_128 - cmpq $192,%rbx - jbe .Lseal_sse_tail_192 - -.Lseal_sse_main_loop: - movdqa .Lchacha20_consts(%rip),%xmm0 - movdqa 0+48(%rbp),%xmm4 - movdqa 0+64(%rbp),%xmm8 - movdqa %xmm0,%xmm1 - movdqa %xmm4,%xmm5 - movdqa %xmm8,%xmm9 - movdqa %xmm0,%xmm2 
- movdqa %xmm4,%xmm6 - movdqa %xmm8,%xmm10 - movdqa %xmm0,%xmm3 - movdqa %xmm4,%xmm7 - movdqa %xmm8,%xmm11 - movdqa 0+96(%rbp),%xmm15 - paddd .Lsse_inc(%rip),%xmm15 - movdqa %xmm15,%xmm14 - paddd .Lsse_inc(%rip),%xmm14 - movdqa %xmm14,%xmm13 - paddd .Lsse_inc(%rip),%xmm13 - movdqa %xmm13,%xmm12 - paddd .Lsse_inc(%rip),%xmm12 - movdqa %xmm12,0+96(%rbp) - movdqa %xmm13,0+112(%rbp) - movdqa %xmm14,0+128(%rbp) - movdqa %xmm15,0+144(%rbp) - -.align 32 -.Lseal_sse_main_rounds: - movdqa %xmm8,0+80(%rbp) - movdqa .Lrol16(%rip),%xmm8 - paddd %xmm7,%xmm3 - paddd %xmm6,%xmm2 - paddd %xmm5,%xmm1 - paddd %xmm4,%xmm0 - pxor %xmm3,%xmm15 - pxor %xmm2,%xmm14 - pxor %xmm1,%xmm13 - pxor %xmm0,%xmm12 -.byte 102,69,15,56,0,248 -.byte 102,69,15,56,0,240 -.byte 102,69,15,56,0,232 -.byte 102,69,15,56,0,224 - movdqa 0+80(%rbp),%xmm8 - paddd %xmm15,%xmm11 - paddd %xmm14,%xmm10 - paddd %xmm13,%xmm9 - paddd %xmm12,%xmm8 - pxor %xmm11,%xmm7 - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - pxor %xmm10,%xmm6 - pxor %xmm9,%xmm5 - pxor %xmm8,%xmm4 - movdqa %xmm8,0+80(%rbp) - movdqa %xmm7,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm7 - pxor %xmm8,%xmm7 - movdqa %xmm6,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm6 - pxor %xmm8,%xmm6 - movdqa %xmm5,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm5 - pxor %xmm8,%xmm5 - movdqa %xmm4,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm4 - pxor %xmm8,%xmm4 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movdqa .Lrol8(%rip),%xmm8 - paddd %xmm7,%xmm3 - paddd %xmm6,%xmm2 - paddd %xmm5,%xmm1 - paddd %xmm4,%xmm0 - pxor %xmm3,%xmm15 - pxor %xmm2,%xmm14 - pxor %xmm1,%xmm13 - pxor %xmm0,%xmm12 -.byte 102,69,15,56,0,248 -.byte 102,69,15,56,0,240 -.byte 102,69,15,56,0,232 -.byte 102,69,15,56,0,224 - movdqa 0+80(%rbp),%xmm8 - paddd %xmm15,%xmm11 - paddd %xmm14,%xmm10 - paddd %xmm13,%xmm9 - paddd %xmm12,%xmm8 - pxor %xmm11,%xmm7 - pxor 
%xmm10,%xmm6 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - pxor %xmm9,%xmm5 - pxor %xmm8,%xmm4 - movdqa %xmm8,0+80(%rbp) - movdqa %xmm7,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm7 - pxor %xmm8,%xmm7 - movdqa %xmm6,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm6 - pxor %xmm8,%xmm6 - movdqa %xmm5,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm5 - pxor %xmm8,%xmm5 - movdqa %xmm4,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm4 - pxor %xmm8,%xmm4 - movdqa 0+80(%rbp),%xmm8 - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 -.byte 102,15,58,15,255,4 -.byte 102,69,15,58,15,219,8 -.byte 102,69,15,58,15,255,12 -.byte 102,15,58,15,246,4 -.byte 102,69,15,58,15,210,8 -.byte 102,69,15,58,15,246,12 -.byte 102,15,58,15,237,4 -.byte 102,69,15,58,15,201,8 -.byte 102,69,15,58,15,237,12 -.byte 102,15,58,15,228,4 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,12 - movdqa %xmm8,0+80(%rbp) - movdqa .Lrol16(%rip),%xmm8 - paddd %xmm7,%xmm3 - paddd %xmm6,%xmm2 - paddd %xmm5,%xmm1 - paddd %xmm4,%xmm0 - pxor %xmm3,%xmm15 - pxor %xmm2,%xmm14 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - pxor %xmm1,%xmm13 - pxor %xmm0,%xmm12 -.byte 102,69,15,56,0,248 -.byte 102,69,15,56,0,240 -.byte 102,69,15,56,0,232 -.byte 102,69,15,56,0,224 - movdqa 0+80(%rbp),%xmm8 - paddd %xmm15,%xmm11 - paddd %xmm14,%xmm10 - paddd %xmm13,%xmm9 - paddd %xmm12,%xmm8 - pxor %xmm11,%xmm7 - pxor %xmm10,%xmm6 - pxor %xmm9,%xmm5 - pxor %xmm8,%xmm4 - movdqa %xmm8,0+80(%rbp) - movdqa %xmm7,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm7 - pxor %xmm8,%xmm7 - movdqa %xmm6,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm6 - pxor %xmm8,%xmm6 - movdqa %xmm5,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm5 - pxor %xmm8,%xmm5 - movdqa 
%xmm4,%xmm8 - psrld $20,%xmm8 - pslld $32-20,%xmm4 - pxor %xmm8,%xmm4 - movdqa .Lrol8(%rip),%xmm8 - paddd %xmm7,%xmm3 - paddd %xmm6,%xmm2 - paddd %xmm5,%xmm1 - paddd %xmm4,%xmm0 - pxor %xmm3,%xmm15 - pxor %xmm2,%xmm14 - pxor %xmm1,%xmm13 - pxor %xmm0,%xmm12 -.byte 102,69,15,56,0,248 -.byte 102,69,15,56,0,240 -.byte 102,69,15,56,0,232 -.byte 102,69,15,56,0,224 - movdqa 0+80(%rbp),%xmm8 - paddd %xmm15,%xmm11 - paddd %xmm14,%xmm10 - paddd %xmm13,%xmm9 - paddd %xmm12,%xmm8 - pxor %xmm11,%xmm7 - pxor %xmm10,%xmm6 - pxor %xmm9,%xmm5 - pxor %xmm8,%xmm4 - movdqa %xmm8,0+80(%rbp) - movdqa %xmm7,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm7 - pxor %xmm8,%xmm7 - movdqa %xmm6,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm6 - pxor %xmm8,%xmm6 - movdqa %xmm5,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm5 - pxor %xmm8,%xmm5 - movdqa %xmm4,%xmm8 - psrld $25,%xmm8 - pslld $32-25,%xmm4 - pxor %xmm8,%xmm4 - movdqa 0+80(%rbp),%xmm8 -.byte 102,15,58,15,255,12 -.byte 102,69,15,58,15,219,8 -.byte 102,69,15,58,15,255,4 -.byte 102,15,58,15,246,12 -.byte 102,69,15,58,15,210,8 -.byte 102,69,15,58,15,246,4 -.byte 102,15,58,15,237,12 -.byte 102,69,15,58,15,201,8 -.byte 102,69,15,58,15,237,4 -.byte 102,15,58,15,228,12 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,4 - - leaq 16(%rdi),%rdi - decq %r8 - jge .Lseal_sse_main_rounds - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq 
$0,%r12 - - leaq 16(%rdi),%rdi - decq %rcx - jg .Lseal_sse_main_rounds - paddd .Lchacha20_consts(%rip),%xmm3 - paddd 0+48(%rbp),%xmm7 - paddd 0+64(%rbp),%xmm11 - paddd 0+144(%rbp),%xmm15 - paddd .Lchacha20_consts(%rip),%xmm2 - paddd 0+48(%rbp),%xmm6 - paddd 0+64(%rbp),%xmm10 - paddd 0+128(%rbp),%xmm14 - paddd .Lchacha20_consts(%rip),%xmm1 - paddd 0+48(%rbp),%xmm5 - paddd 0+64(%rbp),%xmm9 - paddd 0+112(%rbp),%xmm13 - paddd .Lchacha20_consts(%rip),%xmm0 - paddd 0+48(%rbp),%xmm4 - paddd 0+64(%rbp),%xmm8 - paddd 0+96(%rbp),%xmm12 - - movdqa %xmm14,0+80(%rbp) - movdqa %xmm14,0+80(%rbp) - movdqu 0 + 0(%rsi),%xmm14 - pxor %xmm3,%xmm14 - movdqu %xmm14,0 + 0(%rdi) - movdqu 16 + 0(%rsi),%xmm14 - pxor %xmm7,%xmm14 - movdqu %xmm14,16 + 0(%rdi) - movdqu 32 + 0(%rsi),%xmm14 - pxor %xmm11,%xmm14 - movdqu %xmm14,32 + 0(%rdi) - movdqu 48 + 0(%rsi),%xmm14 - pxor %xmm15,%xmm14 - movdqu %xmm14,48 + 0(%rdi) - - movdqa 0+80(%rbp),%xmm14 - movdqu 0 + 64(%rsi),%xmm3 - movdqu 16 + 64(%rsi),%xmm7 - movdqu 32 + 64(%rsi),%xmm11 - movdqu 48 + 64(%rsi),%xmm15 - pxor %xmm3,%xmm2 - pxor %xmm7,%xmm6 - pxor %xmm11,%xmm10 - pxor %xmm14,%xmm15 - movdqu %xmm2,0 + 64(%rdi) - movdqu %xmm6,16 + 64(%rdi) - movdqu %xmm10,32 + 64(%rdi) - movdqu %xmm15,48 + 64(%rdi) - movdqu 0 + 128(%rsi),%xmm3 - movdqu 16 + 128(%rsi),%xmm7 - movdqu 32 + 128(%rsi),%xmm11 - movdqu 48 + 128(%rsi),%xmm15 - pxor %xmm3,%xmm1 - pxor %xmm7,%xmm5 - pxor %xmm11,%xmm9 - pxor %xmm13,%xmm15 - movdqu %xmm1,0 + 128(%rdi) - movdqu %xmm5,16 + 128(%rdi) - movdqu %xmm9,32 + 128(%rdi) - movdqu %xmm15,48 + 128(%rdi) - - cmpq $256,%rbx - ja .Lseal_sse_main_loop_xor - - movq $192,%rcx - subq $192,%rbx - leaq 192(%rsi),%rsi - jmp .Lseal_sse_128_tail_hash -.Lseal_sse_main_loop_xor: - movdqu 0 + 192(%rsi),%xmm3 - movdqu 16 + 192(%rsi),%xmm7 - movdqu 32 + 192(%rsi),%xmm11 - movdqu 48 + 192(%rsi),%xmm15 - pxor %xmm3,%xmm0 - pxor %xmm7,%xmm4 - pxor %xmm11,%xmm8 - pxor %xmm12,%xmm15 - movdqu %xmm0,0 + 192(%rdi) - movdqu %xmm4,16 + 192(%rdi) - movdqu 
%xmm8,32 + 192(%rdi) - movdqu %xmm15,48 + 192(%rdi) - - leaq 256(%rsi),%rsi - subq $256,%rbx - movq $6,%rcx - movq $4,%r8 - cmpq $192,%rbx - jg .Lseal_sse_main_loop - movq %rbx,%rcx - testq %rbx,%rbx - je .Lseal_sse_128_tail_hash - movq $6,%rcx - cmpq $128,%rbx - ja .Lseal_sse_tail_192 - cmpq $64,%rbx - ja .Lseal_sse_tail_128 - -.Lseal_sse_tail_64: - movdqa .Lchacha20_consts(%rip),%xmm0 - movdqa 0+48(%rbp),%xmm4 - movdqa 0+64(%rbp),%xmm8 - movdqa 0+96(%rbp),%xmm12 - paddd .Lsse_inc(%rip),%xmm12 - movdqa %xmm12,0+96(%rbp) - -.Lseal_sse_tail_64_rounds_and_x2hash: - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 16(%rdi),%rdi -.Lseal_sse_tail_64_rounds_and_x1hash: - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb .Lrol16(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm4 - pxor %xmm3,%xmm4 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb .Lrol8(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm4 - pxor %xmm3,%xmm4 -.byte 102,15,58,15,228,4 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,12 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb .Lrol16(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm4 - pxor %xmm3,%xmm4 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - 
pshufb .Lrol8(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm4 - pxor %xmm3,%xmm4 -.byte 102,15,58,15,228,12 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,4 - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 16(%rdi),%rdi - decq %rcx - jg .Lseal_sse_tail_64_rounds_and_x2hash - decq %r8 - jge .Lseal_sse_tail_64_rounds_and_x1hash - paddd .Lchacha20_consts(%rip),%xmm0 - paddd 0+48(%rbp),%xmm4 - paddd 0+64(%rbp),%xmm8 - paddd 0+96(%rbp),%xmm12 - - jmp .Lseal_sse_128_tail_xor - -.Lseal_sse_tail_128: - movdqa .Lchacha20_consts(%rip),%xmm0 - movdqa 0+48(%rbp),%xmm4 - movdqa 0+64(%rbp),%xmm8 - movdqa %xmm0,%xmm1 - movdqa %xmm4,%xmm5 - movdqa %xmm8,%xmm9 - movdqa 0+96(%rbp),%xmm13 - paddd .Lsse_inc(%rip),%xmm13 - movdqa %xmm13,%xmm12 - paddd .Lsse_inc(%rip),%xmm12 - movdqa %xmm12,0+96(%rbp) - movdqa %xmm13,0+112(%rbp) - -.Lseal_sse_tail_128_rounds_and_x2hash: - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - 
addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 16(%rdi),%rdi -.Lseal_sse_tail_128_rounds_and_x1hash: - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb .Lrol16(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm4 - pxor %xmm3,%xmm4 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb .Lrol8(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm4 - pxor %xmm3,%xmm4 -.byte 102,15,58,15,228,4 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,12 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb .Lrol16(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm5 - pxor %xmm3,%xmm5 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb .Lrol8(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm5 - pxor %xmm3,%xmm5 -.byte 102,15,58,15,237,4 -.byte 102,69,15,58,15,201,8 -.byte 102,69,15,58,15,237,12 - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb .Lrol16(%rip),%xmm12 - paddd 
%xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm4 - pxor %xmm3,%xmm4 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb .Lrol8(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm4 - pxor %xmm3,%xmm4 -.byte 102,15,58,15,228,12 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,4 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb .Lrol16(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm5 - pxor %xmm3,%xmm5 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb .Lrol8(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm5 - pxor %xmm3,%xmm5 -.byte 102,15,58,15,237,12 -.byte 102,69,15,58,15,201,8 -.byte 102,69,15,58,15,237,4 - - leaq 16(%rdi),%rdi - decq %rcx - jg .Lseal_sse_tail_128_rounds_and_x2hash - decq %r8 - jge .Lseal_sse_tail_128_rounds_and_x1hash - paddd .Lchacha20_consts(%rip),%xmm1 - paddd 0+48(%rbp),%xmm5 - paddd 0+64(%rbp),%xmm9 - paddd 0+112(%rbp),%xmm13 - paddd .Lchacha20_consts(%rip),%xmm0 - paddd 0+48(%rbp),%xmm4 - paddd 0+64(%rbp),%xmm8 - paddd 0+96(%rbp),%xmm12 - movdqu 0 + 0(%rsi),%xmm3 - movdqu 16 + 0(%rsi),%xmm7 - movdqu 32 + 0(%rsi),%xmm11 - movdqu 48 + 0(%rsi),%xmm15 - pxor %xmm3,%xmm1 - pxor %xmm7,%xmm5 - pxor %xmm11,%xmm9 - pxor %xmm13,%xmm15 - movdqu %xmm1,0 + 0(%rdi) - movdqu %xmm5,16 + 0(%rdi) - movdqu %xmm9,32 + 0(%rdi) - movdqu %xmm15,48 + 0(%rdi) - - movq $64,%rcx - subq $64,%rbx - leaq 64(%rsi),%rsi - jmp .Lseal_sse_128_tail_hash - -.Lseal_sse_tail_192: - movdqa .Lchacha20_consts(%rip),%xmm0 - movdqa 0+48(%rbp),%xmm4 - movdqa 0+64(%rbp),%xmm8 - movdqa %xmm0,%xmm1 - movdqa %xmm4,%xmm5 - movdqa %xmm8,%xmm9 - movdqa %xmm0,%xmm2 - movdqa %xmm4,%xmm6 - movdqa %xmm8,%xmm10 - movdqa 0+96(%rbp),%xmm14 - paddd .Lsse_inc(%rip),%xmm14 - movdqa %xmm14,%xmm13 - paddd .Lsse_inc(%rip),%xmm13 - movdqa %xmm13,%xmm12 - paddd .Lsse_inc(%rip),%xmm12 - 
movdqa %xmm12,0+96(%rbp) - movdqa %xmm13,0+112(%rbp) - movdqa %xmm14,0+128(%rbp) - -.Lseal_sse_tail_192_rounds_and_x2hash: - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 16(%rdi),%rdi -.Lseal_sse_tail_192_rounds_and_x1hash: - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb .Lrol16(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm4 - pxor %xmm3,%xmm4 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb .Lrol8(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm4 - pxor %xmm3,%xmm4 -.byte 102,15,58,15,228,4 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,12 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb .Lrol16(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm5 - pxor %xmm3,%xmm5 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb .Lrol8(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm5 - pxor %xmm3,%xmm5 -.byte 102,15,58,15,237,4 -.byte 102,69,15,58,15,201,8 -.byte 102,69,15,58,15,237,12 - paddd %xmm6,%xmm2 - pxor %xmm2,%xmm14 - pshufb .Lrol16(%rip),%xmm14 - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm6 - pxor %xmm3,%xmm6 - paddd %xmm6,%xmm2 - pxor 
%xmm2,%xmm14 - pshufb .Lrol8(%rip),%xmm14 - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm6 - pxor %xmm3,%xmm6 -.byte 102,15,58,15,246,4 -.byte 102,69,15,58,15,210,8 -.byte 102,69,15,58,15,246,12 - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb .Lrol16(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm4 - pxor %xmm3,%xmm4 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb .Lrol8(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm4 - pxor %xmm3,%xmm4 -.byte 102,15,58,15,228,12 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,4 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb .Lrol16(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm5 - pxor %xmm3,%xmm5 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb .Lrol8(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm5 - pxor %xmm3,%xmm5 -.byte 102,15,58,15,237,12 -.byte 102,69,15,58,15,201,8 -.byte 102,69,15,58,15,237,4 - paddd %xmm6,%xmm2 - pxor %xmm2,%xmm14 - pshufb .Lrol16(%rip),%xmm14 - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm3 - pslld $12,%xmm3 - psrld 
$20,%xmm6 - pxor %xmm3,%xmm6 - paddd %xmm6,%xmm2 - pxor %xmm2,%xmm14 - pshufb .Lrol8(%rip),%xmm14 - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm6 - pxor %xmm3,%xmm6 -.byte 102,15,58,15,246,12 -.byte 102,69,15,58,15,210,8 -.byte 102,69,15,58,15,246,4 - - leaq 16(%rdi),%rdi - decq %rcx - jg .Lseal_sse_tail_192_rounds_and_x2hash - decq %r8 - jge .Lseal_sse_tail_192_rounds_and_x1hash - paddd .Lchacha20_consts(%rip),%xmm2 - paddd 0+48(%rbp),%xmm6 - paddd 0+64(%rbp),%xmm10 - paddd 0+128(%rbp),%xmm14 - paddd .Lchacha20_consts(%rip),%xmm1 - paddd 0+48(%rbp),%xmm5 - paddd 0+64(%rbp),%xmm9 - paddd 0+112(%rbp),%xmm13 - paddd .Lchacha20_consts(%rip),%xmm0 - paddd 0+48(%rbp),%xmm4 - paddd 0+64(%rbp),%xmm8 - paddd 0+96(%rbp),%xmm12 - movdqu 0 + 0(%rsi),%xmm3 - movdqu 16 + 0(%rsi),%xmm7 - movdqu 32 + 0(%rsi),%xmm11 - movdqu 48 + 0(%rsi),%xmm15 - pxor %xmm3,%xmm2 - pxor %xmm7,%xmm6 - pxor %xmm11,%xmm10 - pxor %xmm14,%xmm15 - movdqu %xmm2,0 + 0(%rdi) - movdqu %xmm6,16 + 0(%rdi) - movdqu %xmm10,32 + 0(%rdi) - movdqu %xmm15,48 + 0(%rdi) - movdqu 0 + 64(%rsi),%xmm3 - movdqu 16 + 64(%rsi),%xmm7 - movdqu 32 + 64(%rsi),%xmm11 - movdqu 48 + 64(%rsi),%xmm15 - pxor %xmm3,%xmm1 - pxor %xmm7,%xmm5 - pxor %xmm11,%xmm9 - pxor %xmm13,%xmm15 - movdqu %xmm1,0 + 64(%rdi) - movdqu %xmm5,16 + 64(%rdi) - movdqu %xmm9,32 + 64(%rdi) - movdqu %xmm15,48 + 64(%rdi) - - movq $128,%rcx - subq $128,%rbx - leaq 128(%rsi),%rsi - -.Lseal_sse_128_tail_hash: - cmpq $16,%rcx - jb .Lseal_sse_128_tail_xor - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - 
movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - subq $16,%rcx - leaq 16(%rdi),%rdi - jmp .Lseal_sse_128_tail_hash - -.Lseal_sse_128_tail_xor: - cmpq $16,%rbx - jb .Lseal_sse_tail_16 - subq $16,%rbx - - movdqu 0(%rsi),%xmm3 - pxor %xmm3,%xmm0 - movdqu %xmm0,0(%rdi) - - addq 0(%rdi),%r10 - adcq 8(%rdi),%r11 - adcq $1,%r12 - leaq 16(%rsi),%rsi - leaq 16(%rdi),%rdi - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - - movdqa %xmm4,%xmm0 - movdqa %xmm8,%xmm4 - movdqa %xmm12,%xmm8 - movdqa %xmm1,%xmm12 - movdqa %xmm5,%xmm1 - movdqa %xmm9,%xmm5 - movdqa %xmm13,%xmm9 - jmp .Lseal_sse_128_tail_xor - -.Lseal_sse_tail_16: - testq %rbx,%rbx - jz .Lprocess_blocks_of_extra_in - - movq %rbx,%r8 - movq %rbx,%rcx - leaq -1(%rsi,%rbx,1),%rsi - pxor %xmm15,%xmm15 -.Lseal_sse_tail_16_compose: - pslldq $1,%xmm15 - pinsrb $0,(%rsi),%xmm15 - leaq -1(%rsi),%rsi - decq %rcx - jne .Lseal_sse_tail_16_compose - - - pxor %xmm0,%xmm15 - - - movq %rbx,%rcx - movdqu %xmm15,%xmm0 -.Lseal_sse_tail_16_extract: - pextrb $0,%xmm0,(%rdi) - psrldq $1,%xmm0 - addq $1,%rdi - subq $1,%rcx - jnz .Lseal_sse_tail_16_extract - - - - - - - - - movq 288 + 0 + 32(%rsp),%r9 - movq 56(%r9),%r14 - movq 48(%r9),%r13 - testq %r14,%r14 - jz .Lprocess_partial_block - - movq $16,%r15 - subq 
%rbx,%r15 - cmpq %r15,%r14 - - jge .Lload_extra_in - movq %r14,%r15 - -.Lload_extra_in: - - - leaq -1(%r13,%r15,1),%rsi - - - addq %r15,%r13 - subq %r15,%r14 - movq %r13,48(%r9) - movq %r14,56(%r9) - - - - addq %r15,%r8 - - - pxor %xmm11,%xmm11 -.Lload_extra_load_loop: - pslldq $1,%xmm11 - pinsrb $0,(%rsi),%xmm11 - leaq -1(%rsi),%rsi - subq $1,%r15 - jnz .Lload_extra_load_loop - - - - - movq %rbx,%r15 - -.Lload_extra_shift_loop: - pslldq $1,%xmm11 - subq $1,%r15 - jnz .Lload_extra_shift_loop - - - - - leaq .Land_masks(%rip),%r15 - shlq $4,%rbx - pand -16(%r15,%rbx,1),%xmm15 - - - por %xmm11,%xmm15 - - - -.byte 102,77,15,126,253 - pextrq $1,%xmm15,%r14 - addq %r13,%r10 - adcq %r14,%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - -.Lprocess_blocks_of_extra_in: - - movq 288+32+0 (%rsp),%r9 - movq 48(%r9),%rsi - movq 56(%r9),%r8 - movq %r8,%rcx - shrq $4,%r8 - -.Lprocess_extra_hash_loop: - jz process_extra_in_trailer - addq 0+0(%rsi),%r10 - adcq 8+0(%rsi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - 
movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 16(%rsi),%rsi - subq $1,%r8 - jmp .Lprocess_extra_hash_loop -process_extra_in_trailer: - andq $15,%rcx - movq %rcx,%rbx - jz .Ldo_length_block - leaq -1(%rsi,%rcx,1),%rsi - -.Lprocess_extra_in_trailer_load: - pslldq $1,%xmm15 - pinsrb $0,(%rsi),%xmm15 - leaq -1(%rsi),%rsi - subq $1,%rcx - jnz .Lprocess_extra_in_trailer_load - -.Lprocess_partial_block: - - leaq .Land_masks(%rip),%r15 - shlq $4,%rbx - pand -16(%r15,%rbx,1),%xmm15 -.byte 102,77,15,126,253 - pextrq $1,%xmm15,%r14 - addq %r13,%r10 - adcq %r14,%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - -.Ldo_length_block: - addq 0+0+32(%rbp),%r10 - adcq 8+0+32(%rbp),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq 
$2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - - movq %r10,%r13 - movq %r11,%r14 - movq %r12,%r15 - subq $-5,%r10 - sbbq $-1,%r11 - sbbq $3,%r12 - cmovcq %r13,%r10 - cmovcq %r14,%r11 - cmovcq %r15,%r12 - - addq 0+0+16(%rbp),%r10 - adcq 8+0+16(%rbp),%r11 - -.cfi_remember_state - addq $288 + 0 + 32,%rsp -.cfi_adjust_cfa_offset -(288 + 32) - - popq %r9 -.cfi_adjust_cfa_offset -8 -.cfi_restore %r9 - movq %r10,(%r9) - movq %r11,8(%r9) - popq %r15 -.cfi_adjust_cfa_offset -8 -.cfi_restore %r15 - popq %r14 -.cfi_adjust_cfa_offset -8 -.cfi_restore %r14 - popq %r13 -.cfi_adjust_cfa_offset -8 -.cfi_restore %r13 - popq %r12 -.cfi_adjust_cfa_offset -8 -.cfi_restore %r12 - popq %rbx -.cfi_adjust_cfa_offset -8 -.cfi_restore %rbx - popq %rbp -.cfi_adjust_cfa_offset -8 -.cfi_restore %rbp - .byte 0xf3,0xc3 - -.Lseal_sse_128: -.cfi_restore_state - movdqu .Lchacha20_consts(%rip),%xmm0 - movdqa %xmm0,%xmm1 - movdqa %xmm0,%xmm2 - movdqu 0(%r9),%xmm4 - movdqa %xmm4,%xmm5 - movdqa %xmm4,%xmm6 - movdqu 16(%r9),%xmm8 - movdqa %xmm8,%xmm9 - movdqa %xmm8,%xmm10 - movdqu 32(%r9),%xmm14 - movdqa %xmm14,%xmm12 - paddd .Lsse_inc(%rip),%xmm12 - movdqa %xmm12,%xmm13 - paddd .Lsse_inc(%rip),%xmm13 - movdqa %xmm4,%xmm7 - movdqa %xmm8,%xmm11 - movdqa %xmm12,%xmm15 - movq $10,%r10 - -.Lseal_sse_128_rounds: - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb .Lrol16(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm4 - pxor %xmm3,%xmm4 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb .Lrol8(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm4 - pxor %xmm3,%xmm4 -.byte 102,15,58,15,228,4 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,12 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb .Lrol16(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm5 - pxor %xmm3,%xmm5 - paddd %xmm5,%xmm1 - 
pxor %xmm1,%xmm13 - pshufb .Lrol8(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm5 - pxor %xmm3,%xmm5 -.byte 102,15,58,15,237,4 -.byte 102,69,15,58,15,201,8 -.byte 102,69,15,58,15,237,12 - paddd %xmm6,%xmm2 - pxor %xmm2,%xmm14 - pshufb .Lrol16(%rip),%xmm14 - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm6 - pxor %xmm3,%xmm6 - paddd %xmm6,%xmm2 - pxor %xmm2,%xmm14 - pshufb .Lrol8(%rip),%xmm14 - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm6 - pxor %xmm3,%xmm6 -.byte 102,15,58,15,246,4 -.byte 102,69,15,58,15,210,8 -.byte 102,69,15,58,15,246,12 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb .Lrol16(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm4 - pxor %xmm3,%xmm4 - paddd %xmm4,%xmm0 - pxor %xmm0,%xmm12 - pshufb .Lrol8(%rip),%xmm12 - paddd %xmm12,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm4 - pxor %xmm3,%xmm4 -.byte 102,15,58,15,228,12 -.byte 102,69,15,58,15,192,8 -.byte 102,69,15,58,15,228,4 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb .Lrol16(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm5 - pxor %xmm3,%xmm5 - paddd %xmm5,%xmm1 - pxor %xmm1,%xmm13 - pshufb .Lrol8(%rip),%xmm13 - paddd %xmm13,%xmm9 - pxor %xmm9,%xmm5 - movdqa %xmm5,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm5 - pxor %xmm3,%xmm5 -.byte 102,15,58,15,237,12 -.byte 102,69,15,58,15,201,8 -.byte 102,69,15,58,15,237,4 - paddd %xmm6,%xmm2 - pxor %xmm2,%xmm14 - pshufb .Lrol16(%rip),%xmm14 - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm3 - pslld $12,%xmm3 - psrld $20,%xmm6 - pxor %xmm3,%xmm6 - paddd %xmm6,%xmm2 - pxor %xmm2,%xmm14 - pshufb .Lrol8(%rip),%xmm14 - paddd %xmm14,%xmm10 - pxor %xmm10,%xmm6 - movdqa %xmm6,%xmm3 - pslld $7,%xmm3 - psrld $25,%xmm6 - pxor %xmm3,%xmm6 -.byte 
102,15,58,15,246,12 -.byte 102,69,15,58,15,210,8 -.byte 102,69,15,58,15,246,4 - - decq %r10 - jnz .Lseal_sse_128_rounds - paddd .Lchacha20_consts(%rip),%xmm0 - paddd .Lchacha20_consts(%rip),%xmm1 - paddd .Lchacha20_consts(%rip),%xmm2 - paddd %xmm7,%xmm4 - paddd %xmm7,%xmm5 - paddd %xmm7,%xmm6 - paddd %xmm11,%xmm8 - paddd %xmm11,%xmm9 - paddd %xmm15,%xmm12 - paddd .Lsse_inc(%rip),%xmm15 - paddd %xmm15,%xmm13 - - pand .Lclamp(%rip),%xmm2 - movdqa %xmm2,0+0(%rbp) - movdqa %xmm6,0+16(%rbp) - - movq %r8,%r8 - call poly_hash_ad_internal - jmp .Lseal_sse_128_tail_xor -.size chacha20_poly1305_seal, .-chacha20_poly1305_seal -.cfi_endproc - - -.type chacha20_poly1305_open_avx2,@function -.align 64 -chacha20_poly1305_open_avx2: -.cfi_startproc - - -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r9,-64 -.cfi_adjust_cfa_offset 288 + 32 - - vzeroupper - vmovdqa .Lchacha20_consts(%rip),%ymm0 - vbroadcasti128 0(%r9),%ymm4 - vbroadcasti128 16(%r9),%ymm8 - vbroadcasti128 32(%r9),%ymm12 - vpaddd .Lavx2_init(%rip),%ymm12,%ymm12 - cmpq $192,%rbx - jbe .Lopen_avx2_192 - cmpq $320,%rbx - jbe .Lopen_avx2_320 - - vmovdqa %ymm4,0+64(%rbp) - vmovdqa %ymm8,0+96(%rbp) - vmovdqa %ymm12,0+160(%rbp) - movq $10,%r10 -.Lopen_avx2_init_rounds: - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $20,%ymm4,%ymm3 - vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $12,%ymm12,%ymm12,%ymm12 - vpalignr 
$8,%ymm8,%ymm8,%ymm8 - vpalignr $4,%ymm4,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $20,%ymm4,%ymm3 - vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $4,%ymm12,%ymm12,%ymm12 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $12,%ymm4,%ymm4,%ymm4 - - decq %r10 - jne .Lopen_avx2_init_rounds - vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 - vpaddd 0+64(%rbp),%ymm4,%ymm4 - vpaddd 0+96(%rbp),%ymm8,%ymm8 - vpaddd 0+160(%rbp),%ymm12,%ymm12 - - vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 - - vpand .Lclamp(%rip),%ymm3,%ymm3 - vmovdqa %ymm3,0+0(%rbp) - - vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 - vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 - - movq %r8,%r8 - call poly_hash_ad_internal - - xorq %rcx,%rcx -.Lopen_avx2_init_hash: - addq 0+0(%rsi,%rcx,1),%r10 - adcq 8+0(%rsi,%rcx,1),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - addq $16,%rcx - cmpq $64,%rcx - jne .Lopen_avx2_init_hash - - vpxor 0(%rsi),%ymm0,%ymm0 - vpxor 32(%rsi),%ymm4,%ymm4 - - vmovdqu %ymm0,0(%rdi) - vmovdqu %ymm4,32(%rdi) - leaq 64(%rsi),%rsi - leaq 64(%rdi),%rdi - subq $64,%rbx 
-.Lopen_avx2_main_loop: - - cmpq $512,%rbx - jb .Lopen_avx2_main_loop_done - vmovdqa .Lchacha20_consts(%rip),%ymm0 - vmovdqa 0+64(%rbp),%ymm4 - vmovdqa 0+96(%rbp),%ymm8 - vmovdqa %ymm0,%ymm1 - vmovdqa %ymm4,%ymm5 - vmovdqa %ymm8,%ymm9 - vmovdqa %ymm0,%ymm2 - vmovdqa %ymm4,%ymm6 - vmovdqa %ymm8,%ymm10 - vmovdqa %ymm0,%ymm3 - vmovdqa %ymm4,%ymm7 - vmovdqa %ymm8,%ymm11 - vmovdqa .Lavx2_inc(%rip),%ymm12 - vpaddd 0+160(%rbp),%ymm12,%ymm15 - vpaddd %ymm15,%ymm12,%ymm14 - vpaddd %ymm14,%ymm12,%ymm13 - vpaddd %ymm13,%ymm12,%ymm12 - vmovdqa %ymm15,0+256(%rbp) - vmovdqa %ymm14,0+224(%rbp) - vmovdqa %ymm13,0+192(%rbp) - vmovdqa %ymm12,0+160(%rbp) - - xorq %rcx,%rcx -.Lopen_avx2_main_loop_rounds: - addq 0+0(%rsi,%rcx,1),%r10 - adcq 8+0(%rsi,%rcx,1),%r11 - adcq $1,%r12 - vmovdqa %ymm8,0+128(%rbp) - vmovdqa .Lrol16(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $20,%ymm7,%ymm8 - vpslld $32-20,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $20,%ymm6,%ymm8 - vpslld $32-20,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $20,%ymm5,%ymm8 - vpslld $32-20,%ymm5,%ymm5 - addq %rax,%r15 - adcq %rdx,%r9 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $20,%ymm4,%ymm8 - vpslld 
$32-20,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa .Lrol8(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - addq 0+16(%rsi,%rcx,1),%r10 - adcq 8+16(%rsi,%rcx,1),%r11 - adcq $1,%r12 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $25,%ymm7,%ymm8 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - vpslld $32-25,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $25,%ymm6,%ymm8 - vpslld $32-25,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $25,%ymm5,%ymm8 - vpslld $32-25,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $25,%ymm4,%ymm8 - vpslld $32-25,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa 0+128(%rbp),%ymm8 - vpalignr $4,%ymm7,%ymm7,%ymm7 - vpalignr $8,%ymm11,%ymm11,%ymm11 - vpalignr $12,%ymm15,%ymm15,%ymm15 - vpalignr $4,%ymm6,%ymm6,%ymm6 - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $12,%ymm14,%ymm14,%ymm14 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - vpalignr $4,%ymm5,%ymm5,%ymm5 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $12,%ymm13,%ymm13,%ymm13 - vpalignr $4,%ymm4,%ymm4,%ymm4 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $12,%ymm12,%ymm12,%ymm12 
- vmovdqa %ymm8,0+128(%rbp) - vmovdqa .Lrol16(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - addq %rax,%r15 - adcq %rdx,%r9 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $20,%ymm7,%ymm8 - vpslld $32-20,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $20,%ymm6,%ymm8 - vpslld $32-20,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - addq 0+32(%rsi,%rcx,1),%r10 - adcq 8+32(%rsi,%rcx,1),%r11 - adcq $1,%r12 - - leaq 48(%rcx),%rcx - vpsrld $20,%ymm5,%ymm8 - vpslld $32-20,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $20,%ymm4,%ymm8 - vpslld $32-20,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa .Lrol8(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor 
%ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $25,%ymm7,%ymm8 - vpslld $32-25,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $25,%ymm6,%ymm8 - vpslld $32-25,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - addq %rax,%r15 - adcq %rdx,%r9 - vpsrld $25,%ymm5,%ymm8 - vpslld $32-25,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $25,%ymm4,%ymm8 - vpslld $32-25,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa 0+128(%rbp),%ymm8 - vpalignr $12,%ymm7,%ymm7,%ymm7 - vpalignr $8,%ymm11,%ymm11,%ymm11 - vpalignr $4,%ymm15,%ymm15,%ymm15 - vpalignr $12,%ymm6,%ymm6,%ymm6 - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $4,%ymm14,%ymm14,%ymm14 - vpalignr $12,%ymm5,%ymm5,%ymm5 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $4,%ymm13,%ymm13,%ymm13 - vpalignr $12,%ymm4,%ymm4,%ymm4 - vpalignr $8,%ymm8,%ymm8,%ymm8 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - vpalignr $4,%ymm12,%ymm12,%ymm12 - - cmpq $60*8,%rcx - jne .Lopen_avx2_main_loop_rounds - vpaddd .Lchacha20_consts(%rip),%ymm3,%ymm3 - vpaddd 0+64(%rbp),%ymm7,%ymm7 - vpaddd 0+96(%rbp),%ymm11,%ymm11 - vpaddd 0+256(%rbp),%ymm15,%ymm15 - vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 - vpaddd 0+64(%rbp),%ymm6,%ymm6 - vpaddd 0+96(%rbp),%ymm10,%ymm10 - vpaddd 0+224(%rbp),%ymm14,%ymm14 - vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 - vpaddd 0+64(%rbp),%ymm5,%ymm5 - vpaddd 0+96(%rbp),%ymm9,%ymm9 - vpaddd 0+192(%rbp),%ymm13,%ymm13 - vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 - vpaddd 0+64(%rbp),%ymm4,%ymm4 - vpaddd 0+96(%rbp),%ymm8,%ymm8 - vpaddd 0+160(%rbp),%ymm12,%ymm12 - - vmovdqa %ymm0,0+128(%rbp) - addq 0+60*8(%rsi),%r10 - adcq 8+60*8(%rsi),%r11 - adcq $1,%r12 - vperm2i128 
$0x02,%ymm3,%ymm7,%ymm0 - vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 - vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 - vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 - vpxor 0+0(%rsi),%ymm0,%ymm0 - vpxor 32+0(%rsi),%ymm3,%ymm3 - vpxor 64+0(%rsi),%ymm7,%ymm7 - vpxor 96+0(%rsi),%ymm11,%ymm11 - vmovdqu %ymm0,0+0(%rdi) - vmovdqu %ymm3,32+0(%rdi) - vmovdqu %ymm7,64+0(%rdi) - vmovdqu %ymm11,96+0(%rdi) - - vmovdqa 0+128(%rbp),%ymm0 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 - vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 - vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 - vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 - vpxor 0+128(%rsi),%ymm3,%ymm3 - vpxor 32+128(%rsi),%ymm2,%ymm2 - vpxor 64+128(%rsi),%ymm6,%ymm6 - vpxor 96+128(%rsi),%ymm10,%ymm10 - vmovdqu %ymm3,0+128(%rdi) - vmovdqu %ymm2,32+128(%rdi) - vmovdqu %ymm6,64+128(%rdi) - vmovdqu %ymm10,96+128(%rdi) - addq 0+60*8+16(%rsi),%r10 - adcq 8+60*8+16(%rsi),%r11 - adcq $1,%r12 - vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 - vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 - vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 - vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 - vpxor 0+256(%rsi),%ymm3,%ymm3 - vpxor 32+256(%rsi),%ymm1,%ymm1 - vpxor 64+256(%rsi),%ymm5,%ymm5 - vpxor 96+256(%rsi),%ymm9,%ymm9 - vmovdqu %ymm3,0+256(%rdi) - vmovdqu %ymm1,32+256(%rdi) - vmovdqu %ymm5,64+256(%rdi) - vmovdqu %ymm9,96+256(%rdi) - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 
0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 - vperm2i128 $0x13,%ymm0,%ymm4,%ymm4 - vperm2i128 $0x02,%ymm8,%ymm12,%ymm0 - vperm2i128 $0x13,%ymm8,%ymm12,%ymm8 - vpxor 0+384(%rsi),%ymm3,%ymm3 - vpxor 32+384(%rsi),%ymm0,%ymm0 - vpxor 64+384(%rsi),%ymm4,%ymm4 - vpxor 96+384(%rsi),%ymm8,%ymm8 - vmovdqu %ymm3,0+384(%rdi) - vmovdqu %ymm0,32+384(%rdi) - vmovdqu %ymm4,64+384(%rdi) - vmovdqu %ymm8,96+384(%rdi) - - leaq 512(%rsi),%rsi - leaq 512(%rdi),%rdi - subq $512,%rbx - jmp .Lopen_avx2_main_loop -.Lopen_avx2_main_loop_done: - testq %rbx,%rbx - vzeroupper - je .Lopen_sse_finalize - - cmpq $384,%rbx - ja .Lopen_avx2_tail_512 - cmpq $256,%rbx - ja .Lopen_avx2_tail_384 - cmpq $128,%rbx - ja .Lopen_avx2_tail_256 - vmovdqa .Lchacha20_consts(%rip),%ymm0 - vmovdqa 0+64(%rbp),%ymm4 - vmovdqa 0+96(%rbp),%ymm8 - vmovdqa .Lavx2_inc(%rip),%ymm12 - vpaddd 0+160(%rbp),%ymm12,%ymm12 - vmovdqa %ymm12,0+160(%rbp) - - xorq %r8,%r8 - movq %rbx,%rcx - andq $-16,%rcx - testq %rcx,%rcx - je .Lopen_avx2_tail_128_rounds -.Lopen_avx2_tail_128_rounds_and_x1hash: - addq 0+0(%rsi,%r8,1),%r10 - adcq 8+0(%rsi,%r8,1),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 
- addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - -.Lopen_avx2_tail_128_rounds: - addq $16,%r8 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $20,%ymm4,%ymm3 - vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $12,%ymm12,%ymm12,%ymm12 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $4,%ymm4,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $20,%ymm4,%ymm3 - vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $4,%ymm12,%ymm12,%ymm12 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $12,%ymm4,%ymm4,%ymm4 - - cmpq %rcx,%r8 - jb .Lopen_avx2_tail_128_rounds_and_x1hash - cmpq $160,%r8 - jne .Lopen_avx2_tail_128_rounds - vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 - vpaddd 0+64(%rbp),%ymm4,%ymm4 - vpaddd 0+96(%rbp),%ymm8,%ymm8 - vpaddd 0+160(%rbp),%ymm12,%ymm12 - vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 - vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 - vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 - vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 - vmovdqa %ymm3,%ymm8 - - jmp .Lopen_avx2_tail_128_xor - -.Lopen_avx2_tail_256: - vmovdqa .Lchacha20_consts(%rip),%ymm0 - vmovdqa 0+64(%rbp),%ymm4 - vmovdqa 0+96(%rbp),%ymm8 - vmovdqa 
%ymm0,%ymm1 - vmovdqa %ymm4,%ymm5 - vmovdqa %ymm8,%ymm9 - vmovdqa .Lavx2_inc(%rip),%ymm12 - vpaddd 0+160(%rbp),%ymm12,%ymm13 - vpaddd %ymm13,%ymm12,%ymm12 - vmovdqa %ymm12,0+160(%rbp) - vmovdqa %ymm13,0+192(%rbp) - - movq %rbx,0+128(%rbp) - movq %rbx,%rcx - subq $128,%rcx - shrq $4,%rcx - movq $10,%r8 - cmpq $10,%rcx - cmovgq %r8,%rcx - movq %rsi,%rbx - xorq %r8,%r8 -.Lopen_avx2_tail_256_rounds_and_x1hash: - addq 0+0(%rbx),%r10 - adcq 8+0(%rbx),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - addq %rax,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 16(%rbx),%rbx -.Lopen_avx2_tail_256_rounds: - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $20,%ymm4,%ymm3 - vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $12,%ymm12,%ymm12,%ymm12 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $4,%ymm4,%ymm4,%ymm4 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb .Lrol16(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpsrld $20,%ymm5,%ymm3 - vpslld $12,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb .Lrol8(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm3 - 
vpsrld $25,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpalignr $12,%ymm13,%ymm13,%ymm13 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $4,%ymm5,%ymm5,%ymm5 - - incq %r8 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $20,%ymm4,%ymm3 - vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $4,%ymm12,%ymm12,%ymm12 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $12,%ymm4,%ymm4,%ymm4 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb .Lrol16(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpsrld $20,%ymm5,%ymm3 - vpslld $12,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb .Lrol8(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm3 - vpsrld $25,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpalignr $4,%ymm13,%ymm13,%ymm13 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $12,%ymm5,%ymm5,%ymm5 - vpaddd %ymm6,%ymm2,%ymm2 - vpxor %ymm2,%ymm14,%ymm14 - vpshufb .Lrol16(%rip),%ymm14,%ymm14 - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpsrld $20,%ymm6,%ymm3 - vpslld $12,%ymm6,%ymm6 - vpxor %ymm3,%ymm6,%ymm6 - vpaddd %ymm6,%ymm2,%ymm2 - vpxor %ymm2,%ymm14,%ymm14 - vpshufb .Lrol8(%rip),%ymm14,%ymm14 - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpslld $7,%ymm6,%ymm3 - vpsrld $25,%ymm6,%ymm6 - vpxor %ymm3,%ymm6,%ymm6 - vpalignr $4,%ymm14,%ymm14,%ymm14 - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $12,%ymm6,%ymm6,%ymm6 - - cmpq %rcx,%r8 - jb .Lopen_avx2_tail_256_rounds_and_x1hash - cmpq $10,%r8 - jne .Lopen_avx2_tail_256_rounds - movq %rbx,%r8 - subq %rsi,%rbx - movq %rbx,%rcx - movq 0+128(%rbp),%rbx 
-.Lopen_avx2_tail_256_hash: - addq $16,%rcx - cmpq %rbx,%rcx - jg .Lopen_avx2_tail_256_done - addq 0+0(%r8),%r10 - adcq 8+0(%r8),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - addq %rax,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 16(%r8),%r8 - jmp .Lopen_avx2_tail_256_hash -.Lopen_avx2_tail_256_done: - vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 - vpaddd 0+64(%rbp),%ymm5,%ymm5 - vpaddd 0+96(%rbp),%ymm9,%ymm9 - vpaddd 0+192(%rbp),%ymm13,%ymm13 - vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 - vpaddd 0+64(%rbp),%ymm4,%ymm4 - vpaddd 0+96(%rbp),%ymm8,%ymm8 - vpaddd 0+160(%rbp),%ymm12,%ymm12 - vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 - vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 - vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 - vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 - vpxor 0+0(%rsi),%ymm3,%ymm3 - vpxor 32+0(%rsi),%ymm1,%ymm1 - vpxor 64+0(%rsi),%ymm5,%ymm5 - vpxor 96+0(%rsi),%ymm9,%ymm9 - vmovdqu %ymm3,0+0(%rdi) - vmovdqu %ymm1,32+0(%rdi) - vmovdqu %ymm5,64+0(%rdi) - vmovdqu %ymm9,96+0(%rdi) - vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 - vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 - vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 - vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 - vmovdqa %ymm3,%ymm8 - - leaq 128(%rsi),%rsi - leaq 128(%rdi),%rdi - subq $128,%rbx - jmp .Lopen_avx2_tail_128_xor - -.Lopen_avx2_tail_384: - vmovdqa .Lchacha20_consts(%rip),%ymm0 - vmovdqa 0+64(%rbp),%ymm4 - vmovdqa 0+96(%rbp),%ymm8 - vmovdqa %ymm0,%ymm1 - vmovdqa %ymm4,%ymm5 - vmovdqa %ymm8,%ymm9 - vmovdqa %ymm0,%ymm2 - vmovdqa %ymm4,%ymm6 - vmovdqa %ymm8,%ymm10 - vmovdqa .Lavx2_inc(%rip),%ymm12 - vpaddd 
0+160(%rbp),%ymm12,%ymm14 - vpaddd %ymm14,%ymm12,%ymm13 - vpaddd %ymm13,%ymm12,%ymm12 - vmovdqa %ymm12,0+160(%rbp) - vmovdqa %ymm13,0+192(%rbp) - vmovdqa %ymm14,0+224(%rbp) - - movq %rbx,0+128(%rbp) - movq %rbx,%rcx - subq $256,%rcx - shrq $4,%rcx - addq $6,%rcx - movq $10,%r8 - cmpq $10,%rcx - cmovgq %r8,%rcx - movq %rsi,%rbx - xorq %r8,%r8 -.Lopen_avx2_tail_384_rounds_and_x2hash: - addq 0+0(%rbx),%r10 - adcq 8+0(%rbx),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - addq %rax,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 16(%rbx),%rbx -.Lopen_avx2_tail_384_rounds_and_x1hash: - vpaddd %ymm6,%ymm2,%ymm2 - vpxor %ymm2,%ymm14,%ymm14 - vpshufb .Lrol16(%rip),%ymm14,%ymm14 - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpsrld $20,%ymm6,%ymm3 - vpslld $12,%ymm6,%ymm6 - vpxor %ymm3,%ymm6,%ymm6 - vpaddd %ymm6,%ymm2,%ymm2 - vpxor %ymm2,%ymm14,%ymm14 - vpshufb .Lrol8(%rip),%ymm14,%ymm14 - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpslld $7,%ymm6,%ymm3 - vpsrld $25,%ymm6,%ymm6 - vpxor %ymm3,%ymm6,%ymm6 - vpalignr $12,%ymm14,%ymm14,%ymm14 - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $4,%ymm6,%ymm6,%ymm6 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb .Lrol16(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpsrld $20,%ymm5,%ymm3 - vpslld $12,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb .Lrol8(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm3 - vpsrld 
$25,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpalignr $12,%ymm13,%ymm13,%ymm13 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $4,%ymm5,%ymm5,%ymm5 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $20,%ymm4,%ymm3 - vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $12,%ymm12,%ymm12,%ymm12 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $4,%ymm4,%ymm4,%ymm4 - addq 0+0(%rbx),%r10 - adcq 8+0(%rbx),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 16(%rbx),%rbx - incq %r8 - vpaddd %ymm6,%ymm2,%ymm2 - vpxor %ymm2,%ymm14,%ymm14 - vpshufb .Lrol16(%rip),%ymm14,%ymm14 - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpsrld $20,%ymm6,%ymm3 - vpslld $12,%ymm6,%ymm6 - vpxor %ymm3,%ymm6,%ymm6 - vpaddd %ymm6,%ymm2,%ymm2 - vpxor %ymm2,%ymm14,%ymm14 - vpshufb .Lrol8(%rip),%ymm14,%ymm14 - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpslld $7,%ymm6,%ymm3 - vpsrld $25,%ymm6,%ymm6 - vpxor %ymm3,%ymm6,%ymm6 - vpalignr $4,%ymm14,%ymm14,%ymm14 - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $12,%ymm6,%ymm6,%ymm6 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor 
%ymm1,%ymm13,%ymm13 - vpshufb .Lrol16(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpsrld $20,%ymm5,%ymm3 - vpslld $12,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb .Lrol8(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm3 - vpsrld $25,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpalignr $4,%ymm13,%ymm13,%ymm13 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $12,%ymm5,%ymm5,%ymm5 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $20,%ymm4,%ymm3 - vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $4,%ymm12,%ymm12,%ymm12 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $12,%ymm4,%ymm4,%ymm4 - - cmpq %rcx,%r8 - jb .Lopen_avx2_tail_384_rounds_and_x2hash - cmpq $10,%r8 - jne .Lopen_avx2_tail_384_rounds_and_x1hash - movq %rbx,%r8 - subq %rsi,%rbx - movq %rbx,%rcx - movq 0+128(%rbp),%rbx -.Lopen_avx2_384_tail_hash: - addq $16,%rcx - cmpq %rbx,%rcx - jg .Lopen_avx2_384_tail_done - addq 0+0(%r8),%r10 - adcq 8+0(%r8),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - addq %rax,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 16(%r8),%r8 - jmp .Lopen_avx2_384_tail_hash 
-.Lopen_avx2_384_tail_done: - vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 - vpaddd 0+64(%rbp),%ymm6,%ymm6 - vpaddd 0+96(%rbp),%ymm10,%ymm10 - vpaddd 0+224(%rbp),%ymm14,%ymm14 - vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 - vpaddd 0+64(%rbp),%ymm5,%ymm5 - vpaddd 0+96(%rbp),%ymm9,%ymm9 - vpaddd 0+192(%rbp),%ymm13,%ymm13 - vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 - vpaddd 0+64(%rbp),%ymm4,%ymm4 - vpaddd 0+96(%rbp),%ymm8,%ymm8 - vpaddd 0+160(%rbp),%ymm12,%ymm12 - vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 - vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 - vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 - vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 - vpxor 0+0(%rsi),%ymm3,%ymm3 - vpxor 32+0(%rsi),%ymm2,%ymm2 - vpxor 64+0(%rsi),%ymm6,%ymm6 - vpxor 96+0(%rsi),%ymm10,%ymm10 - vmovdqu %ymm3,0+0(%rdi) - vmovdqu %ymm2,32+0(%rdi) - vmovdqu %ymm6,64+0(%rdi) - vmovdqu %ymm10,96+0(%rdi) - vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 - vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 - vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 - vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 - vpxor 0+128(%rsi),%ymm3,%ymm3 - vpxor 32+128(%rsi),%ymm1,%ymm1 - vpxor 64+128(%rsi),%ymm5,%ymm5 - vpxor 96+128(%rsi),%ymm9,%ymm9 - vmovdqu %ymm3,0+128(%rdi) - vmovdqu %ymm1,32+128(%rdi) - vmovdqu %ymm5,64+128(%rdi) - vmovdqu %ymm9,96+128(%rdi) - vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 - vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 - vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 - vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 - vmovdqa %ymm3,%ymm8 - - leaq 256(%rsi),%rsi - leaq 256(%rdi),%rdi - subq $256,%rbx - jmp .Lopen_avx2_tail_128_xor - -.Lopen_avx2_tail_512: - vmovdqa .Lchacha20_consts(%rip),%ymm0 - vmovdqa 0+64(%rbp),%ymm4 - vmovdqa 0+96(%rbp),%ymm8 - vmovdqa %ymm0,%ymm1 - vmovdqa %ymm4,%ymm5 - vmovdqa %ymm8,%ymm9 - vmovdqa %ymm0,%ymm2 - vmovdqa %ymm4,%ymm6 - vmovdqa %ymm8,%ymm10 - vmovdqa %ymm0,%ymm3 - vmovdqa %ymm4,%ymm7 - vmovdqa %ymm8,%ymm11 - vmovdqa .Lavx2_inc(%rip),%ymm12 - vpaddd 0+160(%rbp),%ymm12,%ymm15 - vpaddd %ymm15,%ymm12,%ymm14 - vpaddd %ymm14,%ymm12,%ymm13 - vpaddd %ymm13,%ymm12,%ymm12 - vmovdqa 
%ymm15,0+256(%rbp) - vmovdqa %ymm14,0+224(%rbp) - vmovdqa %ymm13,0+192(%rbp) - vmovdqa %ymm12,0+160(%rbp) - - xorq %rcx,%rcx - movq %rsi,%r8 -.Lopen_avx2_tail_512_rounds_and_x2hash: - addq 0+0(%r8),%r10 - adcq 8+0(%r8),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 16(%r8),%r8 -.Lopen_avx2_tail_512_rounds_and_x1hash: - vmovdqa %ymm8,0+128(%rbp) - vmovdqa .Lrol16(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $20,%ymm7,%ymm8 - vpslld $32-20,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $20,%ymm6,%ymm8 - vpslld $32-20,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $20,%ymm5,%ymm8 - vpslld $32-20,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $20,%ymm4,%ymm8 - vpslld $32-20,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa .Lrol8(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - addq 0+0(%r8),%r10 - adcq 8+0(%r8),%r11 - adcq 
$1,%r12 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - addq %rax,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $25,%ymm7,%ymm8 - vpslld $32-25,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $25,%ymm6,%ymm8 - vpslld $32-25,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $25,%ymm5,%ymm8 - vpslld $32-25,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $25,%ymm4,%ymm8 - vpslld $32-25,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa 0+128(%rbp),%ymm8 - vpalignr $4,%ymm7,%ymm7,%ymm7 - vpalignr $8,%ymm11,%ymm11,%ymm11 - vpalignr $12,%ymm15,%ymm15,%ymm15 - vpalignr $4,%ymm6,%ymm6,%ymm6 - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $12,%ymm14,%ymm14,%ymm14 - vpalignr $4,%ymm5,%ymm5,%ymm5 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $12,%ymm13,%ymm13,%ymm13 - vpalignr $4,%ymm4,%ymm4,%ymm4 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $12,%ymm12,%ymm12,%ymm12 - vmovdqa %ymm8,0+128(%rbp) - vmovdqa .Lrol16(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - addq 0+16(%r8),%r10 - adcq 8+16(%r8),%r11 - adcq 
$1,%r12 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - addq %rax,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 32(%r8),%r8 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $20,%ymm7,%ymm8 - vpslld $32-20,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $20,%ymm6,%ymm8 - vpslld $32-20,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $20,%ymm5,%ymm8 - vpslld $32-20,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $20,%ymm4,%ymm8 - vpslld $32-20,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa .Lrol8(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - 
vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $25,%ymm7,%ymm8 - vpslld $32-25,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $25,%ymm6,%ymm8 - vpslld $32-25,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $25,%ymm5,%ymm8 - vpslld $32-25,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $25,%ymm4,%ymm8 - vpslld $32-25,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa 0+128(%rbp),%ymm8 - vpalignr $12,%ymm7,%ymm7,%ymm7 - vpalignr $8,%ymm11,%ymm11,%ymm11 - vpalignr $4,%ymm15,%ymm15,%ymm15 - vpalignr $12,%ymm6,%ymm6,%ymm6 - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $4,%ymm14,%ymm14,%ymm14 - vpalignr $12,%ymm5,%ymm5,%ymm5 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $4,%ymm13,%ymm13,%ymm13 - vpalignr $12,%ymm4,%ymm4,%ymm4 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $4,%ymm12,%ymm12,%ymm12 - - incq %rcx - cmpq $4,%rcx - jl .Lopen_avx2_tail_512_rounds_and_x2hash - cmpq $10,%rcx - jne .Lopen_avx2_tail_512_rounds_and_x1hash - movq %rbx,%rcx - subq $384,%rcx - andq $-16,%rcx -.Lopen_avx2_tail_512_hash: - testq %rcx,%rcx - je .Lopen_avx2_tail_512_done - addq 0+0(%r8),%r10 - adcq 8+0(%r8),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - addq %rax,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 16(%r8),%r8 - subq $16,%rcx - jmp .Lopen_avx2_tail_512_hash -.Lopen_avx2_tail_512_done: - vpaddd .Lchacha20_consts(%rip),%ymm3,%ymm3 - vpaddd 0+64(%rbp),%ymm7,%ymm7 - vpaddd 0+96(%rbp),%ymm11,%ymm11 - vpaddd 0+256(%rbp),%ymm15,%ymm15 - vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 - vpaddd 
0+64(%rbp),%ymm6,%ymm6 - vpaddd 0+96(%rbp),%ymm10,%ymm10 - vpaddd 0+224(%rbp),%ymm14,%ymm14 - vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 - vpaddd 0+64(%rbp),%ymm5,%ymm5 - vpaddd 0+96(%rbp),%ymm9,%ymm9 - vpaddd 0+192(%rbp),%ymm13,%ymm13 - vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 - vpaddd 0+64(%rbp),%ymm4,%ymm4 - vpaddd 0+96(%rbp),%ymm8,%ymm8 - vpaddd 0+160(%rbp),%ymm12,%ymm12 - - vmovdqa %ymm0,0+128(%rbp) - vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 - vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 - vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 - vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 - vpxor 0+0(%rsi),%ymm0,%ymm0 - vpxor 32+0(%rsi),%ymm3,%ymm3 - vpxor 64+0(%rsi),%ymm7,%ymm7 - vpxor 96+0(%rsi),%ymm11,%ymm11 - vmovdqu %ymm0,0+0(%rdi) - vmovdqu %ymm3,32+0(%rdi) - vmovdqu %ymm7,64+0(%rdi) - vmovdqu %ymm11,96+0(%rdi) - - vmovdqa 0+128(%rbp),%ymm0 - vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 - vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 - vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 - vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 - vpxor 0+128(%rsi),%ymm3,%ymm3 - vpxor 32+128(%rsi),%ymm2,%ymm2 - vpxor 64+128(%rsi),%ymm6,%ymm6 - vpxor 96+128(%rsi),%ymm10,%ymm10 - vmovdqu %ymm3,0+128(%rdi) - vmovdqu %ymm2,32+128(%rdi) - vmovdqu %ymm6,64+128(%rdi) - vmovdqu %ymm10,96+128(%rdi) - vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 - vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 - vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 - vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 - vpxor 0+256(%rsi),%ymm3,%ymm3 - vpxor 32+256(%rsi),%ymm1,%ymm1 - vpxor 64+256(%rsi),%ymm5,%ymm5 - vpxor 96+256(%rsi),%ymm9,%ymm9 - vmovdqu %ymm3,0+256(%rdi) - vmovdqu %ymm1,32+256(%rdi) - vmovdqu %ymm5,64+256(%rdi) - vmovdqu %ymm9,96+256(%rdi) - vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 - vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 - vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 - vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 - vmovdqa %ymm3,%ymm8 - - leaq 384(%rsi),%rsi - leaq 384(%rdi),%rdi - subq $384,%rbx -.Lopen_avx2_tail_128_xor: - cmpq $32,%rbx - jb .Lopen_avx2_tail_32_xor - subq $32,%rbx - vpxor (%rsi),%ymm0,%ymm0 - vmovdqu %ymm0,(%rdi) - 
leaq 32(%rsi),%rsi - leaq 32(%rdi),%rdi - vmovdqa %ymm4,%ymm0 - vmovdqa %ymm8,%ymm4 - vmovdqa %ymm12,%ymm8 - jmp .Lopen_avx2_tail_128_xor -.Lopen_avx2_tail_32_xor: - cmpq $16,%rbx - vmovdqa %xmm0,%xmm1 - jb .Lopen_avx2_exit - subq $16,%rbx - - vpxor (%rsi),%xmm0,%xmm1 - vmovdqu %xmm1,(%rdi) - leaq 16(%rsi),%rsi - leaq 16(%rdi),%rdi - vperm2i128 $0x11,%ymm0,%ymm0,%ymm0 - vmovdqa %xmm0,%xmm1 -.Lopen_avx2_exit: - vzeroupper - jmp .Lopen_sse_tail_16 - -.Lopen_avx2_192: - vmovdqa %ymm0,%ymm1 - vmovdqa %ymm0,%ymm2 - vmovdqa %ymm4,%ymm5 - vmovdqa %ymm4,%ymm6 - vmovdqa %ymm8,%ymm9 - vmovdqa %ymm8,%ymm10 - vpaddd .Lavx2_inc(%rip),%ymm12,%ymm13 - vmovdqa %ymm12,%ymm11 - vmovdqa %ymm13,%ymm15 - movq $10,%r10 -.Lopen_avx2_192_rounds: - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $20,%ymm4,%ymm3 - vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $12,%ymm12,%ymm12,%ymm12 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $4,%ymm4,%ymm4,%ymm4 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb .Lrol16(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpsrld $20,%ymm5,%ymm3 - vpslld $12,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb .Lrol8(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm3 - vpsrld $25,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpalignr $12,%ymm13,%ymm13,%ymm13 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $4,%ymm5,%ymm5,%ymm5 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $20,%ymm4,%ymm3 - 
vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $4,%ymm12,%ymm12,%ymm12 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $12,%ymm4,%ymm4,%ymm4 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb .Lrol16(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpsrld $20,%ymm5,%ymm3 - vpslld $12,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb .Lrol8(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm3 - vpsrld $25,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpalignr $4,%ymm13,%ymm13,%ymm13 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $12,%ymm5,%ymm5,%ymm5 - - decq %r10 - jne .Lopen_avx2_192_rounds - vpaddd %ymm2,%ymm0,%ymm0 - vpaddd %ymm2,%ymm1,%ymm1 - vpaddd %ymm6,%ymm4,%ymm4 - vpaddd %ymm6,%ymm5,%ymm5 - vpaddd %ymm10,%ymm8,%ymm8 - vpaddd %ymm10,%ymm9,%ymm9 - vpaddd %ymm11,%ymm12,%ymm12 - vpaddd %ymm15,%ymm13,%ymm13 - vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 - - vpand .Lclamp(%rip),%ymm3,%ymm3 - vmovdqa %ymm3,0+0(%rbp) - - vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 - vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 - vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 - vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 - vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 - vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 -.Lopen_avx2_short: - movq %r8,%r8 - call poly_hash_ad_internal -.Lopen_avx2_short_hash_and_xor_loop: - cmpq $32,%rbx - jb .Lopen_avx2_short_tail_32 - subq $32,%rbx - addq 0+0(%rsi),%r10 - adcq 8+0(%rsi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq 
%rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - addq 0+16(%rsi),%r10 - adcq 8+16(%rsi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - - vpxor (%rsi),%ymm0,%ymm0 - vmovdqu %ymm0,(%rdi) - leaq 32(%rsi),%rsi - leaq 32(%rdi),%rdi - - vmovdqa %ymm4,%ymm0 - vmovdqa %ymm8,%ymm4 - vmovdqa %ymm12,%ymm8 - vmovdqa %ymm1,%ymm12 - vmovdqa %ymm5,%ymm1 - vmovdqa %ymm9,%ymm5 - vmovdqa %ymm13,%ymm9 - vmovdqa %ymm2,%ymm13 - vmovdqa %ymm6,%ymm2 - jmp .Lopen_avx2_short_hash_and_xor_loop -.Lopen_avx2_short_tail_32: - cmpq $16,%rbx - vmovdqa %xmm0,%xmm1 - jb .Lopen_avx2_short_tail_32_exit - subq $16,%rbx - addq 0+0(%rsi),%r10 - adcq 8+0(%rsi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq 
%r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - vpxor (%rsi),%xmm0,%xmm3 - vmovdqu %xmm3,(%rdi) - leaq 16(%rsi),%rsi - leaq 16(%rdi),%rdi - vextracti128 $1,%ymm0,%xmm1 -.Lopen_avx2_short_tail_32_exit: - vzeroupper - jmp .Lopen_sse_tail_16 - -.Lopen_avx2_320: - vmovdqa %ymm0,%ymm1 - vmovdqa %ymm0,%ymm2 - vmovdqa %ymm4,%ymm5 - vmovdqa %ymm4,%ymm6 - vmovdqa %ymm8,%ymm9 - vmovdqa %ymm8,%ymm10 - vpaddd .Lavx2_inc(%rip),%ymm12,%ymm13 - vpaddd .Lavx2_inc(%rip),%ymm13,%ymm14 - vmovdqa %ymm4,%ymm7 - vmovdqa %ymm8,%ymm11 - vmovdqa %ymm12,0+160(%rbp) - vmovdqa %ymm13,0+192(%rbp) - vmovdqa %ymm14,0+224(%rbp) - movq $10,%r10 -.Lopen_avx2_320_rounds: - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $20,%ymm4,%ymm3 - vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $12,%ymm12,%ymm12,%ymm12 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $4,%ymm4,%ymm4,%ymm4 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb .Lrol16(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpsrld $20,%ymm5,%ymm3 - vpslld $12,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb .Lrol8(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm3 - vpsrld $25,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpalignr $12,%ymm13,%ymm13,%ymm13 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $4,%ymm5,%ymm5,%ymm5 - vpaddd %ymm6,%ymm2,%ymm2 - vpxor %ymm2,%ymm14,%ymm14 - vpshufb .Lrol16(%rip),%ymm14,%ymm14 - vpaddd 
%ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpsrld $20,%ymm6,%ymm3 - vpslld $12,%ymm6,%ymm6 - vpxor %ymm3,%ymm6,%ymm6 - vpaddd %ymm6,%ymm2,%ymm2 - vpxor %ymm2,%ymm14,%ymm14 - vpshufb .Lrol8(%rip),%ymm14,%ymm14 - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpslld $7,%ymm6,%ymm3 - vpsrld $25,%ymm6,%ymm6 - vpxor %ymm3,%ymm6,%ymm6 - vpalignr $12,%ymm14,%ymm14,%ymm14 - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $4,%ymm6,%ymm6,%ymm6 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $20,%ymm4,%ymm3 - vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $4,%ymm12,%ymm12,%ymm12 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $12,%ymm4,%ymm4,%ymm4 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb .Lrol16(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpsrld $20,%ymm5,%ymm3 - vpslld $12,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb .Lrol8(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm3 - vpsrld $25,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpalignr $4,%ymm13,%ymm13,%ymm13 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $12,%ymm5,%ymm5,%ymm5 - vpaddd %ymm6,%ymm2,%ymm2 - vpxor %ymm2,%ymm14,%ymm14 - vpshufb .Lrol16(%rip),%ymm14,%ymm14 - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpsrld $20,%ymm6,%ymm3 - vpslld $12,%ymm6,%ymm6 - vpxor %ymm3,%ymm6,%ymm6 - vpaddd %ymm6,%ymm2,%ymm2 - vpxor %ymm2,%ymm14,%ymm14 - vpshufb .Lrol8(%rip),%ymm14,%ymm14 - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpslld $7,%ymm6,%ymm3 - vpsrld $25,%ymm6,%ymm6 - vpxor %ymm3,%ymm6,%ymm6 - vpalignr 
$4,%ymm14,%ymm14,%ymm14 - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $12,%ymm6,%ymm6,%ymm6 - - decq %r10 - jne .Lopen_avx2_320_rounds - vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 - vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 - vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 - vpaddd %ymm7,%ymm4,%ymm4 - vpaddd %ymm7,%ymm5,%ymm5 - vpaddd %ymm7,%ymm6,%ymm6 - vpaddd %ymm11,%ymm8,%ymm8 - vpaddd %ymm11,%ymm9,%ymm9 - vpaddd %ymm11,%ymm10,%ymm10 - vpaddd 0+160(%rbp),%ymm12,%ymm12 - vpaddd 0+192(%rbp),%ymm13,%ymm13 - vpaddd 0+224(%rbp),%ymm14,%ymm14 - vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 - - vpand .Lclamp(%rip),%ymm3,%ymm3 - vmovdqa %ymm3,0+0(%rbp) - - vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 - vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 - vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 - vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 - vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 - vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 - vperm2i128 $0x02,%ymm2,%ymm6,%ymm9 - vperm2i128 $0x02,%ymm10,%ymm14,%ymm13 - vperm2i128 $0x13,%ymm2,%ymm6,%ymm2 - vperm2i128 $0x13,%ymm10,%ymm14,%ymm6 - jmp .Lopen_avx2_short -.size chacha20_poly1305_open_avx2, .-chacha20_poly1305_open_avx2 -.cfi_endproc - - -.type chacha20_poly1305_seal_avx2,@function -.align 64 -chacha20_poly1305_seal_avx2: -.cfi_startproc - - -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r9,-64 -.cfi_adjust_cfa_offset 288 + 32 - - vzeroupper - vmovdqa .Lchacha20_consts(%rip),%ymm0 - vbroadcasti128 0(%r9),%ymm4 - vbroadcasti128 16(%r9),%ymm8 - vbroadcasti128 32(%r9),%ymm12 - vpaddd .Lavx2_init(%rip),%ymm12,%ymm12 - cmpq $192,%rbx - jbe .Lseal_avx2_192 - cmpq $320,%rbx - jbe .Lseal_avx2_320 - vmovdqa %ymm0,%ymm1 - vmovdqa %ymm0,%ymm2 - vmovdqa %ymm0,%ymm3 - vmovdqa %ymm4,%ymm5 - vmovdqa %ymm4,%ymm6 - vmovdqa 
%ymm4,%ymm7 - vmovdqa %ymm4,0+64(%rbp) - vmovdqa %ymm8,%ymm9 - vmovdqa %ymm8,%ymm10 - vmovdqa %ymm8,%ymm11 - vmovdqa %ymm8,0+96(%rbp) - vmovdqa %ymm12,%ymm15 - vpaddd .Lavx2_inc(%rip),%ymm15,%ymm14 - vpaddd .Lavx2_inc(%rip),%ymm14,%ymm13 - vpaddd .Lavx2_inc(%rip),%ymm13,%ymm12 - vmovdqa %ymm12,0+160(%rbp) - vmovdqa %ymm13,0+192(%rbp) - vmovdqa %ymm14,0+224(%rbp) - vmovdqa %ymm15,0+256(%rbp) - movq $10,%r10 -.Lseal_avx2_init_rounds: - vmovdqa %ymm8,0+128(%rbp) - vmovdqa .Lrol16(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $20,%ymm7,%ymm8 - vpslld $32-20,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $20,%ymm6,%ymm8 - vpslld $32-20,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $20,%ymm5,%ymm8 - vpslld $32-20,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $20,%ymm4,%ymm8 - vpslld $32-20,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa .Lrol8(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - vpxor 
%ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $25,%ymm7,%ymm8 - vpslld $32-25,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $25,%ymm6,%ymm8 - vpslld $32-25,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $25,%ymm5,%ymm8 - vpslld $32-25,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $25,%ymm4,%ymm8 - vpslld $32-25,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa 0+128(%rbp),%ymm8 - vpalignr $4,%ymm7,%ymm7,%ymm7 - vpalignr $8,%ymm11,%ymm11,%ymm11 - vpalignr $12,%ymm15,%ymm15,%ymm15 - vpalignr $4,%ymm6,%ymm6,%ymm6 - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $12,%ymm14,%ymm14,%ymm14 - vpalignr $4,%ymm5,%ymm5,%ymm5 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $12,%ymm13,%ymm13,%ymm13 - vpalignr $4,%ymm4,%ymm4,%ymm4 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $12,%ymm12,%ymm12,%ymm12 - vmovdqa %ymm8,0+128(%rbp) - vmovdqa .Lrol16(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $20,%ymm7,%ymm8 - vpslld $32-20,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $20,%ymm6,%ymm8 - vpslld $32-20,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $20,%ymm5,%ymm8 - vpslld $32-20,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $20,%ymm4,%ymm8 - vpslld $32-20,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa .Lrol8(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor 
%ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $25,%ymm7,%ymm8 - vpslld $32-25,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $25,%ymm6,%ymm8 - vpslld $32-25,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $25,%ymm5,%ymm8 - vpslld $32-25,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $25,%ymm4,%ymm8 - vpslld $32-25,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa 0+128(%rbp),%ymm8 - vpalignr $12,%ymm7,%ymm7,%ymm7 - vpalignr $8,%ymm11,%ymm11,%ymm11 - vpalignr $4,%ymm15,%ymm15,%ymm15 - vpalignr $12,%ymm6,%ymm6,%ymm6 - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $4,%ymm14,%ymm14,%ymm14 - vpalignr $12,%ymm5,%ymm5,%ymm5 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $4,%ymm13,%ymm13,%ymm13 - vpalignr $12,%ymm4,%ymm4,%ymm4 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $4,%ymm12,%ymm12,%ymm12 - - decq %r10 - jnz .Lseal_avx2_init_rounds - vpaddd .Lchacha20_consts(%rip),%ymm3,%ymm3 - vpaddd 0+64(%rbp),%ymm7,%ymm7 - vpaddd 0+96(%rbp),%ymm11,%ymm11 - vpaddd 0+256(%rbp),%ymm15,%ymm15 - vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 - vpaddd 0+64(%rbp),%ymm6,%ymm6 - vpaddd 0+96(%rbp),%ymm10,%ymm10 - vpaddd 0+224(%rbp),%ymm14,%ymm14 - vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 - vpaddd 0+64(%rbp),%ymm5,%ymm5 - vpaddd 0+96(%rbp),%ymm9,%ymm9 - vpaddd 0+192(%rbp),%ymm13,%ymm13 - vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 - vpaddd 0+64(%rbp),%ymm4,%ymm4 - vpaddd 0+96(%rbp),%ymm8,%ymm8 - vpaddd 0+160(%rbp),%ymm12,%ymm12 - - vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 - vperm2i128 $0x02,%ymm3,%ymm7,%ymm15 - vperm2i128 $0x13,%ymm3,%ymm7,%ymm3 - vpand .Lclamp(%rip),%ymm15,%ymm15 - vmovdqa %ymm15,0+0(%rbp) - movq %r8,%r8 - call poly_hash_ad_internal - - 
vpxor 0(%rsi),%ymm3,%ymm3 - vpxor 32(%rsi),%ymm11,%ymm11 - vmovdqu %ymm3,0(%rdi) - vmovdqu %ymm11,32(%rdi) - vperm2i128 $0x02,%ymm2,%ymm6,%ymm15 - vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 - vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 - vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 - vpxor 0+64(%rsi),%ymm15,%ymm15 - vpxor 32+64(%rsi),%ymm2,%ymm2 - vpxor 64+64(%rsi),%ymm6,%ymm6 - vpxor 96+64(%rsi),%ymm10,%ymm10 - vmovdqu %ymm15,0+64(%rdi) - vmovdqu %ymm2,32+64(%rdi) - vmovdqu %ymm6,64+64(%rdi) - vmovdqu %ymm10,96+64(%rdi) - vperm2i128 $0x02,%ymm1,%ymm5,%ymm15 - vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 - vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 - vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 - vpxor 0+192(%rsi),%ymm15,%ymm15 - vpxor 32+192(%rsi),%ymm1,%ymm1 - vpxor 64+192(%rsi),%ymm5,%ymm5 - vpxor 96+192(%rsi),%ymm9,%ymm9 - vmovdqu %ymm15,0+192(%rdi) - vmovdqu %ymm1,32+192(%rdi) - vmovdqu %ymm5,64+192(%rdi) - vmovdqu %ymm9,96+192(%rdi) - vperm2i128 $0x13,%ymm0,%ymm4,%ymm15 - vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 - vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 - vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 - vmovdqa %ymm15,%ymm8 - - leaq 320(%rsi),%rsi - subq $320,%rbx - movq $320,%rcx - cmpq $128,%rbx - jbe .Lseal_avx2_short_hash_remainder - vpxor 0(%rsi),%ymm0,%ymm0 - vpxor 32(%rsi),%ymm4,%ymm4 - vpxor 64(%rsi),%ymm8,%ymm8 - vpxor 96(%rsi),%ymm12,%ymm12 - vmovdqu %ymm0,320(%rdi) - vmovdqu %ymm4,352(%rdi) - vmovdqu %ymm8,384(%rdi) - vmovdqu %ymm12,416(%rdi) - leaq 128(%rsi),%rsi - subq $128,%rbx - movq $8,%rcx - movq $2,%r8 - cmpq $128,%rbx - jbe .Lseal_avx2_tail_128 - cmpq $256,%rbx - jbe .Lseal_avx2_tail_256 - cmpq $384,%rbx - jbe .Lseal_avx2_tail_384 - cmpq $512,%rbx - jbe .Lseal_avx2_tail_512 - vmovdqa .Lchacha20_consts(%rip),%ymm0 - vmovdqa 0+64(%rbp),%ymm4 - vmovdqa 0+96(%rbp),%ymm8 - vmovdqa %ymm0,%ymm1 - vmovdqa %ymm4,%ymm5 - vmovdqa %ymm8,%ymm9 - vmovdqa %ymm0,%ymm2 - vmovdqa %ymm4,%ymm6 - vmovdqa %ymm8,%ymm10 - vmovdqa %ymm0,%ymm3 - vmovdqa %ymm4,%ymm7 - vmovdqa %ymm8,%ymm11 - vmovdqa .Lavx2_inc(%rip),%ymm12 - vpaddd 
0+160(%rbp),%ymm12,%ymm15 - vpaddd %ymm15,%ymm12,%ymm14 - vpaddd %ymm14,%ymm12,%ymm13 - vpaddd %ymm13,%ymm12,%ymm12 - vmovdqa %ymm15,0+256(%rbp) - vmovdqa %ymm14,0+224(%rbp) - vmovdqa %ymm13,0+192(%rbp) - vmovdqa %ymm12,0+160(%rbp) - vmovdqa %ymm8,0+128(%rbp) - vmovdqa .Lrol16(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $20,%ymm7,%ymm8 - vpslld $32-20,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $20,%ymm6,%ymm8 - vpslld $32-20,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $20,%ymm5,%ymm8 - vpslld $32-20,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $20,%ymm4,%ymm8 - vpslld $32-20,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa .Lrol8(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $25,%ymm7,%ymm8 - vpslld $32-25,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $25,%ymm6,%ymm8 - vpslld $32-25,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld 
$25,%ymm5,%ymm8 - vpslld $32-25,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $25,%ymm4,%ymm8 - vpslld $32-25,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa 0+128(%rbp),%ymm8 - vpalignr $4,%ymm7,%ymm7,%ymm7 - vpalignr $8,%ymm11,%ymm11,%ymm11 - vpalignr $12,%ymm15,%ymm15,%ymm15 - vpalignr $4,%ymm6,%ymm6,%ymm6 - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $12,%ymm14,%ymm14,%ymm14 - vpalignr $4,%ymm5,%ymm5,%ymm5 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $12,%ymm13,%ymm13,%ymm13 - vpalignr $4,%ymm4,%ymm4,%ymm4 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $12,%ymm12,%ymm12,%ymm12 - vmovdqa %ymm8,0+128(%rbp) - vmovdqa .Lrol16(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $20,%ymm7,%ymm8 - vpslld $32-20,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $20,%ymm6,%ymm8 - vpslld $32-20,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $20,%ymm5,%ymm8 - vpslld $32-20,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $20,%ymm4,%ymm8 - vpslld $32-20,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa .Lrol8(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd 
%ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $25,%ymm7,%ymm8 - vpslld $32-25,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $25,%ymm6,%ymm8 - vpslld $32-25,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $25,%ymm5,%ymm8 - vpslld $32-25,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $25,%ymm4,%ymm8 - vpslld $32-25,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa 0+128(%rbp),%ymm8 - vpalignr $12,%ymm7,%ymm7,%ymm7 - vpalignr $8,%ymm11,%ymm11,%ymm11 - vpalignr $4,%ymm15,%ymm15,%ymm15 - vpalignr $12,%ymm6,%ymm6,%ymm6 - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $4,%ymm14,%ymm14,%ymm14 - vpalignr $12,%ymm5,%ymm5,%ymm5 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $4,%ymm13,%ymm13,%ymm13 - vpalignr $12,%ymm4,%ymm4,%ymm4 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $4,%ymm12,%ymm12,%ymm12 - vmovdqa %ymm8,0+128(%rbp) - vmovdqa .Lrol16(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $20,%ymm7,%ymm8 - vpslld $32-20,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $20,%ymm6,%ymm8 - vpslld $32-20,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $20,%ymm5,%ymm8 - vpslld $32-20,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $20,%ymm4,%ymm8 - vpslld $32-20,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa .Lrol8(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd 
%ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - - subq $16,%rdi - movq $9,%rcx - jmp .Lseal_avx2_main_loop_rounds_entry -.align 32 -.Lseal_avx2_main_loop: - vmovdqa .Lchacha20_consts(%rip),%ymm0 - vmovdqa 0+64(%rbp),%ymm4 - vmovdqa 0+96(%rbp),%ymm8 - vmovdqa %ymm0,%ymm1 - vmovdqa %ymm4,%ymm5 - vmovdqa %ymm8,%ymm9 - vmovdqa %ymm0,%ymm2 - vmovdqa %ymm4,%ymm6 - vmovdqa %ymm8,%ymm10 - vmovdqa %ymm0,%ymm3 - vmovdqa %ymm4,%ymm7 - vmovdqa %ymm8,%ymm11 - vmovdqa .Lavx2_inc(%rip),%ymm12 - vpaddd 0+160(%rbp),%ymm12,%ymm15 - vpaddd %ymm15,%ymm12,%ymm14 - vpaddd %ymm14,%ymm12,%ymm13 - vpaddd %ymm13,%ymm12,%ymm12 - vmovdqa %ymm15,0+256(%rbp) - vmovdqa %ymm14,0+224(%rbp) - vmovdqa %ymm13,0+192(%rbp) - vmovdqa %ymm12,0+160(%rbp) - - movq $10,%rcx -.align 32 -.Lseal_avx2_main_loop_rounds: - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - vmovdqa %ymm8,0+128(%rbp) - vmovdqa .Lrol16(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $20,%ymm7,%ymm8 - vpslld $32-20,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $20,%ymm6,%ymm8 - vpslld $32-20,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $20,%ymm5,%ymm8 - vpslld 
$32-20,%ymm5,%ymm5 - addq %rax,%r15 - adcq %rdx,%r9 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $20,%ymm4,%ymm8 - vpslld $32-20,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa .Lrol8(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - -.Lseal_avx2_main_loop_rounds_entry: - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - addq 0+16(%rdi),%r10 - adcq 8+16(%rdi),%r11 - adcq $1,%r12 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $25,%ymm7,%ymm8 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - vpslld $32-25,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $25,%ymm6,%ymm8 - vpslld $32-25,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $25,%ymm5,%ymm8 - vpslld $32-25,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $25,%ymm4,%ymm8 - vpslld $32-25,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa 0+128(%rbp),%ymm8 - vpalignr $4,%ymm7,%ymm7,%ymm7 - vpalignr $8,%ymm11,%ymm11,%ymm11 - vpalignr $12,%ymm15,%ymm15,%ymm15 - vpalignr $4,%ymm6,%ymm6,%ymm6 - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $12,%ymm14,%ymm14,%ymm14 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - vpalignr $4,%ymm5,%ymm5,%ymm5 - vpalignr $8,%ymm9,%ymm9,%ymm9 
- vpalignr $12,%ymm13,%ymm13,%ymm13 - vpalignr $4,%ymm4,%ymm4,%ymm4 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $12,%ymm12,%ymm12,%ymm12 - vmovdqa %ymm8,0+128(%rbp) - vmovdqa .Lrol16(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - addq %rax,%r15 - adcq %rdx,%r9 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $20,%ymm7,%ymm8 - vpslld $32-20,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $20,%ymm6,%ymm8 - vpslld $32-20,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - addq 0+32(%rdi),%r10 - adcq 8+32(%rdi),%r11 - adcq $1,%r12 - - leaq 48(%rdi),%rdi - vpsrld $20,%ymm5,%ymm8 - vpslld $32-20,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $20,%ymm4,%ymm8 - vpslld $32-20,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa .Lrol8(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd 
%ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $25,%ymm7,%ymm8 - vpslld $32-25,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $25,%ymm6,%ymm8 - vpslld $32-25,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - addq %rax,%r15 - adcq %rdx,%r9 - vpsrld $25,%ymm5,%ymm8 - vpslld $32-25,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $25,%ymm4,%ymm8 - vpslld $32-25,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa 0+128(%rbp),%ymm8 - vpalignr $12,%ymm7,%ymm7,%ymm7 - vpalignr $8,%ymm11,%ymm11,%ymm11 - vpalignr $4,%ymm15,%ymm15,%ymm15 - vpalignr $12,%ymm6,%ymm6,%ymm6 - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $4,%ymm14,%ymm14,%ymm14 - vpalignr $12,%ymm5,%ymm5,%ymm5 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $4,%ymm13,%ymm13,%ymm13 - vpalignr $12,%ymm4,%ymm4,%ymm4 - vpalignr $8,%ymm8,%ymm8,%ymm8 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - vpalignr $4,%ymm12,%ymm12,%ymm12 - - decq %rcx - jne .Lseal_avx2_main_loop_rounds - vpaddd .Lchacha20_consts(%rip),%ymm3,%ymm3 - vpaddd 0+64(%rbp),%ymm7,%ymm7 - vpaddd 0+96(%rbp),%ymm11,%ymm11 - vpaddd 0+256(%rbp),%ymm15,%ymm15 - vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 - vpaddd 0+64(%rbp),%ymm6,%ymm6 - vpaddd 0+96(%rbp),%ymm10,%ymm10 - vpaddd 0+224(%rbp),%ymm14,%ymm14 - vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 - vpaddd 0+64(%rbp),%ymm5,%ymm5 - vpaddd 0+96(%rbp),%ymm9,%ymm9 - vpaddd 0+192(%rbp),%ymm13,%ymm13 - vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 - vpaddd 0+64(%rbp),%ymm4,%ymm4 - vpaddd 0+96(%rbp),%ymm8,%ymm8 - vpaddd 0+160(%rbp),%ymm12,%ymm12 - - 
vmovdqa %ymm0,0+128(%rbp) - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - addq %rax,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - addq 0+16(%rdi),%r10 - adcq 8+16(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - addq %rax,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 32(%rdi),%rdi - vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 - vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 - vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 - vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 - vpxor 0+0(%rsi),%ymm0,%ymm0 - vpxor 32+0(%rsi),%ymm3,%ymm3 - vpxor 64+0(%rsi),%ymm7,%ymm7 - vpxor 96+0(%rsi),%ymm11,%ymm11 - vmovdqu %ymm0,0+0(%rdi) - vmovdqu %ymm3,32+0(%rdi) - vmovdqu %ymm7,64+0(%rdi) - vmovdqu %ymm11,96+0(%rdi) - - vmovdqa 0+128(%rbp),%ymm0 - vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 - vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 - vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 - vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 - vpxor 0+128(%rsi),%ymm3,%ymm3 - vpxor 32+128(%rsi),%ymm2,%ymm2 - vpxor 64+128(%rsi),%ymm6,%ymm6 - vpxor 96+128(%rsi),%ymm10,%ymm10 - vmovdqu %ymm3,0+128(%rdi) - vmovdqu 
%ymm2,32+128(%rdi) - vmovdqu %ymm6,64+128(%rdi) - vmovdqu %ymm10,96+128(%rdi) - vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 - vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 - vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 - vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 - vpxor 0+256(%rsi),%ymm3,%ymm3 - vpxor 32+256(%rsi),%ymm1,%ymm1 - vpxor 64+256(%rsi),%ymm5,%ymm5 - vpxor 96+256(%rsi),%ymm9,%ymm9 - vmovdqu %ymm3,0+256(%rdi) - vmovdqu %ymm1,32+256(%rdi) - vmovdqu %ymm5,64+256(%rdi) - vmovdqu %ymm9,96+256(%rdi) - vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 - vperm2i128 $0x13,%ymm0,%ymm4,%ymm4 - vperm2i128 $0x02,%ymm8,%ymm12,%ymm0 - vperm2i128 $0x13,%ymm8,%ymm12,%ymm8 - vpxor 0+384(%rsi),%ymm3,%ymm3 - vpxor 32+384(%rsi),%ymm0,%ymm0 - vpxor 64+384(%rsi),%ymm4,%ymm4 - vpxor 96+384(%rsi),%ymm8,%ymm8 - vmovdqu %ymm3,0+384(%rdi) - vmovdqu %ymm0,32+384(%rdi) - vmovdqu %ymm4,64+384(%rdi) - vmovdqu %ymm8,96+384(%rdi) - - leaq 512(%rsi),%rsi - subq $512,%rbx - cmpq $512,%rbx - jg .Lseal_avx2_main_loop - - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - addq %rax,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - addq 0+16(%rdi),%r10 - adcq 8+16(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - addq %rax,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq 
%r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 32(%rdi),%rdi - movq $10,%rcx - xorq %r8,%r8 - - cmpq $384,%rbx - ja .Lseal_avx2_tail_512 - cmpq $256,%rbx - ja .Lseal_avx2_tail_384 - cmpq $128,%rbx - ja .Lseal_avx2_tail_256 - -.Lseal_avx2_tail_128: - vmovdqa .Lchacha20_consts(%rip),%ymm0 - vmovdqa 0+64(%rbp),%ymm4 - vmovdqa 0+96(%rbp),%ymm8 - vmovdqa .Lavx2_inc(%rip),%ymm12 - vpaddd 0+160(%rbp),%ymm12,%ymm12 - vmovdqa %ymm12,0+160(%rbp) - -.Lseal_avx2_tail_128_rounds_and_3xhash: - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - addq %rax,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 16(%rdi),%rdi -.Lseal_avx2_tail_128_rounds_and_2xhash: - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $20,%ymm4,%ymm3 - vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $12,%ymm12,%ymm12,%ymm12 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $4,%ymm4,%ymm4,%ymm4 - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq 
%rdx,%r15 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - addq %rax,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $20,%ymm4,%ymm3 - vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $4,%ymm12,%ymm12,%ymm12 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $12,%ymm4,%ymm4,%ymm4 - addq 0+16(%rdi),%r10 - adcq 8+16(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - addq %rax,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 32(%rdi),%rdi - decq %rcx - jg .Lseal_avx2_tail_128_rounds_and_3xhash - decq %r8 - jge .Lseal_avx2_tail_128_rounds_and_2xhash - vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 - vpaddd 0+64(%rbp),%ymm4,%ymm4 - vpaddd 0+96(%rbp),%ymm8,%ymm8 - vpaddd 0+160(%rbp),%ymm12,%ymm12 - vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 - vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 - vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 - vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 - vmovdqa 
%ymm3,%ymm8 - - jmp .Lseal_avx2_short_loop - -.Lseal_avx2_tail_256: - vmovdqa .Lchacha20_consts(%rip),%ymm0 - vmovdqa 0+64(%rbp),%ymm4 - vmovdqa 0+96(%rbp),%ymm8 - vmovdqa %ymm0,%ymm1 - vmovdqa %ymm4,%ymm5 - vmovdqa %ymm8,%ymm9 - vmovdqa .Lavx2_inc(%rip),%ymm12 - vpaddd 0+160(%rbp),%ymm12,%ymm13 - vpaddd %ymm13,%ymm12,%ymm12 - vmovdqa %ymm12,0+160(%rbp) - vmovdqa %ymm13,0+192(%rbp) - -.Lseal_avx2_tail_256_rounds_and_3xhash: - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 16(%rdi),%rdi -.Lseal_avx2_tail_256_rounds_and_2xhash: - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $20,%ymm4,%ymm3 - vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $12,%ymm12,%ymm12,%ymm12 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $4,%ymm4,%ymm4,%ymm4 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb .Lrol16(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpsrld $20,%ymm5,%ymm3 - vpslld $12,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - 
vpshufb .Lrol8(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm3 - vpsrld $25,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpalignr $12,%ymm13,%ymm13,%ymm13 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $4,%ymm5,%ymm5,%ymm5 - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $20,%ymm4,%ymm3 - vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $4,%ymm12,%ymm12,%ymm12 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $12,%ymm4,%ymm4,%ymm4 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb .Lrol16(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpsrld $20,%ymm5,%ymm3 - vpslld $12,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb .Lrol8(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm3 - vpsrld $25,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpalignr $4,%ymm13,%ymm13,%ymm13 - vpalignr 
$8,%ymm9,%ymm9,%ymm9 - vpalignr $12,%ymm5,%ymm5,%ymm5 - addq 0+16(%rdi),%r10 - adcq 8+16(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 32(%rdi),%rdi - decq %rcx - jg .Lseal_avx2_tail_256_rounds_and_3xhash - decq %r8 - jge .Lseal_avx2_tail_256_rounds_and_2xhash - vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 - vpaddd 0+64(%rbp),%ymm5,%ymm5 - vpaddd 0+96(%rbp),%ymm9,%ymm9 - vpaddd 0+192(%rbp),%ymm13,%ymm13 - vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 - vpaddd 0+64(%rbp),%ymm4,%ymm4 - vpaddd 0+96(%rbp),%ymm8,%ymm8 - vpaddd 0+160(%rbp),%ymm12,%ymm12 - vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 - vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 - vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 - vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 - vpxor 0+0(%rsi),%ymm3,%ymm3 - vpxor 32+0(%rsi),%ymm1,%ymm1 - vpxor 64+0(%rsi),%ymm5,%ymm5 - vpxor 96+0(%rsi),%ymm9,%ymm9 - vmovdqu %ymm3,0+0(%rdi) - vmovdqu %ymm1,32+0(%rdi) - vmovdqu %ymm5,64+0(%rdi) - vmovdqu %ymm9,96+0(%rdi) - vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 - vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 - vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 - vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 - vmovdqa %ymm3,%ymm8 - - movq $128,%rcx - leaq 128(%rsi),%rsi - subq $128,%rbx - jmp .Lseal_avx2_short_hash_remainder - -.Lseal_avx2_tail_384: - vmovdqa .Lchacha20_consts(%rip),%ymm0 - vmovdqa 0+64(%rbp),%ymm4 - vmovdqa 0+96(%rbp),%ymm8 - vmovdqa %ymm0,%ymm1 - vmovdqa %ymm4,%ymm5 - vmovdqa %ymm8,%ymm9 - 
vmovdqa %ymm0,%ymm2 - vmovdqa %ymm4,%ymm6 - vmovdqa %ymm8,%ymm10 - vmovdqa .Lavx2_inc(%rip),%ymm12 - vpaddd 0+160(%rbp),%ymm12,%ymm14 - vpaddd %ymm14,%ymm12,%ymm13 - vpaddd %ymm13,%ymm12,%ymm12 - vmovdqa %ymm12,0+160(%rbp) - vmovdqa %ymm13,0+192(%rbp) - vmovdqa %ymm14,0+224(%rbp) - -.Lseal_avx2_tail_384_rounds_and_3xhash: - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 16(%rdi),%rdi -.Lseal_avx2_tail_384_rounds_and_2xhash: - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $20,%ymm4,%ymm3 - vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $12,%ymm12,%ymm12,%ymm12 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $4,%ymm4,%ymm4,%ymm4 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb .Lrol16(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpsrld $20,%ymm5,%ymm3 - vpslld $12,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb .Lrol8(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld 
$7,%ymm5,%ymm3 - vpsrld $25,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpalignr $12,%ymm13,%ymm13,%ymm13 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $4,%ymm5,%ymm5,%ymm5 - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - vpaddd %ymm6,%ymm2,%ymm2 - vpxor %ymm2,%ymm14,%ymm14 - vpshufb .Lrol16(%rip),%ymm14,%ymm14 - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpsrld $20,%ymm6,%ymm3 - vpslld $12,%ymm6,%ymm6 - vpxor %ymm3,%ymm6,%ymm6 - vpaddd %ymm6,%ymm2,%ymm2 - vpxor %ymm2,%ymm14,%ymm14 - vpshufb .Lrol8(%rip),%ymm14,%ymm14 - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpslld $7,%ymm6,%ymm3 - vpsrld $25,%ymm6,%ymm6 - vpxor %ymm3,%ymm6,%ymm6 - vpalignr $12,%ymm14,%ymm14,%ymm14 - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $4,%ymm6,%ymm6,%ymm6 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $20,%ymm4,%ymm3 - vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $4,%ymm12,%ymm12,%ymm12 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $12,%ymm4,%ymm4,%ymm4 - addq 0+16(%rdi),%r10 - adcq 8+16(%rdi),%r11 - adcq 
$1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb .Lrol16(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpsrld $20,%ymm5,%ymm3 - vpslld $12,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb .Lrol8(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm3 - vpsrld $25,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpalignr $4,%ymm13,%ymm13,%ymm13 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $12,%ymm5,%ymm5,%ymm5 - vpaddd %ymm6,%ymm2,%ymm2 - vpxor %ymm2,%ymm14,%ymm14 - vpshufb .Lrol16(%rip),%ymm14,%ymm14 - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpsrld $20,%ymm6,%ymm3 - vpslld $12,%ymm6,%ymm6 - vpxor %ymm3,%ymm6,%ymm6 - vpaddd %ymm6,%ymm2,%ymm2 - vpxor %ymm2,%ymm14,%ymm14 - vpshufb .Lrol8(%rip),%ymm14,%ymm14 - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpslld $7,%ymm6,%ymm3 - vpsrld $25,%ymm6,%ymm6 - vpxor %ymm3,%ymm6,%ymm6 - vpalignr $4,%ymm14,%ymm14,%ymm14 - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $12,%ymm6,%ymm6,%ymm6 - - leaq 32(%rdi),%rdi - decq %rcx - jg .Lseal_avx2_tail_384_rounds_and_3xhash - decq %r8 - jge .Lseal_avx2_tail_384_rounds_and_2xhash - vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 - vpaddd 0+64(%rbp),%ymm6,%ymm6 - vpaddd 0+96(%rbp),%ymm10,%ymm10 - vpaddd 
0+224(%rbp),%ymm14,%ymm14 - vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 - vpaddd 0+64(%rbp),%ymm5,%ymm5 - vpaddd 0+96(%rbp),%ymm9,%ymm9 - vpaddd 0+192(%rbp),%ymm13,%ymm13 - vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 - vpaddd 0+64(%rbp),%ymm4,%ymm4 - vpaddd 0+96(%rbp),%ymm8,%ymm8 - vpaddd 0+160(%rbp),%ymm12,%ymm12 - vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 - vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 - vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 - vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 - vpxor 0+0(%rsi),%ymm3,%ymm3 - vpxor 32+0(%rsi),%ymm2,%ymm2 - vpxor 64+0(%rsi),%ymm6,%ymm6 - vpxor 96+0(%rsi),%ymm10,%ymm10 - vmovdqu %ymm3,0+0(%rdi) - vmovdqu %ymm2,32+0(%rdi) - vmovdqu %ymm6,64+0(%rdi) - vmovdqu %ymm10,96+0(%rdi) - vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 - vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 - vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 - vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 - vpxor 0+128(%rsi),%ymm3,%ymm3 - vpxor 32+128(%rsi),%ymm1,%ymm1 - vpxor 64+128(%rsi),%ymm5,%ymm5 - vpxor 96+128(%rsi),%ymm9,%ymm9 - vmovdqu %ymm3,0+128(%rdi) - vmovdqu %ymm1,32+128(%rdi) - vmovdqu %ymm5,64+128(%rdi) - vmovdqu %ymm9,96+128(%rdi) - vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 - vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 - vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 - vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 - vmovdqa %ymm3,%ymm8 - - movq $256,%rcx - leaq 256(%rsi),%rsi - subq $256,%rbx - jmp .Lseal_avx2_short_hash_remainder - -.Lseal_avx2_tail_512: - vmovdqa .Lchacha20_consts(%rip),%ymm0 - vmovdqa 0+64(%rbp),%ymm4 - vmovdqa 0+96(%rbp),%ymm8 - vmovdqa %ymm0,%ymm1 - vmovdqa %ymm4,%ymm5 - vmovdqa %ymm8,%ymm9 - vmovdqa %ymm0,%ymm2 - vmovdqa %ymm4,%ymm6 - vmovdqa %ymm8,%ymm10 - vmovdqa %ymm0,%ymm3 - vmovdqa %ymm4,%ymm7 - vmovdqa %ymm8,%ymm11 - vmovdqa .Lavx2_inc(%rip),%ymm12 - vpaddd 0+160(%rbp),%ymm12,%ymm15 - vpaddd %ymm15,%ymm12,%ymm14 - vpaddd %ymm14,%ymm12,%ymm13 - vpaddd %ymm13,%ymm12,%ymm12 - vmovdqa %ymm15,0+256(%rbp) - vmovdqa %ymm14,0+224(%rbp) - vmovdqa %ymm13,0+192(%rbp) - vmovdqa %ymm12,0+160(%rbp) - 
-.Lseal_avx2_tail_512_rounds_and_3xhash: - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - addq %rax,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 16(%rdi),%rdi -.Lseal_avx2_tail_512_rounds_and_2xhash: - vmovdqa %ymm8,0+128(%rbp) - vmovdqa .Lrol16(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $20,%ymm7,%ymm8 - vpslld $32-20,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $20,%ymm6,%ymm8 - vpslld $32-20,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $20,%ymm5,%ymm8 - vpslld $32-20,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $20,%ymm4,%ymm8 - vpslld $32-20,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa .Lrol8(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq 
%rdx,%r15 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $25,%ymm7,%ymm8 - vpslld $32-25,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - vpsrld $25,%ymm6,%ymm8 - vpslld $32-25,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $25,%ymm5,%ymm8 - vpslld $32-25,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $25,%ymm4,%ymm8 - vpslld $32-25,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa 0+128(%rbp),%ymm8 - vpalignr $4,%ymm7,%ymm7,%ymm7 - vpalignr $8,%ymm11,%ymm11,%ymm11 - vpalignr $12,%ymm15,%ymm15,%ymm15 - vpalignr $4,%ymm6,%ymm6,%ymm6 - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $12,%ymm14,%ymm14,%ymm14 - vpalignr $4,%ymm5,%ymm5,%ymm5 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $12,%ymm13,%ymm13,%ymm13 - vpalignr $4,%ymm4,%ymm4,%ymm4 - addq %rax,%r15 - adcq %rdx,%r9 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $12,%ymm12,%ymm12,%ymm12 - vmovdqa %ymm8,0+128(%rbp) - vmovdqa .Lrol16(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq 
$3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $20,%ymm7,%ymm8 - vpslld $32-20,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $20,%ymm6,%ymm8 - vpslld $32-20,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $20,%ymm5,%ymm8 - vpslld $32-20,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $20,%ymm4,%ymm8 - vpslld $32-20,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa .Lrol8(%rip),%ymm8 - vpaddd %ymm7,%ymm3,%ymm3 - vpaddd %ymm6,%ymm2,%ymm2 - addq 0+16(%rdi),%r10 - adcq 8+16(%rdi),%r11 - adcq $1,%r12 - vpaddd %ymm5,%ymm1,%ymm1 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm3,%ymm15,%ymm15 - vpxor %ymm2,%ymm14,%ymm14 - vpxor %ymm1,%ymm13,%ymm13 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb %ymm8,%ymm15,%ymm15 - vpshufb %ymm8,%ymm14,%ymm14 - vpshufb %ymm8,%ymm13,%ymm13 - vpshufb %ymm8,%ymm12,%ymm12 - vpaddd %ymm15,%ymm11,%ymm11 - vpaddd %ymm14,%ymm10,%ymm10 - vpaddd %ymm13,%ymm9,%ymm9 - vpaddd 0+128(%rbp),%ymm12,%ymm8 - vpxor %ymm11,%ymm7,%ymm7 - vpxor %ymm10,%ymm6,%ymm6 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa %ymm8,0+128(%rbp) - vpsrld $25,%ymm7,%ymm8 - movq 0+0+0(%rbp),%rdx - movq %rdx,%r15 - mulxq %r10,%r13,%r14 - mulxq %r11,%rax,%rdx - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - vpslld $32-25,%ymm7,%ymm7 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $25,%ymm6,%ymm8 - vpslld $32-25,%ymm6,%ymm6 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $25,%ymm5,%ymm8 - vpslld $32-25,%ymm5,%ymm5 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $25,%ymm4,%ymm8 - vpslld $32-25,%ymm4,%ymm4 - vpxor %ymm8,%ymm4,%ymm4 - vmovdqa 0+128(%rbp),%ymm8 - vpalignr $12,%ymm7,%ymm7,%ymm7 - vpalignr $8,%ymm11,%ymm11,%ymm11 - vpalignr $4,%ymm15,%ymm15,%ymm15 - vpalignr $12,%ymm6,%ymm6,%ymm6 - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $4,%ymm14,%ymm14,%ymm14 - 
vpalignr $12,%ymm5,%ymm5,%ymm5 - vpalignr $8,%ymm9,%ymm9,%ymm9 - movq 8+0+0(%rbp),%rdx - mulxq %r10,%r10,%rax - addq %r10,%r14 - mulxq %r11,%r11,%r9 - adcq %r11,%r15 - adcq $0,%r9 - imulq %r12,%rdx - vpalignr $4,%ymm13,%ymm13,%ymm13 - vpalignr $12,%ymm4,%ymm4,%ymm4 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $4,%ymm12,%ymm12,%ymm12 - - - - - - - - - - - - - - - - - addq %rax,%r15 - adcq %rdx,%r9 - - - - - - - - - - - - - - - - - - - - - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 32(%rdi),%rdi - decq %rcx - jg .Lseal_avx2_tail_512_rounds_and_3xhash - decq %r8 - jge .Lseal_avx2_tail_512_rounds_and_2xhash - vpaddd .Lchacha20_consts(%rip),%ymm3,%ymm3 - vpaddd 0+64(%rbp),%ymm7,%ymm7 - vpaddd 0+96(%rbp),%ymm11,%ymm11 - vpaddd 0+256(%rbp),%ymm15,%ymm15 - vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 - vpaddd 0+64(%rbp),%ymm6,%ymm6 - vpaddd 0+96(%rbp),%ymm10,%ymm10 - vpaddd 0+224(%rbp),%ymm14,%ymm14 - vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 - vpaddd 0+64(%rbp),%ymm5,%ymm5 - vpaddd 0+96(%rbp),%ymm9,%ymm9 - vpaddd 0+192(%rbp),%ymm13,%ymm13 - vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 - vpaddd 0+64(%rbp),%ymm4,%ymm4 - vpaddd 0+96(%rbp),%ymm8,%ymm8 - vpaddd 0+160(%rbp),%ymm12,%ymm12 - - vmovdqa %ymm0,0+128(%rbp) - vperm2i128 $0x02,%ymm3,%ymm7,%ymm0 - vperm2i128 $0x13,%ymm3,%ymm7,%ymm7 - vperm2i128 $0x02,%ymm11,%ymm15,%ymm3 - vperm2i128 $0x13,%ymm11,%ymm15,%ymm11 - vpxor 0+0(%rsi),%ymm0,%ymm0 - vpxor 32+0(%rsi),%ymm3,%ymm3 - vpxor 64+0(%rsi),%ymm7,%ymm7 - vpxor 96+0(%rsi),%ymm11,%ymm11 - vmovdqu %ymm0,0+0(%rdi) - vmovdqu %ymm3,32+0(%rdi) - vmovdqu %ymm7,64+0(%rdi) - vmovdqu %ymm11,96+0(%rdi) - - vmovdqa 0+128(%rbp),%ymm0 - vperm2i128 $0x02,%ymm2,%ymm6,%ymm3 - vperm2i128 $0x13,%ymm2,%ymm6,%ymm6 - vperm2i128 $0x02,%ymm10,%ymm14,%ymm2 - vperm2i128 $0x13,%ymm10,%ymm14,%ymm10 - vpxor 
0+128(%rsi),%ymm3,%ymm3 - vpxor 32+128(%rsi),%ymm2,%ymm2 - vpxor 64+128(%rsi),%ymm6,%ymm6 - vpxor 96+128(%rsi),%ymm10,%ymm10 - vmovdqu %ymm3,0+128(%rdi) - vmovdqu %ymm2,32+128(%rdi) - vmovdqu %ymm6,64+128(%rdi) - vmovdqu %ymm10,96+128(%rdi) - vperm2i128 $0x02,%ymm1,%ymm5,%ymm3 - vperm2i128 $0x13,%ymm1,%ymm5,%ymm5 - vperm2i128 $0x02,%ymm9,%ymm13,%ymm1 - vperm2i128 $0x13,%ymm9,%ymm13,%ymm9 - vpxor 0+256(%rsi),%ymm3,%ymm3 - vpxor 32+256(%rsi),%ymm1,%ymm1 - vpxor 64+256(%rsi),%ymm5,%ymm5 - vpxor 96+256(%rsi),%ymm9,%ymm9 - vmovdqu %ymm3,0+256(%rdi) - vmovdqu %ymm1,32+256(%rdi) - vmovdqu %ymm5,64+256(%rdi) - vmovdqu %ymm9,96+256(%rdi) - vperm2i128 $0x13,%ymm0,%ymm4,%ymm3 - vperm2i128 $0x02,%ymm0,%ymm4,%ymm0 - vperm2i128 $0x02,%ymm8,%ymm12,%ymm4 - vperm2i128 $0x13,%ymm8,%ymm12,%ymm12 - vmovdqa %ymm3,%ymm8 - - movq $384,%rcx - leaq 384(%rsi),%rsi - subq $384,%rbx - jmp .Lseal_avx2_short_hash_remainder - -.Lseal_avx2_320: - vmovdqa %ymm0,%ymm1 - vmovdqa %ymm0,%ymm2 - vmovdqa %ymm4,%ymm5 - vmovdqa %ymm4,%ymm6 - vmovdqa %ymm8,%ymm9 - vmovdqa %ymm8,%ymm10 - vpaddd .Lavx2_inc(%rip),%ymm12,%ymm13 - vpaddd .Lavx2_inc(%rip),%ymm13,%ymm14 - vmovdqa %ymm4,%ymm7 - vmovdqa %ymm8,%ymm11 - vmovdqa %ymm12,0+160(%rbp) - vmovdqa %ymm13,0+192(%rbp) - vmovdqa %ymm14,0+224(%rbp) - movq $10,%r10 -.Lseal_avx2_320_rounds: - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $20,%ymm4,%ymm3 - vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $12,%ymm12,%ymm12,%ymm12 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $4,%ymm4,%ymm4,%ymm4 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb .Lrol16(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor 
%ymm9,%ymm5,%ymm5 - vpsrld $20,%ymm5,%ymm3 - vpslld $12,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb .Lrol8(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm3 - vpsrld $25,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpalignr $12,%ymm13,%ymm13,%ymm13 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $4,%ymm5,%ymm5,%ymm5 - vpaddd %ymm6,%ymm2,%ymm2 - vpxor %ymm2,%ymm14,%ymm14 - vpshufb .Lrol16(%rip),%ymm14,%ymm14 - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpsrld $20,%ymm6,%ymm3 - vpslld $12,%ymm6,%ymm6 - vpxor %ymm3,%ymm6,%ymm6 - vpaddd %ymm6,%ymm2,%ymm2 - vpxor %ymm2,%ymm14,%ymm14 - vpshufb .Lrol8(%rip),%ymm14,%ymm14 - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpslld $7,%ymm6,%ymm3 - vpsrld $25,%ymm6,%ymm6 - vpxor %ymm3,%ymm6,%ymm6 - vpalignr $12,%ymm14,%ymm14,%ymm14 - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $4,%ymm6,%ymm6,%ymm6 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $20,%ymm4,%ymm3 - vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $4,%ymm12,%ymm12,%ymm12 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $12,%ymm4,%ymm4,%ymm4 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb .Lrol16(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpsrld $20,%ymm5,%ymm3 - vpslld $12,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb .Lrol8(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm3 - vpsrld $25,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpalignr $4,%ymm13,%ymm13,%ymm13 - 
vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $12,%ymm5,%ymm5,%ymm5 - vpaddd %ymm6,%ymm2,%ymm2 - vpxor %ymm2,%ymm14,%ymm14 - vpshufb .Lrol16(%rip),%ymm14,%ymm14 - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpsrld $20,%ymm6,%ymm3 - vpslld $12,%ymm6,%ymm6 - vpxor %ymm3,%ymm6,%ymm6 - vpaddd %ymm6,%ymm2,%ymm2 - vpxor %ymm2,%ymm14,%ymm14 - vpshufb .Lrol8(%rip),%ymm14,%ymm14 - vpaddd %ymm14,%ymm10,%ymm10 - vpxor %ymm10,%ymm6,%ymm6 - vpslld $7,%ymm6,%ymm3 - vpsrld $25,%ymm6,%ymm6 - vpxor %ymm3,%ymm6,%ymm6 - vpalignr $4,%ymm14,%ymm14,%ymm14 - vpalignr $8,%ymm10,%ymm10,%ymm10 - vpalignr $12,%ymm6,%ymm6,%ymm6 - - decq %r10 - jne .Lseal_avx2_320_rounds - vpaddd .Lchacha20_consts(%rip),%ymm0,%ymm0 - vpaddd .Lchacha20_consts(%rip),%ymm1,%ymm1 - vpaddd .Lchacha20_consts(%rip),%ymm2,%ymm2 - vpaddd %ymm7,%ymm4,%ymm4 - vpaddd %ymm7,%ymm5,%ymm5 - vpaddd %ymm7,%ymm6,%ymm6 - vpaddd %ymm11,%ymm8,%ymm8 - vpaddd %ymm11,%ymm9,%ymm9 - vpaddd %ymm11,%ymm10,%ymm10 - vpaddd 0+160(%rbp),%ymm12,%ymm12 - vpaddd 0+192(%rbp),%ymm13,%ymm13 - vpaddd 0+224(%rbp),%ymm14,%ymm14 - vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 - - vpand .Lclamp(%rip),%ymm3,%ymm3 - vmovdqa %ymm3,0+0(%rbp) - - vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 - vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 - vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 - vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 - vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 - vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 - vperm2i128 $0x02,%ymm2,%ymm6,%ymm9 - vperm2i128 $0x02,%ymm10,%ymm14,%ymm13 - vperm2i128 $0x13,%ymm2,%ymm6,%ymm2 - vperm2i128 $0x13,%ymm10,%ymm14,%ymm6 - jmp .Lseal_avx2_short - -.Lseal_avx2_192: - vmovdqa %ymm0,%ymm1 - vmovdqa %ymm0,%ymm2 - vmovdqa %ymm4,%ymm5 - vmovdqa %ymm4,%ymm6 - vmovdqa %ymm8,%ymm9 - vmovdqa %ymm8,%ymm10 - vpaddd .Lavx2_inc(%rip),%ymm12,%ymm13 - vmovdqa %ymm12,%ymm11 - vmovdqa %ymm13,%ymm15 - movq $10,%r10 -.Lseal_avx2_192_rounds: - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 
- vpsrld $20,%ymm4,%ymm3 - vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $12,%ymm12,%ymm12,%ymm12 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $4,%ymm4,%ymm4,%ymm4 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb .Lrol16(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpsrld $20,%ymm5,%ymm3 - vpslld $12,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb .Lrol8(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm3 - vpsrld $25,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpalignr $12,%ymm13,%ymm13,%ymm13 - vpalignr $8,%ymm9,%ymm9,%ymm9 - vpalignr $4,%ymm5,%ymm5,%ymm5 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol16(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $20,%ymm4,%ymm3 - vpslld $12,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpaddd %ymm4,%ymm0,%ymm0 - vpxor %ymm0,%ymm12,%ymm12 - vpshufb .Lrol8(%rip),%ymm12,%ymm12 - vpaddd %ymm12,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpslld $7,%ymm4,%ymm3 - vpsrld $25,%ymm4,%ymm4 - vpxor %ymm3,%ymm4,%ymm4 - vpalignr $4,%ymm12,%ymm12,%ymm12 - vpalignr $8,%ymm8,%ymm8,%ymm8 - vpalignr $12,%ymm4,%ymm4,%ymm4 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb .Lrol16(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpsrld $20,%ymm5,%ymm3 - vpslld $12,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpaddd %ymm5,%ymm1,%ymm1 - vpxor %ymm1,%ymm13,%ymm13 - vpshufb .Lrol8(%rip),%ymm13,%ymm13 - vpaddd %ymm13,%ymm9,%ymm9 - vpxor %ymm9,%ymm5,%ymm5 - vpslld $7,%ymm5,%ymm3 - vpsrld $25,%ymm5,%ymm5 - vpxor %ymm3,%ymm5,%ymm5 - vpalignr $4,%ymm13,%ymm13,%ymm13 - vpalignr $8,%ymm9,%ymm9,%ymm9 - 
vpalignr $12,%ymm5,%ymm5,%ymm5 - - decq %r10 - jne .Lseal_avx2_192_rounds - vpaddd %ymm2,%ymm0,%ymm0 - vpaddd %ymm2,%ymm1,%ymm1 - vpaddd %ymm6,%ymm4,%ymm4 - vpaddd %ymm6,%ymm5,%ymm5 - vpaddd %ymm10,%ymm8,%ymm8 - vpaddd %ymm10,%ymm9,%ymm9 - vpaddd %ymm11,%ymm12,%ymm12 - vpaddd %ymm15,%ymm13,%ymm13 - vperm2i128 $0x02,%ymm0,%ymm4,%ymm3 - - vpand .Lclamp(%rip),%ymm3,%ymm3 - vmovdqa %ymm3,0+0(%rbp) - - vperm2i128 $0x13,%ymm0,%ymm4,%ymm0 - vperm2i128 $0x13,%ymm8,%ymm12,%ymm4 - vperm2i128 $0x02,%ymm1,%ymm5,%ymm8 - vperm2i128 $0x02,%ymm9,%ymm13,%ymm12 - vperm2i128 $0x13,%ymm1,%ymm5,%ymm1 - vperm2i128 $0x13,%ymm9,%ymm13,%ymm5 -.Lseal_avx2_short: - movq %r8,%r8 - call poly_hash_ad_internal - xorq %rcx,%rcx -.Lseal_avx2_short_hash_remainder: - cmpq $16,%rcx - jb .Lseal_avx2_short_loop - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - subq $16,%rcx - addq $16,%rdi - jmp .Lseal_avx2_short_hash_remainder -.Lseal_avx2_short_loop: - cmpq $32,%rbx - jb .Lseal_avx2_short_tail - subq $32,%rbx - - vpxor (%rsi),%ymm0,%ymm0 - vmovdqu %ymm0,(%rdi) - leaq 32(%rsi),%rsi - - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq 
%rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - addq 0+16(%rdi),%r10 - adcq 8+16(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 - movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 32(%rdi),%rdi - - vmovdqa %ymm4,%ymm0 - vmovdqa %ymm8,%ymm4 - vmovdqa %ymm12,%ymm8 - vmovdqa %ymm1,%ymm12 - vmovdqa %ymm5,%ymm1 - vmovdqa %ymm9,%ymm5 - vmovdqa %ymm13,%ymm9 - vmovdqa %ymm2,%ymm13 - vmovdqa %ymm6,%ymm2 - jmp .Lseal_avx2_short_loop -.Lseal_avx2_short_tail: - cmpq $16,%rbx - jb .Lseal_avx2_exit - subq $16,%rbx - vpxor (%rsi),%xmm0,%xmm3 - vmovdqu %xmm3,(%rdi) - leaq 16(%rsi),%rsi - addq 0+0(%rdi),%r10 - adcq 8+0(%rdi),%r11 - adcq $1,%r12 - movq 0+0+0(%rbp),%rax - movq %rax,%r15 - mulq %r10 - movq %rax,%r13 - movq %rdx,%r14 - movq 0+0+0(%rbp),%rax - mulq %r11 - imulq %r12,%r15 - addq %rax,%r14 - adcq %rdx,%r15 - movq 8+0+0(%rbp),%rax - movq %rax,%r9 - mulq %r10 - addq %rax,%r14 - adcq $0,%rdx - movq %rdx,%r10 - movq 8+0+0(%rbp),%rax - mulq %r11 - addq %rax,%r15 - adcq $0,%rdx - imulq %r12,%r9 - addq %r10,%r15 - adcq %rdx,%r9 - movq %r13,%r10 - movq %r14,%r11 - movq %r15,%r12 - andq $3,%r12 
- movq %r15,%r13 - andq $-4,%r13 - movq %r9,%r14 - shrdq $2,%r9,%r15 - shrq $2,%r9 - addq %r13,%r15 - adcq %r14,%r9 - addq %r15,%r10 - adcq %r9,%r11 - adcq $0,%r12 - - leaq 16(%rdi),%rdi - vextracti128 $1,%ymm0,%xmm0 -.Lseal_avx2_exit: - vzeroupper - jmp .Lseal_sse_tail_16 -.cfi_endproc -.size chacha20_poly1305_seal_avx2, .-chacha20_poly1305_seal_avx2 -#endif -.section .note.GNU-stack,"",@progbits diff --git a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S deleted file mode 100644 index b28f7f80..00000000 --- a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S +++ /dev/null @@ -1,852 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text - -.type _aesni_ctr32_ghash_6x,@function -.align 32 -_aesni_ctr32_ghash_6x: -.cfi_startproc - vmovdqu 32(%r11),%xmm2 - subq $6,%rdx - vpxor %xmm4,%xmm4,%xmm4 - vmovdqu 0-128(%rcx),%xmm15 - vpaddb %xmm2,%xmm1,%xmm10 - vpaddb %xmm2,%xmm10,%xmm11 - vpaddb %xmm2,%xmm11,%xmm12 - vpaddb %xmm2,%xmm12,%xmm13 - vpaddb %xmm2,%xmm13,%xmm14 - vpxor %xmm15,%xmm1,%xmm9 - vmovdqu %xmm4,16+8(%rsp) - jmp .Loop6x - -.align 32 -.Loop6x: - addl $100663296,%ebx - jc .Lhandle_ctr32 - vmovdqu 0-32(%r9),%xmm3 - vpaddb %xmm2,%xmm14,%xmm1 - vpxor %xmm15,%xmm10,%xmm10 - vpxor %xmm15,%xmm11,%xmm11 - -.Lresume_ctr32: - vmovdqu %xmm1,(%r8) - vpclmulqdq $0x10,%xmm3,%xmm7,%xmm5 - vpxor %xmm15,%xmm12,%xmm12 - vmovups 16-128(%rcx),%xmm2 - vpclmulqdq $0x01,%xmm3,%xmm7,%xmm6 - - - - - - - - - - - - - - - - - - xorq %r12,%r12 - cmpq %r14,%r15 - - vaesenc %xmm2,%xmm9,%xmm9 - vmovdqu 48+8(%rsp),%xmm0 - vpxor %xmm15,%xmm13,%xmm13 - vpclmulqdq 
$0x00,%xmm3,%xmm7,%xmm1 - vaesenc %xmm2,%xmm10,%xmm10 - vpxor %xmm15,%xmm14,%xmm14 - setnc %r12b - vpclmulqdq $0x11,%xmm3,%xmm7,%xmm7 - vaesenc %xmm2,%xmm11,%xmm11 - vmovdqu 16-32(%r9),%xmm3 - negq %r12 - vaesenc %xmm2,%xmm12,%xmm12 - vpxor %xmm5,%xmm6,%xmm6 - vpclmulqdq $0x00,%xmm3,%xmm0,%xmm5 - vpxor %xmm4,%xmm8,%xmm8 - vaesenc %xmm2,%xmm13,%xmm13 - vpxor %xmm5,%xmm1,%xmm4 - andq $0x60,%r12 - vmovups 32-128(%rcx),%xmm15 - vpclmulqdq $0x10,%xmm3,%xmm0,%xmm1 - vaesenc %xmm2,%xmm14,%xmm14 - - vpclmulqdq $0x01,%xmm3,%xmm0,%xmm2 - leaq (%r14,%r12,1),%r14 - vaesenc %xmm15,%xmm9,%xmm9 - vpxor 16+8(%rsp),%xmm8,%xmm8 - vpclmulqdq $0x11,%xmm3,%xmm0,%xmm3 - vmovdqu 64+8(%rsp),%xmm0 - vaesenc %xmm15,%xmm10,%xmm10 - movbeq 88(%r14),%r13 - vaesenc %xmm15,%xmm11,%xmm11 - movbeq 80(%r14),%r12 - vaesenc %xmm15,%xmm12,%xmm12 - movq %r13,32+8(%rsp) - vaesenc %xmm15,%xmm13,%xmm13 - movq %r12,40+8(%rsp) - vmovdqu 48-32(%r9),%xmm5 - vaesenc %xmm15,%xmm14,%xmm14 - - vmovups 48-128(%rcx),%xmm15 - vpxor %xmm1,%xmm6,%xmm6 - vpclmulqdq $0x00,%xmm5,%xmm0,%xmm1 - vaesenc %xmm15,%xmm9,%xmm9 - vpxor %xmm2,%xmm6,%xmm6 - vpclmulqdq $0x10,%xmm5,%xmm0,%xmm2 - vaesenc %xmm15,%xmm10,%xmm10 - vpxor %xmm3,%xmm7,%xmm7 - vpclmulqdq $0x01,%xmm5,%xmm0,%xmm3 - vaesenc %xmm15,%xmm11,%xmm11 - vpclmulqdq $0x11,%xmm5,%xmm0,%xmm5 - vmovdqu 80+8(%rsp),%xmm0 - vaesenc %xmm15,%xmm12,%xmm12 - vaesenc %xmm15,%xmm13,%xmm13 - vpxor %xmm1,%xmm4,%xmm4 - vmovdqu 64-32(%r9),%xmm1 - vaesenc %xmm15,%xmm14,%xmm14 - - vmovups 64-128(%rcx),%xmm15 - vpxor %xmm2,%xmm6,%xmm6 - vpclmulqdq $0x00,%xmm1,%xmm0,%xmm2 - vaesenc %xmm15,%xmm9,%xmm9 - vpxor %xmm3,%xmm6,%xmm6 - vpclmulqdq $0x10,%xmm1,%xmm0,%xmm3 - vaesenc %xmm15,%xmm10,%xmm10 - movbeq 72(%r14),%r13 - vpxor %xmm5,%xmm7,%xmm7 - vpclmulqdq $0x01,%xmm1,%xmm0,%xmm5 - vaesenc %xmm15,%xmm11,%xmm11 - movbeq 64(%r14),%r12 - vpclmulqdq $0x11,%xmm1,%xmm0,%xmm1 - vmovdqu 96+8(%rsp),%xmm0 - vaesenc %xmm15,%xmm12,%xmm12 - movq %r13,48+8(%rsp) - vaesenc %xmm15,%xmm13,%xmm13 - movq 
%r12,56+8(%rsp) - vpxor %xmm2,%xmm4,%xmm4 - vmovdqu 96-32(%r9),%xmm2 - vaesenc %xmm15,%xmm14,%xmm14 - - vmovups 80-128(%rcx),%xmm15 - vpxor %xmm3,%xmm6,%xmm6 - vpclmulqdq $0x00,%xmm2,%xmm0,%xmm3 - vaesenc %xmm15,%xmm9,%xmm9 - vpxor %xmm5,%xmm6,%xmm6 - vpclmulqdq $0x10,%xmm2,%xmm0,%xmm5 - vaesenc %xmm15,%xmm10,%xmm10 - movbeq 56(%r14),%r13 - vpxor %xmm1,%xmm7,%xmm7 - vpclmulqdq $0x01,%xmm2,%xmm0,%xmm1 - vpxor 112+8(%rsp),%xmm8,%xmm8 - vaesenc %xmm15,%xmm11,%xmm11 - movbeq 48(%r14),%r12 - vpclmulqdq $0x11,%xmm2,%xmm0,%xmm2 - vaesenc %xmm15,%xmm12,%xmm12 - movq %r13,64+8(%rsp) - vaesenc %xmm15,%xmm13,%xmm13 - movq %r12,72+8(%rsp) - vpxor %xmm3,%xmm4,%xmm4 - vmovdqu 112-32(%r9),%xmm3 - vaesenc %xmm15,%xmm14,%xmm14 - - vmovups 96-128(%rcx),%xmm15 - vpxor %xmm5,%xmm6,%xmm6 - vpclmulqdq $0x10,%xmm3,%xmm8,%xmm5 - vaesenc %xmm15,%xmm9,%xmm9 - vpxor %xmm1,%xmm6,%xmm6 - vpclmulqdq $0x01,%xmm3,%xmm8,%xmm1 - vaesenc %xmm15,%xmm10,%xmm10 - movbeq 40(%r14),%r13 - vpxor %xmm2,%xmm7,%xmm7 - vpclmulqdq $0x00,%xmm3,%xmm8,%xmm2 - vaesenc %xmm15,%xmm11,%xmm11 - movbeq 32(%r14),%r12 - vpclmulqdq $0x11,%xmm3,%xmm8,%xmm8 - vaesenc %xmm15,%xmm12,%xmm12 - movq %r13,80+8(%rsp) - vaesenc %xmm15,%xmm13,%xmm13 - movq %r12,88+8(%rsp) - vpxor %xmm5,%xmm6,%xmm6 - vaesenc %xmm15,%xmm14,%xmm14 - vpxor %xmm1,%xmm6,%xmm6 - - vmovups 112-128(%rcx),%xmm15 - vpslldq $8,%xmm6,%xmm5 - vpxor %xmm2,%xmm4,%xmm4 - vmovdqu 16(%r11),%xmm3 - - vaesenc %xmm15,%xmm9,%xmm9 - vpxor %xmm8,%xmm7,%xmm7 - vaesenc %xmm15,%xmm10,%xmm10 - vpxor %xmm5,%xmm4,%xmm4 - movbeq 24(%r14),%r13 - vaesenc %xmm15,%xmm11,%xmm11 - movbeq 16(%r14),%r12 - vpalignr $8,%xmm4,%xmm4,%xmm0 - vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 - movq %r13,96+8(%rsp) - vaesenc %xmm15,%xmm12,%xmm12 - movq %r12,104+8(%rsp) - vaesenc %xmm15,%xmm13,%xmm13 - vmovups 128-128(%rcx),%xmm1 - vaesenc %xmm15,%xmm14,%xmm14 - - vaesenc %xmm1,%xmm9,%xmm9 - vmovups 144-128(%rcx),%xmm15 - vaesenc %xmm1,%xmm10,%xmm10 - vpsrldq $8,%xmm6,%xmm6 - vaesenc %xmm1,%xmm11,%xmm11 - vpxor 
%xmm6,%xmm7,%xmm7 - vaesenc %xmm1,%xmm12,%xmm12 - vpxor %xmm0,%xmm4,%xmm4 - movbeq 8(%r14),%r13 - vaesenc %xmm1,%xmm13,%xmm13 - movbeq 0(%r14),%r12 - vaesenc %xmm1,%xmm14,%xmm14 - vmovups 160-128(%rcx),%xmm1 - cmpl $11,%ebp - jb .Lenc_tail - - vaesenc %xmm15,%xmm9,%xmm9 - vaesenc %xmm15,%xmm10,%xmm10 - vaesenc %xmm15,%xmm11,%xmm11 - vaesenc %xmm15,%xmm12,%xmm12 - vaesenc %xmm15,%xmm13,%xmm13 - vaesenc %xmm15,%xmm14,%xmm14 - - vaesenc %xmm1,%xmm9,%xmm9 - vaesenc %xmm1,%xmm10,%xmm10 - vaesenc %xmm1,%xmm11,%xmm11 - vaesenc %xmm1,%xmm12,%xmm12 - vaesenc %xmm1,%xmm13,%xmm13 - vmovups 176-128(%rcx),%xmm15 - vaesenc %xmm1,%xmm14,%xmm14 - vmovups 192-128(%rcx),%xmm1 - je .Lenc_tail - - vaesenc %xmm15,%xmm9,%xmm9 - vaesenc %xmm15,%xmm10,%xmm10 - vaesenc %xmm15,%xmm11,%xmm11 - vaesenc %xmm15,%xmm12,%xmm12 - vaesenc %xmm15,%xmm13,%xmm13 - vaesenc %xmm15,%xmm14,%xmm14 - - vaesenc %xmm1,%xmm9,%xmm9 - vaesenc %xmm1,%xmm10,%xmm10 - vaesenc %xmm1,%xmm11,%xmm11 - vaesenc %xmm1,%xmm12,%xmm12 - vaesenc %xmm1,%xmm13,%xmm13 - vmovups 208-128(%rcx),%xmm15 - vaesenc %xmm1,%xmm14,%xmm14 - vmovups 224-128(%rcx),%xmm1 - jmp .Lenc_tail - -.align 32 -.Lhandle_ctr32: - vmovdqu (%r11),%xmm0 - vpshufb %xmm0,%xmm1,%xmm6 - vmovdqu 48(%r11),%xmm5 - vpaddd 64(%r11),%xmm6,%xmm10 - vpaddd %xmm5,%xmm6,%xmm11 - vmovdqu 0-32(%r9),%xmm3 - vpaddd %xmm5,%xmm10,%xmm12 - vpshufb %xmm0,%xmm10,%xmm10 - vpaddd %xmm5,%xmm11,%xmm13 - vpshufb %xmm0,%xmm11,%xmm11 - vpxor %xmm15,%xmm10,%xmm10 - vpaddd %xmm5,%xmm12,%xmm14 - vpshufb %xmm0,%xmm12,%xmm12 - vpxor %xmm15,%xmm11,%xmm11 - vpaddd %xmm5,%xmm13,%xmm1 - vpshufb %xmm0,%xmm13,%xmm13 - vpshufb %xmm0,%xmm14,%xmm14 - vpshufb %xmm0,%xmm1,%xmm1 - jmp .Lresume_ctr32 - -.align 32 -.Lenc_tail: - vaesenc %xmm15,%xmm9,%xmm9 - vmovdqu %xmm7,16+8(%rsp) - vpalignr $8,%xmm4,%xmm4,%xmm8 - vaesenc %xmm15,%xmm10,%xmm10 - vpclmulqdq $0x10,%xmm3,%xmm4,%xmm4 - vpxor 0(%rdi),%xmm1,%xmm2 - vaesenc %xmm15,%xmm11,%xmm11 - vpxor 16(%rdi),%xmm1,%xmm0 - vaesenc %xmm15,%xmm12,%xmm12 - vpxor 
32(%rdi),%xmm1,%xmm5 - vaesenc %xmm15,%xmm13,%xmm13 - vpxor 48(%rdi),%xmm1,%xmm6 - vaesenc %xmm15,%xmm14,%xmm14 - vpxor 64(%rdi),%xmm1,%xmm7 - vpxor 80(%rdi),%xmm1,%xmm3 - vmovdqu (%r8),%xmm1 - - vaesenclast %xmm2,%xmm9,%xmm9 - vmovdqu 32(%r11),%xmm2 - vaesenclast %xmm0,%xmm10,%xmm10 - vpaddb %xmm2,%xmm1,%xmm0 - movq %r13,112+8(%rsp) - leaq 96(%rdi),%rdi - vaesenclast %xmm5,%xmm11,%xmm11 - vpaddb %xmm2,%xmm0,%xmm5 - movq %r12,120+8(%rsp) - leaq 96(%rsi),%rsi - vmovdqu 0-128(%rcx),%xmm15 - vaesenclast %xmm6,%xmm12,%xmm12 - vpaddb %xmm2,%xmm5,%xmm6 - vaesenclast %xmm7,%xmm13,%xmm13 - vpaddb %xmm2,%xmm6,%xmm7 - vaesenclast %xmm3,%xmm14,%xmm14 - vpaddb %xmm2,%xmm7,%xmm3 - - addq $0x60,%r10 - subq $0x6,%rdx - jc .L6x_done - - vmovups %xmm9,-96(%rsi) - vpxor %xmm15,%xmm1,%xmm9 - vmovups %xmm10,-80(%rsi) - vmovdqa %xmm0,%xmm10 - vmovups %xmm11,-64(%rsi) - vmovdqa %xmm5,%xmm11 - vmovups %xmm12,-48(%rsi) - vmovdqa %xmm6,%xmm12 - vmovups %xmm13,-32(%rsi) - vmovdqa %xmm7,%xmm13 - vmovups %xmm14,-16(%rsi) - vmovdqa %xmm3,%xmm14 - vmovdqu 32+8(%rsp),%xmm7 - jmp .Loop6x - -.L6x_done: - vpxor 16+8(%rsp),%xmm8,%xmm8 - vpxor %xmm4,%xmm8,%xmm8 - - .byte 0xf3,0xc3 -.cfi_endproc -.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x -.globl aesni_gcm_decrypt -.hidden aesni_gcm_decrypt -.type aesni_gcm_decrypt,@function -.align 32 -aesni_gcm_decrypt: -.cfi_startproc - xorq %r10,%r10 - - - - cmpq $0x60,%rdx - jb .Lgcm_dec_abort - - leaq (%rsp),%rax -.cfi_def_cfa_register %rax - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 - vzeroupper - - vmovdqu (%r8),%xmm1 - addq $-128,%rsp - movl 12(%r8),%ebx - leaq .Lbswap_mask(%rip),%r11 - leaq -128(%rcx),%r14 - movq $0xf80,%r15 - vmovdqu (%r9),%xmm8 - andq $-128,%rsp - vmovdqu (%r11),%xmm0 - leaq 128(%rcx),%rcx - leaq 32+32(%r9),%r9 - movl 240-128(%rcx),%ebp - vpshufb %xmm0,%xmm8,%xmm8 - - 
andq %r15,%r14 - andq %rsp,%r15 - subq %r14,%r15 - jc .Ldec_no_key_aliasing - cmpq $768,%r15 - jnc .Ldec_no_key_aliasing - subq %r15,%rsp -.Ldec_no_key_aliasing: - - vmovdqu 80(%rdi),%xmm7 - leaq (%rdi),%r14 - vmovdqu 64(%rdi),%xmm4 - - - - - - - - leaq -192(%rdi,%rdx,1),%r15 - - vmovdqu 48(%rdi),%xmm5 - shrq $4,%rdx - xorq %r10,%r10 - vmovdqu 32(%rdi),%xmm6 - vpshufb %xmm0,%xmm7,%xmm7 - vmovdqu 16(%rdi),%xmm2 - vpshufb %xmm0,%xmm4,%xmm4 - vmovdqu (%rdi),%xmm3 - vpshufb %xmm0,%xmm5,%xmm5 - vmovdqu %xmm4,48(%rsp) - vpshufb %xmm0,%xmm6,%xmm6 - vmovdqu %xmm5,64(%rsp) - vpshufb %xmm0,%xmm2,%xmm2 - vmovdqu %xmm6,80(%rsp) - vpshufb %xmm0,%xmm3,%xmm3 - vmovdqu %xmm2,96(%rsp) - vmovdqu %xmm3,112(%rsp) - - call _aesni_ctr32_ghash_6x - - vmovups %xmm9,-96(%rsi) - vmovups %xmm10,-80(%rsi) - vmovups %xmm11,-64(%rsi) - vmovups %xmm12,-48(%rsi) - vmovups %xmm13,-32(%rsi) - vmovups %xmm14,-16(%rsi) - - vpshufb (%r11),%xmm8,%xmm8 - vmovdqu %xmm8,-64(%r9) - - vzeroupper - movq -48(%rax),%r15 -.cfi_restore %r15 - movq -40(%rax),%r14 -.cfi_restore %r14 - movq -32(%rax),%r13 -.cfi_restore %r13 - movq -24(%rax),%r12 -.cfi_restore %r12 - movq -16(%rax),%rbp -.cfi_restore %rbp - movq -8(%rax),%rbx -.cfi_restore %rbx - leaq (%rax),%rsp -.cfi_def_cfa_register %rsp -.Lgcm_dec_abort: - movq %r10,%rax - .byte 0xf3,0xc3 -.cfi_endproc -.size aesni_gcm_decrypt,.-aesni_gcm_decrypt -.type _aesni_ctr32_6x,@function -.align 32 -_aesni_ctr32_6x: -.cfi_startproc - vmovdqu 0-128(%rcx),%xmm4 - vmovdqu 32(%r11),%xmm2 - leaq -1(%rbp),%r13 - vmovups 16-128(%rcx),%xmm15 - leaq 32-128(%rcx),%r12 - vpxor %xmm4,%xmm1,%xmm9 - addl $100663296,%ebx - jc .Lhandle_ctr32_2 - vpaddb %xmm2,%xmm1,%xmm10 - vpaddb %xmm2,%xmm10,%xmm11 - vpxor %xmm4,%xmm10,%xmm10 - vpaddb %xmm2,%xmm11,%xmm12 - vpxor %xmm4,%xmm11,%xmm11 - vpaddb %xmm2,%xmm12,%xmm13 - vpxor %xmm4,%xmm12,%xmm12 - vpaddb %xmm2,%xmm13,%xmm14 - vpxor %xmm4,%xmm13,%xmm13 - vpaddb %xmm2,%xmm14,%xmm1 - vpxor %xmm4,%xmm14,%xmm14 - jmp .Loop_ctr32 - -.align 16 
-.Loop_ctr32: - vaesenc %xmm15,%xmm9,%xmm9 - vaesenc %xmm15,%xmm10,%xmm10 - vaesenc %xmm15,%xmm11,%xmm11 - vaesenc %xmm15,%xmm12,%xmm12 - vaesenc %xmm15,%xmm13,%xmm13 - vaesenc %xmm15,%xmm14,%xmm14 - vmovups (%r12),%xmm15 - leaq 16(%r12),%r12 - decl %r13d - jnz .Loop_ctr32 - - vmovdqu (%r12),%xmm3 - vaesenc %xmm15,%xmm9,%xmm9 - vpxor 0(%rdi),%xmm3,%xmm4 - vaesenc %xmm15,%xmm10,%xmm10 - vpxor 16(%rdi),%xmm3,%xmm5 - vaesenc %xmm15,%xmm11,%xmm11 - vpxor 32(%rdi),%xmm3,%xmm6 - vaesenc %xmm15,%xmm12,%xmm12 - vpxor 48(%rdi),%xmm3,%xmm8 - vaesenc %xmm15,%xmm13,%xmm13 - vpxor 64(%rdi),%xmm3,%xmm2 - vaesenc %xmm15,%xmm14,%xmm14 - vpxor 80(%rdi),%xmm3,%xmm3 - leaq 96(%rdi),%rdi - - vaesenclast %xmm4,%xmm9,%xmm9 - vaesenclast %xmm5,%xmm10,%xmm10 - vaesenclast %xmm6,%xmm11,%xmm11 - vaesenclast %xmm8,%xmm12,%xmm12 - vaesenclast %xmm2,%xmm13,%xmm13 - vaesenclast %xmm3,%xmm14,%xmm14 - vmovups %xmm9,0(%rsi) - vmovups %xmm10,16(%rsi) - vmovups %xmm11,32(%rsi) - vmovups %xmm12,48(%rsi) - vmovups %xmm13,64(%rsi) - vmovups %xmm14,80(%rsi) - leaq 96(%rsi),%rsi - - .byte 0xf3,0xc3 -.align 32 -.Lhandle_ctr32_2: - vpshufb %xmm0,%xmm1,%xmm6 - vmovdqu 48(%r11),%xmm5 - vpaddd 64(%r11),%xmm6,%xmm10 - vpaddd %xmm5,%xmm6,%xmm11 - vpaddd %xmm5,%xmm10,%xmm12 - vpshufb %xmm0,%xmm10,%xmm10 - vpaddd %xmm5,%xmm11,%xmm13 - vpshufb %xmm0,%xmm11,%xmm11 - vpxor %xmm4,%xmm10,%xmm10 - vpaddd %xmm5,%xmm12,%xmm14 - vpshufb %xmm0,%xmm12,%xmm12 - vpxor %xmm4,%xmm11,%xmm11 - vpaddd %xmm5,%xmm13,%xmm1 - vpshufb %xmm0,%xmm13,%xmm13 - vpxor %xmm4,%xmm12,%xmm12 - vpshufb %xmm0,%xmm14,%xmm14 - vpxor %xmm4,%xmm13,%xmm13 - vpshufb %xmm0,%xmm1,%xmm1 - vpxor %xmm4,%xmm14,%xmm14 - jmp .Loop_ctr32 -.cfi_endproc -.size _aesni_ctr32_6x,.-_aesni_ctr32_6x - -.globl aesni_gcm_encrypt -.hidden aesni_gcm_encrypt -.type aesni_gcm_encrypt,@function -.align 32 -aesni_gcm_encrypt: -.cfi_startproc -#ifdef BORINGSSL_DISPATCH_TEST -.extern BORINGSSL_function_hit -.hidden BORINGSSL_function_hit - movb $1,BORINGSSL_function_hit+2(%rip) 
-#endif - xorq %r10,%r10 - - - - - cmpq $288,%rdx - jb .Lgcm_enc_abort - - leaq (%rsp),%rax -.cfi_def_cfa_register %rax - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 - vzeroupper - - vmovdqu (%r8),%xmm1 - addq $-128,%rsp - movl 12(%r8),%ebx - leaq .Lbswap_mask(%rip),%r11 - leaq -128(%rcx),%r14 - movq $0xf80,%r15 - leaq 128(%rcx),%rcx - vmovdqu (%r11),%xmm0 - andq $-128,%rsp - movl 240-128(%rcx),%ebp - - andq %r15,%r14 - andq %rsp,%r15 - subq %r14,%r15 - jc .Lenc_no_key_aliasing - cmpq $768,%r15 - jnc .Lenc_no_key_aliasing - subq %r15,%rsp -.Lenc_no_key_aliasing: - - leaq (%rsi),%r14 - - - - - - - - - leaq -192(%rsi,%rdx,1),%r15 - - shrq $4,%rdx - - call _aesni_ctr32_6x - vpshufb %xmm0,%xmm9,%xmm8 - vpshufb %xmm0,%xmm10,%xmm2 - vmovdqu %xmm8,112(%rsp) - vpshufb %xmm0,%xmm11,%xmm4 - vmovdqu %xmm2,96(%rsp) - vpshufb %xmm0,%xmm12,%xmm5 - vmovdqu %xmm4,80(%rsp) - vpshufb %xmm0,%xmm13,%xmm6 - vmovdqu %xmm5,64(%rsp) - vpshufb %xmm0,%xmm14,%xmm7 - vmovdqu %xmm6,48(%rsp) - - call _aesni_ctr32_6x - - vmovdqu (%r9),%xmm8 - leaq 32+32(%r9),%r9 - subq $12,%rdx - movq $192,%r10 - vpshufb %xmm0,%xmm8,%xmm8 - - call _aesni_ctr32_ghash_6x - vmovdqu 32(%rsp),%xmm7 - vmovdqu (%r11),%xmm0 - vmovdqu 0-32(%r9),%xmm3 - vpunpckhqdq %xmm7,%xmm7,%xmm1 - vmovdqu 32-32(%r9),%xmm15 - vmovups %xmm9,-96(%rsi) - vpshufb %xmm0,%xmm9,%xmm9 - vpxor %xmm7,%xmm1,%xmm1 - vmovups %xmm10,-80(%rsi) - vpshufb %xmm0,%xmm10,%xmm10 - vmovups %xmm11,-64(%rsi) - vpshufb %xmm0,%xmm11,%xmm11 - vmovups %xmm12,-48(%rsi) - vpshufb %xmm0,%xmm12,%xmm12 - vmovups %xmm13,-32(%rsi) - vpshufb %xmm0,%xmm13,%xmm13 - vmovups %xmm14,-16(%rsi) - vpshufb %xmm0,%xmm14,%xmm14 - vmovdqu %xmm9,16(%rsp) - vmovdqu 48(%rsp),%xmm6 - vmovdqu 16-32(%r9),%xmm0 - vpunpckhqdq %xmm6,%xmm6,%xmm2 - vpclmulqdq $0x00,%xmm3,%xmm7,%xmm5 - vpxor %xmm6,%xmm2,%xmm2 - vpclmulqdq 
$0x11,%xmm3,%xmm7,%xmm7 - vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 - - vmovdqu 64(%rsp),%xmm9 - vpclmulqdq $0x00,%xmm0,%xmm6,%xmm4 - vmovdqu 48-32(%r9),%xmm3 - vpxor %xmm5,%xmm4,%xmm4 - vpunpckhqdq %xmm9,%xmm9,%xmm5 - vpclmulqdq $0x11,%xmm0,%xmm6,%xmm6 - vpxor %xmm9,%xmm5,%xmm5 - vpxor %xmm7,%xmm6,%xmm6 - vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 - vmovdqu 80-32(%r9),%xmm15 - vpxor %xmm1,%xmm2,%xmm2 - - vmovdqu 80(%rsp),%xmm1 - vpclmulqdq $0x00,%xmm3,%xmm9,%xmm7 - vmovdqu 64-32(%r9),%xmm0 - vpxor %xmm4,%xmm7,%xmm7 - vpunpckhqdq %xmm1,%xmm1,%xmm4 - vpclmulqdq $0x11,%xmm3,%xmm9,%xmm9 - vpxor %xmm1,%xmm4,%xmm4 - vpxor %xmm6,%xmm9,%xmm9 - vpclmulqdq $0x00,%xmm15,%xmm5,%xmm5 - vpxor %xmm2,%xmm5,%xmm5 - - vmovdqu 96(%rsp),%xmm2 - vpclmulqdq $0x00,%xmm0,%xmm1,%xmm6 - vmovdqu 96-32(%r9),%xmm3 - vpxor %xmm7,%xmm6,%xmm6 - vpunpckhqdq %xmm2,%xmm2,%xmm7 - vpclmulqdq $0x11,%xmm0,%xmm1,%xmm1 - vpxor %xmm2,%xmm7,%xmm7 - vpxor %xmm9,%xmm1,%xmm1 - vpclmulqdq $0x10,%xmm15,%xmm4,%xmm4 - vmovdqu 128-32(%r9),%xmm15 - vpxor %xmm5,%xmm4,%xmm4 - - vpxor 112(%rsp),%xmm8,%xmm8 - vpclmulqdq $0x00,%xmm3,%xmm2,%xmm5 - vmovdqu 112-32(%r9),%xmm0 - vpunpckhqdq %xmm8,%xmm8,%xmm9 - vpxor %xmm6,%xmm5,%xmm5 - vpclmulqdq $0x11,%xmm3,%xmm2,%xmm2 - vpxor %xmm8,%xmm9,%xmm9 - vpxor %xmm1,%xmm2,%xmm2 - vpclmulqdq $0x00,%xmm15,%xmm7,%xmm7 - vpxor %xmm4,%xmm7,%xmm4 - - vpclmulqdq $0x00,%xmm0,%xmm8,%xmm6 - vmovdqu 0-32(%r9),%xmm3 - vpunpckhqdq %xmm14,%xmm14,%xmm1 - vpclmulqdq $0x11,%xmm0,%xmm8,%xmm8 - vpxor %xmm14,%xmm1,%xmm1 - vpxor %xmm5,%xmm6,%xmm5 - vpclmulqdq $0x10,%xmm15,%xmm9,%xmm9 - vmovdqu 32-32(%r9),%xmm15 - vpxor %xmm2,%xmm8,%xmm7 - vpxor %xmm4,%xmm9,%xmm6 - - vmovdqu 16-32(%r9),%xmm0 - vpxor %xmm5,%xmm7,%xmm9 - vpclmulqdq $0x00,%xmm3,%xmm14,%xmm4 - vpxor %xmm9,%xmm6,%xmm6 - vpunpckhqdq %xmm13,%xmm13,%xmm2 - vpclmulqdq $0x11,%xmm3,%xmm14,%xmm14 - vpxor %xmm13,%xmm2,%xmm2 - vpslldq $8,%xmm6,%xmm9 - vpclmulqdq $0x00,%xmm15,%xmm1,%xmm1 - vpxor %xmm9,%xmm5,%xmm8 - vpsrldq $8,%xmm6,%xmm6 - vpxor %xmm6,%xmm7,%xmm7 
- - vpclmulqdq $0x00,%xmm0,%xmm13,%xmm5 - vmovdqu 48-32(%r9),%xmm3 - vpxor %xmm4,%xmm5,%xmm5 - vpunpckhqdq %xmm12,%xmm12,%xmm9 - vpclmulqdq $0x11,%xmm0,%xmm13,%xmm13 - vpxor %xmm12,%xmm9,%xmm9 - vpxor %xmm14,%xmm13,%xmm13 - vpalignr $8,%xmm8,%xmm8,%xmm14 - vpclmulqdq $0x10,%xmm15,%xmm2,%xmm2 - vmovdqu 80-32(%r9),%xmm15 - vpxor %xmm1,%xmm2,%xmm2 - - vpclmulqdq $0x00,%xmm3,%xmm12,%xmm4 - vmovdqu 64-32(%r9),%xmm0 - vpxor %xmm5,%xmm4,%xmm4 - vpunpckhqdq %xmm11,%xmm11,%xmm1 - vpclmulqdq $0x11,%xmm3,%xmm12,%xmm12 - vpxor %xmm11,%xmm1,%xmm1 - vpxor %xmm13,%xmm12,%xmm12 - vxorps 16(%rsp),%xmm7,%xmm7 - vpclmulqdq $0x00,%xmm15,%xmm9,%xmm9 - vpxor %xmm2,%xmm9,%xmm9 - - vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 - vxorps %xmm14,%xmm8,%xmm8 - - vpclmulqdq $0x00,%xmm0,%xmm11,%xmm5 - vmovdqu 96-32(%r9),%xmm3 - vpxor %xmm4,%xmm5,%xmm5 - vpunpckhqdq %xmm10,%xmm10,%xmm2 - vpclmulqdq $0x11,%xmm0,%xmm11,%xmm11 - vpxor %xmm10,%xmm2,%xmm2 - vpalignr $8,%xmm8,%xmm8,%xmm14 - vpxor %xmm12,%xmm11,%xmm11 - vpclmulqdq $0x10,%xmm15,%xmm1,%xmm1 - vmovdqu 128-32(%r9),%xmm15 - vpxor %xmm9,%xmm1,%xmm1 - - vxorps %xmm7,%xmm14,%xmm14 - vpclmulqdq $0x10,16(%r11),%xmm8,%xmm8 - vxorps %xmm14,%xmm8,%xmm8 - - vpclmulqdq $0x00,%xmm3,%xmm10,%xmm4 - vmovdqu 112-32(%r9),%xmm0 - vpxor %xmm5,%xmm4,%xmm4 - vpunpckhqdq %xmm8,%xmm8,%xmm9 - vpclmulqdq $0x11,%xmm3,%xmm10,%xmm10 - vpxor %xmm8,%xmm9,%xmm9 - vpxor %xmm11,%xmm10,%xmm10 - vpclmulqdq $0x00,%xmm15,%xmm2,%xmm2 - vpxor %xmm1,%xmm2,%xmm2 - - vpclmulqdq $0x00,%xmm0,%xmm8,%xmm5 - vpclmulqdq $0x11,%xmm0,%xmm8,%xmm7 - vpxor %xmm4,%xmm5,%xmm5 - vpclmulqdq $0x10,%xmm15,%xmm9,%xmm6 - vpxor %xmm10,%xmm7,%xmm7 - vpxor %xmm2,%xmm6,%xmm6 - - vpxor %xmm5,%xmm7,%xmm4 - vpxor %xmm4,%xmm6,%xmm6 - vpslldq $8,%xmm6,%xmm1 - vmovdqu 16(%r11),%xmm3 - vpsrldq $8,%xmm6,%xmm6 - vpxor %xmm1,%xmm5,%xmm8 - vpxor %xmm6,%xmm7,%xmm7 - - vpalignr $8,%xmm8,%xmm8,%xmm2 - vpclmulqdq $0x10,%xmm3,%xmm8,%xmm8 - vpxor %xmm2,%xmm8,%xmm8 - - vpalignr $8,%xmm8,%xmm8,%xmm2 - vpclmulqdq 
$0x10,%xmm3,%xmm8,%xmm8 - vpxor %xmm7,%xmm2,%xmm2 - vpxor %xmm2,%xmm8,%xmm8 - vpshufb (%r11),%xmm8,%xmm8 - vmovdqu %xmm8,-64(%r9) - - vzeroupper - movq -48(%rax),%r15 -.cfi_restore %r15 - movq -40(%rax),%r14 -.cfi_restore %r14 - movq -32(%rax),%r13 -.cfi_restore %r13 - movq -24(%rax),%r12 -.cfi_restore %r12 - movq -16(%rax),%rbp -.cfi_restore %rbp - movq -8(%rax),%rbx -.cfi_restore %rbx - leaq (%rax),%rsp -.cfi_def_cfa_register %rsp -.Lgcm_enc_abort: - movq %r10,%rax - .byte 0xf3,0xc3 -.cfi_endproc -.size aesni_gcm_encrypt,.-aesni_gcm_encrypt -.align 64 -.Lbswap_mask: -.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 -.Lpoly: -.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 -.Lone_msb: -.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 -.Ltwo_lsb: -.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -.Lone_lsb: -.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 -.byte 65,69,83,45,78,73,32,71,67,77,32,109,111,100,117,108,101,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 64 -#endif -.section .note.GNU-stack,"",@progbits diff --git a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aesni-x86_64.S b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aesni-x86_64.S deleted file mode 100644 index 2d4654f8..00000000 --- a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/aesni-x86_64.S +++ /dev/null @@ -1,2506 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text -.extern OPENSSL_ia32cap_P -.hidden OPENSSL_ia32cap_P -.globl aes_hw_encrypt -.hidden aes_hw_encrypt -.type aes_hw_encrypt,@function -.align 16 -aes_hw_encrypt: -.cfi_startproc -#ifdef BORINGSSL_DISPATCH_TEST -.extern BORINGSSL_function_hit -.hidden BORINGSSL_function_hit - movb $1,BORINGSSL_function_hit+1(%rip) -#endif - movups (%rdi),%xmm2 - movl 240(%rdx),%eax - movups (%rdx),%xmm0 - movups 16(%rdx),%xmm1 - leaq 32(%rdx),%rdx - xorps %xmm0,%xmm2 -.Loop_enc1_1: -.byte 102,15,56,220,209 - decl %eax - movups (%rdx),%xmm1 - leaq 16(%rdx),%rdx - jnz .Loop_enc1_1 -.byte 102,15,56,221,209 - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - movups %xmm2,(%rsi) - pxor %xmm2,%xmm2 - .byte 0xf3,0xc3 -.cfi_endproc -.size aes_hw_encrypt,.-aes_hw_encrypt - -.globl aes_hw_decrypt -.hidden aes_hw_decrypt -.type aes_hw_decrypt,@function -.align 16 -aes_hw_decrypt: -.cfi_startproc - movups (%rdi),%xmm2 - movl 240(%rdx),%eax - movups (%rdx),%xmm0 - movups 16(%rdx),%xmm1 - leaq 32(%rdx),%rdx - xorps %xmm0,%xmm2 -.Loop_dec1_2: -.byte 102,15,56,222,209 - decl %eax - movups (%rdx),%xmm1 - leaq 16(%rdx),%rdx - jnz .Loop_dec1_2 -.byte 102,15,56,223,209 - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - movups %xmm2,(%rsi) - pxor %xmm2,%xmm2 - .byte 0xf3,0xc3 -.cfi_endproc -.size aes_hw_decrypt, .-aes_hw_decrypt -.type _aesni_encrypt2,@function -.align 16 -_aesni_encrypt2: -.cfi_startproc - movups (%rcx),%xmm0 - shll $4,%eax - movups 16(%rcx),%xmm1 - xorps %xmm0,%xmm2 - xorps %xmm0,%xmm3 - movups 32(%rcx),%xmm0 - leaq 32(%rcx,%rax,1),%rcx - negq %rax - addq $16,%rax - -.Lenc_loop2: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 - movups -16(%rcx,%rax,1),%xmm0 - jnz 
.Lenc_loop2 - -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,221,208 -.byte 102,15,56,221,216 - .byte 0xf3,0xc3 -.cfi_endproc -.size _aesni_encrypt2,.-_aesni_encrypt2 -.type _aesni_decrypt2,@function -.align 16 -_aesni_decrypt2: -.cfi_startproc - movups (%rcx),%xmm0 - shll $4,%eax - movups 16(%rcx),%xmm1 - xorps %xmm0,%xmm2 - xorps %xmm0,%xmm3 - movups 32(%rcx),%xmm0 - leaq 32(%rcx,%rax,1),%rcx - negq %rax - addq $16,%rax - -.Ldec_loop2: -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 - movups -16(%rcx,%rax,1),%xmm0 - jnz .Ldec_loop2 - -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,223,208 -.byte 102,15,56,223,216 - .byte 0xf3,0xc3 -.cfi_endproc -.size _aesni_decrypt2,.-_aesni_decrypt2 -.type _aesni_encrypt3,@function -.align 16 -_aesni_encrypt3: -.cfi_startproc - movups (%rcx),%xmm0 - shll $4,%eax - movups 16(%rcx),%xmm1 - xorps %xmm0,%xmm2 - xorps %xmm0,%xmm3 - xorps %xmm0,%xmm4 - movups 32(%rcx),%xmm0 - leaq 32(%rcx,%rax,1),%rcx - negq %rax - addq $16,%rax - -.Lenc_loop3: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 - movups -16(%rcx,%rax,1),%xmm0 - jnz .Lenc_loop3 - -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,221,208 -.byte 102,15,56,221,216 -.byte 102,15,56,221,224 - .byte 0xf3,0xc3 -.cfi_endproc -.size _aesni_encrypt3,.-_aesni_encrypt3 -.type _aesni_decrypt3,@function -.align 16 -_aesni_decrypt3: -.cfi_startproc - movups (%rcx),%xmm0 - shll $4,%eax - movups 16(%rcx),%xmm1 - xorps %xmm0,%xmm2 - xorps %xmm0,%xmm3 - xorps %xmm0,%xmm4 - movups 32(%rcx),%xmm0 - leaq 32(%rcx,%rax,1),%rcx - negq %rax - addq $16,%rax - -.Ldec_loop3: -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 - movups (%rcx,%rax,1),%xmm1 - 
addq $32,%rax -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 - movups -16(%rcx,%rax,1),%xmm0 - jnz .Ldec_loop3 - -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,223,208 -.byte 102,15,56,223,216 -.byte 102,15,56,223,224 - .byte 0xf3,0xc3 -.cfi_endproc -.size _aesni_decrypt3,.-_aesni_decrypt3 -.type _aesni_encrypt4,@function -.align 16 -_aesni_encrypt4: -.cfi_startproc - movups (%rcx),%xmm0 - shll $4,%eax - movups 16(%rcx),%xmm1 - xorps %xmm0,%xmm2 - xorps %xmm0,%xmm3 - xorps %xmm0,%xmm4 - xorps %xmm0,%xmm5 - movups 32(%rcx),%xmm0 - leaq 32(%rcx,%rax,1),%rcx - negq %rax -.byte 0x0f,0x1f,0x00 - addq $16,%rax - -.Lenc_loop4: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 - movups -16(%rcx,%rax,1),%xmm0 - jnz .Lenc_loop4 - -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,221,208 -.byte 102,15,56,221,216 -.byte 102,15,56,221,224 -.byte 102,15,56,221,232 - .byte 0xf3,0xc3 -.cfi_endproc -.size _aesni_encrypt4,.-_aesni_encrypt4 -.type _aesni_decrypt4,@function -.align 16 -_aesni_decrypt4: -.cfi_startproc - movups (%rcx),%xmm0 - shll $4,%eax - movups 16(%rcx),%xmm1 - xorps %xmm0,%xmm2 - xorps %xmm0,%xmm3 - xorps %xmm0,%xmm4 - xorps %xmm0,%xmm5 - movups 32(%rcx),%xmm0 - leaq 32(%rcx,%rax,1),%rcx - negq %rax -.byte 0x0f,0x1f,0x00 - addq $16,%rax - -.Ldec_loop4: -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 - movups -16(%rcx,%rax,1),%xmm0 - jnz .Ldec_loop4 - -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 
102,15,56,222,233 -.byte 102,15,56,223,208 -.byte 102,15,56,223,216 -.byte 102,15,56,223,224 -.byte 102,15,56,223,232 - .byte 0xf3,0xc3 -.cfi_endproc -.size _aesni_decrypt4,.-_aesni_decrypt4 -.type _aesni_encrypt6,@function -.align 16 -_aesni_encrypt6: -.cfi_startproc - movups (%rcx),%xmm0 - shll $4,%eax - movups 16(%rcx),%xmm1 - xorps %xmm0,%xmm2 - pxor %xmm0,%xmm3 - pxor %xmm0,%xmm4 -.byte 102,15,56,220,209 - leaq 32(%rcx,%rax,1),%rcx - negq %rax -.byte 102,15,56,220,217 - pxor %xmm0,%xmm5 - pxor %xmm0,%xmm6 -.byte 102,15,56,220,225 - pxor %xmm0,%xmm7 - movups (%rcx,%rax,1),%xmm0 - addq $16,%rax - jmp .Lenc_loop6_enter -.align 16 -.Lenc_loop6: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.Lenc_loop6_enter: -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 -.byte 102,15,56,220,240 -.byte 102,15,56,220,248 - movups -16(%rcx,%rax,1),%xmm0 - jnz .Lenc_loop6 - -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 -.byte 102,15,56,221,208 -.byte 102,15,56,221,216 -.byte 102,15,56,221,224 -.byte 102,15,56,221,232 -.byte 102,15,56,221,240 -.byte 102,15,56,221,248 - .byte 0xf3,0xc3 -.cfi_endproc -.size _aesni_encrypt6,.-_aesni_encrypt6 -.type _aesni_decrypt6,@function -.align 16 -_aesni_decrypt6: -.cfi_startproc - movups (%rcx),%xmm0 - shll $4,%eax - movups 16(%rcx),%xmm1 - xorps %xmm0,%xmm2 - pxor %xmm0,%xmm3 - pxor %xmm0,%xmm4 -.byte 102,15,56,222,209 - leaq 32(%rcx,%rax,1),%rcx - negq %rax -.byte 102,15,56,222,217 - pxor %xmm0,%xmm5 - pxor %xmm0,%xmm6 -.byte 102,15,56,222,225 - pxor %xmm0,%xmm7 - movups (%rcx,%rax,1),%xmm0 - addq $16,%rax - jmp .Ldec_loop6_enter -.align 16 -.Ldec_loop6: -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 
-.Ldec_loop6_enter: -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 - movups -16(%rcx,%rax,1),%xmm0 - jnz .Ldec_loop6 - -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 -.byte 102,15,56,223,208 -.byte 102,15,56,223,216 -.byte 102,15,56,223,224 -.byte 102,15,56,223,232 -.byte 102,15,56,223,240 -.byte 102,15,56,223,248 - .byte 0xf3,0xc3 -.cfi_endproc -.size _aesni_decrypt6,.-_aesni_decrypt6 -.type _aesni_encrypt8,@function -.align 16 -_aesni_encrypt8: -.cfi_startproc - movups (%rcx),%xmm0 - shll $4,%eax - movups 16(%rcx),%xmm1 - xorps %xmm0,%xmm2 - xorps %xmm0,%xmm3 - pxor %xmm0,%xmm4 - pxor %xmm0,%xmm5 - pxor %xmm0,%xmm6 - leaq 32(%rcx,%rax,1),%rcx - negq %rax -.byte 102,15,56,220,209 - pxor %xmm0,%xmm7 - pxor %xmm0,%xmm8 -.byte 102,15,56,220,217 - pxor %xmm0,%xmm9 - movups (%rcx,%rax,1),%xmm0 - addq $16,%rax - jmp .Lenc_loop8_inner -.align 16 -.Lenc_loop8: -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.Lenc_loop8_inner: -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 -.byte 102,68,15,56,220,193 -.byte 102,68,15,56,220,201 -.Lenc_loop8_enter: - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 -.byte 102,15,56,220,240 -.byte 102,15,56,220,248 -.byte 102,68,15,56,220,192 -.byte 102,68,15,56,220,200 - movups -16(%rcx,%rax,1),%xmm0 - jnz .Lenc_loop8 - -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 -.byte 102,68,15,56,220,193 -.byte 102,68,15,56,220,201 -.byte 102,15,56,221,208 -.byte 102,15,56,221,216 
-.byte 102,15,56,221,224 -.byte 102,15,56,221,232 -.byte 102,15,56,221,240 -.byte 102,15,56,221,248 -.byte 102,68,15,56,221,192 -.byte 102,68,15,56,221,200 - .byte 0xf3,0xc3 -.cfi_endproc -.size _aesni_encrypt8,.-_aesni_encrypt8 -.type _aesni_decrypt8,@function -.align 16 -_aesni_decrypt8: -.cfi_startproc - movups (%rcx),%xmm0 - shll $4,%eax - movups 16(%rcx),%xmm1 - xorps %xmm0,%xmm2 - xorps %xmm0,%xmm3 - pxor %xmm0,%xmm4 - pxor %xmm0,%xmm5 - pxor %xmm0,%xmm6 - leaq 32(%rcx,%rax,1),%rcx - negq %rax -.byte 102,15,56,222,209 - pxor %xmm0,%xmm7 - pxor %xmm0,%xmm8 -.byte 102,15,56,222,217 - pxor %xmm0,%xmm9 - movups (%rcx,%rax,1),%xmm0 - addq $16,%rax - jmp .Ldec_loop8_inner -.align 16 -.Ldec_loop8: -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.Ldec_loop8_inner: -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 -.byte 102,68,15,56,222,193 -.byte 102,68,15,56,222,201 -.Ldec_loop8_enter: - movups (%rcx,%rax,1),%xmm1 - addq $32,%rax -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 -.byte 102,68,15,56,222,192 -.byte 102,68,15,56,222,200 - movups -16(%rcx,%rax,1),%xmm0 - jnz .Ldec_loop8 - -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 -.byte 102,68,15,56,222,193 -.byte 102,68,15,56,222,201 -.byte 102,15,56,223,208 -.byte 102,15,56,223,216 -.byte 102,15,56,223,224 -.byte 102,15,56,223,232 -.byte 102,15,56,223,240 -.byte 102,15,56,223,248 -.byte 102,68,15,56,223,192 -.byte 102,68,15,56,223,200 - .byte 0xf3,0xc3 -.cfi_endproc -.size _aesni_decrypt8,.-_aesni_decrypt8 -.globl aes_hw_ecb_encrypt -.hidden aes_hw_ecb_encrypt -.type aes_hw_ecb_encrypt,@function -.align 16 -aes_hw_ecb_encrypt: -.cfi_startproc - andq $-16,%rdx - jz .Lecb_ret - - movl 240(%rcx),%eax - movups (%rcx),%xmm0 - movq %rcx,%r11 - movl %eax,%r10d - testl 
%r8d,%r8d - jz .Lecb_decrypt - - cmpq $0x80,%rdx - jb .Lecb_enc_tail - - movdqu (%rdi),%xmm2 - movdqu 16(%rdi),%xmm3 - movdqu 32(%rdi),%xmm4 - movdqu 48(%rdi),%xmm5 - movdqu 64(%rdi),%xmm6 - movdqu 80(%rdi),%xmm7 - movdqu 96(%rdi),%xmm8 - movdqu 112(%rdi),%xmm9 - leaq 128(%rdi),%rdi - subq $0x80,%rdx - jmp .Lecb_enc_loop8_enter -.align 16 -.Lecb_enc_loop8: - movups %xmm2,(%rsi) - movq %r11,%rcx - movdqu (%rdi),%xmm2 - movl %r10d,%eax - movups %xmm3,16(%rsi) - movdqu 16(%rdi),%xmm3 - movups %xmm4,32(%rsi) - movdqu 32(%rdi),%xmm4 - movups %xmm5,48(%rsi) - movdqu 48(%rdi),%xmm5 - movups %xmm6,64(%rsi) - movdqu 64(%rdi),%xmm6 - movups %xmm7,80(%rsi) - movdqu 80(%rdi),%xmm7 - movups %xmm8,96(%rsi) - movdqu 96(%rdi),%xmm8 - movups %xmm9,112(%rsi) - leaq 128(%rsi),%rsi - movdqu 112(%rdi),%xmm9 - leaq 128(%rdi),%rdi -.Lecb_enc_loop8_enter: - - call _aesni_encrypt8 - - subq $0x80,%rdx - jnc .Lecb_enc_loop8 - - movups %xmm2,(%rsi) - movq %r11,%rcx - movups %xmm3,16(%rsi) - movl %r10d,%eax - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - movups %xmm6,64(%rsi) - movups %xmm7,80(%rsi) - movups %xmm8,96(%rsi) - movups %xmm9,112(%rsi) - leaq 128(%rsi),%rsi - addq $0x80,%rdx - jz .Lecb_ret - -.Lecb_enc_tail: - movups (%rdi),%xmm2 - cmpq $0x20,%rdx - jb .Lecb_enc_one - movups 16(%rdi),%xmm3 - je .Lecb_enc_two - movups 32(%rdi),%xmm4 - cmpq $0x40,%rdx - jb .Lecb_enc_three - movups 48(%rdi),%xmm5 - je .Lecb_enc_four - movups 64(%rdi),%xmm6 - cmpq $0x60,%rdx - jb .Lecb_enc_five - movups 80(%rdi),%xmm7 - je .Lecb_enc_six - movdqu 96(%rdi),%xmm8 - xorps %xmm9,%xmm9 - call _aesni_encrypt8 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - movups %xmm6,64(%rsi) - movups %xmm7,80(%rsi) - movups %xmm8,96(%rsi) - jmp .Lecb_ret -.align 16 -.Lecb_enc_one: - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -.Loop_enc1_3: -.byte 102,15,56,220,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz 
.Loop_enc1_3 -.byte 102,15,56,221,209 - movups %xmm2,(%rsi) - jmp .Lecb_ret -.align 16 -.Lecb_enc_two: - call _aesni_encrypt2 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - jmp .Lecb_ret -.align 16 -.Lecb_enc_three: - call _aesni_encrypt3 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - jmp .Lecb_ret -.align 16 -.Lecb_enc_four: - call _aesni_encrypt4 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - jmp .Lecb_ret -.align 16 -.Lecb_enc_five: - xorps %xmm7,%xmm7 - call _aesni_encrypt6 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - movups %xmm6,64(%rsi) - jmp .Lecb_ret -.align 16 -.Lecb_enc_six: - call _aesni_encrypt6 - movups %xmm2,(%rsi) - movups %xmm3,16(%rsi) - movups %xmm4,32(%rsi) - movups %xmm5,48(%rsi) - movups %xmm6,64(%rsi) - movups %xmm7,80(%rsi) - jmp .Lecb_ret - -.align 16 -.Lecb_decrypt: - cmpq $0x80,%rdx - jb .Lecb_dec_tail - - movdqu (%rdi),%xmm2 - movdqu 16(%rdi),%xmm3 - movdqu 32(%rdi),%xmm4 - movdqu 48(%rdi),%xmm5 - movdqu 64(%rdi),%xmm6 - movdqu 80(%rdi),%xmm7 - movdqu 96(%rdi),%xmm8 - movdqu 112(%rdi),%xmm9 - leaq 128(%rdi),%rdi - subq $0x80,%rdx - jmp .Lecb_dec_loop8_enter -.align 16 -.Lecb_dec_loop8: - movups %xmm2,(%rsi) - movq %r11,%rcx - movdqu (%rdi),%xmm2 - movl %r10d,%eax - movups %xmm3,16(%rsi) - movdqu 16(%rdi),%xmm3 - movups %xmm4,32(%rsi) - movdqu 32(%rdi),%xmm4 - movups %xmm5,48(%rsi) - movdqu 48(%rdi),%xmm5 - movups %xmm6,64(%rsi) - movdqu 64(%rdi),%xmm6 - movups %xmm7,80(%rsi) - movdqu 80(%rdi),%xmm7 - movups %xmm8,96(%rsi) - movdqu 96(%rdi),%xmm8 - movups %xmm9,112(%rsi) - leaq 128(%rsi),%rsi - movdqu 112(%rdi),%xmm9 - leaq 128(%rdi),%rdi -.Lecb_dec_loop8_enter: - - call _aesni_decrypt8 - - movups (%r11),%xmm0 - subq $0x80,%rdx - jnc .Lecb_dec_loop8 - - movups %xmm2,(%rsi) - pxor %xmm2,%xmm2 - movq %r11,%rcx - movups %xmm3,16(%rsi) - pxor %xmm3,%xmm3 - movl %r10d,%eax - movups %xmm4,32(%rsi) - pxor %xmm4,%xmm4 - 
movups %xmm5,48(%rsi) - pxor %xmm5,%xmm5 - movups %xmm6,64(%rsi) - pxor %xmm6,%xmm6 - movups %xmm7,80(%rsi) - pxor %xmm7,%xmm7 - movups %xmm8,96(%rsi) - pxor %xmm8,%xmm8 - movups %xmm9,112(%rsi) - pxor %xmm9,%xmm9 - leaq 128(%rsi),%rsi - addq $0x80,%rdx - jz .Lecb_ret - -.Lecb_dec_tail: - movups (%rdi),%xmm2 - cmpq $0x20,%rdx - jb .Lecb_dec_one - movups 16(%rdi),%xmm3 - je .Lecb_dec_two - movups 32(%rdi),%xmm4 - cmpq $0x40,%rdx - jb .Lecb_dec_three - movups 48(%rdi),%xmm5 - je .Lecb_dec_four - movups 64(%rdi),%xmm6 - cmpq $0x60,%rdx - jb .Lecb_dec_five - movups 80(%rdi),%xmm7 - je .Lecb_dec_six - movups 96(%rdi),%xmm8 - movups (%rcx),%xmm0 - xorps %xmm9,%xmm9 - call _aesni_decrypt8 - movups %xmm2,(%rsi) - pxor %xmm2,%xmm2 - movups %xmm3,16(%rsi) - pxor %xmm3,%xmm3 - movups %xmm4,32(%rsi) - pxor %xmm4,%xmm4 - movups %xmm5,48(%rsi) - pxor %xmm5,%xmm5 - movups %xmm6,64(%rsi) - pxor %xmm6,%xmm6 - movups %xmm7,80(%rsi) - pxor %xmm7,%xmm7 - movups %xmm8,96(%rsi) - pxor %xmm8,%xmm8 - pxor %xmm9,%xmm9 - jmp .Lecb_ret -.align 16 -.Lecb_dec_one: - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -.Loop_dec1_4: -.byte 102,15,56,222,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz .Loop_dec1_4 -.byte 102,15,56,223,209 - movups %xmm2,(%rsi) - pxor %xmm2,%xmm2 - jmp .Lecb_ret -.align 16 -.Lecb_dec_two: - call _aesni_decrypt2 - movups %xmm2,(%rsi) - pxor %xmm2,%xmm2 - movups %xmm3,16(%rsi) - pxor %xmm3,%xmm3 - jmp .Lecb_ret -.align 16 -.Lecb_dec_three: - call _aesni_decrypt3 - movups %xmm2,(%rsi) - pxor %xmm2,%xmm2 - movups %xmm3,16(%rsi) - pxor %xmm3,%xmm3 - movups %xmm4,32(%rsi) - pxor %xmm4,%xmm4 - jmp .Lecb_ret -.align 16 -.Lecb_dec_four: - call _aesni_decrypt4 - movups %xmm2,(%rsi) - pxor %xmm2,%xmm2 - movups %xmm3,16(%rsi) - pxor %xmm3,%xmm3 - movups %xmm4,32(%rsi) - pxor %xmm4,%xmm4 - movups %xmm5,48(%rsi) - pxor %xmm5,%xmm5 - jmp .Lecb_ret -.align 16 -.Lecb_dec_five: - xorps %xmm7,%xmm7 - call _aesni_decrypt6 - movups 
%xmm2,(%rsi) - pxor %xmm2,%xmm2 - movups %xmm3,16(%rsi) - pxor %xmm3,%xmm3 - movups %xmm4,32(%rsi) - pxor %xmm4,%xmm4 - movups %xmm5,48(%rsi) - pxor %xmm5,%xmm5 - movups %xmm6,64(%rsi) - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - jmp .Lecb_ret -.align 16 -.Lecb_dec_six: - call _aesni_decrypt6 - movups %xmm2,(%rsi) - pxor %xmm2,%xmm2 - movups %xmm3,16(%rsi) - pxor %xmm3,%xmm3 - movups %xmm4,32(%rsi) - pxor %xmm4,%xmm4 - movups %xmm5,48(%rsi) - pxor %xmm5,%xmm5 - movups %xmm6,64(%rsi) - pxor %xmm6,%xmm6 - movups %xmm7,80(%rsi) - pxor %xmm7,%xmm7 - -.Lecb_ret: - xorps %xmm0,%xmm0 - pxor %xmm1,%xmm1 - .byte 0xf3,0xc3 -.cfi_endproc -.size aes_hw_ecb_encrypt,.-aes_hw_ecb_encrypt -.globl aes_hw_ctr32_encrypt_blocks -.hidden aes_hw_ctr32_encrypt_blocks -.type aes_hw_ctr32_encrypt_blocks,@function -.align 16 -aes_hw_ctr32_encrypt_blocks: -.cfi_startproc -#ifdef BORINGSSL_DISPATCH_TEST - movb $1,BORINGSSL_function_hit(%rip) -#endif - cmpq $1,%rdx - jne .Lctr32_bulk - - - - movups (%r8),%xmm2 - movups (%rdi),%xmm3 - movl 240(%rcx),%edx - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -.Loop_enc1_5: -.byte 102,15,56,220,209 - decl %edx - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz .Loop_enc1_5 -.byte 102,15,56,221,209 - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - xorps %xmm3,%xmm2 - pxor %xmm3,%xmm3 - movups %xmm2,(%rsi) - xorps %xmm2,%xmm2 - jmp .Lctr32_epilogue - -.align 16 -.Lctr32_bulk: - leaq (%rsp),%r11 -.cfi_def_cfa_register %r11 - pushq %rbp -.cfi_offset %rbp,-16 - subq $128,%rsp - andq $-16,%rsp - - - - - movdqu (%r8),%xmm2 - movdqu (%rcx),%xmm0 - movl 12(%r8),%r8d - pxor %xmm0,%xmm2 - movl 12(%rcx),%ebp - movdqa %xmm2,0(%rsp) - bswapl %r8d - movdqa %xmm2,%xmm3 - movdqa %xmm2,%xmm4 - movdqa %xmm2,%xmm5 - movdqa %xmm2,64(%rsp) - movdqa %xmm2,80(%rsp) - movdqa %xmm2,96(%rsp) - movq %rdx,%r10 - movdqa %xmm2,112(%rsp) - - leaq 1(%r8),%rax - leaq 2(%r8),%rdx - bswapl %eax - bswapl %edx - xorl %ebp,%eax - xorl %ebp,%edx -.byte 
102,15,58,34,216,3 - leaq 3(%r8),%rax - movdqa %xmm3,16(%rsp) -.byte 102,15,58,34,226,3 - bswapl %eax - movq %r10,%rdx - leaq 4(%r8),%r10 - movdqa %xmm4,32(%rsp) - xorl %ebp,%eax - bswapl %r10d -.byte 102,15,58,34,232,3 - xorl %ebp,%r10d - movdqa %xmm5,48(%rsp) - leaq 5(%r8),%r9 - movl %r10d,64+12(%rsp) - bswapl %r9d - leaq 6(%r8),%r10 - movl 240(%rcx),%eax - xorl %ebp,%r9d - bswapl %r10d - movl %r9d,80+12(%rsp) - xorl %ebp,%r10d - leaq 7(%r8),%r9 - movl %r10d,96+12(%rsp) - bswapl %r9d - leaq OPENSSL_ia32cap_P(%rip),%r10 - movl 4(%r10),%r10d - xorl %ebp,%r9d - andl $71303168,%r10d - movl %r9d,112+12(%rsp) - - movups 16(%rcx),%xmm1 - - movdqa 64(%rsp),%xmm6 - movdqa 80(%rsp),%xmm7 - - cmpq $8,%rdx - jb .Lctr32_tail - - subq $6,%rdx - cmpl $4194304,%r10d - je .Lctr32_6x - - leaq 128(%rcx),%rcx - subq $2,%rdx - jmp .Lctr32_loop8 - -.align 16 -.Lctr32_6x: - shll $4,%eax - movl $48,%r10d - bswapl %ebp - leaq 32(%rcx,%rax,1),%rcx - subq %rax,%r10 - jmp .Lctr32_loop6 - -.align 16 -.Lctr32_loop6: - addl $6,%r8d - movups -48(%rcx,%r10,1),%xmm0 -.byte 102,15,56,220,209 - movl %r8d,%eax - xorl %ebp,%eax -.byte 102,15,56,220,217 -.byte 0x0f,0x38,0xf1,0x44,0x24,12 - leal 1(%r8),%eax -.byte 102,15,56,220,225 - xorl %ebp,%eax -.byte 0x0f,0x38,0xf1,0x44,0x24,28 -.byte 102,15,56,220,233 - leal 2(%r8),%eax - xorl %ebp,%eax -.byte 102,15,56,220,241 -.byte 0x0f,0x38,0xf1,0x44,0x24,44 - leal 3(%r8),%eax -.byte 102,15,56,220,249 - movups -32(%rcx,%r10,1),%xmm1 - xorl %ebp,%eax - -.byte 102,15,56,220,208 -.byte 0x0f,0x38,0xf1,0x44,0x24,60 - leal 4(%r8),%eax -.byte 102,15,56,220,216 - xorl %ebp,%eax -.byte 0x0f,0x38,0xf1,0x44,0x24,76 -.byte 102,15,56,220,224 - leal 5(%r8),%eax - xorl %ebp,%eax -.byte 102,15,56,220,232 -.byte 0x0f,0x38,0xf1,0x44,0x24,92 - movq %r10,%rax -.byte 102,15,56,220,240 -.byte 102,15,56,220,248 - movups -16(%rcx,%r10,1),%xmm0 - - call .Lenc_loop6 - - movdqu (%rdi),%xmm8 - movdqu 16(%rdi),%xmm9 - movdqu 32(%rdi),%xmm10 - movdqu 48(%rdi),%xmm11 - movdqu 
64(%rdi),%xmm12 - movdqu 80(%rdi),%xmm13 - leaq 96(%rdi),%rdi - movups -64(%rcx,%r10,1),%xmm1 - pxor %xmm2,%xmm8 - movaps 0(%rsp),%xmm2 - pxor %xmm3,%xmm9 - movaps 16(%rsp),%xmm3 - pxor %xmm4,%xmm10 - movaps 32(%rsp),%xmm4 - pxor %xmm5,%xmm11 - movaps 48(%rsp),%xmm5 - pxor %xmm6,%xmm12 - movaps 64(%rsp),%xmm6 - pxor %xmm7,%xmm13 - movaps 80(%rsp),%xmm7 - movdqu %xmm8,(%rsi) - movdqu %xmm9,16(%rsi) - movdqu %xmm10,32(%rsi) - movdqu %xmm11,48(%rsi) - movdqu %xmm12,64(%rsi) - movdqu %xmm13,80(%rsi) - leaq 96(%rsi),%rsi - - subq $6,%rdx - jnc .Lctr32_loop6 - - addq $6,%rdx - jz .Lctr32_done - - leal -48(%r10),%eax - leaq -80(%rcx,%r10,1),%rcx - negl %eax - shrl $4,%eax - jmp .Lctr32_tail - -.align 32 -.Lctr32_loop8: - addl $8,%r8d - movdqa 96(%rsp),%xmm8 -.byte 102,15,56,220,209 - movl %r8d,%r9d - movdqa 112(%rsp),%xmm9 -.byte 102,15,56,220,217 - bswapl %r9d - movups 32-128(%rcx),%xmm0 -.byte 102,15,56,220,225 - xorl %ebp,%r9d - nop -.byte 102,15,56,220,233 - movl %r9d,0+12(%rsp) - leaq 1(%r8),%r9 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 -.byte 102,68,15,56,220,193 -.byte 102,68,15,56,220,201 - movups 48-128(%rcx),%xmm1 - bswapl %r9d -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 - xorl %ebp,%r9d -.byte 0x66,0x90 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 - movl %r9d,16+12(%rsp) - leaq 2(%r8),%r9 -.byte 102,15,56,220,240 -.byte 102,15,56,220,248 -.byte 102,68,15,56,220,192 -.byte 102,68,15,56,220,200 - movups 64-128(%rcx),%xmm0 - bswapl %r9d -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 - xorl %ebp,%r9d -.byte 0x66,0x90 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - movl %r9d,32+12(%rsp) - leaq 3(%r8),%r9 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 -.byte 102,68,15,56,220,193 -.byte 102,68,15,56,220,201 - movups 80-128(%rcx),%xmm1 - bswapl %r9d -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 - xorl %ebp,%r9d -.byte 0x66,0x90 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 - movl %r9d,48+12(%rsp) - leaq 4(%r8),%r9 -.byte 
102,15,56,220,240 -.byte 102,15,56,220,248 -.byte 102,68,15,56,220,192 -.byte 102,68,15,56,220,200 - movups 96-128(%rcx),%xmm0 - bswapl %r9d -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 - xorl %ebp,%r9d -.byte 0x66,0x90 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - movl %r9d,64+12(%rsp) - leaq 5(%r8),%r9 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 -.byte 102,68,15,56,220,193 -.byte 102,68,15,56,220,201 - movups 112-128(%rcx),%xmm1 - bswapl %r9d -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 - xorl %ebp,%r9d -.byte 0x66,0x90 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 - movl %r9d,80+12(%rsp) - leaq 6(%r8),%r9 -.byte 102,15,56,220,240 -.byte 102,15,56,220,248 -.byte 102,68,15,56,220,192 -.byte 102,68,15,56,220,200 - movups 128-128(%rcx),%xmm0 - bswapl %r9d -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 - xorl %ebp,%r9d -.byte 0x66,0x90 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - movl %r9d,96+12(%rsp) - leaq 7(%r8),%r9 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 -.byte 102,68,15,56,220,193 -.byte 102,68,15,56,220,201 - movups 144-128(%rcx),%xmm1 - bswapl %r9d -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 - xorl %ebp,%r9d - movdqu 0(%rdi),%xmm10 -.byte 102,15,56,220,232 - movl %r9d,112+12(%rsp) - cmpl $11,%eax -.byte 102,15,56,220,240 -.byte 102,15,56,220,248 -.byte 102,68,15,56,220,192 -.byte 102,68,15,56,220,200 - movups 160-128(%rcx),%xmm0 - - jb .Lctr32_enc_done - -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 -.byte 102,68,15,56,220,193 -.byte 102,68,15,56,220,201 - movups 176-128(%rcx),%xmm1 - -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 -.byte 102,15,56,220,240 -.byte 102,15,56,220,248 -.byte 102,68,15,56,220,192 -.byte 102,68,15,56,220,200 - movups 192-128(%rcx),%xmm0 - je .Lctr32_enc_done - -.byte 102,15,56,220,209 -.byte 
102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 -.byte 102,68,15,56,220,193 -.byte 102,68,15,56,220,201 - movups 208-128(%rcx),%xmm1 - -.byte 102,15,56,220,208 -.byte 102,15,56,220,216 -.byte 102,15,56,220,224 -.byte 102,15,56,220,232 -.byte 102,15,56,220,240 -.byte 102,15,56,220,248 -.byte 102,68,15,56,220,192 -.byte 102,68,15,56,220,200 - movups 224-128(%rcx),%xmm0 - jmp .Lctr32_enc_done - -.align 16 -.Lctr32_enc_done: - movdqu 16(%rdi),%xmm11 - pxor %xmm0,%xmm10 - movdqu 32(%rdi),%xmm12 - pxor %xmm0,%xmm11 - movdqu 48(%rdi),%xmm13 - pxor %xmm0,%xmm12 - movdqu 64(%rdi),%xmm14 - pxor %xmm0,%xmm13 - movdqu 80(%rdi),%xmm15 - pxor %xmm0,%xmm14 - pxor %xmm0,%xmm15 -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 -.byte 102,15,56,220,249 -.byte 102,68,15,56,220,193 -.byte 102,68,15,56,220,201 - movdqu 96(%rdi),%xmm1 - leaq 128(%rdi),%rdi - -.byte 102,65,15,56,221,210 - pxor %xmm0,%xmm1 - movdqu 112-128(%rdi),%xmm10 -.byte 102,65,15,56,221,219 - pxor %xmm0,%xmm10 - movdqa 0(%rsp),%xmm11 -.byte 102,65,15,56,221,228 -.byte 102,65,15,56,221,237 - movdqa 16(%rsp),%xmm12 - movdqa 32(%rsp),%xmm13 -.byte 102,65,15,56,221,246 -.byte 102,65,15,56,221,255 - movdqa 48(%rsp),%xmm14 - movdqa 64(%rsp),%xmm15 -.byte 102,68,15,56,221,193 - movdqa 80(%rsp),%xmm0 - movups 16-128(%rcx),%xmm1 -.byte 102,69,15,56,221,202 - - movups %xmm2,(%rsi) - movdqa %xmm11,%xmm2 - movups %xmm3,16(%rsi) - movdqa %xmm12,%xmm3 - movups %xmm4,32(%rsi) - movdqa %xmm13,%xmm4 - movups %xmm5,48(%rsi) - movdqa %xmm14,%xmm5 - movups %xmm6,64(%rsi) - movdqa %xmm15,%xmm6 - movups %xmm7,80(%rsi) - movdqa %xmm0,%xmm7 - movups %xmm8,96(%rsi) - movups %xmm9,112(%rsi) - leaq 128(%rsi),%rsi - - subq $8,%rdx - jnc .Lctr32_loop8 - - addq $8,%rdx - jz .Lctr32_done - leaq -128(%rcx),%rcx - -.Lctr32_tail: - - - leaq 16(%rcx),%rcx - cmpq $4,%rdx - jb .Lctr32_loop3 - je .Lctr32_loop4 
- - - shll $4,%eax - movdqa 96(%rsp),%xmm8 - pxor %xmm9,%xmm9 - - movups 16(%rcx),%xmm0 -.byte 102,15,56,220,209 -.byte 102,15,56,220,217 - leaq 32-16(%rcx,%rax,1),%rcx - negq %rax -.byte 102,15,56,220,225 - addq $16,%rax - movups (%rdi),%xmm10 -.byte 102,15,56,220,233 -.byte 102,15,56,220,241 - movups 16(%rdi),%xmm11 - movups 32(%rdi),%xmm12 -.byte 102,15,56,220,249 -.byte 102,68,15,56,220,193 - - call .Lenc_loop8_enter - - movdqu 48(%rdi),%xmm13 - pxor %xmm10,%xmm2 - movdqu 64(%rdi),%xmm10 - pxor %xmm11,%xmm3 - movdqu %xmm2,(%rsi) - pxor %xmm12,%xmm4 - movdqu %xmm3,16(%rsi) - pxor %xmm13,%xmm5 - movdqu %xmm4,32(%rsi) - pxor %xmm10,%xmm6 - movdqu %xmm5,48(%rsi) - movdqu %xmm6,64(%rsi) - cmpq $6,%rdx - jb .Lctr32_done - - movups 80(%rdi),%xmm11 - xorps %xmm11,%xmm7 - movups %xmm7,80(%rsi) - je .Lctr32_done - - movups 96(%rdi),%xmm12 - xorps %xmm12,%xmm8 - movups %xmm8,96(%rsi) - jmp .Lctr32_done - -.align 32 -.Lctr32_loop4: -.byte 102,15,56,220,209 - leaq 16(%rcx),%rcx - decl %eax -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 -.byte 102,15,56,220,233 - movups (%rcx),%xmm1 - jnz .Lctr32_loop4 -.byte 102,15,56,221,209 -.byte 102,15,56,221,217 - movups (%rdi),%xmm10 - movups 16(%rdi),%xmm11 -.byte 102,15,56,221,225 -.byte 102,15,56,221,233 - movups 32(%rdi),%xmm12 - movups 48(%rdi),%xmm13 - - xorps %xmm10,%xmm2 - movups %xmm2,(%rsi) - xorps %xmm11,%xmm3 - movups %xmm3,16(%rsi) - pxor %xmm12,%xmm4 - movdqu %xmm4,32(%rsi) - pxor %xmm13,%xmm5 - movdqu %xmm5,48(%rsi) - jmp .Lctr32_done - -.align 32 -.Lctr32_loop3: -.byte 102,15,56,220,209 - leaq 16(%rcx),%rcx - decl %eax -.byte 102,15,56,220,217 -.byte 102,15,56,220,225 - movups (%rcx),%xmm1 - jnz .Lctr32_loop3 -.byte 102,15,56,221,209 -.byte 102,15,56,221,217 -.byte 102,15,56,221,225 - - movups (%rdi),%xmm10 - xorps %xmm10,%xmm2 - movups %xmm2,(%rsi) - cmpq $2,%rdx - jb .Lctr32_done - - movups 16(%rdi),%xmm11 - xorps %xmm11,%xmm3 - movups %xmm3,16(%rsi) - je .Lctr32_done - - movups 32(%rdi),%xmm12 - xorps 
%xmm12,%xmm4 - movups %xmm4,32(%rsi) - -.Lctr32_done: - xorps %xmm0,%xmm0 - xorl %ebp,%ebp - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - movaps %xmm0,0(%rsp) - pxor %xmm8,%xmm8 - movaps %xmm0,16(%rsp) - pxor %xmm9,%xmm9 - movaps %xmm0,32(%rsp) - pxor %xmm10,%xmm10 - movaps %xmm0,48(%rsp) - pxor %xmm11,%xmm11 - movaps %xmm0,64(%rsp) - pxor %xmm12,%xmm12 - movaps %xmm0,80(%rsp) - pxor %xmm13,%xmm13 - movaps %xmm0,96(%rsp) - pxor %xmm14,%xmm14 - movaps %xmm0,112(%rsp) - pxor %xmm15,%xmm15 - movq -8(%r11),%rbp -.cfi_restore %rbp - leaq (%r11),%rsp -.cfi_def_cfa_register %rsp -.Lctr32_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size aes_hw_ctr32_encrypt_blocks,.-aes_hw_ctr32_encrypt_blocks -.globl aes_hw_cbc_encrypt -.hidden aes_hw_cbc_encrypt -.type aes_hw_cbc_encrypt,@function -.align 16 -aes_hw_cbc_encrypt: -.cfi_startproc - testq %rdx,%rdx - jz .Lcbc_ret - - movl 240(%rcx),%r10d - movq %rcx,%r11 - testl %r9d,%r9d - jz .Lcbc_decrypt - - movups (%r8),%xmm2 - movl %r10d,%eax - cmpq $16,%rdx - jb .Lcbc_enc_tail - subq $16,%rdx - jmp .Lcbc_enc_loop -.align 16 -.Lcbc_enc_loop: - movups (%rdi),%xmm3 - leaq 16(%rdi),%rdi - - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - xorps %xmm0,%xmm3 - leaq 32(%rcx),%rcx - xorps %xmm3,%xmm2 -.Loop_enc1_6: -.byte 102,15,56,220,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz .Loop_enc1_6 -.byte 102,15,56,221,209 - movl %r10d,%eax - movq %r11,%rcx - movups %xmm2,0(%rsi) - leaq 16(%rsi),%rsi - subq $16,%rdx - jnc .Lcbc_enc_loop - addq $16,%rdx - jnz .Lcbc_enc_tail - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - movups %xmm2,(%r8) - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - jmp .Lcbc_ret - -.Lcbc_enc_tail: - movq %rdx,%rcx - xchgq %rdi,%rsi -.long 0x9066A4F3 - movl $16,%ecx - subq %rdx,%rcx - xorl %eax,%eax -.long 0x9066AAF3 - leaq -16(%rdi),%rdi - movl %r10d,%eax - movq %rdi,%rsi - movq %r11,%rcx - xorq %rdx,%rdx - jmp .Lcbc_enc_loop - -.align 16 
-.Lcbc_decrypt: - cmpq $16,%rdx - jne .Lcbc_decrypt_bulk - - - - movdqu (%rdi),%xmm2 - movdqu (%r8),%xmm3 - movdqa %xmm2,%xmm4 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -.Loop_dec1_7: -.byte 102,15,56,222,209 - decl %r10d - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz .Loop_dec1_7 -.byte 102,15,56,223,209 - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - movdqu %xmm4,(%r8) - xorps %xmm3,%xmm2 - pxor %xmm3,%xmm3 - movups %xmm2,(%rsi) - pxor %xmm2,%xmm2 - jmp .Lcbc_ret -.align 16 -.Lcbc_decrypt_bulk: - leaq (%rsp),%r11 -.cfi_def_cfa_register %r11 - pushq %rbp -.cfi_offset %rbp,-16 - subq $16,%rsp - andq $-16,%rsp - movq %rcx,%rbp - movups (%r8),%xmm10 - movl %r10d,%eax - cmpq $0x50,%rdx - jbe .Lcbc_dec_tail - - movups (%rcx),%xmm0 - movdqu 0(%rdi),%xmm2 - movdqu 16(%rdi),%xmm3 - movdqa %xmm2,%xmm11 - movdqu 32(%rdi),%xmm4 - movdqa %xmm3,%xmm12 - movdqu 48(%rdi),%xmm5 - movdqa %xmm4,%xmm13 - movdqu 64(%rdi),%xmm6 - movdqa %xmm5,%xmm14 - movdqu 80(%rdi),%xmm7 - movdqa %xmm6,%xmm15 - leaq OPENSSL_ia32cap_P(%rip),%r9 - movl 4(%r9),%r9d - cmpq $0x70,%rdx - jbe .Lcbc_dec_six_or_seven - - andl $71303168,%r9d - subq $0x50,%rdx - cmpl $4194304,%r9d - je .Lcbc_dec_loop6_enter - subq $0x20,%rdx - leaq 112(%rcx),%rcx - jmp .Lcbc_dec_loop8_enter -.align 16 -.Lcbc_dec_loop8: - movups %xmm9,(%rsi) - leaq 16(%rsi),%rsi -.Lcbc_dec_loop8_enter: - movdqu 96(%rdi),%xmm8 - pxor %xmm0,%xmm2 - movdqu 112(%rdi),%xmm9 - pxor %xmm0,%xmm3 - movups 16-112(%rcx),%xmm1 - pxor %xmm0,%xmm4 - movq $-1,%rbp - cmpq $0x70,%rdx - pxor %xmm0,%xmm5 - pxor %xmm0,%xmm6 - pxor %xmm0,%xmm7 - pxor %xmm0,%xmm8 - -.byte 102,15,56,222,209 - pxor %xmm0,%xmm9 - movups 32-112(%rcx),%xmm0 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 -.byte 102,68,15,56,222,193 - adcq $0,%rbp - andq $128,%rbp -.byte 102,68,15,56,222,201 - addq %rdi,%rbp - movups 48-112(%rcx),%xmm1 -.byte 102,15,56,222,208 -.byte 
102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 -.byte 102,68,15,56,222,192 -.byte 102,68,15,56,222,200 - movups 64-112(%rcx),%xmm0 - nop -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 -.byte 102,68,15,56,222,193 -.byte 102,68,15,56,222,201 - movups 80-112(%rcx),%xmm1 - nop -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 -.byte 102,68,15,56,222,192 -.byte 102,68,15,56,222,200 - movups 96-112(%rcx),%xmm0 - nop -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 -.byte 102,68,15,56,222,193 -.byte 102,68,15,56,222,201 - movups 112-112(%rcx),%xmm1 - nop -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 -.byte 102,68,15,56,222,192 -.byte 102,68,15,56,222,200 - movups 128-112(%rcx),%xmm0 - nop -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 -.byte 102,68,15,56,222,193 -.byte 102,68,15,56,222,201 - movups 144-112(%rcx),%xmm1 - cmpl $11,%eax -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 -.byte 102,68,15,56,222,192 -.byte 102,68,15,56,222,200 - movups 160-112(%rcx),%xmm0 - jb .Lcbc_dec_done -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 -.byte 102,68,15,56,222,193 -.byte 102,68,15,56,222,201 - movups 176-112(%rcx),%xmm1 - nop -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 
102,15,56,222,232 -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 -.byte 102,68,15,56,222,192 -.byte 102,68,15,56,222,200 - movups 192-112(%rcx),%xmm0 - je .Lcbc_dec_done -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 -.byte 102,68,15,56,222,193 -.byte 102,68,15,56,222,201 - movups 208-112(%rcx),%xmm1 - nop -.byte 102,15,56,222,208 -.byte 102,15,56,222,216 -.byte 102,15,56,222,224 -.byte 102,15,56,222,232 -.byte 102,15,56,222,240 -.byte 102,15,56,222,248 -.byte 102,68,15,56,222,192 -.byte 102,68,15,56,222,200 - movups 224-112(%rcx),%xmm0 - jmp .Lcbc_dec_done -.align 16 -.Lcbc_dec_done: -.byte 102,15,56,222,209 -.byte 102,15,56,222,217 - pxor %xmm0,%xmm10 - pxor %xmm0,%xmm11 -.byte 102,15,56,222,225 -.byte 102,15,56,222,233 - pxor %xmm0,%xmm12 - pxor %xmm0,%xmm13 -.byte 102,15,56,222,241 -.byte 102,15,56,222,249 - pxor %xmm0,%xmm14 - pxor %xmm0,%xmm15 -.byte 102,68,15,56,222,193 -.byte 102,68,15,56,222,201 - movdqu 80(%rdi),%xmm1 - -.byte 102,65,15,56,223,210 - movdqu 96(%rdi),%xmm10 - pxor %xmm0,%xmm1 -.byte 102,65,15,56,223,219 - pxor %xmm0,%xmm10 - movdqu 112(%rdi),%xmm0 -.byte 102,65,15,56,223,228 - leaq 128(%rdi),%rdi - movdqu 0(%rbp),%xmm11 -.byte 102,65,15,56,223,237 -.byte 102,65,15,56,223,246 - movdqu 16(%rbp),%xmm12 - movdqu 32(%rbp),%xmm13 -.byte 102,65,15,56,223,255 -.byte 102,68,15,56,223,193 - movdqu 48(%rbp),%xmm14 - movdqu 64(%rbp),%xmm15 -.byte 102,69,15,56,223,202 - movdqa %xmm0,%xmm10 - movdqu 80(%rbp),%xmm1 - movups -112(%rcx),%xmm0 - - movups %xmm2,(%rsi) - movdqa %xmm11,%xmm2 - movups %xmm3,16(%rsi) - movdqa %xmm12,%xmm3 - movups %xmm4,32(%rsi) - movdqa %xmm13,%xmm4 - movups %xmm5,48(%rsi) - movdqa %xmm14,%xmm5 - movups %xmm6,64(%rsi) - movdqa %xmm15,%xmm6 - movups %xmm7,80(%rsi) - movdqa %xmm1,%xmm7 - movups %xmm8,96(%rsi) - leaq 112(%rsi),%rsi - - subq $0x80,%rdx - ja .Lcbc_dec_loop8 - - movaps %xmm9,%xmm2 - leaq -112(%rcx),%rcx - addq 
$0x70,%rdx - jle .Lcbc_dec_clear_tail_collected - movups %xmm9,(%rsi) - leaq 16(%rsi),%rsi - cmpq $0x50,%rdx - jbe .Lcbc_dec_tail - - movaps %xmm11,%xmm2 -.Lcbc_dec_six_or_seven: - cmpq $0x60,%rdx - ja .Lcbc_dec_seven - - movaps %xmm7,%xmm8 - call _aesni_decrypt6 - pxor %xmm10,%xmm2 - movaps %xmm8,%xmm10 - pxor %xmm11,%xmm3 - movdqu %xmm2,(%rsi) - pxor %xmm12,%xmm4 - movdqu %xmm3,16(%rsi) - pxor %xmm3,%xmm3 - pxor %xmm13,%xmm5 - movdqu %xmm4,32(%rsi) - pxor %xmm4,%xmm4 - pxor %xmm14,%xmm6 - movdqu %xmm5,48(%rsi) - pxor %xmm5,%xmm5 - pxor %xmm15,%xmm7 - movdqu %xmm6,64(%rsi) - pxor %xmm6,%xmm6 - leaq 80(%rsi),%rsi - movdqa %xmm7,%xmm2 - pxor %xmm7,%xmm7 - jmp .Lcbc_dec_tail_collected - -.align 16 -.Lcbc_dec_seven: - movups 96(%rdi),%xmm8 - xorps %xmm9,%xmm9 - call _aesni_decrypt8 - movups 80(%rdi),%xmm9 - pxor %xmm10,%xmm2 - movups 96(%rdi),%xmm10 - pxor %xmm11,%xmm3 - movdqu %xmm2,(%rsi) - pxor %xmm12,%xmm4 - movdqu %xmm3,16(%rsi) - pxor %xmm3,%xmm3 - pxor %xmm13,%xmm5 - movdqu %xmm4,32(%rsi) - pxor %xmm4,%xmm4 - pxor %xmm14,%xmm6 - movdqu %xmm5,48(%rsi) - pxor %xmm5,%xmm5 - pxor %xmm15,%xmm7 - movdqu %xmm6,64(%rsi) - pxor %xmm6,%xmm6 - pxor %xmm9,%xmm8 - movdqu %xmm7,80(%rsi) - pxor %xmm7,%xmm7 - leaq 96(%rsi),%rsi - movdqa %xmm8,%xmm2 - pxor %xmm8,%xmm8 - pxor %xmm9,%xmm9 - jmp .Lcbc_dec_tail_collected - -.align 16 -.Lcbc_dec_loop6: - movups %xmm7,(%rsi) - leaq 16(%rsi),%rsi - movdqu 0(%rdi),%xmm2 - movdqu 16(%rdi),%xmm3 - movdqa %xmm2,%xmm11 - movdqu 32(%rdi),%xmm4 - movdqa %xmm3,%xmm12 - movdqu 48(%rdi),%xmm5 - movdqa %xmm4,%xmm13 - movdqu 64(%rdi),%xmm6 - movdqa %xmm5,%xmm14 - movdqu 80(%rdi),%xmm7 - movdqa %xmm6,%xmm15 -.Lcbc_dec_loop6_enter: - leaq 96(%rdi),%rdi - movdqa %xmm7,%xmm8 - - call _aesni_decrypt6 - - pxor %xmm10,%xmm2 - movdqa %xmm8,%xmm10 - pxor %xmm11,%xmm3 - movdqu %xmm2,(%rsi) - pxor %xmm12,%xmm4 - movdqu %xmm3,16(%rsi) - pxor %xmm13,%xmm5 - movdqu %xmm4,32(%rsi) - pxor %xmm14,%xmm6 - movq %rbp,%rcx - movdqu %xmm5,48(%rsi) - pxor %xmm15,%xmm7 
- movl %r10d,%eax - movdqu %xmm6,64(%rsi) - leaq 80(%rsi),%rsi - subq $0x60,%rdx - ja .Lcbc_dec_loop6 - - movdqa %xmm7,%xmm2 - addq $0x50,%rdx - jle .Lcbc_dec_clear_tail_collected - movups %xmm7,(%rsi) - leaq 16(%rsi),%rsi - -.Lcbc_dec_tail: - movups (%rdi),%xmm2 - subq $0x10,%rdx - jbe .Lcbc_dec_one - - movups 16(%rdi),%xmm3 - movaps %xmm2,%xmm11 - subq $0x10,%rdx - jbe .Lcbc_dec_two - - movups 32(%rdi),%xmm4 - movaps %xmm3,%xmm12 - subq $0x10,%rdx - jbe .Lcbc_dec_three - - movups 48(%rdi),%xmm5 - movaps %xmm4,%xmm13 - subq $0x10,%rdx - jbe .Lcbc_dec_four - - movups 64(%rdi),%xmm6 - movaps %xmm5,%xmm14 - movaps %xmm6,%xmm15 - xorps %xmm7,%xmm7 - call _aesni_decrypt6 - pxor %xmm10,%xmm2 - movaps %xmm15,%xmm10 - pxor %xmm11,%xmm3 - movdqu %xmm2,(%rsi) - pxor %xmm12,%xmm4 - movdqu %xmm3,16(%rsi) - pxor %xmm3,%xmm3 - pxor %xmm13,%xmm5 - movdqu %xmm4,32(%rsi) - pxor %xmm4,%xmm4 - pxor %xmm14,%xmm6 - movdqu %xmm5,48(%rsi) - pxor %xmm5,%xmm5 - leaq 64(%rsi),%rsi - movdqa %xmm6,%xmm2 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - subq $0x10,%rdx - jmp .Lcbc_dec_tail_collected - -.align 16 -.Lcbc_dec_one: - movaps %xmm2,%xmm11 - movups (%rcx),%xmm0 - movups 16(%rcx),%xmm1 - leaq 32(%rcx),%rcx - xorps %xmm0,%xmm2 -.Loop_dec1_8: -.byte 102,15,56,222,209 - decl %eax - movups (%rcx),%xmm1 - leaq 16(%rcx),%rcx - jnz .Loop_dec1_8 -.byte 102,15,56,223,209 - xorps %xmm10,%xmm2 - movaps %xmm11,%xmm10 - jmp .Lcbc_dec_tail_collected -.align 16 -.Lcbc_dec_two: - movaps %xmm3,%xmm12 - call _aesni_decrypt2 - pxor %xmm10,%xmm2 - movaps %xmm12,%xmm10 - pxor %xmm11,%xmm3 - movdqu %xmm2,(%rsi) - movdqa %xmm3,%xmm2 - pxor %xmm3,%xmm3 - leaq 16(%rsi),%rsi - jmp .Lcbc_dec_tail_collected -.align 16 -.Lcbc_dec_three: - movaps %xmm4,%xmm13 - call _aesni_decrypt3 - pxor %xmm10,%xmm2 - movaps %xmm13,%xmm10 - pxor %xmm11,%xmm3 - movdqu %xmm2,(%rsi) - pxor %xmm12,%xmm4 - movdqu %xmm3,16(%rsi) - pxor %xmm3,%xmm3 - movdqa %xmm4,%xmm2 - pxor %xmm4,%xmm4 - leaq 32(%rsi),%rsi - jmp .Lcbc_dec_tail_collected 
-.align 16 -.Lcbc_dec_four: - movaps %xmm5,%xmm14 - call _aesni_decrypt4 - pxor %xmm10,%xmm2 - movaps %xmm14,%xmm10 - pxor %xmm11,%xmm3 - movdqu %xmm2,(%rsi) - pxor %xmm12,%xmm4 - movdqu %xmm3,16(%rsi) - pxor %xmm3,%xmm3 - pxor %xmm13,%xmm5 - movdqu %xmm4,32(%rsi) - pxor %xmm4,%xmm4 - movdqa %xmm5,%xmm2 - pxor %xmm5,%xmm5 - leaq 48(%rsi),%rsi - jmp .Lcbc_dec_tail_collected - -.align 16 -.Lcbc_dec_clear_tail_collected: - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - pxor %xmm8,%xmm8 - pxor %xmm9,%xmm9 -.Lcbc_dec_tail_collected: - movups %xmm10,(%r8) - andq $15,%rdx - jnz .Lcbc_dec_tail_partial - movups %xmm2,(%rsi) - pxor %xmm2,%xmm2 - jmp .Lcbc_dec_ret -.align 16 -.Lcbc_dec_tail_partial: - movaps %xmm2,(%rsp) - pxor %xmm2,%xmm2 - movq $16,%rcx - movq %rsi,%rdi - subq %rdx,%rcx - leaq (%rsp),%rsi -.long 0x9066A4F3 - movdqa %xmm2,(%rsp) - -.Lcbc_dec_ret: - xorps %xmm0,%xmm0 - pxor %xmm1,%xmm1 - movq -8(%r11),%rbp -.cfi_restore %rbp - leaq (%r11),%rsp -.cfi_def_cfa_register %rsp -.Lcbc_ret: - .byte 0xf3,0xc3 -.cfi_endproc -.size aes_hw_cbc_encrypt,.-aes_hw_cbc_encrypt -.globl aes_hw_set_decrypt_key -.hidden aes_hw_set_decrypt_key -.type aes_hw_set_decrypt_key,@function -.align 16 -aes_hw_set_decrypt_key: -.cfi_startproc -.byte 0x48,0x83,0xEC,0x08 -.cfi_adjust_cfa_offset 8 - call __aesni_set_encrypt_key - shll $4,%esi - testl %eax,%eax - jnz .Ldec_key_ret - leaq 16(%rdx,%rsi,1),%rdi - - movups (%rdx),%xmm0 - movups (%rdi),%xmm1 - movups %xmm0,(%rdi) - movups %xmm1,(%rdx) - leaq 16(%rdx),%rdx - leaq -16(%rdi),%rdi - -.Ldec_key_inverse: - movups (%rdx),%xmm0 - movups (%rdi),%xmm1 -.byte 102,15,56,219,192 -.byte 102,15,56,219,201 - leaq 16(%rdx),%rdx - leaq -16(%rdi),%rdi - movups %xmm0,16(%rdi) - movups %xmm1,-16(%rdx) - cmpq %rdx,%rdi - ja .Ldec_key_inverse - - movups (%rdx),%xmm0 -.byte 102,15,56,219,192 - pxor %xmm1,%xmm1 - movups %xmm0,(%rdi) - pxor %xmm0,%xmm0 -.Ldec_key_ret: - addq $8,%rsp -.cfi_adjust_cfa_offset -8 
- .byte 0xf3,0xc3 -.cfi_endproc -.LSEH_end_set_decrypt_key: -.size aes_hw_set_decrypt_key,.-aes_hw_set_decrypt_key -.globl aes_hw_set_encrypt_key -.hidden aes_hw_set_encrypt_key -.type aes_hw_set_encrypt_key,@function -.align 16 -aes_hw_set_encrypt_key: -__aesni_set_encrypt_key: -.cfi_startproc -#ifdef BORINGSSL_DISPATCH_TEST - movb $1,BORINGSSL_function_hit+3(%rip) -#endif -.byte 0x48,0x83,0xEC,0x08 -.cfi_adjust_cfa_offset 8 - movq $-1,%rax - testq %rdi,%rdi - jz .Lenc_key_ret - testq %rdx,%rdx - jz .Lenc_key_ret - - movups (%rdi),%xmm0 - xorps %xmm4,%xmm4 - leaq OPENSSL_ia32cap_P(%rip),%r10 - movl 4(%r10),%r10d - andl $268437504,%r10d - leaq 16(%rdx),%rax - cmpl $256,%esi - je .L14rounds - cmpl $192,%esi - je .L12rounds - cmpl $128,%esi - jne .Lbad_keybits - -.L10rounds: - movl $9,%esi - cmpl $268435456,%r10d - je .L10rounds_alt - - movups %xmm0,(%rdx) -.byte 102,15,58,223,200,1 - call .Lkey_expansion_128_cold -.byte 102,15,58,223,200,2 - call .Lkey_expansion_128 -.byte 102,15,58,223,200,4 - call .Lkey_expansion_128 -.byte 102,15,58,223,200,8 - call .Lkey_expansion_128 -.byte 102,15,58,223,200,16 - call .Lkey_expansion_128 -.byte 102,15,58,223,200,32 - call .Lkey_expansion_128 -.byte 102,15,58,223,200,64 - call .Lkey_expansion_128 -.byte 102,15,58,223,200,128 - call .Lkey_expansion_128 -.byte 102,15,58,223,200,27 - call .Lkey_expansion_128 -.byte 102,15,58,223,200,54 - call .Lkey_expansion_128 - movups %xmm0,(%rax) - movl %esi,80(%rax) - xorl %eax,%eax - jmp .Lenc_key_ret - -.align 16 -.L10rounds_alt: - movdqa .Lkey_rotate(%rip),%xmm5 - movl $8,%r10d - movdqa .Lkey_rcon1(%rip),%xmm4 - movdqa %xmm0,%xmm2 - movdqu %xmm0,(%rdx) - jmp .Loop_key128 - -.align 16 -.Loop_key128: -.byte 102,15,56,0,197 -.byte 102,15,56,221,196 - pslld $1,%xmm4 - leaq 16(%rax),%rax - - movdqa %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm3,%xmm2 - - pxor %xmm2,%xmm0 - movdqu %xmm0,-16(%rax) - movdqa %xmm0,%xmm2 - - decl 
%r10d - jnz .Loop_key128 - - movdqa .Lkey_rcon1b(%rip),%xmm4 - -.byte 102,15,56,0,197 -.byte 102,15,56,221,196 - pslld $1,%xmm4 - - movdqa %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm3,%xmm2 - - pxor %xmm2,%xmm0 - movdqu %xmm0,(%rax) - - movdqa %xmm0,%xmm2 -.byte 102,15,56,0,197 -.byte 102,15,56,221,196 - - movdqa %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm2,%xmm3 - pslldq $4,%xmm2 - pxor %xmm3,%xmm2 - - pxor %xmm2,%xmm0 - movdqu %xmm0,16(%rax) - - movl %esi,96(%rax) - xorl %eax,%eax - jmp .Lenc_key_ret - -.align 16 -.L12rounds: - movq 16(%rdi),%xmm2 - movl $11,%esi - cmpl $268435456,%r10d - je .L12rounds_alt - - movups %xmm0,(%rdx) -.byte 102,15,58,223,202,1 - call .Lkey_expansion_192a_cold -.byte 102,15,58,223,202,2 - call .Lkey_expansion_192b -.byte 102,15,58,223,202,4 - call .Lkey_expansion_192a -.byte 102,15,58,223,202,8 - call .Lkey_expansion_192b -.byte 102,15,58,223,202,16 - call .Lkey_expansion_192a -.byte 102,15,58,223,202,32 - call .Lkey_expansion_192b -.byte 102,15,58,223,202,64 - call .Lkey_expansion_192a -.byte 102,15,58,223,202,128 - call .Lkey_expansion_192b - movups %xmm0,(%rax) - movl %esi,48(%rax) - xorq %rax,%rax - jmp .Lenc_key_ret - -.align 16 -.L12rounds_alt: - movdqa .Lkey_rotate192(%rip),%xmm5 - movdqa .Lkey_rcon1(%rip),%xmm4 - movl $8,%r10d - movdqu %xmm0,(%rdx) - jmp .Loop_key192 - -.align 16 -.Loop_key192: - movq %xmm2,0(%rax) - movdqa %xmm2,%xmm1 -.byte 102,15,56,0,213 -.byte 102,15,56,221,212 - pslld $1,%xmm4 - leaq 24(%rax),%rax - - movdqa %xmm0,%xmm3 - pslldq $4,%xmm0 - pxor %xmm0,%xmm3 - pslldq $4,%xmm0 - pxor %xmm0,%xmm3 - pslldq $4,%xmm0 - pxor %xmm3,%xmm0 - - pshufd $0xff,%xmm0,%xmm3 - pxor %xmm1,%xmm3 - pslldq $4,%xmm1 - pxor %xmm1,%xmm3 - - pxor %xmm2,%xmm0 - pxor %xmm3,%xmm2 - movdqu %xmm0,-16(%rax) - - decl %r10d - jnz .Loop_key192 - - movl %esi,32(%rax) - xorl %eax,%eax - jmp .Lenc_key_ret - -.align 16 -.L14rounds: - 
movups 16(%rdi),%xmm2 - movl $13,%esi - leaq 16(%rax),%rax - cmpl $268435456,%r10d - je .L14rounds_alt - - movups %xmm0,(%rdx) - movups %xmm2,16(%rdx) -.byte 102,15,58,223,202,1 - call .Lkey_expansion_256a_cold -.byte 102,15,58,223,200,1 - call .Lkey_expansion_256b -.byte 102,15,58,223,202,2 - call .Lkey_expansion_256a -.byte 102,15,58,223,200,2 - call .Lkey_expansion_256b -.byte 102,15,58,223,202,4 - call .Lkey_expansion_256a -.byte 102,15,58,223,200,4 - call .Lkey_expansion_256b -.byte 102,15,58,223,202,8 - call .Lkey_expansion_256a -.byte 102,15,58,223,200,8 - call .Lkey_expansion_256b -.byte 102,15,58,223,202,16 - call .Lkey_expansion_256a -.byte 102,15,58,223,200,16 - call .Lkey_expansion_256b -.byte 102,15,58,223,202,32 - call .Lkey_expansion_256a -.byte 102,15,58,223,200,32 - call .Lkey_expansion_256b -.byte 102,15,58,223,202,64 - call .Lkey_expansion_256a - movups %xmm0,(%rax) - movl %esi,16(%rax) - xorq %rax,%rax - jmp .Lenc_key_ret - -.align 16 -.L14rounds_alt: - movdqa .Lkey_rotate(%rip),%xmm5 - movdqa .Lkey_rcon1(%rip),%xmm4 - movl $7,%r10d - movdqu %xmm0,0(%rdx) - movdqa %xmm2,%xmm1 - movdqu %xmm2,16(%rdx) - jmp .Loop_key256 - -.align 16 -.Loop_key256: -.byte 102,15,56,0,213 -.byte 102,15,56,221,212 - - movdqa %xmm0,%xmm3 - pslldq $4,%xmm0 - pxor %xmm0,%xmm3 - pslldq $4,%xmm0 - pxor %xmm0,%xmm3 - pslldq $4,%xmm0 - pxor %xmm3,%xmm0 - pslld $1,%xmm4 - - pxor %xmm2,%xmm0 - movdqu %xmm0,(%rax) - - decl %r10d - jz .Ldone_key256 - - pshufd $0xff,%xmm0,%xmm2 - pxor %xmm3,%xmm3 -.byte 102,15,56,221,211 - - movdqa %xmm1,%xmm3 - pslldq $4,%xmm1 - pxor %xmm1,%xmm3 - pslldq $4,%xmm1 - pxor %xmm1,%xmm3 - pslldq $4,%xmm1 - pxor %xmm3,%xmm1 - - pxor %xmm1,%xmm2 - movdqu %xmm2,16(%rax) - leaq 32(%rax),%rax - movdqa %xmm2,%xmm1 - - jmp .Loop_key256 - -.Ldone_key256: - movl %esi,16(%rax) - xorl %eax,%eax - jmp .Lenc_key_ret - -.align 16 -.Lbad_keybits: - movq $-2,%rax -.Lenc_key_ret: - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor 
%xmm4,%xmm4 - pxor %xmm5,%xmm5 - addq $8,%rsp -.cfi_adjust_cfa_offset -8 - .byte 0xf3,0xc3 -.cfi_endproc -.LSEH_end_set_encrypt_key: - -.align 16 -.Lkey_expansion_128: - movups %xmm0,(%rax) - leaq 16(%rax),%rax -.Lkey_expansion_128_cold: - shufps $16,%xmm0,%xmm4 - xorps %xmm4,%xmm0 - shufps $140,%xmm0,%xmm4 - xorps %xmm4,%xmm0 - shufps $255,%xmm1,%xmm1 - xorps %xmm1,%xmm0 - .byte 0xf3,0xc3 - -.align 16 -.Lkey_expansion_192a: - movups %xmm0,(%rax) - leaq 16(%rax),%rax -.Lkey_expansion_192a_cold: - movaps %xmm2,%xmm5 -.Lkey_expansion_192b_warm: - shufps $16,%xmm0,%xmm4 - movdqa %xmm2,%xmm3 - xorps %xmm4,%xmm0 - shufps $140,%xmm0,%xmm4 - pslldq $4,%xmm3 - xorps %xmm4,%xmm0 - pshufd $85,%xmm1,%xmm1 - pxor %xmm3,%xmm2 - pxor %xmm1,%xmm0 - pshufd $255,%xmm0,%xmm3 - pxor %xmm3,%xmm2 - .byte 0xf3,0xc3 - -.align 16 -.Lkey_expansion_192b: - movaps %xmm0,%xmm3 - shufps $68,%xmm0,%xmm5 - movups %xmm5,(%rax) - shufps $78,%xmm2,%xmm3 - movups %xmm3,16(%rax) - leaq 32(%rax),%rax - jmp .Lkey_expansion_192b_warm - -.align 16 -.Lkey_expansion_256a: - movups %xmm2,(%rax) - leaq 16(%rax),%rax -.Lkey_expansion_256a_cold: - shufps $16,%xmm0,%xmm4 - xorps %xmm4,%xmm0 - shufps $140,%xmm0,%xmm4 - xorps %xmm4,%xmm0 - shufps $255,%xmm1,%xmm1 - xorps %xmm1,%xmm0 - .byte 0xf3,0xc3 - -.align 16 -.Lkey_expansion_256b: - movups %xmm0,(%rax) - leaq 16(%rax),%rax - - shufps $16,%xmm2,%xmm4 - xorps %xmm4,%xmm2 - shufps $140,%xmm2,%xmm4 - xorps %xmm4,%xmm2 - shufps $170,%xmm1,%xmm1 - xorps %xmm1,%xmm2 - .byte 0xf3,0xc3 -.size aes_hw_set_encrypt_key,.-aes_hw_set_encrypt_key -.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key -.align 64 -.Lbswap_mask: -.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 -.Lincrement32: -.long 6,6,6,0 -.Lincrement64: -.long 1,0,0,0 -.Lxts_magic: -.long 0x87,0,1,0 -.Lincrement1: -.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1 -.Lkey_rotate: -.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d -.Lkey_rotate192: -.long 0x04070605,0x04070605,0x04070605,0x04070605 -.Lkey_rcon1: -.long 
1,1,1,1 -.Lkey_rcon1b: -.long 0x1b,0x1b,0x1b,0x1b - -.byte 65,69,83,32,102,111,114,32,73,110,116,101,108,32,65,69,83,45,78,73,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 64 -#endif -.section .note.GNU-stack,"",@progbits diff --git a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S deleted file mode 100644 index b5fbdc81..00000000 --- a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S +++ /dev/null @@ -1,427 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text - - - - - -.type gcm_gmult_ssse3, @function -.globl gcm_gmult_ssse3 -.hidden gcm_gmult_ssse3 -.align 16 -gcm_gmult_ssse3: -.cfi_startproc -.Lgmult_seh_begin: - movdqu (%rdi),%xmm0 - movdqa .Lreverse_bytes(%rip),%xmm10 - movdqa .Llow4_mask(%rip),%xmm2 - - -.byte 102,65,15,56,0,194 - - - movdqa %xmm2,%xmm1 - pandn %xmm0,%xmm1 - psrld $4,%xmm1 - pand %xmm2,%xmm0 - - - - - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - movq $5,%rax -.Loop_row_1: - movdqa (%rsi),%xmm4 - leaq 16(%rsi),%rsi - - - movdqa %xmm2,%xmm6 -.byte 102,15,58,15,243,1 - movdqa %xmm6,%xmm3 - psrldq $1,%xmm2 - - - - - movdqa %xmm4,%xmm5 -.byte 102,15,56,0,224 -.byte 102,15,56,0,233 - - - pxor %xmm5,%xmm2 - - - - movdqa %xmm4,%xmm5 - psllq $60,%xmm5 - movdqa %xmm5,%xmm6 - pslldq $8,%xmm6 - pxor %xmm6,%xmm3 - - - psrldq $8,%xmm5 - pxor %xmm5,%xmm2 - psrlq $4,%xmm4 - pxor %xmm4,%xmm2 - - subq $1,%rax - jnz .Loop_row_1 - - - - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $5,%xmm3 - pxor 
%xmm3,%xmm2 - pxor %xmm3,%xmm3 - movq $5,%rax -.Loop_row_2: - movdqa (%rsi),%xmm4 - leaq 16(%rsi),%rsi - - - movdqa %xmm2,%xmm6 -.byte 102,15,58,15,243,1 - movdqa %xmm6,%xmm3 - psrldq $1,%xmm2 - - - - - movdqa %xmm4,%xmm5 -.byte 102,15,56,0,224 -.byte 102,15,56,0,233 - - - pxor %xmm5,%xmm2 - - - - movdqa %xmm4,%xmm5 - psllq $60,%xmm5 - movdqa %xmm5,%xmm6 - pslldq $8,%xmm6 - pxor %xmm6,%xmm3 - - - psrldq $8,%xmm5 - pxor %xmm5,%xmm2 - psrlq $4,%xmm4 - pxor %xmm4,%xmm2 - - subq $1,%rax - jnz .Loop_row_2 - - - - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $5,%xmm3 - pxor %xmm3,%xmm2 - pxor %xmm3,%xmm3 - movq $6,%rax -.Loop_row_3: - movdqa (%rsi),%xmm4 - leaq 16(%rsi),%rsi - - - movdqa %xmm2,%xmm6 -.byte 102,15,58,15,243,1 - movdqa %xmm6,%xmm3 - psrldq $1,%xmm2 - - - - - movdqa %xmm4,%xmm5 -.byte 102,15,56,0,224 -.byte 102,15,56,0,233 - - - pxor %xmm5,%xmm2 - - - - movdqa %xmm4,%xmm5 - psllq $60,%xmm5 - movdqa %xmm5,%xmm6 - pslldq $8,%xmm6 - pxor %xmm6,%xmm3 - - - psrldq $8,%xmm5 - pxor %xmm5,%xmm2 - psrlq $4,%xmm4 - pxor %xmm4,%xmm2 - - subq $1,%rax - jnz .Loop_row_3 - - - - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $5,%xmm3 - pxor %xmm3,%xmm2 - pxor %xmm3,%xmm3 - -.byte 102,65,15,56,0,210 - movdqu %xmm2,(%rdi) - - - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - .byte 0xf3,0xc3 -.Lgmult_seh_end: -.cfi_endproc -.size gcm_gmult_ssse3,.-gcm_gmult_ssse3 - - - - - -.type gcm_ghash_ssse3, @function -.globl gcm_ghash_ssse3 -.hidden gcm_ghash_ssse3 -.align 16 -gcm_ghash_ssse3: -.Lghash_seh_begin: -.cfi_startproc - movdqu (%rdi),%xmm0 - movdqa .Lreverse_bytes(%rip),%xmm10 - movdqa .Llow4_mask(%rip),%xmm11 - - - andq $-16,%rcx - - - -.byte 102,65,15,56,0,194 - - - pxor %xmm3,%xmm3 -.Loop_ghash: - - movdqu (%rdx),%xmm1 -.byte 102,65,15,56,0,202 - pxor %xmm1,%xmm0 - - - movdqa %xmm11,%xmm1 
- pandn %xmm0,%xmm1 - psrld $4,%xmm1 - pand %xmm11,%xmm0 - - - - - pxor %xmm2,%xmm2 - - movq $5,%rax -.Loop_row_4: - movdqa (%rsi),%xmm4 - leaq 16(%rsi),%rsi - - - movdqa %xmm2,%xmm6 -.byte 102,15,58,15,243,1 - movdqa %xmm6,%xmm3 - psrldq $1,%xmm2 - - - - - movdqa %xmm4,%xmm5 -.byte 102,15,56,0,224 -.byte 102,15,56,0,233 - - - pxor %xmm5,%xmm2 - - - - movdqa %xmm4,%xmm5 - psllq $60,%xmm5 - movdqa %xmm5,%xmm6 - pslldq $8,%xmm6 - pxor %xmm6,%xmm3 - - - psrldq $8,%xmm5 - pxor %xmm5,%xmm2 - psrlq $4,%xmm4 - pxor %xmm4,%xmm2 - - subq $1,%rax - jnz .Loop_row_4 - - - - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $5,%xmm3 - pxor %xmm3,%xmm2 - pxor %xmm3,%xmm3 - movq $5,%rax -.Loop_row_5: - movdqa (%rsi),%xmm4 - leaq 16(%rsi),%rsi - - - movdqa %xmm2,%xmm6 -.byte 102,15,58,15,243,1 - movdqa %xmm6,%xmm3 - psrldq $1,%xmm2 - - - - - movdqa %xmm4,%xmm5 -.byte 102,15,56,0,224 -.byte 102,15,56,0,233 - - - pxor %xmm5,%xmm2 - - - - movdqa %xmm4,%xmm5 - psllq $60,%xmm5 - movdqa %xmm5,%xmm6 - pslldq $8,%xmm6 - pxor %xmm6,%xmm3 - - - psrldq $8,%xmm5 - pxor %xmm5,%xmm2 - psrlq $4,%xmm4 - pxor %xmm4,%xmm2 - - subq $1,%rax - jnz .Loop_row_5 - - - - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $5,%xmm3 - pxor %xmm3,%xmm2 - pxor %xmm3,%xmm3 - movq $6,%rax -.Loop_row_6: - movdqa (%rsi),%xmm4 - leaq 16(%rsi),%rsi - - - movdqa %xmm2,%xmm6 -.byte 102,15,58,15,243,1 - movdqa %xmm6,%xmm3 - psrldq $1,%xmm2 - - - - - movdqa %xmm4,%xmm5 -.byte 102,15,56,0,224 -.byte 102,15,56,0,233 - - - pxor %xmm5,%xmm2 - - - - movdqa %xmm4,%xmm5 - psllq $60,%xmm5 - movdqa %xmm5,%xmm6 - pslldq $8,%xmm6 - pxor %xmm6,%xmm3 - - - psrldq $8,%xmm5 - pxor %xmm5,%xmm2 - psrlq $4,%xmm4 - pxor %xmm4,%xmm2 - - subq $1,%rax - jnz .Loop_row_6 - - - - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $1,%xmm3 - pxor %xmm3,%xmm2 - psrlq $5,%xmm3 - pxor %xmm3,%xmm2 - pxor %xmm3,%xmm3 - movdqa %xmm2,%xmm0 - - - leaq 
-256(%rsi),%rsi - - - leaq 16(%rdx),%rdx - subq $16,%rcx - jnz .Loop_ghash - - -.byte 102,65,15,56,0,194 - movdqu %xmm0,(%rdi) - - - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - .byte 0xf3,0xc3 -.Lghash_seh_end: -.cfi_endproc -.size gcm_ghash_ssse3,.-gcm_ghash_ssse3 - -.align 16 - - -.Lreverse_bytes: -.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 - -.Llow4_mask: -.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f -#endif -.section .note.GNU-stack,"",@progbits diff --git a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/ghash-x86_64.S b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/ghash-x86_64.S deleted file mode 100644 index 91cea671..00000000 --- a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/ghash-x86_64.S +++ /dev/null @@ -1,1127 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text -.extern OPENSSL_ia32cap_P -.hidden OPENSSL_ia32cap_P -.globl gcm_init_clmul -.hidden gcm_init_clmul -.type gcm_init_clmul,@function -.align 16 -gcm_init_clmul: -.cfi_startproc -.L_init_clmul: - movdqu (%rsi),%xmm2 - pshufd $78,%xmm2,%xmm2 - - - pshufd $255,%xmm2,%xmm4 - movdqa %xmm2,%xmm3 - psllq $1,%xmm2 - pxor %xmm5,%xmm5 - psrlq $63,%xmm3 - pcmpgtd %xmm4,%xmm5 - pslldq $8,%xmm3 - por %xmm3,%xmm2 - - - pand .L0x1c2_polynomial(%rip),%xmm5 - pxor %xmm5,%xmm2 - - - pshufd $78,%xmm2,%xmm6 - movdqa %xmm2,%xmm0 - pxor %xmm2,%xmm6 - movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 - pxor %xmm0,%xmm3 -.byte 102,15,58,68,194,0 -.byte 102,15,58,68,202,17 -.byte 102,15,58,68,222,0 - pxor %xmm0,%xmm3 - pxor %xmm1,%xmm3 - - movdqa %xmm3,%xmm4 - psrldq $8,%xmm3 - pslldq 
$8,%xmm4 - pxor %xmm3,%xmm1 - pxor %xmm4,%xmm0 - - movdqa %xmm0,%xmm4 - movdqa %xmm0,%xmm3 - psllq $5,%xmm0 - pxor %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 - psllq $57,%xmm0 - movdqa %xmm0,%xmm3 - pslldq $8,%xmm0 - psrldq $8,%xmm3 - pxor %xmm4,%xmm0 - pxor %xmm3,%xmm1 - - - movdqa %xmm0,%xmm4 - psrlq $1,%xmm0 - pxor %xmm4,%xmm1 - pxor %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 - psrlq $1,%xmm0 - pxor %xmm1,%xmm0 - pshufd $78,%xmm2,%xmm3 - pshufd $78,%xmm0,%xmm4 - pxor %xmm2,%xmm3 - movdqu %xmm2,0(%rdi) - pxor %xmm0,%xmm4 - movdqu %xmm0,16(%rdi) -.byte 102,15,58,15,227,8 - movdqu %xmm4,32(%rdi) - movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 - pxor %xmm0,%xmm3 -.byte 102,15,58,68,194,0 -.byte 102,15,58,68,202,17 -.byte 102,15,58,68,222,0 - pxor %xmm0,%xmm3 - pxor %xmm1,%xmm3 - - movdqa %xmm3,%xmm4 - psrldq $8,%xmm3 - pslldq $8,%xmm4 - pxor %xmm3,%xmm1 - pxor %xmm4,%xmm0 - - movdqa %xmm0,%xmm4 - movdqa %xmm0,%xmm3 - psllq $5,%xmm0 - pxor %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 - psllq $57,%xmm0 - movdqa %xmm0,%xmm3 - pslldq $8,%xmm0 - psrldq $8,%xmm3 - pxor %xmm4,%xmm0 - pxor %xmm3,%xmm1 - - - movdqa %xmm0,%xmm4 - psrlq $1,%xmm0 - pxor %xmm4,%xmm1 - pxor %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 - psrlq $1,%xmm0 - pxor %xmm1,%xmm0 - movdqa %xmm0,%xmm5 - movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 - pxor %xmm0,%xmm3 -.byte 102,15,58,68,194,0 -.byte 102,15,58,68,202,17 -.byte 102,15,58,68,222,0 - pxor %xmm0,%xmm3 - pxor %xmm1,%xmm3 - - movdqa %xmm3,%xmm4 - psrldq $8,%xmm3 - pslldq $8,%xmm4 - pxor %xmm3,%xmm1 - pxor %xmm4,%xmm0 - - movdqa %xmm0,%xmm4 - movdqa %xmm0,%xmm3 - psllq $5,%xmm0 - pxor %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 - psllq $57,%xmm0 - movdqa %xmm0,%xmm3 - pslldq $8,%xmm0 - psrldq $8,%xmm3 - pxor %xmm4,%xmm0 - pxor %xmm3,%xmm1 - - - movdqa %xmm0,%xmm4 - psrlq $1,%xmm0 - pxor %xmm4,%xmm1 - pxor %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 - psrlq $1,%xmm0 - pxor %xmm1,%xmm0 - pshufd $78,%xmm5,%xmm3 - pshufd 
$78,%xmm0,%xmm4 - pxor %xmm5,%xmm3 - movdqu %xmm5,48(%rdi) - pxor %xmm0,%xmm4 - movdqu %xmm0,64(%rdi) -.byte 102,15,58,15,227,8 - movdqu %xmm4,80(%rdi) - .byte 0xf3,0xc3 -.cfi_endproc -.size gcm_init_clmul,.-gcm_init_clmul -.globl gcm_gmult_clmul -.hidden gcm_gmult_clmul -.type gcm_gmult_clmul,@function -.align 16 -gcm_gmult_clmul: -.cfi_startproc -.L_gmult_clmul: - movdqu (%rdi),%xmm0 - movdqa .Lbswap_mask(%rip),%xmm5 - movdqu (%rsi),%xmm2 - movdqu 32(%rsi),%xmm4 -.byte 102,15,56,0,197 - movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 - pxor %xmm0,%xmm3 -.byte 102,15,58,68,194,0 -.byte 102,15,58,68,202,17 -.byte 102,15,58,68,220,0 - pxor %xmm0,%xmm3 - pxor %xmm1,%xmm3 - - movdqa %xmm3,%xmm4 - psrldq $8,%xmm3 - pslldq $8,%xmm4 - pxor %xmm3,%xmm1 - pxor %xmm4,%xmm0 - - movdqa %xmm0,%xmm4 - movdqa %xmm0,%xmm3 - psllq $5,%xmm0 - pxor %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 - psllq $57,%xmm0 - movdqa %xmm0,%xmm3 - pslldq $8,%xmm0 - psrldq $8,%xmm3 - pxor %xmm4,%xmm0 - pxor %xmm3,%xmm1 - - - movdqa %xmm0,%xmm4 - psrlq $1,%xmm0 - pxor %xmm4,%xmm1 - pxor %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 - psrlq $1,%xmm0 - pxor %xmm1,%xmm0 -.byte 102,15,56,0,197 - movdqu %xmm0,(%rdi) - .byte 0xf3,0xc3 -.cfi_endproc -.size gcm_gmult_clmul,.-gcm_gmult_clmul -.globl gcm_ghash_clmul -.hidden gcm_ghash_clmul -.type gcm_ghash_clmul,@function -.align 32 -gcm_ghash_clmul: -.cfi_startproc -.L_ghash_clmul: - movdqa .Lbswap_mask(%rip),%xmm10 - - movdqu (%rdi),%xmm0 - movdqu (%rsi),%xmm2 - movdqu 32(%rsi),%xmm7 -.byte 102,65,15,56,0,194 - - subq $0x10,%rcx - jz .Lodd_tail - - movdqu 16(%rsi),%xmm6 - leaq OPENSSL_ia32cap_P(%rip),%rax - movl 4(%rax),%eax - cmpq $0x30,%rcx - jb .Lskip4x - - andl $71303168,%eax - cmpl $4194304,%eax - je .Lskip4x - - subq $0x30,%rcx - movq $0xA040608020C0E000,%rax - movdqu 48(%rsi),%xmm14 - movdqu 64(%rsi),%xmm15 - - - - - movdqu 48(%rdx),%xmm3 - movdqu 32(%rdx),%xmm11 -.byte 102,65,15,56,0,218 -.byte 102,69,15,56,0,218 - movdqa %xmm3,%xmm5 - pshufd 
$78,%xmm3,%xmm4 - pxor %xmm3,%xmm4 -.byte 102,15,58,68,218,0 -.byte 102,15,58,68,234,17 -.byte 102,15,58,68,231,0 - - movdqa %xmm11,%xmm13 - pshufd $78,%xmm11,%xmm12 - pxor %xmm11,%xmm12 -.byte 102,68,15,58,68,222,0 -.byte 102,68,15,58,68,238,17 -.byte 102,68,15,58,68,231,16 - xorps %xmm11,%xmm3 - xorps %xmm13,%xmm5 - movups 80(%rsi),%xmm7 - xorps %xmm12,%xmm4 - - movdqu 16(%rdx),%xmm11 - movdqu 0(%rdx),%xmm8 -.byte 102,69,15,56,0,218 -.byte 102,69,15,56,0,194 - movdqa %xmm11,%xmm13 - pshufd $78,%xmm11,%xmm12 - pxor %xmm8,%xmm0 - pxor %xmm11,%xmm12 -.byte 102,69,15,58,68,222,0 - movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm8 - pxor %xmm0,%xmm8 -.byte 102,69,15,58,68,238,17 -.byte 102,68,15,58,68,231,0 - xorps %xmm11,%xmm3 - xorps %xmm13,%xmm5 - - leaq 64(%rdx),%rdx - subq $0x40,%rcx - jc .Ltail4x - - jmp .Lmod4_loop -.align 32 -.Lmod4_loop: -.byte 102,65,15,58,68,199,0 - xorps %xmm12,%xmm4 - movdqu 48(%rdx),%xmm11 -.byte 102,69,15,56,0,218 -.byte 102,65,15,58,68,207,17 - xorps %xmm3,%xmm0 - movdqu 32(%rdx),%xmm3 - movdqa %xmm11,%xmm13 -.byte 102,68,15,58,68,199,16 - pshufd $78,%xmm11,%xmm12 - xorps %xmm5,%xmm1 - pxor %xmm11,%xmm12 -.byte 102,65,15,56,0,218 - movups 32(%rsi),%xmm7 - xorps %xmm4,%xmm8 -.byte 102,68,15,58,68,218,0 - pshufd $78,%xmm3,%xmm4 - - pxor %xmm0,%xmm8 - movdqa %xmm3,%xmm5 - pxor %xmm1,%xmm8 - pxor %xmm3,%xmm4 - movdqa %xmm8,%xmm9 -.byte 102,68,15,58,68,234,17 - pslldq $8,%xmm8 - psrldq $8,%xmm9 - pxor %xmm8,%xmm0 - movdqa .L7_mask(%rip),%xmm8 - pxor %xmm9,%xmm1 -.byte 102,76,15,110,200 - - pand %xmm0,%xmm8 -.byte 102,69,15,56,0,200 - pxor %xmm0,%xmm9 -.byte 102,68,15,58,68,231,0 - psllq $57,%xmm9 - movdqa %xmm9,%xmm8 - pslldq $8,%xmm9 -.byte 102,15,58,68,222,0 - psrldq $8,%xmm8 - pxor %xmm9,%xmm0 - pxor %xmm8,%xmm1 - movdqu 0(%rdx),%xmm8 - - movdqa %xmm0,%xmm9 - psrlq $1,%xmm0 -.byte 102,15,58,68,238,17 - xorps %xmm11,%xmm3 - movdqu 16(%rdx),%xmm11 -.byte 102,69,15,56,0,218 -.byte 102,15,58,68,231,16 - xorps %xmm13,%xmm5 - movups 80(%rsi),%xmm7 
-.byte 102,69,15,56,0,194 - pxor %xmm9,%xmm1 - pxor %xmm0,%xmm9 - psrlq $5,%xmm0 - - movdqa %xmm11,%xmm13 - pxor %xmm12,%xmm4 - pshufd $78,%xmm11,%xmm12 - pxor %xmm9,%xmm0 - pxor %xmm8,%xmm1 - pxor %xmm11,%xmm12 -.byte 102,69,15,58,68,222,0 - psrlq $1,%xmm0 - pxor %xmm1,%xmm0 - movdqa %xmm0,%xmm1 -.byte 102,69,15,58,68,238,17 - xorps %xmm11,%xmm3 - pshufd $78,%xmm0,%xmm8 - pxor %xmm0,%xmm8 - -.byte 102,68,15,58,68,231,0 - xorps %xmm13,%xmm5 - - leaq 64(%rdx),%rdx - subq $0x40,%rcx - jnc .Lmod4_loop - -.Ltail4x: -.byte 102,65,15,58,68,199,0 -.byte 102,65,15,58,68,207,17 -.byte 102,68,15,58,68,199,16 - xorps %xmm12,%xmm4 - xorps %xmm3,%xmm0 - xorps %xmm5,%xmm1 - pxor %xmm0,%xmm1 - pxor %xmm4,%xmm8 - - pxor %xmm1,%xmm8 - pxor %xmm0,%xmm1 - - movdqa %xmm8,%xmm9 - psrldq $8,%xmm8 - pslldq $8,%xmm9 - pxor %xmm8,%xmm1 - pxor %xmm9,%xmm0 - - movdqa %xmm0,%xmm4 - movdqa %xmm0,%xmm3 - psllq $5,%xmm0 - pxor %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 - psllq $57,%xmm0 - movdqa %xmm0,%xmm3 - pslldq $8,%xmm0 - psrldq $8,%xmm3 - pxor %xmm4,%xmm0 - pxor %xmm3,%xmm1 - - - movdqa %xmm0,%xmm4 - psrlq $1,%xmm0 - pxor %xmm4,%xmm1 - pxor %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 - psrlq $1,%xmm0 - pxor %xmm1,%xmm0 - addq $0x40,%rcx - jz .Ldone - movdqu 32(%rsi),%xmm7 - subq $0x10,%rcx - jz .Lodd_tail -.Lskip4x: - - - - - - movdqu (%rdx),%xmm8 - movdqu 16(%rdx),%xmm3 -.byte 102,69,15,56,0,194 -.byte 102,65,15,56,0,218 - pxor %xmm8,%xmm0 - - movdqa %xmm3,%xmm5 - pshufd $78,%xmm3,%xmm4 - pxor %xmm3,%xmm4 -.byte 102,15,58,68,218,0 -.byte 102,15,58,68,234,17 -.byte 102,15,58,68,231,0 - - leaq 32(%rdx),%rdx - nop - subq $0x20,%rcx - jbe .Leven_tail - nop - jmp .Lmod_loop - -.align 32 -.Lmod_loop: - movdqa %xmm0,%xmm1 - movdqa %xmm4,%xmm8 - pshufd $78,%xmm0,%xmm4 - pxor %xmm0,%xmm4 - -.byte 102,15,58,68,198,0 -.byte 102,15,58,68,206,17 -.byte 102,15,58,68,231,16 - - pxor %xmm3,%xmm0 - pxor %xmm5,%xmm1 - movdqu (%rdx),%xmm9 - pxor %xmm0,%xmm8 -.byte 102,69,15,56,0,202 - movdqu 
16(%rdx),%xmm3 - - pxor %xmm1,%xmm8 - pxor %xmm9,%xmm1 - pxor %xmm8,%xmm4 -.byte 102,65,15,56,0,218 - movdqa %xmm4,%xmm8 - psrldq $8,%xmm8 - pslldq $8,%xmm4 - pxor %xmm8,%xmm1 - pxor %xmm4,%xmm0 - - movdqa %xmm3,%xmm5 - - movdqa %xmm0,%xmm9 - movdqa %xmm0,%xmm8 - psllq $5,%xmm0 - pxor %xmm0,%xmm8 -.byte 102,15,58,68,218,0 - psllq $1,%xmm0 - pxor %xmm8,%xmm0 - psllq $57,%xmm0 - movdqa %xmm0,%xmm8 - pslldq $8,%xmm0 - psrldq $8,%xmm8 - pxor %xmm9,%xmm0 - pshufd $78,%xmm5,%xmm4 - pxor %xmm8,%xmm1 - pxor %xmm5,%xmm4 - - movdqa %xmm0,%xmm9 - psrlq $1,%xmm0 -.byte 102,15,58,68,234,17 - pxor %xmm9,%xmm1 - pxor %xmm0,%xmm9 - psrlq $5,%xmm0 - pxor %xmm9,%xmm0 - leaq 32(%rdx),%rdx - psrlq $1,%xmm0 -.byte 102,15,58,68,231,0 - pxor %xmm1,%xmm0 - - subq $0x20,%rcx - ja .Lmod_loop - -.Leven_tail: - movdqa %xmm0,%xmm1 - movdqa %xmm4,%xmm8 - pshufd $78,%xmm0,%xmm4 - pxor %xmm0,%xmm4 - -.byte 102,15,58,68,198,0 -.byte 102,15,58,68,206,17 -.byte 102,15,58,68,231,16 - - pxor %xmm3,%xmm0 - pxor %xmm5,%xmm1 - pxor %xmm0,%xmm8 - pxor %xmm1,%xmm8 - pxor %xmm8,%xmm4 - movdqa %xmm4,%xmm8 - psrldq $8,%xmm8 - pslldq $8,%xmm4 - pxor %xmm8,%xmm1 - pxor %xmm4,%xmm0 - - movdqa %xmm0,%xmm4 - movdqa %xmm0,%xmm3 - psllq $5,%xmm0 - pxor %xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 - psllq $57,%xmm0 - movdqa %xmm0,%xmm3 - pslldq $8,%xmm0 - psrldq $8,%xmm3 - pxor %xmm4,%xmm0 - pxor %xmm3,%xmm1 - - - movdqa %xmm0,%xmm4 - psrlq $1,%xmm0 - pxor %xmm4,%xmm1 - pxor %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 - psrlq $1,%xmm0 - pxor %xmm1,%xmm0 - testq %rcx,%rcx - jnz .Ldone - -.Lodd_tail: - movdqu (%rdx),%xmm8 -.byte 102,69,15,56,0,194 - pxor %xmm8,%xmm0 - movdqa %xmm0,%xmm1 - pshufd $78,%xmm0,%xmm3 - pxor %xmm0,%xmm3 -.byte 102,15,58,68,194,0 -.byte 102,15,58,68,202,17 -.byte 102,15,58,68,223,0 - pxor %xmm0,%xmm3 - pxor %xmm1,%xmm3 - - movdqa %xmm3,%xmm4 - psrldq $8,%xmm3 - pslldq $8,%xmm4 - pxor %xmm3,%xmm1 - pxor %xmm4,%xmm0 - - movdqa %xmm0,%xmm4 - movdqa %xmm0,%xmm3 - psllq $5,%xmm0 - pxor 
%xmm0,%xmm3 - psllq $1,%xmm0 - pxor %xmm3,%xmm0 - psllq $57,%xmm0 - movdqa %xmm0,%xmm3 - pslldq $8,%xmm0 - psrldq $8,%xmm3 - pxor %xmm4,%xmm0 - pxor %xmm3,%xmm1 - - - movdqa %xmm0,%xmm4 - psrlq $1,%xmm0 - pxor %xmm4,%xmm1 - pxor %xmm0,%xmm4 - psrlq $5,%xmm0 - pxor %xmm4,%xmm0 - psrlq $1,%xmm0 - pxor %xmm1,%xmm0 -.Ldone: -.byte 102,65,15,56,0,194 - movdqu %xmm0,(%rdi) - .byte 0xf3,0xc3 -.cfi_endproc -.size gcm_ghash_clmul,.-gcm_ghash_clmul -.globl gcm_init_avx -.hidden gcm_init_avx -.type gcm_init_avx,@function -.align 32 -gcm_init_avx: -.cfi_startproc - vzeroupper - - vmovdqu (%rsi),%xmm2 - vpshufd $78,%xmm2,%xmm2 - - - vpshufd $255,%xmm2,%xmm4 - vpsrlq $63,%xmm2,%xmm3 - vpsllq $1,%xmm2,%xmm2 - vpxor %xmm5,%xmm5,%xmm5 - vpcmpgtd %xmm4,%xmm5,%xmm5 - vpslldq $8,%xmm3,%xmm3 - vpor %xmm3,%xmm2,%xmm2 - - - vpand .L0x1c2_polynomial(%rip),%xmm5,%xmm5 - vpxor %xmm5,%xmm2,%xmm2 - - vpunpckhqdq %xmm2,%xmm2,%xmm6 - vmovdqa %xmm2,%xmm0 - vpxor %xmm2,%xmm6,%xmm6 - movq $4,%r10 - jmp .Linit_start_avx -.align 32 -.Linit_loop_avx: - vpalignr $8,%xmm3,%xmm4,%xmm5 - vmovdqu %xmm5,-16(%rdi) - vpunpckhqdq %xmm0,%xmm0,%xmm3 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 - vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 - vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 - vpxor %xmm0,%xmm1,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - - vpslldq $8,%xmm3,%xmm4 - vpsrldq $8,%xmm3,%xmm3 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm3,%xmm1,%xmm1 - vpsllq $57,%xmm0,%xmm3 - vpsllq $62,%xmm0,%xmm4 - vpxor %xmm3,%xmm4,%xmm4 - vpsllq $63,%xmm0,%xmm3 - vpxor %xmm3,%xmm4,%xmm4 - vpslldq $8,%xmm4,%xmm3 - vpsrldq $8,%xmm4,%xmm4 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm4,%xmm1,%xmm1 - - vpsrlq $1,%xmm0,%xmm4 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpsrlq $5,%xmm4,%xmm4 - vpxor %xmm4,%xmm0,%xmm0 - vpsrlq $1,%xmm0,%xmm0 - vpxor %xmm1,%xmm0,%xmm0 -.Linit_start_avx: - vmovdqa %xmm0,%xmm5 - vpunpckhqdq %xmm0,%xmm0,%xmm3 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x11,%xmm2,%xmm0,%xmm1 - vpclmulqdq $0x00,%xmm2,%xmm0,%xmm0 - 
vpclmulqdq $0x00,%xmm6,%xmm3,%xmm3 - vpxor %xmm0,%xmm1,%xmm4 - vpxor %xmm4,%xmm3,%xmm3 - - vpslldq $8,%xmm3,%xmm4 - vpsrldq $8,%xmm3,%xmm3 - vpxor %xmm4,%xmm0,%xmm0 - vpxor %xmm3,%xmm1,%xmm1 - vpsllq $57,%xmm0,%xmm3 - vpsllq $62,%xmm0,%xmm4 - vpxor %xmm3,%xmm4,%xmm4 - vpsllq $63,%xmm0,%xmm3 - vpxor %xmm3,%xmm4,%xmm4 - vpslldq $8,%xmm4,%xmm3 - vpsrldq $8,%xmm4,%xmm4 - vpxor %xmm3,%xmm0,%xmm0 - vpxor %xmm4,%xmm1,%xmm1 - - vpsrlq $1,%xmm0,%xmm4 - vpxor %xmm0,%xmm1,%xmm1 - vpxor %xmm4,%xmm0,%xmm0 - vpsrlq $5,%xmm4,%xmm4 - vpxor %xmm4,%xmm0,%xmm0 - vpsrlq $1,%xmm0,%xmm0 - vpxor %xmm1,%xmm0,%xmm0 - vpshufd $78,%xmm5,%xmm3 - vpshufd $78,%xmm0,%xmm4 - vpxor %xmm5,%xmm3,%xmm3 - vmovdqu %xmm5,0(%rdi) - vpxor %xmm0,%xmm4,%xmm4 - vmovdqu %xmm0,16(%rdi) - leaq 48(%rdi),%rdi - subq $1,%r10 - jnz .Linit_loop_avx - - vpalignr $8,%xmm4,%xmm3,%xmm5 - vmovdqu %xmm5,-16(%rdi) - - vzeroupper - .byte 0xf3,0xc3 -.cfi_endproc -.size gcm_init_avx,.-gcm_init_avx -.globl gcm_gmult_avx -.hidden gcm_gmult_avx -.type gcm_gmult_avx,@function -.align 32 -gcm_gmult_avx: -.cfi_startproc - jmp .L_gmult_clmul -.cfi_endproc -.size gcm_gmult_avx,.-gcm_gmult_avx -.globl gcm_ghash_avx -.hidden gcm_ghash_avx -.type gcm_ghash_avx,@function -.align 32 -gcm_ghash_avx: -.cfi_startproc - vzeroupper - - vmovdqu (%rdi),%xmm10 - leaq .L0x1c2_polynomial(%rip),%r10 - leaq 64(%rsi),%rsi - vmovdqu .Lbswap_mask(%rip),%xmm13 - vpshufb %xmm13,%xmm10,%xmm10 - cmpq $0x80,%rcx - jb .Lshort_avx - subq $0x80,%rcx - - vmovdqu 112(%rdx),%xmm14 - vmovdqu 0-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm14 - vmovdqu 32-64(%rsi),%xmm7 - - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vmovdqu 96(%rdx),%xmm15 - vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 - vpxor %xmm14,%xmm9,%xmm9 - vpshufb %xmm13,%xmm15,%xmm15 - vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 - vmovdqu 16-64(%rsi),%xmm6 - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vmovdqu 80(%rdx),%xmm14 - vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 - vpxor %xmm15,%xmm8,%xmm8 - - vpshufb %xmm13,%xmm14,%xmm14 - vpclmulqdq 
$0x00,%xmm6,%xmm15,%xmm3 - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 - vmovdqu 48-64(%rsi),%xmm6 - vpxor %xmm14,%xmm9,%xmm9 - vmovdqu 64(%rdx),%xmm15 - vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 - vmovdqu 80-64(%rsi),%xmm7 - - vpshufb %xmm13,%xmm15,%xmm15 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 - vpxor %xmm1,%xmm4,%xmm4 - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 - vmovdqu 64-64(%rsi),%xmm6 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 - vpxor %xmm15,%xmm8,%xmm8 - - vmovdqu 48(%rdx),%xmm14 - vpxor %xmm3,%xmm0,%xmm0 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 - vpxor %xmm4,%xmm1,%xmm1 - vpshufb %xmm13,%xmm14,%xmm14 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 - vmovdqu 96-64(%rsi),%xmm6 - vpxor %xmm5,%xmm2,%xmm2 - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 - vmovdqu 128-64(%rsi),%xmm7 - vpxor %xmm14,%xmm9,%xmm9 - - vmovdqu 32(%rdx),%xmm15 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 - vpxor %xmm1,%xmm4,%xmm4 - vpshufb %xmm13,%xmm15,%xmm15 - vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 - vmovdqu 112-64(%rsi),%xmm6 - vpxor %xmm2,%xmm5,%xmm5 - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 - vpxor %xmm15,%xmm8,%xmm8 - - vmovdqu 16(%rdx),%xmm14 - vpxor %xmm3,%xmm0,%xmm0 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 - vpxor %xmm4,%xmm1,%xmm1 - vpshufb %xmm13,%xmm14,%xmm14 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 - vmovdqu 144-64(%rsi),%xmm6 - vpxor %xmm5,%xmm2,%xmm2 - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 - vmovdqu 176-64(%rsi),%xmm7 - vpxor %xmm14,%xmm9,%xmm9 - - vmovdqu (%rdx),%xmm15 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 - vpxor %xmm1,%xmm4,%xmm4 - vpshufb %xmm13,%xmm15,%xmm15 - vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 - vmovdqu 160-64(%rsi),%xmm6 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 - - leaq 128(%rdx),%rdx - cmpq $0x80,%rcx - jb .Ltail_avx - - vpxor %xmm10,%xmm15,%xmm15 
- subq $0x80,%rcx - jmp .Loop8x_avx - -.align 32 -.Loop8x_avx: - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vmovdqu 112(%rdx),%xmm14 - vpxor %xmm0,%xmm3,%xmm3 - vpxor %xmm15,%xmm8,%xmm8 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm10 - vpshufb %xmm13,%xmm14,%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm11 - vmovdqu 0-64(%rsi),%xmm6 - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0x00,%xmm7,%xmm8,%xmm12 - vmovdqu 32-64(%rsi),%xmm7 - vpxor %xmm14,%xmm9,%xmm9 - - vmovdqu 96(%rdx),%xmm15 - vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 - vpxor %xmm3,%xmm10,%xmm10 - vpshufb %xmm13,%xmm15,%xmm15 - vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 - vxorps %xmm4,%xmm11,%xmm11 - vmovdqu 16-64(%rsi),%xmm6 - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 - vpxor %xmm5,%xmm12,%xmm12 - vxorps %xmm15,%xmm8,%xmm8 - - vmovdqu 80(%rdx),%xmm14 - vpxor %xmm10,%xmm12,%xmm12 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 - vpxor %xmm11,%xmm12,%xmm12 - vpslldq $8,%xmm12,%xmm9 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 - vpsrldq $8,%xmm12,%xmm12 - vpxor %xmm9,%xmm10,%xmm10 - vmovdqu 48-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm14 - vxorps %xmm12,%xmm11,%xmm11 - vpxor %xmm1,%xmm4,%xmm4 - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 - vmovdqu 80-64(%rsi),%xmm7 - vpxor %xmm14,%xmm9,%xmm9 - vpxor %xmm2,%xmm5,%xmm5 - - vmovdqu 64(%rdx),%xmm15 - vpalignr $8,%xmm10,%xmm10,%xmm12 - vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 - vpshufb %xmm13,%xmm15,%xmm15 - vpxor %xmm3,%xmm0,%xmm0 - vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 - vmovdqu 64-64(%rsi),%xmm6 - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 - vxorps %xmm15,%xmm8,%xmm8 - vpxor %xmm5,%xmm2,%xmm2 - - vmovdqu 48(%rdx),%xmm14 - vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 - vpshufb %xmm13,%xmm14,%xmm14 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 - vmovdqu 96-64(%rsi),%xmm6 - vpunpckhqdq 
%xmm14,%xmm14,%xmm9 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 - vmovdqu 128-64(%rsi),%xmm7 - vpxor %xmm14,%xmm9,%xmm9 - vpxor %xmm2,%xmm5,%xmm5 - - vmovdqu 32(%rdx),%xmm15 - vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 - vpshufb %xmm13,%xmm15,%xmm15 - vpxor %xmm3,%xmm0,%xmm0 - vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 - vmovdqu 112-64(%rsi),%xmm6 - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm4,%xmm1,%xmm1 - vpclmulqdq $0x00,%xmm7,%xmm9,%xmm2 - vpxor %xmm15,%xmm8,%xmm8 - vpxor %xmm5,%xmm2,%xmm2 - vxorps %xmm12,%xmm10,%xmm10 - - vmovdqu 16(%rdx),%xmm14 - vpalignr $8,%xmm10,%xmm10,%xmm12 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm3 - vpshufb %xmm13,%xmm14,%xmm14 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm4 - vmovdqu 144-64(%rsi),%xmm6 - vpclmulqdq $0x10,(%r10),%xmm10,%xmm10 - vxorps %xmm11,%xmm12,%xmm12 - vpunpckhqdq %xmm14,%xmm14,%xmm9 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $0x10,%xmm7,%xmm8,%xmm5 - vmovdqu 176-64(%rsi),%xmm7 - vpxor %xmm14,%xmm9,%xmm9 - vpxor %xmm2,%xmm5,%xmm5 - - vmovdqu (%rdx),%xmm15 - vpclmulqdq $0x00,%xmm6,%xmm14,%xmm0 - vpshufb %xmm13,%xmm15,%xmm15 - vpclmulqdq $0x11,%xmm6,%xmm14,%xmm1 - vmovdqu 160-64(%rsi),%xmm6 - vpxor %xmm12,%xmm15,%xmm15 - vpclmulqdq $0x10,%xmm7,%xmm9,%xmm2 - vpxor %xmm10,%xmm15,%xmm15 - - leaq 128(%rdx),%rdx - subq $0x80,%rcx - jnc .Loop8x_avx - - addq $0x80,%rcx - jmp .Ltail_no_xor_avx - -.align 32 -.Lshort_avx: - vmovdqu -16(%rdx,%rcx,1),%xmm14 - leaq (%rdx,%rcx,1),%rdx - vmovdqu 0-64(%rsi),%xmm6 - vmovdqu 32-64(%rsi),%xmm7 - vpshufb %xmm13,%xmm14,%xmm15 - - vmovdqa %xmm0,%xmm3 - vmovdqa %xmm1,%xmm4 - vmovdqa %xmm2,%xmm5 - subq $0x10,%rcx - jz .Ltail_avx - - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 - vpxor %xmm15,%xmm8,%xmm8 - vmovdqu -32(%rdx),%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 - vmovdqu 16-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm15 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 - vpsrldq 
$8,%xmm7,%xmm7 - subq $0x10,%rcx - jz .Ltail_avx - - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 - vpxor %xmm15,%xmm8,%xmm8 - vmovdqu -48(%rdx),%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 - vmovdqu 48-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm15 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 - vmovdqu 80-64(%rsi),%xmm7 - subq $0x10,%rcx - jz .Ltail_avx - - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 - vpxor %xmm15,%xmm8,%xmm8 - vmovdqu -64(%rdx),%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 - vmovdqu 64-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm15 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 - vpsrldq $8,%xmm7,%xmm7 - subq $0x10,%rcx - jz .Ltail_avx - - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 - vpxor %xmm15,%xmm8,%xmm8 - vmovdqu -80(%rdx),%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 - vmovdqu 96-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm15 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 - vmovdqu 128-64(%rsi),%xmm7 - subq $0x10,%rcx - jz .Ltail_avx - - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 - vpxor %xmm15,%xmm8,%xmm8 - vmovdqu -96(%rdx),%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 - vmovdqu 112-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm15 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 - vpsrldq $8,%xmm7,%xmm7 - subq $0x10,%rcx - jz .Ltail_avx - - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 - vpxor %xmm15,%xmm8,%xmm8 - vmovdqu -112(%rdx),%xmm14 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 - vmovdqu 144-64(%rsi),%xmm6 - vpshufb %xmm13,%xmm14,%xmm15 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 - 
vmovq 184-64(%rsi),%xmm7 - subq $0x10,%rcx - jmp .Ltail_avx - -.align 32 -.Ltail_avx: - vpxor %xmm10,%xmm15,%xmm15 -.Ltail_no_xor_avx: - vpunpckhqdq %xmm15,%xmm15,%xmm8 - vpxor %xmm0,%xmm3,%xmm3 - vpclmulqdq $0x00,%xmm6,%xmm15,%xmm0 - vpxor %xmm15,%xmm8,%xmm8 - vpxor %xmm1,%xmm4,%xmm4 - vpclmulqdq $0x11,%xmm6,%xmm15,%xmm1 - vpxor %xmm2,%xmm5,%xmm5 - vpclmulqdq $0x00,%xmm7,%xmm8,%xmm2 - - vmovdqu (%r10),%xmm12 - - vpxor %xmm0,%xmm3,%xmm10 - vpxor %xmm1,%xmm4,%xmm11 - vpxor %xmm2,%xmm5,%xmm5 - - vpxor %xmm10,%xmm5,%xmm5 - vpxor %xmm11,%xmm5,%xmm5 - vpslldq $8,%xmm5,%xmm9 - vpsrldq $8,%xmm5,%xmm5 - vpxor %xmm9,%xmm10,%xmm10 - vpxor %xmm5,%xmm11,%xmm11 - - vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 - vpalignr $8,%xmm10,%xmm10,%xmm10 - vpxor %xmm9,%xmm10,%xmm10 - - vpclmulqdq $0x10,%xmm12,%xmm10,%xmm9 - vpalignr $8,%xmm10,%xmm10,%xmm10 - vpxor %xmm11,%xmm10,%xmm10 - vpxor %xmm9,%xmm10,%xmm10 - - cmpq $0,%rcx - jne .Lshort_avx - - vpshufb %xmm13,%xmm10,%xmm10 - vmovdqu %xmm10,(%rdi) - vzeroupper - .byte 0xf3,0xc3 -.cfi_endproc -.size gcm_ghash_avx,.-gcm_ghash_avx -.align 64 -.Lbswap_mask: -.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0 -.L0x1c2_polynomial: -.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2 -.L7_mask: -.long 7,0,7,0 -.align 64 - -.byte 71,72,65,83,72,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 64 -#endif -.section .note.GNU-stack,"",@progbits diff --git a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/md5-x86_64.S b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/md5-x86_64.S deleted file mode 100644 index 4f082070..00000000 --- a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/md5-x86_64.S +++ /dev/null @@ -1,702 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text -.align 16 - -.globl md5_block_asm_data_order -.hidden md5_block_asm_data_order -.type md5_block_asm_data_order,@function -md5_block_asm_data_order: -.cfi_startproc - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset r12,-32 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset r14,-40 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset r15,-48 -.Lprologue: - - - - - movq %rdi,%rbp - shlq $6,%rdx - leaq (%rsi,%rdx,1),%rdi - movl 0(%rbp),%eax - movl 4(%rbp),%ebx - movl 8(%rbp),%ecx - movl 12(%rbp),%edx - - - - - - - - cmpq %rdi,%rsi - je .Lend - - -.Lloop: - movl %eax,%r8d - movl %ebx,%r9d - movl %ecx,%r14d - movl %edx,%r15d - movl 0(%rsi),%r10d - movl %edx,%r11d - xorl %ecx,%r11d - leal -680876936(%rax,%r10,1),%eax - andl %ebx,%r11d - xorl %edx,%r11d - movl 4(%rsi),%r10d - addl %r11d,%eax - roll $7,%eax - movl %ecx,%r11d - addl %ebx,%eax - xorl %ebx,%r11d - leal -389564586(%rdx,%r10,1),%edx - andl %eax,%r11d - xorl %ecx,%r11d - movl 8(%rsi),%r10d - addl %r11d,%edx - roll $12,%edx - movl %ebx,%r11d - addl %eax,%edx - xorl %eax,%r11d - leal 606105819(%rcx,%r10,1),%ecx - andl %edx,%r11d - xorl %ebx,%r11d - movl 12(%rsi),%r10d - addl %r11d,%ecx - roll $17,%ecx - movl %eax,%r11d - addl %edx,%ecx - xorl %edx,%r11d - leal -1044525330(%rbx,%r10,1),%ebx - andl %ecx,%r11d - xorl %eax,%r11d - movl 16(%rsi),%r10d - addl %r11d,%ebx - roll $22,%ebx - movl %edx,%r11d - addl %ecx,%ebx - xorl %ecx,%r11d - leal -176418897(%rax,%r10,1),%eax - andl %ebx,%r11d - xorl %edx,%r11d - movl 20(%rsi),%r10d - addl %r11d,%eax - roll $7,%eax - movl %ecx,%r11d - addl %ebx,%eax - xorl %ebx,%r11d - leal 1200080426(%rdx,%r10,1),%edx - andl 
%eax,%r11d - xorl %ecx,%r11d - movl 24(%rsi),%r10d - addl %r11d,%edx - roll $12,%edx - movl %ebx,%r11d - addl %eax,%edx - xorl %eax,%r11d - leal -1473231341(%rcx,%r10,1),%ecx - andl %edx,%r11d - xorl %ebx,%r11d - movl 28(%rsi),%r10d - addl %r11d,%ecx - roll $17,%ecx - movl %eax,%r11d - addl %edx,%ecx - xorl %edx,%r11d - leal -45705983(%rbx,%r10,1),%ebx - andl %ecx,%r11d - xorl %eax,%r11d - movl 32(%rsi),%r10d - addl %r11d,%ebx - roll $22,%ebx - movl %edx,%r11d - addl %ecx,%ebx - xorl %ecx,%r11d - leal 1770035416(%rax,%r10,1),%eax - andl %ebx,%r11d - xorl %edx,%r11d - movl 36(%rsi),%r10d - addl %r11d,%eax - roll $7,%eax - movl %ecx,%r11d - addl %ebx,%eax - xorl %ebx,%r11d - leal -1958414417(%rdx,%r10,1),%edx - andl %eax,%r11d - xorl %ecx,%r11d - movl 40(%rsi),%r10d - addl %r11d,%edx - roll $12,%edx - movl %ebx,%r11d - addl %eax,%edx - xorl %eax,%r11d - leal -42063(%rcx,%r10,1),%ecx - andl %edx,%r11d - xorl %ebx,%r11d - movl 44(%rsi),%r10d - addl %r11d,%ecx - roll $17,%ecx - movl %eax,%r11d - addl %edx,%ecx - xorl %edx,%r11d - leal -1990404162(%rbx,%r10,1),%ebx - andl %ecx,%r11d - xorl %eax,%r11d - movl 48(%rsi),%r10d - addl %r11d,%ebx - roll $22,%ebx - movl %edx,%r11d - addl %ecx,%ebx - xorl %ecx,%r11d - leal 1804603682(%rax,%r10,1),%eax - andl %ebx,%r11d - xorl %edx,%r11d - movl 52(%rsi),%r10d - addl %r11d,%eax - roll $7,%eax - movl %ecx,%r11d - addl %ebx,%eax - xorl %ebx,%r11d - leal -40341101(%rdx,%r10,1),%edx - andl %eax,%r11d - xorl %ecx,%r11d - movl 56(%rsi),%r10d - addl %r11d,%edx - roll $12,%edx - movl %ebx,%r11d - addl %eax,%edx - xorl %eax,%r11d - leal -1502002290(%rcx,%r10,1),%ecx - andl %edx,%r11d - xorl %ebx,%r11d - movl 60(%rsi),%r10d - addl %r11d,%ecx - roll $17,%ecx - movl %eax,%r11d - addl %edx,%ecx - xorl %edx,%r11d - leal 1236535329(%rbx,%r10,1),%ebx - andl %ecx,%r11d - xorl %eax,%r11d - movl 0(%rsi),%r10d - addl %r11d,%ebx - roll $22,%ebx - movl %edx,%r11d - addl %ecx,%ebx - movl 4(%rsi),%r10d - movl %edx,%r11d - movl %edx,%r12d - notl %r11d - 
leal -165796510(%rax,%r10,1),%eax - andl %ebx,%r12d - andl %ecx,%r11d - movl 24(%rsi),%r10d - orl %r11d,%r12d - movl %ecx,%r11d - addl %r12d,%eax - movl %ecx,%r12d - roll $5,%eax - addl %ebx,%eax - notl %r11d - leal -1069501632(%rdx,%r10,1),%edx - andl %eax,%r12d - andl %ebx,%r11d - movl 44(%rsi),%r10d - orl %r11d,%r12d - movl %ebx,%r11d - addl %r12d,%edx - movl %ebx,%r12d - roll $9,%edx - addl %eax,%edx - notl %r11d - leal 643717713(%rcx,%r10,1),%ecx - andl %edx,%r12d - andl %eax,%r11d - movl 0(%rsi),%r10d - orl %r11d,%r12d - movl %eax,%r11d - addl %r12d,%ecx - movl %eax,%r12d - roll $14,%ecx - addl %edx,%ecx - notl %r11d - leal -373897302(%rbx,%r10,1),%ebx - andl %ecx,%r12d - andl %edx,%r11d - movl 20(%rsi),%r10d - orl %r11d,%r12d - movl %edx,%r11d - addl %r12d,%ebx - movl %edx,%r12d - roll $20,%ebx - addl %ecx,%ebx - notl %r11d - leal -701558691(%rax,%r10,1),%eax - andl %ebx,%r12d - andl %ecx,%r11d - movl 40(%rsi),%r10d - orl %r11d,%r12d - movl %ecx,%r11d - addl %r12d,%eax - movl %ecx,%r12d - roll $5,%eax - addl %ebx,%eax - notl %r11d - leal 38016083(%rdx,%r10,1),%edx - andl %eax,%r12d - andl %ebx,%r11d - movl 60(%rsi),%r10d - orl %r11d,%r12d - movl %ebx,%r11d - addl %r12d,%edx - movl %ebx,%r12d - roll $9,%edx - addl %eax,%edx - notl %r11d - leal -660478335(%rcx,%r10,1),%ecx - andl %edx,%r12d - andl %eax,%r11d - movl 16(%rsi),%r10d - orl %r11d,%r12d - movl %eax,%r11d - addl %r12d,%ecx - movl %eax,%r12d - roll $14,%ecx - addl %edx,%ecx - notl %r11d - leal -405537848(%rbx,%r10,1),%ebx - andl %ecx,%r12d - andl %edx,%r11d - movl 36(%rsi),%r10d - orl %r11d,%r12d - movl %edx,%r11d - addl %r12d,%ebx - movl %edx,%r12d - roll $20,%ebx - addl %ecx,%ebx - notl %r11d - leal 568446438(%rax,%r10,1),%eax - andl %ebx,%r12d - andl %ecx,%r11d - movl 56(%rsi),%r10d - orl %r11d,%r12d - movl %ecx,%r11d - addl %r12d,%eax - movl %ecx,%r12d - roll $5,%eax - addl %ebx,%eax - notl %r11d - leal -1019803690(%rdx,%r10,1),%edx - andl %eax,%r12d - andl %ebx,%r11d - movl 12(%rsi),%r10d - orl 
%r11d,%r12d - movl %ebx,%r11d - addl %r12d,%edx - movl %ebx,%r12d - roll $9,%edx - addl %eax,%edx - notl %r11d - leal -187363961(%rcx,%r10,1),%ecx - andl %edx,%r12d - andl %eax,%r11d - movl 32(%rsi),%r10d - orl %r11d,%r12d - movl %eax,%r11d - addl %r12d,%ecx - movl %eax,%r12d - roll $14,%ecx - addl %edx,%ecx - notl %r11d - leal 1163531501(%rbx,%r10,1),%ebx - andl %ecx,%r12d - andl %edx,%r11d - movl 52(%rsi),%r10d - orl %r11d,%r12d - movl %edx,%r11d - addl %r12d,%ebx - movl %edx,%r12d - roll $20,%ebx - addl %ecx,%ebx - notl %r11d - leal -1444681467(%rax,%r10,1),%eax - andl %ebx,%r12d - andl %ecx,%r11d - movl 8(%rsi),%r10d - orl %r11d,%r12d - movl %ecx,%r11d - addl %r12d,%eax - movl %ecx,%r12d - roll $5,%eax - addl %ebx,%eax - notl %r11d - leal -51403784(%rdx,%r10,1),%edx - andl %eax,%r12d - andl %ebx,%r11d - movl 28(%rsi),%r10d - orl %r11d,%r12d - movl %ebx,%r11d - addl %r12d,%edx - movl %ebx,%r12d - roll $9,%edx - addl %eax,%edx - notl %r11d - leal 1735328473(%rcx,%r10,1),%ecx - andl %edx,%r12d - andl %eax,%r11d - movl 48(%rsi),%r10d - orl %r11d,%r12d - movl %eax,%r11d - addl %r12d,%ecx - movl %eax,%r12d - roll $14,%ecx - addl %edx,%ecx - notl %r11d - leal -1926607734(%rbx,%r10,1),%ebx - andl %ecx,%r12d - andl %edx,%r11d - movl 0(%rsi),%r10d - orl %r11d,%r12d - movl %edx,%r11d - addl %r12d,%ebx - movl %edx,%r12d - roll $20,%ebx - addl %ecx,%ebx - movl 20(%rsi),%r10d - movl %ecx,%r11d - leal -378558(%rax,%r10,1),%eax - movl 32(%rsi),%r10d - xorl %edx,%r11d - xorl %ebx,%r11d - addl %r11d,%eax - roll $4,%eax - movl %ebx,%r11d - addl %ebx,%eax - leal -2022574463(%rdx,%r10,1),%edx - movl 44(%rsi),%r10d - xorl %ecx,%r11d - xorl %eax,%r11d - addl %r11d,%edx - roll $11,%edx - movl %eax,%r11d - addl %eax,%edx - leal 1839030562(%rcx,%r10,1),%ecx - movl 56(%rsi),%r10d - xorl %ebx,%r11d - xorl %edx,%r11d - addl %r11d,%ecx - roll $16,%ecx - movl %edx,%r11d - addl %edx,%ecx - leal -35309556(%rbx,%r10,1),%ebx - movl 4(%rsi),%r10d - xorl %eax,%r11d - xorl %ecx,%r11d - addl 
%r11d,%ebx - roll $23,%ebx - movl %ecx,%r11d - addl %ecx,%ebx - leal -1530992060(%rax,%r10,1),%eax - movl 16(%rsi),%r10d - xorl %edx,%r11d - xorl %ebx,%r11d - addl %r11d,%eax - roll $4,%eax - movl %ebx,%r11d - addl %ebx,%eax - leal 1272893353(%rdx,%r10,1),%edx - movl 28(%rsi),%r10d - xorl %ecx,%r11d - xorl %eax,%r11d - addl %r11d,%edx - roll $11,%edx - movl %eax,%r11d - addl %eax,%edx - leal -155497632(%rcx,%r10,1),%ecx - movl 40(%rsi),%r10d - xorl %ebx,%r11d - xorl %edx,%r11d - addl %r11d,%ecx - roll $16,%ecx - movl %edx,%r11d - addl %edx,%ecx - leal -1094730640(%rbx,%r10,1),%ebx - movl 52(%rsi),%r10d - xorl %eax,%r11d - xorl %ecx,%r11d - addl %r11d,%ebx - roll $23,%ebx - movl %ecx,%r11d - addl %ecx,%ebx - leal 681279174(%rax,%r10,1),%eax - movl 0(%rsi),%r10d - xorl %edx,%r11d - xorl %ebx,%r11d - addl %r11d,%eax - roll $4,%eax - movl %ebx,%r11d - addl %ebx,%eax - leal -358537222(%rdx,%r10,1),%edx - movl 12(%rsi),%r10d - xorl %ecx,%r11d - xorl %eax,%r11d - addl %r11d,%edx - roll $11,%edx - movl %eax,%r11d - addl %eax,%edx - leal -722521979(%rcx,%r10,1),%ecx - movl 24(%rsi),%r10d - xorl %ebx,%r11d - xorl %edx,%r11d - addl %r11d,%ecx - roll $16,%ecx - movl %edx,%r11d - addl %edx,%ecx - leal 76029189(%rbx,%r10,1),%ebx - movl 36(%rsi),%r10d - xorl %eax,%r11d - xorl %ecx,%r11d - addl %r11d,%ebx - roll $23,%ebx - movl %ecx,%r11d - addl %ecx,%ebx - leal -640364487(%rax,%r10,1),%eax - movl 48(%rsi),%r10d - xorl %edx,%r11d - xorl %ebx,%r11d - addl %r11d,%eax - roll $4,%eax - movl %ebx,%r11d - addl %ebx,%eax - leal -421815835(%rdx,%r10,1),%edx - movl 60(%rsi),%r10d - xorl %ecx,%r11d - xorl %eax,%r11d - addl %r11d,%edx - roll $11,%edx - movl %eax,%r11d - addl %eax,%edx - leal 530742520(%rcx,%r10,1),%ecx - movl 8(%rsi),%r10d - xorl %ebx,%r11d - xorl %edx,%r11d - addl %r11d,%ecx - roll $16,%ecx - movl %edx,%r11d - addl %edx,%ecx - leal -995338651(%rbx,%r10,1),%ebx - movl 0(%rsi),%r10d - xorl %eax,%r11d - xorl %ecx,%r11d - addl %r11d,%ebx - roll $23,%ebx - movl %ecx,%r11d - addl 
%ecx,%ebx - movl 0(%rsi),%r10d - movl $0xffffffff,%r11d - xorl %edx,%r11d - leal -198630844(%rax,%r10,1),%eax - orl %ebx,%r11d - xorl %ecx,%r11d - addl %r11d,%eax - movl 28(%rsi),%r10d - movl $0xffffffff,%r11d - roll $6,%eax - xorl %ecx,%r11d - addl %ebx,%eax - leal 1126891415(%rdx,%r10,1),%edx - orl %eax,%r11d - xorl %ebx,%r11d - addl %r11d,%edx - movl 56(%rsi),%r10d - movl $0xffffffff,%r11d - roll $10,%edx - xorl %ebx,%r11d - addl %eax,%edx - leal -1416354905(%rcx,%r10,1),%ecx - orl %edx,%r11d - xorl %eax,%r11d - addl %r11d,%ecx - movl 20(%rsi),%r10d - movl $0xffffffff,%r11d - roll $15,%ecx - xorl %eax,%r11d - addl %edx,%ecx - leal -57434055(%rbx,%r10,1),%ebx - orl %ecx,%r11d - xorl %edx,%r11d - addl %r11d,%ebx - movl 48(%rsi),%r10d - movl $0xffffffff,%r11d - roll $21,%ebx - xorl %edx,%r11d - addl %ecx,%ebx - leal 1700485571(%rax,%r10,1),%eax - orl %ebx,%r11d - xorl %ecx,%r11d - addl %r11d,%eax - movl 12(%rsi),%r10d - movl $0xffffffff,%r11d - roll $6,%eax - xorl %ecx,%r11d - addl %ebx,%eax - leal -1894986606(%rdx,%r10,1),%edx - orl %eax,%r11d - xorl %ebx,%r11d - addl %r11d,%edx - movl 40(%rsi),%r10d - movl $0xffffffff,%r11d - roll $10,%edx - xorl %ebx,%r11d - addl %eax,%edx - leal -1051523(%rcx,%r10,1),%ecx - orl %edx,%r11d - xorl %eax,%r11d - addl %r11d,%ecx - movl 4(%rsi),%r10d - movl $0xffffffff,%r11d - roll $15,%ecx - xorl %eax,%r11d - addl %edx,%ecx - leal -2054922799(%rbx,%r10,1),%ebx - orl %ecx,%r11d - xorl %edx,%r11d - addl %r11d,%ebx - movl 32(%rsi),%r10d - movl $0xffffffff,%r11d - roll $21,%ebx - xorl %edx,%r11d - addl %ecx,%ebx - leal 1873313359(%rax,%r10,1),%eax - orl %ebx,%r11d - xorl %ecx,%r11d - addl %r11d,%eax - movl 60(%rsi),%r10d - movl $0xffffffff,%r11d - roll $6,%eax - xorl %ecx,%r11d - addl %ebx,%eax - leal -30611744(%rdx,%r10,1),%edx - orl %eax,%r11d - xorl %ebx,%r11d - addl %r11d,%edx - movl 24(%rsi),%r10d - movl $0xffffffff,%r11d - roll $10,%edx - xorl %ebx,%r11d - addl %eax,%edx - leal -1560198380(%rcx,%r10,1),%ecx - orl %edx,%r11d - xorl 
%eax,%r11d - addl %r11d,%ecx - movl 52(%rsi),%r10d - movl $0xffffffff,%r11d - roll $15,%ecx - xorl %eax,%r11d - addl %edx,%ecx - leal 1309151649(%rbx,%r10,1),%ebx - orl %ecx,%r11d - xorl %edx,%r11d - addl %r11d,%ebx - movl 16(%rsi),%r10d - movl $0xffffffff,%r11d - roll $21,%ebx - xorl %edx,%r11d - addl %ecx,%ebx - leal -145523070(%rax,%r10,1),%eax - orl %ebx,%r11d - xorl %ecx,%r11d - addl %r11d,%eax - movl 44(%rsi),%r10d - movl $0xffffffff,%r11d - roll $6,%eax - xorl %ecx,%r11d - addl %ebx,%eax - leal -1120210379(%rdx,%r10,1),%edx - orl %eax,%r11d - xorl %ebx,%r11d - addl %r11d,%edx - movl 8(%rsi),%r10d - movl $0xffffffff,%r11d - roll $10,%edx - xorl %ebx,%r11d - addl %eax,%edx - leal 718787259(%rcx,%r10,1),%ecx - orl %edx,%r11d - xorl %eax,%r11d - addl %r11d,%ecx - movl 36(%rsi),%r10d - movl $0xffffffff,%r11d - roll $15,%ecx - xorl %eax,%r11d - addl %edx,%ecx - leal -343485551(%rbx,%r10,1),%ebx - orl %ecx,%r11d - xorl %edx,%r11d - addl %r11d,%ebx - movl 0(%rsi),%r10d - movl $0xffffffff,%r11d - roll $21,%ebx - xorl %edx,%r11d - addl %ecx,%ebx - - addl %r8d,%eax - addl %r9d,%ebx - addl %r14d,%ecx - addl %r15d,%edx - - - addq $64,%rsi - cmpq %rdi,%rsi - jb .Lloop - - -.Lend: - movl %eax,0(%rbp) - movl %ebx,4(%rbp) - movl %ecx,8(%rbp) - movl %edx,12(%rbp) - - movq (%rsp),%r15 -.cfi_restore r15 - movq 8(%rsp),%r14 -.cfi_restore r14 - movq 16(%rsp),%r12 -.cfi_restore r12 - movq 24(%rsp),%rbx -.cfi_restore rbx - movq 32(%rsp),%rbp -.cfi_restore rbp - addq $40,%rsp -.cfi_adjust_cfa_offset -40 -.Lepilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size md5_block_asm_data_order,.-md5_block_asm_data_order -#endif -.section .note.GNU-stack,"",@progbits diff --git a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/p256-x86_64-asm.S b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/p256-x86_64-asm.S deleted file mode 100644 index 655f1a2a..00000000 --- a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/p256-x86_64-asm.S +++ /dev/null @@ -1,4543 +0,0 @@ -// This file is 
generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text -.extern OPENSSL_ia32cap_P -.hidden OPENSSL_ia32cap_P - - -.align 64 -.Lpoly: -.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001 - -.LOne: -.long 1,1,1,1,1,1,1,1 -.LTwo: -.long 2,2,2,2,2,2,2,2 -.LThree: -.long 3,3,3,3,3,3,3,3 -.LONE_mont: -.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe - - -.Lord: -.quad 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, 0xffffffff00000000 -.LordK: -.quad 0xccd1c8aaee00bc4f - - - -.globl ecp_nistz256_neg -.hidden ecp_nistz256_neg -.type ecp_nistz256_neg,@function -.align 32 -ecp_nistz256_neg: -.cfi_startproc - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-16 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-24 -.Lneg_body: - - xorq %r8,%r8 - xorq %r9,%r9 - xorq %r10,%r10 - xorq %r11,%r11 - xorq %r13,%r13 - - subq 0(%rsi),%r8 - sbbq 8(%rsi),%r9 - sbbq 16(%rsi),%r10 - movq %r8,%rax - sbbq 24(%rsi),%r11 - leaq .Lpoly(%rip),%rsi - movq %r9,%rdx - sbbq $0,%r13 - - addq 0(%rsi),%r8 - movq %r10,%rcx - adcq 8(%rsi),%r9 - adcq 16(%rsi),%r10 - movq %r11,%r12 - adcq 24(%rsi),%r11 - testq %r13,%r13 - - cmovzq %rax,%r8 - cmovzq %rdx,%r9 - movq %r8,0(%rdi) - cmovzq %rcx,%r10 - movq %r9,8(%rdi) - cmovzq %r12,%r11 - movq %r10,16(%rdi) - movq %r11,24(%rdi) - - movq 0(%rsp),%r13 -.cfi_restore %r13 - movq 8(%rsp),%r12 -.cfi_restore %r12 - leaq 16(%rsp),%rsp -.cfi_adjust_cfa_offset -16 -.Lneg_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size ecp_nistz256_neg,.-ecp_nistz256_neg - - - - - - -.globl ecp_nistz256_ord_mul_mont -.hidden ecp_nistz256_ord_mul_mont -.type ecp_nistz256_ord_mul_mont,@function -.align 32 
-ecp_nistz256_ord_mul_mont: -.cfi_startproc - leaq OPENSSL_ia32cap_P(%rip),%rcx - movq 8(%rcx),%rcx - andl $0x80100,%ecx - cmpl $0x80100,%ecx - je .Lecp_nistz256_ord_mul_montx - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 -.Lord_mul_body: - - movq 0(%rdx),%rax - movq %rdx,%rbx - leaq .Lord(%rip),%r14 - movq .LordK(%rip),%r15 - - - movq %rax,%rcx - mulq 0(%rsi) - movq %rax,%r8 - movq %rcx,%rax - movq %rdx,%r9 - - mulq 8(%rsi) - addq %rax,%r9 - movq %rcx,%rax - adcq $0,%rdx - movq %rdx,%r10 - - mulq 16(%rsi) - addq %rax,%r10 - movq %rcx,%rax - adcq $0,%rdx - - movq %r8,%r13 - imulq %r15,%r8 - - movq %rdx,%r11 - mulq 24(%rsi) - addq %rax,%r11 - movq %r8,%rax - adcq $0,%rdx - movq %rdx,%r12 - - - mulq 0(%r14) - movq %r8,%rbp - addq %rax,%r13 - movq %r8,%rax - adcq $0,%rdx - movq %rdx,%rcx - - subq %r8,%r10 - sbbq $0,%r8 - - mulq 8(%r14) - addq %rcx,%r9 - adcq $0,%rdx - addq %rax,%r9 - movq %rbp,%rax - adcq %rdx,%r10 - movq %rbp,%rdx - adcq $0,%r8 - - shlq $32,%rax - shrq $32,%rdx - subq %rax,%r11 - movq 8(%rbx),%rax - sbbq %rdx,%rbp - - addq %r8,%r11 - adcq %rbp,%r12 - adcq $0,%r13 - - - movq %rax,%rcx - mulq 0(%rsi) - addq %rax,%r9 - movq %rcx,%rax - adcq $0,%rdx - movq %rdx,%rbp - - mulq 8(%rsi) - addq %rbp,%r10 - adcq $0,%rdx - addq %rax,%r10 - movq %rcx,%rax - adcq $0,%rdx - movq %rdx,%rbp - - mulq 16(%rsi) - addq %rbp,%r11 - adcq $0,%rdx - addq %rax,%r11 - movq %rcx,%rax - adcq $0,%rdx - - movq %r9,%rcx - imulq %r15,%r9 - - movq %rdx,%rbp - mulq 24(%rsi) - addq %rbp,%r12 - adcq $0,%rdx - xorq %r8,%r8 - addq %rax,%r12 - movq %r9,%rax - adcq %rdx,%r13 - adcq $0,%r8 - - - mulq 0(%r14) - movq %r9,%rbp - addq %rax,%rcx - movq %r9,%rax - adcq %rdx,%rcx - - 
subq %r9,%r11 - sbbq $0,%r9 - - mulq 8(%r14) - addq %rcx,%r10 - adcq $0,%rdx - addq %rax,%r10 - movq %rbp,%rax - adcq %rdx,%r11 - movq %rbp,%rdx - adcq $0,%r9 - - shlq $32,%rax - shrq $32,%rdx - subq %rax,%r12 - movq 16(%rbx),%rax - sbbq %rdx,%rbp - - addq %r9,%r12 - adcq %rbp,%r13 - adcq $0,%r8 - - - movq %rax,%rcx - mulq 0(%rsi) - addq %rax,%r10 - movq %rcx,%rax - adcq $0,%rdx - movq %rdx,%rbp - - mulq 8(%rsi) - addq %rbp,%r11 - adcq $0,%rdx - addq %rax,%r11 - movq %rcx,%rax - adcq $0,%rdx - movq %rdx,%rbp - - mulq 16(%rsi) - addq %rbp,%r12 - adcq $0,%rdx - addq %rax,%r12 - movq %rcx,%rax - adcq $0,%rdx - - movq %r10,%rcx - imulq %r15,%r10 - - movq %rdx,%rbp - mulq 24(%rsi) - addq %rbp,%r13 - adcq $0,%rdx - xorq %r9,%r9 - addq %rax,%r13 - movq %r10,%rax - adcq %rdx,%r8 - adcq $0,%r9 - - - mulq 0(%r14) - movq %r10,%rbp - addq %rax,%rcx - movq %r10,%rax - adcq %rdx,%rcx - - subq %r10,%r12 - sbbq $0,%r10 - - mulq 8(%r14) - addq %rcx,%r11 - adcq $0,%rdx - addq %rax,%r11 - movq %rbp,%rax - adcq %rdx,%r12 - movq %rbp,%rdx - adcq $0,%r10 - - shlq $32,%rax - shrq $32,%rdx - subq %rax,%r13 - movq 24(%rbx),%rax - sbbq %rdx,%rbp - - addq %r10,%r13 - adcq %rbp,%r8 - adcq $0,%r9 - - - movq %rax,%rcx - mulq 0(%rsi) - addq %rax,%r11 - movq %rcx,%rax - adcq $0,%rdx - movq %rdx,%rbp - - mulq 8(%rsi) - addq %rbp,%r12 - adcq $0,%rdx - addq %rax,%r12 - movq %rcx,%rax - adcq $0,%rdx - movq %rdx,%rbp - - mulq 16(%rsi) - addq %rbp,%r13 - adcq $0,%rdx - addq %rax,%r13 - movq %rcx,%rax - adcq $0,%rdx - - movq %r11,%rcx - imulq %r15,%r11 - - movq %rdx,%rbp - mulq 24(%rsi) - addq %rbp,%r8 - adcq $0,%rdx - xorq %r10,%r10 - addq %rax,%r8 - movq %r11,%rax - adcq %rdx,%r9 - adcq $0,%r10 - - - mulq 0(%r14) - movq %r11,%rbp - addq %rax,%rcx - movq %r11,%rax - adcq %rdx,%rcx - - subq %r11,%r13 - sbbq $0,%r11 - - mulq 8(%r14) - addq %rcx,%r12 - adcq $0,%rdx - addq %rax,%r12 - movq %rbp,%rax - adcq %rdx,%r13 - movq %rbp,%rdx - adcq $0,%r11 - - shlq $32,%rax - shrq $32,%rdx - subq %rax,%r8 - sbbq 
%rdx,%rbp - - addq %r11,%r8 - adcq %rbp,%r9 - adcq $0,%r10 - - - movq %r12,%rsi - subq 0(%r14),%r12 - movq %r13,%r11 - sbbq 8(%r14),%r13 - movq %r8,%rcx - sbbq 16(%r14),%r8 - movq %r9,%rbp - sbbq 24(%r14),%r9 - sbbq $0,%r10 - - cmovcq %rsi,%r12 - cmovcq %r11,%r13 - cmovcq %rcx,%r8 - cmovcq %rbp,%r9 - - movq %r12,0(%rdi) - movq %r13,8(%rdi) - movq %r8,16(%rdi) - movq %r9,24(%rdi) - - movq 0(%rsp),%r15 -.cfi_restore %r15 - movq 8(%rsp),%r14 -.cfi_restore %r14 - movq 16(%rsp),%r13 -.cfi_restore %r13 - movq 24(%rsp),%r12 -.cfi_restore %r12 - movq 32(%rsp),%rbx -.cfi_restore %rbx - movq 40(%rsp),%rbp -.cfi_restore %rbp - leaq 48(%rsp),%rsp -.cfi_adjust_cfa_offset -48 -.Lord_mul_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size ecp_nistz256_ord_mul_mont,.-ecp_nistz256_ord_mul_mont - - - - - - - -.globl ecp_nistz256_ord_sqr_mont -.hidden ecp_nistz256_ord_sqr_mont -.type ecp_nistz256_ord_sqr_mont,@function -.align 32 -ecp_nistz256_ord_sqr_mont: -.cfi_startproc - leaq OPENSSL_ia32cap_P(%rip),%rcx - movq 8(%rcx),%rcx - andl $0x80100,%ecx - cmpl $0x80100,%ecx - je .Lecp_nistz256_ord_sqr_montx - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 -.Lord_sqr_body: - - movq 0(%rsi),%r8 - movq 8(%rsi),%rax - movq 16(%rsi),%r14 - movq 24(%rsi),%r15 - leaq .Lord(%rip),%rsi - movq %rdx,%rbx - jmp .Loop_ord_sqr - -.align 32 -.Loop_ord_sqr: - - movq %rax,%rbp - mulq %r8 - movq %rax,%r9 -.byte 102,72,15,110,205 - movq %r14,%rax - movq %rdx,%r10 - - mulq %r8 - addq %rax,%r10 - movq %r15,%rax -.byte 102,73,15,110,214 - adcq $0,%rdx - movq %rdx,%r11 - - mulq %r8 - addq %rax,%r11 - movq %r15,%rax -.byte 102,73,15,110,223 - adcq $0,%rdx - movq %rdx,%r12 - - - mulq %r14 - movq %rax,%r13 
- movq %r14,%rax - movq %rdx,%r14 - - - mulq %rbp - addq %rax,%r11 - movq %r15,%rax - adcq $0,%rdx - movq %rdx,%r15 - - mulq %rbp - addq %rax,%r12 - adcq $0,%rdx - - addq %r15,%r12 - adcq %rdx,%r13 - adcq $0,%r14 - - - xorq %r15,%r15 - movq %r8,%rax - addq %r9,%r9 - adcq %r10,%r10 - adcq %r11,%r11 - adcq %r12,%r12 - adcq %r13,%r13 - adcq %r14,%r14 - adcq $0,%r15 - - - mulq %rax - movq %rax,%r8 -.byte 102,72,15,126,200 - movq %rdx,%rbp - - mulq %rax - addq %rbp,%r9 - adcq %rax,%r10 -.byte 102,72,15,126,208 - adcq $0,%rdx - movq %rdx,%rbp - - mulq %rax - addq %rbp,%r11 - adcq %rax,%r12 -.byte 102,72,15,126,216 - adcq $0,%rdx - movq %rdx,%rbp - - movq %r8,%rcx - imulq 32(%rsi),%r8 - - mulq %rax - addq %rbp,%r13 - adcq %rax,%r14 - movq 0(%rsi),%rax - adcq %rdx,%r15 - - - mulq %r8 - movq %r8,%rbp - addq %rax,%rcx - movq 8(%rsi),%rax - adcq %rdx,%rcx - - subq %r8,%r10 - sbbq $0,%rbp - - mulq %r8 - addq %rcx,%r9 - adcq $0,%rdx - addq %rax,%r9 - movq %r8,%rax - adcq %rdx,%r10 - movq %r8,%rdx - adcq $0,%rbp - - movq %r9,%rcx - imulq 32(%rsi),%r9 - - shlq $32,%rax - shrq $32,%rdx - subq %rax,%r11 - movq 0(%rsi),%rax - sbbq %rdx,%r8 - - addq %rbp,%r11 - adcq $0,%r8 - - - mulq %r9 - movq %r9,%rbp - addq %rax,%rcx - movq 8(%rsi),%rax - adcq %rdx,%rcx - - subq %r9,%r11 - sbbq $0,%rbp - - mulq %r9 - addq %rcx,%r10 - adcq $0,%rdx - addq %rax,%r10 - movq %r9,%rax - adcq %rdx,%r11 - movq %r9,%rdx - adcq $0,%rbp - - movq %r10,%rcx - imulq 32(%rsi),%r10 - - shlq $32,%rax - shrq $32,%rdx - subq %rax,%r8 - movq 0(%rsi),%rax - sbbq %rdx,%r9 - - addq %rbp,%r8 - adcq $0,%r9 - - - mulq %r10 - movq %r10,%rbp - addq %rax,%rcx - movq 8(%rsi),%rax - adcq %rdx,%rcx - - subq %r10,%r8 - sbbq $0,%rbp - - mulq %r10 - addq %rcx,%r11 - adcq $0,%rdx - addq %rax,%r11 - movq %r10,%rax - adcq %rdx,%r8 - movq %r10,%rdx - adcq $0,%rbp - - movq %r11,%rcx - imulq 32(%rsi),%r11 - - shlq $32,%rax - shrq $32,%rdx - subq %rax,%r9 - movq 0(%rsi),%rax - sbbq %rdx,%r10 - - addq %rbp,%r9 - adcq $0,%r10 - - - mulq 
%r11 - movq %r11,%rbp - addq %rax,%rcx - movq 8(%rsi),%rax - adcq %rdx,%rcx - - subq %r11,%r9 - sbbq $0,%rbp - - mulq %r11 - addq %rcx,%r8 - adcq $0,%rdx - addq %rax,%r8 - movq %r11,%rax - adcq %rdx,%r9 - movq %r11,%rdx - adcq $0,%rbp - - shlq $32,%rax - shrq $32,%rdx - subq %rax,%r10 - sbbq %rdx,%r11 - - addq %rbp,%r10 - adcq $0,%r11 - - - xorq %rdx,%rdx - addq %r12,%r8 - adcq %r13,%r9 - movq %r8,%r12 - adcq %r14,%r10 - adcq %r15,%r11 - movq %r9,%rax - adcq $0,%rdx - - - subq 0(%rsi),%r8 - movq %r10,%r14 - sbbq 8(%rsi),%r9 - sbbq 16(%rsi),%r10 - movq %r11,%r15 - sbbq 24(%rsi),%r11 - sbbq $0,%rdx - - cmovcq %r12,%r8 - cmovncq %r9,%rax - cmovncq %r10,%r14 - cmovncq %r11,%r15 - - decq %rbx - jnz .Loop_ord_sqr - - movq %r8,0(%rdi) - movq %rax,8(%rdi) - pxor %xmm1,%xmm1 - movq %r14,16(%rdi) - pxor %xmm2,%xmm2 - movq %r15,24(%rdi) - pxor %xmm3,%xmm3 - - movq 0(%rsp),%r15 -.cfi_restore %r15 - movq 8(%rsp),%r14 -.cfi_restore %r14 - movq 16(%rsp),%r13 -.cfi_restore %r13 - movq 24(%rsp),%r12 -.cfi_restore %r12 - movq 32(%rsp),%rbx -.cfi_restore %rbx - movq 40(%rsp),%rbp -.cfi_restore %rbp - leaq 48(%rsp),%rsp -.cfi_adjust_cfa_offset -48 -.Lord_sqr_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size ecp_nistz256_ord_sqr_mont,.-ecp_nistz256_ord_sqr_mont - -.type ecp_nistz256_ord_mul_montx,@function -.align 32 -ecp_nistz256_ord_mul_montx: -.cfi_startproc -.Lecp_nistz256_ord_mul_montx: - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 -.Lord_mulx_body: - - movq %rdx,%rbx - movq 0(%rdx),%rdx - movq 0(%rsi),%r9 - movq 8(%rsi),%r10 - movq 16(%rsi),%r11 - movq 24(%rsi),%r12 - leaq -128(%rsi),%rsi - leaq .Lord-128(%rip),%r14 - movq .LordK(%rip),%r15 - - - mulxq %r9,%r8,%r9 - 
mulxq %r10,%rcx,%r10 - mulxq %r11,%rbp,%r11 - addq %rcx,%r9 - mulxq %r12,%rcx,%r12 - movq %r8,%rdx - mulxq %r15,%rdx,%rax - adcq %rbp,%r10 - adcq %rcx,%r11 - adcq $0,%r12 - - - xorq %r13,%r13 - mulxq 0+128(%r14),%rcx,%rbp - adcxq %rcx,%r8 - adoxq %rbp,%r9 - - mulxq 8+128(%r14),%rcx,%rbp - adcxq %rcx,%r9 - adoxq %rbp,%r10 - - mulxq 16+128(%r14),%rcx,%rbp - adcxq %rcx,%r10 - adoxq %rbp,%r11 - - mulxq 24+128(%r14),%rcx,%rbp - movq 8(%rbx),%rdx - adcxq %rcx,%r11 - adoxq %rbp,%r12 - adcxq %r8,%r12 - adoxq %r8,%r13 - adcq $0,%r13 - - - mulxq 0+128(%rsi),%rcx,%rbp - adcxq %rcx,%r9 - adoxq %rbp,%r10 - - mulxq 8+128(%rsi),%rcx,%rbp - adcxq %rcx,%r10 - adoxq %rbp,%r11 - - mulxq 16+128(%rsi),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq 24+128(%rsi),%rcx,%rbp - movq %r9,%rdx - mulxq %r15,%rdx,%rax - adcxq %rcx,%r12 - adoxq %rbp,%r13 - - adcxq %r8,%r13 - adoxq %r8,%r8 - adcq $0,%r8 - - - mulxq 0+128(%r14),%rcx,%rbp - adcxq %rcx,%r9 - adoxq %rbp,%r10 - - mulxq 8+128(%r14),%rcx,%rbp - adcxq %rcx,%r10 - adoxq %rbp,%r11 - - mulxq 16+128(%r14),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq 24+128(%r14),%rcx,%rbp - movq 16(%rbx),%rdx - adcxq %rcx,%r12 - adoxq %rbp,%r13 - adcxq %r9,%r13 - adoxq %r9,%r8 - adcq $0,%r8 - - - mulxq 0+128(%rsi),%rcx,%rbp - adcxq %rcx,%r10 - adoxq %rbp,%r11 - - mulxq 8+128(%rsi),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq 16+128(%rsi),%rcx,%rbp - adcxq %rcx,%r12 - adoxq %rbp,%r13 - - mulxq 24+128(%rsi),%rcx,%rbp - movq %r10,%rdx - mulxq %r15,%rdx,%rax - adcxq %rcx,%r13 - adoxq %rbp,%r8 - - adcxq %r9,%r8 - adoxq %r9,%r9 - adcq $0,%r9 - - - mulxq 0+128(%r14),%rcx,%rbp - adcxq %rcx,%r10 - adoxq %rbp,%r11 - - mulxq 8+128(%r14),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq 16+128(%r14),%rcx,%rbp - adcxq %rcx,%r12 - adoxq %rbp,%r13 - - mulxq 24+128(%r14),%rcx,%rbp - movq 24(%rbx),%rdx - adcxq %rcx,%r13 - adoxq %rbp,%r8 - adcxq %r10,%r8 - adoxq %r10,%r9 - adcq $0,%r9 - - - mulxq 0+128(%rsi),%rcx,%rbp - adcxq %rcx,%r11 - 
adoxq %rbp,%r12 - - mulxq 8+128(%rsi),%rcx,%rbp - adcxq %rcx,%r12 - adoxq %rbp,%r13 - - mulxq 16+128(%rsi),%rcx,%rbp - adcxq %rcx,%r13 - adoxq %rbp,%r8 - - mulxq 24+128(%rsi),%rcx,%rbp - movq %r11,%rdx - mulxq %r15,%rdx,%rax - adcxq %rcx,%r8 - adoxq %rbp,%r9 - - adcxq %r10,%r9 - adoxq %r10,%r10 - adcq $0,%r10 - - - mulxq 0+128(%r14),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq 8+128(%r14),%rcx,%rbp - adcxq %rcx,%r12 - adoxq %rbp,%r13 - - mulxq 16+128(%r14),%rcx,%rbp - adcxq %rcx,%r13 - adoxq %rbp,%r8 - - mulxq 24+128(%r14),%rcx,%rbp - leaq 128(%r14),%r14 - movq %r12,%rbx - adcxq %rcx,%r8 - adoxq %rbp,%r9 - movq %r13,%rdx - adcxq %r11,%r9 - adoxq %r11,%r10 - adcq $0,%r10 - - - - movq %r8,%rcx - subq 0(%r14),%r12 - sbbq 8(%r14),%r13 - sbbq 16(%r14),%r8 - movq %r9,%rbp - sbbq 24(%r14),%r9 - sbbq $0,%r10 - - cmovcq %rbx,%r12 - cmovcq %rdx,%r13 - cmovcq %rcx,%r8 - cmovcq %rbp,%r9 - - movq %r12,0(%rdi) - movq %r13,8(%rdi) - movq %r8,16(%rdi) - movq %r9,24(%rdi) - - movq 0(%rsp),%r15 -.cfi_restore %r15 - movq 8(%rsp),%r14 -.cfi_restore %r14 - movq 16(%rsp),%r13 -.cfi_restore %r13 - movq 24(%rsp),%r12 -.cfi_restore %r12 - movq 32(%rsp),%rbx -.cfi_restore %rbx - movq 40(%rsp),%rbp -.cfi_restore %rbp - leaq 48(%rsp),%rsp -.cfi_adjust_cfa_offset -48 -.Lord_mulx_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size ecp_nistz256_ord_mul_montx,.-ecp_nistz256_ord_mul_montx - -.type ecp_nistz256_ord_sqr_montx,@function -.align 32 -ecp_nistz256_ord_sqr_montx: -.cfi_startproc -.Lecp_nistz256_ord_sqr_montx: - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 -.Lord_sqrx_body: - - movq %rdx,%rbx - movq 0(%rsi),%rdx - movq 8(%rsi),%r14 - movq 16(%rsi),%r15 - movq 
24(%rsi),%r8 - leaq .Lord(%rip),%rsi - jmp .Loop_ord_sqrx - -.align 32 -.Loop_ord_sqrx: - mulxq %r14,%r9,%r10 - mulxq %r15,%rcx,%r11 - movq %rdx,%rax -.byte 102,73,15,110,206 - mulxq %r8,%rbp,%r12 - movq %r14,%rdx - addq %rcx,%r10 -.byte 102,73,15,110,215 - adcq %rbp,%r11 - adcq $0,%r12 - xorq %r13,%r13 - - mulxq %r15,%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq %r8,%rcx,%rbp - movq %r15,%rdx - adcxq %rcx,%r12 - adoxq %rbp,%r13 - adcq $0,%r13 - - mulxq %r8,%rcx,%r14 - movq %rax,%rdx -.byte 102,73,15,110,216 - xorq %r15,%r15 - adcxq %r9,%r9 - adoxq %rcx,%r13 - adcxq %r10,%r10 - adoxq %r15,%r14 - - - mulxq %rdx,%r8,%rbp -.byte 102,72,15,126,202 - adcxq %r11,%r11 - adoxq %rbp,%r9 - adcxq %r12,%r12 - mulxq %rdx,%rcx,%rax -.byte 102,72,15,126,210 - adcxq %r13,%r13 - adoxq %rcx,%r10 - adcxq %r14,%r14 - mulxq %rdx,%rcx,%rbp -.byte 0x67 -.byte 102,72,15,126,218 - adoxq %rax,%r11 - adcxq %r15,%r15 - adoxq %rcx,%r12 - adoxq %rbp,%r13 - mulxq %rdx,%rcx,%rax - adoxq %rcx,%r14 - adoxq %rax,%r15 - - - movq %r8,%rdx - mulxq 32(%rsi),%rdx,%rcx - - xorq %rax,%rax - mulxq 0(%rsi),%rcx,%rbp - adcxq %rcx,%r8 - adoxq %rbp,%r9 - mulxq 8(%rsi),%rcx,%rbp - adcxq %rcx,%r9 - adoxq %rbp,%r10 - mulxq 16(%rsi),%rcx,%rbp - adcxq %rcx,%r10 - adoxq %rbp,%r11 - mulxq 24(%rsi),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r8 - adcxq %rax,%r8 - - - movq %r9,%rdx - mulxq 32(%rsi),%rdx,%rcx - - mulxq 0(%rsi),%rcx,%rbp - adoxq %rcx,%r9 - adcxq %rbp,%r10 - mulxq 8(%rsi),%rcx,%rbp - adoxq %rcx,%r10 - adcxq %rbp,%r11 - mulxq 16(%rsi),%rcx,%rbp - adoxq %rcx,%r11 - adcxq %rbp,%r8 - mulxq 24(%rsi),%rcx,%rbp - adoxq %rcx,%r8 - adcxq %rbp,%r9 - adoxq %rax,%r9 - - - movq %r10,%rdx - mulxq 32(%rsi),%rdx,%rcx - - mulxq 0(%rsi),%rcx,%rbp - adcxq %rcx,%r10 - adoxq %rbp,%r11 - mulxq 8(%rsi),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r8 - mulxq 16(%rsi),%rcx,%rbp - adcxq %rcx,%r8 - adoxq %rbp,%r9 - mulxq 24(%rsi),%rcx,%rbp - adcxq %rcx,%r9 - adoxq %rbp,%r10 - adcxq %rax,%r10 - - - movq %r11,%rdx - mulxq 
32(%rsi),%rdx,%rcx - - mulxq 0(%rsi),%rcx,%rbp - adoxq %rcx,%r11 - adcxq %rbp,%r8 - mulxq 8(%rsi),%rcx,%rbp - adoxq %rcx,%r8 - adcxq %rbp,%r9 - mulxq 16(%rsi),%rcx,%rbp - adoxq %rcx,%r9 - adcxq %rbp,%r10 - mulxq 24(%rsi),%rcx,%rbp - adoxq %rcx,%r10 - adcxq %rbp,%r11 - adoxq %rax,%r11 - - - addq %r8,%r12 - adcq %r13,%r9 - movq %r12,%rdx - adcq %r14,%r10 - adcq %r15,%r11 - movq %r9,%r14 - adcq $0,%rax - - - subq 0(%rsi),%r12 - movq %r10,%r15 - sbbq 8(%rsi),%r9 - sbbq 16(%rsi),%r10 - movq %r11,%r8 - sbbq 24(%rsi),%r11 - sbbq $0,%rax - - cmovncq %r12,%rdx - cmovncq %r9,%r14 - cmovncq %r10,%r15 - cmovncq %r11,%r8 - - decq %rbx - jnz .Loop_ord_sqrx - - movq %rdx,0(%rdi) - movq %r14,8(%rdi) - pxor %xmm1,%xmm1 - movq %r15,16(%rdi) - pxor %xmm2,%xmm2 - movq %r8,24(%rdi) - pxor %xmm3,%xmm3 - - movq 0(%rsp),%r15 -.cfi_restore %r15 - movq 8(%rsp),%r14 -.cfi_restore %r14 - movq 16(%rsp),%r13 -.cfi_restore %r13 - movq 24(%rsp),%r12 -.cfi_restore %r12 - movq 32(%rsp),%rbx -.cfi_restore %rbx - movq 40(%rsp),%rbp -.cfi_restore %rbp - leaq 48(%rsp),%rsp -.cfi_adjust_cfa_offset -48 -.Lord_sqrx_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size ecp_nistz256_ord_sqr_montx,.-ecp_nistz256_ord_sqr_montx - - - - - - -.globl ecp_nistz256_mul_mont -.hidden ecp_nistz256_mul_mont -.type ecp_nistz256_mul_mont,@function -.align 32 -ecp_nistz256_mul_mont: -.cfi_startproc - leaq OPENSSL_ia32cap_P(%rip),%rcx - movq 8(%rcx),%rcx - andl $0x80100,%ecx -.Lmul_mont: - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 -.Lmul_body: - cmpl $0x80100,%ecx - je .Lmul_montx - movq %rdx,%rbx - movq 0(%rdx),%rax - movq 0(%rsi),%r9 - movq 8(%rsi),%r10 - movq 16(%rsi),%r11 - movq 24(%rsi),%r12 - - call 
__ecp_nistz256_mul_montq - jmp .Lmul_mont_done - -.align 32 -.Lmul_montx: - movq %rdx,%rbx - movq 0(%rdx),%rdx - movq 0(%rsi),%r9 - movq 8(%rsi),%r10 - movq 16(%rsi),%r11 - movq 24(%rsi),%r12 - leaq -128(%rsi),%rsi - - call __ecp_nistz256_mul_montx -.Lmul_mont_done: - movq 0(%rsp),%r15 -.cfi_restore %r15 - movq 8(%rsp),%r14 -.cfi_restore %r14 - movq 16(%rsp),%r13 -.cfi_restore %r13 - movq 24(%rsp),%r12 -.cfi_restore %r12 - movq 32(%rsp),%rbx -.cfi_restore %rbx - movq 40(%rsp),%rbp -.cfi_restore %rbp - leaq 48(%rsp),%rsp -.cfi_adjust_cfa_offset -48 -.Lmul_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont - -.type __ecp_nistz256_mul_montq,@function -.align 32 -__ecp_nistz256_mul_montq: -.cfi_startproc - - - movq %rax,%rbp - mulq %r9 - movq .Lpoly+8(%rip),%r14 - movq %rax,%r8 - movq %rbp,%rax - movq %rdx,%r9 - - mulq %r10 - movq .Lpoly+24(%rip),%r15 - addq %rax,%r9 - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%r10 - - mulq %r11 - addq %rax,%r10 - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%r11 - - mulq %r12 - addq %rax,%r11 - movq %r8,%rax - adcq $0,%rdx - xorq %r13,%r13 - movq %rdx,%r12 - - - - - - - - - - - movq %r8,%rbp - shlq $32,%r8 - mulq %r15 - shrq $32,%rbp - addq %r8,%r9 - adcq %rbp,%r10 - adcq %rax,%r11 - movq 8(%rbx),%rax - adcq %rdx,%r12 - adcq $0,%r13 - xorq %r8,%r8 - - - - movq %rax,%rbp - mulq 0(%rsi) - addq %rax,%r9 - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%rcx - - mulq 8(%rsi) - addq %rcx,%r10 - adcq $0,%rdx - addq %rax,%r10 - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%rcx - - mulq 16(%rsi) - addq %rcx,%r11 - adcq $0,%rdx - addq %rax,%r11 - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%rcx - - mulq 24(%rsi) - addq %rcx,%r12 - adcq $0,%rdx - addq %rax,%r12 - movq %r9,%rax - adcq %rdx,%r13 - adcq $0,%r8 - - - - movq %r9,%rbp - shlq $32,%r9 - mulq %r15 - shrq $32,%rbp - addq %r9,%r10 - adcq %rbp,%r11 - adcq %rax,%r12 - movq 16(%rbx),%rax - adcq %rdx,%r13 - adcq $0,%r8 - xorq %r9,%r9 - - - - movq %rax,%rbp - mulq 
0(%rsi) - addq %rax,%r10 - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%rcx - - mulq 8(%rsi) - addq %rcx,%r11 - adcq $0,%rdx - addq %rax,%r11 - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%rcx - - mulq 16(%rsi) - addq %rcx,%r12 - adcq $0,%rdx - addq %rax,%r12 - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%rcx - - mulq 24(%rsi) - addq %rcx,%r13 - adcq $0,%rdx - addq %rax,%r13 - movq %r10,%rax - adcq %rdx,%r8 - adcq $0,%r9 - - - - movq %r10,%rbp - shlq $32,%r10 - mulq %r15 - shrq $32,%rbp - addq %r10,%r11 - adcq %rbp,%r12 - adcq %rax,%r13 - movq 24(%rbx),%rax - adcq %rdx,%r8 - adcq $0,%r9 - xorq %r10,%r10 - - - - movq %rax,%rbp - mulq 0(%rsi) - addq %rax,%r11 - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%rcx - - mulq 8(%rsi) - addq %rcx,%r12 - adcq $0,%rdx - addq %rax,%r12 - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%rcx - - mulq 16(%rsi) - addq %rcx,%r13 - adcq $0,%rdx - addq %rax,%r13 - movq %rbp,%rax - adcq $0,%rdx - movq %rdx,%rcx - - mulq 24(%rsi) - addq %rcx,%r8 - adcq $0,%rdx - addq %rax,%r8 - movq %r11,%rax - adcq %rdx,%r9 - adcq $0,%r10 - - - - movq %r11,%rbp - shlq $32,%r11 - mulq %r15 - shrq $32,%rbp - addq %r11,%r12 - adcq %rbp,%r13 - movq %r12,%rcx - adcq %rax,%r8 - adcq %rdx,%r9 - movq %r13,%rbp - adcq $0,%r10 - - - - subq $-1,%r12 - movq %r8,%rbx - sbbq %r14,%r13 - sbbq $0,%r8 - movq %r9,%rdx - sbbq %r15,%r9 - sbbq $0,%r10 - - cmovcq %rcx,%r12 - cmovcq %rbp,%r13 - movq %r12,0(%rdi) - cmovcq %rbx,%r8 - movq %r13,8(%rdi) - cmovcq %rdx,%r9 - movq %r8,16(%rdi) - movq %r9,24(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc -.size __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq - - - - - - - - -.globl ecp_nistz256_sqr_mont -.hidden ecp_nistz256_sqr_mont -.type ecp_nistz256_sqr_mont,@function -.align 32 -ecp_nistz256_sqr_mont: -.cfi_startproc - leaq OPENSSL_ia32cap_P(%rip),%rcx - movq 8(%rcx),%rcx - andl $0x80100,%ecx - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 
-.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 -.Lsqr_body: - cmpl $0x80100,%ecx - je .Lsqr_montx - movq 0(%rsi),%rax - movq 8(%rsi),%r14 - movq 16(%rsi),%r15 - movq 24(%rsi),%r8 - - call __ecp_nistz256_sqr_montq - jmp .Lsqr_mont_done - -.align 32 -.Lsqr_montx: - movq 0(%rsi),%rdx - movq 8(%rsi),%r14 - movq 16(%rsi),%r15 - movq 24(%rsi),%r8 - leaq -128(%rsi),%rsi - - call __ecp_nistz256_sqr_montx -.Lsqr_mont_done: - movq 0(%rsp),%r15 -.cfi_restore %r15 - movq 8(%rsp),%r14 -.cfi_restore %r14 - movq 16(%rsp),%r13 -.cfi_restore %r13 - movq 24(%rsp),%r12 -.cfi_restore %r12 - movq 32(%rsp),%rbx -.cfi_restore %rbx - movq 40(%rsp),%rbp -.cfi_restore %rbp - leaq 48(%rsp),%rsp -.cfi_adjust_cfa_offset -48 -.Lsqr_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont - -.type __ecp_nistz256_sqr_montq,@function -.align 32 -__ecp_nistz256_sqr_montq: -.cfi_startproc - movq %rax,%r13 - mulq %r14 - movq %rax,%r9 - movq %r15,%rax - movq %rdx,%r10 - - mulq %r13 - addq %rax,%r10 - movq %r8,%rax - adcq $0,%rdx - movq %rdx,%r11 - - mulq %r13 - addq %rax,%r11 - movq %r15,%rax - adcq $0,%rdx - movq %rdx,%r12 - - - mulq %r14 - addq %rax,%r11 - movq %r8,%rax - adcq $0,%rdx - movq %rdx,%rbp - - mulq %r14 - addq %rax,%r12 - movq %r8,%rax - adcq $0,%rdx - addq %rbp,%r12 - movq %rdx,%r13 - adcq $0,%r13 - - - mulq %r15 - xorq %r15,%r15 - addq %rax,%r13 - movq 0(%rsi),%rax - movq %rdx,%r14 - adcq $0,%r14 - - addq %r9,%r9 - adcq %r10,%r10 - adcq %r11,%r11 - adcq %r12,%r12 - adcq %r13,%r13 - adcq %r14,%r14 - adcq $0,%r15 - - mulq %rax - movq %rax,%r8 - movq 8(%rsi),%rax - movq %rdx,%rcx - - mulq %rax - addq %rcx,%r9 - adcq %rax,%r10 - movq 16(%rsi),%rax - adcq $0,%rdx - movq %rdx,%rcx - - mulq %rax - addq %rcx,%r11 - adcq %rax,%r12 - movq 24(%rsi),%rax - adcq $0,%rdx - movq 
%rdx,%rcx - - mulq %rax - addq %rcx,%r13 - adcq %rax,%r14 - movq %r8,%rax - adcq %rdx,%r15 - - movq .Lpoly+8(%rip),%rsi - movq .Lpoly+24(%rip),%rbp - - - - - movq %r8,%rcx - shlq $32,%r8 - mulq %rbp - shrq $32,%rcx - addq %r8,%r9 - adcq %rcx,%r10 - adcq %rax,%r11 - movq %r9,%rax - adcq $0,%rdx - - - - movq %r9,%rcx - shlq $32,%r9 - movq %rdx,%r8 - mulq %rbp - shrq $32,%rcx - addq %r9,%r10 - adcq %rcx,%r11 - adcq %rax,%r8 - movq %r10,%rax - adcq $0,%rdx - - - - movq %r10,%rcx - shlq $32,%r10 - movq %rdx,%r9 - mulq %rbp - shrq $32,%rcx - addq %r10,%r11 - adcq %rcx,%r8 - adcq %rax,%r9 - movq %r11,%rax - adcq $0,%rdx - - - - movq %r11,%rcx - shlq $32,%r11 - movq %rdx,%r10 - mulq %rbp - shrq $32,%rcx - addq %r11,%r8 - adcq %rcx,%r9 - adcq %rax,%r10 - adcq $0,%rdx - xorq %r11,%r11 - - - - addq %r8,%r12 - adcq %r9,%r13 - movq %r12,%r8 - adcq %r10,%r14 - adcq %rdx,%r15 - movq %r13,%r9 - adcq $0,%r11 - - subq $-1,%r12 - movq %r14,%r10 - sbbq %rsi,%r13 - sbbq $0,%r14 - movq %r15,%rcx - sbbq %rbp,%r15 - sbbq $0,%r11 - - cmovcq %r8,%r12 - cmovcq %r9,%r13 - movq %r12,0(%rdi) - cmovcq %r10,%r14 - movq %r13,8(%rdi) - cmovcq %rcx,%r15 - movq %r14,16(%rdi) - movq %r15,24(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc -.size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq -.type __ecp_nistz256_mul_montx,@function -.align 32 -__ecp_nistz256_mul_montx: -.cfi_startproc - - - mulxq %r9,%r8,%r9 - mulxq %r10,%rcx,%r10 - movq $32,%r14 - xorq %r13,%r13 - mulxq %r11,%rbp,%r11 - movq .Lpoly+24(%rip),%r15 - adcq %rcx,%r9 - mulxq %r12,%rcx,%r12 - movq %r8,%rdx - adcq %rbp,%r10 - shlxq %r14,%r8,%rbp - adcq %rcx,%r11 - shrxq %r14,%r8,%rcx - adcq $0,%r12 - - - - addq %rbp,%r9 - adcq %rcx,%r10 - - mulxq %r15,%rcx,%rbp - movq 8(%rbx),%rdx - adcq %rcx,%r11 - adcq %rbp,%r12 - adcq $0,%r13 - xorq %r8,%r8 - - - - mulxq 0+128(%rsi),%rcx,%rbp - adcxq %rcx,%r9 - adoxq %rbp,%r10 - - mulxq 8+128(%rsi),%rcx,%rbp - adcxq %rcx,%r10 - adoxq %rbp,%r11 - - mulxq 16+128(%rsi),%rcx,%rbp - adcxq %rcx,%r11 - adoxq 
%rbp,%r12 - - mulxq 24+128(%rsi),%rcx,%rbp - movq %r9,%rdx - adcxq %rcx,%r12 - shlxq %r14,%r9,%rcx - adoxq %rbp,%r13 - shrxq %r14,%r9,%rbp - - adcxq %r8,%r13 - adoxq %r8,%r8 - adcq $0,%r8 - - - - addq %rcx,%r10 - adcq %rbp,%r11 - - mulxq %r15,%rcx,%rbp - movq 16(%rbx),%rdx - adcq %rcx,%r12 - adcq %rbp,%r13 - adcq $0,%r8 - xorq %r9,%r9 - - - - mulxq 0+128(%rsi),%rcx,%rbp - adcxq %rcx,%r10 - adoxq %rbp,%r11 - - mulxq 8+128(%rsi),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq 16+128(%rsi),%rcx,%rbp - adcxq %rcx,%r12 - adoxq %rbp,%r13 - - mulxq 24+128(%rsi),%rcx,%rbp - movq %r10,%rdx - adcxq %rcx,%r13 - shlxq %r14,%r10,%rcx - adoxq %rbp,%r8 - shrxq %r14,%r10,%rbp - - adcxq %r9,%r8 - adoxq %r9,%r9 - adcq $0,%r9 - - - - addq %rcx,%r11 - adcq %rbp,%r12 - - mulxq %r15,%rcx,%rbp - movq 24(%rbx),%rdx - adcq %rcx,%r13 - adcq %rbp,%r8 - adcq $0,%r9 - xorq %r10,%r10 - - - - mulxq 0+128(%rsi),%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq 8+128(%rsi),%rcx,%rbp - adcxq %rcx,%r12 - adoxq %rbp,%r13 - - mulxq 16+128(%rsi),%rcx,%rbp - adcxq %rcx,%r13 - adoxq %rbp,%r8 - - mulxq 24+128(%rsi),%rcx,%rbp - movq %r11,%rdx - adcxq %rcx,%r8 - shlxq %r14,%r11,%rcx - adoxq %rbp,%r9 - shrxq %r14,%r11,%rbp - - adcxq %r10,%r9 - adoxq %r10,%r10 - adcq $0,%r10 - - - - addq %rcx,%r12 - adcq %rbp,%r13 - - mulxq %r15,%rcx,%rbp - movq %r12,%rbx - movq .Lpoly+8(%rip),%r14 - adcq %rcx,%r8 - movq %r13,%rdx - adcq %rbp,%r9 - adcq $0,%r10 - - - - xorl %eax,%eax - movq %r8,%rcx - sbbq $-1,%r12 - sbbq %r14,%r13 - sbbq $0,%r8 - movq %r9,%rbp - sbbq %r15,%r9 - sbbq $0,%r10 - - cmovcq %rbx,%r12 - cmovcq %rdx,%r13 - movq %r12,0(%rdi) - cmovcq %rcx,%r8 - movq %r13,8(%rdi) - cmovcq %rbp,%r9 - movq %r8,16(%rdi) - movq %r9,24(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc -.size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx - -.type __ecp_nistz256_sqr_montx,@function -.align 32 -__ecp_nistz256_sqr_montx: -.cfi_startproc - mulxq %r14,%r9,%r10 - mulxq %r15,%rcx,%r11 - xorl %eax,%eax - adcq %rcx,%r10 - 
mulxq %r8,%rbp,%r12 - movq %r14,%rdx - adcq %rbp,%r11 - adcq $0,%r12 - xorq %r13,%r13 - - - mulxq %r15,%rcx,%rbp - adcxq %rcx,%r11 - adoxq %rbp,%r12 - - mulxq %r8,%rcx,%rbp - movq %r15,%rdx - adcxq %rcx,%r12 - adoxq %rbp,%r13 - adcq $0,%r13 - - - mulxq %r8,%rcx,%r14 - movq 0+128(%rsi),%rdx - xorq %r15,%r15 - adcxq %r9,%r9 - adoxq %rcx,%r13 - adcxq %r10,%r10 - adoxq %r15,%r14 - - mulxq %rdx,%r8,%rbp - movq 8+128(%rsi),%rdx - adcxq %r11,%r11 - adoxq %rbp,%r9 - adcxq %r12,%r12 - mulxq %rdx,%rcx,%rax - movq 16+128(%rsi),%rdx - adcxq %r13,%r13 - adoxq %rcx,%r10 - adcxq %r14,%r14 -.byte 0x67 - mulxq %rdx,%rcx,%rbp - movq 24+128(%rsi),%rdx - adoxq %rax,%r11 - adcxq %r15,%r15 - adoxq %rcx,%r12 - movq $32,%rsi - adoxq %rbp,%r13 -.byte 0x67,0x67 - mulxq %rdx,%rcx,%rax - movq .Lpoly+24(%rip),%rdx - adoxq %rcx,%r14 - shlxq %rsi,%r8,%rcx - adoxq %rax,%r15 - shrxq %rsi,%r8,%rax - movq %rdx,%rbp - - - addq %rcx,%r9 - adcq %rax,%r10 - - mulxq %r8,%rcx,%r8 - adcq %rcx,%r11 - shlxq %rsi,%r9,%rcx - adcq $0,%r8 - shrxq %rsi,%r9,%rax - - - addq %rcx,%r10 - adcq %rax,%r11 - - mulxq %r9,%rcx,%r9 - adcq %rcx,%r8 - shlxq %rsi,%r10,%rcx - adcq $0,%r9 - shrxq %rsi,%r10,%rax - - - addq %rcx,%r11 - adcq %rax,%r8 - - mulxq %r10,%rcx,%r10 - adcq %rcx,%r9 - shlxq %rsi,%r11,%rcx - adcq $0,%r10 - shrxq %rsi,%r11,%rax - - - addq %rcx,%r8 - adcq %rax,%r9 - - mulxq %r11,%rcx,%r11 - adcq %rcx,%r10 - adcq $0,%r11 - - xorq %rdx,%rdx - addq %r8,%r12 - movq .Lpoly+8(%rip),%rsi - adcq %r9,%r13 - movq %r12,%r8 - adcq %r10,%r14 - adcq %r11,%r15 - movq %r13,%r9 - adcq $0,%rdx - - subq $-1,%r12 - movq %r14,%r10 - sbbq %rsi,%r13 - sbbq $0,%r14 - movq %r15,%r11 - sbbq %rbp,%r15 - sbbq $0,%rdx - - cmovcq %r8,%r12 - cmovcq %r9,%r13 - movq %r12,0(%rdi) - cmovcq %r10,%r14 - movq %r13,8(%rdi) - cmovcq %r11,%r15 - movq %r14,16(%rdi) - movq %r15,24(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc -.size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx - - -.globl ecp_nistz256_select_w5 -.hidden ecp_nistz256_select_w5 -.type 
ecp_nistz256_select_w5,@function -.align 32 -ecp_nistz256_select_w5: -.cfi_startproc - leaq OPENSSL_ia32cap_P(%rip),%rax - movq 8(%rax),%rax - testl $32,%eax - jnz .Lavx2_select_w5 - movdqa .LOne(%rip),%xmm0 - movd %edx,%xmm1 - - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - - movdqa %xmm0,%xmm8 - pshufd $0,%xmm1,%xmm1 - - movq $16,%rax -.Lselect_loop_sse_w5: - - movdqa %xmm8,%xmm15 - paddd %xmm0,%xmm8 - pcmpeqd %xmm1,%xmm15 - - movdqa 0(%rsi),%xmm9 - movdqa 16(%rsi),%xmm10 - movdqa 32(%rsi),%xmm11 - movdqa 48(%rsi),%xmm12 - movdqa 64(%rsi),%xmm13 - movdqa 80(%rsi),%xmm14 - leaq 96(%rsi),%rsi - - pand %xmm15,%xmm9 - pand %xmm15,%xmm10 - por %xmm9,%xmm2 - pand %xmm15,%xmm11 - por %xmm10,%xmm3 - pand %xmm15,%xmm12 - por %xmm11,%xmm4 - pand %xmm15,%xmm13 - por %xmm12,%xmm5 - pand %xmm15,%xmm14 - por %xmm13,%xmm6 - por %xmm14,%xmm7 - - decq %rax - jnz .Lselect_loop_sse_w5 - - movdqu %xmm2,0(%rdi) - movdqu %xmm3,16(%rdi) - movdqu %xmm4,32(%rdi) - movdqu %xmm5,48(%rdi) - movdqu %xmm6,64(%rdi) - movdqu %xmm7,80(%rdi) - .byte 0xf3,0xc3 -.cfi_endproc -.LSEH_end_ecp_nistz256_select_w5: -.size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5 - - - -.globl ecp_nistz256_select_w7 -.hidden ecp_nistz256_select_w7 -.type ecp_nistz256_select_w7,@function -.align 32 -ecp_nistz256_select_w7: -.cfi_startproc - leaq OPENSSL_ia32cap_P(%rip),%rax - movq 8(%rax),%rax - testl $32,%eax - jnz .Lavx2_select_w7 - movdqa .LOne(%rip),%xmm8 - movd %edx,%xmm1 - - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - - movdqa %xmm8,%xmm0 - pshufd $0,%xmm1,%xmm1 - movq $64,%rax - -.Lselect_loop_sse_w7: - movdqa %xmm8,%xmm15 - paddd %xmm0,%xmm8 - movdqa 0(%rsi),%xmm9 - movdqa 16(%rsi),%xmm10 - pcmpeqd %xmm1,%xmm15 - movdqa 32(%rsi),%xmm11 - movdqa 48(%rsi),%xmm12 - leaq 64(%rsi),%rsi - - pand %xmm15,%xmm9 - pand %xmm15,%xmm10 - por %xmm9,%xmm2 - pand %xmm15,%xmm11 - por %xmm10,%xmm3 - pand %xmm15,%xmm12 - por 
%xmm11,%xmm4 - prefetcht0 255(%rsi) - por %xmm12,%xmm5 - - decq %rax - jnz .Lselect_loop_sse_w7 - - movdqu %xmm2,0(%rdi) - movdqu %xmm3,16(%rdi) - movdqu %xmm4,32(%rdi) - movdqu %xmm5,48(%rdi) - .byte 0xf3,0xc3 -.cfi_endproc -.LSEH_end_ecp_nistz256_select_w7: -.size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7 - - -.type ecp_nistz256_avx2_select_w5,@function -.align 32 -ecp_nistz256_avx2_select_w5: -.cfi_startproc -.Lavx2_select_w5: - vzeroupper - vmovdqa .LTwo(%rip),%ymm0 - - vpxor %ymm2,%ymm2,%ymm2 - vpxor %ymm3,%ymm3,%ymm3 - vpxor %ymm4,%ymm4,%ymm4 - - vmovdqa .LOne(%rip),%ymm5 - vmovdqa .LTwo(%rip),%ymm10 - - vmovd %edx,%xmm1 - vpermd %ymm1,%ymm2,%ymm1 - - movq $8,%rax -.Lselect_loop_avx2_w5: - - vmovdqa 0(%rsi),%ymm6 - vmovdqa 32(%rsi),%ymm7 - vmovdqa 64(%rsi),%ymm8 - - vmovdqa 96(%rsi),%ymm11 - vmovdqa 128(%rsi),%ymm12 - vmovdqa 160(%rsi),%ymm13 - - vpcmpeqd %ymm1,%ymm5,%ymm9 - vpcmpeqd %ymm1,%ymm10,%ymm14 - - vpaddd %ymm0,%ymm5,%ymm5 - vpaddd %ymm0,%ymm10,%ymm10 - leaq 192(%rsi),%rsi - - vpand %ymm9,%ymm6,%ymm6 - vpand %ymm9,%ymm7,%ymm7 - vpand %ymm9,%ymm8,%ymm8 - vpand %ymm14,%ymm11,%ymm11 - vpand %ymm14,%ymm12,%ymm12 - vpand %ymm14,%ymm13,%ymm13 - - vpxor %ymm6,%ymm2,%ymm2 - vpxor %ymm7,%ymm3,%ymm3 - vpxor %ymm8,%ymm4,%ymm4 - vpxor %ymm11,%ymm2,%ymm2 - vpxor %ymm12,%ymm3,%ymm3 - vpxor %ymm13,%ymm4,%ymm4 - - decq %rax - jnz .Lselect_loop_avx2_w5 - - vmovdqu %ymm2,0(%rdi) - vmovdqu %ymm3,32(%rdi) - vmovdqu %ymm4,64(%rdi) - vzeroupper - .byte 0xf3,0xc3 -.cfi_endproc -.LSEH_end_ecp_nistz256_avx2_select_w5: -.size ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5 - - - -.globl ecp_nistz256_avx2_select_w7 -.hidden ecp_nistz256_avx2_select_w7 -.type ecp_nistz256_avx2_select_w7,@function -.align 32 -ecp_nistz256_avx2_select_w7: -.cfi_startproc -.Lavx2_select_w7: - vzeroupper - vmovdqa .LThree(%rip),%ymm0 - - vpxor %ymm2,%ymm2,%ymm2 - vpxor %ymm3,%ymm3,%ymm3 - - vmovdqa .LOne(%rip),%ymm4 - vmovdqa .LTwo(%rip),%ymm8 - vmovdqa .LThree(%rip),%ymm12 - - 
vmovd %edx,%xmm1 - vpermd %ymm1,%ymm2,%ymm1 - - - movq $21,%rax -.Lselect_loop_avx2_w7: - - vmovdqa 0(%rsi),%ymm5 - vmovdqa 32(%rsi),%ymm6 - - vmovdqa 64(%rsi),%ymm9 - vmovdqa 96(%rsi),%ymm10 - - vmovdqa 128(%rsi),%ymm13 - vmovdqa 160(%rsi),%ymm14 - - vpcmpeqd %ymm1,%ymm4,%ymm7 - vpcmpeqd %ymm1,%ymm8,%ymm11 - vpcmpeqd %ymm1,%ymm12,%ymm15 - - vpaddd %ymm0,%ymm4,%ymm4 - vpaddd %ymm0,%ymm8,%ymm8 - vpaddd %ymm0,%ymm12,%ymm12 - leaq 192(%rsi),%rsi - - vpand %ymm7,%ymm5,%ymm5 - vpand %ymm7,%ymm6,%ymm6 - vpand %ymm11,%ymm9,%ymm9 - vpand %ymm11,%ymm10,%ymm10 - vpand %ymm15,%ymm13,%ymm13 - vpand %ymm15,%ymm14,%ymm14 - - vpxor %ymm5,%ymm2,%ymm2 - vpxor %ymm6,%ymm3,%ymm3 - vpxor %ymm9,%ymm2,%ymm2 - vpxor %ymm10,%ymm3,%ymm3 - vpxor %ymm13,%ymm2,%ymm2 - vpxor %ymm14,%ymm3,%ymm3 - - decq %rax - jnz .Lselect_loop_avx2_w7 - - - vmovdqa 0(%rsi),%ymm5 - vmovdqa 32(%rsi),%ymm6 - - vpcmpeqd %ymm1,%ymm4,%ymm7 - - vpand %ymm7,%ymm5,%ymm5 - vpand %ymm7,%ymm6,%ymm6 - - vpxor %ymm5,%ymm2,%ymm2 - vpxor %ymm6,%ymm3,%ymm3 - - vmovdqu %ymm2,0(%rdi) - vmovdqu %ymm3,32(%rdi) - vzeroupper - .byte 0xf3,0xc3 -.cfi_endproc -.LSEH_end_ecp_nistz256_avx2_select_w7: -.size ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7 -.type __ecp_nistz256_add_toq,@function -.align 32 -__ecp_nistz256_add_toq: -.cfi_startproc - xorq %r11,%r11 - addq 0(%rbx),%r12 - adcq 8(%rbx),%r13 - movq %r12,%rax - adcq 16(%rbx),%r8 - adcq 24(%rbx),%r9 - movq %r13,%rbp - adcq $0,%r11 - - subq $-1,%r12 - movq %r8,%rcx - sbbq %r14,%r13 - sbbq $0,%r8 - movq %r9,%r10 - sbbq %r15,%r9 - sbbq $0,%r11 - - cmovcq %rax,%r12 - cmovcq %rbp,%r13 - movq %r12,0(%rdi) - cmovcq %rcx,%r8 - movq %r13,8(%rdi) - cmovcq %r10,%r9 - movq %r8,16(%rdi) - movq %r9,24(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc -.size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq - -.type __ecp_nistz256_sub_fromq,@function -.align 32 -__ecp_nistz256_sub_fromq: -.cfi_startproc - subq 0(%rbx),%r12 - sbbq 8(%rbx),%r13 - movq %r12,%rax - sbbq 16(%rbx),%r8 - sbbq 
24(%rbx),%r9 - movq %r13,%rbp - sbbq %r11,%r11 - - addq $-1,%r12 - movq %r8,%rcx - adcq %r14,%r13 - adcq $0,%r8 - movq %r9,%r10 - adcq %r15,%r9 - testq %r11,%r11 - - cmovzq %rax,%r12 - cmovzq %rbp,%r13 - movq %r12,0(%rdi) - cmovzq %rcx,%r8 - movq %r13,8(%rdi) - cmovzq %r10,%r9 - movq %r8,16(%rdi) - movq %r9,24(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc -.size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq - -.type __ecp_nistz256_subq,@function -.align 32 -__ecp_nistz256_subq: -.cfi_startproc - subq %r12,%rax - sbbq %r13,%rbp - movq %rax,%r12 - sbbq %r8,%rcx - sbbq %r9,%r10 - movq %rbp,%r13 - sbbq %r11,%r11 - - addq $-1,%rax - movq %rcx,%r8 - adcq %r14,%rbp - adcq $0,%rcx - movq %r10,%r9 - adcq %r15,%r10 - testq %r11,%r11 - - cmovnzq %rax,%r12 - cmovnzq %rbp,%r13 - cmovnzq %rcx,%r8 - cmovnzq %r10,%r9 - - .byte 0xf3,0xc3 -.cfi_endproc -.size __ecp_nistz256_subq,.-__ecp_nistz256_subq - -.type __ecp_nistz256_mul_by_2q,@function -.align 32 -__ecp_nistz256_mul_by_2q: -.cfi_startproc - xorq %r11,%r11 - addq %r12,%r12 - adcq %r13,%r13 - movq %r12,%rax - adcq %r8,%r8 - adcq %r9,%r9 - movq %r13,%rbp - adcq $0,%r11 - - subq $-1,%r12 - movq %r8,%rcx - sbbq %r14,%r13 - sbbq $0,%r8 - movq %r9,%r10 - sbbq %r15,%r9 - sbbq $0,%r11 - - cmovcq %rax,%r12 - cmovcq %rbp,%r13 - movq %r12,0(%rdi) - cmovcq %rcx,%r8 - movq %r13,8(%rdi) - cmovcq %r10,%r9 - movq %r8,16(%rdi) - movq %r9,24(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc -.size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q -.globl ecp_nistz256_point_double -.hidden ecp_nistz256_point_double -.type ecp_nistz256_point_double,@function -.align 32 -ecp_nistz256_point_double: -.cfi_startproc - leaq OPENSSL_ia32cap_P(%rip),%rcx - movq 8(%rcx),%rcx - andl $0x80100,%ecx - cmpl $0x80100,%ecx - je .Lpoint_doublex - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 
-.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $160+8,%rsp -.cfi_adjust_cfa_offset 32*5+8 -.Lpoint_doubleq_body: - -.Lpoint_double_shortcutq: - movdqu 0(%rsi),%xmm0 - movq %rsi,%rbx - movdqu 16(%rsi),%xmm1 - movq 32+0(%rsi),%r12 - movq 32+8(%rsi),%r13 - movq 32+16(%rsi),%r8 - movq 32+24(%rsi),%r9 - movq .Lpoly+8(%rip),%r14 - movq .Lpoly+24(%rip),%r15 - movdqa %xmm0,96(%rsp) - movdqa %xmm1,96+16(%rsp) - leaq 32(%rdi),%r10 - leaq 64(%rdi),%r11 -.byte 102,72,15,110,199 -.byte 102,73,15,110,202 -.byte 102,73,15,110,211 - - leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_by_2q - - movq 64+0(%rsi),%rax - movq 64+8(%rsi),%r14 - movq 64+16(%rsi),%r15 - movq 64+24(%rsi),%r8 - leaq 64-0(%rsi),%rsi - leaq 64(%rsp),%rdi - call __ecp_nistz256_sqr_montq - - movq 0+0(%rsp),%rax - movq 8+0(%rsp),%r14 - leaq 0+0(%rsp),%rsi - movq 16+0(%rsp),%r15 - movq 24+0(%rsp),%r8 - leaq 0(%rsp),%rdi - call __ecp_nistz256_sqr_montq - - movq 32(%rbx),%rax - movq 64+0(%rbx),%r9 - movq 64+8(%rbx),%r10 - movq 64+16(%rbx),%r11 - movq 64+24(%rbx),%r12 - leaq 64-0(%rbx),%rsi - leaq 32(%rbx),%rbx -.byte 102,72,15,126,215 - call __ecp_nistz256_mul_montq - call __ecp_nistz256_mul_by_2q - - movq 96+0(%rsp),%r12 - movq 96+8(%rsp),%r13 - leaq 64(%rsp),%rbx - movq 96+16(%rsp),%r8 - movq 96+24(%rsp),%r9 - leaq 32(%rsp),%rdi - call __ecp_nistz256_add_toq - - movq 96+0(%rsp),%r12 - movq 96+8(%rsp),%r13 - leaq 64(%rsp),%rbx - movq 96+16(%rsp),%r8 - movq 96+24(%rsp),%r9 - leaq 64(%rsp),%rdi - call __ecp_nistz256_sub_fromq - - movq 0+0(%rsp),%rax - movq 8+0(%rsp),%r14 - leaq 0+0(%rsp),%rsi - movq 16+0(%rsp),%r15 - movq 24+0(%rsp),%r8 -.byte 102,72,15,126,207 - call __ecp_nistz256_sqr_montq - xorq %r9,%r9 - movq %r12,%rax - addq $-1,%r12 - movq %r13,%r10 - adcq %rsi,%r13 - movq %r14,%rcx - adcq $0,%r14 - movq %r15,%r8 - adcq %rbp,%r15 - adcq $0,%r9 - xorq %rsi,%rsi - testq $1,%rax - - cmovzq %rax,%r12 - cmovzq 
%r10,%r13 - cmovzq %rcx,%r14 - cmovzq %r8,%r15 - cmovzq %rsi,%r9 - - movq %r13,%rax - shrq $1,%r12 - shlq $63,%rax - movq %r14,%r10 - shrq $1,%r13 - orq %rax,%r12 - shlq $63,%r10 - movq %r15,%rcx - shrq $1,%r14 - orq %r10,%r13 - shlq $63,%rcx - movq %r12,0(%rdi) - shrq $1,%r15 - movq %r13,8(%rdi) - shlq $63,%r9 - orq %rcx,%r14 - orq %r9,%r15 - movq %r14,16(%rdi) - movq %r15,24(%rdi) - movq 64(%rsp),%rax - leaq 64(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq 0+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 32(%rsp),%rdi - call __ecp_nistz256_mul_montq - - leaq 128(%rsp),%rdi - call __ecp_nistz256_mul_by_2q - - leaq 32(%rsp),%rbx - leaq 32(%rsp),%rdi - call __ecp_nistz256_add_toq - - movq 96(%rsp),%rax - leaq 96(%rsp),%rbx - movq 0+0(%rsp),%r9 - movq 8+0(%rsp),%r10 - leaq 0+0(%rsp),%rsi - movq 16+0(%rsp),%r11 - movq 24+0(%rsp),%r12 - leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_montq - - leaq 128(%rsp),%rdi - call __ecp_nistz256_mul_by_2q - - movq 0+32(%rsp),%rax - movq 8+32(%rsp),%r14 - leaq 0+32(%rsp),%rsi - movq 16+32(%rsp),%r15 - movq 24+32(%rsp),%r8 -.byte 102,72,15,126,199 - call __ecp_nistz256_sqr_montq - - leaq 128(%rsp),%rbx - movq %r14,%r8 - movq %r15,%r9 - movq %rsi,%r14 - movq %rbp,%r15 - call __ecp_nistz256_sub_fromq - - movq 0+0(%rsp),%rax - movq 0+8(%rsp),%rbp - movq 0+16(%rsp),%rcx - movq 0+24(%rsp),%r10 - leaq 0(%rsp),%rdi - call __ecp_nistz256_subq - - movq 32(%rsp),%rax - leaq 32(%rsp),%rbx - movq %r12,%r14 - xorl %ecx,%ecx - movq %r12,0+0(%rsp) - movq %r13,%r10 - movq %r13,0+8(%rsp) - cmovzq %r8,%r11 - movq %r8,0+16(%rsp) - leaq 0-0(%rsp),%rsi - cmovzq %r9,%r12 - movq %r9,0+24(%rsp) - movq %r14,%r9 - leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_montq - -.byte 102,72,15,126,203 -.byte 102,72,15,126,207 - call __ecp_nistz256_sub_fromq - - leaq 160+56(%rsp),%rsi -.cfi_def_cfa %rsi,8 - movq -48(%rsi),%r15 -.cfi_restore %r15 - movq -40(%rsi),%r14 -.cfi_restore %r14 - movq -32(%rsi),%r13 -.cfi_restore %r13 - movq 
-24(%rsi),%r12 -.cfi_restore %r12 - movq -16(%rsi),%rbx -.cfi_restore %rbx - movq -8(%rsi),%rbp -.cfi_restore %rbp - leaq (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Lpoint_doubleq_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size ecp_nistz256_point_double,.-ecp_nistz256_point_double -.globl ecp_nistz256_point_add -.hidden ecp_nistz256_point_add -.type ecp_nistz256_point_add,@function -.align 32 -ecp_nistz256_point_add: -.cfi_startproc - leaq OPENSSL_ia32cap_P(%rip),%rcx - movq 8(%rcx),%rcx - andl $0x80100,%ecx - cmpl $0x80100,%ecx - je .Lpoint_addx - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $576+8,%rsp -.cfi_adjust_cfa_offset 32*18+8 -.Lpoint_addq_body: - - movdqu 0(%rsi),%xmm0 - movdqu 16(%rsi),%xmm1 - movdqu 32(%rsi),%xmm2 - movdqu 48(%rsi),%xmm3 - movdqu 64(%rsi),%xmm4 - movdqu 80(%rsi),%xmm5 - movq %rsi,%rbx - movq %rdx,%rsi - movdqa %xmm0,384(%rsp) - movdqa %xmm1,384+16(%rsp) - movdqa %xmm2,416(%rsp) - movdqa %xmm3,416+16(%rsp) - movdqa %xmm4,448(%rsp) - movdqa %xmm5,448+16(%rsp) - por %xmm4,%xmm5 - - movdqu 0(%rsi),%xmm0 - pshufd $0xb1,%xmm5,%xmm3 - movdqu 16(%rsi),%xmm1 - movdqu 32(%rsi),%xmm2 - por %xmm3,%xmm5 - movdqu 48(%rsi),%xmm3 - movq 64+0(%rsi),%rax - movq 64+8(%rsi),%r14 - movq 64+16(%rsi),%r15 - movq 64+24(%rsi),%r8 - movdqa %xmm0,480(%rsp) - pshufd $0x1e,%xmm5,%xmm4 - movdqa %xmm1,480+16(%rsp) - movdqu 64(%rsi),%xmm0 - movdqu 80(%rsi),%xmm1 - movdqa %xmm2,512(%rsp) - movdqa %xmm3,512+16(%rsp) - por %xmm4,%xmm5 - pxor %xmm4,%xmm4 - por %xmm0,%xmm1 -.byte 102,72,15,110,199 - - leaq 64-0(%rsi),%rsi - movq %rax,544+0(%rsp) - movq %r14,544+8(%rsp) - movq %r15,544+16(%rsp) - movq %r8,544+24(%rsp) - leaq 96(%rsp),%rdi - call 
__ecp_nistz256_sqr_montq - - pcmpeqd %xmm4,%xmm5 - pshufd $0xb1,%xmm1,%xmm4 - por %xmm1,%xmm4 - pshufd $0,%xmm5,%xmm5 - pshufd $0x1e,%xmm4,%xmm3 - por %xmm3,%xmm4 - pxor %xmm3,%xmm3 - pcmpeqd %xmm3,%xmm4 - pshufd $0,%xmm4,%xmm4 - movq 64+0(%rbx),%rax - movq 64+8(%rbx),%r14 - movq 64+16(%rbx),%r15 - movq 64+24(%rbx),%r8 -.byte 102,72,15,110,203 - - leaq 64-0(%rbx),%rsi - leaq 32(%rsp),%rdi - call __ecp_nistz256_sqr_montq - - movq 544(%rsp),%rax - leaq 544(%rsp),%rbx - movq 0+96(%rsp),%r9 - movq 8+96(%rsp),%r10 - leaq 0+96(%rsp),%rsi - movq 16+96(%rsp),%r11 - movq 24+96(%rsp),%r12 - leaq 224(%rsp),%rdi - call __ecp_nistz256_mul_montq - - movq 448(%rsp),%rax - leaq 448(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq 0+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 256(%rsp),%rdi - call __ecp_nistz256_mul_montq - - movq 416(%rsp),%rax - leaq 416(%rsp),%rbx - movq 0+224(%rsp),%r9 - movq 8+224(%rsp),%r10 - leaq 0+224(%rsp),%rsi - movq 16+224(%rsp),%r11 - movq 24+224(%rsp),%r12 - leaq 224(%rsp),%rdi - call __ecp_nistz256_mul_montq - - movq 512(%rsp),%rax - leaq 512(%rsp),%rbx - movq 0+256(%rsp),%r9 - movq 8+256(%rsp),%r10 - leaq 0+256(%rsp),%rsi - movq 16+256(%rsp),%r11 - movq 24+256(%rsp),%r12 - leaq 256(%rsp),%rdi - call __ecp_nistz256_mul_montq - - leaq 224(%rsp),%rbx - leaq 64(%rsp),%rdi - call __ecp_nistz256_sub_fromq - - orq %r13,%r12 - movdqa %xmm4,%xmm2 - orq %r8,%r12 - orq %r9,%r12 - por %xmm5,%xmm2 -.byte 102,73,15,110,220 - - movq 384(%rsp),%rax - leaq 384(%rsp),%rbx - movq 0+96(%rsp),%r9 - movq 8+96(%rsp),%r10 - leaq 0+96(%rsp),%rsi - movq 16+96(%rsp),%r11 - movq 24+96(%rsp),%r12 - leaq 160(%rsp),%rdi - call __ecp_nistz256_mul_montq - - movq 480(%rsp),%rax - leaq 480(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq 0+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 192(%rsp),%rdi - call __ecp_nistz256_mul_montq - - leaq 160(%rsp),%rbx - leaq 0(%rsp),%rdi - call __ecp_nistz256_sub_fromq - - 
orq %r13,%r12 - orq %r8,%r12 - orq %r9,%r12 - -.byte 102,73,15,126,208 -.byte 102,73,15,126,217 - orq %r8,%r12 -.byte 0x3e - jnz .Ladd_proceedq - - - - testq %r9,%r9 - jz .Ladd_doubleq - - - - - - -.byte 102,72,15,126,199 - pxor %xmm0,%xmm0 - movdqu %xmm0,0(%rdi) - movdqu %xmm0,16(%rdi) - movdqu %xmm0,32(%rdi) - movdqu %xmm0,48(%rdi) - movdqu %xmm0,64(%rdi) - movdqu %xmm0,80(%rdi) - jmp .Ladd_doneq - -.align 32 -.Ladd_doubleq: -.byte 102,72,15,126,206 -.byte 102,72,15,126,199 - addq $416,%rsp -.cfi_adjust_cfa_offset -416 - jmp .Lpoint_double_shortcutq -.cfi_adjust_cfa_offset 416 - -.align 32 -.Ladd_proceedq: - movq 0+64(%rsp),%rax - movq 8+64(%rsp),%r14 - leaq 0+64(%rsp),%rsi - movq 16+64(%rsp),%r15 - movq 24+64(%rsp),%r8 - leaq 96(%rsp),%rdi - call __ecp_nistz256_sqr_montq - - movq 448(%rsp),%rax - leaq 448(%rsp),%rbx - movq 0+0(%rsp),%r9 - movq 8+0(%rsp),%r10 - leaq 0+0(%rsp),%rsi - movq 16+0(%rsp),%r11 - movq 24+0(%rsp),%r12 - leaq 352(%rsp),%rdi - call __ecp_nistz256_mul_montq - - movq 0+0(%rsp),%rax - movq 8+0(%rsp),%r14 - leaq 0+0(%rsp),%rsi - movq 16+0(%rsp),%r15 - movq 24+0(%rsp),%r8 - leaq 32(%rsp),%rdi - call __ecp_nistz256_sqr_montq - - movq 544(%rsp),%rax - leaq 544(%rsp),%rbx - movq 0+352(%rsp),%r9 - movq 8+352(%rsp),%r10 - leaq 0+352(%rsp),%rsi - movq 16+352(%rsp),%r11 - movq 24+352(%rsp),%r12 - leaq 352(%rsp),%rdi - call __ecp_nistz256_mul_montq - - movq 0(%rsp),%rax - leaq 0(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq 0+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 128(%rsp),%rdi - call __ecp_nistz256_mul_montq - - movq 160(%rsp),%rax - leaq 160(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq 0+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 192(%rsp),%rdi - call __ecp_nistz256_mul_montq - - - - - xorq %r11,%r11 - addq %r12,%r12 - leaq 96(%rsp),%rsi - adcq %r13,%r13 - movq %r12,%rax - adcq %r8,%r8 - adcq %r9,%r9 - movq %r13,%rbp - adcq $0,%r11 - - subq $-1,%r12 - movq 
%r8,%rcx - sbbq %r14,%r13 - sbbq $0,%r8 - movq %r9,%r10 - sbbq %r15,%r9 - sbbq $0,%r11 - - cmovcq %rax,%r12 - movq 0(%rsi),%rax - cmovcq %rbp,%r13 - movq 8(%rsi),%rbp - cmovcq %rcx,%r8 - movq 16(%rsi),%rcx - cmovcq %r10,%r9 - movq 24(%rsi),%r10 - - call __ecp_nistz256_subq - - leaq 128(%rsp),%rbx - leaq 288(%rsp),%rdi - call __ecp_nistz256_sub_fromq - - movq 192+0(%rsp),%rax - movq 192+8(%rsp),%rbp - movq 192+16(%rsp),%rcx - movq 192+24(%rsp),%r10 - leaq 320(%rsp),%rdi - - call __ecp_nistz256_subq - - movq %r12,0(%rdi) - movq %r13,8(%rdi) - movq %r8,16(%rdi) - movq %r9,24(%rdi) - movq 128(%rsp),%rax - leaq 128(%rsp),%rbx - movq 0+224(%rsp),%r9 - movq 8+224(%rsp),%r10 - leaq 0+224(%rsp),%rsi - movq 16+224(%rsp),%r11 - movq 24+224(%rsp),%r12 - leaq 256(%rsp),%rdi - call __ecp_nistz256_mul_montq - - movq 320(%rsp),%rax - leaq 320(%rsp),%rbx - movq 0+64(%rsp),%r9 - movq 8+64(%rsp),%r10 - leaq 0+64(%rsp),%rsi - movq 16+64(%rsp),%r11 - movq 24+64(%rsp),%r12 - leaq 320(%rsp),%rdi - call __ecp_nistz256_mul_montq - - leaq 256(%rsp),%rbx - leaq 320(%rsp),%rdi - call __ecp_nistz256_sub_fromq - -.byte 102,72,15,126,199 - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 352(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 352+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand 544(%rsp),%xmm2 - pand 544+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 448(%rsp),%xmm2 - pand 448+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,64(%rdi) - movdqu %xmm3,80(%rdi) - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 288(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 288+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand 480(%rsp),%xmm2 - pand 480+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 384(%rsp),%xmm2 - pand 
384+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,0(%rdi) - movdqu %xmm3,16(%rdi) - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 320(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 320+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand 512(%rsp),%xmm2 - pand 512+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 416(%rsp),%xmm2 - pand 416+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,32(%rdi) - movdqu %xmm3,48(%rdi) - -.Ladd_doneq: - leaq 576+56(%rsp),%rsi -.cfi_def_cfa %rsi,8 - movq -48(%rsi),%r15 -.cfi_restore %r15 - movq -40(%rsi),%r14 -.cfi_restore %r14 - movq -32(%rsi),%r13 -.cfi_restore %r13 - movq -24(%rsi),%r12 -.cfi_restore %r12 - movq -16(%rsi),%rbx -.cfi_restore %rbx - movq -8(%rsi),%rbp -.cfi_restore %rbp - leaq (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Lpoint_addq_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size ecp_nistz256_point_add,.-ecp_nistz256_point_add -.globl ecp_nistz256_point_add_affine -.hidden ecp_nistz256_point_add_affine -.type ecp_nistz256_point_add_affine,@function -.align 32 -ecp_nistz256_point_add_affine: -.cfi_startproc - leaq OPENSSL_ia32cap_P(%rip),%rcx - movq 8(%rcx),%rcx - andl $0x80100,%ecx - cmpl $0x80100,%ecx - je .Lpoint_add_affinex - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $480+8,%rsp -.cfi_adjust_cfa_offset 32*15+8 -.Ladd_affineq_body: - - movdqu 0(%rsi),%xmm0 - movq %rdx,%rbx - movdqu 16(%rsi),%xmm1 - movdqu 32(%rsi),%xmm2 - movdqu 48(%rsi),%xmm3 - movdqu 64(%rsi),%xmm4 - movdqu 80(%rsi),%xmm5 - movq 64+0(%rsi),%rax - movq 64+8(%rsi),%r14 
- movq 64+16(%rsi),%r15 - movq 64+24(%rsi),%r8 - movdqa %xmm0,320(%rsp) - movdqa %xmm1,320+16(%rsp) - movdqa %xmm2,352(%rsp) - movdqa %xmm3,352+16(%rsp) - movdqa %xmm4,384(%rsp) - movdqa %xmm5,384+16(%rsp) - por %xmm4,%xmm5 - - movdqu 0(%rbx),%xmm0 - pshufd $0xb1,%xmm5,%xmm3 - movdqu 16(%rbx),%xmm1 - movdqu 32(%rbx),%xmm2 - por %xmm3,%xmm5 - movdqu 48(%rbx),%xmm3 - movdqa %xmm0,416(%rsp) - pshufd $0x1e,%xmm5,%xmm4 - movdqa %xmm1,416+16(%rsp) - por %xmm0,%xmm1 -.byte 102,72,15,110,199 - movdqa %xmm2,448(%rsp) - movdqa %xmm3,448+16(%rsp) - por %xmm2,%xmm3 - por %xmm4,%xmm5 - pxor %xmm4,%xmm4 - por %xmm1,%xmm3 - - leaq 64-0(%rsi),%rsi - leaq 32(%rsp),%rdi - call __ecp_nistz256_sqr_montq - - pcmpeqd %xmm4,%xmm5 - pshufd $0xb1,%xmm3,%xmm4 - movq 0(%rbx),%rax - - movq %r12,%r9 - por %xmm3,%xmm4 - pshufd $0,%xmm5,%xmm5 - pshufd $0x1e,%xmm4,%xmm3 - movq %r13,%r10 - por %xmm3,%xmm4 - pxor %xmm3,%xmm3 - movq %r14,%r11 - pcmpeqd %xmm3,%xmm4 - pshufd $0,%xmm4,%xmm4 - - leaq 32-0(%rsp),%rsi - movq %r15,%r12 - leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_montq - - leaq 320(%rsp),%rbx - leaq 64(%rsp),%rdi - call __ecp_nistz256_sub_fromq - - movq 384(%rsp),%rax - leaq 384(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq 0+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 32(%rsp),%rdi - call __ecp_nistz256_mul_montq - - movq 384(%rsp),%rax - leaq 384(%rsp),%rbx - movq 0+64(%rsp),%r9 - movq 8+64(%rsp),%r10 - leaq 0+64(%rsp),%rsi - movq 16+64(%rsp),%r11 - movq 24+64(%rsp),%r12 - leaq 288(%rsp),%rdi - call __ecp_nistz256_mul_montq - - movq 448(%rsp),%rax - leaq 448(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq 0+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 32(%rsp),%rdi - call __ecp_nistz256_mul_montq - - leaq 352(%rsp),%rbx - leaq 96(%rsp),%rdi - call __ecp_nistz256_sub_fromq - - movq 0+64(%rsp),%rax - movq 8+64(%rsp),%r14 - leaq 0+64(%rsp),%rsi - movq 16+64(%rsp),%r15 - movq 24+64(%rsp),%r8 - leaq 
128(%rsp),%rdi - call __ecp_nistz256_sqr_montq - - movq 0+96(%rsp),%rax - movq 8+96(%rsp),%r14 - leaq 0+96(%rsp),%rsi - movq 16+96(%rsp),%r15 - movq 24+96(%rsp),%r8 - leaq 192(%rsp),%rdi - call __ecp_nistz256_sqr_montq - - movq 128(%rsp),%rax - leaq 128(%rsp),%rbx - movq 0+64(%rsp),%r9 - movq 8+64(%rsp),%r10 - leaq 0+64(%rsp),%rsi - movq 16+64(%rsp),%r11 - movq 24+64(%rsp),%r12 - leaq 160(%rsp),%rdi - call __ecp_nistz256_mul_montq - - movq 320(%rsp),%rax - leaq 320(%rsp),%rbx - movq 0+128(%rsp),%r9 - movq 8+128(%rsp),%r10 - leaq 0+128(%rsp),%rsi - movq 16+128(%rsp),%r11 - movq 24+128(%rsp),%r12 - leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_montq - - - - - xorq %r11,%r11 - addq %r12,%r12 - leaq 192(%rsp),%rsi - adcq %r13,%r13 - movq %r12,%rax - adcq %r8,%r8 - adcq %r9,%r9 - movq %r13,%rbp - adcq $0,%r11 - - subq $-1,%r12 - movq %r8,%rcx - sbbq %r14,%r13 - sbbq $0,%r8 - movq %r9,%r10 - sbbq %r15,%r9 - sbbq $0,%r11 - - cmovcq %rax,%r12 - movq 0(%rsi),%rax - cmovcq %rbp,%r13 - movq 8(%rsi),%rbp - cmovcq %rcx,%r8 - movq 16(%rsi),%rcx - cmovcq %r10,%r9 - movq 24(%rsi),%r10 - - call __ecp_nistz256_subq - - leaq 160(%rsp),%rbx - leaq 224(%rsp),%rdi - call __ecp_nistz256_sub_fromq - - movq 0+0(%rsp),%rax - movq 0+8(%rsp),%rbp - movq 0+16(%rsp),%rcx - movq 0+24(%rsp),%r10 - leaq 64(%rsp),%rdi - - call __ecp_nistz256_subq - - movq %r12,0(%rdi) - movq %r13,8(%rdi) - movq %r8,16(%rdi) - movq %r9,24(%rdi) - movq 352(%rsp),%rax - leaq 352(%rsp),%rbx - movq 0+160(%rsp),%r9 - movq 8+160(%rsp),%r10 - leaq 0+160(%rsp),%rsi - movq 16+160(%rsp),%r11 - movq 24+160(%rsp),%r12 - leaq 32(%rsp),%rdi - call __ecp_nistz256_mul_montq - - movq 96(%rsp),%rax - leaq 96(%rsp),%rbx - movq 0+64(%rsp),%r9 - movq 8+64(%rsp),%r10 - leaq 0+64(%rsp),%rsi - movq 16+64(%rsp),%r11 - movq 24+64(%rsp),%r12 - leaq 64(%rsp),%rdi - call __ecp_nistz256_mul_montq - - leaq 32(%rsp),%rbx - leaq 256(%rsp),%rdi - call __ecp_nistz256_sub_fromq - -.byte 102,72,15,126,199 - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - 
pandn 288(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 288+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand .LONE_mont(%rip),%xmm2 - pand .LONE_mont+16(%rip),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 384(%rsp),%xmm2 - pand 384+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,64(%rdi) - movdqu %xmm3,80(%rdi) - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 224(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 224+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand 416(%rsp),%xmm2 - pand 416+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 320(%rsp),%xmm2 - pand 320+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,0(%rdi) - movdqu %xmm3,16(%rdi) - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 256(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 256+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand 448(%rsp),%xmm2 - pand 448+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 352(%rsp),%xmm2 - pand 352+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,32(%rdi) - movdqu %xmm3,48(%rdi) - - leaq 480+56(%rsp),%rsi -.cfi_def_cfa %rsi,8 - movq -48(%rsi),%r15 -.cfi_restore %r15 - movq -40(%rsi),%r14 -.cfi_restore %r14 - movq -32(%rsi),%r13 -.cfi_restore %r13 - movq -24(%rsi),%r12 -.cfi_restore %r12 - movq -16(%rsi),%rbx -.cfi_restore %rbx - movq -8(%rsi),%rbp -.cfi_restore %rbp - leaq (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Ladd_affineq_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size ecp_nistz256_point_add_affine,.-ecp_nistz256_point_add_affine -.type __ecp_nistz256_add_tox,@function -.align 32 -__ecp_nistz256_add_tox: -.cfi_startproc - xorq %r11,%r11 - adcq 
0(%rbx),%r12 - adcq 8(%rbx),%r13 - movq %r12,%rax - adcq 16(%rbx),%r8 - adcq 24(%rbx),%r9 - movq %r13,%rbp - adcq $0,%r11 - - xorq %r10,%r10 - sbbq $-1,%r12 - movq %r8,%rcx - sbbq %r14,%r13 - sbbq $0,%r8 - movq %r9,%r10 - sbbq %r15,%r9 - sbbq $0,%r11 - - cmovcq %rax,%r12 - cmovcq %rbp,%r13 - movq %r12,0(%rdi) - cmovcq %rcx,%r8 - movq %r13,8(%rdi) - cmovcq %r10,%r9 - movq %r8,16(%rdi) - movq %r9,24(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc -.size __ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox - -.type __ecp_nistz256_sub_fromx,@function -.align 32 -__ecp_nistz256_sub_fromx: -.cfi_startproc - xorq %r11,%r11 - sbbq 0(%rbx),%r12 - sbbq 8(%rbx),%r13 - movq %r12,%rax - sbbq 16(%rbx),%r8 - sbbq 24(%rbx),%r9 - movq %r13,%rbp - sbbq $0,%r11 - - xorq %r10,%r10 - adcq $-1,%r12 - movq %r8,%rcx - adcq %r14,%r13 - adcq $0,%r8 - movq %r9,%r10 - adcq %r15,%r9 - - btq $0,%r11 - cmovncq %rax,%r12 - cmovncq %rbp,%r13 - movq %r12,0(%rdi) - cmovncq %rcx,%r8 - movq %r13,8(%rdi) - cmovncq %r10,%r9 - movq %r8,16(%rdi) - movq %r9,24(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc -.size __ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx - -.type __ecp_nistz256_subx,@function -.align 32 -__ecp_nistz256_subx: -.cfi_startproc - xorq %r11,%r11 - sbbq %r12,%rax - sbbq %r13,%rbp - movq %rax,%r12 - sbbq %r8,%rcx - sbbq %r9,%r10 - movq %rbp,%r13 - sbbq $0,%r11 - - xorq %r9,%r9 - adcq $-1,%rax - movq %rcx,%r8 - adcq %r14,%rbp - adcq $0,%rcx - movq %r10,%r9 - adcq %r15,%r10 - - btq $0,%r11 - cmovcq %rax,%r12 - cmovcq %rbp,%r13 - cmovcq %rcx,%r8 - cmovcq %r10,%r9 - - .byte 0xf3,0xc3 -.cfi_endproc -.size __ecp_nistz256_subx,.-__ecp_nistz256_subx - -.type __ecp_nistz256_mul_by_2x,@function -.align 32 -__ecp_nistz256_mul_by_2x: -.cfi_startproc - xorq %r11,%r11 - adcq %r12,%r12 - adcq %r13,%r13 - movq %r12,%rax - adcq %r8,%r8 - adcq %r9,%r9 - movq %r13,%rbp - adcq $0,%r11 - - xorq %r10,%r10 - sbbq $-1,%r12 - movq %r8,%rcx - sbbq %r14,%r13 - sbbq $0,%r8 - movq %r9,%r10 - sbbq %r15,%r9 - sbbq $0,%r11 - - cmovcq 
%rax,%r12 - cmovcq %rbp,%r13 - movq %r12,0(%rdi) - cmovcq %rcx,%r8 - movq %r13,8(%rdi) - cmovcq %r10,%r9 - movq %r8,16(%rdi) - movq %r9,24(%rdi) - - .byte 0xf3,0xc3 -.cfi_endproc -.size __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x -.type ecp_nistz256_point_doublex,@function -.align 32 -ecp_nistz256_point_doublex: -.cfi_startproc -.Lpoint_doublex: - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $160+8,%rsp -.cfi_adjust_cfa_offset 32*5+8 -.Lpoint_doublex_body: - -.Lpoint_double_shortcutx: - movdqu 0(%rsi),%xmm0 - movq %rsi,%rbx - movdqu 16(%rsi),%xmm1 - movq 32+0(%rsi),%r12 - movq 32+8(%rsi),%r13 - movq 32+16(%rsi),%r8 - movq 32+24(%rsi),%r9 - movq .Lpoly+8(%rip),%r14 - movq .Lpoly+24(%rip),%r15 - movdqa %xmm0,96(%rsp) - movdqa %xmm1,96+16(%rsp) - leaq 32(%rdi),%r10 - leaq 64(%rdi),%r11 -.byte 102,72,15,110,199 -.byte 102,73,15,110,202 -.byte 102,73,15,110,211 - - leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_by_2x - - movq 64+0(%rsi),%rdx - movq 64+8(%rsi),%r14 - movq 64+16(%rsi),%r15 - movq 64+24(%rsi),%r8 - leaq 64-128(%rsi),%rsi - leaq 64(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - movq 0+0(%rsp),%rdx - movq 8+0(%rsp),%r14 - leaq -128+0(%rsp),%rsi - movq 16+0(%rsp),%r15 - movq 24+0(%rsp),%r8 - leaq 0(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - movq 32(%rbx),%rdx - movq 64+0(%rbx),%r9 - movq 64+8(%rbx),%r10 - movq 64+16(%rbx),%r11 - movq 64+24(%rbx),%r12 - leaq 64-128(%rbx),%rsi - leaq 32(%rbx),%rbx -.byte 102,72,15,126,215 - call __ecp_nistz256_mul_montx - call __ecp_nistz256_mul_by_2x - - movq 96+0(%rsp),%r12 - movq 96+8(%rsp),%r13 - leaq 64(%rsp),%rbx - movq 96+16(%rsp),%r8 - movq 96+24(%rsp),%r9 - leaq 32(%rsp),%rdi - call 
__ecp_nistz256_add_tox - - movq 96+0(%rsp),%r12 - movq 96+8(%rsp),%r13 - leaq 64(%rsp),%rbx - movq 96+16(%rsp),%r8 - movq 96+24(%rsp),%r9 - leaq 64(%rsp),%rdi - call __ecp_nistz256_sub_fromx - - movq 0+0(%rsp),%rdx - movq 8+0(%rsp),%r14 - leaq -128+0(%rsp),%rsi - movq 16+0(%rsp),%r15 - movq 24+0(%rsp),%r8 -.byte 102,72,15,126,207 - call __ecp_nistz256_sqr_montx - xorq %r9,%r9 - movq %r12,%rax - addq $-1,%r12 - movq %r13,%r10 - adcq %rsi,%r13 - movq %r14,%rcx - adcq $0,%r14 - movq %r15,%r8 - adcq %rbp,%r15 - adcq $0,%r9 - xorq %rsi,%rsi - testq $1,%rax - - cmovzq %rax,%r12 - cmovzq %r10,%r13 - cmovzq %rcx,%r14 - cmovzq %r8,%r15 - cmovzq %rsi,%r9 - - movq %r13,%rax - shrq $1,%r12 - shlq $63,%rax - movq %r14,%r10 - shrq $1,%r13 - orq %rax,%r12 - shlq $63,%r10 - movq %r15,%rcx - shrq $1,%r14 - orq %r10,%r13 - shlq $63,%rcx - movq %r12,0(%rdi) - shrq $1,%r15 - movq %r13,8(%rdi) - shlq $63,%r9 - orq %rcx,%r14 - orq %r9,%r15 - movq %r14,16(%rdi) - movq %r15,24(%rdi) - movq 64(%rsp),%rdx - leaq 64(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq -128+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 32(%rsp),%rdi - call __ecp_nistz256_mul_montx - - leaq 128(%rsp),%rdi - call __ecp_nistz256_mul_by_2x - - leaq 32(%rsp),%rbx - leaq 32(%rsp),%rdi - call __ecp_nistz256_add_tox - - movq 96(%rsp),%rdx - leaq 96(%rsp),%rbx - movq 0+0(%rsp),%r9 - movq 8+0(%rsp),%r10 - leaq -128+0(%rsp),%rsi - movq 16+0(%rsp),%r11 - movq 24+0(%rsp),%r12 - leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_montx - - leaq 128(%rsp),%rdi - call __ecp_nistz256_mul_by_2x - - movq 0+32(%rsp),%rdx - movq 8+32(%rsp),%r14 - leaq -128+32(%rsp),%rsi - movq 16+32(%rsp),%r15 - movq 24+32(%rsp),%r8 -.byte 102,72,15,126,199 - call __ecp_nistz256_sqr_montx - - leaq 128(%rsp),%rbx - movq %r14,%r8 - movq %r15,%r9 - movq %rsi,%r14 - movq %rbp,%r15 - call __ecp_nistz256_sub_fromx - - movq 0+0(%rsp),%rax - movq 0+8(%rsp),%rbp - movq 0+16(%rsp),%rcx - movq 0+24(%rsp),%r10 - leaq 0(%rsp),%rdi - 
call __ecp_nistz256_subx - - movq 32(%rsp),%rdx - leaq 32(%rsp),%rbx - movq %r12,%r14 - xorl %ecx,%ecx - movq %r12,0+0(%rsp) - movq %r13,%r10 - movq %r13,0+8(%rsp) - cmovzq %r8,%r11 - movq %r8,0+16(%rsp) - leaq 0-128(%rsp),%rsi - cmovzq %r9,%r12 - movq %r9,0+24(%rsp) - movq %r14,%r9 - leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_montx - -.byte 102,72,15,126,203 -.byte 102,72,15,126,207 - call __ecp_nistz256_sub_fromx - - leaq 160+56(%rsp),%rsi -.cfi_def_cfa %rsi,8 - movq -48(%rsi),%r15 -.cfi_restore %r15 - movq -40(%rsi),%r14 -.cfi_restore %r14 - movq -32(%rsi),%r13 -.cfi_restore %r13 - movq -24(%rsi),%r12 -.cfi_restore %r12 - movq -16(%rsi),%rbx -.cfi_restore %rbx - movq -8(%rsi),%rbp -.cfi_restore %rbp - leaq (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Lpoint_doublex_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size ecp_nistz256_point_doublex,.-ecp_nistz256_point_doublex -.type ecp_nistz256_point_addx,@function -.align 32 -ecp_nistz256_point_addx: -.cfi_startproc -.Lpoint_addx: - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $576+8,%rsp -.cfi_adjust_cfa_offset 32*18+8 -.Lpoint_addx_body: - - movdqu 0(%rsi),%xmm0 - movdqu 16(%rsi),%xmm1 - movdqu 32(%rsi),%xmm2 - movdqu 48(%rsi),%xmm3 - movdqu 64(%rsi),%xmm4 - movdqu 80(%rsi),%xmm5 - movq %rsi,%rbx - movq %rdx,%rsi - movdqa %xmm0,384(%rsp) - movdqa %xmm1,384+16(%rsp) - movdqa %xmm2,416(%rsp) - movdqa %xmm3,416+16(%rsp) - movdqa %xmm4,448(%rsp) - movdqa %xmm5,448+16(%rsp) - por %xmm4,%xmm5 - - movdqu 0(%rsi),%xmm0 - pshufd $0xb1,%xmm5,%xmm3 - movdqu 16(%rsi),%xmm1 - movdqu 32(%rsi),%xmm2 - por %xmm3,%xmm5 - movdqu 48(%rsi),%xmm3 - movq 64+0(%rsi),%rdx - movq 64+8(%rsi),%r14 - movq 64+16(%rsi),%r15 - 
movq 64+24(%rsi),%r8 - movdqa %xmm0,480(%rsp) - pshufd $0x1e,%xmm5,%xmm4 - movdqa %xmm1,480+16(%rsp) - movdqu 64(%rsi),%xmm0 - movdqu 80(%rsi),%xmm1 - movdqa %xmm2,512(%rsp) - movdqa %xmm3,512+16(%rsp) - por %xmm4,%xmm5 - pxor %xmm4,%xmm4 - por %xmm0,%xmm1 -.byte 102,72,15,110,199 - - leaq 64-128(%rsi),%rsi - movq %rdx,544+0(%rsp) - movq %r14,544+8(%rsp) - movq %r15,544+16(%rsp) - movq %r8,544+24(%rsp) - leaq 96(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - pcmpeqd %xmm4,%xmm5 - pshufd $0xb1,%xmm1,%xmm4 - por %xmm1,%xmm4 - pshufd $0,%xmm5,%xmm5 - pshufd $0x1e,%xmm4,%xmm3 - por %xmm3,%xmm4 - pxor %xmm3,%xmm3 - pcmpeqd %xmm3,%xmm4 - pshufd $0,%xmm4,%xmm4 - movq 64+0(%rbx),%rdx - movq 64+8(%rbx),%r14 - movq 64+16(%rbx),%r15 - movq 64+24(%rbx),%r8 -.byte 102,72,15,110,203 - - leaq 64-128(%rbx),%rsi - leaq 32(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - movq 544(%rsp),%rdx - leaq 544(%rsp),%rbx - movq 0+96(%rsp),%r9 - movq 8+96(%rsp),%r10 - leaq -128+96(%rsp),%rsi - movq 16+96(%rsp),%r11 - movq 24+96(%rsp),%r12 - leaq 224(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 448(%rsp),%rdx - leaq 448(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq -128+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 256(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 416(%rsp),%rdx - leaq 416(%rsp),%rbx - movq 0+224(%rsp),%r9 - movq 8+224(%rsp),%r10 - leaq -128+224(%rsp),%rsi - movq 16+224(%rsp),%r11 - movq 24+224(%rsp),%r12 - leaq 224(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 512(%rsp),%rdx - leaq 512(%rsp),%rbx - movq 0+256(%rsp),%r9 - movq 8+256(%rsp),%r10 - leaq -128+256(%rsp),%rsi - movq 16+256(%rsp),%r11 - movq 24+256(%rsp),%r12 - leaq 256(%rsp),%rdi - call __ecp_nistz256_mul_montx - - leaq 224(%rsp),%rbx - leaq 64(%rsp),%rdi - call __ecp_nistz256_sub_fromx - - orq %r13,%r12 - movdqa %xmm4,%xmm2 - orq %r8,%r12 - orq %r9,%r12 - por %xmm5,%xmm2 -.byte 102,73,15,110,220 - - movq 384(%rsp),%rdx - leaq 384(%rsp),%rbx - movq 
0+96(%rsp),%r9 - movq 8+96(%rsp),%r10 - leaq -128+96(%rsp),%rsi - movq 16+96(%rsp),%r11 - movq 24+96(%rsp),%r12 - leaq 160(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 480(%rsp),%rdx - leaq 480(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq -128+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 192(%rsp),%rdi - call __ecp_nistz256_mul_montx - - leaq 160(%rsp),%rbx - leaq 0(%rsp),%rdi - call __ecp_nistz256_sub_fromx - - orq %r13,%r12 - orq %r8,%r12 - orq %r9,%r12 - -.byte 102,73,15,126,208 -.byte 102,73,15,126,217 - orq %r8,%r12 -.byte 0x3e - jnz .Ladd_proceedx - - - - testq %r9,%r9 - jz .Ladd_doublex - - - - - - -.byte 102,72,15,126,199 - pxor %xmm0,%xmm0 - movdqu %xmm0,0(%rdi) - movdqu %xmm0,16(%rdi) - movdqu %xmm0,32(%rdi) - movdqu %xmm0,48(%rdi) - movdqu %xmm0,64(%rdi) - movdqu %xmm0,80(%rdi) - jmp .Ladd_donex - -.align 32 -.Ladd_doublex: -.byte 102,72,15,126,206 -.byte 102,72,15,126,199 - addq $416,%rsp -.cfi_adjust_cfa_offset -416 - jmp .Lpoint_double_shortcutx -.cfi_adjust_cfa_offset 416 - -.align 32 -.Ladd_proceedx: - movq 0+64(%rsp),%rdx - movq 8+64(%rsp),%r14 - leaq -128+64(%rsp),%rsi - movq 16+64(%rsp),%r15 - movq 24+64(%rsp),%r8 - leaq 96(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - movq 448(%rsp),%rdx - leaq 448(%rsp),%rbx - movq 0+0(%rsp),%r9 - movq 8+0(%rsp),%r10 - leaq -128+0(%rsp),%rsi - movq 16+0(%rsp),%r11 - movq 24+0(%rsp),%r12 - leaq 352(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 0+0(%rsp),%rdx - movq 8+0(%rsp),%r14 - leaq -128+0(%rsp),%rsi - movq 16+0(%rsp),%r15 - movq 24+0(%rsp),%r8 - leaq 32(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - movq 544(%rsp),%rdx - leaq 544(%rsp),%rbx - movq 0+352(%rsp),%r9 - movq 8+352(%rsp),%r10 - leaq -128+352(%rsp),%rsi - movq 16+352(%rsp),%r11 - movq 24+352(%rsp),%r12 - leaq 352(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 0(%rsp),%rdx - leaq 0(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq -128+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - 
movq 24+32(%rsp),%r12 - leaq 128(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 160(%rsp),%rdx - leaq 160(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq -128+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 192(%rsp),%rdi - call __ecp_nistz256_mul_montx - - - - - xorq %r11,%r11 - addq %r12,%r12 - leaq 96(%rsp),%rsi - adcq %r13,%r13 - movq %r12,%rax - adcq %r8,%r8 - adcq %r9,%r9 - movq %r13,%rbp - adcq $0,%r11 - - subq $-1,%r12 - movq %r8,%rcx - sbbq %r14,%r13 - sbbq $0,%r8 - movq %r9,%r10 - sbbq %r15,%r9 - sbbq $0,%r11 - - cmovcq %rax,%r12 - movq 0(%rsi),%rax - cmovcq %rbp,%r13 - movq 8(%rsi),%rbp - cmovcq %rcx,%r8 - movq 16(%rsi),%rcx - cmovcq %r10,%r9 - movq 24(%rsi),%r10 - - call __ecp_nistz256_subx - - leaq 128(%rsp),%rbx - leaq 288(%rsp),%rdi - call __ecp_nistz256_sub_fromx - - movq 192+0(%rsp),%rax - movq 192+8(%rsp),%rbp - movq 192+16(%rsp),%rcx - movq 192+24(%rsp),%r10 - leaq 320(%rsp),%rdi - - call __ecp_nistz256_subx - - movq %r12,0(%rdi) - movq %r13,8(%rdi) - movq %r8,16(%rdi) - movq %r9,24(%rdi) - movq 128(%rsp),%rdx - leaq 128(%rsp),%rbx - movq 0+224(%rsp),%r9 - movq 8+224(%rsp),%r10 - leaq -128+224(%rsp),%rsi - movq 16+224(%rsp),%r11 - movq 24+224(%rsp),%r12 - leaq 256(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 320(%rsp),%rdx - leaq 320(%rsp),%rbx - movq 0+64(%rsp),%r9 - movq 8+64(%rsp),%r10 - leaq -128+64(%rsp),%rsi - movq 16+64(%rsp),%r11 - movq 24+64(%rsp),%r12 - leaq 320(%rsp),%rdi - call __ecp_nistz256_mul_montx - - leaq 256(%rsp),%rbx - leaq 320(%rsp),%rdi - call __ecp_nistz256_sub_fromx - -.byte 102,72,15,126,199 - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 352(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 352+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand 544(%rsp),%xmm2 - pand 544+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 448(%rsp),%xmm2 - pand 
448+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,64(%rdi) - movdqu %xmm3,80(%rdi) - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 288(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 288+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand 480(%rsp),%xmm2 - pand 480+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 384(%rsp),%xmm2 - pand 384+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,0(%rdi) - movdqu %xmm3,16(%rdi) - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 320(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 320+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand 512(%rsp),%xmm2 - pand 512+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 416(%rsp),%xmm2 - pand 416+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,32(%rdi) - movdqu %xmm3,48(%rdi) - -.Ladd_donex: - leaq 576+56(%rsp),%rsi -.cfi_def_cfa %rsi,8 - movq -48(%rsi),%r15 -.cfi_restore %r15 - movq -40(%rsi),%r14 -.cfi_restore %r14 - movq -32(%rsi),%r13 -.cfi_restore %r13 - movq -24(%rsi),%r12 -.cfi_restore %r12 - movq -16(%rsi),%rbx -.cfi_restore %rbx - movq -8(%rsi),%rbp -.cfi_restore %rbp - leaq (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Lpoint_addx_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size ecp_nistz256_point_addx,.-ecp_nistz256_point_addx -.type ecp_nistz256_point_add_affinex,@function -.align 32 -ecp_nistz256_point_add_affinex: -.cfi_startproc -.Lpoint_add_affinex: - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbp,-16 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset %rbx,-24 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r14,-48 - pushq %r15 
-.cfi_adjust_cfa_offset 8 -.cfi_offset %r15,-56 - subq $480+8,%rsp -.cfi_adjust_cfa_offset 32*15+8 -.Ladd_affinex_body: - - movdqu 0(%rsi),%xmm0 - movq %rdx,%rbx - movdqu 16(%rsi),%xmm1 - movdqu 32(%rsi),%xmm2 - movdqu 48(%rsi),%xmm3 - movdqu 64(%rsi),%xmm4 - movdqu 80(%rsi),%xmm5 - movq 64+0(%rsi),%rdx - movq 64+8(%rsi),%r14 - movq 64+16(%rsi),%r15 - movq 64+24(%rsi),%r8 - movdqa %xmm0,320(%rsp) - movdqa %xmm1,320+16(%rsp) - movdqa %xmm2,352(%rsp) - movdqa %xmm3,352+16(%rsp) - movdqa %xmm4,384(%rsp) - movdqa %xmm5,384+16(%rsp) - por %xmm4,%xmm5 - - movdqu 0(%rbx),%xmm0 - pshufd $0xb1,%xmm5,%xmm3 - movdqu 16(%rbx),%xmm1 - movdqu 32(%rbx),%xmm2 - por %xmm3,%xmm5 - movdqu 48(%rbx),%xmm3 - movdqa %xmm0,416(%rsp) - pshufd $0x1e,%xmm5,%xmm4 - movdqa %xmm1,416+16(%rsp) - por %xmm0,%xmm1 -.byte 102,72,15,110,199 - movdqa %xmm2,448(%rsp) - movdqa %xmm3,448+16(%rsp) - por %xmm2,%xmm3 - por %xmm4,%xmm5 - pxor %xmm4,%xmm4 - por %xmm1,%xmm3 - - leaq 64-128(%rsi),%rsi - leaq 32(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - pcmpeqd %xmm4,%xmm5 - pshufd $0xb1,%xmm3,%xmm4 - movq 0(%rbx),%rdx - - movq %r12,%r9 - por %xmm3,%xmm4 - pshufd $0,%xmm5,%xmm5 - pshufd $0x1e,%xmm4,%xmm3 - movq %r13,%r10 - por %xmm3,%xmm4 - pxor %xmm3,%xmm3 - movq %r14,%r11 - pcmpeqd %xmm3,%xmm4 - pshufd $0,%xmm4,%xmm4 - - leaq 32-128(%rsp),%rsi - movq %r15,%r12 - leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_montx - - leaq 320(%rsp),%rbx - leaq 64(%rsp),%rdi - call __ecp_nistz256_sub_fromx - - movq 384(%rsp),%rdx - leaq 384(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 - leaq -128+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 32(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 384(%rsp),%rdx - leaq 384(%rsp),%rbx - movq 0+64(%rsp),%r9 - movq 8+64(%rsp),%r10 - leaq -128+64(%rsp),%rsi - movq 16+64(%rsp),%r11 - movq 24+64(%rsp),%r12 - leaq 288(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 448(%rsp),%rdx - leaq 448(%rsp),%rbx - movq 0+32(%rsp),%r9 - movq 8+32(%rsp),%r10 
- leaq -128+32(%rsp),%rsi - movq 16+32(%rsp),%r11 - movq 24+32(%rsp),%r12 - leaq 32(%rsp),%rdi - call __ecp_nistz256_mul_montx - - leaq 352(%rsp),%rbx - leaq 96(%rsp),%rdi - call __ecp_nistz256_sub_fromx - - movq 0+64(%rsp),%rdx - movq 8+64(%rsp),%r14 - leaq -128+64(%rsp),%rsi - movq 16+64(%rsp),%r15 - movq 24+64(%rsp),%r8 - leaq 128(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - movq 0+96(%rsp),%rdx - movq 8+96(%rsp),%r14 - leaq -128+96(%rsp),%rsi - movq 16+96(%rsp),%r15 - movq 24+96(%rsp),%r8 - leaq 192(%rsp),%rdi - call __ecp_nistz256_sqr_montx - - movq 128(%rsp),%rdx - leaq 128(%rsp),%rbx - movq 0+64(%rsp),%r9 - movq 8+64(%rsp),%r10 - leaq -128+64(%rsp),%rsi - movq 16+64(%rsp),%r11 - movq 24+64(%rsp),%r12 - leaq 160(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 320(%rsp),%rdx - leaq 320(%rsp),%rbx - movq 0+128(%rsp),%r9 - movq 8+128(%rsp),%r10 - leaq -128+128(%rsp),%rsi - movq 16+128(%rsp),%r11 - movq 24+128(%rsp),%r12 - leaq 0(%rsp),%rdi - call __ecp_nistz256_mul_montx - - - - - xorq %r11,%r11 - addq %r12,%r12 - leaq 192(%rsp),%rsi - adcq %r13,%r13 - movq %r12,%rax - adcq %r8,%r8 - adcq %r9,%r9 - movq %r13,%rbp - adcq $0,%r11 - - subq $-1,%r12 - movq %r8,%rcx - sbbq %r14,%r13 - sbbq $0,%r8 - movq %r9,%r10 - sbbq %r15,%r9 - sbbq $0,%r11 - - cmovcq %rax,%r12 - movq 0(%rsi),%rax - cmovcq %rbp,%r13 - movq 8(%rsi),%rbp - cmovcq %rcx,%r8 - movq 16(%rsi),%rcx - cmovcq %r10,%r9 - movq 24(%rsi),%r10 - - call __ecp_nistz256_subx - - leaq 160(%rsp),%rbx - leaq 224(%rsp),%rdi - call __ecp_nistz256_sub_fromx - - movq 0+0(%rsp),%rax - movq 0+8(%rsp),%rbp - movq 0+16(%rsp),%rcx - movq 0+24(%rsp),%r10 - leaq 64(%rsp),%rdi - - call __ecp_nistz256_subx - - movq %r12,0(%rdi) - movq %r13,8(%rdi) - movq %r8,16(%rdi) - movq %r9,24(%rdi) - movq 352(%rsp),%rdx - leaq 352(%rsp),%rbx - movq 0+160(%rsp),%r9 - movq 8+160(%rsp),%r10 - leaq -128+160(%rsp),%rsi - movq 16+160(%rsp),%r11 - movq 24+160(%rsp),%r12 - leaq 32(%rsp),%rdi - call __ecp_nistz256_mul_montx - - movq 96(%rsp),%rdx 
- leaq 96(%rsp),%rbx - movq 0+64(%rsp),%r9 - movq 8+64(%rsp),%r10 - leaq -128+64(%rsp),%rsi - movq 16+64(%rsp),%r11 - movq 24+64(%rsp),%r12 - leaq 64(%rsp),%rdi - call __ecp_nistz256_mul_montx - - leaq 32(%rsp),%rbx - leaq 256(%rsp),%rdi - call __ecp_nistz256_sub_fromx - -.byte 102,72,15,126,199 - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 288(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 288+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand .LONE_mont(%rip),%xmm2 - pand .LONE_mont+16(%rip),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 384(%rsp),%xmm2 - pand 384+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,64(%rdi) - movdqu %xmm3,80(%rdi) - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 224(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 224+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand 416(%rsp),%xmm2 - pand 416+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 320(%rsp),%xmm2 - pand 320+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,0(%rdi) - movdqu %xmm3,16(%rdi) - - movdqa %xmm5,%xmm0 - movdqa %xmm5,%xmm1 - pandn 256(%rsp),%xmm0 - movdqa %xmm5,%xmm2 - pandn 256+16(%rsp),%xmm1 - movdqa %xmm5,%xmm3 - pand 448(%rsp),%xmm2 - pand 448+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - - movdqa %xmm4,%xmm0 - movdqa %xmm4,%xmm1 - pandn %xmm2,%xmm0 - movdqa %xmm4,%xmm2 - pandn %xmm3,%xmm1 - movdqa %xmm4,%xmm3 - pand 352(%rsp),%xmm2 - pand 352+16(%rsp),%xmm3 - por %xmm0,%xmm2 - por %xmm1,%xmm3 - movdqu %xmm2,32(%rdi) - movdqu %xmm3,48(%rdi) - - leaq 480+56(%rsp),%rsi -.cfi_def_cfa %rsi,8 - movq -48(%rsi),%r15 -.cfi_restore %r15 - movq -40(%rsi),%r14 -.cfi_restore %r14 - movq -32(%rsi),%r13 -.cfi_restore %r13 - movq -24(%rsi),%r12 -.cfi_restore %r12 - movq -16(%rsi),%rbx 
-.cfi_restore %rbx - movq -8(%rsi),%rbp -.cfi_restore %rbp - leaq (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Ladd_affinex_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size ecp_nistz256_point_add_affinex,.-ecp_nistz256_point_add_affinex -#endif -.section .note.GNU-stack,"",@progbits diff --git a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S deleted file mode 100644 index cf056e3e..00000000 --- a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S +++ /dev/null @@ -1,343 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text - -.type beeu_mod_inverse_vartime,@function -.hidden beeu_mod_inverse_vartime -.globl beeu_mod_inverse_vartime -.hidden beeu_mod_inverse_vartime -.align 32 -beeu_mod_inverse_vartime: -.cfi_startproc - pushq %rbp -.cfi_adjust_cfa_offset 8 -.cfi_offset rbp,-16 - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset r12,-24 - pushq %r13 -.cfi_adjust_cfa_offset 8 -.cfi_offset r13,-32 - pushq %r14 -.cfi_adjust_cfa_offset 8 -.cfi_offset r14,-40 - pushq %r15 -.cfi_adjust_cfa_offset 8 -.cfi_offset r15,-48 - pushq %rbx -.cfi_adjust_cfa_offset 8 -.cfi_offset rbx,-56 - pushq %rsi -.cfi_adjust_cfa_offset 8 -.cfi_offset rsi,-64 - - subq $80,%rsp -.cfi_adjust_cfa_offset 80 - movq %rdi,0(%rsp) - - - movq $1,%r8 - xorq %r9,%r9 - xorq %r10,%r10 - xorq %r11,%r11 - xorq %rdi,%rdi - - xorq %r12,%r12 - xorq %r13,%r13 - xorq %r14,%r14 - xorq %r15,%r15 - xorq %rbp,%rbp - - - vmovdqu 0(%rsi),%xmm0 - vmovdqu 16(%rsi),%xmm1 - vmovdqu %xmm0,48(%rsp) - vmovdqu %xmm1,64(%rsp) - - vmovdqu 0(%rdx),%xmm0 - vmovdqu 16(%rdx),%xmm1 - vmovdqu 
%xmm0,16(%rsp) - vmovdqu %xmm1,32(%rsp) - -.Lbeeu_loop: - xorq %rbx,%rbx - orq 48(%rsp),%rbx - orq 56(%rsp),%rbx - orq 64(%rsp),%rbx - orq 72(%rsp),%rbx - jz .Lbeeu_loop_end - - - - - - - - - - - movq $1,%rcx - - -.Lbeeu_shift_loop_XB: - movq %rcx,%rbx - andq 48(%rsp),%rbx - jnz .Lbeeu_shift_loop_end_XB - - - movq $1,%rbx - andq %r8,%rbx - jz .Lshift1_0 - addq 0(%rdx),%r8 - adcq 8(%rdx),%r9 - adcq 16(%rdx),%r10 - adcq 24(%rdx),%r11 - adcq $0,%rdi - -.Lshift1_0: - shrdq $1,%r9,%r8 - shrdq $1,%r10,%r9 - shrdq $1,%r11,%r10 - shrdq $1,%rdi,%r11 - shrq $1,%rdi - - shlq $1,%rcx - - - - - - cmpq $0x8000000,%rcx - jne .Lbeeu_shift_loop_XB - -.Lbeeu_shift_loop_end_XB: - bsfq %rcx,%rcx - testq %rcx,%rcx - jz .Lbeeu_no_shift_XB - - - - movq 8+48(%rsp),%rax - movq 16+48(%rsp),%rbx - movq 24+48(%rsp),%rsi - - shrdq %cl,%rax,0+48(%rsp) - shrdq %cl,%rbx,8+48(%rsp) - shrdq %cl,%rsi,16+48(%rsp) - - shrq %cl,%rsi - movq %rsi,24+48(%rsp) - - -.Lbeeu_no_shift_XB: - - movq $1,%rcx - - -.Lbeeu_shift_loop_YA: - movq %rcx,%rbx - andq 16(%rsp),%rbx - jnz .Lbeeu_shift_loop_end_YA - - - movq $1,%rbx - andq %r12,%rbx - jz .Lshift1_1 - addq 0(%rdx),%r12 - adcq 8(%rdx),%r13 - adcq 16(%rdx),%r14 - adcq 24(%rdx),%r15 - adcq $0,%rbp - -.Lshift1_1: - shrdq $1,%r13,%r12 - shrdq $1,%r14,%r13 - shrdq $1,%r15,%r14 - shrdq $1,%rbp,%r15 - shrq $1,%rbp - - shlq $1,%rcx - - - - - - cmpq $0x8000000,%rcx - jne .Lbeeu_shift_loop_YA - -.Lbeeu_shift_loop_end_YA: - bsfq %rcx,%rcx - testq %rcx,%rcx - jz .Lbeeu_no_shift_YA - - - - movq 8+16(%rsp),%rax - movq 16+16(%rsp),%rbx - movq 24+16(%rsp),%rsi - - shrdq %cl,%rax,0+16(%rsp) - shrdq %cl,%rbx,8+16(%rsp) - shrdq %cl,%rsi,16+16(%rsp) - - shrq %cl,%rsi - movq %rsi,24+16(%rsp) - - -.Lbeeu_no_shift_YA: - - movq 48(%rsp),%rax - movq 56(%rsp),%rbx - movq 64(%rsp),%rsi - movq 72(%rsp),%rcx - subq 16(%rsp),%rax - sbbq 24(%rsp),%rbx - sbbq 32(%rsp),%rsi - sbbq 40(%rsp),%rcx - jnc .Lbeeu_B_bigger_than_A - - - movq 16(%rsp),%rax - movq 24(%rsp),%rbx - movq 32(%rsp),%rsi - 
movq 40(%rsp),%rcx - subq 48(%rsp),%rax - sbbq 56(%rsp),%rbx - sbbq 64(%rsp),%rsi - sbbq 72(%rsp),%rcx - movq %rax,16(%rsp) - movq %rbx,24(%rsp) - movq %rsi,32(%rsp) - movq %rcx,40(%rsp) - - - addq %r8,%r12 - adcq %r9,%r13 - adcq %r10,%r14 - adcq %r11,%r15 - adcq %rdi,%rbp - jmp .Lbeeu_loop - -.Lbeeu_B_bigger_than_A: - - movq %rax,48(%rsp) - movq %rbx,56(%rsp) - movq %rsi,64(%rsp) - movq %rcx,72(%rsp) - - - addq %r12,%r8 - adcq %r13,%r9 - adcq %r14,%r10 - adcq %r15,%r11 - adcq %rbp,%rdi - - jmp .Lbeeu_loop - -.Lbeeu_loop_end: - - - - - movq 16(%rsp),%rbx - subq $1,%rbx - orq 24(%rsp),%rbx - orq 32(%rsp),%rbx - orq 40(%rsp),%rbx - - jnz .Lbeeu_err - - - - - movq 0(%rdx),%r8 - movq 8(%rdx),%r9 - movq 16(%rdx),%r10 - movq 24(%rdx),%r11 - xorq %rdi,%rdi - -.Lbeeu_reduction_loop: - movq %r12,16(%rsp) - movq %r13,24(%rsp) - movq %r14,32(%rsp) - movq %r15,40(%rsp) - movq %rbp,48(%rsp) - - - subq %r8,%r12 - sbbq %r9,%r13 - sbbq %r10,%r14 - sbbq %r11,%r15 - sbbq $0,%rbp - - - cmovcq 16(%rsp),%r12 - cmovcq 24(%rsp),%r13 - cmovcq 32(%rsp),%r14 - cmovcq 40(%rsp),%r15 - jnc .Lbeeu_reduction_loop - - - subq %r12,%r8 - sbbq %r13,%r9 - sbbq %r14,%r10 - sbbq %r15,%r11 - -.Lbeeu_save: - - movq 0(%rsp),%rdi - - movq %r8,0(%rdi) - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - - - movq $1,%rax - jmp .Lbeeu_finish - -.Lbeeu_err: - - xorq %rax,%rax - -.Lbeeu_finish: - addq $80,%rsp -.cfi_adjust_cfa_offset -80 - popq %rsi -.cfi_adjust_cfa_offset -8 -.cfi_restore rsi - popq %rbx -.cfi_adjust_cfa_offset -8 -.cfi_restore rbx - popq %r15 -.cfi_adjust_cfa_offset -8 -.cfi_restore r15 - popq %r14 -.cfi_adjust_cfa_offset -8 -.cfi_restore r14 - popq %r13 -.cfi_adjust_cfa_offset -8 -.cfi_restore r13 - popq %r12 -.cfi_adjust_cfa_offset -8 -.cfi_restore r12 - popq %rbp -.cfi_adjust_cfa_offset -8 -.cfi_restore rbp - .byte 0xf3,0xc3 -.cfi_endproc - -.size beeu_mod_inverse_vartime, .-beeu_mod_inverse_vartime -#endif -.section .note.GNU-stack,"",@progbits diff --git 
a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/rdrand-x86_64.S b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/rdrand-x86_64.S deleted file mode 100644 index b392637c..00000000 --- a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/rdrand-x86_64.S +++ /dev/null @@ -1,63 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text - - - - -.globl CRYPTO_rdrand -.hidden CRYPTO_rdrand -.type CRYPTO_rdrand,@function -.align 16 -CRYPTO_rdrand: -.cfi_startproc - xorq %rax,%rax -.byte 72,15,199,242 - - adcq %rax,%rax - movq %rdx,0(%rdi) - .byte 0xf3,0xc3 -.cfi_endproc -.size CRYPTO_rdrand,.-CRYPTO_rdrand - - - - - -.globl CRYPTO_rdrand_multiple8_buf -.hidden CRYPTO_rdrand_multiple8_buf -.type CRYPTO_rdrand_multiple8_buf,@function -.align 16 -CRYPTO_rdrand_multiple8_buf: -.cfi_startproc - testq %rsi,%rsi - jz .Lout - movq $8,%rdx -.Lloop: -.byte 72,15,199,241 - jnc .Lerr - movq %rcx,0(%rdi) - addq %rdx,%rdi - subq %rdx,%rsi - jnz .Lloop -.Lout: - movq $1,%rax - .byte 0xf3,0xc3 -.Lerr: - xorq %rax,%rax - .byte 0xf3,0xc3 -.cfi_endproc -.size CRYPTO_rdrand_multiple8_buf,.-CRYPTO_rdrand_multiple8_buf -#endif -.section .note.GNU-stack,"",@progbits diff --git a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/rsaz-avx2.S b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/rsaz-avx2.S deleted file mode 100644 index 0f8a978a..00000000 --- a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/rsaz-avx2.S +++ /dev/null @@ -1,1749 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text - -.globl rsaz_1024_sqr_avx2 -.hidden rsaz_1024_sqr_avx2 -.type rsaz_1024_sqr_avx2,@function -.align 64 -rsaz_1024_sqr_avx2: -.cfi_startproc - leaq (%rsp),%rax -.cfi_def_cfa_register %rax - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 - vzeroupper - movq %rax,%rbp -.cfi_def_cfa_register %rbp - movq %rdx,%r13 - subq $832,%rsp - movq %r13,%r15 - subq $-128,%rdi - subq $-128,%rsi - subq $-128,%r13 - - andq $4095,%r15 - addq $320,%r15 - shrq $12,%r15 - vpxor %ymm9,%ymm9,%ymm9 - jz .Lsqr_1024_no_n_copy - - - - - - subq $320,%rsp - vmovdqu 0-128(%r13),%ymm0 - andq $-2048,%rsp - vmovdqu 32-128(%r13),%ymm1 - vmovdqu 64-128(%r13),%ymm2 - vmovdqu 96-128(%r13),%ymm3 - vmovdqu 128-128(%r13),%ymm4 - vmovdqu 160-128(%r13),%ymm5 - vmovdqu 192-128(%r13),%ymm6 - vmovdqu 224-128(%r13),%ymm7 - vmovdqu 256-128(%r13),%ymm8 - leaq 832+128(%rsp),%r13 - vmovdqu %ymm0,0-128(%r13) - vmovdqu %ymm1,32-128(%r13) - vmovdqu %ymm2,64-128(%r13) - vmovdqu %ymm3,96-128(%r13) - vmovdqu %ymm4,128-128(%r13) - vmovdqu %ymm5,160-128(%r13) - vmovdqu %ymm6,192-128(%r13) - vmovdqu %ymm7,224-128(%r13) - vmovdqu %ymm8,256-128(%r13) - vmovdqu %ymm9,288-128(%r13) - -.Lsqr_1024_no_n_copy: - andq $-1024,%rsp - - vmovdqu 32-128(%rsi),%ymm1 - vmovdqu 64-128(%rsi),%ymm2 - vmovdqu 96-128(%rsi),%ymm3 - vmovdqu 128-128(%rsi),%ymm4 - vmovdqu 160-128(%rsi),%ymm5 - vmovdqu 192-128(%rsi),%ymm6 - vmovdqu 224-128(%rsi),%ymm7 - vmovdqu 256-128(%rsi),%ymm8 - - leaq 192(%rsp),%rbx - vmovdqu .Land_mask(%rip),%ymm15 - jmp .LOOP_GRANDE_SQR_1024 - -.align 32 -.LOOP_GRANDE_SQR_1024: - leaq 576+128(%rsp),%r9 - leaq 448(%rsp),%r12 - 
- - - - vpaddq %ymm1,%ymm1,%ymm1 - vpbroadcastq 0-128(%rsi),%ymm10 - vpaddq %ymm2,%ymm2,%ymm2 - vmovdqa %ymm1,0-128(%r9) - vpaddq %ymm3,%ymm3,%ymm3 - vmovdqa %ymm2,32-128(%r9) - vpaddq %ymm4,%ymm4,%ymm4 - vmovdqa %ymm3,64-128(%r9) - vpaddq %ymm5,%ymm5,%ymm5 - vmovdqa %ymm4,96-128(%r9) - vpaddq %ymm6,%ymm6,%ymm6 - vmovdqa %ymm5,128-128(%r9) - vpaddq %ymm7,%ymm7,%ymm7 - vmovdqa %ymm6,160-128(%r9) - vpaddq %ymm8,%ymm8,%ymm8 - vmovdqa %ymm7,192-128(%r9) - vpxor %ymm9,%ymm9,%ymm9 - vmovdqa %ymm8,224-128(%r9) - - vpmuludq 0-128(%rsi),%ymm10,%ymm0 - vpbroadcastq 32-128(%rsi),%ymm11 - vmovdqu %ymm9,288-192(%rbx) - vpmuludq %ymm10,%ymm1,%ymm1 - vmovdqu %ymm9,320-448(%r12) - vpmuludq %ymm10,%ymm2,%ymm2 - vmovdqu %ymm9,352-448(%r12) - vpmuludq %ymm10,%ymm3,%ymm3 - vmovdqu %ymm9,384-448(%r12) - vpmuludq %ymm10,%ymm4,%ymm4 - vmovdqu %ymm9,416-448(%r12) - vpmuludq %ymm10,%ymm5,%ymm5 - vmovdqu %ymm9,448-448(%r12) - vpmuludq %ymm10,%ymm6,%ymm6 - vmovdqu %ymm9,480-448(%r12) - vpmuludq %ymm10,%ymm7,%ymm7 - vmovdqu %ymm9,512-448(%r12) - vpmuludq %ymm10,%ymm8,%ymm8 - vpbroadcastq 64-128(%rsi),%ymm10 - vmovdqu %ymm9,544-448(%r12) - - movq %rsi,%r15 - movl $4,%r14d - jmp .Lsqr_entry_1024 -.align 32 -.LOOP_SQR_1024: - vpbroadcastq 32-128(%r15),%ymm11 - vpmuludq 0-128(%rsi),%ymm10,%ymm0 - vpaddq 0-192(%rbx),%ymm0,%ymm0 - vpmuludq 0-128(%r9),%ymm10,%ymm1 - vpaddq 32-192(%rbx),%ymm1,%ymm1 - vpmuludq 32-128(%r9),%ymm10,%ymm2 - vpaddq 64-192(%rbx),%ymm2,%ymm2 - vpmuludq 64-128(%r9),%ymm10,%ymm3 - vpaddq 96-192(%rbx),%ymm3,%ymm3 - vpmuludq 96-128(%r9),%ymm10,%ymm4 - vpaddq 128-192(%rbx),%ymm4,%ymm4 - vpmuludq 128-128(%r9),%ymm10,%ymm5 - vpaddq 160-192(%rbx),%ymm5,%ymm5 - vpmuludq 160-128(%r9),%ymm10,%ymm6 - vpaddq 192-192(%rbx),%ymm6,%ymm6 - vpmuludq 192-128(%r9),%ymm10,%ymm7 - vpaddq 224-192(%rbx),%ymm7,%ymm7 - vpmuludq 224-128(%r9),%ymm10,%ymm8 - vpbroadcastq 64-128(%r15),%ymm10 - vpaddq 256-192(%rbx),%ymm8,%ymm8 -.Lsqr_entry_1024: - vmovdqu %ymm0,0-192(%rbx) - vmovdqu %ymm1,32-192(%rbx) - - 
vpmuludq 32-128(%rsi),%ymm11,%ymm12 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq 32-128(%r9),%ymm11,%ymm14 - vpaddq %ymm14,%ymm3,%ymm3 - vpmuludq 64-128(%r9),%ymm11,%ymm13 - vpaddq %ymm13,%ymm4,%ymm4 - vpmuludq 96-128(%r9),%ymm11,%ymm12 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq 128-128(%r9),%ymm11,%ymm14 - vpaddq %ymm14,%ymm6,%ymm6 - vpmuludq 160-128(%r9),%ymm11,%ymm13 - vpaddq %ymm13,%ymm7,%ymm7 - vpmuludq 192-128(%r9),%ymm11,%ymm12 - vpaddq %ymm12,%ymm8,%ymm8 - vpmuludq 224-128(%r9),%ymm11,%ymm0 - vpbroadcastq 96-128(%r15),%ymm11 - vpaddq 288-192(%rbx),%ymm0,%ymm0 - - vmovdqu %ymm2,64-192(%rbx) - vmovdqu %ymm3,96-192(%rbx) - - vpmuludq 64-128(%rsi),%ymm10,%ymm13 - vpaddq %ymm13,%ymm4,%ymm4 - vpmuludq 64-128(%r9),%ymm10,%ymm12 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq 96-128(%r9),%ymm10,%ymm14 - vpaddq %ymm14,%ymm6,%ymm6 - vpmuludq 128-128(%r9),%ymm10,%ymm13 - vpaddq %ymm13,%ymm7,%ymm7 - vpmuludq 160-128(%r9),%ymm10,%ymm12 - vpaddq %ymm12,%ymm8,%ymm8 - vpmuludq 192-128(%r9),%ymm10,%ymm14 - vpaddq %ymm14,%ymm0,%ymm0 - vpmuludq 224-128(%r9),%ymm10,%ymm1 - vpbroadcastq 128-128(%r15),%ymm10 - vpaddq 320-448(%r12),%ymm1,%ymm1 - - vmovdqu %ymm4,128-192(%rbx) - vmovdqu %ymm5,160-192(%rbx) - - vpmuludq 96-128(%rsi),%ymm11,%ymm12 - vpaddq %ymm12,%ymm6,%ymm6 - vpmuludq 96-128(%r9),%ymm11,%ymm14 - vpaddq %ymm14,%ymm7,%ymm7 - vpmuludq 128-128(%r9),%ymm11,%ymm13 - vpaddq %ymm13,%ymm8,%ymm8 - vpmuludq 160-128(%r9),%ymm11,%ymm12 - vpaddq %ymm12,%ymm0,%ymm0 - vpmuludq 192-128(%r9),%ymm11,%ymm14 - vpaddq %ymm14,%ymm1,%ymm1 - vpmuludq 224-128(%r9),%ymm11,%ymm2 - vpbroadcastq 160-128(%r15),%ymm11 - vpaddq 352-448(%r12),%ymm2,%ymm2 - - vmovdqu %ymm6,192-192(%rbx) - vmovdqu %ymm7,224-192(%rbx) - - vpmuludq 128-128(%rsi),%ymm10,%ymm12 - vpaddq %ymm12,%ymm8,%ymm8 - vpmuludq 128-128(%r9),%ymm10,%ymm14 - vpaddq %ymm14,%ymm0,%ymm0 - vpmuludq 160-128(%r9),%ymm10,%ymm13 - vpaddq %ymm13,%ymm1,%ymm1 - vpmuludq 192-128(%r9),%ymm10,%ymm12 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq 224-128(%r9),%ymm10,%ymm3 - 
vpbroadcastq 192-128(%r15),%ymm10 - vpaddq 384-448(%r12),%ymm3,%ymm3 - - vmovdqu %ymm8,256-192(%rbx) - vmovdqu %ymm0,288-192(%rbx) - leaq 8(%rbx),%rbx - - vpmuludq 160-128(%rsi),%ymm11,%ymm13 - vpaddq %ymm13,%ymm1,%ymm1 - vpmuludq 160-128(%r9),%ymm11,%ymm12 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq 192-128(%r9),%ymm11,%ymm14 - vpaddq %ymm14,%ymm3,%ymm3 - vpmuludq 224-128(%r9),%ymm11,%ymm4 - vpbroadcastq 224-128(%r15),%ymm11 - vpaddq 416-448(%r12),%ymm4,%ymm4 - - vmovdqu %ymm1,320-448(%r12) - vmovdqu %ymm2,352-448(%r12) - - vpmuludq 192-128(%rsi),%ymm10,%ymm12 - vpaddq %ymm12,%ymm3,%ymm3 - vpmuludq 192-128(%r9),%ymm10,%ymm14 - vpbroadcastq 256-128(%r15),%ymm0 - vpaddq %ymm14,%ymm4,%ymm4 - vpmuludq 224-128(%r9),%ymm10,%ymm5 - vpbroadcastq 0+8-128(%r15),%ymm10 - vpaddq 448-448(%r12),%ymm5,%ymm5 - - vmovdqu %ymm3,384-448(%r12) - vmovdqu %ymm4,416-448(%r12) - leaq 8(%r15),%r15 - - vpmuludq 224-128(%rsi),%ymm11,%ymm12 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq 224-128(%r9),%ymm11,%ymm6 - vpaddq 480-448(%r12),%ymm6,%ymm6 - - vpmuludq 256-128(%rsi),%ymm0,%ymm7 - vmovdqu %ymm5,448-448(%r12) - vpaddq 512-448(%r12),%ymm7,%ymm7 - vmovdqu %ymm6,480-448(%r12) - vmovdqu %ymm7,512-448(%r12) - leaq 8(%r12),%r12 - - decl %r14d - jnz .LOOP_SQR_1024 - - vmovdqu 256(%rsp),%ymm8 - vmovdqu 288(%rsp),%ymm1 - vmovdqu 320(%rsp),%ymm2 - leaq 192(%rsp),%rbx - - vpsrlq $29,%ymm8,%ymm14 - vpand %ymm15,%ymm8,%ymm8 - vpsrlq $29,%ymm1,%ymm11 - vpand %ymm15,%ymm1,%ymm1 - - vpermq $0x93,%ymm14,%ymm14 - vpxor %ymm9,%ymm9,%ymm9 - vpermq $0x93,%ymm11,%ymm11 - - vpblendd $3,%ymm9,%ymm14,%ymm10 - vpblendd $3,%ymm14,%ymm11,%ymm14 - vpaddq %ymm10,%ymm8,%ymm8 - vpblendd $3,%ymm11,%ymm9,%ymm11 - vpaddq %ymm14,%ymm1,%ymm1 - vpaddq %ymm11,%ymm2,%ymm2 - vmovdqu %ymm1,288-192(%rbx) - vmovdqu %ymm2,320-192(%rbx) - - movq (%rsp),%rax - movq 8(%rsp),%r10 - movq 16(%rsp),%r11 - movq 24(%rsp),%r12 - vmovdqu 32(%rsp),%ymm1 - vmovdqu 64-192(%rbx),%ymm2 - vmovdqu 96-192(%rbx),%ymm3 - vmovdqu 128-192(%rbx),%ymm4 - vmovdqu 
160-192(%rbx),%ymm5 - vmovdqu 192-192(%rbx),%ymm6 - vmovdqu 224-192(%rbx),%ymm7 - - movq %rax,%r9 - imull %ecx,%eax - andl $0x1fffffff,%eax - vmovd %eax,%xmm12 - - movq %rax,%rdx - imulq -128(%r13),%rax - vpbroadcastq %xmm12,%ymm12 - addq %rax,%r9 - movq %rdx,%rax - imulq 8-128(%r13),%rax - shrq $29,%r9 - addq %rax,%r10 - movq %rdx,%rax - imulq 16-128(%r13),%rax - addq %r9,%r10 - addq %rax,%r11 - imulq 24-128(%r13),%rdx - addq %rdx,%r12 - - movq %r10,%rax - imull %ecx,%eax - andl $0x1fffffff,%eax - - movl $9,%r14d - jmp .LOOP_REDUCE_1024 - -.align 32 -.LOOP_REDUCE_1024: - vmovd %eax,%xmm13 - vpbroadcastq %xmm13,%ymm13 - - vpmuludq 32-128(%r13),%ymm12,%ymm10 - movq %rax,%rdx - imulq -128(%r13),%rax - vpaddq %ymm10,%ymm1,%ymm1 - addq %rax,%r10 - vpmuludq 64-128(%r13),%ymm12,%ymm14 - movq %rdx,%rax - imulq 8-128(%r13),%rax - vpaddq %ymm14,%ymm2,%ymm2 - vpmuludq 96-128(%r13),%ymm12,%ymm11 -.byte 0x67 - addq %rax,%r11 -.byte 0x67 - movq %rdx,%rax - imulq 16-128(%r13),%rax - shrq $29,%r10 - vpaddq %ymm11,%ymm3,%ymm3 - vpmuludq 128-128(%r13),%ymm12,%ymm10 - addq %rax,%r12 - addq %r10,%r11 - vpaddq %ymm10,%ymm4,%ymm4 - vpmuludq 160-128(%r13),%ymm12,%ymm14 - movq %r11,%rax - imull %ecx,%eax - vpaddq %ymm14,%ymm5,%ymm5 - vpmuludq 192-128(%r13),%ymm12,%ymm11 - andl $0x1fffffff,%eax - vpaddq %ymm11,%ymm6,%ymm6 - vpmuludq 224-128(%r13),%ymm12,%ymm10 - vpaddq %ymm10,%ymm7,%ymm7 - vpmuludq 256-128(%r13),%ymm12,%ymm14 - vmovd %eax,%xmm12 - - vpaddq %ymm14,%ymm8,%ymm8 - - vpbroadcastq %xmm12,%ymm12 - - vpmuludq 32-8-128(%r13),%ymm13,%ymm11 - vmovdqu 96-8-128(%r13),%ymm14 - movq %rax,%rdx - imulq -128(%r13),%rax - vpaddq %ymm11,%ymm1,%ymm1 - vpmuludq 64-8-128(%r13),%ymm13,%ymm10 - vmovdqu 128-8-128(%r13),%ymm11 - addq %rax,%r11 - movq %rdx,%rax - imulq 8-128(%r13),%rax - vpaddq %ymm10,%ymm2,%ymm2 - addq %r12,%rax - shrq $29,%r11 - vpmuludq %ymm13,%ymm14,%ymm14 - vmovdqu 160-8-128(%r13),%ymm10 - addq %r11,%rax - vpaddq %ymm14,%ymm3,%ymm3 - vpmuludq %ymm13,%ymm11,%ymm11 - vmovdqu 
192-8-128(%r13),%ymm14 -.byte 0x67 - movq %rax,%r12 - imull %ecx,%eax - vpaddq %ymm11,%ymm4,%ymm4 - vpmuludq %ymm13,%ymm10,%ymm10 -.byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00 - andl $0x1fffffff,%eax - vpaddq %ymm10,%ymm5,%ymm5 - vpmuludq %ymm13,%ymm14,%ymm14 - vmovdqu 256-8-128(%r13),%ymm10 - vpaddq %ymm14,%ymm6,%ymm6 - vpmuludq %ymm13,%ymm11,%ymm11 - vmovdqu 288-8-128(%r13),%ymm9 - vmovd %eax,%xmm0 - imulq -128(%r13),%rax - vpaddq %ymm11,%ymm7,%ymm7 - vpmuludq %ymm13,%ymm10,%ymm10 - vmovdqu 32-16-128(%r13),%ymm14 - vpbroadcastq %xmm0,%ymm0 - vpaddq %ymm10,%ymm8,%ymm8 - vpmuludq %ymm13,%ymm9,%ymm9 - vmovdqu 64-16-128(%r13),%ymm11 - addq %rax,%r12 - - vmovdqu 32-24-128(%r13),%ymm13 - vpmuludq %ymm12,%ymm14,%ymm14 - vmovdqu 96-16-128(%r13),%ymm10 - vpaddq %ymm14,%ymm1,%ymm1 - vpmuludq %ymm0,%ymm13,%ymm13 - vpmuludq %ymm12,%ymm11,%ymm11 -.byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff - vpaddq %ymm1,%ymm13,%ymm13 - vpaddq %ymm11,%ymm2,%ymm2 - vpmuludq %ymm12,%ymm10,%ymm10 - vmovdqu 160-16-128(%r13),%ymm11 -.byte 0x67 - vmovq %xmm13,%rax - vmovdqu %ymm13,(%rsp) - vpaddq %ymm10,%ymm3,%ymm3 - vpmuludq %ymm12,%ymm14,%ymm14 - vmovdqu 192-16-128(%r13),%ymm10 - vpaddq %ymm14,%ymm4,%ymm4 - vpmuludq %ymm12,%ymm11,%ymm11 - vmovdqu 224-16-128(%r13),%ymm14 - vpaddq %ymm11,%ymm5,%ymm5 - vpmuludq %ymm12,%ymm10,%ymm10 - vmovdqu 256-16-128(%r13),%ymm11 - vpaddq %ymm10,%ymm6,%ymm6 - vpmuludq %ymm12,%ymm14,%ymm14 - shrq $29,%r12 - vmovdqu 288-16-128(%r13),%ymm10 - addq %r12,%rax - vpaddq %ymm14,%ymm7,%ymm7 - vpmuludq %ymm12,%ymm11,%ymm11 - - movq %rax,%r9 - imull %ecx,%eax - vpaddq %ymm11,%ymm8,%ymm8 - vpmuludq %ymm12,%ymm10,%ymm10 - andl $0x1fffffff,%eax - vmovd %eax,%xmm12 - vmovdqu 96-24-128(%r13),%ymm11 -.byte 0x67 - vpaddq %ymm10,%ymm9,%ymm9 - vpbroadcastq %xmm12,%ymm12 - - vpmuludq 64-24-128(%r13),%ymm0,%ymm14 - vmovdqu 128-24-128(%r13),%ymm10 - movq %rax,%rdx - imulq -128(%r13),%rax - movq 8(%rsp),%r10 - vpaddq %ymm14,%ymm2,%ymm1 - vpmuludq %ymm0,%ymm11,%ymm11 - 
vmovdqu 160-24-128(%r13),%ymm14 - addq %rax,%r9 - movq %rdx,%rax - imulq 8-128(%r13),%rax -.byte 0x67 - shrq $29,%r9 - movq 16(%rsp),%r11 - vpaddq %ymm11,%ymm3,%ymm2 - vpmuludq %ymm0,%ymm10,%ymm10 - vmovdqu 192-24-128(%r13),%ymm11 - addq %rax,%r10 - movq %rdx,%rax - imulq 16-128(%r13),%rax - vpaddq %ymm10,%ymm4,%ymm3 - vpmuludq %ymm0,%ymm14,%ymm14 - vmovdqu 224-24-128(%r13),%ymm10 - imulq 24-128(%r13),%rdx - addq %rax,%r11 - leaq (%r9,%r10,1),%rax - vpaddq %ymm14,%ymm5,%ymm4 - vpmuludq %ymm0,%ymm11,%ymm11 - vmovdqu 256-24-128(%r13),%ymm14 - movq %rax,%r10 - imull %ecx,%eax - vpmuludq %ymm0,%ymm10,%ymm10 - vpaddq %ymm11,%ymm6,%ymm5 - vmovdqu 288-24-128(%r13),%ymm11 - andl $0x1fffffff,%eax - vpaddq %ymm10,%ymm7,%ymm6 - vpmuludq %ymm0,%ymm14,%ymm14 - addq 24(%rsp),%rdx - vpaddq %ymm14,%ymm8,%ymm7 - vpmuludq %ymm0,%ymm11,%ymm11 - vpaddq %ymm11,%ymm9,%ymm8 - vmovq %r12,%xmm9 - movq %rdx,%r12 - - decl %r14d - jnz .LOOP_REDUCE_1024 - leaq 448(%rsp),%r12 - vpaddq %ymm9,%ymm13,%ymm0 - vpxor %ymm9,%ymm9,%ymm9 - - vpaddq 288-192(%rbx),%ymm0,%ymm0 - vpaddq 320-448(%r12),%ymm1,%ymm1 - vpaddq 352-448(%r12),%ymm2,%ymm2 - vpaddq 384-448(%r12),%ymm3,%ymm3 - vpaddq 416-448(%r12),%ymm4,%ymm4 - vpaddq 448-448(%r12),%ymm5,%ymm5 - vpaddq 480-448(%r12),%ymm6,%ymm6 - vpaddq 512-448(%r12),%ymm7,%ymm7 - vpaddq 544-448(%r12),%ymm8,%ymm8 - - vpsrlq $29,%ymm0,%ymm14 - vpand %ymm15,%ymm0,%ymm0 - vpsrlq $29,%ymm1,%ymm11 - vpand %ymm15,%ymm1,%ymm1 - vpsrlq $29,%ymm2,%ymm12 - vpermq $0x93,%ymm14,%ymm14 - vpand %ymm15,%ymm2,%ymm2 - vpsrlq $29,%ymm3,%ymm13 - vpermq $0x93,%ymm11,%ymm11 - vpand %ymm15,%ymm3,%ymm3 - vpermq $0x93,%ymm12,%ymm12 - - vpblendd $3,%ymm9,%ymm14,%ymm10 - vpermq $0x93,%ymm13,%ymm13 - vpblendd $3,%ymm14,%ymm11,%ymm14 - vpaddq %ymm10,%ymm0,%ymm0 - vpblendd $3,%ymm11,%ymm12,%ymm11 - vpaddq %ymm14,%ymm1,%ymm1 - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpaddq %ymm11,%ymm2,%ymm2 - vpblendd $3,%ymm13,%ymm9,%ymm13 - vpaddq %ymm12,%ymm3,%ymm3 - vpaddq %ymm13,%ymm4,%ymm4 - - vpsrlq 
$29,%ymm0,%ymm14 - vpand %ymm15,%ymm0,%ymm0 - vpsrlq $29,%ymm1,%ymm11 - vpand %ymm15,%ymm1,%ymm1 - vpsrlq $29,%ymm2,%ymm12 - vpermq $0x93,%ymm14,%ymm14 - vpand %ymm15,%ymm2,%ymm2 - vpsrlq $29,%ymm3,%ymm13 - vpermq $0x93,%ymm11,%ymm11 - vpand %ymm15,%ymm3,%ymm3 - vpermq $0x93,%ymm12,%ymm12 - - vpblendd $3,%ymm9,%ymm14,%ymm10 - vpermq $0x93,%ymm13,%ymm13 - vpblendd $3,%ymm14,%ymm11,%ymm14 - vpaddq %ymm10,%ymm0,%ymm0 - vpblendd $3,%ymm11,%ymm12,%ymm11 - vpaddq %ymm14,%ymm1,%ymm1 - vmovdqu %ymm0,0-128(%rdi) - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpaddq %ymm11,%ymm2,%ymm2 - vmovdqu %ymm1,32-128(%rdi) - vpblendd $3,%ymm13,%ymm9,%ymm13 - vpaddq %ymm12,%ymm3,%ymm3 - vmovdqu %ymm2,64-128(%rdi) - vpaddq %ymm13,%ymm4,%ymm4 - vmovdqu %ymm3,96-128(%rdi) - vpsrlq $29,%ymm4,%ymm14 - vpand %ymm15,%ymm4,%ymm4 - vpsrlq $29,%ymm5,%ymm11 - vpand %ymm15,%ymm5,%ymm5 - vpsrlq $29,%ymm6,%ymm12 - vpermq $0x93,%ymm14,%ymm14 - vpand %ymm15,%ymm6,%ymm6 - vpsrlq $29,%ymm7,%ymm13 - vpermq $0x93,%ymm11,%ymm11 - vpand %ymm15,%ymm7,%ymm7 - vpsrlq $29,%ymm8,%ymm0 - vpermq $0x93,%ymm12,%ymm12 - vpand %ymm15,%ymm8,%ymm8 - vpermq $0x93,%ymm13,%ymm13 - - vpblendd $3,%ymm9,%ymm14,%ymm10 - vpermq $0x93,%ymm0,%ymm0 - vpblendd $3,%ymm14,%ymm11,%ymm14 - vpaddq %ymm10,%ymm4,%ymm4 - vpblendd $3,%ymm11,%ymm12,%ymm11 - vpaddq %ymm14,%ymm5,%ymm5 - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpaddq %ymm11,%ymm6,%ymm6 - vpblendd $3,%ymm13,%ymm0,%ymm13 - vpaddq %ymm12,%ymm7,%ymm7 - vpaddq %ymm13,%ymm8,%ymm8 - - vpsrlq $29,%ymm4,%ymm14 - vpand %ymm15,%ymm4,%ymm4 - vpsrlq $29,%ymm5,%ymm11 - vpand %ymm15,%ymm5,%ymm5 - vpsrlq $29,%ymm6,%ymm12 - vpermq $0x93,%ymm14,%ymm14 - vpand %ymm15,%ymm6,%ymm6 - vpsrlq $29,%ymm7,%ymm13 - vpermq $0x93,%ymm11,%ymm11 - vpand %ymm15,%ymm7,%ymm7 - vpsrlq $29,%ymm8,%ymm0 - vpermq $0x93,%ymm12,%ymm12 - vpand %ymm15,%ymm8,%ymm8 - vpermq $0x93,%ymm13,%ymm13 - - vpblendd $3,%ymm9,%ymm14,%ymm10 - vpermq $0x93,%ymm0,%ymm0 - vpblendd $3,%ymm14,%ymm11,%ymm14 - vpaddq %ymm10,%ymm4,%ymm4 - vpblendd 
$3,%ymm11,%ymm12,%ymm11 - vpaddq %ymm14,%ymm5,%ymm5 - vmovdqu %ymm4,128-128(%rdi) - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpaddq %ymm11,%ymm6,%ymm6 - vmovdqu %ymm5,160-128(%rdi) - vpblendd $3,%ymm13,%ymm0,%ymm13 - vpaddq %ymm12,%ymm7,%ymm7 - vmovdqu %ymm6,192-128(%rdi) - vpaddq %ymm13,%ymm8,%ymm8 - vmovdqu %ymm7,224-128(%rdi) - vmovdqu %ymm8,256-128(%rdi) - - movq %rdi,%rsi - decl %r8d - jne .LOOP_GRANDE_SQR_1024 - - vzeroall - movq %rbp,%rax -.cfi_def_cfa_register %rax - movq -48(%rax),%r15 -.cfi_restore %r15 - movq -40(%rax),%r14 -.cfi_restore %r14 - movq -32(%rax),%r13 -.cfi_restore %r13 - movq -24(%rax),%r12 -.cfi_restore %r12 - movq -16(%rax),%rbp -.cfi_restore %rbp - movq -8(%rax),%rbx -.cfi_restore %rbx - leaq (%rax),%rsp -.cfi_def_cfa_register %rsp -.Lsqr_1024_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2 -.globl rsaz_1024_mul_avx2 -.hidden rsaz_1024_mul_avx2 -.type rsaz_1024_mul_avx2,@function -.align 64 -rsaz_1024_mul_avx2: -.cfi_startproc - leaq (%rsp),%rax -.cfi_def_cfa_register %rax - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 - movq %rax,%rbp -.cfi_def_cfa_register %rbp - vzeroall - movq %rdx,%r13 - subq $64,%rsp - - - - - - -.byte 0x67,0x67 - movq %rsi,%r15 - andq $4095,%r15 - addq $320,%r15 - shrq $12,%r15 - movq %rsi,%r15 - cmovnzq %r13,%rsi - cmovnzq %r15,%r13 - - movq %rcx,%r15 - subq $-128,%rsi - subq $-128,%rcx - subq $-128,%rdi - - andq $4095,%r15 - addq $320,%r15 -.byte 0x67,0x67 - shrq $12,%r15 - jz .Lmul_1024_no_n_copy - - - - - - subq $320,%rsp - vmovdqu 0-128(%rcx),%ymm0 - andq $-512,%rsp - vmovdqu 32-128(%rcx),%ymm1 - vmovdqu 64-128(%rcx),%ymm2 - vmovdqu 96-128(%rcx),%ymm3 - vmovdqu 128-128(%rcx),%ymm4 - vmovdqu 160-128(%rcx),%ymm5 - vmovdqu 192-128(%rcx),%ymm6 - vmovdqu 224-128(%rcx),%ymm7 - vmovdqu 256-128(%rcx),%ymm8 - leaq 
64+128(%rsp),%rcx - vmovdqu %ymm0,0-128(%rcx) - vpxor %ymm0,%ymm0,%ymm0 - vmovdqu %ymm1,32-128(%rcx) - vpxor %ymm1,%ymm1,%ymm1 - vmovdqu %ymm2,64-128(%rcx) - vpxor %ymm2,%ymm2,%ymm2 - vmovdqu %ymm3,96-128(%rcx) - vpxor %ymm3,%ymm3,%ymm3 - vmovdqu %ymm4,128-128(%rcx) - vpxor %ymm4,%ymm4,%ymm4 - vmovdqu %ymm5,160-128(%rcx) - vpxor %ymm5,%ymm5,%ymm5 - vmovdqu %ymm6,192-128(%rcx) - vpxor %ymm6,%ymm6,%ymm6 - vmovdqu %ymm7,224-128(%rcx) - vpxor %ymm7,%ymm7,%ymm7 - vmovdqu %ymm8,256-128(%rcx) - vmovdqa %ymm0,%ymm8 - vmovdqu %ymm9,288-128(%rcx) -.Lmul_1024_no_n_copy: - andq $-64,%rsp - - movq (%r13),%rbx - vpbroadcastq (%r13),%ymm10 - vmovdqu %ymm0,(%rsp) - xorq %r9,%r9 -.byte 0x67 - xorq %r10,%r10 - xorq %r11,%r11 - xorq %r12,%r12 - - vmovdqu .Land_mask(%rip),%ymm15 - movl $9,%r14d - vmovdqu %ymm9,288-128(%rdi) - jmp .Loop_mul_1024 - -.align 32 -.Loop_mul_1024: - vpsrlq $29,%ymm3,%ymm9 - movq %rbx,%rax - imulq -128(%rsi),%rax - addq %r9,%rax - movq %rbx,%r10 - imulq 8-128(%rsi),%r10 - addq 8(%rsp),%r10 - - movq %rax,%r9 - imull %r8d,%eax - andl $0x1fffffff,%eax - - movq %rbx,%r11 - imulq 16-128(%rsi),%r11 - addq 16(%rsp),%r11 - - movq %rbx,%r12 - imulq 24-128(%rsi),%r12 - addq 24(%rsp),%r12 - vpmuludq 32-128(%rsi),%ymm10,%ymm0 - vmovd %eax,%xmm11 - vpaddq %ymm0,%ymm1,%ymm1 - vpmuludq 64-128(%rsi),%ymm10,%ymm12 - vpbroadcastq %xmm11,%ymm11 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq 96-128(%rsi),%ymm10,%ymm13 - vpand %ymm15,%ymm3,%ymm3 - vpaddq %ymm13,%ymm3,%ymm3 - vpmuludq 128-128(%rsi),%ymm10,%ymm0 - vpaddq %ymm0,%ymm4,%ymm4 - vpmuludq 160-128(%rsi),%ymm10,%ymm12 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq 192-128(%rsi),%ymm10,%ymm13 - vpaddq %ymm13,%ymm6,%ymm6 - vpmuludq 224-128(%rsi),%ymm10,%ymm0 - vpermq $0x93,%ymm9,%ymm9 - vpaddq %ymm0,%ymm7,%ymm7 - vpmuludq 256-128(%rsi),%ymm10,%ymm12 - vpbroadcastq 8(%r13),%ymm10 - vpaddq %ymm12,%ymm8,%ymm8 - - movq %rax,%rdx - imulq -128(%rcx),%rax - addq %rax,%r9 - movq %rdx,%rax - imulq 8-128(%rcx),%rax - addq %rax,%r10 - movq %rdx,%rax - 
imulq 16-128(%rcx),%rax - addq %rax,%r11 - shrq $29,%r9 - imulq 24-128(%rcx),%rdx - addq %rdx,%r12 - addq %r9,%r10 - - vpmuludq 32-128(%rcx),%ymm11,%ymm13 - vmovq %xmm10,%rbx - vpaddq %ymm13,%ymm1,%ymm1 - vpmuludq 64-128(%rcx),%ymm11,%ymm0 - vpaddq %ymm0,%ymm2,%ymm2 - vpmuludq 96-128(%rcx),%ymm11,%ymm12 - vpaddq %ymm12,%ymm3,%ymm3 - vpmuludq 128-128(%rcx),%ymm11,%ymm13 - vpaddq %ymm13,%ymm4,%ymm4 - vpmuludq 160-128(%rcx),%ymm11,%ymm0 - vpaddq %ymm0,%ymm5,%ymm5 - vpmuludq 192-128(%rcx),%ymm11,%ymm12 - vpaddq %ymm12,%ymm6,%ymm6 - vpmuludq 224-128(%rcx),%ymm11,%ymm13 - vpblendd $3,%ymm14,%ymm9,%ymm12 - vpaddq %ymm13,%ymm7,%ymm7 - vpmuludq 256-128(%rcx),%ymm11,%ymm0 - vpaddq %ymm12,%ymm3,%ymm3 - vpaddq %ymm0,%ymm8,%ymm8 - - movq %rbx,%rax - imulq -128(%rsi),%rax - addq %rax,%r10 - vmovdqu -8+32-128(%rsi),%ymm12 - movq %rbx,%rax - imulq 8-128(%rsi),%rax - addq %rax,%r11 - vmovdqu -8+64-128(%rsi),%ymm13 - - movq %r10,%rax - vpblendd $0xfc,%ymm14,%ymm9,%ymm9 - imull %r8d,%eax - vpaddq %ymm9,%ymm4,%ymm4 - andl $0x1fffffff,%eax - - imulq 16-128(%rsi),%rbx - addq %rbx,%r12 - vpmuludq %ymm10,%ymm12,%ymm12 - vmovd %eax,%xmm11 - vmovdqu -8+96-128(%rsi),%ymm0 - vpaddq %ymm12,%ymm1,%ymm1 - vpmuludq %ymm10,%ymm13,%ymm13 - vpbroadcastq %xmm11,%ymm11 - vmovdqu -8+128-128(%rsi),%ymm12 - vpaddq %ymm13,%ymm2,%ymm2 - vpmuludq %ymm10,%ymm0,%ymm0 - vmovdqu -8+160-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm3,%ymm3 - vpmuludq %ymm10,%ymm12,%ymm12 - vmovdqu -8+192-128(%rsi),%ymm0 - vpaddq %ymm12,%ymm4,%ymm4 - vpmuludq %ymm10,%ymm13,%ymm13 - vmovdqu -8+224-128(%rsi),%ymm12 - vpaddq %ymm13,%ymm5,%ymm5 - vpmuludq %ymm10,%ymm0,%ymm0 - vmovdqu -8+256-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm6,%ymm6 - vpmuludq %ymm10,%ymm12,%ymm12 - vmovdqu -8+288-128(%rsi),%ymm9 - vpaddq %ymm12,%ymm7,%ymm7 - vpmuludq %ymm10,%ymm13,%ymm13 - vpaddq %ymm13,%ymm8,%ymm8 - vpmuludq %ymm10,%ymm9,%ymm9 - vpbroadcastq 16(%r13),%ymm10 - - movq %rax,%rdx - imulq -128(%rcx),%rax - addq %rax,%r10 - vmovdqu -8+32-128(%rcx),%ymm0 - movq 
%rdx,%rax - imulq 8-128(%rcx),%rax - addq %rax,%r11 - vmovdqu -8+64-128(%rcx),%ymm12 - shrq $29,%r10 - imulq 16-128(%rcx),%rdx - addq %rdx,%r12 - addq %r10,%r11 - - vpmuludq %ymm11,%ymm0,%ymm0 - vmovq %xmm10,%rbx - vmovdqu -8+96-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm1,%ymm1 - vpmuludq %ymm11,%ymm12,%ymm12 - vmovdqu -8+128-128(%rcx),%ymm0 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovdqu -8+160-128(%rcx),%ymm12 - vpaddq %ymm13,%ymm3,%ymm3 - vpmuludq %ymm11,%ymm0,%ymm0 - vmovdqu -8+192-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm4,%ymm4 - vpmuludq %ymm11,%ymm12,%ymm12 - vmovdqu -8+224-128(%rcx),%ymm0 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovdqu -8+256-128(%rcx),%ymm12 - vpaddq %ymm13,%ymm6,%ymm6 - vpmuludq %ymm11,%ymm0,%ymm0 - vmovdqu -8+288-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm7,%ymm7 - vpmuludq %ymm11,%ymm12,%ymm12 - vpaddq %ymm12,%ymm8,%ymm8 - vpmuludq %ymm11,%ymm13,%ymm13 - vpaddq %ymm13,%ymm9,%ymm9 - - vmovdqu -16+32-128(%rsi),%ymm0 - movq %rbx,%rax - imulq -128(%rsi),%rax - addq %r11,%rax - - vmovdqu -16+64-128(%rsi),%ymm12 - movq %rax,%r11 - imull %r8d,%eax - andl $0x1fffffff,%eax - - imulq 8-128(%rsi),%rbx - addq %rbx,%r12 - vpmuludq %ymm10,%ymm0,%ymm0 - vmovd %eax,%xmm11 - vmovdqu -16+96-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm1,%ymm1 - vpmuludq %ymm10,%ymm12,%ymm12 - vpbroadcastq %xmm11,%ymm11 - vmovdqu -16+128-128(%rsi),%ymm0 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq %ymm10,%ymm13,%ymm13 - vmovdqu -16+160-128(%rsi),%ymm12 - vpaddq %ymm13,%ymm3,%ymm3 - vpmuludq %ymm10,%ymm0,%ymm0 - vmovdqu -16+192-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm4,%ymm4 - vpmuludq %ymm10,%ymm12,%ymm12 - vmovdqu -16+224-128(%rsi),%ymm0 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq %ymm10,%ymm13,%ymm13 - vmovdqu -16+256-128(%rsi),%ymm12 - vpaddq %ymm13,%ymm6,%ymm6 - vpmuludq %ymm10,%ymm0,%ymm0 - vmovdqu -16+288-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm7,%ymm7 - vpmuludq %ymm10,%ymm12,%ymm12 - vpaddq %ymm12,%ymm8,%ymm8 - vpmuludq %ymm10,%ymm13,%ymm13 - vpbroadcastq 
24(%r13),%ymm10 - vpaddq %ymm13,%ymm9,%ymm9 - - vmovdqu -16+32-128(%rcx),%ymm0 - movq %rax,%rdx - imulq -128(%rcx),%rax - addq %rax,%r11 - vmovdqu -16+64-128(%rcx),%ymm12 - imulq 8-128(%rcx),%rdx - addq %rdx,%r12 - shrq $29,%r11 - - vpmuludq %ymm11,%ymm0,%ymm0 - vmovq %xmm10,%rbx - vmovdqu -16+96-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm1,%ymm1 - vpmuludq %ymm11,%ymm12,%ymm12 - vmovdqu -16+128-128(%rcx),%ymm0 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovdqu -16+160-128(%rcx),%ymm12 - vpaddq %ymm13,%ymm3,%ymm3 - vpmuludq %ymm11,%ymm0,%ymm0 - vmovdqu -16+192-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm4,%ymm4 - vpmuludq %ymm11,%ymm12,%ymm12 - vmovdqu -16+224-128(%rcx),%ymm0 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovdqu -16+256-128(%rcx),%ymm12 - vpaddq %ymm13,%ymm6,%ymm6 - vpmuludq %ymm11,%ymm0,%ymm0 - vmovdqu -16+288-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm7,%ymm7 - vpmuludq %ymm11,%ymm12,%ymm12 - vmovdqu -24+32-128(%rsi),%ymm0 - vpaddq %ymm12,%ymm8,%ymm8 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovdqu -24+64-128(%rsi),%ymm12 - vpaddq %ymm13,%ymm9,%ymm9 - - addq %r11,%r12 - imulq -128(%rsi),%rbx - addq %rbx,%r12 - - movq %r12,%rax - imull %r8d,%eax - andl $0x1fffffff,%eax - - vpmuludq %ymm10,%ymm0,%ymm0 - vmovd %eax,%xmm11 - vmovdqu -24+96-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm1,%ymm1 - vpmuludq %ymm10,%ymm12,%ymm12 - vpbroadcastq %xmm11,%ymm11 - vmovdqu -24+128-128(%rsi),%ymm0 - vpaddq %ymm12,%ymm2,%ymm2 - vpmuludq %ymm10,%ymm13,%ymm13 - vmovdqu -24+160-128(%rsi),%ymm12 - vpaddq %ymm13,%ymm3,%ymm3 - vpmuludq %ymm10,%ymm0,%ymm0 - vmovdqu -24+192-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm4,%ymm4 - vpmuludq %ymm10,%ymm12,%ymm12 - vmovdqu -24+224-128(%rsi),%ymm0 - vpaddq %ymm12,%ymm5,%ymm5 - vpmuludq %ymm10,%ymm13,%ymm13 - vmovdqu -24+256-128(%rsi),%ymm12 - vpaddq %ymm13,%ymm6,%ymm6 - vpmuludq %ymm10,%ymm0,%ymm0 - vmovdqu -24+288-128(%rsi),%ymm13 - vpaddq %ymm0,%ymm7,%ymm7 - vpmuludq %ymm10,%ymm12,%ymm12 - vpaddq %ymm12,%ymm8,%ymm8 - vpmuludq 
%ymm10,%ymm13,%ymm13 - vpbroadcastq 32(%r13),%ymm10 - vpaddq %ymm13,%ymm9,%ymm9 - addq $32,%r13 - - vmovdqu -24+32-128(%rcx),%ymm0 - imulq -128(%rcx),%rax - addq %rax,%r12 - shrq $29,%r12 - - vmovdqu -24+64-128(%rcx),%ymm12 - vpmuludq %ymm11,%ymm0,%ymm0 - vmovq %xmm10,%rbx - vmovdqu -24+96-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm1,%ymm0 - vpmuludq %ymm11,%ymm12,%ymm12 - vmovdqu %ymm0,(%rsp) - vpaddq %ymm12,%ymm2,%ymm1 - vmovdqu -24+128-128(%rcx),%ymm0 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovdqu -24+160-128(%rcx),%ymm12 - vpaddq %ymm13,%ymm3,%ymm2 - vpmuludq %ymm11,%ymm0,%ymm0 - vmovdqu -24+192-128(%rcx),%ymm13 - vpaddq %ymm0,%ymm4,%ymm3 - vpmuludq %ymm11,%ymm12,%ymm12 - vmovdqu -24+224-128(%rcx),%ymm0 - vpaddq %ymm12,%ymm5,%ymm4 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovdqu -24+256-128(%rcx),%ymm12 - vpaddq %ymm13,%ymm6,%ymm5 - vpmuludq %ymm11,%ymm0,%ymm0 - vmovdqu -24+288-128(%rcx),%ymm13 - movq %r12,%r9 - vpaddq %ymm0,%ymm7,%ymm6 - vpmuludq %ymm11,%ymm12,%ymm12 - addq (%rsp),%r9 - vpaddq %ymm12,%ymm8,%ymm7 - vpmuludq %ymm11,%ymm13,%ymm13 - vmovq %r12,%xmm12 - vpaddq %ymm13,%ymm9,%ymm8 - - decl %r14d - jnz .Loop_mul_1024 - vpaddq (%rsp),%ymm12,%ymm0 - - vpsrlq $29,%ymm0,%ymm12 - vpand %ymm15,%ymm0,%ymm0 - vpsrlq $29,%ymm1,%ymm13 - vpand %ymm15,%ymm1,%ymm1 - vpsrlq $29,%ymm2,%ymm10 - vpermq $0x93,%ymm12,%ymm12 - vpand %ymm15,%ymm2,%ymm2 - vpsrlq $29,%ymm3,%ymm11 - vpermq $0x93,%ymm13,%ymm13 - vpand %ymm15,%ymm3,%ymm3 - - vpblendd $3,%ymm14,%ymm12,%ymm9 - vpermq $0x93,%ymm10,%ymm10 - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpermq $0x93,%ymm11,%ymm11 - vpaddq %ymm9,%ymm0,%ymm0 - vpblendd $3,%ymm13,%ymm10,%ymm13 - vpaddq %ymm12,%ymm1,%ymm1 - vpblendd $3,%ymm10,%ymm11,%ymm10 - vpaddq %ymm13,%ymm2,%ymm2 - vpblendd $3,%ymm11,%ymm14,%ymm11 - vpaddq %ymm10,%ymm3,%ymm3 - vpaddq %ymm11,%ymm4,%ymm4 - - vpsrlq $29,%ymm0,%ymm12 - vpand %ymm15,%ymm0,%ymm0 - vpsrlq $29,%ymm1,%ymm13 - vpand %ymm15,%ymm1,%ymm1 - vpsrlq $29,%ymm2,%ymm10 - vpermq $0x93,%ymm12,%ymm12 - vpand %ymm15,%ymm2,%ymm2 - 
vpsrlq $29,%ymm3,%ymm11 - vpermq $0x93,%ymm13,%ymm13 - vpand %ymm15,%ymm3,%ymm3 - vpermq $0x93,%ymm10,%ymm10 - - vpblendd $3,%ymm14,%ymm12,%ymm9 - vpermq $0x93,%ymm11,%ymm11 - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpaddq %ymm9,%ymm0,%ymm0 - vpblendd $3,%ymm13,%ymm10,%ymm13 - vpaddq %ymm12,%ymm1,%ymm1 - vpblendd $3,%ymm10,%ymm11,%ymm10 - vpaddq %ymm13,%ymm2,%ymm2 - vpblendd $3,%ymm11,%ymm14,%ymm11 - vpaddq %ymm10,%ymm3,%ymm3 - vpaddq %ymm11,%ymm4,%ymm4 - - vmovdqu %ymm0,0-128(%rdi) - vmovdqu %ymm1,32-128(%rdi) - vmovdqu %ymm2,64-128(%rdi) - vmovdqu %ymm3,96-128(%rdi) - vpsrlq $29,%ymm4,%ymm12 - vpand %ymm15,%ymm4,%ymm4 - vpsrlq $29,%ymm5,%ymm13 - vpand %ymm15,%ymm5,%ymm5 - vpsrlq $29,%ymm6,%ymm10 - vpermq $0x93,%ymm12,%ymm12 - vpand %ymm15,%ymm6,%ymm6 - vpsrlq $29,%ymm7,%ymm11 - vpermq $0x93,%ymm13,%ymm13 - vpand %ymm15,%ymm7,%ymm7 - vpsrlq $29,%ymm8,%ymm0 - vpermq $0x93,%ymm10,%ymm10 - vpand %ymm15,%ymm8,%ymm8 - vpermq $0x93,%ymm11,%ymm11 - - vpblendd $3,%ymm14,%ymm12,%ymm9 - vpermq $0x93,%ymm0,%ymm0 - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpaddq %ymm9,%ymm4,%ymm4 - vpblendd $3,%ymm13,%ymm10,%ymm13 - vpaddq %ymm12,%ymm5,%ymm5 - vpblendd $3,%ymm10,%ymm11,%ymm10 - vpaddq %ymm13,%ymm6,%ymm6 - vpblendd $3,%ymm11,%ymm0,%ymm11 - vpaddq %ymm10,%ymm7,%ymm7 - vpaddq %ymm11,%ymm8,%ymm8 - - vpsrlq $29,%ymm4,%ymm12 - vpand %ymm15,%ymm4,%ymm4 - vpsrlq $29,%ymm5,%ymm13 - vpand %ymm15,%ymm5,%ymm5 - vpsrlq $29,%ymm6,%ymm10 - vpermq $0x93,%ymm12,%ymm12 - vpand %ymm15,%ymm6,%ymm6 - vpsrlq $29,%ymm7,%ymm11 - vpermq $0x93,%ymm13,%ymm13 - vpand %ymm15,%ymm7,%ymm7 - vpsrlq $29,%ymm8,%ymm0 - vpermq $0x93,%ymm10,%ymm10 - vpand %ymm15,%ymm8,%ymm8 - vpermq $0x93,%ymm11,%ymm11 - - vpblendd $3,%ymm14,%ymm12,%ymm9 - vpermq $0x93,%ymm0,%ymm0 - vpblendd $3,%ymm12,%ymm13,%ymm12 - vpaddq %ymm9,%ymm4,%ymm4 - vpblendd $3,%ymm13,%ymm10,%ymm13 - vpaddq %ymm12,%ymm5,%ymm5 - vpblendd $3,%ymm10,%ymm11,%ymm10 - vpaddq %ymm13,%ymm6,%ymm6 - vpblendd $3,%ymm11,%ymm0,%ymm11 - vpaddq %ymm10,%ymm7,%ymm7 - vpaddq 
%ymm11,%ymm8,%ymm8 - - vmovdqu %ymm4,128-128(%rdi) - vmovdqu %ymm5,160-128(%rdi) - vmovdqu %ymm6,192-128(%rdi) - vmovdqu %ymm7,224-128(%rdi) - vmovdqu %ymm8,256-128(%rdi) - vzeroupper - - movq %rbp,%rax -.cfi_def_cfa_register %rax - movq -48(%rax),%r15 -.cfi_restore %r15 - movq -40(%rax),%r14 -.cfi_restore %r14 - movq -32(%rax),%r13 -.cfi_restore %r13 - movq -24(%rax),%r12 -.cfi_restore %r12 - movq -16(%rax),%rbp -.cfi_restore %rbp - movq -8(%rax),%rbx -.cfi_restore %rbx - leaq (%rax),%rsp -.cfi_def_cfa_register %rsp -.Lmul_1024_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2 -.globl rsaz_1024_red2norm_avx2 -.hidden rsaz_1024_red2norm_avx2 -.type rsaz_1024_red2norm_avx2,@function -.align 32 -rsaz_1024_red2norm_avx2: -.cfi_startproc - subq $-128,%rsi - xorq %rax,%rax - movq -128(%rsi),%r8 - movq -120(%rsi),%r9 - movq -112(%rsi),%r10 - shlq $0,%r8 - shlq $29,%r9 - movq %r10,%r11 - shlq $58,%r10 - shrq $6,%r11 - addq %r8,%rax - addq %r9,%rax - addq %r10,%rax - adcq $0,%r11 - movq %rax,0(%rdi) - movq %r11,%rax - movq -104(%rsi),%r8 - movq -96(%rsi),%r9 - shlq $23,%r8 - movq %r9,%r10 - shlq $52,%r9 - shrq $12,%r10 - addq %r8,%rax - addq %r9,%rax - adcq $0,%r10 - movq %rax,8(%rdi) - movq %r10,%rax - movq -88(%rsi),%r11 - movq -80(%rsi),%r8 - shlq $17,%r11 - movq %r8,%r9 - shlq $46,%r8 - shrq $18,%r9 - addq %r11,%rax - addq %r8,%rax - adcq $0,%r9 - movq %rax,16(%rdi) - movq %r9,%rax - movq -72(%rsi),%r10 - movq -64(%rsi),%r11 - shlq $11,%r10 - movq %r11,%r8 - shlq $40,%r11 - shrq $24,%r8 - addq %r10,%rax - addq %r11,%rax - adcq $0,%r8 - movq %rax,24(%rdi) - movq %r8,%rax - movq -56(%rsi),%r9 - movq -48(%rsi),%r10 - movq -40(%rsi),%r11 - shlq $5,%r9 - shlq $34,%r10 - movq %r11,%r8 - shlq $63,%r11 - shrq $1,%r8 - addq %r9,%rax - addq %r10,%rax - addq %r11,%rax - adcq $0,%r8 - movq %rax,32(%rdi) - movq %r8,%rax - movq -32(%rsi),%r9 - movq -24(%rsi),%r10 - shlq $28,%r9 - movq %r10,%r11 - shlq $57,%r10 - shrq $7,%r11 - addq %r9,%rax - 
addq %r10,%rax - adcq $0,%r11 - movq %rax,40(%rdi) - movq %r11,%rax - movq -16(%rsi),%r8 - movq -8(%rsi),%r9 - shlq $22,%r8 - movq %r9,%r10 - shlq $51,%r9 - shrq $13,%r10 - addq %r8,%rax - addq %r9,%rax - adcq $0,%r10 - movq %rax,48(%rdi) - movq %r10,%rax - movq 0(%rsi),%r11 - movq 8(%rsi),%r8 - shlq $16,%r11 - movq %r8,%r9 - shlq $45,%r8 - shrq $19,%r9 - addq %r11,%rax - addq %r8,%rax - adcq $0,%r9 - movq %rax,56(%rdi) - movq %r9,%rax - movq 16(%rsi),%r10 - movq 24(%rsi),%r11 - shlq $10,%r10 - movq %r11,%r8 - shlq $39,%r11 - shrq $25,%r8 - addq %r10,%rax - addq %r11,%rax - adcq $0,%r8 - movq %rax,64(%rdi) - movq %r8,%rax - movq 32(%rsi),%r9 - movq 40(%rsi),%r10 - movq 48(%rsi),%r11 - shlq $4,%r9 - shlq $33,%r10 - movq %r11,%r8 - shlq $62,%r11 - shrq $2,%r8 - addq %r9,%rax - addq %r10,%rax - addq %r11,%rax - adcq $0,%r8 - movq %rax,72(%rdi) - movq %r8,%rax - movq 56(%rsi),%r9 - movq 64(%rsi),%r10 - shlq $27,%r9 - movq %r10,%r11 - shlq $56,%r10 - shrq $8,%r11 - addq %r9,%rax - addq %r10,%rax - adcq $0,%r11 - movq %rax,80(%rdi) - movq %r11,%rax - movq 72(%rsi),%r8 - movq 80(%rsi),%r9 - shlq $21,%r8 - movq %r9,%r10 - shlq $50,%r9 - shrq $14,%r10 - addq %r8,%rax - addq %r9,%rax - adcq $0,%r10 - movq %rax,88(%rdi) - movq %r10,%rax - movq 88(%rsi),%r11 - movq 96(%rsi),%r8 - shlq $15,%r11 - movq %r8,%r9 - shlq $44,%r8 - shrq $20,%r9 - addq %r11,%rax - addq %r8,%rax - adcq $0,%r9 - movq %rax,96(%rdi) - movq %r9,%rax - movq 104(%rsi),%r10 - movq 112(%rsi),%r11 - shlq $9,%r10 - movq %r11,%r8 - shlq $38,%r11 - shrq $26,%r8 - addq %r10,%rax - addq %r11,%rax - adcq $0,%r8 - movq %rax,104(%rdi) - movq %r8,%rax - movq 120(%rsi),%r9 - movq 128(%rsi),%r10 - movq 136(%rsi),%r11 - shlq $3,%r9 - shlq $32,%r10 - movq %r11,%r8 - shlq $61,%r11 - shrq $3,%r8 - addq %r9,%rax - addq %r10,%rax - addq %r11,%rax - adcq $0,%r8 - movq %rax,112(%rdi) - movq %r8,%rax - movq 144(%rsi),%r9 - movq 152(%rsi),%r10 - shlq $26,%r9 - movq %r10,%r11 - shlq $55,%r10 - shrq $9,%r11 - addq %r9,%rax - addq 
%r10,%rax - adcq $0,%r11 - movq %rax,120(%rdi) - movq %r11,%rax - .byte 0xf3,0xc3 -.cfi_endproc -.size rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2 - -.globl rsaz_1024_norm2red_avx2 -.hidden rsaz_1024_norm2red_avx2 -.type rsaz_1024_norm2red_avx2,@function -.align 32 -rsaz_1024_norm2red_avx2: -.cfi_startproc - subq $-128,%rdi - movq (%rsi),%r8 - movl $0x1fffffff,%eax - movq 8(%rsi),%r9 - movq %r8,%r11 - shrq $0,%r11 - andq %rax,%r11 - movq %r11,-128(%rdi) - movq %r8,%r10 - shrq $29,%r10 - andq %rax,%r10 - movq %r10,-120(%rdi) - shrdq $58,%r9,%r8 - andq %rax,%r8 - movq %r8,-112(%rdi) - movq 16(%rsi),%r10 - movq %r9,%r8 - shrq $23,%r8 - andq %rax,%r8 - movq %r8,-104(%rdi) - shrdq $52,%r10,%r9 - andq %rax,%r9 - movq %r9,-96(%rdi) - movq 24(%rsi),%r11 - movq %r10,%r9 - shrq $17,%r9 - andq %rax,%r9 - movq %r9,-88(%rdi) - shrdq $46,%r11,%r10 - andq %rax,%r10 - movq %r10,-80(%rdi) - movq 32(%rsi),%r8 - movq %r11,%r10 - shrq $11,%r10 - andq %rax,%r10 - movq %r10,-72(%rdi) - shrdq $40,%r8,%r11 - andq %rax,%r11 - movq %r11,-64(%rdi) - movq 40(%rsi),%r9 - movq %r8,%r11 - shrq $5,%r11 - andq %rax,%r11 - movq %r11,-56(%rdi) - movq %r8,%r10 - shrq $34,%r10 - andq %rax,%r10 - movq %r10,-48(%rdi) - shrdq $63,%r9,%r8 - andq %rax,%r8 - movq %r8,-40(%rdi) - movq 48(%rsi),%r10 - movq %r9,%r8 - shrq $28,%r8 - andq %rax,%r8 - movq %r8,-32(%rdi) - shrdq $57,%r10,%r9 - andq %rax,%r9 - movq %r9,-24(%rdi) - movq 56(%rsi),%r11 - movq %r10,%r9 - shrq $22,%r9 - andq %rax,%r9 - movq %r9,-16(%rdi) - shrdq $51,%r11,%r10 - andq %rax,%r10 - movq %r10,-8(%rdi) - movq 64(%rsi),%r8 - movq %r11,%r10 - shrq $16,%r10 - andq %rax,%r10 - movq %r10,0(%rdi) - shrdq $45,%r8,%r11 - andq %rax,%r11 - movq %r11,8(%rdi) - movq 72(%rsi),%r9 - movq %r8,%r11 - shrq $10,%r11 - andq %rax,%r11 - movq %r11,16(%rdi) - shrdq $39,%r9,%r8 - andq %rax,%r8 - movq %r8,24(%rdi) - movq 80(%rsi),%r10 - movq %r9,%r8 - shrq $4,%r8 - andq %rax,%r8 - movq %r8,32(%rdi) - movq %r9,%r11 - shrq $33,%r11 - andq %rax,%r11 - movq 
%r11,40(%rdi) - shrdq $62,%r10,%r9 - andq %rax,%r9 - movq %r9,48(%rdi) - movq 88(%rsi),%r11 - movq %r10,%r9 - shrq $27,%r9 - andq %rax,%r9 - movq %r9,56(%rdi) - shrdq $56,%r11,%r10 - andq %rax,%r10 - movq %r10,64(%rdi) - movq 96(%rsi),%r8 - movq %r11,%r10 - shrq $21,%r10 - andq %rax,%r10 - movq %r10,72(%rdi) - shrdq $50,%r8,%r11 - andq %rax,%r11 - movq %r11,80(%rdi) - movq 104(%rsi),%r9 - movq %r8,%r11 - shrq $15,%r11 - andq %rax,%r11 - movq %r11,88(%rdi) - shrdq $44,%r9,%r8 - andq %rax,%r8 - movq %r8,96(%rdi) - movq 112(%rsi),%r10 - movq %r9,%r8 - shrq $9,%r8 - andq %rax,%r8 - movq %r8,104(%rdi) - shrdq $38,%r10,%r9 - andq %rax,%r9 - movq %r9,112(%rdi) - movq 120(%rsi),%r11 - movq %r10,%r9 - shrq $3,%r9 - andq %rax,%r9 - movq %r9,120(%rdi) - movq %r10,%r8 - shrq $32,%r8 - andq %rax,%r8 - movq %r8,128(%rdi) - shrdq $61,%r11,%r10 - andq %rax,%r10 - movq %r10,136(%rdi) - xorq %r8,%r8 - movq %r11,%r10 - shrq $26,%r10 - andq %rax,%r10 - movq %r10,144(%rdi) - shrdq $55,%r8,%r11 - andq %rax,%r11 - movq %r11,152(%rdi) - movq %r8,160(%rdi) - movq %r8,168(%rdi) - movq %r8,176(%rdi) - movq %r8,184(%rdi) - .byte 0xf3,0xc3 -.cfi_endproc -.size rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2 -.globl rsaz_1024_scatter5_avx2 -.hidden rsaz_1024_scatter5_avx2 -.type rsaz_1024_scatter5_avx2,@function -.align 32 -rsaz_1024_scatter5_avx2: -.cfi_startproc - vzeroupper - vmovdqu .Lscatter_permd(%rip),%ymm5 - shll $4,%edx - leaq (%rdi,%rdx,1),%rdi - movl $9,%eax - jmp .Loop_scatter_1024 - -.align 32 -.Loop_scatter_1024: - vmovdqu (%rsi),%ymm0 - leaq 32(%rsi),%rsi - vpermd %ymm0,%ymm5,%ymm0 - vmovdqu %xmm0,(%rdi) - leaq 512(%rdi),%rdi - decl %eax - jnz .Loop_scatter_1024 - - vzeroupper - .byte 0xf3,0xc3 -.cfi_endproc -.size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2 - -.globl rsaz_1024_gather5_avx2 -.hidden rsaz_1024_gather5_avx2 -.type rsaz_1024_gather5_avx2,@function -.align 32 -rsaz_1024_gather5_avx2: -.cfi_startproc - vzeroupper - movq %rsp,%r11 -.cfi_def_cfa_register %r11 - 
leaq -256(%rsp),%rsp - andq $-32,%rsp - leaq .Linc(%rip),%r10 - leaq -128(%rsp),%rax - - vmovd %edx,%xmm4 - vmovdqa (%r10),%ymm0 - vmovdqa 32(%r10),%ymm1 - vmovdqa 64(%r10),%ymm5 - vpbroadcastd %xmm4,%ymm4 - - vpaddd %ymm5,%ymm0,%ymm2 - vpcmpeqd %ymm4,%ymm0,%ymm0 - vpaddd %ymm5,%ymm1,%ymm3 - vpcmpeqd %ymm4,%ymm1,%ymm1 - vmovdqa %ymm0,0+128(%rax) - vpaddd %ymm5,%ymm2,%ymm0 - vpcmpeqd %ymm4,%ymm2,%ymm2 - vmovdqa %ymm1,32+128(%rax) - vpaddd %ymm5,%ymm3,%ymm1 - vpcmpeqd %ymm4,%ymm3,%ymm3 - vmovdqa %ymm2,64+128(%rax) - vpaddd %ymm5,%ymm0,%ymm2 - vpcmpeqd %ymm4,%ymm0,%ymm0 - vmovdqa %ymm3,96+128(%rax) - vpaddd %ymm5,%ymm1,%ymm3 - vpcmpeqd %ymm4,%ymm1,%ymm1 - vmovdqa %ymm0,128+128(%rax) - vpaddd %ymm5,%ymm2,%ymm8 - vpcmpeqd %ymm4,%ymm2,%ymm2 - vmovdqa %ymm1,160+128(%rax) - vpaddd %ymm5,%ymm3,%ymm9 - vpcmpeqd %ymm4,%ymm3,%ymm3 - vmovdqa %ymm2,192+128(%rax) - vpaddd %ymm5,%ymm8,%ymm10 - vpcmpeqd %ymm4,%ymm8,%ymm8 - vmovdqa %ymm3,224+128(%rax) - vpaddd %ymm5,%ymm9,%ymm11 - vpcmpeqd %ymm4,%ymm9,%ymm9 - vpaddd %ymm5,%ymm10,%ymm12 - vpcmpeqd %ymm4,%ymm10,%ymm10 - vpaddd %ymm5,%ymm11,%ymm13 - vpcmpeqd %ymm4,%ymm11,%ymm11 - vpaddd %ymm5,%ymm12,%ymm14 - vpcmpeqd %ymm4,%ymm12,%ymm12 - vpaddd %ymm5,%ymm13,%ymm15 - vpcmpeqd %ymm4,%ymm13,%ymm13 - vpcmpeqd %ymm4,%ymm14,%ymm14 - vpcmpeqd %ymm4,%ymm15,%ymm15 - - vmovdqa -32(%r10),%ymm7 - leaq 128(%rsi),%rsi - movl $9,%edx - -.Loop_gather_1024: - vmovdqa 0-128(%rsi),%ymm0 - vmovdqa 32-128(%rsi),%ymm1 - vmovdqa 64-128(%rsi),%ymm2 - vmovdqa 96-128(%rsi),%ymm3 - vpand 0+128(%rax),%ymm0,%ymm0 - vpand 32+128(%rax),%ymm1,%ymm1 - vpand 64+128(%rax),%ymm2,%ymm2 - vpor %ymm0,%ymm1,%ymm4 - vpand 96+128(%rax),%ymm3,%ymm3 - vmovdqa 128-128(%rsi),%ymm0 - vmovdqa 160-128(%rsi),%ymm1 - vpor %ymm2,%ymm3,%ymm5 - vmovdqa 192-128(%rsi),%ymm2 - vmovdqa 224-128(%rsi),%ymm3 - vpand 128+128(%rax),%ymm0,%ymm0 - vpand 160+128(%rax),%ymm1,%ymm1 - vpand 192+128(%rax),%ymm2,%ymm2 - vpor %ymm0,%ymm4,%ymm4 - vpand 224+128(%rax),%ymm3,%ymm3 - vpand 
256-128(%rsi),%ymm8,%ymm0 - vpor %ymm1,%ymm5,%ymm5 - vpand 288-128(%rsi),%ymm9,%ymm1 - vpor %ymm2,%ymm4,%ymm4 - vpand 320-128(%rsi),%ymm10,%ymm2 - vpor %ymm3,%ymm5,%ymm5 - vpand 352-128(%rsi),%ymm11,%ymm3 - vpor %ymm0,%ymm4,%ymm4 - vpand 384-128(%rsi),%ymm12,%ymm0 - vpor %ymm1,%ymm5,%ymm5 - vpand 416-128(%rsi),%ymm13,%ymm1 - vpor %ymm2,%ymm4,%ymm4 - vpand 448-128(%rsi),%ymm14,%ymm2 - vpor %ymm3,%ymm5,%ymm5 - vpand 480-128(%rsi),%ymm15,%ymm3 - leaq 512(%rsi),%rsi - vpor %ymm0,%ymm4,%ymm4 - vpor %ymm1,%ymm5,%ymm5 - vpor %ymm2,%ymm4,%ymm4 - vpor %ymm3,%ymm5,%ymm5 - - vpor %ymm5,%ymm4,%ymm4 - vextracti128 $1,%ymm4,%xmm5 - vpor %xmm4,%xmm5,%xmm5 - vpermd %ymm5,%ymm7,%ymm5 - vmovdqu %ymm5,(%rdi) - leaq 32(%rdi),%rdi - decl %edx - jnz .Loop_gather_1024 - - vpxor %ymm0,%ymm0,%ymm0 - vmovdqu %ymm0,(%rdi) - vzeroupper - leaq (%r11),%rsp -.cfi_def_cfa_register %rsp - .byte 0xf3,0xc3 -.cfi_endproc -.LSEH_end_rsaz_1024_gather5: -.size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2 -.align 64 -.Land_mask: -.quad 0x1fffffff,0x1fffffff,0x1fffffff,0x1fffffff -.Lscatter_permd: -.long 0,2,4,6,7,7,7,7 -.Lgather_permd: -.long 0,7,1,7,2,7,3,7 -.Linc: -.long 0,0,0,0, 1,1,1,1 -.long 2,2,2,2, 3,3,3,3 -.long 4,4,4,4, 4,4,4,4 -.align 64 -#endif -.section .note.GNU-stack,"",@progbits diff --git a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha1-x86_64.S b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha1-x86_64.S deleted file mode 100644 index cf2e7bc7..00000000 --- a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha1-x86_64.S +++ /dev/null @@ -1,5468 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text -.extern OPENSSL_ia32cap_P -.hidden OPENSSL_ia32cap_P - -.globl sha1_block_data_order -.hidden sha1_block_data_order -.type sha1_block_data_order,@function -.align 16 -sha1_block_data_order: -.cfi_startproc - leaq OPENSSL_ia32cap_P(%rip),%r10 - movl 0(%r10),%r9d - movl 4(%r10),%r8d - movl 8(%r10),%r10d - testl $512,%r8d - jz .Lialu - testl $536870912,%r10d - jnz _shaext_shortcut - andl $296,%r10d - cmpl $296,%r10d - je _avx2_shortcut - andl $268435456,%r8d - andl $1073741824,%r9d - orl %r9d,%r8d - cmpl $1342177280,%r8d - je _avx_shortcut - jmp _ssse3_shortcut - -.align 16 -.Lialu: - movq %rsp,%rax -.cfi_def_cfa_register %rax - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - movq %rdi,%r8 - subq $72,%rsp - movq %rsi,%r9 - andq $-64,%rsp - movq %rdx,%r10 - movq %rax,64(%rsp) -.cfi_escape 0x0f,0x06,0x77,0xc0,0x00,0x06,0x23,0x08 -.Lprologue: - - movl 0(%r8),%esi - movl 4(%r8),%edi - movl 8(%r8),%r11d - movl 12(%r8),%r12d - movl 16(%r8),%r13d - jmp .Lloop - -.align 16 -.Lloop: - movl 0(%r9),%edx - bswapl %edx - movl 4(%r9),%ebp - movl %r12d,%eax - movl %edx,0(%rsp) - movl %esi,%ecx - bswapl %ebp - xorl %r11d,%eax - roll $5,%ecx - andl %edi,%eax - leal 1518500249(%rdx,%r13,1),%r13d - addl %ecx,%r13d - xorl %r12d,%eax - roll $30,%edi - addl %eax,%r13d - movl 8(%r9),%r14d - movl %r11d,%eax - movl %ebp,4(%rsp) - movl %r13d,%ecx - bswapl %r14d - xorl %edi,%eax - roll $5,%ecx - andl %esi,%eax - leal 1518500249(%rbp,%r12,1),%r12d - addl %ecx,%r12d - xorl %r11d,%eax - roll $30,%esi - addl %eax,%r12d - movl 12(%r9),%edx - movl %edi,%eax - movl %r14d,8(%rsp) - movl %r12d,%ecx - bswapl %edx - xorl %esi,%eax - roll 
$5,%ecx - andl %r13d,%eax - leal 1518500249(%r14,%r11,1),%r11d - addl %ecx,%r11d - xorl %edi,%eax - roll $30,%r13d - addl %eax,%r11d - movl 16(%r9),%ebp - movl %esi,%eax - movl %edx,12(%rsp) - movl %r11d,%ecx - bswapl %ebp - xorl %r13d,%eax - roll $5,%ecx - andl %r12d,%eax - leal 1518500249(%rdx,%rdi,1),%edi - addl %ecx,%edi - xorl %esi,%eax - roll $30,%r12d - addl %eax,%edi - movl 20(%r9),%r14d - movl %r13d,%eax - movl %ebp,16(%rsp) - movl %edi,%ecx - bswapl %r14d - xorl %r12d,%eax - roll $5,%ecx - andl %r11d,%eax - leal 1518500249(%rbp,%rsi,1),%esi - addl %ecx,%esi - xorl %r13d,%eax - roll $30,%r11d - addl %eax,%esi - movl 24(%r9),%edx - movl %r12d,%eax - movl %r14d,20(%rsp) - movl %esi,%ecx - bswapl %edx - xorl %r11d,%eax - roll $5,%ecx - andl %edi,%eax - leal 1518500249(%r14,%r13,1),%r13d - addl %ecx,%r13d - xorl %r12d,%eax - roll $30,%edi - addl %eax,%r13d - movl 28(%r9),%ebp - movl %r11d,%eax - movl %edx,24(%rsp) - movl %r13d,%ecx - bswapl %ebp - xorl %edi,%eax - roll $5,%ecx - andl %esi,%eax - leal 1518500249(%rdx,%r12,1),%r12d - addl %ecx,%r12d - xorl %r11d,%eax - roll $30,%esi - addl %eax,%r12d - movl 32(%r9),%r14d - movl %edi,%eax - movl %ebp,28(%rsp) - movl %r12d,%ecx - bswapl %r14d - xorl %esi,%eax - roll $5,%ecx - andl %r13d,%eax - leal 1518500249(%rbp,%r11,1),%r11d - addl %ecx,%r11d - xorl %edi,%eax - roll $30,%r13d - addl %eax,%r11d - movl 36(%r9),%edx - movl %esi,%eax - movl %r14d,32(%rsp) - movl %r11d,%ecx - bswapl %edx - xorl %r13d,%eax - roll $5,%ecx - andl %r12d,%eax - leal 1518500249(%r14,%rdi,1),%edi - addl %ecx,%edi - xorl %esi,%eax - roll $30,%r12d - addl %eax,%edi - movl 40(%r9),%ebp - movl %r13d,%eax - movl %edx,36(%rsp) - movl %edi,%ecx - bswapl %ebp - xorl %r12d,%eax - roll $5,%ecx - andl %r11d,%eax - leal 1518500249(%rdx,%rsi,1),%esi - addl %ecx,%esi - xorl %r13d,%eax - roll $30,%r11d - addl %eax,%esi - movl 44(%r9),%r14d - movl %r12d,%eax - movl %ebp,40(%rsp) - movl %esi,%ecx - bswapl %r14d - xorl %r11d,%eax - roll $5,%ecx - andl 
%edi,%eax - leal 1518500249(%rbp,%r13,1),%r13d - addl %ecx,%r13d - xorl %r12d,%eax - roll $30,%edi - addl %eax,%r13d - movl 48(%r9),%edx - movl %r11d,%eax - movl %r14d,44(%rsp) - movl %r13d,%ecx - bswapl %edx - xorl %edi,%eax - roll $5,%ecx - andl %esi,%eax - leal 1518500249(%r14,%r12,1),%r12d - addl %ecx,%r12d - xorl %r11d,%eax - roll $30,%esi - addl %eax,%r12d - movl 52(%r9),%ebp - movl %edi,%eax - movl %edx,48(%rsp) - movl %r12d,%ecx - bswapl %ebp - xorl %esi,%eax - roll $5,%ecx - andl %r13d,%eax - leal 1518500249(%rdx,%r11,1),%r11d - addl %ecx,%r11d - xorl %edi,%eax - roll $30,%r13d - addl %eax,%r11d - movl 56(%r9),%r14d - movl %esi,%eax - movl %ebp,52(%rsp) - movl %r11d,%ecx - bswapl %r14d - xorl %r13d,%eax - roll $5,%ecx - andl %r12d,%eax - leal 1518500249(%rbp,%rdi,1),%edi - addl %ecx,%edi - xorl %esi,%eax - roll $30,%r12d - addl %eax,%edi - movl 60(%r9),%edx - movl %r13d,%eax - movl %r14d,56(%rsp) - movl %edi,%ecx - bswapl %edx - xorl %r12d,%eax - roll $5,%ecx - andl %r11d,%eax - leal 1518500249(%r14,%rsi,1),%esi - addl %ecx,%esi - xorl %r13d,%eax - roll $30,%r11d - addl %eax,%esi - xorl 0(%rsp),%ebp - movl %r12d,%eax - movl %edx,60(%rsp) - movl %esi,%ecx - xorl 8(%rsp),%ebp - xorl %r11d,%eax - roll $5,%ecx - xorl 32(%rsp),%ebp - andl %edi,%eax - leal 1518500249(%rdx,%r13,1),%r13d - roll $30,%edi - xorl %r12d,%eax - addl %ecx,%r13d - roll $1,%ebp - addl %eax,%r13d - xorl 4(%rsp),%r14d - movl %r11d,%eax - movl %ebp,0(%rsp) - movl %r13d,%ecx - xorl 12(%rsp),%r14d - xorl %edi,%eax - roll $5,%ecx - xorl 36(%rsp),%r14d - andl %esi,%eax - leal 1518500249(%rbp,%r12,1),%r12d - roll $30,%esi - xorl %r11d,%eax - addl %ecx,%r12d - roll $1,%r14d - addl %eax,%r12d - xorl 8(%rsp),%edx - movl %edi,%eax - movl %r14d,4(%rsp) - movl %r12d,%ecx - xorl 16(%rsp),%edx - xorl %esi,%eax - roll $5,%ecx - xorl 40(%rsp),%edx - andl %r13d,%eax - leal 1518500249(%r14,%r11,1),%r11d - roll $30,%r13d - xorl %edi,%eax - addl %ecx,%r11d - roll $1,%edx - addl %eax,%r11d - xorl 12(%rsp),%ebp 
- movl %esi,%eax - movl %edx,8(%rsp) - movl %r11d,%ecx - xorl 20(%rsp),%ebp - xorl %r13d,%eax - roll $5,%ecx - xorl 44(%rsp),%ebp - andl %r12d,%eax - leal 1518500249(%rdx,%rdi,1),%edi - roll $30,%r12d - xorl %esi,%eax - addl %ecx,%edi - roll $1,%ebp - addl %eax,%edi - xorl 16(%rsp),%r14d - movl %r13d,%eax - movl %ebp,12(%rsp) - movl %edi,%ecx - xorl 24(%rsp),%r14d - xorl %r12d,%eax - roll $5,%ecx - xorl 48(%rsp),%r14d - andl %r11d,%eax - leal 1518500249(%rbp,%rsi,1),%esi - roll $30,%r11d - xorl %r13d,%eax - addl %ecx,%esi - roll $1,%r14d - addl %eax,%esi - xorl 20(%rsp),%edx - movl %edi,%eax - movl %r14d,16(%rsp) - movl %esi,%ecx - xorl 28(%rsp),%edx - xorl %r12d,%eax - roll $5,%ecx - xorl 52(%rsp),%edx - leal 1859775393(%r14,%r13,1),%r13d - xorl %r11d,%eax - addl %ecx,%r13d - roll $30,%edi - addl %eax,%r13d - roll $1,%edx - xorl 24(%rsp),%ebp - movl %esi,%eax - movl %edx,20(%rsp) - movl %r13d,%ecx - xorl 32(%rsp),%ebp - xorl %r11d,%eax - roll $5,%ecx - xorl 56(%rsp),%ebp - leal 1859775393(%rdx,%r12,1),%r12d - xorl %edi,%eax - addl %ecx,%r12d - roll $30,%esi - addl %eax,%r12d - roll $1,%ebp - xorl 28(%rsp),%r14d - movl %r13d,%eax - movl %ebp,24(%rsp) - movl %r12d,%ecx - xorl 36(%rsp),%r14d - xorl %edi,%eax - roll $5,%ecx - xorl 60(%rsp),%r14d - leal 1859775393(%rbp,%r11,1),%r11d - xorl %esi,%eax - addl %ecx,%r11d - roll $30,%r13d - addl %eax,%r11d - roll $1,%r14d - xorl 32(%rsp),%edx - movl %r12d,%eax - movl %r14d,28(%rsp) - movl %r11d,%ecx - xorl 40(%rsp),%edx - xorl %esi,%eax - roll $5,%ecx - xorl 0(%rsp),%edx - leal 1859775393(%r14,%rdi,1),%edi - xorl %r13d,%eax - addl %ecx,%edi - roll $30,%r12d - addl %eax,%edi - roll $1,%edx - xorl 36(%rsp),%ebp - movl %r11d,%eax - movl %edx,32(%rsp) - movl %edi,%ecx - xorl 44(%rsp),%ebp - xorl %r13d,%eax - roll $5,%ecx - xorl 4(%rsp),%ebp - leal 1859775393(%rdx,%rsi,1),%esi - xorl %r12d,%eax - addl %ecx,%esi - roll $30,%r11d - addl %eax,%esi - roll $1,%ebp - xorl 40(%rsp),%r14d - movl %edi,%eax - movl %ebp,36(%rsp) - movl 
%esi,%ecx - xorl 48(%rsp),%r14d - xorl %r12d,%eax - roll $5,%ecx - xorl 8(%rsp),%r14d - leal 1859775393(%rbp,%r13,1),%r13d - xorl %r11d,%eax - addl %ecx,%r13d - roll $30,%edi - addl %eax,%r13d - roll $1,%r14d - xorl 44(%rsp),%edx - movl %esi,%eax - movl %r14d,40(%rsp) - movl %r13d,%ecx - xorl 52(%rsp),%edx - xorl %r11d,%eax - roll $5,%ecx - xorl 12(%rsp),%edx - leal 1859775393(%r14,%r12,1),%r12d - xorl %edi,%eax - addl %ecx,%r12d - roll $30,%esi - addl %eax,%r12d - roll $1,%edx - xorl 48(%rsp),%ebp - movl %r13d,%eax - movl %edx,44(%rsp) - movl %r12d,%ecx - xorl 56(%rsp),%ebp - xorl %edi,%eax - roll $5,%ecx - xorl 16(%rsp),%ebp - leal 1859775393(%rdx,%r11,1),%r11d - xorl %esi,%eax - addl %ecx,%r11d - roll $30,%r13d - addl %eax,%r11d - roll $1,%ebp - xorl 52(%rsp),%r14d - movl %r12d,%eax - movl %ebp,48(%rsp) - movl %r11d,%ecx - xorl 60(%rsp),%r14d - xorl %esi,%eax - roll $5,%ecx - xorl 20(%rsp),%r14d - leal 1859775393(%rbp,%rdi,1),%edi - xorl %r13d,%eax - addl %ecx,%edi - roll $30,%r12d - addl %eax,%edi - roll $1,%r14d - xorl 56(%rsp),%edx - movl %r11d,%eax - movl %r14d,52(%rsp) - movl %edi,%ecx - xorl 0(%rsp),%edx - xorl %r13d,%eax - roll $5,%ecx - xorl 24(%rsp),%edx - leal 1859775393(%r14,%rsi,1),%esi - xorl %r12d,%eax - addl %ecx,%esi - roll $30,%r11d - addl %eax,%esi - roll $1,%edx - xorl 60(%rsp),%ebp - movl %edi,%eax - movl %edx,56(%rsp) - movl %esi,%ecx - xorl 4(%rsp),%ebp - xorl %r12d,%eax - roll $5,%ecx - xorl 28(%rsp),%ebp - leal 1859775393(%rdx,%r13,1),%r13d - xorl %r11d,%eax - addl %ecx,%r13d - roll $30,%edi - addl %eax,%r13d - roll $1,%ebp - xorl 0(%rsp),%r14d - movl %esi,%eax - movl %ebp,60(%rsp) - movl %r13d,%ecx - xorl 8(%rsp),%r14d - xorl %r11d,%eax - roll $5,%ecx - xorl 32(%rsp),%r14d - leal 1859775393(%rbp,%r12,1),%r12d - xorl %edi,%eax - addl %ecx,%r12d - roll $30,%esi - addl %eax,%r12d - roll $1,%r14d - xorl 4(%rsp),%edx - movl %r13d,%eax - movl %r14d,0(%rsp) - movl %r12d,%ecx - xorl 12(%rsp),%edx - xorl %edi,%eax - roll $5,%ecx - xorl 
36(%rsp),%edx - leal 1859775393(%r14,%r11,1),%r11d - xorl %esi,%eax - addl %ecx,%r11d - roll $30,%r13d - addl %eax,%r11d - roll $1,%edx - xorl 8(%rsp),%ebp - movl %r12d,%eax - movl %edx,4(%rsp) - movl %r11d,%ecx - xorl 16(%rsp),%ebp - xorl %esi,%eax - roll $5,%ecx - xorl 40(%rsp),%ebp - leal 1859775393(%rdx,%rdi,1),%edi - xorl %r13d,%eax - addl %ecx,%edi - roll $30,%r12d - addl %eax,%edi - roll $1,%ebp - xorl 12(%rsp),%r14d - movl %r11d,%eax - movl %ebp,8(%rsp) - movl %edi,%ecx - xorl 20(%rsp),%r14d - xorl %r13d,%eax - roll $5,%ecx - xorl 44(%rsp),%r14d - leal 1859775393(%rbp,%rsi,1),%esi - xorl %r12d,%eax - addl %ecx,%esi - roll $30,%r11d - addl %eax,%esi - roll $1,%r14d - xorl 16(%rsp),%edx - movl %edi,%eax - movl %r14d,12(%rsp) - movl %esi,%ecx - xorl 24(%rsp),%edx - xorl %r12d,%eax - roll $5,%ecx - xorl 48(%rsp),%edx - leal 1859775393(%r14,%r13,1),%r13d - xorl %r11d,%eax - addl %ecx,%r13d - roll $30,%edi - addl %eax,%r13d - roll $1,%edx - xorl 20(%rsp),%ebp - movl %esi,%eax - movl %edx,16(%rsp) - movl %r13d,%ecx - xorl 28(%rsp),%ebp - xorl %r11d,%eax - roll $5,%ecx - xorl 52(%rsp),%ebp - leal 1859775393(%rdx,%r12,1),%r12d - xorl %edi,%eax - addl %ecx,%r12d - roll $30,%esi - addl %eax,%r12d - roll $1,%ebp - xorl 24(%rsp),%r14d - movl %r13d,%eax - movl %ebp,20(%rsp) - movl %r12d,%ecx - xorl 32(%rsp),%r14d - xorl %edi,%eax - roll $5,%ecx - xorl 56(%rsp),%r14d - leal 1859775393(%rbp,%r11,1),%r11d - xorl %esi,%eax - addl %ecx,%r11d - roll $30,%r13d - addl %eax,%r11d - roll $1,%r14d - xorl 28(%rsp),%edx - movl %r12d,%eax - movl %r14d,24(%rsp) - movl %r11d,%ecx - xorl 36(%rsp),%edx - xorl %esi,%eax - roll $5,%ecx - xorl 60(%rsp),%edx - leal 1859775393(%r14,%rdi,1),%edi - xorl %r13d,%eax - addl %ecx,%edi - roll $30,%r12d - addl %eax,%edi - roll $1,%edx - xorl 32(%rsp),%ebp - movl %r11d,%eax - movl %edx,28(%rsp) - movl %edi,%ecx - xorl 40(%rsp),%ebp - xorl %r13d,%eax - roll $5,%ecx - xorl 0(%rsp),%ebp - leal 1859775393(%rdx,%rsi,1),%esi - xorl %r12d,%eax - addl 
%ecx,%esi - roll $30,%r11d - addl %eax,%esi - roll $1,%ebp - xorl 36(%rsp),%r14d - movl %r12d,%eax - movl %ebp,32(%rsp) - movl %r12d,%ebx - xorl 44(%rsp),%r14d - andl %r11d,%eax - movl %esi,%ecx - xorl 4(%rsp),%r14d - leal -1894007588(%rbp,%r13,1),%r13d - xorl %r11d,%ebx - roll $5,%ecx - addl %eax,%r13d - roll $1,%r14d - andl %edi,%ebx - addl %ecx,%r13d - roll $30,%edi - addl %ebx,%r13d - xorl 40(%rsp),%edx - movl %r11d,%eax - movl %r14d,36(%rsp) - movl %r11d,%ebx - xorl 48(%rsp),%edx - andl %edi,%eax - movl %r13d,%ecx - xorl 8(%rsp),%edx - leal -1894007588(%r14,%r12,1),%r12d - xorl %edi,%ebx - roll $5,%ecx - addl %eax,%r12d - roll $1,%edx - andl %esi,%ebx - addl %ecx,%r12d - roll $30,%esi - addl %ebx,%r12d - xorl 44(%rsp),%ebp - movl %edi,%eax - movl %edx,40(%rsp) - movl %edi,%ebx - xorl 52(%rsp),%ebp - andl %esi,%eax - movl %r12d,%ecx - xorl 12(%rsp),%ebp - leal -1894007588(%rdx,%r11,1),%r11d - xorl %esi,%ebx - roll $5,%ecx - addl %eax,%r11d - roll $1,%ebp - andl %r13d,%ebx - addl %ecx,%r11d - roll $30,%r13d - addl %ebx,%r11d - xorl 48(%rsp),%r14d - movl %esi,%eax - movl %ebp,44(%rsp) - movl %esi,%ebx - xorl 56(%rsp),%r14d - andl %r13d,%eax - movl %r11d,%ecx - xorl 16(%rsp),%r14d - leal -1894007588(%rbp,%rdi,1),%edi - xorl %r13d,%ebx - roll $5,%ecx - addl %eax,%edi - roll $1,%r14d - andl %r12d,%ebx - addl %ecx,%edi - roll $30,%r12d - addl %ebx,%edi - xorl 52(%rsp),%edx - movl %r13d,%eax - movl %r14d,48(%rsp) - movl %r13d,%ebx - xorl 60(%rsp),%edx - andl %r12d,%eax - movl %edi,%ecx - xorl 20(%rsp),%edx - leal -1894007588(%r14,%rsi,1),%esi - xorl %r12d,%ebx - roll $5,%ecx - addl %eax,%esi - roll $1,%edx - andl %r11d,%ebx - addl %ecx,%esi - roll $30,%r11d - addl %ebx,%esi - xorl 56(%rsp),%ebp - movl %r12d,%eax - movl %edx,52(%rsp) - movl %r12d,%ebx - xorl 0(%rsp),%ebp - andl %r11d,%eax - movl %esi,%ecx - xorl 24(%rsp),%ebp - leal -1894007588(%rdx,%r13,1),%r13d - xorl %r11d,%ebx - roll $5,%ecx - addl %eax,%r13d - roll $1,%ebp - andl %edi,%ebx - addl %ecx,%r13d - roll 
$30,%edi - addl %ebx,%r13d - xorl 60(%rsp),%r14d - movl %r11d,%eax - movl %ebp,56(%rsp) - movl %r11d,%ebx - xorl 4(%rsp),%r14d - andl %edi,%eax - movl %r13d,%ecx - xorl 28(%rsp),%r14d - leal -1894007588(%rbp,%r12,1),%r12d - xorl %edi,%ebx - roll $5,%ecx - addl %eax,%r12d - roll $1,%r14d - andl %esi,%ebx - addl %ecx,%r12d - roll $30,%esi - addl %ebx,%r12d - xorl 0(%rsp),%edx - movl %edi,%eax - movl %r14d,60(%rsp) - movl %edi,%ebx - xorl 8(%rsp),%edx - andl %esi,%eax - movl %r12d,%ecx - xorl 32(%rsp),%edx - leal -1894007588(%r14,%r11,1),%r11d - xorl %esi,%ebx - roll $5,%ecx - addl %eax,%r11d - roll $1,%edx - andl %r13d,%ebx - addl %ecx,%r11d - roll $30,%r13d - addl %ebx,%r11d - xorl 4(%rsp),%ebp - movl %esi,%eax - movl %edx,0(%rsp) - movl %esi,%ebx - xorl 12(%rsp),%ebp - andl %r13d,%eax - movl %r11d,%ecx - xorl 36(%rsp),%ebp - leal -1894007588(%rdx,%rdi,1),%edi - xorl %r13d,%ebx - roll $5,%ecx - addl %eax,%edi - roll $1,%ebp - andl %r12d,%ebx - addl %ecx,%edi - roll $30,%r12d - addl %ebx,%edi - xorl 8(%rsp),%r14d - movl %r13d,%eax - movl %ebp,4(%rsp) - movl %r13d,%ebx - xorl 16(%rsp),%r14d - andl %r12d,%eax - movl %edi,%ecx - xorl 40(%rsp),%r14d - leal -1894007588(%rbp,%rsi,1),%esi - xorl %r12d,%ebx - roll $5,%ecx - addl %eax,%esi - roll $1,%r14d - andl %r11d,%ebx - addl %ecx,%esi - roll $30,%r11d - addl %ebx,%esi - xorl 12(%rsp),%edx - movl %r12d,%eax - movl %r14d,8(%rsp) - movl %r12d,%ebx - xorl 20(%rsp),%edx - andl %r11d,%eax - movl %esi,%ecx - xorl 44(%rsp),%edx - leal -1894007588(%r14,%r13,1),%r13d - xorl %r11d,%ebx - roll $5,%ecx - addl %eax,%r13d - roll $1,%edx - andl %edi,%ebx - addl %ecx,%r13d - roll $30,%edi - addl %ebx,%r13d - xorl 16(%rsp),%ebp - movl %r11d,%eax - movl %edx,12(%rsp) - movl %r11d,%ebx - xorl 24(%rsp),%ebp - andl %edi,%eax - movl %r13d,%ecx - xorl 48(%rsp),%ebp - leal -1894007588(%rdx,%r12,1),%r12d - xorl %edi,%ebx - roll $5,%ecx - addl %eax,%r12d - roll $1,%ebp - andl %esi,%ebx - addl %ecx,%r12d - roll $30,%esi - addl %ebx,%r12d - xorl 
20(%rsp),%r14d - movl %edi,%eax - movl %ebp,16(%rsp) - movl %edi,%ebx - xorl 28(%rsp),%r14d - andl %esi,%eax - movl %r12d,%ecx - xorl 52(%rsp),%r14d - leal -1894007588(%rbp,%r11,1),%r11d - xorl %esi,%ebx - roll $5,%ecx - addl %eax,%r11d - roll $1,%r14d - andl %r13d,%ebx - addl %ecx,%r11d - roll $30,%r13d - addl %ebx,%r11d - xorl 24(%rsp),%edx - movl %esi,%eax - movl %r14d,20(%rsp) - movl %esi,%ebx - xorl 32(%rsp),%edx - andl %r13d,%eax - movl %r11d,%ecx - xorl 56(%rsp),%edx - leal -1894007588(%r14,%rdi,1),%edi - xorl %r13d,%ebx - roll $5,%ecx - addl %eax,%edi - roll $1,%edx - andl %r12d,%ebx - addl %ecx,%edi - roll $30,%r12d - addl %ebx,%edi - xorl 28(%rsp),%ebp - movl %r13d,%eax - movl %edx,24(%rsp) - movl %r13d,%ebx - xorl 36(%rsp),%ebp - andl %r12d,%eax - movl %edi,%ecx - xorl 60(%rsp),%ebp - leal -1894007588(%rdx,%rsi,1),%esi - xorl %r12d,%ebx - roll $5,%ecx - addl %eax,%esi - roll $1,%ebp - andl %r11d,%ebx - addl %ecx,%esi - roll $30,%r11d - addl %ebx,%esi - xorl 32(%rsp),%r14d - movl %r12d,%eax - movl %ebp,28(%rsp) - movl %r12d,%ebx - xorl 40(%rsp),%r14d - andl %r11d,%eax - movl %esi,%ecx - xorl 0(%rsp),%r14d - leal -1894007588(%rbp,%r13,1),%r13d - xorl %r11d,%ebx - roll $5,%ecx - addl %eax,%r13d - roll $1,%r14d - andl %edi,%ebx - addl %ecx,%r13d - roll $30,%edi - addl %ebx,%r13d - xorl 36(%rsp),%edx - movl %r11d,%eax - movl %r14d,32(%rsp) - movl %r11d,%ebx - xorl 44(%rsp),%edx - andl %edi,%eax - movl %r13d,%ecx - xorl 4(%rsp),%edx - leal -1894007588(%r14,%r12,1),%r12d - xorl %edi,%ebx - roll $5,%ecx - addl %eax,%r12d - roll $1,%edx - andl %esi,%ebx - addl %ecx,%r12d - roll $30,%esi - addl %ebx,%r12d - xorl 40(%rsp),%ebp - movl %edi,%eax - movl %edx,36(%rsp) - movl %edi,%ebx - xorl 48(%rsp),%ebp - andl %esi,%eax - movl %r12d,%ecx - xorl 8(%rsp),%ebp - leal -1894007588(%rdx,%r11,1),%r11d - xorl %esi,%ebx - roll $5,%ecx - addl %eax,%r11d - roll $1,%ebp - andl %r13d,%ebx - addl %ecx,%r11d - roll $30,%r13d - addl %ebx,%r11d - xorl 44(%rsp),%r14d - movl %esi,%eax 
- movl %ebp,40(%rsp) - movl %esi,%ebx - xorl 52(%rsp),%r14d - andl %r13d,%eax - movl %r11d,%ecx - xorl 12(%rsp),%r14d - leal -1894007588(%rbp,%rdi,1),%edi - xorl %r13d,%ebx - roll $5,%ecx - addl %eax,%edi - roll $1,%r14d - andl %r12d,%ebx - addl %ecx,%edi - roll $30,%r12d - addl %ebx,%edi - xorl 48(%rsp),%edx - movl %r13d,%eax - movl %r14d,44(%rsp) - movl %r13d,%ebx - xorl 56(%rsp),%edx - andl %r12d,%eax - movl %edi,%ecx - xorl 16(%rsp),%edx - leal -1894007588(%r14,%rsi,1),%esi - xorl %r12d,%ebx - roll $5,%ecx - addl %eax,%esi - roll $1,%edx - andl %r11d,%ebx - addl %ecx,%esi - roll $30,%r11d - addl %ebx,%esi - xorl 52(%rsp),%ebp - movl %edi,%eax - movl %edx,48(%rsp) - movl %esi,%ecx - xorl 60(%rsp),%ebp - xorl %r12d,%eax - roll $5,%ecx - xorl 20(%rsp),%ebp - leal -899497514(%rdx,%r13,1),%r13d - xorl %r11d,%eax - addl %ecx,%r13d - roll $30,%edi - addl %eax,%r13d - roll $1,%ebp - xorl 56(%rsp),%r14d - movl %esi,%eax - movl %ebp,52(%rsp) - movl %r13d,%ecx - xorl 0(%rsp),%r14d - xorl %r11d,%eax - roll $5,%ecx - xorl 24(%rsp),%r14d - leal -899497514(%rbp,%r12,1),%r12d - xorl %edi,%eax - addl %ecx,%r12d - roll $30,%esi - addl %eax,%r12d - roll $1,%r14d - xorl 60(%rsp),%edx - movl %r13d,%eax - movl %r14d,56(%rsp) - movl %r12d,%ecx - xorl 4(%rsp),%edx - xorl %edi,%eax - roll $5,%ecx - xorl 28(%rsp),%edx - leal -899497514(%r14,%r11,1),%r11d - xorl %esi,%eax - addl %ecx,%r11d - roll $30,%r13d - addl %eax,%r11d - roll $1,%edx - xorl 0(%rsp),%ebp - movl %r12d,%eax - movl %edx,60(%rsp) - movl %r11d,%ecx - xorl 8(%rsp),%ebp - xorl %esi,%eax - roll $5,%ecx - xorl 32(%rsp),%ebp - leal -899497514(%rdx,%rdi,1),%edi - xorl %r13d,%eax - addl %ecx,%edi - roll $30,%r12d - addl %eax,%edi - roll $1,%ebp - xorl 4(%rsp),%r14d - movl %r11d,%eax - movl %ebp,0(%rsp) - movl %edi,%ecx - xorl 12(%rsp),%r14d - xorl %r13d,%eax - roll $5,%ecx - xorl 36(%rsp),%r14d - leal -899497514(%rbp,%rsi,1),%esi - xorl %r12d,%eax - addl %ecx,%esi - roll $30,%r11d - addl %eax,%esi - roll $1,%r14d - xorl 
8(%rsp),%edx - movl %edi,%eax - movl %r14d,4(%rsp) - movl %esi,%ecx - xorl 16(%rsp),%edx - xorl %r12d,%eax - roll $5,%ecx - xorl 40(%rsp),%edx - leal -899497514(%r14,%r13,1),%r13d - xorl %r11d,%eax - addl %ecx,%r13d - roll $30,%edi - addl %eax,%r13d - roll $1,%edx - xorl 12(%rsp),%ebp - movl %esi,%eax - movl %edx,8(%rsp) - movl %r13d,%ecx - xorl 20(%rsp),%ebp - xorl %r11d,%eax - roll $5,%ecx - xorl 44(%rsp),%ebp - leal -899497514(%rdx,%r12,1),%r12d - xorl %edi,%eax - addl %ecx,%r12d - roll $30,%esi - addl %eax,%r12d - roll $1,%ebp - xorl 16(%rsp),%r14d - movl %r13d,%eax - movl %ebp,12(%rsp) - movl %r12d,%ecx - xorl 24(%rsp),%r14d - xorl %edi,%eax - roll $5,%ecx - xorl 48(%rsp),%r14d - leal -899497514(%rbp,%r11,1),%r11d - xorl %esi,%eax - addl %ecx,%r11d - roll $30,%r13d - addl %eax,%r11d - roll $1,%r14d - xorl 20(%rsp),%edx - movl %r12d,%eax - movl %r14d,16(%rsp) - movl %r11d,%ecx - xorl 28(%rsp),%edx - xorl %esi,%eax - roll $5,%ecx - xorl 52(%rsp),%edx - leal -899497514(%r14,%rdi,1),%edi - xorl %r13d,%eax - addl %ecx,%edi - roll $30,%r12d - addl %eax,%edi - roll $1,%edx - xorl 24(%rsp),%ebp - movl %r11d,%eax - movl %edx,20(%rsp) - movl %edi,%ecx - xorl 32(%rsp),%ebp - xorl %r13d,%eax - roll $5,%ecx - xorl 56(%rsp),%ebp - leal -899497514(%rdx,%rsi,1),%esi - xorl %r12d,%eax - addl %ecx,%esi - roll $30,%r11d - addl %eax,%esi - roll $1,%ebp - xorl 28(%rsp),%r14d - movl %edi,%eax - movl %ebp,24(%rsp) - movl %esi,%ecx - xorl 36(%rsp),%r14d - xorl %r12d,%eax - roll $5,%ecx - xorl 60(%rsp),%r14d - leal -899497514(%rbp,%r13,1),%r13d - xorl %r11d,%eax - addl %ecx,%r13d - roll $30,%edi - addl %eax,%r13d - roll $1,%r14d - xorl 32(%rsp),%edx - movl %esi,%eax - movl %r14d,28(%rsp) - movl %r13d,%ecx - xorl 40(%rsp),%edx - xorl %r11d,%eax - roll $5,%ecx - xorl 0(%rsp),%edx - leal -899497514(%r14,%r12,1),%r12d - xorl %edi,%eax - addl %ecx,%r12d - roll $30,%esi - addl %eax,%r12d - roll $1,%edx - xorl 36(%rsp),%ebp - movl %r13d,%eax - - movl %r12d,%ecx - xorl 44(%rsp),%ebp - xorl 
%edi,%eax - roll $5,%ecx - xorl 4(%rsp),%ebp - leal -899497514(%rdx,%r11,1),%r11d - xorl %esi,%eax - addl %ecx,%r11d - roll $30,%r13d - addl %eax,%r11d - roll $1,%ebp - xorl 40(%rsp),%r14d - movl %r12d,%eax - - movl %r11d,%ecx - xorl 48(%rsp),%r14d - xorl %esi,%eax - roll $5,%ecx - xorl 8(%rsp),%r14d - leal -899497514(%rbp,%rdi,1),%edi - xorl %r13d,%eax - addl %ecx,%edi - roll $30,%r12d - addl %eax,%edi - roll $1,%r14d - xorl 44(%rsp),%edx - movl %r11d,%eax - - movl %edi,%ecx - xorl 52(%rsp),%edx - xorl %r13d,%eax - roll $5,%ecx - xorl 12(%rsp),%edx - leal -899497514(%r14,%rsi,1),%esi - xorl %r12d,%eax - addl %ecx,%esi - roll $30,%r11d - addl %eax,%esi - roll $1,%edx - xorl 48(%rsp),%ebp - movl %edi,%eax - - movl %esi,%ecx - xorl 56(%rsp),%ebp - xorl %r12d,%eax - roll $5,%ecx - xorl 16(%rsp),%ebp - leal -899497514(%rdx,%r13,1),%r13d - xorl %r11d,%eax - addl %ecx,%r13d - roll $30,%edi - addl %eax,%r13d - roll $1,%ebp - xorl 52(%rsp),%r14d - movl %esi,%eax - - movl %r13d,%ecx - xorl 60(%rsp),%r14d - xorl %r11d,%eax - roll $5,%ecx - xorl 20(%rsp),%r14d - leal -899497514(%rbp,%r12,1),%r12d - xorl %edi,%eax - addl %ecx,%r12d - roll $30,%esi - addl %eax,%r12d - roll $1,%r14d - xorl 56(%rsp),%edx - movl %r13d,%eax - - movl %r12d,%ecx - xorl 0(%rsp),%edx - xorl %edi,%eax - roll $5,%ecx - xorl 24(%rsp),%edx - leal -899497514(%r14,%r11,1),%r11d - xorl %esi,%eax - addl %ecx,%r11d - roll $30,%r13d - addl %eax,%r11d - roll $1,%edx - xorl 60(%rsp),%ebp - movl %r12d,%eax - - movl %r11d,%ecx - xorl 4(%rsp),%ebp - xorl %esi,%eax - roll $5,%ecx - xorl 28(%rsp),%ebp - leal -899497514(%rdx,%rdi,1),%edi - xorl %r13d,%eax - addl %ecx,%edi - roll $30,%r12d - addl %eax,%edi - roll $1,%ebp - movl %r11d,%eax - movl %edi,%ecx - xorl %r13d,%eax - leal -899497514(%rbp,%rsi,1),%esi - roll $5,%ecx - xorl %r12d,%eax - addl %ecx,%esi - roll $30,%r11d - addl %eax,%esi - addl 0(%r8),%esi - addl 4(%r8),%edi - addl 8(%r8),%r11d - addl 12(%r8),%r12d - addl 16(%r8),%r13d - movl %esi,0(%r8) - movl 
%edi,4(%r8) - movl %r11d,8(%r8) - movl %r12d,12(%r8) - movl %r13d,16(%r8) - - subq $1,%r10 - leaq 64(%r9),%r9 - jnz .Lloop - - movq 64(%rsp),%rsi -.cfi_def_cfa %rsi,8 - movq -40(%rsi),%r14 -.cfi_restore %r14 - movq -32(%rsi),%r13 -.cfi_restore %r13 - movq -24(%rsi),%r12 -.cfi_restore %r12 - movq -16(%rsi),%rbp -.cfi_restore %rbp - movq -8(%rsi),%rbx -.cfi_restore %rbx - leaq (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Lepilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size sha1_block_data_order,.-sha1_block_data_order -.type sha1_block_data_order_shaext,@function -.align 32 -sha1_block_data_order_shaext: -_shaext_shortcut: -.cfi_startproc - movdqu (%rdi),%xmm0 - movd 16(%rdi),%xmm1 - movdqa K_XX_XX+160(%rip),%xmm3 - - movdqu (%rsi),%xmm4 - pshufd $27,%xmm0,%xmm0 - movdqu 16(%rsi),%xmm5 - pshufd $27,%xmm1,%xmm1 - movdqu 32(%rsi),%xmm6 -.byte 102,15,56,0,227 - movdqu 48(%rsi),%xmm7 -.byte 102,15,56,0,235 -.byte 102,15,56,0,243 - movdqa %xmm1,%xmm9 -.byte 102,15,56,0,251 - jmp .Loop_shaext - -.align 16 -.Loop_shaext: - decq %rdx - leaq 64(%rsi),%r8 - paddd %xmm4,%xmm1 - cmovneq %r8,%rsi - movdqa %xmm0,%xmm8 -.byte 15,56,201,229 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,0 -.byte 15,56,200,213 - pxor %xmm6,%xmm4 -.byte 15,56,201,238 -.byte 15,56,202,231 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,0 -.byte 15,56,200,206 - pxor %xmm7,%xmm5 -.byte 15,56,202,236 -.byte 15,56,201,247 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,0 -.byte 15,56,200,215 - pxor %xmm4,%xmm6 -.byte 15,56,201,252 -.byte 15,56,202,245 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,0 -.byte 15,56,200,204 - pxor %xmm5,%xmm7 -.byte 15,56,202,254 -.byte 15,56,201,229 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,0 -.byte 15,56,200,213 - pxor %xmm6,%xmm4 -.byte 15,56,201,238 -.byte 15,56,202,231 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,1 -.byte 15,56,200,206 - pxor %xmm7,%xmm5 -.byte 15,56,202,236 -.byte 15,56,201,247 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,1 -.byte 15,56,200,215 - pxor %xmm4,%xmm6 -.byte 
15,56,201,252 -.byte 15,56,202,245 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,1 -.byte 15,56,200,204 - pxor %xmm5,%xmm7 -.byte 15,56,202,254 -.byte 15,56,201,229 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,1 -.byte 15,56,200,213 - pxor %xmm6,%xmm4 -.byte 15,56,201,238 -.byte 15,56,202,231 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,1 -.byte 15,56,200,206 - pxor %xmm7,%xmm5 -.byte 15,56,202,236 -.byte 15,56,201,247 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,2 -.byte 15,56,200,215 - pxor %xmm4,%xmm6 -.byte 15,56,201,252 -.byte 15,56,202,245 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,2 -.byte 15,56,200,204 - pxor %xmm5,%xmm7 -.byte 15,56,202,254 -.byte 15,56,201,229 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,2 -.byte 15,56,200,213 - pxor %xmm6,%xmm4 -.byte 15,56,201,238 -.byte 15,56,202,231 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,2 -.byte 15,56,200,206 - pxor %xmm7,%xmm5 -.byte 15,56,202,236 -.byte 15,56,201,247 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,2 -.byte 15,56,200,215 - pxor %xmm4,%xmm6 -.byte 15,56,201,252 -.byte 15,56,202,245 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,3 -.byte 15,56,200,204 - pxor %xmm5,%xmm7 -.byte 15,56,202,254 - movdqu (%rsi),%xmm4 - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,3 -.byte 15,56,200,213 - movdqu 16(%rsi),%xmm5 -.byte 102,15,56,0,227 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,3 -.byte 15,56,200,206 - movdqu 32(%rsi),%xmm6 -.byte 102,15,56,0,235 - - movdqa %xmm0,%xmm2 -.byte 15,58,204,193,3 -.byte 15,56,200,215 - movdqu 48(%rsi),%xmm7 -.byte 102,15,56,0,243 - - movdqa %xmm0,%xmm1 -.byte 15,58,204,194,3 -.byte 65,15,56,200,201 -.byte 102,15,56,0,251 - - paddd %xmm8,%xmm0 - movdqa %xmm1,%xmm9 - - jnz .Loop_shaext - - pshufd $27,%xmm0,%xmm0 - pshufd $27,%xmm1,%xmm1 - movdqu %xmm0,(%rdi) - movd %xmm1,16(%rdi) - .byte 0xf3,0xc3 -.cfi_endproc -.size sha1_block_data_order_shaext,.-sha1_block_data_order_shaext -.type sha1_block_data_order_ssse3,@function -.align 16 -sha1_block_data_order_ssse3: -_ssse3_shortcut: -.cfi_startproc - movq 
%rsp,%r11 -.cfi_def_cfa_register %r11 - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - leaq -64(%rsp),%rsp - andq $-64,%rsp - movq %rdi,%r8 - movq %rsi,%r9 - movq %rdx,%r10 - - shlq $6,%r10 - addq %r9,%r10 - leaq K_XX_XX+64(%rip),%r14 - - movl 0(%r8),%eax - movl 4(%r8),%ebx - movl 8(%r8),%ecx - movl 12(%r8),%edx - movl %ebx,%esi - movl 16(%r8),%ebp - movl %ecx,%edi - xorl %edx,%edi - andl %edi,%esi - - movdqa 64(%r14),%xmm6 - movdqa -64(%r14),%xmm9 - movdqu 0(%r9),%xmm0 - movdqu 16(%r9),%xmm1 - movdqu 32(%r9),%xmm2 - movdqu 48(%r9),%xmm3 -.byte 102,15,56,0,198 -.byte 102,15,56,0,206 -.byte 102,15,56,0,214 - addq $64,%r9 - paddd %xmm9,%xmm0 -.byte 102,15,56,0,222 - paddd %xmm9,%xmm1 - paddd %xmm9,%xmm2 - movdqa %xmm0,0(%rsp) - psubd %xmm9,%xmm0 - movdqa %xmm1,16(%rsp) - psubd %xmm9,%xmm1 - movdqa %xmm2,32(%rsp) - psubd %xmm9,%xmm2 - jmp .Loop_ssse3 -.align 16 -.Loop_ssse3: - rorl $2,%ebx - pshufd $238,%xmm0,%xmm4 - xorl %edx,%esi - movdqa %xmm3,%xmm8 - paddd %xmm3,%xmm9 - movl %eax,%edi - addl 0(%rsp),%ebp - punpcklqdq %xmm1,%xmm4 - xorl %ecx,%ebx - roll $5,%eax - addl %esi,%ebp - psrldq $4,%xmm8 - andl %ebx,%edi - xorl %ecx,%ebx - pxor %xmm0,%xmm4 - addl %eax,%ebp - rorl $7,%eax - pxor %xmm2,%xmm8 - xorl %ecx,%edi - movl %ebp,%esi - addl 4(%rsp),%edx - pxor %xmm8,%xmm4 - xorl %ebx,%eax - roll $5,%ebp - movdqa %xmm9,48(%rsp) - addl %edi,%edx - andl %eax,%esi - movdqa %xmm4,%xmm10 - xorl %ebx,%eax - addl %ebp,%edx - rorl $7,%ebp - movdqa %xmm4,%xmm8 - xorl %ebx,%esi - pslldq $12,%xmm10 - paddd %xmm4,%xmm4 - movl %edx,%edi - addl 8(%rsp),%ecx - psrld $31,%xmm8 - xorl %eax,%ebp - roll $5,%edx - addl %esi,%ecx - movdqa %xmm10,%xmm9 - andl %ebp,%edi - xorl %eax,%ebp - psrld $30,%xmm10 - addl %edx,%ecx - rorl $7,%edx - por %xmm8,%xmm4 - xorl %eax,%edi - movl %ecx,%esi - addl 12(%rsp),%ebx - pslld $2,%xmm9 - pxor %xmm10,%xmm4 - xorl %ebp,%edx - 
movdqa -64(%r14),%xmm10 - roll $5,%ecx - addl %edi,%ebx - andl %edx,%esi - pxor %xmm9,%xmm4 - xorl %ebp,%edx - addl %ecx,%ebx - rorl $7,%ecx - pshufd $238,%xmm1,%xmm5 - xorl %ebp,%esi - movdqa %xmm4,%xmm9 - paddd %xmm4,%xmm10 - movl %ebx,%edi - addl 16(%rsp),%eax - punpcklqdq %xmm2,%xmm5 - xorl %edx,%ecx - roll $5,%ebx - addl %esi,%eax - psrldq $4,%xmm9 - andl %ecx,%edi - xorl %edx,%ecx - pxor %xmm1,%xmm5 - addl %ebx,%eax - rorl $7,%ebx - pxor %xmm3,%xmm9 - xorl %edx,%edi - movl %eax,%esi - addl 20(%rsp),%ebp - pxor %xmm9,%xmm5 - xorl %ecx,%ebx - roll $5,%eax - movdqa %xmm10,0(%rsp) - addl %edi,%ebp - andl %ebx,%esi - movdqa %xmm5,%xmm8 - xorl %ecx,%ebx - addl %eax,%ebp - rorl $7,%eax - movdqa %xmm5,%xmm9 - xorl %ecx,%esi - pslldq $12,%xmm8 - paddd %xmm5,%xmm5 - movl %ebp,%edi - addl 24(%rsp),%edx - psrld $31,%xmm9 - xorl %ebx,%eax - roll $5,%ebp - addl %esi,%edx - movdqa %xmm8,%xmm10 - andl %eax,%edi - xorl %ebx,%eax - psrld $30,%xmm8 - addl %ebp,%edx - rorl $7,%ebp - por %xmm9,%xmm5 - xorl %ebx,%edi - movl %edx,%esi - addl 28(%rsp),%ecx - pslld $2,%xmm10 - pxor %xmm8,%xmm5 - xorl %eax,%ebp - movdqa -32(%r14),%xmm8 - roll $5,%edx - addl %edi,%ecx - andl %ebp,%esi - pxor %xmm10,%xmm5 - xorl %eax,%ebp - addl %edx,%ecx - rorl $7,%edx - pshufd $238,%xmm2,%xmm6 - xorl %eax,%esi - movdqa %xmm5,%xmm10 - paddd %xmm5,%xmm8 - movl %ecx,%edi - addl 32(%rsp),%ebx - punpcklqdq %xmm3,%xmm6 - xorl %ebp,%edx - roll $5,%ecx - addl %esi,%ebx - psrldq $4,%xmm10 - andl %edx,%edi - xorl %ebp,%edx - pxor %xmm2,%xmm6 - addl %ecx,%ebx - rorl $7,%ecx - pxor %xmm4,%xmm10 - xorl %ebp,%edi - movl %ebx,%esi - addl 36(%rsp),%eax - pxor %xmm10,%xmm6 - xorl %edx,%ecx - roll $5,%ebx - movdqa %xmm8,16(%rsp) - addl %edi,%eax - andl %ecx,%esi - movdqa %xmm6,%xmm9 - xorl %edx,%ecx - addl %ebx,%eax - rorl $7,%ebx - movdqa %xmm6,%xmm10 - xorl %edx,%esi - pslldq $12,%xmm9 - paddd %xmm6,%xmm6 - movl %eax,%edi - addl 40(%rsp),%ebp - psrld $31,%xmm10 - xorl %ecx,%ebx - roll $5,%eax - addl %esi,%ebp - 
movdqa %xmm9,%xmm8 - andl %ebx,%edi - xorl %ecx,%ebx - psrld $30,%xmm9 - addl %eax,%ebp - rorl $7,%eax - por %xmm10,%xmm6 - xorl %ecx,%edi - movl %ebp,%esi - addl 44(%rsp),%edx - pslld $2,%xmm8 - pxor %xmm9,%xmm6 - xorl %ebx,%eax - movdqa -32(%r14),%xmm9 - roll $5,%ebp - addl %edi,%edx - andl %eax,%esi - pxor %xmm8,%xmm6 - xorl %ebx,%eax - addl %ebp,%edx - rorl $7,%ebp - pshufd $238,%xmm3,%xmm7 - xorl %ebx,%esi - movdqa %xmm6,%xmm8 - paddd %xmm6,%xmm9 - movl %edx,%edi - addl 48(%rsp),%ecx - punpcklqdq %xmm4,%xmm7 - xorl %eax,%ebp - roll $5,%edx - addl %esi,%ecx - psrldq $4,%xmm8 - andl %ebp,%edi - xorl %eax,%ebp - pxor %xmm3,%xmm7 - addl %edx,%ecx - rorl $7,%edx - pxor %xmm5,%xmm8 - xorl %eax,%edi - movl %ecx,%esi - addl 52(%rsp),%ebx - pxor %xmm8,%xmm7 - xorl %ebp,%edx - roll $5,%ecx - movdqa %xmm9,32(%rsp) - addl %edi,%ebx - andl %edx,%esi - movdqa %xmm7,%xmm10 - xorl %ebp,%edx - addl %ecx,%ebx - rorl $7,%ecx - movdqa %xmm7,%xmm8 - xorl %ebp,%esi - pslldq $12,%xmm10 - paddd %xmm7,%xmm7 - movl %ebx,%edi - addl 56(%rsp),%eax - psrld $31,%xmm8 - xorl %edx,%ecx - roll $5,%ebx - addl %esi,%eax - movdqa %xmm10,%xmm9 - andl %ecx,%edi - xorl %edx,%ecx - psrld $30,%xmm10 - addl %ebx,%eax - rorl $7,%ebx - por %xmm8,%xmm7 - xorl %edx,%edi - movl %eax,%esi - addl 60(%rsp),%ebp - pslld $2,%xmm9 - pxor %xmm10,%xmm7 - xorl %ecx,%ebx - movdqa -32(%r14),%xmm10 - roll $5,%eax - addl %edi,%ebp - andl %ebx,%esi - pxor %xmm9,%xmm7 - pshufd $238,%xmm6,%xmm9 - xorl %ecx,%ebx - addl %eax,%ebp - rorl $7,%eax - pxor %xmm4,%xmm0 - xorl %ecx,%esi - movl %ebp,%edi - addl 0(%rsp),%edx - punpcklqdq %xmm7,%xmm9 - xorl %ebx,%eax - roll $5,%ebp - pxor %xmm1,%xmm0 - addl %esi,%edx - andl %eax,%edi - movdqa %xmm10,%xmm8 - xorl %ebx,%eax - paddd %xmm7,%xmm10 - addl %ebp,%edx - pxor %xmm9,%xmm0 - rorl $7,%ebp - xorl %ebx,%edi - movl %edx,%esi - addl 4(%rsp),%ecx - movdqa %xmm0,%xmm9 - xorl %eax,%ebp - roll $5,%edx - movdqa %xmm10,48(%rsp) - addl %edi,%ecx - andl %ebp,%esi - xorl %eax,%ebp - pslld 
$2,%xmm0 - addl %edx,%ecx - rorl $7,%edx - psrld $30,%xmm9 - xorl %eax,%esi - movl %ecx,%edi - addl 8(%rsp),%ebx - por %xmm9,%xmm0 - xorl %ebp,%edx - roll $5,%ecx - pshufd $238,%xmm7,%xmm10 - addl %esi,%ebx - andl %edx,%edi - xorl %ebp,%edx - addl %ecx,%ebx - addl 12(%rsp),%eax - xorl %ebp,%edi - movl %ebx,%esi - roll $5,%ebx - addl %edi,%eax - xorl %edx,%esi - rorl $7,%ecx - addl %ebx,%eax - pxor %xmm5,%xmm1 - addl 16(%rsp),%ebp - xorl %ecx,%esi - punpcklqdq %xmm0,%xmm10 - movl %eax,%edi - roll $5,%eax - pxor %xmm2,%xmm1 - addl %esi,%ebp - xorl %ecx,%edi - movdqa %xmm8,%xmm9 - rorl $7,%ebx - paddd %xmm0,%xmm8 - addl %eax,%ebp - pxor %xmm10,%xmm1 - addl 20(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - roll $5,%ebp - movdqa %xmm1,%xmm10 - addl %edi,%edx - xorl %ebx,%esi - movdqa %xmm8,0(%rsp) - rorl $7,%eax - addl %ebp,%edx - addl 24(%rsp),%ecx - pslld $2,%xmm1 - xorl %eax,%esi - movl %edx,%edi - psrld $30,%xmm10 - roll $5,%edx - addl %esi,%ecx - xorl %eax,%edi - rorl $7,%ebp - por %xmm10,%xmm1 - addl %edx,%ecx - addl 28(%rsp),%ebx - pshufd $238,%xmm0,%xmm8 - xorl %ebp,%edi - movl %ecx,%esi - roll $5,%ecx - addl %edi,%ebx - xorl %ebp,%esi - rorl $7,%edx - addl %ecx,%ebx - pxor %xmm6,%xmm2 - addl 32(%rsp),%eax - xorl %edx,%esi - punpcklqdq %xmm1,%xmm8 - movl %ebx,%edi - roll $5,%ebx - pxor %xmm3,%xmm2 - addl %esi,%eax - xorl %edx,%edi - movdqa 0(%r14),%xmm10 - rorl $7,%ecx - paddd %xmm1,%xmm9 - addl %ebx,%eax - pxor %xmm8,%xmm2 - addl 36(%rsp),%ebp - xorl %ecx,%edi - movl %eax,%esi - roll $5,%eax - movdqa %xmm2,%xmm8 - addl %edi,%ebp - xorl %ecx,%esi - movdqa %xmm9,16(%rsp) - rorl $7,%ebx - addl %eax,%ebp - addl 40(%rsp),%edx - pslld $2,%xmm2 - xorl %ebx,%esi - movl %ebp,%edi - psrld $30,%xmm8 - roll $5,%ebp - addl %esi,%edx - xorl %ebx,%edi - rorl $7,%eax - por %xmm8,%xmm2 - addl %ebp,%edx - addl 44(%rsp),%ecx - pshufd $238,%xmm1,%xmm9 - xorl %eax,%edi - movl %edx,%esi - roll $5,%edx - addl %edi,%ecx - xorl %eax,%esi - rorl $7,%ebp - addl %edx,%ecx - pxor 
%xmm7,%xmm3 - addl 48(%rsp),%ebx - xorl %ebp,%esi - punpcklqdq %xmm2,%xmm9 - movl %ecx,%edi - roll $5,%ecx - pxor %xmm4,%xmm3 - addl %esi,%ebx - xorl %ebp,%edi - movdqa %xmm10,%xmm8 - rorl $7,%edx - paddd %xmm2,%xmm10 - addl %ecx,%ebx - pxor %xmm9,%xmm3 - addl 52(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - roll $5,%ebx - movdqa %xmm3,%xmm9 - addl %edi,%eax - xorl %edx,%esi - movdqa %xmm10,32(%rsp) - rorl $7,%ecx - addl %ebx,%eax - addl 56(%rsp),%ebp - pslld $2,%xmm3 - xorl %ecx,%esi - movl %eax,%edi - psrld $30,%xmm9 - roll $5,%eax - addl %esi,%ebp - xorl %ecx,%edi - rorl $7,%ebx - por %xmm9,%xmm3 - addl %eax,%ebp - addl 60(%rsp),%edx - pshufd $238,%xmm2,%xmm10 - xorl %ebx,%edi - movl %ebp,%esi - roll $5,%ebp - addl %edi,%edx - xorl %ebx,%esi - rorl $7,%eax - addl %ebp,%edx - pxor %xmm0,%xmm4 - addl 0(%rsp),%ecx - xorl %eax,%esi - punpcklqdq %xmm3,%xmm10 - movl %edx,%edi - roll $5,%edx - pxor %xmm5,%xmm4 - addl %esi,%ecx - xorl %eax,%edi - movdqa %xmm8,%xmm9 - rorl $7,%ebp - paddd %xmm3,%xmm8 - addl %edx,%ecx - pxor %xmm10,%xmm4 - addl 4(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - roll $5,%ecx - movdqa %xmm4,%xmm10 - addl %edi,%ebx - xorl %ebp,%esi - movdqa %xmm8,48(%rsp) - rorl $7,%edx - addl %ecx,%ebx - addl 8(%rsp),%eax - pslld $2,%xmm4 - xorl %edx,%esi - movl %ebx,%edi - psrld $30,%xmm10 - roll $5,%ebx - addl %esi,%eax - xorl %edx,%edi - rorl $7,%ecx - por %xmm10,%xmm4 - addl %ebx,%eax - addl 12(%rsp),%ebp - pshufd $238,%xmm3,%xmm8 - xorl %ecx,%edi - movl %eax,%esi - roll $5,%eax - addl %edi,%ebp - xorl %ecx,%esi - rorl $7,%ebx - addl %eax,%ebp - pxor %xmm1,%xmm5 - addl 16(%rsp),%edx - xorl %ebx,%esi - punpcklqdq %xmm4,%xmm8 - movl %ebp,%edi - roll $5,%ebp - pxor %xmm6,%xmm5 - addl %esi,%edx - xorl %ebx,%edi - movdqa %xmm9,%xmm10 - rorl $7,%eax - paddd %xmm4,%xmm9 - addl %ebp,%edx - pxor %xmm8,%xmm5 - addl 20(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - roll $5,%edx - movdqa %xmm5,%xmm8 - addl %edi,%ecx - xorl %eax,%esi - movdqa %xmm9,0(%rsp) - rorl 
$7,%ebp - addl %edx,%ecx - addl 24(%rsp),%ebx - pslld $2,%xmm5 - xorl %ebp,%esi - movl %ecx,%edi - psrld $30,%xmm8 - roll $5,%ecx - addl %esi,%ebx - xorl %ebp,%edi - rorl $7,%edx - por %xmm8,%xmm5 - addl %ecx,%ebx - addl 28(%rsp),%eax - pshufd $238,%xmm4,%xmm9 - rorl $7,%ecx - movl %ebx,%esi - xorl %edx,%edi - roll $5,%ebx - addl %edi,%eax - xorl %ecx,%esi - xorl %edx,%ecx - addl %ebx,%eax - pxor %xmm2,%xmm6 - addl 32(%rsp),%ebp - andl %ecx,%esi - xorl %edx,%ecx - rorl $7,%ebx - punpcklqdq %xmm5,%xmm9 - movl %eax,%edi - xorl %ecx,%esi - pxor %xmm7,%xmm6 - roll $5,%eax - addl %esi,%ebp - movdqa %xmm10,%xmm8 - xorl %ebx,%edi - paddd %xmm5,%xmm10 - xorl %ecx,%ebx - pxor %xmm9,%xmm6 - addl %eax,%ebp - addl 36(%rsp),%edx - andl %ebx,%edi - xorl %ecx,%ebx - rorl $7,%eax - movdqa %xmm6,%xmm9 - movl %ebp,%esi - xorl %ebx,%edi - movdqa %xmm10,16(%rsp) - roll $5,%ebp - addl %edi,%edx - xorl %eax,%esi - pslld $2,%xmm6 - xorl %ebx,%eax - addl %ebp,%edx - psrld $30,%xmm9 - addl 40(%rsp),%ecx - andl %eax,%esi - xorl %ebx,%eax - por %xmm9,%xmm6 - rorl $7,%ebp - movl %edx,%edi - xorl %eax,%esi - roll $5,%edx - pshufd $238,%xmm5,%xmm10 - addl %esi,%ecx - xorl %ebp,%edi - xorl %eax,%ebp - addl %edx,%ecx - addl 44(%rsp),%ebx - andl %ebp,%edi - xorl %eax,%ebp - rorl $7,%edx - movl %ecx,%esi - xorl %ebp,%edi - roll $5,%ecx - addl %edi,%ebx - xorl %edx,%esi - xorl %ebp,%edx - addl %ecx,%ebx - pxor %xmm3,%xmm7 - addl 48(%rsp),%eax - andl %edx,%esi - xorl %ebp,%edx - rorl $7,%ecx - punpcklqdq %xmm6,%xmm10 - movl %ebx,%edi - xorl %edx,%esi - pxor %xmm0,%xmm7 - roll $5,%ebx - addl %esi,%eax - movdqa 32(%r14),%xmm9 - xorl %ecx,%edi - paddd %xmm6,%xmm8 - xorl %edx,%ecx - pxor %xmm10,%xmm7 - addl %ebx,%eax - addl 52(%rsp),%ebp - andl %ecx,%edi - xorl %edx,%ecx - rorl $7,%ebx - movdqa %xmm7,%xmm10 - movl %eax,%esi - xorl %ecx,%edi - movdqa %xmm8,32(%rsp) - roll $5,%eax - addl %edi,%ebp - xorl %ebx,%esi - pslld $2,%xmm7 - xorl %ecx,%ebx - addl %eax,%ebp - psrld $30,%xmm10 - addl 56(%rsp),%edx - 
andl %ebx,%esi - xorl %ecx,%ebx - por %xmm10,%xmm7 - rorl $7,%eax - movl %ebp,%edi - xorl %ebx,%esi - roll $5,%ebp - pshufd $238,%xmm6,%xmm8 - addl %esi,%edx - xorl %eax,%edi - xorl %ebx,%eax - addl %ebp,%edx - addl 60(%rsp),%ecx - andl %eax,%edi - xorl %ebx,%eax - rorl $7,%ebp - movl %edx,%esi - xorl %eax,%edi - roll $5,%edx - addl %edi,%ecx - xorl %ebp,%esi - xorl %eax,%ebp - addl %edx,%ecx - pxor %xmm4,%xmm0 - addl 0(%rsp),%ebx - andl %ebp,%esi - xorl %eax,%ebp - rorl $7,%edx - punpcklqdq %xmm7,%xmm8 - movl %ecx,%edi - xorl %ebp,%esi - pxor %xmm1,%xmm0 - roll $5,%ecx - addl %esi,%ebx - movdqa %xmm9,%xmm10 - xorl %edx,%edi - paddd %xmm7,%xmm9 - xorl %ebp,%edx - pxor %xmm8,%xmm0 - addl %ecx,%ebx - addl 4(%rsp),%eax - andl %edx,%edi - xorl %ebp,%edx - rorl $7,%ecx - movdqa %xmm0,%xmm8 - movl %ebx,%esi - xorl %edx,%edi - movdqa %xmm9,48(%rsp) - roll $5,%ebx - addl %edi,%eax - xorl %ecx,%esi - pslld $2,%xmm0 - xorl %edx,%ecx - addl %ebx,%eax - psrld $30,%xmm8 - addl 8(%rsp),%ebp - andl %ecx,%esi - xorl %edx,%ecx - por %xmm8,%xmm0 - rorl $7,%ebx - movl %eax,%edi - xorl %ecx,%esi - roll $5,%eax - pshufd $238,%xmm7,%xmm9 - addl %esi,%ebp - xorl %ebx,%edi - xorl %ecx,%ebx - addl %eax,%ebp - addl 12(%rsp),%edx - andl %ebx,%edi - xorl %ecx,%ebx - rorl $7,%eax - movl %ebp,%esi - xorl %ebx,%edi - roll $5,%ebp - addl %edi,%edx - xorl %eax,%esi - xorl %ebx,%eax - addl %ebp,%edx - pxor %xmm5,%xmm1 - addl 16(%rsp),%ecx - andl %eax,%esi - xorl %ebx,%eax - rorl $7,%ebp - punpcklqdq %xmm0,%xmm9 - movl %edx,%edi - xorl %eax,%esi - pxor %xmm2,%xmm1 - roll $5,%edx - addl %esi,%ecx - movdqa %xmm10,%xmm8 - xorl %ebp,%edi - paddd %xmm0,%xmm10 - xorl %eax,%ebp - pxor %xmm9,%xmm1 - addl %edx,%ecx - addl 20(%rsp),%ebx - andl %ebp,%edi - xorl %eax,%ebp - rorl $7,%edx - movdqa %xmm1,%xmm9 - movl %ecx,%esi - xorl %ebp,%edi - movdqa %xmm10,0(%rsp) - roll $5,%ecx - addl %edi,%ebx - xorl %edx,%esi - pslld $2,%xmm1 - xorl %ebp,%edx - addl %ecx,%ebx - psrld $30,%xmm9 - addl 24(%rsp),%eax - andl 
%edx,%esi - xorl %ebp,%edx - por %xmm9,%xmm1 - rorl $7,%ecx - movl %ebx,%edi - xorl %edx,%esi - roll $5,%ebx - pshufd $238,%xmm0,%xmm10 - addl %esi,%eax - xorl %ecx,%edi - xorl %edx,%ecx - addl %ebx,%eax - addl 28(%rsp),%ebp - andl %ecx,%edi - xorl %edx,%ecx - rorl $7,%ebx - movl %eax,%esi - xorl %ecx,%edi - roll $5,%eax - addl %edi,%ebp - xorl %ebx,%esi - xorl %ecx,%ebx - addl %eax,%ebp - pxor %xmm6,%xmm2 - addl 32(%rsp),%edx - andl %ebx,%esi - xorl %ecx,%ebx - rorl $7,%eax - punpcklqdq %xmm1,%xmm10 - movl %ebp,%edi - xorl %ebx,%esi - pxor %xmm3,%xmm2 - roll $5,%ebp - addl %esi,%edx - movdqa %xmm8,%xmm9 - xorl %eax,%edi - paddd %xmm1,%xmm8 - xorl %ebx,%eax - pxor %xmm10,%xmm2 - addl %ebp,%edx - addl 36(%rsp),%ecx - andl %eax,%edi - xorl %ebx,%eax - rorl $7,%ebp - movdqa %xmm2,%xmm10 - movl %edx,%esi - xorl %eax,%edi - movdqa %xmm8,16(%rsp) - roll $5,%edx - addl %edi,%ecx - xorl %ebp,%esi - pslld $2,%xmm2 - xorl %eax,%ebp - addl %edx,%ecx - psrld $30,%xmm10 - addl 40(%rsp),%ebx - andl %ebp,%esi - xorl %eax,%ebp - por %xmm10,%xmm2 - rorl $7,%edx - movl %ecx,%edi - xorl %ebp,%esi - roll $5,%ecx - pshufd $238,%xmm1,%xmm8 - addl %esi,%ebx - xorl %edx,%edi - xorl %ebp,%edx - addl %ecx,%ebx - addl 44(%rsp),%eax - andl %edx,%edi - xorl %ebp,%edx - rorl $7,%ecx - movl %ebx,%esi - xorl %edx,%edi - roll $5,%ebx - addl %edi,%eax - xorl %edx,%esi - addl %ebx,%eax - pxor %xmm7,%xmm3 - addl 48(%rsp),%ebp - xorl %ecx,%esi - punpcklqdq %xmm2,%xmm8 - movl %eax,%edi - roll $5,%eax - pxor %xmm4,%xmm3 - addl %esi,%ebp - xorl %ecx,%edi - movdqa %xmm9,%xmm10 - rorl $7,%ebx - paddd %xmm2,%xmm9 - addl %eax,%ebp - pxor %xmm8,%xmm3 - addl 52(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - roll $5,%ebp - movdqa %xmm3,%xmm8 - addl %edi,%edx - xorl %ebx,%esi - movdqa %xmm9,32(%rsp) - rorl $7,%eax - addl %ebp,%edx - addl 56(%rsp),%ecx - pslld $2,%xmm3 - xorl %eax,%esi - movl %edx,%edi - psrld $30,%xmm8 - roll $5,%edx - addl %esi,%ecx - xorl %eax,%edi - rorl $7,%ebp - por %xmm8,%xmm3 - addl 
%edx,%ecx - addl 60(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - roll $5,%ecx - addl %edi,%ebx - xorl %ebp,%esi - rorl $7,%edx - addl %ecx,%ebx - addl 0(%rsp),%eax - xorl %edx,%esi - movl %ebx,%edi - roll $5,%ebx - paddd %xmm3,%xmm10 - addl %esi,%eax - xorl %edx,%edi - movdqa %xmm10,48(%rsp) - rorl $7,%ecx - addl %ebx,%eax - addl 4(%rsp),%ebp - xorl %ecx,%edi - movl %eax,%esi - roll $5,%eax - addl %edi,%ebp - xorl %ecx,%esi - rorl $7,%ebx - addl %eax,%ebp - addl 8(%rsp),%edx - xorl %ebx,%esi - movl %ebp,%edi - roll $5,%ebp - addl %esi,%edx - xorl %ebx,%edi - rorl $7,%eax - addl %ebp,%edx - addl 12(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - roll $5,%edx - addl %edi,%ecx - xorl %eax,%esi - rorl $7,%ebp - addl %edx,%ecx - cmpq %r10,%r9 - je .Ldone_ssse3 - movdqa 64(%r14),%xmm6 - movdqa -64(%r14),%xmm9 - movdqu 0(%r9),%xmm0 - movdqu 16(%r9),%xmm1 - movdqu 32(%r9),%xmm2 - movdqu 48(%r9),%xmm3 -.byte 102,15,56,0,198 - addq $64,%r9 - addl 16(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi -.byte 102,15,56,0,206 - roll $5,%ecx - addl %esi,%ebx - xorl %ebp,%edi - rorl $7,%edx - paddd %xmm9,%xmm0 - addl %ecx,%ebx - addl 20(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - movdqa %xmm0,0(%rsp) - roll $5,%ebx - addl %edi,%eax - xorl %edx,%esi - rorl $7,%ecx - psubd %xmm9,%xmm0 - addl %ebx,%eax - addl 24(%rsp),%ebp - xorl %ecx,%esi - movl %eax,%edi - roll $5,%eax - addl %esi,%ebp - xorl %ecx,%edi - rorl $7,%ebx - addl %eax,%ebp - addl 28(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - roll $5,%ebp - addl %edi,%edx - xorl %ebx,%esi - rorl $7,%eax - addl %ebp,%edx - addl 32(%rsp),%ecx - xorl %eax,%esi - movl %edx,%edi -.byte 102,15,56,0,214 - roll $5,%edx - addl %esi,%ecx - xorl %eax,%edi - rorl $7,%ebp - paddd %xmm9,%xmm1 - addl %edx,%ecx - addl 36(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - movdqa %xmm1,16(%rsp) - roll $5,%ecx - addl %edi,%ebx - xorl %ebp,%esi - rorl $7,%edx - psubd %xmm9,%xmm1 - addl %ecx,%ebx - addl 40(%rsp),%eax - xorl %edx,%esi - movl %ebx,%edi - roll 
$5,%ebx - addl %esi,%eax - xorl %edx,%edi - rorl $7,%ecx - addl %ebx,%eax - addl 44(%rsp),%ebp - xorl %ecx,%edi - movl %eax,%esi - roll $5,%eax - addl %edi,%ebp - xorl %ecx,%esi - rorl $7,%ebx - addl %eax,%ebp - addl 48(%rsp),%edx - xorl %ebx,%esi - movl %ebp,%edi -.byte 102,15,56,0,222 - roll $5,%ebp - addl %esi,%edx - xorl %ebx,%edi - rorl $7,%eax - paddd %xmm9,%xmm2 - addl %ebp,%edx - addl 52(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - movdqa %xmm2,32(%rsp) - roll $5,%edx - addl %edi,%ecx - xorl %eax,%esi - rorl $7,%ebp - psubd %xmm9,%xmm2 - addl %edx,%ecx - addl 56(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - roll $5,%ecx - addl %esi,%ebx - xorl %ebp,%edi - rorl $7,%edx - addl %ecx,%ebx - addl 60(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - roll $5,%ebx - addl %edi,%eax - rorl $7,%ecx - addl %ebx,%eax - addl 0(%r8),%eax - addl 4(%r8),%esi - addl 8(%r8),%ecx - addl 12(%r8),%edx - movl %eax,0(%r8) - addl 16(%r8),%ebp - movl %esi,4(%r8) - movl %esi,%ebx - movl %ecx,8(%r8) - movl %ecx,%edi - movl %edx,12(%r8) - xorl %edx,%edi - movl %ebp,16(%r8) - andl %edi,%esi - jmp .Loop_ssse3 - -.align 16 -.Ldone_ssse3: - addl 16(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - roll $5,%ecx - addl %esi,%ebx - xorl %ebp,%edi - rorl $7,%edx - addl %ecx,%ebx - addl 20(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - roll $5,%ebx - addl %edi,%eax - xorl %edx,%esi - rorl $7,%ecx - addl %ebx,%eax - addl 24(%rsp),%ebp - xorl %ecx,%esi - movl %eax,%edi - roll $5,%eax - addl %esi,%ebp - xorl %ecx,%edi - rorl $7,%ebx - addl %eax,%ebp - addl 28(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - roll $5,%ebp - addl %edi,%edx - xorl %ebx,%esi - rorl $7,%eax - addl %ebp,%edx - addl 32(%rsp),%ecx - xorl %eax,%esi - movl %edx,%edi - roll $5,%edx - addl %esi,%ecx - xorl %eax,%edi - rorl $7,%ebp - addl %edx,%ecx - addl 36(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - roll $5,%ecx - addl %edi,%ebx - xorl %ebp,%esi - rorl $7,%edx - addl %ecx,%ebx - addl 40(%rsp),%eax - xorl %edx,%esi - movl 
%ebx,%edi - roll $5,%ebx - addl %esi,%eax - xorl %edx,%edi - rorl $7,%ecx - addl %ebx,%eax - addl 44(%rsp),%ebp - xorl %ecx,%edi - movl %eax,%esi - roll $5,%eax - addl %edi,%ebp - xorl %ecx,%esi - rorl $7,%ebx - addl %eax,%ebp - addl 48(%rsp),%edx - xorl %ebx,%esi - movl %ebp,%edi - roll $5,%ebp - addl %esi,%edx - xorl %ebx,%edi - rorl $7,%eax - addl %ebp,%edx - addl 52(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - roll $5,%edx - addl %edi,%ecx - xorl %eax,%esi - rorl $7,%ebp - addl %edx,%ecx - addl 56(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - roll $5,%ecx - addl %esi,%ebx - xorl %ebp,%edi - rorl $7,%edx - addl %ecx,%ebx - addl 60(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - roll $5,%ebx - addl %edi,%eax - rorl $7,%ecx - addl %ebx,%eax - addl 0(%r8),%eax - addl 4(%r8),%esi - addl 8(%r8),%ecx - movl %eax,0(%r8) - addl 12(%r8),%edx - movl %esi,4(%r8) - addl 16(%r8),%ebp - movl %ecx,8(%r8) - movl %edx,12(%r8) - movl %ebp,16(%r8) - movq -40(%r11),%r14 -.cfi_restore %r14 - movq -32(%r11),%r13 -.cfi_restore %r13 - movq -24(%r11),%r12 -.cfi_restore %r12 - movq -16(%r11),%rbp -.cfi_restore %rbp - movq -8(%r11),%rbx -.cfi_restore %rbx - leaq (%r11),%rsp -.cfi_def_cfa_register %rsp -.Lepilogue_ssse3: - .byte 0xf3,0xc3 -.cfi_endproc -.size sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3 -.type sha1_block_data_order_avx,@function -.align 16 -sha1_block_data_order_avx: -_avx_shortcut: -.cfi_startproc - movq %rsp,%r11 -.cfi_def_cfa_register %r11 - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - leaq -64(%rsp),%rsp - vzeroupper - andq $-64,%rsp - movq %rdi,%r8 - movq %rsi,%r9 - movq %rdx,%r10 - - shlq $6,%r10 - addq %r9,%r10 - leaq K_XX_XX+64(%rip),%r14 - - movl 0(%r8),%eax - movl 4(%r8),%ebx - movl 8(%r8),%ecx - movl 12(%r8),%edx - movl %ebx,%esi - movl 16(%r8),%ebp - movl %ecx,%edi - xorl %edx,%edi - andl %edi,%esi - - vmovdqa 
64(%r14),%xmm6 - vmovdqa -64(%r14),%xmm11 - vmovdqu 0(%r9),%xmm0 - vmovdqu 16(%r9),%xmm1 - vmovdqu 32(%r9),%xmm2 - vmovdqu 48(%r9),%xmm3 - vpshufb %xmm6,%xmm0,%xmm0 - addq $64,%r9 - vpshufb %xmm6,%xmm1,%xmm1 - vpshufb %xmm6,%xmm2,%xmm2 - vpshufb %xmm6,%xmm3,%xmm3 - vpaddd %xmm11,%xmm0,%xmm4 - vpaddd %xmm11,%xmm1,%xmm5 - vpaddd %xmm11,%xmm2,%xmm6 - vmovdqa %xmm4,0(%rsp) - vmovdqa %xmm5,16(%rsp) - vmovdqa %xmm6,32(%rsp) - jmp .Loop_avx -.align 16 -.Loop_avx: - shrdl $2,%ebx,%ebx - xorl %edx,%esi - vpalignr $8,%xmm0,%xmm1,%xmm4 - movl %eax,%edi - addl 0(%rsp),%ebp - vpaddd %xmm3,%xmm11,%xmm9 - xorl %ecx,%ebx - shldl $5,%eax,%eax - vpsrldq $4,%xmm3,%xmm8 - addl %esi,%ebp - andl %ebx,%edi - vpxor %xmm0,%xmm4,%xmm4 - xorl %ecx,%ebx - addl %eax,%ebp - vpxor %xmm2,%xmm8,%xmm8 - shrdl $7,%eax,%eax - xorl %ecx,%edi - movl %ebp,%esi - addl 4(%rsp),%edx - vpxor %xmm8,%xmm4,%xmm4 - xorl %ebx,%eax - shldl $5,%ebp,%ebp - vmovdqa %xmm9,48(%rsp) - addl %edi,%edx - andl %eax,%esi - vpsrld $31,%xmm4,%xmm8 - xorl %ebx,%eax - addl %ebp,%edx - shrdl $7,%ebp,%ebp - xorl %ebx,%esi - vpslldq $12,%xmm4,%xmm10 - vpaddd %xmm4,%xmm4,%xmm4 - movl %edx,%edi - addl 8(%rsp),%ecx - xorl %eax,%ebp - shldl $5,%edx,%edx - vpsrld $30,%xmm10,%xmm9 - vpor %xmm8,%xmm4,%xmm4 - addl %esi,%ecx - andl %ebp,%edi - xorl %eax,%ebp - addl %edx,%ecx - vpslld $2,%xmm10,%xmm10 - vpxor %xmm9,%xmm4,%xmm4 - shrdl $7,%edx,%edx - xorl %eax,%edi - movl %ecx,%esi - addl 12(%rsp),%ebx - vpxor %xmm10,%xmm4,%xmm4 - xorl %ebp,%edx - shldl $5,%ecx,%ecx - addl %edi,%ebx - andl %edx,%esi - xorl %ebp,%edx - addl %ecx,%ebx - shrdl $7,%ecx,%ecx - xorl %ebp,%esi - vpalignr $8,%xmm1,%xmm2,%xmm5 - movl %ebx,%edi - addl 16(%rsp),%eax - vpaddd %xmm4,%xmm11,%xmm9 - xorl %edx,%ecx - shldl $5,%ebx,%ebx - vpsrldq $4,%xmm4,%xmm8 - addl %esi,%eax - andl %ecx,%edi - vpxor %xmm1,%xmm5,%xmm5 - xorl %edx,%ecx - addl %ebx,%eax - vpxor %xmm3,%xmm8,%xmm8 - shrdl $7,%ebx,%ebx - xorl %edx,%edi - movl %eax,%esi - addl 20(%rsp),%ebp - vpxor 
%xmm8,%xmm5,%xmm5 - xorl %ecx,%ebx - shldl $5,%eax,%eax - vmovdqa %xmm9,0(%rsp) - addl %edi,%ebp - andl %ebx,%esi - vpsrld $31,%xmm5,%xmm8 - xorl %ecx,%ebx - addl %eax,%ebp - shrdl $7,%eax,%eax - xorl %ecx,%esi - vpslldq $12,%xmm5,%xmm10 - vpaddd %xmm5,%xmm5,%xmm5 - movl %ebp,%edi - addl 24(%rsp),%edx - xorl %ebx,%eax - shldl $5,%ebp,%ebp - vpsrld $30,%xmm10,%xmm9 - vpor %xmm8,%xmm5,%xmm5 - addl %esi,%edx - andl %eax,%edi - xorl %ebx,%eax - addl %ebp,%edx - vpslld $2,%xmm10,%xmm10 - vpxor %xmm9,%xmm5,%xmm5 - shrdl $7,%ebp,%ebp - xorl %ebx,%edi - movl %edx,%esi - addl 28(%rsp),%ecx - vpxor %xmm10,%xmm5,%xmm5 - xorl %eax,%ebp - shldl $5,%edx,%edx - vmovdqa -32(%r14),%xmm11 - addl %edi,%ecx - andl %ebp,%esi - xorl %eax,%ebp - addl %edx,%ecx - shrdl $7,%edx,%edx - xorl %eax,%esi - vpalignr $8,%xmm2,%xmm3,%xmm6 - movl %ecx,%edi - addl 32(%rsp),%ebx - vpaddd %xmm5,%xmm11,%xmm9 - xorl %ebp,%edx - shldl $5,%ecx,%ecx - vpsrldq $4,%xmm5,%xmm8 - addl %esi,%ebx - andl %edx,%edi - vpxor %xmm2,%xmm6,%xmm6 - xorl %ebp,%edx - addl %ecx,%ebx - vpxor %xmm4,%xmm8,%xmm8 - shrdl $7,%ecx,%ecx - xorl %ebp,%edi - movl %ebx,%esi - addl 36(%rsp),%eax - vpxor %xmm8,%xmm6,%xmm6 - xorl %edx,%ecx - shldl $5,%ebx,%ebx - vmovdqa %xmm9,16(%rsp) - addl %edi,%eax - andl %ecx,%esi - vpsrld $31,%xmm6,%xmm8 - xorl %edx,%ecx - addl %ebx,%eax - shrdl $7,%ebx,%ebx - xorl %edx,%esi - vpslldq $12,%xmm6,%xmm10 - vpaddd %xmm6,%xmm6,%xmm6 - movl %eax,%edi - addl 40(%rsp),%ebp - xorl %ecx,%ebx - shldl $5,%eax,%eax - vpsrld $30,%xmm10,%xmm9 - vpor %xmm8,%xmm6,%xmm6 - addl %esi,%ebp - andl %ebx,%edi - xorl %ecx,%ebx - addl %eax,%ebp - vpslld $2,%xmm10,%xmm10 - vpxor %xmm9,%xmm6,%xmm6 - shrdl $7,%eax,%eax - xorl %ecx,%edi - movl %ebp,%esi - addl 44(%rsp),%edx - vpxor %xmm10,%xmm6,%xmm6 - xorl %ebx,%eax - shldl $5,%ebp,%ebp - addl %edi,%edx - andl %eax,%esi - xorl %ebx,%eax - addl %ebp,%edx - shrdl $7,%ebp,%ebp - xorl %ebx,%esi - vpalignr $8,%xmm3,%xmm4,%xmm7 - movl %edx,%edi - addl 48(%rsp),%ecx - vpaddd 
%xmm6,%xmm11,%xmm9 - xorl %eax,%ebp - shldl $5,%edx,%edx - vpsrldq $4,%xmm6,%xmm8 - addl %esi,%ecx - andl %ebp,%edi - vpxor %xmm3,%xmm7,%xmm7 - xorl %eax,%ebp - addl %edx,%ecx - vpxor %xmm5,%xmm8,%xmm8 - shrdl $7,%edx,%edx - xorl %eax,%edi - movl %ecx,%esi - addl 52(%rsp),%ebx - vpxor %xmm8,%xmm7,%xmm7 - xorl %ebp,%edx - shldl $5,%ecx,%ecx - vmovdqa %xmm9,32(%rsp) - addl %edi,%ebx - andl %edx,%esi - vpsrld $31,%xmm7,%xmm8 - xorl %ebp,%edx - addl %ecx,%ebx - shrdl $7,%ecx,%ecx - xorl %ebp,%esi - vpslldq $12,%xmm7,%xmm10 - vpaddd %xmm7,%xmm7,%xmm7 - movl %ebx,%edi - addl 56(%rsp),%eax - xorl %edx,%ecx - shldl $5,%ebx,%ebx - vpsrld $30,%xmm10,%xmm9 - vpor %xmm8,%xmm7,%xmm7 - addl %esi,%eax - andl %ecx,%edi - xorl %edx,%ecx - addl %ebx,%eax - vpslld $2,%xmm10,%xmm10 - vpxor %xmm9,%xmm7,%xmm7 - shrdl $7,%ebx,%ebx - xorl %edx,%edi - movl %eax,%esi - addl 60(%rsp),%ebp - vpxor %xmm10,%xmm7,%xmm7 - xorl %ecx,%ebx - shldl $5,%eax,%eax - addl %edi,%ebp - andl %ebx,%esi - xorl %ecx,%ebx - addl %eax,%ebp - vpalignr $8,%xmm6,%xmm7,%xmm8 - vpxor %xmm4,%xmm0,%xmm0 - shrdl $7,%eax,%eax - xorl %ecx,%esi - movl %ebp,%edi - addl 0(%rsp),%edx - vpxor %xmm1,%xmm0,%xmm0 - xorl %ebx,%eax - shldl $5,%ebp,%ebp - vpaddd %xmm7,%xmm11,%xmm9 - addl %esi,%edx - andl %eax,%edi - vpxor %xmm8,%xmm0,%xmm0 - xorl %ebx,%eax - addl %ebp,%edx - shrdl $7,%ebp,%ebp - xorl %ebx,%edi - vpsrld $30,%xmm0,%xmm8 - vmovdqa %xmm9,48(%rsp) - movl %edx,%esi - addl 4(%rsp),%ecx - xorl %eax,%ebp - shldl $5,%edx,%edx - vpslld $2,%xmm0,%xmm0 - addl %edi,%ecx - andl %ebp,%esi - xorl %eax,%ebp - addl %edx,%ecx - shrdl $7,%edx,%edx - xorl %eax,%esi - movl %ecx,%edi - addl 8(%rsp),%ebx - vpor %xmm8,%xmm0,%xmm0 - xorl %ebp,%edx - shldl $5,%ecx,%ecx - addl %esi,%ebx - andl %edx,%edi - xorl %ebp,%edx - addl %ecx,%ebx - addl 12(%rsp),%eax - xorl %ebp,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpalignr $8,%xmm7,%xmm0,%xmm8 - vpxor %xmm5,%xmm1,%xmm1 - 
addl 16(%rsp),%ebp - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - vpxor %xmm2,%xmm1,%xmm1 - addl %esi,%ebp - xorl %ecx,%edi - vpaddd %xmm0,%xmm11,%xmm9 - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpxor %xmm8,%xmm1,%xmm1 - addl 20(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - vpsrld $30,%xmm1,%xmm8 - vmovdqa %xmm9,0(%rsp) - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - vpslld $2,%xmm1,%xmm1 - addl 24(%rsp),%ecx - xorl %eax,%esi - movl %edx,%edi - shldl $5,%edx,%edx - addl %esi,%ecx - xorl %eax,%edi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpor %xmm8,%xmm1,%xmm1 - addl 28(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpalignr $8,%xmm0,%xmm1,%xmm8 - vpxor %xmm6,%xmm2,%xmm2 - addl 32(%rsp),%eax - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - vpxor %xmm3,%xmm2,%xmm2 - addl %esi,%eax - xorl %edx,%edi - vpaddd %xmm1,%xmm11,%xmm9 - vmovdqa 0(%r14),%xmm11 - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpxor %xmm8,%xmm2,%xmm2 - addl 36(%rsp),%ebp - xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - vpsrld $30,%xmm2,%xmm8 - vmovdqa %xmm9,16(%rsp) - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpslld $2,%xmm2,%xmm2 - addl 40(%rsp),%edx - xorl %ebx,%esi - movl %ebp,%edi - shldl $5,%ebp,%ebp - addl %esi,%edx - xorl %ebx,%edi - shrdl $7,%eax,%eax - addl %ebp,%edx - vpor %xmm8,%xmm2,%xmm2 - addl 44(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - shldl $5,%edx,%edx - addl %edi,%ecx - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpalignr $8,%xmm1,%xmm2,%xmm8 - vpxor %xmm7,%xmm3,%xmm3 - addl 48(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - vpxor %xmm4,%xmm3,%xmm3 - addl %esi,%ebx - xorl %ebp,%edi - vpaddd %xmm2,%xmm11,%xmm9 - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpxor %xmm8,%xmm3,%xmm3 - addl 52(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - vpsrld 
$30,%xmm3,%xmm8 - vmovdqa %xmm9,32(%rsp) - addl %edi,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpslld $2,%xmm3,%xmm3 - addl 56(%rsp),%ebp - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - addl %esi,%ebp - xorl %ecx,%edi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpor %xmm8,%xmm3,%xmm3 - addl 60(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - vpalignr $8,%xmm2,%xmm3,%xmm8 - vpxor %xmm0,%xmm4,%xmm4 - addl 0(%rsp),%ecx - xorl %eax,%esi - movl %edx,%edi - shldl $5,%edx,%edx - vpxor %xmm5,%xmm4,%xmm4 - addl %esi,%ecx - xorl %eax,%edi - vpaddd %xmm3,%xmm11,%xmm9 - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpxor %xmm8,%xmm4,%xmm4 - addl 4(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - vpsrld $30,%xmm4,%xmm8 - vmovdqa %xmm9,48(%rsp) - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpslld $2,%xmm4,%xmm4 - addl 8(%rsp),%eax - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %edx,%edi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vpor %xmm8,%xmm4,%xmm4 - addl 12(%rsp),%ebp - xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpalignr $8,%xmm3,%xmm4,%xmm8 - vpxor %xmm1,%xmm5,%xmm5 - addl 16(%rsp),%edx - xorl %ebx,%esi - movl %ebp,%edi - shldl $5,%ebp,%ebp - vpxor %xmm6,%xmm5,%xmm5 - addl %esi,%edx - xorl %ebx,%edi - vpaddd %xmm4,%xmm11,%xmm9 - shrdl $7,%eax,%eax - addl %ebp,%edx - vpxor %xmm8,%xmm5,%xmm5 - addl 20(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - shldl $5,%edx,%edx - vpsrld $30,%xmm5,%xmm8 - vmovdqa %xmm9,0(%rsp) - addl %edi,%ecx - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpslld $2,%xmm5,%xmm5 - addl 24(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vpor %xmm8,%xmm5,%xmm5 - addl 28(%rsp),%eax - shrdl 
$7,%ecx,%ecx - movl %ebx,%esi - xorl %edx,%edi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %ecx,%esi - xorl %edx,%ecx - addl %ebx,%eax - vpalignr $8,%xmm4,%xmm5,%xmm8 - vpxor %xmm2,%xmm6,%xmm6 - addl 32(%rsp),%ebp - andl %ecx,%esi - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - vpxor %xmm7,%xmm6,%xmm6 - movl %eax,%edi - xorl %ecx,%esi - vpaddd %xmm5,%xmm11,%xmm9 - shldl $5,%eax,%eax - addl %esi,%ebp - vpxor %xmm8,%xmm6,%xmm6 - xorl %ebx,%edi - xorl %ecx,%ebx - addl %eax,%ebp - addl 36(%rsp),%edx - vpsrld $30,%xmm6,%xmm8 - vmovdqa %xmm9,16(%rsp) - andl %ebx,%edi - xorl %ecx,%ebx - shrdl $7,%eax,%eax - movl %ebp,%esi - vpslld $2,%xmm6,%xmm6 - xorl %ebx,%edi - shldl $5,%ebp,%ebp - addl %edi,%edx - xorl %eax,%esi - xorl %ebx,%eax - addl %ebp,%edx - addl 40(%rsp),%ecx - andl %eax,%esi - vpor %xmm8,%xmm6,%xmm6 - xorl %ebx,%eax - shrdl $7,%ebp,%ebp - movl %edx,%edi - xorl %eax,%esi - shldl $5,%edx,%edx - addl %esi,%ecx - xorl %ebp,%edi - xorl %eax,%ebp - addl %edx,%ecx - addl 44(%rsp),%ebx - andl %ebp,%edi - xorl %eax,%ebp - shrdl $7,%edx,%edx - movl %ecx,%esi - xorl %ebp,%edi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %edx,%esi - xorl %ebp,%edx - addl %ecx,%ebx - vpalignr $8,%xmm5,%xmm6,%xmm8 - vpxor %xmm3,%xmm7,%xmm7 - addl 48(%rsp),%eax - andl %edx,%esi - xorl %ebp,%edx - shrdl $7,%ecx,%ecx - vpxor %xmm0,%xmm7,%xmm7 - movl %ebx,%edi - xorl %edx,%esi - vpaddd %xmm6,%xmm11,%xmm9 - vmovdqa 32(%r14),%xmm11 - shldl $5,%ebx,%ebx - addl %esi,%eax - vpxor %xmm8,%xmm7,%xmm7 - xorl %ecx,%edi - xorl %edx,%ecx - addl %ebx,%eax - addl 52(%rsp),%ebp - vpsrld $30,%xmm7,%xmm8 - vmovdqa %xmm9,32(%rsp) - andl %ecx,%edi - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - movl %eax,%esi - vpslld $2,%xmm7,%xmm7 - xorl %ecx,%edi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ebx,%esi - xorl %ecx,%ebx - addl %eax,%ebp - addl 56(%rsp),%edx - andl %ebx,%esi - vpor %xmm8,%xmm7,%xmm7 - xorl %ecx,%ebx - shrdl $7,%eax,%eax - movl %ebp,%edi - xorl %ebx,%esi - shldl $5,%ebp,%ebp - addl %esi,%edx - xorl %eax,%edi - 
xorl %ebx,%eax - addl %ebp,%edx - addl 60(%rsp),%ecx - andl %eax,%edi - xorl %ebx,%eax - shrdl $7,%ebp,%ebp - movl %edx,%esi - xorl %eax,%edi - shldl $5,%edx,%edx - addl %edi,%ecx - xorl %ebp,%esi - xorl %eax,%ebp - addl %edx,%ecx - vpalignr $8,%xmm6,%xmm7,%xmm8 - vpxor %xmm4,%xmm0,%xmm0 - addl 0(%rsp),%ebx - andl %ebp,%esi - xorl %eax,%ebp - shrdl $7,%edx,%edx - vpxor %xmm1,%xmm0,%xmm0 - movl %ecx,%edi - xorl %ebp,%esi - vpaddd %xmm7,%xmm11,%xmm9 - shldl $5,%ecx,%ecx - addl %esi,%ebx - vpxor %xmm8,%xmm0,%xmm0 - xorl %edx,%edi - xorl %ebp,%edx - addl %ecx,%ebx - addl 4(%rsp),%eax - vpsrld $30,%xmm0,%xmm8 - vmovdqa %xmm9,48(%rsp) - andl %edx,%edi - xorl %ebp,%edx - shrdl $7,%ecx,%ecx - movl %ebx,%esi - vpslld $2,%xmm0,%xmm0 - xorl %edx,%edi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %ecx,%esi - xorl %edx,%ecx - addl %ebx,%eax - addl 8(%rsp),%ebp - andl %ecx,%esi - vpor %xmm8,%xmm0,%xmm0 - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - movl %eax,%edi - xorl %ecx,%esi - shldl $5,%eax,%eax - addl %esi,%ebp - xorl %ebx,%edi - xorl %ecx,%ebx - addl %eax,%ebp - addl 12(%rsp),%edx - andl %ebx,%edi - xorl %ecx,%ebx - shrdl $7,%eax,%eax - movl %ebp,%esi - xorl %ebx,%edi - shldl $5,%ebp,%ebp - addl %edi,%edx - xorl %eax,%esi - xorl %ebx,%eax - addl %ebp,%edx - vpalignr $8,%xmm7,%xmm0,%xmm8 - vpxor %xmm5,%xmm1,%xmm1 - addl 16(%rsp),%ecx - andl %eax,%esi - xorl %ebx,%eax - shrdl $7,%ebp,%ebp - vpxor %xmm2,%xmm1,%xmm1 - movl %edx,%edi - xorl %eax,%esi - vpaddd %xmm0,%xmm11,%xmm9 - shldl $5,%edx,%edx - addl %esi,%ecx - vpxor %xmm8,%xmm1,%xmm1 - xorl %ebp,%edi - xorl %eax,%ebp - addl %edx,%ecx - addl 20(%rsp),%ebx - vpsrld $30,%xmm1,%xmm8 - vmovdqa %xmm9,0(%rsp) - andl %ebp,%edi - xorl %eax,%ebp - shrdl $7,%edx,%edx - movl %ecx,%esi - vpslld $2,%xmm1,%xmm1 - xorl %ebp,%edi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %edx,%esi - xorl %ebp,%edx - addl %ecx,%ebx - addl 24(%rsp),%eax - andl %edx,%esi - vpor %xmm8,%xmm1,%xmm1 - xorl %ebp,%edx - shrdl $7,%ecx,%ecx - movl %ebx,%edi - xorl 
%edx,%esi - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %ecx,%edi - xorl %edx,%ecx - addl %ebx,%eax - addl 28(%rsp),%ebp - andl %ecx,%edi - xorl %edx,%ecx - shrdl $7,%ebx,%ebx - movl %eax,%esi - xorl %ecx,%edi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ebx,%esi - xorl %ecx,%ebx - addl %eax,%ebp - vpalignr $8,%xmm0,%xmm1,%xmm8 - vpxor %xmm6,%xmm2,%xmm2 - addl 32(%rsp),%edx - andl %ebx,%esi - xorl %ecx,%ebx - shrdl $7,%eax,%eax - vpxor %xmm3,%xmm2,%xmm2 - movl %ebp,%edi - xorl %ebx,%esi - vpaddd %xmm1,%xmm11,%xmm9 - shldl $5,%ebp,%ebp - addl %esi,%edx - vpxor %xmm8,%xmm2,%xmm2 - xorl %eax,%edi - xorl %ebx,%eax - addl %ebp,%edx - addl 36(%rsp),%ecx - vpsrld $30,%xmm2,%xmm8 - vmovdqa %xmm9,16(%rsp) - andl %eax,%edi - xorl %ebx,%eax - shrdl $7,%ebp,%ebp - movl %edx,%esi - vpslld $2,%xmm2,%xmm2 - xorl %eax,%edi - shldl $5,%edx,%edx - addl %edi,%ecx - xorl %ebp,%esi - xorl %eax,%ebp - addl %edx,%ecx - addl 40(%rsp),%ebx - andl %ebp,%esi - vpor %xmm8,%xmm2,%xmm2 - xorl %eax,%ebp - shrdl $7,%edx,%edx - movl %ecx,%edi - xorl %ebp,%esi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %edx,%edi - xorl %ebp,%edx - addl %ecx,%ebx - addl 44(%rsp),%eax - andl %edx,%edi - xorl %ebp,%edx - shrdl $7,%ecx,%ecx - movl %ebx,%esi - xorl %edx,%edi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %edx,%esi - addl %ebx,%eax - vpalignr $8,%xmm1,%xmm2,%xmm8 - vpxor %xmm7,%xmm3,%xmm3 - addl 48(%rsp),%ebp - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - vpxor %xmm4,%xmm3,%xmm3 - addl %esi,%ebp - xorl %ecx,%edi - vpaddd %xmm2,%xmm11,%xmm9 - shrdl $7,%ebx,%ebx - addl %eax,%ebp - vpxor %xmm8,%xmm3,%xmm3 - addl 52(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - vpsrld $30,%xmm3,%xmm8 - vmovdqa %xmm9,32(%rsp) - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - vpslld $2,%xmm3,%xmm3 - addl 56(%rsp),%ecx - xorl %eax,%esi - movl %edx,%edi - shldl $5,%edx,%edx - addl %esi,%ecx - xorl %eax,%edi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vpor %xmm8,%xmm3,%xmm3 - 
addl 60(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 0(%rsp),%eax - vpaddd %xmm3,%xmm11,%xmm9 - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - addl %esi,%eax - vmovdqa %xmm9,48(%rsp) - xorl %edx,%edi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 4(%rsp),%ebp - xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 8(%rsp),%edx - xorl %ebx,%esi - movl %ebp,%edi - shldl $5,%ebp,%ebp - addl %esi,%edx - xorl %ebx,%edi - shrdl $7,%eax,%eax - addl %ebp,%edx - addl 12(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - shldl $5,%edx,%edx - addl %edi,%ecx - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - cmpq %r10,%r9 - je .Ldone_avx - vmovdqa 64(%r14),%xmm6 - vmovdqa -64(%r14),%xmm11 - vmovdqu 0(%r9),%xmm0 - vmovdqu 16(%r9),%xmm1 - vmovdqu 32(%r9),%xmm2 - vmovdqu 48(%r9),%xmm3 - vpshufb %xmm6,%xmm0,%xmm0 - addq $64,%r9 - addl 16(%rsp),%ebx - xorl %ebp,%esi - vpshufb %xmm6,%xmm1,%xmm1 - movl %ecx,%edi - shldl $5,%ecx,%ecx - vpaddd %xmm11,%xmm0,%xmm4 - addl %esi,%ebx - xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - vmovdqa %xmm4,0(%rsp) - addl 20(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 24(%rsp),%ebp - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - addl %esi,%ebp - xorl %ecx,%edi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 28(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - addl 32(%rsp),%ecx - xorl %eax,%esi - vpshufb %xmm6,%xmm2,%xmm2 - movl %edx,%edi - shldl $5,%edx,%edx - vpaddd %xmm11,%xmm1,%xmm5 - addl %esi,%ecx - xorl %eax,%edi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - vmovdqa %xmm5,16(%rsp) - addl 36(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl 
%edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 40(%rsp),%eax - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %edx,%edi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 44(%rsp),%ebp - xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 48(%rsp),%edx - xorl %ebx,%esi - vpshufb %xmm6,%xmm3,%xmm3 - movl %ebp,%edi - shldl $5,%ebp,%ebp - vpaddd %xmm11,%xmm2,%xmm6 - addl %esi,%edx - xorl %ebx,%edi - shrdl $7,%eax,%eax - addl %ebp,%edx - vmovdqa %xmm6,32(%rsp) - addl 52(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - shldl $5,%edx,%edx - addl %edi,%ecx - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - addl 56(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 60(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 0(%r8),%eax - addl 4(%r8),%esi - addl 8(%r8),%ecx - addl 12(%r8),%edx - movl %eax,0(%r8) - addl 16(%r8),%ebp - movl %esi,4(%r8) - movl %esi,%ebx - movl %ecx,8(%r8) - movl %ecx,%edi - movl %edx,12(%r8) - xorl %edx,%edi - movl %ebp,16(%r8) - andl %edi,%esi - jmp .Loop_avx - -.align 16 -.Ldone_avx: - addl 16(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 20(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - xorl %edx,%esi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 24(%rsp),%ebp - xorl %ecx,%esi - movl %eax,%edi - shldl $5,%eax,%eax - addl %esi,%ebp - xorl %ecx,%edi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 28(%rsp),%edx - xorl %ebx,%edi - movl %ebp,%esi - shldl $5,%ebp,%ebp - addl %edi,%edx - xorl %ebx,%esi - shrdl $7,%eax,%eax - addl %ebp,%edx - addl 32(%rsp),%ecx - xorl %eax,%esi - movl %edx,%edi - shldl $5,%edx,%edx - addl 
%esi,%ecx - xorl %eax,%edi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - addl 36(%rsp),%ebx - xorl %ebp,%edi - movl %ecx,%esi - shldl $5,%ecx,%ecx - addl %edi,%ebx - xorl %ebp,%esi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 40(%rsp),%eax - xorl %edx,%esi - movl %ebx,%edi - shldl $5,%ebx,%ebx - addl %esi,%eax - xorl %edx,%edi - shrdl $7,%ecx,%ecx - addl %ebx,%eax - addl 44(%rsp),%ebp - xorl %ecx,%edi - movl %eax,%esi - shldl $5,%eax,%eax - addl %edi,%ebp - xorl %ecx,%esi - shrdl $7,%ebx,%ebx - addl %eax,%ebp - addl 48(%rsp),%edx - xorl %ebx,%esi - movl %ebp,%edi - shldl $5,%ebp,%ebp - addl %esi,%edx - xorl %ebx,%edi - shrdl $7,%eax,%eax - addl %ebp,%edx - addl 52(%rsp),%ecx - xorl %eax,%edi - movl %edx,%esi - shldl $5,%edx,%edx - addl %edi,%ecx - xorl %eax,%esi - shrdl $7,%ebp,%ebp - addl %edx,%ecx - addl 56(%rsp),%ebx - xorl %ebp,%esi - movl %ecx,%edi - shldl $5,%ecx,%ecx - addl %esi,%ebx - xorl %ebp,%edi - shrdl $7,%edx,%edx - addl %ecx,%ebx - addl 60(%rsp),%eax - xorl %edx,%edi - movl %ebx,%esi - shldl $5,%ebx,%ebx - addl %edi,%eax - shrdl $7,%ecx,%ecx - addl %ebx,%eax - vzeroupper - - addl 0(%r8),%eax - addl 4(%r8),%esi - addl 8(%r8),%ecx - movl %eax,0(%r8) - addl 12(%r8),%edx - movl %esi,4(%r8) - addl 16(%r8),%ebp - movl %ecx,8(%r8) - movl %edx,12(%r8) - movl %ebp,16(%r8) - movq -40(%r11),%r14 -.cfi_restore %r14 - movq -32(%r11),%r13 -.cfi_restore %r13 - movq -24(%r11),%r12 -.cfi_restore %r12 - movq -16(%r11),%rbp -.cfi_restore %rbp - movq -8(%r11),%rbx -.cfi_restore %rbx - leaq (%r11),%rsp -.cfi_def_cfa_register %rsp -.Lepilogue_avx: - .byte 0xf3,0xc3 -.cfi_endproc -.size sha1_block_data_order_avx,.-sha1_block_data_order_avx -.type sha1_block_data_order_avx2,@function -.align 16 -sha1_block_data_order_avx2: -_avx2_shortcut: -.cfi_startproc - movq %rsp,%r11 -.cfi_def_cfa_register %r11 - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - 
vzeroupper - movq %rdi,%r8 - movq %rsi,%r9 - movq %rdx,%r10 - - leaq -640(%rsp),%rsp - shlq $6,%r10 - leaq 64(%r9),%r13 - andq $-128,%rsp - addq %r9,%r10 - leaq K_XX_XX+64(%rip),%r14 - - movl 0(%r8),%eax - cmpq %r10,%r13 - cmovaeq %r9,%r13 - movl 4(%r8),%ebp - movl 8(%r8),%ecx - movl 12(%r8),%edx - movl 16(%r8),%esi - vmovdqu 64(%r14),%ymm6 - - vmovdqu (%r9),%xmm0 - vmovdqu 16(%r9),%xmm1 - vmovdqu 32(%r9),%xmm2 - vmovdqu 48(%r9),%xmm3 - leaq 64(%r9),%r9 - vinserti128 $1,(%r13),%ymm0,%ymm0 - vinserti128 $1,16(%r13),%ymm1,%ymm1 - vpshufb %ymm6,%ymm0,%ymm0 - vinserti128 $1,32(%r13),%ymm2,%ymm2 - vpshufb %ymm6,%ymm1,%ymm1 - vinserti128 $1,48(%r13),%ymm3,%ymm3 - vpshufb %ymm6,%ymm2,%ymm2 - vmovdqu -64(%r14),%ymm11 - vpshufb %ymm6,%ymm3,%ymm3 - - vpaddd %ymm11,%ymm0,%ymm4 - vpaddd %ymm11,%ymm1,%ymm5 - vmovdqu %ymm4,0(%rsp) - vpaddd %ymm11,%ymm2,%ymm6 - vmovdqu %ymm5,32(%rsp) - vpaddd %ymm11,%ymm3,%ymm7 - vmovdqu %ymm6,64(%rsp) - vmovdqu %ymm7,96(%rsp) - vpalignr $8,%ymm0,%ymm1,%ymm4 - vpsrldq $4,%ymm3,%ymm8 - vpxor %ymm0,%ymm4,%ymm4 - vpxor %ymm2,%ymm8,%ymm8 - vpxor %ymm8,%ymm4,%ymm4 - vpsrld $31,%ymm4,%ymm8 - vpslldq $12,%ymm4,%ymm10 - vpaddd %ymm4,%ymm4,%ymm4 - vpsrld $30,%ymm10,%ymm9 - vpor %ymm8,%ymm4,%ymm4 - vpslld $2,%ymm10,%ymm10 - vpxor %ymm9,%ymm4,%ymm4 - vpxor %ymm10,%ymm4,%ymm4 - vpaddd %ymm11,%ymm4,%ymm9 - vmovdqu %ymm9,128(%rsp) - vpalignr $8,%ymm1,%ymm2,%ymm5 - vpsrldq $4,%ymm4,%ymm8 - vpxor %ymm1,%ymm5,%ymm5 - vpxor %ymm3,%ymm8,%ymm8 - vpxor %ymm8,%ymm5,%ymm5 - vpsrld $31,%ymm5,%ymm8 - vmovdqu -32(%r14),%ymm11 - vpslldq $12,%ymm5,%ymm10 - vpaddd %ymm5,%ymm5,%ymm5 - vpsrld $30,%ymm10,%ymm9 - vpor %ymm8,%ymm5,%ymm5 - vpslld $2,%ymm10,%ymm10 - vpxor %ymm9,%ymm5,%ymm5 - vpxor %ymm10,%ymm5,%ymm5 - vpaddd %ymm11,%ymm5,%ymm9 - vmovdqu %ymm9,160(%rsp) - vpalignr $8,%ymm2,%ymm3,%ymm6 - vpsrldq $4,%ymm5,%ymm8 - vpxor %ymm2,%ymm6,%ymm6 - vpxor %ymm4,%ymm8,%ymm8 - vpxor %ymm8,%ymm6,%ymm6 - vpsrld $31,%ymm6,%ymm8 - vpslldq $12,%ymm6,%ymm10 - vpaddd %ymm6,%ymm6,%ymm6 - 
vpsrld $30,%ymm10,%ymm9 - vpor %ymm8,%ymm6,%ymm6 - vpslld $2,%ymm10,%ymm10 - vpxor %ymm9,%ymm6,%ymm6 - vpxor %ymm10,%ymm6,%ymm6 - vpaddd %ymm11,%ymm6,%ymm9 - vmovdqu %ymm9,192(%rsp) - vpalignr $8,%ymm3,%ymm4,%ymm7 - vpsrldq $4,%ymm6,%ymm8 - vpxor %ymm3,%ymm7,%ymm7 - vpxor %ymm5,%ymm8,%ymm8 - vpxor %ymm8,%ymm7,%ymm7 - vpsrld $31,%ymm7,%ymm8 - vpslldq $12,%ymm7,%ymm10 - vpaddd %ymm7,%ymm7,%ymm7 - vpsrld $30,%ymm10,%ymm9 - vpor %ymm8,%ymm7,%ymm7 - vpslld $2,%ymm10,%ymm10 - vpxor %ymm9,%ymm7,%ymm7 - vpxor %ymm10,%ymm7,%ymm7 - vpaddd %ymm11,%ymm7,%ymm9 - vmovdqu %ymm9,224(%rsp) - leaq 128(%rsp),%r13 - jmp .Loop_avx2 -.align 32 -.Loop_avx2: - rorxl $2,%ebp,%ebx - andnl %edx,%ebp,%edi - andl %ecx,%ebp - xorl %edi,%ebp - jmp .Lalign32_1 -.align 32 -.Lalign32_1: - vpalignr $8,%ymm6,%ymm7,%ymm8 - vpxor %ymm4,%ymm0,%ymm0 - addl -128(%r13),%esi - andnl %ecx,%eax,%edi - vpxor %ymm1,%ymm0,%ymm0 - addl %ebp,%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - vpxor %ymm8,%ymm0,%ymm0 - andl %ebx,%eax - addl %r12d,%esi - xorl %edi,%eax - vpsrld $30,%ymm0,%ymm8 - vpslld $2,%ymm0,%ymm0 - addl -124(%r13),%edx - andnl %ebx,%esi,%edi - addl %eax,%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - andl %ebp,%esi - vpor %ymm8,%ymm0,%ymm0 - addl %r12d,%edx - xorl %edi,%esi - addl -120(%r13),%ecx - andnl %ebp,%edx,%edi - vpaddd %ymm11,%ymm0,%ymm9 - addl %esi,%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - andl %eax,%edx - vmovdqu %ymm9,256(%rsp) - addl %r12d,%ecx - xorl %edi,%edx - addl -116(%r13),%ebx - andnl %eax,%ecx,%edi - addl %edx,%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - andl %esi,%ecx - addl %r12d,%ebx - xorl %edi,%ecx - addl -96(%r13),%ebp - andnl %esi,%ebx,%edi - addl %ecx,%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - andl %edx,%ebx - addl %r12d,%ebp - xorl %edi,%ebx - vpalignr $8,%ymm7,%ymm0,%ymm8 - vpxor %ymm5,%ymm1,%ymm1 - addl -92(%r13),%eax - andnl %edx,%ebp,%edi - vpxor %ymm2,%ymm1,%ymm1 - addl %ebx,%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - vpxor 
%ymm8,%ymm1,%ymm1 - andl %ecx,%ebp - addl %r12d,%eax - xorl %edi,%ebp - vpsrld $30,%ymm1,%ymm8 - vpslld $2,%ymm1,%ymm1 - addl -88(%r13),%esi - andnl %ecx,%eax,%edi - addl %ebp,%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - andl %ebx,%eax - vpor %ymm8,%ymm1,%ymm1 - addl %r12d,%esi - xorl %edi,%eax - addl -84(%r13),%edx - andnl %ebx,%esi,%edi - vpaddd %ymm11,%ymm1,%ymm9 - addl %eax,%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - andl %ebp,%esi - vmovdqu %ymm9,288(%rsp) - addl %r12d,%edx - xorl %edi,%esi - addl -64(%r13),%ecx - andnl %ebp,%edx,%edi - addl %esi,%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - andl %eax,%edx - addl %r12d,%ecx - xorl %edi,%edx - addl -60(%r13),%ebx - andnl %eax,%ecx,%edi - addl %edx,%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - andl %esi,%ecx - addl %r12d,%ebx - xorl %edi,%ecx - vpalignr $8,%ymm0,%ymm1,%ymm8 - vpxor %ymm6,%ymm2,%ymm2 - addl -56(%r13),%ebp - andnl %esi,%ebx,%edi - vpxor %ymm3,%ymm2,%ymm2 - vmovdqu 0(%r14),%ymm11 - addl %ecx,%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - vpxor %ymm8,%ymm2,%ymm2 - andl %edx,%ebx - addl %r12d,%ebp - xorl %edi,%ebx - vpsrld $30,%ymm2,%ymm8 - vpslld $2,%ymm2,%ymm2 - addl -52(%r13),%eax - andnl %edx,%ebp,%edi - addl %ebx,%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - andl %ecx,%ebp - vpor %ymm8,%ymm2,%ymm2 - addl %r12d,%eax - xorl %edi,%ebp - addl -32(%r13),%esi - andnl %ecx,%eax,%edi - vpaddd %ymm11,%ymm2,%ymm9 - addl %ebp,%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - andl %ebx,%eax - vmovdqu %ymm9,320(%rsp) - addl %r12d,%esi - xorl %edi,%eax - addl -28(%r13),%edx - andnl %ebx,%esi,%edi - addl %eax,%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - andl %ebp,%esi - addl %r12d,%edx - xorl %edi,%esi - addl -24(%r13),%ecx - andnl %ebp,%edx,%edi - addl %esi,%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - andl %eax,%edx - addl %r12d,%ecx - xorl %edi,%edx - vpalignr $8,%ymm1,%ymm2,%ymm8 - vpxor %ymm7,%ymm3,%ymm3 - addl -20(%r13),%ebx - andnl %eax,%ecx,%edi - vpxor 
%ymm4,%ymm3,%ymm3 - addl %edx,%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - vpxor %ymm8,%ymm3,%ymm3 - andl %esi,%ecx - addl %r12d,%ebx - xorl %edi,%ecx - vpsrld $30,%ymm3,%ymm8 - vpslld $2,%ymm3,%ymm3 - addl 0(%r13),%ebp - andnl %esi,%ebx,%edi - addl %ecx,%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - andl %edx,%ebx - vpor %ymm8,%ymm3,%ymm3 - addl %r12d,%ebp - xorl %edi,%ebx - addl 4(%r13),%eax - andnl %edx,%ebp,%edi - vpaddd %ymm11,%ymm3,%ymm9 - addl %ebx,%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - andl %ecx,%ebp - vmovdqu %ymm9,352(%rsp) - addl %r12d,%eax - xorl %edi,%ebp - addl 8(%r13),%esi - andnl %ecx,%eax,%edi - addl %ebp,%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - andl %ebx,%eax - addl %r12d,%esi - xorl %edi,%eax - addl 12(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - vpalignr $8,%ymm2,%ymm3,%ymm8 - vpxor %ymm0,%ymm4,%ymm4 - addl 32(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - vpxor %ymm5,%ymm4,%ymm4 - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - vpxor %ymm8,%ymm4,%ymm4 - addl %r12d,%ecx - xorl %ebp,%edx - addl 36(%r13),%ebx - vpsrld $30,%ymm4,%ymm8 - vpslld $2,%ymm4,%ymm4 - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - vpor %ymm8,%ymm4,%ymm4 - addl 40(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - vpaddd %ymm11,%ymm4,%ymm9 - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - addl 44(%r13),%eax - vmovdqu %ymm9,384(%rsp) - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - addl 64(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - vpalignr $8,%ymm3,%ymm4,%ymm8 - vpxor %ymm1,%ymm5,%ymm5 - addl 68(%r13),%edx - leal (%rdx,%rax,1),%edx - vpxor %ymm6,%ymm5,%ymm5 - rorxl 
$27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - vpxor %ymm8,%ymm5,%ymm5 - addl %r12d,%edx - xorl %ebx,%esi - addl 72(%r13),%ecx - vpsrld $30,%ymm5,%ymm8 - vpslld $2,%ymm5,%ymm5 - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - vpor %ymm8,%ymm5,%ymm5 - addl 76(%r13),%ebx - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - vpaddd %ymm11,%ymm5,%ymm9 - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - addl 96(%r13),%ebp - vmovdqu %ymm9,416(%rsp) - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - addl 100(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - vpalignr $8,%ymm4,%ymm5,%ymm8 - vpxor %ymm2,%ymm6,%ymm6 - addl 104(%r13),%esi - leal (%rsi,%rbp,1),%esi - vpxor %ymm7,%ymm6,%ymm6 - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - vpxor %ymm8,%ymm6,%ymm6 - addl %r12d,%esi - xorl %ecx,%eax - addl 108(%r13),%edx - leaq 256(%r13),%r13 - vpsrld $30,%ymm6,%ymm8 - vpslld $2,%ymm6,%ymm6 - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - vpor %ymm8,%ymm6,%ymm6 - addl -128(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - vpaddd %ymm11,%ymm6,%ymm9 - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl -124(%r13),%ebx - vmovdqu %ymm9,448(%rsp) - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - addl -120(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - vpalignr $8,%ymm5,%ymm6,%ymm8 - vpxor %ymm3,%ymm7,%ymm7 - addl -116(%r13),%eax - leal (%rax,%rbx,1),%eax - vpxor %ymm0,%ymm7,%ymm7 - vmovdqu 32(%r14),%ymm11 - rorxl $27,%ebp,%r12d - rorxl 
$2,%ebp,%ebx - xorl %ecx,%ebp - vpxor %ymm8,%ymm7,%ymm7 - addl %r12d,%eax - xorl %edx,%ebp - addl -96(%r13),%esi - vpsrld $30,%ymm7,%ymm8 - vpslld $2,%ymm7,%ymm7 - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - vpor %ymm8,%ymm7,%ymm7 - addl -92(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - vpaddd %ymm11,%ymm7,%ymm9 - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - addl -88(%r13),%ecx - vmovdqu %ymm9,480(%rsp) - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl -84(%r13),%ebx - movl %esi,%edi - xorl %eax,%edi - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - andl %edi,%ecx - jmp .Lalign32_2 -.align 32 -.Lalign32_2: - vpalignr $8,%ymm6,%ymm7,%ymm8 - vpxor %ymm4,%ymm0,%ymm0 - addl -64(%r13),%ebp - xorl %esi,%ecx - vpxor %ymm1,%ymm0,%ymm0 - movl %edx,%edi - xorl %esi,%edi - leal (%rcx,%rbp,1),%ebp - vpxor %ymm8,%ymm0,%ymm0 - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - vpsrld $30,%ymm0,%ymm8 - vpslld $2,%ymm0,%ymm0 - addl %r12d,%ebp - andl %edi,%ebx - addl -60(%r13),%eax - xorl %edx,%ebx - movl %ecx,%edi - xorl %edx,%edi - vpor %ymm8,%ymm0,%ymm0 - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - vpaddd %ymm11,%ymm0,%ymm9 - addl %r12d,%eax - andl %edi,%ebp - addl -56(%r13),%esi - xorl %ecx,%ebp - vmovdqu %ymm9,512(%rsp) - movl %ebx,%edi - xorl %ecx,%edi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - andl %edi,%eax - addl -52(%r13),%edx - xorl %ebx,%eax - movl %ebp,%edi - xorl %ebx,%edi - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - andl %edi,%esi - addl -32(%r13),%ecx - xorl %ebp,%esi - movl %eax,%edi - xorl %ebp,%edi - leal (%rcx,%rsi,1),%ecx - rorxl 
$27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - andl %edi,%edx - vpalignr $8,%ymm7,%ymm0,%ymm8 - vpxor %ymm5,%ymm1,%ymm1 - addl -28(%r13),%ebx - xorl %eax,%edx - vpxor %ymm2,%ymm1,%ymm1 - movl %esi,%edi - xorl %eax,%edi - leal (%rbx,%rdx,1),%ebx - vpxor %ymm8,%ymm1,%ymm1 - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - vpsrld $30,%ymm1,%ymm8 - vpslld $2,%ymm1,%ymm1 - addl %r12d,%ebx - andl %edi,%ecx - addl -24(%r13),%ebp - xorl %esi,%ecx - movl %edx,%edi - xorl %esi,%edi - vpor %ymm8,%ymm1,%ymm1 - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - vpaddd %ymm11,%ymm1,%ymm9 - addl %r12d,%ebp - andl %edi,%ebx - addl -20(%r13),%eax - xorl %edx,%ebx - vmovdqu %ymm9,544(%rsp) - movl %ecx,%edi - xorl %edx,%edi - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - andl %edi,%ebp - addl 0(%r13),%esi - xorl %ecx,%ebp - movl %ebx,%edi - xorl %ecx,%edi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - andl %edi,%eax - addl 4(%r13),%edx - xorl %ebx,%eax - movl %ebp,%edi - xorl %ebx,%edi - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - andl %edi,%esi - vpalignr $8,%ymm0,%ymm1,%ymm8 - vpxor %ymm6,%ymm2,%ymm2 - addl 8(%r13),%ecx - xorl %ebp,%esi - vpxor %ymm3,%ymm2,%ymm2 - movl %eax,%edi - xorl %ebp,%edi - leal (%rcx,%rsi,1),%ecx - vpxor %ymm8,%ymm2,%ymm2 - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - vpsrld $30,%ymm2,%ymm8 - vpslld $2,%ymm2,%ymm2 - addl %r12d,%ecx - andl %edi,%edx - addl 12(%r13),%ebx - xorl %eax,%edx - movl %esi,%edi - xorl %eax,%edi - vpor %ymm8,%ymm2,%ymm2 - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - vpaddd %ymm11,%ymm2,%ymm9 - addl %r12d,%ebx - andl %edi,%ecx - addl 32(%r13),%ebp - xorl %esi,%ecx - vmovdqu %ymm9,576(%rsp) - movl %edx,%edi - xorl %esi,%edi - 
leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - andl %edi,%ebx - addl 36(%r13),%eax - xorl %edx,%ebx - movl %ecx,%edi - xorl %edx,%edi - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - andl %edi,%ebp - addl 40(%r13),%esi - xorl %ecx,%ebp - movl %ebx,%edi - xorl %ecx,%edi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - andl %edi,%eax - vpalignr $8,%ymm1,%ymm2,%ymm8 - vpxor %ymm7,%ymm3,%ymm3 - addl 44(%r13),%edx - xorl %ebx,%eax - vpxor %ymm4,%ymm3,%ymm3 - movl %ebp,%edi - xorl %ebx,%edi - leal (%rdx,%rax,1),%edx - vpxor %ymm8,%ymm3,%ymm3 - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - vpsrld $30,%ymm3,%ymm8 - vpslld $2,%ymm3,%ymm3 - addl %r12d,%edx - andl %edi,%esi - addl 64(%r13),%ecx - xorl %ebp,%esi - movl %eax,%edi - xorl %ebp,%edi - vpor %ymm8,%ymm3,%ymm3 - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - vpaddd %ymm11,%ymm3,%ymm9 - addl %r12d,%ecx - andl %edi,%edx - addl 68(%r13),%ebx - xorl %eax,%edx - vmovdqu %ymm9,608(%rsp) - movl %esi,%edi - xorl %eax,%edi - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - andl %edi,%ecx - addl 72(%r13),%ebp - xorl %esi,%ecx - movl %edx,%edi - xorl %esi,%edi - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - andl %edi,%ebx - addl 76(%r13),%eax - xorl %edx,%ebx - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - addl 96(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - addl 100(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - addl 104(%r13),%ecx - leal 
(%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl 108(%r13),%ebx - leaq 256(%r13),%r13 - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - addl -128(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - addl -124(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - addl -120(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - addl -116(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - addl -96(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl -92(%r13),%ebx - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - addl -88(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - addl -84(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - addl -64(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - addl -60(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - addl -56(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl -52(%r13),%ebx - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - 
xorl %eax,%ecx - addl -32(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - addl -28(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - addl -24(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - addl -20(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - addl %r12d,%edx - leaq 128(%r9),%r13 - leaq 128(%r9),%rdi - cmpq %r10,%r13 - cmovaeq %r9,%r13 - - - addl 0(%r8),%edx - addl 4(%r8),%esi - addl 8(%r8),%ebp - movl %edx,0(%r8) - addl 12(%r8),%ebx - movl %esi,4(%r8) - movl %edx,%eax - addl 16(%r8),%ecx - movl %ebp,%r12d - movl %ebp,8(%r8) - movl %ebx,%edx - - movl %ebx,12(%r8) - movl %esi,%ebp - movl %ecx,16(%r8) - - movl %ecx,%esi - movl %r12d,%ecx - - - cmpq %r10,%r9 - je .Ldone_avx2 - vmovdqu 64(%r14),%ymm6 - cmpq %r10,%rdi - ja .Last_avx2 - - vmovdqu -64(%rdi),%xmm0 - vmovdqu -48(%rdi),%xmm1 - vmovdqu -32(%rdi),%xmm2 - vmovdqu -16(%rdi),%xmm3 - vinserti128 $1,0(%r13),%ymm0,%ymm0 - vinserti128 $1,16(%r13),%ymm1,%ymm1 - vinserti128 $1,32(%r13),%ymm2,%ymm2 - vinserti128 $1,48(%r13),%ymm3,%ymm3 - jmp .Last_avx2 - -.align 32 -.Last_avx2: - leaq 128+16(%rsp),%r13 - rorxl $2,%ebp,%ebx - andnl %edx,%ebp,%edi - andl %ecx,%ebp - xorl %edi,%ebp - subq $-128,%r9 - addl -128(%r13),%esi - andnl %ecx,%eax,%edi - addl %ebp,%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - andl %ebx,%eax - addl %r12d,%esi - xorl %edi,%eax - addl -124(%r13),%edx - andnl %ebx,%esi,%edi - addl %eax,%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - andl %ebp,%esi - addl %r12d,%edx - xorl %edi,%esi - addl -120(%r13),%ecx - andnl %ebp,%edx,%edi - addl %esi,%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - andl %eax,%edx - addl %r12d,%ecx - xorl %edi,%edx - addl -116(%r13),%ebx - andnl %eax,%ecx,%edi - addl %edx,%ebx - rorxl $27,%ecx,%r12d - 
rorxl $2,%ecx,%edx - andl %esi,%ecx - addl %r12d,%ebx - xorl %edi,%ecx - addl -96(%r13),%ebp - andnl %esi,%ebx,%edi - addl %ecx,%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - andl %edx,%ebx - addl %r12d,%ebp - xorl %edi,%ebx - addl -92(%r13),%eax - andnl %edx,%ebp,%edi - addl %ebx,%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - andl %ecx,%ebp - addl %r12d,%eax - xorl %edi,%ebp - addl -88(%r13),%esi - andnl %ecx,%eax,%edi - addl %ebp,%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - andl %ebx,%eax - addl %r12d,%esi - xorl %edi,%eax - addl -84(%r13),%edx - andnl %ebx,%esi,%edi - addl %eax,%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - andl %ebp,%esi - addl %r12d,%edx - xorl %edi,%esi - addl -64(%r13),%ecx - andnl %ebp,%edx,%edi - addl %esi,%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - andl %eax,%edx - addl %r12d,%ecx - xorl %edi,%edx - addl -60(%r13),%ebx - andnl %eax,%ecx,%edi - addl %edx,%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - andl %esi,%ecx - addl %r12d,%ebx - xorl %edi,%ecx - addl -56(%r13),%ebp - andnl %esi,%ebx,%edi - addl %ecx,%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - andl %edx,%ebx - addl %r12d,%ebp - xorl %edi,%ebx - addl -52(%r13),%eax - andnl %edx,%ebp,%edi - addl %ebx,%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - andl %ecx,%ebp - addl %r12d,%eax - xorl %edi,%ebp - addl -32(%r13),%esi - andnl %ecx,%eax,%edi - addl %ebp,%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - andl %ebx,%eax - addl %r12d,%esi - xorl %edi,%eax - addl -28(%r13),%edx - andnl %ebx,%esi,%edi - addl %eax,%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - andl %ebp,%esi - addl %r12d,%edx - xorl %edi,%esi - addl -24(%r13),%ecx - andnl %ebp,%edx,%edi - addl %esi,%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - andl %eax,%edx - addl %r12d,%ecx - xorl %edi,%edx - addl -20(%r13),%ebx - andnl %eax,%ecx,%edi - addl %edx,%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - andl %esi,%ecx - addl %r12d,%ebx - xorl %edi,%ecx - addl 0(%r13),%ebp - andnl 
%esi,%ebx,%edi - addl %ecx,%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - andl %edx,%ebx - addl %r12d,%ebp - xorl %edi,%ebx - addl 4(%r13),%eax - andnl %edx,%ebp,%edi - addl %ebx,%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - andl %ecx,%ebp - addl %r12d,%eax - xorl %edi,%ebp - addl 8(%r13),%esi - andnl %ecx,%eax,%edi - addl %ebp,%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - andl %ebx,%eax - addl %r12d,%esi - xorl %edi,%eax - addl 12(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - addl 32(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl 36(%r13),%ebx - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - addl 40(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - addl 44(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - addl 64(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - vmovdqu -64(%r14),%ymm11 - vpshufb %ymm6,%ymm0,%ymm0 - addl 68(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - addl 72(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl 76(%r13),%ebx - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - addl 96(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - addl 100(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - 
rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - vpshufb %ymm6,%ymm1,%ymm1 - vpaddd %ymm11,%ymm0,%ymm8 - addl 104(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - addl 108(%r13),%edx - leaq 256(%r13),%r13 - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - addl -128(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl -124(%r13),%ebx - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - addl -120(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - vmovdqu %ymm8,0(%rsp) - vpshufb %ymm6,%ymm2,%ymm2 - vpaddd %ymm11,%ymm1,%ymm9 - addl -116(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - addl -96(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - addl -92(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - addl -88(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl -84(%r13),%ebx - movl %esi,%edi - xorl %eax,%edi - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - andl %edi,%ecx - vmovdqu %ymm9,32(%rsp) - vpshufb %ymm6,%ymm3,%ymm3 - vpaddd %ymm11,%ymm2,%ymm6 - addl -64(%r13),%ebp - xorl %esi,%ecx - movl %edx,%edi - xorl %esi,%edi - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - andl %edi,%ebx - addl 
-60(%r13),%eax - xorl %edx,%ebx - movl %ecx,%edi - xorl %edx,%edi - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - andl %edi,%ebp - addl -56(%r13),%esi - xorl %ecx,%ebp - movl %ebx,%edi - xorl %ecx,%edi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - andl %edi,%eax - addl -52(%r13),%edx - xorl %ebx,%eax - movl %ebp,%edi - xorl %ebx,%edi - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - andl %edi,%esi - addl -32(%r13),%ecx - xorl %ebp,%esi - movl %eax,%edi - xorl %ebp,%edi - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - andl %edi,%edx - jmp .Lalign32_3 -.align 32 -.Lalign32_3: - vmovdqu %ymm6,64(%rsp) - vpaddd %ymm11,%ymm3,%ymm7 - addl -28(%r13),%ebx - xorl %eax,%edx - movl %esi,%edi - xorl %eax,%edi - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - andl %edi,%ecx - addl -24(%r13),%ebp - xorl %esi,%ecx - movl %edx,%edi - xorl %esi,%edi - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - andl %edi,%ebx - addl -20(%r13),%eax - xorl %edx,%ebx - movl %ecx,%edi - xorl %edx,%edi - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - andl %edi,%ebp - addl 0(%r13),%esi - xorl %ecx,%ebp - movl %ebx,%edi - xorl %ecx,%edi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - andl %edi,%eax - addl 4(%r13),%edx - xorl %ebx,%eax - movl %ebp,%edi - xorl %ebx,%edi - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - andl %edi,%esi - vmovdqu %ymm7,96(%rsp) - addl 8(%r13),%ecx - xorl %ebp,%esi - movl %eax,%edi - xorl %ebp,%edi - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl 
$2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - andl %edi,%edx - addl 12(%r13),%ebx - xorl %eax,%edx - movl %esi,%edi - xorl %eax,%edi - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - andl %edi,%ecx - addl 32(%r13),%ebp - xorl %esi,%ecx - movl %edx,%edi - xorl %esi,%edi - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - andl %edi,%ebx - addl 36(%r13),%eax - xorl %edx,%ebx - movl %ecx,%edi - xorl %edx,%edi - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - andl %edi,%ebp - addl 40(%r13),%esi - xorl %ecx,%ebp - movl %ebx,%edi - xorl %ecx,%edi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - andl %edi,%eax - vpalignr $8,%ymm0,%ymm1,%ymm4 - addl 44(%r13),%edx - xorl %ebx,%eax - movl %ebp,%edi - xorl %ebx,%edi - vpsrldq $4,%ymm3,%ymm8 - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - vpxor %ymm0,%ymm4,%ymm4 - vpxor %ymm2,%ymm8,%ymm8 - xorl %ebp,%esi - addl %r12d,%edx - vpxor %ymm8,%ymm4,%ymm4 - andl %edi,%esi - addl 64(%r13),%ecx - xorl %ebp,%esi - movl %eax,%edi - vpsrld $31,%ymm4,%ymm8 - xorl %ebp,%edi - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - vpslldq $12,%ymm4,%ymm10 - vpaddd %ymm4,%ymm4,%ymm4 - rorxl $2,%edx,%esi - xorl %eax,%edx - vpsrld $30,%ymm10,%ymm9 - vpor %ymm8,%ymm4,%ymm4 - addl %r12d,%ecx - andl %edi,%edx - vpslld $2,%ymm10,%ymm10 - vpxor %ymm9,%ymm4,%ymm4 - addl 68(%r13),%ebx - xorl %eax,%edx - vpxor %ymm10,%ymm4,%ymm4 - movl %esi,%edi - xorl %eax,%edi - leal (%rbx,%rdx,1),%ebx - vpaddd %ymm11,%ymm4,%ymm9 - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - vmovdqu %ymm9,128(%rsp) - addl %r12d,%ebx - andl %edi,%ecx - addl 72(%r13),%ebp - xorl %esi,%ecx - movl %edx,%edi - xorl %esi,%edi - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - 
andl %edi,%ebx - addl 76(%r13),%eax - xorl %edx,%ebx - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - vpalignr $8,%ymm1,%ymm2,%ymm5 - addl 96(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - vpsrldq $4,%ymm4,%ymm8 - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - vpxor %ymm1,%ymm5,%ymm5 - vpxor %ymm3,%ymm8,%ymm8 - addl 100(%r13),%edx - leal (%rdx,%rax,1),%edx - vpxor %ymm8,%ymm5,%ymm5 - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - xorl %ebp,%esi - addl %r12d,%edx - vpsrld $31,%ymm5,%ymm8 - vmovdqu -32(%r14),%ymm11 - xorl %ebx,%esi - addl 104(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - vpslldq $12,%ymm5,%ymm10 - vpaddd %ymm5,%ymm5,%ymm5 - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - vpsrld $30,%ymm10,%ymm9 - vpor %ymm8,%ymm5,%ymm5 - xorl %eax,%edx - addl %r12d,%ecx - vpslld $2,%ymm10,%ymm10 - vpxor %ymm9,%ymm5,%ymm5 - xorl %ebp,%edx - addl 108(%r13),%ebx - leaq 256(%r13),%r13 - vpxor %ymm10,%ymm5,%ymm5 - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - vpaddd %ymm11,%ymm5,%ymm9 - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - vmovdqu %ymm9,160(%rsp) - addl -128(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - vpalignr $8,%ymm2,%ymm3,%ymm6 - addl -124(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - vpsrldq $4,%ymm5,%ymm8 - xorl %ecx,%ebp - addl %r12d,%eax - xorl %edx,%ebp - vpxor %ymm2,%ymm6,%ymm6 - vpxor %ymm4,%ymm8,%ymm8 - addl -120(%r13),%esi - leal (%rsi,%rbp,1),%esi - vpxor %ymm8,%ymm6,%ymm6 - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - vpsrld $31,%ymm6,%ymm8 - xorl %ecx,%eax - addl -116(%r13),%edx - leal (%rdx,%rax,1),%edx - vpslldq $12,%ymm6,%ymm10 - vpaddd %ymm6,%ymm6,%ymm6 - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - vpsrld $30,%ymm10,%ymm9 - vpor %ymm8,%ymm6,%ymm6 - xorl 
%ebp,%esi - addl %r12d,%edx - vpslld $2,%ymm10,%ymm10 - vpxor %ymm9,%ymm6,%ymm6 - xorl %ebx,%esi - addl -96(%r13),%ecx - vpxor %ymm10,%ymm6,%ymm6 - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - vpaddd %ymm11,%ymm6,%ymm9 - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - vmovdqu %ymm9,192(%rsp) - addl -92(%r13),%ebx - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - vpalignr $8,%ymm3,%ymm4,%ymm7 - addl -88(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - vpsrldq $4,%ymm6,%ymm8 - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - vpxor %ymm3,%ymm7,%ymm7 - vpxor %ymm5,%ymm8,%ymm8 - addl -84(%r13),%eax - leal (%rax,%rbx,1),%eax - vpxor %ymm8,%ymm7,%ymm7 - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax - vpsrld $31,%ymm7,%ymm8 - xorl %edx,%ebp - addl -64(%r13),%esi - leal (%rsi,%rbp,1),%esi - vpslldq $12,%ymm7,%ymm10 - vpaddd %ymm7,%ymm7,%ymm7 - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - vpsrld $30,%ymm10,%ymm9 - vpor %ymm8,%ymm7,%ymm7 - xorl %ebx,%eax - addl %r12d,%esi - vpslld $2,%ymm10,%ymm10 - vpxor %ymm9,%ymm7,%ymm7 - xorl %ecx,%eax - addl -60(%r13),%edx - vpxor %ymm10,%ymm7,%ymm7 - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - rorxl $2,%esi,%eax - vpaddd %ymm11,%ymm7,%ymm9 - xorl %ebp,%esi - addl %r12d,%edx - xorl %ebx,%esi - vmovdqu %ymm9,224(%rsp) - addl -56(%r13),%ecx - leal (%rcx,%rsi,1),%ecx - rorxl $27,%edx,%r12d - rorxl $2,%edx,%esi - xorl %eax,%edx - addl %r12d,%ecx - xorl %ebp,%edx - addl -52(%r13),%ebx - leal (%rbx,%rdx,1),%ebx - rorxl $27,%ecx,%r12d - rorxl $2,%ecx,%edx - xorl %esi,%ecx - addl %r12d,%ebx - xorl %eax,%ecx - addl -32(%r13),%ebp - leal (%rcx,%rbp,1),%ebp - rorxl $27,%ebx,%r12d - rorxl $2,%ebx,%ecx - xorl %edx,%ebx - addl %r12d,%ebp - xorl %esi,%ebx - addl -28(%r13),%eax - leal (%rax,%rbx,1),%eax - rorxl $27,%ebp,%r12d - rorxl $2,%ebp,%ebx - xorl %ecx,%ebp - addl %r12d,%eax 
- xorl %edx,%ebp - addl -24(%r13),%esi - leal (%rsi,%rbp,1),%esi - rorxl $27,%eax,%r12d - rorxl $2,%eax,%ebp - xorl %ebx,%eax - addl %r12d,%esi - xorl %ecx,%eax - addl -20(%r13),%edx - leal (%rdx,%rax,1),%edx - rorxl $27,%esi,%r12d - addl %r12d,%edx - leaq 128(%rsp),%r13 - - - addl 0(%r8),%edx - addl 4(%r8),%esi - addl 8(%r8),%ebp - movl %edx,0(%r8) - addl 12(%r8),%ebx - movl %esi,4(%r8) - movl %edx,%eax - addl 16(%r8),%ecx - movl %ebp,%r12d - movl %ebp,8(%r8) - movl %ebx,%edx - - movl %ebx,12(%r8) - movl %esi,%ebp - movl %ecx,16(%r8) - - movl %ecx,%esi - movl %r12d,%ecx - - - cmpq %r10,%r9 - jbe .Loop_avx2 - -.Ldone_avx2: - vzeroupper - movq -40(%r11),%r14 -.cfi_restore %r14 - movq -32(%r11),%r13 -.cfi_restore %r13 - movq -24(%r11),%r12 -.cfi_restore %r12 - movq -16(%r11),%rbp -.cfi_restore %rbp - movq -8(%r11),%rbx -.cfi_restore %rbx - leaq (%r11),%rsp -.cfi_def_cfa_register %rsp -.Lepilogue_avx2: - .byte 0xf3,0xc3 -.cfi_endproc -.size sha1_block_data_order_avx2,.-sha1_block_data_order_avx2 -.align 64 -K_XX_XX: -.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 -.long 0x5a827999,0x5a827999,0x5a827999,0x5a827999 -.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 -.long 0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1 -.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc -.long 0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc -.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 -.long 0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6 -.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f -.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f -.byte 0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0 -.byte 83,72,65,49,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 64 -#endif -.section .note.GNU-stack,"",@progbits diff --git a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha256-x86_64.S 
b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha256-x86_64.S deleted file mode 100644 index 6ce216f2..00000000 --- a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha256-x86_64.S +++ /dev/null @@ -1,4184 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text - -.extern OPENSSL_ia32cap_P -.hidden OPENSSL_ia32cap_P -.globl sha256_block_data_order -.hidden sha256_block_data_order -.type sha256_block_data_order,@function -.align 16 -sha256_block_data_order: -.cfi_startproc - leaq OPENSSL_ia32cap_P(%rip),%r11 - movl 0(%r11),%r9d - movl 4(%r11),%r10d - movl 8(%r11),%r11d - testl $536870912,%r11d - jnz .Lshaext_shortcut - andl $1073741824,%r9d - andl $268435968,%r10d - orl %r9d,%r10d - cmpl $1342177792,%r10d - je .Lavx_shortcut - testl $512,%r10d - jnz .Lssse3_shortcut - movq %rsp,%rax -.cfi_def_cfa_register %rax - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 - shlq $4,%rdx - subq $64+32,%rsp - leaq (%rsi,%rdx,4),%rdx - andq $-64,%rsp - movq %rdi,64+0(%rsp) - movq %rsi,64+8(%rsp) - movq %rdx,64+16(%rsp) - movq %rax,88(%rsp) -.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 -.Lprologue: - - movl 0(%rdi),%eax - movl 4(%rdi),%ebx - movl 8(%rdi),%ecx - movl 12(%rdi),%edx - movl 16(%rdi),%r8d - movl 20(%rdi),%r9d - movl 24(%rdi),%r10d - movl 28(%rdi),%r11d - jmp .Lloop - -.align 16 -.Lloop: - movl %ebx,%edi - leaq K256(%rip),%rbp - xorl %ecx,%edi - movl 0(%rsi),%r12d - movl %r8d,%r13d - movl %eax,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r9d,%r15d - - xorl %r8d,%r13d - rorl $9,%r14d 
- xorl %r10d,%r15d - - movl %r12d,0(%rsp) - xorl %eax,%r14d - andl %r8d,%r15d - - rorl $5,%r13d - addl %r11d,%r12d - xorl %r10d,%r15d - - rorl $11,%r14d - xorl %r8d,%r13d - addl %r15d,%r12d - - movl %eax,%r15d - addl (%rbp),%r12d - xorl %eax,%r14d - - xorl %ebx,%r15d - rorl $6,%r13d - movl %ebx,%r11d - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%r11d - addl %r12d,%edx - addl %r12d,%r11d - - leaq 4(%rbp),%rbp - addl %r14d,%r11d - movl 4(%rsi),%r12d - movl %edx,%r13d - movl %r11d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r8d,%edi - - xorl %edx,%r13d - rorl $9,%r14d - xorl %r9d,%edi - - movl %r12d,4(%rsp) - xorl %r11d,%r14d - andl %edx,%edi - - rorl $5,%r13d - addl %r10d,%r12d - xorl %r9d,%edi - - rorl $11,%r14d - xorl %edx,%r13d - addl %edi,%r12d - - movl %r11d,%edi - addl (%rbp),%r12d - xorl %r11d,%r14d - - xorl %eax,%edi - rorl $6,%r13d - movl %eax,%r10d - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%r10d - addl %r12d,%ecx - addl %r12d,%r10d - - leaq 4(%rbp),%rbp - addl %r14d,%r10d - movl 8(%rsi),%r12d - movl %ecx,%r13d - movl %r10d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %edx,%r15d - - xorl %ecx,%r13d - rorl $9,%r14d - xorl %r8d,%r15d - - movl %r12d,8(%rsp) - xorl %r10d,%r14d - andl %ecx,%r15d - - rorl $5,%r13d - addl %r9d,%r12d - xorl %r8d,%r15d - - rorl $11,%r14d - xorl %ecx,%r13d - addl %r15d,%r12d - - movl %r10d,%r15d - addl (%rbp),%r12d - xorl %r10d,%r14d - - xorl %r11d,%r15d - rorl $6,%r13d - movl %r11d,%r9d - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%r9d - addl %r12d,%ebx - addl %r12d,%r9d - - leaq 4(%rbp),%rbp - addl %r14d,%r9d - movl 12(%rsi),%r12d - movl %ebx,%r13d - movl %r9d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %ecx,%edi - - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%edi - - movl %r12d,12(%rsp) - xorl %r9d,%r14d - andl %ebx,%edi - - rorl $5,%r13d - addl %r8d,%r12d - xorl %edx,%edi - - rorl $11,%r14d - xorl %ebx,%r13d - addl %edi,%r12d - - movl %r9d,%edi - addl 
(%rbp),%r12d - xorl %r9d,%r14d - - xorl %r10d,%edi - rorl $6,%r13d - movl %r10d,%r8d - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%r8d - addl %r12d,%eax - addl %r12d,%r8d - - leaq 20(%rbp),%rbp - addl %r14d,%r8d - movl 16(%rsi),%r12d - movl %eax,%r13d - movl %r8d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %ebx,%r15d - - xorl %eax,%r13d - rorl $9,%r14d - xorl %ecx,%r15d - - movl %r12d,16(%rsp) - xorl %r8d,%r14d - andl %eax,%r15d - - rorl $5,%r13d - addl %edx,%r12d - xorl %ecx,%r15d - - rorl $11,%r14d - xorl %eax,%r13d - addl %r15d,%r12d - - movl %r8d,%r15d - addl (%rbp),%r12d - xorl %r8d,%r14d - - xorl %r9d,%r15d - rorl $6,%r13d - movl %r9d,%edx - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%edx - addl %r12d,%r11d - addl %r12d,%edx - - leaq 4(%rbp),%rbp - addl %r14d,%edx - movl 20(%rsi),%r12d - movl %r11d,%r13d - movl %edx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %eax,%edi - - xorl %r11d,%r13d - rorl $9,%r14d - xorl %ebx,%edi - - movl %r12d,20(%rsp) - xorl %edx,%r14d - andl %r11d,%edi - - rorl $5,%r13d - addl %ecx,%r12d - xorl %ebx,%edi - - rorl $11,%r14d - xorl %r11d,%r13d - addl %edi,%r12d - - movl %edx,%edi - addl (%rbp),%r12d - xorl %edx,%r14d - - xorl %r8d,%edi - rorl $6,%r13d - movl %r8d,%ecx - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%ecx - addl %r12d,%r10d - addl %r12d,%ecx - - leaq 4(%rbp),%rbp - addl %r14d,%ecx - movl 24(%rsi),%r12d - movl %r10d,%r13d - movl %ecx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r11d,%r15d - - xorl %r10d,%r13d - rorl $9,%r14d - xorl %eax,%r15d - - movl %r12d,24(%rsp) - xorl %ecx,%r14d - andl %r10d,%r15d - - rorl $5,%r13d - addl %ebx,%r12d - xorl %eax,%r15d - - rorl $11,%r14d - xorl %r10d,%r13d - addl %r15d,%r12d - - movl %ecx,%r15d - addl (%rbp),%r12d - xorl %ecx,%r14d - - xorl %edx,%r15d - rorl $6,%r13d - movl %edx,%ebx - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%ebx - addl %r12d,%r9d - addl %r12d,%ebx - - leaq 
4(%rbp),%rbp - addl %r14d,%ebx - movl 28(%rsi),%r12d - movl %r9d,%r13d - movl %ebx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r10d,%edi - - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%edi - - movl %r12d,28(%rsp) - xorl %ebx,%r14d - andl %r9d,%edi - - rorl $5,%r13d - addl %eax,%r12d - xorl %r11d,%edi - - rorl $11,%r14d - xorl %r9d,%r13d - addl %edi,%r12d - - movl %ebx,%edi - addl (%rbp),%r12d - xorl %ebx,%r14d - - xorl %ecx,%edi - rorl $6,%r13d - movl %ecx,%eax - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%eax - addl %r12d,%r8d - addl %r12d,%eax - - leaq 20(%rbp),%rbp - addl %r14d,%eax - movl 32(%rsi),%r12d - movl %r8d,%r13d - movl %eax,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r9d,%r15d - - xorl %r8d,%r13d - rorl $9,%r14d - xorl %r10d,%r15d - - movl %r12d,32(%rsp) - xorl %eax,%r14d - andl %r8d,%r15d - - rorl $5,%r13d - addl %r11d,%r12d - xorl %r10d,%r15d - - rorl $11,%r14d - xorl %r8d,%r13d - addl %r15d,%r12d - - movl %eax,%r15d - addl (%rbp),%r12d - xorl %eax,%r14d - - xorl %ebx,%r15d - rorl $6,%r13d - movl %ebx,%r11d - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%r11d - addl %r12d,%edx - addl %r12d,%r11d - - leaq 4(%rbp),%rbp - addl %r14d,%r11d - movl 36(%rsi),%r12d - movl %edx,%r13d - movl %r11d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r8d,%edi - - xorl %edx,%r13d - rorl $9,%r14d - xorl %r9d,%edi - - movl %r12d,36(%rsp) - xorl %r11d,%r14d - andl %edx,%edi - - rorl $5,%r13d - addl %r10d,%r12d - xorl %r9d,%edi - - rorl $11,%r14d - xorl %edx,%r13d - addl %edi,%r12d - - movl %r11d,%edi - addl (%rbp),%r12d - xorl %r11d,%r14d - - xorl %eax,%edi - rorl $6,%r13d - movl %eax,%r10d - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%r10d - addl %r12d,%ecx - addl %r12d,%r10d - - leaq 4(%rbp),%rbp - addl %r14d,%r10d - movl 40(%rsi),%r12d - movl %ecx,%r13d - movl %r10d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %edx,%r15d - - xorl %ecx,%r13d - rorl $9,%r14d - xorl %r8d,%r15d - - movl 
%r12d,40(%rsp) - xorl %r10d,%r14d - andl %ecx,%r15d - - rorl $5,%r13d - addl %r9d,%r12d - xorl %r8d,%r15d - - rorl $11,%r14d - xorl %ecx,%r13d - addl %r15d,%r12d - - movl %r10d,%r15d - addl (%rbp),%r12d - xorl %r10d,%r14d - - xorl %r11d,%r15d - rorl $6,%r13d - movl %r11d,%r9d - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%r9d - addl %r12d,%ebx - addl %r12d,%r9d - - leaq 4(%rbp),%rbp - addl %r14d,%r9d - movl 44(%rsi),%r12d - movl %ebx,%r13d - movl %r9d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %ecx,%edi - - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%edi - - movl %r12d,44(%rsp) - xorl %r9d,%r14d - andl %ebx,%edi - - rorl $5,%r13d - addl %r8d,%r12d - xorl %edx,%edi - - rorl $11,%r14d - xorl %ebx,%r13d - addl %edi,%r12d - - movl %r9d,%edi - addl (%rbp),%r12d - xorl %r9d,%r14d - - xorl %r10d,%edi - rorl $6,%r13d - movl %r10d,%r8d - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%r8d - addl %r12d,%eax - addl %r12d,%r8d - - leaq 20(%rbp),%rbp - addl %r14d,%r8d - movl 48(%rsi),%r12d - movl %eax,%r13d - movl %r8d,%r14d - bswapl %r12d - rorl $14,%r13d - movl %ebx,%r15d - - xorl %eax,%r13d - rorl $9,%r14d - xorl %ecx,%r15d - - movl %r12d,48(%rsp) - xorl %r8d,%r14d - andl %eax,%r15d - - rorl $5,%r13d - addl %edx,%r12d - xorl %ecx,%r15d - - rorl $11,%r14d - xorl %eax,%r13d - addl %r15d,%r12d - - movl %r8d,%r15d - addl (%rbp),%r12d - xorl %r8d,%r14d - - xorl %r9d,%r15d - rorl $6,%r13d - movl %r9d,%edx - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%edx - addl %r12d,%r11d - addl %r12d,%edx - - leaq 4(%rbp),%rbp - addl %r14d,%edx - movl 52(%rsi),%r12d - movl %r11d,%r13d - movl %edx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %eax,%edi - - xorl %r11d,%r13d - rorl $9,%r14d - xorl %ebx,%edi - - movl %r12d,52(%rsp) - xorl %edx,%r14d - andl %r11d,%edi - - rorl $5,%r13d - addl %ecx,%r12d - xorl %ebx,%edi - - rorl $11,%r14d - xorl %r11d,%r13d - addl %edi,%r12d - - movl %edx,%edi - addl (%rbp),%r12d - xorl %edx,%r14d 
- - xorl %r8d,%edi - rorl $6,%r13d - movl %r8d,%ecx - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%ecx - addl %r12d,%r10d - addl %r12d,%ecx - - leaq 4(%rbp),%rbp - addl %r14d,%ecx - movl 56(%rsi),%r12d - movl %r10d,%r13d - movl %ecx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r11d,%r15d - - xorl %r10d,%r13d - rorl $9,%r14d - xorl %eax,%r15d - - movl %r12d,56(%rsp) - xorl %ecx,%r14d - andl %r10d,%r15d - - rorl $5,%r13d - addl %ebx,%r12d - xorl %eax,%r15d - - rorl $11,%r14d - xorl %r10d,%r13d - addl %r15d,%r12d - - movl %ecx,%r15d - addl (%rbp),%r12d - xorl %ecx,%r14d - - xorl %edx,%r15d - rorl $6,%r13d - movl %edx,%ebx - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%ebx - addl %r12d,%r9d - addl %r12d,%ebx - - leaq 4(%rbp),%rbp - addl %r14d,%ebx - movl 60(%rsi),%r12d - movl %r9d,%r13d - movl %ebx,%r14d - bswapl %r12d - rorl $14,%r13d - movl %r10d,%edi - - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%edi - - movl %r12d,60(%rsp) - xorl %ebx,%r14d - andl %r9d,%edi - - rorl $5,%r13d - addl %eax,%r12d - xorl %r11d,%edi - - rorl $11,%r14d - xorl %r9d,%r13d - addl %edi,%r12d - - movl %ebx,%edi - addl (%rbp),%r12d - xorl %ebx,%r14d - - xorl %ecx,%edi - rorl $6,%r13d - movl %ecx,%eax - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%eax - addl %r12d,%r8d - addl %r12d,%eax - - leaq 20(%rbp),%rbp - jmp .Lrounds_16_xx -.align 16 -.Lrounds_16_xx: - movl 4(%rsp),%r13d - movl 56(%rsp),%r15d - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%eax - movl %r15d,%r14d - rorl $2,%r15d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%r15d - shrl $10,%r14d - - rorl $17,%r15d - xorl %r13d,%r12d - xorl %r14d,%r15d - addl 36(%rsp),%r12d - - addl 0(%rsp),%r12d - movl %r8d,%r13d - addl %r15d,%r12d - movl %eax,%r14d - rorl $14,%r13d - movl %r9d,%r15d - - xorl %r8d,%r13d - rorl $9,%r14d - xorl %r10d,%r15d - - movl %r12d,0(%rsp) - xorl %eax,%r14d - andl %r8d,%r15d - - rorl $5,%r13d - addl %r11d,%r12d - xorl 
%r10d,%r15d - - rorl $11,%r14d - xorl %r8d,%r13d - addl %r15d,%r12d - - movl %eax,%r15d - addl (%rbp),%r12d - xorl %eax,%r14d - - xorl %ebx,%r15d - rorl $6,%r13d - movl %ebx,%r11d - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%r11d - addl %r12d,%edx - addl %r12d,%r11d - - leaq 4(%rbp),%rbp - movl 8(%rsp),%r13d - movl 60(%rsp),%edi - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%r11d - movl %edi,%r14d - rorl $2,%edi - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%edi - shrl $10,%r14d - - rorl $17,%edi - xorl %r13d,%r12d - xorl %r14d,%edi - addl 40(%rsp),%r12d - - addl 4(%rsp),%r12d - movl %edx,%r13d - addl %edi,%r12d - movl %r11d,%r14d - rorl $14,%r13d - movl %r8d,%edi - - xorl %edx,%r13d - rorl $9,%r14d - xorl %r9d,%edi - - movl %r12d,4(%rsp) - xorl %r11d,%r14d - andl %edx,%edi - - rorl $5,%r13d - addl %r10d,%r12d - xorl %r9d,%edi - - rorl $11,%r14d - xorl %edx,%r13d - addl %edi,%r12d - - movl %r11d,%edi - addl (%rbp),%r12d - xorl %r11d,%r14d - - xorl %eax,%edi - rorl $6,%r13d - movl %eax,%r10d - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%r10d - addl %r12d,%ecx - addl %r12d,%r10d - - leaq 4(%rbp),%rbp - movl 12(%rsp),%r13d - movl 0(%rsp),%r15d - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%r10d - movl %r15d,%r14d - rorl $2,%r15d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%r15d - shrl $10,%r14d - - rorl $17,%r15d - xorl %r13d,%r12d - xorl %r14d,%r15d - addl 44(%rsp),%r12d - - addl 8(%rsp),%r12d - movl %ecx,%r13d - addl %r15d,%r12d - movl %r10d,%r14d - rorl $14,%r13d - movl %edx,%r15d - - xorl %ecx,%r13d - rorl $9,%r14d - xorl %r8d,%r15d - - movl %r12d,8(%rsp) - xorl %r10d,%r14d - andl %ecx,%r15d - - rorl $5,%r13d - addl %r9d,%r12d - xorl %r8d,%r15d - - rorl $11,%r14d - xorl %ecx,%r13d - addl %r15d,%r12d - - movl %r10d,%r15d - addl (%rbp),%r12d - xorl %r10d,%r14d - - xorl %r11d,%r15d - rorl $6,%r13d - movl %r11d,%r9d - - andl %r15d,%edi - rorl $2,%r14d - addl 
%r13d,%r12d - - xorl %edi,%r9d - addl %r12d,%ebx - addl %r12d,%r9d - - leaq 4(%rbp),%rbp - movl 16(%rsp),%r13d - movl 4(%rsp),%edi - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%r9d - movl %edi,%r14d - rorl $2,%edi - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%edi - shrl $10,%r14d - - rorl $17,%edi - xorl %r13d,%r12d - xorl %r14d,%edi - addl 48(%rsp),%r12d - - addl 12(%rsp),%r12d - movl %ebx,%r13d - addl %edi,%r12d - movl %r9d,%r14d - rorl $14,%r13d - movl %ecx,%edi - - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%edi - - movl %r12d,12(%rsp) - xorl %r9d,%r14d - andl %ebx,%edi - - rorl $5,%r13d - addl %r8d,%r12d - xorl %edx,%edi - - rorl $11,%r14d - xorl %ebx,%r13d - addl %edi,%r12d - - movl %r9d,%edi - addl (%rbp),%r12d - xorl %r9d,%r14d - - xorl %r10d,%edi - rorl $6,%r13d - movl %r10d,%r8d - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%r8d - addl %r12d,%eax - addl %r12d,%r8d - - leaq 20(%rbp),%rbp - movl 20(%rsp),%r13d - movl 8(%rsp),%r15d - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%r8d - movl %r15d,%r14d - rorl $2,%r15d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%r15d - shrl $10,%r14d - - rorl $17,%r15d - xorl %r13d,%r12d - xorl %r14d,%r15d - addl 52(%rsp),%r12d - - addl 16(%rsp),%r12d - movl %eax,%r13d - addl %r15d,%r12d - movl %r8d,%r14d - rorl $14,%r13d - movl %ebx,%r15d - - xorl %eax,%r13d - rorl $9,%r14d - xorl %ecx,%r15d - - movl %r12d,16(%rsp) - xorl %r8d,%r14d - andl %eax,%r15d - - rorl $5,%r13d - addl %edx,%r12d - xorl %ecx,%r15d - - rorl $11,%r14d - xorl %eax,%r13d - addl %r15d,%r12d - - movl %r8d,%r15d - addl (%rbp),%r12d - xorl %r8d,%r14d - - xorl %r9d,%r15d - rorl $6,%r13d - movl %r9d,%edx - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%edx - addl %r12d,%r11d - addl %r12d,%edx - - leaq 4(%rbp),%rbp - movl 24(%rsp),%r13d - movl 12(%rsp),%edi - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%edx - movl %edi,%r14d - rorl $2,%edi - - xorl %r12d,%r13d 
- shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%edi - shrl $10,%r14d - - rorl $17,%edi - xorl %r13d,%r12d - xorl %r14d,%edi - addl 56(%rsp),%r12d - - addl 20(%rsp),%r12d - movl %r11d,%r13d - addl %edi,%r12d - movl %edx,%r14d - rorl $14,%r13d - movl %eax,%edi - - xorl %r11d,%r13d - rorl $9,%r14d - xorl %ebx,%edi - - movl %r12d,20(%rsp) - xorl %edx,%r14d - andl %r11d,%edi - - rorl $5,%r13d - addl %ecx,%r12d - xorl %ebx,%edi - - rorl $11,%r14d - xorl %r11d,%r13d - addl %edi,%r12d - - movl %edx,%edi - addl (%rbp),%r12d - xorl %edx,%r14d - - xorl %r8d,%edi - rorl $6,%r13d - movl %r8d,%ecx - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%ecx - addl %r12d,%r10d - addl %r12d,%ecx - - leaq 4(%rbp),%rbp - movl 28(%rsp),%r13d - movl 16(%rsp),%r15d - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%ecx - movl %r15d,%r14d - rorl $2,%r15d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%r15d - shrl $10,%r14d - - rorl $17,%r15d - xorl %r13d,%r12d - xorl %r14d,%r15d - addl 60(%rsp),%r12d - - addl 24(%rsp),%r12d - movl %r10d,%r13d - addl %r15d,%r12d - movl %ecx,%r14d - rorl $14,%r13d - movl %r11d,%r15d - - xorl %r10d,%r13d - rorl $9,%r14d - xorl %eax,%r15d - - movl %r12d,24(%rsp) - xorl %ecx,%r14d - andl %r10d,%r15d - - rorl $5,%r13d - addl %ebx,%r12d - xorl %eax,%r15d - - rorl $11,%r14d - xorl %r10d,%r13d - addl %r15d,%r12d - - movl %ecx,%r15d - addl (%rbp),%r12d - xorl %ecx,%r14d - - xorl %edx,%r15d - rorl $6,%r13d - movl %edx,%ebx - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%ebx - addl %r12d,%r9d - addl %r12d,%ebx - - leaq 4(%rbp),%rbp - movl 32(%rsp),%r13d - movl 20(%rsp),%edi - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%ebx - movl %edi,%r14d - rorl $2,%edi - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%edi - shrl $10,%r14d - - rorl $17,%edi - xorl %r13d,%r12d - xorl %r14d,%edi - addl 0(%rsp),%r12d - - addl 28(%rsp),%r12d - movl %r9d,%r13d - addl %edi,%r12d - movl %ebx,%r14d - rorl 
$14,%r13d - movl %r10d,%edi - - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%edi - - movl %r12d,28(%rsp) - xorl %ebx,%r14d - andl %r9d,%edi - - rorl $5,%r13d - addl %eax,%r12d - xorl %r11d,%edi - - rorl $11,%r14d - xorl %r9d,%r13d - addl %edi,%r12d - - movl %ebx,%edi - addl (%rbp),%r12d - xorl %ebx,%r14d - - xorl %ecx,%edi - rorl $6,%r13d - movl %ecx,%eax - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%eax - addl %r12d,%r8d - addl %r12d,%eax - - leaq 20(%rbp),%rbp - movl 36(%rsp),%r13d - movl 24(%rsp),%r15d - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%eax - movl %r15d,%r14d - rorl $2,%r15d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%r15d - shrl $10,%r14d - - rorl $17,%r15d - xorl %r13d,%r12d - xorl %r14d,%r15d - addl 4(%rsp),%r12d - - addl 32(%rsp),%r12d - movl %r8d,%r13d - addl %r15d,%r12d - movl %eax,%r14d - rorl $14,%r13d - movl %r9d,%r15d - - xorl %r8d,%r13d - rorl $9,%r14d - xorl %r10d,%r15d - - movl %r12d,32(%rsp) - xorl %eax,%r14d - andl %r8d,%r15d - - rorl $5,%r13d - addl %r11d,%r12d - xorl %r10d,%r15d - - rorl $11,%r14d - xorl %r8d,%r13d - addl %r15d,%r12d - - movl %eax,%r15d - addl (%rbp),%r12d - xorl %eax,%r14d - - xorl %ebx,%r15d - rorl $6,%r13d - movl %ebx,%r11d - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%r11d - addl %r12d,%edx - addl %r12d,%r11d - - leaq 4(%rbp),%rbp - movl 40(%rsp),%r13d - movl 28(%rsp),%edi - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%r11d - movl %edi,%r14d - rorl $2,%edi - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%edi - shrl $10,%r14d - - rorl $17,%edi - xorl %r13d,%r12d - xorl %r14d,%edi - addl 8(%rsp),%r12d - - addl 36(%rsp),%r12d - movl %edx,%r13d - addl %edi,%r12d - movl %r11d,%r14d - rorl $14,%r13d - movl %r8d,%edi - - xorl %edx,%r13d - rorl $9,%r14d - xorl %r9d,%edi - - movl %r12d,36(%rsp) - xorl %r11d,%r14d - andl %edx,%edi - - rorl $5,%r13d - addl %r10d,%r12d - xorl %r9d,%edi - - rorl $11,%r14d - xorl %edx,%r13d - 
addl %edi,%r12d - - movl %r11d,%edi - addl (%rbp),%r12d - xorl %r11d,%r14d - - xorl %eax,%edi - rorl $6,%r13d - movl %eax,%r10d - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%r10d - addl %r12d,%ecx - addl %r12d,%r10d - - leaq 4(%rbp),%rbp - movl 44(%rsp),%r13d - movl 32(%rsp),%r15d - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%r10d - movl %r15d,%r14d - rorl $2,%r15d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%r15d - shrl $10,%r14d - - rorl $17,%r15d - xorl %r13d,%r12d - xorl %r14d,%r15d - addl 12(%rsp),%r12d - - addl 40(%rsp),%r12d - movl %ecx,%r13d - addl %r15d,%r12d - movl %r10d,%r14d - rorl $14,%r13d - movl %edx,%r15d - - xorl %ecx,%r13d - rorl $9,%r14d - xorl %r8d,%r15d - - movl %r12d,40(%rsp) - xorl %r10d,%r14d - andl %ecx,%r15d - - rorl $5,%r13d - addl %r9d,%r12d - xorl %r8d,%r15d - - rorl $11,%r14d - xorl %ecx,%r13d - addl %r15d,%r12d - - movl %r10d,%r15d - addl (%rbp),%r12d - xorl %r10d,%r14d - - xorl %r11d,%r15d - rorl $6,%r13d - movl %r11d,%r9d - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%r9d - addl %r12d,%ebx - addl %r12d,%r9d - - leaq 4(%rbp),%rbp - movl 48(%rsp),%r13d - movl 36(%rsp),%edi - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%r9d - movl %edi,%r14d - rorl $2,%edi - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%edi - shrl $10,%r14d - - rorl $17,%edi - xorl %r13d,%r12d - xorl %r14d,%edi - addl 16(%rsp),%r12d - - addl 44(%rsp),%r12d - movl %ebx,%r13d - addl %edi,%r12d - movl %r9d,%r14d - rorl $14,%r13d - movl %ecx,%edi - - xorl %ebx,%r13d - rorl $9,%r14d - xorl %edx,%edi - - movl %r12d,44(%rsp) - xorl %r9d,%r14d - andl %ebx,%edi - - rorl $5,%r13d - addl %r8d,%r12d - xorl %edx,%edi - - rorl $11,%r14d - xorl %ebx,%r13d - addl %edi,%r12d - - movl %r9d,%edi - addl (%rbp),%r12d - xorl %r9d,%r14d - - xorl %r10d,%edi - rorl $6,%r13d - movl %r10d,%r8d - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%r8d - addl %r12d,%eax - addl 
%r12d,%r8d - - leaq 20(%rbp),%rbp - movl 52(%rsp),%r13d - movl 40(%rsp),%r15d - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%r8d - movl %r15d,%r14d - rorl $2,%r15d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%r15d - shrl $10,%r14d - - rorl $17,%r15d - xorl %r13d,%r12d - xorl %r14d,%r15d - addl 20(%rsp),%r12d - - addl 48(%rsp),%r12d - movl %eax,%r13d - addl %r15d,%r12d - movl %r8d,%r14d - rorl $14,%r13d - movl %ebx,%r15d - - xorl %eax,%r13d - rorl $9,%r14d - xorl %ecx,%r15d - - movl %r12d,48(%rsp) - xorl %r8d,%r14d - andl %eax,%r15d - - rorl $5,%r13d - addl %edx,%r12d - xorl %ecx,%r15d - - rorl $11,%r14d - xorl %eax,%r13d - addl %r15d,%r12d - - movl %r8d,%r15d - addl (%rbp),%r12d - xorl %r8d,%r14d - - xorl %r9d,%r15d - rorl $6,%r13d - movl %r9d,%edx - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%edx - addl %r12d,%r11d - addl %r12d,%edx - - leaq 4(%rbp),%rbp - movl 56(%rsp),%r13d - movl 44(%rsp),%edi - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%edx - movl %edi,%r14d - rorl $2,%edi - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%edi - shrl $10,%r14d - - rorl $17,%edi - xorl %r13d,%r12d - xorl %r14d,%edi - addl 24(%rsp),%r12d - - addl 52(%rsp),%r12d - movl %r11d,%r13d - addl %edi,%r12d - movl %edx,%r14d - rorl $14,%r13d - movl %eax,%edi - - xorl %r11d,%r13d - rorl $9,%r14d - xorl %ebx,%edi - - movl %r12d,52(%rsp) - xorl %edx,%r14d - andl %r11d,%edi - - rorl $5,%r13d - addl %ecx,%r12d - xorl %ebx,%edi - - rorl $11,%r14d - xorl %r11d,%r13d - addl %edi,%r12d - - movl %edx,%edi - addl (%rbp),%r12d - xorl %edx,%r14d - - xorl %r8d,%edi - rorl $6,%r13d - movl %r8d,%ecx - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%ecx - addl %r12d,%r10d - addl %r12d,%ecx - - leaq 4(%rbp),%rbp - movl 60(%rsp),%r13d - movl 48(%rsp),%r15d - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%ecx - movl %r15d,%r14d - rorl $2,%r15d - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl 
%r14d,%r15d - shrl $10,%r14d - - rorl $17,%r15d - xorl %r13d,%r12d - xorl %r14d,%r15d - addl 28(%rsp),%r12d - - addl 56(%rsp),%r12d - movl %r10d,%r13d - addl %r15d,%r12d - movl %ecx,%r14d - rorl $14,%r13d - movl %r11d,%r15d - - xorl %r10d,%r13d - rorl $9,%r14d - xorl %eax,%r15d - - movl %r12d,56(%rsp) - xorl %ecx,%r14d - andl %r10d,%r15d - - rorl $5,%r13d - addl %ebx,%r12d - xorl %eax,%r15d - - rorl $11,%r14d - xorl %r10d,%r13d - addl %r15d,%r12d - - movl %ecx,%r15d - addl (%rbp),%r12d - xorl %ecx,%r14d - - xorl %edx,%r15d - rorl $6,%r13d - movl %edx,%ebx - - andl %r15d,%edi - rorl $2,%r14d - addl %r13d,%r12d - - xorl %edi,%ebx - addl %r12d,%r9d - addl %r12d,%ebx - - leaq 4(%rbp),%rbp - movl 0(%rsp),%r13d - movl 52(%rsp),%edi - - movl %r13d,%r12d - rorl $11,%r13d - addl %r14d,%ebx - movl %edi,%r14d - rorl $2,%edi - - xorl %r12d,%r13d - shrl $3,%r12d - rorl $7,%r13d - xorl %r14d,%edi - shrl $10,%r14d - - rorl $17,%edi - xorl %r13d,%r12d - xorl %r14d,%edi - addl 32(%rsp),%r12d - - addl 60(%rsp),%r12d - movl %r9d,%r13d - addl %edi,%r12d - movl %ebx,%r14d - rorl $14,%r13d - movl %r10d,%edi - - xorl %r9d,%r13d - rorl $9,%r14d - xorl %r11d,%edi - - movl %r12d,60(%rsp) - xorl %ebx,%r14d - andl %r9d,%edi - - rorl $5,%r13d - addl %eax,%r12d - xorl %r11d,%edi - - rorl $11,%r14d - xorl %r9d,%r13d - addl %edi,%r12d - - movl %ebx,%edi - addl (%rbp),%r12d - xorl %ebx,%r14d - - xorl %ecx,%edi - rorl $6,%r13d - movl %ecx,%eax - - andl %edi,%r15d - rorl $2,%r14d - addl %r13d,%r12d - - xorl %r15d,%eax - addl %r12d,%r8d - addl %r12d,%eax - - leaq 20(%rbp),%rbp - cmpb $0,3(%rbp) - jnz .Lrounds_16_xx - - movq 64+0(%rsp),%rdi - addl %r14d,%eax - leaq 64(%rsi),%rsi - - addl 0(%rdi),%eax - addl 4(%rdi),%ebx - addl 8(%rdi),%ecx - addl 12(%rdi),%edx - addl 16(%rdi),%r8d - addl 20(%rdi),%r9d - addl 24(%rdi),%r10d - addl 28(%rdi),%r11d - - cmpq 64+16(%rsp),%rsi - - movl %eax,0(%rdi) - movl %ebx,4(%rdi) - movl %ecx,8(%rdi) - movl %edx,12(%rdi) - movl %r8d,16(%rdi) - movl %r9d,20(%rdi) - movl 
%r10d,24(%rdi) - movl %r11d,28(%rdi) - jb .Lloop - - movq 88(%rsp),%rsi -.cfi_def_cfa %rsi,8 - movq -48(%rsi),%r15 -.cfi_restore %r15 - movq -40(%rsi),%r14 -.cfi_restore %r14 - movq -32(%rsi),%r13 -.cfi_restore %r13 - movq -24(%rsi),%r12 -.cfi_restore %r12 - movq -16(%rsi),%rbp -.cfi_restore %rbp - movq -8(%rsi),%rbx -.cfi_restore %rbx - leaq (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Lepilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size sha256_block_data_order,.-sha256_block_data_order -.align 64 -.type K256,@object -K256: -.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 -.long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5 -.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 -.long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5 -.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 -.long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3 -.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 -.long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174 -.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc -.long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc -.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da -.long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da -.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 -.long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7 -.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 -.long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967 -.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 -.long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13 -.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 -.long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85 -.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 -.long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3 -.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 -.long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070 -.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 -.long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5 -.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 -.long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3 -.long 
0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 -.long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208 -.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 -.long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2 - -.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f -.long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f -.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff -.long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff -.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 -.long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908 -.byte 83,72,65,50,53,54,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.type sha256_block_data_order_shaext,@function -.align 64 -sha256_block_data_order_shaext: -.cfi_startproc -.Lshaext_shortcut: - leaq K256+128(%rip),%rcx - movdqu (%rdi),%xmm1 - movdqu 16(%rdi),%xmm2 - movdqa 512-128(%rcx),%xmm7 - - pshufd $0x1b,%xmm1,%xmm0 - pshufd $0xb1,%xmm1,%xmm1 - pshufd $0x1b,%xmm2,%xmm2 - movdqa %xmm7,%xmm8 -.byte 102,15,58,15,202,8 - punpcklqdq %xmm0,%xmm2 - jmp .Loop_shaext - -.align 16 -.Loop_shaext: - movdqu (%rsi),%xmm3 - movdqu 16(%rsi),%xmm4 - movdqu 32(%rsi),%xmm5 -.byte 102,15,56,0,223 - movdqu 48(%rsi),%xmm6 - - movdqa 0-128(%rcx),%xmm0 - paddd %xmm3,%xmm0 -.byte 102,15,56,0,231 - movdqa %xmm2,%xmm10 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - nop - movdqa %xmm1,%xmm9 -.byte 15,56,203,202 - - movdqa 32-128(%rcx),%xmm0 - paddd %xmm4,%xmm0 -.byte 102,15,56,0,239 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - leaq 64(%rsi),%rsi -.byte 15,56,204,220 -.byte 15,56,203,202 - - movdqa 64-128(%rcx),%xmm0 - paddd %xmm5,%xmm0 -.byte 102,15,56,0,247 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm6,%xmm7 -.byte 102,15,58,15,253,4 - nop - paddd %xmm7,%xmm3 -.byte 15,56,204,229 -.byte 15,56,203,202 - - movdqa 96-128(%rcx),%xmm0 - paddd %xmm6,%xmm0 -.byte 15,56,205,222 -.byte 15,56,203,209 
- pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm3,%xmm7 -.byte 102,15,58,15,254,4 - nop - paddd %xmm7,%xmm4 -.byte 15,56,204,238 -.byte 15,56,203,202 - movdqa 128-128(%rcx),%xmm0 - paddd %xmm3,%xmm0 -.byte 15,56,205,227 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm4,%xmm7 -.byte 102,15,58,15,251,4 - nop - paddd %xmm7,%xmm5 -.byte 15,56,204,243 -.byte 15,56,203,202 - movdqa 160-128(%rcx),%xmm0 - paddd %xmm4,%xmm0 -.byte 15,56,205,236 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm5,%xmm7 -.byte 102,15,58,15,252,4 - nop - paddd %xmm7,%xmm6 -.byte 15,56,204,220 -.byte 15,56,203,202 - movdqa 192-128(%rcx),%xmm0 - paddd %xmm5,%xmm0 -.byte 15,56,205,245 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm6,%xmm7 -.byte 102,15,58,15,253,4 - nop - paddd %xmm7,%xmm3 -.byte 15,56,204,229 -.byte 15,56,203,202 - movdqa 224-128(%rcx),%xmm0 - paddd %xmm6,%xmm0 -.byte 15,56,205,222 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm3,%xmm7 -.byte 102,15,58,15,254,4 - nop - paddd %xmm7,%xmm4 -.byte 15,56,204,238 -.byte 15,56,203,202 - movdqa 256-128(%rcx),%xmm0 - paddd %xmm3,%xmm0 -.byte 15,56,205,227 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm4,%xmm7 -.byte 102,15,58,15,251,4 - nop - paddd %xmm7,%xmm5 -.byte 15,56,204,243 -.byte 15,56,203,202 - movdqa 288-128(%rcx),%xmm0 - paddd %xmm4,%xmm0 -.byte 15,56,205,236 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm5,%xmm7 -.byte 102,15,58,15,252,4 - nop - paddd %xmm7,%xmm6 -.byte 15,56,204,220 -.byte 15,56,203,202 - movdqa 320-128(%rcx),%xmm0 - paddd %xmm5,%xmm0 -.byte 15,56,205,245 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm6,%xmm7 -.byte 102,15,58,15,253,4 - nop - paddd %xmm7,%xmm3 -.byte 15,56,204,229 -.byte 15,56,203,202 - movdqa 352-128(%rcx),%xmm0 - paddd %xmm6,%xmm0 -.byte 15,56,205,222 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm3,%xmm7 -.byte 102,15,58,15,254,4 - nop - paddd %xmm7,%xmm4 -.byte 15,56,204,238 -.byte 
15,56,203,202 - movdqa 384-128(%rcx),%xmm0 - paddd %xmm3,%xmm0 -.byte 15,56,205,227 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm4,%xmm7 -.byte 102,15,58,15,251,4 - nop - paddd %xmm7,%xmm5 -.byte 15,56,204,243 -.byte 15,56,203,202 - movdqa 416-128(%rcx),%xmm0 - paddd %xmm4,%xmm0 -.byte 15,56,205,236 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - movdqa %xmm5,%xmm7 -.byte 102,15,58,15,252,4 -.byte 15,56,203,202 - paddd %xmm7,%xmm6 - - movdqa 448-128(%rcx),%xmm0 - paddd %xmm5,%xmm0 -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 -.byte 15,56,205,245 - movdqa %xmm8,%xmm7 -.byte 15,56,203,202 - - movdqa 480-128(%rcx),%xmm0 - paddd %xmm6,%xmm0 - nop -.byte 15,56,203,209 - pshufd $0x0e,%xmm0,%xmm0 - decq %rdx - nop -.byte 15,56,203,202 - - paddd %xmm10,%xmm2 - paddd %xmm9,%xmm1 - jnz .Loop_shaext - - pshufd $0xb1,%xmm2,%xmm2 - pshufd $0x1b,%xmm1,%xmm7 - pshufd $0xb1,%xmm1,%xmm1 - punpckhqdq %xmm2,%xmm1 -.byte 102,15,58,15,215,8 - - movdqu %xmm1,(%rdi) - movdqu %xmm2,16(%rdi) - .byte 0xf3,0xc3 -.cfi_endproc -.size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext -.type sha256_block_data_order_ssse3,@function -.align 64 -sha256_block_data_order_ssse3: -.cfi_startproc -.Lssse3_shortcut: - movq %rsp,%rax -.cfi_def_cfa_register %rax - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 - shlq $4,%rdx - subq $96,%rsp - leaq (%rsi,%rdx,4),%rdx - andq $-64,%rsp - movq %rdi,64+0(%rsp) - movq %rsi,64+8(%rsp) - movq %rdx,64+16(%rsp) - movq %rax,88(%rsp) -.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 -.Lprologue_ssse3: - - movl 0(%rdi),%eax - movl 4(%rdi),%ebx - movl 8(%rdi),%ecx - movl 12(%rdi),%edx - movl 16(%rdi),%r8d - movl 20(%rdi),%r9d - movl 24(%rdi),%r10d - movl 28(%rdi),%r11d - - - jmp .Lloop_ssse3 -.align 16 -.Lloop_ssse3: - movdqa K256+512(%rip),%xmm7 - movdqu 0(%rsi),%xmm0 - 
movdqu 16(%rsi),%xmm1 - movdqu 32(%rsi),%xmm2 -.byte 102,15,56,0,199 - movdqu 48(%rsi),%xmm3 - leaq K256(%rip),%rbp -.byte 102,15,56,0,207 - movdqa 0(%rbp),%xmm4 - movdqa 32(%rbp),%xmm5 -.byte 102,15,56,0,215 - paddd %xmm0,%xmm4 - movdqa 64(%rbp),%xmm6 -.byte 102,15,56,0,223 - movdqa 96(%rbp),%xmm7 - paddd %xmm1,%xmm5 - paddd %xmm2,%xmm6 - paddd %xmm3,%xmm7 - movdqa %xmm4,0(%rsp) - movl %eax,%r14d - movdqa %xmm5,16(%rsp) - movl %ebx,%edi - movdqa %xmm6,32(%rsp) - xorl %ecx,%edi - movdqa %xmm7,48(%rsp) - movl %r8d,%r13d - jmp .Lssse3_00_47 - -.align 16 -.Lssse3_00_47: - subq $-128,%rbp - rorl $14,%r13d - movdqa %xmm1,%xmm4 - movl %r14d,%eax - movl %r9d,%r12d - movdqa %xmm3,%xmm7 - rorl $9,%r14d - xorl %r8d,%r13d - xorl %r10d,%r12d - rorl $5,%r13d - xorl %eax,%r14d -.byte 102,15,58,15,224,4 - andl %r8d,%r12d - xorl %r8d,%r13d -.byte 102,15,58,15,250,4 - addl 0(%rsp),%r11d - movl %eax,%r15d - xorl %r10d,%r12d - rorl $11,%r14d - movdqa %xmm4,%xmm5 - xorl %ebx,%r15d - addl %r12d,%r11d - movdqa %xmm4,%xmm6 - rorl $6,%r13d - andl %r15d,%edi - psrld $3,%xmm4 - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - paddd %xmm7,%xmm0 - rorl $2,%r14d - addl %r11d,%edx - psrld $7,%xmm6 - addl %edi,%r11d - movl %edx,%r13d - pshufd $250,%xmm3,%xmm7 - addl %r11d,%r14d - rorl $14,%r13d - pslld $14,%xmm5 - movl %r14d,%r11d - movl %r8d,%r12d - pxor %xmm6,%xmm4 - rorl $9,%r14d - xorl %edx,%r13d - xorl %r9d,%r12d - rorl $5,%r13d - psrld $11,%xmm6 - xorl %r11d,%r14d - pxor %xmm5,%xmm4 - andl %edx,%r12d - xorl %edx,%r13d - pslld $11,%xmm5 - addl 4(%rsp),%r10d - movl %r11d,%edi - pxor %xmm6,%xmm4 - xorl %r9d,%r12d - rorl $11,%r14d - movdqa %xmm7,%xmm6 - xorl %eax,%edi - addl %r12d,%r10d - pxor %xmm5,%xmm4 - rorl $6,%r13d - andl %edi,%r15d - xorl %r11d,%r14d - psrld $10,%xmm7 - addl %r13d,%r10d - xorl %eax,%r15d - paddd %xmm4,%xmm0 - rorl $2,%r14d - addl %r10d,%ecx - psrlq $17,%xmm6 - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - pxor %xmm6,%xmm7 - rorl $14,%r13d - movl 
%r14d,%r10d - movl %edx,%r12d - rorl $9,%r14d - psrlq $2,%xmm6 - xorl %ecx,%r13d - xorl %r8d,%r12d - pxor %xmm6,%xmm7 - rorl $5,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - pshufd $128,%xmm7,%xmm7 - xorl %ecx,%r13d - addl 8(%rsp),%r9d - movl %r10d,%r15d - psrldq $8,%xmm7 - xorl %r8d,%r12d - rorl $11,%r14d - xorl %r11d,%r15d - addl %r12d,%r9d - rorl $6,%r13d - paddd %xmm7,%xmm0 - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - pshufd $80,%xmm0,%xmm7 - xorl %r11d,%edi - rorl $2,%r14d - addl %r9d,%ebx - movdqa %xmm7,%xmm6 - addl %edi,%r9d - movl %ebx,%r13d - psrld $10,%xmm7 - addl %r9d,%r14d - rorl $14,%r13d - psrlq $17,%xmm6 - movl %r14d,%r9d - movl %ecx,%r12d - pxor %xmm6,%xmm7 - rorl $9,%r14d - xorl %ebx,%r13d - xorl %edx,%r12d - rorl $5,%r13d - xorl %r9d,%r14d - psrlq $2,%xmm6 - andl %ebx,%r12d - xorl %ebx,%r13d - addl 12(%rsp),%r8d - pxor %xmm6,%xmm7 - movl %r9d,%edi - xorl %edx,%r12d - rorl $11,%r14d - pshufd $8,%xmm7,%xmm7 - xorl %r10d,%edi - addl %r12d,%r8d - movdqa 0(%rbp),%xmm6 - rorl $6,%r13d - andl %edi,%r15d - pslldq $8,%xmm7 - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - paddd %xmm7,%xmm0 - rorl $2,%r14d - addl %r8d,%eax - addl %r15d,%r8d - paddd %xmm0,%xmm6 - movl %eax,%r13d - addl %r8d,%r14d - movdqa %xmm6,0(%rsp) - rorl $14,%r13d - movdqa %xmm2,%xmm4 - movl %r14d,%r8d - movl %ebx,%r12d - movdqa %xmm0,%xmm7 - rorl $9,%r14d - xorl %eax,%r13d - xorl %ecx,%r12d - rorl $5,%r13d - xorl %r8d,%r14d -.byte 102,15,58,15,225,4 - andl %eax,%r12d - xorl %eax,%r13d -.byte 102,15,58,15,251,4 - addl 16(%rsp),%edx - movl %r8d,%r15d - xorl %ecx,%r12d - rorl $11,%r14d - movdqa %xmm4,%xmm5 - xorl %r9d,%r15d - addl %r12d,%edx - movdqa %xmm4,%xmm6 - rorl $6,%r13d - andl %r15d,%edi - psrld $3,%xmm4 - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - paddd %xmm7,%xmm1 - rorl $2,%r14d - addl %edx,%r11d - psrld $7,%xmm6 - addl %edi,%edx - movl %r11d,%r13d - pshufd $250,%xmm0,%xmm7 - addl %edx,%r14d - rorl $14,%r13d - pslld $14,%xmm5 - movl %r14d,%edx - movl 
%eax,%r12d - pxor %xmm6,%xmm4 - rorl $9,%r14d - xorl %r11d,%r13d - xorl %ebx,%r12d - rorl $5,%r13d - psrld $11,%xmm6 - xorl %edx,%r14d - pxor %xmm5,%xmm4 - andl %r11d,%r12d - xorl %r11d,%r13d - pslld $11,%xmm5 - addl 20(%rsp),%ecx - movl %edx,%edi - pxor %xmm6,%xmm4 - xorl %ebx,%r12d - rorl $11,%r14d - movdqa %xmm7,%xmm6 - xorl %r8d,%edi - addl %r12d,%ecx - pxor %xmm5,%xmm4 - rorl $6,%r13d - andl %edi,%r15d - xorl %edx,%r14d - psrld $10,%xmm7 - addl %r13d,%ecx - xorl %r8d,%r15d - paddd %xmm4,%xmm1 - rorl $2,%r14d - addl %ecx,%r10d - psrlq $17,%xmm6 - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - pxor %xmm6,%xmm7 - rorl $14,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - rorl $9,%r14d - psrlq $2,%xmm6 - xorl %r10d,%r13d - xorl %eax,%r12d - pxor %xmm6,%xmm7 - rorl $5,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - pshufd $128,%xmm7,%xmm7 - xorl %r10d,%r13d - addl 24(%rsp),%ebx - movl %ecx,%r15d - psrldq $8,%xmm7 - xorl %eax,%r12d - rorl $11,%r14d - xorl %edx,%r15d - addl %r12d,%ebx - rorl $6,%r13d - paddd %xmm7,%xmm1 - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - pshufd $80,%xmm1,%xmm7 - xorl %edx,%edi - rorl $2,%r14d - addl %ebx,%r9d - movdqa %xmm7,%xmm6 - addl %edi,%ebx - movl %r9d,%r13d - psrld $10,%xmm7 - addl %ebx,%r14d - rorl $14,%r13d - psrlq $17,%xmm6 - movl %r14d,%ebx - movl %r10d,%r12d - pxor %xmm6,%xmm7 - rorl $9,%r14d - xorl %r9d,%r13d - xorl %r11d,%r12d - rorl $5,%r13d - xorl %ebx,%r14d - psrlq $2,%xmm6 - andl %r9d,%r12d - xorl %r9d,%r13d - addl 28(%rsp),%eax - pxor %xmm6,%xmm7 - movl %ebx,%edi - xorl %r11d,%r12d - rorl $11,%r14d - pshufd $8,%xmm7,%xmm7 - xorl %ecx,%edi - addl %r12d,%eax - movdqa 32(%rbp),%xmm6 - rorl $6,%r13d - andl %edi,%r15d - pslldq $8,%xmm7 - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - paddd %xmm7,%xmm1 - rorl $2,%r14d - addl %eax,%r8d - addl %r15d,%eax - paddd %xmm1,%xmm6 - movl %r8d,%r13d - addl %eax,%r14d - movdqa %xmm6,16(%rsp) - rorl $14,%r13d - movdqa %xmm3,%xmm4 - movl %r14d,%eax - movl %r9d,%r12d - 
movdqa %xmm1,%xmm7 - rorl $9,%r14d - xorl %r8d,%r13d - xorl %r10d,%r12d - rorl $5,%r13d - xorl %eax,%r14d -.byte 102,15,58,15,226,4 - andl %r8d,%r12d - xorl %r8d,%r13d -.byte 102,15,58,15,248,4 - addl 32(%rsp),%r11d - movl %eax,%r15d - xorl %r10d,%r12d - rorl $11,%r14d - movdqa %xmm4,%xmm5 - xorl %ebx,%r15d - addl %r12d,%r11d - movdqa %xmm4,%xmm6 - rorl $6,%r13d - andl %r15d,%edi - psrld $3,%xmm4 - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - paddd %xmm7,%xmm2 - rorl $2,%r14d - addl %r11d,%edx - psrld $7,%xmm6 - addl %edi,%r11d - movl %edx,%r13d - pshufd $250,%xmm1,%xmm7 - addl %r11d,%r14d - rorl $14,%r13d - pslld $14,%xmm5 - movl %r14d,%r11d - movl %r8d,%r12d - pxor %xmm6,%xmm4 - rorl $9,%r14d - xorl %edx,%r13d - xorl %r9d,%r12d - rorl $5,%r13d - psrld $11,%xmm6 - xorl %r11d,%r14d - pxor %xmm5,%xmm4 - andl %edx,%r12d - xorl %edx,%r13d - pslld $11,%xmm5 - addl 36(%rsp),%r10d - movl %r11d,%edi - pxor %xmm6,%xmm4 - xorl %r9d,%r12d - rorl $11,%r14d - movdqa %xmm7,%xmm6 - xorl %eax,%edi - addl %r12d,%r10d - pxor %xmm5,%xmm4 - rorl $6,%r13d - andl %edi,%r15d - xorl %r11d,%r14d - psrld $10,%xmm7 - addl %r13d,%r10d - xorl %eax,%r15d - paddd %xmm4,%xmm2 - rorl $2,%r14d - addl %r10d,%ecx - psrlq $17,%xmm6 - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - pxor %xmm6,%xmm7 - rorl $14,%r13d - movl %r14d,%r10d - movl %edx,%r12d - rorl $9,%r14d - psrlq $2,%xmm6 - xorl %ecx,%r13d - xorl %r8d,%r12d - pxor %xmm6,%xmm7 - rorl $5,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - pshufd $128,%xmm7,%xmm7 - xorl %ecx,%r13d - addl 40(%rsp),%r9d - movl %r10d,%r15d - psrldq $8,%xmm7 - xorl %r8d,%r12d - rorl $11,%r14d - xorl %r11d,%r15d - addl %r12d,%r9d - rorl $6,%r13d - paddd %xmm7,%xmm2 - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - pshufd $80,%xmm2,%xmm7 - xorl %r11d,%edi - rorl $2,%r14d - addl %r9d,%ebx - movdqa %xmm7,%xmm6 - addl %edi,%r9d - movl %ebx,%r13d - psrld $10,%xmm7 - addl %r9d,%r14d - rorl $14,%r13d - psrlq $17,%xmm6 - movl %r14d,%r9d - movl %ecx,%r12d 
- pxor %xmm6,%xmm7 - rorl $9,%r14d - xorl %ebx,%r13d - xorl %edx,%r12d - rorl $5,%r13d - xorl %r9d,%r14d - psrlq $2,%xmm6 - andl %ebx,%r12d - xorl %ebx,%r13d - addl 44(%rsp),%r8d - pxor %xmm6,%xmm7 - movl %r9d,%edi - xorl %edx,%r12d - rorl $11,%r14d - pshufd $8,%xmm7,%xmm7 - xorl %r10d,%edi - addl %r12d,%r8d - movdqa 64(%rbp),%xmm6 - rorl $6,%r13d - andl %edi,%r15d - pslldq $8,%xmm7 - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - paddd %xmm7,%xmm2 - rorl $2,%r14d - addl %r8d,%eax - addl %r15d,%r8d - paddd %xmm2,%xmm6 - movl %eax,%r13d - addl %r8d,%r14d - movdqa %xmm6,32(%rsp) - rorl $14,%r13d - movdqa %xmm0,%xmm4 - movl %r14d,%r8d - movl %ebx,%r12d - movdqa %xmm2,%xmm7 - rorl $9,%r14d - xorl %eax,%r13d - xorl %ecx,%r12d - rorl $5,%r13d - xorl %r8d,%r14d -.byte 102,15,58,15,227,4 - andl %eax,%r12d - xorl %eax,%r13d -.byte 102,15,58,15,249,4 - addl 48(%rsp),%edx - movl %r8d,%r15d - xorl %ecx,%r12d - rorl $11,%r14d - movdqa %xmm4,%xmm5 - xorl %r9d,%r15d - addl %r12d,%edx - movdqa %xmm4,%xmm6 - rorl $6,%r13d - andl %r15d,%edi - psrld $3,%xmm4 - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - paddd %xmm7,%xmm3 - rorl $2,%r14d - addl %edx,%r11d - psrld $7,%xmm6 - addl %edi,%edx - movl %r11d,%r13d - pshufd $250,%xmm2,%xmm7 - addl %edx,%r14d - rorl $14,%r13d - pslld $14,%xmm5 - movl %r14d,%edx - movl %eax,%r12d - pxor %xmm6,%xmm4 - rorl $9,%r14d - xorl %r11d,%r13d - xorl %ebx,%r12d - rorl $5,%r13d - psrld $11,%xmm6 - xorl %edx,%r14d - pxor %xmm5,%xmm4 - andl %r11d,%r12d - xorl %r11d,%r13d - pslld $11,%xmm5 - addl 52(%rsp),%ecx - movl %edx,%edi - pxor %xmm6,%xmm4 - xorl %ebx,%r12d - rorl $11,%r14d - movdqa %xmm7,%xmm6 - xorl %r8d,%edi - addl %r12d,%ecx - pxor %xmm5,%xmm4 - rorl $6,%r13d - andl %edi,%r15d - xorl %edx,%r14d - psrld $10,%xmm7 - addl %r13d,%ecx - xorl %r8d,%r15d - paddd %xmm4,%xmm3 - rorl $2,%r14d - addl %ecx,%r10d - psrlq $17,%xmm6 - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - pxor %xmm6,%xmm7 - rorl $14,%r13d - movl %r14d,%ecx - movl 
%r11d,%r12d - rorl $9,%r14d - psrlq $2,%xmm6 - xorl %r10d,%r13d - xorl %eax,%r12d - pxor %xmm6,%xmm7 - rorl $5,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - pshufd $128,%xmm7,%xmm7 - xorl %r10d,%r13d - addl 56(%rsp),%ebx - movl %ecx,%r15d - psrldq $8,%xmm7 - xorl %eax,%r12d - rorl $11,%r14d - xorl %edx,%r15d - addl %r12d,%ebx - rorl $6,%r13d - paddd %xmm7,%xmm3 - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - pshufd $80,%xmm3,%xmm7 - xorl %edx,%edi - rorl $2,%r14d - addl %ebx,%r9d - movdqa %xmm7,%xmm6 - addl %edi,%ebx - movl %r9d,%r13d - psrld $10,%xmm7 - addl %ebx,%r14d - rorl $14,%r13d - psrlq $17,%xmm6 - movl %r14d,%ebx - movl %r10d,%r12d - pxor %xmm6,%xmm7 - rorl $9,%r14d - xorl %r9d,%r13d - xorl %r11d,%r12d - rorl $5,%r13d - xorl %ebx,%r14d - psrlq $2,%xmm6 - andl %r9d,%r12d - xorl %r9d,%r13d - addl 60(%rsp),%eax - pxor %xmm6,%xmm7 - movl %ebx,%edi - xorl %r11d,%r12d - rorl $11,%r14d - pshufd $8,%xmm7,%xmm7 - xorl %ecx,%edi - addl %r12d,%eax - movdqa 96(%rbp),%xmm6 - rorl $6,%r13d - andl %edi,%r15d - pslldq $8,%xmm7 - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - paddd %xmm7,%xmm3 - rorl $2,%r14d - addl %eax,%r8d - addl %r15d,%eax - paddd %xmm3,%xmm6 - movl %r8d,%r13d - addl %eax,%r14d - movdqa %xmm6,48(%rsp) - cmpb $0,131(%rbp) - jne .Lssse3_00_47 - rorl $14,%r13d - movl %r14d,%eax - movl %r9d,%r12d - rorl $9,%r14d - xorl %r8d,%r13d - xorl %r10d,%r12d - rorl $5,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - xorl %r8d,%r13d - addl 0(%rsp),%r11d - movl %eax,%r15d - xorl %r10d,%r12d - rorl $11,%r14d - xorl %ebx,%r15d - addl %r12d,%r11d - rorl $6,%r13d - andl %r15d,%edi - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - rorl $2,%r14d - addl %r11d,%edx - addl %edi,%r11d - movl %edx,%r13d - addl %r11d,%r14d - rorl $14,%r13d - movl %r14d,%r11d - movl %r8d,%r12d - rorl $9,%r14d - xorl %edx,%r13d - xorl %r9d,%r12d - rorl $5,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - xorl %edx,%r13d - addl 4(%rsp),%r10d - movl %r11d,%edi - xorl %r9d,%r12d - rorl 
$11,%r14d - xorl %eax,%edi - addl %r12d,%r10d - rorl $6,%r13d - andl %edi,%r15d - xorl %r11d,%r14d - addl %r13d,%r10d - xorl %eax,%r15d - rorl $2,%r14d - addl %r10d,%ecx - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - rorl $14,%r13d - movl %r14d,%r10d - movl %edx,%r12d - rorl $9,%r14d - xorl %ecx,%r13d - xorl %r8d,%r12d - rorl $5,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - xorl %ecx,%r13d - addl 8(%rsp),%r9d - movl %r10d,%r15d - xorl %r8d,%r12d - rorl $11,%r14d - xorl %r11d,%r15d - addl %r12d,%r9d - rorl $6,%r13d - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%edi - rorl $2,%r14d - addl %r9d,%ebx - addl %edi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - rorl $14,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - rorl $9,%r14d - xorl %ebx,%r13d - xorl %edx,%r12d - rorl $5,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - xorl %ebx,%r13d - addl 12(%rsp),%r8d - movl %r9d,%edi - xorl %edx,%r12d - rorl $11,%r14d - xorl %r10d,%edi - addl %r12d,%r8d - rorl $6,%r13d - andl %edi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - rorl $2,%r14d - addl %r8d,%eax - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - rorl $14,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - rorl $9,%r14d - xorl %eax,%r13d - xorl %ecx,%r12d - rorl $5,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - xorl %eax,%r13d - addl 16(%rsp),%edx - movl %r8d,%r15d - xorl %ecx,%r12d - rorl $11,%r14d - xorl %r9d,%r15d - addl %r12d,%edx - rorl $6,%r13d - andl %r15d,%edi - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - rorl $2,%r14d - addl %edx,%r11d - addl %edi,%edx - movl %r11d,%r13d - addl %edx,%r14d - rorl $14,%r13d - movl %r14d,%edx - movl %eax,%r12d - rorl $9,%r14d - xorl %r11d,%r13d - xorl %ebx,%r12d - rorl $5,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - xorl %r11d,%r13d - addl 20(%rsp),%ecx - movl %edx,%edi - xorl %ebx,%r12d - rorl $11,%r14d - xorl %r8d,%edi - addl %r12d,%ecx - rorl $6,%r13d - andl %edi,%r15d - xorl %edx,%r14d - addl %r13d,%ecx - xorl %r8d,%r15d - rorl $2,%r14d 
- addl %ecx,%r10d - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - rorl $14,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - rorl $9,%r14d - xorl %r10d,%r13d - xorl %eax,%r12d - rorl $5,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - xorl %r10d,%r13d - addl 24(%rsp),%ebx - movl %ecx,%r15d - xorl %eax,%r12d - rorl $11,%r14d - xorl %edx,%r15d - addl %r12d,%ebx - rorl $6,%r13d - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%edi - rorl $2,%r14d - addl %ebx,%r9d - addl %edi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - rorl $14,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - rorl $9,%r14d - xorl %r9d,%r13d - xorl %r11d,%r12d - rorl $5,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - xorl %r9d,%r13d - addl 28(%rsp),%eax - movl %ebx,%edi - xorl %r11d,%r12d - rorl $11,%r14d - xorl %ecx,%edi - addl %r12d,%eax - rorl $6,%r13d - andl %edi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - rorl $2,%r14d - addl %eax,%r8d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - rorl $14,%r13d - movl %r14d,%eax - movl %r9d,%r12d - rorl $9,%r14d - xorl %r8d,%r13d - xorl %r10d,%r12d - rorl $5,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - xorl %r8d,%r13d - addl 32(%rsp),%r11d - movl %eax,%r15d - xorl %r10d,%r12d - rorl $11,%r14d - xorl %ebx,%r15d - addl %r12d,%r11d - rorl $6,%r13d - andl %r15d,%edi - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - rorl $2,%r14d - addl %r11d,%edx - addl %edi,%r11d - movl %edx,%r13d - addl %r11d,%r14d - rorl $14,%r13d - movl %r14d,%r11d - movl %r8d,%r12d - rorl $9,%r14d - xorl %edx,%r13d - xorl %r9d,%r12d - rorl $5,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - xorl %edx,%r13d - addl 36(%rsp),%r10d - movl %r11d,%edi - xorl %r9d,%r12d - rorl $11,%r14d - xorl %eax,%edi - addl %r12d,%r10d - rorl $6,%r13d - andl %edi,%r15d - xorl %r11d,%r14d - addl %r13d,%r10d - xorl %eax,%r15d - rorl $2,%r14d - addl %r10d,%ecx - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - rorl $14,%r13d - movl %r14d,%r10d - movl %edx,%r12d - rorl $9,%r14d 
- xorl %ecx,%r13d - xorl %r8d,%r12d - rorl $5,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - xorl %ecx,%r13d - addl 40(%rsp),%r9d - movl %r10d,%r15d - xorl %r8d,%r12d - rorl $11,%r14d - xorl %r11d,%r15d - addl %r12d,%r9d - rorl $6,%r13d - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%edi - rorl $2,%r14d - addl %r9d,%ebx - addl %edi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - rorl $14,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - rorl $9,%r14d - xorl %ebx,%r13d - xorl %edx,%r12d - rorl $5,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - xorl %ebx,%r13d - addl 44(%rsp),%r8d - movl %r9d,%edi - xorl %edx,%r12d - rorl $11,%r14d - xorl %r10d,%edi - addl %r12d,%r8d - rorl $6,%r13d - andl %edi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - rorl $2,%r14d - addl %r8d,%eax - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - rorl $14,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - rorl $9,%r14d - xorl %eax,%r13d - xorl %ecx,%r12d - rorl $5,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - xorl %eax,%r13d - addl 48(%rsp),%edx - movl %r8d,%r15d - xorl %ecx,%r12d - rorl $11,%r14d - xorl %r9d,%r15d - addl %r12d,%edx - rorl $6,%r13d - andl %r15d,%edi - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - rorl $2,%r14d - addl %edx,%r11d - addl %edi,%edx - movl %r11d,%r13d - addl %edx,%r14d - rorl $14,%r13d - movl %r14d,%edx - movl %eax,%r12d - rorl $9,%r14d - xorl %r11d,%r13d - xorl %ebx,%r12d - rorl $5,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - xorl %r11d,%r13d - addl 52(%rsp),%ecx - movl %edx,%edi - xorl %ebx,%r12d - rorl $11,%r14d - xorl %r8d,%edi - addl %r12d,%ecx - rorl $6,%r13d - andl %edi,%r15d - xorl %edx,%r14d - addl %r13d,%ecx - xorl %r8d,%r15d - rorl $2,%r14d - addl %ecx,%r10d - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - rorl $14,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - rorl $9,%r14d - xorl %r10d,%r13d - xorl %eax,%r12d - rorl $5,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - xorl %r10d,%r13d - addl 56(%rsp),%ebx - movl %ecx,%r15d - 
xorl %eax,%r12d - rorl $11,%r14d - xorl %edx,%r15d - addl %r12d,%ebx - rorl $6,%r13d - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%edi - rorl $2,%r14d - addl %ebx,%r9d - addl %edi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - rorl $14,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - rorl $9,%r14d - xorl %r9d,%r13d - xorl %r11d,%r12d - rorl $5,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - xorl %r9d,%r13d - addl 60(%rsp),%eax - movl %ebx,%edi - xorl %r11d,%r12d - rorl $11,%r14d - xorl %ecx,%edi - addl %r12d,%eax - rorl $6,%r13d - andl %edi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - rorl $2,%r14d - addl %eax,%r8d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - movq 64+0(%rsp),%rdi - movl %r14d,%eax - - addl 0(%rdi),%eax - leaq 64(%rsi),%rsi - addl 4(%rdi),%ebx - addl 8(%rdi),%ecx - addl 12(%rdi),%edx - addl 16(%rdi),%r8d - addl 20(%rdi),%r9d - addl 24(%rdi),%r10d - addl 28(%rdi),%r11d - - cmpq 64+16(%rsp),%rsi - - movl %eax,0(%rdi) - movl %ebx,4(%rdi) - movl %ecx,8(%rdi) - movl %edx,12(%rdi) - movl %r8d,16(%rdi) - movl %r9d,20(%rdi) - movl %r10d,24(%rdi) - movl %r11d,28(%rdi) - jb .Lloop_ssse3 - - movq 88(%rsp),%rsi -.cfi_def_cfa %rsi,8 - movq -48(%rsi),%r15 -.cfi_restore %r15 - movq -40(%rsi),%r14 -.cfi_restore %r14 - movq -32(%rsi),%r13 -.cfi_restore %r13 - movq -24(%rsi),%r12 -.cfi_restore %r12 - movq -16(%rsi),%rbp -.cfi_restore %rbp - movq -8(%rsi),%rbx -.cfi_restore %rbx - leaq (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Lepilogue_ssse3: - .byte 0xf3,0xc3 -.cfi_endproc -.size sha256_block_data_order_ssse3,.-sha256_block_data_order_ssse3 -.type sha256_block_data_order_avx,@function -.align 64 -sha256_block_data_order_avx: -.cfi_startproc -.Lavx_shortcut: - movq %rsp,%rax -.cfi_def_cfa_register %rax - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 - shlq $4,%rdx - subq 
$96,%rsp - leaq (%rsi,%rdx,4),%rdx - andq $-64,%rsp - movq %rdi,64+0(%rsp) - movq %rsi,64+8(%rsp) - movq %rdx,64+16(%rsp) - movq %rax,88(%rsp) -.cfi_escape 0x0f,0x06,0x77,0xd8,0x00,0x06,0x23,0x08 -.Lprologue_avx: - - vzeroupper - movl 0(%rdi),%eax - movl 4(%rdi),%ebx - movl 8(%rdi),%ecx - movl 12(%rdi),%edx - movl 16(%rdi),%r8d - movl 20(%rdi),%r9d - movl 24(%rdi),%r10d - movl 28(%rdi),%r11d - vmovdqa K256+512+32(%rip),%xmm8 - vmovdqa K256+512+64(%rip),%xmm9 - jmp .Lloop_avx -.align 16 -.Lloop_avx: - vmovdqa K256+512(%rip),%xmm7 - vmovdqu 0(%rsi),%xmm0 - vmovdqu 16(%rsi),%xmm1 - vmovdqu 32(%rsi),%xmm2 - vmovdqu 48(%rsi),%xmm3 - vpshufb %xmm7,%xmm0,%xmm0 - leaq K256(%rip),%rbp - vpshufb %xmm7,%xmm1,%xmm1 - vpshufb %xmm7,%xmm2,%xmm2 - vpaddd 0(%rbp),%xmm0,%xmm4 - vpshufb %xmm7,%xmm3,%xmm3 - vpaddd 32(%rbp),%xmm1,%xmm5 - vpaddd 64(%rbp),%xmm2,%xmm6 - vpaddd 96(%rbp),%xmm3,%xmm7 - vmovdqa %xmm4,0(%rsp) - movl %eax,%r14d - vmovdqa %xmm5,16(%rsp) - movl %ebx,%edi - vmovdqa %xmm6,32(%rsp) - xorl %ecx,%edi - vmovdqa %xmm7,48(%rsp) - movl %r8d,%r13d - jmp .Lavx_00_47 - -.align 16 -.Lavx_00_47: - subq $-128,%rbp - vpalignr $4,%xmm0,%xmm1,%xmm4 - shrdl $14,%r13d,%r13d - movl %r14d,%eax - movl %r9d,%r12d - vpalignr $4,%xmm2,%xmm3,%xmm7 - shrdl $9,%r14d,%r14d - xorl %r8d,%r13d - xorl %r10d,%r12d - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%r13d,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - vpaddd %xmm7,%xmm0,%xmm0 - xorl %r8d,%r13d - addl 0(%rsp),%r11d - movl %eax,%r15d - vpsrld $3,%xmm4,%xmm7 - xorl %r10d,%r12d - shrdl $11,%r14d,%r14d - xorl %ebx,%r15d - vpslld $14,%xmm4,%xmm5 - addl %r12d,%r11d - shrdl $6,%r13d,%r13d - andl %r15d,%edi - vpxor %xmm6,%xmm7,%xmm4 - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - vpshufd $250,%xmm3,%xmm7 - shrdl $2,%r14d,%r14d - addl %r11d,%edx - addl %edi,%r11d - vpsrld $11,%xmm6,%xmm6 - movl %edx,%r13d - addl %r11d,%r14d - shrdl $14,%r13d,%r13d - vpxor %xmm5,%xmm4,%xmm4 - movl %r14d,%r11d - movl %r8d,%r12d - shrdl $9,%r14d,%r14d - vpslld 
$11,%xmm5,%xmm5 - xorl %edx,%r13d - xorl %r9d,%r12d - shrdl $5,%r13d,%r13d - vpxor %xmm6,%xmm4,%xmm4 - xorl %r11d,%r14d - andl %edx,%r12d - xorl %edx,%r13d - vpsrld $10,%xmm7,%xmm6 - addl 4(%rsp),%r10d - movl %r11d,%edi - xorl %r9d,%r12d - vpxor %xmm5,%xmm4,%xmm4 - shrdl $11,%r14d,%r14d - xorl %eax,%edi - addl %r12d,%r10d - vpsrlq $17,%xmm7,%xmm7 - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %r11d,%r14d - vpaddd %xmm4,%xmm0,%xmm0 - addl %r13d,%r10d - xorl %eax,%r15d - shrdl $2,%r14d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - addl %r10d,%ecx - addl %r15d,%r10d - movl %ecx,%r13d - vpsrlq $2,%xmm7,%xmm7 - addl %r10d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r10d - vpxor %xmm7,%xmm6,%xmm6 - movl %edx,%r12d - shrdl $9,%r14d,%r14d - xorl %ecx,%r13d - vpshufb %xmm8,%xmm6,%xmm6 - xorl %r8d,%r12d - shrdl $5,%r13d,%r13d - xorl %r10d,%r14d - vpaddd %xmm6,%xmm0,%xmm0 - andl %ecx,%r12d - xorl %ecx,%r13d - addl 8(%rsp),%r9d - vpshufd $80,%xmm0,%xmm7 - movl %r10d,%r15d - xorl %r8d,%r12d - shrdl $11,%r14d,%r14d - vpsrld $10,%xmm7,%xmm6 - xorl %r11d,%r15d - addl %r12d,%r9d - shrdl $6,%r13d,%r13d - vpsrlq $17,%xmm7,%xmm7 - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - vpxor %xmm7,%xmm6,%xmm6 - xorl %r11d,%edi - shrdl $2,%r14d,%r14d - addl %r9d,%ebx - vpsrlq $2,%xmm7,%xmm7 - addl %edi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - shrdl $14,%r13d,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - vpshufb %xmm9,%xmm6,%xmm6 - shrdl $9,%r14d,%r14d - xorl %ebx,%r13d - xorl %edx,%r12d - vpaddd %xmm6,%xmm0,%xmm0 - shrdl $5,%r13d,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - vpaddd 0(%rbp),%xmm0,%xmm6 - xorl %ebx,%r13d - addl 12(%rsp),%r8d - movl %r9d,%edi - xorl %edx,%r12d - shrdl $11,%r14d,%r14d - xorl %r10d,%edi - addl %r12d,%r8d - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - shrdl $2,%r14d,%r14d - addl %r8d,%eax - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - vmovdqa %xmm6,0(%rsp) - vpalignr $4,%xmm1,%xmm2,%xmm4 
- shrdl $14,%r13d,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - vpalignr $4,%xmm3,%xmm0,%xmm7 - shrdl $9,%r14d,%r14d - xorl %eax,%r13d - xorl %ecx,%r12d - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%r13d,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - vpaddd %xmm7,%xmm1,%xmm1 - xorl %eax,%r13d - addl 16(%rsp),%edx - movl %r8d,%r15d - vpsrld $3,%xmm4,%xmm7 - xorl %ecx,%r12d - shrdl $11,%r14d,%r14d - xorl %r9d,%r15d - vpslld $14,%xmm4,%xmm5 - addl %r12d,%edx - shrdl $6,%r13d,%r13d - andl %r15d,%edi - vpxor %xmm6,%xmm7,%xmm4 - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - vpshufd $250,%xmm0,%xmm7 - shrdl $2,%r14d,%r14d - addl %edx,%r11d - addl %edi,%edx - vpsrld $11,%xmm6,%xmm6 - movl %r11d,%r13d - addl %edx,%r14d - shrdl $14,%r13d,%r13d - vpxor %xmm5,%xmm4,%xmm4 - movl %r14d,%edx - movl %eax,%r12d - shrdl $9,%r14d,%r14d - vpslld $11,%xmm5,%xmm5 - xorl %r11d,%r13d - xorl %ebx,%r12d - shrdl $5,%r13d,%r13d - vpxor %xmm6,%xmm4,%xmm4 - xorl %edx,%r14d - andl %r11d,%r12d - xorl %r11d,%r13d - vpsrld $10,%xmm7,%xmm6 - addl 20(%rsp),%ecx - movl %edx,%edi - xorl %ebx,%r12d - vpxor %xmm5,%xmm4,%xmm4 - shrdl $11,%r14d,%r14d - xorl %r8d,%edi - addl %r12d,%ecx - vpsrlq $17,%xmm7,%xmm7 - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %edx,%r14d - vpaddd %xmm4,%xmm1,%xmm1 - addl %r13d,%ecx - xorl %r8d,%r15d - shrdl $2,%r14d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - addl %ecx,%r10d - addl %r15d,%ecx - movl %r10d,%r13d - vpsrlq $2,%xmm7,%xmm7 - addl %ecx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ecx - vpxor %xmm7,%xmm6,%xmm6 - movl %r11d,%r12d - shrdl $9,%r14d,%r14d - xorl %r10d,%r13d - vpshufb %xmm8,%xmm6,%xmm6 - xorl %eax,%r12d - shrdl $5,%r13d,%r13d - xorl %ecx,%r14d - vpaddd %xmm6,%xmm1,%xmm1 - andl %r10d,%r12d - xorl %r10d,%r13d - addl 24(%rsp),%ebx - vpshufd $80,%xmm1,%xmm7 - movl %ecx,%r15d - xorl %eax,%r12d - shrdl $11,%r14d,%r14d - vpsrld $10,%xmm7,%xmm6 - xorl %edx,%r15d - addl %r12d,%ebx - shrdl $6,%r13d,%r13d - vpsrlq $17,%xmm7,%xmm7 - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - 
vpxor %xmm7,%xmm6,%xmm6 - xorl %edx,%edi - shrdl $2,%r14d,%r14d - addl %ebx,%r9d - vpsrlq $2,%xmm7,%xmm7 - addl %edi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - vpxor %xmm7,%xmm6,%xmm6 - shrdl $14,%r13d,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - vpshufb %xmm9,%xmm6,%xmm6 - shrdl $9,%r14d,%r14d - xorl %r9d,%r13d - xorl %r11d,%r12d - vpaddd %xmm6,%xmm1,%xmm1 - shrdl $5,%r13d,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - vpaddd 32(%rbp),%xmm1,%xmm6 - xorl %r9d,%r13d - addl 28(%rsp),%eax - movl %ebx,%edi - xorl %r11d,%r12d - shrdl $11,%r14d,%r14d - xorl %ecx,%edi - addl %r12d,%eax - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - shrdl $2,%r14d,%r14d - addl %eax,%r8d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - vmovdqa %xmm6,16(%rsp) - vpalignr $4,%xmm2,%xmm3,%xmm4 - shrdl $14,%r13d,%r13d - movl %r14d,%eax - movl %r9d,%r12d - vpalignr $4,%xmm0,%xmm1,%xmm7 - shrdl $9,%r14d,%r14d - xorl %r8d,%r13d - xorl %r10d,%r12d - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%r13d,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - vpaddd %xmm7,%xmm2,%xmm2 - xorl %r8d,%r13d - addl 32(%rsp),%r11d - movl %eax,%r15d - vpsrld $3,%xmm4,%xmm7 - xorl %r10d,%r12d - shrdl $11,%r14d,%r14d - xorl %ebx,%r15d - vpslld $14,%xmm4,%xmm5 - addl %r12d,%r11d - shrdl $6,%r13d,%r13d - andl %r15d,%edi - vpxor %xmm6,%xmm7,%xmm4 - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - vpshufd $250,%xmm1,%xmm7 - shrdl $2,%r14d,%r14d - addl %r11d,%edx - addl %edi,%r11d - vpsrld $11,%xmm6,%xmm6 - movl %edx,%r13d - addl %r11d,%r14d - shrdl $14,%r13d,%r13d - vpxor %xmm5,%xmm4,%xmm4 - movl %r14d,%r11d - movl %r8d,%r12d - shrdl $9,%r14d,%r14d - vpslld $11,%xmm5,%xmm5 - xorl %edx,%r13d - xorl %r9d,%r12d - shrdl $5,%r13d,%r13d - vpxor %xmm6,%xmm4,%xmm4 - xorl %r11d,%r14d - andl %edx,%r12d - xorl %edx,%r13d - vpsrld $10,%xmm7,%xmm6 - addl 36(%rsp),%r10d - movl %r11d,%edi - xorl %r9d,%r12d - vpxor %xmm5,%xmm4,%xmm4 - shrdl $11,%r14d,%r14d - xorl %eax,%edi - addl %r12d,%r10d - vpsrlq 
$17,%xmm7,%xmm7 - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %r11d,%r14d - vpaddd %xmm4,%xmm2,%xmm2 - addl %r13d,%r10d - xorl %eax,%r15d - shrdl $2,%r14d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - addl %r10d,%ecx - addl %r15d,%r10d - movl %ecx,%r13d - vpsrlq $2,%xmm7,%xmm7 - addl %r10d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r10d - vpxor %xmm7,%xmm6,%xmm6 - movl %edx,%r12d - shrdl $9,%r14d,%r14d - xorl %ecx,%r13d - vpshufb %xmm8,%xmm6,%xmm6 - xorl %r8d,%r12d - shrdl $5,%r13d,%r13d - xorl %r10d,%r14d - vpaddd %xmm6,%xmm2,%xmm2 - andl %ecx,%r12d - xorl %ecx,%r13d - addl 40(%rsp),%r9d - vpshufd $80,%xmm2,%xmm7 - movl %r10d,%r15d - xorl %r8d,%r12d - shrdl $11,%r14d,%r14d - vpsrld $10,%xmm7,%xmm6 - xorl %r11d,%r15d - addl %r12d,%r9d - shrdl $6,%r13d,%r13d - vpsrlq $17,%xmm7,%xmm7 - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - vpxor %xmm7,%xmm6,%xmm6 - xorl %r11d,%edi - shrdl $2,%r14d,%r14d - addl %r9d,%ebx - vpsrlq $2,%xmm7,%xmm7 - addl %edi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - shrdl $14,%r13d,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - vpshufb %xmm9,%xmm6,%xmm6 - shrdl $9,%r14d,%r14d - xorl %ebx,%r13d - xorl %edx,%r12d - vpaddd %xmm6,%xmm2,%xmm2 - shrdl $5,%r13d,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - vpaddd 64(%rbp),%xmm2,%xmm6 - xorl %ebx,%r13d - addl 44(%rsp),%r8d - movl %r9d,%edi - xorl %edx,%r12d - shrdl $11,%r14d,%r14d - xorl %r10d,%edi - addl %r12d,%r8d - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - shrdl $2,%r14d,%r14d - addl %r8d,%eax - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - vmovdqa %xmm6,32(%rsp) - vpalignr $4,%xmm3,%xmm0,%xmm4 - shrdl $14,%r13d,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - vpalignr $4,%xmm1,%xmm2,%xmm7 - shrdl $9,%r14d,%r14d - xorl %eax,%r13d - xorl %ecx,%r12d - vpsrld $7,%xmm4,%xmm6 - shrdl $5,%r13d,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - vpaddd %xmm7,%xmm3,%xmm3 - xorl %eax,%r13d - addl 48(%rsp),%edx - movl %r8d,%r15d - vpsrld 
$3,%xmm4,%xmm7 - xorl %ecx,%r12d - shrdl $11,%r14d,%r14d - xorl %r9d,%r15d - vpslld $14,%xmm4,%xmm5 - addl %r12d,%edx - shrdl $6,%r13d,%r13d - andl %r15d,%edi - vpxor %xmm6,%xmm7,%xmm4 - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - vpshufd $250,%xmm2,%xmm7 - shrdl $2,%r14d,%r14d - addl %edx,%r11d - addl %edi,%edx - vpsrld $11,%xmm6,%xmm6 - movl %r11d,%r13d - addl %edx,%r14d - shrdl $14,%r13d,%r13d - vpxor %xmm5,%xmm4,%xmm4 - movl %r14d,%edx - movl %eax,%r12d - shrdl $9,%r14d,%r14d - vpslld $11,%xmm5,%xmm5 - xorl %r11d,%r13d - xorl %ebx,%r12d - shrdl $5,%r13d,%r13d - vpxor %xmm6,%xmm4,%xmm4 - xorl %edx,%r14d - andl %r11d,%r12d - xorl %r11d,%r13d - vpsrld $10,%xmm7,%xmm6 - addl 52(%rsp),%ecx - movl %edx,%edi - xorl %ebx,%r12d - vpxor %xmm5,%xmm4,%xmm4 - shrdl $11,%r14d,%r14d - xorl %r8d,%edi - addl %r12d,%ecx - vpsrlq $17,%xmm7,%xmm7 - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %edx,%r14d - vpaddd %xmm4,%xmm3,%xmm3 - addl %r13d,%ecx - xorl %r8d,%r15d - shrdl $2,%r14d,%r14d - vpxor %xmm7,%xmm6,%xmm6 - addl %ecx,%r10d - addl %r15d,%ecx - movl %r10d,%r13d - vpsrlq $2,%xmm7,%xmm7 - addl %ecx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ecx - vpxor %xmm7,%xmm6,%xmm6 - movl %r11d,%r12d - shrdl $9,%r14d,%r14d - xorl %r10d,%r13d - vpshufb %xmm8,%xmm6,%xmm6 - xorl %eax,%r12d - shrdl $5,%r13d,%r13d - xorl %ecx,%r14d - vpaddd %xmm6,%xmm3,%xmm3 - andl %r10d,%r12d - xorl %r10d,%r13d - addl 56(%rsp),%ebx - vpshufd $80,%xmm3,%xmm7 - movl %ecx,%r15d - xorl %eax,%r12d - shrdl $11,%r14d,%r14d - vpsrld $10,%xmm7,%xmm6 - xorl %edx,%r15d - addl %r12d,%ebx - shrdl $6,%r13d,%r13d - vpsrlq $17,%xmm7,%xmm7 - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - vpxor %xmm7,%xmm6,%xmm6 - xorl %edx,%edi - shrdl $2,%r14d,%r14d - addl %ebx,%r9d - vpsrlq $2,%xmm7,%xmm7 - addl %edi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - vpxor %xmm7,%xmm6,%xmm6 - shrdl $14,%r13d,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - vpshufb %xmm9,%xmm6,%xmm6 - shrdl $9,%r14d,%r14d - xorl %r9d,%r13d - xorl 
%r11d,%r12d - vpaddd %xmm6,%xmm3,%xmm3 - shrdl $5,%r13d,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - vpaddd 96(%rbp),%xmm3,%xmm6 - xorl %r9d,%r13d - addl 60(%rsp),%eax - movl %ebx,%edi - xorl %r11d,%r12d - shrdl $11,%r14d,%r14d - xorl %ecx,%edi - addl %r12d,%eax - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - shrdl $2,%r14d,%r14d - addl %eax,%r8d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - vmovdqa %xmm6,48(%rsp) - cmpb $0,131(%rbp) - jne .Lavx_00_47 - shrdl $14,%r13d,%r13d - movl %r14d,%eax - movl %r9d,%r12d - shrdl $9,%r14d,%r14d - xorl %r8d,%r13d - xorl %r10d,%r12d - shrdl $5,%r13d,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - xorl %r8d,%r13d - addl 0(%rsp),%r11d - movl %eax,%r15d - xorl %r10d,%r12d - shrdl $11,%r14d,%r14d - xorl %ebx,%r15d - addl %r12d,%r11d - shrdl $6,%r13d,%r13d - andl %r15d,%edi - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - shrdl $2,%r14d,%r14d - addl %r11d,%edx - addl %edi,%r11d - movl %edx,%r13d - addl %r11d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r11d - movl %r8d,%r12d - shrdl $9,%r14d,%r14d - xorl %edx,%r13d - xorl %r9d,%r12d - shrdl $5,%r13d,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - xorl %edx,%r13d - addl 4(%rsp),%r10d - movl %r11d,%edi - xorl %r9d,%r12d - shrdl $11,%r14d,%r14d - xorl %eax,%edi - addl %r12d,%r10d - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %r11d,%r14d - addl %r13d,%r10d - xorl %eax,%r15d - shrdl $2,%r14d,%r14d - addl %r10d,%ecx - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r10d - movl %edx,%r12d - shrdl $9,%r14d,%r14d - xorl %ecx,%r13d - xorl %r8d,%r12d - shrdl $5,%r13d,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - xorl %ecx,%r13d - addl 8(%rsp),%r9d - movl %r10d,%r15d - xorl %r8d,%r12d - shrdl $11,%r14d,%r14d - xorl %r11d,%r15d - addl %r12d,%r9d - shrdl $6,%r13d,%r13d - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%edi - shrdl $2,%r14d,%r14d - addl %r9d,%ebx - addl %edi,%r9d 
- movl %ebx,%r13d - addl %r9d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - shrdl $9,%r14d,%r14d - xorl %ebx,%r13d - xorl %edx,%r12d - shrdl $5,%r13d,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - xorl %ebx,%r13d - addl 12(%rsp),%r8d - movl %r9d,%edi - xorl %edx,%r12d - shrdl $11,%r14d,%r14d - xorl %r10d,%edi - addl %r12d,%r8d - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - shrdl $2,%r14d,%r14d - addl %r8d,%eax - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - shrdl $9,%r14d,%r14d - xorl %eax,%r13d - xorl %ecx,%r12d - shrdl $5,%r13d,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - xorl %eax,%r13d - addl 16(%rsp),%edx - movl %r8d,%r15d - xorl %ecx,%r12d - shrdl $11,%r14d,%r14d - xorl %r9d,%r15d - addl %r12d,%edx - shrdl $6,%r13d,%r13d - andl %r15d,%edi - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - shrdl $2,%r14d,%r14d - addl %edx,%r11d - addl %edi,%edx - movl %r11d,%r13d - addl %edx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%edx - movl %eax,%r12d - shrdl $9,%r14d,%r14d - xorl %r11d,%r13d - xorl %ebx,%r12d - shrdl $5,%r13d,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - xorl %r11d,%r13d - addl 20(%rsp),%ecx - movl %edx,%edi - xorl %ebx,%r12d - shrdl $11,%r14d,%r14d - xorl %r8d,%edi - addl %r12d,%ecx - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %edx,%r14d - addl %r13d,%ecx - xorl %r8d,%r15d - shrdl $2,%r14d,%r14d - addl %ecx,%r10d - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - shrdl $9,%r14d,%r14d - xorl %r10d,%r13d - xorl %eax,%r12d - shrdl $5,%r13d,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - xorl %r10d,%r13d - addl 24(%rsp),%ebx - movl %ecx,%r15d - xorl %eax,%r12d - shrdl $11,%r14d,%r14d - xorl %edx,%r15d - addl %r12d,%ebx - shrdl $6,%r13d,%r13d - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl %edx,%edi - shrdl $2,%r14d,%r14d - addl %ebx,%r9d - 
addl %edi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - shrdl $9,%r14d,%r14d - xorl %r9d,%r13d - xorl %r11d,%r12d - shrdl $5,%r13d,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - xorl %r9d,%r13d - addl 28(%rsp),%eax - movl %ebx,%edi - xorl %r11d,%r12d - shrdl $11,%r14d,%r14d - xorl %ecx,%edi - addl %r12d,%eax - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - shrdl $2,%r14d,%r14d - addl %eax,%r8d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%eax - movl %r9d,%r12d - shrdl $9,%r14d,%r14d - xorl %r8d,%r13d - xorl %r10d,%r12d - shrdl $5,%r13d,%r13d - xorl %eax,%r14d - andl %r8d,%r12d - xorl %r8d,%r13d - addl 32(%rsp),%r11d - movl %eax,%r15d - xorl %r10d,%r12d - shrdl $11,%r14d,%r14d - xorl %ebx,%r15d - addl %r12d,%r11d - shrdl $6,%r13d,%r13d - andl %r15d,%edi - xorl %eax,%r14d - addl %r13d,%r11d - xorl %ebx,%edi - shrdl $2,%r14d,%r14d - addl %r11d,%edx - addl %edi,%r11d - movl %edx,%r13d - addl %r11d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r11d - movl %r8d,%r12d - shrdl $9,%r14d,%r14d - xorl %edx,%r13d - xorl %r9d,%r12d - shrdl $5,%r13d,%r13d - xorl %r11d,%r14d - andl %edx,%r12d - xorl %edx,%r13d - addl 36(%rsp),%r10d - movl %r11d,%edi - xorl %r9d,%r12d - shrdl $11,%r14d,%r14d - xorl %eax,%edi - addl %r12d,%r10d - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %r11d,%r14d - addl %r13d,%r10d - xorl %eax,%r15d - shrdl $2,%r14d,%r14d - addl %r10d,%ecx - addl %r15d,%r10d - movl %ecx,%r13d - addl %r10d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r10d - movl %edx,%r12d - shrdl $9,%r14d,%r14d - xorl %ecx,%r13d - xorl %r8d,%r12d - shrdl $5,%r13d,%r13d - xorl %r10d,%r14d - andl %ecx,%r12d - xorl %ecx,%r13d - addl 40(%rsp),%r9d - movl %r10d,%r15d - xorl %r8d,%r12d - shrdl $11,%r14d,%r14d - xorl %r11d,%r15d - addl %r12d,%r9d - shrdl $6,%r13d,%r13d - andl %r15d,%edi - xorl %r10d,%r14d - addl %r13d,%r9d - xorl %r11d,%edi - shrdl 
$2,%r14d,%r14d - addl %r9d,%ebx - addl %edi,%r9d - movl %ebx,%r13d - addl %r9d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r9d - movl %ecx,%r12d - shrdl $9,%r14d,%r14d - xorl %ebx,%r13d - xorl %edx,%r12d - shrdl $5,%r13d,%r13d - xorl %r9d,%r14d - andl %ebx,%r12d - xorl %ebx,%r13d - addl 44(%rsp),%r8d - movl %r9d,%edi - xorl %edx,%r12d - shrdl $11,%r14d,%r14d - xorl %r10d,%edi - addl %r12d,%r8d - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %r9d,%r14d - addl %r13d,%r8d - xorl %r10d,%r15d - shrdl $2,%r14d,%r14d - addl %r8d,%eax - addl %r15d,%r8d - movl %eax,%r13d - addl %r8d,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%r8d - movl %ebx,%r12d - shrdl $9,%r14d,%r14d - xorl %eax,%r13d - xorl %ecx,%r12d - shrdl $5,%r13d,%r13d - xorl %r8d,%r14d - andl %eax,%r12d - xorl %eax,%r13d - addl 48(%rsp),%edx - movl %r8d,%r15d - xorl %ecx,%r12d - shrdl $11,%r14d,%r14d - xorl %r9d,%r15d - addl %r12d,%edx - shrdl $6,%r13d,%r13d - andl %r15d,%edi - xorl %r8d,%r14d - addl %r13d,%edx - xorl %r9d,%edi - shrdl $2,%r14d,%r14d - addl %edx,%r11d - addl %edi,%edx - movl %r11d,%r13d - addl %edx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%edx - movl %eax,%r12d - shrdl $9,%r14d,%r14d - xorl %r11d,%r13d - xorl %ebx,%r12d - shrdl $5,%r13d,%r13d - xorl %edx,%r14d - andl %r11d,%r12d - xorl %r11d,%r13d - addl 52(%rsp),%ecx - movl %edx,%edi - xorl %ebx,%r12d - shrdl $11,%r14d,%r14d - xorl %r8d,%edi - addl %r12d,%ecx - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %edx,%r14d - addl %r13d,%ecx - xorl %r8d,%r15d - shrdl $2,%r14d,%r14d - addl %ecx,%r10d - addl %r15d,%ecx - movl %r10d,%r13d - addl %ecx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ecx - movl %r11d,%r12d - shrdl $9,%r14d,%r14d - xorl %r10d,%r13d - xorl %eax,%r12d - shrdl $5,%r13d,%r13d - xorl %ecx,%r14d - andl %r10d,%r12d - xorl %r10d,%r13d - addl 56(%rsp),%ebx - movl %ecx,%r15d - xorl %eax,%r12d - shrdl $11,%r14d,%r14d - xorl %edx,%r15d - addl %r12d,%ebx - shrdl $6,%r13d,%r13d - andl %r15d,%edi - xorl %ecx,%r14d - addl %r13d,%ebx - xorl 
%edx,%edi - shrdl $2,%r14d,%r14d - addl %ebx,%r9d - addl %edi,%ebx - movl %r9d,%r13d - addl %ebx,%r14d - shrdl $14,%r13d,%r13d - movl %r14d,%ebx - movl %r10d,%r12d - shrdl $9,%r14d,%r14d - xorl %r9d,%r13d - xorl %r11d,%r12d - shrdl $5,%r13d,%r13d - xorl %ebx,%r14d - andl %r9d,%r12d - xorl %r9d,%r13d - addl 60(%rsp),%eax - movl %ebx,%edi - xorl %r11d,%r12d - shrdl $11,%r14d,%r14d - xorl %ecx,%edi - addl %r12d,%eax - shrdl $6,%r13d,%r13d - andl %edi,%r15d - xorl %ebx,%r14d - addl %r13d,%eax - xorl %ecx,%r15d - shrdl $2,%r14d,%r14d - addl %eax,%r8d - addl %r15d,%eax - movl %r8d,%r13d - addl %eax,%r14d - movq 64+0(%rsp),%rdi - movl %r14d,%eax - - addl 0(%rdi),%eax - leaq 64(%rsi),%rsi - addl 4(%rdi),%ebx - addl 8(%rdi),%ecx - addl 12(%rdi),%edx - addl 16(%rdi),%r8d - addl 20(%rdi),%r9d - addl 24(%rdi),%r10d - addl 28(%rdi),%r11d - - cmpq 64+16(%rsp),%rsi - - movl %eax,0(%rdi) - movl %ebx,4(%rdi) - movl %ecx,8(%rdi) - movl %edx,12(%rdi) - movl %r8d,16(%rdi) - movl %r9d,20(%rdi) - movl %r10d,24(%rdi) - movl %r11d,28(%rdi) - jb .Lloop_avx - - movq 88(%rsp),%rsi -.cfi_def_cfa %rsi,8 - vzeroupper - movq -48(%rsi),%r15 -.cfi_restore %r15 - movq -40(%rsi),%r14 -.cfi_restore %r14 - movq -32(%rsi),%r13 -.cfi_restore %r13 - movq -24(%rsi),%r12 -.cfi_restore %r12 - movq -16(%rsi),%rbp -.cfi_restore %rbp - movq -8(%rsi),%rbx -.cfi_restore %rbx - leaq (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Lepilogue_avx: - .byte 0xf3,0xc3 -.cfi_endproc -.size sha256_block_data_order_avx,.-sha256_block_data_order_avx -#endif -.section .note.GNU-stack,"",@progbits diff --git a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha512-x86_64.S b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha512-x86_64.S deleted file mode 100644 index 45a58a1d..00000000 --- a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/sha512-x86_64.S +++ /dev/null @@ -1,2992 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text - -.extern OPENSSL_ia32cap_P -.hidden OPENSSL_ia32cap_P -.globl sha512_block_data_order -.hidden sha512_block_data_order -.type sha512_block_data_order,@function -.align 16 -sha512_block_data_order: -.cfi_startproc - leaq OPENSSL_ia32cap_P(%rip),%r11 - movl 0(%r11),%r9d - movl 4(%r11),%r10d - movl 8(%r11),%r11d - andl $1073741824,%r9d - andl $268435968,%r10d - orl %r9d,%r10d - cmpl $1342177792,%r10d - je .Lavx_shortcut - movq %rsp,%rax -.cfi_def_cfa_register %rax - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 - shlq $4,%rdx - subq $128+32,%rsp - leaq (%rsi,%rdx,8),%rdx - andq $-64,%rsp - movq %rdi,128+0(%rsp) - movq %rsi,128+8(%rsp) - movq %rdx,128+16(%rsp) - movq %rax,152(%rsp) -.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08 -.Lprologue: - - movq 0(%rdi),%rax - movq 8(%rdi),%rbx - movq 16(%rdi),%rcx - movq 24(%rdi),%rdx - movq 32(%rdi),%r8 - movq 40(%rdi),%r9 - movq 48(%rdi),%r10 - movq 56(%rdi),%r11 - jmp .Lloop - -.align 16 -.Lloop: - movq %rbx,%rdi - leaq K512(%rip),%rbp - xorq %rcx,%rdi - movq 0(%rsi),%r12 - movq %r8,%r13 - movq %rax,%r14 - bswapq %r12 - rorq $23,%r13 - movq %r9,%r15 - - xorq %r8,%r13 - rorq $5,%r14 - xorq %r10,%r15 - - movq %r12,0(%rsp) - xorq %rax,%r14 - andq %r8,%r15 - - rorq $4,%r13 - addq %r11,%r12 - xorq %r10,%r15 - - rorq $6,%r14 - xorq %r8,%r13 - addq %r15,%r12 - - movq %rax,%r15 - addq (%rbp),%r12 - xorq %rax,%r14 - - xorq %rbx,%r15 - rorq $14,%r13 - movq %rbx,%r11 - - andq %r15,%rdi - rorq $28,%r14 - addq %r13,%r12 - - xorq %rdi,%r11 - addq %r12,%rdx - addq %r12,%r11 - - leaq 8(%rbp),%rbp - addq %r14,%r11 - movq 8(%rsi),%r12 - 
movq %rdx,%r13 - movq %r11,%r14 - bswapq %r12 - rorq $23,%r13 - movq %r8,%rdi - - xorq %rdx,%r13 - rorq $5,%r14 - xorq %r9,%rdi - - movq %r12,8(%rsp) - xorq %r11,%r14 - andq %rdx,%rdi - - rorq $4,%r13 - addq %r10,%r12 - xorq %r9,%rdi - - rorq $6,%r14 - xorq %rdx,%r13 - addq %rdi,%r12 - - movq %r11,%rdi - addq (%rbp),%r12 - xorq %r11,%r14 - - xorq %rax,%rdi - rorq $14,%r13 - movq %rax,%r10 - - andq %rdi,%r15 - rorq $28,%r14 - addq %r13,%r12 - - xorq %r15,%r10 - addq %r12,%rcx - addq %r12,%r10 - - leaq 24(%rbp),%rbp - addq %r14,%r10 - movq 16(%rsi),%r12 - movq %rcx,%r13 - movq %r10,%r14 - bswapq %r12 - rorq $23,%r13 - movq %rdx,%r15 - - xorq %rcx,%r13 - rorq $5,%r14 - xorq %r8,%r15 - - movq %r12,16(%rsp) - xorq %r10,%r14 - andq %rcx,%r15 - - rorq $4,%r13 - addq %r9,%r12 - xorq %r8,%r15 - - rorq $6,%r14 - xorq %rcx,%r13 - addq %r15,%r12 - - movq %r10,%r15 - addq (%rbp),%r12 - xorq %r10,%r14 - - xorq %r11,%r15 - rorq $14,%r13 - movq %r11,%r9 - - andq %r15,%rdi - rorq $28,%r14 - addq %r13,%r12 - - xorq %rdi,%r9 - addq %r12,%rbx - addq %r12,%r9 - - leaq 8(%rbp),%rbp - addq %r14,%r9 - movq 24(%rsi),%r12 - movq %rbx,%r13 - movq %r9,%r14 - bswapq %r12 - rorq $23,%r13 - movq %rcx,%rdi - - xorq %rbx,%r13 - rorq $5,%r14 - xorq %rdx,%rdi - - movq %r12,24(%rsp) - xorq %r9,%r14 - andq %rbx,%rdi - - rorq $4,%r13 - addq %r8,%r12 - xorq %rdx,%rdi - - rorq $6,%r14 - xorq %rbx,%r13 - addq %rdi,%r12 - - movq %r9,%rdi - addq (%rbp),%r12 - xorq %r9,%r14 - - xorq %r10,%rdi - rorq $14,%r13 - movq %r10,%r8 - - andq %rdi,%r15 - rorq $28,%r14 - addq %r13,%r12 - - xorq %r15,%r8 - addq %r12,%rax - addq %r12,%r8 - - leaq 24(%rbp),%rbp - addq %r14,%r8 - movq 32(%rsi),%r12 - movq %rax,%r13 - movq %r8,%r14 - bswapq %r12 - rorq $23,%r13 - movq %rbx,%r15 - - xorq %rax,%r13 - rorq $5,%r14 - xorq %rcx,%r15 - - movq %r12,32(%rsp) - xorq %r8,%r14 - andq %rax,%r15 - - rorq $4,%r13 - addq %rdx,%r12 - xorq %rcx,%r15 - - rorq $6,%r14 - xorq %rax,%r13 - addq %r15,%r12 - - movq %r8,%r15 - addq (%rbp),%r12 - 
xorq %r8,%r14 - - xorq %r9,%r15 - rorq $14,%r13 - movq %r9,%rdx - - andq %r15,%rdi - rorq $28,%r14 - addq %r13,%r12 - - xorq %rdi,%rdx - addq %r12,%r11 - addq %r12,%rdx - - leaq 8(%rbp),%rbp - addq %r14,%rdx - movq 40(%rsi),%r12 - movq %r11,%r13 - movq %rdx,%r14 - bswapq %r12 - rorq $23,%r13 - movq %rax,%rdi - - xorq %r11,%r13 - rorq $5,%r14 - xorq %rbx,%rdi - - movq %r12,40(%rsp) - xorq %rdx,%r14 - andq %r11,%rdi - - rorq $4,%r13 - addq %rcx,%r12 - xorq %rbx,%rdi - - rorq $6,%r14 - xorq %r11,%r13 - addq %rdi,%r12 - - movq %rdx,%rdi - addq (%rbp),%r12 - xorq %rdx,%r14 - - xorq %r8,%rdi - rorq $14,%r13 - movq %r8,%rcx - - andq %rdi,%r15 - rorq $28,%r14 - addq %r13,%r12 - - xorq %r15,%rcx - addq %r12,%r10 - addq %r12,%rcx - - leaq 24(%rbp),%rbp - addq %r14,%rcx - movq 48(%rsi),%r12 - movq %r10,%r13 - movq %rcx,%r14 - bswapq %r12 - rorq $23,%r13 - movq %r11,%r15 - - xorq %r10,%r13 - rorq $5,%r14 - xorq %rax,%r15 - - movq %r12,48(%rsp) - xorq %rcx,%r14 - andq %r10,%r15 - - rorq $4,%r13 - addq %rbx,%r12 - xorq %rax,%r15 - - rorq $6,%r14 - xorq %r10,%r13 - addq %r15,%r12 - - movq %rcx,%r15 - addq (%rbp),%r12 - xorq %rcx,%r14 - - xorq %rdx,%r15 - rorq $14,%r13 - movq %rdx,%rbx - - andq %r15,%rdi - rorq $28,%r14 - addq %r13,%r12 - - xorq %rdi,%rbx - addq %r12,%r9 - addq %r12,%rbx - - leaq 8(%rbp),%rbp - addq %r14,%rbx - movq 56(%rsi),%r12 - movq %r9,%r13 - movq %rbx,%r14 - bswapq %r12 - rorq $23,%r13 - movq %r10,%rdi - - xorq %r9,%r13 - rorq $5,%r14 - xorq %r11,%rdi - - movq %r12,56(%rsp) - xorq %rbx,%r14 - andq %r9,%rdi - - rorq $4,%r13 - addq %rax,%r12 - xorq %r11,%rdi - - rorq $6,%r14 - xorq %r9,%r13 - addq %rdi,%r12 - - movq %rbx,%rdi - addq (%rbp),%r12 - xorq %rbx,%r14 - - xorq %rcx,%rdi - rorq $14,%r13 - movq %rcx,%rax - - andq %rdi,%r15 - rorq $28,%r14 - addq %r13,%r12 - - xorq %r15,%rax - addq %r12,%r8 - addq %r12,%rax - - leaq 24(%rbp),%rbp - addq %r14,%rax - movq 64(%rsi),%r12 - movq %r8,%r13 - movq %rax,%r14 - bswapq %r12 - rorq $23,%r13 - movq %r9,%r15 - - xorq 
%r8,%r13 - rorq $5,%r14 - xorq %r10,%r15 - - movq %r12,64(%rsp) - xorq %rax,%r14 - andq %r8,%r15 - - rorq $4,%r13 - addq %r11,%r12 - xorq %r10,%r15 - - rorq $6,%r14 - xorq %r8,%r13 - addq %r15,%r12 - - movq %rax,%r15 - addq (%rbp),%r12 - xorq %rax,%r14 - - xorq %rbx,%r15 - rorq $14,%r13 - movq %rbx,%r11 - - andq %r15,%rdi - rorq $28,%r14 - addq %r13,%r12 - - xorq %rdi,%r11 - addq %r12,%rdx - addq %r12,%r11 - - leaq 8(%rbp),%rbp - addq %r14,%r11 - movq 72(%rsi),%r12 - movq %rdx,%r13 - movq %r11,%r14 - bswapq %r12 - rorq $23,%r13 - movq %r8,%rdi - - xorq %rdx,%r13 - rorq $5,%r14 - xorq %r9,%rdi - - movq %r12,72(%rsp) - xorq %r11,%r14 - andq %rdx,%rdi - - rorq $4,%r13 - addq %r10,%r12 - xorq %r9,%rdi - - rorq $6,%r14 - xorq %rdx,%r13 - addq %rdi,%r12 - - movq %r11,%rdi - addq (%rbp),%r12 - xorq %r11,%r14 - - xorq %rax,%rdi - rorq $14,%r13 - movq %rax,%r10 - - andq %rdi,%r15 - rorq $28,%r14 - addq %r13,%r12 - - xorq %r15,%r10 - addq %r12,%rcx - addq %r12,%r10 - - leaq 24(%rbp),%rbp - addq %r14,%r10 - movq 80(%rsi),%r12 - movq %rcx,%r13 - movq %r10,%r14 - bswapq %r12 - rorq $23,%r13 - movq %rdx,%r15 - - xorq %rcx,%r13 - rorq $5,%r14 - xorq %r8,%r15 - - movq %r12,80(%rsp) - xorq %r10,%r14 - andq %rcx,%r15 - - rorq $4,%r13 - addq %r9,%r12 - xorq %r8,%r15 - - rorq $6,%r14 - xorq %rcx,%r13 - addq %r15,%r12 - - movq %r10,%r15 - addq (%rbp),%r12 - xorq %r10,%r14 - - xorq %r11,%r15 - rorq $14,%r13 - movq %r11,%r9 - - andq %r15,%rdi - rorq $28,%r14 - addq %r13,%r12 - - xorq %rdi,%r9 - addq %r12,%rbx - addq %r12,%r9 - - leaq 8(%rbp),%rbp - addq %r14,%r9 - movq 88(%rsi),%r12 - movq %rbx,%r13 - movq %r9,%r14 - bswapq %r12 - rorq $23,%r13 - movq %rcx,%rdi - - xorq %rbx,%r13 - rorq $5,%r14 - xorq %rdx,%rdi - - movq %r12,88(%rsp) - xorq %r9,%r14 - andq %rbx,%rdi - - rorq $4,%r13 - addq %r8,%r12 - xorq %rdx,%rdi - - rorq $6,%r14 - xorq %rbx,%r13 - addq %rdi,%r12 - - movq %r9,%rdi - addq (%rbp),%r12 - xorq %r9,%r14 - - xorq %r10,%rdi - rorq $14,%r13 - movq %r10,%r8 - - andq %rdi,%r15 - 
rorq $28,%r14 - addq %r13,%r12 - - xorq %r15,%r8 - addq %r12,%rax - addq %r12,%r8 - - leaq 24(%rbp),%rbp - addq %r14,%r8 - movq 96(%rsi),%r12 - movq %rax,%r13 - movq %r8,%r14 - bswapq %r12 - rorq $23,%r13 - movq %rbx,%r15 - - xorq %rax,%r13 - rorq $5,%r14 - xorq %rcx,%r15 - - movq %r12,96(%rsp) - xorq %r8,%r14 - andq %rax,%r15 - - rorq $4,%r13 - addq %rdx,%r12 - xorq %rcx,%r15 - - rorq $6,%r14 - xorq %rax,%r13 - addq %r15,%r12 - - movq %r8,%r15 - addq (%rbp),%r12 - xorq %r8,%r14 - - xorq %r9,%r15 - rorq $14,%r13 - movq %r9,%rdx - - andq %r15,%rdi - rorq $28,%r14 - addq %r13,%r12 - - xorq %rdi,%rdx - addq %r12,%r11 - addq %r12,%rdx - - leaq 8(%rbp),%rbp - addq %r14,%rdx - movq 104(%rsi),%r12 - movq %r11,%r13 - movq %rdx,%r14 - bswapq %r12 - rorq $23,%r13 - movq %rax,%rdi - - xorq %r11,%r13 - rorq $5,%r14 - xorq %rbx,%rdi - - movq %r12,104(%rsp) - xorq %rdx,%r14 - andq %r11,%rdi - - rorq $4,%r13 - addq %rcx,%r12 - xorq %rbx,%rdi - - rorq $6,%r14 - xorq %r11,%r13 - addq %rdi,%r12 - - movq %rdx,%rdi - addq (%rbp),%r12 - xorq %rdx,%r14 - - xorq %r8,%rdi - rorq $14,%r13 - movq %r8,%rcx - - andq %rdi,%r15 - rorq $28,%r14 - addq %r13,%r12 - - xorq %r15,%rcx - addq %r12,%r10 - addq %r12,%rcx - - leaq 24(%rbp),%rbp - addq %r14,%rcx - movq 112(%rsi),%r12 - movq %r10,%r13 - movq %rcx,%r14 - bswapq %r12 - rorq $23,%r13 - movq %r11,%r15 - - xorq %r10,%r13 - rorq $5,%r14 - xorq %rax,%r15 - - movq %r12,112(%rsp) - xorq %rcx,%r14 - andq %r10,%r15 - - rorq $4,%r13 - addq %rbx,%r12 - xorq %rax,%r15 - - rorq $6,%r14 - xorq %r10,%r13 - addq %r15,%r12 - - movq %rcx,%r15 - addq (%rbp),%r12 - xorq %rcx,%r14 - - xorq %rdx,%r15 - rorq $14,%r13 - movq %rdx,%rbx - - andq %r15,%rdi - rorq $28,%r14 - addq %r13,%r12 - - xorq %rdi,%rbx - addq %r12,%r9 - addq %r12,%rbx - - leaq 8(%rbp),%rbp - addq %r14,%rbx - movq 120(%rsi),%r12 - movq %r9,%r13 - movq %rbx,%r14 - bswapq %r12 - rorq $23,%r13 - movq %r10,%rdi - - xorq %r9,%r13 - rorq $5,%r14 - xorq %r11,%rdi - - movq %r12,120(%rsp) - xorq %rbx,%r14 
- andq %r9,%rdi - - rorq $4,%r13 - addq %rax,%r12 - xorq %r11,%rdi - - rorq $6,%r14 - xorq %r9,%r13 - addq %rdi,%r12 - - movq %rbx,%rdi - addq (%rbp),%r12 - xorq %rbx,%r14 - - xorq %rcx,%rdi - rorq $14,%r13 - movq %rcx,%rax - - andq %rdi,%r15 - rorq $28,%r14 - addq %r13,%r12 - - xorq %r15,%rax - addq %r12,%r8 - addq %r12,%rax - - leaq 24(%rbp),%rbp - jmp .Lrounds_16_xx -.align 16 -.Lrounds_16_xx: - movq 8(%rsp),%r13 - movq 112(%rsp),%r15 - - movq %r13,%r12 - rorq $7,%r13 - addq %r14,%rax - movq %r15,%r14 - rorq $42,%r15 - - xorq %r12,%r13 - shrq $7,%r12 - rorq $1,%r13 - xorq %r14,%r15 - shrq $6,%r14 - - rorq $19,%r15 - xorq %r13,%r12 - xorq %r14,%r15 - addq 72(%rsp),%r12 - - addq 0(%rsp),%r12 - movq %r8,%r13 - addq %r15,%r12 - movq %rax,%r14 - rorq $23,%r13 - movq %r9,%r15 - - xorq %r8,%r13 - rorq $5,%r14 - xorq %r10,%r15 - - movq %r12,0(%rsp) - xorq %rax,%r14 - andq %r8,%r15 - - rorq $4,%r13 - addq %r11,%r12 - xorq %r10,%r15 - - rorq $6,%r14 - xorq %r8,%r13 - addq %r15,%r12 - - movq %rax,%r15 - addq (%rbp),%r12 - xorq %rax,%r14 - - xorq %rbx,%r15 - rorq $14,%r13 - movq %rbx,%r11 - - andq %r15,%rdi - rorq $28,%r14 - addq %r13,%r12 - - xorq %rdi,%r11 - addq %r12,%rdx - addq %r12,%r11 - - leaq 8(%rbp),%rbp - movq 16(%rsp),%r13 - movq 120(%rsp),%rdi - - movq %r13,%r12 - rorq $7,%r13 - addq %r14,%r11 - movq %rdi,%r14 - rorq $42,%rdi - - xorq %r12,%r13 - shrq $7,%r12 - rorq $1,%r13 - xorq %r14,%rdi - shrq $6,%r14 - - rorq $19,%rdi - xorq %r13,%r12 - xorq %r14,%rdi - addq 80(%rsp),%r12 - - addq 8(%rsp),%r12 - movq %rdx,%r13 - addq %rdi,%r12 - movq %r11,%r14 - rorq $23,%r13 - movq %r8,%rdi - - xorq %rdx,%r13 - rorq $5,%r14 - xorq %r9,%rdi - - movq %r12,8(%rsp) - xorq %r11,%r14 - andq %rdx,%rdi - - rorq $4,%r13 - addq %r10,%r12 - xorq %r9,%rdi - - rorq $6,%r14 - xorq %rdx,%r13 - addq %rdi,%r12 - - movq %r11,%rdi - addq (%rbp),%r12 - xorq %r11,%r14 - - xorq %rax,%rdi - rorq $14,%r13 - movq %rax,%r10 - - andq %rdi,%r15 - rorq $28,%r14 - addq %r13,%r12 - - xorq %r15,%r10 - 
addq %r12,%rcx - addq %r12,%r10 - - leaq 24(%rbp),%rbp - movq 24(%rsp),%r13 - movq 0(%rsp),%r15 - - movq %r13,%r12 - rorq $7,%r13 - addq %r14,%r10 - movq %r15,%r14 - rorq $42,%r15 - - xorq %r12,%r13 - shrq $7,%r12 - rorq $1,%r13 - xorq %r14,%r15 - shrq $6,%r14 - - rorq $19,%r15 - xorq %r13,%r12 - xorq %r14,%r15 - addq 88(%rsp),%r12 - - addq 16(%rsp),%r12 - movq %rcx,%r13 - addq %r15,%r12 - movq %r10,%r14 - rorq $23,%r13 - movq %rdx,%r15 - - xorq %rcx,%r13 - rorq $5,%r14 - xorq %r8,%r15 - - movq %r12,16(%rsp) - xorq %r10,%r14 - andq %rcx,%r15 - - rorq $4,%r13 - addq %r9,%r12 - xorq %r8,%r15 - - rorq $6,%r14 - xorq %rcx,%r13 - addq %r15,%r12 - - movq %r10,%r15 - addq (%rbp),%r12 - xorq %r10,%r14 - - xorq %r11,%r15 - rorq $14,%r13 - movq %r11,%r9 - - andq %r15,%rdi - rorq $28,%r14 - addq %r13,%r12 - - xorq %rdi,%r9 - addq %r12,%rbx - addq %r12,%r9 - - leaq 8(%rbp),%rbp - movq 32(%rsp),%r13 - movq 8(%rsp),%rdi - - movq %r13,%r12 - rorq $7,%r13 - addq %r14,%r9 - movq %rdi,%r14 - rorq $42,%rdi - - xorq %r12,%r13 - shrq $7,%r12 - rorq $1,%r13 - xorq %r14,%rdi - shrq $6,%r14 - - rorq $19,%rdi - xorq %r13,%r12 - xorq %r14,%rdi - addq 96(%rsp),%r12 - - addq 24(%rsp),%r12 - movq %rbx,%r13 - addq %rdi,%r12 - movq %r9,%r14 - rorq $23,%r13 - movq %rcx,%rdi - - xorq %rbx,%r13 - rorq $5,%r14 - xorq %rdx,%rdi - - movq %r12,24(%rsp) - xorq %r9,%r14 - andq %rbx,%rdi - - rorq $4,%r13 - addq %r8,%r12 - xorq %rdx,%rdi - - rorq $6,%r14 - xorq %rbx,%r13 - addq %rdi,%r12 - - movq %r9,%rdi - addq (%rbp),%r12 - xorq %r9,%r14 - - xorq %r10,%rdi - rorq $14,%r13 - movq %r10,%r8 - - andq %rdi,%r15 - rorq $28,%r14 - addq %r13,%r12 - - xorq %r15,%r8 - addq %r12,%rax - addq %r12,%r8 - - leaq 24(%rbp),%rbp - movq 40(%rsp),%r13 - movq 16(%rsp),%r15 - - movq %r13,%r12 - rorq $7,%r13 - addq %r14,%r8 - movq %r15,%r14 - rorq $42,%r15 - - xorq %r12,%r13 - shrq $7,%r12 - rorq $1,%r13 - xorq %r14,%r15 - shrq $6,%r14 - - rorq $19,%r15 - xorq %r13,%r12 - xorq %r14,%r15 - addq 104(%rsp),%r12 - - addq 
32(%rsp),%r12 - movq %rax,%r13 - addq %r15,%r12 - movq %r8,%r14 - rorq $23,%r13 - movq %rbx,%r15 - - xorq %rax,%r13 - rorq $5,%r14 - xorq %rcx,%r15 - - movq %r12,32(%rsp) - xorq %r8,%r14 - andq %rax,%r15 - - rorq $4,%r13 - addq %rdx,%r12 - xorq %rcx,%r15 - - rorq $6,%r14 - xorq %rax,%r13 - addq %r15,%r12 - - movq %r8,%r15 - addq (%rbp),%r12 - xorq %r8,%r14 - - xorq %r9,%r15 - rorq $14,%r13 - movq %r9,%rdx - - andq %r15,%rdi - rorq $28,%r14 - addq %r13,%r12 - - xorq %rdi,%rdx - addq %r12,%r11 - addq %r12,%rdx - - leaq 8(%rbp),%rbp - movq 48(%rsp),%r13 - movq 24(%rsp),%rdi - - movq %r13,%r12 - rorq $7,%r13 - addq %r14,%rdx - movq %rdi,%r14 - rorq $42,%rdi - - xorq %r12,%r13 - shrq $7,%r12 - rorq $1,%r13 - xorq %r14,%rdi - shrq $6,%r14 - - rorq $19,%rdi - xorq %r13,%r12 - xorq %r14,%rdi - addq 112(%rsp),%r12 - - addq 40(%rsp),%r12 - movq %r11,%r13 - addq %rdi,%r12 - movq %rdx,%r14 - rorq $23,%r13 - movq %rax,%rdi - - xorq %r11,%r13 - rorq $5,%r14 - xorq %rbx,%rdi - - movq %r12,40(%rsp) - xorq %rdx,%r14 - andq %r11,%rdi - - rorq $4,%r13 - addq %rcx,%r12 - xorq %rbx,%rdi - - rorq $6,%r14 - xorq %r11,%r13 - addq %rdi,%r12 - - movq %rdx,%rdi - addq (%rbp),%r12 - xorq %rdx,%r14 - - xorq %r8,%rdi - rorq $14,%r13 - movq %r8,%rcx - - andq %rdi,%r15 - rorq $28,%r14 - addq %r13,%r12 - - xorq %r15,%rcx - addq %r12,%r10 - addq %r12,%rcx - - leaq 24(%rbp),%rbp - movq 56(%rsp),%r13 - movq 32(%rsp),%r15 - - movq %r13,%r12 - rorq $7,%r13 - addq %r14,%rcx - movq %r15,%r14 - rorq $42,%r15 - - xorq %r12,%r13 - shrq $7,%r12 - rorq $1,%r13 - xorq %r14,%r15 - shrq $6,%r14 - - rorq $19,%r15 - xorq %r13,%r12 - xorq %r14,%r15 - addq 120(%rsp),%r12 - - addq 48(%rsp),%r12 - movq %r10,%r13 - addq %r15,%r12 - movq %rcx,%r14 - rorq $23,%r13 - movq %r11,%r15 - - xorq %r10,%r13 - rorq $5,%r14 - xorq %rax,%r15 - - movq %r12,48(%rsp) - xorq %rcx,%r14 - andq %r10,%r15 - - rorq $4,%r13 - addq %rbx,%r12 - xorq %rax,%r15 - - rorq $6,%r14 - xorq %r10,%r13 - addq %r15,%r12 - - movq %rcx,%r15 - addq 
(%rbp),%r12 - xorq %rcx,%r14 - - xorq %rdx,%r15 - rorq $14,%r13 - movq %rdx,%rbx - - andq %r15,%rdi - rorq $28,%r14 - addq %r13,%r12 - - xorq %rdi,%rbx - addq %r12,%r9 - addq %r12,%rbx - - leaq 8(%rbp),%rbp - movq 64(%rsp),%r13 - movq 40(%rsp),%rdi - - movq %r13,%r12 - rorq $7,%r13 - addq %r14,%rbx - movq %rdi,%r14 - rorq $42,%rdi - - xorq %r12,%r13 - shrq $7,%r12 - rorq $1,%r13 - xorq %r14,%rdi - shrq $6,%r14 - - rorq $19,%rdi - xorq %r13,%r12 - xorq %r14,%rdi - addq 0(%rsp),%r12 - - addq 56(%rsp),%r12 - movq %r9,%r13 - addq %rdi,%r12 - movq %rbx,%r14 - rorq $23,%r13 - movq %r10,%rdi - - xorq %r9,%r13 - rorq $5,%r14 - xorq %r11,%rdi - - movq %r12,56(%rsp) - xorq %rbx,%r14 - andq %r9,%rdi - - rorq $4,%r13 - addq %rax,%r12 - xorq %r11,%rdi - - rorq $6,%r14 - xorq %r9,%r13 - addq %rdi,%r12 - - movq %rbx,%rdi - addq (%rbp),%r12 - xorq %rbx,%r14 - - xorq %rcx,%rdi - rorq $14,%r13 - movq %rcx,%rax - - andq %rdi,%r15 - rorq $28,%r14 - addq %r13,%r12 - - xorq %r15,%rax - addq %r12,%r8 - addq %r12,%rax - - leaq 24(%rbp),%rbp - movq 72(%rsp),%r13 - movq 48(%rsp),%r15 - - movq %r13,%r12 - rorq $7,%r13 - addq %r14,%rax - movq %r15,%r14 - rorq $42,%r15 - - xorq %r12,%r13 - shrq $7,%r12 - rorq $1,%r13 - xorq %r14,%r15 - shrq $6,%r14 - - rorq $19,%r15 - xorq %r13,%r12 - xorq %r14,%r15 - addq 8(%rsp),%r12 - - addq 64(%rsp),%r12 - movq %r8,%r13 - addq %r15,%r12 - movq %rax,%r14 - rorq $23,%r13 - movq %r9,%r15 - - xorq %r8,%r13 - rorq $5,%r14 - xorq %r10,%r15 - - movq %r12,64(%rsp) - xorq %rax,%r14 - andq %r8,%r15 - - rorq $4,%r13 - addq %r11,%r12 - xorq %r10,%r15 - - rorq $6,%r14 - xorq %r8,%r13 - addq %r15,%r12 - - movq %rax,%r15 - addq (%rbp),%r12 - xorq %rax,%r14 - - xorq %rbx,%r15 - rorq $14,%r13 - movq %rbx,%r11 - - andq %r15,%rdi - rorq $28,%r14 - addq %r13,%r12 - - xorq %rdi,%r11 - addq %r12,%rdx - addq %r12,%r11 - - leaq 8(%rbp),%rbp - movq 80(%rsp),%r13 - movq 56(%rsp),%rdi - - movq %r13,%r12 - rorq $7,%r13 - addq %r14,%r11 - movq %rdi,%r14 - rorq $42,%rdi - - xorq 
%r12,%r13 - shrq $7,%r12 - rorq $1,%r13 - xorq %r14,%rdi - shrq $6,%r14 - - rorq $19,%rdi - xorq %r13,%r12 - xorq %r14,%rdi - addq 16(%rsp),%r12 - - addq 72(%rsp),%r12 - movq %rdx,%r13 - addq %rdi,%r12 - movq %r11,%r14 - rorq $23,%r13 - movq %r8,%rdi - - xorq %rdx,%r13 - rorq $5,%r14 - xorq %r9,%rdi - - movq %r12,72(%rsp) - xorq %r11,%r14 - andq %rdx,%rdi - - rorq $4,%r13 - addq %r10,%r12 - xorq %r9,%rdi - - rorq $6,%r14 - xorq %rdx,%r13 - addq %rdi,%r12 - - movq %r11,%rdi - addq (%rbp),%r12 - xorq %r11,%r14 - - xorq %rax,%rdi - rorq $14,%r13 - movq %rax,%r10 - - andq %rdi,%r15 - rorq $28,%r14 - addq %r13,%r12 - - xorq %r15,%r10 - addq %r12,%rcx - addq %r12,%r10 - - leaq 24(%rbp),%rbp - movq 88(%rsp),%r13 - movq 64(%rsp),%r15 - - movq %r13,%r12 - rorq $7,%r13 - addq %r14,%r10 - movq %r15,%r14 - rorq $42,%r15 - - xorq %r12,%r13 - shrq $7,%r12 - rorq $1,%r13 - xorq %r14,%r15 - shrq $6,%r14 - - rorq $19,%r15 - xorq %r13,%r12 - xorq %r14,%r15 - addq 24(%rsp),%r12 - - addq 80(%rsp),%r12 - movq %rcx,%r13 - addq %r15,%r12 - movq %r10,%r14 - rorq $23,%r13 - movq %rdx,%r15 - - xorq %rcx,%r13 - rorq $5,%r14 - xorq %r8,%r15 - - movq %r12,80(%rsp) - xorq %r10,%r14 - andq %rcx,%r15 - - rorq $4,%r13 - addq %r9,%r12 - xorq %r8,%r15 - - rorq $6,%r14 - xorq %rcx,%r13 - addq %r15,%r12 - - movq %r10,%r15 - addq (%rbp),%r12 - xorq %r10,%r14 - - xorq %r11,%r15 - rorq $14,%r13 - movq %r11,%r9 - - andq %r15,%rdi - rorq $28,%r14 - addq %r13,%r12 - - xorq %rdi,%r9 - addq %r12,%rbx - addq %r12,%r9 - - leaq 8(%rbp),%rbp - movq 96(%rsp),%r13 - movq 72(%rsp),%rdi - - movq %r13,%r12 - rorq $7,%r13 - addq %r14,%r9 - movq %rdi,%r14 - rorq $42,%rdi - - xorq %r12,%r13 - shrq $7,%r12 - rorq $1,%r13 - xorq %r14,%rdi - shrq $6,%r14 - - rorq $19,%rdi - xorq %r13,%r12 - xorq %r14,%rdi - addq 32(%rsp),%r12 - - addq 88(%rsp),%r12 - movq %rbx,%r13 - addq %rdi,%r12 - movq %r9,%r14 - rorq $23,%r13 - movq %rcx,%rdi - - xorq %rbx,%r13 - rorq $5,%r14 - xorq %rdx,%rdi - - movq %r12,88(%rsp) - xorq %r9,%r14 - 
andq %rbx,%rdi - - rorq $4,%r13 - addq %r8,%r12 - xorq %rdx,%rdi - - rorq $6,%r14 - xorq %rbx,%r13 - addq %rdi,%r12 - - movq %r9,%rdi - addq (%rbp),%r12 - xorq %r9,%r14 - - xorq %r10,%rdi - rorq $14,%r13 - movq %r10,%r8 - - andq %rdi,%r15 - rorq $28,%r14 - addq %r13,%r12 - - xorq %r15,%r8 - addq %r12,%rax - addq %r12,%r8 - - leaq 24(%rbp),%rbp - movq 104(%rsp),%r13 - movq 80(%rsp),%r15 - - movq %r13,%r12 - rorq $7,%r13 - addq %r14,%r8 - movq %r15,%r14 - rorq $42,%r15 - - xorq %r12,%r13 - shrq $7,%r12 - rorq $1,%r13 - xorq %r14,%r15 - shrq $6,%r14 - - rorq $19,%r15 - xorq %r13,%r12 - xorq %r14,%r15 - addq 40(%rsp),%r12 - - addq 96(%rsp),%r12 - movq %rax,%r13 - addq %r15,%r12 - movq %r8,%r14 - rorq $23,%r13 - movq %rbx,%r15 - - xorq %rax,%r13 - rorq $5,%r14 - xorq %rcx,%r15 - - movq %r12,96(%rsp) - xorq %r8,%r14 - andq %rax,%r15 - - rorq $4,%r13 - addq %rdx,%r12 - xorq %rcx,%r15 - - rorq $6,%r14 - xorq %rax,%r13 - addq %r15,%r12 - - movq %r8,%r15 - addq (%rbp),%r12 - xorq %r8,%r14 - - xorq %r9,%r15 - rorq $14,%r13 - movq %r9,%rdx - - andq %r15,%rdi - rorq $28,%r14 - addq %r13,%r12 - - xorq %rdi,%rdx - addq %r12,%r11 - addq %r12,%rdx - - leaq 8(%rbp),%rbp - movq 112(%rsp),%r13 - movq 88(%rsp),%rdi - - movq %r13,%r12 - rorq $7,%r13 - addq %r14,%rdx - movq %rdi,%r14 - rorq $42,%rdi - - xorq %r12,%r13 - shrq $7,%r12 - rorq $1,%r13 - xorq %r14,%rdi - shrq $6,%r14 - - rorq $19,%rdi - xorq %r13,%r12 - xorq %r14,%rdi - addq 48(%rsp),%r12 - - addq 104(%rsp),%r12 - movq %r11,%r13 - addq %rdi,%r12 - movq %rdx,%r14 - rorq $23,%r13 - movq %rax,%rdi - - xorq %r11,%r13 - rorq $5,%r14 - xorq %rbx,%rdi - - movq %r12,104(%rsp) - xorq %rdx,%r14 - andq %r11,%rdi - - rorq $4,%r13 - addq %rcx,%r12 - xorq %rbx,%rdi - - rorq $6,%r14 - xorq %r11,%r13 - addq %rdi,%r12 - - movq %rdx,%rdi - addq (%rbp),%r12 - xorq %rdx,%r14 - - xorq %r8,%rdi - rorq $14,%r13 - movq %r8,%rcx - - andq %rdi,%r15 - rorq $28,%r14 - addq %r13,%r12 - - xorq %r15,%rcx - addq %r12,%r10 - addq %r12,%rcx - - leaq 
24(%rbp),%rbp - movq 120(%rsp),%r13 - movq 96(%rsp),%r15 - - movq %r13,%r12 - rorq $7,%r13 - addq %r14,%rcx - movq %r15,%r14 - rorq $42,%r15 - - xorq %r12,%r13 - shrq $7,%r12 - rorq $1,%r13 - xorq %r14,%r15 - shrq $6,%r14 - - rorq $19,%r15 - xorq %r13,%r12 - xorq %r14,%r15 - addq 56(%rsp),%r12 - - addq 112(%rsp),%r12 - movq %r10,%r13 - addq %r15,%r12 - movq %rcx,%r14 - rorq $23,%r13 - movq %r11,%r15 - - xorq %r10,%r13 - rorq $5,%r14 - xorq %rax,%r15 - - movq %r12,112(%rsp) - xorq %rcx,%r14 - andq %r10,%r15 - - rorq $4,%r13 - addq %rbx,%r12 - xorq %rax,%r15 - - rorq $6,%r14 - xorq %r10,%r13 - addq %r15,%r12 - - movq %rcx,%r15 - addq (%rbp),%r12 - xorq %rcx,%r14 - - xorq %rdx,%r15 - rorq $14,%r13 - movq %rdx,%rbx - - andq %r15,%rdi - rorq $28,%r14 - addq %r13,%r12 - - xorq %rdi,%rbx - addq %r12,%r9 - addq %r12,%rbx - - leaq 8(%rbp),%rbp - movq 0(%rsp),%r13 - movq 104(%rsp),%rdi - - movq %r13,%r12 - rorq $7,%r13 - addq %r14,%rbx - movq %rdi,%r14 - rorq $42,%rdi - - xorq %r12,%r13 - shrq $7,%r12 - rorq $1,%r13 - xorq %r14,%rdi - shrq $6,%r14 - - rorq $19,%rdi - xorq %r13,%r12 - xorq %r14,%rdi - addq 64(%rsp),%r12 - - addq 120(%rsp),%r12 - movq %r9,%r13 - addq %rdi,%r12 - movq %rbx,%r14 - rorq $23,%r13 - movq %r10,%rdi - - xorq %r9,%r13 - rorq $5,%r14 - xorq %r11,%rdi - - movq %r12,120(%rsp) - xorq %rbx,%r14 - andq %r9,%rdi - - rorq $4,%r13 - addq %rax,%r12 - xorq %r11,%rdi - - rorq $6,%r14 - xorq %r9,%r13 - addq %rdi,%r12 - - movq %rbx,%rdi - addq (%rbp),%r12 - xorq %rbx,%r14 - - xorq %rcx,%rdi - rorq $14,%r13 - movq %rcx,%rax - - andq %rdi,%r15 - rorq $28,%r14 - addq %r13,%r12 - - xorq %r15,%rax - addq %r12,%r8 - addq %r12,%rax - - leaq 24(%rbp),%rbp - cmpb $0,7(%rbp) - jnz .Lrounds_16_xx - - movq 128+0(%rsp),%rdi - addq %r14,%rax - leaq 128(%rsi),%rsi - - addq 0(%rdi),%rax - addq 8(%rdi),%rbx - addq 16(%rdi),%rcx - addq 24(%rdi),%rdx - addq 32(%rdi),%r8 - addq 40(%rdi),%r9 - addq 48(%rdi),%r10 - addq 56(%rdi),%r11 - - cmpq 128+16(%rsp),%rsi - - movq %rax,0(%rdi) - 
movq %rbx,8(%rdi) - movq %rcx,16(%rdi) - movq %rdx,24(%rdi) - movq %r8,32(%rdi) - movq %r9,40(%rdi) - movq %r10,48(%rdi) - movq %r11,56(%rdi) - jb .Lloop - - movq 152(%rsp),%rsi -.cfi_def_cfa %rsi,8 - movq -48(%rsi),%r15 -.cfi_restore %r15 - movq -40(%rsi),%r14 -.cfi_restore %r14 - movq -32(%rsi),%r13 -.cfi_restore %r13 - movq -24(%rsi),%r12 -.cfi_restore %r12 - movq -16(%rsi),%rbp -.cfi_restore %rbp - movq -8(%rsi),%rbx -.cfi_restore %rbx - leaq (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Lepilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size sha512_block_data_order,.-sha512_block_data_order -.align 64 -.type K512,@object -K512: -.quad 0x428a2f98d728ae22,0x7137449123ef65cd -.quad 0x428a2f98d728ae22,0x7137449123ef65cd -.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc -.quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc -.quad 0x3956c25bf348b538,0x59f111f1b605d019 -.quad 0x3956c25bf348b538,0x59f111f1b605d019 -.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 -.quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118 -.quad 0xd807aa98a3030242,0x12835b0145706fbe -.quad 0xd807aa98a3030242,0x12835b0145706fbe -.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 -.quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2 -.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 -.quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1 -.quad 0x9bdc06a725c71235,0xc19bf174cf692694 -.quad 0x9bdc06a725c71235,0xc19bf174cf692694 -.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 -.quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3 -.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 -.quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65 -.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 -.quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483 -.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 -.quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5 -.quad 0x983e5152ee66dfab,0xa831c66d2db43210 -.quad 0x983e5152ee66dfab,0xa831c66d2db43210 -.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 -.quad 0xb00327c898fb213f,0xbf597fc7beef0ee4 -.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 -.quad 0xc6e00bf33da88fc2,0xd5a79147930aa725 -.quad 
0x06ca6351e003826f,0x142929670a0e6e70 -.quad 0x06ca6351e003826f,0x142929670a0e6e70 -.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 -.quad 0x27b70a8546d22ffc,0x2e1b21385c26c926 -.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df -.quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df -.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 -.quad 0x650a73548baf63de,0x766a0abb3c77b2a8 -.quad 0x81c2c92e47edaee6,0x92722c851482353b -.quad 0x81c2c92e47edaee6,0x92722c851482353b -.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 -.quad 0xa2bfe8a14cf10364,0xa81a664bbc423001 -.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 -.quad 0xc24b8b70d0f89791,0xc76c51a30654be30 -.quad 0xd192e819d6ef5218,0xd69906245565a910 -.quad 0xd192e819d6ef5218,0xd69906245565a910 -.quad 0xf40e35855771202a,0x106aa07032bbd1b8 -.quad 0xf40e35855771202a,0x106aa07032bbd1b8 -.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 -.quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53 -.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 -.quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8 -.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb -.quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb -.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 -.quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3 -.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 -.quad 0x748f82ee5defb2fc,0x78a5636f43172f60 -.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec -.quad 0x84c87814a1f0ab72,0x8cc702081a6439ec -.quad 0x90befffa23631e28,0xa4506cebde82bde9 -.quad 0x90befffa23631e28,0xa4506cebde82bde9 -.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b -.quad 0xbef9a3f7b2c67915,0xc67178f2e372532b -.quad 0xca273eceea26619c,0xd186b8c721c0c207 -.quad 0xca273eceea26619c,0xd186b8c721c0c207 -.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 -.quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178 -.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 -.quad 0x06f067aa72176fba,0x0a637dc5a2c898a6 -.quad 0x113f9804bef90dae,0x1b710b35131c471b -.quad 0x113f9804bef90dae,0x1b710b35131c471b -.quad 0x28db77f523047d84,0x32caab7b40c72493 -.quad 0x28db77f523047d84,0x32caab7b40c72493 -.quad 
0x3c9ebe0a15c9bebc,0x431d67c49c100d4c -.quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c -.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a -.quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a -.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 -.quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817 - -.quad 0x0001020304050607,0x08090a0b0c0d0e0f -.quad 0x0001020304050607,0x08090a0b0c0d0e0f -.byte 83,72,65,53,49,50,32,98,108,111,99,107,32,116,114,97,110,115,102,111,114,109,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.type sha512_block_data_order_avx,@function -.align 64 -sha512_block_data_order_avx: -.cfi_startproc -.Lavx_shortcut: - movq %rsp,%rax -.cfi_def_cfa_register %rax - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 - shlq $4,%rdx - subq $160,%rsp - leaq (%rsi,%rdx,8),%rdx - andq $-64,%rsp - movq %rdi,128+0(%rsp) - movq %rsi,128+8(%rsp) - movq %rdx,128+16(%rsp) - movq %rax,152(%rsp) -.cfi_escape 0x0f,0x06,0x77,0x98,0x01,0x06,0x23,0x08 -.Lprologue_avx: - - vzeroupper - movq 0(%rdi),%rax - movq 8(%rdi),%rbx - movq 16(%rdi),%rcx - movq 24(%rdi),%rdx - movq 32(%rdi),%r8 - movq 40(%rdi),%r9 - movq 48(%rdi),%r10 - movq 56(%rdi),%r11 - jmp .Lloop_avx -.align 16 -.Lloop_avx: - vmovdqa K512+1280(%rip),%xmm11 - vmovdqu 0(%rsi),%xmm0 - leaq K512+128(%rip),%rbp - vmovdqu 16(%rsi),%xmm1 - vmovdqu 32(%rsi),%xmm2 - vpshufb %xmm11,%xmm0,%xmm0 - vmovdqu 48(%rsi),%xmm3 - vpshufb %xmm11,%xmm1,%xmm1 - vmovdqu 64(%rsi),%xmm4 - vpshufb %xmm11,%xmm2,%xmm2 - vmovdqu 80(%rsi),%xmm5 - vpshufb %xmm11,%xmm3,%xmm3 - vmovdqu 96(%rsi),%xmm6 - vpshufb %xmm11,%xmm4,%xmm4 - vmovdqu 112(%rsi),%xmm7 - vpshufb %xmm11,%xmm5,%xmm5 - vpaddq -128(%rbp),%xmm0,%xmm8 - vpshufb %xmm11,%xmm6,%xmm6 - vpaddq -96(%rbp),%xmm1,%xmm9 - vpshufb %xmm11,%xmm7,%xmm7 - vpaddq 
-64(%rbp),%xmm2,%xmm10 - vpaddq -32(%rbp),%xmm3,%xmm11 - vmovdqa %xmm8,0(%rsp) - vpaddq 0(%rbp),%xmm4,%xmm8 - vmovdqa %xmm9,16(%rsp) - vpaddq 32(%rbp),%xmm5,%xmm9 - vmovdqa %xmm10,32(%rsp) - vpaddq 64(%rbp),%xmm6,%xmm10 - vmovdqa %xmm11,48(%rsp) - vpaddq 96(%rbp),%xmm7,%xmm11 - vmovdqa %xmm8,64(%rsp) - movq %rax,%r14 - vmovdqa %xmm9,80(%rsp) - movq %rbx,%rdi - vmovdqa %xmm10,96(%rsp) - xorq %rcx,%rdi - vmovdqa %xmm11,112(%rsp) - movq %r8,%r13 - jmp .Lavx_00_47 - -.align 16 -.Lavx_00_47: - addq $256,%rbp - vpalignr $8,%xmm0,%xmm1,%xmm8 - shrdq $23,%r13,%r13 - movq %r14,%rax - vpalignr $8,%xmm4,%xmm5,%xmm11 - movq %r9,%r12 - shrdq $5,%r14,%r14 - vpsrlq $1,%xmm8,%xmm10 - xorq %r8,%r13 - xorq %r10,%r12 - vpaddq %xmm11,%xmm0,%xmm0 - shrdq $4,%r13,%r13 - xorq %rax,%r14 - vpsrlq $7,%xmm8,%xmm11 - andq %r8,%r12 - xorq %r8,%r13 - vpsllq $56,%xmm8,%xmm9 - addq 0(%rsp),%r11 - movq %rax,%r15 - vpxor %xmm10,%xmm11,%xmm8 - xorq %r10,%r12 - shrdq $6,%r14,%r14 - vpsrlq $7,%xmm10,%xmm10 - xorq %rbx,%r15 - addq %r12,%r11 - vpxor %xmm9,%xmm8,%xmm8 - shrdq $14,%r13,%r13 - andq %r15,%rdi - vpsllq $7,%xmm9,%xmm9 - xorq %rax,%r14 - addq %r13,%r11 - vpxor %xmm10,%xmm8,%xmm8 - xorq %rbx,%rdi - shrdq $28,%r14,%r14 - vpsrlq $6,%xmm7,%xmm11 - addq %r11,%rdx - addq %rdi,%r11 - vpxor %xmm9,%xmm8,%xmm8 - movq %rdx,%r13 - addq %r11,%r14 - vpsllq $3,%xmm7,%xmm10 - shrdq $23,%r13,%r13 - movq %r14,%r11 - vpaddq %xmm8,%xmm0,%xmm0 - movq %r8,%r12 - shrdq $5,%r14,%r14 - vpsrlq $19,%xmm7,%xmm9 - xorq %rdx,%r13 - xorq %r9,%r12 - vpxor %xmm10,%xmm11,%xmm11 - shrdq $4,%r13,%r13 - xorq %r11,%r14 - vpsllq $42,%xmm10,%xmm10 - andq %rdx,%r12 - xorq %rdx,%r13 - vpxor %xmm9,%xmm11,%xmm11 - addq 8(%rsp),%r10 - movq %r11,%rdi - vpsrlq $42,%xmm9,%xmm9 - xorq %r9,%r12 - shrdq $6,%r14,%r14 - vpxor %xmm10,%xmm11,%xmm11 - xorq %rax,%rdi - addq %r12,%r10 - vpxor %xmm9,%xmm11,%xmm11 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - vpaddq %xmm11,%xmm0,%xmm0 - xorq %r11,%r14 - addq %r13,%r10 - vpaddq -128(%rbp),%xmm0,%xmm10 - xorq 
%rax,%r15 - shrdq $28,%r14,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - vmovdqa %xmm10,0(%rsp) - vpalignr $8,%xmm1,%xmm2,%xmm8 - shrdq $23,%r13,%r13 - movq %r14,%r10 - vpalignr $8,%xmm5,%xmm6,%xmm11 - movq %rdx,%r12 - shrdq $5,%r14,%r14 - vpsrlq $1,%xmm8,%xmm10 - xorq %rcx,%r13 - xorq %r8,%r12 - vpaddq %xmm11,%xmm1,%xmm1 - shrdq $4,%r13,%r13 - xorq %r10,%r14 - vpsrlq $7,%xmm8,%xmm11 - andq %rcx,%r12 - xorq %rcx,%r13 - vpsllq $56,%xmm8,%xmm9 - addq 16(%rsp),%r9 - movq %r10,%r15 - vpxor %xmm10,%xmm11,%xmm8 - xorq %r8,%r12 - shrdq $6,%r14,%r14 - vpsrlq $7,%xmm10,%xmm10 - xorq %r11,%r15 - addq %r12,%r9 - vpxor %xmm9,%xmm8,%xmm8 - shrdq $14,%r13,%r13 - andq %r15,%rdi - vpsllq $7,%xmm9,%xmm9 - xorq %r10,%r14 - addq %r13,%r9 - vpxor %xmm10,%xmm8,%xmm8 - xorq %r11,%rdi - shrdq $28,%r14,%r14 - vpsrlq $6,%xmm0,%xmm11 - addq %r9,%rbx - addq %rdi,%r9 - vpxor %xmm9,%xmm8,%xmm8 - movq %rbx,%r13 - addq %r9,%r14 - vpsllq $3,%xmm0,%xmm10 - shrdq $23,%r13,%r13 - movq %r14,%r9 - vpaddq %xmm8,%xmm1,%xmm1 - movq %rcx,%r12 - shrdq $5,%r14,%r14 - vpsrlq $19,%xmm0,%xmm9 - xorq %rbx,%r13 - xorq %rdx,%r12 - vpxor %xmm10,%xmm11,%xmm11 - shrdq $4,%r13,%r13 - xorq %r9,%r14 - vpsllq $42,%xmm10,%xmm10 - andq %rbx,%r12 - xorq %rbx,%r13 - vpxor %xmm9,%xmm11,%xmm11 - addq 24(%rsp),%r8 - movq %r9,%rdi - vpsrlq $42,%xmm9,%xmm9 - xorq %rdx,%r12 - shrdq $6,%r14,%r14 - vpxor %xmm10,%xmm11,%xmm11 - xorq %r10,%rdi - addq %r12,%r8 - vpxor %xmm9,%xmm11,%xmm11 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - vpaddq %xmm11,%xmm1,%xmm1 - xorq %r9,%r14 - addq %r13,%r8 - vpaddq -96(%rbp),%xmm1,%xmm10 - xorq %r10,%r15 - shrdq $28,%r14,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - vmovdqa %xmm10,16(%rsp) - vpalignr $8,%xmm2,%xmm3,%xmm8 - shrdq $23,%r13,%r13 - movq %r14,%r8 - vpalignr $8,%xmm6,%xmm7,%xmm11 - movq %rbx,%r12 - shrdq $5,%r14,%r14 - vpsrlq $1,%xmm8,%xmm10 - xorq %rax,%r13 - xorq %rcx,%r12 - vpaddq %xmm11,%xmm2,%xmm2 - shrdq $4,%r13,%r13 - xorq %r8,%r14 - 
vpsrlq $7,%xmm8,%xmm11 - andq %rax,%r12 - xorq %rax,%r13 - vpsllq $56,%xmm8,%xmm9 - addq 32(%rsp),%rdx - movq %r8,%r15 - vpxor %xmm10,%xmm11,%xmm8 - xorq %rcx,%r12 - shrdq $6,%r14,%r14 - vpsrlq $7,%xmm10,%xmm10 - xorq %r9,%r15 - addq %r12,%rdx - vpxor %xmm9,%xmm8,%xmm8 - shrdq $14,%r13,%r13 - andq %r15,%rdi - vpsllq $7,%xmm9,%xmm9 - xorq %r8,%r14 - addq %r13,%rdx - vpxor %xmm10,%xmm8,%xmm8 - xorq %r9,%rdi - shrdq $28,%r14,%r14 - vpsrlq $6,%xmm1,%xmm11 - addq %rdx,%r11 - addq %rdi,%rdx - vpxor %xmm9,%xmm8,%xmm8 - movq %r11,%r13 - addq %rdx,%r14 - vpsllq $3,%xmm1,%xmm10 - shrdq $23,%r13,%r13 - movq %r14,%rdx - vpaddq %xmm8,%xmm2,%xmm2 - movq %rax,%r12 - shrdq $5,%r14,%r14 - vpsrlq $19,%xmm1,%xmm9 - xorq %r11,%r13 - xorq %rbx,%r12 - vpxor %xmm10,%xmm11,%xmm11 - shrdq $4,%r13,%r13 - xorq %rdx,%r14 - vpsllq $42,%xmm10,%xmm10 - andq %r11,%r12 - xorq %r11,%r13 - vpxor %xmm9,%xmm11,%xmm11 - addq 40(%rsp),%rcx - movq %rdx,%rdi - vpsrlq $42,%xmm9,%xmm9 - xorq %rbx,%r12 - shrdq $6,%r14,%r14 - vpxor %xmm10,%xmm11,%xmm11 - xorq %r8,%rdi - addq %r12,%rcx - vpxor %xmm9,%xmm11,%xmm11 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - vpaddq %xmm11,%xmm2,%xmm2 - xorq %rdx,%r14 - addq %r13,%rcx - vpaddq -64(%rbp),%xmm2,%xmm10 - xorq %r8,%r15 - shrdq $28,%r14,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - vmovdqa %xmm10,32(%rsp) - vpalignr $8,%xmm3,%xmm4,%xmm8 - shrdq $23,%r13,%r13 - movq %r14,%rcx - vpalignr $8,%xmm7,%xmm0,%xmm11 - movq %r11,%r12 - shrdq $5,%r14,%r14 - vpsrlq $1,%xmm8,%xmm10 - xorq %r10,%r13 - xorq %rax,%r12 - vpaddq %xmm11,%xmm3,%xmm3 - shrdq $4,%r13,%r13 - xorq %rcx,%r14 - vpsrlq $7,%xmm8,%xmm11 - andq %r10,%r12 - xorq %r10,%r13 - vpsllq $56,%xmm8,%xmm9 - addq 48(%rsp),%rbx - movq %rcx,%r15 - vpxor %xmm10,%xmm11,%xmm8 - xorq %rax,%r12 - shrdq $6,%r14,%r14 - vpsrlq $7,%xmm10,%xmm10 - xorq %rdx,%r15 - addq %r12,%rbx - vpxor %xmm9,%xmm8,%xmm8 - shrdq $14,%r13,%r13 - andq %r15,%rdi - vpsllq $7,%xmm9,%xmm9 - xorq %rcx,%r14 - addq %r13,%rbx - vpxor 
%xmm10,%xmm8,%xmm8 - xorq %rdx,%rdi - shrdq $28,%r14,%r14 - vpsrlq $6,%xmm2,%xmm11 - addq %rbx,%r9 - addq %rdi,%rbx - vpxor %xmm9,%xmm8,%xmm8 - movq %r9,%r13 - addq %rbx,%r14 - vpsllq $3,%xmm2,%xmm10 - shrdq $23,%r13,%r13 - movq %r14,%rbx - vpaddq %xmm8,%xmm3,%xmm3 - movq %r10,%r12 - shrdq $5,%r14,%r14 - vpsrlq $19,%xmm2,%xmm9 - xorq %r9,%r13 - xorq %r11,%r12 - vpxor %xmm10,%xmm11,%xmm11 - shrdq $4,%r13,%r13 - xorq %rbx,%r14 - vpsllq $42,%xmm10,%xmm10 - andq %r9,%r12 - xorq %r9,%r13 - vpxor %xmm9,%xmm11,%xmm11 - addq 56(%rsp),%rax - movq %rbx,%rdi - vpsrlq $42,%xmm9,%xmm9 - xorq %r11,%r12 - shrdq $6,%r14,%r14 - vpxor %xmm10,%xmm11,%xmm11 - xorq %rcx,%rdi - addq %r12,%rax - vpxor %xmm9,%xmm11,%xmm11 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - vpaddq %xmm11,%xmm3,%xmm3 - xorq %rbx,%r14 - addq %r13,%rax - vpaddq -32(%rbp),%xmm3,%xmm10 - xorq %rcx,%r15 - shrdq $28,%r14,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - vmovdqa %xmm10,48(%rsp) - vpalignr $8,%xmm4,%xmm5,%xmm8 - shrdq $23,%r13,%r13 - movq %r14,%rax - vpalignr $8,%xmm0,%xmm1,%xmm11 - movq %r9,%r12 - shrdq $5,%r14,%r14 - vpsrlq $1,%xmm8,%xmm10 - xorq %r8,%r13 - xorq %r10,%r12 - vpaddq %xmm11,%xmm4,%xmm4 - shrdq $4,%r13,%r13 - xorq %rax,%r14 - vpsrlq $7,%xmm8,%xmm11 - andq %r8,%r12 - xorq %r8,%r13 - vpsllq $56,%xmm8,%xmm9 - addq 64(%rsp),%r11 - movq %rax,%r15 - vpxor %xmm10,%xmm11,%xmm8 - xorq %r10,%r12 - shrdq $6,%r14,%r14 - vpsrlq $7,%xmm10,%xmm10 - xorq %rbx,%r15 - addq %r12,%r11 - vpxor %xmm9,%xmm8,%xmm8 - shrdq $14,%r13,%r13 - andq %r15,%rdi - vpsllq $7,%xmm9,%xmm9 - xorq %rax,%r14 - addq %r13,%r11 - vpxor %xmm10,%xmm8,%xmm8 - xorq %rbx,%rdi - shrdq $28,%r14,%r14 - vpsrlq $6,%xmm3,%xmm11 - addq %r11,%rdx - addq %rdi,%r11 - vpxor %xmm9,%xmm8,%xmm8 - movq %rdx,%r13 - addq %r11,%r14 - vpsllq $3,%xmm3,%xmm10 - shrdq $23,%r13,%r13 - movq %r14,%r11 - vpaddq %xmm8,%xmm4,%xmm4 - movq %r8,%r12 - shrdq $5,%r14,%r14 - vpsrlq $19,%xmm3,%xmm9 - xorq %rdx,%r13 - xorq %r9,%r12 - vpxor 
%xmm10,%xmm11,%xmm11 - shrdq $4,%r13,%r13 - xorq %r11,%r14 - vpsllq $42,%xmm10,%xmm10 - andq %rdx,%r12 - xorq %rdx,%r13 - vpxor %xmm9,%xmm11,%xmm11 - addq 72(%rsp),%r10 - movq %r11,%rdi - vpsrlq $42,%xmm9,%xmm9 - xorq %r9,%r12 - shrdq $6,%r14,%r14 - vpxor %xmm10,%xmm11,%xmm11 - xorq %rax,%rdi - addq %r12,%r10 - vpxor %xmm9,%xmm11,%xmm11 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - vpaddq %xmm11,%xmm4,%xmm4 - xorq %r11,%r14 - addq %r13,%r10 - vpaddq 0(%rbp),%xmm4,%xmm10 - xorq %rax,%r15 - shrdq $28,%r14,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - vmovdqa %xmm10,64(%rsp) - vpalignr $8,%xmm5,%xmm6,%xmm8 - shrdq $23,%r13,%r13 - movq %r14,%r10 - vpalignr $8,%xmm1,%xmm2,%xmm11 - movq %rdx,%r12 - shrdq $5,%r14,%r14 - vpsrlq $1,%xmm8,%xmm10 - xorq %rcx,%r13 - xorq %r8,%r12 - vpaddq %xmm11,%xmm5,%xmm5 - shrdq $4,%r13,%r13 - xorq %r10,%r14 - vpsrlq $7,%xmm8,%xmm11 - andq %rcx,%r12 - xorq %rcx,%r13 - vpsllq $56,%xmm8,%xmm9 - addq 80(%rsp),%r9 - movq %r10,%r15 - vpxor %xmm10,%xmm11,%xmm8 - xorq %r8,%r12 - shrdq $6,%r14,%r14 - vpsrlq $7,%xmm10,%xmm10 - xorq %r11,%r15 - addq %r12,%r9 - vpxor %xmm9,%xmm8,%xmm8 - shrdq $14,%r13,%r13 - andq %r15,%rdi - vpsllq $7,%xmm9,%xmm9 - xorq %r10,%r14 - addq %r13,%r9 - vpxor %xmm10,%xmm8,%xmm8 - xorq %r11,%rdi - shrdq $28,%r14,%r14 - vpsrlq $6,%xmm4,%xmm11 - addq %r9,%rbx - addq %rdi,%r9 - vpxor %xmm9,%xmm8,%xmm8 - movq %rbx,%r13 - addq %r9,%r14 - vpsllq $3,%xmm4,%xmm10 - shrdq $23,%r13,%r13 - movq %r14,%r9 - vpaddq %xmm8,%xmm5,%xmm5 - movq %rcx,%r12 - shrdq $5,%r14,%r14 - vpsrlq $19,%xmm4,%xmm9 - xorq %rbx,%r13 - xorq %rdx,%r12 - vpxor %xmm10,%xmm11,%xmm11 - shrdq $4,%r13,%r13 - xorq %r9,%r14 - vpsllq $42,%xmm10,%xmm10 - andq %rbx,%r12 - xorq %rbx,%r13 - vpxor %xmm9,%xmm11,%xmm11 - addq 88(%rsp),%r8 - movq %r9,%rdi - vpsrlq $42,%xmm9,%xmm9 - xorq %rdx,%r12 - shrdq $6,%r14,%r14 - vpxor %xmm10,%xmm11,%xmm11 - xorq %r10,%rdi - addq %r12,%r8 - vpxor %xmm9,%xmm11,%xmm11 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - vpaddq 
%xmm11,%xmm5,%xmm5 - xorq %r9,%r14 - addq %r13,%r8 - vpaddq 32(%rbp),%xmm5,%xmm10 - xorq %r10,%r15 - shrdq $28,%r14,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - vmovdqa %xmm10,80(%rsp) - vpalignr $8,%xmm6,%xmm7,%xmm8 - shrdq $23,%r13,%r13 - movq %r14,%r8 - vpalignr $8,%xmm2,%xmm3,%xmm11 - movq %rbx,%r12 - shrdq $5,%r14,%r14 - vpsrlq $1,%xmm8,%xmm10 - xorq %rax,%r13 - xorq %rcx,%r12 - vpaddq %xmm11,%xmm6,%xmm6 - shrdq $4,%r13,%r13 - xorq %r8,%r14 - vpsrlq $7,%xmm8,%xmm11 - andq %rax,%r12 - xorq %rax,%r13 - vpsllq $56,%xmm8,%xmm9 - addq 96(%rsp),%rdx - movq %r8,%r15 - vpxor %xmm10,%xmm11,%xmm8 - xorq %rcx,%r12 - shrdq $6,%r14,%r14 - vpsrlq $7,%xmm10,%xmm10 - xorq %r9,%r15 - addq %r12,%rdx - vpxor %xmm9,%xmm8,%xmm8 - shrdq $14,%r13,%r13 - andq %r15,%rdi - vpsllq $7,%xmm9,%xmm9 - xorq %r8,%r14 - addq %r13,%rdx - vpxor %xmm10,%xmm8,%xmm8 - xorq %r9,%rdi - shrdq $28,%r14,%r14 - vpsrlq $6,%xmm5,%xmm11 - addq %rdx,%r11 - addq %rdi,%rdx - vpxor %xmm9,%xmm8,%xmm8 - movq %r11,%r13 - addq %rdx,%r14 - vpsllq $3,%xmm5,%xmm10 - shrdq $23,%r13,%r13 - movq %r14,%rdx - vpaddq %xmm8,%xmm6,%xmm6 - movq %rax,%r12 - shrdq $5,%r14,%r14 - vpsrlq $19,%xmm5,%xmm9 - xorq %r11,%r13 - xorq %rbx,%r12 - vpxor %xmm10,%xmm11,%xmm11 - shrdq $4,%r13,%r13 - xorq %rdx,%r14 - vpsllq $42,%xmm10,%xmm10 - andq %r11,%r12 - xorq %r11,%r13 - vpxor %xmm9,%xmm11,%xmm11 - addq 104(%rsp),%rcx - movq %rdx,%rdi - vpsrlq $42,%xmm9,%xmm9 - xorq %rbx,%r12 - shrdq $6,%r14,%r14 - vpxor %xmm10,%xmm11,%xmm11 - xorq %r8,%rdi - addq %r12,%rcx - vpxor %xmm9,%xmm11,%xmm11 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - vpaddq %xmm11,%xmm6,%xmm6 - xorq %rdx,%r14 - addq %r13,%rcx - vpaddq 64(%rbp),%xmm6,%xmm10 - xorq %r8,%r15 - shrdq $28,%r14,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - vmovdqa %xmm10,96(%rsp) - vpalignr $8,%xmm7,%xmm0,%xmm8 - shrdq $23,%r13,%r13 - movq %r14,%rcx - vpalignr $8,%xmm3,%xmm4,%xmm11 - movq %r11,%r12 - shrdq $5,%r14,%r14 - vpsrlq $1,%xmm8,%xmm10 - 
xorq %r10,%r13 - xorq %rax,%r12 - vpaddq %xmm11,%xmm7,%xmm7 - shrdq $4,%r13,%r13 - xorq %rcx,%r14 - vpsrlq $7,%xmm8,%xmm11 - andq %r10,%r12 - xorq %r10,%r13 - vpsllq $56,%xmm8,%xmm9 - addq 112(%rsp),%rbx - movq %rcx,%r15 - vpxor %xmm10,%xmm11,%xmm8 - xorq %rax,%r12 - shrdq $6,%r14,%r14 - vpsrlq $7,%xmm10,%xmm10 - xorq %rdx,%r15 - addq %r12,%rbx - vpxor %xmm9,%xmm8,%xmm8 - shrdq $14,%r13,%r13 - andq %r15,%rdi - vpsllq $7,%xmm9,%xmm9 - xorq %rcx,%r14 - addq %r13,%rbx - vpxor %xmm10,%xmm8,%xmm8 - xorq %rdx,%rdi - shrdq $28,%r14,%r14 - vpsrlq $6,%xmm6,%xmm11 - addq %rbx,%r9 - addq %rdi,%rbx - vpxor %xmm9,%xmm8,%xmm8 - movq %r9,%r13 - addq %rbx,%r14 - vpsllq $3,%xmm6,%xmm10 - shrdq $23,%r13,%r13 - movq %r14,%rbx - vpaddq %xmm8,%xmm7,%xmm7 - movq %r10,%r12 - shrdq $5,%r14,%r14 - vpsrlq $19,%xmm6,%xmm9 - xorq %r9,%r13 - xorq %r11,%r12 - vpxor %xmm10,%xmm11,%xmm11 - shrdq $4,%r13,%r13 - xorq %rbx,%r14 - vpsllq $42,%xmm10,%xmm10 - andq %r9,%r12 - xorq %r9,%r13 - vpxor %xmm9,%xmm11,%xmm11 - addq 120(%rsp),%rax - movq %rbx,%rdi - vpsrlq $42,%xmm9,%xmm9 - xorq %r11,%r12 - shrdq $6,%r14,%r14 - vpxor %xmm10,%xmm11,%xmm11 - xorq %rcx,%rdi - addq %r12,%rax - vpxor %xmm9,%xmm11,%xmm11 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - vpaddq %xmm11,%xmm7,%xmm7 - xorq %rbx,%r14 - addq %r13,%rax - vpaddq 96(%rbp),%xmm7,%xmm10 - xorq %rcx,%r15 - shrdq $28,%r14,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - vmovdqa %xmm10,112(%rsp) - cmpb $0,135(%rbp) - jne .Lavx_00_47 - shrdq $23,%r13,%r13 - movq %r14,%rax - movq %r9,%r12 - shrdq $5,%r14,%r14 - xorq %r8,%r13 - xorq %r10,%r12 - shrdq $4,%r13,%r13 - xorq %rax,%r14 - andq %r8,%r12 - xorq %r8,%r13 - addq 0(%rsp),%r11 - movq %rax,%r15 - xorq %r10,%r12 - shrdq $6,%r14,%r14 - xorq %rbx,%r15 - addq %r12,%r11 - shrdq $14,%r13,%r13 - andq %r15,%rdi - xorq %rax,%r14 - addq %r13,%r11 - xorq %rbx,%rdi - shrdq $28,%r14,%r14 - addq %r11,%rdx - addq %rdi,%r11 - movq %rdx,%r13 - addq %r11,%r14 - shrdq $23,%r13,%r13 - movq %r14,%r11 - 
movq %r8,%r12 - shrdq $5,%r14,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - shrdq $4,%r13,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - addq 8(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - shrdq $6,%r14,%r14 - xorq %rax,%rdi - addq %r12,%r10 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - shrdq $28,%r14,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - shrdq $23,%r13,%r13 - movq %r14,%r10 - movq %rdx,%r12 - shrdq $5,%r14,%r14 - xorq %rcx,%r13 - xorq %r8,%r12 - shrdq $4,%r13,%r13 - xorq %r10,%r14 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 16(%rsp),%r9 - movq %r10,%r15 - xorq %r8,%r12 - shrdq $6,%r14,%r14 - xorq %r11,%r15 - addq %r12,%r9 - shrdq $14,%r13,%r13 - andq %r15,%rdi - xorq %r10,%r14 - addq %r13,%r9 - xorq %r11,%rdi - shrdq $28,%r14,%r14 - addq %r9,%rbx - addq %rdi,%r9 - movq %rbx,%r13 - addq %r9,%r14 - shrdq $23,%r13,%r13 - movq %r14,%r9 - movq %rcx,%r12 - shrdq $5,%r14,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - shrdq $4,%r13,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - addq 24(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - shrdq $6,%r14,%r14 - xorq %r10,%rdi - addq %r12,%r8 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - shrdq $28,%r14,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - shrdq $23,%r13,%r13 - movq %r14,%r8 - movq %rbx,%r12 - shrdq $5,%r14,%r14 - xorq %rax,%r13 - xorq %rcx,%r12 - shrdq $4,%r13,%r13 - xorq %r8,%r14 - andq %rax,%r12 - xorq %rax,%r13 - addq 32(%rsp),%rdx - movq %r8,%r15 - xorq %rcx,%r12 - shrdq $6,%r14,%r14 - xorq %r9,%r15 - addq %r12,%rdx - shrdq $14,%r13,%r13 - andq %r15,%rdi - xorq %r8,%r14 - addq %r13,%rdx - xorq %r9,%rdi - shrdq $28,%r14,%r14 - addq %rdx,%r11 - addq %rdi,%rdx - movq %r11,%r13 - addq %rdx,%r14 - shrdq $23,%r13,%r13 - movq %r14,%rdx - movq %rax,%r12 - shrdq $5,%r14,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - shrdq $4,%r13,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq 
%r11,%r13 - addq 40(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - shrdq $6,%r14,%r14 - xorq %r8,%rdi - addq %r12,%rcx - shrdq $14,%r13,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - shrdq $28,%r14,%r14 - addq %rcx,%r10 - addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - shrdq $23,%r13,%r13 - movq %r14,%rcx - movq %r11,%r12 - shrdq $5,%r14,%r14 - xorq %r10,%r13 - xorq %rax,%r12 - shrdq $4,%r13,%r13 - xorq %rcx,%r14 - andq %r10,%r12 - xorq %r10,%r13 - addq 48(%rsp),%rbx - movq %rcx,%r15 - xorq %rax,%r12 - shrdq $6,%r14,%r14 - xorq %rdx,%r15 - addq %r12,%rbx - shrdq $14,%r13,%r13 - andq %r15,%rdi - xorq %rcx,%r14 - addq %r13,%rbx - xorq %rdx,%rdi - shrdq $28,%r14,%r14 - addq %rbx,%r9 - addq %rdi,%rbx - movq %r9,%r13 - addq %rbx,%r14 - shrdq $23,%r13,%r13 - movq %r14,%rbx - movq %r10,%r12 - shrdq $5,%r14,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - shrdq $4,%r13,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - addq 56(%rsp),%rax - movq %rbx,%rdi - xorq %r11,%r12 - shrdq $6,%r14,%r14 - xorq %rcx,%rdi - addq %r12,%rax - shrdq $14,%r13,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - shrdq $28,%r14,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - shrdq $23,%r13,%r13 - movq %r14,%rax - movq %r9,%r12 - shrdq $5,%r14,%r14 - xorq %r8,%r13 - xorq %r10,%r12 - shrdq $4,%r13,%r13 - xorq %rax,%r14 - andq %r8,%r12 - xorq %r8,%r13 - addq 64(%rsp),%r11 - movq %rax,%r15 - xorq %r10,%r12 - shrdq $6,%r14,%r14 - xorq %rbx,%r15 - addq %r12,%r11 - shrdq $14,%r13,%r13 - andq %r15,%rdi - xorq %rax,%r14 - addq %r13,%r11 - xorq %rbx,%rdi - shrdq $28,%r14,%r14 - addq %r11,%rdx - addq %rdi,%r11 - movq %rdx,%r13 - addq %r11,%r14 - shrdq $23,%r13,%r13 - movq %r14,%r11 - movq %r8,%r12 - shrdq $5,%r14,%r14 - xorq %rdx,%r13 - xorq %r9,%r12 - shrdq $4,%r13,%r13 - xorq %r11,%r14 - andq %rdx,%r12 - xorq %rdx,%r13 - addq 72(%rsp),%r10 - movq %r11,%rdi - xorq %r9,%r12 - shrdq $6,%r14,%r14 - xorq %rax,%rdi - addq %r12,%r10 - 
shrdq $14,%r13,%r13 - andq %rdi,%r15 - xorq %r11,%r14 - addq %r13,%r10 - xorq %rax,%r15 - shrdq $28,%r14,%r14 - addq %r10,%rcx - addq %r15,%r10 - movq %rcx,%r13 - addq %r10,%r14 - shrdq $23,%r13,%r13 - movq %r14,%r10 - movq %rdx,%r12 - shrdq $5,%r14,%r14 - xorq %rcx,%r13 - xorq %r8,%r12 - shrdq $4,%r13,%r13 - xorq %r10,%r14 - andq %rcx,%r12 - xorq %rcx,%r13 - addq 80(%rsp),%r9 - movq %r10,%r15 - xorq %r8,%r12 - shrdq $6,%r14,%r14 - xorq %r11,%r15 - addq %r12,%r9 - shrdq $14,%r13,%r13 - andq %r15,%rdi - xorq %r10,%r14 - addq %r13,%r9 - xorq %r11,%rdi - shrdq $28,%r14,%r14 - addq %r9,%rbx - addq %rdi,%r9 - movq %rbx,%r13 - addq %r9,%r14 - shrdq $23,%r13,%r13 - movq %r14,%r9 - movq %rcx,%r12 - shrdq $5,%r14,%r14 - xorq %rbx,%r13 - xorq %rdx,%r12 - shrdq $4,%r13,%r13 - xorq %r9,%r14 - andq %rbx,%r12 - xorq %rbx,%r13 - addq 88(%rsp),%r8 - movq %r9,%rdi - xorq %rdx,%r12 - shrdq $6,%r14,%r14 - xorq %r10,%rdi - addq %r12,%r8 - shrdq $14,%r13,%r13 - andq %rdi,%r15 - xorq %r9,%r14 - addq %r13,%r8 - xorq %r10,%r15 - shrdq $28,%r14,%r14 - addq %r8,%rax - addq %r15,%r8 - movq %rax,%r13 - addq %r8,%r14 - shrdq $23,%r13,%r13 - movq %r14,%r8 - movq %rbx,%r12 - shrdq $5,%r14,%r14 - xorq %rax,%r13 - xorq %rcx,%r12 - shrdq $4,%r13,%r13 - xorq %r8,%r14 - andq %rax,%r12 - xorq %rax,%r13 - addq 96(%rsp),%rdx - movq %r8,%r15 - xorq %rcx,%r12 - shrdq $6,%r14,%r14 - xorq %r9,%r15 - addq %r12,%rdx - shrdq $14,%r13,%r13 - andq %r15,%rdi - xorq %r8,%r14 - addq %r13,%rdx - xorq %r9,%rdi - shrdq $28,%r14,%r14 - addq %rdx,%r11 - addq %rdi,%rdx - movq %r11,%r13 - addq %rdx,%r14 - shrdq $23,%r13,%r13 - movq %r14,%rdx - movq %rax,%r12 - shrdq $5,%r14,%r14 - xorq %r11,%r13 - xorq %rbx,%r12 - shrdq $4,%r13,%r13 - xorq %rdx,%r14 - andq %r11,%r12 - xorq %r11,%r13 - addq 104(%rsp),%rcx - movq %rdx,%rdi - xorq %rbx,%r12 - shrdq $6,%r14,%r14 - xorq %r8,%rdi - addq %r12,%rcx - shrdq $14,%r13,%r13 - andq %rdi,%r15 - xorq %rdx,%r14 - addq %r13,%rcx - xorq %r8,%r15 - shrdq $28,%r14,%r14 - addq %rcx,%r10 - 
addq %r15,%rcx - movq %r10,%r13 - addq %rcx,%r14 - shrdq $23,%r13,%r13 - movq %r14,%rcx - movq %r11,%r12 - shrdq $5,%r14,%r14 - xorq %r10,%r13 - xorq %rax,%r12 - shrdq $4,%r13,%r13 - xorq %rcx,%r14 - andq %r10,%r12 - xorq %r10,%r13 - addq 112(%rsp),%rbx - movq %rcx,%r15 - xorq %rax,%r12 - shrdq $6,%r14,%r14 - xorq %rdx,%r15 - addq %r12,%rbx - shrdq $14,%r13,%r13 - andq %r15,%rdi - xorq %rcx,%r14 - addq %r13,%rbx - xorq %rdx,%rdi - shrdq $28,%r14,%r14 - addq %rbx,%r9 - addq %rdi,%rbx - movq %r9,%r13 - addq %rbx,%r14 - shrdq $23,%r13,%r13 - movq %r14,%rbx - movq %r10,%r12 - shrdq $5,%r14,%r14 - xorq %r9,%r13 - xorq %r11,%r12 - shrdq $4,%r13,%r13 - xorq %rbx,%r14 - andq %r9,%r12 - xorq %r9,%r13 - addq 120(%rsp),%rax - movq %rbx,%rdi - xorq %r11,%r12 - shrdq $6,%r14,%r14 - xorq %rcx,%rdi - addq %r12,%rax - shrdq $14,%r13,%r13 - andq %rdi,%r15 - xorq %rbx,%r14 - addq %r13,%rax - xorq %rcx,%r15 - shrdq $28,%r14,%r14 - addq %rax,%r8 - addq %r15,%rax - movq %r8,%r13 - addq %rax,%r14 - movq 128+0(%rsp),%rdi - movq %r14,%rax - - addq 0(%rdi),%rax - leaq 128(%rsi),%rsi - addq 8(%rdi),%rbx - addq 16(%rdi),%rcx - addq 24(%rdi),%rdx - addq 32(%rdi),%r8 - addq 40(%rdi),%r9 - addq 48(%rdi),%r10 - addq 56(%rdi),%r11 - - cmpq 128+16(%rsp),%rsi - - movq %rax,0(%rdi) - movq %rbx,8(%rdi) - movq %rcx,16(%rdi) - movq %rdx,24(%rdi) - movq %r8,32(%rdi) - movq %r9,40(%rdi) - movq %r10,48(%rdi) - movq %r11,56(%rdi) - jb .Lloop_avx - - movq 152(%rsp),%rsi -.cfi_def_cfa %rsi,8 - vzeroupper - movq -48(%rsi),%r15 -.cfi_restore %r15 - movq -40(%rsi),%r14 -.cfi_restore %r14 - movq -32(%rsi),%r13 -.cfi_restore %r13 - movq -24(%rsi),%r12 -.cfi_restore %r12 - movq -16(%rsi),%rbp -.cfi_restore %rbp - movq -8(%rsi),%rbx -.cfi_restore %rbx - leaq (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Lepilogue_avx: - .byte 0xf3,0xc3 -.cfi_endproc -.size sha512_block_data_order_avx,.-sha512_block_data_order_avx -#endif -.section .note.GNU-stack,"",@progbits diff --git 
a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/vpaes-x86_64.S b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/vpaes-x86_64.S deleted file mode 100644 index b651713f..00000000 --- a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/vpaes-x86_64.S +++ /dev/null @@ -1,1133 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text - - - - - - - - - - - - - - - - -.type _vpaes_encrypt_core,@function -.align 16 -_vpaes_encrypt_core: -.cfi_startproc - movq %rdx,%r9 - movq $16,%r11 - movl 240(%rdx),%eax - movdqa %xmm9,%xmm1 - movdqa .Lk_ipt(%rip),%xmm2 - pandn %xmm0,%xmm1 - movdqu (%r9),%xmm5 - psrld $4,%xmm1 - pand %xmm9,%xmm0 -.byte 102,15,56,0,208 - movdqa .Lk_ipt+16(%rip),%xmm0 -.byte 102,15,56,0,193 - pxor %xmm5,%xmm2 - addq $16,%r9 - pxor %xmm2,%xmm0 - leaq .Lk_mc_backward(%rip),%r10 - jmp .Lenc_entry - -.align 16 -.Lenc_loop: - - movdqa %xmm13,%xmm4 - movdqa %xmm12,%xmm0 -.byte 102,15,56,0,226 -.byte 102,15,56,0,195 - pxor %xmm5,%xmm4 - movdqa %xmm15,%xmm5 - pxor %xmm4,%xmm0 - movdqa -64(%r11,%r10,1),%xmm1 -.byte 102,15,56,0,234 - movdqa (%r11,%r10,1),%xmm4 - movdqa %xmm14,%xmm2 -.byte 102,15,56,0,211 - movdqa %xmm0,%xmm3 - pxor %xmm5,%xmm2 -.byte 102,15,56,0,193 - addq $16,%r9 - pxor %xmm2,%xmm0 -.byte 102,15,56,0,220 - addq $16,%r11 - pxor %xmm0,%xmm3 -.byte 102,15,56,0,193 - andq $0x30,%r11 - subq $1,%rax - pxor %xmm3,%xmm0 - -.Lenc_entry: - - movdqa %xmm9,%xmm1 - movdqa %xmm11,%xmm5 - pandn %xmm0,%xmm1 - psrld $4,%xmm1 - pand %xmm9,%xmm0 -.byte 102,15,56,0,232 - movdqa %xmm10,%xmm3 - pxor %xmm1,%xmm0 -.byte 102,15,56,0,217 - movdqa %xmm10,%xmm4 - pxor %xmm5,%xmm3 -.byte 102,15,56,0,224 - movdqa %xmm10,%xmm2 - pxor %xmm5,%xmm4 -.byte 
102,15,56,0,211 - movdqa %xmm10,%xmm3 - pxor %xmm0,%xmm2 -.byte 102,15,56,0,220 - movdqu (%r9),%xmm5 - pxor %xmm1,%xmm3 - jnz .Lenc_loop - - - movdqa -96(%r10),%xmm4 - movdqa -80(%r10),%xmm0 -.byte 102,15,56,0,226 - pxor %xmm5,%xmm4 -.byte 102,15,56,0,195 - movdqa 64(%r11,%r10,1),%xmm1 - pxor %xmm4,%xmm0 -.byte 102,15,56,0,193 - .byte 0xf3,0xc3 -.cfi_endproc -.size _vpaes_encrypt_core,.-_vpaes_encrypt_core - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -.type _vpaes_encrypt_core_2x,@function -.align 16 -_vpaes_encrypt_core_2x: -.cfi_startproc - movq %rdx,%r9 - movq $16,%r11 - movl 240(%rdx),%eax - movdqa %xmm9,%xmm1 - movdqa %xmm9,%xmm7 - movdqa .Lk_ipt(%rip),%xmm2 - movdqa %xmm2,%xmm8 - pandn %xmm0,%xmm1 - pandn %xmm6,%xmm7 - movdqu (%r9),%xmm5 - - psrld $4,%xmm1 - psrld $4,%xmm7 - pand %xmm9,%xmm0 - pand %xmm9,%xmm6 -.byte 102,15,56,0,208 -.byte 102,68,15,56,0,198 - movdqa .Lk_ipt+16(%rip),%xmm0 - movdqa %xmm0,%xmm6 -.byte 102,15,56,0,193 -.byte 102,15,56,0,247 - pxor %xmm5,%xmm2 - pxor %xmm5,%xmm8 - addq $16,%r9 - pxor %xmm2,%xmm0 - pxor %xmm8,%xmm6 - leaq .Lk_mc_backward(%rip),%r10 - jmp .Lenc2x_entry - -.align 16 -.Lenc2x_loop: - - movdqa .Lk_sb1(%rip),%xmm4 - movdqa .Lk_sb1+16(%rip),%xmm0 - movdqa %xmm4,%xmm12 - movdqa %xmm0,%xmm6 -.byte 102,15,56,0,226 -.byte 102,69,15,56,0,224 -.byte 102,15,56,0,195 -.byte 102,65,15,56,0,243 - pxor %xmm5,%xmm4 - pxor %xmm5,%xmm12 - movdqa .Lk_sb2(%rip),%xmm5 - movdqa %xmm5,%xmm13 - pxor %xmm4,%xmm0 - pxor %xmm12,%xmm6 - movdqa -64(%r11,%r10,1),%xmm1 - -.byte 102,15,56,0,234 -.byte 102,69,15,56,0,232 - movdqa (%r11,%r10,1),%xmm4 - - movdqa .Lk_sb2+16(%rip),%xmm2 - movdqa %xmm2,%xmm8 -.byte 102,15,56,0,211 -.byte 102,69,15,56,0,195 - movdqa %xmm0,%xmm3 - movdqa %xmm6,%xmm11 - pxor %xmm5,%xmm2 - pxor %xmm13,%xmm8 -.byte 102,15,56,0,193 -.byte 102,15,56,0,241 - addq $16,%r9 - pxor %xmm2,%xmm0 - pxor %xmm8,%xmm6 -.byte 102,15,56,0,220 -.byte 102,68,15,56,0,220 - addq $16,%r11 - pxor %xmm0,%xmm3 - pxor %xmm6,%xmm11 
-.byte 102,15,56,0,193 -.byte 102,15,56,0,241 - andq $0x30,%r11 - subq $1,%rax - pxor %xmm3,%xmm0 - pxor %xmm11,%xmm6 - -.Lenc2x_entry: - - movdqa %xmm9,%xmm1 - movdqa %xmm9,%xmm7 - movdqa .Lk_inv+16(%rip),%xmm5 - movdqa %xmm5,%xmm13 - pandn %xmm0,%xmm1 - pandn %xmm6,%xmm7 - psrld $4,%xmm1 - psrld $4,%xmm7 - pand %xmm9,%xmm0 - pand %xmm9,%xmm6 -.byte 102,15,56,0,232 -.byte 102,68,15,56,0,238 - movdqa %xmm10,%xmm3 - movdqa %xmm10,%xmm11 - pxor %xmm1,%xmm0 - pxor %xmm7,%xmm6 -.byte 102,15,56,0,217 -.byte 102,68,15,56,0,223 - movdqa %xmm10,%xmm4 - movdqa %xmm10,%xmm12 - pxor %xmm5,%xmm3 - pxor %xmm13,%xmm11 -.byte 102,15,56,0,224 -.byte 102,68,15,56,0,230 - movdqa %xmm10,%xmm2 - movdqa %xmm10,%xmm8 - pxor %xmm5,%xmm4 - pxor %xmm13,%xmm12 -.byte 102,15,56,0,211 -.byte 102,69,15,56,0,195 - movdqa %xmm10,%xmm3 - movdqa %xmm10,%xmm11 - pxor %xmm0,%xmm2 - pxor %xmm6,%xmm8 -.byte 102,15,56,0,220 -.byte 102,69,15,56,0,220 - movdqu (%r9),%xmm5 - - pxor %xmm1,%xmm3 - pxor %xmm7,%xmm11 - jnz .Lenc2x_loop - - - movdqa -96(%r10),%xmm4 - movdqa -80(%r10),%xmm0 - movdqa %xmm4,%xmm12 - movdqa %xmm0,%xmm6 -.byte 102,15,56,0,226 -.byte 102,69,15,56,0,224 - pxor %xmm5,%xmm4 - pxor %xmm5,%xmm12 -.byte 102,15,56,0,195 -.byte 102,65,15,56,0,243 - movdqa 64(%r11,%r10,1),%xmm1 - - pxor %xmm4,%xmm0 - pxor %xmm12,%xmm6 -.byte 102,15,56,0,193 -.byte 102,15,56,0,241 - .byte 0xf3,0xc3 -.cfi_endproc -.size _vpaes_encrypt_core_2x,.-_vpaes_encrypt_core_2x - - - - - - -.type _vpaes_decrypt_core,@function -.align 16 -_vpaes_decrypt_core: -.cfi_startproc - movq %rdx,%r9 - movl 240(%rdx),%eax - movdqa %xmm9,%xmm1 - movdqa .Lk_dipt(%rip),%xmm2 - pandn %xmm0,%xmm1 - movq %rax,%r11 - psrld $4,%xmm1 - movdqu (%r9),%xmm5 - shlq $4,%r11 - pand %xmm9,%xmm0 -.byte 102,15,56,0,208 - movdqa .Lk_dipt+16(%rip),%xmm0 - xorq $0x30,%r11 - leaq .Lk_dsbd(%rip),%r10 -.byte 102,15,56,0,193 - andq $0x30,%r11 - pxor %xmm5,%xmm2 - movdqa .Lk_mc_forward+48(%rip),%xmm5 - pxor %xmm2,%xmm0 - addq $16,%r9 - addq %r10,%r11 - jmp 
.Ldec_entry - -.align 16 -.Ldec_loop: - - - - movdqa -32(%r10),%xmm4 - movdqa -16(%r10),%xmm1 -.byte 102,15,56,0,226 -.byte 102,15,56,0,203 - pxor %xmm4,%xmm0 - movdqa 0(%r10),%xmm4 - pxor %xmm1,%xmm0 - movdqa 16(%r10),%xmm1 - -.byte 102,15,56,0,226 -.byte 102,15,56,0,197 -.byte 102,15,56,0,203 - pxor %xmm4,%xmm0 - movdqa 32(%r10),%xmm4 - pxor %xmm1,%xmm0 - movdqa 48(%r10),%xmm1 - -.byte 102,15,56,0,226 -.byte 102,15,56,0,197 -.byte 102,15,56,0,203 - pxor %xmm4,%xmm0 - movdqa 64(%r10),%xmm4 - pxor %xmm1,%xmm0 - movdqa 80(%r10),%xmm1 - -.byte 102,15,56,0,226 -.byte 102,15,56,0,197 -.byte 102,15,56,0,203 - pxor %xmm4,%xmm0 - addq $16,%r9 -.byte 102,15,58,15,237,12 - pxor %xmm1,%xmm0 - subq $1,%rax - -.Ldec_entry: - - movdqa %xmm9,%xmm1 - pandn %xmm0,%xmm1 - movdqa %xmm11,%xmm2 - psrld $4,%xmm1 - pand %xmm9,%xmm0 -.byte 102,15,56,0,208 - movdqa %xmm10,%xmm3 - pxor %xmm1,%xmm0 -.byte 102,15,56,0,217 - movdqa %xmm10,%xmm4 - pxor %xmm2,%xmm3 -.byte 102,15,56,0,224 - pxor %xmm2,%xmm4 - movdqa %xmm10,%xmm2 -.byte 102,15,56,0,211 - movdqa %xmm10,%xmm3 - pxor %xmm0,%xmm2 -.byte 102,15,56,0,220 - movdqu (%r9),%xmm0 - pxor %xmm1,%xmm3 - jnz .Ldec_loop - - - movdqa 96(%r10),%xmm4 -.byte 102,15,56,0,226 - pxor %xmm0,%xmm4 - movdqa 112(%r10),%xmm0 - movdqa -352(%r11),%xmm2 -.byte 102,15,56,0,195 - pxor %xmm4,%xmm0 -.byte 102,15,56,0,194 - .byte 0xf3,0xc3 -.cfi_endproc -.size _vpaes_decrypt_core,.-_vpaes_decrypt_core - - - - - - -.type _vpaes_schedule_core,@function -.align 16 -_vpaes_schedule_core: -.cfi_startproc - - - - - - call _vpaes_preheat - movdqa .Lk_rcon(%rip),%xmm8 - movdqu (%rdi),%xmm0 - - - movdqa %xmm0,%xmm3 - leaq .Lk_ipt(%rip),%r11 - call _vpaes_schedule_transform - movdqa %xmm0,%xmm7 - - leaq .Lk_sr(%rip),%r10 - testq %rcx,%rcx - jnz .Lschedule_am_decrypting - - - movdqu %xmm0,(%rdx) - jmp .Lschedule_go - -.Lschedule_am_decrypting: - - movdqa (%r8,%r10,1),%xmm1 -.byte 102,15,56,0,217 - movdqu %xmm3,(%rdx) - xorq $0x30,%r8 - -.Lschedule_go: - cmpl $192,%esi - ja 
.Lschedule_256 - je .Lschedule_192 - - - - - - - - - - -.Lschedule_128: - movl $10,%esi - -.Loop_schedule_128: - call _vpaes_schedule_round - decq %rsi - jz .Lschedule_mangle_last - call _vpaes_schedule_mangle - jmp .Loop_schedule_128 - - - - - - - - - - - - - - - - -.align 16 -.Lschedule_192: - movdqu 8(%rdi),%xmm0 - call _vpaes_schedule_transform - movdqa %xmm0,%xmm6 - pxor %xmm4,%xmm4 - movhlps %xmm4,%xmm6 - movl $4,%esi - -.Loop_schedule_192: - call _vpaes_schedule_round -.byte 102,15,58,15,198,8 - call _vpaes_schedule_mangle - call _vpaes_schedule_192_smear - call _vpaes_schedule_mangle - call _vpaes_schedule_round - decq %rsi - jz .Lschedule_mangle_last - call _vpaes_schedule_mangle - call _vpaes_schedule_192_smear - jmp .Loop_schedule_192 - - - - - - - - - - - -.align 16 -.Lschedule_256: - movdqu 16(%rdi),%xmm0 - call _vpaes_schedule_transform - movl $7,%esi - -.Loop_schedule_256: - call _vpaes_schedule_mangle - movdqa %xmm0,%xmm6 - - - call _vpaes_schedule_round - decq %rsi - jz .Lschedule_mangle_last - call _vpaes_schedule_mangle - - - pshufd $0xFF,%xmm0,%xmm0 - movdqa %xmm7,%xmm5 - movdqa %xmm6,%xmm7 - call _vpaes_schedule_low_round - movdqa %xmm5,%xmm7 - - jmp .Loop_schedule_256 - - - - - - - - - - - - -.align 16 -.Lschedule_mangle_last: - - leaq .Lk_deskew(%rip),%r11 - testq %rcx,%rcx - jnz .Lschedule_mangle_last_dec - - - movdqa (%r8,%r10,1),%xmm1 -.byte 102,15,56,0,193 - leaq .Lk_opt(%rip),%r11 - addq $32,%rdx - -.Lschedule_mangle_last_dec: - addq $-16,%rdx - pxor .Lk_s63(%rip),%xmm0 - call _vpaes_schedule_transform - movdqu %xmm0,(%rdx) - - - pxor %xmm0,%xmm0 - pxor %xmm1,%xmm1 - pxor %xmm2,%xmm2 - pxor %xmm3,%xmm3 - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - pxor %xmm6,%xmm6 - pxor %xmm7,%xmm7 - .byte 0xf3,0xc3 -.cfi_endproc -.size _vpaes_schedule_core,.-_vpaes_schedule_core - - - - - - - - - - - - - - - -.type _vpaes_schedule_192_smear,@function -.align 16 -_vpaes_schedule_192_smear: -.cfi_startproc - pshufd $0x80,%xmm6,%xmm1 - pshufd $0xFE,%xmm7,%xmm0 
- pxor %xmm1,%xmm6 - pxor %xmm1,%xmm1 - pxor %xmm0,%xmm6 - movdqa %xmm6,%xmm0 - movhlps %xmm1,%xmm6 - .byte 0xf3,0xc3 -.cfi_endproc -.size _vpaes_schedule_192_smear,.-_vpaes_schedule_192_smear - - - - - - - - - - - - - - - - - - - -.type _vpaes_schedule_round,@function -.align 16 -_vpaes_schedule_round: -.cfi_startproc - - pxor %xmm1,%xmm1 -.byte 102,65,15,58,15,200,15 -.byte 102,69,15,58,15,192,15 - pxor %xmm1,%xmm7 - - - pshufd $0xFF,%xmm0,%xmm0 -.byte 102,15,58,15,192,1 - - - - -_vpaes_schedule_low_round: - - movdqa %xmm7,%xmm1 - pslldq $4,%xmm7 - pxor %xmm1,%xmm7 - movdqa %xmm7,%xmm1 - pslldq $8,%xmm7 - pxor %xmm1,%xmm7 - pxor .Lk_s63(%rip),%xmm7 - - - movdqa %xmm9,%xmm1 - pandn %xmm0,%xmm1 - psrld $4,%xmm1 - pand %xmm9,%xmm0 - movdqa %xmm11,%xmm2 -.byte 102,15,56,0,208 - pxor %xmm1,%xmm0 - movdqa %xmm10,%xmm3 -.byte 102,15,56,0,217 - pxor %xmm2,%xmm3 - movdqa %xmm10,%xmm4 -.byte 102,15,56,0,224 - pxor %xmm2,%xmm4 - movdqa %xmm10,%xmm2 -.byte 102,15,56,0,211 - pxor %xmm0,%xmm2 - movdqa %xmm10,%xmm3 -.byte 102,15,56,0,220 - pxor %xmm1,%xmm3 - movdqa %xmm13,%xmm4 -.byte 102,15,56,0,226 - movdqa %xmm12,%xmm0 -.byte 102,15,56,0,195 - pxor %xmm4,%xmm0 - - - pxor %xmm7,%xmm0 - movdqa %xmm0,%xmm7 - .byte 0xf3,0xc3 -.cfi_endproc -.size _vpaes_schedule_round,.-_vpaes_schedule_round - - - - - - - - - - -.type _vpaes_schedule_transform,@function -.align 16 -_vpaes_schedule_transform: -.cfi_startproc - movdqa %xmm9,%xmm1 - pandn %xmm0,%xmm1 - psrld $4,%xmm1 - pand %xmm9,%xmm0 - movdqa (%r11),%xmm2 -.byte 102,15,56,0,208 - movdqa 16(%r11),%xmm0 -.byte 102,15,56,0,193 - pxor %xmm2,%xmm0 - .byte 0xf3,0xc3 -.cfi_endproc -.size _vpaes_schedule_transform,.-_vpaes_schedule_transform - - - - - - - - - - - - - - - - - - - - - - - - -.type _vpaes_schedule_mangle,@function -.align 16 -_vpaes_schedule_mangle: -.cfi_startproc - movdqa %xmm0,%xmm4 - movdqa .Lk_mc_forward(%rip),%xmm5 - testq %rcx,%rcx - jnz .Lschedule_mangle_dec - - - addq $16,%rdx - pxor .Lk_s63(%rip),%xmm4 -.byte 
102,15,56,0,229 - movdqa %xmm4,%xmm3 -.byte 102,15,56,0,229 - pxor %xmm4,%xmm3 -.byte 102,15,56,0,229 - pxor %xmm4,%xmm3 - - jmp .Lschedule_mangle_both -.align 16 -.Lschedule_mangle_dec: - - leaq .Lk_dksd(%rip),%r11 - movdqa %xmm9,%xmm1 - pandn %xmm4,%xmm1 - psrld $4,%xmm1 - pand %xmm9,%xmm4 - - movdqa 0(%r11),%xmm2 -.byte 102,15,56,0,212 - movdqa 16(%r11),%xmm3 -.byte 102,15,56,0,217 - pxor %xmm2,%xmm3 -.byte 102,15,56,0,221 - - movdqa 32(%r11),%xmm2 -.byte 102,15,56,0,212 - pxor %xmm3,%xmm2 - movdqa 48(%r11),%xmm3 -.byte 102,15,56,0,217 - pxor %xmm2,%xmm3 -.byte 102,15,56,0,221 - - movdqa 64(%r11),%xmm2 -.byte 102,15,56,0,212 - pxor %xmm3,%xmm2 - movdqa 80(%r11),%xmm3 -.byte 102,15,56,0,217 - pxor %xmm2,%xmm3 -.byte 102,15,56,0,221 - - movdqa 96(%r11),%xmm2 -.byte 102,15,56,0,212 - pxor %xmm3,%xmm2 - movdqa 112(%r11),%xmm3 -.byte 102,15,56,0,217 - pxor %xmm2,%xmm3 - - addq $-16,%rdx - -.Lschedule_mangle_both: - movdqa (%r8,%r10,1),%xmm1 -.byte 102,15,56,0,217 - addq $-16,%r8 - andq $0x30,%r8 - movdqu %xmm3,(%rdx) - .byte 0xf3,0xc3 -.cfi_endproc -.size _vpaes_schedule_mangle,.-_vpaes_schedule_mangle - - - - -.globl vpaes_set_encrypt_key -.hidden vpaes_set_encrypt_key -.type vpaes_set_encrypt_key,@function -.align 16 -vpaes_set_encrypt_key: -.cfi_startproc -#ifdef BORINGSSL_DISPATCH_TEST -.extern BORINGSSL_function_hit -.hidden BORINGSSL_function_hit - movb $1,BORINGSSL_function_hit+5(%rip) -#endif - - movl %esi,%eax - shrl $5,%eax - addl $5,%eax - movl %eax,240(%rdx) - - movl $0,%ecx - movl $0x30,%r8d - call _vpaes_schedule_core - xorl %eax,%eax - .byte 0xf3,0xc3 -.cfi_endproc -.size vpaes_set_encrypt_key,.-vpaes_set_encrypt_key - -.globl vpaes_set_decrypt_key -.hidden vpaes_set_decrypt_key -.type vpaes_set_decrypt_key,@function -.align 16 -vpaes_set_decrypt_key: -.cfi_startproc - movl %esi,%eax - shrl $5,%eax - addl $5,%eax - movl %eax,240(%rdx) - shll $4,%eax - leaq 16(%rdx,%rax,1),%rdx - - movl $1,%ecx - movl %esi,%r8d - shrl $1,%r8d - andl $32,%r8d - xorl 
$32,%r8d - call _vpaes_schedule_core - xorl %eax,%eax - .byte 0xf3,0xc3 -.cfi_endproc -.size vpaes_set_decrypt_key,.-vpaes_set_decrypt_key - -.globl vpaes_encrypt -.hidden vpaes_encrypt -.type vpaes_encrypt,@function -.align 16 -vpaes_encrypt: -.cfi_startproc -#ifdef BORINGSSL_DISPATCH_TEST -.extern BORINGSSL_function_hit -.hidden BORINGSSL_function_hit - movb $1,BORINGSSL_function_hit+4(%rip) -#endif - movdqu (%rdi),%xmm0 - call _vpaes_preheat - call _vpaes_encrypt_core - movdqu %xmm0,(%rsi) - .byte 0xf3,0xc3 -.cfi_endproc -.size vpaes_encrypt,.-vpaes_encrypt - -.globl vpaes_decrypt -.hidden vpaes_decrypt -.type vpaes_decrypt,@function -.align 16 -vpaes_decrypt: -.cfi_startproc - movdqu (%rdi),%xmm0 - call _vpaes_preheat - call _vpaes_decrypt_core - movdqu %xmm0,(%rsi) - .byte 0xf3,0xc3 -.cfi_endproc -.size vpaes_decrypt,.-vpaes_decrypt -.globl vpaes_cbc_encrypt -.hidden vpaes_cbc_encrypt -.type vpaes_cbc_encrypt,@function -.align 16 -vpaes_cbc_encrypt: -.cfi_startproc - xchgq %rcx,%rdx - subq $16,%rcx - jc .Lcbc_abort - movdqu (%r8),%xmm6 - subq %rdi,%rsi - call _vpaes_preheat - cmpl $0,%r9d - je .Lcbc_dec_loop - jmp .Lcbc_enc_loop -.align 16 -.Lcbc_enc_loop: - movdqu (%rdi),%xmm0 - pxor %xmm6,%xmm0 - call _vpaes_encrypt_core - movdqa %xmm0,%xmm6 - movdqu %xmm0,(%rsi,%rdi,1) - leaq 16(%rdi),%rdi - subq $16,%rcx - jnc .Lcbc_enc_loop - jmp .Lcbc_done -.align 16 -.Lcbc_dec_loop: - movdqu (%rdi),%xmm0 - movdqa %xmm0,%xmm7 - call _vpaes_decrypt_core - pxor %xmm6,%xmm0 - movdqa %xmm7,%xmm6 - movdqu %xmm0,(%rsi,%rdi,1) - leaq 16(%rdi),%rdi - subq $16,%rcx - jnc .Lcbc_dec_loop -.Lcbc_done: - movdqu %xmm6,(%r8) -.Lcbc_abort: - .byte 0xf3,0xc3 -.cfi_endproc -.size vpaes_cbc_encrypt,.-vpaes_cbc_encrypt -.globl vpaes_ctr32_encrypt_blocks -.hidden vpaes_ctr32_encrypt_blocks -.type vpaes_ctr32_encrypt_blocks,@function -.align 16 -vpaes_ctr32_encrypt_blocks: -.cfi_startproc - - xchgq %rcx,%rdx - testq %rcx,%rcx - jz .Lctr32_abort - movdqu (%r8),%xmm0 - movdqa 
.Lctr_add_one(%rip),%xmm8 - subq %rdi,%rsi - call _vpaes_preheat - movdqa %xmm0,%xmm6 - pshufb .Lrev_ctr(%rip),%xmm6 - - testq $1,%rcx - jz .Lctr32_prep_loop - - - - movdqu (%rdi),%xmm7 - call _vpaes_encrypt_core - pxor %xmm7,%xmm0 - paddd %xmm8,%xmm6 - movdqu %xmm0,(%rsi,%rdi,1) - subq $1,%rcx - leaq 16(%rdi),%rdi - jz .Lctr32_done - -.Lctr32_prep_loop: - - - movdqa %xmm6,%xmm14 - movdqa %xmm6,%xmm15 - paddd %xmm8,%xmm15 - -.Lctr32_loop: - movdqa .Lrev_ctr(%rip),%xmm1 - movdqa %xmm14,%xmm0 - movdqa %xmm15,%xmm6 -.byte 102,15,56,0,193 -.byte 102,15,56,0,241 - call _vpaes_encrypt_core_2x - movdqu (%rdi),%xmm1 - movdqu 16(%rdi),%xmm2 - movdqa .Lctr_add_two(%rip),%xmm3 - pxor %xmm1,%xmm0 - pxor %xmm2,%xmm6 - paddd %xmm3,%xmm14 - paddd %xmm3,%xmm15 - movdqu %xmm0,(%rsi,%rdi,1) - movdqu %xmm6,16(%rsi,%rdi,1) - subq $2,%rcx - leaq 32(%rdi),%rdi - jnz .Lctr32_loop - -.Lctr32_done: -.Lctr32_abort: - .byte 0xf3,0xc3 -.cfi_endproc -.size vpaes_ctr32_encrypt_blocks,.-vpaes_ctr32_encrypt_blocks - - - - - - -.type _vpaes_preheat,@function -.align 16 -_vpaes_preheat: -.cfi_startproc - leaq .Lk_s0F(%rip),%r10 - movdqa -32(%r10),%xmm10 - movdqa -16(%r10),%xmm11 - movdqa 0(%r10),%xmm9 - movdqa 48(%r10),%xmm13 - movdqa 64(%r10),%xmm12 - movdqa 80(%r10),%xmm15 - movdqa 96(%r10),%xmm14 - .byte 0xf3,0xc3 -.cfi_endproc -.size _vpaes_preheat,.-_vpaes_preheat - - - - - -.type _vpaes_consts,@object -.align 64 -_vpaes_consts: -.Lk_inv: -.quad 0x0E05060F0D080180, 0x040703090A0B0C02 -.quad 0x01040A060F0B0780, 0x030D0E0C02050809 - -.Lk_s0F: -.quad 0x0F0F0F0F0F0F0F0F, 0x0F0F0F0F0F0F0F0F - -.Lk_ipt: -.quad 0xC2B2E8985A2A7000, 0xCABAE09052227808 -.quad 0x4C01307D317C4D00, 0xCD80B1FCB0FDCC81 - -.Lk_sb1: -.quad 0xB19BE18FCB503E00, 0xA5DF7A6E142AF544 -.quad 0x3618D415FAE22300, 0x3BF7CCC10D2ED9EF -.Lk_sb2: -.quad 0xE27A93C60B712400, 0x5EB7E955BC982FCD -.quad 0x69EB88400AE12900, 0xC2A163C8AB82234A -.Lk_sbo: -.quad 0xD0D26D176FBDC700, 0x15AABF7AC502A878 -.quad 0xCFE474A55FBB6A00, 0x8E1E90D1412B35FA - 
-.Lk_mc_forward: -.quad 0x0407060500030201, 0x0C0F0E0D080B0A09 -.quad 0x080B0A0904070605, 0x000302010C0F0E0D -.quad 0x0C0F0E0D080B0A09, 0x0407060500030201 -.quad 0x000302010C0F0E0D, 0x080B0A0904070605 - -.Lk_mc_backward: -.quad 0x0605040702010003, 0x0E0D0C0F0A09080B -.quad 0x020100030E0D0C0F, 0x0A09080B06050407 -.quad 0x0E0D0C0F0A09080B, 0x0605040702010003 -.quad 0x0A09080B06050407, 0x020100030E0D0C0F - -.Lk_sr: -.quad 0x0706050403020100, 0x0F0E0D0C0B0A0908 -.quad 0x030E09040F0A0500, 0x0B06010C07020D08 -.quad 0x0F060D040B020900, 0x070E050C030A0108 -.quad 0x0B0E0104070A0D00, 0x0306090C0F020508 - -.Lk_rcon: -.quad 0x1F8391B9AF9DEEB6, 0x702A98084D7C7D81 - -.Lk_s63: -.quad 0x5B5B5B5B5B5B5B5B, 0x5B5B5B5B5B5B5B5B - -.Lk_opt: -.quad 0xFF9F4929D6B66000, 0xF7974121DEBE6808 -.quad 0x01EDBD5150BCEC00, 0xE10D5DB1B05C0CE0 - -.Lk_deskew: -.quad 0x07E4A34047A4E300, 0x1DFEB95A5DBEF91A -.quad 0x5F36B5DC83EA6900, 0x2841C2ABF49D1E77 - - - - - -.Lk_dksd: -.quad 0xFEB91A5DA3E44700, 0x0740E3A45A1DBEF9 -.quad 0x41C277F4B5368300, 0x5FDC69EAAB289D1E -.Lk_dksb: -.quad 0x9A4FCA1F8550D500, 0x03D653861CC94C99 -.quad 0x115BEDA7B6FC4A00, 0xD993256F7E3482C8 -.Lk_dkse: -.quad 0xD5031CCA1FC9D600, 0x53859A4C994F5086 -.quad 0xA23196054FDC7BE8, 0xCD5EF96A20B31487 -.Lk_dks9: -.quad 0xB6116FC87ED9A700, 0x4AED933482255BFC -.quad 0x4576516227143300, 0x8BB89FACE9DAFDCE - - - - - -.Lk_dipt: -.quad 0x0F505B040B545F00, 0x154A411E114E451A -.quad 0x86E383E660056500, 0x12771772F491F194 - -.Lk_dsb9: -.quad 0x851C03539A86D600, 0xCAD51F504F994CC9 -.quad 0xC03B1789ECD74900, 0x725E2C9EB2FBA565 -.Lk_dsbd: -.quad 0x7D57CCDFE6B1A200, 0xF56E9B13882A4439 -.quad 0x3CE2FAF724C6CB00, 0x2931180D15DEEFD3 -.Lk_dsbb: -.quad 0xD022649296B44200, 0x602646F6B0F2D404 -.quad 0xC19498A6CD596700, 0xF3FF0C3E3255AA6B -.Lk_dsbe: -.quad 0x46F2929626D4D000, 0x2242600464B4F6B0 -.quad 0x0C55A6CDFFAAC100, 0x9467F36B98593E32 -.Lk_dsbo: -.quad 0x1387EA537EF94000, 0xC7AA6DB9D4943E2D -.quad 0x12D7560F93441D00, 0xCA4B8159D8C58E9C - - -.Lrev_ctr: 
-.quad 0x0706050403020100, 0x0c0d0e0f0b0a0908 - - -.Lctr_add_one: -.quad 0x0000000000000000, 0x0000000100000000 -.Lctr_add_two: -.quad 0x0000000000000000, 0x0000000200000000 - -.byte 86,101,99,116,111,114,32,80,101,114,109,117,116,97,116,105,111,110,32,65,69,83,32,102,111,114,32,120,56,54,95,54,52,47,83,83,83,69,51,44,32,77,105,107,101,32,72,97,109,98,117,114,103,32,40,83,116,97,110,102,111,114,100,32,85,110,105,118,101,114,115,105,116,121,41,0 -.align 64 -.size _vpaes_consts,.-_vpaes_consts -#endif -.section .note.GNU-stack,"",@progbits diff --git a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/x86_64-mont.S b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/x86_64-mont.S deleted file mode 100644 index e39b5ca7..00000000 --- a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/x86_64-mont.S +++ /dev/null @@ -1,1260 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text - -.extern OPENSSL_ia32cap_P -.hidden OPENSSL_ia32cap_P - -.globl bn_mul_mont -.hidden bn_mul_mont -.type bn_mul_mont,@function -.align 16 -bn_mul_mont: -.cfi_startproc - movl %r9d,%r9d - movq %rsp,%rax -.cfi_def_cfa_register %rax - testl $3,%r9d - jnz .Lmul_enter - cmpl $8,%r9d - jb .Lmul_enter - leaq OPENSSL_ia32cap_P(%rip),%r11 - movl 8(%r11),%r11d - cmpq %rsi,%rdx - jne .Lmul4x_enter - testl $7,%r9d - jz .Lsqr8x_enter - jmp .Lmul4x_enter - -.align 16 -.Lmul_enter: - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 - - negq %r9 - movq %rsp,%r11 - leaq -16(%rsp,%r9,8),%r10 - negq %r9 - andq $-1024,%r10 - - - - - - - - 
- - subq %r10,%r11 - andq $-4096,%r11 - leaq (%r10,%r11,1),%rsp - movq (%rsp),%r11 - cmpq %r10,%rsp - ja .Lmul_page_walk - jmp .Lmul_page_walk_done - -.align 16 -.Lmul_page_walk: - leaq -4096(%rsp),%rsp - movq (%rsp),%r11 - cmpq %r10,%rsp - ja .Lmul_page_walk -.Lmul_page_walk_done: - - movq %rax,8(%rsp,%r9,8) -.cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08 -.Lmul_body: - movq %rdx,%r12 - movq (%r8),%r8 - movq (%r12),%rbx - movq (%rsi),%rax - - xorq %r14,%r14 - xorq %r15,%r15 - - movq %r8,%rbp - mulq %rbx - movq %rax,%r10 - movq (%rcx),%rax - - imulq %r10,%rbp - movq %rdx,%r11 - - mulq %rbp - addq %rax,%r10 - movq 8(%rsi),%rax - adcq $0,%rdx - movq %rdx,%r13 - - leaq 1(%r15),%r15 - jmp .L1st_enter - -.align 16 -.L1st: - addq %rax,%r13 - movq (%rsi,%r15,8),%rax - adcq $0,%rdx - addq %r11,%r13 - movq %r10,%r11 - adcq $0,%rdx - movq %r13,-16(%rsp,%r15,8) - movq %rdx,%r13 - -.L1st_enter: - mulq %rbx - addq %rax,%r11 - movq (%rcx,%r15,8),%rax - adcq $0,%rdx - leaq 1(%r15),%r15 - movq %rdx,%r10 - - mulq %rbp - cmpq %r9,%r15 - jne .L1st - - addq %rax,%r13 - movq (%rsi),%rax - adcq $0,%rdx - addq %r11,%r13 - adcq $0,%rdx - movq %r13,-16(%rsp,%r15,8) - movq %rdx,%r13 - movq %r10,%r11 - - xorq %rdx,%rdx - addq %r11,%r13 - adcq $0,%rdx - movq %r13,-8(%rsp,%r9,8) - movq %rdx,(%rsp,%r9,8) - - leaq 1(%r14),%r14 - jmp .Louter -.align 16 -.Louter: - movq (%r12,%r14,8),%rbx - xorq %r15,%r15 - movq %r8,%rbp - movq (%rsp),%r10 - mulq %rbx - addq %rax,%r10 - movq (%rcx),%rax - adcq $0,%rdx - - imulq %r10,%rbp - movq %rdx,%r11 - - mulq %rbp - addq %rax,%r10 - movq 8(%rsi),%rax - adcq $0,%rdx - movq 8(%rsp),%r10 - movq %rdx,%r13 - - leaq 1(%r15),%r15 - jmp .Linner_enter - -.align 16 -.Linner: - addq %rax,%r13 - movq (%rsi,%r15,8),%rax - adcq $0,%rdx - addq %r10,%r13 - movq (%rsp,%r15,8),%r10 - adcq $0,%rdx - movq %r13,-16(%rsp,%r15,8) - movq %rdx,%r13 - -.Linner_enter: - mulq %rbx - addq %rax,%r11 - movq (%rcx,%r15,8),%rax - adcq $0,%rdx - addq %r11,%r10 - movq 
%rdx,%r11 - adcq $0,%r11 - leaq 1(%r15),%r15 - - mulq %rbp - cmpq %r9,%r15 - jne .Linner - - addq %rax,%r13 - movq (%rsi),%rax - adcq $0,%rdx - addq %r10,%r13 - movq (%rsp,%r15,8),%r10 - adcq $0,%rdx - movq %r13,-16(%rsp,%r15,8) - movq %rdx,%r13 - - xorq %rdx,%rdx - addq %r11,%r13 - adcq $0,%rdx - addq %r10,%r13 - adcq $0,%rdx - movq %r13,-8(%rsp,%r9,8) - movq %rdx,(%rsp,%r9,8) - - leaq 1(%r14),%r14 - cmpq %r9,%r14 - jb .Louter - - xorq %r14,%r14 - movq (%rsp),%rax - movq %r9,%r15 - -.align 16 -.Lsub: sbbq (%rcx,%r14,8),%rax - movq %rax,(%rdi,%r14,8) - movq 8(%rsp,%r14,8),%rax - leaq 1(%r14),%r14 - decq %r15 - jnz .Lsub - - sbbq $0,%rax - movq $-1,%rbx - xorq %rax,%rbx - xorq %r14,%r14 - movq %r9,%r15 - -.Lcopy: - movq (%rdi,%r14,8),%rcx - movq (%rsp,%r14,8),%rdx - andq %rbx,%rcx - andq %rax,%rdx - movq %r9,(%rsp,%r14,8) - orq %rcx,%rdx - movq %rdx,(%rdi,%r14,8) - leaq 1(%r14),%r14 - subq $1,%r15 - jnz .Lcopy - - movq 8(%rsp,%r9,8),%rsi -.cfi_def_cfa %rsi,8 - movq $1,%rax - movq -48(%rsi),%r15 -.cfi_restore %r15 - movq -40(%rsi),%r14 -.cfi_restore %r14 - movq -32(%rsi),%r13 -.cfi_restore %r13 - movq -24(%rsi),%r12 -.cfi_restore %r12 - movq -16(%rsi),%rbp -.cfi_restore %rbp - movq -8(%rsi),%rbx -.cfi_restore %rbx - leaq (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Lmul_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size bn_mul_mont,.-bn_mul_mont -.type bn_mul4x_mont,@function -.align 16 -bn_mul4x_mont: -.cfi_startproc - movl %r9d,%r9d - movq %rsp,%rax -.cfi_def_cfa_register %rax -.Lmul4x_enter: - andl $0x80100,%r11d - cmpl $0x80100,%r11d - je .Lmulx4x_enter - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 - - negq %r9 - movq %rsp,%r11 - leaq -32(%rsp,%r9,8),%r10 - negq %r9 - andq $-1024,%r10 - - subq %r10,%r11 - andq $-4096,%r11 - leaq (%r10,%r11,1),%rsp - movq (%rsp),%r11 - cmpq %r10,%rsp - ja .Lmul4x_page_walk - 
jmp .Lmul4x_page_walk_done - -.Lmul4x_page_walk: - leaq -4096(%rsp),%rsp - movq (%rsp),%r11 - cmpq %r10,%rsp - ja .Lmul4x_page_walk -.Lmul4x_page_walk_done: - - movq %rax,8(%rsp,%r9,8) -.cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08 -.Lmul4x_body: - movq %rdi,16(%rsp,%r9,8) - movq %rdx,%r12 - movq (%r8),%r8 - movq (%r12),%rbx - movq (%rsi),%rax - - xorq %r14,%r14 - xorq %r15,%r15 - - movq %r8,%rbp - mulq %rbx - movq %rax,%r10 - movq (%rcx),%rax - - imulq %r10,%rbp - movq %rdx,%r11 - - mulq %rbp - addq %rax,%r10 - movq 8(%rsi),%rax - adcq $0,%rdx - movq %rdx,%rdi - - mulq %rbx - addq %rax,%r11 - movq 8(%rcx),%rax - adcq $0,%rdx - movq %rdx,%r10 - - mulq %rbp - addq %rax,%rdi - movq 16(%rsi),%rax - adcq $0,%rdx - addq %r11,%rdi - leaq 4(%r15),%r15 - adcq $0,%rdx - movq %rdi,(%rsp) - movq %rdx,%r13 - jmp .L1st4x -.align 16 -.L1st4x: - mulq %rbx - addq %rax,%r10 - movq -16(%rcx,%r15,8),%rax - adcq $0,%rdx - movq %rdx,%r11 - - mulq %rbp - addq %rax,%r13 - movq -8(%rsi,%r15,8),%rax - adcq $0,%rdx - addq %r10,%r13 - adcq $0,%rdx - movq %r13,-24(%rsp,%r15,8) - movq %rdx,%rdi - - mulq %rbx - addq %rax,%r11 - movq -8(%rcx,%r15,8),%rax - adcq $0,%rdx - movq %rdx,%r10 - - mulq %rbp - addq %rax,%rdi - movq (%rsi,%r15,8),%rax - adcq $0,%rdx - addq %r11,%rdi - adcq $0,%rdx - movq %rdi,-16(%rsp,%r15,8) - movq %rdx,%r13 - - mulq %rbx - addq %rax,%r10 - movq (%rcx,%r15,8),%rax - adcq $0,%rdx - movq %rdx,%r11 - - mulq %rbp - addq %rax,%r13 - movq 8(%rsi,%r15,8),%rax - adcq $0,%rdx - addq %r10,%r13 - adcq $0,%rdx - movq %r13,-8(%rsp,%r15,8) - movq %rdx,%rdi - - mulq %rbx - addq %rax,%r11 - movq 8(%rcx,%r15,8),%rax - adcq $0,%rdx - leaq 4(%r15),%r15 - movq %rdx,%r10 - - mulq %rbp - addq %rax,%rdi - movq -16(%rsi,%r15,8),%rax - adcq $0,%rdx - addq %r11,%rdi - adcq $0,%rdx - movq %rdi,-32(%rsp,%r15,8) - movq %rdx,%r13 - cmpq %r9,%r15 - jb .L1st4x - - mulq %rbx - addq %rax,%r10 - movq -16(%rcx,%r15,8),%rax - adcq $0,%rdx - movq %rdx,%r11 - - mulq %rbp - addq 
%rax,%r13 - movq -8(%rsi,%r15,8),%rax - adcq $0,%rdx - addq %r10,%r13 - adcq $0,%rdx - movq %r13,-24(%rsp,%r15,8) - movq %rdx,%rdi - - mulq %rbx - addq %rax,%r11 - movq -8(%rcx,%r15,8),%rax - adcq $0,%rdx - movq %rdx,%r10 - - mulq %rbp - addq %rax,%rdi - movq (%rsi),%rax - adcq $0,%rdx - addq %r11,%rdi - adcq $0,%rdx - movq %rdi,-16(%rsp,%r15,8) - movq %rdx,%r13 - - xorq %rdi,%rdi - addq %r10,%r13 - adcq $0,%rdi - movq %r13,-8(%rsp,%r15,8) - movq %rdi,(%rsp,%r15,8) - - leaq 1(%r14),%r14 -.align 4 -.Louter4x: - movq (%r12,%r14,8),%rbx - xorq %r15,%r15 - movq (%rsp),%r10 - movq %r8,%rbp - mulq %rbx - addq %rax,%r10 - movq (%rcx),%rax - adcq $0,%rdx - - imulq %r10,%rbp - movq %rdx,%r11 - - mulq %rbp - addq %rax,%r10 - movq 8(%rsi),%rax - adcq $0,%rdx - movq %rdx,%rdi - - mulq %rbx - addq %rax,%r11 - movq 8(%rcx),%rax - adcq $0,%rdx - addq 8(%rsp),%r11 - adcq $0,%rdx - movq %rdx,%r10 - - mulq %rbp - addq %rax,%rdi - movq 16(%rsi),%rax - adcq $0,%rdx - addq %r11,%rdi - leaq 4(%r15),%r15 - adcq $0,%rdx - movq %rdi,(%rsp) - movq %rdx,%r13 - jmp .Linner4x -.align 16 -.Linner4x: - mulq %rbx - addq %rax,%r10 - movq -16(%rcx,%r15,8),%rax - adcq $0,%rdx - addq -16(%rsp,%r15,8),%r10 - adcq $0,%rdx - movq %rdx,%r11 - - mulq %rbp - addq %rax,%r13 - movq -8(%rsi,%r15,8),%rax - adcq $0,%rdx - addq %r10,%r13 - adcq $0,%rdx - movq %r13,-24(%rsp,%r15,8) - movq %rdx,%rdi - - mulq %rbx - addq %rax,%r11 - movq -8(%rcx,%r15,8),%rax - adcq $0,%rdx - addq -8(%rsp,%r15,8),%r11 - adcq $0,%rdx - movq %rdx,%r10 - - mulq %rbp - addq %rax,%rdi - movq (%rsi,%r15,8),%rax - adcq $0,%rdx - addq %r11,%rdi - adcq $0,%rdx - movq %rdi,-16(%rsp,%r15,8) - movq %rdx,%r13 - - mulq %rbx - addq %rax,%r10 - movq (%rcx,%r15,8),%rax - adcq $0,%rdx - addq (%rsp,%r15,8),%r10 - adcq $0,%rdx - movq %rdx,%r11 - - mulq %rbp - addq %rax,%r13 - movq 8(%rsi,%r15,8),%rax - adcq $0,%rdx - addq %r10,%r13 - adcq $0,%rdx - movq %r13,-8(%rsp,%r15,8) - movq %rdx,%rdi - - mulq %rbx - addq %rax,%r11 - movq 8(%rcx,%r15,8),%rax - 
adcq $0,%rdx - addq 8(%rsp,%r15,8),%r11 - adcq $0,%rdx - leaq 4(%r15),%r15 - movq %rdx,%r10 - - mulq %rbp - addq %rax,%rdi - movq -16(%rsi,%r15,8),%rax - adcq $0,%rdx - addq %r11,%rdi - adcq $0,%rdx - movq %rdi,-32(%rsp,%r15,8) - movq %rdx,%r13 - cmpq %r9,%r15 - jb .Linner4x - - mulq %rbx - addq %rax,%r10 - movq -16(%rcx,%r15,8),%rax - adcq $0,%rdx - addq -16(%rsp,%r15,8),%r10 - adcq $0,%rdx - movq %rdx,%r11 - - mulq %rbp - addq %rax,%r13 - movq -8(%rsi,%r15,8),%rax - adcq $0,%rdx - addq %r10,%r13 - adcq $0,%rdx - movq %r13,-24(%rsp,%r15,8) - movq %rdx,%rdi - - mulq %rbx - addq %rax,%r11 - movq -8(%rcx,%r15,8),%rax - adcq $0,%rdx - addq -8(%rsp,%r15,8),%r11 - adcq $0,%rdx - leaq 1(%r14),%r14 - movq %rdx,%r10 - - mulq %rbp - addq %rax,%rdi - movq (%rsi),%rax - adcq $0,%rdx - addq %r11,%rdi - adcq $0,%rdx - movq %rdi,-16(%rsp,%r15,8) - movq %rdx,%r13 - - xorq %rdi,%rdi - addq %r10,%r13 - adcq $0,%rdi - addq (%rsp,%r9,8),%r13 - adcq $0,%rdi - movq %r13,-8(%rsp,%r15,8) - movq %rdi,(%rsp,%r15,8) - - cmpq %r9,%r14 - jb .Louter4x - movq 16(%rsp,%r9,8),%rdi - leaq -4(%r9),%r15 - movq 0(%rsp),%rax - movq 8(%rsp),%rdx - shrq $2,%r15 - leaq (%rsp),%rsi - xorq %r14,%r14 - - subq 0(%rcx),%rax - movq 16(%rsi),%rbx - movq 24(%rsi),%rbp - sbbq 8(%rcx),%rdx - -.Lsub4x: - movq %rax,0(%rdi,%r14,8) - movq %rdx,8(%rdi,%r14,8) - sbbq 16(%rcx,%r14,8),%rbx - movq 32(%rsi,%r14,8),%rax - movq 40(%rsi,%r14,8),%rdx - sbbq 24(%rcx,%r14,8),%rbp - movq %rbx,16(%rdi,%r14,8) - movq %rbp,24(%rdi,%r14,8) - sbbq 32(%rcx,%r14,8),%rax - movq 48(%rsi,%r14,8),%rbx - movq 56(%rsi,%r14,8),%rbp - sbbq 40(%rcx,%r14,8),%rdx - leaq 4(%r14),%r14 - decq %r15 - jnz .Lsub4x - - movq %rax,0(%rdi,%r14,8) - movq 32(%rsi,%r14,8),%rax - sbbq 16(%rcx,%r14,8),%rbx - movq %rdx,8(%rdi,%r14,8) - sbbq 24(%rcx,%r14,8),%rbp - movq %rbx,16(%rdi,%r14,8) - - sbbq $0,%rax - movq %rbp,24(%rdi,%r14,8) - pxor %xmm0,%xmm0 -.byte 102,72,15,110,224 - pcmpeqd %xmm5,%xmm5 - pshufd $0,%xmm4,%xmm4 - movq %r9,%r15 - pxor %xmm4,%xmm5 - shrq 
$2,%r15 - xorl %eax,%eax - - jmp .Lcopy4x -.align 16 -.Lcopy4x: - movdqa (%rsp,%rax,1),%xmm1 - movdqu (%rdi,%rax,1),%xmm2 - pand %xmm4,%xmm1 - pand %xmm5,%xmm2 - movdqa 16(%rsp,%rax,1),%xmm3 - movdqa %xmm0,(%rsp,%rax,1) - por %xmm2,%xmm1 - movdqu 16(%rdi,%rax,1),%xmm2 - movdqu %xmm1,(%rdi,%rax,1) - pand %xmm4,%xmm3 - pand %xmm5,%xmm2 - movdqa %xmm0,16(%rsp,%rax,1) - por %xmm2,%xmm3 - movdqu %xmm3,16(%rdi,%rax,1) - leaq 32(%rax),%rax - decq %r15 - jnz .Lcopy4x - movq 8(%rsp,%r9,8),%rsi -.cfi_def_cfa %rsi, 8 - movq $1,%rax - movq -48(%rsi),%r15 -.cfi_restore %r15 - movq -40(%rsi),%r14 -.cfi_restore %r14 - movq -32(%rsi),%r13 -.cfi_restore %r13 - movq -24(%rsi),%r12 -.cfi_restore %r12 - movq -16(%rsi),%rbp -.cfi_restore %rbp - movq -8(%rsi),%rbx -.cfi_restore %rbx - leaq (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Lmul4x_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size bn_mul4x_mont,.-bn_mul4x_mont -.extern bn_sqrx8x_internal -.hidden bn_sqrx8x_internal -.extern bn_sqr8x_internal -.hidden bn_sqr8x_internal - -.type bn_sqr8x_mont,@function -.align 32 -bn_sqr8x_mont: -.cfi_startproc - movq %rsp,%rax -.cfi_def_cfa_register %rax -.Lsqr8x_enter: - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 -.Lsqr8x_prologue: - - movl %r9d,%r10d - shll $3,%r9d - shlq $3+2,%r10 - negq %r9 - - - - - - - leaq -64(%rsp,%r9,2),%r11 - movq %rsp,%rbp - movq (%r8),%r8 - subq %rsi,%r11 - andq $4095,%r11 - cmpq %r11,%r10 - jb .Lsqr8x_sp_alt - subq %r11,%rbp - leaq -64(%rbp,%r9,2),%rbp - jmp .Lsqr8x_sp_done - -.align 32 -.Lsqr8x_sp_alt: - leaq 4096-64(,%r9,2),%r10 - leaq -64(%rbp,%r9,2),%rbp - subq %r10,%r11 - movq $0,%r10 - cmovcq %r10,%r11 - subq %r11,%rbp -.Lsqr8x_sp_done: - andq $-64,%rbp - movq %rsp,%r11 - subq %rbp,%r11 - andq $-4096,%r11 - leaq (%r11,%rbp,1),%rsp - movq (%rsp),%r10 - cmpq %rbp,%rsp - ja .Lsqr8x_page_walk - jmp 
.Lsqr8x_page_walk_done - -.align 16 -.Lsqr8x_page_walk: - leaq -4096(%rsp),%rsp - movq (%rsp),%r10 - cmpq %rbp,%rsp - ja .Lsqr8x_page_walk -.Lsqr8x_page_walk_done: - - movq %r9,%r10 - negq %r9 - - movq %r8,32(%rsp) - movq %rax,40(%rsp) -.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 -.Lsqr8x_body: - -.byte 102,72,15,110,209 - pxor %xmm0,%xmm0 -.byte 102,72,15,110,207 -.byte 102,73,15,110,218 - leaq OPENSSL_ia32cap_P(%rip),%rax - movl 8(%rax),%eax - andl $0x80100,%eax - cmpl $0x80100,%eax - jne .Lsqr8x_nox - - call bn_sqrx8x_internal - - - - - leaq (%r8,%rcx,1),%rbx - movq %rcx,%r9 - movq %rcx,%rdx -.byte 102,72,15,126,207 - sarq $3+2,%rcx - jmp .Lsqr8x_sub - -.align 32 -.Lsqr8x_nox: - call bn_sqr8x_internal - - - - - leaq (%rdi,%r9,1),%rbx - movq %r9,%rcx - movq %r9,%rdx -.byte 102,72,15,126,207 - sarq $3+2,%rcx - jmp .Lsqr8x_sub - -.align 32 -.Lsqr8x_sub: - movq 0(%rbx),%r12 - movq 8(%rbx),%r13 - movq 16(%rbx),%r14 - movq 24(%rbx),%r15 - leaq 32(%rbx),%rbx - sbbq 0(%rbp),%r12 - sbbq 8(%rbp),%r13 - sbbq 16(%rbp),%r14 - sbbq 24(%rbp),%r15 - leaq 32(%rbp),%rbp - movq %r12,0(%rdi) - movq %r13,8(%rdi) - movq %r14,16(%rdi) - movq %r15,24(%rdi) - leaq 32(%rdi),%rdi - incq %rcx - jnz .Lsqr8x_sub - - sbbq $0,%rax - leaq (%rbx,%r9,1),%rbx - leaq (%rdi,%r9,1),%rdi - -.byte 102,72,15,110,200 - pxor %xmm0,%xmm0 - pshufd $0,%xmm1,%xmm1 - movq 40(%rsp),%rsi -.cfi_def_cfa %rsi,8 - jmp .Lsqr8x_cond_copy - -.align 32 -.Lsqr8x_cond_copy: - movdqa 0(%rbx),%xmm2 - movdqa 16(%rbx),%xmm3 - leaq 32(%rbx),%rbx - movdqu 0(%rdi),%xmm4 - movdqu 16(%rdi),%xmm5 - leaq 32(%rdi),%rdi - movdqa %xmm0,-32(%rbx) - movdqa %xmm0,-16(%rbx) - movdqa %xmm0,-32(%rbx,%rdx,1) - movdqa %xmm0,-16(%rbx,%rdx,1) - pcmpeqd %xmm1,%xmm0 - pand %xmm1,%xmm2 - pand %xmm1,%xmm3 - pand %xmm0,%xmm4 - pand %xmm0,%xmm5 - pxor %xmm0,%xmm0 - por %xmm2,%xmm4 - por %xmm3,%xmm5 - movdqu %xmm4,-32(%rdi) - movdqu %xmm5,-16(%rdi) - addq $32,%r9 - jnz .Lsqr8x_cond_copy - - movq $1,%rax - movq -48(%rsi),%r15 -.cfi_restore %r15 - 
movq -40(%rsi),%r14 -.cfi_restore %r14 - movq -32(%rsi),%r13 -.cfi_restore %r13 - movq -24(%rsi),%r12 -.cfi_restore %r12 - movq -16(%rsi),%rbp -.cfi_restore %rbp - movq -8(%rsi),%rbx -.cfi_restore %rbx - leaq (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Lsqr8x_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size bn_sqr8x_mont,.-bn_sqr8x_mont -.type bn_mulx4x_mont,@function -.align 32 -bn_mulx4x_mont: -.cfi_startproc - movq %rsp,%rax -.cfi_def_cfa_register %rax -.Lmulx4x_enter: - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 -.Lmulx4x_prologue: - - shll $3,%r9d - xorq %r10,%r10 - subq %r9,%r10 - movq (%r8),%r8 - leaq -72(%rsp,%r10,1),%rbp - andq $-128,%rbp - movq %rsp,%r11 - subq %rbp,%r11 - andq $-4096,%r11 - leaq (%r11,%rbp,1),%rsp - movq (%rsp),%r10 - cmpq %rbp,%rsp - ja .Lmulx4x_page_walk - jmp .Lmulx4x_page_walk_done - -.align 16 -.Lmulx4x_page_walk: - leaq -4096(%rsp),%rsp - movq (%rsp),%r10 - cmpq %rbp,%rsp - ja .Lmulx4x_page_walk -.Lmulx4x_page_walk_done: - - leaq (%rdx,%r9,1),%r10 - - - - - - - - - - - - - movq %r9,0(%rsp) - shrq $5,%r9 - movq %r10,16(%rsp) - subq $1,%r9 - movq %r8,24(%rsp) - movq %rdi,32(%rsp) - movq %rax,40(%rsp) -.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 - movq %r9,48(%rsp) - jmp .Lmulx4x_body - -.align 32 -.Lmulx4x_body: - leaq 8(%rdx),%rdi - movq (%rdx),%rdx - leaq 64+32(%rsp),%rbx - movq %rdx,%r9 - - mulxq 0(%rsi),%r8,%rax - mulxq 8(%rsi),%r11,%r14 - addq %rax,%r11 - movq %rdi,8(%rsp) - mulxq 16(%rsi),%r12,%r13 - adcq %r14,%r12 - adcq $0,%r13 - - movq %r8,%rdi - imulq 24(%rsp),%r8 - xorq %rbp,%rbp - - mulxq 24(%rsi),%rax,%r14 - movq %r8,%rdx - leaq 32(%rsi),%rsi - adcxq %rax,%r13 - adcxq %rbp,%r14 - - mulxq 0(%rcx),%rax,%r10 - adcxq %rax,%rdi - adoxq %r11,%r10 - mulxq 8(%rcx),%rax,%r11 - adcxq %rax,%r10 - adoxq %r12,%r11 -.byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 - 
movq 48(%rsp),%rdi - movq %r10,-32(%rbx) - adcxq %rax,%r11 - adoxq %r13,%r12 - mulxq 24(%rcx),%rax,%r15 - movq %r9,%rdx - movq %r11,-24(%rbx) - adcxq %rax,%r12 - adoxq %rbp,%r15 - leaq 32(%rcx),%rcx - movq %r12,-16(%rbx) - - jmp .Lmulx4x_1st - -.align 32 -.Lmulx4x_1st: - adcxq %rbp,%r15 - mulxq 0(%rsi),%r10,%rax - adcxq %r14,%r10 - mulxq 8(%rsi),%r11,%r14 - adcxq %rax,%r11 - mulxq 16(%rsi),%r12,%rax - adcxq %r14,%r12 - mulxq 24(%rsi),%r13,%r14 -.byte 0x67,0x67 - movq %r8,%rdx - adcxq %rax,%r13 - adcxq %rbp,%r14 - leaq 32(%rsi),%rsi - leaq 32(%rbx),%rbx - - adoxq %r15,%r10 - mulxq 0(%rcx),%rax,%r15 - adcxq %rax,%r10 - adoxq %r15,%r11 - mulxq 8(%rcx),%rax,%r15 - adcxq %rax,%r11 - adoxq %r15,%r12 - mulxq 16(%rcx),%rax,%r15 - movq %r10,-40(%rbx) - adcxq %rax,%r12 - movq %r11,-32(%rbx) - adoxq %r15,%r13 - mulxq 24(%rcx),%rax,%r15 - movq %r9,%rdx - movq %r12,-24(%rbx) - adcxq %rax,%r13 - adoxq %rbp,%r15 - leaq 32(%rcx),%rcx - movq %r13,-16(%rbx) - - decq %rdi - jnz .Lmulx4x_1st - - movq 0(%rsp),%rax - movq 8(%rsp),%rdi - adcq %rbp,%r15 - addq %r15,%r14 - sbbq %r15,%r15 - movq %r14,-8(%rbx) - jmp .Lmulx4x_outer - -.align 32 -.Lmulx4x_outer: - movq (%rdi),%rdx - leaq 8(%rdi),%rdi - subq %rax,%rsi - movq %r15,(%rbx) - leaq 64+32(%rsp),%rbx - subq %rax,%rcx - - mulxq 0(%rsi),%r8,%r11 - xorl %ebp,%ebp - movq %rdx,%r9 - mulxq 8(%rsi),%r14,%r12 - adoxq -32(%rbx),%r8 - adcxq %r14,%r11 - mulxq 16(%rsi),%r15,%r13 - adoxq -24(%rbx),%r11 - adcxq %r15,%r12 - adoxq -16(%rbx),%r12 - adcxq %rbp,%r13 - adoxq %rbp,%r13 - - movq %rdi,8(%rsp) - movq %r8,%r15 - imulq 24(%rsp),%r8 - xorl %ebp,%ebp - - mulxq 24(%rsi),%rax,%r14 - movq %r8,%rdx - adcxq %rax,%r13 - adoxq -8(%rbx),%r13 - adcxq %rbp,%r14 - leaq 32(%rsi),%rsi - adoxq %rbp,%r14 - - mulxq 0(%rcx),%rax,%r10 - adcxq %rax,%r15 - adoxq %r11,%r10 - mulxq 8(%rcx),%rax,%r11 - adcxq %rax,%r10 - adoxq %r12,%r11 - mulxq 16(%rcx),%rax,%r12 - movq %r10,-32(%rbx) - adcxq %rax,%r11 - adoxq %r13,%r12 - mulxq 24(%rcx),%rax,%r15 - movq %r9,%rdx - movq 
%r11,-24(%rbx) - leaq 32(%rcx),%rcx - adcxq %rax,%r12 - adoxq %rbp,%r15 - movq 48(%rsp),%rdi - movq %r12,-16(%rbx) - - jmp .Lmulx4x_inner - -.align 32 -.Lmulx4x_inner: - mulxq 0(%rsi),%r10,%rax - adcxq %rbp,%r15 - adoxq %r14,%r10 - mulxq 8(%rsi),%r11,%r14 - adcxq 0(%rbx),%r10 - adoxq %rax,%r11 - mulxq 16(%rsi),%r12,%rax - adcxq 8(%rbx),%r11 - adoxq %r14,%r12 - mulxq 24(%rsi),%r13,%r14 - movq %r8,%rdx - adcxq 16(%rbx),%r12 - adoxq %rax,%r13 - adcxq 24(%rbx),%r13 - adoxq %rbp,%r14 - leaq 32(%rsi),%rsi - leaq 32(%rbx),%rbx - adcxq %rbp,%r14 - - adoxq %r15,%r10 - mulxq 0(%rcx),%rax,%r15 - adcxq %rax,%r10 - adoxq %r15,%r11 - mulxq 8(%rcx),%rax,%r15 - adcxq %rax,%r11 - adoxq %r15,%r12 - mulxq 16(%rcx),%rax,%r15 - movq %r10,-40(%rbx) - adcxq %rax,%r12 - adoxq %r15,%r13 - mulxq 24(%rcx),%rax,%r15 - movq %r9,%rdx - movq %r11,-32(%rbx) - movq %r12,-24(%rbx) - adcxq %rax,%r13 - adoxq %rbp,%r15 - leaq 32(%rcx),%rcx - movq %r13,-16(%rbx) - - decq %rdi - jnz .Lmulx4x_inner - - movq 0(%rsp),%rax - movq 8(%rsp),%rdi - adcq %rbp,%r15 - subq 0(%rbx),%rbp - adcq %r15,%r14 - sbbq %r15,%r15 - movq %r14,-8(%rbx) - - cmpq 16(%rsp),%rdi - jne .Lmulx4x_outer - - leaq 64(%rsp),%rbx - subq %rax,%rcx - negq %r15 - movq %rax,%rdx - shrq $3+2,%rax - movq 32(%rsp),%rdi - jmp .Lmulx4x_sub - -.align 32 -.Lmulx4x_sub: - movq 0(%rbx),%r11 - movq 8(%rbx),%r12 - movq 16(%rbx),%r13 - movq 24(%rbx),%r14 - leaq 32(%rbx),%rbx - sbbq 0(%rcx),%r11 - sbbq 8(%rcx),%r12 - sbbq 16(%rcx),%r13 - sbbq 24(%rcx),%r14 - leaq 32(%rcx),%rcx - movq %r11,0(%rdi) - movq %r12,8(%rdi) - movq %r13,16(%rdi) - movq %r14,24(%rdi) - leaq 32(%rdi),%rdi - decq %rax - jnz .Lmulx4x_sub - - sbbq $0,%r15 - leaq 64(%rsp),%rbx - subq %rdx,%rdi - -.byte 102,73,15,110,207 - pxor %xmm0,%xmm0 - pshufd $0,%xmm1,%xmm1 - movq 40(%rsp),%rsi -.cfi_def_cfa %rsi,8 - jmp .Lmulx4x_cond_copy - -.align 32 -.Lmulx4x_cond_copy: - movdqa 0(%rbx),%xmm2 - movdqa 16(%rbx),%xmm3 - leaq 32(%rbx),%rbx - movdqu 0(%rdi),%xmm4 - movdqu 16(%rdi),%xmm5 - leaq 
32(%rdi),%rdi - movdqa %xmm0,-32(%rbx) - movdqa %xmm0,-16(%rbx) - pcmpeqd %xmm1,%xmm0 - pand %xmm1,%xmm2 - pand %xmm1,%xmm3 - pand %xmm0,%xmm4 - pand %xmm0,%xmm5 - pxor %xmm0,%xmm0 - por %xmm2,%xmm4 - por %xmm3,%xmm5 - movdqu %xmm4,-32(%rdi) - movdqu %xmm5,-16(%rdi) - subq $32,%rdx - jnz .Lmulx4x_cond_copy - - movq %rdx,(%rbx) - - movq $1,%rax - movq -48(%rsi),%r15 -.cfi_restore %r15 - movq -40(%rsi),%r14 -.cfi_restore %r14 - movq -32(%rsi),%r13 -.cfi_restore %r13 - movq -24(%rsi),%r12 -.cfi_restore %r12 - movq -16(%rsi),%rbp -.cfi_restore %rbp - movq -8(%rsi),%rbx -.cfi_restore %rbx - leaq (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Lmulx4x_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size bn_mulx4x_mont,.-bn_mulx4x_mont -.byte 77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -.align 16 -#endif -.section .note.GNU-stack,"",@progbits diff --git a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/x86_64-mont5.S b/third_party/boringssl/linux-x86_64/crypto/fipsmodule/x86_64-mont5.S deleted file mode 100644 index 59367b6e..00000000 --- a/third_party/boringssl/linux-x86_64/crypto/fipsmodule/x86_64-mont5.S +++ /dev/null @@ -1,3609 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. 
- -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text - -.extern OPENSSL_ia32cap_P -.hidden OPENSSL_ia32cap_P - -.globl bn_mul_mont_gather5 -.hidden bn_mul_mont_gather5 -.type bn_mul_mont_gather5,@function -.align 64 -bn_mul_mont_gather5: -.cfi_startproc - movl %r9d,%r9d - movq %rsp,%rax -.cfi_def_cfa_register %rax - testl $7,%r9d - jnz .Lmul_enter - leaq OPENSSL_ia32cap_P(%rip),%r11 - movl 8(%r11),%r11d - jmp .Lmul4x_enter - -.align 16 -.Lmul_enter: - movd 8(%rsp),%xmm5 - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 - - negq %r9 - movq %rsp,%r11 - leaq -280(%rsp,%r9,8),%r10 - negq %r9 - andq $-1024,%r10 - - - - - - - - - - subq %r10,%r11 - andq $-4096,%r11 - leaq (%r10,%r11,1),%rsp - movq (%rsp),%r11 - cmpq %r10,%rsp - ja .Lmul_page_walk - jmp .Lmul_page_walk_done - -.Lmul_page_walk: - leaq -4096(%rsp),%rsp - movq (%rsp),%r11 - cmpq %r10,%rsp - ja .Lmul_page_walk -.Lmul_page_walk_done: - - leaq .Linc(%rip),%r10 - movq %rax,8(%rsp,%r9,8) -.cfi_escape 0x0f,0x0a,0x77,0x08,0x79,0x00,0x38,0x1e,0x22,0x06,0x23,0x08 -.Lmul_body: - - leaq 128(%rdx),%r12 - movdqa 0(%r10),%xmm0 - movdqa 16(%r10),%xmm1 - leaq 24-112(%rsp,%r9,8),%r10 - andq $-16,%r10 - - pshufd $0,%xmm5,%xmm5 - movdqa %xmm1,%xmm4 - movdqa %xmm1,%xmm2 - paddd %xmm0,%xmm1 - pcmpeqd %xmm5,%xmm0 -.byte 0x67 - movdqa %xmm4,%xmm3 - paddd %xmm1,%xmm2 - pcmpeqd %xmm5,%xmm1 - movdqa %xmm0,112(%r10) - movdqa %xmm4,%xmm0 - - paddd %xmm2,%xmm3 - pcmpeqd %xmm5,%xmm2 - movdqa %xmm1,128(%r10) - movdqa %xmm4,%xmm1 - - paddd %xmm3,%xmm0 - pcmpeqd %xmm5,%xmm3 - movdqa %xmm2,144(%r10) - movdqa %xmm4,%xmm2 - - paddd %xmm0,%xmm1 - pcmpeqd %xmm5,%xmm0 - movdqa %xmm3,160(%r10) - 
movdqa %xmm4,%xmm3 - paddd %xmm1,%xmm2 - pcmpeqd %xmm5,%xmm1 - movdqa %xmm0,176(%r10) - movdqa %xmm4,%xmm0 - - paddd %xmm2,%xmm3 - pcmpeqd %xmm5,%xmm2 - movdqa %xmm1,192(%r10) - movdqa %xmm4,%xmm1 - - paddd %xmm3,%xmm0 - pcmpeqd %xmm5,%xmm3 - movdqa %xmm2,208(%r10) - movdqa %xmm4,%xmm2 - - paddd %xmm0,%xmm1 - pcmpeqd %xmm5,%xmm0 - movdqa %xmm3,224(%r10) - movdqa %xmm4,%xmm3 - paddd %xmm1,%xmm2 - pcmpeqd %xmm5,%xmm1 - movdqa %xmm0,240(%r10) - movdqa %xmm4,%xmm0 - - paddd %xmm2,%xmm3 - pcmpeqd %xmm5,%xmm2 - movdqa %xmm1,256(%r10) - movdqa %xmm4,%xmm1 - - paddd %xmm3,%xmm0 - pcmpeqd %xmm5,%xmm3 - movdqa %xmm2,272(%r10) - movdqa %xmm4,%xmm2 - - paddd %xmm0,%xmm1 - pcmpeqd %xmm5,%xmm0 - movdqa %xmm3,288(%r10) - movdqa %xmm4,%xmm3 - paddd %xmm1,%xmm2 - pcmpeqd %xmm5,%xmm1 - movdqa %xmm0,304(%r10) - - paddd %xmm2,%xmm3 -.byte 0x67 - pcmpeqd %xmm5,%xmm2 - movdqa %xmm1,320(%r10) - - pcmpeqd %xmm5,%xmm3 - movdqa %xmm2,336(%r10) - pand 64(%r12),%xmm0 - - pand 80(%r12),%xmm1 - pand 96(%r12),%xmm2 - movdqa %xmm3,352(%r10) - pand 112(%r12),%xmm3 - por %xmm2,%xmm0 - por %xmm3,%xmm1 - movdqa -128(%r12),%xmm4 - movdqa -112(%r12),%xmm5 - movdqa -96(%r12),%xmm2 - pand 112(%r10),%xmm4 - movdqa -80(%r12),%xmm3 - pand 128(%r10),%xmm5 - por %xmm4,%xmm0 - pand 144(%r10),%xmm2 - por %xmm5,%xmm1 - pand 160(%r10),%xmm3 - por %xmm2,%xmm0 - por %xmm3,%xmm1 - movdqa -64(%r12),%xmm4 - movdqa -48(%r12),%xmm5 - movdqa -32(%r12),%xmm2 - pand 176(%r10),%xmm4 - movdqa -16(%r12),%xmm3 - pand 192(%r10),%xmm5 - por %xmm4,%xmm0 - pand 208(%r10),%xmm2 - por %xmm5,%xmm1 - pand 224(%r10),%xmm3 - por %xmm2,%xmm0 - por %xmm3,%xmm1 - movdqa 0(%r12),%xmm4 - movdqa 16(%r12),%xmm5 - movdqa 32(%r12),%xmm2 - pand 240(%r10),%xmm4 - movdqa 48(%r12),%xmm3 - pand 256(%r10),%xmm5 - por %xmm4,%xmm0 - pand 272(%r10),%xmm2 - por %xmm5,%xmm1 - pand 288(%r10),%xmm3 - por %xmm2,%xmm0 - por %xmm3,%xmm1 - por %xmm1,%xmm0 - pshufd $0x4e,%xmm0,%xmm1 - por %xmm1,%xmm0 - leaq 256(%r12),%r12 -.byte 102,72,15,126,195 - - movq 
(%r8),%r8 - movq (%rsi),%rax - - xorq %r14,%r14 - xorq %r15,%r15 - - movq %r8,%rbp - mulq %rbx - movq %rax,%r10 - movq (%rcx),%rax - - imulq %r10,%rbp - movq %rdx,%r11 - - mulq %rbp - addq %rax,%r10 - movq 8(%rsi),%rax - adcq $0,%rdx - movq %rdx,%r13 - - leaq 1(%r15),%r15 - jmp .L1st_enter - -.align 16 -.L1st: - addq %rax,%r13 - movq (%rsi,%r15,8),%rax - adcq $0,%rdx - addq %r11,%r13 - movq %r10,%r11 - adcq $0,%rdx - movq %r13,-16(%rsp,%r15,8) - movq %rdx,%r13 - -.L1st_enter: - mulq %rbx - addq %rax,%r11 - movq (%rcx,%r15,8),%rax - adcq $0,%rdx - leaq 1(%r15),%r15 - movq %rdx,%r10 - - mulq %rbp - cmpq %r9,%r15 - jne .L1st - - - addq %rax,%r13 - adcq $0,%rdx - addq %r11,%r13 - adcq $0,%rdx - movq %r13,-16(%rsp,%r9,8) - movq %rdx,%r13 - movq %r10,%r11 - - xorq %rdx,%rdx - addq %r11,%r13 - adcq $0,%rdx - movq %r13,-8(%rsp,%r9,8) - movq %rdx,(%rsp,%r9,8) - - leaq 1(%r14),%r14 - jmp .Louter -.align 16 -.Louter: - leaq 24+128(%rsp,%r9,8),%rdx - andq $-16,%rdx - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - movdqa -128(%r12),%xmm0 - movdqa -112(%r12),%xmm1 - movdqa -96(%r12),%xmm2 - movdqa -80(%r12),%xmm3 - pand -128(%rdx),%xmm0 - pand -112(%rdx),%xmm1 - por %xmm0,%xmm4 - pand -96(%rdx),%xmm2 - por %xmm1,%xmm5 - pand -80(%rdx),%xmm3 - por %xmm2,%xmm4 - por %xmm3,%xmm5 - movdqa -64(%r12),%xmm0 - movdqa -48(%r12),%xmm1 - movdqa -32(%r12),%xmm2 - movdqa -16(%r12),%xmm3 - pand -64(%rdx),%xmm0 - pand -48(%rdx),%xmm1 - por %xmm0,%xmm4 - pand -32(%rdx),%xmm2 - por %xmm1,%xmm5 - pand -16(%rdx),%xmm3 - por %xmm2,%xmm4 - por %xmm3,%xmm5 - movdqa 0(%r12),%xmm0 - movdqa 16(%r12),%xmm1 - movdqa 32(%r12),%xmm2 - movdqa 48(%r12),%xmm3 - pand 0(%rdx),%xmm0 - pand 16(%rdx),%xmm1 - por %xmm0,%xmm4 - pand 32(%rdx),%xmm2 - por %xmm1,%xmm5 - pand 48(%rdx),%xmm3 - por %xmm2,%xmm4 - por %xmm3,%xmm5 - movdqa 64(%r12),%xmm0 - movdqa 80(%r12),%xmm1 - movdqa 96(%r12),%xmm2 - movdqa 112(%r12),%xmm3 - pand 64(%rdx),%xmm0 - pand 80(%rdx),%xmm1 - por %xmm0,%xmm4 - pand 96(%rdx),%xmm2 - por %xmm1,%xmm5 - pand 
112(%rdx),%xmm3 - por %xmm2,%xmm4 - por %xmm3,%xmm5 - por %xmm5,%xmm4 - pshufd $0x4e,%xmm4,%xmm0 - por %xmm4,%xmm0 - leaq 256(%r12),%r12 - - movq (%rsi),%rax -.byte 102,72,15,126,195 - - xorq %r15,%r15 - movq %r8,%rbp - movq (%rsp),%r10 - - mulq %rbx - addq %rax,%r10 - movq (%rcx),%rax - adcq $0,%rdx - - imulq %r10,%rbp - movq %rdx,%r11 - - mulq %rbp - addq %rax,%r10 - movq 8(%rsi),%rax - adcq $0,%rdx - movq 8(%rsp),%r10 - movq %rdx,%r13 - - leaq 1(%r15),%r15 - jmp .Linner_enter - -.align 16 -.Linner: - addq %rax,%r13 - movq (%rsi,%r15,8),%rax - adcq $0,%rdx - addq %r10,%r13 - movq (%rsp,%r15,8),%r10 - adcq $0,%rdx - movq %r13,-16(%rsp,%r15,8) - movq %rdx,%r13 - -.Linner_enter: - mulq %rbx - addq %rax,%r11 - movq (%rcx,%r15,8),%rax - adcq $0,%rdx - addq %r11,%r10 - movq %rdx,%r11 - adcq $0,%r11 - leaq 1(%r15),%r15 - - mulq %rbp - cmpq %r9,%r15 - jne .Linner - - addq %rax,%r13 - adcq $0,%rdx - addq %r10,%r13 - movq (%rsp,%r9,8),%r10 - adcq $0,%rdx - movq %r13,-16(%rsp,%r9,8) - movq %rdx,%r13 - - xorq %rdx,%rdx - addq %r11,%r13 - adcq $0,%rdx - addq %r10,%r13 - adcq $0,%rdx - movq %r13,-8(%rsp,%r9,8) - movq %rdx,(%rsp,%r9,8) - - leaq 1(%r14),%r14 - cmpq %r9,%r14 - jb .Louter - - xorq %r14,%r14 - movq (%rsp),%rax - leaq (%rsp),%rsi - movq %r9,%r15 - jmp .Lsub -.align 16 -.Lsub: sbbq (%rcx,%r14,8),%rax - movq %rax,(%rdi,%r14,8) - movq 8(%rsi,%r14,8),%rax - leaq 1(%r14),%r14 - decq %r15 - jnz .Lsub - - sbbq $0,%rax - movq $-1,%rbx - xorq %rax,%rbx - xorq %r14,%r14 - movq %r9,%r15 - -.Lcopy: - movq (%rdi,%r14,8),%rcx - movq (%rsp,%r14,8),%rdx - andq %rbx,%rcx - andq %rax,%rdx - movq %r14,(%rsp,%r14,8) - orq %rcx,%rdx - movq %rdx,(%rdi,%r14,8) - leaq 1(%r14),%r14 - subq $1,%r15 - jnz .Lcopy - - movq 8(%rsp,%r9,8),%rsi -.cfi_def_cfa %rsi,8 - movq $1,%rax - - movq -48(%rsi),%r15 -.cfi_restore %r15 - movq -40(%rsi),%r14 -.cfi_restore %r14 - movq -32(%rsi),%r13 -.cfi_restore %r13 - movq -24(%rsi),%r12 -.cfi_restore %r12 - movq -16(%rsi),%rbp -.cfi_restore %rbp - movq 
-8(%rsi),%rbx -.cfi_restore %rbx - leaq (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Lmul_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size bn_mul_mont_gather5,.-bn_mul_mont_gather5 -.type bn_mul4x_mont_gather5,@function -.align 32 -bn_mul4x_mont_gather5: -.cfi_startproc -.byte 0x67 - movq %rsp,%rax -.cfi_def_cfa_register %rax -.Lmul4x_enter: - andl $0x80108,%r11d - cmpl $0x80108,%r11d - je .Lmulx4x_enter - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 -.Lmul4x_prologue: - -.byte 0x67 - shll $3,%r9d - leaq (%r9,%r9,2),%r10 - negq %r9 - - - - - - - - - - - leaq -320(%rsp,%r9,2),%r11 - movq %rsp,%rbp - subq %rdi,%r11 - andq $4095,%r11 - cmpq %r11,%r10 - jb .Lmul4xsp_alt - subq %r11,%rbp - leaq -320(%rbp,%r9,2),%rbp - jmp .Lmul4xsp_done - -.align 32 -.Lmul4xsp_alt: - leaq 4096-320(,%r9,2),%r10 - leaq -320(%rbp,%r9,2),%rbp - subq %r10,%r11 - movq $0,%r10 - cmovcq %r10,%r11 - subq %r11,%rbp -.Lmul4xsp_done: - andq $-64,%rbp - movq %rsp,%r11 - subq %rbp,%r11 - andq $-4096,%r11 - leaq (%r11,%rbp,1),%rsp - movq (%rsp),%r10 - cmpq %rbp,%rsp - ja .Lmul4x_page_walk - jmp .Lmul4x_page_walk_done - -.Lmul4x_page_walk: - leaq -4096(%rsp),%rsp - movq (%rsp),%r10 - cmpq %rbp,%rsp - ja .Lmul4x_page_walk -.Lmul4x_page_walk_done: - - negq %r9 - - movq %rax,40(%rsp) -.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 -.Lmul4x_body: - - call mul4x_internal - - movq 40(%rsp),%rsi -.cfi_def_cfa %rsi,8 - movq $1,%rax - - movq -48(%rsi),%r15 -.cfi_restore %r15 - movq -40(%rsi),%r14 -.cfi_restore %r14 - movq -32(%rsi),%r13 -.cfi_restore %r13 - movq -24(%rsi),%r12 -.cfi_restore %r12 - movq -16(%rsi),%rbp -.cfi_restore %rbp - movq -8(%rsi),%rbx -.cfi_restore %rbx - leaq (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Lmul4x_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size bn_mul4x_mont_gather5,.-bn_mul4x_mont_gather5 - -.type 
mul4x_internal,@function -.align 32 -mul4x_internal: -.cfi_startproc - shlq $5,%r9 - movd 8(%rax),%xmm5 - leaq .Linc(%rip),%rax - leaq 128(%rdx,%r9,1),%r13 - shrq $5,%r9 - movdqa 0(%rax),%xmm0 - movdqa 16(%rax),%xmm1 - leaq 88-112(%rsp,%r9,1),%r10 - leaq 128(%rdx),%r12 - - pshufd $0,%xmm5,%xmm5 - movdqa %xmm1,%xmm4 -.byte 0x67,0x67 - movdqa %xmm1,%xmm2 - paddd %xmm0,%xmm1 - pcmpeqd %xmm5,%xmm0 -.byte 0x67 - movdqa %xmm4,%xmm3 - paddd %xmm1,%xmm2 - pcmpeqd %xmm5,%xmm1 - movdqa %xmm0,112(%r10) - movdqa %xmm4,%xmm0 - - paddd %xmm2,%xmm3 - pcmpeqd %xmm5,%xmm2 - movdqa %xmm1,128(%r10) - movdqa %xmm4,%xmm1 - - paddd %xmm3,%xmm0 - pcmpeqd %xmm5,%xmm3 - movdqa %xmm2,144(%r10) - movdqa %xmm4,%xmm2 - - paddd %xmm0,%xmm1 - pcmpeqd %xmm5,%xmm0 - movdqa %xmm3,160(%r10) - movdqa %xmm4,%xmm3 - paddd %xmm1,%xmm2 - pcmpeqd %xmm5,%xmm1 - movdqa %xmm0,176(%r10) - movdqa %xmm4,%xmm0 - - paddd %xmm2,%xmm3 - pcmpeqd %xmm5,%xmm2 - movdqa %xmm1,192(%r10) - movdqa %xmm4,%xmm1 - - paddd %xmm3,%xmm0 - pcmpeqd %xmm5,%xmm3 - movdqa %xmm2,208(%r10) - movdqa %xmm4,%xmm2 - - paddd %xmm0,%xmm1 - pcmpeqd %xmm5,%xmm0 - movdqa %xmm3,224(%r10) - movdqa %xmm4,%xmm3 - paddd %xmm1,%xmm2 - pcmpeqd %xmm5,%xmm1 - movdqa %xmm0,240(%r10) - movdqa %xmm4,%xmm0 - - paddd %xmm2,%xmm3 - pcmpeqd %xmm5,%xmm2 - movdqa %xmm1,256(%r10) - movdqa %xmm4,%xmm1 - - paddd %xmm3,%xmm0 - pcmpeqd %xmm5,%xmm3 - movdqa %xmm2,272(%r10) - movdqa %xmm4,%xmm2 - - paddd %xmm0,%xmm1 - pcmpeqd %xmm5,%xmm0 - movdqa %xmm3,288(%r10) - movdqa %xmm4,%xmm3 - paddd %xmm1,%xmm2 - pcmpeqd %xmm5,%xmm1 - movdqa %xmm0,304(%r10) - - paddd %xmm2,%xmm3 -.byte 0x67 - pcmpeqd %xmm5,%xmm2 - movdqa %xmm1,320(%r10) - - pcmpeqd %xmm5,%xmm3 - movdqa %xmm2,336(%r10) - pand 64(%r12),%xmm0 - - pand 80(%r12),%xmm1 - pand 96(%r12),%xmm2 - movdqa %xmm3,352(%r10) - pand 112(%r12),%xmm3 - por %xmm2,%xmm0 - por %xmm3,%xmm1 - movdqa -128(%r12),%xmm4 - movdqa -112(%r12),%xmm5 - movdqa -96(%r12),%xmm2 - pand 112(%r10),%xmm4 - movdqa -80(%r12),%xmm3 - pand 
128(%r10),%xmm5 - por %xmm4,%xmm0 - pand 144(%r10),%xmm2 - por %xmm5,%xmm1 - pand 160(%r10),%xmm3 - por %xmm2,%xmm0 - por %xmm3,%xmm1 - movdqa -64(%r12),%xmm4 - movdqa -48(%r12),%xmm5 - movdqa -32(%r12),%xmm2 - pand 176(%r10),%xmm4 - movdqa -16(%r12),%xmm3 - pand 192(%r10),%xmm5 - por %xmm4,%xmm0 - pand 208(%r10),%xmm2 - por %xmm5,%xmm1 - pand 224(%r10),%xmm3 - por %xmm2,%xmm0 - por %xmm3,%xmm1 - movdqa 0(%r12),%xmm4 - movdqa 16(%r12),%xmm5 - movdqa 32(%r12),%xmm2 - pand 240(%r10),%xmm4 - movdqa 48(%r12),%xmm3 - pand 256(%r10),%xmm5 - por %xmm4,%xmm0 - pand 272(%r10),%xmm2 - por %xmm5,%xmm1 - pand 288(%r10),%xmm3 - por %xmm2,%xmm0 - por %xmm3,%xmm1 - por %xmm1,%xmm0 - pshufd $0x4e,%xmm0,%xmm1 - por %xmm1,%xmm0 - leaq 256(%r12),%r12 -.byte 102,72,15,126,195 - - movq %r13,16+8(%rsp) - movq %rdi,56+8(%rsp) - - movq (%r8),%r8 - movq (%rsi),%rax - leaq (%rsi,%r9,1),%rsi - negq %r9 - - movq %r8,%rbp - mulq %rbx - movq %rax,%r10 - movq (%rcx),%rax - - imulq %r10,%rbp - leaq 64+8(%rsp),%r14 - movq %rdx,%r11 - - mulq %rbp - addq %rax,%r10 - movq 8(%rsi,%r9,1),%rax - adcq $0,%rdx - movq %rdx,%rdi - - mulq %rbx - addq %rax,%r11 - movq 8(%rcx),%rax - adcq $0,%rdx - movq %rdx,%r10 - - mulq %rbp - addq %rax,%rdi - movq 16(%rsi,%r9,1),%rax - adcq $0,%rdx - addq %r11,%rdi - leaq 32(%r9),%r15 - leaq 32(%rcx),%rcx - adcq $0,%rdx - movq %rdi,(%r14) - movq %rdx,%r13 - jmp .L1st4x - -.align 32 -.L1st4x: - mulq %rbx - addq %rax,%r10 - movq -16(%rcx),%rax - leaq 32(%r14),%r14 - adcq $0,%rdx - movq %rdx,%r11 - - mulq %rbp - addq %rax,%r13 - movq -8(%rsi,%r15,1),%rax - adcq $0,%rdx - addq %r10,%r13 - adcq $0,%rdx - movq %r13,-24(%r14) - movq %rdx,%rdi - - mulq %rbx - addq %rax,%r11 - movq -8(%rcx),%rax - adcq $0,%rdx - movq %rdx,%r10 - - mulq %rbp - addq %rax,%rdi - movq (%rsi,%r15,1),%rax - adcq $0,%rdx - addq %r11,%rdi - adcq $0,%rdx - movq %rdi,-16(%r14) - movq %rdx,%r13 - - mulq %rbx - addq %rax,%r10 - movq 0(%rcx),%rax - adcq $0,%rdx - movq %rdx,%r11 - - mulq %rbp - addq %rax,%r13 - 
movq 8(%rsi,%r15,1),%rax - adcq $0,%rdx - addq %r10,%r13 - adcq $0,%rdx - movq %r13,-8(%r14) - movq %rdx,%rdi - - mulq %rbx - addq %rax,%r11 - movq 8(%rcx),%rax - adcq $0,%rdx - movq %rdx,%r10 - - mulq %rbp - addq %rax,%rdi - movq 16(%rsi,%r15,1),%rax - adcq $0,%rdx - addq %r11,%rdi - leaq 32(%rcx),%rcx - adcq $0,%rdx - movq %rdi,(%r14) - movq %rdx,%r13 - - addq $32,%r15 - jnz .L1st4x - - mulq %rbx - addq %rax,%r10 - movq -16(%rcx),%rax - leaq 32(%r14),%r14 - adcq $0,%rdx - movq %rdx,%r11 - - mulq %rbp - addq %rax,%r13 - movq -8(%rsi),%rax - adcq $0,%rdx - addq %r10,%r13 - adcq $0,%rdx - movq %r13,-24(%r14) - movq %rdx,%rdi - - mulq %rbx - addq %rax,%r11 - movq -8(%rcx),%rax - adcq $0,%rdx - movq %rdx,%r10 - - mulq %rbp - addq %rax,%rdi - movq (%rsi,%r9,1),%rax - adcq $0,%rdx - addq %r11,%rdi - adcq $0,%rdx - movq %rdi,-16(%r14) - movq %rdx,%r13 - - leaq (%rcx,%r9,1),%rcx - - xorq %rdi,%rdi - addq %r10,%r13 - adcq $0,%rdi - movq %r13,-8(%r14) - - jmp .Louter4x - -.align 32 -.Louter4x: - leaq 16+128(%r14),%rdx - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - movdqa -128(%r12),%xmm0 - movdqa -112(%r12),%xmm1 - movdqa -96(%r12),%xmm2 - movdqa -80(%r12),%xmm3 - pand -128(%rdx),%xmm0 - pand -112(%rdx),%xmm1 - por %xmm0,%xmm4 - pand -96(%rdx),%xmm2 - por %xmm1,%xmm5 - pand -80(%rdx),%xmm3 - por %xmm2,%xmm4 - por %xmm3,%xmm5 - movdqa -64(%r12),%xmm0 - movdqa -48(%r12),%xmm1 - movdqa -32(%r12),%xmm2 - movdqa -16(%r12),%xmm3 - pand -64(%rdx),%xmm0 - pand -48(%rdx),%xmm1 - por %xmm0,%xmm4 - pand -32(%rdx),%xmm2 - por %xmm1,%xmm5 - pand -16(%rdx),%xmm3 - por %xmm2,%xmm4 - por %xmm3,%xmm5 - movdqa 0(%r12),%xmm0 - movdqa 16(%r12),%xmm1 - movdqa 32(%r12),%xmm2 - movdqa 48(%r12),%xmm3 - pand 0(%rdx),%xmm0 - pand 16(%rdx),%xmm1 - por %xmm0,%xmm4 - pand 32(%rdx),%xmm2 - por %xmm1,%xmm5 - pand 48(%rdx),%xmm3 - por %xmm2,%xmm4 - por %xmm3,%xmm5 - movdqa 64(%r12),%xmm0 - movdqa 80(%r12),%xmm1 - movdqa 96(%r12),%xmm2 - movdqa 112(%r12),%xmm3 - pand 64(%rdx),%xmm0 - pand 80(%rdx),%xmm1 - por 
%xmm0,%xmm4 - pand 96(%rdx),%xmm2 - por %xmm1,%xmm5 - pand 112(%rdx),%xmm3 - por %xmm2,%xmm4 - por %xmm3,%xmm5 - por %xmm5,%xmm4 - pshufd $0x4e,%xmm4,%xmm0 - por %xmm4,%xmm0 - leaq 256(%r12),%r12 -.byte 102,72,15,126,195 - - movq (%r14,%r9,1),%r10 - movq %r8,%rbp - mulq %rbx - addq %rax,%r10 - movq (%rcx),%rax - adcq $0,%rdx - - imulq %r10,%rbp - movq %rdx,%r11 - movq %rdi,(%r14) - - leaq (%r14,%r9,1),%r14 - - mulq %rbp - addq %rax,%r10 - movq 8(%rsi,%r9,1),%rax - adcq $0,%rdx - movq %rdx,%rdi - - mulq %rbx - addq %rax,%r11 - movq 8(%rcx),%rax - adcq $0,%rdx - addq 8(%r14),%r11 - adcq $0,%rdx - movq %rdx,%r10 - - mulq %rbp - addq %rax,%rdi - movq 16(%rsi,%r9,1),%rax - adcq $0,%rdx - addq %r11,%rdi - leaq 32(%r9),%r15 - leaq 32(%rcx),%rcx - adcq $0,%rdx - movq %rdx,%r13 - jmp .Linner4x - -.align 32 -.Linner4x: - mulq %rbx - addq %rax,%r10 - movq -16(%rcx),%rax - adcq $0,%rdx - addq 16(%r14),%r10 - leaq 32(%r14),%r14 - adcq $0,%rdx - movq %rdx,%r11 - - mulq %rbp - addq %rax,%r13 - movq -8(%rsi,%r15,1),%rax - adcq $0,%rdx - addq %r10,%r13 - adcq $0,%rdx - movq %rdi,-32(%r14) - movq %rdx,%rdi - - mulq %rbx - addq %rax,%r11 - movq -8(%rcx),%rax - adcq $0,%rdx - addq -8(%r14),%r11 - adcq $0,%rdx - movq %rdx,%r10 - - mulq %rbp - addq %rax,%rdi - movq (%rsi,%r15,1),%rax - adcq $0,%rdx - addq %r11,%rdi - adcq $0,%rdx - movq %r13,-24(%r14) - movq %rdx,%r13 - - mulq %rbx - addq %rax,%r10 - movq 0(%rcx),%rax - adcq $0,%rdx - addq (%r14),%r10 - adcq $0,%rdx - movq %rdx,%r11 - - mulq %rbp - addq %rax,%r13 - movq 8(%rsi,%r15,1),%rax - adcq $0,%rdx - addq %r10,%r13 - adcq $0,%rdx - movq %rdi,-16(%r14) - movq %rdx,%rdi - - mulq %rbx - addq %rax,%r11 - movq 8(%rcx),%rax - adcq $0,%rdx - addq 8(%r14),%r11 - adcq $0,%rdx - movq %rdx,%r10 - - mulq %rbp - addq %rax,%rdi - movq 16(%rsi,%r15,1),%rax - adcq $0,%rdx - addq %r11,%rdi - leaq 32(%rcx),%rcx - adcq $0,%rdx - movq %r13,-8(%r14) - movq %rdx,%r13 - - addq $32,%r15 - jnz .Linner4x - - mulq %rbx - addq %rax,%r10 - movq -16(%rcx),%rax 
- adcq $0,%rdx - addq 16(%r14),%r10 - leaq 32(%r14),%r14 - adcq $0,%rdx - movq %rdx,%r11 - - mulq %rbp - addq %rax,%r13 - movq -8(%rsi),%rax - adcq $0,%rdx - addq %r10,%r13 - adcq $0,%rdx - movq %rdi,-32(%r14) - movq %rdx,%rdi - - mulq %rbx - addq %rax,%r11 - movq %rbp,%rax - movq -8(%rcx),%rbp - adcq $0,%rdx - addq -8(%r14),%r11 - adcq $0,%rdx - movq %rdx,%r10 - - mulq %rbp - addq %rax,%rdi - movq (%rsi,%r9,1),%rax - adcq $0,%rdx - addq %r11,%rdi - adcq $0,%rdx - movq %r13,-24(%r14) - movq %rdx,%r13 - - movq %rdi,-16(%r14) - leaq (%rcx,%r9,1),%rcx - - xorq %rdi,%rdi - addq %r10,%r13 - adcq $0,%rdi - addq (%r14),%r13 - adcq $0,%rdi - movq %r13,-8(%r14) - - cmpq 16+8(%rsp),%r12 - jb .Louter4x - xorq %rax,%rax - subq %r13,%rbp - adcq %r15,%r15 - orq %r15,%rdi - subq %rdi,%rax - leaq (%r14,%r9,1),%rbx - movq (%rcx),%r12 - leaq (%rcx),%rbp - movq %r9,%rcx - sarq $3+2,%rcx - movq 56+8(%rsp),%rdi - decq %r12 - xorq %r10,%r10 - movq 8(%rbp),%r13 - movq 16(%rbp),%r14 - movq 24(%rbp),%r15 - jmp .Lsqr4x_sub_entry -.cfi_endproc -.size mul4x_internal,.-mul4x_internal -.globl bn_power5 -.hidden bn_power5 -.type bn_power5,@function -.align 32 -bn_power5: -.cfi_startproc - movq %rsp,%rax -.cfi_def_cfa_register %rax - leaq OPENSSL_ia32cap_P(%rip),%r11 - movl 8(%r11),%r11d - andl $0x80108,%r11d - cmpl $0x80108,%r11d - je .Lpowerx5_enter - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 -.Lpower5_prologue: - - shll $3,%r9d - leal (%r9,%r9,2),%r10d - negq %r9 - movq (%r8),%r8 - - - - - - - - - leaq -320(%rsp,%r9,2),%r11 - movq %rsp,%rbp - subq %rdi,%r11 - andq $4095,%r11 - cmpq %r11,%r10 - jb .Lpwr_sp_alt - subq %r11,%rbp - leaq -320(%rbp,%r9,2),%rbp - jmp .Lpwr_sp_done - -.align 32 -.Lpwr_sp_alt: - leaq 4096-320(,%r9,2),%r10 - leaq -320(%rbp,%r9,2),%rbp - subq %r10,%r11 - movq $0,%r10 - cmovcq %r10,%r11 - subq %r11,%rbp 
-.Lpwr_sp_done: - andq $-64,%rbp - movq %rsp,%r11 - subq %rbp,%r11 - andq $-4096,%r11 - leaq (%r11,%rbp,1),%rsp - movq (%rsp),%r10 - cmpq %rbp,%rsp - ja .Lpwr_page_walk - jmp .Lpwr_page_walk_done - -.Lpwr_page_walk: - leaq -4096(%rsp),%rsp - movq (%rsp),%r10 - cmpq %rbp,%rsp - ja .Lpwr_page_walk -.Lpwr_page_walk_done: - - movq %r9,%r10 - negq %r9 - - - - - - - - - - - movq %r8,32(%rsp) - movq %rax,40(%rsp) -.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 -.Lpower5_body: -.byte 102,72,15,110,207 -.byte 102,72,15,110,209 -.byte 102,73,15,110,218 -.byte 102,72,15,110,226 - - call __bn_sqr8x_internal - call __bn_post4x_internal - call __bn_sqr8x_internal - call __bn_post4x_internal - call __bn_sqr8x_internal - call __bn_post4x_internal - call __bn_sqr8x_internal - call __bn_post4x_internal - call __bn_sqr8x_internal - call __bn_post4x_internal - -.byte 102,72,15,126,209 -.byte 102,72,15,126,226 - movq %rsi,%rdi - movq 40(%rsp),%rax - leaq 32(%rsp),%r8 - - call mul4x_internal - - movq 40(%rsp),%rsi -.cfi_def_cfa %rsi,8 - movq $1,%rax - movq -48(%rsi),%r15 -.cfi_restore %r15 - movq -40(%rsi),%r14 -.cfi_restore %r14 - movq -32(%rsi),%r13 -.cfi_restore %r13 - movq -24(%rsi),%r12 -.cfi_restore %r12 - movq -16(%rsi),%rbp -.cfi_restore %rbp - movq -8(%rsi),%rbx -.cfi_restore %rbx - leaq (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Lpower5_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size bn_power5,.-bn_power5 - -.globl bn_sqr8x_internal -.hidden bn_sqr8x_internal -.hidden bn_sqr8x_internal -.type bn_sqr8x_internal,@function -.align 32 -bn_sqr8x_internal: -__bn_sqr8x_internal: -.cfi_startproc - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - leaq 32(%r10),%rbp - leaq (%rsi,%r9,1),%rsi - - movq %r9,%rcx - - - movq -32(%rsi,%rbp,1),%r14 - leaq 48+8(%rsp,%r9,2),%rdi - movq -24(%rsi,%rbp,1),%rax - leaq -32(%rdi,%rbp,1),%rdi - movq -16(%rsi,%rbp,1),%rbx - movq %rax,%r15 - - mulq %r14 - movq 
%rax,%r10 - movq %rbx,%rax - movq %rdx,%r11 - movq %r10,-24(%rdi,%rbp,1) - - mulq %r14 - addq %rax,%r11 - movq %rbx,%rax - adcq $0,%rdx - movq %r11,-16(%rdi,%rbp,1) - movq %rdx,%r10 - - - movq -8(%rsi,%rbp,1),%rbx - mulq %r15 - movq %rax,%r12 - movq %rbx,%rax - movq %rdx,%r13 - - leaq (%rbp),%rcx - mulq %r14 - addq %rax,%r10 - movq %rbx,%rax - movq %rdx,%r11 - adcq $0,%r11 - addq %r12,%r10 - adcq $0,%r11 - movq %r10,-8(%rdi,%rcx,1) - jmp .Lsqr4x_1st - -.align 32 -.Lsqr4x_1st: - movq (%rsi,%rcx,1),%rbx - mulq %r15 - addq %rax,%r13 - movq %rbx,%rax - movq %rdx,%r12 - adcq $0,%r12 - - mulq %r14 - addq %rax,%r11 - movq %rbx,%rax - movq 8(%rsi,%rcx,1),%rbx - movq %rdx,%r10 - adcq $0,%r10 - addq %r13,%r11 - adcq $0,%r10 - - - mulq %r15 - addq %rax,%r12 - movq %rbx,%rax - movq %r11,(%rdi,%rcx,1) - movq %rdx,%r13 - adcq $0,%r13 - - mulq %r14 - addq %rax,%r10 - movq %rbx,%rax - movq 16(%rsi,%rcx,1),%rbx - movq %rdx,%r11 - adcq $0,%r11 - addq %r12,%r10 - adcq $0,%r11 - - mulq %r15 - addq %rax,%r13 - movq %rbx,%rax - movq %r10,8(%rdi,%rcx,1) - movq %rdx,%r12 - adcq $0,%r12 - - mulq %r14 - addq %rax,%r11 - movq %rbx,%rax - movq 24(%rsi,%rcx,1),%rbx - movq %rdx,%r10 - adcq $0,%r10 - addq %r13,%r11 - adcq $0,%r10 - - - mulq %r15 - addq %rax,%r12 - movq %rbx,%rax - movq %r11,16(%rdi,%rcx,1) - movq %rdx,%r13 - adcq $0,%r13 - leaq 32(%rcx),%rcx - - mulq %r14 - addq %rax,%r10 - movq %rbx,%rax - movq %rdx,%r11 - adcq $0,%r11 - addq %r12,%r10 - adcq $0,%r11 - movq %r10,-8(%rdi,%rcx,1) - - cmpq $0,%rcx - jne .Lsqr4x_1st - - mulq %r15 - addq %rax,%r13 - leaq 16(%rbp),%rbp - adcq $0,%rdx - addq %r11,%r13 - adcq $0,%rdx - - movq %r13,(%rdi) - movq %rdx,%r12 - movq %rdx,8(%rdi) - jmp .Lsqr4x_outer - -.align 32 -.Lsqr4x_outer: - movq -32(%rsi,%rbp,1),%r14 - leaq 48+8(%rsp,%r9,2),%rdi - movq -24(%rsi,%rbp,1),%rax - leaq -32(%rdi,%rbp,1),%rdi - movq -16(%rsi,%rbp,1),%rbx - movq %rax,%r15 - - mulq %r14 - movq -24(%rdi,%rbp,1),%r10 - addq %rax,%r10 - movq %rbx,%rax - adcq $0,%rdx - movq 
%r10,-24(%rdi,%rbp,1) - movq %rdx,%r11 - - mulq %r14 - addq %rax,%r11 - movq %rbx,%rax - adcq $0,%rdx - addq -16(%rdi,%rbp,1),%r11 - movq %rdx,%r10 - adcq $0,%r10 - movq %r11,-16(%rdi,%rbp,1) - - xorq %r12,%r12 - - movq -8(%rsi,%rbp,1),%rbx - mulq %r15 - addq %rax,%r12 - movq %rbx,%rax - adcq $0,%rdx - addq -8(%rdi,%rbp,1),%r12 - movq %rdx,%r13 - adcq $0,%r13 - - mulq %r14 - addq %rax,%r10 - movq %rbx,%rax - adcq $0,%rdx - addq %r12,%r10 - movq %rdx,%r11 - adcq $0,%r11 - movq %r10,-8(%rdi,%rbp,1) - - leaq (%rbp),%rcx - jmp .Lsqr4x_inner - -.align 32 -.Lsqr4x_inner: - movq (%rsi,%rcx,1),%rbx - mulq %r15 - addq %rax,%r13 - movq %rbx,%rax - movq %rdx,%r12 - adcq $0,%r12 - addq (%rdi,%rcx,1),%r13 - adcq $0,%r12 - -.byte 0x67 - mulq %r14 - addq %rax,%r11 - movq %rbx,%rax - movq 8(%rsi,%rcx,1),%rbx - movq %rdx,%r10 - adcq $0,%r10 - addq %r13,%r11 - adcq $0,%r10 - - mulq %r15 - addq %rax,%r12 - movq %r11,(%rdi,%rcx,1) - movq %rbx,%rax - movq %rdx,%r13 - adcq $0,%r13 - addq 8(%rdi,%rcx,1),%r12 - leaq 16(%rcx),%rcx - adcq $0,%r13 - - mulq %r14 - addq %rax,%r10 - movq %rbx,%rax - adcq $0,%rdx - addq %r12,%r10 - movq %rdx,%r11 - adcq $0,%r11 - movq %r10,-8(%rdi,%rcx,1) - - cmpq $0,%rcx - jne .Lsqr4x_inner - -.byte 0x67 - mulq %r15 - addq %rax,%r13 - adcq $0,%rdx - addq %r11,%r13 - adcq $0,%rdx - - movq %r13,(%rdi) - movq %rdx,%r12 - movq %rdx,8(%rdi) - - addq $16,%rbp - jnz .Lsqr4x_outer - - - movq -32(%rsi),%r14 - leaq 48+8(%rsp,%r9,2),%rdi - movq -24(%rsi),%rax - leaq -32(%rdi,%rbp,1),%rdi - movq -16(%rsi),%rbx - movq %rax,%r15 - - mulq %r14 - addq %rax,%r10 - movq %rbx,%rax - movq %rdx,%r11 - adcq $0,%r11 - - mulq %r14 - addq %rax,%r11 - movq %rbx,%rax - movq %r10,-24(%rdi) - movq %rdx,%r10 - adcq $0,%r10 - addq %r13,%r11 - movq -8(%rsi),%rbx - adcq $0,%r10 - - mulq %r15 - addq %rax,%r12 - movq %rbx,%rax - movq %r11,-16(%rdi) - movq %rdx,%r13 - adcq $0,%r13 - - mulq %r14 - addq %rax,%r10 - movq %rbx,%rax - movq %rdx,%r11 - adcq $0,%r11 - addq %r12,%r10 - adcq $0,%r11 - 
movq %r10,-8(%rdi) - - mulq %r15 - addq %rax,%r13 - movq -16(%rsi),%rax - adcq $0,%rdx - addq %r11,%r13 - adcq $0,%rdx - - movq %r13,(%rdi) - movq %rdx,%r12 - movq %rdx,8(%rdi) - - mulq %rbx - addq $16,%rbp - xorq %r14,%r14 - subq %r9,%rbp - xorq %r15,%r15 - - addq %r12,%rax - adcq $0,%rdx - movq %rax,8(%rdi) - movq %rdx,16(%rdi) - movq %r15,24(%rdi) - - movq -16(%rsi,%rbp,1),%rax - leaq 48+8(%rsp),%rdi - xorq %r10,%r10 - movq 8(%rdi),%r11 - - leaq (%r14,%r10,2),%r12 - shrq $63,%r10 - leaq (%rcx,%r11,2),%r13 - shrq $63,%r11 - orq %r10,%r13 - movq 16(%rdi),%r10 - movq %r11,%r14 - mulq %rax - negq %r15 - movq 24(%rdi),%r11 - adcq %rax,%r12 - movq -8(%rsi,%rbp,1),%rax - movq %r12,(%rdi) - adcq %rdx,%r13 - - leaq (%r14,%r10,2),%rbx - movq %r13,8(%rdi) - sbbq %r15,%r15 - shrq $63,%r10 - leaq (%rcx,%r11,2),%r8 - shrq $63,%r11 - orq %r10,%r8 - movq 32(%rdi),%r10 - movq %r11,%r14 - mulq %rax - negq %r15 - movq 40(%rdi),%r11 - adcq %rax,%rbx - movq 0(%rsi,%rbp,1),%rax - movq %rbx,16(%rdi) - adcq %rdx,%r8 - leaq 16(%rbp),%rbp - movq %r8,24(%rdi) - sbbq %r15,%r15 - leaq 64(%rdi),%rdi - jmp .Lsqr4x_shift_n_add - -.align 32 -.Lsqr4x_shift_n_add: - leaq (%r14,%r10,2),%r12 - shrq $63,%r10 - leaq (%rcx,%r11,2),%r13 - shrq $63,%r11 - orq %r10,%r13 - movq -16(%rdi),%r10 - movq %r11,%r14 - mulq %rax - negq %r15 - movq -8(%rdi),%r11 - adcq %rax,%r12 - movq -8(%rsi,%rbp,1),%rax - movq %r12,-32(%rdi) - adcq %rdx,%r13 - - leaq (%r14,%r10,2),%rbx - movq %r13,-24(%rdi) - sbbq %r15,%r15 - shrq $63,%r10 - leaq (%rcx,%r11,2),%r8 - shrq $63,%r11 - orq %r10,%r8 - movq 0(%rdi),%r10 - movq %r11,%r14 - mulq %rax - negq %r15 - movq 8(%rdi),%r11 - adcq %rax,%rbx - movq 0(%rsi,%rbp,1),%rax - movq %rbx,-16(%rdi) - adcq %rdx,%r8 - - leaq (%r14,%r10,2),%r12 - movq %r8,-8(%rdi) - sbbq %r15,%r15 - shrq $63,%r10 - leaq (%rcx,%r11,2),%r13 - shrq $63,%r11 - orq %r10,%r13 - movq 16(%rdi),%r10 - movq %r11,%r14 - mulq %rax - negq %r15 - movq 24(%rdi),%r11 - adcq %rax,%r12 - movq 8(%rsi,%rbp,1),%rax - movq 
%r12,0(%rdi) - adcq %rdx,%r13 - - leaq (%r14,%r10,2),%rbx - movq %r13,8(%rdi) - sbbq %r15,%r15 - shrq $63,%r10 - leaq (%rcx,%r11,2),%r8 - shrq $63,%r11 - orq %r10,%r8 - movq 32(%rdi),%r10 - movq %r11,%r14 - mulq %rax - negq %r15 - movq 40(%rdi),%r11 - adcq %rax,%rbx - movq 16(%rsi,%rbp,1),%rax - movq %rbx,16(%rdi) - adcq %rdx,%r8 - movq %r8,24(%rdi) - sbbq %r15,%r15 - leaq 64(%rdi),%rdi - addq $32,%rbp - jnz .Lsqr4x_shift_n_add - - leaq (%r14,%r10,2),%r12 -.byte 0x67 - shrq $63,%r10 - leaq (%rcx,%r11,2),%r13 - shrq $63,%r11 - orq %r10,%r13 - movq -16(%rdi),%r10 - movq %r11,%r14 - mulq %rax - negq %r15 - movq -8(%rdi),%r11 - adcq %rax,%r12 - movq -8(%rsi),%rax - movq %r12,-32(%rdi) - adcq %rdx,%r13 - - leaq (%r14,%r10,2),%rbx - movq %r13,-24(%rdi) - sbbq %r15,%r15 - shrq $63,%r10 - leaq (%rcx,%r11,2),%r8 - shrq $63,%r11 - orq %r10,%r8 - mulq %rax - negq %r15 - adcq %rax,%rbx - adcq %rdx,%r8 - movq %rbx,-16(%rdi) - movq %r8,-8(%rdi) -.byte 102,72,15,126,213 -__bn_sqr8x_reduction: - xorq %rax,%rax - leaq (%r9,%rbp,1),%rcx - leaq 48+8(%rsp,%r9,2),%rdx - movq %rcx,0+8(%rsp) - leaq 48+8(%rsp,%r9,1),%rdi - movq %rdx,8+8(%rsp) - negq %r9 - jmp .L8x_reduction_loop - -.align 32 -.L8x_reduction_loop: - leaq (%rdi,%r9,1),%rdi -.byte 0x66 - movq 0(%rdi),%rbx - movq 8(%rdi),%r9 - movq 16(%rdi),%r10 - movq 24(%rdi),%r11 - movq 32(%rdi),%r12 - movq 40(%rdi),%r13 - movq 48(%rdi),%r14 - movq 56(%rdi),%r15 - movq %rax,(%rdx) - leaq 64(%rdi),%rdi - -.byte 0x67 - movq %rbx,%r8 - imulq 32+8(%rsp),%rbx - movq 0(%rbp),%rax - movl $8,%ecx - jmp .L8x_reduce - -.align 32 -.L8x_reduce: - mulq %rbx - movq 8(%rbp),%rax - negq %r8 - movq %rdx,%r8 - adcq $0,%r8 - - mulq %rbx - addq %rax,%r9 - movq 16(%rbp),%rax - adcq $0,%rdx - addq %r9,%r8 - movq %rbx,48-8+8(%rsp,%rcx,8) - movq %rdx,%r9 - adcq $0,%r9 - - mulq %rbx - addq %rax,%r10 - movq 24(%rbp),%rax - adcq $0,%rdx - addq %r10,%r9 - movq 32+8(%rsp),%rsi - movq %rdx,%r10 - adcq $0,%r10 - - mulq %rbx - addq %rax,%r11 - movq 32(%rbp),%rax - adcq 
$0,%rdx - imulq %r8,%rsi - addq %r11,%r10 - movq %rdx,%r11 - adcq $0,%r11 - - mulq %rbx - addq %rax,%r12 - movq 40(%rbp),%rax - adcq $0,%rdx - addq %r12,%r11 - movq %rdx,%r12 - adcq $0,%r12 - - mulq %rbx - addq %rax,%r13 - movq 48(%rbp),%rax - adcq $0,%rdx - addq %r13,%r12 - movq %rdx,%r13 - adcq $0,%r13 - - mulq %rbx - addq %rax,%r14 - movq 56(%rbp),%rax - adcq $0,%rdx - addq %r14,%r13 - movq %rdx,%r14 - adcq $0,%r14 - - mulq %rbx - movq %rsi,%rbx - addq %rax,%r15 - movq 0(%rbp),%rax - adcq $0,%rdx - addq %r15,%r14 - movq %rdx,%r15 - adcq $0,%r15 - - decl %ecx - jnz .L8x_reduce - - leaq 64(%rbp),%rbp - xorq %rax,%rax - movq 8+8(%rsp),%rdx - cmpq 0+8(%rsp),%rbp - jae .L8x_no_tail - -.byte 0x66 - addq 0(%rdi),%r8 - adcq 8(%rdi),%r9 - adcq 16(%rdi),%r10 - adcq 24(%rdi),%r11 - adcq 32(%rdi),%r12 - adcq 40(%rdi),%r13 - adcq 48(%rdi),%r14 - adcq 56(%rdi),%r15 - sbbq %rsi,%rsi - - movq 48+56+8(%rsp),%rbx - movl $8,%ecx - movq 0(%rbp),%rax - jmp .L8x_tail - -.align 32 -.L8x_tail: - mulq %rbx - addq %rax,%r8 - movq 8(%rbp),%rax - movq %r8,(%rdi) - movq %rdx,%r8 - adcq $0,%r8 - - mulq %rbx - addq %rax,%r9 - movq 16(%rbp),%rax - adcq $0,%rdx - addq %r9,%r8 - leaq 8(%rdi),%rdi - movq %rdx,%r9 - adcq $0,%r9 - - mulq %rbx - addq %rax,%r10 - movq 24(%rbp),%rax - adcq $0,%rdx - addq %r10,%r9 - movq %rdx,%r10 - adcq $0,%r10 - - mulq %rbx - addq %rax,%r11 - movq 32(%rbp),%rax - adcq $0,%rdx - addq %r11,%r10 - movq %rdx,%r11 - adcq $0,%r11 - - mulq %rbx - addq %rax,%r12 - movq 40(%rbp),%rax - adcq $0,%rdx - addq %r12,%r11 - movq %rdx,%r12 - adcq $0,%r12 - - mulq %rbx - addq %rax,%r13 - movq 48(%rbp),%rax - adcq $0,%rdx - addq %r13,%r12 - movq %rdx,%r13 - adcq $0,%r13 - - mulq %rbx - addq %rax,%r14 - movq 56(%rbp),%rax - adcq $0,%rdx - addq %r14,%r13 - movq %rdx,%r14 - adcq $0,%r14 - - mulq %rbx - movq 48-16+8(%rsp,%rcx,8),%rbx - addq %rax,%r15 - adcq $0,%rdx - addq %r15,%r14 - movq 0(%rbp),%rax - movq %rdx,%r15 - adcq $0,%r15 - - decl %ecx - jnz .L8x_tail - - leaq 64(%rbp),%rbp - 
movq 8+8(%rsp),%rdx - cmpq 0+8(%rsp),%rbp - jae .L8x_tail_done - - movq 48+56+8(%rsp),%rbx - negq %rsi - movq 0(%rbp),%rax - adcq 0(%rdi),%r8 - adcq 8(%rdi),%r9 - adcq 16(%rdi),%r10 - adcq 24(%rdi),%r11 - adcq 32(%rdi),%r12 - adcq 40(%rdi),%r13 - adcq 48(%rdi),%r14 - adcq 56(%rdi),%r15 - sbbq %rsi,%rsi - - movl $8,%ecx - jmp .L8x_tail - -.align 32 -.L8x_tail_done: - xorq %rax,%rax - addq (%rdx),%r8 - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - adcq $0,%r14 - adcq $0,%r15 - adcq $0,%rax - - negq %rsi -.L8x_no_tail: - adcq 0(%rdi),%r8 - adcq 8(%rdi),%r9 - adcq 16(%rdi),%r10 - adcq 24(%rdi),%r11 - adcq 32(%rdi),%r12 - adcq 40(%rdi),%r13 - adcq 48(%rdi),%r14 - adcq 56(%rdi),%r15 - adcq $0,%rax - movq -8(%rbp),%rcx - xorq %rsi,%rsi - -.byte 102,72,15,126,213 - - movq %r8,0(%rdi) - movq %r9,8(%rdi) -.byte 102,73,15,126,217 - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - movq %r14,48(%rdi) - movq %r15,56(%rdi) - leaq 64(%rdi),%rdi - - cmpq %rdx,%rdi - jb .L8x_reduction_loop - .byte 0xf3,0xc3 -.cfi_endproc -.size bn_sqr8x_internal,.-bn_sqr8x_internal -.type __bn_post4x_internal,@function -.align 32 -__bn_post4x_internal: -.cfi_startproc - movq 0(%rbp),%r12 - leaq (%rdi,%r9,1),%rbx - movq %r9,%rcx -.byte 102,72,15,126,207 - negq %rax -.byte 102,72,15,126,206 - sarq $3+2,%rcx - decq %r12 - xorq %r10,%r10 - movq 8(%rbp),%r13 - movq 16(%rbp),%r14 - movq 24(%rbp),%r15 - jmp .Lsqr4x_sub_entry - -.align 16 -.Lsqr4x_sub: - movq 0(%rbp),%r12 - movq 8(%rbp),%r13 - movq 16(%rbp),%r14 - movq 24(%rbp),%r15 -.Lsqr4x_sub_entry: - leaq 32(%rbp),%rbp - notq %r12 - notq %r13 - notq %r14 - notq %r15 - andq %rax,%r12 - andq %rax,%r13 - andq %rax,%r14 - andq %rax,%r15 - - negq %r10 - adcq 0(%rbx),%r12 - adcq 8(%rbx),%r13 - adcq 16(%rbx),%r14 - adcq 24(%rbx),%r15 - movq %r12,0(%rdi) - leaq 32(%rbx),%rbx - movq %r13,8(%rdi) - sbbq %r10,%r10 - movq %r14,16(%rdi) - movq %r15,24(%rdi) - leaq 32(%rdi),%rdi - - incq %rcx - jnz 
.Lsqr4x_sub - - movq %r9,%r10 - negq %r9 - .byte 0xf3,0xc3 -.cfi_endproc -.size __bn_post4x_internal,.-__bn_post4x_internal -.type bn_mulx4x_mont_gather5,@function -.align 32 -bn_mulx4x_mont_gather5: -.cfi_startproc - movq %rsp,%rax -.cfi_def_cfa_register %rax -.Lmulx4x_enter: - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 -.Lmulx4x_prologue: - - shll $3,%r9d - leaq (%r9,%r9,2),%r10 - negq %r9 - movq (%r8),%r8 - - - - - - - - - - - leaq -320(%rsp,%r9,2),%r11 - movq %rsp,%rbp - subq %rdi,%r11 - andq $4095,%r11 - cmpq %r11,%r10 - jb .Lmulx4xsp_alt - subq %r11,%rbp - leaq -320(%rbp,%r9,2),%rbp - jmp .Lmulx4xsp_done - -.Lmulx4xsp_alt: - leaq 4096-320(,%r9,2),%r10 - leaq -320(%rbp,%r9,2),%rbp - subq %r10,%r11 - movq $0,%r10 - cmovcq %r10,%r11 - subq %r11,%rbp -.Lmulx4xsp_done: - andq $-64,%rbp - movq %rsp,%r11 - subq %rbp,%r11 - andq $-4096,%r11 - leaq (%r11,%rbp,1),%rsp - movq (%rsp),%r10 - cmpq %rbp,%rsp - ja .Lmulx4x_page_walk - jmp .Lmulx4x_page_walk_done - -.Lmulx4x_page_walk: - leaq -4096(%rsp),%rsp - movq (%rsp),%r10 - cmpq %rbp,%rsp - ja .Lmulx4x_page_walk -.Lmulx4x_page_walk_done: - - - - - - - - - - - - - - movq %r8,32(%rsp) - movq %rax,40(%rsp) -.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 -.Lmulx4x_body: - call mulx4x_internal - - movq 40(%rsp),%rsi -.cfi_def_cfa %rsi,8 - movq $1,%rax - - movq -48(%rsi),%r15 -.cfi_restore %r15 - movq -40(%rsi),%r14 -.cfi_restore %r14 - movq -32(%rsi),%r13 -.cfi_restore %r13 - movq -24(%rsi),%r12 -.cfi_restore %r12 - movq -16(%rsi),%rbp -.cfi_restore %rbp - movq -8(%rsi),%rbx -.cfi_restore %rbx - leaq (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Lmulx4x_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size bn_mulx4x_mont_gather5,.-bn_mulx4x_mont_gather5 - -.type mulx4x_internal,@function -.align 32 -mulx4x_internal: -.cfi_startproc - movq %r9,8(%rsp) - movq %r9,%r10 - 
negq %r9 - shlq $5,%r9 - negq %r10 - leaq 128(%rdx,%r9,1),%r13 - shrq $5+5,%r9 - movd 8(%rax),%xmm5 - subq $1,%r9 - leaq .Linc(%rip),%rax - movq %r13,16+8(%rsp) - movq %r9,24+8(%rsp) - movq %rdi,56+8(%rsp) - movdqa 0(%rax),%xmm0 - movdqa 16(%rax),%xmm1 - leaq 88-112(%rsp,%r10,1),%r10 - leaq 128(%rdx),%rdi - - pshufd $0,%xmm5,%xmm5 - movdqa %xmm1,%xmm4 -.byte 0x67 - movdqa %xmm1,%xmm2 -.byte 0x67 - paddd %xmm0,%xmm1 - pcmpeqd %xmm5,%xmm0 - movdqa %xmm4,%xmm3 - paddd %xmm1,%xmm2 - pcmpeqd %xmm5,%xmm1 - movdqa %xmm0,112(%r10) - movdqa %xmm4,%xmm0 - - paddd %xmm2,%xmm3 - pcmpeqd %xmm5,%xmm2 - movdqa %xmm1,128(%r10) - movdqa %xmm4,%xmm1 - - paddd %xmm3,%xmm0 - pcmpeqd %xmm5,%xmm3 - movdqa %xmm2,144(%r10) - movdqa %xmm4,%xmm2 - - paddd %xmm0,%xmm1 - pcmpeqd %xmm5,%xmm0 - movdqa %xmm3,160(%r10) - movdqa %xmm4,%xmm3 - paddd %xmm1,%xmm2 - pcmpeqd %xmm5,%xmm1 - movdqa %xmm0,176(%r10) - movdqa %xmm4,%xmm0 - - paddd %xmm2,%xmm3 - pcmpeqd %xmm5,%xmm2 - movdqa %xmm1,192(%r10) - movdqa %xmm4,%xmm1 - - paddd %xmm3,%xmm0 - pcmpeqd %xmm5,%xmm3 - movdqa %xmm2,208(%r10) - movdqa %xmm4,%xmm2 - - paddd %xmm0,%xmm1 - pcmpeqd %xmm5,%xmm0 - movdqa %xmm3,224(%r10) - movdqa %xmm4,%xmm3 - paddd %xmm1,%xmm2 - pcmpeqd %xmm5,%xmm1 - movdqa %xmm0,240(%r10) - movdqa %xmm4,%xmm0 - - paddd %xmm2,%xmm3 - pcmpeqd %xmm5,%xmm2 - movdqa %xmm1,256(%r10) - movdqa %xmm4,%xmm1 - - paddd %xmm3,%xmm0 - pcmpeqd %xmm5,%xmm3 - movdqa %xmm2,272(%r10) - movdqa %xmm4,%xmm2 - - paddd %xmm0,%xmm1 - pcmpeqd %xmm5,%xmm0 - movdqa %xmm3,288(%r10) - movdqa %xmm4,%xmm3 -.byte 0x67 - paddd %xmm1,%xmm2 - pcmpeqd %xmm5,%xmm1 - movdqa %xmm0,304(%r10) - - paddd %xmm2,%xmm3 - pcmpeqd %xmm5,%xmm2 - movdqa %xmm1,320(%r10) - - pcmpeqd %xmm5,%xmm3 - movdqa %xmm2,336(%r10) - - pand 64(%rdi),%xmm0 - pand 80(%rdi),%xmm1 - pand 96(%rdi),%xmm2 - movdqa %xmm3,352(%r10) - pand 112(%rdi),%xmm3 - por %xmm2,%xmm0 - por %xmm3,%xmm1 - movdqa -128(%rdi),%xmm4 - movdqa -112(%rdi),%xmm5 - movdqa -96(%rdi),%xmm2 - pand 112(%r10),%xmm4 - movdqa 
-80(%rdi),%xmm3 - pand 128(%r10),%xmm5 - por %xmm4,%xmm0 - pand 144(%r10),%xmm2 - por %xmm5,%xmm1 - pand 160(%r10),%xmm3 - por %xmm2,%xmm0 - por %xmm3,%xmm1 - movdqa -64(%rdi),%xmm4 - movdqa -48(%rdi),%xmm5 - movdqa -32(%rdi),%xmm2 - pand 176(%r10),%xmm4 - movdqa -16(%rdi),%xmm3 - pand 192(%r10),%xmm5 - por %xmm4,%xmm0 - pand 208(%r10),%xmm2 - por %xmm5,%xmm1 - pand 224(%r10),%xmm3 - por %xmm2,%xmm0 - por %xmm3,%xmm1 - movdqa 0(%rdi),%xmm4 - movdqa 16(%rdi),%xmm5 - movdqa 32(%rdi),%xmm2 - pand 240(%r10),%xmm4 - movdqa 48(%rdi),%xmm3 - pand 256(%r10),%xmm5 - por %xmm4,%xmm0 - pand 272(%r10),%xmm2 - por %xmm5,%xmm1 - pand 288(%r10),%xmm3 - por %xmm2,%xmm0 - por %xmm3,%xmm1 - pxor %xmm1,%xmm0 - pshufd $0x4e,%xmm0,%xmm1 - por %xmm1,%xmm0 - leaq 256(%rdi),%rdi -.byte 102,72,15,126,194 - leaq 64+32+8(%rsp),%rbx - - movq %rdx,%r9 - mulxq 0(%rsi),%r8,%rax - mulxq 8(%rsi),%r11,%r12 - addq %rax,%r11 - mulxq 16(%rsi),%rax,%r13 - adcq %rax,%r12 - adcq $0,%r13 - mulxq 24(%rsi),%rax,%r14 - - movq %r8,%r15 - imulq 32+8(%rsp),%r8 - xorq %rbp,%rbp - movq %r8,%rdx - - movq %rdi,8+8(%rsp) - - leaq 32(%rsi),%rsi - adcxq %rax,%r13 - adcxq %rbp,%r14 - - mulxq 0(%rcx),%rax,%r10 - adcxq %rax,%r15 - adoxq %r11,%r10 - mulxq 8(%rcx),%rax,%r11 - adcxq %rax,%r10 - adoxq %r12,%r11 - mulxq 16(%rcx),%rax,%r12 - movq 24+8(%rsp),%rdi - movq %r10,-32(%rbx) - adcxq %rax,%r11 - adoxq %r13,%r12 - mulxq 24(%rcx),%rax,%r15 - movq %r9,%rdx - movq %r11,-24(%rbx) - adcxq %rax,%r12 - adoxq %rbp,%r15 - leaq 32(%rcx),%rcx - movq %r12,-16(%rbx) - jmp .Lmulx4x_1st - -.align 32 -.Lmulx4x_1st: - adcxq %rbp,%r15 - mulxq 0(%rsi),%r10,%rax - adcxq %r14,%r10 - mulxq 8(%rsi),%r11,%r14 - adcxq %rax,%r11 - mulxq 16(%rsi),%r12,%rax - adcxq %r14,%r12 - mulxq 24(%rsi),%r13,%r14 -.byte 0x67,0x67 - movq %r8,%rdx - adcxq %rax,%r13 - adcxq %rbp,%r14 - leaq 32(%rsi),%rsi - leaq 32(%rbx),%rbx - - adoxq %r15,%r10 - mulxq 0(%rcx),%rax,%r15 - adcxq %rax,%r10 - adoxq %r15,%r11 - mulxq 8(%rcx),%rax,%r15 - adcxq %rax,%r11 - adoxq 
%r15,%r12 - mulxq 16(%rcx),%rax,%r15 - movq %r10,-40(%rbx) - adcxq %rax,%r12 - movq %r11,-32(%rbx) - adoxq %r15,%r13 - mulxq 24(%rcx),%rax,%r15 - movq %r9,%rdx - movq %r12,-24(%rbx) - adcxq %rax,%r13 - adoxq %rbp,%r15 - leaq 32(%rcx),%rcx - movq %r13,-16(%rbx) - - decq %rdi - jnz .Lmulx4x_1st - - movq 8(%rsp),%rax - adcq %rbp,%r15 - leaq (%rsi,%rax,1),%rsi - addq %r15,%r14 - movq 8+8(%rsp),%rdi - adcq %rbp,%rbp - movq %r14,-8(%rbx) - jmp .Lmulx4x_outer - -.align 32 -.Lmulx4x_outer: - leaq 16-256(%rbx),%r10 - pxor %xmm4,%xmm4 -.byte 0x67,0x67 - pxor %xmm5,%xmm5 - movdqa -128(%rdi),%xmm0 - movdqa -112(%rdi),%xmm1 - movdqa -96(%rdi),%xmm2 - pand 256(%r10),%xmm0 - movdqa -80(%rdi),%xmm3 - pand 272(%r10),%xmm1 - por %xmm0,%xmm4 - pand 288(%r10),%xmm2 - por %xmm1,%xmm5 - pand 304(%r10),%xmm3 - por %xmm2,%xmm4 - por %xmm3,%xmm5 - movdqa -64(%rdi),%xmm0 - movdqa -48(%rdi),%xmm1 - movdqa -32(%rdi),%xmm2 - pand 320(%r10),%xmm0 - movdqa -16(%rdi),%xmm3 - pand 336(%r10),%xmm1 - por %xmm0,%xmm4 - pand 352(%r10),%xmm2 - por %xmm1,%xmm5 - pand 368(%r10),%xmm3 - por %xmm2,%xmm4 - por %xmm3,%xmm5 - movdqa 0(%rdi),%xmm0 - movdqa 16(%rdi),%xmm1 - movdqa 32(%rdi),%xmm2 - pand 384(%r10),%xmm0 - movdqa 48(%rdi),%xmm3 - pand 400(%r10),%xmm1 - por %xmm0,%xmm4 - pand 416(%r10),%xmm2 - por %xmm1,%xmm5 - pand 432(%r10),%xmm3 - por %xmm2,%xmm4 - por %xmm3,%xmm5 - movdqa 64(%rdi),%xmm0 - movdqa 80(%rdi),%xmm1 - movdqa 96(%rdi),%xmm2 - pand 448(%r10),%xmm0 - movdqa 112(%rdi),%xmm3 - pand 464(%r10),%xmm1 - por %xmm0,%xmm4 - pand 480(%r10),%xmm2 - por %xmm1,%xmm5 - pand 496(%r10),%xmm3 - por %xmm2,%xmm4 - por %xmm3,%xmm5 - por %xmm5,%xmm4 - pshufd $0x4e,%xmm4,%xmm0 - por %xmm4,%xmm0 - leaq 256(%rdi),%rdi -.byte 102,72,15,126,194 - - movq %rbp,(%rbx) - leaq 32(%rbx,%rax,1),%rbx - mulxq 0(%rsi),%r8,%r11 - xorq %rbp,%rbp - movq %rdx,%r9 - mulxq 8(%rsi),%r14,%r12 - adoxq -32(%rbx),%r8 - adcxq %r14,%r11 - mulxq 16(%rsi),%r15,%r13 - adoxq -24(%rbx),%r11 - adcxq %r15,%r12 - mulxq 24(%rsi),%rdx,%r14 - 
adoxq -16(%rbx),%r12 - adcxq %rdx,%r13 - leaq (%rcx,%rax,1),%rcx - leaq 32(%rsi),%rsi - adoxq -8(%rbx),%r13 - adcxq %rbp,%r14 - adoxq %rbp,%r14 - - movq %r8,%r15 - imulq 32+8(%rsp),%r8 - - movq %r8,%rdx - xorq %rbp,%rbp - movq %rdi,8+8(%rsp) - - mulxq 0(%rcx),%rax,%r10 - adcxq %rax,%r15 - adoxq %r11,%r10 - mulxq 8(%rcx),%rax,%r11 - adcxq %rax,%r10 - adoxq %r12,%r11 - mulxq 16(%rcx),%rax,%r12 - adcxq %rax,%r11 - adoxq %r13,%r12 - mulxq 24(%rcx),%rax,%r15 - movq %r9,%rdx - movq 24+8(%rsp),%rdi - movq %r10,-32(%rbx) - adcxq %rax,%r12 - movq %r11,-24(%rbx) - adoxq %rbp,%r15 - movq %r12,-16(%rbx) - leaq 32(%rcx),%rcx - jmp .Lmulx4x_inner - -.align 32 -.Lmulx4x_inner: - mulxq 0(%rsi),%r10,%rax - adcxq %rbp,%r15 - adoxq %r14,%r10 - mulxq 8(%rsi),%r11,%r14 - adcxq 0(%rbx),%r10 - adoxq %rax,%r11 - mulxq 16(%rsi),%r12,%rax - adcxq 8(%rbx),%r11 - adoxq %r14,%r12 - mulxq 24(%rsi),%r13,%r14 - movq %r8,%rdx - adcxq 16(%rbx),%r12 - adoxq %rax,%r13 - adcxq 24(%rbx),%r13 - adoxq %rbp,%r14 - leaq 32(%rsi),%rsi - leaq 32(%rbx),%rbx - adcxq %rbp,%r14 - - adoxq %r15,%r10 - mulxq 0(%rcx),%rax,%r15 - adcxq %rax,%r10 - adoxq %r15,%r11 - mulxq 8(%rcx),%rax,%r15 - adcxq %rax,%r11 - adoxq %r15,%r12 - mulxq 16(%rcx),%rax,%r15 - movq %r10,-40(%rbx) - adcxq %rax,%r12 - adoxq %r15,%r13 - movq %r11,-32(%rbx) - mulxq 24(%rcx),%rax,%r15 - movq %r9,%rdx - leaq 32(%rcx),%rcx - movq %r12,-24(%rbx) - adcxq %rax,%r13 - adoxq %rbp,%r15 - movq %r13,-16(%rbx) - - decq %rdi - jnz .Lmulx4x_inner - - movq 0+8(%rsp),%rax - adcq %rbp,%r15 - subq 0(%rbx),%rdi - movq 8+8(%rsp),%rdi - movq 16+8(%rsp),%r10 - adcq %r15,%r14 - leaq (%rsi,%rax,1),%rsi - adcq %rbp,%rbp - movq %r14,-8(%rbx) - - cmpq %r10,%rdi - jb .Lmulx4x_outer - - movq -8(%rcx),%r10 - movq %rbp,%r8 - movq (%rcx,%rax,1),%r12 - leaq (%rcx,%rax,1),%rbp - movq %rax,%rcx - leaq (%rbx,%rax,1),%rdi - xorl %eax,%eax - xorq %r15,%r15 - subq %r14,%r10 - adcq %r15,%r15 - orq %r15,%r8 - sarq $3+2,%rcx - subq %r8,%rax - movq 56+8(%rsp),%rdx - decq %r12 - movq 
8(%rbp),%r13 - xorq %r8,%r8 - movq 16(%rbp),%r14 - movq 24(%rbp),%r15 - jmp .Lsqrx4x_sub_entry -.cfi_endproc -.size mulx4x_internal,.-mulx4x_internal -.type bn_powerx5,@function -.align 32 -bn_powerx5: -.cfi_startproc - movq %rsp,%rax -.cfi_def_cfa_register %rax -.Lpowerx5_enter: - pushq %rbx -.cfi_offset %rbx,-16 - pushq %rbp -.cfi_offset %rbp,-24 - pushq %r12 -.cfi_offset %r12,-32 - pushq %r13 -.cfi_offset %r13,-40 - pushq %r14 -.cfi_offset %r14,-48 - pushq %r15 -.cfi_offset %r15,-56 -.Lpowerx5_prologue: - - shll $3,%r9d - leaq (%r9,%r9,2),%r10 - negq %r9 - movq (%r8),%r8 - - - - - - - - - leaq -320(%rsp,%r9,2),%r11 - movq %rsp,%rbp - subq %rdi,%r11 - andq $4095,%r11 - cmpq %r11,%r10 - jb .Lpwrx_sp_alt - subq %r11,%rbp - leaq -320(%rbp,%r9,2),%rbp - jmp .Lpwrx_sp_done - -.align 32 -.Lpwrx_sp_alt: - leaq 4096-320(,%r9,2),%r10 - leaq -320(%rbp,%r9,2),%rbp - subq %r10,%r11 - movq $0,%r10 - cmovcq %r10,%r11 - subq %r11,%rbp -.Lpwrx_sp_done: - andq $-64,%rbp - movq %rsp,%r11 - subq %rbp,%r11 - andq $-4096,%r11 - leaq (%r11,%rbp,1),%rsp - movq (%rsp),%r10 - cmpq %rbp,%rsp - ja .Lpwrx_page_walk - jmp .Lpwrx_page_walk_done - -.Lpwrx_page_walk: - leaq -4096(%rsp),%rsp - movq (%rsp),%r10 - cmpq %rbp,%rsp - ja .Lpwrx_page_walk -.Lpwrx_page_walk_done: - - movq %r9,%r10 - negq %r9 - - - - - - - - - - - - - pxor %xmm0,%xmm0 -.byte 102,72,15,110,207 -.byte 102,72,15,110,209 -.byte 102,73,15,110,218 -.byte 102,72,15,110,226 - movq %r8,32(%rsp) - movq %rax,40(%rsp) -.cfi_escape 0x0f,0x05,0x77,0x28,0x06,0x23,0x08 -.Lpowerx5_body: - - call __bn_sqrx8x_internal - call __bn_postx4x_internal - call __bn_sqrx8x_internal - call __bn_postx4x_internal - call __bn_sqrx8x_internal - call __bn_postx4x_internal - call __bn_sqrx8x_internal - call __bn_postx4x_internal - call __bn_sqrx8x_internal - call __bn_postx4x_internal - - movq %r10,%r9 - movq %rsi,%rdi -.byte 102,72,15,126,209 -.byte 102,72,15,126,226 - movq 40(%rsp),%rax - - call mulx4x_internal - - movq 40(%rsp),%rsi -.cfi_def_cfa 
%rsi,8 - movq $1,%rax - - movq -48(%rsi),%r15 -.cfi_restore %r15 - movq -40(%rsi),%r14 -.cfi_restore %r14 - movq -32(%rsi),%r13 -.cfi_restore %r13 - movq -24(%rsi),%r12 -.cfi_restore %r12 - movq -16(%rsi),%rbp -.cfi_restore %rbp - movq -8(%rsi),%rbx -.cfi_restore %rbx - leaq (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Lpowerx5_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size bn_powerx5,.-bn_powerx5 - -.globl bn_sqrx8x_internal -.hidden bn_sqrx8x_internal -.hidden bn_sqrx8x_internal -.type bn_sqrx8x_internal,@function -.align 32 -bn_sqrx8x_internal: -__bn_sqrx8x_internal: -.cfi_startproc - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - leaq 48+8(%rsp),%rdi - leaq (%rsi,%r9,1),%rbp - movq %r9,0+8(%rsp) - movq %rbp,8+8(%rsp) - jmp .Lsqr8x_zero_start - -.align 32 -.byte 0x66,0x66,0x66,0x2e,0x0f,0x1f,0x84,0x00,0x00,0x00,0x00,0x00 -.Lsqrx8x_zero: -.byte 0x3e - movdqa %xmm0,0(%rdi) - movdqa %xmm0,16(%rdi) - movdqa %xmm0,32(%rdi) - movdqa %xmm0,48(%rdi) -.Lsqr8x_zero_start: - movdqa %xmm0,64(%rdi) - movdqa %xmm0,80(%rdi) - movdqa %xmm0,96(%rdi) - movdqa %xmm0,112(%rdi) - leaq 128(%rdi),%rdi - subq $64,%r9 - jnz .Lsqrx8x_zero - - movq 0(%rsi),%rdx - - xorq %r10,%r10 - xorq %r11,%r11 - xorq %r12,%r12 - xorq %r13,%r13 - xorq %r14,%r14 - xorq %r15,%r15 - leaq 48+8(%rsp),%rdi - xorq %rbp,%rbp - jmp .Lsqrx8x_outer_loop - -.align 32 -.Lsqrx8x_outer_loop: - mulxq 8(%rsi),%r8,%rax - adcxq %r9,%r8 - adoxq %rax,%r10 - mulxq 16(%rsi),%r9,%rax - adcxq %r10,%r9 - adoxq %rax,%r11 -.byte 0xc4,0xe2,0xab,0xf6,0x86,0x18,0x00,0x00,0x00 - adcxq %r11,%r10 - adoxq %rax,%r12 -.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x20,0x00,0x00,0x00 - adcxq %r12,%r11 - adoxq %rax,%r13 - mulxq 40(%rsi),%r12,%rax - adcxq %r13,%r12 - adoxq %rax,%r14 - mulxq 48(%rsi),%r13,%rax - adcxq %r14,%r13 - adoxq %r15,%rax - mulxq 56(%rsi),%r14,%r15 - movq 8(%rsi),%rdx - adcxq %rax,%r14 - adoxq %rbp,%r15 - adcq 64(%rdi),%r15 - movq %r8,8(%rdi) - movq %r9,16(%rdi) - sbbq %rcx,%rcx - xorq %rbp,%rbp - - - 
mulxq 16(%rsi),%r8,%rbx - mulxq 24(%rsi),%r9,%rax - adcxq %r10,%r8 - adoxq %rbx,%r9 - mulxq 32(%rsi),%r10,%rbx - adcxq %r11,%r9 - adoxq %rax,%r10 -.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x28,0x00,0x00,0x00 - adcxq %r12,%r10 - adoxq %rbx,%r11 -.byte 0xc4,0xe2,0x9b,0xf6,0x9e,0x30,0x00,0x00,0x00 - adcxq %r13,%r11 - adoxq %r14,%r12 -.byte 0xc4,0x62,0x93,0xf6,0xb6,0x38,0x00,0x00,0x00 - movq 16(%rsi),%rdx - adcxq %rax,%r12 - adoxq %rbx,%r13 - adcxq %r15,%r13 - adoxq %rbp,%r14 - adcxq %rbp,%r14 - - movq %r8,24(%rdi) - movq %r9,32(%rdi) - - mulxq 24(%rsi),%r8,%rbx - mulxq 32(%rsi),%r9,%rax - adcxq %r10,%r8 - adoxq %rbx,%r9 - mulxq 40(%rsi),%r10,%rbx - adcxq %r11,%r9 - adoxq %rax,%r10 -.byte 0xc4,0xe2,0xa3,0xf6,0x86,0x30,0x00,0x00,0x00 - adcxq %r12,%r10 - adoxq %r13,%r11 -.byte 0xc4,0x62,0x9b,0xf6,0xae,0x38,0x00,0x00,0x00 -.byte 0x3e - movq 24(%rsi),%rdx - adcxq %rbx,%r11 - adoxq %rax,%r12 - adcxq %r14,%r12 - movq %r8,40(%rdi) - movq %r9,48(%rdi) - mulxq 32(%rsi),%r8,%rax - adoxq %rbp,%r13 - adcxq %rbp,%r13 - - mulxq 40(%rsi),%r9,%rbx - adcxq %r10,%r8 - adoxq %rax,%r9 - mulxq 48(%rsi),%r10,%rax - adcxq %r11,%r9 - adoxq %r12,%r10 - mulxq 56(%rsi),%r11,%r12 - movq 32(%rsi),%rdx - movq 40(%rsi),%r14 - adcxq %rbx,%r10 - adoxq %rax,%r11 - movq 48(%rsi),%r15 - adcxq %r13,%r11 - adoxq %rbp,%r12 - adcxq %rbp,%r12 - - movq %r8,56(%rdi) - movq %r9,64(%rdi) - - mulxq %r14,%r9,%rax - movq 56(%rsi),%r8 - adcxq %r10,%r9 - mulxq %r15,%r10,%rbx - adoxq %rax,%r10 - adcxq %r11,%r10 - mulxq %r8,%r11,%rax - movq %r14,%rdx - adoxq %rbx,%r11 - adcxq %r12,%r11 - - adcxq %rbp,%rax - - mulxq %r15,%r14,%rbx - mulxq %r8,%r12,%r13 - movq %r15,%rdx - leaq 64(%rsi),%rsi - adcxq %r14,%r11 - adoxq %rbx,%r12 - adcxq %rax,%r12 - adoxq %rbp,%r13 - -.byte 0x67,0x67 - mulxq %r8,%r8,%r14 - adcxq %r8,%r13 - adcxq %rbp,%r14 - - cmpq 8+8(%rsp),%rsi - je .Lsqrx8x_outer_break - - negq %rcx - movq $-8,%rcx - movq %rbp,%r15 - movq 64(%rdi),%r8 - adcxq 72(%rdi),%r9 - adcxq 80(%rdi),%r10 - adcxq 88(%rdi),%r11 - adcq 
96(%rdi),%r12 - adcq 104(%rdi),%r13 - adcq 112(%rdi),%r14 - adcq 120(%rdi),%r15 - leaq (%rsi),%rbp - leaq 128(%rdi),%rdi - sbbq %rax,%rax - - movq -64(%rsi),%rdx - movq %rax,16+8(%rsp) - movq %rdi,24+8(%rsp) - - - xorl %eax,%eax - jmp .Lsqrx8x_loop - -.align 32 -.Lsqrx8x_loop: - movq %r8,%rbx - mulxq 0(%rbp),%rax,%r8 - adcxq %rax,%rbx - adoxq %r9,%r8 - - mulxq 8(%rbp),%rax,%r9 - adcxq %rax,%r8 - adoxq %r10,%r9 - - mulxq 16(%rbp),%rax,%r10 - adcxq %rax,%r9 - adoxq %r11,%r10 - - mulxq 24(%rbp),%rax,%r11 - adcxq %rax,%r10 - adoxq %r12,%r11 - -.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 - adcxq %rax,%r11 - adoxq %r13,%r12 - - mulxq 40(%rbp),%rax,%r13 - adcxq %rax,%r12 - adoxq %r14,%r13 - - mulxq 48(%rbp),%rax,%r14 - movq %rbx,(%rdi,%rcx,8) - movl $0,%ebx - adcxq %rax,%r13 - adoxq %r15,%r14 - -.byte 0xc4,0x62,0xfb,0xf6,0xbd,0x38,0x00,0x00,0x00 - movq 8(%rsi,%rcx,8),%rdx - adcxq %rax,%r14 - adoxq %rbx,%r15 - adcxq %rbx,%r15 - -.byte 0x67 - incq %rcx - jnz .Lsqrx8x_loop - - leaq 64(%rbp),%rbp - movq $-8,%rcx - cmpq 8+8(%rsp),%rbp - je .Lsqrx8x_break - - subq 16+8(%rsp),%rbx -.byte 0x66 - movq -64(%rsi),%rdx - adcxq 0(%rdi),%r8 - adcxq 8(%rdi),%r9 - adcq 16(%rdi),%r10 - adcq 24(%rdi),%r11 - adcq 32(%rdi),%r12 - adcq 40(%rdi),%r13 - adcq 48(%rdi),%r14 - adcq 56(%rdi),%r15 - leaq 64(%rdi),%rdi -.byte 0x67 - sbbq %rax,%rax - xorl %ebx,%ebx - movq %rax,16+8(%rsp) - jmp .Lsqrx8x_loop - -.align 32 -.Lsqrx8x_break: - xorq %rbp,%rbp - subq 16+8(%rsp),%rbx - adcxq %rbp,%r8 - movq 24+8(%rsp),%rcx - adcxq %rbp,%r9 - movq 0(%rsi),%rdx - adcq $0,%r10 - movq %r8,0(%rdi) - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - adcq $0,%r14 - adcq $0,%r15 - cmpq %rcx,%rdi - je .Lsqrx8x_outer_loop - - movq %r9,8(%rdi) - movq 8(%rcx),%r9 - movq %r10,16(%rdi) - movq 16(%rcx),%r10 - movq %r11,24(%rdi) - movq 24(%rcx),%r11 - movq %r12,32(%rdi) - movq 32(%rcx),%r12 - movq %r13,40(%rdi) - movq 40(%rcx),%r13 - movq %r14,48(%rdi) - movq 48(%rcx),%r14 - movq %r15,56(%rdi) - movq 56(%rcx),%r15 - movq 
%rcx,%rdi - jmp .Lsqrx8x_outer_loop - -.align 32 -.Lsqrx8x_outer_break: - movq %r9,72(%rdi) -.byte 102,72,15,126,217 - movq %r10,80(%rdi) - movq %r11,88(%rdi) - movq %r12,96(%rdi) - movq %r13,104(%rdi) - movq %r14,112(%rdi) - leaq 48+8(%rsp),%rdi - movq (%rsi,%rcx,1),%rdx - - movq 8(%rdi),%r11 - xorq %r10,%r10 - movq 0+8(%rsp),%r9 - adoxq %r11,%r11 - movq 16(%rdi),%r12 - movq 24(%rdi),%r13 - - -.align 32 -.Lsqrx4x_shift_n_add: - mulxq %rdx,%rax,%rbx - adoxq %r12,%r12 - adcxq %r10,%rax -.byte 0x48,0x8b,0x94,0x0e,0x08,0x00,0x00,0x00 -.byte 0x4c,0x8b,0x97,0x20,0x00,0x00,0x00 - adoxq %r13,%r13 - adcxq %r11,%rbx - movq 40(%rdi),%r11 - movq %rax,0(%rdi) - movq %rbx,8(%rdi) - - mulxq %rdx,%rax,%rbx - adoxq %r10,%r10 - adcxq %r12,%rax - movq 16(%rsi,%rcx,1),%rdx - movq 48(%rdi),%r12 - adoxq %r11,%r11 - adcxq %r13,%rbx - movq 56(%rdi),%r13 - movq %rax,16(%rdi) - movq %rbx,24(%rdi) - - mulxq %rdx,%rax,%rbx - adoxq %r12,%r12 - adcxq %r10,%rax - movq 24(%rsi,%rcx,1),%rdx - leaq 32(%rcx),%rcx - movq 64(%rdi),%r10 - adoxq %r13,%r13 - adcxq %r11,%rbx - movq 72(%rdi),%r11 - movq %rax,32(%rdi) - movq %rbx,40(%rdi) - - mulxq %rdx,%rax,%rbx - adoxq %r10,%r10 - adcxq %r12,%rax - jrcxz .Lsqrx4x_shift_n_add_break -.byte 0x48,0x8b,0x94,0x0e,0x00,0x00,0x00,0x00 - adoxq %r11,%r11 - adcxq %r13,%rbx - movq 80(%rdi),%r12 - movq 88(%rdi),%r13 - movq %rax,48(%rdi) - movq %rbx,56(%rdi) - leaq 64(%rdi),%rdi - nop - jmp .Lsqrx4x_shift_n_add - -.align 32 -.Lsqrx4x_shift_n_add_break: - adcxq %r13,%rbx - movq %rax,48(%rdi) - movq %rbx,56(%rdi) - leaq 64(%rdi),%rdi -.byte 102,72,15,126,213 -__bn_sqrx8x_reduction: - xorl %eax,%eax - movq 32+8(%rsp),%rbx - movq 48+8(%rsp),%rdx - leaq -64(%rbp,%r9,1),%rcx - - movq %rcx,0+8(%rsp) - movq %rdi,8+8(%rsp) - - leaq 48+8(%rsp),%rdi - jmp .Lsqrx8x_reduction_loop - -.align 32 -.Lsqrx8x_reduction_loop: - movq 8(%rdi),%r9 - movq 16(%rdi),%r10 - movq 24(%rdi),%r11 - movq 32(%rdi),%r12 - movq %rdx,%r8 - imulq %rbx,%rdx - movq 40(%rdi),%r13 - movq 48(%rdi),%r14 - movq 
56(%rdi),%r15 - movq %rax,24+8(%rsp) - - leaq 64(%rdi),%rdi - xorq %rsi,%rsi - movq $-8,%rcx - jmp .Lsqrx8x_reduce - -.align 32 -.Lsqrx8x_reduce: - movq %r8,%rbx - mulxq 0(%rbp),%rax,%r8 - adcxq %rbx,%rax - adoxq %r9,%r8 - - mulxq 8(%rbp),%rbx,%r9 - adcxq %rbx,%r8 - adoxq %r10,%r9 - - mulxq 16(%rbp),%rbx,%r10 - adcxq %rbx,%r9 - adoxq %r11,%r10 - - mulxq 24(%rbp),%rbx,%r11 - adcxq %rbx,%r10 - adoxq %r12,%r11 - -.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 - movq %rdx,%rax - movq %r8,%rdx - adcxq %rbx,%r11 - adoxq %r13,%r12 - - mulxq 32+8(%rsp),%rbx,%rdx - movq %rax,%rdx - movq %rax,64+48+8(%rsp,%rcx,8) - - mulxq 40(%rbp),%rax,%r13 - adcxq %rax,%r12 - adoxq %r14,%r13 - - mulxq 48(%rbp),%rax,%r14 - adcxq %rax,%r13 - adoxq %r15,%r14 - - mulxq 56(%rbp),%rax,%r15 - movq %rbx,%rdx - adcxq %rax,%r14 - adoxq %rsi,%r15 - adcxq %rsi,%r15 - -.byte 0x67,0x67,0x67 - incq %rcx - jnz .Lsqrx8x_reduce - - movq %rsi,%rax - cmpq 0+8(%rsp),%rbp - jae .Lsqrx8x_no_tail - - movq 48+8(%rsp),%rdx - addq 0(%rdi),%r8 - leaq 64(%rbp),%rbp - movq $-8,%rcx - adcxq 8(%rdi),%r9 - adcxq 16(%rdi),%r10 - adcq 24(%rdi),%r11 - adcq 32(%rdi),%r12 - adcq 40(%rdi),%r13 - adcq 48(%rdi),%r14 - adcq 56(%rdi),%r15 - leaq 64(%rdi),%rdi - sbbq %rax,%rax - - xorq %rsi,%rsi - movq %rax,16+8(%rsp) - jmp .Lsqrx8x_tail - -.align 32 -.Lsqrx8x_tail: - movq %r8,%rbx - mulxq 0(%rbp),%rax,%r8 - adcxq %rax,%rbx - adoxq %r9,%r8 - - mulxq 8(%rbp),%rax,%r9 - adcxq %rax,%r8 - adoxq %r10,%r9 - - mulxq 16(%rbp),%rax,%r10 - adcxq %rax,%r9 - adoxq %r11,%r10 - - mulxq 24(%rbp),%rax,%r11 - adcxq %rax,%r10 - adoxq %r12,%r11 - -.byte 0xc4,0x62,0xfb,0xf6,0xa5,0x20,0x00,0x00,0x00 - adcxq %rax,%r11 - adoxq %r13,%r12 - - mulxq 40(%rbp),%rax,%r13 - adcxq %rax,%r12 - adoxq %r14,%r13 - - mulxq 48(%rbp),%rax,%r14 - adcxq %rax,%r13 - adoxq %r15,%r14 - - mulxq 56(%rbp),%rax,%r15 - movq 72+48+8(%rsp,%rcx,8),%rdx - adcxq %rax,%r14 - adoxq %rsi,%r15 - movq %rbx,(%rdi,%rcx,8) - movq %r8,%rbx - adcxq %rsi,%r15 - - incq %rcx - jnz 
.Lsqrx8x_tail - - cmpq 0+8(%rsp),%rbp - jae .Lsqrx8x_tail_done - - subq 16+8(%rsp),%rsi - movq 48+8(%rsp),%rdx - leaq 64(%rbp),%rbp - adcq 0(%rdi),%r8 - adcq 8(%rdi),%r9 - adcq 16(%rdi),%r10 - adcq 24(%rdi),%r11 - adcq 32(%rdi),%r12 - adcq 40(%rdi),%r13 - adcq 48(%rdi),%r14 - adcq 56(%rdi),%r15 - leaq 64(%rdi),%rdi - sbbq %rax,%rax - subq $8,%rcx - - xorq %rsi,%rsi - movq %rax,16+8(%rsp) - jmp .Lsqrx8x_tail - -.align 32 -.Lsqrx8x_tail_done: - xorq %rax,%rax - addq 24+8(%rsp),%r8 - adcq $0,%r9 - adcq $0,%r10 - adcq $0,%r11 - adcq $0,%r12 - adcq $0,%r13 - adcq $0,%r14 - adcq $0,%r15 - adcq $0,%rax - - subq 16+8(%rsp),%rsi -.Lsqrx8x_no_tail: - adcq 0(%rdi),%r8 -.byte 102,72,15,126,217 - adcq 8(%rdi),%r9 - movq 56(%rbp),%rsi -.byte 102,72,15,126,213 - adcq 16(%rdi),%r10 - adcq 24(%rdi),%r11 - adcq 32(%rdi),%r12 - adcq 40(%rdi),%r13 - adcq 48(%rdi),%r14 - adcq 56(%rdi),%r15 - adcq $0,%rax - - movq 32+8(%rsp),%rbx - movq 64(%rdi,%rcx,1),%rdx - - movq %r8,0(%rdi) - leaq 64(%rdi),%r8 - movq %r9,8(%rdi) - movq %r10,16(%rdi) - movq %r11,24(%rdi) - movq %r12,32(%rdi) - movq %r13,40(%rdi) - movq %r14,48(%rdi) - movq %r15,56(%rdi) - - leaq 64(%rdi,%rcx,1),%rdi - cmpq 8+8(%rsp),%r8 - jb .Lsqrx8x_reduction_loop - .byte 0xf3,0xc3 -.cfi_endproc -.size bn_sqrx8x_internal,.-bn_sqrx8x_internal -.align 32 -.type __bn_postx4x_internal,@function -__bn_postx4x_internal: -.cfi_startproc - movq 0(%rbp),%r12 - movq %rcx,%r10 - movq %rcx,%r9 - negq %rax - sarq $3+2,%rcx - -.byte 102,72,15,126,202 -.byte 102,72,15,126,206 - decq %r12 - movq 8(%rbp),%r13 - xorq %r8,%r8 - movq 16(%rbp),%r14 - movq 24(%rbp),%r15 - jmp .Lsqrx4x_sub_entry - -.align 16 -.Lsqrx4x_sub: - movq 0(%rbp),%r12 - movq 8(%rbp),%r13 - movq 16(%rbp),%r14 - movq 24(%rbp),%r15 -.Lsqrx4x_sub_entry: - andnq %rax,%r12,%r12 - leaq 32(%rbp),%rbp - andnq %rax,%r13,%r13 - andnq %rax,%r14,%r14 - andnq %rax,%r15,%r15 - - negq %r8 - adcq 0(%rdi),%r12 - adcq 8(%rdi),%r13 - adcq 16(%rdi),%r14 - adcq 24(%rdi),%r15 - movq %r12,0(%rdx) - leaq 
32(%rdi),%rdi - movq %r13,8(%rdx) - sbbq %r8,%r8 - movq %r14,16(%rdx) - movq %r15,24(%rdx) - leaq 32(%rdx),%rdx - - incq %rcx - jnz .Lsqrx4x_sub - - negq %r9 - - .byte 0xf3,0xc3 -.cfi_endproc -.size __bn_postx4x_internal,.-__bn_postx4x_internal -.globl bn_scatter5 -.hidden bn_scatter5 -.type bn_scatter5,@function -.align 16 -bn_scatter5: -.cfi_startproc - cmpl $0,%esi - jz .Lscatter_epilogue - leaq (%rdx,%rcx,8),%rdx -.Lscatter: - movq (%rdi),%rax - leaq 8(%rdi),%rdi - movq %rax,(%rdx) - leaq 256(%rdx),%rdx - subl $1,%esi - jnz .Lscatter -.Lscatter_epilogue: - .byte 0xf3,0xc3 -.cfi_endproc -.size bn_scatter5,.-bn_scatter5 - -.globl bn_gather5 -.hidden bn_gather5 -.type bn_gather5,@function -.align 32 -bn_gather5: -.cfi_startproc -.LSEH_begin_bn_gather5: - -.byte 0x4c,0x8d,0x14,0x24 -.cfi_def_cfa_register %r10 -.byte 0x48,0x81,0xec,0x08,0x01,0x00,0x00 - leaq .Linc(%rip),%rax - andq $-16,%rsp - - movd %ecx,%xmm5 - movdqa 0(%rax),%xmm0 - movdqa 16(%rax),%xmm1 - leaq 128(%rdx),%r11 - leaq 128(%rsp),%rax - - pshufd $0,%xmm5,%xmm5 - movdqa %xmm1,%xmm4 - movdqa %xmm1,%xmm2 - paddd %xmm0,%xmm1 - pcmpeqd %xmm5,%xmm0 - movdqa %xmm4,%xmm3 - - paddd %xmm1,%xmm2 - pcmpeqd %xmm5,%xmm1 - movdqa %xmm0,-128(%rax) - movdqa %xmm4,%xmm0 - - paddd %xmm2,%xmm3 - pcmpeqd %xmm5,%xmm2 - movdqa %xmm1,-112(%rax) - movdqa %xmm4,%xmm1 - - paddd %xmm3,%xmm0 - pcmpeqd %xmm5,%xmm3 - movdqa %xmm2,-96(%rax) - movdqa %xmm4,%xmm2 - paddd %xmm0,%xmm1 - pcmpeqd %xmm5,%xmm0 - movdqa %xmm3,-80(%rax) - movdqa %xmm4,%xmm3 - - paddd %xmm1,%xmm2 - pcmpeqd %xmm5,%xmm1 - movdqa %xmm0,-64(%rax) - movdqa %xmm4,%xmm0 - - paddd %xmm2,%xmm3 - pcmpeqd %xmm5,%xmm2 - movdqa %xmm1,-48(%rax) - movdqa %xmm4,%xmm1 - - paddd %xmm3,%xmm0 - pcmpeqd %xmm5,%xmm3 - movdqa %xmm2,-32(%rax) - movdqa %xmm4,%xmm2 - paddd %xmm0,%xmm1 - pcmpeqd %xmm5,%xmm0 - movdqa %xmm3,-16(%rax) - movdqa %xmm4,%xmm3 - - paddd %xmm1,%xmm2 - pcmpeqd %xmm5,%xmm1 - movdqa %xmm0,0(%rax) - movdqa %xmm4,%xmm0 - - paddd %xmm2,%xmm3 - pcmpeqd %xmm5,%xmm2 - 
movdqa %xmm1,16(%rax) - movdqa %xmm4,%xmm1 - - paddd %xmm3,%xmm0 - pcmpeqd %xmm5,%xmm3 - movdqa %xmm2,32(%rax) - movdqa %xmm4,%xmm2 - paddd %xmm0,%xmm1 - pcmpeqd %xmm5,%xmm0 - movdqa %xmm3,48(%rax) - movdqa %xmm4,%xmm3 - - paddd %xmm1,%xmm2 - pcmpeqd %xmm5,%xmm1 - movdqa %xmm0,64(%rax) - movdqa %xmm4,%xmm0 - - paddd %xmm2,%xmm3 - pcmpeqd %xmm5,%xmm2 - movdqa %xmm1,80(%rax) - movdqa %xmm4,%xmm1 - - paddd %xmm3,%xmm0 - pcmpeqd %xmm5,%xmm3 - movdqa %xmm2,96(%rax) - movdqa %xmm4,%xmm2 - movdqa %xmm3,112(%rax) - jmp .Lgather - -.align 32 -.Lgather: - pxor %xmm4,%xmm4 - pxor %xmm5,%xmm5 - movdqa -128(%r11),%xmm0 - movdqa -112(%r11),%xmm1 - movdqa -96(%r11),%xmm2 - pand -128(%rax),%xmm0 - movdqa -80(%r11),%xmm3 - pand -112(%rax),%xmm1 - por %xmm0,%xmm4 - pand -96(%rax),%xmm2 - por %xmm1,%xmm5 - pand -80(%rax),%xmm3 - por %xmm2,%xmm4 - por %xmm3,%xmm5 - movdqa -64(%r11),%xmm0 - movdqa -48(%r11),%xmm1 - movdqa -32(%r11),%xmm2 - pand -64(%rax),%xmm0 - movdqa -16(%r11),%xmm3 - pand -48(%rax),%xmm1 - por %xmm0,%xmm4 - pand -32(%rax),%xmm2 - por %xmm1,%xmm5 - pand -16(%rax),%xmm3 - por %xmm2,%xmm4 - por %xmm3,%xmm5 - movdqa 0(%r11),%xmm0 - movdqa 16(%r11),%xmm1 - movdqa 32(%r11),%xmm2 - pand 0(%rax),%xmm0 - movdqa 48(%r11),%xmm3 - pand 16(%rax),%xmm1 - por %xmm0,%xmm4 - pand 32(%rax),%xmm2 - por %xmm1,%xmm5 - pand 48(%rax),%xmm3 - por %xmm2,%xmm4 - por %xmm3,%xmm5 - movdqa 64(%r11),%xmm0 - movdqa 80(%r11),%xmm1 - movdqa 96(%r11),%xmm2 - pand 64(%rax),%xmm0 - movdqa 112(%r11),%xmm3 - pand 80(%rax),%xmm1 - por %xmm0,%xmm4 - pand 96(%rax),%xmm2 - por %xmm1,%xmm5 - pand 112(%rax),%xmm3 - por %xmm2,%xmm4 - por %xmm3,%xmm5 - por %xmm5,%xmm4 - leaq 256(%r11),%r11 - pshufd $0x4e,%xmm4,%xmm0 - por %xmm4,%xmm0 - movq %xmm0,(%rdi) - leaq 8(%rdi),%rdi - subl $1,%esi - jnz .Lgather - - leaq (%r10),%rsp -.cfi_def_cfa_register %rsp - .byte 0xf3,0xc3 -.LSEH_end_bn_gather5: -.cfi_endproc -.size bn_gather5,.-bn_gather5 -.align 64 -.Linc: -.long 0,0, 1,1 -.long 2,2, 2,2 -.byte 
77,111,110,116,103,111,109,101,114,121,32,77,117,108,116,105,112,108,105,99,97,116,105,111,110,32,119,105,116,104,32,115,99,97,116,116,101,114,47,103,97,116,104,101,114,32,102,111,114,32,120,56,54,95,54,52,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0 -#endif -.section .note.GNU-stack,"",@progbits diff --git a/third_party/boringssl/linux-x86_64/crypto/test/trampoline-x86_64.S b/third_party/boringssl/linux-x86_64/crypto/test/trampoline-x86_64.S deleted file mode 100644 index b7d6101b..00000000 --- a/third_party/boringssl/linux-x86_64/crypto/test/trampoline-x86_64.S +++ /dev/null @@ -1,518 +0,0 @@ -// This file is generated from a similarly-named Perl script in the BoringSSL -// source tree. Do not edit by hand. - -#if defined(__has_feature) -#if __has_feature(memory_sanitizer) && !defined(OPENSSL_NO_ASM) -#define OPENSSL_NO_ASM -#endif -#endif - -#if defined(__x86_64__) && !defined(OPENSSL_NO_ASM) -#if defined(BORINGSSL_PREFIX) -#include -#endif -.text - - - - - - - - -.type abi_test_trampoline, @function -.globl abi_test_trampoline -.hidden abi_test_trampoline -.align 16 -abi_test_trampoline: -.Labi_test_trampoline_seh_begin: -.cfi_startproc - - - - - - - - - - subq $120,%rsp -.cfi_adjust_cfa_offset 120 -.Labi_test_trampoline_seh_prolog_alloc: - movq %r8,48(%rsp) - movq %rbx,64(%rsp) -.cfi_offset rbx, -64 -.Labi_test_trampoline_seh_prolog_rbx: - movq %rbp,72(%rsp) -.cfi_offset rbp, -56 -.Labi_test_trampoline_seh_prolog_rbp: - movq %r12,80(%rsp) -.cfi_offset r12, -48 -.Labi_test_trampoline_seh_prolog_r12: - movq %r13,88(%rsp) -.cfi_offset r13, -40 -.Labi_test_trampoline_seh_prolog_r13: - movq %r14,96(%rsp) -.cfi_offset r14, -32 -.Labi_test_trampoline_seh_prolog_r14: - movq %r15,104(%rsp) -.cfi_offset r15, -24 -.Labi_test_trampoline_seh_prolog_r15: -.Labi_test_trampoline_seh_prolog_end: - movq 0(%rsi),%rbx - movq 8(%rsi),%rbp - movq 16(%rsi),%r12 - movq 24(%rsi),%r13 - movq 32(%rsi),%r14 - 
movq 40(%rsi),%r15 - - movq %rdi,32(%rsp) - movq %rsi,40(%rsp) - - - - - movq %rdx,%r10 - movq %rcx,%r11 - decq %r11 - js .Largs_done - movq (%r10),%rdi - addq $8,%r10 - decq %r11 - js .Largs_done - movq (%r10),%rsi - addq $8,%r10 - decq %r11 - js .Largs_done - movq (%r10),%rdx - addq $8,%r10 - decq %r11 - js .Largs_done - movq (%r10),%rcx - addq $8,%r10 - decq %r11 - js .Largs_done - movq (%r10),%r8 - addq $8,%r10 - decq %r11 - js .Largs_done - movq (%r10),%r9 - addq $8,%r10 - leaq 0(%rsp),%rax -.Largs_loop: - decq %r11 - js .Largs_done - - - - - - - movq %r11,56(%rsp) - movq (%r10),%r11 - movq %r11,(%rax) - movq 56(%rsp),%r11 - - addq $8,%r10 - addq $8,%rax - jmp .Largs_loop - -.Largs_done: - movq 32(%rsp),%rax - movq 48(%rsp),%r10 - testq %r10,%r10 - jz .Lno_unwind - - - pushfq - orq $0x100,0(%rsp) - popfq - - - - nop -.globl abi_test_unwind_start -.hidden abi_test_unwind_start -abi_test_unwind_start: - - call *%rax -.globl abi_test_unwind_return -.hidden abi_test_unwind_return -abi_test_unwind_return: - - - - - pushfq - andq $-0x101,0(%rsp) - popfq -.globl abi_test_unwind_stop -.hidden abi_test_unwind_stop -abi_test_unwind_stop: - - jmp .Lcall_done - -.Lno_unwind: - call *%rax - -.Lcall_done: - - movq 40(%rsp),%rsi - movq %rbx,0(%rsi) - movq %rbp,8(%rsi) - movq %r12,16(%rsi) - movq %r13,24(%rsi) - movq %r14,32(%rsi) - movq %r15,40(%rsi) - movq 64(%rsp),%rbx -.cfi_restore rbx - movq 72(%rsp),%rbp -.cfi_restore rbp - movq 80(%rsp),%r12 -.cfi_restore r12 - movq 88(%rsp),%r13 -.cfi_restore r13 - movq 96(%rsp),%r14 -.cfi_restore r14 - movq 104(%rsp),%r15 -.cfi_restore r15 - addq $120,%rsp -.cfi_adjust_cfa_offset -120 - - - .byte 0xf3,0xc3 -.cfi_endproc -.Labi_test_trampoline_seh_end: -.size abi_test_trampoline,.-abi_test_trampoline -.type abi_test_clobber_rax, @function -.globl abi_test_clobber_rax -.hidden abi_test_clobber_rax -.align 16 -abi_test_clobber_rax: - xorq %rax,%rax - .byte 0xf3,0xc3 -.size abi_test_clobber_rax,.-abi_test_clobber_rax -.type 
abi_test_clobber_rbx, @function -.globl abi_test_clobber_rbx -.hidden abi_test_clobber_rbx -.align 16 -abi_test_clobber_rbx: - xorq %rbx,%rbx - .byte 0xf3,0xc3 -.size abi_test_clobber_rbx,.-abi_test_clobber_rbx -.type abi_test_clobber_rcx, @function -.globl abi_test_clobber_rcx -.hidden abi_test_clobber_rcx -.align 16 -abi_test_clobber_rcx: - xorq %rcx,%rcx - .byte 0xf3,0xc3 -.size abi_test_clobber_rcx,.-abi_test_clobber_rcx -.type abi_test_clobber_rdx, @function -.globl abi_test_clobber_rdx -.hidden abi_test_clobber_rdx -.align 16 -abi_test_clobber_rdx: - xorq %rdx,%rdx - .byte 0xf3,0xc3 -.size abi_test_clobber_rdx,.-abi_test_clobber_rdx -.type abi_test_clobber_rdi, @function -.globl abi_test_clobber_rdi -.hidden abi_test_clobber_rdi -.align 16 -abi_test_clobber_rdi: - xorq %rdi,%rdi - .byte 0xf3,0xc3 -.size abi_test_clobber_rdi,.-abi_test_clobber_rdi -.type abi_test_clobber_rsi, @function -.globl abi_test_clobber_rsi -.hidden abi_test_clobber_rsi -.align 16 -abi_test_clobber_rsi: - xorq %rsi,%rsi - .byte 0xf3,0xc3 -.size abi_test_clobber_rsi,.-abi_test_clobber_rsi -.type abi_test_clobber_rbp, @function -.globl abi_test_clobber_rbp -.hidden abi_test_clobber_rbp -.align 16 -abi_test_clobber_rbp: - xorq %rbp,%rbp - .byte 0xf3,0xc3 -.size abi_test_clobber_rbp,.-abi_test_clobber_rbp -.type abi_test_clobber_r8, @function -.globl abi_test_clobber_r8 -.hidden abi_test_clobber_r8 -.align 16 -abi_test_clobber_r8: - xorq %r8,%r8 - .byte 0xf3,0xc3 -.size abi_test_clobber_r8,.-abi_test_clobber_r8 -.type abi_test_clobber_r9, @function -.globl abi_test_clobber_r9 -.hidden abi_test_clobber_r9 -.align 16 -abi_test_clobber_r9: - xorq %r9,%r9 - .byte 0xf3,0xc3 -.size abi_test_clobber_r9,.-abi_test_clobber_r9 -.type abi_test_clobber_r10, @function -.globl abi_test_clobber_r10 -.hidden abi_test_clobber_r10 -.align 16 -abi_test_clobber_r10: - xorq %r10,%r10 - .byte 0xf3,0xc3 -.size abi_test_clobber_r10,.-abi_test_clobber_r10 -.type abi_test_clobber_r11, @function -.globl 
abi_test_clobber_r11 -.hidden abi_test_clobber_r11 -.align 16 -abi_test_clobber_r11: - xorq %r11,%r11 - .byte 0xf3,0xc3 -.size abi_test_clobber_r11,.-abi_test_clobber_r11 -.type abi_test_clobber_r12, @function -.globl abi_test_clobber_r12 -.hidden abi_test_clobber_r12 -.align 16 -abi_test_clobber_r12: - xorq %r12,%r12 - .byte 0xf3,0xc3 -.size abi_test_clobber_r12,.-abi_test_clobber_r12 -.type abi_test_clobber_r13, @function -.globl abi_test_clobber_r13 -.hidden abi_test_clobber_r13 -.align 16 -abi_test_clobber_r13: - xorq %r13,%r13 - .byte 0xf3,0xc3 -.size abi_test_clobber_r13,.-abi_test_clobber_r13 -.type abi_test_clobber_r14, @function -.globl abi_test_clobber_r14 -.hidden abi_test_clobber_r14 -.align 16 -abi_test_clobber_r14: - xorq %r14,%r14 - .byte 0xf3,0xc3 -.size abi_test_clobber_r14,.-abi_test_clobber_r14 -.type abi_test_clobber_r15, @function -.globl abi_test_clobber_r15 -.hidden abi_test_clobber_r15 -.align 16 -abi_test_clobber_r15: - xorq %r15,%r15 - .byte 0xf3,0xc3 -.size abi_test_clobber_r15,.-abi_test_clobber_r15 -.type abi_test_clobber_xmm0, @function -.globl abi_test_clobber_xmm0 -.hidden abi_test_clobber_xmm0 -.align 16 -abi_test_clobber_xmm0: - pxor %xmm0,%xmm0 - .byte 0xf3,0xc3 -.size abi_test_clobber_xmm0,.-abi_test_clobber_xmm0 -.type abi_test_clobber_xmm1, @function -.globl abi_test_clobber_xmm1 -.hidden abi_test_clobber_xmm1 -.align 16 -abi_test_clobber_xmm1: - pxor %xmm1,%xmm1 - .byte 0xf3,0xc3 -.size abi_test_clobber_xmm1,.-abi_test_clobber_xmm1 -.type abi_test_clobber_xmm2, @function -.globl abi_test_clobber_xmm2 -.hidden abi_test_clobber_xmm2 -.align 16 -abi_test_clobber_xmm2: - pxor %xmm2,%xmm2 - .byte 0xf3,0xc3 -.size abi_test_clobber_xmm2,.-abi_test_clobber_xmm2 -.type abi_test_clobber_xmm3, @function -.globl abi_test_clobber_xmm3 -.hidden abi_test_clobber_xmm3 -.align 16 -abi_test_clobber_xmm3: - pxor %xmm3,%xmm3 - .byte 0xf3,0xc3 -.size abi_test_clobber_xmm3,.-abi_test_clobber_xmm3 -.type abi_test_clobber_xmm4, @function -.globl 
abi_test_clobber_xmm4 -.hidden abi_test_clobber_xmm4 -.align 16 -abi_test_clobber_xmm4: - pxor %xmm4,%xmm4 - .byte 0xf3,0xc3 -.size abi_test_clobber_xmm4,.-abi_test_clobber_xmm4 -.type abi_test_clobber_xmm5, @function -.globl abi_test_clobber_xmm5 -.hidden abi_test_clobber_xmm5 -.align 16 -abi_test_clobber_xmm5: - pxor %xmm5,%xmm5 - .byte 0xf3,0xc3 -.size abi_test_clobber_xmm5,.-abi_test_clobber_xmm5 -.type abi_test_clobber_xmm6, @function -.globl abi_test_clobber_xmm6 -.hidden abi_test_clobber_xmm6 -.align 16 -abi_test_clobber_xmm6: - pxor %xmm6,%xmm6 - .byte 0xf3,0xc3 -.size abi_test_clobber_xmm6,.-abi_test_clobber_xmm6 -.type abi_test_clobber_xmm7, @function -.globl abi_test_clobber_xmm7 -.hidden abi_test_clobber_xmm7 -.align 16 -abi_test_clobber_xmm7: - pxor %xmm7,%xmm7 - .byte 0xf3,0xc3 -.size abi_test_clobber_xmm7,.-abi_test_clobber_xmm7 -.type abi_test_clobber_xmm8, @function -.globl abi_test_clobber_xmm8 -.hidden abi_test_clobber_xmm8 -.align 16 -abi_test_clobber_xmm8: - pxor %xmm8,%xmm8 - .byte 0xf3,0xc3 -.size abi_test_clobber_xmm8,.-abi_test_clobber_xmm8 -.type abi_test_clobber_xmm9, @function -.globl abi_test_clobber_xmm9 -.hidden abi_test_clobber_xmm9 -.align 16 -abi_test_clobber_xmm9: - pxor %xmm9,%xmm9 - .byte 0xf3,0xc3 -.size abi_test_clobber_xmm9,.-abi_test_clobber_xmm9 -.type abi_test_clobber_xmm10, @function -.globl abi_test_clobber_xmm10 -.hidden abi_test_clobber_xmm10 -.align 16 -abi_test_clobber_xmm10: - pxor %xmm10,%xmm10 - .byte 0xf3,0xc3 -.size abi_test_clobber_xmm10,.-abi_test_clobber_xmm10 -.type abi_test_clobber_xmm11, @function -.globl abi_test_clobber_xmm11 -.hidden abi_test_clobber_xmm11 -.align 16 -abi_test_clobber_xmm11: - pxor %xmm11,%xmm11 - .byte 0xf3,0xc3 -.size abi_test_clobber_xmm11,.-abi_test_clobber_xmm11 -.type abi_test_clobber_xmm12, @function -.globl abi_test_clobber_xmm12 -.hidden abi_test_clobber_xmm12 -.align 16 -abi_test_clobber_xmm12: - pxor %xmm12,%xmm12 - .byte 0xf3,0xc3 -.size 
abi_test_clobber_xmm12,.-abi_test_clobber_xmm12 -.type abi_test_clobber_xmm13, @function -.globl abi_test_clobber_xmm13 -.hidden abi_test_clobber_xmm13 -.align 16 -abi_test_clobber_xmm13: - pxor %xmm13,%xmm13 - .byte 0xf3,0xc3 -.size abi_test_clobber_xmm13,.-abi_test_clobber_xmm13 -.type abi_test_clobber_xmm14, @function -.globl abi_test_clobber_xmm14 -.hidden abi_test_clobber_xmm14 -.align 16 -abi_test_clobber_xmm14: - pxor %xmm14,%xmm14 - .byte 0xf3,0xc3 -.size abi_test_clobber_xmm14,.-abi_test_clobber_xmm14 -.type abi_test_clobber_xmm15, @function -.globl abi_test_clobber_xmm15 -.hidden abi_test_clobber_xmm15 -.align 16 -abi_test_clobber_xmm15: - pxor %xmm15,%xmm15 - .byte 0xf3,0xc3 -.size abi_test_clobber_xmm15,.-abi_test_clobber_xmm15 - - - -.type abi_test_bad_unwind_wrong_register, @function -.globl abi_test_bad_unwind_wrong_register -.hidden abi_test_bad_unwind_wrong_register -.align 16 -abi_test_bad_unwind_wrong_register: -.cfi_startproc -.Labi_test_bad_unwind_wrong_register_seh_begin: - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r13,-16 -.Labi_test_bad_unwind_wrong_register_seh_push_r13: - - - - nop - popq %r12 -.cfi_adjust_cfa_offset -8 -.cfi_restore %r12 - .byte 0xf3,0xc3 -.Labi_test_bad_unwind_wrong_register_seh_end: -.cfi_endproc -.size abi_test_bad_unwind_wrong_register,.-abi_test_bad_unwind_wrong_register - - - - -.type abi_test_bad_unwind_temporary, @function -.globl abi_test_bad_unwind_temporary -.hidden abi_test_bad_unwind_temporary -.align 16 -abi_test_bad_unwind_temporary: -.cfi_startproc -.Labi_test_bad_unwind_temporary_seh_begin: - pushq %r12 -.cfi_adjust_cfa_offset 8 -.cfi_offset %r12,-16 -.Labi_test_bad_unwind_temporary_seh_push_r12: - - movq %r12,%rax - incq %rax - movq %rax,(%rsp) - - - - movq %r12,(%rsp) - - - popq %r12 -.cfi_adjust_cfa_offset -8 -.cfi_restore %r12 - .byte 0xf3,0xc3 -.Labi_test_bad_unwind_temporary_seh_end: -.cfi_endproc -.size abi_test_bad_unwind_temporary,.-abi_test_bad_unwind_temporary - - - - -.type 
abi_test_set_direction_flag, @function -.globl abi_test_get_and_clear_direction_flag -.hidden abi_test_get_and_clear_direction_flag -abi_test_get_and_clear_direction_flag: - pushfq - popq %rax - andq $0x400,%rax - shrq $10,%rax - cld - .byte 0xf3,0xc3 -.size abi_test_get_and_clear_direction_flag,.-abi_test_get_and_clear_direction_flag - - - -.type abi_test_set_direction_flag, @function -.globl abi_test_set_direction_flag -.hidden abi_test_set_direction_flag -abi_test_set_direction_flag: - std - .byte 0xf3,0xc3 -.size abi_test_set_direction_flag,.-abi_test_set_direction_flag -#endif -.section .note.GNU-stack,"",@progbits diff --git a/third_party/boringssl/sources.cmake b/third_party/boringssl/sources.cmake index 1edab490..90437ae9 100644 --- a/third_party/boringssl/sources.cmake +++ b/third_party/boringssl/sources.cmake @@ -1,4 +1,4 @@ -# Copyright 2020 Google LLC +# Copyright 2026 Google LLC # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -18,432 +18,444 @@ # `tool/bump-boringssl-revision.sh` set(crypto_sources - ${BORINGSSL_ROOT}err_data.c - ${BORINGSSL_ROOT}src/crypto/asn1/a_bitstr.c - ${BORINGSSL_ROOT}src/crypto/asn1/a_bool.c - ${BORINGSSL_ROOT}src/crypto/asn1/a_d2i_fp.c - ${BORINGSSL_ROOT}src/crypto/asn1/a_dup.c - ${BORINGSSL_ROOT}src/crypto/asn1/a_gentm.c - ${BORINGSSL_ROOT}src/crypto/asn1/a_i2d_fp.c - ${BORINGSSL_ROOT}src/crypto/asn1/a_int.c - ${BORINGSSL_ROOT}src/crypto/asn1/a_mbstr.c - ${BORINGSSL_ROOT}src/crypto/asn1/a_object.c - ${BORINGSSL_ROOT}src/crypto/asn1/a_octet.c - ${BORINGSSL_ROOT}src/crypto/asn1/a_print.c - ${BORINGSSL_ROOT}src/crypto/asn1/a_strex.c - ${BORINGSSL_ROOT}src/crypto/asn1/a_strnid.c - ${BORINGSSL_ROOT}src/crypto/asn1/a_time.c - ${BORINGSSL_ROOT}src/crypto/asn1/a_type.c - ${BORINGSSL_ROOT}src/crypto/asn1/a_utctm.c - ${BORINGSSL_ROOT}src/crypto/asn1/a_utf8.c - ${BORINGSSL_ROOT}src/crypto/asn1/asn1_lib.c - ${BORINGSSL_ROOT}src/crypto/asn1/asn1_par.c - ${BORINGSSL_ROOT}src/crypto/asn1/asn_pack.c - ${BORINGSSL_ROOT}src/crypto/asn1/f_int.c - ${BORINGSSL_ROOT}src/crypto/asn1/f_string.c - ${BORINGSSL_ROOT}src/crypto/asn1/posix_time.c - ${BORINGSSL_ROOT}src/crypto/asn1/tasn_dec.c - ${BORINGSSL_ROOT}src/crypto/asn1/tasn_enc.c - ${BORINGSSL_ROOT}src/crypto/asn1/tasn_fre.c - ${BORINGSSL_ROOT}src/crypto/asn1/tasn_new.c - ${BORINGSSL_ROOT}src/crypto/asn1/tasn_typ.c - ${BORINGSSL_ROOT}src/crypto/asn1/tasn_utl.c - ${BORINGSSL_ROOT}src/crypto/base64/base64.c - ${BORINGSSL_ROOT}src/crypto/bio/bio.c - ${BORINGSSL_ROOT}src/crypto/bio/bio_mem.c - ${BORINGSSL_ROOT}src/crypto/bio/connect.c - ${BORINGSSL_ROOT}src/crypto/bio/fd.c - ${BORINGSSL_ROOT}src/crypto/bio/file.c - ${BORINGSSL_ROOT}src/crypto/bio/hexdump.c - ${BORINGSSL_ROOT}src/crypto/bio/pair.c - ${BORINGSSL_ROOT}src/crypto/bio/printf.c - ${BORINGSSL_ROOT}src/crypto/bio/socket.c - ${BORINGSSL_ROOT}src/crypto/bio/socket_helper.c - ${BORINGSSL_ROOT}src/crypto/blake2/blake2.c - ${BORINGSSL_ROOT}src/crypto/bn_extra/bn_asn1.c - 
${BORINGSSL_ROOT}src/crypto/bn_extra/convert.c - ${BORINGSSL_ROOT}src/crypto/buf/buf.c - ${BORINGSSL_ROOT}src/crypto/bytestring/asn1_compat.c - ${BORINGSSL_ROOT}src/crypto/bytestring/ber.c - ${BORINGSSL_ROOT}src/crypto/bytestring/cbb.c - ${BORINGSSL_ROOT}src/crypto/bytestring/cbs.c - ${BORINGSSL_ROOT}src/crypto/bytestring/unicode.c - ${BORINGSSL_ROOT}src/crypto/chacha/chacha.c - ${BORINGSSL_ROOT}src/crypto/cipher_extra/cipher_extra.c - ${BORINGSSL_ROOT}src/crypto/cipher_extra/derive_key.c - ${BORINGSSL_ROOT}src/crypto/cipher_extra/e_aesctrhmac.c - ${BORINGSSL_ROOT}src/crypto/cipher_extra/e_aesgcmsiv.c - ${BORINGSSL_ROOT}src/crypto/cipher_extra/e_chacha20poly1305.c - ${BORINGSSL_ROOT}src/crypto/cipher_extra/e_des.c - ${BORINGSSL_ROOT}src/crypto/cipher_extra/e_null.c - ${BORINGSSL_ROOT}src/crypto/cipher_extra/e_rc2.c - ${BORINGSSL_ROOT}src/crypto/cipher_extra/e_rc4.c - ${BORINGSSL_ROOT}src/crypto/cipher_extra/e_tls.c - ${BORINGSSL_ROOT}src/crypto/cipher_extra/tls_cbc.c - ${BORINGSSL_ROOT}src/crypto/conf/conf.c - ${BORINGSSL_ROOT}src/crypto/cpu_aarch64_apple.c - ${BORINGSSL_ROOT}src/crypto/cpu_aarch64_fuchsia.c - ${BORINGSSL_ROOT}src/crypto/cpu_aarch64_linux.c - ${BORINGSSL_ROOT}src/crypto/cpu_aarch64_win.c - ${BORINGSSL_ROOT}src/crypto/cpu_arm.c - ${BORINGSSL_ROOT}src/crypto/cpu_arm_linux.c - ${BORINGSSL_ROOT}src/crypto/cpu_intel.c - ${BORINGSSL_ROOT}src/crypto/cpu_ppc64le.c - ${BORINGSSL_ROOT}src/crypto/crypto.c - ${BORINGSSL_ROOT}src/crypto/curve25519/curve25519.c - ${BORINGSSL_ROOT}src/crypto/curve25519/spake25519.c - ${BORINGSSL_ROOT}src/crypto/des/des.c - ${BORINGSSL_ROOT}src/crypto/dh_extra/dh_asn1.c - ${BORINGSSL_ROOT}src/crypto/dh_extra/params.c - ${BORINGSSL_ROOT}src/crypto/digest_extra/digest_extra.c - ${BORINGSSL_ROOT}src/crypto/dsa/dsa.c - ${BORINGSSL_ROOT}src/crypto/dsa/dsa_asn1.c - ${BORINGSSL_ROOT}src/crypto/ec_extra/ec_asn1.c - ${BORINGSSL_ROOT}src/crypto/ec_extra/ec_derive.c - ${BORINGSSL_ROOT}src/crypto/ec_extra/hash_to_curve.c - 
${BORINGSSL_ROOT}src/crypto/ecdh_extra/ecdh_extra.c - ${BORINGSSL_ROOT}src/crypto/ecdsa_extra/ecdsa_asn1.c - ${BORINGSSL_ROOT}src/crypto/engine/engine.c - ${BORINGSSL_ROOT}src/crypto/err/err.c - ${BORINGSSL_ROOT}src/crypto/evp/evp.c - ${BORINGSSL_ROOT}src/crypto/evp/evp_asn1.c - ${BORINGSSL_ROOT}src/crypto/evp/evp_ctx.c - ${BORINGSSL_ROOT}src/crypto/evp/p_dsa_asn1.c - ${BORINGSSL_ROOT}src/crypto/evp/p_ec.c - ${BORINGSSL_ROOT}src/crypto/evp/p_ec_asn1.c - ${BORINGSSL_ROOT}src/crypto/evp/p_ed25519.c - ${BORINGSSL_ROOT}src/crypto/evp/p_ed25519_asn1.c - ${BORINGSSL_ROOT}src/crypto/evp/p_hkdf.c - ${BORINGSSL_ROOT}src/crypto/evp/p_rsa.c - ${BORINGSSL_ROOT}src/crypto/evp/p_rsa_asn1.c - ${BORINGSSL_ROOT}src/crypto/evp/p_x25519.c - ${BORINGSSL_ROOT}src/crypto/evp/p_x25519_asn1.c - ${BORINGSSL_ROOT}src/crypto/evp/pbkdf.c - ${BORINGSSL_ROOT}src/crypto/evp/print.c - ${BORINGSSL_ROOT}src/crypto/evp/scrypt.c - ${BORINGSSL_ROOT}src/crypto/evp/sign.c - ${BORINGSSL_ROOT}src/crypto/ex_data.c - ${BORINGSSL_ROOT}src/crypto/fipsmodule/bcm.c - ${BORINGSSL_ROOT}src/crypto/fipsmodule/fips_shared_support.c - ${BORINGSSL_ROOT}src/crypto/hkdf/hkdf.c - ${BORINGSSL_ROOT}src/crypto/hpke/hpke.c - ${BORINGSSL_ROOT}src/crypto/hrss/hrss.c - ${BORINGSSL_ROOT}src/crypto/lhash/lhash.c - ${BORINGSSL_ROOT}src/crypto/mem.c - ${BORINGSSL_ROOT}src/crypto/obj/obj.c - ${BORINGSSL_ROOT}src/crypto/obj/obj_xref.c - ${BORINGSSL_ROOT}src/crypto/pem/pem_all.c - ${BORINGSSL_ROOT}src/crypto/pem/pem_info.c - ${BORINGSSL_ROOT}src/crypto/pem/pem_lib.c - ${BORINGSSL_ROOT}src/crypto/pem/pem_oth.c - ${BORINGSSL_ROOT}src/crypto/pem/pem_pk8.c - ${BORINGSSL_ROOT}src/crypto/pem/pem_pkey.c - ${BORINGSSL_ROOT}src/crypto/pem/pem_x509.c - ${BORINGSSL_ROOT}src/crypto/pem/pem_xaux.c - ${BORINGSSL_ROOT}src/crypto/pkcs7/pkcs7.c - ${BORINGSSL_ROOT}src/crypto/pkcs7/pkcs7_x509.c - ${BORINGSSL_ROOT}src/crypto/pkcs8/p5_pbev2.c - ${BORINGSSL_ROOT}src/crypto/pkcs8/pkcs8.c - ${BORINGSSL_ROOT}src/crypto/pkcs8/pkcs8_x509.c - 
${BORINGSSL_ROOT}src/crypto/poly1305/poly1305.c - ${BORINGSSL_ROOT}src/crypto/poly1305/poly1305_arm.c - ${BORINGSSL_ROOT}src/crypto/poly1305/poly1305_vec.c - ${BORINGSSL_ROOT}src/crypto/pool/pool.c - ${BORINGSSL_ROOT}src/crypto/rand_extra/deterministic.c - ${BORINGSSL_ROOT}src/crypto/rand_extra/forkunsafe.c - ${BORINGSSL_ROOT}src/crypto/rand_extra/fuchsia.c - ${BORINGSSL_ROOT}src/crypto/rand_extra/passive.c - ${BORINGSSL_ROOT}src/crypto/rand_extra/rand_extra.c - ${BORINGSSL_ROOT}src/crypto/rand_extra/windows.c - ${BORINGSSL_ROOT}src/crypto/rc4/rc4.c - ${BORINGSSL_ROOT}src/crypto/refcount_c11.c - ${BORINGSSL_ROOT}src/crypto/refcount_lock.c - ${BORINGSSL_ROOT}src/crypto/rsa_extra/rsa_asn1.c - ${BORINGSSL_ROOT}src/crypto/rsa_extra/rsa_print.c - ${BORINGSSL_ROOT}src/crypto/siphash/siphash.c - ${BORINGSSL_ROOT}src/crypto/stack/stack.c - ${BORINGSSL_ROOT}src/crypto/thread.c - ${BORINGSSL_ROOT}src/crypto/thread_none.c - ${BORINGSSL_ROOT}src/crypto/thread_pthread.c - ${BORINGSSL_ROOT}src/crypto/thread_win.c - ${BORINGSSL_ROOT}src/crypto/trust_token/pmbtoken.c - ${BORINGSSL_ROOT}src/crypto/trust_token/trust_token.c - ${BORINGSSL_ROOT}src/crypto/trust_token/voprf.c - ${BORINGSSL_ROOT}src/crypto/x509/a_digest.c - ${BORINGSSL_ROOT}src/crypto/x509/a_sign.c - ${BORINGSSL_ROOT}src/crypto/x509/a_verify.c - ${BORINGSSL_ROOT}src/crypto/x509/algorithm.c - ${BORINGSSL_ROOT}src/crypto/x509/asn1_gen.c - ${BORINGSSL_ROOT}src/crypto/x509/by_dir.c - ${BORINGSSL_ROOT}src/crypto/x509/by_file.c - ${BORINGSSL_ROOT}src/crypto/x509/i2d_pr.c - ${BORINGSSL_ROOT}src/crypto/x509/name_print.c - ${BORINGSSL_ROOT}src/crypto/x509/rsa_pss.c - ${BORINGSSL_ROOT}src/crypto/x509/t_crl.c - ${BORINGSSL_ROOT}src/crypto/x509/t_req.c - ${BORINGSSL_ROOT}src/crypto/x509/t_x509.c - ${BORINGSSL_ROOT}src/crypto/x509/t_x509a.c - ${BORINGSSL_ROOT}src/crypto/x509/x509.c - ${BORINGSSL_ROOT}src/crypto/x509/x509_att.c - ${BORINGSSL_ROOT}src/crypto/x509/x509_cmp.c - ${BORINGSSL_ROOT}src/crypto/x509/x509_d2.c - 
${BORINGSSL_ROOT}src/crypto/x509/x509_def.c - ${BORINGSSL_ROOT}src/crypto/x509/x509_ext.c - ${BORINGSSL_ROOT}src/crypto/x509/x509_lu.c - ${BORINGSSL_ROOT}src/crypto/x509/x509_obj.c - ${BORINGSSL_ROOT}src/crypto/x509/x509_req.c - ${BORINGSSL_ROOT}src/crypto/x509/x509_set.c - ${BORINGSSL_ROOT}src/crypto/x509/x509_trs.c - ${BORINGSSL_ROOT}src/crypto/x509/x509_txt.c - ${BORINGSSL_ROOT}src/crypto/x509/x509_v3.c - ${BORINGSSL_ROOT}src/crypto/x509/x509_vfy.c - ${BORINGSSL_ROOT}src/crypto/x509/x509_vpm.c - ${BORINGSSL_ROOT}src/crypto/x509/x509cset.c - ${BORINGSSL_ROOT}src/crypto/x509/x509name.c - ${BORINGSSL_ROOT}src/crypto/x509/x509rset.c - ${BORINGSSL_ROOT}src/crypto/x509/x509spki.c - ${BORINGSSL_ROOT}src/crypto/x509/x_algor.c - ${BORINGSSL_ROOT}src/crypto/x509/x_all.c - ${BORINGSSL_ROOT}src/crypto/x509/x_attrib.c - ${BORINGSSL_ROOT}src/crypto/x509/x_crl.c - ${BORINGSSL_ROOT}src/crypto/x509/x_exten.c - ${BORINGSSL_ROOT}src/crypto/x509/x_info.c - ${BORINGSSL_ROOT}src/crypto/x509/x_name.c - ${BORINGSSL_ROOT}src/crypto/x509/x_pkey.c - ${BORINGSSL_ROOT}src/crypto/x509/x_pubkey.c - ${BORINGSSL_ROOT}src/crypto/x509/x_req.c - ${BORINGSSL_ROOT}src/crypto/x509/x_sig.c - ${BORINGSSL_ROOT}src/crypto/x509/x_spki.c - ${BORINGSSL_ROOT}src/crypto/x509/x_val.c - ${BORINGSSL_ROOT}src/crypto/x509/x_x509.c - ${BORINGSSL_ROOT}src/crypto/x509/x_x509a.c - ${BORINGSSL_ROOT}src/crypto/x509v3/pcy_cache.c - ${BORINGSSL_ROOT}src/crypto/x509v3/pcy_data.c - ${BORINGSSL_ROOT}src/crypto/x509v3/pcy_map.c - ${BORINGSSL_ROOT}src/crypto/x509v3/pcy_node.c - ${BORINGSSL_ROOT}src/crypto/x509v3/pcy_tree.c - ${BORINGSSL_ROOT}src/crypto/x509v3/v3_akey.c - ${BORINGSSL_ROOT}src/crypto/x509v3/v3_akeya.c - ${BORINGSSL_ROOT}src/crypto/x509v3/v3_alt.c - ${BORINGSSL_ROOT}src/crypto/x509v3/v3_bcons.c - ${BORINGSSL_ROOT}src/crypto/x509v3/v3_bitst.c - ${BORINGSSL_ROOT}src/crypto/x509v3/v3_conf.c - ${BORINGSSL_ROOT}src/crypto/x509v3/v3_cpols.c - ${BORINGSSL_ROOT}src/crypto/x509v3/v3_crld.c - 
${BORINGSSL_ROOT}src/crypto/x509v3/v3_enum.c - ${BORINGSSL_ROOT}src/crypto/x509v3/v3_extku.c - ${BORINGSSL_ROOT}src/crypto/x509v3/v3_genn.c - ${BORINGSSL_ROOT}src/crypto/x509v3/v3_ia5.c - ${BORINGSSL_ROOT}src/crypto/x509v3/v3_info.c - ${BORINGSSL_ROOT}src/crypto/x509v3/v3_int.c - ${BORINGSSL_ROOT}src/crypto/x509v3/v3_lib.c - ${BORINGSSL_ROOT}src/crypto/x509v3/v3_ncons.c - ${BORINGSSL_ROOT}src/crypto/x509v3/v3_ocsp.c - ${BORINGSSL_ROOT}src/crypto/x509v3/v3_pci.c - ${BORINGSSL_ROOT}src/crypto/x509v3/v3_pcia.c - ${BORINGSSL_ROOT}src/crypto/x509v3/v3_pcons.c - ${BORINGSSL_ROOT}src/crypto/x509v3/v3_pmaps.c - ${BORINGSSL_ROOT}src/crypto/x509v3/v3_prn.c - ${BORINGSSL_ROOT}src/crypto/x509v3/v3_purp.c - ${BORINGSSL_ROOT}src/crypto/x509v3/v3_skey.c - ${BORINGSSL_ROOT}src/crypto/x509v3/v3_utl.c + ${BORINGSSL_ROOT}src/crypto/aes/aes.cc + ${BORINGSSL_ROOT}src/crypto/asn1/a_bitstr.cc + ${BORINGSSL_ROOT}src/crypto/asn1/a_bool.cc + ${BORINGSSL_ROOT}src/crypto/asn1/a_d2i_fp.cc + ${BORINGSSL_ROOT}src/crypto/asn1/a_dup.cc + ${BORINGSSL_ROOT}src/crypto/asn1/a_gentm.cc + ${BORINGSSL_ROOT}src/crypto/asn1/a_i2d_fp.cc + ${BORINGSSL_ROOT}src/crypto/asn1/a_int.cc + ${BORINGSSL_ROOT}src/crypto/asn1/a_mbstr.cc + ${BORINGSSL_ROOT}src/crypto/asn1/a_object.cc + ${BORINGSSL_ROOT}src/crypto/asn1/a_octet.cc + ${BORINGSSL_ROOT}src/crypto/asn1/a_strex.cc + ${BORINGSSL_ROOT}src/crypto/asn1/a_strnid.cc + ${BORINGSSL_ROOT}src/crypto/asn1/a_time.cc + ${BORINGSSL_ROOT}src/crypto/asn1/a_type.cc + ${BORINGSSL_ROOT}src/crypto/asn1/a_utctm.cc + ${BORINGSSL_ROOT}src/crypto/asn1/asn1_lib.cc + ${BORINGSSL_ROOT}src/crypto/asn1/asn1_par.cc + ${BORINGSSL_ROOT}src/crypto/asn1/asn_pack.cc + ${BORINGSSL_ROOT}src/crypto/asn1/f_int.cc + ${BORINGSSL_ROOT}src/crypto/asn1/f_string.cc + ${BORINGSSL_ROOT}src/crypto/asn1/posix_time.cc + ${BORINGSSL_ROOT}src/crypto/asn1/tasn_dec.cc + ${BORINGSSL_ROOT}src/crypto/asn1/tasn_enc.cc + ${BORINGSSL_ROOT}src/crypto/asn1/tasn_fre.cc + ${BORINGSSL_ROOT}src/crypto/asn1/tasn_new.cc + 
${BORINGSSL_ROOT}src/crypto/asn1/tasn_typ.cc + ${BORINGSSL_ROOT}src/crypto/asn1/tasn_utl.cc + ${BORINGSSL_ROOT}src/crypto/base64/base64.cc + ${BORINGSSL_ROOT}src/crypto/bio/bio.cc + ${BORINGSSL_ROOT}src/crypto/bio/bio_mem.cc + ${BORINGSSL_ROOT}src/crypto/bio/connect.cc + ${BORINGSSL_ROOT}src/crypto/bio/errno.cc + ${BORINGSSL_ROOT}src/crypto/bio/fd.cc + ${BORINGSSL_ROOT}src/crypto/bio/file.cc + ${BORINGSSL_ROOT}src/crypto/bio/hexdump.cc + ${BORINGSSL_ROOT}src/crypto/bio/pair.cc + ${BORINGSSL_ROOT}src/crypto/bio/printf.cc + ${BORINGSSL_ROOT}src/crypto/bio/socket.cc + ${BORINGSSL_ROOT}src/crypto/bio/socket_helper.cc + ${BORINGSSL_ROOT}src/crypto/blake2/blake2.cc + ${BORINGSSL_ROOT}src/crypto/bn/bn_asn1.cc + ${BORINGSSL_ROOT}src/crypto/bn/convert.cc + ${BORINGSSL_ROOT}src/crypto/bn/div.cc + ${BORINGSSL_ROOT}src/crypto/bn/exponentiation.cc + ${BORINGSSL_ROOT}src/crypto/bn/sqrt.cc + ${BORINGSSL_ROOT}src/crypto/buf/buf.cc + ${BORINGSSL_ROOT}src/crypto/bytestring/asn1_compat.cc + ${BORINGSSL_ROOT}src/crypto/bytestring/ber.cc + ${BORINGSSL_ROOT}src/crypto/bytestring/cbb.cc + ${BORINGSSL_ROOT}src/crypto/bytestring/cbs.cc + ${BORINGSSL_ROOT}src/crypto/bytestring/unicode.cc + ${BORINGSSL_ROOT}src/crypto/chacha/chacha.cc + ${BORINGSSL_ROOT}src/crypto/cipher/derive_key.cc + ${BORINGSSL_ROOT}src/crypto/cipher/e_aesctrhmac.cc + ${BORINGSSL_ROOT}src/crypto/cipher/e_aeseax.cc + ${BORINGSSL_ROOT}src/crypto/cipher/e_aesgcmsiv.cc + ${BORINGSSL_ROOT}src/crypto/cipher/e_chacha20poly1305.cc + ${BORINGSSL_ROOT}src/crypto/cipher/e_des.cc + ${BORINGSSL_ROOT}src/crypto/cipher/e_null.cc + ${BORINGSSL_ROOT}src/crypto/cipher/e_rc2.cc + ${BORINGSSL_ROOT}src/crypto/cipher/e_rc4.cc + ${BORINGSSL_ROOT}src/crypto/cipher/e_tls.cc + ${BORINGSSL_ROOT}src/crypto/cipher/get_cipher.cc + ${BORINGSSL_ROOT}src/crypto/cipher/tls_cbc.cc + ${BORINGSSL_ROOT}src/crypto/cms/cms.cc + ${BORINGSSL_ROOT}src/crypto/conf/conf.cc + ${BORINGSSL_ROOT}src/crypto/cpu_aarch64_apple.cc + 
${BORINGSSL_ROOT}src/crypto/cpu_aarch64_fuchsia.cc + ${BORINGSSL_ROOT}src/crypto/cpu_aarch64_linux.cc + ${BORINGSSL_ROOT}src/crypto/cpu_aarch64_openbsd.cc + ${BORINGSSL_ROOT}src/crypto/cpu_aarch64_sysreg.cc + ${BORINGSSL_ROOT}src/crypto/cpu_aarch64_win.cc + ${BORINGSSL_ROOT}src/crypto/cpu_arm_freebsd.cc + ${BORINGSSL_ROOT}src/crypto/cpu_arm_linux.cc + ${BORINGSSL_ROOT}src/crypto/cpu_intel.cc + ${BORINGSSL_ROOT}src/crypto/crypto.cc + ${BORINGSSL_ROOT}src/crypto/curve25519/curve25519.cc + ${BORINGSSL_ROOT}src/crypto/curve25519/curve25519_64_adx.cc + ${BORINGSSL_ROOT}src/crypto/curve25519/spake25519.cc + ${BORINGSSL_ROOT}src/crypto/des/des.cc + ${BORINGSSL_ROOT}src/crypto/dh/dh_asn1.cc + ${BORINGSSL_ROOT}src/crypto/dh/params.cc + ${BORINGSSL_ROOT}src/crypto/digest/digest_extra.cc + ${BORINGSSL_ROOT}src/crypto/dsa/dsa.cc + ${BORINGSSL_ROOT}src/crypto/dsa/dsa_asn1.cc + ${BORINGSSL_ROOT}src/crypto/ec/ec_asn1.cc + ${BORINGSSL_ROOT}src/crypto/ec/ec_derive.cc + ${BORINGSSL_ROOT}src/crypto/ec/hash_to_curve.cc + ${BORINGSSL_ROOT}src/crypto/ecdh/ecdh.cc + ${BORINGSSL_ROOT}src/crypto/ecdsa/ecdsa_asn1.cc + ${BORINGSSL_ROOT}src/crypto/ecdsa/ecdsa_p1363.cc + ${BORINGSSL_ROOT}src/crypto/engine/engine.cc + ${BORINGSSL_ROOT}src/crypto/err/err.cc + ${BORINGSSL_ROOT}src/crypto/evp/evp.cc + ${BORINGSSL_ROOT}src/crypto/evp/evp_asn1.cc + ${BORINGSSL_ROOT}src/crypto/evp/evp_ctx.cc + ${BORINGSSL_ROOT}src/crypto/evp/evp_kem.cc + ${BORINGSSL_ROOT}src/crypto/evp/p_dh.cc + ${BORINGSSL_ROOT}src/crypto/evp/p_dsa.cc + ${BORINGSSL_ROOT}src/crypto/evp/p_ec.cc + ${BORINGSSL_ROOT}src/crypto/evp/p_ed25519.cc + ${BORINGSSL_ROOT}src/crypto/evp/p_hkdf.cc + ${BORINGSSL_ROOT}src/crypto/evp/p_mldsa.cc + ${BORINGSSL_ROOT}src/crypto/evp/p_mlkem.cc + ${BORINGSSL_ROOT}src/crypto/evp/p_rsa.cc + ${BORINGSSL_ROOT}src/crypto/evp/p_x25519.cc + ${BORINGSSL_ROOT}src/crypto/evp/p_xwing.cc + ${BORINGSSL_ROOT}src/crypto/evp/pbkdf.cc + ${BORINGSSL_ROOT}src/crypto/evp/print.cc + ${BORINGSSL_ROOT}src/crypto/evp/scrypt.cc + 
${BORINGSSL_ROOT}src/crypto/evp/sign.cc + ${BORINGSSL_ROOT}src/crypto/ex_data.cc + ${BORINGSSL_ROOT}src/crypto/fipsmodule/bcm.cc + ${BORINGSSL_ROOT}src/crypto/fipsmodule/fips_shared_support.cc + ${BORINGSSL_ROOT}src/crypto/fuzzer_mode.cc + ${BORINGSSL_ROOT}src/crypto/hpke/hpke.cc + ${BORINGSSL_ROOT}src/crypto/hrss/hrss.cc + ${BORINGSSL_ROOT}src/crypto/kyber/kyber.cc + ${BORINGSSL_ROOT}src/crypto/lhash/lhash.cc + ${BORINGSSL_ROOT}src/crypto/md4/md4.cc + ${BORINGSSL_ROOT}src/crypto/md5/md5.cc + ${BORINGSSL_ROOT}src/crypto/mem.cc + ${BORINGSSL_ROOT}src/crypto/mldsa/mldsa.cc + ${BORINGSSL_ROOT}src/crypto/mlkem/mlkem.cc + ${BORINGSSL_ROOT}src/crypto/obj/obj.cc + ${BORINGSSL_ROOT}src/crypto/obj/obj_xref.cc + ${BORINGSSL_ROOT}src/crypto/pem/pem_all.cc + ${BORINGSSL_ROOT}src/crypto/pem/pem_info.cc + ${BORINGSSL_ROOT}src/crypto/pem/pem_lib.cc + ${BORINGSSL_ROOT}src/crypto/pem/pem_oth.cc + ${BORINGSSL_ROOT}src/crypto/pem/pem_pk8.cc + ${BORINGSSL_ROOT}src/crypto/pem/pem_pkey.cc + ${BORINGSSL_ROOT}src/crypto/pem/pem_x509.cc + ${BORINGSSL_ROOT}src/crypto/pem/pem_xaux.cc + ${BORINGSSL_ROOT}src/crypto/pkcs7/pkcs7.cc + ${BORINGSSL_ROOT}src/crypto/pkcs7/pkcs7_x509.cc + ${BORINGSSL_ROOT}src/crypto/pkcs8/p5_pbev2.cc + ${BORINGSSL_ROOT}src/crypto/pkcs8/pkcs8.cc + ${BORINGSSL_ROOT}src/crypto/pkcs8/pkcs8_x509.cc + ${BORINGSSL_ROOT}src/crypto/poly1305/poly1305.cc + ${BORINGSSL_ROOT}src/crypto/poly1305/poly1305_arm.cc + ${BORINGSSL_ROOT}src/crypto/poly1305/poly1305_vec.cc + ${BORINGSSL_ROOT}src/crypto/pool/pool.cc + ${BORINGSSL_ROOT}src/crypto/rand/deterministic.cc + ${BORINGSSL_ROOT}src/crypto/rand/fork_detect.cc + ${BORINGSSL_ROOT}src/crypto/rand/forkunsafe.cc + ${BORINGSSL_ROOT}src/crypto/rand/getentropy.cc + ${BORINGSSL_ROOT}src/crypto/rand/ios.cc + ${BORINGSSL_ROOT}src/crypto/rand/passive.cc + ${BORINGSSL_ROOT}src/crypto/rand/rand.cc + ${BORINGSSL_ROOT}src/crypto/rand/trusty.cc + ${BORINGSSL_ROOT}src/crypto/rand/urandom.cc + ${BORINGSSL_ROOT}src/crypto/rand/windows.cc + 
${BORINGSSL_ROOT}src/crypto/rc4/rc4.cc + ${BORINGSSL_ROOT}src/crypto/refcount.cc + ${BORINGSSL_ROOT}src/crypto/rsa/rsa_asn1.cc + ${BORINGSSL_ROOT}src/crypto/rsa/rsa_crypt.cc + ${BORINGSSL_ROOT}src/crypto/rsa/rsa_extra.cc + ${BORINGSSL_ROOT}src/crypto/rsa/rsa_print.cc + ${BORINGSSL_ROOT}src/crypto/sha/sha1.cc + ${BORINGSSL_ROOT}src/crypto/sha/sha256.cc + ${BORINGSSL_ROOT}src/crypto/sha/sha512.cc + ${BORINGSSL_ROOT}src/crypto/siphash/siphash.cc + ${BORINGSSL_ROOT}src/crypto/slhdsa/slhdsa.cc + ${BORINGSSL_ROOT}src/crypto/spake2plus/spake2plus.cc + ${BORINGSSL_ROOT}src/crypto/stack/stack.cc + ${BORINGSSL_ROOT}src/crypto/thread.cc + ${BORINGSSL_ROOT}src/crypto/thread_none.cc + ${BORINGSSL_ROOT}src/crypto/thread_pthread.cc + ${BORINGSSL_ROOT}src/crypto/thread_win.cc + ${BORINGSSL_ROOT}src/crypto/trust_token/pmbtoken.cc + ${BORINGSSL_ROOT}src/crypto/trust_token/trust_token.cc + ${BORINGSSL_ROOT}src/crypto/trust_token/voprf.cc + ${BORINGSSL_ROOT}src/crypto/x509/a_digest.cc + ${BORINGSSL_ROOT}src/crypto/x509/a_sign.cc + ${BORINGSSL_ROOT}src/crypto/x509/a_verify.cc + ${BORINGSSL_ROOT}src/crypto/x509/algorithm.cc + ${BORINGSSL_ROOT}src/crypto/x509/asn1_gen.cc + ${BORINGSSL_ROOT}src/crypto/x509/by_dir.cc + ${BORINGSSL_ROOT}src/crypto/x509/by_file.cc + ${BORINGSSL_ROOT}src/crypto/x509/i2d_pr.cc + ${BORINGSSL_ROOT}src/crypto/x509/name_print.cc + ${BORINGSSL_ROOT}src/crypto/x509/policy.cc + ${BORINGSSL_ROOT}src/crypto/x509/rsa_pss.cc + ${BORINGSSL_ROOT}src/crypto/x509/t_crl.cc + ${BORINGSSL_ROOT}src/crypto/x509/t_req.cc + ${BORINGSSL_ROOT}src/crypto/x509/t_x509.cc + ${BORINGSSL_ROOT}src/crypto/x509/t_x509a.cc + ${BORINGSSL_ROOT}src/crypto/x509/v3_akey.cc + ${BORINGSSL_ROOT}src/crypto/x509/v3_akeya.cc + ${BORINGSSL_ROOT}src/crypto/x509/v3_alt.cc + ${BORINGSSL_ROOT}src/crypto/x509/v3_bcons.cc + ${BORINGSSL_ROOT}src/crypto/x509/v3_bitst.cc + ${BORINGSSL_ROOT}src/crypto/x509/v3_conf.cc + ${BORINGSSL_ROOT}src/crypto/x509/v3_cpols.cc + ${BORINGSSL_ROOT}src/crypto/x509/v3_crld.cc + 
${BORINGSSL_ROOT}src/crypto/x509/v3_enum.cc + ${BORINGSSL_ROOT}src/crypto/x509/v3_extku.cc + ${BORINGSSL_ROOT}src/crypto/x509/v3_genn.cc + ${BORINGSSL_ROOT}src/crypto/x509/v3_ia5.cc + ${BORINGSSL_ROOT}src/crypto/x509/v3_info.cc + ${BORINGSSL_ROOT}src/crypto/x509/v3_int.cc + ${BORINGSSL_ROOT}src/crypto/x509/v3_lib.cc + ${BORINGSSL_ROOT}src/crypto/x509/v3_ncons.cc + ${BORINGSSL_ROOT}src/crypto/x509/v3_ocsp.cc + ${BORINGSSL_ROOT}src/crypto/x509/v3_pcons.cc + ${BORINGSSL_ROOT}src/crypto/x509/v3_pmaps.cc + ${BORINGSSL_ROOT}src/crypto/x509/v3_prn.cc + ${BORINGSSL_ROOT}src/crypto/x509/v3_purp.cc + ${BORINGSSL_ROOT}src/crypto/x509/v3_skey.cc + ${BORINGSSL_ROOT}src/crypto/x509/v3_utl.cc + ${BORINGSSL_ROOT}src/crypto/x509/x509.cc + ${BORINGSSL_ROOT}src/crypto/x509/x509_att.cc + ${BORINGSSL_ROOT}src/crypto/x509/x509_cmp.cc + ${BORINGSSL_ROOT}src/crypto/x509/x509_d2.cc + ${BORINGSSL_ROOT}src/crypto/x509/x509_def.cc + ${BORINGSSL_ROOT}src/crypto/x509/x509_ext.cc + ${BORINGSSL_ROOT}src/crypto/x509/x509_lu.cc + ${BORINGSSL_ROOT}src/crypto/x509/x509_obj.cc + ${BORINGSSL_ROOT}src/crypto/x509/x509_req.cc + ${BORINGSSL_ROOT}src/crypto/x509/x509_set.cc + ${BORINGSSL_ROOT}src/crypto/x509/x509_trs.cc + ${BORINGSSL_ROOT}src/crypto/x509/x509_txt.cc + ${BORINGSSL_ROOT}src/crypto/x509/x509_v3.cc + ${BORINGSSL_ROOT}src/crypto/x509/x509_vfy.cc + ${BORINGSSL_ROOT}src/crypto/x509/x509_vpm.cc + ${BORINGSSL_ROOT}src/crypto/x509/x509cset.cc + ${BORINGSSL_ROOT}src/crypto/x509/x509name.cc + ${BORINGSSL_ROOT}src/crypto/x509/x509rset.cc + ${BORINGSSL_ROOT}src/crypto/x509/x509spki.cc + ${BORINGSSL_ROOT}src/crypto/x509/x_algor.cc + ${BORINGSSL_ROOT}src/crypto/x509/x_all.cc + ${BORINGSSL_ROOT}src/crypto/x509/x_attrib.cc + ${BORINGSSL_ROOT}src/crypto/x509/x_crl.cc + ${BORINGSSL_ROOT}src/crypto/x509/x_exten.cc + ${BORINGSSL_ROOT}src/crypto/x509/x_name.cc + ${BORINGSSL_ROOT}src/crypto/x509/x_pubkey.cc + ${BORINGSSL_ROOT}src/crypto/x509/x_req.cc + ${BORINGSSL_ROOT}src/crypto/x509/x_sig.cc + 
${BORINGSSL_ROOT}src/crypto/x509/x_spki.cc + ${BORINGSSL_ROOT}src/crypto/x509/x_x509.cc + ${BORINGSSL_ROOT}src/crypto/x509/x_x509a.cc + ${BORINGSSL_ROOT}src/crypto/xwing/xwing.cc + ${BORINGSSL_ROOT}src/gen/crypto/err_data.cc ) set(crypto_sources_apple_aarch64 - ${BORINGSSL_ROOT}apple-aarch64/crypto/chacha/chacha-armv8.S - ${BORINGSSL_ROOT}apple-aarch64/crypto/cipher_extra/chacha20_poly1305_armv8.S - ${BORINGSSL_ROOT}apple-aarch64/crypto/fipsmodule/aesv8-armx64.S - ${BORINGSSL_ROOT}apple-aarch64/crypto/fipsmodule/armv8-mont.S - ${BORINGSSL_ROOT}apple-aarch64/crypto/fipsmodule/ghash-neon-armv8.S - ${BORINGSSL_ROOT}apple-aarch64/crypto/fipsmodule/ghashv8-armx64.S - ${BORINGSSL_ROOT}apple-aarch64/crypto/fipsmodule/p256-armv8-asm.S - ${BORINGSSL_ROOT}apple-aarch64/crypto/fipsmodule/p256_beeu-armv8-asm.S - ${BORINGSSL_ROOT}apple-aarch64/crypto/fipsmodule/sha1-armv8.S - ${BORINGSSL_ROOT}apple-aarch64/crypto/fipsmodule/sha256-armv8.S - ${BORINGSSL_ROOT}apple-aarch64/crypto/fipsmodule/sha512-armv8.S - ${BORINGSSL_ROOT}apple-aarch64/crypto/fipsmodule/vpaes-armv8.S - ${BORINGSSL_ROOT}apple-aarch64/crypto/test/trampoline-armv8.S -) - -set(crypto_sources_apple_arm - ${BORINGSSL_ROOT}apple-arm/crypto/chacha/chacha-armv4.S - ${BORINGSSL_ROOT}apple-arm/crypto/fipsmodule/aesv8-armx32.S - ${BORINGSSL_ROOT}apple-arm/crypto/fipsmodule/armv4-mont.S - ${BORINGSSL_ROOT}apple-arm/crypto/fipsmodule/bsaes-armv7.S - ${BORINGSSL_ROOT}apple-arm/crypto/fipsmodule/ghash-armv4.S - ${BORINGSSL_ROOT}apple-arm/crypto/fipsmodule/ghashv8-armx32.S - ${BORINGSSL_ROOT}apple-arm/crypto/fipsmodule/sha1-armv4-large.S - ${BORINGSSL_ROOT}apple-arm/crypto/fipsmodule/sha256-armv4.S - ${BORINGSSL_ROOT}apple-arm/crypto/fipsmodule/sha512-armv4.S - ${BORINGSSL_ROOT}apple-arm/crypto/fipsmodule/vpaes-armv7.S - ${BORINGSSL_ROOT}apple-arm/crypto/test/trampoline-armv4.S + ${BORINGSSL_ROOT}gen/bcm/aesv8-armv8-apple.S + ${BORINGSSL_ROOT}gen/bcm/aesv8-gcm-armv8-apple.S + ${BORINGSSL_ROOT}gen/bcm/armv8-mont-apple.S + 
${BORINGSSL_ROOT}gen/bcm/bn-armv8-apple.S + ${BORINGSSL_ROOT}gen/bcm/ghash-neon-armv8-apple.S + ${BORINGSSL_ROOT}gen/bcm/ghashv8-armv8-apple.S + ${BORINGSSL_ROOT}gen/bcm/p256-armv8-asm-apple.S + ${BORINGSSL_ROOT}gen/bcm/p256_beeu-armv8-asm-apple.S + ${BORINGSSL_ROOT}gen/bcm/sha1-armv8-apple.S + ${BORINGSSL_ROOT}gen/bcm/sha256-armv8-apple.S + ${BORINGSSL_ROOT}gen/bcm/sha512-armv8-apple.S + ${BORINGSSL_ROOT}gen/bcm/vpaes-armv8-apple.S + ${BORINGSSL_ROOT}gen/crypto/chacha-armv8-apple.S + ${BORINGSSL_ROOT}gen/crypto/chacha20_poly1305_armv8-apple.S + ${BORINGSSL_ROOT}gen/test_support/trampoline-armv8-apple.S ) set(crypto_sources_apple_x86 - ${BORINGSSL_ROOT}apple-x86/crypto/chacha/chacha-x86.S - ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/aesni-x86.S - ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/bn-586.S - ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/co-586.S - ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/ghash-ssse3-x86.S - ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/ghash-x86.S - ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/md5-586.S - ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/sha1-586.S - ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/sha256-586.S - ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/sha512-586.S - ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/vpaes-x86.S - ${BORINGSSL_ROOT}apple-x86/crypto/fipsmodule/x86-mont.S - ${BORINGSSL_ROOT}apple-x86/crypto/test/trampoline-x86.S + ${BORINGSSL_ROOT}gen/bcm/aesni-x86-apple.S + ${BORINGSSL_ROOT}gen/bcm/ghash-ssse3-x86-apple.S + ${BORINGSSL_ROOT}gen/bcm/ghash-x86-apple.S + ${BORINGSSL_ROOT}gen/bcm/vpaes-x86-apple.S + ${BORINGSSL_ROOT}gen/crypto/chacha-x86-apple.S + ${BORINGSSL_ROOT}gen/test_support/trampoline-x86-apple.S ) set(crypto_sources_apple_x86_64 - ${BORINGSSL_ROOT}apple-x86_64/crypto/chacha/chacha-x86_64.S - ${BORINGSSL_ROOT}apple-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S - ${BORINGSSL_ROOT}apple-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S - 
${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S - ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/aesni-x86_64.S - ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S - ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/ghash-x86_64.S - ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/md5-x86_64.S - ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/p256-x86_64-asm.S - ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S - ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/rdrand-x86_64.S - ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/rsaz-avx2.S - ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/sha1-x86_64.S - ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/sha256-x86_64.S - ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/sha512-x86_64.S - ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/vpaes-x86_64.S - ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/x86_64-mont.S - ${BORINGSSL_ROOT}apple-x86_64/crypto/fipsmodule/x86_64-mont5.S - ${BORINGSSL_ROOT}apple-x86_64/crypto/test/trampoline-x86_64.S + ${BORINGSSL_ROOT}gen/bcm/aes-gcm-avx2-x86_64-apple.S + ${BORINGSSL_ROOT}gen/bcm/aes-gcm-avx512-x86_64-apple.S + ${BORINGSSL_ROOT}gen/bcm/aesni-gcm-x86_64-apple.S + ${BORINGSSL_ROOT}gen/bcm/aesni-x86_64-apple.S + ${BORINGSSL_ROOT}gen/bcm/bn-586-apple.S + ${BORINGSSL_ROOT}gen/bcm/co-586-apple.S + ${BORINGSSL_ROOT}gen/bcm/ghash-ssse3-x86_64-apple.S + ${BORINGSSL_ROOT}gen/bcm/ghash-x86_64-apple.S + ${BORINGSSL_ROOT}gen/bcm/p256-x86_64-asm-apple.S + ${BORINGSSL_ROOT}gen/bcm/p256_beeu-x86_64-asm-apple.S + ${BORINGSSL_ROOT}gen/bcm/rdrand-x86_64-apple.S + ${BORINGSSL_ROOT}gen/bcm/rsaz-avx2-apple.S + ${BORINGSSL_ROOT}gen/bcm/sha1-586-apple.S + ${BORINGSSL_ROOT}gen/bcm/sha1-x86_64-apple.S + ${BORINGSSL_ROOT}gen/bcm/sha256-586-apple.S + ${BORINGSSL_ROOT}gen/bcm/sha256-x86_64-apple.S + ${BORINGSSL_ROOT}gen/bcm/sha512-586-apple.S + ${BORINGSSL_ROOT}gen/bcm/sha512-x86_64-apple.S + ${BORINGSSL_ROOT}gen/bcm/vpaes-x86_64-apple.S + 
${BORINGSSL_ROOT}gen/bcm/x86-mont-apple.S + ${BORINGSSL_ROOT}gen/bcm/x86_64-mont-apple.S + ${BORINGSSL_ROOT}gen/bcm/x86_64-mont5-apple.S + ${BORINGSSL_ROOT}gen/crypto/aes128gcmsiv-x86_64-apple.S + ${BORINGSSL_ROOT}gen/crypto/chacha-x86_64-apple.S + ${BORINGSSL_ROOT}gen/crypto/chacha20_poly1305_x86_64-apple.S + ${BORINGSSL_ROOT}gen/crypto/md5-586-apple.S + ${BORINGSSL_ROOT}gen/crypto/md5-x86_64-apple.S + ${BORINGSSL_ROOT}gen/test_support/trampoline-x86_64-apple.S ) set(crypto_sources_linux_aarch64 - ${BORINGSSL_ROOT}linux-aarch64/crypto/chacha/chacha-armv8.S - ${BORINGSSL_ROOT}linux-aarch64/crypto/cipher_extra/chacha20_poly1305_armv8.S - ${BORINGSSL_ROOT}linux-aarch64/crypto/fipsmodule/aesv8-armx64.S - ${BORINGSSL_ROOT}linux-aarch64/crypto/fipsmodule/armv8-mont.S - ${BORINGSSL_ROOT}linux-aarch64/crypto/fipsmodule/ghash-neon-armv8.S - ${BORINGSSL_ROOT}linux-aarch64/crypto/fipsmodule/ghashv8-armx64.S - ${BORINGSSL_ROOT}linux-aarch64/crypto/fipsmodule/p256-armv8-asm.S - ${BORINGSSL_ROOT}linux-aarch64/crypto/fipsmodule/p256_beeu-armv8-asm.S - ${BORINGSSL_ROOT}linux-aarch64/crypto/fipsmodule/sha1-armv8.S - ${BORINGSSL_ROOT}linux-aarch64/crypto/fipsmodule/sha256-armv8.S - ${BORINGSSL_ROOT}linux-aarch64/crypto/fipsmodule/sha512-armv8.S - ${BORINGSSL_ROOT}linux-aarch64/crypto/fipsmodule/vpaes-armv8.S - ${BORINGSSL_ROOT}linux-aarch64/crypto/test/trampoline-armv8.S + ${BORINGSSL_ROOT}gen/bcm/aesv8-armv8-linux.S + ${BORINGSSL_ROOT}gen/bcm/aesv8-gcm-armv8-linux.S + ${BORINGSSL_ROOT}gen/bcm/armv8-mont-linux.S + ${BORINGSSL_ROOT}gen/bcm/bn-armv8-linux.S + ${BORINGSSL_ROOT}gen/bcm/ghash-neon-armv8-linux.S + ${BORINGSSL_ROOT}gen/bcm/ghashv8-armv8-linux.S + ${BORINGSSL_ROOT}gen/bcm/p256-armv8-asm-linux.S + ${BORINGSSL_ROOT}gen/bcm/p256_beeu-armv8-asm-linux.S + ${BORINGSSL_ROOT}gen/bcm/sha1-armv8-linux.S + ${BORINGSSL_ROOT}gen/bcm/sha256-armv8-linux.S + ${BORINGSSL_ROOT}gen/bcm/sha512-armv8-linux.S + ${BORINGSSL_ROOT}gen/bcm/vpaes-armv8-linux.S + 
${BORINGSSL_ROOT}gen/crypto/chacha-armv8-linux.S + ${BORINGSSL_ROOT}gen/crypto/chacha20_poly1305_armv8-linux.S + ${BORINGSSL_ROOT}gen/test_support/trampoline-armv8-linux.S ) set(crypto_sources_linux_arm - ${BORINGSSL_ROOT}linux-arm/crypto/chacha/chacha-armv4.S - ${BORINGSSL_ROOT}linux-arm/crypto/fipsmodule/aesv8-armx32.S - ${BORINGSSL_ROOT}linux-arm/crypto/fipsmodule/armv4-mont.S - ${BORINGSSL_ROOT}linux-arm/crypto/fipsmodule/bsaes-armv7.S - ${BORINGSSL_ROOT}linux-arm/crypto/fipsmodule/ghash-armv4.S - ${BORINGSSL_ROOT}linux-arm/crypto/fipsmodule/ghashv8-armx32.S - ${BORINGSSL_ROOT}linux-arm/crypto/fipsmodule/sha1-armv4-large.S - ${BORINGSSL_ROOT}linux-arm/crypto/fipsmodule/sha256-armv4.S - ${BORINGSSL_ROOT}linux-arm/crypto/fipsmodule/sha512-armv4.S - ${BORINGSSL_ROOT}linux-arm/crypto/fipsmodule/vpaes-armv7.S - ${BORINGSSL_ROOT}linux-arm/crypto/test/trampoline-armv4.S - ${BORINGSSL_ROOT}src/crypto/curve25519/asm/x25519-asm-arm.S - ${BORINGSSL_ROOT}src/crypto/poly1305/poly1305_arm_asm.S -) - -set(crypto_sources_linux_ppc64le - ${BORINGSSL_ROOT}linux-ppc64le/crypto/fipsmodule/aesp8-ppc.S - ${BORINGSSL_ROOT}linux-ppc64le/crypto/fipsmodule/ghashp8-ppc.S - ${BORINGSSL_ROOT}linux-ppc64le/crypto/test/trampoline-ppc.S + ${BORINGSSL_ROOT}crypto/curve25519/asm/x25519-asm-arm.S + ${BORINGSSL_ROOT}crypto/poly1305/poly1305_arm_asm.S + ${BORINGSSL_ROOT}gen/bcm/aesv8-armv7-linux.S + ${BORINGSSL_ROOT}gen/bcm/armv4-mont-linux.S + ${BORINGSSL_ROOT}gen/bcm/bsaes-armv7-linux.S + ${BORINGSSL_ROOT}gen/bcm/ghash-armv4-linux.S + ${BORINGSSL_ROOT}gen/bcm/ghashv8-armv7-linux.S + ${BORINGSSL_ROOT}gen/bcm/sha1-armv4-large-linux.S + ${BORINGSSL_ROOT}gen/bcm/sha256-armv4-linux.S + ${BORINGSSL_ROOT}gen/bcm/sha512-armv4-linux.S + ${BORINGSSL_ROOT}gen/bcm/vpaes-armv7-linux.S + ${BORINGSSL_ROOT}gen/crypto/chacha-armv4-linux.S + ${BORINGSSL_ROOT}gen/test_support/trampoline-armv4-linux.S ) set(crypto_sources_linux_x86 - ${BORINGSSL_ROOT}linux-x86/crypto/chacha/chacha-x86.S - 
${BORINGSSL_ROOT}linux-x86/crypto/fipsmodule/aesni-x86.S - ${BORINGSSL_ROOT}linux-x86/crypto/fipsmodule/bn-586.S - ${BORINGSSL_ROOT}linux-x86/crypto/fipsmodule/co-586.S - ${BORINGSSL_ROOT}linux-x86/crypto/fipsmodule/ghash-ssse3-x86.S - ${BORINGSSL_ROOT}linux-x86/crypto/fipsmodule/ghash-x86.S - ${BORINGSSL_ROOT}linux-x86/crypto/fipsmodule/md5-586.S - ${BORINGSSL_ROOT}linux-x86/crypto/fipsmodule/sha1-586.S - ${BORINGSSL_ROOT}linux-x86/crypto/fipsmodule/sha256-586.S - ${BORINGSSL_ROOT}linux-x86/crypto/fipsmodule/sha512-586.S - ${BORINGSSL_ROOT}linux-x86/crypto/fipsmodule/vpaes-x86.S - ${BORINGSSL_ROOT}linux-x86/crypto/fipsmodule/x86-mont.S - ${BORINGSSL_ROOT}linux-x86/crypto/test/trampoline-x86.S + ${BORINGSSL_ROOT}gen/bcm/aesni-x86-linux.S + ${BORINGSSL_ROOT}gen/bcm/bn-586-linux.S + ${BORINGSSL_ROOT}gen/bcm/co-586-linux.S + ${BORINGSSL_ROOT}gen/bcm/ghash-ssse3-x86-linux.S + ${BORINGSSL_ROOT}gen/bcm/ghash-x86-linux.S + ${BORINGSSL_ROOT}gen/bcm/sha1-586-linux.S + ${BORINGSSL_ROOT}gen/bcm/sha256-586-linux.S + ${BORINGSSL_ROOT}gen/bcm/sha512-586-linux.S + ${BORINGSSL_ROOT}gen/bcm/vpaes-x86-linux.S + ${BORINGSSL_ROOT}gen/crypto/chacha-x86-linux.S + ${BORINGSSL_ROOT}gen/crypto/md5-586-linux.S + ${BORINGSSL_ROOT}gen/test_support/trampoline-x86-linux.S ) set(crypto_sources_linux_x86_64 - ${BORINGSSL_ROOT}linux-x86_64/crypto/chacha/chacha-x86_64.S - ${BORINGSSL_ROOT}linux-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.S - ${BORINGSSL_ROOT}linux-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.S - ${BORINGSSL_ROOT}linux-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.S - ${BORINGSSL_ROOT}linux-x86_64/crypto/fipsmodule/aesni-x86_64.S - ${BORINGSSL_ROOT}linux-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.S - ${BORINGSSL_ROOT}linux-x86_64/crypto/fipsmodule/ghash-x86_64.S - ${BORINGSSL_ROOT}linux-x86_64/crypto/fipsmodule/md5-x86_64.S - ${BORINGSSL_ROOT}linux-x86_64/crypto/fipsmodule/p256-x86_64-asm.S - ${BORINGSSL_ROOT}linux-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.S - 
${BORINGSSL_ROOT}linux-x86_64/crypto/fipsmodule/rdrand-x86_64.S - ${BORINGSSL_ROOT}linux-x86_64/crypto/fipsmodule/rsaz-avx2.S - ${BORINGSSL_ROOT}linux-x86_64/crypto/fipsmodule/sha1-x86_64.S - ${BORINGSSL_ROOT}linux-x86_64/crypto/fipsmodule/sha256-x86_64.S - ${BORINGSSL_ROOT}linux-x86_64/crypto/fipsmodule/sha512-x86_64.S - ${BORINGSSL_ROOT}linux-x86_64/crypto/fipsmodule/vpaes-x86_64.S - ${BORINGSSL_ROOT}linux-x86_64/crypto/fipsmodule/x86_64-mont.S - ${BORINGSSL_ROOT}linux-x86_64/crypto/fipsmodule/x86_64-mont5.S - ${BORINGSSL_ROOT}linux-x86_64/crypto/test/trampoline-x86_64.S - ${BORINGSSL_ROOT}src/crypto/hrss/asm/poly_rq_mul.S + ${BORINGSSL_ROOT}crypto/hrss/asm/poly_rq_mul.S + ${BORINGSSL_ROOT}gen/bcm/aes-gcm-avx2-x86_64-linux.S + ${BORINGSSL_ROOT}gen/bcm/aes-gcm-avx512-x86_64-linux.S + ${BORINGSSL_ROOT}gen/bcm/aesni-gcm-x86_64-linux.S + ${BORINGSSL_ROOT}gen/bcm/aesni-x86_64-linux.S + ${BORINGSSL_ROOT}gen/bcm/ghash-ssse3-x86_64-linux.S + ${BORINGSSL_ROOT}gen/bcm/ghash-x86_64-linux.S + ${BORINGSSL_ROOT}gen/bcm/p256-x86_64-asm-linux.S + ${BORINGSSL_ROOT}gen/bcm/p256_beeu-x86_64-asm-linux.S + ${BORINGSSL_ROOT}gen/bcm/rdrand-x86_64-linux.S + ${BORINGSSL_ROOT}gen/bcm/rsaz-avx2-linux.S + ${BORINGSSL_ROOT}gen/bcm/sha1-x86_64-linux.S + ${BORINGSSL_ROOT}gen/bcm/sha256-x86_64-linux.S + ${BORINGSSL_ROOT}gen/bcm/sha512-x86_64-linux.S + ${BORINGSSL_ROOT}gen/bcm/vpaes-x86_64-linux.S + ${BORINGSSL_ROOT}gen/bcm/x86-mont-linux.S + ${BORINGSSL_ROOT}gen/bcm/x86_64-mont-linux.S + ${BORINGSSL_ROOT}gen/bcm/x86_64-mont5-linux.S + ${BORINGSSL_ROOT}gen/crypto/aes128gcmsiv-x86_64-linux.S + ${BORINGSSL_ROOT}gen/crypto/chacha-x86_64-linux.S + ${BORINGSSL_ROOT}gen/crypto/chacha20_poly1305_x86_64-linux.S + ${BORINGSSL_ROOT}gen/crypto/md5-x86_64-linux.S + ${BORINGSSL_ROOT}gen/test_support/trampoline-x86_64-linux.S + ${BORINGSSL_ROOT}third_party/fiat/asm/fiat_curve25519_adx_mul.S + ${BORINGSSL_ROOT}third_party/fiat/asm/fiat_curve25519_adx_square.S + 
${BORINGSSL_ROOT}third_party/fiat/asm/fiat_p256_adx_mul.S + ${BORINGSSL_ROOT}third_party/fiat/asm/fiat_p256_adx_sqr.S ) set(crypto_sources_win_aarch64 - ${BORINGSSL_ROOT}win-aarch64/crypto/chacha/chacha-armv8.S - ${BORINGSSL_ROOT}win-aarch64/crypto/cipher_extra/chacha20_poly1305_armv8.S - ${BORINGSSL_ROOT}win-aarch64/crypto/fipsmodule/aesv8-armx64.S - ${BORINGSSL_ROOT}win-aarch64/crypto/fipsmodule/armv8-mont.S - ${BORINGSSL_ROOT}win-aarch64/crypto/fipsmodule/ghash-neon-armv8.S - ${BORINGSSL_ROOT}win-aarch64/crypto/fipsmodule/ghashv8-armx64.S - ${BORINGSSL_ROOT}win-aarch64/crypto/fipsmodule/p256-armv8-asm.S - ${BORINGSSL_ROOT}win-aarch64/crypto/fipsmodule/p256_beeu-armv8-asm.S - ${BORINGSSL_ROOT}win-aarch64/crypto/fipsmodule/sha1-armv8.S - ${BORINGSSL_ROOT}win-aarch64/crypto/fipsmodule/sha256-armv8.S - ${BORINGSSL_ROOT}win-aarch64/crypto/fipsmodule/sha512-armv8.S - ${BORINGSSL_ROOT}win-aarch64/crypto/fipsmodule/vpaes-armv8.S - ${BORINGSSL_ROOT}win-aarch64/crypto/test/trampoline-armv8.S + ${BORINGSSL_ROOT}gen/bcm/aesv8-armv8-win.S + ${BORINGSSL_ROOT}gen/bcm/aesv8-gcm-armv8-win.S + ${BORINGSSL_ROOT}gen/bcm/armv8-mont-win.S + ${BORINGSSL_ROOT}gen/bcm/bn-armv8-win.S + ${BORINGSSL_ROOT}gen/bcm/ghash-neon-armv8-win.S + ${BORINGSSL_ROOT}gen/bcm/ghashv8-armv8-win.S + ${BORINGSSL_ROOT}gen/bcm/p256-armv8-asm-win.S + ${BORINGSSL_ROOT}gen/bcm/p256_beeu-armv8-asm-win.S + ${BORINGSSL_ROOT}gen/bcm/sha1-armv8-win.S + ${BORINGSSL_ROOT}gen/bcm/sha256-armv8-win.S + ${BORINGSSL_ROOT}gen/bcm/sha512-armv8-win.S + ${BORINGSSL_ROOT}gen/bcm/vpaes-armv8-win.S + ${BORINGSSL_ROOT}gen/crypto/chacha-armv8-win.S + ${BORINGSSL_ROOT}gen/crypto/chacha20_poly1305_armv8-win.S + ${BORINGSSL_ROOT}gen/test_support/trampoline-armv8-win.S ) set(crypto_sources_win_x86 - ${BORINGSSL_ROOT}win-x86/crypto/chacha/chacha-x86.asm - ${BORINGSSL_ROOT}win-x86/crypto/fipsmodule/aesni-x86.asm - ${BORINGSSL_ROOT}win-x86/crypto/fipsmodule/bn-586.asm - ${BORINGSSL_ROOT}win-x86/crypto/fipsmodule/co-586.asm - 
${BORINGSSL_ROOT}win-x86/crypto/fipsmodule/ghash-ssse3-x86.asm - ${BORINGSSL_ROOT}win-x86/crypto/fipsmodule/ghash-x86.asm - ${BORINGSSL_ROOT}win-x86/crypto/fipsmodule/md5-586.asm - ${BORINGSSL_ROOT}win-x86/crypto/fipsmodule/sha1-586.asm - ${BORINGSSL_ROOT}win-x86/crypto/fipsmodule/sha256-586.asm - ${BORINGSSL_ROOT}win-x86/crypto/fipsmodule/sha512-586.asm - ${BORINGSSL_ROOT}win-x86/crypto/fipsmodule/vpaes-x86.asm - ${BORINGSSL_ROOT}win-x86/crypto/fipsmodule/x86-mont.asm - ${BORINGSSL_ROOT}win-x86/crypto/test/trampoline-x86.asm + ${BORINGSSL_ROOT}gen/bcm/aesni-x86-win.asm + ${BORINGSSL_ROOT}gen/bcm/bn-586-win.asm + ${BORINGSSL_ROOT}gen/bcm/co-586-win.asm + ${BORINGSSL_ROOT}gen/bcm/ghash-ssse3-x86-win.asm + ${BORINGSSL_ROOT}gen/bcm/ghash-x86-win.asm + ${BORINGSSL_ROOT}gen/bcm/sha1-586-win.asm + ${BORINGSSL_ROOT}gen/bcm/sha256-586-win.asm + ${BORINGSSL_ROOT}gen/bcm/sha512-586-win.asm + ${BORINGSSL_ROOT}gen/bcm/vpaes-x86-win.asm + ${BORINGSSL_ROOT}gen/crypto/chacha-x86-win.asm + ${BORINGSSL_ROOT}gen/crypto/md5-586-win.asm + ${BORINGSSL_ROOT}gen/test_support/trampoline-x86-win.asm ) set(crypto_sources_win_x86_64 - ${BORINGSSL_ROOT}win-x86_64/crypto/chacha/chacha-x86_64.asm - ${BORINGSSL_ROOT}win-x86_64/crypto/cipher_extra/aes128gcmsiv-x86_64.asm - ${BORINGSSL_ROOT}win-x86_64/crypto/cipher_extra/chacha20_poly1305_x86_64.asm - ${BORINGSSL_ROOT}win-x86_64/crypto/fipsmodule/aesni-gcm-x86_64.asm - ${BORINGSSL_ROOT}win-x86_64/crypto/fipsmodule/aesni-x86_64.asm - ${BORINGSSL_ROOT}win-x86_64/crypto/fipsmodule/ghash-ssse3-x86_64.asm - ${BORINGSSL_ROOT}win-x86_64/crypto/fipsmodule/ghash-x86_64.asm - ${BORINGSSL_ROOT}win-x86_64/crypto/fipsmodule/md5-x86_64.asm - ${BORINGSSL_ROOT}win-x86_64/crypto/fipsmodule/p256-x86_64-asm.asm - ${BORINGSSL_ROOT}win-x86_64/crypto/fipsmodule/p256_beeu-x86_64-asm.asm - ${BORINGSSL_ROOT}win-x86_64/crypto/fipsmodule/rdrand-x86_64.asm - ${BORINGSSL_ROOT}win-x86_64/crypto/fipsmodule/rsaz-avx2.asm - 
${BORINGSSL_ROOT}win-x86_64/crypto/fipsmodule/sha1-x86_64.asm - ${BORINGSSL_ROOT}win-x86_64/crypto/fipsmodule/sha256-x86_64.asm - ${BORINGSSL_ROOT}win-x86_64/crypto/fipsmodule/sha512-x86_64.asm - ${BORINGSSL_ROOT}win-x86_64/crypto/fipsmodule/vpaes-x86_64.asm - ${BORINGSSL_ROOT}win-x86_64/crypto/fipsmodule/x86_64-mont.asm - ${BORINGSSL_ROOT}win-x86_64/crypto/fipsmodule/x86_64-mont5.asm - ${BORINGSSL_ROOT}win-x86_64/crypto/test/trampoline-x86_64.asm + ${BORINGSSL_ROOT}gen/bcm/aes-gcm-avx2-x86_64-win.asm + ${BORINGSSL_ROOT}gen/bcm/aes-gcm-avx512-x86_64-win.asm + ${BORINGSSL_ROOT}gen/bcm/aesni-gcm-x86_64-win.asm + ${BORINGSSL_ROOT}gen/bcm/aesni-x86_64-win.asm + ${BORINGSSL_ROOT}gen/bcm/ghash-ssse3-x86_64-win.asm + ${BORINGSSL_ROOT}gen/bcm/ghash-x86_64-win.asm + ${BORINGSSL_ROOT}gen/bcm/p256-x86_64-asm-win.asm + ${BORINGSSL_ROOT}gen/bcm/p256_beeu-x86_64-asm-win.asm + ${BORINGSSL_ROOT}gen/bcm/rdrand-x86_64-win.asm + ${BORINGSSL_ROOT}gen/bcm/rsaz-avx2-win.asm + ${BORINGSSL_ROOT}gen/bcm/sha1-x86_64-win.asm + ${BORINGSSL_ROOT}gen/bcm/sha256-x86_64-win.asm + ${BORINGSSL_ROOT}gen/bcm/sha512-x86_64-win.asm + ${BORINGSSL_ROOT}gen/bcm/vpaes-x86_64-win.asm + ${BORINGSSL_ROOT}gen/bcm/x86-mont-win.asm + ${BORINGSSL_ROOT}gen/bcm/x86_64-mont-win.asm + ${BORINGSSL_ROOT}gen/bcm/x86_64-mont5-win.asm + ${BORINGSSL_ROOT}gen/crypto/aes128gcmsiv-x86_64-win.asm + ${BORINGSSL_ROOT}gen/crypto/chacha-x86_64-win.asm + ${BORINGSSL_ROOT}gen/crypto/chacha20_poly1305_x86_64-win.asm + ${BORINGSSL_ROOT}gen/crypto/md5-x86_64-win.asm + ${BORINGSSL_ROOT}gen/test_support/trampoline-x86_64-win.asm ) diff --git a/third_party/boringssl/src/INCORPORATING.md b/third_party/boringssl/src/INCORPORATING.md deleted file mode 100644 index 96bf9088..00000000 --- a/third_party/boringssl/src/INCORPORATING.md +++ /dev/null @@ -1,108 +0,0 @@ -# Incorporating BoringSSL into a project - -**Note**: if your target project is not a Google project then first read the -[main README](/README.md) about the purpose of BoringSSL. 
- -## Bazel - -If you are using [Bazel](https://bazel.build) then you can incorporate -BoringSSL as an external repository by using a commit from the -`master-with-bazel` branch. That branch is maintained by a bot from `master` -and includes the needed generated files and a top-level BUILD file. - -For example: - - git_repository( - name = "boringssl", - commit = "_some commit_", - remote = "https://boringssl.googlesource.com/boringssl", - ) - -You would still need to keep the referenced commit up to date if a specific -commit is referred to. - -## Directory layout - -Typically projects create a `third_party/boringssl` directory to put -BoringSSL-specific files into. The source code of BoringSSL itself goes into -`third_party/boringssl/src`, either by copying or as a -[submodule](https://git-scm.com/docs/git-submodule). - -It's generally a mistake to put BoringSSL's source code into -`third_party/boringssl` directly because pre-built files and custom build files -need to go somewhere and merging these with the BoringSSL source code makes -updating things more complex. - -## Build support - -BoringSSL is designed to work with many different build systems. Currently, -different projects use [GYP](https://gyp.gsrc.io/), -[GN](https://gn.googlesource.com/gn/+/master/docs/quick_start.md), -[Bazel](https://bazel.build/) and [Make](https://www.gnu.org/software/make/) to -build BoringSSL, without too much pain. - -The development build system is CMake and the CMake build knows how to -automatically generate the intermediate files that BoringSSL needs. However, -outside of the CMake environment, these intermediates are generated once and -checked into the incorporating project's source repository. This avoids -incorporating projects needing to support Perl and Go in their build systems. - -The script [`util/generate_build_files.py`](/util/generate_build_files.py) -expects to be run from the `third_party/boringssl` directory and to find the -BoringSSL source code in `src/`. 
You should pass it a single argument: the name -of the build system that you're using. If you don't use any of the supported -build systems then you should augment `generate_build_files.py` with support -for it. - -The script will pregenerate the intermediate files (see -[BUILDING.md](/BUILDING.md) for details about which tools will need to be -installed) and output helper files for that build system. It doesn't generate a -complete build script, just file and test lists, which change often. For -example, see the -[file](https://code.google.com/p/chromium/codesearch#chromium/src/third_party/boringssl/BUILD.generated.gni) -and -[test](https://code.google.com/p/chromium/codesearch#chromium/src/third_party/boringssl/BUILD.generated_tests.gni) -lists generated for GN in Chromium. - -Generally one checks in these generated files alongside the hand-written build -files. Periodically an engineer updates the BoringSSL revision, regenerates -these files and checks in the updated result. As an example, see how this is -done [in Chromium](https://code.google.com/p/chromium/codesearch#chromium/src/third_party/boringssl/). - -## Defines - -BoringSSL does not present a lot of configurability in order to reduce the -number of configurations that need to be tested. But there are a couple of -\#defines that you may wish to set: - -`OPENSSL_NO_ASM` prevents the use of assembly code (although it's up to you to -ensure that the build system doesn't link it in if you wish to reduce binary -size). This will have a significant performance impact but can be useful if you -wish to use tools like -[AddressSanitizer](http://clang.llvm.org/docs/AddressSanitizer.html) that -interact poorly with assembly code. - -`OPENSSL_SMALL` removes some code that is especially large at some performance -cost. - -## Symbols - -You cannot link multiple versions of BoringSSL or OpenSSL into a single binary -without dealing with symbol conflicts. 
If you are statically linking multiple -versions together, there's not a lot that can be done because C doesn't have a -module system. - -If you are using multiple versions in a single binary, in different shared -objects, ensure you build BoringSSL with `-fvisibility=hidden` and do not -export any of BoringSSL's symbols. This will prevent any collisions with other -verisons that may be included in other shared objects. Note that this requires -that all callers of BoringSSL APIs live in the same shared object as BoringSSL. - -If you require that BoringSSL APIs be used across shared object boundaries, -continue to build with `-fvisibility=hidden` but define -`BORINGSSL_SHARED_LIBRARY` in both BoringSSL and consumers. BoringSSL's own -source files (but *not* consumers' source files) must also build with -`BORINGSSL_IMPLEMENTATION` defined. This will export BoringSSL's public symbols -in the resulting shared object while hiding private symbols. However note that, -as with a static link, this precludes dynamically linking with another version -of BoringSSL or OpenSSL. diff --git a/third_party/boringssl/src/LICENSE b/third_party/boringssl/src/LICENSE deleted file mode 100644 index 49c41fa7..00000000 --- a/third_party/boringssl/src/LICENSE +++ /dev/null @@ -1,251 +0,0 @@ -BoringSSL is a fork of OpenSSL. As such, large parts of it fall under OpenSSL -licensing. Files that are completely new have a Google copyright and an ISC -license. This license is reproduced at the bottom of this file. - -Contributors to BoringSSL are required to follow the CLA rules for Chromium: -https://cla.developers.google.com/clas - -Files in third_party/ have their own licenses, as described therein. The MIT -license, for third_party/fiat, which, unlike other third_party directories, is -compiled into non-test libraries, is included below. - -The OpenSSL toolkit stays under a dual license, i.e. both the conditions of the -OpenSSL License and the original SSLeay license apply to the toolkit. 
See below -for the actual license texts. Actually both licenses are BSD-style Open Source -licenses. In case of any license issues related to OpenSSL please contact -openssl-core@openssl.org. - -The following are Google-internal bug numbers where explicit permission from -some authors is recorded for use of their work. (This is purely for our own -record keeping.) - 27287199 - 27287880 - 27287883 - - OpenSSL License - --------------- - -/* ==================================================================== - * Copyright (c) 1998-2011 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. 
Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). - * - */ - - Original SSLeay License - ----------------------- - -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. 
The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] - */ - - -ISC license used for completely new code in BoringSSL: - -/* Copyright (c) 2015, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - - -The code in third_party/fiat carries the MIT license: - -Copyright (c) 2015-2016 the fiat-crypto authors (see -https://github.com/mit-plv/fiat-crypto/blob/master/AUTHORS). 
- -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in all -copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE -SOFTWARE. - - -Licenses for support code -------------------------- - -Parts of the TLS test suite are under the Go license. This code is not included -in BoringSSL (i.e. libcrypto and libssl) when compiled, however, so -distributing code linked against BoringSSL does not trigger this license: - -Copyright (c) 2009 The Go Authors. All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. 
nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. - -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - - -BoringSSL uses the Chromium test infrastructure to run a continuous build, -trybots etc. The scripts which manage this, and the script for generating build -metadata, are under the Chromium license. Distributing code linked against -BoringSSL does not trigger this license. - -Copyright 2015 The Chromium Authors. All rights reserved. - -Redistribution and use in source and binary forms, with or without -modification, are permitted provided that the following conditions are -met: - - * Redistributions of source code must retain the above copyright -notice, this list of conditions and the following disclaimer. - * Redistributions in binary form must reproduce the above -copyright notice, this list of conditions and the following disclaimer -in the documentation and/or other materials provided with the -distribution. - * Neither the name of Google Inc. nor the names of its -contributors may be used to endorse or promote products derived from -this software without specific prior written permission. 
- -THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/third_party/boringssl/src/README.md b/third_party/boringssl/src/README.md deleted file mode 100644 index 2a99b60b..00000000 --- a/third_party/boringssl/src/README.md +++ /dev/null @@ -1,42 +0,0 @@ -# BoringSSL - -BoringSSL is a fork of OpenSSL that is designed to meet Google's needs. - -Although BoringSSL is an open source project, it is not intended for general -use, as OpenSSL is. We don't recommend that third parties depend upon it. Doing -so is likely to be frustrating because there are no guarantees of API or ABI -stability. - -Programs ship their own copies of BoringSSL when they use it and we update -everything as needed when deciding to make API changes. This allows us to -mostly avoid compromises in the name of compatibility. It works for us, but it -may not work for you. - -BoringSSL arose because Google used OpenSSL for many years in various ways and, -over time, built up a large number of patches that were maintained while -tracking upstream OpenSSL. As Google's product portfolio became more complex, -more copies of OpenSSL sprung up and the effort involved in maintaining all -these patches in multiple places was growing steadily. 
- -Currently BoringSSL is the SSL library in Chrome/Chromium, Android (but it's -not part of the NDK) and a number of other apps/programs. - -Project links: - - * [API documentation](https://commondatastorage.googleapis.com/chromium-boringssl-docs/headers.html) - * [Bug tracker](https://bugs.chromium.org/p/boringssl/issues/list) - * [CI](https://ci.chromium.org/p/boringssl/g/main/console) - * [Code review](https://boringssl-review.googlesource.com) - -There are other files in this directory which might be helpful: - - * [PORTING.md](/PORTING.md): how to port OpenSSL-using code to BoringSSL. - * [BUILDING.md](/BUILDING.md): how to build BoringSSL - * [INCORPORATING.md](/INCORPORATING.md): how to incorporate BoringSSL into a project. - * [API-CONVENTIONS.md](/API-CONVENTIONS.md): general API conventions for BoringSSL consumers and developers. - * [STYLE.md](/STYLE.md): rules and guidelines for coding style. - * include/openssl: public headers with API documentation in comments. Also [available online](https://commondatastorage.googleapis.com/chromium-boringssl-docs/headers.html). - * [FUZZING.md](/FUZZING.md): information about fuzzing BoringSSL. - * [CONTRIBUTING.md](/CONTRIBUTING.md): how to contribute to BoringSSL. - * [BREAKING-CHANGES.md](/BREAKING-CHANGES.md): notes on potentially-breaking changes. - * [SANDBOXING.md](/SANDBOXING.md): notes on using BoringSSL in a sandboxed environment. diff --git a/third_party/boringssl/src/crypto/aes/aes.cc b/third_party/boringssl/src/crypto/aes/aes.cc new file mode 100644 index 00000000..ac2c943d --- /dev/null +++ b/third_party/boringssl/src/crypto/aes/aes.cc @@ -0,0 +1,44 @@ +// Copyright 2025 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include "../fipsmodule/bcm_interface.h" + + +using namespace bssl; + +void AES_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) { + BCM_aes_encrypt(in, out, key); +} + +void AES_decrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) { + BCM_aes_decrypt(in, out, key); +} + +int AES_set_encrypt_key(const uint8_t *key, unsigned bits, AES_KEY *aeskey) { + if (bits != 128 && bits != 192 && bits != 256) { + return -2; + } + return bcm_success(BCM_aes_set_encrypt_key(key, bits, aeskey)) ? 0 : -1; +} + +int AES_set_decrypt_key(const uint8_t *key, unsigned bits, AES_KEY *aeskey) { + if (bits != 128 && bits != 192 && bits != 256) { + return -2; + } + return bcm_success(BCM_aes_set_decrypt_key(key, bits, aeskey)) ? 0 : -1; +} diff --git a/third_party/boringssl/src/crypto/armv8_feature_parsing.h b/third_party/boringssl/src/crypto/armv8_feature_parsing.h new file mode 100644 index 00000000..5ebb8764 --- /dev/null +++ b/third_party/boringssl/src/crypto/armv8_feature_parsing.h @@ -0,0 +1,67 @@ +#ifndef OPENSSL_CRYPTO_CPU_ARMV8_FEATURE_PARSING_H +#define OPENSSL_CRYPTO_CPU_ARMV8_FEATURE_PARSING_H + +#include +#include + +#include "internal.h" + +#if defined(OPENSSL_AARCH64) + +BSSL_NAMESPACE_BEGIN +namespace armcap { + +// Common field indices based on ARM architecture specification for +// ID_AA64ISAR0_EL1. These are indices (multiplied by 4 for the bit shift). 
+// Note: SHA3_FIELD_IDX 8 * 4 = 32 (ID_AA64ISAR0_EL1_SHA3_SHIFT value) +#define ID_AA64ISAR0_AES_FIELD_IDX 1 // Bits [7:4] +#define ID_AA64ISAR0_SHA1_FIELD_IDX 2 // Bits [11:8] +#define ID_AA64ISAR0_SHA2_FIELD_IDX 3 // Bits [15:12] +#define ID_AA64ISAR0_SHA3_FIELD_IDX 8 // Bits [35:32] +#define NBITS_ID_FIELD 4 + +// Helper function to extract a 4-bit field based on its index. +inline unsigned GetIDField(uint64_t reg, unsigned field_idx) { + // We mask with 0xf to ensure only the 4 relevant bits are returned. + return (reg >> (field_idx * NBITS_ID_FIELD)) & 0xf; +} + +// The core function that converts the raw ID_AA64ISAR0 register value +// into the OR'd capability flags (ARMV8_AES, ARMV8_SHA3, etc.). +inline uint32_t ParseISAR0Flags(uint64_t isar0) { + uint32_t armcap = 0; + // AES and PMULL check + unsigned aes = GetIDField(isar0, ID_AA64ISAR0_AES_FIELD_IDX); + if (aes > 0) { + armcap |= ARMV8_AES; + } + if (aes > 1) { + armcap |= ARMV8_PMULL; + } + // SHA1 check + unsigned sha1 = GetIDField(isar0, ID_AA64ISAR0_SHA1_FIELD_IDX); + if (sha1 > 0) { + armcap |= ARMV8_SHA1; + } + // SHA256 and SHA512 check + unsigned sha2 = GetIDField(isar0, ID_AA64ISAR0_SHA2_FIELD_IDX); + if (sha2 > 0) { + armcap |= ARMV8_SHA256; + } + if (sha2 > 1) { + armcap |= ARMV8_SHA512; + } + // SHA3 (EOR3) check + unsigned sha3 = GetIDField(isar0, ID_AA64ISAR0_SHA3_FIELD_IDX); + if (sha3 > 0) { + armcap |= ARMV8_SHA3; + } + return armcap; +} + +} // namespace armcap +BSSL_NAMESPACE_END + +#endif // OPENSSL_AARCH64 + +#endif // OPENSSL_CRYPTO_CPU_ARMV8_FEATURE_PARSING_H diff --git a/third_party/boringssl/src/crypto/asn1/a_bitstr.c b/third_party/boringssl/src/crypto/asn1/a_bitstr.c deleted file mode 100644 index 9c508577..00000000 --- a/third_party/boringssl/src/crypto/asn1/a_bitstr.c +++ /dev/null @@ -1,281 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). 
- * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. 
If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include -#include - -#include -#include - -#include "../internal.h" -#include "internal.h" - - -int ASN1_BIT_STRING_set(ASN1_BIT_STRING *x, const unsigned char *d, int len) { - return ASN1_STRING_set(x, d, len); -} - -int asn1_bit_string_length(const ASN1_BIT_STRING *str, - uint8_t *out_padding_bits) { - int len = str->length; - if (str->flags & ASN1_STRING_FLAG_BITS_LEFT) { - // If the string is already empty, it cannot have padding bits. - *out_padding_bits = len == 0 ? 0 : str->flags & 0x07; - return len; - } - - // TODO(https://crbug.com/boringssl/447): If we move this logic to - // |ASN1_BIT_STRING_set_bit|, can we remove this representation? 
- while (len > 0 && str->data[len - 1] == 0) { - len--; - } - uint8_t padding_bits = 0; - if (len > 0) { - uint8_t last = str->data[len - 1]; - assert(last != 0); - for (; padding_bits < 7; padding_bits++) { - if (last & (1 << padding_bits)) { - break; - } - } - } - *out_padding_bits = padding_bits; - return len; -} - -int ASN1_BIT_STRING_num_bytes(const ASN1_BIT_STRING *str, size_t *out) { - uint8_t padding_bits; - int len = asn1_bit_string_length(str, &padding_bits); - if (padding_bits != 0) { - return 0; - } - *out = len; - return 1; -} - -int i2c_ASN1_BIT_STRING(const ASN1_BIT_STRING *a, unsigned char **pp) { - if (a == NULL) { - return 0; - } - - uint8_t bits; - int len = asn1_bit_string_length(a, &bits); - int ret = 1 + len; - if (pp == NULL) { - return ret; - } - - uint8_t *p = *pp; - *(p++) = bits; - OPENSSL_memcpy(p, a->data, len); - if (len > 0) { - p[len - 1] &= (0xff << bits); - } - p += len; - *pp = p; - return ret; -} - -ASN1_BIT_STRING *c2i_ASN1_BIT_STRING(ASN1_BIT_STRING **a, - const unsigned char **pp, long len) { - ASN1_BIT_STRING *ret = NULL; - const unsigned char *p; - unsigned char *s; - int padding; - - if (len < 1) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_STRING_TOO_SHORT); - goto err; - } - - if (len > INT_MAX) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_STRING_TOO_LONG); - goto err; - } - - if ((a == NULL) || ((*a) == NULL)) { - if ((ret = ASN1_BIT_STRING_new()) == NULL) { - return NULL; - } - } else { - ret = (*a); - } - - p = *pp; - padding = *(p++); - len--; - if (padding > 7) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_BIT_STRING_BITS_LEFT); - goto err; - } - - // Unused bits in a BIT STRING must be zero. - uint8_t padding_mask = (1 << padding) - 1; - if (padding != 0 && (len < 1 || (p[len - 1] & padding_mask) != 0)) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_BIT_STRING_PADDING); - goto err; - } - - // We do this to preserve the settings. 
If we modify the settings, via - // the _set_bit function, we will recalculate on output - ret->flags &= ~(ASN1_STRING_FLAG_BITS_LEFT | 0x07); // clear - ret->flags |= (ASN1_STRING_FLAG_BITS_LEFT | padding); // set - - if (len > 0) { - s = OPENSSL_memdup(p, len); - if (s == NULL) { - OPENSSL_PUT_ERROR(ASN1, ERR_R_MALLOC_FAILURE); - goto err; - } - p += len; - } else { - s = NULL; - } - - ret->length = (int)len; - OPENSSL_free(ret->data); - ret->data = s; - ret->type = V_ASN1_BIT_STRING; - if (a != NULL) { - (*a) = ret; - } - *pp = p; - return ret; -err: - if ((ret != NULL) && ((a == NULL) || (*a != ret))) { - ASN1_BIT_STRING_free(ret); - } - return NULL; -} - -// These next 2 functions from Goetz Babin-Ebell -int ASN1_BIT_STRING_set_bit(ASN1_BIT_STRING *a, int n, int value) { - int w, v, iv; - unsigned char *c; - - w = n / 8; - v = 1 << (7 - (n & 0x07)); - iv = ~v; - if (!value) { - v = 0; - } - - if (a == NULL) { - return 0; - } - - a->flags &= ~(ASN1_STRING_FLAG_BITS_LEFT | 0x07); // clear, set on write - - if ((a->length < (w + 1)) || (a->data == NULL)) { - if (!value) { - return 1; // Don't need to set - } - if (a->data == NULL) { - c = (unsigned char *)OPENSSL_malloc(w + 1); - } else { - c = (unsigned char *)OPENSSL_realloc(a->data, w + 1); - } - if (c == NULL) { - OPENSSL_PUT_ERROR(ASN1, ERR_R_MALLOC_FAILURE); - return 0; - } - if (w + 1 - a->length > 0) { - OPENSSL_memset(c + a->length, 0, w + 1 - a->length); - } - a->data = c; - a->length = w + 1; - } - a->data[w] = ((a->data[w]) & iv) | v; - while ((a->length > 0) && (a->data[a->length - 1] == 0)) { - a->length--; - } - return 1; -} - -int ASN1_BIT_STRING_get_bit(const ASN1_BIT_STRING *a, int n) { - int w, v; - - w = n / 8; - v = 1 << (7 - (n & 0x07)); - if ((a == NULL) || (a->length < (w + 1)) || (a->data == NULL)) { - return 0; - } - return ((a->data[w] & v) != 0); -} - -// Checks if the given bit string contains only bits specified by -// the flags vector. 
Returns 0 if there is at least one bit set in 'a' -// which is not specified in 'flags', 1 otherwise. -// 'len' is the length of 'flags'. -int ASN1_BIT_STRING_check(const ASN1_BIT_STRING *a, const unsigned char *flags, - int flags_len) { - int i, ok; - // Check if there is one bit set at all. - if (!a || !a->data) { - return 1; - } - - // Check each byte of the internal representation of the bit string. - ok = 1; - for (i = 0; i < a->length && ok; ++i) { - unsigned char mask = i < flags_len ? ~flags[i] : 0xff; - // We are done if there is an unneeded bit set. - ok = (a->data[i] & mask) == 0; - } - return ok; -} diff --git a/third_party/boringssl/src/crypto/asn1/a_bitstr.cc b/third_party/boringssl/src/crypto/asn1/a_bitstr.cc new file mode 100644 index 00000000..a071589e --- /dev/null +++ b/third_party/boringssl/src/crypto/asn1/a_bitstr.cc @@ -0,0 +1,297 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include "../internal.h" +#include "internal.h" + + +using namespace bssl; + +static void set_unused_bits(ASN1_BIT_STRING *str, uint8_t unused_bits) { + assert(unused_bits < 8); + assert(unused_bits == 0 || str->length > 0); + // |ASN1_STRING_FLAG_BITS_LEFT| and the bottom 3 bits encode |padding|. 
+ str->flags &= ~0x07; + str->flags |= ASN1_STRING_FLAG_BITS_LEFT | unused_bits; +} + +int ASN1_BIT_STRING_set(ASN1_BIT_STRING *str, const uint8_t *data, + ossl_ssize_t len) { + return ASN1_STRING_set(str, data, len); +} + +int ASN1_BIT_STRING_set1(ASN1_BIT_STRING *str, const uint8_t *data, + size_t length, int unused_bits) { + if (unused_bits < 0 || unused_bits > 7) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_BIT_STRING_BITS_LEFT); + return 0; + } + const uint8_t unused_bits_mask = (1 << unused_bits) - 1; + if ((length > 0 && (data[length - 1] & unused_bits_mask) != 0) || + (length == 0 && unused_bits != 0)) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_BIT_STRING_BITS_LEFT); + return 0; + } + if (!ASN1_STRING_set(str, data, length)) { + return 0; + } + str->type = V_ASN1_BIT_STRING; + set_unused_bits(str, unused_bits); + return 1; +} + +uint8_t ASN1_BIT_STRING_unused_bits(const ASN1_BIT_STRING *str) { + // If the string is already empty, it cannot have padding bits. + return str->length == 0 ? 0 : str->flags & 0x07; +} + +int ASN1_BIT_STRING_num_bytes(const ASN1_BIT_STRING *str, size_t *out) { + if (ASN1_BIT_STRING_unused_bits(str) != 0) { + return 0; + } + *out = str->length; + return 1; +} + +int i2c_ASN1_BIT_STRING(const ASN1_BIT_STRING *a, unsigned char **pp) { + if (a == nullptr) { + return 0; + } + + uint8_t bits = ASN1_BIT_STRING_unused_bits(a); + int len = ASN1_STRING_length(a); + if (len > INT_MAX - 1) { + OPENSSL_PUT_ERROR(ASN1, ERR_R_OVERFLOW); + return 0; + } + int ret = 1 + len; + if (pp == nullptr) { + return ret; + } + + uint8_t *p = *pp; + *(p++) = bits; + OPENSSL_memcpy(p, a->data, len); + if (len > 0) { + p[len - 1] &= (0xff << bits); + } + p += len; + *pp = p; + return ret; +} + +int bssl::asn1_marshal_bit_string(CBB *out, const ASN1_BIT_STRING *in, + CBS_ASN1_TAG tag) { + int len = i2c_ASN1_BIT_STRING(in, nullptr); + if (len <= 0) { + return 0; + } + tag = tag == 0 ? 
CBS_ASN1_BITSTRING : tag; + CBB child; + uint8_t *ptr; + return CBB_add_asn1(out, &child, tag) && // + CBB_add_space(&child, &ptr, static_cast(len)) && // + i2c_ASN1_BIT_STRING(in, &ptr) == len && // + CBB_flush(out); +} + +static int asn1_parse_bit_string_contents(Span in, + ASN1_BIT_STRING *out) { + CBS cbs = in; + uint8_t padding; + if (!CBS_get_u8(&cbs, &padding)) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_STRING_TOO_SHORT); + return 0; + } + + if (padding > 7) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_BIT_STRING_BITS_LEFT); + return 0; + } + + // Unused bits in a BIT STRING must be zero. + uint8_t padding_mask = (1 << padding) - 1; + if (padding != 0) { + CBS copy = cbs; + uint8_t last; + if (!CBS_get_last_u8(©, &last) || (last & padding_mask) != 0) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_BIT_STRING_PADDING); + return 0; + } + } + + return ASN1_BIT_STRING_set1(out, CBS_data(&cbs), CBS_len(&cbs), padding); +} + +ASN1_BIT_STRING *c2i_ASN1_BIT_STRING(ASN1_BIT_STRING **a, + const unsigned char **pp, long len) { + if (len < 0) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_STRING_TOO_SHORT); + return nullptr; + } + + ASN1_BIT_STRING *ret = nullptr; + if (a == nullptr || *a == nullptr) { + if ((ret = ASN1_BIT_STRING_new()) == nullptr) { + return nullptr; + } + } else { + ret = *a; + } + + if (!asn1_parse_bit_string_contents(Span(*pp, len), ret)) { + if (ret != nullptr && (a == nullptr || *a != ret)) { + ASN1_BIT_STRING_free(ret); + } + return nullptr; + } + + if (a != nullptr) { + *a = ret; + } + *pp += len; + return ret; +} + +int bssl::asn1_parse_bit_string(CBS *cbs, ASN1_BIT_STRING *out, + CBS_ASN1_TAG tag) { + tag = tag == 0 ? 
CBS_ASN1_BITSTRING : tag; + CBS child; + if (!CBS_get_asn1(cbs, &child, tag)) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_DECODE_ERROR); + return 0; + } + return asn1_parse_bit_string_contents(child, out); +} + +int bssl::asn1_parse_bit_string_with_bad_length(CBS *cbs, + ASN1_BIT_STRING *out) { + CBS child; + CBS_ASN1_TAG tag; + size_t header_len; + int indefinite; + if (!CBS_get_any_ber_asn1_element(cbs, &child, &tag, &header_len, + /*out_ber_found=*/nullptr, + &indefinite) || + tag != CBS_ASN1_BITSTRING || indefinite || // + !CBS_skip(&child, header_len)) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_DECODE_ERROR); + return 0; + } + return asn1_parse_bit_string_contents(child, out); +} + +static void trim_trailing_zeros(ASN1_BIT_STRING *a) { + while (a->length > 0 && a->data[a->length - 1] == 0) { + a->length--; + } + uint8_t padding_bits = 0; + if (a->length > 0) { + uint8_t last = a->data[a->length - 1]; + assert(last != 0); + for (; padding_bits < 7; padding_bits++) { + if (last & (1 << padding_bits)) { + break; + } + } + } + set_unused_bits(a, padding_bits); +} + +// These next 2 functions from Goetz Babin-Ebell +int ASN1_BIT_STRING_set_bit(ASN1_BIT_STRING *a, int n, int value) { + if (a == nullptr) { + return 0; + } + + if (n < 0) { + OPENSSL_PUT_ERROR(ASN1, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); + return 0; + } + + int w = n / 8; + int v = 1 << (7 - (n & 0x07)); + int iv = ~v; + if (!value) { + v = 0; + } + + if ((a->length < (w + 1)) || (a->data == nullptr)) { + if (!value) { + trim_trailing_zeros(a); + return 1; // Don't need to set + } + unsigned char *c; + if (a->data == nullptr) { + c = (unsigned char *)OPENSSL_malloc(w + 1); + } else { + c = (unsigned char *)OPENSSL_realloc(a->data, w + 1); + } + if (c == nullptr) { + return 0; + } + if (w + 1 - a->length > 0) { + OPENSSL_memset(c + a->length, 0, w + 1 - a->length); + } + a->data = c; + a->length = w + 1; + } + a->data[w] = (a->data[w] & iv) | v; + trim_trailing_zeros(a); + return 1; +} + +int ASN1_BIT_STRING_get_bit(const 
ASN1_BIT_STRING *a, int n) { + int w, v; + + w = n / 8; + v = 1 << (7 - (n & 0x07)); + if ((a == nullptr) || (a->length < (w + 1)) || (a->data == nullptr)) { + return 0; + } + return ((a->data[w] & v) != 0); +} + +// Checks if the given bit string contains only bits specified by +// the flags vector. Returns 0 if there is at least one bit set in 'a' +// which is not specified in 'flags', 1 otherwise. +// 'len' is the length of 'flags'. +int ASN1_BIT_STRING_check(const ASN1_BIT_STRING *a, const unsigned char *flags, + int flags_len) { + int i, ok; + // Check if there is one bit set at all. + if (!a || !a->data) { + return 1; + } + + // Check each byte of the internal representation of the bit string. + ok = 1; + for (i = 0; i < a->length && ok; ++i) { + unsigned char mask = i < flags_len ? ~flags[i] : 0xff; + // We are done if there is an unneeded bit set. + ok = (a->data[i] & mask) == 0; + } + return ok; +} diff --git a/third_party/boringssl/src/crypto/asn1/a_bool.c b/third_party/boringssl/src/crypto/asn1/a_bool.c deleted file mode 100644 index 2a4448cc..00000000 --- a/third_party/boringssl/src/crypto/asn1/a_bool.c +++ /dev/null @@ -1,120 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. 
- * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include -#include - -int i2d_ASN1_BOOLEAN(ASN1_BOOLEAN a, unsigned char **pp) { - int r; - unsigned char *p, *allocated = NULL; - - r = ASN1_object_size(0, 1, V_ASN1_BOOLEAN); - if (pp == NULL) { - return r; - } - - if (*pp == NULL) { - if ((p = allocated = OPENSSL_malloc(r)) == NULL) { - OPENSSL_PUT_ERROR(ASN1, ERR_R_MALLOC_FAILURE); - return -1; - } - } else { - p = *pp; - } - - ASN1_put_object(&p, 0, 1, V_ASN1_BOOLEAN, V_ASN1_UNIVERSAL); - *p = a ? 0xff : 0x00; - - // If a new buffer was allocated, just return it back. - // If not, return the incremented buffer pointer. - *pp = allocated != NULL ? 
allocated : p + 1; - return r; -} - -ASN1_BOOLEAN d2i_ASN1_BOOLEAN(ASN1_BOOLEAN *a, const unsigned char **pp, - long length) { - const unsigned char *p = *pp; - long len; - int inf, tag, xclass; - inf = ASN1_get_object(&p, &len, &tag, &xclass, length); - if (inf & 0x80) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_BAD_OBJECT_HEADER); - return -1; - } - - if (inf & V_ASN1_CONSTRUCTED) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_TYPE_NOT_PRIMITIVE); - return -1; - } - - if (tag != V_ASN1_BOOLEAN || xclass != V_ASN1_UNIVERSAL) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_EXPECTING_A_BOOLEAN); - return -1; - } - - if (len != 1) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_BOOLEAN_IS_WRONG_LENGTH); - return -1; - } - ASN1_BOOLEAN ret = (ASN1_BOOLEAN) * (p++); - if (a != NULL) { - (*a) = ret; - } - *pp = p; - return ret; -} diff --git a/third_party/boringssl/src/crypto/asn1/a_bool.cc b/third_party/boringssl/src/crypto/asn1/a_bool.cc new file mode 100644 index 00000000..92946884 --- /dev/null +++ b/third_party/boringssl/src/crypto/asn1/a_bool.cc @@ -0,0 +1,50 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include + +#include "../bytestring/internal.h" + + +int i2d_ASN1_BOOLEAN(ASN1_BOOLEAN a, unsigned char **outp) { + return bssl::I2DFromCBB( + /*initial_capacity=*/3, outp, [&](CBB *cbb) -> bool { + return CBB_add_asn1_bool(cbb, a != ASN1_BOOLEAN_FALSE); + }); +} + +ASN1_BOOLEAN d2i_ASN1_BOOLEAN(ASN1_BOOLEAN *out, const unsigned char **inp, + long len) { + if (len < 0) { + return ASN1_BOOLEAN_NONE; + } + + CBS cbs; + CBS_init(&cbs, *inp, (size_t)len); + int val; + if (!CBS_get_asn1_bool(&cbs, &val)) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_DECODE_ERROR); + return ASN1_BOOLEAN_NONE; + } + + ASN1_BOOLEAN ret = val ? ASN1_BOOLEAN_TRUE : ASN1_BOOLEAN_FALSE; + if (out != nullptr) { + *out = ret; + } + *inp = CBS_data(&cbs); + return ret; +} diff --git a/third_party/boringssl/src/crypto/asn1/a_d2i_fp.c b/third_party/boringssl/src/crypto/asn1/a_d2i_fp.c deleted file mode 100644 index 36c9d699..00000000 --- a/third_party/boringssl/src/crypto/asn1/a_d2i_fp.c +++ /dev/null @@ -1,89 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. 
- * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include - -#include -#include -#include - - -void *ASN1_item_d2i_bio(const ASN1_ITEM *it, BIO *in, void *x) { - uint8_t *data; - size_t len; - // Historically, this function did not impose a limit in OpenSSL and is used - // to read CRLs, so we leave this without an external bound. - if (!BIO_read_asn1(in, &data, &len, INT_MAX)) { - return NULL; - } - const uint8_t *ptr = data; - void *ret = ASN1_item_d2i(x, &ptr, len, it); - OPENSSL_free(data); - return ret; -} - -void *ASN1_item_d2i_fp(const ASN1_ITEM *it, FILE *in, void *x) { - BIO *b = BIO_new_fp(in, BIO_NOCLOSE); - if (b == NULL) { - OPENSSL_PUT_ERROR(ASN1, ERR_R_BUF_LIB); - return NULL; - } - void *ret = ASN1_item_d2i_bio(it, b, x); - BIO_free(b); - return ret; -} diff --git a/third_party/boringssl/src/crypto/asn1/a_d2i_fp.cc b/third_party/boringssl/src/crypto/asn1/a_d2i_fp.cc new file mode 100644 index 00000000..36eee341 --- /dev/null +++ b/third_party/boringssl/src/crypto/asn1/a_d2i_fp.cc @@ -0,0 +1,47 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include + +#include +#include +#include + + +void *ASN1_item_d2i_bio(const ASN1_ITEM *it, BIO *in, void *x) { + uint8_t *data; + size_t len; + // Historically, this function did not impose a limit in OpenSSL and is used + // to read CRLs, so we leave this without an external bound. + if (!BIO_read_asn1(in, &data, &len, INT_MAX)) { + return nullptr; + } + const uint8_t *ptr = data; + void *ret = ASN1_item_d2i(reinterpret_cast(x), &ptr, len, it); + OPENSSL_free(data); + return ret; +} + +void *ASN1_item_d2i_fp(const ASN1_ITEM *it, FILE *in, void *x) { + BIO *b = BIO_new_fp(in, BIO_NOCLOSE); + if (b == nullptr) { + OPENSSL_PUT_ERROR(ASN1, ERR_R_BUF_LIB); + return nullptr; + } + void *ret = ASN1_item_d2i_bio(it, b, x); + BIO_free(b); + return ret; +} diff --git a/third_party/boringssl/src/crypto/asn1/a_dup.c b/third_party/boringssl/src/crypto/asn1/a_dup.c deleted file mode 100644 index 8ee0c7eb..00000000 --- a/third_party/boringssl/src/crypto/asn1/a_dup.c +++ /dev/null @@ -1,85 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. 
- * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include -#include - -// ASN1_ITEM version of dup: this follows the model above except we don't -// need to allocate the buffer. At some point this could be rewritten to -// directly dup the underlying structure instead of doing and encode and -// decode. -void *ASN1_item_dup(const ASN1_ITEM *it, void *x) { - unsigned char *b = NULL; - const unsigned char *p; - long i; - void *ret; - - if (x == NULL) { - return NULL; - } - - i = ASN1_item_i2d(x, &b, it); - if (b == NULL) { - OPENSSL_PUT_ERROR(ASN1, ERR_R_MALLOC_FAILURE); - return NULL; - } - p = b; - ret = ASN1_item_d2i(NULL, &p, i, it); - OPENSSL_free(b); - return ret; -} diff --git a/third_party/boringssl/src/crypto/asn1/a_dup.cc b/third_party/boringssl/src/crypto/asn1/a_dup.cc new file mode 100644 index 00000000..b98a0e8d --- /dev/null +++ b/third_party/boringssl/src/crypto/asn1/a_dup.cc @@ -0,0 +1,42 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + +// ASN1_ITEM version of dup: this follows the model above except we don't +// need to allocate the buffer. 
At some point this could be rewritten to +// directly dup the underlying structure instead of doing and encode and +// decode. +void *ASN1_item_dup(const ASN1_ITEM *it, void *x) { + unsigned char *b = nullptr; + const unsigned char *p; + long i; + void *ret; + + if (x == nullptr) { + return nullptr; + } + + i = ASN1_item_i2d(reinterpret_cast(x), &b, it); + if (b == nullptr) { + return nullptr; + } + p = b; + ret = ASN1_item_d2i(nullptr, &p, i, it); + OPENSSL_free(b); + return ret; +} diff --git a/third_party/boringssl/src/crypto/asn1/a_gentm.c b/third_party/boringssl/src/crypto/asn1/a_gentm.c deleted file mode 100644 index 283ff7d3..00000000 --- a/third_party/boringssl/src/crypto/asn1/a_gentm.c +++ /dev/null @@ -1,148 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. 
Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
*/ - -#include -#include -#include -#include - -#include -#include - -#include "internal.h" - -int asn1_generalizedtime_to_tm(struct tm *tm, const ASN1_GENERALIZEDTIME *d) { - if (d->type != V_ASN1_GENERALIZEDTIME) { - return 0; - } - CBS cbs; - CBS_init(&cbs, d->data, (size_t)d->length); - if (!CBS_parse_generalized_time(&cbs, tm, /*allow_timezone_offset=*/0)) { - return 0; - } - return 1; -} - -int ASN1_GENERALIZEDTIME_check(const ASN1_GENERALIZEDTIME *d) { - return asn1_generalizedtime_to_tm(NULL, d); -} - -int ASN1_GENERALIZEDTIME_set_string(ASN1_GENERALIZEDTIME *s, const char *str) { - ASN1_GENERALIZEDTIME t; - - t.type = V_ASN1_GENERALIZEDTIME; - t.length = strlen(str); - t.data = (unsigned char *)str; - if (ASN1_GENERALIZEDTIME_check(&t)) { - if (s != NULL) { - if (!ASN1_STRING_set((ASN1_STRING *)s, (unsigned char *)str, t.length)) { - return 0; - } - s->type = V_ASN1_GENERALIZEDTIME; - } - return 1; - } else { - return 0; - } -} - -ASN1_GENERALIZEDTIME *ASN1_GENERALIZEDTIME_set(ASN1_GENERALIZEDTIME *s, - time_t t) { - return ASN1_GENERALIZEDTIME_adj(s, t, 0, 0); -} - -ASN1_GENERALIZEDTIME *ASN1_GENERALIZEDTIME_adj(ASN1_GENERALIZEDTIME *s, - time_t t, int offset_day, - long offset_sec) { - struct tm data; - if (!OPENSSL_gmtime(&t, &data)) { - return NULL; - } - - if (offset_day || offset_sec) { - if (!OPENSSL_gmtime_adj(&data, offset_day, offset_sec)) { - return NULL; - } - } - - if (data.tm_year < 0 - 1900 || data.tm_year > 9999 - 1900) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_ILLEGAL_TIME_VALUE); - return NULL; - } - - char buf[16]; - BIO_snprintf(buf, sizeof(buf), "%04d%02d%02d%02d%02d%02dZ", - data.tm_year + 1900, data.tm_mon + 1, data.tm_mday, data.tm_hour, - data.tm_min, data.tm_sec); - - int free_s = 0; - if (s == NULL) { - free_s = 1; - s = ASN1_UTCTIME_new(); - if (s == NULL) { - return NULL; - } - } - - if (!ASN1_STRING_set(s, buf, strlen(buf))) { - if (free_s) { - ASN1_UTCTIME_free(s); - } - return NULL; - } - s->type = V_ASN1_GENERALIZEDTIME; - return 
s; -} diff --git a/third_party/boringssl/src/crypto/asn1/a_gentm.cc b/third_party/boringssl/src/crypto/asn1/a_gentm.cc new file mode 100644 index 00000000..1dd22be0 --- /dev/null +++ b/third_party/boringssl/src/crypto/asn1/a_gentm.cc @@ -0,0 +1,130 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "internal.h" + + +using namespace bssl; + +int bssl::asn1_generalizedtime_to_tm(struct tm *tm, + const ASN1_GENERALIZEDTIME *d) { + if (d->type != V_ASN1_GENERALIZEDTIME) { + return 0; + } + CBS cbs; + CBS_init(&cbs, d->data, (size_t)d->length); + if (!CBS_parse_generalized_time(&cbs, tm, /*allow_timezone_offset=*/0)) { + return 0; + } + return 1; +} + +int bssl::asn1_parse_generalized_time(CBS *cbs, ASN1_GENERALIZEDTIME *out, + CBS_ASN1_TAG tag) { + tag = tag == 0 ? 
CBS_ASN1_GENERALIZEDTIME : tag; + CBS child; + if (!CBS_get_asn1(cbs, &child, tag) || + !CBS_parse_generalized_time(&child, nullptr, + /*allow_timezone_offset=*/0)) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_DECODE_ERROR); + return 0; + } + if (!ASN1_STRING_set(out, CBS_data(&child), CBS_len(&child))) { + return 0; + } + out->type = V_ASN1_GENERALIZEDTIME; + return 1; +} + +int ASN1_GENERALIZEDTIME_check(const ASN1_GENERALIZEDTIME *d) { + return asn1_generalizedtime_to_tm(nullptr, d); +} + +int ASN1_GENERALIZEDTIME_set_string(ASN1_GENERALIZEDTIME *s, const char *str) { + size_t len = strlen(str); + CBS cbs; + CBS_init(&cbs, (const uint8_t *)str, len); + if (!CBS_parse_generalized_time(&cbs, /*out_tm=*/nullptr, + /*allow_timezone_offset=*/0)) { + return 0; + } + if (s != nullptr) { + if (!ASN1_STRING_set(s, str, len)) { + return 0; + } + s->type = V_ASN1_GENERALIZEDTIME; + } + return 1; +} + +ASN1_GENERALIZEDTIME *ASN1_GENERALIZEDTIME_set(ASN1_GENERALIZEDTIME *s, + int64_t posix_time) { + return ASN1_GENERALIZEDTIME_adj(s, posix_time, 0, 0); +} + +ASN1_GENERALIZEDTIME *ASN1_GENERALIZEDTIME_adj(ASN1_GENERALIZEDTIME *s, + int64_t posix_time, + int offset_day, + long offset_sec) { + struct tm data; + if (!OPENSSL_posix_to_tm(posix_time, &data)) { + return nullptr; + } + + if (offset_day || offset_sec) { + if (!OPENSSL_gmtime_adj(&data, offset_day, offset_sec)) { + return nullptr; + } + } + + if (data.tm_year < 0 - 1900 || data.tm_year > 9999 - 1900) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_ILLEGAL_TIME_VALUE); + return nullptr; + } + + char buf[16]; + int ret = snprintf(buf, sizeof(buf), "%04d%02d%02d%02d%02d%02dZ", + data.tm_year + 1900, data.tm_mon + 1, data.tm_mday, + data.tm_hour, data.tm_min, data.tm_sec); + // |snprintf| must write exactly 15 bytes (plus the NUL) to the buffer. 
+ BSSL_CHECK(ret == static_cast(sizeof(buf) - 1)); + + int free_s = 0; + if (s == nullptr) { + free_s = 1; + s = ASN1_UTCTIME_new(); + if (s == nullptr) { + return nullptr; + } + } + + if (!ASN1_STRING_set(s, buf, strlen(buf))) { + if (free_s) { + ASN1_UTCTIME_free(s); + } + return nullptr; + } + s->type = V_ASN1_GENERALIZEDTIME; + return s; +} diff --git a/third_party/boringssl/src/crypto/asn1/a_i2d_fp.c b/third_party/boringssl/src/crypto/asn1/a_i2d_fp.c deleted file mode 100644 index 4a14f2bc..00000000 --- a/third_party/boringssl/src/crypto/asn1/a_i2d_fp.c +++ /dev/null @@ -1,86 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
*/ - -#include - -#include -#include -#include - - -int ASN1_item_i2d_fp(const ASN1_ITEM *it, FILE *out, void *x) { - BIO *b = BIO_new_fp(out, BIO_NOCLOSE); - if (b == NULL) { - OPENSSL_PUT_ERROR(ASN1, ERR_R_BUF_LIB); - return 0; - } - int ret = ASN1_item_i2d_bio(it, b, x); - BIO_free(b); - return ret; -} - -int ASN1_item_i2d_bio(const ASN1_ITEM *it, BIO *out, void *x) { - unsigned char *b = NULL; - int n = ASN1_item_i2d(x, &b, it); - if (b == NULL) { - OPENSSL_PUT_ERROR(ASN1, ERR_R_MALLOC_FAILURE); - return 0; - } - - int ret = BIO_write_all(out, b, n); - OPENSSL_free(b); - return ret; -} diff --git a/third_party/boringssl/src/crypto/asn1/a_i2d_fp.cc b/third_party/boringssl/src/crypto/asn1/a_i2d_fp.cc new file mode 100644 index 00000000..30385cf5 --- /dev/null +++ b/third_party/boringssl/src/crypto/asn1/a_i2d_fp.cc @@ -0,0 +1,44 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include +#include + + +int ASN1_item_i2d_fp(const ASN1_ITEM *it, FILE *out, const void *x) { + BIO *b = BIO_new_fp(out, BIO_NOCLOSE); + if (b == nullptr) { + OPENSSL_PUT_ERROR(ASN1, ERR_R_BUF_LIB); + return 0; + } + int ret = ASN1_item_i2d_bio(it, b, x); + BIO_free(b); + return ret; +} + +int ASN1_item_i2d_bio(const ASN1_ITEM *it, BIO *out, const void *x) { + unsigned char *b = nullptr; + int n = ASN1_item_i2d(reinterpret_cast(const_cast(x)), + &b, it); + if (b == nullptr) { + return 0; + } + + int ret = BIO_write_all(out, b, n); + OPENSSL_free(b); + return ret; +} diff --git a/third_party/boringssl/src/crypto/asn1/a_int.c b/third_party/boringssl/src/crypto/asn1/a_int.c deleted file mode 100644 index afc88d23..00000000 --- a/third_party/boringssl/src/crypto/asn1/a_int.c +++ /dev/null @@ -1,438 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. 
this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include -#include -#include - -#include -#include -#include - -#include "../internal.h" - - -ASN1_INTEGER *ASN1_INTEGER_dup(const ASN1_INTEGER *x) { - return ASN1_STRING_dup(x); -} - -int ASN1_INTEGER_cmp(const ASN1_INTEGER *x, const ASN1_INTEGER *y) { - // Compare signs. - int neg = x->type & V_ASN1_NEG; - if (neg != (y->type & V_ASN1_NEG)) { - return neg ? -1 : 1; - } - - int ret = ASN1_STRING_cmp(x, y); - if (neg) { - // This could be |-ret|, but |ASN1_STRING_cmp| is not forbidden from - // returning |INT_MIN|. - if (ret < 0) { - return 1; - } else if (ret > 0) { - return -1; - } else { - return 0; - } - } - - return ret; -} - -// negate_twos_complement negates |len| bytes from |buf| in-place, interpreted -// as a signed, big-endian two's complement value. -static void negate_twos_complement(uint8_t *buf, size_t len) { - uint8_t borrow = 0; - for (size_t i = len - 1; i < len; i--) { - uint8_t t = buf[i]; - buf[i] = 0u - borrow - t; - borrow |= t != 0; - } -} - -static int is_all_zeros(const uint8_t *in, size_t len) { - for (size_t i = 0; i < len; i++) { - if (in[i] != 0) { - return 0; - } - } - return 1; -} - -int i2c_ASN1_INTEGER(const ASN1_INTEGER *in, unsigned char **outp) { - if (in == NULL) { - return 0; - } - - // |ASN1_INTEGER|s should be represented minimally, but it is possible to - // construct invalid ones. Skip leading zeros so this does not produce an - // invalid encoding or break invariants. - int start = 0; - while (start < in->length && in->data[start] == 0) { - start++; - } - - int is_negative = (in->type & V_ASN1_NEG) != 0; - int pad; - if (start >= in->length) { - // Zero is represented as a single byte. - is_negative = 0; - pad = 1; - } else if (is_negative) { - // 0x80...01 through 0xff...ff have a two's complement of 0x7f...ff - // through 0x00...01 and need an extra byte to be negative. 
- // 0x01...00 through 0x80...00 have a two's complement of 0xfe...ff - // through 0x80...00 and can be negated as-is. - pad = in->data[start] > 0x80 || - (in->data[start] == 0x80 && - !is_all_zeros(in->data + start + 1, in->length - start - 1)); - } else { - // If the high bit is set, the signed representation needs an extra - // byte to be positive. - pad = (in->data[start] & 0x80) != 0; - } - - if (in->length - start > INT_MAX - pad) { - OPENSSL_PUT_ERROR(ASN1, ERR_R_OVERFLOW); - return 0; - } - int len = pad + in->length - start; - assert(len > 0); - if (outp == NULL) { - return len; - } - - if (pad) { - (*outp)[0] = 0; - } - OPENSSL_memcpy(*outp + pad, in->data + start, in->length - start); - if (is_negative) { - negate_twos_complement(*outp, len); - assert((*outp)[0] >= 0x80); - } else { - assert((*outp)[0] < 0x80); - } - *outp += len; - return len; -} - -ASN1_INTEGER *c2i_ASN1_INTEGER(ASN1_INTEGER **out, const unsigned char **inp, - long len) { - // This function can handle lengths up to INT_MAX - 1, but the rest of the - // legacy ASN.1 code mixes integer types, so avoid exposing it to - // ASN1_INTEGERS with larger lengths. - if (len < 0 || len > INT_MAX / 2) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_TOO_LONG); - return NULL; - } - - CBS cbs; - CBS_init(&cbs, *inp, (size_t)len); - int is_negative; - if (!CBS_is_valid_asn1_integer(&cbs, &is_negative)) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_INTEGER); - return NULL; - } - - ASN1_INTEGER *ret = NULL; - if (out == NULL || *out == NULL) { - ret = ASN1_INTEGER_new(); - if (ret == NULL) { - return NULL; - } - } else { - ret = *out; - } - - // Convert to |ASN1_INTEGER|'s sign-and-magnitude representation. First, - // determine the size needed for a minimal result. - if (is_negative) { - // 0xff00...01 through 0xff7f..ff have a two's complement of 0x00ff...ff - // through 0x000100...001 and need one leading zero removed. 
0x8000...00 - // through 0xff00...00 have a two's complement of 0x8000...00 through - // 0x0100...00 and will be minimally-encoded as-is. - if (CBS_len(&cbs) > 0 && CBS_data(&cbs)[0] == 0xff && - !is_all_zeros(CBS_data(&cbs) + 1, CBS_len(&cbs) - 1)) { - CBS_skip(&cbs, 1); - } - } else { - // Remove the leading zero byte, if any. - if (CBS_len(&cbs) > 0 && CBS_data(&cbs)[0] == 0x00) { - CBS_skip(&cbs, 1); - } - } - - if (!ASN1_STRING_set(ret, CBS_data(&cbs), CBS_len(&cbs))) { - goto err; - } - - if (is_negative) { - ret->type = V_ASN1_NEG_INTEGER; - negate_twos_complement(ret->data, ret->length); - } else { - ret->type = V_ASN1_INTEGER; - } - - // The value should be minimally-encoded. - assert(ret->length == 0 || ret->data[0] != 0); - // Zero is not negative. - assert(!is_negative || ret->length > 0); - - *inp += len; - if (out != NULL) { - *out = ret; - } - return ret; - -err: - if (ret != NULL && (out == NULL || *out != ret)) { - ASN1_INTEGER_free(ret); - } - return NULL; -} - -int ASN1_INTEGER_set(ASN1_INTEGER *a, long v) { - if (v >= 0) { - return ASN1_INTEGER_set_uint64(a, (uint64_t)v); - } - - if (!ASN1_INTEGER_set_uint64(a, 0 - (uint64_t)v)) { - return 0; - } - - a->type = V_ASN1_NEG_INTEGER; - return 1; -} - -int ASN1_ENUMERATED_set(ASN1_ENUMERATED *a, long v) { - if (v >= 0) { - return ASN1_ENUMERATED_set_uint64(a, (uint64_t)v); - } - - if (!ASN1_ENUMERATED_set_uint64(a, 0 - (uint64_t)v)) { - return 0; - } - - a->type = V_ASN1_NEG_ENUMERATED; - return 1; -} - -static int asn1_string_set_uint64(ASN1_STRING *out, uint64_t v, int type) { - uint8_t buf[sizeof(uint64_t)]; - CRYPTO_store_u64_be(buf, v); - size_t leading_zeros; - for (leading_zeros = 0; leading_zeros < sizeof(buf); leading_zeros++) { - if (buf[leading_zeros] != 0) { - break; - } - } - - if (!ASN1_STRING_set(out, buf + leading_zeros, sizeof(buf) - leading_zeros)) { - return 0; - } - out->type = type; - return 1; -} - -int ASN1_INTEGER_set_uint64(ASN1_INTEGER *out, uint64_t v) { - return 
asn1_string_set_uint64(out, v, V_ASN1_INTEGER); -} - -int ASN1_ENUMERATED_set_uint64(ASN1_ENUMERATED *out, uint64_t v) { - return asn1_string_set_uint64(out, v, V_ASN1_ENUMERATED); -} - -static int asn1_string_get_abs_uint64(uint64_t *out, const ASN1_STRING *a, - int type) { - if ((a->type & ~V_ASN1_NEG) != type) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_WRONG_INTEGER_TYPE); - return 0; - } - uint8_t buf[sizeof(uint64_t)] = {0}; - if (a->length > (int)sizeof(buf)) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_INTEGER); - return 0; - } - OPENSSL_memcpy(buf + sizeof(buf) - a->length, a->data, a->length); - *out = CRYPTO_load_u64_be(buf); - return 1; -} - -static int asn1_string_get_uint64(uint64_t *out, const ASN1_STRING *a, - int type) { - if (!asn1_string_get_abs_uint64(out, a, type)) { - return 0; - } - if (a->type & V_ASN1_NEG) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_INTEGER); - return 0; - } - return 1; -} - -int ASN1_INTEGER_get_uint64(uint64_t *out, const ASN1_INTEGER *a) { - return asn1_string_get_uint64(out, a, V_ASN1_INTEGER); -} - -int ASN1_ENUMERATED_get_uint64(uint64_t *out, const ASN1_ENUMERATED *a) { - return asn1_string_get_uint64(out, a, V_ASN1_ENUMERATED); -} - -static long asn1_string_get_long(const ASN1_STRING *a, int type) { - if (a == NULL) { - return 0; - } - - uint64_t v; - if (!asn1_string_get_abs_uint64(&v, a, type)) { - goto err; - } - - int64_t i64; - int fits_in_i64; - // Check |v != 0| to handle manually-constructed negative zeros. - if ((a->type & V_ASN1_NEG) && v != 0) { - i64 = (int64_t)(0u - v); - fits_in_i64 = i64 < 0; - } else { - i64 = (int64_t)v; - fits_in_i64 = i64 >= 0; - } - static_assert(sizeof(long) <= sizeof(int64_t), "long is too big"); - - if (fits_in_i64 && LONG_MIN <= i64 && i64 <= LONG_MAX) { - return (long)i64; - } - -err: - // This function's return value does not distinguish overflow from -1. 
- ERR_clear_error(); - return -1; -} - -long ASN1_INTEGER_get(const ASN1_INTEGER *a) { - return asn1_string_get_long(a, V_ASN1_INTEGER); -} - -long ASN1_ENUMERATED_get(const ASN1_ENUMERATED *a) { - return asn1_string_get_long(a, V_ASN1_ENUMERATED); -} - -static ASN1_STRING *bn_to_asn1_string(const BIGNUM *bn, ASN1_STRING *ai, - int type) { - ASN1_INTEGER *ret; - if (ai == NULL) { - ret = ASN1_STRING_type_new(type); - } else { - ret = ai; - } - if (ret == NULL) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_NESTED_ASN1_ERROR); - goto err; - } - - if (BN_is_negative(bn) && !BN_is_zero(bn)) { - ret->type = type | V_ASN1_NEG; - } else { - ret->type = type; - } - - int len = BN_num_bytes(bn); - if (!ASN1_STRING_set(ret, NULL, len) || - !BN_bn2bin_padded(ret->data, len, bn)) { - goto err; - } - return ret; - -err: - if (ret != ai) { - ASN1_STRING_free(ret); - } - return NULL; -} - -ASN1_INTEGER *BN_to_ASN1_INTEGER(const BIGNUM *bn, ASN1_INTEGER *ai) { - return bn_to_asn1_string(bn, ai, V_ASN1_INTEGER); -} - -ASN1_ENUMERATED *BN_to_ASN1_ENUMERATED(const BIGNUM *bn, ASN1_ENUMERATED *ai) { - return bn_to_asn1_string(bn, ai, V_ASN1_ENUMERATED); -} - -static BIGNUM *asn1_string_to_bn(const ASN1_STRING *ai, BIGNUM *bn, int type) { - if ((ai->type & ~V_ASN1_NEG) != type) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_WRONG_INTEGER_TYPE); - return NULL; - } - - BIGNUM *ret; - if ((ret = BN_bin2bn(ai->data, ai->length, bn)) == NULL) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_BN_LIB); - } else if (ai->type & V_ASN1_NEG) { - BN_set_negative(ret, 1); - } - return ret; -} - -BIGNUM *ASN1_INTEGER_to_BN(const ASN1_INTEGER *ai, BIGNUM *bn) { - return asn1_string_to_bn(ai, bn, V_ASN1_INTEGER); -} - -BIGNUM *ASN1_ENUMERATED_to_BN(const ASN1_ENUMERATED *ai, BIGNUM *bn) { - return asn1_string_to_bn(ai, bn, V_ASN1_ENUMERATED); -} diff --git a/third_party/boringssl/src/crypto/asn1/a_int.cc b/third_party/boringssl/src/crypto/asn1/a_int.cc new file mode 100644 index 00000000..216c9daa --- /dev/null +++ 
b/third_party/boringssl/src/crypto/asn1/a_int.cc @@ -0,0 +1,467 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include "../internal.h" +#include "internal.h" + + +using namespace bssl; + +ASN1_INTEGER *ASN1_INTEGER_dup(const ASN1_INTEGER *x) { + return ASN1_STRING_dup(x); +} + +int ASN1_INTEGER_cmp(const ASN1_INTEGER *x, const ASN1_INTEGER *y) { + // Compare signs. + int neg = x->type & V_ASN1_NEG; + if (neg != (y->type & V_ASN1_NEG)) { + return neg ? -1 : 1; + } + + int ret = ASN1_STRING_cmp(x, y); + if (neg) { + // This could be |-ret|, but |ASN1_STRING_cmp| is not forbidden from + // returning |INT_MIN|. + if (ret < 0) { + return 1; + } else if (ret > 0) { + return -1; + } else { + return 0; + } + } + + return ret; +} + +// negate_twos_complement negates |len| bytes from |buf| in-place, interpreted +// as a signed, big-endian two's complement value. 
+static void negate_twos_complement(uint8_t *buf, size_t len) { + uint8_t borrow = 0; + for (size_t i = len - 1; i < len; i--) { + uint8_t t = buf[i]; + buf[i] = 0u - borrow - t; + borrow |= t != 0; + } +} + +static int is_all_zeros(const uint8_t *in, size_t len) { + for (size_t i = 0; i < len; i++) { + if (in[i] != 0) { + return 0; + } + } + return 1; +} + +int bssl::asn1_marshal_integer(CBB *out, const ASN1_INTEGER *in, + CBS_ASN1_TAG tag) { + int len = i2c_ASN1_INTEGER(in, nullptr); + if (len <= 0) { + return 0; + } + tag = tag == 0 ? CBS_ASN1_INTEGER : tag; + CBB child; + uint8_t *ptr; + return CBB_add_asn1(out, &child, tag) && // + CBB_add_space(&child, &ptr, static_cast(len)) && // + i2c_ASN1_INTEGER(in, &ptr) == len && // + CBB_flush(out); +} + +int i2c_ASN1_INTEGER(const ASN1_INTEGER *in, unsigned char **outp) { + if (in == nullptr) { + return 0; + } + + // |ASN1_INTEGER|s should be represented minimally, but it is possible to + // construct invalid ones. Skip leading zeros so this does not produce an + // invalid encoding or break invariants. + CBS cbs; + CBS_init(&cbs, in->data, in->length); + while (CBS_len(&cbs) > 0 && CBS_data(&cbs)[0] == 0) { + CBS_skip(&cbs, 1); + } + + int is_negative = (in->type & V_ASN1_NEG) != 0; + size_t pad; + CBS copy = cbs; + uint8_t msb; + if (!CBS_get_u8(©, &msb)) { + // Zero is represented as a single byte. + is_negative = 0; + pad = 1; + } else if (is_negative) { + // 0x80...01 through 0xff...ff have a two's complement of 0x7f...ff + // through 0x00...01 and need an extra byte to be negative. + // 0x01...00 through 0x80...00 have a two's complement of 0xfe...ff + // through 0x80...00 and can be negated as-is. + pad = msb > 0x80 || + (msb == 0x80 && !is_all_zeros(CBS_data(©), CBS_len(©))); + } else { + // If the high bit is set, the signed representation needs an extra + // byte to be positive. 
+ pad = (msb & 0x80) != 0; + } + + if (CBS_len(&cbs) > INT_MAX - pad) { + OPENSSL_PUT_ERROR(ASN1, ERR_R_OVERFLOW); + return 0; + } + int len = (int)(pad + CBS_len(&cbs)); + assert(len > 0); + if (outp == nullptr) { + return len; + } + + if (pad) { + (*outp)[0] = 0; + } + OPENSSL_memcpy(*outp + pad, CBS_data(&cbs), CBS_len(&cbs)); + if (is_negative) { + negate_twos_complement(*outp, len); + assert((*outp)[0] >= 0x80); + } else { + assert((*outp)[0] < 0x80); + } + *outp += len; + return len; +} + +static int asn1_parse_integer_contents(Span in, + ASN1_INTEGER *out) { + CBS cbs = in; + int is_negative; + if (!CBS_is_valid_asn1_integer(&cbs, &is_negative)) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_INTEGER); + return 0; + } + + // Convert to |ASN1_INTEGER|'s sign-and-magnitude representation. First, + // determine the size needed for a minimal result. + if (is_negative) { + // 0xff00...01 through 0xff7f..ff have a two's complement of 0x00ff...ff + // through 0x000100...001 and need one leading zero removed. 0x8000...00 + // through 0xff00...00 have a two's complement of 0x8000...00 through + // 0x0100...00 and will be minimally-encoded as-is. + if (CBS_len(&cbs) > 0 && CBS_data(&cbs)[0] == 0xff && + !is_all_zeros(CBS_data(&cbs) + 1, CBS_len(&cbs) - 1)) { + CBS_skip(&cbs, 1); + } + } else { + // Remove the leading zero byte, if any. + if (CBS_len(&cbs) > 0 && CBS_data(&cbs)[0] == 0x00) { + CBS_skip(&cbs, 1); + } + } + + if (!ASN1_STRING_set(out, CBS_data(&cbs), CBS_len(&cbs))) { + return 0; + } + + if (is_negative) { + out->type = V_ASN1_NEG_INTEGER; + negate_twos_complement(out->data, out->length); + } else { + out->type = V_ASN1_INTEGER; + } + + // The value should be minimally-encoded. + assert(out->length == 0 || out->data[0] != 0); + // Zero is not negative. + assert(!is_negative || out->length > 0); + return 1; +} + +int bssl::asn1_parse_integer(CBS *cbs, ASN1_INTEGER *out, CBS_ASN1_TAG tag) { + tag = tag == 0 ? 
CBS_ASN1_INTEGER : tag; + CBS child; + if (!CBS_get_asn1(cbs, &child, tag)) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_DECODE_ERROR); + return 0; + } + return asn1_parse_integer_contents(child, out); +} + +int bssl::asn1_parse_enumerated(CBS *cbs, ASN1_ENUMERATED *out, + CBS_ASN1_TAG tag) { + tag = tag == 0 ? CBS_ASN1_ENUMERATED : tag; + if (!asn1_parse_integer(cbs, out, tag)) { + return 0; + } + // Fix the type value. + out->type = + (out->type & V_ASN1_NEG) ? V_ASN1_NEG_ENUMERATED : V_ASN1_ENUMERATED; + return 1; +} + +ASN1_INTEGER *c2i_ASN1_INTEGER(ASN1_INTEGER **out, const unsigned char **inp, + long len) { + if (len < 0) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_STRING_TOO_SHORT); + return nullptr; + } + + ASN1_INTEGER *ret = nullptr; + if (out == nullptr || *out == nullptr) { + ret = ASN1_INTEGER_new(); + if (ret == nullptr) { + return nullptr; + } + } else { + ret = *out; + } + + if (!asn1_parse_integer_contents(Span(*inp, len), ret)) { + if (ret != nullptr && (out == nullptr || *out != ret)) { + ASN1_INTEGER_free(ret); + } + return nullptr; + } + + *inp += len; + if (out != nullptr) { + *out = ret; + } + return ret; + +} + +int ASN1_INTEGER_set_int64(ASN1_INTEGER *a, int64_t v) { + if (v >= 0) { + return ASN1_INTEGER_set_uint64(a, (uint64_t)v); + } + + if (!ASN1_INTEGER_set_uint64(a, 0 - (uint64_t)v)) { + return 0; + } + + a->type = V_ASN1_NEG_INTEGER; + return 1; +} + +int ASN1_ENUMERATED_set_int64(ASN1_ENUMERATED *a, int64_t v) { + if (v >= 0) { + return ASN1_ENUMERATED_set_uint64(a, (uint64_t)v); + } + + if (!ASN1_ENUMERATED_set_uint64(a, 0 - (uint64_t)v)) { + return 0; + } + + a->type = V_ASN1_NEG_ENUMERATED; + return 1; +} + +int ASN1_INTEGER_set(ASN1_INTEGER *a, long v) { + static_assert(sizeof(long) <= sizeof(int64_t), "long fits in int64_t"); + return ASN1_INTEGER_set_int64(a, v); +} + +int ASN1_ENUMERATED_set(ASN1_ENUMERATED *a, long v) { + static_assert(sizeof(long) <= sizeof(int64_t), "long fits in int64_t"); + return ASN1_ENUMERATED_set_int64(a, v); +} + 
+static int asn1_string_set_uint64(ASN1_STRING *out, uint64_t v, int type) { + uint8_t buf[sizeof(uint64_t)]; + CRYPTO_store_u64_be(buf, v); + size_t leading_zeros; + for (leading_zeros = 0; leading_zeros < sizeof(buf); leading_zeros++) { + if (buf[leading_zeros] != 0) { + break; + } + } + + if (!ASN1_STRING_set(out, buf + leading_zeros, sizeof(buf) - leading_zeros)) { + return 0; + } + out->type = type; + return 1; +} + +int ASN1_INTEGER_set_uint64(ASN1_INTEGER *out, uint64_t v) { + return asn1_string_set_uint64(out, v, V_ASN1_INTEGER); +} + +int ASN1_ENUMERATED_set_uint64(ASN1_ENUMERATED *out, uint64_t v) { + return asn1_string_set_uint64(out, v, V_ASN1_ENUMERATED); +} + +static int asn1_string_get_abs_uint64(uint64_t *out, const ASN1_STRING *a, + int type) { + if ((a->type & ~V_ASN1_NEG) != type) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_WRONG_INTEGER_TYPE); + return 0; + } + uint8_t buf[sizeof(uint64_t)] = {0}; + if (a->length > (int)sizeof(buf)) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_INTEGER); + return 0; + } + OPENSSL_memcpy(buf + sizeof(buf) - a->length, a->data, a->length); + *out = CRYPTO_load_u64_be(buf); + return 1; +} + +static int asn1_string_get_uint64(uint64_t *out, const ASN1_STRING *a, + int type) { + if (!asn1_string_get_abs_uint64(out, a, type)) { + return 0; + } + if (a->type & V_ASN1_NEG) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_INTEGER); + return 0; + } + return 1; +} + +int ASN1_INTEGER_get_uint64(uint64_t *out, const ASN1_INTEGER *a) { + return asn1_string_get_uint64(out, a, V_ASN1_INTEGER); +} + +int ASN1_ENUMERATED_get_uint64(uint64_t *out, const ASN1_ENUMERATED *a) { + return asn1_string_get_uint64(out, a, V_ASN1_ENUMERATED); +} + +static int asn1_string_get_int64(int64_t *out, const ASN1_STRING *a, int type) { + uint64_t v; + if (!asn1_string_get_abs_uint64(&v, a, type)) { + return 0; + } + int64_t i64; + int fits_in_i64; + // Check |v != 0| to handle manually-constructed negative zeros. 
+ if ((a->type & V_ASN1_NEG) && v != 0) { + i64 = (int64_t)(0u - v); + fits_in_i64 = i64 < 0; + } else { + i64 = (int64_t)v; + fits_in_i64 = i64 >= 0; + } + if (!fits_in_i64) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_INTEGER); + return 0; + } + *out = i64; + return 1; +} + +int ASN1_INTEGER_get_int64(int64_t *out, const ASN1_INTEGER *a) { + return asn1_string_get_int64(out, a, V_ASN1_INTEGER); +} + +int ASN1_ENUMERATED_get_int64(int64_t *out, const ASN1_ENUMERATED *a) { + return asn1_string_get_int64(out, a, V_ASN1_ENUMERATED); +} + +static long asn1_string_get_long(const ASN1_STRING *a, int type) { + if (a == nullptr) { + return 0; + } + + int64_t v; + if (!asn1_string_get_int64(&v, a, type) || // + v < LONG_MIN || v > LONG_MAX) { + // This function's return value does not distinguish overflow from -1. + ERR_clear_error(); + return -1; + } + + return (long)v; +} + +long ASN1_INTEGER_get(const ASN1_INTEGER *a) { + return asn1_string_get_long(a, V_ASN1_INTEGER); +} + +long ASN1_ENUMERATED_get(const ASN1_ENUMERATED *a) { + return asn1_string_get_long(a, V_ASN1_ENUMERATED); +} + +static ASN1_STRING *bn_to_asn1_string(const BIGNUM *bn, ASN1_STRING *ai, + int type) { + ASN1_INTEGER *ret; + if (ai == nullptr) { + ret = ASN1_STRING_type_new(type); + } else { + ret = ai; + } + int len; + if (ret == nullptr) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_NESTED_ASN1_ERROR); + goto err; + } + + if (BN_is_negative(bn) && !BN_is_zero(bn)) { + ret->type = type | V_ASN1_NEG; + } else { + ret->type = type; + } + + len = BN_num_bytes(bn); + if (!ASN1_STRING_set(ret, nullptr, len) || + !BN_bn2bin_padded(ret->data, len, bn)) { + goto err; + } + return ret; + +err: + if (ret != ai) { + ASN1_STRING_free(ret); + } + return nullptr; +} + +ASN1_INTEGER *BN_to_ASN1_INTEGER(const BIGNUM *bn, ASN1_INTEGER *ai) { + return bn_to_asn1_string(bn, ai, V_ASN1_INTEGER); +} + +ASN1_ENUMERATED *BN_to_ASN1_ENUMERATED(const BIGNUM *bn, ASN1_ENUMERATED *ai) { + return bn_to_asn1_string(bn, ai, V_ASN1_ENUMERATED); 
+} + +static BIGNUM *asn1_string_to_bn(const ASN1_STRING *ai, BIGNUM *bn, int type) { + if ((ai->type & ~V_ASN1_NEG) != type) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_WRONG_INTEGER_TYPE); + return nullptr; + } + + BIGNUM *ret; + if ((ret = BN_bin2bn(ai->data, ai->length, bn)) == nullptr) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_BN_LIB); + } else if (ai->type & V_ASN1_NEG) { + BN_set_negative(ret, 1); + } + return ret; +} + +BIGNUM *ASN1_INTEGER_to_BN(const ASN1_INTEGER *ai, BIGNUM *bn) { + return asn1_string_to_bn(ai, bn, V_ASN1_INTEGER); +} + +BIGNUM *ASN1_ENUMERATED_to_BN(const ASN1_ENUMERATED *ai, BIGNUM *bn) { + return asn1_string_to_bn(ai, bn, V_ASN1_ENUMERATED); +} diff --git a/third_party/boringssl/src/crypto/asn1/a_mbstr.c b/third_party/boringssl/src/crypto/asn1/a_mbstr.c deleted file mode 100644 index c53d6d58..00000000 --- a/third_party/boringssl/src/crypto/asn1/a_mbstr.c +++ /dev/null @@ -1,293 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. 
this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include -#include - -#include -#include -#include - -#include "../bytestring/internal.h" -#include "internal.h" - -// These functions take a string in UTF8, ASCII or multibyte form and a mask -// of permissible ASN1 string types. It then works out the minimal type -// (using the order Printable < IA5 < T61 < BMP < Universal < UTF8) and -// creates a string of the correct type with the supplied data. Yes this is -// horrible: it has to be :-( The 'ncopy' form checks minimum and maximum -// size limits too. - -int ASN1_mbstring_copy(ASN1_STRING **out, const unsigned char *in, int len, - int inform, unsigned long mask) { - return ASN1_mbstring_ncopy(out, in, len, inform, mask, 0, 0); -} - -OPENSSL_DECLARE_ERROR_REASON(ASN1, INVALID_BMPSTRING) -OPENSSL_DECLARE_ERROR_REASON(ASN1, INVALID_UNIVERSALSTRING) -OPENSSL_DECLARE_ERROR_REASON(ASN1, INVALID_UTF8STRING) - -int ASN1_mbstring_ncopy(ASN1_STRING **out, const unsigned char *in, int len, - int inform, unsigned long mask, long minsize, - long maxsize) { - int str_type; - char free_out; - ASN1_STRING *dest; - size_t nchar = 0; - char strbuf[32]; - if (len == -1) { - len = strlen((const char *)in); - } - if (!mask) { - mask = DIRSTRING_TYPE; - } - - int (*decode_func)(CBS *, uint32_t *); - int error; - switch (inform) { - case MBSTRING_BMP: - decode_func = cbs_get_ucs2_be; - error = ASN1_R_INVALID_BMPSTRING; - break; - - case MBSTRING_UNIV: - decode_func = cbs_get_utf32_be; - error = ASN1_R_INVALID_UNIVERSALSTRING; - break; - - case MBSTRING_UTF8: - decode_func = cbs_get_utf8; - error = ASN1_R_INVALID_UTF8STRING; - break; - - case MBSTRING_ASC: - decode_func = cbs_get_latin1; - error = ERR_R_INTERNAL_ERROR; // Latin-1 inputs are never invalid. 
- break; - - default: - OPENSSL_PUT_ERROR(ASN1, ASN1_R_UNKNOWN_FORMAT); - return -1; - } - - // Check |minsize| and |maxsize| and work out the minimal type, if any. - CBS cbs; - CBS_init(&cbs, in, len); - size_t utf8_len = 0; - while (CBS_len(&cbs) != 0) { - uint32_t c; - if (!decode_func(&cbs, &c)) { - OPENSSL_PUT_ERROR(ASN1, error); - return -1; - } - if (nchar == 0 && (inform == MBSTRING_BMP || inform == MBSTRING_UNIV) && - c == 0xfeff) { - // Reject byte-order mark. We could drop it but that would mean - // adding ambiguity around whether a BOM was included or not when - // matching strings. - // - // For a little-endian UCS-2 string, the BOM will appear as 0xfffe - // and will be rejected as noncharacter, below. - OPENSSL_PUT_ERROR(ASN1, ASN1_R_ILLEGAL_CHARACTERS); - return -1; - } - - // Update which output formats are still possible. - if ((mask & B_ASN1_PRINTABLESTRING) && !asn1_is_printable(c)) { - mask &= ~B_ASN1_PRINTABLESTRING; - } - if ((mask & B_ASN1_IA5STRING) && (c > 127)) { - mask &= ~B_ASN1_IA5STRING; - } - if ((mask & B_ASN1_T61STRING) && (c > 0xff)) { - mask &= ~B_ASN1_T61STRING; - } - if ((mask & B_ASN1_BMPSTRING) && (c > 0xffff)) { - mask &= ~B_ASN1_BMPSTRING; - } - if (!mask) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_ILLEGAL_CHARACTERS); - return -1; - } - - nchar++; - utf8_len += cbb_get_utf8_len(c); - } - - if (minsize > 0 && nchar < (size_t)minsize) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_STRING_TOO_SHORT); - BIO_snprintf(strbuf, sizeof strbuf, "%ld", minsize); - ERR_add_error_data(2, "minsize=", strbuf); - return -1; - } - - if (maxsize > 0 && nchar > (size_t)maxsize) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_STRING_TOO_LONG); - BIO_snprintf(strbuf, sizeof strbuf, "%ld", maxsize); - ERR_add_error_data(2, "maxsize=", strbuf); - return -1; - } - - // Now work out output format and string type - int (*encode_func)(CBB *, uint32_t) = cbb_add_latin1; - size_t size_estimate = nchar; - int outform = MBSTRING_ASC; - if (mask & B_ASN1_PRINTABLESTRING) { - str_type = 
V_ASN1_PRINTABLESTRING; - } else if (mask & B_ASN1_IA5STRING) { - str_type = V_ASN1_IA5STRING; - } else if (mask & B_ASN1_T61STRING) { - str_type = V_ASN1_T61STRING; - } else if (mask & B_ASN1_BMPSTRING) { - str_type = V_ASN1_BMPSTRING; - outform = MBSTRING_BMP; - encode_func = cbb_add_ucs2_be; - size_estimate = 2 * nchar; - } else if (mask & B_ASN1_UNIVERSALSTRING) { - str_type = V_ASN1_UNIVERSALSTRING; - encode_func = cbb_add_utf32_be; - size_estimate = 4 * nchar; - outform = MBSTRING_UNIV; - } else if (mask & B_ASN1_UTF8STRING) { - str_type = V_ASN1_UTF8STRING; - outform = MBSTRING_UTF8; - encode_func = cbb_add_utf8; - size_estimate = utf8_len; - } else { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_ILLEGAL_CHARACTERS); - return -1; - } - - if (!out) { - return str_type; - } - if (*out) { - free_out = 0; - dest = *out; - if (dest->data) { - dest->length = 0; - OPENSSL_free(dest->data); - dest->data = NULL; - } - dest->type = str_type; - } else { - free_out = 1; - dest = ASN1_STRING_type_new(str_type); - if (!dest) { - OPENSSL_PUT_ERROR(ASN1, ERR_R_MALLOC_FAILURE); - return -1; - } - *out = dest; - } - - // If both the same type just copy across - if (inform == outform) { - if (!ASN1_STRING_set(dest, in, len)) { - OPENSSL_PUT_ERROR(ASN1, ERR_R_MALLOC_FAILURE); - return -1; - } - return str_type; - } - - CBB cbb; - if (!CBB_init(&cbb, size_estimate + 1)) { - OPENSSL_PUT_ERROR(ASN1, ERR_R_MALLOC_FAILURE); - goto err; - } - CBS_init(&cbs, in, len); - while (CBS_len(&cbs) != 0) { - uint32_t c; - if (!decode_func(&cbs, &c) || !encode_func(&cbb, c)) { - OPENSSL_PUT_ERROR(ASN1, ERR_R_INTERNAL_ERROR); - goto err; - } - } - uint8_t *data = NULL; - size_t data_len; - if (// OpenSSL historically NUL-terminated this value with a single byte, - // even for |MBSTRING_BMP| and |MBSTRING_UNIV|. 
- !CBB_add_u8(&cbb, 0) || !CBB_finish(&cbb, &data, &data_len) || - data_len < 1 || data_len > INT_MAX) { - OPENSSL_PUT_ERROR(ASN1, ERR_R_INTERNAL_ERROR); - OPENSSL_free(data); - goto err; - } - dest->length = (int)(data_len - 1); - dest->data = data; - return str_type; - -err: - if (free_out) { - ASN1_STRING_free(dest); - } - CBB_cleanup(&cbb); - return -1; -} - -int asn1_is_printable(uint32_t value) { - if (value > 0x7f) { - return 0; - } - // Note we cannot use |isalnum| because it is locale-dependent. - return ('a' <= value && value <= 'z') || // - ('A' <= value && value <= 'Z') || // - ('0' <= value && value <= '9') || // - value == ' ' || value == '\'' || value == '(' || value == ')' || - value == '+' || value == ',' || value == '-' || value == '.' || - value == '/' || value == ':' || value == '=' || value == '?'; -} diff --git a/third_party/boringssl/src/crypto/asn1/a_mbstr.cc b/third_party/boringssl/src/crypto/asn1/a_mbstr.cc new file mode 100644 index 00000000..c70bb3b5 --- /dev/null +++ b/third_party/boringssl/src/crypto/asn1/a_mbstr.cc @@ -0,0 +1,242 @@ +// Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + +#include +#include +#include + +#include "../bytestring/internal.h" +#include "internal.h" + + +using namespace bssl; + +// These functions take a string in UTF8, ASCII or multibyte form and a mask +// of permissible ASN1 string types. 
It then works out the minimal type +// (using the order Printable < IA5 < T61 < BMP < Universal < UTF8) and +// creates a string of the correct type with the supplied data. Yes this is +// horrible: it has to be :-( The 'ncopy' form checks minimum and maximum +// size limits too. + +int ASN1_mbstring_copy(ASN1_STRING **out, const unsigned char *in, + ossl_ssize_t len, int inform, unsigned long mask) { + return ASN1_mbstring_ncopy(out, in, len, inform, mask, /*minsize=*/0, + /*maxsize=*/0); +} + +OPENSSL_DECLARE_ERROR_REASON(ASN1, INVALID_BMPSTRING) +OPENSSL_DECLARE_ERROR_REASON(ASN1, INVALID_UNIVERSALSTRING) +OPENSSL_DECLARE_ERROR_REASON(ASN1, INVALID_UTF8STRING) + +int ASN1_mbstring_ncopy(ASN1_STRING **out, const unsigned char *in, + ossl_ssize_t len, int inform, unsigned long mask, + ossl_ssize_t minsize, ossl_ssize_t maxsize) { + if (len == -1) { + len = strlen((const char *)in); + } + if (!mask) { + mask = DIRSTRING_TYPE; + } + + int (*decode_func)(CBS *, uint32_t *); + int error; + switch (inform) { + case MBSTRING_BMP: + decode_func = CBS_get_ucs2_be; + error = ASN1_R_INVALID_BMPSTRING; + break; + + case MBSTRING_UNIV: + decode_func = CBS_get_utf32_be; + error = ASN1_R_INVALID_UNIVERSALSTRING; + break; + + case MBSTRING_UTF8: + decode_func = CBS_get_utf8; + error = ASN1_R_INVALID_UTF8STRING; + break; + + case MBSTRING_ASC: + decode_func = CBS_get_latin1; + error = ERR_R_INTERNAL_ERROR; // Latin-1 inputs are never invalid. + break; + + default: + OPENSSL_PUT_ERROR(ASN1, ASN1_R_UNKNOWN_FORMAT); + return -1; + } + + // Check |minsize| and |maxsize| and work out the minimal type, if any. + CBS cbs; + CBS_init(&cbs, in, len); + size_t utf8_len = 0, nchar = 0; + while (CBS_len(&cbs) != 0) { + uint32_t c; + if (!decode_func(&cbs, &c)) { + OPENSSL_PUT_ERROR(ASN1, error); + return -1; + } + if (nchar == 0 && (inform == MBSTRING_BMP || inform == MBSTRING_UNIV) && + c == 0xfeff) { + // Reject byte-order mark. 
We could drop it but that would mean + // adding ambiguity around whether a BOM was included or not when + // matching strings. + // + // For a little-endian UCS-2 string, the BOM will appear as 0xfffe + // and will be rejected as noncharacter, below. + OPENSSL_PUT_ERROR(ASN1, ASN1_R_ILLEGAL_CHARACTERS); + return -1; + } + + // Update which output formats are still possible. + if ((mask & B_ASN1_PRINTABLESTRING) && !asn1_is_printable(c)) { + mask &= ~B_ASN1_PRINTABLESTRING; + } + if ((mask & B_ASN1_IA5STRING) && (c > 127)) { + mask &= ~B_ASN1_IA5STRING; + } + if ((mask & B_ASN1_T61STRING) && (c > 0xff)) { + mask &= ~B_ASN1_T61STRING; + } + if ((mask & B_ASN1_BMPSTRING) && (c > 0xffff)) { + mask &= ~B_ASN1_BMPSTRING; + } + if (!mask) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_ILLEGAL_CHARACTERS); + return -1; + } + + nchar++; + utf8_len += CBB_get_utf8_len(c); + if (maxsize > 0 && nchar > (size_t)maxsize) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_STRING_TOO_LONG); + ERR_add_error_dataf("maxsize=%zu", (size_t)maxsize); + return -1; + } + } + + if (minsize > 0 && nchar < (size_t)minsize) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_STRING_TOO_SHORT); + ERR_add_error_dataf("minsize=%zu", (size_t)minsize); + return -1; + } + + // Now work out output format and string type + int str_type; + int (*encode_func)(CBB *, uint32_t) = CBB_add_latin1; + size_t size_estimate = nchar; + int outform = MBSTRING_ASC; + if (mask & B_ASN1_PRINTABLESTRING) { + str_type = V_ASN1_PRINTABLESTRING; + } else if (mask & B_ASN1_IA5STRING) { + str_type = V_ASN1_IA5STRING; + } else if (mask & B_ASN1_T61STRING) { + str_type = V_ASN1_T61STRING; + } else if (mask & B_ASN1_BMPSTRING) { + str_type = V_ASN1_BMPSTRING; + outform = MBSTRING_BMP; + encode_func = CBB_add_ucs2_be; + size_estimate = 2 * nchar; + } else if (mask & B_ASN1_UNIVERSALSTRING) { + str_type = V_ASN1_UNIVERSALSTRING; + encode_func = CBB_add_utf32_be; + size_estimate = 4 * nchar; + outform = MBSTRING_UNIV; + } else if (mask & B_ASN1_UTF8STRING) { + str_type 
= V_ASN1_UTF8STRING; + outform = MBSTRING_UTF8; + encode_func = CBB_add_utf8; + size_estimate = utf8_len; + } else { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_ILLEGAL_CHARACTERS); + return -1; + } + + if (!out) { + return str_type; + } + + int free_dest = 0; + ASN1_STRING *dest; + if (*out) { + dest = *out; + } else { + free_dest = 1; + dest = ASN1_STRING_type_new(str_type); + if (!dest) { + return -1; + } + } + + CBB cbb; + CBB_zero(&cbb); + // If both the same type just copy across + uint8_t *data = nullptr; + size_t data_len = 0; + if (inform == outform) { + if (!ASN1_STRING_set(dest, in, len)) { + goto err; + } + dest->type = str_type; + *out = dest; + return str_type; + } + if (!CBB_init(&cbb, size_estimate + 1)) { + goto err; + } + CBS_init(&cbs, in, len); + while (CBS_len(&cbs) != 0) { + uint32_t c; + if (!decode_func(&cbs, &c) || !encode_func(&cbb, c)) { + OPENSSL_PUT_ERROR(ASN1, ERR_R_INTERNAL_ERROR); + goto err; + } + } + if (/* OpenSSL historically NUL-terminated this value with a single byte, + * even for |MBSTRING_BMP| and |MBSTRING_UNIV|. */ + !CBB_add_u8(&cbb, 0) || // + !CBB_finish(&cbb, &data, &data_len) || // + data_len < 1 || // + data_len > INT_MAX) { + OPENSSL_PUT_ERROR(ASN1, ERR_R_INTERNAL_ERROR); + OPENSSL_free(data); + goto err; + } + dest->type = str_type; + ASN1_STRING_set0(dest, data, (int)data_len - 1); + *out = dest; + return str_type; + +err: + if (free_dest) { + ASN1_STRING_free(dest); + } + CBB_cleanup(&cbb); + return -1; +} + +int bssl::asn1_is_printable(uint32_t value) { + if (value > 0x7f) { + return 0; + } + return OPENSSL_isalnum(value) || // + value == ' ' || value == '\'' || value == '(' || value == ')' || + value == '+' || value == ',' || value == '-' || value == '.' 
|| + value == '/' || value == ':' || value == '=' || value == '?'; +} diff --git a/third_party/boringssl/src/crypto/asn1/a_object.c b/third_party/boringssl/src/crypto/asn1/a_object.c deleted file mode 100644 index c854fb88..00000000 --- a/third_party/boringssl/src/crypto/asn1/a_object.c +++ /dev/null @@ -1,290 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. 
All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
*/ - -#include - -#include -#include - -#include -#include -#include - -#include "../internal.h" -#include "internal.h" - - -int i2d_ASN1_OBJECT(const ASN1_OBJECT *a, unsigned char **pp) { - if (a == NULL) { - OPENSSL_PUT_ERROR(ASN1, ERR_R_PASSED_NULL_PARAMETER); - return -1; - } - - if (a->length == 0) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_ILLEGAL_OBJECT); - return -1; - } - - int objsize = ASN1_object_size(0, a->length, V_ASN1_OBJECT); - if (pp == NULL || objsize == -1) { - return objsize; - } - - unsigned char *p, *allocated = NULL; - if (*pp == NULL) { - if ((p = allocated = OPENSSL_malloc(objsize)) == NULL) { - OPENSSL_PUT_ERROR(ASN1, ERR_R_MALLOC_FAILURE); - return -1; - } - } else { - p = *pp; - } - - ASN1_put_object(&p, 0, a->length, V_ASN1_OBJECT, V_ASN1_UNIVERSAL); - OPENSSL_memcpy(p, a->data, a->length); - - // If a new buffer was allocated, just return it back. - // If not, return the incremented buffer pointer. - *pp = allocated != NULL ? allocated : p + a->length; - return objsize; -} - -int i2t_ASN1_OBJECT(char *buf, int buf_len, const ASN1_OBJECT *a) { - return OBJ_obj2txt(buf, buf_len, a, 0); -} - -static int write_str(BIO *bp, const char *str) { - int len = strlen(str); - return BIO_write(bp, str, len) == len ? len : -1; -} - -int i2a_ASN1_OBJECT(BIO *bp, const ASN1_OBJECT *a) { - if (a == NULL || a->data == NULL) { - return write_str(bp, "NULL"); - } - - char buf[80], *allocated = NULL; - const char *str = buf; - int len = i2t_ASN1_OBJECT(buf, sizeof(buf), a); - if (len > (int)sizeof(buf) - 1) { - // The input was truncated. Allocate a buffer that fits. 
- allocated = OPENSSL_malloc(len + 1); - if (allocated == NULL) { - return -1; - } - len = i2t_ASN1_OBJECT(allocated, len + 1, a); - str = allocated; - } - if (len <= 0) { - str = ""; - } - - int ret = write_str(bp, str); - OPENSSL_free(allocated); - return ret; -} - -ASN1_OBJECT *d2i_ASN1_OBJECT(ASN1_OBJECT **a, const unsigned char **pp, - long length) { - long len; - int tag, xclass; - const unsigned char *p = *pp; - int inf = ASN1_get_object(&p, &len, &tag, &xclass, length); - if (inf & 0x80) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_BAD_OBJECT_HEADER); - return NULL; - } - - if (inf & V_ASN1_CONSTRUCTED) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_TYPE_NOT_PRIMITIVE); - return NULL; - } - - if (tag != V_ASN1_OBJECT || xclass != V_ASN1_UNIVERSAL) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_EXPECTING_AN_OBJECT); - return NULL; - } - ASN1_OBJECT *ret = c2i_ASN1_OBJECT(a, &p, len); - if (ret) { - *pp = p; - } - return ret; -} - -ASN1_OBJECT *c2i_ASN1_OBJECT(ASN1_OBJECT **a, const unsigned char **pp, - long len) { - ASN1_OBJECT *ret = NULL; - const unsigned char *p; - unsigned char *data; - int i, length; - - // Sanity check OID encoding. Need at least one content octet. MSB must - // be clear in the last octet. can't have leading 0x80 in subidentifiers, - // see: X.690 8.19.2 - if (len <= 0 || len > INT_MAX || pp == NULL || (p = *pp) == NULL || - p[len - 1] & 0x80) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_OBJECT_ENCODING); - return NULL; - } - // Now 0 < len <= INT_MAX, so the cast is safe. 
- length = (int)len; - for (i = 0; i < length; i++, p++) { - if (*p == 0x80 && (!i || !(p[-1] & 0x80))) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_OBJECT_ENCODING); - return NULL; - } - } - - if ((a == NULL) || ((*a) == NULL) || - !((*a)->flags & ASN1_OBJECT_FLAG_DYNAMIC)) { - if ((ret = ASN1_OBJECT_new()) == NULL) { - return NULL; - } - } else { - ret = (*a); - } - - p = *pp; - // detach data from object - data = (unsigned char *)ret->data; - ret->data = NULL; - // once detached we can change it - if ((data == NULL) || (ret->length < length)) { - ret->length = 0; - OPENSSL_free(data); - data = (unsigned char *)OPENSSL_malloc(length); - if (data == NULL) { - i = ERR_R_MALLOC_FAILURE; - goto err; - } - ret->flags |= ASN1_OBJECT_FLAG_DYNAMIC_DATA; - } - OPENSSL_memcpy(data, p, length); - // If there are dynamic strings, free them here, and clear the flag - if ((ret->flags & ASN1_OBJECT_FLAG_DYNAMIC_STRINGS) != 0) { - OPENSSL_free((char *)ret->sn); - OPENSSL_free((char *)ret->ln); - ret->flags &= ~ASN1_OBJECT_FLAG_DYNAMIC_STRINGS; - } - // reattach data to object, after which it remains const - ret->data = data; - ret->length = length; - ret->sn = NULL; - ret->ln = NULL; - p += length; - - if (a != NULL) { - (*a) = ret; - } - *pp = p; - return ret; -err: - OPENSSL_PUT_ERROR(ASN1, i); - if ((ret != NULL) && ((a == NULL) || (*a != ret))) { - ASN1_OBJECT_free(ret); - } - return NULL; -} - -ASN1_OBJECT *ASN1_OBJECT_new(void) { - ASN1_OBJECT *ret; - - ret = (ASN1_OBJECT *)OPENSSL_malloc(sizeof(ASN1_OBJECT)); - if (ret == NULL) { - OPENSSL_PUT_ERROR(ASN1, ERR_R_MALLOC_FAILURE); - return NULL; - } - ret->length = 0; - ret->data = NULL; - ret->nid = 0; - ret->sn = NULL; - ret->ln = NULL; - ret->flags = ASN1_OBJECT_FLAG_DYNAMIC; - return ret; -} - -void ASN1_OBJECT_free(ASN1_OBJECT *a) { - if (a == NULL) { - return; - } - if (a->flags & ASN1_OBJECT_FLAG_DYNAMIC_STRINGS) { - OPENSSL_free((void *)a->sn); - OPENSSL_free((void *)a->ln); - a->sn = a->ln = NULL; - } - if (a->flags & 
ASN1_OBJECT_FLAG_DYNAMIC_DATA) { - OPENSSL_free((void *)a->data); - a->data = NULL; - a->length = 0; - } - if (a->flags & ASN1_OBJECT_FLAG_DYNAMIC) { - OPENSSL_free(a); - } -} - -ASN1_OBJECT *ASN1_OBJECT_create(int nid, const unsigned char *data, int len, - const char *sn, const char *ln) { - ASN1_OBJECT o; - - o.sn = sn; - o.ln = ln; - o.data = data; - o.nid = nid; - o.length = len; - o.flags = ASN1_OBJECT_FLAG_DYNAMIC | ASN1_OBJECT_FLAG_DYNAMIC_STRINGS | - ASN1_OBJECT_FLAG_DYNAMIC_DATA; - return (OBJ_dup(&o)); -} diff --git a/third_party/boringssl/src/crypto/asn1/a_object.cc b/third_party/boringssl/src/crypto/asn1/a_object.cc new file mode 100644 index 00000000..125058ab --- /dev/null +++ b/third_party/boringssl/src/crypto/asn1/a_object.cc @@ -0,0 +1,191 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + +#include +#include +#include +#include + +#include "../bytestring/internal.h" +#include "../internal.h" +#include "../mem_internal.h" +#include "internal.h" + + +using namespace bssl; + +int bssl::asn1_marshal_object(CBB *out, const ASN1_OBJECT *in, + CBS_ASN1_TAG tag) { + if (in == nullptr) { + OPENSSL_PUT_ERROR(ASN1, ERR_R_PASSED_NULL_PARAMETER); + return 0; + } + + if (in->length <= 0) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_ILLEGAL_OBJECT); + return 0; + } + + tag = tag == 0 ? 
CBS_ASN1_OBJECT : tag; + return CBB_add_asn1_element(out, tag, in->data, in->length); +} + +int i2d_ASN1_OBJECT(const ASN1_OBJECT *in, unsigned char **outp) { + return I2DFromCBB( + /*initial_capacity=*/static_cast(in->length) + 2, outp, + [&](CBB *cbb) -> bool { + return asn1_marshal_object(cbb, in, /*tag=*/0); + }); +} + +int i2t_ASN1_OBJECT(char *buf, int buf_len, const ASN1_OBJECT *a) { + return OBJ_obj2txt(buf, buf_len, a, 0); +} + +static int write_str(BIO *bp, const char *str) { + size_t len = strlen(str); + if (len > INT_MAX) { + OPENSSL_PUT_ERROR(ASN1, ERR_R_OVERFLOW); + return -1; + } + return BIO_write(bp, str, (int)len) == (int)len ? (int)len : -1; +} + +int i2a_ASN1_OBJECT(BIO *bp, const ASN1_OBJECT *a) { + if (a == nullptr || a->data == nullptr) { + return write_str(bp, "NULL"); + } + + char buf[80], *allocated = nullptr; + const char *str = buf; + int len = i2t_ASN1_OBJECT(buf, sizeof(buf), a); + if (len > (int)sizeof(buf) - 1) { + // The input was truncated. Allocate a buffer that fits. 
+ allocated = reinterpret_cast(OPENSSL_malloc(len + 1)); + if (allocated == nullptr) { + return -1; + } + len = i2t_ASN1_OBJECT(allocated, len + 1, a); + str = allocated; + } + if (len <= 0) { + str = ""; + } + + int ret = write_str(bp, str); + OPENSSL_free(allocated); + return ret; +} + +ASN1_OBJECT *d2i_ASN1_OBJECT(ASN1_OBJECT **out, const unsigned char **inp, + long len) { + return D2IFromCBS(out, inp, len, [](CBS *cbs) -> ASN1_OBJECT * { + CBS child; + if (!CBS_get_asn1(cbs, &child, CBS_ASN1_OBJECT)) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_DECODE_ERROR); + return nullptr; + } + const uint8_t *contents = CBS_data(&child); + return c2i_ASN1_OBJECT(nullptr, &contents, CBS_len(&child)); + }); +} + +ASN1_OBJECT *c2i_ASN1_OBJECT(ASN1_OBJECT **out, const unsigned char **inp, + long len) { + return D2IFromCBS(out, inp, len, [](CBS *cbs) -> ASN1_OBJECT * { + if (!CBS_is_valid_asn1_oid(cbs)) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_OBJECT_ENCODING); + return nullptr; + } + ASN1_OBJECT *ret = + ASN1_OBJECT_create(NID_undef, CBS_data(cbs), CBS_len(cbs), + /*sn=*/nullptr, /*ln=*/nullptr); + if (ret != nullptr) { + // |c2i_ASN1_OBJECT| consumes its whole input on success. + BSSL_CHECK(CBS_skip(cbs, CBS_len(cbs))); + } + return ret; + }); +} + +ASN1_OBJECT *bssl::asn1_parse_object(CBS *cbs, CBS_ASN1_TAG tag) { + tag = tag == 0 ? 
CBS_ASN1_OBJECT : tag; + CBS child; + if (!CBS_get_asn1(cbs, &child, tag)) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_DECODE_ERROR); + return nullptr; + } + if (!CBS_is_valid_asn1_oid(&child)) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_OBJECT_ENCODING); + return nullptr; + } + return ASN1_OBJECT_create(NID_undef, CBS_data(&child), CBS_len(&child), + /*sn=*/nullptr, /*ln=*/nullptr); +} + +ASN1_OBJECT *bssl::ASN1_OBJECT_new() { + ASN1_OBJECT *ret = New(); + if (ret == nullptr) { + return nullptr; + } + ret->length = 0; + ret->data = nullptr; + ret->nid = 0; + ret->sn = nullptr; + ret->ln = nullptr; + ret->flags = ASN1_OBJECT_FLAG_DYNAMIC; + return ret; +} + +void ASN1_OBJECT_free(ASN1_OBJECT *a) { + if (a == nullptr) { + return; + } + if (a->flags & ASN1_OBJECT_FLAG_DYNAMIC_STRINGS) { + OPENSSL_free((void *)a->sn); + OPENSSL_free((void *)a->ln); + a->sn = a->ln = nullptr; + } + if (a->flags & ASN1_OBJECT_FLAG_DYNAMIC_DATA) { + OPENSSL_free((void *)a->data); + a->data = nullptr; + a->length = 0; + } + if (a->flags & ASN1_OBJECT_FLAG_DYNAMIC) { + Delete(a); + } +} + +ASN1_OBJECT *ASN1_OBJECT_create(int nid, const unsigned char *data, size_t len, + const char *sn, const char *ln) { + if (len > INT_MAX) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_STRING_TOO_LONG); + return nullptr; + } + + ASN1_OBJECT o; + o.sn = sn; + o.ln = ln; + o.data = data; + o.nid = nid; + o.length = (int)len; + o.flags = ASN1_OBJECT_FLAG_DYNAMIC | ASN1_OBJECT_FLAG_DYNAMIC_STRINGS | + ASN1_OBJECT_FLAG_DYNAMIC_DATA; + return OBJ_dup(&o); +} diff --git a/third_party/boringssl/src/crypto/asn1/a_octet.c b/third_party/boringssl/src/crypto/asn1/a_octet.c deleted file mode 100644 index f9c33791..00000000 --- a/third_party/boringssl/src/crypto/asn1/a_octet.c +++ /dev/null @@ -1,74 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). 
- * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. 
If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include -#include - -ASN1_OCTET_STRING *ASN1_OCTET_STRING_dup(const ASN1_OCTET_STRING *x) { - return ASN1_STRING_dup(x); -} - -int ASN1_OCTET_STRING_cmp(const ASN1_OCTET_STRING *a, - const ASN1_OCTET_STRING *b) { - return ASN1_STRING_cmp(a, b); -} - -int ASN1_OCTET_STRING_set(ASN1_OCTET_STRING *x, const unsigned char *d, - int len) { - return ASN1_STRING_set(x, d, len); -} diff --git a/third_party/boringssl/src/crypto/asn1/a_octet.cc b/third_party/boringssl/src/crypto/asn1/a_octet.cc new file mode 100644 index 00000000..edcfb1eb --- /dev/null +++ b/third_party/boringssl/src/crypto/asn1/a_octet.cc @@ -0,0 +1,32 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + +ASN1_OCTET_STRING *ASN1_OCTET_STRING_dup(const ASN1_OCTET_STRING *x) { + return ASN1_STRING_dup(x); +} + +int ASN1_OCTET_STRING_cmp(const ASN1_OCTET_STRING *a, + const ASN1_OCTET_STRING *b) { + return ASN1_STRING_cmp(a, b); +} + +int ASN1_OCTET_STRING_set(ASN1_OCTET_STRING *x, const unsigned char *d, + int len) { + return ASN1_STRING_set(x, d, len); +} diff --git a/third_party/boringssl/src/crypto/asn1/a_print.c b/third_party/boringssl/src/crypto/asn1/a_print.c deleted file mode 100644 index a5be1646..00000000 --- a/third_party/boringssl/src/crypto/asn1/a_print.c +++ /dev/null @@ -1,82 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. 
- * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include - -#include "internal.h" - - -int ASN1_PRINTABLE_type(const unsigned char *s, int len) { - if (len < 0) { - len = strlen((const char *)s); - } - - int printable = 1; - for (int i = 0; i < len; i++) { - unsigned char c = s[i]; - if (c & 0x80) { - // No need to continue iterating. - return V_ASN1_T61STRING; - } - if (!asn1_is_printable(c)) { - printable = 0; - } - } - - return printable ? V_ASN1_PRINTABLESTRING : V_ASN1_IA5STRING; -} diff --git a/third_party/boringssl/src/crypto/asn1/a_strex.c b/third_party/boringssl/src/crypto/asn1/a_strex.c deleted file mode 100644 index fd08b02f..00000000 --- a/third_party/boringssl/src/crypto/asn1/a_strex.c +++ /dev/null @@ -1,463 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. 
The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. 
If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include -#include -#include -#include -#include - -#include -#include -#include - -#include "../bytestring/internal.h" -#include "internal.h" - - -#define ESC_FLAGS \ - (ASN1_STRFLGS_ESC_2253 | ASN1_STRFLGS_ESC_QUOTE | ASN1_STRFLGS_ESC_CTRL | \ - ASN1_STRFLGS_ESC_MSB) - -static int maybe_write(BIO *out, const void *buf, int len) { - // If |out| is NULL, ignore the output but report the length. 
- return out == NULL || BIO_write(out, buf, len) == len; -} - -static int is_control_character(unsigned char c) { return c < 32 || c == 127; } - -static int do_esc_char(uint32_t c, unsigned long flags, char *do_quotes, - BIO *out, int is_first, int is_last) { - // |c| is a |uint32_t| because, depending on |ASN1_STRFLGS_UTF8_CONVERT|, - // we may be escaping bytes or Unicode codepoints. - char buf[16]; // Large enough for "\\W01234567". - unsigned char u8 = (unsigned char)c; - if (c > 0xffff) { - BIO_snprintf(buf, sizeof(buf), "\\W%08" PRIX32, c); - } else if (c > 0xff) { - BIO_snprintf(buf, sizeof(buf), "\\U%04" PRIX32, c); - } else if ((flags & ASN1_STRFLGS_ESC_MSB) && c > 0x7f) { - BIO_snprintf(buf, sizeof(buf), "\\%02X", c); - } else if ((flags & ASN1_STRFLGS_ESC_CTRL) && is_control_character(c)) { - BIO_snprintf(buf, sizeof(buf), "\\%02X", c); - } else if (flags & ASN1_STRFLGS_ESC_2253) { - // See RFC 2253, sections 2.4 and 4. - if (c == '\\' || c == '"') { - // Quotes and backslashes are always escaped, quoted or not. - BIO_snprintf(buf, sizeof(buf), "\\%c", (int)c); - } else if (c == ',' || c == '+' || c == '<' || c == '>' || c == ';' || - (is_first && (c == ' ' || c == '#')) || - (is_last && (c == ' '))) { - if (flags & ASN1_STRFLGS_ESC_QUOTE) { - // No need to escape, just tell the caller to quote. - if (do_quotes != NULL) { - *do_quotes = 1; - } - return maybe_write(out, &u8, 1) ? 1 : -1; - } - BIO_snprintf(buf, sizeof(buf), "\\%c", (int)c); - } else { - return maybe_write(out, &u8, 1) ? 1 : -1; - } - } else if ((flags & ESC_FLAGS) && c == '\\') { - // If any escape flags are set, also escape backslashes. - BIO_snprintf(buf, sizeof(buf), "\\%c", (int)c); - } else { - return maybe_write(out, &u8, 1) ? 1 : -1; - } - - int len = strlen(buf); - return maybe_write(out, buf, len) ? len : -1; -} - -// This function sends each character in a buffer to do_esc_char(). It -// interprets the content formats and converts to or from UTF8 as -// appropriate. 
- -static int do_buf(const unsigned char *buf, int buflen, int encoding, - unsigned long flags, char *quotes, BIO *out) { - int (*get_char)(CBS *cbs, uint32_t *out); - int get_char_error; - switch (encoding) { - case MBSTRING_UNIV: - get_char = cbs_get_utf32_be; - get_char_error = ASN1_R_INVALID_UNIVERSALSTRING; - break; - case MBSTRING_BMP: - get_char = cbs_get_ucs2_be; - get_char_error = ASN1_R_INVALID_BMPSTRING; - break; - case MBSTRING_ASC: - get_char = cbs_get_latin1; - get_char_error = ERR_R_INTERNAL_ERROR; // Should not be possible. - break; - case MBSTRING_UTF8: - get_char = cbs_get_utf8; - get_char_error = ASN1_R_INVALID_UTF8STRING; - break; - default: - assert(0); - return -1; - } - - CBS cbs; - CBS_init(&cbs, buf, buflen); - int outlen = 0; - while (CBS_len(&cbs) != 0) { - const int is_first = CBS_data(&cbs) == buf; - uint32_t c; - if (!get_char(&cbs, &c)) { - OPENSSL_PUT_ERROR(ASN1, get_char_error); - return -1; - } - const int is_last = CBS_len(&cbs) == 0; - if (flags & ASN1_STRFLGS_UTF8_CONVERT) { - unsigned char utfbuf[6]; - int utflen; - utflen = UTF8_putc(utfbuf, sizeof(utfbuf), c); - for (int i = 0; i < utflen; i++) { - int len = do_esc_char(utfbuf[i], flags, quotes, out, is_first && i == 0, - is_last && i == utflen - 1); - if (len < 0) { - return -1; - } - outlen += len; - } - } else { - int len = do_esc_char(c, flags, quotes, out, is_first, is_last); - if (len < 0) { - return -1; - } - outlen += len; - } - } - return outlen; -} - -// This function hex dumps a buffer of characters - -static int do_hex_dump(BIO *out, unsigned char *buf, int buflen) { - static const char hexdig[] = "0123456789ABCDEF"; - unsigned char *p, *q; - char hextmp[2]; - if (out) { - p = buf; - q = buf + buflen; - while (p != q) { - hextmp[0] = hexdig[*p >> 4]; - hextmp[1] = hexdig[*p & 0xf]; - if (!maybe_write(out, hextmp, 2)) { - return -1; - } - p++; - } - } - return buflen << 1; -} - -// "dump" a string. This is done when the type is unknown, or the flags -// request it. 
We can either dump the content octets or the entire DER -// encoding. This uses the RFC 2253 #01234 format. - -static int do_dump(unsigned long flags, BIO *out, const ASN1_STRING *str) { - if (!maybe_write(out, "#", 1)) { - return -1; - } - - // If we don't dump DER encoding just dump content octets - if (!(flags & ASN1_STRFLGS_DUMP_DER)) { - int outlen = do_hex_dump(out, str->data, str->length); - if (outlen < 0) { - return -1; - } - return outlen + 1; - } - - // Placing the ASN1_STRING in a temporary ASN1_TYPE allows the DER encoding - // to readily obtained. - ASN1_TYPE t; - t.type = str->type; - // Negative INTEGER and ENUMERATED values are the only case where - // |ASN1_STRING| and |ASN1_TYPE| types do not match. - // - // TODO(davidben): There are also some type fields which, in |ASN1_TYPE|, do - // not correspond to |ASN1_STRING|. It is unclear whether those are allowed - // in |ASN1_STRING| at all, or what the space of allowed types is. - // |ASN1_item_ex_d2i| will never produce such a value so, for now, we say - // this is an invalid input. But this corner of the library in general - // should be more robust. - if (t.type == V_ASN1_NEG_INTEGER) { - t.type = V_ASN1_INTEGER; - } else if (t.type == V_ASN1_NEG_ENUMERATED) { - t.type = V_ASN1_ENUMERATED; - } - t.value.asn1_string = (ASN1_STRING *)str; - unsigned char *der_buf = NULL; - int der_len = i2d_ASN1_TYPE(&t, &der_buf); - if (der_len < 0) { - return -1; - } - int outlen = do_hex_dump(out, der_buf, der_len); - OPENSSL_free(der_buf); - if (outlen < 0) { - return -1; - } - return outlen + 1; -} - -// string_type_to_encoding returns the |MBSTRING_*| constant for the encoding -// used by the |ASN1_STRING| type |type|, or -1 if |tag| is not a string -// type. 
-static int string_type_to_encoding(int type) { - // This function is sometimes passed ASN.1 universal types and sometimes - // passed |ASN1_STRING| type values - switch (type) { - case V_ASN1_UTF8STRING: - return MBSTRING_UTF8; - case V_ASN1_NUMERICSTRING: - case V_ASN1_PRINTABLESTRING: - case V_ASN1_T61STRING: - case V_ASN1_IA5STRING: - case V_ASN1_UTCTIME: - case V_ASN1_GENERALIZEDTIME: - case V_ASN1_ISO64STRING: - // |MBSTRING_ASC| refers to Latin-1, not ASCII. - return MBSTRING_ASC; - case V_ASN1_UNIVERSALSTRING: - return MBSTRING_UNIV; - case V_ASN1_BMPSTRING: - return MBSTRING_BMP; - } - return -1; -} - -// This is the main function, print out an ASN1_STRING taking note of various -// escape and display options. Returns number of characters written or -1 if -// an error occurred. - -int ASN1_STRING_print_ex(BIO *out, const ASN1_STRING *str, - unsigned long flags) { - int type = str->type; - int outlen = 0; - if (flags & ASN1_STRFLGS_SHOW_TYPE) { - const char *tagname = ASN1_tag2str(type); - outlen += strlen(tagname); - if (!maybe_write(out, tagname, outlen) || !maybe_write(out, ":", 1)) { - return -1; - } - outlen++; - } - - // Decide what to do with |str|, either dump the contents or display it. - int encoding; - if (flags & ASN1_STRFLGS_DUMP_ALL) { - // Dump everything. - encoding = -1; - } else if (flags & ASN1_STRFLGS_IGNORE_TYPE) { - // Ignore the string type and interpret the contents as Latin-1. - encoding = MBSTRING_ASC; - } else { - encoding = string_type_to_encoding(type); - if (encoding == -1 && (flags & ASN1_STRFLGS_DUMP_UNKNOWN) == 0) { - encoding = MBSTRING_ASC; - } - } - - if (encoding == -1) { - int len = do_dump(flags, out, str); - if (len < 0) { - return -1; - } - outlen += len; - return outlen; - } - - // Measure the length. 
- char quotes = 0; - int len = do_buf(str->data, str->length, encoding, flags, "es, NULL); - if (len < 0) { - return -1; - } - outlen += len; - if (quotes) { - outlen += 2; - } - if (!out) { - return outlen; - } - - // Encode the value. - if ((quotes && !maybe_write(out, "\"", 1)) || - do_buf(str->data, str->length, encoding, flags, NULL, out) < 0 || - (quotes && !maybe_write(out, "\"", 1))) { - return -1; - } - return outlen; -} - -int ASN1_STRING_print_ex_fp(FILE *fp, const ASN1_STRING *str, - unsigned long flags) { - BIO *bio = NULL; - if (fp != NULL) { - // If |fp| is NULL, this function returns the number of bytes without - // writing. - bio = BIO_new_fp(fp, BIO_NOCLOSE); - if (bio == NULL) { - return -1; - } - } - int ret = ASN1_STRING_print_ex(bio, str, flags); - BIO_free(bio); - return ret; -} - -int ASN1_STRING_to_UTF8(unsigned char **out, const ASN1_STRING *in) { - if (!in) { - return -1; - } - int mbflag = string_type_to_encoding(in->type); - if (mbflag == -1) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_UNKNOWN_TAG); - return -1; - } - ASN1_STRING stmp, *str = &stmp; - stmp.data = NULL; - stmp.length = 0; - stmp.flags = 0; - int ret = - ASN1_mbstring_copy(&str, in->data, in->length, mbflag, B_ASN1_UTF8STRING); - if (ret < 0) { - return ret; - } - *out = stmp.data; - return stmp.length; -} - -int ASN1_STRING_print(BIO *bp, const ASN1_STRING *v) { - int i, n; - char buf[80]; - const char *p; - - if (v == NULL) { - return 0; - } - n = 0; - p = (const char *)v->data; - for (i = 0; i < v->length; i++) { - if ((p[i] > '~') || ((p[i] < ' ') && (p[i] != '\n') && (p[i] != '\r'))) { - buf[n] = '.'; - } else { - buf[n] = p[i]; - } - n++; - if (n >= 80) { - if (BIO_write(bp, buf, n) <= 0) { - return 0; - } - n = 0; - } - } - if (n > 0) { - if (BIO_write(bp, buf, n) <= 0) { - return 0; - } - } - return 1; -} - -int ASN1_TIME_print(BIO *bp, const ASN1_TIME *tm) { - if (tm->type == V_ASN1_UTCTIME) { - return ASN1_UTCTIME_print(bp, tm); - } - if (tm->type == 
V_ASN1_GENERALIZEDTIME) { - return ASN1_GENERALIZEDTIME_print(bp, tm); - } - BIO_puts(bp, "Bad time value"); - return 0; -} - -static const char *const mon[12] = {"Jan", "Feb", "Mar", "Apr", "May", "Jun", - "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"}; - -int ASN1_GENERALIZEDTIME_print(BIO *bp, const ASN1_GENERALIZEDTIME *tm) { - CBS cbs; - CBS_init(&cbs, tm->data, tm->length); - struct tm utc; - if (!CBS_parse_generalized_time(&cbs, &utc, /*allow_timezone_offset=*/0)) { - BIO_puts(bp, "Bad time value"); - return 0; - } - - return BIO_printf(bp, "%s %2d %02d:%02d:%02d %d GMT", mon[utc.tm_mon], - utc.tm_mday, utc.tm_hour, utc.tm_min, utc.tm_sec, - utc.tm_year + 1900) > 0; -} - -int ASN1_UTCTIME_print(BIO *bp, const ASN1_UTCTIME *tm) { - CBS cbs; - CBS_init(&cbs, tm->data, tm->length); - struct tm utc; - if (!CBS_parse_utc_time(&cbs, &utc, /*allow_timezone_offset=*/0)) { - BIO_puts(bp, "Bad time value"); - return 0; - } - - return BIO_printf(bp, "%s %2d %02d:%02d:%02d %d GMT", mon[utc.tm_mon], - utc.tm_mday, utc.tm_hour, utc.tm_min, utc.tm_sec, - utc.tm_year + 1900) > 0; -} diff --git a/third_party/boringssl/src/crypto/asn1/a_strex.cc b/third_party/boringssl/src/crypto/asn1/a_strex.cc new file mode 100644 index 00000000..663865ce --- /dev/null +++ b/third_party/boringssl/src/crypto/asn1/a_strex.cc @@ -0,0 +1,411 @@ +// Copyright 2000-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "../bytestring/internal.h" +#include "../internal.h" +#include "internal.h" + + +using namespace bssl; + +#define ESC_FLAGS \ + (ASN1_STRFLGS_ESC_2253 | ASN1_STRFLGS_ESC_QUOTE | ASN1_STRFLGS_ESC_CTRL | \ + ASN1_STRFLGS_ESC_MSB) + +static int maybe_write(BIO *out, const void *buf, int len) { + // If |out| is NULL, ignore the output but report the length. + return out == nullptr || BIO_write(out, buf, len) == len; +} + +static int is_control_character(unsigned char c) { return c < 32 || c == 127; } + +static int do_esc_char(uint32_t c, unsigned long flags, char *do_quotes, + BIO *out, int is_first, int is_last) { + // |c| is a |uint32_t| because, depending on |ASN1_STRFLGS_UTF8_CONVERT|, + // we may be escaping bytes or Unicode codepoints. + char buf[16]; // Large enough for "\\W01234567". + unsigned char u8 = (unsigned char)c; + if (c > 0xffff) { + snprintf(buf, sizeof(buf), "\\W%08" PRIX32, c); + } else if (c > 0xff) { + snprintf(buf, sizeof(buf), "\\U%04" PRIX32, c); + } else if ((flags & ASN1_STRFLGS_ESC_MSB) && c > 0x7f) { + snprintf(buf, sizeof(buf), "\\%02X", c); + } else if ((flags & ASN1_STRFLGS_ESC_CTRL) && is_control_character(c)) { + snprintf(buf, sizeof(buf), "\\%02X", c); + } else if (flags & ASN1_STRFLGS_ESC_2253) { + // See RFC 2253, sections 2.4 and 4. + if (c == '\\' || c == '"') { + // Quotes and backslashes are always escaped, quoted or not. + snprintf(buf, sizeof(buf), "\\%c", (int)c); + } else if (c == ',' || c == '+' || c == '<' || c == '>' || c == ';' || + (is_first && (c == ' ' || c == '#')) || + (is_last && (c == ' '))) { + if (flags & ASN1_STRFLGS_ESC_QUOTE) { + // No need to escape, just tell the caller to quote. + if (do_quotes != nullptr) { + *do_quotes = 1; + } + return maybe_write(out, &u8, 1) ? 1 : -1; + } + snprintf(buf, sizeof(buf), "\\%c", (int)c); + } else { + return maybe_write(out, &u8, 1) ? 
1 : -1; + } + } else if ((flags & ESC_FLAGS) && c == '\\') { + // If any escape flags are set, also escape backslashes. + snprintf(buf, sizeof(buf), "\\%c", (int)c); + } else { + return maybe_write(out, &u8, 1) ? 1 : -1; + } + + static_assert(sizeof(buf) < INT_MAX, "len may not fit in int"); + int len = (int)strlen(buf); + return maybe_write(out, buf, len) ? len : -1; +} + +// This function sends each character in a buffer to do_esc_char(). It +// interprets the content formats and converts to or from UTF8 as +// appropriate. + +static int do_buf(const unsigned char *buf, int buflen, int encoding, + unsigned long flags, char *quotes, BIO *out) { + int (*get_char)(CBS *cbs, uint32_t *out); + int get_char_error; + switch (encoding) { + case MBSTRING_UNIV: + get_char = CBS_get_utf32_be; + get_char_error = ASN1_R_INVALID_UNIVERSALSTRING; + break; + case MBSTRING_BMP: + get_char = CBS_get_ucs2_be; + get_char_error = ASN1_R_INVALID_BMPSTRING; + break; + case MBSTRING_ASC: + get_char = CBS_get_latin1; + get_char_error = ERR_R_INTERNAL_ERROR; // Should not be possible. 
+ break; + case MBSTRING_UTF8: + get_char = CBS_get_utf8; + get_char_error = ASN1_R_INVALID_UTF8STRING; + break; + default: + assert(0); + return -1; + } + + CBS cbs; + CBS_init(&cbs, buf, buflen); + int outlen = 0; + while (CBS_len(&cbs) != 0) { + const int is_first = CBS_data(&cbs) == buf; + uint32_t c; + if (!get_char(&cbs, &c)) { + OPENSSL_PUT_ERROR(ASN1, get_char_error); + return -1; + } + const int is_last = CBS_len(&cbs) == 0; + if (flags & ASN1_STRFLGS_UTF8_CONVERT) { + uint8_t utf8_buf[6]; + CBB utf8_cbb; + CBB_init_fixed(&utf8_cbb, utf8_buf, sizeof(utf8_buf)); + if (!CBB_add_utf8(&utf8_cbb, c)) { + OPENSSL_PUT_ERROR(ASN1, ERR_R_INTERNAL_ERROR); + return -1; + } + size_t utf8_len = CBB_len(&utf8_cbb); + for (size_t i = 0; i < utf8_len; i++) { + int len = do_esc_char(utf8_buf[i], flags, quotes, out, + is_first && i == 0, is_last && i == utf8_len - 1); + if (len < 0) { + return -1; + } + outlen += len; + } + } else { + int len = do_esc_char(c, flags, quotes, out, is_first, is_last); + if (len < 0) { + return -1; + } + outlen += len; + } + } + return outlen; +} + +static int do_hex_dump(BIO *out, Span in) { + if (in.size() > INT_MAX / 2) { + return -1; + } + if (out) { + static const char kHexDigit[] = "0123456789ABCDEF"; + for (uint8_t b : in) { + char hextmp[2]; + hextmp[0] = kHexDigit[b >> 4]; + hextmp[1] = kHexDigit[b & 0xf]; + if (!maybe_write(out, hextmp, 2)) { + return -1; + } + } + } + return static_cast(in.size() * 2); +} + +// "dump" a string. This is done when the type is unknown, or the flags +// request it. We can either dump the content octets or the entire DER +// encoding. This uses the RFC 2253 #01234 format. 
+ +static int do_dump(unsigned long flags, BIO *out, const ASN1_STRING *str) { + if (!maybe_write(out, "#", 1)) { + return -1; + } + + // If we don't dump DER encoding just dump content octets + if (!(flags & ASN1_STRFLGS_DUMP_DER)) { + int outlen = do_hex_dump(out, Span(str->data, str->length)); + if (outlen < 0) { + return -1; + } + return outlen + 1; + } + + ScopedCBB cbb; + // Roughly estimate the encoded size with |str->length| to reduce unnecessary + // reallocations. (Tag, length, miscellaneous type-dependent overhead.) + if (!CBB_init(cbb.get(), 4 + str->length) || + !asn1_marshal_any_string(cbb.get(), str)) { + return -1; + } + int outlen = do_hex_dump(out, CBBAsSpan(cbb.get())); + if (outlen < 0) { + return -1; + } + return outlen + 1; +} + +// string_type_to_encoding returns the |MBSTRING_*| constant for the encoding +// used by the |ASN1_STRING| type |type|, or -1 if |tag| is not a string +// type. +static int string_type_to_encoding(int type) { + // This function is sometimes passed ASN.1 universal types and sometimes + // passed |ASN1_STRING| type values + switch (type) { + case V_ASN1_UTF8STRING: + return MBSTRING_UTF8; + case V_ASN1_NUMERICSTRING: + case V_ASN1_PRINTABLESTRING: + case V_ASN1_T61STRING: + case V_ASN1_IA5STRING: + case V_ASN1_UTCTIME: + case V_ASN1_GENERALIZEDTIME: + case V_ASN1_ISO64STRING: + // |MBSTRING_ASC| refers to Latin-1, not ASCII. + return MBSTRING_ASC; + case V_ASN1_UNIVERSALSTRING: + return MBSTRING_UNIV; + case V_ASN1_BMPSTRING: + return MBSTRING_BMP; + } + return -1; +} + +// This is the main function, print out an ASN1_STRING taking note of various +// escape and display options. Returns number of characters written or -1 if +// an error occurred. 
+ +int ASN1_STRING_print_ex(BIO *out, const ASN1_STRING *str, + unsigned long flags) { + int type = str->type; + int outlen = 0; + if (flags & ASN1_STRFLGS_SHOW_TYPE) { + const char *tagname = ASN1_tag2str(type); + outlen += strlen(tagname); + if (!maybe_write(out, tagname, outlen) || !maybe_write(out, ":", 1)) { + return -1; + } + outlen++; + } + + // Decide what to do with |str|, either dump the contents or display it. + int encoding; + if (flags & ASN1_STRFLGS_DUMP_ALL) { + // Dump everything. + encoding = -1; + } else if (flags & ASN1_STRFLGS_IGNORE_TYPE) { + // Ignore the string type and interpret the contents as Latin-1. + encoding = MBSTRING_ASC; + } else { + encoding = string_type_to_encoding(type); + if (encoding == -1 && (flags & ASN1_STRFLGS_DUMP_UNKNOWN) == 0) { + encoding = MBSTRING_ASC; + } + } + + if (encoding == -1) { + int len = do_dump(flags, out, str); + if (len < 0) { + return -1; + } + outlen += len; + return outlen; + } + + // Measure the length. + char quotes = 0; + int len = do_buf(str->data, str->length, encoding, flags, "es, nullptr); + if (len < 0) { + return -1; + } + outlen += len; + if (quotes) { + outlen += 2; + } + if (!out) { + return outlen; + } + + // Encode the value. + if ((quotes && !maybe_write(out, "\"", 1)) || + do_buf(str->data, str->length, encoding, flags, nullptr, out) < 0 || + (quotes && !maybe_write(out, "\"", 1))) { + return -1; + } + return outlen; +} + +int ASN1_STRING_print_ex_fp(FILE *fp, const ASN1_STRING *str, + unsigned long flags) { + BIO *bio = nullptr; + if (fp != nullptr) { + // If |fp| is NULL, this function returns the number of bytes without + // writing. 
+ bio = BIO_new_fp(fp, BIO_NOCLOSE); + if (bio == nullptr) { + return -1; + } + } + int ret = ASN1_STRING_print_ex(bio, str, flags); + BIO_free(bio); + return ret; +} + +int ASN1_STRING_to_UTF8(unsigned char **out, const ASN1_STRING *in) { + if (!in) { + return -1; + } + int mbflag = string_type_to_encoding(in->type); + if (mbflag == -1) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_UNKNOWN_TAG); + return -1; + } + ASN1_STRING stmp, *str = &stmp; + stmp.data = nullptr; + stmp.length = 0; + stmp.flags = 0; + int ret = + ASN1_mbstring_copy(&str, in->data, in->length, mbflag, B_ASN1_UTF8STRING); + if (ret < 0) { + return ret; + } + *out = stmp.data; + return stmp.length; +} + +int ASN1_STRING_print(BIO *bp, const ASN1_STRING *v) { + int i, n; + char buf[80]; + const char *p; + + if (v == nullptr) { + return 0; + } + n = 0; + p = (const char *)v->data; + for (i = 0; i < v->length; i++) { + if ((p[i] > '~') || ((p[i] < ' ') && (p[i] != '\n') && (p[i] != '\r'))) { + buf[n] = '.'; + } else { + buf[n] = p[i]; + } + n++; + if (n >= 80) { + if (BIO_write(bp, buf, n) <= 0) { + return 0; + } + n = 0; + } + } + if (n > 0) { + if (BIO_write(bp, buf, n) <= 0) { + return 0; + } + } + return 1; +} + +int ASN1_TIME_print(BIO *bp, const ASN1_TIME *tm) { + if (tm->type == V_ASN1_UTCTIME) { + return ASN1_UTCTIME_print(bp, tm); + } + if (tm->type == V_ASN1_GENERALIZEDTIME) { + return ASN1_GENERALIZEDTIME_print(bp, tm); + } + BIO_puts(bp, "Bad time value"); + return 0; +} + +static const char *const mon[12] = {"Jan", "Feb", "Mar", "Apr", "May", "Jun", + "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"}; + +int ASN1_GENERALIZEDTIME_print(BIO *bp, const ASN1_GENERALIZEDTIME *tm) { + CBS cbs; + CBS_init(&cbs, tm->data, tm->length); + struct tm utc; + if (!CBS_parse_generalized_time(&cbs, &utc, /*allow_timezone_offset=*/0)) { + BIO_puts(bp, "Bad time value"); + return 0; + } + + return BIO_printf(bp, "%s %2d %02d:%02d:%02d %d GMT", mon[utc.tm_mon], + utc.tm_mday, utc.tm_hour, utc.tm_min, utc.tm_sec, + 
utc.tm_year + 1900) > 0; +} + +int ASN1_UTCTIME_print(BIO *bp, const ASN1_UTCTIME *tm) { + CBS cbs; + CBS_init(&cbs, tm->data, tm->length); + struct tm utc; + if (!CBS_parse_utc_time(&cbs, &utc, /*allow_timezone_offset=*/0)) { + BIO_puts(bp, "Bad time value"); + return 0; + } + + return BIO_printf(bp, "%s %2d %02d:%02d:%02d %d GMT", mon[utc.tm_mon], + utc.tm_mday, utc.tm_hour, utc.tm_min, utc.tm_sec, + utc.tm_year + 1900) > 0; +} diff --git a/third_party/boringssl/src/crypto/asn1/a_strnid.c b/third_party/boringssl/src/crypto/asn1/a_strnid.c deleted file mode 100644 index 3be266e8..00000000 --- a/third_party/boringssl/src/crypto/asn1/a_strnid.c +++ /dev/null @@ -1,245 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. 
Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
*/ - -#include - -#include -#include -#include - -#include -#include -#include - -#include "../internal.h" -#include "../lhash/internal.h" -#include "internal.h" - - -DEFINE_LHASH_OF(ASN1_STRING_TABLE) - -static LHASH_OF(ASN1_STRING_TABLE) *string_tables = NULL; -static struct CRYPTO_STATIC_MUTEX string_tables_lock = CRYPTO_STATIC_MUTEX_INIT; - -void ASN1_STRING_set_default_mask(unsigned long mask) {} - -unsigned long ASN1_STRING_get_default_mask(void) { return B_ASN1_UTF8STRING; } - -int ASN1_STRING_set_default_mask_asc(const char *p) { return 1; } - -static const ASN1_STRING_TABLE *asn1_string_table_get(int nid); - -// The following function generates an ASN1_STRING based on limits in a -// table. Frequently the types and length of an ASN1_STRING are restricted by -// a corresponding OID. For example certificates and certificate requests. - -ASN1_STRING *ASN1_STRING_set_by_NID(ASN1_STRING **out, const unsigned char *in, - int len, int inform, int nid) { - ASN1_STRING *str = NULL; - int ret; - if (!out) { - out = &str; - } - const ASN1_STRING_TABLE *tbl = asn1_string_table_get(nid); - if (tbl != NULL) { - unsigned long mask = tbl->mask; - if (!(tbl->flags & STABLE_NO_MASK)) { - mask &= B_ASN1_UTF8STRING; - } - ret = ASN1_mbstring_ncopy(out, in, len, inform, mask, tbl->minsize, - tbl->maxsize); - } else { - ret = ASN1_mbstring_copy(out, in, len, inform, B_ASN1_UTF8STRING); - } - if (ret <= 0) { - return NULL; - } - return *out; -} - -// Now the tables and helper functions for the string table: - -// See RFC 5280. 
-#define ub_name 32768 -#define ub_common_name 64 -#define ub_locality_name 128 -#define ub_state_name 128 -#define ub_organization_name 64 -#define ub_organization_unit_name 64 -#define ub_email_address 128 -#define ub_serial_number 64 - -// This table must be kept in NID order - -static const ASN1_STRING_TABLE tbl_standard[] = { - {NID_commonName, 1, ub_common_name, DIRSTRING_TYPE, 0}, - {NID_countryName, 2, 2, B_ASN1_PRINTABLESTRING, STABLE_NO_MASK}, - {NID_localityName, 1, ub_locality_name, DIRSTRING_TYPE, 0}, - {NID_stateOrProvinceName, 1, ub_state_name, DIRSTRING_TYPE, 0}, - {NID_organizationName, 1, ub_organization_name, DIRSTRING_TYPE, 0}, - {NID_organizationalUnitName, 1, ub_organization_unit_name, DIRSTRING_TYPE, - 0}, - {NID_pkcs9_emailAddress, 1, ub_email_address, B_ASN1_IA5STRING, - STABLE_NO_MASK}, - {NID_pkcs9_unstructuredName, 1, -1, PKCS9STRING_TYPE, 0}, - {NID_pkcs9_challengePassword, 1, -1, PKCS9STRING_TYPE, 0}, - {NID_pkcs9_unstructuredAddress, 1, -1, DIRSTRING_TYPE, 0}, - {NID_givenName, 1, ub_name, DIRSTRING_TYPE, 0}, - {NID_surname, 1, ub_name, DIRSTRING_TYPE, 0}, - {NID_initials, 1, ub_name, DIRSTRING_TYPE, 0}, - {NID_serialNumber, 1, ub_serial_number, B_ASN1_PRINTABLESTRING, - STABLE_NO_MASK}, - {NID_friendlyName, -1, -1, B_ASN1_BMPSTRING, STABLE_NO_MASK}, - {NID_name, 1, ub_name, DIRSTRING_TYPE, 0}, - {NID_dnQualifier, -1, -1, B_ASN1_PRINTABLESTRING, STABLE_NO_MASK}, - {NID_domainComponent, 1, -1, B_ASN1_IA5STRING, STABLE_NO_MASK}, - {NID_ms_csp_name, -1, -1, B_ASN1_BMPSTRING, STABLE_NO_MASK}}; - -static int table_cmp(const ASN1_STRING_TABLE *a, const ASN1_STRING_TABLE *b) { - if (a->nid < b->nid) { - return -1; - } - if (a->nid > b->nid) { - return 1; - } - return 0; -} - -static int table_cmp_void(const void *a, const void *b) { - return table_cmp(a, b); -} - -static uint32_t table_hash(const ASN1_STRING_TABLE *tbl) { - return OPENSSL_hash32(&tbl->nid, sizeof(tbl->nid)); -} - -static const ASN1_STRING_TABLE *asn1_string_table_get(int 
nid) { - ASN1_STRING_TABLE key; - key.nid = nid; - const ASN1_STRING_TABLE *tbl = - bsearch(&key, tbl_standard, OPENSSL_ARRAY_SIZE(tbl_standard), - sizeof(ASN1_STRING_TABLE), table_cmp_void); - if (tbl != NULL) { - return tbl; - } - - CRYPTO_STATIC_MUTEX_lock_read(&string_tables_lock); - if (string_tables != NULL) { - tbl = lh_ASN1_STRING_TABLE_retrieve(string_tables, &key); - } - CRYPTO_STATIC_MUTEX_unlock_read(&string_tables_lock); - // Note returning |tbl| without the lock is only safe because - // |ASN1_STRING_TABLE_add| cannot modify or delete existing entries. If we - // wish to support that, this function must copy the result under a lock. - return tbl; -} - -int ASN1_STRING_TABLE_add(int nid, long minsize, long maxsize, - unsigned long mask, unsigned long flags) { - // Existing entries cannot be overwritten. - if (asn1_string_table_get(nid) != NULL) { - OPENSSL_PUT_ERROR(ASN1, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); - return 0; - } - - int ret = 0; - CRYPTO_STATIC_MUTEX_lock_write(&string_tables_lock); - - if (string_tables == NULL) { - string_tables = lh_ASN1_STRING_TABLE_new(table_hash, table_cmp); - if (string_tables == NULL) { - goto err; - } - } else { - // Check again for an existing entry. One may have been added while - // unlocked. 
- ASN1_STRING_TABLE key; - key.nid = nid; - if (lh_ASN1_STRING_TABLE_retrieve(string_tables, &key) != NULL) { - OPENSSL_PUT_ERROR(ASN1, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); - goto err; - } - } - - ASN1_STRING_TABLE *tbl = OPENSSL_malloc(sizeof(ASN1_STRING_TABLE)); - if (tbl == NULL) { - goto err; - } - tbl->nid = nid; - tbl->flags = flags; - tbl->minsize = minsize; - tbl->maxsize = maxsize; - tbl->mask = mask; - ASN1_STRING_TABLE *old_tbl; - if (!lh_ASN1_STRING_TABLE_insert(string_tables, &old_tbl, tbl)) { - OPENSSL_free(tbl); - goto err; - } - assert(old_tbl == NULL); - ret = 1; - -err: - CRYPTO_STATIC_MUTEX_unlock_write(&string_tables_lock); - return ret; -} - -void ASN1_STRING_TABLE_cleanup(void) {} - -void asn1_get_string_table_for_testing(const ASN1_STRING_TABLE **out_ptr, - size_t *out_len) { - *out_ptr = tbl_standard; - *out_len = OPENSSL_ARRAY_SIZE(tbl_standard); -} diff --git a/third_party/boringssl/src/crypto/asn1/a_strnid.cc b/third_party/boringssl/src/crypto/asn1/a_strnid.cc new file mode 100644 index 00000000..7ebf30a7 --- /dev/null +++ b/third_party/boringssl/src/crypto/asn1/a_strnid.cc @@ -0,0 +1,208 @@ +// Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include +#include + +#include + +#include +#include +#include + +#include "../internal.h" +#include "../lhash/internal.h" +#include "../mem_internal.h" +#include "internal.h" + + +using namespace bssl; + +BSSL_NAMESPACE_BEGIN + +DEFINE_LHASH_OF(ASN1_STRING_TABLE) + +BSSL_NAMESPACE_END + +static LHASH_OF(ASN1_STRING_TABLE) *string_tables = nullptr; +static StaticMutex string_tables_lock; + +void ASN1_STRING_set_default_mask(unsigned long mask) {} + +unsigned long ASN1_STRING_get_default_mask() { return B_ASN1_UTF8STRING; } + +int ASN1_STRING_set_default_mask_asc(const char *p) { return 1; } + +static const ASN1_STRING_TABLE *asn1_string_table_get(int nid); + +// The following function generates an ASN1_STRING based on limits in a +// table. Frequently the types and length of an ASN1_STRING are restricted by +// a corresponding OID. For example certificates and certificate requests. + +ASN1_STRING *ASN1_STRING_set_by_NID(ASN1_STRING **out, const unsigned char *in, + ossl_ssize_t len, int inform, int nid) { + ASN1_STRING *str = nullptr; + int ret; + if (!out) { + out = &str; + } + const ASN1_STRING_TABLE *tbl = asn1_string_table_get(nid); + if (tbl != nullptr) { + unsigned long mask = tbl->mask; + if (!(tbl->flags & STABLE_NO_MASK)) { + mask &= B_ASN1_UTF8STRING; + } + ret = ASN1_mbstring_ncopy(out, in, len, inform, mask, tbl->minsize, + tbl->maxsize); + } else { + ret = ASN1_mbstring_copy(out, in, len, inform, B_ASN1_UTF8STRING); + } + if (ret <= 0) { + return nullptr; + } + return *out; +} + +// Now the tables and helper functions for the string table: + +// See RFC 5280. 
+#define ub_name 32768 +#define ub_common_name 64 +#define ub_locality_name 128 +#define ub_state_name 128 +#define ub_organization_name 64 +#define ub_organization_unit_name 64 +#define ub_email_address 128 +#define ub_serial_number 64 + +// This table must be kept in NID order + +static const ASN1_STRING_TABLE tbl_standard[] = { + {NID_commonName, 1, ub_common_name, DIRSTRING_TYPE, 0}, + {NID_countryName, 2, 2, B_ASN1_PRINTABLESTRING, STABLE_NO_MASK}, + {NID_localityName, 1, ub_locality_name, DIRSTRING_TYPE, 0}, + {NID_stateOrProvinceName, 1, ub_state_name, DIRSTRING_TYPE, 0}, + {NID_organizationName, 1, ub_organization_name, DIRSTRING_TYPE, 0}, + {NID_organizationalUnitName, 1, ub_organization_unit_name, DIRSTRING_TYPE, + 0}, + {NID_pkcs9_emailAddress, 1, ub_email_address, B_ASN1_IA5STRING, + STABLE_NO_MASK}, + {NID_pkcs9_unstructuredName, 1, -1, PKCS9STRING_TYPE, 0}, + {NID_pkcs9_challengePassword, 1, -1, PKCS9STRING_TYPE, 0}, + {NID_pkcs9_unstructuredAddress, 1, -1, DIRSTRING_TYPE, 0}, + {NID_givenName, 1, ub_name, DIRSTRING_TYPE, 0}, + {NID_surname, 1, ub_name, DIRSTRING_TYPE, 0}, + {NID_initials, 1, ub_name, DIRSTRING_TYPE, 0}, + {NID_serialNumber, 1, ub_serial_number, B_ASN1_PRINTABLESTRING, + STABLE_NO_MASK}, + {NID_friendlyName, -1, -1, B_ASN1_BMPSTRING, STABLE_NO_MASK}, + {NID_name, 1, ub_name, DIRSTRING_TYPE, 0}, + {NID_dnQualifier, -1, -1, B_ASN1_PRINTABLESTRING, STABLE_NO_MASK}, + {NID_domainComponent, 1, -1, B_ASN1_IA5STRING, STABLE_NO_MASK}, + {NID_ms_csp_name, -1, -1, B_ASN1_BMPSTRING, STABLE_NO_MASK}}; + +static int table_cmp(const ASN1_STRING_TABLE *a, const ASN1_STRING_TABLE *b) { + if (a->nid < b->nid) { + return -1; + } + if (a->nid > b->nid) { + return 1; + } + return 0; +} + +static int table_cmp_void(const void *a, const void *b) { + return table_cmp(reinterpret_cast(a), + reinterpret_cast(b)); +} + +static uint32_t table_hash(const ASN1_STRING_TABLE *tbl) { + return OPENSSL_hash32(&tbl->nid, sizeof(tbl->nid)); +} + +static const 
ASN1_STRING_TABLE *asn1_string_table_get(int nid) { + ASN1_STRING_TABLE key; + key.nid = nid; + const ASN1_STRING_TABLE *tbl = reinterpret_cast( + bsearch(&key, tbl_standard, std::size(tbl_standard), + sizeof(ASN1_STRING_TABLE), table_cmp_void)); + if (tbl != nullptr) { + return tbl; + } + + string_tables_lock.LockRead(); + if (string_tables != nullptr) { + tbl = lh_ASN1_STRING_TABLE_retrieve(string_tables, &key); + } + string_tables_lock.UnlockRead(); + // Note returning |tbl| without the lock is only safe because + // |ASN1_STRING_TABLE_add| cannot modify or delete existing entries. If we + // wish to support that, this function must copy the result under a lock. + return tbl; +} + +int ASN1_STRING_TABLE_add(int nid, long minsize, long maxsize, + unsigned long mask, unsigned long flags) { + // Existing entries cannot be overwritten. + if (asn1_string_table_get(nid) != nullptr) { + OPENSSL_PUT_ERROR(ASN1, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); + return 0; + } + + MutexWriteLock lock(&string_tables_lock); + ASN1_STRING_TABLE *tbl = nullptr; + if (string_tables == nullptr) { + string_tables = lh_ASN1_STRING_TABLE_new(table_hash, table_cmp); + if (string_tables == nullptr) { + return 0; + } + } else { + // Check again for an existing entry. One may have been added while + // unlocked. 
+ ASN1_STRING_TABLE key; + key.nid = nid; + if (lh_ASN1_STRING_TABLE_retrieve(string_tables, &key) != nullptr) { + OPENSSL_PUT_ERROR(ASN1, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); + return 0; + } + } + + tbl = New(); + if (tbl == nullptr) { + return 0; + } + tbl->nid = nid; + tbl->flags = flags; + tbl->minsize = minsize; + tbl->maxsize = maxsize; + tbl->mask = mask; + ASN1_STRING_TABLE *old_tbl; + if (!lh_ASN1_STRING_TABLE_insert(string_tables, &old_tbl, tbl)) { + Delete(tbl); + return 0; + } + assert(old_tbl == nullptr); + return 1; +} + +void ASN1_STRING_TABLE_cleanup() {} + +void bssl::asn1_get_string_table_for_testing(const ASN1_STRING_TABLE **out_ptr, + size_t *out_len) { + *out_ptr = tbl_standard; + *out_len = std::size(tbl_standard); +} diff --git a/third_party/boringssl/src/crypto/asn1/a_time.c b/third_party/boringssl/src/crypto/asn1/a_time.c deleted file mode 100644 index 50829690..00000000 --- a/third_party/boringssl/src/crypto/asn1/a_time.c +++ /dev/null @@ -1,244 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. 
- * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include -#include - -#include -#include -#include - -#include "internal.h" - -// This is an implementation of the ASN1 Time structure which is: Time ::= -// CHOICE { utcTime UTCTime, generalTime GeneralizedTime } written by Steve -// Henson. - -IMPLEMENT_ASN1_MSTRING(ASN1_TIME, B_ASN1_TIME) - -IMPLEMENT_ASN1_FUNCTIONS_const(ASN1_TIME) - -ASN1_TIME *ASN1_TIME_set(ASN1_TIME *s, time_t t) { - return ASN1_TIME_adj(s, t, 0, 0); -} - -ASN1_TIME *ASN1_TIME_adj(ASN1_TIME *s, time_t t, int offset_day, - long offset_sec) { - struct tm *ts; - struct tm data; - - ts = OPENSSL_gmtime(&t, &data); - if (ts == NULL) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_ERROR_GETTING_TIME); - return NULL; - } - if (offset_day || offset_sec) { - if (!OPENSSL_gmtime_adj(ts, offset_day, offset_sec)) { - return NULL; - } - } - if ((ts->tm_year >= 50) && (ts->tm_year < 150)) { - return ASN1_UTCTIME_adj(s, t, offset_day, offset_sec); - } - return ASN1_GENERALIZEDTIME_adj(s, t, offset_day, offset_sec); -} - -int ASN1_TIME_check(const ASN1_TIME *t) { - if (t->type == V_ASN1_GENERALIZEDTIME) { - return ASN1_GENERALIZEDTIME_check(t); - } else if (t->type == V_ASN1_UTCTIME) { - return ASN1_UTCTIME_check(t); - } - return 0; -} - -// Convert an ASN1_TIME structure to GeneralizedTime -ASN1_GENERALIZEDTIME *ASN1_TIME_to_generalizedtime(const ASN1_TIME *t, - ASN1_GENERALIZEDTIME **out) { - ASN1_GENERALIZEDTIME *ret = NULL; - char *str; - int newlen; - - if (!ASN1_TIME_check(t)) { - return NULL; - } - - if (!out || !*out) { - if (!(ret = ASN1_GENERALIZEDTIME_new())) { - goto err; - } - } else { - ret = *out; - } - - // If already GeneralizedTime just copy across - if (t->type == V_ASN1_GENERALIZEDTIME) { - if (!ASN1_STRING_set(ret, t->data, 
t->length)) { - goto err; - } - goto done; - } - - // grow the string - if (!ASN1_STRING_set(ret, NULL, t->length + 2)) { - goto err; - } - // ASN1_STRING_set() allocated 'len + 1' bytes. - newlen = t->length + 2 + 1; - str = (char *)ret->data; - // Work out the century and prepend - if (t->data[0] >= '5') { - OPENSSL_strlcpy(str, "19", newlen); - } else { - OPENSSL_strlcpy(str, "20", newlen); - } - - OPENSSL_strlcat(str, (char *)t->data, newlen); - -done: - if (out != NULL && *out == NULL) { - *out = ret; - } - return ret; - -err: - if (out == NULL || *out != ret) { - ASN1_GENERALIZEDTIME_free(ret); - } - return NULL; -} - - -int ASN1_TIME_set_string(ASN1_TIME *s, const char *str) { - ASN1_TIME t; - - t.length = strlen(str); - t.data = (unsigned char *)str; - t.flags = 0; - - t.type = V_ASN1_UTCTIME; - - if (!ASN1_TIME_check(&t)) { - t.type = V_ASN1_GENERALIZEDTIME; - if (!ASN1_TIME_check(&t)) { - return 0; - } - } - - if (s && !ASN1_STRING_copy((ASN1_STRING *)s, (ASN1_STRING *)&t)) { - return 0; - } - - return 1; -} - -static int asn1_time_to_tm(struct tm *tm, const ASN1_TIME *t, - int allow_timezone_offset) { - if (t == NULL) { - time_t now_t; - time(&now_t); - if (OPENSSL_gmtime(&now_t, tm)) { - return 1; - } - return 0; - } - - if (t->type == V_ASN1_UTCTIME) { - return asn1_utctime_to_tm(tm, t, allow_timezone_offset); - } else if (t->type == V_ASN1_GENERALIZEDTIME) { - return asn1_generalizedtime_to_tm(tm, t); - } - - return 0; -} - -int ASN1_TIME_diff(int *out_days, int *out_seconds, const ASN1_TIME *from, - const ASN1_TIME *to) { - struct tm tm_from, tm_to; - if (!asn1_time_to_tm(&tm_from, from, /*allow_timezone_offset=*/1)) { - return 0; - } - if (!asn1_time_to_tm(&tm_to, to, /*allow_timezone_offset=*/1)) { - return 0; - } - return OPENSSL_gmtime_diff(out_days, out_seconds, &tm_from, &tm_to); -} - -// The functions below do *not* permissively allow the use of four digit -// timezone offsets in UTC times, as is done elsewhere in the code. 
They are -// both new API, and used internally to X509_cmp_time. This is to discourage the -// use of nonstandard times in new code, and to ensure that this code behaves -// correctly in X509_cmp_time which historically did its own time validations -// slightly different than the many other copies of X.509 time validation -// sprinkled through the codebase. The custom checks in X509_cmp_time meant that -// it did not allow four digit timezone offsets in UTC times. -int ASN1_TIME_to_time_t(const ASN1_TIME *t, time_t *out_time) { - struct tm tm; - if (!asn1_time_to_tm(&tm, t, /*allow_timezone_offset=*/0)) { - return 0; - } - return OPENSSL_timegm(&tm, out_time); -} - -int ASN1_TIME_to_posix(const ASN1_TIME *t, int64_t *out_time) { - struct tm tm; - if (!asn1_time_to_tm(&tm, t, /*allow_timezone_offset=*/0)) { - return 0; - } - return OPENSSL_tm_to_posix(&tm, out_time); -} diff --git a/third_party/boringssl/src/crypto/asn1/a_time.cc b/third_party/boringssl/src/crypto/asn1/a_time.cc new file mode 100644 index 00000000..9d642d4a --- /dev/null +++ b/third_party/boringssl/src/crypto/asn1/a_time.cc @@ -0,0 +1,248 @@ +// Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "internal.h" + + +using namespace bssl; + +// This is an implementation of the ASN1 Time structure which is: Time ::= +// CHOICE { utcTime UTCTime, generalTime GeneralizedTime } written by Steve +// Henson. + +BSSL_NAMESPACE_BEGIN + +IMPLEMENT_ASN1_MSTRING(ASN1_TIME, B_ASN1_TIME) + +BSSL_NAMESPACE_END + +IMPLEMENT_ASN1_FUNCTIONS_const(ASN1_TIME) + +ASN1_TIME *ASN1_TIME_set_posix(ASN1_TIME *s, int64_t posix_time) { + return ASN1_TIME_adj(s, posix_time, 0, 0); +} + +ASN1_TIME *ASN1_TIME_set(ASN1_TIME *s, time_t time) { + return ASN1_TIME_adj(s, time, 0, 0); +} + +static int fits_in_utc_time(const struct tm *tm) { + return 50 <= tm->tm_year && tm->tm_year < 150; +} + +ASN1_TIME *ASN1_TIME_adj(ASN1_TIME *s, int64_t posix_time, int offset_day, + long offset_sec) { + struct tm tm; + + if (!OPENSSL_posix_to_tm(posix_time, &tm)) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_ERROR_GETTING_TIME); + return nullptr; + } + if (offset_day || offset_sec) { + if (!OPENSSL_gmtime_adj(&tm, offset_day, offset_sec)) { + return nullptr; + } + } + if (fits_in_utc_time(&tm)) { + return ASN1_UTCTIME_adj(s, posix_time, offset_day, offset_sec); + } + return ASN1_GENERALIZEDTIME_adj(s, posix_time, offset_day, offset_sec); +} + +int ASN1_TIME_check(const ASN1_TIME *t) { + if (t->type == V_ASN1_GENERALIZEDTIME) { + return ASN1_GENERALIZEDTIME_check(t); + } else if (t->type == V_ASN1_UTCTIME) { + return ASN1_UTCTIME_check(t); + } + return 0; +} + +// Convert an ASN1_TIME structure to GeneralizedTime +ASN1_GENERALIZEDTIME *ASN1_TIME_to_generalizedtime(const ASN1_TIME *in, + ASN1_GENERALIZEDTIME **out) { + if (!ASN1_TIME_check(in)) { + return nullptr; + } + + UniquePtr tmp; + ASN1_GENERALIZEDTIME *ret = nullptr; + if (!out || !*out) { + tmp.reset(ASN1_GENERALIZEDTIME_new()); + if (tmp == nullptr) { + return nullptr; + } + ret = tmp.get(); + } else { + ret = *out; + } + + // If already 
GeneralizedTime just copy across + if (in->type == V_ASN1_GENERALIZEDTIME) { + if (!ASN1_STRING_set(ret, in->data, in->length)) { + return nullptr; + } + } else { + assert(in->type == V_ASN1_UTCTIME); + // |ASN1_TIME_check| implies a bound on the string's lengths. In particular, + // the longest possible UTCTime is "YYMMDDHHMMSS+HHMM", with the (invalid) + // timezone offsets. + static constexpr size_t kMaxUTCTimeLength = 17; + BSSL_CHECK(in->length > 0 && + static_cast(in->length) <= kMaxUTCTimeLength); + char buf[2 /* century */ + kMaxUTCTimeLength + 1 /* NUL */]; + // Work out the century and prepend. + if (in->data[0] >= '5') { + snprintf(buf, sizeof(buf), "19%.*s", in->length, + reinterpret_cast(in->data)); + } else { + snprintf(buf, sizeof(buf), "20%.*s", in->length, + reinterpret_cast(in->data)); + } + if (!ASN1_STRING_set(ret, buf, -1 /* use strlen */)) { + return nullptr; + } + } + + if (out != nullptr && *out == nullptr) { + *out = ret; + } + tmp.release(); // Ownership passed to caller. 
+ return ret; +} + +int ASN1_TIME_set_string(ASN1_TIME *s, const char *str) { + return ASN1_UTCTIME_set_string(s, str) || + ASN1_GENERALIZEDTIME_set_string(s, str); +} + +int ASN1_TIME_set_string_X509(ASN1_TIME *s, const char *str) { + CBS cbs; + CBS_init(&cbs, (const uint8_t *)str, strlen(str)); + int type; + struct tm tm; + if (CBS_parse_utc_time(&cbs, /*out_tm=*/nullptr, + /*allow_timezone_offset=*/0)) { + type = V_ASN1_UTCTIME; + } else if (CBS_parse_generalized_time(&cbs, &tm, + /*allow_timezone_offset=*/0)) { + type = V_ASN1_GENERALIZEDTIME; + if (fits_in_utc_time(&tm)) { + type = V_ASN1_UTCTIME; + CBS_skip(&cbs, 2); + } + } else { + return 0; + } + + if (s != nullptr) { + if (!ASN1_STRING_set(s, CBS_data(&cbs), CBS_len(&cbs))) { + return 0; + } + s->type = type; + } + return 1; +} + +static int asn1_time_to_tm(struct tm *tm, const ASN1_TIME *t, + int allow_timezone_offset) { + if (t == nullptr) { + if (OPENSSL_posix_to_tm(time(nullptr), tm)) { + return 1; + } + return 0; + } + + if (t->type == V_ASN1_UTCTIME) { + return asn1_utctime_to_tm(tm, t, allow_timezone_offset); + } else if (t->type == V_ASN1_GENERALIZEDTIME) { + return asn1_generalizedtime_to_tm(tm, t); + } + + return 0; +} + +int ASN1_TIME_diff(int *out_days, int *out_seconds, const ASN1_TIME *from, + const ASN1_TIME *to) { + struct tm tm_from, tm_to; + if (!asn1_time_to_tm(&tm_from, from, /*allow_timezone_offset=*/1)) { + return 0; + } + if (!asn1_time_to_tm(&tm_to, to, /*allow_timezone_offset=*/1)) { + return 0; + } + return OPENSSL_gmtime_diff(out_days, out_seconds, &tm_from, &tm_to); +} + +int ASN1_TIME_to_posix_nonstandard(const ASN1_TIME *t, int64_t *out_time) { + struct tm tm; + if (!asn1_time_to_tm(&tm, t, /*allow_timezone_offset=*/1)) { + return 0; + } + return OPENSSL_tm_to_posix(&tm, out_time); +} + +// The functions below do *not* permissively allow the use of four digit +// timezone offsets in UTC times, as is done elsewhere in the code. 
They are +// both new API, and used internally to X509_cmp_time. This is to discourage the +// use of nonstandard times in new code, and to ensure that this code behaves +// correctly in X509_cmp_time which historically did its own time validations +// slightly different than the many other copies of X.509 time validation +// sprinkled through the codebase. The custom checks in X509_cmp_time meant that +// it did not allow four digit timezone offsets in UTC times. +int ASN1_TIME_to_time_t(const ASN1_TIME *t, time_t *out_time) { + struct tm tm; + if (!asn1_time_to_tm(&tm, t, /*allow_timezone_offset=*/0)) { + return 0; + } + return OPENSSL_timegm(&tm, out_time); +} + +int ASN1_TIME_to_posix(const ASN1_TIME *t, int64_t *out_time) { + struct tm tm; + if (!asn1_time_to_tm(&tm, t, /*allow_timezone_offset=*/0)) { + return 0; + } + return OPENSSL_tm_to_posix(&tm, out_time); +} + +int bssl::asn1_parse_time(CBS *cbs, ASN1_TIME *out, + int allow_utc_timezone_offset) { + if (CBS_peek_asn1_tag(cbs, CBS_ASN1_UTCTIME)) { + return asn1_parse_utc_time(cbs, out, /*tag=*/0, allow_utc_timezone_offset); + } + return asn1_parse_generalized_time(cbs, out, /*tag=*/0); +} + +int bssl::asn1_marshal_time(CBB *cbb, const ASN1_TIME *in) { + if (in->type != V_ASN1_UTCTIME && in->type != V_ASN1_GENERALIZEDTIME) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_WRONG_TYPE); + return 0; + } + return asn1_marshal_octet_string(cbb, in, + static_cast(in->type)); +} diff --git a/third_party/boringssl/src/crypto/asn1/a_type.c b/third_party/boringssl/src/crypto/asn1/a_type.c deleted file mode 100644 index d59fdba8..00000000 --- a/third_party/boringssl/src/crypto/asn1/a_type.c +++ /dev/null @@ -1,162 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. 
- * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. 
If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include -#include -#include -#include - -#include "internal.h" - - -int ASN1_TYPE_get(const ASN1_TYPE *a) { - if (a->type == V_ASN1_BOOLEAN || a->type == V_ASN1_NULL || - a->value.ptr != NULL) { - return a->type; - } - return 0; -} - -const void *asn1_type_value_as_pointer(const ASN1_TYPE *a) { - if (a->type == V_ASN1_BOOLEAN) { - return a->value.boolean ? (void *)0xff : NULL; - } - if (a->type == V_ASN1_NULL) { - return NULL; - } - return a->value.ptr; -} - -void ASN1_TYPE_set(ASN1_TYPE *a, int type, void *value) { - ASN1_TYPE **tmp_a = &a; - ASN1_primitive_free((ASN1_VALUE **)tmp_a, NULL); - a->type = type; - if (type == V_ASN1_BOOLEAN) { - a->value.boolean = value ? 
0xff : 0; - } else { - a->value.ptr = value; - } -} - -int ASN1_TYPE_set1(ASN1_TYPE *a, int type, const void *value) { - if (!value || (type == V_ASN1_BOOLEAN)) { - void *p = (void *)value; - ASN1_TYPE_set(a, type, p); - } else if (type == V_ASN1_OBJECT) { - ASN1_OBJECT *odup; - odup = OBJ_dup(value); - if (!odup) { - return 0; - } - ASN1_TYPE_set(a, type, odup); - } else { - ASN1_STRING *sdup; - sdup = ASN1_STRING_dup(value); - if (!sdup) { - return 0; - } - ASN1_TYPE_set(a, type, sdup); - } - return 1; -} - -// Returns 0 if they are equal, != 0 otherwise. -int ASN1_TYPE_cmp(const ASN1_TYPE *a, const ASN1_TYPE *b) { - int result = -1; - - if (!a || !b || a->type != b->type) { - return -1; - } - - switch (a->type) { - case V_ASN1_OBJECT: - result = OBJ_cmp(a->value.object, b->value.object); - break; - case V_ASN1_NULL: - result = 0; // They do not have content. - break; - case V_ASN1_BOOLEAN: - result = a->value.boolean - b->value.boolean; - break; - case V_ASN1_INTEGER: - case V_ASN1_ENUMERATED: - case V_ASN1_BIT_STRING: - case V_ASN1_OCTET_STRING: - case V_ASN1_SEQUENCE: - case V_ASN1_SET: - case V_ASN1_NUMERICSTRING: - case V_ASN1_PRINTABLESTRING: - case V_ASN1_T61STRING: - case V_ASN1_VIDEOTEXSTRING: - case V_ASN1_IA5STRING: - case V_ASN1_UTCTIME: - case V_ASN1_GENERALIZEDTIME: - case V_ASN1_GRAPHICSTRING: - case V_ASN1_VISIBLESTRING: - case V_ASN1_GENERALSTRING: - case V_ASN1_UNIVERSALSTRING: - case V_ASN1_BMPSTRING: - case V_ASN1_UTF8STRING: - case V_ASN1_OTHER: - default: - result = ASN1_STRING_cmp(a->value.asn1_string, b->value.asn1_string); - break; - } - - return result; -} diff --git a/third_party/boringssl/src/crypto/asn1/a_type.cc b/third_party/boringssl/src/crypto/asn1/a_type.cc new file mode 100644 index 00000000..03ac9bf4 --- /dev/null +++ b/third_party/boringssl/src/crypto/asn1/a_type.cc @@ -0,0 +1,435 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include +#include +#include +#include + +#include "internal.h" + + +using namespace bssl; + +int ASN1_TYPE_get(const ASN1_TYPE *a) { + switch (a->type) { + case V_ASN1_NULL: + case V_ASN1_BOOLEAN: + return a->type; + case V_ASN1_OBJECT: + return a->value.object != nullptr ? a->type : 0; + default: + return a->value.asn1_string != nullptr ? a->type : 0; + } +} + +const void *bssl::asn1_type_value_as_pointer(const ASN1_TYPE *a) { + switch (a->type) { + case V_ASN1_NULL: + return nullptr; + case V_ASN1_BOOLEAN: + return a->value.boolean ? (void *)0xff : nullptr; + case V_ASN1_OBJECT: + return a->value.object; + default: + return a->value.asn1_string; + } +} + +void bssl::asn1_type_set0_string(ASN1_TYPE *a, ASN1_STRING *str) { + // |ASN1_STRING| types are almost the same as |ASN1_TYPE| types, except that + // the negative flag is not reflected into |ASN1_TYPE|. + int type = str->type; + if (type == V_ASN1_NEG_INTEGER) { + type = V_ASN1_INTEGER; + } else if (type == V_ASN1_NEG_ENUMERATED) { + type = V_ASN1_ENUMERATED; + } + + // These types are not |ASN1_STRING| types and use a different + // representation when stored in |ASN1_TYPE|. 
+ assert(type != V_ASN1_NULL && type != V_ASN1_OBJECT && + type != V_ASN1_BOOLEAN); + ASN1_TYPE_set(a, type, str); +} + +void bssl::asn1_type_cleanup(ASN1_TYPE *a) { + switch (a->type) { + case V_ASN1_NULL: + a->value.ptr = nullptr; + break; + case V_ASN1_BOOLEAN: + a->value.boolean = ASN1_BOOLEAN_NONE; + break; + case V_ASN1_OBJECT: + ASN1_OBJECT_free(a->value.object); + a->value.object = nullptr; + break; + default: + ASN1_STRING_free(a->value.asn1_string); + a->value.asn1_string = nullptr; + break; + } +} + +void ASN1_TYPE_set(ASN1_TYPE *a, int type, void *value) { + asn1_type_cleanup(a); + a->type = type; + switch (type) { + case V_ASN1_NULL: + a->value.ptr = nullptr; + break; + case V_ASN1_BOOLEAN: + a->value.boolean = value ? ASN1_BOOLEAN_TRUE : ASN1_BOOLEAN_FALSE; + break; + case V_ASN1_OBJECT: + a->value.object = reinterpret_cast(value); + break; + default: + a->value.asn1_string = reinterpret_cast(value); + break; + } +} + +int ASN1_TYPE_set1(ASN1_TYPE *a, int type, const void *value) { + if (!value || (type == V_ASN1_BOOLEAN)) { + void *p = (void *)value; + ASN1_TYPE_set(a, type, p); + } else if (type == V_ASN1_OBJECT) { + ASN1_OBJECT *odup; + odup = OBJ_dup(reinterpret_cast(value)); + if (!odup) { + return 0; + } + ASN1_TYPE_set(a, type, odup); + } else { + ASN1_STRING *sdup; + sdup = ASN1_STRING_dup(reinterpret_cast(value)); + if (!sdup) { + return 0; + } + ASN1_TYPE_set(a, type, sdup); + } + return 1; +} + +// Returns 0 if they are equal, != 0 otherwise. +int ASN1_TYPE_cmp(const ASN1_TYPE *a, const ASN1_TYPE *b) { + int result = -1; + + if (!a || !b || a->type != b->type) { + return -1; + } + + switch (a->type) { + case V_ASN1_OBJECT: + result = OBJ_cmp(a->value.object, b->value.object); + break; + case V_ASN1_NULL: + result = 0; // They do not have content. 
+ break; + case V_ASN1_BOOLEAN: + result = a->value.boolean - b->value.boolean; + break; + case V_ASN1_INTEGER: + case V_ASN1_ENUMERATED: + case V_ASN1_BIT_STRING: + case V_ASN1_OCTET_STRING: + case V_ASN1_SEQUENCE: + case V_ASN1_SET: + case V_ASN1_NUMERICSTRING: + case V_ASN1_PRINTABLESTRING: + case V_ASN1_T61STRING: + case V_ASN1_VIDEOTEXSTRING: + case V_ASN1_IA5STRING: + case V_ASN1_UTCTIME: + case V_ASN1_GENERALIZEDTIME: + case V_ASN1_GRAPHICSTRING: + case V_ASN1_VISIBLESTRING: + case V_ASN1_GENERALSTRING: + case V_ASN1_UNIVERSALSTRING: + case V_ASN1_BMPSTRING: + case V_ASN1_UTF8STRING: + case V_ASN1_OTHER: + default: + result = ASN1_STRING_cmp(a->value.asn1_string, b->value.asn1_string); + break; + } + + return result; +} + +int bssl::asn1_parse_any(CBS *cbs, ASN1_TYPE *out) { + CBS_ASN1_TAG tag; + CBS elem; + size_t header_len; + if (!CBS_get_any_asn1_element(cbs, &elem, &tag, &header_len)) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_DECODE_ERROR); + return 0; + } + + // Handle the non-string types. + if (tag == CBS_ASN1_OBJECT) { + UniquePtr obj(asn1_parse_object(&elem, /*tag=*/0)); + if (obj == nullptr) { + return 0; + } + ASN1_TYPE_set(out, V_ASN1_OBJECT, obj.release()); + return 1; + } + if (tag == CBS_ASN1_NULL) { + if (CBS_len(&elem) != header_len) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_DECODE_ERROR); + return 0; + } + ASN1_TYPE_set(out, V_ASN1_NULL, nullptr); + return 1; + } + if (tag == CBS_ASN1_BOOLEAN) { + int b; + if (!CBS_get_asn1_bool(&elem, &b)) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_DECODE_ERROR); + return 0; + } + // V_ASN1_BOOLEAN will interpret the pointer as null for false and any + // arbitrary non-null pointer for true. + ASN1_TYPE_set(out, V_ASN1_BOOLEAN, b ? out : nullptr); + return 1; + } + + // All other cases are handled identically to the string-based ANY parser. 
+ UniquePtr str(ASN1_STRING_new()); + if (str == nullptr || !asn1_parse_any_as_string(&elem, str.get())) { + return 0; + } + asn1_type_set0_string(out, str.release()); + return 1; +} + +int bssl::asn1_parse_any_as_string(CBS *cbs, ASN1_STRING *out) { + CBS_ASN1_TAG tag; + CBS elem; + size_t header_len; + if (!CBS_get_any_asn1_element(cbs, &elem, &tag, &header_len)) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_DECODE_ERROR); + return 0; + } + + // Reject unexpectedly constructed or primitive universal types, rather than + // encoding them as an opaque |V_ASN1_OTHER|. As of X.680 (02/2021), tag + // numbers 0-36 have been allocated, except 15. Of these, only 8 (EXTERNAL), + // 11 (EMBEDDED PDV), 16 (SEQUENCE), 17 (SET), and 29 (CHARACTER STRING) are + // constructed. + const CBS_ASN1_TAG tag_class = (tag & CBS_ASN1_CLASS_MASK); + const CBS_ASN1_TAG number = tag & CBS_ASN1_TAG_NUMBER_MASK; + if (tag_class == CBS_ASN1_UNIVERSAL && number <= 36 && number != 15) { + const bool is_constructed = (tag & CBS_ASN1_CONSTRUCTED) != 0; + if (number == V_ASN1_EXTERNAL || number == 11 /* EMBEDDED PDV */ || + number == V_ASN1_SEQUENCE || number == V_ASN1_SET || + number == 29 /* CHARACTER STRING*/) { + if (!is_constructed) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_TYPE_NOT_CONSTRUCTED); + return 0; + } + } else { + if (is_constructed) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_TYPE_NOT_PRIMITIVE); + return 0; + } + } + } + + // Historically, parsing high universal tag numbers made OpenSSL's + // |ASN1_STRING| representation ambiguous. We've since fixed this with + // |V_ASN1_OTHER| but, for now, continue to enforce the limit. + if (tag_class == CBS_ASN1_UNIVERSAL && number > V_ASN1_MAX_UNIVERSAL) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_DECODE_ERROR); + return 0; + } + + // These types are just parsed as |V_ASN1_OTHER| here. Check the contents + // before the generic |V_ASN1_OTHER| path. 
+ CBS body = elem; + BSSL_CHECK(CBS_skip(&body, header_len)); + switch (tag) { + case CBS_ASN1_OBJECT: + if (!CBS_is_valid_asn1_oid(&body)) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_OBJECT_ENCODING); + return 0; + } + break; + case CBS_ASN1_NULL: + if (CBS_len(&body) != 0) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_NULL_IS_WRONG_LENGTH); + return 0; + } + break; + case CBS_ASN1_BOOLEAN: { + uint8_t v; + if (!CBS_get_u8(&body, &v) || CBS_len(&body) != 0) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_BOOLEAN_IS_WRONG_LENGTH); + return 0; + } + if (v != 0 && v != 0xff) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_DECODE_ERROR); + return 0; + } + break; + } + } + + switch (tag) { + case CBS_ASN1_INTEGER: + return asn1_parse_integer(&elem, out, tag); + case CBS_ASN1_ENUMERATED: + return asn1_parse_enumerated(&elem, out, tag); + case CBS_ASN1_BITSTRING: + return asn1_parse_bit_string(&elem, out, tag); + case CBS_ASN1_UNIVERSALSTRING: + return asn1_parse_universal_string(&elem, out, tag); + case CBS_ASN1_BMPSTRING: + return asn1_parse_bmp_string(&elem, out, tag); + case CBS_ASN1_UTF8STRING: + return asn1_parse_utf8_string(&elem, out, tag); + case CBS_ASN1_UTCTIME: + // TODO(crbug.com/42290221): Reject timezone offsets here. We have no + // known cases where UTCTime inside ANY needs accept invalid timezones. + return asn1_parse_utc_time(&elem, out, tag, /*allow_timezone_offset=*/1); + case CBS_ASN1_GENERALIZEDTIME: + return asn1_parse_generalized_time(&elem, out, tag); + case CBS_ASN1_OCTETSTRING: + case CBS_ASN1_T61STRING: + case CBS_ASN1_IA5STRING: + case CBS_ASN1_NUMERICSTRING: + case CBS_ASN1_PRINTABLESTRING: + case CBS_ASN1_VIDEOTEXSTRING: + case CBS_ASN1_GRAPHICSTRING: + case CBS_ASN1_VISIBLESTRING: + case CBS_ASN1_GENERALSTRING: + // T61String is parsed as Latin-1, so all byte strings are valid. The + // others we currently do not enforce. + // + // TODO(crbug.com/42290290): Enforce the encoding of the other string + // types. 
+ if (!asn1_parse_octet_string(&elem, out, tag)) { + return 0; + } + out->type = static_cast(tag); + return 1; + default: + // All unrecognized types, or types that cannot be represented as + // |ASN1_STRING|, are represented as the whole element. + if (!ASN1_STRING_set(out, CBS_data(&elem), CBS_len(&elem))) { + return 0; + } + if (tag == CBS_ASN1_SEQUENCE) { + out->type = V_ASN1_SEQUENCE; + } else if (tag == CBS_ASN1_SET) { + out->type = V_ASN1_SET; + } else { + out->type = V_ASN1_OTHER; + } + return 1; + } +} + +static int asn1_marshal_string_with_type(CBB *out, const ASN1_STRING *in, + int type); + +int bssl::asn1_marshal_any(CBB *out, const ASN1_TYPE *in) { + switch (in->type) { + case V_ASN1_OBJECT: + return asn1_marshal_object(out, in->value.object, /*tag=*/0); + case V_ASN1_NULL: + return CBB_add_asn1_element(out, CBS_ASN1_NULL, nullptr, 0); + case V_ASN1_BOOLEAN: + return CBB_add_asn1_bool(out, in->value.boolean != ASN1_BOOLEAN_FALSE); + case V_ASN1_INTEGER: + case V_ASN1_ENUMERATED: + case V_ASN1_BIT_STRING: + case V_ASN1_OCTET_STRING: + case V_ASN1_NUMERICSTRING: + case V_ASN1_PRINTABLESTRING: + case V_ASN1_T61STRING: + case V_ASN1_VIDEOTEXSTRING: + case V_ASN1_IA5STRING: + case V_ASN1_UTCTIME: + case V_ASN1_GENERALIZEDTIME: + case V_ASN1_GRAPHICSTRING: + case V_ASN1_VISIBLESTRING: + case V_ASN1_GENERALSTRING: + case V_ASN1_UNIVERSALSTRING: + case V_ASN1_BMPSTRING: + case V_ASN1_UTF8STRING: + case V_ASN1_SEQUENCE: + case V_ASN1_SET: + case V_ASN1_OTHER: + // If |in->type| and the underlying |ASN1_STRING| type don't match, use + // |in->type|. See b/446993031. + return asn1_marshal_string_with_type(out, in->value.asn1_string, + in->type); + default: + // |ASN1_TYPE|s can have type -1 when default-constructed. 
+ OPENSSL_PUT_ERROR(ASN1, ASN1_R_WRONG_TYPE); + return 0; + } +} + +static int asn1_marshal_string_with_type(CBB *out, const ASN1_STRING *in, + int type) { + switch (type) { + case V_ASN1_INTEGER: + case V_ASN1_NEG_INTEGER: + return asn1_marshal_integer(out, in, CBS_ASN1_INTEGER); + case V_ASN1_ENUMERATED: + case V_ASN1_NEG_ENUMERATED: + return asn1_marshal_integer(out, in, CBS_ASN1_ENUMERATED); + case V_ASN1_BIT_STRING: + return asn1_marshal_bit_string(out, in, /*tag=*/0); + case V_ASN1_OCTET_STRING: + case V_ASN1_NUMERICSTRING: + case V_ASN1_PRINTABLESTRING: + case V_ASN1_T61STRING: + case V_ASN1_VIDEOTEXSTRING: + case V_ASN1_IA5STRING: + case V_ASN1_UTCTIME: + case V_ASN1_GENERALIZEDTIME: + case V_ASN1_GRAPHICSTRING: + case V_ASN1_VISIBLESTRING: + case V_ASN1_GENERALSTRING: + case V_ASN1_UNIVERSALSTRING: + case V_ASN1_BMPSTRING: + case V_ASN1_UTF8STRING: + return asn1_marshal_octet_string(out, in, + static_cast(type)); + case V_ASN1_SEQUENCE: + case V_ASN1_SET: + case V_ASN1_OTHER: + // These three types store the whole TLV as contents. + return CBB_add_bytes(out, ASN1_STRING_get0_data(in), + ASN1_STRING_length(in)); + default: + // |ASN1_TYPE|s can have type -1 when default-constructed. + OPENSSL_PUT_ERROR(ASN1, ASN1_R_WRONG_TYPE); + return 0; + } +} + +int bssl::asn1_marshal_any_string(CBB *out, const ASN1_STRING *in) { + return asn1_marshal_string_with_type(out, in, in->type); +} diff --git a/third_party/boringssl/src/crypto/asn1/a_utctm.c b/third_party/boringssl/src/crypto/asn1/a_utctm.c deleted file mode 100644 index 201c6543..00000000 --- a/third_party/boringssl/src/crypto/asn1/a_utctm.c +++ /dev/null @@ -1,177 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. 
- * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. 
If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
*/ - -#include -#include -#include -#include - -#include -#include - -#include "internal.h" - -int asn1_utctime_to_tm(struct tm *tm, const ASN1_UTCTIME *d, - int allow_timezone_offset) { - if (d->type != V_ASN1_UTCTIME) { - return 0; - } - CBS cbs; - CBS_init(&cbs, d->data, (size_t)d->length); - if (!CBS_parse_utc_time(&cbs, tm, allow_timezone_offset)) { - return 0; - } - return 1; -} - -int ASN1_UTCTIME_check(const ASN1_UTCTIME *d) { - return asn1_utctime_to_tm(NULL, d, /*allow_timezone_offset=*/1); -} - -int ASN1_UTCTIME_set_string(ASN1_UTCTIME *s, const char *str) { - ASN1_UTCTIME t; - - t.type = V_ASN1_UTCTIME; - t.length = strlen(str); - t.data = (unsigned char *)str; - if (ASN1_UTCTIME_check(&t)) { - if (s != NULL) { - if (!ASN1_STRING_set((ASN1_STRING *)s, (unsigned char *)str, t.length)) { - return 0; - } - s->type = V_ASN1_UTCTIME; - } - return 1; - } else { - return 0; - } -} - -ASN1_UTCTIME *ASN1_UTCTIME_set(ASN1_UTCTIME *s, time_t t) { - return ASN1_UTCTIME_adj(s, t, 0, 0); -} - -ASN1_UTCTIME *ASN1_UTCTIME_adj(ASN1_UTCTIME *s, time_t t, int offset_day, - long offset_sec) { - struct tm data; - if (!OPENSSL_gmtime(&t, &data)) { - return NULL; - } - - if (offset_day || offset_sec) { - if (!OPENSSL_gmtime_adj(&data, offset_day, offset_sec)) { - return NULL; - } - } - - if (data.tm_year < 50 || data.tm_year >= 150) { - return NULL; - } - - char buf[14]; - BIO_snprintf(buf, sizeof(buf), "%02d%02d%02d%02d%02d%02dZ", - data.tm_year % 100, data.tm_mon + 1, data.tm_mday, data.tm_hour, - data.tm_min, data.tm_sec); - - int free_s = 0; - if (s == NULL) { - free_s = 1; - s = ASN1_UTCTIME_new(); - if (s == NULL) { - return NULL; - } - } - - if (!ASN1_STRING_set(s, buf, strlen(buf))) { - if (free_s) { - ASN1_UTCTIME_free(s); - } - return NULL; - } - s->type = V_ASN1_UTCTIME; - return s; -} - -int ASN1_UTCTIME_cmp_time_t(const ASN1_UTCTIME *s, time_t t) { - struct tm stm, ttm; - int day, sec; - - if (!asn1_utctime_to_tm(&stm, s, /*allow_timezone_offset=*/1)) { - return 
-2; - } - - if (!OPENSSL_gmtime(&t, &ttm)) { - return -2; - } - - if (!OPENSSL_gmtime_diff(&day, &sec, &ttm, &stm)) { - return -2; - } - - if (day > 0) { - return 1; - } - if (day < 0) { - return -1; - } - if (sec > 0) { - return 1; - } - if (sec < 0) { - return -1; - } - return 0; -} diff --git a/third_party/boringssl/src/crypto/asn1/a_utctm.cc b/third_party/boringssl/src/crypto/asn1/a_utctm.cc new file mode 100644 index 00000000..16a14067 --- /dev/null +++ b/third_party/boringssl/src/crypto/asn1/a_utctm.cc @@ -0,0 +1,128 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "internal.h" + + +using namespace bssl; + +int bssl::asn1_utctime_to_tm(struct tm *tm, const ASN1_UTCTIME *d, + int allow_timezone_offset) { + if (d->type != V_ASN1_UTCTIME) { + return 0; + } + CBS cbs; + CBS_init(&cbs, d->data, (size_t)d->length); + if (!CBS_parse_utc_time(&cbs, tm, allow_timezone_offset)) { + return 0; + } + return 1; +} + +int bssl::asn1_parse_utc_time(CBS *cbs, ASN1_UTCTIME *out, CBS_ASN1_TAG tag, + int allow_timezone_offset) { + tag = tag == 0 ? 
CBS_ASN1_UTCTIME : tag; + CBS child; + if (!CBS_get_asn1(cbs, &child, tag) || + !CBS_parse_utc_time(&child, nullptr, allow_timezone_offset)) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_DECODE_ERROR); + return 0; + } + if (!ASN1_STRING_set(out, CBS_data(&child), CBS_len(&child))) { + return 0; + } + out->type = V_ASN1_UTCTIME; + return 1; +} + +int ASN1_UTCTIME_check(const ASN1_UTCTIME *d) { + return asn1_utctime_to_tm(nullptr, d, /*allow_timezone_offset=*/1); +} + +int ASN1_UTCTIME_set_string(ASN1_UTCTIME *s, const char *str) { + // Although elsewhere we allow timezone offsets with UTCTime, to be compatible + // with some existing misissued certificates, this function is used to + // construct new certificates and can be stricter. + size_t len = strlen(str); + CBS cbs; + CBS_init(&cbs, (const uint8_t *)str, len); + if (!CBS_parse_utc_time(&cbs, /*out_tm=*/nullptr, + /*allow_timezone_offset=*/0)) { + return 0; + } + if (s != nullptr) { + if (!ASN1_STRING_set(s, str, len)) { + return 0; + } + s->type = V_ASN1_UTCTIME; + } + return 1; +} + +ASN1_UTCTIME *ASN1_UTCTIME_set(ASN1_UTCTIME *s, int64_t posix_time) { + return ASN1_UTCTIME_adj(s, posix_time, 0, 0); +} + +ASN1_UTCTIME *ASN1_UTCTIME_adj(ASN1_UTCTIME *s, int64_t posix_time, + int offset_day, long offset_sec) { + struct tm data; + if (!OPENSSL_posix_to_tm(posix_time, &data)) { + return nullptr; + } + + if (offset_day || offset_sec) { + if (!OPENSSL_gmtime_adj(&data, offset_day, offset_sec)) { + return nullptr; + } + } + + if (data.tm_year < 50 || data.tm_year >= 150) { + return nullptr; + } + + char buf[14]; + int ret = snprintf(buf, sizeof(buf), "%02d%02d%02d%02d%02d%02dZ", + data.tm_year % 100, data.tm_mon + 1, data.tm_mday, + data.tm_hour, data.tm_min, data.tm_sec); + // |snprintf| must write exactly 15 bytes (plus the NUL) to the buffer. 
+ BSSL_CHECK(ret == static_cast(sizeof(buf) - 1)); + + int free_s = 0; + if (s == nullptr) { + free_s = 1; + s = ASN1_UTCTIME_new(); + if (s == nullptr) { + return nullptr; + } + } + + if (!ASN1_STRING_set(s, buf, strlen(buf))) { + if (free_s) { + ASN1_UTCTIME_free(s); + } + return nullptr; + } + s->type = V_ASN1_UTCTIME; + return s; +} diff --git a/third_party/boringssl/src/crypto/asn1/a_utf8.c b/third_party/boringssl/src/crypto/asn1/a_utf8.c deleted file mode 100644 index 02c1d56c..00000000 --- a/third_party/boringssl/src/crypto/asn1/a_utf8.c +++ /dev/null @@ -1,142 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
*/ - -#include - -#include -#include - -#include "internal.h" - -// UTF8 utilities - -// This takes a character 'value' and writes the UTF8 encoded value in 'str' -// where 'str' is a buffer containing 'len' characters. Returns the number of -// characters written or -1 if 'len' is too small. 'str' can be set to NULL -// in which case it just returns the number of characters. It will need at -// most 6 characters. - -int UTF8_putc(unsigned char *str, int len, uint32_t value) { - if (!str) { - len = 6; // Maximum we will need - } else if (len <= 0) { - return -1; - } - if (value < 0x80) { - if (str) { - *str = (unsigned char)value; - } - return 1; - } - if (value < 0x800) { - if (len < 2) { - return -1; - } - if (str) { - *str++ = (unsigned char)(((value >> 6) & 0x1f) | 0xc0); - *str = (unsigned char)((value & 0x3f) | 0x80); - } - return 2; - } - if (value < 0x10000) { - if (len < 3) { - return -1; - } - if (str) { - *str++ = (unsigned char)(((value >> 12) & 0xf) | 0xe0); - *str++ = (unsigned char)(((value >> 6) & 0x3f) | 0x80); - *str = (unsigned char)((value & 0x3f) | 0x80); - } - return 3; - } - if (value < 0x200000) { - if (len < 4) { - return -1; - } - if (str) { - *str++ = (unsigned char)(((value >> 18) & 0x7) | 0xf0); - *str++ = (unsigned char)(((value >> 12) & 0x3f) | 0x80); - *str++ = (unsigned char)(((value >> 6) & 0x3f) | 0x80); - *str = (unsigned char)((value & 0x3f) | 0x80); - } - return 4; - } - if (value < 0x4000000) { - if (len < 5) { - return -1; - } - if (str) { - *str++ = (unsigned char)(((value >> 24) & 0x3) | 0xf8); - *str++ = (unsigned char)(((value >> 18) & 0x3f) | 0x80); - *str++ = (unsigned char)(((value >> 12) & 0x3f) | 0x80); - *str++ = (unsigned char)(((value >> 6) & 0x3f) | 0x80); - *str = (unsigned char)((value & 0x3f) | 0x80); - } - return 5; - } - if (len < 6) { - return -1; - } - if (str) { - *str++ = (unsigned char)(((value >> 30) & 0x1) | 0xfc); - *str++ = (unsigned char)(((value >> 24) & 0x3f) | 0x80); - *str++ = (unsigned 
char)(((value >> 18) & 0x3f) | 0x80); - *str++ = (unsigned char)(((value >> 12) & 0x3f) | 0x80); - *str++ = (unsigned char)(((value >> 6) & 0x3f) | 0x80); - *str = (unsigned char)((value & 0x3f) | 0x80); - } - return 6; -} diff --git a/third_party/boringssl/src/crypto/asn1/asn1_lib.c b/third_party/boringssl/src/crypto/asn1/asn1_lib.c deleted file mode 100644 index 98061db3..00000000 --- a/third_party/boringssl/src/crypto/asn1/asn1_lib.c +++ /dev/null @@ -1,390 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include -#include - -#include -#include -#include - -#include "../internal.h" -#include "internal.h" - - -// Cross-module errors from crypto/x509/i2d_pr.c. 
-OPENSSL_DECLARE_ERROR_REASON(ASN1, UNSUPPORTED_PUBLIC_KEY_TYPE) - -// Cross-module errors from crypto/x509/algorithm.c. -OPENSSL_DECLARE_ERROR_REASON(ASN1, CONTEXT_NOT_INITIALISED) -OPENSSL_DECLARE_ERROR_REASON(ASN1, DIGEST_AND_KEY_TYPE_NOT_SUPPORTED) -OPENSSL_DECLARE_ERROR_REASON(ASN1, UNKNOWN_MESSAGE_DIGEST_ALGORITHM) -OPENSSL_DECLARE_ERROR_REASON(ASN1, UNKNOWN_SIGNATURE_ALGORITHM) -OPENSSL_DECLARE_ERROR_REASON(ASN1, WRONG_PUBLIC_KEY_TYPE) -// Cross-module errors from crypto/x509/asn1_gen.c. TODO(davidben): Remove -// these once asn1_gen.c is gone. -OPENSSL_DECLARE_ERROR_REASON(ASN1, DEPTH_EXCEEDED) -OPENSSL_DECLARE_ERROR_REASON(ASN1, ILLEGAL_BITSTRING_FORMAT) -OPENSSL_DECLARE_ERROR_REASON(ASN1, ILLEGAL_BOOLEAN) -OPENSSL_DECLARE_ERROR_REASON(ASN1, ILLEGAL_FORMAT) -OPENSSL_DECLARE_ERROR_REASON(ASN1, ILLEGAL_HEX) -OPENSSL_DECLARE_ERROR_REASON(ASN1, ILLEGAL_IMPLICIT_TAG) -OPENSSL_DECLARE_ERROR_REASON(ASN1, ILLEGAL_INTEGER) -OPENSSL_DECLARE_ERROR_REASON(ASN1, ILLEGAL_NESTED_TAGGING) -OPENSSL_DECLARE_ERROR_REASON(ASN1, ILLEGAL_NULL_VALUE) -OPENSSL_DECLARE_ERROR_REASON(ASN1, ILLEGAL_OBJECT) -OPENSSL_DECLARE_ERROR_REASON(ASN1, ILLEGAL_TIME_VALUE) -OPENSSL_DECLARE_ERROR_REASON(ASN1, INTEGER_NOT_ASCII_FORMAT) -OPENSSL_DECLARE_ERROR_REASON(ASN1, INVALID_MODIFIER) -OPENSSL_DECLARE_ERROR_REASON(ASN1, INVALID_NUMBER) -OPENSSL_DECLARE_ERROR_REASON(ASN1, LIST_ERROR) -OPENSSL_DECLARE_ERROR_REASON(ASN1, MISSING_VALUE) -OPENSSL_DECLARE_ERROR_REASON(ASN1, NOT_ASCII_FORMAT) -OPENSSL_DECLARE_ERROR_REASON(ASN1, OBJECT_NOT_ASCII_FORMAT) -OPENSSL_DECLARE_ERROR_REASON(ASN1, SEQUENCE_OR_SET_NEEDS_CONFIG) -OPENSSL_DECLARE_ERROR_REASON(ASN1, TIME_NOT_ASCII_FORMAT) -OPENSSL_DECLARE_ERROR_REASON(ASN1, UNKNOWN_FORMAT) -OPENSSL_DECLARE_ERROR_REASON(ASN1, UNKNOWN_TAG) -OPENSSL_DECLARE_ERROR_REASON(ASN1, UNSUPPORTED_TYPE) - -static void asn1_put_length(unsigned char **pp, int length); - -int ASN1_get_object(const unsigned char **inp, long *out_len, int *out_tag, - int *out_class, long in_len) { 
- if (in_len < 0) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_HEADER_TOO_LONG); - return 0x80; - } - - // TODO(https://crbug.com/boringssl/354): This should use |CBS_get_asn1| to - // reject non-minimal lengths, which are only allowed in BER. However, - // Android sometimes needs allow a non-minimal length in certificate - // signature fields (see b/18228011). Make this only apply to that field, - // while requiring DER elsewhere. Better yet, it should be limited to an - // preprocessing step in that part of Android. - unsigned tag; - size_t header_len; - int indefinite; - CBS cbs, body; - CBS_init(&cbs, *inp, (size_t)in_len); - if (!CBS_get_any_ber_asn1_element(&cbs, &body, &tag, &header_len, - /*out_ber_found=*/NULL, &indefinite) || - indefinite || !CBS_skip(&body, header_len) || - // Bound the length to comfortably fit in an int. Lengths in this - // module often switch between int and long without overflow checks. - CBS_len(&body) > INT_MAX / 2) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_HEADER_TOO_LONG); - return 0x80; - } - - // Convert between tag representations. - int tag_class = (tag & CBS_ASN1_CLASS_MASK) >> CBS_ASN1_TAG_SHIFT; - int constructed = (tag & CBS_ASN1_CONSTRUCTED) >> CBS_ASN1_TAG_SHIFT; - int tag_number = tag & CBS_ASN1_TAG_NUMBER_MASK; - - // To avoid ambiguity with V_ASN1_NEG, impose a limit on universal tags. - if (tag_class == V_ASN1_UNIVERSAL && tag_number > V_ASN1_MAX_UNIVERSAL) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_HEADER_TOO_LONG); - return 0x80; - } - - *inp = CBS_data(&body); - *out_len = CBS_len(&body); - *out_tag = tag_number; - *out_class = tag_class; - return constructed; -} - -// class 0 is constructed constructed == 2 for indefinite length constructed -void ASN1_put_object(unsigned char **pp, int constructed, int length, int tag, - int xclass) { - unsigned char *p = *pp; - int i, ttag; - - i = (constructed) ? 
V_ASN1_CONSTRUCTED : 0; - i |= (xclass & V_ASN1_PRIVATE); - if (tag < 31) { - *(p++) = i | (tag & V_ASN1_PRIMITIVE_TAG); - } else { - *(p++) = i | V_ASN1_PRIMITIVE_TAG; - for (i = 0, ttag = tag; ttag > 0; i++) { - ttag >>= 7; - } - ttag = i; - while (i-- > 0) { - p[i] = tag & 0x7f; - if (i != (ttag - 1)) { - p[i] |= 0x80; - } - tag >>= 7; - } - p += ttag; - } - if (constructed == 2) { - *(p++) = 0x80; - } else { - asn1_put_length(&p, length); - } - *pp = p; -} - -int ASN1_put_eoc(unsigned char **pp) { - // This function is no longer used in the library, but some external code - // uses it. - unsigned char *p = *pp; - *p++ = 0; - *p++ = 0; - *pp = p; - return 2; -} - -static void asn1_put_length(unsigned char **pp, int length) { - unsigned char *p = *pp; - int i, l; - if (length <= 127) { - *(p++) = (unsigned char)length; - } else { - l = length; - for (i = 0; l > 0; i++) { - l >>= 8; - } - *(p++) = i | 0x80; - l = i; - while (i-- > 0) { - p[i] = length & 0xff; - length >>= 8; - } - p += l; - } - *pp = p; -} - -int ASN1_object_size(int constructed, int length, int tag) { - int ret = 1; - if (length < 0) { - return -1; - } - if (tag >= 31) { - while (tag > 0) { - tag >>= 7; - ret++; - } - } - if (constructed == 2) { - ret += 3; - } else { - ret++; - if (length > 127) { - int tmplen = length; - while (tmplen > 0) { - tmplen >>= 8; - ret++; - } - } - } - if (ret >= INT_MAX - length) { - return -1; - } - return ret + length; -} - -int ASN1_STRING_copy(ASN1_STRING *dst, const ASN1_STRING *str) { - if (str == NULL) { - return 0; - } - if (!ASN1_STRING_set(dst, str->data, str->length)) { - return 0; - } - dst->type = str->type; - dst->flags = str->flags; - return 1; -} - -ASN1_STRING *ASN1_STRING_dup(const ASN1_STRING *str) { - ASN1_STRING *ret; - if (!str) { - return NULL; - } - ret = ASN1_STRING_new(); - if (!ret) { - return NULL; - } - if (!ASN1_STRING_copy(ret, str)) { - ASN1_STRING_free(ret); - return NULL; - } - return ret; -} - -int ASN1_STRING_set(ASN1_STRING *str, 
const void *_data, int len) { - unsigned char *c; - const char *data = _data; - - if (len < 0) { - if (data == NULL) { - return 0; - } else { - len = strlen(data); - } - } - if ((str->length <= len) || (str->data == NULL)) { - c = str->data; - if (c == NULL) { - str->data = OPENSSL_malloc(len + 1); - } else { - str->data = OPENSSL_realloc(c, len + 1); - } - - if (str->data == NULL) { - OPENSSL_PUT_ERROR(ASN1, ERR_R_MALLOC_FAILURE); - str->data = c; - return 0; - } - } - str->length = len; - if (data != NULL) { - OPENSSL_memcpy(str->data, data, len); - // an allowance for strings :-) - str->data[len] = '\0'; - } - return 1; -} - -void ASN1_STRING_set0(ASN1_STRING *str, void *data, int len) { - OPENSSL_free(str->data); - str->data = data; - str->length = len; -} - -ASN1_STRING *ASN1_STRING_new(void) { - return (ASN1_STRING_type_new(V_ASN1_OCTET_STRING)); -} - -ASN1_STRING *ASN1_STRING_type_new(int type) { - ASN1_STRING *ret; - - ret = (ASN1_STRING *)OPENSSL_malloc(sizeof(ASN1_STRING)); - if (ret == NULL) { - OPENSSL_PUT_ERROR(ASN1, ERR_R_MALLOC_FAILURE); - return NULL; - } - ret->length = 0; - ret->type = type; - ret->data = NULL; - ret->flags = 0; - return ret; -} - -void ASN1_STRING_free(ASN1_STRING *str) { - if (str == NULL) { - return; - } - OPENSSL_free(str->data); - OPENSSL_free(str); -} - -int ASN1_STRING_cmp(const ASN1_STRING *a, const ASN1_STRING *b) { - // Capture padding bits and implicit truncation in BIT STRINGs. - int a_length = a->length, b_length = b->length; - uint8_t a_padding = 0, b_padding = 0; - if (a->type == V_ASN1_BIT_STRING) { - a_length = asn1_bit_string_length(a, &a_padding); - } - if (b->type == V_ASN1_BIT_STRING) { - b_length = asn1_bit_string_length(b, &b_padding); - } - - if (a_length < b_length) { - return -1; - } - if (a_length > b_length) { - return 1; - } - // In a BIT STRING, the number of bits is 8 * length - padding. Invert this - // comparison so we compare by lengths. 
- if (a_padding > b_padding) { - return -1; - } - if (a_padding < b_padding) { - return 1; - } - - int ret = OPENSSL_memcmp(a->data, b->data, a_length); - if (ret != 0) { - return ret; - } - - // Comparing the type first is more natural, but this matches OpenSSL. - if (a->type < b->type) { - return -1; - } - if (a->type > b->type) { - return 1; - } - return 0; -} - -int ASN1_STRING_length(const ASN1_STRING *str) { return str->length; } - -int ASN1_STRING_type(const ASN1_STRING *str) { return str->type; } - -unsigned char *ASN1_STRING_data(ASN1_STRING *str) { return str->data; } - -const unsigned char *ASN1_STRING_get0_data(const ASN1_STRING *str) { - return str->data; -} diff --git a/third_party/boringssl/src/crypto/asn1/asn1_lib.cc b/third_party/boringssl/src/crypto/asn1/asn1_lib.cc new file mode 100644 index 00000000..c129c6b4 --- /dev/null +++ b/third_party/boringssl/src/crypto/asn1/asn1_lib.cc @@ -0,0 +1,438 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + +#include +#include +#include + +#include "../internal.h" +#include "../mem_internal.h" +#include "internal.h" + + +using namespace bssl; + +// Cross-module errors from crypto/x509/i2d_pr.c. +OPENSSL_DECLARE_ERROR_REASON(ASN1, UNSUPPORTED_PUBLIC_KEY_TYPE) + +// Cross-module errors from crypto/x509/algorithm.c. 
+OPENSSL_DECLARE_ERROR_REASON(ASN1, CONTEXT_NOT_INITIALISED) +OPENSSL_DECLARE_ERROR_REASON(ASN1, DIGEST_AND_KEY_TYPE_NOT_SUPPORTED) +OPENSSL_DECLARE_ERROR_REASON(ASN1, UNKNOWN_MESSAGE_DIGEST_ALGORITHM) +OPENSSL_DECLARE_ERROR_REASON(ASN1, UNKNOWN_SIGNATURE_ALGORITHM) +OPENSSL_DECLARE_ERROR_REASON(ASN1, WRONG_PUBLIC_KEY_TYPE) +// Cross-module errors from crypto/x509/asn1_gen.c. TODO(davidben): Remove +// these once asn1_gen.c is gone. +OPENSSL_DECLARE_ERROR_REASON(ASN1, DEPTH_EXCEEDED) +OPENSSL_DECLARE_ERROR_REASON(ASN1, ILLEGAL_BITSTRING_FORMAT) +OPENSSL_DECLARE_ERROR_REASON(ASN1, ILLEGAL_BOOLEAN) +OPENSSL_DECLARE_ERROR_REASON(ASN1, ILLEGAL_FORMAT) +OPENSSL_DECLARE_ERROR_REASON(ASN1, ILLEGAL_HEX) +OPENSSL_DECLARE_ERROR_REASON(ASN1, ILLEGAL_IMPLICIT_TAG) +OPENSSL_DECLARE_ERROR_REASON(ASN1, ILLEGAL_INTEGER) +OPENSSL_DECLARE_ERROR_REASON(ASN1, ILLEGAL_NESTED_TAGGING) +OPENSSL_DECLARE_ERROR_REASON(ASN1, ILLEGAL_NULL_VALUE) +OPENSSL_DECLARE_ERROR_REASON(ASN1, ILLEGAL_OBJECT) +OPENSSL_DECLARE_ERROR_REASON(ASN1, ILLEGAL_TIME_VALUE) +OPENSSL_DECLARE_ERROR_REASON(ASN1, INTEGER_NOT_ASCII_FORMAT) +OPENSSL_DECLARE_ERROR_REASON(ASN1, INVALID_MODIFIER) +OPENSSL_DECLARE_ERROR_REASON(ASN1, INVALID_NUMBER) +OPENSSL_DECLARE_ERROR_REASON(ASN1, LIST_ERROR) +OPENSSL_DECLARE_ERROR_REASON(ASN1, MISSING_VALUE) +OPENSSL_DECLARE_ERROR_REASON(ASN1, NOT_ASCII_FORMAT) +OPENSSL_DECLARE_ERROR_REASON(ASN1, OBJECT_NOT_ASCII_FORMAT) +OPENSSL_DECLARE_ERROR_REASON(ASN1, SEQUENCE_OR_SET_NEEDS_CONFIG) +OPENSSL_DECLARE_ERROR_REASON(ASN1, TIME_NOT_ASCII_FORMAT) +OPENSSL_DECLARE_ERROR_REASON(ASN1, UNKNOWN_FORMAT) +OPENSSL_DECLARE_ERROR_REASON(ASN1, UNKNOWN_TAG) +OPENSSL_DECLARE_ERROR_REASON(ASN1, UNSUPPORTED_TYPE) + +// Limit |ASN1_STRING|s to 64 MiB of data. Most of this module, as well as +// downstream code, does not correctly handle overflow. We cap string fields +// more tightly than strictly necessary to fit in |int|. This is not expected to +// impact real world uses of this field. 
+// +// In particular, this limit is small enough that the bit count of a BIT STRING +// comfortably fits in an |int|, with room for arithmetic. +#define ASN1_STRING_MAX (64 * 1024 * 1024) + +static void asn1_put_length(unsigned char **pp, int length); + +int ASN1_get_object(const unsigned char **inp, long *out_len, int *out_tag, + int *out_class, long in_len) { + if (in_len < 0) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_HEADER_TOO_LONG); + return 0x80; + } + + CBS_ASN1_TAG tag; + CBS cbs, body; + CBS_init(&cbs, *inp, (size_t)in_len); + if (!CBS_get_any_asn1(&cbs, &body, &tag) || + // Bound the length to comfortably fit in an int. Lengths in this + // module often switch between int and long without overflow checks. + CBS_len(&body) > INT_MAX / 2) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_HEADER_TOO_LONG); + return 0x80; + } + + // Convert between tag representations. + int tag_class = (tag & CBS_ASN1_CLASS_MASK) >> CBS_ASN1_TAG_SHIFT; + int constructed = (tag & CBS_ASN1_CONSTRUCTED) >> CBS_ASN1_TAG_SHIFT; + int tag_number = tag & CBS_ASN1_TAG_NUMBER_MASK; + + // To avoid ambiguity with V_ASN1_NEG, impose a limit on universal tags. + if (tag_class == V_ASN1_UNIVERSAL && tag_number > V_ASN1_MAX_UNIVERSAL) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_HEADER_TOO_LONG); + return 0x80; + } + + *inp = CBS_data(&body); + *out_len = CBS_len(&body); + *out_tag = tag_number; + *out_class = tag_class; + return constructed; +} + +// class 0 is constructed constructed == 2 for indefinite length constructed +void ASN1_put_object(unsigned char **pp, int constructed, int length, int tag, + int xclass) { + unsigned char *p = *pp; + int i, ttag; + + i = (constructed) ? 
V_ASN1_CONSTRUCTED : 0; + i |= (xclass & V_ASN1_PRIVATE); + if (tag < 31) { + *(p++) = i | (tag & V_ASN1_PRIMITIVE_TAG); + } else { + *(p++) = i | V_ASN1_PRIMITIVE_TAG; + for (i = 0, ttag = tag; ttag > 0; i++) { + ttag >>= 7; + } + ttag = i; + while (i-- > 0) { + p[i] = tag & 0x7f; + if (i != (ttag - 1)) { + p[i] |= 0x80; + } + tag >>= 7; + } + p += ttag; + } + if (constructed == 2) { + *(p++) = 0x80; + } else { + asn1_put_length(&p, length); + } + *pp = p; +} + +int ASN1_put_eoc(unsigned char **pp) { + // This function is no longer used in the library, but some external code + // uses it. + unsigned char *p = *pp; + *p++ = 0; + *p++ = 0; + *pp = p; + return 2; +} + +static void asn1_put_length(unsigned char **pp, int length) { + unsigned char *p = *pp; + int i, l; + if (length <= 127) { + *(p++) = (unsigned char)length; + } else { + l = length; + for (i = 0; l > 0; i++) { + l >>= 8; + } + *(p++) = i | 0x80; + l = i; + while (i-- > 0) { + p[i] = length & 0xff; + length >>= 8; + } + p += l; + } + *pp = p; +} + +int ASN1_object_size(int constructed, int length, int tag) { + int ret = 1; + if (length < 0) { + return -1; + } + if (tag >= 31) { + while (tag > 0) { + tag >>= 7; + ret++; + } + } + if (constructed == 2) { + ret += 3; + } else { + ret++; + if (length > 127) { + int tmplen = length; + while (tmplen > 0) { + tmplen >>= 8; + ret++; + } + } + } + if (ret >= INT_MAX - length) { + return -1; + } + return ret + length; +} + +int ASN1_STRING_copy(ASN1_STRING *dst, const ASN1_STRING *str) { + if (str == nullptr) { + return 0; + } + if (dst == str) { + return 1; + } + if (!ASN1_STRING_set(dst, str->data, str->length)) { + return 0; + } + dst->type = str->type; + dst->flags = str->flags; + return 1; +} + +ASN1_STRING *ASN1_STRING_dup(const ASN1_STRING *str) { + ASN1_STRING *ret; + if (!str) { + return nullptr; + } + ret = ASN1_STRING_new(); + if (!ret) { + return nullptr; + } + if (!ASN1_STRING_copy(ret, str)) { + ASN1_STRING_free(ret); + return nullptr; + } + return 
ret; +} + +int ASN1_STRING_set(ASN1_STRING *str, const void *_data, ossl_ssize_t len_s) { + const char *data = reinterpret_cast(_data); + size_t len; + if (len_s < 0) { + if (data == nullptr) { + return 0; + } + len = strlen(data); + } else { + len = (size_t)len_s; + } + + static_assert(ASN1_STRING_MAX < INT_MAX, "len will not overflow int"); + if (len > ASN1_STRING_MAX) { + OPENSSL_PUT_ERROR(ASN1, ERR_R_OVERFLOW); + return 0; + } + + if (str->length <= (int)len || str->data == nullptr) { + unsigned char *c = str->data; + if (c == nullptr) { + str->data = reinterpret_cast(OPENSSL_malloc(len + 1)); + } else { + str->data = reinterpret_cast(OPENSSL_realloc(c, len + 1)); + } + + if (str->data == nullptr) { + str->data = c; + return 0; + } + } + str->length = (int)len; + str->flags &= ~0x07; // Clear unused bits if this is a BIT STRING. + if (data != nullptr) { + OPENSSL_memcpy(str->data, data, len); + // Historically, OpenSSL would NUL-terminate most (but not all) + // |ASN1_STRING|s, in case anyone accidentally passed |str->data| into a + // function expecting a C string. We retain this behavior for compatibility, + // but code must not rely on this. See CVE-2021-3712. + str->data[len] = '\0'; + } + return 1; +} + +void ASN1_STRING_set0(ASN1_STRING *str, void *data, int len) { + OPENSSL_free(str->data); + str->data = reinterpret_cast(data); + str->length = len; + str->flags &= ~0x07; // Clear unused bits if this is a BIT STRING. 
+} + +ASN1_STRING *ASN1_STRING_new() { + return (ASN1_STRING_type_new(V_ASN1_OCTET_STRING)); +} + +ASN1_STRING *ASN1_STRING_type_new(int type) { + ASN1_STRING *ret = New(); + if (ret == nullptr) { + return nullptr; + } + ret->length = 0; + ret->type = type; + ret->data = nullptr; + ret->flags = 0; + return ret; +} + +void bssl::asn1_string_init(ASN1_STRING *str, int type) { + OPENSSL_memset(str, 0, sizeof(ASN1_STRING)); + str->type = type; +} + +void bssl::asn1_string_cleanup(ASN1_STRING *str) { + OPENSSL_free(str->data); + str->data = nullptr; +} + +void ASN1_STRING_free(ASN1_STRING *str) { + if (str == nullptr) { + return; + } + asn1_string_cleanup(str); + Delete(str); +} + +int ASN1_STRING_cmp(const ASN1_STRING *a, const ASN1_STRING *b) { + uint8_t a_padding = 0, b_padding = 0; + if (a->type == V_ASN1_BIT_STRING) { + a_padding = ASN1_BIT_STRING_unused_bits(a); + } + if (b->type == V_ASN1_BIT_STRING) { + b_padding = ASN1_BIT_STRING_unused_bits(b); + } + + if (a->length < b->length) { + return -1; + } + if (a->length > b->length) { + return 1; + } + // In a BIT STRING, the number of bits is 8 * length - padding. Invert this + // comparison so we compare by lengths. + if (a_padding > b_padding) { + return -1; + } + if (a_padding < b_padding) { + return 1; + } + + int ret = OPENSSL_memcmp(a->data, b->data, a->length); + if (ret != 0) { + return ret; + } + + // Comparing the type first is more natural, but this matches OpenSSL. + if (a->type < b->type) { + return -1; + } + if (a->type > b->type) { + return 1; + } + return 0; +} + +int ASN1_STRING_length(const ASN1_STRING *str) { return str->length; } + +int ASN1_STRING_type(const ASN1_STRING *str) { return str->type; } + +unsigned char *ASN1_STRING_data(ASN1_STRING *str) { return str->data; } + +const unsigned char *ASN1_STRING_get0_data(const ASN1_STRING *str) { + return str->data; +} + +int bssl::asn1_parse_octet_string(CBS *cbs, ASN1_STRING *out, + CBS_ASN1_TAG tag) { + tag = tag == 0 ? 
CBS_ASN1_OCTETSTRING : tag; + CBS child; + if (!CBS_get_asn1(cbs, &child, tag)) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_DECODE_ERROR); + return 0; + } + if (!ASN1_STRING_set(out, CBS_data(&child), CBS_len(&child))) { + return 0; + } + out->type = V_ASN1_OCTET_STRING; + return 1; +} + +int bssl::asn1_marshal_octet_string(CBB *out, const ASN1_STRING *in, + CBS_ASN1_TAG tag) { + tag = tag == 0 ? CBS_ASN1_OCTETSTRING : tag; + return CBB_add_asn1_element(out, tag, ASN1_STRING_get0_data(in), + ASN1_STRING_length(in)); +} + +static int asn1_parse_character_string(CBS *cbs, ASN1_STRING *out, + CBS_ASN1_TAG tag, int str_type, + int (*get_char)(CBS *cbs, uint32_t *), + int bad_char_err) { + CBS child; + if (!CBS_get_asn1(cbs, &child, tag)) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_DECODE_ERROR); + return 0; + } + CBS copy = child; + while (CBS_len(©) != 0) { + uint32_t c; + if (!get_char(©, &c)) { + OPENSSL_PUT_ERROR(ASN1, bad_char_err); + return 0; + } + } + if (!ASN1_STRING_set(out, CBS_data(&child), CBS_len(&child))) { + return 0; + } + out->type = str_type; + return 1; +} + +int bssl::asn1_parse_bmp_string(CBS *cbs, ASN1_BMPSTRING *out, + CBS_ASN1_TAG tag) { + tag = tag == 0 ? CBS_ASN1_BMPSTRING : tag; + return asn1_parse_character_string(cbs, out, tag, V_ASN1_BMPSTRING, + &CBS_get_ucs2_be, + ASN1_R_INVALID_BMPSTRING); +} + +int bssl::asn1_parse_universal_string(CBS *cbs, ASN1_UNIVERSALSTRING *out, + CBS_ASN1_TAG tag) { + tag = tag == 0 ? CBS_ASN1_UNIVERSALSTRING : tag; + return asn1_parse_character_string(cbs, out, tag, V_ASN1_UNIVERSALSTRING, + &CBS_get_utf32_be, + ASN1_R_INVALID_UNIVERSALSTRING); +} + +int bssl::asn1_parse_utf8_string(CBS *cbs, ASN1_UNIVERSALSTRING *out, + CBS_ASN1_TAG tag) { + tag = tag == 0 ? 
CBS_ASN1_UTF8STRING : tag; + return asn1_parse_character_string(cbs, out, tag, V_ASN1_UTF8STRING, + &CBS_get_utf8, ASN1_R_INVALID_UTF8STRING); +} diff --git a/third_party/boringssl/src/crypto/asn1/asn1_par.c b/third_party/boringssl/src/crypto/asn1/asn1_par.c deleted file mode 100644 index d065e200..00000000 --- a/third_party/boringssl/src/crypto/asn1/asn1_par.c +++ /dev/null @@ -1,103 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. 
All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
*/ - -#include - - -const char *ASN1_tag2str(int tag) { - static const char *const tag2str[] = { - "EOC", - "BOOLEAN", - "INTEGER", - "BIT STRING", - "OCTET STRING", - "NULL", - "OBJECT", - "OBJECT DESCRIPTOR", - "EXTERNAL", - "REAL", - "ENUMERATED", - "", - "UTF8STRING", - "", - "", - "", - "SEQUENCE", - "SET", - "NUMERICSTRING", - "PRINTABLESTRING", - "T61STRING", - "VIDEOTEXSTRING", - "IA5STRING", - "UTCTIME", - "GENERALIZEDTIME", - "GRAPHICSTRING", - "VISIBLESTRING", - "GENERALSTRING", - "UNIVERSALSTRING", - "", - "BMPSTRING", - }; - - if ((tag == V_ASN1_NEG_INTEGER) || (tag == V_ASN1_NEG_ENUMERATED)) { - tag &= ~V_ASN1_NEG; - } - - if (tag < 0 || tag > 30) { - return "(unknown)"; - } - return tag2str[tag]; -} diff --git a/third_party/boringssl/src/crypto/asn1/asn1_par.cc b/third_party/boringssl/src/crypto/asn1/asn1_par.cc new file mode 100644 index 00000000..7767ccad --- /dev/null +++ b/third_party/boringssl/src/crypto/asn1/asn1_par.cc @@ -0,0 +1,61 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + + +const char *ASN1_tag2str(int tag) { + static const char *const tag2str[] = { + "EOC", + "BOOLEAN", + "INTEGER", + "BIT STRING", + "OCTET STRING", + "NULL", + "OBJECT", + "OBJECT DESCRIPTOR", + "EXTERNAL", + "REAL", + "ENUMERATED", + "", + "UTF8STRING", + "", + "", + "", + "SEQUENCE", + "SET", + "NUMERICSTRING", + "PRINTABLESTRING", + "T61STRING", + "VIDEOTEXSTRING", + "IA5STRING", + "UTCTIME", + "GENERALIZEDTIME", + "GRAPHICSTRING", + "VISIBLESTRING", + "GENERALSTRING", + "UNIVERSALSTRING", + "", + "BMPSTRING", + }; + + if ((tag == V_ASN1_NEG_INTEGER) || (tag == V_ASN1_NEG_ENUMERATED)) { + tag &= ~V_ASN1_NEG; + } + + if (tag < 0 || tag > 30) { + return "(unknown)"; + } + return tag2str[tag]; +} diff --git a/third_party/boringssl/src/crypto/asn1/asn_pack.c b/third_party/boringssl/src/crypto/asn1/asn_pack.c deleted file mode 100644 index 069cef0b..00000000 --- a/third_party/boringssl/src/crypto/asn1/asn_pack.c +++ /dev/null @@ -1,99 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. 
- * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include -#include - - -ASN1_STRING *ASN1_item_pack(void *obj, const ASN1_ITEM *it, ASN1_STRING **out) { - uint8_t *new_data = NULL; - int len = ASN1_item_i2d(obj, &new_data, it); - if (len <= 0) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_ENCODE_ERROR); - return NULL; - } - - ASN1_STRING *ret = NULL; - if (out == NULL || *out == NULL) { - ret = ASN1_STRING_new(); - if (ret == NULL) { - OPENSSL_PUT_ERROR(ASN1, ERR_R_MALLOC_FAILURE); - OPENSSL_free(new_data); - return NULL; - } - } else { - ret = *out; - } - - ASN1_STRING_set0(ret, new_data, len); - if (out != NULL) { - *out = ret; - } - return ret; -} - -void *ASN1_item_unpack(const ASN1_STRING *oct, const ASN1_ITEM *it) { - const unsigned char *p = oct->data; - void *ret = ASN1_item_d2i(NULL, &p, oct->length, it); - if (ret == NULL || p != oct->data + oct->length) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_DECODE_ERROR); - ASN1_item_free(ret, it); - return NULL; - } - return ret; -} diff --git a/third_party/boringssl/src/crypto/asn1/asn_pack.cc b/third_party/boringssl/src/crypto/asn1/asn_pack.cc new file mode 100644 index 00000000..e2d85f91 --- /dev/null +++ b/third_party/boringssl/src/crypto/asn1/asn_pack.cc @@ -0,0 +1,56 @@ +// Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + + +ASN1_STRING *ASN1_item_pack(void *obj, const ASN1_ITEM *it, ASN1_STRING **out) { + uint8_t *new_data = nullptr; + int len = ASN1_item_i2d(reinterpret_cast(obj), &new_data, it); + if (len <= 0) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_ENCODE_ERROR); + return nullptr; + } + + ASN1_STRING *ret = nullptr; + if (out == nullptr || *out == nullptr) { + ret = ASN1_STRING_new(); + if (ret == nullptr) { + OPENSSL_free(new_data); + return nullptr; + } + } else { + ret = *out; + } + + ASN1_STRING_set0(ret, new_data, len); + if (out != nullptr) { + *out = ret; + } + return ret; +} + +void *ASN1_item_unpack(const ASN1_STRING *oct, const ASN1_ITEM *it) { + const unsigned char *p = oct->data; + void *ret = ASN1_item_d2i(nullptr, &p, oct->length, it); + if (ret == nullptr || p != oct->data + oct->length) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_DECODE_ERROR); + ASN1_item_free(reinterpret_cast(ret), it); + return nullptr; + } + return ret; +} diff --git a/third_party/boringssl/src/crypto/asn1/f_int.c b/third_party/boringssl/src/crypto/asn1/f_int.c deleted file mode 100644 index 047fcacf..00000000 --- a/third_party/boringssl/src/crypto/asn1/f_int.c +++ /dev/null @@ -1,105 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). 
- * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include - -int i2a_ASN1_INTEGER(BIO *bp, const ASN1_INTEGER *a) { - int i, n = 0; - static const char *h = "0123456789ABCDEF"; - char buf[2]; - - if (a == NULL) { - return 0; - } - - if (a->type & V_ASN1_NEG) { - if (BIO_write(bp, "-", 1) != 1) { - goto err; - } - n = 1; - } - - if (a->length == 0) { - if (BIO_write(bp, "00", 2) != 2) { - goto err; - } - n += 2; - } else { - for (i = 0; i < a->length; i++) { - if ((i != 0) && (i % 35 == 0)) { - if (BIO_write(bp, "\\\n", 2) != 2) { - goto err; - } - n += 2; - } - buf[0] = h[((unsigned char)a->data[i] >> 4) & 0x0f]; - buf[1] = h[((unsigned char)a->data[i]) & 0x0f]; - if (BIO_write(bp, buf, 2) != 2) { - goto err; - } - n += 2; - } - } - return n; -err: - return -1; -} - -int i2a_ASN1_ENUMERATED(BIO *bp, const ASN1_ENUMERATED *a) { - return i2a_ASN1_INTEGER(bp, a); -} diff --git a/third_party/boringssl/src/crypto/asn1/f_int.cc b/third_party/boringssl/src/crypto/asn1/f_int.cc new file mode 100644 index 00000000..2774d177 --- /dev/null +++ b/third_party/boringssl/src/crypto/asn1/f_int.cc @@ -0,0 +1,63 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +int i2a_ASN1_INTEGER(BIO *bp, const ASN1_INTEGER *a) { + int i, n = 0; + static const char *h = "0123456789ABCDEF"; + char buf[2]; + + if (a == nullptr) { + return 0; + } + + if (a->type & V_ASN1_NEG) { + if (BIO_write(bp, "-", 1) != 1) { + goto err; + } + n = 1; + } + + if (a->length == 0) { + if (BIO_write(bp, "00", 2) != 2) { + goto err; + } + n += 2; + } else { + for (i = 0; i < a->length; i++) { + if ((i != 0) && (i % 35 == 0)) { + if (BIO_write(bp, "\\\n", 2) != 2) { + goto err; + } + n += 2; + } + buf[0] = h[((unsigned char)a->data[i] >> 4) & 0x0f]; + buf[1] = h[((unsigned char)a->data[i]) & 0x0f]; + if (BIO_write(bp, buf, 2) != 2) { + goto err; + } + n += 2; + } + } + return n; +err: + return -1; +} + +int i2a_ASN1_ENUMERATED(BIO *bp, const ASN1_ENUMERATED *a) { + return i2a_ASN1_INTEGER(bp, a); +} diff --git a/third_party/boringssl/src/crypto/asn1/f_string.c b/third_party/boringssl/src/crypto/asn1/f_string.c deleted file mode 100644 index 4bc81107..00000000 --- a/third_party/boringssl/src/crypto/asn1/f_string.c +++ /dev/null @@ -1,94 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. 
- * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. 
If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
*/ - -#include - -#include - -int i2a_ASN1_STRING(BIO *bp, const ASN1_STRING *a, int type) { - int i, n = 0; - static const char *h = "0123456789ABCDEF"; - char buf[2]; - - if (a == NULL) { - return 0; - } - - if (a->length == 0) { - if (BIO_write(bp, "0", 1) != 1) { - goto err; - } - n = 1; - } else { - for (i = 0; i < a->length; i++) { - if ((i != 0) && (i % 35 == 0)) { - if (BIO_write(bp, "\\\n", 2) != 2) { - goto err; - } - n += 2; - } - buf[0] = h[((unsigned char)a->data[i] >> 4) & 0x0f]; - buf[1] = h[((unsigned char)a->data[i]) & 0x0f]; - if (BIO_write(bp, buf, 2) != 2) { - goto err; - } - n += 2; - } - } - return n; -err: - return -1; -} diff --git a/third_party/boringssl/src/crypto/asn1/f_string.cc b/third_party/boringssl/src/crypto/asn1/f_string.cc new file mode 100644 index 00000000..38cc7c29 --- /dev/null +++ b/third_party/boringssl/src/crypto/asn1/f_string.cc @@ -0,0 +1,52 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include + +int i2a_ASN1_STRING(BIO *bp, const ASN1_STRING *a, int type) { + int i, n = 0; + static const char *h = "0123456789ABCDEF"; + char buf[2]; + + if (a == nullptr) { + return 0; + } + + if (a->length == 0) { + if (BIO_write(bp, "0", 1) != 1) { + goto err; + } + n = 1; + } else { + for (i = 0; i < a->length; i++) { + if ((i != 0) && (i % 35 == 0)) { + if (BIO_write(bp, "\\\n", 2) != 2) { + goto err; + } + n += 2; + } + buf[0] = h[((unsigned char)a->data[i] >> 4) & 0x0f]; + buf[1] = h[((unsigned char)a->data[i]) & 0x0f]; + if (BIO_write(bp, buf, 2) != 2) { + goto err; + } + n += 2; + } + } + return n; +err: + return -1; +} diff --git a/third_party/boringssl/src/crypto/asn1/internal.h b/third_party/boringssl/src/crypto/asn1/internal.h index 3d78dd63..970de107 100644 --- a/third_party/boringssl/src/crypto/asn1/internal.h +++ b/third_party/boringssl/src/crypto/asn1/internal.h @@ -1,102 +1,41 @@ -/* - * Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL project - * 2006. - */ -/* ==================================================================== - * Copyright (c) 2006 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" - * - * 4. 
The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * licensing@OpenSSL.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). - * - */ - -#ifndef OPENSSL_HEADER_ASN1_ASN1_LOCL_H -#define OPENSSL_HEADER_ASN1_ASN1_LOCL_H +// Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_CRYPTO_ASN1_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_ASN1_INTERNAL_H #include #include #include -#if defined(__cplusplus) -extern "C" { -#endif +BSSL_NAMESPACE_BEGIN // Wrapper functions for time functions. -// OPENSSL_posix_to_tm converts a int64_t POSIX time value in |time| whuch must -// be in the range of year 0000 to 9999 to a broken out time value in |tm|. It -// returns one on success and zero on error. -OPENSSL_EXPORT int OPENSSL_posix_to_tm(int64_t time, struct tm *out_tm); - -// OPENSSL_tm_to_posix converts a time value between the years 0 and 9999 in -// |tm| to a POSIX time value in |out|. One is returned on success, zero is -// returned on failure. It is a failure if the tm contains out of range values. -OPENSSL_EXPORT int OPENSSL_tm_to_posix(const struct tm *tm, int64_t *out); - // OPENSSL_gmtime converts a time_t value in |time| which must be in the range // of year 0000 to 9999 to a broken out time value in |tm|. On success |tm| is // returned. On failure NULL is returned. OPENSSL_EXPORT struct tm *OPENSSL_gmtime(const time_t *time, struct tm *result); -// OPENSSL_timegm converts a time value between the years 0 and 9999 in |tm| to -// a time_t value in |out|. One is returned on success, zero is returned on -// failure. It is a failure if the converted time can not be represented in a -// time_t, or if the tm contains out of range values. 
-OPENSSL_EXPORT int OPENSSL_timegm(const struct tm *tm, time_t *out); - // OPENSSL_gmtime_adj returns one on success, and updates |tm| by adding // |offset_day| days and |offset_sec| seconds. It returns zero on failure. |tm| // must be in the range of year 0000 to 9999 both before and after the update or // a failure will be returned. -int OPENSSL_gmtime_adj(struct tm *tm, int offset_day, long offset_sec); +OPENSSL_EXPORT int OPENSSL_gmtime_adj(struct tm *tm, int offset_day, + int64_t offset_sec); // OPENSSL_gmtime_diff calculates the difference between |from| and |to|. It // returns one, and outputs the difference as a number of days and seconds in @@ -107,7 +46,8 @@ OPENSSL_EXPORT int OPENSSL_gmtime_diff(int *out_days, int *out_secs, const struct tm *from, const struct tm *to); -// Internal ASN1 structures and functions: not for application use + +// Object identifiers. // These are used internally in the ASN1_OBJECT to keep track of // whether the names and data need to be free()ed @@ -115,6 +55,8 @@ OPENSSL_EXPORT int OPENSSL_gmtime_diff(int *out_days, int *out_secs, #define ASN1_OBJECT_FLAG_DYNAMIC_STRINGS 0x04 // internal use #define ASN1_OBJECT_FLAG_DYNAMIC_DATA 0x08 // internal use +BSSL_NAMESPACE_END + // An asn1_object_st (aka |ASN1_OBJECT|) represents an ASN.1 OBJECT IDENTIFIER. // Note: Mutating an |ASN1_OBJECT| is only permitted when initializing it. The // library maintains a table of static |ASN1_OBJECT|s, which may be referenced @@ -129,42 +71,153 @@ struct asn1_object_st { int flags; // Should we free this one }; -ASN1_OBJECT *ASN1_OBJECT_new(void); +BSSL_NAMESPACE_BEGIN -// ASN1_ENCODING structure: this is used to save the received -// encoding of an ASN1 type. This is useful to get round -// problems with invalid encodings which can break signatures. 
-typedef struct ASN1_ENCODING_st { - unsigned char *enc; // DER encoding - long len; // Length of encoding - int modified; // set to 1 if 'enc' is invalid - // alias_only is zero if |enc| owns the buffer that it points to - // (although |enc| may still be NULL). If one, |enc| points into a - // buffer that is owned elsewhere. - unsigned alias_only : 1; - // alias_only_on_next_parse is one iff the next parsing operation - // should avoid taking a copy of the input and rather set - // |alias_only|. - unsigned alias_only_on_next_parse : 1; -} ASN1_ENCODING; +ASN1_OBJECT *ASN1_OBJECT_new(); + +// asn1_parse_object parses a DER-encoded ASN.1 OBJECT IDENTIFIER from |cbs| and +// write the result to |out|. If |tag| is non-zero, the value is implicitly +// tagged with |tag|. On success, it returns a newly-allocated |ASN1_OBJECT| +// with the result and advances |cbs| past the parsed element. +// +// TODO(crbug.com/boringssl/414361735): This should return a bssl::UniquePtr, +// but cannot until it is made C++ linkage. +ASN1_OBJECT *asn1_parse_object(CBS *cbs, CBS_ASN1_TAG tag); + +// asn1_marshal_object marshals |in| as a DER-encoded, ASN.1 OBJECT IDENTIFIER +// and writes the result to |out|. It returns one on success and zero on error. +// If |tag| is non-zero, the tag is replaced with |tag|. +int asn1_marshal_object(CBB *out, const ASN1_OBJECT *in, CBS_ASN1_TAG tag); + + +// Strings. + +// asn1_is_printable returns one if |value| is a valid Unicode codepoint for an +// ASN.1 PrintableString, and zero otherwise. +int asn1_is_printable(uint32_t value); + +// asn1_string_init initializes |str|, which may be uninitialized, with type +// |type|. +void asn1_string_init(ASN1_STRING *str, int type); + +// asn1_string_cleanup releases memory associated with |str|'s value, without +// freeing |str| itself. +void asn1_string_cleanup(ASN1_STRING *str); + +// The following functions parse a DER-encoded ASN.1 value of the specified +// type from |cbs| and write the result to |*out|. 
If |tag| is non-zero, the +// value is implicitly tagged with |tag|. On success, they return one and +// advance |cbs| past the parsed element. On entry, |*out| must contain an +// |ASN1_STRING| in some valid state. +int asn1_parse_bit_string(CBS *cbs, ASN1_BIT_STRING *out, CBS_ASN1_TAG tag); +int asn1_parse_integer(CBS *cbs, ASN1_INTEGER *out, CBS_ASN1_TAG tag); +int asn1_parse_enumerated(CBS *cbs, ASN1_ENUMERATED *out, CBS_ASN1_TAG tag); +int asn1_parse_octet_string(CBS *cbs, ASN1_STRING *out, CBS_ASN1_TAG tag); +int asn1_parse_bmp_string(CBS *cbs, ASN1_BMPSTRING *out, CBS_ASN1_TAG tag); +int asn1_parse_universal_string(CBS *cbs, ASN1_UNIVERSALSTRING *out, + CBS_ASN1_TAG tag); +int asn1_parse_utf8_string(CBS *cbs, ASN1_UNIVERSALSTRING *out, + CBS_ASN1_TAG tag); +int asn1_parse_generalized_time(CBS *cbs, ASN1_GENERALIZEDTIME *out, + CBS_ASN1_TAG tag); +int asn1_parse_utc_time(CBS *cbs, ASN1_UTCTIME *out, CBS_ASN1_TAG tag, + int allow_timezone_offset); + +// asn1_parse_bit_string_with_bad_length behaves like |asn1_parse_bit_string| +// but tolerates BER non-minimal, definite lengths. +int asn1_parse_bit_string_with_bad_length(CBS *cbs, ASN1_BIT_STRING *out); + +// asn1_marshal_bit_string marshals |in| as a DER-encoded, ASN.1 BIT STRING and +// writes the result to |out|. It returns one on success and zero on error. If +// |tag| is non-zero, the tag is replaced with |tag|. +int asn1_marshal_bit_string(CBB *out, const ASN1_BIT_STRING *in, + CBS_ASN1_TAG tag); + +// asn1_marshal_integer marshals |in| as a DER-encoded, ASN.1 INTEGER and writes +// the result to |out|. It returns one on success and zero on error. If |tag| is +// non-zero, the tag is replaced with |tag|. This can also be used to marshal an +// ASN.1 ENUMERATED value by overriding the tag. +int asn1_marshal_integer(CBB *out, const ASN1_INTEGER *in, CBS_ASN1_TAG tag); + +// asn1_marshal_octet_string marshals |in| as a DER-encoded, ASN.1 OCTET STRING +// and writes the result to |out|. 
It returns one on success and zero on error. +// If |tag| is non-zero, the tag is replaced with |tag|. +// +// This function may be used to marshal other string-based universal types whose +// encoding is that of an implicitly-tagged OCTET STRING, e.g. UTF8String. +int asn1_marshal_octet_string(CBB *out, const ASN1_STRING *in, + CBS_ASN1_TAG tag); OPENSSL_EXPORT int asn1_utctime_to_tm(struct tm *tm, const ASN1_UTCTIME *d, int allow_timezone_offset); OPENSSL_EXPORT int asn1_generalizedtime_to_tm(struct tm *tm, const ASN1_GENERALIZEDTIME *d); -void asn1_item_combine_free(ASN1_VALUE **pval, const ASN1_ITEM *it, - int combine); +int asn1_parse_time(CBS *cbs, ASN1_TIME *out, int allow_utc_timezone_offset); +int asn1_marshal_time(CBB *cbb, const ASN1_TIME *in); + -int UTF8_putc(unsigned char *str, int len, uint32_t value); +// The ASN.1 ANY type. + +// asn1_type_value_as_pointer returns |a|'s value in pointer form. This is +// usually the value object but, for BOOLEAN values, is 0 or 0xff cast to +// a pointer. +const void *asn1_type_value_as_pointer(const ASN1_TYPE *a); + +// asn1_type_set0_string sets |a|'s value to the object represented by |str| and +// takes ownership of |str|. +void asn1_type_set0_string(ASN1_TYPE *a, ASN1_STRING *str); + +// asn1_type_cleanup releases memory associated with |a|'s value, without +// freeing |a| itself. +void asn1_type_cleanup(ASN1_TYPE *a); + +// asn1_parse_any parses a DER-encoded ASN.1 value of any type from |cbs| and +// writes the result to |*out|. On success, it advances |cbs| past the parsed +// element and returns one. On entry, |*out| must contain an |ASN1_TYPE| in some +// valid state. +int asn1_parse_any(CBS *cbs, ASN1_TYPE *out); + +// asn1_parse_any_as_string behaves like |asn1_parse_any| but represents the +// value as an |ASN1_STRING|. Types which are not represented with +// |ASN1_STRING|, such as |ASN1_OBJECT|, are represented with type +// |V_ASN1_OTHER|. 
+int asn1_parse_any_as_string(CBS *cbs, ASN1_STRING *out); + +// asn1_marshal_any marshals |in| as a DER-encoded ASN.1 value and writes the +// result to |out|. It returns one on success and zeron on error. +int asn1_marshal_any(CBB *out, const ASN1_TYPE *in); + +// asn1_marshal_any_string marshals |in| as a DER-encoded ASN.1 value and writes +// the result to |out|. It returns one on success and zeron on error. +int asn1_marshal_any_string(CBB *out, const ASN1_STRING *in); + + +// Support structures for the template-based encoder. + +// ASN1_ENCODING is used to save the received encoding of an ASN.1 type. This +// avoids problems with invalid encodings that break signatures. +typedef struct ASN1_ENCODING_st { + // enc is the saved DER encoding. Its ownership is determined by |buf|. + uint8_t *enc; + // len is the length of |enc|. If zero, there is no saved encoding. + size_t len; +} ASN1_ENCODING; int ASN1_item_ex_new(ASN1_VALUE **pval, const ASN1_ITEM *it); void ASN1_item_ex_free(ASN1_VALUE **pval, const ASN1_ITEM *it); void ASN1_template_free(ASN1_VALUE **pval, const ASN1_TEMPLATE *tt); + +// ASN1_item_ex_d2i parses |len| bytes from |*in| as a structure of type |it| +// and writes the result to |*pval|. If |tag| is non-negative, |it| is +// implicitly tagged with the tag specified by |tag| and |aclass|. If |opt| is +// non-zero, the value is optional. +// +// This function returns one and advances |*in| if an object was successfully +// parsed, -1 if an optional value was successfully skipped, and zero on error. int ASN1_item_ex_d2i(ASN1_VALUE **pval, const unsigned char **in, long len, - const ASN1_ITEM *it, int tag, int aclass, char opt, - ASN1_TLC *ctx); + const ASN1_ITEM *it, int tag, int aclass, char opt); // ASN1_item_ex_i2d encodes |*pval| as a value of type |it| to |out| under the // i2d output convention. 
It returns a non-zero length on success and -1 on @@ -206,25 +259,14 @@ void asn1_enc_free(ASN1_VALUE **pval, const ASN1_ITEM *it); int asn1_enc_restore(int *len, unsigned char **out, ASN1_VALUE **pval, const ASN1_ITEM *it); -int asn1_enc_save(ASN1_VALUE **pval, const unsigned char *in, int inlen, +// asn1_enc_save saves |inlen| bytes from |in| as |*pval|'s saved encoding. It +// returns one on success and zero on error. If |buf| is non-NULL, |in| must +// point into |buf|. +int asn1_enc_save(ASN1_VALUE **pval, const uint8_t *in, size_t inlen, const ASN1_ITEM *it); -// asn1_type_value_as_pointer returns |a|'s value in pointer form. This is -// usually the value object but, for BOOLEAN values, is 0 or 0xff cast to -// a pointer. -const void *asn1_type_value_as_pointer(const ASN1_TYPE *a); - -// asn1_is_printable returns one if |value| is a valid Unicode codepoint for an -// ASN.1 PrintableString, and zero otherwise. -int asn1_is_printable(uint32_t value); - -// asn1_bit_string_length returns the number of bytes in |str| and sets -// |*out_padding_bits| to the number of padding bits. -// -// This function should be used instead of |ASN1_STRING_length| to correctly -// handle the non-|ASN1_STRING_FLAG_BITS_LEFT| case. -int asn1_bit_string_length(const ASN1_BIT_STRING *str, - uint8_t *out_padding_bits); +// asn1_encoding_clear clears the cached encoding in |enc|. 
+void asn1_encoding_clear(ASN1_ENCODING *enc); typedef struct { int nid; @@ -239,9 +281,84 @@ typedef struct { OPENSSL_EXPORT void asn1_get_string_table_for_testing( const ASN1_STRING_TABLE **out_ptr, size_t *out_len); +typedef ASN1_VALUE *ASN1_new_func(); +typedef void ASN1_free_func(ASN1_VALUE *a); +typedef ASN1_VALUE *ASN1_d2i_func(ASN1_VALUE **a, const unsigned char **in, + long length); +typedef int ASN1_i2d_func(ASN1_VALUE *a, unsigned char **in); -#if defined(__cplusplus) -} // extern C -#endif - -#endif // OPENSSL_HEADER_ASN1_ASN1_LOCL_H +// An ASN1_ex_parse function should parse a value from |cbs| and set |*pval| to +// the result. It should return one on success and zero on failure. If |opt| is +// non-zero, the field may be optional. If an optional element is missing, the +// function should return one and consume zero bytes from |cbs|. +// +// If |opt| is non-zero, the function can assume that |*pval| is nullptr on +// entry. Otherwise, |*pval| may either be nullptr, or the result of +// |ASN1_ex_new_func|. The function may either write into the existing object, +// if any, or unconditionally make a new one. (The existing object comes from +// tasn_new.cc recursively filling in objects before parsing into them.) 
+typedef int ASN1_ex_parse(ASN1_VALUE **pval, CBS *cbs, const ASN1_ITEM *it, + int opt); + +typedef int ASN1_ex_i2d(ASN1_VALUE **pval, unsigned char **out, + const ASN1_ITEM *it); +typedef int ASN1_ex_new_func(ASN1_VALUE **pval, const ASN1_ITEM *it); +typedef void ASN1_ex_free_func(ASN1_VALUE **pval, const ASN1_ITEM *it); + +typedef struct ASN1_EXTERN_FUNCS_st { + ASN1_ex_new_func *asn1_ex_new; + ASN1_ex_free_func *asn1_ex_free; + ASN1_ex_parse *asn1_ex_parse; + ASN1_ex_i2d *asn1_ex_i2d; +} ASN1_EXTERN_FUNCS; + +#define IMPLEMENT_EXTERN_ASN1_SIMPLE(name, new_func, free_func, tag, \ + parse_func, i2d_func) \ + static int name##_new_cb(ASN1_VALUE **pval, const ASN1_ITEM *it) { \ + *pval = (ASN1_VALUE *)new_func(); \ + return *pval != nullptr; \ + } \ + \ + static void name##_free_cb(ASN1_VALUE **pval, const ASN1_ITEM *it) { \ + free_func((name *)*pval); \ + *pval = nullptr; \ + } \ + \ + static int name##_parse_cb(ASN1_VALUE **pval, CBS *cbs, const ASN1_ITEM *it, \ + int opt) { \ + if (opt && !CBS_peek_asn1_tag(cbs, (tag))) { \ + return 1; \ + } \ + \ + if ((*pval == nullptr && !name##_new_cb(pval, it)) || \ + !parse_func(cbs, (name *)*pval)) { \ + return 0; \ + } \ + return 1; \ + } \ + \ + static int name##_i2d_cb(ASN1_VALUE **pval, unsigned char **out, \ + const ASN1_ITEM *it) { \ + return i2d_func((name *)*pval, out); \ + } \ + \ + static const ASN1_EXTERN_FUNCS name##_extern_funcs = { \ + name##_new_cb, name##_free_cb, name##_parse_cb, name##_i2d_cb}; \ + \ + IMPLEMENT_EXTERN_ASN1(name, name##_extern_funcs) + +// ASN1_TIME is an |ASN1_ITEM| whose ASN.1 type is X.509 Time (RFC 5280) and C +// type is |ASN1_TIME*|. +DECLARE_ASN1_ITEM(ASN1_TIME) + +// DIRECTORYSTRING is an |ASN1_ITEM| whose ASN.1 type is X.509 DirectoryString +// (RFC 5280) and C type is |ASN1_STRING*|. +DECLARE_ASN1_ITEM(DIRECTORYSTRING) + +// DISPLAYTEXT is an |ASN1_ITEM| whose ASN.1 type is X.509 DisplayText (RFC +// 5280) and C type is |ASN1_STRING*|. 
+DECLARE_ASN1_ITEM(DISPLAYTEXT) + +BSSL_NAMESPACE_END + +#endif // OPENSSL_HEADER_CRYPTO_ASN1_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/asn1/posix_time.c b/third_party/boringssl/src/crypto/asn1/posix_time.c deleted file mode 100644 index 81fbe833..00000000 --- a/third_party/boringssl/src/crypto/asn1/posix_time.c +++ /dev/null @@ -1,227 +0,0 @@ -/* Copyright (c) 2022, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -// Time conversion to/from POSIX time_t and struct tm, with no support -// for time zones other than UTC - -#include -#include -#include -#include -#include - -#include "internal.h" - -#define SECS_PER_HOUR (60 * 60) -#define SECS_PER_DAY (24 * SECS_PER_HOUR) - - -// Is a year/month/day combination valid, in the range from year 0000 -// to 9999? 
-static int is_valid_date(int year, int month, int day) { - if (day < 1 || month < 1 || year < 0 || year > 9999) { - return 0; - } - switch (month) { - case 1: - case 3: - case 5: - case 7: - case 8: - case 10: - case 12: - return day > 0 && day <= 31; - case 4: - case 6: - case 9: - case 11: - return day > 0 && day <= 30; - case 2: - if ((year % 4 == 0 && year % 100 != 0) || year % 400 == 0) { - return day > 0 && day <= 29; - } else { - return day > 0 && day <= 28; - } - default: - return 0; - } -} - -// Is a time valid? Leap seconds of 60 are not considered valid, as -// the POSIX time in seconds does not include them. -static int is_valid_time(int hours, int minutes, int seconds) { - if (hours < 0 || minutes < 0 || seconds < 0 || hours > 23 || minutes > 59 || - seconds > 59) { - return 0; - } - return 1; -} - -// Is a int64 time representing a time within our expected range? -static int is_valid_epoch_time(int64_t time) { - // 0000-01-01 00:00:00 UTC to 9999-12-31 23:59:59 UTC - return (int64_t)-62167219200 <= time && time <= (int64_t)253402300799; -} - -// Inspired by algorithms presented in -// https://howardhinnant.github.io/date_algorithms.html -// (Public Domain) -static int posix_time_from_utc(int year, int month, int day, int hours, - int minutes, int seconds, int64_t *out_time) { - if (!is_valid_date(year, month, day) || - !is_valid_time(hours, minutes, seconds)) { - return 0; - } - if (month <= 2) { - year--; // Start years on Mar 1, so leap days always finish a year. - } - // At this point year will be in the range -1 and 9999. - assert(-1 <= year && year <= 9999); - int64_t era = (year >= 0 ? year : year - 399) / 400; - int64_t year_of_era = year - era * 400; - int64_t day_of_year = - (153 * (month > 2 ? 
month - 3 : month + 9) + 2) / 5 + day - 1; - int64_t day_of_era = - year_of_era * 365 + year_of_era / 4 - year_of_era / 100 + day_of_year; - int64_t posix_days = era * 146097 + day_of_era - 719468; - *out_time = posix_days * SECS_PER_DAY + hours * SECS_PER_HOUR + minutes * 60 + - seconds; - return 1; -} - -// Inspired by algorithms presented in -// https://howardhinnant.github.io/date_algorithms.html -// (Public Domain) -static int utc_from_posix_time(int64_t time, int *out_year, int *out_month, - int *out_day, int *out_hours, int *out_minutes, - int *out_seconds) { - if (!is_valid_epoch_time(time)) { - return 0; - } - int64_t days = time / SECS_PER_DAY; - int64_t leftover_seconds = time % SECS_PER_DAY; - if (leftover_seconds < 0) { - days--; - leftover_seconds += SECS_PER_DAY; - } - days += 719468; // Shift to starting epoch of Mar 1 0000. - // At this point, days will be in the range -61 and 3652364. - assert(-61 <= days && days <= 3652364); - int64_t era = (days > 0 ? days : days - 146096) / 146097; - int64_t day_of_era = days - era * 146097; - int64_t year_of_era = (day_of_era - day_of_era / 1460 + day_of_era / 36524 - - day_of_era / 146096) / - 365; - *out_year = (int)(year_of_era + era * 400); // Year starting on Mar 1. - int64_t day_of_year = - day_of_era - (365 * year_of_era + year_of_era / 4 - year_of_era / 100); - int64_t month_of_year = (5 * day_of_year + 2) / 153; - *out_month = - (int)(month_of_year < 10 ? month_of_year + 3 : month_of_year - 9); - if (*out_month <= 2) { - (*out_year)++; // Adjust year back to Jan 1 start of year. 
- } - *out_day = (int)(day_of_year - (153 * month_of_year + 2) / 5 + 1); - *out_hours = (int)(leftover_seconds / SECS_PER_HOUR); - leftover_seconds %= SECS_PER_HOUR; - *out_minutes = (int)(leftover_seconds / 60); - *out_seconds = (int)(leftover_seconds % 60); - return 1; -} - -int OPENSSL_tm_to_posix(const struct tm *tm, int64_t *out) { - return posix_time_from_utc(tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday, - tm->tm_hour, tm->tm_min, tm->tm_sec, out); -} - -int OPENSSL_posix_to_tm(int64_t time, struct tm *out_tm) { - memset(out_tm, 0, sizeof(struct tm)); - if (!utc_from_posix_time(time, &out_tm->tm_year, &out_tm->tm_mon, - &out_tm->tm_mday, &out_tm->tm_hour, &out_tm->tm_min, - &out_tm->tm_sec)) { - return 0; - } - out_tm->tm_year -= 1900; - out_tm->tm_mon -= 1; - - return 1; -} - -int OPENSSL_timegm(const struct tm *tm, time_t *out) { - static_assert( - sizeof(time_t) == sizeof(int32_t) || sizeof(time_t) == sizeof(int64_t), - "time_t is broken"); - int64_t posix_time; - if (!OPENSSL_tm_to_posix(tm, &posix_time)) { - return 0; - } - if (sizeof(time_t) == sizeof(int32_t) && - (posix_time > INT32_MAX || posix_time < INT32_MIN)) { - return 0; - } - *out = (time_t)posix_time; - return 1; -} - -struct tm *OPENSSL_gmtime(const time_t *time, struct tm *out_tm) { - static_assert( - sizeof(time_t) == sizeof(int32_t) || sizeof(time_t) == sizeof(int64_t), - "time_t is broken"); - int64_t posix_time = *time; - if (!OPENSSL_posix_to_tm(posix_time, out_tm)) { - return NULL; - } - return out_tm; -} - -int OPENSSL_gmtime_adj(struct tm *tm, int off_day, long offset_sec) { - int64_t posix_time; - if (!posix_time_from_utc(tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday, - tm->tm_hour, tm->tm_min, tm->tm_sec, &posix_time)) { - return 0; - } - if (!utc_from_posix_time(posix_time + off_day * SECS_PER_DAY + offset_sec, - &tm->tm_year, &tm->tm_mon, &tm->tm_mday, - &tm->tm_hour, &tm->tm_min, &tm->tm_sec)) { - return 0; - } - tm->tm_year -= 1900; - tm->tm_mon -= 1; - - return 1; -} - 
-int OPENSSL_gmtime_diff(int *out_days, int *out_secs, const struct tm *from, - const struct tm *to) { - int64_t time_to; - if (!posix_time_from_utc(to->tm_year + 1900, to->tm_mon + 1, to->tm_mday, - to->tm_hour, to->tm_min, to->tm_sec, &time_to)) { - return 0; - } - int64_t time_from; - if (!posix_time_from_utc(from->tm_year + 1900, from->tm_mon + 1, - from->tm_mday, from->tm_hour, from->tm_min, - from->tm_sec, &time_from)) { - return 0; - } - int64_t timediff = time_to - time_from; - int64_t daydiff = timediff / SECS_PER_DAY; - timediff %= SECS_PER_DAY; - if (daydiff > INT_MAX || daydiff < INT_MIN) { - return 0; - } - *out_secs = (int)timediff; - *out_days = (int)daydiff; - return 1; -} diff --git a/third_party/boringssl/src/crypto/asn1/posix_time.cc b/third_party/boringssl/src/crypto/asn1/posix_time.cc new file mode 100644 index 00000000..d6162a62 --- /dev/null +++ b/third_party/boringssl/src/crypto/asn1/posix_time.cc @@ -0,0 +1,244 @@ +// Copyright 2022 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Time conversion to/from POSIX time_t and struct tm, with no support +// for time zones other than UTC + +#include + +#include +#include +#include +#include +#include + +#include "internal.h" + +#define SECS_PER_HOUR (60 * 60) +#define SECS_PER_DAY (INT64_C(24) * SECS_PER_HOUR) + + +using namespace bssl; + +// Is a year/month/day combination valid, in the range from year 0000 +// to 9999? 
+static int is_valid_date(int64_t year, int64_t month, int64_t day) { + if (day < 1 || month < 1 || year < 0 || year > 9999) { + return 0; + } + switch (month) { + case 1: + case 3: + case 5: + case 7: + case 8: + case 10: + case 12: + return day > 0 && day <= 31; + case 4: + case 6: + case 9: + case 11: + return day > 0 && day <= 30; + case 2: + if ((year % 4 == 0 && year % 100 != 0) || year % 400 == 0) { + return day > 0 && day <= 29; + } else { + return day > 0 && day <= 28; + } + default: + return 0; + } +} + +// Is a time valid? Leap seconds of 60 are not considered valid, as +// the POSIX time in seconds does not include them. +static int is_valid_time(int64_t hours, int64_t minutes, int64_t seconds) { + if (hours < 0 || minutes < 0 || seconds < 0 || hours > 23 || minutes > 59 || + seconds > 59) { + return 0; + } + return 1; +} + +// 0000-01-01 00:00:00 UTC +#define MIN_POSIX_TIME INT64_C(-62167219200) +// 9999-12-31 23:59:59 UTC +#define MAX_POSIX_TIME INT64_C(253402300799) + +// Is an int64 time within our expected range? +static int is_valid_posix_time(int64_t time) { + return MIN_POSIX_TIME <= time && time <= MAX_POSIX_TIME; +} + +// Inspired by algorithms presented in +// https://howardhinnant.github.io/date_algorithms.html +// (Public Domain) +static int posix_time_from_utc(int64_t year, int64_t month, int64_t day, + int64_t hours, int64_t minutes, int64_t seconds, + int64_t *out_time) { + if (!is_valid_date(year, month, day) || + !is_valid_time(hours, minutes, seconds)) { + return 0; + } + if (month <= 2) { + year--; // Start years on Mar 1, so leap days always finish a year. + } + // At this point year will be in the range -1 and 9999. + assert(-1 <= year && year <= 9999); + int64_t era = (year >= 0 ? year : year - 399) / 400; + int64_t year_of_era = year - era * 400; + int64_t day_of_year = + (153 * (month > 2 ? 
month - 3 : month + 9) + 2) / 5 + day - 1; + int64_t day_of_era = + year_of_era * 365 + year_of_era / 4 - year_of_era / 100 + day_of_year; + int64_t posix_days = era * 146097 + day_of_era - 719468; + *out_time = posix_days * SECS_PER_DAY + hours * SECS_PER_HOUR + minutes * 60 + + seconds; + return 1; +} + +// Inspired by algorithms presented in +// https://howardhinnant.github.io/date_algorithms.html +// (Public Domain) +static int utc_from_posix_time(int64_t time, int *out_year, int *out_month, + int *out_day, int *out_hours, int *out_minutes, + int *out_seconds) { + if (!is_valid_posix_time(time)) { + return 0; + } + int64_t days = time / SECS_PER_DAY; + int64_t leftover_seconds = time % SECS_PER_DAY; + if (leftover_seconds < 0) { + days--; + leftover_seconds += SECS_PER_DAY; + } + days += 719468; // Shift to starting epoch of Mar 1 0000. + // At this point, days will be in the range -61 and 3652364. + assert(-61 <= days && days <= 3652364); + int64_t era = (days > 0 ? days : days - 146096) / 146097; + int64_t day_of_era = days - era * 146097; + int64_t year_of_era = (day_of_era - day_of_era / 1460 + day_of_era / 36524 - + day_of_era / 146096) / + 365; + *out_year = (int)(year_of_era + era * 400); // Year starting on Mar 1. + int64_t day_of_year = + day_of_era - (365 * year_of_era + year_of_era / 4 - year_of_era / 100); + int64_t month_of_year = (5 * day_of_year + 2) / 153; + *out_month = + (int)(month_of_year < 10 ? month_of_year + 3 : month_of_year - 9); + if (*out_month <= 2) { + (*out_year)++; // Adjust year back to Jan 1 start of year. 
+ } + *out_day = (int)(day_of_year - (153 * month_of_year + 2) / 5 + 1); + *out_hours = (int)(leftover_seconds / SECS_PER_HOUR); + leftover_seconds %= SECS_PER_HOUR; + *out_minutes = (int)(leftover_seconds / 60); + *out_seconds = (int)(leftover_seconds % 60); + return 1; +} + +int OPENSSL_tm_to_posix(const struct tm *tm, int64_t *out) { + return posix_time_from_utc(tm->tm_year + INT64_C(1900), + tm->tm_mon + INT64_C(1), tm->tm_mday, tm->tm_hour, + tm->tm_min, tm->tm_sec, out); +} + +int OPENSSL_posix_to_tm(int64_t time, struct tm *out_tm) { + struct tm tmp_tm = {}; + if (!utc_from_posix_time(time, &tmp_tm.tm_year, &tmp_tm.tm_mon, + &tmp_tm.tm_mday, &tmp_tm.tm_hour, &tmp_tm.tm_min, + &tmp_tm.tm_sec)) { + return 0; + } + tmp_tm.tm_year -= 1900; + tmp_tm.tm_mon -= 1; + *out_tm = tmp_tm; + + return 1; +} + +int OPENSSL_timegm(const struct tm *tm, time_t *out) { + static_assert( + sizeof(time_t) == sizeof(int32_t) || sizeof(time_t) == sizeof(int64_t), + "time_t is broken"); + int64_t posix_time; + if (!OPENSSL_tm_to_posix(tm, &posix_time)) { + return 0; + } + if (sizeof(time_t) == sizeof(int32_t) && + (posix_time > INT32_MAX || posix_time < INT32_MIN)) { + return 0; + } + *out = (time_t)posix_time; + return 1; +} + +struct tm *bssl::OPENSSL_gmtime(const time_t *time, struct tm *out_tm) { + static_assert( + sizeof(time_t) == sizeof(int32_t) || sizeof(time_t) == sizeof(int64_t), + "time_t is broken"); + int64_t posix_time = *time; + if (!OPENSSL_posix_to_tm(posix_time, out_tm)) { + return nullptr; + } + return out_tm; +} + +int bssl::OPENSSL_gmtime_adj(struct tm *tm, int offset_day, + int64_t offset_sec) { + int64_t posix_time; + if (!OPENSSL_tm_to_posix(tm, &posix_time)) { + return 0; + } + static_assert(INT_MAX <= INT64_MAX / SECS_PER_DAY, + "day offset in seconds cannot overflow"); + static_assert(MAX_POSIX_TIME <= INT64_MAX - INT_MAX * SECS_PER_DAY, + "addition cannot overflow"); + static_assert(MIN_POSIX_TIME >= INT64_MIN - INT_MIN * SECS_PER_DAY, + "addition cannot 
underflow"); + posix_time += offset_day * SECS_PER_DAY; + if (posix_time > 0 && offset_sec > INT64_MAX - posix_time) { + return 0; + } + if (posix_time < 0 && offset_sec < INT64_MIN - posix_time) { + return 0; + } + posix_time += offset_sec; + + if (!OPENSSL_posix_to_tm(posix_time, tm)) { + return 0; + } + + return 1; +} + +int bssl::OPENSSL_gmtime_diff(int *out_days, int *out_secs, + const struct tm *from, const struct tm *to) { + int64_t time_to, time_from; + if (!OPENSSL_tm_to_posix(to, &time_to) || + !OPENSSL_tm_to_posix(from, &time_from)) { + return 0; + } + // Times are in range, so these calculations can not overflow. + static_assert(SECS_PER_DAY <= INT_MAX, "seconds per day does not fit in int"); + static_assert((MAX_POSIX_TIME - MIN_POSIX_TIME) / SECS_PER_DAY <= INT_MAX, + "range of valid POSIX times, in days, does not fit in int"); + int64_t timediff = time_to - time_from; + int64_t daydiff = timediff / SECS_PER_DAY; + timediff %= SECS_PER_DAY; + *out_secs = (int)timediff; + *out_days = (int)daydiff; + return 1; +} diff --git a/third_party/boringssl/src/crypto/asn1/tasn_dec.c b/third_party/boringssl/src/crypto/asn1/tasn_dec.c deleted file mode 100644 index de877917..00000000 --- a/third_party/boringssl/src/crypto/asn1/tasn_dec.c +++ /dev/null @@ -1,929 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). 
- * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include -#include -#include -#include -#include - -#include -#include - -#include "../bytestring/internal.h" -#include "../internal.h" -#include "internal.h" - -// Constructed types with a recursive definition (such as can be found in PKCS7) -// could eventually exceed the stack given malicious input with excessive -// recursion. Therefore we limit the stack depth. This is the maximum number of -// recursive invocations of asn1_item_embed_d2i(). 
-#define ASN1_MAX_CONSTRUCTED_NEST 30 - -static int asn1_check_tlen(long *olen, int *otag, unsigned char *oclass, - char *cst, const unsigned char **in, long len, - int exptag, int expclass, char opt); - -static int asn1_template_ex_d2i(ASN1_VALUE **pval, const unsigned char **in, - long len, const ASN1_TEMPLATE *tt, char opt, - int depth); -static int asn1_template_noexp_d2i(ASN1_VALUE **val, const unsigned char **in, - long len, const ASN1_TEMPLATE *tt, char opt, - int depth); -static int asn1_ex_c2i(ASN1_VALUE **pval, const unsigned char *cont, int len, - int utype, const ASN1_ITEM *it); -static int asn1_d2i_ex_primitive(ASN1_VALUE **pval, const unsigned char **in, - long len, const ASN1_ITEM *it, int tag, - int aclass, char opt); -static int asn1_item_ex_d2i(ASN1_VALUE **pval, const unsigned char **in, - long len, const ASN1_ITEM *it, int tag, int aclass, - char opt, int depth); - -// Table to convert tags to bit values, used for MSTRING type -static const unsigned long tag2bit[31] = { - 0, // (reserved) - 0, // BOOLEAN - 0, // INTEGER - B_ASN1_BIT_STRING, - B_ASN1_OCTET_STRING, - 0, // NULL - 0, // OBJECT IDENTIFIER - B_ASN1_UNKNOWN, // ObjectDescriptor - B_ASN1_UNKNOWN, // EXTERNAL - B_ASN1_UNKNOWN, // REAL - B_ASN1_UNKNOWN, // ENUMERATED - B_ASN1_UNKNOWN, // EMBEDDED PDV - B_ASN1_UTF8STRING, - B_ASN1_UNKNOWN, // RELATIVE-OID - B_ASN1_UNKNOWN, // TIME - B_ASN1_UNKNOWN, // (reserved) - B_ASN1_SEQUENCE, - 0, // SET - B_ASN1_NUMERICSTRING, - B_ASN1_PRINTABLESTRING, - B_ASN1_T61STRING, - B_ASN1_VIDEOTEXSTRING, - B_ASN1_IA5STRING, - B_ASN1_UTCTIME, - B_ASN1_GENERALIZEDTIME, - B_ASN1_GRAPHICSTRING, - B_ASN1_ISO64STRING, - B_ASN1_GENERALSTRING, - B_ASN1_UNIVERSALSTRING, - B_ASN1_UNKNOWN, // CHARACTER STRING - B_ASN1_BMPSTRING, -}; - -unsigned long ASN1_tag2bit(int tag) { - if (tag < 0 || tag > 30) { - return 0; - } - return tag2bit[tag]; -} - -// Macro to initialize and invalidate the cache - -// Decode an ASN1 item, this currently behaves just like a standard 'd2i' 
-// function. 'in' points to a buffer to read the data from, in future we -// will have more advanced versions that can input data a piece at a time and -// this will simply be a special case. - -ASN1_VALUE *ASN1_item_d2i(ASN1_VALUE **pval, const unsigned char **in, long len, - const ASN1_ITEM *it) { - ASN1_VALUE *ptmpval = NULL; - if (!pval) { - pval = &ptmpval; - } - - if (asn1_item_ex_d2i(pval, in, len, it, -1, 0, 0, 0) > 0) { - return *pval; - } - return NULL; -} - -// Decode an item, taking care of IMPLICIT tagging, if any. If 'opt' set and -// tag mismatch return -1 to handle OPTIONAL - -static int asn1_item_ex_d2i(ASN1_VALUE **pval, const unsigned char **in, - long len, const ASN1_ITEM *it, int tag, int aclass, - char opt, int depth) { - const ASN1_TEMPLATE *tt, *errtt = NULL; - const ASN1_EXTERN_FUNCS *ef; - const unsigned char *p = NULL, *q; - unsigned char oclass; - char cst, isopt; - int i; - int otag; - int ret = 0; - ASN1_VALUE **pchptr; - int combine = aclass & ASN1_TFLG_COMBINE; - aclass &= ~ASN1_TFLG_COMBINE; - if (!pval) { - return 0; - } - - // Bound |len| to comfortably fit in an int. Lengths in this module often - // switch between int and long without overflow checks. - if (len > INT_MAX / 2) { - len = INT_MAX / 2; - } - - if (++depth > ASN1_MAX_CONSTRUCTED_NEST) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_NESTED_TOO_DEEP); - goto err; - } - - switch (it->itype) { - case ASN1_ITYPE_PRIMITIVE: - if (it->templates) { - // tagging or OPTIONAL is currently illegal on an item template - // because the flags can't get passed down. In practice this - // isn't a problem: we include the relevant flags from the item - // template in the template itself. 
- if ((tag != -1) || opt) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_ILLEGAL_OPTIONS_ON_ITEM_TEMPLATE); - goto err; - } - return asn1_template_ex_d2i(pval, in, len, it->templates, opt, depth); - } - return asn1_d2i_ex_primitive(pval, in, len, it, tag, aclass, opt); - break; - - case ASN1_ITYPE_MSTRING: - // It never makes sense for multi-strings to have implicit tagging, so - // if tag != -1, then this looks like an error in the template. - if (tag != -1) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_BAD_TEMPLATE); - goto err; - } - - p = *in; - // Just read in tag and class - ret = asn1_check_tlen(NULL, &otag, &oclass, NULL, &p, len, -1, 0, 1); - if (!ret) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_NESTED_ASN1_ERROR); - goto err; - } - - // Must be UNIVERSAL class - if (oclass != V_ASN1_UNIVERSAL) { - // If OPTIONAL, assume this is OK - if (opt) { - return -1; - } - OPENSSL_PUT_ERROR(ASN1, ASN1_R_MSTRING_NOT_UNIVERSAL); - goto err; - } - // Check tag matches bit map - if (!(ASN1_tag2bit(otag) & it->utype)) { - // If OPTIONAL, assume this is OK - if (opt) { - return -1; - } - OPENSSL_PUT_ERROR(ASN1, ASN1_R_MSTRING_WRONG_TAG); - goto err; - } - return asn1_d2i_ex_primitive(pval, in, len, it, otag, 0, 0); - - case ASN1_ITYPE_EXTERN: - // Use new style d2i - ef = it->funcs; - return ef->asn1_ex_d2i(pval, in, len, it, tag, aclass, opt, NULL); - - case ASN1_ITYPE_CHOICE: { - // It never makes sense for CHOICE types to have implicit tagging, so if - // tag != -1, then this looks like an error in the template. - if (tag != -1) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_BAD_TEMPLATE); - goto err; - } - - const ASN1_AUX *aux = it->funcs; - ASN1_aux_cb *asn1_cb = aux != NULL ? 
aux->asn1_cb : NULL; - if (asn1_cb && !asn1_cb(ASN1_OP_D2I_PRE, pval, it, NULL)) { - goto auxerr; - } - - if (*pval) { - // Free up and zero CHOICE value if initialised - i = asn1_get_choice_selector(pval, it); - if ((i >= 0) && (i < it->tcount)) { - tt = it->templates + i; - pchptr = asn1_get_field_ptr(pval, tt); - ASN1_template_free(pchptr, tt); - asn1_set_choice_selector(pval, -1, it); - } - } else if (!ASN1_item_ex_new(pval, it)) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_NESTED_ASN1_ERROR); - goto err; - } - // CHOICE type, try each possibility in turn - p = *in; - for (i = 0, tt = it->templates; i < it->tcount; i++, tt++) { - pchptr = asn1_get_field_ptr(pval, tt); - // We mark field as OPTIONAL so its absence can be recognised. - ret = asn1_template_ex_d2i(pchptr, &p, len, tt, 1, depth); - // If field not present, try the next one - if (ret == -1) { - continue; - } - // If positive return, read OK, break loop - if (ret > 0) { - break; - } - // Otherwise must be an ASN1 parsing error - errtt = tt; - OPENSSL_PUT_ERROR(ASN1, ASN1_R_NESTED_ASN1_ERROR); - goto err; - } - - // Did we fall off the end without reading anything? 
- if (i == it->tcount) { - // If OPTIONAL, this is OK - if (opt) { - // Free and zero it - ASN1_item_ex_free(pval, it); - return -1; - } - OPENSSL_PUT_ERROR(ASN1, ASN1_R_NO_MATCHING_CHOICE_TYPE); - goto err; - } - - asn1_set_choice_selector(pval, i, it); - if (asn1_cb && !asn1_cb(ASN1_OP_D2I_POST, pval, it, NULL)) { - goto auxerr; - } - *in = p; - return 1; - } - - case ASN1_ITYPE_SEQUENCE: { - p = *in; - - // If no IMPLICIT tagging set to SEQUENCE, UNIVERSAL - if (tag == -1) { - tag = V_ASN1_SEQUENCE; - aclass = V_ASN1_UNIVERSAL; - } - // Get SEQUENCE length and update len, p - ret = asn1_check_tlen(&len, NULL, NULL, &cst, &p, len, tag, aclass, opt); - if (!ret) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_NESTED_ASN1_ERROR); - goto err; - } else if (ret == -1) { - return -1; - } - if (!cst) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_SEQUENCE_NOT_CONSTRUCTED); - goto err; - } - - if (!*pval && !ASN1_item_ex_new(pval, it)) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_NESTED_ASN1_ERROR); - goto err; - } - - const ASN1_AUX *aux = it->funcs; - ASN1_aux_cb *asn1_cb = aux != NULL ? aux->asn1_cb : NULL; - if (asn1_cb && !asn1_cb(ASN1_OP_D2I_PRE, pval, it, NULL)) { - goto auxerr; - } - - // Free up and zero any ADB found - for (i = 0, tt = it->templates; i < it->tcount; i++, tt++) { - if (tt->flags & ASN1_TFLG_ADB_MASK) { - const ASN1_TEMPLATE *seqtt; - ASN1_VALUE **pseqval; - seqtt = asn1_do_adb(pval, tt, 0); - if (seqtt == NULL) { - continue; - } - pseqval = asn1_get_field_ptr(pval, seqtt); - ASN1_template_free(pseqval, seqtt); - } - } - - // Get each field entry - for (i = 0, tt = it->templates; i < it->tcount; i++, tt++) { - const ASN1_TEMPLATE *seqtt; - ASN1_VALUE **pseqval; - seqtt = asn1_do_adb(pval, tt, 1); - if (seqtt == NULL) { - goto err; - } - pseqval = asn1_get_field_ptr(pval, seqtt); - // Have we ran out of data? - if (!len) { - break; - } - q = p; - // This determines the OPTIONAL flag value. 
The field cannot be - // omitted if it is the last of a SEQUENCE and there is still - // data to be read. This isn't strictly necessary but it - // increases efficiency in some cases. - if (i == (it->tcount - 1)) { - isopt = 0; - } else { - isopt = (char)(seqtt->flags & ASN1_TFLG_OPTIONAL); - } - // attempt to read in field, allowing each to be OPTIONAL - - ret = asn1_template_ex_d2i(pseqval, &p, len, seqtt, isopt, depth); - if (!ret) { - errtt = seqtt; - goto err; - } else if (ret == -1) { - // OPTIONAL component absent. Free and zero the field. - ASN1_template_free(pseqval, seqtt); - continue; - } - // Update length - len -= p - q; - } - - // Check all data read - if (len) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_SEQUENCE_LENGTH_MISMATCH); - goto err; - } - - // If we get here we've got no more data in the SEQUENCE, however we - // may not have read all fields so check all remaining are OPTIONAL - // and clear any that are. - for (; i < it->tcount; tt++, i++) { - const ASN1_TEMPLATE *seqtt; - seqtt = asn1_do_adb(pval, tt, 1); - if (seqtt == NULL) { - goto err; - } - if (seqtt->flags & ASN1_TFLG_OPTIONAL) { - ASN1_VALUE **pseqval; - pseqval = asn1_get_field_ptr(pval, seqtt); - ASN1_template_free(pseqval, seqtt); - } else { - errtt = seqtt; - OPENSSL_PUT_ERROR(ASN1, ASN1_R_FIELD_MISSING); - goto err; - } - } - // Save encoding - if (!asn1_enc_save(pval, *in, p - *in, it)) { - goto auxerr; - } - if (asn1_cb && !asn1_cb(ASN1_OP_D2I_POST, pval, it, NULL)) { - goto auxerr; - } - *in = p; - return 1; - } - - default: - return 0; - } -auxerr: - OPENSSL_PUT_ERROR(ASN1, ASN1_R_AUX_ERROR); -err: - if (combine == 0) { - ASN1_item_ex_free(pval, it); - } - if (errtt) { - ERR_add_error_data(4, "Field=", errtt->field_name, ", Type=", it->sname); - } else { - ERR_add_error_data(2, "Type=", it->sname); - } - return 0; -} - -int ASN1_item_ex_d2i(ASN1_VALUE **pval, const unsigned char **in, long len, - const ASN1_ITEM *it, int tag, int aclass, char opt, - ASN1_TLC *ctx) { - return 
asn1_item_ex_d2i(pval, in, len, it, tag, aclass, opt, 0); -} - -// Templates are handled with two separate functions. One handles any -// EXPLICIT tag and the other handles the rest. - -static int asn1_template_ex_d2i(ASN1_VALUE **val, const unsigned char **in, - long inlen, const ASN1_TEMPLATE *tt, char opt, - int depth) { - int flags, aclass; - int ret; - long len; - const unsigned char *p, *q; - if (!val) { - return 0; - } - flags = tt->flags; - aclass = flags & ASN1_TFLG_TAG_CLASS; - - p = *in; - - // Check if EXPLICIT tag expected - if (flags & ASN1_TFLG_EXPTAG) { - char cst; - // Need to work out amount of data available to the inner content and - // where it starts: so read in EXPLICIT header to get the info. - ret = asn1_check_tlen(&len, NULL, NULL, &cst, &p, inlen, tt->tag, aclass, - opt); - q = p; - if (!ret) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_NESTED_ASN1_ERROR); - return 0; - } else if (ret == -1) { - return -1; - } - if (!cst) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_EXPLICIT_TAG_NOT_CONSTRUCTED); - return 0; - } - // We've found the field so it can't be OPTIONAL now - ret = asn1_template_noexp_d2i(val, &p, len, tt, 0, depth); - if (!ret) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_NESTED_ASN1_ERROR); - return 0; - } - // We read the field in OK so update length - len -= p - q; - // Check for trailing data. 
- if (len) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_EXPLICIT_LENGTH_MISMATCH); - goto err; - } - } else { - return asn1_template_noexp_d2i(val, in, inlen, tt, opt, depth); - } - - *in = p; - return 1; - -err: - ASN1_template_free(val, tt); - return 0; -} - -static int asn1_template_noexp_d2i(ASN1_VALUE **val, const unsigned char **in, - long len, const ASN1_TEMPLATE *tt, char opt, - int depth) { - int flags, aclass; - int ret; - const unsigned char *p; - if (!val) { - return 0; - } - flags = tt->flags; - aclass = flags & ASN1_TFLG_TAG_CLASS; - - p = *in; - - if (flags & ASN1_TFLG_SK_MASK) { - // SET OF, SEQUENCE OF - int sktag, skaclass; - // First work out expected inner tag value - if (flags & ASN1_TFLG_IMPTAG) { - sktag = tt->tag; - skaclass = aclass; - } else { - skaclass = V_ASN1_UNIVERSAL; - if (flags & ASN1_TFLG_SET_OF) { - sktag = V_ASN1_SET; - } else { - sktag = V_ASN1_SEQUENCE; - } - } - // Get the tag - ret = - asn1_check_tlen(&len, NULL, NULL, NULL, &p, len, sktag, skaclass, opt); - if (!ret) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_NESTED_ASN1_ERROR); - return 0; - } else if (ret == -1) { - return -1; - } - if (!*val) { - *val = (ASN1_VALUE *)sk_ASN1_VALUE_new_null(); - } else { - // We've got a valid STACK: free up any items present - STACK_OF(ASN1_VALUE) *sktmp = (STACK_OF(ASN1_VALUE) *)*val; - ASN1_VALUE *vtmp; - while (sk_ASN1_VALUE_num(sktmp) > 0) { - vtmp = sk_ASN1_VALUE_pop(sktmp); - ASN1_item_ex_free(&vtmp, ASN1_ITEM_ptr(tt->item)); - } - } - - if (!*val) { - OPENSSL_PUT_ERROR(ASN1, ERR_R_MALLOC_FAILURE); - goto err; - } - - // Read as many items as we can - while (len > 0) { - ASN1_VALUE *skfield; - const unsigned char *q = p; - skfield = NULL; - if (!asn1_item_ex_d2i(&skfield, &p, len, ASN1_ITEM_ptr(tt->item), -1, 0, - 0, depth)) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_NESTED_ASN1_ERROR); - goto err; - } - len -= p - q; - if (!sk_ASN1_VALUE_push((STACK_OF(ASN1_VALUE) *)*val, skfield)) { - ASN1_item_ex_free(&skfield, ASN1_ITEM_ptr(tt->item)); - 
OPENSSL_PUT_ERROR(ASN1, ERR_R_MALLOC_FAILURE); - goto err; - } - } - } else if (flags & ASN1_TFLG_IMPTAG) { - // IMPLICIT tagging - ret = asn1_item_ex_d2i(val, &p, len, ASN1_ITEM_ptr(tt->item), tt->tag, - aclass, opt, depth); - if (!ret) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_NESTED_ASN1_ERROR); - goto err; - } else if (ret == -1) { - return -1; - } - } else { - // Nothing special - ret = asn1_item_ex_d2i(val, &p, len, ASN1_ITEM_ptr(tt->item), -1, - tt->flags & ASN1_TFLG_COMBINE, opt, depth); - if (!ret) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_NESTED_ASN1_ERROR); - goto err; - } else if (ret == -1) { - return -1; - } - } - - *in = p; - return 1; - -err: - ASN1_template_free(val, tt); - return 0; -} - -static int asn1_d2i_ex_primitive(ASN1_VALUE **pval, const unsigned char **in, - long inlen, const ASN1_ITEM *it, int tag, - int aclass, char opt) { - int ret = 0, utype; - long plen; - char cst; - const unsigned char *p; - const unsigned char *cont = NULL; - long len; - if (!pval) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_ILLEGAL_NULL); - return 0; // Should never happen - } - - if (it->itype == ASN1_ITYPE_MSTRING) { - utype = tag; - tag = -1; - } else { - utype = it->utype; - } - - if (utype == V_ASN1_ANY) { - // If type is ANY need to figure out type from tag - unsigned char oclass; - if (tag >= 0) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_ILLEGAL_TAGGED_ANY); - return 0; - } - if (opt) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_ILLEGAL_OPTIONAL_ANY); - return 0; - } - p = *in; - ret = asn1_check_tlen(NULL, &utype, &oclass, NULL, &p, inlen, -1, 0, 0); - if (!ret) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_NESTED_ASN1_ERROR); - return 0; - } - if (oclass != V_ASN1_UNIVERSAL) { - utype = V_ASN1_OTHER; - } - } - if (tag == -1) { - tag = utype; - aclass = V_ASN1_UNIVERSAL; - } - p = *in; - // Check header - ret = asn1_check_tlen(&plen, NULL, NULL, &cst, &p, inlen, tag, aclass, opt); - if (!ret) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_NESTED_ASN1_ERROR); - return 0; - } else if (ret == -1) { - return -1; - } - ret 
= 0; - // SEQUENCE, SET and "OTHER" are left in encoded form - if ((utype == V_ASN1_SEQUENCE) || (utype == V_ASN1_SET) || - (utype == V_ASN1_OTHER)) { - // SEQUENCE and SET must be constructed - if (utype != V_ASN1_OTHER && !cst) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_TYPE_NOT_CONSTRUCTED); - return 0; - } - - cont = *in; - len = p - cont + plen; - p += plen; - } else if (cst) { - // This parser historically supported BER constructed strings. We no - // longer do and will gradually tighten this parser into a DER - // parser. BER types should use |CBS_asn1_ber_to_der|. - OPENSSL_PUT_ERROR(ASN1, ASN1_R_TYPE_NOT_PRIMITIVE); - return 0; - } else { - cont = p; - len = plen; - p += plen; - } - - // We now have content length and type: translate into a structure - if (!asn1_ex_c2i(pval, cont, len, utype, it)) { - goto err; - } - - *in = p; - ret = 1; -err: - return ret; -} - -// Translate ASN1 content octets into a structure - -static int asn1_ex_c2i(ASN1_VALUE **pval, const unsigned char *cont, int len, - int utype, const ASN1_ITEM *it) { - ASN1_VALUE **opval = NULL; - ASN1_STRING *stmp; - ASN1_TYPE *typ = NULL; - int ret = 0; - ASN1_INTEGER **tint; - - // Historically, |it->funcs| for primitive types contained an - // |ASN1_PRIMITIVE_FUNCS| table of callbacks. 
- assert(it->funcs == NULL); - - // If ANY type clear type and set pointer to internal value - if (it->utype == V_ASN1_ANY) { - if (!*pval) { - typ = ASN1_TYPE_new(); - if (typ == NULL) { - goto err; - } - *pval = (ASN1_VALUE *)typ; - } else { - typ = (ASN1_TYPE *)*pval; - } - - if (utype != typ->type) { - ASN1_TYPE_set(typ, utype, NULL); - } - opval = pval; - pval = &typ->value.asn1_value; - } - switch (utype) { - case V_ASN1_OBJECT: - if (!c2i_ASN1_OBJECT((ASN1_OBJECT **)pval, &cont, len)) { - goto err; - } - break; - - case V_ASN1_NULL: - if (len) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_NULL_IS_WRONG_LENGTH); - goto err; - } - *pval = (ASN1_VALUE *)1; - break; - - case V_ASN1_BOOLEAN: - if (len != 1) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_BOOLEAN_IS_WRONG_LENGTH); - goto err; - } else { - ASN1_BOOLEAN *tbool; - tbool = (ASN1_BOOLEAN *)pval; - *tbool = *cont; - } - break; - - case V_ASN1_BIT_STRING: - if (!c2i_ASN1_BIT_STRING((ASN1_BIT_STRING **)pval, &cont, len)) { - goto err; - } - break; - - case V_ASN1_INTEGER: - case V_ASN1_ENUMERATED: - tint = (ASN1_INTEGER **)pval; - if (!c2i_ASN1_INTEGER(tint, &cont, len)) { - goto err; - } - // Fixup type to match the expected form - (*tint)->type = utype | ((*tint)->type & V_ASN1_NEG); - break; - - case V_ASN1_OCTET_STRING: - case V_ASN1_NUMERICSTRING: - case V_ASN1_PRINTABLESTRING: - case V_ASN1_T61STRING: - case V_ASN1_VIDEOTEXSTRING: - case V_ASN1_IA5STRING: - case V_ASN1_UTCTIME: - case V_ASN1_GENERALIZEDTIME: - case V_ASN1_GRAPHICSTRING: - case V_ASN1_VISIBLESTRING: - case V_ASN1_GENERALSTRING: - case V_ASN1_UNIVERSALSTRING: - case V_ASN1_BMPSTRING: - case V_ASN1_UTF8STRING: - case V_ASN1_OTHER: - case V_ASN1_SET: - case V_ASN1_SEQUENCE: - default: { - CBS cbs; - CBS_init(&cbs, cont, (size_t)len); - if (utype == V_ASN1_BMPSTRING) { - while (CBS_len(&cbs) != 0) { - uint32_t c; - if (!cbs_get_ucs2_be(&cbs, &c)) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_BMPSTRING); - goto err; - } - } - } - if (utype == 
V_ASN1_UNIVERSALSTRING) { - while (CBS_len(&cbs) != 0) { - uint32_t c; - if (!cbs_get_utf32_be(&cbs, &c)) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_UNIVERSALSTRING); - goto err; - } - } - } - if (utype == V_ASN1_UTF8STRING) { - while (CBS_len(&cbs) != 0) { - uint32_t c; - if (!cbs_get_utf8(&cbs, &c)) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_UTF8STRING); - goto err; - } - } - } - if (utype == V_ASN1_UTCTIME) { - if (!CBS_parse_utc_time(&cbs, NULL, /*allow_timezone_offset=*/1)) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_TIME_FORMAT); - goto err; - } - } - if (utype == V_ASN1_GENERALIZEDTIME) { - if (!CBS_parse_generalized_time(&cbs, NULL, - /*allow_timezone_offset=*/0)) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_TIME_FORMAT); - goto err; - } - } - // TODO(https://crbug.com/boringssl/427): Check other string types. - - // All based on ASN1_STRING and handled the same - if (!*pval) { - stmp = ASN1_STRING_type_new(utype); - if (!stmp) { - OPENSSL_PUT_ERROR(ASN1, ERR_R_MALLOC_FAILURE); - goto err; - } - *pval = (ASN1_VALUE *)stmp; - } else { - stmp = (ASN1_STRING *)*pval; - stmp->type = utype; - } - if (!ASN1_STRING_set(stmp, cont, len)) { - OPENSSL_PUT_ERROR(ASN1, ERR_R_MALLOC_FAILURE); - ASN1_STRING_free(stmp); - *pval = NULL; - goto err; - } - break; - } - } - // If ASN1_ANY and NULL type fix up value - if (typ && (utype == V_ASN1_NULL)) { - typ->value.ptr = NULL; - } - - ret = 1; -err: - if (!ret) { - ASN1_TYPE_free(typ); - if (opval) { - *opval = NULL; - } - } - return ret; -} - -// Check an ASN1 tag and length: a bit like ASN1_get_object but it -// checks the expected tag. 
- -static int asn1_check_tlen(long *olen, int *otag, unsigned char *oclass, - char *cst, const unsigned char **in, long len, - int exptag, int expclass, char opt) { - int i; - int ptag, pclass; - long plen; - const unsigned char *p; - p = *in; - - i = ASN1_get_object(&p, &plen, &ptag, &pclass, len); - if (i & 0x80) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_BAD_OBJECT_HEADER); - return 0; - } - if (exptag >= 0) { - if ((exptag != ptag) || (expclass != pclass)) { - // If type is OPTIONAL, not an error: indicate missing type. - if (opt) { - return -1; - } - OPENSSL_PUT_ERROR(ASN1, ASN1_R_WRONG_TAG); - return 0; - } - } - - if (cst) { - *cst = i & V_ASN1_CONSTRUCTED; - } - - if (olen) { - *olen = plen; - } - - if (oclass) { - *oclass = pclass; - } - - if (otag) { - *otag = ptag; - } - - *in = p; - return 1; -} diff --git a/third_party/boringssl/src/crypto/asn1/tasn_dec.cc b/third_party/boringssl/src/crypto/asn1/tasn_dec.cc new file mode 100644 index 00000000..20701dff --- /dev/null +++ b/third_party/boringssl/src/crypto/asn1/tasn_dec.cc @@ -0,0 +1,851 @@ +// Copyright 2000-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include "../bytestring/internal.h" +#include "../internal.h" +#include "internal.h" + + +using namespace bssl; + +// Constructed types with a recursive definition (such as can be found in PKCS7) +// could eventually exceed the stack given malicious input with excessive +// recursion. Therefore we limit the stack depth. This is the maximum number of +// recursive invocations of asn1_item_embed_d2i(). +#define ASN1_MAX_CONSTRUCTED_NEST 30 + +static int asn1_check_tlen(long *olen, int *otag, unsigned char *oclass, + char *cst, const unsigned char **in, long len, + int exptag, int expclass, char opt); + +static int asn1_template_ex_d2i(ASN1_VALUE **pval, const unsigned char **in, + long len, const ASN1_TEMPLATE *tt, char opt, + int depth); +static int asn1_template_noexp_d2i(ASN1_VALUE **val, const unsigned char **in, + long len, const ASN1_TEMPLATE *tt, char opt, + int depth); +static int asn1_d2i_ex_primitive(ASN1_VALUE **pval, const unsigned char **in, + long len, const ASN1_ITEM *it, int tag, + int aclass, char opt); +static int asn1_item_ex_d2i(ASN1_VALUE **pval, const unsigned char **in, + long len, const ASN1_ITEM *it, int tag, int aclass, + char opt, int depth); + +unsigned long ASN1_tag2bit(int tag) { + switch (tag) { + case V_ASN1_BIT_STRING: + return B_ASN1_BIT_STRING; + case V_ASN1_OCTET_STRING: + return B_ASN1_OCTET_STRING; + case V_ASN1_UTF8STRING: + return B_ASN1_UTF8STRING; + case V_ASN1_SEQUENCE: + return B_ASN1_SEQUENCE; + case V_ASN1_NUMERICSTRING: + return B_ASN1_NUMERICSTRING; + case V_ASN1_PRINTABLESTRING: + return B_ASN1_PRINTABLESTRING; + case V_ASN1_T61STRING: + return B_ASN1_T61STRING; + case V_ASN1_VIDEOTEXSTRING: + return B_ASN1_VIDEOTEXSTRING; + case V_ASN1_IA5STRING: + return B_ASN1_IA5STRING; + case V_ASN1_UTCTIME: + return B_ASN1_UTCTIME; + case V_ASN1_GENERALIZEDTIME: + return B_ASN1_GENERALIZEDTIME; + case V_ASN1_GRAPHICSTRING: + return 
B_ASN1_GRAPHICSTRING; + case V_ASN1_ISO64STRING: + return B_ASN1_ISO64STRING; + case V_ASN1_GENERALSTRING: + return B_ASN1_GENERALSTRING; + case V_ASN1_UNIVERSALSTRING: + return B_ASN1_UNIVERSALSTRING; + case V_ASN1_BMPSTRING: + return B_ASN1_BMPSTRING; + default: + return 0; + } +} + +// Decode an ASN1 item, this currently behaves just like a standard 'd2i' +// function. 'in' points to a buffer to read the data from, in future we +// will have more advanced versions that can input data a piece at a time and +// this will simply be a special case. + +ASN1_VALUE *ASN1_item_d2i(ASN1_VALUE **pval, const unsigned char **in, long len, + const ASN1_ITEM *it) { + ASN1_VALUE *ret = nullptr; + if (asn1_item_ex_d2i(&ret, in, len, it, /*tag=*/-1, /*aclass=*/0, /*opt=*/0, + /*depth=*/0) <= 0) { + // Clean up, in case the caller left a partial object. + // + // TODO(davidben): I don't think it can leave one, but the codepaths below + // are a bit inconsistent. Revisit this when rewriting this function. + ASN1_item_ex_free(&ret, it); + } + + // If the caller supplied an output pointer, free the old one and replace it + // with |ret|. This differs from OpenSSL slightly in that we don't support + // object reuse. We run this on both success and failure. On failure, even + // with object reuse, OpenSSL destroys the previous object. + if (pval != nullptr) { + ASN1_item_ex_free(pval, it); + *pval = ret; + } + return ret; +} + +// Decode an item, taking care of IMPLICIT tagging, if any. If 'opt' set and +// tag mismatch return -1 to handle OPTIONAL +// +// TODO(davidben): Historically, all functions in this file had to account for +// |*pval| containing an arbitrary existing value. This is no longer the case +// because |ASN1_item_d2i| now always starts from NULL. As part of rewriting +// this function, take the simplified assumptions into account. Though we must +// still account for the internal calls to |ASN1_item_ex_new|. 
+ +static int asn1_item_ex_d2i(ASN1_VALUE **pval, const unsigned char **in, + long len, const ASN1_ITEM *it, int tag, int aclass, + char opt, int depth) { + const ASN1_TEMPLATE *tt, *errtt = nullptr; + const unsigned char *p = nullptr, *q; + unsigned char oclass; + char cst, isopt; + int i; + int otag; + int ret = 0; + ASN1_VALUE **pchptr; + if (!pval) { + return 0; + } + if (len < 0) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_BUFFER_TOO_SMALL); + goto err; + } + + // Bound |len| to comfortably fit in an int. Lengths in this module often + // switch between int and long without overflow checks. + if (len > INT_MAX / 2) { + len = INT_MAX / 2; + } + + if (++depth > ASN1_MAX_CONSTRUCTED_NEST) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_NESTED_TOO_DEEP); + goto err; + } + + switch (it->itype) { + case ASN1_ITYPE_PRIMITIVE: + if (it->templates) { + // tagging or OPTIONAL is currently illegal on an item template + // because the flags can't get passed down. In practice this + // isn't a problem: we include the relevant flags from the item + // template in the template itself. + if ((tag != -1) || opt) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_ILLEGAL_OPTIONS_ON_ITEM_TEMPLATE); + goto err; + } + return asn1_template_ex_d2i(pval, in, len, it->templates, opt, depth); + } + return asn1_d2i_ex_primitive(pval, in, len, it, tag, aclass, opt); + + case ASN1_ITYPE_MSTRING: + // It never makes sense for multi-strings to have implicit tagging, so + // if tag != -1, then this looks like an error in the template. 
+ if (tag != -1) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_BAD_TEMPLATE); + goto err; + } + + p = *in; + // Just read in tag and class + ret = + asn1_check_tlen(nullptr, &otag, &oclass, nullptr, &p, len, -1, 0, 1); + if (!ret) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_NESTED_ASN1_ERROR); + goto err; + } + + // Must be UNIVERSAL class + if (oclass != V_ASN1_UNIVERSAL) { + // If OPTIONAL, assume this is OK + if (opt) { + return -1; + } + OPENSSL_PUT_ERROR(ASN1, ASN1_R_MSTRING_NOT_UNIVERSAL); + goto err; + } + // Check tag matches bit map + if (!(ASN1_tag2bit(otag) & it->utype)) { + // If OPTIONAL, assume this is OK + if (opt) { + return -1; + } + OPENSSL_PUT_ERROR(ASN1, ASN1_R_MSTRING_WRONG_TAG); + goto err; + } + return asn1_d2i_ex_primitive(pval, in, len, it, otag, 0, 0); + + case ASN1_ITYPE_EXTERN: { + // We don't support implicit tagging with external types. + if (tag != -1) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_BAD_TEMPLATE); + goto err; + } + const ASN1_EXTERN_FUNCS *ef = + reinterpret_cast(it->funcs); + CBS cbs; + CBS_init(&cbs, *in, len); + CBS copy = cbs; + if (!ef->asn1_ex_parse(pval, &cbs, it, opt)) { + goto err; + } + *in = CBS_data(&cbs); + // Check whether the function skipped an optional element. + // + // TODO(crbug.com/42290418): Switch the rest of this function to + // |asn1_ex_parse|'s calling convention. + return CBS_len(&cbs) == CBS_len(©) ? -1 : 1; + } + + case ASN1_ITYPE_CHOICE: { + // It never makes sense for CHOICE types to have implicit tagging, so if + // tag != -1, then this looks like an error in the template. + if (tag != -1) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_BAD_TEMPLATE); + goto err; + } + + const ASN1_AUX *aux = reinterpret_cast(it->funcs); + ASN1_aux_cb *asn1_cb = aux != nullptr ? 
aux->asn1_cb : nullptr; + if (asn1_cb && !asn1_cb(ASN1_OP_D2I_PRE, pval, it, nullptr)) { + goto auxerr; + } + + if (*pval) { + // Free up and zero CHOICE value if initialised + i = asn1_get_choice_selector(pval, it); + if ((i >= 0) && (i < it->tcount)) { + tt = it->templates + i; + pchptr = asn1_get_field_ptr(pval, tt); + ASN1_template_free(pchptr, tt); + asn1_set_choice_selector(pval, -1, it); + } + } else if (!ASN1_item_ex_new(pval, it)) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_NESTED_ASN1_ERROR); + goto err; + } + // CHOICE type, try each possibility in turn + p = *in; + for (i = 0, tt = it->templates; i < it->tcount; i++, tt++) { + pchptr = asn1_get_field_ptr(pval, tt); + // We mark field as OPTIONAL so its absence can be recognised. + ret = asn1_template_ex_d2i(pchptr, &p, len, tt, 1, depth); + // If field not present, try the next one + if (ret == -1) { + continue; + } + // If positive return, read OK, break loop + if (ret > 0) { + break; + } + // Otherwise must be an ASN1 parsing error + errtt = tt; + OPENSSL_PUT_ERROR(ASN1, ASN1_R_NESTED_ASN1_ERROR); + goto err; + } + + // Did we fall off the end without reading anything? 
+ if (i == it->tcount) { + // If OPTIONAL, this is OK + if (opt) { + // Free and zero it + ASN1_item_ex_free(pval, it); + return -1; + } + OPENSSL_PUT_ERROR(ASN1, ASN1_R_NO_MATCHING_CHOICE_TYPE); + goto err; + } + + asn1_set_choice_selector(pval, i, it); + if (asn1_cb && !asn1_cb(ASN1_OP_D2I_POST, pval, it, nullptr)) { + goto auxerr; + } + *in = p; + return 1; + } + + case ASN1_ITYPE_SEQUENCE: { + p = *in; + + // If no IMPLICIT tagging set to SEQUENCE, UNIVERSAL + if (tag == -1) { + tag = V_ASN1_SEQUENCE; + aclass = V_ASN1_UNIVERSAL; + } + // Get SEQUENCE length and update len, p + ret = asn1_check_tlen(&len, nullptr, nullptr, &cst, &p, len, tag, aclass, + opt); + if (!ret) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_NESTED_ASN1_ERROR); + goto err; + } else if (ret == -1) { + return -1; + } + if (!cst) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_SEQUENCE_NOT_CONSTRUCTED); + goto err; + } + + if (!*pval && !ASN1_item_ex_new(pval, it)) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_NESTED_ASN1_ERROR); + goto err; + } + + const ASN1_AUX *aux = reinterpret_cast(it->funcs); + ASN1_aux_cb *asn1_cb = aux != nullptr ? aux->asn1_cb : nullptr; + if (asn1_cb && !asn1_cb(ASN1_OP_D2I_PRE, pval, it, nullptr)) { + goto auxerr; + } + + // Free up and zero any ADB found + for (i = 0, tt = it->templates; i < it->tcount; i++, tt++) { + if (tt->flags & ASN1_TFLG_ADB_MASK) { + const ASN1_TEMPLATE *seqtt; + ASN1_VALUE **pseqval; + seqtt = asn1_do_adb(pval, tt, 0); + if (seqtt == nullptr) { + continue; + } + pseqval = asn1_get_field_ptr(pval, seqtt); + ASN1_template_free(pseqval, seqtt); + } + } + + // Get each field entry + for (i = 0, tt = it->templates; i < it->tcount; i++, tt++) { + const ASN1_TEMPLATE *seqtt; + ASN1_VALUE **pseqval; + seqtt = asn1_do_adb(pval, tt, 1); + if (seqtt == nullptr) { + goto err; + } + pseqval = asn1_get_field_ptr(pval, seqtt); + // Have we ran out of data? + if (!len) { + break; + } + q = p; + // This determines the OPTIONAL flag value. 
The field cannot be + // omitted if it is the last of a SEQUENCE and there is still + // data to be read. This isn't strictly necessary but it + // increases efficiency in some cases. + if (i == (it->tcount - 1)) { + isopt = 0; + } else { + isopt = (seqtt->flags & ASN1_TFLG_OPTIONAL) != 0; + } + // attempt to read in field, allowing each to be OPTIONAL + + ret = asn1_template_ex_d2i(pseqval, &p, len, seqtt, isopt, depth); + if (!ret) { + errtt = seqtt; + goto err; + } else if (ret == -1) { + // OPTIONAL component absent. Free and zero the field. + ASN1_template_free(pseqval, seqtt); + continue; + } + // Update length + len -= p - q; + } + + // Check all data read + if (len) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_SEQUENCE_LENGTH_MISMATCH); + goto err; + } + + // If we get here we've got no more data in the SEQUENCE, however we + // may not have read all fields so check all remaining are OPTIONAL + // and clear any that are. + for (; i < it->tcount; tt++, i++) { + const ASN1_TEMPLATE *seqtt; + seqtt = asn1_do_adb(pval, tt, 1); + if (seqtt == nullptr) { + goto err; + } + if (seqtt->flags & ASN1_TFLG_OPTIONAL) { + ASN1_VALUE **pseqval; + pseqval = asn1_get_field_ptr(pval, seqtt); + ASN1_template_free(pseqval, seqtt); + } else { + errtt = seqtt; + OPENSSL_PUT_ERROR(ASN1, ASN1_R_FIELD_MISSING); + goto err; + } + } + // Save encoding + if (!asn1_enc_save(pval, *in, p - *in, it)) { + goto auxerr; + } + if (asn1_cb && !asn1_cb(ASN1_OP_D2I_POST, pval, it, nullptr)) { + goto auxerr; + } + *in = p; + return 1; + } + + default: + return 0; + } +auxerr: + OPENSSL_PUT_ERROR(ASN1, ASN1_R_AUX_ERROR); +err: + ASN1_item_ex_free(pval, it); + if (errtt) { + ERR_add_error_data(4, "Field=", errtt->field_name, ", Type=", it->sname); + } else { + ERR_add_error_data(2, "Type=", it->sname); + } + return 0; +} + +int bssl::ASN1_item_ex_d2i(ASN1_VALUE **pval, const unsigned char **in, + long len, const ASN1_ITEM *it, int tag, int aclass, + char opt) { + return asn1_item_ex_d2i(pval, in, len, it, 
tag, aclass, opt, /*depth=*/0); +} + +// Templates are handled with two separate functions. One handles any +// EXPLICIT tag and the other handles the rest. + +static int asn1_template_ex_d2i(ASN1_VALUE **val, const unsigned char **in, + long inlen, const ASN1_TEMPLATE *tt, char opt, + int depth) { + int aclass; + int ret; + long len; + const unsigned char *p, *q; + if (!val) { + return 0; + } + uint32_t flags = tt->flags; + aclass = flags & ASN1_TFLG_TAG_CLASS; + + p = *in; + + // Check if EXPLICIT tag expected + if (flags & ASN1_TFLG_EXPTAG) { + char cst; + // Need to work out amount of data available to the inner content and + // where it starts: so read in EXPLICIT header to get the info. + ret = asn1_check_tlen(&len, nullptr, nullptr, &cst, &p, inlen, tt->tag, + aclass, opt); + q = p; + if (!ret) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_NESTED_ASN1_ERROR); + return 0; + } else if (ret == -1) { + return -1; + } + if (!cst) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_EXPLICIT_TAG_NOT_CONSTRUCTED); + return 0; + } + // We've found the field so it can't be OPTIONAL now + ret = asn1_template_noexp_d2i(val, &p, len, tt, /*opt=*/0, depth); + if (!ret) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_NESTED_ASN1_ERROR); + return 0; + } + // We read the field in OK so update length + len -= p - q; + // Check for trailing data. 
+ if (len) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_EXPLICIT_LENGTH_MISMATCH); + goto err; + } + } else { + return asn1_template_noexp_d2i(val, in, inlen, tt, opt, depth); + } + + *in = p; + return 1; + +err: + ASN1_template_free(val, tt); + return 0; +} + +static int asn1_template_noexp_d2i(ASN1_VALUE **val, const unsigned char **in, + long len, const ASN1_TEMPLATE *tt, char opt, + int depth) { + int aclass; + int ret; + const unsigned char *p; + if (!val) { + return 0; + } + uint32_t flags = tt->flags; + aclass = flags & ASN1_TFLG_TAG_CLASS; + + p = *in; + + if (flags & ASN1_TFLG_SK_MASK) { + // SET OF, SEQUENCE OF + int sktag, skaclass; + // First work out expected inner tag value + if (flags & ASN1_TFLG_IMPTAG) { + sktag = tt->tag; + skaclass = aclass; + } else { + skaclass = V_ASN1_UNIVERSAL; + if (flags & ASN1_TFLG_SET_OF) { + sktag = V_ASN1_SET; + } else { + sktag = V_ASN1_SEQUENCE; + } + } + // Get the tag + ret = asn1_check_tlen(&len, nullptr, nullptr, nullptr, &p, len, sktag, + skaclass, opt); + if (!ret) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_NESTED_ASN1_ERROR); + return 0; + } else if (ret == -1) { + return -1; + } + if (!*val) { + *val = (ASN1_VALUE *)sk_ASN1_VALUE_new_null(); + } else { + // We've got a valid STACK: free up any items present + STACK_OF(ASN1_VALUE) *sktmp = (STACK_OF(ASN1_VALUE) *)*val; + ASN1_VALUE *vtmp; + while (sk_ASN1_VALUE_num(sktmp) > 0) { + vtmp = sk_ASN1_VALUE_pop(sktmp); + ASN1_item_ex_free(&vtmp, ASN1_ITEM_ptr(tt->item)); + } + } + + if (!*val) { + goto err; + } + + // Read as many items as we can + while (len > 0) { + ASN1_VALUE *skfield; + const unsigned char *q = p; + skfield = nullptr; + if (!asn1_item_ex_d2i(&skfield, &p, len, ASN1_ITEM_ptr(tt->item), + /*tag=*/-1, /*aclass=*/0, /*opt=*/0, depth)) { + ASN1_item_ex_free(&skfield, ASN1_ITEM_ptr(tt->item)); + OPENSSL_PUT_ERROR(ASN1, ASN1_R_NESTED_ASN1_ERROR); + goto err; + } + len -= p - q; + if (!sk_ASN1_VALUE_push((STACK_OF(ASN1_VALUE) *)*val, skfield)) { + 
ASN1_item_ex_free(&skfield, ASN1_ITEM_ptr(tt->item)); + goto err; + } + } + } else if (flags & ASN1_TFLG_IMPTAG) { + // IMPLICIT tagging + ret = asn1_item_ex_d2i(val, &p, len, ASN1_ITEM_ptr(tt->item), tt->tag, + aclass, opt, depth); + if (!ret) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_NESTED_ASN1_ERROR); + goto err; + } else if (ret == -1) { + return -1; + } + } else { + // Nothing special + ret = asn1_item_ex_d2i(val, &p, len, ASN1_ITEM_ptr(tt->item), /*tag=*/-1, + /*aclass=*/0, opt, depth); + if (!ret) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_NESTED_ASN1_ERROR); + goto err; + } else if (ret == -1) { + return -1; + } + } + + *in = p; + return 1; + +err: + ASN1_template_free(val, tt); + return 0; +} + +// TODO(crbug.com/42290418): Switch the whole file to use a CBS-based calling +// convention. +static int asn1_d2i_ex_primitive_cbs(ASN1_VALUE **pval, CBS *cbs, + const ASN1_ITEM *it, int tag, int aclass, + char opt); + +// asn1_d2i_ex_primitive returns one on success, zero on error, and -1 if an +// optional value was skipped. +static int asn1_d2i_ex_primitive(ASN1_VALUE **pval, const unsigned char **in, + long inlen, const ASN1_ITEM *it, int tag, + int aclass, char opt) { + CBS cbs; + CBS_init(&cbs, *in, inlen); + int ret = asn1_d2i_ex_primitive_cbs(pval, &cbs, it, tag, aclass, opt); + if (ret <= 0) { + return ret; + } + *in = CBS_data(&cbs); + return 1; +} + +static ASN1_STRING *ensure_string(ASN1_VALUE **pval) { + if (*pval) { + return (ASN1_STRING *)*pval; + } + ASN1_STRING *str = ASN1_STRING_new(); + if (str == nullptr) { + return nullptr; + } + *pval = (ASN1_VALUE *)str; + return str; +} + +static int asn1_d2i_ex_primitive_cbs(ASN1_VALUE **pval, CBS *cbs, + const ASN1_ITEM *it, int tag, int aclass, + char opt) { + // Historically, |it->funcs| for primitive types contained an + // |ASN1_PRIMITIVE_FUNCS| table of callbacks. 
+ assert(it->funcs == nullptr); + + int utype; + assert(it->itype == ASN1_ITYPE_PRIMITIVE || it->itype == ASN1_ITYPE_MSTRING); + if (it->itype == ASN1_ITYPE_MSTRING) { + // MSTRING passes utype in |tag|, normally used for implicit tagging. + utype = tag; + tag = -1; + } else { + utype = it->utype; + } + + // Handle ANY types. + if (utype == V_ASN1_ANY) { + if (tag >= 0) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_ILLEGAL_TAGGED_ANY); + return 0; + } + if (opt && CBS_len(cbs) == 0) { + return -1; // Omitted OPTIONAL value. + } + ASN1_TYPE *typ; + if (!*pval) { + typ = ASN1_TYPE_new(); + if (typ == nullptr) { + return 0; + } + *pval = (ASN1_VALUE *)typ; + } else { + typ = (ASN1_TYPE *)*pval; + } + return asn1_parse_any(cbs, typ); + } + + // Convert the crypto/asn1 tag into a CBS one. + if (tag == -1) { + tag = utype; + aclass = V_ASN1_UNIVERSAL; + } + + // All edge cases of |utype| should have been handled already. |utype| is now + // either a primitive |ASN1_ITEM|, handled by |DECLARE_ASN1_ITEM|, or a + // multistring option with a corresponding |B_ASN1_*| constant. + assert(utype >= 0 && utype <= V_ASN1_MAX_UNIVERSAL); + CBS_ASN1_TAG cbs_tag = + (static_cast(aclass) << CBS_ASN1_TAG_SHIFT) | + static_cast(tag); + if (utype == V_ASN1_SEQUENCE || utype == V_ASN1_SET) { + cbs_tag |= CBS_ASN1_CONSTRUCTED; + } + + if (opt && !CBS_peek_asn1_tag(cbs, cbs_tag)) { + return -1; // Omitted OPTIONAL value. + } + + // Handle non-|ASN1_STRING| types. 
+ switch (utype) { + case V_ASN1_OBJECT: { + UniquePtr obj(asn1_parse_object(cbs, cbs_tag)); + if (obj == nullptr) { + return 0; + } + ASN1_OBJECT_free((ASN1_OBJECT *)*pval); + *pval = (ASN1_VALUE *)obj.release(); + return 1; + } + case V_ASN1_NULL: { + CBS null; + if (!CBS_get_asn1(cbs, &null, cbs_tag)) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_DECODE_ERROR); + return 0; + } + if (CBS_len(&null) != 0) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_NULL_IS_WRONG_LENGTH); + return 0; + } + *pval = (ASN1_VALUE *)1; + return 1; + } + case V_ASN1_BOOLEAN: { + CBS child; + if (!CBS_get_asn1(cbs, &child, cbs_tag)) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_DECODE_ERROR); + return 0; + } + // TODO(crbug.com/42290221): Reject invalid BOOLEAN encodings and just + // call |CBS_get_asn1_bool|. + if (CBS_len(&child) != 1) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_BOOLEAN_IS_WRONG_LENGTH); + return 0; + } + ASN1_BOOLEAN *tbool; + tbool = (ASN1_BOOLEAN *)pval; + *tbool = CBS_data(&child)[0]; + return 1; + } + } + + // All other types as an |ASN1_STRING| representation. + ASN1_STRING *str = ensure_string(pval); + if (str == nullptr) { + return 0; + } + + switch (utype) { + case V_ASN1_BIT_STRING: + return asn1_parse_bit_string(cbs, str, cbs_tag); + case V_ASN1_INTEGER: + return asn1_parse_integer(cbs, str, cbs_tag); + case V_ASN1_ENUMERATED: + return asn1_parse_enumerated(cbs, str, cbs_tag); + case V_ASN1_UNIVERSALSTRING: + return asn1_parse_universal_string(cbs, str, cbs_tag); + case V_ASN1_BMPSTRING: + return asn1_parse_bmp_string(cbs, str, cbs_tag); + case V_ASN1_UTF8STRING: + return asn1_parse_utf8_string(cbs, str, cbs_tag); + case V_ASN1_UTCTIME: + // TODO(crbug.com/42290221): Reject timezone offsets. We need to parse + // invalid timestamps in |X509| objects, but that parser no longer uses + // this code. 
+ return asn1_parse_utc_time(cbs, str, cbs_tag, + /*allow_timezone_offset=*/1); + case V_ASN1_GENERALIZEDTIME: + return asn1_parse_generalized_time(cbs, str, cbs_tag); + case V_ASN1_OCTET_STRING: + case V_ASN1_NUMERICSTRING: + case V_ASN1_PRINTABLESTRING: + case V_ASN1_T61STRING: + case V_ASN1_VIDEOTEXSTRING: + case V_ASN1_IA5STRING: + case V_ASN1_GRAPHICSTRING: + case V_ASN1_VISIBLESTRING: + case V_ASN1_GENERALSTRING: + // T61String is parsed as Latin-1, so all byte strings are valid. The + // others we currently do not enforce. + // + // TODO(crbug.com/42290290): Enforce the encoding of the other string + // types. + if (!asn1_parse_octet_string(cbs, str, cbs_tag)) { + return 0; + } + str->type = utype; + return 1; + case V_ASN1_SEQUENCE: { + // Save the entire element in the string. + CBS elem; + if (!CBS_get_asn1_element(cbs, &elem, cbs_tag)) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_DECODE_ERROR); + return 0; + } + str->type = V_ASN1_SEQUENCE; + return ASN1_STRING_set(str, CBS_data(&elem), CBS_len(&elem)); + } + default: + OPENSSL_PUT_ERROR(ASN1, ASN1_R_BAD_TEMPLATE); + return 0; + } +} + +// Check an ASN1 tag and length: a bit like ASN1_get_object but it +// checks the expected tag. + +static int asn1_check_tlen(long *olen, int *otag, unsigned char *oclass, + char *cst, const unsigned char **in, long len, + int exptag, int expclass, char opt) { + int i; + int ptag, pclass; + long plen; + const unsigned char *p; + p = *in; + + i = ASN1_get_object(&p, &plen, &ptag, &pclass, len); + if (i & 0x80) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_BAD_OBJECT_HEADER); + return 0; + } + if (exptag >= 0) { + if ((exptag != ptag) || (expclass != pclass)) { + // If type is OPTIONAL, not an error: indicate missing type. 
+ if (opt) { + return -1; + } + OPENSSL_PUT_ERROR(ASN1, ASN1_R_WRONG_TAG); + return 0; + } + } + + if (cst) { + *cst = i & V_ASN1_CONSTRUCTED; + } + + if (olen) { + *olen = plen; + } + + if (oclass) { + *oclass = pclass; + } + + if (otag) { + *otag = ptag; + } + + *in = p; + return 1; +} diff --git a/third_party/boringssl/src/crypto/asn1/tasn_enc.c b/third_party/boringssl/src/crypto/asn1/tasn_enc.c deleted file mode 100644 index 39a3d521..00000000 --- a/third_party/boringssl/src/crypto/asn1/tasn_enc.c +++ /dev/null @@ -1,693 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
*/ - -#include - -#include -#include -#include - -#include -#include - -#include "../internal.h" -#include "internal.h" - - -static int asn1_item_ex_i2d_opt(ASN1_VALUE **pval, unsigned char **out, - const ASN1_ITEM *it, int tag, int aclass, - int optional); -static int asn1_i2d_ex_primitive(ASN1_VALUE **pval, unsigned char **out, - const ASN1_ITEM *it, int tag, int aclass, - int optional); -static int asn1_ex_i2c(ASN1_VALUE **pval, unsigned char *cont, int *out_omit, - int *putype, const ASN1_ITEM *it); -static int asn1_set_seq_out(STACK_OF(ASN1_VALUE) *sk, unsigned char **out, - int skcontlen, const ASN1_ITEM *item, int do_sort); -static int asn1_template_ex_i2d(ASN1_VALUE **pval, unsigned char **out, - const ASN1_TEMPLATE *tt, int tag, int aclass); - -// Top level i2d equivalents - -int ASN1_item_i2d(ASN1_VALUE *val, unsigned char **out, const ASN1_ITEM *it) { - if (out && !*out) { - unsigned char *p, *buf; - int len = ASN1_item_ex_i2d(&val, NULL, it, /*tag=*/-1, /*aclass=*/0); - if (len <= 0) { - return len; - } - buf = OPENSSL_malloc(len); - if (!buf) { - OPENSSL_PUT_ERROR(ASN1, ERR_R_MALLOC_FAILURE); - return -1; - } - p = buf; - int len2 = ASN1_item_ex_i2d(&val, &p, it, /*tag=*/-1, /*aclass=*/0); - if (len2 <= 0) { - return len2; - } - assert(len == len2); - *out = buf; - return len; - } - - return ASN1_item_ex_i2d(&val, out, it, /*tag=*/-1, /*aclass=*/0); -} - -// Encode an item, taking care of IMPLICIT tagging (if any). This function -// performs the normal item handling: it can be used in external types. - -int ASN1_item_ex_i2d(ASN1_VALUE **pval, unsigned char **out, - const ASN1_ITEM *it, int tag, int aclass) { - int ret = asn1_item_ex_i2d_opt(pval, out, it, tag, aclass, /*optional=*/0); - assert(ret != 0); - return ret; -} - -// asn1_item_ex_i2d_opt behaves like |ASN1_item_ex_i2d| but, if |optional| is -// non-zero and |*pval| is omitted, it returns zero and writes no bytes. 
-int asn1_item_ex_i2d_opt(ASN1_VALUE **pval, unsigned char **out, - const ASN1_ITEM *it, int tag, int aclass, - int optional) { - const ASN1_TEMPLATE *tt = NULL; - int i, seqcontlen, seqlen; - - // Historically, |aclass| was repurposed to pass additional flags into the - // encoding process. - assert((aclass & ASN1_TFLG_TAG_CLASS) == aclass); - // If not overridding the tag, |aclass| is ignored and should be zero. - assert(tag != -1 || aclass == 0); - - // All fields are pointers, except for boolean |ASN1_ITYPE_PRIMITIVE|s. - // Optional primitives are handled later. - if ((it->itype != ASN1_ITYPE_PRIMITIVE) && !*pval) { - if (optional) { - return 0; - } - OPENSSL_PUT_ERROR(ASN1, ASN1_R_MISSING_VALUE); - return -1; - } - - switch (it->itype) { - case ASN1_ITYPE_PRIMITIVE: - if (it->templates) { - if (it->templates->flags & ASN1_TFLG_OPTIONAL) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_BAD_TEMPLATE); - return -1; - } - return asn1_template_ex_i2d(pval, out, it->templates, tag, aclass); - } - return asn1_i2d_ex_primitive(pval, out, it, tag, aclass, optional); - - case ASN1_ITYPE_MSTRING: - // It never makes sense for multi-strings to have implicit tagging, so - // if tag != -1, then this looks like an error in the template. - if (tag != -1) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_BAD_TEMPLATE); - return -1; - } - return asn1_i2d_ex_primitive(pval, out, it, -1, 0, optional); - - case ASN1_ITYPE_CHOICE: { - // It never makes sense for CHOICE types to have implicit tagging, so if - // tag != -1, then this looks like an error in the template. 
- if (tag != -1) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_BAD_TEMPLATE); - return -1; - } - i = asn1_get_choice_selector(pval, it); - if (i < 0 || i >= it->tcount) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_NO_MATCHING_CHOICE_TYPE); - return -1; - } - const ASN1_TEMPLATE *chtt = it->templates + i; - if (chtt->flags & ASN1_TFLG_OPTIONAL) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_BAD_TEMPLATE); - return -1; - } - ASN1_VALUE **pchval = asn1_get_field_ptr(pval, chtt); - return asn1_template_ex_i2d(pchval, out, chtt, -1, 0); - } - - case ASN1_ITYPE_EXTERN: { - // If new style i2d it does all the work - const ASN1_EXTERN_FUNCS *ef = it->funcs; - int ret = ef->asn1_ex_i2d(pval, out, it, tag, aclass); - if (ret == 0) { - // |asn1_ex_i2d| should never return zero. We have already checked - // for optional values generically, and |ASN1_ITYPE_EXTERN| fields - // must be pointers. - OPENSSL_PUT_ERROR(ASN1, ERR_R_INTERNAL_ERROR); - return -1; - } - return ret; - } - - case ASN1_ITYPE_SEQUENCE: { - i = asn1_enc_restore(&seqcontlen, out, pval, it); - // An error occurred - if (i < 0) { - return -1; - } - // We have a valid cached encoding... 
- if (i > 0) { - return seqcontlen; - } - // Otherwise carry on - seqcontlen = 0; - // If no IMPLICIT tagging set to SEQUENCE, UNIVERSAL - if (tag == -1) { - tag = V_ASN1_SEQUENCE; - aclass = V_ASN1_UNIVERSAL; - } - // First work out sequence content length - for (i = 0, tt = it->templates; i < it->tcount; tt++, i++) { - const ASN1_TEMPLATE *seqtt; - ASN1_VALUE **pseqval; - int tmplen; - seqtt = asn1_do_adb(pval, tt, 1); - if (!seqtt) { - return -1; - } - pseqval = asn1_get_field_ptr(pval, seqtt); - tmplen = asn1_template_ex_i2d(pseqval, NULL, seqtt, -1, 0); - if (tmplen == -1 || (tmplen > INT_MAX - seqcontlen)) { - return -1; - } - seqcontlen += tmplen; - } - - seqlen = ASN1_object_size(/*constructed=*/1, seqcontlen, tag); - if (!out || seqlen == -1) { - return seqlen; - } - // Output SEQUENCE header - ASN1_put_object(out, /*constructed=*/1, seqcontlen, tag, aclass); - for (i = 0, tt = it->templates; i < it->tcount; tt++, i++) { - const ASN1_TEMPLATE *seqtt; - ASN1_VALUE **pseqval; - seqtt = asn1_do_adb(pval, tt, 1); - if (!seqtt) { - return -1; - } - pseqval = asn1_get_field_ptr(pval, seqtt); - if (asn1_template_ex_i2d(pseqval, out, seqtt, -1, 0) < 0) { - return -1; - } - } - return seqlen; - } - - default: - OPENSSL_PUT_ERROR(ASN1, ASN1_R_BAD_TEMPLATE); - return -1; - } -} - -// asn1_template_ex_i2d behaves like |asn1_item_ex_i2d_opt| but uses an -// |ASN1_TEMPLATE| instead of an |ASN1_ITEM|. An |ASN1_TEMPLATE| wraps an -// |ASN1_ITEM| with modifiers such as tagging, SEQUENCE or SET, etc. Instead of -// taking an |optional| parameter, it uses the |ASN1_TFLG_OPTIONAL| flag. -static int asn1_template_ex_i2d(ASN1_VALUE **pval, unsigned char **out, - const ASN1_TEMPLATE *tt, int tag, int iclass) { - int i, ret, flags, ttag, tclass; - size_t j; - flags = tt->flags; - - // Historically, |iclass| was repurposed to pass additional flags into the - // encoding process. 
- assert((iclass & ASN1_TFLG_TAG_CLASS) == iclass); - // If not overridding the tag, |iclass| is ignored and should be zero. - assert(tag != -1 || iclass == 0); - - // Work out tag and class to use: tagging may come either from the - // template or the arguments, not both because this would create - // ambiguity. - if (flags & ASN1_TFLG_TAG_MASK) { - // Error if argument and template tagging - if (tag != -1) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_BAD_TEMPLATE); - return -1; - } - // Get tagging from template - ttag = tt->tag; - tclass = flags & ASN1_TFLG_TAG_CLASS; - } else if (tag != -1) { - // No template tagging, get from arguments - ttag = tag; - tclass = iclass & ASN1_TFLG_TAG_CLASS; - } else { - ttag = -1; - tclass = 0; - } - - const int optional = (flags & ASN1_TFLG_OPTIONAL) != 0; - - // At this point 'ttag' contains the outer tag to use, and 'tclass' is the - // class. - - if (flags & ASN1_TFLG_SK_MASK) { - // SET OF, SEQUENCE OF - STACK_OF(ASN1_VALUE) *sk = (STACK_OF(ASN1_VALUE) *)*pval; - int isset, sktag, skaclass; - int skcontlen, sklen; - ASN1_VALUE *skitem; - - if (!*pval) { - if (optional) { - return 0; - } - OPENSSL_PUT_ERROR(ASN1, ASN1_R_MISSING_VALUE); - return -1; - } - - if (flags & ASN1_TFLG_SET_OF) { - isset = 1; - // Historically, types with both bits set were mutated when - // serialized to apply the sort. We no longer support this. - assert((flags & ASN1_TFLG_SEQUENCE_OF) == 0); - } else { - isset = 0; - } - - // Work out inner tag value: if EXPLICIT or no tagging use underlying - // type. 
- if ((ttag != -1) && !(flags & ASN1_TFLG_EXPTAG)) { - sktag = ttag; - skaclass = tclass; - } else { - skaclass = V_ASN1_UNIVERSAL; - if (isset) { - sktag = V_ASN1_SET; - } else { - sktag = V_ASN1_SEQUENCE; - } - } - - // Determine total length of items - skcontlen = 0; - for (j = 0; j < sk_ASN1_VALUE_num(sk); j++) { - int tmplen; - skitem = sk_ASN1_VALUE_value(sk, j); - tmplen = ASN1_item_ex_i2d(&skitem, NULL, ASN1_ITEM_ptr(tt->item), -1, 0); - if (tmplen == -1 || (skcontlen > INT_MAX - tmplen)) { - return -1; - } - skcontlen += tmplen; - } - sklen = ASN1_object_size(/*constructed=*/1, skcontlen, sktag); - if (sklen == -1) { - return -1; - } - // If EXPLICIT need length of surrounding tag - if (flags & ASN1_TFLG_EXPTAG) { - ret = ASN1_object_size(/*constructed=*/1, sklen, ttag); - } else { - ret = sklen; - } - - if (!out || ret == -1) { - return ret; - } - - // Now encode this lot... - // EXPLICIT tag - if (flags & ASN1_TFLG_EXPTAG) { - ASN1_put_object(out, /*constructed=*/1, sklen, ttag, tclass); - } - // SET or SEQUENCE and IMPLICIT tag - ASN1_put_object(out, /*constructed=*/1, skcontlen, sktag, skaclass); - // And the stuff itself - if (!asn1_set_seq_out(sk, out, skcontlen, ASN1_ITEM_ptr(tt->item), isset)) { - return -1; - } - return ret; - } - - if (flags & ASN1_TFLG_EXPTAG) { - // EXPLICIT tagging - // Find length of tagged item - i = asn1_item_ex_i2d_opt(pval, NULL, ASN1_ITEM_ptr(tt->item), -1, 0, - optional); - if (i <= 0) { - return i; - } - // Find length of EXPLICIT tag - ret = ASN1_object_size(/*constructed=*/1, i, ttag); - if (out && ret != -1) { - // Output tag and item - ASN1_put_object(out, /*constructed=*/1, i, ttag, tclass); - if (ASN1_item_ex_i2d(pval, out, ASN1_ITEM_ptr(tt->item), -1, 0) < 0) { - return -1; - } - } - return ret; - } - - // Either normal or IMPLICIT tagging - return asn1_item_ex_i2d_opt(pval, out, ASN1_ITEM_ptr(tt->item), ttag, tclass, - optional); -} - -// Temporary structure used to hold DER encoding of items for SET OF - 
-typedef struct { - unsigned char *data; - int length; -} DER_ENC; - -static int der_cmp(const void *a, const void *b) { - const DER_ENC *d1 = a, *d2 = b; - int cmplen, i; - cmplen = (d1->length < d2->length) ? d1->length : d2->length; - i = OPENSSL_memcmp(d1->data, d2->data, cmplen); - if (i) { - return i; - } - return d1->length - d2->length; -} - -// asn1_set_seq_out writes |sk| to |out| under the i2d output convention, -// excluding the tag and length. It returns one on success and zero on error. -// |skcontlen| must be the total encoded size. If |do_sort| is non-zero, the -// elements are sorted for a SET OF type. Each element of |sk| has type -// |item|. -static int asn1_set_seq_out(STACK_OF(ASN1_VALUE) *sk, unsigned char **out, - int skcontlen, const ASN1_ITEM *item, int do_sort) { - // No need to sort if there are fewer than two items. - if (!do_sort || sk_ASN1_VALUE_num(sk) < 2) { - for (size_t i = 0; i < sk_ASN1_VALUE_num(sk); i++) { - ASN1_VALUE *skitem = sk_ASN1_VALUE_value(sk, i); - if (ASN1_item_ex_i2d(&skitem, out, item, -1, 0) < 0) { - return 0; - } - } - return 1; - } - - if (sk_ASN1_VALUE_num(sk) > ((size_t)-1) / sizeof(DER_ENC)) { - OPENSSL_PUT_ERROR(ASN1, ERR_R_OVERFLOW); - return 0; - } - - int ret = 0; - unsigned char *const buf = OPENSSL_malloc(skcontlen); - DER_ENC *encoded = OPENSSL_malloc(sk_ASN1_VALUE_num(sk) * sizeof(*encoded)); - if (encoded == NULL || buf == NULL) { - OPENSSL_PUT_ERROR(ASN1, ERR_R_MALLOC_FAILURE); - goto err; - } - - // Encode all the elements into |buf| and populate |encoded|. - unsigned char *p = buf; - for (size_t i = 0; i < sk_ASN1_VALUE_num(sk); i++) { - ASN1_VALUE *skitem = sk_ASN1_VALUE_value(sk, i); - encoded[i].data = p; - encoded[i].length = ASN1_item_ex_i2d(&skitem, &p, item, -1, 0); - if (encoded[i].length < 0) { - goto err; - } - assert(p - buf <= skcontlen); - } - - qsort(encoded, sk_ASN1_VALUE_num(sk), sizeof(*encoded), der_cmp); - - // Output the elements in sorted order. 
- p = *out; - for (size_t i = 0; i < sk_ASN1_VALUE_num(sk); i++) { - OPENSSL_memcpy(p, encoded[i].data, encoded[i].length); - p += encoded[i].length; - } - *out = p; - - ret = 1; - -err: - OPENSSL_free(encoded); - OPENSSL_free(buf); - return ret; -} - -// asn1_i2d_ex_primitive behaves like |ASN1_item_ex_i2d| but |item| must be a -// a PRIMITIVE or MSTRING type that is not an |ASN1_ITEM_TEMPLATE|. -static int asn1_i2d_ex_primitive(ASN1_VALUE **pval, unsigned char **out, - const ASN1_ITEM *it, int tag, int aclass, - int optional) { - // Get length of content octets and maybe find out the underlying type. - int omit; - int utype = it->utype; - int len = asn1_ex_i2c(pval, NULL, &omit, &utype, it); - if (len < 0) { - return -1; - } - if (omit) { - if (optional) { - return 0; - } - OPENSSL_PUT_ERROR(ASN1, ASN1_R_MISSING_VALUE); - return -1; - } - - // If SEQUENCE, SET or OTHER then header is included in pseudo content - // octets so don't include tag+length. We need to check here because the - // call to asn1_ex_i2c() could change utype. - int usetag = - utype != V_ASN1_SEQUENCE && utype != V_ASN1_SET && utype != V_ASN1_OTHER; - - // If not implicitly tagged get tag from underlying type - if (tag == -1) { - tag = utype; - } - - // Output tag+length followed by content octets - if (out) { - if (usetag) { - ASN1_put_object(out, /*constructed=*/0, len, tag, aclass); - } - int len2 = asn1_ex_i2c(pval, *out, &omit, &utype, it); - if (len2 < 0) { - return -1; - } - assert(len == len2); - assert(!omit); - *out += len; - } - - if (usetag) { - return ASN1_object_size(/*constructed=*/0, len, tag); - } - return len; -} - -// asn1_ex_i2c writes the |*pval| to |cout| under the i2d output convention, -// excluding the tag and length. It returns the number of bytes written, -// possibly zero, on success or -1 on error. If |*pval| should be omitted, it -// returns zero and sets |*out_omit| to true. 
-// -// If |it| is an MSTRING or ANY type, it gets the underlying type from |*pval|, -// which must be an |ASN1_STRING| or |ASN1_TYPE|, respectively. It then updates -// |*putype| with the tag number of type used, or |V_ASN1_OTHER| if it was not a -// universal type. If |*putype| is set to |V_ASN1_SEQUENCE|, |V_ASN1_SET|, or -// |V_ASN1_OTHER|, it additionally outputs the tag and length, so the caller -// must not do so. -// -// Otherwise, |*putype| must contain |it->utype|. -// -// WARNING: Unlike most functions in this file, |asn1_ex_i2c| can return zero -// without omitting the element. ASN.1 values may have empty contents. -static int asn1_ex_i2c(ASN1_VALUE **pval, unsigned char *cout, int *out_omit, - int *putype, const ASN1_ITEM *it) { - ASN1_BOOLEAN *tbool = NULL; - ASN1_STRING *strtmp; - ASN1_OBJECT *otmp; - int utype; - const unsigned char *cont; - unsigned char c; - int len; - - // Historically, |it->funcs| for primitive types contained an - // |ASN1_PRIMITIVE_FUNCS| table of callbacks. - assert(it->funcs == NULL); - - *out_omit = 0; - - // Should type be omitted? - if ((it->itype != ASN1_ITYPE_PRIMITIVE) || (it->utype != V_ASN1_BOOLEAN)) { - if (!*pval) { - *out_omit = 1; - return 0; - } - } - - if (it->itype == ASN1_ITYPE_MSTRING) { - // If MSTRING type set the underlying type - strtmp = (ASN1_STRING *)*pval; - utype = strtmp->type; - if (utype < 0 && utype != V_ASN1_OTHER) { - // MSTRINGs can have type -1 when default-constructed. - OPENSSL_PUT_ERROR(ASN1, ASN1_R_WRONG_TYPE); - return -1; - } - // Negative INTEGER and ENUMERATED values use |ASN1_STRING| type values - // that do not match their corresponding utype values. INTEGERs cannot - // participate in MSTRING types, but ENUMERATEDs can. - // - // TODO(davidben): Is this a bug? Although arguably one of the MSTRING - // types should contain more values, rather than less. See - // https://crbug.com/boringssl/412. 
But it is not possible to fit all - // possible ANY values into an |ASN1_STRING|, so matching the spec here - // is somewhat hopeless. - if (utype == V_ASN1_NEG_INTEGER) { - utype = V_ASN1_INTEGER; - } else if (utype == V_ASN1_NEG_ENUMERATED) { - utype = V_ASN1_ENUMERATED; - } - *putype = utype; - } else if (it->utype == V_ASN1_ANY) { - // If ANY set type and pointer to value - ASN1_TYPE *typ; - typ = (ASN1_TYPE *)*pval; - utype = typ->type; - if (utype < 0 && utype != V_ASN1_OTHER) { - // |ASN1_TYPE|s can have type -1 when default-constructed. - OPENSSL_PUT_ERROR(ASN1, ASN1_R_WRONG_TYPE); - return -1; - } - *putype = utype; - pval = &typ->value.asn1_value; - } else { - utype = *putype; - } - - switch (utype) { - case V_ASN1_OBJECT: - otmp = (ASN1_OBJECT *)*pval; - cont = otmp->data; - len = otmp->length; - if (len == 0) { - // Some |ASN1_OBJECT|s do not have OIDs and cannot be serialized. - OPENSSL_PUT_ERROR(ASN1, ASN1_R_ILLEGAL_OBJECT); - return -1; - } - break; - - case V_ASN1_NULL: - cont = NULL; - len = 0; - break; - - case V_ASN1_BOOLEAN: - tbool = (ASN1_BOOLEAN *)pval; - if (*tbool == -1) { - *out_omit = 1; - return 0; - } - if (it->utype != V_ASN1_ANY) { - // Default handling if value == size field then omit - if ((*tbool && (it->size > 0)) || (!*tbool && !it->size)) { - *out_omit = 1; - return 0; - } - } - c = *tbool ? 0xff : 0x00; - cont = &c; - len = 1; - break; - - case V_ASN1_BIT_STRING: { - int ret = - i2c_ASN1_BIT_STRING((ASN1_BIT_STRING *)*pval, cout ? &cout : NULL); - // |i2c_ASN1_BIT_STRING| returns zero on error instead of -1. - return ret <= 0 ? -1 : ret; - } - - case V_ASN1_INTEGER: - case V_ASN1_ENUMERATED: { - // |i2c_ASN1_INTEGER| also handles ENUMERATED. - int ret = i2c_ASN1_INTEGER((ASN1_INTEGER *)*pval, cout ? &cout : NULL); - // |i2c_ASN1_INTEGER| returns zero on error instead of -1. - return ret <= 0 ? 
-1 : ret; - } - - case V_ASN1_OCTET_STRING: - case V_ASN1_NUMERICSTRING: - case V_ASN1_PRINTABLESTRING: - case V_ASN1_T61STRING: - case V_ASN1_VIDEOTEXSTRING: - case V_ASN1_IA5STRING: - case V_ASN1_UTCTIME: - case V_ASN1_GENERALIZEDTIME: - case V_ASN1_GRAPHICSTRING: - case V_ASN1_VISIBLESTRING: - case V_ASN1_GENERALSTRING: - case V_ASN1_UNIVERSALSTRING: - case V_ASN1_BMPSTRING: - case V_ASN1_UTF8STRING: - case V_ASN1_SEQUENCE: - case V_ASN1_SET: - default: - // All based on ASN1_STRING and handled the same - strtmp = (ASN1_STRING *)*pval; - cont = strtmp->data; - len = strtmp->length; - - break; - } - if (cout && len) { - OPENSSL_memcpy(cout, cont, len); - } - return len; -} diff --git a/third_party/boringssl/src/crypto/asn1/tasn_enc.cc b/third_party/boringssl/src/crypto/asn1/tasn_enc.cc new file mode 100644 index 00000000..1d1ad97f --- /dev/null +++ b/third_party/boringssl/src/crypto/asn1/tasn_enc.cc @@ -0,0 +1,664 @@ +// Copyright 2000-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include +#include + +#include +#include + +#include "../internal.h" +#include "internal.h" + + +using namespace bssl; + +static int asn1_item_ex_i2d_opt(ASN1_VALUE **pval, unsigned char **out, + const ASN1_ITEM *it, int tag, int aclass, + int optional); +static int asn1_i2d_ex_primitive(ASN1_VALUE **pval, unsigned char **out, + const ASN1_ITEM *it, int tag, int aclass, + int optional); +static int asn1_ex_i2c(ASN1_VALUE **pval, unsigned char *cont, int *out_omit, + int *putype, const ASN1_ITEM *it); +static int asn1_set_seq_out(STACK_OF(ASN1_VALUE) *sk, unsigned char **out, + int skcontlen, const ASN1_ITEM *item, int do_sort); +static int asn1_template_ex_i2d(ASN1_VALUE **pval, unsigned char **out, + const ASN1_TEMPLATE *tt, int tag, int aclass, + int optional); + +// Top level i2d equivalents + +int ASN1_item_i2d(ASN1_VALUE *val, unsigned char **out, const ASN1_ITEM *it) { + if (out && !*out) { + unsigned char *p, *buf; + int len = ASN1_item_ex_i2d(&val, nullptr, it, /*tag=*/-1, /*aclass=*/0); + if (len <= 0) { + return len; + } + buf = reinterpret_cast(OPENSSL_malloc(len)); + if (!buf) { + return -1; + } + p = buf; + int len2 = ASN1_item_ex_i2d(&val, &p, it, /*tag=*/-1, /*aclass=*/0); + if (len2 <= 0) { + OPENSSL_free(buf); + return len2; + } + assert(len == len2); + *out = buf; + return len; + } + + return ASN1_item_ex_i2d(&val, out, it, /*tag=*/-1, /*aclass=*/0); +} + +// Encode an item, taking care of IMPLICIT tagging (if any). This function +// performs the normal item handling: it can be used in external types. + +int bssl::ASN1_item_ex_i2d(ASN1_VALUE **pval, unsigned char **out, + const ASN1_ITEM *it, int tag, int aclass) { + int ret = asn1_item_ex_i2d_opt(pval, out, it, tag, aclass, /*optional=*/0); + assert(ret != 0); + return ret; +} + +// asn1_item_ex_i2d_opt behaves like |ASN1_item_ex_i2d| but, if |optional| is +// non-zero and |*pval| is omitted, it returns zero and writes no bytes. 
+int asn1_item_ex_i2d_opt(ASN1_VALUE **pval, unsigned char **out, + const ASN1_ITEM *it, int tag, int aclass, + int optional) { + const ASN1_TEMPLATE *tt = nullptr; + int i, seqcontlen, seqlen; + + // Historically, |aclass| was repurposed to pass additional flags into the + // encoding process. + assert((aclass & ASN1_TFLG_TAG_CLASS) == aclass); + // If not overriding the tag, |aclass| is ignored and should be zero. + assert(tag != -1 || aclass == 0); + + // All fields are pointers, except for boolean |ASN1_ITYPE_PRIMITIVE|s. + // Optional primitives are handled later. + if ((it->itype != ASN1_ITYPE_PRIMITIVE) && !*pval) { + if (optional) { + return 0; + } + OPENSSL_PUT_ERROR(ASN1, ASN1_R_MISSING_VALUE); + return -1; + } + + switch (it->itype) { + case ASN1_ITYPE_PRIMITIVE: + if (it->templates) { + // This is an |ASN1_ITEM_TEMPLATE|. + if (it->templates->flags & ASN1_TFLG_OPTIONAL) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_BAD_TEMPLATE); + return -1; + } + return asn1_template_ex_i2d(pval, out, it->templates, tag, aclass, + optional); + } + return asn1_i2d_ex_primitive(pval, out, it, tag, aclass, optional); + + case ASN1_ITYPE_MSTRING: + // It never makes sense for multi-strings to have implicit tagging, so + // if tag != -1, then this looks like an error in the template. + if (tag != -1) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_BAD_TEMPLATE); + return -1; + } + return asn1_i2d_ex_primitive(pval, out, it, -1, 0, optional); + + case ASN1_ITYPE_CHOICE: { + // It never makes sense for CHOICE types to have implicit tagging, so if + // tag != -1, then this looks like an error in the template. 
+ if (tag != -1) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_BAD_TEMPLATE); + return -1; + } + i = asn1_get_choice_selector(pval, it); + if (i < 0 || i >= it->tcount) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_NO_MATCHING_CHOICE_TYPE); + return -1; + } + const ASN1_TEMPLATE *chtt = it->templates + i; + if (chtt->flags & ASN1_TFLG_OPTIONAL) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_BAD_TEMPLATE); + return -1; + } + ASN1_VALUE **pchval = asn1_get_field_ptr(pval, chtt); + return asn1_template_ex_i2d(pchval, out, chtt, -1, 0, /*optional=*/0); + } + + case ASN1_ITYPE_EXTERN: { + // We don't support implicit tagging with external types. + if (tag != -1) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_BAD_TEMPLATE); + return -1; + } + const ASN1_EXTERN_FUNCS *ef = + reinterpret_cast(it->funcs); + int ret = ef->asn1_ex_i2d(pval, out, it); + if (ret == 0) { + // |asn1_ex_i2d| should never return zero. We have already checked + // for optional values generically, and |ASN1_ITYPE_EXTERN| fields + // must be pointers. + OPENSSL_PUT_ERROR(ASN1, ERR_R_INTERNAL_ERROR); + return -1; + } + return ret; + } + + case ASN1_ITYPE_SEQUENCE: { + i = asn1_enc_restore(&seqcontlen, out, pval, it); + // An error occurred + if (i < 0) { + return -1; + } + // We have a valid cached encoding... 
+ if (i > 0) { + return seqcontlen; + } + // Otherwise carry on + seqcontlen = 0; + // If no IMPLICIT tagging set to SEQUENCE, UNIVERSAL + if (tag == -1) { + tag = V_ASN1_SEQUENCE; + aclass = V_ASN1_UNIVERSAL; + } + // First work out sequence content length + for (i = 0, tt = it->templates; i < it->tcount; tt++, i++) { + const ASN1_TEMPLATE *seqtt; + ASN1_VALUE **pseqval; + int tmplen; + seqtt = asn1_do_adb(pval, tt, 1); + if (!seqtt) { + return -1; + } + pseqval = asn1_get_field_ptr(pval, seqtt); + tmplen = asn1_template_ex_i2d(pseqval, nullptr, seqtt, -1, 0, + /*optional=*/0); + if (tmplen == -1 || (tmplen > INT_MAX - seqcontlen)) { + return -1; + } + seqcontlen += tmplen; + } + + seqlen = ASN1_object_size(/*constructed=*/1, seqcontlen, tag); + if (!out || seqlen == -1) { + return seqlen; + } + // Output SEQUENCE header + ASN1_put_object(out, /*constructed=*/1, seqcontlen, tag, aclass); + for (i = 0, tt = it->templates; i < it->tcount; tt++, i++) { + const ASN1_TEMPLATE *seqtt; + ASN1_VALUE **pseqval; + seqtt = asn1_do_adb(pval, tt, 1); + if (!seqtt) { + return -1; + } + pseqval = asn1_get_field_ptr(pval, seqtt); + if (asn1_template_ex_i2d(pseqval, out, seqtt, -1, 0, /*optional=*/0) < + 0) { + return -1; + } + } + return seqlen; + } + + default: + OPENSSL_PUT_ERROR(ASN1, ASN1_R_BAD_TEMPLATE); + return -1; + } +} + +// asn1_template_ex_i2d behaves like |asn1_item_ex_i2d_opt| but uses an +// |ASN1_TEMPLATE| instead of an |ASN1_ITEM|. An |ASN1_TEMPLATE| wraps an +// |ASN1_ITEM| with modifiers such as tagging, SEQUENCE or SET, etc. +static int asn1_template_ex_i2d(ASN1_VALUE **pval, unsigned char **out, + const ASN1_TEMPLATE *tt, int tag, int iclass, + int optional) { + int i, ret, ttag, tclass; + size_t j; + uint32_t flags = tt->flags; + + // Historically, |iclass| was repurposed to pass additional flags into the + // encoding process. + assert((iclass & ASN1_TFLG_TAG_CLASS) == iclass); + // If not overriding the tag, |iclass| is ignored and should be zero. 
+ assert(tag != -1 || iclass == 0); + + // Work out tag and class to use: tagging may come either from the + // template or the arguments, not both because this would create + // ambiguity. + if (flags & ASN1_TFLG_TAG_MASK) { + // Error if argument and template tagging + if (tag != -1) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_BAD_TEMPLATE); + return -1; + } + // Get tagging from template + ttag = tt->tag; + tclass = flags & ASN1_TFLG_TAG_CLASS; + } else if (tag != -1) { + // No template tagging, get from arguments + ttag = tag; + tclass = iclass & ASN1_TFLG_TAG_CLASS; + } else { + ttag = -1; + tclass = 0; + } + + // The template may itself by marked as optional, or this may be the template + // of an |ASN1_ITEM_TEMPLATE| type which was contained inside an outer + // optional template. (They cannot both be true because the + // |ASN1_ITEM_TEMPLATE| codepath rejects optional templates.) + assert(!optional || (flags & ASN1_TFLG_OPTIONAL) == 0); + optional = optional || (flags & ASN1_TFLG_OPTIONAL) != 0; + + // At this point 'ttag' contains the outer tag to use, and 'tclass' is the + // class. + + if (flags & ASN1_TFLG_SK_MASK) { + // SET OF, SEQUENCE OF + STACK_OF(ASN1_VALUE) *sk = (STACK_OF(ASN1_VALUE) *)*pval; + int isset, sktag, skaclass; + int skcontlen, sklen; + ASN1_VALUE *skitem; + + if (!*pval) { + if (optional) { + return 0; + } + OPENSSL_PUT_ERROR(ASN1, ASN1_R_MISSING_VALUE); + return -1; + } + + if (flags & ASN1_TFLG_SET_OF) { + isset = 1; + // Historically, types with both bits set were mutated when + // serialized to apply the sort. We no longer support this. + assert((flags & ASN1_TFLG_SEQUENCE_OF) == 0); + } else { + isset = 0; + } + + // Work out inner tag value: if EXPLICIT or no tagging use underlying + // type. 
+ if ((ttag != -1) && !(flags & ASN1_TFLG_EXPTAG)) { + sktag = ttag; + skaclass = tclass; + } else { + skaclass = V_ASN1_UNIVERSAL; + if (isset) { + sktag = V_ASN1_SET; + } else { + sktag = V_ASN1_SEQUENCE; + } + } + + // Determine total length of items + skcontlen = 0; + for (j = 0; j < sk_ASN1_VALUE_num(sk); j++) { + int tmplen; + skitem = sk_ASN1_VALUE_value(sk, j); + tmplen = + ASN1_item_ex_i2d(&skitem, nullptr, ASN1_ITEM_ptr(tt->item), -1, 0); + if (tmplen == -1 || (skcontlen > INT_MAX - tmplen)) { + return -1; + } + skcontlen += tmplen; + } + sklen = ASN1_object_size(/*constructed=*/1, skcontlen, sktag); + if (sklen == -1) { + return -1; + } + // If EXPLICIT need length of surrounding tag + if (flags & ASN1_TFLG_EXPTAG) { + ret = ASN1_object_size(/*constructed=*/1, sklen, ttag); + } else { + ret = sklen; + } + + if (!out || ret == -1) { + return ret; + } + + // Now encode this lot... + // EXPLICIT tag + if (flags & ASN1_TFLG_EXPTAG) { + ASN1_put_object(out, /*constructed=*/1, sklen, ttag, tclass); + } + // SET or SEQUENCE and IMPLICIT tag + ASN1_put_object(out, /*constructed=*/1, skcontlen, sktag, skaclass); + // And the stuff itself + if (!asn1_set_seq_out(sk, out, skcontlen, ASN1_ITEM_ptr(tt->item), isset)) { + return -1; + } + return ret; + } + + if (flags & ASN1_TFLG_EXPTAG) { + // EXPLICIT tagging + // Find length of tagged item + i = asn1_item_ex_i2d_opt(pval, nullptr, ASN1_ITEM_ptr(tt->item), -1, 0, + optional); + if (i <= 0) { + return i; + } + // Find length of EXPLICIT tag + ret = ASN1_object_size(/*constructed=*/1, i, ttag); + if (out && ret != -1) { + // Output tag and item + ASN1_put_object(out, /*constructed=*/1, i, ttag, tclass); + if (ASN1_item_ex_i2d(pval, out, ASN1_ITEM_ptr(tt->item), -1, 0) < 0) { + return -1; + } + } + return ret; + } + + // Either normal or IMPLICIT tagging + return asn1_item_ex_i2d_opt(pval, out, ASN1_ITEM_ptr(tt->item), ttag, tclass, + optional); +} + +// Temporary structure used to hold DER encoding of items for SET OF 
+ +typedef struct { + unsigned char *data; + int length; +} DER_ENC; + +static int der_cmp(const void *a, const void *b) { + const DER_ENC *d1 = reinterpret_cast(a), + *d2 = reinterpret_cast(b); + int cmplen, i; + cmplen = (d1->length < d2->length) ? d1->length : d2->length; + i = OPENSSL_memcmp(d1->data, d2->data, cmplen); + if (i) { + return i; + } + return d1->length - d2->length; +} + +// asn1_set_seq_out writes |sk| to |out| under the i2d output convention, +// excluding the tag and length. It returns one on success and zero on error. +// |skcontlen| must be the total encoded size. If |do_sort| is non-zero, the +// elements are sorted for a SET OF type. Each element of |sk| has type +// |item|. +static int asn1_set_seq_out(STACK_OF(ASN1_VALUE) *sk, unsigned char **out, + int skcontlen, const ASN1_ITEM *item, int do_sort) { + // No need to sort if there are fewer than two items. + if (!do_sort || sk_ASN1_VALUE_num(sk) < 2) { + for (size_t i = 0; i < sk_ASN1_VALUE_num(sk); i++) { + ASN1_VALUE *skitem = sk_ASN1_VALUE_value(sk, i); + if (ASN1_item_ex_i2d(&skitem, out, item, -1, 0) < 0) { + return 0; + } + } + return 1; + } + + int ret = 0; + uint8_t *const buf = reinterpret_cast(OPENSSL_malloc(skcontlen)); + DER_ENC *encoded = reinterpret_cast( + OPENSSL_calloc(sk_ASN1_VALUE_num(sk), sizeof(*encoded))); + uint8_t *p = buf; + if (encoded == nullptr || buf == nullptr) { + goto err; + } + + // Encode all the elements into |buf| and populate |encoded|. + for (size_t i = 0; i < sk_ASN1_VALUE_num(sk); i++) { + ASN1_VALUE *skitem = sk_ASN1_VALUE_value(sk, i); + encoded[i].data = p; + encoded[i].length = ASN1_item_ex_i2d(&skitem, &p, item, -1, 0); + if (encoded[i].length < 0) { + goto err; + } + assert(p - buf <= skcontlen); + } + + qsort(encoded, sk_ASN1_VALUE_num(sk), sizeof(*encoded), der_cmp); + + // Output the elements in sorted order. 
+ p = *out; + for (size_t i = 0; i < sk_ASN1_VALUE_num(sk); i++) { + OPENSSL_memcpy(p, encoded[i].data, encoded[i].length); + p += encoded[i].length; + } + *out = p; + + ret = 1; + +err: + OPENSSL_free(encoded); + OPENSSL_free(buf); + return ret; +} + +// asn1_i2d_ex_primitive behaves like |ASN1_item_ex_i2d| but |item| must be a +// a PRIMITIVE or MSTRING type that is not an |ASN1_ITEM_TEMPLATE|. +static int asn1_i2d_ex_primitive(ASN1_VALUE **pval, unsigned char **out, + const ASN1_ITEM *it, int tag, int aclass, + int optional) { + // Get length of content octets and maybe find out the underlying type. + int omit; + int utype = it->utype; + int len = asn1_ex_i2c(pval, nullptr, &omit, &utype, it); + if (len < 0) { + return -1; + } + if (omit) { + if (optional) { + return 0; + } + OPENSSL_PUT_ERROR(ASN1, ASN1_R_MISSING_VALUE); + return -1; + } + + // If SEQUENCE, SET or OTHER then header is included in pseudo content + // octets so don't include tag+length. We need to check here because the + // call to asn1_ex_i2c() could change utype. + int usetag = + utype != V_ASN1_SEQUENCE && utype != V_ASN1_SET && utype != V_ASN1_OTHER; + + // If not implicitly tagged get tag from underlying type + if (tag == -1) { + tag = utype; + } + + // Output tag+length followed by content octets + if (out) { + if (usetag) { + ASN1_put_object(out, /*constructed=*/0, len, tag, aclass); + } + int len2 = asn1_ex_i2c(pval, *out, &omit, &utype, it); + if (len2 < 0) { + return -1; + } + assert(len == len2); + assert(!omit); + *out += len; + } + + if (usetag) { + return ASN1_object_size(/*constructed=*/0, len, tag); + } + return len; +} + +// asn1_ex_i2c writes the |*pval| to |cout| under the i2d output convention, +// excluding the tag and length. It returns the number of bytes written, +// possibly zero, on success or -1 on error. If |*pval| should be omitted, it +// returns zero and sets |*out_omit| to true. 
+// +// If |it| is an MSTRING or ANY type, it gets the underlying type from |*pval|, +// which must be an |ASN1_STRING| or |ASN1_TYPE|, respectively. It then updates +// |*putype| with the tag number of type used, or |V_ASN1_OTHER| if it was not a +// universal type. If |*putype| is set to |V_ASN1_SEQUENCE|, |V_ASN1_SET|, or +// |V_ASN1_OTHER|, it additionally outputs the tag and length, so the caller +// must not do so. +// +// Otherwise, |*putype| must contain |it->utype|. +// +// WARNING: Unlike most functions in this file, |asn1_ex_i2c| can return zero +// without omitting the element. ASN.1 values may have empty contents. +static int asn1_ex_i2c(ASN1_VALUE **pval, unsigned char *cout, int *out_omit, + int *putype, const ASN1_ITEM *it) { + ASN1_BOOLEAN *tbool = nullptr; + ASN1_STRING *strtmp; + ASN1_OBJECT *otmp; + int utype; + const unsigned char *cont; + unsigned char c; + int len; + + assert(it->itype == ASN1_ITYPE_PRIMITIVE || it->itype == ASN1_ITYPE_MSTRING); + // Historically, |it->funcs| for primitive types contained an + // |ASN1_PRIMITIVE_FUNCS| table of callbacks. + assert(it->funcs == nullptr); + + *out_omit = 0; + + // Handle omitted optional values for all but BOOLEAN, which uses a + // non-pointer representation. + if (it->itype != ASN1_ITYPE_PRIMITIVE || it->utype != V_ASN1_BOOLEAN) { + if (!*pval) { + *out_omit = 1; + return 0; + } + } + + if (it->itype == ASN1_ITYPE_MSTRING) { + // If MSTRING type set the underlying type + strtmp = (ASN1_STRING *)*pval; + utype = strtmp->type; + if (utype < 0 && utype != V_ASN1_OTHER) { + // MSTRINGs can have type -1 when default-constructed. + OPENSSL_PUT_ERROR(ASN1, ASN1_R_WRONG_TYPE); + return -1; + } + // Negative INTEGER and ENUMERATED values use |ASN1_STRING| type values that + // do not match their corresponding utype values. 
+ if (utype == V_ASN1_NEG_INTEGER) { + utype = V_ASN1_INTEGER; + } else if (utype == V_ASN1_NEG_ENUMERATED) { + utype = V_ASN1_ENUMERATED; + } + *putype = utype; + } else if (it->utype == V_ASN1_ANY) { + // If ANY set type and pointer to value + ASN1_TYPE *typ; + typ = (ASN1_TYPE *)*pval; + utype = typ->type; + if (utype < 0 && utype != V_ASN1_OTHER) { + // |ASN1_TYPE|s can have type -1 when default-constructed. + OPENSSL_PUT_ERROR(ASN1, ASN1_R_WRONG_TYPE); + return -1; + } + *putype = utype; + pval = &typ->value.asn1_value; + } else { + utype = *putype; + } + + switch (utype) { + case V_ASN1_OBJECT: + otmp = (ASN1_OBJECT *)*pval; + cont = otmp->data; + len = otmp->length; + if (len == 0) { + // Some |ASN1_OBJECT|s do not have OIDs and cannot be serialized. + OPENSSL_PUT_ERROR(ASN1, ASN1_R_ILLEGAL_OBJECT); + return -1; + } + break; + + case V_ASN1_NULL: + cont = nullptr; + len = 0; + break; + + case V_ASN1_BOOLEAN: + tbool = (ASN1_BOOLEAN *)pval; + if (*tbool == ASN1_BOOLEAN_NONE) { + *out_omit = 1; + return 0; + } + if (it->utype != V_ASN1_ANY) { + // Default handling if value == size field then omit + if ((*tbool && (it->size > 0)) || (!*tbool && !it->size)) { + *out_omit = 1; + return 0; + } + } + c = *tbool ? 0xff : 0x00; + cont = &c; + len = 1; + break; + + case V_ASN1_BIT_STRING: { + int ret = + i2c_ASN1_BIT_STRING((ASN1_BIT_STRING *)*pval, cout ? &cout : nullptr); + // |i2c_ASN1_BIT_STRING| returns zero on error instead of -1. + return ret <= 0 ? -1 : ret; + } + + case V_ASN1_INTEGER: + case V_ASN1_ENUMERATED: { + // |i2c_ASN1_INTEGER| also handles ENUMERATED. + int ret = i2c_ASN1_INTEGER((ASN1_INTEGER *)*pval, cout ? &cout : nullptr); + // |i2c_ASN1_INTEGER| returns zero on error instead of -1. + return ret <= 0 ? 
-1 : ret; + } + + case V_ASN1_OCTET_STRING: + case V_ASN1_NUMERICSTRING: + case V_ASN1_PRINTABLESTRING: + case V_ASN1_T61STRING: + case V_ASN1_VIDEOTEXSTRING: + case V_ASN1_IA5STRING: + case V_ASN1_UTCTIME: + case V_ASN1_GENERALIZEDTIME: + case V_ASN1_GRAPHICSTRING: + case V_ASN1_VISIBLESTRING: + case V_ASN1_GENERALSTRING: + case V_ASN1_UNIVERSALSTRING: + case V_ASN1_BMPSTRING: + case V_ASN1_UTF8STRING: + case V_ASN1_SEQUENCE: + case V_ASN1_SET: + // This is not a valid |ASN1_ITEM| type, but it appears in |ASN1_TYPE|. + case V_ASN1_OTHER: + // All based on ASN1_STRING and handled the same + strtmp = (ASN1_STRING *)*pval; + cont = strtmp->data; + len = strtmp->length; + break; + + default: + OPENSSL_PUT_ERROR(ASN1, ASN1_R_BAD_TEMPLATE); + return -1; + } + if (cout && len) { + OPENSSL_memcpy(cout, cont, len); + } + return len; +} diff --git a/third_party/boringssl/src/crypto/asn1/tasn_fre.c b/third_party/boringssl/src/crypto/asn1/tasn_fre.c deleted file mode 100644 index 3da1fa64..00000000 --- a/third_party/boringssl/src/crypto/asn1/tasn_fre.c +++ /dev/null @@ -1,241 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. 
- * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include - -#include -#include - -#include "internal.h" - -// Free up an ASN1 structure - -void ASN1_item_free(ASN1_VALUE *val, const ASN1_ITEM *it) { - asn1_item_combine_free(&val, it, 0); -} - -void ASN1_item_ex_free(ASN1_VALUE **pval, const ASN1_ITEM *it) { - asn1_item_combine_free(pval, it, 0); -} - -void asn1_item_combine_free(ASN1_VALUE **pval, const ASN1_ITEM *it, - int combine) { - const ASN1_TEMPLATE *tt = NULL, *seqtt; - const ASN1_EXTERN_FUNCS *ef; - int i; - if (!pval) { - return; - } - if ((it->itype != ASN1_ITYPE_PRIMITIVE) && !*pval) { - return; - } - - switch (it->itype) { - case ASN1_ITYPE_PRIMITIVE: - if (it->templates) { - ASN1_template_free(pval, it->templates); - } else { - ASN1_primitive_free(pval, it); - } - break; - - case ASN1_ITYPE_MSTRING: - ASN1_primitive_free(pval, it); - break; - - case ASN1_ITYPE_CHOICE: { - const ASN1_AUX *aux = it->funcs; - ASN1_aux_cb *asn1_cb = aux != NULL ? 
aux->asn1_cb : NULL; - if (asn1_cb) { - i = asn1_cb(ASN1_OP_FREE_PRE, pval, it, NULL); - if (i == 2) { - return; - } - } - i = asn1_get_choice_selector(pval, it); - if ((i >= 0) && (i < it->tcount)) { - ASN1_VALUE **pchval; - tt = it->templates + i; - pchval = asn1_get_field_ptr(pval, tt); - ASN1_template_free(pchval, tt); - } - if (asn1_cb) { - asn1_cb(ASN1_OP_FREE_POST, pval, it, NULL); - } - if (!combine) { - OPENSSL_free(*pval); - *pval = NULL; - } - break; - } - - case ASN1_ITYPE_EXTERN: - ef = it->funcs; - if (ef && ef->asn1_ex_free) { - ef->asn1_ex_free(pval, it); - } - break; - - case ASN1_ITYPE_SEQUENCE: { - if (!asn1_refcount_dec_and_test_zero(pval, it)) { - return; - } - const ASN1_AUX *aux = it->funcs; - ASN1_aux_cb *asn1_cb = aux != NULL ? aux->asn1_cb : NULL; - if (asn1_cb) { - i = asn1_cb(ASN1_OP_FREE_PRE, pval, it, NULL); - if (i == 2) { - return; - } - } - asn1_enc_free(pval, it); - // If we free up as normal we will invalidate any ANY DEFINED BY - // field and we wont be able to determine the type of the field it - // defines. So free up in reverse order. 
- tt = it->templates + it->tcount - 1; - for (i = 0; i < it->tcount; tt--, i++) { - ASN1_VALUE **pseqval; - seqtt = asn1_do_adb(pval, tt, 0); - if (!seqtt) { - continue; - } - pseqval = asn1_get_field_ptr(pval, seqtt); - ASN1_template_free(pseqval, seqtt); - } - if (asn1_cb) { - asn1_cb(ASN1_OP_FREE_POST, pval, it, NULL); - } - if (!combine) { - OPENSSL_free(*pval); - *pval = NULL; - } - break; - } - } -} - -void ASN1_template_free(ASN1_VALUE **pval, const ASN1_TEMPLATE *tt) { - size_t i; - if (tt->flags & ASN1_TFLG_SK_MASK) { - STACK_OF(ASN1_VALUE) *sk = (STACK_OF(ASN1_VALUE) *)*pval; - for (i = 0; i < sk_ASN1_VALUE_num(sk); i++) { - ASN1_VALUE *vtmp; - vtmp = sk_ASN1_VALUE_value(sk, i); - asn1_item_combine_free(&vtmp, ASN1_ITEM_ptr(tt->item), 0); - } - sk_ASN1_VALUE_free(sk); - *pval = NULL; - } else { - asn1_item_combine_free(pval, ASN1_ITEM_ptr(tt->item), - tt->flags & ASN1_TFLG_COMBINE); - } -} - -void ASN1_primitive_free(ASN1_VALUE **pval, const ASN1_ITEM *it) { - int utype; - // Historically, |it->funcs| for primitive types contained an - // |ASN1_PRIMITIVE_FUNCS| table of calbacks. 
- assert(it == NULL || it->funcs == NULL); - // Special case: if 'it' is NULL free contents of ASN1_TYPE - if (!it) { - ASN1_TYPE *typ = (ASN1_TYPE *)*pval; - utype = typ->type; - pval = &typ->value.asn1_value; - if (utype != V_ASN1_BOOLEAN && !*pval) { - return; - } - } else if (it->itype == ASN1_ITYPE_MSTRING) { - utype = -1; - if (!*pval) { - return; - } - } else { - utype = it->utype; - if ((utype != V_ASN1_BOOLEAN) && !*pval) { - return; - } - } - - switch (utype) { - case V_ASN1_OBJECT: - ASN1_OBJECT_free((ASN1_OBJECT *)*pval); - break; - - case V_ASN1_BOOLEAN: - if (it) { - *(ASN1_BOOLEAN *)pval = it->size; - } else { - *(ASN1_BOOLEAN *)pval = -1; - } - return; - - case V_ASN1_NULL: - break; - - case V_ASN1_ANY: - ASN1_primitive_free(pval, NULL); - OPENSSL_free(*pval); - break; - - default: - ASN1_STRING_free((ASN1_STRING *)*pval); - *pval = NULL; - break; - } - *pval = NULL; -} diff --git a/third_party/boringssl/src/crypto/asn1/tasn_fre.cc b/third_party/boringssl/src/crypto/asn1/tasn_fre.cc new file mode 100644 index 00000000..7b5164bd --- /dev/null +++ b/third_party/boringssl/src/crypto/asn1/tasn_fre.cc @@ -0,0 +1,167 @@ +// Copyright 2000-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include + +#include +#include + +#include "internal.h" + + +using namespace bssl; + +// Free up an ASN1 structure + +void ASN1_item_free(ASN1_VALUE *val, const ASN1_ITEM *it) { + ASN1_item_ex_free(&val, it); +} + +void bssl::ASN1_item_ex_free(ASN1_VALUE **pval, const ASN1_ITEM *it) { + if (!pval) { + return; + } + if ((it->itype != ASN1_ITYPE_PRIMITIVE) && !*pval) { + return; + } + + switch (it->itype) { + case ASN1_ITYPE_PRIMITIVE: + if (it->templates) { + ASN1_template_free(pval, it->templates); + } else { + ASN1_primitive_free(pval, it); + } + break; + + case ASN1_ITYPE_MSTRING: + ASN1_primitive_free(pval, it); + break; + + case ASN1_ITYPE_CHOICE: { + const ASN1_AUX *aux = reinterpret_cast(it->funcs); + ASN1_aux_cb *asn1_cb = aux != nullptr ? aux->asn1_cb : nullptr; + if (asn1_cb) { + if (asn1_cb(ASN1_OP_FREE_PRE, pval, it, nullptr) == 2) { + return; + } + } + int i = asn1_get_choice_selector(pval, it); + if ((i >= 0) && (i < it->tcount)) { + const ASN1_TEMPLATE *tt = it->templates + i; + ASN1_VALUE **pchval = asn1_get_field_ptr(pval, tt); + ASN1_template_free(pchval, tt); + } + if (asn1_cb) { + asn1_cb(ASN1_OP_FREE_POST, pval, it, nullptr); + } + OPENSSL_free(*pval); + *pval = nullptr; + break; + } + + case ASN1_ITYPE_EXTERN: { + const ASN1_EXTERN_FUNCS *ef = + reinterpret_cast(it->funcs); + if (ef && ef->asn1_ex_free) { + ef->asn1_ex_free(pval, it); + } + break; + } + + case ASN1_ITYPE_SEQUENCE: { + if (!asn1_refcount_dec_and_test_zero(pval, it)) { + return; + } + const ASN1_AUX *aux = reinterpret_cast(it->funcs); + ASN1_aux_cb *asn1_cb = aux != nullptr ? aux->asn1_cb : nullptr; + if (asn1_cb) { + if (asn1_cb(ASN1_OP_FREE_PRE, pval, it, nullptr) == 2) { + return; + } + } + asn1_enc_free(pval, it); + // If we free up as normal we will invalidate any ANY DEFINED BY + // field and we won't be able to determine the type of the field it + // defines. So free up in reverse order. 
+ for (int i = it->tcount - 1; i >= 0; i--) { + const ASN1_TEMPLATE *seqtt = asn1_do_adb(pval, &it->templates[i], 0); + if (!seqtt) { + continue; + } + ASN1_VALUE **pseqval = asn1_get_field_ptr(pval, seqtt); + ASN1_template_free(pseqval, seqtt); + } + if (asn1_cb) { + asn1_cb(ASN1_OP_FREE_POST, pval, it, nullptr); + } + OPENSSL_free(*pval); + *pval = nullptr; + break; + } + } +} + +void bssl::ASN1_template_free(ASN1_VALUE **pval, const ASN1_TEMPLATE *tt) { + if (tt->flags & ASN1_TFLG_SK_MASK) { + STACK_OF(ASN1_VALUE) *sk = (STACK_OF(ASN1_VALUE) *)*pval; + for (size_t i = 0; i < sk_ASN1_VALUE_num(sk); i++) { + ASN1_VALUE *vtmp = sk_ASN1_VALUE_value(sk, i); + ASN1_item_ex_free(&vtmp, ASN1_ITEM_ptr(tt->item)); + } + sk_ASN1_VALUE_free(sk); + *pval = nullptr; + } else { + ASN1_item_ex_free(pval, ASN1_ITEM_ptr(tt->item)); + } +} + +void bssl::ASN1_primitive_free(ASN1_VALUE **pval, const ASN1_ITEM *it) { + // Historically, |it->funcs| for primitive types contained an + // |ASN1_PRIMITIVE_FUNCS| table of callbacks. + assert(it->funcs == nullptr); + + int utype = it->itype == ASN1_ITYPE_MSTRING ? -1 : it->utype; + switch (utype) { + case V_ASN1_OBJECT: + ASN1_OBJECT_free((ASN1_OBJECT *)*pval); + break; + + case V_ASN1_BOOLEAN: + if (it) { + *(ASN1_BOOLEAN *)pval = (ASN1_BOOLEAN)it->size; + } else { + *(ASN1_BOOLEAN *)pval = ASN1_BOOLEAN_NONE; + } + return; + + case V_ASN1_NULL: + break; + + case V_ASN1_ANY: + if (*pval != nullptr) { + asn1_type_cleanup((ASN1_TYPE *)*pval); + OPENSSL_free(*pval); + } + break; + + default: + ASN1_STRING_free((ASN1_STRING *)*pval); + *pval = nullptr; + break; + } + *pval = nullptr; +} diff --git a/third_party/boringssl/src/crypto/asn1/tasn_new.c b/third_party/boringssl/src/crypto/asn1/tasn_new.c deleted file mode 100644 index 97411c53..00000000 --- a/third_party/boringssl/src/crypto/asn1/tasn_new.c +++ /dev/null @@ -1,340 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. 
- * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. 
If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
*/ - -#include - -#include - -#include -#include -#include -#include - -#include "../internal.h" -#include "internal.h" - - -static int asn1_item_ex_combine_new(ASN1_VALUE **pval, const ASN1_ITEM *it, - int combine); -static void asn1_item_clear(ASN1_VALUE **pval, const ASN1_ITEM *it); -static int ASN1_template_new(ASN1_VALUE **pval, const ASN1_TEMPLATE *tt); -static void asn1_template_clear(ASN1_VALUE **pval, const ASN1_TEMPLATE *tt); -static int ASN1_primitive_new(ASN1_VALUE **pval, const ASN1_ITEM *it); -static void asn1_primitive_clear(ASN1_VALUE **pval, const ASN1_ITEM *it); - -ASN1_VALUE *ASN1_item_new(const ASN1_ITEM *it) { - ASN1_VALUE *ret = NULL; - if (ASN1_item_ex_new(&ret, it) > 0) { - return ret; - } - return NULL; -} - -// Allocate an ASN1 structure - -int ASN1_item_ex_new(ASN1_VALUE **pval, const ASN1_ITEM *it) { - return asn1_item_ex_combine_new(pval, it, 0); -} - -static int asn1_item_ex_combine_new(ASN1_VALUE **pval, const ASN1_ITEM *it, - int combine) { - const ASN1_TEMPLATE *tt = NULL; - const ASN1_EXTERN_FUNCS *ef; - ASN1_VALUE **pseqval; - int i; - - switch (it->itype) { - case ASN1_ITYPE_EXTERN: - ef = it->funcs; - if (ef && ef->asn1_ex_new) { - if (!ef->asn1_ex_new(pval, it)) { - goto memerr; - } - } - break; - - case ASN1_ITYPE_PRIMITIVE: - if (it->templates) { - if (!ASN1_template_new(pval, it->templates)) { - goto memerr; - } - } else if (!ASN1_primitive_new(pval, it)) { - goto memerr; - } - break; - - case ASN1_ITYPE_MSTRING: - if (!ASN1_primitive_new(pval, it)) { - goto memerr; - } - break; - - case ASN1_ITYPE_CHOICE: { - const ASN1_AUX *aux = it->funcs; - ASN1_aux_cb *asn1_cb = aux != NULL ? 
aux->asn1_cb : NULL; - if (asn1_cb) { - i = asn1_cb(ASN1_OP_NEW_PRE, pval, it, NULL); - if (!i) { - goto auxerr; - } - if (i == 2) { - return 1; - } - } - if (!combine) { - *pval = OPENSSL_malloc(it->size); - if (!*pval) { - goto memerr; - } - OPENSSL_memset(*pval, 0, it->size); - } - asn1_set_choice_selector(pval, -1, it); - if (asn1_cb && !asn1_cb(ASN1_OP_NEW_POST, pval, it, NULL)) { - goto auxerr2; - } - break; - } - - case ASN1_ITYPE_SEQUENCE: { - const ASN1_AUX *aux = it->funcs; - ASN1_aux_cb *asn1_cb = aux != NULL ? aux->asn1_cb : NULL; - if (asn1_cb) { - i = asn1_cb(ASN1_OP_NEW_PRE, pval, it, NULL); - if (!i) { - goto auxerr; - } - if (i == 2) { - return 1; - } - } - if (!combine) { - *pval = OPENSSL_malloc(it->size); - if (!*pval) { - goto memerr; - } - OPENSSL_memset(*pval, 0, it->size); - asn1_refcount_set_one(pval, it); - asn1_enc_init(pval, it); - } - for (i = 0, tt = it->templates; i < it->tcount; tt++, i++) { - pseqval = asn1_get_field_ptr(pval, tt); - if (!ASN1_template_new(pseqval, tt)) { - goto memerr2; - } - } - if (asn1_cb && !asn1_cb(ASN1_OP_NEW_POST, pval, it, NULL)) { - goto auxerr2; - } - break; - } - } - return 1; - -memerr2: - asn1_item_combine_free(pval, it, combine); -memerr: - OPENSSL_PUT_ERROR(ASN1, ERR_R_MALLOC_FAILURE); - return 0; - -auxerr2: - asn1_item_combine_free(pval, it, combine); -auxerr: - OPENSSL_PUT_ERROR(ASN1, ASN1_R_AUX_ERROR); - return 0; -} - -static void asn1_item_clear(ASN1_VALUE **pval, const ASN1_ITEM *it) { - const ASN1_EXTERN_FUNCS *ef; - - switch (it->itype) { - case ASN1_ITYPE_EXTERN: - ef = it->funcs; - if (ef && ef->asn1_ex_clear) { - ef->asn1_ex_clear(pval, it); - } else { - *pval = NULL; - } - break; - - case ASN1_ITYPE_PRIMITIVE: - if (it->templates) { - asn1_template_clear(pval, it->templates); - } else { - asn1_primitive_clear(pval, it); - } - break; - - case ASN1_ITYPE_MSTRING: - asn1_primitive_clear(pval, it); - break; - - case ASN1_ITYPE_CHOICE: - case ASN1_ITYPE_SEQUENCE: - *pval = NULL; - break; - } 
-} - -static int ASN1_template_new(ASN1_VALUE **pval, const ASN1_TEMPLATE *tt) { - const ASN1_ITEM *it = ASN1_ITEM_ptr(tt->item); - int ret; - if (tt->flags & ASN1_TFLG_OPTIONAL) { - asn1_template_clear(pval, tt); - return 1; - } - // If ANY DEFINED BY nothing to do - - if (tt->flags & ASN1_TFLG_ADB_MASK) { - *pval = NULL; - return 1; - } - // If SET OF or SEQUENCE OF, its a STACK - if (tt->flags & ASN1_TFLG_SK_MASK) { - STACK_OF(ASN1_VALUE) *skval; - skval = sk_ASN1_VALUE_new_null(); - if (!skval) { - OPENSSL_PUT_ERROR(ASN1, ERR_R_MALLOC_FAILURE); - ret = 0; - goto done; - } - *pval = (ASN1_VALUE *)skval; - ret = 1; - goto done; - } - // Otherwise pass it back to the item routine - ret = asn1_item_ex_combine_new(pval, it, tt->flags & ASN1_TFLG_COMBINE); -done: - return ret; -} - -static void asn1_template_clear(ASN1_VALUE **pval, const ASN1_TEMPLATE *tt) { - // If ADB or STACK just NULL the field - if (tt->flags & (ASN1_TFLG_ADB_MASK | ASN1_TFLG_SK_MASK)) { - *pval = NULL; - } else { - asn1_item_clear(pval, ASN1_ITEM_ptr(tt->item)); - } -} - -// NB: could probably combine most of the real XXX_new() behaviour and junk -// all the old functions. - -static int ASN1_primitive_new(ASN1_VALUE **pval, const ASN1_ITEM *it) { - ASN1_TYPE *typ; - int utype; - - if (!it) { - return 0; - } - - // Historically, |it->funcs| for primitive types contained an - // |ASN1_PRIMITIVE_FUNCS| table of calbacks. 
- assert(it->funcs == NULL); - - if (it->itype == ASN1_ITYPE_MSTRING) { - utype = -1; - } else { - utype = it->utype; - } - switch (utype) { - case V_ASN1_OBJECT: - *pval = (ASN1_VALUE *)OBJ_nid2obj(NID_undef); - return 1; - - case V_ASN1_BOOLEAN: - *(ASN1_BOOLEAN *)pval = it->size; - return 1; - - case V_ASN1_NULL: - *pval = (ASN1_VALUE *)1; - return 1; - - case V_ASN1_ANY: - typ = OPENSSL_malloc(sizeof(ASN1_TYPE)); - if (!typ) { - return 0; - } - typ->value.ptr = NULL; - typ->type = -1; - *pval = (ASN1_VALUE *)typ; - break; - - default: - *pval = (ASN1_VALUE *)ASN1_STRING_type_new(utype); - break; - } - if (*pval) { - return 1; - } - return 0; -} - -static void asn1_primitive_clear(ASN1_VALUE **pval, const ASN1_ITEM *it) { - int utype; - // Historically, |it->funcs| for primitive types contained an - // |ASN1_PRIMITIVE_FUNCS| table of calbacks. - assert(it == NULL || it->funcs == NULL); - if (!it || (it->itype == ASN1_ITYPE_MSTRING)) { - utype = -1; - } else { - utype = it->utype; - } - if (utype == V_ASN1_BOOLEAN) { - *(ASN1_BOOLEAN *)pval = it->size; - } else { - *pval = NULL; - } -} diff --git a/third_party/boringssl/src/crypto/asn1/tasn_new.cc b/third_party/boringssl/src/crypto/asn1/tasn_new.cc new file mode 100644 index 00000000..f73484bb --- /dev/null +++ b/third_party/boringssl/src/crypto/asn1/tasn_new.cc @@ -0,0 +1,278 @@ +// Copyright 2000-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include + +#include +#include +#include +#include + +#include "../internal.h" +#include "../mem_internal.h" +#include "internal.h" + + +using namespace bssl; + +static void asn1_item_clear(ASN1_VALUE **pval, const ASN1_ITEM *it); +static int ASN1_template_new(ASN1_VALUE **pval, const ASN1_TEMPLATE *tt); +static void asn1_template_clear(ASN1_VALUE **pval, const ASN1_TEMPLATE *tt); +static int ASN1_primitive_new(ASN1_VALUE **pval, const ASN1_ITEM *it); +static void asn1_primitive_clear(ASN1_VALUE **pval, const ASN1_ITEM *it); + +ASN1_VALUE *ASN1_item_new(const ASN1_ITEM *it) { + ASN1_VALUE *ret = nullptr; + if (ASN1_item_ex_new(&ret, it) > 0) { + return ret; + } + return nullptr; +} + +// Allocate an ASN1 structure + +int bssl::ASN1_item_ex_new(ASN1_VALUE **pval, const ASN1_ITEM *it) { + const ASN1_TEMPLATE *tt = nullptr; + const ASN1_EXTERN_FUNCS *ef; + ASN1_VALUE **pseqval; + int i; + + switch (it->itype) { + case ASN1_ITYPE_EXTERN: + ef = reinterpret_cast(it->funcs); + if (ef && ef->asn1_ex_new) { + if (!ef->asn1_ex_new(pval, it)) { + goto memerr; + } + } + break; + + case ASN1_ITYPE_PRIMITIVE: + if (it->templates) { + if (!ASN1_template_new(pval, it->templates)) { + goto memerr; + } + } else if (!ASN1_primitive_new(pval, it)) { + goto memerr; + } + break; + + case ASN1_ITYPE_MSTRING: + if (!ASN1_primitive_new(pval, it)) { + goto memerr; + } + break; + + case ASN1_ITYPE_CHOICE: { + const ASN1_AUX *aux = reinterpret_cast(it->funcs); + ASN1_aux_cb *asn1_cb = aux != nullptr ? 
aux->asn1_cb : nullptr; + if (asn1_cb) { + i = asn1_cb(ASN1_OP_NEW_PRE, pval, it, nullptr); + if (!i) { + goto auxerr; + } + if (i == 2) { + return 1; + } + } + *pval = reinterpret_cast(OPENSSL_zalloc(it->size)); + if (!*pval) { + goto memerr; + } + asn1_set_choice_selector(pval, -1, it); + if (asn1_cb && !asn1_cb(ASN1_OP_NEW_POST, pval, it, nullptr)) { + goto auxerr2; + } + break; + } + + case ASN1_ITYPE_SEQUENCE: { + const ASN1_AUX *aux = reinterpret_cast(it->funcs); + ASN1_aux_cb *asn1_cb = aux != nullptr ? aux->asn1_cb : nullptr; + if (asn1_cb) { + i = asn1_cb(ASN1_OP_NEW_PRE, pval, it, nullptr); + if (!i) { + goto auxerr; + } + if (i == 2) { + return 1; + } + } + *pval = reinterpret_cast(OPENSSL_zalloc(it->size)); + if (!*pval) { + goto memerr; + } + asn1_refcount_set_one(pval, it); + asn1_enc_init(pval, it); + for (i = 0, tt = it->templates; i < it->tcount; tt++, i++) { + pseqval = asn1_get_field_ptr(pval, tt); + if (!ASN1_template_new(pseqval, tt)) { + goto memerr2; + } + } + if (asn1_cb && !asn1_cb(ASN1_OP_NEW_POST, pval, it, nullptr)) { + goto auxerr2; + } + break; + } + } + return 1; + +memerr2: + ASN1_item_ex_free(pval, it); +memerr: + return 0; + +auxerr2: + ASN1_item_ex_free(pval, it); +auxerr: + OPENSSL_PUT_ERROR(ASN1, ASN1_R_AUX_ERROR); + return 0; +} + +static void asn1_item_clear(ASN1_VALUE **pval, const ASN1_ITEM *it) { + switch (it->itype) { + case ASN1_ITYPE_EXTERN: + *pval = nullptr; + break; + + case ASN1_ITYPE_PRIMITIVE: + if (it->templates) { + asn1_template_clear(pval, it->templates); + } else { + asn1_primitive_clear(pval, it); + } + break; + + case ASN1_ITYPE_MSTRING: + asn1_primitive_clear(pval, it); + break; + + case ASN1_ITYPE_CHOICE: + case ASN1_ITYPE_SEQUENCE: + *pval = nullptr; + break; + } +} + +static int ASN1_template_new(ASN1_VALUE **pval, const ASN1_TEMPLATE *tt) { + const ASN1_ITEM *it = ASN1_ITEM_ptr(tt->item); + int ret; + if (tt->flags & ASN1_TFLG_OPTIONAL) { + asn1_template_clear(pval, tt); + return 1; + } + // If ANY 
DEFINED BY nothing to do + + if (tt->flags & ASN1_TFLG_ADB_MASK) { + *pval = nullptr; + return 1; + } + // If SET OF or SEQUENCE OF, its a STACK + if (tt->flags & ASN1_TFLG_SK_MASK) { + STACK_OF(ASN1_VALUE) *skval; + skval = sk_ASN1_VALUE_new_null(); + if (!skval) { + ret = 0; + goto done; + } + *pval = (ASN1_VALUE *)skval; + ret = 1; + goto done; + } + // Otherwise pass it back to the item routine + ret = ASN1_item_ex_new(pval, it); +done: + return ret; +} + +static void asn1_template_clear(ASN1_VALUE **pval, const ASN1_TEMPLATE *tt) { + // If ADB or STACK just NULL the field + if (tt->flags & (ASN1_TFLG_ADB_MASK | ASN1_TFLG_SK_MASK)) { + *pval = nullptr; + } else { + asn1_item_clear(pval, ASN1_ITEM_ptr(tt->item)); + } +} + +// NB: could probably combine most of the real XXX_new() behaviour and junk +// all the old functions. + +static int ASN1_primitive_new(ASN1_VALUE **pval, const ASN1_ITEM *it) { + if (!it) { + return 0; + } + + // Historically, |it->funcs| for primitive types contained an + // |ASN1_PRIMITIVE_FUNCS| table of callbacks. + assert(it->funcs == nullptr); + + int utype; + if (it->itype == ASN1_ITYPE_MSTRING) { + utype = -1; + } else { + utype = it->utype; + } + switch (utype) { + case V_ASN1_OBJECT: + *pval = (ASN1_VALUE *)OBJ_get_undef(); + return 1; + + case V_ASN1_BOOLEAN: + *(ASN1_BOOLEAN *)pval = (ASN1_BOOLEAN)it->size; + return 1; + + case V_ASN1_NULL: + *pval = (ASN1_VALUE *)1; + return 1; + + case V_ASN1_ANY: { + ASN1_TYPE *typ = New(); + if (!typ) { + return 0; + } + typ->value.ptr = nullptr; + typ->type = -1; + *pval = (ASN1_VALUE *)typ; + break; + } + + default: + *pval = (ASN1_VALUE *)ASN1_STRING_type_new(utype); + break; + } + if (*pval) { + return 1; + } + return 0; +} + +static void asn1_primitive_clear(ASN1_VALUE **pval, const ASN1_ITEM *it) { + int utype; + // Historically, |it->funcs| for primitive types contained an + // |ASN1_PRIMITIVE_FUNCS| table of callbacks. 
+ assert(it == nullptr || it->funcs == nullptr); + if (!it || (it->itype == ASN1_ITYPE_MSTRING)) { + utype = -1; + } else { + utype = it->utype; + } + if (utype == V_ASN1_BOOLEAN) { + *(ASN1_BOOLEAN *)pval = (ASN1_BOOLEAN)it->size; + } else { + *pval = nullptr; + } +} diff --git a/third_party/boringssl/src/crypto/asn1/tasn_typ.c b/third_party/boringssl/src/crypto/asn1/tasn_typ.c deleted file mode 100644 index abfac934..00000000 --- a/third_party/boringssl/src/crypto/asn1/tasn_typ.c +++ /dev/null @@ -1,126 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
*/ - -#include - -#include - -// Declarations for string types - -#define IMPLEMENT_ASN1_STRING_FUNCTIONS(sname) \ - IMPLEMENT_ASN1_TYPE(sname) \ - IMPLEMENT_ASN1_ENCODE_FUNCTIONS_const_fname(sname, sname, sname) \ - sname *sname##_new(void) { return ASN1_STRING_type_new(V_##sname); } \ - void sname##_free(sname *x) { ASN1_STRING_free(x); } - -IMPLEMENT_ASN1_STRING_FUNCTIONS(ASN1_OCTET_STRING) -IMPLEMENT_ASN1_STRING_FUNCTIONS(ASN1_INTEGER) -IMPLEMENT_ASN1_STRING_FUNCTIONS(ASN1_ENUMERATED) -IMPLEMENT_ASN1_STRING_FUNCTIONS(ASN1_BIT_STRING) -IMPLEMENT_ASN1_STRING_FUNCTIONS(ASN1_UTF8STRING) -IMPLEMENT_ASN1_STRING_FUNCTIONS(ASN1_PRINTABLESTRING) -IMPLEMENT_ASN1_STRING_FUNCTIONS(ASN1_T61STRING) -IMPLEMENT_ASN1_STRING_FUNCTIONS(ASN1_IA5STRING) -IMPLEMENT_ASN1_STRING_FUNCTIONS(ASN1_GENERALSTRING) -IMPLEMENT_ASN1_STRING_FUNCTIONS(ASN1_UTCTIME) -IMPLEMENT_ASN1_STRING_FUNCTIONS(ASN1_GENERALIZEDTIME) -IMPLEMENT_ASN1_STRING_FUNCTIONS(ASN1_VISIBLESTRING) -IMPLEMENT_ASN1_STRING_FUNCTIONS(ASN1_UNIVERSALSTRING) -IMPLEMENT_ASN1_STRING_FUNCTIONS(ASN1_BMPSTRING) - -IMPLEMENT_ASN1_TYPE(ASN1_NULL) -IMPLEMENT_ASN1_FUNCTIONS_const(ASN1_NULL) - -IMPLEMENT_ASN1_TYPE(ASN1_OBJECT) - -IMPLEMENT_ASN1_TYPE(ASN1_ANY) - -// Just swallow an ASN1_SEQUENCE in an ASN1_STRING -IMPLEMENT_ASN1_TYPE(ASN1_SEQUENCE) - -IMPLEMENT_ASN1_FUNCTIONS_const_fname(ASN1_TYPE, ASN1_ANY, ASN1_TYPE) - -// Multistring types - -IMPLEMENT_ASN1_MSTRING(ASN1_PRINTABLE, B_ASN1_PRINTABLE) -IMPLEMENT_ASN1_FUNCTIONS_const_fname(ASN1_STRING, ASN1_PRINTABLE, - ASN1_PRINTABLE) - -IMPLEMENT_ASN1_MSTRING(DISPLAYTEXT, B_ASN1_DISPLAYTEXT) -IMPLEMENT_ASN1_FUNCTIONS_const_fname(ASN1_STRING, DISPLAYTEXT, DISPLAYTEXT) - -IMPLEMENT_ASN1_MSTRING(DIRECTORYSTRING, B_ASN1_DIRECTORYSTRING) -IMPLEMENT_ASN1_FUNCTIONS_const_fname(ASN1_STRING, DIRECTORYSTRING, - DIRECTORYSTRING) - -// Three separate BOOLEAN type: normal, DEFAULT TRUE and DEFAULT FALSE -IMPLEMENT_ASN1_TYPE_ex(ASN1_BOOLEAN, ASN1_BOOLEAN, -1) -IMPLEMENT_ASN1_TYPE_ex(ASN1_TBOOLEAN, 
ASN1_BOOLEAN, 1) -IMPLEMENT_ASN1_TYPE_ex(ASN1_FBOOLEAN, ASN1_BOOLEAN, 0) - -ASN1_ITEM_TEMPLATE(ASN1_SEQUENCE_ANY) = - ASN1_EX_TEMPLATE_TYPE(ASN1_TFLG_SEQUENCE_OF, 0, ASN1_SEQUENCE_ANY, ASN1_ANY) -ASN1_ITEM_TEMPLATE_END(ASN1_SEQUENCE_ANY) - -ASN1_ITEM_TEMPLATE(ASN1_SET_ANY) = ASN1_EX_TEMPLATE_TYPE(ASN1_TFLG_SET_OF, 0, - ASN1_SET_ANY, ASN1_ANY) -ASN1_ITEM_TEMPLATE_END(ASN1_SET_ANY) - -IMPLEMENT_ASN1_ENCODE_FUNCTIONS_const_fname(ASN1_SEQUENCE_ANY, - ASN1_SEQUENCE_ANY, - ASN1_SEQUENCE_ANY) -IMPLEMENT_ASN1_ENCODE_FUNCTIONS_const_fname(ASN1_SEQUENCE_ANY, ASN1_SET_ANY, - ASN1_SET_ANY) diff --git a/third_party/boringssl/src/crypto/asn1/tasn_typ.cc b/third_party/boringssl/src/crypto/asn1/tasn_typ.cc new file mode 100644 index 00000000..0c3f6e2f --- /dev/null +++ b/third_party/boringssl/src/crypto/asn1/tasn_typ.cc @@ -0,0 +1,94 @@ +// Copyright 2000-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include "internal.h" + + +using namespace bssl; + +// TODO(crbug.com/42290417): While we need |ASN1_ITEM|s, the exposed new, free, +// i2d, and d2i functions should call the underlying implementations directly. 
+ +#define IMPLEMENT_ASN1_STRING_FUNCTIONS(sname) \ + IMPLEMENT_ASN1_TYPE(sname) \ + IMPLEMENT_ASN1_ENCODE_FUNCTIONS_const_fname(sname, sname, sname) \ + sname *sname##_new() { return ASN1_STRING_type_new(V_##sname); } \ + void sname##_free(sname *x) { ASN1_STRING_free(x); } + +IMPLEMENT_ASN1_STRING_FUNCTIONS(ASN1_OCTET_STRING) +IMPLEMENT_ASN1_STRING_FUNCTIONS(ASN1_INTEGER) +IMPLEMENT_ASN1_STRING_FUNCTIONS(ASN1_ENUMERATED) +IMPLEMENT_ASN1_STRING_FUNCTIONS(ASN1_BIT_STRING) +IMPLEMENT_ASN1_STRING_FUNCTIONS(ASN1_UTF8STRING) +IMPLEMENT_ASN1_STRING_FUNCTIONS(ASN1_PRINTABLESTRING) +IMPLEMENT_ASN1_STRING_FUNCTIONS(ASN1_T61STRING) +IMPLEMENT_ASN1_STRING_FUNCTIONS(ASN1_IA5STRING) +IMPLEMENT_ASN1_STRING_FUNCTIONS(ASN1_GENERALSTRING) +IMPLEMENT_ASN1_STRING_FUNCTIONS(ASN1_UTCTIME) +IMPLEMENT_ASN1_STRING_FUNCTIONS(ASN1_GENERALIZEDTIME) +IMPLEMENT_ASN1_STRING_FUNCTIONS(ASN1_VISIBLESTRING) +IMPLEMENT_ASN1_STRING_FUNCTIONS(ASN1_UNIVERSALSTRING) +IMPLEMENT_ASN1_STRING_FUNCTIONS(ASN1_BMPSTRING) + +IMPLEMENT_ASN1_TYPE(ASN1_NULL) +IMPLEMENT_ASN1_FUNCTIONS_const(ASN1_NULL) + +IMPLEMENT_ASN1_TYPE(ASN1_OBJECT) + +IMPLEMENT_ASN1_TYPE(ASN1_ANY) + +// Just swallow an ASN1_SEQUENCE in an ASN1_STRING +IMPLEMENT_ASN1_TYPE(ASN1_SEQUENCE) + +IMPLEMENT_ASN1_FUNCTIONS_const_fname(ASN1_TYPE, ASN1_ANY, ASN1_TYPE) + +BSSL_NAMESPACE_BEGIN + +IMPLEMENT_ASN1_MSTRING(DISPLAYTEXT, B_ASN1_DISPLAYTEXT) + +BSSL_NAMESPACE_END + +IMPLEMENT_ASN1_FUNCTIONS_const_fname(ASN1_STRING, DISPLAYTEXT, DISPLAYTEXT) + +BSSL_NAMESPACE_BEGIN + +IMPLEMENT_ASN1_MSTRING(DIRECTORYSTRING, B_ASN1_DIRECTORYSTRING) + +BSSL_NAMESPACE_END + +IMPLEMENT_ASN1_FUNCTIONS_const_fname(ASN1_STRING, DIRECTORYSTRING, + DIRECTORYSTRING) + +// Three separate BOOLEAN type: normal, DEFAULT TRUE and DEFAULT FALSE +IMPLEMENT_ASN1_TYPE_ex(ASN1_BOOLEAN, ASN1_BOOLEAN, ASN1_BOOLEAN_NONE) +IMPLEMENT_ASN1_TYPE_ex(ASN1_TBOOLEAN, ASN1_BOOLEAN, ASN1_BOOLEAN_TRUE) +IMPLEMENT_ASN1_TYPE_ex(ASN1_FBOOLEAN, ASN1_BOOLEAN, ASN1_BOOLEAN_FALSE) + 
+ASN1_ITEM_TEMPLATE(ASN1_SEQUENCE_ANY) = + ASN1_EX_TEMPLATE_TYPE(ASN1_TFLG_SEQUENCE_OF, 0, ASN1_SEQUENCE_ANY, ASN1_ANY) +ASN1_ITEM_TEMPLATE_END(ASN1_SEQUENCE_ANY) + +ASN1_ITEM_TEMPLATE(ASN1_SET_ANY) = ASN1_EX_TEMPLATE_TYPE(ASN1_TFLG_SET_OF, 0, + ASN1_SET_ANY, ASN1_ANY) +ASN1_ITEM_TEMPLATE_END(ASN1_SET_ANY) + +IMPLEMENT_ASN1_ENCODE_FUNCTIONS_const_fname(ASN1_SEQUENCE_ANY, + ASN1_SEQUENCE_ANY, + ASN1_SEQUENCE_ANY) +IMPLEMENT_ASN1_ENCODE_FUNCTIONS_const_fname(ASN1_SEQUENCE_ANY, ASN1_SET_ANY, + ASN1_SET_ANY) diff --git a/third_party/boringssl/src/crypto/asn1/tasn_utl.c b/third_party/boringssl/src/crypto/asn1/tasn_utl.c deleted file mode 100644 index 4a975743..00000000 --- a/third_party/boringssl/src/crypto/asn1/tasn_utl.c +++ /dev/null @@ -1,277 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. 
Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
*/ - -#include - -#include -#include - -#include -#include -#include -#include -#include - -#include "../internal.h" -#include "internal.h" - - -// Utility functions for manipulating fields and offsets - -// Add 'offset' to 'addr' -#define offset2ptr(addr, offset) (void *)(((char *)(addr)) + (offset)) - -// Given an ASN1_ITEM CHOICE type return the selector value -int asn1_get_choice_selector(ASN1_VALUE **pval, const ASN1_ITEM *it) { - int *sel = offset2ptr(*pval, it->utype); - return *sel; -} - -// Given an ASN1_ITEM CHOICE type set the selector value, return old value. -int asn1_set_choice_selector(ASN1_VALUE **pval, int value, - const ASN1_ITEM *it) { - int *sel, ret; - sel = offset2ptr(*pval, it->utype); - ret = *sel; - *sel = value; - return ret; -} - -static CRYPTO_refcount_t *asn1_get_references(ASN1_VALUE **pval, - const ASN1_ITEM *it) { - if (it->itype != ASN1_ITYPE_SEQUENCE) { - return NULL; - } - const ASN1_AUX *aux = it->funcs; - if (!aux || !(aux->flags & ASN1_AFLG_REFCOUNT)) { - return NULL; - } - return offset2ptr(*pval, aux->ref_offset); -} - -void asn1_refcount_set_one(ASN1_VALUE **pval, const ASN1_ITEM *it) { - CRYPTO_refcount_t *references = asn1_get_references(pval, it); - if (references != NULL) { - *references = 1; - } -} - -int asn1_refcount_dec_and_test_zero(ASN1_VALUE **pval, const ASN1_ITEM *it) { - CRYPTO_refcount_t *references = asn1_get_references(pval, it); - if (references != NULL) { - return CRYPTO_refcount_dec_and_test_zero(references); - } - return 1; -} - -static ASN1_ENCODING *asn1_get_enc_ptr(ASN1_VALUE **pval, const ASN1_ITEM *it) { - assert(it->itype == ASN1_ITYPE_SEQUENCE); - const ASN1_AUX *aux; - if (!pval || !*pval) { - return NULL; - } - aux = it->funcs; - if (!aux || !(aux->flags & ASN1_AFLG_ENCODING)) { - return NULL; - } - return offset2ptr(*pval, aux->enc_offset); -} - -void asn1_enc_init(ASN1_VALUE **pval, const ASN1_ITEM *it) { - ASN1_ENCODING *enc; - enc = asn1_get_enc_ptr(pval, it); - if (enc) { - enc->enc = NULL; 
- enc->len = 0; - enc->alias_only = 0; - enc->alias_only_on_next_parse = 0; - enc->modified = 1; - } -} - -void asn1_enc_free(ASN1_VALUE **pval, const ASN1_ITEM *it) { - ASN1_ENCODING *enc; - enc = asn1_get_enc_ptr(pval, it); - if (enc) { - if (!enc->alias_only) { - OPENSSL_free(enc->enc); - } - enc->enc = NULL; - enc->len = 0; - enc->alias_only = 0; - enc->alias_only_on_next_parse = 0; - enc->modified = 1; - } -} - -int asn1_enc_save(ASN1_VALUE **pval, const unsigned char *in, int inlen, - const ASN1_ITEM *it) { - ASN1_ENCODING *enc; - enc = asn1_get_enc_ptr(pval, it); - if (!enc) { - return 1; - } - - if (!enc->alias_only) { - OPENSSL_free(enc->enc); - } - - enc->alias_only = enc->alias_only_on_next_parse; - enc->alias_only_on_next_parse = 0; - - if (enc->alias_only) { - enc->enc = (uint8_t *)in; - } else { - enc->enc = OPENSSL_malloc(inlen); - if (!enc->enc) { - return 0; - } - OPENSSL_memcpy(enc->enc, in, inlen); - } - - enc->len = inlen; - enc->modified = 0; - - return 1; -} - -int asn1_enc_restore(int *len, unsigned char **out, ASN1_VALUE **pval, - const ASN1_ITEM *it) { - ASN1_ENCODING *enc; - enc = asn1_get_enc_ptr(pval, it); - if (!enc || enc->modified) { - return 0; - } - if (out) { - OPENSSL_memcpy(*out, enc->enc, enc->len); - *out += enc->len; - } - if (len) { - *len = enc->len; - } - return 1; -} - -// Given an ASN1_TEMPLATE get a pointer to a field -ASN1_VALUE **asn1_get_field_ptr(ASN1_VALUE **pval, const ASN1_TEMPLATE *tt) { - ASN1_VALUE **pvaltmp; - if (tt->flags & ASN1_TFLG_COMBINE) { - return pval; - } - pvaltmp = offset2ptr(*pval, tt->offset); - // NOTE for BOOLEAN types the field is just a plain int so we can't return - // int **, so settle for (int *). - return pvaltmp; -} - -// Handle ANY DEFINED BY template, find the selector, look up the relevant -// ASN1_TEMPLATE in the table and return it. 
-const ASN1_TEMPLATE *asn1_do_adb(ASN1_VALUE **pval, const ASN1_TEMPLATE *tt, - int nullerr) { - const ASN1_ADB *adb; - const ASN1_ADB_TABLE *atbl; - ASN1_VALUE **sfld; - int i; - if (!(tt->flags & ASN1_TFLG_ADB_MASK)) { - return tt; - } - - // Else ANY DEFINED BY ... get the table - adb = ASN1_ADB_ptr(tt->item); - - // Get the selector field - sfld = offset2ptr(*pval, adb->offset); - - // Check if NULL - if (*sfld == NULL) { - if (!adb->null_tt) { - goto err; - } - return adb->null_tt; - } - - // Convert type to a NID: - // NB: don't check for NID_undef here because it - // might be a legitimate value in the table - assert(tt->flags & ASN1_TFLG_ADB_OID); - int selector = OBJ_obj2nid((ASN1_OBJECT *)*sfld); - - // Try to find matching entry in table Maybe should check application types - // first to allow application override? Might also be useful to have a flag - // which indicates table is sorted and we can do a binary search. For now - // stick to a linear search. - - for (atbl = adb->tbl, i = 0; i < adb->tblcount; i++, atbl++) { - if (atbl->value == selector) { - return &atbl->tt; - } - } - - // FIXME: need to search application table too - - // No match, return default type - if (!adb->default_tt) { - goto err; - } - return adb->default_tt; - -err: - // FIXME: should log the value or OID of unsupported type - if (nullerr) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_UNSUPPORTED_ANY_DEFINED_BY_TYPE); - } - return NULL; -} diff --git a/third_party/boringssl/src/crypto/asn1/tasn_utl.cc b/third_party/boringssl/src/crypto/asn1/tasn_utl.cc new file mode 100644 index 00000000..7af99419 --- /dev/null +++ b/third_party/boringssl/src/crypto/asn1/tasn_utl.cc @@ -0,0 +1,218 @@ +// Copyright 2000-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include "../internal.h" +#include "internal.h" + + +using namespace bssl; + +// Utility functions for manipulating fields and offsets + +// Add 'offset' to 'addr' +#define offset2ptr(addr, offset) (void *)(((char *)(addr)) + (offset)) + +// Given an ASN1_ITEM CHOICE type return the selector value +int bssl::asn1_get_choice_selector(ASN1_VALUE **pval, const ASN1_ITEM *it) { + int *sel = reinterpret_cast(offset2ptr(*pval, it->utype)); + return *sel; +} + +// Given an ASN1_ITEM CHOICE type set the selector value, return old value. 
+int bssl::asn1_set_choice_selector(ASN1_VALUE **pval, int value, + const ASN1_ITEM *it) { + int *sel, ret; + sel = reinterpret_cast(offset2ptr(*pval, it->utype)); + ret = *sel; + *sel = value; + return ret; +} + +static CRYPTO_refcount_t *asn1_get_references(ASN1_VALUE **pval, + const ASN1_ITEM *it) { + if (it->itype != ASN1_ITYPE_SEQUENCE) { + return nullptr; + } + const ASN1_AUX *aux = reinterpret_cast(it->funcs); + if (!aux || !(aux->flags & ASN1_AFLG_REFCOUNT)) { + return nullptr; + } + return reinterpret_cast( + offset2ptr(*pval, aux->ref_offset)); +} + +void bssl::asn1_refcount_set_one(ASN1_VALUE **pval, const ASN1_ITEM *it) { + CRYPTO_refcount_t *references = asn1_get_references(pval, it); + if (references != nullptr) { + *references = 1; + } +} + +int bssl::asn1_refcount_dec_and_test_zero(ASN1_VALUE **pval, + const ASN1_ITEM *it) { + CRYPTO_refcount_t *references = asn1_get_references(pval, it); + if (references != nullptr) { + return CRYPTO_refcount_dec_and_test_zero(references); + } + return 1; +} + +static ASN1_ENCODING *asn1_get_enc_ptr(ASN1_VALUE **pval, const ASN1_ITEM *it) { + assert(it->itype == ASN1_ITYPE_SEQUENCE); + const ASN1_AUX *aux; + if (!pval || !*pval) { + return nullptr; + } + aux = reinterpret_cast(it->funcs); + if (!aux || !(aux->flags & ASN1_AFLG_ENCODING)) { + return nullptr; + } + return reinterpret_cast(offset2ptr(*pval, aux->enc_offset)); +} + +void bssl::asn1_enc_init(ASN1_VALUE **pval, const ASN1_ITEM *it) { + ASN1_ENCODING *enc = asn1_get_enc_ptr(pval, it); + if (enc) { + enc->enc = nullptr; + enc->len = 0; + } +} + +void bssl::asn1_enc_free(ASN1_VALUE **pval, const ASN1_ITEM *it) { + ASN1_ENCODING *enc = asn1_get_enc_ptr(pval, it); + if (enc) { + asn1_encoding_clear(enc); + } +} + +int bssl::asn1_enc_save(ASN1_VALUE **pval, const uint8_t *in, size_t in_len, + const ASN1_ITEM *it) { + ASN1_ENCODING *enc; + enc = asn1_get_enc_ptr(pval, it); + if (!enc) { + return 1; + } + + asn1_encoding_clear(enc); + enc->enc = 
reinterpret_cast(OPENSSL_memdup(in, in_len)); + if (!enc->enc) { + return 0; + } + + enc->len = in_len; + return 1; +} + +void bssl::asn1_encoding_clear(ASN1_ENCODING *enc) { + OPENSSL_free(enc->enc); + enc->enc = nullptr; + enc->len = 0; +} + +int bssl::asn1_enc_restore(int *len, unsigned char **out, ASN1_VALUE **pval, + const ASN1_ITEM *it) { + ASN1_ENCODING *enc = asn1_get_enc_ptr(pval, it); + if (!enc || enc->len == 0) { + return 0; + } + if (out) { + OPENSSL_memcpy(*out, enc->enc, enc->len); + *out += enc->len; + } + if (len) { + *len = enc->len; + } + return 1; +} + +// Given an ASN1_TEMPLATE get a pointer to a field +ASN1_VALUE **bssl::asn1_get_field_ptr(ASN1_VALUE **pval, + const ASN1_TEMPLATE *tt) { + ASN1_VALUE **pvaltmp = + reinterpret_cast(offset2ptr(*pval, tt->offset)); + // NOTE for BOOLEAN types the field is just a plain int so we can't return + // int **, so settle for (int *). + return pvaltmp; +} + +// Handle ANY DEFINED BY template, find the selector, look up the relevant +// ASN1_TEMPLATE in the table and return it. +const ASN1_TEMPLATE *bssl::asn1_do_adb(ASN1_VALUE **pval, + const ASN1_TEMPLATE *tt, int nullerr) { + const ASN1_ADB *adb; + const ASN1_ADB_TABLE *atbl; + ASN1_VALUE **sfld; + int i; + if (!(tt->flags & ASN1_TFLG_ADB_MASK)) { + return tt; + } + + // Else ANY DEFINED BY ... get the table + adb = ASN1_ADB_ptr(tt->item); + + // Get the selector field + sfld = reinterpret_cast(offset2ptr(*pval, adb->offset)); + + // Check if NULL + int selector; + if (*sfld == nullptr) { + if (!adb->null_tt) { + goto err; + } + return adb->null_tt; + } + + // Convert type to a NID: + // NB: don't check for NID_undef here because it + // might be a legitimate value in the table + assert(tt->flags & ASN1_TFLG_ADB_OID); + selector = OBJ_obj2nid((ASN1_OBJECT *)*sfld); + + // Try to find matching entry in table Maybe should check application types + // first to allow application override? 
Might also be useful to have a flag + // which indicates table is sorted and we can do a binary search. For now + // stick to a linear search. + + for (atbl = adb->tbl, i = 0; i < adb->tblcount; i++, atbl++) { + if (atbl->value == selector) { + return &atbl->tt; + } + } + + // FIXME: need to search application table too + + // No match, return default type + if (!adb->default_tt) { + goto err; + } + return adb->default_tt; + +err: + // FIXME: should log the value or OID of unsupported type + if (nullerr) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_UNSUPPORTED_ANY_DEFINED_BY_TYPE); + } + return nullptr; +} diff --git a/third_party/boringssl/src/crypto/base64/base64.c b/third_party/boringssl/src/crypto/base64/base64.c deleted file mode 100644 index d2b1e584..00000000 --- a/third_party/boringssl/src/crypto/base64/base64.c +++ /dev/null @@ -1,480 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. 
this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include -#include -#include - -#include "../internal.h" - - -// constant_time_lt_args_8 behaves like |constant_time_lt_8| but takes |uint8_t| -// arguments for a slightly simpler implementation. -static inline uint8_t constant_time_lt_args_8(uint8_t a, uint8_t b) { - crypto_word_t aw = a; - crypto_word_t bw = b; - // |crypto_word_t| is larger than |uint8_t|, so |aw| and |bw| have the same - // MSB. |aw| < |bw| iff MSB(|aw| - |bw|) is 1. - return constant_time_msb_w(aw - bw); -} - -// constant_time_in_range_8 returns |CONSTTIME_TRUE_8| if |min| <= |a| <= |max| -// and |CONSTTIME_FALSE_8| otherwise. -static inline uint8_t constant_time_in_range_8(uint8_t a, uint8_t min, - uint8_t max) { - a -= min; - return constant_time_lt_args_8(a, max - min + 1); -} - -// Encoding. - -static uint8_t conv_bin2ascii(uint8_t a) { - // Since PEM is sometimes used to carry private keys, we encode base64 data - // itself in constant-time. 
- a &= 0x3f; - uint8_t ret = constant_time_select_8(constant_time_eq_8(a, 62), '+', '/'); - ret = - constant_time_select_8(constant_time_lt_args_8(a, 62), a - 52 + '0', ret); - ret = - constant_time_select_8(constant_time_lt_args_8(a, 52), a - 26 + 'a', ret); - ret = constant_time_select_8(constant_time_lt_args_8(a, 26), a + 'A', ret); - return ret; -} - -static_assert(sizeof(((EVP_ENCODE_CTX *)(NULL))->data) % 3 == 0, - "data length must be a multiple of base64 chunk size"); - -int EVP_EncodedLength(size_t *out_len, size_t len) { - if (len + 2 < len) { - return 0; - } - len += 2; - len /= 3; - - if (((len << 2) >> 2) != len) { - return 0; - } - len <<= 2; - - if (len + 1 < len) { - return 0; - } - len++; - - *out_len = len; - return 1; -} - -EVP_ENCODE_CTX *EVP_ENCODE_CTX_new(void) { - EVP_ENCODE_CTX *ret = OPENSSL_malloc(sizeof(EVP_ENCODE_CTX)); - if (ret == NULL) { - return NULL; - } - OPENSSL_memset(ret, 0, sizeof(EVP_ENCODE_CTX)); - return ret; -} - -void EVP_ENCODE_CTX_free(EVP_ENCODE_CTX *ctx) { - OPENSSL_free(ctx); -} - -void EVP_EncodeInit(EVP_ENCODE_CTX *ctx) { - OPENSSL_memset(ctx, 0, sizeof(EVP_ENCODE_CTX)); -} - -void EVP_EncodeUpdate(EVP_ENCODE_CTX *ctx, uint8_t *out, int *out_len, - const uint8_t *in, size_t in_len) { - size_t total = 0; - - *out_len = 0; - if (in_len == 0) { - return; - } - - assert(ctx->data_used < sizeof(ctx->data)); - - if (sizeof(ctx->data) - ctx->data_used > in_len) { - OPENSSL_memcpy(&ctx->data[ctx->data_used], in, in_len); - ctx->data_used += (unsigned)in_len; - return; - } - - if (ctx->data_used != 0) { - const size_t todo = sizeof(ctx->data) - ctx->data_used; - OPENSSL_memcpy(&ctx->data[ctx->data_used], in, todo); - in += todo; - in_len -= todo; - - size_t encoded = EVP_EncodeBlock(out, ctx->data, sizeof(ctx->data)); - ctx->data_used = 0; - - out += encoded; - *(out++) = '\n'; - *out = '\0'; - - total = encoded + 1; - } - - while (in_len >= sizeof(ctx->data)) { - size_t encoded = EVP_EncodeBlock(out, in, sizeof(ctx->data)); 
- in += sizeof(ctx->data); - in_len -= sizeof(ctx->data); - - out += encoded; - *(out++) = '\n'; - *out = '\0'; - - if (total + encoded + 1 < total) { - *out_len = 0; - return; - } - - total += encoded + 1; - } - - if (in_len != 0) { - OPENSSL_memcpy(ctx->data, in, in_len); - } - - ctx->data_used = (unsigned)in_len; - - if (total > INT_MAX) { - // We cannot signal an error, but we can at least avoid making *out_len - // negative. - total = 0; - } - *out_len = (int)total; -} - -void EVP_EncodeFinal(EVP_ENCODE_CTX *ctx, uint8_t *out, int *out_len) { - if (ctx->data_used == 0) { - *out_len = 0; - return; - } - - size_t encoded = EVP_EncodeBlock(out, ctx->data, ctx->data_used); - out[encoded++] = '\n'; - out[encoded] = '\0'; - ctx->data_used = 0; - - // ctx->data_used is bounded by sizeof(ctx->data), so this does not - // overflow. - assert(encoded <= INT_MAX); - *out_len = (int)encoded; -} - -size_t EVP_EncodeBlock(uint8_t *dst, const uint8_t *src, size_t src_len) { - uint32_t l; - size_t remaining = src_len, ret = 0; - - while (remaining) { - if (remaining >= 3) { - l = (((uint32_t)src[0]) << 16L) | (((uint32_t)src[1]) << 8L) | src[2]; - *(dst++) = conv_bin2ascii(l >> 18L); - *(dst++) = conv_bin2ascii(l >> 12L); - *(dst++) = conv_bin2ascii(l >> 6L); - *(dst++) = conv_bin2ascii(l); - remaining -= 3; - } else { - l = ((uint32_t)src[0]) << 16L; - if (remaining == 2) { - l |= ((uint32_t)src[1] << 8L); - } - - *(dst++) = conv_bin2ascii(l >> 18L); - *(dst++) = conv_bin2ascii(l >> 12L); - *(dst++) = (remaining == 1) ? '=' : conv_bin2ascii(l >> 6L); - *(dst++) = '='; - remaining = 0; - } - ret += 4; - src += 3; - } - - *dst = '\0'; - return ret; -} - - -// Decoding. 
- -int EVP_DecodedLength(size_t *out_len, size_t len) { - if (len % 4 != 0) { - return 0; - } - - *out_len = (len / 4) * 3; - return 1; -} - -void EVP_DecodeInit(EVP_ENCODE_CTX *ctx) { - OPENSSL_memset(ctx, 0, sizeof(EVP_ENCODE_CTX)); -} - -static uint8_t base64_ascii_to_bin(uint8_t a) { - // Since PEM is sometimes used to carry private keys, we decode base64 data - // itself in constant-time. - const uint8_t is_upper = constant_time_in_range_8(a, 'A', 'Z'); - const uint8_t is_lower = constant_time_in_range_8(a, 'a', 'z'); - const uint8_t is_digit = constant_time_in_range_8(a, '0', '9'); - const uint8_t is_plus = constant_time_eq_8(a, '+'); - const uint8_t is_slash = constant_time_eq_8(a, '/'); - const uint8_t is_equals = constant_time_eq_8(a, '='); - - uint8_t ret = 0; - ret |= is_upper & (a - 'A'); // [0,26) - ret |= is_lower & (a - 'a' + 26); // [26,52) - ret |= is_digit & (a - '0' + 52); // [52,62) - ret |= is_plus & 62; - ret |= is_slash & 63; - // Invalid inputs, 'A', and '=' have all been mapped to zero. Map invalid - // inputs to 0xff. Note '=' is padding and handled separately by the caller. - const uint8_t is_valid = - is_upper | is_lower | is_digit | is_plus | is_slash | is_equals; - ret |= ~is_valid; - return ret; -} - -// base64_decode_quad decodes a single “quad” (i.e. four characters) of base64 -// data and writes up to three bytes to |out|. It sets |*out_num_bytes| to the -// number of bytes written, which will be less than three if the quad ended -// with padding. It returns one on success or zero on error. 
-static int base64_decode_quad(uint8_t *out, size_t *out_num_bytes, - const uint8_t *in) { - const uint8_t a = base64_ascii_to_bin(in[0]); - const uint8_t b = base64_ascii_to_bin(in[1]); - const uint8_t c = base64_ascii_to_bin(in[2]); - const uint8_t d = base64_ascii_to_bin(in[3]); - if (a == 0xff || b == 0xff || c == 0xff || d == 0xff) { - return 0; - } - - const uint32_t v = ((uint32_t)a) << 18 | ((uint32_t)b) << 12 | - ((uint32_t)c) << 6 | (uint32_t)d; - - const unsigned padding_pattern = (in[0] == '=') << 3 | - (in[1] == '=') << 2 | - (in[2] == '=') << 1 | - (in[3] == '='); - - switch (padding_pattern) { - case 0: - // The common case of no padding. - *out_num_bytes = 3; - out[0] = v >> 16; - out[1] = v >> 8; - out[2] = v; - break; - - case 1: // xxx= - *out_num_bytes = 2; - out[0] = v >> 16; - out[1] = v >> 8; - break; - - case 3: // xx== - *out_num_bytes = 1; - out[0] = v >> 16; - break; - - default: - return 0; - } - - return 1; -} - -int EVP_DecodeUpdate(EVP_ENCODE_CTX *ctx, uint8_t *out, int *out_len, - const uint8_t *in, size_t in_len) { - *out_len = 0; - - if (ctx->error_encountered) { - return -1; - } - - size_t bytes_out = 0, i; - for (i = 0; i < in_len; i++) { - const char c = in[i]; - switch (c) { - case ' ': - case '\t': - case '\r': - case '\n': - continue; - } - - if (ctx->eof_seen) { - ctx->error_encountered = 1; - return -1; - } - - ctx->data[ctx->data_used++] = c; - if (ctx->data_used == 4) { - size_t num_bytes_resulting; - if (!base64_decode_quad(out, &num_bytes_resulting, ctx->data)) { - ctx->error_encountered = 1; - return -1; - } - - ctx->data_used = 0; - bytes_out += num_bytes_resulting; - out += num_bytes_resulting; - - if (num_bytes_resulting < 3) { - ctx->eof_seen = 1; - } - } - } - - if (bytes_out > INT_MAX) { - ctx->error_encountered = 1; - *out_len = 0; - return -1; - } - *out_len = (int)bytes_out; - - if (ctx->eof_seen) { - return 0; - } - - return 1; -} - -int EVP_DecodeFinal(EVP_ENCODE_CTX *ctx, uint8_t *out, int *out_len) { - 
*out_len = 0; - if (ctx->error_encountered || ctx->data_used != 0) { - return -1; - } - - return 1; -} - -int EVP_DecodeBase64(uint8_t *out, size_t *out_len, size_t max_out, - const uint8_t *in, size_t in_len) { - *out_len = 0; - - if (in_len % 4 != 0) { - return 0; - } - - size_t max_len; - if (!EVP_DecodedLength(&max_len, in_len) || - max_out < max_len) { - return 0; - } - - size_t i, bytes_out = 0; - for (i = 0; i < in_len; i += 4) { - size_t num_bytes_resulting; - - if (!base64_decode_quad(out, &num_bytes_resulting, &in[i])) { - return 0; - } - - bytes_out += num_bytes_resulting; - out += num_bytes_resulting; - if (num_bytes_resulting != 3 && i != in_len - 4) { - return 0; - } - } - - *out_len = bytes_out; - return 1; -} - -int EVP_DecodeBlock(uint8_t *dst, const uint8_t *src, size_t src_len) { - // Trim spaces and tabs from the beginning of the input. - while (src_len > 0) { - if (src[0] != ' ' && src[0] != '\t') { - break; - } - - src++; - src_len--; - } - - // Trim newlines, spaces and tabs from the end of the line. - while (src_len > 0) { - switch (src[src_len-1]) { - case ' ': - case '\t': - case '\r': - case '\n': - src_len--; - continue; - } - - break; - } - - size_t dst_len; - if (!EVP_DecodedLength(&dst_len, src_len) || - dst_len > INT_MAX || - !EVP_DecodeBase64(dst, &dst_len, dst_len, src, src_len)) { - return -1; - } - - // EVP_DecodeBlock does not take padding into account, so put the - // NULs back in... so the caller can strip them back out. - while (dst_len % 3 != 0) { - dst[dst_len++] = '\0'; - } - assert(dst_len <= INT_MAX); - - return (int)dst_len; -} diff --git a/third_party/boringssl/src/crypto/base64/base64.cc b/third_party/boringssl/src/crypto/base64/base64.cc new file mode 100644 index 00000000..4b202e17 --- /dev/null +++ b/third_party/boringssl/src/crypto/base64/base64.cc @@ -0,0 +1,433 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include + +#include "../internal.h" +#include "../mem_internal.h" + + +using namespace bssl; + +// constant_time_lt_args_8 behaves like |constant_time_lt_8| but takes |uint8_t| +// arguments for a slightly simpler implementation. +static uint8_t constant_time_lt_args_8(uint8_t a, uint8_t b) { + crypto_word_t aw = a; + crypto_word_t bw = b; + // |crypto_word_t| is larger than |uint8_t|, so |aw| and |bw| have the same + // MSB. |aw| < |bw| iff MSB(|aw| - |bw|) is 1. + return constant_time_msb_w(aw - bw); +} + +// constant_time_in_range_8 returns |CONSTTIME_TRUE_8| if |min| <= |a| <= |max| +// and |CONSTTIME_FALSE_8| otherwise. +static uint8_t constant_time_in_range_8(uint8_t a, uint8_t min, uint8_t max) { + a -= min; + return constant_time_lt_args_8(a, max - min + 1); +} + +// Encoding. + +static uint8_t conv_bin2ascii(uint8_t a) { + // Since PEM is sometimes used to carry private keys, we encode base64 data + // itself in constant-time. 
+ a &= 0x3f; + uint8_t ret = constant_time_select_8(constant_time_eq_8(a, 62), '+', '/'); + ret = + constant_time_select_8(constant_time_lt_args_8(a, 62), a - 52 + '0', ret); + ret = + constant_time_select_8(constant_time_lt_args_8(a, 52), a - 26 + 'a', ret); + ret = constant_time_select_8(constant_time_lt_args_8(a, 26), a + 'A', ret); + return ret; +} + +static_assert(sizeof(((EVP_ENCODE_CTX *)nullptr)->data) % 3 == 0, + "data length must be a multiple of base64 chunk size"); + +int EVP_EncodedLength(size_t *out_len, size_t len) { + if (len + 2 < len) { + return 0; + } + len += 2; + len /= 3; + + if (((len << 2) >> 2) != len) { + return 0; + } + len <<= 2; + + if (len + 1 < len) { + return 0; + } + len++; + + *out_len = len; + return 1; +} + +EVP_ENCODE_CTX *EVP_ENCODE_CTX_new() { return New(); } + +void EVP_ENCODE_CTX_free(EVP_ENCODE_CTX *ctx) { Delete(ctx); } + +void EVP_EncodeInit(EVP_ENCODE_CTX *ctx) { + OPENSSL_memset(ctx, 0, sizeof(EVP_ENCODE_CTX)); +} + +void EVP_EncodeUpdate(EVP_ENCODE_CTX *ctx, uint8_t *out, int *out_len, + const uint8_t *in, size_t in_len) { + size_t total = 0; + + *out_len = 0; + if (in_len == 0) { + return; + } + + assert(ctx->data_used < sizeof(ctx->data)); + + if (sizeof(ctx->data) - ctx->data_used > in_len) { + OPENSSL_memcpy(&ctx->data[ctx->data_used], in, in_len); + ctx->data_used += (unsigned)in_len; + return; + } + + if (ctx->data_used != 0) { + const size_t todo = sizeof(ctx->data) - ctx->data_used; + OPENSSL_memcpy(&ctx->data[ctx->data_used], in, todo); + in += todo; + in_len -= todo; + + size_t encoded = EVP_EncodeBlock(out, ctx->data, sizeof(ctx->data)); + ctx->data_used = 0; + + out += encoded; + *(out++) = '\n'; + *out = '\0'; + + total = encoded + 1; + } + + while (in_len >= sizeof(ctx->data)) { + size_t encoded = EVP_EncodeBlock(out, in, sizeof(ctx->data)); + in += sizeof(ctx->data); + in_len -= sizeof(ctx->data); + + out += encoded; + *(out++) = '\n'; + *out = '\0'; + + if (total + encoded + 1 < total) { + *out_len = 0; 
+ return; + } + + total += encoded + 1; + } + + if (in_len != 0) { + OPENSSL_memcpy(ctx->data, in, in_len); + } + + ctx->data_used = (unsigned)in_len; + + if (total > INT_MAX) { + // We cannot signal an error, but we can at least avoid making *out_len + // negative. + total = 0; + } + *out_len = (int)total; +} + +void EVP_EncodeFinal(EVP_ENCODE_CTX *ctx, uint8_t *out, int *out_len) { + if (ctx->data_used == 0) { + *out_len = 0; + return; + } + + size_t encoded = EVP_EncodeBlock(out, ctx->data, ctx->data_used); + out[encoded++] = '\n'; + out[encoded] = '\0'; + ctx->data_used = 0; + + // ctx->data_used is bounded by sizeof(ctx->data), so this does not + // overflow. + assert(encoded <= INT_MAX); + *out_len = (int)encoded; +} + +size_t EVP_EncodeBlock(uint8_t *dst, const uint8_t *src, size_t src_len) { + uint32_t l; + size_t remaining = src_len, ret = 0; + + while (remaining) { + if (remaining >= 3) { + l = (((uint32_t)src[0]) << 16L) | (((uint32_t)src[1]) << 8L) | src[2]; + *(dst++) = conv_bin2ascii(l >> 18L); + *(dst++) = conv_bin2ascii(l >> 12L); + *(dst++) = conv_bin2ascii(l >> 6L); + *(dst++) = conv_bin2ascii(l); + remaining -= 3; + } else { + l = ((uint32_t)src[0]) << 16L; + if (remaining == 2) { + l |= ((uint32_t)src[1] << 8L); + } + + *(dst++) = conv_bin2ascii(l >> 18L); + *(dst++) = conv_bin2ascii(l >> 12L); + *(dst++) = (remaining == 1) ? '=' : conv_bin2ascii(l >> 6L); + *(dst++) = '='; + remaining = 0; + } + ret += 4; + src += 3; + } + + *dst = '\0'; + return ret; +} + + +// Decoding. + +int EVP_DecodedLength(size_t *out_len, size_t len) { + if (len % 4 != 0) { + return 0; + } + + *out_len = (len / 4) * 3; + return 1; +} + +void EVP_DecodeInit(EVP_ENCODE_CTX *ctx) { + OPENSSL_memset(ctx, 0, sizeof(EVP_ENCODE_CTX)); +} + +static uint8_t base64_ascii_to_bin(uint8_t a) { + // Since PEM is sometimes used to carry private keys, we decode base64 data + // itself in constant-time. 
+ const uint8_t is_upper = constant_time_in_range_8(a, 'A', 'Z'); + const uint8_t is_lower = constant_time_in_range_8(a, 'a', 'z'); + const uint8_t is_digit = constant_time_in_range_8(a, '0', '9'); + const uint8_t is_plus = constant_time_eq_8(a, '+'); + const uint8_t is_slash = constant_time_eq_8(a, '/'); + const uint8_t is_equals = constant_time_eq_8(a, '='); + + uint8_t ret = 0; + ret |= is_upper & (a - 'A'); // [0,26) + ret |= is_lower & (a - 'a' + 26); // [26,52) + ret |= is_digit & (a - '0' + 52); // [52,62) + ret |= is_plus & 62; + ret |= is_slash & 63; + // Invalid inputs, 'A', and '=' have all been mapped to zero. Map invalid + // inputs to 0xff. Note '=' is padding and handled separately by the caller. + const uint8_t is_valid = + is_upper | is_lower | is_digit | is_plus | is_slash | is_equals; + ret |= ~is_valid; + return ret; +} + +// base64_decode_quad decodes a single “quad” (i.e. four characters) of base64 +// data and writes up to three bytes to |out|. It sets |*out_num_bytes| to the +// number of bytes written, which will be less than three if the quad ended +// with padding. It returns one on success or zero on error. +static int base64_decode_quad(uint8_t *out, size_t *out_num_bytes, + const uint8_t *in) { + const uint8_t a = base64_ascii_to_bin(in[0]); + const uint8_t b = base64_ascii_to_bin(in[1]); + const uint8_t c = base64_ascii_to_bin(in[2]); + const uint8_t d = base64_ascii_to_bin(in[3]); + if (a == 0xff || b == 0xff || c == 0xff || d == 0xff) { + return 0; + } + + const uint32_t v = ((uint32_t)a) << 18 | ((uint32_t)b) << 12 | + ((uint32_t)c) << 6 | (uint32_t)d; + + const unsigned padding_pattern = (in[0] == '=') << 3 | // + (in[1] == '=') << 2 | // + (in[2] == '=') << 1 | // + (in[3] == '='); + + // In presence of padding, the lowest bits of v are unused. Canonical encoding + // (RFC 4648, section 3.5) requires that these bits all be set to zero. 
Common + // PEM parsers accept noncanonical base64, adding to the malleability of the + // format. This decoder follows OpenSSL's and Go's PEM parsers and accepts it. + switch (padding_pattern) { + case 0: + // The common case of no padding. + *out_num_bytes = 3; + out[0] = v >> 16; + out[1] = v >> 8; + out[2] = v; + break; + + case 1: // xxx= + *out_num_bytes = 2; + out[0] = v >> 16; + out[1] = v >> 8; + break; + + case 3: // xx== + *out_num_bytes = 1; + out[0] = v >> 16; + break; + + default: + return 0; + } + + return 1; +} + +int EVP_DecodeUpdate(EVP_ENCODE_CTX *ctx, uint8_t *out, int *out_len, + const uint8_t *in, size_t in_len) { + *out_len = 0; + + if (ctx->error_encountered) { + return -1; + } + + size_t bytes_out = 0, i; + for (i = 0; i < in_len; i++) { + const char c = in[i]; + switch (c) { + case ' ': + case '\t': + case '\r': + case '\n': + continue; + } + + if (ctx->eof_seen) { + ctx->error_encountered = 1; + return -1; + } + + ctx->data[ctx->data_used++] = c; + if (ctx->data_used == 4) { + size_t num_bytes_resulting; + if (!base64_decode_quad(out, &num_bytes_resulting, ctx->data)) { + ctx->error_encountered = 1; + return -1; + } + + ctx->data_used = 0; + bytes_out += num_bytes_resulting; + out += num_bytes_resulting; + + if (num_bytes_resulting < 3) { + ctx->eof_seen = 1; + } + } + } + + if (bytes_out > INT_MAX) { + ctx->error_encountered = 1; + *out_len = 0; + return -1; + } + *out_len = (int)bytes_out; + + if (ctx->eof_seen) { + return 0; + } + + return 1; +} + +int EVP_DecodeFinal(EVP_ENCODE_CTX *ctx, uint8_t *out, int *out_len) { + *out_len = 0; + if (ctx->error_encountered || ctx->data_used != 0) { + return -1; + } + + return 1; +} + +int EVP_DecodeBase64(uint8_t *out, size_t *out_len, size_t max_out, + const uint8_t *in, size_t in_len) { + *out_len = 0; + + if (in_len % 4 != 0) { + return 0; + } + + size_t max_len; + if (!EVP_DecodedLength(&max_len, in_len) || max_out < max_len) { + return 0; + } + + size_t i, bytes_out = 0; + for (i = 0; i < 
in_len; i += 4) { + size_t num_bytes_resulting; + + if (!base64_decode_quad(out, &num_bytes_resulting, &in[i])) { + return 0; + } + + bytes_out += num_bytes_resulting; + out += num_bytes_resulting; + if (num_bytes_resulting != 3 && i != in_len - 4) { + return 0; + } + } + + *out_len = bytes_out; + return 1; +} + +int EVP_DecodeBlock(uint8_t *dst, const uint8_t *src, size_t src_len) { + // Trim spaces and tabs from the beginning of the input. + while (src_len > 0) { + if (src[0] != ' ' && src[0] != '\t') { + break; + } + + src++; + src_len--; + } + + // Trim newlines, spaces and tabs from the end of the line. + while (src_len > 0) { + switch (src[src_len - 1]) { + case ' ': + case '\t': + case '\r': + case '\n': + src_len--; + continue; + } + + break; + } + + size_t dst_len; + if (!EVP_DecodedLength(&dst_len, src_len) || dst_len > INT_MAX || + !EVP_DecodeBase64(dst, &dst_len, dst_len, src, src_len)) { + return -1; + } + + // EVP_DecodeBlock does not take padding into account, so put the + // NULs back in... so the caller can strip them back out. + while (dst_len % 3 != 0) { + dst[dst_len++] = '\0'; + } + assert(dst_len <= INT_MAX); + + return (int)dst_len; +} diff --git a/third_party/boringssl/src/crypto/bcm_support.h b/third_party/boringssl/src/crypto/bcm_support.h new file mode 100644 index 00000000..20e6ceff --- /dev/null +++ b/third_party/boringssl/src/crypto/bcm_support.h @@ -0,0 +1,66 @@ +// Copyright 2024 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_CRYPTO_BCM_SUPPORT_H +#define OPENSSL_HEADER_CRYPTO_BCM_SUPPORT_H + +#include + +#include + + +BSSL_NAMESPACE_BEGIN + +// Provided by libcrypto, called from BCM + +// CRYPTO_init_sysrand initializes long-lived resources needed to draw entropy +// from the operating system, if the operating system requires initialization. +void CRYPTO_init_sysrand(); + +// CRYPTO_sysrand fills |len| bytes at |buf| with entropy from the operating +// system. +void CRYPTO_sysrand(uint8_t *buf, size_t len); + +// RAND_need_entropy is called whenever the BCM module has stopped because it +// has run out of entropy. +void RAND_need_entropy(size_t bytes_needed); + +// crypto_get_fork_generation returns the fork generation number for the current +// process, or zero if not supported on the platform. The fork generation number +// is a non-zero, strictly-monotonic counter with the property that, if queried +// in an address space and then again in a subsequently forked copy, the forked +// address space will observe a greater value. +// +// This function may be used to clear cached values across a fork. When +// initializing a cache, record the fork generation. Before using the cache, +// check if the fork generation has changed. If so, drop the cache and update +// the save fork generation. Note this logic transparently handles platforms +// which always return zero. +// +// This is not reliably supported on all platforms which implement |fork|, so it +// should only be used as a hardening measure. +OPENSSL_EXPORT uint64_t CRYPTO_get_fork_generation(); + +// CRYPTO_fork_detect_force_madv_wipeonfork_for_testing is an internal detail +// used for testing purposes. +OPENSSL_EXPORT void CRYPTO_fork_detect_force_madv_wipeonfork_for_testing( + int on); + +// CRYPTO_get_stderr returns stderr. This function exists to avoid BCM needing +// a data dependency on libc. 
+FILE *CRYPTO_get_stderr(); + +BSSL_NAMESPACE_END + +#endif // OPENSSL_HEADER_CRYPTO_BCM_SUPPORT_H diff --git a/third_party/boringssl/src/crypto/bio/bio.c b/third_party/boringssl/src/crypto/bio/bio.c deleted file mode 100644 index 3d36e28d..00000000 --- a/third_party/boringssl/src/crypto/bio/bio.c +++ /dev/null @@ -1,702 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. 
All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
*/ - -#include - -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "../internal.h" - - -BIO *BIO_new(const BIO_METHOD *method) { - BIO *ret = OPENSSL_malloc(sizeof(BIO)); - if (ret == NULL) { - OPENSSL_PUT_ERROR(BIO, ERR_R_MALLOC_FAILURE); - return NULL; - } - - OPENSSL_memset(ret, 0, sizeof(BIO)); - ret->method = method; - ret->shutdown = 1; - ret->references = 1; - - if (method->create != NULL && !method->create(ret)) { - OPENSSL_free(ret); - return NULL; - } - - return ret; -} - -int BIO_free(BIO *bio) { - BIO *next_bio; - - for (; bio != NULL; bio = next_bio) { - if (!CRYPTO_refcount_dec_and_test_zero(&bio->references)) { - return 0; - } - - next_bio = BIO_pop(bio); - - if (bio->method != NULL && bio->method->destroy != NULL) { - bio->method->destroy(bio); - } - - OPENSSL_free(bio); - } - return 1; -} - -int BIO_up_ref(BIO *bio) { - CRYPTO_refcount_inc(&bio->references); - return 1; -} - -void BIO_vfree(BIO *bio) { - BIO_free(bio); -} - -void BIO_free_all(BIO *bio) { - BIO_free(bio); -} - -int BIO_read(BIO *bio, void *buf, int len) { - if (bio == NULL || bio->method == NULL || bio->method->bread == NULL) { - OPENSSL_PUT_ERROR(BIO, BIO_R_UNSUPPORTED_METHOD); - return -2; - } - if (!bio->init) { - OPENSSL_PUT_ERROR(BIO, BIO_R_UNINITIALIZED); - return -2; - } - if (len <= 0) { - return 0; - } - int ret = bio->method->bread(bio, buf, len); - if (ret > 0) { - bio->num_read += ret; - } - return ret; -} - -int BIO_gets(BIO *bio, char *buf, int len) { - if (bio == NULL || bio->method == NULL || bio->method->bgets == NULL) { - OPENSSL_PUT_ERROR(BIO, BIO_R_UNSUPPORTED_METHOD); - return -2; - } - if (!bio->init) { - OPENSSL_PUT_ERROR(BIO, BIO_R_UNINITIALIZED); - return -2; - } - if (len <= 0) { - return 0; - } - int ret = bio->method->bgets(bio, buf, len); - if (ret > 0) { - bio->num_read += ret; - } - return ret; -} - -int BIO_write(BIO *bio, const void *in, int inl) { - if (bio == NULL || bio->method == NULL || 
bio->method->bwrite == NULL) { - OPENSSL_PUT_ERROR(BIO, BIO_R_UNSUPPORTED_METHOD); - return -2; - } - if (!bio->init) { - OPENSSL_PUT_ERROR(BIO, BIO_R_UNINITIALIZED); - return -2; - } - if (inl <= 0) { - return 0; - } - int ret = bio->method->bwrite(bio, in, inl); - if (ret > 0) { - bio->num_write += ret; - } - return ret; -} - -int BIO_write_all(BIO *bio, const void *data, size_t len) { - const uint8_t *data_u8 = data; - while (len > 0) { - int ret = BIO_write(bio, data_u8, len > INT_MAX ? INT_MAX : (int)len); - if (ret <= 0) { - return 0; - } - data_u8 += ret; - len -= ret; - } - return 1; -} - -int BIO_puts(BIO *bio, const char *in) { - return BIO_write(bio, in, strlen(in)); -} - -int BIO_flush(BIO *bio) { - return BIO_ctrl(bio, BIO_CTRL_FLUSH, 0, NULL); -} - -long BIO_ctrl(BIO *bio, int cmd, long larg, void *parg) { - if (bio == NULL) { - return 0; - } - - if (bio->method == NULL || bio->method->ctrl == NULL) { - OPENSSL_PUT_ERROR(BIO, BIO_R_UNSUPPORTED_METHOD); - return -2; - } - - return bio->method->ctrl(bio, cmd, larg, parg); -} - -char *BIO_ptr_ctrl(BIO *b, int cmd, long larg) { - char *p = NULL; - - if (BIO_ctrl(b, cmd, larg, (void *)&p) <= 0) { - return NULL; - } - - return p; -} - -long BIO_int_ctrl(BIO *b, int cmd, long larg, int iarg) { - int i = iarg; - - return BIO_ctrl(b, cmd, larg, (void *)&i); -} - -int BIO_reset(BIO *bio) { - return BIO_ctrl(bio, BIO_CTRL_RESET, 0, NULL); -} - -int BIO_eof(BIO *bio) { - return BIO_ctrl(bio, BIO_CTRL_EOF, 0, NULL); -} - -void BIO_set_flags(BIO *bio, int flags) { - bio->flags |= flags; -} - -int BIO_test_flags(const BIO *bio, int flags) { - return bio->flags & flags; -} - -int BIO_should_read(const BIO *bio) { - return BIO_test_flags(bio, BIO_FLAGS_READ); -} - -int BIO_should_write(const BIO *bio) { - return BIO_test_flags(bio, BIO_FLAGS_WRITE); -} - -int BIO_should_retry(const BIO *bio) { - return BIO_test_flags(bio, BIO_FLAGS_SHOULD_RETRY); -} - -int BIO_should_io_special(const BIO *bio) { - return 
BIO_test_flags(bio, BIO_FLAGS_IO_SPECIAL); -} - -int BIO_get_retry_reason(const BIO *bio) { return bio->retry_reason; } - -void BIO_set_retry_reason(BIO *bio, int reason) { bio->retry_reason = reason; } - -void BIO_clear_flags(BIO *bio, int flags) { - bio->flags &= ~flags; -} - -void BIO_set_retry_read(BIO *bio) { - bio->flags |= BIO_FLAGS_READ | BIO_FLAGS_SHOULD_RETRY; -} - -void BIO_set_retry_write(BIO *bio) { - bio->flags |= BIO_FLAGS_WRITE | BIO_FLAGS_SHOULD_RETRY; -} - -static const int kRetryFlags = BIO_FLAGS_RWS | BIO_FLAGS_SHOULD_RETRY; - -int BIO_get_retry_flags(BIO *bio) { - return bio->flags & kRetryFlags; -} - -void BIO_clear_retry_flags(BIO *bio) { - bio->flags &= ~kRetryFlags; - bio->retry_reason = 0; -} - -int BIO_method_type(const BIO *bio) { return bio->method->type; } - -void BIO_copy_next_retry(BIO *bio) { - BIO_clear_retry_flags(bio); - BIO_set_flags(bio, BIO_get_retry_flags(bio->next_bio)); - bio->retry_reason = bio->next_bio->retry_reason; -} - -long BIO_callback_ctrl(BIO *bio, int cmd, bio_info_cb fp) { - if (bio == NULL) { - return 0; - } - - if (bio->method == NULL || bio->method->callback_ctrl == NULL) { - OPENSSL_PUT_ERROR(BIO, BIO_R_UNSUPPORTED_METHOD); - return 0; - } - - return bio->method->callback_ctrl(bio, cmd, fp); -} - -size_t BIO_pending(const BIO *bio) { - const long r = BIO_ctrl((BIO *) bio, BIO_CTRL_PENDING, 0, NULL); - assert(r >= 0); - - if (r < 0) { - return 0; - } - return r; -} - -size_t BIO_ctrl_pending(const BIO *bio) { - return BIO_pending(bio); -} - -size_t BIO_wpending(const BIO *bio) { - const long r = BIO_ctrl((BIO *) bio, BIO_CTRL_WPENDING, 0, NULL); - assert(r >= 0); - - if (r < 0) { - return 0; - } - return r; -} - -int BIO_set_close(BIO *bio, int close_flag) { - return BIO_ctrl(bio, BIO_CTRL_SET_CLOSE, close_flag, NULL); -} - -OPENSSL_EXPORT size_t BIO_number_read(const BIO *bio) { - return bio->num_read; -} - -OPENSSL_EXPORT size_t BIO_number_written(const BIO *bio) { - return bio->num_write; -} - -BIO 
*BIO_push(BIO *bio, BIO *appended_bio) { - BIO *last_bio; - - if (bio == NULL) { - return bio; - } - - last_bio = bio; - while (last_bio->next_bio != NULL) { - last_bio = last_bio->next_bio; - } - - last_bio->next_bio = appended_bio; - return bio; -} - -BIO *BIO_pop(BIO *bio) { - BIO *ret; - - if (bio == NULL) { - return NULL; - } - ret = bio->next_bio; - bio->next_bio = NULL; - return ret; -} - -BIO *BIO_next(BIO *bio) { - if (!bio) { - return NULL; - } - return bio->next_bio; -} - -BIO *BIO_find_type(BIO *bio, int type) { - int method_type, mask; - - if (!bio) { - return NULL; - } - mask = type & 0xff; - - do { - if (bio->method != NULL) { - method_type = bio->method->type; - - if (!mask) { - if (method_type & type) { - return bio; - } - } else if (method_type == type) { - return bio; - } - } - bio = bio->next_bio; - } while (bio != NULL); - - return NULL; -} - -int BIO_indent(BIO *bio, unsigned indent, unsigned max_indent) { - if (indent > max_indent) { - indent = max_indent; - } - - while (indent--) { - if (BIO_puts(bio, " ") != 1) { - return 0; - } - } - return 1; -} - -static int print_bio(const char *str, size_t len, void *bio) { - return BIO_write((BIO *)bio, str, len); -} - -void ERR_print_errors(BIO *bio) { - ERR_print_errors_cb(print_bio, bio); -} - -// bio_read_all reads everything from |bio| and prepends |prefix| to it. On -// success, |*out| is set to an allocated buffer (which should be freed with -// |OPENSSL_free|), |*out_len| is set to its length and one is returned. The -// buffer will contain |prefix| followed by the contents of |bio|. On failure, -// zero is returned. -// -// The function will fail if the size of the output would equal or exceed -// |max_len|. 
-static int bio_read_all(BIO *bio, uint8_t **out, size_t *out_len, - const uint8_t *prefix, size_t prefix_len, - size_t max_len) { - static const size_t kChunkSize = 4096; - - size_t len = prefix_len + kChunkSize; - if (len > max_len) { - len = max_len; - } - if (len < prefix_len) { - return 0; - } - *out = OPENSSL_malloc(len); - if (*out == NULL) { - return 0; - } - OPENSSL_memcpy(*out, prefix, prefix_len); - size_t done = prefix_len; - - for (;;) { - if (done == len) { - OPENSSL_free(*out); - return 0; - } - const size_t todo = len - done; - assert(todo < INT_MAX); - const int n = BIO_read(bio, *out + done, todo); - if (n == 0) { - *out_len = done; - return 1; - } else if (n == -1) { - OPENSSL_free(*out); - return 0; - } - - done += n; - if (len < max_len && len - done < kChunkSize / 2) { - len += kChunkSize; - if (len < kChunkSize || len > max_len) { - len = max_len; - } - uint8_t *new_buf = OPENSSL_realloc(*out, len); - if (new_buf == NULL) { - OPENSSL_free(*out); - return 0; - } - *out = new_buf; - } - } -} - -// bio_read_full reads |len| bytes |bio| and writes them into |out|. It -// tolerates partial reads from |bio| and returns one on success or zero if a -// read fails before |len| bytes are read. On failure, it additionally sets -// |*out_eof_on_first_read| to whether the error was due to |bio| returning zero -// on the first read. |out_eof_on_first_read| may be NULL to discard the value. -static int bio_read_full(BIO *bio, uint8_t *out, int *out_eof_on_first_read, - size_t len) { - int first_read = 1; - while (len > 0) { - int todo = len <= INT_MAX ? (int)len : INT_MAX; - int ret = BIO_read(bio, out, todo); - if (ret <= 0) { - if (out_eof_on_first_read != NULL) { - *out_eof_on_first_read = first_read && ret == 0; - } - return 0; - } - out += ret; - len -= (size_t)ret; - first_read = 0; - } - - return 1; -} - -// For compatibility with existing |d2i_*_bio| callers, |BIO_read_asn1| uses -// |ERR_LIB_ASN1| errors. 
-OPENSSL_DECLARE_ERROR_REASON(ASN1, ASN1_R_DECODE_ERROR) -OPENSSL_DECLARE_ERROR_REASON(ASN1, ASN1_R_HEADER_TOO_LONG) -OPENSSL_DECLARE_ERROR_REASON(ASN1, ASN1_R_NOT_ENOUGH_DATA) -OPENSSL_DECLARE_ERROR_REASON(ASN1, ASN1_R_TOO_LONG) - -int BIO_read_asn1(BIO *bio, uint8_t **out, size_t *out_len, size_t max_len) { - uint8_t header[6]; - - static const size_t kInitialHeaderLen = 2; - int eof_on_first_read; - if (!bio_read_full(bio, header, &eof_on_first_read, kInitialHeaderLen)) { - if (eof_on_first_read) { - // Historically, OpenSSL returned |ASN1_R_HEADER_TOO_LONG| when - // |d2i_*_bio| could not read anything. CPython conditions on this to - // determine if |bio| was empty. - OPENSSL_PUT_ERROR(ASN1, ASN1_R_HEADER_TOO_LONG); - } else { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_NOT_ENOUGH_DATA); - } - return 0; - } - - const uint8_t tag = header[0]; - const uint8_t length_byte = header[1]; - - if ((tag & 0x1f) == 0x1f) { - // Long form tags are not supported. - OPENSSL_PUT_ERROR(ASN1, ASN1_R_DECODE_ERROR); - return 0; - } - - size_t len, header_len; - if ((length_byte & 0x80) == 0) { - // Short form length. - len = length_byte; - header_len = kInitialHeaderLen; - } else { - const size_t num_bytes = length_byte & 0x7f; - - if ((tag & 0x20 /* constructed */) != 0 && num_bytes == 0) { - // indefinite length. - if (!bio_read_all(bio, out, out_len, header, kInitialHeaderLen, - max_len)) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_NOT_ENOUGH_DATA); - return 0; - } - return 1; - } - - if (num_bytes == 0 || num_bytes > 4) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_DECODE_ERROR); - return 0; - } - - if (!bio_read_full(bio, header + kInitialHeaderLen, NULL, num_bytes)) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_NOT_ENOUGH_DATA); - return 0; - } - header_len = kInitialHeaderLen + num_bytes; - - uint32_t len32 = 0; - for (unsigned i = 0; i < num_bytes; i++) { - len32 <<= 8; - len32 |= header[kInitialHeaderLen + i]; - } - - if (len32 < 128) { - // Length should have used short-form encoding. 
- OPENSSL_PUT_ERROR(ASN1, ASN1_R_DECODE_ERROR); - return 0; - } - - if ((len32 >> ((num_bytes-1)*8)) == 0) { - // Length should have been at least one byte shorter. - OPENSSL_PUT_ERROR(ASN1, ASN1_R_DECODE_ERROR); - return 0; - } - - len = len32; - } - - if (len + header_len < len || - len + header_len > max_len || - len > INT_MAX) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_TOO_LONG); - return 0; - } - len += header_len; - *out_len = len; - - *out = OPENSSL_malloc(len); - if (*out == NULL) { - OPENSSL_PUT_ERROR(ASN1, ERR_R_MALLOC_FAILURE); - return 0; - } - OPENSSL_memcpy(*out, header, header_len); - if (!bio_read_full(bio, (*out) + header_len, NULL, len - header_len)) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_NOT_ENOUGH_DATA); - OPENSSL_free(*out); - return 0; - } - - return 1; -} - -void BIO_set_retry_special(BIO *bio) { - bio->flags |= BIO_FLAGS_READ | BIO_FLAGS_IO_SPECIAL; -} - -int BIO_set_write_buffer_size(BIO *bio, int buffer_size) { return 0; } - -static struct CRYPTO_STATIC_MUTEX g_index_lock = CRYPTO_STATIC_MUTEX_INIT; -static int g_index = BIO_TYPE_START; - -int BIO_get_new_index(void) { - CRYPTO_STATIC_MUTEX_lock_write(&g_index_lock); - // If |g_index| exceeds 255, it will collide with the flags bits. - int ret = g_index > 255 ? 
-1 : g_index++; - CRYPTO_STATIC_MUTEX_unlock_write(&g_index_lock); - return ret; -} - -BIO_METHOD *BIO_meth_new(int type, const char *name) { - BIO_METHOD *method = OPENSSL_malloc(sizeof(BIO_METHOD)); - if (method == NULL) { - return NULL; - } - OPENSSL_memset(method, 0, sizeof(BIO_METHOD)); - method->type = type; - method->name = name; - return method; -} - -void BIO_meth_free(BIO_METHOD *method) { - OPENSSL_free(method); -} - -int BIO_meth_set_create(BIO_METHOD *method, - int (*create)(BIO *)) { - method->create = create; - return 1; -} - -int BIO_meth_set_destroy(BIO_METHOD *method, - int (*destroy)(BIO *)) { - method->destroy = destroy; - return 1; -} - -int BIO_meth_set_write(BIO_METHOD *method, - int (*write)(BIO *, const char *, int)) { - method->bwrite = write; - return 1; -} - -int BIO_meth_set_read(BIO_METHOD *method, - int (*read)(BIO *, char *, int)) { - method->bread = read; - return 1; -} - -int BIO_meth_set_gets(BIO_METHOD *method, - int (*gets)(BIO *, char *, int)) { - method->bgets = gets; - return 1; -} - -int BIO_meth_set_ctrl(BIO_METHOD *method, - long (*ctrl)(BIO *, int, long, void *)) { - method->ctrl = ctrl; - return 1; -} - -void BIO_set_data(BIO *bio, void *ptr) { bio->ptr = ptr; } - -void *BIO_get_data(BIO *bio) { return bio->ptr; } - -void BIO_set_init(BIO *bio, int init) { bio->init = init; } - -int BIO_get_init(BIO *bio) { return bio->init; } - -void BIO_set_shutdown(BIO *bio, int shutdown) { bio->shutdown = shutdown; } - -int BIO_get_shutdown(BIO *bio) { return bio->shutdown; } - -int BIO_meth_set_puts(BIO_METHOD *method, int (*puts)(BIO *, const char *)) { - // Ignore the parameter. We implement |BIO_puts| using |BIO_write|. - return 1; -} diff --git a/third_party/boringssl/src/crypto/bio/bio.cc b/third_party/boringssl/src/crypto/bio/bio.cc new file mode 100644 index 00000000..9250be0f --- /dev/null +++ b/third_party/boringssl/src/crypto/bio/bio.cc @@ -0,0 +1,673 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include "../internal.h" +#include "../mem_internal.h" +#include "internal.h" + + +using namespace bssl; + +static ExDataClass g_ex_data_class(/*with_app_data=*/true); + +Bio::Bio(const BIO_METHOD *m) : RefCounted(CheckSubClass()), method(m) { + CRYPTO_new_ex_data(&ex_data); +} + +BIO *BIO_new(const BIO_METHOD *method) { + UniquePtr ret(New(method)); + if (ret == nullptr) { + return nullptr; + } + + if (method->create != nullptr && !method->create(ret.get())) { + return nullptr; + } + + return ret.release(); +} + +Bio::~Bio() { + BIO *next = BIO_pop(this); + if (method->destroy != nullptr) { + method->destroy(this); + } + CRYPTO_free_ex_data(&g_ex_data_class, &ex_data); + BIO_free(next); +} + +int BIO_free(BIO *bio) { + if (bio == nullptr) { + return 1; + } + return FromOpaque(bio)->DecRefInternal(); +} + +int BIO_up_ref(BIO *bio) { + FromOpaque(bio)->UpRefInternal(); + return 1; +} + +void BIO_vfree(BIO *bio) { BIO_free(bio); } + +void BIO_free_all(BIO *bio) { BIO_free(bio); } + +int BIO_read(BIO *bio, void *buf, int len) { + auto *impl = FromOpaque(bio); + if (impl == nullptr || impl->method->bread == nullptr) { + OPENSSL_PUT_ERROR(BIO, BIO_R_UNSUPPORTED_METHOD); + return -2; + } + if (!impl->init) { + OPENSSL_PUT_ERROR(BIO, BIO_R_UNINITIALIZED); + return -2; + } + if (len <= 0) { + 
return 0; + } + int ret = impl->method->bread(impl, reinterpret_cast(buf), len); + if (ret > 0) { + impl->num_read += ret; + } + return ret; +} + +int BIO_gets(BIO *bio, char *buf, int len) { + auto *impl = FromOpaque(bio); + if (impl == nullptr || impl->method->bgets == nullptr) { + OPENSSL_PUT_ERROR(BIO, BIO_R_UNSUPPORTED_METHOD); + return -2; + } + if (!impl->init) { + OPENSSL_PUT_ERROR(BIO, BIO_R_UNINITIALIZED); + return -2; + } + if (len <= 0) { + return 0; + } + int ret = impl->method->bgets(impl, buf, len); + if (ret > 0) { + impl->num_read += ret; + } + return ret; +} + +int BIO_write(BIO *bio, const void *in, int inl) { + auto *impl = FromOpaque(bio); + if (impl == nullptr || impl->method->bwrite == nullptr) { + OPENSSL_PUT_ERROR(BIO, BIO_R_UNSUPPORTED_METHOD); + return -2; + } + if (!impl->init) { + OPENSSL_PUT_ERROR(BIO, BIO_R_UNINITIALIZED); + return -2; + } + if (inl <= 0) { + return 0; + } + int ret = impl->method->bwrite(impl, reinterpret_cast(in), inl); + if (ret > 0) { + impl->num_write += ret; + } + return ret; +} + +int BIO_write_all(BIO *bio, const void *data, size_t len) { + const uint8_t *data_u8 = reinterpret_cast(data); + while (len > 0) { + int ret = BIO_write(bio, data_u8, len > INT_MAX ? INT_MAX : (int)len); + if (ret <= 0) { + return 0; + } + data_u8 += ret; + len -= ret; + } + return 1; +} + +int BIO_puts(BIO *bio, const char *in) { + size_t len = strlen(in); + if (len > INT_MAX) { + // |BIO_write| and the return value both assume the string fits in |int|. 
+ OPENSSL_PUT_ERROR(BIO, ERR_R_OVERFLOW); + return -1; + } + return BIO_write(bio, in, (int)len); +} + +int BIO_flush(BIO *bio) { + return (int)BIO_ctrl(bio, BIO_CTRL_FLUSH, 0, nullptr); +} + +long BIO_ctrl(BIO *bio, int cmd, long larg, void *parg) { + auto *impl = FromOpaque(bio); + if (impl == nullptr) { + return 0; + } + + if (impl->method->ctrl == nullptr) { + OPENSSL_PUT_ERROR(BIO, BIO_R_UNSUPPORTED_METHOD); + return -2; + } + + return impl->method->ctrl(impl, cmd, larg, parg); +} + +char *BIO_ptr_ctrl(BIO *b, int cmd, long larg) { + char *p = nullptr; + + if (BIO_ctrl(b, cmd, larg, (void *)&p) <= 0) { + return nullptr; + } + + return p; +} + +long BIO_int_ctrl(BIO *b, int cmd, long larg, int iarg) { + int i = iarg; + + return BIO_ctrl(b, cmd, larg, (void *)&i); +} + +int BIO_reset(BIO *bio) { + return (int)BIO_ctrl(bio, BIO_CTRL_RESET, 0, nullptr); +} + +int BIO_eof(BIO *bio) { return (int)BIO_ctrl(bio, BIO_CTRL_EOF, 0, nullptr); } + +void BIO_set_flags(BIO *bio, int flags) { FromOpaque(bio)->flags |= flags; } + +int BIO_test_flags(const BIO *bio, int flags) { + return FromOpaque(bio)->flags & flags; +} + +int BIO_should_read(const BIO *bio) { + return BIO_test_flags(bio, BIO_FLAGS_READ); +} + +int BIO_should_write(const BIO *bio) { + return BIO_test_flags(bio, BIO_FLAGS_WRITE); +} + +int BIO_should_retry(const BIO *bio) { + return BIO_test_flags(bio, BIO_FLAGS_SHOULD_RETRY); +} + +int BIO_should_io_special(const BIO *bio) { + return BIO_test_flags(bio, BIO_FLAGS_IO_SPECIAL); +} + +int BIO_get_retry_reason(const BIO *bio) { + return FromOpaque(bio)->retry_reason; +} + +void BIO_set_retry_reason(BIO *bio, int reason) { + FromOpaque(bio)->retry_reason = reason; +} + +void BIO_clear_flags(BIO *bio, int flags) { FromOpaque(bio)->flags &= ~flags; } + +void BIO_set_retry_read(BIO *bio) { + FromOpaque(bio)->flags |= BIO_FLAGS_READ | BIO_FLAGS_SHOULD_RETRY; +} + +void BIO_set_retry_write(BIO *bio) { + FromOpaque(bio)->flags |= BIO_FLAGS_WRITE | 
BIO_FLAGS_SHOULD_RETRY; +} + +void BIO_set_retry_special(BIO *bio) { + FromOpaque(bio)->flags |= BIO_FLAGS_IO_SPECIAL | BIO_FLAGS_SHOULD_RETRY; +} + +static const int kRetryFlags = BIO_FLAGS_RWS | BIO_FLAGS_SHOULD_RETRY; + +int BIO_get_retry_flags(BIO *bio) { + return FromOpaque(bio)->flags & kRetryFlags; +} + +void BIO_clear_retry_flags(BIO *bio) { + auto *impl = FromOpaque(bio); + impl->flags &= ~kRetryFlags; + impl->retry_reason = 0; +} + +int BIO_method_type(const BIO *bio) { + return FromOpaque(bio)->method->type; +} + +void BIO_copy_next_retry(BIO *bio) { + auto *impl = FromOpaque(bio); + BIO_clear_retry_flags(impl); + BIO_set_flags(impl, BIO_get_retry_flags(impl->next_bio)); + impl->retry_reason = impl->next_bio->retry_reason; +} + +long BIO_callback_ctrl(BIO *bio, int cmd, BIO_info_cb *fp) { + auto *impl = FromOpaque(bio); + if (impl == nullptr) { + return 0; + } + + if (impl->method->callback_ctrl == nullptr) { + OPENSSL_PUT_ERROR(BIO, BIO_R_UNSUPPORTED_METHOD); + return 0; + } + + return impl->method->callback_ctrl(impl, cmd, fp); +} + +size_t BIO_pending(const BIO *bio) { + const long r = BIO_ctrl(const_cast(bio), BIO_CTRL_PENDING, 0, nullptr); + assert(r >= 0); + + if (r < 0) { + return 0; + } + return r; +} + +size_t BIO_ctrl_pending(const BIO *bio) { return BIO_pending(bio); } + +size_t BIO_wpending(const BIO *bio) { + const long r = + BIO_ctrl(const_cast(bio), BIO_CTRL_WPENDING, 0, nullptr); + assert(r >= 0); + + if (r < 0) { + return 0; + } + return r; +} + +int BIO_set_close(BIO *bio, int close_flag) { + return (int)BIO_ctrl(bio, BIO_CTRL_SET_CLOSE, close_flag, nullptr); +} + +uint64_t BIO_number_read(const BIO *bio) { return FromOpaque(bio)->num_read; } + +uint64_t BIO_number_written(const BIO *bio) { + return FromOpaque(bio)->num_write; +} + +BIO *BIO_push(BIO *bio, BIO *appended_bio) { + if (bio == nullptr) { + return bio; + } + + Bio *last_bio = FromOpaque(bio); + while (last_bio->next_bio != nullptr) { + last_bio = last_bio->next_bio; + } + + 
last_bio->next_bio = FromOpaque(appended_bio); + return bio; +} + +BIO *BIO_pop(BIO *bio) { + if (bio == nullptr) { + return nullptr; + } + return std::exchange(FromOpaque(bio)->next_bio, nullptr); +} + +BIO *BIO_next(BIO *bio) { + if (!bio) { + return nullptr; + } + return FromOpaque(bio)->next_bio; +} + +BIO *BIO_find_type(BIO *bio, int type) { + if (!bio) { + return nullptr; + } + + int mask = type & 0xff; + do { + int method_type = BIO_method_type(bio); + if (!mask) { + if (method_type & type) { + return bio; + } + } else if (method_type == type) { + return bio; + } + bio = BIO_next(bio); + } while (bio != nullptr); + + return nullptr; +} + +int BIO_indent(BIO *bio, unsigned indent, unsigned max_indent) { + if (indent > max_indent) { + indent = max_indent; + } + + while (indent--) { + if (BIO_puts(bio, " ") != 1) { + return 0; + } + } + return 1; +} + +static int print_bio(const char *str, size_t len, void *bio) { + return BIO_write_all((BIO *)bio, str, len); +} + +void ERR_print_errors(BIO *bio) { ERR_print_errors_cb(print_bio, bio); } + +// bio_read_all reads everything from |bio| and prepends |prefix| to it. On +// success, |*out| is set to an allocated buffer (which should be freed with +// |OPENSSL_free|), |*out_len| is set to its length and one is returned. The +// buffer will contain |prefix| followed by the contents of |bio|. On failure, +// zero is returned. +// +// The function will fail if the size of the output would equal or exceed +// |max_len|. 
+static int bio_read_all(Bio *bio, uint8_t **out, size_t *out_len, + const uint8_t *prefix, size_t prefix_len, + size_t max_len) { + static const size_t kChunkSize = 4096; + + size_t len = prefix_len + kChunkSize; + if (len > max_len) { + len = max_len; + } + if (len < prefix_len) { + return 0; + } + *out = reinterpret_cast(OPENSSL_malloc(len)); + if (*out == nullptr) { + return 0; + } + OPENSSL_memcpy(*out, prefix, prefix_len); + size_t done = prefix_len; + + for (;;) { + if (done == len) { + OPENSSL_free(*out); + return 0; + } + size_t todo = len - done; + if (todo > INT_MAX) { + todo = INT_MAX; + } + const int n = BIO_read(bio, *out + done, (int)todo); + if (n == 0) { + *out_len = done; + return 1; + } else if (n < 0) { + OPENSSL_free(*out); + return 0; + } + + done += n; + if (len < max_len && len - done < kChunkSize / 2) { + len += kChunkSize; + if (len < kChunkSize || len > max_len) { + len = max_len; + } + uint8_t *new_buf = + reinterpret_cast(OPENSSL_realloc(*out, len)); + if (new_buf == nullptr) { + OPENSSL_free(*out); + return 0; + } + *out = new_buf; + } + } +} + +// bio_read_full reads |len| bytes |bio| and writes them into |out|. It +// tolerates partial reads from |bio| and returns one on success or zero if a +// read fails before |len| bytes are read. On failure, it additionally sets +// |*out_eof_on_first_read| to whether the error was due to |bio| returning zero +// on the first read. |out_eof_on_first_read| may be NULL to discard the value. +static int bio_read_full(Bio *bio, uint8_t *out, int *out_eof_on_first_read, + size_t len) { + int first_read = 1; + while (len > 0) { + int todo = len <= INT_MAX ? 
(int)len : INT_MAX; + int ret = BIO_read(bio, out, todo); + if (ret <= 0) { + if (out_eof_on_first_read != nullptr) { + *out_eof_on_first_read = first_read && ret == 0; + } + return 0; + } + out += ret; + len -= (size_t)ret; + first_read = 0; + } + + return 1; +} + +// For compatibility with existing |d2i_*_bio| callers, |BIO_read_asn1| uses +// |ERR_LIB_ASN1| errors. +OPENSSL_DECLARE_ERROR_REASON(ASN1, ASN1_R_DECODE_ERROR) +OPENSSL_DECLARE_ERROR_REASON(ASN1, ASN1_R_HEADER_TOO_LONG) +OPENSSL_DECLARE_ERROR_REASON(ASN1, ASN1_R_NOT_ENOUGH_DATA) +OPENSSL_DECLARE_ERROR_REASON(ASN1, ASN1_R_TOO_LONG) + +int BIO_read_asn1(BIO *bio, uint8_t **out, size_t *out_len, size_t max_len) { + uint8_t header[6]; + + static const size_t kInitialHeaderLen = 2; + int eof_on_first_read; + auto *impl = FromOpaque(bio); + if (!bio_read_full(impl, header, &eof_on_first_read, kInitialHeaderLen)) { + if (eof_on_first_read) { + // Historically, OpenSSL returned |ASN1_R_HEADER_TOO_LONG| when + // |d2i_*_bio| could not read anything. CPython conditions on this to + // determine if |bio| was empty. + OPENSSL_PUT_ERROR(ASN1, ASN1_R_HEADER_TOO_LONG); + } else { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_NOT_ENOUGH_DATA); + } + return 0; + } + + const uint8_t tag = header[0]; + const uint8_t length_byte = header[1]; + + if ((tag & 0x1f) == 0x1f) { + // Long form tags are not supported. + OPENSSL_PUT_ERROR(ASN1, ASN1_R_DECODE_ERROR); + return 0; + } + + size_t len, header_len; + if ((length_byte & 0x80) == 0) { + // Short form length. + len = length_byte; + header_len = kInitialHeaderLen; + } else { + const size_t num_bytes = length_byte & 0x7f; + + if ((tag & 0x20 /* constructed */) != 0 && num_bytes == 0) { + // indefinite length. 
+ if (!bio_read_all(impl, out, out_len, header, kInitialHeaderLen, + max_len)) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_NOT_ENOUGH_DATA); + return 0; + } + return 1; + } + + if (num_bytes == 0 || num_bytes > 4) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_DECODE_ERROR); + return 0; + } + + if (!bio_read_full(impl, header + kInitialHeaderLen, nullptr, num_bytes)) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_NOT_ENOUGH_DATA); + return 0; + } + header_len = kInitialHeaderLen + num_bytes; + + uint32_t len32 = 0; + for (unsigned i = 0; i < num_bytes; i++) { + len32 <<= 8; + len32 |= header[kInitialHeaderLen + i]; + } + + if (len32 < 128) { + // Length should have used short-form encoding. + OPENSSL_PUT_ERROR(ASN1, ASN1_R_DECODE_ERROR); + return 0; + } + + if ((len32 >> ((num_bytes - 1) * 8)) == 0) { + // Length should have been at least one byte shorter. + OPENSSL_PUT_ERROR(ASN1, ASN1_R_DECODE_ERROR); + return 0; + } + + len = len32; + } + + if (len + header_len < len || len + header_len > max_len || len > INT_MAX) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_TOO_LONG); + return 0; + } + len += header_len; + *out_len = len; + + *out = reinterpret_cast(OPENSSL_malloc(len)); + if (*out == nullptr) { + return 0; + } + OPENSSL_memcpy(*out, header, header_len); + if (!bio_read_full(impl, (*out) + header_len, nullptr, len - header_len)) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_NOT_ENOUGH_DATA); + OPENSSL_free(*out); + return 0; + } + + return 1; +} + +int BIO_set_write_buffer_size(BIO *bio, int buffer_size) { return 0; } + +static StaticMutex g_index_lock; +static int g_index = BIO_TYPE_START; + +int BIO_get_new_index() { + MutexWriteLock lock(&g_index_lock); + // If |g_index| exceeds 255, it will collide with the flags bits. + int ret = g_index > 255 ? 
-1 : g_index++; + return ret; +} + +BIO_METHOD *BIO_meth_new(int type, const char *name) { + BIO_METHOD *method = New(); + if (method == nullptr) { + return nullptr; + } + method->type = type; + method->name = name; + return method; +} + +void BIO_meth_free(BIO_METHOD *method) { Delete(method); } + +int BIO_meth_set_create(BIO_METHOD *method, int (*create_func)(BIO *)) { + method->create = create_func; + return 1; +} + +int BIO_meth_set_destroy(BIO_METHOD *method, int (*destroy_func)(BIO *)) { + method->destroy = destroy_func; + return 1; +} + +int BIO_meth_set_write(BIO_METHOD *method, + int (*write_func)(BIO *, const char *, int)) { + method->bwrite = write_func; + return 1; +} + +int BIO_meth_set_read(BIO_METHOD *method, + int (*read_func)(BIO *, char *, int)) { + method->bread = read_func; + return 1; +} + +int BIO_meth_set_gets(BIO_METHOD *method, + int (*gets_func)(BIO *, char *, int)) { + method->bgets = gets_func; + return 1; +} + +int BIO_meth_set_ctrl(BIO_METHOD *method, + long (*ctrl_func)(BIO *, int, long, void *)) { + method->ctrl = ctrl_func; + return 1; +} + +int BIO_meth_set_callback_ctrl(BIO_METHOD *method, + long (*callback_ctrl_func)(BIO *, int, + BIO_info_cb *)) { + method->callback_ctrl = callback_ctrl_func; + return 1; +} + +void BIO_set_data(BIO *bio, void *ptr) { FromOpaque(bio)->ptr = ptr; } + +void *BIO_get_data(BIO *bio) { return FromOpaque(bio)->ptr; } + +void BIO_set_init(BIO *bio, int init) { FromOpaque(bio)->init = init; } + +int BIO_get_init(BIO *bio) { return FromOpaque(bio)->init; } + +void BIO_set_shutdown(BIO *bio, int shutdown) { + FromOpaque(bio)->shutdown = shutdown; +} + +int BIO_get_shutdown(BIO *bio) { return FromOpaque(bio)->shutdown; } + +int BIO_meth_set_puts(BIO_METHOD *method, int (*puts)(BIO *, const char *)) { + // Ignore the parameter. We implement |BIO_puts| using |BIO_write|. 
+ return 1; +} + +int BIO_get_ex_new_index(long argl, void *argp, // + CRYPTO_EX_unused *unused, // + CRYPTO_EX_dup *dup_unused, // + CRYPTO_EX_free *free_func) { + return CRYPTO_get_ex_new_index_ex(&g_ex_data_class, argl, argp, free_func); +} + +int BIO_set_ex_data(BIO *bio, int idx, void *data) { + return CRYPTO_set_ex_data(&FromOpaque(bio)->ex_data, idx, data); +} + +void *BIO_get_ex_data(const BIO *bio, int idx) { + return CRYPTO_get_ex_data(&FromOpaque(bio)->ex_data, idx); +} diff --git a/third_party/boringssl/src/crypto/bio/bio_mem.c b/third_party/boringssl/src/crypto/bio/bio_mem.c deleted file mode 100644 index f40a9a79..00000000 --- a/third_party/boringssl/src/crypto/bio/bio_mem.c +++ /dev/null @@ -1,324 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. 
Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
*/ - -#include - -#include -#include - -#include -#include -#include - -#include "../internal.h" - - -BIO *BIO_new_mem_buf(const void *buf, int len) { - BIO *ret; - BUF_MEM *b; - const size_t size = len < 0 ? strlen((char *)buf) : (size_t)len; - - if (!buf && len != 0) { - OPENSSL_PUT_ERROR(BIO, BIO_R_NULL_PARAMETER); - return NULL; - } - - ret = BIO_new(BIO_s_mem()); - if (ret == NULL) { - return NULL; - } - - b = (BUF_MEM *)ret->ptr; - // BIO_FLAGS_MEM_RDONLY ensures |b->data| is not written to. - b->data = (void *)buf; - b->length = size; - b->max = size; - - ret->flags |= BIO_FLAGS_MEM_RDONLY; - - // |num| is used to store the value that this BIO will return when it runs - // out of data. If it's negative then the retry flags will also be set. Since - // this is static data, retrying wont help - ret->num = 0; - - return ret; -} - -static int mem_new(BIO *bio) { - BUF_MEM *b; - - b = BUF_MEM_new(); - if (b == NULL) { - return 0; - } - - // |shutdown| is used to store the close flag: whether the BIO has ownership - // of the BUF_MEM. 
- bio->shutdown = 1; - bio->init = 1; - bio->num = -1; - bio->ptr = (char *)b; - - return 1; -} - -static int mem_free(BIO *bio) { - if (!bio->shutdown || !bio->init || bio->ptr == NULL) { - return 1; - } - - BUF_MEM *b = (BUF_MEM *)bio->ptr; - if (bio->flags & BIO_FLAGS_MEM_RDONLY) { - b->data = NULL; - } - BUF_MEM_free(b); - bio->ptr = NULL; - return 1; -} - -static int mem_read(BIO *bio, char *out, int outl) { - int ret; - BUF_MEM *b = (BUF_MEM*) bio->ptr; - - BIO_clear_retry_flags(bio); - ret = outl; - if (b->length < INT_MAX && ret > (int)b->length) { - ret = b->length; - } - - if (ret > 0) { - OPENSSL_memcpy(out, b->data, ret); - b->length -= ret; - if (bio->flags & BIO_FLAGS_MEM_RDONLY) { - b->data += ret; - } else { - OPENSSL_memmove(b->data, &b->data[ret], b->length); - } - } else if (b->length == 0) { - ret = bio->num; - if (ret != 0) { - BIO_set_retry_read(bio); - } - } - return ret; -} - -static int mem_write(BIO *bio, const char *in, int inl) { - int ret = -1; - int blen; - BUF_MEM *b; - - b = (BUF_MEM *)bio->ptr; - - if (bio->flags & BIO_FLAGS_MEM_RDONLY) { - OPENSSL_PUT_ERROR(BIO, BIO_R_WRITE_TO_READ_ONLY_BIO); - goto err; - } - - BIO_clear_retry_flags(bio); - blen = b->length; - if (INT_MAX - blen < inl) { - goto err; - } - if (BUF_MEM_grow_clean(b, blen + inl) != ((size_t) blen) + inl) { - goto err; - } - OPENSSL_memcpy(&b->data[blen], in, inl); - ret = inl; - -err: - return ret; -} - -static int mem_gets(BIO *bio, char *buf, int size) { - int i, j; - char *p; - BUF_MEM *b = (BUF_MEM *)bio->ptr; - - BIO_clear_retry_flags(bio); - j = b->length; - if (size - 1 < j) { - j = size - 1; - } - if (j <= 0) { - if (size > 0) { - *buf = 0; - } - return 0; - } - - p = b->data; - for (i = 0; i < j; i++) { - if (p[i] == '\n') { - i++; - break; - } - } - - // i is now the max num of bytes to copy, either j or up to and including the - // first newline - - i = mem_read(bio, buf, i); - if (i > 0) { - buf[i] = '\0'; - } - return i; -} - -static long mem_ctrl(BIO 
*bio, int cmd, long num, void *ptr) { - long ret = 1; - char **pptr; - - BUF_MEM *b = (BUF_MEM *)bio->ptr; - - switch (cmd) { - case BIO_CTRL_RESET: - if (b->data != NULL) { - // For read only case reset to the start again - if (bio->flags & BIO_FLAGS_MEM_RDONLY) { - b->data -= b->max - b->length; - b->length = b->max; - } else { - OPENSSL_memset(b->data, 0, b->max); - b->length = 0; - } - } - break; - case BIO_CTRL_EOF: - ret = (long)(b->length == 0); - break; - case BIO_C_SET_BUF_MEM_EOF_RETURN: - bio->num = (int)num; - break; - case BIO_CTRL_INFO: - ret = (long)b->length; - if (ptr != NULL) { - pptr = (char **)ptr; - *pptr = (char *)&b->data[0]; - } - break; - case BIO_C_SET_BUF_MEM: - mem_free(bio); - bio->shutdown = (int)num; - bio->ptr = ptr; - break; - case BIO_C_GET_BUF_MEM_PTR: - if (ptr != NULL) { - pptr = (char **)ptr; - *pptr = (char *)b; - } - break; - case BIO_CTRL_GET_CLOSE: - ret = (long)bio->shutdown; - break; - case BIO_CTRL_SET_CLOSE: - bio->shutdown = (int)num; - break; - - case BIO_CTRL_WPENDING: - ret = 0L; - break; - case BIO_CTRL_PENDING: - ret = (long)b->length; - break; - case BIO_CTRL_FLUSH: - ret = 1; - break; - default: - ret = 0; - break; - } - return ret; -} - -static const BIO_METHOD mem_method = { - BIO_TYPE_MEM, "memory buffer", - mem_write, mem_read, - NULL /* puts */, mem_gets, - mem_ctrl, mem_new, - mem_free, NULL /* callback_ctrl */, -}; - -const BIO_METHOD *BIO_s_mem(void) { return &mem_method; } - -int BIO_mem_contents(const BIO *bio, const uint8_t **out_contents, - size_t *out_len) { - const BUF_MEM *b; - if (bio->method != &mem_method) { - return 0; - } - - b = (BUF_MEM *)bio->ptr; - *out_contents = (uint8_t *)b->data; - *out_len = b->length; - return 1; -} - -long BIO_get_mem_data(BIO *bio, char **contents) { - return BIO_ctrl(bio, BIO_CTRL_INFO, 0, (char *) contents); -} - -int BIO_get_mem_ptr(BIO *bio, BUF_MEM **out) { - return BIO_ctrl(bio, BIO_C_GET_BUF_MEM_PTR, 0, (char *) out); -} - -int BIO_set_mem_buf(BIO *bio, 
BUF_MEM *b, int take_ownership) { - return BIO_ctrl(bio, BIO_C_SET_BUF_MEM, take_ownership, (char *) b); -} - -int BIO_set_mem_eof_return(BIO *bio, int eof_value) { - return BIO_ctrl(bio, BIO_C_SET_BUF_MEM_EOF_RETURN, eof_value, NULL); -} diff --git a/third_party/boringssl/src/crypto/bio/bio_mem.cc b/third_party/boringssl/src/crypto/bio/bio_mem.cc new file mode 100644 index 00000000..479b9ce2 --- /dev/null +++ b/third_party/boringssl/src/crypto/bio/bio_mem.cc @@ -0,0 +1,261 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + +#include +#include +#include + +#include "../internal.h" +#include "internal.h" + + +using namespace bssl; + +BIO *BIO_new_mem_buf(const void *buf, ossl_ssize_t len) { + BIO *ret; + BUF_MEM *b; + const size_t size = len < 0 ? strlen((char *)buf) : (size_t)len; + + if (!buf && len != 0) { + OPENSSL_PUT_ERROR(BIO, BIO_R_NULL_PARAMETER); + return nullptr; + } + + ret = BIO_new(BIO_s_mem()); + if (ret == nullptr) { + return nullptr; + } + + b = (BUF_MEM *)BIO_get_data(ret); + // BIO_FLAGS_MEM_RDONLY ensures |b->data| is not written to. + b->data = reinterpret_cast(const_cast(buf)); + b->length = size; + b->max = size; + + BIO_set_flags(ret, BIO_FLAGS_MEM_RDONLY); + + // |num| is used to store the value that this BIO will return when it runs + // out of data. If it's negative then the retry flags will also be set. 
Since + // this is static data, retrying won't help + FromOpaque(ret)->num = 0; + + return ret; +} + +static int mem_new(BIO *bio) { + BUF_MEM *b; + + b = BUF_MEM_new(); + if (b == nullptr) { + return 0; + } + + // |shutdown| is used to store the close flag: whether the BIO has ownership + // of the BUF_MEM. + BIO_set_shutdown(bio, 1); + BIO_set_init(bio, 1); + FromOpaque(bio)->num = -1; + BIO_set_data(bio, (char *)b); + + return 1; +} + +static int mem_free(BIO *bio) { + if (!BIO_get_shutdown(bio) || !BIO_get_init(bio) || + BIO_get_data(bio) == nullptr) { + return 1; + } + + BUF_MEM *b = (BUF_MEM *)BIO_get_data(bio); + if (BIO_test_flags(bio, BIO_FLAGS_MEM_RDONLY)) { + b->data = nullptr; + } + BUF_MEM_free(b); + BIO_set_data(bio, nullptr); + return 1; +} + +static int mem_read(BIO *bio, char *out, int outl) { + BIO_clear_retry_flags(bio); + if (outl <= 0) { + return 0; + } + + BUF_MEM *b = reinterpret_cast(BIO_get_data(bio)); + int ret = outl; + if ((size_t)ret > b->length) { + ret = (int)b->length; + } + + if (ret > 0) { + OPENSSL_memcpy(out, b->data, ret); + b->length -= ret; + if (BIO_test_flags(bio, BIO_FLAGS_MEM_RDONLY)) { + b->data += ret; + } else { + OPENSSL_memmove(b->data, &b->data[ret], b->length); + } + } else if (b->length == 0) { + ret = FromOpaque(bio)->num; + if (ret != 0) { + BIO_set_retry_read(bio); + } + } + return ret; +} + +static int mem_write(BIO *bio, const char *in, int inl) { + BIO_clear_retry_flags(bio); + if (inl <= 0) { + return 0; // Successfully write zero bytes. 
+ } + + if (BIO_test_flags(bio, BIO_FLAGS_MEM_RDONLY)) { + OPENSSL_PUT_ERROR(BIO, BIO_R_WRITE_TO_READ_ONLY_BIO); + return -1; + } + + BUF_MEM *b = reinterpret_cast(BIO_get_data(bio)); + if (!BUF_MEM_append(b, in, inl)) { + return -1; + } + + return inl; +} + +static int mem_gets(BIO *bio, char *buf, int size) { + BIO_clear_retry_flags(bio); + if (size <= 0) { + return 0; + } + + // The buffer size includes space for the trailing NUL, so we can read at most + // one fewer byte. + BUF_MEM *b = reinterpret_cast(BIO_get_data(bio)); + int ret = size - 1; + if ((size_t)ret > b->length) { + ret = (int)b->length; + } + + // Stop at the first newline. + const char *newline = + reinterpret_cast(OPENSSL_memchr(b->data, '\n', ret)); + if (newline != nullptr) { + ret = (int)(newline - b->data + 1); + } + + ret = mem_read(bio, buf, ret); + if (ret >= 0) { + buf[ret] = '\0'; + } + return ret; +} + +static long mem_ctrl(BIO *bio, int cmd, long num, void *ptr) { + BUF_MEM *b = static_cast(BIO_get_data(bio)); + switch (cmd) { + case BIO_CTRL_RESET: + if (b->data != nullptr) { + // For read only case reset to the start again + if (BIO_test_flags(bio, BIO_FLAGS_MEM_RDONLY)) { + b->data -= b->max - b->length; + b->length = b->max; + } else { + OPENSSL_memset(b->data, 0, b->max); + b->length = 0; + } + } + return 1; + case BIO_CTRL_EOF: + return b->length == 0; + case BIO_C_SET_BUF_MEM_EOF_RETURN: + FromOpaque(bio)->num = static_cast(num); + return 1; + case BIO_CTRL_INFO: + if (ptr != nullptr) { + char **out = reinterpret_cast(ptr); + *out = b->data; + } + // This API can overflow on 64-bit Windows, where |long| is smaller than + // |ptrdiff_t|. |BIO_mem_contents| is the overflow-safe API. 
+ return static_cast(b->length); + case BIO_C_SET_BUF_MEM: + mem_free(bio); + BIO_set_shutdown(bio, static_cast(num)); + BIO_set_data(bio, ptr); + return 1; + case BIO_C_GET_BUF_MEM_PTR: + if (ptr != nullptr) { + BUF_MEM **out = reinterpret_cast(ptr); + *out = b; + } + return 1; + case BIO_CTRL_GET_CLOSE: + return BIO_get_shutdown(bio); + case BIO_CTRL_SET_CLOSE: + BIO_set_shutdown(bio, static_cast(num)); + return 1; + case BIO_CTRL_WPENDING: + return 0; + case BIO_CTRL_PENDING: + // TODO(crbug.com/412584975): This can overflow on 64-bit Windows. + return static_cast(b->length); + case BIO_CTRL_FLUSH: + return 1; + default: + return 0; + } +} + +static const BIO_METHOD mem_method = { + BIO_TYPE_MEM, "memory buffer", mem_write, + mem_read, mem_gets, mem_ctrl, + mem_new, mem_free, /*callback_ctrl=*/nullptr, +}; + +const BIO_METHOD *BIO_s_mem() { return &mem_method; } + +int BIO_mem_contents(const BIO *bio, const uint8_t **out_contents, + size_t *out_len) { + const BUF_MEM *b; + if (FromOpaque(bio)->method != &mem_method) { + return 0; + } + + b = (BUF_MEM *)BIO_get_data((BIO *)bio); + *out_contents = (uint8_t *)b->data; + *out_len = b->length; + return 1; +} + +long BIO_get_mem_data(BIO *bio, char **contents) { + return BIO_ctrl(bio, BIO_CTRL_INFO, 0, contents); +} + +int BIO_get_mem_ptr(BIO *bio, BUF_MEM **out) { + return (int)BIO_ctrl(bio, BIO_C_GET_BUF_MEM_PTR, 0, out); +} + +int BIO_set_mem_buf(BIO *bio, BUF_MEM *b, int take_ownership) { + return (int)BIO_ctrl(bio, BIO_C_SET_BUF_MEM, take_ownership, b); +} + +int BIO_set_mem_eof_return(BIO *bio, int eof_value) { + return (int)BIO_ctrl(bio, BIO_C_SET_BUF_MEM_EOF_RETURN, eof_value, nullptr); +} diff --git a/third_party/boringssl/src/crypto/bio/connect.c b/third_party/boringssl/src/crypto/bio/connect.c deleted file mode 100644 index 9b86e513..00000000 --- a/third_party/boringssl/src/crypto/bio/connect.c +++ /dev/null @@ -1,547 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights 
reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. 
If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#if !defined(OPENSSL_TRUSTY) - -#include -#include -#include - -#if !defined(OPENSSL_WINDOWS) -#include -#include -#include -#include -#else -OPENSSL_MSVC_PRAGMA(warning(push, 3)) -#include -#include -OPENSSL_MSVC_PRAGMA(warning(pop)) -#endif - -#include -#include - -#include "internal.h" -#include "../internal.h" - - -enum { - BIO_CONN_S_BEFORE, - BIO_CONN_S_BLOCKED_CONNECT, - BIO_CONN_S_OK, -}; - -typedef struct bio_connect_st { - int state; - - char *param_hostname; - char *param_port; - int nbio; - - unsigned short port; - - struct sockaddr_storage them; - socklen_t them_length; - - // the file descriptor is kept in bio->num in order to match the socket - // BIO. 
- - // info_callback is called when the connection is initially made - // callback(BIO,state,ret); The callback should return 'ret', state is for - // compatibility with the SSL info_callback. - int (*info_callback)(const BIO *bio, int state, int ret); -} BIO_CONNECT; - -#if !defined(OPENSSL_WINDOWS) -static int closesocket(int sock) { - return close(sock); -} -#endif - -// split_host_and_port sets |*out_host| and |*out_port| to the host and port -// parsed from |name|. It returns one on success or zero on error. Even when -// successful, |*out_port| may be NULL on return if no port was specified. -static int split_host_and_port(char **out_host, char **out_port, - const char *name) { - const char *host, *port = NULL; - size_t host_len = 0; - - *out_host = NULL; - *out_port = NULL; - - if (name[0] == '[') { // bracketed IPv6 address - const char *close = strchr(name, ']'); - if (close == NULL) { - return 0; - } - host = name + 1; - host_len = close - host; - if (close[1] == ':') { // [IP]:port - port = close + 2; - } else if (close[1] != 0) { - return 0; - } - } else { - const char *colon = strchr(name, ':'); - if (colon == NULL || strchr(colon + 1, ':') != NULL) { // IPv6 address - host = name; - host_len = strlen(name); - } else { // host:port - host = name; - host_len = colon - name; - port = colon + 1; - } - } - - *out_host = OPENSSL_strndup(host, host_len); - if (*out_host == NULL) { - return 0; - } - if (port == NULL) { - *out_port = NULL; - return 1; - } - *out_port = OPENSSL_strdup(port); - if (*out_port == NULL) { - OPENSSL_free(*out_host); - *out_host = NULL; - return 0; - } - return 1; -} - -static int conn_state(BIO *bio, BIO_CONNECT *c) { - int ret = -1, i; - int (*cb)(const BIO *, int, int) = NULL; - - if (c->info_callback != NULL) { - cb = c->info_callback; - } - - for (;;) { - switch (c->state) { - case BIO_CONN_S_BEFORE: - // If there's a hostname and a port, assume that both are - // exactly what they say. 
If there is only a hostname, try - // (just once) to split it into a hostname and port. - - if (c->param_hostname == NULL) { - OPENSSL_PUT_ERROR(BIO, BIO_R_NO_HOSTNAME_SPECIFIED); - goto exit_loop; - } - - if (c->param_port == NULL) { - char *host, *port; - if (!split_host_and_port(&host, &port, c->param_hostname) || - port == NULL) { - OPENSSL_free(host); - OPENSSL_free(port); - OPENSSL_PUT_ERROR(BIO, BIO_R_NO_PORT_SPECIFIED); - ERR_add_error_data(2, "host=", c->param_hostname); - goto exit_loop; - } - - OPENSSL_free(c->param_port); - c->param_port = port; - OPENSSL_free(c->param_hostname); - c->param_hostname = host; - } - - if (!bio_ip_and_port_to_socket_and_addr( - &bio->num, &c->them, &c->them_length, c->param_hostname, - c->param_port)) { - OPENSSL_PUT_ERROR(BIO, BIO_R_UNABLE_TO_CREATE_SOCKET); - ERR_add_error_data(4, "host=", c->param_hostname, ":", c->param_port); - goto exit_loop; - } - - if (c->nbio) { - if (!bio_socket_nbio(bio->num, 1)) { - OPENSSL_PUT_ERROR(BIO, BIO_R_ERROR_SETTING_NBIO); - ERR_add_error_data(4, "host=", c->param_hostname, ":", - c->param_port); - goto exit_loop; - } - } - - i = 1; - ret = setsockopt(bio->num, SOL_SOCKET, SO_KEEPALIVE, (char *)&i, - sizeof(i)); - if (ret < 0) { - OPENSSL_PUT_SYSTEM_ERROR(); - OPENSSL_PUT_ERROR(BIO, BIO_R_KEEPALIVE); - ERR_add_error_data(4, "host=", c->param_hostname, ":", c->param_port); - goto exit_loop; - } - - BIO_clear_retry_flags(bio); - ret = connect(bio->num, (struct sockaddr*) &c->them, c->them_length); - if (ret < 0) { - if (bio_fd_should_retry(ret)) { - BIO_set_flags(bio, (BIO_FLAGS_IO_SPECIAL | BIO_FLAGS_SHOULD_RETRY)); - c->state = BIO_CONN_S_BLOCKED_CONNECT; - bio->retry_reason = BIO_RR_CONNECT; - } else { - OPENSSL_PUT_SYSTEM_ERROR(); - OPENSSL_PUT_ERROR(BIO, BIO_R_CONNECT_ERROR); - ERR_add_error_data(4, "host=", c->param_hostname, ":", - c->param_port); - } - goto exit_loop; - } else { - c->state = BIO_CONN_S_OK; - } - break; - - case BIO_CONN_S_BLOCKED_CONNECT: - i = 
bio_sock_error(bio->num); - if (i) { - if (bio_fd_should_retry(ret)) { - BIO_set_flags(bio, (BIO_FLAGS_IO_SPECIAL | BIO_FLAGS_SHOULD_RETRY)); - c->state = BIO_CONN_S_BLOCKED_CONNECT; - bio->retry_reason = BIO_RR_CONNECT; - ret = -1; - } else { - BIO_clear_retry_flags(bio); - OPENSSL_PUT_SYSTEM_ERROR(); - OPENSSL_PUT_ERROR(BIO, BIO_R_NBIO_CONNECT_ERROR); - ERR_add_error_data(4, "host=", c->param_hostname, ":", c->param_port); - ret = 0; - } - goto exit_loop; - } else { - c->state = BIO_CONN_S_OK; - } - break; - - case BIO_CONN_S_OK: - ret = 1; - goto exit_loop; - default: - assert(0); - goto exit_loop; - } - - if (cb != NULL) { - ret = cb((BIO *)bio, c->state, ret); - if (ret == 0) { - goto end; - } - } - } - -exit_loop: - if (cb != NULL) { - ret = cb((BIO *)bio, c->state, ret); - } - -end: - return ret; -} - -static BIO_CONNECT *BIO_CONNECT_new(void) { - BIO_CONNECT *ret = OPENSSL_malloc(sizeof(BIO_CONNECT)); - - if (ret == NULL) { - return NULL; - } - OPENSSL_memset(ret, 0, sizeof(BIO_CONNECT)); - - ret->state = BIO_CONN_S_BEFORE; - return ret; -} - -static void BIO_CONNECT_free(BIO_CONNECT *c) { - if (c == NULL) { - return; - } - - OPENSSL_free(c->param_hostname); - OPENSSL_free(c->param_port); - OPENSSL_free(c); -} - -static int conn_new(BIO *bio) { - bio->init = 0; - bio->num = -1; - bio->flags = 0; - bio->ptr = BIO_CONNECT_new(); - return bio->ptr != NULL; -} - -static void conn_close_socket(BIO *bio) { - BIO_CONNECT *c = (BIO_CONNECT *) bio->ptr; - - if (bio->num == -1) { - return; - } - - // Only do a shutdown if things were established - if (c->state == BIO_CONN_S_OK) { - shutdown(bio->num, 2); - } - closesocket(bio->num); - bio->num = -1; -} - -static int conn_free(BIO *bio) { - if (bio->shutdown) { - conn_close_socket(bio); - } - - BIO_CONNECT_free((BIO_CONNECT*) bio->ptr); - - return 1; -} - -static int conn_read(BIO *bio, char *out, int out_len) { - int ret = 0; - BIO_CONNECT *data; - - data = (BIO_CONNECT *)bio->ptr; - if (data->state != BIO_CONN_S_OK) 
{ - ret = conn_state(bio, data); - if (ret <= 0) { - return ret; - } - } - - bio_clear_socket_error(); - ret = recv(bio->num, out, out_len, 0); - BIO_clear_retry_flags(bio); - if (ret <= 0) { - if (bio_fd_should_retry(ret)) { - BIO_set_retry_read(bio); - } - } - - return ret; -} - -static int conn_write(BIO *bio, const char *in, int in_len) { - int ret; - BIO_CONNECT *data; - - data = (BIO_CONNECT *)bio->ptr; - if (data->state != BIO_CONN_S_OK) { - ret = conn_state(bio, data); - if (ret <= 0) { - return ret; - } - } - - bio_clear_socket_error(); - ret = send(bio->num, in, in_len, 0); - BIO_clear_retry_flags(bio); - if (ret <= 0) { - if (bio_fd_should_retry(ret)) { - BIO_set_retry_write(bio); - } - } - - return ret; -} - -static long conn_ctrl(BIO *bio, int cmd, long num, void *ptr) { - int *ip; - long ret = 1; - BIO_CONNECT *data; - - data = (BIO_CONNECT *)bio->ptr; - - switch (cmd) { - case BIO_CTRL_RESET: - ret = 0; - data->state = BIO_CONN_S_BEFORE; - conn_close_socket(bio); - bio->flags = 0; - break; - case BIO_C_DO_STATE_MACHINE: - // use this one to start the connection - if (data->state != BIO_CONN_S_OK) { - ret = (long)conn_state(bio, data); - } else { - ret = 1; - } - break; - case BIO_C_SET_CONNECT: - if (ptr != NULL) { - bio->init = 1; - if (num == 0) { - OPENSSL_free(data->param_hostname); - data->param_hostname = OPENSSL_strdup(ptr); - if (data->param_hostname == NULL) { - ret = 0; - } - } else if (num == 1) { - OPENSSL_free(data->param_port); - data->param_port = OPENSSL_strdup(ptr); - if (data->param_port == NULL) { - ret = 0; - } - } else { - ret = 0; - } - } - break; - case BIO_C_SET_NBIO: - data->nbio = (int)num; - break; - case BIO_C_GET_FD: - if (bio->init) { - ip = (int *)ptr; - if (ip != NULL) { - *ip = bio->num; - } - ret = bio->num; - } else { - ret = -1; - } - break; - case BIO_CTRL_GET_CLOSE: - ret = bio->shutdown; - break; - case BIO_CTRL_SET_CLOSE: - bio->shutdown = (int)num; - break; - case BIO_CTRL_PENDING: - case BIO_CTRL_WPENDING: - 
ret = 0; - break; - case BIO_CTRL_FLUSH: - break; - case BIO_CTRL_GET_CALLBACK: { - int (**fptr)(const BIO *bio, int state, int xret) = ptr; - *fptr = data->info_callback; - } break; - default: - ret = 0; - break; - } - return ret; -} - -static long conn_callback_ctrl(BIO *bio, int cmd, bio_info_cb fp) { - long ret = 1; - BIO_CONNECT *data; - - data = (BIO_CONNECT *)bio->ptr; - - switch (cmd) { - case BIO_CTRL_SET_CALLBACK: - // This is the actual type signature of |fp|. The caller is expected to - // cast it to |bio_info_cb| due to the |BIO_callback_ctrl| calling - // convention. - OPENSSL_MSVC_PRAGMA(warning(push)) - OPENSSL_MSVC_PRAGMA(warning(disable : 4191)) - data->info_callback = (int (*)(const struct bio_st *, int, int))fp; - OPENSSL_MSVC_PRAGMA(warning(pop)) - break; - default: - ret = 0; - break; - } - return ret; -} - -BIO *BIO_new_connect(const char *hostname) { - BIO *ret; - - ret = BIO_new(BIO_s_connect()); - if (ret == NULL) { - return NULL; - } - if (!BIO_set_conn_hostname(ret, hostname)) { - BIO_free(ret); - return NULL; - } - return ret; -} - -static const BIO_METHOD methods_connectp = { - BIO_TYPE_CONNECT, "socket connect", conn_write, conn_read, - NULL /* puts */, NULL /* gets */, conn_ctrl, conn_new, - conn_free, conn_callback_ctrl, -}; - -const BIO_METHOD *BIO_s_connect(void) { return &methods_connectp; } - -int BIO_set_conn_hostname(BIO *bio, const char *name) { - return BIO_ctrl(bio, BIO_C_SET_CONNECT, 0, (void*) name); -} - -int BIO_set_conn_port(BIO *bio, const char *port_str) { - return BIO_ctrl(bio, BIO_C_SET_CONNECT, 1, (void*) port_str); -} - -int BIO_set_conn_int_port(BIO *bio, const int *port) { - char buf[DECIMAL_SIZE(int) + 1]; - BIO_snprintf(buf, sizeof(buf), "%d", *port); - return BIO_set_conn_port(bio, buf); -} - -int BIO_set_nbio(BIO *bio, int on) { - return BIO_ctrl(bio, BIO_C_SET_NBIO, on, NULL); -} - -int BIO_do_connect(BIO *bio) { - return BIO_ctrl(bio, BIO_C_DO_STATE_MACHINE, 0, NULL); -} - -#endif // OPENSSL_TRUSTY diff 
--git a/third_party/boringssl/src/crypto/bio/connect.cc b/third_party/boringssl/src/crypto/bio/connect.cc new file mode 100644 index 00000000..7f4a17be --- /dev/null +++ b/third_party/boringssl/src/crypto/bio/connect.cc @@ -0,0 +1,459 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#if !defined(OPENSSL_NO_SOCK) + +#include +#include +#include + +#if !defined(OPENSSL_WINDOWS) +#include +#include +#include +#include +#else +#include +#include +#endif + +#include + +#include +#include + +#include "../internal.h" +#include "../mem_internal.h" +#include "internal.h" + + +using namespace bssl; + +enum { + BIO_CONN_S_BEFORE, + BIO_CONN_S_BLOCKED_CONNECT, + BIO_CONN_S_OK, +}; + +namespace { +struct BIO_CONNECT { + int state = BIO_CONN_S_BEFORE; + + UniquePtr param_hostname; + UniquePtr param_port; + int nbio = 0; + + unsigned short port = 0; + + sockaddr_storage them; + socklen_t them_length = 0; + + // The file descriptor is kept in bio->num in order to match the socket BIO. + + // info_callback is called when the connection is initially made + // callback(BIO,state,ret); The callback should return 'ret', state is for + // compatibility with the SSL info_callback. 
+ int (*info_callback)(BIO *bio, int state, int ret) = nullptr; +}; +} // namespace + +#if !defined(OPENSSL_WINDOWS) +static int closesocket(int sock) { return close(sock); } +#endif + +// split_host_and_port sets |*out_host| and |*out_port| to the host and port +// parsed from |name|. It returns one on success or zero on error. Even when +// successful, |*out_port| may be NULL on return if no port was specified. +static int split_host_and_port(UniquePtr *out_host, + UniquePtr *out_port, const char *name) { + const char *host, *port = nullptr; + size_t host_len = 0; + + *out_host = nullptr; + *out_port = nullptr; + + if (name[0] == '[') { // bracketed IPv6 address + const char *close = strchr(name, ']'); + if (close == nullptr) { + return 0; + } + host = name + 1; + host_len = close - host; + if (close[1] == ':') { // [IP]:port + port = close + 2; + } else if (close[1] != 0) { + return 0; + } + } else { + const char *colon = strchr(name, ':'); + if (colon == nullptr || + strchr(colon + 1, ':') != nullptr) { // IPv6 address + host = name; + host_len = strlen(name); + } else { // host:port + host = name; + host_len = colon - name; + port = colon + 1; + } + } + + out_host->reset(OPENSSL_strndup(host, host_len)); + if (*out_host == nullptr) { + return 0; + } + if (port == nullptr) { + *out_port = nullptr; + return 1; + } + out_port->reset(OPENSSL_strdup(port)); + if (*out_port == nullptr) { + *out_host = nullptr; + return 0; + } + return 1; +} + +static int conn_state(BIO *bio, BIO_CONNECT *c) { + int ret = -1, i; + int (*cb)(BIO *, int, int) = nullptr; + + if (c->info_callback != nullptr) { + cb = c->info_callback; + } + + for (;;) { + switch (c->state) { + case BIO_CONN_S_BEFORE: + // If there's a hostname and a port, assume that both are + // exactly what they say. If there is only a hostname, try + // (just once) to split it into a hostname and port. 
+ + if (c->param_hostname == nullptr) { + OPENSSL_PUT_ERROR(BIO, BIO_R_NO_HOSTNAME_SPECIFIED); + goto exit_loop; + } + + if (c->param_port == nullptr) { + UniquePtr host, port; + if (!split_host_and_port(&host, &port, c->param_hostname.get()) || + port == nullptr) { + OPENSSL_PUT_ERROR(BIO, BIO_R_NO_PORT_SPECIFIED); + ERR_add_error_data(2, "host=", c->param_hostname.get()); + goto exit_loop; + } + + c->param_port = std::move(port); + c->param_hostname = std::move(host); + } + + if (!bio_ip_and_port_to_socket_and_addr( + &FromOpaque(bio)->num, &c->them, &c->them_length, + c->param_hostname.get(), c->param_port.get())) { + OPENSSL_PUT_ERROR(BIO, BIO_R_UNABLE_TO_CREATE_SOCKET); + ERR_add_error_data(4, "host=", c->param_hostname.get(), ":", + c->param_port.get()); + goto exit_loop; + } + + if (c->nbio) { + if (!bio_socket_nbio(FromOpaque(bio)->num, 1)) { + OPENSSL_PUT_ERROR(BIO, BIO_R_ERROR_SETTING_NBIO); + ERR_add_error_data(4, "host=", c->param_hostname.get(), ":", + c->param_port.get()); + goto exit_loop; + } + } + + i = 1; + ret = setsockopt(FromOpaque(bio)->num, SOL_SOCKET, SO_KEEPALIVE, + (char *)&i, sizeof(i)); + if (ret < 0) { + OPENSSL_PUT_SYSTEM_ERROR(); + OPENSSL_PUT_ERROR(BIO, BIO_R_KEEPALIVE); + ERR_add_error_data(4, "host=", c->param_hostname.get(), ":", + c->param_port.get()); + goto exit_loop; + } + + BIO_clear_retry_flags(bio); + ret = connect(FromOpaque(bio)->num, (struct sockaddr *)&c->them, + c->them_length); + if (ret < 0) { + if (bio_socket_should_retry(ret)) { + BIO_set_retry_special(bio); + c->state = BIO_CONN_S_BLOCKED_CONNECT; + BIO_set_retry_reason(bio, BIO_RR_CONNECT); + } else { + OPENSSL_PUT_SYSTEM_ERROR(); + OPENSSL_PUT_ERROR(BIO, BIO_R_CONNECT_ERROR); + ERR_add_error_data(4, "host=", c->param_hostname.get(), ":", + c->param_port.get()); + } + goto exit_loop; + } else { + c->state = BIO_CONN_S_OK; + } + break; + + case BIO_CONN_S_BLOCKED_CONNECT: + i = bio_sock_error(FromOpaque(bio)->num); + if (i) { + if (bio_socket_should_retry(ret)) { 
+ BIO_set_retry_special(bio); + c->state = BIO_CONN_S_BLOCKED_CONNECT; + BIO_set_retry_reason(bio, BIO_RR_CONNECT); + ret = -1; + } else { + BIO_clear_retry_flags(bio); + OPENSSL_PUT_SYSTEM_ERROR(); + OPENSSL_PUT_ERROR(BIO, BIO_R_NBIO_CONNECT_ERROR); + ERR_add_error_data(4, "host=", c->param_hostname.get(), ":", + c->param_port.get()); + ret = 0; + } + goto exit_loop; + } else { + c->state = BIO_CONN_S_OK; + } + break; + + case BIO_CONN_S_OK: + ret = 1; + goto exit_loop; + default: + assert(0); + goto exit_loop; + } + + if (cb != nullptr) { + ret = cb((BIO *)bio, c->state, ret); + if (ret == 0) { + goto end; + } + } + } + +exit_loop: + if (cb != nullptr) { + ret = cb((BIO *)bio, c->state, ret); + } + +end: + return ret; +} + +static int conn_new(BIO *bio) { + BIO_set_init(bio, 0); + FromOpaque(bio)->num = -1; + FromOpaque(bio)->flags = 0; + BIO_set_data(bio, New()); + return BIO_get_data(bio) != nullptr; +} + +static void conn_close_socket(BIO *bio) { + BIO_CONNECT *c = (BIO_CONNECT *)BIO_get_data(bio); + + if (FromOpaque(bio)->num == -1) { + return; + } + + // Only do a shutdown if things were established + if (c->state == BIO_CONN_S_OK) { + shutdown(FromOpaque(bio)->num, 2); + } + closesocket(FromOpaque(bio)->num); + FromOpaque(bio)->num = -1; +} + +static int conn_free(BIO *bio) { + if (BIO_get_shutdown(bio)) { + conn_close_socket(bio); + } + + Delete(static_cast(BIO_get_data(bio))); + return 1; +} + +static int conn_read(BIO *bio, char *out, int out_len) { + int ret = 0; + BIO_CONNECT *data; + + data = (BIO_CONNECT *)BIO_get_data(bio); + if (data->state != BIO_CONN_S_OK) { + ret = conn_state(bio, data); + if (ret <= 0) { + return ret; + } + } + + bio_clear_socket_error(); + ret = (int)recv(FromOpaque(bio)->num, out, out_len, 0); + BIO_clear_retry_flags(bio); + if (ret <= 0) { + if (bio_socket_should_retry(ret)) { + BIO_set_retry_read(bio); + } + } + + return ret; +} + +static int conn_write(BIO *bio, const char *in, int in_len) { + int ret; + BIO_CONNECT *data; 
+ + data = (BIO_CONNECT *)BIO_get_data(bio); + if (data->state != BIO_CONN_S_OK) { + ret = conn_state(bio, data); + if (ret <= 0) { + return ret; + } + } + + bio_clear_socket_error(); + ret = (int)send(FromOpaque(bio)->num, in, in_len, 0); + BIO_clear_retry_flags(bio); + if (ret <= 0) { + if (bio_socket_should_retry(ret)) { + BIO_set_retry_write(bio); + } + } + + return ret; +} + +static long conn_ctrl(BIO *bio, int cmd, long num, void *ptr) { + BIO_CONNECT *data = static_cast(BIO_get_data(bio)); + switch (cmd) { + case BIO_CTRL_RESET: + data->state = BIO_CONN_S_BEFORE; + conn_close_socket(bio); + FromOpaque(bio)->flags = 0; + return 0; + case BIO_C_DO_STATE_MACHINE: + // use this one to start the connection + if (data->state != BIO_CONN_S_OK) { + return conn_state(bio, data); + } else { + return 1; + } + case BIO_C_SET_CONNECT: + if (ptr == nullptr) { + return 0; + } + BIO_set_init(bio, 1); + if (num == 0) { + data->param_hostname.reset( + OPENSSL_strdup(reinterpret_cast(ptr))); + if (data->param_hostname == nullptr) { + return 0; + } + } else if (num == 1) { + data->param_port.reset( + OPENSSL_strdup(reinterpret_cast(ptr))); + if (data->param_port == nullptr) { + return 0; + } + } else { + return 0; + } + return 1; + case BIO_C_SET_NBIO: + data->nbio = static_cast(num); + return 1; + case BIO_C_GET_FD: + if (BIO_get_init(bio)) { + int *out = static_cast(ptr); + if (out != nullptr) { + *out = FromOpaque(bio)->num; + } + return FromOpaque(bio)->num; + } else { + return -1; + } + case BIO_CTRL_GET_CLOSE: + return BIO_get_shutdown(bio); + case BIO_CTRL_SET_CLOSE: + BIO_set_shutdown(bio, static_cast(num)); + return 1; + case BIO_CTRL_FLUSH: + return 1; + case BIO_CTRL_GET_CALLBACK: { + auto out = reinterpret_cast(ptr); + *out = data->info_callback; + return 1; + } + default: + return 0; + } +} + +static long conn_callback_ctrl(BIO *bio, int cmd, BIO_info_cb *fp) { + BIO_CONNECT *data = static_cast(BIO_get_data(bio)); + switch (cmd) { + case BIO_CTRL_SET_CALLBACK: + 
data->info_callback = fp; + return 1; + default: + return 0; + } +} + +BIO *BIO_new_connect(const char *hostname) { + BIO *ret; + + ret = BIO_new(BIO_s_connect()); + if (ret == nullptr) { + return nullptr; + } + if (!BIO_set_conn_hostname(ret, hostname)) { + BIO_free(ret); + return nullptr; + } + return ret; +} + +static const BIO_METHOD methods_connectp = { + BIO_TYPE_CONNECT, "socket connect", conn_write, + conn_read, /*gets=*/nullptr, conn_ctrl, + conn_new, conn_free, conn_callback_ctrl, +}; + +const BIO_METHOD *BIO_s_connect() { return &methods_connectp; } + +int BIO_set_conn_hostname(BIO *bio, const char *name) { + return (int)BIO_ctrl(bio, BIO_C_SET_CONNECT, 0, (void *)name); +} + +int BIO_set_conn_port(BIO *bio, const char *port_str) { + return (int)BIO_ctrl(bio, BIO_C_SET_CONNECT, 1, (void *)port_str); +} + +int BIO_set_conn_int_port(BIO *bio, const int *port) { + char buf[DECIMAL_SIZE(int) + 1]; + snprintf(buf, sizeof(buf), "%d", *port); + return BIO_set_conn_port(bio, buf); +} + +int BIO_set_nbio(BIO *bio, int on) { + return (int)BIO_ctrl(bio, BIO_C_SET_NBIO, on, nullptr); +} + +int BIO_do_connect(BIO *bio) { + return (int)BIO_ctrl(bio, BIO_C_DO_STATE_MACHINE, 0, nullptr); +} + +#endif // OPENSSL_NO_SOCK diff --git a/third_party/boringssl/src/crypto/bio/errno.cc b/third_party/boringssl/src/crypto/bio/errno.cc new file mode 100644 index 00000000..6e31fac6 --- /dev/null +++ b/third_party/boringssl/src/crypto/bio/errno.cc @@ -0,0 +1,52 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include "internal.h" + + +using namespace bssl; + +int bssl::bio_errno_should_retry(int return_value) { + if (return_value != -1) { + return 0; + } + + return +#ifdef EWOULDBLOCK + errno == EWOULDBLOCK || +#endif +#ifdef ENOTCONN + errno == ENOTCONN || +#endif +#ifdef EINTR + errno == EINTR || +#endif +#ifdef EAGAIN + errno == EAGAIN || +#endif +#ifdef EPROTO + errno == EPROTO || +#endif +#ifdef EINPROGRESS + errno == EINPROGRESS || +#endif +#ifdef EALREADY + errno == EALREADY || +#endif + 0; +} diff --git a/third_party/boringssl/src/crypto/bio/fd.c b/third_party/boringssl/src/crypto/bio/fd.c deleted file mode 100644 index 349ee9dd..00000000 --- a/third_party/boringssl/src/crypto/bio/fd.c +++ /dev/null @@ -1,275 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. 
- * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#if !defined(OPENSSL_TRUSTY) - -#include -#include - -#if !defined(OPENSSL_WINDOWS) -#include -#else -#include -OPENSSL_MSVC_PRAGMA(warning(push, 3)) -#include -OPENSSL_MSVC_PRAGMA(warning(pop)) -#endif - -#include -#include - -#include "internal.h" -#include "../internal.h" - - -static int bio_fd_non_fatal_error(int err) { - if ( -#ifdef EWOULDBLOCK - err == EWOULDBLOCK || -#endif -#ifdef WSAEWOULDBLOCK - err == WSAEWOULDBLOCK || -#endif -#ifdef ENOTCONN - err == ENOTCONN || -#endif -#ifdef EINTR - err == EINTR || -#endif -#ifdef EAGAIN - err == EAGAIN || -#endif -#ifdef EPROTO - err == EPROTO || -#endif -#ifdef EINPROGRESS - err == EINPROGRESS || -#endif -#ifdef EALREADY - err == EALREADY || -#endif - 0) { - return 1; - } - return 0; -} - -#if defined(OPENSSL_WINDOWS) - #define BORINGSSL_ERRNO (int)GetLastError() - #define BORINGSSL_CLOSE _close - #define BORINGSSL_LSEEK _lseek - #define BORINGSSL_READ _read - #define BORINGSSL_WRITE _write -#else - #define BORINGSSL_ERRNO errno - #define BORINGSSL_CLOSE close - #define BORINGSSL_LSEEK lseek - #define BORINGSSL_READ read - #define BORINGSSL_WRITE write -#endif - -int bio_fd_should_retry(int i) { - if (i == -1) { - return 
bio_fd_non_fatal_error(BORINGSSL_ERRNO); - } - return 0; -} - -BIO *BIO_new_fd(int fd, int close_flag) { - BIO *ret = BIO_new(BIO_s_fd()); - if (ret == NULL) { - return NULL; - } - BIO_set_fd(ret, fd, close_flag); - return ret; -} - -static int fd_new(BIO *bio) { - // num is used to store the file descriptor. - bio->num = -1; - return 1; -} - -static int fd_free(BIO *bio) { - if (bio->shutdown) { - if (bio->init) { - BORINGSSL_CLOSE(bio->num); - } - bio->init = 0; - } - return 1; -} - -static int fd_read(BIO *b, char *out, int outl) { - int ret = 0; - - ret = BORINGSSL_READ(b->num, out, outl); - BIO_clear_retry_flags(b); - if (ret <= 0) { - if (bio_fd_should_retry(ret)) { - BIO_set_retry_read(b); - } - } - - return ret; -} - -static int fd_write(BIO *b, const char *in, int inl) { - int ret = BORINGSSL_WRITE(b->num, in, inl); - BIO_clear_retry_flags(b); - if (ret <= 0) { - if (bio_fd_should_retry(ret)) { - BIO_set_retry_write(b); - } - } - - return ret; -} - -static long fd_ctrl(BIO *b, int cmd, long num, void *ptr) { - long ret = 1; - int *ip; - - switch (cmd) { - case BIO_CTRL_RESET: - num = 0; - OPENSSL_FALLTHROUGH; - case BIO_C_FILE_SEEK: - ret = 0; - if (b->init) { - ret = (long)BORINGSSL_LSEEK(b->num, num, SEEK_SET); - } - break; - case BIO_C_FILE_TELL: - case BIO_CTRL_INFO: - ret = 0; - if (b->init) { - ret = (long)BORINGSSL_LSEEK(b->num, 0, SEEK_CUR); - } - break; - case BIO_C_SET_FD: - fd_free(b); - b->num = *((int *)ptr); - b->shutdown = (int)num; - b->init = 1; - break; - case BIO_C_GET_FD: - if (b->init) { - ip = (int *)ptr; - if (ip != NULL) { - *ip = b->num; - } - return b->num; - } else { - ret = -1; - } - break; - case BIO_CTRL_GET_CLOSE: - ret = b->shutdown; - break; - case BIO_CTRL_SET_CLOSE: - b->shutdown = (int)num; - break; - case BIO_CTRL_PENDING: - case BIO_CTRL_WPENDING: - ret = 0; - break; - case BIO_CTRL_FLUSH: - ret = 1; - break; - default: - ret = 0; - break; - } - - return ret; -} - -static int fd_gets(BIO *bp, char *buf, int size) { - 
char *ptr = buf; - char *end = buf + size - 1; - - if (size <= 0) { - return 0; - } - - while (ptr < end && fd_read(bp, ptr, 1) > 0 && ptr[0] != '\n') { - ptr++; - } - - ptr[0] = '\0'; - - return ptr - buf; -} - -static const BIO_METHOD methods_fdp = { - BIO_TYPE_FD, "file descriptor", fd_write, fd_read, NULL /* puts */, - fd_gets, fd_ctrl, fd_new, fd_free, NULL /* callback_ctrl */, -}; - -const BIO_METHOD *BIO_s_fd(void) { return &methods_fdp; } - -int BIO_set_fd(BIO *bio, int fd, int close_flag) { - return BIO_int_ctrl(bio, BIO_C_SET_FD, close_flag, fd); -} - -int BIO_get_fd(BIO *bio, int *out_fd) { - return BIO_ctrl(bio, BIO_C_GET_FD, 0, (char *) out_fd); -} - -#endif // OPENSSL_TRUSTY diff --git a/third_party/boringssl/src/crypto/bio/fd.cc b/third_party/boringssl/src/crypto/bio/fd.cc new file mode 100644 index 00000000..b49acb65 --- /dev/null +++ b/third_party/boringssl/src/crypto/bio/fd.cc @@ -0,0 +1,181 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#if !defined(OPENSSL_NO_POSIX_IO) + +#include +#include + +#if !defined(OPENSSL_WINDOWS) +#include +#else +#include +#endif + +#include +#include + +#include "internal.h" +#include "../internal.h" + + +#if defined(OPENSSL_WINDOWS) + #define BORINGSSL_CLOSE _close + #define BORINGSSL_LSEEK _lseek + #define BORINGSSL_READ _read + #define BORINGSSL_WRITE _write +#else + #define BORINGSSL_CLOSE close + #define BORINGSSL_LSEEK lseek + #define BORINGSSL_READ read + #define BORINGSSL_WRITE write +#endif + +using namespace bssl; + +BIO *BIO_new_fd(int fd, int close_flag) { + BIO *ret = BIO_new(BIO_s_fd()); + if (ret == nullptr) { + return nullptr; + } + BIO_set_fd(ret, fd, close_flag); + return ret; +} + +static int fd_new(BIO *bio) { + // num is used to store the file descriptor. + FromOpaque(bio)->num = -1; + return 1; +} + +static int fd_free(BIO *bio) { + if (BIO_get_shutdown(bio)) { + if (BIO_get_init(bio)) { + BORINGSSL_CLOSE(FromOpaque(bio)->num); + } + BIO_set_init(bio, 0); + } + return 1; +} + +static int fd_read(BIO *b, char *out, int outl) { + int ret = 0; + + ret = (int)BORINGSSL_READ(FromOpaque(b)->num, out, outl); + BIO_clear_retry_flags(b); + if (ret <= 0) { + if (bio_errno_should_retry(ret)) { + BIO_set_retry_read(b); + } + } + + return ret; +} + +static int fd_write(BIO *b, const char *in, int inl) { + int ret = (int)BORINGSSL_WRITE(FromOpaque(b)->num, in, inl); + BIO_clear_retry_flags(b); + if (ret <= 0) { + if (bio_errno_should_retry(ret)) { + BIO_set_retry_write(b); + } + } + + return ret; +} + +static long fd_ctrl(BIO *b, int cmd, long num, void *ptr) { + switch (cmd) { + case BIO_CTRL_RESET: + num = 0; + [[fallthrough]]; + case BIO_C_FILE_SEEK: + if (BIO_get_init(b)) { + return (long)BORINGSSL_LSEEK(FromOpaque(b)->num, num, SEEK_SET); + } + return 0; + case BIO_C_FILE_TELL: + case BIO_CTRL_INFO: + if (BIO_get_init(b)) { + return (long)BORINGSSL_LSEEK(FromOpaque(b)->num, 0, SEEK_CUR); + } + return 0; + case BIO_C_SET_FD: + fd_free(b); + 
FromOpaque(b)->num = *static_cast(ptr); + BIO_set_shutdown(b, static_cast(num)); + BIO_set_init(b, 1); + return 1; + case BIO_C_GET_FD: + if (BIO_get_init(b)) { + int *out = static_cast(ptr); + if (out != nullptr) { + *out = FromOpaque(b)->num; + } + return FromOpaque(b)->num; + } else { + return -1; + } + case BIO_CTRL_GET_CLOSE: + return BIO_get_shutdown(b); + case BIO_CTRL_SET_CLOSE: + BIO_set_shutdown(b, static_cast(num)); + return 1; + case BIO_CTRL_FLUSH: + return 1; + default: + return 0; + } +} + +static int fd_gets(BIO *bp, char *buf, int size) { + if (size <= 0) { + return 0; + } + + char *ptr = buf; + char *end = buf + size - 1; + while (ptr < end && fd_read(bp, ptr, 1) > 0) { + char c = ptr[0]; + ptr++; + if (c == '\n') { + break; + } + } + + ptr[0] = '\0'; + + // The output length is bounded by |size|. + return (int)(ptr - buf); +} + +static const BIO_METHOD methods_fdp = { + BIO_TYPE_FD, "file descriptor", fd_write, + fd_read, fd_gets, fd_ctrl, + fd_new, fd_free, /*callback_ctrl=*/nullptr, +}; + +const BIO_METHOD *BIO_s_fd() { return &methods_fdp; } + +#endif // OPENSSL_NO_POSIX_IO + +int BIO_set_fd(BIO *bio, int fd, int close_flag) { + return (int)BIO_int_ctrl(bio, BIO_C_SET_FD, close_flag, fd); +} + +int BIO_get_fd(BIO *bio, int *out_fd) { + return (int)BIO_ctrl(bio, BIO_C_GET_FD, 0, (char *) out_fd); +} diff --git a/third_party/boringssl/src/crypto/bio/file.c b/third_party/boringssl/src/crypto/bio/file.c deleted file mode 100644 index 66278e57..00000000 --- a/third_party/boringssl/src/crypto/bio/file.c +++ /dev/null @@ -1,317 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. 
The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. 
If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#if defined(__linux) || defined(__sun) || defined(__hpux) -// Following definition aliases fopen to fopen64 on above mentioned -// platforms. This makes it possible to open and sequentially access -// files larger than 2GB from 32-bit application. It does not allow to -// traverse them beyond 2GB with fseek/ftell, but on the other hand *no* -// 32-bit platform permits that, not with fseek/ftell. Not to mention -// that breaking 2GB limit for seeking would require surgery to *our* -// API. But sequential access suffices for practical cases when you -// can run into large files, such as fingerprinting, so we can let API -// alone. 
For reference, the list of 32-bit platforms which allow for -// sequential access of large files without extra "magic" comprise *BSD, -// Darwin, IRIX... -#ifndef _FILE_OFFSET_BITS -#define _FILE_OFFSET_BITS 64 -#endif -#endif - -#include - -#if !defined(OPENSSL_TRUSTY) - -#include -#include -#include - -#include -#include - -#include "../internal.h" - - -#define BIO_FP_READ 0x02 -#define BIO_FP_WRITE 0x04 -#define BIO_FP_APPEND 0x08 - -BIO *BIO_new_file(const char *filename, const char *mode) { - BIO *ret; - FILE *file; - - file = fopen(filename, mode); - if (file == NULL) { - OPENSSL_PUT_SYSTEM_ERROR(); - - ERR_add_error_data(5, "fopen('", filename, "','", mode, "')"); - if (errno == ENOENT) { - OPENSSL_PUT_ERROR(BIO, BIO_R_NO_SUCH_FILE); - } else { - OPENSSL_PUT_ERROR(BIO, BIO_R_SYS_LIB); - } - return NULL; - } - - ret = BIO_new_fp(file, BIO_CLOSE); - if (ret == NULL) { - fclose(file); - return NULL; - } - - return ret; -} - -BIO *BIO_new_fp(FILE *stream, int close_flag) { - BIO *ret = BIO_new(BIO_s_file()); - - if (ret == NULL) { - return NULL; - } - - BIO_set_fp(ret, stream, close_flag); - return ret; -} - -static int file_free(BIO *bio) { - if (!bio->shutdown) { - return 1; - } - - if (bio->init && bio->ptr != NULL) { - fclose(bio->ptr); - bio->ptr = NULL; - } - bio->init = 0; - - return 1; -} - -static int file_read(BIO *b, char *out, int outl) { - if (!b->init) { - return 0; - } - - size_t ret = fread(out, 1, outl, (FILE *)b->ptr); - if (ret == 0 && ferror((FILE *)b->ptr)) { - OPENSSL_PUT_SYSTEM_ERROR(); - OPENSSL_PUT_ERROR(BIO, ERR_R_SYS_LIB); - return -1; - } - - // fread reads at most |outl| bytes, so |ret| fits in an int. 
- return (int)ret; -} - -static int file_write(BIO *b, const char *in, int inl) { - int ret = 0; - - if (!b->init) { - return 0; - } - - ret = fwrite(in, inl, 1, (FILE *)b->ptr); - if (ret > 0) { - ret = inl; - } - return ret; -} - -static long file_ctrl(BIO *b, int cmd, long num, void *ptr) { - long ret = 1; - FILE *fp = (FILE *)b->ptr; - FILE **fpp; - char p[4]; - - switch (cmd) { - case BIO_CTRL_RESET: - num = 0; - OPENSSL_FALLTHROUGH; - case BIO_C_FILE_SEEK: - ret = (long)fseek(fp, num, 0); - break; - case BIO_CTRL_EOF: - ret = (long)feof(fp); - break; - case BIO_C_FILE_TELL: - case BIO_CTRL_INFO: - ret = ftell(fp); - break; - case BIO_C_SET_FILE_PTR: - file_free(b); - b->shutdown = (int)num & BIO_CLOSE; - b->ptr = ptr; - b->init = 1; - break; - case BIO_C_SET_FILENAME: - file_free(b); - b->shutdown = (int)num & BIO_CLOSE; - if (num & BIO_FP_APPEND) { - if (num & BIO_FP_READ) { - OPENSSL_strlcpy(p, "a+", sizeof(p)); - } else { - OPENSSL_strlcpy(p, "a", sizeof(p)); - } - } else if ((num & BIO_FP_READ) && (num & BIO_FP_WRITE)) { - OPENSSL_strlcpy(p, "r+", sizeof(p)); - } else if (num & BIO_FP_WRITE) { - OPENSSL_strlcpy(p, "w", sizeof(p)); - } else if (num & BIO_FP_READ) { - OPENSSL_strlcpy(p, "r", sizeof(p)); - } else { - OPENSSL_PUT_ERROR(BIO, BIO_R_BAD_FOPEN_MODE); - ret = 0; - break; - } - fp = fopen(ptr, p); - if (fp == NULL) { - OPENSSL_PUT_SYSTEM_ERROR(); - ERR_add_error_data(5, "fopen('", ptr, "','", p, "')"); - OPENSSL_PUT_ERROR(BIO, ERR_R_SYS_LIB); - ret = 0; - break; - } - b->ptr = fp; - b->init = 1; - break; - case BIO_C_GET_FILE_PTR: - // the ptr parameter is actually a FILE ** in this case. 
- if (ptr != NULL) { - fpp = (FILE **)ptr; - *fpp = (FILE *)b->ptr; - } - break; - case BIO_CTRL_GET_CLOSE: - ret = (long)b->shutdown; - break; - case BIO_CTRL_SET_CLOSE: - b->shutdown = (int)num; - break; - case BIO_CTRL_FLUSH: - ret = 0 == fflush((FILE *)b->ptr); - break; - case BIO_CTRL_WPENDING: - case BIO_CTRL_PENDING: - default: - ret = 0; - break; - } - return ret; -} - -static int file_gets(BIO *bp, char *buf, int size) { - int ret = 0; - - if (size == 0) { - return 0; - } - - if (!fgets(buf, size, (FILE *)bp->ptr)) { - buf[0] = 0; - goto err; - } - ret = strlen(buf); - -err: - return ret; -} - -static const BIO_METHOD methods_filep = { - BIO_TYPE_FILE, "FILE pointer", - file_write, file_read, - NULL /* puts */, file_gets, - file_ctrl, NULL /* create */, - file_free, NULL /* callback_ctrl */, -}; - -const BIO_METHOD *BIO_s_file(void) { return &methods_filep; } - - -int BIO_get_fp(BIO *bio, FILE **out_file) { - return BIO_ctrl(bio, BIO_C_GET_FILE_PTR, 0, (char*) out_file); -} - -int BIO_set_fp(BIO *bio, FILE *file, int close_flag) { - return BIO_ctrl(bio, BIO_C_SET_FILE_PTR, close_flag, (char *) file); -} - -int BIO_read_filename(BIO *bio, const char *filename) { - return BIO_ctrl(bio, BIO_C_SET_FILENAME, BIO_CLOSE | BIO_FP_READ, - (char *)filename); -} - -int BIO_write_filename(BIO *bio, const char *filename) { - return BIO_ctrl(bio, BIO_C_SET_FILENAME, BIO_CLOSE | BIO_FP_WRITE, - (char *)filename); -} - -int BIO_append_filename(BIO *bio, const char *filename) { - return BIO_ctrl(bio, BIO_C_SET_FILENAME, BIO_CLOSE | BIO_FP_APPEND, - (char *)filename); -} - -int BIO_rw_filename(BIO *bio, const char *filename) { - return BIO_ctrl(bio, BIO_C_SET_FILENAME, - BIO_CLOSE | BIO_FP_READ | BIO_FP_WRITE, (char *)filename); -} - -long BIO_tell(BIO *bio) { return BIO_ctrl(bio, BIO_C_FILE_TELL, 0, NULL); } - -long BIO_seek(BIO *bio, long offset) { - return BIO_ctrl(bio, BIO_C_FILE_SEEK, offset, NULL); -} - -#endif // OPENSSL_TRUSTY diff --git 
a/third_party/boringssl/src/crypto/bio/file.cc b/third_party/boringssl/src/crypto/bio/file.cc new file mode 100644 index 00000000..ca80b69a --- /dev/null +++ b/third_party/boringssl/src/crypto/bio/file.cc @@ -0,0 +1,278 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#if defined(__linux) || defined(__sun) || defined(__hpux) +// Following definition aliases fopen to fopen64 on above mentioned +// platforms. This makes it possible to open and sequentially access +// files larger than 2GB from 32-bit application. It does not allow to +// traverse them beyond 2GB with fseek/ftell, but on the other hand *no* +// 32-bit platform permits that, not with fseek/ftell. Not to mention +// that breaking 2GB limit for seeking would require surgery to *our* +// API. But sequential access suffices for practical cases when you +// can run into large files, such as fingerprinting, so we can let API +// alone. For reference, the list of 32-bit platforms which allow for +// sequential access of large files without extra "magic" comprise *BSD, +// Darwin, IRIX... 
+#ifndef _FILE_OFFSET_BITS +#define _FILE_OFFSET_BITS 64 +#endif +#endif + +#include + +#include +#include +#include +#include + +#include +#include + +#include "../internal.h" +#include "internal.h" + +#if defined(OPENSSL_WINDOWS) +#include +#include +#endif + +#define BIO_FP_READ 0x02 +#define BIO_FP_WRITE 0x04 +#define BIO_FP_APPEND 0x08 + +#if !defined(OPENSSL_NO_FILESYSTEM) +#define fopen_if_available fopen +#else +static FILE *fopen_if_available(const char *path, const char *mode) { + errno = ENOENT; + return nullptr; +} +#endif + +BIO *BIO_new_file(const char *filename, const char *mode) { + BIO *ret; + FILE *file; + + file = fopen_if_available(filename, mode); + if (file == nullptr) { + OPENSSL_PUT_SYSTEM_ERROR(); + + ERR_add_error_data(5, "fopen('", filename, "','", mode, "')"); + if (errno == ENOENT) { + OPENSSL_PUT_ERROR(BIO, BIO_R_NO_SUCH_FILE); + } else { + OPENSSL_PUT_ERROR(BIO, BIO_R_SYS_LIB); + } + return nullptr; + } + + ret = BIO_new_fp(file, BIO_CLOSE); + if (ret == nullptr) { + fclose(file); + return nullptr; + } + + return ret; +} + +BIO *BIO_new_fp(FILE *stream, int flags) { + BIO *ret = BIO_new(BIO_s_file()); + if (ret == nullptr) { + return nullptr; + } + + BIO_set_fp(ret, stream, flags); + return ret; +} + +static int file_free(BIO *bio) { + if (!BIO_get_shutdown(bio)) { + return 1; + } + + if (BIO_get_init(bio) && BIO_get_data(bio) != nullptr) { + fclose(reinterpret_cast(BIO_get_data(bio))); + BIO_set_data(bio, nullptr); + } + BIO_set_init(bio, 0); + + return 1; +} + +static int file_read(BIO *b, char *out, int outl) { + if (!BIO_get_init(b)) { + return 0; + } + + size_t ret = fread(out, 1, outl, (FILE *)BIO_get_data(b)); + if (ret == 0 && ferror((FILE *)BIO_get_data(b))) { + OPENSSL_PUT_SYSTEM_ERROR(); + OPENSSL_PUT_ERROR(BIO, ERR_R_SYS_LIB); + return -1; + } + + // fread reads at most |outl| bytes, so |ret| fits in an int. 
+ return (int)ret; +} + +static int file_write(BIO *b, const char *in, int inl) { + if (!BIO_get_init(b)) { + return 0; + } + + int ret = (int)fwrite(in, inl, 1, (FILE *)BIO_get_data(b)); + if (ret > 0) { + ret = inl; + } + return ret; +} + +static long file_ctrl(BIO *b, int cmd, long num, void *ptr) { + FILE *fp = static_cast(BIO_get_data(b)); + switch (cmd) { + case BIO_CTRL_RESET: + num = 0; + [[fallthrough]]; + case BIO_C_FILE_SEEK: + return fseek(fp, num, 0); + case BIO_CTRL_EOF: + // feof may return any non-zero value for EOF, but we must return 1. + return feof(fp) != 0; + case BIO_C_FILE_TELL: + case BIO_CTRL_INFO: + return ftell(fp); + case BIO_C_SET_FILE_PTR: + file_free(b); + static_assert((BIO_CLOSE & BIO_FP_TEXT) == 0, + "BIO_CLOSE and BIO_FP_TEXT must not collide"); +#if defined(OPENSSL_WINDOWS) + // If |BIO_FP_TEXT| is not set, OpenSSL will switch the file to binary + // mode. BoringSSL intentionally diverges here because it means code + // tested under POSIX will inadvertently change the state of |FILE| + // objects when wrapping them in a |BIO|. 
+ if (num & BIO_FP_TEXT) { + _setmode(_fileno(reinterpret_cast(ptr)), _O_TEXT); + } +#endif + BIO_set_shutdown(b, static_cast(num) & BIO_CLOSE); + BIO_set_data(b, ptr); + BIO_set_init(b, 1); + return 1; + case BIO_C_SET_FILENAME: + file_free(b); + BIO_set_shutdown(b, static_cast(num) & BIO_CLOSE); + const char *mode; + if (num & BIO_FP_APPEND) { + if (num & BIO_FP_READ) { + mode = "ab+"; + } else { + mode = "ab"; + } + } else if ((num & BIO_FP_READ) && (num & BIO_FP_WRITE)) { + mode = "rb+"; + } else if (num & BIO_FP_WRITE) { + mode = "wb"; + } else if (num & BIO_FP_READ) { + mode = "rb"; + } else { + OPENSSL_PUT_ERROR(BIO, BIO_R_BAD_FOPEN_MODE); + return 0; + } + fp = fopen_if_available(reinterpret_cast(ptr), mode); + if (fp == nullptr) { + OPENSSL_PUT_SYSTEM_ERROR(); + ERR_add_error_data(5, "fopen('", ptr, "','", mode, "')"); + OPENSSL_PUT_ERROR(BIO, ERR_R_SYS_LIB); + return 0; + } + BIO_set_data(b, fp); + BIO_set_init(b, 1); + return 1; + case BIO_C_GET_FILE_PTR: + // the ptr parameter is actually a FILE ** in this case. + if (ptr != nullptr) { + FILE **out = static_cast(ptr); + *out = fp; + } + return 1; + case BIO_CTRL_GET_CLOSE: + return BIO_get_shutdown(b); + case BIO_CTRL_SET_CLOSE: + BIO_set_shutdown(b, static_cast(num)); + return 1; + case BIO_CTRL_FLUSH: + return fflush(fp) == 0; + default: + return 0; + } +} + +static int file_gets(BIO *bp, char *buf, int size) { + if (size == 0) { + return 0; + } + + if (!fgets(buf, size, (FILE *)BIO_get_data(bp))) { + buf[0] = 0; + // TODO(davidben): This doesn't distinguish error and EOF. This should check + // |ferror| as in |file_read|. 
+ return 0; + } + + return (int)strlen(buf); +} + +static const BIO_METHOD methods_filep = { + BIO_TYPE_FILE, "FILE pointer", file_write, + file_read, file_gets, file_ctrl, + /*create=*/nullptr, file_free, /*callback_ctrl=*/nullptr, +}; + +const BIO_METHOD *BIO_s_file() { return &methods_filep; } + + +int BIO_get_fp(BIO *bio, FILE **out_file) { + return (int)BIO_ctrl(bio, BIO_C_GET_FILE_PTR, 0, (char *)out_file); +} + +int BIO_set_fp(BIO *bio, FILE *file, int flags) { + return (int)BIO_ctrl(bio, BIO_C_SET_FILE_PTR, flags, (char *)file); +} + +int BIO_read_filename(BIO *bio, const char *filename) { + return (int)BIO_ctrl(bio, BIO_C_SET_FILENAME, BIO_CLOSE | BIO_FP_READ, + (char *)filename); +} + +int BIO_write_filename(BIO *bio, const char *filename) { + return (int)BIO_ctrl(bio, BIO_C_SET_FILENAME, BIO_CLOSE | BIO_FP_WRITE, + (char *)filename); +} + +int BIO_append_filename(BIO *bio, const char *filename) { + return (int)BIO_ctrl(bio, BIO_C_SET_FILENAME, BIO_CLOSE | BIO_FP_APPEND, + (char *)filename); +} + +int BIO_rw_filename(BIO *bio, const char *filename) { + return (int)BIO_ctrl(bio, BIO_C_SET_FILENAME, + BIO_CLOSE | BIO_FP_READ | BIO_FP_WRITE, + (char *)filename); +} + +long BIO_tell(BIO *bio) { return BIO_ctrl(bio, BIO_C_FILE_TELL, 0, nullptr); } + +long BIO_seek(BIO *bio, long offset) { + return BIO_ctrl(bio, BIO_C_FILE_SEEK, offset, nullptr); +} diff --git a/third_party/boringssl/src/crypto/bio/hexdump.c b/third_party/boringssl/src/crypto/bio/hexdump.c deleted file mode 100644 index 6d928bc0..00000000 --- a/third_party/boringssl/src/crypto/bio/hexdump.c +++ /dev/null @@ -1,192 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. 
The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. 
If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include -#include - -#include "../internal.h" - - -// hexdump_ctx contains the state of a hexdump. -struct hexdump_ctx { - BIO *bio; - char right_chars[18]; // the contents of the right-hand side, ASCII dump. - unsigned used; // number of bytes in the current line. - size_t n; // number of bytes total. - unsigned indent; -}; - -static void hexbyte(char *out, uint8_t b) { - static const char hextable[] = "0123456789abcdef"; - out[0] = hextable[b>>4]; - out[1] = hextable[b&0x0f]; -} - -static char to_char(uint8_t b) { - if (b < 32 || b > 126) { - return '.'; - } - return b; -} - -// hexdump_write adds |len| bytes of |data| to the current hex dump described by -// |ctx|. 
-static int hexdump_write(struct hexdump_ctx *ctx, const uint8_t *data, - size_t len) { - char buf[10]; - unsigned l; - - // Output lines look like: - // 00000010 2e 2f 30 31 32 33 34 35 36 37 38 ... 3c 3d // |./0123456789:;<=| - // ^ offset ^ extra space ^ ASCII of line - - for (size_t i = 0; i < len; i++) { - if (ctx->used == 0) { - // The beginning of a line. - BIO_indent(ctx->bio, ctx->indent, UINT_MAX); - - hexbyte(&buf[0], ctx->n >> 24); - hexbyte(&buf[2], ctx->n >> 16); - hexbyte(&buf[4], ctx->n >> 8); - hexbyte(&buf[6], ctx->n); - buf[8] = buf[9] = ' '; - if (BIO_write(ctx->bio, buf, 10) < 0) { - return 0; - } - } - - hexbyte(buf, data[i]); - buf[2] = ' '; - l = 3; - if (ctx->used == 7) { - // There's an additional space after the 8th byte. - buf[3] = ' '; - l = 4; - } else if (ctx->used == 15) { - // At the end of the line there's an extra space and the bar for the - // right column. - buf[3] = ' '; - buf[4] = '|'; - l = 5; - } - - if (BIO_write(ctx->bio, buf, l) < 0) { - return 0; - } - ctx->right_chars[ctx->used] = to_char(data[i]); - ctx->used++; - ctx->n++; - if (ctx->used == 16) { - ctx->right_chars[16] = '|'; - ctx->right_chars[17] = '\n'; - if (BIO_write(ctx->bio, ctx->right_chars, sizeof(ctx->right_chars)) < 0) { - return 0; - } - ctx->used = 0; - } - } - - return 1; -} - -// finish flushes any buffered data in |ctx|. -static int finish(struct hexdump_ctx *ctx) { - // See the comments in |hexdump| for the details of this format. 
- const unsigned n_bytes = ctx->used; - unsigned l; - char buf[5]; - - if (n_bytes == 0) { - return 1; - } - - OPENSSL_memset(buf, ' ', 4); - buf[4] = '|'; - - for (; ctx->used < 16; ctx->used++) { - l = 3; - if (ctx->used == 7) { - l = 4; - } else if (ctx->used == 15) { - l = 5; - } - if (BIO_write(ctx->bio, buf, l) < 0) { - return 0; - } - } - - ctx->right_chars[n_bytes] = '|'; - ctx->right_chars[n_bytes + 1] = '\n'; - if (BIO_write(ctx->bio, ctx->right_chars, n_bytes + 2) < 0) { - return 0; - } - return 1; -} - -int BIO_hexdump(BIO *bio, const uint8_t *data, size_t len, unsigned indent) { - struct hexdump_ctx ctx; - OPENSSL_memset(&ctx, 0, sizeof(ctx)); - ctx.bio = bio; - ctx.indent = indent; - - if (!hexdump_write(&ctx, data, len) || !finish(&ctx)) { - return 0; - } - - return 1; -} diff --git a/third_party/boringssl/src/crypto/bio/hexdump.cc b/third_party/boringssl/src/crypto/bio/hexdump.cc new file mode 100644 index 00000000..a3f611a3 --- /dev/null +++ b/third_party/boringssl/src/crypto/bio/hexdump.cc @@ -0,0 +1,154 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + +#include "../internal.h" + + +using namespace bssl; + +namespace { +// hexdump_ctx contains the state of a hexdump. +struct hexdump_ctx { + BIO *bio; + char right_chars[18]; // the contents of the right-hand side, ASCII dump. 
+ unsigned used; // number of bytes in the current line. + size_t n; // number of bytes total. + unsigned indent; +}; +} // namespace + +static void hexbyte(char *out, uint8_t b) { + static const char hextable[] = "0123456789abcdef"; + out[0] = hextable[b >> 4]; + out[1] = hextable[b & 0x0f]; +} + +static char to_char(uint8_t b) { + if (b < 32 || b > 126) { + return '.'; + } + return b; +} + +// hexdump_write adds |len| bytes of |data| to the current hex dump described by +// |ctx|. +static int hexdump_write(struct hexdump_ctx *ctx, const uint8_t *data, + size_t len) { + char buf[10]; + unsigned l; + + // Output lines look like: + // 00000010 2e 2f 30 31 32 33 34 35 36 37 38 ... 3c 3d // |./0123456789:;<=| + // ^ offset ^ extra space ^ ASCII of line + + for (size_t i = 0; i < len; i++) { + if (ctx->used == 0) { + // The beginning of a line. + BIO_indent(ctx->bio, ctx->indent, UINT_MAX); + + hexbyte(&buf[0], ctx->n >> 24); + hexbyte(&buf[2], ctx->n >> 16); + hexbyte(&buf[4], ctx->n >> 8); + hexbyte(&buf[6], ctx->n); + buf[8] = buf[9] = ' '; + if (BIO_write(ctx->bio, buf, 10) < 0) { + return 0; + } + } + + hexbyte(buf, data[i]); + buf[2] = ' '; + l = 3; + if (ctx->used == 7) { + // There's an additional space after the 8th byte. + buf[3] = ' '; + l = 4; + } else if (ctx->used == 15) { + // At the end of the line there's an extra space and the bar for the + // right column. + buf[3] = ' '; + buf[4] = '|'; + l = 5; + } + + if (BIO_write(ctx->bio, buf, l) < 0) { + return 0; + } + ctx->right_chars[ctx->used] = to_char(data[i]); + ctx->used++; + ctx->n++; + if (ctx->used == 16) { + ctx->right_chars[16] = '|'; + ctx->right_chars[17] = '\n'; + if (BIO_write(ctx->bio, ctx->right_chars, sizeof(ctx->right_chars)) < 0) { + return 0; + } + ctx->used = 0; + } + } + + return 1; +} + +// finish flushes any buffered data in |ctx|. +static int finish(struct hexdump_ctx *ctx) { + // See the comments in |hexdump| for the details of this format. 
+ const unsigned n_bytes = ctx->used; + unsigned l; + char buf[5]; + + if (n_bytes == 0) { + return 1; + } + + OPENSSL_memset(buf, ' ', 4); + buf[4] = '|'; + + for (; ctx->used < 16; ctx->used++) { + l = 3; + if (ctx->used == 7) { + l = 4; + } else if (ctx->used == 15) { + l = 5; + } + if (BIO_write(ctx->bio, buf, l) < 0) { + return 0; + } + } + + ctx->right_chars[n_bytes] = '|'; + ctx->right_chars[n_bytes + 1] = '\n'; + if (BIO_write(ctx->bio, ctx->right_chars, n_bytes + 2) < 0) { + return 0; + } + return 1; +} + +int BIO_hexdump(BIO *bio, const uint8_t *data, size_t len, unsigned indent) { + struct hexdump_ctx ctx; + OPENSSL_memset(&ctx, 0, sizeof(ctx)); + ctx.bio = bio; + ctx.indent = indent; + + if (!hexdump_write(&ctx, data, len) || !finish(&ctx)) { + return 0; + } + + return 1; +} diff --git a/third_party/boringssl/src/crypto/bio/internal.h b/third_party/boringssl/src/crypto/bio/internal.h index 8ed27dae..77cd8a88 100644 --- a/third_party/boringssl/src/crypto/bio/internal.h +++ b/third_party/boringssl/src/crypto/bio/internal.h @@ -1,64 +1,28 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. 
- * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#ifndef OPENSSL_HEADER_BIO_INTERNAL_H -#define OPENSSL_HEADER_BIO_INTERNAL_H - -#include +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_CRYPTO_BIO_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_BIO_INTERNAL_H + +#include +#include + +#include "../internal.h" +#include "../mem_internal.h" + +#if !defined(OPENSSL_NO_SOCK) #if !defined(OPENSSL_WINDOWS) #if defined(OPENSSL_PNACL) // newlib uses u_short in socket.h without defining it. 
@@ -67,18 +31,64 @@ typedef unsigned short u_short; #include #include #else -OPENSSL_MSVC_PRAGMA(warning(push, 3)) #include -OPENSSL_MSVC_PRAGMA(warning(pop)) typedef int socklen_t; #endif +#endif // !OPENSSL_NO_SOCK -#if defined(__cplusplus) -extern "C" { -#endif +DECLARE_OPAQUE_STRUCT(bio_st, Bio) + +struct bio_method_st { + int type; + const char *name; + int (*bwrite)(BIO *, const char *, int); + int (*bread)(BIO *, char *, int); + int (*bgets)(BIO *, char *, int); + long (*ctrl)(BIO *, int, long, void *); + int (*create)(BIO *); + int (*destroy)(BIO *); + long (*callback_ctrl)(BIO *, int, BIO_info_cb *); +}; + +BSSL_NAMESPACE_BEGIN -// BIO_ip_and_port_to_socket_and_addr creates a socket and fills in |*out_addr| +class Bio : public bio_st, public RefCounted { + public: + explicit Bio(const BIO_METHOD *m); + + const BIO_METHOD *method; + CRYPTO_EX_DATA ex_data; + + // TODO(crbug.com/412269080): |init| and |shutdown| could be bitfields, or + // integrated into |flags|, to save memory. + + // init is non-zero if this |BIO| has been initialised. + int init = 0; + // shutdown is often used by specific |BIO_METHOD|s to determine whether + // they own some underlying resource. This flag can often be controlled by + // |BIO_set_close|. For example, whether an fd BIO closes the underlying fd + // when it, itself, is closed. + int shutdown = 1; + int flags = 0; + int retry_reason = 0; + // num is a BIO-specific value. For example, in fd BIOs it's used to store a + // file descriptor. + int num = 0; + void *ptr = nullptr; + // next_bio points to the next |BIO| in a chain. This |BIO| owns a reference + // to |next_bio|. + Bio *next_bio = nullptr; // used by filter BIOs + uint64_t num_read = 0, num_write = 0; + + private: + friend RefCounted; + ~Bio(); +}; + +#if !defined(OPENSSL_NO_SOCK) + +// bio_ip_and_port_to_socket_and_addr creates a socket and fills in |*out_addr| // and |*out_addr_length| with the correct values for connecting to |hostname| // on |port_str|. 
It returns one on success or zero on error. int bio_ip_and_port_to_socket_and_addr(int *out_sock, @@ -87,25 +97,29 @@ int bio_ip_and_port_to_socket_and_addr(int *out_sock, const char *hostname, const char *port_str); -// BIO_socket_nbio sets whether |sock| is non-blocking. It returns one on +// bio_socket_nbio sets whether |sock| is non-blocking. It returns one on // success and zero otherwise. int bio_socket_nbio(int sock, int on); -// BIO_clear_socket_error clears the last system socket error. +// bio_clear_socket_error clears the last system socket error. // // TODO(fork): remove all callers of this. -void bio_clear_socket_error(void); +void bio_clear_socket_error(); -// BIO_sock_error returns the last socket error on |sock|. +// bio_sock_error returns the last socket error on |sock|. int bio_sock_error(int sock); -// BIO_fd_should_retry returns non-zero if |return_value| indicates an error +// bio_socket_should_retry returns non-zero if |return_value| indicates an error +// and the last socket error indicates that it's non-fatal. +int bio_socket_should_retry(int return_value); + +#endif // !OPENSSL_NO_SOCK + +// bio_errno_should_retry returns non-zero if |return_value| indicates an error // and |errno| indicates that it's non-fatal. -int bio_fd_should_retry(int return_value); +int bio_errno_should_retry(int return_value); +BSSL_NAMESPACE_END -#if defined(__cplusplus) -} // extern C -#endif -#endif // OPENSSL_HEADER_BIO_INTERNAL_H +#endif // OPENSSL_HEADER_CRYPTO_BIO_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/bio/pair.c b/third_party/boringssl/src/crypto/bio/pair.c deleted file mode 100644 index a1a9c9c9..00000000 --- a/third_party/boringssl/src/crypto/bio/pair.c +++ /dev/null @@ -1,483 +0,0 @@ -/* ==================================================================== - * Copyright (c) 1998-2003 The OpenSSL Project. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). */ - -#include - -#include -#include - -#include -#include - -#include "../internal.h" - - -struct bio_bio_st { - BIO *peer; // NULL if buf == NULL. - // If peer != NULL, then peer->ptr is also a bio_bio_st, - // and its "peer" member points back to us. - // peer != NULL iff init != 0 in the BIO. - - // This is for what we write (i.e. reading uses peer's struct): - int closed; // valid iff peer != NULL - size_t len; // valid iff buf != NULL; 0 if peer == NULL - size_t offset; // valid iff buf != NULL; 0 if len == 0 - size_t size; - uint8_t *buf; // "size" elements (if != NULL) - - size_t request; // valid iff peer != NULL; 0 if len != 0, - // otherwise set by peer to number of bytes - // it (unsuccessfully) tried to read, - // never more than buffer space (size-len) warrants. 
-}; - -static int bio_new(BIO *bio) { - struct bio_bio_st *b; - - b = OPENSSL_malloc(sizeof *b); - if (b == NULL) { - return 0; - } - OPENSSL_memset(b, 0, sizeof(struct bio_bio_st)); - - b->size = 17 * 1024; // enough for one TLS record (just a default) - bio->ptr = b; - return 1; -} - -static void bio_destroy_pair(BIO *bio) { - struct bio_bio_st *b = bio->ptr; - BIO *peer_bio; - struct bio_bio_st *peer_b; - - if (b == NULL) { - return; - } - - peer_bio = b->peer; - if (peer_bio == NULL) { - return; - } - - peer_b = peer_bio->ptr; - - assert(peer_b != NULL); - assert(peer_b->peer == bio); - - peer_b->peer = NULL; - peer_bio->init = 0; - assert(peer_b->buf != NULL); - peer_b->len = 0; - peer_b->offset = 0; - - b->peer = NULL; - bio->init = 0; - assert(b->buf != NULL); - b->len = 0; - b->offset = 0; -} - -static int bio_free(BIO *bio) { - struct bio_bio_st *b = bio->ptr; - - assert(b != NULL); - - if (b->peer) { - bio_destroy_pair(bio); - } - - OPENSSL_free(b->buf); - OPENSSL_free(b); - - return 1; -} - -static int bio_read(BIO *bio, char *buf, int size_) { - size_t size = size_; - size_t rest; - struct bio_bio_st *b, *peer_b; - - BIO_clear_retry_flags(bio); - - if (!bio->init) { - return 0; - } - - b = bio->ptr; - assert(b != NULL); - assert(b->peer != NULL); - peer_b = b->peer->ptr; - assert(peer_b != NULL); - assert(peer_b->buf != NULL); - - peer_b->request = 0; // will be set in "retry_read" situation - - if (buf == NULL || size == 0) { - return 0; - } - - if (peer_b->len == 0) { - if (peer_b->closed) { - return 0; // writer has closed, and no data is left - } else { - BIO_set_retry_read(bio); // buffer is empty - if (size <= peer_b->size) { - peer_b->request = size; - } else { - // don't ask for more than the peer can - // deliver in one write - peer_b->request = peer_b->size; - } - return -1; - } - } - - // we can read - if (peer_b->len < size) { - size = peer_b->len; - } - - // now read "size" bytes - rest = size; - - assert(rest > 0); - // one or two 
iterations - do { - size_t chunk; - - assert(rest <= peer_b->len); - if (peer_b->offset + rest <= peer_b->size) { - chunk = rest; - } else { - // wrap around ring buffer - chunk = peer_b->size - peer_b->offset; - } - assert(peer_b->offset + chunk <= peer_b->size); - - OPENSSL_memcpy(buf, peer_b->buf + peer_b->offset, chunk); - - peer_b->len -= chunk; - if (peer_b->len) { - peer_b->offset += chunk; - assert(peer_b->offset <= peer_b->size); - if (peer_b->offset == peer_b->size) { - peer_b->offset = 0; - } - buf += chunk; - } else { - // buffer now empty, no need to advance "buf" - assert(chunk == rest); - peer_b->offset = 0; - } - rest -= chunk; - } while (rest); - - return size; -} - -static int bio_write(BIO *bio, const char *buf, int num_) { - size_t num = num_; - size_t rest; - struct bio_bio_st *b; - - BIO_clear_retry_flags(bio); - - if (!bio->init || buf == NULL || num == 0) { - return 0; - } - - b = bio->ptr; - assert(b != NULL); - assert(b->peer != NULL); - assert(b->buf != NULL); - - b->request = 0; - if (b->closed) { - // we already closed - OPENSSL_PUT_ERROR(BIO, BIO_R_BROKEN_PIPE); - return -1; - } - - assert(b->len <= b->size); - - if (b->len == b->size) { - BIO_set_retry_write(bio); // buffer is full - return -1; - } - - // we can write - if (num > b->size - b->len) { - num = b->size - b->len; - } - - // now write "num" bytes - rest = num; - - assert(rest > 0); - // one or two iterations - do { - size_t write_offset; - size_t chunk; - - assert(b->len + rest <= b->size); - - write_offset = b->offset + b->len; - if (write_offset >= b->size) { - write_offset -= b->size; - } - // b->buf[write_offset] is the first byte we can write to. 
- - if (write_offset + rest <= b->size) { - chunk = rest; - } else { - // wrap around ring buffer - chunk = b->size - write_offset; - } - - OPENSSL_memcpy(b->buf + write_offset, buf, chunk); - - b->len += chunk; - - assert(b->len <= b->size); - - rest -= chunk; - buf += chunk; - } while (rest); - - return num; -} - -static int bio_make_pair(BIO *bio1, BIO *bio2, size_t writebuf1_len, - size_t writebuf2_len) { - struct bio_bio_st *b1, *b2; - - assert(bio1 != NULL); - assert(bio2 != NULL); - - b1 = bio1->ptr; - b2 = bio2->ptr; - - if (b1->peer != NULL || b2->peer != NULL) { - OPENSSL_PUT_ERROR(BIO, BIO_R_IN_USE); - return 0; - } - - if (b1->buf == NULL) { - if (writebuf1_len) { - b1->size = writebuf1_len; - } - b1->buf = OPENSSL_malloc(b1->size); - if (b1->buf == NULL) { - OPENSSL_PUT_ERROR(BIO, ERR_R_MALLOC_FAILURE); - return 0; - } - b1->len = 0; - b1->offset = 0; - } - - if (b2->buf == NULL) { - if (writebuf2_len) { - b2->size = writebuf2_len; - } - b2->buf = OPENSSL_malloc(b2->size); - if (b2->buf == NULL) { - OPENSSL_PUT_ERROR(BIO, ERR_R_MALLOC_FAILURE); - return 0; - } - b2->len = 0; - b2->offset = 0; - } - - b1->peer = bio2; - b1->closed = 0; - b1->request = 0; - b2->peer = bio1; - b2->closed = 0; - b2->request = 0; - - bio1->init = 1; - bio2->init = 1; - - return 1; -} - -static long bio_ctrl(BIO *bio, int cmd, long num, void *ptr) { - long ret; - struct bio_bio_st *b = bio->ptr; - - assert(b != NULL); - - switch (cmd) { - // specific CTRL codes - - case BIO_C_GET_WRITE_BUF_SIZE: - ret = (long)b->size; - break; - - case BIO_C_GET_WRITE_GUARANTEE: - // How many bytes can the caller feed to the next write - // without having to keep any? - if (b->peer == NULL || b->closed) { - ret = 0; - } else { - ret = (long)b->size - b->len; - } - break; - - case BIO_C_GET_READ_REQUEST: - // If the peer unsuccessfully tried to read, how many bytes - // were requested? (As with BIO_CTRL_PENDING, that number - // can usually be treated as boolean.) 
- ret = (long)b->request; - break; - - case BIO_C_RESET_READ_REQUEST: - // Reset request. (Can be useful after read attempts - // at the other side that are meant to be non-blocking, - // e.g. when probing SSL_read to see if any data is - // available.) - b->request = 0; - ret = 1; - break; - - case BIO_C_SHUTDOWN_WR: - // similar to shutdown(..., SHUT_WR) - b->closed = 1; - ret = 1; - break; - - // standard CTRL codes follow - - case BIO_CTRL_GET_CLOSE: - ret = bio->shutdown; - break; - - case BIO_CTRL_SET_CLOSE: - bio->shutdown = (int)num; - ret = 1; - break; - - case BIO_CTRL_PENDING: - if (b->peer != NULL) { - struct bio_bio_st *peer_b = b->peer->ptr; - ret = (long)peer_b->len; - } else { - ret = 0; - } - break; - - case BIO_CTRL_WPENDING: - ret = 0; - if (b->buf != NULL) { - ret = (long)b->len; - } - break; - - case BIO_CTRL_FLUSH: - ret = 1; - break; - - case BIO_CTRL_EOF: { - BIO *other_bio = ptr; - - if (other_bio) { - struct bio_bio_st *other_b = other_bio->ptr; - assert(other_b != NULL); - ret = other_b->len == 0 && other_b->closed; - } else { - ret = 1; - } - } break; - - default: - ret = 0; - } - return ret; -} - - -static const BIO_METHOD methods_biop = { - BIO_TYPE_BIO, "BIO pair", bio_write, bio_read, NULL /* puts */, - NULL /* gets */, bio_ctrl, bio_new, bio_free, NULL /* callback_ctrl */, -}; - -static const BIO_METHOD *bio_s_bio(void) { return &methods_biop; } - -int BIO_new_bio_pair(BIO** bio1_p, size_t writebuf1_len, - BIO** bio2_p, size_t writebuf2_len) { - BIO *bio1 = BIO_new(bio_s_bio()); - BIO *bio2 = BIO_new(bio_s_bio()); - if (bio1 == NULL || bio2 == NULL || - !bio_make_pair(bio1, bio2, writebuf1_len, writebuf2_len)) { - BIO_free(bio1); - BIO_free(bio2); - *bio1_p = NULL; - *bio2_p = NULL; - return 0; - } - - *bio1_p = bio1; - *bio2_p = bio2; - return 1; -} - -size_t BIO_ctrl_get_read_request(BIO *bio) { - return BIO_ctrl(bio, BIO_C_GET_READ_REQUEST, 0, NULL); -} - -size_t BIO_ctrl_get_write_guarantee(BIO *bio) { - return BIO_ctrl(bio, 
BIO_C_GET_WRITE_GUARANTEE, 0, NULL); -} - -int BIO_shutdown_wr(BIO *bio) { - return BIO_ctrl(bio, BIO_C_SHUTDOWN_WR, 0, NULL); -} diff --git a/third_party/boringssl/src/crypto/bio/pair.cc b/third_party/boringssl/src/crypto/bio/pair.cc new file mode 100644 index 00000000..11b5c2ef --- /dev/null +++ b/third_party/boringssl/src/crypto/bio/pair.cc @@ -0,0 +1,443 @@ +// Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + +#include +#include + +#include "../internal.h" +#include "../mem_internal.h" +#include "internal.h" + + +using namespace bssl; + +namespace { +struct bio_bio_st { + BIO *peer = nullptr; // NULL if buf == NULL. + // If peer != NULL, then BIO_get_data(peer) is also a + // bio_bio_st, and its "peer" member points back to us. + // peer != NULL iff init != 0 in the BIO. + + // This is for what we write (i.e. reading uses peer's struct): + int closed = 0; // valid iff peer != NULL + size_t len = 0; // valid iff buf != NULL; 0 if peer == NULL + size_t offset = 0; // valid iff buf != NULL; 0 if len == 0 + size_t size = 0; + uint8_t *buf = nullptr; // "size" elements (if != NULL) + + size_t request = 0; // valid iff peer != NULL; 0 if len != 0, + // otherwise set by peer to number of bytes + // it (unsuccessfully) tried to read, + // never more than buffer space (size-len) warrants. 
+}; +} // namespace + +static int bio_new(BIO *bio) { + struct bio_bio_st *b = New(); + if (b == nullptr) { + return 0; + } + + b->size = 17 * 1024; // enough for one TLS record (just a default) + BIO_set_data(bio, b); + return 1; +} + +static void bio_destroy_pair(BIO *bio) { + struct bio_bio_st *b = reinterpret_cast(BIO_get_data(bio)); + BIO *peer_bio; + struct bio_bio_st *peer_b; + + if (b == nullptr) { + return; + } + + peer_bio = b->peer; + if (peer_bio == nullptr) { + return; + } + + peer_b = reinterpret_cast(BIO_get_data(peer_bio)); + + assert(peer_b != nullptr); + assert(peer_b->peer == bio); + + peer_b->peer = nullptr; + BIO_set_init(peer_bio, 0); + assert(peer_b->buf != nullptr); + peer_b->len = 0; + peer_b->offset = 0; + + b->peer = nullptr; + BIO_set_init(bio, 0); + assert(b->buf != nullptr); + b->len = 0; + b->offset = 0; +} + +static int bio_free(BIO *bio) { + struct bio_bio_st *b = reinterpret_cast(BIO_get_data(bio)); + + assert(b != nullptr); + + if (b->peer) { + bio_destroy_pair(bio); + } + + OPENSSL_free(b->buf); + Delete(b); + + return 1; +} + +static int bio_read(BIO *bio, char *buf, int size_) { + size_t size = size_; + size_t rest; + struct bio_bio_st *b, *peer_b; + + BIO_clear_retry_flags(bio); + + if (!BIO_get_init(bio)) { + return 0; + } + + b = reinterpret_cast(BIO_get_data(bio)); + assert(b != nullptr); + assert(b->peer != nullptr); + peer_b = reinterpret_cast(BIO_get_data(b->peer)); + assert(peer_b != nullptr); + assert(peer_b->buf != nullptr); + + peer_b->request = 0; // will be set in "retry_read" situation + + if (buf == nullptr || size == 0) { + return 0; + } + + if (peer_b->len == 0) { + if (peer_b->closed) { + return 0; // writer has closed, and no data is left + } else { + BIO_set_retry_read(bio); // buffer is empty + if (size <= peer_b->size) { + peer_b->request = size; + } else { + // don't ask for more than the peer can + // deliver in one write + peer_b->request = peer_b->size; + } + return -1; + } + } + + // we can read + if 
(peer_b->len < size) { + size = peer_b->len; + } + + // now read "size" bytes + rest = size; + + assert(rest > 0); + // one or two iterations + do { + size_t chunk; + + assert(rest <= peer_b->len); + if (peer_b->offset + rest <= peer_b->size) { + chunk = rest; + } else { + // wrap around ring buffer + chunk = peer_b->size - peer_b->offset; + } + assert(peer_b->offset + chunk <= peer_b->size); + + OPENSSL_memcpy(buf, peer_b->buf + peer_b->offset, chunk); + + peer_b->len -= chunk; + if (peer_b->len) { + peer_b->offset += chunk; + assert(peer_b->offset <= peer_b->size); + if (peer_b->offset == peer_b->size) { + peer_b->offset = 0; + } + buf += chunk; + } else { + // buffer now empty, no need to advance "buf" + assert(chunk == rest); + peer_b->offset = 0; + } + rest -= chunk; + } while (rest); + + // |size| is bounded by the buffer size, which fits in |int|. + return (int)size; +} + +static int bio_write(BIO *bio, const char *buf, int num_) { + size_t num = num_; + size_t rest; + struct bio_bio_st *b; + + BIO_clear_retry_flags(bio); + + if (!BIO_get_init(bio) || buf == nullptr || num == 0) { + return 0; + } + + b = reinterpret_cast(BIO_get_data(bio)); + assert(b != nullptr); + assert(b->peer != nullptr); + assert(b->buf != nullptr); + + b->request = 0; + if (b->closed) { + // we already closed + OPENSSL_PUT_ERROR(BIO, BIO_R_BROKEN_PIPE); + return -1; + } + + assert(b->len <= b->size); + + if (b->len == b->size) { + BIO_set_retry_write(bio); // buffer is full + return -1; + } + + // we can write + if (num > b->size - b->len) { + num = b->size - b->len; + } + + // now write "num" bytes + rest = num; + + assert(rest > 0); + // one or two iterations + do { + size_t write_offset; + size_t chunk; + + assert(b->len + rest <= b->size); + + write_offset = b->offset + b->len; + if (write_offset >= b->size) { + write_offset -= b->size; + } + // b->buf[write_offset] is the first byte we can write to. 
+ + if (write_offset + rest <= b->size) { + chunk = rest; + } else { + // wrap around ring buffer + chunk = b->size - write_offset; + } + + OPENSSL_memcpy(b->buf + write_offset, buf, chunk); + + b->len += chunk; + + assert(b->len <= b->size); + + rest -= chunk; + buf += chunk; + } while (rest); + + // |num| is bounded by the buffer size, which fits in |int|. + return (int)num; +} + +static int bio_make_pair(BIO *bio1, BIO *bio2, size_t writebuf1_len, + size_t writebuf2_len) { + struct bio_bio_st *b1, *b2; + + assert(bio1 != nullptr); + assert(bio2 != nullptr); + + b1 = reinterpret_cast(BIO_get_data(bio1)); + b2 = reinterpret_cast(BIO_get_data(bio2)); + + if (b1->peer != nullptr || b2->peer != nullptr) { + OPENSSL_PUT_ERROR(BIO, BIO_R_IN_USE); + return 0; + } + + if (b1->buf == nullptr) { + if (writebuf1_len) { + b1->size = writebuf1_len; + } + b1->buf = reinterpret_cast(OPENSSL_malloc(b1->size)); + if (b1->buf == nullptr) { + return 0; + } + b1->len = 0; + b1->offset = 0; + } + + if (b2->buf == nullptr) { + if (writebuf2_len) { + b2->size = writebuf2_len; + } + b2->buf = reinterpret_cast(OPENSSL_malloc(b2->size)); + if (b2->buf == nullptr) { + return 0; + } + b2->len = 0; + b2->offset = 0; + } + + b1->peer = bio2; + b1->closed = 0; + b1->request = 0; + b2->peer = bio1; + b2->closed = 0; + b2->request = 0; + + BIO_set_init(bio1, 1); + BIO_set_init(bio2, 1); + + return 1; +} + +static long bio_ctrl(BIO *bio, int cmd, long num, void *ptr) { + struct bio_bio_st *b = reinterpret_cast(BIO_get_data(bio)); + assert(b != nullptr); + switch (cmd) { + // Specific control codes first: + case BIO_C_GET_WRITE_BUF_SIZE: + // TODO(crbug.com/412584975): This can overflow on 64-bit Windows. Do we + // need it? It implements |BIO_get_write_buf_size|, but we don't have the + // wrapper. + return static_cast(b->size); + + case BIO_C_GET_WRITE_GUARANTEE: + // How many bytes can the caller feed to the next write + // without having to keep any? 
+ if (b->peer == nullptr || b->closed) { + return 0; + } + // TODO(crbug.com/412584975): This can overflow on 64-bit Windows. + return static_cast(b->size - b->len); + + case BIO_C_GET_READ_REQUEST: + // If the peer unsuccessfully tried to read, how many bytes + // were requested? (As with BIO_CTRL_PENDING, that number + // can usually be treated as boolean.) + // + // TODO(crbug.com/412584975): This can overflow on 64-bit Windows. + return static_cast(b->request); + + case BIO_C_RESET_READ_REQUEST: + // Reset request. (Can be useful after read attempts + // at the other side that are meant to be non-blocking, + // e.g. when probing SSL_read to see if any data is + // available.) + b->request = 0; + return 1; + + case BIO_C_SHUTDOWN_WR: + // similar to shutdown(..., SHUT_WR) + b->closed = 1; + return 1; + + // Standard control codes: + case BIO_CTRL_GET_CLOSE: + return BIO_get_shutdown(bio); + + case BIO_CTRL_SET_CLOSE: + BIO_set_shutdown(bio, static_cast(num)); + return 1; + + case BIO_CTRL_PENDING: + if (b->peer != nullptr) { + struct bio_bio_st *peer_b = + reinterpret_cast(BIO_get_data(b->peer)); + // TODO(crbug.com/412584975): This can overflow on 64-bit Windows. + return static_cast(peer_b->len); + } + return 0; + + case BIO_CTRL_WPENDING: + if (b->buf == nullptr) { + return 0; + } + // TODO(crbug.com/412584975): This can overflow on 64-bit Windows. 
+ return static_cast(b->len); + + case BIO_CTRL_FLUSH: + return 1; + + case BIO_CTRL_EOF: { + if (b->peer) { + auto *peer_b = reinterpret_cast(BIO_get_data(b->peer)); + assert(peer_b != nullptr); + return peer_b->len == 0 && peer_b->closed; + } + return 1; + } + + default: + return 0; + } +} + + +static const BIO_METHOD methods_biop = { + BIO_TYPE_BIO, + "BIO pair", + bio_write, + bio_read, + /*gets=*/nullptr, + bio_ctrl, + bio_new, + bio_free, + /*callback_ctrl=*/nullptr, +}; + +static const BIO_METHOD *bio_s_bio() { return &methods_biop; } + +int BIO_new_bio_pair(BIO **bio1_p, size_t writebuf1_len, BIO **bio2_p, + size_t writebuf2_len) { + BIO *bio1 = BIO_new(bio_s_bio()); + BIO *bio2 = BIO_new(bio_s_bio()); + if (bio1 == nullptr || bio2 == nullptr || + !bio_make_pair(bio1, bio2, writebuf1_len, writebuf2_len)) { + BIO_free(bio1); + BIO_free(bio2); + *bio1_p = nullptr; + *bio2_p = nullptr; + return 0; + } + + *bio1_p = bio1; + *bio2_p = bio2; + return 1; +} + +size_t BIO_ctrl_get_read_request(BIO *bio) { + return BIO_ctrl(bio, BIO_C_GET_READ_REQUEST, 0, nullptr); +} + +size_t BIO_ctrl_get_write_guarantee(BIO *bio) { + return BIO_ctrl(bio, BIO_C_GET_WRITE_GUARANTEE, 0, nullptr); +} + +int BIO_shutdown_wr(BIO *bio) { + return (int)BIO_ctrl(bio, BIO_C_SHUTDOWN_WR, 0, nullptr); +} diff --git a/third_party/boringssl/src/crypto/bio/printf.c b/third_party/boringssl/src/crypto/bio/printf.c deleted file mode 100644 index 253546b7..00000000 --- a/third_party/boringssl/src/crypto/bio/printf.c +++ /dev/null @@ -1,103 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. 
The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. 
If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include -#include -#include - -#include -#include - -int BIO_printf(BIO *bio, const char *format, ...) { - va_list args; - char buf[256], *out, out_malloced = 0; - int out_len, ret; - - va_start(args, format); - out_len = vsnprintf(buf, sizeof(buf), format, args); - va_end(args); - if (out_len < 0) { - return -1; - } - - if ((size_t) out_len >= sizeof(buf)) { - const int requested_len = out_len; - // The output was truncated. Note that vsnprintf's return value - // does not include a trailing NUL, but the buffer must be sized - // for it. 
- out = OPENSSL_malloc(requested_len + 1); - out_malloced = 1; - if (out == NULL) { - OPENSSL_PUT_ERROR(BIO, ERR_R_MALLOC_FAILURE); - return -1; - } - va_start(args, format); - out_len = vsnprintf(out, requested_len + 1, format, args); - va_end(args); - assert(out_len == requested_len); - } else { - out = buf; - } - - ret = BIO_write(bio, out, out_len); - if (out_malloced) { - OPENSSL_free(out); - } - - return ret; -} diff --git a/third_party/boringssl/src/crypto/bio/printf.cc b/third_party/boringssl/src/crypto/bio/printf.cc new file mode 100644 index 00000000..0837ee56 --- /dev/null +++ b/third_party/boringssl/src/crypto/bio/printf.cc @@ -0,0 +1,59 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include + +#include +#include + +int BIO_printf(BIO *bio, const char *format, ...) { + va_list args; + char buf[256], *out, out_malloced = 0; + int out_len, ret; + + va_start(args, format); + out_len = vsnprintf(buf, sizeof(buf), format, args); + va_end(args); + if (out_len < 0) { + return -1; + } + + if ((size_t)out_len >= sizeof(buf)) { + const size_t requested_len = (size_t)out_len; + // The output was truncated. Note that vsnprintf's return value does not + // include a trailing NUL, but the buffer must be sized for it. 
+ out = reinterpret_cast(OPENSSL_malloc(requested_len + 1)); + out_malloced = 1; + if (out == nullptr) { + return -1; + } + va_start(args, format); + out_len = vsnprintf(out, requested_len + 1, format, args); + va_end(args); + assert(out_len == (int)requested_len); + } else { + out = buf; + } + + ret = BIO_write(bio, out, out_len); + if (out_malloced) { + OPENSSL_free(out); + } + + return ret; +} diff --git a/third_party/boringssl/src/crypto/bio/socket.c b/third_party/boringssl/src/crypto/bio/socket.c deleted file mode 100644 index 679959eb..00000000 --- a/third_party/boringssl/src/crypto/bio/socket.c +++ /dev/null @@ -1,192 +0,0 @@ -/* crypto/bio/bss_sock.c */ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. 
Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
*/ - -#include - -#if !defined(OPENSSL_TRUSTY) - -#include -#include - -#if !defined(OPENSSL_WINDOWS) -#include -#else -OPENSSL_MSVC_PRAGMA(warning(push, 3)) -#include -OPENSSL_MSVC_PRAGMA(warning(pop)) - -OPENSSL_MSVC_PRAGMA(comment(lib, "Ws2_32.lib")) -#endif - -#include "internal.h" - - -#if !defined(OPENSSL_WINDOWS) -static int closesocket(int sock) { - return close(sock); -} -#endif - -static int sock_free(BIO *bio) { - if (bio->shutdown) { - if (bio->init) { - closesocket(bio->num); - } - bio->init = 0; - bio->flags = 0; - } - return 1; -} - -static int sock_read(BIO *b, char *out, int outl) { - if (out == NULL) { - return 0; - } - - bio_clear_socket_error(); -#if defined(OPENSSL_WINDOWS) - int ret = recv(b->num, out, outl, 0); -#else - int ret = read(b->num, out, outl); -#endif - BIO_clear_retry_flags(b); - if (ret <= 0) { - if (bio_fd_should_retry(ret)) { - BIO_set_retry_read(b); - } - } - return ret; -} - -static int sock_write(BIO *b, const char *in, int inl) { - int ret; - - bio_clear_socket_error(); -#if defined(OPENSSL_WINDOWS) - ret = send(b->num, in, inl, 0); -#else - ret = write(b->num, in, inl); -#endif - BIO_clear_retry_flags(b); - if (ret <= 0) { - if (bio_fd_should_retry(ret)) { - BIO_set_retry_write(b); - } - } - return ret; -} - -static long sock_ctrl(BIO *b, int cmd, long num, void *ptr) { - long ret = 1; - int *ip; - - switch (cmd) { - case BIO_C_SET_FD: - sock_free(b); - b->num = *((int *)ptr); - b->shutdown = (int)num; - b->init = 1; - break; - case BIO_C_GET_FD: - if (b->init) { - ip = (int *)ptr; - if (ip != NULL) { - *ip = b->num; - } - ret = b->num; - } else { - ret = -1; - } - break; - case BIO_CTRL_GET_CLOSE: - ret = b->shutdown; - break; - case BIO_CTRL_SET_CLOSE: - b->shutdown = (int)num; - break; - case BIO_CTRL_FLUSH: - ret = 1; - break; - default: - ret = 0; - break; - } - return ret; -} - -static const BIO_METHOD methods_sockp = { - BIO_TYPE_SOCKET, "socket", - sock_write, sock_read, - NULL /* puts */, NULL /* gets, */, - 
sock_ctrl, NULL /* create */, - sock_free, NULL /* callback_ctrl */, -}; - -const BIO_METHOD *BIO_s_socket(void) { return &methods_sockp; } - -BIO *BIO_new_socket(int fd, int close_flag) { - BIO *ret; - - ret = BIO_new(BIO_s_socket()); - if (ret == NULL) { - return NULL; - } - BIO_set_fd(ret, fd, close_flag); - return ret; -} - -#endif // OPENSSL_TRUSTY diff --git a/third_party/boringssl/src/crypto/bio/socket.cc b/third_party/boringssl/src/crypto/bio/socket.cc new file mode 100644 index 00000000..c61daeca --- /dev/null +++ b/third_party/boringssl/src/crypto/bio/socket.cc @@ -0,0 +1,210 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#if !defined(OPENSSL_NO_SOCK) + +#include +#include + +#if !defined(OPENSSL_WINDOWS) +#include +#else +#include +OPENSSL_MSVC_PRAGMA(comment(lib, "Ws2_32.lib")) +#endif + +#include "internal.h" + + +using namespace bssl; + +#if !defined(OPENSSL_WINDOWS) +static int closesocket(int sock) { return close(sock); } +#endif + +static int sock_free(BIO *bio) { + if (BIO_get_shutdown(bio)) { + if (BIO_get_init(bio)) { + closesocket(FromOpaque(bio)->num); + } + BIO_set_init(bio, 0); + BIO_clear_retry_flags(bio); + } + return 1; +} + +static int sock_read(BIO *b, char *out, int outl) { + if (out == nullptr) { + return 0; + } + + bio_clear_socket_error(); +#if defined(OPENSSL_WINDOWS) + int ret = recv(FromOpaque(b)->num, out, outl, 0); +#else + int ret = (int)read(FromOpaque(b)->num, out, outl); +#endif + BIO_clear_retry_flags(b); + if (ret <= 0) { + if (bio_socket_should_retry(ret)) { + BIO_set_retry_read(b); + } + } + return ret; +} + +static int sock_write(BIO *b, const char *in, int inl) { + bio_clear_socket_error(); +#if defined(OPENSSL_WINDOWS) + int ret = send(FromOpaque(b)->num, in, inl, 0); +#else + int ret = (int)write(FromOpaque(b)->num, in, inl); +#endif + BIO_clear_retry_flags(b); + if (ret <= 0) { + if (bio_socket_should_retry(ret)) { + BIO_set_retry_write(b); + } + } + return ret; +} + +static long sock_ctrl(BIO *b, int cmd, long num, void *ptr) { + switch (cmd) { + case BIO_C_SET_FD: + sock_free(b); + FromOpaque(b)->num = *static_cast(ptr); + BIO_set_shutdown(b, static_cast(num)); + BIO_set_init(b, 1); + return 1; + case BIO_C_GET_FD: + if (BIO_get_init(b)) { + int *out = static_cast(ptr); + if (out != nullptr) { + *out = FromOpaque(b)->num; + } + return FromOpaque(b)->num; + } + return -1; + case BIO_CTRL_GET_CLOSE: + return BIO_get_shutdown(b); + case BIO_CTRL_SET_CLOSE: + BIO_set_shutdown(b, static_cast(num)); + return 1; + case BIO_CTRL_FLUSH: + return 1; + default: + return 0; + } +} + +static const BIO_METHOD methods_sockp = { + 
BIO_TYPE_SOCKET, + "socket", + sock_write, + sock_read, + nullptr /* gets, */, + sock_ctrl, + nullptr /* create */, + sock_free, + nullptr /* callback_ctrl */, +}; + +const BIO_METHOD *BIO_s_socket() { return &methods_sockp; } + +BIO *BIO_new_socket(int fd, int close_flag) { + BIO *ret; + + ret = BIO_new(BIO_s_socket()); + if (ret == nullptr) { + return nullptr; + } + BIO_set_fd(ret, fd, close_flag); + return ret; +} + +// These functions are provided solely for compatibility with software that +// tries to copy and then modify |BIO_s_socket|. See bio.h for details. +// PostgreSQL's use makes several fragile assumptions on |BIO_s_socket|: +// +// - We do not store anything in |BIO_set_data|. (Broken in upstream OpenSSL, +// which broke PostgreSQL.) +// - We do not store anything in |BIO_set_app_data|. +// - |BIO_s_socket| is implemented internally using the non-|size_t|-clean +// I/O functions rather than the |size_t|-clean ones. +// - |BIO_METHOD| never gains another function pointer that is used in concert +// with any of the functions here. +// +// Some other projects doing similar things use |BIO_meth_get_read| and +// |BIO_meth_get_write| and in turn assume that |BIO_s_socket| has not been +// ported to the |size_t|-clean |BIO_read_ex| and |BIO_write_ex|. (Not yet +// implemented in BoringSSL.) +// +// This is hopelessly fragile. PostgreSQL 18 will include a fix to stop using +// these APIs, but older versions and other software remain impacted, so we +// implement these functions, but only support |BIO_s_socket|. For now they just +// return the underlying functions, but if we ever need to break the above +// assumptions, we can return an older, frozen version of |BIO_s_socket|. +// Limiting to exactly one allowed |BIO_METHOD| lets us do this. +// +// These functions are also deprecated in upstream OpenSSL. 
See +// https://github.com/openssl/openssl/issues/26047 +// +// TODO(davidben): Once Folly and all versions of PostgreSQL we care about are +// updated or patched, remove these functions. + +int (*BIO_meth_get_write(const BIO_METHOD *method))(BIO *, const char *, int) { + BSSL_CHECK(method == BIO_s_socket()); + return method->bwrite; +} + +int (*BIO_meth_get_read(const BIO_METHOD *method))(BIO *, char *, int) { + BSSL_CHECK(method == BIO_s_socket()); + return method->bread; +} + +int (*BIO_meth_get_gets(const BIO_METHOD *method))(BIO *, char *, int) { + BSSL_CHECK(method == BIO_s_socket()); + return method->bgets; +} + +int (*BIO_meth_get_puts(const BIO_METHOD *method))(BIO *, const char *) { + BSSL_CHECK(method == BIO_s_socket()); + return nullptr; +} + +long (*BIO_meth_get_ctrl(const BIO_METHOD *method))(BIO *, int, long, void *) { + BSSL_CHECK(method == BIO_s_socket()); + return method->ctrl; +} + +int (*BIO_meth_get_create(const BIO_METHOD *method))(BIO *) { + BSSL_CHECK(method == BIO_s_socket()); + return method->create; +} + +int (*BIO_meth_get_destroy(const BIO_METHOD *method))(BIO *) { + BSSL_CHECK(method == BIO_s_socket()); + return method->destroy; +} + +long (*BIO_meth_get_callback_ctrl(const BIO_METHOD *method))(BIO *, int, + bio_info_cb) { + BSSL_CHECK(method == BIO_s_socket()); + return method->callback_ctrl; +} + +#endif // OPENSSL_NO_SOCK diff --git a/third_party/boringssl/src/crypto/bio/socket_helper.c b/third_party/boringssl/src/crypto/bio/socket_helper.c deleted file mode 100644 index 4cd7825a..00000000 --- a/third_party/boringssl/src/crypto/bio/socket_helper.c +++ /dev/null @@ -1,124 +0,0 @@ -/* Copyright (c) 2014, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. 
- * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#if defined(__linux__) -#undef _POSIX_C_SOURCE -#define _POSIX_C_SOURCE 200112L -#endif - -#include -#include - -#if !defined(OPENSSL_TRUSTY) - -#include -#include -#include - -#if !defined(OPENSSL_WINDOWS) -#include -#include -#else -OPENSSL_MSVC_PRAGMA(warning(push, 3)) -#include -#include -OPENSSL_MSVC_PRAGMA(warning(pop)) -#endif - -#include "internal.h" -#include "../internal.h" - - -int bio_ip_and_port_to_socket_and_addr(int *out_sock, - struct sockaddr_storage *out_addr, - socklen_t *out_addr_length, - const char *hostname, - const char *port_str) { - struct addrinfo hint, *result, *cur; - int ret; - - *out_sock = -1; - - OPENSSL_memset(&hint, 0, sizeof(hint)); - hint.ai_family = AF_UNSPEC; - hint.ai_socktype = SOCK_STREAM; - - ret = getaddrinfo(hostname, port_str, &hint, &result); - if (ret != 0) { - OPENSSL_PUT_ERROR(SYS, 0); -#if defined(OPENSSL_WINDOWS) - ERR_add_error_data(1, gai_strerrorA(ret)); -#else - ERR_add_error_data(1, gai_strerror(ret)); -#endif - return 0; - } - - ret = 0; - - for (cur = result; cur; cur = cur->ai_next) { - if ((size_t) cur->ai_addrlen > sizeof(struct sockaddr_storage)) { - continue; - } - OPENSSL_memset(out_addr, 0, sizeof(struct sockaddr_storage)); - OPENSSL_memcpy(out_addr, cur->ai_addr, cur->ai_addrlen); - *out_addr_length = cur->ai_addrlen; - - *out_sock = socket(cur->ai_family, cur->ai_socktype, cur->ai_protocol); - if (*out_sock < 0) { - OPENSSL_PUT_SYSTEM_ERROR(); - goto out; - } - - ret = 1; - break; - } 
- -out: - freeaddrinfo(result); - return ret; -} - -int bio_socket_nbio(int sock, int on) { -#if defined(OPENSSL_WINDOWS) - u_long arg = on; - - return 0 == ioctlsocket(sock, FIONBIO, &arg); -#else - int flags = fcntl(sock, F_GETFL, 0); - if (flags < 0) { - return 0; - } - if (!on) { - flags &= ~O_NONBLOCK; - } else { - flags |= O_NONBLOCK; - } - return fcntl(sock, F_SETFL, flags) == 0; -#endif -} - -void bio_clear_socket_error(void) {} - -int bio_sock_error(int sock) { - int error; - socklen_t error_size = sizeof(error); - - if (getsockopt(sock, SOL_SOCKET, SO_ERROR, (char *)&error, &error_size) < 0) { - return 1; - } - return error; -} - -#endif // OPENSSL_TRUSTY diff --git a/third_party/boringssl/src/crypto/bio/socket_helper.cc b/third_party/boringssl/src/crypto/bio/socket_helper.cc new file mode 100644 index 00000000..e251b341 --- /dev/null +++ b/third_party/boringssl/src/crypto/bio/socket_helper.cc @@ -0,0 +1,133 @@ +// Copyright 2014 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#if defined(__linux__) +#undef _POSIX_C_SOURCE +#define _POSIX_C_SOURCE 200112L +#endif + +#include +#include + +#if !defined(OPENSSL_NO_SOCK) + +#include +#include +#include + +#if !defined(OPENSSL_WINDOWS) +#include +#include +#else +#include +#include +#endif + +#include "internal.h" +#include "../internal.h" + + +using namespace bssl; + +int bssl::bio_ip_and_port_to_socket_and_addr(int *out_sock, + struct sockaddr_storage *out_addr, + socklen_t *out_addr_length, + const char *hostname, + const char *port_str) { + struct addrinfo hint, *result, *cur; + int ret; + + *out_sock = -1; + + OPENSSL_memset(&hint, 0, sizeof(hint)); + hint.ai_family = AF_UNSPEC; + hint.ai_socktype = SOCK_STREAM; + + ret = getaddrinfo(hostname, port_str, &hint, &result); + if (ret != 0) { + OPENSSL_PUT_ERROR(SYS, 0); +#if defined(OPENSSL_WINDOWS) + ERR_add_error_data(1, gai_strerrorA(ret)); +#else + ERR_add_error_data(1, gai_strerror(ret)); +#endif + return 0; + } + + ret = 0; + + for (cur = result; cur; cur = cur->ai_next) { + if ((size_t) cur->ai_addrlen > sizeof(struct sockaddr_storage)) { + continue; + } + OPENSSL_memset(out_addr, 0, sizeof(struct sockaddr_storage)); + OPENSSL_memcpy(out_addr, cur->ai_addr, cur->ai_addrlen); + *out_addr_length = cur->ai_addrlen; + + *out_sock = socket(cur->ai_family, cur->ai_socktype, cur->ai_protocol); + if (*out_sock < 0) { + OPENSSL_PUT_SYSTEM_ERROR(); + goto out; + } + + ret = 1; + break; + } + +out: + freeaddrinfo(result); + return ret; +} + +int bssl::bio_socket_nbio(int sock, int on) { +#if defined(OPENSSL_WINDOWS) + u_long arg = on; + + return 0 == ioctlsocket(sock, FIONBIO, &arg); +#else + int flags = fcntl(sock, F_GETFL, 0); + if (flags < 0) { + return 0; + } + if (!on) { + flags &= ~O_NONBLOCK; + } else { + flags |= O_NONBLOCK; + } + return fcntl(sock, F_SETFL, flags) == 0; +#endif +} + +void bssl::bio_clear_socket_error() {} + +int bssl::bio_sock_error(int sock) { + int error; + socklen_t error_size = sizeof(error); + + if 
(getsockopt(sock, SOL_SOCKET, SO_ERROR, (char *)&error, &error_size) < 0) { + return 1; + } + return error; +} + +int bssl::bio_socket_should_retry(int return_value) { +#if defined(OPENSSL_WINDOWS) + return return_value == -1 && WSAGetLastError() == WSAEWOULDBLOCK; +#else + // On POSIX platforms, sockets and fds are the same. + return bio_errno_should_retry(return_value); +#endif +} + +#endif // OPENSSL_NO_SOCK diff --git a/third_party/boringssl/src/crypto/blake2/blake2.c b/third_party/boringssl/src/crypto/blake2/blake2.c deleted file mode 100644 index 5c6b17ed..00000000 --- a/third_party/boringssl/src/crypto/blake2/blake2.c +++ /dev/null @@ -1,156 +0,0 @@ -/* Copyright (c) 2021, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ - -#include - -#include - -#include "../internal.h" - -// https://tools.ietf.org/html/rfc7693#section-2.6 -static const uint64_t kIV[8] = { - UINT64_C(0x6a09e667f3bcc908), UINT64_C(0xbb67ae8584caa73b), - UINT64_C(0x3c6ef372fe94f82b), UINT64_C(0xa54ff53a5f1d36f1), - UINT64_C(0x510e527fade682d1), UINT64_C(0x9b05688c2b3e6c1f), - UINT64_C(0x1f83d9abfb41bd6b), UINT64_C(0x5be0cd19137e2179), -}; - -// https://tools.ietf.org/html/rfc7693#section-2.7 -static const uint8_t kSigma[10 * 16] = { - // clang-format off - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3, - 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4, - 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8, - 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13, - 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9, - 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11, - 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10, - 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5, - 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0, - // clang-format on -}; - -// https://tools.ietf.org/html/rfc7693#section-3.1 -static void blake2b_mix(uint64_t v[16], int a, int b, int c, int d, uint64_t x, - uint64_t y) { - v[a] = v[a] + v[b] + x; - v[d] = CRYPTO_rotr_u64(v[d] ^ v[a], 32); - v[c] = v[c] + v[d]; - v[b] = CRYPTO_rotr_u64(v[b] ^ v[c], 24); - v[a] = v[a] + v[b] + y; - v[d] = CRYPTO_rotr_u64(v[d] ^ v[a], 16); - v[c] = v[c] + v[d]; - v[b] = CRYPTO_rotr_u64(v[b] ^ v[c], 63); -} - -static void blake2b_transform( - BLAKE2B_CTX *b2b, - const uint64_t block_words[BLAKE2B_CBLOCK / sizeof(uint64_t)], - size_t num_bytes, int is_final_block) { - // https://tools.ietf.org/html/rfc7693#section-3.2 - uint64_t v[16]; - static_assert(sizeof(v) == sizeof(b2b->h) + sizeof(kIV), ""); - OPENSSL_memcpy(v, b2b->h, sizeof(b2b->h)); - OPENSSL_memcpy(&v[8], kIV, sizeof(kIV)); - - b2b->t_low += num_bytes; - if (b2b->t_low < num_bytes) { - b2b->t_high++; - } - v[12] ^= 
b2b->t_low; - v[13] ^= b2b->t_high; - - if (is_final_block) { - v[14] = ~v[14]; - } - - for (int round = 0; round < 12; round++) { - const uint8_t *const s = &kSigma[16 * (round % 10)]; - blake2b_mix(v, 0, 4, 8, 12, block_words[s[0]], block_words[s[1]]); - blake2b_mix(v, 1, 5, 9, 13, block_words[s[2]], block_words[s[3]]); - blake2b_mix(v, 2, 6, 10, 14, block_words[s[4]], block_words[s[5]]); - blake2b_mix(v, 3, 7, 11, 15, block_words[s[6]], block_words[s[7]]); - blake2b_mix(v, 0, 5, 10, 15, block_words[s[8]], block_words[s[9]]); - blake2b_mix(v, 1, 6, 11, 12, block_words[s[10]], block_words[s[11]]); - blake2b_mix(v, 2, 7, 8, 13, block_words[s[12]], block_words[s[13]]); - blake2b_mix(v, 3, 4, 9, 14, block_words[s[14]], block_words[s[15]]); - } - - for (size_t i = 0; i < OPENSSL_ARRAY_SIZE(b2b->h); i++) { - b2b->h[i] ^= v[i]; - b2b->h[i] ^= v[i + 8]; - } -} - -void BLAKE2B256_Init(BLAKE2B_CTX *b2b) { - OPENSSL_memset(b2b, 0, sizeof(BLAKE2B_CTX)); - - static_assert(sizeof(kIV) == sizeof(b2b->h), ""); - OPENSSL_memcpy(&b2b->h, kIV, sizeof(kIV)); - - // https://tools.ietf.org/html/rfc7693#section-2.5 - b2b->h[0] ^= 0x01010000 | BLAKE2B256_DIGEST_LENGTH; -} - -void BLAKE2B256_Update(BLAKE2B_CTX *b2b, const void *in_data, size_t len) { - const uint8_t *data = (const uint8_t *)in_data; - - size_t todo = sizeof(b2b->block.bytes) - b2b->block_used; - if (todo > len) { - todo = len; - } - OPENSSL_memcpy(&b2b->block.bytes[b2b->block_used], data, todo); - b2b->block_used += todo; - data += todo; - len -= todo; - - if (!len) { - return; - } - - // More input remains therefore we must have filled |b2b->block|. 
- assert(b2b->block_used == BLAKE2B_CBLOCK); - blake2b_transform(b2b, b2b->block.words, BLAKE2B_CBLOCK, - /*is_final_block=*/0); - b2b->block_used = 0; - - while (len > BLAKE2B_CBLOCK) { - uint64_t block_words[BLAKE2B_CBLOCK / sizeof(uint64_t)]; - OPENSSL_memcpy(block_words, data, sizeof(block_words)); - blake2b_transform(b2b, block_words, BLAKE2B_CBLOCK, /*is_final_block=*/0); - data += BLAKE2B_CBLOCK; - len -= BLAKE2B_CBLOCK; - } - - OPENSSL_memcpy(b2b->block.bytes, data, len); - b2b->block_used = len; -} - -void BLAKE2B256_Final(uint8_t out[BLAKE2B256_DIGEST_LENGTH], BLAKE2B_CTX *b2b) { - OPENSSL_memset(&b2b->block.bytes[b2b->block_used], 0, - sizeof(b2b->block.bytes) - b2b->block_used); - blake2b_transform(b2b, b2b->block.words, b2b->block_used, - /*is_final_block=*/1); - static_assert(BLAKE2B256_DIGEST_LENGTH <= sizeof(b2b->h), ""); - memcpy(out, b2b->h, BLAKE2B256_DIGEST_LENGTH); -} - -void BLAKE2B256(const uint8_t *data, size_t len, - uint8_t out[BLAKE2B256_DIGEST_LENGTH]) { - BLAKE2B_CTX ctx; - BLAKE2B256_Init(&ctx); - BLAKE2B256_Update(&ctx, data, len); - BLAKE2B256_Final(out, &ctx); -} diff --git a/third_party/boringssl/src/crypto/blake2/blake2.cc b/third_party/boringssl/src/crypto/blake2/blake2.cc new file mode 100644 index 00000000..e149164d --- /dev/null +++ b/third_party/boringssl/src/crypto/blake2/blake2.cc @@ -0,0 +1,173 @@ +// Copyright 2021 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include + +#include + +#include "../internal.h" + +using namespace bssl; + +// https://tools.ietf.org/html/rfc7693#section-2.6 +static const uint64_t kIV[8] = { + UINT64_C(0x6a09e667f3bcc908), UINT64_C(0xbb67ae8584caa73b), + UINT64_C(0x3c6ef372fe94f82b), UINT64_C(0xa54ff53a5f1d36f1), + UINT64_C(0x510e527fade682d1), UINT64_C(0x9b05688c2b3e6c1f), + UINT64_C(0x1f83d9abfb41bd6b), UINT64_C(0x5be0cd19137e2179), +}; + +// https://tools.ietf.org/html/rfc7693#section-2.7 +static const uint8_t kSigma[10 * 16] = { + // clang-format off + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + 14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3, + 11, 8, 12, 0, 5, 2, 15, 13, 10, 14, 3, 6, 7, 1, 9, 4, + 7, 9, 3, 1, 13, 12, 11, 14, 2, 6, 5, 10, 4, 0, 15, 8, + 9, 0, 5, 7, 2, 4, 10, 15, 14, 1, 11, 12, 6, 8, 3, 13, + 2, 12, 6, 10, 0, 11, 8, 3, 4, 13, 7, 5, 15, 14, 1, 9, + 12, 5, 1, 15, 14, 13, 4, 10, 0, 7, 6, 3, 9, 2, 8, 11, + 13, 11, 7, 14, 12, 1, 3, 9, 5, 0, 15, 4, 8, 6, 2, 10, + 6, 15, 14, 9, 11, 3, 0, 8, 12, 2, 13, 7, 1, 4, 10, 5, + 10, 2, 8, 4, 7, 6, 1, 5, 15, 11, 9, 14, 3, 12, 13, 0, + // clang-format on +}; + +// https://tools.ietf.org/html/rfc7693#section-3.1 +static void blake2b_mix(uint64_t v[16], int a, int b, int c, int d, uint64_t x, + uint64_t y) { + v[a] = v[a] + v[b] + x; + v[d] = CRYPTO_rotr_u64(v[d] ^ v[a], 32); + v[c] = v[c] + v[d]; + v[b] = CRYPTO_rotr_u64(v[b] ^ v[c], 24); + v[a] = v[a] + v[b] + y; + v[d] = CRYPTO_rotr_u64(v[d] ^ v[a], 16); + v[c] = v[c] + v[d]; + v[b] = CRYPTO_rotr_u64(v[b] ^ v[c], 63); +} + +static uint64_t blake2b_load(const uint8_t block[BLAKE2B_CBLOCK], size_t i) { + return CRYPTO_load_u64_le(block + 8 * i); +} + +static void blake2b_transform(BLAKE2B_CTX *b2b, + const uint8_t block[BLAKE2B_CBLOCK], + size_t num_bytes, int is_final_block) { + // https://tools.ietf.org/html/rfc7693#section-3.2 + uint64_t v[16]; + static_assert(sizeof(v) == sizeof(b2b->h) + sizeof(kIV)); + OPENSSL_memcpy(v, b2b->h, sizeof(b2b->h)); + 
OPENSSL_memcpy(&v[8], kIV, sizeof(kIV)); + + b2b->t_low += num_bytes; + if (b2b->t_low < num_bytes) { + b2b->t_high++; + } + v[12] ^= b2b->t_low; + v[13] ^= b2b->t_high; + + if (is_final_block) { + v[14] = ~v[14]; + } + + for (int round = 0; round < 12; round++) { + const uint8_t *const s = &kSigma[16 * (round % 10)]; + blake2b_mix(v, 0, 4, 8, 12, blake2b_load(block, s[0]), + blake2b_load(block, s[1])); + blake2b_mix(v, 1, 5, 9, 13, blake2b_load(block, s[2]), + blake2b_load(block, s[3])); + blake2b_mix(v, 2, 6, 10, 14, blake2b_load(block, s[4]), + blake2b_load(block, s[5])); + blake2b_mix(v, 3, 7, 11, 15, blake2b_load(block, s[6]), + blake2b_load(block, s[7])); + blake2b_mix(v, 0, 5, 10, 15, blake2b_load(block, s[8]), + blake2b_load(block, s[9])); + blake2b_mix(v, 1, 6, 11, 12, blake2b_load(block, s[10]), + blake2b_load(block, s[11])); + blake2b_mix(v, 2, 7, 8, 13, blake2b_load(block, s[12]), + blake2b_load(block, s[13])); + blake2b_mix(v, 3, 4, 9, 14, blake2b_load(block, s[14]), + blake2b_load(block, s[15])); + } + + for (size_t i = 0; i < std::size(b2b->h); i++) { + b2b->h[i] ^= v[i]; + b2b->h[i] ^= v[i + 8]; + } +} + +void BLAKE2B256_Init(BLAKE2B_CTX *b2b) { + OPENSSL_memset(b2b, 0, sizeof(BLAKE2B_CTX)); + + static_assert(sizeof(kIV) == sizeof(b2b->h)); + OPENSSL_memcpy(&b2b->h, kIV, sizeof(kIV)); + + // https://tools.ietf.org/html/rfc7693#section-2.5 + b2b->h[0] ^= 0x01010000 | BLAKE2B256_DIGEST_LENGTH; +} + +void BLAKE2B256_Update(BLAKE2B_CTX *b2b, const void *in_data, size_t len) { + if (len == 0) { + // Work around a C language bug. See https://crbug.com/1019588. + return; + } + + const uint8_t *data = reinterpret_cast(in_data); + size_t todo = sizeof(b2b->block) - b2b->block_used; + if (todo > len) { + todo = len; + } + OPENSSL_memcpy(&b2b->block[b2b->block_used], data, todo); + b2b->block_used += todo; + data += todo; + len -= todo; + + if (!len) { + return; + } + + // More input remains therefore we must have filled |b2b->block|. 
+ assert(b2b->block_used == BLAKE2B_CBLOCK); + blake2b_transform(b2b, b2b->block, BLAKE2B_CBLOCK, + /*is_final_block=*/0); + b2b->block_used = 0; + + while (len > BLAKE2B_CBLOCK) { + blake2b_transform(b2b, data, BLAKE2B_CBLOCK, /*is_final_block=*/0); + data += BLAKE2B_CBLOCK; + len -= BLAKE2B_CBLOCK; + } + + OPENSSL_memcpy(b2b->block, data, len); + b2b->block_used = len; +} + +void BLAKE2B256_Final(uint8_t out[BLAKE2B256_DIGEST_LENGTH], BLAKE2B_CTX *b2b) { + OPENSSL_memset(&b2b->block[b2b->block_used], 0, + sizeof(b2b->block) - b2b->block_used); + blake2b_transform(b2b, b2b->block, b2b->block_used, + /*is_final_block=*/1); + static_assert(BLAKE2B256_DIGEST_LENGTH <= sizeof(b2b->h)); + memcpy(out, b2b->h, BLAKE2B256_DIGEST_LENGTH); +} + +void BLAKE2B256(const uint8_t *data, size_t len, + uint8_t out[BLAKE2B256_DIGEST_LENGTH]) { + BLAKE2B_CTX ctx; + BLAKE2B256_Init(&ctx); + BLAKE2B256_Update(&ctx, data, len); + BLAKE2B256_Final(out, &ctx); +} diff --git a/third_party/boringssl/src/crypto/bn/bn_asn1.cc b/third_party/boringssl/src/crypto/bn/bn_asn1.cc new file mode 100644 index 00000000..71b7ae1b --- /dev/null +++ b/third_party/boringssl/src/crypto/bn/bn_asn1.cc @@ -0,0 +1,57 @@ +// Copyright 2015 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include + + +int BN_parse_asn1_unsigned(CBS *cbs, BIGNUM *ret) { + CBS child; + int is_negative; + if (!CBS_get_asn1(cbs, &child, CBS_ASN1_INTEGER) || + !CBS_is_valid_asn1_integer(&child, &is_negative)) { + OPENSSL_PUT_ERROR(BN, BN_R_BAD_ENCODING); + return 0; + } + + if (is_negative) { + OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER); + return 0; + } + + return BN_bin2bn(CBS_data(&child), CBS_len(&child), ret) != nullptr; +} + +int BN_marshal_asn1(CBB *cbb, const BIGNUM *bn) { + // Negative numbers are unsupported. + if (BN_is_negative(bn)) { + OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER); + return 0; + } + + CBB child; + if (!CBB_add_asn1(cbb, &child, CBS_ASN1_INTEGER) || + // The number must be padded with a leading zero if the high bit would + // otherwise be set or if |bn| is zero. + (BN_num_bits(bn) % 8 == 0 && !CBB_add_u8(&child, 0x00)) || + !BN_bn2cbb_padded(&child, BN_num_bytes(bn), bn) || + !CBB_flush(cbb)) { + OPENSSL_PUT_ERROR(BN, BN_R_ENCODE_ERROR); + return 0; + } + + return 1; +} diff --git a/third_party/boringssl/src/crypto/bn/convert.cc b/third_party/boringssl/src/crypto/bn/convert.cc new file mode 100644 index 00000000..d41453ad --- /dev/null +++ b/third_party/boringssl/src/crypto/bn/convert.cc @@ -0,0 +1,409 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include "../fipsmodule/bn/internal.h" + + +using namespace bssl; + +int BN_bn2cbb_padded(CBB *out, size_t len, const BIGNUM *in) { + uint8_t *ptr; + return CBB_add_space(out, &ptr, len) && BN_bn2bin_padded(ptr, len, in); +} + +static const char hextable[] = "0123456789abcdef"; + +char *BN_bn2hex(const BIGNUM *bn) { + int width = bn_minimal_width(bn); + char *buf = reinterpret_cast( + OPENSSL_malloc(1 /* leading '-' */ + 1 /* zero is non-empty */ + + width * BN_BYTES * 2 + 1 /* trailing NUL */)); + if (buf == nullptr) { + return nullptr; + } + + char *p = buf; + if (bn->neg) { + *(p++) = '-'; + } + + if (BN_is_zero(bn)) { + *(p++) = '0'; + } + + int z = 0; + for (int i = width - 1; i >= 0; i--) { + for (int j = BN_BITS2 - 8; j >= 0; j -= 8) { + // strip leading zeros + int v = ((int)(bn->d[i] >> (long)j)) & 0xff; + if (z || v != 0) { + *(p++) = hextable[v >> 4]; + *(p++) = hextable[v & 0x0f]; + z = 1; + } + } + } + *p = '\0'; + + return buf; +} + +// decode_hex decodes |in_len| bytes of hex data from |in| and updates |bn|. +static int decode_hex(BIGNUM *bn, const char *in, int in_len) { + if (in_len > INT_MAX / 4) { + OPENSSL_PUT_ERROR(BN, BN_R_BIGNUM_TOO_LONG); + return 0; + } + // |in_len| is the number of hex digits. + if (!bn_expand(bn, in_len * 4)) { + return 0; + } + + int i = 0; + while (in_len > 0) { + // Decode one |BN_ULONG| at a time. + int todo = BN_BYTES * 2; + if (todo > in_len) { + todo = in_len; + } + + BN_ULONG word = 0; + int j; + for (j = todo; j > 0; j--) { + uint8_t hex = 0; + if (!OPENSSL_fromxdigit(&hex, in[in_len - j])) { + // This shouldn't happen. The caller checks |OPENSSL_isxdigit|. + assert(0); + } + word = (word << 4) | hex; + } + + bn->d[i++] = word; + in_len -= todo; + } + assert(i <= bn->dmax); + bn->width = i; + return 1; +} + +// decode_dec decodes |in_len| bytes of decimal data from |in| and updates |bn|. 
+static int decode_dec(BIGNUM *bn, const char *in, int in_len) { + int i, j; + BN_ULONG l = 0; + + // Decode |BN_DEC_NUM| digits at a time. + j = BN_DEC_NUM - (in_len % BN_DEC_NUM); + if (j == BN_DEC_NUM) { + j = 0; + } + l = 0; + for (i = 0; i < in_len; i++) { + l *= 10; + l += in[i] - '0'; + if (++j == BN_DEC_NUM) { + if (!BN_mul_word(bn, BN_DEC_CONV) || !BN_add_word(bn, l)) { + return 0; + } + l = 0; + j = 0; + } + } + return 1; +} + +typedef int (*decode_func)(BIGNUM *bn, const char *in, int in_len); +typedef int (*char_test_func)(int c); + +static int bn_x2bn(BIGNUM **outp, const char *in, decode_func decode, + char_test_func want_char) { + BIGNUM *ret = nullptr; + int neg = 0, i; + int num; + + if (in == nullptr || *in == 0) { + return 0; + } + + if (*in == '-') { + neg = 1; + in++; + } + + for (i = 0; want_char((unsigned char)in[i]) && i + neg < INT_MAX; i++) { + } + + num = i + neg; + if (outp == nullptr) { + return num; + } + + // in is the start of the hex digits, and it is 'i' long + if (*outp == nullptr) { + ret = BN_new(); + if (ret == nullptr) { + return 0; + } + } else { + ret = *outp; + BN_zero(ret); + } + + if (!decode(ret, in, i)) { + goto err; + } + + bn_set_minimal_width(ret); + if (!BN_is_zero(ret)) { + ret->neg = neg; + } + + *outp = ret; + return num; + +err: + if (*outp == nullptr) { + BN_free(ret); + } + + return 0; +} + +int BN_hex2bn(BIGNUM **outp, const char *in) { + return bn_x2bn(outp, in, decode_hex, OPENSSL_isxdigit); +} + +char *BN_bn2dec(const BIGNUM *a) { + // It is easier to print strings little-endian, so we assemble it in reverse + // and fix at the end. 
+ ScopedCBB cbb; + if (!CBB_init(cbb.get(), 16) || // + !CBB_add_u8(cbb.get(), 0 /* trailing NUL */)) { + return nullptr; + } + + if (BN_is_zero(a)) { + if (!CBB_add_u8(cbb.get(), '0')) { + return nullptr; + } + } else { + UniquePtr copy(BN_dup(a)); + if (copy == nullptr) { + return nullptr; + } + + while (!BN_is_zero(copy.get())) { + BN_ULONG word = BN_div_word(copy.get(), BN_DEC_CONV); + if (word == (BN_ULONG)-1) { + return nullptr; + } + + const int add_leading_zeros = !BN_is_zero(copy.get()); + for (int i = 0; i < BN_DEC_NUM && (add_leading_zeros || word != 0); i++) { + if (!CBB_add_u8(cbb.get(), '0' + word % 10)) { + return nullptr; + } + word /= 10; + } + assert(word == 0); + } + } + + if (BN_is_negative(a) && // + !CBB_add_u8(cbb.get(), '-')) { + return nullptr; + } + + uint8_t *data; + size_t len; + if (!CBB_finish(cbb.get(), &data, &len)) { + return nullptr; + } + + std::reverse(data, data + len); + return reinterpret_cast(data); +} + +int BN_dec2bn(BIGNUM **outp, const char *in) { + return bn_x2bn(outp, in, decode_dec, OPENSSL_isdigit); +} + +int BN_asc2bn(BIGNUM **outp, const char *in) { + const char *const orig_in = in; + if (*in == '-') { + in++; + } + + if (in[0] == '0' && (in[1] == 'X' || in[1] == 'x')) { + if (!BN_hex2bn(outp, in + 2)) { + return 0; + } + } else { + if (!BN_dec2bn(outp, in)) { + return 0; + } + } + + if (*orig_in == '-' && !BN_is_zero(*outp)) { + (*outp)->neg = 1; + } + + return 1; +} + +int BN_print(BIO *bp, const BIGNUM *a) { + if (a->neg && BIO_write(bp, "-", 1) != 1) { + return 0; + } + + if (BN_is_zero(a) && BIO_write(bp, "0", 1) != 1) { + return 0; + } + + int z = 0; + for (int i = bn_minimal_width(a) - 1; i >= 0; i--) { + for (int j = BN_BITS2 - 4; j >= 0; j -= 4) { + // strip leading zeros + int v = ((int)(a->d[i] >> (long)j)) & 0x0f; + if (z || v != 0) { + if (BIO_write(bp, &hextable[v], 1) != 1) { + return 0; + } + z = 1; + } + } + } + return 1; +} + +int BN_print_fp(FILE *fp, const BIGNUM *a) { + BIO *b = BIO_new_fp(fp, 
BIO_NOCLOSE); + if (b == nullptr) { + return 0; + } + + int ret = BN_print(b, a); + BIO_free(b); + return ret; +} + + +size_t BN_bn2mpi(const BIGNUM *in, uint8_t *out) { + const size_t bits = BN_num_bits(in); + const size_t bytes = (bits + 7) / 8; + // If the number of bits is a multiple of 8, i.e. if the MSB is set, + // prefix with a zero byte. + int extend = 0; + if (bytes != 0 && (bits & 0x07) == 0) { + extend = 1; + } + + const size_t len = bytes + extend; + if (len < bytes || 4 + len < len || (len & 0xffffffff) != len) { + // If we cannot represent the number then we emit zero as the interface + // doesn't allow an error to be signalled. + if (out) { + OPENSSL_memset(out, 0, 4); + } + return 4; + } + + if (out == nullptr) { + return 4 + len; + } + + out[0] = len >> 24; + out[1] = len >> 16; + out[2] = len >> 8; + out[3] = len; + if (extend) { + out[4] = 0; + } + BN_bn2bin(in, out + 4 + extend); + if (in->neg && len > 0) { + out[4] |= 0x80; + } + return len + 4; +} + +BIGNUM *BN_mpi2bn(const uint8_t *in, size_t len, BIGNUM *out) { + if (len < 4) { + OPENSSL_PUT_ERROR(BN, BN_R_BAD_ENCODING); + return nullptr; + } + const size_t in_len = ((size_t)in[0] << 24) | // + ((size_t)in[1] << 16) | // + ((size_t)in[2] << 8) | // + ((size_t)in[3]); + if (in_len != len - 4) { + OPENSSL_PUT_ERROR(BN, BN_R_BAD_ENCODING); + return nullptr; + } + + int out_is_alloced = 0; + if (out == nullptr) { + out = BN_new(); + if (out == nullptr) { + return nullptr; + } + out_is_alloced = 1; + } + + if (in_len == 0) { + BN_zero(out); + return out; + } + + in += 4; + if (BN_bin2bn(in, in_len, out) == nullptr) { + if (out_is_alloced) { + BN_free(out); + } + return nullptr; + } + out->neg = ((*in) & 0x80) != 0; + if (out->neg) { + BN_clear_bit(out, BN_num_bits(out) - 1); + } + return out; +} + +int BN_bn2binpad(const BIGNUM *in, uint8_t *out, int len) { + if (len < 0 || // + !BN_bn2bin_padded(out, (size_t)len, in)) { + return -1; + } + return len; +} + +int BN_bn2lebinpad(const BIGNUM *in, 
uint8_t *out, int len) { + if (len < 0 || // + !BN_bn2le_padded(out, (size_t)len, in)) { + return -1; + } + return len; +} diff --git a/third_party/boringssl/src/crypto/bn/div.cc b/third_party/boringssl/src/crypto/bn/div.cc new file mode 100644 index 00000000..824ab75d --- /dev/null +++ b/third_party/boringssl/src/crypto/bn/div.cc @@ -0,0 +1,102 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include "../fipsmodule/bn/internal.h" +#include "../internal.h" + + +using namespace bssl; + +int BN_mod_pow2(BIGNUM *r, const BIGNUM *a, size_t e) { + if (e == 0 || a->width == 0) { + BN_zero(r); + return 1; + } + + size_t num_words = 1 + ((e - 1) / BN_BITS2); + + // If |a| definitely has less than |e| bits, just BN_copy. + if ((size_t)a->width < num_words) { + return BN_copy(r, a) != nullptr; + } + + // Otherwise, first make sure we have enough space in |r|. + // Note that this will fail if num_words > INT_MAX. + if (!bn_wexpand(r, num_words)) { + return 0; + } + + // Copy the content of |a| into |r|. + OPENSSL_memcpy(r->d, a->d, num_words * sizeof(BN_ULONG)); + + // If |e| isn't word-aligned, we have to mask off some of our bits. + size_t top_word_exponent = e % (sizeof(BN_ULONG) * 8); + if (top_word_exponent != 0) { + r->d[num_words - 1] &= (((BN_ULONG)1) << top_word_exponent) - 1; + } + + // Fill in the remaining fields of |r|. 
+ r->neg = a->neg; + r->width = (int)num_words; + bn_set_minimal_width(r); + return 1; +} + +int BN_nnmod_pow2(BIGNUM *r, const BIGNUM *a, size_t e) { + if (!BN_mod_pow2(r, a, e)) { + return 0; + } + + // If the returned value was non-negative, we're done. + if (BN_is_zero(r) || !r->neg) { + return 1; + } + + size_t num_words = 1 + (e - 1) / BN_BITS2; + + // Expand |r| to the size of our modulus. + if (!bn_wexpand(r, num_words)) { + return 0; + } + + // Clear the upper words of |r|. + OPENSSL_memset(&r->d[r->width], 0, (num_words - r->width) * BN_BYTES); + + // Set parameters of |r|. + r->neg = 0; + r->width = (int)num_words; + + // Now, invert every word. The idea here is that we want to compute 2^e-|x|, + // which is actually equivalent to the twos-complement representation of |x| + // in |e| bits, which is -x = ~x + 1. + for (int i = 0; i < r->width; i++) { + r->d[i] = ~r->d[i]; + } + + // If our exponent doesn't span the top word, we have to mask the rest. + size_t top_word_exponent = e % BN_BITS2; + if (top_word_exponent != 0) { + r->d[r->width - 1] &= (((BN_ULONG)1) << top_word_exponent) - 1; + } + + // Keep the minimal-width invariant for |BIGNUM|. + bn_set_minimal_width(r); + + // Finally, add one, for the reason described above. + return BN_add(r, r, BN_value_one()); +} diff --git a/third_party/boringssl/src/crypto/bn/exponentiation.cc b/third_party/boringssl/src/crypto/bn/exponentiation.cc new file mode 100644 index 00000000..f4a4b406 --- /dev/null +++ b/third_party/boringssl/src/crypto/bn/exponentiation.cc @@ -0,0 +1,168 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include + +#include "../fipsmodule/bn/internal.h" + + +using namespace bssl; + +int BN_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx) { + BN_CTXScope scope(ctx); + BIGNUM *rr; + if (r == a || r == p) { + rr = BN_CTX_get(ctx); + } else { + rr = r; + } + + BIGNUM *v = BN_CTX_get(ctx); + if (rr == nullptr || v == nullptr) { + return 0; + } + + if (BN_copy(v, a) == nullptr) { + return 0; + } + int bits = BN_num_bits(p); + + if (BN_is_odd(p)) { + if (BN_copy(rr, a) == nullptr) { + return 0; + } + } else { + if (!BN_one(rr)) { + return 0; + } + } + + for (int i = 1; i < bits; i++) { + if (!BN_sqr(v, v, ctx)) { + return 0; + } + if (BN_is_bit_set(p, i)) { + if (!BN_mul(rr, rr, v, ctx)) { + return 0; + } + } + } + + if (r != rr && !BN_copy(r, rr)) { + return 0; + } + return 1; +} + +static int mod_exp_even(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, + const BIGNUM *m, BN_CTX *ctx) { + // No cryptographic operations require modular exponentiation with an even + // modulus. We support it for backwards compatibility with any applications + // that may have relied on the operation, but optimize for simplicity over + // performance with straightforward square-and-multiply routine. + int bits = BN_num_bits(p); + if (bits == 0) { + return BN_one(r); + } + + // Make a copy of |a|, in case it aliases |r|. 
+ BN_CTXScope scope(ctx); + BIGNUM *tmp = BN_CTX_get(ctx); + if (tmp == nullptr || !BN_copy(tmp, a)) { + return 0; + } + + assert(BN_is_bit_set(p, bits - 1)); + if (!BN_copy(r, tmp)) { + return 0; + } + + for (int i = bits - 2; i >= 0; i--) { + if (!BN_mod_sqr(r, r, m, ctx) || + (BN_is_bit_set(p, i) && !BN_mod_mul(r, r, tmp, m, ctx))) { + return 0; + } + } + + return 1; +} + +int BN_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m, + BN_CTX *ctx) { + if (m->neg) { + OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER); + return 0; + } + if (a->neg || BN_ucmp(a, m) >= 0) { + if (!BN_nnmod(r, a, m, ctx)) { + return 0; + } + a = r; + } + + if (BN_is_odd(m)) { + return BN_mod_exp_mont(r, a, p, m, ctx, nullptr); + } + + return mod_exp_even(r, a, p, m, ctx); +} + +int BN_mod_exp_mont_word(BIGNUM *rr, BN_ULONG a, const BIGNUM *p, + const BIGNUM *m, BN_CTX *ctx, + const BN_MONT_CTX *mont) { + // BN_mod_exp_mont requires reduced inputs. + if (bn_minimal_width(m) == 1) { + a %= m->d[0]; + } + + UniquePtr a_bignum(BN_new()); + if (a_bignum == nullptr || !BN_set_word(a_bignum.get(), a)) { + OPENSSL_PUT_ERROR(BN, ERR_R_INTERNAL_ERROR); + return 0; + } + + return BN_mod_exp_mont(rr, a_bignum.get(), p, m, ctx, mont); +} + +int BN_mod_exp2_mont(BIGNUM *rr, const BIGNUM *a1, const BIGNUM *p1, + const BIGNUM *a2, const BIGNUM *p2, const BIGNUM *m, + BN_CTX *ctx, const BN_MONT_CTX *mont) { + // Allocate a montgomery context if it was not supplied by the caller. + UniquePtr new_mont; + if (mont == nullptr) { + new_mont.reset(BN_MONT_CTX_new_for_modulus(m, ctx)); + if (new_mont == nullptr) { + return 0; + } + mont = new_mont.get(); + } + + // BN_mod_mul_montgomery removes one Montgomery factor, so passing one + // Montgomery-encoded and one non-Montgomery-encoded value gives a + // non-Montgomery-encoded result. 
+ UniquePtr tmp(BN_new()); + if (tmp == nullptr || // + !BN_mod_exp_mont(rr, a1, p1, m, ctx, mont) || + !BN_mod_exp_mont(tmp.get(), a2, p2, m, ctx, mont) || + !BN_to_montgomery(rr, rr, mont, ctx) || + !BN_mod_mul_montgomery(rr, rr, tmp.get(), mont, ctx)) { + return 0; + } + + return 1; +} diff --git a/third_party/boringssl/src/crypto/bn/sqrt.cc b/third_party/boringssl/src/crypto/bn/sqrt.cc new file mode 100644 index 00000000..56a0f003 --- /dev/null +++ b/third_party/boringssl/src/crypto/bn/sqrt.cc @@ -0,0 +1,93 @@ +// Copyright 2000-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + + +int BN_sqrt(BIGNUM *out_sqrt, const BIGNUM *in, BN_CTX *ctx) { + BIGNUM *estimate, *tmp, *delta, *last_delta, *tmp2; + int ok = 0, last_delta_valid = 0; + + if (in->neg) { + OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER); + return 0; + } + if (BN_is_zero(in)) { + BN_zero(out_sqrt); + return 1; + } + + bssl::BN_CTXScope scope(ctx); + if (out_sqrt == in) { + estimate = BN_CTX_get(ctx); + } else { + estimate = out_sqrt; + } + tmp = BN_CTX_get(ctx); + last_delta = BN_CTX_get(ctx); + delta = BN_CTX_get(ctx); + if (estimate == nullptr || tmp == nullptr || last_delta == nullptr || + delta == nullptr) { + goto err; + } + + // We estimate that the square root of an n-bit number is 2^{n/2}. 
+ if (!BN_lshift(estimate, BN_value_one(), BN_num_bits(in)/2)) { + goto err; + } + + // This is Newton's method for finding a root of the equation |estimate|^2 - + // |in| = 0. + for (;;) { + // |estimate| = 1/2 * (|estimate| + |in|/|estimate|) + if (!BN_div(tmp, nullptr, in, estimate, ctx) || + !BN_add(tmp, tmp, estimate) || !BN_rshift1(estimate, tmp) || + // |tmp| = |estimate|^2 + !BN_sqr(tmp, estimate, ctx) || + // |delta| = |in| - |tmp| + !BN_sub(delta, in, tmp)) { + OPENSSL_PUT_ERROR(BN, ERR_R_BN_LIB); + goto err; + } + + delta->neg = 0; + // The difference between |in| and |estimate| squared is required to always + // decrease. This ensures that the loop always terminates, but I don't have + // a proof that it always finds the square root for a given square. + if (last_delta_valid && BN_cmp(delta, last_delta) >= 0) { + break; + } + + last_delta_valid = 1; + + tmp2 = last_delta; + last_delta = delta; + delta = tmp2; + } + + if (BN_cmp(tmp, in) != 0) { + OPENSSL_PUT_ERROR(BN, BN_R_NOT_A_SQUARE); + goto err; + } + + ok = 1; + +err: + if (ok && out_sqrt == in && !BN_copy(out_sqrt, estimate)) { + ok = 0; + } + return ok; +} diff --git a/third_party/boringssl/src/crypto/bn_extra/bn_asn1.c b/third_party/boringssl/src/crypto/bn_extra/bn_asn1.c deleted file mode 100644 index a8333d41..00000000 --- a/third_party/boringssl/src/crypto/bn_extra/bn_asn1.c +++ /dev/null @@ -1,57 +0,0 @@ -/* Copyright (c) 2015, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include - -#include -#include - - -int BN_parse_asn1_unsigned(CBS *cbs, BIGNUM *ret) { - CBS child; - int is_negative; - if (!CBS_get_asn1(cbs, &child, CBS_ASN1_INTEGER) || - !CBS_is_valid_asn1_integer(&child, &is_negative)) { - OPENSSL_PUT_ERROR(BN, BN_R_BAD_ENCODING); - return 0; - } - - if (is_negative) { - OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER); - return 0; - } - - return BN_bin2bn(CBS_data(&child), CBS_len(&child), ret) != NULL; -} - -int BN_marshal_asn1(CBB *cbb, const BIGNUM *bn) { - // Negative numbers are unsupported. - if (BN_is_negative(bn)) { - OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER); - return 0; - } - - CBB child; - if (!CBB_add_asn1(cbb, &child, CBS_ASN1_INTEGER) || - // The number must be padded with a leading zero if the high bit would - // otherwise be set or if |bn| is zero. - (BN_num_bits(bn) % 8 == 0 && !CBB_add_u8(&child, 0x00)) || - !BN_bn2cbb_padded(&child, BN_num_bytes(bn), bn) || - !CBB_flush(cbb)) { - OPENSSL_PUT_ERROR(BN, BN_R_ENCODE_ERROR); - return 0; - } - - return 1; -} diff --git a/third_party/boringssl/src/crypto/bn_extra/convert.c b/third_party/boringssl/src/crypto/bn_extra/convert.c deleted file mode 100644 index 6e930fc6..00000000 --- a/third_party/boringssl/src/crypto/bn_extra/convert.c +++ /dev/null @@ -1,470 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. 
The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. 
If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
*/ - -#include - -#include -#include -#include -#include - -#include -#include -#include -#include - -#include "../fipsmodule/bn/internal.h" - - -int BN_bn2cbb_padded(CBB *out, size_t len, const BIGNUM *in) { - uint8_t *ptr; - return CBB_add_space(out, &ptr, len) && BN_bn2bin_padded(ptr, len, in); -} - -static const char hextable[] = "0123456789abcdef"; - -char *BN_bn2hex(const BIGNUM *bn) { - int width = bn_minimal_width(bn); - char *buf = OPENSSL_malloc(1 /* leading '-' */ + 1 /* zero is non-empty */ + - width * BN_BYTES * 2 + 1 /* trailing NUL */); - if (buf == NULL) { - OPENSSL_PUT_ERROR(BN, ERR_R_MALLOC_FAILURE); - return NULL; - } - - char *p = buf; - if (bn->neg) { - *(p++) = '-'; - } - - if (BN_is_zero(bn)) { - *(p++) = '0'; - } - - int z = 0; - for (int i = width - 1; i >= 0; i--) { - for (int j = BN_BITS2 - 8; j >= 0; j -= 8) { - // strip leading zeros - int v = ((int)(bn->d[i] >> (long)j)) & 0xff; - if (z || v != 0) { - *(p++) = hextable[v >> 4]; - *(p++) = hextable[v & 0x0f]; - z = 1; - } - } - } - *p = '\0'; - - return buf; -} - -// decode_hex decodes |in_len| bytes of hex data from |in| and updates |bn|. -static int decode_hex(BIGNUM *bn, const char *in, int in_len) { - if (in_len > INT_MAX/4) { - OPENSSL_PUT_ERROR(BN, BN_R_BIGNUM_TOO_LONG); - return 0; - } - // |in_len| is the number of hex digits. - if (!bn_expand(bn, in_len * 4)) { - return 0; - } - - int i = 0; - while (in_len > 0) { - // Decode one |BN_ULONG| at a time. - int todo = BN_BYTES * 2; - if (todo > in_len) { - todo = in_len; - } - - BN_ULONG word = 0; - int j; - for (j = todo; j > 0; j--) { - char c = in[in_len - j]; - - BN_ULONG hex; - if (c >= '0' && c <= '9') { - hex = c - '0'; - } else if (c >= 'a' && c <= 'f') { - hex = c - 'a' + 10; - } else if (c >= 'A' && c <= 'F') { - hex = c - 'A' + 10; - } else { - hex = 0; - // This shouldn't happen. The caller checks |isxdigit|. 
- assert(0); - } - word = (word << 4) | hex; - } - - bn->d[i++] = word; - in_len -= todo; - } - assert(i <= bn->dmax); - bn->width = i; - return 1; -} - -// decode_dec decodes |in_len| bytes of decimal data from |in| and updates |bn|. -static int decode_dec(BIGNUM *bn, const char *in, int in_len) { - int i, j; - BN_ULONG l = 0; - - // Decode |BN_DEC_NUM| digits at a time. - j = BN_DEC_NUM - (in_len % BN_DEC_NUM); - if (j == BN_DEC_NUM) { - j = 0; - } - l = 0; - for (i = 0; i < in_len; i++) { - l *= 10; - l += in[i] - '0'; - if (++j == BN_DEC_NUM) { - if (!BN_mul_word(bn, BN_DEC_CONV) || - !BN_add_word(bn, l)) { - return 0; - } - l = 0; - j = 0; - } - } - return 1; -} - -typedef int (*decode_func) (BIGNUM *bn, const char *in, int in_len); -typedef int (*char_test_func) (int c); - -static int bn_x2bn(BIGNUM **outp, const char *in, decode_func decode, char_test_func want_char) { - BIGNUM *ret = NULL; - int neg = 0, i; - int num; - - if (in == NULL || *in == 0) { - return 0; - } - - if (*in == '-') { - neg = 1; - in++; - } - - for (i = 0; want_char((unsigned char)in[i]) && i + neg < INT_MAX; i++) {} - - num = i + neg; - if (outp == NULL) { - return num; - } - - // in is the start of the hex digits, and it is 'i' long - if (*outp == NULL) { - ret = BN_new(); - if (ret == NULL) { - return 0; - } - } else { - ret = *outp; - BN_zero(ret); - } - - if (!decode(ret, in, i)) { - goto err; - } - - bn_set_minimal_width(ret); - if (!BN_is_zero(ret)) { - ret->neg = neg; - } - - *outp = ret; - return num; - -err: - if (*outp == NULL) { - BN_free(ret); - } - - return 0; -} - -int BN_hex2bn(BIGNUM **outp, const char *in) { - return bn_x2bn(outp, in, decode_hex, isxdigit); -} - -char *BN_bn2dec(const BIGNUM *a) { - // It is easier to print strings little-endian, so we assemble it in reverse - // and fix at the end. 
- BIGNUM *copy = NULL; - CBB cbb; - if (!CBB_init(&cbb, 16) || - !CBB_add_u8(&cbb, 0 /* trailing NUL */)) { - goto cbb_err; - } - - if (BN_is_zero(a)) { - if (!CBB_add_u8(&cbb, '0')) { - goto cbb_err; - } - } else { - copy = BN_dup(a); - if (copy == NULL) { - goto err; - } - - while (!BN_is_zero(copy)) { - BN_ULONG word = BN_div_word(copy, BN_DEC_CONV); - if (word == (BN_ULONG)-1) { - goto err; - } - - const int add_leading_zeros = !BN_is_zero(copy); - for (int i = 0; i < BN_DEC_NUM && (add_leading_zeros || word != 0); i++) { - if (!CBB_add_u8(&cbb, '0' + word % 10)) { - goto cbb_err; - } - word /= 10; - } - assert(word == 0); - } - } - - if (BN_is_negative(a) && - !CBB_add_u8(&cbb, '-')) { - goto cbb_err; - } - - uint8_t *data; - size_t len; - if (!CBB_finish(&cbb, &data, &len)) { - goto cbb_err; - } - - // Reverse the buffer. - for (size_t i = 0; i < len/2; i++) { - uint8_t tmp = data[i]; - data[i] = data[len - 1 - i]; - data[len - 1 - i] = tmp; - } - - BN_free(copy); - return (char *)data; - -cbb_err: - OPENSSL_PUT_ERROR(BN, ERR_R_MALLOC_FAILURE); -err: - BN_free(copy); - CBB_cleanup(&cbb); - return NULL; -} - -int BN_dec2bn(BIGNUM **outp, const char *in) { - return bn_x2bn(outp, in, decode_dec, isdigit); -} - -int BN_asc2bn(BIGNUM **outp, const char *in) { - const char *const orig_in = in; - if (*in == '-') { - in++; - } - - if (in[0] == '0' && (in[1] == 'X' || in[1] == 'x')) { - if (!BN_hex2bn(outp, in+2)) { - return 0; - } - } else { - if (!BN_dec2bn(outp, in)) { - return 0; - } - } - - if (*orig_in == '-' && !BN_is_zero(*outp)) { - (*outp)->neg = 1; - } - - return 1; -} - -int BN_print(BIO *bp, const BIGNUM *a) { - int i, j, v, z = 0; - int ret = 0; - - if (a->neg && BIO_write(bp, "-", 1) != 1) { - goto end; - } - - if (BN_is_zero(a) && BIO_write(bp, "0", 1) != 1) { - goto end; - } - - for (i = bn_minimal_width(a) - 1; i >= 0; i--) { - for (j = BN_BITS2 - 4; j >= 0; j -= 4) { - // strip leading zeros - v = ((int)(a->d[i] >> (long)j)) & 0x0f; - if (z || v != 
0) { - if (BIO_write(bp, &hextable[v], 1) != 1) { - goto end; - } - z = 1; - } - } - } - ret = 1; - -end: - return ret; -} - -int BN_print_fp(FILE *fp, const BIGNUM *a) { - BIO *b = BIO_new_fp(fp, BIO_NOCLOSE); - if (b == NULL) { - return 0; - } - - int ret = BN_print(b, a); - BIO_free(b); - return ret; -} - - -size_t BN_bn2mpi(const BIGNUM *in, uint8_t *out) { - const size_t bits = BN_num_bits(in); - const size_t bytes = (bits + 7) / 8; - // If the number of bits is a multiple of 8, i.e. if the MSB is set, - // prefix with a zero byte. - int extend = 0; - if (bytes != 0 && (bits & 0x07) == 0) { - extend = 1; - } - - const size_t len = bytes + extend; - if (len < bytes || - 4 + len < len || - (len & 0xffffffff) != len) { - // If we cannot represent the number then we emit zero as the interface - // doesn't allow an error to be signalled. - if (out) { - OPENSSL_memset(out, 0, 4); - } - return 4; - } - - if (out == NULL) { - return 4 + len; - } - - out[0] = len >> 24; - out[1] = len >> 16; - out[2] = len >> 8; - out[3] = len; - if (extend) { - out[4] = 0; - } - BN_bn2bin(in, out + 4 + extend); - if (in->neg && len > 0) { - out[4] |= 0x80; - } - return len + 4; -} - -BIGNUM *BN_mpi2bn(const uint8_t *in, size_t len, BIGNUM *out) { - if (len < 4) { - OPENSSL_PUT_ERROR(BN, BN_R_BAD_ENCODING); - return NULL; - } - const size_t in_len = ((size_t)in[0] << 24) | - ((size_t)in[1] << 16) | - ((size_t)in[2] << 8) | - ((size_t)in[3]); - if (in_len != len - 4) { - OPENSSL_PUT_ERROR(BN, BN_R_BAD_ENCODING); - return NULL; - } - - int out_is_alloced = 0; - if (out == NULL) { - out = BN_new(); - if (out == NULL) { - OPENSSL_PUT_ERROR(BN, ERR_R_MALLOC_FAILURE); - return NULL; - } - out_is_alloced = 1; - } - - if (in_len == 0) { - BN_zero(out); - return out; - } - - in += 4; - if (BN_bin2bn(in, in_len, out) == NULL) { - if (out_is_alloced) { - BN_free(out); - } - return NULL; - } - out->neg = ((*in) & 0x80) != 0; - if (out->neg) { - BN_clear_bit(out, BN_num_bits(out) - 1); - } - return 
out; -} - -int BN_bn2binpad(const BIGNUM *in, uint8_t *out, int len) { - if (len < 0 || - !BN_bn2bin_padded(out, (size_t)len, in)) { - return -1; - } - return len; -} diff --git a/third_party/boringssl/src/crypto/buf/buf.c b/third_party/boringssl/src/crypto/buf/buf.c deleted file mode 100644 index bd97dd34..00000000 --- a/third_party/boringssl/src/crypto/buf/buf.c +++ /dev/null @@ -1,172 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. 
All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
*/ - -#include - -#include - -#include -#include - -#include "../internal.h" - - -BUF_MEM *BUF_MEM_new(void) { - BUF_MEM *ret; - - ret = OPENSSL_malloc(sizeof(BUF_MEM)); - if (ret == NULL) { - OPENSSL_PUT_ERROR(BUF, ERR_R_MALLOC_FAILURE); - return NULL; - } - - OPENSSL_memset(ret, 0, sizeof(BUF_MEM)); - return ret; -} - -void BUF_MEM_free(BUF_MEM *buf) { - if (buf == NULL) { - return; - } - - OPENSSL_free(buf->data); - OPENSSL_free(buf); -} - -int BUF_MEM_reserve(BUF_MEM *buf, size_t cap) { - if (buf->max >= cap) { - return 1; - } - - size_t n = cap + 3; - if (n < cap) { - // overflow - OPENSSL_PUT_ERROR(BUF, ERR_R_MALLOC_FAILURE); - return 0; - } - n = n / 3; - size_t alloc_size = n * 4; - if (alloc_size / 4 != n) { - // overflow - OPENSSL_PUT_ERROR(BUF, ERR_R_MALLOC_FAILURE); - return 0; - } - - char *new_buf = OPENSSL_realloc(buf->data, alloc_size); - if (new_buf == NULL) { - OPENSSL_PUT_ERROR(BUF, ERR_R_MALLOC_FAILURE); - return 0; - } - - buf->data = new_buf; - buf->max = alloc_size; - return 1; -} - -size_t BUF_MEM_grow(BUF_MEM *buf, size_t len) { - if (!BUF_MEM_reserve(buf, len)) { - return 0; - } - if (buf->length < len) { - OPENSSL_memset(&buf->data[buf->length], 0, len - buf->length); - } - buf->length = len; - return len; -} - -size_t BUF_MEM_grow_clean(BUF_MEM *buf, size_t len) { - return BUF_MEM_grow(buf, len); -} - -int BUF_MEM_append(BUF_MEM *buf, const void *in, size_t len) { - // Work around a C language bug. See https://crbug.com/1019588. 
- if (len == 0) { - return 1; - } - size_t new_len = buf->length + len; - if (new_len < len) { - OPENSSL_PUT_ERROR(BUF, ERR_R_OVERFLOW); - return 0; - } - if (!BUF_MEM_reserve(buf, new_len)) { - return 0; - } - OPENSSL_memcpy(buf->data + buf->length, in, len); - buf->length = new_len; - return 1; -} - -char *BUF_strdup(const char *str) { return OPENSSL_strdup(str); } - -size_t BUF_strnlen(const char *str, size_t max_len) { - return OPENSSL_strnlen(str, max_len); -} - -char *BUF_strndup(const char *str, size_t size) { - return OPENSSL_strndup(str, size); -} - -size_t BUF_strlcpy(char *dst, const char *src, size_t dst_size) { - return OPENSSL_strlcpy(dst, src, dst_size); -} - -size_t BUF_strlcat(char *dst, const char *src, size_t dst_size) { - return OPENSSL_strlcat(dst, src, dst_size); -} - -void *BUF_memdup(const void *data, size_t size) { - return OPENSSL_memdup(data, size); -} diff --git a/third_party/boringssl/src/crypto/buf/buf.cc b/third_party/boringssl/src/crypto/buf/buf.cc new file mode 100644 index 00000000..9109241a --- /dev/null +++ b/third_party/boringssl/src/crypto/buf/buf.cc @@ -0,0 +1,119 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include + +#include +#include + +#include "../internal.h" +#include "../mem_internal.h" + + +using namespace bssl; + +BUF_MEM *BUF_MEM_new() { return New(); } + +void BUF_MEM_free(BUF_MEM *buf) { + if (buf == nullptr) { + return; + } + OPENSSL_free(buf->data); + Delete(buf); +} + +int BUF_MEM_reserve(BUF_MEM *buf, size_t cap) { + if (buf->max >= cap) { + return 1; + } + + size_t n = cap + 3; + if (n < cap) { + OPENSSL_PUT_ERROR(BUF, ERR_R_OVERFLOW); + return 0; + } + n = n / 3; + size_t alloc_size = n * 4; + if (alloc_size / 4 != n) { + OPENSSL_PUT_ERROR(BUF, ERR_R_OVERFLOW); + return 0; + } + + char *new_buf = + reinterpret_cast(OPENSSL_realloc(buf->data, alloc_size)); + if (new_buf == nullptr) { + return 0; + } + + buf->data = new_buf; + buf->max = alloc_size; + return 1; +} + +size_t BUF_MEM_grow(BUF_MEM *buf, size_t len) { + if (!BUF_MEM_reserve(buf, len)) { + return 0; + } + if (buf->length < len) { + OPENSSL_memset(&buf->data[buf->length], 0, len - buf->length); + } + buf->length = len; + return len; +} + +size_t BUF_MEM_grow_clean(BUF_MEM *buf, size_t len) { + return BUF_MEM_grow(buf, len); +} + +int BUF_MEM_append(BUF_MEM *buf, const void *in, size_t len) { + // Work around a C language bug. See https://crbug.com/1019588. 
+ if (len == 0) { + return 1; + } + size_t new_len = buf->length + len; + if (new_len < len) { + OPENSSL_PUT_ERROR(BUF, ERR_R_OVERFLOW); + return 0; + } + if (!BUF_MEM_reserve(buf, new_len)) { + return 0; + } + OPENSSL_memcpy(buf->data + buf->length, in, len); + buf->length = new_len; + return 1; +} + +char *BUF_strdup(const char *str) { return OPENSSL_strdup(str); } + +size_t BUF_strnlen(const char *str, size_t max_len) { + return OPENSSL_strnlen(str, max_len); +} + +char *BUF_strndup(const char *str, size_t size) { + return OPENSSL_strndup(str, size); +} + +size_t BUF_strlcpy(char *dst, const char *src, size_t dst_size) { + return OPENSSL_strlcpy(dst, src, dst_size); +} + +size_t BUF_strlcat(char *dst, const char *src, size_t dst_size) { + return OPENSSL_strlcat(dst, src, dst_size); +} + +void *BUF_memdup(const void *data, size_t size) { + return OPENSSL_memdup(data, size); +} diff --git a/third_party/boringssl/src/crypto/bytestring/asn1_compat.c b/third_party/boringssl/src/crypto/bytestring/asn1_compat.c deleted file mode 100644 index 50df9cce..00000000 --- a/third_party/boringssl/src/crypto/bytestring/asn1_compat.c +++ /dev/null @@ -1,52 +0,0 @@ -/* Copyright (c) 2016, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ - - -#include - -#include -#include -#include - -#include - -#include "internal.h" -#include "../internal.h" - - -int CBB_finish_i2d(CBB *cbb, uint8_t **outp) { - assert(cbb->base->can_resize); - - uint8_t *der; - size_t der_len; - if (!CBB_finish(cbb, &der, &der_len)) { - CBB_cleanup(cbb); - return -1; - } - if (der_len > INT_MAX) { - OPENSSL_free(der); - return -1; - } - if (outp != NULL) { - if (*outp == NULL) { - *outp = der; - der = NULL; - } else { - OPENSSL_memcpy(*outp, der, der_len); - *outp += der_len; - } - } - OPENSSL_free(der); - return (int)der_len; -} diff --git a/third_party/boringssl/src/crypto/bytestring/asn1_compat.cc b/third_party/boringssl/src/crypto/bytestring/asn1_compat.cc new file mode 100644 index 00000000..0ce6055d --- /dev/null +++ b/third_party/boringssl/src/crypto/bytestring/asn1_compat.cc @@ -0,0 +1,55 @@ +// Copyright 2016 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +#include + +#include +#include +#include + +#include + +#include "internal.h" +#include "../internal.h" + + +using namespace bssl; + +int bssl::CBB_finish_i2d(CBB *cbb, uint8_t **outp) { + assert(!cbb->is_child); + assert(cbb->u.base.can_resize); + + uint8_t *der; + size_t der_len; + if (!CBB_finish(cbb, &der, &der_len)) { + CBB_cleanup(cbb); + return -1; + } + if (der_len > INT_MAX) { + OPENSSL_free(der); + return -1; + } + if (outp != nullptr) { + if (*outp == nullptr) { + *outp = der; + der = nullptr; + } else { + OPENSSL_memcpy(*outp, der, der_len); + *outp += der_len; + } + } + OPENSSL_free(der); + return (int)der_len; +} diff --git a/third_party/boringssl/src/crypto/bytestring/ber.c b/third_party/boringssl/src/crypto/bytestring/ber.c deleted file mode 100644 index dc707b93..00000000 --- a/third_party/boringssl/src/crypto/bytestring/ber.c +++ /dev/null @@ -1,266 +0,0 @@ -/* Copyright (c) 2014, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include - -#include -#include - -#include "internal.h" -#include "../internal.h" - - -// kMaxDepth is a just a sanity limit. The code should be such that the length -// of the input being processes always decreases. None the less, a very large -// input could otherwise cause the stack to overflow. 
-static const unsigned kMaxDepth = 2048; - -// is_string_type returns one if |tag| is a string type and zero otherwise. It -// ignores the constructed bit. -static int is_string_type(unsigned tag) { - // While BER supports constructed BIT STRINGS, OpenSSL misparses them. To - // avoid acting on an ambiguous input, we do not support constructed BIT - // STRINGS. See https://github.com/openssl/openssl/issues/12810. - switch (tag & ~CBS_ASN1_CONSTRUCTED) { - case CBS_ASN1_OCTETSTRING: - case CBS_ASN1_UTF8STRING: - case CBS_ASN1_NUMERICSTRING: - case CBS_ASN1_PRINTABLESTRING: - case CBS_ASN1_T61STRING: - case CBS_ASN1_VIDEOTEXSTRING: - case CBS_ASN1_IA5STRING: - case CBS_ASN1_GRAPHICSTRING: - case CBS_ASN1_VISIBLESTRING: - case CBS_ASN1_GENERALSTRING: - case CBS_ASN1_UNIVERSALSTRING: - case CBS_ASN1_BMPSTRING: - return 1; - default: - return 0; - } -} - -// cbs_find_ber walks an ASN.1 structure in |orig_in| and sets |*ber_found| -// depending on whether an indefinite length element or constructed string was -// found. The value of |orig_in| is not changed. It returns one on success (i.e. -// |*ber_found| was set) and zero on error. -static int cbs_find_ber(const CBS *orig_in, int *ber_found, unsigned depth) { - CBS in; - - if (depth > kMaxDepth) { - return 0; - } - - CBS_init(&in, CBS_data(orig_in), CBS_len(orig_in)); - *ber_found = 0; - - while (CBS_len(&in) > 0) { - CBS contents; - unsigned tag; - size_t header_len; - int indefinite; - if (!CBS_get_any_ber_asn1_element(&in, &contents, &tag, &header_len, - ber_found, &indefinite)) { - return 0; - } - if (*ber_found) { - return 1; - } - if (tag & CBS_ASN1_CONSTRUCTED) { - if (is_string_type(tag)) { - // Constructed strings are only legal in BER and require conversion. 
- *ber_found = 1; - return 1; - } - if (!CBS_skip(&contents, header_len) || - !cbs_find_ber(&contents, ber_found, depth + 1)) { - return 0; - } - } - } - - return 1; -} - -// cbs_get_eoc returns one if |cbs| begins with an "end of contents" (EOC) value -// and zero otherwise. If an EOC was found, it advances |cbs| past it. -static int cbs_get_eoc(CBS *cbs) { - if (CBS_len(cbs) >= 2 && - CBS_data(cbs)[0] == 0 && CBS_data(cbs)[1] == 0) { - return CBS_skip(cbs, 2); - } - return 0; -} - -// cbs_convert_ber reads BER data from |in| and writes DER data to |out|. If -// |string_tag| is non-zero, then all elements must match |string_tag| up to the -// constructed bit and primitive element bodies are written to |out| without -// element headers. This is used when concatenating the fragments of a -// constructed string. If |looking_for_eoc| is set then any EOC elements found -// will cause the function to return after consuming it. It returns one on -// success and zero on error. -static int cbs_convert_ber(CBS *in, CBB *out, unsigned string_tag, - char looking_for_eoc, unsigned depth) { - assert(!(string_tag & CBS_ASN1_CONSTRUCTED)); - - if (depth > kMaxDepth) { - return 0; - } - - while (CBS_len(in) > 0) { - if (looking_for_eoc && cbs_get_eoc(in)) { - return 1; - } - - CBS contents; - unsigned tag, child_string_tag = string_tag; - size_t header_len; - int indefinite; - CBB *out_contents, out_contents_storage; - if (!CBS_get_any_ber_asn1_element(in, &contents, &tag, &header_len, - /*out_ber_found=*/NULL, &indefinite)) { - return 0; - } - - if (string_tag != 0) { - // This is part of a constructed string. All elements must match - // |string_tag| up to the constructed bit and get appended to |out| - // without a child element. 
- if ((tag & ~CBS_ASN1_CONSTRUCTED) != string_tag) { - return 0; - } - out_contents = out; - } else { - unsigned out_tag = tag; - if ((tag & CBS_ASN1_CONSTRUCTED) && is_string_type(tag)) { - // If a constructed string, clear the constructed bit and inform - // children to concatenate bodies. - out_tag &= ~CBS_ASN1_CONSTRUCTED; - child_string_tag = out_tag; - } - if (!CBB_add_asn1(out, &out_contents_storage, out_tag)) { - return 0; - } - out_contents = &out_contents_storage; - } - - if (indefinite) { - if (!cbs_convert_ber(in, out_contents, child_string_tag, - /*looking_for_eoc=*/1, depth + 1) || - !CBB_flush(out)) { - return 0; - } - continue; - } - - if (!CBS_skip(&contents, header_len)) { - return 0; - } - - if (tag & CBS_ASN1_CONSTRUCTED) { - // Recurse into children. - if (!cbs_convert_ber(&contents, out_contents, child_string_tag, - /*looking_for_eoc=*/0, depth + 1)) { - return 0; - } - } else { - // Copy primitive contents as-is. - if (!CBB_add_bytes(out_contents, CBS_data(&contents), - CBS_len(&contents))) { - return 0; - } - } - - if (!CBB_flush(out)) { - return 0; - } - } - - return looking_for_eoc == 0; -} - -int CBS_asn1_ber_to_der(CBS *in, CBS *out, uint8_t **out_storage) { - CBB cbb; - - // First, do a quick walk to find any indefinite-length elements. Most of the - // time we hope that there aren't any and thus we can quickly return. 
- int conversion_needed; - if (!cbs_find_ber(in, &conversion_needed, 0)) { - return 0; - } - - if (!conversion_needed) { - if (!CBS_get_any_asn1_element(in, out, NULL, NULL)) { - return 0; - } - *out_storage = NULL; - return 1; - } - - size_t len; - if (!CBB_init(&cbb, CBS_len(in)) || - !cbs_convert_ber(in, &cbb, 0, 0, 0) || - !CBB_finish(&cbb, out_storage, &len)) { - CBB_cleanup(&cbb); - return 0; - } - - CBS_init(out, *out_storage, len); - return 1; -} - -int CBS_get_asn1_implicit_string(CBS *in, CBS *out, uint8_t **out_storage, - unsigned outer_tag, unsigned inner_tag) { - assert(!(outer_tag & CBS_ASN1_CONSTRUCTED)); - assert(!(inner_tag & CBS_ASN1_CONSTRUCTED)); - assert(is_string_type(inner_tag)); - - if (CBS_peek_asn1_tag(in, outer_tag)) { - // Normal implicitly-tagged string. - *out_storage = NULL; - return CBS_get_asn1(in, out, outer_tag); - } - - // Otherwise, try to parse an implicitly-tagged constructed string. - // |CBS_asn1_ber_to_der| is assumed to have run, so only allow one level deep - // of nesting. 
- CBB result; - CBS child; - if (!CBB_init(&result, CBS_len(in)) || - !CBS_get_asn1(in, &child, outer_tag | CBS_ASN1_CONSTRUCTED)) { - goto err; - } - - while (CBS_len(&child) > 0) { - CBS chunk; - if (!CBS_get_asn1(&child, &chunk, inner_tag) || - !CBB_add_bytes(&result, CBS_data(&chunk), CBS_len(&chunk))) { - goto err; - } - } - - uint8_t *data; - size_t len; - if (!CBB_finish(&result, &data, &len)) { - goto err; - } - - CBS_init(out, data, len); - *out_storage = data; - return 1; - -err: - CBB_cleanup(&result); - return 0; -} diff --git a/third_party/boringssl/src/crypto/bytestring/ber.cc b/third_party/boringssl/src/crypto/bytestring/ber.cc new file mode 100644 index 00000000..f7ec5d71 --- /dev/null +++ b/third_party/boringssl/src/crypto/bytestring/ber.cc @@ -0,0 +1,268 @@ +// Copyright 2014 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + +#include "internal.h" + + +using namespace bssl; + +// kMaxDepth limits the recursion depth to avoid overflowing the stack. +static const uint32_t kMaxDepth = 128; + +// is_string_type returns one if |tag| is a string type and zero otherwise. It +// ignores the constructed bit. +static int is_string_type(CBS_ASN1_TAG tag) { + // While BER supports constructed BIT STRINGS, OpenSSL misparses them. To + // avoid acting on an ambiguous input, we do not support constructed BIT + // STRINGS. See https://github.com/openssl/openssl/issues/12810. 
+ switch (tag & ~CBS_ASN1_CONSTRUCTED) { + case CBS_ASN1_OCTETSTRING: + case CBS_ASN1_UTF8STRING: + case CBS_ASN1_NUMERICSTRING: + case CBS_ASN1_PRINTABLESTRING: + case CBS_ASN1_T61STRING: + case CBS_ASN1_VIDEOTEXSTRING: + case CBS_ASN1_IA5STRING: + case CBS_ASN1_GRAPHICSTRING: + case CBS_ASN1_VISIBLESTRING: + case CBS_ASN1_GENERALSTRING: + case CBS_ASN1_UNIVERSALSTRING: + case CBS_ASN1_BMPSTRING: + return 1; + default: + return 0; + } +} + +// cbs_find_ber walks an ASN.1 structure in |orig_in| and sets |*ber_found| +// depending on whether an indefinite length element or constructed string was +// found. The value of |orig_in| is not changed. It returns one on success (i.e. +// |*ber_found| was set) and zero on error. +static int cbs_find_ber(const CBS *orig_in, int *ber_found, uint32_t depth) { + if (depth > kMaxDepth) { + return 0; + } + + CBS in = *orig_in; + *ber_found = 0; + + while (CBS_len(&in) > 0) { + CBS contents; + CBS_ASN1_TAG tag; + size_t header_len; + int indefinite; + if (!CBS_get_any_ber_asn1_element(&in, &contents, &tag, &header_len, + ber_found, &indefinite)) { + return 0; + } + if (*ber_found) { + return 1; + } + if (tag & CBS_ASN1_CONSTRUCTED) { + if (is_string_type(tag)) { + // Constructed strings are only legal in BER and require conversion. + *ber_found = 1; + return 1; + } + if (!CBS_skip(&contents, header_len) || + !cbs_find_ber(&contents, ber_found, depth + 1)) { + return 0; + } + if (*ber_found) { + // We already found BER. No need to continue parsing. + return 1; + } + } + } + + return 1; +} + +// cbs_get_eoc returns one if |cbs| begins with an "end of contents" (EOC) value +// and zero otherwise. If an EOC was found, it advances |cbs| past it. +static int cbs_get_eoc(CBS *cbs) { + if (CBS_len(cbs) >= 2 && + CBS_data(cbs)[0] == 0 && CBS_data(cbs)[1] == 0) { + return CBS_skip(cbs, 2); + } + return 0; +} + +// cbs_convert_ber reads BER data from |in| and writes DER data to |out|. 
If +// |string_tag| is non-zero, then all elements must match |string_tag| up to the +// constructed bit and primitive element bodies are written to |out| without +// element headers. This is used when concatenating the fragments of a +// constructed string. If |looking_for_eoc| is set then any EOC elements found +// will cause the function to return after consuming it. It returns one on +// success and zero on error. +static int cbs_convert_ber(CBS *in, CBB *out, CBS_ASN1_TAG string_tag, + int looking_for_eoc, uint32_t depth) { + assert(!(string_tag & CBS_ASN1_CONSTRUCTED)); + + if (depth > kMaxDepth) { + return 0; + } + + while (CBS_len(in) > 0) { + if (looking_for_eoc && cbs_get_eoc(in)) { + return 1; + } + + CBS contents; + CBS_ASN1_TAG tag, child_string_tag = string_tag; + size_t header_len; + int indefinite; + CBB *out_contents, out_contents_storage; + if (!CBS_get_any_ber_asn1_element(in, &contents, &tag, &header_len, + /*out_ber_found=*/nullptr, &indefinite)) { + return 0; + } + + if (string_tag != 0) { + // This is part of a constructed string. All elements must match + // |string_tag| up to the constructed bit and get appended to |out| + // without a child element. + if ((tag & ~CBS_ASN1_CONSTRUCTED) != string_tag) { + return 0; + } + out_contents = out; + } else { + CBS_ASN1_TAG out_tag = tag; + if ((tag & CBS_ASN1_CONSTRUCTED) && is_string_type(tag)) { + // If a constructed string, clear the constructed bit and inform + // children to concatenate bodies. + out_tag &= ~CBS_ASN1_CONSTRUCTED; + child_string_tag = out_tag; + } + if (!CBB_add_asn1(out, &out_contents_storage, out_tag)) { + return 0; + } + out_contents = &out_contents_storage; + } + + if (indefinite) { + if (!cbs_convert_ber(in, out_contents, child_string_tag, + /*looking_for_eoc=*/1, depth + 1) || + !CBB_flush(out)) { + return 0; + } + continue; + } + + if (!CBS_skip(&contents, header_len)) { + return 0; + } + + if (tag & CBS_ASN1_CONSTRUCTED) { + // Recurse into children. 
+ if (!cbs_convert_ber(&contents, out_contents, child_string_tag, + /*looking_for_eoc=*/0, depth + 1)) { + return 0; + } + } else { + // Copy primitive contents as-is. + if (!CBB_add_bytes(out_contents, CBS_data(&contents), + CBS_len(&contents))) { + return 0; + } + } + + if (!CBB_flush(out)) { + return 0; + } + } + + return looking_for_eoc == 0; +} + +int bssl::CBS_asn1_ber_to_der(CBS *in, CBS *out, uint8_t **out_storage) { + CBB cbb; + + // First, do a quick walk to find any indefinite-length elements. Most of the + // time we hope that there aren't any and thus we can quickly return. + int conversion_needed; + if (!cbs_find_ber(in, &conversion_needed, 0)) { + return 0; + } + + if (!conversion_needed) { + if (!CBS_get_any_asn1_element(in, out, nullptr, nullptr)) { + return 0; + } + *out_storage = nullptr; + return 1; + } + + size_t len; + if (!CBB_init(&cbb, CBS_len(in)) || + !cbs_convert_ber(in, &cbb, 0, 0, 0) || + !CBB_finish(&cbb, out_storage, &len)) { + CBB_cleanup(&cbb); + return 0; + } + + CBS_init(out, *out_storage, len); + return 1; +} + +int bssl::CBS_get_asn1_implicit_string(CBS *in, CBS *out, uint8_t **out_storage, + CBS_ASN1_TAG outer_tag, + CBS_ASN1_TAG inner_tag) { + assert(!(outer_tag & CBS_ASN1_CONSTRUCTED)); + assert(!(inner_tag & CBS_ASN1_CONSTRUCTED)); + assert(is_string_type(inner_tag)); + + if (CBS_peek_asn1_tag(in, outer_tag)) { + // Normal implicitly-tagged string. + *out_storage = nullptr; + return CBS_get_asn1(in, out, outer_tag); + } + + // Otherwise, try to parse an implicitly-tagged constructed string. + // |CBS_asn1_ber_to_der| is assumed to have run, so only allow one level deep + // of nesting. 
+ CBB result; + CBS child; + if (!CBB_init(&result, CBS_len(in)) || + !CBS_get_asn1(in, &child, outer_tag | CBS_ASN1_CONSTRUCTED)) { + goto err; + } + + while (CBS_len(&child) > 0) { + CBS chunk; + if (!CBS_get_asn1(&child, &chunk, inner_tag) || + !CBB_add_bytes(&result, CBS_data(&chunk), CBS_len(&chunk))) { + goto err; + } + } + + uint8_t *data; + size_t len; + if (!CBB_finish(&result, &data, &len)) { + goto err; + } + + CBS_init(out, data, len); + *out_storage = data; + return 1; + +err: + CBB_cleanup(&result); + return 0; +} diff --git a/third_party/boringssl/src/crypto/bytestring/cbb.c b/third_party/boringssl/src/crypto/bytestring/cbb.c deleted file mode 100644 index 6ce20ad7..00000000 --- a/third_party/boringssl/src/crypto/bytestring/cbb.c +++ /dev/null @@ -1,725 +0,0 @@ -/* Copyright (c) 2014, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include - -#include -#include -#include - -#include - -#include "../internal.h" - - -void CBB_zero(CBB *cbb) { - OPENSSL_memset(cbb, 0, sizeof(CBB)); -} - -static int cbb_init(CBB *cbb, uint8_t *buf, size_t cap) { - // This assumes that |cbb| has already been zeroed. 
- struct cbb_buffer_st *base; - - base = OPENSSL_malloc(sizeof(struct cbb_buffer_st)); - if (base == NULL) { - return 0; - } - - base->buf = buf; - base->len = 0; - base->cap = cap; - base->can_resize = 1; - base->error = 0; - - cbb->base = base; - cbb->is_child = 0; - return 1; -} - -int CBB_init(CBB *cbb, size_t initial_capacity) { - CBB_zero(cbb); - - uint8_t *buf = OPENSSL_malloc(initial_capacity); - if (initial_capacity > 0 && buf == NULL) { - return 0; - } - - if (!cbb_init(cbb, buf, initial_capacity)) { - OPENSSL_free(buf); - return 0; - } - - return 1; -} - -int CBB_init_fixed(CBB *cbb, uint8_t *buf, size_t len) { - CBB_zero(cbb); - - if (!cbb_init(cbb, buf, len)) { - return 0; - } - - cbb->base->can_resize = 0; - return 1; -} - -void CBB_cleanup(CBB *cbb) { - // Child |CBB|s are non-owning. They are implicitly discarded and should not - // be used with |CBB_cleanup| or |ScopedCBB|. - assert(!cbb->is_child); - if (cbb->is_child) { - return; - } - - if (cbb->base) { - if (cbb->base->can_resize) { - OPENSSL_free(cbb->base->buf); - } - OPENSSL_free(cbb->base); - } - cbb->base = NULL; -} - -static int cbb_buffer_reserve(struct cbb_buffer_st *base, uint8_t **out, - size_t len) { - size_t newlen; - - if (base == NULL) { - return 0; - } - - newlen = base->len + len; - if (newlen < base->len) { - // Overflow - goto err; - } - - if (newlen > base->cap) { - size_t newcap = base->cap * 2; - uint8_t *newbuf; - - if (!base->can_resize) { - goto err; - } - - if (newcap < base->cap || newcap < newlen) { - newcap = newlen; - } - newbuf = OPENSSL_realloc(base->buf, newcap); - if (newbuf == NULL) { - goto err; - } - - base->buf = newbuf; - base->cap = newcap; - } - - if (out) { - *out = base->buf + base->len; - } - - return 1; - -err: - base->error = 1; - return 0; -} - -static int cbb_buffer_add(struct cbb_buffer_st *base, uint8_t **out, - size_t len) { - if (!cbb_buffer_reserve(base, out, len)) { - return 0; - } - // This will not overflow or |cbb_buffer_reserve| would 
have failed. - base->len += len; - return 1; -} - -static int cbb_buffer_add_u(struct cbb_buffer_st *base, uint64_t v, - size_t len_len) { - if (len_len == 0) { - return 1; - } - - uint8_t *buf; - if (!cbb_buffer_add(base, &buf, len_len)) { - return 0; - } - - for (size_t i = len_len - 1; i < len_len; i--) { - buf[i] = v; - v >>= 8; - } - - if (v != 0) { - base->error = 1; - return 0; - } - - return 1; -} - -int CBB_finish(CBB *cbb, uint8_t **out_data, size_t *out_len) { - if (cbb->is_child) { - return 0; - } - - if (!CBB_flush(cbb)) { - return 0; - } - - if (cbb->base->can_resize && (out_data == NULL || out_len == NULL)) { - // |out_data| and |out_len| can only be NULL if the CBB is fixed. - return 0; - } - - if (out_data != NULL) { - *out_data = cbb->base->buf; - } - if (out_len != NULL) { - *out_len = cbb->base->len; - } - cbb->base->buf = NULL; - CBB_cleanup(cbb); - return 1; -} - -// CBB_flush recurses and then writes out any pending length prefix. The -// current length of the underlying base is taken to be the length of the -// length-prefixed data. -int CBB_flush(CBB *cbb) { - size_t child_start, i, len; - - // If |cbb->base| has hit an error, the buffer is in an undefined state, so - // fail all following calls. In particular, |cbb->child| may point to invalid - // memory. - if (cbb->base == NULL || cbb->base->error) { - return 0; - } - - if (cbb->child == NULL || cbb->child->pending_len_len == 0) { - return 1; - } - - child_start = cbb->child->offset + cbb->child->pending_len_len; - - if (!CBB_flush(cbb->child) || - child_start < cbb->child->offset || - cbb->base->len < child_start) { - goto err; - } - - len = cbb->base->len - child_start; - - if (cbb->child->pending_is_asn1) { - // For ASN.1 we assume that we'll only need a single byte for the length. - // If that turned out to be incorrect, we have to move the contents along - // in order to make space. 
- uint8_t len_len; - uint8_t initial_length_byte; - - assert (cbb->child->pending_len_len == 1); - - if (len > 0xfffffffe) { - // Too large. - goto err; - } else if (len > 0xffffff) { - len_len = 5; - initial_length_byte = 0x80 | 4; - } else if (len > 0xffff) { - len_len = 4; - initial_length_byte = 0x80 | 3; - } else if (len > 0xff) { - len_len = 3; - initial_length_byte = 0x80 | 2; - } else if (len > 0x7f) { - len_len = 2; - initial_length_byte = 0x80 | 1; - } else { - len_len = 1; - initial_length_byte = (uint8_t)len; - len = 0; - } - - if (len_len != 1) { - // We need to move the contents along in order to make space. - size_t extra_bytes = len_len - 1; - if (!cbb_buffer_add(cbb->base, NULL, extra_bytes)) { - goto err; - } - OPENSSL_memmove(cbb->base->buf + child_start + extra_bytes, - cbb->base->buf + child_start, len); - } - cbb->base->buf[cbb->child->offset++] = initial_length_byte; - cbb->child->pending_len_len = len_len - 1; - } - - for (i = cbb->child->pending_len_len - 1; i < cbb->child->pending_len_len; - i--) { - cbb->base->buf[cbb->child->offset + i] = (uint8_t)len; - len >>= 8; - } - if (len != 0) { - goto err; - } - - cbb->child->base = NULL; - cbb->child = NULL; - - return 1; - -err: - cbb->base->error = 1; - return 0; -} - -const uint8_t *CBB_data(const CBB *cbb) { - assert(cbb->child == NULL); - return cbb->base->buf + cbb->offset + cbb->pending_len_len; -} - -size_t CBB_len(const CBB *cbb) { - assert(cbb->child == NULL); - assert(cbb->offset + cbb->pending_len_len <= cbb->base->len); - - return cbb->base->len - cbb->offset - cbb->pending_len_len; -} - -static int cbb_add_length_prefixed(CBB *cbb, CBB *out_contents, - uint8_t len_len) { - uint8_t *prefix_bytes; - - if (!CBB_flush(cbb)) { - return 0; - } - - size_t offset = cbb->base->len; - if (!cbb_buffer_add(cbb->base, &prefix_bytes, len_len)) { - return 0; - } - - OPENSSL_memset(prefix_bytes, 0, len_len); - OPENSSL_memset(out_contents, 0, sizeof(CBB)); - out_contents->base = cbb->base; - 
out_contents->is_child = 1; - cbb->child = out_contents; - cbb->child->offset = offset; - cbb->child->pending_len_len = len_len; - cbb->child->pending_is_asn1 = 0; - - return 1; -} - -int CBB_add_u8_length_prefixed(CBB *cbb, CBB *out_contents) { - return cbb_add_length_prefixed(cbb, out_contents, 1); -} - -int CBB_add_u16_length_prefixed(CBB *cbb, CBB *out_contents) { - return cbb_add_length_prefixed(cbb, out_contents, 2); -} - -int CBB_add_u24_length_prefixed(CBB *cbb, CBB *out_contents) { - return cbb_add_length_prefixed(cbb, out_contents, 3); -} - -// add_base128_integer encodes |v| as a big-endian base-128 integer where the -// high bit of each byte indicates where there is more data. This is the -// encoding used in DER for both high tag number form and OID components. -static int add_base128_integer(CBB *cbb, uint64_t v) { - unsigned len_len = 0; - uint64_t copy = v; - while (copy > 0) { - len_len++; - copy >>= 7; - } - if (len_len == 0) { - len_len = 1; // Zero is encoded with one byte. - } - for (unsigned i = len_len - 1; i < len_len; i--) { - uint8_t byte = (v >> (7 * i)) & 0x7f; - if (i != 0) { - // The high bit denotes whether there is more data. - byte |= 0x80; - } - if (!CBB_add_u8(cbb, byte)) { - return 0; - } - } - return 1; -} - -int CBB_add_asn1(CBB *cbb, CBB *out_contents, unsigned tag) { - if (!CBB_flush(cbb)) { - return 0; - } - - // Split the tag into leading bits and tag number. - uint8_t tag_bits = (tag >> CBS_ASN1_TAG_SHIFT) & 0xe0; - unsigned tag_number = tag & CBS_ASN1_TAG_NUMBER_MASK; - if (tag_number >= 0x1f) { - // Set all the bits in the tag number to signal high tag number form. 
- if (!CBB_add_u8(cbb, tag_bits | 0x1f) || - !add_base128_integer(cbb, tag_number)) { - return 0; - } - } else if (!CBB_add_u8(cbb, tag_bits | tag_number)) { - return 0; - } - - size_t offset = cbb->base->len; - if (!CBB_add_u8(cbb, 0)) { - return 0; - } - - OPENSSL_memset(out_contents, 0, sizeof(CBB)); - out_contents->base = cbb->base; - out_contents->is_child = 1; - cbb->child = out_contents; - cbb->child->offset = offset; - cbb->child->pending_len_len = 1; - cbb->child->pending_is_asn1 = 1; - - return 1; -} - -int CBB_add_bytes(CBB *cbb, const uint8_t *data, size_t len) { - uint8_t *dest; - - if (!CBB_flush(cbb) || - !cbb_buffer_add(cbb->base, &dest, len)) { - return 0; - } - OPENSSL_memcpy(dest, data, len); - return 1; -} - -int CBB_add_zeros(CBB *cbb, size_t len) { - uint8_t *out; - if (!CBB_add_space(cbb, &out, len)) { - return 0; - } - OPENSSL_memset(out, 0, len); - return 1; -} - -int CBB_add_space(CBB *cbb, uint8_t **out_data, size_t len) { - if (!CBB_flush(cbb) || - !cbb_buffer_add(cbb->base, out_data, len)) { - return 0; - } - return 1; -} - -int CBB_reserve(CBB *cbb, uint8_t **out_data, size_t len) { - if (!CBB_flush(cbb) || - !cbb_buffer_reserve(cbb->base, out_data, len)) { - return 0; - } - return 1; -} - -int CBB_did_write(CBB *cbb, size_t len) { - size_t newlen = cbb->base->len + len; - if (cbb->child != NULL || - newlen < cbb->base->len || - newlen > cbb->base->cap) { - return 0; - } - cbb->base->len = newlen; - return 1; -} - -int CBB_add_u8(CBB *cbb, uint8_t value) { - if (!CBB_flush(cbb)) { - return 0; - } - - return cbb_buffer_add_u(cbb->base, value, 1); -} - -int CBB_add_u16(CBB *cbb, uint16_t value) { - if (!CBB_flush(cbb)) { - return 0; - } - - return cbb_buffer_add_u(cbb->base, value, 2); -} - -int CBB_add_u16le(CBB *cbb, uint16_t value) { - return CBB_add_u16(cbb, CRYPTO_bswap2(value)); -} - -int CBB_add_u24(CBB *cbb, uint32_t value) { - if (!CBB_flush(cbb)) { - return 0; - } - - return cbb_buffer_add_u(cbb->base, value, 3); -} - -int 
CBB_add_u32(CBB *cbb, uint32_t value) { - if (!CBB_flush(cbb)) { - return 0; - } - - return cbb_buffer_add_u(cbb->base, value, 4); -} - -int CBB_add_u32le(CBB *cbb, uint32_t value) { - return CBB_add_u32(cbb, CRYPTO_bswap4(value)); -} - -int CBB_add_u64(CBB *cbb, uint64_t value) { - if (!CBB_flush(cbb)) { - return 0; - } - return cbb_buffer_add_u(cbb->base, value, 8); -} - -int CBB_add_u64le(CBB *cbb, uint64_t value) { - return CBB_add_u64(cbb, CRYPTO_bswap8(value)); -} - -void CBB_discard_child(CBB *cbb) { - if (cbb->child == NULL) { - return; - } - - cbb->base->len = cbb->child->offset; - - cbb->child->base = NULL; - cbb->child = NULL; -} - -int CBB_add_asn1_uint64(CBB *cbb, uint64_t value) { - CBB child; - int started = 0; - - if (!CBB_add_asn1(cbb, &child, CBS_ASN1_INTEGER)) { - return 0; - } - - for (size_t i = 0; i < 8; i++) { - uint8_t byte = (value >> 8*(7-i)) & 0xff; - if (!started) { - if (byte == 0) { - // Don't encode leading zeros. - continue; - } - // If the high bit is set, add a padding byte to make it - // unsigned. - if ((byte & 0x80) && !CBB_add_u8(&child, 0)) { - return 0; - } - started = 1; - } - if (!CBB_add_u8(&child, byte)) { - return 0; - } - } - - // 0 is encoded as a single 0, not the empty string. - if (!started && !CBB_add_u8(&child, 0)) { - return 0; - } - - return CBB_flush(cbb); -} - -int CBB_add_asn1_int64(CBB *cbb, int64_t value) { - if (value >= 0) { - return CBB_add_asn1_uint64(cbb, value); - } - - uint8_t bytes[sizeof(int64_t)]; - memcpy(bytes, &value, sizeof(value)); - int start = 7; - // Skip leading sign-extension bytes unless they are necessary. 
- while (start > 0 && (bytes[start] == 0xff && (bytes[start - 1] & 0x80))) { - start--; - } - - CBB child; - if (!CBB_add_asn1(cbb, &child, CBS_ASN1_INTEGER)) { - return 0; - } - for (int i = start; i >= 0; i--) { - if (!CBB_add_u8(&child, bytes[i])) { - return 0; - } - } - return CBB_flush(cbb); -} - -int CBB_add_asn1_octet_string(CBB *cbb, const uint8_t *data, size_t data_len) { - CBB child; - if (!CBB_add_asn1(cbb, &child, CBS_ASN1_OCTETSTRING) || - !CBB_add_bytes(&child, data, data_len) || - !CBB_flush(cbb)) { - return 0; - } - - return 1; -} - -int CBB_add_asn1_bool(CBB *cbb, int value) { - CBB child; - if (!CBB_add_asn1(cbb, &child, CBS_ASN1_BOOLEAN) || - !CBB_add_u8(&child, value != 0 ? 0xff : 0) || - !CBB_flush(cbb)) { - return 0; - } - - return 1; -} - -// parse_dotted_decimal parses one decimal component from |cbs|, where |cbs| is -// an OID literal, e.g., "1.2.840.113554.4.1.72585". It consumes both the -// component and the dot, so |cbs| may be passed into the function again for the -// next value. -static int parse_dotted_decimal(CBS *cbs, uint64_t *out) { - *out = 0; - int seen_digit = 0; - for (;;) { - // Valid terminators for a component are the end of the string or a - // non-terminal dot. If the string ends with a dot, this is not a valid OID - // string. - uint8_t u; - if (!CBS_get_u8(cbs, &u) || - (u == '.' && CBS_len(cbs) > 0)) { - break; - } - if (u < '0' || u > '9' || - // Forbid stray leading zeros. - (seen_digit && *out == 0) || - // Check for overflow. - *out > UINT64_MAX / 10 || - *out * 10 > UINT64_MAX - (u - '0')) { - return 0; - } - *out = *out * 10 + (u - '0'); - seen_digit = 1; - } - // The empty string is not a legal OID component. - return seen_digit; -} - -int CBB_add_asn1_oid_from_text(CBB *cbb, const char *text, size_t len) { - if (!CBB_flush(cbb)) { - return 0; - } - - CBS cbs; - CBS_init(&cbs, (const uint8_t *)text, len); - - // OIDs must have at least two components. 
- uint64_t a, b; - if (!parse_dotted_decimal(&cbs, &a) || - !parse_dotted_decimal(&cbs, &b)) { - return 0; - } - - // The first component is encoded as 40 * |a| + |b|. This assumes that |a| is - // 0, 1, or 2 and that, when it is 0 or 1, |b| is at most 39. - if (a > 2 || - (a < 2 && b > 39) || - b > UINT64_MAX - 80 || - !add_base128_integer(cbb, 40u * a + b)) { - return 0; - } - - // The remaining components are encoded unmodified. - while (CBS_len(&cbs) > 0) { - if (!parse_dotted_decimal(&cbs, &a) || - !add_base128_integer(cbb, a)) { - return 0; - } - } - - return 1; -} - -static int compare_set_of_element(const void *a_ptr, const void *b_ptr) { - // See X.690, section 11.6 for the ordering. They are sorted in ascending - // order by their DER encoding. - const CBS *a = a_ptr, *b = b_ptr; - size_t a_len = CBS_len(a), b_len = CBS_len(b); - size_t min_len = a_len < b_len ? a_len : b_len; - int ret = OPENSSL_memcmp(CBS_data(a), CBS_data(b), min_len); - if (ret != 0) { - return ret; - } - if (a_len == b_len) { - return 0; - } - // If one is a prefix of the other, the shorter one sorts first. (This is not - // actually reachable. No DER encoding is a prefix of another DER encoding.) - return a_len < b_len ? -1 : 1; -} - -int CBB_flush_asn1_set_of(CBB *cbb) { - if (!CBB_flush(cbb)) { - return 0; - } - - CBS cbs; - size_t num_children = 0; - CBS_init(&cbs, CBB_data(cbb), CBB_len(cbb)); - while (CBS_len(&cbs) != 0) { - if (!CBS_get_any_asn1_element(&cbs, NULL, NULL, NULL)) { - return 0; - } - num_children++; - } - - if (num_children < 2) { - return 1; // Nothing to do. This is the common case for X.509. - } - if (num_children > ((size_t)-1) / sizeof(CBS)) { - return 0; // Overflow. - } - - // Parse out the children and sort. We alias them into a copy of so they - // remain valid as we rewrite |cbb|. 
- int ret = 0; - size_t buf_len = CBB_len(cbb); - uint8_t *buf = OPENSSL_memdup(CBB_data(cbb), buf_len); - CBS *children = OPENSSL_malloc(num_children * sizeof(CBS)); - if (buf == NULL || children == NULL) { - goto err; - } - CBS_init(&cbs, buf, buf_len); - for (size_t i = 0; i < num_children; i++) { - if (!CBS_get_any_asn1_element(&cbs, &children[i], NULL, NULL)) { - goto err; - } - } - qsort(children, num_children, sizeof(CBS), compare_set_of_element); - - // Rewind |cbb| and write the contents back in the new order. - cbb->base->len = cbb->offset + cbb->pending_len_len; - for (size_t i = 0; i < num_children; i++) { - if (!CBB_add_bytes(cbb, CBS_data(&children[i]), CBS_len(&children[i]))) { - goto err; - } - } - assert(CBB_len(cbb) == buf_len); - - ret = 1; - -err: - OPENSSL_free(buf); - OPENSSL_free(children); - return ret; -} diff --git a/third_party/boringssl/src/crypto/bytestring/cbb.cc b/third_party/boringssl/src/crypto/bytestring/cbb.cc new file mode 100644 index 00000000..bbe59fa8 --- /dev/null +++ b/third_party/boringssl/src/crypto/bytestring/cbb.cc @@ -0,0 +1,761 @@ +// Copyright 2014 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include +#include + +#include +#include + +#include "../internal.h" +#include "internal.h" + + +using namespace bssl; + +void CBB_zero(CBB *cbb) { OPENSSL_memset(cbb, 0, sizeof(CBB)); } + +static void cbb_init(CBB *cbb, uint8_t *buf, size_t cap, int can_resize) { + cbb->is_child = 0; + cbb->child = nullptr; + cbb->u.base.buf = buf; + cbb->u.base.len = 0; + cbb->u.base.cap = cap; + cbb->u.base.can_resize = can_resize; + cbb->u.base.error = 0; +} + +int CBB_init(CBB *cbb, size_t initial_capacity) { + CBB_zero(cbb); + + uint8_t *buf = reinterpret_cast(OPENSSL_malloc(initial_capacity)); + if (initial_capacity > 0 && buf == nullptr) { + return 0; + } + + cbb_init(cbb, buf, initial_capacity, /*can_resize=*/1); + return 1; +} + +int CBB_init_fixed(CBB *cbb, uint8_t *buf, size_t len) { + CBB_zero(cbb); + cbb_init(cbb, buf, len, /*can_resize=*/0); + return 1; +} + +void CBB_cleanup(CBB *cbb) { + // Child |CBB|s are non-owning. They are implicitly discarded and should not + // be used with |CBB_cleanup| or |ScopedCBB|. 
+ assert(!cbb->is_child); + if (cbb->is_child) { + return; + } + + if (cbb->u.base.can_resize) { + OPENSSL_free(cbb->u.base.buf); + } +} + +static int cbb_buffer_reserve(struct cbb_buffer_st *base, uint8_t **out, + size_t len) { + if (base == nullptr) { + return 0; + } + + size_t newlen = base->len + len; + if (newlen < base->len) { + // Overflow + OPENSSL_PUT_ERROR(CRYPTO, ERR_R_OVERFLOW); + goto err; + } + + if (newlen > base->cap) { + if (!base->can_resize) { + OPENSSL_PUT_ERROR(CRYPTO, ERR_R_OVERFLOW); + goto err; + } + + size_t newcap = base->cap * 2; + if (newcap < base->cap || newcap < newlen) { + newcap = newlen; + } + uint8_t *newbuf = + reinterpret_cast(OPENSSL_realloc(base->buf, newcap)); + if (newbuf == nullptr) { + goto err; + } + + base->buf = newbuf; + base->cap = newcap; + } + + if (out) { + *out = base->buf + base->len; + } + + return 1; + +err: + base->error = 1; + return 0; +} + +static int cbb_buffer_add(struct cbb_buffer_st *base, uint8_t **out, + size_t len) { + if (!cbb_buffer_reserve(base, out, len)) { + return 0; + } + // This will not overflow or |cbb_buffer_reserve| would have failed. + base->len += len; + return 1; +} + +int CBB_finish(CBB *cbb, uint8_t **out_data, size_t *out_len) { + if (cbb->is_child) { + OPENSSL_PUT_ERROR(CRYPTO, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); + return 0; + } + + if (!CBB_flush(cbb)) { + return 0; + } + + if (cbb->u.base.can_resize && (out_data == nullptr || out_len == nullptr)) { + // |out_data| and |out_len| can only be NULL if the CBB is fixed. 
+ return 0; + } + + if (out_data != nullptr) { + *out_data = cbb->u.base.buf; + } + if (out_len != nullptr) { + *out_len = cbb->u.base.len; + } + cbb->u.base.buf = nullptr; + CBB_cleanup(cbb); + return 1; +} + +static struct cbb_buffer_st *cbb_get_base(CBB *cbb) { + if (cbb->is_child) { + return cbb->u.child.base; + } + return &cbb->u.base; +} + +static void cbb_on_error(CBB *cbb) { + // Due to C's lack of destructors and |CBB|'s auto-flushing API, a failing + // |CBB|-taking function may leave a dangling pointer to a child |CBB|. As a + // result, the convention is callers may not write to |CBB|s that have failed. + // But, as a safety measure, we lock the |CBB| into an error state. Once the + // error bit is set, |cbb->child| will not be read. + // + // TODO(davidben): This still isn't quite ideal. A |CBB| function *outside* + // this file may originate an error while the |CBB| points to a local child. + // In that case we don't set the error bit and are reliant on the error + // convention. Perhaps we allow |CBB_cleanup| on child |CBB|s and make every + // child's |CBB_cleanup| set the error bit if unflushed. That will be + // convenient for C++ callers, but very tedious for C callers. So C callers + // perhaps should get a |CBB_on_error| function that can be, less tediously, + // stuck in a |goto err| block. + cbb_get_base(cbb)->error = 1; + + // Clearing the pointer is not strictly necessary, but GCC's dangling pointer + // warning does not know |cbb->child| will not be read once |error| is set + // above. + cbb->child = nullptr; +} + +// CBB_flush recurses and then writes out any pending length prefix. The +// current length of the underlying base is taken to be the length of the +// length-prefixed data. +int CBB_flush(CBB *cbb) { + // If |base| has hit an error, the buffer is in an undefined state, so + // fail all following calls. In particular, |cbb->child| may point to invalid + // memory. 
+ struct cbb_buffer_st *base = cbb_get_base(cbb); + if (base == nullptr || base->error) { + return 0; + } + + if (cbb->child == nullptr) { + // Nothing to flush. + return 1; + } + + assert(cbb->child->is_child); + struct cbb_child_st *child = &cbb->child->u.child; + assert(child->base == base); + size_t child_start = child->offset + child->pending_len_len; + + size_t len; + if (!CBB_flush(cbb->child) || child_start < child->offset || + base->len < child_start) { + goto err; + } + + len = base->len - child_start; + + if (child->pending_is_asn1) { + // For ASN.1 we assume that we'll only need a single byte for the length. + // If that turned out to be incorrect, we have to move the contents along + // in order to make space. + uint8_t len_len; + uint8_t initial_length_byte; + + assert(child->pending_len_len == 1); + + if (len > 0xfffffffe) { + OPENSSL_PUT_ERROR(CRYPTO, ERR_R_OVERFLOW); + // Too large. + goto err; + } else if (len > 0xffffff) { + len_len = 5; + initial_length_byte = 0x80 | 4; + } else if (len > 0xffff) { + len_len = 4; + initial_length_byte = 0x80 | 3; + } else if (len > 0xff) { + len_len = 3; + initial_length_byte = 0x80 | 2; + } else if (len > 0x7f) { + len_len = 2; + initial_length_byte = 0x80 | 1; + } else { + len_len = 1; + initial_length_byte = (uint8_t)len; + len = 0; + } + + if (len_len != 1) { + // We need to move the contents along in order to make space. 
+ size_t extra_bytes = len_len - 1; + if (!cbb_buffer_add(base, nullptr, extra_bytes)) { + goto err; + } + OPENSSL_memmove(base->buf + child_start + extra_bytes, + base->buf + child_start, len); + } + base->buf[child->offset++] = initial_length_byte; + child->pending_len_len = len_len - 1; + } + + for (size_t i = child->pending_len_len - 1; i < child->pending_len_len; i--) { + base->buf[child->offset + i] = (uint8_t)len; + len >>= 8; + } + if (len != 0) { + OPENSSL_PUT_ERROR(CRYPTO, ERR_R_OVERFLOW); + goto err; + } + + child->base = nullptr; + cbb->child = nullptr; + + return 1; + +err: + cbb_on_error(cbb); + return 0; +} + +uint8_t *CBB_data(const CBB *cbb) { + assert(cbb->child == nullptr); + if (cbb->is_child) { + return cbb->u.child.base->buf + cbb->u.child.offset + + cbb->u.child.pending_len_len; + } + return cbb->u.base.buf; +} + +size_t CBB_len(const CBB *cbb) { + assert(cbb->child == nullptr); + if (cbb->is_child) { + assert(cbb->u.child.offset + cbb->u.child.pending_len_len <= + cbb->u.child.base->len); + return cbb->u.child.base->len - cbb->u.child.offset - + cbb->u.child.pending_len_len; + } + return cbb->u.base.len; +} + +static int cbb_add_child(CBB *cbb, CBB *out_child, uint8_t len_len, + int is_asn1) { + assert(cbb->child == nullptr); + assert(!is_asn1 || len_len == 1); + struct cbb_buffer_st *base = cbb_get_base(cbb); + size_t offset = base->len; + + // Reserve space for the length prefix. 
+ uint8_t *prefix_bytes; + if (!cbb_buffer_add(base, &prefix_bytes, len_len)) { + return 0; + } + OPENSSL_memset(prefix_bytes, 0, len_len); + + CBB_zero(out_child); + out_child->is_child = 1; + out_child->u.child.base = base; + out_child->u.child.offset = offset; + out_child->u.child.pending_len_len = len_len; + out_child->u.child.pending_is_asn1 = is_asn1; + cbb->child = out_child; + return 1; +} + +static int cbb_add_length_prefixed(CBB *cbb, CBB *out_contents, + uint8_t len_len) { + if (!CBB_flush(cbb)) { + return 0; + } + + return cbb_add_child(cbb, out_contents, len_len, /*is_asn1=*/0); +} + +int CBB_add_u8_length_prefixed(CBB *cbb, CBB *out_contents) { + return cbb_add_length_prefixed(cbb, out_contents, 1); +} + +int CBB_add_u16_length_prefixed(CBB *cbb, CBB *out_contents) { + return cbb_add_length_prefixed(cbb, out_contents, 2); +} + +int CBB_add_u24_length_prefixed(CBB *cbb, CBB *out_contents) { + return cbb_add_length_prefixed(cbb, out_contents, 3); +} + +// add_base128_integer encodes |v| as a big-endian base-128 integer where the +// high bit of each byte indicates where there is more data. This is the +// encoding used in DER for both high tag number form and OID components. +static int add_base128_integer(CBB *cbb, uint64_t v) { + unsigned len_len = 0; + uint64_t copy = v; + while (copy > 0) { + len_len++; + copy >>= 7; + } + if (len_len == 0) { + len_len = 1; // Zero is encoded with one byte. + } + for (unsigned i = len_len - 1; i < len_len; i--) { + uint8_t byte = (v >> (7 * i)) & 0x7f; + if (i != 0) { + // The high bit denotes whether there is more data. + byte |= 0x80; + } + if (!CBB_add_u8(cbb, byte)) { + return 0; + } + } + return 1; +} + +int CBB_add_asn1(CBB *cbb, CBB *out_contents, CBS_ASN1_TAG tag) { + if (!CBB_flush(cbb)) { + return 0; + } + + // Split the tag into leading bits and tag number. 
+ uint8_t tag_bits = (tag >> CBS_ASN1_TAG_SHIFT) & 0xe0; + CBS_ASN1_TAG tag_number = tag & CBS_ASN1_TAG_NUMBER_MASK; + if (tag_number >= 0x1f) { + // Set all the bits in the tag number to signal high tag number form. + if (!CBB_add_u8(cbb, tag_bits | 0x1f) || + !add_base128_integer(cbb, tag_number)) { + return 0; + } + } else if (!CBB_add_u8(cbb, tag_bits | tag_number)) { + return 0; + } + + // Reserve one byte of length prefix. |CBB_flush| will finish it later. + return cbb_add_child(cbb, out_contents, /*len_len=*/1, /*is_asn1=*/1); +} + +int CBB_add_bytes(CBB *cbb, const uint8_t *data, size_t len) { + uint8_t *out; + if (!CBB_add_space(cbb, &out, len)) { + return 0; + } + OPENSSL_memcpy(out, data, len); + return 1; +} + +int CBB_add_zeros(CBB *cbb, size_t len) { + uint8_t *out; + if (!CBB_add_space(cbb, &out, len)) { + return 0; + } + OPENSSL_memset(out, 0, len); + return 1; +} + +int CBB_add_space(CBB *cbb, uint8_t **out_data, size_t len) { + if (!CBB_flush(cbb) || !cbb_buffer_add(cbb_get_base(cbb), out_data, len)) { + return 0; + } + return 1; +} + +int CBB_reserve(CBB *cbb, uint8_t **out_data, size_t len) { + if (!CBB_flush(cbb) || + !cbb_buffer_reserve(cbb_get_base(cbb), out_data, len)) { + return 0; + } + return 1; +} + +int CBB_did_write(CBB *cbb, size_t len) { + struct cbb_buffer_st *base = cbb_get_base(cbb); + size_t newlen = base->len + len; + if (cbb->child != nullptr || newlen < base->len || newlen > base->cap) { + return 0; + } + base->len = newlen; + return 1; +} + +static int cbb_add_u(CBB *cbb, uint64_t v, size_t len_len) { + uint8_t *buf; + if (!CBB_add_space(cbb, &buf, len_len)) { + return 0; + } + + for (size_t i = len_len - 1; i < len_len; i--) { + buf[i] = v; + v >>= 8; + } + + // |v| must fit in |len_len| bytes. 
+ if (v != 0) { + cbb_on_error(cbb); + return 0; + } + + return 1; +} + +int CBB_add_u8(CBB *cbb, uint8_t value) { return cbb_add_u(cbb, value, 1); } + +int CBB_add_u16(CBB *cbb, uint16_t value) { return cbb_add_u(cbb, value, 2); } + +int CBB_add_u16le(CBB *cbb, uint16_t value) { + return CBB_add_u16(cbb, CRYPTO_bswap2(value)); +} + +int CBB_add_u24(CBB *cbb, uint32_t value) { return cbb_add_u(cbb, value, 3); } + +int CBB_add_u32(CBB *cbb, uint32_t value) { return cbb_add_u(cbb, value, 4); } + +int CBB_add_u32le(CBB *cbb, uint32_t value) { + return CBB_add_u32(cbb, CRYPTO_bswap4(value)); +} + +int CBB_add_u64(CBB *cbb, uint64_t value) { return cbb_add_u(cbb, value, 8); } + +int CBB_add_u64le(CBB *cbb, uint64_t value) { + return CBB_add_u64(cbb, CRYPTO_bswap8(value)); +} + +void CBB_discard(CBB *cbb, size_t len) { + BSSL_CHECK(cbb->child == nullptr); + BSSL_CHECK(len <= CBB_len(cbb)); + struct cbb_buffer_st *base = cbb_get_base(cbb); + base->len -= len; +} + +void CBB_discard_child(CBB *cbb) { + if (cbb->child == nullptr) { + return; + } + + struct cbb_buffer_st *base = cbb_get_base(cbb); + assert(cbb->child->is_child); + base->len = cbb->child->u.child.offset; + + cbb->child->u.child.base = nullptr; + cbb->child = nullptr; +} + +int CBB_add_asn1_element(CBB *cbb, CBS_ASN1_TAG tag, const uint8_t *data, + size_t data_len) { + CBB child; + if (!CBB_add_asn1(cbb, &child, tag) || + !CBB_add_bytes(&child, data, data_len) || // + !CBB_flush(cbb)) { + cbb_on_error(cbb); + return 0; + } + + return 1; +} + +int CBB_add_asn1_uint64(CBB *cbb, uint64_t value) { + return CBB_add_asn1_uint64_with_tag(cbb, value, CBS_ASN1_INTEGER); +} + +int CBB_add_asn1_uint64_with_tag(CBB *cbb, uint64_t value, CBS_ASN1_TAG tag) { + CBB child; + int started = 0; + if (!CBB_add_asn1(cbb, &child, tag)) { + goto err; + } + + for (size_t i = 0; i < 8; i++) { + uint8_t byte = (value >> 8 * (7 - i)) & 0xff; + if (!started) { + if (byte == 0) { + // Don't encode leading zeros. 
+ continue; + } + // If the high bit is set, add a padding byte to make it + // unsigned. + if ((byte & 0x80) && !CBB_add_u8(&child, 0)) { + goto err; + } + started = 1; + } + if (!CBB_add_u8(&child, byte)) { + goto err; + } + } + + // 0 is encoded as a single 0, not the empty string. + if (!started && !CBB_add_u8(&child, 0)) { + goto err; + } + + return CBB_flush(cbb); + +err: + cbb_on_error(cbb); + return 0; +} + +int CBB_add_asn1_int64(CBB *cbb, int64_t value) { + return CBB_add_asn1_int64_with_tag(cbb, value, CBS_ASN1_INTEGER); +} + +int CBB_add_asn1_int64_with_tag(CBB *cbb, int64_t value, CBS_ASN1_TAG tag) { + if (value >= 0) { + return CBB_add_asn1_uint64_with_tag(cbb, (uint64_t)value, tag); + } + + uint8_t bytes[sizeof(int64_t)]; + memcpy(bytes, &value, sizeof(value)); + int start = 7; + // Skip leading sign-extension bytes unless they are necessary. + while (start > 0 && (bytes[start] == 0xff && (bytes[start - 1] & 0x80))) { + start--; + } + + CBB child; + if (!CBB_add_asn1(cbb, &child, tag)) { + goto err; + } + for (int i = start; i >= 0; i--) { + if (!CBB_add_u8(&child, bytes[i])) { + goto err; + } + } + return CBB_flush(cbb); + +err: + cbb_on_error(cbb); + return 0; +} + +int CBB_add_asn1_octet_string(CBB *cbb, const uint8_t *data, size_t data_len) { + return CBB_add_asn1_element(cbb, CBS_ASN1_OCTETSTRING, data, data_len); +} + +int CBB_add_asn1_bool(CBB *cbb, int value) { + CBB child; + if (!CBB_add_asn1(cbb, &child, CBS_ASN1_BOOLEAN) || + !CBB_add_u8(&child, value != 0 ? 0xff : 0) || !CBB_flush(cbb)) { + cbb_on_error(cbb); + return 0; + } + + return 1; +} + +// parse_dotted_decimal parses one decimal component from |cbs|, where |cbs| is +// an OID literal, e.g., "1.2.840.113554.4.1.72585". It consumes both the +// component and the dot, so |cbs| may be passed into the function again for the +// next value. 
+static int parse_dotted_decimal(CBS *cbs, uint64_t *out) { + if (!CBS_get_u64_decimal(cbs, out)) { + return 0; + } + + // The integer must have either ended at the end of the string, or a + // non-terminal dot, which should be consumed. If the string ends with a dot, + // this is not a valid OID string. + uint8_t dot; + return !CBS_get_u8(cbs, &dot) || (dot == '.' && CBS_len(cbs) > 0); +} + +int CBB_add_asn1_oid_from_text(CBB *cbb, const char *text, size_t len) { + if (!CBB_flush(cbb)) { + return 0; + } + + CBS cbs; + CBS_init(&cbs, (const uint8_t *)text, len); + + // OIDs must have at least two components. + uint64_t a, b; + if (!parse_dotted_decimal(&cbs, &a) || !parse_dotted_decimal(&cbs, &b)) { + return 0; + } + + // The first component is encoded as 40 * |a| + |b|. This assumes that |a| is + // 0, 1, or 2 and that, when it is 0 or 1, |b| is at most 39. + if (a > 2 || (a < 2 && b > 39) || b > UINT64_MAX - 80 || + !add_base128_integer(cbb, 40u * a + b)) { + return 0; + } + + // The remaining components are encoded unmodified. + while (CBS_len(&cbs) > 0) { + if (!parse_dotted_decimal(&cbs, &a) || !add_base128_integer(cbb, a)) { + return 0; + } + } + + return 1; +} + +int CBB_add_asn1_relative_oid_from_text(CBB *cbb, const char *text, + size_t len) { + if (!CBB_flush(cbb)) { + return 0; + } + + // Relative OIDs must have at least one component. + if (!len) { + return 0; + } + + CBS cbs; + CBS_init(&cbs, reinterpret_cast(text), len); + + while (CBS_len(&cbs) > 0) { + uint64_t a; + if (!parse_dotted_decimal(&cbs, &a) || !add_base128_integer(cbb, a)) { + return 0; + } + } + + return 1; +} + +int CBB_add_asn1_oid_component(CBB *cbb, uint64_t value) { + if (!CBB_flush(cbb)) { + return 0; + } + + return add_base128_integer(cbb, value); +} + +static int compare_set_of_element(const void *a_ptr, const void *b_ptr) { + // See X.690, section 11.6 for the ordering. They are sorted in ascending + // order by their DER encoding. 
+ const CBS *a = reinterpret_cast(a_ptr), + *b = reinterpret_cast(b_ptr); + size_t a_len = CBS_len(a), b_len = CBS_len(b); + size_t min_len = a_len < b_len ? a_len : b_len; + int ret = OPENSSL_memcmp(CBS_data(a), CBS_data(b), min_len); + if (ret != 0) { + return ret; + } + if (a_len == b_len) { + return 0; + } + // If one is a prefix of the other, the shorter one sorts first. (This is not + // actually reachable. No DER encoding is a prefix of another DER encoding.) + return a_len < b_len ? -1 : 1; +} + +int CBB_flush_asn1_set_of(CBB *cbb) { + if (!CBB_flush(cbb)) { + return 0; + } + + CBS cbs; + size_t num_children = 0; + CBS_init(&cbs, CBB_data(cbb), CBB_len(cbb)); + while (CBS_len(&cbs) != 0) { + if (!CBS_get_any_asn1_element(&cbs, nullptr, nullptr, nullptr)) { + OPENSSL_PUT_ERROR(CRYPTO, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); + return 0; + } + num_children++; + } + + if (num_children < 2) { + return 1; // Nothing to do. This is the common case for X.509. + } + + // Parse out the children and sort. We alias them into a copy of so they + // remain valid as we rewrite |cbb|. + int ret = 0; + size_t buf_len = CBB_len(cbb); + uint8_t *buf = + reinterpret_cast(OPENSSL_memdup(CBB_data(cbb), buf_len)); + CBS *children = + reinterpret_cast(OPENSSL_calloc(num_children, sizeof(CBS))); + uint8_t *out; + size_t offset = 0; + if (buf == nullptr || children == nullptr) { + goto err; + } + CBS_init(&cbs, buf, buf_len); + for (size_t i = 0; i < num_children; i++) { + if (!CBS_get_any_asn1_element(&cbs, &children[i], nullptr, nullptr)) { + goto err; + } + } + qsort(children, num_children, sizeof(CBS), compare_set_of_element); + + // Write the contents back in the new order. 
+ out = (uint8_t *)CBB_data(cbb); + for (size_t i = 0; i < num_children; i++) { + OPENSSL_memcpy(out + offset, CBS_data(&children[i]), CBS_len(&children[i])); + offset += CBS_len(&children[i]); + } + assert(offset == buf_len); + + ret = 1; + +err: + OPENSSL_free(buf); + OPENSSL_free(children); + return ret; +} + +bool bssl::CBBFinishArray(CBB *cbb, Array *out) { + uint8_t *ptr; + size_t len; + if (!CBB_finish(cbb, &ptr, &len)) { + OPENSSL_PUT_ERROR(SSL, ERR_R_INTERNAL_ERROR); + return false; + } + out->Reset(ptr, len); + return true; +} diff --git a/third_party/boringssl/src/crypto/bytestring/cbs.c b/third_party/boringssl/src/crypto/bytestring/cbs.c deleted file mode 100644 index 4e7f3797..00000000 --- a/third_party/boringssl/src/crypto/bytestring/cbs.c +++ /dev/null @@ -1,883 +0,0 @@ -/* Copyright (c) 2014, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ - -#include -#include -#include - -#include -#include -#include -#include - -#include "../asn1/internal.h" -#include "../internal.h" -#include "internal.h" - - -void CBS_init(CBS *cbs, const uint8_t *data, size_t len) { - cbs->data = data; - cbs->len = len; -} - -static int cbs_get(CBS *cbs, const uint8_t **p, size_t n) { - if (cbs->len < n) { - return 0; - } - - *p = cbs->data; - cbs->data += n; - cbs->len -= n; - return 1; -} - -int CBS_skip(CBS *cbs, size_t len) { - const uint8_t *dummy; - return cbs_get(cbs, &dummy, len); -} - -const uint8_t *CBS_data(const CBS *cbs) { - return cbs->data; -} - -size_t CBS_len(const CBS *cbs) { - return cbs->len; -} - -int CBS_stow(const CBS *cbs, uint8_t **out_ptr, size_t *out_len) { - OPENSSL_free(*out_ptr); - *out_ptr = NULL; - *out_len = 0; - - if (cbs->len == 0) { - return 1; - } - *out_ptr = OPENSSL_memdup(cbs->data, cbs->len); - if (*out_ptr == NULL) { - return 0; - } - *out_len = cbs->len; - return 1; -} - -int CBS_strdup(const CBS *cbs, char **out_ptr) { - if (*out_ptr != NULL) { - OPENSSL_free(*out_ptr); - } - *out_ptr = OPENSSL_strndup((const char*)cbs->data, cbs->len); - return (*out_ptr != NULL); -} - -int CBS_contains_zero_byte(const CBS *cbs) { - return OPENSSL_memchr(cbs->data, 0, cbs->len) != NULL; -} - -int CBS_mem_equal(const CBS *cbs, const uint8_t *data, size_t len) { - if (len != cbs->len) { - return 0; - } - return CRYPTO_memcmp(cbs->data, data, len) == 0; -} - -static int cbs_get_u(CBS *cbs, uint64_t *out, size_t len) { - uint64_t result = 0; - const uint8_t *data; - - if (!cbs_get(cbs, &data, len)) { - return 0; - } - for (size_t i = 0; i < len; i++) { - result <<= 8; - result |= data[i]; - } - *out = result; - return 1; -} - -int CBS_get_u8(CBS *cbs, uint8_t *out) { - const uint8_t *v; - if (!cbs_get(cbs, &v, 1)) { - return 0; - } - *out = *v; - return 1; -} - -int CBS_get_u16(CBS *cbs, uint16_t *out) { - uint64_t v; - if (!cbs_get_u(cbs, &v, 2)) { - return 0; - } - *out = v; - return 1; -} - -int 
CBS_get_u16le(CBS *cbs, uint16_t *out) { - if (!CBS_get_u16(cbs, out)) { - return 0; - } - *out = CRYPTO_bswap2(*out); - return 1; -} - -int CBS_get_u24(CBS *cbs, uint32_t *out) { - uint64_t v; - if (!cbs_get_u(cbs, &v, 3)) { - return 0; - } - *out = v; - return 1; -} - -int CBS_get_u32(CBS *cbs, uint32_t *out) { - uint64_t v; - if (!cbs_get_u(cbs, &v, 4)) { - return 0; - } - *out = v; - return 1; -} - -int CBS_get_u32le(CBS *cbs, uint32_t *out) { - if (!CBS_get_u32(cbs, out)) { - return 0; - } - *out = CRYPTO_bswap4(*out); - return 1; -} - -int CBS_get_u64(CBS *cbs, uint64_t *out) { - return cbs_get_u(cbs, out, 8); -} - -int CBS_get_u64le(CBS *cbs, uint64_t *out) { - if (!cbs_get_u(cbs, out, 8)) { - return 0; - } - *out = CRYPTO_bswap8(*out); - return 1; -} - -int CBS_get_last_u8(CBS *cbs, uint8_t *out) { - if (cbs->len == 0) { - return 0; - } - *out = cbs->data[cbs->len - 1]; - cbs->len--; - return 1; -} - -int CBS_get_bytes(CBS *cbs, CBS *out, size_t len) { - const uint8_t *v; - if (!cbs_get(cbs, &v, len)) { - return 0; - } - CBS_init(out, v, len); - return 1; -} - -int CBS_copy_bytes(CBS *cbs, uint8_t *out, size_t len) { - const uint8_t *v; - if (!cbs_get(cbs, &v, len)) { - return 0; - } - OPENSSL_memcpy(out, v, len); - return 1; -} - -static int cbs_get_length_prefixed(CBS *cbs, CBS *out, size_t len_len) { - uint64_t len; - if (!cbs_get_u(cbs, &len, len_len)) { - return 0; - } - // If |len_len| <= 3 then we know that |len| will fit into a |size_t|, even on - // 32-bit systems. 
- assert(len_len <= 3); - return CBS_get_bytes(cbs, out, len); -} - -int CBS_get_u8_length_prefixed(CBS *cbs, CBS *out) { - return cbs_get_length_prefixed(cbs, out, 1); -} - -int CBS_get_u16_length_prefixed(CBS *cbs, CBS *out) { - return cbs_get_length_prefixed(cbs, out, 2); -} - -int CBS_get_u24_length_prefixed(CBS *cbs, CBS *out) { - return cbs_get_length_prefixed(cbs, out, 3); -} - -int CBS_get_until_first(CBS *cbs, CBS *out, uint8_t c) { - const uint8_t *split = OPENSSL_memchr(CBS_data(cbs), c, CBS_len(cbs)); - if (split == NULL) { - return 0; - } - return CBS_get_bytes(cbs, out, split - CBS_data(cbs)); -} - -// parse_base128_integer reads a big-endian base-128 integer from |cbs| and sets -// |*out| to the result. This is the encoding used in DER for both high tag -// number form and OID components. -static int parse_base128_integer(CBS *cbs, uint64_t *out) { - uint64_t v = 0; - uint8_t b; - do { - if (!CBS_get_u8(cbs, &b)) { - return 0; - } - if ((v >> (64 - 7)) != 0) { - // The value is too large. - return 0; - } - if (v == 0 && b == 0x80) { - // The value must be minimally encoded. - return 0; - } - v = (v << 7) | (b & 0x7f); - - // Values end at an octet with the high bit cleared. - } while (b & 0x80); - - *out = v; - return 1; -} - -static int parse_asn1_tag(CBS *cbs, unsigned *out) { - uint8_t tag_byte; - if (!CBS_get_u8(cbs, &tag_byte)) { - return 0; - } - - // ITU-T X.690 section 8.1.2.3 specifies the format for identifiers with a tag - // number no greater than 30. - // - // If the number portion is 31 (0x1f, the largest value that fits in the - // allotted bits), then the tag is more than one byte long and the - // continuation bytes contain the tag number. - unsigned tag = ((unsigned)tag_byte & 0xe0) << CBS_ASN1_TAG_SHIFT; - unsigned tag_number = tag_byte & 0x1f; - if (tag_number == 0x1f) { - uint64_t v; - if (!parse_base128_integer(cbs, &v) || - // Check the tag number is within our supported bounds. 
- v > CBS_ASN1_TAG_NUMBER_MASK || - // Small tag numbers should have used low tag number form, even in BER. - v < 0x1f) { - return 0; - } - tag_number = (unsigned)v; - } - - tag |= tag_number; - - // Tag [UNIVERSAL 0] is reserved for use by the encoding. Reject it here to - // avoid some ambiguity around ANY values and BER indefinite-length EOCs. See - // https://crbug.com/boringssl/455. - if ((tag & ~CBS_ASN1_CONSTRUCTED) == 0) { - return 0; - } - - *out = tag; - return 1; -} - -static int cbs_get_any_asn1_element(CBS *cbs, CBS *out, unsigned *out_tag, - size_t *out_header_len, int *out_ber_found, - int *out_indefinite, int ber_ok) { - CBS header = *cbs; - CBS throwaway; - - if (out == NULL) { - out = &throwaway; - } - if (ber_ok) { - *out_ber_found = 0; - *out_indefinite = 0; - } else { - assert(out_ber_found == NULL); - assert(out_indefinite == NULL); - } - - unsigned tag; - if (!parse_asn1_tag(&header, &tag)) { - return 0; - } - if (out_tag != NULL) { - *out_tag = tag; - } - - uint8_t length_byte; - if (!CBS_get_u8(&header, &length_byte)) { - return 0; - } - - size_t header_len = CBS_len(cbs) - CBS_len(&header); - - size_t len; - // The format for the length encoding is specified in ITU-T X.690 section - // 8.1.3. - if ((length_byte & 0x80) == 0) { - // Short form length. - len = ((size_t) length_byte) + header_len; - if (out_header_len != NULL) { - *out_header_len = header_len; - } - } else { - // The high bit indicate that this is the long form, while the next 7 bits - // encode the number of subsequent octets used to encode the length (ITU-T - // X.690 clause 8.1.3.5.b). 
- const size_t num_bytes = length_byte & 0x7f; - uint64_t len64; - - if (ber_ok && (tag & CBS_ASN1_CONSTRUCTED) != 0 && num_bytes == 0) { - // indefinite length - if (out_header_len != NULL) { - *out_header_len = header_len; - } - *out_ber_found = 1; - *out_indefinite = 1; - return CBS_get_bytes(cbs, out, header_len); - } - - // ITU-T X.690 clause 8.1.3.5.c specifies that the value 0xff shall not be - // used as the first byte of the length. If this parser encounters that - // value, num_bytes will be parsed as 127, which will fail this check. - if (num_bytes == 0 || num_bytes > 4) { - return 0; - } - if (!cbs_get_u(&header, &len64, num_bytes)) { - return 0; - } - // ITU-T X.690 section 10.1 (DER length forms) requires encoding the - // length with the minimum number of octets. BER could, technically, have - // 125 superfluous zero bytes. We do not attempt to handle that and still - // require that the length fit in a |uint32_t| for BER. - if (len64 < 128) { - // Length should have used short-form encoding. - if (ber_ok) { - *out_ber_found = 1; - } else { - return 0; - } - } - if ((len64 >> ((num_bytes - 1) * 8)) == 0) { - // Length should have been at least one byte shorter. - if (ber_ok) { - *out_ber_found = 1; - } else { - return 0; - } - } - len = len64; - if (len + header_len + num_bytes < len) { - // Overflow. 
- return 0; - } - len += header_len + num_bytes; - if (out_header_len != NULL) { - *out_header_len = header_len + num_bytes; - } - } - - return CBS_get_bytes(cbs, out, len); -} - -int CBS_get_any_asn1(CBS *cbs, CBS *out, unsigned *out_tag) { - size_t header_len; - if (!CBS_get_any_asn1_element(cbs, out, out_tag, &header_len)) { - return 0; - } - - if (!CBS_skip(out, header_len)) { - assert(0); - return 0; - } - - return 1; -} - -int CBS_get_any_asn1_element(CBS *cbs, CBS *out, unsigned *out_tag, - size_t *out_header_len) { - return cbs_get_any_asn1_element(cbs, out, out_tag, out_header_len, NULL, NULL, - /*ber_ok=*/0); -} - -int CBS_get_any_ber_asn1_element(CBS *cbs, CBS *out, unsigned *out_tag, - size_t *out_header_len, int *out_ber_found, - int *out_indefinite) { - int ber_found_temp; - return cbs_get_any_asn1_element( - cbs, out, out_tag, out_header_len, - out_ber_found ? out_ber_found : &ber_found_temp, out_indefinite, - /*ber_ok=*/1); -} - -static int cbs_get_asn1(CBS *cbs, CBS *out, unsigned tag_value, - int skip_header) { - size_t header_len; - unsigned tag; - CBS throwaway; - - if (out == NULL) { - out = &throwaway; - } - - if (!CBS_get_any_asn1_element(cbs, out, &tag, &header_len) || - tag != tag_value) { - return 0; - } - - if (skip_header && !CBS_skip(out, header_len)) { - assert(0); - return 0; - } - - return 1; -} - -int CBS_get_asn1(CBS *cbs, CBS *out, unsigned tag_value) { - return cbs_get_asn1(cbs, out, tag_value, 1 /* skip header */); -} - -int CBS_get_asn1_element(CBS *cbs, CBS *out, unsigned tag_value) { - return cbs_get_asn1(cbs, out, tag_value, 0 /* include header */); -} - -int CBS_peek_asn1_tag(const CBS *cbs, unsigned tag_value) { - if (CBS_len(cbs) < 1) { - return 0; - } - - CBS copy = *cbs; - unsigned actual_tag; - return parse_asn1_tag(©, &actual_tag) && tag_value == actual_tag; -} - -int CBS_get_asn1_uint64(CBS *cbs, uint64_t *out) { - CBS bytes; - if (!CBS_get_asn1(cbs, &bytes, CBS_ASN1_INTEGER) || - 
!CBS_is_unsigned_asn1_integer(&bytes)) { - return 0; - } - - *out = 0; - const uint8_t *data = CBS_data(&bytes); - size_t len = CBS_len(&bytes); - for (size_t i = 0; i < len; i++) { - if ((*out >> 56) != 0) { - // Too large to represent as a uint64_t. - return 0; - } - *out <<= 8; - *out |= data[i]; - } - - return 1; -} - -int CBS_get_asn1_int64(CBS *cbs, int64_t *out) { - int is_negative; - CBS bytes; - if (!CBS_get_asn1(cbs, &bytes, CBS_ASN1_INTEGER) || - !CBS_is_valid_asn1_integer(&bytes, &is_negative)) { - return 0; - } - const uint8_t *data = CBS_data(&bytes); - const size_t len = CBS_len(&bytes); - if (len > sizeof(int64_t)) { - return 0; - } - uint8_t sign_extend[sizeof(int64_t)]; - memset(sign_extend, is_negative ? 0xff : 0, sizeof(sign_extend)); - for (size_t i = 0; i < len; i++) { - sign_extend[i] = data[len - i - 1]; - } - memcpy(out, sign_extend, sizeof(sign_extend)); - return 1; -} - -int CBS_get_asn1_bool(CBS *cbs, int *out) { - CBS bytes; - if (!CBS_get_asn1(cbs, &bytes, CBS_ASN1_BOOLEAN) || - CBS_len(&bytes) != 1) { - return 0; - } - - const uint8_t value = *CBS_data(&bytes); - if (value != 0 && value != 0xff) { - return 0; - } - - *out = !!value; - return 1; -} - -int CBS_get_optional_asn1(CBS *cbs, CBS *out, int *out_present, unsigned tag) { - int present = 0; - - if (CBS_peek_asn1_tag(cbs, tag)) { - if (!CBS_get_asn1(cbs, out, tag)) { - return 0; - } - present = 1; - } - - if (out_present != NULL) { - *out_present = present; - } - - return 1; -} - -int CBS_get_optional_asn1_octet_string(CBS *cbs, CBS *out, int *out_present, - unsigned tag) { - CBS child; - int present; - if (!CBS_get_optional_asn1(cbs, &child, &present, tag)) { - return 0; - } - if (present) { - assert(out); - if (!CBS_get_asn1(&child, out, CBS_ASN1_OCTETSTRING) || - CBS_len(&child) != 0) { - return 0; - } - } else { - CBS_init(out, NULL, 0); - } - if (out_present) { - *out_present = present; - } - return 1; -} - -int CBS_get_optional_asn1_uint64(CBS *cbs, uint64_t *out, unsigned 
tag, - uint64_t default_value) { - CBS child; - int present; - if (!CBS_get_optional_asn1(cbs, &child, &present, tag)) { - return 0; - } - if (present) { - if (!CBS_get_asn1_uint64(&child, out) || - CBS_len(&child) != 0) { - return 0; - } - } else { - *out = default_value; - } - return 1; -} - -int CBS_get_optional_asn1_bool(CBS *cbs, int *out, unsigned tag, - int default_value) { - CBS child, child2; - int present; - if (!CBS_get_optional_asn1(cbs, &child, &present, tag)) { - return 0; - } - if (present) { - uint8_t boolean; - - if (!CBS_get_asn1(&child, &child2, CBS_ASN1_BOOLEAN) || - CBS_len(&child2) != 1 || - CBS_len(&child) != 0) { - return 0; - } - - boolean = CBS_data(&child2)[0]; - if (boolean == 0) { - *out = 0; - } else if (boolean == 0xff) { - *out = 1; - } else { - return 0; - } - } else { - *out = default_value; - } - return 1; -} - -int CBS_is_valid_asn1_bitstring(const CBS *cbs) { - CBS in = *cbs; - uint8_t num_unused_bits; - if (!CBS_get_u8(&in, &num_unused_bits) || - num_unused_bits > 7) { - return 0; - } - - if (num_unused_bits == 0) { - return 1; - } - - // All num_unused_bits bits must exist and be zeros. - uint8_t last; - if (!CBS_get_last_u8(&in, &last) || - (last & ((1 << num_unused_bits) - 1)) != 0) { - return 0; - } - - return 1; -} - -int CBS_asn1_bitstring_has_bit(const CBS *cbs, unsigned bit) { - if (!CBS_is_valid_asn1_bitstring(cbs)) { - return 0; - } - - const unsigned byte_num = (bit >> 3) + 1; - const unsigned bit_num = 7 - (bit & 7); - - // Unused bits are zero, and this function does not distinguish between - // missing and unset bits. Thus it is sufficient to do a byte-level length - // check. - return byte_num < CBS_len(cbs) && - (CBS_data(cbs)[byte_num] & (1 << bit_num)) != 0; -} - -int CBS_is_valid_asn1_integer(const CBS *cbs, int *out_is_negative) { - CBS copy = *cbs; - uint8_t first_byte, second_byte; - if (!CBS_get_u8(©, &first_byte)) { - return 0; // INTEGERs may not be empty. 
- } - if (out_is_negative != NULL) { - *out_is_negative = (first_byte & 0x80) != 0; - } - if (!CBS_get_u8(©, &second_byte)) { - return 1; // One byte INTEGERs are always minimal. - } - if ((first_byte == 0x00 && (second_byte & 0x80) == 0) || - (first_byte == 0xff && (second_byte & 0x80) != 0)) { - return 0; // The value is minimal iff the first 9 bits are not all equal. - } - return 1; -} - -int CBS_is_unsigned_asn1_integer(const CBS *cbs) { - int is_negative; - return CBS_is_valid_asn1_integer(cbs, &is_negative) && !is_negative; -} - -static int add_decimal(CBB *out, uint64_t v) { - char buf[DECIMAL_SIZE(uint64_t) + 1]; - BIO_snprintf(buf, sizeof(buf), "%" PRIu64, v); - return CBB_add_bytes(out, (const uint8_t *)buf, strlen(buf)); -} - -char *CBS_asn1_oid_to_text(const CBS *cbs) { - CBB cbb; - if (!CBB_init(&cbb, 32)) { - goto err; - } - - CBS copy = *cbs; - // The first component is 40 * value1 + value2, where value1 is 0, 1, or 2. - uint64_t v; - if (!parse_base128_integer(©, &v)) { - goto err; - } - - if (v >= 80) { - if (!CBB_add_bytes(&cbb, (const uint8_t *)"2.", 2) || - !add_decimal(&cbb, v - 80)) { - goto err; - } - } else if (!add_decimal(&cbb, v / 40) || - !CBB_add_u8(&cbb, '.') || - !add_decimal(&cbb, v % 40)) { - goto err; - } - - while (CBS_len(©) != 0) { - if (!parse_base128_integer(©, &v) || - !CBB_add_u8(&cbb, '.') || - !add_decimal(&cbb, v)) { - goto err; - } - } - - uint8_t *txt; - size_t txt_len; - if (!CBB_add_u8(&cbb, '\0') || - !CBB_finish(&cbb, &txt, &txt_len)) { - goto err; - } - - return (char *)txt; - -err: - CBB_cleanup(&cbb); - return NULL; -} - -static int cbs_get_two_digits(CBS *cbs, int *out) { - uint8_t first_digit, second_digit; - if (!CBS_get_u8(cbs, &first_digit)) { - return 0; - } - if (!isdigit(first_digit)) { - return 0; - } - if (!CBS_get_u8(cbs, &second_digit)) { - return 0; - } - if (!isdigit(second_digit)) { - return 0; - } - *out = (first_digit - '0') * 10 + (second_digit - '0'); - return 1; -} - -static int 
is_valid_day(int year, int month, int day) { - if (day < 1) { - return 0; - } - switch (month) { - case 1: - case 3: - case 5: - case 7: - case 8: - case 10: - case 12: - return day <= 31; - case 4: - case 6: - case 9: - case 11: - return day <= 30; - case 2: - if ((year % 4 == 0 && year % 100 != 0) || year % 400 == 0) { - return day <= 29; - } else { - return day <= 28; - } - default: - return 0; - } -} - -static int CBS_parse_rfc5280_time_internal(const CBS *cbs, int is_gentime, - int allow_timezone_offset, - struct tm *out_tm) { - int year, month, day, hour, min, sec, tmp; - CBS copy = *cbs; - uint8_t tz; - - if (is_gentime) { - if (!cbs_get_two_digits(©, &tmp)) { - return 0; - } - year = tmp * 100; - if (!cbs_get_two_digits(©, &tmp)) { - return 0; - } - year += tmp; - } else { - year = 1900; - if (!cbs_get_two_digits(©, &tmp)) { - return 0; - } - year += tmp; - if (year < 1950) { - year += 100; - } - if (year >= 2050) { - return 0; // A Generalized time must be used. - } - } - if (!cbs_get_two_digits(©, &month) || month < 1 || - month > 12 || // Reject invalid months. - !cbs_get_two_digits(©, &day) || - !is_valid_day(year, month, day) || // Reject invalid days. - !cbs_get_two_digits(©, &hour) || - hour > 23 || // Reject invalid hours. - !cbs_get_two_digits(©, &min) || - min > 59 || // Reject invalid minutes. - !cbs_get_two_digits(©, &sec) || sec > 59 || !CBS_get_u8(©, &tz)) { - return 0; - } - - int offset_sign = 0; - switch (tz) { - case 'Z': - break; // We correctly have 'Z' on the end as per spec. - case '+': - offset_sign = 1; - break; // Should not be allowed per RFC 5280. - case '-': - offset_sign = -1; - break; // Should not be allowed per RFC 5280. - default: - return 0; // Reject anything else after the time. - } - - // If allow_timezone_offset is non-zero, allow for a four digit timezone - // offset to be specified even though this is not allowed by RFC 5280. 
We are - // permissive of this for UTCTimes due to the unfortunate existence of - // artisinally rolled long lived certificates that were baked into places that - // are now difficult to change. These certificates were generated with the - // 'openssl' command that permissively allowed the creation of certificates - // with notBefore and notAfter times specified as strings for direct - // certificate inclusion on the command line. For context see cl/237068815. - // - // TODO(bbe): This has been expunged from public web-pki as the ecosystem has - // managed to encourage CA compliance with standards. We should find a way to - // get rid of this or make it off by default. - int offset_seconds = 0; - if (offset_sign != 0) { - if (!allow_timezone_offset) { - return 0; - } - int offset_hours, offset_minutes; - if (!cbs_get_two_digits(©, &offset_hours) || - offset_hours > 23 || // Reject invalid hours. - !cbs_get_two_digits(©, &offset_minutes) || - offset_minutes > 59) { // Reject invalid minutes. - return 0; - } - offset_seconds = offset_sign * (offset_hours * 3600 + offset_minutes * 60); - } - - if (CBS_len(©) != 0) { - return 0; // Reject invalid lengths. - } - - if (out_tm != NULL) { - // Fill in the tm fields corresponding to what we validated. 
- out_tm->tm_year = year - 1900; - out_tm->tm_mon = month - 1; - out_tm->tm_mday = day; - out_tm->tm_hour = hour; - out_tm->tm_min = min; - out_tm->tm_sec = sec; - if (offset_seconds && !OPENSSL_gmtime_adj(out_tm, 0, offset_seconds)) { - return 0; - } - } - return 1; -} - -int CBS_parse_generalized_time(const CBS *cbs, struct tm *out_tm, - int allow_timezone_offset) { - return CBS_parse_rfc5280_time_internal(cbs, 1, allow_timezone_offset, out_tm); -} - -int CBS_parse_utc_time(const CBS *cbs, struct tm *out_tm, - int allow_timezone_offset) { - return CBS_parse_rfc5280_time_internal(cbs, 0, allow_timezone_offset, out_tm); -} diff --git a/third_party/boringssl/src/crypto/bytestring/cbs.cc b/third_party/boringssl/src/crypto/bytestring/cbs.cc new file mode 100644 index 00000000..dbaa3e8a --- /dev/null +++ b/third_party/boringssl/src/crypto/bytestring/cbs.cc @@ -0,0 +1,984 @@ +// Copyright 2014 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include + +#include +#include +#include +#include + +#include "../asn1/internal.h" +#include "../internal.h" +#include "internal.h" + + +using namespace bssl; + +static int cbs_get(CBS *cbs, const uint8_t **p, size_t n) { + if (cbs->len < n) { + return 0; + } + + *p = cbs->data; + cbs->data += n; + cbs->len -= n; + return 1; +} + +int CBS_skip(CBS *cbs, size_t len) { + const uint8_t *dummy; + return cbs_get(cbs, &dummy, len); +} + +int CBS_stow(const CBS *cbs, uint8_t **out_ptr, size_t *out_len) { + OPENSSL_free(*out_ptr); + *out_ptr = nullptr; + *out_len = 0; + + if (cbs->len == 0) { + return 1; + } + *out_ptr = reinterpret_cast(OPENSSL_memdup(cbs->data, cbs->len)); + if (*out_ptr == nullptr) { + return 0; + } + *out_len = cbs->len; + return 1; +} + +int CBS_strdup(const CBS *cbs, char **out_ptr) { + if (*out_ptr != nullptr) { + OPENSSL_free(*out_ptr); + } + *out_ptr = OPENSSL_strndup((const char *)cbs->data, cbs->len); + return (*out_ptr != nullptr); +} + +int CBS_contains_zero_byte(const CBS *cbs) { + return OPENSSL_memchr(cbs->data, 0, cbs->len) != nullptr; +} + +int CBS_mem_equal(const CBS *cbs, const uint8_t *data, size_t len) { + if (len != cbs->len) { + return 0; + } + return CRYPTO_memcmp(cbs->data, data, len) == 0; +} + +static int cbs_get_u(CBS *cbs, uint64_t *out, size_t len) { + uint64_t result = 0; + const uint8_t *data; + + if (!cbs_get(cbs, &data, len)) { + return 0; + } + for (size_t i = 0; i < len; i++) { + result <<= 8; + result |= data[i]; + } + *out = result; + return 1; +} + +int CBS_get_u8(CBS *cbs, uint8_t *out) { + const uint8_t *v; + if (!cbs_get(cbs, &v, 1)) { + return 0; + } + *out = *v; + return 1; +} + +int CBS_get_u16(CBS *cbs, uint16_t *out) { + uint64_t v; + if (!cbs_get_u(cbs, &v, 2)) { + return 0; + } + *out = v; + return 1; +} + +int CBS_get_u16le(CBS *cbs, uint16_t *out) { + if (!CBS_get_u16(cbs, out)) { + return 0; + } + *out = CRYPTO_bswap2(*out); + return 1; +} + +int CBS_get_u24(CBS *cbs, uint32_t 
*out) { + uint64_t v; + if (!cbs_get_u(cbs, &v, 3)) { + return 0; + } + *out = (uint32_t)v; + return 1; +} + +int CBS_get_u32(CBS *cbs, uint32_t *out) { + uint64_t v; + if (!cbs_get_u(cbs, &v, 4)) { + return 0; + } + *out = (uint32_t)v; + return 1; +} + +int CBS_get_u32le(CBS *cbs, uint32_t *out) { + if (!CBS_get_u32(cbs, out)) { + return 0; + } + *out = CRYPTO_bswap4(*out); + return 1; +} + +int CBS_get_u64(CBS *cbs, uint64_t *out) { return cbs_get_u(cbs, out, 8); } + +int CBS_get_u64le(CBS *cbs, uint64_t *out) { + if (!cbs_get_u(cbs, out, 8)) { + return 0; + } + *out = CRYPTO_bswap8(*out); + return 1; +} + +int CBS_get_last_u8(CBS *cbs, uint8_t *out) { + if (cbs->len == 0) { + return 0; + } + *out = cbs->data[cbs->len - 1]; + cbs->len--; + return 1; +} + +int CBS_get_bytes(CBS *cbs, CBS *out, size_t len) { + const uint8_t *v; + if (!cbs_get(cbs, &v, len)) { + return 0; + } + CBS_init(out, v, len); + return 1; +} + +int CBS_copy_bytes(CBS *cbs, uint8_t *out, size_t len) { + const uint8_t *v; + if (!cbs_get(cbs, &v, len)) { + return 0; + } + OPENSSL_memcpy(out, v, len); + return 1; +} + +static int cbs_get_length_prefixed(CBS *cbs, CBS *out, size_t len_len) { + uint64_t len; + if (!cbs_get_u(cbs, &len, len_len)) { + return 0; + } + // If |len_len| <= 3 then we know that |len| will fit into a |size_t|, even on + // 32-bit systems. 
+ assert(len_len <= 3); + return CBS_get_bytes(cbs, out, len); +} + +int CBS_get_u8_length_prefixed(CBS *cbs, CBS *out) { + return cbs_get_length_prefixed(cbs, out, 1); +} + +int CBS_get_u16_length_prefixed(CBS *cbs, CBS *out) { + return cbs_get_length_prefixed(cbs, out, 2); +} + +int CBS_get_u24_length_prefixed(CBS *cbs, CBS *out) { + return cbs_get_length_prefixed(cbs, out, 3); +} + +int CBS_get_until_first(CBS *cbs, CBS *out, uint8_t c) { + const uint8_t *split = reinterpret_cast( + OPENSSL_memchr(CBS_data(cbs), c, CBS_len(cbs))); + if (split == nullptr) { + return 0; + } + return CBS_get_bytes(cbs, out, split - CBS_data(cbs)); +} + +int CBS_get_until_first_of(CBS *cbs, CBS *out, const char *chars) { + size_t pos = 0; + while (pos < CBS_len(cbs)) { + uint8_t c = CBS_data(cbs)[pos]; + // Special-case for \0 characters. We don't want to match on a null byte, + // even though strchr will happily return the \0 at the end of `chars`. + if (!c || !strchr(chars, c)) { + pos++; + } else { + break; + } + } + if (pos == CBS_len(cbs)) { + return 0; + } + return CBS_get_bytes(cbs, out, pos); +} + +int CBS_get_until_first_not_of(CBS *cbs, CBS *out, const char *chars) { + size_t pos = 0; + while (pos < CBS_len(cbs)) { + uint8_t c = CBS_data(cbs)[pos]; + // Special-case for \0 characters. We don't want to match on a null byte, + // even though strchr will happily return the \0 at the end of `chars`. + if (c && strchr(chars, c)) { + pos++; + } else { + break; + } + } + if (pos == CBS_len(cbs)) { + return 0; + } + return CBS_get_bytes(cbs, out, pos); +} + +int CBS_get_u64_decimal(CBS *cbs, uint64_t *out) { + uint64_t v = 0; + int seen_digit = 0; + while (CBS_len(cbs) != 0) { + uint8_t c = CBS_data(cbs)[0]; + if (!OPENSSL_isdigit(c)) { + break; + } + CBS_skip(cbs, 1); + if (/* Forbid stray leading zeros */ + (v == 0 && seen_digit) || + // Check for overflow. 
+ v > UINT64_MAX / 10 || // + v * 10 > UINT64_MAX - (c - '0')) { + return 0; + } + v = v * 10 + (c - '0'); + seen_digit = 1; + } + + *out = v; + return seen_digit; +} + +// parse_base128_integer reads a big-endian base-128 integer from |cbs| and sets +// |*out| to the result. This is the encoding used in DER for both high tag +// number form and OID components. +static int parse_base128_integer(CBS *cbs, uint64_t *out) { + uint64_t v = 0; + uint8_t b; + do { + if (!CBS_get_u8(cbs, &b)) { + return 0; + } + if ((v >> (64 - 7)) != 0) { + // The value is too large. + return 0; + } + if (v == 0 && b == 0x80) { + // The value must be minimally encoded. + return 0; + } + v = (v << 7) | (b & 0x7f); + + // Values end at an octet with the high bit cleared. + } while (b & 0x80); + + *out = v; + return 1; +} + +static int parse_asn1_tag(CBS *cbs, CBS_ASN1_TAG *out) { + uint8_t tag_byte; + if (!CBS_get_u8(cbs, &tag_byte)) { + return 0; + } + + // ITU-T X.690 section 8.1.2.3 specifies the format for identifiers with a tag + // number no greater than 30. + // + // If the number portion is 31 (0x1f, the largest value that fits in the + // allotted bits), then the tag is more than one byte long and the + // continuation bytes contain the tag number. + CBS_ASN1_TAG tag = ((CBS_ASN1_TAG)tag_byte & 0xe0) << CBS_ASN1_TAG_SHIFT; + CBS_ASN1_TAG tag_number = tag_byte & 0x1f; + if (tag_number == 0x1f) { + uint64_t v; + if (!parse_base128_integer(cbs, &v) || + // Check the tag number is within our supported bounds. + v > CBS_ASN1_TAG_NUMBER_MASK || + // Small tag numbers should have used low tag number form, even in BER. + v < 0x1f) { + return 0; + } + tag_number = (CBS_ASN1_TAG)v; + } + + tag |= tag_number; + + // Tag [UNIVERSAL 0] is reserved for use by the encoding. Reject it here to + // avoid some ambiguity around ANY values and BER indefinite-length EOCs. See + // https://crbug.com/boringssl/455. 
+ if ((tag & ~CBS_ASN1_CONSTRUCTED) == 0) { + return 0; + } + + *out = tag; + return 1; +} + +static int cbs_get_any_asn1_element(CBS *cbs, CBS *out, CBS_ASN1_TAG *out_tag, + size_t *out_header_len, int *out_ber_found, + int *out_indefinite, int ber_ok) { + CBS header = *cbs; + CBS throwaway; + + if (out == nullptr) { + out = &throwaway; + } + if (ber_ok) { + *out_ber_found = 0; + *out_indefinite = 0; + } else { + assert(out_ber_found == nullptr); + assert(out_indefinite == nullptr); + } + + CBS_ASN1_TAG tag; + if (!parse_asn1_tag(&header, &tag)) { + return 0; + } + if (out_tag != nullptr) { + *out_tag = tag; + } + + uint8_t length_byte; + if (!CBS_get_u8(&header, &length_byte)) { + return 0; + } + + size_t header_len = CBS_len(cbs) - CBS_len(&header); + + size_t len; + // The format for the length encoding is specified in ITU-T X.690 section + // 8.1.3. + if ((length_byte & 0x80) == 0) { + // Short form length. + len = ((size_t)length_byte) + header_len; + if (out_header_len != nullptr) { + *out_header_len = header_len; + } + } else { + // The high bit indicate that this is the long form, while the next 7 bits + // encode the number of subsequent octets used to encode the length (ITU-T + // X.690 clause 8.1.3.5.b). + const size_t num_bytes = length_byte & 0x7f; + uint64_t len64; + + if (ber_ok && (tag & CBS_ASN1_CONSTRUCTED) != 0 && num_bytes == 0) { + // indefinite length + if (out_header_len != nullptr) { + *out_header_len = header_len; + } + *out_ber_found = 1; + *out_indefinite = 1; + return CBS_get_bytes(cbs, out, header_len); + } + + // ITU-T X.690 clause 8.1.3.5.c specifies that the value 0xff shall not be + // used as the first byte of the length. If this parser encounters that + // value, num_bytes will be parsed as 127, which will fail this check. 
+ if (num_bytes == 0 || num_bytes > 4) { + return 0; + } + if (!cbs_get_u(&header, &len64, num_bytes)) { + return 0; + } + // ITU-T X.690 section 10.1 (DER length forms) requires encoding the + // length with the minimum number of octets. BER could, technically, have + // 125 superfluous zero bytes. We do not attempt to handle that and still + // require that the length fit in a |uint32_t| for BER. + if (len64 < 128) { + // Length should have used short-form encoding. + if (ber_ok) { + *out_ber_found = 1; + } else { + return 0; + } + } + if ((len64 >> ((num_bytes - 1) * 8)) == 0) { + // Length should have been at least one byte shorter. + if (ber_ok) { + *out_ber_found = 1; + } else { + return 0; + } + } + len = len64; + if (len + header_len + num_bytes < len) { + // Overflow. + return 0; + } + len += header_len + num_bytes; + if (out_header_len != nullptr) { + *out_header_len = header_len + num_bytes; + } + } + + return CBS_get_bytes(cbs, out, len); +} + +int CBS_get_any_asn1(CBS *cbs, CBS *out, CBS_ASN1_TAG *out_tag) { + size_t header_len; + if (!CBS_get_any_asn1_element(cbs, out, out_tag, &header_len)) { + return 0; + } + + if (out && !CBS_skip(out, header_len)) { + assert(0); + return 0; + } + + return 1; +} + +int CBS_get_any_asn1_element(CBS *cbs, CBS *out, CBS_ASN1_TAG *out_tag, + size_t *out_header_len) { + return cbs_get_any_asn1_element(cbs, out, out_tag, out_header_len, nullptr, + nullptr, + /*ber_ok=*/0); +} + +int CBS_get_any_ber_asn1_element(CBS *cbs, CBS *out, CBS_ASN1_TAG *out_tag, + size_t *out_header_len, int *out_ber_found, + int *out_indefinite) { + int ber_found_temp; + return cbs_get_any_asn1_element( + cbs, out, out_tag, out_header_len, + out_ber_found ? 
out_ber_found : &ber_found_temp, out_indefinite, + /*ber_ok=*/1); +} + +static int cbs_get_asn1(CBS *cbs, CBS *out, CBS_ASN1_TAG tag_value, + int skip_header) { + size_t header_len; + CBS_ASN1_TAG tag; + CBS throwaway; + + if (out == nullptr) { + out = &throwaway; + } + + if (!CBS_get_any_asn1_element(cbs, out, &tag, &header_len) || + tag != tag_value) { + return 0; + } + + if (skip_header && !CBS_skip(out, header_len)) { + assert(0); + return 0; + } + + return 1; +} + +int CBS_get_asn1(CBS *cbs, CBS *out, CBS_ASN1_TAG tag_value) { + return cbs_get_asn1(cbs, out, tag_value, 1 /* skip header */); +} + +int CBS_get_asn1_element(CBS *cbs, CBS *out, CBS_ASN1_TAG tag_value) { + return cbs_get_asn1(cbs, out, tag_value, 0 /* include header */); +} + +int CBS_peek_asn1_tag(const CBS *cbs, CBS_ASN1_TAG tag_value) { + CBS copy = *cbs; + CBS_ASN1_TAG actual_tag; + return parse_asn1_tag(©, &actual_tag) && tag_value == actual_tag; +} + +int CBS_get_asn1_uint64(CBS *cbs, uint64_t *out) { + return CBS_get_asn1_uint64_with_tag(cbs, out, CBS_ASN1_INTEGER); +} + +int CBS_get_asn1_uint64_with_tag(CBS *cbs, uint64_t *out, CBS_ASN1_TAG tag) { + CBS bytes; + if (!CBS_get_asn1(cbs, &bytes, tag) || + !CBS_is_unsigned_asn1_integer(&bytes)) { + return 0; + } + + *out = 0; + const uint8_t *data = CBS_data(&bytes); + size_t len = CBS_len(&bytes); + for (size_t i = 0; i < len; i++) { + if ((*out >> 56) != 0) { + // Too large to represent as a uint64_t. 
+ return 0; + } + *out <<= 8; + *out |= data[i]; + } + + return 1; +} + +int CBS_get_asn1_int64(CBS *cbs, int64_t *out) { + return CBS_get_asn1_int64_with_tag(cbs, out, CBS_ASN1_INTEGER); +} + +int CBS_get_asn1_int64_with_tag(CBS *cbs, int64_t *out, CBS_ASN1_TAG tag) { + int is_negative; + CBS bytes; + if (!CBS_get_asn1(cbs, &bytes, tag) || + !CBS_is_valid_asn1_integer(&bytes, &is_negative)) { + return 0; + } + const uint8_t *data = CBS_data(&bytes); + const size_t len = CBS_len(&bytes); + if (len > sizeof(int64_t)) { + return 0; + } + uint8_t sign_extend[sizeof(int64_t)]; + OPENSSL_memset(sign_extend, is_negative ? 0xff : 0, sizeof(sign_extend)); + OPENSSL_memcpy(sign_extend + sizeof(int64_t) - len, data, len); + *out = CRYPTO_load_u64_be(sign_extend); + return 1; +} + +int CBS_get_asn1_bool(CBS *cbs, int *out) { + CBS bytes; + if (!CBS_get_asn1(cbs, &bytes, CBS_ASN1_BOOLEAN) || CBS_len(&bytes) != 1) { + return 0; + } + + const uint8_t value = *CBS_data(&bytes); + if (value != 0 && value != 0xff) { + return 0; + } + + *out = !!value; + return 1; +} + +int CBS_get_optional_asn1(CBS *cbs, CBS *out, int *out_present, + CBS_ASN1_TAG tag) { + int present = 0; + + if (CBS_peek_asn1_tag(cbs, tag)) { + if (!CBS_get_asn1(cbs, out, tag)) { + return 0; + } + present = 1; + } + + if (out_present != nullptr) { + *out_present = present; + } + + return 1; +} + +int CBS_get_optional_asn1_octet_string(CBS *cbs, CBS *out, int *out_present, + CBS_ASN1_TAG tag) { + CBS child; + int present; + if (!CBS_get_optional_asn1(cbs, &child, &present, tag)) { + return 0; + } + if (present) { + assert(out); + if (!CBS_get_asn1(&child, out, CBS_ASN1_OCTETSTRING) || + CBS_len(&child) != 0) { + return 0; + } + } else { + CBS_init(out, nullptr, 0); + } + if (out_present) { + *out_present = present; + } + return 1; +} + +int CBS_get_optional_asn1_uint64(CBS *cbs, uint64_t *out, CBS_ASN1_TAG tag, + uint64_t default_value) { + CBS child; + int present; + if (!CBS_get_optional_asn1(cbs, &child, 
&present, tag)) { + return 0; + } + if (present) { + if (!CBS_get_asn1_uint64(&child, out) || CBS_len(&child) != 0) { + return 0; + } + } else { + *out = default_value; + } + return 1; +} + +int CBS_get_optional_asn1_bool(CBS *cbs, int *out, CBS_ASN1_TAG tag, + int default_value) { + CBS child, child2; + int present; + if (!CBS_get_optional_asn1(cbs, &child, &present, tag)) { + return 0; + } + if (present) { + uint8_t boolean; + + if (!CBS_get_asn1(&child, &child2, CBS_ASN1_BOOLEAN) || + CBS_len(&child2) != 1 || CBS_len(&child) != 0) { + return 0; + } + + boolean = CBS_data(&child2)[0]; + if (boolean == 0) { + *out = 0; + } else if (boolean == 0xff) { + *out = 1; + } else { + return 0; + } + } else { + *out = default_value; + } + return 1; +} + +int CBS_is_valid_asn1_bitstring(const CBS *cbs) { + CBS in = *cbs; + uint8_t num_unused_bits; + if (!CBS_get_u8(&in, &num_unused_bits) || num_unused_bits > 7) { + return 0; + } + + if (num_unused_bits == 0) { + return 1; + } + + // All num_unused_bits bits must exist and be zeros. + uint8_t last; + if (!CBS_get_last_u8(&in, &last) || + (last & ((1 << num_unused_bits) - 1)) != 0) { + return 0; + } + + return 1; +} + +int CBS_asn1_bitstring_has_bit(const CBS *cbs, unsigned bit) { + if (!CBS_is_valid_asn1_bitstring(cbs)) { + return 0; + } + + const unsigned byte_num = (bit >> 3) + 1; + const unsigned bit_num = 7 - (bit & 7); + + // Unused bits are zero, and this function does not distinguish between + // missing and unset bits. Thus it is sufficient to do a byte-level length + // check. + return byte_num < CBS_len(cbs) && + (CBS_data(cbs)[byte_num] & (1 << bit_num)) != 0; +} + +int CBS_is_valid_asn1_integer(const CBS *cbs, int *out_is_negative) { + CBS copy = *cbs; + uint8_t first_byte, second_byte; + if (!CBS_get_u8(©, &first_byte)) { + return 0; // INTEGERs may not be empty. 
+ } + if (out_is_negative != nullptr) { + *out_is_negative = (first_byte & 0x80) != 0; + } + if (!CBS_get_u8(©, &second_byte)) { + return 1; // One byte INTEGERs are always minimal. + } + if ((first_byte == 0x00 && (second_byte & 0x80) == 0) || + (first_byte == 0xff && (second_byte & 0x80) != 0)) { + return 0; // The value is minimal iff the first 9 bits are not all equal. + } + return 1; +} + +int CBS_is_unsigned_asn1_integer(const CBS *cbs) { + int is_negative; + return CBS_is_valid_asn1_integer(cbs, &is_negative) && !is_negative; +} + +static int add_decimal(CBB *out, uint64_t v) { + char buf[DECIMAL_SIZE(uint64_t) + 1]; + snprintf(buf, sizeof(buf), "%" PRIu64, v); + return CBB_add_bytes(out, (const uint8_t *)buf, strlen(buf)); +} + +int CBS_is_valid_asn1_oid(const CBS *cbs) { + if (CBS_len(cbs) == 0) { + return 0; // OID encodings cannot be empty. + } + + CBS copy = *cbs; + uint8_t v, prev = 0; + while (CBS_get_u8(©, &v)) { + // OID encodings are a sequence of minimally-encoded base-128 integers (see + // |parse_base128_integer|). If |prev|'s MSB was clear, it was the last byte + // of an integer (or |v| is the first byte). |v| is then the first byte of + // the next integer. If first byte of an integer is 0x80, it is not + // minimally-encoded. + if ((prev & 0x80) == 0 && v == 0x80) { + return 0; + } + prev = v; + } + + // The last byte should must end an integer encoding. + return (prev & 0x80) == 0; +} + +char *CBS_asn1_oid_to_text(const CBS *cbs) { + CBS copy = *cbs; + CBB cbb; + if (!CBB_init(&cbb, 32)) { + goto err; + } + + // The first component is 40 * value1 + value2, where value1 is 0, 1, or 2. 
+ uint64_t v; + if (!parse_base128_integer(©, &v)) { + goto err; + } + + if (v >= 80) { + if (!CBB_add_bytes(&cbb, (const uint8_t *)"2.", 2) || + !add_decimal(&cbb, v - 80)) { + goto err; + } + } else if (!add_decimal(&cbb, v / 40) || !CBB_add_u8(&cbb, '.') || + !add_decimal(&cbb, v % 40)) { + goto err; + } + + while (CBS_len(©) != 0) { + if (!parse_base128_integer(©, &v) || !CBB_add_u8(&cbb, '.') || + !add_decimal(&cbb, v)) { + goto err; + } + } + + uint8_t *txt; + size_t txt_len; + if (!CBB_add_u8(&cbb, '\0') || !CBB_finish(&cbb, &txt, &txt_len)) { + goto err; + } + + return (char *)txt; + +err: + CBB_cleanup(&cbb); + return nullptr; +} + +int CBS_is_valid_asn1_relative_oid(const CBS *cbs) { + return CBS_is_valid_asn1_oid(cbs); +} + +char *CBS_asn1_relative_oid_to_text(const CBS *cbs) { + CBS copy = *cbs; + ScopedCBB cbb; + if (!CBB_init(cbb.get(), 32)) { + return nullptr; + } + + // Relative OIDs must have at least one component. + uint64_t v; + if (!parse_base128_integer(©, &v) || !add_decimal(cbb.get(), v)) { + return nullptr; + } + + while (CBS_len(©) != 0) { + if (!parse_base128_integer(©, &v) || !CBB_add_u8(cbb.get(), '.') || + !add_decimal(cbb.get(), v)) { + return nullptr; + } + } + + uint8_t *txt; + size_t txt_len; + if (!CBB_add_u8(cbb.get(), '\0') || !CBB_finish(cbb.get(), &txt, &txt_len)) { + return nullptr; + } + + return reinterpret_cast(txt); +} + +static int cbs_get_two_digits(CBS *cbs, int *out) { + uint8_t first_digit, second_digit; + if (!CBS_get_u8(cbs, &first_digit)) { + return 0; + } + if (!OPENSSL_isdigit(first_digit)) { + return 0; + } + if (!CBS_get_u8(cbs, &second_digit)) { + return 0; + } + if (!OPENSSL_isdigit(second_digit)) { + return 0; + } + *out = (first_digit - '0') * 10 + (second_digit - '0'); + return 1; +} + +static int is_valid_day(int year, int month, int day) { + if (day < 1) { + return 0; + } + switch (month) { + case 1: + case 3: + case 5: + case 7: + case 8: + case 10: + case 12: + return day <= 31; + case 4: + case 6: + 
case 9: + case 11: + return day <= 30; + case 2: + if ((year % 4 == 0 && year % 100 != 0) || year % 400 == 0) { + return day <= 29; + } else { + return day <= 28; + } + default: + return 0; + } +} + +static int CBS_parse_rfc5280_time_internal(const CBS *cbs, int is_gentime, + int allow_timezone_offset, + struct tm *out_tm) { + int year, month, day, hour, min, sec, tmp; + CBS copy = *cbs; + uint8_t tz; + + if (is_gentime) { + if (!cbs_get_two_digits(©, &tmp)) { + return 0; + } + year = tmp * 100; + if (!cbs_get_two_digits(©, &tmp)) { + return 0; + } + year += tmp; + } else { + year = 1900; + if (!cbs_get_two_digits(©, &tmp)) { + return 0; + } + year += tmp; + if (year < 1950) { + year += 100; + } + if (year >= 2050) { + return 0; // A Generalized time must be used. + } + } + if (!cbs_get_two_digits(©, &month) || month < 1 || + month > 12 || // Reject invalid months. + !cbs_get_two_digits(©, &day) || + !is_valid_day(year, month, day) || // Reject invalid days. + !cbs_get_two_digits(©, &hour) || + hour > 23 || // Reject invalid hours. + !cbs_get_two_digits(©, &min) || + min > 59 || // Reject invalid minutes. + !cbs_get_two_digits(©, &sec) || sec > 59 || !CBS_get_u8(©, &tz)) { + return 0; + } + + int offset_sign = 0; + switch (tz) { + case 'Z': + break; // We correctly have 'Z' on the end as per spec. + case '+': + offset_sign = 1; + break; // Should not be allowed per RFC 5280. + case '-': + offset_sign = -1; + break; // Should not be allowed per RFC 5280. + default: + return 0; // Reject anything else after the time. + } + + // If allow_timezone_offset is non-zero, allow for a four digit timezone + // offset to be specified even though this is not allowed by RFC 5280. We are + // permissive of this for UTCTimes due to the unfortunate existence of + // artisinally rolled long lived certificates that were baked into places that + // are now difficult to change. 
These certificates were generated with the + // 'openssl' command that permissively allowed the creation of certificates + // with notBefore and notAfter times specified as strings for direct + // certificate inclusion on the command line. For context see cl/237068815. + // + // TODO(bbe): This has been expunged from public web-pki as the ecosystem has + // managed to encourage CA compliance with standards. We should find a way to + // get rid of this or make it off by default. + int offset_seconds = 0; + if (offset_sign != 0) { + if (!allow_timezone_offset) { + return 0; + } + int offset_hours, offset_minutes; + if (!cbs_get_two_digits(©, &offset_hours) || + offset_hours > 23 || // Reject invalid hours. + !cbs_get_two_digits(©, &offset_minutes) || + offset_minutes > 59) { // Reject invalid minutes. + return 0; + } + offset_seconds = offset_sign * (offset_hours * 3600 + offset_minutes * 60); + } + + if (CBS_len(©) != 0) { + return 0; // Reject invalid lengths. + } + + if (out_tm != nullptr) { + // Fill in the tm fields corresponding to what we validated. 
+ out_tm->tm_year = year - 1900; + out_tm->tm_mon = month - 1; + out_tm->tm_mday = day; + out_tm->tm_hour = hour; + out_tm->tm_min = min; + out_tm->tm_sec = sec; + if (offset_seconds && !OPENSSL_gmtime_adj(out_tm, 0, offset_seconds)) { + return 0; + } + } + return 1; +} + +int CBS_parse_generalized_time(const CBS *cbs, struct tm *out_tm, + int allow_timezone_offset) { + return CBS_parse_rfc5280_time_internal(cbs, 1, allow_timezone_offset, out_tm); +} + +int CBS_parse_utc_time(const CBS *cbs, struct tm *out_tm, + int allow_timezone_offset) { + return CBS_parse_rfc5280_time_internal(cbs, 0, allow_timezone_offset, out_tm); +} diff --git a/third_party/boringssl/src/crypto/bytestring/internal.h b/third_party/boringssl/src/crypto/bytestring/internal.h index 7ef0e21c..35bd4c9c 100644 --- a/third_party/boringssl/src/crypto/bytestring/internal.h +++ b/third_party/boringssl/src/crypto/bytestring/internal.h @@ -1,27 +1,31 @@ -/* Copyright (c) 2014, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ +// Copyright 2014 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_CRYPTO_BYTESTRING_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_BYTESTRING_INTERNAL_H -#ifndef OPENSSL_HEADER_BYTESTRING_INTERNAL_H -#define OPENSSL_HEADER_BYTESTRING_INTERNAL_H +#include +#include +#include -#include +#include -#if defined(__cplusplus) -extern "C" { -#endif +#include "../mem_internal.h" +BSSL_NAMESPACE_BEGIN + // CBS_asn1_ber_to_der reads a BER element from |in|. If it finds // indefinite-length elements or constructed strings then it converts the BER // data to DER, sets |out| to the converted contents and |*out_storage| to a @@ -54,8 +58,8 @@ OPENSSL_EXPORT int CBS_asn1_ber_to_der(CBS *in, CBS *out, // It returns one on success and zero otherwise. OPENSSL_EXPORT int CBS_get_asn1_implicit_string(CBS *in, CBS *out, uint8_t **out_storage, - unsigned outer_tag, - unsigned inner_tag); + CBS_ASN1_TAG outer_tag, + CBS_ASN1_TAG inner_tag); // CBB_finish_i2d calls |CBB_finish| on |cbb| which must have been initialized // with |CBB_init|. If |outp| is not NULL then the result is written to |*outp| @@ -64,33 +68,63 @@ OPENSSL_EXPORT int CBS_get_asn1_implicit_string(CBS *in, CBS *out, // error, it calls |CBB_cleanup| on |cbb|. // // This function may be used to help implement legacy i2d ASN.1 functions. -int CBB_finish_i2d(CBB *cbb, uint8_t **outp); - +OPENSSL_EXPORT int CBB_finish_i2d(CBB *cbb, uint8_t **outp); -// Unicode utilities. - -// The following functions read one Unicode code point from |cbs| with the -// corresponding encoding and store it in |*out|. They return one on success and -// zero on error. 
-OPENSSL_EXPORT int cbs_get_utf8(CBS *cbs, uint32_t *out); -OPENSSL_EXPORT int cbs_get_latin1(CBS *cbs, uint32_t *out); -OPENSSL_EXPORT int cbs_get_ucs2_be(CBS *cbs, uint32_t *out); -OPENSSL_EXPORT int cbs_get_utf32_be(CBS *cbs, uint32_t *out); +// CBBAsSpan returns a span containing |cbb|'s contents. It does not flush +// |cbb|. The span is valid until the next operation to |cbb|. +// +// To avoid unfinalized length prefixes, it is a fatal error to call this on a +// CBB with any active children. +inline Span CBBAsSpan(const CBB *cbb) { + return Span(CBB_data(cbb), CBB_len(cbb)); +} -// cbb_get_utf8_len returns the number of bytes needed to represent |u| in -// UTF-8. -OPENSSL_EXPORT size_t cbb_get_utf8_len(uint32_t u); +// CBBFinishArray behaves like |CBB_finish| but stores the result in an Array. +OPENSSL_EXPORT bool CBBFinishArray(CBB *cbb, Array *out); -// The following functions encode |u| to |cbb| with the corresponding -// encoding. They return one on success and zero on error. -OPENSSL_EXPORT int cbb_add_utf8(CBB *cbb, uint32_t u); -OPENSSL_EXPORT int cbb_add_latin1(CBB *cbb, uint32_t u); -OPENSSL_EXPORT int cbb_add_ucs2_be(CBB *cbb, uint32_t u); -OPENSSL_EXPORT int cbb_add_utf32_be(CBB *cbb, uint32_t u); +// D2IFromCBS takes a functor of type |Unique(CBS*)| and implements the d2i +// calling convention. For compatibility with functions that don't tag their +// return value (e.g. public APIs), |T*(CBS)| is also accepted. The callback can +// assume that the |CBS|'s length fits in |long|. The callback should not access +// |out|, |inp|, or |len| directly. 
+template +inline T *D2IFromCBS(T **out, const uint8_t **inp, long len, CBSFunc func) { + static_assert(std::is_invocable_v); + static_assert( + std::is_same_v, UniquePtr> || + std::is_same_v, T *>); + if (len < 0) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_BUFFER_TOO_SMALL); + return nullptr; + } + CBS cbs; + CBS_init(&cbs, *inp, len); + UniquePtr ret(func(&cbs)); + if (ret == nullptr) { + return nullptr; + } + if (out != nullptr) { + UniquePtr free_out(*out); + *out = ret.get(); + } + *inp = CBS_data(&cbs); + return ret.release(); +} +// I2DFromCBB takes a functor of type |bool(CBB*)| and implements the i2d +// calling convention. It internally makes a |CBB| with the specified initial +// capacity. The callback should not access |outp| directly. +template +inline int I2DFromCBB(size_t initial_capacity, uint8_t **outp, CBBFunc func) { + static_assert(std::is_invocable_v); + static_assert(std::is_same_v, bool>); + ScopedCBB cbb; + if (!CBB_init(cbb.get(), initial_capacity) || !func(cbb.get())) { + return -1; + } + return CBB_finish_i2d(cbb.get(), outp); +} -#if defined(__cplusplus) -} // extern C -#endif +BSSL_NAMESPACE_END -#endif // OPENSSL_HEADER_BYTESTRING_INTERNAL_H +#endif // OPENSSL_HEADER_CRYPTO_BYTESTRING_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/bytestring/unicode.c b/third_party/boringssl/src/crypto/bytestring/unicode.c deleted file mode 100644 index 6f9467f9..00000000 --- a/third_party/boringssl/src/crypto/bytestring/unicode.c +++ /dev/null @@ -1,155 +0,0 @@ -/* Copyright (c) 2018, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include - -#include "internal.h" - - -static int is_valid_code_point(uint32_t v) { - // References in the following are to Unicode 9.0.0. - if (// The Unicode space runs from zero to 0x10ffff (3.4 D9). - v > 0x10ffff || - // Values 0x...fffe, 0x...ffff, and 0xfdd0-0xfdef are permanently reserved - // (3.4 D14) - (v & 0xfffe) == 0xfffe || - (v >= 0xfdd0 && v <= 0xfdef) || - // Surrogate code points are invalid (3.2 C1). - (v >= 0xd800 && v <= 0xdfff)) { - return 0; - } - return 1; -} - -// BOTTOM_BITS returns a byte with the bottom |n| bits set. -#define BOTTOM_BITS(n) (uint8_t)((1u << (n)) - 1) - -// TOP_BITS returns a byte with the top |n| bits set. -#define TOP_BITS(n) ((uint8_t)~BOTTOM_BITS(8 - (n))) - -int cbs_get_utf8(CBS *cbs, uint32_t *out) { - uint8_t c; - if (!CBS_get_u8(cbs, &c)) { - return 0; - } - if (c <= 0x7f) { - *out = c; - return 1; - } - uint32_t v, lower_bound; - size_t len; - if ((c & TOP_BITS(3)) == TOP_BITS(2)) { - v = c & BOTTOM_BITS(5); - len = 1; - lower_bound = 0x80; - } else if ((c & TOP_BITS(4)) == TOP_BITS(3)) { - v = c & BOTTOM_BITS(4); - len = 2; - lower_bound = 0x800; - } else if ((c & TOP_BITS(5)) == TOP_BITS(4)) { - v = c & BOTTOM_BITS(3); - len = 3; - lower_bound = 0x10000; - } else { - return 0; - } - for (size_t i = 0; i < len; i++) { - if (!CBS_get_u8(cbs, &c) || - (c & TOP_BITS(2)) != TOP_BITS(1)) { - return 0; - } - v <<= 6; - v |= c & BOTTOM_BITS(6); - } - if (!is_valid_code_point(v) || - v < lower_bound) { - return 0; - } - *out = v; - return 1; -} - -int cbs_get_latin1(CBS *cbs, uint32_t *out) { - uint8_t c; - if (!CBS_get_u8(cbs, &c)) { - return 0; - } - *out = c; - return 1; -} - -int 
cbs_get_ucs2_be(CBS *cbs, uint32_t *out) { - // Note UCS-2 (used by BMPString) does not support surrogates. - uint16_t c; - if (!CBS_get_u16(cbs, &c) || - !is_valid_code_point(c)) { - return 0; - } - *out = c; - return 1; -} - -int cbs_get_utf32_be(CBS *cbs, uint32_t *out) { - return CBS_get_u32(cbs, out) && is_valid_code_point(*out); -} - -size_t cbb_get_utf8_len(uint32_t u) { - if (u <= 0x7f) { - return 1; - } - if (u <= 0x7ff) { - return 2; - } - if (u <= 0xffff) { - return 3; - } - return 4; -} - -int cbb_add_utf8(CBB *cbb, uint32_t u) { - if (!is_valid_code_point(u)) { - return 0; - } - if (u <= 0x7f) { - return CBB_add_u8(cbb, (uint8_t)u); - } - if (u <= 0x7ff) { - return CBB_add_u8(cbb, TOP_BITS(2) | (u >> 6)) && - CBB_add_u8(cbb, TOP_BITS(1) | (u & BOTTOM_BITS(6))); - } - if (u <= 0xffff) { - return CBB_add_u8(cbb, TOP_BITS(3) | (u >> 12)) && - CBB_add_u8(cbb, TOP_BITS(1) | ((u >> 6) & BOTTOM_BITS(6))) && - CBB_add_u8(cbb, TOP_BITS(1) | (u & BOTTOM_BITS(6))); - } - if (u <= 0x10ffff) { - return CBB_add_u8(cbb, TOP_BITS(4) | (u >> 18)) && - CBB_add_u8(cbb, TOP_BITS(1) | ((u >> 12) & BOTTOM_BITS(6))) && - CBB_add_u8(cbb, TOP_BITS(1) | ((u >> 6) & BOTTOM_BITS(6))) && - CBB_add_u8(cbb, TOP_BITS(1) | (u & BOTTOM_BITS(6))); - } - return 0; -} - -int cbb_add_latin1(CBB *cbb, uint32_t u) { - return u <= 0xff && CBB_add_u8(cbb, (uint8_t)u); -} - -int cbb_add_ucs2_be(CBB *cbb, uint32_t u) { - return u <= 0xffff && is_valid_code_point(u) && CBB_add_u16(cbb, (uint16_t)u); -} - -int cbb_add_utf32_be(CBB *cbb, uint32_t u) { - return is_valid_code_point(u) && CBB_add_u32(cbb, u); -} diff --git a/third_party/boringssl/src/crypto/bytestring/unicode.cc b/third_party/boringssl/src/crypto/bytestring/unicode.cc new file mode 100644 index 00000000..74f558eb --- /dev/null +++ b/third_party/boringssl/src/crypto/bytestring/unicode.cc @@ -0,0 +1,156 @@ +// Copyright 2018 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use 
this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "internal.h" + + +static int is_valid_code_point(uint32_t v) { + // References in the following are to Unicode 15.0.0. + if (// The Unicode space runs from zero to 0x10ffff (3.4 D9). + v > 0x10ffff || + // Values 0x...fffe, 0x...ffff, and 0xfdd0-0xfdef are permanently reserved + // as noncharacters (3.4 D14). See also 23.7. As our APIs are intended for + // "open interchange", such as ASN.1, we reject them. + (v & 0xfffe) == 0xfffe || + (v >= 0xfdd0 && v <= 0xfdef) || + // Surrogate code points are invalid (3.2 C1). + (v >= 0xd800 && v <= 0xdfff)) { + return 0; + } + return 1; +} + +// BOTTOM_BITS returns a byte with the bottom |n| bits set. +#define BOTTOM_BITS(n) (uint8_t)((1u << (n)) - 1) + +// TOP_BITS returns a byte with the top |n| bits set. 
+#define TOP_BITS(n) ((uint8_t)~BOTTOM_BITS(8 - (n))) + +int CBS_get_utf8(CBS *cbs, uint32_t *out) { + uint8_t c; + if (!CBS_get_u8(cbs, &c)) { + return 0; + } + if (c <= 0x7f) { + *out = c; + return 1; + } + uint32_t v, lower_bound; + size_t len; + if ((c & TOP_BITS(3)) == TOP_BITS(2)) { + v = c & BOTTOM_BITS(5); + len = 1; + lower_bound = 0x80; + } else if ((c & TOP_BITS(4)) == TOP_BITS(3)) { + v = c & BOTTOM_BITS(4); + len = 2; + lower_bound = 0x800; + } else if ((c & TOP_BITS(5)) == TOP_BITS(4)) { + v = c & BOTTOM_BITS(3); + len = 3; + lower_bound = 0x10000; + } else { + return 0; + } + for (size_t i = 0; i < len; i++) { + if (!CBS_get_u8(cbs, &c) || + (c & TOP_BITS(2)) != TOP_BITS(1)) { + return 0; + } + v <<= 6; + v |= c & BOTTOM_BITS(6); + } + if (!is_valid_code_point(v) || + v < lower_bound) { + return 0; + } + *out = v; + return 1; +} + +int CBS_get_latin1(CBS *cbs, uint32_t *out) { + uint8_t c; + if (!CBS_get_u8(cbs, &c)) { + return 0; + } + *out = c; + return 1; +} + +int CBS_get_ucs2_be(CBS *cbs, uint32_t *out) { + // Note UCS-2 (used by BMPString) does not support surrogates. 
+ uint16_t c; + if (!CBS_get_u16(cbs, &c) || + !is_valid_code_point(c)) { + return 0; + } + *out = c; + return 1; +} + +int CBS_get_utf32_be(CBS *cbs, uint32_t *out) { + return CBS_get_u32(cbs, out) && is_valid_code_point(*out); +} + +size_t CBB_get_utf8_len(uint32_t u) { + if (u <= 0x7f) { + return 1; + } + if (u <= 0x7ff) { + return 2; + } + if (u <= 0xffff) { + return 3; + } + return 4; +} + +int CBB_add_utf8(CBB *cbb, uint32_t u) { + if (!is_valid_code_point(u)) { + return 0; + } + if (u <= 0x7f) { + return CBB_add_u8(cbb, (uint8_t)u); + } + if (u <= 0x7ff) { + return CBB_add_u8(cbb, TOP_BITS(2) | (u >> 6)) && + CBB_add_u8(cbb, TOP_BITS(1) | (u & BOTTOM_BITS(6))); + } + if (u <= 0xffff) { + return CBB_add_u8(cbb, TOP_BITS(3) | (u >> 12)) && + CBB_add_u8(cbb, TOP_BITS(1) | ((u >> 6) & BOTTOM_BITS(6))) && + CBB_add_u8(cbb, TOP_BITS(1) | (u & BOTTOM_BITS(6))); + } + if (u <= 0x10ffff) { + return CBB_add_u8(cbb, TOP_BITS(4) | (u >> 18)) && + CBB_add_u8(cbb, TOP_BITS(1) | ((u >> 12) & BOTTOM_BITS(6))) && + CBB_add_u8(cbb, TOP_BITS(1) | ((u >> 6) & BOTTOM_BITS(6))) && + CBB_add_u8(cbb, TOP_BITS(1) | (u & BOTTOM_BITS(6))); + } + return 0; +} + +int CBB_add_latin1(CBB *cbb, uint32_t u) { + return u <= 0xff && CBB_add_u8(cbb, (uint8_t)u); +} + +int CBB_add_ucs2_be(CBB *cbb, uint32_t u) { + return u <= 0xffff && is_valid_code_point(u) && CBB_add_u16(cbb, (uint16_t)u); +} + +int CBB_add_utf32_be(CBB *cbb, uint32_t u) { + return is_valid_code_point(u) && CBB_add_u32(cbb, u); +} diff --git a/third_party/boringssl/src/crypto/chacha/chacha.c b/third_party/boringssl/src/crypto/chacha/chacha.c deleted file mode 100644 index 1092b7aa..00000000 --- a/third_party/boringssl/src/crypto/chacha/chacha.c +++ /dev/null @@ -1,173 +0,0 @@ -/* Copyright (c) 2014, Google Inc. 
- * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -// Adapted from the public domain, estream code by D. Bernstein. - -#include - -#include -#include - -#include "../internal.h" -#include "internal.h" - - -// sigma contains the ChaCha constants, which happen to be an ASCII string. -static const uint8_t sigma[16] = { 'e', 'x', 'p', 'a', 'n', 'd', ' ', '3', - '2', '-', 'b', 'y', 't', 'e', ' ', 'k' }; - -// QUARTERROUND updates a, b, c, d with a ChaCha "quarter" round. 
-#define QUARTERROUND(a, b, c, d) \ - x[a] += x[b]; \ - x[d] = CRYPTO_rotl_u32(x[d] ^ x[a], 16); \ - x[c] += x[d]; \ - x[b] = CRYPTO_rotl_u32(x[b] ^ x[c], 12); \ - x[a] += x[b]; \ - x[d] = CRYPTO_rotl_u32(x[d] ^ x[a], 8); \ - x[c] += x[d]; \ - x[b] = CRYPTO_rotl_u32(x[b] ^ x[c], 7); - -void CRYPTO_hchacha20(uint8_t out[32], const uint8_t key[32], - const uint8_t nonce[16]) { - uint32_t x[16]; - OPENSSL_memcpy(x, sigma, sizeof(sigma)); - OPENSSL_memcpy(&x[4], key, 32); - OPENSSL_memcpy(&x[12], nonce, 16); - - for (size_t i = 0; i < 20; i += 2) { - QUARTERROUND(0, 4, 8, 12) - QUARTERROUND(1, 5, 9, 13) - QUARTERROUND(2, 6, 10, 14) - QUARTERROUND(3, 7, 11, 15) - QUARTERROUND(0, 5, 10, 15) - QUARTERROUND(1, 6, 11, 12) - QUARTERROUND(2, 7, 8, 13) - QUARTERROUND(3, 4, 9, 14) - } - - OPENSSL_memcpy(out, &x[0], sizeof(uint32_t) * 4); - OPENSSL_memcpy(&out[16], &x[12], sizeof(uint32_t) * 4); -} - -#if defined(CHACHA20_ASM) - -void CRYPTO_chacha_20(uint8_t *out, const uint8_t *in, size_t in_len, - const uint8_t key[32], const uint8_t nonce[12], - uint32_t counter) { - assert(!buffers_alias(out, in_len, in, in_len) || in == out); - - uint32_t counter_nonce[4]; - counter_nonce[0] = counter; - counter_nonce[1] = CRYPTO_load_u32_le(nonce + 0); - counter_nonce[2] = CRYPTO_load_u32_le(nonce + 4); - counter_nonce[3] = CRYPTO_load_u32_le(nonce + 8); - - const uint32_t *key_ptr = (const uint32_t *)key; -#if !defined(OPENSSL_X86) && !defined(OPENSSL_X86_64) - // The assembly expects the key to be four-byte aligned. 
- uint32_t key_u32[8]; - if ((((uintptr_t)key) & 3) != 0) { - key_u32[0] = CRYPTO_load_u32_le(key + 0); - key_u32[1] = CRYPTO_load_u32_le(key + 4); - key_u32[2] = CRYPTO_load_u32_le(key + 8); - key_u32[3] = CRYPTO_load_u32_le(key + 12); - key_u32[4] = CRYPTO_load_u32_le(key + 16); - key_u32[5] = CRYPTO_load_u32_le(key + 20); - key_u32[6] = CRYPTO_load_u32_le(key + 24); - key_u32[7] = CRYPTO_load_u32_le(key + 28); - - key_ptr = key_u32; - } -#endif - - ChaCha20_ctr32(out, in, in_len, key_ptr, counter_nonce); -} - -#else - -// chacha_core performs 20 rounds of ChaCha on the input words in -// |input| and writes the 64 output bytes to |output|. -static void chacha_core(uint8_t output[64], const uint32_t input[16]) { - uint32_t x[16]; - int i; - - OPENSSL_memcpy(x, input, sizeof(uint32_t) * 16); - for (i = 20; i > 0; i -= 2) { - QUARTERROUND(0, 4, 8, 12) - QUARTERROUND(1, 5, 9, 13) - QUARTERROUND(2, 6, 10, 14) - QUARTERROUND(3, 7, 11, 15) - QUARTERROUND(0, 5, 10, 15) - QUARTERROUND(1, 6, 11, 12) - QUARTERROUND(2, 7, 8, 13) - QUARTERROUND(3, 4, 9, 14) - } - - for (i = 0; i < 16; ++i) { - x[i] += input[i]; - } - for (i = 0; i < 16; ++i) { - CRYPTO_store_u32_le(output + 4 * i, x[i]); - } -} - -void CRYPTO_chacha_20(uint8_t *out, const uint8_t *in, size_t in_len, - const uint8_t key[32], const uint8_t nonce[12], - uint32_t counter) { - assert(!buffers_alias(out, in_len, in, in_len) || in == out); - - uint32_t input[16]; - uint8_t buf[64]; - size_t todo, i; - - input[0] = CRYPTO_load_u32_le(sigma + 0); - input[1] = CRYPTO_load_u32_le(sigma + 4); - input[2] = CRYPTO_load_u32_le(sigma + 8); - input[3] = CRYPTO_load_u32_le(sigma + 12); - - input[4] = CRYPTO_load_u32_le(key + 0); - input[5] = CRYPTO_load_u32_le(key + 4); - input[6] = CRYPTO_load_u32_le(key + 8); - input[7] = CRYPTO_load_u32_le(key + 12); - - input[8] = CRYPTO_load_u32_le(key + 16); - input[9] = CRYPTO_load_u32_le(key + 20); - input[10] = CRYPTO_load_u32_le(key + 24); - input[11] = CRYPTO_load_u32_le(key + 28); 
- - input[12] = counter; - input[13] = CRYPTO_load_u32_le(nonce + 0); - input[14] = CRYPTO_load_u32_le(nonce + 4); - input[15] = CRYPTO_load_u32_le(nonce + 8); - - while (in_len > 0) { - todo = sizeof(buf); - if (in_len < todo) { - todo = in_len; - } - - chacha_core(buf, input); - for (i = 0; i < todo; i++) { - out[i] = in[i] ^ buf[i]; - } - - out += todo; - in += todo; - in_len -= todo; - - input[12]++; - } -} - -#endif diff --git a/third_party/boringssl/src/crypto/chacha/chacha.cc b/third_party/boringssl/src/crypto/chacha/chacha.cc new file mode 100644 index 00000000..6e444411 --- /dev/null +++ b/third_party/boringssl/src/crypto/chacha/chacha.cc @@ -0,0 +1,226 @@ +// Copyright 2014 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Adapted from the public domain, estream code by D. Bernstein. + +#include + +#include +#include + +#include "../internal.h" +#include "internal.h" + + +// sigma contains the ChaCha constants, which happen to be an ASCII string. +using namespace bssl; + +static const uint8_t sigma[16] = { 'e', 'x', 'p', 'a', 'n', 'd', ' ', '3', + '2', '-', 'b', 'y', 't', 'e', ' ', 'k' }; + +// QUARTERROUND updates a, b, c, d with a ChaCha "quarter" round. 
+#define QUARTERROUND(a, b, c, d) \ + x[a] += x[b]; \ + x[d] = CRYPTO_rotl_u32(x[d] ^ x[a], 16); \ + x[c] += x[d]; \ + x[b] = CRYPTO_rotl_u32(x[b] ^ x[c], 12); \ + x[a] += x[b]; \ + x[d] = CRYPTO_rotl_u32(x[d] ^ x[a], 8); \ + x[c] += x[d]; \ + x[b] = CRYPTO_rotl_u32(x[b] ^ x[c], 7); + +void bssl::CRYPTO_hchacha20(uint8_t out[32], const uint8_t key[32], + const uint8_t nonce[16]) { + uint32_t x[16]; + OPENSSL_memcpy(x, sigma, sizeof(sigma)); + OPENSSL_memcpy(&x[4], key, 32); + OPENSSL_memcpy(&x[12], nonce, 16); + + for (size_t i = 0; i < 20; i += 2) { + QUARTERROUND(0, 4, 8, 12) + QUARTERROUND(1, 5, 9, 13) + QUARTERROUND(2, 6, 10, 14) + QUARTERROUND(3, 7, 11, 15) + QUARTERROUND(0, 5, 10, 15) + QUARTERROUND(1, 6, 11, 12) + QUARTERROUND(2, 7, 8, 13) + QUARTERROUND(3, 4, 9, 14) + } + + OPENSSL_memcpy(out, &x[0], sizeof(uint32_t) * 4); + OPENSSL_memcpy(&out[16], &x[12], sizeof(uint32_t) * 4); +} + +#if defined(CHACHA20_ASM_NOHW) +static void ChaCha20_ctr32(uint8_t *out, const uint8_t *in, size_t in_len, + const uint32_t key[8], const uint32_t counter[4]) { +#if defined(CHACHA20_ASM_NEON) + if (ChaCha20_ctr32_neon_capable(in_len)) { + ChaCha20_ctr32_neon(out, in, in_len, key, counter); + return; + } +#endif +#if defined(CHACHA20_ASM_AVX2) + if (ChaCha20_ctr32_avx2_capable(in_len)) { + ChaCha20_ctr32_avx2(out, in, in_len, key, counter); + return; + } +#endif +#if defined(CHACHA20_ASM_SSSE3_4X) + if (ChaCha20_ctr32_ssse3_4x_capable(in_len)) { + ChaCha20_ctr32_ssse3_4x(out, in, in_len, key, counter); + return; + } +#endif +#if defined(CHACHA20_ASM_SSSE3) + if (ChaCha20_ctr32_ssse3_capable(in_len)) { + ChaCha20_ctr32_ssse3(out, in, in_len, key, counter); + return; + } +#endif + if (in_len > 0) { + ChaCha20_ctr32_nohw(out, in, in_len, key, counter); + } +} +#endif + +#if defined(CHACHA20_ASM_NOHW) + +void CRYPTO_chacha_20(uint8_t *out, const uint8_t *in, size_t in_len, + const uint8_t key[32], const uint8_t nonce[12], + uint32_t counter) { + assert(!buffers_alias(out, in_len, 
in, in_len) || in == out); + + uint32_t counter_nonce[4]; + counter_nonce[0] = counter; + counter_nonce[1] = CRYPTO_load_u32_le(nonce + 0); + counter_nonce[2] = CRYPTO_load_u32_le(nonce + 4); + counter_nonce[3] = CRYPTO_load_u32_le(nonce + 8); + + const uint32_t *key_ptr = (const uint32_t *)key; +#if !defined(OPENSSL_X86) && !defined(OPENSSL_X86_64) + // The assembly expects the key to be four-byte aligned. + uint32_t key_u32[8]; + if ((((uintptr_t)key) & 3) != 0) { + key_u32[0] = CRYPTO_load_u32_le(key + 0); + key_u32[1] = CRYPTO_load_u32_le(key + 4); + key_u32[2] = CRYPTO_load_u32_le(key + 8); + key_u32[3] = CRYPTO_load_u32_le(key + 12); + key_u32[4] = CRYPTO_load_u32_le(key + 16); + key_u32[5] = CRYPTO_load_u32_le(key + 20); + key_u32[6] = CRYPTO_load_u32_le(key + 24); + key_u32[7] = CRYPTO_load_u32_le(key + 28); + + key_ptr = key_u32; + } +#endif + + while (in_len > 0) { + // The assembly functions do not have defined overflow behavior. While + // overflow is almost always a bug in the caller, we prefer our functions to + // behave the same across platforms, so divide into multiple calls to avoid + // this case. + uint64_t todo = 64 * ((UINT64_C(1) << 32) - counter_nonce[0]); + if (todo > in_len) { + todo = in_len; + } + + ChaCha20_ctr32(out, in, (size_t)todo, key_ptr, counter_nonce); + in += todo; + out += todo; + in_len -= todo; + + // We're either done and will next break out of the loop, or we stopped at + // the wraparound point and the counter should continue at zero. + counter_nonce[0] = 0; + } +} + +#else + +// chacha_core performs 20 rounds of ChaCha on the input words in +// |input| and writes the 64 output bytes to |output|. 
+static void chacha_core(uint8_t output[64], const uint32_t input[16]) { + uint32_t x[16]; + int i; + + OPENSSL_memcpy(x, input, sizeof(uint32_t) * 16); + for (i = 20; i > 0; i -= 2) { + QUARTERROUND(0, 4, 8, 12) + QUARTERROUND(1, 5, 9, 13) + QUARTERROUND(2, 6, 10, 14) + QUARTERROUND(3, 7, 11, 15) + QUARTERROUND(0, 5, 10, 15) + QUARTERROUND(1, 6, 11, 12) + QUARTERROUND(2, 7, 8, 13) + QUARTERROUND(3, 4, 9, 14) + } + + for (i = 0; i < 16; ++i) { + x[i] += input[i]; + } + for (i = 0; i < 16; ++i) { + CRYPTO_store_u32_le(output + 4 * i, x[i]); + } +} + +void CRYPTO_chacha_20(uint8_t *out, const uint8_t *in, size_t in_len, + const uint8_t key[32], const uint8_t nonce[12], + uint32_t counter) { + assert(!buffers_alias(out, in_len, in, in_len) || in == out); + + uint32_t input[16]; + uint8_t buf[64]; + size_t todo, i; + + input[0] = CRYPTO_load_u32_le(sigma + 0); + input[1] = CRYPTO_load_u32_le(sigma + 4); + input[2] = CRYPTO_load_u32_le(sigma + 8); + input[3] = CRYPTO_load_u32_le(sigma + 12); + + input[4] = CRYPTO_load_u32_le(key + 0); + input[5] = CRYPTO_load_u32_le(key + 4); + input[6] = CRYPTO_load_u32_le(key + 8); + input[7] = CRYPTO_load_u32_le(key + 12); + + input[8] = CRYPTO_load_u32_le(key + 16); + input[9] = CRYPTO_load_u32_le(key + 20); + input[10] = CRYPTO_load_u32_le(key + 24); + input[11] = CRYPTO_load_u32_le(key + 28); + + input[12] = counter; + input[13] = CRYPTO_load_u32_le(nonce + 0); + input[14] = CRYPTO_load_u32_le(nonce + 4); + input[15] = CRYPTO_load_u32_le(nonce + 8); + + while (in_len > 0) { + todo = sizeof(buf); + if (in_len < todo) { + todo = in_len; + } + + chacha_core(buf, input); + for (i = 0; i < todo; i++) { + out[i] = in[i] ^ buf[i]; + } + + out += todo; + in += todo; + in_len -= todo; + + input[12]++; + } +} + +#endif diff --git a/third_party/boringssl/src/crypto/chacha/internal.h b/third_party/boringssl/src/crypto/chacha/internal.h index 1435e3b0..a758055d 100644 --- a/third_party/boringssl/src/crypto/chacha/internal.h +++ 
b/third_party/boringssl/src/crypto/chacha/internal.h @@ -1,45 +1,103 @@ -/* Copyright (c) 2018, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#ifndef OPENSSL_HEADER_CHACHA_INTERNAL -#define OPENSSL_HEADER_CHACHA_INTERNAL +// Copyright 2018 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_CRYPTO_CHACHA_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_CHACHA_INTERNAL_H #include -#if defined(__cplusplus) -extern "C" { -#endif +#include "../internal.h" + +BSSL_NAMESPACE_BEGIN // CRYPTO_hchacha20 computes the HChaCha20 function, which should only be used // as part of XChaCha20. 
void CRYPTO_hchacha20(uint8_t out[32], const uint8_t key[32], const uint8_t nonce[16]); -#if !defined(OPENSSL_NO_ASM) && \ - (defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || \ - defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) -#define CHACHA20_ASM +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) -// ChaCha20_ctr32 is defined in asm/chacha-*.pl. -void ChaCha20_ctr32(uint8_t *out, const uint8_t *in, size_t in_len, - const uint32_t key[8], const uint32_t counter[4]); -#endif +#define CHACHA20_ASM_NOHW + +#define CHACHA20_ASM_SSSE3 +inline int ChaCha20_ctr32_ssse3_capable(size_t len) { + // Unlike the x86_64 version, the x86 SSSE3 routine runs for all non-zero + // lengths. + return len > 0 && CRYPTO_is_SSSE3_capable(); +} +extern "C" void ChaCha20_ctr32_ssse3(uint8_t *out, const uint8_t *in, + size_t in_len, const uint32_t key[8], + const uint32_t counter[4]); + +#elif !defined(OPENSSL_NO_ASM) && \ + (defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) +#define CHACHA20_ASM_NOHW -#if defined(__cplusplus) -} // extern C +#define CHACHA20_ASM_NEON +inline int ChaCha20_ctr32_neon_capable(size_t len) { + return len >= 192 && CRYPTO_is_NEON_capable(); +} +extern "C" void ChaCha20_ctr32_neon(uint8_t *out, const uint8_t *in, + size_t in_len, const uint32_t key[8], + const uint32_t counter[4]); +#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) +#define CHACHA20_ASM_NOHW + +#define CHACHA20_ASM_AVX2 +inline int ChaCha20_ctr32_avx2_capable(size_t len) { + return len > 128 && CRYPTO_is_AVX2_capable(); +} +extern "C" void ChaCha20_ctr32_avx2(uint8_t *out, const uint8_t *in, + size_t in_len, const uint32_t key[8], + const uint32_t counter[4]); + +#define CHACHA20_ASM_SSSE3_4X +inline int ChaCha20_ctr32_ssse3_4x_capable(size_t len) { + int capable = len > 128 && CRYPTO_is_SSSE3_capable(); + int faster = len > 192 || !CRYPTO_cpu_perf_is_like_silvermont(); + return capable && faster; +} +extern "C" void ChaCha20_ctr32_ssse3_4x(uint8_t *out, const uint8_t *in, + 
size_t in_len, const uint32_t key[8], + const uint32_t counter[4]); + +#define CHACHA20_ASM_SSSE3 +inline int ChaCha20_ctr32_ssse3_capable(size_t len) { + return len > 128 && CRYPTO_is_SSSE3_capable(); +} +extern "C" void ChaCha20_ctr32_ssse3(uint8_t *out, const uint8_t *in, + size_t in_len, const uint32_t key[8], + const uint32_t counter[4]); #endif -#endif // OPENSSL_HEADER_CHACHA_INTERNAL +#if defined(CHACHA20_ASM_NOHW) +// ChaCha20_ctr32_nohw encrypts |in_len| bytes from |in| and writes the result +// to |out|. If |in| and |out| alias, they must be equal. |in_len| may not be +// zero. +// +// |counter[0]| is the initial 32-bit block counter, and the remainder is the +// 96-bit nonce. If the counter overflows, the output is undefined. The function +// will produce output, but the output may vary by machine and may not be +// self-consistent. (On some architectures, the assembly implements a mix of +// 64-bit and 32-bit counters.) +extern "C" void ChaCha20_ctr32_nohw(uint8_t *out, const uint8_t *in, + size_t in_len, const uint32_t key[8], + const uint32_t counter[4]); +#endif + +BSSL_NAMESPACE_END + +#endif // OPENSSL_HEADER_CRYPTO_CHACHA_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/cipher/derive_key.cc b/third_party/boringssl/src/crypto/cipher/derive_key.cc new file mode 100644 index 00000000..4dc493ed --- /dev/null +++ b/third_party/boringssl/src/crypto/cipher/derive_key.cc @@ -0,0 +1,106 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include +#include + + +int EVP_BytesToKey(const EVP_CIPHER *type, const EVP_MD *md, + const uint8_t salt[8], const uint8_t *data, size_t data_len, + unsigned count, uint8_t *key, uint8_t *iv) { + uint8_t md_buf[EVP_MAX_MD_SIZE]; + unsigned addmd = 0; + unsigned mds = 0, i; + int rv = 0; + + unsigned nkey = EVP_CIPHER_key_length(type); + unsigned niv = EVP_CIPHER_iv_length(type); + + assert(nkey <= EVP_MAX_KEY_LENGTH); + assert(niv <= EVP_MAX_IV_LENGTH); + + if (data == nullptr) { + return nkey; + } + + bssl::ScopedEVP_MD_CTX c; + for (;;) { + if (!EVP_DigestInit_ex(c.get(), md, nullptr)) { + goto err; + } + if (addmd++) { + if (!EVP_DigestUpdate(c.get(), md_buf, mds)) { + goto err; + } + } + if (!EVP_DigestUpdate(c.get(), data, data_len)) { + goto err; + } + if (salt != nullptr) { + if (!EVP_DigestUpdate(c.get(), salt, 8)) { + goto err; + } + } + if (!EVP_DigestFinal_ex(c.get(), md_buf, &mds)) { + goto err; + } + + for (i = 1; i < count; i++) { + if (!EVP_DigestInit_ex(c.get(), md, nullptr) || + !EVP_DigestUpdate(c.get(), md_buf, mds) || + !EVP_DigestFinal_ex(c.get(), md_buf, &mds)) { + goto err; + } + } + + i = 0; + if (nkey) { + for (;;) { + if (nkey == 0 || i == mds) { + break; + } + if (key != nullptr) { + *(key++) = md_buf[i]; + } + nkey--; + i++; + } + } + + if (niv && i != mds) { + for (;;) { + if (niv == 0 || i == mds) { + break; + } + if (iv != nullptr) { + *(iv++) = md_buf[i]; + } + niv--; + i++; + } + } + if (nkey == 0 && niv == 0) { + break; + } + } + rv = EVP_CIPHER_key_length(type); + +err: + OPENSSL_cleanse(md_buf, EVP_MAX_MD_SIZE); + return rv; +} diff --git a/third_party/boringssl/src/crypto/cipher/e_aesctrhmac.cc b/third_party/boringssl/src/crypto/cipher/e_aesctrhmac.cc new file mode 100644 index 00000000..019d9e60 --- /dev/null +++ b/third_party/boringssl/src/crypto/cipher/e_aesctrhmac.cc @@ -0,0 +1,287 @@ +// 
Copyright 2017 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include +#include +#include +#include +#include + +#include "../fipsmodule/aes/internal.h" +#include "../fipsmodule/cipher/internal.h" + + +using namespace bssl; + +#define EVP_AEAD_AES_CTR_HMAC_SHA256_TAG_LEN SHA256_DIGEST_LENGTH +#define EVP_AEAD_AES_CTR_HMAC_SHA256_NONCE_LEN 12 + +struct aead_aes_ctr_hmac_sha256_ctx { + union { + double align; + AES_KEY ks; + } ks; + ctr128_f ctr; + block128_f block; + SHA256_CTX inner_init_state; + SHA256_CTX outer_init_state; +}; + +static_assert(sizeof(((EVP_AEAD_CTX *)nullptr)->state) >= + sizeof(struct aead_aes_ctr_hmac_sha256_ctx), + "AEAD state is too small"); +static_assert(alignof(union evp_aead_ctx_st_state) >= + alignof(struct aead_aes_ctr_hmac_sha256_ctx), + "AEAD state has insufficient alignment"); + +static void hmac_init(SHA256_CTX *out_inner, SHA256_CTX *out_outer, + const uint8_t hmac_key[32]) { + static const size_t hmac_key_len = 32; + uint8_t block[SHA256_CBLOCK]; + OPENSSL_memcpy(block, hmac_key, hmac_key_len); + OPENSSL_memset(block + hmac_key_len, 0x36, sizeof(block) - hmac_key_len); + + unsigned i; + for (i = 0; i < hmac_key_len; i++) { + block[i] ^= 0x36; + } + + SHA256_Init(out_inner); + SHA256_Update(out_inner, block, sizeof(block)); + + OPENSSL_memset(block + hmac_key_len, 0x5c, sizeof(block) - hmac_key_len); + for (i = 0; i < hmac_key_len; i++) { + block[i] ^= (0x36 ^ 0x5c); + } + 
+ SHA256_Init(out_outer); + SHA256_Update(out_outer, block, sizeof(block)); +} + +static int aead_aes_ctr_hmac_sha256_init(EVP_AEAD_CTX *ctx, const uint8_t *key, + size_t key_len, size_t tag_len) { + struct aead_aes_ctr_hmac_sha256_ctx *aes_ctx = + (struct aead_aes_ctr_hmac_sha256_ctx *)&ctx->state; + static const size_t hmac_key_len = 32; + + if (key_len < hmac_key_len) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_KEY_LENGTH); + return 0; // EVP_AEAD_CTX_init should catch this. + } + + const size_t aes_key_len = key_len - hmac_key_len; + if (aes_key_len != 16 && aes_key_len != 32) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_KEY_LENGTH); + return 0; // EVP_AEAD_CTX_init should catch this. + } + + if (tag_len == EVP_AEAD_DEFAULT_TAG_LENGTH) { + tag_len = EVP_AEAD_AES_CTR_HMAC_SHA256_TAG_LEN; + } + + if (tag_len > EVP_AEAD_AES_CTR_HMAC_SHA256_TAG_LEN) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TAG_TOO_LARGE); + return 0; + } + + aes_ctx->ctr = aes_ctr_set_key(&aes_ctx->ks.ks, nullptr, &aes_ctx->block, key, + aes_key_len); + ctx->tag_len = tag_len; + hmac_init(&aes_ctx->inner_init_state, &aes_ctx->outer_init_state, + key + aes_key_len); + + return 1; +} + +static void aead_aes_ctr_hmac_sha256_cleanup(EVP_AEAD_CTX *ctx) {} + +static void hmac_update_uint64(SHA256_CTX *sha256, uint64_t value) { + unsigned i; + uint8_t bytes[8]; + + for (i = 0; i < sizeof(bytes); i++) { + bytes[i] = value & 0xff; + value >>= 8; + } + SHA256_Update(sha256, bytes, sizeof(bytes)); +} + +static void hmac_calculate(uint8_t out[SHA256_DIGEST_LENGTH], + const SHA256_CTX *inner_init_state, + const SHA256_CTX *outer_init_state, + Span aadvecs, + const uint8_t *nonce, + Span iovecs, bool encrypt) { + size_t ad_len = bssl::iovec::TotalLength(aadvecs); + SHA256_CTX sha256; + OPENSSL_memcpy(&sha256, inner_init_state, sizeof(sha256)); + hmac_update_uint64(&sha256, ad_len); + hmac_update_uint64(&sha256, bssl::iovec::TotalLength(iovecs)); + SHA256_Update(&sha256, nonce, EVP_AEAD_AES_CTR_HMAC_SHA256_NONCE_LEN); 
+ for (const CRYPTO_IVEC &aadvec : aadvecs) { + SHA256_Update(&sha256, aadvec.in, aadvec.len); + } + + // Pad with zeros to the end of the SHA-256 block. + const unsigned num_padding = + (SHA256_CBLOCK - ((sizeof(uint64_t) * 2 + + EVP_AEAD_AES_CTR_HMAC_SHA256_NONCE_LEN + ad_len) % + SHA256_CBLOCK)) % + SHA256_CBLOCK; + uint8_t padding[SHA256_CBLOCK]; + OPENSSL_memset(padding, 0, num_padding); + SHA256_Update(&sha256, padding, num_padding); + + for (const CRYPTO_IOVEC &iovec : iovecs) { + SHA256_Update(&sha256, encrypt ? iovec.out : iovec.in, iovec.len); + } + + uint8_t inner_digest[SHA256_DIGEST_LENGTH]; + SHA256_Final(inner_digest, &sha256); + + OPENSSL_memcpy(&sha256, outer_init_state, sizeof(sha256)); + SHA256_Update(&sha256, inner_digest, sizeof(inner_digest)); + SHA256_Final(out, &sha256); +} + +static void aead_aes_ctr_hmac_sha256_crypt( + const struct aead_aes_ctr_hmac_sha256_ctx *aes_ctx, + Span iovecs, const uint8_t *nonce) { + uint8_t partial_block_buffer[AES_BLOCK_SIZE]; + unsigned partial_block_offset = 0; + OPENSSL_memset(partial_block_buffer, 0, sizeof(partial_block_buffer)); + + uint8_t counter[AES_BLOCK_SIZE]; + OPENSSL_memcpy(counter, nonce, EVP_AEAD_AES_CTR_HMAC_SHA256_NONCE_LEN); + OPENSSL_memset(counter + EVP_AEAD_AES_CTR_HMAC_SHA256_NONCE_LEN, 0, 4); + + for (const CRYPTO_IOVEC &iovec : iovecs) { + CRYPTO_ctr128_encrypt_ctr32(iovec.in, iovec.out, iovec.len, &aes_ctx->ks.ks, + counter, partial_block_buffer, + &partial_block_offset, aes_ctx->ctr); + } +} + +static int aead_aes_ctr_hmac_sha256_sealv(const EVP_AEAD_CTX *ctx, + Span iovecs, + Span out_tag, + size_t *out_tag_len, + Span nonce, + Span aadvecs) { + const struct aead_aes_ctr_hmac_sha256_ctx *aes_ctx = + (struct aead_aes_ctr_hmac_sha256_ctx *)&ctx->state; + const uint64_t in_len_64 = bssl::iovec::TotalLength(iovecs); + + if (in_len_64 >= (UINT64_C(1) << 32) * AES_BLOCK_SIZE) { + // This input is so large it would overflow the 32-bit block counter. 
+ OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TOO_LARGE); + return 0; + } + + if (out_tag.size() < ctx->tag_len) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BUFFER_TOO_SMALL); + return 0; + } + + if (nonce.size() != EVP_AEAD_AES_CTR_HMAC_SHA256_NONCE_LEN) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE); + return 0; + } + + aead_aes_ctr_hmac_sha256_crypt(aes_ctx, iovecs, nonce.data()); + + uint8_t hmac_result[SHA256_DIGEST_LENGTH]; + hmac_calculate(hmac_result, &aes_ctx->inner_init_state, + &aes_ctx->outer_init_state, aadvecs, nonce.data(), iovecs, + /*encrypt=*/true); + CopyToPrefix(Span(hmac_result).first(ctx->tag_len), out_tag); + *out_tag_len = ctx->tag_len; + + return 1; +} + +static int aead_aes_ctr_hmac_sha256_openv_detached( + const EVP_AEAD_CTX *ctx, Span iovecs, + Span nonce, Span in_tag, + Span aadvecs) { + const struct aead_aes_ctr_hmac_sha256_ctx *aes_ctx = + (struct aead_aes_ctr_hmac_sha256_ctx *)&ctx->state; + + if (in_tag.size() != ctx->tag_len) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); + return 0; + } + + if (nonce.size() != EVP_AEAD_AES_CTR_HMAC_SHA256_NONCE_LEN) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE); + return 0; + } + + uint8_t hmac_result[SHA256_DIGEST_LENGTH]; + hmac_calculate(hmac_result, &aes_ctx->inner_init_state, + &aes_ctx->outer_init_state, aadvecs, nonce.data(), iovecs, + /*encrypt=*/false); + if (CRYPTO_memcmp(hmac_result, in_tag.data(), ctx->tag_len) != 0) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); + return 0; + } + + aead_aes_ctr_hmac_sha256_crypt(aes_ctx, iovecs, nonce.data()); + + return 1; +} + +static const EVP_AEAD aead_aes_128_ctr_hmac_sha256 = { + 16 /* AES key */ + 32 /* HMAC key */, + 12, // nonce length + EVP_AEAD_AES_CTR_HMAC_SHA256_TAG_LEN, // overhead + EVP_AEAD_AES_CTR_HMAC_SHA256_TAG_LEN, // max tag length + + aead_aes_ctr_hmac_sha256_init, + nullptr /* init_with_direction */, + aead_aes_ctr_hmac_sha256_cleanup, + nullptr /* openv */, + aead_aes_ctr_hmac_sha256_sealv, + 
aead_aes_ctr_hmac_sha256_openv_detached, + nullptr /* get_iv */, + nullptr /* tag_len */, +}; + +static const EVP_AEAD aead_aes_256_ctr_hmac_sha256 = { + 32 /* AES key */ + 32 /* HMAC key */, + 12, // nonce length + EVP_AEAD_AES_CTR_HMAC_SHA256_TAG_LEN, // overhead + EVP_AEAD_AES_CTR_HMAC_SHA256_TAG_LEN, // max tag length + + aead_aes_ctr_hmac_sha256_init, + nullptr /* init_with_direction */, + aead_aes_ctr_hmac_sha256_cleanup, + nullptr /* openv */, + aead_aes_ctr_hmac_sha256_sealv, + aead_aes_ctr_hmac_sha256_openv_detached, + nullptr /* get_iv */, + nullptr /* tag_len */, +}; + +const EVP_AEAD *EVP_aead_aes_128_ctr_hmac_sha256() { + return &aead_aes_128_ctr_hmac_sha256; +} + +const EVP_AEAD *EVP_aead_aes_256_ctr_hmac_sha256() { + return &aead_aes_256_ctr_hmac_sha256; +} diff --git a/third_party/boringssl/src/crypto/cipher/e_aeseax.cc b/third_party/boringssl/src/crypto/cipher/e_aeseax.cc new file mode 100644 index 00000000..f85da3d6 --- /dev/null +++ b/third_party/boringssl/src/crypto/cipher/e_aeseax.cc @@ -0,0 +1,353 @@ +// Copyright 2025 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "../fipsmodule/cipher/internal.h" +#include "../internal.h" + + +using namespace bssl; + +// Implementation of AES-EAX defined in +// https://www.iacr.org/archive/fse2004/30170391/30170391.pdf. 
+ +#define EVP_AEAD_AES_EAX_TAG_LEN AES_BLOCK_SIZE + +struct aead_aes_eax_ctx { + union { + double align; + AES_KEY ks; + } ks; + uint8_t b[AES_BLOCK_SIZE]; + uint8_t p[AES_BLOCK_SIZE]; +}; + +static void mult_by_X(uint8_t out[AES_BLOCK_SIZE], + const uint8_t in[AES_BLOCK_SIZE]) { + const crypto_word_t in_hi = CRYPTO_load_word_be(in); + for (size_t i = 0; i < AES_BLOCK_SIZE - 1; ++i) { + out[i] = (in[i] << 1) | (in[i + 1] >> 7); + } + // Carry over 0x87 if msb is 1, 0x00 if msb is 0. + out[AES_BLOCK_SIZE - 1] = in[AES_BLOCK_SIZE - 1] << 1; + const uint8_t p = 0x87; + constant_time_conditional_memxor(out + AES_BLOCK_SIZE - 1, &p, /*n=*/1, + constant_time_msb_w(in_hi)); +} + +static int aead_aes_eax_init(EVP_AEAD_CTX *ctx, const uint8_t *key, + size_t key_len, size_t tag_len) { + struct aead_aes_eax_ctx *aes_ctx = (struct aead_aes_eax_ctx *)&ctx->state; + + if (key_len != 16 && key_len != 32) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_KEY_LENGTH); + return 0; + } + + if (tag_len == EVP_AEAD_DEFAULT_TAG_LENGTH) { + tag_len = EVP_AEAD_AES_EAX_TAG_LEN; + } + + if (tag_len != EVP_AEAD_AES_EAX_TAG_LEN) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_TAG_SIZE); + return 0; + } + + if (AES_set_encrypt_key(key, /*bits=*/key_len * 8, &aes_ctx->ks.ks) != 0) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_AES_KEY_SETUP_FAILED); + return 0; + } + ctx->tag_len = tag_len; + + // L <- Ek(0^n). + OPENSSL_memset(aes_ctx->b, 0, sizeof(aes_ctx->b)); + AES_encrypt(aes_ctx->b, aes_ctx->b, &aes_ctx->ks.ks); + // B <- 2L. + mult_by_X(aes_ctx->b, aes_ctx->b); + // P <- 4L = 2B. + mult_by_X(aes_ctx->p, aes_ctx->b); + return 1; +} + +static void aead_aes_eax_cleanup(EVP_AEAD_CTX *ctx) {} + +// Implements the CBK function in the paper. +static void cbk_block(const struct aead_aes_eax_ctx *aes_ctx, + const uint8_t in[AES_BLOCK_SIZE], + uint8_t out[AES_BLOCK_SIZE]) { + CRYPTO_xor16(out, in, out); + AES_encrypt(out, out, &aes_ctx->ks.ks); +} + +// Precondition: in_len <= AES_BLOCK_SIZE. 
+static void pad(const struct aead_aes_eax_ctx *aes_ctx, + uint8_t out[AES_BLOCK_SIZE], const uint8_t *in, size_t in_len) { + assert(in_len <= AES_BLOCK_SIZE); + if (in_len == AES_BLOCK_SIZE) { + CRYPTO_xor16(out, aes_ctx->b, in); + return; + } + OPENSSL_memset(out, 0, AES_BLOCK_SIZE); + OPENSSL_memcpy(out, in, in_len); + out[in_len] = 0x80; + CRYPTO_xor16(out, aes_ctx->p, out); +} + +template +static void omac_with_tag(const struct aead_aes_eax_ctx *aes_ctx, + uint8_t out[AES_BLOCK_SIZE], Span ivecs, + uint8_t tag) { + OPENSSL_memset(out, 0, AES_BLOCK_SIZE); + out[AES_BLOCK_SIZE - 1] = tag; + size_t in_len = bssl::iovec::TotalLength(ivecs); + if (in_len == 0) { + // CBK(pad(M;B,P)) = CBK(B). Avoiding padding to skip a copy. + cbk_block(aes_ctx, aes_ctx->b, out); + return; + } + // CBK(M1) = Ek(M1 ^ 0^n) + AES_encrypt(out, out, &aes_ctx->ks.ks); + bssl::iovec::ForEachBlockRange( + ivecs, + [&](const uint8_t *in, size_t len) { + while (len >= AES_BLOCK_SIZE) { + // Full blocks, no padding needed. + cbk_block(aes_ctx, in, out); + in += AES_BLOCK_SIZE; + len -= AES_BLOCK_SIZE; + } + BSSL_CHECK(len == 0); + return true; + }, + [&](const uint8_t *in, size_t len) { + // Remaining blocks. + while (len > AES_BLOCK_SIZE) { + // Full blocks, no padding needed. + cbk_block(aes_ctx, in, out); + in += AES_BLOCK_SIZE; + len -= AES_BLOCK_SIZE; + } + // Last partial block. + uint8_t padded_block[AES_BLOCK_SIZE]; + pad(aes_ctx, padded_block, in, len); + cbk_block(aes_ctx, padded_block, out); + return true; + }); +} + +static void omac_with_tag_iovec_out(const struct aead_aes_eax_ctx *aes_ctx, + uint8_t out[AES_BLOCK_SIZE], + Span iovecs, + uint8_t tag) { + OPENSSL_memset(out, 0, AES_BLOCK_SIZE); + out[AES_BLOCK_SIZE - 1] = tag; + size_t in_len = bssl::iovec::TotalLength(iovecs); + if (in_len == 0) { + // CBK(pad(M;B,P)) = CBK(B). Avoiding padding to skip a copy. 
+ cbk_block(aes_ctx, aes_ctx->b, out); + return; + } + // CBK(M1) = Ek(M1 ^ 0^n) + AES_encrypt(out, out, &aes_ctx->ks.ks); + bssl::iovec::ForEachOutBlockRange( + iovecs, + [&](const uint8_t *in, size_t len) { + while (len >= AES_BLOCK_SIZE) { + // Full blocks, no padding needed. + cbk_block(aes_ctx, in, out); + in += AES_BLOCK_SIZE; + len -= AES_BLOCK_SIZE; + } + BSSL_CHECK(len == 0); + return true; + }, + [&](const uint8_t *in, size_t len) { + while (len > AES_BLOCK_SIZE) { + // Full blocks, no padding needed. + cbk_block(aes_ctx, in, out); + in += AES_BLOCK_SIZE; + len -= AES_BLOCK_SIZE; + } + // Last partial block. + uint8_t padded_block[AES_BLOCK_SIZE]; + pad(aes_ctx, padded_block, in, len); + cbk_block(aes_ctx, padded_block, out); + return true; + }); +} + +// Encrypts/decrypts |in_len| bytes from |in| to |out| using AES-CTR with |n| as +// the IV. +static void aes_ctr(const struct aead_aes_eax_ctx *aes_ctx, + Span iovecs, + const uint8_t n[AES_BLOCK_SIZE]) { + uint8_t ivec[AES_BLOCK_SIZE]; + OPENSSL_memcpy(ivec, n, AES_BLOCK_SIZE); + + uint8_t ecount_buf[AES_BLOCK_SIZE]; + unsigned int num = 0; + + for (const CRYPTO_IOVEC &iovec : iovecs) { + AES_ctr128_encrypt(iovec.in, iovec.out, iovec.len, &aes_ctx->ks.ks, ivec, + ecount_buf, &num); + } +} + +static int aead_aes_eax_sealv(const EVP_AEAD_CTX *ctx, + Span iovecs, + Span out_tag, size_t *out_tag_len, + Span nonce, + Span aadvecs) { + // We use the full 128 bits of the nonce as counter, so no need to check the + // plaintext size. 
+ + if (out_tag.size() < ctx->tag_len) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BUFFER_TOO_SMALL); + return 0; + } + + if (nonce.size() != 12 && nonce.size() != 16) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE); + return 0; + } + + const struct aead_aes_eax_ctx *aes_ctx = + (struct aead_aes_eax_ctx *)&ctx->state; + + // N <- OMAC(0 || nonce) + uint8_t n[AES_BLOCK_SIZE]; + CRYPTO_IVEC noncevec[1]; + noncevec[0].in = nonce.data(); + noncevec[0].len = nonce.size(); + omac_with_tag(aes_ctx, n, Span(noncevec), /*tag=*/0); + // H <- OMAC(1 || ad) + uint8_t h[AES_BLOCK_SIZE]; + omac_with_tag(aes_ctx, h, aadvecs, /*tag=*/1); + + // C <- CTR^{N}_{K}(M) + aes_ctr(aes_ctx, iovecs, n); + + // MAC <- OMAC(2 || C) + omac_with_tag_iovec_out(aes_ctx, out_tag.data(), iovecs, /*tag=*/2); + // MAC <- N ^ C ^ H + CRYPTO_xor16(out_tag.data(), n, out_tag.data()); + CRYPTO_xor16(out_tag.data(), h, out_tag.data()); + + *out_tag_len = ctx->tag_len; + return 1; +} + +static int aead_aes_eax_openv_detached(const EVP_AEAD_CTX *ctx, + Span iovecs, + Span nonce, + Span in_tag, + Span aadvecs) { + const uint64_t ad_len_64 = bssl::iovec::TotalLength(aadvecs); + if (ad_len_64 >= (UINT64_C(1) << 61)) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TOO_LARGE); + return 0; + } + + const uint64_t in_len_64 = bssl::iovec::TotalLength(iovecs); + if (in_tag.size() != EVP_AEAD_AES_EAX_TAG_LEN || + in_len_64 > (UINT64_C(1) << 36) + AES_BLOCK_SIZE) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); + return 0; + } + + if (nonce.size() != 12 && nonce.size() != 16) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE); + return 0; + } + + const struct aead_aes_eax_ctx *aes_ctx = + (struct aead_aes_eax_ctx *)&ctx->state; + + // N <- OMAC(0 || nonce) + uint8_t n[AES_BLOCK_SIZE]; + CRYPTO_IVEC noncevec[1]; + noncevec[0].in = nonce.data(); + noncevec[0].len = nonce.size(); + omac_with_tag(aes_ctx, n, Span(noncevec), + /*tag=*/0); + // H <- OMAC(1 || ad) + uint8_t h[AES_BLOCK_SIZE]; + 
omac_with_tag(aes_ctx, h, aadvecs, /*tag=*/1); + + // MAC <- OMAC(2 || C) + uint8_t mac[AES_BLOCK_SIZE]; + omac_with_tag(aes_ctx, mac, iovecs, /*tag=*/2); + // MAC <- N ^ C ^ H + CRYPTO_xor16(mac, n, mac); + CRYPTO_xor16(mac, h, mac); + + if (CRYPTO_memcmp(mac, in_tag.data(), in_tag.size()) != 0) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); + return 0; + } + + // M <- CTR^{N}_{K}(C) + aes_ctr(aes_ctx, iovecs, n); + return 1; +} + +static const EVP_AEAD aead_aes_128_eax = { + 16, // AES key size + 16, // nonce length + EVP_AEAD_AES_EAX_TAG_LEN, // overhead + EVP_AEAD_AES_EAX_TAG_LEN, // max tag length + + aead_aes_eax_init, + nullptr, // init_with_direction + aead_aes_eax_cleanup, + nullptr, // openv + aead_aes_eax_sealv, + aead_aes_eax_openv_detached, + nullptr, // get_iv + nullptr, // tag_len +}; + +static const EVP_AEAD aead_aes_256_eax = { + 32, // AES key size + 16, // nonce length + EVP_AEAD_AES_EAX_TAG_LEN, // overhead + EVP_AEAD_AES_EAX_TAG_LEN, // max tag length + + aead_aes_eax_init, + nullptr, // init_with_direction + aead_aes_eax_cleanup, + nullptr, // openv + aead_aes_eax_sealv, + aead_aes_eax_openv_detached, + nullptr, // get_iv + nullptr, // tag_len +}; + +const EVP_AEAD *EVP_aead_aes_128_eax() { return &aead_aes_128_eax; } + +const EVP_AEAD *EVP_aead_aes_256_eax() { return &aead_aes_256_eax; } diff --git a/third_party/boringssl/src/crypto/cipher/e_aesgcmsiv.cc b/third_party/boringssl/src/crypto/cipher/e_aesgcmsiv.cc new file mode 100644 index 00000000..cc80306b --- /dev/null +++ b/third_party/boringssl/src/crypto/cipher/e_aesgcmsiv.cc @@ -0,0 +1,1061 @@ +// Copyright 2017 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include +#include +#include +#include + +#include "../fipsmodule/aes/internal.h" +#include "../fipsmodule/cipher/internal.h" +#include "../internal.h" + + +using namespace bssl; + +#define EVP_AEAD_AES_GCM_SIV_NONCE_LEN 12 +#define EVP_AEAD_AES_GCM_SIV_TAG_LEN 16 + +namespace { +void inc_counter(uint8_t tag[16], uint32_t by) { + CRYPTO_store_u32_le(tag, CRYPTO_load_u32_le(tag) + by); +} + +// TODO(davidben): AES-GCM-SIV assembly is not correct for Windows. It must save +// and restore xmm6 through xmm15. +#if defined(OPENSSL_X86_64) && !defined(OPENSSL_NO_ASM) && \ + !defined(OPENSSL_WINDOWS) +#define AES_GCM_SIV_ASM + +// Optimised AES-GCM-SIV + +struct aead_aes_gcm_siv_asm_ctx { + alignas(16) uint8_t key[16 * 15]; + int is_128_bit; +}; + +// The assembly code assumes 8-byte alignment of the EVP_AEAD_CTX's state, and +// aligns to 16 bytes itself. +static_assert(sizeof(((EVP_AEAD_CTX *)nullptr)->state) + 8 >= + sizeof(struct aead_aes_gcm_siv_asm_ctx), + "AEAD state is too small"); +static_assert(alignof(union evp_aead_ctx_st_state) >= 8, + "AEAD state has insufficient alignment"); + +extern "C" { +// aes128gcmsiv_aes_ks writes an AES-128 key schedule for |key| to +// |out_expanded_key|. +extern void aes128gcmsiv_aes_ks(const uint8_t key[16], + uint8_t out_expanded_key[16 * 15]); + +// aes256gcmsiv_aes_ks writes an AES-256 key schedule for |key| to +// |out_expanded_key|. 
+extern void aes256gcmsiv_aes_ks(const uint8_t key[32], + uint8_t out_expanded_key[16 * 15]); + +// aesgcmsiv_polyval_horner updates the POLYVAL value in |in_out_poly| to +// include a number (|in_blocks|) of 16-byte blocks of data from |in|, given +// the POLYVAL key in |key|. +extern void aesgcmsiv_polyval_horner(const uint8_t in_out_poly[16], + const uint8_t key[16], const uint8_t *in, + size_t in_blocks); + +// aesgcmsiv_htable_init writes powers 1..8 of |auth_key| to |out_htable|. +extern void aesgcmsiv_htable_init(uint8_t out_htable[16 * 8], + const uint8_t auth_key[16]); + +// aesgcmsiv_htable6_init writes powers 1..6 of |auth_key| to |out_htable|. +extern void aesgcmsiv_htable6_init(uint8_t out_htable[16 * 6], + const uint8_t auth_key[16]); + +// aesgcmsiv_htable_polyval updates the POLYVAL value in |in_out_poly| to +// include |in_len| bytes of data from |in|. (Where |in_len| must be a multiple +// of 16.) It uses the precomputed powers of the key given in |htable|. +extern void aesgcmsiv_htable_polyval(const uint8_t htable[16 * 8], + const uint8_t *in, size_t in_len, + uint8_t in_out_poly[16]); + +// aes128gcmsiv_dec decrypts |in_len| & ~15 bytes from |in| and writes them to +// |out|. |in| and |out| may be equal, but must not otherwise alias. +// +// |in_out_calculated_tag_and_scratch|, on entry, must contain: +// 1. The current value of the calculated tag, which will be updated during +// decryption and written back to the beginning of this buffer on exit. +// 2. The claimed tag, which is needed to derive counter values. +// +// While decrypting, the whole of |in_out_calculated_tag_and_scratch| may be +// used for other purposes. In order to decrypt and update the POLYVAL value, it +// uses the expanded key from |key| and the table of powers in |htable|. 
+extern void aes128gcmsiv_dec(const uint8_t *in, uint8_t *out, + uint8_t in_out_calculated_tag_and_scratch[16 * 8], + const uint8_t htable[16 * 6], + const struct aead_aes_gcm_siv_asm_ctx *key, + size_t in_len); + +// aes256gcmsiv_dec acts like |aes128gcmsiv_dec|, but for AES-256. +extern void aes256gcmsiv_dec(const uint8_t *in, uint8_t *out, + uint8_t in_out_calculated_tag_and_scratch[16 * 8], + const uint8_t htable[16 * 6], + const struct aead_aes_gcm_siv_asm_ctx *key, + size_t in_len); + +// aes128gcmsiv_kdf performs the AES-GCM-SIV KDF given the expanded key from +// |key_schedule| and the nonce in |nonce|. Note that, while only 12 bytes of +// the nonce are used, 16 bytes are read and so the value must be +// right-padded. +extern void aes128gcmsiv_kdf(const uint8_t nonce[16], + uint64_t out_key_material[8], + const uint8_t *key_schedule); + +// aes256gcmsiv_kdf acts like |aes128gcmsiv_kdf|, but for AES-256. +extern void aes256gcmsiv_kdf(const uint8_t nonce[16], + uint64_t out_key_material[12], + const uint8_t *key_schedule); + +// aes128gcmsiv_aes_ks_enc_x1 performs a key expansion of the AES-128 key in +// |key|, writes the expanded key to |out_expanded_key| and encrypts a single +// block from |in| to |out|. +extern void aes128gcmsiv_aes_ks_enc_x1(const uint8_t in[16], uint8_t out[16], + uint8_t out_expanded_key[16 * 15], + const uint64_t key[2]); + +// aes256gcmsiv_aes_ks_enc_x1 acts like |aes128gcmsiv_aes_ks_enc_x1|, but for +// AES-256. +extern void aes256gcmsiv_aes_ks_enc_x1(const uint8_t in[16], uint8_t out[16], + uint8_t out_expanded_key[16 * 15], + const uint64_t key[4]); + +// aes128gcmsiv_ecb_enc_block encrypts a single block from |in| to |out| using +// the expanded key in |expanded_key|. +extern void aes128gcmsiv_ecb_enc_block( + const uint8_t in[16], uint8_t out[16], + const struct aead_aes_gcm_siv_asm_ctx *expanded_key); + +// aes256gcmsiv_ecb_enc_block acts like |aes128gcmsiv_ecb_enc_block|, but for +// AES-256. 
+extern void aes256gcmsiv_ecb_enc_block( + const uint8_t in[16], uint8_t out[16], + const struct aead_aes_gcm_siv_asm_ctx *expanded_key); + +// aes128gcmsiv_enc_msg_x4 encrypts |in_len| bytes from |in| to |out| using the +// expanded key from |key|. (The value of |in_len| must be a multiple of 16.) +// The |in| and |out| buffers may be equal but must not otherwise overlap. The +// initial counter is constructed from the given |tag| as required by +// AES-GCM-SIV. +extern void aes128gcmsiv_enc_msg_x4(const uint8_t *in, uint8_t *out, + const uint8_t *tag, + const struct aead_aes_gcm_siv_asm_ctx *key, + size_t in_len); + +// aes256gcmsiv_enc_msg_x4 acts like |aes128gcmsiv_enc_msg_x4|, but for +// AES-256. +extern void aes256gcmsiv_enc_msg_x4(const uint8_t *in, uint8_t *out, + const uint8_t *tag, + const struct aead_aes_gcm_siv_asm_ctx *key, + size_t in_len); + +// aes128gcmsiv_enc_msg_x8 acts like |aes128gcmsiv_enc_msg_x4|, but is +// optimised for longer messages. +extern void aes128gcmsiv_enc_msg_x8(const uint8_t *in, uint8_t *out, + const uint8_t *tag, + const struct aead_aes_gcm_siv_asm_ctx *key, + size_t in_len); + +// aes256gcmsiv_enc_msg_x8 acts like |aes256gcmsiv_enc_msg_x4|, but is +// optimised for longer messages. +extern void aes256gcmsiv_enc_msg_x8(const uint8_t *in, uint8_t *out, + const uint8_t *tag, + const struct aead_aes_gcm_siv_asm_ctx *key, + size_t in_len); +} + +// asm_ctx_from_ctx returns a 16-byte aligned context pointer from |ctx|. +struct aead_aes_gcm_siv_asm_ctx *asm_ctx_from_ctx(const EVP_AEAD_CTX *ctx) { + // ctx->state must already be 8-byte aligned. Thus, at most, we may need to + // add eight to align it to 16 bytes. 
+ const uintptr_t offset = ((uintptr_t)&ctx->state) & 8; + return (struct aead_aes_gcm_siv_asm_ctx *)(&ctx->state.opaque[offset]); +} + +int aead_aes_gcm_siv_asm_init(EVP_AEAD_CTX *ctx, const uint8_t *key, + size_t key_len, size_t tag_len) { + const size_t key_bits = key_len * 8; + + if (key_bits != 128 && key_bits != 256) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_KEY_LENGTH); + return 0; // EVP_AEAD_CTX_init should catch this. + } + + if (tag_len == EVP_AEAD_DEFAULT_TAG_LENGTH) { + tag_len = EVP_AEAD_AES_GCM_SIV_TAG_LEN; + } + + if (tag_len != EVP_AEAD_AES_GCM_SIV_TAG_LEN) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TAG_TOO_LARGE); + return 0; + } + + struct aead_aes_gcm_siv_asm_ctx *gcm_siv_ctx = asm_ctx_from_ctx(ctx); + assert((((uintptr_t)gcm_siv_ctx) & 15) == 0); + + if (key_bits == 128) { + aes128gcmsiv_aes_ks(key, &gcm_siv_ctx->key[0]); + gcm_siv_ctx->is_128_bit = 1; + } else { + aes256gcmsiv_aes_ks(key, &gcm_siv_ctx->key[0]); + gcm_siv_ctx->is_128_bit = 0; + } + + ctx->tag_len = tag_len; + + return 1; +} + +void aead_aes_gcm_siv_asm_cleanup(EVP_AEAD_CTX *ctx) {} + +// gcm_siv_asm_polyval evaluates POLYVAL at |auth_key| on the given plaintext +// and AD. The result is written to |out_tag|. 
+void gcm_siv_asm_polyval(uint8_t out_tag[16], Span iovecs, + Span aadvecs, + const uint8_t auth_key[16], const uint8_t nonce[12]) { + OPENSSL_memset(out_tag, 0, 16); + const size_t ad_len = bssl::iovec::TotalLength(aadvecs); + const size_t ad_blocks = ad_len / 16; + const size_t in_len = bssl::iovec::TotalLength(iovecs); + const size_t in_blocks = in_len / 16; + int htable_init = 0; + alignas(16) uint8_t htable[16 * 8]; + + if (ad_blocks > 8 || in_blocks > 8) { + htable_init = 1; + aesgcmsiv_htable_init(htable, auth_key); + } + + auto f_whole = [&](const uint8_t *in, size_t len) { + if (htable_init) { + aesgcmsiv_htable_polyval(htable, in, len, out_tag); + } else { + aesgcmsiv_polyval_horner(out_tag, auth_key, in, len / AES_BLOCK_SIZE); + } + return true; + }; + + auto f_final = [&](const uint8_t *in, size_t len) { + size_t len_whole = (len / AES_BLOCK_SIZE) * AES_BLOCK_SIZE; + if (len_whole != 0) { + f_whole(in, len_whole); + in += len_whole; + len -= len_whole; + } + if (len != 0) { + uint8_t pad_buf[AES_BLOCK_SIZE]; + OPENSSL_memcpy(pad_buf, in, len); + OPENSSL_memset(pad_buf + len, 0, AES_BLOCK_SIZE - len); + aesgcmsiv_polyval_horner(out_tag, auth_key, pad_buf, 1); + } + return true; + }; + + bssl::iovec::ForEachBlockRange(aadvecs, f_whole, + f_final); + + bssl::iovec::ForEachBlockRange(iovecs, f_whole, + f_final); + + uint8_t length_block[16]; + CRYPTO_store_u64_le(length_block, ad_len * 8); + CRYPTO_store_u64_le(length_block + 8, in_len * 8); + aesgcmsiv_polyval_horner(out_tag, auth_key, length_block, 1); + + for (size_t i = 0; i < 12; i++) { + out_tag[i] ^= nonce[i]; + } + + out_tag[15] &= 0x7f; +} + +// aead_aes_gcm_siv_asm_crypt_last_block handles the encryption/decryption +// (same thing in CTR mode) of the final block of a plaintext/ciphertext. It +// writes |total_in_len| & 15 bytes to |out_last_block|, based on an initial +// counter derived from |tag|. 
+void aead_aes_gcm_siv_asm_crypt_last_block( + int is_128_bit, uint8_t *out_last_block, const uint8_t *in_last_block, + size_t total_in_len, const uint8_t tag[16], + const struct aead_aes_gcm_siv_asm_ctx *enc_key_expanded) { + alignas(16) uint8_t counter[16]; + OPENSSL_memcpy(&counter, tag, sizeof(counter)); + counter[15] |= 0x80; + inc_counter(counter, static_cast(total_in_len / 16)); + + if (is_128_bit) { + aes128gcmsiv_ecb_enc_block(counter, counter, enc_key_expanded); + } else { + aes256gcmsiv_ecb_enc_block(counter, counter, enc_key_expanded); + } + + const size_t last_bytes_len = total_in_len & 15; + for (size_t i = 0; i < last_bytes_len; i++) { + out_last_block[i] = in_last_block[i] ^ counter[i]; + } +} + +// aead_aes_gcm_siv_kdf calculates the record encryption and authentication +// keys given the |nonce|. +void aead_aes_gcm_siv_kdf(int is_128_bit, + const struct aead_aes_gcm_siv_asm_ctx *gcm_siv_ctx, + uint64_t out_record_auth_key[2], + uint64_t out_record_enc_key[4], + const uint8_t nonce[12]) { + alignas(16) uint8_t padded_nonce[16]; + OPENSSL_memcpy(padded_nonce, nonce, 12); + + alignas(16) uint64_t key_material[12]; + if (is_128_bit) { + aes128gcmsiv_kdf(padded_nonce, key_material, &gcm_siv_ctx->key[0]); + out_record_enc_key[0] = key_material[4]; + out_record_enc_key[1] = key_material[6]; + } else { + aes256gcmsiv_kdf(padded_nonce, key_material, &gcm_siv_ctx->key[0]); + out_record_enc_key[0] = key_material[4]; + out_record_enc_key[1] = key_material[6]; + out_record_enc_key[2] = key_material[8]; + out_record_enc_key[3] = key_material[10]; + } + + out_record_auth_key[0] = key_material[0]; + out_record_auth_key[1] = key_material[2]; +} + +int aead_aes_gcm_siv_asm_sealv(const EVP_AEAD_CTX *ctx, + Span iovecs, + Span out_tag, size_t *out_tag_len, + Span nonce, + Span aadvecs) { + const struct aead_aes_gcm_siv_asm_ctx *gcm_siv_ctx = asm_ctx_from_ctx(ctx); + const size_t in_len = bssl::iovec::TotalLength(iovecs); + const uint64_t in_len_64 = in_len; + const 
size_t ad_len = bssl::iovec::TotalLength(aadvecs); + const uint64_t ad_len_64 = ad_len; + + if (in_len_64 > (UINT64_C(1) << 36) || ad_len_64 >= (UINT64_C(1) << 61)) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TOO_LARGE); + return 0; + } + + if (out_tag.size() < EVP_AEAD_AES_GCM_SIV_TAG_LEN) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BUFFER_TOO_SMALL); + return 0; + } + + if (nonce.size() != EVP_AEAD_AES_GCM_SIV_NONCE_LEN) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE); + return 0; + } + + alignas(16) uint64_t record_auth_key[2]; + alignas(16) uint64_t record_enc_key[4]; + aead_aes_gcm_siv_kdf(gcm_siv_ctx->is_128_bit, gcm_siv_ctx, record_auth_key, + record_enc_key, nonce.data()); + + alignas(16) uint8_t tag[16] = {0}; + gcm_siv_asm_polyval(tag, iovecs, aadvecs, (const uint8_t *)record_auth_key, + nonce.data()); + + struct aead_aes_gcm_siv_asm_ctx enc_key_expanded; + + if (gcm_siv_ctx->is_128_bit) { + aes128gcmsiv_aes_ks_enc_x1(tag, tag, &enc_key_expanded.key[0], + record_enc_key); + // Maintain a counter across calls to assembly. The functions internally set + // the MSB of the last byte, so we only need to update the counter. 
+ alignas(16) uint8_t counter[16]; + OPENSSL_memcpy(counter, tag, 16); + bssl::iovec::ForEachBlockRange( + iovecs, + [&](const uint8_t *in, uint8_t *out, size_t len) { + if (len >= 128) { + aes128gcmsiv_enc_msg_x8(in, out, counter, &enc_key_expanded, len); + } else { + aes128gcmsiv_enc_msg_x4(in, out, counter, &enc_key_expanded, len); + } + inc_counter(counter, static_cast(len / AES_BLOCK_SIZE)); + return true; + }, + [&](const uint8_t *in, uint8_t *out, size_t len) { + size_t len_whole = (len / AES_BLOCK_SIZE) * AES_BLOCK_SIZE; + if (len_whole != 0) { + if (len_whole >= 128) { + aes128gcmsiv_enc_msg_x8(in, out, counter, &enc_key_expanded, + len_whole); + } else { + aes128gcmsiv_enc_msg_x4(in, out, counter, &enc_key_expanded, + len_whole); + } + in += len_whole; + out += len_whole; + len -= len_whole; + } + if (len != 0) { + aead_aes_gcm_siv_asm_crypt_last_block( + /*is_128_bit=*/true, /*out_last_block=*/out, + /*in_last_block=*/in, /*total_in_len=*/in_len, tag, + &enc_key_expanded); + } + return true; + }); + } else { + aes256gcmsiv_aes_ks_enc_x1(tag, tag, &enc_key_expanded.key[0], + record_enc_key); + alignas(16) uint8_t counter[16]; + OPENSSL_memcpy(counter, tag, 16); + bssl::iovec::ForEachBlockRange( + iovecs, + [&](const uint8_t *in, uint8_t *out, size_t len) { + if (len >= 128) { + aes256gcmsiv_enc_msg_x8(in, out, counter, &enc_key_expanded, len); + } else { + aes256gcmsiv_enc_msg_x4(in, out, counter, &enc_key_expanded, len); + } + inc_counter(counter, static_cast(len / AES_BLOCK_SIZE)); + return true; + }, + [&](const uint8_t *in, uint8_t *out, size_t len) { + size_t len_whole = (len / AES_BLOCK_SIZE) * AES_BLOCK_SIZE; + if (len_whole != 0) { + if (len_whole >= 128) { + aes256gcmsiv_enc_msg_x8(in, out, counter, &enc_key_expanded, + len_whole); + } else { + aes256gcmsiv_enc_msg_x4(in, out, counter, &enc_key_expanded, + len_whole); + } + in += len_whole; + out += len_whole; + len -= len_whole; + } + if (len != 0) { + aead_aes_gcm_siv_asm_crypt_last_block( + 
/*is_128_bit=*/false, /*out_last_block=*/out, + /*in_last_block=*/in, /*total_in_len=*/in_len, tag, + &enc_key_expanded); + } + return true; + }); + } + + CopyToPrefix(tag, out_tag); + *out_tag_len = EVP_AEAD_AES_GCM_SIV_TAG_LEN; + + return 1; +} + +int aead_aes_gcm_siv_asm_openv_detached(const EVP_AEAD_CTX *ctx, + Span iovecs, + Span nonce, + Span in_tag, + Span aadvecs) { + const size_t ad_len = bssl::iovec::TotalLength(aadvecs); + const uint64_t ad_len_64 = ad_len; + if (ad_len_64 >= (UINT64_C(1) << 61)) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TOO_LARGE); + return 0; + } + + const size_t in_len = bssl::iovec::TotalLength(iovecs); + const uint64_t in_len_64 = in_len; + if (in_len_64 > UINT64_C(1) << 36 || + in_tag.size() != EVP_AEAD_AES_GCM_SIV_TAG_LEN) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); + return 0; + } + + if (nonce.size() != EVP_AEAD_AES_GCM_SIV_NONCE_LEN) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE); + return 0; + } + + const struct aead_aes_gcm_siv_asm_ctx *gcm_siv_ctx = asm_ctx_from_ctx(ctx); + + alignas(16) uint64_t record_auth_key[2]; + alignas(16) uint64_t record_enc_key[4]; + aead_aes_gcm_siv_kdf(gcm_siv_ctx->is_128_bit, gcm_siv_ctx, record_auth_key, + record_enc_key, nonce.data()); + + struct aead_aes_gcm_siv_asm_ctx expanded_key; + if (gcm_siv_ctx->is_128_bit) { + aes128gcmsiv_aes_ks((const uint8_t *)record_enc_key, &expanded_key.key[0]); + } else { + aes256gcmsiv_aes_ks((const uint8_t *)record_enc_key, &expanded_key.key[0]); + } + // calculated_tag is 16*8 bytes, rather than 16 bytes, because + // aes[128|256]gcmsiv_dec uses the extra as scratch space. 
+ alignas(16) uint8_t calculated_tag[16 * 8] = {0}; + + OPENSSL_memset(calculated_tag, 0, EVP_AEAD_AES_GCM_SIV_TAG_LEN); + bssl::iovec::ForEachBlockRange( + aadvecs, + [&](const uint8_t *in, size_t len) { + aesgcmsiv_polyval_horner(calculated_tag, + (const uint8_t *)record_auth_key, in, + len / AES_BLOCK_SIZE); + return true; + }, + [&](const uint8_t *in, size_t len) { + size_t len_whole = (len / AES_BLOCK_SIZE) * AES_BLOCK_SIZE; + if (len_whole != 0) { + aesgcmsiv_polyval_horner(calculated_tag, + (const uint8_t *)record_auth_key, in, + len_whole / AES_BLOCK_SIZE); + in += len_whole; + len -= len_whole; + } + if (len != 0) { + uint8_t pad_buf[AES_BLOCK_SIZE]; + OPENSSL_memcpy(pad_buf, in, len); + OPENSSL_memset(pad_buf + len, 0, AES_BLOCK_SIZE - len); + aesgcmsiv_polyval_horner( + calculated_tag, (const uint8_t *)record_auth_key, pad_buf, 1); + } + return true; + }); + + alignas(16) uint8_t htable[16 * 6]; + aesgcmsiv_htable6_init(htable, (const uint8_t *)record_auth_key); + + size_t blocks = 0; + bssl::iovec::ForEachBlockRange( + iovecs, + [&](const uint8_t *in, uint8_t *out, size_t len) { + // aes[128|256]gcmsiv_dec needs access to the claimed tag. So it's put + // into its scratch space. The function may clobber the claimed tag, so + // this is copied before each call. + OPENSSL_memcpy(calculated_tag + 16, in_tag.data(), + EVP_AEAD_AES_GCM_SIV_TAG_LEN); + inc_counter(calculated_tag + 16, static_cast(blocks)); + if (gcm_siv_ctx->is_128_bit) { + aes128gcmsiv_dec(in, out, calculated_tag, htable, &expanded_key, len); + } else { + aes256gcmsiv_dec(in, out, calculated_tag, htable, &expanded_key, len); + } + blocks += len / AES_BLOCK_SIZE; + return true; + }, + [&](const uint8_t *in, uint8_t *out, size_t len) { + size_t len_whole = (len / AES_BLOCK_SIZE) * AES_BLOCK_SIZE; + if (len_whole != 0) { + // aes[128|256]gcmsiv_dec needs access to the claimed tag. So it's put + // into its scratch space. 
The function may clobber the claimed tag, + // so this is copied before each call. + OPENSSL_memcpy(calculated_tag + 16, in_tag.data(), + EVP_AEAD_AES_GCM_SIV_TAG_LEN); + inc_counter(calculated_tag + 16, static_cast(blocks)); + if (gcm_siv_ctx->is_128_bit) { + aes128gcmsiv_dec(in, out, calculated_tag, htable, &expanded_key, + len_whole); + } else { + aes256gcmsiv_dec(in, out, calculated_tag, htable, &expanded_key, + len_whole); + } + in += len_whole; + out += len_whole; + len -= len_whole; + } + if (len != 0) { + aead_aes_gcm_siv_asm_crypt_last_block( + gcm_siv_ctx->is_128_bit, /*out_last_block=*/out, + /*in_last_block=*/in, /*total_in_len=*/in_len, in_tag.data(), + &expanded_key); + uint8_t pad_buf[AES_BLOCK_SIZE]; + OPENSSL_memcpy(pad_buf, out, len); + OPENSSL_memset(pad_buf + len, 0, AES_BLOCK_SIZE - len); + aesgcmsiv_polyval_horner( + calculated_tag, (const uint8_t *)record_auth_key, pad_buf, 1); + } + return true; + }); + + uint8_t length_block[16]; + CRYPTO_store_u64_le(length_block, ad_len * 8); + CRYPTO_store_u64_le(length_block + 8, in_len * 8); + aesgcmsiv_polyval_horner(calculated_tag, (const uint8_t *)record_auth_key, + length_block, 1); + + for (size_t i = 0; i < 12; i++) { + calculated_tag[i] ^= nonce[i]; + } + + calculated_tag[15] &= 0x7f; + + if (gcm_siv_ctx->is_128_bit) { + aes128gcmsiv_ecb_enc_block(calculated_tag, calculated_tag, &expanded_key); + } else { + aes256gcmsiv_ecb_enc_block(calculated_tag, calculated_tag, &expanded_key); + } + + if (CRYPTO_memcmp(calculated_tag, in_tag.data(), + EVP_AEAD_AES_GCM_SIV_TAG_LEN) != 0) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); + return 0; + } + + return 1; +} + +const EVP_AEAD aead_aes_128_gcm_siv_asm = { + 16, // key length + EVP_AEAD_AES_GCM_SIV_NONCE_LEN, // nonce length + EVP_AEAD_AES_GCM_SIV_TAG_LEN, // overhead + EVP_AEAD_AES_GCM_SIV_TAG_LEN, // max tag length + + aead_aes_gcm_siv_asm_init, + nullptr /* init_with_direction */, + aead_aes_gcm_siv_asm_cleanup, + nullptr /* openv */, + 
aead_aes_gcm_siv_asm_sealv, + aead_aes_gcm_siv_asm_openv_detached, + nullptr /* get_iv */, + nullptr /* tag_len */, +}; + +const EVP_AEAD aead_aes_256_gcm_siv_asm = { + 32, // key length + EVP_AEAD_AES_GCM_SIV_NONCE_LEN, // nonce length + EVP_AEAD_AES_GCM_SIV_TAG_LEN, // overhead + EVP_AEAD_AES_GCM_SIV_TAG_LEN, // max tag length + + aead_aes_gcm_siv_asm_init, + nullptr /* init_with_direction */, + aead_aes_gcm_siv_asm_cleanup, + nullptr /* openv */, + aead_aes_gcm_siv_asm_sealv, + aead_aes_gcm_siv_asm_openv_detached, + nullptr /* get_iv */, + nullptr /* tag_len */, +}; + +#endif // X86_64 && !NO_ASM && !WINDOWS + + +struct aead_aes_gcm_siv_ctx { + union { + double align; + AES_KEY ks; + } ks; + block128_f kgk_block; + unsigned is_256 : 1; +}; + +static_assert(sizeof(((EVP_AEAD_CTX *)nullptr)->state) >= + sizeof(struct aead_aes_gcm_siv_ctx), + "AEAD state is too small"); +static_assert(alignof(union evp_aead_ctx_st_state) >= + alignof(struct aead_aes_gcm_siv_ctx), + "AEAD state has insufficient alignment"); + +int aead_aes_gcm_siv_init(EVP_AEAD_CTX *ctx, const uint8_t *key, size_t key_len, + size_t tag_len) { + const size_t key_bits = key_len * 8; + + if (key_bits != 128 && key_bits != 256) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_KEY_LENGTH); + return 0; // EVP_AEAD_CTX_init should catch this. 
+ } + + if (tag_len == EVP_AEAD_DEFAULT_TAG_LENGTH) { + tag_len = EVP_AEAD_AES_GCM_SIV_TAG_LEN; + } + if (tag_len != EVP_AEAD_AES_GCM_SIV_TAG_LEN) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TAG_TOO_LARGE); + return 0; + } + + struct aead_aes_gcm_siv_ctx *gcm_siv_ctx = + (struct aead_aes_gcm_siv_ctx *)&ctx->state; + OPENSSL_memset(gcm_siv_ctx, 0, sizeof(struct aead_aes_gcm_siv_ctx)); + + aes_ctr_set_key(&gcm_siv_ctx->ks.ks, nullptr, &gcm_siv_ctx->kgk_block, key, + key_len); + gcm_siv_ctx->is_256 = (key_len == 32); + ctx->tag_len = tag_len; + + return 1; +} + +void aead_aes_gcm_siv_cleanup(EVP_AEAD_CTX *ctx) {} + +// gcm_siv_crypt encrypts (or decrypts—it's the same thing) bytes from |in| to +// |out| in the |iovec|, using the block function |enc_block| with |key| in +// counter mode, starting at |initial_counter|. This differs from the +// traditional counter mode code in that the counter is handled little-endian, +// only the first four bytes are used and the GCM-SIV tweak to the final byte +// is applied. The |in| and |out| pointers may be equal but otherwise must not +// alias. +void gcm_siv_crypt(Span iovecs, + const uint8_t initial_counter[AES_BLOCK_SIZE], + block128_f enc_block, const AES_KEY *key) { + uint8_t counter[16]; + + OPENSSL_memcpy(counter, initial_counter, AES_BLOCK_SIZE); + counter[15] |= 0x80; + + auto crypt_bytes = [&](const uint8_t *in, uint8_t *out, size_t len) { + for (size_t done = 0; done < len;) { + uint8_t keystream[AES_BLOCK_SIZE]; + enc_block(counter, keystream, key); + inc_counter(counter, 1); + + size_t todo = AES_BLOCK_SIZE; + if (len - done < todo) { + todo = len - done; + } + + for (size_t i = 0; i < todo; i++) { + out[done + i] = keystream[i] ^ in[done + i]; + } + + done += todo; + } + return true; + }; + + bssl::iovec::ForEachBlockRange( + iovecs, crypt_bytes, crypt_bytes); +} + + +// POLYVAL. +// +// POLYVAL is a polynomial authenticator that operates over a field very +// similar to the one that GHASH uses. 
See +// https://www.rfc-editor.org/rfc/rfc8452.html#section-3. + +// POLYVAL(H, X_1, ..., X_n) = +// ByteReverse(GHASH(mulX_GHASH(ByteReverse(H)), ByteReverse(X_1), ..., +// ByteReverse(X_n))). +// +// See https://www.rfc-editor.org/rfc/rfc8452.html#appendix-A. + +struct polyval_ctx { + uint8_t S[16]; + u128 Htable[16]; + gmult_func gmult; + ghash_func ghash; +}; + +// byte_reverse reverses the order of the bytes in |b|. +void byte_reverse(uint8_t b[16]) { + uint64_t hi = CRYPTO_load_u64_le(b); + uint64_t lo = CRYPTO_load_u64_le(b + 8); + CRYPTO_store_u64_le(b, CRYPTO_bswap8(lo)); + CRYPTO_store_u64_le(b + 8, CRYPTO_bswap8(hi)); +} + +// reverse_and_mulX_ghash interprets |b| as a reversed element of the GHASH +// field, multiplies that by 'x' and serialises the result back into |b|, but +// with GHASH's backwards bit ordering. +void reverse_and_mulX_ghash(uint8_t b[16]) { + uint64_t hi = CRYPTO_load_u64_le(b); + uint64_t lo = CRYPTO_load_u64_le(b + 8); + const crypto_word_t carry = constant_time_eq_w(hi & 1, 1); + hi >>= 1; + hi |= lo << 63; + lo >>= 1; + lo ^= ((uint64_t)constant_time_select_w(carry, 0xe1, 0)) << 56; + + CRYPTO_store_u64_le(b, CRYPTO_bswap8(lo)); + CRYPTO_store_u64_le(b + 8, CRYPTO_bswap8(hi)); +} + +void crypto_polyval_init(struct polyval_ctx *ctx, const uint8_t key[16]) { + alignas(8) uint8_t H[16]; + OPENSSL_memcpy(H, key, 16); + reverse_and_mulX_ghash(H); + + CRYPTO_ghash_init(&ctx->gmult, &ctx->ghash, ctx->Htable, H); + OPENSSL_memset(&ctx->S, 0, sizeof(ctx->S)); +} + +void crypto_polyval_update_blocks(struct polyval_ctx *ctx, const uint8_t *in, + size_t in_len) { + assert((in_len & 15) == 0); + alignas(8) uint8_t buf[32 * 16]; + + while (in_len > 0) { + size_t todo = in_len; + if (todo > sizeof(buf)) { + todo = sizeof(buf); + } + OPENSSL_memcpy(buf, in, todo); + in += todo; + in_len -= todo; + + size_t blocks = todo / 16; + for (size_t i = 0; i < blocks; i++) { + byte_reverse(buf + 16 * i); + } + + ctx->ghash(ctx->S, ctx->Htable, buf, 
todo); + } +} + +void crypto_polyval_finish(const struct polyval_ctx *ctx, uint8_t out[16]) { + OPENSSL_memcpy(out, &ctx->S, 16); + byte_reverse(out); +} + +// gcm_siv_polyval evaluates POLYVAL at |auth_key| on the given plaintext and +// AD. The result is written to |out_tag|. +void gcm_siv_polyval(uint8_t out_tag[16], Span iovecs, + bool encrypt, Span aadvecs, + const uint8_t auth_key[16], + const uint8_t nonce[EVP_AEAD_AES_GCM_SIV_NONCE_LEN]) { + struct polyval_ctx polyval_ctx; + crypto_polyval_init(&polyval_ctx, auth_key); + + auto f_whole = [&](const uint8_t *in, size_t len) { + crypto_polyval_update_blocks(&polyval_ctx, in, len); + return true; + }; + auto f_final = [&](const uint8_t *in, size_t len) { + size_t len_whole = (len / AES_BLOCK_SIZE) * AES_BLOCK_SIZE; + if (len_whole != 0) { + crypto_polyval_update_blocks(&polyval_ctx, in, len_whole); + in += len_whole; + len -= len_whole; + } + if (len != 0) { + uint8_t pad_buf[AES_BLOCK_SIZE]; + OPENSSL_memcpy(pad_buf, in, len); + OPENSSL_memset(pad_buf + len, 0, AES_BLOCK_SIZE - len); + crypto_polyval_update_blocks(&polyval_ctx, pad_buf, AES_BLOCK_SIZE); + } + return true; + }; + + bssl::iovec::ForEachBlockRange(aadvecs, f_whole, + f_final); + + if (encrypt) { + bssl::iovec::ForEachBlockRange( + iovecs, f_whole, f_final); + } else { + bssl::iovec::ForEachOutBlockRange(iovecs, f_whole, f_final); + } + + uint8_t length_block[16]; + CRYPTO_store_u64_le(length_block, + ((uint64_t)bssl::iovec::TotalLength(aadvecs)) * 8); + CRYPTO_store_u64_le(length_block + 8, + ((uint64_t)bssl::iovec::TotalLength(iovecs)) * 8); + crypto_polyval_update_blocks(&polyval_ctx, length_block, + sizeof(length_block)); + + crypto_polyval_finish(&polyval_ctx, out_tag); + for (size_t i = 0; i < EVP_AEAD_AES_GCM_SIV_NONCE_LEN; i++) { + out_tag[i] ^= nonce[i]; + } + out_tag[15] &= 0x7f; +} + +// gcm_siv_record_keys contains the keys used for a specific GCM-SIV record. 
+struct gcm_siv_record_keys { + uint8_t auth_key[16]; + union { + double align; + AES_KEY ks; + } enc_key; + block128_f enc_block; +}; + +// gcm_siv_keys calculates the keys for a specific GCM-SIV record with the +// given nonce and writes them to |*out_keys|. +void gcm_siv_keys(const struct aead_aes_gcm_siv_ctx *gcm_siv_ctx, + struct gcm_siv_record_keys *out_keys, + const uint8_t nonce[EVP_AEAD_AES_GCM_SIV_NONCE_LEN]) { + const AES_KEY *const key = &gcm_siv_ctx->ks.ks; + uint8_t key_material[(128 /* POLYVAL key */ + 256 /* max AES key */) / 8]; + const size_t blocks_needed = gcm_siv_ctx->is_256 ? 6 : 4; + + uint8_t counter[AES_BLOCK_SIZE]; + OPENSSL_memset(counter, 0, AES_BLOCK_SIZE - EVP_AEAD_AES_GCM_SIV_NONCE_LEN); + OPENSSL_memcpy(counter + AES_BLOCK_SIZE - EVP_AEAD_AES_GCM_SIV_NONCE_LEN, + nonce, EVP_AEAD_AES_GCM_SIV_NONCE_LEN); + for (size_t i = 0; i < blocks_needed; i++) { + counter[0] = i; + + uint8_t ciphertext[AES_BLOCK_SIZE]; + gcm_siv_ctx->kgk_block(counter, ciphertext, key); + OPENSSL_memcpy(&key_material[i * 8], ciphertext, 8); + } + + OPENSSL_memcpy(out_keys->auth_key, key_material, 16); + // Note the |ctr128_f| function uses a big-endian counter, while AES-GCM-SIV + // uses a little-endian counter. We ignore the return value and only use + // |block128_f|. This has a significant performance cost for the fallback + // bitsliced AES implementations (bsaes and aes_nohw). + // + // We currently do not consider AES-GCM-SIV to be performance-sensitive on + // client hardware. If this changes, we can write little-endian |ctr128_f| + // functions. + aes_ctr_set_key(&out_keys->enc_key.ks, nullptr, &out_keys->enc_block, + key_material + 16, gcm_siv_ctx->is_256 ? 
32 : 16); +} + +int aead_aes_gcm_siv_sealv(const EVP_AEAD_CTX *ctx, + Span iovecs, + Span out_tag, size_t *out_tag_len, + Span nonce, + Span aadvecs) { + const struct aead_aes_gcm_siv_ctx *gcm_siv_ctx = + (struct aead_aes_gcm_siv_ctx *)&ctx->state; + size_t in_len = bssl::iovec::TotalLength(iovecs); + size_t ad_len = bssl::iovec::TotalLength(aadvecs); + const uint64_t in_len_64 = in_len; + const uint64_t ad_len_64 = ad_len; + + if (in_len + EVP_AEAD_AES_GCM_SIV_TAG_LEN < in_len || + in_len_64 > (UINT64_C(1) << 36) || ad_len_64 >= (UINT64_C(1) << 61)) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TOO_LARGE); + return 0; + } + + if (out_tag.size() < EVP_AEAD_AES_GCM_SIV_TAG_LEN) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BUFFER_TOO_SMALL); + return 0; + } + + if (nonce.size() != EVP_AEAD_AES_GCM_SIV_NONCE_LEN) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE); + return 0; + } + + struct gcm_siv_record_keys keys; + gcm_siv_keys(gcm_siv_ctx, &keys, nonce.data()); + + uint8_t tag[EVP_AEAD_AES_GCM_SIV_TAG_LEN]; + gcm_siv_polyval(tag, iovecs, true, aadvecs, keys.auth_key, nonce.data()); + keys.enc_block(tag, tag, &keys.enc_key.ks); + + gcm_siv_crypt(iovecs, tag, keys.enc_block, &keys.enc_key.ks); + + CopyToPrefix(tag, out_tag); + *out_tag_len = EVP_AEAD_AES_GCM_SIV_TAG_LEN; + + return 1; +} + +int aead_aes_gcm_siv_openv_detached(const EVP_AEAD_CTX *ctx, + Span iovecs, + Span nonce, + Span in_tag, + Span aadvecs) { + const uint64_t ad_len_64 = bssl::iovec::TotalLength(aadvecs); + if (ad_len_64 >= (UINT64_C(1) << 61)) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TOO_LARGE); + return 0; + } + + const uint64_t in_len_64 = bssl::iovec::TotalLength(iovecs); + if (in_tag.size() != EVP_AEAD_AES_GCM_SIV_TAG_LEN || + in_len_64 > (UINT64_C(1) << 36) + AES_BLOCK_SIZE) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); + return 0; + } + + if (nonce.size() != EVP_AEAD_AES_GCM_SIV_NONCE_LEN) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE); + return 0; + } + + const struct 
aead_aes_gcm_siv_ctx *gcm_siv_ctx = + (struct aead_aes_gcm_siv_ctx *)&ctx->state; + + struct gcm_siv_record_keys keys; + gcm_siv_keys(gcm_siv_ctx, &keys, nonce.data()); + + gcm_siv_crypt(iovecs, in_tag.data(), keys.enc_block, &keys.enc_key.ks); + + uint8_t expected_tag[EVP_AEAD_AES_GCM_SIV_TAG_LEN]; + gcm_siv_polyval(expected_tag, iovecs, false, aadvecs, keys.auth_key, + nonce.data()); + keys.enc_block(expected_tag, expected_tag, &keys.enc_key.ks); + + if (CRYPTO_memcmp(expected_tag, in_tag.data(), sizeof(expected_tag)) != 0) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); + return 0; + } + + return 1; +} + +const EVP_AEAD aead_aes_128_gcm_siv = { + 16, // key length + EVP_AEAD_AES_GCM_SIV_NONCE_LEN, // nonce length + EVP_AEAD_AES_GCM_SIV_TAG_LEN, // overhead + EVP_AEAD_AES_GCM_SIV_TAG_LEN, // max tag length + + aead_aes_gcm_siv_init, + nullptr /* init_with_direction */, + aead_aes_gcm_siv_cleanup, + nullptr /* openv */, + aead_aes_gcm_siv_sealv, + aead_aes_gcm_siv_openv_detached, + nullptr /* get_iv */, + nullptr /* tag_len */, +}; + +const EVP_AEAD aead_aes_256_gcm_siv = { + 32, // key length + EVP_AEAD_AES_GCM_SIV_NONCE_LEN, // nonce length + EVP_AEAD_AES_GCM_SIV_TAG_LEN, // overhead + EVP_AEAD_AES_GCM_SIV_TAG_LEN, // max tag length + + aead_aes_gcm_siv_init, + nullptr /* init_with_direction */, + aead_aes_gcm_siv_cleanup, + nullptr /* openv */, + aead_aes_gcm_siv_sealv, + aead_aes_gcm_siv_openv_detached, + nullptr /* get_iv */, + nullptr /* tag_len */, +}; +} // namespace + +#if defined(AES_GCM_SIV_ASM) + +const EVP_AEAD *EVP_aead_aes_128_gcm_siv() { + if (CRYPTO_is_AVX_capable() && CRYPTO_is_AESNI_capable()) { + return &aead_aes_128_gcm_siv_asm; + } + return &aead_aes_128_gcm_siv; +} + +const EVP_AEAD *EVP_aead_aes_256_gcm_siv() { + if (CRYPTO_is_AVX_capable() && CRYPTO_is_AESNI_capable()) { + return &aead_aes_256_gcm_siv_asm; + } + return &aead_aes_256_gcm_siv; +} + +#else + +const EVP_AEAD *EVP_aead_aes_128_gcm_siv() { return &aead_aes_128_gcm_siv; } + 
+const EVP_AEAD *EVP_aead_aes_256_gcm_siv() { return &aead_aes_256_gcm_siv; } + +#endif // AES_GCM_SIV_ASM diff --git a/third_party/boringssl/src/crypto/cipher/e_chacha20poly1305.cc b/third_party/boringssl/src/crypto/cipher/e_chacha20poly1305.cc new file mode 100644 index 00000000..4f99f6ec --- /dev/null +++ b/third_party/boringssl/src/crypto/cipher/e_chacha20poly1305.cc @@ -0,0 +1,404 @@ +// Copyright 2014 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "../chacha/internal.h" +#include "../fipsmodule/cipher/internal.h" +#include "../internal.h" +#include "internal.h" + +using namespace bssl; + +struct aead_chacha20_poly1305_ctx { + uint8_t key[32]; +}; + +static_assert(sizeof(((EVP_AEAD_CTX *)nullptr)->state) >= + sizeof(struct aead_chacha20_poly1305_ctx), + "AEAD state is too small"); +static_assert(alignof(union evp_aead_ctx_st_state) >= + alignof(struct aead_chacha20_poly1305_ctx), + "AEAD state has insufficient alignment"); + +static int aead_chacha20_poly1305_init(EVP_AEAD_CTX *ctx, const uint8_t *key, + size_t key_len, size_t tag_len) { + struct aead_chacha20_poly1305_ctx *c20_ctx = + (struct aead_chacha20_poly1305_ctx *)&ctx->state; + + if (tag_len == 0) { + tag_len = POLY1305_TAG_LEN; + } + + if (tag_len > POLY1305_TAG_LEN) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TOO_LARGE); + return 0; + } + + if (key_len != 
sizeof(c20_ctx->key)) { + return 0; // internal error - EVP_AEAD_CTX_init should catch this. + } + + OPENSSL_memcpy(c20_ctx->key, key, key_len); + ctx->tag_len = tag_len; + + return 1; +} + +static void aead_chacha20_poly1305_cleanup(EVP_AEAD_CTX *ctx) {} + +static void poly1305_update_length(poly1305_state *poly1305, size_t data_len) { + uint8_t length_bytes[8]; + + for (unsigned i = 0; i < sizeof(length_bytes); i++) { + length_bytes[i] = data_len; + data_len >>= 8; + } + + CRYPTO_poly1305_update(poly1305, length_bytes, sizeof(length_bytes)); +} + +// calc_tag_pre prepares filling |tag| with the authentication tag for the given +// inputs. +static size_t calc_tag_pre(poly1305_state *ctx, const uint8_t key[32], + const uint8_t nonce[12], + Span aadvecs) { + alignas(16) uint8_t poly1305_key[32]; + OPENSSL_memset(poly1305_key, 0, sizeof(poly1305_key)); + CRYPTO_chacha_20(poly1305_key, poly1305_key, sizeof(poly1305_key), key, nonce, + 0); + + static const uint8_t padding[16] = {0}; // Padding is all zeros. + CRYPTO_poly1305_init(ctx, poly1305_key); + size_t ad_len = 0; + for (const CRYPTO_IVEC &aadvec : aadvecs) { + CRYPTO_poly1305_update(ctx, aadvec.in, aadvec.len); + ad_len += aadvec.len; + } + if (ad_len % 16 != 0) { + CRYPTO_poly1305_update(ctx, padding, sizeof(padding) - (ad_len % 16)); + } + return ad_len; +} + +static void calc_tag_post(poly1305_state *ctx, uint8_t tag[POLY1305_TAG_LEN], + size_t ciphertext_total, size_t ad_len) { + static const uint8_t padding[16] = {0}; // Padding is all zeros. 
+ if (ciphertext_total % 16 != 0) { + CRYPTO_poly1305_update(ctx, padding, + sizeof(padding) - (ciphertext_total % 16)); + } + poly1305_update_length(ctx, ad_len); + poly1305_update_length(ctx, ciphertext_total); + CRYPTO_poly1305_finish(ctx, tag); +} + +static int chacha20_poly1305_sealv(const uint8_t *key, + Span iovecs, + Span out_tag, size_t *out_tag_len, + Span nonce, + Span aadvecs, + size_t tag_len) { + if (out_tag.size() < tag_len) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BUFFER_TOO_SMALL); + return 0; + } + if (nonce.size() != 12) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE); + return 0; + } + + // |CRYPTO_chacha_20| uses a 32-bit block counter. Therefore we disallow + // individual operations that work on more than 256GB at a time. + // |in_len_64| is needed because, on 32-bit platforms, size_t is only + // 32-bits and this produces a warning because it's always false. + // Casting to uint64_t inside the conditional is not sufficient to stop + // the warning. + const uint64_t in_len_64 = bssl::iovec::TotalLength(iovecs); + if (in_len_64 >= (UINT64_C(1) << 32) * 64 - 64) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TOO_LARGE); + return 0; + } + + union chacha20_poly1305_seal_data data; + if (chacha20_poly1305_asm_capable() && iovecs.size() <= 2 && + aadvecs.size() <= 1) { + OPENSSL_memcpy(data.in.key, key, 32); + data.in.counter = 0; + CopySpan(nonce, data.in.nonce); + if (iovecs.size() >= 2) { + // |chacha20_poly1305_seal| only supports one extra input and expects it + // to have been encrypted ahead of time. (Historically it was only used + // for very short inputs.) 
+ constexpr size_t kChaChaBlockSize = 64; + uint32_t block_counter = + (uint32_t)(1 + (iovecs[0].len / kChaChaBlockSize)); + size_t offset = iovecs[0].len % kChaChaBlockSize; + size_t done = 0; + if (offset != 0) { + uint8_t block[kChaChaBlockSize]; + memset(block, 0, sizeof(block)); + CRYPTO_chacha_20(block, block, sizeof(block), key, nonce.data(), + block_counter); + for (size_t i = offset; i < sizeof(block) && done < iovecs[1].len; + i++, done++) { + iovecs[1].out[done] = iovecs[1].in[done] ^ block[i]; + } + ++block_counter; + } + if (done < iovecs[1].len) { + CRYPTO_chacha_20(iovecs[1].out + done, iovecs[1].in + done, + iovecs[1].len - done, key, nonce.data(), + block_counter); + } + // TODO(crbug.com/473454967): Support more than 1 extra ciphertext. + data.in.extra_ciphertext = iovecs[1].out; + data.in.extra_ciphertext_len = iovecs[1].len; + } else { + data.in.extra_ciphertext = nullptr; + data.in.extra_ciphertext_len = 0; + } + chacha20_poly1305_seal(iovecs.size() >= 1 ? iovecs[0].out : nullptr, + iovecs.size() >= 1 ? iovecs[0].in : nullptr, + iovecs.size() >= 1 ? iovecs[0].len : 0, + aadvecs.size() >= 1 ? aadvecs[0].in : nullptr, + aadvecs.size() >= 1 ? aadvecs[0].len : 0, &data); + } else { + poly1305_state ctx; + size_t ad_len = calc_tag_pre(&ctx, key, nonce.data(), aadvecs); + + size_t ciphertext_total = 0; + size_t block = 1; + bssl::iovec::ForEachBlockRange<64, /*WriteOut=*/true>( + iovecs, + [&](const uint8_t *in, uint8_t *out, size_t len) { + // TODO(crbug.com/473454967): Maybe just provide asm version of this? + // Here, len is always a multiple of 64. + CRYPTO_chacha_20(out, in, len, key, nonce.data(), block); + CRYPTO_poly1305_update(&ctx, out, len); + ciphertext_total += len; + block += len / 64; + return true; + }, + [&](const uint8_t *in, uint8_t *out, size_t len) { + // Here, len may be anything. If an asm version can't handle that, + // it will be worth splitting off multiples of 64 here. 
+ CRYPTO_chacha_20(out, in, len, key, nonce.data(), block); + CRYPTO_poly1305_update(&ctx, out, len); + ciphertext_total += len; + return true; + }); + + calc_tag_post(&ctx, data.out.tag, ciphertext_total, ad_len); + } + + CopyToPrefix(Span(data.out.tag).first(tag_len), out_tag); + *out_tag_len = tag_len; + return 1; +} + +static int aead_chacha20_poly1305_sealv(const EVP_AEAD_CTX *ctx, + Span iovecs, + Span out_tag, + size_t *out_tag_len, + Span nonce, + Span aadvecs) { + const struct aead_chacha20_poly1305_ctx *c20_ctx = + (struct aead_chacha20_poly1305_ctx *)&ctx->state; + + return chacha20_poly1305_sealv(c20_ctx->key, iovecs, out_tag, out_tag_len, + nonce, aadvecs, ctx->tag_len); +} + +static int aead_xchacha20_poly1305_sealv(const EVP_AEAD_CTX *ctx, + Span iovecs, + Span out_tag, + size_t *out_tag_len, + Span nonce, + Span aadvecs) { + const struct aead_chacha20_poly1305_ctx *c20_ctx = + (struct aead_chacha20_poly1305_ctx *)&ctx->state; + + if (nonce.size() != 24) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE); + return 0; + } + + alignas(4) uint8_t derived_key[32]; + alignas(4) uint8_t derived_nonce[12]; + CRYPTO_hchacha20(derived_key, c20_ctx->key, nonce.data()); + OPENSSL_memset(derived_nonce, 0, 4); + OPENSSL_memcpy(&derived_nonce[4], &nonce[16], 8); + + return chacha20_poly1305_sealv(derived_key, iovecs, out_tag, out_tag_len, + derived_nonce, aadvecs, ctx->tag_len); +} + +static int chacha20_poly1305_openv_detached(const uint8_t *key, + Span iovecs, + Span nonce, + Span in_tag, + Span aadvecs, + size_t tag_len) { + if (nonce.size() != 12) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE); + return 0; + } + + if (in_tag.size() != tag_len) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); + return 0; + } + + // |CRYPTO_chacha_20| uses a 32-bit block counter. Therefore we disallow + // individual operations that work on more than 256GB at a time. 
+ // |in_len_64| is needed because, on 32-bit platforms, size_t is only + // 32-bits and this produces a warning because it's always false. + // Casting to uint64_t inside the conditional is not sufficient to stop + // the warning. + const uint64_t in_len_64 = bssl::iovec::TotalLength(iovecs); + if (in_len_64 >= (UINT64_C(1) << 32) * 64 - 64) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TOO_LARGE); + return 0; + } + + union chacha20_poly1305_open_data data; + if (chacha20_poly1305_asm_capable() && iovecs.size() <= 1 && + aadvecs.size() <= 1) { + // TODO(crbug.com/473454967): Support more than 1 ciphertext segment. + OPENSSL_memcpy(data.in.key, key, 32); + data.in.counter = 0; + CopySpan(nonce, data.in.nonce); + chacha20_poly1305_open(iovecs.size() >= 1 ? iovecs[0].out : nullptr, + iovecs.size() >= 1 ? iovecs[0].in : nullptr, + iovecs.size() >= 1 ? iovecs[0].len : 0, + aadvecs.size() >= 1 ? aadvecs[0].in : nullptr, + aadvecs.size() >= 1 ? aadvecs[0].len : 0, &data); + } else { + poly1305_state ctx; + size_t ad_len = calc_tag_pre(&ctx, key, nonce.data(), aadvecs); + + size_t ciphertext_total = 0; + size_t block = 1; + bssl::iovec::ForEachBlockRange<64, /*WriteOut=*/true>( + iovecs, + [&](const uint8_t *in, uint8_t *out, size_t len) { + // TODO(crbug.com/473454967): Maybe just provide asm version of this? + // Here, len is always a multiple of 64. + CRYPTO_poly1305_update(&ctx, in, len); + CRYPTO_chacha_20(out, in, len, key, nonce.data(), block); + ciphertext_total += len; + block += len / 64; + return true; + }, + [&](const uint8_t *in, uint8_t *out, size_t len) { + // Here, len may be anything. If an asm version can't handle that, + // it will be worth splitting off multiples of 64 here. 
+ CRYPTO_poly1305_update(&ctx, in, len); + CRYPTO_chacha_20(out, in, len, key, nonce.data(), block); + ciphertext_total += len; + return true; + }); + + calc_tag_post(&ctx, data.out.tag, ciphertext_total, ad_len); + } + + if (CRYPTO_memcmp(data.out.tag, in_tag.data(), tag_len) != 0) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); + return 0; + } + + return 1; +} + +static int aead_chacha20_poly1305_openv_detached( + const EVP_AEAD_CTX *ctx, Span iovecs, + Span nonce, Span in_tag, + Span aadvecs) { + const struct aead_chacha20_poly1305_ctx *c20_ctx = + (struct aead_chacha20_poly1305_ctx *)&ctx->state; + + return chacha20_poly1305_openv_detached(c20_ctx->key, iovecs, nonce, in_tag, + aadvecs, ctx->tag_len); +} + +static int aead_xchacha20_poly1305_openv_detached( + const EVP_AEAD_CTX *ctx, Span iovecs, + Span nonce, Span in_tag, + Span aadvecs) { + const struct aead_chacha20_poly1305_ctx *c20_ctx = + (struct aead_chacha20_poly1305_ctx *)&ctx->state; + + if (nonce.size() != 24) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE); + return 0; + } + + alignas(4) uint8_t derived_key[32]; + alignas(4) uint8_t derived_nonce[12]; + CRYPTO_hchacha20(derived_key, c20_ctx->key, nonce.data()); + OPENSSL_memset(derived_nonce, 0, 4); + OPENSSL_memcpy(&derived_nonce[4], &nonce[16], 8); + + return chacha20_poly1305_openv_detached(derived_key, iovecs, derived_nonce, + in_tag, aadvecs, ctx->tag_len); +} + +static const EVP_AEAD aead_chacha20_poly1305 = { + 32, // key len + 12, // nonce len + POLY1305_TAG_LEN, // overhead + POLY1305_TAG_LEN, // max tag length + + aead_chacha20_poly1305_init, + nullptr, // init_with_direction + aead_chacha20_poly1305_cleanup, + nullptr, // openv + aead_chacha20_poly1305_sealv, + aead_chacha20_poly1305_openv_detached, + nullptr, // get_iv + nullptr, // tag_len +}; + +static const EVP_AEAD aead_xchacha20_poly1305 = { + 32, // key len + 24, // nonce len + POLY1305_TAG_LEN, // overhead + POLY1305_TAG_LEN, // max tag length + + 
aead_chacha20_poly1305_init, + nullptr, // init_with_direction + aead_chacha20_poly1305_cleanup, + nullptr, // openv + aead_xchacha20_poly1305_sealv, + aead_xchacha20_poly1305_openv_detached, + nullptr, // get_iv + nullptr, // tag_len +}; + +const EVP_AEAD *EVP_aead_chacha20_poly1305() { return &aead_chacha20_poly1305; } + +const EVP_AEAD *EVP_aead_xchacha20_poly1305() { + return &aead_xchacha20_poly1305; +} diff --git a/third_party/boringssl/src/crypto/cipher/e_des.cc b/third_party/boringssl/src/crypto/cipher/e_des.cc new file mode 100644 index 00000000..29cc21e9 --- /dev/null +++ b/third_party/boringssl/src/crypto/cipher/e_des.cc @@ -0,0 +1,214 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include +#include + +#include "../des/internal.h" +#include "../fipsmodule/cipher/internal.h" +#include "internal.h" + + +using namespace bssl; + +typedef struct { + union { + double align; + DES_key_schedule ks; + } ks; +} EVP_DES_KEY; + +static int des_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key, + const uint8_t *iv, int enc) { + EVP_DES_KEY *dat = (EVP_DES_KEY *)ctx->cipher_data; + DES_set_key_ex(key, &dat->ks.ks); + return 1; +} + +static int des_cbc_cipher_update(EVP_CIPHER_CTX *ctx, uint8_t *out, + const uint8_t *in, size_t len) { + EVP_DES_KEY *dat = (EVP_DES_KEY *)ctx->cipher_data; + DES_ncbc_encrypt_ex(in, out, len, &dat->ks.ks, ctx->iv, ctx->encrypt); + return 1; +} + +static const EVP_CIPHER evp_des_cbc = { + /*nid=*/NID_des_cbc, + /*block_size=*/8, + /*key_len=*/8, + /*iv_len=*/8, + /*ctx_size=*/sizeof(EVP_DES_KEY), + /*flags=*/EVP_CIPH_CBC_MODE, + /*init=*/des_init_key, + /*cipher_update=*/des_cbc_cipher_update, + /*cipher_final=*/nullptr, + /*update_aad=*/nullptr, + /*cleanup=*/nullptr, + /*ctrl=*/nullptr, +}; + +const EVP_CIPHER *EVP_des_cbc() { return &evp_des_cbc; } + +static int des_ecb_cipher_update(EVP_CIPHER_CTX *ctx, uint8_t *out, + const uint8_t *in, size_t len) { + if (len < ctx->cipher->block_size) { + return 1; + } + len -= ctx->cipher->block_size; + + EVP_DES_KEY *dat = (EVP_DES_KEY *)ctx->cipher_data; + for (size_t i = 0; i <= len; i += ctx->cipher->block_size) { + DES_ecb_encrypt_ex(in + i, out + i, &dat->ks.ks, ctx->encrypt); + } + return 1; +} + +static const EVP_CIPHER evp_des_ecb = { + /*nid=*/NID_des_ecb, + /*block_size=*/8, + /*key_len=*/8, + /*iv_len=*/0, + /*ctx_size=*/sizeof(EVP_DES_KEY), + /*flags=*/EVP_CIPH_ECB_MODE, + /*init=*/des_init_key, + /*cipher_update=*/des_ecb_cipher_update, + /*cipher_final=*/nullptr, + /*update_aad=*/nullptr, + /*cleanup=*/nullptr, + /*ctrl=*/nullptr, +}; + +const EVP_CIPHER *EVP_des_ecb() { return &evp_des_ecb; } + +typedef struct { + union { + double align; + 
DES_key_schedule ks[3]; + } ks; +} DES_EDE_KEY; + +static int des_ede3_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key, + const uint8_t *iv, int enc) { + DES_EDE_KEY *dat = (DES_EDE_KEY *)ctx->cipher_data; + DES_set_key_ex(key, &dat->ks.ks[0]); + DES_set_key_ex(key + 8, &dat->ks.ks[1]); + DES_set_key_ex(key + 16, &dat->ks.ks[2]); + return 1; +} + +static int des_ede3_cbc_cipher_update(EVP_CIPHER_CTX *ctx, uint8_t *out, + const uint8_t *in, size_t len) { + DES_EDE_KEY *dat = (DES_EDE_KEY *)ctx->cipher_data; + DES_ede3_cbc_encrypt_ex(in, out, len, &dat->ks.ks[0], &dat->ks.ks[1], + &dat->ks.ks[2], ctx->iv, ctx->encrypt); + return 1; +} + +static const EVP_CIPHER evp_des_ede3_cbc = { + /*nid=*/NID_des_ede3_cbc, + /*block_size=*/8, + /*key_len=*/24, + /*iv_len=*/8, + /*ctx_size=*/sizeof(DES_EDE_KEY), + /*flags=*/EVP_CIPH_CBC_MODE, + /*init=*/des_ede3_init_key, + /*cipher_update=*/des_ede3_cbc_cipher_update, + /*cipher_final=*/nullptr, + /*update_aad=*/nullptr, + /*cleanup=*/nullptr, + /*ctrl=*/nullptr, +}; + +const EVP_CIPHER *EVP_des_ede3_cbc() { return &evp_des_ede3_cbc; } + +static int des_ede_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key, + const uint8_t *iv, int enc) { + DES_EDE_KEY *dat = (DES_EDE_KEY *)ctx->cipher_data; + // 2-DES is 3-DES with the first key used twice. 
+ DES_set_key_ex(key, &dat->ks.ks[0]); + DES_set_key_ex(key + 8, &dat->ks.ks[1]); + DES_set_key_ex(key, &dat->ks.ks[2]); + return 1; +} + +static const EVP_CIPHER evp_des_ede_cbc = { + /*nid=*/NID_des_ede_cbc, + /*block_size=*/8, + /*key_len=*/16, + /*iv_len=*/8, + /*ctx_size=*/sizeof(DES_EDE_KEY), + /*flags=*/EVP_CIPH_CBC_MODE, + /*init=*/des_ede_init_key, + /*cipher_update=*/des_ede3_cbc_cipher_update, + /*cipher_final=*/nullptr, + /*update_aad=*/nullptr, + /*cleanup=*/nullptr, + /*ctrl=*/nullptr, +}; + +const EVP_CIPHER *EVP_des_ede_cbc() { return &evp_des_ede_cbc; } + +static int des_ede_ecb_cipher_update(EVP_CIPHER_CTX *ctx, uint8_t *out, + const uint8_t *in, size_t len) { + if (len < ctx->cipher->block_size) { + return 1; + } + len -= ctx->cipher->block_size; + + DES_EDE_KEY *dat = (DES_EDE_KEY *)ctx->cipher_data; + for (size_t i = 0; i <= len; i += ctx->cipher->block_size) { + DES_ecb3_encrypt_ex(in + i, out + i, &dat->ks.ks[0], &dat->ks.ks[1], + &dat->ks.ks[2], ctx->encrypt); + } + return 1; +} + +static const EVP_CIPHER evp_des_ede = { + /*nid=*/NID_des_ede_ecb, + /*block_size=*/8, + /*key_len=*/16, + /*iv_len=*/0, + /*ctx_size=*/sizeof(DES_EDE_KEY), + /*flags=*/EVP_CIPH_ECB_MODE, + /*init=*/des_ede_init_key, + /*cipher_update=*/des_ede_ecb_cipher_update, + /*cipher_final=*/nullptr, + /*update_aad=*/nullptr, + /*cleanup=*/nullptr, + /*ctrl=*/nullptr, +}; + +const EVP_CIPHER *EVP_des_ede() { return &evp_des_ede; } + +static const EVP_CIPHER evp_des_ede3 = { + /*nid=*/NID_des_ede3_ecb, + /*block_size=*/8, + /*key_len=*/24, + /*iv_len=*/0, + /*ctx_size=*/sizeof(DES_EDE_KEY), + /*flags=*/EVP_CIPH_ECB_MODE, + /*init=*/des_ede3_init_key, + /*cipher_update=*/des_ede_ecb_cipher_update, + /*cipher_final=*/nullptr, + /*update_aad=*/nullptr, + /*cleanup=*/nullptr, + /*ctrl=*/nullptr, +}; + +const EVP_CIPHER *EVP_des_ede3() { return &evp_des_ede3; } + +const EVP_CIPHER *EVP_des_ede3_ecb() { return EVP_des_ede3(); } diff --git 
a/third_party/boringssl/src/crypto/cipher/e_null.cc b/third_party/boringssl/src/crypto/cipher/e_null.cc new file mode 100644 index 00000000..1338d192 --- /dev/null +++ b/third_party/boringssl/src/crypto/cipher/e_null.cc @@ -0,0 +1,55 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include + +#include "../fipsmodule/cipher/internal.h" +#include "../internal.h" + + +using namespace bssl; + +static int null_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key, + const uint8_t *iv, int enc) { + return 1; +} + +static int null_cipher_update(EVP_CIPHER_CTX *ctx, uint8_t *out, + const uint8_t *in, size_t len) { + if (in != out) { + OPENSSL_memcpy(out, in, len); + } + return 1; +} + +static const EVP_CIPHER n_cipher = { + /*nid=*/NID_undef, + /*block_size=*/1, + /*key_len=*/0, + /*iv_len=*/0, + /*ctx_size=*/0, + /*flags=*/0, + /*init=*/null_init_key, + /*cipher_update=*/null_cipher_update, + /*cipher_final=*/nullptr, + /*update_aad=*/nullptr, + /*cleanup=*/nullptr, + /*ctrl=*/nullptr, +}; + +const EVP_CIPHER *EVP_enc_null() { return &n_cipher; } diff --git a/third_party/boringssl/src/crypto/cipher/e_rc2.cc b/third_party/boringssl/src/crypto/cipher/e_rc2.cc new file mode 100644 index 00000000..91c5f52d --- /dev/null +++ b/third_party/boringssl/src/crypto/cipher/e_rc2.cc @@ -0,0 +1,421 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "../fipsmodule/cipher/internal.h" +#include "../internal.h" + + +#define c2l(c, l) \ + do { \ + (l) = ((uint32_t)(*((c)++))); \ + (l) |= ((uint32_t)(*((c)++))) << 8L; \ + (l) |= ((uint32_t)(*((c)++))) << 16L; \ + (l) |= ((uint32_t)(*((c)++))) << 24L; \ + } while (0) + +#define c2ln(c, l1, l2, n) \ + do { \ + (c) += (n); \ + (l1) = (l2) = 0; \ + switch (n) { \ + case 8: \ + (l2) = ((uint32_t)(*(--(c)))) << 24L; \ + [[fallthrough]]; \ + case 7: \ + (l2) |= ((uint32_t)(*(--(c)))) << 16L; \ + [[fallthrough]]; \ + case 6: \ + (l2) |= ((uint32_t)(*(--(c)))) << 8L; \ + [[fallthrough]]; \ + case 5: \ + (l2) |= ((uint32_t)(*(--(c)))); \ + [[fallthrough]]; \ + case 4: \ + (l1) = ((uint32_t)(*(--(c)))) << 24L; \ + [[fallthrough]]; \ + case 3: \ + (l1) |= ((uint32_t)(*(--(c)))) << 16L; \ + [[fallthrough]]; \ + case 2: \ + (l1) |= ((uint32_t)(*(--(c)))) << 8L; \ + [[fallthrough]]; \ + case 1: \ + (l1) |= ((uint32_t)(*(--(c)))); \ + } \ + } while (0) + +#define l2c(l, c) \ + do { \ + *((c)++) = (uint8_t)(((l)) & 0xff); \ + *((c)++) = (uint8_t)(((l) >> 8L) & 0xff); \ + *((c)++) = (uint8_t)(((l) >> 16L) & 0xff); \ + *((c)++) = (uint8_t)(((l) >> 24L) & 0xff); \ + } while (0) + +#define l2cn(l1, l2, c, n) \ + do { \ + (c) += (n); \ + switch (n) { \ + case 8: \ + *(--(c)) = (uint8_t)(((l2) >> 24L) & 0xff); \ + [[fallthrough]]; \ + case 7: \ + *(--(c)) = (uint8_t)(((l2) >> 16L) & 0xff); \ + 
[[fallthrough]]; \ + case 6: \ + *(--(c)) = (uint8_t)(((l2) >> 8L) & 0xff); \ + [[fallthrough]]; \ + case 5: \ + *(--(c)) = (uint8_t)(((l2)) & 0xff); \ + [[fallthrough]]; \ + case 4: \ + *(--(c)) = (uint8_t)(((l1) >> 24L) & 0xff); \ + [[fallthrough]]; \ + case 3: \ + *(--(c)) = (uint8_t)(((l1) >> 16L) & 0xff); \ + [[fallthrough]]; \ + case 2: \ + *(--(c)) = (uint8_t)(((l1) >> 8L) & 0xff); \ + [[fallthrough]]; \ + case 1: \ + *(--(c)) = (uint8_t)(((l1)) & 0xff); \ + } \ + } while (0) + +typedef struct rc2_key_st { + uint16_t data[64]; +} RC2_KEY; + +static void RC2_encrypt(uint32_t *d, RC2_KEY *key) { + int i, n; + uint16_t *p0, *p1; + uint16_t x0, x1, x2, x3, t; + uint32_t l; + + l = d[0]; + x0 = (uint16_t)l & 0xffff; + x1 = (uint16_t)(l >> 16L); + l = d[1]; + x2 = (uint16_t)l & 0xffff; + x3 = (uint16_t)(l >> 16L); + + n = 3; + i = 5; + + p0 = p1 = &key->data[0]; + for (;;) { + t = (x0 + (x1 & ~x3) + (x2 & x3) + *(p0++)) & 0xffff; + x0 = (t << 1) | (t >> 15); + t = (x1 + (x2 & ~x0) + (x3 & x0) + *(p0++)) & 0xffff; + x1 = (t << 2) | (t >> 14); + t = (x2 + (x3 & ~x1) + (x0 & x1) + *(p0++)) & 0xffff; + x2 = (t << 3) | (t >> 13); + t = (x3 + (x0 & ~x2) + (x1 & x2) + *(p0++)) & 0xffff; + x3 = (t << 5) | (t >> 11); + + if (--i == 0) { + if (--n == 0) { + break; + } + i = (n == 2) ? 
6 : 5; + + x0 += p1[x3 & 0x3f]; + x1 += p1[x0 & 0x3f]; + x2 += p1[x1 & 0x3f]; + x3 += p1[x2 & 0x3f]; + } + } + + d[0] = (uint32_t)(x0 & 0xffff) | ((uint32_t)(x1 & 0xffff) << 16L); + d[1] = (uint32_t)(x2 & 0xffff) | ((uint32_t)(x3 & 0xffff) << 16L); +} + +static void RC2_decrypt(uint32_t *d, RC2_KEY *key) { + int i, n; + uint16_t *p0, *p1; + uint16_t x0, x1, x2, x3, t; + uint32_t l; + + l = d[0]; + x0 = (uint16_t)l & 0xffff; + x1 = (uint16_t)(l >> 16L); + l = d[1]; + x2 = (uint16_t)l & 0xffff; + x3 = (uint16_t)(l >> 16L); + + n = 3; + i = 5; + + p0 = &key->data[63]; + p1 = &key->data[0]; + for (;;) { + t = ((x3 << 11) | (x3 >> 5)) & 0xffff; + x3 = (t - (x0 & ~x2) - (x1 & x2) - *(p0--)) & 0xffff; + t = ((x2 << 13) | (x2 >> 3)) & 0xffff; + x2 = (t - (x3 & ~x1) - (x0 & x1) - *(p0--)) & 0xffff; + t = ((x1 << 14) | (x1 >> 2)) & 0xffff; + x1 = (t - (x2 & ~x0) - (x3 & x0) - *(p0--)) & 0xffff; + t = ((x0 << 15) | (x0 >> 1)) & 0xffff; + x0 = (t - (x1 & ~x3) - (x2 & x3) - *(p0--)) & 0xffff; + + if (--i == 0) { + if (--n == 0) { + break; + } + i = (n == 2) ? 
6 : 5; + + x3 = (x3 - p1[x2 & 0x3f]) & 0xffff; + x2 = (x2 - p1[x1 & 0x3f]) & 0xffff; + x1 = (x1 - p1[x0 & 0x3f]) & 0xffff; + x0 = (x0 - p1[x3 & 0x3f]) & 0xffff; + } + } + + d[0] = (uint32_t)(x0 & 0xffff) | ((uint32_t)(x1 & 0xffff) << 16L); + d[1] = (uint32_t)(x2 & 0xffff) | ((uint32_t)(x3 & 0xffff) << 16L); +} + +static void RC2_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t length, + RC2_KEY *ks, uint8_t *iv, int encrypt) { + uint32_t tin0, tin1; + uint32_t tout0, tout1, xor0, xor1; + long l = length; + uint32_t tin[2]; + + if (encrypt) { + c2l(iv, tout0); + c2l(iv, tout1); + iv -= 8; + for (l -= 8; l >= 0; l -= 8) { + c2l(in, tin0); + c2l(in, tin1); + tin0 ^= tout0; + tin1 ^= tout1; + tin[0] = tin0; + tin[1] = tin1; + RC2_encrypt(tin, ks); + tout0 = tin[0]; + l2c(tout0, out); + tout1 = tin[1]; + l2c(tout1, out); + } + if (l != -8) { + c2ln(in, tin0, tin1, l + 8); + tin0 ^= tout0; + tin1 ^= tout1; + tin[0] = tin0; + tin[1] = tin1; + RC2_encrypt(tin, ks); + tout0 = tin[0]; + l2c(tout0, out); + tout1 = tin[1]; + l2c(tout1, out); + } + l2c(tout0, iv); + l2c(tout1, iv); + } else { + c2l(iv, xor0); + c2l(iv, xor1); + iv -= 8; + for (l -= 8; l >= 0; l -= 8) { + c2l(in, tin0); + tin[0] = tin0; + c2l(in, tin1); + tin[1] = tin1; + RC2_decrypt(tin, ks); + tout0 = tin[0] ^ xor0; + tout1 = tin[1] ^ xor1; + l2c(tout0, out); + l2c(tout1, out); + xor0 = tin0; + xor1 = tin1; + } + if (l != -8) { + c2l(in, tin0); + tin[0] = tin0; + c2l(in, tin1); + tin[1] = tin1; + RC2_decrypt(tin, ks); + tout0 = tin[0] ^ xor0; + tout1 = tin[1] ^ xor1; + l2cn(tout0, tout1, out, l + 8); + xor0 = tin0; + xor1 = tin1; + } + l2c(xor0, iv); + l2c(xor1, iv); + } + tin[0] = tin[1] = 0; +} + +static const uint8_t key_table[256] = { + 0xd9, 0x78, 0xf9, 0xc4, 0x19, 0xdd, 0xb5, 0xed, 0x28, 0xe9, 0xfd, 0x79, + 0x4a, 0xa0, 0xd8, 0x9d, 0xc6, 0x7e, 0x37, 0x83, 0x2b, 0x76, 0x53, 0x8e, + 0x62, 0x4c, 0x64, 0x88, 0x44, 0x8b, 0xfb, 0xa2, 0x17, 0x9a, 0x59, 0xf5, + 0x87, 0xb3, 0x4f, 0x13, 0x61, 0x45, 0x6d, 0x8d, 
0x09, 0x81, 0x7d, 0x32, + 0xbd, 0x8f, 0x40, 0xeb, 0x86, 0xb7, 0x7b, 0x0b, 0xf0, 0x95, 0x21, 0x22, + 0x5c, 0x6b, 0x4e, 0x82, 0x54, 0xd6, 0x65, 0x93, 0xce, 0x60, 0xb2, 0x1c, + 0x73, 0x56, 0xc0, 0x14, 0xa7, 0x8c, 0xf1, 0xdc, 0x12, 0x75, 0xca, 0x1f, + 0x3b, 0xbe, 0xe4, 0xd1, 0x42, 0x3d, 0xd4, 0x30, 0xa3, 0x3c, 0xb6, 0x26, + 0x6f, 0xbf, 0x0e, 0xda, 0x46, 0x69, 0x07, 0x57, 0x27, 0xf2, 0x1d, 0x9b, + 0xbc, 0x94, 0x43, 0x03, 0xf8, 0x11, 0xc7, 0xf6, 0x90, 0xef, 0x3e, 0xe7, + 0x06, 0xc3, 0xd5, 0x2f, 0xc8, 0x66, 0x1e, 0xd7, 0x08, 0xe8, 0xea, 0xde, + 0x80, 0x52, 0xee, 0xf7, 0x84, 0xaa, 0x72, 0xac, 0x35, 0x4d, 0x6a, 0x2a, + 0x96, 0x1a, 0xd2, 0x71, 0x5a, 0x15, 0x49, 0x74, 0x4b, 0x9f, 0xd0, 0x5e, + 0x04, 0x18, 0xa4, 0xec, 0xc2, 0xe0, 0x41, 0x6e, 0x0f, 0x51, 0xcb, 0xcc, + 0x24, 0x91, 0xaf, 0x50, 0xa1, 0xf4, 0x70, 0x39, 0x99, 0x7c, 0x3a, 0x85, + 0x23, 0xb8, 0xb4, 0x7a, 0xfc, 0x02, 0x36, 0x5b, 0x25, 0x55, 0x97, 0x31, + 0x2d, 0x5d, 0xfa, 0x98, 0xe3, 0x8a, 0x92, 0xae, 0x05, 0xdf, 0x29, 0x10, + 0x67, 0x6c, 0xba, 0xc9, 0xd3, 0x00, 0xe6, 0xcf, 0xe1, 0x9e, 0xa8, 0x2c, + 0x63, 0x16, 0x01, 0x3f, 0x58, 0xe2, 0x89, 0xa9, 0x0d, 0x38, 0x34, 0x1b, + 0xab, 0x33, 0xff, 0xb0, 0xbb, 0x48, 0x0c, 0x5f, 0xb9, 0xb1, 0xcd, 0x2e, + 0xc5, 0xf3, 0xdb, 0x47, 0xe5, 0xa5, 0x9c, 0x77, 0x0a, 0xa6, 0x20, 0x68, + 0xfe, 0x7f, 0xc1, 0xad, +}; + +static void RC2_set_key(RC2_KEY *key, int len, const uint8_t *data, int bits) { + int i, j; + uint8_t *k; + uint16_t *ki; + unsigned int c, d; + + k = (uint8_t *)&key->data[0]; + *k = 0; // for if there is a zero length key + + if (len > 128) { + len = 128; + } + if (bits <= 0) { + bits = 1024; + } + if (bits > 1024) { + bits = 1024; + } + + for (i = 0; i < len; i++) { + k[i] = data[i]; + } + + // expand table + d = k[len - 1]; + j = 0; + for (i = len; i < 128; i++, j++) { + d = key_table[(k[j] + d) & 0xff]; + k[i] = d; + } + + // hmm.... 
key reduction to 'bits' bits + + j = (bits + 7) >> 3; + i = 128 - j; + c = (0xff >> (-bits & 0x07)); + + d = key_table[k[i] & c]; + k[i] = d; + while (i--) { + d = key_table[k[i + j] ^ d]; + k[i] = d; + } + + // copy from bytes into uint16_t's + ki = &(key->data[63]); + for (i = 127; i >= 0; i -= 2) { + *(ki--) = ((k[i] << 8) | k[i - 1]) & 0xffff; + } +} + +typedef struct { + int key_bits; // effective key bits + RC2_KEY ks; // key schedule +} EVP_RC2_KEY; + +static int rc2_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key, + const uint8_t *iv, int enc) { + EVP_RC2_KEY *rc2_key = (EVP_RC2_KEY *)ctx->cipher_data; + RC2_set_key(&rc2_key->ks, EVP_CIPHER_CTX_key_length(ctx), key, + rc2_key->key_bits); + return 1; +} + +static int rc2_cbc_cipher_update(EVP_CIPHER_CTX *ctx, uint8_t *out, + const uint8_t *in, size_t len) { + EVP_RC2_KEY *key = (EVP_RC2_KEY *)ctx->cipher_data; + static const size_t kChunkSize = 0x10000; + + while (len >= kChunkSize) { + RC2_cbc_encrypt(in, out, kChunkSize, &key->ks, ctx->iv, ctx->encrypt); + len -= kChunkSize; + in += kChunkSize; + out += kChunkSize; + } + if (len) { + RC2_cbc_encrypt(in, out, len, &key->ks, ctx->iv, ctx->encrypt); + } + return 1; +} + +static int rc2_ctrl(EVP_CIPHER_CTX *ctx, int type, int arg, void *ptr) { + EVP_RC2_KEY *key = (EVP_RC2_KEY *)ctx->cipher_data; + + switch (type) { + case EVP_CTRL_INIT: + key->key_bits = EVP_CIPHER_CTX_key_length(ctx) * 8; + return 1; + case EVP_CTRL_SET_RC2_KEY_BITS: + // Should be overridden by later call to |EVP_CTRL_INIT|, but + // people call it, so it may as well work. 
+ key->key_bits = arg; + return 1; + + default: + return -1; + } +} + +static const EVP_CIPHER rc2_40_cbc = { + /*nid=*/NID_rc2_40_cbc, + /*block_size=*/8, + /*key_len=*/5 /* 40 bit */, + /*iv_len=*/8, + /*ctx_size=*/sizeof(EVP_RC2_KEY), + /*flags=*/EVP_CIPH_CBC_MODE | EVP_CIPH_VARIABLE_LENGTH | EVP_CIPH_CTRL_INIT, + /*init=*/rc2_init_key, + /*cipher_update=*/rc2_cbc_cipher_update, + /*cipher_final=*/nullptr, + /*update_aad=*/nullptr, + /*cleanup=*/nullptr, + /*ctrl=*/rc2_ctrl, +}; + +const EVP_CIPHER *EVP_rc2_40_cbc() { return &rc2_40_cbc; } + +static const EVP_CIPHER rc2_cbc = { + /*nid=*/NID_rc2_cbc, + /*block_size=*/8, + /*key_len=*/16 /* 128 bit */, + /*iv_len=*/8, + /*ctx_size=*/sizeof(EVP_RC2_KEY), + /*flags=*/EVP_CIPH_CBC_MODE | EVP_CIPH_VARIABLE_LENGTH | EVP_CIPH_CTRL_INIT, + /*init=*/rc2_init_key, + /*cipher_update=*/rc2_cbc_cipher_update, + /*cipher_final=*/nullptr, + /*update_aad=*/nullptr, + /*cleanup=*/nullptr, + /*ctrl=*/rc2_ctrl, +}; + +const EVP_CIPHER *EVP_rc2_cbc() { return &rc2_cbc; } diff --git a/third_party/boringssl/src/crypto/cipher/e_rc4.cc b/third_party/boringssl/src/crypto/cipher/e_rc4.cc new file mode 100644 index 00000000..7e1e92e7 --- /dev/null +++ b/third_party/boringssl/src/crypto/cipher/e_rc4.cc @@ -0,0 +1,56 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include + +#include +#include +#include + +#include "../fipsmodule/cipher/internal.h" + + +static int rc4_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key, + const uint8_t *iv, int enc) { + RC4_KEY *rc4key = (RC4_KEY *)ctx->cipher_data; + + RC4_set_key(rc4key, EVP_CIPHER_CTX_key_length(ctx), key); + return 1; +} + +static int rc4_cipher_update(EVP_CIPHER_CTX *ctx, uint8_t *out, + const uint8_t *in, size_t len) { + RC4_KEY *rc4key = (RC4_KEY *)ctx->cipher_data; + + RC4(rc4key, len, in, out); + return 1; +} + +static const EVP_CIPHER rc4 = { + /*nid=*/NID_rc4, + /*block_size=*/1, + /*key_len=*/16, + /*iv_len=*/0, + /*ctx_size=*/sizeof(RC4_KEY), + /*flags=*/EVP_CIPH_VARIABLE_LENGTH, + /*init=*/rc4_init_key, + /*cipher_update=*/rc4_cipher_update, + /*cipher_final=*/nullptr, + /*update_aad=*/nullptr, + /*cleanup=*/nullptr, + /*ctrl=*/nullptr, +}; + +const EVP_CIPHER *EVP_rc4() { return &rc4; } diff --git a/third_party/boringssl/src/crypto/cipher/e_tls.cc b/third_party/boringssl/src/crypto/cipher/e_tls.cc new file mode 100644 index 00000000..f38a41e1 --- /dev/null +++ b/third_party/boringssl/src/crypto/cipher/e_tls.cc @@ -0,0 +1,612 @@ +// Copyright 2014 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../fipsmodule/cipher/internal.h" +#include "../internal.h" +#include "../mem_internal.h" +#include "internal.h" + + +using namespace bssl; + +typedef struct { + EVP_CIPHER_CTX cipher_ctx; + HMAC_CTX *hmac_ctx; + // mac_key is the portion of the key used for the MAC. It is retained + // separately for the constant-time CBC code. + uint8_t mac_key[EVP_MAX_MD_SIZE]; + uint8_t mac_key_len; + // implicit_iv is one iff this is a pre-TLS-1.1 CBC cipher without an explicit + // IV. + char implicit_iv; +} AEAD_TLS_CTX; + +static_assert(EVP_MAX_MD_SIZE < 256, "mac_key_len does not fit in uint8_t"); + +static_assert(sizeof(((EVP_AEAD_CTX *)nullptr)->state) >= sizeof(AEAD_TLS_CTX), + "AEAD state is too small"); +static_assert(alignof(union evp_aead_ctx_st_state) >= alignof(AEAD_TLS_CTX), + "AEAD state has insufficient alignment"); + +static void aead_tls_cleanup(EVP_AEAD_CTX *ctx) { + AEAD_TLS_CTX *tls_ctx = (AEAD_TLS_CTX *)&ctx->state; + EVP_CIPHER_CTX_cleanup(&tls_ctx->cipher_ctx); + HMAC_CTX_free(tls_ctx->hmac_ctx); +} + +static int aead_tls_init(EVP_AEAD_CTX *ctx, const uint8_t *key, size_t key_len, + size_t tag_len, enum evp_aead_direction_t dir, + const EVP_CIPHER *cipher, const EVP_MD *md, + char implicit_iv) { + if (tag_len != EVP_AEAD_DEFAULT_TAG_LENGTH && tag_len != EVP_MD_size(md)) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_TAG_SIZE); + return 0; + } + + if (key_len != EVP_AEAD_key_length(ctx->aead)) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_KEY_LENGTH); + return 0; + } + + size_t mac_key_len = EVP_MD_size(md); + size_t enc_key_len = EVP_CIPHER_key_length(cipher); + assert(mac_key_len + enc_key_len + + (implicit_iv ? 
EVP_CIPHER_iv_length(cipher) : 0) == + key_len); + + AEAD_TLS_CTX *tls_ctx = (AEAD_TLS_CTX *)&ctx->state; + tls_ctx->hmac_ctx = HMAC_CTX_new(); + if (!tls_ctx->hmac_ctx) { + return 0; + } + EVP_CIPHER_CTX_init(&tls_ctx->cipher_ctx); + assert(mac_key_len <= EVP_MAX_MD_SIZE); + OPENSSL_memcpy(tls_ctx->mac_key, key, mac_key_len); + tls_ctx->mac_key_len = (uint8_t)mac_key_len; + tls_ctx->implicit_iv = implicit_iv; + + if (!EVP_CipherInit_ex( + &tls_ctx->cipher_ctx, cipher, nullptr, &key[mac_key_len], + implicit_iv ? &key[mac_key_len + enc_key_len] : nullptr, + dir == evp_aead_seal) || + !HMAC_Init_ex(tls_ctx->hmac_ctx, key, mac_key_len, md, nullptr)) { + aead_tls_cleanup(ctx); + return 0; + } + EVP_CIPHER_CTX_set_padding(&tls_ctx->cipher_ctx, 0); + + return 1; +} + +static size_t aead_tls_tag_len(const EVP_AEAD_CTX *ctx, const size_t in_len) { + const AEAD_TLS_CTX *tls_ctx = (AEAD_TLS_CTX *)&ctx->state; + assert(EVP_CIPHER_CTX_mode(&tls_ctx->cipher_ctx) == EVP_CIPH_CBC_MODE); + + const size_t hmac_len = HMAC_size(tls_ctx->hmac_ctx); + const size_t block_size = EVP_CIPHER_CTX_block_size(&tls_ctx->cipher_ctx); + // An overflow of |in_len + hmac_len| doesn't affect the result mod + // |block_size|, provided that |block_size| is a smaller power of two. + assert(block_size == 8 /*3DES*/ || block_size == 16 /*AES*/); + const size_t pad_len = block_size - ((in_len + hmac_len) & (block_size - 1)); + return hmac_len + pad_len; +} + +static int aead_tls_sealv(const EVP_AEAD_CTX *ctx, + Span iovecs, + Span out_tag, size_t *out_tag_len, + Span nonce, + Span aadvecs) { + AEAD_TLS_CTX *tls_ctx = (AEAD_TLS_CTX *)&ctx->state; + + if (!tls_ctx->cipher_ctx.encrypt) { + // Unlike a normal AEAD, a TLS AEAD may only be used in one direction. 
+ OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_OPERATION); + return 0; + } + + size_t in_len = bssl::iovec::TotalLength(iovecs); + if (out_tag.size() < aead_tls_tag_len(ctx, in_len)) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BUFFER_TOO_SMALL); + return 0; + } + + if (nonce.size() != EVP_AEAD_nonce_length(ctx->aead)) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_NONCE_SIZE); + return 0; + } + + size_t ad_len = bssl::iovec::TotalLength(aadvecs); + if (ad_len != 13 - 2 /* length bytes */) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_AD_SIZE); + return 0; + } + + // To allow for CBC mode which changes cipher length, |ad| doesn't include the + // length for legacy ciphers. + uint8_t ad_extra[2]; + CRYPTO_store_u16_be(ad_extra, static_cast(in_len)); + + // Compute the MAC. This must be first in case the operation is being done + // in-place. + uint8_t mac[EVP_MAX_MD_SIZE]; + if (!HMAC_Init_ex(tls_ctx->hmac_ctx, nullptr, 0, nullptr, nullptr)) { + return 0; + } + for (const CRYPTO_IVEC &aadvec : aadvecs) { + if (!HMAC_Update(tls_ctx->hmac_ctx, aadvec.in, aadvec.len)) { + return 0; + } + } + if (!HMAC_Update(tls_ctx->hmac_ctx, ad_extra, sizeof(ad_extra))) { + return 0; + } + for (const CRYPTO_IOVEC &iovec : iovecs) { + if (!HMAC_Update(tls_ctx->hmac_ctx, iovec.in, iovec.len)) { + return 0; + } + } + unsigned mac_len; + if (!HMAC_Final(tls_ctx->hmac_ctx, mac, &mac_len)) { + return 0; + } + + // Configure the explicit IV. + assert(EVP_CIPHER_CTX_mode(&tls_ctx->cipher_ctx) == EVP_CIPH_CBC_MODE); + if (!tls_ctx->implicit_iv && + !EVP_EncryptInit_ex(&tls_ctx->cipher_ctx, nullptr, nullptr, nullptr, + nonce.data())) { + return 0; + } + + size_t block_size = EVP_CIPHER_CTX_block_size(&tls_ctx->cipher_ctx); + assert(block_size == 8 /*3DES*/ || block_size == 16 /*AES*/); + + // Encrypt the input. + size_t len = 0; + size_t tag_len = 0; + if (!bssl::iovec::ForEachBlockRange_Dynamic( + block_size, iovecs, + [&](const uint8_t *in, uint8_t *out, size_t chunk_len) { + // Complete block(s). 
+ size_t out_len; + if (!EVP_EncryptUpdate_ex(&tls_ctx->cipher_ctx, out, &out_len, + chunk_len, in, chunk_len)) { + return false; + } + assert(out_len == chunk_len); + len += out_len; + return true; + }, + [&](const uint8_t *in, uint8_t *out, size_t chunk_len) { + // Final chunk, possibly with a partial block. + size_t out_len; + if (!EVP_EncryptUpdate_ex(&tls_ctx->cipher_ctx, out, &out_len, + chunk_len, in, chunk_len)) { + return false; + } + len += out_len; + size_t remaining = chunk_len - out_len; + assert(remaining < block_size); + if (remaining == 0) { + return true; + } + + // Feed the MAC into the cipher in two steps. First complete the + // final partial block from encrypting the input and split the + // result between |out| and |out_tag|. Then feed the rest. + const size_t early_mac_len = block_size - remaining; + assert(early_mac_len < block_size); + assert(len + block_size - early_mac_len == in_len); + uint8_t buf[EVP_MAX_BLOCK_LENGTH]; + size_t buf_len; + if (!EVP_EncryptUpdate_ex(&tls_ctx->cipher_ctx, buf, &buf_len, + sizeof(buf), mac, early_mac_len)) { + return false; + } + assert(buf_len == block_size); + OPENSSL_memcpy(out + out_len, buf, remaining); + OPENSSL_memcpy(out_tag.data(), buf + remaining, early_mac_len); + tag_len = early_mac_len; + return true; + })) { + return 0; + } + + if (!EVP_EncryptUpdate_ex(&tls_ctx->cipher_ctx, out_tag.data() + tag_len, + &len, out_tag.size() - tag_len, mac + tag_len, + mac_len - tag_len)) { + return 0; + } + tag_len += len; + + // Compute padding and feed that into the cipher. 
+ uint8_t padding[256]; + unsigned padding_len = block_size - ((in_len + mac_len) & (block_size - 1)); + OPENSSL_memset(padding, padding_len - 1, padding_len); + if (!EVP_EncryptUpdate_ex(&tls_ctx->cipher_ctx, out_tag.data() + tag_len, + &len, out_tag.size() - tag_len, padding, + padding_len)) { + return 0; + } + tag_len += len; + + if (!EVP_EncryptFinal_ex2(&tls_ctx->cipher_ctx, out_tag.data() + tag_len, + &len, out_tag.size() - tag_len)) { + return 0; + } + assert(len == 0); // Padding is explicit. + assert(tag_len == aead_tls_tag_len(ctx, in_len)); + + *out_tag_len = tag_len; + return 1; +} + +static int aead_tls_openv(const EVP_AEAD_CTX *ctx, + Span iovecs, + size_t *out_total_bytes, Span nonce, + Span aadvecs) { + AEAD_TLS_CTX *tls_ctx = (AEAD_TLS_CTX *)&ctx->state; + + if (tls_ctx->cipher_ctx.encrypt) { + // Unlike a normal AEAD, a TLS AEAD may only be used in one direction. + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_OPERATION); + return 0; + } + + size_t in_len = bssl::iovec::TotalLength(iovecs); + if (in_len < HMAC_size(tls_ctx->hmac_ctx)) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); + return 0; + } + + if (nonce.size() != EVP_AEAD_nonce_length(ctx->aead)) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_NONCE_SIZE); + return 0; + } + + size_t ad_len = bssl::iovec::TotalLength(aadvecs); + if (ad_len != 13 - 2 /* length bytes */) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_AD_SIZE); + return 0; + } + + // Configure the explicit IV. + assert(EVP_CIPHER_CTX_mode(&tls_ctx->cipher_ctx) == EVP_CIPH_CBC_MODE); + if (!tls_ctx->implicit_iv && + !EVP_DecryptInit_ex(&tls_ctx->cipher_ctx, nullptr, nullptr, nullptr, + nonce.data())) { + return 0; + } + + // Decrypt to get the plaintext + MAC + padding. 
+ size_t total = 0; + size_t block_size = EVP_CIPHER_CTX_block_size(&tls_ctx->cipher_ctx); + auto decrypt_update = [&](const uint8_t *in, uint8_t *out, size_t len) { + size_t out_len; + if (!EVP_DecryptUpdate_ex(&tls_ctx->cipher_ctx, out, &out_len, len, in, + len)) { + return false; + } + CONSTTIME_SECRET(out, out_len); + if (out_len != len) { + // A byte sequence that was not a multiple of the block size was provided + // as ciphertext. This is generally invalid and thus should be rejected. + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); + return false; + } + total += len; + return true; + }; + if (!bssl::iovec::ForEachBlockRange_Dynamic( + block_size, iovecs, decrypt_update, decrypt_update)) { + return false; + } + assert(total == in_len); + + const size_t mac_len = HMAC_size(tls_ctx->hmac_ctx); + + // Split the decrypted record into |iovecs_without_trailer| and |trailer|, + // based on the public lower bound of where the plaintext ends. The plaintext + // is followed by |mac_len| and then at most 256 bytes of padding. + InplaceVector iovecs_without_trailer; + iovecs_without_trailer.CopyFrom(iovecs); + uint8_t trailer_buf[EVP_MAX_MD_SIZE + 256]; + const size_t trailer_len = std::min(in_len, mac_len + 256); + std::optional> trailer = bssl::iovec::GetAndRemoveOutSuffix( + Span(trailer_buf).first(trailer_len), Span(iovecs_without_trailer)); + BSSL_CHECK(trailer.has_value()); + + // Remove CBC padding. Code from here on is timing-sensitive with respect to + // |padding_ok|, |trailer_minus_padding|, and derived values. + crypto_word_t padding_ok; + size_t trailer_minus_padding; + if (!EVP_tls_cbc_remove_padding(&padding_ok, &trailer_minus_padding, + trailer->data(), trailer->size(), block_size, + mac_len)) { + // Publicly invalid. This can be rejected in non-constant time. + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); + return 0; + } + + // If the padding is valid, |trailer->first(trailer_minus_padding)| is the + // last bytes of plaintext and the MAC. 
Otherwise, it is still large enough to + // extract a MAC, but it will be irrelevant. Note that |trailer_minus_padding| + // is secret. + declassify_assert(trailer_minus_padding >= mac_len); + size_t data_in_trailer_len = trailer_minus_padding - mac_len; + size_t max_data_in_trailer_len = trailer->size() - mac_len; + size_t data_len = total - trailer->size() + data_in_trailer_len; + + // To allow for CBC mode which changes cipher length, |ad_len| doesn't + // include the length for legacy ciphers. + uint8_t ad_extra[2]; + CRYPTO_store_u16_be(ad_extra, static_cast(data_len)); + + // Compute the MAC and extract the one in the record. + uint8_t mac[EVP_MAX_MD_SIZE]; + size_t got_mac_len; + assert(EVP_tls_cbc_record_digest_supported(tls_ctx->hmac_ctx->md)); + if (!EVP_tls_cbc_digest_record( + tls_ctx->hmac_ctx->md, mac, &got_mac_len, ad_extra, aadvecs, + iovecs_without_trailer, trailer->first(max_data_in_trailer_len), + data_in_trailer_len, tls_ctx->mac_key, tls_ctx->mac_key_len)) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); + return 0; + } + assert(got_mac_len == mac_len); + + uint8_t record_mac[EVP_MAX_MD_SIZE]; + EVP_tls_cbc_copy_mac(record_mac, mac_len, trailer->data(), + trailer_minus_padding, trailer->size()); + + // Perform the MAC check and the padding check in constant-time. It should be + // safe to simply perform the padding check first, but it would not be under a + // different choice of MAC location on padding failure. See + // EVP_tls_cbc_remove_padding. The value barrier seems to be necessary to + // prevent a branch in Clang. + crypto_word_t good = value_barrier_w( + constant_time_eq_int(CRYPTO_memcmp(record_mac, mac, mac_len), 0)); + good &= padding_ok; + if (!constant_time_declassify_w(good)) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); + return 0; + } + + // End of timing-sensitive code. 
+ CONSTTIME_DECLASSIFY(&data_len, sizeof(data_len)); + for (const CRYPTO_IOVEC &iovec : iovecs) { + CONSTTIME_DECLASSIFY(iovec.out, iovec.len); + } + + *out_total_bytes = data_len; + return 1; +} + +static int aead_aes_128_cbc_sha1_tls_init(EVP_AEAD_CTX *ctx, const uint8_t *key, + size_t key_len, size_t tag_len, + enum evp_aead_direction_t dir) { + return aead_tls_init(ctx, key, key_len, tag_len, dir, EVP_aes_128_cbc(), + EVP_sha1(), 0); +} + +static int aead_aes_128_cbc_sha1_tls_implicit_iv_init( + EVP_AEAD_CTX *ctx, const uint8_t *key, size_t key_len, size_t tag_len, + enum evp_aead_direction_t dir) { + return aead_tls_init(ctx, key, key_len, tag_len, dir, EVP_aes_128_cbc(), + EVP_sha1(), 1); +} + +static int aead_aes_128_cbc_sha256_tls_init(EVP_AEAD_CTX *ctx, + const uint8_t *key, size_t key_len, + size_t tag_len, + enum evp_aead_direction_t dir) { + return aead_tls_init(ctx, key, key_len, tag_len, dir, EVP_aes_128_cbc(), + EVP_sha256(), 0); +} + +static int aead_aes_256_cbc_sha1_tls_init(EVP_AEAD_CTX *ctx, const uint8_t *key, + size_t key_len, size_t tag_len, + enum evp_aead_direction_t dir) { + return aead_tls_init(ctx, key, key_len, tag_len, dir, EVP_aes_256_cbc(), + EVP_sha1(), 0); +} + +static int aead_aes_256_cbc_sha1_tls_implicit_iv_init( + EVP_AEAD_CTX *ctx, const uint8_t *key, size_t key_len, size_t tag_len, + enum evp_aead_direction_t dir) { + return aead_tls_init(ctx, key, key_len, tag_len, dir, EVP_aes_256_cbc(), + EVP_sha1(), 1); +} + +static int aead_des_ede3_cbc_sha1_tls_init(EVP_AEAD_CTX *ctx, + const uint8_t *key, size_t key_len, + size_t tag_len, + enum evp_aead_direction_t dir) { + return aead_tls_init(ctx, key, key_len, tag_len, dir, EVP_des_ede3_cbc(), + EVP_sha1(), 0); +} + +static int aead_des_ede3_cbc_sha1_tls_implicit_iv_init( + EVP_AEAD_CTX *ctx, const uint8_t *key, size_t key_len, size_t tag_len, + enum evp_aead_direction_t dir) { + return aead_tls_init(ctx, key, key_len, tag_len, dir, EVP_des_ede3_cbc(), + EVP_sha1(), 1); +} + +static 
int aead_tls_get_iv(const EVP_AEAD_CTX *ctx, const uint8_t **out_iv, + size_t *out_iv_len) { + const AEAD_TLS_CTX *tls_ctx = (AEAD_TLS_CTX *)&ctx->state; + const size_t iv_len = EVP_CIPHER_CTX_iv_length(&tls_ctx->cipher_ctx); + if (iv_len <= 1) { + OPENSSL_PUT_ERROR(CIPHER, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); + return 0; + } + + *out_iv = tls_ctx->cipher_ctx.iv; + *out_iv_len = iv_len; + return 1; +} + +static const EVP_AEAD aead_aes_128_cbc_sha1_tls = { + SHA_DIGEST_LENGTH + 16, // key len (SHA1 + AES128) + 16, // nonce len (IV) + 16 + SHA_DIGEST_LENGTH, // overhead (padding + SHA1) + SHA_DIGEST_LENGTH, // max tag length + + nullptr, // init + aead_aes_128_cbc_sha1_tls_init, + aead_tls_cleanup, + aead_tls_openv, + aead_tls_sealv, + nullptr, // openv_detached + nullptr, // get_iv + aead_tls_tag_len, +}; + +static const EVP_AEAD aead_aes_128_cbc_sha1_tls_implicit_iv = { + SHA_DIGEST_LENGTH + 16 + 16, // key len (SHA1 + AES128 + IV) + 0, // nonce len + 16 + SHA_DIGEST_LENGTH, // overhead (padding + SHA1) + SHA_DIGEST_LENGTH, // max tag length + + nullptr, // init + aead_aes_128_cbc_sha1_tls_implicit_iv_init, + aead_tls_cleanup, + aead_tls_openv, + aead_tls_sealv, + nullptr, // openv_detached + aead_tls_get_iv, // get_iv + aead_tls_tag_len, +}; + +static const EVP_AEAD aead_aes_128_cbc_sha256_tls = { + SHA256_DIGEST_LENGTH + 16, // key len (SHA256 + AES128) + 16, // nonce len (IV) + 16 + SHA256_DIGEST_LENGTH, // overhead (padding + SHA256) + SHA256_DIGEST_LENGTH, // max tag length + + nullptr, // init + aead_aes_128_cbc_sha256_tls_init, + aead_tls_cleanup, + aead_tls_openv, + aead_tls_sealv, + nullptr, // openv_detached + nullptr, // get_iv + aead_tls_tag_len, +}; + +static const EVP_AEAD aead_aes_256_cbc_sha1_tls = { + SHA_DIGEST_LENGTH + 32, // key len (SHA1 + AES256) + 16, // nonce len (IV) + 16 + SHA_DIGEST_LENGTH, // overhead (padding + SHA1) + SHA_DIGEST_LENGTH, // max tag length + + nullptr, // init + aead_aes_256_cbc_sha1_tls_init, + aead_tls_cleanup, + 
aead_tls_openv, + aead_tls_sealv, + nullptr, // openv_detached + nullptr, // get_iv + aead_tls_tag_len, +}; + +static const EVP_AEAD aead_aes_256_cbc_sha1_tls_implicit_iv = { + SHA_DIGEST_LENGTH + 32 + 16, // key len (SHA1 + AES256 + IV) + 0, // nonce len + 16 + SHA_DIGEST_LENGTH, // overhead (padding + SHA1) + SHA_DIGEST_LENGTH, // max tag length + + nullptr, // init + aead_aes_256_cbc_sha1_tls_implicit_iv_init, + aead_tls_cleanup, + aead_tls_openv, + aead_tls_sealv, + nullptr, // openv_detached + aead_tls_get_iv, // get_iv + aead_tls_tag_len, +}; + +static const EVP_AEAD aead_des_ede3_cbc_sha1_tls = { + SHA_DIGEST_LENGTH + 24, // key len (SHA1 + 3DES) + 8, // nonce len (IV) + 8 + SHA_DIGEST_LENGTH, // overhead (padding + SHA1) + SHA_DIGEST_LENGTH, // max tag length + + nullptr, // init + aead_des_ede3_cbc_sha1_tls_init, + aead_tls_cleanup, + aead_tls_openv, + aead_tls_sealv, + nullptr, // openv_detached + nullptr, // get_iv + aead_tls_tag_len, +}; + +static const EVP_AEAD aead_des_ede3_cbc_sha1_tls_implicit_iv = { + SHA_DIGEST_LENGTH + 24 + 8, // key len (SHA1 + 3DES + IV) + 0, // nonce len + 8 + SHA_DIGEST_LENGTH, // overhead (padding + SHA1) + SHA_DIGEST_LENGTH, // max tag length + + nullptr, // init + aead_des_ede3_cbc_sha1_tls_implicit_iv_init, + aead_tls_cleanup, + aead_tls_openv, + aead_tls_sealv, + nullptr, // openv_detached + aead_tls_get_iv, // get_iv + aead_tls_tag_len, +}; + +const EVP_AEAD *EVP_aead_aes_128_cbc_sha1_tls() { + return &aead_aes_128_cbc_sha1_tls; +} + +const EVP_AEAD *EVP_aead_aes_128_cbc_sha1_tls_implicit_iv() { + return &aead_aes_128_cbc_sha1_tls_implicit_iv; +} + +const EVP_AEAD *EVP_aead_aes_128_cbc_sha256_tls() { + return &aead_aes_128_cbc_sha256_tls; +} + +const EVP_AEAD *EVP_aead_aes_256_cbc_sha1_tls() { + return &aead_aes_256_cbc_sha1_tls; +} + +const EVP_AEAD *EVP_aead_aes_256_cbc_sha1_tls_implicit_iv() { + return &aead_aes_256_cbc_sha1_tls_implicit_iv; +} + +const EVP_AEAD *EVP_aead_des_ede3_cbc_sha1_tls() { + return 
&aead_des_ede3_cbc_sha1_tls; +} + +const EVP_AEAD *EVP_aead_des_ede3_cbc_sha1_tls_implicit_iv() { + return &aead_des_ede3_cbc_sha1_tls_implicit_iv; +} diff --git a/third_party/boringssl/src/crypto/cipher/get_cipher.cc b/third_party/boringssl/src/crypto/cipher/get_cipher.cc new file mode 100644 index 00000000..dabc54aa --- /dev/null +++ b/third_party/boringssl/src/crypto/cipher/get_cipher.cc @@ -0,0 +1,85 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include + +#include +#include +#include + +#include "internal.h" +#include "../internal.h" + + +static const struct { + int nid; + const char *name; + const EVP_CIPHER *(*func)(); +} kCiphers[] = { + {NID_aes_128_cbc, "aes-128-cbc", EVP_aes_128_cbc}, + {NID_aes_128_ctr, "aes-128-ctr", EVP_aes_128_ctr}, + {NID_aes_128_ecb, "aes-128-ecb", EVP_aes_128_ecb}, + {NID_aes_128_gcm, "aes-128-gcm", EVP_aes_128_gcm}, + {NID_aes_128_ofb128, "aes-128-ofb", EVP_aes_128_ofb}, + {NID_aes_192_cbc, "aes-192-cbc", EVP_aes_192_cbc}, + {NID_aes_192_ctr, "aes-192-ctr", EVP_aes_192_ctr}, + {NID_aes_192_ecb, "aes-192-ecb", EVP_aes_192_ecb}, + {NID_aes_192_gcm, "aes-192-gcm", EVP_aes_192_gcm}, + {NID_aes_192_ofb128, "aes-192-ofb", EVP_aes_192_ofb}, + {NID_aes_256_cbc, "aes-256-cbc", EVP_aes_256_cbc}, + {NID_aes_256_ctr, "aes-256-ctr", EVP_aes_256_ctr}, + {NID_aes_256_ecb, "aes-256-ecb", EVP_aes_256_ecb}, + {NID_aes_256_gcm, "aes-256-gcm", EVP_aes_256_gcm}, + {NID_aes_256_ofb128, "aes-256-ofb", EVP_aes_256_ofb}, + {NID_des_cbc, "des-cbc", EVP_des_cbc}, + {NID_des_ecb, "des-ecb", EVP_des_ecb}, + {NID_des_ede_cbc, "des-ede-cbc", EVP_des_ede_cbc}, + {NID_des_ede_ecb, "des-ede", EVP_des_ede}, + {NID_des_ede3_cbc, "des-ede3-cbc", EVP_des_ede3_cbc}, + {NID_rc2_cbc, "rc2-cbc", EVP_rc2_cbc}, + {NID_rc4, "rc4", EVP_rc4}, +}; + +const EVP_CIPHER *EVP_get_cipherbynid(int nid) { + for (const auto &cipher : kCiphers) { + if (cipher.nid == nid) { + return cipher.func(); + } + } + return nullptr; +} + +const EVP_CIPHER *EVP_get_cipherbyname(const char *name) { + if (name == nullptr) { + return nullptr; + } + + // This is not a name used by OpenSSL, but tcpdump registers it with + // |EVP_add_cipher_alias|. Our |EVP_add_cipher_alias| is a no-op, so we + // support the name here. 
+ if (OPENSSL_strcasecmp(name, "3des") == 0) { + name = "des-ede3-cbc"; + } + + for (const auto &cipher : kCiphers) { + if (OPENSSL_strcasecmp(cipher.name, name) == 0) { + return cipher.func(); + } + } + + return nullptr; +} diff --git a/third_party/boringssl/src/crypto/cipher/internal.h b/third_party/boringssl/src/crypto/cipher/internal.h new file mode 100644 index 00000000..e6ce207e --- /dev/null +++ b/third_party/boringssl/src/crypto/cipher/internal.h @@ -0,0 +1,240 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_CRYPTO_CIPHER_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_CIPHER_INTERNAL_H + +#include +#include + +#include +#include +#include + +#include "../internal.h" + + +BSSL_NAMESPACE_BEGIN + +// EVP_tls_cbc_get_padding determines the padding from the decrypted, TLS, CBC +// record in |in|. This decrypted record should not include any "decrypted" +// explicit IV. If the record is publicly invalid, it returns zero. Otherwise, +// it returns one and sets |*out_padding_ok| to all ones (0xfff..f) if the +// padding is valid and zero otherwise. It then sets |*out_len| to the length +// with the padding removed or |in_len| if invalid. +// +// If the function returns one, it runs in time independent of the contents of +// |in|. 
It is also guaranteed that, independent of |*out_padding_ok|, |mac_len| +// <= |*out_len| <= |in_len|, satisfying |EVP_tls_cbc_copy_mac|'s precondition. +int EVP_tls_cbc_remove_padding(crypto_word_t *out_padding_ok, size_t *out_len, + const uint8_t *in, size_t in_len, + size_t block_size, size_t mac_size); + +// EVP_tls_cbc_copy_mac copies |md_size| bytes from the end of the first +// |in_len| bytes of |in| to |out| in constant time (independent of the concrete +// value of |in_len|, which may vary within a 256-byte window). |in| must point +// to a buffer of |orig_len| bytes. +// +// On entry: +// orig_len >= in_len >= md_size +// md_size <= EVP_MAX_MD_SIZE +void EVP_tls_cbc_copy_mac(uint8_t *out, size_t md_size, const uint8_t *in, + size_t in_len, size_t orig_len); + +// EVP_tls_cbc_record_digest_supported returns 1 iff |md| is a hash function +// which EVP_tls_cbc_digest_record supports. +int EVP_tls_cbc_record_digest_supported(const EVP_MD *md); + +// EVP_sha1_final_with_secret_suffix computes the result of hashing |len| bytes +// from |in| to |ctx| and writes the resulting hash to |out|. |len| is treated +// as secret and must be at most |max_len|, which is treated as public. |in| +// must point to a buffer of at least |max_len| bytes. It returns one on success +// and zero if inputs are too long. +// +// This function is exported for unit tests. +OPENSSL_EXPORT int EVP_sha1_final_with_secret_suffix( + SHA_CTX *ctx, uint8_t out[SHA_DIGEST_LENGTH], const uint8_t *in, size_t len, + size_t max_len); + +// EVP_sha256_final_with_secret_suffix acts like +// |EVP_sha1_final_with_secret_suffix|, but for SHA-256. +// +// This function is exported for unit tests. +OPENSSL_EXPORT int EVP_sha256_final_with_secret_suffix( + SHA256_CTX *ctx, uint8_t out[SHA256_DIGEST_LENGTH], const uint8_t *in, + size_t len, size_t max_len); + +// EVP_tls_cbc_digest_record computes the MAC of a decrypted, padded TLS +// record. +// +// md: the hash function used in the HMAC. 
+// EVP_tls_cbc_record_digest_supported must return true for this hash. +// md_out: the digest output. At most EVP_MAX_MD_SIZE bytes will be written. +// md_out_size: the number of output bytes is written here. +// len_header: the two length bytes of the TLS record header. +// aadvecs: the 11-byte TLS record header as it was provided by the caller. +// iovecs_without_trailer: the section of the plaintext that does not include +// the trailer whose length is secret (typically the entire plaintext with +// an upper bound of padding and MAC size removed) +// trailer: a buffer, of public length, containing the remainder of the +// plaintext as a prefix. +// data_in_trailer_size: the secret, reported length of the data portion in +// |trailer| once the padding and MAC have been removed. +// +// On entry: by virtue of having been through one of the remove_padding +// functions, above, we know that data_plus_mac_size is large enough to contain +// a padding byte and MAC. (If the padding was invalid, it might contain the +// padding too. ) +int EVP_tls_cbc_digest_record( + const EVP_MD *md, uint8_t *md_out, size_t *md_out_size, + const uint8_t len_header[2], bssl::Span aadvecs, + bssl::Span iovecs_without_trailer, + bssl::Span trailer, size_t data_in_trailer_size, + const uint8_t *mac_secret, unsigned mac_secret_length); + +#define POLY1305_TAG_LEN 16 + +// For convenience (the x86_64 calling convention allows only six parameters in +// registers), the final parameter for the assembly functions is both an input +// and output parameter. 
+union chacha20_poly1305_open_data { + struct { + alignas(16) uint8_t key[32]; + uint32_t counter; + uint8_t nonce[12]; + } in; + struct { + uint8_t tag[POLY1305_TAG_LEN]; + } out; +}; + +union chacha20_poly1305_seal_data { + struct { + alignas(16) uint8_t key[32]; + uint32_t counter; + uint8_t nonce[12]; + const uint8_t *extra_ciphertext; + size_t extra_ciphertext_len; + } in; + struct { + uint8_t tag[POLY1305_TAG_LEN]; + } out; +}; + +#if (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) && \ + !defined(OPENSSL_NO_ASM) + +static_assert(sizeof(union chacha20_poly1305_open_data) == 48, + "wrong chacha20_poly1305_open_data size"); +static_assert(sizeof(union chacha20_poly1305_seal_data) == 48 + 8 + 8, + "wrong chacha20_poly1305_seal_data size"); + +inline int chacha20_poly1305_asm_capable() { +#if defined(OPENSSL_X86_64) + return CRYPTO_is_SSE4_1_capable(); +#elif defined(OPENSSL_AARCH64) + return CRYPTO_is_NEON_capable(); +#endif +} + +// chacha20_poly1305_open is defined in chacha20_poly1305_*.pl. It decrypts +// |plaintext_len| bytes from |ciphertext| and writes them to |out_plaintext|. +// Additional input parameters are passed in |aead_data->in|. On exit, it will +// write calculated tag value to |aead_data->out.tag|, which the caller must +// check. 
+#if defined(OPENSSL_X86_64) +extern "C" void chacha20_poly1305_open_sse41( + uint8_t *out_plaintext, const uint8_t *ciphertext, size_t plaintext_len, + const uint8_t *ad, size_t ad_len, union chacha20_poly1305_open_data *data); +extern "C" void chacha20_poly1305_open_avx2( + uint8_t *out_plaintext, const uint8_t *ciphertext, size_t plaintext_len, + const uint8_t *ad, size_t ad_len, union chacha20_poly1305_open_data *data); +inline void chacha20_poly1305_open(uint8_t *out_plaintext, + const uint8_t *ciphertext, + size_t plaintext_len, const uint8_t *ad, + size_t ad_len, + union chacha20_poly1305_open_data *data) { + if (CRYPTO_is_AVX2_capable() && CRYPTO_is_BMI2_capable()) { + chacha20_poly1305_open_avx2(out_plaintext, ciphertext, plaintext_len, ad, + ad_len, data); + } else { + chacha20_poly1305_open_sse41(out_plaintext, ciphertext, plaintext_len, ad, + ad_len, data); + } +} +#else +extern "C" void chacha20_poly1305_open(uint8_t *out_plaintext, + const uint8_t *ciphertext, + size_t plaintext_len, const uint8_t *ad, + size_t ad_len, + union chacha20_poly1305_open_data *data); +#endif + +// chacha20_poly1305_open is defined in chacha20_poly1305_*.pl. It encrypts +// |plaintext_len| bytes from |plaintext| and writes them to |out_ciphertext|. +// Additional input parameters are passed in |aead_data->in|. The calculated tag +// value is over the computed ciphertext concatenated with |extra_ciphertext| +// and written to |aead_data->out.tag|. 
+#if defined(OPENSSL_X86_64) +extern "C" void chacha20_poly1305_seal_sse41( + uint8_t *out_ciphertext, const uint8_t *plaintext, size_t plaintext_len, + const uint8_t *ad, size_t ad_len, union chacha20_poly1305_seal_data *data); +extern "C" void chacha20_poly1305_seal_avx2( + uint8_t *out_ciphertext, const uint8_t *plaintext, size_t plaintext_len, + const uint8_t *ad, size_t ad_len, union chacha20_poly1305_seal_data *data); +inline void chacha20_poly1305_seal(uint8_t *out_ciphertext, + const uint8_t *plaintext, + size_t plaintext_len, const uint8_t *ad, + size_t ad_len, + union chacha20_poly1305_seal_data *data) { + if (CRYPTO_is_AVX2_capable() && CRYPTO_is_BMI2_capable()) { + chacha20_poly1305_seal_avx2(out_ciphertext, plaintext, plaintext_len, ad, + ad_len, data); + } else { + chacha20_poly1305_seal_sse41(out_ciphertext, plaintext, plaintext_len, ad, + ad_len, data); + } +} +#else +extern "C" void chacha20_poly1305_seal(uint8_t *out_ciphertext, + const uint8_t *plaintext, + size_t plaintext_len, const uint8_t *ad, + size_t ad_len, + union chacha20_poly1305_seal_data *data); +#endif + +#else + +inline int chacha20_poly1305_asm_capable() { return 0; } + +inline void chacha20_poly1305_open(uint8_t *out_plaintext, + const uint8_t *ciphertext, + size_t plaintext_len, const uint8_t *ad, + size_t ad_len, + union chacha20_poly1305_open_data *data) { + abort(); +} + +inline void chacha20_poly1305_seal(uint8_t *out_ciphertext, + const uint8_t *plaintext, + size_t plaintext_len, const uint8_t *ad, + size_t ad_len, + union chacha20_poly1305_seal_data *data) { + abort(); +} +#endif + +BSSL_NAMESPACE_END + +#endif // OPENSSL_HEADER_CRYPTO_CIPHER_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/cipher/tls_cbc.cc b/third_party/boringssl/src/crypto/cipher/tls_cbc.cc new file mode 100644 index 00000000..f0c57647 --- /dev/null +++ b/third_party/boringssl/src/crypto/cipher/tls_cbc.cc @@ -0,0 +1,476 @@ +// Copyright 2012-2016 The OpenSSL Project Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include +#include +#include +#include + +#include "../fipsmodule/cipher/internal.h" +#include "../internal.h" +#include "internal.h" + + +using namespace bssl; + +int bssl::EVP_tls_cbc_remove_padding(crypto_word_t *out_padding_ok, + size_t *out_len, const uint8_t *in, + size_t in_len, size_t block_size, + size_t mac_size) { + const size_t overhead = 1 /* padding length byte */ + mac_size; + + // These lengths are all public so we can test them in non-constant time. + if (overhead > in_len) { + return 0; + } + + size_t padding_length = in[in_len - 1]; + + crypto_word_t good = constant_time_ge_w(in_len, overhead + padding_length); + // The padding consists of a length byte at the end of the record and + // then that many bytes of padding, all with the same value as the + // length byte. Thus, with the length byte included, there are i+1 + // bytes of padding. + // + // We can't check just |padding_length+1| bytes because that leaks + // decrypted information. Therefore we always have to check the maximum + // amount of padding possible. (Again, the length of the record is + // public information so we can use it.) + size_t to_check = 256; // maximum amount of padding, inc length byte. 
+ if (to_check > in_len) { + to_check = in_len; + } + + for (size_t i = 0; i < to_check; i++) { + uint8_t mask = constant_time_ge_8(padding_length, i); + // The value barrier on |(in_len - 1 - i)| isn't needed to enforce + // constant-time. It is just there to prevent a false positive in + // constant-time checks by valgrind. + uint8_t b = in[value_barrier_w(in_len - 1 - i)]; + // The final |padding_length+1| bytes should all have the value + // |padding_length|. Therefore the XOR should be zero. + good &= ~(mask & (padding_length ^ b)); + } + + // If any of the final |padding_length+1| bytes had the wrong value, + // one or more of the lower eight bits of |good| will be cleared. + good = constant_time_eq_w(0xff, good & 0xff); + + // Always treat |padding_length| as zero on error. If, assuming block size of + // 16, a padding of [<15 arbitrary bytes> 15] treated |padding_length| as 16 + // and returned -1, distinguishing good MAC and bad padding from bad MAC and + // bad padding would give POODLE's padding oracle. + padding_length = good & (padding_length + 1); + *out_len = in_len - padding_length; + *out_padding_ok = good; + return 1; +} + +void bssl::EVP_tls_cbc_copy_mac(uint8_t *out, size_t md_size, const uint8_t *in, + size_t in_len, size_t orig_len) { + uint8_t rotated_mac1[EVP_MAX_MD_SIZE], rotated_mac2[EVP_MAX_MD_SIZE]; + uint8_t *rotated_mac = rotated_mac1; + uint8_t *rotated_mac_tmp = rotated_mac2; + + // mac_end is the index of |in| just after the end of the MAC. + size_t mac_end = in_len; + size_t mac_start = mac_end - md_size; + + declassify_assert(orig_len >= in_len); + declassify_assert(in_len >= md_size); + assert(md_size <= EVP_MAX_MD_SIZE); + assert(md_size > 0); + + // scan_start contains the number of bytes that we can ignore because + // the MAC's position can only vary by 255 bytes. + size_t scan_start = 0; + // This information is public so it's safe to branch based on it. 
+ if (orig_len > md_size + 255 + 1) { + scan_start = orig_len - (md_size + 255 + 1); + } + + size_t rotate_offset = 0; + uint8_t mac_started = 0; + OPENSSL_memset(rotated_mac, 0, md_size); + for (size_t i = scan_start, j = 0; i < orig_len; i++, j++) { + if (j >= md_size) { + j -= md_size; + } + crypto_word_t is_mac_start = constant_time_eq_w(i, mac_start); + mac_started |= is_mac_start; + uint8_t mac_ended = constant_time_ge_8(i, mac_end); + rotated_mac[j] |= in[i] & mac_started & ~mac_ended; + // Save the offset that |mac_start| is mapped to. + rotate_offset |= j & is_mac_start; + } + + // Now rotate the MAC. We rotate in log(md_size) steps, one for each bit + // position. + for (size_t offset = 1; offset < md_size; offset <<= 1, rotate_offset >>= 1) { + // Rotate by |offset| iff the corresponding bit is set in + // |rotate_offset|, placing the result in |rotated_mac_tmp|. + const uint8_t skip_rotate = (rotate_offset & 1) - 1; + for (size_t i = 0, j = offset; i < md_size; i++, j++) { + if (j >= md_size) { + j -= md_size; + } + rotated_mac_tmp[i] = + constant_time_select_8(skip_rotate, rotated_mac[i], rotated_mac[j]); + } + + // Swap pointers so |rotated_mac| contains the (possibly) rotated value. + // Note the number of iterations and thus the identity of these pointers is + // public information. + uint8_t *tmp = rotated_mac; + rotated_mac = rotated_mac_tmp; + rotated_mac_tmp = tmp; + } + + OPENSSL_memcpy(out, rotated_mac, md_size); +} + +int bssl::EVP_sha1_final_with_secret_suffix(SHA_CTX *ctx, + uint8_t out[SHA_DIGEST_LENGTH], + const uint8_t *in, size_t len, + size_t max_len) { + // Bound the input length so |total_bits| below fits in four bytes. This is + // redundant with TLS record size limits. This also ensures |input_idx| below + // does not overflow. 
+ size_t max_len_bits = max_len << 3; + if (ctx->Nh != 0 || + (max_len_bits >> 3) != max_len || // Overflow + ctx->Nl + max_len_bits < max_len_bits || + ctx->Nl + max_len_bits > UINT32_MAX) { + return 0; + } + + // We need to hash the following into |ctx|: + // + // - ctx->data[:ctx->num] + // - in[:len] + // - A 0x80 byte + // - However many zero bytes are needed to pad up to a block. + // - Eight bytes of length. + size_t num_blocks = (ctx->num + len + 1 + 8 + SHA_CBLOCK - 1) >> 6; + size_t last_block = num_blocks - 1; + size_t max_blocks = (ctx->num + max_len + 1 + 8 + SHA_CBLOCK - 1) >> 6; + + // The bounds above imply |total_bits| fits in four bytes. + size_t total_bits = ctx->Nl + (len << 3); + uint8_t length_bytes[4]; + CRYPTO_store_u32_be(length_bytes, total_bits); + + // We now construct and process each expected block in constant-time. + uint8_t block[SHA_CBLOCK] = {0}; + uint32_t result[5] = {0}; + // input_idx is the index into |in| corresponding to the current block. + // However, we allow this index to overflow beyond |max_len|, to simplify the + // 0x80 byte. + size_t input_idx = 0; + for (size_t i = 0; i < max_blocks; i++) { + // Fill |block| with data from the partial block in |ctx| and |in|. We copy + // as if we were hashing up to |max_len| and then zero the excess later. + size_t block_start = 0; + if (i == 0) { + OPENSSL_memcpy(block, ctx->data, ctx->num); + block_start = ctx->num; + } + if (input_idx < max_len) { + size_t to_copy = SHA_CBLOCK - block_start; + if (to_copy > max_len - input_idx) { + to_copy = max_len - input_idx; + } + OPENSSL_memcpy(block + block_start, in + input_idx, to_copy); + } + + // Zero any bytes beyond |len| and add the 0x80 byte. + for (size_t j = block_start; j < SHA_CBLOCK; j++) { + // input[idx] corresponds to block[j]. + size_t idx = input_idx + j - block_start; + // The barriers on |len| are not strictly necessary. 
However, without + // them, GCC compiles this code by incorporating |len| into the loop + // counter and subtracting it out later. This is still constant-time, but + // it frustrates attempts to validate this. + uint8_t is_in_bounds = constant_time_lt_8(idx, value_barrier_w(len)); + uint8_t is_padding_byte = constant_time_eq_8(idx, value_barrier_w(len)); + block[j] &= is_in_bounds; + block[j] |= 0x80 & is_padding_byte; + } + + input_idx += SHA_CBLOCK - block_start; + + // Fill in the length if this is the last block. Use a value barrier to + // prevent Clang from compiling the conditional select as a jump. + crypto_word_t is_last_block = + value_barrier_w(constant_time_eq_w(i, last_block)); + for (size_t j = 0; j < 4; j++) { + block[SHA_CBLOCK - 4 + j] |= is_last_block & length_bytes[j]; + } + + // Process the block and save the hash state if it is the final value. + SHA1_Transform(ctx, block); + for (size_t j = 0; j < 5; j++) { + result[j] |= is_last_block & ctx->h[j]; + } + } + + // Write the output. + for (size_t i = 0; i < 5; i++) { + CRYPTO_store_u32_be(out + 4 * i, result[i]); + } + return 1; +} + +int bssl::EVP_sha256_final_with_secret_suffix(SHA256_CTX *ctx, + uint8_t out[SHA256_DIGEST_LENGTH], + const uint8_t *in, size_t len, + size_t max_len) { + // Bound the input length so |total_bits| below fits in four bytes. This is + // redundant with TLS record size limits. This also ensures |input_idx| below + // does not overflow. + size_t max_len_bits = max_len << 3; + if (ctx->Nh != 0 || + (max_len_bits >> 3) != max_len || // Overflow + ctx->Nl + max_len_bits < max_len_bits || + ctx->Nl + max_len_bits > UINT32_MAX) { + return 0; + } + + // We need to hash the following into |ctx|: + // + // - ctx->data[:ctx->num] + // - in[:len] + // - A 0x80 byte + // - However many zero bytes are needed to pad up to a block. + // - Eight bytes of length. 
+ size_t num_blocks = (ctx->num + len + 1 + 8 + SHA256_CBLOCK - 1) >> 6; + size_t last_block = num_blocks - 1; + size_t max_blocks = (ctx->num + max_len + 1 + 8 + SHA256_CBLOCK - 1) >> 6; + + // The bounds above imply |total_bits| fits in four bytes. + size_t total_bits = ctx->Nl + (len << 3); + uint8_t length_bytes[4]; + CRYPTO_store_u32_be(length_bytes, total_bits); + + // We now construct and process each expected block in constant-time. + uint8_t block[SHA256_CBLOCK] = {0}; + uint32_t result[8] = {0}; + // input_idx is the index into |in| corresponding to the current block. + // However, we allow this index to overflow beyond |max_len|, to simplify the + // 0x80 byte. + size_t input_idx = 0; + for (size_t i = 0; i < max_blocks; i++) { + // Fill |block| with data from the partial block in |ctx| and |in|. We copy + // as if we were hashing up to |max_len| and then zero the excess later. + size_t block_start = 0; + if (i == 0) { + OPENSSL_memcpy(block, ctx->data, ctx->num); + block_start = ctx->num; + } + if (input_idx < max_len) { + size_t to_copy = SHA256_CBLOCK - block_start; + if (to_copy > max_len - input_idx) { + to_copy = max_len - input_idx; + } + OPENSSL_memcpy(block + block_start, in + input_idx, to_copy); + } + + // Zero any bytes beyond |len| and add the 0x80 byte. + for (size_t j = block_start; j < SHA256_CBLOCK; j++) { + // input[idx] corresponds to block[j]. + size_t idx = input_idx + j - block_start; + // The barriers on |len| are not strictly necessary. However, without + // them, GCC compiles this code by incorporating |len| into the loop + // counter and subtracting it out later. This is still constant-time, but + // it frustrates attempts to validate this. 
+ uint8_t is_in_bounds = constant_time_lt_8(idx, value_barrier_w(len)); + uint8_t is_padding_byte = constant_time_eq_8(idx, value_barrier_w(len)); + block[j] &= is_in_bounds; + block[j] |= 0x80 & is_padding_byte; + } + + input_idx += SHA256_CBLOCK - block_start; + + // Fill in the length if this is the last block. Use a value barrier to + // prevent Clang from compiling the conditional select as a jump. + crypto_word_t is_last_block = + value_barrier_w(constant_time_eq_w(i, last_block)); + for (size_t j = 0; j < 4; j++) { + block[SHA256_CBLOCK - 4 + j] |= is_last_block & length_bytes[j]; + } + + // Process the block and save the hash state if it is the final value. + SHA256_Transform(ctx, block); + for (size_t j = 0; j < 8; j++) { + result[j] |= is_last_block & ctx->h[j]; + } + } + + // Write the output. + for (size_t i = 0; i < 8; i++) { + CRYPTO_store_u32_be(out + 4 * i, result[i]); + } + return 1; +} + +int bssl::EVP_tls_cbc_record_digest_supported(const EVP_MD *md) { + switch (EVP_MD_type(md)) { + case NID_sha1: + case NID_sha256: + return 1; + default: + return 0; + } +} + +static int tls_cbc_digest_record_sha1( + uint8_t *md_out, size_t *md_out_size, const uint8_t len_header[2], + Span aadvecs, + Span iovecs_without_trailer, + Span trailer, size_t data_in_trailer_size, + const uint8_t *mac_secret, unsigned mac_secret_length) { + if (mac_secret_length > SHA_CBLOCK) { + // HMAC pads small keys with zeros and hashes large keys down. This function + // should never reach the large key case. + assert(0); + return 0; + } + + // Compute the initial HMAC block. 
+ uint8_t hmac_pad[SHA_CBLOCK]; + OPENSSL_memset(hmac_pad, 0, sizeof(hmac_pad)); + OPENSSL_memcpy(hmac_pad, mac_secret, mac_secret_length); + for (size_t i = 0; i < SHA_CBLOCK; i++) { + hmac_pad[i] ^= 0x36; + } + + SHA_CTX ctx; + SHA1_Init(&ctx); + SHA1_Update(&ctx, hmac_pad, SHA_CBLOCK); + for (const CRYPTO_IVEC &aadvec : aadvecs) { + SHA1_Update(&ctx, aadvec.in, aadvec.len); + } + SHA1_Update(&ctx, len_header, 2); + + // Hash the public minimum length directly. This reduces the number of blocks + // that must be computed in constant-time. + for (const CRYPTO_IOVEC &iovec : iovecs_without_trailer) { + SHA1_Update(&ctx, iovec.out, iovec.len); + } + + // Hash the remaining data without leaking |data_in_trailer_size|. + uint8_t mac_out[SHA_DIGEST_LENGTH]; + if (!EVP_sha1_final_with_secret_suffix(&ctx, mac_out, trailer.data(), + data_in_trailer_size, + trailer.size())) { + return 0; + } + + // Complete the HMAC in the standard manner. + SHA1_Init(&ctx); + for (size_t i = 0; i < SHA_CBLOCK; i++) { + hmac_pad[i] ^= 0x6a; + } + + SHA1_Update(&ctx, hmac_pad, SHA_CBLOCK); + SHA1_Update(&ctx, mac_out, SHA_DIGEST_LENGTH); + SHA1_Final(md_out, &ctx); + *md_out_size = SHA_DIGEST_LENGTH; + return 1; +} + +static int tls_cbc_digest_record_sha256( + uint8_t *md_out, size_t *md_out_size, const uint8_t len_header[2], + Span aadvecs, + Span iovecs_without_trailer, + Span trailer, size_t data_in_trailer_size, + const uint8_t *mac_secret, unsigned mac_secret_length) { + if (mac_secret_length > SHA256_CBLOCK) { + // HMAC pads small keys with zeros and hashes large keys down. This function + // should never reach the large key case. + assert(0); + return 0; + } + + // Compute the initial HMAC block. 
+ uint8_t hmac_pad[SHA256_CBLOCK]; + OPENSSL_memset(hmac_pad, 0, sizeof(hmac_pad)); + OPENSSL_memcpy(hmac_pad, mac_secret, mac_secret_length); + for (size_t i = 0; i < SHA256_CBLOCK; i++) { + hmac_pad[i] ^= 0x36; + } + + SHA256_CTX ctx; + SHA256_Init(&ctx); + SHA256_Update(&ctx, hmac_pad, SHA256_CBLOCK); + for (const CRYPTO_IVEC &aadvec : aadvecs) { + SHA256_Update(&ctx, aadvec.in, aadvec.len); + } + SHA256_Update(&ctx, len_header, 2); + + // Hash the public minimum length directly. This reduces the number of blocks + // that must be computed in constant-time. + for (const CRYPTO_IOVEC &iovec : iovecs_without_trailer) { + SHA256_Update(&ctx, iovec.out, iovec.len); + } + + // Hash the remaining data without leaking |data_in_trailer_size|. + uint8_t mac_out[SHA256_DIGEST_LENGTH]; + if (!EVP_sha256_final_with_secret_suffix(&ctx, mac_out, trailer.data(), + data_in_trailer_size, + trailer.size())) { + return 0; + } + + // Complete the HMAC in the standard manner. + SHA256_Init(&ctx); + for (size_t i = 0; i < SHA256_CBLOCK; i++) { + hmac_pad[i] ^= 0x6a; + } + + SHA256_Update(&ctx, hmac_pad, SHA256_CBLOCK); + SHA256_Update(&ctx, mac_out, SHA256_DIGEST_LENGTH); + SHA256_Final(md_out, &ctx); + *md_out_size = SHA256_DIGEST_LENGTH; + return 1; +} + +int bssl::EVP_tls_cbc_digest_record( + const EVP_MD *md, uint8_t *md_out, size_t *md_out_size, + const uint8_t len_header[2], Span aadvecs, + Span iovecs_without_trailer, + Span trailer, size_t data_in_trailer_size, + const uint8_t *mac_secret, unsigned mac_secret_length) { + switch (EVP_MD_type(md)) { + case NID_sha1: + return tls_cbc_digest_record_sha1( // + md_out, md_out_size, len_header, aadvecs, iovecs_without_trailer, + trailer, data_in_trailer_size, mac_secret, mac_secret_length); + + case NID_sha256: + return tls_cbc_digest_record_sha256( // + md_out, md_out_size, len_header, aadvecs, iovecs_without_trailer, + trailer, data_in_trailer_size, mac_secret, mac_secret_length); + + default: + // 
EVP_tls_cbc_record_digest_supported should have been called first to + // check that the hash function is supported. + assert(0); + *md_out_size = 0; + return 0; + } +} diff --git a/third_party/boringssl/src/crypto/cipher_extra/cipher_extra.c b/third_party/boringssl/src/crypto/cipher_extra/cipher_extra.c deleted file mode 100644 index 62850ab6..00000000 --- a/third_party/boringssl/src/crypto/cipher_extra/cipher_extra.c +++ /dev/null @@ -1,127 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
*/ - -#include - -#include -#include - -#include -#include -#include - -#include "internal.h" -#include "../internal.h" - - -static const struct { - int nid; - const char *name; - const EVP_CIPHER *(*func)(void); -} kCiphers[] = { - {NID_aes_128_cbc, "aes-128-cbc", EVP_aes_128_cbc}, - {NID_aes_128_ctr, "aes-128-ctr", EVP_aes_128_ctr}, - {NID_aes_128_ecb, "aes-128-ecb", EVP_aes_128_ecb}, - {NID_aes_128_gcm, "aes-128-gcm", EVP_aes_128_gcm}, - {NID_aes_128_ofb128, "aes-128-ofb", EVP_aes_128_ofb}, - {NID_aes_192_cbc, "aes-192-cbc", EVP_aes_192_cbc}, - {NID_aes_192_ctr, "aes-192-ctr", EVP_aes_192_ctr}, - {NID_aes_192_ecb, "aes-192-ecb", EVP_aes_192_ecb}, - {NID_aes_192_gcm, "aes-192-gcm", EVP_aes_192_gcm}, - {NID_aes_192_ofb128, "aes-192-ofb", EVP_aes_192_ofb}, - {NID_aes_256_cbc, "aes-256-cbc", EVP_aes_256_cbc}, - {NID_aes_256_ctr, "aes-256-ctr", EVP_aes_256_ctr}, - {NID_aes_256_ecb, "aes-256-ecb", EVP_aes_256_ecb}, - {NID_aes_256_gcm, "aes-256-gcm", EVP_aes_256_gcm}, - {NID_aes_256_ofb128, "aes-256-ofb", EVP_aes_256_ofb}, - {NID_des_cbc, "des-cbc", EVP_des_cbc}, - {NID_des_ecb, "des-ecb", EVP_des_ecb}, - {NID_des_ede_cbc, "des-ede-cbc", EVP_des_ede_cbc}, - {NID_des_ede_ecb, "des-ede", EVP_des_ede}, - {NID_des_ede3_cbc, "des-ede3-cbc", EVP_des_ede3_cbc}, - {NID_rc2_cbc, "rc2-cbc", EVP_rc2_cbc}, - {NID_rc4, "rc4", EVP_rc4}, -}; - -const EVP_CIPHER *EVP_get_cipherbynid(int nid) { - for (size_t i = 0; i < OPENSSL_ARRAY_SIZE(kCiphers); i++) { - if (kCiphers[i].nid == nid) { - return kCiphers[i].func(); - } - } - return NULL; -} - -const EVP_CIPHER *EVP_get_cipherbyname(const char *name) { - if (name == NULL) { - return NULL; - } - - // This is not a name used by OpenSSL, but tcpdump registers it with - // |EVP_add_cipher_alias|. Our |EVP_add_cipher_alias| is a no-op, so we - // support the name here. 
- if (OPENSSL_strcasecmp(name, "3des") == 0) { - name = "des-ede3-cbc"; - } - - for (size_t i = 0; i < OPENSSL_ARRAY_SIZE(kCiphers); i++) { - if (OPENSSL_strcasecmp(kCiphers[i].name, name) == 0) { - return kCiphers[i].func(); - } - } - - return NULL; -} diff --git a/third_party/boringssl/src/crypto/cipher_extra/derive_key.c b/third_party/boringssl/src/crypto/cipher_extra/derive_key.c deleted file mode 100644 index 4b84c4eb..00000000 --- a/third_party/boringssl/src/crypto/cipher_extra/derive_key.c +++ /dev/null @@ -1,152 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
*/ - -#include - -#include - -#include -#include - - -#define PKCS5_SALT_LEN 8 - -int EVP_BytesToKey(const EVP_CIPHER *type, const EVP_MD *md, - const uint8_t *salt, const uint8_t *data, size_t data_len, - unsigned count, uint8_t *key, uint8_t *iv) { - EVP_MD_CTX c; - uint8_t md_buf[EVP_MAX_MD_SIZE]; - unsigned addmd = 0; - unsigned mds = 0, i; - int rv = 0; - - unsigned nkey = EVP_CIPHER_key_length(type); - unsigned niv = EVP_CIPHER_iv_length(type); - - assert(nkey <= EVP_MAX_KEY_LENGTH); - assert(niv <= EVP_MAX_IV_LENGTH); - - if (data == NULL) { - return nkey; - } - - EVP_MD_CTX_init(&c); - for (;;) { - if (!EVP_DigestInit_ex(&c, md, NULL)) { - goto err; - } - if (addmd++) { - if (!EVP_DigestUpdate(&c, md_buf, mds)) { - goto err; - } - } - if (!EVP_DigestUpdate(&c, data, data_len)) { - goto err; - } - if (salt != NULL) { - if (!EVP_DigestUpdate(&c, salt, PKCS5_SALT_LEN)) { - goto err; - } - } - if (!EVP_DigestFinal_ex(&c, md_buf, &mds)) { - goto err; - } - - for (i = 1; i < count; i++) { - if (!EVP_DigestInit_ex(&c, md, NULL) || - !EVP_DigestUpdate(&c, md_buf, mds) || - !EVP_DigestFinal_ex(&c, md_buf, &mds)) { - goto err; - } - } - - i = 0; - if (nkey) { - for (;;) { - if (nkey == 0 || i == mds) { - break; - } - if (key != NULL) { - *(key++) = md_buf[i]; - } - nkey--; - i++; - } - } - - if (niv && i != mds) { - for (;;) { - if (niv == 0 || i == mds) { - break; - } - if (iv != NULL) { - *(iv++) = md_buf[i]; - } - niv--; - i++; - } - } - if (nkey == 0 && niv == 0) { - break; - } - } - rv = EVP_CIPHER_key_length(type); - -err: - EVP_MD_CTX_cleanup(&c); - OPENSSL_cleanse(md_buf, EVP_MAX_MD_SIZE); - return rv; -} diff --git a/third_party/boringssl/src/crypto/cipher_extra/e_aesctrhmac.c b/third_party/boringssl/src/crypto/cipher_extra/e_aesctrhmac.c deleted file mode 100644 index 32b42d2e..00000000 --- a/third_party/boringssl/src/crypto/cipher_extra/e_aesctrhmac.c +++ /dev/null @@ -1,284 +0,0 @@ -/* Copyright (c) 2017, Google Inc. 
- * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include - -#include - -#include -#include -#include -#include - -#include "../fipsmodule/cipher/internal.h" - - -#define EVP_AEAD_AES_CTR_HMAC_SHA256_TAG_LEN SHA256_DIGEST_LENGTH -#define EVP_AEAD_AES_CTR_HMAC_SHA256_NONCE_LEN 12 - -struct aead_aes_ctr_hmac_sha256_ctx { - union { - double align; - AES_KEY ks; - } ks; - ctr128_f ctr; - block128_f block; - SHA256_CTX inner_init_state; - SHA256_CTX outer_init_state; -}; - -static_assert(sizeof(((EVP_AEAD_CTX *)NULL)->state) >= - sizeof(struct aead_aes_ctr_hmac_sha256_ctx), - "AEAD state is too small"); -static_assert(alignof(union evp_aead_ctx_st_state) >= - alignof(struct aead_aes_ctr_hmac_sha256_ctx), - "AEAD state has insufficient alignment"); - -static void hmac_init(SHA256_CTX *out_inner, SHA256_CTX *out_outer, - const uint8_t hmac_key[32]) { - static const size_t hmac_key_len = 32; - uint8_t block[SHA256_CBLOCK]; - OPENSSL_memcpy(block, hmac_key, hmac_key_len); - OPENSSL_memset(block + hmac_key_len, 0x36, sizeof(block) - hmac_key_len); - - unsigned i; - for (i = 0; i < hmac_key_len; i++) { - block[i] ^= 0x36; - } - - SHA256_Init(out_inner); - SHA256_Update(out_inner, block, sizeof(block)); - - OPENSSL_memset(block + hmac_key_len, 0x5c, sizeof(block) - hmac_key_len); - for (i 
= 0; i < hmac_key_len; i++) { - block[i] ^= (0x36 ^ 0x5c); - } - - SHA256_Init(out_outer); - SHA256_Update(out_outer, block, sizeof(block)); -} - -static int aead_aes_ctr_hmac_sha256_init(EVP_AEAD_CTX *ctx, const uint8_t *key, - size_t key_len, size_t tag_len) { - struct aead_aes_ctr_hmac_sha256_ctx *aes_ctx = - (struct aead_aes_ctr_hmac_sha256_ctx *)&ctx->state; - static const size_t hmac_key_len = 32; - - if (key_len < hmac_key_len) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_KEY_LENGTH); - return 0; // EVP_AEAD_CTX_init should catch this. - } - - const size_t aes_key_len = key_len - hmac_key_len; - if (aes_key_len != 16 && aes_key_len != 32) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_KEY_LENGTH); - return 0; // EVP_AEAD_CTX_init should catch this. - } - - if (tag_len == EVP_AEAD_DEFAULT_TAG_LENGTH) { - tag_len = EVP_AEAD_AES_CTR_HMAC_SHA256_TAG_LEN; - } - - if (tag_len > EVP_AEAD_AES_CTR_HMAC_SHA256_TAG_LEN) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TAG_TOO_LARGE); - return 0; - } - - aes_ctx->ctr = - aes_ctr_set_key(&aes_ctx->ks.ks, NULL, &aes_ctx->block, key, aes_key_len); - ctx->tag_len = tag_len; - hmac_init(&aes_ctx->inner_init_state, &aes_ctx->outer_init_state, - key + aes_key_len); - - return 1; -} - -static void aead_aes_ctr_hmac_sha256_cleanup(EVP_AEAD_CTX *ctx) {} - -static void hmac_update_uint64(SHA256_CTX *sha256, uint64_t value) { - unsigned i; - uint8_t bytes[8]; - - for (i = 0; i < sizeof(bytes); i++) { - bytes[i] = value & 0xff; - value >>= 8; - } - SHA256_Update(sha256, bytes, sizeof(bytes)); -} - -static void hmac_calculate(uint8_t out[SHA256_DIGEST_LENGTH], - const SHA256_CTX *inner_init_state, - const SHA256_CTX *outer_init_state, - const uint8_t *ad, size_t ad_len, - const uint8_t *nonce, const uint8_t *ciphertext, - size_t ciphertext_len) { - SHA256_CTX sha256; - OPENSSL_memcpy(&sha256, inner_init_state, sizeof(sha256)); - hmac_update_uint64(&sha256, ad_len); - hmac_update_uint64(&sha256, ciphertext_len); - SHA256_Update(&sha256, nonce, 
EVP_AEAD_AES_CTR_HMAC_SHA256_NONCE_LEN); - SHA256_Update(&sha256, ad, ad_len); - - // Pad with zeros to the end of the SHA-256 block. - const unsigned num_padding = - (SHA256_CBLOCK - ((sizeof(uint64_t)*2 + - EVP_AEAD_AES_CTR_HMAC_SHA256_NONCE_LEN + ad_len) % - SHA256_CBLOCK)) % - SHA256_CBLOCK; - uint8_t padding[SHA256_CBLOCK]; - OPENSSL_memset(padding, 0, num_padding); - SHA256_Update(&sha256, padding, num_padding); - - SHA256_Update(&sha256, ciphertext, ciphertext_len); - - uint8_t inner_digest[SHA256_DIGEST_LENGTH]; - SHA256_Final(inner_digest, &sha256); - - OPENSSL_memcpy(&sha256, outer_init_state, sizeof(sha256)); - SHA256_Update(&sha256, inner_digest, sizeof(inner_digest)); - SHA256_Final(out, &sha256); -} - -static void aead_aes_ctr_hmac_sha256_crypt( - const struct aead_aes_ctr_hmac_sha256_ctx *aes_ctx, uint8_t *out, - const uint8_t *in, size_t len, const uint8_t *nonce) { - // Since the AEAD operation is one-shot, keeping a buffer of unused keystream - // bytes is pointless. However, |CRYPTO_ctr128_encrypt| requires it. 
- uint8_t partial_block_buffer[AES_BLOCK_SIZE]; - unsigned partial_block_offset = 0; - OPENSSL_memset(partial_block_buffer, 0, sizeof(partial_block_buffer)); - - uint8_t counter[AES_BLOCK_SIZE]; - OPENSSL_memcpy(counter, nonce, EVP_AEAD_AES_CTR_HMAC_SHA256_NONCE_LEN); - OPENSSL_memset(counter + EVP_AEAD_AES_CTR_HMAC_SHA256_NONCE_LEN, 0, 4); - - if (aes_ctx->ctr) { - CRYPTO_ctr128_encrypt_ctr32(in, out, len, &aes_ctx->ks.ks, counter, - partial_block_buffer, &partial_block_offset, - aes_ctx->ctr); - } else { - CRYPTO_ctr128_encrypt(in, out, len, &aes_ctx->ks.ks, counter, - partial_block_buffer, &partial_block_offset, - aes_ctx->block); - } -} - -static int aead_aes_ctr_hmac_sha256_seal_scatter( - const EVP_AEAD_CTX *ctx, uint8_t *out, uint8_t *out_tag, - size_t *out_tag_len, size_t max_out_tag_len, const uint8_t *nonce, - size_t nonce_len, const uint8_t *in, size_t in_len, const uint8_t *extra_in, - size_t extra_in_len, const uint8_t *ad, size_t ad_len) { - const struct aead_aes_ctr_hmac_sha256_ctx *aes_ctx = - (struct aead_aes_ctr_hmac_sha256_ctx *) &ctx->state; - const uint64_t in_len_64 = in_len; - - if (in_len_64 >= (UINT64_C(1) << 32) * AES_BLOCK_SIZE) { - // This input is so large it would overflow the 32-bit block counter. 
- OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TOO_LARGE); - return 0; - } - - if (max_out_tag_len < ctx->tag_len) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BUFFER_TOO_SMALL); - return 0; - } - - if (nonce_len != EVP_AEAD_AES_CTR_HMAC_SHA256_NONCE_LEN) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE); - return 0; - } - - aead_aes_ctr_hmac_sha256_crypt(aes_ctx, out, in, in_len, nonce); - - uint8_t hmac_result[SHA256_DIGEST_LENGTH]; - hmac_calculate(hmac_result, &aes_ctx->inner_init_state, - &aes_ctx->outer_init_state, ad, ad_len, nonce, out, in_len); - OPENSSL_memcpy(out_tag, hmac_result, ctx->tag_len); - *out_tag_len = ctx->tag_len; - - return 1; -} - -static int aead_aes_ctr_hmac_sha256_open_gather( - const EVP_AEAD_CTX *ctx, uint8_t *out, const uint8_t *nonce, - size_t nonce_len, const uint8_t *in, size_t in_len, const uint8_t *in_tag, - size_t in_tag_len, const uint8_t *ad, size_t ad_len) { - const struct aead_aes_ctr_hmac_sha256_ctx *aes_ctx = - (struct aead_aes_ctr_hmac_sha256_ctx *) &ctx->state; - - if (in_tag_len != ctx->tag_len) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); - return 0; - } - - if (nonce_len != EVP_AEAD_AES_CTR_HMAC_SHA256_NONCE_LEN) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE); - return 0; - } - - uint8_t hmac_result[SHA256_DIGEST_LENGTH]; - hmac_calculate(hmac_result, &aes_ctx->inner_init_state, - &aes_ctx->outer_init_state, ad, ad_len, nonce, in, - in_len); - if (CRYPTO_memcmp(hmac_result, in_tag, ctx->tag_len) != 0) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); - return 0; - } - - aead_aes_ctr_hmac_sha256_crypt(aes_ctx, out, in, in_len, nonce); - - return 1; -} - -static const EVP_AEAD aead_aes_128_ctr_hmac_sha256 = { - 16 /* AES key */ + 32 /* HMAC key */, - 12, // nonce length - EVP_AEAD_AES_CTR_HMAC_SHA256_TAG_LEN, // overhead - EVP_AEAD_AES_CTR_HMAC_SHA256_TAG_LEN, // max tag length - 0, // seal_scatter_supports_extra_in - - aead_aes_ctr_hmac_sha256_init, - NULL /* init_with_direction */, - 
aead_aes_ctr_hmac_sha256_cleanup, - NULL /* open */, - aead_aes_ctr_hmac_sha256_seal_scatter, - aead_aes_ctr_hmac_sha256_open_gather, - NULL /* get_iv */, - NULL /* tag_len */, -}; - -static const EVP_AEAD aead_aes_256_ctr_hmac_sha256 = { - 32 /* AES key */ + 32 /* HMAC key */, - 12, // nonce length - EVP_AEAD_AES_CTR_HMAC_SHA256_TAG_LEN, // overhead - EVP_AEAD_AES_CTR_HMAC_SHA256_TAG_LEN, // max tag length - 0, // seal_scatter_supports_extra_in - - aead_aes_ctr_hmac_sha256_init, - NULL /* init_with_direction */, - aead_aes_ctr_hmac_sha256_cleanup, - NULL /* open */, - aead_aes_ctr_hmac_sha256_seal_scatter, - aead_aes_ctr_hmac_sha256_open_gather, - NULL /* get_iv */, - NULL /* tag_len */, -}; - -const EVP_AEAD *EVP_aead_aes_128_ctr_hmac_sha256(void) { - return &aead_aes_128_ctr_hmac_sha256; -} - -const EVP_AEAD *EVP_aead_aes_256_ctr_hmac_sha256(void) { - return &aead_aes_256_ctr_hmac_sha256; -} diff --git a/third_party/boringssl/src/crypto/cipher_extra/e_aesgcmsiv.c b/third_party/boringssl/src/crypto/cipher_extra/e_aesgcmsiv.c deleted file mode 100644 index 15601e19..00000000 --- a/third_party/boringssl/src/crypto/cipher_extra/e_aesgcmsiv.c +++ /dev/null @@ -1,853 +0,0 @@ -/* Copyright (c) 2017, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ - -#include - -#include - -#include -#include -#include - -#include "../fipsmodule/cipher/internal.h" -#include "../internal.h" - - -#define EVP_AEAD_AES_GCM_SIV_NONCE_LEN 12 -#define EVP_AEAD_AES_GCM_SIV_TAG_LEN 16 - -// TODO(davidben): AES-GCM-SIV assembly is not correct for Windows. It must save -// and restore xmm6 through xmm15. -#if defined(OPENSSL_X86_64) && !defined(OPENSSL_NO_ASM) && \ - !defined(OPENSSL_WINDOWS) -#define AES_GCM_SIV_ASM - -// Optimised AES-GCM-SIV - -struct aead_aes_gcm_siv_asm_ctx { - alignas(16) uint8_t key[16*15]; - int is_128_bit; -}; - -// The assembly code assumes 8-byte alignment of the EVP_AEAD_CTX's state, and -// aligns to 16 bytes itself. -static_assert(sizeof(((EVP_AEAD_CTX *)NULL)->state) + 8 >= - sizeof(struct aead_aes_gcm_siv_asm_ctx), - "AEAD state is too small"); -static_assert(alignof(union evp_aead_ctx_st_state) >= 8, - "AEAD state has insufficient alignment"); - -// asm_ctx_from_ctx returns a 16-byte aligned context pointer from |ctx|. -static struct aead_aes_gcm_siv_asm_ctx *asm_ctx_from_ctx( - const EVP_AEAD_CTX *ctx) { - // ctx->state must already be 8-byte aligned. Thus, at most, we may need to - // add eight to align it to 16 bytes. - const uintptr_t offset = ((uintptr_t)&ctx->state) & 8; - return (struct aead_aes_gcm_siv_asm_ctx *)(&ctx->state.opaque[offset]); -} - -// aes128gcmsiv_aes_ks writes an AES-128 key schedule for |key| to -// |out_expanded_key|. -extern void aes128gcmsiv_aes_ks( - const uint8_t key[16], uint8_t out_expanded_key[16*15]); - -// aes256gcmsiv_aes_ks writes an AES-256 key schedule for |key| to -// |out_expanded_key|. 
-extern void aes256gcmsiv_aes_ks( - const uint8_t key[32], uint8_t out_expanded_key[16*15]); - -static int aead_aes_gcm_siv_asm_init(EVP_AEAD_CTX *ctx, const uint8_t *key, - size_t key_len, size_t tag_len) { - const size_t key_bits = key_len * 8; - - if (key_bits != 128 && key_bits != 256) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_KEY_LENGTH); - return 0; // EVP_AEAD_CTX_init should catch this. - } - - if (tag_len == EVP_AEAD_DEFAULT_TAG_LENGTH) { - tag_len = EVP_AEAD_AES_GCM_SIV_TAG_LEN; - } - - if (tag_len != EVP_AEAD_AES_GCM_SIV_TAG_LEN) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TAG_TOO_LARGE); - return 0; - } - - struct aead_aes_gcm_siv_asm_ctx *gcm_siv_ctx = asm_ctx_from_ctx(ctx); - assert((((uintptr_t)gcm_siv_ctx) & 15) == 0); - - if (key_bits == 128) { - aes128gcmsiv_aes_ks(key, &gcm_siv_ctx->key[0]); - gcm_siv_ctx->is_128_bit = 1; - } else { - aes256gcmsiv_aes_ks(key, &gcm_siv_ctx->key[0]); - gcm_siv_ctx->is_128_bit = 0; - } - - ctx->tag_len = tag_len; - - return 1; -} - -static void aead_aes_gcm_siv_asm_cleanup(EVP_AEAD_CTX *ctx) {} - -// aesgcmsiv_polyval_horner updates the POLYVAL value in |in_out_poly| to -// include a number (|in_blocks|) of 16-byte blocks of data from |in|, given -// the POLYVAL key in |key|. -extern void aesgcmsiv_polyval_horner(const uint8_t in_out_poly[16], - const uint8_t key[16], const uint8_t *in, - size_t in_blocks); - -// aesgcmsiv_htable_init writes powers 1..8 of |auth_key| to |out_htable|. -extern void aesgcmsiv_htable_init(uint8_t out_htable[16 * 8], - const uint8_t auth_key[16]); - -// aesgcmsiv_htable6_init writes powers 1..6 of |auth_key| to |out_htable|. -extern void aesgcmsiv_htable6_init(uint8_t out_htable[16 * 6], - const uint8_t auth_key[16]); - -// aesgcmsiv_htable_polyval updates the POLYVAL value in |in_out_poly| to -// include |in_len| bytes of data from |in|. (Where |in_len| must be a multiple -// of 16.) It uses the precomputed powers of the key given in |htable|. 
-extern void aesgcmsiv_htable_polyval(const uint8_t htable[16 * 8], - const uint8_t *in, size_t in_len, - uint8_t in_out_poly[16]); - -// aes128gcmsiv_dec decrypts |in_len| & ~15 bytes from |out| and writes them to -// |in|. (The full value of |in_len| is still used to find the authentication -// tag appended to the ciphertext, however, so must not be pre-masked.) -// -// |in| and |out| may be equal, but must not otherwise overlap. -// -// While decrypting, it updates the POLYVAL value found at the beginning of -// |in_out_calculated_tag_and_scratch| and writes the updated value back before -// return. During executation, it may use the whole of this space for other -// purposes. In order to decrypt and update the POLYVAL value, it uses the -// expanded key from |key| and the table of powers in |htable|. -extern void aes128gcmsiv_dec(const uint8_t *in, uint8_t *out, - uint8_t in_out_calculated_tag_and_scratch[16 * 8], - const uint8_t htable[16 * 6], - const struct aead_aes_gcm_siv_asm_ctx *key, - size_t in_len); - -// aes256gcmsiv_dec acts like |aes128gcmsiv_dec|, but for AES-256. -extern void aes256gcmsiv_dec(const uint8_t *in, uint8_t *out, - uint8_t in_out_calculated_tag_and_scratch[16 * 8], - const uint8_t htable[16 * 6], - const struct aead_aes_gcm_siv_asm_ctx *key, - size_t in_len); - -// aes128gcmsiv_kdf performs the AES-GCM-SIV KDF given the expanded key from -// |key_schedule| and the nonce in |nonce|. Note that, while only 12 bytes of -// the nonce are used, 16 bytes are read and so the value must be -// right-padded. -extern void aes128gcmsiv_kdf(const uint8_t nonce[16], - uint64_t out_key_material[8], - const uint8_t *key_schedule); - -// aes256gcmsiv_kdf acts like |aes128gcmsiv_kdf|, but for AES-256. 
-extern void aes256gcmsiv_kdf(const uint8_t nonce[16], - uint64_t out_key_material[12], - const uint8_t *key_schedule); - -// aes128gcmsiv_aes_ks_enc_x1 performs a key expansion of the AES-128 key in -// |key|, writes the expanded key to |out_expanded_key| and encrypts a single -// block from |in| to |out|. -extern void aes128gcmsiv_aes_ks_enc_x1(const uint8_t in[16], uint8_t out[16], - uint8_t out_expanded_key[16 * 15], - const uint64_t key[2]); - -// aes256gcmsiv_aes_ks_enc_x1 acts like |aes128gcmsiv_aes_ks_enc_x1|, but for -// AES-256. -extern void aes256gcmsiv_aes_ks_enc_x1(const uint8_t in[16], uint8_t out[16], - uint8_t out_expanded_key[16 * 15], - const uint64_t key[4]); - -// aes128gcmsiv_ecb_enc_block encrypts a single block from |in| to |out| using -// the expanded key in |expanded_key|. -extern void aes128gcmsiv_ecb_enc_block( - const uint8_t in[16], uint8_t out[16], - const struct aead_aes_gcm_siv_asm_ctx *expanded_key); - -// aes256gcmsiv_ecb_enc_block acts like |aes128gcmsiv_ecb_enc_block|, but for -// AES-256. -extern void aes256gcmsiv_ecb_enc_block( - const uint8_t in[16], uint8_t out[16], - const struct aead_aes_gcm_siv_asm_ctx *expanded_key); - -// aes128gcmsiv_enc_msg_x4 encrypts |in_len| bytes from |in| to |out| using the -// expanded key from |key|. (The value of |in_len| must be a multiple of 16.) -// The |in| and |out| buffers may be equal but must not otherwise overlap. The -// initial counter is constructed from the given |tag| as required by -// AES-GCM-SIV. -extern void aes128gcmsiv_enc_msg_x4(const uint8_t *in, uint8_t *out, - const uint8_t *tag, - const struct aead_aes_gcm_siv_asm_ctx *key, - size_t in_len); - -// aes256gcmsiv_enc_msg_x4 acts like |aes128gcmsiv_enc_msg_x4|, but for -// AES-256. 
-extern void aes256gcmsiv_enc_msg_x4(const uint8_t *in, uint8_t *out, - const uint8_t *tag, - const struct aead_aes_gcm_siv_asm_ctx *key, - size_t in_len); - -// aes128gcmsiv_enc_msg_x8 acts like |aes128gcmsiv_enc_msg_x4|, but is -// optimised for longer messages. -extern void aes128gcmsiv_enc_msg_x8(const uint8_t *in, uint8_t *out, - const uint8_t *tag, - const struct aead_aes_gcm_siv_asm_ctx *key, - size_t in_len); - -// aes256gcmsiv_enc_msg_x8 acts like |aes256gcmsiv_enc_msg_x4|, but is -// optimised for longer messages. -extern void aes256gcmsiv_enc_msg_x8(const uint8_t *in, uint8_t *out, - const uint8_t *tag, - const struct aead_aes_gcm_siv_asm_ctx *key, - size_t in_len); - -// gcm_siv_asm_polyval evaluates POLYVAL at |auth_key| on the given plaintext -// and AD. The result is written to |out_tag|. -static void gcm_siv_asm_polyval(uint8_t out_tag[16], const uint8_t *in, - size_t in_len, const uint8_t *ad, size_t ad_len, - const uint8_t auth_key[16], - const uint8_t nonce[12]) { - OPENSSL_memset(out_tag, 0, 16); - const size_t ad_blocks = ad_len / 16; - const size_t in_blocks = in_len / 16; - int htable_init = 0; - alignas(16) uint8_t htable[16*8]; - - if (ad_blocks > 8 || in_blocks > 8) { - htable_init = 1; - aesgcmsiv_htable_init(htable, auth_key); - } - - if (htable_init) { - aesgcmsiv_htable_polyval(htable, ad, ad_len & ~15, out_tag); - } else { - aesgcmsiv_polyval_horner(out_tag, auth_key, ad, ad_blocks); - } - - uint8_t scratch[16]; - if (ad_len & 15) { - OPENSSL_memset(scratch, 0, sizeof(scratch)); - OPENSSL_memcpy(scratch, &ad[ad_len & ~15], ad_len & 15); - aesgcmsiv_polyval_horner(out_tag, auth_key, scratch, 1); - } - - if (htable_init) { - aesgcmsiv_htable_polyval(htable, in, in_len & ~15, out_tag); - } else { - aesgcmsiv_polyval_horner(out_tag, auth_key, in, in_blocks); - } - - if (in_len & 15) { - OPENSSL_memset(scratch, 0, sizeof(scratch)); - OPENSSL_memcpy(scratch, &in[in_len & ~15], in_len & 15); - aesgcmsiv_polyval_horner(out_tag, auth_key, 
scratch, 1); - } - - uint8_t length_block[16]; - CRYPTO_store_u64_le(length_block, ad_len * 8); - CRYPTO_store_u64_le(length_block + 8, in_len * 8); - aesgcmsiv_polyval_horner(out_tag, auth_key, length_block, 1); - - for (size_t i = 0; i < 12; i++) { - out_tag[i] ^= nonce[i]; - } - - out_tag[15] &= 0x7f; -} - -// aead_aes_gcm_siv_asm_crypt_last_block handles the encryption/decryption -// (same thing in CTR mode) of the final block of a plaintext/ciphertext. It -// writes |in_len| & 15 bytes to |out| + |in_len|, based on an initial counter -// derived from |tag|. -static void aead_aes_gcm_siv_asm_crypt_last_block( - int is_128_bit, uint8_t *out, const uint8_t *in, size_t in_len, - const uint8_t tag[16], - const struct aead_aes_gcm_siv_asm_ctx *enc_key_expanded) { - alignas(16) uint8_t counter[16]; - OPENSSL_memcpy(&counter, tag, sizeof(counter)); - counter[15] |= 0x80; - CRYPTO_store_u32_le(counter, CRYPTO_load_u32_le(counter) + in_len / 16); - - if (is_128_bit) { - aes128gcmsiv_ecb_enc_block(counter, counter, enc_key_expanded); - } else { - aes256gcmsiv_ecb_enc_block(counter, counter, enc_key_expanded); - } - - const size_t last_bytes_offset = in_len & ~15; - const size_t last_bytes_len = in_len & 15; - uint8_t *last_bytes_out = &out[last_bytes_offset]; - const uint8_t *last_bytes_in = &in[last_bytes_offset]; - for (size_t i = 0; i < last_bytes_len; i++) { - last_bytes_out[i] = last_bytes_in[i] ^ counter[i]; - } -} - -// aead_aes_gcm_siv_kdf calculates the record encryption and authentication -// keys given the |nonce|. 
-static void aead_aes_gcm_siv_kdf( - int is_128_bit, const struct aead_aes_gcm_siv_asm_ctx *gcm_siv_ctx, - uint64_t out_record_auth_key[2], uint64_t out_record_enc_key[4], - const uint8_t nonce[12]) { - alignas(16) uint8_t padded_nonce[16]; - OPENSSL_memcpy(padded_nonce, nonce, 12); - - alignas(16) uint64_t key_material[12]; - if (is_128_bit) { - aes128gcmsiv_kdf(padded_nonce, key_material, &gcm_siv_ctx->key[0]); - out_record_enc_key[0] = key_material[4]; - out_record_enc_key[1] = key_material[6]; - } else { - aes256gcmsiv_kdf(padded_nonce, key_material, &gcm_siv_ctx->key[0]); - out_record_enc_key[0] = key_material[4]; - out_record_enc_key[1] = key_material[6]; - out_record_enc_key[2] = key_material[8]; - out_record_enc_key[3] = key_material[10]; - } - - out_record_auth_key[0] = key_material[0]; - out_record_auth_key[1] = key_material[2]; -} - -static int aead_aes_gcm_siv_asm_seal_scatter( - const EVP_AEAD_CTX *ctx, uint8_t *out, uint8_t *out_tag, - size_t *out_tag_len, size_t max_out_tag_len, const uint8_t *nonce, - size_t nonce_len, const uint8_t *in, size_t in_len, const uint8_t *extra_in, - size_t extra_in_len, const uint8_t *ad, size_t ad_len) { - const struct aead_aes_gcm_siv_asm_ctx *gcm_siv_ctx = asm_ctx_from_ctx(ctx); - const uint64_t in_len_64 = in_len; - const uint64_t ad_len_64 = ad_len; - - if (in_len_64 > (UINT64_C(1) << 36) || - ad_len_64 >= (UINT64_C(1) << 61)) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TOO_LARGE); - return 0; - } - - if (max_out_tag_len < EVP_AEAD_AES_GCM_SIV_TAG_LEN) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BUFFER_TOO_SMALL); - return 0; - } - - if (nonce_len != EVP_AEAD_AES_GCM_SIV_NONCE_LEN) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE); - return 0; - } - - alignas(16) uint64_t record_auth_key[2]; - alignas(16) uint64_t record_enc_key[4]; - aead_aes_gcm_siv_kdf(gcm_siv_ctx->is_128_bit, gcm_siv_ctx, record_auth_key, - record_enc_key, nonce); - - alignas(16) uint8_t tag[16] = {0}; - gcm_siv_asm_polyval(tag, in, in_len, 
ad, ad_len, - (const uint8_t *)record_auth_key, nonce); - - struct aead_aes_gcm_siv_asm_ctx enc_key_expanded; - - if (gcm_siv_ctx->is_128_bit) { - aes128gcmsiv_aes_ks_enc_x1(tag, tag, &enc_key_expanded.key[0], - record_enc_key); - - if (in_len < 128) { - aes128gcmsiv_enc_msg_x4(in, out, tag, &enc_key_expanded, in_len & ~15); - } else { - aes128gcmsiv_enc_msg_x8(in, out, tag, &enc_key_expanded, in_len & ~15); - } - } else { - aes256gcmsiv_aes_ks_enc_x1(tag, tag, &enc_key_expanded.key[0], - record_enc_key); - - if (in_len < 128) { - aes256gcmsiv_enc_msg_x4(in, out, tag, &enc_key_expanded, in_len & ~15); - } else { - aes256gcmsiv_enc_msg_x8(in, out, tag, &enc_key_expanded, in_len & ~15); - } - } - - if (in_len & 15) { - aead_aes_gcm_siv_asm_crypt_last_block(gcm_siv_ctx->is_128_bit, out, in, - in_len, tag, &enc_key_expanded); - } - - OPENSSL_memcpy(out_tag, tag, sizeof(tag)); - *out_tag_len = EVP_AEAD_AES_GCM_SIV_TAG_LEN; - - return 1; -} - -// TODO(martinkr): Add aead_aes_gcm_siv_asm_open_gather. N.B. aes128gcmsiv_dec -// expects ciphertext and tag in a contiguous buffer. 
- -static int aead_aes_gcm_siv_asm_open(const EVP_AEAD_CTX *ctx, uint8_t *out, - size_t *out_len, size_t max_out_len, - const uint8_t *nonce, size_t nonce_len, - const uint8_t *in, size_t in_len, - const uint8_t *ad, size_t ad_len) { - const uint64_t ad_len_64 = ad_len; - if (ad_len_64 >= (UINT64_C(1) << 61)) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TOO_LARGE); - return 0; - } - - const uint64_t in_len_64 = in_len; - if (in_len < EVP_AEAD_AES_GCM_SIV_TAG_LEN || - in_len_64 > (UINT64_C(1) << 36) + AES_BLOCK_SIZE) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); - return 0; - } - - if (nonce_len != EVP_AEAD_AES_GCM_SIV_NONCE_LEN) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE); - return 0; - } - - const struct aead_aes_gcm_siv_asm_ctx *gcm_siv_ctx = asm_ctx_from_ctx(ctx); - const size_t plaintext_len = in_len - EVP_AEAD_AES_GCM_SIV_TAG_LEN; - const uint8_t *const given_tag = in + plaintext_len; - - if (max_out_len < plaintext_len) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BUFFER_TOO_SMALL); - return 0; - } - - alignas(16) uint64_t record_auth_key[2]; - alignas(16) uint64_t record_enc_key[4]; - aead_aes_gcm_siv_kdf(gcm_siv_ctx->is_128_bit, gcm_siv_ctx, record_auth_key, - record_enc_key, nonce); - - struct aead_aes_gcm_siv_asm_ctx expanded_key; - if (gcm_siv_ctx->is_128_bit) { - aes128gcmsiv_aes_ks((const uint8_t *) record_enc_key, &expanded_key.key[0]); - } else { - aes256gcmsiv_aes_ks((const uint8_t *) record_enc_key, &expanded_key.key[0]); - } - // calculated_tag is 16*8 bytes, rather than 16 bytes, because - // aes[128|256]gcmsiv_dec uses the extra as scratch space. 
- alignas(16) uint8_t calculated_tag[16 * 8] = {0}; - - OPENSSL_memset(calculated_tag, 0, EVP_AEAD_AES_GCM_SIV_TAG_LEN); - const size_t ad_blocks = ad_len / 16; - aesgcmsiv_polyval_horner(calculated_tag, (const uint8_t *)record_auth_key, ad, - ad_blocks); - - uint8_t scratch[16]; - if (ad_len & 15) { - OPENSSL_memset(scratch, 0, sizeof(scratch)); - OPENSSL_memcpy(scratch, &ad[ad_len & ~15], ad_len & 15); - aesgcmsiv_polyval_horner(calculated_tag, (const uint8_t *)record_auth_key, - scratch, 1); - } - - alignas(16) uint8_t htable[16 * 6]; - aesgcmsiv_htable6_init(htable, (const uint8_t *)record_auth_key); - - if (gcm_siv_ctx->is_128_bit) { - aes128gcmsiv_dec(in, out, calculated_tag, htable, &expanded_key, - plaintext_len); - } else { - aes256gcmsiv_dec(in, out, calculated_tag, htable, &expanded_key, - plaintext_len); - } - - if (plaintext_len & 15) { - aead_aes_gcm_siv_asm_crypt_last_block(gcm_siv_ctx->is_128_bit, out, in, - plaintext_len, given_tag, - &expanded_key); - OPENSSL_memset(scratch, 0, sizeof(scratch)); - OPENSSL_memcpy(scratch, out + (plaintext_len & ~15), plaintext_len & 15); - aesgcmsiv_polyval_horner(calculated_tag, (const uint8_t *)record_auth_key, - scratch, 1); - } - - uint8_t length_block[16]; - CRYPTO_store_u64_le(length_block, ad_len * 8); - CRYPTO_store_u64_le(length_block + 8, plaintext_len * 8); - aesgcmsiv_polyval_horner(calculated_tag, (const uint8_t *)record_auth_key, - length_block, 1); - - for (size_t i = 0; i < 12; i++) { - calculated_tag[i] ^= nonce[i]; - } - - calculated_tag[15] &= 0x7f; - - if (gcm_siv_ctx->is_128_bit) { - aes128gcmsiv_ecb_enc_block(calculated_tag, calculated_tag, &expanded_key); - } else { - aes256gcmsiv_ecb_enc_block(calculated_tag, calculated_tag, &expanded_key); - } - - if (CRYPTO_memcmp(calculated_tag, given_tag, EVP_AEAD_AES_GCM_SIV_TAG_LEN) != - 0) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); - return 0; - } - - *out_len = in_len - EVP_AEAD_AES_GCM_SIV_TAG_LEN; - return 1; -} - -static const EVP_AEAD 
aead_aes_128_gcm_siv_asm = { - 16, // key length - EVP_AEAD_AES_GCM_SIV_NONCE_LEN, // nonce length - EVP_AEAD_AES_GCM_SIV_TAG_LEN, // overhead - EVP_AEAD_AES_GCM_SIV_TAG_LEN, // max tag length - 0, // seal_scatter_supports_extra_in - - aead_aes_gcm_siv_asm_init, - NULL /* init_with_direction */, - aead_aes_gcm_siv_asm_cleanup, - aead_aes_gcm_siv_asm_open, - aead_aes_gcm_siv_asm_seal_scatter, - NULL /* open_gather */, - NULL /* get_iv */, - NULL /* tag_len */, -}; - -static const EVP_AEAD aead_aes_256_gcm_siv_asm = { - 32, // key length - EVP_AEAD_AES_GCM_SIV_NONCE_LEN, // nonce length - EVP_AEAD_AES_GCM_SIV_TAG_LEN, // overhead - EVP_AEAD_AES_GCM_SIV_TAG_LEN, // max tag length - 0, // seal_scatter_supports_extra_in - - aead_aes_gcm_siv_asm_init, - NULL /* init_with_direction */, - aead_aes_gcm_siv_asm_cleanup, - aead_aes_gcm_siv_asm_open, - aead_aes_gcm_siv_asm_seal_scatter, - NULL /* open_gather */, - NULL /* get_iv */, - NULL /* tag_len */, -}; - -#endif // X86_64 && !NO_ASM && !WINDOWS - -struct aead_aes_gcm_siv_ctx { - union { - double align; - AES_KEY ks; - } ks; - block128_f kgk_block; - unsigned is_256:1; -}; - -static_assert(sizeof(((EVP_AEAD_CTX *)NULL)->state) >= - sizeof(struct aead_aes_gcm_siv_ctx), - "AEAD state is too small"); -static_assert(alignof(union evp_aead_ctx_st_state) >= - alignof(struct aead_aes_gcm_siv_ctx), - "AEAD state has insufficient alignment"); - -static int aead_aes_gcm_siv_init(EVP_AEAD_CTX *ctx, const uint8_t *key, - size_t key_len, size_t tag_len) { - const size_t key_bits = key_len * 8; - - if (key_bits != 128 && key_bits != 256) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_KEY_LENGTH); - return 0; // EVP_AEAD_CTX_init should catch this. 
- } - - if (tag_len == EVP_AEAD_DEFAULT_TAG_LENGTH) { - tag_len = EVP_AEAD_AES_GCM_SIV_TAG_LEN; - } - if (tag_len != EVP_AEAD_AES_GCM_SIV_TAG_LEN) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TAG_TOO_LARGE); - return 0; - } - - struct aead_aes_gcm_siv_ctx *gcm_siv_ctx = - (struct aead_aes_gcm_siv_ctx *)&ctx->state; - OPENSSL_memset(gcm_siv_ctx, 0, sizeof(struct aead_aes_gcm_siv_ctx)); - - aes_ctr_set_key(&gcm_siv_ctx->ks.ks, NULL, &gcm_siv_ctx->kgk_block, key, - key_len); - gcm_siv_ctx->is_256 = (key_len == 32); - ctx->tag_len = tag_len; - - return 1; -} - -static void aead_aes_gcm_siv_cleanup(EVP_AEAD_CTX *ctx) {} - -// gcm_siv_crypt encrypts (or decrypts—it's the same thing) |in_len| bytes from -// |in| to |out|, using the block function |enc_block| with |key| in counter -// mode, starting at |initial_counter|. This differs from the traditional -// counter mode code in that the counter is handled little-endian, only the -// first four bytes are used and the GCM-SIV tweak to the final byte is -// applied. The |in| and |out| pointers may be equal but otherwise must not -// alias. -static void gcm_siv_crypt(uint8_t *out, const uint8_t *in, size_t in_len, - const uint8_t initial_counter[AES_BLOCK_SIZE], - block128_f enc_block, const AES_KEY *key) { - uint8_t counter[16]; - - OPENSSL_memcpy(counter, initial_counter, AES_BLOCK_SIZE); - counter[15] |= 0x80; - - for (size_t done = 0; done < in_len;) { - uint8_t keystream[AES_BLOCK_SIZE]; - enc_block(counter, keystream, key); - CRYPTO_store_u32_le(counter, CRYPTO_load_u32_le(counter) + 1); - - size_t todo = AES_BLOCK_SIZE; - if (in_len - done < todo) { - todo = in_len - done; - } - - for (size_t i = 0; i < todo; i++) { - out[done + i] = keystream[i] ^ in[done + i]; - } - - done += todo; - } -} - -// gcm_siv_polyval evaluates POLYVAL at |auth_key| on the given plaintext and -// AD. The result is written to |out_tag|. 
-static void gcm_siv_polyval( - uint8_t out_tag[16], const uint8_t *in, size_t in_len, const uint8_t *ad, - size_t ad_len, const uint8_t auth_key[16], - const uint8_t nonce[EVP_AEAD_AES_GCM_SIV_NONCE_LEN]) { - struct polyval_ctx polyval_ctx; - CRYPTO_POLYVAL_init(&polyval_ctx, auth_key); - - CRYPTO_POLYVAL_update_blocks(&polyval_ctx, ad, ad_len & ~15); - - uint8_t scratch[16]; - if (ad_len & 15) { - OPENSSL_memset(scratch, 0, sizeof(scratch)); - OPENSSL_memcpy(scratch, &ad[ad_len & ~15], ad_len & 15); - CRYPTO_POLYVAL_update_blocks(&polyval_ctx, scratch, sizeof(scratch)); - } - - CRYPTO_POLYVAL_update_blocks(&polyval_ctx, in, in_len & ~15); - if (in_len & 15) { - OPENSSL_memset(scratch, 0, sizeof(scratch)); - OPENSSL_memcpy(scratch, &in[in_len & ~15], in_len & 15); - CRYPTO_POLYVAL_update_blocks(&polyval_ctx, scratch, sizeof(scratch)); - } - - uint8_t length_block[16]; - CRYPTO_store_u64_le(length_block, ad_len * 8); - CRYPTO_store_u64_le(length_block + 8, in_len * 8); - CRYPTO_POLYVAL_update_blocks(&polyval_ctx, length_block, - sizeof(length_block)); - - CRYPTO_POLYVAL_finish(&polyval_ctx, out_tag); - for (size_t i = 0; i < EVP_AEAD_AES_GCM_SIV_NONCE_LEN; i++) { - out_tag[i] ^= nonce[i]; - } - out_tag[15] &= 0x7f; -} - -// gcm_siv_record_keys contains the keys used for a specific GCM-SIV record. -struct gcm_siv_record_keys { - uint8_t auth_key[16]; - union { - double align; - AES_KEY ks; - } enc_key; - block128_f enc_block; -}; - -// gcm_siv_keys calculates the keys for a specific GCM-SIV record with the -// given nonce and writes them to |*out_keys|. -static void gcm_siv_keys( - const struct aead_aes_gcm_siv_ctx *gcm_siv_ctx, - struct gcm_siv_record_keys *out_keys, - const uint8_t nonce[EVP_AEAD_AES_GCM_SIV_NONCE_LEN]) { - const AES_KEY *const key = &gcm_siv_ctx->ks.ks; - uint8_t key_material[(128 /* POLYVAL key */ + 256 /* max AES key */) / 8]; - const size_t blocks_needed = gcm_siv_ctx->is_256 ? 
6 : 4; - - uint8_t counter[AES_BLOCK_SIZE]; - OPENSSL_memset(counter, 0, AES_BLOCK_SIZE - EVP_AEAD_AES_GCM_SIV_NONCE_LEN); - OPENSSL_memcpy(counter + AES_BLOCK_SIZE - EVP_AEAD_AES_GCM_SIV_NONCE_LEN, - nonce, EVP_AEAD_AES_GCM_SIV_NONCE_LEN); - for (size_t i = 0; i < blocks_needed; i++) { - counter[0] = i; - - uint8_t ciphertext[AES_BLOCK_SIZE]; - gcm_siv_ctx->kgk_block(counter, ciphertext, key); - OPENSSL_memcpy(&key_material[i * 8], ciphertext, 8); - } - - OPENSSL_memcpy(out_keys->auth_key, key_material, 16); - // Note the |ctr128_f| function uses a big-endian couner, while AES-GCM-SIV - // uses a little-endian counter. We ignore the return value and only use - // |block128_f|. This has a significant performance cost for the fallback - // bitsliced AES implementations (bsaes and aes_nohw). - // - // We currently do not consider AES-GCM-SIV to be performance-sensitive on - // client hardware. If this changes, we can write little-endian |ctr128_f| - // functions. - aes_ctr_set_key(&out_keys->enc_key.ks, NULL, &out_keys->enc_block, - key_material + 16, gcm_siv_ctx->is_256 ? 
32 : 16); -} - -static int aead_aes_gcm_siv_seal_scatter( - const EVP_AEAD_CTX *ctx, uint8_t *out, uint8_t *out_tag, - size_t *out_tag_len, size_t max_out_tag_len, const uint8_t *nonce, - size_t nonce_len, const uint8_t *in, size_t in_len, const uint8_t *extra_in, - size_t extra_in_len, const uint8_t *ad, size_t ad_len) { - const struct aead_aes_gcm_siv_ctx *gcm_siv_ctx = - (struct aead_aes_gcm_siv_ctx *)&ctx->state; - const uint64_t in_len_64 = in_len; - const uint64_t ad_len_64 = ad_len; - - if (in_len + EVP_AEAD_AES_GCM_SIV_TAG_LEN < in_len || - in_len_64 > (UINT64_C(1) << 36) || - ad_len_64 >= (UINT64_C(1) << 61)) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TOO_LARGE); - return 0; - } - - if (max_out_tag_len < EVP_AEAD_AES_GCM_SIV_TAG_LEN) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BUFFER_TOO_SMALL); - return 0; - } - - if (nonce_len != EVP_AEAD_AES_GCM_SIV_NONCE_LEN) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE); - return 0; - } - - struct gcm_siv_record_keys keys; - gcm_siv_keys(gcm_siv_ctx, &keys, nonce); - - uint8_t tag[16]; - gcm_siv_polyval(tag, in, in_len, ad, ad_len, keys.auth_key, nonce); - keys.enc_block(tag, tag, &keys.enc_key.ks); - - gcm_siv_crypt(out, in, in_len, tag, keys.enc_block, &keys.enc_key.ks); - - OPENSSL_memcpy(out_tag, tag, EVP_AEAD_AES_GCM_SIV_TAG_LEN); - *out_tag_len = EVP_AEAD_AES_GCM_SIV_TAG_LEN; - - return 1; -} - -static int aead_aes_gcm_siv_open_gather(const EVP_AEAD_CTX *ctx, uint8_t *out, - const uint8_t *nonce, size_t nonce_len, - const uint8_t *in, size_t in_len, - const uint8_t *in_tag, - size_t in_tag_len, const uint8_t *ad, - size_t ad_len) { - const uint64_t ad_len_64 = ad_len; - if (ad_len_64 >= (UINT64_C(1) << 61)) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TOO_LARGE); - return 0; - } - - const uint64_t in_len_64 = in_len; - if (in_tag_len != EVP_AEAD_AES_GCM_SIV_TAG_LEN || - in_len_64 > (UINT64_C(1) << 36) + AES_BLOCK_SIZE) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); - return 0; - } - - if (nonce_len != 
EVP_AEAD_AES_GCM_SIV_NONCE_LEN) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE); - return 0; - } - - const struct aead_aes_gcm_siv_ctx *gcm_siv_ctx = - (struct aead_aes_gcm_siv_ctx *)&ctx->state; - - struct gcm_siv_record_keys keys; - gcm_siv_keys(gcm_siv_ctx, &keys, nonce); - - gcm_siv_crypt(out, in, in_len, in_tag, keys.enc_block, &keys.enc_key.ks); - - uint8_t expected_tag[EVP_AEAD_AES_GCM_SIV_TAG_LEN]; - gcm_siv_polyval(expected_tag, out, in_len, ad, ad_len, keys.auth_key, nonce); - keys.enc_block(expected_tag, expected_tag, &keys.enc_key.ks); - - if (CRYPTO_memcmp(expected_tag, in_tag, sizeof(expected_tag)) != 0) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); - return 0; - } - - return 1; -} - -static const EVP_AEAD aead_aes_128_gcm_siv = { - 16, // key length - EVP_AEAD_AES_GCM_SIV_NONCE_LEN, // nonce length - EVP_AEAD_AES_GCM_SIV_TAG_LEN, // overhead - EVP_AEAD_AES_GCM_SIV_TAG_LEN, // max tag length - 0, // seal_scatter_supports_extra_in - - aead_aes_gcm_siv_init, - NULL /* init_with_direction */, - aead_aes_gcm_siv_cleanup, - NULL /* open */, - aead_aes_gcm_siv_seal_scatter, - aead_aes_gcm_siv_open_gather, - NULL /* get_iv */, - NULL /* tag_len */, -}; - -static const EVP_AEAD aead_aes_256_gcm_siv = { - 32, // key length - EVP_AEAD_AES_GCM_SIV_NONCE_LEN, // nonce length - EVP_AEAD_AES_GCM_SIV_TAG_LEN, // overhead - EVP_AEAD_AES_GCM_SIV_TAG_LEN, // max tag length - 0, // seal_scatter_supports_extra_in - - aead_aes_gcm_siv_init, - NULL /* init_with_direction */, - aead_aes_gcm_siv_cleanup, - NULL /* open */, - aead_aes_gcm_siv_seal_scatter, - aead_aes_gcm_siv_open_gather, - NULL /* get_iv */, - NULL /* tag_len */, -}; - -#if defined(AES_GCM_SIV_ASM) - -const EVP_AEAD *EVP_aead_aes_128_gcm_siv(void) { - if (CRYPTO_is_AVX_capable() && CRYPTO_is_AESNI_capable()) { - return &aead_aes_128_gcm_siv_asm; - } - return &aead_aes_128_gcm_siv; -} - -const EVP_AEAD *EVP_aead_aes_256_gcm_siv(void) { - if (CRYPTO_is_AVX_capable() && 
CRYPTO_is_AESNI_capable()) { - return &aead_aes_256_gcm_siv_asm; - } - return &aead_aes_256_gcm_siv; -} - -#else - -const EVP_AEAD *EVP_aead_aes_128_gcm_siv(void) { - return &aead_aes_128_gcm_siv; -} - -const EVP_AEAD *EVP_aead_aes_256_gcm_siv(void) { - return &aead_aes_256_gcm_siv; -} - -#endif // AES_GCM_SIV_ASM diff --git a/third_party/boringssl/src/crypto/cipher_extra/e_chacha20poly1305.c b/third_party/boringssl/src/crypto/cipher_extra/e_chacha20poly1305.c deleted file mode 100644 index 4a46a1df..00000000 --- a/third_party/boringssl/src/crypto/cipher_extra/e_chacha20poly1305.c +++ /dev/null @@ -1,341 +0,0 @@ -/* Copyright (c) 2014, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ - -#include - -#include -#include - -#include -#include -#include -#include -#include - -#include "internal.h" -#include "../chacha/internal.h" -#include "../fipsmodule/cipher/internal.h" -#include "../internal.h" - -struct aead_chacha20_poly1305_ctx { - uint8_t key[32]; -}; - -static_assert(sizeof(((EVP_AEAD_CTX *)NULL)->state) >= - sizeof(struct aead_chacha20_poly1305_ctx), - "AEAD state is too small"); -static_assert(alignof(union evp_aead_ctx_st_state) >= - alignof(struct aead_chacha20_poly1305_ctx), - "AEAD state has insufficient alignment"); - -static int aead_chacha20_poly1305_init(EVP_AEAD_CTX *ctx, const uint8_t *key, - size_t key_len, size_t tag_len) { - struct aead_chacha20_poly1305_ctx *c20_ctx = - (struct aead_chacha20_poly1305_ctx *)&ctx->state; - - if (tag_len == 0) { - tag_len = POLY1305_TAG_LEN; - } - - if (tag_len > POLY1305_TAG_LEN) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TOO_LARGE); - return 0; - } - - if (key_len != sizeof(c20_ctx->key)) { - return 0; // internal error - EVP_AEAD_CTX_init should catch this. - } - - OPENSSL_memcpy(c20_ctx->key, key, key_len); - ctx->tag_len = tag_len; - - return 1; -} - -static void aead_chacha20_poly1305_cleanup(EVP_AEAD_CTX *ctx) {} - -static void poly1305_update_length(poly1305_state *poly1305, size_t data_len) { - uint8_t length_bytes[8]; - - for (unsigned i = 0; i < sizeof(length_bytes); i++) { - length_bytes[i] = data_len; - data_len >>= 8; - } - - CRYPTO_poly1305_update(poly1305, length_bytes, sizeof(length_bytes)); -} - -// calc_tag fills |tag| with the authentication tag for the given inputs. 
-static void calc_tag(uint8_t tag[POLY1305_TAG_LEN], const uint8_t *key, - const uint8_t nonce[12], const uint8_t *ad, size_t ad_len, - const uint8_t *ciphertext, size_t ciphertext_len, - const uint8_t *ciphertext_extra, - size_t ciphertext_extra_len) { - alignas(16) uint8_t poly1305_key[32]; - OPENSSL_memset(poly1305_key, 0, sizeof(poly1305_key)); - CRYPTO_chacha_20(poly1305_key, poly1305_key, sizeof(poly1305_key), key, nonce, - 0); - - static const uint8_t padding[16] = { 0 }; // Padding is all zeros. - poly1305_state ctx; - CRYPTO_poly1305_init(&ctx, poly1305_key); - CRYPTO_poly1305_update(&ctx, ad, ad_len); - if (ad_len % 16 != 0) { - CRYPTO_poly1305_update(&ctx, padding, sizeof(padding) - (ad_len % 16)); - } - CRYPTO_poly1305_update(&ctx, ciphertext, ciphertext_len); - CRYPTO_poly1305_update(&ctx, ciphertext_extra, ciphertext_extra_len); - const size_t ciphertext_total = ciphertext_len + ciphertext_extra_len; - if (ciphertext_total % 16 != 0) { - CRYPTO_poly1305_update(&ctx, padding, - sizeof(padding) - (ciphertext_total % 16)); - } - poly1305_update_length(&ctx, ad_len); - poly1305_update_length(&ctx, ciphertext_total); - CRYPTO_poly1305_finish(&ctx, tag); -} - -static int chacha20_poly1305_seal_scatter( - const uint8_t *key, uint8_t *out, uint8_t *out_tag, - size_t *out_tag_len, size_t max_out_tag_len, const uint8_t *nonce, - size_t nonce_len, const uint8_t *in, size_t in_len, const uint8_t *extra_in, - size_t extra_in_len, const uint8_t *ad, size_t ad_len, size_t tag_len) { - if (extra_in_len + tag_len < tag_len) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TOO_LARGE); - return 0; - } - if (max_out_tag_len < tag_len + extra_in_len) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BUFFER_TOO_SMALL); - return 0; - } - if (nonce_len != 12) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE); - return 0; - } - - // |CRYPTO_chacha_20| uses a 32-bit block counter. Therefore we disallow - // individual operations that work on more than 256GB at a time. 
- // |in_len_64| is needed because, on 32-bit platforms, size_t is only - // 32-bits and this produces a warning because it's always false. - // Casting to uint64_t inside the conditional is not sufficient to stop - // the warning. - const uint64_t in_len_64 = in_len; - if (in_len_64 >= (UINT64_C(1) << 32) * 64 - 64) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TOO_LARGE); - return 0; - } - - if (max_out_tag_len < tag_len) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BUFFER_TOO_SMALL); - return 0; - } - - // The the extra input is given, it is expected to be very short and so is - // encrypted byte-by-byte first. - if (extra_in_len) { - static const size_t kChaChaBlockSize = 64; - uint32_t block_counter = 1 + (in_len / kChaChaBlockSize); - size_t offset = in_len % kChaChaBlockSize; - uint8_t block[64 /* kChaChaBlockSize */]; - - for (size_t done = 0; done < extra_in_len; block_counter++) { - memset(block, 0, sizeof(block)); - CRYPTO_chacha_20(block, block, sizeof(block), key, nonce, - block_counter); - for (size_t i = offset; i < sizeof(block) && done < extra_in_len; - i++, done++) { - out_tag[done] = extra_in[done] ^ block[i]; - } - offset = 0; - } - } - - union chacha20_poly1305_seal_data data; - if (chacha20_poly1305_asm_capable()) { - OPENSSL_memcpy(data.in.key, key, 32); - data.in.counter = 0; - OPENSSL_memcpy(data.in.nonce, nonce, 12); - data.in.extra_ciphertext = out_tag; - data.in.extra_ciphertext_len = extra_in_len; - chacha20_poly1305_seal(out, in, in_len, ad, ad_len, &data); - } else { - CRYPTO_chacha_20(out, in, in_len, key, nonce, 1); - calc_tag(data.out.tag, key, nonce, ad, ad_len, out, in_len, out_tag, - extra_in_len); - } - - OPENSSL_memcpy(out_tag + extra_in_len, data.out.tag, tag_len); - *out_tag_len = extra_in_len + tag_len; - return 1; -} - -static int aead_chacha20_poly1305_seal_scatter( - const EVP_AEAD_CTX *ctx, uint8_t *out, uint8_t *out_tag, - size_t *out_tag_len, size_t max_out_tag_len, const uint8_t *nonce, - size_t nonce_len, const uint8_t *in, 
size_t in_len, const uint8_t *extra_in, - size_t extra_in_len, const uint8_t *ad, size_t ad_len) { - const struct aead_chacha20_poly1305_ctx *c20_ctx = - (struct aead_chacha20_poly1305_ctx *)&ctx->state; - - return chacha20_poly1305_seal_scatter( - c20_ctx->key, out, out_tag, out_tag_len, max_out_tag_len, nonce, - nonce_len, in, in_len, extra_in, extra_in_len, ad, ad_len, ctx->tag_len); -} - -static int aead_xchacha20_poly1305_seal_scatter( - const EVP_AEAD_CTX *ctx, uint8_t *out, uint8_t *out_tag, - size_t *out_tag_len, size_t max_out_tag_len, const uint8_t *nonce, - size_t nonce_len, const uint8_t *in, size_t in_len, const uint8_t *extra_in, - size_t extra_in_len, const uint8_t *ad, size_t ad_len) { - const struct aead_chacha20_poly1305_ctx *c20_ctx = - (struct aead_chacha20_poly1305_ctx *)&ctx->state; - - if (nonce_len != 24) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE); - return 0; - } - - alignas(4) uint8_t derived_key[32]; - alignas(4) uint8_t derived_nonce[12]; - CRYPTO_hchacha20(derived_key, c20_ctx->key, nonce); - OPENSSL_memset(derived_nonce, 0, 4); - OPENSSL_memcpy(&derived_nonce[4], &nonce[16], 8); - - return chacha20_poly1305_seal_scatter( - derived_key, out, out_tag, out_tag_len, max_out_tag_len, - derived_nonce, sizeof(derived_nonce), in, in_len, extra_in, extra_in_len, - ad, ad_len, ctx->tag_len); -} - -static int chacha20_poly1305_open_gather( - const uint8_t *key, uint8_t *out, const uint8_t *nonce, - size_t nonce_len, const uint8_t *in, size_t in_len, const uint8_t *in_tag, - size_t in_tag_len, const uint8_t *ad, size_t ad_len, size_t tag_len) { - if (nonce_len != 12) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE); - return 0; - } - - if (in_tag_len != tag_len) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); - return 0; - } - - // |CRYPTO_chacha_20| uses a 32-bit block counter. Therefore we disallow - // individual operations that work on more than 256GB at a time. 
- // |in_len_64| is needed because, on 32-bit platforms, size_t is only - // 32-bits and this produces a warning because it's always false. - // Casting to uint64_t inside the conditional is not sufficient to stop - // the warning. - const uint64_t in_len_64 = in_len; - if (in_len_64 >= (UINT64_C(1) << 32) * 64 - 64) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TOO_LARGE); - return 0; - } - - union chacha20_poly1305_open_data data; - if (chacha20_poly1305_asm_capable()) { - OPENSSL_memcpy(data.in.key, key, 32); - data.in.counter = 0; - OPENSSL_memcpy(data.in.nonce, nonce, 12); - chacha20_poly1305_open(out, in, in_len, ad, ad_len, &data); - } else { - calc_tag(data.out.tag, key, nonce, ad, ad_len, in, in_len, NULL, 0); - CRYPTO_chacha_20(out, in, in_len, key, nonce, 1); - } - - if (CRYPTO_memcmp(data.out.tag, in_tag, tag_len) != 0) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); - return 0; - } - - return 1; -} - -static int aead_chacha20_poly1305_open_gather( - const EVP_AEAD_CTX *ctx, uint8_t *out, const uint8_t *nonce, - size_t nonce_len, const uint8_t *in, size_t in_len, const uint8_t *in_tag, - size_t in_tag_len, const uint8_t *ad, size_t ad_len) { - const struct aead_chacha20_poly1305_ctx *c20_ctx = - (struct aead_chacha20_poly1305_ctx *)&ctx->state; - - return chacha20_poly1305_open_gather(c20_ctx->key, out, nonce, nonce_len, in, - in_len, in_tag, in_tag_len, ad, ad_len, - ctx->tag_len); -} - -static int aead_xchacha20_poly1305_open_gather( - const EVP_AEAD_CTX *ctx, uint8_t *out, const uint8_t *nonce, - size_t nonce_len, const uint8_t *in, size_t in_len, const uint8_t *in_tag, - size_t in_tag_len, const uint8_t *ad, size_t ad_len) { - const struct aead_chacha20_poly1305_ctx *c20_ctx = - (struct aead_chacha20_poly1305_ctx *)&ctx->state; - - if (nonce_len != 24) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE); - return 0; - } - - alignas(4) uint8_t derived_key[32]; - alignas(4) uint8_t derived_nonce[12]; - CRYPTO_hchacha20(derived_key, 
c20_ctx->key, nonce); - OPENSSL_memset(derived_nonce, 0, 4); - OPENSSL_memcpy(&derived_nonce[4], &nonce[16], 8); - - return chacha20_poly1305_open_gather( - derived_key, out, derived_nonce, sizeof(derived_nonce), in, in_len, - in_tag, in_tag_len, ad, ad_len, ctx->tag_len); -} - -static const EVP_AEAD aead_chacha20_poly1305 = { - 32, // key len - 12, // nonce len - POLY1305_TAG_LEN, // overhead - POLY1305_TAG_LEN, // max tag length - 1, // seal_scatter_supports_extra_in - - aead_chacha20_poly1305_init, - NULL, // init_with_direction - aead_chacha20_poly1305_cleanup, - NULL /* open */, - aead_chacha20_poly1305_seal_scatter, - aead_chacha20_poly1305_open_gather, - NULL, // get_iv - NULL, // tag_len -}; - -static const EVP_AEAD aead_xchacha20_poly1305 = { - 32, // key len - 24, // nonce len - POLY1305_TAG_LEN, // overhead - POLY1305_TAG_LEN, // max tag length - 1, // seal_scatter_supports_extra_in - - aead_chacha20_poly1305_init, - NULL, // init_with_direction - aead_chacha20_poly1305_cleanup, - NULL /* open */, - aead_xchacha20_poly1305_seal_scatter, - aead_xchacha20_poly1305_open_gather, - NULL, // get_iv - NULL, // tag_len -}; - -const EVP_AEAD *EVP_aead_chacha20_poly1305(void) { - return &aead_chacha20_poly1305; -} - -const EVP_AEAD *EVP_aead_xchacha20_poly1305(void) { - return &aead_xchacha20_poly1305; -} diff --git a/third_party/boringssl/src/crypto/cipher_extra/e_des.c b/third_party/boringssl/src/crypto/cipher_extra/e_des.c deleted file mode 100644 index 300ec00e..00000000 --- a/third_party/boringssl/src/crypto/cipher_extra/e_des.c +++ /dev/null @@ -1,259 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. 
The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. 
If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
*/ - -#include -#include -#include - -#include "../fipsmodule/cipher/internal.h" -#include "internal.h" - - -typedef struct { - union { - double align; - DES_key_schedule ks; - } ks; -} EVP_DES_KEY; - -static int des_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key, - const uint8_t *iv, int enc) { - DES_cblock *deskey = (DES_cblock *)key; - EVP_DES_KEY *dat = (EVP_DES_KEY *)ctx->cipher_data; - - DES_set_key(deskey, &dat->ks.ks); - return 1; -} - -static int des_cbc_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, const uint8_t *in, - size_t in_len) { - EVP_DES_KEY *dat = (EVP_DES_KEY *)ctx->cipher_data; - - DES_ncbc_encrypt(in, out, in_len, &dat->ks.ks, (DES_cblock *)ctx->iv, - ctx->encrypt); - - return 1; -} - -static const EVP_CIPHER evp_des_cbc = { - /* nid = */ NID_des_cbc, - /* block_size = */ 8, - /* key_len = */ 8, - /* iv_len = */ 8, - /* ctx_size = */ sizeof(EVP_DES_KEY), - /* flags = */ EVP_CIPH_CBC_MODE, - /* app_data = */ NULL, - /* init = */ des_init_key, - /* cipher = */ des_cbc_cipher, - /* cleanup = */ NULL, - /* ctrl = */ NULL, -}; - -const EVP_CIPHER *EVP_des_cbc(void) { return &evp_des_cbc; } - -static int des_ecb_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, const uint8_t *in, - size_t in_len) { - if (in_len < ctx->cipher->block_size) { - return 1; - } - in_len -= ctx->cipher->block_size; - - EVP_DES_KEY *dat = (EVP_DES_KEY *)ctx->cipher_data; - for (size_t i = 0; i <= in_len; i += ctx->cipher->block_size) { - DES_ecb_encrypt((DES_cblock *)(in + i), (DES_cblock *)(out + i), - &dat->ks.ks, ctx->encrypt); - } - return 1; -} - -static const EVP_CIPHER evp_des_ecb = { - /* nid = */ NID_des_ecb, - /* block_size = */ 8, - /* key_len = */ 8, - /* iv_len = */ 0, - /* ctx_size = */ sizeof(EVP_DES_KEY), - /* flags = */ EVP_CIPH_ECB_MODE, - /* app_data = */ NULL, - /* init = */ des_init_key, - /* cipher = */ des_ecb_cipher, - /* cleanup = */ NULL, - /* ctrl = */ NULL, -}; - -const EVP_CIPHER *EVP_des_ecb(void) { return &evp_des_ecb; } - -typedef struct { - union { - 
double align; - DES_key_schedule ks[3]; - } ks; -} DES_EDE_KEY; - -static int des_ede3_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key, - const uint8_t *iv, int enc) { - DES_cblock *deskey = (DES_cblock *)key; - DES_EDE_KEY *dat = (DES_EDE_KEY *)ctx->cipher_data; - - DES_set_key(&deskey[0], &dat->ks.ks[0]); - DES_set_key(&deskey[1], &dat->ks.ks[1]); - DES_set_key(&deskey[2], &dat->ks.ks[2]); - - return 1; -} - -static int des_ede3_cbc_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, - const uint8_t *in, size_t in_len) { - DES_EDE_KEY *dat = (DES_EDE_KEY *)ctx->cipher_data; - - DES_ede3_cbc_encrypt(in, out, in_len, &dat->ks.ks[0], &dat->ks.ks[1], - &dat->ks.ks[2], (DES_cblock *)ctx->iv, ctx->encrypt); - - return 1; -} - -static const EVP_CIPHER evp_des_ede3_cbc = { - /* nid = */ NID_des_ede3_cbc, - /* block_size = */ 8, - /* key_len = */ 24, - /* iv_len = */ 8, - /* ctx_size = */ sizeof(DES_EDE_KEY), - /* flags = */ EVP_CIPH_CBC_MODE, - /* app_data = */ NULL, - /* init = */ des_ede3_init_key, - /* cipher = */ des_ede3_cbc_cipher, - /* cleanup = */ NULL, - /* ctrl = */ NULL, -}; - -const EVP_CIPHER *EVP_des_ede3_cbc(void) { return &evp_des_ede3_cbc; } - -static int des_ede_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key, - const uint8_t *iv, int enc) { - DES_cblock *deskey = (DES_cblock *)key; - DES_EDE_KEY *dat = (DES_EDE_KEY *)ctx->cipher_data; - - DES_set_key(&deskey[0], &dat->ks.ks[0]); - DES_set_key(&deskey[1], &dat->ks.ks[1]); - DES_set_key(&deskey[0], &dat->ks.ks[2]); - - return 1; -} - -static const EVP_CIPHER evp_des_ede_cbc = { - /* nid = */ NID_des_ede_cbc, - /* block_size = */ 8, - /* key_len = */ 16, - /* iv_len = */ 8, - /* ctx_size = */ sizeof(DES_EDE_KEY), - /* flags = */ EVP_CIPH_CBC_MODE, - /* app_data = */ NULL, - /* init = */ des_ede_init_key, - /* cipher = */ des_ede3_cbc_cipher, - /* cleanup = */ NULL, - /* ctrl = */ NULL, -}; - -const EVP_CIPHER *EVP_des_ede_cbc(void) { return &evp_des_ede_cbc; } - -static int des_ede_ecb_cipher(EVP_CIPHER_CTX *ctx, 
uint8_t *out, - const uint8_t *in, size_t in_len) { - if (in_len < ctx->cipher->block_size) { - return 1; - } - in_len -= ctx->cipher->block_size; - - DES_EDE_KEY *dat = (DES_EDE_KEY *) ctx->cipher_data; - for (size_t i = 0; i <= in_len; i += ctx->cipher->block_size) { - DES_ecb3_encrypt((DES_cblock *) (in + i), (DES_cblock *) (out + i), - &dat->ks.ks[0], &dat->ks.ks[1], &dat->ks.ks[2], - ctx->encrypt); - } - return 1; -} - -static const EVP_CIPHER evp_des_ede = { - /* nid = */ NID_des_ede_ecb, - /* block_size = */ 8, - /* key_len = */ 16, - /* iv_len = */ 0, - /* ctx_size = */ sizeof(DES_EDE_KEY), - /* flags = */ EVP_CIPH_ECB_MODE, - /* app_data = */ NULL, - /* init = */ des_ede_init_key, - /* cipher = */ des_ede_ecb_cipher, - /* cleanup = */ NULL, - /* ctrl = */ NULL, -}; - -const EVP_CIPHER *EVP_des_ede(void) { return &evp_des_ede; } - -static const EVP_CIPHER evp_des_ede3 = { - /* nid = */ NID_des_ede3_ecb, - /* block_size = */ 8, - /* key_len = */ 24, - /* iv_len = */ 0, - /* ctx_size = */ sizeof(DES_EDE_KEY), - /* flags = */ EVP_CIPH_ECB_MODE, - /* app_data = */ NULL, - /* init = */ des_ede3_init_key, - /* cipher = */ des_ede_ecb_cipher, - /* cleanup = */ NULL, - /* ctrl = */ NULL, -}; - -const EVP_CIPHER *EVP_des_ede3(void) { return &evp_des_ede3; } - -const EVP_CIPHER *EVP_des_ede3_ecb(void) { return EVP_des_ede3(); } diff --git a/third_party/boringssl/src/crypto/cipher_extra/e_null.c b/third_party/boringssl/src/crypto/cipher_extra/e_null.c deleted file mode 100644 index e9ee8adc..00000000 --- a/third_party/boringssl/src/crypto/cipher_extra/e_null.c +++ /dev/null @@ -1,86 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. 
The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. 
If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
*/ - -#include - -#include - -#include - -#include "../fipsmodule/cipher/internal.h" -#include "../internal.h" - - -static int null_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key, - const uint8_t *iv, int enc) { - return 1; -} - -static int null_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, - const uint8_t *in, size_t in_len) { - if (in != out) { - OPENSSL_memcpy(out, in, in_len); - } - return 1; -} - -static const EVP_CIPHER n_cipher = { - NID_undef, 1 /* block size */, 0 /* key_len */, 0 /* iv_len */, - 0 /* ctx_size */, 0 /* flags */, NULL /* app_data */, null_init_key, - null_cipher, NULL /* cleanup */, NULL /* ctrl */, -}; - -const EVP_CIPHER *EVP_enc_null(void) { return &n_cipher; } diff --git a/third_party/boringssl/src/crypto/cipher_extra/e_rc2.c b/third_party/boringssl/src/crypto/cipher_extra/e_rc2.c deleted file mode 100644 index ffc5e6b1..00000000 --- a/third_party/boringssl/src/crypto/cipher_extra/e_rc2.c +++ /dev/null @@ -1,463 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. 
- * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include -#include - -#include "../fipsmodule/cipher/internal.h" -#include "../internal.h" - - -#define c2l(c, l) \ - do { \ - (l) = ((uint32_t)(*((c)++))); \ - (l) |= ((uint32_t)(*((c)++))) << 8L; \ - (l) |= ((uint32_t)(*((c)++))) << 16L; \ - (l) |= ((uint32_t)(*((c)++))) << 24L; \ - } while (0) - -#define c2ln(c, l1, l2, n) \ - do { \ - (c) += (n); \ - (l1) = (l2) = 0; \ - switch (n) { \ - case 8: \ - (l2) = ((uint32_t)(*(--(c)))) << 24L; \ - OPENSSL_FALLTHROUGH; \ - case 7: \ - (l2) |= ((uint32_t)(*(--(c)))) << 16L; \ - OPENSSL_FALLTHROUGH; \ - case 6: \ - (l2) |= ((uint32_t)(*(--(c)))) << 8L; \ - OPENSSL_FALLTHROUGH; \ - case 5: \ - (l2) |= ((uint32_t)(*(--(c)))); \ - OPENSSL_FALLTHROUGH; \ - case 4: \ - (l1) = ((uint32_t)(*(--(c)))) << 24L; \ - OPENSSL_FALLTHROUGH; \ - case 3: \ - (l1) |= ((uint32_t)(*(--(c)))) << 16L; \ - OPENSSL_FALLTHROUGH; \ - case 2: \ - (l1) |= ((uint32_t)(*(--(c)))) << 8L; \ - OPENSSL_FALLTHROUGH; \ - case 1: \ - (l1) |= ((uint32_t)(*(--(c)))); \ - } \ - } while (0) - -#define l2c(l, c) \ - do { \ - *((c)++) = (uint8_t)(((l)) & 0xff); \ - *((c)++) = (uint8_t)(((l) >> 8L) & 0xff); \ - *((c)++) = (uint8_t)(((l) >> 16L) & 0xff); \ - *((c)++) = (uint8_t)(((l) >> 24L) & 0xff); \ - } while (0) - -#define l2cn(l1, l2, c, n) \ - do { \ - (c) += (n); \ - switch (n) { \ - case 8: \ - *(--(c)) = (uint8_t)(((l2) >> 24L) & 0xff); \ - OPENSSL_FALLTHROUGH; \ - case 7: \ - *(--(c)) = (uint8_t)(((l2) >> 16L) & 0xff); \ - OPENSSL_FALLTHROUGH; \ - case 6: \ - *(--(c)) = (uint8_t)(((l2) >> 8L) & 0xff); \ - OPENSSL_FALLTHROUGH; \ - case 5: \ - *(--(c)) = (uint8_t)(((l2)) & 0xff); \ - OPENSSL_FALLTHROUGH; \ - case 4: \ - *(--(c)) = (uint8_t)(((l1) >> 24L) & 0xff); \ - OPENSSL_FALLTHROUGH; \ - case 3: 
\ - *(--(c)) = (uint8_t)(((l1) >> 16L) & 0xff); \ - OPENSSL_FALLTHROUGH; \ - case 2: \ - *(--(c)) = (uint8_t)(((l1) >> 8L) & 0xff); \ - OPENSSL_FALLTHROUGH; \ - case 1: \ - *(--(c)) = (uint8_t)(((l1)) & 0xff); \ - } \ - } while (0) - -typedef struct rc2_key_st { uint16_t data[64]; } RC2_KEY; - -static void RC2_encrypt(uint32_t *d, RC2_KEY *key) { - int i, n; - uint16_t *p0, *p1; - uint16_t x0, x1, x2, x3, t; - uint32_t l; - - l = d[0]; - x0 = (uint16_t)l & 0xffff; - x1 = (uint16_t)(l >> 16L); - l = d[1]; - x2 = (uint16_t)l & 0xffff; - x3 = (uint16_t)(l >> 16L); - - n = 3; - i = 5; - - p0 = p1 = &key->data[0]; - for (;;) { - t = (x0 + (x1 & ~x3) + (x2 & x3) + *(p0++)) & 0xffff; - x0 = (t << 1) | (t >> 15); - t = (x1 + (x2 & ~x0) + (x3 & x0) + *(p0++)) & 0xffff; - x1 = (t << 2) | (t >> 14); - t = (x2 + (x3 & ~x1) + (x0 & x1) + *(p0++)) & 0xffff; - x2 = (t << 3) | (t >> 13); - t = (x3 + (x0 & ~x2) + (x1 & x2) + *(p0++)) & 0xffff; - x3 = (t << 5) | (t >> 11); - - if (--i == 0) { - if (--n == 0) { - break; - } - i = (n == 2) ? 
6 : 5; - - x0 += p1[x3 & 0x3f]; - x1 += p1[x0 & 0x3f]; - x2 += p1[x1 & 0x3f]; - x3 += p1[x2 & 0x3f]; - } - } - - d[0] = (uint32_t)(x0 & 0xffff) | ((uint32_t)(x1 & 0xffff) << 16L); - d[1] = (uint32_t)(x2 & 0xffff) | ((uint32_t)(x3 & 0xffff) << 16L); -} - -static void RC2_decrypt(uint32_t *d, RC2_KEY *key) { - int i, n; - uint16_t *p0, *p1; - uint16_t x0, x1, x2, x3, t; - uint32_t l; - - l = d[0]; - x0 = (uint16_t)l & 0xffff; - x1 = (uint16_t)(l >> 16L); - l = d[1]; - x2 = (uint16_t)l & 0xffff; - x3 = (uint16_t)(l >> 16L); - - n = 3; - i = 5; - - p0 = &key->data[63]; - p1 = &key->data[0]; - for (;;) { - t = ((x3 << 11) | (x3 >> 5)) & 0xffff; - x3 = (t - (x0 & ~x2) - (x1 & x2) - *(p0--)) & 0xffff; - t = ((x2 << 13) | (x2 >> 3)) & 0xffff; - x2 = (t - (x3 & ~x1) - (x0 & x1) - *(p0--)) & 0xffff; - t = ((x1 << 14) | (x1 >> 2)) & 0xffff; - x1 = (t - (x2 & ~x0) - (x3 & x0) - *(p0--)) & 0xffff; - t = ((x0 << 15) | (x0 >> 1)) & 0xffff; - x0 = (t - (x1 & ~x3) - (x2 & x3) - *(p0--)) & 0xffff; - - if (--i == 0) { - if (--n == 0) { - break; - } - i = (n == 2) ? 
6 : 5; - - x3 = (x3 - p1[x2 & 0x3f]) & 0xffff; - x2 = (x2 - p1[x1 & 0x3f]) & 0xffff; - x1 = (x1 - p1[x0 & 0x3f]) & 0xffff; - x0 = (x0 - p1[x3 & 0x3f]) & 0xffff; - } - } - - d[0] = (uint32_t)(x0 & 0xffff) | ((uint32_t)(x1 & 0xffff) << 16L); - d[1] = (uint32_t)(x2 & 0xffff) | ((uint32_t)(x3 & 0xffff) << 16L); -} - -static void RC2_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t length, - RC2_KEY *ks, uint8_t *iv, int encrypt) { - uint32_t tin0, tin1; - uint32_t tout0, tout1, xor0, xor1; - long l = length; - uint32_t tin[2]; - - if (encrypt) { - c2l(iv, tout0); - c2l(iv, tout1); - iv -= 8; - for (l -= 8; l >= 0; l -= 8) { - c2l(in, tin0); - c2l(in, tin1); - tin0 ^= tout0; - tin1 ^= tout1; - tin[0] = tin0; - tin[1] = tin1; - RC2_encrypt(tin, ks); - tout0 = tin[0]; - l2c(tout0, out); - tout1 = tin[1]; - l2c(tout1, out); - } - if (l != -8) { - c2ln(in, tin0, tin1, l + 8); - tin0 ^= tout0; - tin1 ^= tout1; - tin[0] = tin0; - tin[1] = tin1; - RC2_encrypt(tin, ks); - tout0 = tin[0]; - l2c(tout0, out); - tout1 = tin[1]; - l2c(tout1, out); - } - l2c(tout0, iv); - l2c(tout1, iv); - } else { - c2l(iv, xor0); - c2l(iv, xor1); - iv -= 8; - for (l -= 8; l >= 0; l -= 8) { - c2l(in, tin0); - tin[0] = tin0; - c2l(in, tin1); - tin[1] = tin1; - RC2_decrypt(tin, ks); - tout0 = tin[0] ^ xor0; - tout1 = tin[1] ^ xor1; - l2c(tout0, out); - l2c(tout1, out); - xor0 = tin0; - xor1 = tin1; - } - if (l != -8) { - c2l(in, tin0); - tin[0] = tin0; - c2l(in, tin1); - tin[1] = tin1; - RC2_decrypt(tin, ks); - tout0 = tin[0] ^ xor0; - tout1 = tin[1] ^ xor1; - l2cn(tout0, tout1, out, l + 8); - xor0 = tin0; - xor1 = tin1; - } - l2c(xor0, iv); - l2c(xor1, iv); - } - tin[0] = tin[1] = 0; -} - -static const uint8_t key_table[256] = { - 0xd9, 0x78, 0xf9, 0xc4, 0x19, 0xdd, 0xb5, 0xed, 0x28, 0xe9, 0xfd, 0x79, - 0x4a, 0xa0, 0xd8, 0x9d, 0xc6, 0x7e, 0x37, 0x83, 0x2b, 0x76, 0x53, 0x8e, - 0x62, 0x4c, 0x64, 0x88, 0x44, 0x8b, 0xfb, 0xa2, 0x17, 0x9a, 0x59, 0xf5, - 0x87, 0xb3, 0x4f, 0x13, 0x61, 0x45, 0x6d, 0x8d, 
0x09, 0x81, 0x7d, 0x32, - 0xbd, 0x8f, 0x40, 0xeb, 0x86, 0xb7, 0x7b, 0x0b, 0xf0, 0x95, 0x21, 0x22, - 0x5c, 0x6b, 0x4e, 0x82, 0x54, 0xd6, 0x65, 0x93, 0xce, 0x60, 0xb2, 0x1c, - 0x73, 0x56, 0xc0, 0x14, 0xa7, 0x8c, 0xf1, 0xdc, 0x12, 0x75, 0xca, 0x1f, - 0x3b, 0xbe, 0xe4, 0xd1, 0x42, 0x3d, 0xd4, 0x30, 0xa3, 0x3c, 0xb6, 0x26, - 0x6f, 0xbf, 0x0e, 0xda, 0x46, 0x69, 0x07, 0x57, 0x27, 0xf2, 0x1d, 0x9b, - 0xbc, 0x94, 0x43, 0x03, 0xf8, 0x11, 0xc7, 0xf6, 0x90, 0xef, 0x3e, 0xe7, - 0x06, 0xc3, 0xd5, 0x2f, 0xc8, 0x66, 0x1e, 0xd7, 0x08, 0xe8, 0xea, 0xde, - 0x80, 0x52, 0xee, 0xf7, 0x84, 0xaa, 0x72, 0xac, 0x35, 0x4d, 0x6a, 0x2a, - 0x96, 0x1a, 0xd2, 0x71, 0x5a, 0x15, 0x49, 0x74, 0x4b, 0x9f, 0xd0, 0x5e, - 0x04, 0x18, 0xa4, 0xec, 0xc2, 0xe0, 0x41, 0x6e, 0x0f, 0x51, 0xcb, 0xcc, - 0x24, 0x91, 0xaf, 0x50, 0xa1, 0xf4, 0x70, 0x39, 0x99, 0x7c, 0x3a, 0x85, - 0x23, 0xb8, 0xb4, 0x7a, 0xfc, 0x02, 0x36, 0x5b, 0x25, 0x55, 0x97, 0x31, - 0x2d, 0x5d, 0xfa, 0x98, 0xe3, 0x8a, 0x92, 0xae, 0x05, 0xdf, 0x29, 0x10, - 0x67, 0x6c, 0xba, 0xc9, 0xd3, 0x00, 0xe6, 0xcf, 0xe1, 0x9e, 0xa8, 0x2c, - 0x63, 0x16, 0x01, 0x3f, 0x58, 0xe2, 0x89, 0xa9, 0x0d, 0x38, 0x34, 0x1b, - 0xab, 0x33, 0xff, 0xb0, 0xbb, 0x48, 0x0c, 0x5f, 0xb9, 0xb1, 0xcd, 0x2e, - 0xc5, 0xf3, 0xdb, 0x47, 0xe5, 0xa5, 0x9c, 0x77, 0x0a, 0xa6, 0x20, 0x68, - 0xfe, 0x7f, 0xc1, 0xad, -}; - -static void RC2_set_key(RC2_KEY *key, int len, const uint8_t *data, int bits) { - int i, j; - uint8_t *k; - uint16_t *ki; - unsigned int c, d; - - k = (uint8_t *)&key->data[0]; - *k = 0; // for if there is a zero length key - - if (len > 128) { - len = 128; - } - if (bits <= 0) { - bits = 1024; - } - if (bits > 1024) { - bits = 1024; - } - - for (i = 0; i < len; i++) { - k[i] = data[i]; - } - - // expand table - d = k[len - 1]; - j = 0; - for (i = len; i < 128; i++, j++) { - d = key_table[(k[j] + d) & 0xff]; - k[i] = d; - } - - // hmm.... 
key reduction to 'bits' bits - - j = (bits + 7) >> 3; - i = 128 - j; - c = (0xff >> (-bits & 0x07)); - - d = key_table[k[i] & c]; - k[i] = d; - while (i--) { - d = key_table[k[i + j] ^ d]; - k[i] = d; - } - - // copy from bytes into uint16_t's - ki = &(key->data[63]); - for (i = 127; i >= 0; i -= 2) { - *(ki--) = ((k[i] << 8) | k[i - 1]) & 0xffff; - } -} - -typedef struct { - int key_bits; // effective key bits - RC2_KEY ks; // key schedule -} EVP_RC2_KEY; - -static int rc2_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key, - const uint8_t *iv, int enc) { - EVP_RC2_KEY *rc2_key = (EVP_RC2_KEY *)ctx->cipher_data; - RC2_set_key(&rc2_key->ks, EVP_CIPHER_CTX_key_length(ctx), key, - rc2_key->key_bits); - return 1; -} - -static int rc2_cbc_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, const uint8_t *in, - size_t inl) { - EVP_RC2_KEY *key = (EVP_RC2_KEY *)ctx->cipher_data; - static const size_t kChunkSize = 0x10000; - - while (inl >= kChunkSize) { - RC2_cbc_encrypt(in, out, kChunkSize, &key->ks, ctx->iv, ctx->encrypt); - inl -= kChunkSize; - in += kChunkSize; - out += kChunkSize; - } - if (inl) { - RC2_cbc_encrypt(in, out, inl, &key->ks, ctx->iv, ctx->encrypt); - } - return 1; -} - -static int rc2_ctrl(EVP_CIPHER_CTX *ctx, int type, int arg, void *ptr) { - EVP_RC2_KEY *key = (EVP_RC2_KEY *)ctx->cipher_data; - - switch (type) { - case EVP_CTRL_INIT: - key->key_bits = EVP_CIPHER_CTX_key_length(ctx) * 8; - return 1; - case EVP_CTRL_SET_RC2_KEY_BITS: - // Should be overridden by later call to |EVP_CTRL_INIT|, but - // people call it, so it may as well work. 
- key->key_bits = arg; - return 1; - - default: - return -1; - } -} - -static const EVP_CIPHER rc2_40_cbc = { - NID_rc2_40_cbc, - 8 /* block size */, - 5 /* 40 bit */, - 8 /* iv len */, - sizeof(EVP_RC2_KEY), - EVP_CIPH_CBC_MODE | EVP_CIPH_VARIABLE_LENGTH | EVP_CIPH_CTRL_INIT, - NULL /* app_data */, - rc2_init_key, - rc2_cbc_cipher, - NULL, - rc2_ctrl, -}; - -const EVP_CIPHER *EVP_rc2_40_cbc(void) { - return &rc2_40_cbc; -} - -static const EVP_CIPHER rc2_cbc = { - NID_rc2_cbc, - 8 /* block size */, - 16 /* 128 bit */, - 8 /* iv len */, - sizeof(EVP_RC2_KEY), - EVP_CIPH_CBC_MODE | EVP_CIPH_VARIABLE_LENGTH | EVP_CIPH_CTRL_INIT, - NULL /* app_data */, - rc2_init_key, - rc2_cbc_cipher, - NULL, - rc2_ctrl, -}; - -const EVP_CIPHER *EVP_rc2_cbc(void) { - return &rc2_cbc; -} diff --git a/third_party/boringssl/src/crypto/cipher_extra/e_rc4.c b/third_party/boringssl/src/crypto/cipher_extra/e_rc4.c deleted file mode 100644 index 2f4f9bba..00000000 --- a/third_party/boringssl/src/crypto/cipher_extra/e_rc4.c +++ /dev/null @@ -1,89 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. 
- * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include -#include - -#include -#include -#include - -#include "../fipsmodule/cipher/internal.h" - - -static int rc4_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key, - const uint8_t *iv, int enc) { - RC4_KEY *rc4key = (RC4_KEY *)ctx->cipher_data; - - RC4_set_key(rc4key, EVP_CIPHER_CTX_key_length(ctx), key); - return 1; -} - -static int rc4_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, const uint8_t *in, - size_t in_len) { - RC4_KEY *rc4key = (RC4_KEY *)ctx->cipher_data; - - RC4(rc4key, in_len, in, out); - return 1; -} - -static const EVP_CIPHER rc4 = { - NID_rc4, 1 /* block_size */, 16 /* key_size */, - 0 /* iv_len */, sizeof(RC4_KEY), EVP_CIPH_VARIABLE_LENGTH, - NULL /* app_data */, rc4_init_key, rc4_cipher, - NULL /* cleanup */, NULL /* ctrl */, }; - -const EVP_CIPHER *EVP_rc4(void) { return &rc4; } diff --git a/third_party/boringssl/src/crypto/cipher_extra/e_tls.c b/third_party/boringssl/src/crypto/cipher_extra/e_tls.c deleted file mode 100644 index cfaf95dc..00000000 --- a/third_party/boringssl/src/crypto/cipher_extra/e_tls.c +++ /dev/null @@ -1,595 +0,0 @@ -/* Copyright (c) 2014, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include "../fipsmodule/cipher/internal.h" -#include "../internal.h" -#include "internal.h" - - -typedef struct { - EVP_CIPHER_CTX cipher_ctx; - HMAC_CTX hmac_ctx; - // mac_key is the portion of the key used for the MAC. It is retained - // separately for the constant-time CBC code. - uint8_t mac_key[EVP_MAX_MD_SIZE]; - uint8_t mac_key_len; - // implicit_iv is one iff this is a pre-TLS-1.1 CBC cipher without an explicit - // IV. - char implicit_iv; -} AEAD_TLS_CTX; - -static_assert(EVP_MAX_MD_SIZE < 256, "mac_key_len does not fit in uint8_t"); - -static_assert(sizeof(((EVP_AEAD_CTX *)NULL)->state) >= sizeof(AEAD_TLS_CTX), - "AEAD state is too small"); -static_assert(alignof(union evp_aead_ctx_st_state) >= alignof(AEAD_TLS_CTX), - "AEAD state has insufficient alignment"); - -static void aead_tls_cleanup(EVP_AEAD_CTX *ctx) { - AEAD_TLS_CTX *tls_ctx = (AEAD_TLS_CTX *)&ctx->state; - EVP_CIPHER_CTX_cleanup(&tls_ctx->cipher_ctx); - HMAC_CTX_cleanup(&tls_ctx->hmac_ctx); -} - -static int aead_tls_init(EVP_AEAD_CTX *ctx, const uint8_t *key, size_t key_len, - size_t tag_len, enum evp_aead_direction_t dir, - const EVP_CIPHER *cipher, const EVP_MD *md, - char implicit_iv) { - if (tag_len != EVP_AEAD_DEFAULT_TAG_LENGTH && - tag_len != EVP_MD_size(md)) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_TAG_SIZE); - return 0; - } - - if (key_len != EVP_AEAD_key_length(ctx->aead)) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_KEY_LENGTH); - return 0; - } - - size_t mac_key_len = EVP_MD_size(md); - size_t enc_key_len = 
EVP_CIPHER_key_length(cipher); - assert(mac_key_len + enc_key_len + - (implicit_iv ? EVP_CIPHER_iv_length(cipher) : 0) == key_len); - - AEAD_TLS_CTX *tls_ctx = (AEAD_TLS_CTX *)&ctx->state; - EVP_CIPHER_CTX_init(&tls_ctx->cipher_ctx); - HMAC_CTX_init(&tls_ctx->hmac_ctx); - assert(mac_key_len <= EVP_MAX_MD_SIZE); - OPENSSL_memcpy(tls_ctx->mac_key, key, mac_key_len); - tls_ctx->mac_key_len = (uint8_t)mac_key_len; - tls_ctx->implicit_iv = implicit_iv; - - if (!EVP_CipherInit_ex(&tls_ctx->cipher_ctx, cipher, NULL, &key[mac_key_len], - implicit_iv ? &key[mac_key_len + enc_key_len] : NULL, - dir == evp_aead_seal) || - !HMAC_Init_ex(&tls_ctx->hmac_ctx, key, mac_key_len, md, NULL)) { - aead_tls_cleanup(ctx); - return 0; - } - EVP_CIPHER_CTX_set_padding(&tls_ctx->cipher_ctx, 0); - - return 1; -} - -static size_t aead_tls_tag_len(const EVP_AEAD_CTX *ctx, const size_t in_len, - const size_t extra_in_len) { - assert(extra_in_len == 0); - const AEAD_TLS_CTX *tls_ctx = (AEAD_TLS_CTX *)&ctx->state; - - const size_t hmac_len = HMAC_size(&tls_ctx->hmac_ctx); - if (EVP_CIPHER_CTX_mode(&tls_ctx->cipher_ctx) != EVP_CIPH_CBC_MODE) { - // The NULL cipher. - return hmac_len; - } - - const size_t block_size = EVP_CIPHER_CTX_block_size(&tls_ctx->cipher_ctx); - // An overflow of |in_len + hmac_len| doesn't affect the result mod - // |block_size|, provided that |block_size| is a smaller power of two. 
- assert(block_size != 0 && (block_size & (block_size - 1)) == 0); - const size_t pad_len = block_size - (in_len + hmac_len) % block_size; - return hmac_len + pad_len; -} - -static int aead_tls_seal_scatter(const EVP_AEAD_CTX *ctx, uint8_t *out, - uint8_t *out_tag, size_t *out_tag_len, - const size_t max_out_tag_len, - const uint8_t *nonce, const size_t nonce_len, - const uint8_t *in, const size_t in_len, - const uint8_t *extra_in, - const size_t extra_in_len, const uint8_t *ad, - const size_t ad_len) { - AEAD_TLS_CTX *tls_ctx = (AEAD_TLS_CTX *)&ctx->state; - - if (!tls_ctx->cipher_ctx.encrypt) { - // Unlike a normal AEAD, a TLS AEAD may only be used in one direction. - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_OPERATION); - return 0; - } - - if (in_len > INT_MAX) { - // EVP_CIPHER takes int as input. - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TOO_LARGE); - return 0; - } - - if (max_out_tag_len < aead_tls_tag_len(ctx, in_len, extra_in_len)) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BUFFER_TOO_SMALL); - return 0; - } - - if (nonce_len != EVP_AEAD_nonce_length(ctx->aead)) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_NONCE_SIZE); - return 0; - } - - if (ad_len != 13 - 2 /* length bytes */) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_AD_SIZE); - return 0; - } - - // To allow for CBC mode which changes cipher length, |ad| doesn't include the - // length for legacy ciphers. - uint8_t ad_extra[2]; - ad_extra[0] = (uint8_t)(in_len >> 8); - ad_extra[1] = (uint8_t)(in_len & 0xff); - - // Compute the MAC. This must be first in case the operation is being done - // in-place. - uint8_t mac[EVP_MAX_MD_SIZE]; - unsigned mac_len; - if (!HMAC_Init_ex(&tls_ctx->hmac_ctx, NULL, 0, NULL, NULL) || - !HMAC_Update(&tls_ctx->hmac_ctx, ad, ad_len) || - !HMAC_Update(&tls_ctx->hmac_ctx, ad_extra, sizeof(ad_extra)) || - !HMAC_Update(&tls_ctx->hmac_ctx, in, in_len) || - !HMAC_Final(&tls_ctx->hmac_ctx, mac, &mac_len)) { - return 0; - } - - // Configure the explicit IV. 
- if (EVP_CIPHER_CTX_mode(&tls_ctx->cipher_ctx) == EVP_CIPH_CBC_MODE && - !tls_ctx->implicit_iv && - !EVP_EncryptInit_ex(&tls_ctx->cipher_ctx, NULL, NULL, NULL, nonce)) { - return 0; - } - - // Encrypt the input. - int len; - if (!EVP_EncryptUpdate(&tls_ctx->cipher_ctx, out, &len, in, (int)in_len)) { - return 0; - } - - unsigned block_size = EVP_CIPHER_CTX_block_size(&tls_ctx->cipher_ctx); - - // Feed the MAC into the cipher in two steps. First complete the final partial - // block from encrypting the input and split the result between |out| and - // |out_tag|. Then feed the rest. - - const size_t early_mac_len = (block_size - (in_len % block_size)) % block_size; - if (early_mac_len != 0) { - assert(len + block_size - early_mac_len == in_len); - uint8_t buf[EVP_MAX_BLOCK_LENGTH]; - int buf_len; - if (!EVP_EncryptUpdate(&tls_ctx->cipher_ctx, buf, &buf_len, mac, - (int)early_mac_len)) { - return 0; - } - assert(buf_len == (int)block_size); - OPENSSL_memcpy(out + len, buf, block_size - early_mac_len); - OPENSSL_memcpy(out_tag, buf + block_size - early_mac_len, early_mac_len); - } - size_t tag_len = early_mac_len; - - if (!EVP_EncryptUpdate(&tls_ctx->cipher_ctx, out_tag + tag_len, &len, - mac + tag_len, mac_len - tag_len)) { - return 0; - } - tag_len += len; - - if (block_size > 1) { - assert(block_size <= 256); - assert(EVP_CIPHER_CTX_mode(&tls_ctx->cipher_ctx) == EVP_CIPH_CBC_MODE); - - // Compute padding and feed that into the cipher. - uint8_t padding[256]; - unsigned padding_len = block_size - ((in_len + mac_len) % block_size); - OPENSSL_memset(padding, padding_len - 1, padding_len); - if (!EVP_EncryptUpdate(&tls_ctx->cipher_ctx, out_tag + tag_len, &len, - padding, (int)padding_len)) { - return 0; - } - tag_len += len; - } - - if (!EVP_EncryptFinal_ex(&tls_ctx->cipher_ctx, out_tag + tag_len, &len)) { - return 0; - } - assert(len == 0); // Padding is explicit. 
- assert(tag_len == aead_tls_tag_len(ctx, in_len, extra_in_len)); - - *out_tag_len = tag_len; - return 1; -} - -static int aead_tls_open(const EVP_AEAD_CTX *ctx, uint8_t *out, size_t *out_len, - size_t max_out_len, const uint8_t *nonce, - size_t nonce_len, const uint8_t *in, size_t in_len, - const uint8_t *ad, size_t ad_len) { - AEAD_TLS_CTX *tls_ctx = (AEAD_TLS_CTX *)&ctx->state; - - if (tls_ctx->cipher_ctx.encrypt) { - // Unlike a normal AEAD, a TLS AEAD may only be used in one direction. - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_OPERATION); - return 0; - } - - if (in_len < HMAC_size(&tls_ctx->hmac_ctx)) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); - return 0; - } - - if (max_out_len < in_len) { - // This requires that the caller provide space for the MAC, even though it - // will always be removed on return. - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BUFFER_TOO_SMALL); - return 0; - } - - if (nonce_len != EVP_AEAD_nonce_length(ctx->aead)) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_NONCE_SIZE); - return 0; - } - - if (ad_len != 13 - 2 /* length bytes */) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_AD_SIZE); - return 0; - } - - if (in_len > INT_MAX) { - // EVP_CIPHER takes int as input. - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TOO_LARGE); - return 0; - } - - // Configure the explicit IV. - if (EVP_CIPHER_CTX_mode(&tls_ctx->cipher_ctx) == EVP_CIPH_CBC_MODE && - !tls_ctx->implicit_iv && - !EVP_DecryptInit_ex(&tls_ctx->cipher_ctx, NULL, NULL, NULL, nonce)) { - return 0; - } - - // Decrypt to get the plaintext + MAC + padding. - size_t total = 0; - int len; - if (!EVP_DecryptUpdate(&tls_ctx->cipher_ctx, out, &len, in, (int)in_len)) { - return 0; - } - total += len; - if (!EVP_DecryptFinal_ex(&tls_ctx->cipher_ctx, out + total, &len)) { - return 0; - } - total += len; - assert(total == in_len); - - CONSTTIME_SECRET(out, total); - - // Remove CBC padding. 
Code from here on is timing-sensitive with respect to - // |padding_ok| and |data_plus_mac_len| for CBC ciphers. - size_t data_plus_mac_len; - crypto_word_t padding_ok; - if (EVP_CIPHER_CTX_mode(&tls_ctx->cipher_ctx) == EVP_CIPH_CBC_MODE) { - if (!EVP_tls_cbc_remove_padding( - &padding_ok, &data_plus_mac_len, out, total, - EVP_CIPHER_CTX_block_size(&tls_ctx->cipher_ctx), - HMAC_size(&tls_ctx->hmac_ctx))) { - // Publicly invalid. This can be rejected in non-constant time. - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); - return 0; - } - } else { - padding_ok = CONSTTIME_TRUE_W; - data_plus_mac_len = total; - // |data_plus_mac_len| = |total| = |in_len| at this point. |in_len| has - // already been checked against the MAC size at the top of the function. - assert(data_plus_mac_len >= HMAC_size(&tls_ctx->hmac_ctx)); - } - size_t data_len = data_plus_mac_len - HMAC_size(&tls_ctx->hmac_ctx); - - // At this point, if the padding is valid, the first |data_plus_mac_len| bytes - // after |out| are the plaintext and MAC. Otherwise, |data_plus_mac_len| is - // still large enough to extract a MAC, but it will be irrelevant. - - // To allow for CBC mode which changes cipher length, |ad| doesn't include the - // length for legacy ciphers. - uint8_t ad_fixed[13]; - OPENSSL_memcpy(ad_fixed, ad, 11); - ad_fixed[11] = (uint8_t)(data_len >> 8); - ad_fixed[12] = (uint8_t)(data_len & 0xff); - ad_len += 2; - - // Compute the MAC and extract the one in the record. 
- uint8_t mac[EVP_MAX_MD_SIZE]; - size_t mac_len; - uint8_t record_mac_tmp[EVP_MAX_MD_SIZE]; - uint8_t *record_mac; - if (EVP_CIPHER_CTX_mode(&tls_ctx->cipher_ctx) == EVP_CIPH_CBC_MODE && - EVP_tls_cbc_record_digest_supported(tls_ctx->hmac_ctx.md)) { - if (!EVP_tls_cbc_digest_record(tls_ctx->hmac_ctx.md, mac, &mac_len, - ad_fixed, out, data_len, total, - tls_ctx->mac_key, tls_ctx->mac_key_len)) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); - return 0; - } - assert(mac_len == HMAC_size(&tls_ctx->hmac_ctx)); - - record_mac = record_mac_tmp; - EVP_tls_cbc_copy_mac(record_mac, mac_len, out, data_plus_mac_len, total); - } else { - // We should support the constant-time path for all CBC-mode ciphers - // implemented. - assert(EVP_CIPHER_CTX_mode(&tls_ctx->cipher_ctx) != EVP_CIPH_CBC_MODE); - - unsigned mac_len_u; - if (!HMAC_Init_ex(&tls_ctx->hmac_ctx, NULL, 0, NULL, NULL) || - !HMAC_Update(&tls_ctx->hmac_ctx, ad_fixed, ad_len) || - !HMAC_Update(&tls_ctx->hmac_ctx, out, data_len) || - !HMAC_Final(&tls_ctx->hmac_ctx, mac, &mac_len_u)) { - return 0; - } - mac_len = mac_len_u; - - assert(mac_len == HMAC_size(&tls_ctx->hmac_ctx)); - record_mac = &out[data_len]; - } - - // Perform the MAC check and the padding check in constant-time. It should be - // safe to simply perform the padding check first, but it would not be under a - // different choice of MAC location on padding failure. See - // EVP_tls_cbc_remove_padding. - crypto_word_t good = - constant_time_eq_int(CRYPTO_memcmp(record_mac, mac, mac_len), 0); - good &= padding_ok; - CONSTTIME_DECLASSIFY(&good, sizeof(good)); - if (!good) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); - return 0; - } - - CONSTTIME_DECLASSIFY(&data_len, sizeof(data_len)); - CONSTTIME_DECLASSIFY(out, data_len); - - // End of timing-sensitive code. 
- - *out_len = data_len; - return 1; -} - -static int aead_aes_128_cbc_sha1_tls_init(EVP_AEAD_CTX *ctx, const uint8_t *key, - size_t key_len, size_t tag_len, - enum evp_aead_direction_t dir) { - return aead_tls_init(ctx, key, key_len, tag_len, dir, EVP_aes_128_cbc(), - EVP_sha1(), 0); -} - -static int aead_aes_128_cbc_sha1_tls_implicit_iv_init( - EVP_AEAD_CTX *ctx, const uint8_t *key, size_t key_len, size_t tag_len, - enum evp_aead_direction_t dir) { - return aead_tls_init(ctx, key, key_len, tag_len, dir, EVP_aes_128_cbc(), - EVP_sha1(), 1); -} - -static int aead_aes_256_cbc_sha1_tls_init(EVP_AEAD_CTX *ctx, const uint8_t *key, - size_t key_len, size_t tag_len, - enum evp_aead_direction_t dir) { - return aead_tls_init(ctx, key, key_len, tag_len, dir, EVP_aes_256_cbc(), - EVP_sha1(), 0); -} - -static int aead_aes_256_cbc_sha1_tls_implicit_iv_init( - EVP_AEAD_CTX *ctx, const uint8_t *key, size_t key_len, size_t tag_len, - enum evp_aead_direction_t dir) { - return aead_tls_init(ctx, key, key_len, tag_len, dir, EVP_aes_256_cbc(), - EVP_sha1(), 1); -} - -static int aead_des_ede3_cbc_sha1_tls_init(EVP_AEAD_CTX *ctx, - const uint8_t *key, size_t key_len, - size_t tag_len, - enum evp_aead_direction_t dir) { - return aead_tls_init(ctx, key, key_len, tag_len, dir, EVP_des_ede3_cbc(), - EVP_sha1(), 0); -} - -static int aead_des_ede3_cbc_sha1_tls_implicit_iv_init( - EVP_AEAD_CTX *ctx, const uint8_t *key, size_t key_len, size_t tag_len, - enum evp_aead_direction_t dir) { - return aead_tls_init(ctx, key, key_len, tag_len, dir, EVP_des_ede3_cbc(), - EVP_sha1(), 1); -} - -static int aead_tls_get_iv(const EVP_AEAD_CTX *ctx, const uint8_t **out_iv, - size_t *out_iv_len) { - const AEAD_TLS_CTX *tls_ctx = (AEAD_TLS_CTX *)&ctx->state; - const size_t iv_len = EVP_CIPHER_CTX_iv_length(&tls_ctx->cipher_ctx); - if (iv_len <= 1) { - return 0; - } - - *out_iv = tls_ctx->cipher_ctx.iv; - *out_iv_len = iv_len; - return 1; -} - -static int aead_null_sha1_tls_init(EVP_AEAD_CTX *ctx, const uint8_t 
*key, - size_t key_len, size_t tag_len, - enum evp_aead_direction_t dir) { - return aead_tls_init(ctx, key, key_len, tag_len, dir, EVP_enc_null(), - EVP_sha1(), 1 /* implicit iv */); -} - -static const EVP_AEAD aead_aes_128_cbc_sha1_tls = { - SHA_DIGEST_LENGTH + 16, // key len (SHA1 + AES128) - 16, // nonce len (IV) - 16 + SHA_DIGEST_LENGTH, // overhead (padding + SHA1) - SHA_DIGEST_LENGTH, // max tag length - 0, // seal_scatter_supports_extra_in - - NULL, // init - aead_aes_128_cbc_sha1_tls_init, - aead_tls_cleanup, - aead_tls_open, - aead_tls_seal_scatter, - NULL, // open_gather - NULL, // get_iv - aead_tls_tag_len, -}; - -static const EVP_AEAD aead_aes_128_cbc_sha1_tls_implicit_iv = { - SHA_DIGEST_LENGTH + 16 + 16, // key len (SHA1 + AES128 + IV) - 0, // nonce len - 16 + SHA_DIGEST_LENGTH, // overhead (padding + SHA1) - SHA_DIGEST_LENGTH, // max tag length - 0, // seal_scatter_supports_extra_in - - NULL, // init - aead_aes_128_cbc_sha1_tls_implicit_iv_init, - aead_tls_cleanup, - aead_tls_open, - aead_tls_seal_scatter, - NULL, // open_gather - aead_tls_get_iv, // get_iv - aead_tls_tag_len, -}; - -static const EVP_AEAD aead_aes_256_cbc_sha1_tls = { - SHA_DIGEST_LENGTH + 32, // key len (SHA1 + AES256) - 16, // nonce len (IV) - 16 + SHA_DIGEST_LENGTH, // overhead (padding + SHA1) - SHA_DIGEST_LENGTH, // max tag length - 0, // seal_scatter_supports_extra_in - - NULL, // init - aead_aes_256_cbc_sha1_tls_init, - aead_tls_cleanup, - aead_tls_open, - aead_tls_seal_scatter, - NULL, // open_gather - NULL, // get_iv - aead_tls_tag_len, -}; - -static const EVP_AEAD aead_aes_256_cbc_sha1_tls_implicit_iv = { - SHA_DIGEST_LENGTH + 32 + 16, // key len (SHA1 + AES256 + IV) - 0, // nonce len - 16 + SHA_DIGEST_LENGTH, // overhead (padding + SHA1) - SHA_DIGEST_LENGTH, // max tag length - 0, // seal_scatter_supports_extra_in - - NULL, // init - aead_aes_256_cbc_sha1_tls_implicit_iv_init, - aead_tls_cleanup, - aead_tls_open, - aead_tls_seal_scatter, - NULL, // open_gather - 
aead_tls_get_iv, // get_iv - aead_tls_tag_len, -}; - -static const EVP_AEAD aead_des_ede3_cbc_sha1_tls = { - SHA_DIGEST_LENGTH + 24, // key len (SHA1 + 3DES) - 8, // nonce len (IV) - 8 + SHA_DIGEST_LENGTH, // overhead (padding + SHA1) - SHA_DIGEST_LENGTH, // max tag length - 0, // seal_scatter_supports_extra_in - - NULL, // init - aead_des_ede3_cbc_sha1_tls_init, - aead_tls_cleanup, - aead_tls_open, - aead_tls_seal_scatter, - NULL, // open_gather - NULL, // get_iv - aead_tls_tag_len, -}; - -static const EVP_AEAD aead_des_ede3_cbc_sha1_tls_implicit_iv = { - SHA_DIGEST_LENGTH + 24 + 8, // key len (SHA1 + 3DES + IV) - 0, // nonce len - 8 + SHA_DIGEST_LENGTH, // overhead (padding + SHA1) - SHA_DIGEST_LENGTH, // max tag length - 0, // seal_scatter_supports_extra_in - - NULL, // init - aead_des_ede3_cbc_sha1_tls_implicit_iv_init, - aead_tls_cleanup, - aead_tls_open, - aead_tls_seal_scatter, - NULL, // open_gather - aead_tls_get_iv, // get_iv - aead_tls_tag_len, -}; - -static const EVP_AEAD aead_null_sha1_tls = { - SHA_DIGEST_LENGTH, // key len - 0, // nonce len - SHA_DIGEST_LENGTH, // overhead (SHA1) - SHA_DIGEST_LENGTH, // max tag length - 0, // seal_scatter_supports_extra_in - - NULL, // init - aead_null_sha1_tls_init, - aead_tls_cleanup, - aead_tls_open, - aead_tls_seal_scatter, - NULL, // open_gather - NULL, // get_iv - aead_tls_tag_len, -}; - -const EVP_AEAD *EVP_aead_aes_128_cbc_sha1_tls(void) { - return &aead_aes_128_cbc_sha1_tls; -} - -const EVP_AEAD *EVP_aead_aes_128_cbc_sha1_tls_implicit_iv(void) { - return &aead_aes_128_cbc_sha1_tls_implicit_iv; -} - -const EVP_AEAD *EVP_aead_aes_256_cbc_sha1_tls(void) { - return &aead_aes_256_cbc_sha1_tls; -} - -const EVP_AEAD *EVP_aead_aes_256_cbc_sha1_tls_implicit_iv(void) { - return &aead_aes_256_cbc_sha1_tls_implicit_iv; -} - -const EVP_AEAD *EVP_aead_des_ede3_cbc_sha1_tls(void) { - return &aead_des_ede3_cbc_sha1_tls; -} - -const EVP_AEAD *EVP_aead_des_ede3_cbc_sha1_tls_implicit_iv(void) { - return 
&aead_des_ede3_cbc_sha1_tls_implicit_iv; -} - -const EVP_AEAD *EVP_aead_null_sha1_tls(void) { return &aead_null_sha1_tls; } diff --git a/third_party/boringssl/src/crypto/cipher_extra/internal.h b/third_party/boringssl/src/crypto/cipher_extra/internal.h deleted file mode 100644 index 76a03144..00000000 --- a/third_party/boringssl/src/crypto/cipher_extra/internal.h +++ /dev/null @@ -1,229 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. 
All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#ifndef OPENSSL_HEADER_CIPHER_EXTRA_INTERNAL_H -#define OPENSSL_HEADER_CIPHER_EXTRA_INTERNAL_H - -#include -#include - -#include - -#include "../internal.h" - -#if defined(__cplusplus) -extern "C" { -#endif - - -// EVP_tls_cbc_get_padding determines the padding from the decrypted, TLS, CBC -// record in |in|. This decrypted record should not include any "decrypted" -// explicit IV. 
If the record is publicly invalid, it returns zero. Otherwise, -// it returns one and sets |*out_padding_ok| to all ones (0xfff..f) if the -// padding is valid and zero otherwise. It then sets |*out_len| to the length -// with the padding removed or |in_len| if invalid. -// -// If the function returns one, it runs in time independent of the contents of -// |in|. It is also guaranteed that |*out_len| >= |mac_size|, satisfying -// |EVP_tls_cbc_copy_mac|'s precondition. -int EVP_tls_cbc_remove_padding(crypto_word_t *out_padding_ok, size_t *out_len, - const uint8_t *in, size_t in_len, - size_t block_size, size_t mac_size); - -// EVP_tls_cbc_copy_mac copies |md_size| bytes from the end of the first -// |in_len| bytes of |in| to |out| in constant time (independent of the concrete -// value of |in_len|, which may vary within a 256-byte window). |in| must point -// to a buffer of |orig_len| bytes. -// -// On entry: -// orig_len >= in_len >= md_size -// md_size <= EVP_MAX_MD_SIZE -void EVP_tls_cbc_copy_mac(uint8_t *out, size_t md_size, const uint8_t *in, - size_t in_len, size_t orig_len); - -// EVP_tls_cbc_record_digest_supported returns 1 iff |md| is a hash function -// which EVP_tls_cbc_digest_record supports. -int EVP_tls_cbc_record_digest_supported(const EVP_MD *md); - -// EVP_sha1_final_with_secret_suffix computes the result of hashing |len| bytes -// from |in| to |ctx| and writes the resulting hash to |out|. |len| is treated -// as secret and must be at most |max_len|, which is treated as public. |in| -// must point to a buffer of at least |max_len| bytes. It returns one on success -// and zero if inputs are too long. -// -// This function is exported for unit tests. -OPENSSL_EXPORT int EVP_sha1_final_with_secret_suffix( - SHA_CTX *ctx, uint8_t out[SHA_DIGEST_LENGTH], const uint8_t *in, size_t len, - size_t max_len); - -// EVP_tls_cbc_digest_record computes the MAC of a decrypted, padded TLS -// record. -// -// md: the hash function used in the HMAC. 
-// EVP_tls_cbc_record_digest_supported must return true for this hash. -// md_out: the digest output. At most EVP_MAX_MD_SIZE bytes will be written. -// md_out_size: the number of output bytes is written here. -// header: the 13-byte, TLS record header. -// data: the record data itself -// data_size: the secret, reported length of the data once the padding and MAC -// have been removed. -// data_plus_mac_plus_padding_size: the public length of the whole -// record, including padding. -// -// On entry: by virtue of having been through one of the remove_padding -// functions, above, we know that data_plus_mac_size is large enough to contain -// a padding byte and MAC. (If the padding was invalid, it might contain the -// padding too. ) -int EVP_tls_cbc_digest_record(const EVP_MD *md, uint8_t *md_out, - size_t *md_out_size, const uint8_t header[13], - const uint8_t *data, size_t data_size, - size_t data_plus_mac_plus_padding_size, - const uint8_t *mac_secret, - unsigned mac_secret_length); - -#define POLY1305_TAG_LEN 16 - -// For convenience (the x86_64 calling convention allows only six parameters in -// registers), the final parameter for the assembly functions is both an input -// and output parameter. 
-union chacha20_poly1305_open_data { - struct { - alignas(16) uint8_t key[32]; - uint32_t counter; - uint8_t nonce[12]; - } in; - struct { - uint8_t tag[POLY1305_TAG_LEN]; - } out; -}; - -union chacha20_poly1305_seal_data { - struct { - alignas(16) uint8_t key[32]; - uint32_t counter; - uint8_t nonce[12]; - const uint8_t *extra_ciphertext; - size_t extra_ciphertext_len; - } in; - struct { - uint8_t tag[POLY1305_TAG_LEN]; - } out; -}; - -#if (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) && \ - !defined(OPENSSL_NO_ASM) - -static_assert(sizeof(union chacha20_poly1305_open_data) == 48, - "wrong chacha20_poly1305_open_data size"); -static_assert(sizeof(union chacha20_poly1305_seal_data) == 48 + 8 + 8, - "wrong chacha20_poly1305_seal_data size"); - -OPENSSL_INLINE int chacha20_poly1305_asm_capable(void) { -#if defined(OPENSSL_X86_64) - return CRYPTO_is_SSE4_1_capable(); -#elif defined(OPENSSL_AARCH64) - return CRYPTO_is_NEON_capable(); -#endif -} - -// chacha20_poly1305_open is defined in chacha20_poly1305_*.pl. It decrypts -// |plaintext_len| bytes from |ciphertext| and writes them to |out_plaintext|. -// Additional input parameters are passed in |aead_data->in|. On exit, it will -// write calculated tag value to |aead_data->out.tag|, which the caller must -// check. -extern void chacha20_poly1305_open(uint8_t *out_plaintext, - const uint8_t *ciphertext, - size_t plaintext_len, const uint8_t *ad, - size_t ad_len, - union chacha20_poly1305_open_data *data); - -// chacha20_poly1305_open is defined in chacha20_poly1305_*.pl. It encrypts -// |plaintext_len| bytes from |plaintext| and writes them to |out_ciphertext|. -// Additional input parameters are passed in |aead_data->in|. The calculated tag -// value is over the computed ciphertext concatenated with |extra_ciphertext| -// and written to |aead_data->out.tag|. 
-extern void chacha20_poly1305_seal(uint8_t *out_ciphertext, - const uint8_t *plaintext, - size_t plaintext_len, const uint8_t *ad, - size_t ad_len, - union chacha20_poly1305_seal_data *data); -#else - -OPENSSL_INLINE int chacha20_poly1305_asm_capable(void) { return 0; } - -OPENSSL_INLINE void chacha20_poly1305_open(uint8_t *out_plaintext, - const uint8_t *ciphertext, - size_t plaintext_len, const uint8_t *ad, - size_t ad_len, - union chacha20_poly1305_open_data *data) { - abort(); -} - -OPENSSL_INLINE void chacha20_poly1305_seal(uint8_t *out_ciphertext, - const uint8_t *plaintext, - size_t plaintext_len, const uint8_t *ad, - size_t ad_len, - union chacha20_poly1305_seal_data *data) { - abort(); -} -#endif - - -#if defined(__cplusplus) -} // extern C -#endif - -#endif // OPENSSL_HEADER_CIPHER_EXTRA_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/cipher_extra/tls_cbc.c b/third_party/boringssl/src/crypto/cipher_extra/tls_cbc.c deleted file mode 100644 index e1e95d42..00000000 --- a/third_party/boringssl/src/crypto/cipher_extra/tls_cbc.c +++ /dev/null @@ -1,338 +0,0 @@ -/* ==================================================================== - * Copyright (c) 2012 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. 
(http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). 
*/ - -#include -#include - -#include -#include -#include - -#include "../internal.h" -#include "internal.h" -#include "../fipsmodule/cipher/internal.h" - - -int EVP_tls_cbc_remove_padding(crypto_word_t *out_padding_ok, size_t *out_len, - const uint8_t *in, size_t in_len, - size_t block_size, size_t mac_size) { - const size_t overhead = 1 /* padding length byte */ + mac_size; - - // These lengths are all public so we can test them in non-constant time. - if (overhead > in_len) { - return 0; - } - - size_t padding_length = in[in_len - 1]; - - crypto_word_t good = constant_time_ge_w(in_len, overhead + padding_length); - // The padding consists of a length byte at the end of the record and - // then that many bytes of padding, all with the same value as the - // length byte. Thus, with the length byte included, there are i+1 - // bytes of padding. - // - // We can't check just |padding_length+1| bytes because that leaks - // decrypted information. Therefore we always have to check the maximum - // amount of padding possible. (Again, the length of the record is - // public information so we can use it.) - size_t to_check = 256; // maximum amount of padding, inc length byte. - if (to_check > in_len) { - to_check = in_len; - } - - for (size_t i = 0; i < to_check; i++) { - uint8_t mask = constant_time_ge_8(padding_length, i); - uint8_t b = in[in_len - 1 - i]; - // The final |padding_length+1| bytes should all have the value - // |padding_length|. Therefore the XOR should be zero. - good &= ~(mask & (padding_length ^ b)); - } - - // If any of the final |padding_length+1| bytes had the wrong value, - // one or more of the lower eight bits of |good| will be cleared. - good = constant_time_eq_w(0xff, good & 0xff); - - // Always treat |padding_length| as zero on error. 
If, assuming block size of - // 16, a padding of [<15 arbitrary bytes> 15] treated |padding_length| as 16 - // and returned -1, distinguishing good MAC and bad padding from bad MAC and - // bad padding would give POODLE's padding oracle. - padding_length = good & (padding_length + 1); - *out_len = in_len - padding_length; - *out_padding_ok = good; - return 1; -} - -void EVP_tls_cbc_copy_mac(uint8_t *out, size_t md_size, const uint8_t *in, - size_t in_len, size_t orig_len) { - uint8_t rotated_mac1[EVP_MAX_MD_SIZE], rotated_mac2[EVP_MAX_MD_SIZE]; - uint8_t *rotated_mac = rotated_mac1; - uint8_t *rotated_mac_tmp = rotated_mac2; - - // mac_end is the index of |in| just after the end of the MAC. - size_t mac_end = in_len; - size_t mac_start = mac_end - md_size; - - assert(orig_len >= in_len); - assert(in_len >= md_size); - assert(md_size <= EVP_MAX_MD_SIZE); - assert(md_size > 0); - - // scan_start contains the number of bytes that we can ignore because - // the MAC's position can only vary by 255 bytes. - size_t scan_start = 0; - // This information is public so it's safe to branch based on it. - if (orig_len > md_size + 255 + 1) { - scan_start = orig_len - (md_size + 255 + 1); - } - - size_t rotate_offset = 0; - uint8_t mac_started = 0; - OPENSSL_memset(rotated_mac, 0, md_size); - for (size_t i = scan_start, j = 0; i < orig_len; i++, j++) { - if (j >= md_size) { - j -= md_size; - } - crypto_word_t is_mac_start = constant_time_eq_w(i, mac_start); - mac_started |= is_mac_start; - uint8_t mac_ended = constant_time_ge_8(i, mac_end); - rotated_mac[j] |= in[i] & mac_started & ~mac_ended; - // Save the offset that |mac_start| is mapped to. - rotate_offset |= j & is_mac_start; - } - - // Now rotate the MAC. We rotate in log(md_size) steps, one for each bit - // position. - for (size_t offset = 1; offset < md_size; offset <<= 1, rotate_offset >>= 1) { - // Rotate by |offset| iff the corresponding bit is set in - // |rotate_offset|, placing the result in |rotated_mac_tmp|. 
- const uint8_t skip_rotate = (rotate_offset & 1) - 1; - for (size_t i = 0, j = offset; i < md_size; i++, j++) { - if (j >= md_size) { - j -= md_size; - } - rotated_mac_tmp[i] = - constant_time_select_8(skip_rotate, rotated_mac[i], rotated_mac[j]); - } - - // Swap pointers so |rotated_mac| contains the (possibly) rotated value. - // Note the number of iterations and thus the identity of these pointers is - // public information. - uint8_t *tmp = rotated_mac; - rotated_mac = rotated_mac_tmp; - rotated_mac_tmp = tmp; - } - - OPENSSL_memcpy(out, rotated_mac, md_size); -} - -int EVP_sha1_final_with_secret_suffix(SHA_CTX *ctx, - uint8_t out[SHA_DIGEST_LENGTH], - const uint8_t *in, size_t len, - size_t max_len) { - // Bound the input length so |total_bits| below fits in four bytes. This is - // redundant with TLS record size limits. This also ensures |input_idx| below - // does not overflow. - size_t max_len_bits = max_len << 3; - if (ctx->Nh != 0 || - (max_len_bits >> 3) != max_len || // Overflow - ctx->Nl + max_len_bits < max_len_bits || - ctx->Nl + max_len_bits > UINT32_MAX) { - return 0; - } - - // We need to hash the following into |ctx|: - // - // - ctx->data[:ctx->num] - // - in[:len] - // - A 0x80 byte - // - However many zero bytes are needed to pad up to a block. - // - Eight bytes of length. - size_t num_blocks = (ctx->num + len + 1 + 8 + SHA_CBLOCK - 1) >> 6; - size_t last_block = num_blocks - 1; - size_t max_blocks = (ctx->num + max_len + 1 + 8 + SHA_CBLOCK - 1) >> 6; - - // The bounds above imply |total_bits| fits in four bytes. - size_t total_bits = ctx->Nl + (len << 3); - uint8_t length_bytes[4]; - length_bytes[0] = (uint8_t)(total_bits >> 24); - length_bytes[1] = (uint8_t)(total_bits >> 16); - length_bytes[2] = (uint8_t)(total_bits >> 8); - length_bytes[3] = (uint8_t)total_bits; - - // We now construct and process each expected block in constant-time. 
- uint8_t block[SHA_CBLOCK] = {0}; - uint32_t result[5] = {0}; - // input_idx is the index into |in| corresponding to the current block. - // However, we allow this index to overflow beyond |max_len|, to simplify the - // 0x80 byte. - size_t input_idx = 0; - for (size_t i = 0; i < max_blocks; i++) { - // Fill |block| with data from the partial block in |ctx| and |in|. We copy - // as if we were hashing up to |max_len| and then zero the excess later. - size_t block_start = 0; - if (i == 0) { - OPENSSL_memcpy(block, ctx->data, ctx->num); - block_start = ctx->num; - } - if (input_idx < max_len) { - size_t to_copy = SHA_CBLOCK - block_start; - if (to_copy > max_len - input_idx) { - to_copy = max_len - input_idx; - } - OPENSSL_memcpy(block + block_start, in + input_idx, to_copy); - } - - // Zero any bytes beyond |len| and add the 0x80 byte. - for (size_t j = block_start; j < SHA_CBLOCK; j++) { - // input[idx] corresponds to block[j]. - size_t idx = input_idx + j - block_start; - // The barriers on |len| are not strictly necessary. However, without - // them, GCC compiles this code by incorporating |len| into the loop - // counter and subtracting it out later. This is still constant-time, but - // it frustrates attempts to validate this. - uint8_t is_in_bounds = constant_time_lt_8(idx, value_barrier_w(len)); - uint8_t is_padding_byte = constant_time_eq_8(idx, value_barrier_w(len)); - block[j] &= is_in_bounds; - block[j] |= 0x80 & is_padding_byte; - } - - input_idx += SHA_CBLOCK - block_start; - - // Fill in the length if this is the last block. - crypto_word_t is_last_block = constant_time_eq_w(i, last_block); - for (size_t j = 0; j < 4; j++) { - block[SHA_CBLOCK - 4 + j] |= is_last_block & length_bytes[j]; - } - - // Process the block and save the hash state if it is the final value. - SHA1_Transform(ctx, block); - for (size_t j = 0; j < 5; j++) { - result[j] |= is_last_block & ctx->h[j]; - } - } - - // Write the output. 
- for (size_t i = 0; i < 5; i++) { - CRYPTO_store_u32_be(out + 4 * i, result[i]); - } - return 1; -} - -int EVP_tls_cbc_record_digest_supported(const EVP_MD *md) { - return EVP_MD_type(md) == NID_sha1; -} - -int EVP_tls_cbc_digest_record(const EVP_MD *md, uint8_t *md_out, - size_t *md_out_size, const uint8_t header[13], - const uint8_t *data, size_t data_size, - size_t data_plus_mac_plus_padding_size, - const uint8_t *mac_secret, - unsigned mac_secret_length) { - if (EVP_MD_type(md) != NID_sha1) { - // EVP_tls_cbc_record_digest_supported should have been called first to - // check that the hash function is supported. - assert(0); - *md_out_size = 0; - return 0; - } - - if (mac_secret_length > SHA_CBLOCK) { - // HMAC pads small keys with zeros and hashes large keys down. This function - // should never reach the large key case. - assert(0); - return 0; - } - - // Compute the initial HMAC block. - uint8_t hmac_pad[SHA_CBLOCK]; - OPENSSL_memset(hmac_pad, 0, sizeof(hmac_pad)); - OPENSSL_memcpy(hmac_pad, mac_secret, mac_secret_length); - for (size_t i = 0; i < SHA_CBLOCK; i++) { - hmac_pad[i] ^= 0x36; - } - - SHA_CTX ctx; - SHA1_Init(&ctx); - SHA1_Update(&ctx, hmac_pad, SHA_CBLOCK); - SHA1_Update(&ctx, header, 13); - - // There are at most 256 bytes of padding, so we can compute the public - // minimum length for |data_size|. - size_t min_data_size = 0; - if (data_plus_mac_plus_padding_size > SHA_DIGEST_LENGTH + 256) { - min_data_size = data_plus_mac_plus_padding_size - SHA_DIGEST_LENGTH - 256; - } - - // Hash the public minimum length directly. This reduces the number of blocks - // that must be computed in constant-time. - SHA1_Update(&ctx, data, min_data_size); - - // Hash the remaining data without leaking |data_size|. 
- uint8_t mac_out[SHA_DIGEST_LENGTH]; - if (!EVP_sha1_final_with_secret_suffix( - &ctx, mac_out, data + min_data_size, data_size - min_data_size, - data_plus_mac_plus_padding_size - min_data_size)) { - return 0; - } - - // Complete the HMAC in the standard manner. - SHA1_Init(&ctx); - for (size_t i = 0; i < SHA_CBLOCK; i++) { - hmac_pad[i] ^= 0x6a; - } - - SHA1_Update(&ctx, hmac_pad, SHA_CBLOCK); - SHA1_Update(&ctx, mac_out, SHA_DIGEST_LENGTH); - SHA1_Final(md_out, &ctx); - *md_out_size = SHA_DIGEST_LENGTH; - return 1; -} diff --git a/third_party/boringssl/src/crypto/cms/cms.cc b/third_party/boringssl/src/crypto/cms/cms.cc new file mode 100644 index 00000000..e7b18036 --- /dev/null +++ b/third_party/boringssl/src/crypto/cms/cms.cc @@ -0,0 +1,178 @@ +// Copyright 2025 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include +#include +#include + +#include "../bytestring/internal.h" +#include "../internal.h" +#include "../mem_internal.h" +#include "../pkcs7/internal.h" + + +using namespace bssl; + +// TODO(davidben): Should we move the core PKCS#7 / CMS implementation into +// crypto/cms instead of crypto/pkcs7? CMS is getting new features while PKCS#7 +// is not. 
+OPENSSL_DECLARE_ERROR_REASON(CMS, CERTIFICATE_HAS_NO_KEYID) + +DECLARE_OPAQUE_STRUCT(CMS_SignerInfo_st, CMSSignerInfo) +DECLARE_OPAQUE_STRUCT(CMS_ContentInfo_st, CMSContentInfo) + +BSSL_NAMESPACE_BEGIN + +class CMSSignerInfo : public CMS_SignerInfo_st { + public: + UniquePtr signcert; + UniquePtr pkey; + const EVP_MD *md = nullptr; + bool use_key_id = false; +}; + +class CMSContentInfo : public CMS_ContentInfo_st { + public: + static constexpr bool kAllowUniquePtr = true; + bool has_signer_info = false; + CMSSignerInfo signer_info; + Array der; +}; + +BSSL_NAMESPACE_END + +CMS_ContentInfo *CMS_sign(X509 *signcert, EVP_PKEY *pkey, STACK_OF(X509) *certs, + BIO *data, uint32_t flags) { + // We only support external signatures and do not support embedding + // certificates in SignedData. + if ((flags & CMS_DETACHED) == 0 || sk_X509_num(certs) != 0) { + OPENSSL_PUT_ERROR(CMS, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); + return nullptr; + } + + UniquePtr cms = MakeUnique(); + if (cms == nullptr) { + return nullptr; + } + + if (pkey != nullptr && + !CMS_add1_signer(cms.get(), signcert, pkey, /*md=*/nullptr, flags)) { + return nullptr; + } + + // We don't actually use streaming mode, but Linux passes |CMS_STREAM| to + // |CMS_sign| and OpenSSL interprets it as an alias for |CMS_PARTIAL| in this + // context. + if ((flags & (CMS_PARTIAL | CMS_STREAM)) == 0 && + !CMS_final(cms.get(), data, nullptr, flags)) { + return nullptr; + } + + return cms.release(); +} + +void CMS_ContentInfo_free(CMS_ContentInfo *cms) { Delete(FromOpaque(cms)); } + +CMS_SignerInfo *CMS_add1_signer(CMS_ContentInfo *cms, X509 *signcert, + EVP_PKEY *pkey, const EVP_MD *md, + uint32_t flags) { + auto *impl = FromOpaque(cms); + if ( // Already finalized. + !impl->der.empty() || + // We only support one signer. + impl->has_signer_info || + // We do not support embedding certificates in SignedData. + (flags & CMS_NOCERTS) == 0 || + // We do not support attributes in SignedData. 
+ (flags & CMS_NOATTR) == 0) { + OPENSSL_PUT_ERROR(CMS, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); + return nullptr; + } + + if (signcert == nullptr || pkey == nullptr) { + OPENSSL_PUT_ERROR(CMS, ERR_R_PASSED_NULL_PARAMETER); + return nullptr; + } + + if (!X509_check_private_key(signcert, pkey)) { + OPENSSL_PUT_ERROR(CMS, CMS_R_PRIVATE_KEY_DOES_NOT_MATCH_CERTIFICATE); + return nullptr; + } + + // Default to SHA-256. + if (md == nullptr) { + md = EVP_sha256(); + } + + // Save information for later. + impl->has_signer_info = true; + impl->signer_info.signcert = UpRef(signcert); + impl->signer_info.pkey = UpRef(pkey); + impl->signer_info.md = md; + impl->signer_info.use_key_id = (flags & CMS_USE_KEYID) != 0; + return &impl->signer_info; +} + +int CMS_final(CMS_ContentInfo *cms, BIO *data, BIO *dcont, uint32_t flags) { + auto *impl = FromOpaque(cms); + if ( // Already finalized. + !impl->der.empty() || + // Require a SignerInfo. We do not support signature-less SignedDatas. + !impl->has_signer_info || + // We only support the straightforward passthrough mode, without S/MIME + // translations. + (flags & CMS_BINARY) == 0 || + // We do not support |dcont|. It is unclear what it does. + dcont != nullptr) { + OPENSSL_PUT_ERROR(CMS, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); + return 0; + } + + ScopedCBB cbb; + if (!CBB_init(cbb.get(), 2048) || + !pkcs7_add_external_signature(cbb.get(), impl->signer_info.signcert.get(), + impl->signer_info.pkey.get(), + impl->signer_info.md, data, + impl->signer_info.use_key_id) || + !CBBFinishArray(cbb.get(), &impl->der)) { + return 0; + } + + return 1; +} + +int i2d_CMS_bio(BIO *out, CMS_ContentInfo *cms) { + auto *impl = FromOpaque(cms); + if (impl->der.empty()) { + // Not yet finalized. + OPENSSL_PUT_ERROR(CMS, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); + return 0; + } + + return BIO_write_all(out, impl->der.data(), impl->der.size()); +} + +int i2d_CMS_bio_stream(BIO *out, CMS_ContentInfo *cms, BIO *in, int flags) { + // We do not support streaming mode. 
+ if ((flags & CMS_STREAM) != 0 || in != nullptr) { + OPENSSL_PUT_ERROR(CMS, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); + return 0; + } + + return i2d_CMS_bio(out, cms); +} diff --git a/third_party/boringssl/src/crypto/conf/conf.c b/third_party/boringssl/src/crypto/conf/conf.c deleted file mode 100644 index c1e4e963..00000000 --- a/third_party/boringssl/src/crypto/conf/conf.c +++ /dev/null @@ -1,821 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. 
All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
*/ - -#include - -#include -#include - -#include -#include -#include -#include -#include - -#include "conf_def.h" -#include "internal.h" -#include "../internal.h" -#include "../lhash/internal.h" - - -DEFINE_LHASH_OF(CONF_VALUE) - -struct conf_st { - LHASH_OF(CONF_VALUE) *data; -}; - -static const char kDefaultSectionName[] = "default"; - -// The maximum length we can grow a value to after variable expansion. 64k -// should be more than enough for all reasonable uses. -#define MAX_CONF_VALUE_LENGTH 65536 - -static uint32_t conf_value_hash(const CONF_VALUE *v) { - const uint32_t section_hash = v->section ? OPENSSL_strhash(v->section) : 0; - const uint32_t name_hash = v->name ? OPENSSL_strhash(v->name) : 0; - return (section_hash << 2) ^ name_hash; -} - -static int conf_value_cmp(const CONF_VALUE *a, const CONF_VALUE *b) { - int i; - - if (a->section != b->section) { - i = strcmp(a->section, b->section); - if (i) { - return i; - } - } - - if (a->name != NULL && b->name != NULL) { - return strcmp(a->name, b->name); - } else if (a->name == b->name) { - return 0; - } else { - return (a->name == NULL) ? 
-1 : 1; - } -} - -CONF *NCONF_new(void *method) { - CONF *conf; - - if (method != NULL) { - return NULL; - } - - conf = OPENSSL_malloc(sizeof(CONF)); - if (conf == NULL) { - return NULL; - } - - conf->data = lh_CONF_VALUE_new(conf_value_hash, conf_value_cmp); - if (conf->data == NULL) { - OPENSSL_free(conf); - return NULL; - } - - return conf; -} - -CONF_VALUE *CONF_VALUE_new(void) { - CONF_VALUE *v = OPENSSL_malloc(sizeof(CONF_VALUE)); - if (!v) { - OPENSSL_PUT_ERROR(CONF, ERR_R_MALLOC_FAILURE); - return NULL; - } - OPENSSL_memset(v, 0, sizeof(CONF_VALUE)); - return v; -} - -static void value_free_contents(CONF_VALUE *value) { - if (value->section) { - OPENSSL_free(value->section); - } - if (value->name) { - OPENSSL_free(value->name); - if (value->value) { - OPENSSL_free(value->value); - } - } else { - if (value->value) { - sk_CONF_VALUE_free((STACK_OF(CONF_VALUE)*)value->value); - } - } -} - -static void value_free(CONF_VALUE *value) { - value_free_contents(value); - OPENSSL_free(value); -} - -static void value_free_arg(CONF_VALUE *value, void *arg) { value_free(value); } - -void NCONF_free(CONF *conf) { - if (conf == NULL || conf->data == NULL) { - return; - } - - lh_CONF_VALUE_doall_arg(conf->data, value_free_arg, NULL); - lh_CONF_VALUE_free(conf->data); - OPENSSL_free(conf); -} - -static CONF_VALUE *NCONF_new_section(const CONF *conf, const char *section) { - STACK_OF(CONF_VALUE) *sk = NULL; - int ok = 0; - CONF_VALUE *v = NULL, *old_value; - - sk = sk_CONF_VALUE_new_null(); - v = CONF_VALUE_new(); - if (sk == NULL || v == NULL) { - goto err; - } - v->section = OPENSSL_strdup(section); - if (v->section == NULL) { - goto err; - } - - v->name = NULL; - v->value = (char *)sk; - - if (!lh_CONF_VALUE_insert(conf->data, &old_value, v)) { - goto err; - } - if (old_value) { - value_free(old_value); - } - ok = 1; - -err: - if (!ok) { - if (sk != NULL) { - sk_CONF_VALUE_free(sk); - } - if (v != NULL) { - OPENSSL_free(v); - } - v = NULL; - } - return v; -} - -static int 
str_copy(CONF *conf, char *section, char **pto, char *from) { - int q, r, rr = 0, to = 0, len = 0; - char *s, *e, *rp, *rrp, *np, *cp, v; - const char *p; - BUF_MEM *buf; - - buf = BUF_MEM_new(); - if (buf == NULL) { - return 0; - } - - len = strlen(from) + 1; - if (!BUF_MEM_grow(buf, len)) { - goto err; - } - - for (;;) { - if (IS_QUOTE(conf, *from)) { - q = *from; - from++; - while (!IS_EOF(conf, *from) && (*from != q)) { - if (IS_ESC(conf, *from)) { - from++; - if (IS_EOF(conf, *from)) { - break; - } - } - buf->data[to++] = *(from++); - } - if (*from == q) { - from++; - } - } else if (IS_DQUOTE(conf, *from)) { - q = *from; - from++; - while (!IS_EOF(conf, *from)) { - if (*from == q) { - if (*(from + 1) == q) { - from++; - } else { - break; - } - } - buf->data[to++] = *(from++); - } - if (*from == q) { - from++; - } - } else if (IS_ESC(conf, *from)) { - from++; - v = *(from++); - if (IS_EOF(conf, v)) { - break; - } else if (v == 'r') { - v = '\r'; - } else if (v == 'n') { - v = '\n'; - } else if (v == 'b') { - v = '\b'; - } else if (v == 't') { - v = '\t'; - } - buf->data[to++] = v; - } else if (IS_EOF(conf, *from)) { - break; - } else if (*from == '$') { - // try to expand it - rrp = NULL; - s = &(from[1]); - if (*s == '{') { - q = '}'; - } else if (*s == '(') { - q = ')'; - } else { - q = 0; - } - - if (q) { - s++; - } - cp = section; - e = np = s; - while (IS_ALPHA_NUMERIC(conf, *e)) { - e++; - } - if (e[0] == ':' && e[1] == ':') { - cp = np; - rrp = e; - rr = *e; - *rrp = '\0'; - e += 2; - np = e; - while (IS_ALPHA_NUMERIC(conf, *e)) { - e++; - } - } - r = *e; - *e = '\0'; - rp = e; - if (q) { - if (r != q) { - OPENSSL_PUT_ERROR(CONF, CONF_R_NO_CLOSE_BRACE); - goto err; - } - e++; - } - // So at this point we have - // np which is the start of the name string which is - // '\0' terminated. - // cp which is the start of the section string which is - // '\0' terminated. - // e is the 'next point after'. 
- // r and rr are the chars replaced by the '\0' - // rp and rrp is where 'r' and 'rr' came from. - p = NCONF_get_string(conf, cp, np); - if (rrp != NULL) { - *rrp = rr; - } - *rp = r; - if (p == NULL) { - OPENSSL_PUT_ERROR(CONF, CONF_R_VARIABLE_HAS_NO_VALUE); - goto err; - } - size_t newsize = strlen(p) + buf->length - (e - from); - if (newsize > MAX_CONF_VALUE_LENGTH) { - OPENSSL_PUT_ERROR(CONF, CONF_R_VARIABLE_EXPANSION_TOO_LONG); - goto err; - } - if (!BUF_MEM_grow_clean(buf, newsize)) { - OPENSSL_PUT_ERROR(CONF, ERR_R_MALLOC_FAILURE); - goto err; - } - while (*p) { - buf->data[to++] = *(p++); - } - - /* Since we change the pointer 'from', we also have - to change the perceived length of the string it - points at. /RL */ - len -= e - from; - from = e; - - /* In case there were no braces or parenthesis around - the variable reference, we have to put back the - character that was replaced with a '\0'. /RL */ - *rp = r; - } else { - buf->data[to++] = *(from++); - } - } - - buf->data[to] = '\0'; - if (*pto != NULL) { - OPENSSL_free(*pto); - } - *pto = buf->data; - OPENSSL_free(buf); - return 1; - -err: - if (buf != NULL) { - BUF_MEM_free(buf); - } - return 0; -} - -static CONF_VALUE *get_section(const CONF *conf, const char *section) { - CONF_VALUE template; - - OPENSSL_memset(&template, 0, sizeof(template)); - template.section = (char *) section; - return lh_CONF_VALUE_retrieve(conf->data, &template); -} - -STACK_OF(CONF_VALUE) *NCONF_get_section(const CONF *conf, const char *section) { - CONF_VALUE *section_value = get_section(conf, section); - if (section_value == NULL) { - return NULL; - } - return (STACK_OF(CONF_VALUE)*) section_value->value; -} - -const char *NCONF_get_string(const CONF *conf, const char *section, - const char *name) { - CONF_VALUE template, *value; - - if (section == NULL) { - section = kDefaultSectionName; - } - - OPENSSL_memset(&template, 0, sizeof(template)); - template.section = (char *) section; - template.name = (char *) name; - value 
= lh_CONF_VALUE_retrieve(conf->data, &template); - if (value == NULL) { - return NULL; - } - return value->value; -} - -static int add_string(const CONF *conf, CONF_VALUE *section, - CONF_VALUE *value) { - STACK_OF(CONF_VALUE) *section_stack = (STACK_OF(CONF_VALUE)*) section->value; - CONF_VALUE *old_value; - - value->section = OPENSSL_strdup(section->section); - if (!sk_CONF_VALUE_push(section_stack, value)) { - return 0; - } - - if (!lh_CONF_VALUE_insert(conf->data, &old_value, value)) { - return 0; - } - if (old_value != NULL) { - (void)sk_CONF_VALUE_delete_ptr(section_stack, old_value); - value_free(old_value); - } - - return 1; -} - -static char *eat_ws(CONF *conf, char *p) { - while (IS_WS(conf, *p) && !IS_EOF(conf, *p)) { - p++; - } - return p; -} - -#define scan_esc(conf, p) (((IS_EOF((conf), (p)[1])) ? ((p) + 1) : ((p) + 2))) - -static char *eat_alpha_numeric(CONF *conf, char *p) { - for (;;) { - if (IS_ESC(conf, *p)) { - p = scan_esc(conf, p); - continue; - } - if (!IS_ALPHA_NUMERIC_PUNCT(conf, *p)) { - return p; - } - p++; - } -} - -static char *scan_quote(CONF *conf, char *p) { - int q = *p; - - p++; - while (!IS_EOF(conf, *p) && *p != q) { - if (IS_ESC(conf, *p)) { - p++; - if (IS_EOF(conf, *p)) { - return p; - } - } - p++; - } - if (*p == q) { - p++; - } - return p; -} - - -static char *scan_dquote(CONF *conf, char *p) { - int q = *p; - - p++; - while (!(IS_EOF(conf, *p))) { - if (*p == q) { - if (*(p + 1) == q) { - p++; - } else { - break; - } - } - p++; - } - if (*p == q) { - p++; - } - return p; -} - -static void clear_comments(CONF *conf, char *p) { - for (;;) { - if (IS_FCOMMENT(conf, *p)) { - *p = '\0'; - return; - } - if (!IS_WS(conf, *p)) { - break; - } - p++; - } - - for (;;) { - if (IS_COMMENT(conf, *p)) { - *p = '\0'; - return; - } - if (IS_DQUOTE(conf, *p)) { - p = scan_dquote(conf, p); - continue; - } - if (IS_QUOTE(conf, *p)) { - p = scan_quote(conf, p); - continue; - } - if (IS_ESC(conf, *p)) { - p = scan_esc(conf, p); - continue; - } - 
if (IS_EOF(conf, *p)) { - return; - } else { - p++; - } - } -} - -static int def_load_bio(CONF *conf, BIO *in, long *out_error_line) { - static const size_t CONFBUFSIZE = 512; - int bufnum = 0, i, ii; - BUF_MEM *buff = NULL; - char *s, *p, *end; - int again; - long eline = 0; - char btmp[DECIMAL_SIZE(eline) + 1]; - CONF_VALUE *v = NULL, *tv; - CONF_VALUE *sv = NULL; - char *section = NULL, *buf; - char *start, *psection, *pname; - - if ((buff = BUF_MEM_new()) == NULL) { - OPENSSL_PUT_ERROR(CONF, ERR_R_BUF_LIB); - goto err; - } - - section = OPENSSL_strdup(kDefaultSectionName); - if (section == NULL) { - OPENSSL_PUT_ERROR(CONF, ERR_R_MALLOC_FAILURE); - goto err; - } - - sv = NCONF_new_section(conf, section); - if (sv == NULL) { - OPENSSL_PUT_ERROR(CONF, CONF_R_UNABLE_TO_CREATE_NEW_SECTION); - goto err; - } - - bufnum = 0; - again = 0; - for (;;) { - if (!BUF_MEM_grow(buff, bufnum + CONFBUFSIZE)) { - OPENSSL_PUT_ERROR(CONF, ERR_R_BUF_LIB); - goto err; - } - p = &(buff->data[bufnum]); - *p = '\0'; - BIO_gets(in, p, CONFBUFSIZE - 1); - p[CONFBUFSIZE - 1] = '\0'; - ii = i = strlen(p); - if (i == 0 && !again) { - break; - } - again = 0; - while (i > 0) { - if ((p[i - 1] != '\r') && (p[i - 1] != '\n')) { - break; - } else { - i--; - } - } - // we removed some trailing stuff so there is a new - // line on the end. 
- if (ii && i == ii) { - again = 1; // long line - } else { - p[i] = '\0'; - eline++; // another input line - } - - // we now have a line with trailing \r\n removed - - // i is the number of bytes - bufnum += i; - - v = NULL; - // check for line continuation - if (bufnum >= 1) { - // If we have bytes and the last char '\\' and - // second last char is not '\\' - p = &(buff->data[bufnum - 1]); - if (IS_ESC(conf, p[0]) && ((bufnum <= 1) || !IS_ESC(conf, p[-1]))) { - bufnum--; - again = 1; - } - } - if (again) { - continue; - } - bufnum = 0; - buf = buff->data; - - clear_comments(conf, buf); - s = eat_ws(conf, buf); - if (IS_EOF(conf, *s)) { - continue; // blank line - } - if (*s == '[') { - char *ss; - - s++; - start = eat_ws(conf, s); - ss = start; - again: - end = eat_alpha_numeric(conf, ss); - p = eat_ws(conf, end); - if (*p != ']') { - if (*p != '\0' && ss != p) { - ss = p; - goto again; - } - OPENSSL_PUT_ERROR(CONF, CONF_R_MISSING_CLOSE_SQUARE_BRACKET); - goto err; - } - *end = '\0'; - if (!str_copy(conf, NULL, §ion, start)) { - goto err; - } - if ((sv = get_section(conf, section)) == NULL) { - sv = NCONF_new_section(conf, section); - } - if (sv == NULL) { - OPENSSL_PUT_ERROR(CONF, CONF_R_UNABLE_TO_CREATE_NEW_SECTION); - goto err; - } - continue; - } else { - pname = s; - psection = NULL; - end = eat_alpha_numeric(conf, s); - if ((end[0] == ':') && (end[1] == ':')) { - *end = '\0'; - end += 2; - psection = pname; - pname = end; - end = eat_alpha_numeric(conf, end); - } - p = eat_ws(conf, end); - if (*p != '=') { - OPENSSL_PUT_ERROR(CONF, CONF_R_MISSING_EQUAL_SIGN); - goto err; - } - *end = '\0'; - p++; - start = eat_ws(conf, p); - while (!IS_EOF(conf, *p)) { - p++; - } - p--; - while ((p != start) && (IS_WS(conf, *p))) { - p--; - } - p++; - *p = '\0'; - - if (!(v = CONF_VALUE_new())) { - goto err; - } - if (psection == NULL) { - psection = section; - } - v->name = OPENSSL_strdup(pname); - if (v->name == NULL) { - OPENSSL_PUT_ERROR(CONF, ERR_R_MALLOC_FAILURE); - 
goto err; - } - if (!str_copy(conf, psection, &(v->value), start)) { - goto err; - } - - if (strcmp(psection, section) != 0) { - if ((tv = get_section(conf, psection)) == NULL) { - tv = NCONF_new_section(conf, psection); - } - if (tv == NULL) { - OPENSSL_PUT_ERROR(CONF, CONF_R_UNABLE_TO_CREATE_NEW_SECTION); - goto err; - } - } else { - tv = sv; - } - if (add_string(conf, tv, v) == 0) { - OPENSSL_PUT_ERROR(CONF, ERR_R_MALLOC_FAILURE); - goto err; - } - v = NULL; - } - } - if (buff != NULL) { - BUF_MEM_free(buff); - } - if (section != NULL) { - OPENSSL_free(section); - } - return 1; - -err: - if (buff != NULL) { - BUF_MEM_free(buff); - } - if (section != NULL) { - OPENSSL_free(section); - } - if (out_error_line != NULL) { - *out_error_line = eline; - } - BIO_snprintf(btmp, sizeof btmp, "%ld", eline); - ERR_add_error_data(2, "line ", btmp); - - if (v != NULL) { - if (v->name != NULL) { - OPENSSL_free(v->name); - } - if (v->value != NULL) { - OPENSSL_free(v->value); - } - if (v != NULL) { - OPENSSL_free(v); - } - } - return 0; -} - -int NCONF_load(CONF *conf, const char *filename, long *out_error_line) { - BIO *in = BIO_new_file(filename, "rb"); - int ret; - - if (in == NULL) { - OPENSSL_PUT_ERROR(CONF, ERR_R_SYS_LIB); - return 0; - } - - ret = def_load_bio(conf, in, out_error_line); - BIO_free(in); - - return ret; -} - -int NCONF_load_bio(CONF *conf, BIO *bio, long *out_error_line) { - return def_load_bio(conf, bio, out_error_line); -} - -int CONF_parse_list(const char *list, char sep, int remove_whitespace, - int (*list_cb)(const char *elem, int len, void *usr), - void *arg) { - int ret; - const char *lstart, *tmpend, *p; - - if (list == NULL) { - OPENSSL_PUT_ERROR(CONF, CONF_R_LIST_CANNOT_BE_NULL); - return 0; - } - - lstart = list; - for (;;) { - if (remove_whitespace) { - while (*lstart && isspace((unsigned char)*lstart)) { - lstart++; - } - } - p = strchr(lstart, sep); - if (p == lstart || !*lstart) { - ret = list_cb(NULL, 0, arg); - } else { - if (p) { - tmpend 
= p - 1; - } else { - tmpend = lstart + strlen(lstart) - 1; - } - if (remove_whitespace) { - while (isspace((unsigned char)*tmpend)) { - tmpend--; - } - } - ret = list_cb(lstart, tmpend - lstart + 1, arg); - } - if (ret <= 0) { - return ret; - } - if (p == NULL) { - return 1; - } - lstart = p + 1; - } -} - -int CONF_modules_load_file(const char *filename, const char *appname, - unsigned long flags) { - return 1; -} - -void CONF_modules_free(void) {} - -void OPENSSL_config(const char *config_name) {} - -void OPENSSL_no_config(void) {} diff --git a/third_party/boringssl/src/crypto/conf/conf.cc b/third_party/boringssl/src/crypto/conf/conf.cc new file mode 100644 index 00000000..0c7ea9db --- /dev/null +++ b/third_party/boringssl/src/crypto/conf/conf.cc @@ -0,0 +1,637 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include "../internal.h" +#include "../mem_internal.h" +#include "internal.h" + + +using namespace bssl; + +BSSL_NAMESPACE_BEGIN + +struct conf_section_st { + char *name; + // values contains non-owning pointers to the values in the section. 
+ STACK_OF(CONF_VALUE) *values; +}; + +BSSL_NAMESPACE_END + +static const char kDefaultSectionName[] = "default"; + +static uint32_t conf_section_hash(const CONF_SECTION *s) { + return OPENSSL_strhash(s->name); +} + +static int conf_section_cmp(const CONF_SECTION *a, const CONF_SECTION *b) { + return strcmp(a->name, b->name); +} + +static uint32_t conf_value_hash(const CONF_VALUE *v) { + const uint32_t section_hash = OPENSSL_strhash(v->section); + const uint32_t name_hash = OPENSSL_strhash(v->name); + return (section_hash << 2) ^ name_hash; +} + +static int conf_value_cmp(const CONF_VALUE *a, const CONF_VALUE *b) { + int cmp = strcmp(a->section, b->section); + if (cmp != 0) { + return cmp; + } + + return strcmp(a->name, b->name); +} + +CONF *NCONF_new(void *method) { + if (method != nullptr) { + return nullptr; + } + + CONF *conf = New(); + if (conf == nullptr) { + return nullptr; + } + + conf->sections = lh_CONF_SECTION_new(conf_section_hash, conf_section_cmp); + conf->values = lh_CONF_VALUE_new(conf_value_hash, conf_value_cmp); + if (conf->sections == nullptr || conf->values == nullptr) { + NCONF_free(conf); + return nullptr; + } + + return conf; +} + +CONF_VALUE *bssl::CONF_VALUE_new() { return New(); } + +static void value_free(CONF_VALUE *value) { + if (value == nullptr) { + return; + } + OPENSSL_free(value->section); + OPENSSL_free(value->name); + OPENSSL_free(value->value); + Delete(value); +} + +static void section_free(CONF_SECTION *section) { + if (section == nullptr) { + return; + } + OPENSSL_free(section->name); + sk_CONF_VALUE_free(section->values); + Delete(section); +} + +static void value_free_arg(CONF_VALUE *value, void *arg) { value_free(value); } + +static void section_free_arg(CONF_SECTION *section, void *arg) { + section_free(section); +} + +void NCONF_free(CONF *conf) { + if (conf == nullptr) { + return; + } + + lh_CONF_SECTION_doall_arg(conf->sections, section_free_arg, nullptr); + lh_CONF_SECTION_free(conf->sections); + 
lh_CONF_VALUE_doall_arg(conf->values, value_free_arg, nullptr); + lh_CONF_VALUE_free(conf->values); + Delete(conf); +} + +static CONF_SECTION *NCONF_new_section(const CONF *conf, const char *section) { + CONF_SECTION *s = New(); + if (!s) { + return nullptr; + } + s->name = OPENSSL_strdup(section); + s->values = sk_CONF_VALUE_new_null(); + if (s->name == nullptr || s->values == nullptr) { + goto err; + } + + CONF_SECTION *old_section; + if (!lh_CONF_SECTION_insert(conf->sections, &old_section, s)) { + goto err; + } + section_free(old_section); + return s; + +err: + section_free(s); + return nullptr; +} + +static int is_comment(char c) { return c == '#'; } + +static int is_quote(char c) { return c == '"' || c == '\'' || c == '`'; } + +static int is_esc(char c) { return c == '\\'; } + +static int is_conf_ws(char c) { + // This differs from |OPENSSL_isspace| in that CONF does not accept '\v' and + // '\f' as whitespace. + return c == ' ' || c == '\t' || c == '\r' || c == '\n'; +} + +static int is_name_char(char c) { + // Alphanumeric characters, and a handful of symbols, may appear in value and + // section names without escaping. + return OPENSSL_isalnum(c) || c == '_' || c == '!' || c == '.' || c == '%' || + c == '&' || c == '*' || c == '+' || c == ',' || c == '/' || c == ';' || + c == '?' 
|| c == '@' || c == '^' || c == '~' || c == '|' || c == '-'; +} + +static int str_copy(CONF *conf, char *section, char **pto, char *from) { + int q, to = 0, len = 0; + char v; + BUF_MEM *buf; + + buf = BUF_MEM_new(); + if (buf == nullptr) { + return 0; + } + + len = strlen(from) + 1; + if (!BUF_MEM_grow(buf, len)) { + goto err; + } + + for (;;) { + if (is_quote(*from)) { + q = *from; + from++; + while (*from != '\0' && *from != q) { + if (is_esc(*from)) { + from++; + if (*from == '\0') { + break; + } + } + buf->data[to++] = *(from++); + } + if (*from == q) { + from++; + } + } else if (is_esc(*from)) { + from++; + v = *(from++); + if (v == '\0') { + break; + } else if (v == 'r') { + v = '\r'; + } else if (v == 'n') { + v = '\n'; + } else if (v == 'b') { + v = '\b'; + } else if (v == 't') { + v = '\t'; + } + buf->data[to++] = v; + } else if (*from == '\0') { + break; + } else if (*from == '$') { + // Historically, $foo would expand to a previously-parsed value. This + // feature has been removed as it was unused and is a DoS vector. If + // trying to embed '$' in a line, either escape it or wrap the value in + // quotes. 
+ OPENSSL_PUT_ERROR(CONF, CONF_R_VARIABLE_EXPANSION_NOT_SUPPORTED); + goto err; + } else { + buf->data[to++] = *(from++); + } + } + + buf->data[to] = '\0'; + OPENSSL_free(*pto); + *pto = buf->data; + OPENSSL_free(buf); + return 1; + +err: + BUF_MEM_free(buf); + return 0; +} + +static CONF_SECTION *get_section(const CONF *conf, const char *section) { + CONF_SECTION templ; + OPENSSL_memset(&templ, 0, sizeof(templ)); + templ.name = (char *)section; + return lh_CONF_SECTION_retrieve(conf->sections, &templ); +} + +const STACK_OF(CONF_VALUE) *NCONF_get_section(const CONF *conf, + const char *section) { + const CONF_SECTION *section_obj = get_section(conf, section); + if (section_obj == nullptr) { + return nullptr; + } + return section_obj->values; +} + +const char *NCONF_get_string(const CONF *conf, const char *section, + const char *name) { + CONF_VALUE templ, *value; + + if (section == nullptr) { + section = kDefaultSectionName; + } + + OPENSSL_memset(&templ, 0, sizeof(templ)); + templ.section = (char *)section; + templ.name = (char *)name; + value = lh_CONF_VALUE_retrieve(conf->values, &templ); + if (value == nullptr) { + return nullptr; + } + return value->value; +} + +static int add_string(const CONF *conf, CONF_SECTION *section, + CONF_VALUE *value) { + value->section = OPENSSL_strdup(section->name); + if (value->section == nullptr) { + return 0; + } + + if (!sk_CONF_VALUE_push(section->values, value)) { + return 0; + } + + CONF_VALUE *old_value; + if (!lh_CONF_VALUE_insert(conf->values, &old_value, value)) { + // Remove |value| from |section->values|, so we do not leave a dangling + // pointer. + sk_CONF_VALUE_pop(section->values); + return 0; + } + if (old_value != nullptr) { + (void)sk_CONF_VALUE_delete_ptr(section->values, old_value); + value_free(old_value); + } + + return 1; +} + +static char *eat_ws(char *p) { + while (*p != '\0' && is_conf_ws(*p)) { + p++; + } + return p; +} + +static char *scan_esc(char *p) { + assert(p[0] == '\\'); + return p[1] == '\0' ? 
p + 1 : p + 2; +} + +static char *eat_name(char *p) { + for (;;) { + if (is_esc(*p)) { + p = scan_esc(p); + continue; + } + if (!is_name_char(*p)) { + return p; + } + p++; + } +} + +static char *scan_quote(char *p) { + int q = *p; + + p++; + while (*p != '\0' && *p != q) { + if (is_esc(*p)) { + p++; + if (*p == '\0') { + return p; + } + } + p++; + } + if (*p == q) { + p++; + } + return p; +} + +static void clear_comments(char *p) { + for (;;) { + if (!is_conf_ws(*p)) { + break; + } + p++; + } + + for (;;) { + if (is_comment(*p)) { + *p = '\0'; + return; + } + if (is_quote(*p)) { + p = scan_quote(p); + continue; + } + if (is_esc(*p)) { + p = scan_esc(p); + continue; + } + if (*p == '\0') { + return; + } else { + p++; + } + } +} + +int NCONF_load_bio(CONF *conf, BIO *in, long *out_error_line) { + static const size_t CONFBUFSIZE = 512; + int bufnum = 0, i, ii; + BUF_MEM *buff = nullptr; + char *s, *p, *end; + int again; + long eline = 0; + CONF_VALUE *v = nullptr; + CONF_SECTION *sv = nullptr; + char *section = nullptr, *buf; + char *start, *psection, *pname; + + if ((buff = BUF_MEM_new()) == nullptr) { + OPENSSL_PUT_ERROR(CONF, ERR_R_BUF_LIB); + goto err; + } + + section = OPENSSL_strdup(kDefaultSectionName); + if (section == nullptr) { + goto err; + } + + sv = NCONF_new_section(conf, section); + if (sv == nullptr) { + OPENSSL_PUT_ERROR(CONF, CONF_R_UNABLE_TO_CREATE_NEW_SECTION); + goto err; + } + + bufnum = 0; + again = 0; + for (;;) { + if (!BUF_MEM_grow(buff, bufnum + CONFBUFSIZE)) { + OPENSSL_PUT_ERROR(CONF, ERR_R_BUF_LIB); + goto err; + } + p = &(buff->data[bufnum]); + *p = '\0'; + BIO_gets(in, p, CONFBUFSIZE - 1); + p[CONFBUFSIZE - 1] = '\0'; + ii = i = strlen(p); + if (i == 0 && !again) { + break; + } + again = 0; + while (i > 0) { + if ((p[i - 1] != '\r') && (p[i - 1] != '\n')) { + break; + } else { + i--; + } + } + // we removed some trailing stuff so there is a new + // line on the end. 
+ if (ii && i == ii) { + again = 1; // long line + } else { + p[i] = '\0'; + eline++; // another input line + } + + // we now have a line with trailing \r\n removed + + // i is the number of bytes + bufnum += i; + + v = nullptr; + // check for line continuation + if (bufnum >= 1) { + // If we have bytes and the last char '\\' and + // second last char is not '\\' + p = &(buff->data[bufnum - 1]); + if (is_esc(p[0]) && ((bufnum <= 1) || !is_esc(p[-1]))) { + bufnum--; + again = 1; + } + } + if (again) { + continue; + } + bufnum = 0; + buf = buff->data; + + clear_comments(buf); + s = eat_ws(buf); + if (*s == '\0') { + continue; // blank line + } + if (*s == '[') { + char *ss; + + s++; + start = eat_ws(s); + ss = start; + again: + end = eat_name(ss); + p = eat_ws(end); + if (*p != ']') { + if (*p != '\0' && ss != p) { + ss = p; + goto again; + } + OPENSSL_PUT_ERROR(CONF, CONF_R_MISSING_CLOSE_SQUARE_BRACKET); + goto err; + } + *end = '\0'; + if (!str_copy(conf, nullptr, §ion, start)) { + goto err; + } + if ((sv = get_section(conf, section)) == nullptr) { + sv = NCONF_new_section(conf, section); + } + if (sv == nullptr) { + OPENSSL_PUT_ERROR(CONF, CONF_R_UNABLE_TO_CREATE_NEW_SECTION); + goto err; + } + continue; + } else { + pname = s; + psection = nullptr; + end = eat_name(s); + if ((end[0] == ':') && (end[1] == ':')) { + *end = '\0'; + end += 2; + psection = pname; + pname = end; + end = eat_name(end); + } + p = eat_ws(end); + if (*p != '=') { + OPENSSL_PUT_ERROR(CONF, CONF_R_MISSING_EQUAL_SIGN); + goto err; + } + *end = '\0'; + p++; + start = eat_ws(p); + while (*p != '\0') { + p++; + } + p--; + while (p != start && is_conf_ws(*p)) { + p--; + } + p++; + *p = '\0'; + + if (!(v = CONF_VALUE_new())) { + goto err; + } + if (psection == nullptr) { + psection = section; + } + v->name = OPENSSL_strdup(pname); + if (v->name == nullptr) { + goto err; + } + if (!str_copy(conf, psection, &(v->value), start)) { + goto err; + } + + CONF_SECTION *tv; + if (strcmp(psection, section) 
!= 0) { + if ((tv = get_section(conf, psection)) == nullptr) { + tv = NCONF_new_section(conf, psection); + } + if (tv == nullptr) { + OPENSSL_PUT_ERROR(CONF, CONF_R_UNABLE_TO_CREATE_NEW_SECTION); + goto err; + } + } else { + tv = sv; + } + if (add_string(conf, tv, v) == 0) { + goto err; + } + v = nullptr; + } + } + BUF_MEM_free(buff); + Delete(section); + return 1; + +err: + BUF_MEM_free(buff); + Delete(section); + if (out_error_line != nullptr) { + *out_error_line = eline; + } + ERR_add_error_dataf("line %ld", eline); + value_free(v); + return 0; +} + +int NCONF_load(CONF *conf, const char *filename, long *out_error_line) { + BIO *in = BIO_new_file(filename, "rb"); + int ret; + + if (in == nullptr) { + OPENSSL_PUT_ERROR(CONF, ERR_R_SYS_LIB); + return 0; + } + + ret = NCONF_load_bio(conf, in, out_error_line); + BIO_free(in); + + return ret; +} + +int bssl::CONF_parse_list(const char *list, char sep, int remove_whitespace, + int (*list_cb)(const char *elem, size_t len, + void *usr), + void *arg) { + int ret; + const char *lstart, *tmpend, *p; + + if (list == nullptr) { + OPENSSL_PUT_ERROR(CONF, CONF_R_LIST_CANNOT_BE_NULL); + return 0; + } + + lstart = list; + for (;;) { + if (remove_whitespace) { + while (*lstart && OPENSSL_isspace((unsigned char)*lstart)) { + lstart++; + } + } + p = strchr(lstart, sep); + if (p == lstart || !*lstart) { + ret = list_cb(nullptr, 0, arg); + } else { + if (p) { + tmpend = p - 1; + } else { + tmpend = lstart + strlen(lstart) - 1; + } + if (remove_whitespace) { + while (OPENSSL_isspace((unsigned char)*tmpend)) { + tmpend--; + } + } + ret = list_cb(lstart, tmpend - lstart + 1, arg); + } + if (ret <= 0) { + return ret; + } + if (p == nullptr) { + return 1; + } + lstart = p + 1; + } +} + +int CONF_modules_load_file(const char *filename, const char *appname, + unsigned long flags) { + return 1; +} + +void CONF_modules_unload(int all) {} + +void CONF_modules_free() {} + +void OPENSSL_config(const char *config_name) {} + +void 
OPENSSL_no_config() {} diff --git a/third_party/boringssl/src/crypto/conf/conf_def.h b/third_party/boringssl/src/crypto/conf/conf_def.h deleted file mode 100644 index b1e6ba63..00000000 --- a/third_party/boringssl/src/crypto/conf/conf_def.h +++ /dev/null @@ -1,127 +0,0 @@ -/* crypto/conf/conf_def.h */ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. 
All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] - */ - -/* THIS FILE WAS AUTOMAGICALLY GENERATED! - Please modify and use keysets.pl to regenerate it. 
*/ - -#define CONF_NUMBER 1 -#define CONF_UPPER 2 -#define CONF_LOWER 4 -#define CONF_UNDER 256 -#define CONF_PUNCTUATION 512 -#define CONF_WS 16 -#define CONF_ESC 32 -#define CONF_QUOTE 64 -#define CONF_DQUOTE 1024 -#define CONF_COMMENT 128 -#define CONF_FCOMMENT 2048 -#define CONF_EOF 8 -#define CONF_HIGHBIT 4096 -#define CONF_ALPHA (CONF_UPPER|CONF_LOWER) -#define CONF_ALPHA_NUMERIC (CONF_ALPHA|CONF_NUMBER|CONF_UNDER) -#define CONF_ALPHA_NUMERIC_PUNCT (CONF_ALPHA|CONF_NUMBER|CONF_UNDER| \ - CONF_PUNCTUATION) - -#define KEYTYPES(c) CONF_type_default -#define IS_COMMENT(c,a) (KEYTYPES(c)[(a)&0xff]&CONF_COMMENT) -#define IS_FCOMMENT(c,a) (KEYTYPES(c)[(a)&0xff]&CONF_FCOMMENT) -#define IS_EOF(c,a) (KEYTYPES(c)[(a)&0xff]&CONF_EOF) -#define IS_ESC(c,a) (KEYTYPES(c)[(a)&0xff]&CONF_ESC) -#define IS_NUMBER(c,a) (KEYTYPES(c)[(a)&0xff]&CONF_NUMBER) -#define IS_WS(c,a) (KEYTYPES(c)[(a)&0xff]&CONF_WS) -#define IS_ALPHA_NUMERIC(c,a) (KEYTYPES(c)[(a)&0xff]&CONF_ALPHA_NUMERIC) -#define IS_ALPHA_NUMERIC_PUNCT(c,a) \ - (KEYTYPES(c)[(a)&0xff]&CONF_ALPHA_NUMERIC_PUNCT) -#define IS_QUOTE(c,a) (KEYTYPES(c)[(a)&0xff]&CONF_QUOTE) -#define IS_DQUOTE(c,a) (KEYTYPES(c)[(a)&0xff]&CONF_DQUOTE) -#define IS_HIGHBIT(c,a) (KEYTYPES(c)[(a)&0xff]&CONF_HIGHBIT) - -static const unsigned short CONF_type_default[256]={ - 0x0008,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, - 0x0000,0x0010,0x0010,0x0000,0x0000,0x0010,0x0000,0x0000, - 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, - 0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000,0x0000, - 0x0010,0x0200,0x0040,0x0080,0x0000,0x0200,0x0200,0x0040, - 0x0000,0x0000,0x0200,0x0200,0x0200,0x0200,0x0200,0x0200, - 0x0001,0x0001,0x0001,0x0001,0x0001,0x0001,0x0001,0x0001, - 0x0001,0x0001,0x0000,0x0200,0x0000,0x0000,0x0000,0x0200, - 0x0200,0x0002,0x0002,0x0002,0x0002,0x0002,0x0002,0x0002, - 0x0002,0x0002,0x0002,0x0002,0x0002,0x0002,0x0002,0x0002, - 0x0002,0x0002,0x0002,0x0002,0x0002,0x0002,0x0002,0x0002, - 
0x0002,0x0002,0x0002,0x0000,0x0020,0x0000,0x0200,0x0100, - 0x0040,0x0004,0x0004,0x0004,0x0004,0x0004,0x0004,0x0004, - 0x0004,0x0004,0x0004,0x0004,0x0004,0x0004,0x0004,0x0004, - 0x0004,0x0004,0x0004,0x0004,0x0004,0x0004,0x0004,0x0004, - 0x0004,0x0004,0x0004,0x0000,0x0200,0x0000,0x0200,0x0000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - 0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000,0x1000, - }; diff --git a/third_party/boringssl/src/crypto/conf/internal.h b/third_party/boringssl/src/crypto/conf/internal.h index 3e0e57df..7fe35aa2 100644 --- a/third_party/boringssl/src/crypto/conf/internal.h +++ b/third_party/boringssl/src/crypto/conf/internal.h @@ -1,31 +1,54 @@ -/* Copyright (c) 2015, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ +// Copyright 2015 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. #ifndef OPENSSL_HEADER_CRYPTO_CONF_INTERNAL_H #define OPENSSL_HEADER_CRYPTO_CONF_INTERNAL_H -#if defined(__cplusplus) -extern "C" { -#endif +#include +#include "../lhash/internal.h" + + +BSSL_NAMESPACE_BEGIN + +typedef struct conf_section_st CONF_SECTION; + +DEFINE_LHASH_OF(CONF_SECTION) +DEFINE_LHASH_OF(CONF_VALUE) + +BSSL_NAMESPACE_END + +struct conf_st { + LHASH_OF(CONF_VALUE) *values; + LHASH_OF(CONF_SECTION) *sections; +}; + +BSSL_NAMESPACE_BEGIN // CONF_VALUE_new returns a freshly allocated and zeroed |CONF_VALUE|. -CONF_VALUE *CONF_VALUE_new(void); +CONF_VALUE *CONF_VALUE_new(); +// CONF_parse_list takes a list separated by 'sep' and calls |list_cb| giving +// the start and length of each member, optionally stripping leading and +// trailing whitespace. This can be used to parse comma separated lists for +// example. If |list_cb| returns <= 0, then the iteration is halted and that +// value is returned immediately. Otherwise it returns one. Note that |list_cb| +// may be called on an empty member. 
+OPENSSL_EXPORT int CONF_parse_list( + const char *list, char sep, int remove_whitespace, + int (*list_cb)(const char *elem, size_t len, void *usr), void *arg); -#if defined(__cplusplus) -} // extern C -#endif +BSSL_NAMESPACE_END #endif // OPENSSL_HEADER_CRYPTO_CONF_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/cpu_aarch64_apple.c b/third_party/boringssl/src/crypto/cpu_aarch64_apple.c deleted file mode 100644 index 6699ff7e..00000000 --- a/third_party/boringssl/src/crypto/cpu_aarch64_apple.c +++ /dev/null @@ -1,72 +0,0 @@ -/* Copyright (c) 2021, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include "internal.h" - -#if defined(OPENSSL_AARCH64) && defined(OPENSSL_APPLE) && \ - !defined(OPENSSL_STATIC_ARMCAP) - -#include -#include - -#include - - -extern uint32_t OPENSSL_armcap_P; - -static int has_hw_feature(const char *name) { - int value; - size_t len = sizeof(value); - if (sysctlbyname(name, &value, &len, NULL, 0) != 0) { - return 0; - } - if (len != sizeof(int)) { - // This should not happen. All the values queried should be integer-valued. - assert(0); - return 0; - } - - // Per sys/sysctl.h: - // - // Selectors that return errors are not support on the system. 
Supported - // features will return 1 if they are recommended or 0 if they are supported - // but are not expected to help performance. Future versions of these - // selectors may return larger values as necessary so it is best to test for - // non zero. - return value != 0; -} - -void OPENSSL_cpuid_setup(void) { - // Apple ARM64 platforms have NEON and cryptography extensions available - // statically, so we do not need to query them. In particular, there sometimes - // are no sysctls corresponding to such features. See below. -#if !defined(__ARM_NEON) || !defined(__ARM_FEATURE_AES) || \ - !defined(__ARM_FEATURE_SHA2) -#error "NEON and crypto extensions should be statically available." -#endif - OPENSSL_armcap_P = - ARMV7_NEON | ARMV8_AES | ARMV8_PMULL | ARMV8_SHA1 | ARMV8_SHA256; - - // macOS has sysctls named both like "hw.optional.arm.FEAT_SHA512" and like - // "hw.optional.armv8_2_sha512". There does not appear to be documentation on - // which to use. The "armv8_2_sha512" style omits statically-available - // features, while the "FEAT_SHA512" style includes them. However, the - // "FEAT_SHA512" style was added in macOS 12, so we use the older style for - // better compatibility and handle static features above. - if (has_hw_feature("hw.optional.armv8_2_sha512")) { - OPENSSL_armcap_P |= ARMV8_SHA512; - } -} - -#endif // OPENSSL_AARCH64 && OPENSSL_APPLE && !OPENSSL_STATIC_ARMCAP diff --git a/third_party/boringssl/src/crypto/cpu_aarch64_apple.cc b/third_party/boringssl/src/crypto/cpu_aarch64_apple.cc new file mode 100644 index 00000000..c25b5ac0 --- /dev/null +++ b/third_party/boringssl/src/crypto/cpu_aarch64_apple.cc @@ -0,0 +1,76 @@ +// Copyright 2021 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "internal.h" + +#if defined(OPENSSL_AARCH64) && defined(OPENSSL_APPLE) && \ + !defined(OPENSSL_STATIC_ARMCAP) && !defined(OPENSSL_NO_ASM) + +#include +#include + + +using namespace bssl; + +static int has_hw_feature(const char *name) { + int value; + size_t len = sizeof(value); + if (sysctlbyname(name, &value, &len, nullptr, 0) != 0) { + return 0; + } + if (len != sizeof(int)) { + // This should not happen. All the values queried should be integer-valued. + assert(0); + return 0; + } + + // Per sys/sysctl.h: + // + // Selectors that return errors are not support on the system. Supported + // features will return 1 if they are recommended or 0 if they are supported + // but are not expected to help performance. Future versions of these + // selectors may return larger values as necessary so it is best to test for + // non zero. + return value != 0; +} + +void bssl::OPENSSL_cpuid_setup() { + // Apple ARM64 platforms have NEON and cryptography extensions available + // statically, so we do not need to query them. In particular, there sometimes + // are no sysctls corresponding to such features. See below. +#if !defined(__ARM_NEON) || !defined(__ARM_FEATURE_AES) || \ + !defined(__ARM_FEATURE_SHA2) +#error "NEON and crypto extensions should be statically available." 
+#endif + OPENSSL_armcap_P = + ARMV7_NEON | ARMV8_AES | ARMV8_PMULL | ARMV8_SHA1 | ARMV8_SHA256; + + // See Apple's documentation for sysctl names: + // https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics + // + // The new feature names, e.g. "hw.optional.arm.FEAT_SHA512", are only + // available in macOS 12. For compatibility with macOS 11, we also support + // the old names. The old names don't have values for features like FEAT_AES, + // so instead we detect them statically above. + if (has_hw_feature("hw.optional.arm.FEAT_SHA512") || + has_hw_feature("hw.optional.armv8_2_sha512")) { + OPENSSL_armcap_P |= ARMV8_SHA512; + } + if (has_hw_feature("hw.optional.arm.FEAT_SHA3") || + has_hw_feature("hw.optional.armv8_2_sha3")) { + OPENSSL_armcap_P |= ARMV8_SHA3; + } +} + +#endif // OPENSSL_AARCH64 && OPENSSL_APPLE && !OPENSSL_STATIC_ARMCAP diff --git a/third_party/boringssl/src/crypto/cpu_aarch64_fuchsia.c b/third_party/boringssl/src/crypto/cpu_aarch64_fuchsia.c deleted file mode 100644 index 64bc4489..00000000 --- a/third_party/boringssl/src/crypto/cpu_aarch64_fuchsia.c +++ /dev/null @@ -1,56 +0,0 @@ -/* Copyright (c) 2018, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ - -#include "internal.h" - -#if defined(OPENSSL_AARCH64) && defined(OPENSSL_FUCHSIA) && \ - !defined(OPENSSL_STATIC_ARMCAP) - -#include -#include -#include - -#include - -extern uint32_t OPENSSL_armcap_P; - -void OPENSSL_cpuid_setup(void) { - uint32_t hwcap; - zx_status_t rc = zx_system_get_features(ZX_FEATURE_KIND_CPU, &hwcap); - if (rc != ZX_OK || (hwcap & ZX_ARM64_FEATURE_ISA_ASIMD) == 0) { - // If NEON/ASIMD is missing, don't report other features either. This - // matches OpenSSL, and the other features depend on SIMD registers. - return; - } - - OPENSSL_armcap_P |= ARMV7_NEON; - - if (hwcap & ZX_ARM64_FEATURE_ISA_AES) { - OPENSSL_armcap_P |= ARMV8_AES; - } - if (hwcap & ZX_ARM64_FEATURE_ISA_PMULL) { - OPENSSL_armcap_P |= ARMV8_PMULL; - } - if (hwcap & ZX_ARM64_FEATURE_ISA_SHA1) { - OPENSSL_armcap_P |= ARMV8_SHA1; - } - if (hwcap & ZX_ARM64_FEATURE_ISA_SHA256) { - OPENSSL_armcap_P |= ARMV8_SHA256; - } - if (hwcap & ZX_ARM64_FEATURE_ISA_SHA512) { - OPENSSL_armcap_P |= ARMV8_SHA512; - } -} - -#endif // OPENSSL_AARCH64 && OPENSSL_FUCHSIA && !OPENSSL_STATIC_ARMCAP diff --git a/third_party/boringssl/src/crypto/cpu_aarch64_fuchsia.cc b/third_party/boringssl/src/crypto/cpu_aarch64_fuchsia.cc new file mode 100644 index 00000000..88db3b2d --- /dev/null +++ b/third_party/boringssl/src/crypto/cpu_aarch64_fuchsia.cc @@ -0,0 +1,58 @@ +// Copyright 2018 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "internal.h" + +#if defined(OPENSSL_AARCH64) && defined(OPENSSL_FUCHSIA) && \ + !defined(OPENSSL_STATIC_ARMCAP) && !defined(OPENSSL_NO_ASM) + +#include +#include +#include + + +using namespace bssl; + +void bssl::OPENSSL_cpuid_setup() { + uint32_t hwcap; + zx_status_t rc = zx_system_get_features(ZX_FEATURE_KIND_CPU, &hwcap); + if (rc != ZX_OK || (hwcap & ZX_ARM64_FEATURE_ISA_ASIMD) == 0) { + // If NEON/ASIMD is missing, don't report other features either. This + // matches OpenSSL, and the other features depend on SIMD registers. + return; + } + + OPENSSL_armcap_P |= ARMV7_NEON; + + if (hwcap & ZX_ARM64_FEATURE_ISA_AES) { + OPENSSL_armcap_P |= ARMV8_AES; + } + if (hwcap & ZX_ARM64_FEATURE_ISA_PMULL) { + OPENSSL_armcap_P |= ARMV8_PMULL; + } + if (hwcap & ZX_ARM64_FEATURE_ISA_SHA1) { + OPENSSL_armcap_P |= ARMV8_SHA1; + } + if (hwcap & ZX_ARM64_FEATURE_ISA_SHA256) { + OPENSSL_armcap_P |= ARMV8_SHA256; + } + if (hwcap & ZX_ARM64_FEATURE_ISA_SHA512) { + OPENSSL_armcap_P |= ARMV8_SHA512; + } + if (hwcap & ZX_ARM64_FEATURE_ISA_SHA3) { + OPENSSL_armcap_P |= ARMV8_SHA3; + } +} + +#endif // OPENSSL_AARCH64 && OPENSSL_FUCHSIA && !OPENSSL_STATIC_ARMCAP diff --git a/third_party/boringssl/src/crypto/cpu_aarch64_linux.c b/third_party/boringssl/src/crypto/cpu_aarch64_linux.c deleted file mode 100644 index 42227115..00000000 --- a/third_party/boringssl/src/crypto/cpu_aarch64_linux.c +++ /dev/null @@ -1,64 +0,0 @@ -/* Copyright (c) 2016, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include "internal.h" - -#if defined(OPENSSL_AARCH64) && defined(OPENSSL_LINUX) && \ - !defined(OPENSSL_STATIC_ARMCAP) - -#include - -#include - - -extern uint32_t OPENSSL_armcap_P; - -void OPENSSL_cpuid_setup(void) { - unsigned long hwcap = getauxval(AT_HWCAP); - - // See /usr/include/asm/hwcap.h on an aarch64 installation for the source of - // these values. - static const unsigned long kNEON = 1 << 1; - static const unsigned long kAES = 1 << 3; - static const unsigned long kPMULL = 1 << 4; - static const unsigned long kSHA1 = 1 << 5; - static const unsigned long kSHA256 = 1 << 6; - static const unsigned long kSHA512 = 1 << 21; - - if ((hwcap & kNEON) == 0) { - // Matching OpenSSL, if NEON is missing, don't report other features - // either. - return; - } - - OPENSSL_armcap_P |= ARMV7_NEON; - - if (hwcap & kAES) { - OPENSSL_armcap_P |= ARMV8_AES; - } - if (hwcap & kPMULL) { - OPENSSL_armcap_P |= ARMV8_PMULL; - } - if (hwcap & kSHA1) { - OPENSSL_armcap_P |= ARMV8_SHA1; - } - if (hwcap & kSHA256) { - OPENSSL_armcap_P |= ARMV8_SHA256; - } - if (hwcap & kSHA512) { - OPENSSL_armcap_P |= ARMV8_SHA512; - } -} - -#endif // OPENSSL_AARCH64 && OPENSSL_LINUX && !OPENSSL_STATIC_ARMCAP diff --git a/third_party/boringssl/src/crypto/cpu_aarch64_linux.cc b/third_party/boringssl/src/crypto/cpu_aarch64_linux.cc new file mode 100644 index 00000000..527c7f03 --- /dev/null +++ b/third_party/boringssl/src/crypto/cpu_aarch64_linux.cc @@ -0,0 +1,66 @@ +// Copyright 2016 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "internal.h" + +#if defined(OPENSSL_AARCH64) && defined(OPENSSL_LINUX) && \ + !defined(OPENSSL_STATIC_ARMCAP) && !defined(OPENSSL_NO_ASM) + +#include + + +using namespace bssl; + +void bssl::OPENSSL_cpuid_setup() { + unsigned long hwcap = getauxval(AT_HWCAP); + + // See /usr/include/asm/hwcap.h on an aarch64 installation for the source of + // these values. + static const unsigned long kNEON = 1 << 1; + static const unsigned long kAES = 1 << 3; + static const unsigned long kPMULL = 1 << 4; + static const unsigned long kSHA1 = 1 << 5; + static const unsigned long kSHA256 = 1 << 6; + static const unsigned long kSHA3 = 1 << 17; + static const unsigned long kSHA512 = 1 << 21; + + if ((hwcap & kNEON) == 0) { + // Matching OpenSSL, if NEON is missing, don't report other features + // either. 
+ return; + } + + OPENSSL_armcap_P |= ARMV7_NEON; + + if (hwcap & kAES) { + OPENSSL_armcap_P |= ARMV8_AES; + } + if (hwcap & kPMULL) { + OPENSSL_armcap_P |= ARMV8_PMULL; + } + if (hwcap & kSHA1) { + OPENSSL_armcap_P |= ARMV8_SHA1; + } + if (hwcap & kSHA256) { + OPENSSL_armcap_P |= ARMV8_SHA256; + } + if (hwcap & kSHA512) { + OPENSSL_armcap_P |= ARMV8_SHA512; + } + if (hwcap & kSHA3) { + OPENSSL_armcap_P |= ARMV8_SHA3; + } +} + +#endif // OPENSSL_AARCH64 && OPENSSL_LINUX && !OPENSSL_STATIC_ARMCAP diff --git a/third_party/boringssl/src/crypto/cpu_aarch64_openbsd.cc b/third_party/boringssl/src/crypto/cpu_aarch64_openbsd.cc new file mode 100644 index 00000000..235c14ae --- /dev/null +++ b/third_party/boringssl/src/crypto/cpu_aarch64_openbsd.cc @@ -0,0 +1,45 @@ +// Copyright (c) 2022, Robert Nagy +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#if defined(OPENSSL_AARCH64) && defined(OPENSSL_OPENBSD) && \ + !defined(OPENSSL_STATIC_ARMCAP) && !defined(OPENSSL_NO_ASM) + +#include +#include +#include + +#include "armv8_feature_parsing.h" +#include "internal.h" + + +using namespace bssl; + +void bssl::OPENSSL_cpuid_setup() { + int isar0_mib[] = {CTL_MACHDEP, CPU_ID_AA64ISAR0}; + uint64_t cpu_id = 0; + size_t len = sizeof(cpu_id); + + if (sysctl(isar0_mib, 2, &cpu_id, &len, nullptr, 0) < 0) { + return; + } + + OPENSSL_armcap_P |= ARMV7_NEON; + + // Use the common parsing function to check other features. 
+ OPENSSL_armcap_P |= armcap::ParseISAR0Flags(cpu_id); +} + +#endif // OPENSSL_AARCH64 && OPENSSL_OPENBSD && !OPENSSL_STATIC_ARMCAP diff --git a/third_party/boringssl/src/crypto/cpu_aarch64_sysreg.cc b/third_party/boringssl/src/crypto/cpu_aarch64_sysreg.cc new file mode 100644 index 00000000..770be1af --- /dev/null +++ b/third_party/boringssl/src/crypto/cpu_aarch64_sysreg.cc @@ -0,0 +1,66 @@ +// Copyright 2023 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "internal.h" + +// While Arm system registers are normally not available to userspace, FreeBSD +// expects userspace to simply read them. It traps the reads and fills in CPU +// capabilities. +#if defined(OPENSSL_AARCH64) && !defined(OPENSSL_STATIC_ARMCAP) && \ + (defined(ANDROID_BAREMETAL) || defined(OPENSSL_FREEBSD)) && \ + !defined(OPENSSL_NO_ASM) + +#include "./armv8_feature_parsing.h" + +BSSL_NAMESPACE_BEGIN + +#define ID_AA64PFR0_EL1_ADVSIMD 5 + +#define READ_SYSREG(name) \ + ({ \ + uint64_t _r; \ + __asm__("mrs %0, " name : "=r"(_r)); \ + _r; \ + }) + +// We use the common GetIDField helper now, but need a signed variant +// for the NEON check using ID_AA64PFR0_EL1. 
+static int GetSignedIDField(uint64_t reg, unsigned field) { + unsigned value = armcap::GetIDField(reg, field); + if (value & (1 << (NBITS_ID_FIELD - 1))) { + return (int)(value | (UINT64_MAX << NBITS_ID_FIELD)); + } else { + return (int)value; + } +} + +void OPENSSL_cpuid_setup() { + uint64_t id_aa64pfr0_el1 = READ_SYSREG("id_aa64pfr0_el1"); + if (GetSignedIDField(id_aa64pfr0_el1, ID_AA64PFR0_EL1_ADVSIMD) < 0) { + // If AdvSIMD ("NEON") is missing, don't report other features either. + // This matches OpenSSL. + return; + } + + // Use the common parsing function to check all cryptographic features. + uint64_t id_aa64isar0_el1 = READ_SYSREG("id_aa64isar0_el1"); + OPENSSL_armcap_P |= ARMV7_NEON | armcap::ParseISAR0Flags(id_aa64isar0_el1); +} + +BSSL_NAMESPACE_END + +#endif // OPENSSL_AARCH64 && !OPENSSL_STATIC_ARMCAP && + // (ANDROID_BAREMETAL || OPENSSL_FREEBSD) diff --git a/third_party/boringssl/src/crypto/cpu_aarch64_win.c b/third_party/boringssl/src/crypto/cpu_aarch64_win.c deleted file mode 100644 index 0630f96a..00000000 --- a/third_party/boringssl/src/crypto/cpu_aarch64_win.c +++ /dev/null @@ -1,41 +0,0 @@ -/* Copyright (c) 2018, Google Inc. - * Copyright (c) 2020, Arm Ltd. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ - -#include "internal.h" - -#if defined(OPENSSL_AARCH64) && defined(OPENSSL_WINDOWS) && \ - !defined(OPENSSL_STATIC_ARMCAP) - -#include - -#include - -extern uint32_t OPENSSL_armcap_P; -void OPENSSL_cpuid_setup(void) { - // We do not need to check for the presence of NEON, as Armv8-A always has it - OPENSSL_armcap_P |= ARMV7_NEON; - - if (IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE)) { - // These are all covered by one call in Windows - OPENSSL_armcap_P |= ARMV8_AES; - OPENSSL_armcap_P |= ARMV8_PMULL; - OPENSSL_armcap_P |= ARMV8_SHA1; - OPENSSL_armcap_P |= ARMV8_SHA256; - } - // As of writing, Windows does not have a |PF_*| value for ARMv8.2 SHA-512 - // extensions. When it does, add it here. -} - -#endif // OPENSSL_AARCH64 && OPENSSL_WINDOWS && !OPENSSL_STATIC_ARMCAP diff --git a/third_party/boringssl/src/crypto/cpu_aarch64_win.cc b/third_party/boringssl/src/crypto/cpu_aarch64_win.cc new file mode 100644 index 00000000..4d51bbda --- /dev/null +++ b/third_party/boringssl/src/crypto/cpu_aarch64_win.cc @@ -0,0 +1,52 @@ +// Copyright 2018 The BoringSSL Authors +// Copyright (c) 2020, Arm Ltd. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "internal.h" + +#if defined(OPENSSL_AARCH64) && defined(OPENSSL_WINDOWS) && \ + !defined(OPENSSL_STATIC_ARMCAP) && !defined(OPENSSL_NO_ASM) + +#include + + +#if !defined(PF_ARM_SHA3_INSTRUCTIONS_AVAILABLE) +#define PF_ARM_SHA3_INSTRUCTIONS_AVAILABLE 64 +#endif +#if !defined(PF_ARM_SHA512_INSTRUCTIONS_AVAILABLE) +#define PF_ARM_SHA512_INSTRUCTIONS_AVAILABLE 65 +#endif + +using namespace bssl; + +void bssl::OPENSSL_cpuid_setup() { + // We do not need to check for the presence of NEON, as Armv8-A always has it + OPENSSL_armcap_P |= ARMV7_NEON; + + if (IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE)) { + // These are all covered by one call in Windows + OPENSSL_armcap_P |= ARMV8_AES; + OPENSSL_armcap_P |= ARMV8_PMULL; + OPENSSL_armcap_P |= ARMV8_SHA1; + OPENSSL_armcap_P |= ARMV8_SHA256; + } + if (IsProcessorFeaturePresent(PF_ARM_SHA512_INSTRUCTIONS_AVAILABLE)) { + OPENSSL_armcap_P |= ARMV8_SHA512; + } + if (IsProcessorFeaturePresent(PF_ARM_SHA3_INSTRUCTIONS_AVAILABLE)) { + OPENSSL_armcap_P |= ARMV8_SHA3; + } +} + +#endif // OPENSSL_AARCH64 && OPENSSL_WINDOWS && !OPENSSL_STATIC_ARMCAP diff --git a/third_party/boringssl/src/crypto/cpu_arm.c b/third_party/boringssl/src/crypto/cpu_arm.c deleted file mode 100644 index 33259084..00000000 --- a/third_party/boringssl/src/crypto/cpu_arm.c +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2014, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include "internal.h" - -#if (defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) && \ - !defined(OPENSSL_STATIC_ARMCAP) - -#include - - -extern uint32_t OPENSSL_armcap_P; - -int CRYPTO_is_NEON_capable_at_runtime(void) { - return (OPENSSL_armcap_P & ARMV7_NEON) != 0; -} - -int CRYPTO_is_ARMv8_AES_capable_at_runtime(void) { - return (OPENSSL_armcap_P & ARMV8_AES) != 0; -} - -int CRYPTO_is_ARMv8_PMULL_capable_at_runtime(void) { - return (OPENSSL_armcap_P & ARMV8_PMULL) != 0; -} - -#endif /* (defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) && - !defined(OPENSSL_STATIC_ARMCAP) */ diff --git a/third_party/boringssl/src/crypto/cpu_arm_freebsd.cc b/third_party/boringssl/src/crypto/cpu_arm_freebsd.cc new file mode 100644 index 00000000..7a8d05b6 --- /dev/null +++ b/third_party/boringssl/src/crypto/cpu_arm_freebsd.cc @@ -0,0 +1,55 @@ +// Copyright 2022 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "internal.h" + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && \ + defined(OPENSSL_FREEBSD) && !defined(OPENSSL_STATIC_ARMCAP) +#include +#include + +#include + + +using namespace bssl; + +void bssl::OPENSSL_cpuid_setup() { + unsigned long hwcap = 0, hwcap2 = 0; + + // |elf_aux_info| may fail, in which case |hwcap| and |hwcap2| will be + // left at zero. The rest of this function will then gracefully report + // the features are absent. + elf_aux_info(AT_HWCAP, &hwcap, sizeof(hwcap)); + elf_aux_info(AT_HWCAP2, &hwcap2, sizeof(hwcap2)); + + // Matching OpenSSL, only report other features if NEON is present. + if (hwcap & HWCAP_NEON) { + OPENSSL_armcap_P |= ARMV7_NEON; + + if (hwcap2 & HWCAP2_AES) { + OPENSSL_armcap_P |= ARMV8_AES; + } + if (hwcap2 & HWCAP2_PMULL) { + OPENSSL_armcap_P |= ARMV8_PMULL; + } + if (hwcap2 & HWCAP2_SHA1) { + OPENSSL_armcap_P |= ARMV8_SHA1; + } + if (hwcap2 & HWCAP2_SHA2) { + OPENSSL_armcap_P |= ARMV8_SHA256; + } + } +} + +#endif // OPENSSL_ARM && OPENSSL_OPENBSD && !OPENSSL_STATIC_ARMCAP diff --git a/third_party/boringssl/src/crypto/cpu_arm_linux.c b/third_party/boringssl/src/crypto/cpu_arm_linux.c deleted file mode 100644 index 67e6a1ba..00000000 --- a/third_party/boringssl/src/crypto/cpu_arm_linux.c +++ /dev/null @@ -1,230 +0,0 @@ -/* Copyright (c) 2016, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include "internal.h" - -#if defined(OPENSSL_ARM) && defined(OPENSSL_LINUX) && \ - !defined(OPENSSL_STATIC_ARMCAP) -#include -#include -#include -#include - -#include -#include - -#include "cpu_arm_linux.h" - -#define AT_HWCAP 16 -#define AT_HWCAP2 26 - -// |getauxval| is not available on Android until API level 20. Link it as a weak -// symbol and use other methods as fallback. -unsigned long getauxval(unsigned long type) __attribute__((weak)); - -static int open_eintr(const char *path, int flags) { - int ret; - do { - ret = open(path, flags); - } while (ret < 0 && errno == EINTR); - return ret; -} - -static ssize_t read_eintr(int fd, void *out, size_t len) { - ssize_t ret; - do { - ret = read(fd, out, len); - } while (ret < 0 && errno == EINTR); - return ret; -} - -// read_full reads exactly |len| bytes from |fd| to |out|. On error or end of -// file, it returns zero. -static int read_full(int fd, void *out, size_t len) { - char *outp = out; - while (len > 0) { - ssize_t ret = read_eintr(fd, outp, len); - if (ret <= 0) { - return 0; - } - outp += ret; - len -= ret; - } - return 1; -} - -// read_file opens |path| and reads until end-of-file. On success, it returns -// one and sets |*out_ptr| and |*out_len| to a newly-allocated buffer with the -// contents. Otherwise, it returns zero. 
-static int read_file(char **out_ptr, size_t *out_len, const char *path) { - int fd = open_eintr(path, O_RDONLY); - if (fd < 0) { - return 0; - } - - static const size_t kReadSize = 1024; - int ret = 0; - size_t cap = kReadSize, len = 0; - char *buf = OPENSSL_malloc(cap); - if (buf == NULL) { - goto err; - } - - for (;;) { - if (cap - len < kReadSize) { - size_t new_cap = cap * 2; - if (new_cap < cap) { - goto err; - } - char *new_buf = OPENSSL_realloc(buf, new_cap); - if (new_buf == NULL) { - goto err; - } - buf = new_buf; - cap = new_cap; - } - - ssize_t bytes_read = read_eintr(fd, buf + len, kReadSize); - if (bytes_read < 0) { - goto err; - } - if (bytes_read == 0) { - break; - } - len += bytes_read; - } - - *out_ptr = buf; - *out_len = len; - ret = 1; - buf = NULL; - -err: - OPENSSL_free(buf); - close(fd); - return ret; -} - -// getauxval_proc behaves like |getauxval| but reads from /proc/self/auxv. -static unsigned long getauxval_proc(unsigned long type) { - int fd = open_eintr("/proc/self/auxv", O_RDONLY); - if (fd < 0) { - return 0; - } - - struct { - unsigned long tag; - unsigned long value; - } entry; - - for (;;) { - if (!read_full(fd, &entry, sizeof(entry)) || - (entry.tag == 0 && entry.value == 0)) { - break; - } - if (entry.tag == type) { - close(fd); - return entry.value; - } - } - close(fd); - return 0; -} - -extern uint32_t OPENSSL_armcap_P; - -static int g_has_broken_neon, g_needs_hwcap2_workaround; - -void OPENSSL_cpuid_setup(void) { - // We ignore the return value of |read_file| and proceed with an empty - // /proc/cpuinfo on error. If |getauxval| works, we will still detect - // capabilities. There may be a false positive due to - // |crypto_cpuinfo_has_broken_neon|, but this is now rare. 
- char *cpuinfo_data = NULL; - size_t cpuinfo_len = 0; - read_file(&cpuinfo_data, &cpuinfo_len, "/proc/cpuinfo"); - STRING_PIECE cpuinfo; - cpuinfo.data = cpuinfo_data; - cpuinfo.len = cpuinfo_len; - - // |getauxval| is not available on Android until API level 20. If it is - // unavailable, read from /proc/self/auxv as a fallback. This is unreadable - // on some versions of Android, so further fall back to /proc/cpuinfo. - // - // See - // https://android.googlesource.com/platform/ndk/+/882ac8f3392858991a0e1af33b4b7387ec856bd2 - // and b/13679666 (Google-internal) for details. - unsigned long hwcap = 0; - if (getauxval != NULL) { - hwcap = getauxval(AT_HWCAP); - } - if (hwcap == 0) { - hwcap = getauxval_proc(AT_HWCAP); - } - if (hwcap == 0) { - hwcap = crypto_get_arm_hwcap_from_cpuinfo(&cpuinfo); - } - - // Clear NEON support if known broken. Note, if NEON is available statically, - // the non-NEON code is dropped and this workaround is a no-op. - // - // TODO(davidben): The Android NDK now builds with NEON statically available - // by default. Cronet still has some consumers that support NEON-less devices - // (b/150371744). Get metrics on whether they still see this CPU and, if not, - // remove this check entirely. - g_has_broken_neon = crypto_cpuinfo_has_broken_neon(&cpuinfo); - if (g_has_broken_neon) { - hwcap &= ~HWCAP_NEON; - } - - // Matching OpenSSL, only report other features if NEON is present. - if (hwcap & HWCAP_NEON) { - OPENSSL_armcap_P |= ARMV7_NEON; - - // Some ARMv8 Android devices don't expose AT_HWCAP2. Fall back to - // /proc/cpuinfo. See https://crbug.com/boringssl/46. As of February 2021, - // this is now rare (see Chrome's Net.NeedsHWCAP2Workaround metric), but AES - // and PMULL extensions are very useful, so we still carry the workaround - // for now. 
- unsigned long hwcap2 = 0; - if (getauxval != NULL) { - hwcap2 = getauxval(AT_HWCAP2); - } - if (hwcap2 == 0) { - hwcap2 = crypto_get_arm_hwcap2_from_cpuinfo(&cpuinfo); - g_needs_hwcap2_workaround = hwcap2 != 0; - } - - if (hwcap2 & HWCAP2_AES) { - OPENSSL_armcap_P |= ARMV8_AES; - } - if (hwcap2 & HWCAP2_PMULL) { - OPENSSL_armcap_P |= ARMV8_PMULL; - } - if (hwcap2 & HWCAP2_SHA1) { - OPENSSL_armcap_P |= ARMV8_SHA1; - } - if (hwcap2 & HWCAP2_SHA2) { - OPENSSL_armcap_P |= ARMV8_SHA256; - } - } - - OPENSSL_free(cpuinfo_data); -} - -int CRYPTO_has_broken_NEON(void) { return g_has_broken_neon; } - -int CRYPTO_needs_hwcap2_workaround(void) { return g_needs_hwcap2_workaround; } - -#endif // OPENSSL_ARM && OPENSSL_LINUX && !OPENSSL_STATIC_ARMCAP diff --git a/third_party/boringssl/src/crypto/cpu_arm_linux.cc b/third_party/boringssl/src/crypto/cpu_arm_linux.cc new file mode 100644 index 00000000..d8ea98a6 --- /dev/null +++ b/third_party/boringssl/src/crypto/cpu_arm_linux.cc @@ -0,0 +1,173 @@ +// Copyright 2016 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "internal.h" + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) && \ + defined(OPENSSL_LINUX) && !defined(OPENSSL_STATIC_ARMCAP) +#include +#include +#include +#include +#include + +#include + +#include + +#include "cpu_arm_linux.h" + +using namespace bssl; + +static int open_eintr(const char *path, int flags) { + int ret; + do { + ret = open(path, flags); + } while (ret < 0 && errno == EINTR); + return ret; +} + +static ssize_t read_eintr(int fd, void *out, size_t len) { + ssize_t ret; + do { + ret = read(fd, out, len); + } while (ret < 0 && errno == EINTR); + return ret; +} + +// read_file opens |path| and reads until end-of-file. On success, it returns +// one and sets |*out_ptr| and |*out_len| to a newly-allocated buffer with the +// contents. Otherwise, it returns zero. +static int read_file(char **out_ptr, size_t *out_len, const char *path) { + int fd = open_eintr(path, O_RDONLY); + if (fd < 0) { + return 0; + } + + static const size_t kReadSize = 1024; + int ret = 0; + size_t cap = kReadSize, len = 0; + char *buf = reinterpret_cast(OPENSSL_malloc(cap)); + if (buf == nullptr) { + goto err; + } + + for (;;) { + if (cap - len < kReadSize) { + size_t new_cap = cap * 2; + if (new_cap < cap) { + goto err; + } + char *new_buf = reinterpret_cast(OPENSSL_realloc(buf, new_cap)); + if (new_buf == nullptr) { + goto err; + } + buf = new_buf; + cap = new_cap; + } + + ssize_t bytes_read = read_eintr(fd, buf + len, kReadSize); + if (bytes_read < 0) { + goto err; + } + if (bytes_read == 0) { + break; + } + len += bytes_read; + } + + *out_ptr = buf; + *out_len = len; + ret = 1; + buf = nullptr; + +err: + OPENSSL_free(buf); + close(fd); + return ret; +} + +static int g_needs_hwcap2_workaround; + +void bssl::OPENSSL_cpuid_setup() { + // Matching OpenSSL, only report other features if NEON is present. 
+ unsigned long hwcap = getauxval(AT_HWCAP); + if (hwcap & CRYPTO_HWCAP_NEON) { +#if defined(HWCAP_ARM_NEON) + static_assert(HWCAP_ARM_NEON == CRYPTO_HWCAP_NEON, + "CRYPTO_HWCAP values must match Linux"); +#endif + OPENSSL_armcap_P |= ARMV7_NEON; + + // Some ARMv8 Android devices don't expose AT_HWCAP2. Fall back to + // /proc/cpuinfo. See https://crbug.com/40644934. The fix was added to + // Android CTS in N, so, after Net.NeedsHWCAP2Workaround confirms this, we + // should be able to disable this when __ANDROID_MIN_SDK_VERSION__ is high + // enough. (It may not be worth carrying the workaround at all at that + // point. Then again, AES and PMULL extensions are crucial for performance + // when available.) + unsigned long hwcap2 = getauxval(AT_HWCAP2); + if (hwcap2 == 0) { + char *cpuinfo_data = nullptr; + size_t cpuinfo_len = 0; + if (read_file(&cpuinfo_data, &cpuinfo_len, "/proc/cpuinfo")) { + hwcap2 = armcap::GetHWCAP2FromCpuinfo( + std::string_view(cpuinfo_data, cpuinfo_len)); + g_needs_hwcap2_workaround = hwcap2 != 0; + OPENSSL_free(cpuinfo_data); + } + } + + // HWCAP2_* values, without the "CRYPTO_" prefix, are exposed through + // in some versions of glibc(>= 2.41). Assert that we don't + // diverge from those values. 
+ if (hwcap2 & CRYPTO_HWCAP2_AES) { +#if defined(HWCAP2_AES) + static_assert(HWCAP2_AES == CRYPTO_HWCAP2_AES, + "CRYPTO_HWCAP2 values must match Linux"); +#endif + OPENSSL_armcap_P |= ARMV8_AES; + } + if (hwcap2 & CRYPTO_HWCAP2_PMULL) { +#if defined(HWCAP2_PMULL) + static_assert(HWCAP2_PMULL == CRYPTO_HWCAP2_PMULL, + "CRYPTO_HWCAP2 values must match Linux"); +#endif + OPENSSL_armcap_P |= ARMV8_PMULL; + } + if (hwcap2 & CRYPTO_HWCAP2_SHA1) { +#if defined(HWCAP2_SHA1) + static_assert(HWCAP2_SHA1 == CRYPTO_HWCAP2_SHA1, + "CRYPTO_HWCAP2 values must match Linux"); +#endif + OPENSSL_armcap_P |= ARMV8_SHA1; + } + if (hwcap2 & CRYPTO_HWCAP2_SHA2) { +#if defined(HWCAP2_SHA2) + static_assert(HWCAP2_SHA2 == CRYPTO_HWCAP2_SHA2, + "CRYPTO_HWCAP2 values must match Linux"); +#endif + OPENSSL_armcap_P |= ARMV8_SHA256; + } + } +} + +int CRYPTO_has_broken_NEON() { return 0; } + +int CRYPTO_needs_hwcap2_workaround() { + OPENSSL_init_cpuid(); + return g_needs_hwcap2_workaround; +} + +#endif // OPENSSL_ARM && OPENSSL_LINUX && !OPENSSL_STATIC_ARMCAP diff --git a/third_party/boringssl/src/crypto/cpu_arm_linux.h b/third_party/boringssl/src/crypto/cpu_arm_linux.h index e326285f..b01ee541 100644 --- a/third_party/boringssl/src/crypto/cpu_arm_linux.h +++ b/third_party/boringssl/src/crypto/cpu_arm_linux.h @@ -1,201 +1,141 @@ -/* Copyright (c) 2018, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ +// Copyright 2018 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. #ifndef OPENSSL_HEADER_CRYPTO_CPU_ARM_LINUX_H #define OPENSSL_HEADER_CRYPTO_CPU_ARM_LINUX_H #include -#include +#include + +#include #include "internal.h" -#if defined(__cplusplus) -extern "C" { -#endif +BSSL_NAMESPACE_BEGIN +namespace armcap { // The cpuinfo parser lives in a header file so it may be accessible from // cross-platform fuzzers without adding code to those platforms normally. -#define HWCAP_NEON (1 << 12) +#define CRYPTO_HWCAP_NEON (1 << 12) // See /usr/include/asm/hwcap.h on an ARM installation for the source of // these values. 
-#define HWCAP2_AES (1 << 0) -#define HWCAP2_PMULL (1 << 1) -#define HWCAP2_SHA1 (1 << 2) -#define HWCAP2_SHA2 (1 << 3) - -typedef struct { - const char *data; - size_t len; -} STRING_PIECE; - -static int STRING_PIECE_equals(const STRING_PIECE *a, const char *b) { - size_t b_len = strlen(b); - return a->len == b_len && OPENSSL_memcmp(a->data, b, b_len) == 0; -} - -// STRING_PIECE_split finds the first occurence of |sep| in |in| and, if found, -// sets |*out_left| and |*out_right| to |in| split before and after it. It -// returns one if |sep| was found and zero otherwise. -static int STRING_PIECE_split(STRING_PIECE *out_left, STRING_PIECE *out_right, - const STRING_PIECE *in, char sep) { - const char *p = (const char *)OPENSSL_memchr(in->data, sep, in->len); - if (p == NULL) { - return 0; +// We add the prefix "CRYPTO_" to the definitions so as not to collide with +// some versions of glibc (>= 2.41) that expose them through . +#define CRYPTO_HWCAP2_AES (1 << 0) +#define CRYPTO_HWCAP2_PMULL (1 << 1) +#define CRYPTO_HWCAP2_SHA1 (1 << 2) +#define CRYPTO_HWCAP2_SHA2 (1 << 3) + +// SplitStringView finds the first occurrence of |sep| in |in| and, if found, +// sets |*out_left| and |*out_right| to |in| split before and after |sep|, and +// returns true. If not found, it returns false. +inline bool SplitStringView(std::string_view *out_left, + std::string_view *out_right, std::string_view in, + char sep) { + auto pos = in.find(sep); + if (pos == std::string_view::npos) { + return false; } - // |out_left| or |out_right| may alias |in|, so make a copy. 
- STRING_PIECE in_copy = *in; - out_left->data = in_copy.data; - out_left->len = p - in_copy.data; - out_right->data = in_copy.data + out_left->len + 1; - out_right->len = in_copy.len - out_left->len - 1; - return 1; + *out_left = in.substr(0, pos); + *out_right = in.substr(pos + 1); + return true; } -// STRING_PIECE_get_delimited reads a |sep|-delimited entry from |s|, writing it -// to |out| and updating |s| to point beyond it. It returns one on success and -// zero if |s| is empty. If |s| is has no copies of |sep| and is non-empty, it -// reads the entire string to |out|. -static int STRING_PIECE_get_delimited(STRING_PIECE *s, STRING_PIECE *out, char sep) { - if (s->len == 0) { - return 0; +// GetDelimited reads a |sep|-delimited entry from |s|, writing it to |out| and +// updating |s| to point beyond it. It returns true on success and false if |s| +// is empty. If |s| has no copies of |sep| and is non-empty, it reads the entire +// string to |out|. +inline bool GetDelimited(std::string_view *s, std::string_view *out, char sep) { + if (s->empty()) { + return false; } - if (!STRING_PIECE_split(out, s, s, sep)) { + if (!SplitStringView(out, s, *s, sep)) { // |s| had no instances of |sep|. Return the entire string. *out = *s; - s->data += s->len; - s->len = 0; + *s = std::string_view(); } - return 1; + return true; } -// STRING_PIECE_trim removes leading and trailing whitespace from |s|. -static void STRING_PIECE_trim(STRING_PIECE *s) { - while (s->len != 0 && (s->data[0] == ' ' || s->data[0] == '\t')) { - s->data++; - s->len--; - } - while (s->len != 0 && - (s->data[s->len - 1] == ' ' || s->data[s->len - 1] == '\t')) { - s->len--; +// TrimStringView removes leading and trailing whitespace from |s|. 
+inline std::string_view TrimStringView(std::string_view s) { + size_t pos = s.find_first_not_of(" \t"); + if (pos == std::string_view::npos) { + return {}; } + s = s.substr(pos); + pos = s.find_last_not_of(" \t"); + assert(pos != std::string_view::npos); + return s.substr(0, pos + 1); } -// extract_cpuinfo_field extracts a /proc/cpuinfo field named |field| from -// |in|. If found, it sets |*out| to the value and returns one. Otherwise, it -// returns zero. -static int extract_cpuinfo_field(STRING_PIECE *out, const STRING_PIECE *in, - const char *field) { +// ExtractCpuinfoField extracts a /proc/cpuinfo field named |field| from |in|. +// If found, it returns the value. Otherwise, it returns the empty string. +inline std::string_view ExtractCpuinfoField(std::string_view in, + std::string_view field) { // Process |in| one line at a time. - STRING_PIECE remaining = *in, line; - while (STRING_PIECE_get_delimited(&remaining, &line, '\n')) { - STRING_PIECE key, value; - if (!STRING_PIECE_split(&key, &value, &line, ':')) { + std::string_view line; + while (GetDelimited(&in, &line, '\n')) { + std::string_view key, value; + if (!SplitStringView(&key, &value, line, ':')) { continue; } - STRING_PIECE_trim(&key); - if (STRING_PIECE_equals(&key, field)) { - STRING_PIECE_trim(&value); - *out = value; - return 1; + if (TrimStringView(key) == field) { + return TrimStringView(value); } } - return 0; + return {}; } -static int cpuinfo_field_equals(const STRING_PIECE *cpuinfo, const char *field, - const char *value) { - STRING_PIECE extracted; - return extract_cpuinfo_field(&extracted, cpuinfo, field) && - STRING_PIECE_equals(&extracted, value); -} - -// has_list_item treats |list| as a space-separated list of items and returns -// one if |item| is contained in |list| and zero otherwise. 
-static int has_list_item(const STRING_PIECE *list, const char *item) { - STRING_PIECE remaining = *list, feature; - while (STRING_PIECE_get_delimited(&remaining, &feature, ' ')) { - if (STRING_PIECE_equals(&feature, item)) { - return 1; +// HasListItem treats |list| as a space-separated list of items and returns +// whether |item| is contained in |list|. +inline bool HasListItem(std::string_view list, std::string_view item) { + std::string_view feature; + while (GetDelimited(&list, &feature, ' ')) { + if (feature == item) { + return true; } } - return 0; -} - -// crypto_get_arm_hwcap_from_cpuinfo returns an equivalent ARM |AT_HWCAP| value -// from |cpuinfo|. -static unsigned long crypto_get_arm_hwcap_from_cpuinfo( - const STRING_PIECE *cpuinfo) { - if (cpuinfo_field_equals(cpuinfo, "CPU architecture", "8")) { - // This is a 32-bit ARM binary running on a 64-bit kernel. NEON is always - // available on ARMv8. Linux omits required features, so reading the - // "Features" line does not work. (For simplicity, use strict equality. We - // assume everything running on future ARM architectures will have a - // working |getauxval|.) - return HWCAP_NEON; - } - - STRING_PIECE features; - if (extract_cpuinfo_field(&features, cpuinfo, "Features") && - has_list_item(&features, "neon")) { - return HWCAP_NEON; - } - return 0; + return false; } -// crypto_get_arm_hwcap2_from_cpuinfo returns an equivalent ARM |AT_HWCAP2| -// value from |cpuinfo|. -static unsigned long crypto_get_arm_hwcap2_from_cpuinfo( - const STRING_PIECE *cpuinfo) { - STRING_PIECE features; - if (!extract_cpuinfo_field(&features, cpuinfo, "Features")) { - return 0; - } - +// GetHWCAP2FromCpuinfo returns an equivalent ARM |AT_HWCAP2| value from +// |cpuinfo|. 
+inline unsigned long GetHWCAP2FromCpuinfo(std::string_view cpuinfo) { + std::string_view features = ExtractCpuinfoField(cpuinfo, "Features"); unsigned long ret = 0; - if (has_list_item(&features, "aes")) { - ret |= HWCAP2_AES; + if (HasListItem(features, "aes")) { + ret |= CRYPTO_HWCAP2_AES; } - if (has_list_item(&features, "pmull")) { - ret |= HWCAP2_PMULL; + if (HasListItem(features, "pmull")) { + ret |= CRYPTO_HWCAP2_PMULL; } - if (has_list_item(&features, "sha1")) { - ret |= HWCAP2_SHA1; + if (HasListItem(features, "sha1")) { + ret |= CRYPTO_HWCAP2_SHA1; } - if (has_list_item(&features, "sha2")) { - ret |= HWCAP2_SHA2; + if (HasListItem(features, "sha2")) { + ret |= CRYPTO_HWCAP2_SHA2; } return ret; } -// crypto_cpuinfo_has_broken_neon returns one if |cpuinfo| matches a CPU known -// to have broken NEON unit and zero otherwise. See https://crbug.com/341598. -static int crypto_cpuinfo_has_broken_neon(const STRING_PIECE *cpuinfo) { - return cpuinfo_field_equals(cpuinfo, "CPU implementer", "0x51") && - cpuinfo_field_equals(cpuinfo, "CPU architecture", "7") && - cpuinfo_field_equals(cpuinfo, "CPU variant", "0x1") && - cpuinfo_field_equals(cpuinfo, "CPU part", "0x04d") && - cpuinfo_field_equals(cpuinfo, "CPU revision", "0"); -} - - -#if defined(__cplusplus) -} // extern C -#endif +} // namespace armcap +BSSL_NAMESPACE_END #endif // OPENSSL_HEADER_CRYPTO_CPU_ARM_LINUX_H diff --git a/third_party/boringssl/src/crypto/cpu_intel.c b/third_party/boringssl/src/crypto/cpu_intel.c deleted file mode 100644 index fa96a7f6..00000000 --- a/third_party/boringssl/src/crypto/cpu_intel.c +++ /dev/null @@ -1,290 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. 
- * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. 
If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#if !defined(OPENSSL_NO_ASM) && (defined(OPENSSL_X86) || defined(OPENSSL_X86_64)) - -#include -#include -#include -#include - -#if defined(_MSC_VER) -OPENSSL_MSVC_PRAGMA(warning(push, 3)) -#include -#include -OPENSSL_MSVC_PRAGMA(warning(pop)) -#endif - -#include "internal.h" - - -// OPENSSL_cpuid runs the cpuid instruction. |leaf| is passed in as EAX and ECX -// is set to zero. It writes EAX, EBX, ECX, and EDX to |*out_eax| through -// |*out_edx|. 
-static void OPENSSL_cpuid(uint32_t *out_eax, uint32_t *out_ebx, - uint32_t *out_ecx, uint32_t *out_edx, uint32_t leaf) { -#if defined(_MSC_VER) - int tmp[4]; - __cpuid(tmp, (int)leaf); - *out_eax = (uint32_t)tmp[0]; - *out_ebx = (uint32_t)tmp[1]; - *out_ecx = (uint32_t)tmp[2]; - *out_edx = (uint32_t)tmp[3]; -#elif defined(__pic__) && defined(OPENSSL_32_BIT) - // Inline assembly may not clobber the PIC register. For 32-bit, this is EBX. - // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=47602. - __asm__ volatile ( - "xor %%ecx, %%ecx\n" - "mov %%ebx, %%edi\n" - "cpuid\n" - "xchg %%edi, %%ebx\n" - : "=a"(*out_eax), "=D"(*out_ebx), "=c"(*out_ecx), "=d"(*out_edx) - : "a"(leaf) - ); -#else - __asm__ volatile ( - "xor %%ecx, %%ecx\n" - "cpuid\n" - : "=a"(*out_eax), "=b"(*out_ebx), "=c"(*out_ecx), "=d"(*out_edx) - : "a"(leaf) - ); -#endif -} - -// OPENSSL_xgetbv returns the value of an Intel Extended Control Register (XCR). -// Currently only XCR0 is defined by Intel so |xcr| should always be zero. -static uint64_t OPENSSL_xgetbv(uint32_t xcr) { -#if defined(_MSC_VER) - return (uint64_t)_xgetbv(xcr); -#else - uint32_t eax, edx; - __asm__ volatile ("xgetbv" : "=a"(eax), "=d"(edx) : "c"(xcr)); - return (((uint64_t)edx) << 32) | eax; -#endif -} - -// handle_cpu_env applies the value from |in| to the CPUID values in |out[0]| -// and |out[1]|. See the comment in |OPENSSL_cpuid_setup| about this. 
-static void handle_cpu_env(uint32_t *out, const char *in) { - const int invert = in[0] == '~'; - const int or = in[0] == '|'; - const int skip_first_byte = invert || or; - const int hex = in[skip_first_byte] == '0' && in[skip_first_byte+1] == 'x'; - - int sscanf_result; - uint64_t v; - if (hex) { - sscanf_result = sscanf(in + invert + 2, "%" PRIx64, &v); - } else { - sscanf_result = sscanf(in + invert, "%" PRIu64, &v); - } - - if (!sscanf_result) { - return; - } - - if (invert) { - out[0] &= ~v; - out[1] &= ~(v >> 32); - } else if (or) { - out[0] |= v; - out[1] |= (v >> 32); - } else { - out[0] = v; - out[1] = v >> 32; - } -} - -void OPENSSL_cpuid_setup(void) { - // Determine the vendor and maximum input value. - uint32_t eax, ebx, ecx, edx; - OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 0); - - uint32_t num_ids = eax; - - int is_intel = ebx == 0x756e6547 /* Genu */ && - edx == 0x49656e69 /* ineI */ && - ecx == 0x6c65746e /* ntel */; - int is_amd = ebx == 0x68747541 /* Auth */ && - edx == 0x69746e65 /* enti */ && - ecx == 0x444d4163 /* cAMD */; - - uint32_t extended_features[2] = {0}; - if (num_ids >= 7) { - OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 7); - extended_features[0] = ebx; - extended_features[1] = ecx; - } - - OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 1); - - if (is_amd) { - // See https://www.amd.com/system/files/TechDocs/25481.pdf, page 10. - const uint32_t base_family = (eax >> 8) & 15; - const uint32_t base_model = (eax >> 4) & 15; - - uint32_t family = base_family; - uint32_t model = base_model; - if (base_family == 0xf) { - const uint32_t ext_family = (eax >> 20) & 255; - family += ext_family; - const uint32_t ext_model = (eax >> 16) & 15; - model |= ext_model << 4; - } - - if (family < 0x17 || (family == 0x17 && 0x70 <= model && model <= 0x7f)) { - // Disable RDRAND on AMD families before 0x17 (Zen) due to reported - // failures after suspend. 
- // https://bugzilla.redhat.com/show_bug.cgi?id=1150286 - // Also disable for family 0x17, models 0x70–0x7f, due to possible RDRAND - // failures there too. - ecx &= ~(1u << 30); - } - } - - // Force the hyper-threading bit so that the more conservative path is always - // chosen. - edx |= 1u << 28; - - // Reserved bit #20 was historically repurposed to control the in-memory - // representation of RC4 state. Always set it to zero. - edx &= ~(1u << 20); - - // Reserved bit #30 is repurposed to signal an Intel CPU. - if (is_intel) { - edx |= (1u << 30); - - // Clear the XSAVE bit on Knights Landing to mimic Silvermont. This enables - // some Silvermont-specific codepaths which perform better. See OpenSSL - // commit 64d92d74985ebb3d0be58a9718f9e080a14a8e7f. - if ((eax & 0x0fff0ff0) == 0x00050670 /* Knights Landing */ || - (eax & 0x0fff0ff0) == 0x00080650 /* Knights Mill (per SDE) */) { - ecx &= ~(1u << 26); - } - } else { - edx &= ~(1u << 30); - } - - // The SDBG bit is repurposed to denote AMD XOP support. Don't ever use AMD - // XOP code paths. - ecx &= ~(1u << 11); - - uint64_t xcr0 = 0; - if (ecx & (1u << 27)) { - // XCR0 may only be queried if the OSXSAVE bit is set. - xcr0 = OPENSSL_xgetbv(0); - } - // See Intel manual, volume 1, section 14.3. - if ((xcr0 & 6) != 6) { - // YMM registers cannot be used. - ecx &= ~(1u << 28); // AVX - ecx &= ~(1u << 12); // FMA - ecx &= ~(1u << 11); // AMD XOP - // Clear AVX2 and AVX512* bits. - // - // TODO(davidben): Should bits 17 and 26-28 also be cleared? Upstream - // doesn't clear those. - extended_features[0] &= - ~((1u << 5) | (1u << 16) | (1u << 21) | (1u << 30) | (1u << 31)); - } - // See Intel manual, volume 1, section 15.2. - if ((xcr0 & 0xe6) != 0xe6) { - // Clear AVX512F. Note we don't touch other AVX512 extensions because they - // can be used with YMM. - extended_features[0] &= ~(1u << 16); - } - - // Disable ADX instructions on Knights Landing. See OpenSSL commit - // 64d92d74985ebb3d0be58a9718f9e080a14a8e7f. 
- if ((ecx & (1u << 26)) == 0) { - extended_features[0] &= ~(1u << 19); - } - - OPENSSL_ia32cap_P[0] = edx; - OPENSSL_ia32cap_P[1] = ecx; - OPENSSL_ia32cap_P[2] = extended_features[0]; - OPENSSL_ia32cap_P[3] = extended_features[1]; - - const char *env1, *env2; - env1 = getenv("OPENSSL_ia32cap"); - if (env1 == NULL) { - return; - } - - // OPENSSL_ia32cap can contain zero, one or two values, separated with a ':'. - // Each value is a 64-bit, unsigned value which may start with "0x" to - // indicate a hex value. Prior to the 64-bit value, a '~' or '|' may be given. - // - // If the '~' prefix is present: - // the value is inverted and ANDed with the probed CPUID result - // If the '|' prefix is present: - // the value is ORed with the probed CPUID result - // Otherwise: - // the value is taken as the result of the CPUID - // - // The first value determines OPENSSL_ia32cap_P[0] and [1]. The second [2] - // and [3]. - - handle_cpu_env(&OPENSSL_ia32cap_P[0], env1); - env2 = strchr(env1, ':'); - if (env2 != NULL) { - handle_cpu_env(&OPENSSL_ia32cap_P[2], env2 + 1); - } -} - -#endif // !OPENSSL_NO_ASM && (OPENSSL_X86 || OPENSSL_X86_64) diff --git a/third_party/boringssl/src/crypto/cpu_intel.cc b/third_party/boringssl/src/crypto/cpu_intel.cc new file mode 100644 index 00000000..6a406a79 --- /dev/null +++ b/third_party/boringssl/src/crypto/cpu_intel.cc @@ -0,0 +1,287 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#if !defined(OPENSSL_NO_ASM) && \ + (defined(OPENSSL_X86) || defined(OPENSSL_X86_64)) + +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) +#include +#include +#endif + +#include "internal.h" + + +using namespace bssl; + +// OPENSSL_cpuid runs the cpuid instruction. |leaf| is passed in as EAX and ECX +// is set to zero. It writes EAX, EBX, ECX, and EDX to |*out_eax| through +// |*out_edx|. +static void OPENSSL_cpuid(uint32_t *out_eax, uint32_t *out_ebx, + uint32_t *out_ecx, uint32_t *out_edx, uint32_t leaf) { +#if defined(_MSC_VER) + int tmp[4]; + __cpuid(tmp, (int)leaf); + *out_eax = (uint32_t)tmp[0]; + *out_ebx = (uint32_t)tmp[1]; + *out_ecx = (uint32_t)tmp[2]; + *out_edx = (uint32_t)tmp[3]; +#elif defined(__pic__) && defined(OPENSSL_32_BIT) + // Inline assembly may not clobber the PIC register. For 32-bit, this is EBX. + // See https://gcc.gnu.org/bugzilla/show_bug.cgi?id=47602. + __asm__ volatile( + "xor %%ecx, %%ecx\n" + "mov %%ebx, %%edi\n" + "cpuid\n" + "xchg %%edi, %%ebx\n" + : "=a"(*out_eax), "=D"(*out_ebx), "=c"(*out_ecx), "=d"(*out_edx) + : "a"(leaf)); +#else + __asm__ volatile( + "xor %%ecx, %%ecx\n" + "cpuid\n" + : "=a"(*out_eax), "=b"(*out_ebx), "=c"(*out_ecx), "=d"(*out_edx) + : "a"(leaf)); +#endif +} + +// OPENSSL_xgetbv returns the value of an Intel Extended Control Register (XCR). +// Currently only XCR0 is defined by Intel so |xcr| should always be zero. +static uint64_t OPENSSL_xgetbv(uint32_t xcr) { +#if defined(_MSC_VER) + return (uint64_t)_xgetbv(xcr); +#else + uint32_t eax, edx; + __asm__ volatile("xgetbv" : "=a"(eax), "=d"(edx) : "c"(xcr)); + return (((uint64_t)edx) << 32) | eax; +#endif +} + +static bool os_supports_avx512(uint64_t xcr0) { +#if defined(__APPLE__) + // The Darwin kernel had a bug where it could corrupt the opmask registers. 
+ // See + // https://community.intel.com/t5/Software-Tuning-Performance/MacOS-Darwin-kernel-bug-clobbers-AVX-512-opmask-register-state/m-p/1327259 + // Darwin also does not initially set the XCR0 bits for AVX512, but they are + // set if the thread tries to use AVX512 anyway. Thus, to safely and + // consistently use AVX512 on macOS we'd need to check the kernel version as + // well as detect AVX512 support using a macOS-specific method. We don't + // bother with this, especially given Apple's transition to arm64. + return false; +#else + return (xcr0 & 0xe6) == 0xe6; +#endif +} + +// handle_cpu_env applies the value from |in| to the CPUID values in |out[0]| +// and |out[1]|. See the comment in |OPENSSL_cpuid_setup| about this. The +// |is_last| argument specifies whether the value is at the end of the string. +// Otherwise it may be followed by a colon. +static void handle_cpu_env(uint32_t out[2], const char *in, bool is_last) { + const int invert_op = in[0] == '~'; + const int or_op = in[0] == '|'; + const int skip_first_byte = invert_op || or_op; + const int hex = in[skip_first_byte] == '0' && in[skip_first_byte + 1] == 'x'; + const int base = hex ? 16 : 10; + + const char *start = in + skip_first_byte; + char *end; + errno = 0; + // We need to parse 64-bit values with `strtoull`. + static_assert(sizeof(unsigned long long) == sizeof(uint64_t)); + unsigned long long v = strtoull(start, &end, base); + + if (end == start || (*end != '\0' && (is_last || *end != ':')) || + (v == ULLONG_MAX && errno == ERANGE)) { + return; + } + + if (invert_op) { + out[0] &= ~v; + out[1] &= ~(v >> 32); + } else if (or_op) { + out[0] |= v; + out[1] |= (v >> 32); + } else { + out[0] = v; + out[1] = v >> 32; + } +} + +void bssl::OPENSSL_adjust_ia32cap(uint32_t cap[4], const char *env) { + // OPENSSL_ia32cap can contain zero, one or two values, separated with a ':'. + // Each value is a 64-bit, unsigned value which may start with "0x" to + // indicate a hex value. 
Prior to the 64-bit value, a '~' or '|' may be given. + // + // If the '~' prefix is present: + // the value is inverted and ANDed with the probed CPUID result + // If the '|' prefix is present: + // the value is ORed with the probed CPUID result + // Otherwise: + // the value is taken as the result of the CPUID + // + // The first value determines OPENSSL_ia32cap_P[0] and [1]. The second [2] + // and [3]. + handle_cpu_env(cap, env, /*is_last=*/false); + env = strchr(env, ':'); + if (env != nullptr) { + handle_cpu_env(cap + 2, env + 1, /*is_last=*/true); + } +} + +void bssl::OPENSSL_cpuid_setup() { + // Determine the vendor and maximum input value. + uint32_t eax, ebx, ecx, edx; + OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 0); + + uint32_t num_ids = eax; + + int is_intel = ebx == 0x756e6547 /* Genu */ && // + edx == 0x49656e69 /* ineI */ && // + ecx == 0x6c65746e /* ntel */; + int is_amd = ebx == 0x68747541 /* Auth */ && // + edx == 0x69746e65 /* enti */ && // + ecx == 0x444d4163 /* cAMD */; + + uint32_t extended_features[2] = {0}; + if (num_ids >= 7) { + OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 7); + extended_features[0] = ebx; + extended_features[1] = ecx; + } + + OPENSSL_cpuid(&eax, &ebx, &ecx, &edx, 1); + + const uint32_t base_family = (eax >> 8) & 15; + const uint32_t base_model = (eax >> 4) & 15; + + uint32_t family = base_family; + uint32_t model = base_model; + if (base_family == 15) { + const uint32_t ext_family = (eax >> 20) & 255; + family += ext_family; + } + if (base_family == 6 || base_family == 15) { + const uint32_t ext_model = (eax >> 16) & 15; + model |= ext_model << 4; + } + + if (is_amd) { + if (family < 0x17 || (family == 0x17 && 0x70 <= model && model <= 0x7f)) { + // Disable RDRAND on AMD families before 0x17 (Zen) due to reported + // failures after suspend. + // https://bugzilla.redhat.com/show_bug.cgi?id=1150286 + // Also disable for family 0x17, models 0x70–0x7f, due to possible RDRAND + // failures there too. 
+ ecx &= ~(1u << 30); + } + } + + // Reserved bit #30 is repurposed to signal an Intel CPU. + if (is_intel) { + edx |= (1u << 30); + } else { + edx &= ~(1u << 30); + } + + uint64_t xcr0 = 0; + if (ecx & (1u << 27)) { + // XCR0 may only be queried if the OSXSAVE bit is set. + xcr0 = OPENSSL_xgetbv(0); + } + // See Intel manual, volume 1, section 14.3. + if ((xcr0 & 6) != 6) { + // YMM registers cannot be used. + ecx &= ~(1u << 28); // AVX + ecx &= ~(1u << 12); // FMA + ecx &= ~(1u << 11); // AMD XOP + extended_features[0] &= ~(1u << 5); // AVX2 + extended_features[1] &= ~(1u << 9); // VAES + extended_features[1] &= ~(1u << 10); // VPCLMULQDQ + } + // See Intel manual, volume 1, sections 15.2 ("Detection of AVX-512 Foundation + // Instructions") through 15.4 ("Detection of Intel AVX-512 Instruction Groups + // Operating at 256 and 128-bit Vector Lengths"). + if (!os_supports_avx512(xcr0)) { + // Without XCR0.111xx11x, no AVX512 feature can be used. This includes ZMM + // registers, masking, SIMD registers 16-31 (even if accessed as YMM or + // XMM), and EVEX-coded instructions (even on YMM or XMM). Even if only + // XCR0.ZMM_Hi256 is missing, it isn't valid to use AVX512 features on + // shorter vectors, since AVX512 ties everything to the availability of + // 512-bit vectors. See the above-mentioned sections of the Intel manual, + // which say that *all* these XCR0 bits must be checked even when just using + // 128-bit or 256-bit vectors, and also volume 2a section 2.7.11 ("#UD + // Equations for EVEX") which says that all EVEX-coded instructions raise an + // undefined-instruction exception if any of these XCR0 bits is zero. 
+ extended_features[0] &= ~(1u << 16); // AVX512F + extended_features[0] &= ~(1u << 17); // AVX512DQ + extended_features[0] &= ~(1u << 21); // AVX512IFMA + extended_features[0] &= ~(1u << 26); // AVX512PF + extended_features[0] &= ~(1u << 27); // AVX512ER + extended_features[0] &= ~(1u << 28); // AVX512CD + extended_features[0] &= ~(1u << 30); // AVX512BW + extended_features[0] &= ~(1u << 31); // AVX512VL + extended_features[1] &= ~(1u << 1); // AVX512VBMI + extended_features[1] &= ~(1u << 6); // AVX512VBMI2 + extended_features[1] &= ~(1u << 11); // AVX512VNNI + extended_features[1] &= ~(1u << 12); // AVX512BITALG + extended_features[1] &= ~(1u << 14); // AVX512VPOPCNTDQ + } + + // Repurpose the bit for the removed MPX feature to indicate when using zmm + // registers should be avoided even when they are supported. (When set, AVX512 + // features can still be used, but only using ymm or xmm registers.) Skylake + // suffered from severe downclocking when zmm registers were used, which + // affected unrelated code running on the system, making zmm registers not too + // useful outside of benchmarks. The situation improved significantly by Ice + // Lake, but a small amount of downclocking remained. (See + // https://lore.kernel.org/linux-crypto/e8ce1146-3952-6977-1d0e-a22758e58914@intel.com/) + // We take a conservative approach of not allowing zmm registers until after + // Ice Lake and Tiger Lake, i.e. until Sapphire Rapids on the server side. + // + // AMD CPUs, which support AVX512 starting with Zen 4, have not been reported + // to have any downclocking problem when zmm registers are used. 
+ if (is_intel && family == 6 && + (model == 85 || // Skylake, Cascade Lake, Cooper Lake (server) + model == 106 || // Ice Lake (server) + model == 108 || // Ice Lake (micro server) + model == 125 || // Ice Lake (client) + model == 126 || // Ice Lake (mobile) + model == 140 || // Tiger Lake (mobile) + model == 141)) { // Tiger Lake (client) + extended_features[0] |= 1u << 14; + } else { + extended_features[0] &= ~(1u << 14); + } + + OPENSSL_ia32cap_P[0] = edx; + OPENSSL_ia32cap_P[1] = ecx; + OPENSSL_ia32cap_P[2] = extended_features[0]; + OPENSSL_ia32cap_P[3] = extended_features[1]; + + const char *env = getenv("OPENSSL_ia32cap"); + if (env != nullptr) { + OPENSSL_adjust_ia32cap(OPENSSL_ia32cap_P, env); + } +} + +#endif // !OPENSSL_NO_ASM && (OPENSSL_X86 || OPENSSL_X86_64) diff --git a/third_party/boringssl/src/crypto/cpu_ppc64le.c b/third_party/boringssl/src/crypto/cpu_ppc64le.c deleted file mode 100644 index a802e37e..00000000 --- a/third_party/boringssl/src/crypto/cpu_ppc64le.c +++ /dev/null @@ -1,38 +0,0 @@ -/* Copyright (c) 2016, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ - -#include - -#if defined(OPENSSL_PPC64LE) - -#include - -#include "internal.h" - - -#if !defined(PPC_FEATURE2_HAS_VCRYPTO) -// PPC_FEATURE2_HAS_VCRYPTO was taken from section 4.1.2.3 of the “OpenPOWER -// ABI for Linux Supplement”. -#define PPC_FEATURE2_HAS_VCRYPTO 0x02000000 -#endif - -void OPENSSL_cpuid_setup(void) { - OPENSSL_ppc64le_hwcap2 = getauxval(AT_HWCAP2); -} - -int CRYPTO_is_PPC64LE_vcrypto_capable(void) { - return (OPENSSL_ppc64le_hwcap2 & PPC_FEATURE2_HAS_VCRYPTO) != 0; -} - -#endif // OPENSSL_PPC64LE diff --git a/third_party/boringssl/src/crypto/crypto.c b/third_party/boringssl/src/crypto/crypto.c deleted file mode 100644 index af7e560a..00000000 --- a/third_party/boringssl/src/crypto/crypto.c +++ /dev/null @@ -1,233 +0,0 @@ -/* Copyright (c) 2014, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ - -#include - -#include "fipsmodule/rand/fork_detect.h" -#include "fipsmodule/rand/internal.h" -#include "internal.h" - - -#if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_STATIC_ARMCAP) && \ - (defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || \ - defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64) || \ - defined(OPENSSL_PPC64LE)) -// x86, x86_64, the ARMs and ppc64le need to record the result of a -// cpuid/getauxval call for the asm to work correctly, unless compiled without -// asm code. -#define NEED_CPUID - -#else - -// Otherwise, don't emit a static initialiser. - -#if !defined(BORINGSSL_NO_STATIC_INITIALIZER) -#define BORINGSSL_NO_STATIC_INITIALIZER -#endif - -#endif // !NO_ASM && !STATIC_ARMCAP && - // (X86 || X86_64 || ARM || AARCH64 || PPC64LE) - - -// Our assembly does not use the GOT to reference symbols, which means -// references to visible symbols will often require a TEXTREL. This is -// undesirable, so all assembly-referenced symbols should be hidden. CPU -// capabilities are the only such symbols defined in C. Explicitly hide them, -// rather than rely on being built with -fvisibility=hidden. -#if defined(OPENSSL_WINDOWS) -#define HIDDEN -#else -#define HIDDEN __attribute__((visibility("hidden"))) -#endif - - -// The capability variables are defined in this file in order to work around a -// linker bug. When linking with a .a, if no symbols in a .o are referenced -// then the .o is discarded, even if it has constructor functions. -// -// This still means that any binaries that don't include some functionality -// that tests the capability values will still skip the constructor but, so -// far, the init constructor function only sets the capability variables. - -#if defined(BORINGSSL_DISPATCH_TEST) -// This value must be explicitly initialised to zero in order to work around a -// bug in libtool or the linker on OS X. -// -// If not initialised then it becomes a "common symbol". 
When put into an -// archive, linking on OS X will fail to resolve common symbols. By -// initialising it to zero, it becomes a "data symbol", which isn't so -// affected. -HIDDEN uint8_t BORINGSSL_function_hit[7] = {0}; -#endif - -#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64) - -// This value must be explicitly initialized to zero. See similar comment above. -HIDDEN uint32_t OPENSSL_ia32cap_P[4] = {0}; - -#elif defined(OPENSSL_PPC64LE) - -HIDDEN unsigned long OPENSSL_ppc64le_hwcap2 = 0; - -#elif defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64) - -#include - -#if defined(OPENSSL_STATIC_ARMCAP) - -// See ARM ACLE for the definitions of these macros. Note |__ARM_FEATURE_AES| -// covers both AES and PMULL and |__ARM_FEATURE_SHA2| covers SHA-1 and SHA-256. -// https://developer.arm.com/architectures/system-architectures/software-standards/acle -// https://github.com/ARM-software/acle/issues/152 -// -// TODO(davidben): Do we still need |OPENSSL_STATIC_ARMCAP_*| or are the -// standard flags and -march sufficient? -HIDDEN uint32_t OPENSSL_armcap_P = -#if defined(OPENSSL_STATIC_ARMCAP_NEON) || defined(__ARM_NEON) - ARMV7_NEON | -#endif -#if defined(OPENSSL_STATIC_ARMCAP_AES) || defined(__ARM_FEATURE_AES) - ARMV8_AES | -#endif -#if defined(OPENSSL_STATIC_ARMCAP_PMULL) || defined(__ARM_FEATURE_AES) - ARMV8_PMULL | -#endif -#if defined(OPENSSL_STATIC_ARMCAP_SHA1) || defined(__ARM_FEATURE_SHA2) - ARMV8_SHA1 | -#endif -#if defined(OPENSSL_STATIC_ARMCAP_SHA256) || defined(__ARM_FEATURE_SHA2) - ARMV8_SHA256 | -#endif -#if defined(__ARM_FEATURE_SHA512) - ARMV8_SHA512 | -#endif - 0; - -#else -HIDDEN uint32_t OPENSSL_armcap_P = 0; - -uint32_t *OPENSSL_get_armcap_pointer_for_test(void) { - return &OPENSSL_armcap_P; -} -#endif - -#endif - -#if defined(BORINGSSL_FIPS) -// In FIPS mode, the power-on self-test function calls |CRYPTO_library_init| -// because we have to ensure that CPUID detection occurs first. 
-#define BORINGSSL_NO_STATIC_INITIALIZER -#endif - -#if defined(OPENSSL_WINDOWS) && !defined(BORINGSSL_NO_STATIC_INITIALIZER) -#define OPENSSL_CDECL __cdecl -#else -#define OPENSSL_CDECL -#endif - -#if defined(BORINGSSL_NO_STATIC_INITIALIZER) -static CRYPTO_once_t once = CRYPTO_ONCE_INIT; -#elif defined(_MSC_VER) -#pragma section(".CRT$XCU", read) -static void __cdecl do_library_init(void); -__declspec(allocate(".CRT$XCU")) void(*library_init_constructor)(void) = - do_library_init; -#else -static void do_library_init(void) __attribute__ ((constructor)); -#endif - -// do_library_init is the actual initialization function. If -// BORINGSSL_NO_STATIC_INITIALIZER isn't defined, this is set as a static -// initializer. Otherwise, it is called by CRYPTO_library_init. -static void OPENSSL_CDECL do_library_init(void) { - // WARNING: this function may only configure the capability variables. See the - // note above about the linker bug. -#if defined(NEED_CPUID) - OPENSSL_cpuid_setup(); -#endif -} - -void CRYPTO_library_init(void) { - // TODO(davidben): It would be tidier if this build knob could be replaced - // with an internal lazy-init mechanism that would handle things correctly - // in-library. https://crbug.com/542879 -#if defined(BORINGSSL_NO_STATIC_INITIALIZER) - CRYPTO_once(&once, do_library_init); -#endif -} - -int CRYPTO_is_confidential_build(void) { -#if defined(BORINGSSL_CONFIDENTIAL) - return 1; -#else - return 0; -#endif -} - -int CRYPTO_has_asm(void) { -#if defined(OPENSSL_NO_ASM) - return 0; -#else - return 1; -#endif -} - -void CRYPTO_pre_sandbox_init(void) { - // Read from /proc/cpuinfo if needed. - CRYPTO_library_init(); - // Open /dev/urandom if needed. - CRYPTO_init_sysrand(); - // Set up MADV_WIPEONFORK state if needed. 
- CRYPTO_get_fork_generation(); -} - -const char *SSLeay_version(int which) { return OpenSSL_version(which); } - -const char *OpenSSL_version(int which) { - switch (which) { - case OPENSSL_VERSION: - return "BoringSSL"; - case OPENSSL_CFLAGS: - return "compiler: n/a"; - case OPENSSL_BUILT_ON: - return "built on: n/a"; - case OPENSSL_PLATFORM: - return "platform: n/a"; - case OPENSSL_DIR: - return "OPENSSLDIR: n/a"; - default: - return "not available"; - } -} - -unsigned long SSLeay(void) { return OPENSSL_VERSION_NUMBER; } - -unsigned long OpenSSL_version_num(void) { return OPENSSL_VERSION_NUMBER; } - -int CRYPTO_malloc_init(void) { return 1; } - -int OPENSSL_malloc_init(void) { return 1; } - -void ENGINE_load_builtin_engines(void) {} - -int ENGINE_register_all_complete(void) { return 1; } - -void OPENSSL_load_builtin_modules(void) {} - -int OPENSSL_init_crypto(uint64_t opts, const OPENSSL_INIT_SETTINGS *settings) { - CRYPTO_library_init(); - return 1; -} - -void OPENSSL_cleanup(void) {} diff --git a/third_party/boringssl/src/crypto/crypto.cc b/third_party/boringssl/src/crypto/crypto.cc new file mode 100644 index 00000000..28b5f544 --- /dev/null +++ b/third_party/boringssl/src/crypto/crypto.cc @@ -0,0 +1,152 @@ +// Copyright 2014 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include + +#include "bcm_support.h" +#include "fipsmodule/rand/internal.h" +#include "internal.h" + + +using namespace bssl; + +static_assert(sizeof(ossl_ssize_t) == sizeof(size_t), + "ossl_ssize_t should be the same size as size_t"); + + +// Our assembly does not use the GOT to reference symbols, which means +// references to visible symbols will often require a TEXTREL. This is +// undesirable, so all assembly-referenced symbols should be hidden. CPU +// capabilities are the only such symbols defined in C. Explicitly hide them, +// rather than rely on being built with -fvisibility=hidden. +#if defined(OPENSSL_WINDOWS) +#define HIDDEN +#else +#define HIDDEN __attribute__((visibility("hidden"))) +#endif + + +// The capability variables are defined in this file in order to work around a +// linker bug. When linking with a .a, if no symbols in a .o are referenced +// then the .o is discarded, even if it has constructor functions. +// +// This still means that any binaries that don't include some functionality +// that tests the capability values will still skip the constructor but, so +// far, the init constructor function only sets the capability variables. + +#if defined(BORINGSSL_DISPATCH_TEST) +// This value must be explicitly initialised to zero in order to work around a +// bug in libtool or the linker on OS X. +// +// If not initialised then it becomes a "common symbol". When put into an +// archive, linking on OS X will fail to resolve common symbols. By +// initialising it to zero, it becomes a "data symbol", which isn't so +// affected. +HIDDEN uint8_t bssl::BORINGSSL_function_hit[8] = {0}; +#endif + +#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64) + +// This value must be explicitly initialized to zero. See similar comment above. 
+HIDDEN uint32_t bssl::OPENSSL_ia32cap_P[4] = {0}; + +uint32_t bssl::OPENSSL_get_ia32cap(int idx) { + OPENSSL_init_cpuid(); + return OPENSSL_ia32cap_P[idx]; +} + +#elif (defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) && \ + !defined(OPENSSL_STATIC_ARMCAP) +HIDDEN uint32_t bssl::OPENSSL_armcap_P = 0; + +uint32_t *bssl::OPENSSL_get_armcap_pointer_for_test() { + OPENSSL_init_cpuid(); + return &OPENSSL_armcap_P; +} + +uint32_t bssl::OPENSSL_get_armcap() { + OPENSSL_init_cpuid(); + return OPENSSL_armcap_P; +} +#endif + +#if defined(NEED_CPUID) +static bssl::CRYPTO_once_t once = CRYPTO_ONCE_INIT; +void bssl::OPENSSL_init_cpuid() { CRYPTO_once(&once, OPENSSL_cpuid_setup); } +#endif + +void CRYPTO_library_init() {} + +int CRYPTO_is_confidential_build() { +#if defined(BORINGSSL_CONFIDENTIAL) + return 1; +#else + return 0; +#endif +} + +void CRYPTO_pre_sandbox_init() { + // Read from /proc/cpuinfo if needed. + OPENSSL_init_cpuid(); + // Open /dev/urandom if needed. + CRYPTO_init_sysrand(); + // Set up MADV_WIPEONFORK state if needed. 
+ CRYPTO_get_fork_generation(); +} + +const char *SSLeay_version(int which) { return OpenSSL_version(which); } + +const char *OpenSSL_version(int which) { + switch (which) { + case OPENSSL_VERSION: + return "BoringSSL"; + case OPENSSL_CFLAGS: + return "compiler: n/a"; + case OPENSSL_BUILT_ON: + return "built on: n/a"; + case OPENSSL_PLATFORM: + return "platform: n/a"; + case OPENSSL_DIR: + return "OPENSSLDIR: n/a"; + default: + return "not available"; + } +} + +unsigned long SSLeay() { return OPENSSL_VERSION_NUMBER; } + +unsigned long OpenSSL_version_num() { return OPENSSL_VERSION_NUMBER; } + +int CRYPTO_malloc_init() { return 1; } + +int OPENSSL_malloc_init() { return 1; } + +void ENGINE_load_builtin_engines() {} + +int ENGINE_register_all_complete() { return 1; } + +void ENGINE_cleanup() {} + +void OPENSSL_load_builtin_modules() {} + +int OPENSSL_init_crypto(uint64_t opts, const OPENSSL_INIT_SETTINGS *settings) { + return 1; +} + +void OPENSSL_cleanup() {} + +FILE *bssl::CRYPTO_get_stderr() { return stderr; } diff --git a/third_party/boringssl/src/crypto/curve25519/curve25519.c b/third_party/boringssl/src/crypto/curve25519/curve25519.c deleted file mode 100644 index 17740b83..00000000 --- a/third_party/boringssl/src/crypto/curve25519/curve25519.c +++ /dev/null @@ -1,2152 +0,0 @@ -/* Copyright (c) 2020, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -// Some of this code is taken from the ref10 version of Ed25519 in SUPERCOP -// 20141124 (http://bench.cr.yp.to/supercop.html). That code is released as -// public domain. Other parts have been replaced to call into code generated by -// Fiat (https://github.com/mit-plv/fiat-crypto) in //third_party/fiat. -// -// The field functions are shared by Ed25519 and X25519 where possible. - -#include - -#include -#include - -#include -#include -#include - -#include "internal.h" -#include "../internal.h" - - -// Various pre-computed constants. -#include "./curve25519_tables.h" - -#if defined(OPENSSL_NO_ASM) -#define FIAT_25519_NO_ASM -#endif - -#if defined(BORINGSSL_CURVE25519_64BIT) -#include "../../third_party/fiat/curve25519_64.h" -#else -#include "../../third_party/fiat/curve25519_32.h" -#endif // BORINGSSL_CURVE25519_64BIT - - -// Low-level intrinsic operations - -static uint64_t load_3(const uint8_t *in) { - uint64_t result; - result = (uint64_t)in[0]; - result |= ((uint64_t)in[1]) << 8; - result |= ((uint64_t)in[2]) << 16; - return result; -} - -static uint64_t load_4(const uint8_t *in) { - uint64_t result; - result = (uint64_t)in[0]; - result |= ((uint64_t)in[1]) << 8; - result |= ((uint64_t)in[2]) << 16; - result |= ((uint64_t)in[3]) << 24; - return result; -} - - -// Field operations. 
- -#if defined(BORINGSSL_CURVE25519_64BIT) - -typedef uint64_t fe_limb_t; -#define FE_NUM_LIMBS 5 - -// assert_fe asserts that |f| satisfies bounds: -// -// [[0x0 ~> 0x8cccccccccccc], -// [0x0 ~> 0x8cccccccccccc], -// [0x0 ~> 0x8cccccccccccc], -// [0x0 ~> 0x8cccccccccccc], -// [0x0 ~> 0x8cccccccccccc]] -// -// See comments in curve25519_64.h for which functions use these bounds for -// inputs or outputs. -#define assert_fe(f) \ - do { \ - for (unsigned _assert_fe_i = 0; _assert_fe_i < 5; _assert_fe_i++) { \ - assert(f[_assert_fe_i] <= UINT64_C(0x8cccccccccccc)); \ - } \ - } while (0) - -// assert_fe_loose asserts that |f| satisfies bounds: -// -// [[0x0 ~> 0x1a666666666664], -// [0x0 ~> 0x1a666666666664], -// [0x0 ~> 0x1a666666666664], -// [0x0 ~> 0x1a666666666664], -// [0x0 ~> 0x1a666666666664]] -// -// See comments in curve25519_64.h for which functions use these bounds for -// inputs or outputs. -#define assert_fe_loose(f) \ - do { \ - for (unsigned _assert_fe_i = 0; _assert_fe_i < 5; _assert_fe_i++) { \ - assert(f[_assert_fe_i] <= UINT64_C(0x1a666666666664)); \ - } \ - } while (0) - -#else - -typedef uint32_t fe_limb_t; -#define FE_NUM_LIMBS 10 - -// assert_fe asserts that |f| satisfies bounds: -// -// [[0x0 ~> 0x4666666], [0x0 ~> 0x2333333], -// [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], -// [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], -// [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], -// [0x0 ~> 0x4666666], [0x0 ~> 0x2333333]] -// -// See comments in curve25519_32.h for which functions use these bounds for -// inputs or outputs. -#define assert_fe(f) \ - do { \ - for (unsigned _assert_fe_i = 0; _assert_fe_i < 10; _assert_fe_i++) { \ - assert(f[_assert_fe_i] <= \ - ((_assert_fe_i & 1) ? 
0x2333333u : 0x4666666u)); \ - } \ - } while (0) - -// assert_fe_loose asserts that |f| satisfies bounds: -// -// [[0x0 ~> 0xd333332], [0x0 ~> 0x6999999], -// [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], -// [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], -// [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], -// [0x0 ~> 0xd333332], [0x0 ~> 0x6999999]] -// -// See comments in curve25519_32.h for which functions use these bounds for -// inputs or outputs. -#define assert_fe_loose(f) \ - do { \ - for (unsigned _assert_fe_i = 0; _assert_fe_i < 10; _assert_fe_i++) { \ - assert(f[_assert_fe_i] <= \ - ((_assert_fe_i & 1) ? 0x6999999u : 0xd333332u)); \ - } \ - } while (0) - -#endif // BORINGSSL_CURVE25519_64BIT - -static_assert(sizeof(fe) == sizeof(fe_limb_t) * FE_NUM_LIMBS, - "fe_limb_t[FE_NUM_LIMBS] is inconsistent with fe"); - -static void fe_frombytes_strict(fe *h, const uint8_t s[32]) { - // |fiat_25519_from_bytes| requires the top-most bit be clear. - assert((s[31] & 0x80) == 0); - fiat_25519_from_bytes(h->v, s); - assert_fe(h->v); -} - -static void fe_frombytes(fe *h, const uint8_t s[32]) { - uint8_t s_copy[32]; - OPENSSL_memcpy(s_copy, s, 32); - s_copy[31] &= 0x7f; - fe_frombytes_strict(h, s_copy); -} - -static void fe_tobytes(uint8_t s[32], const fe *f) { - assert_fe(f->v); - fiat_25519_to_bytes(s, f->v); -} - -// h = 0 -static void fe_0(fe *h) { - OPENSSL_memset(h, 0, sizeof(fe)); -} - -static void fe_loose_0(fe_loose *h) { - OPENSSL_memset(h, 0, sizeof(fe_loose)); -} - -// h = 1 -static void fe_1(fe *h) { - OPENSSL_memset(h, 0, sizeof(fe)); - h->v[0] = 1; -} - -static void fe_loose_1(fe_loose *h) { - OPENSSL_memset(h, 0, sizeof(fe_loose)); - h->v[0] = 1; -} - -// h = f + g -// Can overlap h with f or g. -static void fe_add(fe_loose *h, const fe *f, const fe *g) { - assert_fe(f->v); - assert_fe(g->v); - fiat_25519_add(h->v, f->v, g->v); - assert_fe_loose(h->v); -} - -// h = f - g -// Can overlap h with f or g. 
-static void fe_sub(fe_loose *h, const fe *f, const fe *g) { - assert_fe(f->v); - assert_fe(g->v); - fiat_25519_sub(h->v, f->v, g->v); - assert_fe_loose(h->v); -} - -static void fe_carry(fe *h, const fe_loose* f) { - assert_fe_loose(f->v); - fiat_25519_carry(h->v, f->v); - assert_fe(h->v); -} - -static void fe_mul_impl(fe_limb_t out[FE_NUM_LIMBS], - const fe_limb_t in1[FE_NUM_LIMBS], - const fe_limb_t in2[FE_NUM_LIMBS]) { - assert_fe_loose(in1); - assert_fe_loose(in2); - fiat_25519_carry_mul(out, in1, in2); - assert_fe(out); -} - -static void fe_mul_ltt(fe_loose *h, const fe *f, const fe *g) { - fe_mul_impl(h->v, f->v, g->v); -} - -static void fe_mul_llt(fe_loose *h, const fe_loose *f, const fe *g) { - fe_mul_impl(h->v, f->v, g->v); -} - -static void fe_mul_ttt(fe *h, const fe *f, const fe *g) { - fe_mul_impl(h->v, f->v, g->v); -} - -static void fe_mul_tlt(fe *h, const fe_loose *f, const fe *g) { - fe_mul_impl(h->v, f->v, g->v); -} - -static void fe_mul_ttl(fe *h, const fe *f, const fe_loose *g) { - fe_mul_impl(h->v, f->v, g->v); -} - -static void fe_mul_tll(fe *h, const fe_loose *f, const fe_loose *g) { - fe_mul_impl(h->v, f->v, g->v); -} - -static void fe_sq_tl(fe *h, const fe_loose *f) { - assert_fe_loose(f->v); - fiat_25519_carry_square(h->v, f->v); - assert_fe(h->v); -} - -static void fe_sq_tt(fe *h, const fe *f) { - assert_fe_loose(f->v); - fiat_25519_carry_square(h->v, f->v); - assert_fe(h->v); -} - -// Replace (f,g) with (g,f) if b == 1; -// replace (f,g) with (f,g) if b == 0. -// -// Preconditions: b in {0,1}. 
-static void fe_cswap(fe *f, fe *g, fe_limb_t b) { - b = 0-b; - for (unsigned i = 0; i < FE_NUM_LIMBS; i++) { - fe_limb_t x = f->v[i] ^ g->v[i]; - x &= b; - f->v[i] ^= x; - g->v[i] ^= x; - } -} - -static void fe_mul121666(fe *h, const fe_loose *f) { - assert_fe_loose(f->v); - fiat_25519_carry_scmul_121666(h->v, f->v); - assert_fe(h->v); -} - -// h = -f -static void fe_neg(fe_loose *h, const fe *f) { - assert_fe(f->v); - fiat_25519_opp(h->v, f->v); - assert_fe_loose(h->v); -} - -// Replace (f,g) with (g,g) if b == 1; -// replace (f,g) with (f,g) if b == 0. -// -// Preconditions: b in {0,1}. -static void fe_cmov(fe_loose *f, const fe_loose *g, fe_limb_t b) { - // Silence an unused function warning. |fiat_25519_selectznz| isn't quite the - // calling convention the rest of this code wants, so implement it by hand. - // - // TODO(davidben): Switch to fiat's calling convention, or ask fiat to emit a - // different one. - (void)fiat_25519_selectznz; - - b = 0-b; - for (unsigned i = 0; i < FE_NUM_LIMBS; i++) { - fe_limb_t x = f->v[i] ^ g->v[i]; - x &= b; - f->v[i] ^= x; - } -} - -// h = f -static void fe_copy(fe *h, const fe *f) { - OPENSSL_memmove(h, f, sizeof(fe)); -} - -static void fe_copy_lt(fe_loose *h, const fe *f) { - static_assert(sizeof(fe_loose) == sizeof(fe), "fe and fe_loose mismatch"); - OPENSSL_memmove(h, f, sizeof(fe)); -} -#if !defined(OPENSSL_SMALL) -static void fe_copy_ll(fe_loose *h, const fe_loose *f) { - OPENSSL_memmove(h, f, sizeof(fe_loose)); -} -#endif // !defined(OPENSSL_SMALL) - -static void fe_loose_invert(fe *out, const fe_loose *z) { - fe t0; - fe t1; - fe t2; - fe t3; - int i; - - fe_sq_tl(&t0, z); - fe_sq_tt(&t1, &t0); - for (i = 1; i < 2; ++i) { - fe_sq_tt(&t1, &t1); - } - fe_mul_tlt(&t1, z, &t1); - fe_mul_ttt(&t0, &t0, &t1); - fe_sq_tt(&t2, &t0); - fe_mul_ttt(&t1, &t1, &t2); - fe_sq_tt(&t2, &t1); - for (i = 1; i < 5; ++i) { - fe_sq_tt(&t2, &t2); - } - fe_mul_ttt(&t1, &t2, &t1); - fe_sq_tt(&t2, &t1); - for (i = 1; i < 10; ++i) { - 
fe_sq_tt(&t2, &t2); - } - fe_mul_ttt(&t2, &t2, &t1); - fe_sq_tt(&t3, &t2); - for (i = 1; i < 20; ++i) { - fe_sq_tt(&t3, &t3); - } - fe_mul_ttt(&t2, &t3, &t2); - fe_sq_tt(&t2, &t2); - for (i = 1; i < 10; ++i) { - fe_sq_tt(&t2, &t2); - } - fe_mul_ttt(&t1, &t2, &t1); - fe_sq_tt(&t2, &t1); - for (i = 1; i < 50; ++i) { - fe_sq_tt(&t2, &t2); - } - fe_mul_ttt(&t2, &t2, &t1); - fe_sq_tt(&t3, &t2); - for (i = 1; i < 100; ++i) { - fe_sq_tt(&t3, &t3); - } - fe_mul_ttt(&t2, &t3, &t2); - fe_sq_tt(&t2, &t2); - for (i = 1; i < 50; ++i) { - fe_sq_tt(&t2, &t2); - } - fe_mul_ttt(&t1, &t2, &t1); - fe_sq_tt(&t1, &t1); - for (i = 1; i < 5; ++i) { - fe_sq_tt(&t1, &t1); - } - fe_mul_ttt(out, &t1, &t0); -} - -static void fe_invert(fe *out, const fe *z) { - fe_loose l; - fe_copy_lt(&l, z); - fe_loose_invert(out, &l); -} - -// return 0 if f == 0 -// return 1 if f != 0 -static int fe_isnonzero(const fe_loose *f) { - fe tight; - fe_carry(&tight, f); - uint8_t s[32]; - fe_tobytes(s, &tight); - - static const uint8_t zero[32] = {0}; - return CRYPTO_memcmp(s, zero, sizeof(zero)) != 0; -} - -// return 1 if f is in {1,3,5,...,q-2} -// return 0 if f is in {0,2,4,...,q-1} -static int fe_isnegative(const fe *f) { - uint8_t s[32]; - fe_tobytes(s, f); - return s[0] & 1; -} - -static void fe_sq2_tt(fe *h, const fe *f) { - // h = f^2 - fe_sq_tt(h, f); - - // h = h + h - fe_loose tmp; - fe_add(&tmp, h, h); - fe_carry(h, &tmp); -} - -static void fe_pow22523(fe *out, const fe *z) { - fe t0; - fe t1; - fe t2; - int i; - - fe_sq_tt(&t0, z); - fe_sq_tt(&t1, &t0); - for (i = 1; i < 2; ++i) { - fe_sq_tt(&t1, &t1); - } - fe_mul_ttt(&t1, z, &t1); - fe_mul_ttt(&t0, &t0, &t1); - fe_sq_tt(&t0, &t0); - fe_mul_ttt(&t0, &t1, &t0); - fe_sq_tt(&t1, &t0); - for (i = 1; i < 5; ++i) { - fe_sq_tt(&t1, &t1); - } - fe_mul_ttt(&t0, &t1, &t0); - fe_sq_tt(&t1, &t0); - for (i = 1; i < 10; ++i) { - fe_sq_tt(&t1, &t1); - } - fe_mul_ttt(&t1, &t1, &t0); - fe_sq_tt(&t2, &t1); - for (i = 1; i < 20; ++i) { - fe_sq_tt(&t2, &t2); - } - 
fe_mul_ttt(&t1, &t2, &t1); - fe_sq_tt(&t1, &t1); - for (i = 1; i < 10; ++i) { - fe_sq_tt(&t1, &t1); - } - fe_mul_ttt(&t0, &t1, &t0); - fe_sq_tt(&t1, &t0); - for (i = 1; i < 50; ++i) { - fe_sq_tt(&t1, &t1); - } - fe_mul_ttt(&t1, &t1, &t0); - fe_sq_tt(&t2, &t1); - for (i = 1; i < 100; ++i) { - fe_sq_tt(&t2, &t2); - } - fe_mul_ttt(&t1, &t2, &t1); - fe_sq_tt(&t1, &t1); - for (i = 1; i < 50; ++i) { - fe_sq_tt(&t1, &t1); - } - fe_mul_ttt(&t0, &t1, &t0); - fe_sq_tt(&t0, &t0); - for (i = 1; i < 2; ++i) { - fe_sq_tt(&t0, &t0); - } - fe_mul_ttt(out, &t0, z); -} - - -// Group operations. - -void x25519_ge_tobytes(uint8_t s[32], const ge_p2 *h) { - fe recip; - fe x; - fe y; - - fe_invert(&recip, &h->Z); - fe_mul_ttt(&x, &h->X, &recip); - fe_mul_ttt(&y, &h->Y, &recip); - fe_tobytes(s, &y); - s[31] ^= fe_isnegative(&x) << 7; -} - -static void ge_p3_tobytes(uint8_t s[32], const ge_p3 *h) { - fe recip; - fe x; - fe y; - - fe_invert(&recip, &h->Z); - fe_mul_ttt(&x, &h->X, &recip); - fe_mul_ttt(&y, &h->Y, &recip); - fe_tobytes(s, &y); - s[31] ^= fe_isnegative(&x) << 7; -} - -int x25519_ge_frombytes_vartime(ge_p3 *h, const uint8_t s[32]) { - fe u; - fe_loose v; - fe w; - fe vxx; - fe_loose check; - - fe_frombytes(&h->Y, s); - fe_1(&h->Z); - fe_sq_tt(&w, &h->Y); - fe_mul_ttt(&vxx, &w, &d); - fe_sub(&v, &w, &h->Z); // u = y^2-1 - fe_carry(&u, &v); - fe_add(&v, &vxx, &h->Z); // v = dy^2+1 - - fe_mul_ttl(&w, &u, &v); // w = u*v - fe_pow22523(&h->X, &w); // x = w^((q-5)/8) - fe_mul_ttt(&h->X, &h->X, &u); // x = u*w^((q-5)/8) - - fe_sq_tt(&vxx, &h->X); - fe_mul_ttl(&vxx, &vxx, &v); - fe_sub(&check, &vxx, &u); - if (fe_isnonzero(&check)) { - fe_add(&check, &vxx, &u); - if (fe_isnonzero(&check)) { - return 0; - } - fe_mul_ttt(&h->X, &h->X, &sqrtm1); - } - - if (fe_isnegative(&h->X) != (s[31] >> 7)) { - fe_loose t; - fe_neg(&t, &h->X); - fe_carry(&h->X, &t); - } - - fe_mul_ttt(&h->T, &h->X, &h->Y); - return 1; -} - -static void ge_p2_0(ge_p2 *h) { - fe_0(&h->X); - fe_1(&h->Y); - fe_1(&h->Z); 
-} - -static void ge_p3_0(ge_p3 *h) { - fe_0(&h->X); - fe_1(&h->Y); - fe_1(&h->Z); - fe_0(&h->T); -} - -static void ge_cached_0(ge_cached *h) { - fe_loose_1(&h->YplusX); - fe_loose_1(&h->YminusX); - fe_loose_1(&h->Z); - fe_loose_0(&h->T2d); -} - -static void ge_precomp_0(ge_precomp *h) { - fe_loose_1(&h->yplusx); - fe_loose_1(&h->yminusx); - fe_loose_0(&h->xy2d); -} - -// r = p -static void ge_p3_to_p2(ge_p2 *r, const ge_p3 *p) { - fe_copy(&r->X, &p->X); - fe_copy(&r->Y, &p->Y); - fe_copy(&r->Z, &p->Z); -} - -// r = p -void x25519_ge_p3_to_cached(ge_cached *r, const ge_p3 *p) { - fe_add(&r->YplusX, &p->Y, &p->X); - fe_sub(&r->YminusX, &p->Y, &p->X); - fe_copy_lt(&r->Z, &p->Z); - fe_mul_ltt(&r->T2d, &p->T, &d2); -} - -// r = p -void x25519_ge_p1p1_to_p2(ge_p2 *r, const ge_p1p1 *p) { - fe_mul_tll(&r->X, &p->X, &p->T); - fe_mul_tll(&r->Y, &p->Y, &p->Z); - fe_mul_tll(&r->Z, &p->Z, &p->T); -} - -// r = p -void x25519_ge_p1p1_to_p3(ge_p3 *r, const ge_p1p1 *p) { - fe_mul_tll(&r->X, &p->X, &p->T); - fe_mul_tll(&r->Y, &p->Y, &p->Z); - fe_mul_tll(&r->Z, &p->Z, &p->T); - fe_mul_tll(&r->T, &p->X, &p->Y); -} - -// r = p -static void ge_p1p1_to_cached(ge_cached *r, const ge_p1p1 *p) { - ge_p3 t; - x25519_ge_p1p1_to_p3(&t, p); - x25519_ge_p3_to_cached(r, &t); -} - -// r = 2 * p -static void ge_p2_dbl(ge_p1p1 *r, const ge_p2 *p) { - fe trX, trZ, trT; - fe t0; - - fe_sq_tt(&trX, &p->X); - fe_sq_tt(&trZ, &p->Y); - fe_sq2_tt(&trT, &p->Z); - fe_add(&r->Y, &p->X, &p->Y); - fe_sq_tl(&t0, &r->Y); - - fe_add(&r->Y, &trZ, &trX); - fe_sub(&r->Z, &trZ, &trX); - fe_carry(&trZ, &r->Y); - fe_sub(&r->X, &t0, &trZ); - fe_carry(&trZ, &r->Z); - fe_sub(&r->T, &trT, &trZ); -} - -// r = 2 * p -static void ge_p3_dbl(ge_p1p1 *r, const ge_p3 *p) { - ge_p2 q; - ge_p3_to_p2(&q, p); - ge_p2_dbl(r, &q); -} - -// r = p + q -static void ge_madd(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q) { - fe trY, trZ, trT; - - fe_add(&r->X, &p->Y, &p->X); - fe_sub(&r->Y, &p->Y, &p->X); - fe_mul_tll(&trZ, &r->X, 
&q->yplusx); - fe_mul_tll(&trY, &r->Y, &q->yminusx); - fe_mul_tlt(&trT, &q->xy2d, &p->T); - fe_add(&r->T, &p->Z, &p->Z); - fe_sub(&r->X, &trZ, &trY); - fe_add(&r->Y, &trZ, &trY); - fe_carry(&trZ, &r->T); - fe_add(&r->Z, &trZ, &trT); - fe_sub(&r->T, &trZ, &trT); -} - -// r = p - q -static void ge_msub(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q) { - fe trY, trZ, trT; - - fe_add(&r->X, &p->Y, &p->X); - fe_sub(&r->Y, &p->Y, &p->X); - fe_mul_tll(&trZ, &r->X, &q->yminusx); - fe_mul_tll(&trY, &r->Y, &q->yplusx); - fe_mul_tlt(&trT, &q->xy2d, &p->T); - fe_add(&r->T, &p->Z, &p->Z); - fe_sub(&r->X, &trZ, &trY); - fe_add(&r->Y, &trZ, &trY); - fe_carry(&trZ, &r->T); - fe_sub(&r->Z, &trZ, &trT); - fe_add(&r->T, &trZ, &trT); -} - -// r = p + q -void x25519_ge_add(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) { - fe trX, trY, trZ, trT; - - fe_add(&r->X, &p->Y, &p->X); - fe_sub(&r->Y, &p->Y, &p->X); - fe_mul_tll(&trZ, &r->X, &q->YplusX); - fe_mul_tll(&trY, &r->Y, &q->YminusX); - fe_mul_tlt(&trT, &q->T2d, &p->T); - fe_mul_ttl(&trX, &p->Z, &q->Z); - fe_add(&r->T, &trX, &trX); - fe_sub(&r->X, &trZ, &trY); - fe_add(&r->Y, &trZ, &trY); - fe_carry(&trZ, &r->T); - fe_add(&r->Z, &trZ, &trT); - fe_sub(&r->T, &trZ, &trT); -} - -// r = p - q -void x25519_ge_sub(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) { - fe trX, trY, trZ, trT; - - fe_add(&r->X, &p->Y, &p->X); - fe_sub(&r->Y, &p->Y, &p->X); - fe_mul_tll(&trZ, &r->X, &q->YminusX); - fe_mul_tll(&trY, &r->Y, &q->YplusX); - fe_mul_tlt(&trT, &q->T2d, &p->T); - fe_mul_ttl(&trX, &p->Z, &q->Z); - fe_add(&r->T, &trX, &trX); - fe_sub(&r->X, &trZ, &trY); - fe_add(&r->Y, &trZ, &trY); - fe_carry(&trZ, &r->T); - fe_sub(&r->Z, &trZ, &trT); - fe_add(&r->T, &trZ, &trT); -} - -static uint8_t equal(signed char b, signed char c) { - uint8_t ub = b; - uint8_t uc = c; - uint8_t x = ub ^ uc; // 0: yes; 1..255: no - uint32_t y = x; // 0: yes; 1..255: no - y -= 1; // 4294967295: yes; 0..254: no - y >>= 31; // 1: yes; 0: no - return y; -} - -static void 
cmov(ge_precomp *t, const ge_precomp *u, uint8_t b) { - fe_cmov(&t->yplusx, &u->yplusx, b); - fe_cmov(&t->yminusx, &u->yminusx, b); - fe_cmov(&t->xy2d, &u->xy2d, b); -} - -void x25519_ge_scalarmult_small_precomp( - ge_p3 *h, const uint8_t a[32], const uint8_t precomp_table[15 * 2 * 32]) { - // precomp_table is first expanded into matching |ge_precomp| - // elements. - ge_precomp multiples[15]; - - unsigned i; - for (i = 0; i < 15; i++) { - // The precomputed table is assumed to already clear the top bit, so - // |fe_frombytes_strict| may be used directly. - const uint8_t *bytes = &precomp_table[i*(2 * 32)]; - fe x, y; - fe_frombytes_strict(&x, bytes); - fe_frombytes_strict(&y, bytes + 32); - - ge_precomp *out = &multiples[i]; - fe_add(&out->yplusx, &y, &x); - fe_sub(&out->yminusx, &y, &x); - fe_mul_ltt(&out->xy2d, &x, &y); - fe_mul_llt(&out->xy2d, &out->xy2d, &d2); - } - - // See the comment above |k25519SmallPrecomp| about the structure of the - // precomputed elements. This loop does 64 additions and 64 doublings to - // calculate the result. 
- ge_p3_0(h); - - for (i = 63; i < 64; i--) { - unsigned j; - signed char index = 0; - - for (j = 0; j < 4; j++) { - const uint8_t bit = 1 & (a[(8 * j) + (i / 8)] >> (i & 7)); - index |= (bit << j); - } - - ge_precomp e; - ge_precomp_0(&e); - - for (j = 1; j < 16; j++) { - cmov(&e, &multiples[j-1], equal(index, j)); - } - - ge_cached cached; - ge_p1p1 r; - x25519_ge_p3_to_cached(&cached, h); - x25519_ge_add(&r, h, &cached); - x25519_ge_p1p1_to_p3(h, &r); - - ge_madd(&r, h, &e); - x25519_ge_p1p1_to_p3(h, &r); - } -} - -#if defined(OPENSSL_SMALL) - -void x25519_ge_scalarmult_base(ge_p3 *h, const uint8_t a[32]) { - x25519_ge_scalarmult_small_precomp(h, a, k25519SmallPrecomp); -} - -#else - -static uint8_t negative(signed char b) { - uint32_t x = b; - x >>= 31; // 1: yes; 0: no - return x; -} - -static void table_select(ge_precomp *t, int pos, signed char b) { - ge_precomp minust; - uint8_t bnegative = negative(b); - uint8_t babs = b - ((uint8_t)((-bnegative) & b) << 1); - - ge_precomp_0(t); - cmov(t, &k25519Precomp[pos][0], equal(babs, 1)); - cmov(t, &k25519Precomp[pos][1], equal(babs, 2)); - cmov(t, &k25519Precomp[pos][2], equal(babs, 3)); - cmov(t, &k25519Precomp[pos][3], equal(babs, 4)); - cmov(t, &k25519Precomp[pos][4], equal(babs, 5)); - cmov(t, &k25519Precomp[pos][5], equal(babs, 6)); - cmov(t, &k25519Precomp[pos][6], equal(babs, 7)); - cmov(t, &k25519Precomp[pos][7], equal(babs, 8)); - fe_copy_ll(&minust.yplusx, &t->yminusx); - fe_copy_ll(&minust.yminusx, &t->yplusx); - - // NOTE: the input table is canonical, but types don't encode it - fe tmp; - fe_carry(&tmp, &t->xy2d); - fe_neg(&minust.xy2d, &tmp); - - cmov(t, &minust, bnegative); -} - -// h = a * B -// where a = a[0]+256*a[1]+...+256^31 a[31] -// B is the Ed25519 base point (x,4/5) with x positive. 
-// -// Preconditions: -// a[31] <= 127 -void x25519_ge_scalarmult_base(ge_p3 *h, const uint8_t a[32]) { - signed char e[64]; - signed char carry; - ge_p1p1 r; - ge_p2 s; - ge_precomp t; - int i; - - for (i = 0; i < 32; ++i) { - e[2 * i + 0] = (a[i] >> 0) & 15; - e[2 * i + 1] = (a[i] >> 4) & 15; - } - // each e[i] is between 0 and 15 - // e[63] is between 0 and 7 - - carry = 0; - for (i = 0; i < 63; ++i) { - e[i] += carry; - carry = e[i] + 8; - carry >>= 4; - e[i] -= carry << 4; - } - e[63] += carry; - // each e[i] is between -8 and 8 - - ge_p3_0(h); - for (i = 1; i < 64; i += 2) { - table_select(&t, i / 2, e[i]); - ge_madd(&r, h, &t); - x25519_ge_p1p1_to_p3(h, &r); - } - - ge_p3_dbl(&r, h); - x25519_ge_p1p1_to_p2(&s, &r); - ge_p2_dbl(&r, &s); - x25519_ge_p1p1_to_p2(&s, &r); - ge_p2_dbl(&r, &s); - x25519_ge_p1p1_to_p2(&s, &r); - ge_p2_dbl(&r, &s); - x25519_ge_p1p1_to_p3(h, &r); - - for (i = 0; i < 64; i += 2) { - table_select(&t, i / 2, e[i]); - ge_madd(&r, h, &t); - x25519_ge_p1p1_to_p3(h, &r); - } -} - -#endif - -static void cmov_cached(ge_cached *t, ge_cached *u, uint8_t b) { - fe_cmov(&t->YplusX, &u->YplusX, b); - fe_cmov(&t->YminusX, &u->YminusX, b); - fe_cmov(&t->Z, &u->Z, b); - fe_cmov(&t->T2d, &u->T2d, b); -} - -// r = scalar * A. -// where a = a[0]+256*a[1]+...+256^31 a[31]. 
-void x25519_ge_scalarmult(ge_p2 *r, const uint8_t *scalar, const ge_p3 *A) { - ge_p2 Ai_p2[8]; - ge_cached Ai[16]; - ge_p1p1 t; - - ge_cached_0(&Ai[0]); - x25519_ge_p3_to_cached(&Ai[1], A); - ge_p3_to_p2(&Ai_p2[1], A); - - unsigned i; - for (i = 2; i < 16; i += 2) { - ge_p2_dbl(&t, &Ai_p2[i / 2]); - ge_p1p1_to_cached(&Ai[i], &t); - if (i < 8) { - x25519_ge_p1p1_to_p2(&Ai_p2[i], &t); - } - x25519_ge_add(&t, A, &Ai[i]); - ge_p1p1_to_cached(&Ai[i + 1], &t); - if (i < 7) { - x25519_ge_p1p1_to_p2(&Ai_p2[i + 1], &t); - } - } - - ge_p2_0(r); - ge_p3 u; - - for (i = 0; i < 256; i += 4) { - ge_p2_dbl(&t, r); - x25519_ge_p1p1_to_p2(r, &t); - ge_p2_dbl(&t, r); - x25519_ge_p1p1_to_p2(r, &t); - ge_p2_dbl(&t, r); - x25519_ge_p1p1_to_p2(r, &t); - ge_p2_dbl(&t, r); - x25519_ge_p1p1_to_p3(&u, &t); - - uint8_t index = scalar[31 - i/8]; - index >>= 4 - (i & 4); - index &= 0xf; - - unsigned j; - ge_cached selected; - ge_cached_0(&selected); - for (j = 0; j < 16; j++) { - cmov_cached(&selected, &Ai[j], equal(j, index)); - } - - x25519_ge_add(&t, &u, &selected); - x25519_ge_p1p1_to_p2(r, &t); - } -} - -static void slide(signed char *r, const uint8_t *a) { - int i; - int b; - int k; - - for (i = 0; i < 256; ++i) { - r[i] = 1 & (a[i >> 3] >> (i & 7)); - } - - for (i = 0; i < 256; ++i) { - if (r[i]) { - for (b = 1; b <= 6 && i + b < 256; ++b) { - if (r[i + b]) { - if (r[i] + (r[i + b] << b) <= 15) { - r[i] += r[i + b] << b; - r[i + b] = 0; - } else if (r[i] - (r[i + b] << b) >= -15) { - r[i] -= r[i + b] << b; - for (k = i + b; k < 256; ++k) { - if (!r[k]) { - r[k] = 1; - break; - } - r[k] = 0; - } - } else { - break; - } - } - } - } - } -} - -// r = a * A + b * B -// where a = a[0]+256*a[1]+...+256^31 a[31]. -// and b = b[0]+256*b[1]+...+256^31 b[31]. -// B is the Ed25519 base point (x,4/5) with x positive. 
-static void ge_double_scalarmult_vartime(ge_p2 *r, const uint8_t *a, - const ge_p3 *A, const uint8_t *b) { - signed char aslide[256]; - signed char bslide[256]; - ge_cached Ai[8]; // A,3A,5A,7A,9A,11A,13A,15A - ge_p1p1 t; - ge_p3 u; - ge_p3 A2; - int i; - - slide(aslide, a); - slide(bslide, b); - - x25519_ge_p3_to_cached(&Ai[0], A); - ge_p3_dbl(&t, A); - x25519_ge_p1p1_to_p3(&A2, &t); - x25519_ge_add(&t, &A2, &Ai[0]); - x25519_ge_p1p1_to_p3(&u, &t); - x25519_ge_p3_to_cached(&Ai[1], &u); - x25519_ge_add(&t, &A2, &Ai[1]); - x25519_ge_p1p1_to_p3(&u, &t); - x25519_ge_p3_to_cached(&Ai[2], &u); - x25519_ge_add(&t, &A2, &Ai[2]); - x25519_ge_p1p1_to_p3(&u, &t); - x25519_ge_p3_to_cached(&Ai[3], &u); - x25519_ge_add(&t, &A2, &Ai[3]); - x25519_ge_p1p1_to_p3(&u, &t); - x25519_ge_p3_to_cached(&Ai[4], &u); - x25519_ge_add(&t, &A2, &Ai[4]); - x25519_ge_p1p1_to_p3(&u, &t); - x25519_ge_p3_to_cached(&Ai[5], &u); - x25519_ge_add(&t, &A2, &Ai[5]); - x25519_ge_p1p1_to_p3(&u, &t); - x25519_ge_p3_to_cached(&Ai[6], &u); - x25519_ge_add(&t, &A2, &Ai[6]); - x25519_ge_p1p1_to_p3(&u, &t); - x25519_ge_p3_to_cached(&Ai[7], &u); - - ge_p2_0(r); - - for (i = 255; i >= 0; --i) { - if (aslide[i] || bslide[i]) { - break; - } - } - - for (; i >= 0; --i) { - ge_p2_dbl(&t, r); - - if (aslide[i] > 0) { - x25519_ge_p1p1_to_p3(&u, &t); - x25519_ge_add(&t, &u, &Ai[aslide[i] / 2]); - } else if (aslide[i] < 0) { - x25519_ge_p1p1_to_p3(&u, &t); - x25519_ge_sub(&t, &u, &Ai[(-aslide[i]) / 2]); - } - - if (bslide[i] > 0) { - x25519_ge_p1p1_to_p3(&u, &t); - ge_madd(&t, &u, &Bi[bslide[i] / 2]); - } else if (bslide[i] < 0) { - x25519_ge_p1p1_to_p3(&u, &t); - ge_msub(&t, &u, &Bi[(-bslide[i]) / 2]); - } - - x25519_ge_p1p1_to_p2(r, &t); - } -} - -// int64_lshift21 returns |a << 21| but is defined when shifting bits into the -// sign bit. This works around a language flaw in C. 
-static inline int64_t int64_lshift21(int64_t a) { - return (int64_t)((uint64_t)a << 21); -} - -// The set of scalars is \Z/l -// where l = 2^252 + 27742317777372353535851937790883648493. - -// Input: -// s[0]+256*s[1]+...+256^63*s[63] = s -// -// Output: -// s[0]+256*s[1]+...+256^31*s[31] = s mod l -// where l = 2^252 + 27742317777372353535851937790883648493. -// Overwrites s in place. -void x25519_sc_reduce(uint8_t s[64]) { - int64_t s0 = 2097151 & load_3(s); - int64_t s1 = 2097151 & (load_4(s + 2) >> 5); - int64_t s2 = 2097151 & (load_3(s + 5) >> 2); - int64_t s3 = 2097151 & (load_4(s + 7) >> 7); - int64_t s4 = 2097151 & (load_4(s + 10) >> 4); - int64_t s5 = 2097151 & (load_3(s + 13) >> 1); - int64_t s6 = 2097151 & (load_4(s + 15) >> 6); - int64_t s7 = 2097151 & (load_3(s + 18) >> 3); - int64_t s8 = 2097151 & load_3(s + 21); - int64_t s9 = 2097151 & (load_4(s + 23) >> 5); - int64_t s10 = 2097151 & (load_3(s + 26) >> 2); - int64_t s11 = 2097151 & (load_4(s + 28) >> 7); - int64_t s12 = 2097151 & (load_4(s + 31) >> 4); - int64_t s13 = 2097151 & (load_3(s + 34) >> 1); - int64_t s14 = 2097151 & (load_4(s + 36) >> 6); - int64_t s15 = 2097151 & (load_3(s + 39) >> 3); - int64_t s16 = 2097151 & load_3(s + 42); - int64_t s17 = 2097151 & (load_4(s + 44) >> 5); - int64_t s18 = 2097151 & (load_3(s + 47) >> 2); - int64_t s19 = 2097151 & (load_4(s + 49) >> 7); - int64_t s20 = 2097151 & (load_4(s + 52) >> 4); - int64_t s21 = 2097151 & (load_3(s + 55) >> 1); - int64_t s22 = 2097151 & (load_4(s + 57) >> 6); - int64_t s23 = (load_4(s + 60) >> 3); - int64_t carry0; - int64_t carry1; - int64_t carry2; - int64_t carry3; - int64_t carry4; - int64_t carry5; - int64_t carry6; - int64_t carry7; - int64_t carry8; - int64_t carry9; - int64_t carry10; - int64_t carry11; - int64_t carry12; - int64_t carry13; - int64_t carry14; - int64_t carry15; - int64_t carry16; - - s11 += s23 * 666643; - s12 += s23 * 470296; - s13 += s23 * 654183; - s14 -= s23 * 997805; - s15 += s23 * 136657; - s16 -= s23 
* 683901; - s23 = 0; - - s10 += s22 * 666643; - s11 += s22 * 470296; - s12 += s22 * 654183; - s13 -= s22 * 997805; - s14 += s22 * 136657; - s15 -= s22 * 683901; - s22 = 0; - - s9 += s21 * 666643; - s10 += s21 * 470296; - s11 += s21 * 654183; - s12 -= s21 * 997805; - s13 += s21 * 136657; - s14 -= s21 * 683901; - s21 = 0; - - s8 += s20 * 666643; - s9 += s20 * 470296; - s10 += s20 * 654183; - s11 -= s20 * 997805; - s12 += s20 * 136657; - s13 -= s20 * 683901; - s20 = 0; - - s7 += s19 * 666643; - s8 += s19 * 470296; - s9 += s19 * 654183; - s10 -= s19 * 997805; - s11 += s19 * 136657; - s12 -= s19 * 683901; - s19 = 0; - - s6 += s18 * 666643; - s7 += s18 * 470296; - s8 += s18 * 654183; - s9 -= s18 * 997805; - s10 += s18 * 136657; - s11 -= s18 * 683901; - s18 = 0; - - carry6 = (s6 + (1 << 20)) >> 21; - s7 += carry6; - s6 -= int64_lshift21(carry6); - carry8 = (s8 + (1 << 20)) >> 21; - s9 += carry8; - s8 -= int64_lshift21(carry8); - carry10 = (s10 + (1 << 20)) >> 21; - s11 += carry10; - s10 -= int64_lshift21(carry10); - carry12 = (s12 + (1 << 20)) >> 21; - s13 += carry12; - s12 -= int64_lshift21(carry12); - carry14 = (s14 + (1 << 20)) >> 21; - s15 += carry14; - s14 -= int64_lshift21(carry14); - carry16 = (s16 + (1 << 20)) >> 21; - s17 += carry16; - s16 -= int64_lshift21(carry16); - - carry7 = (s7 + (1 << 20)) >> 21; - s8 += carry7; - s7 -= int64_lshift21(carry7); - carry9 = (s9 + (1 << 20)) >> 21; - s10 += carry9; - s9 -= int64_lshift21(carry9); - carry11 = (s11 + (1 << 20)) >> 21; - s12 += carry11; - s11 -= int64_lshift21(carry11); - carry13 = (s13 + (1 << 20)) >> 21; - s14 += carry13; - s13 -= int64_lshift21(carry13); - carry15 = (s15 + (1 << 20)) >> 21; - s16 += carry15; - s15 -= int64_lshift21(carry15); - - s5 += s17 * 666643; - s6 += s17 * 470296; - s7 += s17 * 654183; - s8 -= s17 * 997805; - s9 += s17 * 136657; - s10 -= s17 * 683901; - s17 = 0; - - s4 += s16 * 666643; - s5 += s16 * 470296; - s6 += s16 * 654183; - s7 -= s16 * 997805; - s8 += s16 * 136657; - s9 -= s16 * 
683901; - s16 = 0; - - s3 += s15 * 666643; - s4 += s15 * 470296; - s5 += s15 * 654183; - s6 -= s15 * 997805; - s7 += s15 * 136657; - s8 -= s15 * 683901; - s15 = 0; - - s2 += s14 * 666643; - s3 += s14 * 470296; - s4 += s14 * 654183; - s5 -= s14 * 997805; - s6 += s14 * 136657; - s7 -= s14 * 683901; - s14 = 0; - - s1 += s13 * 666643; - s2 += s13 * 470296; - s3 += s13 * 654183; - s4 -= s13 * 997805; - s5 += s13 * 136657; - s6 -= s13 * 683901; - s13 = 0; - - s0 += s12 * 666643; - s1 += s12 * 470296; - s2 += s12 * 654183; - s3 -= s12 * 997805; - s4 += s12 * 136657; - s5 -= s12 * 683901; - s12 = 0; - - carry0 = (s0 + (1 << 20)) >> 21; - s1 += carry0; - s0 -= int64_lshift21(carry0); - carry2 = (s2 + (1 << 20)) >> 21; - s3 += carry2; - s2 -= int64_lshift21(carry2); - carry4 = (s4 + (1 << 20)) >> 21; - s5 += carry4; - s4 -= int64_lshift21(carry4); - carry6 = (s6 + (1 << 20)) >> 21; - s7 += carry6; - s6 -= int64_lshift21(carry6); - carry8 = (s8 + (1 << 20)) >> 21; - s9 += carry8; - s8 -= int64_lshift21(carry8); - carry10 = (s10 + (1 << 20)) >> 21; - s11 += carry10; - s10 -= int64_lshift21(carry10); - - carry1 = (s1 + (1 << 20)) >> 21; - s2 += carry1; - s1 -= int64_lshift21(carry1); - carry3 = (s3 + (1 << 20)) >> 21; - s4 += carry3; - s3 -= int64_lshift21(carry3); - carry5 = (s5 + (1 << 20)) >> 21; - s6 += carry5; - s5 -= int64_lshift21(carry5); - carry7 = (s7 + (1 << 20)) >> 21; - s8 += carry7; - s7 -= int64_lshift21(carry7); - carry9 = (s9 + (1 << 20)) >> 21; - s10 += carry9; - s9 -= int64_lshift21(carry9); - carry11 = (s11 + (1 << 20)) >> 21; - s12 += carry11; - s11 -= int64_lshift21(carry11); - - s0 += s12 * 666643; - s1 += s12 * 470296; - s2 += s12 * 654183; - s3 -= s12 * 997805; - s4 += s12 * 136657; - s5 -= s12 * 683901; - s12 = 0; - - carry0 = s0 >> 21; - s1 += carry0; - s0 -= int64_lshift21(carry0); - carry1 = s1 >> 21; - s2 += carry1; - s1 -= int64_lshift21(carry1); - carry2 = s2 >> 21; - s3 += carry2; - s2 -= int64_lshift21(carry2); - carry3 = s3 >> 21; - s4 += 
carry3; - s3 -= int64_lshift21(carry3); - carry4 = s4 >> 21; - s5 += carry4; - s4 -= int64_lshift21(carry4); - carry5 = s5 >> 21; - s6 += carry5; - s5 -= int64_lshift21(carry5); - carry6 = s6 >> 21; - s7 += carry6; - s6 -= int64_lshift21(carry6); - carry7 = s7 >> 21; - s8 += carry7; - s7 -= int64_lshift21(carry7); - carry8 = s8 >> 21; - s9 += carry8; - s8 -= int64_lshift21(carry8); - carry9 = s9 >> 21; - s10 += carry9; - s9 -= int64_lshift21(carry9); - carry10 = s10 >> 21; - s11 += carry10; - s10 -= int64_lshift21(carry10); - carry11 = s11 >> 21; - s12 += carry11; - s11 -= int64_lshift21(carry11); - - s0 += s12 * 666643; - s1 += s12 * 470296; - s2 += s12 * 654183; - s3 -= s12 * 997805; - s4 += s12 * 136657; - s5 -= s12 * 683901; - s12 = 0; - - carry0 = s0 >> 21; - s1 += carry0; - s0 -= int64_lshift21(carry0); - carry1 = s1 >> 21; - s2 += carry1; - s1 -= int64_lshift21(carry1); - carry2 = s2 >> 21; - s3 += carry2; - s2 -= int64_lshift21(carry2); - carry3 = s3 >> 21; - s4 += carry3; - s3 -= int64_lshift21(carry3); - carry4 = s4 >> 21; - s5 += carry4; - s4 -= int64_lshift21(carry4); - carry5 = s5 >> 21; - s6 += carry5; - s5 -= int64_lshift21(carry5); - carry6 = s6 >> 21; - s7 += carry6; - s6 -= int64_lshift21(carry6); - carry7 = s7 >> 21; - s8 += carry7; - s7 -= int64_lshift21(carry7); - carry8 = s8 >> 21; - s9 += carry8; - s8 -= int64_lshift21(carry8); - carry9 = s9 >> 21; - s10 += carry9; - s9 -= int64_lshift21(carry9); - carry10 = s10 >> 21; - s11 += carry10; - s10 -= int64_lshift21(carry10); - - s[0] = s0 >> 0; - s[1] = s0 >> 8; - s[2] = (s0 >> 16) | (s1 << 5); - s[3] = s1 >> 3; - s[4] = s1 >> 11; - s[5] = (s1 >> 19) | (s2 << 2); - s[6] = s2 >> 6; - s[7] = (s2 >> 14) | (s3 << 7); - s[8] = s3 >> 1; - s[9] = s3 >> 9; - s[10] = (s3 >> 17) | (s4 << 4); - s[11] = s4 >> 4; - s[12] = s4 >> 12; - s[13] = (s4 >> 20) | (s5 << 1); - s[14] = s5 >> 7; - s[15] = (s5 >> 15) | (s6 << 6); - s[16] = s6 >> 2; - s[17] = s6 >> 10; - s[18] = (s6 >> 18) | (s7 << 3); - s[19] = s7 >> 5; - 
s[20] = s7 >> 13; - s[21] = s8 >> 0; - s[22] = s8 >> 8; - s[23] = (s8 >> 16) | (s9 << 5); - s[24] = s9 >> 3; - s[25] = s9 >> 11; - s[26] = (s9 >> 19) | (s10 << 2); - s[27] = s10 >> 6; - s[28] = (s10 >> 14) | (s11 << 7); - s[29] = s11 >> 1; - s[30] = s11 >> 9; - s[31] = s11 >> 17; -} - -// Input: -// a[0]+256*a[1]+...+256^31*a[31] = a -// b[0]+256*b[1]+...+256^31*b[31] = b -// c[0]+256*c[1]+...+256^31*c[31] = c -// -// Output: -// s[0]+256*s[1]+...+256^31*s[31] = (ab+c) mod l -// where l = 2^252 + 27742317777372353535851937790883648493. -static void sc_muladd(uint8_t *s, const uint8_t *a, const uint8_t *b, - const uint8_t *c) { - int64_t a0 = 2097151 & load_3(a); - int64_t a1 = 2097151 & (load_4(a + 2) >> 5); - int64_t a2 = 2097151 & (load_3(a + 5) >> 2); - int64_t a3 = 2097151 & (load_4(a + 7) >> 7); - int64_t a4 = 2097151 & (load_4(a + 10) >> 4); - int64_t a5 = 2097151 & (load_3(a + 13) >> 1); - int64_t a6 = 2097151 & (load_4(a + 15) >> 6); - int64_t a7 = 2097151 & (load_3(a + 18) >> 3); - int64_t a8 = 2097151 & load_3(a + 21); - int64_t a9 = 2097151 & (load_4(a + 23) >> 5); - int64_t a10 = 2097151 & (load_3(a + 26) >> 2); - int64_t a11 = (load_4(a + 28) >> 7); - int64_t b0 = 2097151 & load_3(b); - int64_t b1 = 2097151 & (load_4(b + 2) >> 5); - int64_t b2 = 2097151 & (load_3(b + 5) >> 2); - int64_t b3 = 2097151 & (load_4(b + 7) >> 7); - int64_t b4 = 2097151 & (load_4(b + 10) >> 4); - int64_t b5 = 2097151 & (load_3(b + 13) >> 1); - int64_t b6 = 2097151 & (load_4(b + 15) >> 6); - int64_t b7 = 2097151 & (load_3(b + 18) >> 3); - int64_t b8 = 2097151 & load_3(b + 21); - int64_t b9 = 2097151 & (load_4(b + 23) >> 5); - int64_t b10 = 2097151 & (load_3(b + 26) >> 2); - int64_t b11 = (load_4(b + 28) >> 7); - int64_t c0 = 2097151 & load_3(c); - int64_t c1 = 2097151 & (load_4(c + 2) >> 5); - int64_t c2 = 2097151 & (load_3(c + 5) >> 2); - int64_t c3 = 2097151 & (load_4(c + 7) >> 7); - int64_t c4 = 2097151 & (load_4(c + 10) >> 4); - int64_t c5 = 2097151 & (load_3(c + 13) >> 1); 
- int64_t c6 = 2097151 & (load_4(c + 15) >> 6); - int64_t c7 = 2097151 & (load_3(c + 18) >> 3); - int64_t c8 = 2097151 & load_3(c + 21); - int64_t c9 = 2097151 & (load_4(c + 23) >> 5); - int64_t c10 = 2097151 & (load_3(c + 26) >> 2); - int64_t c11 = (load_4(c + 28) >> 7); - int64_t s0; - int64_t s1; - int64_t s2; - int64_t s3; - int64_t s4; - int64_t s5; - int64_t s6; - int64_t s7; - int64_t s8; - int64_t s9; - int64_t s10; - int64_t s11; - int64_t s12; - int64_t s13; - int64_t s14; - int64_t s15; - int64_t s16; - int64_t s17; - int64_t s18; - int64_t s19; - int64_t s20; - int64_t s21; - int64_t s22; - int64_t s23; - int64_t carry0; - int64_t carry1; - int64_t carry2; - int64_t carry3; - int64_t carry4; - int64_t carry5; - int64_t carry6; - int64_t carry7; - int64_t carry8; - int64_t carry9; - int64_t carry10; - int64_t carry11; - int64_t carry12; - int64_t carry13; - int64_t carry14; - int64_t carry15; - int64_t carry16; - int64_t carry17; - int64_t carry18; - int64_t carry19; - int64_t carry20; - int64_t carry21; - int64_t carry22; - - s0 = c0 + a0 * b0; - s1 = c1 + a0 * b1 + a1 * b0; - s2 = c2 + a0 * b2 + a1 * b1 + a2 * b0; - s3 = c3 + a0 * b3 + a1 * b2 + a2 * b1 + a3 * b0; - s4 = c4 + a0 * b4 + a1 * b3 + a2 * b2 + a3 * b1 + a4 * b0; - s5 = c5 + a0 * b5 + a1 * b4 + a2 * b3 + a3 * b2 + a4 * b1 + a5 * b0; - s6 = c6 + a0 * b6 + a1 * b5 + a2 * b4 + a3 * b3 + a4 * b2 + a5 * b1 + a6 * b0; - s7 = c7 + a0 * b7 + a1 * b6 + a2 * b5 + a3 * b4 + a4 * b3 + a5 * b2 + - a6 * b1 + a7 * b0; - s8 = c8 + a0 * b8 + a1 * b7 + a2 * b6 + a3 * b5 + a4 * b4 + a5 * b3 + - a6 * b2 + a7 * b1 + a8 * b0; - s9 = c9 + a0 * b9 + a1 * b8 + a2 * b7 + a3 * b6 + a4 * b5 + a5 * b4 + - a6 * b3 + a7 * b2 + a8 * b1 + a9 * b0; - s10 = c10 + a0 * b10 + a1 * b9 + a2 * b8 + a3 * b7 + a4 * b6 + a5 * b5 + - a6 * b4 + a7 * b3 + a8 * b2 + a9 * b1 + a10 * b0; - s11 = c11 + a0 * b11 + a1 * b10 + a2 * b9 + a3 * b8 + a4 * b7 + a5 * b6 + - a6 * b5 + a7 * b4 + a8 * b3 + a9 * b2 + a10 * b1 + a11 * b0; - s12 = a1 * 
b11 + a2 * b10 + a3 * b9 + a4 * b8 + a5 * b7 + a6 * b6 + a7 * b5 + - a8 * b4 + a9 * b3 + a10 * b2 + a11 * b1; - s13 = a2 * b11 + a3 * b10 + a4 * b9 + a5 * b8 + a6 * b7 + a7 * b6 + a8 * b5 + - a9 * b4 + a10 * b3 + a11 * b2; - s14 = a3 * b11 + a4 * b10 + a5 * b9 + a6 * b8 + a7 * b7 + a8 * b6 + a9 * b5 + - a10 * b4 + a11 * b3; - s15 = a4 * b11 + a5 * b10 + a6 * b9 + a7 * b8 + a8 * b7 + a9 * b6 + a10 * b5 + - a11 * b4; - s16 = a5 * b11 + a6 * b10 + a7 * b9 + a8 * b8 + a9 * b7 + a10 * b6 + a11 * b5; - s17 = a6 * b11 + a7 * b10 + a8 * b9 + a9 * b8 + a10 * b7 + a11 * b6; - s18 = a7 * b11 + a8 * b10 + a9 * b9 + a10 * b8 + a11 * b7; - s19 = a8 * b11 + a9 * b10 + a10 * b9 + a11 * b8; - s20 = a9 * b11 + a10 * b10 + a11 * b9; - s21 = a10 * b11 + a11 * b10; - s22 = a11 * b11; - s23 = 0; - - carry0 = (s0 + (1 << 20)) >> 21; - s1 += carry0; - s0 -= int64_lshift21(carry0); - carry2 = (s2 + (1 << 20)) >> 21; - s3 += carry2; - s2 -= int64_lshift21(carry2); - carry4 = (s4 + (1 << 20)) >> 21; - s5 += carry4; - s4 -= int64_lshift21(carry4); - carry6 = (s6 + (1 << 20)) >> 21; - s7 += carry6; - s6 -= int64_lshift21(carry6); - carry8 = (s8 + (1 << 20)) >> 21; - s9 += carry8; - s8 -= int64_lshift21(carry8); - carry10 = (s10 + (1 << 20)) >> 21; - s11 += carry10; - s10 -= int64_lshift21(carry10); - carry12 = (s12 + (1 << 20)) >> 21; - s13 += carry12; - s12 -= int64_lshift21(carry12); - carry14 = (s14 + (1 << 20)) >> 21; - s15 += carry14; - s14 -= int64_lshift21(carry14); - carry16 = (s16 + (1 << 20)) >> 21; - s17 += carry16; - s16 -= int64_lshift21(carry16); - carry18 = (s18 + (1 << 20)) >> 21; - s19 += carry18; - s18 -= int64_lshift21(carry18); - carry20 = (s20 + (1 << 20)) >> 21; - s21 += carry20; - s20 -= int64_lshift21(carry20); - carry22 = (s22 + (1 << 20)) >> 21; - s23 += carry22; - s22 -= int64_lshift21(carry22); - - carry1 = (s1 + (1 << 20)) >> 21; - s2 += carry1; - s1 -= int64_lshift21(carry1); - carry3 = (s3 + (1 << 20)) >> 21; - s4 += carry3; - s3 -= int64_lshift21(carry3); - 
carry5 = (s5 + (1 << 20)) >> 21; - s6 += carry5; - s5 -= int64_lshift21(carry5); - carry7 = (s7 + (1 << 20)) >> 21; - s8 += carry7; - s7 -= int64_lshift21(carry7); - carry9 = (s9 + (1 << 20)) >> 21; - s10 += carry9; - s9 -= int64_lshift21(carry9); - carry11 = (s11 + (1 << 20)) >> 21; - s12 += carry11; - s11 -= int64_lshift21(carry11); - carry13 = (s13 + (1 << 20)) >> 21; - s14 += carry13; - s13 -= int64_lshift21(carry13); - carry15 = (s15 + (1 << 20)) >> 21; - s16 += carry15; - s15 -= int64_lshift21(carry15); - carry17 = (s17 + (1 << 20)) >> 21; - s18 += carry17; - s17 -= int64_lshift21(carry17); - carry19 = (s19 + (1 << 20)) >> 21; - s20 += carry19; - s19 -= int64_lshift21(carry19); - carry21 = (s21 + (1 << 20)) >> 21; - s22 += carry21; - s21 -= int64_lshift21(carry21); - - s11 += s23 * 666643; - s12 += s23 * 470296; - s13 += s23 * 654183; - s14 -= s23 * 997805; - s15 += s23 * 136657; - s16 -= s23 * 683901; - s23 = 0; - - s10 += s22 * 666643; - s11 += s22 * 470296; - s12 += s22 * 654183; - s13 -= s22 * 997805; - s14 += s22 * 136657; - s15 -= s22 * 683901; - s22 = 0; - - s9 += s21 * 666643; - s10 += s21 * 470296; - s11 += s21 * 654183; - s12 -= s21 * 997805; - s13 += s21 * 136657; - s14 -= s21 * 683901; - s21 = 0; - - s8 += s20 * 666643; - s9 += s20 * 470296; - s10 += s20 * 654183; - s11 -= s20 * 997805; - s12 += s20 * 136657; - s13 -= s20 * 683901; - s20 = 0; - - s7 += s19 * 666643; - s8 += s19 * 470296; - s9 += s19 * 654183; - s10 -= s19 * 997805; - s11 += s19 * 136657; - s12 -= s19 * 683901; - s19 = 0; - - s6 += s18 * 666643; - s7 += s18 * 470296; - s8 += s18 * 654183; - s9 -= s18 * 997805; - s10 += s18 * 136657; - s11 -= s18 * 683901; - s18 = 0; - - carry6 = (s6 + (1 << 20)) >> 21; - s7 += carry6; - s6 -= int64_lshift21(carry6); - carry8 = (s8 + (1 << 20)) >> 21; - s9 += carry8; - s8 -= int64_lshift21(carry8); - carry10 = (s10 + (1 << 20)) >> 21; - s11 += carry10; - s10 -= int64_lshift21(carry10); - carry12 = (s12 + (1 << 20)) >> 21; - s13 += carry12; - s12 -= 
int64_lshift21(carry12); - carry14 = (s14 + (1 << 20)) >> 21; - s15 += carry14; - s14 -= int64_lshift21(carry14); - carry16 = (s16 + (1 << 20)) >> 21; - s17 += carry16; - s16 -= int64_lshift21(carry16); - - carry7 = (s7 + (1 << 20)) >> 21; - s8 += carry7; - s7 -= int64_lshift21(carry7); - carry9 = (s9 + (1 << 20)) >> 21; - s10 += carry9; - s9 -= int64_lshift21(carry9); - carry11 = (s11 + (1 << 20)) >> 21; - s12 += carry11; - s11 -= int64_lshift21(carry11); - carry13 = (s13 + (1 << 20)) >> 21; - s14 += carry13; - s13 -= int64_lshift21(carry13); - carry15 = (s15 + (1 << 20)) >> 21; - s16 += carry15; - s15 -= int64_lshift21(carry15); - - s5 += s17 * 666643; - s6 += s17 * 470296; - s7 += s17 * 654183; - s8 -= s17 * 997805; - s9 += s17 * 136657; - s10 -= s17 * 683901; - s17 = 0; - - s4 += s16 * 666643; - s5 += s16 * 470296; - s6 += s16 * 654183; - s7 -= s16 * 997805; - s8 += s16 * 136657; - s9 -= s16 * 683901; - s16 = 0; - - s3 += s15 * 666643; - s4 += s15 * 470296; - s5 += s15 * 654183; - s6 -= s15 * 997805; - s7 += s15 * 136657; - s8 -= s15 * 683901; - s15 = 0; - - s2 += s14 * 666643; - s3 += s14 * 470296; - s4 += s14 * 654183; - s5 -= s14 * 997805; - s6 += s14 * 136657; - s7 -= s14 * 683901; - s14 = 0; - - s1 += s13 * 666643; - s2 += s13 * 470296; - s3 += s13 * 654183; - s4 -= s13 * 997805; - s5 += s13 * 136657; - s6 -= s13 * 683901; - s13 = 0; - - s0 += s12 * 666643; - s1 += s12 * 470296; - s2 += s12 * 654183; - s3 -= s12 * 997805; - s4 += s12 * 136657; - s5 -= s12 * 683901; - s12 = 0; - - carry0 = (s0 + (1 << 20)) >> 21; - s1 += carry0; - s0 -= int64_lshift21(carry0); - carry2 = (s2 + (1 << 20)) >> 21; - s3 += carry2; - s2 -= int64_lshift21(carry2); - carry4 = (s4 + (1 << 20)) >> 21; - s5 += carry4; - s4 -= int64_lshift21(carry4); - carry6 = (s6 + (1 << 20)) >> 21; - s7 += carry6; - s6 -= int64_lshift21(carry6); - carry8 = (s8 + (1 << 20)) >> 21; - s9 += carry8; - s8 -= int64_lshift21(carry8); - carry10 = (s10 + (1 << 20)) >> 21; - s11 += carry10; - s10 -= 
int64_lshift21(carry10); - - carry1 = (s1 + (1 << 20)) >> 21; - s2 += carry1; - s1 -= int64_lshift21(carry1); - carry3 = (s3 + (1 << 20)) >> 21; - s4 += carry3; - s3 -= int64_lshift21(carry3); - carry5 = (s5 + (1 << 20)) >> 21; - s6 += carry5; - s5 -= int64_lshift21(carry5); - carry7 = (s7 + (1 << 20)) >> 21; - s8 += carry7; - s7 -= int64_lshift21(carry7); - carry9 = (s9 + (1 << 20)) >> 21; - s10 += carry9; - s9 -= int64_lshift21(carry9); - carry11 = (s11 + (1 << 20)) >> 21; - s12 += carry11; - s11 -= int64_lshift21(carry11); - - s0 += s12 * 666643; - s1 += s12 * 470296; - s2 += s12 * 654183; - s3 -= s12 * 997805; - s4 += s12 * 136657; - s5 -= s12 * 683901; - s12 = 0; - - carry0 = s0 >> 21; - s1 += carry0; - s0 -= int64_lshift21(carry0); - carry1 = s1 >> 21; - s2 += carry1; - s1 -= int64_lshift21(carry1); - carry2 = s2 >> 21; - s3 += carry2; - s2 -= int64_lshift21(carry2); - carry3 = s3 >> 21; - s4 += carry3; - s3 -= int64_lshift21(carry3); - carry4 = s4 >> 21; - s5 += carry4; - s4 -= int64_lshift21(carry4); - carry5 = s5 >> 21; - s6 += carry5; - s5 -= int64_lshift21(carry5); - carry6 = s6 >> 21; - s7 += carry6; - s6 -= int64_lshift21(carry6); - carry7 = s7 >> 21; - s8 += carry7; - s7 -= int64_lshift21(carry7); - carry8 = s8 >> 21; - s9 += carry8; - s8 -= int64_lshift21(carry8); - carry9 = s9 >> 21; - s10 += carry9; - s9 -= int64_lshift21(carry9); - carry10 = s10 >> 21; - s11 += carry10; - s10 -= int64_lshift21(carry10); - carry11 = s11 >> 21; - s12 += carry11; - s11 -= int64_lshift21(carry11); - - s0 += s12 * 666643; - s1 += s12 * 470296; - s2 += s12 * 654183; - s3 -= s12 * 997805; - s4 += s12 * 136657; - s5 -= s12 * 683901; - s12 = 0; - - carry0 = s0 >> 21; - s1 += carry0; - s0 -= int64_lshift21(carry0); - carry1 = s1 >> 21; - s2 += carry1; - s1 -= int64_lshift21(carry1); - carry2 = s2 >> 21; - s3 += carry2; - s2 -= int64_lshift21(carry2); - carry3 = s3 >> 21; - s4 += carry3; - s3 -= int64_lshift21(carry3); - carry4 = s4 >> 21; - s5 += carry4; - s4 -= 
int64_lshift21(carry4); - carry5 = s5 >> 21; - s6 += carry5; - s5 -= int64_lshift21(carry5); - carry6 = s6 >> 21; - s7 += carry6; - s6 -= int64_lshift21(carry6); - carry7 = s7 >> 21; - s8 += carry7; - s7 -= int64_lshift21(carry7); - carry8 = s8 >> 21; - s9 += carry8; - s8 -= int64_lshift21(carry8); - carry9 = s9 >> 21; - s10 += carry9; - s9 -= int64_lshift21(carry9); - carry10 = s10 >> 21; - s11 += carry10; - s10 -= int64_lshift21(carry10); - - s[0] = s0 >> 0; - s[1] = s0 >> 8; - s[2] = (s0 >> 16) | (s1 << 5); - s[3] = s1 >> 3; - s[4] = s1 >> 11; - s[5] = (s1 >> 19) | (s2 << 2); - s[6] = s2 >> 6; - s[7] = (s2 >> 14) | (s3 << 7); - s[8] = s3 >> 1; - s[9] = s3 >> 9; - s[10] = (s3 >> 17) | (s4 << 4); - s[11] = s4 >> 4; - s[12] = s4 >> 12; - s[13] = (s4 >> 20) | (s5 << 1); - s[14] = s5 >> 7; - s[15] = (s5 >> 15) | (s6 << 6); - s[16] = s6 >> 2; - s[17] = s6 >> 10; - s[18] = (s6 >> 18) | (s7 << 3); - s[19] = s7 >> 5; - s[20] = s7 >> 13; - s[21] = s8 >> 0; - s[22] = s8 >> 8; - s[23] = (s8 >> 16) | (s9 << 5); - s[24] = s9 >> 3; - s[25] = s9 >> 11; - s[26] = (s9 >> 19) | (s10 << 2); - s[27] = s10 >> 6; - s[28] = (s10 >> 14) | (s11 << 7); - s[29] = s11 >> 1; - s[30] = s11 >> 9; - s[31] = s11 >> 17; -} - -void ED25519_keypair(uint8_t out_public_key[32], uint8_t out_private_key[64]) { - uint8_t seed[32]; - RAND_bytes(seed, 32); - ED25519_keypair_from_seed(out_public_key, out_private_key, seed); -} - -int ED25519_sign(uint8_t out_sig[64], const uint8_t *message, - size_t message_len, const uint8_t private_key[64]) { - // NOTE: The documentation on this function says that it returns zero on - // allocation failure. While that can't happen with the current - // implementation, we want to reserve the ability to allocate in this - // implementation in the future. 
- - uint8_t az[SHA512_DIGEST_LENGTH]; - SHA512(private_key, 32, az); - - az[0] &= 248; - az[31] &= 63; - az[31] |= 64; - - SHA512_CTX hash_ctx; - SHA512_Init(&hash_ctx); - SHA512_Update(&hash_ctx, az + 32, 32); - SHA512_Update(&hash_ctx, message, message_len); - uint8_t nonce[SHA512_DIGEST_LENGTH]; - SHA512_Final(nonce, &hash_ctx); - - x25519_sc_reduce(nonce); - ge_p3 R; - x25519_ge_scalarmult_base(&R, nonce); - ge_p3_tobytes(out_sig, &R); - - SHA512_Init(&hash_ctx); - SHA512_Update(&hash_ctx, out_sig, 32); - SHA512_Update(&hash_ctx, private_key + 32, 32); - SHA512_Update(&hash_ctx, message, message_len); - uint8_t hram[SHA512_DIGEST_LENGTH]; - SHA512_Final(hram, &hash_ctx); - - x25519_sc_reduce(hram); - sc_muladd(out_sig + 32, hram, az, nonce); - - return 1; -} - -int ED25519_verify(const uint8_t *message, size_t message_len, - const uint8_t signature[64], const uint8_t public_key[32]) { - ge_p3 A; - if ((signature[63] & 224) != 0 || - !x25519_ge_frombytes_vartime(&A, public_key)) { - return 0; - } - - fe_loose t; - fe_neg(&t, &A.X); - fe_carry(&A.X, &t); - fe_neg(&t, &A.T); - fe_carry(&A.T, &t); - - uint8_t pkcopy[32]; - OPENSSL_memcpy(pkcopy, public_key, 32); - uint8_t rcopy[32]; - OPENSSL_memcpy(rcopy, signature, 32); - uint8_t scopy[32]; - OPENSSL_memcpy(scopy, signature + 32, 32); - - // https://tools.ietf.org/html/rfc8032#section-5.1.7 requires that s be in - // the range [0, order) in order to prevent signature malleability. - - // kOrder is the order of Curve25519 in little-endian form. 
- static const uint64_t kOrder[4] = { - UINT64_C(0x5812631a5cf5d3ed), - UINT64_C(0x14def9dea2f79cd6), - 0, - UINT64_C(0x1000000000000000), - }; - for (size_t i = 3;; i--) { - uint64_t word = CRYPTO_load_u64_le(scopy + i * 8); - if (word > kOrder[i]) { - return 0; - } else if (word < kOrder[i]) { - break; - } else if (i == 0) { - return 0; - } - } - - SHA512_CTX hash_ctx; - SHA512_Init(&hash_ctx); - SHA512_Update(&hash_ctx, signature, 32); - SHA512_Update(&hash_ctx, public_key, 32); - SHA512_Update(&hash_ctx, message, message_len); - uint8_t h[SHA512_DIGEST_LENGTH]; - SHA512_Final(h, &hash_ctx); - - x25519_sc_reduce(h); - - ge_p2 R; - ge_double_scalarmult_vartime(&R, h, &A, scopy); - - uint8_t rcheck[32]; - x25519_ge_tobytes(rcheck, &R); - - return CRYPTO_memcmp(rcheck, rcopy, sizeof(rcheck)) == 0; -} - -void ED25519_keypair_from_seed(uint8_t out_public_key[32], - uint8_t out_private_key[64], - const uint8_t seed[32]) { - uint8_t az[SHA512_DIGEST_LENGTH]; - SHA512(seed, 32, az); - - az[0] &= 248; - az[31] &= 127; - az[31] |= 64; - - ge_p3 A; - x25519_ge_scalarmult_base(&A, az); - ge_p3_tobytes(out_public_key, &A); - - OPENSSL_memcpy(out_private_key, seed, 32); - OPENSSL_memcpy(out_private_key + 32, out_public_key, 32); -} - - -static void x25519_scalar_mult_generic(uint8_t out[32], - const uint8_t scalar[32], - const uint8_t point[32]) { - fe x1, x2, z2, x3, z3, tmp0, tmp1; - fe_loose x2l, z2l, x3l, tmp0l, tmp1l; - - uint8_t e[32]; - OPENSSL_memcpy(e, scalar, 32); - e[0] &= 248; - e[31] &= 127; - e[31] |= 64; - - // The following implementation was transcribed to Coq and proven to - // correspond to unary scalar multiplication in affine coordinates given that - // x1 != 0 is the x coordinate of some point on the curve. It was also checked - // in Coq that doing a ladderstep with x1 = x3 = 0 gives z2' = z3' = 0, and z2 - // = z3 = 0 gives z2' = z3' = 0. 
The statement was quantified over the - // underlying field, so it applies to Curve25519 itself and the quadratic - // twist of Curve25519. It was not proven in Coq that prime-field arithmetic - // correctly simulates extension-field arithmetic on prime-field values. - // The decoding of the byte array representation of e was not considered. - // Specification of Montgomery curves in affine coordinates: - // - // Proof that these form a group that is isomorphic to a Weierstrass curve: - // - // Coq transcription and correctness proof of the loop (where scalarbits=255): - // - // - // preconditions: 0 <= e < 2^255 (not necessarily e < order), fe_invert(0) = 0 - fe_frombytes(&x1, point); - fe_1(&x2); - fe_0(&z2); - fe_copy(&x3, &x1); - fe_1(&z3); - - unsigned swap = 0; - int pos; - for (pos = 254; pos >= 0; --pos) { - // loop invariant as of right before the test, for the case where x1 != 0: - // pos >= -1; if z2 = 0 then x2 is nonzero; if z3 = 0 then x3 is nonzero - // let r := e >> (pos+1) in the following equalities of projective points: - // to_xz (r*P) === if swap then (x3, z3) else (x2, z2) - // to_xz ((r+1)*P) === if swap then (x2, z2) else (x3, z3) - // x1 is the nonzero x coordinate of the nonzero point (r*P-(r+1)*P) - unsigned b = 1 & (e[pos / 8] >> (pos & 7)); - swap ^= b; - fe_cswap(&x2, &x3, swap); - fe_cswap(&z2, &z3, swap); - swap = b; - // Coq transcription of ladderstep formula (called from transcribed loop): - // - // - // x1 != 0 - // x1 = 0 - fe_sub(&tmp0l, &x3, &z3); - fe_sub(&tmp1l, &x2, &z2); - fe_add(&x2l, &x2, &z2); - fe_add(&z2l, &x3, &z3); - fe_mul_tll(&z3, &tmp0l, &x2l); - fe_mul_tll(&z2, &z2l, &tmp1l); - fe_sq_tl(&tmp0, &tmp1l); - fe_sq_tl(&tmp1, &x2l); - fe_add(&x3l, &z3, &z2); - fe_sub(&z2l, &z3, &z2); - fe_mul_ttt(&x2, &tmp1, &tmp0); - fe_sub(&tmp1l, &tmp1, &tmp0); - fe_sq_tl(&z2, &z2l); - fe_mul121666(&z3, &tmp1l); - fe_sq_tl(&x3, &x3l); - fe_add(&tmp0l, &tmp0, &z3); - fe_mul_ttt(&z3, &x1, &z2); - fe_mul_tll(&z2, &tmp1l, &tmp0l); - } 
- // here pos=-1, so r=e, so to_xz (e*P) === if swap then (x3, z3) else (x2, z2) - fe_cswap(&x2, &x3, swap); - fe_cswap(&z2, &z3, swap); - - fe_invert(&z2, &z2); - fe_mul_ttt(&x2, &x2, &z2); - fe_tobytes(out, &x2); -} - -static void x25519_scalar_mult(uint8_t out[32], const uint8_t scalar[32], - const uint8_t point[32]) { -#if defined(BORINGSSL_X25519_NEON) - if (CRYPTO_is_NEON_capable()) { - x25519_NEON(out, scalar, point); - return; - } -#endif - - x25519_scalar_mult_generic(out, scalar, point); -} - -void X25519_keypair(uint8_t out_public_value[32], uint8_t out_private_key[32]) { - RAND_bytes(out_private_key, 32); - - // All X25519 implementations should decode scalars correctly (see - // https://tools.ietf.org/html/rfc7748#section-5). However, if an - // implementation doesn't then it might interoperate with random keys a - // fraction of the time because they'll, randomly, happen to be correctly - // formed. - // - // Thus we do the opposite of the masking here to make sure that our private - // keys are never correctly masked and so, hopefully, any incorrect - // implementations are deterministically broken. - // - // This does not affect security because, although we're throwing away - // entropy, a valid implementation of scalarmult should throw away the exact - // same bits anyway. - out_private_key[0] |= ~248; - out_private_key[31] &= ~64; - out_private_key[31] |= ~127; - - X25519_public_from_private(out_public_value, out_private_key); -} - -int X25519(uint8_t out_shared_key[32], const uint8_t private_key[32], - const uint8_t peer_public_value[32]) { - static const uint8_t kZeros[32] = {0}; - x25519_scalar_mult(out_shared_key, private_key, peer_public_value); - // The all-zero output results when the input is a point of small order. 
- return CRYPTO_memcmp(kZeros, out_shared_key, 32) != 0; -} - -void X25519_public_from_private(uint8_t out_public_value[32], - const uint8_t private_key[32]) { -#if defined(BORINGSSL_X25519_NEON) - if (CRYPTO_is_NEON_capable()) { - static const uint8_t kMongomeryBasePoint[32] = {9}; - x25519_NEON(out_public_value, private_key, kMongomeryBasePoint); - return; - } -#endif - - uint8_t e[32]; - OPENSSL_memcpy(e, private_key, 32); - e[0] &= 248; - e[31] &= 127; - e[31] |= 64; - - ge_p3 A; - x25519_ge_scalarmult_base(&A, e); - - // We only need the u-coordinate of the curve25519 point. The map is - // u=(y+1)/(1-y). Since y=Y/Z, this gives u=(Z+Y)/(Z-Y). - fe_loose zplusy, zminusy; - fe zminusy_inv; - fe_add(&zplusy, &A.Z, &A.Y); - fe_sub(&zminusy, &A.Z, &A.Y); - fe_loose_invert(&zminusy_inv, &zminusy); - fe_mul_tlt(&zminusy_inv, &zplusy, &zminusy_inv); - fe_tobytes(out_public_value, &zminusy_inv); -} diff --git a/third_party/boringssl/src/crypto/curve25519/curve25519.cc b/third_party/boringssl/src/crypto/curve25519/curve25519.cc new file mode 100644 index 00000000..66d5d84b --- /dev/null +++ b/third_party/boringssl/src/crypto/curve25519/curve25519.cc @@ -0,0 +1,2157 @@ +// Copyright 2020 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Some of this code is taken from the ref10 version of Ed25519 in SUPERCOP +// 20141124 (http://bench.cr.yp.to/supercop.html). That code is released as +// public domain. 
Other parts have been replaced to call into code generated by +// Fiat (https://github.com/mit-plv/fiat-crypto) in //third_party/fiat. +// +// The field functions are shared by Ed25519 and X25519 where possible. + +#include +#include + +#include +#include +#include + +#include "../internal.h" +#include "internal.h" + +// Various pre-computed constants. +#include "./curve25519_tables.h" + +#if defined(BORINGSSL_HAS_UINT128) +#include "../../third_party/fiat/curve25519_64.h" +#elif defined(OPENSSL_64_BIT) +#include "../../third_party/fiat/curve25519_64_msvc.h" +#else +#include "../../third_party/fiat/curve25519_32.h" +#endif + + +using namespace bssl; + +// Low-level intrinsic operations + +static uint64_t load_3(const uint8_t *in) { + uint64_t result; + result = (uint64_t)in[0]; + result |= ((uint64_t)in[1]) << 8; + result |= ((uint64_t)in[2]) << 16; + return result; +} + +static uint64_t load_4(const uint8_t *in) { + uint64_t result; + result = (uint64_t)in[0]; + result |= ((uint64_t)in[1]) << 8; + result |= ((uint64_t)in[2]) << 16; + result |= ((uint64_t)in[3]) << 24; + return result; +} + + +// Field operations. + +#if defined(OPENSSL_64_BIT) + +typedef uint64_t fe_limb_t; +#define FE_NUM_LIMBS 5 + +// assert_fe asserts that |f| satisfies bounds: +// +// [[0x0 ~> 0x8cccccccccccc], +// [0x0 ~> 0x8cccccccccccc], +// [0x0 ~> 0x8cccccccccccc], +// [0x0 ~> 0x8cccccccccccc], +// [0x0 ~> 0x8cccccccccccc]] +// +// See comments in curve25519_64.h for which functions use these bounds for +// inputs or outputs. 
+#define assert_fe(f) \ + do { \ + for (unsigned _assert_fe_i = 0; _assert_fe_i < 5; _assert_fe_i++) { \ + declassify_assert(f[_assert_fe_i] <= UINT64_C(0x8cccccccccccc)); \ + } \ + } while (0) + +// assert_fe_loose asserts that |f| satisfies bounds: +// +// [[0x0 ~> 0x1a666666666664], +// [0x0 ~> 0x1a666666666664], +// [0x0 ~> 0x1a666666666664], +// [0x0 ~> 0x1a666666666664], +// [0x0 ~> 0x1a666666666664]] +// +// See comments in curve25519_64.h for which functions use these bounds for +// inputs or outputs. +#define assert_fe_loose(f) \ + do { \ + for (unsigned _assert_fe_i = 0; _assert_fe_i < 5; _assert_fe_i++) { \ + declassify_assert(f[_assert_fe_i] <= UINT64_C(0x1a666666666664)); \ + } \ + } while (0) + +#else + +typedef uint32_t fe_limb_t; +#define FE_NUM_LIMBS 10 + +// assert_fe asserts that |f| satisfies bounds: +// +// [[0x0 ~> 0x4666666], [0x0 ~> 0x2333333], +// [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], +// [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], +// [0x0 ~> 0x4666666], [0x0 ~> 0x2333333], +// [0x0 ~> 0x4666666], [0x0 ~> 0x2333333]] +// +// See comments in curve25519_32.h for which functions use these bounds for +// inputs or outputs. +#define assert_fe(f) \ + do { \ + for (unsigned _assert_fe_i = 0; _assert_fe_i < 10; _assert_fe_i++) { \ + declassify_assert(f[_assert_fe_i] <= \ + ((_assert_fe_i & 1) ? 0x2333333u : 0x4666666u)); \ + } \ + } while (0) + +// assert_fe_loose asserts that |f| satisfies bounds: +// +// [[0x0 ~> 0xd333332], [0x0 ~> 0x6999999], +// [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], +// [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], +// [0x0 ~> 0xd333332], [0x0 ~> 0x6999999], +// [0x0 ~> 0xd333332], [0x0 ~> 0x6999999]] +// +// See comments in curve25519_32.h for which functions use these bounds for +// inputs or outputs. +#define assert_fe_loose(f) \ + do { \ + for (unsigned _assert_fe_i = 0; _assert_fe_i < 10; _assert_fe_i++) { \ + declassify_assert(f[_assert_fe_i] <= \ + ((_assert_fe_i & 1) ? 
0x6999999u : 0xd333332u)); \ + } \ + } while (0) + +#endif // OPENSSL_64_BIT + +static_assert(sizeof(fe) == sizeof(fe_limb_t) * FE_NUM_LIMBS, + "fe_limb_t[FE_NUM_LIMBS] is inconsistent with fe"); + +static void fe_frombytes_strict(fe *h, const uint8_t s[32]) { + // |fiat_25519_from_bytes| requires the top-most bit be clear. + declassify_assert((s[31] & 0x80) == 0); + fiat_25519_from_bytes(h->v, s); + assert_fe(h->v); +} + +static void fe_frombytes(fe *h, const uint8_t s[32]) { + uint8_t s_copy[32]; + OPENSSL_memcpy(s_copy, s, 32); + s_copy[31] &= 0x7f; + fe_frombytes_strict(h, s_copy); +} + +static void fe_tobytes(uint8_t s[32], const fe *f) { + assert_fe(f->v); + fiat_25519_to_bytes(s, f->v); +} + +// h = 0 +static void fe_0(fe *h) { OPENSSL_memset(h, 0, sizeof(fe)); } + +static void fe_loose_0(fe_loose *h) { OPENSSL_memset(h, 0, sizeof(fe_loose)); } + +// h = 1 +static void fe_1(fe *h) { + OPENSSL_memset(h, 0, sizeof(fe)); + h->v[0] = 1; +} + +static void fe_loose_1(fe_loose *h) { + OPENSSL_memset(h, 0, sizeof(fe_loose)); + h->v[0] = 1; +} + +// h = f + g +// Can overlap h with f or g. +static void fe_add(fe_loose *h, const fe *f, const fe *g) { + assert_fe(f->v); + assert_fe(g->v); + fiat_25519_add(h->v, f->v, g->v); + assert_fe_loose(h->v); +} + +// h = f - g +// Can overlap h with f or g. 
+static void fe_sub(fe_loose *h, const fe *f, const fe *g) { + assert_fe(f->v); + assert_fe(g->v); + fiat_25519_sub(h->v, f->v, g->v); + assert_fe_loose(h->v); +} + +static void fe_carry(fe *h, const fe_loose *f) { + assert_fe_loose(f->v); + fiat_25519_carry(h->v, f->v); + assert_fe(h->v); +} + +static void fe_mul_impl(fe_limb_t out[FE_NUM_LIMBS], + const fe_limb_t in1[FE_NUM_LIMBS], + const fe_limb_t in2[FE_NUM_LIMBS]) { + assert_fe_loose(in1); + assert_fe_loose(in2); + fiat_25519_carry_mul(out, in1, in2); + assert_fe(out); +} + +static void fe_mul_ltt(fe_loose *h, const fe *f, const fe *g) { + fe_mul_impl(h->v, f->v, g->v); +} + +static void fe_mul_llt(fe_loose *h, const fe_loose *f, const fe *g) { + fe_mul_impl(h->v, f->v, g->v); +} + +static void fe_mul_ttt(fe *h, const fe *f, const fe *g) { + fe_mul_impl(h->v, f->v, g->v); +} + +static void fe_mul_tlt(fe *h, const fe_loose *f, const fe *g) { + fe_mul_impl(h->v, f->v, g->v); +} + +static void fe_mul_ttl(fe *h, const fe *f, const fe_loose *g) { + fe_mul_impl(h->v, f->v, g->v); +} + +static void fe_mul_tll(fe *h, const fe_loose *f, const fe_loose *g) { + fe_mul_impl(h->v, f->v, g->v); +} + +static void fe_sq_tl(fe *h, const fe_loose *f) { + assert_fe_loose(f->v); + fiat_25519_carry_square(h->v, f->v); + assert_fe(h->v); +} + +static void fe_sq_tt(fe *h, const fe *f) { + assert_fe_loose(f->v); + fiat_25519_carry_square(h->v, f->v); + assert_fe(h->v); +} + +// Replace (f,g) with (g,f) if b == 1; +// replace (f,g) with (f,g) if b == 0. +// +// Preconditions: b in {0,1}. 
+static void fe_cswap(fe *f, fe *g, fe_limb_t b) { + b = 0 - b; + for (unsigned i = 0; i < FE_NUM_LIMBS; i++) { + fe_limb_t x = f->v[i] ^ g->v[i]; + x &= b; + f->v[i] ^= x; + g->v[i] ^= x; + } +} + +static void fe_mul121666(fe *h, const fe_loose *f) { + assert_fe_loose(f->v); + fiat_25519_carry_scmul_121666(h->v, f->v); + assert_fe(h->v); +} + +// h = -f +static void fe_neg(fe_loose *h, const fe *f) { + assert_fe(f->v); + fiat_25519_opp(h->v, f->v); + assert_fe_loose(h->v); +} + +// Replace (f,g) with (g,g) if b == 1; +// replace (f,g) with (f,g) if b == 0. +// +// Preconditions: b in {0,1}. +static void fe_cmov(fe_loose *f, const fe_loose *g, fe_limb_t b) { + // Silence an unused function warning. |fiat_25519_selectznz| isn't quite the + // calling convention the rest of this code wants, so implement it by hand. + // + // TODO(davidben): Switch to fiat's calling convention, or ask fiat to emit a + // different one. + + b = 0 - b; + for (unsigned i = 0; i < FE_NUM_LIMBS; i++) { + fe_limb_t x = f->v[i] ^ g->v[i]; + x &= b; + f->v[i] ^= x; + } +} + +// h = f +static void fe_copy(fe *h, const fe *f) { OPENSSL_memmove(h, f, sizeof(fe)); } + +static void fe_copy_lt(fe_loose *h, const fe *f) { + static_assert(sizeof(fe_loose) == sizeof(fe), "fe and fe_loose mismatch"); + OPENSSL_memmove(h, f, sizeof(fe)); +} + +static void fe_loose_invert(fe *out, const fe_loose *z) { + fe t0; + fe t1; + fe t2; + fe t3; + int i; + + fe_sq_tl(&t0, z); + fe_sq_tt(&t1, &t0); + for (i = 1; i < 2; ++i) { + fe_sq_tt(&t1, &t1); + } + fe_mul_tlt(&t1, z, &t1); + fe_mul_ttt(&t0, &t0, &t1); + fe_sq_tt(&t2, &t0); + fe_mul_ttt(&t1, &t1, &t2); + fe_sq_tt(&t2, &t1); + for (i = 1; i < 5; ++i) { + fe_sq_tt(&t2, &t2); + } + fe_mul_ttt(&t1, &t2, &t1); + fe_sq_tt(&t2, &t1); + for (i = 1; i < 10; ++i) { + fe_sq_tt(&t2, &t2); + } + fe_mul_ttt(&t2, &t2, &t1); + fe_sq_tt(&t3, &t2); + for (i = 1; i < 20; ++i) { + fe_sq_tt(&t3, &t3); + } + fe_mul_ttt(&t2, &t3, &t2); + fe_sq_tt(&t2, &t2); + for (i = 1; i < 10; 
++i) { + fe_sq_tt(&t2, &t2); + } + fe_mul_ttt(&t1, &t2, &t1); + fe_sq_tt(&t2, &t1); + for (i = 1; i < 50; ++i) { + fe_sq_tt(&t2, &t2); + } + fe_mul_ttt(&t2, &t2, &t1); + fe_sq_tt(&t3, &t2); + for (i = 1; i < 100; ++i) { + fe_sq_tt(&t3, &t3); + } + fe_mul_ttt(&t2, &t3, &t2); + fe_sq_tt(&t2, &t2); + for (i = 1; i < 50; ++i) { + fe_sq_tt(&t2, &t2); + } + fe_mul_ttt(&t1, &t2, &t1); + fe_sq_tt(&t1, &t1); + for (i = 1; i < 5; ++i) { + fe_sq_tt(&t1, &t1); + } + fe_mul_ttt(out, &t1, &t0); +} + +static void fe_invert(fe *out, const fe *z) { + fe_loose l; + fe_copy_lt(&l, z); + fe_loose_invert(out, &l); +} + +// return 0 if f == 0 +// return 1 if f != 0 +static int fe_isnonzero(const fe_loose *f) { + fe tight; + fe_carry(&tight, f); + uint8_t s[32]; + fe_tobytes(s, &tight); + + static const uint8_t zero[32] = {0}; + return CRYPTO_memcmp(s, zero, sizeof(zero)) != 0; +} + +// return 1 if f is in {1,3,5,...,q-2} +// return 0 if f is in {0,2,4,...,q-1} +static int fe_isnegative(const fe *f) { + uint8_t s[32]; + fe_tobytes(s, f); + return s[0] & 1; +} + +static void fe_sq2_tt(fe *h, const fe *f) { + // h = f^2 + fe_sq_tt(h, f); + + // h = h + h + fe_loose tmp; + fe_add(&tmp, h, h); + fe_carry(h, &tmp); +} + +static void fe_pow22523(fe *out, const fe *z) { + fe t0; + fe t1; + fe t2; + int i; + + fe_sq_tt(&t0, z); + fe_sq_tt(&t1, &t0); + for (i = 1; i < 2; ++i) { + fe_sq_tt(&t1, &t1); + } + fe_mul_ttt(&t1, z, &t1); + fe_mul_ttt(&t0, &t0, &t1); + fe_sq_tt(&t0, &t0); + fe_mul_ttt(&t0, &t1, &t0); + fe_sq_tt(&t1, &t0); + for (i = 1; i < 5; ++i) { + fe_sq_tt(&t1, &t1); + } + fe_mul_ttt(&t0, &t1, &t0); + fe_sq_tt(&t1, &t0); + for (i = 1; i < 10; ++i) { + fe_sq_tt(&t1, &t1); + } + fe_mul_ttt(&t1, &t1, &t0); + fe_sq_tt(&t2, &t1); + for (i = 1; i < 20; ++i) { + fe_sq_tt(&t2, &t2); + } + fe_mul_ttt(&t1, &t2, &t1); + fe_sq_tt(&t1, &t1); + for (i = 1; i < 10; ++i) { + fe_sq_tt(&t1, &t1); + } + fe_mul_ttt(&t0, &t1, &t0); + fe_sq_tt(&t1, &t0); + for (i = 1; i < 50; ++i) { + fe_sq_tt(&t1, &t1); + 
} + fe_mul_ttt(&t1, &t1, &t0); + fe_sq_tt(&t2, &t1); + for (i = 1; i < 100; ++i) { + fe_sq_tt(&t2, &t2); + } + fe_mul_ttt(&t1, &t2, &t1); + fe_sq_tt(&t1, &t1); + for (i = 1; i < 50; ++i) { + fe_sq_tt(&t1, &t1); + } + fe_mul_ttt(&t0, &t1, &t0); + fe_sq_tt(&t0, &t0); + for (i = 1; i < 2; ++i) { + fe_sq_tt(&t0, &t0); + } + fe_mul_ttt(out, &t0, z); +} + + +// Group operations. + +void bssl::x25519_ge_tobytes(uint8_t s[32], const ge_p2 *h) { + fe recip; + fe x; + fe y; + + fe_invert(&recip, &h->Z); + fe_mul_ttt(&x, &h->X, &recip); + fe_mul_ttt(&y, &h->Y, &recip); + fe_tobytes(s, &y); + s[31] ^= fe_isnegative(&x) << 7; +} + +static void ge_p3_tobytes(uint8_t s[32], const ge_p3 *h) { + fe recip; + fe x; + fe y; + + fe_invert(&recip, &h->Z); + fe_mul_ttt(&x, &h->X, &recip); + fe_mul_ttt(&y, &h->Y, &recip); + fe_tobytes(s, &y); + s[31] ^= fe_isnegative(&x) << 7; +} + +int bssl::x25519_ge_frombytes_vartime(ge_p3 *h, const uint8_t s[32]) { + fe u; + fe_loose v; + fe w; + fe vxx; + fe_loose check; + + fe_frombytes(&h->Y, s); + fe_1(&h->Z); + fe_sq_tt(&w, &h->Y); + fe_mul_ttt(&vxx, &w, &d); + fe_sub(&v, &w, &h->Z); // u = y^2-1 + fe_carry(&u, &v); + fe_add(&v, &vxx, &h->Z); // v = dy^2+1 + + fe_mul_ttl(&w, &u, &v); // w = u*v + fe_pow22523(&h->X, &w); // x = w^((q-5)/8) + fe_mul_ttt(&h->X, &h->X, &u); // x = u*w^((q-5)/8) + + fe_sq_tt(&vxx, &h->X); + fe_mul_ttl(&vxx, &vxx, &v); + fe_sub(&check, &vxx, &u); + if (fe_isnonzero(&check)) { + fe_add(&check, &vxx, &u); + if (fe_isnonzero(&check)) { + return 0; + } + fe_mul_ttt(&h->X, &h->X, &sqrtm1); + } + + if (fe_isnegative(&h->X) != (s[31] >> 7)) { + fe_loose t; + fe_neg(&t, &h->X); + fe_carry(&h->X, &t); + } + + fe_mul_ttt(&h->T, &h->X, &h->Y); + return 1; +} + +static void ge_p2_0(ge_p2 *h) { + fe_0(&h->X); + fe_1(&h->Y); + fe_1(&h->Z); +} + +static void ge_p3_0(ge_p3 *h) { + fe_0(&h->X); + fe_1(&h->Y); + fe_1(&h->Z); + fe_0(&h->T); +} + +static void ge_cached_0(ge_cached *h) { + fe_loose_1(&h->YplusX); + fe_loose_1(&h->YminusX); 
+ fe_loose_1(&h->Z); + fe_loose_0(&h->T2d); +} + +static void ge_precomp_0(ge_precomp *h) { + fe_loose_1(&h->yplusx); + fe_loose_1(&h->yminusx); + fe_loose_0(&h->xy2d); +} + +// r = p +static void ge_p3_to_p2(ge_p2 *r, const ge_p3 *p) { + fe_copy(&r->X, &p->X); + fe_copy(&r->Y, &p->Y); + fe_copy(&r->Z, &p->Z); +} + +// r = p +void bssl::x25519_ge_p3_to_cached(ge_cached *r, const ge_p3 *p) { + fe_add(&r->YplusX, &p->Y, &p->X); + fe_sub(&r->YminusX, &p->Y, &p->X); + fe_copy_lt(&r->Z, &p->Z); + fe_mul_ltt(&r->T2d, &p->T, &d2); +} + +// r = p +void bssl::x25519_ge_p1p1_to_p2(ge_p2 *r, const ge_p1p1 *p) { + fe_mul_tll(&r->X, &p->X, &p->T); + fe_mul_tll(&r->Y, &p->Y, &p->Z); + fe_mul_tll(&r->Z, &p->Z, &p->T); +} + +// r = p +void bssl::x25519_ge_p1p1_to_p3(ge_p3 *r, const ge_p1p1 *p) { + fe_mul_tll(&r->X, &p->X, &p->T); + fe_mul_tll(&r->Y, &p->Y, &p->Z); + fe_mul_tll(&r->Z, &p->Z, &p->T); + fe_mul_tll(&r->T, &p->X, &p->Y); +} + +// r = p +static void ge_p1p1_to_cached(ge_cached *r, const ge_p1p1 *p) { + ge_p3 t; + x25519_ge_p1p1_to_p3(&t, p); + x25519_ge_p3_to_cached(r, &t); +} + +// r = 2 * p +static void ge_p2_dbl(ge_p1p1 *r, const ge_p2 *p) { + fe trX, trZ, trT; + fe t0; + + fe_sq_tt(&trX, &p->X); + fe_sq_tt(&trZ, &p->Y); + fe_sq2_tt(&trT, &p->Z); + fe_add(&r->Y, &p->X, &p->Y); + fe_sq_tl(&t0, &r->Y); + + fe_add(&r->Y, &trZ, &trX); + fe_sub(&r->Z, &trZ, &trX); + fe_carry(&trZ, &r->Y); + fe_sub(&r->X, &t0, &trZ); + fe_carry(&trZ, &r->Z); + fe_sub(&r->T, &trT, &trZ); +} + +// r = 2 * p +static void ge_p3_dbl(ge_p1p1 *r, const ge_p3 *p) { + ge_p2 q; + ge_p3_to_p2(&q, p); + ge_p2_dbl(r, &q); +} + +// r = p + q +static void ge_madd(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q) { + fe trY, trZ, trT; + + fe_add(&r->X, &p->Y, &p->X); + fe_sub(&r->Y, &p->Y, &p->X); + fe_mul_tll(&trZ, &r->X, &q->yplusx); + fe_mul_tll(&trY, &r->Y, &q->yminusx); + fe_mul_tlt(&trT, &q->xy2d, &p->T); + fe_add(&r->T, &p->Z, &p->Z); + fe_sub(&r->X, &trZ, &trY); + fe_add(&r->Y, &trZ, &trY); + 
fe_carry(&trZ, &r->T); + fe_add(&r->Z, &trZ, &trT); + fe_sub(&r->T, &trZ, &trT); +} + +// r = p - q +static void ge_msub(ge_p1p1 *r, const ge_p3 *p, const ge_precomp *q) { + fe trY, trZ, trT; + + fe_add(&r->X, &p->Y, &p->X); + fe_sub(&r->Y, &p->Y, &p->X); + fe_mul_tll(&trZ, &r->X, &q->yminusx); + fe_mul_tll(&trY, &r->Y, &q->yplusx); + fe_mul_tlt(&trT, &q->xy2d, &p->T); + fe_add(&r->T, &p->Z, &p->Z); + fe_sub(&r->X, &trZ, &trY); + fe_add(&r->Y, &trZ, &trY); + fe_carry(&trZ, &r->T); + fe_sub(&r->Z, &trZ, &trT); + fe_add(&r->T, &trZ, &trT); +} + +// r = p + q +void bssl::x25519_ge_add(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) { + fe trX, trY, trZ, trT; + + fe_add(&r->X, &p->Y, &p->X); + fe_sub(&r->Y, &p->Y, &p->X); + fe_mul_tll(&trZ, &r->X, &q->YplusX); + fe_mul_tll(&trY, &r->Y, &q->YminusX); + fe_mul_tlt(&trT, &q->T2d, &p->T); + fe_mul_ttl(&trX, &p->Z, &q->Z); + fe_add(&r->T, &trX, &trX); + fe_sub(&r->X, &trZ, &trY); + fe_add(&r->Y, &trZ, &trY); + fe_carry(&trZ, &r->T); + fe_add(&r->Z, &trZ, &trT); + fe_sub(&r->T, &trZ, &trT); +} + +// r = p - q +void bssl::x25519_ge_sub(ge_p1p1 *r, const ge_p3 *p, const ge_cached *q) { + fe trX, trY, trZ, trT; + + fe_add(&r->X, &p->Y, &p->X); + fe_sub(&r->Y, &p->Y, &p->X); + fe_mul_tll(&trZ, &r->X, &q->YminusX); + fe_mul_tll(&trY, &r->Y, &q->YplusX); + fe_mul_tlt(&trT, &q->T2d, &p->T); + fe_mul_ttl(&trX, &p->Z, &q->Z); + fe_add(&r->T, &trX, &trX); + fe_sub(&r->X, &trZ, &trY); + fe_add(&r->Y, &trZ, &trY); + fe_carry(&trZ, &r->T); + fe_sub(&r->Z, &trZ, &trT); + fe_add(&r->T, &trZ, &trT); +} + +static void cmov(ge_precomp *t, const ge_precomp *u, uint8_t b) { + fe_cmov(&t->yplusx, &u->yplusx, b); + fe_cmov(&t->yminusx, &u->yminusx, b); + fe_cmov(&t->xy2d, &u->xy2d, b); +} + +void bssl::x25519_ge_scalarmult_small_precomp( + ge_p3 *h, const uint8_t a[32], const uint8_t precomp_table[15 * 2 * 32]) { + // precomp_table is first expanded into matching |ge_precomp| + // elements. 
+ ge_precomp multiples[15]; + + unsigned i; + for (i = 0; i < 15; i++) { + // The precomputed table is assumed to already clear the top bit, so + // |fe_frombytes_strict| may be used directly. + const uint8_t *bytes = &precomp_table[i * (2 * 32)]; + fe x, y; + fe_frombytes_strict(&x, bytes); + fe_frombytes_strict(&y, bytes + 32); + + ge_precomp *out = &multiples[i]; + fe_add(&out->yplusx, &y, &x); + fe_sub(&out->yminusx, &y, &x); + fe_mul_ltt(&out->xy2d, &x, &y); + fe_mul_llt(&out->xy2d, &out->xy2d, &d2); + } + + // See the comment above |k25519SmallPrecomp| about the structure of the + // precomputed elements. This loop does 64 additions and 64 doublings to + // calculate the result. + ge_p3_0(h); + + for (i = 63; i < 64; i--) { + unsigned j; + signed char index = 0; + + for (j = 0; j < 4; j++) { + const uint8_t bit = 1 & (a[(8 * j) + (i / 8)] >> (i & 7)); + index |= (bit << j); + } + + ge_precomp e; + ge_precomp_0(&e); + + for (j = 1; j < 16; j++) { + cmov(&e, &multiples[j - 1], 1 & constant_time_eq_w(index, j)); + } + + ge_cached cached; + ge_p1p1 r; + x25519_ge_p3_to_cached(&cached, h); + x25519_ge_add(&r, h, &cached); + x25519_ge_p1p1_to_p3(h, &r); + + ge_madd(&r, h, &e); + x25519_ge_p1p1_to_p3(h, &r); + } +} + +#if defined(OPENSSL_SMALL) + +void bssl::x25519_ge_scalarmult_base(ge_p3 *h, const uint8_t a[32]) { + x25519_ge_scalarmult_small_precomp(h, a, k25519SmallPrecomp); +} + +#else + +static void table_select(ge_precomp *t, const int pos, const signed char b) { + uint8_t bnegative = constant_time_msb_w(b); + uint8_t babs = b - ((bnegative & b) << 1); + + uint8_t t_bytes[3][32] = { + {static_cast(constant_time_is_zero_w(b) & 1)}, + {static_cast(constant_time_is_zero_w(b) & 1)}, + {0}}; +#if defined(__clang__) // materialize for vectorization, 6% speedup + __asm__("" : "+m"(t_bytes) : /*no inputs*/); +#endif + static_assert(sizeof(t_bytes) == sizeof(k25519Precomp[pos][0])); + for (int i = 0; i < 8; i++) { + constant_time_conditional_memxor(t_bytes, 
k25519Precomp[pos][i], + sizeof(t_bytes), + constant_time_eq_w(babs, 1 + i)); + } + + fe yplusx, yminusx, xy2d; + fe_frombytes_strict(&yplusx, t_bytes[0]); + fe_frombytes_strict(&yminusx, t_bytes[1]); + fe_frombytes_strict(&xy2d, t_bytes[2]); + + fe_copy_lt(&t->yplusx, &yplusx); + fe_copy_lt(&t->yminusx, &yminusx); + fe_copy_lt(&t->xy2d, &xy2d); + + ge_precomp minust; + fe_copy_lt(&minust.yplusx, &yminusx); + fe_copy_lt(&minust.yminusx, &yplusx); + fe_neg(&minust.xy2d, &xy2d); + cmov(t, &minust, bnegative >> 7); +} + +// h = a * B +// where a = a[0]+256*a[1]+...+256^31 a[31] +// B is the Ed25519 base point (x,4/5) with x positive. +// +// Preconditions: +// a[31] <= 127 +void bssl::x25519_ge_scalarmult_base(ge_p3 *h, const uint8_t a[32]) { +#if defined(BORINGSSL_FE25519_ADX) + if (CRYPTO_is_BMI1_capable() && CRYPTO_is_BMI2_capable() && + CRYPTO_is_ADX_capable()) { + uint8_t t[4][32]; + x25519_ge_scalarmult_base_adx(t, a); + fiat_25519_from_bytes(h->X.v, t[0]); + fiat_25519_from_bytes(h->Y.v, t[1]); + fiat_25519_from_bytes(h->Z.v, t[2]); + fiat_25519_from_bytes(h->T.v, t[3]); + return; + } +#endif + signed char e[64]; + signed char carry; + ge_p1p1 r; + ge_p2 s; + ge_precomp t; + int i; + + for (i = 0; i < 32; ++i) { + e[2 * i + 0] = (a[i] >> 0) & 15; + e[2 * i + 1] = (a[i] >> 4) & 15; + } + // each e[i] is between 0 and 15 + // e[63] is between 0 and 7 + + carry = 0; + for (i = 0; i < 63; ++i) { + e[i] += carry; + carry = e[i] + 8; + carry >>= 4; + e[i] -= carry << 4; + } + e[63] += carry; + // each e[i] is between -8 and 8 + + ge_p3_0(h); + for (i = 1; i < 64; i += 2) { + table_select(&t, i / 2, e[i]); + ge_madd(&r, h, &t); + x25519_ge_p1p1_to_p3(h, &r); + } + + ge_p3_dbl(&r, h); + x25519_ge_p1p1_to_p2(&s, &r); + ge_p2_dbl(&r, &s); + x25519_ge_p1p1_to_p2(&s, &r); + ge_p2_dbl(&r, &s); + x25519_ge_p1p1_to_p2(&s, &r); + ge_p2_dbl(&r, &s); + x25519_ge_p1p1_to_p3(h, &r); + + for (i = 0; i < 64; i += 2) { + table_select(&t, i / 2, e[i]); + ge_madd(&r, h, &t); + 
x25519_ge_p1p1_to_p3(h, &r); + } +} + +#endif + +static void cmov_cached(ge_cached *t, ge_cached *u, uint8_t b) { + fe_cmov(&t->YplusX, &u->YplusX, b); + fe_cmov(&t->YminusX, &u->YminusX, b); + fe_cmov(&t->Z, &u->Z, b); + fe_cmov(&t->T2d, &u->T2d, b); +} + +// r = scalar * A. +// where a = a[0]+256*a[1]+...+256^31 a[31]. +void bssl::x25519_ge_scalarmult(ge_p2 *r, const uint8_t *scalar, + const ge_p3 *A) { + ge_p2 Ai_p2[8]; + ge_cached Ai[16]; + ge_p1p1 t; + + ge_cached_0(&Ai[0]); + x25519_ge_p3_to_cached(&Ai[1], A); + ge_p3_to_p2(&Ai_p2[1], A); + + unsigned i; + for (i = 2; i < 16; i += 2) { + ge_p2_dbl(&t, &Ai_p2[i / 2]); + ge_p1p1_to_cached(&Ai[i], &t); + if (i < 8) { + x25519_ge_p1p1_to_p2(&Ai_p2[i], &t); + } + x25519_ge_add(&t, A, &Ai[i]); + ge_p1p1_to_cached(&Ai[i + 1], &t); + if (i < 7) { + x25519_ge_p1p1_to_p2(&Ai_p2[i + 1], &t); + } + } + + ge_p2_0(r); + ge_p3 u; + + for (i = 0; i < 256; i += 4) { + ge_p2_dbl(&t, r); + x25519_ge_p1p1_to_p2(r, &t); + ge_p2_dbl(&t, r); + x25519_ge_p1p1_to_p2(r, &t); + ge_p2_dbl(&t, r); + x25519_ge_p1p1_to_p2(r, &t); + ge_p2_dbl(&t, r); + x25519_ge_p1p1_to_p3(&u, &t); + + uint8_t index = scalar[31 - i / 8]; + index >>= 4 - (i & 4); + index &= 0xf; + + unsigned j; + ge_cached selected; + ge_cached_0(&selected); + for (j = 0; j < 16; j++) { + cmov_cached(&selected, &Ai[j], 1 & constant_time_eq_w(index, j)); + } + + x25519_ge_add(&t, &u, &selected); + x25519_ge_p1p1_to_p2(r, &t); + } +} + +static void slide(signed char *r, const uint8_t *a) { + int i; + int b; + int k; + + for (i = 0; i < 256; ++i) { + r[i] = 1 & (a[i >> 3] >> (i & 7)); + } + + for (i = 0; i < 256; ++i) { + if (r[i]) { + for (b = 1; b <= 6 && i + b < 256; ++b) { + if (r[i + b]) { + if (r[i] + (r[i + b] << b) <= 15) { + r[i] += r[i + b] << b; + r[i + b] = 0; + } else if (r[i] - (r[i + b] << b) >= -15) { + r[i] -= r[i + b] << b; + for (k = i + b; k < 256; ++k) { + if (!r[k]) { + r[k] = 1; + break; + } + r[k] = 0; + } + } else { + break; + } + } + } + } + } +} + +// 
r = a * A + b * B +// where a = a[0]+256*a[1]+...+256^31 a[31]. +// and b = b[0]+256*b[1]+...+256^31 b[31]. +// B is the Ed25519 base point (x,4/5) with x positive. +static void ge_double_scalarmult_vartime(ge_p2 *r, const uint8_t *a, + const ge_p3 *A, const uint8_t *b) { + signed char aslide[256]; + signed char bslide[256]; + ge_cached Ai[8]; // A,3A,5A,7A,9A,11A,13A,15A + ge_p1p1 t; + ge_p3 u; + ge_p3 A2; + int i; + + slide(aslide, a); + slide(bslide, b); + + x25519_ge_p3_to_cached(&Ai[0], A); + ge_p3_dbl(&t, A); + x25519_ge_p1p1_to_p3(&A2, &t); + x25519_ge_add(&t, &A2, &Ai[0]); + x25519_ge_p1p1_to_p3(&u, &t); + x25519_ge_p3_to_cached(&Ai[1], &u); + x25519_ge_add(&t, &A2, &Ai[1]); + x25519_ge_p1p1_to_p3(&u, &t); + x25519_ge_p3_to_cached(&Ai[2], &u); + x25519_ge_add(&t, &A2, &Ai[2]); + x25519_ge_p1p1_to_p3(&u, &t); + x25519_ge_p3_to_cached(&Ai[3], &u); + x25519_ge_add(&t, &A2, &Ai[3]); + x25519_ge_p1p1_to_p3(&u, &t); + x25519_ge_p3_to_cached(&Ai[4], &u); + x25519_ge_add(&t, &A2, &Ai[4]); + x25519_ge_p1p1_to_p3(&u, &t); + x25519_ge_p3_to_cached(&Ai[5], &u); + x25519_ge_add(&t, &A2, &Ai[5]); + x25519_ge_p1p1_to_p3(&u, &t); + x25519_ge_p3_to_cached(&Ai[6], &u); + x25519_ge_add(&t, &A2, &Ai[6]); + x25519_ge_p1p1_to_p3(&u, &t); + x25519_ge_p3_to_cached(&Ai[7], &u); + + ge_p2_0(r); + + for (i = 255; i >= 0; --i) { + if (aslide[i] || bslide[i]) { + break; + } + } + + for (; i >= 0; --i) { + ge_p2_dbl(&t, r); + + if (aslide[i] > 0) { + x25519_ge_p1p1_to_p3(&u, &t); + x25519_ge_add(&t, &u, &Ai[aslide[i] / 2]); + } else if (aslide[i] < 0) { + x25519_ge_p1p1_to_p3(&u, &t); + x25519_ge_sub(&t, &u, &Ai[(-aslide[i]) / 2]); + } + + if (bslide[i] > 0) { + x25519_ge_p1p1_to_p3(&u, &t); + ge_madd(&t, &u, &Bi[bslide[i] / 2]); + } else if (bslide[i] < 0) { + x25519_ge_p1p1_to_p3(&u, &t); + ge_msub(&t, &u, &Bi[(-bslide[i]) / 2]); + } + + x25519_ge_p1p1_to_p2(r, &t); + } +} + +// int64_lshift21 returns |a << 21| but is defined when shifting bits into the +// sign bit. 
This works around a language flaw in C. +static int64_t int64_lshift21(int64_t a) { + return (int64_t)((uint64_t)a << 21); +} + +// The set of scalars is \Z/l +// where l = 2^252 + 27742317777372353535851937790883648493. + +// Input: +// s[0]+256*s[1]+...+256^63*s[63] = s +// +// Output: +// s[0]+256*s[1]+...+256^31*s[31] = s mod l +// where l = 2^252 + 27742317777372353535851937790883648493. +// Overwrites s in place. +void bssl::x25519_sc_reduce(uint8_t s[64]) { + int64_t s0 = 2097151 & load_3(s); + int64_t s1 = 2097151 & (load_4(s + 2) >> 5); + int64_t s2 = 2097151 & (load_3(s + 5) >> 2); + int64_t s3 = 2097151 & (load_4(s + 7) >> 7); + int64_t s4 = 2097151 & (load_4(s + 10) >> 4); + int64_t s5 = 2097151 & (load_3(s + 13) >> 1); + int64_t s6 = 2097151 & (load_4(s + 15) >> 6); + int64_t s7 = 2097151 & (load_3(s + 18) >> 3); + int64_t s8 = 2097151 & load_3(s + 21); + int64_t s9 = 2097151 & (load_4(s + 23) >> 5); + int64_t s10 = 2097151 & (load_3(s + 26) >> 2); + int64_t s11 = 2097151 & (load_4(s + 28) >> 7); + int64_t s12 = 2097151 & (load_4(s + 31) >> 4); + int64_t s13 = 2097151 & (load_3(s + 34) >> 1); + int64_t s14 = 2097151 & (load_4(s + 36) >> 6); + int64_t s15 = 2097151 & (load_3(s + 39) >> 3); + int64_t s16 = 2097151 & load_3(s + 42); + int64_t s17 = 2097151 & (load_4(s + 44) >> 5); + int64_t s18 = 2097151 & (load_3(s + 47) >> 2); + int64_t s19 = 2097151 & (load_4(s + 49) >> 7); + int64_t s20 = 2097151 & (load_4(s + 52) >> 4); + int64_t s21 = 2097151 & (load_3(s + 55) >> 1); + int64_t s22 = 2097151 & (load_4(s + 57) >> 6); + int64_t s23 = (load_4(s + 60) >> 3); + int64_t carry0; + int64_t carry1; + int64_t carry2; + int64_t carry3; + int64_t carry4; + int64_t carry5; + int64_t carry6; + int64_t carry7; + int64_t carry8; + int64_t carry9; + int64_t carry10; + int64_t carry11; + int64_t carry12; + int64_t carry13; + int64_t carry14; + int64_t carry15; + int64_t carry16; + + s11 += s23 * 666643; + s12 += s23 * 470296; + s13 += s23 * 654183; + s14 -= s23 * 
997805; + s15 += s23 * 136657; + s16 -= s23 * 683901; + s23 = 0; + + s10 += s22 * 666643; + s11 += s22 * 470296; + s12 += s22 * 654183; + s13 -= s22 * 997805; + s14 += s22 * 136657; + s15 -= s22 * 683901; + s22 = 0; + + s9 += s21 * 666643; + s10 += s21 * 470296; + s11 += s21 * 654183; + s12 -= s21 * 997805; + s13 += s21 * 136657; + s14 -= s21 * 683901; + s21 = 0; + + s8 += s20 * 666643; + s9 += s20 * 470296; + s10 += s20 * 654183; + s11 -= s20 * 997805; + s12 += s20 * 136657; + s13 -= s20 * 683901; + s20 = 0; + + s7 += s19 * 666643; + s8 += s19 * 470296; + s9 += s19 * 654183; + s10 -= s19 * 997805; + s11 += s19 * 136657; + s12 -= s19 * 683901; + s19 = 0; + + s6 += s18 * 666643; + s7 += s18 * 470296; + s8 += s18 * 654183; + s9 -= s18 * 997805; + s10 += s18 * 136657; + s11 -= s18 * 683901; + s18 = 0; + + carry6 = (s6 + (1 << 20)) >> 21; + s7 += carry6; + s6 -= int64_lshift21(carry6); + carry8 = (s8 + (1 << 20)) >> 21; + s9 += carry8; + s8 -= int64_lshift21(carry8); + carry10 = (s10 + (1 << 20)) >> 21; + s11 += carry10; + s10 -= int64_lshift21(carry10); + carry12 = (s12 + (1 << 20)) >> 21; + s13 += carry12; + s12 -= int64_lshift21(carry12); + carry14 = (s14 + (1 << 20)) >> 21; + s15 += carry14; + s14 -= int64_lshift21(carry14); + carry16 = (s16 + (1 << 20)) >> 21; + s17 += carry16; + s16 -= int64_lshift21(carry16); + + carry7 = (s7 + (1 << 20)) >> 21; + s8 += carry7; + s7 -= int64_lshift21(carry7); + carry9 = (s9 + (1 << 20)) >> 21; + s10 += carry9; + s9 -= int64_lshift21(carry9); + carry11 = (s11 + (1 << 20)) >> 21; + s12 += carry11; + s11 -= int64_lshift21(carry11); + carry13 = (s13 + (1 << 20)) >> 21; + s14 += carry13; + s13 -= int64_lshift21(carry13); + carry15 = (s15 + (1 << 20)) >> 21; + s16 += carry15; + s15 -= int64_lshift21(carry15); + + s5 += s17 * 666643; + s6 += s17 * 470296; + s7 += s17 * 654183; + s8 -= s17 * 997805; + s9 += s17 * 136657; + s10 -= s17 * 683901; + s17 = 0; + + s4 += s16 * 666643; + s5 += s16 * 470296; + s6 += s16 * 654183; + s7 -= s16 * 
997805; + s8 += s16 * 136657; + s9 -= s16 * 683901; + s16 = 0; + + s3 += s15 * 666643; + s4 += s15 * 470296; + s5 += s15 * 654183; + s6 -= s15 * 997805; + s7 += s15 * 136657; + s8 -= s15 * 683901; + s15 = 0; + + s2 += s14 * 666643; + s3 += s14 * 470296; + s4 += s14 * 654183; + s5 -= s14 * 997805; + s6 += s14 * 136657; + s7 -= s14 * 683901; + s14 = 0; + + s1 += s13 * 666643; + s2 += s13 * 470296; + s3 += s13 * 654183; + s4 -= s13 * 997805; + s5 += s13 * 136657; + s6 -= s13 * 683901; + s13 = 0; + + s0 += s12 * 666643; + s1 += s12 * 470296; + s2 += s12 * 654183; + s3 -= s12 * 997805; + s4 += s12 * 136657; + s5 -= s12 * 683901; + s12 = 0; + + carry0 = (s0 + (1 << 20)) >> 21; + s1 += carry0; + s0 -= int64_lshift21(carry0); + carry2 = (s2 + (1 << 20)) >> 21; + s3 += carry2; + s2 -= int64_lshift21(carry2); + carry4 = (s4 + (1 << 20)) >> 21; + s5 += carry4; + s4 -= int64_lshift21(carry4); + carry6 = (s6 + (1 << 20)) >> 21; + s7 += carry6; + s6 -= int64_lshift21(carry6); + carry8 = (s8 + (1 << 20)) >> 21; + s9 += carry8; + s8 -= int64_lshift21(carry8); + carry10 = (s10 + (1 << 20)) >> 21; + s11 += carry10; + s10 -= int64_lshift21(carry10); + + carry1 = (s1 + (1 << 20)) >> 21; + s2 += carry1; + s1 -= int64_lshift21(carry1); + carry3 = (s3 + (1 << 20)) >> 21; + s4 += carry3; + s3 -= int64_lshift21(carry3); + carry5 = (s5 + (1 << 20)) >> 21; + s6 += carry5; + s5 -= int64_lshift21(carry5); + carry7 = (s7 + (1 << 20)) >> 21; + s8 += carry7; + s7 -= int64_lshift21(carry7); + carry9 = (s9 + (1 << 20)) >> 21; + s10 += carry9; + s9 -= int64_lshift21(carry9); + carry11 = (s11 + (1 << 20)) >> 21; + s12 += carry11; + s11 -= int64_lshift21(carry11); + + s0 += s12 * 666643; + s1 += s12 * 470296; + s2 += s12 * 654183; + s3 -= s12 * 997805; + s4 += s12 * 136657; + s5 -= s12 * 683901; + s12 = 0; + + carry0 = s0 >> 21; + s1 += carry0; + s0 -= int64_lshift21(carry0); + carry1 = s1 >> 21; + s2 += carry1; + s1 -= int64_lshift21(carry1); + carry2 = s2 >> 21; + s3 += carry2; + s2 -= 
int64_lshift21(carry2); + carry3 = s3 >> 21; + s4 += carry3; + s3 -= int64_lshift21(carry3); + carry4 = s4 >> 21; + s5 += carry4; + s4 -= int64_lshift21(carry4); + carry5 = s5 >> 21; + s6 += carry5; + s5 -= int64_lshift21(carry5); + carry6 = s6 >> 21; + s7 += carry6; + s6 -= int64_lshift21(carry6); + carry7 = s7 >> 21; + s8 += carry7; + s7 -= int64_lshift21(carry7); + carry8 = s8 >> 21; + s9 += carry8; + s8 -= int64_lshift21(carry8); + carry9 = s9 >> 21; + s10 += carry9; + s9 -= int64_lshift21(carry9); + carry10 = s10 >> 21; + s11 += carry10; + s10 -= int64_lshift21(carry10); + carry11 = s11 >> 21; + s12 += carry11; + s11 -= int64_lshift21(carry11); + + s0 += s12 * 666643; + s1 += s12 * 470296; + s2 += s12 * 654183; + s3 -= s12 * 997805; + s4 += s12 * 136657; + s5 -= s12 * 683901; + s12 = 0; + + carry0 = s0 >> 21; + s1 += carry0; + s0 -= int64_lshift21(carry0); + carry1 = s1 >> 21; + s2 += carry1; + s1 -= int64_lshift21(carry1); + carry2 = s2 >> 21; + s3 += carry2; + s2 -= int64_lshift21(carry2); + carry3 = s3 >> 21; + s4 += carry3; + s3 -= int64_lshift21(carry3); + carry4 = s4 >> 21; + s5 += carry4; + s4 -= int64_lshift21(carry4); + carry5 = s5 >> 21; + s6 += carry5; + s5 -= int64_lshift21(carry5); + carry6 = s6 >> 21; + s7 += carry6; + s6 -= int64_lshift21(carry6); + carry7 = s7 >> 21; + s8 += carry7; + s7 -= int64_lshift21(carry7); + carry8 = s8 >> 21; + s9 += carry8; + s8 -= int64_lshift21(carry8); + carry9 = s9 >> 21; + s10 += carry9; + s9 -= int64_lshift21(carry9); + carry10 = s10 >> 21; + s11 += carry10; + s10 -= int64_lshift21(carry10); + + s[0] = s0 >> 0; + s[1] = s0 >> 8; + s[2] = (s0 >> 16) | (s1 << 5); + s[3] = s1 >> 3; + s[4] = s1 >> 11; + s[5] = (s1 >> 19) | (s2 << 2); + s[6] = s2 >> 6; + s[7] = (s2 >> 14) | (s3 << 7); + s[8] = s3 >> 1; + s[9] = s3 >> 9; + s[10] = (s3 >> 17) | (s4 << 4); + s[11] = s4 >> 4; + s[12] = s4 >> 12; + s[13] = (s4 >> 20) | (s5 << 1); + s[14] = s5 >> 7; + s[15] = (s5 >> 15) | (s6 << 6); + s[16] = s6 >> 2; + s[17] = s6 >> 10; + 
s[18] = (s6 >> 18) | (s7 << 3); + s[19] = s7 >> 5; + s[20] = s7 >> 13; + s[21] = s8 >> 0; + s[22] = s8 >> 8; + s[23] = (s8 >> 16) | (s9 << 5); + s[24] = s9 >> 3; + s[25] = s9 >> 11; + s[26] = (s9 >> 19) | (s10 << 2); + s[27] = s10 >> 6; + s[28] = (s10 >> 14) | (s11 << 7); + s[29] = s11 >> 1; + s[30] = s11 >> 9; + s[31] = s11 >> 17; +} + +// Input: +// a[0]+256*a[1]+...+256^31*a[31] = a +// b[0]+256*b[1]+...+256^31*b[31] = b +// c[0]+256*c[1]+...+256^31*c[31] = c +// +// Output: +// s[0]+256*s[1]+...+256^31*s[31] = (ab+c) mod l +// where l = 2^252 + 27742317777372353535851937790883648493. +static void sc_muladd(uint8_t *s, const uint8_t *a, const uint8_t *b, + const uint8_t *c) { + int64_t a0 = 2097151 & load_3(a); + int64_t a1 = 2097151 & (load_4(a + 2) >> 5); + int64_t a2 = 2097151 & (load_3(a + 5) >> 2); + int64_t a3 = 2097151 & (load_4(a + 7) >> 7); + int64_t a4 = 2097151 & (load_4(a + 10) >> 4); + int64_t a5 = 2097151 & (load_3(a + 13) >> 1); + int64_t a6 = 2097151 & (load_4(a + 15) >> 6); + int64_t a7 = 2097151 & (load_3(a + 18) >> 3); + int64_t a8 = 2097151 & load_3(a + 21); + int64_t a9 = 2097151 & (load_4(a + 23) >> 5); + int64_t a10 = 2097151 & (load_3(a + 26) >> 2); + int64_t a11 = (load_4(a + 28) >> 7); + int64_t b0 = 2097151 & load_3(b); + int64_t b1 = 2097151 & (load_4(b + 2) >> 5); + int64_t b2 = 2097151 & (load_3(b + 5) >> 2); + int64_t b3 = 2097151 & (load_4(b + 7) >> 7); + int64_t b4 = 2097151 & (load_4(b + 10) >> 4); + int64_t b5 = 2097151 & (load_3(b + 13) >> 1); + int64_t b6 = 2097151 & (load_4(b + 15) >> 6); + int64_t b7 = 2097151 & (load_3(b + 18) >> 3); + int64_t b8 = 2097151 & load_3(b + 21); + int64_t b9 = 2097151 & (load_4(b + 23) >> 5); + int64_t b10 = 2097151 & (load_3(b + 26) >> 2); + int64_t b11 = (load_4(b + 28) >> 7); + int64_t c0 = 2097151 & load_3(c); + int64_t c1 = 2097151 & (load_4(c + 2) >> 5); + int64_t c2 = 2097151 & (load_3(c + 5) >> 2); + int64_t c3 = 2097151 & (load_4(c + 7) >> 7); + int64_t c4 = 2097151 & (load_4(c + 10) 
>> 4); + int64_t c5 = 2097151 & (load_3(c + 13) >> 1); + int64_t c6 = 2097151 & (load_4(c + 15) >> 6); + int64_t c7 = 2097151 & (load_3(c + 18) >> 3); + int64_t c8 = 2097151 & load_3(c + 21); + int64_t c9 = 2097151 & (load_4(c + 23) >> 5); + int64_t c10 = 2097151 & (load_3(c + 26) >> 2); + int64_t c11 = (load_4(c + 28) >> 7); + int64_t s0; + int64_t s1; + int64_t s2; + int64_t s3; + int64_t s4; + int64_t s5; + int64_t s6; + int64_t s7; + int64_t s8; + int64_t s9; + int64_t s10; + int64_t s11; + int64_t s12; + int64_t s13; + int64_t s14; + int64_t s15; + int64_t s16; + int64_t s17; + int64_t s18; + int64_t s19; + int64_t s20; + int64_t s21; + int64_t s22; + int64_t s23; + int64_t carry0; + int64_t carry1; + int64_t carry2; + int64_t carry3; + int64_t carry4; + int64_t carry5; + int64_t carry6; + int64_t carry7; + int64_t carry8; + int64_t carry9; + int64_t carry10; + int64_t carry11; + int64_t carry12; + int64_t carry13; + int64_t carry14; + int64_t carry15; + int64_t carry16; + int64_t carry17; + int64_t carry18; + int64_t carry19; + int64_t carry20; + int64_t carry21; + int64_t carry22; + + s0 = c0 + a0 * b0; + s1 = c1 + a0 * b1 + a1 * b0; + s2 = c2 + a0 * b2 + a1 * b1 + a2 * b0; + s3 = c3 + a0 * b3 + a1 * b2 + a2 * b1 + a3 * b0; + s4 = c4 + a0 * b4 + a1 * b3 + a2 * b2 + a3 * b1 + a4 * b0; + s5 = c5 + a0 * b5 + a1 * b4 + a2 * b3 + a3 * b2 + a4 * b1 + a5 * b0; + s6 = c6 + a0 * b6 + a1 * b5 + a2 * b4 + a3 * b3 + a4 * b2 + a5 * b1 + a6 * b0; + s7 = c7 + a0 * b7 + a1 * b6 + a2 * b5 + a3 * b4 + a4 * b3 + a5 * b2 + + a6 * b1 + a7 * b0; + s8 = c8 + a0 * b8 + a1 * b7 + a2 * b6 + a3 * b5 + a4 * b4 + a5 * b3 + + a6 * b2 + a7 * b1 + a8 * b0; + s9 = c9 + a0 * b9 + a1 * b8 + a2 * b7 + a3 * b6 + a4 * b5 + a5 * b4 + + a6 * b3 + a7 * b2 + a8 * b1 + a9 * b0; + s10 = c10 + a0 * b10 + a1 * b9 + a2 * b8 + a3 * b7 + a4 * b6 + a5 * b5 + + a6 * b4 + a7 * b3 + a8 * b2 + a9 * b1 + a10 * b0; + s11 = c11 + a0 * b11 + a1 * b10 + a2 * b9 + a3 * b8 + a4 * b7 + a5 * b6 + + a6 * b5 + a7 * b4 + 
a8 * b3 + a9 * b2 + a10 * b1 + a11 * b0; + s12 = a1 * b11 + a2 * b10 + a3 * b9 + a4 * b8 + a5 * b7 + a6 * b6 + a7 * b5 + + a8 * b4 + a9 * b3 + a10 * b2 + a11 * b1; + s13 = a2 * b11 + a3 * b10 + a4 * b9 + a5 * b8 + a6 * b7 + a7 * b6 + a8 * b5 + + a9 * b4 + a10 * b3 + a11 * b2; + s14 = a3 * b11 + a4 * b10 + a5 * b9 + a6 * b8 + a7 * b7 + a8 * b6 + a9 * b5 + + a10 * b4 + a11 * b3; + s15 = a4 * b11 + a5 * b10 + a6 * b9 + a7 * b8 + a8 * b7 + a9 * b6 + a10 * b5 + + a11 * b4; + s16 = a5 * b11 + a6 * b10 + a7 * b9 + a8 * b8 + a9 * b7 + a10 * b6 + a11 * b5; + s17 = a6 * b11 + a7 * b10 + a8 * b9 + a9 * b8 + a10 * b7 + a11 * b6; + s18 = a7 * b11 + a8 * b10 + a9 * b9 + a10 * b8 + a11 * b7; + s19 = a8 * b11 + a9 * b10 + a10 * b9 + a11 * b8; + s20 = a9 * b11 + a10 * b10 + a11 * b9; + s21 = a10 * b11 + a11 * b10; + s22 = a11 * b11; + s23 = 0; + + carry0 = (s0 + (1 << 20)) >> 21; + s1 += carry0; + s0 -= int64_lshift21(carry0); + carry2 = (s2 + (1 << 20)) >> 21; + s3 += carry2; + s2 -= int64_lshift21(carry2); + carry4 = (s4 + (1 << 20)) >> 21; + s5 += carry4; + s4 -= int64_lshift21(carry4); + carry6 = (s6 + (1 << 20)) >> 21; + s7 += carry6; + s6 -= int64_lshift21(carry6); + carry8 = (s8 + (1 << 20)) >> 21; + s9 += carry8; + s8 -= int64_lshift21(carry8); + carry10 = (s10 + (1 << 20)) >> 21; + s11 += carry10; + s10 -= int64_lshift21(carry10); + carry12 = (s12 + (1 << 20)) >> 21; + s13 += carry12; + s12 -= int64_lshift21(carry12); + carry14 = (s14 + (1 << 20)) >> 21; + s15 += carry14; + s14 -= int64_lshift21(carry14); + carry16 = (s16 + (1 << 20)) >> 21; + s17 += carry16; + s16 -= int64_lshift21(carry16); + carry18 = (s18 + (1 << 20)) >> 21; + s19 += carry18; + s18 -= int64_lshift21(carry18); + carry20 = (s20 + (1 << 20)) >> 21; + s21 += carry20; + s20 -= int64_lshift21(carry20); + carry22 = (s22 + (1 << 20)) >> 21; + s23 += carry22; + s22 -= int64_lshift21(carry22); + + carry1 = (s1 + (1 << 20)) >> 21; + s2 += carry1; + s1 -= int64_lshift21(carry1); + carry3 = (s3 + (1 << 20)) >> 21; 
+ s4 += carry3; + s3 -= int64_lshift21(carry3); + carry5 = (s5 + (1 << 20)) >> 21; + s6 += carry5; + s5 -= int64_lshift21(carry5); + carry7 = (s7 + (1 << 20)) >> 21; + s8 += carry7; + s7 -= int64_lshift21(carry7); + carry9 = (s9 + (1 << 20)) >> 21; + s10 += carry9; + s9 -= int64_lshift21(carry9); + carry11 = (s11 + (1 << 20)) >> 21; + s12 += carry11; + s11 -= int64_lshift21(carry11); + carry13 = (s13 + (1 << 20)) >> 21; + s14 += carry13; + s13 -= int64_lshift21(carry13); + carry15 = (s15 + (1 << 20)) >> 21; + s16 += carry15; + s15 -= int64_lshift21(carry15); + carry17 = (s17 + (1 << 20)) >> 21; + s18 += carry17; + s17 -= int64_lshift21(carry17); + carry19 = (s19 + (1 << 20)) >> 21; + s20 += carry19; + s19 -= int64_lshift21(carry19); + carry21 = (s21 + (1 << 20)) >> 21; + s22 += carry21; + s21 -= int64_lshift21(carry21); + + s11 += s23 * 666643; + s12 += s23 * 470296; + s13 += s23 * 654183; + s14 -= s23 * 997805; + s15 += s23 * 136657; + s16 -= s23 * 683901; + s23 = 0; + + s10 += s22 * 666643; + s11 += s22 * 470296; + s12 += s22 * 654183; + s13 -= s22 * 997805; + s14 += s22 * 136657; + s15 -= s22 * 683901; + s22 = 0; + + s9 += s21 * 666643; + s10 += s21 * 470296; + s11 += s21 * 654183; + s12 -= s21 * 997805; + s13 += s21 * 136657; + s14 -= s21 * 683901; + s21 = 0; + + s8 += s20 * 666643; + s9 += s20 * 470296; + s10 += s20 * 654183; + s11 -= s20 * 997805; + s12 += s20 * 136657; + s13 -= s20 * 683901; + s20 = 0; + + s7 += s19 * 666643; + s8 += s19 * 470296; + s9 += s19 * 654183; + s10 -= s19 * 997805; + s11 += s19 * 136657; + s12 -= s19 * 683901; + s19 = 0; + + s6 += s18 * 666643; + s7 += s18 * 470296; + s8 += s18 * 654183; + s9 -= s18 * 997805; + s10 += s18 * 136657; + s11 -= s18 * 683901; + s18 = 0; + + carry6 = (s6 + (1 << 20)) >> 21; + s7 += carry6; + s6 -= int64_lshift21(carry6); + carry8 = (s8 + (1 << 20)) >> 21; + s9 += carry8; + s8 -= int64_lshift21(carry8); + carry10 = (s10 + (1 << 20)) >> 21; + s11 += carry10; + s10 -= int64_lshift21(carry10); + carry12 = 
(s12 + (1 << 20)) >> 21; + s13 += carry12; + s12 -= int64_lshift21(carry12); + carry14 = (s14 + (1 << 20)) >> 21; + s15 += carry14; + s14 -= int64_lshift21(carry14); + carry16 = (s16 + (1 << 20)) >> 21; + s17 += carry16; + s16 -= int64_lshift21(carry16); + + carry7 = (s7 + (1 << 20)) >> 21; + s8 += carry7; + s7 -= int64_lshift21(carry7); + carry9 = (s9 + (1 << 20)) >> 21; + s10 += carry9; + s9 -= int64_lshift21(carry9); + carry11 = (s11 + (1 << 20)) >> 21; + s12 += carry11; + s11 -= int64_lshift21(carry11); + carry13 = (s13 + (1 << 20)) >> 21; + s14 += carry13; + s13 -= int64_lshift21(carry13); + carry15 = (s15 + (1 << 20)) >> 21; + s16 += carry15; + s15 -= int64_lshift21(carry15); + + s5 += s17 * 666643; + s6 += s17 * 470296; + s7 += s17 * 654183; + s8 -= s17 * 997805; + s9 += s17 * 136657; + s10 -= s17 * 683901; + s17 = 0; + + s4 += s16 * 666643; + s5 += s16 * 470296; + s6 += s16 * 654183; + s7 -= s16 * 997805; + s8 += s16 * 136657; + s9 -= s16 * 683901; + s16 = 0; + + s3 += s15 * 666643; + s4 += s15 * 470296; + s5 += s15 * 654183; + s6 -= s15 * 997805; + s7 += s15 * 136657; + s8 -= s15 * 683901; + s15 = 0; + + s2 += s14 * 666643; + s3 += s14 * 470296; + s4 += s14 * 654183; + s5 -= s14 * 997805; + s6 += s14 * 136657; + s7 -= s14 * 683901; + s14 = 0; + + s1 += s13 * 666643; + s2 += s13 * 470296; + s3 += s13 * 654183; + s4 -= s13 * 997805; + s5 += s13 * 136657; + s6 -= s13 * 683901; + s13 = 0; + + s0 += s12 * 666643; + s1 += s12 * 470296; + s2 += s12 * 654183; + s3 -= s12 * 997805; + s4 += s12 * 136657; + s5 -= s12 * 683901; + s12 = 0; + + carry0 = (s0 + (1 << 20)) >> 21; + s1 += carry0; + s0 -= int64_lshift21(carry0); + carry2 = (s2 + (1 << 20)) >> 21; + s3 += carry2; + s2 -= int64_lshift21(carry2); + carry4 = (s4 + (1 << 20)) >> 21; + s5 += carry4; + s4 -= int64_lshift21(carry4); + carry6 = (s6 + (1 << 20)) >> 21; + s7 += carry6; + s6 -= int64_lshift21(carry6); + carry8 = (s8 + (1 << 20)) >> 21; + s9 += carry8; + s8 -= int64_lshift21(carry8); + carry10 = (s10 + 
(1 << 20)) >> 21; + s11 += carry10; + s10 -= int64_lshift21(carry10); + + carry1 = (s1 + (1 << 20)) >> 21; + s2 += carry1; + s1 -= int64_lshift21(carry1); + carry3 = (s3 + (1 << 20)) >> 21; + s4 += carry3; + s3 -= int64_lshift21(carry3); + carry5 = (s5 + (1 << 20)) >> 21; + s6 += carry5; + s5 -= int64_lshift21(carry5); + carry7 = (s7 + (1 << 20)) >> 21; + s8 += carry7; + s7 -= int64_lshift21(carry7); + carry9 = (s9 + (1 << 20)) >> 21; + s10 += carry9; + s9 -= int64_lshift21(carry9); + carry11 = (s11 + (1 << 20)) >> 21; + s12 += carry11; + s11 -= int64_lshift21(carry11); + + s0 += s12 * 666643; + s1 += s12 * 470296; + s2 += s12 * 654183; + s3 -= s12 * 997805; + s4 += s12 * 136657; + s5 -= s12 * 683901; + s12 = 0; + + carry0 = s0 >> 21; + s1 += carry0; + s0 -= int64_lshift21(carry0); + carry1 = s1 >> 21; + s2 += carry1; + s1 -= int64_lshift21(carry1); + carry2 = s2 >> 21; + s3 += carry2; + s2 -= int64_lshift21(carry2); + carry3 = s3 >> 21; + s4 += carry3; + s3 -= int64_lshift21(carry3); + carry4 = s4 >> 21; + s5 += carry4; + s4 -= int64_lshift21(carry4); + carry5 = s5 >> 21; + s6 += carry5; + s5 -= int64_lshift21(carry5); + carry6 = s6 >> 21; + s7 += carry6; + s6 -= int64_lshift21(carry6); + carry7 = s7 >> 21; + s8 += carry7; + s7 -= int64_lshift21(carry7); + carry8 = s8 >> 21; + s9 += carry8; + s8 -= int64_lshift21(carry8); + carry9 = s9 >> 21; + s10 += carry9; + s9 -= int64_lshift21(carry9); + carry10 = s10 >> 21; + s11 += carry10; + s10 -= int64_lshift21(carry10); + carry11 = s11 >> 21; + s12 += carry11; + s11 -= int64_lshift21(carry11); + + s0 += s12 * 666643; + s1 += s12 * 470296; + s2 += s12 * 654183; + s3 -= s12 * 997805; + s4 += s12 * 136657; + s5 -= s12 * 683901; + s12 = 0; + + carry0 = s0 >> 21; + s1 += carry0; + s0 -= int64_lshift21(carry0); + carry1 = s1 >> 21; + s2 += carry1; + s1 -= int64_lshift21(carry1); + carry2 = s2 >> 21; + s3 += carry2; + s2 -= int64_lshift21(carry2); + carry3 = s3 >> 21; + s4 += carry3; + s3 -= int64_lshift21(carry3); + carry4 = 
s4 >> 21; + s5 += carry4; + s4 -= int64_lshift21(carry4); + carry5 = s5 >> 21; + s6 += carry5; + s5 -= int64_lshift21(carry5); + carry6 = s6 >> 21; + s7 += carry6; + s6 -= int64_lshift21(carry6); + carry7 = s7 >> 21; + s8 += carry7; + s7 -= int64_lshift21(carry7); + carry8 = s8 >> 21; + s9 += carry8; + s8 -= int64_lshift21(carry8); + carry9 = s9 >> 21; + s10 += carry9; + s9 -= int64_lshift21(carry9); + carry10 = s10 >> 21; + s11 += carry10; + s10 -= int64_lshift21(carry10); + + s[0] = s0 >> 0; + s[1] = s0 >> 8; + s[2] = (s0 >> 16) | (s1 << 5); + s[3] = s1 >> 3; + s[4] = s1 >> 11; + s[5] = (s1 >> 19) | (s2 << 2); + s[6] = s2 >> 6; + s[7] = (s2 >> 14) | (s3 << 7); + s[8] = s3 >> 1; + s[9] = s3 >> 9; + s[10] = (s3 >> 17) | (s4 << 4); + s[11] = s4 >> 4; + s[12] = s4 >> 12; + s[13] = (s4 >> 20) | (s5 << 1); + s[14] = s5 >> 7; + s[15] = (s5 >> 15) | (s6 << 6); + s[16] = s6 >> 2; + s[17] = s6 >> 10; + s[18] = (s6 >> 18) | (s7 << 3); + s[19] = s7 >> 5; + s[20] = s7 >> 13; + s[21] = s8 >> 0; + s[22] = s8 >> 8; + s[23] = (s8 >> 16) | (s9 << 5); + s[24] = s9 >> 3; + s[25] = s9 >> 11; + s[26] = (s9 >> 19) | (s10 << 2); + s[27] = s10 >> 6; + s[28] = (s10 >> 14) | (s11 << 7); + s[29] = s11 >> 1; + s[30] = s11 >> 9; + s[31] = s11 >> 17; +} + +void ED25519_keypair(uint8_t out_public_key[32], uint8_t out_private_key[64]) { + uint8_t seed[32]; + RAND_bytes(seed, 32); + ED25519_keypair_from_seed(out_public_key, out_private_key, seed); +} + +int ED25519_sign(uint8_t out_sig[64], const uint8_t *message, + size_t message_len, const uint8_t private_key[64]) { + // NOTE: The documentation on this function says that it returns zero on + // allocation failure. While that can't happen with the current + // implementation, we want to reserve the ability to allocate in this + // implementation in the future. 
+ + uint8_t az[SHA512_DIGEST_LENGTH]; + SHA512(private_key, 32, az); + + az[0] &= 248; + az[31] &= 63; + az[31] |= 64; + + SHA512_CTX hash_ctx; + SHA512_Init(&hash_ctx); + SHA512_Update(&hash_ctx, az + 32, 32); + SHA512_Update(&hash_ctx, message, message_len); + uint8_t nonce[SHA512_DIGEST_LENGTH]; + SHA512_Final(nonce, &hash_ctx); + + x25519_sc_reduce(nonce); + ge_p3 R; + x25519_ge_scalarmult_base(&R, nonce); + ge_p3_tobytes(out_sig, &R); + + SHA512_Init(&hash_ctx); + SHA512_Update(&hash_ctx, out_sig, 32); + SHA512_Update(&hash_ctx, private_key + 32, 32); + SHA512_Update(&hash_ctx, message, message_len); + uint8_t hram[SHA512_DIGEST_LENGTH]; + SHA512_Final(hram, &hash_ctx); + + x25519_sc_reduce(hram); + sc_muladd(out_sig + 32, hram, az, nonce); + + // The signature is computed from the private key, but is public. + CONSTTIME_DECLASSIFY(out_sig, 64); + return 1; +} + +int ED25519_verify(const uint8_t *message, size_t message_len, + const uint8_t signature[64], const uint8_t public_key[32]) { + ge_p3 A; + if ((signature[63] & 224) != 0 || + !x25519_ge_frombytes_vartime(&A, public_key)) { + return 0; + } + + fe_loose t; + fe_neg(&t, &A.X); + fe_carry(&A.X, &t); + fe_neg(&t, &A.T); + fe_carry(&A.T, &t); + + uint8_t pkcopy[32]; + OPENSSL_memcpy(pkcopy, public_key, 32); + uint8_t rcopy[32]; + OPENSSL_memcpy(rcopy, signature, 32); + uint8_t scopy[32]; + OPENSSL_memcpy(scopy, signature + 32, 32); + + // https://tools.ietf.org/html/rfc8032#section-5.1.7 requires that s be in + // the range [0, order) in order to prevent signature malleability. + + // kOrder is the order of Curve25519 in little-endian form. 
+ static const uint64_t kOrder[4] = { + UINT64_C(0x5812631a5cf5d3ed), + UINT64_C(0x14def9dea2f79cd6), + 0, + UINT64_C(0x1000000000000000), + }; + for (size_t i = 3;; i--) { + uint64_t word = CRYPTO_load_u64_le(scopy + i * 8); + if (word > kOrder[i]) { + return 0; + } else if (word < kOrder[i]) { + break; + } else if (i == 0) { + return 0; + } + } + + SHA512_CTX hash_ctx; + SHA512_Init(&hash_ctx); + SHA512_Update(&hash_ctx, signature, 32); + SHA512_Update(&hash_ctx, public_key, 32); + SHA512_Update(&hash_ctx, message, message_len); + uint8_t h[SHA512_DIGEST_LENGTH]; + SHA512_Final(h, &hash_ctx); + + x25519_sc_reduce(h); + + ge_p2 R; + ge_double_scalarmult_vartime(&R, h, &A, scopy); + + uint8_t rcheck[32]; + x25519_ge_tobytes(rcheck, &R); + + return CRYPTO_memcmp(rcheck, rcopy, sizeof(rcheck)) == 0; +} + +void ED25519_keypair_from_seed(uint8_t out_public_key[32], + uint8_t out_private_key[64], + const uint8_t seed[32]) { + uint8_t az[SHA512_DIGEST_LENGTH]; + SHA512(seed, 32, az); + + az[0] &= 248; + az[31] &= 127; + az[31] |= 64; + + ge_p3 A; + x25519_ge_scalarmult_base(&A, az); + ge_p3_tobytes(out_public_key, &A); + // The public key is derived from the private key, but it is public. + CONSTTIME_DECLASSIFY(out_public_key, 32); + + OPENSSL_memcpy(out_private_key, seed, 32); + OPENSSL_memcpy(out_private_key + 32, out_public_key, 32); +} + + +static void x25519_scalar_mult_generic(uint8_t out[32], + const uint8_t scalar[32], + const uint8_t point[32]) { + fe x1, x2, z2, x3, z3, tmp0, tmp1; + fe_loose x2l, z2l, x3l, tmp0l, tmp1l; + + uint8_t e[32]; + OPENSSL_memcpy(e, scalar, 32); + e[0] &= 248; + e[31] &= 127; + e[31] |= 64; + + // The following implementation was transcribed to Coq and proven to + // correspond to unary scalar multiplication in affine coordinates given that + // x1 != 0 is the x coordinate of some point on the curve. It was also checked + // in Coq that doing a ladderstep with x1 = x3 = 0 gives z2' = z3' = 0, and z2 + // = z3 = 0 gives z2' = z3' = 0. 
The statement was quantified over the + // underlying field, so it applies to Curve25519 itself and the quadratic + // twist of Curve25519. It was not proven in Coq that prime-field arithmetic + // correctly simulates extension-field arithmetic on prime-field values. + // The decoding of the byte array representation of e was not considered. + // Specification of Montgomery curves in affine coordinates: + // + // Proof that these form a group that is isomorphic to a Weierstrass curve: + // + // Coq transcription and correctness proof of the loop (where scalarbits=255): + // + // + // preconditions: 0 <= e < 2^255 (not necessarily e < order), fe_invert(0) = 0 + fe_frombytes(&x1, point); + fe_1(&x2); + fe_0(&z2); + fe_copy(&x3, &x1); + fe_1(&z3); + + unsigned swap = 0; + int pos; + for (pos = 254; pos >= 0; --pos) { + // loop invariant as of right before the test, for the case where x1 != 0: + // pos >= -1; if z2 = 0 then x2 is nonzero; if z3 = 0 then x3 is nonzero + // let r := e >> (pos+1) in the following equalities of projective points: + // to_xz (r*P) === if swap then (x3, z3) else (x2, z2) + // to_xz ((r+1)*P) === if swap then (x2, z2) else (x3, z3) + // x1 is the nonzero x coordinate of the nonzero point (r*P-(r+1)*P) + unsigned b = 1 & (e[pos / 8] >> (pos & 7)); + swap ^= b; + fe_cswap(&x2, &x3, swap); + fe_cswap(&z2, &z3, swap); + swap = b; + // Coq transcription of ladderstep formula (called from transcribed loop): + // + // + // x1 != 0 + // + // x1 = 0 + // + fe_sub(&tmp0l, &x3, &z3); + fe_sub(&tmp1l, &x2, &z2); + fe_add(&x2l, &x2, &z2); + fe_add(&z2l, &x3, &z3); + fe_mul_tll(&z3, &tmp0l, &x2l); + fe_mul_tll(&z2, &z2l, &tmp1l); + fe_sq_tl(&tmp0, &tmp1l); + fe_sq_tl(&tmp1, &x2l); + fe_add(&x3l, &z3, &z2); + fe_sub(&z2l, &z3, &z2); + fe_mul_ttt(&x2, &tmp1, &tmp0); + fe_sub(&tmp1l, &tmp1, &tmp0); + fe_sq_tl(&z2, &z2l); + fe_mul121666(&z3, &tmp1l); + fe_sq_tl(&x3, &x3l); + fe_add(&tmp0l, &tmp0, &z3); + fe_mul_ttt(&z3, &x1, &z2); + fe_mul_tll(&z2, &tmp1l, 
&tmp0l); + } + // here pos=-1, so r=e, so to_xz (e*P) === if swap then (x3, z3) else (x2, z2) + fe_cswap(&x2, &x3, swap); + fe_cswap(&z2, &z3, swap); + + fe_invert(&z2, &z2); + fe_mul_ttt(&x2, &x2, &z2); + fe_tobytes(out, &x2); +} + +static void x25519_scalar_mult(uint8_t out[32], const uint8_t scalar[32], + const uint8_t point[32]) { +#if defined(BORINGSSL_X25519_NEON) + if (CRYPTO_is_NEON_capable()) { + x25519_NEON(out, scalar, point); + return; + } +#elif defined(BORINGSSL_FE25519_ADX) + if (CRYPTO_is_BMI1_capable() && CRYPTO_is_BMI2_capable() && + CRYPTO_is_ADX_capable()) { + x25519_scalar_mult_adx(out, scalar, point); + return; + } +#endif + + x25519_scalar_mult_generic(out, scalar, point); +} + +void X25519_keypair(uint8_t out_public_value[32], uint8_t out_private_key[32]) { + RAND_bytes(out_private_key, 32); + + // All X25519 implementations should decode scalars correctly (see + // https://tools.ietf.org/html/rfc7748#section-5). However, if an + // implementation doesn't then it might interoperate with random keys a + // fraction of the time because they'll, randomly, happen to be correctly + // formed. + // + // Thus we do the opposite of the masking here to make sure that our private + // keys are never correctly masked and so, hopefully, any incorrect + // implementations are deterministically broken. + // + // This does not affect security because, although we're throwing away + // entropy, a valid implementation of scalarmult should throw away the exact + // same bits anyway. + out_private_key[0] |= ~248; + out_private_key[31] &= ~64; + out_private_key[31] |= ~127; + + X25519_public_from_private(out_public_value, out_private_key); +} + +int X25519(uint8_t out_shared_key[32], const uint8_t private_key[32], + const uint8_t peer_public_value[32]) { + static const uint8_t kZeros[32] = {0}; + x25519_scalar_mult(out_shared_key, private_key, peer_public_value); + // The all-zero output results when the input is a point of small order. 
+ return constant_time_declassify_int( + CRYPTO_memcmp(kZeros, out_shared_key, 32)) != 0; +} + +void X25519_public_from_private(uint8_t out_public_value[32], + const uint8_t private_key[32]) { +#if defined(BORINGSSL_X25519_NEON) + if (CRYPTO_is_NEON_capable()) { + static const uint8_t kMongomeryBasePoint[32] = {9}; + x25519_NEON(out_public_value, private_key, kMongomeryBasePoint); + return; + } +#endif + + uint8_t e[32]; + OPENSSL_memcpy(e, private_key, 32); + e[0] &= 248; + e[31] &= 127; + e[31] |= 64; + + ge_p3 A; + x25519_ge_scalarmult_base(&A, e); + + // We only need the u-coordinate of the curve25519 point. The map is + // u=(y+1)/(1-y). Since y=Y/Z, this gives u=(Z+Y)/(Z-Y). + fe_loose zplusy, zminusy; + fe zminusy_inv; + fe_add(&zplusy, &A.Z, &A.Y); + fe_sub(&zminusy, &A.Z, &A.Y); + fe_loose_invert(&zminusy_inv, &zminusy); + fe_mul_tlt(&zminusy_inv, &zplusy, &zminusy_inv); + fe_tobytes(out_public_value, &zminusy_inv); + CONSTTIME_DECLASSIFY(out_public_value, 32); +} diff --git a/third_party/boringssl/src/crypto/curve25519/curve25519_64_adx.cc b/third_party/boringssl/src/crypto/curve25519/curve25519_64_adx.cc new file mode 100644 index 00000000..da270c22 --- /dev/null +++ b/third_party/boringssl/src/crypto/curve25519/curve25519_64_adx.cc @@ -0,0 +1,18 @@ +// Copyright 2023 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "internal.h" +#if defined(BORINGSSL_FE25519_ADX) +#include "../../third_party/fiat/curve25519_64_adx.h" +#endif diff --git a/third_party/boringssl/src/crypto/curve25519/curve25519_tables.h b/third_party/boringssl/src/crypto/curve25519/curve25519_tables.h index 310581cf..942336d0 100644 --- a/third_party/boringssl/src/crypto/curve25519/curve25519_tables.h +++ b/third_party/boringssl/src/crypto/curve25519/curve25519_tables.h @@ -1,23 +1,23 @@ -/* Copyright (c) 2020, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ +// Copyright 2020 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
// This file is generated from // ./make_curve25519_tables.py > curve25519_tables.h -static const fe d = {{ -#if defined(BORINGSSL_CURVE25519_64BIT) +static const bssl::fe d = {{ +#if defined(OPENSSL_64_BIT) 929955233495203, 466365720129213, 1662059464998953, 2033849074728123, 1442794654840575 #else @@ -26,8 +26,8 @@ static const fe d = {{ #endif }}; -static const fe sqrtm1 = {{ -#if defined(BORINGSSL_CURVE25519_64BIT) +static const bssl::fe sqrtm1 = {{ +#if defined(OPENSSL_64_BIT) 1718705420411056, 234908883556509, 2233514472574048, 2117202627021982, 765476049583133 #else @@ -36,8 +36,8 @@ static const fe sqrtm1 = {{ #endif }}; -static const fe d2 = {{ -#if defined(BORINGSSL_CURVE25519_64BIT) +static const bssl::fe d2 = {{ +#if defined(OPENSSL_64_BIT) 1859910466990425, 932731440258426, 1072319116312658, 1815898335770999, 633789495995903 #else @@ -142,7493 +142,2885 @@ static const uint8_t k25519SmallPrecomp[15 * 2 * 32] = { #else // k25519Precomp[i][j] = (j+1)*256^i*B -static const ge_precomp k25519Precomp[32][8] = { +const uint8_t bssl::k25519Precomp[32][8][3][32] = { { { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1288382639258501, 245678601348599, 269427782077623, - 1462984067271730, 137412439391563 -#else - 25967493, 19198397, 29566455, 3660896, 54414519, 4014786, - 27544626, 21800161, 61029707, 2047604 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 62697248952638, 204681361388450, 631292143396476, - 338455783676468, 1213667448819585 -#else - 54563134, 934261, 64385954, 3049989, 66381436, 9406985, - 12720692, 5043384, 19500929, 18085054 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 301289933810280, 1259582250014073, 1422107436869536, - 796239922652654, 1953934009299142 -#else - 58370664, 4489569, 9688441, 18769238, 10184608, 21191052, - 29287918, 11864899, 42594502, 29115885 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1380971894829527, 790832306631236, 2067202295274102, - 1995808275510000, 
1566530869037010 -#else - 54292951, 20578084, 45527620, 11784319, 41753206, 30803714, - 55390960, 29739860, 66750418, 23343128 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 463307831301544, 432984605774163, 1610641361907204, - 750899048855000, 1894842303421586 -#else - 45405608, 6903824, 27185491, 6451973, 37531140, 24000426, - 51492312, 11189267, 40279186, 28235350 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 748439484463711, 1033211726465151, 1396005112841647, - 1611506220286469, 1972177495910992 -#else - 26966623, 11152617, 32442495, 15396054, 14353839, 20802097, - 63980037, 24013313, 51636816, 29387734 -#endif - }}, + {0x85, 0x3b, 0x8c, 0xf5, 0xc6, 0x93, 0xbc, 0x2f, 0x19, 0xe, 0x8c, + 0xfb, 0xc6, 0x2d, 0x93, 0xcf, 0xc2, 0x42, 0x3d, 0x64, 0x98, 0x48, + 0xb, 0x27, 0x65, 0xba, 0xd4, 0x33, 0x3a, 0x9d, 0xcf, 0x7}, + {0x3e, 0x91, 0x40, 0xd7, 0x5, 0x39, 0x10, 0x9d, 0xb3, 0xbe, 0x40, + 0xd1, 0x5, 0x9f, 0x39, 0xfd, 0x9, 0x8a, 0x8f, 0x68, 0x34, 0x84, + 0xc1, 0xa5, 0x67, 0x12, 0xf8, 0x98, 0x92, 0x2f, 0xfd, 0x44}, + {0x68, 0xaa, 0x7a, 0x87, 0x5, 0x12, 0xc9, 0xab, 0x9e, 0xc4, 0xaa, + 0xcc, 0x23, 0xe8, 0xd9, 0x26, 0x8c, 0x59, 0x43, 0xdd, 0xcb, 0x7d, + 0x1b, 0x5a, 0xa8, 0x65, 0xc, 0x9f, 0x68, 0x7b, 0x11, 0x6f}, + }, + { + {0xd7, 0x71, 0x3c, 0x93, 0xfc, 0xe7, 0x24, 0x92, 0xb5, 0xf5, 0xf, + 0x7a, 0x96, 0x9d, 0x46, 0x9f, 0x2, 0x7, 0xd6, 0xe1, 0x65, 0x9a, + 0xa6, 0x5a, 0x2e, 0x2e, 0x7d, 0xa8, 0x3f, 0x6, 0xc, 0x59}, + {0xa8, 0xd5, 0xb4, 0x42, 0x60, 0xa5, 0x99, 0x8a, 0xf6, 0xac, 0x60, + 0x4e, 0xc, 0x81, 0x2b, 0x8f, 0xaa, 0x37, 0x6e, 0xb1, 0x6b, 0x23, + 0x9e, 0xe0, 0x55, 0x25, 0xc9, 0x69, 0xa6, 0x95, 0xb5, 0x6b}, + {0x5f, 0x7a, 0x9b, 0xa5, 0xb3, 0xa8, 0xfa, 0x43, 0x78, 0xcf, 0x9a, + 0x5d, 0xdd, 0x6b, 0xc1, 0x36, 0x31, 0x6a, 0x3d, 0xb, 0x84, 0xa0, + 0xf, 0x50, 0x73, 0xb, 0xa5, 0x3e, 0xb1, 0xf5, 0x1a, 0x70}, + }, + { + {0x30, 0x97, 0xee, 0x4c, 0xa8, 0xb0, 0x25, 0xaf, 0x8a, 0x4b, 0x86, + 0xe8, 0x30, 0x84, 0x5a, 0x2, 0x32, 0x67, 0x1, 0x9f, 0x2, 0x50, + 
0x1b, 0xc1, 0xf4, 0xf8, 0x80, 0x9a, 0x1b, 0x4e, 0x16, 0x7a}, + {0x65, 0xd2, 0xfc, 0xa4, 0xe8, 0x1f, 0x61, 0x56, 0x7d, 0xba, 0xc1, + 0xe5, 0xfd, 0x53, 0xd3, 0x3b, 0xbd, 0xd6, 0x4b, 0x21, 0x1a, 0xf3, + 0x31, 0x81, 0x62, 0xda, 0x5b, 0x55, 0x87, 0x15, 0xb9, 0x2a}, + {0x89, 0xd8, 0xd0, 0xd, 0x3f, 0x93, 0xae, 0x14, 0x62, 0xda, 0x35, + 0x1c, 0x22, 0x23, 0x94, 0x58, 0x4c, 0xdb, 0xf2, 0x8c, 0x45, 0xe5, + 0x70, 0xd1, 0xc6, 0xb4, 0xb9, 0x12, 0xaf, 0x26, 0x28, 0x5a}, + }, + { + {0x9f, 0x9, 0xfc, 0x8e, 0xb9, 0x51, 0x73, 0x28, 0x38, 0x25, 0xfd, + 0x7d, 0xf4, 0xc6, 0x65, 0x67, 0x65, 0x92, 0xa, 0xfb, 0x3d, 0x8d, + 0x34, 0xca, 0x27, 0x87, 0xe5, 0x21, 0x3, 0x91, 0xe, 0x68}, + {0xbf, 0x18, 0x68, 0x5, 0xa, 0x5, 0xfe, 0x95, 0xa9, 0xfa, 0x60, + 0x56, 0x71, 0x89, 0x7e, 0x32, 0x73, 0x50, 0xa0, 0x6, 0xcd, 0xe3, + 0xe8, 0xc3, 0x9a, 0xa4, 0x45, 0x74, 0x4c, 0x3f, 0x93, 0x27}, + {0x9, 0xff, 0x76, 0xc4, 0xe9, 0xfb, 0x13, 0x5a, 0x72, 0xc1, 0x5c, + 0x7b, 0x45, 0x39, 0x9e, 0x6e, 0x94, 0x44, 0x2b, 0x10, 0xf9, 0xdc, + 0xdb, 0x5d, 0x2b, 0x3e, 0x55, 0x63, 0xbf, 0xc, 0x9d, 0x7f}, + }, + { + {0x33, 0xbb, 0xa5, 0x8, 0x44, 0xbc, 0x12, 0xa2, 0x2, 0xed, 0x5e, + 0xc7, 0xc3, 0x48, 0x50, 0x8d, 0x44, 0xec, 0xbf, 0x5a, 0xc, 0xeb, + 0x1b, 0xdd, 0xeb, 0x6, 0xe2, 0x46, 0xf1, 0xcc, 0x45, 0x29}, + {0xba, 0xd6, 0x47, 0xa4, 0xc3, 0x82, 0x91, 0x7f, 0xb7, 0x29, 0x27, + 0x4b, 0xd1, 0x14, 0x0, 0xd5, 0x87, 0xa0, 0x64, 0xb8, 0x1c, 0xf1, + 0x3c, 0xe3, 0xf3, 0x55, 0x1b, 0xeb, 0x73, 0x7e, 0x4a, 0x15}, + {0x85, 0x82, 0x2a, 0x81, 0xf1, 0xdb, 0xbb, 0xbc, 0xfc, 0xd1, 0xbd, + 0xd0, 0x7, 0x8, 0xe, 0x27, 0x2d, 0xa7, 0xbd, 0x1b, 0xb, 0x67, + 0x1b, 0xb4, 0x9a, 0xb6, 0x3b, 0x6b, 0x69, 0xbe, 0xaa, 0x43}, + }, + { + {0x31, 0x71, 0x15, 0x77, 0xeb, 0xee, 0xc, 0x3a, 0x88, 0xaf, 0xc8, + 0x0, 0x89, 0x15, 0x27, 0x9b, 0x36, 0xa7, 0x59, 0xda, 0x68, 0xb6, + 0x65, 0x80, 0xbd, 0x38, 0xcc, 0xa2, 0xb6, 0x7b, 0xe5, 0x51}, + {0xa4, 0x8c, 0x7d, 0x7b, 0xb6, 0x6, 0x98, 0x49, 0x39, 0x27, 0xd2, + 0x27, 0x84, 0xe2, 0x5b, 0x57, 0xb9, 0x53, 0x45, 0x20, 0xe7, 
0x5c, + 0x8, 0xbb, 0x84, 0x78, 0x41, 0xae, 0x41, 0x4c, 0xb6, 0x38}, + {0x71, 0x4b, 0xea, 0x2, 0x67, 0x32, 0xac, 0x85, 0x1, 0xbb, 0xa1, + 0x41, 0x3, 0xe0, 0x70, 0xbe, 0x44, 0xc1, 0x3b, 0x8, 0x4b, 0xa2, + 0xe4, 0x53, 0xe3, 0x61, 0xd, 0x9f, 0x1a, 0xe9, 0xb8, 0x10}, + }, + { + {0xbf, 0xa3, 0x4e, 0x94, 0xd0, 0x5c, 0x1a, 0x6b, 0xd2, 0xc0, 0x9d, + 0xb3, 0x3a, 0x35, 0x70, 0x74, 0x49, 0x2e, 0x54, 0x28, 0x82, 0x52, + 0xb2, 0x71, 0x7e, 0x92, 0x3c, 0x28, 0x69, 0xea, 0x1b, 0x46}, + {0xb1, 0x21, 0x32, 0xaa, 0x9a, 0x2c, 0x6f, 0xba, 0xa7, 0x23, 0xba, + 0x3b, 0x53, 0x21, 0xa0, 0x6c, 0x3a, 0x2c, 0x19, 0x92, 0x4f, 0x76, + 0xea, 0x9d, 0xe0, 0x17, 0x53, 0x2e, 0x5d, 0xdd, 0x6e, 0x1d}, + {0xa2, 0xb3, 0xb8, 0x1, 0xc8, 0x6d, 0x83, 0xf1, 0x9a, 0xa4, 0x3e, + 0x5, 0x47, 0x5f, 0x3, 0xb3, 0xf3, 0xad, 0x77, 0x58, 0xba, 0x41, + 0x9c, 0x52, 0xa7, 0x90, 0xf, 0x6a, 0x1c, 0xbb, 0x9f, 0x7a}, + }, + { + {0x8f, 0x3e, 0xdd, 0x4, 0x66, 0x59, 0xb7, 0x59, 0x2c, 0x70, 0x88, + 0xe2, 0x77, 0x3, 0xb3, 0x6c, 0x23, 0xc3, 0xd9, 0x5e, 0x66, 0x9c, + 0x33, 0xb1, 0x2f, 0xe5, 0xbc, 0x61, 0x60, 0xe7, 0x15, 0x9}, + {0xd9, 0x34, 0x92, 0xf3, 0xed, 0x5d, 0xa7, 0xe2, 0xf9, 0x58, 0xb5, + 0xe1, 0x80, 0x76, 0x3d, 0x96, 0xfb, 0x23, 0x3c, 0x6e, 0xac, 0x41, + 0x27, 0x2c, 0xc3, 0x1, 0xe, 0x32, 0xa1, 0x24, 0x90, 0x3a}, + {0x1a, 0x91, 0xa2, 0xc9, 0xd9, 0xf5, 0xc1, 0xe7, 0xd7, 0xa7, 0xcc, + 0x8b, 0x78, 0x71, 0xa3, 0xb8, 0x32, 0x2a, 0xb6, 0xe, 0x19, 0x12, + 0x64, 0x63, 0x95, 0x4e, 0xcc, 0x2e, 0x5c, 0x7c, 0x90, 0x26}, }, + }, + { { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1601611775252272, 1720807796594148, 1132070835939856, - 1260455018889551, 2147779492816911 -#else - 15636272, 23865875, 24204772, 25642034, 616976, 16869170, - 27787599, 18782243, 28944399, 32004408 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 316559037616741, 2177824224946892, 1459442586438991, - 1461528397712656, 751590696113597 -#else - 16568933, 4717097, 55552716, 32452109, 15682895, 21747389, - 16354576, 21778470, 7689661, 11199574 -#endif 
- }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1850748884277385, 1200145853858453, 1068094770532492, - 672251375690438, 1586055907191707 -#else - 30464137, 27578307, 55329429, 17883566, 23220364, 15915852, - 7512774, 10017326, 49359771, 23634074 -#endif - }}, + {0x1d, 0x9c, 0x2f, 0x63, 0xe, 0xdd, 0xcc, 0x2e, 0x15, 0x31, 0x89, + 0x76, 0x96, 0xb6, 0xd0, 0x51, 0x58, 0x7a, 0x63, 0xa8, 0x6b, 0xb7, + 0xdf, 0x52, 0x39, 0xef, 0xe, 0xa0, 0x49, 0x7d, 0xd3, 0x6d}, + {0x5e, 0x51, 0xaa, 0x49, 0x54, 0x63, 0x5b, 0xed, 0x3a, 0x82, 0xc6, + 0xb, 0x9f, 0xc4, 0x65, 0xa8, 0xc4, 0xd1, 0x42, 0x5b, 0xe9, 0x1f, + 0xc, 0x85, 0xb9, 0x15, 0xd3, 0x3, 0x6f, 0x6d, 0xd7, 0x30}, + {0xc7, 0xe4, 0x6, 0x21, 0x17, 0x44, 0x44, 0x6c, 0x69, 0x7f, 0x8d, + 0x92, 0x80, 0xd6, 0x53, 0xfb, 0x26, 0x3f, 0x4d, 0x69, 0xa4, 0x9e, + 0x73, 0xb4, 0xb0, 0x4b, 0x86, 0x2e, 0x11, 0x97, 0xc6, 0x10}, + }, + { + {0x5, 0xc8, 0x58, 0x83, 0xa0, 0x2a, 0xa6, 0xc, 0x47, 0x42, 0x20, + 0x7a, 0xe3, 0x4a, 0x3d, 0x6a, 0xdc, 0xed, 0x11, 0x3b, 0xa6, 0xd3, + 0x64, 0x74, 0xef, 0x6, 0x8, 0x55, 0xaf, 0x9b, 0xbf, 0x3}, + {0xde, 0x5f, 0xbe, 0x7d, 0x27, 0xc4, 0x93, 0x64, 0xa2, 0x7e, 0xad, + 0x19, 0xad, 0x4f, 0x5d, 0x26, 0x90, 0x45, 0x30, 0x46, 0xc8, 0xdf, + 0x0, 0xe, 0x9, 0xfe, 0x66, 0xed, 0xab, 0x1c, 0xe6, 0x25}, + {0x4, 0x66, 0x58, 0xcc, 0x28, 0xe1, 0x13, 0x3f, 0x7e, 0x74, 0x59, + 0xb4, 0xec, 0x73, 0x58, 0x6f, 0xf5, 0x68, 0x12, 0xcc, 0xed, 0x3d, + 0xb6, 0xa0, 0x2c, 0xe2, 0x86, 0x45, 0x63, 0x78, 0x6d, 0x56}, + }, + { + {0xd0, 0x2f, 0x5a, 0xc6, 0x85, 0x42, 0x5, 0xa1, 0xc3, 0x67, 0x16, + 0xf3, 0x2a, 0x11, 0x64, 0x6c, 0x58, 0xee, 0x1a, 0x73, 0x40, 0xe2, + 0xa, 0x68, 0x2a, 0xb2, 0x93, 0x47, 0xf3, 0xa5, 0xfb, 0x14}, + {0x34, 0x8, 0xc1, 0x9c, 0x9f, 0xa4, 0x37, 0x16, 0x51, 0xc4, 0x9b, + 0xa8, 0xd5, 0x56, 0x8e, 0xbc, 0xdb, 0xd2, 0x7f, 0x7f, 0xf, 0xec, + 0xb5, 0x1c, 0xd9, 0x35, 0xcc, 0x5e, 0xca, 0x5b, 0x97, 0x33}, + {0xd4, 0xf7, 0x85, 0x69, 0x16, 0x46, 0xd7, 0x3c, 0x57, 0x0, 0xc8, + 0xc9, 0x84, 0x5e, 0x3e, 0x59, 0x1e, 0x13, 0x61, 0x7b, 0xb6, 
0xf2, + 0xc3, 0x2f, 0x6c, 0x52, 0xfc, 0x83, 0xea, 0x9c, 0x82, 0x14}, + }, + { + {0xb8, 0xec, 0x71, 0x4e, 0x2f, 0xb, 0xe7, 0x21, 0xe3, 0x77, 0xa4, + 0x40, 0xb9, 0xdd, 0x56, 0xe6, 0x80, 0x4f, 0x1d, 0xce, 0xce, 0x56, + 0x65, 0xbf, 0x7e, 0x7b, 0x5d, 0x53, 0xc4, 0x3b, 0xfc, 0x5}, + {0xc2, 0x95, 0xdd, 0x97, 0x84, 0x7b, 0x43, 0xff, 0xa7, 0xb5, 0x4e, + 0xaa, 0x30, 0x4e, 0x74, 0x6c, 0x8b, 0xe8, 0x85, 0x3c, 0x61, 0x5d, + 0xc, 0x9e, 0x73, 0x81, 0x75, 0x5f, 0x1e, 0xc7, 0xd9, 0x2f}, + {0xdd, 0xde, 0xaf, 0x52, 0xae, 0xb3, 0xb8, 0x24, 0xcf, 0x30, 0x3b, + 0xed, 0x8c, 0x63, 0x95, 0x34, 0x95, 0x81, 0xbe, 0xa9, 0x83, 0xbc, + 0xa4, 0x33, 0x4, 0x1f, 0x65, 0x5c, 0x47, 0x67, 0x37, 0x37}, + }, + { + {0x90, 0x65, 0x24, 0x14, 0xcb, 0x95, 0x40, 0x63, 0x35, 0x55, 0xc1, + 0x16, 0x40, 0x14, 0x12, 0xef, 0x60, 0xbc, 0x10, 0x89, 0xc, 0x14, + 0x38, 0x9e, 0x8c, 0x7c, 0x90, 0x30, 0x57, 0x90, 0xf5, 0x6b}, + {0xd9, 0xad, 0xd1, 0x40, 0xfd, 0x99, 0xba, 0x2f, 0x27, 0xd0, 0xf4, + 0x96, 0x6f, 0x16, 0x7, 0xb3, 0xae, 0x3b, 0xf0, 0x15, 0x52, 0xf0, + 0x63, 0x43, 0x99, 0xf9, 0x18, 0x3b, 0x6c, 0xa5, 0xbe, 0x1f}, + {0x8a, 0x5b, 0x41, 0xe1, 0xf1, 0x78, 0xa7, 0xf, 0x7e, 0xa7, 0xc3, + 0xba, 0xf7, 0x9f, 0x40, 0x6, 0x50, 0x9a, 0xa2, 0x9a, 0xb8, 0xd7, + 0x52, 0x6f, 0x56, 0x5a, 0x63, 0x7a, 0xf6, 0x1c, 0x52, 0x2}, + }, + { + {0xe4, 0x5e, 0x2f, 0x77, 0x20, 0x67, 0x14, 0xb1, 0xce, 0x9a, 0x7, + 0x96, 0xb1, 0x94, 0xf8, 0xe8, 0x4a, 0x82, 0xac, 0x0, 0x4d, 0x22, + 0xf8, 0x4a, 0xc4, 0x6c, 0xcd, 0xf7, 0xd9, 0x53, 0x17, 0x0}, + {0x94, 0x52, 0x9d, 0xa, 0xb, 0xee, 0x3f, 0x51, 0x66, 0x5a, 0xdf, + 0xf, 0x5c, 0xe7, 0x98, 0x8f, 0xce, 0x7, 0xe1, 0xbf, 0x88, 0x86, + 0x61, 0xd4, 0xed, 0x2c, 0x38, 0x71, 0x7e, 0xa, 0xa0, 0x3f}, + {0x34, 0xdb, 0x3d, 0x96, 0x2d, 0x23, 0x69, 0x3c, 0x58, 0x38, 0x97, + 0xb4, 0xda, 0x87, 0xde, 0x1d, 0x85, 0xf2, 0x91, 0xa0, 0xf9, 0xd1, + 0xd7, 0xaa, 0xb6, 0xed, 0x48, 0xa0, 0x2f, 0xfe, 0xb5, 0x12}, + }, + { + {0x92, 0x1e, 0x6f, 0xad, 0x26, 0x7c, 0x2b, 0xdf, 0x13, 0x89, 0x4b, + 0x50, 0x23, 0xd3, 0x66, 0x4b, 0xc3, 
0x8b, 0x1c, 0x75, 0xc0, 0x9d, + 0x40, 0x8c, 0xb8, 0xc7, 0x96, 0x7, 0xc2, 0x93, 0x7e, 0x6f}, + {0x4d, 0xe3, 0xfc, 0x96, 0xc4, 0xfb, 0xf0, 0x71, 0xed, 0x5b, 0xf3, + 0xad, 0x6b, 0x82, 0xb9, 0x73, 0x61, 0xc5, 0x28, 0xff, 0x61, 0x72, + 0x4, 0xd2, 0x6f, 0x20, 0xb1, 0x6f, 0xf9, 0x76, 0x9b, 0x74}, + {0x5, 0xae, 0xa6, 0xae, 0x4, 0xf6, 0x5a, 0x1f, 0x99, 0x9c, 0xe4, + 0xbe, 0xf1, 0x51, 0x23, 0xc1, 0x66, 0x6b, 0xff, 0xee, 0xb5, 0x8, + 0xa8, 0x61, 0x51, 0x21, 0xe0, 0x1, 0xf, 0xc1, 0xce, 0xf}, + }, + { + {0x45, 0x4e, 0x24, 0xc4, 0x9d, 0xd2, 0xf2, 0x3d, 0xa, 0xde, 0xd8, + 0x93, 0x74, 0xe, 0x2, 0x2b, 0x4d, 0x21, 0xc, 0x82, 0x7e, 0x6, + 0xc8, 0x6c, 0xa, 0xb9, 0xea, 0x6f, 0x16, 0x79, 0x37, 0x41}, + {0x44, 0x1e, 0xfe, 0x49, 0xa6, 0x58, 0x4d, 0x64, 0x7e, 0x77, 0xad, + 0x31, 0xa2, 0xae, 0xfc, 0x21, 0xd2, 0xd0, 0x7f, 0x88, 0x5a, 0x1c, + 0x44, 0x2, 0xf3, 0x11, 0xc5, 0x83, 0x71, 0xaa, 0x1, 0x49}, + {0xf0, 0xf8, 0x1a, 0x8c, 0x54, 0xb7, 0xb1, 0x8, 0xb4, 0x99, 0x62, + 0x24, 0x7c, 0x7a, 0xf, 0xce, 0x39, 0xd9, 0x6, 0x1e, 0xf9, 0xb0, + 0x60, 0xf7, 0x13, 0x12, 0x6d, 0x72, 0x7b, 0x88, 0xbb, 0x41}, }, + }, + { { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 934282339813791, 1846903124198670, 1172395437954843, - 1007037127761661, 1830588347719256 -#else - 50071967, 13921891, 10945806, 27521001, 27105051, 17470053, - 38182653, 15006022, 3284568, 27277892 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1694390458783935, 1735906047636159, 705069562067493, - 648033061693059, 696214010414170 -#else - 23599295, 25248385, 55915199, 25867015, 13236773, 10506355, - 7464579, 9656445, 13059162, 10374397 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1121406372216585, 192876649532226, 190294192191717, - 1994165897297032, 2245000007398739 -#else - 7798537, 16710257, 3033922, 2874086, 28997861, 2835604, - 32406664, 29715387, 66467155, 33453106 -#endif - }}, + {0xae, 0x91, 0x66, 0x7c, 0x59, 0x4c, 0x23, 0x7e, 0xc8, 0xb4, 0x85, + 0xa, 0x3d, 0x9d, 0x88, 0x64, 0xe7, 0xfa, 0x4a, 0x35, 
0xc, 0xc9, + 0xe2, 0xda, 0x1d, 0x9e, 0x6a, 0xc, 0x7, 0x1e, 0x87, 0xa}, + {0xbe, 0x46, 0x43, 0x74, 0x44, 0x7d, 0xe8, 0x40, 0x25, 0x2b, 0xb5, + 0x15, 0xd4, 0xda, 0x48, 0x1d, 0x3e, 0x60, 0x3b, 0xa1, 0x18, 0x8a, + 0x3a, 0x7c, 0xf7, 0xbd, 0xcd, 0x2f, 0xc1, 0x28, 0xb7, 0x4e}, + {0x89, 0x89, 0xbc, 0x4b, 0x99, 0xb5, 0x1, 0x33, 0x60, 0x42, 0xdd, + 0x5b, 0x3a, 0xae, 0x6b, 0x73, 0x3c, 0x9e, 0xd5, 0x19, 0xe2, 0xad, + 0x61, 0xd, 0x64, 0xd4, 0x85, 0x26, 0xf, 0x30, 0xe7, 0x3e}, + }, + { + {0x18, 0x75, 0x1e, 0x84, 0x47, 0x79, 0xfa, 0x43, 0xd7, 0x46, 0x9c, + 0x63, 0x59, 0xfa, 0xc6, 0xe5, 0x74, 0x2b, 0x5, 0xe3, 0x1d, 0x5e, + 0x6, 0xa1, 0x30, 0x90, 0xb8, 0xcf, 0xa2, 0xc6, 0x47, 0x7d}, + {0xb7, 0xd6, 0x7d, 0x9e, 0xe4, 0x55, 0xd2, 0xf5, 0xac, 0x1e, 0xb, + 0x61, 0x5c, 0x11, 0x16, 0x80, 0xca, 0x87, 0xe1, 0x92, 0x5d, 0x97, + 0x99, 0x3c, 0xc2, 0x25, 0x91, 0x97, 0x62, 0x57, 0x81, 0x13}, + {0xe0, 0xd6, 0xf0, 0x8e, 0x14, 0xd0, 0xda, 0x3f, 0x3c, 0x6f, 0x54, + 0x91, 0x9a, 0x74, 0x3e, 0x9d, 0x57, 0x81, 0xbb, 0x26, 0x10, 0x62, + 0xec, 0x71, 0x80, 0xec, 0xc9, 0x34, 0x8d, 0xf5, 0x8c, 0x14}, + }, + { + {0x6d, 0x75, 0xe4, 0x9a, 0x7d, 0x2f, 0x57, 0xe2, 0x7f, 0x48, 0xf3, + 0x88, 0xbb, 0x45, 0xc3, 0x56, 0x8d, 0xa8, 0x60, 0x69, 0x6d, 0xb, + 0xd1, 0x9f, 0xb9, 0xa1, 0xae, 0x4e, 0xad, 0xeb, 0x8f, 0x27}, + {0x27, 0xf0, 0x34, 0x79, 0xf6, 0x92, 0xa4, 0x46, 0xa9, 0xa, 0x84, + 0xf6, 0xbe, 0x84, 0x99, 0x46, 0x54, 0x18, 0x61, 0x89, 0x2a, 0xbc, + 0xa1, 0x5c, 0xd4, 0xbb, 0x5d, 0xbd, 0x1e, 0xfa, 0xf2, 0x3f}, + {0x66, 0x39, 0x93, 0x8c, 0x1f, 0x68, 0xaa, 0xb1, 0x98, 0xc, 0x29, + 0x20, 0x9c, 0x94, 0x21, 0x8c, 0x52, 0x3c, 0x9d, 0x21, 0x91, 0x52, + 0x11, 0x39, 0x7b, 0x67, 0x9c, 0xfe, 0x2, 0xdd, 0x4, 0x41}, + }, + { + {0xb8, 0x6a, 0x9, 0xdb, 0x6, 0x4e, 0x21, 0x81, 0x35, 0x4f, 0xe4, + 0xc, 0xc9, 0xb6, 0xa8, 0x21, 0xf5, 0x2a, 0x9e, 0x40, 0x2a, 0xc1, + 0x24, 0x65, 0x81, 0xa4, 0xfc, 0x8e, 0xa4, 0xb5, 0x65, 0x1}, + {0x2a, 0x42, 0x24, 0x11, 0x5e, 0xbf, 0xb2, 0x72, 0xb5, 0x3a, 0xa3, + 0x98, 0x33, 0xc, 0xfa, 0xa1, 0x66, 0xb6, 
0x52, 0xfa, 0x1, 0x61, + 0xcb, 0x94, 0xd5, 0x53, 0xaf, 0xaf, 0x0, 0x3b, 0x86, 0x2c}, + {0x76, 0x6a, 0x84, 0xa0, 0x74, 0xa4, 0x90, 0xf1, 0xc0, 0x7c, 0x2f, + 0xcd, 0x84, 0xf9, 0xef, 0x12, 0x8f, 0x2b, 0xaa, 0x58, 0x6, 0x29, + 0x5e, 0x69, 0xb8, 0xc8, 0xfe, 0xbf, 0xd9, 0x67, 0x1b, 0x59}, + }, + { + {0x5d, 0xb5, 0x18, 0x9f, 0x71, 0xb3, 0xb9, 0x99, 0x1e, 0x64, 0x8c, + 0xa1, 0xfa, 0xe5, 0x65, 0xe4, 0xed, 0x5, 0x9f, 0xc2, 0x36, 0x11, + 0x8, 0x61, 0x8b, 0x12, 0x30, 0x70, 0x86, 0x4f, 0x9b, 0x48}, + {0xfa, 0x9b, 0xb4, 0x80, 0x1c, 0xd, 0x2f, 0x31, 0x8a, 0xec, 0xf3, + 0xab, 0x5e, 0x51, 0x79, 0x59, 0x88, 0x1c, 0xf0, 0x9e, 0xc0, 0x33, + 0x70, 0x72, 0xcb, 0x7b, 0x8f, 0xca, 0xc7, 0x2e, 0xe0, 0x3d}, + {0xef, 0x92, 0xeb, 0x3a, 0x2d, 0x10, 0x32, 0xd2, 0x61, 0xa8, 0x16, + 0x61, 0xb4, 0x53, 0x62, 0xe1, 0x24, 0xaa, 0xb, 0x19, 0xe7, 0xab, + 0x7e, 0x3d, 0xbf, 0xbe, 0x6c, 0x49, 0xba, 0xfb, 0xf5, 0x49}, + }, + { + {0x2e, 0x57, 0x9c, 0x1e, 0x8c, 0x62, 0x5d, 0x15, 0x41, 0x47, 0x88, + 0xc5, 0xac, 0x86, 0x4d, 0x8a, 0xeb, 0x63, 0x57, 0x51, 0xf6, 0x52, + 0xa3, 0x91, 0x5b, 0x51, 0x67, 0x88, 0xc2, 0xa6, 0xa1, 0x6}, + {0xd4, 0xcf, 0x5b, 0x8a, 0x10, 0x9a, 0x94, 0x30, 0xeb, 0x73, 0x64, + 0xbc, 0x70, 0xdd, 0x40, 0xdc, 0x1c, 0xd, 0x7c, 0x30, 0xc1, 0x94, + 0xc2, 0x92, 0x74, 0x6e, 0xfa, 0xcb, 0x6d, 0xa8, 0x4, 0x56}, + {0xb6, 0x64, 0x17, 0x7c, 0xd4, 0xd1, 0x88, 0x72, 0x51, 0x8b, 0x41, + 0xe0, 0x40, 0x11, 0x54, 0x72, 0xd1, 0xf6, 0xac, 0x18, 0x60, 0x1a, + 0x3, 0x9f, 0xc6, 0x42, 0x27, 0xfe, 0x89, 0x9e, 0x98, 0x20}, + }, + { + {0x2e, 0xec, 0xea, 0x85, 0x8b, 0x27, 0x74, 0x16, 0xdf, 0x2b, 0xcb, + 0x7a, 0x7, 0xdc, 0x21, 0x56, 0x5a, 0xf4, 0xcb, 0x61, 0x16, 0x4c, + 0xa, 0x64, 0xd3, 0x95, 0x5, 0xf7, 0x50, 0x99, 0xb, 0x73}, + {0x7f, 0xcc, 0x2d, 0x3a, 0xfd, 0x77, 0x97, 0x49, 0x92, 0xd8, 0x4f, + 0xa5, 0x2c, 0x7c, 0x85, 0x32, 0xa0, 0xe3, 0x7, 0xd2, 0x64, 0xd8, + 0x79, 0xa2, 0x29, 0x7e, 0xa6, 0xc, 0x1d, 0xed, 0x3, 0x4}, + {0x52, 0xc5, 0x4e, 0x87, 0x35, 0x2d, 0x4b, 0xc9, 0x8d, 0x6f, 0x24, + 0x98, 0xcf, 0xc8, 0xe6, 0xc5, 
0xce, 0x35, 0xc0, 0x16, 0xfa, 0x46, + 0xcb, 0xf7, 0xcc, 0x3d, 0x30, 0x8, 0x43, 0x45, 0xd7, 0x5b}, + }, + { + {0x2a, 0x79, 0xe7, 0x15, 0x21, 0x93, 0xc4, 0x85, 0xc9, 0xdd, 0xcd, + 0xbd, 0xa2, 0x89, 0x4c, 0xc6, 0x62, 0xd7, 0xa3, 0xad, 0xa8, 0x3d, + 0x1e, 0x9d, 0x2c, 0xf8, 0x67, 0x30, 0x12, 0xdb, 0xb7, 0x5b}, + {0xc2, 0x4c, 0xb2, 0x28, 0x95, 0xd1, 0x9a, 0x7f, 0x81, 0xc1, 0x35, + 0x63, 0x65, 0x54, 0x6b, 0x7f, 0x36, 0x72, 0xc0, 0x4f, 0x6e, 0xb6, + 0xb8, 0x66, 0x83, 0xad, 0x80, 0x73, 0x0, 0x78, 0x3a, 0x13}, + {0xbe, 0x62, 0xca, 0xc6, 0x67, 0xf4, 0x61, 0x9, 0xee, 0x52, 0x19, + 0x21, 0xd6, 0x21, 0xec, 0x4, 0x70, 0x47, 0xd5, 0x9b, 0x77, 0x60, + 0x23, 0x18, 0xd2, 0xe0, 0xf0, 0x58, 0x6d, 0xca, 0xd, 0x74}, }, + }, + { { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 769950342298419, 132954430919746, 844085933195555, - 974092374476333, 726076285546016 -#else - 10861363, 11473154, 27284546, 1981175, 37044515, 12577860, - 32867885, 14515107, 51670560, 10819379 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 425251763115706, 608463272472562, 442562545713235, - 837766094556764, 374555092627893 -#else - 4708026, 6336745, 20377586, 9066809, 55836755, 6594695, - 41455196, 12483687, 54440373, 5581305 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1086255230780037, 274979815921559, 1960002765731872, - 929474102396301, 1190409889297339 -#else - 19563141, 16186464, 37722007, 4097518, 10237984, 29206317, - 28542349, 13850243, 43430843, 17738489 -#endif - }}, + {0x3c, 0x43, 0x78, 0x4, 0x57, 0x8c, 0x1a, 0x23, 0x9d, 0x43, 0x81, + 0xc2, 0xe, 0x27, 0xb5, 0xb7, 0x9f, 0x7, 0xd9, 0xe3, 0xea, 0x99, + 0xaa, 0xdb, 0xd9, 0x3, 0x2b, 0x6c, 0x25, 0xf5, 0x3, 0x2c}, + {0x4e, 0xce, 0xcf, 0x52, 0x7, 0xee, 0x48, 0xdf, 0xb7, 0x8, 0xec, + 0x6, 0xf3, 0xfa, 0xff, 0xc3, 0xc4, 0x59, 0x54, 0xb9, 0x2a, 0xb, + 0x71, 0x5, 0x8d, 0xa3, 0x3e, 0x96, 0xfa, 0x25, 0x1d, 0x16}, + {0x7d, 0xa4, 0x53, 0x7b, 0x75, 0x18, 0xf, 0x79, 0x79, 0x58, 0xc, + 0xcf, 0x30, 0x1, 0x7b, 0x30, 0xf9, 0xf7, 0x7e, 0x25, 
0x77, 0x3d, + 0x90, 0x31, 0xaf, 0xbb, 0x96, 0xbd, 0xbd, 0x68, 0x94, 0x69}, + }, + { + {0x48, 0x19, 0xa9, 0x6a, 0xe6, 0x3d, 0xdd, 0xd8, 0xcc, 0xd2, 0xc0, + 0x2f, 0xc2, 0x64, 0x50, 0x48, 0x2f, 0xea, 0xfd, 0x34, 0x66, 0x24, + 0x48, 0x9b, 0x3a, 0x2e, 0x4a, 0x6c, 0x4e, 0x1c, 0x3e, 0x29}, + {0xcf, 0xfe, 0xda, 0xf4, 0x46, 0x2f, 0x1f, 0xbd, 0xf7, 0xd6, 0x7f, + 0xa4, 0x14, 0x1, 0xef, 0x7c, 0x7f, 0xb3, 0x47, 0x4a, 0xda, 0xfd, + 0x1f, 0xd3, 0x85, 0x57, 0x90, 0x73, 0xa4, 0x19, 0x52, 0x52}, + {0xe1, 0x12, 0x51, 0x92, 0x4b, 0x13, 0x6e, 0x37, 0xa0, 0x5d, 0xa1, + 0xdc, 0xb5, 0x78, 0x37, 0x70, 0x11, 0x31, 0x1c, 0x46, 0xaf, 0x89, + 0x45, 0xb0, 0x23, 0x28, 0x3, 0x7f, 0x44, 0x5c, 0x60, 0x5b}, + }, + { + {0x4c, 0xf0, 0xe7, 0xf0, 0xc6, 0xfe, 0xe9, 0x3b, 0x62, 0x49, 0xe3, + 0x75, 0x9e, 0x57, 0x6a, 0x86, 0x1a, 0xe6, 0x1d, 0x1e, 0x16, 0xef, + 0x42, 0x55, 0xd5, 0xbd, 0x5a, 0xcc, 0xf4, 0xfe, 0x12, 0x2f}, + {0x89, 0x7c, 0xc4, 0x20, 0x59, 0x80, 0x65, 0xb9, 0xcc, 0x8f, 0x3b, + 0x92, 0xc, 0x10, 0xf0, 0xe7, 0x77, 0xef, 0xe2, 0x2, 0x65, 0x25, + 0x1, 0x0, 0xee, 0xb3, 0xae, 0xa8, 0xce, 0x6d, 0xa7, 0x24}, + {0x40, 0xc7, 0xc0, 0xdf, 0xb2, 0x22, 0x45, 0xa, 0x7, 0xa4, 0xc9, + 0x40, 0x7f, 0x6e, 0xd0, 0x10, 0x68, 0xf6, 0xcf, 0x78, 0x41, 0x14, + 0xcf, 0xc6, 0x90, 0x37, 0xa4, 0x18, 0x25, 0x7b, 0x60, 0x5e}, + }, + { + {0x14, 0xcf, 0x96, 0xa5, 0x1c, 0x43, 0x2c, 0xa0, 0x0, 0xe4, 0xd3, + 0xae, 0x40, 0x2d, 0xc4, 0xe3, 0xdb, 0x26, 0xf, 0x2e, 0x80, 0x26, + 0x45, 0xd2, 0x68, 0x70, 0x45, 0x9e, 0x13, 0x33, 0x1f, 0x20}, + {0x18, 0x18, 0xdf, 0x6c, 0x8f, 0x1d, 0xb3, 0x58, 0xa2, 0x58, 0x62, + 0xc3, 0x4f, 0xa7, 0xcf, 0x35, 0x6e, 0x1d, 0xe6, 0x66, 0x4f, 0xff, + 0xb3, 0xe1, 0xf7, 0xd5, 0xcd, 0x6c, 0xab, 0xac, 0x67, 0x50}, + {0x51, 0x9d, 0x3, 0x8, 0x6b, 0x7f, 0x52, 0xfd, 0x6, 0x0, 0x7c, + 0x1, 0x64, 0x49, 0xb1, 0x18, 0xa8, 0xa4, 0x25, 0x2e, 0xb0, 0xe, + 0x22, 0xd5, 0x75, 0x3, 0x46, 0x62, 0x88, 0xba, 0x7c, 0x39}, + }, + { + {0xe7, 0x79, 0x13, 0xc8, 0xfb, 0xc3, 0x15, 0x78, 0xf1, 0x2a, 0xe1, + 0xdd, 0x20, 0x94, 0x61, 0xa6, 
0xd5, 0xfd, 0xa8, 0x85, 0xf8, 0xc0, + 0xa9, 0xff, 0x52, 0xc2, 0xe1, 0xc1, 0x22, 0x40, 0x1b, 0x77}, + {0xb2, 0x59, 0x59, 0xf0, 0x93, 0x30, 0xc1, 0x30, 0x76, 0x79, 0xa9, + 0xe9, 0x8d, 0xa1, 0x3a, 0xe2, 0x26, 0x5e, 0x1d, 0x72, 0x91, 0xd4, + 0x2f, 0x22, 0x3a, 0x6c, 0x6e, 0x76, 0x20, 0xd3, 0x39, 0x23}, + {0xa7, 0x2f, 0x3a, 0x51, 0x86, 0xd9, 0x7d, 0xd8, 0x8, 0xcf, 0xd4, + 0xf9, 0x71, 0x9b, 0xac, 0xf5, 0xb3, 0x83, 0xa2, 0x1e, 0x1b, 0xc3, + 0x6b, 0xd0, 0x76, 0x1a, 0x97, 0x19, 0x92, 0x18, 0x1a, 0x33}, + }, + { + {0xaf, 0x72, 0x75, 0x9d, 0x3a, 0x2f, 0x51, 0x26, 0x9e, 0x4a, 0x7, + 0x68, 0x88, 0xe2, 0xcb, 0x5b, 0xc4, 0xf7, 0x80, 0x11, 0xc1, 0xc1, + 0xed, 0x84, 0x7b, 0xa6, 0x49, 0xf6, 0x9f, 0x61, 0xc9, 0x1a}, + {0xc6, 0x80, 0x4f, 0xfb, 0x45, 0x6f, 0x16, 0xf5, 0xcf, 0x75, 0xc7, + 0x61, 0xde, 0xc7, 0x36, 0x9c, 0x1c, 0xd9, 0x41, 0x90, 0x1b, 0xe8, + 0xd4, 0xe3, 0x21, 0xfe, 0xbd, 0x83, 0x6b, 0x7c, 0x16, 0x31}, + {0x68, 0x10, 0x4b, 0x52, 0x42, 0x38, 0x2b, 0xf2, 0x87, 0xe9, 0x9c, + 0xee, 0x3b, 0x34, 0x68, 0x50, 0xc8, 0x50, 0x62, 0x4a, 0x84, 0x71, + 0x9d, 0xfc, 0x11, 0xb1, 0x8, 0x1f, 0x34, 0x36, 0x24, 0x61}, + }, + { + {0x38, 0x26, 0x2d, 0x1a, 0xe3, 0x49, 0x63, 0x8b, 0x35, 0xfd, 0xd3, + 0x9b, 0x0, 0xb7, 0xdf, 0x9d, 0xa4, 0x6b, 0xa0, 0xa3, 0xb8, 0xf1, + 0x8b, 0x7f, 0x45, 0x4, 0xd9, 0x78, 0x31, 0xaa, 0x22, 0x15}, + {0x8d, 0x89, 0x4e, 0x87, 0xdb, 0x41, 0x9d, 0xd9, 0x20, 0xdc, 0x7, + 0x6c, 0xf1, 0xa5, 0xfe, 0x9, 0xbc, 0x9b, 0xf, 0xd0, 0x67, 0x2c, + 0x3d, 0x79, 0x40, 0xff, 0x5e, 0x9e, 0x30, 0xe2, 0xeb, 0x46}, + {0x38, 0x49, 0x61, 0x69, 0x53, 0x2f, 0x38, 0x2c, 0x10, 0x6d, 0x2d, + 0xb7, 0x9a, 0x40, 0xfe, 0xda, 0x27, 0xf2, 0x46, 0xb6, 0x91, 0x33, + 0xc8, 0xe8, 0x6c, 0x30, 0x24, 0x5, 0xf5, 0x70, 0xfe, 0x45}, + }, + { + {0x91, 0x14, 0x95, 0xc8, 0x20, 0x49, 0xf2, 0x62, 0xa2, 0xc, 0x63, + 0x3f, 0xc8, 0x7, 0xf0, 0x5, 0xb8, 0xd4, 0xc9, 0xf5, 0xd2, 0x45, + 0xbb, 0x6f, 0x45, 0x22, 0x7a, 0xb5, 0x6d, 0x9f, 0x61, 0x16}, + {0x8c, 0xb, 0xc, 0x96, 0xa6, 0x75, 0x48, 0xda, 0x20, 0x2f, 0xe, + 0xef, 0x76, 0xd0, 
0x68, 0x5b, 0xd4, 0x8f, 0xb, 0x3d, 0xcf, 0x51, + 0xfb, 0x7, 0xd4, 0x92, 0xe3, 0xa0, 0x23, 0x16, 0x8d, 0x42}, + {0xfd, 0x8, 0xa3, 0x1, 0x44, 0x4a, 0x4f, 0x8, 0xac, 0xca, 0xa5, + 0x76, 0xc3, 0x19, 0x22, 0xa8, 0x7d, 0xbc, 0xd1, 0x43, 0x46, 0xde, + 0xb8, 0xde, 0xc6, 0x38, 0xbd, 0x60, 0x2d, 0x59, 0x81, 0x1d}, }, + }, + { { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1388594989461809, 316767091099457, 394298842192982, - 1230079486801005, 1440737038838979 -#else - 51736881, 20691677, 32573249, 4720197, 40672342, 5875510, - 47920237, 18329612, 57289923, 21468654 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 7380825640100, 146210432690483, 304903576448906, - 1198869323871120, 997689833219095 -#else - 58559652, 109982, 15149363, 2178705, 22900618, 4543417, 3044240, - 17864545, 1762327, 14866737 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1181317918772081, 114573476638901, 262805072233344, - 265712217171332, 294181933805782 -#else - 48909169, 17603008, 56635573, 1707277, 49922944, 3916100, - 38872452, 3959420, 27914454, 4383652 -#endif - }}, + {0xe8, 0xc5, 0x85, 0x7b, 0x9f, 0xb6, 0x65, 0x87, 0xb2, 0xba, 0x68, + 0xd1, 0x8b, 0x67, 0xf0, 0x6f, 0x9b, 0xf, 0x33, 0x1d, 0x7c, 0xe7, + 0x70, 0x3a, 0x7c, 0x8e, 0xaf, 0xb0, 0x51, 0x6d, 0x5f, 0x3a}, + {0x5f, 0xac, 0xd, 0xa6, 0x56, 0x87, 0x36, 0x61, 0x57, 0xdc, 0xab, + 0xeb, 0x6a, 0x2f, 0xe0, 0x17, 0x7d, 0xf, 0xce, 0x4c, 0x2d, 0x3f, + 0x19, 0x7f, 0xf0, 0xdc, 0xec, 0x89, 0x77, 0x4a, 0x23, 0x20}, + {0x52, 0xb2, 0x78, 0x71, 0xb6, 0xd, 0xd2, 0x76, 0x60, 0xd1, 0x1e, + 0xd5, 0xf9, 0x34, 0x1c, 0x7, 0x70, 0x11, 0xe4, 0xb3, 0x20, 0x4a, + 0x2a, 0xf6, 0x66, 0xe3, 0xff, 0x3c, 0x35, 0x82, 0xd6, 0x7c}, + }, + { + {0xf3, 0xf4, 0xac, 0x68, 0x60, 0xcd, 0x65, 0xa6, 0xd3, 0xe3, 0xd7, + 0x3c, 0x18, 0x2d, 0xd9, 0x42, 0xd9, 0x25, 0x60, 0x33, 0x9d, 0x38, + 0x59, 0x57, 0xff, 0xd8, 0x2c, 0x2b, 0x3b, 0x25, 0xf0, 0x3e}, + {0xb6, 0xfa, 0x87, 0xd8, 0x5b, 0xa4, 0xe1, 0xb, 0x6e, 0x3b, 0x40, + 0xba, 0x32, 0x6a, 0x84, 0x2a, 0x0, 0x60, 
0x6e, 0xe9, 0x12, 0x10, + 0x92, 0xd9, 0x43, 0x9, 0xdc, 0x3b, 0x86, 0xc8, 0x38, 0x28}, + {0x30, 0x50, 0x46, 0x4a, 0xcf, 0xb0, 0x6b, 0xd1, 0xab, 0x77, 0xc5, + 0x15, 0x41, 0x6b, 0x49, 0xfa, 0x9d, 0x41, 0xab, 0xf4, 0x8a, 0xae, + 0xcf, 0x82, 0x12, 0x28, 0xa8, 0x6, 0xa6, 0xb8, 0xdc, 0x21}, + }, + { + {0xba, 0x31, 0x77, 0xbe, 0xfa, 0x0, 0x8d, 0x9a, 0x89, 0x18, 0x9e, + 0x62, 0x7e, 0x60, 0x3, 0x82, 0x7f, 0xd9, 0xf3, 0x43, 0x37, 0x2, + 0xcc, 0xb2, 0x8b, 0x67, 0x6f, 0x6c, 0xbf, 0xd, 0x84, 0x5d}, + {0xc8, 0x9f, 0x9d, 0x8c, 0x46, 0x4, 0x60, 0x5c, 0xcb, 0xa3, 0x2a, + 0xd4, 0x6e, 0x9, 0x40, 0x25, 0x9c, 0x2f, 0xee, 0x12, 0x4c, 0x4d, + 0x5b, 0x12, 0xab, 0x1d, 0xa3, 0x94, 0x81, 0xd0, 0xc3, 0xb}, + {0x8b, 0xe1, 0x9f, 0x30, 0xd, 0x38, 0x6e, 0x70, 0xc7, 0x65, 0xe1, + 0xb9, 0xa6, 0x2d, 0xb0, 0x6e, 0xab, 0x20, 0xae, 0x7d, 0x99, 0xba, + 0xbb, 0x57, 0xdd, 0x96, 0xc1, 0x2a, 0x23, 0x76, 0x42, 0x3a}, + }, + { + {0xcb, 0x7e, 0x44, 0xdb, 0x72, 0xc1, 0xf8, 0x3b, 0xbd, 0x2d, 0x28, + 0xc6, 0x1f, 0xc4, 0xcf, 0x5f, 0xfe, 0x15, 0xaa, 0x75, 0xc0, 0xff, + 0xac, 0x80, 0xf9, 0xa9, 0xe1, 0x24, 0xe8, 0xc9, 0x70, 0x7}, + {0xfa, 0x84, 0x70, 0x8a, 0x2c, 0x43, 0x42, 0x4b, 0x45, 0xe5, 0xb9, + 0xdf, 0xe3, 0x19, 0x8a, 0x89, 0x5d, 0xe4, 0x58, 0x9c, 0x21, 0x0, + 0x9f, 0xbe, 0xd1, 0xeb, 0x6d, 0xa1, 0xce, 0x77, 0xf1, 0x1f}, + {0xfd, 0xb5, 0xb5, 0x45, 0x9a, 0xd9, 0x61, 0xcf, 0x24, 0x79, 0x3a, + 0x1b, 0xe9, 0x84, 0x9, 0x86, 0x89, 0x3e, 0x3e, 0x30, 0x19, 0x9, + 0x30, 0xe7, 0x1e, 0xb, 0x50, 0x41, 0xfd, 0x64, 0xf2, 0x39}, + }, + { + {0xe1, 0x7b, 0x9, 0xfe, 0xab, 0x4a, 0x9b, 0xd1, 0x29, 0x19, 0xe0, + 0xdf, 0xe1, 0xfc, 0x6d, 0xa4, 0xff, 0xf1, 0xa6, 0x2c, 0x94, 0x8, + 0xc9, 0xc3, 0x4e, 0xf1, 0x35, 0x2c, 0x27, 0x21, 0xc6, 0x65}, + {0x9c, 0xe2, 0xe7, 0xdb, 0x17, 0x34, 0xad, 0xa7, 0x9c, 0x13, 0x9c, + 0x2b, 0x6a, 0x37, 0x94, 0xbd, 0xa9, 0x7b, 0x59, 0x93, 0x8e, 0x1b, + 0xe9, 0xa0, 0x40, 0x98, 0x88, 0x68, 0x34, 0xd7, 0x12, 0x17}, + {0xdd, 0x93, 0x31, 0xce, 0xf8, 0x89, 0x2b, 0xe7, 0xbb, 0xc0, 0x25, + 0xa1, 0x56, 0x33, 0x10, 0x4d, 
0x83, 0xfe, 0x1c, 0x2e, 0x3d, 0xa9, + 0x19, 0x4, 0x72, 0xe2, 0x9c, 0xb1, 0xa, 0x80, 0xf9, 0x22}, + }, + { + {0xac, 0xfd, 0x6e, 0x9a, 0xdd, 0x9f, 0x2, 0x42, 0x41, 0x49, 0xa5, + 0x34, 0xbe, 0xce, 0x12, 0xb9, 0x7b, 0xf3, 0xbd, 0x87, 0xb9, 0x64, + 0xf, 0x64, 0xb4, 0xca, 0x98, 0x85, 0xd3, 0xa4, 0x71, 0x41}, + {0xcb, 0xf8, 0x9e, 0x3e, 0x8a, 0x36, 0x5a, 0x60, 0x15, 0x47, 0x50, + 0xa5, 0x22, 0xc0, 0xe9, 0xe3, 0x8f, 0x24, 0x24, 0x5f, 0xb0, 0x48, + 0x3d, 0x55, 0xe5, 0x26, 0x76, 0x64, 0xcd, 0x16, 0xf4, 0x13}, + {0x8c, 0x4c, 0xc9, 0x99, 0xaa, 0x58, 0x27, 0xfa, 0x7, 0xb8, 0x0, + 0xb0, 0x6f, 0x6f, 0x0, 0x23, 0x92, 0x53, 0xda, 0xad, 0xdd, 0x91, + 0xd2, 0xfb, 0xab, 0xd1, 0x4b, 0x57, 0xfa, 0x14, 0x82, 0x50}, + }, + { + {0xd6, 0x3, 0xd0, 0x53, 0xbb, 0x15, 0x1a, 0x46, 0x65, 0xc9, 0xf3, + 0xbc, 0x88, 0x28, 0x10, 0xb2, 0x5a, 0x3a, 0x68, 0x6c, 0x75, 0x76, + 0xc5, 0x27, 0x47, 0xb4, 0x6c, 0xc8, 0xa4, 0x58, 0x77, 0x3a}, + {0x4b, 0xfe, 0xd6, 0x3e, 0x15, 0x69, 0x2, 0xc2, 0xc4, 0x77, 0x1d, + 0x51, 0x39, 0x67, 0x5a, 0xa6, 0x94, 0xaf, 0x14, 0x2c, 0x46, 0x26, + 0xde, 0xcb, 0x4b, 0xa7, 0xab, 0x6f, 0xec, 0x60, 0xf9, 0x22}, + {0x76, 0x50, 0xae, 0x93, 0xf6, 0x11, 0x81, 0x54, 0xa6, 0x54, 0xfd, + 0x1d, 0xdf, 0x21, 0xae, 0x1d, 0x65, 0x5e, 0x11, 0xf3, 0x90, 0x8c, + 0x24, 0x12, 0x94, 0xf4, 0xe7, 0x8d, 0x5f, 0xd1, 0x9f, 0x5d}, + }, + { + {0x1e, 0x52, 0xd7, 0xee, 0x2a, 0x4d, 0x24, 0x3f, 0x15, 0x96, 0x2e, + 0x43, 0x28, 0x90, 0x3a, 0x8e, 0xd4, 0x16, 0x9c, 0x2e, 0x77, 0xba, + 0x64, 0xe1, 0xd8, 0x98, 0xeb, 0x47, 0xfa, 0x87, 0xc1, 0x3b}, + {0x7f, 0x72, 0x63, 0x6d, 0xd3, 0x8, 0x14, 0x3, 0x33, 0xb5, 0xc7, + 0xd7, 0xef, 0x9a, 0x37, 0x6a, 0x4b, 0xe2, 0xae, 0xcc, 0xc5, 0x8f, + 0xe1, 0xa9, 0xd3, 0xbe, 0x8f, 0x4f, 0x91, 0x35, 0x2f, 0x33}, + {0xc, 0xc2, 0x86, 0xea, 0x15, 0x1, 0x47, 0x6d, 0x25, 0xd1, 0x46, + 0x6c, 0xcb, 0xb7, 0x8a, 0x99, 0x88, 0x1, 0x66, 0x3a, 0xb5, 0x32, + 0x78, 0xd7, 0x3, 0xba, 0x6f, 0x90, 0xce, 0x81, 0xd, 0x45}, }, + }, + { { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 665000864555967, 
2065379846933859, 370231110385876, - 350988370788628, 1233371373142985 -#else - 5153727, 9909285, 1723747, 30776558, 30523604, 5516873, - 19480852, 5230134, 43156425, 18378665 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2019367628972465, 676711900706637, 110710997811333, - 1108646842542025, 517791959672113 -#else - 36839857, 30090922, 7665485, 10083793, 28475525, 1649722, - 20654025, 16520125, 30598449, 7715701 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 965130719900578, 247011430587952, 526356006571389, - 91986625355052, 2157223321444601 -#else - 28881826, 14381568, 9657904, 3680757, 46927229, 7843315, - 35708204, 1370707, 29794553, 32145132 -#endif - }}, + {0x3f, 0x74, 0xae, 0x1c, 0x96, 0xd8, 0x74, 0xd0, 0xed, 0x63, 0x1c, + 0xee, 0xf5, 0x18, 0x6d, 0xf8, 0x29, 0xed, 0xf4, 0xe7, 0x5b, 0xc5, + 0xbd, 0x97, 0x8, 0xb1, 0x3a, 0x66, 0x79, 0xd2, 0xba, 0x4c}, + {0x75, 0x52, 0x20, 0xa6, 0xa1, 0xb6, 0x7b, 0x6e, 0x83, 0x8e, 0x3c, + 0x41, 0xd7, 0x21, 0x4f, 0xaa, 0xb2, 0x5c, 0x8f, 0xe8, 0x55, 0xd1, + 0x56, 0x6f, 0xe1, 0x5b, 0x34, 0xa6, 0x4b, 0x5d, 0xe2, 0x2d}, + {0xcd, 0x1f, 0xd7, 0xa0, 0x24, 0x90, 0xd1, 0x80, 0xf8, 0x8a, 0x28, + 0xfb, 0xa, 0xc2, 0x25, 0xc5, 0x19, 0x64, 0x3a, 0x5f, 0x4b, 0x97, + 0xa3, 0xb1, 0x33, 0x72, 0x0, 0xe2, 0xef, 0xbc, 0x7f, 0x7d}, + }, + { + {0x94, 0x90, 0xc2, 0xf3, 0xc5, 0x5d, 0x7c, 0xcd, 0xab, 0x5, 0x91, + 0x2a, 0x9a, 0xa2, 0x81, 0xc7, 0x58, 0x30, 0x1c, 0x42, 0x36, 0x1d, + 0xc6, 0x80, 0xd7, 0xd4, 0xd8, 0xdc, 0x96, 0xd1, 0x9c, 0x4f}, + {0x1, 0x28, 0x6b, 0x26, 0x6a, 0x1e, 0xef, 0xfa, 0x16, 0x9f, 0x73, + 0xd5, 0xc4, 0x68, 0x6c, 0x86, 0x2c, 0x76, 0x3, 0x1b, 0xbc, 0x2f, + 0x8a, 0xf6, 0x8d, 0x5a, 0xb7, 0x87, 0x5e, 0x43, 0x75, 0x59}, + {0x68, 0x37, 0x7b, 0x6a, 0xd8, 0x97, 0x92, 0x19, 0x63, 0x7a, 0xd1, + 0x1a, 0x24, 0x58, 0xd0, 0xd0, 0x17, 0xc, 0x1c, 0x5c, 0xad, 0x9c, + 0x2, 0xba, 0x7, 0x3, 0x7a, 0x38, 0x84, 0xd0, 0xcd, 0x7c}, + }, + { + {0x93, 0xcc, 0x60, 0x67, 0x18, 0x84, 0xc, 0x9b, 0x99, 0x2a, 0xb3, + 0x1a, 0x7a, 0x0, 
0xae, 0xcd, 0x18, 0xda, 0xb, 0x62, 0x86, 0xec, + 0x8d, 0xa8, 0x44, 0xca, 0x90, 0x81, 0x84, 0xca, 0x93, 0x35}, + {0x17, 0x4, 0x26, 0x6d, 0x2c, 0x42, 0xa6, 0xdc, 0xbd, 0x40, 0x82, + 0x94, 0x50, 0x3d, 0x15, 0xae, 0x77, 0xc6, 0x68, 0xfb, 0xb4, 0xc1, + 0xc0, 0xa9, 0x53, 0xcf, 0xd0, 0x61, 0xed, 0xd0, 0x8b, 0x42}, + {0xa7, 0x9a, 0x84, 0x5e, 0x9a, 0x18, 0x13, 0x92, 0xcd, 0xfa, 0xd8, + 0x65, 0x35, 0xc3, 0xd8, 0xd4, 0xd1, 0xbb, 0xfd, 0x53, 0x5b, 0x54, + 0x52, 0x8c, 0xe6, 0x63, 0x2d, 0xda, 0x8, 0x83, 0x39, 0x27}, + }, + { + {0x53, 0x24, 0x70, 0xa, 0x4c, 0xe, 0xa1, 0xb9, 0xde, 0x1b, 0x7d, + 0xd5, 0x66, 0x58, 0xa2, 0xf, 0xf7, 0xda, 0x27, 0xcd, 0xb5, 0xd9, + 0xb9, 0xff, 0xfd, 0x33, 0x2c, 0x49, 0x45, 0x29, 0x2c, 0x57}, + {0x13, 0xd4, 0x5e, 0x43, 0x28, 0x8d, 0xc3, 0x42, 0xc9, 0xcc, 0x78, + 0x32, 0x60, 0xf3, 0x50, 0xbd, 0xef, 0x3, 0xda, 0x79, 0x1a, 0xab, + 0x7, 0xbb, 0x55, 0x33, 0x8c, 0xbe, 0xae, 0x97, 0x95, 0x26}, + {0xbe, 0x30, 0xcd, 0xd6, 0x45, 0xc7, 0x7f, 0xc7, 0xfb, 0xae, 0xba, + 0xe3, 0xd3, 0xe8, 0xdf, 0xe4, 0xc, 0xda, 0x5d, 0xaa, 0x30, 0x88, + 0x2c, 0xa2, 0x80, 0xca, 0x5b, 0xc0, 0x98, 0x54, 0x98, 0x7f}, + }, + { + {0x63, 0x63, 0xbf, 0xf, 0x52, 0x15, 0x56, 0xd3, 0xa6, 0xfb, 0x4d, + 0xcf, 0x45, 0x5a, 0x4, 0x8, 0xc2, 0xa0, 0x3f, 0x87, 0xbc, 0x4f, + 0xc2, 0xee, 0xe7, 0x12, 0x9b, 0xd6, 0x3c, 0x65, 0xf2, 0x30}, + {0x17, 0xe1, 0xb, 0x9f, 0x88, 0xce, 0x49, 0x38, 0x88, 0xa2, 0x54, + 0x7b, 0x1b, 0xad, 0x5, 0x80, 0x1c, 0x92, 0xfc, 0x23, 0x9f, 0xc3, + 0xa3, 0x3d, 0x4, 0xf3, 0x31, 0xa, 0x47, 0xec, 0xc2, 0x76}, + {0x85, 0xc, 0xc1, 0xaa, 0x38, 0xc9, 0x8, 0x8a, 0xcb, 0x6b, 0x27, + 0xdb, 0x60, 0x9b, 0x17, 0x46, 0x70, 0xac, 0x6f, 0xe, 0x1e, 0xc0, + 0x20, 0xa9, 0xda, 0x73, 0x64, 0x59, 0xf1, 0x73, 0x12, 0x2f}, + }, + { + {0xc0, 0xb, 0xa7, 0x55, 0xd7, 0x8b, 0x48, 0x30, 0xe7, 0x42, 0xd4, + 0xf1, 0xa4, 0xb5, 0xd6, 0x6, 0x62, 0x61, 0x59, 0xbc, 0x9e, 0xa6, + 0xd1, 0xea, 0x84, 0xf7, 0xc5, 0xed, 0x97, 0x19, 0xac, 0x38}, + {0x11, 0x1e, 0xe0, 0x8a, 0x7c, 0xfc, 0x39, 0x47, 0x9f, 0xab, 0x6a, + 0x4a, 0x90, 
0x74, 0x52, 0xfd, 0x2e, 0x8f, 0x72, 0x87, 0x82, 0x8a, + 0xd9, 0x41, 0xf2, 0x69, 0x5b, 0xd8, 0x2a, 0x57, 0x9e, 0x5d}, + {0x3b, 0xb1, 0x51, 0xa7, 0x17, 0xb5, 0x66, 0x6, 0x8c, 0x85, 0x9b, + 0x7e, 0x86, 0x6, 0x7d, 0x74, 0x49, 0xde, 0x4d, 0x45, 0x11, 0xc0, + 0xac, 0xac, 0x9c, 0xe6, 0xe9, 0xbf, 0x9c, 0xcd, 0xdf, 0x22}, + }, + { + {0xa1, 0xe0, 0x3b, 0x10, 0xb4, 0x59, 0xec, 0x56, 0x69, 0xf9, 0x59, + 0xd2, 0xec, 0xba, 0xe3, 0x2e, 0x32, 0xcd, 0xf5, 0x13, 0x94, 0xb2, + 0x7c, 0x79, 0x72, 0xe4, 0xcd, 0x24, 0x78, 0x87, 0xe9, 0xf}, + {0xd9, 0xc, 0xd, 0xc3, 0xe0, 0xd2, 0xdb, 0x8d, 0x33, 0x43, 0xbb, + 0xac, 0x5f, 0x66, 0x8e, 0xad, 0x1f, 0x96, 0x2a, 0x32, 0x8c, 0x25, + 0x6b, 0x8f, 0xc7, 0xc1, 0x48, 0x54, 0xc0, 0x16, 0x29, 0x6b}, + {0x3b, 0x91, 0xba, 0xa, 0xd1, 0x34, 0xdb, 0x7e, 0xe, 0xac, 0x6d, + 0x2e, 0x82, 0xcd, 0xa3, 0x4e, 0x15, 0xf8, 0x78, 0x65, 0xff, 0x3d, + 0x8, 0x66, 0x17, 0xa, 0xf0, 0x7f, 0x30, 0x3f, 0x30, 0x4c}, + }, + { + {0x0, 0x45, 0xd9, 0xd, 0x58, 0x3, 0xfc, 0x29, 0x93, 0xec, 0xbb, + 0x6f, 0xa4, 0x7a, 0xd2, 0xec, 0xf8, 0xa7, 0xe2, 0xc2, 0x5f, 0x15, + 0xa, 0x13, 0xd5, 0xa1, 0x6, 0xb7, 0x1a, 0x15, 0x6b, 0x41}, + {0x85, 0x8c, 0xb2, 0x17, 0xd6, 0x3b, 0xa, 0xd3, 0xea, 0x3b, 0x77, + 0x39, 0xb7, 0x77, 0xd3, 0xc5, 0xbf, 0x5c, 0x6a, 0x1e, 0x8c, 0xe7, + 0xc6, 0xc6, 0xc4, 0xb7, 0x2a, 0x8b, 0xf7, 0xb8, 0x61, 0xd}, + {0xb0, 0x36, 0xc1, 0xe9, 0xef, 0xd7, 0xa8, 0x56, 0x20, 0x4b, 0xe4, + 0x58, 0xcd, 0xe5, 0x7, 0xbd, 0xab, 0xe0, 0x57, 0x1b, 0xda, 0x2f, + 0xe6, 0xaf, 0xd2, 0xe8, 0x77, 0x42, 0xf7, 0x2a, 0x1a, 0x19}, }, + }, + { { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2068619540119183, 1966274918058806, 957728544705549, - 729906502578991, 159834893065166 -#else - 14499471, 30824833, 33917750, 29299779, 28494861, 14271267, - 30290735, 10876454, 33954766, 2381725 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2073601412052185, 31021124762708, 264500969797082, - 248034690651703, 1030252227928288 -#else - 59913433, 30899068, 52378708, 462250, 39384538, 3941371, - 
60872247, 3696004, 34808032, 15351954 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 551790716293402, 1989538725166328, 801169423371717, - 2052451893578887, 678432056995012 -#else - 27431194, 8222322, 16448760, 29646437, 48401861, 11938354, - 34147463, 30583916, 29551812, 10109425 -#endif - }}, + {0xfb, 0xe, 0x46, 0x4f, 0x43, 0x2b, 0xe6, 0x9f, 0xd6, 0x7, 0x36, + 0xa6, 0xd4, 0x3, 0xd3, 0xde, 0x24, 0xda, 0xa0, 0xb7, 0xe, 0x21, + 0x52, 0xf0, 0x93, 0x5b, 0x54, 0x0, 0xbe, 0x7d, 0x7e, 0x23}, + {0x31, 0x14, 0x3c, 0xc5, 0x4b, 0xf7, 0x16, 0xce, 0xde, 0xed, 0x72, + 0x20, 0xce, 0x25, 0x97, 0x2b, 0xe7, 0x3e, 0xb2, 0xb5, 0x6f, 0xc3, + 0xb9, 0xb8, 0x8, 0xc9, 0x5c, 0xb, 0x45, 0xe, 0x2e, 0x7e}, + {0x30, 0xb4, 0x1, 0x67, 0xed, 0x75, 0x35, 0x1, 0x10, 0xfd, 0xb, + 0x9f, 0xe6, 0x94, 0x10, 0x23, 0x22, 0x7f, 0xe4, 0x83, 0x15, 0xf, + 0x32, 0x75, 0xe3, 0x55, 0x11, 0xb1, 0x99, 0xa6, 0xaf, 0x71}, + }, + { + {0xd6, 0x50, 0x3b, 0x47, 0x1c, 0x3c, 0x42, 0xea, 0x10, 0xef, 0x38, + 0x3b, 0x1f, 0x7a, 0xe8, 0x51, 0x95, 0xbe, 0xc9, 0xb2, 0x5f, 0xbf, + 0x84, 0x9b, 0x1c, 0x9a, 0xf8, 0x78, 0xbc, 0x1f, 0x73, 0x0}, + {0x1d, 0xb6, 0x53, 0x39, 0x9b, 0x6f, 0xce, 0x65, 0xe6, 0x41, 0xa1, + 0xaf, 0xea, 0x39, 0x58, 0xc6, 0xfe, 0x59, 0xf7, 0xa9, 0xfd, 0x5f, + 0x43, 0xf, 0x8e, 0xc2, 0xb1, 0xc2, 0xe9, 0x42, 0x11, 0x2}, + {0x80, 0x18, 0xf8, 0x48, 0x18, 0xc7, 0x30, 0xe4, 0x19, 0xc1, 0xce, + 0x5e, 0x22, 0xc, 0x96, 0xbf, 0xe3, 0x15, 0xba, 0x6b, 0x83, 0xe0, + 0xda, 0xb6, 0x8, 0x58, 0xe1, 0x47, 0x33, 0x6f, 0x4d, 0x4c}, + }, + { + {0x70, 0x19, 0x8f, 0x98, 0xfc, 0xdd, 0xc, 0x2f, 0x1b, 0xf5, 0xb9, + 0xb0, 0x27, 0x62, 0x91, 0x6b, 0xbe, 0x76, 0x91, 0x77, 0xc4, 0xb6, + 0xc7, 0x6e, 0xa8, 0x9f, 0x8f, 0xa8, 0x0, 0x95, 0xbf, 0x38}, + {0xc9, 0x1f, 0x7d, 0xc1, 0xcf, 0xec, 0xf7, 0x18, 0x14, 0x3c, 0x40, + 0x51, 0xa6, 0xf5, 0x75, 0x6c, 0xdf, 0xc, 0xee, 0xf7, 0x2b, 0x71, + 0xde, 0xdb, 0x22, 0x7a, 0xe4, 0xa7, 0xaa, 0xdd, 0x3f, 0x19}, + {0x6f, 0x87, 0xe8, 0x37, 0x3c, 0xc9, 0xd2, 0x1f, 0x2c, 0x46, 0xd1, + 0x18, 0x5a, 0x1e, 
0xf6, 0xa2, 0x76, 0x12, 0x24, 0x39, 0x82, 0xf5, + 0x80, 0x50, 0x69, 0x49, 0xd, 0xbf, 0x9e, 0xb9, 0x6f, 0x6a}, + }, + { + {0xc6, 0x23, 0xe4, 0xb6, 0xb5, 0x22, 0xb1, 0xee, 0x8e, 0xff, 0x86, + 0xf2, 0x10, 0x70, 0x9d, 0x93, 0x8c, 0x5d, 0xcf, 0x1d, 0x83, 0x2a, + 0xa9, 0x90, 0x10, 0xeb, 0xc5, 0x42, 0x9f, 0xda, 0x6f, 0x13}, + {0xeb, 0x55, 0x8, 0x56, 0xbb, 0xc1, 0x46, 0x6a, 0x9d, 0xf0, 0x93, + 0xf8, 0x38, 0xbb, 0x16, 0x24, 0xc1, 0xac, 0x71, 0x8f, 0x37, 0x11, + 0x1d, 0xd7, 0xea, 0x96, 0x18, 0xa3, 0x14, 0x69, 0xf7, 0x75}, + {0xd1, 0xbd, 0x5, 0xa3, 0xb1, 0xdf, 0x4c, 0xf9, 0x8, 0x2c, 0xf8, + 0x9f, 0x9d, 0x4b, 0x36, 0xf, 0x8a, 0x58, 0xbb, 0xc3, 0xa5, 0xd8, + 0x87, 0x2a, 0xba, 0xdc, 0xe8, 0xb, 0x51, 0x83, 0x21, 0x2}, + }, + { + {0x7f, 0x7a, 0x30, 0x43, 0x1, 0x71, 0x5a, 0x9d, 0x5f, 0xa4, 0x7d, + 0xc4, 0x9e, 0xde, 0x63, 0xb0, 0xd3, 0x7a, 0x92, 0xbe, 0x52, 0xfe, + 0xbb, 0x22, 0x6c, 0x42, 0x40, 0xfd, 0x41, 0xc4, 0x87, 0x13}, + {0x14, 0x2d, 0xad, 0x5e, 0x38, 0x66, 0xf7, 0x4a, 0x30, 0x58, 0x7c, + 0xca, 0x80, 0xd8, 0x8e, 0xa0, 0x3d, 0x1e, 0x21, 0x10, 0xe6, 0xa6, + 0x13, 0xd, 0x3, 0x6c, 0x80, 0x7b, 0xe1, 0x1c, 0x7, 0x6a}, + {0xf8, 0x8a, 0x97, 0x87, 0xd1, 0xc3, 0xd3, 0xb5, 0x13, 0x44, 0xe, + 0x7f, 0x3d, 0x5a, 0x2b, 0x72, 0xa0, 0x7c, 0x47, 0xbb, 0x48, 0x48, + 0x7b, 0xd, 0x92, 0xdc, 0x1e, 0xaf, 0x6a, 0xb2, 0x71, 0x31}, + }, + { + {0xd1, 0x47, 0x8a, 0xb2, 0xd8, 0xb7, 0xd, 0xa6, 0xf1, 0xa4, 0x70, + 0x17, 0xd6, 0x14, 0xbf, 0xa6, 0x58, 0xbd, 0xdd, 0x53, 0x93, 0xf8, + 0xa1, 0xd4, 0xe9, 0x43, 0x42, 0x34, 0x63, 0x4a, 0x51, 0x6c}, + {0xa8, 0x4c, 0x56, 0x97, 0x90, 0x31, 0x2f, 0xa9, 0x19, 0xe1, 0x75, + 0x22, 0x4c, 0xb8, 0x7b, 0xff, 0x50, 0x51, 0x87, 0xa4, 0x37, 0xfe, + 0x55, 0x4f, 0x5a, 0x83, 0xf0, 0x3c, 0x87, 0xd4, 0x1f, 0x22}, + {0x41, 0x63, 0x15, 0x3a, 0x4f, 0x20, 0x22, 0x23, 0x2d, 0x3, 0xa, + 0xba, 0xe9, 0xe0, 0x73, 0xfb, 0xe, 0x3, 0xf, 0x41, 0x4c, 0xdd, + 0xe0, 0xfc, 0xaa, 0x4a, 0x92, 0xfb, 0x96, 0xa5, 0xda, 0x48}, + }, + { + {0x93, 0x97, 0x4c, 0xc8, 0x5d, 0x1d, 0xf6, 0x14, 0x6, 0x82, 0x41, + 
0xef, 0xe3, 0xf9, 0x41, 0x99, 0xac, 0x77, 0x62, 0x34, 0x8f, 0xb8, + 0xf5, 0xcd, 0xa9, 0x79, 0x8a, 0xe, 0xfa, 0x37, 0xc8, 0x58}, + {0xc7, 0x9c, 0xa5, 0x5c, 0x66, 0x8e, 0xca, 0x6e, 0xa0, 0xac, 0x38, + 0x2e, 0x4b, 0x25, 0x47, 0xa8, 0xce, 0x17, 0x1e, 0xd2, 0x8, 0xc7, + 0xaf, 0x31, 0xf7, 0x4a, 0xd8, 0xca, 0xfc, 0xd6, 0x6d, 0x67}, + {0x58, 0x90, 0xfc, 0x96, 0x85, 0x68, 0xf9, 0xc, 0x1b, 0xa0, 0x56, + 0x7b, 0xf3, 0xbb, 0xdc, 0x1d, 0x6a, 0xd6, 0x35, 0x49, 0x7d, 0xe7, + 0xc2, 0xdc, 0xa, 0x7f, 0xa5, 0xc6, 0xf2, 0x73, 0x4f, 0x1c}, + }, + { + {0x84, 0x34, 0x7c, 0xfc, 0x6e, 0x70, 0x6e, 0xb3, 0x61, 0xcf, 0xc1, + 0xc3, 0xb4, 0xc9, 0xdf, 0x73, 0xe5, 0xc7, 0x1c, 0x78, 0xc9, 0x79, + 0x1d, 0xeb, 0x5c, 0x67, 0xaf, 0x7d, 0xdb, 0x9a, 0x45, 0x70}, + {0xbb, 0xa0, 0x5f, 0x30, 0xbd, 0x4f, 0x7a, 0xe, 0xad, 0x63, 0xc6, + 0x54, 0xe0, 0x4c, 0x9d, 0x82, 0x48, 0x38, 0xe3, 0x2f, 0x83, 0xc3, + 0x21, 0xf4, 0x42, 0x4c, 0xf6, 0x1b, 0xd, 0xc8, 0x5a, 0x79}, + {0xb3, 0x2b, 0xb4, 0x91, 0x49, 0xdb, 0x91, 0x1b, 0xca, 0xdc, 0x2, + 0x4b, 0x23, 0x96, 0x26, 0x57, 0xdc, 0x78, 0x8c, 0x1f, 0xe5, 0x9e, + 0xdf, 0x9f, 0xd3, 0x1f, 0xe2, 0x8c, 0x84, 0x62, 0xe1, 0x5f}, }, }, { { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1368953770187805, 790347636712921, 437508475667162, - 2142576377050580, 1932081720066286 -#else - 53451805, 20399000, 35825113, 11777097, 21447386, 6519384, - 64730580, 31926875, 10092782, 28790261 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 953638594433374, 1092333936795051, 1419774766716690, - 805677984380077, 859228993502513 -#else - 27939166, 14210322, 4677035, 16277044, 44144402, 21156292, - 34600109, 12005537, 49298737, 12803509 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1200766035879111, 20142053207432, 1465634435977050, - 1645256912097844, 295121984874596 -#else - 17228999, 17892808, 65875336, 300139, 65883994, 21839654, - 30364212, 24516238, 18016356, 4397660 -#endif - }}, + {0x8, 0xb2, 0x7c, 0x5d, 0x2d, 0x85, 0x79, 0x28, 0xe7, 0xf2, 0x7d, + 0x68, 0x70, 
0xdd, 0xde, 0xb8, 0x91, 0x78, 0x68, 0x21, 0xab, 0xff, + 0xb, 0xdc, 0x35, 0xaa, 0x7d, 0x67, 0x43, 0xc0, 0x44, 0x2b}, + {0x1a, 0x96, 0x94, 0xe1, 0x4f, 0x21, 0x59, 0x4e, 0x4f, 0xcd, 0x71, + 0xd, 0xc7, 0x7d, 0xbe, 0x49, 0x2d, 0xf2, 0x50, 0x3b, 0xd2, 0xcf, + 0x0, 0x93, 0x32, 0x72, 0x91, 0xfc, 0x46, 0xd4, 0x89, 0x47}, + {0x8e, 0xb7, 0x4e, 0x7, 0xab, 0x87, 0x1c, 0x1a, 0x67, 0xf4, 0xda, + 0x99, 0x8e, 0xd1, 0xc6, 0xfa, 0x67, 0x90, 0x4f, 0x48, 0xcd, 0xbb, + 0xac, 0x3e, 0xe4, 0xa4, 0xb9, 0x2b, 0xef, 0x2e, 0xc5, 0x60}, + }, + { + {0x11, 0x6d, 0xae, 0x7c, 0xc2, 0xc5, 0x2b, 0x70, 0xab, 0x8c, 0xa4, + 0x54, 0x9b, 0x69, 0xc7, 0x44, 0xb2, 0x2e, 0x49, 0xba, 0x56, 0x40, + 0xbc, 0xef, 0x6d, 0x67, 0xb6, 0xd9, 0x48, 0x72, 0xd7, 0x70}, + {0xf1, 0x8b, 0xfd, 0x3b, 0xbc, 0x89, 0x5d, 0xb, 0x1a, 0x55, 0xf3, + 0xc9, 0x37, 0x92, 0x6b, 0xb0, 0xf5, 0x28, 0x30, 0xd5, 0xb0, 0x16, + 0x4c, 0xe, 0xab, 0xca, 0xcf, 0x2c, 0x31, 0x9c, 0xbc, 0x10}, + {0x5b, 0xa0, 0xc2, 0x3e, 0x4b, 0xe8, 0x8a, 0xaa, 0xe0, 0x81, 0x17, + 0xed, 0xf4, 0x9e, 0x69, 0x98, 0xd1, 0x85, 0x8e, 0x70, 0xe4, 0x13, + 0x45, 0x79, 0x13, 0xf4, 0x76, 0xa9, 0xd3, 0x5b, 0x75, 0x63}, + }, + { + {0xb7, 0xac, 0xf1, 0x97, 0x18, 0x10, 0xc7, 0x3d, 0xd8, 0xbb, 0x65, + 0xc1, 0x5e, 0x7d, 0xda, 0x5d, 0xf, 0x2, 0xa1, 0xf, 0x9c, 0x5b, + 0x8e, 0x50, 0x56, 0x2a, 0xc5, 0x37, 0x17, 0x75, 0x63, 0x27}, + {0x53, 0x8, 0xd1, 0x2a, 0x3e, 0xa0, 0x5f, 0xb5, 0x69, 0x35, 0xe6, + 0x9e, 0x90, 0x75, 0x6f, 0x35, 0x90, 0xb8, 0x69, 0xbe, 0xfd, 0xf1, + 0xf9, 0x9f, 0x84, 0x6f, 0xc1, 0x8b, 0xc4, 0xc1, 0x8c, 0xd}, + {0xa9, 0x19, 0xb4, 0x6e, 0xd3, 0x2, 0x94, 0x2, 0xa5, 0x60, 0xb4, + 0x77, 0x7e, 0x4e, 0xb4, 0xf0, 0x56, 0x49, 0x3c, 0xd4, 0x30, 0x62, + 0xa8, 0xcf, 0xe7, 0x66, 0xd1, 0x7a, 0x8a, 0xdd, 0xc2, 0x70}, + }, + { + {0x13, 0x7e, 0xed, 0xb8, 0x7d, 0x96, 0xd4, 0x91, 0x7a, 0x81, 0x76, + 0xd7, 0xa, 0x2f, 0x25, 0x74, 0x64, 0x25, 0x85, 0xd, 0xe0, 0x82, + 0x9, 0xe4, 0xe5, 0x3c, 0xa5, 0x16, 0x38, 0x61, 0xb8, 0x32}, + {0xe, 0xec, 0x6f, 0x9f, 0x50, 0x94, 0x61, 0x65, 0x8d, 0x51, 0xc6, + 
0x46, 0xa9, 0x7e, 0x2e, 0xee, 0x5c, 0x9b, 0xe0, 0x67, 0xf3, 0xc1, + 0x33, 0x97, 0x95, 0x84, 0x94, 0x63, 0x63, 0xac, 0xf, 0x2e}, + {0x64, 0xcd, 0x48, 0xe4, 0xbe, 0xf7, 0xe7, 0x79, 0xd0, 0x86, 0x78, + 0x8, 0x67, 0x3a, 0xc8, 0x6a, 0x2e, 0xdb, 0xe4, 0xa0, 0xd9, 0xd4, + 0x9f, 0xf8, 0x41, 0x4f, 0x5a, 0x73, 0x5c, 0x21, 0x79, 0x41}, + }, + { + {0x34, 0xcd, 0x6b, 0x28, 0xb9, 0x33, 0xae, 0xe4, 0xdc, 0xd6, 0x9d, + 0x55, 0xb6, 0x7e, 0xef, 0xb7, 0x1f, 0x8e, 0xd3, 0xb3, 0x1f, 0x14, + 0x8b, 0x27, 0x86, 0xc2, 0x41, 0x22, 0x66, 0x85, 0xfa, 0x31}, + {0x2a, 0xed, 0xdc, 0xd7, 0xe7, 0x94, 0x70, 0x8c, 0x70, 0x9c, 0xd3, + 0x47, 0xc3, 0x8a, 0xfb, 0x97, 0x2, 0xd9, 0x6, 0xa9, 0x33, 0xe0, + 0x3b, 0xe1, 0x76, 0x9d, 0xd9, 0xc, 0xa3, 0x44, 0x3, 0x70}, + {0xf4, 0x22, 0x36, 0x2e, 0x42, 0x6c, 0x82, 0xaf, 0x2d, 0x50, 0x33, + 0x98, 0x87, 0x29, 0x20, 0xc1, 0x23, 0x91, 0x38, 0x2b, 0xe1, 0xb7, + 0xc1, 0x9b, 0x89, 0x24, 0x95, 0xa9, 0x12, 0x23, 0xbb, 0x24}, + }, + { + {0x6b, 0x5c, 0xf8, 0xf5, 0x2a, 0xc, 0xf8, 0x41, 0x94, 0x67, 0xfa, + 0x4, 0xc3, 0x84, 0x72, 0x68, 0xad, 0x1b, 0xba, 0xa3, 0x99, 0xdf, + 0x45, 0x89, 0x16, 0x5d, 0xeb, 0xff, 0xf9, 0x2a, 0x1d, 0xd}, + {0xc3, 0x67, 0xde, 0x32, 0x17, 0xed, 0xa8, 0xb1, 0x48, 0x49, 0x1b, + 0x46, 0x18, 0x94, 0xb4, 0x3c, 0xd2, 0xbc, 0xcf, 0x76, 0x43, 0x43, + 0xbd, 0x8e, 0x8, 0x80, 0x18, 0x1e, 0x87, 0x3e, 0xee, 0xf}, + {0xdf, 0x1e, 0x62, 0x32, 0xa1, 0x8a, 0xda, 0xa9, 0x79, 0x65, 0x22, + 0x59, 0xa1, 0x22, 0xb8, 0x30, 0x93, 0xc1, 0x9a, 0xa7, 0x7b, 0x19, + 0x4, 0x40, 0x76, 0x1d, 0x53, 0x18, 0x97, 0xd7, 0xac, 0x16}, + }, + { + {0xad, 0xb6, 0x87, 0x78, 0xc5, 0xc6, 0x59, 0xc9, 0xba, 0xfe, 0x90, + 0x5f, 0xad, 0x9e, 0xe1, 0x94, 0x4, 0xf5, 0x42, 0xa3, 0x62, 0x4e, + 0xe2, 0x16, 0x0, 0x17, 0x16, 0x18, 0x4b, 0xd3, 0x4e, 0x16}, + {0x3d, 0x1d, 0x9b, 0x2d, 0xaf, 0x72, 0xdf, 0x72, 0x5a, 0x24, 0x32, + 0xa4, 0x36, 0x2a, 0x46, 0x63, 0x37, 0x96, 0xb3, 0x16, 0x79, 0xa0, + 0xce, 0x3e, 0x9, 0x23, 0x30, 0xb9, 0xf6, 0xe, 0x3e, 0x12}, + {0x9a, 0xe6, 0x2f, 0x19, 0x4c, 0xd9, 0x7e, 0x48, 0x13, 
0x15, 0x91, + 0x3a, 0xea, 0x2c, 0xae, 0x61, 0x27, 0xde, 0xa4, 0xb9, 0xd3, 0xf6, + 0x7b, 0x87, 0xeb, 0xf3, 0x73, 0x10, 0xc6, 0xf, 0xda, 0x78}, + }, + { + {0x94, 0x3a, 0xc, 0x68, 0xf1, 0x80, 0x9f, 0xa2, 0xe6, 0xe7, 0xe9, + 0x1a, 0x15, 0x7e, 0xf7, 0x71, 0x73, 0x79, 0x1, 0x48, 0x58, 0xf1, + 0x0, 0x11, 0xdd, 0x8d, 0xb3, 0x16, 0xb3, 0xa4, 0x4a, 0x5}, + {0x6a, 0xc6, 0x2b, 0xe5, 0x28, 0x5d, 0xf1, 0x5b, 0x8e, 0x1a, 0xf0, + 0x70, 0x18, 0xe3, 0x47, 0x2c, 0xdd, 0x8b, 0xc2, 0x6, 0xbc, 0xaf, + 0x19, 0x24, 0x3a, 0x17, 0x6b, 0x25, 0xeb, 0xde, 0x25, 0x2d}, + {0xb8, 0x7c, 0x26, 0x19, 0x8d, 0x46, 0xc8, 0xdf, 0xaf, 0x4d, 0xe5, + 0x66, 0x9c, 0x78, 0x28, 0xb, 0x17, 0xec, 0x6e, 0x66, 0x2a, 0x1d, + 0xeb, 0x2a, 0x60, 0xa7, 0x7d, 0xab, 0xa6, 0x10, 0x46, 0x13}, }, + }, + { { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1735718747031557, 1248237894295956, 1204753118328107, - 976066523550493, 65943769534592 -#else - 56150021, 25864224, 4776340, 18600194, 27850027, 17952220, - 40489757, 14544524, 49631360, 982638 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1060098822528990, 1586825862073490, 212301317240126, - 1975302711403555, 666724059764335 -#else - 29253598, 15796703, 64244882, 23645547, 10057022, 3163536, - 7332899, 29434304, 46061167, 9934962 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1091990273418756, 1572899409348578, 80968014455247, - 306009358661350, 1520450739132526 -#else - 5793284, 16271923, 42977250, 23438027, 29188559, 1206517, - 52360934, 4559894, 36984942, 22656481 -#endif - }}, + {0x15, 0xf5, 0xd1, 0x77, 0xe7, 0x65, 0x2a, 0xcd, 0xf1, 0x60, 0xaa, + 0x8f, 0x87, 0x91, 0x89, 0x54, 0xe5, 0x6, 0xbc, 0xda, 0xbc, 0x3b, + 0xb7, 0xb1, 0xfb, 0xc9, 0x7c, 0xa9, 0xcb, 0x78, 0x48, 0x65}, + {0xfe, 0xb0, 0xf6, 0x8d, 0xc7, 0x8e, 0x13, 0x51, 0x1b, 0xf5, 0x75, + 0xe5, 0x89, 0xda, 0x97, 0x53, 0xb9, 0xf1, 0x7a, 0x71, 0x1d, 0x7a, + 0x20, 0x9, 0x50, 0xd6, 0x20, 0x2b, 0xba, 0xfd, 0x2, 0x21}, + {0xa1, 0xe6, 0x5c, 0x5, 0x5, 0xe4, 0x9e, 0x96, 0x29, 0xad, 0x51, + 
0x12, 0x68, 0xa7, 0xbc, 0x36, 0x15, 0xa4, 0x7d, 0xaa, 0x17, 0xf5, + 0x1a, 0x3a, 0xba, 0xb2, 0xec, 0x29, 0xdb, 0x25, 0xd7, 0xa}, + }, + { + {0x85, 0x6f, 0x5, 0x9b, 0xc, 0xbc, 0xc7, 0xfe, 0xd7, 0xff, 0xf5, + 0xe7, 0x68, 0x52, 0x7d, 0x53, 0xfa, 0xae, 0x12, 0x43, 0x62, 0xc6, + 0xaf, 0x77, 0xd9, 0x9f, 0x39, 0x2, 0x53, 0x5f, 0x67, 0x4f}, + {0x57, 0x24, 0x4e, 0x83, 0xb1, 0x67, 0x42, 0xdc, 0xc5, 0x1b, 0xce, + 0x70, 0xb5, 0x44, 0x75, 0xb6, 0xd7, 0x5e, 0xd1, 0xf7, 0xb, 0x7a, + 0xf0, 0x1a, 0x50, 0x36, 0xa0, 0x71, 0xfb, 0xcf, 0xef, 0x4a}, + {0x1e, 0x17, 0x15, 0x4, 0x36, 0x36, 0x2d, 0xc3, 0x3b, 0x48, 0x98, + 0x89, 0x11, 0xef, 0x2b, 0xcd, 0x10, 0x51, 0x94, 0xd0, 0xad, 0x6e, + 0xa, 0x87, 0x61, 0x65, 0xa8, 0xa2, 0x72, 0xbb, 0xcc, 0xb}, + }, + { + {0x96, 0x12, 0xfe, 0x50, 0x4c, 0x5e, 0x6d, 0x18, 0x7e, 0x9f, 0xe8, + 0xfe, 0x82, 0x7b, 0x39, 0xe0, 0xb0, 0x31, 0x70, 0x50, 0xc5, 0xf6, + 0xc7, 0x3b, 0xc2, 0x37, 0x8f, 0x10, 0x69, 0xfd, 0x78, 0x66}, + {0xc8, 0xa9, 0xb1, 0xea, 0x2f, 0x96, 0x5e, 0x18, 0xcd, 0x7d, 0x14, + 0x65, 0x35, 0xe6, 0xe7, 0x86, 0xf2, 0x6d, 0x5b, 0xbb, 0x31, 0xe0, + 0x92, 0xb0, 0x3e, 0xb7, 0xd6, 0x59, 0xab, 0xf0, 0x24, 0x40}, + {0xc2, 0x63, 0x68, 0x63, 0x31, 0xfa, 0x86, 0x15, 0xf2, 0x33, 0x2d, + 0x57, 0x48, 0x8c, 0xf6, 0x7, 0xfc, 0xae, 0x9e, 0x78, 0x9f, 0xcc, + 0x73, 0x4f, 0x1, 0x47, 0xad, 0x8e, 0x10, 0xe2, 0x42, 0x2d}, + }, + { + {0x93, 0x75, 0x53, 0xf, 0xd, 0x7b, 0x71, 0x21, 0x4c, 0x6, 0x1e, + 0x13, 0xb, 0x69, 0x4e, 0x91, 0x9f, 0xe0, 0x2a, 0x75, 0xae, 0x87, + 0xb6, 0x1b, 0x6e, 0x3c, 0x42, 0x9b, 0xa7, 0xf3, 0xb, 0x42}, + {0x9b, 0xd2, 0xdf, 0x94, 0x15, 0x13, 0xf5, 0x97, 0x6a, 0x4c, 0x3f, + 0x31, 0x5d, 0x98, 0x55, 0x61, 0x10, 0x50, 0x45, 0x8, 0x7, 0x3f, + 0xa1, 0xeb, 0x22, 0xd3, 0xd2, 0xb8, 0x8, 0x26, 0x6b, 0x67}, + {0x47, 0x2b, 0x5b, 0x1c, 0x65, 0xba, 0x38, 0x81, 0x80, 0x1b, 0x1b, + 0x31, 0xec, 0xb6, 0x71, 0x86, 0xb0, 0x35, 0x31, 0xbc, 0xb1, 0xc, + 0xff, 0x7b, 0xe0, 0xf1, 0xc, 0x9c, 0xfa, 0x2f, 0x5d, 0x74}, + }, + { + {0x6a, 0x4e, 0xd3, 0x21, 0x57, 0xdf, 0x36, 0x60, 
0xd0, 0xb3, 0x7b, + 0x99, 0x27, 0x88, 0xdb, 0xb1, 0xfa, 0x6a, 0x75, 0xc8, 0xc3, 0x9, + 0xc2, 0xd3, 0x39, 0xc8, 0x1d, 0x4c, 0xe5, 0x5b, 0xe1, 0x6}, + {0xbd, 0xc8, 0xc9, 0x2b, 0x1e, 0x5a, 0x52, 0xbf, 0x81, 0x9d, 0x47, + 0x26, 0x8, 0x26, 0x5b, 0xea, 0xdb, 0x55, 0x1, 0xdf, 0xe, 0xc7, + 0x11, 0xd5, 0xd0, 0xf5, 0xc, 0x96, 0xeb, 0x3c, 0xe2, 0x1a}, + {0x4a, 0x99, 0x32, 0x19, 0x87, 0x5d, 0x72, 0x5b, 0xb0, 0xda, 0xb1, + 0xce, 0xb5, 0x1c, 0x35, 0x32, 0x5, 0xca, 0xb7, 0xda, 0x49, 0x15, + 0xc4, 0x7d, 0xf7, 0xc1, 0x8e, 0x27, 0x61, 0xd8, 0xde, 0x58}, + }, + { + {0xa8, 0xc9, 0xc2, 0xb6, 0xa8, 0x5b, 0xfb, 0x2d, 0x8c, 0x59, 0x2c, + 0xf5, 0x8e, 0xef, 0xee, 0x48, 0x73, 0x15, 0x2d, 0xf1, 0x7, 0x91, + 0x80, 0x33, 0xd8, 0x5b, 0x1d, 0x53, 0x6b, 0x69, 0xba, 0x8}, + {0x5c, 0xc5, 0x66, 0xf2, 0x93, 0x37, 0x17, 0xd8, 0x49, 0x4e, 0x45, + 0xcc, 0xc5, 0x76, 0xc9, 0xc8, 0xa8, 0xc3, 0x26, 0xbc, 0xf8, 0x82, + 0xe3, 0x5c, 0xf9, 0xf6, 0x85, 0x54, 0xe8, 0x9d, 0xf3, 0x2f}, + {0x7a, 0xc5, 0xef, 0xc3, 0xee, 0x3e, 0xed, 0x77, 0x11, 0x48, 0xff, + 0xd4, 0x17, 0x55, 0xe0, 0x4, 0xcb, 0x71, 0xa6, 0xf1, 0x3f, 0x7a, + 0x3d, 0xea, 0x54, 0xfe, 0x7c, 0x94, 0xb4, 0x33, 0x6, 0x12}, + }, + { + {0xa, 0x10, 0x12, 0x49, 0x47, 0x31, 0xbd, 0x82, 0x6, 0xbe, 0x6f, + 0x7e, 0x6d, 0x7b, 0x23, 0xde, 0xc6, 0x79, 0xea, 0x11, 0x19, 0x76, + 0x1e, 0xe1, 0xde, 0x3b, 0x39, 0xcb, 0xe3, 0x3b, 0x43, 0x7}, + {0x42, 0x0, 0x61, 0x91, 0x78, 0x98, 0x94, 0xb, 0xe8, 0xfa, 0xeb, + 0xec, 0x3c, 0xb1, 0xe7, 0x4e, 0xc0, 0xa4, 0xf0, 0x94, 0x95, 0x73, + 0xbe, 0x70, 0x85, 0x91, 0xd5, 0xb4, 0x99, 0xa, 0xd3, 0x35}, + {0xf4, 0x97, 0xe9, 0x5c, 0xc0, 0x44, 0x79, 0xff, 0xa3, 0x51, 0x5c, + 0xb0, 0xe4, 0x3d, 0x5d, 0x57, 0x7c, 0x84, 0x76, 0x5a, 0xfd, 0x81, + 0x33, 0x58, 0x9f, 0xda, 0xf6, 0x7a, 0xde, 0x3e, 0x87, 0x2d}, + }, + { + {0x81, 0xf9, 0x5d, 0x4e, 0xe1, 0x2, 0x62, 0xaa, 0xf5, 0xe1, 0x15, + 0x50, 0x17, 0x59, 0xd, 0xa2, 0x6c, 0x1d, 0xe2, 0xba, 0xd3, 0x75, + 0xa2, 0x18, 0x53, 0x2, 0x60, 0x1, 0x8a, 0x61, 0x43, 0x5}, + {0x9, 0x34, 0x37, 0x43, 0x64, 0x31, 0x7a, 
0x15, 0xd9, 0x81, 0xaa, + 0xf4, 0xee, 0xb7, 0xb8, 0xfa, 0x6, 0x48, 0xa6, 0xf5, 0xe6, 0xfe, + 0x93, 0xb0, 0xb6, 0xa7, 0x7f, 0x70, 0x54, 0x36, 0x77, 0x2e}, + {0xc1, 0x23, 0x4c, 0x97, 0xf4, 0xbd, 0xea, 0xd, 0x93, 0x46, 0xce, + 0x9d, 0x25, 0xa, 0x6f, 0xaa, 0x2c, 0xba, 0x9a, 0xa2, 0xb8, 0x2c, + 0x20, 0x4, 0xd, 0x96, 0x7, 0x2d, 0x36, 0x43, 0x14, 0x4b}, }, + }, + { { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1480517209436112, 1511153322193952, 1244343858991172, - 304788150493241, 369136856496443 -#else - 39464912, 22061425, 16282656, 22517939, 28414020, 18542168, - 24191033, 4541697, 53770555, 5500567 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2151330273626164, 762045184746182, 1688074332551515, - 823046109005759, 907602769079491 -#else - 12650548, 32057319, 9052870, 11355358, 49428827, 25154267, - 49678271, 12264342, 10874051, 13524335 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2047386910586836, 168470092900250, 1552838872594810, - 340951180073789, 360819374702533 -#else - 25556948, 30508442, 714650, 2510400, 23394682, 23139102, - 33119037, 5080568, 44580805, 5376627 -#endif - }}, + {0xcb, 0x9c, 0x52, 0x1c, 0xe9, 0x54, 0x7c, 0x96, 0xfb, 0x35, 0xc6, + 0x64, 0x92, 0x26, 0xf6, 0x30, 0x65, 0x19, 0x12, 0x78, 0xf4, 0xaf, + 0x47, 0x27, 0x5c, 0x6f, 0xf6, 0xea, 0x18, 0x84, 0x3, 0x17}, + {0x7a, 0x1f, 0x6e, 0xb6, 0xc7, 0xb7, 0xc4, 0xcc, 0x7e, 0x2f, 0xc, + 0xf5, 0x25, 0x7e, 0x15, 0x44, 0x1c, 0xaf, 0x3e, 0x71, 0xfc, 0x6d, + 0xf0, 0x3e, 0xf7, 0x63, 0xda, 0x52, 0x67, 0x44, 0x2f, 0x58}, + {0xe4, 0x4c, 0x32, 0x20, 0xd3, 0x7b, 0x31, 0xc6, 0xc4, 0x8b, 0x48, + 0xa4, 0xe8, 0x42, 0x10, 0xa8, 0x64, 0x13, 0x5a, 0x4e, 0x8b, 0xf1, + 0x1e, 0xb2, 0xc9, 0x8d, 0xa2, 0xcd, 0x4b, 0x1c, 0x2a, 0xc}, + }, + { + {0x45, 0x69, 0xbd, 0x69, 0x48, 0x81, 0xc4, 0xed, 0x22, 0x8d, 0x1c, + 0xbe, 0x7d, 0x90, 0x6d, 0xd, 0xab, 0xc5, 0x5c, 0xd5, 0x12, 0xd2, + 0x3b, 0xc6, 0x83, 0xdc, 0x14, 0xa3, 0x30, 0x9b, 0x6a, 0x5a}, + {0x47, 0x4, 0x1f, 0x6f, 0xd0, 0xc7, 0x4d, 0xd2, 0x59, 
0xc0, 0x87, + 0xdb, 0x3e, 0x9e, 0x26, 0xb2, 0x8f, 0xd2, 0xb2, 0xfb, 0x72, 0x2, + 0x5b, 0xd1, 0x77, 0x48, 0xf6, 0xc6, 0xd1, 0x8b, 0x55, 0x7c}, + {0x3d, 0x46, 0x96, 0xd3, 0x24, 0x15, 0xec, 0xd0, 0xf0, 0x24, 0x5a, + 0xc3, 0x8a, 0x62, 0xbb, 0x12, 0xa4, 0x5f, 0xbc, 0x1c, 0x79, 0x3a, + 0xc, 0xa5, 0xc3, 0xaf, 0xfb, 0xa, 0xca, 0xa5, 0x4, 0x4}, + }, + { + {0xd1, 0x6f, 0x41, 0x2a, 0x1b, 0x9e, 0xbc, 0x62, 0x8b, 0x59, 0x50, + 0xe3, 0x28, 0xf7, 0xc6, 0xb5, 0x67, 0x69, 0x5d, 0x3d, 0xd8, 0x3f, + 0x34, 0x4, 0x98, 0xee, 0xf8, 0xe7, 0x16, 0x75, 0x52, 0x39}, + {0xd6, 0x43, 0xa7, 0xa, 0x7, 0x40, 0x1f, 0x8c, 0xe8, 0x5e, 0x26, + 0x5b, 0xcb, 0xd0, 0xba, 0xcc, 0xde, 0xd2, 0x8f, 0x66, 0x6b, 0x4, + 0x4b, 0x57, 0x33, 0x96, 0xdd, 0xca, 0xfd, 0x5b, 0x39, 0x46}, + {0x9c, 0x9a, 0x5d, 0x1a, 0x2d, 0xdb, 0x7f, 0x11, 0x2a, 0x5c, 0x0, + 0xd1, 0xbc, 0x45, 0x77, 0x9c, 0xea, 0x6f, 0xd5, 0x54, 0xf1, 0xbe, + 0xd4, 0xef, 0x16, 0xd0, 0x22, 0xe8, 0x29, 0x9a, 0x57, 0x76}, + }, + { + {0xf2, 0x34, 0xb4, 0x52, 0x13, 0xb5, 0x3c, 0x33, 0xe1, 0x80, 0xde, + 0x93, 0x49, 0x28, 0x32, 0xd8, 0xce, 0x35, 0xd, 0x75, 0x87, 0x28, + 0x51, 0xb5, 0xc1, 0x77, 0x27, 0x2a, 0xbb, 0x14, 0xc5, 0x2}, + {0x17, 0x2a, 0xc0, 0x49, 0x7e, 0x8e, 0xb6, 0x45, 0x7f, 0xa3, 0xa9, + 0xbc, 0xa2, 0x51, 0xcd, 0x23, 0x1b, 0x4c, 0x22, 0xec, 0x11, 0x5f, + 0xd6, 0x3e, 0xb1, 0xbd, 0x5, 0x9e, 0xdc, 0x84, 0xa3, 0x43}, + {0x45, 0xb6, 0xf1, 0x8b, 0xda, 0xd5, 0x4b, 0x68, 0x53, 0x4b, 0xb5, + 0xf6, 0x7e, 0xd3, 0x8b, 0xfb, 0x53, 0xd2, 0xb0, 0xa9, 0xd7, 0x16, + 0x39, 0x31, 0x59, 0x80, 0x54, 0x61, 0x9, 0x92, 0x60, 0x11}, + }, + { + {0xcd, 0x4d, 0x9b, 0x36, 0x16, 0x56, 0x38, 0x7a, 0x63, 0x35, 0x5c, + 0x65, 0xa7, 0x2c, 0xc0, 0x75, 0x21, 0x80, 0xf1, 0xd4, 0xf9, 0x1b, + 0xc2, 0x7d, 0x42, 0xe0, 0xe6, 0x91, 0x74, 0x7d, 0x63, 0x2f}, + {0xaa, 0xcf, 0xda, 0x29, 0x69, 0x16, 0x4d, 0xb4, 0x8f, 0x59, 0x13, + 0x84, 0x4c, 0x9f, 0x52, 0xda, 0x59, 0x55, 0x3d, 0x45, 0xca, 0x63, + 0xef, 0xe9, 0xb, 0x8e, 0x69, 0xc5, 0x5b, 0x12, 0x1e, 0x35}, + {0xbe, 0x7b, 0xf6, 0x1a, 0x46, 0x9b, 0xb4, 
0xd4, 0x61, 0x89, 0xab, + 0xc8, 0x7a, 0x3, 0x3, 0xd6, 0xfb, 0x99, 0xa6, 0xf9, 0x9f, 0xe1, + 0xde, 0x71, 0x9a, 0x2a, 0xce, 0xe7, 0x6, 0x2d, 0x18, 0x7f}, + }, + { + {0x22, 0x75, 0x21, 0x8e, 0x72, 0x4b, 0x45, 0x9, 0xd8, 0xb8, 0x84, + 0xd4, 0xf4, 0xe8, 0x58, 0xaa, 0x3c, 0x90, 0x46, 0x7f, 0x4d, 0x25, + 0x58, 0xd3, 0x17, 0x52, 0x1c, 0x24, 0x43, 0xc0, 0xac, 0x44}, + {0xec, 0x68, 0x1, 0xab, 0x64, 0x8e, 0x7c, 0x7a, 0x43, 0xc5, 0xed, + 0x15, 0x55, 0x4a, 0x5a, 0xcb, 0xda, 0xe, 0xcd, 0x47, 0xd3, 0x19, + 0x55, 0x9, 0xb0, 0x93, 0x3e, 0x34, 0x8c, 0xac, 0xd4, 0x67}, + {0x77, 0x57, 0x7a, 0x4f, 0xbb, 0x6b, 0x7d, 0x1c, 0xe1, 0x13, 0x83, + 0x91, 0xd4, 0xfe, 0x35, 0x8b, 0x84, 0x46, 0x6b, 0xc9, 0xc6, 0xa1, + 0xdc, 0x4a, 0xbd, 0x71, 0xad, 0x12, 0x83, 0x1c, 0x6d, 0x55}, + }, + { + {0x21, 0xe8, 0x1b, 0xb1, 0x56, 0x67, 0xf0, 0x81, 0xdd, 0xf3, 0xa3, + 0x10, 0x23, 0xf8, 0xaf, 0xf, 0x5d, 0x46, 0x99, 0x6a, 0x55, 0xd0, + 0xb2, 0xf8, 0x5, 0x7f, 0x8c, 0xcc, 0x38, 0xbe, 0x7a, 0x9}, + {0x82, 0x39, 0x8d, 0xc, 0xe3, 0x40, 0xef, 0x17, 0x34, 0xfa, 0xa3, + 0x15, 0x3e, 0x7, 0xf7, 0x31, 0x6e, 0x64, 0x73, 0x7, 0xcb, 0xf3, + 0x21, 0x4f, 0xff, 0x4e, 0x82, 0x1d, 0x6d, 0x6c, 0x6c, 0x74}, + {0xa4, 0x2d, 0xa5, 0x7e, 0x87, 0xc9, 0x49, 0xc, 0x43, 0x1d, 0xdc, + 0x9b, 0x55, 0x69, 0x43, 0x4c, 0xd2, 0xeb, 0xcc, 0xf7, 0x9, 0x38, + 0x2c, 0x2, 0xbd, 0x84, 0xee, 0x4b, 0xa3, 0x14, 0x7e, 0x57}, + }, + { + {0x2b, 0xd7, 0x4d, 0xbd, 0xbe, 0xce, 0xfe, 0x94, 0x11, 0x22, 0xf, + 0x6, 0xda, 0x4f, 0x6a, 0xf4, 0xff, 0xd1, 0xc8, 0xc0, 0x77, 0x59, + 0x4a, 0x12, 0x95, 0x92, 0x0, 0xfb, 0xb8, 0x4, 0x53, 0x70}, + {0xa, 0x3b, 0xa7, 0x61, 0xac, 0x68, 0xe2, 0xf0, 0xf5, 0xa5, 0x91, + 0x37, 0x10, 0xfa, 0xfa, 0xf2, 0xe9, 0x0, 0x6d, 0x6b, 0x82, 0x3e, + 0xe1, 0xc1, 0x42, 0x8f, 0xd7, 0x6f, 0xe9, 0x7e, 0xfa, 0x60}, + {0xc6, 0x6e, 0x29, 0x4d, 0x35, 0x1d, 0x3d, 0xb6, 0xd8, 0x31, 0xad, + 0x5f, 0x3e, 0x5, 0xc3, 0xf3, 0xec, 0x42, 0xbd, 0xb4, 0x8c, 0x95, + 0xb, 0x67, 0xfd, 0x53, 0x63, 0xa1, 0xc, 0x8e, 0x39, 0x21}, }, + }, + { { - {{ -#if 
defined(BORINGSSL_CURVE25519_64BIT) - 1982622644432056, 2014393600336956, 128909208804214, - 1617792623929191, 105294281913815 -#else - 41020600, 29543379, 50095164, 30016803, 60382070, 1920896, - 44787559, 24106988, 4535767, 1569007 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 980234343912898, 1712256739246056, 588935272190264, - 204298813091998, 841798321043288 -#else - 64853442, 14606629, 45416424, 25514613, 28430648, 8775819, - 36614302, 3044289, 31848280, 12543772 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 197561292938973, 454817274782871, 1963754960082318, - 2113372252160468, 971377527342673 -#else - 45080285, 2943892, 35251351, 6777305, 13784462, 29262229, - 39731668, 31491700, 7718481, 14474653 -#endif - }}, + {0x1, 0x56, 0xb7, 0xb4, 0xf9, 0xaa, 0x98, 0x27, 0x72, 0xad, 0x8d, + 0x5c, 0x13, 0x72, 0xac, 0x5e, 0x23, 0xa0, 0xb7, 0x61, 0x61, 0xaa, + 0xce, 0xd2, 0x4e, 0x7d, 0x8f, 0xe9, 0x84, 0xb2, 0xbf, 0x1b}, + {0xf3, 0x33, 0x2b, 0x38, 0x8a, 0x5, 0xf5, 0x89, 0xb4, 0xc0, 0x48, + 0xad, 0xb, 0xba, 0xe2, 0x5a, 0x6e, 0xb3, 0x3d, 0xa5, 0x3, 0xb5, + 0x93, 0x8f, 0xe6, 0x32, 0xa2, 0x95, 0x9d, 0xed, 0xa3, 0x5a}, + {0x61, 0x65, 0xd9, 0xc7, 0xe9, 0x77, 0x67, 0x65, 0x36, 0x80, 0xc7, + 0x72, 0x54, 0x12, 0x2b, 0xcb, 0xee, 0x6e, 0x50, 0xd9, 0x99, 0x32, + 0x5, 0x65, 0xcc, 0x57, 0x89, 0x5e, 0x4e, 0xe1, 0x7, 0x4a}, + }, + { + {0x9b, 0xa4, 0x77, 0xc4, 0xcd, 0x58, 0xb, 0x24, 0x17, 0xf0, 0x47, + 0x64, 0xde, 0xda, 0x38, 0xfd, 0xad, 0x6a, 0xc8, 0xa7, 0x32, 0x8d, + 0x92, 0x19, 0x81, 0xa0, 0xaf, 0x84, 0xed, 0x7a, 0xaf, 0x50}, + {0x99, 0xf9, 0xd, 0x98, 0xcb, 0x12, 0xe4, 0x4e, 0x71, 0xc7, 0x6e, + 0x3c, 0x6f, 0xd7, 0x15, 0xa3, 0xfd, 0x77, 0x5c, 0x92, 0xde, 0xed, + 0xa5, 0xbb, 0x2, 0x34, 0x31, 0x1d, 0x39, 0xac, 0xb, 0x3f}, + {0xe5, 0x5b, 0xf6, 0x15, 0x1, 0xde, 0x4f, 0x6e, 0xb2, 0x9, 0x61, + 0x21, 0x21, 0x26, 0x98, 0x29, 0xd9, 0xd6, 0xad, 0xb, 0x81, 0x5, + 0x2, 0x78, 0x6, 0xd0, 0xeb, 0xba, 0x16, 0xa3, 0x21, 0x19}, + }, + { + {0x8b, 0xc1, 0xf3, 0xd9, 0x9a, 
0xad, 0x5a, 0xd7, 0x9c, 0xc1, 0xb1, + 0x60, 0xef, 0xe, 0x6a, 0x56, 0xd9, 0xe, 0x5c, 0x25, 0xac, 0xb, + 0x9a, 0x3e, 0xf5, 0xc7, 0x62, 0xa0, 0xec, 0x9d, 0x4, 0x7b}, + {0xfc, 0x70, 0xb8, 0xdf, 0x7e, 0x2f, 0x42, 0x89, 0xbd, 0xb3, 0x76, + 0x4f, 0xeb, 0x6b, 0x29, 0x2c, 0xf7, 0x4d, 0xc2, 0x36, 0xd4, 0xf1, + 0x38, 0x7, 0xb0, 0xae, 0x73, 0xe2, 0x41, 0xdf, 0x58, 0x64}, + {0x83, 0x44, 0x44, 0x35, 0x7a, 0xe3, 0xcb, 0xdc, 0x93, 0xbe, 0xed, + 0xf, 0x33, 0x79, 0x88, 0x75, 0x87, 0xdd, 0xc5, 0x12, 0xc3, 0x4, + 0x60, 0x78, 0x64, 0xe, 0x95, 0xc2, 0xcb, 0xdc, 0x93, 0x60}, + }, + { + {0x4b, 0x3, 0x84, 0x60, 0xbe, 0xee, 0xde, 0x6b, 0x54, 0xb8, 0xf, + 0x78, 0xb6, 0xc2, 0x99, 0x31, 0x95, 0x6, 0x2d, 0xb6, 0xab, 0x76, + 0x33, 0x97, 0x90, 0x7d, 0x64, 0x8b, 0xc9, 0x80, 0x31, 0x6e}, + {0x6d, 0x70, 0xe0, 0x85, 0x85, 0x9a, 0xf3, 0x1f, 0x33, 0x39, 0xe7, + 0xb3, 0xd8, 0xa5, 0xd0, 0x36, 0x3b, 0x45, 0x8f, 0x71, 0xe1, 0xf2, + 0xb9, 0x43, 0x7c, 0xa9, 0x27, 0x48, 0x8, 0xea, 0xd1, 0x57}, + {0x71, 0xb0, 0x28, 0xa1, 0xe7, 0xb6, 0x7a, 0xee, 0xaa, 0x8b, 0xa8, + 0x93, 0x6d, 0x59, 0xc1, 0xa4, 0x30, 0x61, 0x21, 0xb2, 0x82, 0xde, + 0xb4, 0xf7, 0x18, 0xbd, 0x97, 0xdd, 0x9d, 0x99, 0x3e, 0x36}, + }, + { + {0xc6, 0xae, 0x4b, 0xe2, 0xdc, 0x48, 0x18, 0x2f, 0x60, 0xaf, 0xbc, + 0xba, 0x55, 0x72, 0x9b, 0x76, 0x31, 0xe9, 0xef, 0x3c, 0x6e, 0x3c, + 0xcb, 0x90, 0x55, 0xb3, 0xf9, 0xc6, 0x9b, 0x97, 0x1f, 0x23}, + {0xc4, 0x1f, 0xee, 0x35, 0xc1, 0x43, 0xa8, 0x96, 0xcf, 0xc8, 0xe4, + 0x8, 0x55, 0xb3, 0x6e, 0x97, 0x30, 0xd3, 0x8c, 0xb5, 0x1, 0x68, + 0x2f, 0xb4, 0x2b, 0x5, 0x3a, 0x69, 0x78, 0x9b, 0xee, 0x48}, + {0xc6, 0xf3, 0x2a, 0xcc, 0x4b, 0xde, 0x31, 0x5c, 0x1f, 0x8d, 0x20, + 0xfe, 0x30, 0xb0, 0x4b, 0xb0, 0x66, 0xb4, 0x4f, 0xc1, 0x9, 0x70, + 0x8d, 0xb7, 0x13, 0x24, 0x79, 0x8, 0x9b, 0xfa, 0x9b, 0x7}, + }, + { + {0x45, 0x42, 0xd5, 0xa2, 0x80, 0xed, 0xc9, 0xf3, 0x52, 0x39, 0xf6, + 0x77, 0x78, 0x8b, 0xa0, 0xa, 0x75, 0x54, 0x8, 0xd1, 0x63, 0xac, + 0x6d, 0xd7, 0x6b, 0x63, 0x70, 0x94, 0x15, 0xfb, 0xf4, 0x1e}, + {0xf4, 0xd, 0x30, 0xda, 
0x51, 0x3a, 0x90, 0xe3, 0xb0, 0x5a, 0xa9, + 0x3d, 0x23, 0x64, 0x39, 0x84, 0x80, 0x64, 0x35, 0xb, 0x2d, 0xf1, + 0x3c, 0xed, 0x94, 0x71, 0x81, 0x84, 0xf6, 0x77, 0x8c, 0x3}, + {0xec, 0x7b, 0x16, 0x5b, 0xe6, 0x5e, 0x4e, 0x85, 0xc2, 0xcd, 0xd0, + 0x96, 0x42, 0xa, 0x59, 0x59, 0x99, 0x21, 0x10, 0x98, 0x34, 0xdf, + 0xb2, 0x72, 0x56, 0xff, 0xb, 0x4a, 0x2a, 0xe9, 0x5e, 0x57}, + }, + { + {0x1, 0xd8, 0xa4, 0xa, 0x45, 0xbc, 0x46, 0x5d, 0xd8, 0xb9, 0x33, + 0xa5, 0x27, 0x12, 0xaf, 0xc3, 0xc2, 0x6, 0x89, 0x2b, 0x26, 0x3b, + 0x9e, 0x38, 0x1b, 0x58, 0x2f, 0x38, 0x7e, 0x1e, 0xa, 0x20}, + {0xcf, 0x2f, 0x18, 0x8a, 0x90, 0x80, 0xc0, 0xd4, 0xbd, 0x9d, 0x48, + 0x99, 0xc2, 0x70, 0xe1, 0x30, 0xde, 0x33, 0xf7, 0x52, 0x57, 0xbd, + 0xba, 0x5, 0x0, 0xfd, 0xd3, 0x2c, 0x11, 0xe7, 0xd4, 0x43}, + {0xc5, 0x3a, 0xf9, 0xea, 0x67, 0xb9, 0x8d, 0x51, 0xc0, 0x52, 0x66, + 0x5, 0x9b, 0x98, 0xbc, 0x71, 0xf5, 0x97, 0x71, 0x56, 0xd9, 0x85, + 0x2b, 0xfe, 0x38, 0x4e, 0x1e, 0x65, 0x52, 0xca, 0xe, 0x5}, + }, + { + {0xea, 0x68, 0xe6, 0x60, 0x76, 0x39, 0xac, 0x97, 0x97, 0xb4, 0x3a, + 0x15, 0xfe, 0xbb, 0x19, 0x9b, 0x9f, 0xa7, 0xec, 0x34, 0xb5, 0x79, + 0xb1, 0x4c, 0x57, 0xae, 0x31, 0xa1, 0x9f, 0xc0, 0x51, 0x61}, + {0x9c, 0xc, 0x3f, 0x45, 0xde, 0x1a, 0x43, 0xc3, 0x9b, 0x3b, 0x70, + 0xff, 0x5e, 0x4, 0xf5, 0xe9, 0x3d, 0x7b, 0x84, 0xed, 0xc9, 0x7a, + 0xd9, 0xfc, 0xc6, 0xf4, 0x58, 0x1c, 0xc2, 0xe6, 0xe, 0x4b}, + {0x96, 0x5d, 0xf0, 0xfd, 0xd, 0x5c, 0xf5, 0x3a, 0x7a, 0xee, 0xb4, + 0x2a, 0xe0, 0x2e, 0x26, 0xdd, 0x9, 0x17, 0x17, 0x12, 0x87, 0xbb, + 0xb2, 0x11, 0xb, 0x3, 0xf, 0x80, 0xfa, 0x24, 0xef, 0x1f}, }, + }, + { { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 164699448829328, 3127451757672, 1199504971548753, - 1766155447043652, 1899238924683527 -#else - 2385296, 2454213, 44477544, 46602, 62670929, 17874016, 656964, - 26317767, 24316167, 28300865 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 732262946680281, 1674412764227063, 2182456405662809, - 1350894754474250, 558458873295247 -#else - 13741529, 
10911568, 33875447, 24950694, 46931033, 32521134, - 33040650, 20129900, 46379407, 8321685 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2103305098582922, 1960809151316468, 715134605001343, - 1454892949167181, 40827143824949 -#else - 21060490, 31341688, 15712756, 29218333, 1639039, 10656336, - 23845965, 21679594, 57124405, 608371 -#endif - }}, + {0x86, 0x6b, 0x97, 0x30, 0xf5, 0xaf, 0xd2, 0x22, 0x4, 0x46, 0xd2, + 0xc2, 0x6, 0xb8, 0x90, 0x8d, 0xe5, 0xba, 0xe5, 0x4d, 0x6c, 0x89, + 0xa1, 0xdc, 0x17, 0xc, 0x34, 0xc8, 0xe6, 0x5f, 0x0, 0x28}, + {0x96, 0x31, 0xa7, 0x1a, 0xfb, 0x53, 0xd6, 0x37, 0x18, 0x64, 0xd7, + 0x3f, 0x30, 0x95, 0x94, 0xf, 0xb2, 0x17, 0x3a, 0xfb, 0x9, 0xb, + 0x20, 0xad, 0x3e, 0x61, 0xc8, 0x2f, 0x29, 0x49, 0x4d, 0x54}, + {0x88, 0x86, 0x52, 0x34, 0x9f, 0xba, 0xef, 0x6a, 0xa1, 0x7d, 0x10, + 0x25, 0x94, 0xff, 0x1b, 0x5c, 0x36, 0x4b, 0xd9, 0x66, 0xcd, 0xbb, + 0x5b, 0xf7, 0xfa, 0x6d, 0x31, 0xf, 0x93, 0x72, 0xe4, 0x72}, + }, + { + {0x27, 0x76, 0x2a, 0xd3, 0x35, 0xf6, 0xf3, 0x7, 0xf0, 0x66, 0x65, + 0x5f, 0x86, 0x4d, 0xaa, 0x7a, 0x50, 0x44, 0xd0, 0x28, 0x97, 0xe7, + 0x85, 0x3c, 0x38, 0x64, 0xe0, 0xf, 0x0, 0x7f, 0xee, 0x1f}, + {0x4f, 0x8, 0x81, 0x97, 0x8c, 0x20, 0x95, 0x26, 0xe1, 0xe, 0x45, + 0x23, 0xb, 0x2a, 0x50, 0xb1, 0x2, 0xde, 0xef, 0x3, 0xa6, 0xae, + 0x9d, 0xfd, 0x4c, 0xa3, 0x33, 0x27, 0x8c, 0x2e, 0x9d, 0x5a}, + {0xe5, 0xf7, 0xdb, 0x3, 0xda, 0x5, 0x53, 0x76, 0xbd, 0xcd, 0x34, + 0x14, 0x49, 0xf2, 0xda, 0xa4, 0xec, 0x88, 0x4a, 0xd2, 0xcd, 0xd5, + 0x4a, 0x7b, 0x43, 0x5, 0x4, 0xee, 0x51, 0x40, 0xf9, 0x0}, + }, + { + {0x53, 0x97, 0xaf, 0x7, 0xbb, 0x93, 0xef, 0xd7, 0xa7, 0x66, 0xb7, + 0x3d, 0xcf, 0xd0, 0x3e, 0x58, 0xc5, 0x1e, 0xb, 0x6e, 0xbf, 0x98, + 0x69, 0xce, 0x52, 0x4, 0xd4, 0x5d, 0xd2, 0xff, 0xb7, 0x47}, + {0xb2, 0x30, 0xd3, 0xc3, 0x23, 0x6b, 0x35, 0x8d, 0x6, 0x1b, 0x47, + 0xb0, 0x9b, 0x8b, 0x1c, 0xf2, 0x3c, 0xb8, 0x42, 0x6e, 0x6c, 0x31, + 0x6c, 0xb3, 0xd, 0xb1, 0xea, 0x8b, 0x7e, 0x9c, 0xd7, 0x7}, + {0x12, 0xdd, 0x8, 0xbc, 0x9c, 0xfb, 0xfb, 
0x87, 0x9b, 0xc2, 0xee, + 0xe1, 0x3a, 0x6b, 0x6, 0x8a, 0xbf, 0xc1, 0x1f, 0xdb, 0x2b, 0x24, + 0x57, 0xd, 0xb6, 0x4b, 0xa6, 0x5e, 0xa3, 0x20, 0x35, 0x1c}, + }, + { + {0x59, 0xc0, 0x6b, 0x21, 0x40, 0x6f, 0xa8, 0xcd, 0x7e, 0xd8, 0xbc, + 0x12, 0x1d, 0x23, 0xbb, 0x1f, 0x90, 0x9, 0xc7, 0x17, 0x9e, 0x6a, + 0x95, 0xb4, 0x55, 0x2e, 0xd1, 0x66, 0x3b, 0xc, 0x75, 0x38}, + {0x4a, 0xa3, 0xcb, 0xbc, 0xa6, 0x53, 0xd2, 0x80, 0x9b, 0x21, 0x38, + 0x38, 0xa1, 0xc3, 0x61, 0x3e, 0x96, 0xe3, 0x82, 0x98, 0x1, 0xb6, + 0xc3, 0x90, 0x6f, 0xe6, 0xe, 0x5d, 0x77, 0x5, 0x3d, 0x1c}, + {0x1a, 0xe5, 0x22, 0x94, 0x40, 0xf1, 0x2e, 0x69, 0x71, 0xf6, 0x5d, + 0x2b, 0x3c, 0xc7, 0xc0, 0xcb, 0x29, 0xe0, 0x4c, 0x74, 0xe7, 0x4f, + 0x1, 0x21, 0x7c, 0x48, 0x30, 0xd3, 0xc7, 0xe2, 0x21, 0x6}, + }, + { + {0xf3, 0xf0, 0xdb, 0xb0, 0x96, 0x17, 0xae, 0xb7, 0x96, 0xe1, 0x7c, + 0xe1, 0xb9, 0xaf, 0xdf, 0x54, 0xb4, 0xa3, 0xaa, 0xe9, 0x71, 0x30, + 0x92, 0x25, 0x9d, 0x2e, 0x0, 0xa1, 0x9c, 0x58, 0x8e, 0x5d}, + {0x8d, 0x83, 0x59, 0x82, 0xcc, 0x60, 0x98, 0xaf, 0xdc, 0x9a, 0x9f, + 0xc6, 0xc1, 0x48, 0xea, 0x90, 0x30, 0x1e, 0x58, 0x65, 0x37, 0x48, + 0x26, 0x65, 0xbc, 0xa5, 0xd3, 0x7b, 0x9, 0xd6, 0x7, 0x0}, + {0x4b, 0xa9, 0x42, 0x8, 0x95, 0x1d, 0xbf, 0xc0, 0x3e, 0x2e, 0x8f, + 0x58, 0x63, 0xc3, 0xd3, 0xb2, 0xef, 0xe2, 0x51, 0xbb, 0x38, 0x14, + 0x96, 0xa, 0x86, 0xbf, 0x1c, 0x3c, 0x78, 0xd7, 0x83, 0x15}, + }, + { + {0xc7, 0x28, 0x9d, 0xcc, 0x4, 0x47, 0x3, 0x90, 0x8f, 0xc5, 0x2c, + 0xf7, 0x9e, 0x67, 0x1b, 0x1d, 0x26, 0x87, 0x5b, 0xbe, 0x5f, 0x2b, + 0xe1, 0x16, 0xa, 0x58, 0xc5, 0x83, 0x4e, 0x6, 0x58, 0x49}, + {0xe1, 0x7a, 0xa2, 0x5d, 0xef, 0xa2, 0xee, 0xec, 0x74, 0x1, 0x67, + 0x55, 0x14, 0x3a, 0x7c, 0x59, 0x7a, 0x16, 0x9, 0x66, 0x12, 0x2a, + 0xa6, 0xc9, 0x70, 0x8f, 0xed, 0x81, 0x2e, 0x5f, 0x2a, 0x25}, + {0xd, 0xe8, 0x66, 0x50, 0x26, 0x94, 0x28, 0xd, 0x6b, 0x8c, 0x7c, + 0x30, 0x85, 0xf7, 0xc3, 0xfc, 0xfd, 0x12, 0x11, 0xc, 0x78, 0xda, + 0x53, 0x1b, 0x88, 0xb3, 0x43, 0xd8, 0xb, 0x17, 0x9c, 0x7}, + }, + { + {0x56, 0xd0, 0xd5, 0xc0, 0x50, 
0xcd, 0xd6, 0xcd, 0x3b, 0x57, 0x3, + 0xbb, 0x6d, 0x68, 0xf7, 0x9a, 0x48, 0xef, 0xc3, 0xf3, 0x3f, 0x72, + 0xa6, 0x3c, 0xcc, 0x8a, 0x7b, 0x31, 0xd7, 0xc0, 0x68, 0x67}, + {0xff, 0x6f, 0xfa, 0x64, 0xe4, 0xec, 0x6, 0x5, 0x23, 0xe5, 0x5, + 0x62, 0x1e, 0x43, 0xe3, 0xbe, 0x42, 0xea, 0xb8, 0x51, 0x24, 0x42, + 0x79, 0x35, 0x0, 0xfb, 0xc9, 0x4a, 0xe3, 0x5, 0xec, 0x6d}, + {0xb3, 0xc1, 0x55, 0xf1, 0xe5, 0x25, 0xb6, 0x94, 0x91, 0x7b, 0x7b, + 0x99, 0xa7, 0xf3, 0x7b, 0x41, 0x0, 0x26, 0x6b, 0x6d, 0xdc, 0xbd, + 0x2c, 0xc2, 0xf4, 0x52, 0xcd, 0xdd, 0x14, 0x5e, 0x44, 0x51}, + }, + { + {0x55, 0xa4, 0xbe, 0x2b, 0xab, 0x47, 0x31, 0x89, 0x29, 0x91, 0x7, + 0x92, 0x4f, 0xa2, 0x53, 0x8c, 0xa7, 0xf7, 0x30, 0xbe, 0x48, 0xf9, + 0x49, 0x4b, 0x3d, 0xd4, 0x4f, 0x6e, 0x8, 0x90, 0xe9, 0x12}, + {0x51, 0x49, 0x14, 0x3b, 0x4b, 0x2b, 0x50, 0x57, 0xb3, 0xbc, 0x4b, + 0x44, 0x6b, 0xff, 0x67, 0x8e, 0xdb, 0x85, 0x63, 0x16, 0x27, 0x69, + 0xbd, 0xb8, 0xc8, 0x95, 0x92, 0xe3, 0x31, 0x6f, 0x18, 0x13}, + {0x2e, 0xbb, 0xdf, 0x7f, 0xb3, 0x96, 0xc, 0xf1, 0xf9, 0xea, 0x1c, + 0x12, 0x5e, 0x93, 0x9a, 0x9f, 0x3f, 0x98, 0x5b, 0x3a, 0xc4, 0x36, + 0x11, 0xdf, 0xaf, 0x99, 0x3e, 0x5d, 0xf0, 0xe3, 0xb2, 0x77}, }, + }, + { { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1239289043050212, 1744654158124578, 758702410031698, - 1796762995074688, 1603056663766 -#else - 53436132, 18466845, 56219170, 25997372, 61071954, 11305546, - 1123968, 26773855, 27229398, 23887 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2232056027107988, 987343914584615, 2115594492994461, - 1819598072792159, 1119305654014850 -#else - 43864724, 33260226, 55364135, 14712570, 37643165, 31524814, - 12797023, 27114124, 65475458, 16678953 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 320153677847348, 939613871605645, 641883205761567, - 1930009789398224, 329165806634126 -#else - 37608244, 4770661, 51054477, 14001337, 7830047, 9564805, - 65600720, 28759386, 49939598, 4904952 -#endif - }}, + {0xa4, 0xb0, 0xdd, 0x12, 0x9c, 0x63, 0x98, 
0xd5, 0x6b, 0x86, 0x24, + 0xc0, 0x30, 0x9f, 0xd1, 0xa5, 0x60, 0xe4, 0xfc, 0x58, 0x3, 0x2f, + 0x7c, 0xd1, 0x8a, 0x5e, 0x9, 0x2e, 0x15, 0x95, 0xa1, 0x7}, + {0xde, 0xc4, 0x2e, 0x9c, 0xc5, 0xa9, 0x6f, 0x29, 0xcb, 0xf3, 0x84, + 0x4f, 0xbf, 0x61, 0x8b, 0xbc, 0x8, 0xf9, 0xa8, 0x17, 0xd9, 0x6, + 0x77, 0x1c, 0x5d, 0x25, 0xd3, 0x7a, 0xfc, 0x95, 0xb7, 0x63}, + {0xc8, 0x5f, 0x9e, 0x38, 0x2, 0x8f, 0x36, 0xa8, 0x3b, 0xe4, 0x8d, + 0xcf, 0x2, 0x3b, 0x43, 0x90, 0x43, 0x26, 0x41, 0xc5, 0x5d, 0xfd, + 0xa1, 0xaf, 0x37, 0x1, 0x2f, 0x3, 0x3d, 0xe8, 0x8f, 0x3e}, + }, + { + {0x3c, 0xd1, 0xef, 0xe8, 0x8d, 0x4c, 0x70, 0x8, 0x31, 0x37, 0xe0, + 0x33, 0x8e, 0x1a, 0xc5, 0xdf, 0xe3, 0xcd, 0x60, 0x12, 0xa5, 0x5d, + 0x9d, 0xa5, 0x86, 0x8c, 0x25, 0xa6, 0x99, 0x8, 0xd6, 0x22}, + {0x94, 0xa2, 0x70, 0x5, 0xb9, 0x15, 0x8b, 0x2f, 0x49, 0x45, 0x8, + 0x67, 0x70, 0x42, 0xf2, 0x94, 0x84, 0xfd, 0xbb, 0x61, 0xe1, 0x5a, + 0x1c, 0xde, 0x7, 0x40, 0xac, 0x7f, 0x79, 0x3b, 0xba, 0x75}, + {0x96, 0xd1, 0xcd, 0x70, 0xc0, 0xdb, 0x39, 0x62, 0x9a, 0x8a, 0x7d, + 0x6c, 0x8b, 0x8a, 0xfe, 0x60, 0x60, 0x12, 0x40, 0xeb, 0xbc, 0x47, + 0x88, 0xb3, 0x5e, 0x9e, 0x77, 0x87, 0x7b, 0xd0, 0x4, 0x9}, + }, + { + {0xb9, 0x40, 0xf9, 0x48, 0x66, 0x2d, 0x32, 0xf4, 0x39, 0xc, 0x2d, + 0xbd, 0xc, 0x2f, 0x95, 0x6, 0x31, 0xf9, 0x81, 0xa0, 0xad, 0x97, + 0x76, 0x16, 0x6c, 0x2a, 0xf7, 0xba, 0xce, 0xaa, 0x40, 0x62}, + {0x9c, 0x91, 0xba, 0xdd, 0xd4, 0x1f, 0xce, 0xb4, 0xaa, 0x8d, 0x4c, + 0xc7, 0x3e, 0xdb, 0x31, 0xcf, 0x51, 0xcc, 0x86, 0xad, 0x63, 0xcc, + 0x63, 0x2c, 0x7, 0xde, 0x1d, 0xbc, 0x3f, 0x14, 0xe2, 0x43}, + {0xa0, 0x95, 0xa2, 0x5b, 0x9c, 0x74, 0x34, 0xf8, 0x5a, 0xd2, 0x37, + 0xca, 0x5b, 0x7c, 0x94, 0xd6, 0x6a, 0x31, 0xc9, 0xe7, 0xa7, 0x3b, + 0xf1, 0x66, 0xac, 0xc, 0xb4, 0x8d, 0x23, 0xaf, 0xbd, 0x56}, + }, + { + {0xb2, 0x3b, 0x9d, 0xc1, 0x6c, 0xd3, 0x10, 0x13, 0xb9, 0x86, 0x23, + 0x62, 0xb7, 0x6b, 0x2a, 0x6, 0x5c, 0x4f, 0xa1, 0xd7, 0x91, 0x85, + 0x9b, 0x7c, 0x54, 0x57, 0x1e, 0x7e, 0x50, 0x31, 0xaa, 0x3}, + {0xeb, 0x33, 0x35, 0xf5, 0xe3, 0xb9, 
0x2a, 0x36, 0x40, 0x3d, 0xb9, + 0x6e, 0xd5, 0x68, 0x85, 0x33, 0x72, 0x55, 0x5a, 0x1d, 0x52, 0x14, + 0xe, 0x9e, 0x18, 0x13, 0x74, 0x83, 0x6d, 0xa8, 0x24, 0x1d}, + {0x1f, 0xce, 0xd4, 0xff, 0x48, 0x76, 0xec, 0xf4, 0x1c, 0x8c, 0xac, + 0x54, 0xf0, 0xea, 0x45, 0xe0, 0x7c, 0x35, 0x9, 0x1d, 0x82, 0x25, + 0xd2, 0x88, 0x59, 0x48, 0xeb, 0x9a, 0xdc, 0x61, 0xb2, 0x43}, + }, + { + {0x64, 0x13, 0x95, 0x6c, 0x8b, 0x3d, 0x51, 0x19, 0x7b, 0xf4, 0xb, + 0x0, 0x26, 0x71, 0xfe, 0x94, 0x67, 0x95, 0x4f, 0xd5, 0xdd, 0x10, + 0x8d, 0x2, 0x64, 0x9, 0x94, 0x42, 0xe2, 0xd5, 0xb4, 0x2}, + {0xbb, 0x79, 0xbb, 0x88, 0x19, 0x1e, 0x5b, 0xe5, 0x9d, 0x35, 0x7a, + 0xc1, 0x7d, 0xd0, 0x9e, 0xa0, 0x33, 0xea, 0x3d, 0x60, 0xe2, 0x2e, + 0x2c, 0xb0, 0xc2, 0x6b, 0x27, 0x5b, 0xcf, 0x55, 0x60, 0x32}, + {0xf2, 0x8d, 0xd1, 0x28, 0xcb, 0x55, 0xa1, 0xb4, 0x8, 0xe5, 0x6c, + 0x18, 0x46, 0x46, 0xcc, 0xea, 0x89, 0x43, 0x82, 0x6c, 0x93, 0xf4, + 0x9c, 0xc4, 0x10, 0x34, 0x5d, 0xae, 0x9, 0xc8, 0xa6, 0x27}, + }, + { + {0x54, 0x69, 0x3d, 0xc4, 0xa, 0x27, 0x2c, 0xcd, 0xb2, 0xca, 0x66, + 0x6a, 0x57, 0x3e, 0x4a, 0xdd, 0x6c, 0x3, 0xd7, 0x69, 0x24, 0x59, + 0xfa, 0x79, 0x99, 0x25, 0x8c, 0x3d, 0x60, 0x3, 0x15, 0x22}, + {0x88, 0xb1, 0xd, 0x1f, 0xcd, 0xeb, 0xa6, 0x8b, 0xe8, 0x5b, 0x5a, + 0x67, 0x3a, 0xd7, 0xd3, 0x37, 0x5a, 0x58, 0xf5, 0x15, 0xa3, 0xdf, + 0x2e, 0xf2, 0x7e, 0xa1, 0x60, 0xff, 0x74, 0x71, 0xb6, 0x2c}, + {0xd0, 0xe1, 0xb, 0x39, 0xf9, 0xcd, 0xee, 0x59, 0xf1, 0xe3, 0x8c, + 0x72, 0x44, 0x20, 0x42, 0xa9, 0xf4, 0xf0, 0x94, 0x7a, 0x66, 0x1c, + 0x89, 0x82, 0x36, 0xf4, 0x90, 0x38, 0xb7, 0xf4, 0x1d, 0x7b}, + }, + { + {0x8c, 0xf5, 0xf8, 0x7, 0x18, 0x22, 0x2e, 0x5f, 0xd4, 0x9, 0x94, + 0xd4, 0x9f, 0x5c, 0x55, 0xe3, 0x30, 0xa6, 0xb6, 0x1f, 0x8d, 0xa8, + 0xaa, 0xb2, 0x3d, 0xe0, 0x52, 0xd3, 0x45, 0x82, 0x69, 0x68}, + {0x24, 0xa2, 0xb2, 0xb3, 0xe0, 0xf2, 0x92, 0xe4, 0x60, 0x11, 0x55, + 0x2b, 0x6, 0x9e, 0x6c, 0x7c, 0xe, 0x7b, 0x7f, 0xd, 0xe2, 0x8f, + 0xeb, 0x15, 0x92, 0x59, 0xfc, 0x58, 0x26, 0xef, 0xfc, 0x61}, + {0x7a, 0x18, 0x18, 0x2a, 
0x85, 0x5d, 0xb1, 0xdb, 0xd7, 0xac, 0xdd, + 0x86, 0xd3, 0xaa, 0xe4, 0xf3, 0x82, 0xc4, 0xf6, 0xf, 0x81, 0xe2, + 0xba, 0x44, 0xcf, 0x1, 0xaf, 0x3d, 0x47, 0x4c, 0xcf, 0x46}, + }, + { + {0x40, 0x81, 0x49, 0xf1, 0xa7, 0x6e, 0x3c, 0x21, 0x54, 0x48, 0x2b, + 0x39, 0xf8, 0x7e, 0x1e, 0x7c, 0xba, 0xce, 0x29, 0x56, 0x8c, 0xc3, + 0x88, 0x24, 0xbb, 0xc5, 0x8c, 0xd, 0xe5, 0xaa, 0x65, 0x10}, + {0xf9, 0xe5, 0xc4, 0x9e, 0xed, 0x25, 0x65, 0x42, 0x3, 0x33, 0x90, + 0x16, 0x1, 0xda, 0x5e, 0xe, 0xdc, 0xca, 0xe5, 0xcb, 0xf2, 0xa7, + 0xb1, 0x72, 0x40, 0x5f, 0xeb, 0x14, 0xcd, 0x7b, 0x38, 0x29}, + {0x57, 0xd, 0x20, 0xdf, 0x25, 0x45, 0x2c, 0x1c, 0x4a, 0x67, 0xca, + 0xbf, 0xd6, 0x2d, 0x3b, 0x5c, 0x30, 0x40, 0x83, 0xe1, 0xb1, 0xe7, + 0x7, 0xa, 0x16, 0xe7, 0x1c, 0x4f, 0xe6, 0x98, 0xa1, 0x69}, }, + }, + { { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 980930490474130, 1242488692177893, 1251446316964684, - 1086618677993530, 1961430968465772 -#else - 24059538, 14617003, 19037157, 18514524, 19766092, 18648003, - 5169210, 16191880, 2128236, 29227599 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 276821765317453, 1536835591188030, 1305212741412361, - 61473904210175, 2051377036983058 -#else - 50127693, 4124965, 58568254, 22900634, 30336521, 19449185, - 37302527, 916032, 60226322, 30567899 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 833449923882501, 1750270368490475, 1123347002068295, - 185477424765687, 278090826653186 -#else - 44477957, 12419371, 59974635, 26081060, 50629959, 16739174, - 285431, 2763829, 15736322, 4143876 -#endif - }}, + {0xed, 0xca, 0xc5, 0xdc, 0x34, 0x44, 0x1, 0xe1, 0x33, 0xfb, 0x84, + 0x3c, 0x96, 0x5d, 0xed, 0x47, 0xe7, 0xa0, 0x86, 0xed, 0x76, 0x95, + 0x1, 0x70, 0xe4, 0xf9, 0x67, 0xd2, 0x7b, 0x69, 0xb2, 0x25}, + {0xbc, 0x78, 0x1a, 0xd9, 0xe0, 0xb2, 0x62, 0x90, 0x67, 0x96, 0x50, + 0xc8, 0x9c, 0x88, 0xc9, 0x47, 0xb8, 0x70, 0x50, 0x40, 0x66, 0x4a, + 0xf5, 0x9d, 0xbf, 0xa1, 0x93, 0x24, 0xa9, 0xe6, 0x69, 0x73}, + {0x64, 0x68, 0x98, 0x13, 0xfb, 0x3f, 
0x67, 0x9d, 0xb8, 0xc7, 0x5d, + 0x41, 0xd9, 0xfb, 0xa5, 0x3c, 0x5e, 0x3b, 0x27, 0xdf, 0x3b, 0xcc, + 0x4e, 0xe0, 0xd2, 0x4c, 0x4e, 0xb5, 0x3d, 0x68, 0x20, 0x14}, + }, + { + {0xd0, 0x5a, 0xcc, 0xc1, 0x6f, 0xbb, 0xee, 0x34, 0x8b, 0xac, 0x46, + 0x96, 0xe9, 0xc, 0x1b, 0x6a, 0x53, 0xde, 0x6b, 0xa6, 0x49, 0xda, + 0xb0, 0xd3, 0xc1, 0x81, 0xd0, 0x61, 0x41, 0x3b, 0xe8, 0x31}, + {0x97, 0xd1, 0x9d, 0x24, 0x1e, 0xbd, 0x78, 0xb4, 0x2, 0xc1, 0x58, + 0x5e, 0x0, 0x35, 0xc, 0x62, 0x5c, 0xac, 0xba, 0xcc, 0x2f, 0xd3, + 0x2, 0xfb, 0x2d, 0xa7, 0x8, 0xf5, 0xeb, 0x3b, 0xb6, 0x60}, + {0x4f, 0x2b, 0x6, 0x9e, 0x12, 0xc7, 0xe8, 0x97, 0xd8, 0xa, 0x32, + 0x29, 0x4f, 0x8f, 0xe4, 0x49, 0x3f, 0x68, 0x18, 0x6f, 0x4b, 0xe1, + 0xec, 0x5b, 0x17, 0x3, 0x55, 0x2d, 0xb6, 0x1e, 0xcf, 0x55}, + }, + { + {0x52, 0x8c, 0xf5, 0x7d, 0xe3, 0xb5, 0x76, 0x30, 0x36, 0xcc, 0x99, + 0xe7, 0xdd, 0xb9, 0x3a, 0xd7, 0x20, 0xee, 0x13, 0x49, 0xe3, 0x1c, + 0x83, 0xbd, 0x33, 0x1, 0xba, 0x62, 0xaa, 0xfb, 0x56, 0x1a}, + {0x58, 0x3d, 0xc2, 0x65, 0x10, 0x10, 0x79, 0x58, 0x9c, 0x81, 0x94, + 0x50, 0x6d, 0x8, 0x9d, 0x8b, 0xa7, 0x5f, 0xc5, 0x12, 0xa9, 0x2f, + 0x40, 0xe2, 0xd4, 0x91, 0x8, 0x57, 0x64, 0x65, 0x9a, 0x66}, + {0xec, 0xc9, 0x9d, 0x5c, 0x50, 0x6b, 0x3e, 0x94, 0x1a, 0x37, 0x7c, + 0xa7, 0xbb, 0x57, 0x25, 0x30, 0x51, 0x76, 0x34, 0x41, 0x56, 0xae, + 0x73, 0x98, 0x5c, 0x8a, 0xc5, 0x99, 0x67, 0x83, 0xc4, 0x13}, + }, + { + {0x80, 0xd0, 0x8b, 0x5d, 0x6a, 0xfb, 0xdc, 0xc4, 0x42, 0x48, 0x1a, + 0x57, 0xec, 0xc4, 0xeb, 0xde, 0x65, 0x53, 0xe5, 0xb8, 0x83, 0xe8, + 0xb2, 0xd4, 0x27, 0xb8, 0xe5, 0xc8, 0x7d, 0xc8, 0xbd, 0x50}, + {0xb9, 0xe1, 0xb3, 0x5a, 0x46, 0x5d, 0x3a, 0x42, 0x61, 0x3f, 0xf1, + 0xc7, 0x87, 0xc1, 0x13, 0xfc, 0xb6, 0xb9, 0xb5, 0xec, 0x64, 0x36, + 0xf8, 0x19, 0x7, 0xb6, 0x37, 0xa6, 0x93, 0xc, 0xf8, 0x66}, + {0x11, 0xe1, 0xdf, 0x6e, 0x83, 0x37, 0x6d, 0x60, 0xd9, 0xab, 0x11, + 0xf0, 0x15, 0x3e, 0x35, 0x32, 0x96, 0x3b, 0xb7, 0x25, 0xc3, 0x3a, + 0xb0, 0x64, 0xae, 0xd5, 0x5f, 0x72, 0x44, 0x64, 0xd5, 0x1d}, + }, + { + {0x9a, 0xc8, 
0xba, 0x8, 0x0, 0xe6, 0x97, 0xc2, 0xe0, 0xc3, 0xe1, + 0xea, 0x11, 0xea, 0x4c, 0x7d, 0x7c, 0x97, 0xe7, 0x9f, 0xe1, 0x8b, + 0xe3, 0xf3, 0xcd, 0x5, 0xa3, 0x63, 0xf, 0x45, 0x3a, 0x3a}, + {0x7d, 0x12, 0x62, 0x33, 0xf8, 0x7f, 0xa4, 0x8f, 0x15, 0x7c, 0xcd, + 0x71, 0xc4, 0x6a, 0x9f, 0xbc, 0x8b, 0xc, 0x22, 0x49, 0x43, 0x45, + 0x71, 0x6e, 0x2e, 0x73, 0x9f, 0x21, 0x12, 0x59, 0x64, 0xe}, + {0x27, 0x46, 0x39, 0xd8, 0x31, 0x2f, 0x8f, 0x7, 0x10, 0xa5, 0x94, + 0xde, 0x83, 0x31, 0x9d, 0x38, 0x80, 0x6f, 0x99, 0x17, 0x6d, 0x6c, + 0xe3, 0xd1, 0x7b, 0xa8, 0xa9, 0x93, 0x93, 0x8d, 0x8c, 0x31}, + }, + { + {0x98, 0xd3, 0x1d, 0xab, 0x29, 0x9e, 0x66, 0x5d, 0x3b, 0x9e, 0x2d, + 0x34, 0x58, 0x16, 0x92, 0xfc, 0xcd, 0x73, 0x59, 0xf3, 0xfd, 0x1d, + 0x85, 0x55, 0xf6, 0xa, 0x95, 0x25, 0xc3, 0x41, 0x9a, 0x50}, + {0x19, 0xfe, 0xff, 0x2a, 0x3, 0x5d, 0x74, 0xf2, 0x66, 0xdb, 0x24, + 0x7f, 0x49, 0x3c, 0x9f, 0xc, 0xef, 0x98, 0x85, 0xba, 0xe3, 0xd3, + 0x98, 0xbc, 0x14, 0x53, 0x1d, 0x9a, 0x67, 0x7c, 0x4c, 0x22}, + {0xe9, 0x25, 0xf9, 0xa6, 0xdc, 0x6e, 0xc0, 0xbd, 0x33, 0x1f, 0x1b, + 0x64, 0xf4, 0xf3, 0x3e, 0x79, 0x89, 0x3e, 0x83, 0x9d, 0x80, 0x12, + 0xec, 0x82, 0x89, 0x13, 0xa1, 0x28, 0x23, 0xf0, 0xbf, 0x5}, + }, + { + {0xe4, 0x12, 0xc5, 0xd, 0xdd, 0xa0, 0x81, 0x68, 0xfe, 0xfa, 0xa5, + 0x44, 0xc8, 0xd, 0xe7, 0x4f, 0x40, 0x52, 0x4a, 0x8f, 0x6b, 0x8e, + 0x74, 0x1f, 0xea, 0xa3, 0x1, 0xee, 0xcd, 0x77, 0x62, 0x57}, + {0xb, 0xe0, 0xca, 0x23, 0x70, 0x13, 0x32, 0x36, 0x59, 0xcf, 0xac, + 0xd1, 0xa, 0xcf, 0x4a, 0x54, 0x88, 0x1c, 0x1a, 0xd2, 0x49, 0x10, + 0x74, 0x96, 0xa7, 0x44, 0x2a, 0xfa, 0xc3, 0x8c, 0xb, 0x78}, + {0x5f, 0x30, 0x4f, 0x23, 0xbc, 0x8a, 0xf3, 0x1e, 0x8, 0xde, 0x5, + 0x14, 0xbd, 0x7f, 0x57, 0x9a, 0xd, 0x2a, 0xe6, 0x34, 0x14, 0xa5, + 0x82, 0x5e, 0xa1, 0xb7, 0x71, 0x62, 0x72, 0x18, 0xf4, 0x5f}, + }, + { + {0x40, 0x95, 0xb6, 0x13, 0xe8, 0x47, 0xdb, 0xe5, 0xe1, 0x10, 0x26, + 0x43, 0x3b, 0x2a, 0x5d, 0xf3, 0x76, 0x12, 0x78, 0x38, 0xe9, 0x26, + 0x1f, 0xac, 0x69, 0xcb, 0xa0, 0xa0, 0x8c, 0xdb, 0xd4, 0x29}, + 
{0x9d, 0xdb, 0x89, 0x17, 0xc, 0x8, 0x8e, 0x39, 0xf5, 0x78, 0xe7, + 0xf3, 0x25, 0x20, 0x60, 0xa7, 0x5d, 0x3, 0xbd, 0x6, 0x4c, 0x89, + 0x98, 0xfa, 0xbe, 0x66, 0xa9, 0x25, 0xdc, 0x3, 0x6a, 0x10}, + {0xd0, 0x53, 0x33, 0x33, 0xaf, 0xa, 0xad, 0xd9, 0xe5, 0x9, 0xd3, + 0xac, 0xa5, 0x9d, 0x66, 0x38, 0xf0, 0xf7, 0x88, 0xc8, 0x8a, 0x65, + 0x57, 0x3c, 0xfa, 0xbe, 0x2c, 0x5, 0x51, 0x8a, 0xb3, 0x4a}, }, + }, + { { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 794524995833413, 1849907304548286, 53348672473145, - 1272368559505217, 1147304168324779 -#else - 2379333, 11839345, 62998462, 27565766, 11274297, 794957, 212801, - 18959769, 23527083, 17096164 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1504846112759364, 1203096289004681, 562139421471418, - 274333017451844, 1284344053775441 -#else - 33431108, 22423954, 49269897, 17927531, 8909498, 8376530, - 34483524, 4087880, 51919953, 19138217 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 483048732424432, 2116063063343382, 30120189902313, - 292451576741007, 1156379271702225 -#else - 1767664, 7197987, 53903638, 31531796, 54017513, 448825, 5799055, - 4357868, 62334673, 17231393 -#endif - }}, + {0x9c, 0xc0, 0xdd, 0x5f, 0xef, 0xd1, 0xcf, 0xd6, 0xce, 0x5d, 0x57, + 0xf7, 0xfd, 0x3e, 0x2b, 0xe8, 0xc2, 0x34, 0x16, 0x20, 0x5d, 0x6b, + 0xd5, 0x25, 0x9b, 0x2b, 0xed, 0x4, 0xbb, 0xc6, 0x41, 0x30}, + {0x93, 0xd5, 0x68, 0x67, 0x25, 0x2b, 0x7c, 0xda, 0x13, 0xca, 0x22, + 0x44, 0x57, 0xc0, 0xc1, 0x98, 0x1d, 0xce, 0xa, 0xca, 0xd5, 0xb, + 0xa8, 0xf1, 0x90, 0xa6, 0x88, 0xc0, 0xad, 0xd1, 0xcd, 0x29}, + {0x48, 0xe1, 0x56, 0xd9, 0xf9, 0xf2, 0xf2, 0xf, 0x2e, 0x6b, 0x35, + 0x9f, 0x75, 0x97, 0xe7, 0xad, 0x5c, 0x2, 0x6c, 0x5f, 0xbb, 0x98, + 0x46, 0x1a, 0x7b, 0x9a, 0x4, 0x14, 0x68, 0xbd, 0x4b, 0x10}, + }, + { + {0x63, 0xf1, 0x7f, 0xd6, 0x5f, 0x9a, 0x5d, 0xa9, 0x81, 0x56, 0xc7, + 0x4c, 0x9d, 0xe6, 0x2b, 0xe9, 0x57, 0xf2, 0x20, 0xde, 0x4c, 0x2, + 0xf8, 0xb7, 0xf5, 0x2d, 0x7, 0xfb, 0x20, 0x2a, 0x4f, 0x20}, + {0x67, 0xed, 0xf1, 0x68, 
0x31, 0xfd, 0xf0, 0x51, 0xc2, 0x3b, 0x6f, + 0xd8, 0xcd, 0x1d, 0x81, 0x2c, 0xde, 0xf2, 0xd2, 0x4, 0x43, 0x5c, + 0xdc, 0x44, 0x49, 0x71, 0x2a, 0x9, 0x57, 0xcc, 0xe8, 0x5b}, + {0x79, 0xb0, 0xeb, 0x30, 0x3d, 0x3b, 0x14, 0xc8, 0x30, 0x2e, 0x65, + 0xbd, 0x5a, 0x15, 0x89, 0x75, 0x31, 0x5c, 0x6d, 0x8f, 0x31, 0x3c, + 0x3c, 0x65, 0x1f, 0x16, 0x79, 0xc2, 0x17, 0xfb, 0x70, 0x25}, + }, + { + {0x5a, 0x24, 0xb8, 0xb, 0x55, 0xa9, 0x2e, 0x19, 0xd1, 0x50, 0x90, + 0x8f, 0xa8, 0xfb, 0xe6, 0xc8, 0x35, 0xc9, 0xa4, 0x88, 0x2d, 0xea, + 0x86, 0x79, 0x68, 0x86, 0x1, 0xde, 0x91, 0x5f, 0x1c, 0x24}, + {0x75, 0x15, 0xb6, 0x2c, 0x7f, 0x36, 0xfa, 0x3e, 0x6c, 0x2, 0xd6, + 0x1c, 0x76, 0x6f, 0xf9, 0xf5, 0x62, 0x25, 0xb5, 0x65, 0x2a, 0x14, + 0xc7, 0xe8, 0xcd, 0xa, 0x3, 0x53, 0xea, 0x65, 0xcb, 0x3d}, + {0xaa, 0x6c, 0xde, 0x40, 0x29, 0x17, 0xd8, 0x28, 0x3a, 0x73, 0xd9, + 0x22, 0xf0, 0x2c, 0xbf, 0x8f, 0xd1, 0x1, 0x5b, 0x23, 0xdd, 0xfc, + 0xd7, 0x16, 0xe5, 0xf0, 0xcd, 0x5f, 0xdd, 0xe, 0x42, 0x8}, + }, + { + {0xce, 0x10, 0xf4, 0x4, 0x4e, 0xc3, 0x58, 0x3, 0x85, 0x6, 0x6e, + 0x27, 0x5a, 0x5b, 0x13, 0xb6, 0x21, 0x15, 0xb9, 0xeb, 0xc7, 0x70, + 0x96, 0x5d, 0x9c, 0x88, 0xdb, 0x21, 0xf3, 0x54, 0xd6, 0x4}, + {0x4a, 0xfa, 0x62, 0x83, 0xab, 0x20, 0xff, 0xcd, 0x6e, 0x3e, 0x1a, + 0xe2, 0xd4, 0x18, 0xe1, 0x57, 0x2b, 0xe6, 0x39, 0xfc, 0x17, 0x96, + 0x17, 0xe3, 0xfd, 0x69, 0x17, 0xbc, 0xef, 0x53, 0x9a, 0xd}, + {0xd5, 0xb5, 0xbd, 0xdd, 0x16, 0xc1, 0x7d, 0x5e, 0x2d, 0xdd, 0xa5, + 0x8d, 0xb6, 0xde, 0x54, 0x29, 0x92, 0xa2, 0x34, 0x33, 0x17, 0x8, + 0xb6, 0x1c, 0xd7, 0x1a, 0x99, 0x18, 0x26, 0x4f, 0x7a, 0x4a}, + }, + { + {0x4b, 0x2a, 0x37, 0xaf, 0x91, 0xb2, 0xc3, 0x24, 0xf2, 0x47, 0x81, + 0x71, 0x70, 0x82, 0xda, 0x93, 0xf2, 0x9e, 0x89, 0x86, 0x64, 0x85, + 0x84, 0xdd, 0x33, 0xee, 0xe0, 0x23, 0x42, 0x31, 0x96, 0x4a}, + {0x95, 0x5f, 0xb1, 0x5f, 0x2, 0x18, 0xa7, 0xf4, 0x8f, 0x1b, 0x5c, + 0x6b, 0x34, 0x5f, 0xf6, 0x3d, 0x12, 0x11, 0xe0, 0x0, 0x85, 0xf0, + 0xfc, 0xcd, 0x48, 0x18, 0xd3, 0xdd, 0x4c, 0xc, 0xb5, 0x11}, + {0xd6, 0xff, 
0xa4, 0x8, 0x44, 0x27, 0xe8, 0xa6, 0xd9, 0x76, 0x15, + 0x9c, 0x7e, 0x17, 0x8e, 0x73, 0xf2, 0xb3, 0x2, 0x3d, 0xb6, 0x48, + 0x33, 0x77, 0x51, 0xcc, 0x6b, 0xce, 0x4d, 0xce, 0x4b, 0x4f}, + }, + { + {0x6f, 0xb, 0x9d, 0xc4, 0x6e, 0x61, 0xe2, 0x30, 0x17, 0x23, 0xec, + 0xca, 0x8f, 0x71, 0x56, 0xe4, 0xa6, 0x4f, 0x6b, 0xf2, 0x9b, 0x40, + 0xeb, 0x48, 0x37, 0x5f, 0x59, 0x61, 0xe5, 0xce, 0x42, 0x30}, + {0x84, 0x25, 0x24, 0xe2, 0x5a, 0xce, 0x1f, 0xa7, 0x9e, 0x8a, 0xf5, + 0x92, 0x56, 0x72, 0xea, 0x26, 0xf4, 0x3c, 0xea, 0x1c, 0xd7, 0x9, + 0x1a, 0xd2, 0xe6, 0x1, 0x1c, 0xb7, 0x14, 0xdd, 0xfc, 0x73}, + {0x41, 0xac, 0x9b, 0x44, 0x79, 0x70, 0x7e, 0x42, 0xa, 0x31, 0xe2, + 0xbc, 0x6d, 0xe3, 0x5a, 0x85, 0x7c, 0x1a, 0x84, 0x5f, 0x21, 0x76, + 0xae, 0x4c, 0xd6, 0xe1, 0x9c, 0x9a, 0xc, 0x74, 0x9e, 0x38}, + }, + { + {0x28, 0xac, 0xe, 0x57, 0xf6, 0x78, 0xbd, 0xc9, 0xe1, 0x9c, 0x91, + 0x27, 0x32, 0xb, 0x5b, 0xe5, 0xed, 0x91, 0x9b, 0xa1, 0xab, 0x3e, + 0xfc, 0x65, 0x90, 0x36, 0x26, 0xd6, 0xe5, 0x25, 0xc4, 0x25}, + {0xce, 0xb9, 0xdc, 0x34, 0xae, 0xb3, 0xfc, 0x64, 0xad, 0xd0, 0x48, + 0xe3, 0x23, 0x3, 0x50, 0x97, 0x1b, 0x38, 0xc6, 0x62, 0x7d, 0xf0, + 0xb3, 0x45, 0x88, 0x67, 0x5a, 0x46, 0x79, 0x53, 0x54, 0x61}, + {0x6e, 0xde, 0xd7, 0xf1, 0xa6, 0x6, 0x3e, 0x3f, 0x8, 0x23, 0x6, + 0x8e, 0x27, 0x76, 0xf9, 0x3e, 0x77, 0x6c, 0x8a, 0x4e, 0x26, 0xf6, + 0x14, 0x8c, 0x59, 0x47, 0x48, 0x15, 0x89, 0xa0, 0x39, 0x65}, + }, + { + {0x19, 0x4a, 0xbb, 0x14, 0xd4, 0xdb, 0xc4, 0xdd, 0x8e, 0x4f, 0x42, + 0x98, 0x3c, 0xbc, 0xb2, 0x19, 0x69, 0x71, 0xca, 0x36, 0xd7, 0x9f, + 0xa8, 0x48, 0x90, 0xbd, 0x19, 0xf0, 0xe, 0x32, 0x65, 0xf}, + {0x73, 0xf7, 0xd2, 0xc3, 0x74, 0x1f, 0xd2, 0xe9, 0x45, 0x68, 0xc4, + 0x25, 0x41, 0x54, 0x50, 0xc1, 0x33, 0x9e, 0xb9, 0xf9, 0xe8, 0x5c, + 0x4e, 0x62, 0x6c, 0x18, 0xcd, 0xc5, 0xaa, 0xe4, 0xc5, 0x11}, + {0xc6, 0xe0, 0xfd, 0xca, 0xb1, 0xd1, 0x86, 0xd4, 0x81, 0x51, 0x3b, + 0x16, 0xe3, 0xe6, 0x3f, 0x4f, 0x9a, 0x93, 0xf2, 0xfa, 0xd, 0xaf, + 0xa8, 0x59, 0x2a, 0x7, 0x33, 0xec, 0xbd, 0xc7, 0xab, 0x4c}, }, 
}, { { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 928372153029038, 2147692869914564, 1455665844462196, - 1986737809425946, 185207050258089 -#else - 6721966, 13833823, 43585476, 32003117, 26354292, 21691111, - 23365146, 29604700, 7390889, 2759800 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 137732961814206, 706670923917341, 1387038086865771, - 1965643813686352, 1384777115696347 -#else - 4409022, 2052381, 23373853, 10530217, 7676779, 20668478, - 21302352, 29290375, 1244379, 20634787 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 481144981981577, 2053319313589856, 2065402289827512, - 617954271490316, 1106602634668125 -#else - 62687625, 7169618, 4982368, 30596842, 30256824, 30776892, - 14086412, 9208236, 15886429, 16489664 -#endif - }}, + {0x89, 0xd2, 0x78, 0x3f, 0x8f, 0x78, 0x8f, 0xc0, 0x9f, 0x4d, 0x40, + 0xa1, 0x2c, 0xa7, 0x30, 0xfe, 0x9d, 0xcc, 0x65, 0xcf, 0xfc, 0x8b, + 0x77, 0xf2, 0x21, 0x20, 0xcb, 0x5a, 0x16, 0x98, 0xe4, 0x7e}, + {0x2e, 0xa, 0x9c, 0x8, 0x24, 0x96, 0x9e, 0x23, 0x38, 0x47, 0xfe, + 0x3a, 0xc0, 0xc4, 0x48, 0xc7, 0x2a, 0xa1, 0x4f, 0x76, 0x2a, 0xed, + 0xdb, 0x17, 0x82, 0x85, 0x1c, 0x32, 0xf0, 0x93, 0x9b, 0x63}, + {0xc3, 0xa1, 0x11, 0x91, 0xe3, 0x8, 0xd5, 0x7b, 0x89, 0x74, 0x90, + 0x80, 0xd4, 0x90, 0x2b, 0x2b, 0x19, 0xfd, 0x72, 0xae, 0xc2, 0xae, + 0xd2, 0xe7, 0xa6, 0x2, 0xb6, 0x85, 0x3c, 0x49, 0xdf, 0xe}, + }, + { + {0x13, 0x41, 0x76, 0x84, 0xd2, 0xc4, 0x67, 0x67, 0x35, 0xf8, 0xf5, + 0xf7, 0x3f, 0x40, 0x90, 0xa0, 0xde, 0xbe, 0xe6, 0xca, 0xfa, 0xcf, + 0x8f, 0x1c, 0x69, 0xa3, 0xdf, 0xd1, 0x54, 0xc, 0xc0, 0x4}, + {0x68, 0x5a, 0x9b, 0x59, 0x58, 0x81, 0xcc, 0xae, 0xe, 0xe2, 0xad, + 0xeb, 0xf, 0x4f, 0x57, 0xea, 0x7, 0x7f, 0xb6, 0x22, 0x74, 0x1d, + 0xe4, 0x4f, 0xb4, 0x4f, 0x9d, 0x1, 0xe3, 0x92, 0x3b, 0x40}, + {0xf8, 0x5c, 0x46, 0x8b, 0x81, 0x2f, 0xc2, 0x4d, 0xf8, 0xef, 0x80, + 0x14, 0x5a, 0xf3, 0xa0, 0x71, 0x57, 0xd6, 0xc7, 0x4, 0xad, 0xbf, + 0xe8, 0xae, 0xf4, 0x76, 0x61, 0xb2, 0x2a, 0xb1, 0x5b, 0x35}, + }, + { + {0x18, 
0x73, 0x8c, 0x5a, 0xc7, 0xda, 0x1, 0xa3, 0x11, 0xaa, 0xce, + 0xb3, 0x9d, 0x3, 0x90, 0xed, 0x2d, 0x3f, 0xae, 0x3b, 0xbf, 0x7c, + 0x7, 0x6f, 0x8e, 0xad, 0x52, 0xe0, 0xf8, 0xea, 0x18, 0x75}, + {0xf4, 0xbb, 0x93, 0x74, 0xcc, 0x64, 0x1e, 0xa7, 0xc3, 0xb0, 0xa3, + 0xec, 0xd9, 0x84, 0xbd, 0xe5, 0x85, 0xe7, 0x5, 0xfa, 0xc, 0xc5, + 0x6b, 0xa, 0x12, 0xc3, 0x2e, 0x18, 0x32, 0x81, 0x9b, 0xf}, + {0x32, 0x6c, 0x7f, 0x1b, 0xc4, 0x59, 0x88, 0xa4, 0x98, 0x32, 0x38, + 0xf4, 0xbc, 0x60, 0x2d, 0xf, 0xd9, 0xd1, 0xb1, 0xc9, 0x29, 0xa9, + 0x15, 0x18, 0xc4, 0x55, 0x17, 0xbb, 0x1b, 0x87, 0xc3, 0x47}, + }, + { + {0xb0, 0x66, 0x50, 0xc8, 0x50, 0x5d, 0xe6, 0xfb, 0xb0, 0x99, 0xa2, + 0xb3, 0xb0, 0xc4, 0xec, 0x62, 0xe0, 0xe8, 0x1a, 0x44, 0xea, 0x54, + 0x37, 0xe5, 0x5f, 0x8d, 0xd4, 0xe8, 0x2c, 0xa0, 0xfe, 0x8}, + {0x48, 0x4f, 0xec, 0x71, 0x97, 0x53, 0x44, 0x51, 0x6e, 0x5d, 0x8c, + 0xc9, 0x7d, 0xb1, 0x5, 0xf8, 0x6b, 0xc6, 0xc3, 0x47, 0x1a, 0xc1, + 0x62, 0xf7, 0xdc, 0x99, 0x46, 0x76, 0x85, 0x9b, 0xb8, 0x0}, + {0xd0, 0xea, 0xde, 0x68, 0x76, 0xdd, 0x4d, 0x82, 0x23, 0x5d, 0x68, + 0x4b, 0x20, 0x45, 0x64, 0xc8, 0x65, 0xd6, 0x89, 0x5d, 0xcd, 0xcf, + 0x14, 0xb5, 0x37, 0xd5, 0x75, 0x4f, 0xa7, 0x29, 0x38, 0x47}, + }, + { + {0xc9, 0x2, 0x39, 0xad, 0x3a, 0x53, 0xd9, 0x23, 0x8f, 0x58, 0x3, + 0xef, 0xce, 0xdd, 0xc2, 0x64, 0xb4, 0x2f, 0xe1, 0xcf, 0x90, 0x73, + 0x25, 0x15, 0x90, 0xd3, 0xe4, 0x44, 0x4d, 0x8b, 0x66, 0x6c}, + {0x18, 0xc4, 0x79, 0x46, 0x75, 0xda, 0xd2, 0x82, 0xf0, 0x8d, 0x61, + 0xb2, 0xd8, 0xd7, 0x3b, 0xe6, 0xa, 0xeb, 0x47, 0xac, 0x24, 0xef, + 0x5e, 0x35, 0xb4, 0xc6, 0x33, 0x48, 0x4c, 0x68, 0x78, 0x20}, + {0xc, 0x82, 0x78, 0x7a, 0x21, 0xcf, 0x48, 0x3b, 0x97, 0x3e, 0x27, + 0x81, 0xb2, 0xa, 0x6a, 0xf7, 0x7b, 0xed, 0x8e, 0x8c, 0xa7, 0x65, + 0x6c, 0xa9, 0x3f, 0x43, 0x8a, 0x4f, 0x5, 0xa6, 0x11, 0x74}, + }, + { + {0xb4, 0x75, 0xb1, 0x18, 0x3d, 0xe5, 0x9a, 0x57, 0x2, 0xa1, 0x92, + 0xf3, 0x59, 0x31, 0x71, 0x68, 0xf5, 0x35, 0xef, 0x1e, 0xba, 0xec, + 0x55, 0x84, 0x8f, 0x39, 0x8c, 0x45, 0x72, 0xa8, 0xc9, 0x1e}, 
+ {0x6d, 0xc8, 0x9d, 0xb9, 0x32, 0x9d, 0x65, 0x4d, 0x15, 0xf1, 0x3a, + 0x60, 0x75, 0xdc, 0x4c, 0x4, 0x88, 0xe4, 0xc2, 0xdc, 0x2c, 0x71, + 0x4c, 0xb3, 0xff, 0x34, 0x81, 0xfb, 0x74, 0x65, 0x13, 0x7c}, + {0x9b, 0x50, 0xa2, 0x0, 0xd4, 0xa4, 0xe6, 0xb8, 0xb4, 0x82, 0xc8, + 0xb, 0x2, 0xd7, 0x81, 0x9b, 0x61, 0x75, 0x95, 0xf1, 0x9b, 0xcc, + 0xe7, 0x57, 0x60, 0x64, 0xcd, 0xc7, 0xa5, 0x88, 0xdd, 0x3a}, + }, + { + {0x46, 0x30, 0x39, 0x59, 0xd4, 0x98, 0xc2, 0x85, 0xec, 0x59, 0xf6, + 0x5f, 0x98, 0x35, 0x7e, 0x8f, 0x3a, 0x6e, 0xf6, 0xf2, 0x2a, 0xa2, + 0x2c, 0x1d, 0x20, 0xa7, 0x6, 0xa4, 0x31, 0x11, 0xba, 0x61}, + {0xf2, 0xdc, 0x35, 0xb6, 0x70, 0x57, 0x89, 0xab, 0xbc, 0x1f, 0x6c, + 0xf6, 0x6c, 0xef, 0xdf, 0x2, 0x87, 0xd1, 0xb6, 0xbe, 0x68, 0x2, + 0x53, 0x85, 0x74, 0x9e, 0x87, 0xcc, 0xfc, 0x29, 0x99, 0x24}, + {0x29, 0x90, 0x95, 0x16, 0xf1, 0xa0, 0xd0, 0xa3, 0x89, 0xbd, 0x7e, + 0xba, 0x6c, 0x6b, 0x3b, 0x2, 0x7, 0x33, 0x78, 0x26, 0x3e, 0x5a, + 0xf1, 0x7b, 0xe7, 0xec, 0xd8, 0xbb, 0xc, 0x31, 0x20, 0x56}, + }, + { + {0xd6, 0x85, 0xe2, 0x77, 0xf4, 0xb5, 0x46, 0x66, 0x93, 0x61, 0x8f, + 0x6c, 0x67, 0xff, 0xe8, 0x40, 0xdd, 0x94, 0xb5, 0xab, 0x11, 0x73, + 0xec, 0xa6, 0x4d, 0xec, 0x8c, 0x65, 0xf3, 0x46, 0xc8, 0x7e}, + {0x43, 0xd6, 0x34, 0x49, 0x43, 0x93, 0x89, 0x52, 0xf5, 0x22, 0x12, + 0xa5, 0x6, 0xf8, 0xdb, 0xb9, 0x22, 0x1c, 0xf4, 0xc3, 0x8f, 0x87, + 0x6d, 0x8f, 0x30, 0x97, 0x9d, 0x4d, 0x2a, 0x6a, 0x67, 0x37}, + {0xc7, 0x2e, 0xa2, 0x1d, 0x3f, 0x8f, 0x5e, 0x9b, 0x13, 0xcd, 0x1, + 0x6c, 0x77, 0x1d, 0xf, 0x13, 0xb8, 0x9f, 0x98, 0xa2, 0xcf, 0x8f, + 0x4c, 0x21, 0xd5, 0x9d, 0x9b, 0x39, 0x23, 0xf7, 0xaa, 0x6d}, }, + }, + { { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 696298019648792, 893299659040895, 1148636718636009, - 26734077349617, 2203955659340681 -#else - 1996056, 10375649, 14346367, 13311202, 60234729, 17116020, - 53415665, 398368, 36502409, 32841498 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 657390353372855, 998499966885562, 991893336905797, - 810470207106761, 
343139804608786 -#else - 41801399, 9795879, 64331450, 14878808, 33577029, 14780362, - 13348553, 12076947, 36272402, 5113181 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 791736669492960, 934767652997115, 824656780392914, - 1759463253018643, 361530362383518 -#else - 49338080, 11797795, 31950843, 13929123, 41220562, 12288343, - 36767763, 26218045, 13847710, 5387222 -#endif - }}, + {0xa2, 0x8e, 0xad, 0xac, 0xbf, 0x4, 0x3b, 0x58, 0x84, 0xe8, 0x8b, + 0x14, 0xe8, 0x43, 0xb7, 0x29, 0xdb, 0xc5, 0x10, 0x8, 0x3b, 0x58, + 0x1e, 0x2b, 0xaa, 0xbb, 0xb3, 0x8e, 0xe5, 0x49, 0x54, 0x2b}, + {0x47, 0xbe, 0x3d, 0xeb, 0x62, 0x75, 0x3a, 0x5f, 0xb8, 0xa0, 0xbd, + 0x8e, 0x54, 0x38, 0xea, 0xf7, 0x99, 0x72, 0x74, 0x45, 0x31, 0xe5, + 0xc3, 0x0, 0x51, 0xd5, 0x27, 0x16, 0xe7, 0xe9, 0x4, 0x13}, + {0xfe, 0x9c, 0xdc, 0x6a, 0xd2, 0x14, 0x98, 0x78, 0xb, 0xdd, 0x48, + 0x8b, 0x3f, 0xab, 0x1b, 0x3c, 0xa, 0xc6, 0x79, 0xf9, 0xff, 0xe1, + 0xf, 0xda, 0x93, 0xd6, 0x2d, 0x7c, 0x2d, 0xde, 0x68, 0x44}, + }, + { + {0xce, 0x7, 0x63, 0xf8, 0xc6, 0xd8, 0x9a, 0x4b, 0x28, 0xc, 0x5d, + 0x43, 0x31, 0x35, 0x11, 0x21, 0x2c, 0x77, 0x7a, 0x65, 0xc5, 0x66, + 0xa8, 0xd4, 0x52, 0x73, 0x24, 0x63, 0x7e, 0x42, 0xa6, 0x5d}, + {0x9e, 0x46, 0x19, 0x94, 0x5e, 0x35, 0xbb, 0x51, 0x54, 0xc7, 0xdd, + 0x23, 0x4c, 0xdc, 0xe6, 0x33, 0x62, 0x99, 0x7f, 0x44, 0xd6, 0xb6, + 0xa5, 0x93, 0x63, 0xbd, 0x44, 0xfb, 0x6f, 0x7c, 0xce, 0x6c}, + {0xca, 0x22, 0xac, 0xde, 0x88, 0xc6, 0x94, 0x1a, 0xf8, 0x1f, 0xae, + 0xbb, 0xf7, 0x6e, 0x6, 0xb9, 0xf, 0x58, 0x59, 0x8d, 0x38, 0x8c, + 0xad, 0x88, 0xa8, 0x2c, 0x9f, 0xe7, 0xbf, 0x9a, 0xf2, 0x58}, + }, + { + {0xf6, 0xcd, 0xe, 0x71, 0xbf, 0x64, 0x5a, 0x4b, 0x3c, 0x29, 0x2c, + 0x46, 0x38, 0xe5, 0x4c, 0xb1, 0xb9, 0x3a, 0xb, 0xd5, 0x56, 0xd0, + 0x43, 0x36, 0x70, 0x48, 0x5b, 0x18, 0x24, 0x37, 0xf9, 0x6a}, + {0x68, 0x3e, 0xe7, 0x8d, 0xab, 0xcf, 0xe, 0xe9, 0xa5, 0x76, 0x7e, + 0x37, 0x9f, 0x6f, 0x3, 0x54, 0x82, 0x59, 0x1, 0xbe, 0xb, 0x5b, + 0x49, 0xf0, 0x36, 0x1e, 0xf4, 0xa7, 0xc4, 0x29, 0x76, 0x57}, + 
{0x88, 0xa8, 0xc6, 0x9, 0x45, 0x2, 0x20, 0x32, 0x73, 0x89, 0x55, + 0x4b, 0x13, 0x36, 0xe0, 0xd2, 0x9f, 0x28, 0x33, 0x3c, 0x23, 0x36, + 0xe2, 0x83, 0x8f, 0xc1, 0xae, 0xc, 0xbb, 0x25, 0x1f, 0x70}, + }, + { + {0x13, 0xc1, 0xbe, 0x7c, 0xd9, 0xf6, 0x18, 0x9d, 0xe4, 0xdb, 0xbf, + 0x74, 0xe6, 0x6, 0x4a, 0x84, 0xd6, 0x60, 0x4e, 0xac, 0x22, 0xb5, + 0xf5, 0x20, 0x51, 0x5e, 0x95, 0x50, 0xc0, 0x5b, 0xa, 0x72}, + {0xed, 0x6c, 0x61, 0xe4, 0xf8, 0xb0, 0xa8, 0xc3, 0x7d, 0xa8, 0x25, + 0x9e, 0xe, 0x66, 0x0, 0xf7, 0x9c, 0xa5, 0xbc, 0xf4, 0x1f, 0x6, + 0xe3, 0x61, 0xe9, 0xb, 0xc4, 0xbd, 0xbf, 0x92, 0xc, 0x2e}, + {0x35, 0x5a, 0x80, 0x9b, 0x43, 0x9, 0x3f, 0xc, 0xfc, 0xab, 0x42, + 0x62, 0x37, 0x8b, 0x4e, 0xe8, 0x46, 0x93, 0x22, 0x5c, 0xf3, 0x17, + 0x14, 0x69, 0xec, 0xf0, 0x4e, 0x14, 0xbb, 0x9c, 0x9b, 0xe}, + }, + { + {0xee, 0xbe, 0xb1, 0x5d, 0xd5, 0x9b, 0xee, 0x8d, 0xb9, 0x3f, 0x72, + 0xa, 0x37, 0xab, 0xc3, 0xc9, 0x91, 0xd7, 0x68, 0x1c, 0xbf, 0xf1, + 0xa8, 0x44, 0xde, 0x3c, 0xfd, 0x1c, 0x19, 0x44, 0x6d, 0x36}, + {0xad, 0x20, 0x57, 0xfb, 0x8f, 0xd4, 0xba, 0xfb, 0xe, 0xd, 0xf9, + 0xdb, 0x6b, 0x91, 0x81, 0xee, 0xbf, 0x43, 0x55, 0x63, 0x52, 0x31, + 0x81, 0xd4, 0xd8, 0x7b, 0x33, 0x3f, 0xeb, 0x4, 0x11, 0x22}, + {0x14, 0x8c, 0xbc, 0xf2, 0x43, 0x17, 0x3c, 0x9e, 0x3b, 0x6c, 0x85, + 0xb5, 0xfc, 0x26, 0xda, 0x2e, 0x97, 0xfb, 0xa7, 0x68, 0xe, 0x2f, + 0xb8, 0xcc, 0x44, 0x32, 0x59, 0xbc, 0xe6, 0xa4, 0x67, 0x41}, + }, + { + {0xee, 0x8f, 0xce, 0xf8, 0x65, 0x26, 0xbe, 0xc2, 0x2c, 0xd6, 0x80, + 0xe8, 0x14, 0xff, 0x67, 0xe9, 0xee, 0x4e, 0x36, 0x2f, 0x7e, 0x6e, + 0x2e, 0xf1, 0xf6, 0xd2, 0x7e, 0xcb, 0x70, 0x33, 0xb3, 0x34}, + {0x0, 0x27, 0xf6, 0x76, 0x28, 0x9d, 0x3b, 0x64, 0xeb, 0x68, 0x76, + 0xe, 0x40, 0x9d, 0x1d, 0x5d, 0x84, 0x6, 0xfc, 0x21, 0x3, 0x43, + 0x4b, 0x1b, 0x6a, 0x24, 0x55, 0x22, 0x7e, 0xbb, 0x38, 0x79}, + {0xcc, 0xd6, 0x81, 0x86, 0xee, 0x91, 0xc5, 0xcd, 0x53, 0xa7, 0x85, + 0xed, 0x9c, 0x10, 0x2, 0xce, 0x83, 0x88, 0x80, 0x58, 0xc1, 0x85, + 0x74, 0xed, 0xe4, 0x65, 0xfe, 0x2d, 0x6e, 0xfc, 0x76, 
0x11}, + }, + { + {0xb8, 0xe, 0x77, 0x49, 0x89, 0xe2, 0x90, 0xdb, 0xa3, 0x40, 0xf4, + 0xac, 0x2a, 0xcc, 0xfb, 0x98, 0x9b, 0x87, 0xd7, 0xde, 0xfe, 0x4f, + 0x35, 0x21, 0xb6, 0x6, 0x69, 0xf2, 0x54, 0x3e, 0x6a, 0x1f}, + {0x9b, 0x61, 0x9c, 0x5b, 0xd0, 0x6c, 0xaf, 0xb4, 0x80, 0x84, 0xa5, + 0xb2, 0xf4, 0xc9, 0xdf, 0x2d, 0xc4, 0x4d, 0xe9, 0xeb, 0x2, 0xa5, + 0x4f, 0x3d, 0x34, 0x5f, 0x7d, 0x67, 0x4c, 0x3a, 0xfc, 0x8}, + {0xea, 0x34, 0x7, 0xd3, 0x99, 0xc1, 0xa4, 0x60, 0xd6, 0x5c, 0x16, + 0x31, 0xb6, 0x85, 0xc0, 0x40, 0x95, 0x82, 0x59, 0xf7, 0x23, 0x3e, + 0x33, 0xe2, 0xd1, 0x0, 0xb9, 0x16, 0x1, 0xad, 0x2f, 0x4f}, + }, + { + {0x38, 0xb6, 0x3b, 0xb7, 0x1d, 0xd9, 0x2c, 0x96, 0x8, 0x9c, 0x12, + 0xfc, 0xaa, 0x77, 0x5, 0xe6, 0x89, 0x16, 0xb6, 0xf3, 0x39, 0x9b, + 0x61, 0x6f, 0x81, 0xee, 0x44, 0x29, 0x5f, 0x99, 0x51, 0x34}, + {0x54, 0x4e, 0xae, 0x94, 0x41, 0xb2, 0xbe, 0x44, 0x6c, 0xef, 0x57, + 0x18, 0x51, 0x1c, 0x54, 0x5f, 0x98, 0x4, 0x8d, 0x36, 0x2d, 0x6b, + 0x1e, 0xa6, 0xab, 0xf7, 0x2e, 0x97, 0xa4, 0x84, 0x54, 0x44}, + {0x7c, 0x7d, 0xea, 0x9f, 0xd0, 0xfc, 0x52, 0x91, 0xf6, 0x5c, 0x93, + 0xb0, 0x94, 0x6c, 0x81, 0x4a, 0x40, 0x5c, 0x28, 0x47, 0xaa, 0x9a, + 0x8e, 0x25, 0xb7, 0x93, 0x28, 0x4, 0xa6, 0x9c, 0xb8, 0x10}, }, + }, + { { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2022541353055597, 2094700262587466, 1551008075025686, - 242785517418164, 695985404963562 -#else - 48526701, 30138214, 17824842, 31213466, 22744342, 23111821, - 8763060, 3617786, 47508202, 10370990 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1287487199965223, 2215311941380308, 1552928390931986, - 1664859529680196, 1125004975265243 -#else - 20246567, 19185054, 22358228, 33010720, 18507282, 23140436, - 14554436, 24808340, 32232923, 16763880 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 677434665154918, 989582503122485, 1817429540898386, - 1052904935475344, 1143826298169798 -#else - 9648486, 10094563, 26416693, 14745928, 36734546, 27081810, - 11094160, 15689506, 3140038, 17044340 
-#endif - }}, + {0x6e, 0xf0, 0x45, 0x5a, 0xbe, 0x41, 0x39, 0x75, 0x65, 0x5f, 0x9c, + 0x6d, 0xed, 0xae, 0x7c, 0xd0, 0xb6, 0x51, 0xff, 0x72, 0x9c, 0x6b, + 0x77, 0x11, 0xa9, 0x4d, 0xd, 0xef, 0xd9, 0xd1, 0xd2, 0x17}, + {0x9c, 0x28, 0x18, 0x97, 0x49, 0x47, 0x59, 0x3d, 0x26, 0x3f, 0x53, + 0x24, 0xc5, 0xf8, 0xeb, 0x12, 0x15, 0xef, 0xc3, 0x14, 0xcb, 0xbf, + 0x62, 0x2, 0x8e, 0x51, 0xb7, 0x77, 0xd5, 0x78, 0xb8, 0x20}, + {0x6a, 0x3e, 0x3f, 0x7, 0x18, 0xaf, 0xf2, 0x27, 0x69, 0x10, 0x52, + 0xd7, 0x19, 0xe5, 0x3f, 0xfd, 0x22, 0x0, 0xa6, 0x3c, 0x2c, 0xb7, + 0xe3, 0x22, 0xa7, 0xc6, 0x65, 0xcc, 0x63, 0x4f, 0x21, 0x72}, + }, + { + {0xc9, 0x29, 0x3b, 0xf4, 0xb9, 0xb7, 0x9d, 0x1d, 0x75, 0x8f, 0x51, + 0x4f, 0x4a, 0x82, 0x5, 0xd6, 0xc4, 0x9d, 0x2f, 0x31, 0xbd, 0x72, + 0xc0, 0xf2, 0xb0, 0x45, 0x15, 0x5a, 0x85, 0xac, 0x24, 0x1f}, + {0x93, 0xa6, 0x7, 0x53, 0x40, 0x7f, 0xe3, 0xb4, 0x95, 0x67, 0x33, + 0x2f, 0xd7, 0x14, 0xa7, 0xab, 0x99, 0x10, 0x76, 0x73, 0xa7, 0xd0, + 0xfb, 0xd6, 0xc9, 0xcb, 0x71, 0x81, 0xc5, 0x48, 0xdf, 0x5f}, + {0xaa, 0x5, 0x95, 0x8e, 0x32, 0x8, 0xd6, 0x24, 0xee, 0x20, 0x14, + 0xc, 0xd1, 0xc1, 0x48, 0x47, 0xa2, 0x25, 0xfb, 0x6, 0x5c, 0xe4, + 0xff, 0xc7, 0xe6, 0x95, 0xe3, 0x2a, 0x9e, 0x73, 0xba, 0x0}, + }, + { + {0x26, 0xbb, 0x88, 0xea, 0xf5, 0x26, 0x44, 0xae, 0xfb, 0x3b, 0x97, + 0x84, 0xd9, 0x79, 0x6, 0x36, 0x50, 0x4e, 0x69, 0x26, 0xc, 0x3, + 0x9f, 0x5c, 0x26, 0xd2, 0x18, 0xd5, 0xe7, 0x7d, 0x29, 0x72}, + {0xd6, 0x90, 0x87, 0x5c, 0xde, 0x98, 0x2e, 0x59, 0xdf, 0xa2, 0xc2, + 0x45, 0xd3, 0xb7, 0xbf, 0xe5, 0x22, 0x99, 0xb4, 0xf9, 0x60, 0x3b, + 0x5a, 0x11, 0xf3, 0x78, 0xad, 0x67, 0x3e, 0x3a, 0x28, 0x3}, + {0x39, 0xb9, 0xc, 0xbe, 0xc7, 0x1d, 0x24, 0x48, 0x80, 0x30, 0x63, + 0x8b, 0x4d, 0x9b, 0xf1, 0x32, 0x8, 0x93, 0x28, 0x2, 0xd, 0xc9, + 0xdf, 0xd3, 0x45, 0x19, 0x27, 0x46, 0x68, 0x29, 0xe1, 0x5}, + }, + { + {0x50, 0x45, 0x2c, 0x24, 0xc8, 0xbb, 0xbf, 0xad, 0xd9, 0x81, 0x30, + 0xd0, 0xec, 0xc, 0xc8, 0xbc, 0x92, 0xdf, 0xc8, 0xf5, 0xa6, 0x66, + 0x35, 0x84, 0x4c, 0xce, 0x58, 0x82, 
0xd3, 0x25, 0xcf, 0x78}, + {0x5a, 0x49, 0x9c, 0x2d, 0xb3, 0xee, 0x82, 0xba, 0x7c, 0xb9, 0x2b, + 0xf1, 0xfc, 0xc8, 0xef, 0xce, 0xe0, 0xd1, 0xb5, 0x93, 0xae, 0xab, + 0x2d, 0xb0, 0x9b, 0x8d, 0x69, 0x13, 0x9c, 0xc, 0xc0, 0x39}, + {0x68, 0x9d, 0x48, 0x31, 0x8e, 0x6b, 0xae, 0x15, 0x87, 0xf0, 0x2b, + 0x9c, 0xab, 0x1c, 0x85, 0xaa, 0x5, 0xfa, 0x4e, 0xf0, 0x97, 0x5a, + 0xa7, 0xc9, 0x32, 0xf8, 0x3f, 0x6b, 0x7, 0x52, 0x6b, 0x0}, + }, + { + {0x2d, 0x8, 0xce, 0xb9, 0x16, 0x7e, 0xcb, 0xf5, 0x29, 0xbc, 0x7a, + 0x41, 0x4c, 0xf1, 0x7, 0x34, 0xab, 0xa7, 0xf4, 0x2b, 0xce, 0x6b, + 0xb3, 0xd4, 0xce, 0x75, 0x9f, 0x1a, 0x56, 0xe9, 0xe2, 0x7d}, + {0x1c, 0x78, 0x95, 0x9d, 0xe1, 0xcf, 0xe0, 0x29, 0xe2, 0x10, 0x63, + 0x96, 0x18, 0xdf, 0x81, 0xb6, 0x39, 0x6b, 0x51, 0x70, 0xd3, 0x39, + 0xdf, 0x57, 0x22, 0x61, 0xc7, 0x3b, 0x44, 0xe3, 0x57, 0x4d}, + {0xcb, 0x5e, 0xa5, 0xb6, 0xf4, 0xd4, 0x70, 0xde, 0x99, 0xdb, 0x85, + 0x5d, 0x7f, 0x52, 0x1, 0x48, 0x81, 0x9a, 0xee, 0xd3, 0x40, 0xc4, + 0xc9, 0xdb, 0xed, 0x29, 0x60, 0x1a, 0xaf, 0x90, 0x2a, 0x6b}, + }, + { + {0xa, 0xd8, 0xb2, 0x5b, 0x24, 0xf3, 0xeb, 0x77, 0x9b, 0x7, 0xb9, + 0x2f, 0x47, 0x1b, 0x30, 0xd8, 0x33, 0x73, 0xee, 0x4c, 0xf2, 0xe6, + 0x47, 0xc6, 0x9, 0x21, 0x6c, 0x27, 0xc8, 0x12, 0x58, 0x46}, + {0x97, 0x1e, 0xe6, 0x9a, 0xfc, 0xf4, 0x23, 0x69, 0xd1, 0x5f, 0x3f, + 0xe0, 0x1d, 0x28, 0x35, 0x57, 0x2d, 0xd1, 0xed, 0xe6, 0x43, 0xae, + 0x64, 0xa7, 0x4a, 0x3e, 0x2d, 0xd1, 0xe9, 0xf4, 0xd8, 0x5f}, + {0xd9, 0x62, 0x10, 0x2a, 0xb2, 0xbe, 0x43, 0x4d, 0x16, 0xdc, 0x31, + 0x38, 0x75, 0xfb, 0x65, 0x70, 0xd7, 0x68, 0x29, 0xde, 0x7b, 0x4a, + 0xd, 0x18, 0x90, 0x67, 0xb1, 0x1c, 0x2b, 0x2c, 0xb3, 0x5}, + }, + { + {0x95, 0x81, 0xd5, 0x7a, 0x2c, 0xa4, 0xfc, 0xf7, 0xcc, 0xf3, 0x33, + 0x43, 0x6e, 0x28, 0x14, 0x32, 0x9d, 0x97, 0xb, 0x34, 0xd, 0x9d, + 0xc2, 0xb6, 0xe1, 0x7, 0x73, 0x56, 0x48, 0x1a, 0x77, 0x31}, + {0xfd, 0xa8, 0x4d, 0xd2, 0xcc, 0x5e, 0xc0, 0xc8, 0x83, 0xef, 0xdf, + 0x5, 0xac, 0x1a, 0xcf, 0xa1, 0x61, 0xcd, 0xf9, 0x7d, 0xf2, 0xef, + 0xbe, 0xdb, 0x99, 0x1e, 
0x47, 0x7b, 0xa3, 0x56, 0x55, 0x3b}, + {0x82, 0xd4, 0x4d, 0xe1, 0x24, 0xc5, 0xb0, 0x32, 0xb6, 0xa4, 0x2b, + 0x1a, 0x54, 0x51, 0xb3, 0xed, 0xf3, 0x5a, 0x2b, 0x28, 0x48, 0x60, + 0xd1, 0xa3, 0xeb, 0x36, 0x73, 0x7a, 0xd2, 0x79, 0xc0, 0x4f}, + }, + { + {0xd, 0xc5, 0x86, 0xc, 0x44, 0x8b, 0x34, 0xdc, 0x51, 0xe6, 0x94, + 0xcc, 0xc9, 0xcb, 0x37, 0x13, 0xb9, 0x3c, 0x3e, 0x64, 0x4d, 0xf7, + 0x22, 0x64, 0x8, 0xcd, 0xe3, 0xba, 0xc2, 0x70, 0x11, 0x24}, + {0x7f, 0x2f, 0xbf, 0x89, 0xb0, 0x38, 0xc9, 0x51, 0xa7, 0xe9, 0xdf, + 0x2, 0x65, 0xbd, 0x97, 0x24, 0x53, 0xe4, 0x80, 0x78, 0x9c, 0xc0, + 0xff, 0xff, 0x92, 0x8e, 0xf9, 0xca, 0xce, 0x67, 0x45, 0x12}, + {0xb4, 0x73, 0xc4, 0xa, 0x86, 0xab, 0xf9, 0x3f, 0x35, 0xe4, 0x13, + 0x1, 0xee, 0x1d, 0x91, 0xf0, 0xaf, 0xc4, 0xc6, 0xeb, 0x60, 0x50, + 0xe7, 0x4a, 0xd, 0x0, 0x87, 0x6c, 0x96, 0x12, 0x86, 0x3f}, }, + }, + { { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 367266328308408, 318431188922404, 695629353755355, - 634085657580832, 24581612564426 -#else - 50948792, 5472694, 31895588, 4744994, 8823515, 10365685, - 39884064, 9448612, 38334410, 366294 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 773360688841258, 1815381330538070, 363773437667376, - 539629987070205, 783280434248437 -#else - 19153450, 11523972, 56012374, 27051289, 42461232, 5420646, - 28344573, 8041113, 719605, 11671788 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 180820816194166, 168937968377394, 748416242794470, - 1227281252254508, 1567587861004268 -#else - 8678006, 2694440, 60300850, 2517371, 4964326, 11152271, - 51675948, 18287915, 27000812, 23358879 -#endif - }}, + {0x13, 0x8d, 0x4, 0x36, 0xfa, 0xfc, 0x18, 0x9c, 0xdd, 0x9d, 0x89, + 0x73, 0xb3, 0x9d, 0x15, 0x29, 0xaa, 0xd0, 0x92, 0x9f, 0xb, 0x35, + 0x9f, 0xdc, 0xd4, 0x19, 0x8a, 0x87, 0xee, 0x7e, 0xf5, 0x26}, + {0xde, 0xd, 0x2a, 0x78, 0xc9, 0xc, 0x9a, 0x55, 0x85, 0x83, 0x71, + 0xea, 0xb2, 0xcd, 0x1d, 0x55, 0x8c, 0x23, 0xef, 0x31, 0x5b, 0x86, + 0x62, 0x7f, 0x3d, 0x61, 0x73, 0x79, 0x76, 0xa7, 
0x4a, 0x50}, + {0xb1, 0xef, 0x87, 0x56, 0xd5, 0x2c, 0xab, 0xc, 0x7b, 0xf1, 0x7a, + 0x24, 0x62, 0xd1, 0x80, 0x51, 0x67, 0x24, 0x5a, 0x4f, 0x34, 0x5a, + 0xc1, 0x85, 0x69, 0x30, 0xba, 0x9d, 0x3d, 0x94, 0x41, 0x40}, + }, + { + {0xdd, 0xaa, 0x6c, 0xa2, 0x43, 0x77, 0x21, 0x4b, 0xce, 0xb7, 0x8a, + 0x64, 0x24, 0xb4, 0xa6, 0x47, 0xe3, 0xc9, 0xfb, 0x3, 0x7a, 0x4f, + 0x1d, 0xcb, 0x19, 0xd0, 0x0, 0x98, 0x42, 0x31, 0xd9, 0x12}, + {0x96, 0xcc, 0xeb, 0x43, 0xba, 0xee, 0xc0, 0xc3, 0xaf, 0x9c, 0xea, + 0x26, 0x9c, 0x9c, 0x74, 0x8d, 0xc6, 0xcc, 0x77, 0x1c, 0xee, 0x95, + 0xfa, 0xd9, 0xf, 0x34, 0x84, 0x76, 0xd9, 0xa1, 0x20, 0x14}, + {0x4f, 0x59, 0x37, 0xd3, 0x99, 0x77, 0xc6, 0x0, 0x7b, 0xa4, 0x3a, + 0xb2, 0x40, 0x51, 0x3c, 0x5e, 0x95, 0xf3, 0x5f, 0xe3, 0x54, 0x28, + 0x18, 0x44, 0x12, 0xa0, 0x59, 0x43, 0x31, 0x92, 0x4f, 0x1b}, + }, + { + {0xb1, 0x66, 0x98, 0xa4, 0x30, 0x30, 0xcf, 0x33, 0x59, 0x48, 0x5f, + 0x21, 0xd2, 0x73, 0x1f, 0x25, 0xf6, 0xf4, 0xde, 0x51, 0x40, 0xaa, + 0x82, 0xab, 0xf6, 0x23, 0x9a, 0x6f, 0xd5, 0x91, 0xf1, 0x5f}, + {0x51, 0x9, 0x15, 0x89, 0x9d, 0x10, 0x5c, 0x3e, 0x6a, 0x69, 0xe9, + 0x2d, 0x91, 0xfa, 0xce, 0x39, 0x20, 0x30, 0x5f, 0x97, 0x3f, 0xe4, + 0xea, 0x20, 0xae, 0x2d, 0x13, 0x7f, 0x2a, 0x57, 0x9b, 0x23}, + {0x68, 0x90, 0x2d, 0xac, 0x33, 0xd4, 0x9e, 0x81, 0x23, 0x85, 0xc9, + 0x5f, 0x79, 0xab, 0x83, 0x28, 0x3d, 0xeb, 0x93, 0x55, 0x80, 0x72, + 0x45, 0xef, 0xcb, 0x36, 0x8f, 0x75, 0x6a, 0x52, 0xc, 0x2}, + }, + { + {0x89, 0xcc, 0x42, 0xf0, 0x59, 0xef, 0x31, 0xe9, 0xb6, 0x4b, 0x12, + 0x8e, 0x9d, 0x9c, 0x58, 0x2c, 0x97, 0x59, 0xc7, 0xae, 0x8a, 0xe1, + 0xc8, 0xad, 0xc, 0xc5, 0x2, 0x56, 0xa, 0xfe, 0x2c, 0x45}, + {0xbc, 0xdb, 0xd8, 0x9e, 0xf8, 0x34, 0x98, 0x77, 0x6c, 0xa4, 0x7c, + 0xdc, 0xf9, 0xaa, 0xf2, 0xc8, 0x74, 0xb0, 0xe1, 0xa3, 0xdc, 0x4c, + 0x52, 0xa9, 0x77, 0x38, 0x31, 0x15, 0x46, 0xcc, 0xaa, 0x2}, + {0xdf, 0x77, 0x78, 0x64, 0xa0, 0xf7, 0xa0, 0x86, 0x9f, 0x7c, 0x60, + 0xe, 0x27, 0x64, 0xc4, 0xbb, 0xc9, 0x11, 0xfb, 0xf1, 0x25, 0xea, + 0x17, 0xab, 0x7b, 0x87, 0x4b, 
0x30, 0x7b, 0x7d, 0xfb, 0x4c}, + }, + { + {0x12, 0xef, 0x89, 0x97, 0xc2, 0x99, 0x86, 0xe2, 0xd, 0x19, 0x57, + 0xdf, 0x71, 0xcd, 0x6e, 0x2b, 0xd0, 0x70, 0xc9, 0xec, 0x57, 0xc8, + 0x43, 0xc3, 0xc5, 0x3a, 0x4d, 0x43, 0xbc, 0x4c, 0x1d, 0x5b}, + {0xfe, 0x75, 0x9b, 0xb8, 0x6c, 0x3d, 0xb4, 0x72, 0x80, 0xdc, 0x6a, + 0x9c, 0xd9, 0x94, 0xc6, 0x54, 0x9f, 0x4c, 0xe3, 0x3e, 0x37, 0xaa, + 0xc3, 0xb8, 0x64, 0x53, 0x7, 0x39, 0x2b, 0x62, 0xb4, 0x14}, + {0x26, 0x9f, 0xa, 0xcc, 0x15, 0x26, 0xfb, 0xb6, 0xe5, 0xcc, 0x8d, + 0xb8, 0x2b, 0xe, 0x4f, 0x3a, 0x5, 0xa7, 0x69, 0x33, 0x8b, 0x49, + 0x1, 0x13, 0xd1, 0x2d, 0x59, 0x58, 0x12, 0xf7, 0x98, 0x2f}, + }, + { + {0x1, 0xa7, 0x54, 0x4f, 0x44, 0xae, 0x12, 0x2e, 0xde, 0xd7, 0xcb, + 0xa9, 0xf0, 0x3e, 0xfe, 0xfc, 0xe0, 0x5d, 0x83, 0x75, 0xd, 0x89, + 0xbf, 0xce, 0x54, 0x45, 0x61, 0xe7, 0xe9, 0x62, 0x80, 0x1d}, + {0x56, 0x9e, 0xf, 0xb5, 0x4c, 0xa7, 0x94, 0xc, 0x20, 0x13, 0x8e, + 0x8e, 0xa9, 0xf4, 0x1f, 0x5b, 0x67, 0xf, 0x30, 0x82, 0x21, 0xcc, + 0x2a, 0x9a, 0xf9, 0xaa, 0x6, 0xd8, 0x49, 0xe2, 0x6a, 0x3a}, + {0x5a, 0x7c, 0x90, 0xa9, 0x85, 0xda, 0x7a, 0x65, 0x62, 0xf, 0xb9, + 0x91, 0xb5, 0xa8, 0xe, 0x1a, 0xe9, 0xb4, 0x34, 0xdf, 0xfb, 0x1d, + 0xe, 0x8d, 0xf3, 0x5f, 0xf2, 0xae, 0xe8, 0x8c, 0x8b, 0x29}, + }, + { + {0xde, 0x65, 0x21, 0xa, 0xea, 0x72, 0x7a, 0x83, 0xf6, 0x79, 0xcf, + 0xb, 0xb4, 0x7, 0xab, 0x3f, 0x70, 0xae, 0x38, 0x77, 0xc7, 0x36, + 0x16, 0x52, 0xdc, 0xd7, 0xa7, 0x3, 0x18, 0x27, 0xa6, 0x6b}, + {0xb2, 0xc, 0xf7, 0xef, 0x53, 0x79, 0x92, 0x2a, 0x76, 0x70, 0x15, + 0x79, 0x2a, 0xc9, 0x89, 0x4b, 0x6a, 0xcf, 0xa7, 0x30, 0x7a, 0x45, + 0x18, 0x94, 0x85, 0xe4, 0x5c, 0x4d, 0x40, 0xa8, 0xb8, 0x34}, + {0x35, 0x33, 0x69, 0x83, 0xb5, 0xec, 0x6e, 0xc2, 0xfd, 0xfe, 0xb5, + 0x63, 0xdf, 0x13, 0xa8, 0xd5, 0x73, 0x25, 0xb2, 0xa4, 0x9a, 0xaa, + 0x93, 0xa2, 0x6a, 0x1c, 0x5e, 0x46, 0xdd, 0x2b, 0xd6, 0x71}, + }, + { + {0xf5, 0x5e, 0xf7, 0xb1, 0xda, 0xb5, 0x2d, 0xcd, 0xf5, 0x65, 0xb0, + 0x16, 0xcf, 0x95, 0x7f, 0xd7, 0x85, 0xf0, 0x49, 0x3f, 0xea, 0x1f, + 0x57, 0x14, 
0x3d, 0x2b, 0x2b, 0x26, 0x21, 0x36, 0x33, 0x1c}, + {0x80, 0xdf, 0x78, 0xd3, 0x28, 0xcc, 0x33, 0x65, 0xb4, 0xa4, 0xf, + 0xa, 0x79, 0x43, 0xdb, 0xf6, 0x5a, 0xda, 0x1, 0xf7, 0xf9, 0x5f, + 0x64, 0xe3, 0xa4, 0x2b, 0x17, 0xf3, 0x17, 0xf3, 0xd5, 0x74}, + {0x81, 0xca, 0xd9, 0x67, 0x54, 0xe5, 0x6f, 0xa8, 0x37, 0x8c, 0x29, + 0x2b, 0x75, 0x7c, 0x8b, 0x39, 0x3b, 0x62, 0xac, 0xe3, 0x92, 0x8, + 0x6d, 0xda, 0x8c, 0xd9, 0xe9, 0x47, 0x45, 0xcc, 0xeb, 0x4a}, }, + }, + { { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 478775558583645, 2062896624554807, 699391259285399, - 358099408427873, 1277310261461761 -#else - 51950941, 7134311, 8639287, 30739555, 59873175, 10421741, - 564065, 5336097, 6750977, 19033406 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1984740906540026, 1079164179400229, 1056021349262661, - 1659958556483663, 1088529069025527 -#else - 11836410, 29574944, 26297893, 16080799, 23455045, 15735944, - 1695823, 24735310, 8169719, 16220347 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 580736401511151, 1842931091388998, 1177201471228238, - 2075460256527244, 1301133425678027 -#else - 48993007, 8653646, 17578566, 27461813, 59083086, 17541668, - 55964556, 30926767, 61118155, 19388398 -#endif - }}, + {0x10, 0xb6, 0x54, 0x73, 0x9e, 0x8d, 0x40, 0xb, 0x6e, 0x5b, 0xa8, + 0x5b, 0x53, 0x32, 0x6b, 0x80, 0x7, 0xa2, 0x58, 0x4a, 0x3, 0x3a, + 0xe6, 0xdb, 0x2c, 0xdf, 0xa1, 0xc9, 0xdd, 0xd9, 0x3b, 0x17}, + {0xc9, 0x1, 0x6d, 0x27, 0x1b, 0x7, 0xf0, 0x12, 0x70, 0x8c, 0xc4, + 0x86, 0xc5, 0xba, 0xb8, 0xe7, 0xa9, 0xfb, 0xd6, 0x71, 0x9b, 0x12, + 0x8, 0x53, 0x92, 0xb7, 0x3d, 0x5a, 0xf9, 0xfb, 0x88, 0x5d}, + {0xdf, 0x72, 0x58, 0xfe, 0x1e, 0xf, 0x50, 0x2b, 0xc1, 0x18, 0x39, + 0xd4, 0x2e, 0x58, 0xd6, 0x58, 0xe0, 0x3a, 0x67, 0xc9, 0x8e, 0x27, + 0xed, 0xe6, 0x19, 0xa3, 0x9e, 0xb1, 0x13, 0xcd, 0xe1, 0x6}, + }, + { + {0x53, 0x3, 0x5b, 0x9e, 0x62, 0xaf, 0x2b, 0x47, 0x47, 0x4, 0x8d, + 0x27, 0x90, 0xb, 0xaa, 0x3b, 0x27, 0xbf, 0x43, 0x96, 0x46, 0x5f, + 0x78, 0xc, 0x13, 0x7b, 
0x83, 0x8d, 0x1a, 0x6a, 0x3a, 0x7f}, + {0x23, 0x6f, 0x16, 0x6f, 0x51, 0xad, 0xd0, 0x40, 0xbe, 0x6a, 0xab, + 0x1f, 0x93, 0x32, 0x8e, 0x11, 0x8e, 0x8, 0x4d, 0xa0, 0x14, 0x5e, + 0xe3, 0x3f, 0x66, 0x62, 0xe1, 0x26, 0x35, 0x60, 0x80, 0x30}, + {0xb, 0x80, 0x3d, 0x5d, 0x39, 0x44, 0xe6, 0xf7, 0xf6, 0xed, 0x1, + 0xc9, 0x55, 0xd5, 0xa8, 0x95, 0x39, 0x63, 0x2c, 0x59, 0x30, 0x78, + 0xcd, 0x68, 0x7e, 0x30, 0x51, 0x2e, 0xed, 0xfd, 0xd0, 0x30}, + }, + { + {0x50, 0x47, 0xb8, 0x68, 0x1e, 0x97, 0xb4, 0x9c, 0xcf, 0xbb, 0x64, + 0x66, 0x29, 0x72, 0x95, 0xa0, 0x2b, 0x41, 0xfa, 0x72, 0x26, 0xe7, + 0x8d, 0x5c, 0xd9, 0x89, 0xc5, 0x51, 0x43, 0x8, 0x15, 0x46}, + {0xb3, 0x33, 0x12, 0xf2, 0x1a, 0x4d, 0x59, 0xe0, 0x9c, 0x4d, 0xcc, + 0xf0, 0x8e, 0xe7, 0xdb, 0x1b, 0x77, 0x9a, 0x49, 0x8f, 0x7f, 0x18, + 0x65, 0x69, 0x68, 0x98, 0x9, 0x2c, 0x20, 0x14, 0x92, 0xa}, + {0x2e, 0xa0, 0xb9, 0xae, 0xc0, 0x19, 0x90, 0xbc, 0xae, 0x4c, 0x3, + 0x16, 0xd, 0x11, 0xc7, 0x55, 0xec, 0x32, 0x99, 0x65, 0x1, 0xf5, + 0x6d, 0xe, 0xfe, 0x5d, 0xca, 0x95, 0x28, 0xd, 0xca, 0x3b}, + }, + { + {0xbf, 0x1, 0xcc, 0x9e, 0xb6, 0x8e, 0x68, 0x9c, 0x6f, 0x89, 0x44, + 0xa6, 0xad, 0x83, 0xbc, 0xf0, 0xe2, 0x9f, 0x7a, 0x5f, 0x5f, 0x95, + 0x2d, 0xca, 0x41, 0x82, 0xf2, 0x8d, 0x3, 0xb4, 0xa8, 0x4e}, + {0xa4, 0x62, 0x5d, 0x3c, 0xbc, 0x31, 0xf0, 0x40, 0x60, 0x7a, 0xf0, + 0xcf, 0x3e, 0x8b, 0xfc, 0x19, 0x45, 0xb5, 0xf, 0x13, 0xa2, 0x3d, + 0x18, 0x98, 0xcd, 0x13, 0x8f, 0xae, 0xdd, 0xde, 0x31, 0x56}, + {0x2, 0xd2, 0xca, 0xf1, 0xa, 0x46, 0xed, 0x2a, 0x83, 0xee, 0x8c, + 0xa4, 0x5, 0x53, 0x30, 0x46, 0x5f, 0x1a, 0xf1, 0x49, 0x45, 0x77, + 0x21, 0x91, 0x63, 0xa4, 0x2c, 0x54, 0x30, 0x9, 0xce, 0x24}, + }, + { + {0x85, 0xb, 0xf3, 0xfd, 0x55, 0xa1, 0xcf, 0x3f, 0xa4, 0x2e, 0x37, + 0x36, 0x8e, 0x16, 0xf7, 0xd2, 0x44, 0xf8, 0x92, 0x64, 0xde, 0x64, + 0xe0, 0xb2, 0x80, 0x42, 0x4f, 0x32, 0xa7, 0x28, 0x99, 0x54}, + {0x6, 0xc1, 0x6, 0xfd, 0xf5, 0x90, 0xe8, 0x1f, 0xf2, 0x10, 0x88, + 0x5d, 0x35, 0x68, 0xc4, 0xb5, 0x3e, 0xaf, 0x8c, 0x6e, 0xfe, 0x8, + 0x78, 0x82, 0x4b, 
0xd7, 0x6, 0x8a, 0xc2, 0xe3, 0xd4, 0x41}, + {0x2e, 0x1a, 0xee, 0x63, 0xa7, 0x32, 0x6e, 0xf2, 0xea, 0xfd, 0x5f, + 0xd2, 0xb7, 0xe4, 0x91, 0xae, 0x69, 0x4d, 0x7f, 0xd1, 0x3b, 0xd3, + 0x3b, 0xbc, 0x6a, 0xff, 0xdc, 0xc0, 0xde, 0x66, 0x1b, 0x49}, + }, + { + {0xa1, 0x64, 0xda, 0xd0, 0x8e, 0x4a, 0xf0, 0x75, 0x4b, 0x28, 0xe2, + 0x67, 0xaf, 0x2c, 0x22, 0xed, 0xa4, 0x7b, 0x7b, 0x1f, 0x79, 0xa3, + 0x34, 0x82, 0x67, 0x8b, 0x1, 0xb7, 0xb0, 0xb8, 0xf6, 0x4c}, + {0xa7, 0x32, 0xea, 0xc7, 0x3d, 0xb1, 0xf5, 0x98, 0x98, 0xdb, 0x16, + 0x7e, 0xcc, 0xf8, 0xd5, 0xe3, 0x47, 0xd9, 0xf8, 0xcb, 0x52, 0xbf, + 0xa, 0xac, 0xac, 0xe4, 0x5e, 0xc8, 0xd0, 0x38, 0xf3, 0x8}, + {0xbd, 0x73, 0x1a, 0x99, 0x21, 0xa8, 0x83, 0xc3, 0x7a, 0xc, 0x32, + 0xdf, 0x1, 0xbc, 0x27, 0xab, 0x63, 0x70, 0x77, 0x84, 0x1b, 0x33, + 0x3d, 0xc1, 0x99, 0x8a, 0x7, 0xeb, 0x82, 0x4a, 0xd, 0x53}, + }, + { + {0x9e, 0xbf, 0x9a, 0x6c, 0x45, 0x73, 0x69, 0x6d, 0x80, 0xa8, 0x0, + 0x49, 0xfc, 0xb2, 0x7f, 0x25, 0x50, 0xb8, 0xcf, 0xc8, 0x12, 0xf4, + 0xac, 0x2b, 0x5b, 0xbd, 0xbf, 0xc, 0xe0, 0xe7, 0xb3, 0xd}, + {0x25, 0x48, 0xf9, 0xe1, 0x30, 0x36, 0x4c, 0x0, 0x5a, 0x53, 0xab, + 0x8c, 0x26, 0x78, 0x2d, 0x7e, 0x8b, 0xff, 0x84, 0xcc, 0x23, 0x23, + 0x48, 0xc7, 0xb9, 0x70, 0x17, 0x10, 0x3f, 0x75, 0xea, 0x65}, + {0x63, 0x63, 0x9, 0xe2, 0x3e, 0xfc, 0x66, 0x3d, 0x6b, 0xcb, 0xb5, + 0x61, 0x7f, 0x2c, 0xd6, 0x81, 0x1a, 0x3b, 0x44, 0x13, 0x42, 0x4, + 0xbe, 0xf, 0xdb, 0xa1, 0xe1, 0x21, 0x19, 0xec, 0xa4, 0x2}, + }, + { + {0x5f, 0x79, 0xcf, 0xf1, 0x62, 0x61, 0xc8, 0xf5, 0xf2, 0x57, 0xee, + 0x26, 0x19, 0x86, 0x8c, 0x11, 0x78, 0x35, 0x6, 0x1c, 0x85, 0x24, + 0x21, 0x17, 0xcf, 0x7f, 0x6, 0xec, 0x5d, 0x2b, 0xd1, 0x36}, + {0xa2, 0xb8, 0x24, 0x3b, 0x9a, 0x25, 0xe6, 0x5c, 0xb8, 0xa0, 0xaf, + 0x45, 0xcc, 0x7a, 0x57, 0xb8, 0x37, 0x70, 0xa0, 0x8b, 0xe8, 0xe6, + 0xcb, 0xcc, 0xbf, 0x9, 0x78, 0x12, 0x51, 0x3c, 0x14, 0x3d}, + {0x57, 0x45, 0x15, 0x79, 0x91, 0x27, 0x6d, 0x12, 0xa, 0x3a, 0x78, + 0xfc, 0x5c, 0x8f, 0xe4, 0xd5, 0xac, 0x9b, 0x17, 0xdf, 0xe8, 0xb6, + 0xbd, 
0x36, 0x59, 0x28, 0xa8, 0x5b, 0x88, 0x17, 0xf5, 0x2e}, }, + }, + { { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1515728832059182, 1575261009617579, 1510246567196186, - 191078022609704, 116661716289141 -#else - 43800366, 22586119, 15213227, 23473218, 36255258, 22504427, - 27884328, 2847284, 2655861, 1738395 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1295295738269652, 1714742313707026, 545583042462581, - 2034411676262552, 1513248090013606 -#else - 39571412, 19301410, 41772562, 25551651, 57738101, 8129820, - 21651608, 30315096, 48021414, 22549153 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 230710545179830, 30821514358353, 760704303452229, - 390668103790604, 573437871383156 -#else - 1533110, 3437855, 23735889, 459276, 29970501, 11335377, - 26030092, 5821408, 10478196, 8544890 -#endif - }}, + {0x51, 0x2f, 0x5b, 0x30, 0xfb, 0xbf, 0xee, 0x96, 0xb8, 0x96, 0x95, + 0x88, 0xad, 0x38, 0xf9, 0xd3, 0x25, 0xdd, 0xd5, 0x46, 0xc7, 0x2d, + 0xf5, 0xf0, 0x95, 0x0, 0x3a, 0xbb, 0x90, 0x82, 0x96, 0x57}, + {0xdc, 0xae, 0x58, 0x8c, 0x4e, 0x97, 0x37, 0x46, 0xa4, 0x41, 0xf0, + 0xab, 0xfb, 0x22, 0xef, 0xb9, 0x8a, 0x71, 0x80, 0xe9, 0x56, 0xd9, + 0x85, 0xe1, 0xa6, 0xa8, 0x43, 0xb1, 0xfa, 0x78, 0x1b, 0x2f}, + {0x1, 0xe1, 0x20, 0xa, 0x43, 0xb8, 0x1a, 0xf7, 0x47, 0xec, 0xf0, + 0x24, 0x8d, 0x65, 0x93, 0xf3, 0xd1, 0xee, 0xe2, 0x6e, 0xa8, 0x9, + 0x75, 0xcf, 0xe1, 0xa3, 0x2a, 0xdc, 0x35, 0x3e, 0xc4, 0x7d}, + }, + { + {0x18, 0x97, 0x3e, 0x27, 0x5c, 0x2a, 0x78, 0x5a, 0x94, 0xfd, 0x4e, + 0x5e, 0x99, 0xc6, 0x76, 0x35, 0x3e, 0x7d, 0x23, 0x1f, 0x5, 0xd8, + 0x2e, 0xf, 0x99, 0xa, 0xd5, 0x82, 0x1d, 0xb8, 0x4f, 0x4}, + {0xc3, 0xd9, 0x7d, 0x88, 0x65, 0x66, 0x96, 0x85, 0x55, 0x53, 0xb0, + 0x4b, 0x31, 0x9b, 0xf, 0xc9, 0xb1, 0x79, 0x20, 0xef, 0xf8, 0x8d, + 0xe0, 0xc6, 0x2f, 0xc1, 0x8c, 0x75, 0x16, 0x20, 0xf7, 0x7e}, + {0xd9, 0xe3, 0x7, 0xa9, 0xc5, 0x18, 0xdf, 0xc1, 0x59, 0x63, 0x4c, + 0xce, 0x1d, 0x37, 0xb3, 0x57, 0x49, 0xbb, 0x1, 0xb2, 0x34, 0x45, + 0x70, 0xca, 0x2e, 0xdd, 
0x30, 0x9c, 0x3f, 0x82, 0x79, 0x7f}, + }, + { + {0xba, 0x87, 0xf5, 0x68, 0xf0, 0x1f, 0x9c, 0x6a, 0xde, 0xc8, 0x50, + 0x0, 0x4e, 0x89, 0x27, 0x8, 0xe7, 0x5b, 0xed, 0x7d, 0x55, 0x99, + 0xbf, 0x3c, 0xf0, 0xd6, 0x6, 0x1c, 0x43, 0xb0, 0xa9, 0x64}, + {0xe8, 0x13, 0xb5, 0xa3, 0x39, 0xd2, 0x34, 0x83, 0xd8, 0xa8, 0x1f, + 0xb9, 0xd4, 0x70, 0x36, 0xc1, 0x33, 0xbd, 0x90, 0xf5, 0x36, 0x41, + 0xb5, 0x12, 0xb4, 0xd9, 0x84, 0xd7, 0x73, 0x3, 0x4e, 0xa}, + {0x19, 0x29, 0x7d, 0x5b, 0xa1, 0xd6, 0xb3, 0x2e, 0x35, 0x82, 0x3a, + 0xd5, 0xa0, 0xf6, 0xb4, 0xb0, 0x47, 0x5d, 0xa4, 0x89, 0x43, 0xce, + 0x56, 0x71, 0x6c, 0x34, 0x18, 0xce, 0xa, 0x7d, 0x1a, 0x7}, + }, + { + {0x31, 0x44, 0xe1, 0x20, 0x52, 0x35, 0xc, 0xcc, 0x41, 0x51, 0xb1, + 0x9, 0x7, 0x95, 0x65, 0xd, 0x36, 0x5f, 0x9d, 0x20, 0x1b, 0x62, + 0xf5, 0x9a, 0xd3, 0x55, 0x77, 0x61, 0xf7, 0xbc, 0x69, 0x7c}, + {0xb, 0xba, 0x87, 0xc8, 0xaa, 0x2d, 0x7, 0xd3, 0xee, 0x62, 0xa5, + 0xbf, 0x5, 0x29, 0x26, 0x1, 0x8b, 0x76, 0xef, 0xc0, 0x2, 0x30, + 0x54, 0xcf, 0x9c, 0x7e, 0xea, 0x46, 0x71, 0xcc, 0x3b, 0x2c}, + {0x5f, 0x29, 0xe8, 0x4, 0xeb, 0xd7, 0xf0, 0x7, 0x7d, 0xf3, 0x50, + 0x2f, 0x25, 0x18, 0xdb, 0x10, 0xd7, 0x98, 0x17, 0x17, 0xa3, 0xa9, + 0x51, 0xe9, 0x1d, 0xa5, 0xac, 0x22, 0x73, 0x9a, 0x5a, 0x6f}, + }, + { + {0xbe, 0x44, 0xd9, 0xa3, 0xeb, 0xd4, 0x29, 0xe7, 0x9e, 0xaf, 0x78, + 0x80, 0x40, 0x9, 0x9e, 0x8d, 0x3, 0x9c, 0x86, 0x47, 0x7a, 0x56, + 0x25, 0x45, 0x24, 0x3b, 0x8d, 0xee, 0x80, 0x96, 0xab, 0x2}, + {0xc5, 0xc6, 0x41, 0x2f, 0xc, 0x0, 0xa1, 0x8b, 0x9b, 0xfb, 0xfe, + 0xc, 0xc1, 0x79, 0x9f, 0xc4, 0x9f, 0x1c, 0xc5, 0x3c, 0x70, 0x47, + 0xfa, 0x4e, 0xca, 0xaf, 0x47, 0xe1, 0xa2, 0x21, 0x4e, 0x49}, + {0x9a, 0xd, 0xe5, 0xdd, 0x85, 0x8a, 0xa4, 0xef, 0x49, 0xa2, 0xb9, + 0xf, 0x4e, 0x22, 0x9a, 0x21, 0xd9, 0xf6, 0x1e, 0xd9, 0x1d, 0x1f, + 0x9, 0xfa, 0x34, 0xbb, 0x46, 0xea, 0xcb, 0x76, 0x5d, 0x6b}, + }, + { + {0x22, 0x25, 0x78, 0x1e, 0x17, 0x41, 0xf9, 0xe0, 0xd3, 0x36, 0x69, + 0x3, 0x74, 0xae, 0xe6, 0xf1, 0x46, 0xc7, 0xfc, 0xd0, 0xa2, 0x3e, + 0x8b, 0x40, 
0x3e, 0x31, 0xdd, 0x3, 0x9c, 0x86, 0xfb, 0x16}, + {0x94, 0xd9, 0xc, 0xec, 0x6c, 0x55, 0x57, 0x88, 0xba, 0x1d, 0xd0, + 0x5c, 0x6f, 0xdc, 0x72, 0x64, 0x77, 0xb4, 0x42, 0x8f, 0x14, 0x69, + 0x1, 0xaf, 0x54, 0x73, 0x27, 0x85, 0xf6, 0x33, 0xe3, 0xa}, + {0x62, 0x9, 0xb6, 0x33, 0x97, 0x19, 0x8e, 0x28, 0x33, 0xe1, 0xab, + 0xd8, 0xb4, 0x72, 0xfc, 0x24, 0x3e, 0xd0, 0x91, 0x9, 0xed, 0xf7, + 0x11, 0x48, 0x75, 0xd0, 0x70, 0x8f, 0x8b, 0xe3, 0x81, 0x3f}, + }, + { + {0x24, 0xc8, 0x17, 0x5f, 0x35, 0x7f, 0xdb, 0xa, 0xa4, 0x99, 0x42, + 0xd7, 0xc3, 0x23, 0xb9, 0x74, 0xf7, 0xea, 0xf8, 0xcb, 0x8b, 0x3e, + 0x7c, 0xd5, 0x3d, 0xdc, 0xde, 0x4c, 0xd3, 0xe2, 0xd3, 0xa}, + {0xfe, 0xaf, 0xd9, 0x7e, 0xcc, 0xf, 0x91, 0x7f, 0x4b, 0x87, 0x65, + 0x24, 0xa1, 0xb8, 0x5c, 0x54, 0x4, 0x47, 0xc, 0x4b, 0xd2, 0x7e, + 0x39, 0xa8, 0x93, 0x9, 0xf5, 0x4, 0xc1, 0xf, 0x51, 0x50}, + {0x9d, 0x24, 0x6e, 0x33, 0xc5, 0xf, 0xc, 0x6f, 0xd9, 0xcf, 0x31, + 0xc3, 0x19, 0xde, 0x5e, 0x74, 0x1c, 0xfe, 0xee, 0x9, 0x0, 0xfd, + 0xd6, 0xf2, 0xbe, 0x1e, 0xfa, 0xf0, 0x8b, 0x15, 0x7c, 0x12}, + }, + { + {0x74, 0xb9, 0x51, 0xae, 0xc4, 0x8f, 0xa2, 0xde, 0x96, 0xfe, 0x4d, + 0x74, 0xd3, 0x73, 0x99, 0x1d, 0xa8, 0x48, 0x38, 0x87, 0xb, 0x68, + 0x40, 0x62, 0x95, 0xdf, 0x67, 0xd1, 0x79, 0x24, 0xd8, 0x4e}, + {0xa2, 0x79, 0x98, 0x2e, 0x42, 0x7c, 0x19, 0xf6, 0x47, 0x36, 0xca, + 0x52, 0xd4, 0xdd, 0x4a, 0xa4, 0xcb, 0xac, 0x4e, 0x4b, 0xc1, 0x3f, + 0x41, 0x9b, 0x68, 0x4f, 0xef, 0x7, 0x7d, 0xf8, 0x4e, 0x35}, + {0x75, 0xd9, 0xc5, 0x60, 0x22, 0xb5, 0xe3, 0xfe, 0xb8, 0xb0, 0x41, + 0xeb, 0xfc, 0x2e, 0x35, 0x50, 0x3c, 0x65, 0xf6, 0xa9, 0x30, 0xac, + 0x8, 0x88, 0x6d, 0x23, 0x39, 0x5, 0xd2, 0x92, 0x2d, 0x30}, }, + }, + { { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1169380107545646, 263167233745614, 2022901299054448, - 819900753251120, 2023898464874585 -#else - 32173102, 17425121, 24896206, 3921497, 22579056, 30143578, - 19270448, 12217473, 17789017, 30158437 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2102254323485823, 
1570832666216754, 34696906544624, - 1993213739807337, 70638552271463 -#else - 36555903, 31326030, 51530034, 23407230, 13243888, 517024, - 15479401, 29701199, 30460519, 1052596 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 894132856735058, 548675863558441, 845349339503395, - 1942269668326667, 1615682209874691 -#else - 55493970, 13323617, 32618793, 8175907, 51878691, 12596686, - 27491595, 28942073, 3179267, 24075541 -#endif - }}, + {0x77, 0xf1, 0xe0, 0xe4, 0xb6, 0x6f, 0xbc, 0x2d, 0x93, 0x6a, 0xbd, + 0xa4, 0x29, 0xbf, 0xe1, 0x4, 0xe8, 0xf6, 0x7a, 0x78, 0xd4, 0x66, + 0x19, 0x5e, 0x60, 0xd0, 0x26, 0xb4, 0x5e, 0x5f, 0xdc, 0xe}, + {0x3d, 0x28, 0xa4, 0xbc, 0xa2, 0xc1, 0x13, 0x78, 0xd9, 0x3d, 0x86, + 0xa1, 0x91, 0xf0, 0x62, 0xed, 0x86, 0xfa, 0x68, 0xc2, 0xb8, 0xbc, + 0xc7, 0xae, 0x4c, 0xae, 0x1c, 0x6f, 0xb7, 0xd3, 0xe5, 0x10}, + {0x67, 0x8e, 0xda, 0x53, 0xd6, 0xbf, 0x53, 0x54, 0x41, 0xf6, 0xa9, + 0x24, 0xec, 0x1e, 0xdc, 0xe9, 0x23, 0x8a, 0x57, 0x3, 0x3b, 0x26, + 0x87, 0xbf, 0x72, 0xba, 0x1c, 0x36, 0x51, 0x6c, 0xb4, 0x45}, + }, + { + {0xe4, 0xe3, 0x7f, 0x8a, 0xdd, 0x4d, 0x9d, 0xce, 0x30, 0xe, 0x62, + 0x76, 0x56, 0x64, 0x13, 0xab, 0x58, 0x99, 0xe, 0xb3, 0x7b, 0x4f, + 0x59, 0x4b, 0xdf, 0x29, 0x12, 0x32, 0xef, 0xa, 0x1c, 0x5c}, + {0xa1, 0x7f, 0x4f, 0x31, 0xbf, 0x2a, 0x40, 0xa9, 0x50, 0xf4, 0x8c, + 0x8e, 0xdc, 0xf1, 0x57, 0xe2, 0x84, 0xbe, 0xa8, 0x23, 0x4b, 0xd5, + 0xbb, 0x1d, 0x3b, 0x71, 0xcb, 0x6d, 0xa3, 0xbf, 0x77, 0x21}, + {0x8f, 0xdb, 0x79, 0xfa, 0xbc, 0x1b, 0x8, 0x37, 0xb3, 0x59, 0x5f, + 0xc2, 0x1e, 0x81, 0x48, 0x60, 0x87, 0x24, 0x83, 0x9c, 0x65, 0x76, + 0x7a, 0x8, 0xbb, 0xb5, 0x8a, 0x7d, 0x38, 0x19, 0xe6, 0x4a}, + }, + { + {0x83, 0xfb, 0x5b, 0x98, 0x44, 0x7e, 0x11, 0x61, 0x36, 0x31, 0x96, + 0x71, 0x2a, 0x46, 0xe0, 0xfc, 0x4b, 0x90, 0x25, 0xd4, 0x48, 0x34, + 0xac, 0x83, 0x64, 0x3d, 0xa4, 0x5b, 0xbe, 0x5a, 0x68, 0x75}, + {0x2e, 0xa3, 0x44, 0x53, 0xaa, 0xf6, 0xdb, 0x8d, 0x78, 0x40, 0x1b, + 0xb4, 0xb4, 0xea, 0x88, 0x7d, 0x60, 0xd, 0x13, 0x4a, 0x97, 0xeb, + 
0xb0, 0x5e, 0x3, 0x3e, 0xbf, 0x17, 0x1b, 0xd9, 0x0, 0x1a}, + {0xb2, 0xf2, 0x61, 0xeb, 0x33, 0x9, 0x96, 0x6e, 0x52, 0x49, 0xff, + 0xc9, 0xa8, 0xf, 0x3d, 0x54, 0x69, 0x65, 0xf6, 0x7a, 0x10, 0x75, + 0x72, 0xdf, 0xaa, 0xe6, 0xb0, 0x23, 0xb6, 0x29, 0x55, 0x13}, + }, + { + {0xfe, 0x83, 0x2e, 0xe2, 0xbc, 0x16, 0xc7, 0xf5, 0xc1, 0x85, 0x9, + 0xe8, 0x19, 0xeb, 0x2b, 0xb4, 0xae, 0x4a, 0x25, 0x14, 0x37, 0xa6, + 0x9d, 0xec, 0x13, 0xa6, 0x90, 0x15, 0x5, 0xea, 0x72, 0x59}, + {0x18, 0xd5, 0xd1, 0xad, 0xd7, 0xdb, 0xf0, 0x18, 0x11, 0x1f, 0xc1, + 0xcf, 0x88, 0x78, 0x9f, 0x97, 0x9b, 0x75, 0x14, 0x71, 0xf0, 0xe1, + 0x32, 0x87, 0x1, 0x3a, 0xca, 0x65, 0x1a, 0xb8, 0xb5, 0x79}, + {0x11, 0x78, 0x8f, 0xdc, 0x20, 0xac, 0xd4, 0xf, 0xa8, 0x4f, 0x4d, + 0xac, 0x94, 0xd2, 0x9a, 0x9a, 0x34, 0x4, 0x36, 0xb3, 0x64, 0x2d, + 0x1b, 0xc0, 0xdb, 0x3b, 0x5f, 0x90, 0x95, 0x9c, 0x7e, 0x4f}, + }, + { + {0xfe, 0x99, 0x52, 0x35, 0x3d, 0x44, 0xc8, 0x71, 0xd7, 0xea, 0xeb, + 0xdb, 0x1c, 0x3b, 0xcd, 0x8b, 0x66, 0x94, 0xa4, 0xf1, 0x9e, 0x49, + 0x92, 0x80, 0xc8, 0xad, 0x44, 0xa1, 0xc4, 0xee, 0x42, 0x19}, + {0x2e, 0x30, 0x81, 0x57, 0xbc, 0x4b, 0x67, 0x62, 0xf, 0xdc, 0xad, + 0x89, 0x39, 0xf, 0x52, 0xd8, 0xc6, 0xd9, 0xfb, 0x53, 0xae, 0x99, + 0x29, 0x8c, 0x4c, 0x8e, 0x63, 0x2e, 0xd9, 0x3a, 0x99, 0x31}, + {0x92, 0x49, 0x23, 0xae, 0x19, 0x53, 0xac, 0x7d, 0x92, 0x3e, 0xea, + 0xc, 0x91, 0x3d, 0x1b, 0x2c, 0x22, 0x11, 0x3c, 0x25, 0x94, 0xe4, + 0x3c, 0x55, 0x75, 0xca, 0xf9, 0x4e, 0x31, 0x65, 0xa, 0x2a}, + }, + { + {0x3a, 0x79, 0x1c, 0x3c, 0xcd, 0x1a, 0x36, 0xcf, 0x3b, 0xbc, 0x35, + 0x5a, 0xac, 0xbc, 0x9e, 0x2f, 0xab, 0xa6, 0xcd, 0xa8, 0xe9, 0x60, + 0xe8, 0x60, 0x13, 0x1a, 0xea, 0x6d, 0x9b, 0xc3, 0x5d, 0x5}, + {0xc2, 0x27, 0xf9, 0xf7, 0x7f, 0x93, 0xb7, 0x2d, 0x35, 0xa6, 0xd0, + 0x17, 0x6, 0x1f, 0x74, 0xdb, 0x76, 0xaf, 0x55, 0x11, 0xa2, 0xf3, + 0x82, 0x59, 0xed, 0x2d, 0x7c, 0x64, 0x18, 0xe2, 0xf6, 0x4c}, + {0xb6, 0x5b, 0x8d, 0xc2, 0x7c, 0x22, 0x19, 0xb1, 0xab, 0xff, 0x4d, + 0x77, 0xbc, 0x4e, 0xe2, 0x7, 0x89, 0x2c, 0xa3, 0xe4, 
0xce, 0x78, + 0x3c, 0xa8, 0xb6, 0x24, 0xaa, 0x10, 0x77, 0x30, 0x1a, 0x12}, + }, + { + {0xc9, 0x83, 0x74, 0xc7, 0x3e, 0x71, 0x59, 0xd6, 0xaf, 0x96, 0x2b, + 0xb8, 0x77, 0xe0, 0xbf, 0x88, 0xd3, 0xbc, 0x97, 0x10, 0x23, 0x28, + 0x9e, 0x28, 0x9b, 0x3a, 0xed, 0x6c, 0x4a, 0xb9, 0x7b, 0x52}, + {0x97, 0x4a, 0x3, 0x9f, 0x5e, 0x5d, 0xdb, 0xe4, 0x2d, 0xbc, 0x34, + 0x30, 0x9, 0xfc, 0x53, 0xe1, 0xb1, 0xd3, 0x51, 0x95, 0x91, 0x46, + 0x5, 0x46, 0x2d, 0xe5, 0x40, 0x7a, 0x6c, 0xc7, 0x3f, 0x33}, + {0x2e, 0x48, 0x5b, 0x99, 0x2a, 0x99, 0x3d, 0x56, 0x1, 0x38, 0x38, + 0x6e, 0x7c, 0xd0, 0x5, 0x34, 0xe5, 0xd8, 0x64, 0x2f, 0xde, 0x35, + 0x50, 0x48, 0xf7, 0xa9, 0xa7, 0x20, 0x9b, 0x6, 0x89, 0x6b}, + }, + { + {0x77, 0xdb, 0xc7, 0xb5, 0x8c, 0xfa, 0x82, 0x40, 0x55, 0xc1, 0x34, + 0xc7, 0xf8, 0x86, 0x86, 0x6, 0x7e, 0xa5, 0xe7, 0xf6, 0xd9, 0xc8, + 0xe6, 0x29, 0xcf, 0x9b, 0x63, 0xa7, 0x8, 0xd3, 0x73, 0x4}, + {0xd, 0x22, 0x70, 0x62, 0x41, 0xa0, 0x2a, 0x81, 0x4e, 0x5b, 0x24, + 0xf9, 0xfa, 0x89, 0x5a, 0x99, 0x5, 0xef, 0x72, 0x50, 0xce, 0xc4, + 0xad, 0xff, 0x73, 0xeb, 0x73, 0xaa, 0x3, 0x21, 0xbc, 0x23}, + {0x5, 0x9e, 0x58, 0x3, 0x26, 0x79, 0xee, 0xca, 0x92, 0xc4, 0xdc, + 0x46, 0x12, 0x42, 0x4b, 0x2b, 0x4f, 0xa9, 0x1, 0xe6, 0x74, 0xef, + 0xa1, 0x2, 0x1a, 0x34, 0x4, 0xde, 0xbf, 0x73, 0x2f, 0x10}, }, + }, + { { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1287670217537834, 1222355136884920, 1846481788678694, - 1150426571265110, 1613523400722047 -#else - 31947050, 19187781, 62468280, 18214510, 51982886, 27514722, - 52352086, 17142691, 19072639, 24043372 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 793388516527298, 1315457083650035, 1972286999342417, - 1901825953052455, 338269477222410 -#else - 11685058, 11822410, 3158003, 19601838, 33402193, 29389366, - 5977895, 28339415, 473098, 5040608 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 550201530671806, 778605267108140, 2063911101902983, - 115500557286349, 2041641272971022 -#else - 46817982, 8198641, 39698732, 11602122, 
1290375, 30754672, - 28326861, 1721092, 47550222, 30422825 -#endif - }}, + {0x9a, 0x1c, 0x51, 0xb5, 0xe0, 0xda, 0xb4, 0xa2, 0x6, 0xff, 0xff, + 0x2b, 0x29, 0x60, 0xc8, 0x7a, 0x34, 0x42, 0x50, 0xf5, 0x5d, 0x37, + 0x1f, 0x98, 0x2d, 0xa1, 0x4e, 0xda, 0x25, 0xd7, 0x6b, 0x3f}, + {0xc6, 0x45, 0x57, 0x7f, 0xab, 0xb9, 0x18, 0xeb, 0x90, 0xc6, 0x87, + 0x57, 0xee, 0x8a, 0x3a, 0x2, 0xa9, 0xaf, 0xf7, 0x2d, 0xda, 0x12, + 0x27, 0xb7, 0x3d, 0x1, 0x5c, 0xea, 0x25, 0x7d, 0x59, 0x36}, + {0xac, 0x58, 0x60, 0x10, 0x7b, 0x8d, 0x4d, 0x73, 0x5f, 0x90, 0xc6, + 0x6f, 0x9e, 0x57, 0x40, 0xd9, 0x2d, 0x93, 0x2, 0x92, 0xf9, 0xf8, + 0x66, 0x64, 0xd0, 0xd6, 0x60, 0xda, 0x19, 0xcc, 0x7e, 0x7b}, + }, + { + {0x9b, 0xfa, 0x7c, 0xa7, 0x51, 0x4a, 0xae, 0x6d, 0x50, 0x86, 0xa3, + 0xe7, 0x54, 0x36, 0x26, 0x82, 0xdb, 0x82, 0x2d, 0x8f, 0xcd, 0xff, + 0xbb, 0x9, 0xba, 0xca, 0xf5, 0x1b, 0x66, 0xdc, 0xbe, 0x3}, + {0xd, 0x69, 0x5c, 0x69, 0x3c, 0x37, 0xc2, 0x78, 0x6e, 0x90, 0x42, + 0x6, 0x66, 0x2e, 0x25, 0xdd, 0xd2, 0x2b, 0xe1, 0x4a, 0x44, 0x44, + 0x1d, 0x95, 0x56, 0x39, 0x74, 0x1, 0x76, 0xad, 0x35, 0x42}, + {0xf5, 0x75, 0x89, 0x7, 0xd, 0xcb, 0x58, 0x62, 0x98, 0xf2, 0x89, + 0x91, 0x54, 0x42, 0x29, 0x49, 0xe4, 0x6e, 0xe3, 0xe2, 0x23, 0xb4, + 0xca, 0xa0, 0xa1, 0x66, 0xf0, 0xcd, 0xb0, 0xe2, 0x7c, 0xe}, + }, + { + {0xf9, 0x70, 0x4b, 0xd9, 0xdf, 0xfe, 0xa6, 0xfe, 0x2d, 0xba, 0xfc, + 0xc1, 0x51, 0xc0, 0x30, 0xf1, 0x89, 0xab, 0x2f, 0x7f, 0x7e, 0xd4, + 0x82, 0x48, 0xb5, 0xee, 0xec, 0x8a, 0x13, 0x56, 0x52, 0x61}, + {0xa3, 0x85, 0x8c, 0xc4, 0x3a, 0x64, 0x94, 0xc4, 0xad, 0x39, 0x61, + 0x3c, 0xf4, 0x1d, 0x36, 0xfd, 0x48, 0x4d, 0xe9, 0x3a, 0xdd, 0x17, + 0xdb, 0x9, 0x4a, 0x67, 0xb4, 0x8f, 0x5d, 0xa, 0x6e, 0x66}, + {0xd, 0xcb, 0x70, 0x48, 0x4e, 0xf6, 0xbb, 0x2a, 0x6b, 0x8b, 0x45, + 0xaa, 0xf0, 0xbc, 0x65, 0xcd, 0x5d, 0x98, 0xe8, 0x75, 0xba, 0x4e, + 0xbe, 0x9a, 0xe4, 0xde, 0x14, 0xd5, 0x10, 0xc8, 0xb, 0x7f}, + }, + { + {0xa0, 0x13, 0x72, 0x73, 0xad, 0x9d, 0xac, 0x83, 0x98, 0x2e, 0xf7, + 0x2e, 0xba, 0xf8, 0xf6, 0x9f, 0x57, 0x69, 
0xec, 0x43, 0xdd, 0x2e, + 0x1e, 0x31, 0x75, 0xab, 0xc5, 0xde, 0x7d, 0x90, 0x3a, 0x1d}, + {0x6f, 0x13, 0xf4, 0x26, 0xa4, 0x6b, 0x0, 0xb9, 0x35, 0x30, 0xe0, + 0x57, 0x9e, 0x36, 0x67, 0x8d, 0x28, 0x3c, 0x46, 0x4f, 0xd9, 0xdf, + 0xc8, 0xcb, 0xf5, 0xdb, 0xee, 0xf8, 0xbc, 0x8d, 0x1f, 0xd}, + {0xdc, 0x81, 0xd0, 0x3e, 0x31, 0x93, 0x16, 0xba, 0x80, 0x34, 0x1b, + 0x85, 0xad, 0x9f, 0x32, 0x29, 0xcb, 0x21, 0x3, 0x3, 0x3c, 0x1, + 0x28, 0x1, 0xe3, 0xfd, 0x1b, 0xa3, 0x44, 0x1b, 0x1, 0x0}, + }, + { + {0x5c, 0xa7, 0xa, 0x6a, 0x69, 0x1f, 0x56, 0x16, 0x6a, 0xbd, 0x52, + 0x58, 0x5c, 0x72, 0xbf, 0xc1, 0xad, 0x66, 0x79, 0x9a, 0x7f, 0xdd, + 0xa8, 0x11, 0x26, 0x10, 0x85, 0xd2, 0xa2, 0x88, 0xd9, 0x63}, + {0xc, 0x6c, 0xc6, 0x3f, 0x6c, 0xa0, 0xdf, 0x3f, 0xd2, 0xd, 0xd6, + 0x4d, 0x8e, 0xe3, 0x40, 0x5d, 0x71, 0x4d, 0x8e, 0x26, 0x38, 0x8b, + 0xe3, 0x7a, 0xe1, 0x57, 0x83, 0x6e, 0x91, 0x8d, 0xc4, 0x3a}, + {0x2e, 0x23, 0xbd, 0xaf, 0x53, 0x7, 0x12, 0x0, 0x83, 0xf6, 0xd8, + 0xfd, 0xb8, 0xce, 0x2b, 0xe9, 0x91, 0x2b, 0xe7, 0x84, 0xb3, 0x69, + 0x16, 0xf8, 0x66, 0xa0, 0x68, 0x23, 0x2b, 0xd5, 0xfa, 0x33}, + }, + { + {0xe8, 0xcf, 0x22, 0xc4, 0xd0, 0xc8, 0x2c, 0x8d, 0xcb, 0x3a, 0xa1, + 0x5, 0x7b, 0x4f, 0x2b, 0x7, 0x6f, 0xa5, 0xf6, 0xec, 0xe6, 0xb6, + 0xfe, 0xa3, 0xe2, 0x71, 0xa, 0xb9, 0xcc, 0x55, 0xc3, 0x3c}, + {0x16, 0x1e, 0xe4, 0xc5, 0xc6, 0x49, 0x6, 0x54, 0x35, 0x77, 0x3f, + 0x33, 0x30, 0x64, 0xf8, 0xa, 0x46, 0xe7, 0x5, 0xf3, 0xd2, 0xfc, + 0xac, 0xb2, 0xa7, 0xdc, 0x56, 0xa2, 0x29, 0xf4, 0xc0, 0x16}, + {0x31, 0x91, 0x3e, 0x90, 0x43, 0x94, 0xb6, 0xe9, 0xce, 0x37, 0x56, + 0x7a, 0xcb, 0x94, 0xa4, 0xb8, 0x44, 0x92, 0xba, 0xba, 0xa4, 0xd1, + 0x7c, 0xc8, 0x68, 0x75, 0xae, 0x6b, 0x42, 0xaf, 0x1e, 0x63}, + }, + { + {0xe8, 0xd, 0x70, 0xa3, 0xb9, 0x75, 0xd9, 0x47, 0x52, 0x5, 0xf8, + 0xe2, 0xfb, 0xc5, 0x80, 0x72, 0xe1, 0x5d, 0xe4, 0x32, 0x27, 0x8f, + 0x65, 0x53, 0xb5, 0x80, 0x5f, 0x66, 0x7f, 0x2c, 0x1f, 0x43}, + {0x9f, 0xfe, 0x66, 0xda, 0x10, 0x4, 0xe9, 0xb3, 0xa6, 0xe5, 0x16, + 0x6c, 0x52, 0x4b, 0xdd, 0x85, 0x83, 
0xbf, 0xf9, 0x1e, 0x61, 0x97, + 0x3d, 0xbc, 0xb5, 0x19, 0xa9, 0x1e, 0x8b, 0x64, 0x99, 0x55}, + {0x19, 0x7b, 0x8f, 0x85, 0x44, 0x63, 0x2, 0xd6, 0x4a, 0x51, 0xea, + 0xa1, 0x2f, 0x35, 0xab, 0x14, 0xd7, 0xa9, 0x90, 0x20, 0x1a, 0x44, + 0x0, 0x89, 0x26, 0x3b, 0x25, 0x91, 0x5f, 0x71, 0x4, 0x7b}, + }, + { + {0xc6, 0xba, 0xe6, 0xc4, 0x80, 0xc2, 0x76, 0xb3, 0xb, 0x9b, 0x1d, + 0x6d, 0xdd, 0xd3, 0xe, 0x97, 0x44, 0xf9, 0xb, 0x45, 0x58, 0x95, + 0x9a, 0xb0, 0x23, 0xe2, 0xcd, 0x57, 0xfa, 0xac, 0xd0, 0x48}, + {0x43, 0xae, 0xf6, 0xac, 0x28, 0xbd, 0xed, 0x83, 0xb4, 0x7a, 0x5c, + 0x7d, 0x8b, 0x7c, 0x35, 0x86, 0x44, 0x2c, 0xeb, 0xb7, 0x69, 0x47, + 0x40, 0xc0, 0x3f, 0x58, 0xf6, 0xc2, 0xf5, 0x7b, 0xb3, 0x59}, + {0x71, 0xe6, 0xab, 0x7d, 0xe4, 0x26, 0xf, 0xb6, 0x37, 0x3a, 0x2f, + 0x62, 0x97, 0xa1, 0xd1, 0xf1, 0x94, 0x3, 0x96, 0xe9, 0x7e, 0xce, + 0x8, 0x42, 0xdb, 0x3b, 0x6d, 0x33, 0x91, 0x41, 0x23, 0x16}, }, }, { { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 717255318455100, 519313764361315, 2080406977303708, - 541981206705521, 774328150311600 -#else - 7881532, 10687937, 7578723, 7738378, 48157852, 31000479, - 21820785, 8076149, 39240368, 11538388 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 261715221532238, 1795354330069993, 1496878026850283, - 499739720521052, 389031152673770 -#else - 47173198, 3899860, 18283497, 26752864, 51380203, 22305220, - 8754524, 7446702, 61432810, 5797015 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1997217696294013, 1717306351628065, 1684313917746180, - 1644426076011410, 1857378133465451 -#else - 55813245, 29760862, 51326753, 25589858, 12708868, 25098233, - 2014098, 24503858, 64739691, 27677090 -#endif - }}, + {0x40, 0x86, 0xf3, 0x1f, 0xd6, 0x9c, 0x49, 0xdd, 0xa0, 0x25, 0x36, + 0x6, 0xc3, 0x9b, 0xcd, 0x29, 0xc3, 0x3d, 0xd7, 0x3d, 0x2, 0xd8, + 0xe2, 0x51, 0x31, 0x92, 0x3b, 0x20, 0x7a, 0x70, 0x25, 0x4a}, + {0xf6, 0x7f, 0x26, 0xf6, 0xde, 0x99, 0xe4, 0xb9, 0x43, 0x8, 0x2c, + 0x74, 0x7b, 0xca, 0x72, 0x77, 0xb1, 0xf2, 0xa4, 0xe9, 
0x3f, 0x15, + 0xa0, 0x23, 0x6, 0x50, 0xd0, 0xd5, 0xec, 0xdf, 0xdf, 0x2c}, + {0x6a, 0xed, 0xf6, 0x53, 0x8a, 0x66, 0xb7, 0x2a, 0xa1, 0x70, 0xd1, + 0x1d, 0x58, 0x42, 0x42, 0x30, 0x61, 0x1, 0xe2, 0x3a, 0x4c, 0x14, + 0x0, 0x40, 0xfc, 0x49, 0x8e, 0x24, 0x6d, 0x89, 0x21, 0x57}, + }, + { + {0x4e, 0xda, 0xd0, 0xa1, 0x91, 0x50, 0x5d, 0x28, 0x8, 0x3e, 0xfe, + 0xb5, 0xa7, 0x6f, 0xaa, 0x4b, 0xb3, 0x93, 0x93, 0xe1, 0x7c, 0x17, + 0xe5, 0x63, 0xfd, 0x30, 0xb0, 0xc4, 0xaf, 0x35, 0xc9, 0x3}, + {0xae, 0x1b, 0x18, 0xfd, 0x17, 0x55, 0x6e, 0xb, 0xb4, 0x63, 0xb9, + 0x2b, 0x9f, 0x62, 0x22, 0x90, 0x25, 0x46, 0x6, 0x32, 0xe9, 0xbc, + 0x9, 0x55, 0xda, 0x13, 0x3c, 0xf6, 0x74, 0xdd, 0x8e, 0x57}, + {0x3d, 0xc, 0x2b, 0x49, 0xc6, 0x76, 0x72, 0x99, 0xfc, 0x5, 0xe2, + 0xdf, 0xc4, 0xc2, 0xcc, 0x47, 0x3c, 0x3a, 0x62, 0xdd, 0x84, 0x9b, + 0xd2, 0xdc, 0xa2, 0xc7, 0x88, 0x2, 0x59, 0xab, 0xc2, 0x3e}, + }, + { + {0xcb, 0xd1, 0x32, 0xae, 0x9, 0x3a, 0x21, 0xa7, 0xd5, 0xc2, 0xf5, + 0x40, 0xdf, 0x87, 0x2b, 0xf, 0x29, 0xab, 0x1e, 0xe8, 0xc6, 0xa4, + 0xae, 0xb, 0x5e, 0xac, 0xdb, 0x6a, 0x6c, 0xf6, 0x1b, 0xe}, + {0xb9, 0x7b, 0xd8, 0xe4, 0x7b, 0xd2, 0xa0, 0xa1, 0xed, 0x1a, 0x39, + 0x61, 0xeb, 0x4d, 0x8b, 0xa9, 0x83, 0x9b, 0xcb, 0x73, 0xd0, 0xdd, + 0xa0, 0x99, 0xce, 0xca, 0xf, 0x20, 0x5a, 0xc2, 0xd5, 0x2d}, + {0x7e, 0x88, 0x2c, 0x79, 0xe9, 0xd5, 0xab, 0xe2, 0x5d, 0x6d, 0x92, + 0xcb, 0x18, 0x0, 0x2, 0x1a, 0x1e, 0x5f, 0xae, 0xba, 0xcd, 0x69, + 0xba, 0xbf, 0x5f, 0x8f, 0xe8, 0x5a, 0xb3, 0x48, 0x5, 0x73}, + }, + { + {0x34, 0xe3, 0xd6, 0xa1, 0x4b, 0x9, 0x5b, 0x80, 0x19, 0x3f, 0x35, + 0x9, 0x77, 0xf1, 0x3e, 0xbf, 0x2b, 0x70, 0x22, 0x6, 0xcb, 0x6, + 0x3f, 0x42, 0xdd, 0x45, 0x78, 0xd8, 0x77, 0x22, 0x5a, 0x58}, + {0xee, 0xb8, 0xa8, 0xcb, 0xa3, 0x51, 0x35, 0xc4, 0x16, 0x5f, 0x11, + 0xb2, 0x1d, 0x6f, 0xa2, 0x65, 0x50, 0x38, 0x8c, 0xab, 0x52, 0x4f, + 0xf, 0x76, 0xca, 0xb8, 0x1d, 0x41, 0x3b, 0x44, 0x43, 0x30}, + {0x62, 0x89, 0xd4, 0x33, 0x82, 0x5f, 0x8a, 0xa1, 0x7f, 0x25, 0x78, + 0xec, 0xb5, 0xc4, 0x98, 0x66, 0xff, 0x41, 0x3e, 
0x37, 0xa5, 0x6f, + 0x8e, 0xa7, 0x1f, 0x98, 0xef, 0x50, 0x89, 0x27, 0x56, 0x76}, + }, + { + {0x9d, 0xcf, 0x86, 0xea, 0xa3, 0x73, 0x70, 0xe1, 0xdc, 0x5f, 0x15, + 0x7, 0xb7, 0xfb, 0x8c, 0x3a, 0x8e, 0x8a, 0x83, 0x31, 0xfc, 0xe7, + 0x53, 0x48, 0x16, 0xf6, 0x13, 0xb6, 0x84, 0xf4, 0xbb, 0x28}, + {0xc0, 0xc8, 0x1f, 0xd5, 0x59, 0xcf, 0xc3, 0x38, 0xf2, 0xb6, 0x6, + 0x5, 0xfd, 0xd2, 0xed, 0x9b, 0x8f, 0xe, 0x57, 0xab, 0x9f, 0x10, + 0xbf, 0x26, 0xa6, 0x46, 0xb8, 0xc1, 0xa8, 0x60, 0x41, 0x3f}, + {0x7c, 0x6c, 0x13, 0x6f, 0x5c, 0x2f, 0x61, 0xf2, 0xbe, 0x11, 0xdd, + 0xf6, 0x7, 0xd1, 0xea, 0xaf, 0x33, 0x6f, 0xde, 0x13, 0xd2, 0x9a, + 0x7e, 0x52, 0x5d, 0xf7, 0x88, 0x81, 0x35, 0xcb, 0x79, 0x1e}, + }, + { + {0x81, 0x81, 0xe0, 0xf5, 0xd8, 0x53, 0xe9, 0x77, 0xd9, 0xde, 0x9d, + 0x29, 0x44, 0xc, 0xa5, 0x84, 0xe5, 0x25, 0x45, 0x86, 0xc, 0x2d, + 0x6c, 0xdc, 0xf4, 0xf2, 0xd1, 0x39, 0x2d, 0xb5, 0x8a, 0x47}, + {0xf1, 0xe3, 0xf7, 0xee, 0xc3, 0x36, 0x34, 0x1, 0xf8, 0x10, 0x9e, + 0xfe, 0x7f, 0x6a, 0x8b, 0x82, 0xfc, 0xde, 0xf9, 0xbc, 0xe5, 0x8, + 0xf9, 0x7f, 0x31, 0x38, 0x3b, 0x3a, 0x1b, 0x95, 0xd7, 0x65}, + {0x59, 0xd1, 0x52, 0x92, 0xd3, 0xa4, 0xa6, 0x66, 0x7, 0xc8, 0x1a, + 0x87, 0xbc, 0xe1, 0xdd, 0xe5, 0x6f, 0xc9, 0xc1, 0xa6, 0x40, 0x6b, + 0x2c, 0xb8, 0x14, 0x22, 0x21, 0x1a, 0x41, 0x7a, 0xd8, 0x16}, + }, + { + {0x83, 0x5, 0x4e, 0xd5, 0xe2, 0xd5, 0xa4, 0xfb, 0xfa, 0x99, 0xbd, + 0x2e, 0xd7, 0xaf, 0x1f, 0xe2, 0x8f, 0x77, 0xe9, 0x6e, 0x73, 0xc2, + 0x7a, 0x49, 0xde, 0x6d, 0x5a, 0x7a, 0x57, 0xb, 0x99, 0x1f}, + {0x15, 0x62, 0x6, 0x42, 0x5a, 0x7e, 0xbd, 0xb3, 0xc1, 0x24, 0x5a, + 0xc, 0xcd, 0xe3, 0x9b, 0x87, 0xb7, 0x94, 0xf9, 0xd6, 0xb1, 0x5d, + 0xc0, 0x57, 0xa6, 0x8c, 0xf3, 0x65, 0x81, 0x7c, 0xf8, 0x28}, + {0xd6, 0xf7, 0xe8, 0x1b, 0xad, 0x4e, 0x34, 0xa3, 0x8f, 0x79, 0xea, + 0xac, 0xeb, 0x50, 0x1e, 0x7d, 0x52, 0xe0, 0xd, 0x52, 0x9e, 0x56, + 0xc6, 0x77, 0x3e, 0x6d, 0x4d, 0x53, 0xe1, 0x2f, 0x88, 0x45}, + }, + { + {0xe4, 0x6f, 0x3c, 0x94, 0x29, 0x99, 0xac, 0xd8, 0xa2, 0x92, 0x83, + 0xa3, 0x61, 0xf1, 0xf9, 
0xb5, 0xf3, 0x9a, 0xc8, 0xbe, 0x13, 0xdb, + 0x99, 0x26, 0x74, 0xf0, 0x5, 0xe4, 0x3c, 0x84, 0xcf, 0x7d}, + {0xd6, 0x83, 0x79, 0x75, 0x5d, 0x34, 0x69, 0x66, 0xa6, 0x11, 0xaa, + 0x17, 0x11, 0xed, 0xb6, 0x62, 0x8f, 0x12, 0x5e, 0x98, 0x57, 0x18, + 0xdd, 0x7d, 0xdd, 0xf6, 0x26, 0xf6, 0xb8, 0xe5, 0x8f, 0x68}, + {0xc0, 0x32, 0x47, 0x4a, 0x48, 0xd6, 0x90, 0x6c, 0x99, 0x32, 0x56, + 0xca, 0xfd, 0x43, 0x21, 0xd5, 0xe1, 0xc6, 0x5d, 0x91, 0xc3, 0x28, + 0xbe, 0xb3, 0x1b, 0x19, 0x27, 0x73, 0x7e, 0x68, 0x39, 0x67}, }, + }, + { { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1475434724792648, 76931896285979, 1116729029771667, - 2002544139318042, 725547833803938 -#else - 44636488, 21985690, 39426843, 1146374, 18956691, 16640559, - 1192730, 29840233, 15123618, 10811505 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2022306639183567, 726296063571875, 315345054448644, - 1058733329149221, 1448201136060677 -#else - 14352079, 30134717, 48166819, 10822654, 32750596, 4699007, - 67038501, 15776355, 38222085, 21579878 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1710065158525665, 1895094923036397, 123988286168546, - 1145519900776355, 1607510767693874 -#else - 38867681, 25481956, 62129901, 28239114, 29416930, 1847569, - 46454691, 17069576, 4714546, 23953777 -#endif - }}, + {0xc0, 0x1a, 0xc, 0xc8, 0x9d, 0xcc, 0x6d, 0xa6, 0x36, 0xa4, 0x38, + 0x1b, 0xf4, 0x5c, 0xa0, 0x97, 0xc6, 0xd7, 0xdb, 0x95, 0xbe, 0xf3, + 0xeb, 0xa7, 0xab, 0x7d, 0x7e, 0x8d, 0xf6, 0xb8, 0xa0, 0x7d}, + {0xa6, 0x75, 0x56, 0x38, 0x14, 0x20, 0x78, 0xef, 0xe8, 0xa9, 0xfd, + 0xaa, 0x30, 0x9f, 0x64, 0xa2, 0xcb, 0xa8, 0xdf, 0x5c, 0x50, 0xeb, + 0xd1, 0x4c, 0xb3, 0xc0, 0x4d, 0x1d, 0xba, 0x5a, 0x11, 0x46}, + {0x76, 0xda, 0xb5, 0xc3, 0x53, 0x19, 0xf, 0xd4, 0x9b, 0x9e, 0x11, + 0x21, 0x73, 0x6f, 0xac, 0x1d, 0x60, 0x59, 0xb2, 0xfe, 0x21, 0x60, + 0xcc, 0x3, 0x4b, 0x4b, 0x67, 0x83, 0x7e, 0x88, 0x5f, 0x5a}, + }, + { + {0xb9, 0x43, 0xa6, 0xa0, 0xd3, 0x28, 0x96, 0x9e, 0x64, 0x20, 0xc3, + 0xe6, 0x0, 0xcb, 0xc3, 
0xb5, 0x32, 0xec, 0x2d, 0x7c, 0x89, 0x2, + 0x53, 0x9b, 0xc, 0xc7, 0xd1, 0xd5, 0xe2, 0x7a, 0xe3, 0x43}, + {0x11, 0x3d, 0xa1, 0x70, 0xcf, 0x1, 0x63, 0x8f, 0xc4, 0xd0, 0xd, + 0x35, 0x15, 0xb8, 0xce, 0xcf, 0x7e, 0xa4, 0xbc, 0xa4, 0xd4, 0x97, + 0x2, 0xf7, 0x34, 0x14, 0x4d, 0xe4, 0x56, 0xb6, 0x69, 0x36}, + {0x33, 0xe1, 0xa6, 0xed, 0x6, 0x3f, 0x7e, 0x38, 0xc0, 0x3a, 0xa1, + 0x99, 0x51, 0x1d, 0x30, 0x67, 0x11, 0x38, 0x26, 0x36, 0xf8, 0xd8, + 0x5a, 0xbd, 0xbe, 0xe9, 0xd5, 0x4f, 0xcd, 0xe6, 0x21, 0x6a}, + }, + { + {0xe3, 0xb2, 0x99, 0x66, 0x12, 0x29, 0x41, 0xef, 0x1, 0x13, 0x8d, + 0x70, 0x47, 0x8, 0xd3, 0x71, 0xbd, 0xb0, 0x82, 0x11, 0xd0, 0x32, + 0x54, 0x32, 0x36, 0x8b, 0x1e, 0x0, 0x7, 0x1b, 0x37, 0x45}, + {0x5f, 0xe6, 0x46, 0x30, 0xa, 0x17, 0xc6, 0xf1, 0x24, 0x35, 0xd2, + 0x0, 0x2a, 0x2a, 0x71, 0x58, 0x55, 0xb7, 0x82, 0x8c, 0x3c, 0xbd, + 0xdb, 0x69, 0x57, 0xff, 0x95, 0xa1, 0xf1, 0xf9, 0x6b, 0x58}, + {0xb, 0x79, 0xf8, 0x5e, 0x8d, 0x8, 0xdb, 0xa6, 0xe5, 0x37, 0x9, + 0x61, 0xdc, 0xf0, 0x78, 0x52, 0xb8, 0x6e, 0xa1, 0x61, 0xd2, 0x49, + 0x3, 0xac, 0x79, 0x21, 0xe5, 0x90, 0x37, 0xb0, 0xaf, 0xe}, + }, + { + {0x1d, 0xae, 0x75, 0xf, 0x5e, 0x80, 0x40, 0x51, 0x30, 0xcc, 0x62, + 0x26, 0xe3, 0xfb, 0x2, 0xec, 0x6d, 0x39, 0x92, 0xea, 0x1e, 0xdf, + 0xeb, 0x2c, 0xb3, 0x5b, 0x43, 0xc5, 0x44, 0x33, 0xae, 0x44}, + {0x2f, 0x4, 0x48, 0x37, 0xc1, 0x55, 0x5, 0x96, 0x11, 0xaa, 0xb, + 0x82, 0xe6, 0x41, 0x9a, 0x21, 0xc, 0x6d, 0x48, 0x73, 0x38, 0xf7, + 0x81, 0x1c, 0x61, 0xc6, 0x2, 0x5a, 0x67, 0xcc, 0x9a, 0x30}, + {0xee, 0x43, 0xa5, 0xbb, 0xb9, 0x89, 0xf2, 0x9c, 0x42, 0x71, 0xc9, + 0x5a, 0x9d, 0xe, 0x76, 0xf3, 0xaa, 0x60, 0x93, 0x4f, 0xc6, 0xe5, + 0x82, 0x1d, 0x8f, 0x67, 0x94, 0x7f, 0x1b, 0x22, 0xd5, 0x62}, + }, + { + {0x3c, 0x7a, 0xf7, 0x3a, 0x26, 0xd4, 0x85, 0x75, 0x4d, 0x14, 0xe9, + 0xfe, 0x11, 0x7b, 0xae, 0xdf, 0x3d, 0x19, 0xf7, 0x59, 0x80, 0x70, + 0x6, 0xa5, 0x37, 0x20, 0x92, 0x83, 0x53, 0x9a, 0xf2, 0x14}, + {0x6d, 0x93, 0xd0, 0x18, 0x9c, 0x29, 0x4c, 0x52, 0xc, 0x1a, 0xc, + 0x8a, 0x6c, 0xb5, 0x6b, 
0xc8, 0x31, 0x86, 0x4a, 0xdb, 0x2e, 0x5, + 0x75, 0xa3, 0x62, 0x45, 0x75, 0xbc, 0xe4, 0xfd, 0xe, 0x5c}, + {0xf5, 0xd7, 0xb2, 0x25, 0xdc, 0x7e, 0x71, 0xdf, 0x40, 0x30, 0xb5, + 0x99, 0xdb, 0x70, 0xf9, 0x21, 0x62, 0x4c, 0xed, 0xc3, 0xb7, 0x34, + 0x92, 0xda, 0x3e, 0x9, 0xee, 0x7b, 0x5c, 0x36, 0x72, 0x5e}, + }, + { + {0x3e, 0xb3, 0x8, 0x2f, 0x6, 0x39, 0x93, 0x7d, 0xbe, 0x32, 0x9f, + 0xdf, 0xe5, 0x59, 0x96, 0x5b, 0xfd, 0xbd, 0x9e, 0x1f, 0xad, 0x3d, + 0xff, 0xac, 0xb7, 0x49, 0x73, 0xcb, 0x55, 0x5, 0xb2, 0x70}, + {0x7f, 0x21, 0x71, 0x45, 0x7, 0xfc, 0x5b, 0x57, 0x5b, 0xd9, 0x94, + 0x6, 0x5d, 0x67, 0x79, 0x37, 0x33, 0x1e, 0x19, 0xf4, 0xbb, 0x37, + 0xa, 0x9a, 0xbc, 0xea, 0xb4, 0x47, 0x4c, 0x10, 0xf1, 0x77}, + {0x4c, 0x2c, 0x11, 0x55, 0xc5, 0x13, 0x51, 0xbe, 0xcd, 0x1f, 0x88, + 0x9a, 0x3a, 0x42, 0x88, 0x66, 0x47, 0x3b, 0x50, 0x5e, 0x85, 0x77, + 0x66, 0x44, 0x4a, 0x40, 0x6, 0x4a, 0x8f, 0x39, 0x34, 0xe}, + }, + { + {0x28, 0x19, 0x4b, 0x3e, 0x9, 0xb, 0x93, 0x18, 0x40, 0xf6, 0xf3, + 0x73, 0xe, 0xe1, 0xe3, 0x7d, 0x6f, 0x5d, 0x39, 0x73, 0xda, 0x17, + 0x32, 0xf4, 0x3e, 0x9c, 0x37, 0xca, 0xd6, 0xde, 0x8a, 0x6f}, + {0xe8, 0xbd, 0xce, 0x3e, 0xd9, 0x22, 0x7d, 0xb6, 0x7, 0x2f, 0x82, + 0x27, 0x41, 0xe8, 0xb3, 0x9, 0x8d, 0x6d, 0x5b, 0xb0, 0x1f, 0xa6, + 0x3f, 0x74, 0x72, 0x23, 0x36, 0x8a, 0x36, 0x5, 0x54, 0x5e}, + {0x9a, 0xb2, 0xb7, 0xfd, 0x3d, 0x12, 0x40, 0xe3, 0x91, 0xb2, 0x1a, + 0xa2, 0xe1, 0x97, 0x7b, 0x48, 0x9e, 0x94, 0xe6, 0xfd, 0x2, 0x7d, + 0x96, 0xf9, 0x97, 0xde, 0xd3, 0xc8, 0x2e, 0xe7, 0xd, 0x78}, + }, + { + {0x72, 0x27, 0xf4, 0x0, 0xf3, 0xea, 0x1f, 0x67, 0xaa, 0x41, 0x8c, + 0x2a, 0x2a, 0xeb, 0x72, 0x8f, 0x92, 0x32, 0x37, 0x97, 0xd7, 0x7f, + 0xa1, 0x29, 0xa6, 0x87, 0xb5, 0x32, 0xad, 0xc6, 0xef, 0x1d}, + {0xbc, 0xe7, 0x9a, 0x8, 0x45, 0x85, 0xe2, 0xa, 0x6, 0x4d, 0x7f, + 0x1c, 0xcf, 0xde, 0x8d, 0x38, 0xb8, 0x11, 0x48, 0xa, 0x51, 0x15, + 0xac, 0x38, 0xe4, 0x8c, 0x92, 0x71, 0xf6, 0x8b, 0xb2, 0xe}, + {0xa7, 0x95, 0x51, 0xef, 0x1a, 0xbe, 0x5b, 0xaf, 0xed, 0x15, 0x7b, + 0x91, 0x77, 0x12, 
0x8c, 0x14, 0x2e, 0xda, 0xe5, 0x7a, 0xfb, 0xf7, + 0x91, 0x29, 0x67, 0x28, 0xdd, 0xf8, 0x1b, 0x20, 0x7d, 0x46}, }, + }, + { { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 561605375422540, 1071733543815037, 131496498800990, - 1946868434569999, 828138133964203 -#else - 15200332, 8368572, 19679101, 15970074, 35236190, 1959450, - 24611599, 29010600, 55362987, 12340219 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1548495173745801, 442310529226540, 998072547000384, - 553054358385281, 644824326376171 -#else - 12876937, 23074376, 33134380, 6590940, 60801088, 14872439, - 9613953, 8241152, 15370987, 9608631 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1445526537029440, 2225519789662536, 914628859347385, - 1064754194555068, 1660295614401091 -#else - 62965568, 21540023, 8446280, 33162829, 4407737, 13629032, - 59383996, 15866073, 38898243, 24740332 -#endif - }}, + {0xa9, 0xe7, 0x7a, 0x56, 0xbd, 0xf4, 0x1e, 0xbc, 0xbd, 0x98, 0x44, + 0xd6, 0xb2, 0x4c, 0x62, 0x3f, 0xc8, 0x4e, 0x1f, 0x2c, 0xd2, 0x64, + 0x10, 0xe4, 0x1, 0x40, 0x38, 0xba, 0xa5, 0xc5, 0xf9, 0x2e}, + {0xad, 0x4f, 0xef, 0x74, 0x9a, 0x91, 0xfe, 0x95, 0xa2, 0x8, 0xa3, + 0xf6, 0xec, 0x7b, 0x82, 0x3a, 0x1, 0x7b, 0xa4, 0x9, 0xd3, 0x1, + 0x4e, 0x96, 0x97, 0xc7, 0xa3, 0x5b, 0x4f, 0x3c, 0xc4, 0x71}, + {0xcd, 0x74, 0x9e, 0xfa, 0xf6, 0x6d, 0xfd, 0xb6, 0x7a, 0x26, 0xaf, + 0xe4, 0xbc, 0x78, 0x82, 0xf1, 0xe, 0x99, 0xef, 0xf1, 0xd0, 0xb3, + 0x55, 0x82, 0x93, 0xf2, 0xc5, 0x90, 0xa3, 0x8c, 0x75, 0x5a}, + }, + { + {0x94, 0xdc, 0x61, 0x1d, 0x8b, 0x91, 0xe0, 0x8c, 0x66, 0x30, 0x81, + 0x9a, 0x46, 0x36, 0xed, 0x8d, 0xd3, 0xaa, 0xe8, 0xaf, 0x29, 0xa8, + 0xe6, 0xd4, 0x3f, 0xd4, 0x39, 0xf6, 0x27, 0x80, 0x73, 0xa}, + {0x95, 0x24, 0x46, 0xd9, 0x10, 0x27, 0xb7, 0xa2, 0x3, 0x50, 0x7d, + 0xd5, 0xd2, 0xc6, 0xa8, 0x3a, 0xca, 0x87, 0xb4, 0xa0, 0xbf, 0x0, + 0xd4, 0xe3, 0xec, 0x72, 0xeb, 0xb3, 0x44, 0xe2, 0xba, 0x2d}, + {0xcc, 0xe1, 0xff, 0x57, 0x2f, 0x4a, 0xf, 0x98, 0x43, 0x98, 0x83, + 0xe1, 0xd, 0xd, 0x67, 0x0, 
0xfd, 0x15, 0xfb, 0x49, 0x4a, 0x3f, + 0x5c, 0x10, 0x9c, 0xa6, 0x26, 0x51, 0x63, 0xca, 0x98, 0x26}, + }, + { + {0xe, 0xd9, 0x3d, 0x5e, 0x2f, 0x70, 0x3d, 0x2e, 0x86, 0x53, 0xd2, + 0xe4, 0x18, 0x9, 0x3f, 0x9e, 0x6a, 0xa9, 0x4d, 0x2, 0xf6, 0x3e, + 0x77, 0x5e, 0x32, 0x33, 0xfa, 0x4a, 0xc, 0x4b, 0x0, 0x3c}, + {0x78, 0xba, 0xb0, 0x32, 0x88, 0x31, 0x65, 0xe7, 0x8b, 0xff, 0x5c, + 0x92, 0xf7, 0x31, 0x18, 0x38, 0xcc, 0x1f, 0x29, 0xa0, 0x91, 0x1b, + 0xa8, 0x8, 0x7, 0xeb, 0xca, 0x49, 0xcc, 0x3d, 0xb4, 0x1f}, + {0x2b, 0xb8, 0xf4, 0x6, 0xac, 0x46, 0xa9, 0x9a, 0xf3, 0xc4, 0x6, + 0xa8, 0xa5, 0x84, 0xa2, 0x1c, 0x87, 0x47, 0xcd, 0xc6, 0x5f, 0x26, + 0xd3, 0x3e, 0x17, 0xd2, 0x1f, 0xcd, 0x1, 0xfd, 0x43, 0x6b}, + }, + { + {0xf3, 0xe, 0x76, 0x3e, 0x58, 0x42, 0xc7, 0xb5, 0x90, 0xb9, 0xa, + 0xee, 0xb9, 0x52, 0xdc, 0x75, 0x3f, 0x92, 0x2b, 0x7, 0xc2, 0x27, + 0x14, 0xbf, 0xf0, 0xd9, 0xf0, 0x6f, 0x2d, 0xb, 0x42, 0x73}, + {0x44, 0xc5, 0x97, 0x46, 0x4b, 0x5d, 0xa7, 0xc7, 0xbf, 0xff, 0xf, + 0xdf, 0x48, 0xf8, 0xfd, 0x15, 0x5a, 0x78, 0x46, 0xaa, 0xeb, 0xb9, + 0x68, 0x28, 0x14, 0xf7, 0x52, 0x5b, 0x10, 0xd7, 0x68, 0x5a}, + {0x6, 0x1e, 0x85, 0x9e, 0xcb, 0xf6, 0x2c, 0xaf, 0xc4, 0x38, 0x22, + 0xc6, 0x13, 0x39, 0x59, 0x8f, 0x73, 0xf3, 0xfb, 0x99, 0x96, 0xb8, + 0x8a, 0xda, 0x9e, 0xbc, 0x34, 0xea, 0x2f, 0x63, 0xb5, 0x3d}, + }, + { + {0xd5, 0x25, 0x98, 0x82, 0xb1, 0x90, 0x49, 0x2e, 0x91, 0x89, 0x9a, + 0x3e, 0x87, 0xeb, 0xea, 0xed, 0xf8, 0x4a, 0x70, 0x4c, 0x39, 0x3d, + 0xf0, 0xee, 0xe, 0x2b, 0xdf, 0x95, 0xa4, 0x7e, 0x19, 0x59}, + {0xd8, 0xd9, 0x5d, 0xf7, 0x2b, 0xee, 0x6e, 0xf4, 0xa5, 0x59, 0x67, + 0x39, 0xf6, 0xb1, 0x17, 0xd, 0x73, 0x72, 0x9e, 0x49, 0x31, 0xd1, + 0xf2, 0x1b, 0x13, 0x5f, 0xd7, 0x49, 0xdf, 0x1a, 0x32, 0x4}, + {0xae, 0x5a, 0xe5, 0xe4, 0x19, 0x60, 0xe1, 0x4, 0xe9, 0x92, 0x2f, + 0x7e, 0x7a, 0x43, 0x7b, 0xe7, 0xa4, 0x9a, 0x15, 0x6f, 0xc1, 0x2d, + 0xce, 0xc7, 0xc0, 0xc, 0xd7, 0xf4, 0xc1, 0xfd, 0xea, 0x45}, + }, + { + {0xed, 0xb1, 0xcc, 0xcf, 0x24, 0x46, 0xe, 0xb6, 0x95, 0x3, 0x5c, + 0xbd, 0x92, 
0xc2, 0xdb, 0x59, 0xc9, 0x81, 0x4, 0xdc, 0x1d, 0x9d, + 0xa0, 0x31, 0x40, 0xd9, 0x56, 0x5d, 0xea, 0xce, 0x73, 0x3f}, + {0x2b, 0xd7, 0x45, 0x80, 0x85, 0x1, 0x84, 0x69, 0x51, 0x6, 0x2f, + 0xcf, 0xa2, 0xfa, 0x22, 0x4c, 0xc6, 0x2d, 0x22, 0x6b, 0x65, 0x36, + 0x1a, 0x94, 0xde, 0xda, 0x62, 0x3, 0xc8, 0xeb, 0x5e, 0x5a}, + {0xc6, 0x8d, 0x4e, 0xa, 0xd1, 0xbf, 0xa7, 0xb7, 0x39, 0xb3, 0xc9, + 0x44, 0x7e, 0x0, 0x57, 0xbe, 0xfa, 0xae, 0x57, 0x15, 0x7f, 0x20, + 0xc1, 0x60, 0xdb, 0x18, 0x62, 0x26, 0x91, 0x88, 0x5, 0x26}, + }, + { + {0x42, 0xe5, 0x76, 0xc6, 0x3c, 0x8e, 0x81, 0x4c, 0xad, 0xcc, 0xce, + 0x3, 0x93, 0x2c, 0x42, 0x5e, 0x8, 0x9f, 0x12, 0xb4, 0xca, 0xcc, + 0x7, 0xec, 0xb8, 0x43, 0x44, 0xb2, 0x10, 0xfa, 0xed, 0xd}, + {0x4, 0xff, 0x60, 0x83, 0xa6, 0x4, 0xf7, 0x59, 0xf4, 0xe6, 0x61, + 0x76, 0xde, 0x3f, 0xd9, 0xc3, 0x51, 0x35, 0x87, 0x12, 0x73, 0x2a, + 0x1b, 0x83, 0x57, 0x5d, 0x61, 0x4e, 0x2e, 0xc, 0xad, 0x54}, + {0x2a, 0x52, 0x2b, 0xb8, 0xd5, 0x67, 0x3b, 0xee, 0xeb, 0xc1, 0xa5, + 0x9f, 0x46, 0x63, 0xf1, 0x36, 0xd3, 0x9f, 0xc1, 0x6e, 0xf2, 0xd2, + 0xb4, 0xa5, 0x8, 0x94, 0x7a, 0xa7, 0xba, 0xb2, 0xec, 0x62}, + }, + { + {0x74, 0x28, 0xb6, 0xaf, 0x36, 0x28, 0x7, 0x92, 0xa5, 0x4, 0xe1, + 0x79, 0x85, 0x5e, 0xcd, 0x5f, 0x4a, 0xa1, 0x30, 0xc6, 0xad, 0x1, + 0xad, 0x5a, 0x98, 0x3f, 0x66, 0x75, 0x50, 0x3d, 0x91, 0x61}, + {0x3d, 0x2b, 0x15, 0x61, 0x52, 0x79, 0xed, 0xe5, 0xd1, 0xd7, 0xdd, + 0xe, 0x7d, 0x35, 0x62, 0x49, 0x71, 0x4c, 0x6b, 0xb9, 0xd0, 0xc8, + 0x82, 0x74, 0xbe, 0xd8, 0x66, 0xa9, 0x19, 0xf9, 0x59, 0x2e}, + {0xda, 0x31, 0x32, 0x1a, 0x36, 0x2d, 0xc6, 0xd, 0x70, 0x2, 0x20, + 0x94, 0x32, 0x58, 0x47, 0xfa, 0xce, 0x94, 0x95, 0x3f, 0x51, 0x1, + 0xd8, 0x2, 0x5c, 0x5d, 0xc0, 0x31, 0xa1, 0xc2, 0xdb, 0x3d}, }, + }, + { { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1199690223111956, 24028135822341, 66638289244341, - 57626156285975, 565093967979607 -#else - 26660628, 17876777, 8393733, 358047, 59707573, 992987, 43204631, - 858696, 20571223, 8420556 -#endif - }}, - {{ -#if 
defined(BORINGSSL_CURVE25519_64BIT) - 876926774220824, 554618976488214, 1012056309841565, - 839961821554611, 1414499340307677 -#else - 14620696, 13067227, 51661590, 8264466, 14106269, 15080814, - 33531827, 12516406, 45534429, 21077682 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 703047626104145, 1266841406201770, 165556500219173, - 486991595001879, 1011325891650656 -#else - 236881, 10476226, 57258, 18877408, 6472997, 2466984, 17258519, - 7256740, 8791136, 15069930 -#endif - }}, + {0x14, 0xbb, 0x96, 0x27, 0xa2, 0x57, 0xaa, 0xf3, 0x21, 0xda, 0x7, + 0x9b, 0xb7, 0xba, 0x3a, 0x88, 0x1c, 0x39, 0xa0, 0x31, 0x18, 0xe2, + 0x4b, 0xe5, 0xf9, 0x5, 0x32, 0xd8, 0x38, 0xfb, 0xe7, 0x5e}, + {0x4b, 0xc5, 0x5e, 0xce, 0xf9, 0xf, 0xdc, 0x9a, 0xd, 0x13, 0x2f, + 0x8c, 0x6b, 0x2a, 0x9c, 0x3, 0x15, 0x95, 0xf8, 0xf0, 0xc7, 0x7, + 0x80, 0x2, 0x6b, 0xb3, 0x4, 0xac, 0x14, 0x83, 0x96, 0x78}, + {0x8e, 0x6a, 0x44, 0x41, 0xcb, 0xfd, 0x8d, 0x53, 0xf9, 0x37, 0x49, + 0x43, 0xa9, 0xfd, 0xac, 0xa5, 0x78, 0x8c, 0x3c, 0x26, 0x8d, 0x90, + 0xaf, 0x46, 0x9, 0xd, 0xca, 0x9b, 0x3c, 0x63, 0xd0, 0x61}, + }, + { + {0xdf, 0x73, 0xfc, 0xf8, 0xbc, 0x28, 0xa3, 0xad, 0xfc, 0x37, 0xf0, + 0xa6, 0x5d, 0x69, 0x84, 0xee, 0x9, 0xa9, 0xc2, 0x38, 0xdb, 0xb4, + 0x7f, 0x63, 0xdc, 0x7b, 0x6, 0xf8, 0x2d, 0xac, 0x23, 0x5b}, + {0x66, 0x25, 0xdb, 0xff, 0x35, 0x49, 0x74, 0x63, 0xbb, 0x68, 0xb, + 0x78, 0x89, 0x6b, 0xbd, 0xc5, 0x3, 0xec, 0x3e, 0x55, 0x80, 0x32, + 0x1b, 0x6f, 0xf5, 0xd7, 0xae, 0x47, 0xd8, 0x5f, 0x96, 0x6e}, + {0x7b, 0x52, 0x80, 0xee, 0x53, 0xb9, 0xd2, 0x9a, 0x8d, 0x6d, 0xde, + 0xfa, 0xaa, 0x19, 0x8f, 0xe8, 0xcf, 0x82, 0xe, 0x15, 0x4, 0x17, + 0x71, 0xe, 0xdc, 0xde, 0x95, 0xdd, 0xb9, 0xbb, 0xb9, 0x79}, + }, + { + {0x74, 0x73, 0x9f, 0x8e, 0xae, 0x7d, 0x99, 0xd1, 0x16, 0x8, 0xbb, + 0xcf, 0xf8, 0xa2, 0x32, 0xa0, 0xa, 0x5f, 0x44, 0x6d, 0x12, 0xba, + 0x6c, 0xcd, 0x34, 0xb8, 0xcc, 0xa, 0x46, 0x11, 0xa8, 0x1b}, + {0xc2, 0x26, 0x31, 0x6a, 0x40, 0x55, 0xb3, 0xeb, 0x93, 0xc3, 0xc8, + 0x68, 0xa8, 0x83, 0x63, 0xd2, 
0x82, 0x7a, 0xb9, 0xe5, 0x29, 0x64, + 0xc, 0x6c, 0x47, 0x21, 0xfd, 0xc9, 0x58, 0xf1, 0x65, 0x50}, + {0x54, 0x99, 0x42, 0xc, 0xfb, 0x69, 0x81, 0x70, 0x67, 0xcf, 0x6e, + 0xd7, 0xac, 0x0, 0x46, 0xe1, 0xba, 0x45, 0xe6, 0x70, 0x8a, 0xb9, + 0xaa, 0x2e, 0xf2, 0xfa, 0xa4, 0x58, 0x9e, 0xf3, 0x81, 0x39}, + }, + { + {0xde, 0x6f, 0xe6, 0x6d, 0xa5, 0xdf, 0x45, 0xc8, 0x3a, 0x48, 0x40, + 0x2c, 0x0, 0xa5, 0x52, 0xe1, 0x32, 0xf6, 0xb4, 0xc7, 0x63, 0xe1, + 0xd2, 0xe9, 0x65, 0x1b, 0xbc, 0xdc, 0x2e, 0x45, 0xf4, 0x30}, + {0x93, 0xa, 0x23, 0x59, 0x75, 0x8a, 0xfb, 0x18, 0x5d, 0xf4, 0xe6, + 0x60, 0x69, 0x8f, 0x16, 0x1d, 0xb5, 0x3c, 0xa9, 0x14, 0x45, 0xa9, + 0x85, 0x3a, 0xfd, 0xd0, 0xac, 0x5, 0x37, 0x8, 0xdc, 0x38}, + {0x40, 0x97, 0x75, 0xc5, 0x82, 0x27, 0x6d, 0x85, 0xcc, 0xbe, 0x9c, + 0xf9, 0x69, 0x45, 0x13, 0xfa, 0x71, 0x4e, 0xea, 0xc0, 0x73, 0xfc, + 0x44, 0x88, 0x69, 0x24, 0x3f, 0x59, 0x1a, 0x9a, 0x2d, 0x63}, + }, + { + {0xa7, 0x84, 0xc, 0xed, 0x11, 0xfd, 0x9, 0xbf, 0x3a, 0x69, 0x9f, + 0xd, 0x81, 0x71, 0xf0, 0x63, 0x79, 0x87, 0xcf, 0x57, 0x2d, 0x8c, + 0x90, 0x21, 0xa2, 0x4b, 0xf6, 0x8a, 0xf2, 0x7d, 0x5a, 0x3a}, + {0xa6, 0xcb, 0x7, 0xb8, 0x15, 0x6b, 0xbb, 0xf6, 0xd7, 0xf0, 0x54, + 0xbc, 0xdf, 0xc7, 0x23, 0x18, 0xb, 0x67, 0x29, 0x6e, 0x3, 0x97, + 0x1d, 0xbb, 0x57, 0x4a, 0xed, 0x47, 0x88, 0xf4, 0x24, 0xb}, + {0xc7, 0xea, 0x1b, 0x51, 0xbe, 0xd4, 0xda, 0xdc, 0xf2, 0xcc, 0x26, + 0xed, 0x75, 0x80, 0x53, 0xa4, 0x65, 0x9a, 0x5f, 0x0, 0x9f, 0xff, + 0x9c, 0xe1, 0x63, 0x1f, 0x48, 0x75, 0x44, 0xf7, 0xfc, 0x34}, + }, + { + {0x98, 0xaa, 0xcf, 0x78, 0xab, 0x1d, 0xbb, 0xa5, 0xf2, 0x72, 0xb, + 0x19, 0x67, 0xa2, 0xed, 0x5c, 0x8e, 0x60, 0x92, 0xa, 0x11, 0xc9, + 0x9, 0x93, 0xb0, 0x74, 0xb3, 0x2f, 0x4, 0xa3, 0x19, 0x1}, + {0xca, 0x67, 0x97, 0x78, 0x4c, 0xe0, 0x97, 0xc1, 0x7d, 0x46, 0xd9, + 0x38, 0xcb, 0x4d, 0x71, 0xb8, 0xa8, 0x5f, 0xf9, 0x83, 0x82, 0x88, + 0xde, 0x55, 0xf7, 0x63, 0xfa, 0x4d, 0x16, 0xdc, 0x3b, 0x3d}, + {0x7d, 0x17, 0xc2, 0xe8, 0x9c, 0xd8, 0xa2, 0x67, 0xc1, 0xd0, 0x95, + 0x68, 0xf6, 0xa5, 
0x9d, 0x66, 0xb0, 0xa2, 0x82, 0xb2, 0xe5, 0x98, + 0x65, 0xf5, 0x73, 0xa, 0xe2, 0xed, 0xf1, 0x88, 0xc0, 0x56}, + }, + { + {0x2, 0x8f, 0xf3, 0x24, 0xac, 0x5f, 0x1b, 0x58, 0xbd, 0xc, 0xe3, + 0xba, 0xfe, 0xe9, 0xb, 0xa9, 0xf0, 0x92, 0xcf, 0x8a, 0x2, 0x69, + 0x21, 0x9a, 0x8f, 0x3, 0x59, 0x83, 0xa4, 0x7e, 0x8b, 0x3}, + {0x17, 0x6e, 0xa8, 0x10, 0x11, 0x3d, 0x6d, 0x33, 0xfa, 0xb2, 0x75, + 0xb, 0x32, 0x88, 0xf3, 0xd7, 0x88, 0x29, 0x7, 0x25, 0x76, 0x33, + 0x15, 0xf9, 0x87, 0x8b, 0x10, 0x99, 0x6b, 0x4c, 0x67, 0x9}, + {0xf8, 0x6f, 0x31, 0x99, 0x21, 0xf8, 0x4e, 0x9f, 0x4f, 0x8d, 0xa7, + 0xea, 0x82, 0xd2, 0x49, 0x2f, 0x74, 0x31, 0xef, 0x5a, 0xab, 0xa5, + 0x71, 0x9, 0x65, 0xeb, 0x69, 0x59, 0x2, 0x31, 0x5e, 0x6e}, + }, + { + {0x22, 0x62, 0x6, 0x63, 0xe, 0xfb, 0x4, 0x33, 0x3f, 0xba, 0xac, + 0x87, 0x89, 0x6, 0x35, 0xfb, 0xa3, 0x61, 0x10, 0x8c, 0x77, 0x24, + 0x19, 0xbd, 0x20, 0x86, 0x83, 0xd1, 0x43, 0xad, 0x58, 0x30}, + {0xfb, 0x93, 0xe5, 0x87, 0xf5, 0x62, 0x6c, 0xb1, 0x71, 0x3e, 0x5d, + 0xca, 0xde, 0xed, 0x99, 0x49, 0x6d, 0x3e, 0xcc, 0x14, 0xe0, 0xc1, + 0x91, 0xb4, 0xa8, 0xdb, 0xa8, 0x89, 0x47, 0x11, 0xf5, 0x8}, + {0xd0, 0x63, 0x76, 0xe5, 0xfd, 0xf, 0x3c, 0x32, 0x10, 0xa6, 0x2e, + 0xa2, 0x38, 0xdf, 0xc3, 0x5, 0x9a, 0x4f, 0x99, 0xac, 0xbd, 0x8a, + 0xc7, 0xbd, 0x99, 0xdc, 0xe3, 0xef, 0xa4, 0x9f, 0x54, 0x26}, }, + }, + { { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1622861044480487, 1156394801573634, 1869132565415504, - 327103985777730, 2095342781472284 -#else - 1276391, 24182514, 22949634, 17231625, 43615824, 27852245, - 14711874, 4874229, 36445724, 31223040 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 334886927423922, 489511099221528, 129160865966726, - 1720809113143481, 619700195649254 -#else - 5855666, 4990204, 53397016, 7294283, 59304582, 1924646, - 65685689, 25642053, 34039526, 9234252 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1646545795166119, 1758370782583567, 714746174550637, - 1472693650165135, 898994790308209 -#else - 20590503, 
24535444, 31529743, 26201766, 64402029, 10650547, - 31559055, 21944845, 18979185, 13396066 -#endif - }}, + {0x6e, 0x66, 0x3f, 0xaf, 0x49, 0x85, 0x46, 0xdb, 0xa5, 0xe, 0x4a, + 0xf1, 0x4, 0xcf, 0x7f, 0xd7, 0x47, 0xc, 0xba, 0xa4, 0xf7, 0x3f, + 0xf2, 0x3d, 0x85, 0x3c, 0xce, 0x32, 0xe1, 0xdf, 0x10, 0x3a}, + {0xd6, 0xf9, 0x6b, 0x1e, 0x46, 0x5a, 0x1d, 0x74, 0x81, 0xa5, 0x77, + 0x77, 0xfc, 0xb3, 0x5, 0x23, 0xd9, 0xd3, 0x74, 0x64, 0xa2, 0x74, + 0x55, 0xd4, 0xff, 0xe0, 0x1, 0x64, 0xdc, 0xe1, 0x26, 0x19}, + {0xa0, 0xce, 0x17, 0xea, 0x8a, 0x4e, 0x7f, 0xe0, 0xfd, 0xc1, 0x1f, + 0x3a, 0x46, 0x15, 0xd5, 0x2f, 0xf1, 0xc0, 0xf2, 0x31, 0xfd, 0x22, + 0x53, 0x17, 0x15, 0x5d, 0x1e, 0x86, 0x1d, 0xd0, 0xa1, 0x1f}, + }, + { + {0xab, 0x94, 0xdf, 0xd1, 0x0, 0xac, 0xdc, 0x38, 0xe9, 0xd, 0x8, + 0xd1, 0xdd, 0x2b, 0x71, 0x2e, 0x62, 0xe2, 0xd5, 0xfd, 0x3e, 0xe9, + 0x13, 0x7f, 0xe5, 0x1, 0x9a, 0xee, 0x18, 0xed, 0xfc, 0x73}, + {0x32, 0x98, 0x59, 0x7d, 0x94, 0x55, 0x80, 0xcc, 0x20, 0x55, 0xf1, + 0x37, 0xda, 0x56, 0x46, 0x1e, 0x20, 0x93, 0x5, 0x4e, 0x74, 0xf7, + 0xf6, 0x99, 0x33, 0xcf, 0x75, 0x6a, 0xbc, 0x63, 0x35, 0x77}, + {0xb3, 0x9c, 0x13, 0x63, 0x8, 0xe9, 0xb1, 0x6, 0xcd, 0x3e, 0xa0, + 0xc5, 0x67, 0xda, 0x93, 0xa4, 0x32, 0x89, 0x63, 0xad, 0xc8, 0xce, + 0x77, 0x8d, 0x44, 0x4f, 0x86, 0x1b, 0x70, 0x6b, 0x42, 0x1f}, + }, + { + {0x52, 0x25, 0xa1, 0x91, 0xc8, 0x35, 0x7e, 0xf1, 0x76, 0x9c, 0x5e, + 0x57, 0x53, 0x81, 0x6b, 0xb7, 0x3e, 0x72, 0x9b, 0xd, 0x6f, 0x40, + 0x83, 0xfa, 0x38, 0xe4, 0xa7, 0x3f, 0x1b, 0xbb, 0x76, 0xb}, + {0x1, 0x1c, 0x91, 0x41, 0x4c, 0x26, 0xc9, 0xef, 0x25, 0x2c, 0xa2, + 0x17, 0xb8, 0xb7, 0xa3, 0xf1, 0x47, 0x14, 0xf, 0xf3, 0x6b, 0xda, + 0x75, 0x58, 0x90, 0xb0, 0x31, 0x1d, 0x27, 0xf5, 0x1a, 0x4e}, + {0x9b, 0x93, 0x92, 0x7f, 0xf9, 0xc1, 0xb8, 0x8, 0x6e, 0xab, 0x44, + 0xd4, 0xcb, 0x71, 0x67, 0xbe, 0x17, 0x80, 0xbb, 0x99, 0x63, 0x64, + 0xe5, 0x22, 0x55, 0xa9, 0x72, 0xb7, 0x1e, 0xd6, 0x6d, 0x7b}, + }, + { + {0xc7, 0xd2, 0x1, 0xab, 0xf9, 0xab, 0x30, 0x57, 0x18, 0x3b, 0x14, + 0x40, 0xdc, 
0x76, 0xfb, 0x16, 0x81, 0xb2, 0xcb, 0xa0, 0x65, 0xbe, + 0x6c, 0x86, 0xfe, 0x6a, 0xff, 0x9b, 0x65, 0x9b, 0xfa, 0x53}, + {0x92, 0x3d, 0xf3, 0x50, 0xe8, 0xc1, 0xad, 0xb7, 0xcf, 0xd5, 0x8c, + 0x60, 0x4f, 0xfa, 0x98, 0x79, 0xdb, 0x5b, 0xfc, 0x8d, 0xbd, 0x2d, + 0x96, 0xad, 0x4f, 0x2f, 0x1d, 0xaf, 0xce, 0x9b, 0x3e, 0x70}, + {0x55, 0x54, 0x88, 0x94, 0xe9, 0xc8, 0x14, 0x6c, 0xe5, 0xd4, 0xae, + 0x65, 0x66, 0x5d, 0x3a, 0x84, 0xf1, 0x5a, 0xd6, 0xbc, 0x3e, 0xb7, + 0x1b, 0x18, 0x50, 0x1f, 0xc6, 0xc4, 0xe5, 0x93, 0x8d, 0x39}, + }, + { + {0xf2, 0xe3, 0xe7, 0xd2, 0x60, 0x7c, 0x87, 0xc3, 0xb1, 0x8b, 0x82, + 0x30, 0xa0, 0xaa, 0x34, 0x3b, 0x38, 0xf1, 0x9e, 0x73, 0xe7, 0x26, + 0x3e, 0x28, 0x77, 0x5, 0xc3, 0x2, 0x90, 0x9c, 0x9c, 0x69}, + {0xf3, 0x48, 0xe2, 0x33, 0x67, 0xd1, 0x4b, 0x1c, 0x5f, 0xa, 0xbf, + 0x15, 0x87, 0x12, 0x9e, 0xbd, 0x76, 0x3, 0xb, 0xa1, 0xf0, 0x8c, + 0x3f, 0xd4, 0x13, 0x1b, 0x19, 0xdf, 0x5d, 0x9b, 0xb0, 0x53}, + {0xcc, 0xf1, 0x46, 0x59, 0x23, 0xa7, 0x6, 0xf3, 0x7d, 0xd9, 0xe5, + 0xcc, 0xb5, 0x18, 0x17, 0x92, 0x75, 0xe9, 0xb4, 0x81, 0x47, 0xd2, + 0xcd, 0x28, 0x7, 0xd9, 0xcd, 0x6f, 0xc, 0xf3, 0xca, 0x51}, + }, + { + {0xc7, 0x54, 0xac, 0x18, 0x9a, 0xf9, 0x7a, 0x73, 0xf, 0xb3, 0x1c, + 0xc5, 0xdc, 0x78, 0x33, 0x90, 0xc7, 0xc, 0xe1, 0x4c, 0x33, 0xbc, + 0x89, 0x2b, 0x9a, 0xe9, 0xf8, 0x89, 0xc1, 0x29, 0xae, 0x12}, + {0xa, 0xe0, 0x74, 0x76, 0x42, 0xa7, 0xb, 0xa6, 0xf3, 0x7b, 0x7a, + 0xa1, 0x70, 0x85, 0xe, 0x63, 0xcc, 0x24, 0x33, 0xcf, 0x3d, 0x56, + 0x58, 0x37, 0xaa, 0xfd, 0x83, 0x23, 0x29, 0xaa, 0x4, 0x55}, + {0xcf, 0x1, 0xd, 0x1f, 0xcb, 0xc0, 0x9e, 0xa9, 0xae, 0xf7, 0x34, + 0x3a, 0xcc, 0xef, 0xd1, 0xd, 0x22, 0x4e, 0x9c, 0xd0, 0x21, 0x75, + 0xca, 0x55, 0xea, 0xa5, 0xeb, 0x58, 0xe9, 0x4f, 0xd1, 0x5f}, + }, + { + {0x8e, 0xcb, 0x93, 0xbf, 0x5e, 0xfe, 0x42, 0x3c, 0x5f, 0x56, 0xd4, + 0x36, 0x51, 0xa8, 0xdf, 0xbe, 0xe8, 0x20, 0x42, 0x88, 0x9e, 0x85, + 0xf0, 0xe0, 0x28, 0xd1, 0x25, 0x7, 0x96, 0x3f, 0xd7, 0x7d}, + {0x2c, 0xab, 0x45, 0x28, 0xdf, 0x2d, 0xdc, 0xb5, 0x93, 0xe9, 0x7f, + 
0xa, 0xb1, 0x91, 0x94, 0x6, 0x46, 0xe3, 0x2, 0x40, 0xd6, 0xf3, + 0xaa, 0x4d, 0xd1, 0x74, 0x64, 0x58, 0x6e, 0xf2, 0x3f, 0x9}, + {0x29, 0x98, 0x5, 0x68, 0xfe, 0x24, 0xd, 0xb1, 0xe5, 0x23, 0xaf, + 0xdb, 0x72, 0x6, 0x73, 0x75, 0x29, 0xac, 0x57, 0xb4, 0x3a, 0x25, + 0x67, 0x13, 0xa4, 0x70, 0xb4, 0x86, 0xbc, 0xbc, 0x59, 0x2f}, + }, + { + {0x1, 0xc3, 0x91, 0xb6, 0x60, 0xd5, 0x41, 0x70, 0x1e, 0xe7, 0xd7, + 0xad, 0x3f, 0x1b, 0x20, 0x85, 0x85, 0x55, 0x33, 0x11, 0x63, 0xe1, + 0xc2, 0x16, 0xb1, 0x28, 0x8, 0x1, 0x3d, 0x5e, 0xa5, 0x2a}, + {0x5f, 0x13, 0x17, 0x99, 0x42, 0x7d, 0x84, 0x83, 0xd7, 0x3, 0x7d, + 0x56, 0x1f, 0x91, 0x1b, 0xad, 0xd1, 0xaa, 0x77, 0xbe, 0xd9, 0x48, + 0x77, 0x7e, 0x4a, 0xaf, 0x51, 0x2e, 0x2e, 0xb4, 0x58, 0x54}, + {0x4f, 0x44, 0x7, 0xc, 0xe6, 0x92, 0x51, 0xed, 0x10, 0x1d, 0x42, + 0x74, 0x2d, 0x4e, 0xc5, 0x42, 0x64, 0xc8, 0xb5, 0xfd, 0x82, 0x4c, + 0x2b, 0x35, 0x64, 0x86, 0x76, 0x8a, 0x4a, 0x0, 0xe9, 0x13}, }, + }, + { { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 333403773039279, 295772542452938, 1693106465353610, - 912330357530760, 471235657950362 -#else - 24474287, 4968103, 22267082, 4407354, 24063882, 25229252, - 48291976, 13594781, 33514650, 7021958 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1811196219982022, 1068969825533602, 289602974833439, - 1988956043611592, 863562343398367 -#else - 55541958, 26988926, 45743778, 15928891, 40950559, 4315420, - 41160136, 29637754, 45628383, 12868081 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 906282429780072, 2108672665779781, 432396390473936, - 150625823801893, 1708930497638539 -#else - 38473832, 13504660, 19988037, 31421671, 21078224, 6443208, - 45662757, 2244499, 54653067, 25465048 -#endif - }}, + {0x7f, 0x87, 0x3b, 0x19, 0xc9, 0x0, 0x2e, 0xbb, 0x6b, 0x50, 0xdc, + 0xe0, 0x90, 0xa8, 0xe3, 0xec, 0x9f, 0x64, 0xde, 0x36, 0xc0, 0xb7, + 0xf3, 0xec, 0x1a, 0x9e, 0xde, 0x98, 0x8, 0x4, 0x46, 0x5f}, + {0xdb, 0xce, 0x2f, 0x83, 0x45, 0x88, 0x9d, 0x73, 0x63, 0xf8, 0x6b, + 0xae, 0xc9, 0xd6, 
0x38, 0xfa, 0xf7, 0xfe, 0x4f, 0xb7, 0xca, 0xd, + 0xbc, 0x32, 0x5e, 0xe4, 0xbc, 0x14, 0x88, 0x7e, 0x93, 0x73}, + {0x8d, 0xf4, 0x7b, 0x29, 0x16, 0x71, 0x3, 0xb9, 0x34, 0x68, 0xf0, + 0xd4, 0x22, 0x3b, 0xd1, 0xa9, 0xc6, 0xbd, 0x96, 0x46, 0x57, 0x15, + 0x97, 0xe1, 0x35, 0xe8, 0xd5, 0x91, 0xe8, 0xa4, 0xf8, 0x2c}, + }, + { + {0xa2, 0x6b, 0xd0, 0x17, 0x7e, 0x48, 0xb5, 0x2c, 0x6b, 0x19, 0x50, + 0x39, 0x1c, 0x38, 0xd2, 0x24, 0x30, 0x8a, 0x97, 0x85, 0x81, 0x9c, + 0x65, 0xd7, 0xf6, 0xa4, 0xd6, 0x91, 0x28, 0x7f, 0x6f, 0x7a}, + {0x67, 0xf, 0x11, 0x7, 0x87, 0xfd, 0x93, 0x6d, 0x49, 0xb5, 0x38, + 0x7c, 0xd3, 0x9, 0x4c, 0xdd, 0x86, 0x6a, 0x73, 0xc2, 0x4c, 0x6a, + 0xb1, 0x7c, 0x9, 0x2a, 0x25, 0x58, 0x6e, 0xbd, 0x49, 0x20}, + {0x49, 0xef, 0x9a, 0x6a, 0x8d, 0xfd, 0x9, 0x7d, 0xb, 0xb9, 0x3d, + 0x5b, 0xbe, 0x60, 0xee, 0xf0, 0xd4, 0xbf, 0x9e, 0x51, 0x2c, 0xb5, + 0x21, 0x4c, 0x1d, 0x94, 0x45, 0xc5, 0xdf, 0xaa, 0x11, 0x60}, + }, + { + {0x90, 0xf8, 0xcb, 0x2, 0xc8, 0xd0, 0xde, 0x63, 0xaa, 0x6a, 0xff, + 0xd, 0xca, 0x98, 0xd0, 0xfb, 0x99, 0xed, 0xb6, 0xb9, 0xfd, 0xa, + 0x4d, 0x62, 0x1e, 0xb, 0x34, 0x79, 0xb7, 0x18, 0xce, 0x69}, + {0x3c, 0xf8, 0x95, 0xcf, 0x6d, 0x92, 0x67, 0x5f, 0x71, 0x90, 0x28, + 0x71, 0x61, 0x85, 0x7e, 0x7c, 0x5b, 0x7a, 0x8f, 0x99, 0xf3, 0xe7, + 0xa1, 0xd6, 0xe0, 0xf9, 0x62, 0xb, 0x1b, 0xcc, 0xc5, 0x6f}, + {0xcb, 0x79, 0x98, 0xb2, 0x28, 0x55, 0xef, 0xd1, 0x92, 0x90, 0x7e, + 0xd4, 0x3c, 0xae, 0x1a, 0xdd, 0x52, 0x23, 0x9f, 0x18, 0x42, 0x4, + 0x7e, 0x12, 0xf1, 0x1, 0x71, 0xe5, 0x3a, 0x6b, 0x59, 0x15}, + }, + { + {0xca, 0x24, 0x51, 0x7e, 0x16, 0x31, 0xff, 0x9, 0xdf, 0x45, 0xc7, + 0xd9, 0x8b, 0x15, 0xe4, 0xb, 0xe5, 0x56, 0xf5, 0x7e, 0x22, 0x7d, + 0x2b, 0x29, 0x38, 0xd1, 0xb6, 0xaf, 0x41, 0xe2, 0xa4, 0x3a}, + {0xa2, 0x79, 0x91, 0x3f, 0xd2, 0x39, 0x27, 0x46, 0xcf, 0xdd, 0xd6, + 0x97, 0x31, 0x12, 0x83, 0xff, 0x8a, 0x14, 0xf2, 0x53, 0xb5, 0xde, + 0x7, 0x13, 0xda, 0x4d, 0x5f, 0x7b, 0x68, 0x37, 0x22, 0xd}, + {0xf5, 0x5, 0x33, 0x2a, 0xbf, 0x38, 0xc1, 0x2c, 0xc3, 0x26, 0xe9, + 0xa2, 
0x8f, 0x3f, 0x58, 0x48, 0xeb, 0xd2, 0x49, 0x55, 0xa2, 0xb1, + 0x3a, 0x8, 0x6c, 0xa3, 0x87, 0x46, 0x6e, 0xaa, 0xfc, 0x32}, + }, + { + {0xdf, 0xcc, 0x87, 0x27, 0x73, 0xa4, 0x7, 0x32, 0xf8, 0xe3, 0x13, + 0xf2, 0x8, 0x19, 0xe3, 0x17, 0x4e, 0x96, 0xd, 0xf6, 0xd7, 0xec, + 0xb2, 0xd5, 0xe9, 0xb, 0x60, 0xc2, 0x36, 0x63, 0x6f, 0x74}, + {0xf5, 0x9a, 0x7d, 0xc5, 0x8d, 0x6e, 0xc5, 0x7b, 0xf2, 0xbd, 0xf0, + 0x9d, 0xed, 0xd2, 0xb, 0x3e, 0xa3, 0xe4, 0xef, 0x22, 0xde, 0x14, + 0xc0, 0xaa, 0x5c, 0x6a, 0xbd, 0xfe, 0xce, 0xe9, 0x27, 0x46}, + {0x1c, 0x97, 0x6c, 0xab, 0x45, 0xf3, 0x4a, 0x3f, 0x1f, 0x73, 0x43, + 0x99, 0x72, 0xeb, 0x88, 0xe2, 0x6d, 0x18, 0x44, 0x3, 0x8a, 0x6a, + 0x59, 0x33, 0x93, 0x62, 0xd6, 0x7e, 0x0, 0x17, 0x49, 0x7b}, + }, + { + {0xdd, 0xa2, 0x53, 0xdd, 0x28, 0x1b, 0x34, 0x54, 0x3f, 0xfc, 0x42, + 0xdf, 0x5b, 0x90, 0x17, 0xaa, 0xf4, 0xf8, 0xd2, 0x4d, 0xd9, 0x92, + 0xf5, 0xf, 0x7d, 0xd3, 0x8c, 0xe0, 0xf, 0x62, 0x3, 0x1d}, + {0x64, 0xb0, 0x84, 0xab, 0x5c, 0xfb, 0x85, 0x2d, 0x14, 0xbc, 0xf3, + 0x89, 0xd2, 0x10, 0x78, 0x49, 0xc, 0xce, 0x15, 0x7b, 0x44, 0xdc, + 0x6a, 0x47, 0x7b, 0xfd, 0x44, 0xf8, 0x76, 0xa3, 0x2b, 0x12}, + {0x54, 0xe5, 0xb4, 0xa2, 0xcd, 0x32, 0x2, 0xc2, 0x7f, 0x18, 0x5d, + 0x11, 0x42, 0xfd, 0xd0, 0x9e, 0xd9, 0x79, 0xd4, 0x7d, 0xbe, 0xb4, + 0xab, 0x2e, 0x4c, 0xec, 0x68, 0x2b, 0xf5, 0xb, 0xc7, 0x2}, + }, + { + {0xe1, 0x72, 0x8d, 0x45, 0xbf, 0x32, 0xe5, 0xac, 0xb5, 0x3c, 0xb7, + 0x7c, 0xe0, 0x68, 0xe7, 0x5b, 0xe7, 0xbd, 0x8b, 0xee, 0x94, 0x7d, + 0xcf, 0x56, 0x3, 0x3a, 0xb4, 0xfe, 0xe3, 0x97, 0x6, 0x6b}, + {0xbb, 0x2f, 0xb, 0x5d, 0x4b, 0xec, 0x87, 0xa2, 0xca, 0x82, 0x48, + 0x7, 0x90, 0x57, 0x5c, 0x41, 0x5c, 0x81, 0xd0, 0xc1, 0x1e, 0xa6, + 0x44, 0xe0, 0xe0, 0xf5, 0x9e, 0x40, 0xa, 0x4f, 0x33, 0x26}, + {0xc0, 0xa3, 0x62, 0xdf, 0x4a, 0xf0, 0xc8, 0xb6, 0x5d, 0xa4, 0x6d, + 0x7, 0xef, 0x0, 0xf0, 0x3e, 0xa9, 0xd2, 0xf0, 0x49, 0x58, 0xb9, + 0x9c, 0x9c, 0xae, 0x2f, 0x1b, 0x44, 0x43, 0x7f, 0xc3, 0x1c}, + }, + { + {0xb9, 0xae, 0xce, 0xc9, 0xf1, 0x56, 0x66, 0xd7, 0x6a, 0x65, 
0xe5, + 0x18, 0xf8, 0x15, 0x5b, 0x1c, 0x34, 0x23, 0x4c, 0x84, 0x32, 0x28, + 0xe7, 0x26, 0x38, 0x68, 0x19, 0x2f, 0x77, 0x6f, 0x34, 0x3a}, + {0x4f, 0x32, 0xc7, 0x5c, 0x5a, 0x56, 0x8f, 0x50, 0x22, 0xa9, 0x6, + 0xe5, 0xc0, 0xc4, 0x61, 0xd0, 0x19, 0xac, 0x45, 0x5c, 0xdb, 0xab, + 0x18, 0xfb, 0x4a, 0x31, 0x80, 0x3, 0xc1, 0x9, 0x68, 0x6c}, + {0xc8, 0x6a, 0xda, 0xe2, 0x12, 0x51, 0xd5, 0xd2, 0xed, 0x51, 0xe8, + 0xb1, 0x31, 0x3, 0xbd, 0xe9, 0x62, 0x72, 0xc6, 0x8e, 0xdd, 0x46, + 0x7, 0x96, 0xd0, 0xc5, 0xf7, 0x6e, 0x9f, 0x1b, 0x91, 0x5}, }, + }, + { { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 925664675702328, 21416848568684, 1831436641861340, - 601157008940113, 371818055044496 -#else - 36513336, 13793478, 61256044, 319135, 41385692, 27290532, - 33086545, 8957937, 51875216, 5540520 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1479786007267725, 1738881859066675, 68646196476567, - 2146507056100328, 1247662817535471 -#else - 55478669, 22050529, 58989363, 25911358, 2620055, 1022908, - 43398120, 31985447, 50980335, 18591624 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 52035296774456, 939969390708103, 312023458773250, - 59873523517659, 1231345905848899 -#else - 23152952, 775386, 27395463, 14006635, 57407746, 4649511, - 1689819, 892185, 55595587, 18348483 -#endif - }}, + {0xef, 0xea, 0x2e, 0x51, 0xf3, 0xac, 0x49, 0x53, 0x49, 0xcb, 0xc1, + 0x1c, 0xd3, 0x41, 0xc1, 0x20, 0x8d, 0x68, 0x9a, 0xa9, 0x7, 0xc, + 0x18, 0x24, 0x17, 0x2d, 0x4b, 0xc6, 0xd1, 0xf9, 0x5e, 0x55}, + {0xbb, 0xe, 0xdf, 0xf5, 0x83, 0x99, 0x33, 0xc1, 0xac, 0x4c, 0x2c, + 0x51, 0x8f, 0x75, 0xf3, 0xc0, 0xe1, 0x98, 0xb3, 0xb, 0xa, 0x13, + 0xf1, 0x2c, 0x62, 0xc, 0x27, 0xaa, 0xf9, 0xec, 0x3c, 0x6b}, + {0x8, 0xbd, 0x73, 0x3b, 0xba, 0x70, 0xa7, 0x36, 0xc, 0xbf, 0xaf, + 0xa3, 0x8, 0xef, 0x4a, 0x62, 0xf2, 0x46, 0x9, 0xb4, 0x98, 0xff, + 0x37, 0x57, 0x9d, 0x74, 0x81, 0x33, 0xe1, 0x4d, 0x5f, 0x67}, + }, + { + {0x1d, 0xb3, 0xda, 0x3b, 0xd9, 0xf6, 0x2f, 0xa1, 0xfe, 0x2d, 0x65, + 0x9d, 0xf, 0xd8, 
0x25, 0x7, 0x87, 0x94, 0xbe, 0x9a, 0xf3, 0x4f, + 0x9c, 0x1, 0x43, 0x3c, 0xcd, 0x82, 0xb8, 0x50, 0xf4, 0x60}, + {0xfc, 0x82, 0x17, 0x6b, 0x3, 0x52, 0x2c, 0xe, 0xb4, 0x83, 0xad, + 0x6c, 0x81, 0x6c, 0x81, 0x64, 0x3e, 0x7, 0x64, 0x69, 0xd9, 0xbd, + 0xdc, 0xd0, 0x20, 0xc5, 0x64, 0x1, 0xf7, 0x9d, 0xd9, 0x13}, + {0xca, 0xc0, 0xe5, 0x21, 0xc3, 0x5e, 0x4b, 0x1, 0xa2, 0xbf, 0x19, + 0xd7, 0xc9, 0x69, 0xcb, 0x4f, 0xa0, 0x23, 0x0, 0x75, 0x18, 0x1c, + 0x5f, 0x4e, 0x80, 0xac, 0xed, 0x55, 0x9e, 0xde, 0x6, 0x1c}, + }, + { + {0xaa, 0x69, 0x6d, 0xff, 0x40, 0x2b, 0xd5, 0xff, 0xbb, 0x49, 0x40, + 0xdc, 0x18, 0xb, 0x53, 0x34, 0x97, 0x98, 0x4d, 0xa3, 0x2f, 0x5c, + 0x4a, 0x5e, 0x2d, 0xba, 0x32, 0x7d, 0x8e, 0x6f, 0x9, 0x78}, + {0xe2, 0xc4, 0x3e, 0xa3, 0xd6, 0x7a, 0xf, 0x99, 0x8e, 0xe0, 0x2e, + 0xbe, 0x38, 0xf9, 0x8, 0x66, 0x15, 0x45, 0x28, 0x63, 0xc5, 0x43, + 0xa1, 0x9c, 0xd, 0xb6, 0x2d, 0xec, 0x1f, 0x8a, 0xf3, 0x4c}, + {0xe7, 0x5c, 0xfa, 0xd, 0x65, 0xaa, 0xaa, 0xa0, 0x8c, 0x47, 0xb5, + 0x48, 0x2a, 0x9e, 0xc4, 0xf9, 0x5b, 0x72, 0x3, 0x70, 0x7d, 0xcc, + 0x9, 0x4f, 0xbe, 0x1a, 0x9, 0x26, 0x3a, 0xad, 0x3c, 0x37}, + }, + { + {0xad, 0xbb, 0xdd, 0x89, 0xfb, 0xa8, 0xbe, 0xf1, 0xcb, 0xae, 0xae, + 0x61, 0xbc, 0x2c, 0xcb, 0x3b, 0x9d, 0x8d, 0x9b, 0x1f, 0xbb, 0xa7, + 0x58, 0x8f, 0x86, 0xa6, 0x12, 0x51, 0xda, 0x7e, 0x54, 0x21}, + {0x7c, 0xf5, 0xc9, 0x82, 0x4d, 0x63, 0x94, 0xb2, 0x36, 0x45, 0x93, + 0x24, 0xe1, 0xfd, 0xcb, 0x1f, 0x5a, 0xdb, 0x8c, 0x41, 0xb3, 0x4d, + 0x9c, 0x9e, 0xfc, 0x19, 0x44, 0x45, 0xd9, 0xf3, 0x40, 0x0}, + {0xd3, 0x86, 0x59, 0xfd, 0x39, 0xe9, 0xfd, 0xde, 0xc, 0x38, 0xa, + 0x51, 0x89, 0x2c, 0x27, 0xf4, 0xb9, 0x19, 0x31, 0xbb, 0x7, 0xa4, + 0x2b, 0xb7, 0xf4, 0x4d, 0x25, 0x4a, 0x33, 0xa, 0x55, 0x63}, + }, + { + {0x49, 0x7b, 0x54, 0x72, 0x45, 0x58, 0xba, 0x9b, 0xe0, 0x8, 0xc4, + 0xe2, 0xfa, 0xc6, 0x5, 0xf3, 0x8d, 0xf1, 0x34, 0xc7, 0x69, 0xfa, + 0xe8, 0x60, 0x7a, 0x76, 0x7d, 0xaa, 0xaf, 0x2b, 0xa9, 0x39}, + {0x37, 0xcf, 0x69, 0xb5, 0xed, 0xd6, 0x7, 0x65, 0xe1, 0x2e, 0xa5, + 0xc, 0xb0, 0x29, 
0x84, 0x17, 0x5d, 0xd6, 0x6b, 0xeb, 0x90, 0x0, + 0x7c, 0xea, 0x51, 0x8f, 0xf7, 0xda, 0xc7, 0x62, 0xea, 0x3e}, + {0x4e, 0x27, 0x93, 0xe6, 0x13, 0xc7, 0x24, 0x9d, 0x75, 0xd3, 0xdb, + 0x68, 0x77, 0x85, 0x63, 0x5f, 0x9a, 0xb3, 0x8a, 0xeb, 0x60, 0x55, + 0x52, 0x70, 0xcd, 0xc4, 0xc9, 0x65, 0x6, 0x6a, 0x43, 0x68}, + }, + { + {0x7c, 0x10, 0x20, 0xe8, 0x17, 0xd3, 0x56, 0x1e, 0x65, 0xe9, 0xa, + 0x84, 0x44, 0x68, 0x26, 0xc5, 0x7a, 0xfc, 0xf, 0x32, 0xc6, 0xa1, + 0xe0, 0xc1, 0x72, 0x14, 0x61, 0x91, 0x9c, 0x66, 0x73, 0x53}, + {0x27, 0x3f, 0x2f, 0x20, 0xe8, 0x35, 0x2, 0xbc, 0xb0, 0x75, 0xf9, + 0x64, 0xe2, 0x0, 0x5c, 0xc7, 0x16, 0x24, 0x8c, 0xa3, 0xd5, 0xe9, + 0xa4, 0x91, 0xf9, 0x89, 0xb7, 0x8a, 0xf6, 0xe7, 0xb6, 0x17}, + {0x57, 0x52, 0xe, 0x9a, 0xab, 0x14, 0x28, 0x5d, 0xfc, 0xb3, 0xca, + 0xc9, 0x84, 0x20, 0x8f, 0x90, 0xca, 0x1e, 0x2d, 0x5b, 0x88, 0xf5, + 0xca, 0xaf, 0x11, 0x7d, 0xf8, 0x78, 0xa6, 0xb5, 0xb4, 0x1c}, + }, + { + {0xe7, 0x7, 0xa0, 0xa2, 0x62, 0xaa, 0x74, 0x6b, 0xb1, 0xc7, 0x71, + 0xf0, 0xb0, 0xe0, 0x11, 0xf3, 0x23, 0xe2, 0xb, 0x0, 0x38, 0xe4, + 0x7, 0x57, 0xac, 0x6e, 0xef, 0x82, 0x2d, 0xfd, 0xc0, 0x2d}, + {0x6c, 0xfc, 0x4a, 0x39, 0x6b, 0xc0, 0x64, 0xb6, 0xb1, 0x5f, 0xda, + 0x98, 0x24, 0xde, 0x88, 0xc, 0x34, 0xd8, 0xca, 0x4b, 0x16, 0x3, + 0x8d, 0x4f, 0xa2, 0x34, 0x74, 0xde, 0x78, 0xca, 0xb, 0x33}, + {0x4e, 0x74, 0x19, 0x11, 0x84, 0xff, 0x2e, 0x98, 0x24, 0x47, 0x7, + 0x2b, 0x96, 0x5e, 0x69, 0xf9, 0xfb, 0x53, 0xc9, 0xbf, 0x4f, 0xc1, + 0x8a, 0xc5, 0xf5, 0x1c, 0x9f, 0x36, 0x1b, 0xbe, 0x31, 0x3c}, + }, + { + {0x72, 0x42, 0xcb, 0xf9, 0x93, 0xbc, 0x68, 0xc1, 0x98, 0xdb, 0xce, + 0xc7, 0x1f, 0x71, 0xb8, 0xae, 0x7a, 0x8d, 0xac, 0x34, 0xaa, 0x52, + 0xe, 0x7f, 0xbb, 0x55, 0x7d, 0x7e, 0x9, 0xc1, 0xce, 0x41}, + {0xee, 0x8a, 0x94, 0x8, 0x4d, 0x86, 0xf4, 0xb0, 0x6f, 0x1c, 0xba, + 0x91, 0xee, 0x19, 0xdc, 0x7, 0x58, 0xa1, 0xac, 0xa6, 0xae, 0xcd, + 0x75, 0x79, 0xbb, 0xd4, 0x62, 0x42, 0x13, 0x61, 0xb, 0x33}, + {0x8a, 0x80, 0x6d, 0xa2, 0xd7, 0x19, 0x96, 0xf7, 0x6d, 0x15, 0x9e, + 0x1d, 
0x9e, 0xd4, 0x1f, 0xbb, 0x27, 0xdf, 0xa1, 0xdb, 0x6c, 0xc3, + 0xd7, 0x73, 0x7d, 0x77, 0x28, 0x1f, 0xd9, 0x4c, 0xb4, 0x26}, }, + }, + { { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 643355106415761, 290186807495774, 2013561737429023, - 319648069511546, 393736678496162 -#else - 9770129, 9586738, 26496094, 4324120, 1556511, 30004408, - 27453818, 4763127, 47929250, 5867133 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 129358342392716, 1932811617704777, 1176749390799681, - 398040349861790, 1170779668090425 -#else - 34343820, 1927589, 31726409, 28801137, 23962433, 17534932, - 27846558, 5931263, 37359161, 17445976 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2051980782668029, 121859921510665, 2048329875753063, - 1235229850149665, 519062146124755 -#else - 27461885, 30576896, 22380809, 1815854, 44075111, 30522493, - 7283489, 18406359, 47582163, 7734628 -#endif - }}, + {0x83, 0x3, 0x73, 0x62, 0x93, 0xf2, 0xb7, 0xe1, 0x2c, 0x8a, 0xca, + 0xeb, 0xff, 0x79, 0x52, 0x4b, 0x14, 0x13, 0xd4, 0xbf, 0x8a, 0x77, + 0xfc, 0xda, 0xf, 0x61, 0x72, 0x9c, 0x14, 0x10, 0xeb, 0x7d}, + {0x75, 0x74, 0x38, 0x8f, 0x47, 0x48, 0xf0, 0x51, 0x3c, 0xcb, 0xbe, + 0x9c, 0xf4, 0xbc, 0x5d, 0xb2, 0x55, 0x20, 0x9f, 0xd9, 0x44, 0x12, + 0xab, 0x9a, 0xd6, 0xa5, 0x10, 0x1c, 0x6c, 0x9e, 0x70, 0x2c}, + {0x7a, 0xee, 0x66, 0x87, 0x6a, 0xaf, 0x62, 0xcb, 0xe, 0xcd, 0x53, + 0x55, 0x4, 0xec, 0xcb, 0x66, 0xb5, 0xe4, 0xb, 0xf, 0x38, 0x1, + 0x80, 0x58, 0xea, 0xe2, 0x2c, 0xf6, 0x9f, 0x8e, 0xe6, 0x8}, + }, + { + {0xf9, 0xf2, 0xb8, 0xa, 0xd5, 0x9, 0x2d, 0x2f, 0xdf, 0x23, 0x59, + 0xc5, 0x8d, 0x21, 0xb9, 0xac, 0xb9, 0x6c, 0x76, 0x73, 0x26, 0x34, + 0x8f, 0x4a, 0xf5, 0x19, 0xf7, 0x38, 0xd7, 0x3b, 0xb1, 0x4c}, + {0xad, 0x30, 0xc1, 0x4b, 0xa, 0x50, 0xad, 0x34, 0x9c, 0xd4, 0xb, + 0x3d, 0x49, 0xdb, 0x38, 0x8d, 0xbe, 0x89, 0xa, 0x50, 0x98, 0x3d, + 0x5c, 0xa2, 0x9, 0x3b, 0xba, 0xee, 0x87, 0x3f, 0x1f, 0x2f}, + {0x4a, 0xb6, 0x15, 0xe5, 0x75, 0x8c, 0x84, 0xf7, 0x38, 0x90, 0x4a, + 0xdb, 0xba, 0x1, 0x95, 
0xa5, 0x50, 0x1b, 0x75, 0x3f, 0x3f, 0x31, + 0xd, 0xc2, 0xe8, 0x2e, 0xae, 0xc0, 0x53, 0xe3, 0xa1, 0x19}, + }, + { + {0xbd, 0xbd, 0x96, 0xd5, 0xcd, 0x72, 0x21, 0xb4, 0x40, 0xfc, 0xee, + 0x98, 0x43, 0x45, 0xe0, 0x93, 0xb5, 0x9, 0x41, 0xb4, 0x47, 0x53, + 0xb1, 0x9f, 0x34, 0xae, 0x66, 0x2, 0x99, 0xd3, 0x6b, 0x73}, + {0xc3, 0x5, 0xfa, 0xba, 0x60, 0x75, 0x1c, 0x7d, 0x61, 0x5e, 0xe5, + 0xc6, 0xa0, 0xa0, 0xe1, 0xb3, 0x73, 0x64, 0xd6, 0xc0, 0x18, 0x97, + 0x52, 0xe3, 0x86, 0x34, 0xc, 0xc2, 0x11, 0x6b, 0x54, 0x41}, + {0xb4, 0xb3, 0x34, 0x93, 0x50, 0x2d, 0x53, 0x85, 0x73, 0x65, 0x81, + 0x60, 0x4b, 0x11, 0xfd, 0x46, 0x75, 0x83, 0x5c, 0x42, 0x30, 0x5f, + 0x5f, 0xcc, 0x5c, 0xab, 0x7f, 0xb8, 0xa2, 0x95, 0x22, 0x41}, + }, + { + {0xc6, 0xea, 0x93, 0xe2, 0x61, 0x52, 0x65, 0x2e, 0xdb, 0xac, 0x33, + 0x21, 0x3, 0x92, 0x5a, 0x84, 0x6b, 0x99, 0x0, 0x79, 0xcb, 0x75, + 0x9, 0x46, 0x80, 0xdd, 0x5a, 0x19, 0x8d, 0xbb, 0x60, 0x7}, + {0xe9, 0xd6, 0x7e, 0xf5, 0x88, 0x9b, 0xc9, 0x19, 0x25, 0xc8, 0xf8, + 0x6d, 0x26, 0xcb, 0x93, 0x53, 0x73, 0xd2, 0xa, 0xb3, 0x13, 0x32, + 0xee, 0x5c, 0x34, 0x2e, 0x2d, 0xb5, 0xeb, 0x53, 0xe1, 0x14}, + {0x8a, 0x81, 0xe6, 0xcd, 0x17, 0x1a, 0x3e, 0x41, 0x84, 0xa0, 0x69, + 0xed, 0xa9, 0x6d, 0x15, 0x57, 0xb1, 0xcc, 0xca, 0x46, 0x8f, 0x26, + 0xbf, 0x2c, 0xf2, 0xc5, 0x3a, 0xc3, 0x9b, 0xbe, 0x34, 0x6b}, + }, + { + {0xd3, 0xf2, 0x71, 0x65, 0x65, 0x69, 0xfc, 0x11, 0x7a, 0x73, 0xe, + 0x53, 0x45, 0xe8, 0xc9, 0xc6, 0x35, 0x50, 0xfe, 0xd4, 0xa2, 0xe7, + 0x3a, 0xe3, 0xb, 0xd3, 0x6d, 0x2e, 0xb6, 0xc7, 0xb9, 0x1}, + {0xb2, 0xc0, 0x78, 0x3a, 0x64, 0x2f, 0xdf, 0xf3, 0x7c, 0x2, 0x2e, + 0xf2, 0x1e, 0x97, 0x3e, 0x4c, 0xa3, 0xb5, 0xc1, 0x49, 0x5e, 0x1c, + 0x7d, 0xec, 0x2d, 0xdd, 0x22, 0x9, 0x8f, 0xc1, 0x12, 0x20}, + {0x29, 0x9d, 0xc8, 0x5a, 0xe5, 0x55, 0xb, 0x88, 0x63, 0xa7, 0xa0, + 0x45, 0x1f, 0x24, 0x83, 0x14, 0x1f, 0x6c, 0xe7, 0xc2, 0xdf, 0xef, + 0x36, 0x3d, 0xe8, 0xad, 0x4b, 0x4e, 0x78, 0x5b, 0xaf, 0x8}, + }, + { + {0x4b, 0x2c, 0xcc, 0x89, 0xd2, 0x14, 0x73, 0xe2, 0x8d, 0x17, 0x87, + 
0xa2, 0x11, 0xbd, 0xe4, 0x4b, 0xce, 0x64, 0x33, 0xfa, 0xd6, 0x28, + 0xd5, 0x18, 0x6e, 0x82, 0xd9, 0xaf, 0xd5, 0xc1, 0x23, 0x64}, + {0x33, 0x25, 0x1f, 0x88, 0xdc, 0x99, 0x34, 0x28, 0xb6, 0x23, 0x93, + 0x77, 0xda, 0x25, 0x5, 0x9d, 0xf4, 0x41, 0x34, 0x67, 0xfb, 0xdd, + 0x7a, 0x89, 0x8d, 0x16, 0x3a, 0x16, 0x71, 0x9d, 0xb7, 0x32}, + {0x6a, 0xb3, 0xfc, 0xed, 0xd9, 0xf8, 0x85, 0xcc, 0xf9, 0xe5, 0x46, + 0x37, 0x8f, 0xc2, 0xbc, 0x22, 0xcd, 0xd3, 0xe5, 0xf9, 0x38, 0xe3, + 0x9d, 0xe4, 0xcc, 0x2d, 0x3e, 0xc1, 0xfb, 0x5e, 0xa, 0x48}, + }, + { + {0x1f, 0x22, 0xce, 0x42, 0xe4, 0x4c, 0x61, 0xb6, 0x28, 0x39, 0x5, + 0x4c, 0xcc, 0x9d, 0x19, 0x6e, 0x3, 0xbe, 0x1c, 0xdc, 0xa4, 0xb4, + 0x3f, 0x66, 0x6, 0x8e, 0x1c, 0x69, 0x47, 0x1d, 0xb3, 0x24}, + {0x71, 0x20, 0x62, 0x1, 0xb, 0xe7, 0x51, 0xb, 0xc5, 0xaf, 0x1d, + 0x8b, 0xcf, 0x5, 0xb5, 0x6, 0xcd, 0xab, 0x5a, 0xef, 0x61, 0xb0, + 0x6b, 0x2c, 0x31, 0xbf, 0xb7, 0xc, 0x60, 0x27, 0xaa, 0x47}, + {0xc3, 0xf8, 0x15, 0xc0, 0xed, 0x1e, 0x54, 0x2a, 0x7c, 0x3f, 0x69, + 0x7c, 0x7e, 0xfe, 0xa4, 0x11, 0xd6, 0x78, 0xa2, 0x4e, 0x13, 0x66, + 0xaf, 0xf0, 0x94, 0xa0, 0xdd, 0x14, 0x5d, 0x58, 0x5b, 0x54}, + }, + { + {0xe1, 0x21, 0xb3, 0xe3, 0xd0, 0xe4, 0x4, 0x62, 0x95, 0x1e, 0xff, + 0x28, 0x7a, 0x63, 0xaa, 0x3b, 0x9e, 0xbd, 0x99, 0x5b, 0xfd, 0xcf, + 0xc, 0xb, 0x71, 0xd0, 0xc8, 0x64, 0x3e, 0xdc, 0x22, 0x4d}, + {0xf, 0x3a, 0xd4, 0xa0, 0x5e, 0x27, 0xbf, 0x67, 0xbe, 0xee, 0x9b, + 0x8, 0x34, 0x8e, 0xe6, 0xad, 0x2e, 0xe7, 0x79, 0xd4, 0x4c, 0x13, + 0x89, 0x42, 0x54, 0x54, 0xba, 0x32, 0xc3, 0xf9, 0x62, 0xf}, + {0x39, 0x5f, 0x3b, 0xd6, 0x89, 0x65, 0xb4, 0xfc, 0x61, 0xcf, 0xcb, + 0x57, 0x3f, 0x6a, 0xae, 0x5c, 0x5, 0xfa, 0x3a, 0x95, 0xd2, 0xc2, + 0xba, 0xfe, 0x36, 0x14, 0x37, 0x36, 0x1a, 0xa0, 0xf, 0x1c}, }, }, { { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1608170971973096, 415809060360428, 1350468408164766, - 2038620059057678, 1026904485989112 -#else - 59098600, 23963614, 55988460, 6196037, 29344158, 20123547, - 7585294, 30377806, 18549496, 15302069 -#endif - }}, - 
{{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1837656083115103, 1510134048812070, 906263674192061, - 1821064197805734, 565375124676301 -#else - 34450527, 27383209, 59436070, 22502750, 6258877, 13504381, - 10458790, 27135971, 58236621, 8424745 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 578027192365650, 2034800251375322, 2128954087207123, - 478816193810521, 2196171989962750 -#else - 24687186, 8613276, 36441818, 30320886, 1863891, 31723888, - 19206233, 7134917, 55824382, 32725512 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1633188840273139, 852787172373708, 1548762607215796, - 1266275218902681, 1107218203325133 -#else - 11334899, 24336410, 8025292, 12707519, 17523892, 23078361, - 10243737, 18868971, 62042829, 16498836 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 462189358480054, 1784816734159228, 1611334301651368, - 1303938263943540, 707589560319424 -#else - 8911542, 6887158, 57524604, 26595841, 11145640, 24010752, - 17303924, 19430194, 6536640, 10543906 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1038829280972848, 38176604650029, 753193246598573, - 1136076426528122, 595709990562434 -#else - 38162480, 15479762, 49642029, 568875, 65611181, 11223453, - 64439674, 16928857, 39873154, 8876770 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1408451820859834, 2194984964010833, 2198361797561729, - 1061962440055713, 1645147963442934 -#else - 41365946, 20987567, 51458897, 32707824, 34082177, 32758143, - 33627041, 15824473, 66504438, 24514614 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 4701053362120, 1647641066302348, 1047553002242085, - 1923635013395977, 206970314902065 -#else - 10330056, 70051, 7957388, 24551765, 9764901, 15609756, 27698697, - 28664395, 1657393, 3084098 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1750479161778571, 1362553355169293, 1891721260220598, - 966109370862782, 1024913988299801 -#else - 10477963, 26084172, 12119565, 
20303627, 29016246, 28188843, - 31280318, 14396151, 36875289, 15272408 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 212699049131723, 1117950018299775, 1873945661751056, - 1403802921984058, 130896082652698 -#else - 54820555, 3169462, 28813183, 16658753, 25116432, 27923966, - 41934906, 20918293, 42094106, 1950503 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 636808533673210, 1262201711667560, 390951380330599, - 1663420692697294, 561951321757406 -#else - 40928506, 9489186, 11053416, 18808271, 36055143, 5825629, - 58724558, 24786899, 15341278, 8373727 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 520731594438141, 1446301499955692, 273753264629267, - 1565101517999256, 1019411827004672 -#else - 28685821, 7759505, 52730348, 21551571, 35137043, 4079241, - 298136, 23321830, 64230656, 15190419 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 926527492029409, 1191853477411379, 734233225181171, - 184038887541270, 1790426146325343 -#else - 34175969, 13806335, 52771379, 17760000, 43104243, 10940927, - 8669718, 2742393, 41075551, 26679428 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1464651961852572, 1483737295721717, 1519450561335517, - 1161429831763785, 405914998179977 -#else - 65528476, 21825014, 41129205, 22109408, 49696989, 22641577, - 9291593, 17306653, 54954121, 6048604 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 996126634382301, 796204125879525, 127517800546509, - 344155944689303, 615279846169038 -#else - 36803549, 14843443, 1539301, 11864366, 20201677, 1900163, - 13934231, 5128323, 11213262, 9168384 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 738724080975276, 2188666632415296, 1961313708559162, - 1506545807547587, 1151301638969740 -#else - 40828332, 11007846, 19408960, 32613674, 48515898, 29225851, - 62020803, 22449281, 20470156, 17155731 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 622917337413835, 
1218989177089035, 1284857712846592, - 970502061709359, 351025208117090 -#else - 43972811, 9282191, 14855179, 18164354, 59746048, 19145871, - 44324911, 14461607, 14042978, 5230683 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2067814584765580, 1677855129927492, 2086109782475197, - 235286517313238, 1416314046739645 -#else - 29969548, 30812838, 50396996, 25001989, 9175485, 31085458, - 21556950, 3506042, 61174973, 21104723 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 586844262630358, 307444381952195, 458399356043426, - 602068024507062, 1028548203415243 -#else - 63964118, 8744660, 19704003, 4581278, 46678178, 6830682, - 45824694, 8971512, 38569675, 15326562 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 678489922928203, 2016657584724032, 90977383049628, - 1026831907234582, 615271492942522 -#else - 47644235, 10110287, 49846336, 30050539, 43608476, 1355668, - 51585814, 15300987, 46594746, 9168259 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 301225714012278, 1094837270268560, 1202288391010439, - 644352775178361, 1647055902137983 -#else - 61755510, 4488612, 43305616, 16314346, 7780487, 17915493, - 38160505, 9601604, 33087103, 24543045 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1210746697896478, 1416608304244708, 686487477217856, - 1245131191434135, 1051238336855737 -#else - 47665694, 18041531, 46311396, 21109108, 37284416, 10229460, - 39664535, 18553900, 61111993, 15664671 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1135604073198207, 1683322080485474, 769147804376683, - 2086688130589414, 900445683120379 -#else - 23294591, 16921819, 44458082, 25083453, 27844203, 11461195, - 13099750, 31094076, 18151675, 13417686 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1971518477615628, 401909519527336, 448627091057375, - 1409486868273821, 1214789035034363 -#else - 42385932, 29377914, 35958184, 5988918, 40250079, 6685064, - 1661597, 21002991, 15271675, 
18101767 -#endif - }}, - }, - }, - { - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1364039144731711, 1897497433586190, 2203097701135459, - 145461396811251, 1349844460790699 -#else - 11433023, 20325767, 8239630, 28274915, 65123427, 32828713, - 48410099, 2167543, 60187563, 20114249 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1045230323257973, 818206601145807, 630513189076103, - 1672046528998132, 807204017562437 -#else - 35672693, 15575145, 30436815, 12192228, 44645511, 9395378, - 57191156, 24915434, 12215109, 12028277 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 439961968385997, 386362664488986, 1382706320807688, - 309894000125359, 2207801346498567 -#else - 14098381, 6555944, 23007258, 5757252, 51681032, 20603929, - 30123439, 4617780, 50208775, 32898803 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1229004686397588, 920643968530863, 123975893911178, - 681423993215777, 1400559197080973 -#else - 63082644, 18313596, 11893167, 13718664, 52299402, 1847384, - 51288865, 10154008, 23973261, 20869958 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2003766096898049, 170074059235165, 1141124258967971, - 1485419893480973, 1573762821028725 -#else - 40577025, 29858441, 65199965, 2534300, 35238307, 17004076, - 18341389, 22134481, 32013173, 23450893 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 729905708611432, 1270323270673202, 123353058984288, - 426460209632942, 2195574535456672 -#else - 41629544, 10876442, 55337778, 18929291, 54739296, 1838103, - 21911214, 6354752, 4425632, 32716610 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1271140255321235, 2044363183174497, 52125387634689, - 1445120246694705, 942541986339084 -#else - 56675475, 18941465, 22229857, 30463385, 53917697, 776728, - 49693489, 21533969, 4725004, 14044970 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1761608437466135, 583360847526804, 1586706389685493, - 2157056599579261, 
1170692369685772 -#else - 19268631, 26250011, 1555348, 8692754, 45634805, 23643767, - 6347389, 32142648, 47586572, 17444675 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 871476219910823, 1878769545097794, 2241832391238412, - 548957640601001, 690047440233174 -#else - 42244775, 12986007, 56209986, 27995847, 55796492, 33405905, - 19541417, 8180106, 9282262, 10282508 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 297194732135507, 1366347803776820, 1301185512245601, - 561849853336294, 1533554921345731 -#else - 40903763, 4428546, 58447668, 20360168, 4098401, 19389175, - 15522534, 8372215, 5542595, 22851749 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 999628998628371, 1132836708493400, 2084741674517453, - 469343353015612, 678782988708035 -#else - 56546323, 14895632, 26814552, 16880582, 49628109, 31065071, - 64326972, 6993760, 49014979, 10114654 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2189427607417022, 699801937082607, 412764402319267, - 1478091893643349, 2244675696854460 -#else - 47001790, 32625013, 31422703, 10427861, 59998115, 6150668, - 38017109, 22025285, 25953724, 33448274 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1712292055966563, 204413590624874, 1405738637332841, - 408981300829763, 861082219276721 -#else - 62874467, 25515139, 57989738, 3045999, 2101609, 20947138, - 19390019, 6094296, 63793585, 12831124 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 508561155940631, 966928475686665, 2236717801150132, - 424543858577297, 2089272956986143 -#else - 51110167, 7578151, 5310217, 14408357, 33560244, 33329692, - 31575953, 6326196, 7381791, 31132593 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 221245220129925, 1156020201681217, 491145634799213, - 542422431960839, 828100817819207 -#else - 46206085, 3296810, 24736065, 17226043, 18374253, 7318640, - 6295303, 8082724, 51746375, 12339663 -#endif - }}, - }, - { - {{ -#if 
defined(BORINGSSL_CURVE25519_64BIT) - 153756971240384, 1299874139923977, 393099165260502, - 1058234455773022, 996989038681183 -#else - 27724736, 2291157, 6088201, 19369634, 1792726, 5857634, - 13848414, 15768922, 25091167, 14856294 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 559086812798481, 573177704212711, 1629737083816402, - 1399819713462595, 1646954378266038 -#else - 48242193, 8331042, 24373479, 8541013, 66406866, 24284974, - 12927299, 20858939, 44926390, 24541532 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1887963056288059, 228507035730124, 1468368348640282, - 930557653420194, 613513962454686 -#else - 55685435, 28132841, 11632844, 3405020, 30536730, 21880393, - 39848098, 13866389, 30146206, 9142070 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1224529808187553, 1577022856702685, 2206946542980843, - 625883007765001, 279930793512158 -#else - 3924129, 18246916, 53291741, 23499471, 12291819, 32886066, - 39406089, 9326383, 58871006, 4171293 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1076287717051609, 1114455570543035, 187297059715481, - 250446884292121, 1885187512550540 -#else - 51186905, 16037936, 6713787, 16606682, 45496729, 2790943, - 26396185, 3731949, 345228, 28091483 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 902497362940219, 76749815795675, 1657927525633846, - 1420238379745202, 1340321636548352 -#else - 45781307, 13448258, 25284571, 1143661, 20614966, 24705045, - 2031538, 21163201, 50855680, 19972348 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1129576631190784, 1281994010027327, 996844254743018, - 257876363489249, 1150850742055018 -#else - 31016192, 16832003, 26371391, 19103199, 62081514, 14854136, - 17477601, 3842657, 28012650, 17149012 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 628740660038789, 1943038498527841, 467786347793886, - 1093341428303375, 235413859513003 -#else - 62033029, 9368965, 58546785, 28953529, 
51858910, 6970559, - 57918991, 16292056, 58241707, 3507939 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 237425418909360, 469614029179605, 1512389769174935, - 1241726368345357, 441602891065214 -#else - 29439664, 3537914, 23333589, 6997794, 49553303, 22536363, - 51899661, 18503164, 57943934, 6580395 -#endif - }}, - }, - }, - { - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1736417953058555, 726531315520508, 1833335034432527, - 1629442561574747, 624418919286085 -#else - 54923003, 25874643, 16438268, 10826160, 58412047, 27318820, - 17860443, 24280586, 65013061, 9304566 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1960754663920689, 497040957888962, 1909832851283095, - 1271432136996826, 2219780368020940 -#else - 20714545, 29217521, 29088194, 7406487, 11426967, 28458727, - 14792666, 18945815, 5289420, 33077305 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1537037379417136, 1358865369268262, 2130838645654099, - 828733687040705, 1999987652890901 -#else - 50443312, 22903641, 60948518, 20248671, 9192019, 31751970, - 17271489, 12349094, 26939669, 29802138 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 629042105241814, 1098854999137608, 887281544569320, - 1423102019874777, 7911258951561 -#else - 54218966, 9373457, 31595848, 16374215, 21471720, 13221525, - 39825369, 21205872, 63410057, 117886 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1811562332665373, 1501882019007673, 2213763501088999, - 359573079719636, 36370565049116 -#else - 22263325, 26994382, 3984569, 22379786, 51994855, 32987646, - 28311252, 5358056, 43789084, 541963 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 218907117361280, 1209298913016966, 1944312619096112, - 1130690631451061, 1342327389191701 -#else - 16259200, 3261970, 2309254, 18019958, 50223152, 28972515, - 24134069, 16848603, 53771797, 20002236 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1369976867854704, 
1396479602419169, 1765656654398856, - 2203659200586299, 998327836117241 -#else - 9378160, 20414246, 44262881, 20809167, 28198280, 26310334, - 64709179, 32837080, 690425, 14876244 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2230701885562825, 1348173180338974, 2172856128624598, - 1426538746123771, 444193481326151 -#else - 24977353, 33240048, 58884894, 20089345, 28432342, 32378079, - 54040059, 21257083, 44727879, 6618998 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 784210426627951, 918204562375674, 1284546780452985, - 1324534636134684, 1872449409642708 -#else - 65570671, 11685645, 12944378, 13682314, 42719353, 19141238, - 8044828, 19737104, 32239828, 27901670 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 319638829540294, 596282656808406, 2037902696412608, - 1557219121643918, 341938082688094 -#else - 48505798, 4762989, 66182614, 8885303, 38696384, 30367116, - 9781646, 23204373, 32779358, 5095274 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1901860206695915, 2004489122065736, 1625847061568236, - 973529743399879, 2075287685312905 -#else - 34100715, 28339925, 34843976, 29869215, 9460460, 24227009, - 42507207, 14506723, 21639561, 30924196 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1371853944110545, 1042332820512553, 1949855697918254, - 1791195775521505, 37487364849293 -#else - 50707921, 20442216, 25239337, 15531969, 3987758, 29055114, - 65819361, 26690896, 17874573, 558605 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 687200189577855, 1082536651125675, 644224940871546, - 340923196057951, 343581346747396 -#else - 53508735, 10240080, 9171883, 16131053, 46239610, 9599699, - 33499487, 5080151, 2085892, 5119761 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2082717129583892, 27829425539422, 145655066671970, - 1690527209845512, 1865260509673478 -#else - 44903700, 31034903, 50727262, 414690, 42089314, 2170429, - 30634760, 25190818, 35108870, 
27794547 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1059729620568824, 2163709103470266, 1440302280256872, - 1769143160546397, 869830310425069 -#else - 60263160, 15791201, 8550074, 32241778, 29928808, 21462176, - 27534429, 26362287, 44757485, 12961481 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1609516219779025, 777277757338817, 2101121130363987, - 550762194946473, 1905542338659364 -#else - 42616785, 23983660, 10368193, 11582341, 43711571, 31309144, - 16533929, 8206996, 36914212, 28394793 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2024821921041576, 426948675450149, 595133284085473, - 471860860885970, 600321679413000 -#else - 55987368, 30172197, 2307365, 6362031, 66973409, 8868176, - 50273234, 7031274, 7589640, 8945490 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 598474602406721, 1468128276358244, 1191923149557635, - 1501376424093216, 1281662691293476 -#else - 34956097, 8917966, 6661220, 21876816, 65916803, 17761038, - 7251488, 22372252, 24099108, 19098262 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1721138489890707, 1264336102277790, 433064545421287, - 1359988423149466, 1561871293409447 -#else - 5019539, 25646962, 4244126, 18840076, 40175591, 6453164, - 47990682, 20265406, 60876967, 23273695 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 719520245587143, 393380711632345, 132350400863381, - 1543271270810729, 1819543295798660 -#else - 10853575, 10721687, 26480089, 5861829, 44113045, 1972174, - 65242217, 22996533, 63745412, 27113307 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 396397949784152, 1811354474471839, 1362679985304303, - 2117033964846756, 498041172552279 -#else - 50106456, 5906789, 221599, 26991285, 7828207, 20305514, - 24362660, 31546264, 53242455, 7421391 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1812471844975748, 1856491995543149, 126579494584102, - 1036244859282620, 1975108050082550 
-#else - 8139908, 27007935, 32257645, 27663886, 30375718, 1886181, - 45933756, 15441251, 28826358, 29431403 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 650623932407995, 1137551288410575, 2125223403615539, - 1725658013221271, 2134892965117796 -#else - 6267067, 9695052, 7709135, 16950835, 34239795, 31668296, - 14795159, 25714308, 13746020, 31812384 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 522584000310195, 1241762481390450, 1743702789495384, - 2227404127826575, 1686746002148897 -#else - 28584883, 7787108, 60375922, 18503702, 22846040, 25983196, - 63926927, 33190907, 4771361, 25134474 -#endif - }}, - }, - }, - { - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 427904865186312, 1703211129693455, 1585368107547509, - 1436984488744336, 761188534613978 -#else - 24949256, 6376279, 39642383, 25379823, 48462709, 23623825, - 33543568, 21412737, 3569626, 11342593 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 318101947455002, 248138407995851, 1481904195303927, - 309278454311197, 1258516760217879 -#else - 26514970, 4740088, 27912651, 3697550, 19331575, 22082093, - 6809885, 4608608, 7325975, 18753361 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1275068538599310, 513726919533379, 349926553492294, - 688428871968420, 1702400196000666 -#else - 55490446, 19000001, 42787651, 7655127, 65739590, 5214311, - 39708324, 10258389, 49462170, 25367739 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1061864036265233, 961611260325381, 321859632700838, - 1045600629959517, 1985130202504038 -#else - 11431185, 15823007, 26570245, 14329124, 18029990, 4796082, - 35662685, 15580663, 9280358, 29580745 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1558816436882417, 1962896332636523, 1337709822062152, - 1501413830776938, 294436165831932 -#else - 66948081, 23228174, 44253547, 29249434, 46247496, 19933429, - 34297962, 22372809, 51563772, 4387440 -#endif - }}, - {{ -#if 
defined(BORINGSSL_CURVE25519_64BIT) - 818359826554971, 1862173000996177, 626821592884859, - 573655738872376, 1749691246745455 -#else - 46309467, 12194511, 3937617, 27748540, 39954043, 9340369, - 42594872, 8548136, 20617071, 26072431 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1988022651432119, 1082111498586040, 1834020786104821, - 1454826876423687, 692929915223122 -#else - 66170039, 29623845, 58394552, 16124717, 24603125, 27329039, - 53333511, 21678609, 24345682, 10325460 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2146513703733331, 584788900394667, 464965657279958, - 2183973639356127, 238371159456790 -#else - 47253587, 31985546, 44906155, 8714033, 14007766, 6928528, - 16318175, 32543743, 4766742, 3552007 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1129007025494441, 2197883144413266, 265142755578169, - 971864464758890, 1983715884903702 -#else - 45357481, 16823515, 1351762, 32751011, 63099193, 3950934, - 3217514, 14481909, 10988822, 29559670 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1291366624493075, 381456718189114, 1711482489312444, - 1815233647702022, 892279782992467 -#else - 15564307, 19242862, 3101242, 5684148, 30446780, 25503076, - 12677126, 27049089, 58813011, 13296004 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 444548969917454, 1452286453853356, 2113731441506810, - 645188273895859, 810317625309512 -#else - 57666574, 6624295, 36809900, 21640754, 62437882, 31497052, - 31521203, 9614054, 37108040, 12074673 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2242724082797924, 1373354730327868, 1006520110883049, - 2147330369940688, 1151816104883620 -#else - 4771172, 33419193, 14290748, 20464580, 27992297, 14998318, - 65694928, 31997715, 29832612, 17163397 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1745720200383796, 1911723143175317, 2056329390702074, - 355227174309849, 879232794371100 -#else - 7064884, 26013258, 
47946901, 28486894, 48217594, 30641695, - 25825241, 5293297, 39986204, 13101589 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 163723479936298, 115424889803150, 1156016391581227, - 1894942220753364, 1970549419986329 -#else - 64810282, 2439669, 59642254, 1719964, 39841323, 17225986, - 32512468, 28236839, 36752793, 29363474 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 681981452362484, 267208874112496, 1374683991933094, - 638600984916117, 646178654558546 -#else - 37102324, 10162315, 33928688, 3981722, 50626726, 20484387, - 14413973, 9515896, 19568978, 9628812 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 13378654854251, 106237307029567, 1944412051589651, - 1841976767925457, 230702819835573 -#else - 33053803, 199357, 15894591, 1583059, 27380243, 28973997, - 49269969, 27447592, 60817077, 3437739 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 260683893467075, 854060306077237, 913639551980112, - 4704576840123, 280254810808712 -#else - 48129987, 3884492, 19469877, 12726490, 15913552, 13614290, - 44147131, 70103, 7463304, 4176122 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 715374893080287, 1173334812210491, 1806524662079626, - 1894596008000979, 398905715033393 -#else - 39984863, 10659916, 11482427, 17484051, 12771466, 26919315, - 34389459, 28231680, 24216881, 5944158 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 500026409727661, 1596431288195371, 1420380351989370, - 985211561521489, 392444930785633 -#else - 8894125, 7450974, 64444715, 23788679, 39028346, 21165316, - 19345745, 14680796, 11632993, 5847885 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2096421546958141, 1922523000950363, 789831022876840, - 427295144688779, 320923973161730 -#else - 26942781, 31239115, 9129563, 28647825, 26024104, 11769399, - 55590027, 6367193, 57381634, 4782139 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1927770723575450, 1485792977512719, 
1850996108474547, - 551696031508956, 2126047405475647 -#else - 19916442, 28726022, 44198159, 22140040, 25606323, 27581991, - 33253852, 8220911, 6358847, 31680575 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2112099158080148, 742570803909715, 6484558077432, - 1951119898618916, 93090382703416 -#else - 801428, 31472730, 16569427, 11065167, 29875704, 96627, 7908388, - 29073952, 53570360, 1387154 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 383905201636970, 859946997631870, 855623867637644, - 1017125780577795, 794250831877809 -#else - 19646058, 5720633, 55692158, 12814208, 11607948, 12749789, - 14147075, 15156355, 45242033, 11835259 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 77571826285752, 999304298101753, 487841111777762, - 1038031143212339, 339066367948762 -#else - 19299512, 1155910, 28703737, 14890794, 2925026, 7269399, - 26121523, 15467869, 40548314, 5052482 -#endif - }}, - }, - }, - { - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 674994775520533, 266035846330789, 826951213393478, - 1405007746162285, 1781791018620876 -#else - 64091413, 10058205, 1980837, 3964243, 22160966, 12322533, - 60677741, 20936246, 12228556, 26550755 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1001412661522686, 348196197067298, 1666614366723946, - 888424995032760, 580747687801357 -#else - 32944382, 14922211, 44263970, 5188527, 21913450, 24834489, - 4001464, 13238564, 60994061, 8653814 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1939560076207777, 1409892634407635, 552574736069277, - 383854338280405, 190706709864139 -#else - 22865569, 28901697, 27603667, 21009037, 14348957, 8234005, - 24808405, 5719875, 28483275, 2841751 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2177087163428741, 1439255351721944, 1208070840382793, - 2230616362004769, 1396886392021913 -#else - 50687877, 32441126, 66781144, 21446575, 21886281, 18001658, - 65220897, 33238773, 19932057, 20815229 
-#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 676962063230039, 1880275537148808, 2046721011602706, - 888463247083003, 1318301552024067 -#else - 55452759, 10087520, 58243976, 28018288, 47830290, 30498519, - 3999227, 13239134, 62331395, 19644223 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1466980508178206, 617045217998949, 652303580573628, - 757303753529064, 207583137376902 -#else - 1382174, 21859713, 17266789, 9194690, 53784508, 9720080, - 20403944, 11284705, 53095046, 3093229 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1511056752906902, 105403126891277, 493434892772846, - 1091943425335976, 1802717338077427 -#else - 16650902, 22516500, 66044685, 1570628, 58779118, 7352752, - 66806440, 16271224, 43059443, 26862581 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1853982405405128, 1878664056251147, 1528011020803992, - 1019626468153565, 1128438412189035 -#else - 45197768, 27626490, 62497547, 27994275, 35364760, 22769138, - 24123613, 15193618, 45456747, 16815042 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1963939888391106, 293456433791664, 697897559513649, - 985882796904380, 796244541237972 -#else - 57172930, 29264984, 41829040, 4372841, 2087473, 10399484, - 31870908, 14690798, 17361620, 11864968 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 416770998629779, 389655552427054, 1314476859406756, - 1749382513022778, 1161905598739491 -#else - 55801235, 6210371, 13206574, 5806320, 38091172, 19587231, - 54777658, 26067830, 41530403, 17313742 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1428358296490651, 1027115282420478, 304840698058337, - 441410174026628, 1819358356278573 -#else - 14668443, 21284197, 26039038, 15305210, 25515617, 4542480, - 10453892, 6577524, 9145645, 27110552 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 204943430200135, 1554861433819175, 216426658514651, - 264149070665950, 2047097371738319 -#else - 5974855, 
3053895, 57675815, 23169240, 35243739, 3225008, - 59136222, 3936127, 61456591, 30504127 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1934415182909034, 1393285083565062, 516409331772960, - 1157690734993892, 121039666594268 -#else - 30625386, 28825032, 41552902, 20761565, 46624288, 7695098, - 17097188, 17250936, 39109084, 1803631 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 662035583584445, 286736105093098, 1131773000510616, - 818494214211439, 472943792054479 -#else - 63555773, 9865098, 61880298, 4272700, 61435032, 16864731, - 14911343, 12196514, 45703375, 7047411 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 665784778135882, 1893179629898606, 808313193813106, - 276797254706413, 1563426179676396 -#else - 20093258, 9920966, 55970670, 28210574, 13161586, 12044805, - 34252013, 4124600, 34765036, 23296865 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 945205108984232, 526277562959295, 1324180513733566, - 1666970227868664, 153547609289173 -#else - 46320040, 14084653, 53577151, 7842146, 19119038, 19731827, - 4752376, 24839792, 45429205, 2288037 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2031433403516252, 203996615228162, 170487168837083, - 981513604791390, 843573964916831 -#else - 40289628, 30270716, 29965058, 3039786, 52635099, 2540456, - 29457502, 14625692, 42289247, 12570231 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1476570093962618, 838514669399805, 1857930577281364, - 2017007352225784, 317085545220047 -#else - 66045306, 22002608, 16920317, 12494842, 1278292, 27685323, - 45948920, 30055751, 55134159, 4724942 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1461557121912842, 1600674043318359, 2157134900399597, - 1670641601940616, 127765583803283 -#else - 17960970, 21778898, 62967895, 23851901, 58232301, 32143814, - 54201480, 24894499, 37532563, 1903855 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 
1293543509393474, 2143624609202546, 1058361566797508, - 214097127393994, 946888515472729 -#else - 23134274, 19275300, 56426866, 31942495, 20684484, 15770816, - 54119114, 3190295, 26955097, 14109738 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 357067959932916, 1290876214345711, 521245575443703, - 1494975468601005, 800942377643885 -#else - 15308788, 5320727, 36995055, 19235554, 22902007, 7767164, - 29425325, 22276870, 31960941, 11934971 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 566116659100033, 820247422481740, 994464017954148, - 327157611686365, 92591318111744 -#else - 39713153, 8435795, 4109644, 12222639, 42480996, 14818668, - 20638173, 4875028, 10491392, 1379718 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 617256647603209, 1652107761099439, 1857213046645471, - 1085597175214970, 817432759830522 -#else - 53949449, 9197840, 3875503, 24618324, 65725151, 27674630, - 33518458, 16176658, 21432314, 12180697 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 771808161440705, 1323510426395069, 680497615846440, - 851580615547985, 1320806384849017 -#else - 55321537, 11500837, 13787581, 19721842, 44678184, 10140204, - 1465425, 12689540, 56807545, 19681548 -#endif - }}, - }, - }, - { - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1219260086131915, 647169006596815, 79601124759706, - 2161724213426748, 404861897060198 -#else - 5414091, 18168391, 46101199, 9643569, 12834970, 1186149, - 64485948, 32212200, 26128230, 6032912 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1327968293887866, 1335500852943256, 1401587164534264, - 558137311952440, 1551360549268902 -#else - 40771450, 19788269, 32496024, 19900513, 17847800, 20885276, - 3604024, 8316894, 41233830, 23117073 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 417621685193956, 1429953819744454, 396157358457099, - 1940470778873255, 214000046234152 -#else - 3296484, 6223048, 24680646, 21307972, 44056843, 5903204, - 58246567, 
28915267, 12376616, 3188849 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1268047918491973, 2172375426948536, 1533916099229249, - 1761293575457130, 1590622667026765 -#else - 29190469, 18895386, 27549112, 32370916, 3520065, 22857131, - 32049514, 26245319, 50999629, 23702124 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1627072914981959, 2211603081280073, 1912369601616504, - 1191770436221309, 2187309757525860 -#else - 52364359, 24245275, 735817, 32955454, 46701176, 28496527, - 25246077, 17758763, 18640740, 32593455 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1149147819689533, 378692712667677, 828475842424202, - 2218619146419342, 70688125792186 -#else - 60180029, 17123636, 10361373, 5642961, 4910474, 12345252, - 35470478, 33060001, 10530746, 1053335 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1299739417079761, 1438616663452759, 1536729078504412, - 2053896748919838, 1008421032591246 -#else - 37842897, 19367626, 53570647, 21437058, 47651804, 22899047, - 35646494, 30605446, 24018830, 15026644 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2040723824657366, 399555637875075, 632543375452995, - 872649937008051, 1235394727030233 -#else - 44516310, 30409154, 64819587, 5953842, 53668675, 9425630, - 25310643, 13003497, 64794073, 18408815 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2211311599327900, 2139787259888175, 938706616835350, - 12609661139114, 2081897930719789 -#else - 39688860, 32951110, 59064879, 31885314, 41016598, 13987818, - 39811242, 187898, 43942445, 31022696 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1324994503390450, 336982330582631, 1183998925654177, - 1091654665913274, 48727673971319 -#else - 45364466, 19743956, 1844839, 5021428, 56674465, 17642958, - 9716666, 16266922, 62038647, 726098 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1845522914617879, 1222198248335542, 150841072760134, - 
1927029069940982, 1189913404498011 -#else - 29370903, 27500434, 7334070, 18212173, 9385286, 2247707, - 53446902, 28714970, 30007387, 17731091 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1079559557592645, 2215338383666441, 1903569501302605, - 49033973033940, 305703433934152 -#else - 66172485, 16086690, 23751945, 33011114, 65941325, 28365395, - 9137108, 730663, 9835848, 4555336 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 94653405416909, 1386121349852999, 1062130477891762, - 36553947479274, 833669648948846 -#else - 43732429, 1410445, 44855111, 20654817, 30867634, 15826977, - 17693930, 544696, 55123566, 12422645 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1432015813136298, 440364795295369, 1395647062821501, - 1976874522764578, 934452372723352 -#else - 31117226, 21338698, 53606025, 6561946, 57231997, 20796761, - 61990178, 29457725, 29120152, 13924425 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1296625309219774, 2068273464883862, 1858621048097805, - 1492281814208508, 2235868981918946 -#else - 49707966, 19321222, 19675798, 30819676, 56101901, 27695611, - 57724924, 22236731, 7240930, 33317044 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1490330266465570, 1858795661361448, 1436241134969763, - 294573218899647, 1208140011028933 -#else - 35747106, 22207651, 52101416, 27698213, 44655523, 21401660, - 1222335, 4389483, 3293637, 18002689 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1282462923712748, 741885683986255, 2027754642827561, - 518989529541027, 1826610009555945 -#else - 50424044, 19110186, 11038543, 11054958, 53307689, 30215898, - 42789283, 7733546, 12796905, 27218610 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1525827120027511, 723686461809551, 1597702369236987, - 244802101764964, 1502833890372311 -#else - 58349431, 22736595, 41689999, 10783768, 36493307, 23807620, - 38855524, 3647835, 3222231, 22393970 -#endif - }}, - }, - { - {{ 
-#if defined(BORINGSSL_CURVE25519_64BIT) - 113622036244513, 1233740067745854, 674109952278496, - 2114345180342965, 166764512856263 -#else - 18606113, 1693100, 41660478, 18384159, 4112352, 10045021, - 23603893, 31506198, 59558087, 2484984 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2041668749310338, 2184405322203901, 1633400637611036, - 2110682505536899, 2048144390084644 -#else - 9255298, 30423235, 54952701, 32550175, 13098012, 24339566, - 16377219, 31451620, 47306788, 30519729 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 503058759232932, 760293024620937, 2027152777219493, - 666858468148475, 1539184379870952 -#else - 44379556, 7496159, 61366665, 11329248, 19991973, 30206930, - 35390715, 9936965, 37011176, 22935634 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1916168475367211, 915626432541343, 883217071712575, - 363427871374304, 1976029821251593 -#else - 21878571, 28553135, 4338335, 13643897, 64071999, 13160959, - 19708896, 5415497, 59748361, 29445138 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 678039535434506, 570587290189340, 1605302676614120, - 2147762562875701, 1706063797091704 -#else - 27736842, 10103576, 12500508, 8502413, 63695848, 23920873, - 10436917, 32004156, 43449720, 25422331 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1439489648586438, 2194580753290951, 832380563557396, - 561521973970522, 584497280718389 -#else - 19492550, 21450067, 37426887, 32701801, 63900692, 12403436, - 30066266, 8367329, 13243957, 8709688 -#endif - }}, - }, - }, - { - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 187989455492609, 681223515948275, 1933493571072456, - 1872921007304880, 488162364135671 -#else - 12015105, 2801261, 28198131, 10151021, 24818120, 28811299, - 55914672, 27908697, 5150967, 7274186 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1413466089534451, 410844090765630, 1397263346404072, - 408227143123410, 1594561803147811 -#else - 2831347, 21062286, 
1478974, 6122054, 23825128, 20820846, - 31097298, 6083058, 31021603, 23760822 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2102170800973153, 719462588665004, 1479649438510153, - 1097529543970028, 1302363283777685 -#else - 64578913, 31324785, 445612, 10720828, 53259337, 22048494, - 43601132, 16354464, 15067285, 19406725 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 942065717847195, 1069313679352961, 2007341951411051, - 70973416446291, 1419433790163706 -#else - 7840923, 14037873, 33744001, 15934015, 66380651, 29911725, - 21403987, 1057586, 47729402, 21151211 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1146565545556377, 1661971299445212, 406681704748893, - 564452436406089, 1109109865829139 -#else - 915865, 17085158, 15608284, 24765302, 42751837, 6060029, - 49737545, 8410996, 59888403, 16527024 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2214421081775077, 1165671861210569, 1890453018796184, - 3556249878661, 442116172656317 -#else - 32922597, 32997445, 20336073, 17369864, 10903704, 28169945, - 16957573, 52992, 23834301, 6588044 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 753830546620811, 1666955059895019, 1530775289309243, - 1119987029104146, 2164156153857580 -#else - 32752011, 11232950, 3381995, 24839566, 22652987, 22810329, - 17159698, 16689107, 46794284, 32248439 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 615171919212796, 1523849404854568, 854560460547503, - 2067097370290715, 1765325848586042 -#else - 62419196, 9166775, 41398568, 22707125, 11576751, 12733943, - 7924251, 30802151, 1976122, 26305405 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1094538949313667, 1796592198908825, 870221004284388, - 2025558921863561, 1699010892802384 -#else - 21251203, 16309901, 64125849, 26771309, 30810596, 12967303, - 156041, 30183180, 12331344, 25317235 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1951351290725195, 
1916457206844795, 198025184438026, - 1909076887557595, 1938542290318919 -#else - 8651595, 29077400, 51023227, 28557437, 13002506, 2950805, - 29054427, 28447462, 10008135, 28886531 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1014323197538413, 869150639940606, 1756009942696599, - 1334952557375672, 1544945379082874 -#else - 31486061, 15114593, 52847614, 12951353, 14369431, 26166587, - 16347320, 19892343, 8684154, 23021480 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 764055910920305, 1603590757375439, 146805246592357, - 1843313433854297, 954279890114939 -#else - 19443825, 11385320, 24468943, 23895364, 43189605, 2187568, - 40845657, 27467510, 31316347, 14219878 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 80113526615750, 764536758732259, 1055139345100233, - 469252651759390, 617897512431515 -#else - 38514374, 1193784, 32245219, 11392485, 31092169, 15722801, - 27146014, 6992409, 29126555, 9207390 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 74497112547268, 740094153192149, 1745254631717581, - 727713886503130, 1283034364416928 -#else - 32382916, 1110093, 18477781, 11028262, 39697101, 26006320, - 62128346, 10843781, 59151264, 19118701 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 525892105991110, 1723776830270342, 1476444848991936, - 573789489857760, 133864092632978 -#else - 2814918, 7836403, 27519878, 25686276, 46214848, 22000742, - 45614304, 8550129, 28346258, 1994730 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 542611720192581, 1986812262899321, 1162535242465837, - 481498966143464, 544600533583622 -#else - 47530565, 8085544, 53108345, 29605809, 2785837, 17323125, - 47591912, 7174893, 22628102, 8115180 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 64123227344372, 1239927720647794, 1360722983445904, - 222610813654661, 62429487187991 -#else - 36703732, 955510, 55975026, 18476362, 34661776, 20276352, - 41457285, 3317159, 57165847, 930271 
-#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1793193323953132, 91096687857833, 70945970938921, - 2158587638946380, 1537042406482111 -#else - 51805164, 26720662, 28856489, 1357446, 23421993, 1057177, - 24091212, 32165462, 44343487, 22903716 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1895854577604609, 1394895708949416, 1728548428495944, - 1140864900240149, 563645333603061 -#else - 44357633, 28250434, 54201256, 20785565, 51297352, 25757378, - 52269845, 17000211, 65241845, 8398969 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 141358280486863, 91435889572504, 1087208572552643, - 1829599652522921, 1193307020643647 -#else - 35139535, 2106402, 62372504, 1362500, 12813763, 16200670, - 22981545, 27263159, 18009407, 17781660 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1611230858525381, 950720175540785, 499589887488610, - 2001656988495019, 88977313255908 -#else - 49887941, 24009210, 39324209, 14166834, 29815394, 7444469, - 29551787, 29827013, 19288548, 1325865 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1189080501479658, 2184348804772597, 1040818725742319, - 2018318290311834, 1712060030915354 -#else - 15100138, 17718680, 43184885, 32549333, 40658671, 15509407, - 12376730, 30075286, 33166106, 25511682 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 873966876953756, 1090638350350440, 1708559325189137, - 672344594801910, 1320437969700239 -#else - 20909212, 13023121, 57899112, 16251777, 61330449, 25459517, - 12412150, 10018715, 2213263, 19676059 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1508590048271766, 1131769479776094, 101550868699323, - 428297785557897, 561791648661744 -#else - 32529814, 22479743, 30361438, 16864679, 57972923, 1513225, - 22922121, 6382134, 61341936, 8371347 -#endif - }}, - }, - }, - { - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 756417570499462, 237882279232602, 2136263418594016, - 1701968045454886, 703713185137472 
-#else - 9923462, 11271500, 12616794, 3544722, 37110496, 31832805, - 12891686, 25361300, 40665920, 10486143 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1781187809325462, 1697624151492346, 1381393690939988, - 175194132284669, 1483054666415238 -#else - 44511638, 26541766, 8587002, 25296571, 4084308, 20584370, - 361725, 2610596, 43187334, 22099236 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2175517777364616, 708781536456029, 955668231122942, - 1967557500069555, 2021208005604118 -#else - 5408392, 32417741, 62139741, 10561667, 24145918, 14240566, - 31319731, 29318891, 19985174, 30118346 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1115135966606887, 224217372950782, 915967306279222, - 593866251291540, 561747094208006 -#else - 53114407, 16616820, 14549246, 3341099, 32155958, 13648976, - 49531796, 8849296, 65030, 8370684 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1443163092879439, 391875531646162, 2180847134654632, - 464538543018753, 1594098196837178 -#else - 58787919, 21504805, 31204562, 5839400, 46481576, 32497154, - 47665921, 6922163, 12743482, 23753914 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 850858855888869, 319436476624586, 327807784938441, - 740785849558761, 17128415486016 -#else - 64747493, 12678784, 28815050, 4759974, 43215817, 4884716, - 23783145, 11038569, 18800704, 255233 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2132756334090067, 536247820155645, 48907151276867, - 608473197600695, 1261689545022784 -#else - 61839187, 31780545, 13957885, 7990715, 23132995, 728773, - 13393847, 9066957, 19258688, 18800639 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1525176236978354, 974205476721062, 293436255662638, - 148269621098039, 137961998433963 -#else - 64172210, 22726896, 56676774, 14516792, 63468078, 4372540, - 35173943, 2209389, 65584811, 2055793 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1121075518299410, 
2071745529082111, 1265567917414828, - 1648196578317805, 496232102750820 -#else - 580882, 16705327, 5468415, 30871414, 36182444, 18858431, - 59905517, 24560042, 37087844, 7394434 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 122321229299801, 1022922077493685, 2001275453369484, - 2017441881607947, 993205880778002 -#else - 23838809, 1822728, 51370421, 15242726, 8318092, 29821328, - 45436683, 30062226, 62287122, 14799920 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 654925550560074, 1168810995576858, 575655959430926, - 905758704861388, 496774564663534 -#else - 13345610, 9759151, 3371034, 17416641, 16353038, 8577942, - 31129804, 13496856, 58052846, 7402517 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1954109525779738, 2117022646152485, 338102630417180, - 1194140505732026, 107881734943492 -#else - 2286874, 29118501, 47066405, 31546095, 53412636, 5038121, - 11006906, 17794080, 8205060, 1607563 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1714785840001267, 2036500018681589, 1876380234251966, - 2056717182974196, 1645855254384642 -#else - 14414067, 25552300, 3331829, 30346215, 22249150, 27960244, - 18364660, 30647474, 30019586, 24525154 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 106431476499341, 62482972120563, 1513446655109411, - 807258751769522, 538491469114 -#else - 39420813, 1585952, 56333811, 931068, 37988643, 22552112, - 52698034, 12029092, 9944378, 8024 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2002850762893643, 1243624520538135, 1486040410574605, - 2184752338181213, 378495998083531 -#else - 4368715, 29844802, 29874199, 18531449, 46878477, 22143727, - 50994269, 32555346, 58966475, 5640029 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 922510868424903, 1089502620807680, 402544072617374, - 1131446598479839, 1290278588136533 -#else - 10299591, 13746483, 11661824, 16234854, 7630238, 5998374, - 9809887, 16859868, 15219797, 
19226649 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1867998812076769, 715425053580701, 39968586461416, - 2173068014586163, 653822651801304 -#else - 27425505, 27835351, 3055005, 10660664, 23458024, 595578, - 51710259, 32381236, 48766680, 9742716 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 162892278589453, 182585796682149, 75093073137630, - 497037941226502, 133871727117371 -#else - 6744077, 2427284, 26042789, 2720740, 66260958, 1118973, - 32324614, 7406442, 12420155, 1994844 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1914596576579670, 1608999621851578, 1987629837704609, - 1519655314857977, 1819193753409464 -#else - 14012502, 28529712, 48724410, 23975962, 40623521, 29617992, - 54075385, 22644628, 24319928, 27108099 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1949315551096831, 1069003344994464, 1939165033499916, - 1548227205730856, 1933767655861407 -#else - 16412671, 29047065, 10772640, 15929391, 50040076, 28895810, - 10555944, 23070383, 37006495, 28815383 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1730519386931635, 1393284965610134, 1597143735726030, - 416032382447158, 1429665248828629 -#else - 22397363, 25786748, 57815702, 20761563, 17166286, 23799296, - 39775798, 6199365, 21880021, 21303672 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 360275475604565, 547835731063078, 215360904187529, - 596646739879007, 332709650425085 -#else - 62825557, 5368522, 35991846, 8163388, 36785801, 3209127, - 16557151, 8890729, 8840445, 4957760 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 47602113726801, 1522314509708010, 437706261372925, - 814035330438027, 335930650933545 -#else - 51661137, 709326, 60189418, 22684253, 37330941, 6522331, - 45388683, 12130071, 52312361, 5005756 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1291597595523886, 1058020588994081, 402837842324045, - 1363323695882781, 2105763393033193 -#else - 64994094, 
19246303, 23019041, 15765735, 41839181, 6002751, - 10183197, 20315106, 50713577, 31378319 -#endif - }}, - }, - }, - { - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 109521982566564, 1715257748585139, 1112231216891516, - 2046641005101484, 134249157157013 -#else - 48083108, 1632004, 13466291, 25559332, 43468412, 16573536, - 35094956, 30497327, 22208661, 2000468 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2156991030936798, 2227544497153325, 1869050094431622, - 754875860479115, 1754242344267058 -#else - 3065054, 32141671, 41510189, 33192999, 49425798, 27851016, - 58944651, 11248526, 63417650, 26140247 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1846089562873800, 98894784984326, 1412430299204844, - 171351226625762, 1100604760929008 -#else - 10379208, 27508878, 8877318, 1473647, 37817580, 21046851, - 16690914, 2553332, 63976176, 16400288 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 84172382130492, 499710970700046, 425749630620778, - 1762872794206857, 612842602127960 -#else - 15716668, 1254266, 48636174, 7446273, 58659946, 6344163, - 45011593, 26268851, 26894936, 9132066 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 868309334532756, 1703010512741873, 1952690008738057, - 4325269926064, 2071083554962116 -#else - 24158868, 12938817, 11085297, 25376834, 39045385, 29097348, - 36532400, 64451, 60291780, 30861549 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 523094549451158, 401938899487815, 1407690589076010, - 2022387426254453, 158660516411257 -#else - 13488534, 7794716, 22236231, 5989356, 25426474, 20976224, - 2350709, 30135921, 62420857, 2364225 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 612867287630009, 448212612103814, 571629077419196, - 1466796750919376, 1728478129663858 -#else - 16335033, 9132434, 25640582, 6678888, 1725628, 8517937, - 55301840, 21856974, 15445874, 25756331 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 
1723848973783452, 2208822520534681, 1718748322776940, - 1974268454121942, 1194212502258141 -#else - 29004188, 25687351, 28661401, 32914020, 54314860, 25611345, - 31863254, 29418892, 66830813, 17795152 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1254114807944608, 977770684047110, 2010756238954993, - 1783628927194099, 1525962994408256 -#else - 60986784, 18687766, 38493958, 14569918, 56250865, 29962602, - 10343411, 26578142, 37280576, 22738620 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 232464058235826, 1948628555342434, 1835348780427694, - 1031609499437291, 64472106918373 -#else - 27081650, 3463984, 14099042, 29036828, 1616302, 27348828, - 29542635, 15372179, 17293797, 960709 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 767338676040683, 754089548318405, 1523192045639075, - 435746025122062, 512692508440385 -#else - 20263915, 11434237, 61343429, 11236809, 13505955, 22697330, - 50997518, 6493121, 47724353, 7639713 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1255955808701983, 1700487367990941, 1166401238800299, - 1175121994891534, 1190934801395380 -#else - 64278047, 18715199, 25403037, 25339236, 58791851, 17380732, - 18006286, 17510682, 29994676, 17746311 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 349144008168292, 1337012557669162, 1475912332999108, - 1321618454900458, 47611291904320 -#else - 9769828, 5202651, 42951466, 19923039, 39057860, 21992807, - 42495722, 19693649, 35924288, 709463 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 877519947135419, 2172838026132651, 272304391224129, - 1655143327559984, 886229406429814 -#else - 12286395, 13076066, 45333675, 32377809, 42105665, 4057651, - 35090736, 24663557, 16102006, 13205847 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 375806028254706, 214463229793940, 572906353144089, - 572168269875638, 697556386112979 -#else - 13733362, 5599946, 10557076, 3195751, 61550873, 8536969, - 41568694, 
8525971, 10151379, 10394400 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1168827102357844, 823864273033637, 2071538752104697, - 788062026895924, 599578340743362 -#else - 4024660, 17416881, 22436261, 12276534, 58009849, 30868332, - 19698228, 11743039, 33806530, 8934413 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1948116082078088, 2054898304487796, 2204939184983900, - 210526805152138, 786593586607626 -#else - 51229064, 29029191, 58528116, 30620370, 14634844, 32856154, - 57659786, 3137093, 55571978, 11721157 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1915320147894736, 156481169009469, 655050471180417, - 592917090415421, 2165897438660879 -#else - 17555920, 28540494, 8268605, 2331751, 44370049, 9761012, - 9319229, 8835153, 57903375, 32274386 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1726336468579724, 1119932070398949, 1929199510967666, - 33918788322959, 1836837863503150 -#else - 66647436, 25724417, 20614117, 16688288, 59594098, 28747312, - 22300303, 505429, 6108462, 27371017 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 829996854845988, 217061778005138, 1686565909803640, - 1346948817219846, 1723823550730181 -#else - 62038564, 12367916, 36445330, 3234472, 32617080, 25131790, - 29880582, 20071101, 40210373, 25686972 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 384301494966394, 687038900403062, 2211195391021739, - 254684538421383, 1245698430589680 -#else - 35133562, 5726538, 26934134, 10237677, 63935147, 32949378, - 24199303, 3795095, 7592688, 18562353 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1247567493562688, 1978182094455847, 183871474792955, - 806570235643435, 288461518067916 -#else - 21594432, 18590204, 17466407, 29477210, 32537083, 2739898, - 6407723, 12018833, 38852812, 4298411 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1449077384734201, 38285445457996, 2136537659177832, - 2146493000841573, 
725161151123125 -#else - 46458361, 21592935, 39872588, 570497, 3767144, 31836892, - 13891941, 31985238, 13717173, 10805743 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1201928866368855, 800415690605445, 1703146756828343, - 997278587541744, 1858284414104014 -#else - 52432215, 17910135, 15287173, 11927123, 24177847, 25378864, - 66312432, 14860608, 40169934, 27690595 -#endif - }}, - }, - }, - { - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 356468809648877, 782373916933152, 1718002439402870, - 1392222252219254, 663171266061951 -#else - 12962541, 5311799, 57048096, 11658279, 18855286, 25600231, - 13286262, 20745728, 62727807, 9882021 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 759628738230460, 1012693474275852, 353780233086498, - 246080061387552, 2030378857679162 -#else - 18512060, 11319350, 46985740, 15090308, 18818594, 5271736, - 44380960, 3666878, 43141434, 30255002 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2040672435071076, 888593182036908, 1298443657189359, - 1804780278521327, 354070726137060 -#else - 60319844, 30408388, 16192428, 13241070, 15898607, 19348318, - 57023983, 26893321, 64705764, 5276064 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1894938527423184, 1463213041477277, 474410505497651, - 247294963033299, 877975941029128 -#else - 30169808, 28236784, 26306205, 21803573, 27814963, 7069267, - 7152851, 3684982, 1449224, 13082861 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 207937160991127, 12966911039119, 820997788283092, - 1010440472205286, 1701372890140810 -#else - 10342807, 3098505, 2119311, 193222, 25702612, 12233820, - 23697382, 15056736, 46092426, 25352431 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 218882774543183, 533427444716285, 1233243976733245, - 435054256891319, 1509568989549904 -#else - 33958735, 3261607, 22745853, 7948688, 19370557, 18376767, - 40936887, 6482813, 56808784, 22494330 -#endif - }}, - }, - { - {{ -#if 
defined(BORINGSSL_CURVE25519_64BIT) - 1888838535711826, 1052177758340622, 1213553803324135, - 169182009127332, 463374268115872 -#else - 32869458, 28145887, 25609742, 15678670, 56421095, 18083360, - 26112420, 2521008, 44444576, 6904814 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 299137589460312, 1594371588983567, 868058494039073, - 257771590636681, 1805012993142921 -#else - 29506904, 4457497, 3377935, 23757988, 36598817, 12935079, - 1561737, 3841096, 38105225, 26896789 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1806842755664364, 2098896946025095, 1356630998422878, - 1458279806348064, 347755825962072 -#else - 10340844, 26924055, 48452231, 31276001, 12621150, 20215377, - 30878496, 21730062, 41524312, 5181965 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1402334161391744, 1560083671046299, 1008585416617747, - 1147797150908892, 1420416683642459 -#else - 25940096, 20896407, 17324187, 23247058, 58437395, 15029093, - 24396252, 17103510, 64786011, 21165857 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 665506704253369, 273770475169863, 799236974202630, - 848328990077558, 1811448782807931 -#else - 45343161, 9916822, 65808455, 4079497, 66080518, 11909558, - 1782390, 12641087, 20603771, 26992690 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1468412523962641, 771866649897997, 1931766110147832, - 799561180078482, 524837559150077 -#else - 48226577, 21881051, 24849421, 11501709, 13161720, 28785558, - 1925522, 11914390, 4662781, 7820689 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2223212657821850, 630416247363666, 2144451165500328, - 816911130947791, 1024351058410032 -#else - 12241050, 33128450, 8132690, 9393934, 32846760, 31954812, - 29749455, 12172924, 16136752, 15264020 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1266603897524861, 156378408858100, 1275649024228779, - 447738405888420, 253186462063095 -#else - 56758909, 18873868, 58896884, 
2330219, 49446315, 19008651, - 10658212, 6671822, 19012087, 3772772 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2022215964509735, 136144366993649, 1800716593296582, - 1193970603800203, 871675847064218 -#else - 3753511, 30133366, 10617073, 2028709, 14841030, 26832768, - 28718731, 17791548, 20527770, 12988982 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1862751661970328, 851596246739884, 1519315554814041, - 1542798466547449, 1417975335901520 -#else - 52286360, 27757162, 63400876, 12689772, 66209881, 22639565, - 42925817, 22989488, 3299664, 21129479 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1228168094547481, 334133883362894, 587567568420081, - 433612590281181, 603390400373205 -#else - 50331161, 18301130, 57466446, 4978982, 3308785, 8755439, - 6943197, 6461331, 41525717, 8991217 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 121893973206505, 1843345804916664, 1703118377384911, - 497810164760654, 101150811654673 -#else - 49882601, 1816361, 65435576, 27467992, 31783887, 25378441, - 34160718, 7417949, 36866577, 1507264 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 458346255946468, 290909935619344, 1452768413850679, - 550922875254215, 1537286854336538 -#else - 29692644, 6829891, 56610064, 4334895, 20945975, 21647936, - 38221255, 8209390, 14606362, 22907359 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 584322311184395, 380661238802118, 114839394528060, - 655082270500073, 2111856026034852 -#else - 63627275, 8707080, 32188102, 5672294, 22096700, 1711240, - 34088169, 9761486, 4170404, 31469107 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 996965581008991, 2148998626477022, 1012273164934654, - 1073876063914522, 1688031788934939 -#else - 55521375, 14855944, 62981086, 32022574, 40459774, 15084045, - 22186522, 16002000, 52832027, 25153633 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 923487018849600, 2085106799623355, 
528082801620136, - 1606206360876188, 735907091712524 -#else - 62297408, 13761028, 35404987, 31070512, 63796392, 7869046, - 59995292, 23934339, 13240844, 10965870 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1697697887804317, 1335343703828273, 831288615207040, - 949416685250051, 288760277392022 -#else - 59366301, 25297669, 52340529, 19898171, 43876480, 12387165, - 4498947, 14147411, 29514390, 4302863 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1419122478109648, 1325574567803701, 602393874111094, - 2107893372601700, 1314159682671307 -#else - 53695440, 21146572, 20757301, 19752600, 14785142, 8976368, - 62047588, 31410058, 17846987, 19582505 -#endif - }}, - }, - }, - { - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2201150872731804, 2180241023425241, 97663456423163, - 1633405770247824, 848945042443986 -#else - 64864412, 32799703, 62511833, 32488122, 60861691, 1455298, - 45461136, 24339642, 61886162, 12650266 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1173339555550611, 818605084277583, 47521504364289, - 924108720564965, 735423405754506 -#else - 57202067, 17484121, 21134159, 12198166, 40044289, 708125, - 387813, 13770293, 47974538, 10958662 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 830104860549448, 1886653193241086, 1600929509383773, - 1475051275443631, 286679780900937 -#else - 22470984, 12369526, 23446014, 28113323, 45588061, 23855708, - 55336367, 21979976, 42025033, 4271861 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1577111294832995, 1030899169768747, 144900916293530, - 1964672592979567, 568390100955250 -#else - 41939299, 23500789, 47199531, 15361594, 61124506, 2159191, - 75375, 29275903, 34582642, 8469672 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 278388655910247, 487143369099838, 927762205508727, - 181017540174210, 1616886700741287 -#else - 15854951, 4148314, 58214974, 7259001, 11666551, 13824734, - 36577666, 2697371, 24154791, 24093489 
-#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1191033906638969, 940823957346562, 1606870843663445, - 861684761499847, 658674867251089 -#else - 15446137, 17747788, 29759746, 14019369, 30811221, 23944241, - 35526855, 12840103, 24913809, 9815020 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1875032594195546, 1427106132796197, 724736390962158, - 901860512044740, 635268497268760 -#else - 62399578, 27940162, 35267365, 21265538, 52665326, 10799413, - 58005188, 13438768, 18735128, 9466238 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 622869792298357, 1903919278950367, 1922588621661629, - 1520574711600434, 1087100760174640 -#else - 11933045, 9281483, 5081055, 28370608, 64480701, 28648802, - 59381042, 22658328, 44380208, 16199063 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 25465949416618, 1693639527318811, 1526153382657203, - 125943137857169, 145276964043999 -#else - 14576810, 379472, 40322331, 25237195, 37682355, 22741457, - 67006097, 1876698, 30801119, 2164795 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 214739857969358, 920212862967915, 1939901550972269, - 1211862791775221, 85097515720120 -#else - 15995086, 3199873, 13672555, 13712240, 47730029, 28906785, - 54027253, 18058162, 53616056, 1268051 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2006245852772938, 734762734836159, 254642929763427, - 1406213292755966, 239303749517686 -#else - 56818250, 29895392, 63822271, 10948817, 23037027, 3794475, - 63638526, 20954210, 50053494, 3565903 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1619678837192149, 1919424032779215, 1357391272956794, - 1525634040073113, 1310226789796241 -#else - 29210069, 24135095, 61189071, 28601646, 10834810, 20226706, - 50596761, 22733718, 39946641, 19523900 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1040763709762123, 1704449869235352, 605263070456329, - 1998838089036355, 1312142911487502 -#else - 
53946955, 15508587, 16663704, 25398282, 38758921, 9019122, - 37925443, 29785008, 2244110, 19552453 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1996723311435669, 1844342766567060, 985455700466044, - 1165924681400960, 311508689870129 -#else - 61955989, 29753495, 57802388, 27482848, 16243068, 14684434, - 41435776, 17373631, 13491505, 4641841 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 43173156290518, 2202883069785309, 1137787467085917, - 1733636061944606, 1394992037553852 -#else - 10813398, 643330, 47920349, 32825515, 30292061, 16954354, - 27548446, 25833190, 14476988, 20787001 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 670078326344559, 555655025059356, 471959386282438, - 2141455487356409, 849015953823125 -#else - 10292079, 9984945, 6481436, 8279905, 59857350, 7032742, - 27282937, 31910173, 39196053, 12651323 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2197214573372804, 794254097241315, 1030190060513737, - 267632515541902, 2040478049202624 -#else - 35923332, 32741048, 22271203, 11835308, 10201545, 15351028, - 17099662, 3988035, 21721536, 30405492 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1812516004670529, 1609256702920783, 1706897079364493, - 258549904773295, 996051247540686 -#else - 10202177, 27008593, 35735631, 23979793, 34958221, 25434748, - 54202543, 3852693, 13216206, 14842320 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1540374301420584, 1764656898914615, 1810104162020396, - 923808779163088, 664390074196579 -#else - 51293224, 22953365, 60569911, 26295436, 60124204, 26972653, - 35608016, 13765823, 39674467, 9900183 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1323460699404750, 1262690757880991, 871777133477900, - 1060078894988977, 1712236889662886 -#else - 14465486, 19721101, 34974879, 18815558, 39665676, 12990491, - 33046193, 15796406, 60056998, 25514317 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 
1696163952057966, 1391710137550823, 608793846867416, - 1034391509472039, 1780770894075012 -#else - 30924398, 25274812, 6359015, 20738097, 16508376, 9071735, - 41620263, 15413634, 9524356, 26535554 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1367603834210841, 2131988646583224, 890353773628144, - 1908908219165595, 270836895252891 -#else - 12274201, 20378885, 32627640, 31769106, 6736624, 13267305, - 5237659, 28444949, 15663515, 4035784 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 597536315471731, 40375058742586, 1942256403956049, - 1185484645495932, 312666282024145 -#else - 64157555, 8903984, 17349946, 601635, 50676049, 28941875, - 53376124, 17665097, 44850385, 4659090 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1919411405316294, 1234508526402192, 1066863051997083, - 1008444703737597, 1348810787701552 -#else - 50192582, 28601458, 36715152, 18395610, 20774811, 15897498, - 5736189, 15026997, 64930608, 20098846 -#endif - }}, - }, - }, - { - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2102881477513865, 1570274565945361, 1573617900503708, - 18662635732583, 2232324307922098 -#else - 58249865, 31335375, 28571665, 23398914, 66634396, 23448733, - 63307367, 278094, 23440562, 33264224 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1853931367696942, 8107973870707, 350214504129299, - 775206934582587, 1752317649166792 -#else - 10226222, 27625730, 15139955, 120818, 52241171, 5218602, - 32937275, 11551483, 50536904, 26111567 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1417148368003523, 721357181628282, 505725498207811, - 373232277872983, 261634707184480 -#else - 17932739, 21117156, 43069306, 10749059, 11316803, 7535897, - 22503767, 5561594, 63462240, 3898660 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2186733281493267, 2250694917008620, 1014829812957440, - 479998161452389, 83566193876474 -#else - 7749907, 32584865, 50769132, 33537967, 42090752, 15122142, - 
65535333, 7152529, 21831162, 1245233 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1268116367301224, 560157088142809, 802626839600444, - 2210189936605713, 1129993785579988 -#else - 26958440, 18896406, 4314585, 8346991, 61431100, 11960071, - 34519569, 32934396, 36706772, 16838219 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 615183387352312, 917611676109240, 878893615973325, - 978940963313282, 938686890583575 -#else - 54942968, 9166946, 33491384, 13673479, 29787085, 13096535, - 6280834, 14587357, 44770839, 13987524 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 522024729211672, 1045059315315808, 1892245413707790, - 1907891107684253, 2059998109500714 -#else - 42758936, 7778774, 21116000, 15572597, 62275598, 28196653, - 62807965, 28429792, 59639082, 30696363 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1799679152208884, 912132775900387, 25967768040979, - 432130448590461, 274568990261996 -#else - 9681908, 26817309, 35157219, 13591837, 60225043, 386949, - 31622781, 6439245, 52527852, 4091396 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 98698809797682, 2144627600856209, 1907959298569602, - 811491302610148, 1262481774981493 -#else - 58682418, 1470726, 38999185, 31957441, 3978626, 28430809, - 47486180, 12092162, 29077877, 18812444 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1791451399743152, 1713538728337276, 118349997257490, - 1882306388849954, 158235232210248 -#else - 5269168, 26694706, 53878652, 25533716, 25932562, 1763552, - 61502754, 28048550, 47091016, 2357888 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1217809823321928, 2173947284933160, 1986927836272325, - 1388114931125539, 12686131160169 -#else - 32264008, 18146780, 61721128, 32394338, 65017541, 29607531, - 23104803, 20684524, 5727337, 189038 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1650875518872272, 1136263858253897, 1732115601395988, - 734312880662190, 
1252904681142109 -#else - 14609104, 24599962, 61108297, 16931650, 52531476, 25810533, - 40363694, 10942114, 41219933, 18669734 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 372986456113865, 525430915458171, 2116279931702135, - 501422713587815, 1907002872974925 -#else - 20513481, 5557931, 51504251, 7829530, 26413943, 31535028, - 45729895, 7471780, 13913677, 28416557 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 803147181835288, 868941437997146, 316299302989663, - 943495589630550, 571224287904572 -#else - 41534488, 11967825, 29233242, 12948236, 60354399, 4713226, - 58167894, 14059179, 12878652, 8511905 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 227742695588364, 1776969298667369, 628602552821802, - 457210915378118, 2041906378111140 -#else - 41452044, 3393630, 64153449, 26478905, 64858154, 9366907, - 36885446, 6812973, 5568676, 30426776 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 815000523470260, 913085688728307, 1052060118271173, - 1345536665214223, 541623413135555 -#else - 11630004, 12144454, 2116339, 13606037, 27378885, 15676917, - 49700111, 20050058, 52713667, 8070817 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1580216071604333, 1877997504342444, 857147161260913, - 703522726778478, 2182763974211603 -#else - 27117677, 23547054, 35826092, 27984343, 1127281, 12772488, - 37262958, 10483305, 55556115, 32525717 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1870080310923419, 71988220958492, 1783225432016732, - 615915287105016, 1035570475990230 -#else - 10637467, 27866368, 5674780, 1072708, 40765276, 26572129, - 65424888, 9177852, 39615702, 15431202 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 730987750830150, 857613889540280, 1083813157271766, - 1002817255970169, 1719228484436074 -#else - 20525126, 10892566, 54366392, 12779442, 37615830, 16150074, - 38868345, 14943141, 52052074, 25618500 -#endif - }}, - {{ -#if 
defined(BORINGSSL_CURVE25519_64BIT) - 377616581647602, 1581980403078513, 804044118130621, - 2034382823044191, 643844048472185 -#else - 37084402, 5626925, 66557297, 23573344, 753597, 11981191, - 25244767, 30314666, 63752313, 9594023 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 176957326463017, 1573744060478586, 528642225008045, - 1816109618372371, 1515140189765006 -#else - 43356201, 2636869, 61944954, 23450613, 585133, 7877383, - 11345683, 27062142, 13352334, 22577348 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1888911448245718, 1387110895611080, 1924503794066429, - 1731539523700949, 2230378382645454 -#else - 65177046, 28146973, 3304648, 20669563, 17015805, 28677341, - 37325013, 25801949, 53893326, 33235227 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 443392177002051, 233793396845137, 2199506622312416, - 1011858706515937, 974676837063129 -#else - 20239939, 6607058, 6203985, 3483793, 48721888, 32775202, - 46385121, 15077869, 44358105, 14523816 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1846351103143623, 1949984838808427, 671247021915253, - 1946756846184401, 1929296930380217 -#else - 27406023, 27512775, 27423595, 29057038, 4996213, 10002360, - 38266833, 29008937, 36936121, 28748764 -#endif - }}, - }, - }, - { - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 849646212452002, 1410198775302919, 73767886183695, - 1641663456615812, 762256272452411 -#else - 11374242, 12660715, 17861383, 21013599, 10935567, 1099227, - 53222788, 24462691, 39381819, 11358503 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 692017667358279, 723305578826727, 1638042139863265, - 748219305990306, 334589200523901 -#else - 54378055, 10311866, 1510375, 10778093, 64989409, 24408729, - 32676002, 11149336, 40985213, 4985767 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 22893968530686, 2235758574399251, 1661465835630252, - 925707319443452, 1203475116966621 -#else - 48012542, 341146, 60911379, 
33315398, 15756972, 24757770, - 66125820, 13794113, 47694557, 17933176 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 801299035785166, 1733292596726131, 1664508947088596, - 467749120991922, 1647498584535623 -#else - 6490062, 11940286, 25495923, 25828072, 8668372, 24803116, - 3367602, 6970005, 65417799, 24549641 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 903105258014366, 427141894933047, 561187017169777, - 1884330244401954, 1914145708422219 -#else - 1656478, 13457317, 15370807, 6364910, 13605745, 8362338, - 47934242, 28078708, 50312267, 28522993 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1344191060517578, 1960935031767890, 1518838929955259, - 1781502350597190, 1564784025565682 -#else - 44835530, 20030007, 67044178, 29220208, 48503227, 22632463, - 46537798, 26546453, 67009010, 23317098 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 673723351748086, 1979969272514923, 1175287312495508, - 1187589090978666, 1881897672213940 -#else - 17747446, 10039260, 19368299, 29503841, 46478228, 17513145, - 31992682, 17696456, 37848500, 28042460 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1917185587363432, 1098342571752737, 5935801044414, - 2000527662351839, 1538640296181569 -#else - 31932008, 28568291, 47496481, 16366579, 22023614, 88450, - 11371999, 29810185, 4882241, 22927527 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2495540013192, 678856913479236, 224998292422872, - 219635787698590, 1972465269000940 -#else - 29796488, 37186, 19818052, 10115756, 55279832, 3352735, - 18551198, 3272828, 61917932, 29392022 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 271413961212179, 1353052061471651, 344711291283483, - 2014925838520662, 2006221033113941 -#else - 12501267, 4044383, 58495907, 20162046, 34678811, 5136598, - 47878486, 30024734, 330069, 29895023 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 194583029968109, 514316781467765, 
829677956235672, - 1676415686873082, 810104584395840 -#else - 6384877, 2899513, 17807477, 7663917, 64749976, 12363164, - 25366522, 24980540, 66837568, 12071498 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1980510813313589, 1948645276483975, 152063780665900, - 129968026417582, 256984195613935 -#else - 58743349, 29511910, 25133447, 29037077, 60897836, 2265926, - 34339246, 1936674, 61949167, 3829362 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1860190562533102, 1936576191345085, 461100292705964, - 1811043097042830, 957486749306835 -#else - 28425966, 27718999, 66531773, 28857233, 52891308, 6870929, - 7921550, 26986645, 26333139, 14267664 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 796664815624365, 1543160838872951, 1500897791837765, - 1667315977988401, 599303877030711 -#else - 56041645, 11871230, 27385719, 22994888, 62522949, 22365119, - 10004785, 24844944, 45347639, 8930323 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1151480509533204, 2136010406720455, 738796060240027, - 319298003765044, 1150614464349587 -#else - 45911060, 17158396, 25654215, 31829035, 12282011, 11008919, - 1541940, 4757911, 40617363, 17145491 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1731069268103150, 735642447616087, 1364750481334268, - 417232839982871, 927108269127661 -#else - 13537262, 25794942, 46504023, 10961926, 61186044, 20336366, - 53952279, 6217253, 51165165, 13814989 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1017222050227968, 1987716148359, 2234319589635701, - 621282683093392, 2132553131763026 -#else - 49686272, 15157789, 18705543, 29619, 24409717, 33293956, - 27361680, 9257833, 65152338, 31777517 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1567828528453324, 1017807205202360, 565295260895298, - 829541698429100, 307243822276582 -#else - 42063564, 23362465, 15366584, 15166509, 54003778, 8423555, - 37937324, 12361134, 48422886, 4578289 -#endif - }}, 
- }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 249079270936248, 1501514259790706, 947909724204848, - 944551802437487, 552658763982480 -#else - 24579768, 3711570, 1342322, 22374306, 40103728, 14124955, - 44564335, 14074918, 21964432, 8235257 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2089966982947227, 1854140343916181, 2151980759220007, - 2139781292261749, 158070445864917 -#else - 60580251, 31142934, 9442965, 27628844, 12025639, 32067012, - 64127349, 31885225, 13006805, 2355433 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1338766321464554, 1906702607371284, 1519569445519894, - 115384726262267, 1393058953390992 -#else - 50803946, 19949172, 60476436, 28412082, 16974358, 22643349, - 27202043, 1719366, 1141648, 20758196 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1364621558265400, 1512388234908357, 1926731583198686, - 2041482526432505, 920401122333774 -#else - 54244920, 20334445, 58790597, 22536340, 60298718, 28710537, - 13475065, 30420460, 32674894, 13715045 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1884844597333588, 601480070269079, 620203503079537, - 1079527400117915, 1202076693132015 -#else - 11423316, 28086373, 32344215, 8962751, 24989809, 9241752, - 53843611, 16086211, 38367983, 17912338 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 840922919763324, 727955812569642, 1303406629750194, - 522898432152867, 294161410441865 -#else - 65699196, 12530727, 60740138, 10847386, 19531186, 19422272, - 55399715, 7791793, 39862921, 4383346 -#endif - }}, - }, - }, - { - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 353760790835310, 1598361541848743, 1122905698202299, - 1922533590158905, 419107700666580 -#else - 38137966, 5271446, 65842855, 23817442, 54653627, 16732598, - 62246457, 28647982, 27193556, 6245191 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 359856369838236, 180914355488683, 861726472646627, - 218807937262986, 575626773232501 -#else - 
51914908, 5362277, 65324971, 2695833, 4960227, 12840725, - 23061898, 3260492, 22510453, 8577507 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 755467689082474, 909202735047934, 730078068932500, - 936309075711518, 2007798262842972 -#else - 54476394, 11257345, 34415870, 13548176, 66387860, 10879010, - 31168030, 13952092, 37537372, 29918525 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1609384177904073, 362745185608627, 1335318541768201, - 800965770436248, 547877979267412 -#else - 3877321, 23981693, 32416691, 5405324, 56104457, 19897796, - 3759768, 11935320, 5611860, 8164018 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 984339177776787, 815727786505884, 1645154585713747, - 1659074964378553, 1686601651984156 -#else - 50833043, 14667796, 15906460, 12155291, 44997715, 24514713, - 32003001, 24722143, 5773084, 25132323 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1697863093781930, 599794399429786, 1104556219769607, - 830560774794755, 12812858601017 -#else - 43320746, 25300131, 1950874, 8937633, 18686727, 16459170, - 66203139, 12376319, 31632953, 190926 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1168737550514982, 897832437380552, 463140296333799, - 302564600022547, 2008360505135501 -#else - 42515238, 17415546, 58684872, 13378745, 14162407, 6901328, - 58820115, 4508563, 41767309, 29926903 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1856930662813910, 678090852002597, 1920179140755167, - 1259527833759868, 55540971895511 -#else - 8884438, 27670423, 6023973, 10104341, 60227295, 28612898, - 18722940, 18768427, 65436375, 827624 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1158643631044921, 476554103621892, 178447851439725, - 1305025542653569, 103433927680625 -#else - 34388281, 17265135, 34605316, 7101209, 13354605, 2659080, - 65308289, 19446395, 42230385, 1541285 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 
2176793111709008, 1576725716350391, 2009350167273523, - 2012390194631546, 2125297410909580 -#else - 2901328, 32436745, 3880375, 23495044, 49487923, 29941650, - 45306746, 29986950, 20456844, 31669399 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 825403285195098, 2144208587560784, 1925552004644643, - 1915177840006985, 1015952128947864 -#else - 27019610, 12299467, 53450576, 31951197, 54247203, 28692960, - 47568713, 28538373, 29439640, 15138866 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1807108316634472, 1534392066433717, 347342975407218, - 1153820745616376, 7375003497471 -#else - 21536104, 26928012, 34661045, 22864223, 44700786, 5175813, - 61688824, 17193268, 7779327, 109896 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 983061001799725, 431211889901241, 2201903782961093, - 817393911064341, 2214616493042167 -#else - 30279725, 14648750, 59063993, 6425557, 13639621, 32810923, - 28698389, 12180118, 23177719, 33000357 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 228567918409756, 865093958780220, 358083886450556, - 159617889659320, 1360637926292598 -#else - 26572828, 3405927, 35407164, 12890904, 47843196, 5335865, - 60615096, 2378491, 4439158, 20275085 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 234147501399755, 2229469128637390, 2175289352258889, - 1397401514549353, 1885288963089922 -#else - 44392139, 3489069, 57883598, 33221678, 18875721, 32414337, - 14819433, 20822905, 49391106, 28092994 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1111762412951562, 252849572507389, 1048714233823341, - 146111095601446, 1237505378776770 -#else - 62052362, 16566550, 15953661, 3767752, 56672365, 15627059, - 66287910, 2177224, 8550082, 18440267 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1113790697840279, 1051167139966244, 1045930658550944, - 2011366241542643, 1686166824620755 -#else - 48635543, 16596774, 66727204, 15663610, 22860960, 15585581, - 
39264755, 29971692, 43848403, 25125843 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1054097349305049, 1872495070333352, 182121071220717, - 1064378906787311, 100273572924182 -#else - 34628313, 15707274, 58902952, 27902350, 29464557, 2713815, - 44383727, 15860481, 45206294, 1494192 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1306410853171605, 1627717417672447, 50983221088417, - 1109249951172250, 870201789081392 -#else - 47546773, 19467038, 41524991, 24254879, 13127841, 759709, - 21923482, 16529112, 8742704, 12967017 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 104233794644221, 1548919791188248, 2224541913267306, - 2054909377116478, 1043803389015153 -#else - 38643965, 1553204, 32536856, 23080703, 42417258, 33148257, - 58194238, 30620535, 37205105, 15553882 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 216762189468802, 707284285441622, 190678557969733, - 973969342604308, 1403009538434867 -#else - 21877890, 3230008, 9881174, 10539357, 62311749, 2841331, - 11543572, 14513274, 19375923, 20906471 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1279024291038477, 344776835218310, 273722096017199, - 1834200436811442, 634517197663804 -#else - 8832269, 19058947, 13253510, 5137575, 5037871, 4078777, - 24880818, 27331716, 2862652, 9455043 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 343805853118335, 1302216857414201, 566872543223541, - 2051138939539004, 321428858384280 -#else - 29306751, 5123106, 20245049, 19404543, 9592565, 8447059, - 65031740, 30564351, 15511448, 4789663 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 470067171324852, 1618629234173951, 2000092177515639, - 7307679772789, 1117521120249968 -#else - 46429108, 7004546, 8824831, 24119455, 63063159, 29803695, - 61354101, 108892, 23513200, 16652362 -#endif - }}, - }, - }, - { - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 278151578291475, 1810282338562947, 1771599529530998, - 
1383659409671631, 685373414471841 -#else - 33852691, 4144781, 62632835, 26975308, 10770038, 26398890, - 60458447, 20618131, 48789665, 10212859 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 577009397403102, 1791440261786291, 2177643735971638, - 174546149911960, 1412505077782326 -#else - 2756062, 8598110, 7383731, 26694540, 22312758, 32449420, - 21179800, 2600940, 57120566, 21047965 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 893719721537457, 1201282458018197, 1522349501711173, - 58011597740583, 1130406465887139 -#else - 42463153, 13317461, 36659605, 17900503, 21365573, 22684775, - 11344423, 864440, 64609187, 16844368 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 412607348255453, 1280455764199780, 2233277987330768, - 14180080401665, 331584698417165 -#else - 40676061, 6148328, 49924452, 19080277, 18782928, 33278435, - 44547329, 211299, 2719757, 4940997 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 262483770854550, 990511055108216, 526885552771698, - 571664396646158, 354086190278723 -#else - 65784982, 3911312, 60160120, 14759764, 37081714, 7851206, - 21690126, 8518463, 26699843, 5276295 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1820352417585487, 24495617171480, 1547899057533253, - 10041836186225, 480457105094042 -#else - 53958991, 27125364, 9396248, 365013, 24703301, 23065493, - 1321585, 149635, 51656090, 7159368 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2023310314989233, 637905337525881, 2106474638900687, - 557820711084072, 1687858215057826 -#else - 9987761, 30149673, 17507961, 9505530, 9731535, 31388918, - 22356008, 8312176, 22477218, 25151047 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1144168702609745, 604444390410187, 1544541121756138, - 1925315550126027, 626401428894002 -#else - 18155857, 17049442, 19744715, 9006923, 15154154, 23015456, - 24256459, 28689437, 44560690, 9334108 -#endif - }}, - {{ -#if 
defined(BORINGSSL_CURVE25519_64BIT) - 1922168257351784, 2018674099908659, 1776454117494445, - 956539191509034, 36031129147635 -#else - 2986088, 28642539, 10776627, 30080588, 10620589, 26471229, - 45695018, 14253544, 44521715, 536905 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 544644538748041, 1039872944430374, 876750409130610, - 710657711326551, 1216952687484972 -#else - 4377737, 8115836, 24567078, 15495314, 11625074, 13064599, - 7390551, 10589625, 10838060, 18134008 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 58242421545916, 2035812695641843, 2118491866122923, - 1191684463816273, 46921517454099 -#else - 47766460, 867879, 9277171, 30335973, 52677291, 31567988, - 19295825, 17757482, 6378259, 699185 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 272268252444639, 1374166457774292, 2230115177009552, - 1053149803909880, 1354288411641016 -#else - 7895007, 4057113, 60027092, 20476675, 49222032, 33231305, - 66392824, 15693154, 62063800, 20180469 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1857910905368338, 1754729879288912, 885945464109877, - 1516096106802166, 1602902393369811 -#else - 59371282, 27685029, 52542544, 26147512, 11385653, 13201616, - 31730678, 22591592, 63190227, 23885106 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1193437069800958, 901107149704790, 999672920611411, - 477584824802207, 364239578697845 -#else - 10188286, 17783598, 59772502, 13427542, 22223443, 14896287, - 30743455, 7116568, 45322357, 5427592 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 886299989548838, 1538292895758047, 1590564179491896, - 1944527126709657, 837344427345298 -#else - 696102, 13206899, 27047647, 22922350, 15285304, 23701253, - 10798489, 28975712, 19236242, 12477404 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 754558365378305, 1712186480903618, 1703656826337531, - 750310918489786, 518996040250900 -#else - 55879425, 11243795, 50054594, 
25513566, 66320635, 25386464, - 63211194, 11180503, 43939348, 7733643 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1309847803895382, 1462151862813074, 211370866671570, - 1544595152703681, 1027691798954090 -#else - 17800790, 19518253, 40108434, 21787760, 23887826, 3149671, - 23466177, 23016261, 10322026, 15313801 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 803217563745370, 1884799722343599, 1357706345069218, - 2244955901722095, 730869460037413 -#else - 26246234, 11968874, 32263343, 28085704, 6830754, 20231401, - 51314159, 33452449, 42659621, 10890803 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 689299471295966, 1831210565161071, 1375187341585438, - 1106284977546171, 1893781834054269 -#else - 35743198, 10271362, 54448239, 27287163, 16690206, 20491888, - 52126651, 16484930, 25180797, 28219548 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 696351368613042, 1494385251239250, 738037133616932, - 636385507851544, 927483222611406 -#else - 66522290, 10376443, 34522450, 22268075, 19801892, 10997610, - 2276632, 9482883, 316878, 13820577 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1949114198209333, 1104419699537997, 783495707664463, - 1747473107602770, 2002634765788641 -#else - 57226037, 29044064, 64993357, 16457135, 56008783, 11674995, - 30756178, 26039378, 30696929, 29841583 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1607325776830197, 530883941415333, 1451089452727895, - 1581691157083423, 496100432831154 -#else - 32988917, 23951020, 12499365, 7910787, 56491607, 21622917, - 59766047, 23569034, 34759346, 7392472 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1068900648804224, 2006891997072550, 1134049269345549, - 1638760646180091, 2055396084625778 -#else - 58253184, 15927860, 9866406, 29905021, 64711949, 16898650, - 36699387, 24419436, 25112946, 30627788 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2222475519314561, 
1870703901472013, 1884051508440561, - 1344072275216753, 1318025677799069 -#else - 64604801, 33117465, 25621773, 27875660, 15085041, 28074555, - 42223985, 20028237, 5537437, 19640113 -#endif - }}, - }, - }, - { - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 155711679280656, 681100400509288, 389811735211209, - 2135723811340709, 408733211204125 -#else - 55883280, 2320284, 57524584, 10149186, 33664201, 5808647, - 52232613, 31824764, 31234589, 6090599 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 7813206966729, 194444201427550, 2071405409526507, - 1065605076176312, 1645486789731291 -#else - 57475529, 116425, 26083934, 2897444, 60744427, 30866345, 609720, - 15878753, 60138459, 24519663 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 16625790644959, 1647648827778410, 1579910185572704, - 436452271048548, 121070048451050 -#else - 39351007, 247743, 51914090, 24551880, 23288160, 23542496, - 43239268, 6503645, 20650474, 1804084 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1037263028552531, 568385780377829, 297953104144430, - 1558584511931211, 2238221839292471 -#else - 39519059, 15456423, 8972517, 8469608, 15640622, 4439847, - 3121995, 23224719, 27842615, 33352104 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 190565267697443, 672855706028058, 338796554369226, - 337687268493904, 853246848691734 -#else - 51801891, 2839643, 22530074, 10026331, 4602058, 5048462, - 28248656, 5031932, 55733782, 12714368 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1763863028400139, 766498079432444, 1321118624818005, - 69494294452268, 858786744165651 -#else - 20807691, 26283607, 29286140, 11421711, 39232341, 19686201, - 45881388, 1035545, 47375635, 12796919 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1292056768563024, 1456632109855638, 1100631247050184, - 1386133165675321, 1232898350193752 -#else - 12076880, 19253146, 58323862, 21705509, 42096072, 16400683, - 49517369, 20654993, 
3480664, 18371617 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 366253102478259, 525676242508811, 1449610995265438, - 1183300845322183, 185960306491545 -#else - 34747315, 5457596, 28548107, 7833186, 7303070, 21600887, - 42745799, 17632556, 33734809, 2771024 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 28315355815982, 460422265558930, 1799675876678724, - 1969256312504498, 1051823843138725 -#else - 45719598, 421931, 26597266, 6860826, 22486084, 26817260, - 49971378, 29344205, 42556581, 15673396 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 156914999361983, 1606148405719949, 1665208410108430, - 317643278692271, 1383783705665320 -#else - 46924223, 2338215, 19788685, 23933476, 63107598, 24813538, - 46837679, 4733253, 3727144, 20619984 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 54684536365732, 2210010038536222, 1194984798155308, - 535239027773705, 1516355079301361 -#else - 6120100, 814863, 55314462, 32931715, 6812204, 17806661, 2019593, - 7975683, 31123697, 22595451 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1484387703771650, 198537510937949, 2186282186359116, - 617687444857508, 647477376402122 -#else - 30069250, 22119100, 30434653, 2958439, 18399564, 32578143, - 12296868, 9204260, 50676426, 9648164 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2147715541830533, 500032538445817, 646380016884826, - 352227855331122, 1488268620408052 -#else - 32705413, 32003455, 30705657, 7451065, 55303258, 9631812, - 3305266, 5248604, 41100532, 22176930 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 159386186465542, 1877626593362941, 618737197060512, - 1026674284330807, 1158121760792685 -#else - 17219846, 2375039, 35537917, 27978816, 47649184, 9219902, - 294711, 15298639, 2662509, 17257359 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1744544377739822, 1964054180355661, 1685781755873170, - 2169740670377448, 1286112621104591 -#else - 
65935918, 25995736, 62742093, 29266687, 45762450, 25120105, - 32087528, 32331655, 32247247, 19164571 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 81977249784993, 1667943117713086, 1668983819634866, - 1605016835177615, 1353960708075544 -#else - 14312609, 1221556, 17395390, 24854289, 62163122, 24869796, - 38911119, 23916614, 51081240, 20175586 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1602253788689063, 439542044889886, 2220348297664483, - 657877410752869, 157451572512238 -#else - 65680039, 23875441, 57873182, 6549686, 59725795, 33085767, - 23046501, 9803137, 17597934, 2346211 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1029287186166717, 65860128430192, 525298368814832, - 1491902500801986, 1461064796385400 -#else - 18510781, 15337574, 26171504, 981392, 44867312, 7827555, - 43617730, 22231079, 3059832, 21771562 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 408216988729246, 2121095722306989, 913562102267595, - 1879708920318308, 241061448436731 -#else - 10141598, 6082907, 17829293, 31606789, 9830091, 13613136, - 41552228, 28009845, 33606651, 3592095 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1185483484383269, 1356339572588553, 584932367316448, - 102132779946470, 1792922621116791 -#else - 33114149, 17665080, 40583177, 20211034, 33076704, 8716171, - 1151462, 1521897, 66126199, 26716628 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1966196870701923, 2230044620318636, 1425982460745905, - 261167817826569, 46517743394330 -#else - 34169699, 29298616, 23947180, 33230254, 34035889, 21248794, - 50471177, 3891703, 26353178, 693168 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 107077591595359, 884959942172345, 27306869797400, - 2224911448949390, 964352058245223 -#else - 30374239, 1595580, 50224825, 13186930, 4600344, 406904, 9585294, - 33153764, 31375463, 14369965 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 
1730194207717538, 431790042319772, 1831515233279467, - 1372080552768581, 1074513929381760 -#else - 52738210, 25781902, 1510300, 6434173, 48324075, 27291703, - 32732229, 20445593, 17901440, 16011505 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1450880638731607, 1019861580989005, 1229729455116861, - 1174945729836143, 826083146840706 -#else - 18171223, 21619806, 54608461, 15197121, 56070717, 18324396, - 47936623, 17508055, 8764034, 12309598 -#endif - }}, - }, - }, - { - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1899935429242705, 1602068751520477, 940583196550370, - 82431069053859, 1540863155745696 -#else - 5975889, 28311244, 47649501, 23872684, 55567586, 14015781, - 43443107, 1228318, 17544096, 22960650 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2136688454840028, 2099509000964294, 1690800495246475, - 1217643678575476, 828720645084218 -#else - 5811932, 31839139, 3442886, 31285122, 48741515, 25194890, - 49064820, 18144304, 61543482, 12348899 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 765548025667841, 462473984016099, 998061409979798, - 546353034089527, 2212508972466858 -#else - 35709185, 11407554, 25755363, 6891399, 63851926, 14872273, - 42259511, 8141294, 56476330, 32968952 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 46575283771160, 892570971573071, 1281983193144090, - 1491520128287375, 75847005908304 -#else - 54433560, 694025, 62032719, 13300343, 14015258, 19103038, - 57410191, 22225381, 30944592, 1130208 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1801436127943107, 1734436817907890, 1268728090345068, - 167003097070711, 2233597765834956 -#else - 8247747, 26843490, 40546482, 25845122, 52706924, 18905521, - 4652151, 2488540, 23550156, 33283200 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1997562060465113, 1048700225534011, 7615603985628, - 1855310849546841, 2242557647635213 -#else - 17294297, 29765994, 7026747, 15626851, 22990044, 113481, - 
2267737, 27646286, 66700045, 33416712 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1161017320376250, 492624580169043, 2169815802355237, - 976496781732542, 1770879511019629 -#else - 16091066, 17300506, 18599251, 7340678, 2137637, 32332775, - 63744702, 14550935, 3260525, 26388161 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1357044908364776, 729130645262438, 1762469072918979, - 1365633616878458, 181282906404941 -#else - 62198760, 20221544, 18550886, 10864893, 50649539, 26262835, - 44079994, 20349526, 54360141, 2701325 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1080413443139865, 1155205815510486, 1848782073549786, - 622566975152580, 124965574467971 -#else - 58534169, 16099414, 4629974, 17213908, 46322650, 27548999, - 57090500, 9276970, 11329923, 1862132 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1184526762066993, 247622751762817, 692129017206356, - 820018689412496, 2188697339828085 -#else - 14763057, 17650824, 36190593, 3689866, 3511892, 10313526, - 45157776, 12219230, 58070901, 32614131 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2020536369003019, 202261491735136, 1053169669150884, - 2056531979272544, 778165514694311 -#else - 8894987, 30108338, 6150752, 3013931, 301220, 15693451, 35127648, - 30644714, 51670695, 11595569 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 237404399610207, 1308324858405118, 1229680749538400, - 720131409105291, 1958958863624906 -#else - 15214943, 3537601, 40870142, 19495559, 4418656, 18323671, - 13947275, 10730794, 53619402, 29190761 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 515583508038846, 17656978857189, 1717918437373989, - 1568052070792483, 46975803123923 -#else - 64570558, 7682792, 32759013, 263109, 37124133, 25598979, - 44776739, 23365796, 977107, 699994 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 281527309158085, 36970532401524, 866906920877543, - 2222282602952734, 
1289598729589882 -#else - 54642373, 4195083, 57897332, 550903, 51543527, 12917919, - 19118110, 33114591, 36574330, 19216518 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1278207464902042, 494742455008756, 1262082121427081, - 1577236621659884, 1888786707293291 -#else - 31788442, 19046775, 4799988, 7372237, 8808585, 18806489, - 9408236, 23502657, 12493931, 28145115 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 353042527954210, 1830056151907359, 1111731275799225, - 174960955838824, 404312815582675 -#else - 41428258, 5260743, 47873055, 27269961, 63412921, 16566086, - 27218280, 2607121, 29375955, 6024730 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2064251142068628, 1666421603389706, 1419271365315441, - 468767774902855, 191535130366583 -#else - 842132, 30759739, 62345482, 24831616, 26332017, 21148791, - 11831879, 6985184, 57168503, 2854095 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1716987058588002, 1859366439773457, 1767194234188234, - 64476199777924, 1117233614485261 -#else - 62261602, 25585100, 2516241, 27706719, 9695690, 26333246, - 16512644, 960770, 12121869, 16648078 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 984292135520292, 135138246951259, 2220652137473167, - 1722843421165029, 190482558012909 -#else - 51890212, 14667095, 53772635, 2013716, 30598287, 33090295, - 35603941, 25672367, 20237805, 2838411 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 298845952651262, 1166086588952562, 1179896526238434, - 1347812759398693, 1412945390096208 -#else - 47820798, 4453151, 15298546, 17376044, 22115042, 17581828, - 12544293, 20083975, 1068880, 21054527 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1143239552672925, 906436640714209, 2177000572812152, - 2075299936108548, 325186347798433 -#else - 57549981, 17035596, 33238497, 13506958, 30505848, 32439836, - 58621956, 30924378, 12521377, 4845654 -#endif - }}, - }, - { - {{ -#if 
defined(BORINGSSL_CURVE25519_64BIT) - 721024854374772, 684487861263316, 1373438744094159, - 2193186935276995, 1387043709851261 -#else - 38910324, 10744107, 64150484, 10199663, 7759311, 20465832, - 3409347, 32681032, 60626557, 20668561 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 418098668140962, 715065997721283, 1471916138376055, - 2168570337288357, 937812682637044 -#else - 43547042, 6230155, 46726851, 10655313, 43068279, 21933259, - 10477733, 32314216, 63995636, 13974497 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1043584187226485, 2143395746619356, 2209558562919611, - 482427979307092, 847556718384018 -#else - 12966261, 15550616, 35069916, 31939085, 21025979, 32924988, - 5642324, 7188737, 18895762, 12629579 -#endif - }}, - }, - }, - { - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1248731221520759, 1465200936117687, 540803492710140, - 52978634680892, 261434490176109 -#else - 14741879, 18607545, 22177207, 21833195, 1279740, 8058600, - 11758140, 789443, 32195181, 3895677 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1057329623869501, 620334067429122, 461700859268034, - 2012481616501857, 297268569108938 -#else - 10758205, 15755439, 62598914, 9243697, 62229442, 6879878, - 64904289, 29988312, 58126794, 4429646 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1055352180870759, 1553151421852298, 1510903185371259, - 1470458349428097, 1226259419062731 -#else - 64654951, 15725972, 46672522, 23143759, 61304955, 22514211, - 59972993, 21911536, 18047435, 18272689 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1492988790301668, 790326625573331, 1190107028409745, - 1389394752159193, 1620408196604194 -#else - 41935844, 22247266, 29759955, 11776784, 44846481, 17733976, - 10993113, 20703595, 49488162, 24145963 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 47000654413729, 1004754424173864, 1868044813557703, - 173236934059409, 588771199737015 -#else - 21987233, 700364, 
42603816, 14972007, 59334599, 27836036, - 32155025, 2581431, 37149879, 8773374 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 30498470091663, 1082245510489825, 576771653181956, - 806509986132686, 1317634017056939 -#else - 41540495, 454462, 53896929, 16126714, 25240068, 8594567, - 20656846, 12017935, 59234475, 19634276 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 420308055751555, 1493354863316002, 165206721528088, - 1884845694919786, 2065456951573059 -#else - 6028163, 6263078, 36097058, 22252721, 66289944, 2461771, - 35267690, 28086389, 65387075, 30777706 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1115636332012334, 1854340990964155, 83792697369514, - 1972177451994021, 457455116057587 -#else - 54829870, 16624276, 987579, 27631834, 32908202, 1248608, - 7719845, 29387734, 28408819, 6816612 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1698968457310898, 1435137169051090, 1083661677032510, - 938363267483709, 340103887207182 -#else - 56750770, 25316602, 19549650, 21385210, 22082622, 16147817, - 20613181, 13982702, 56769294, 5067942 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1995325341336574, 911500251774648, 164010755403692, - 855378419194762, 1573601397528842 -#else - 36602878, 29732664, 12074680, 13582412, 47230892, 2443950, - 47389578, 12746131, 5331210, 23448488 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 241719380661528, 310028521317150, 1215881323380194, - 1408214976493624, 2141142156467363 -#else - 30528792, 3601899, 65151774, 4619784, 39747042, 18118043, - 24180792, 20984038, 27679907, 31905504 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1315157046163473, 727368447885818, 1363466668108618, - 1668921439990361, 1398483384337907 -#else - 9402385, 19597367, 32834042, 10838634, 40528714, 20317236, - 26653273, 24868867, 22611443, 20839026 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 75029678299646, 
1015388206460473, 1849729037055212, - 1939814616452984, 444404230394954 -#else - 22190590, 1118029, 22736441, 15130463, 36648172, 27563110, - 19189624, 28905490, 4854858, 6622139 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2053597130993710, 2024431685856332, 2233550957004860, - 2012407275509545, 872546993104440 -#else - 58798126, 30600981, 58846284, 30166382, 56707132, 33282502, - 13424425, 29987205, 26404408, 13001963 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1217269667678610, 599909351968693, 1390077048548598, - 1471879360694802, 739586172317596 -#else - 35867026, 18138731, 64114613, 8939345, 11562230, 20713762, - 41044498, 21932711, 51703708, 11020692 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1718318639380794, 1560510726633958, 904462881159922, - 1418028351780052, 94404349451937 -#else - 1866042, 25604943, 59210214, 23253421, 12483314, 13477547, - 3175636, 21130269, 28761761, 1406734 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2132502667405250, 214379346175414, 1502748313768060, - 1960071701057800, 1353971822643138 -#else - 66660290, 31776765, 13018550, 3194501, 57528444, 22392694, - 24760584, 29207344, 25577410, 20175752 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 319394212043702, 2127459436033571, 717646691535162, - 663366796076914, 318459064945314 -#else - 42818486, 4759344, 66418211, 31701615, 2066746, 10693769, - 37513074, 9884935, 57739938, 4745409 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 405989424923593, 1960452633787083, 667349034401665, - 1492674260767112, 1451061489880787 -#else - 57967561, 6049713, 47577803, 29213020, 35848065, 9944275, - 51646856, 22242579, 10931923, 21622501 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 947085906234007, 323284730494107, 1485778563977200, - 728576821512394, 901584347702286 -#else - 50547351, 14112679, 59096219, 4817317, 59068400, 22139825, - 44255434, 10856640, 46638094, 
13434653 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1575783124125742, 2126210792434375, 1569430791264065, - 1402582372904727, 1891780248341114 -#else - 22759470, 23480998, 50342599, 31683009, 13637441, 23386341, - 1765143, 20900106, 28445306, 28189722 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 838432205560695, 1997703511451664, 1018791879907867, - 1662001808174331, 78328132957753 -#else - 29875063, 12493613, 2795536, 29768102, 1710619, 15181182, - 56913147, 24765756, 9074233, 1167180 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 739152638255629, 2074935399403557, 505483666745895, - 1611883356514088, 628654635394878 -#else - 40903181, 11014232, 57266213, 30918946, 40200743, 7532293, - 48391976, 24018933, 3843902, 9367684 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1822054032121349, 643057948186973, 7306757352712, - 577249257962099, 284735863382083 -#else - 56139269, 27150720, 9591133, 9582310, 11349256, 108879, - 16235123, 8601684, 66969667, 4242894 -#endif - }}, - }, - }, - { - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1366558556363930, 1448606567552086, 1478881020944768, - 165803179355898, 1115718458123498 -#else - 22092954, 20363309, 65066070, 21585919, 32186752, 22037044, - 60534522, 2470659, 39691498, 16625500 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 204146226972102, 1630511199034723, 2215235214174763, - 174665910283542, 956127674017216 -#else - 56051142, 3042015, 13770083, 24296510, 584235, 33009577, - 59338006, 2602724, 39757248, 14247412 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1562934578796716, 1070893489712745, 11324610642270, - 958989751581897, 2172552325473805 -#else - 6314156, 23289540, 34336361, 15957556, 56951134, 168749, - 58490057, 14290060, 27108877, 32373552 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1770564423056027, 735523631664565, 1326060113795289, - 1509650369341127, 65892421582684 
-#else - 58522267, 26383465, 13241781, 10960156, 34117849, 19759835, - 33547975, 22495543, 39960412, 981873 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 623682558650637, 1337866509471512, 990313350206649, - 1314236615762469, 1164772974270275 -#else - 22833421, 9293594, 34459416, 19935764, 57971897, 14756818, - 44180005, 19583651, 56629059, 17356469 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 223256821462517, 723690150104139, 1000261663630601, - 933280913953265, 254872671543046 -#else - 59340277, 3326785, 38997067, 10783823, 19178761, 14905060, - 22680049, 13906969, 51175174, 3797898 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1969087237026041, 624795725447124, 1335555107635969, - 2069986355593023, 1712100149341902 -#else - 21721337, 29341686, 54902740, 9310181, 63226625, 19901321, - 23740223, 30845200, 20491982, 25512280 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1236103475266979, 1837885883267218, 1026072585230455, - 1025865513954973, 1801964901432134 -#else - 9209251, 18419377, 53852306, 27386633, 66377847, 15289672, - 25947805, 15286587, 30997318, 26851369 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1115241013365517, 1712251818829143, 2148864332502771, - 2096001471438138, 2235017246626125 -#else - 7392013, 16618386, 23946583, 25514540, 53843699, 32020573, - 52911418, 31232855, 17649997, 33304352 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1299268198601632, 2047148477845621, 2165648650132450, - 1612539282026145, 514197911628890 -#else - 57807776, 19360604, 30609525, 30504889, 41933794, 32270679, - 51867297, 24028707, 64875610, 7662145 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 118352772338543, 1067608711804704, 1434796676193498, - 1683240170548391, 230866769907437 -#else - 49550191, 1763593, 33994528, 15908609, 37067994, 21380136, - 7335079, 25082233, 63934189, 3440182 -#endif - }}, - {{ -#if 
defined(BORINGSSL_CURVE25519_64BIT) - 1850689576796636, 1601590730430274, 1139674615958142, - 1954384401440257, 76039205311 -#else - 47219164, 27577423, 42997570, 23865561, 10799742, 16982475, - 40449, 29122597, 4862399, 1133 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1723387471374172, 997301467038410, 533927635123657, - 20928644693965, 1756575222802513 -#else - 34252636, 25680474, 61686474, 14860949, 50789833, 7956141, - 7258061, 311861, 36513873, 26175010 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2146711623855116, 503278928021499, 625853062251406, - 1109121378393107, 1033853809911861 -#else - 63335436, 31988495, 28985339, 7499440, 24445838, 9325937, - 29727763, 16527196, 18278453, 15405622 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 571005965509422, 2005213373292546, 1016697270349626, - 56607856974274, 914438579435146 -#else - 62726958, 8508651, 47210498, 29880007, 61124410, 15149969, - 53795266, 843522, 45233802, 13626196 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1346698876211176, 2076651707527589, 1084761571110205, - 265334478828406, 1068954492309671 -#else - 2281448, 20067377, 56193445, 30944521, 1879357, 16164207, - 56324982, 3953791, 13340839, 15928663 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1769967932677654, 1695893319756416, 1151863389675920, - 1781042784397689, 400287774418285 -#else - 31727126, 26374577, 48671360, 25270779, 2875792, 17164102, - 41838969, 26539605, 43656557, 5964752 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1851867764003121, 403841933237558, 820549523771987, - 761292590207581, 1743735048551143 -#else - 4100401, 27594980, 49929526, 6017713, 48403027, 12227140, - 40424029, 11344143, 2538215, 25983677 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 410915148140008, 2107072311871739, 1004367461876503, - 99684895396761, 1180818713503224 -#else - 57675240, 6123112, 11159803, 31397824, 
30016279, 14966241, - 46633881, 1485420, 66479608, 17595569 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 285945406881439, 648174397347453, 1098403762631981, - 1366547441102991, 1505876883139217 -#else - 40304287, 4260918, 11851389, 9658551, 35091757, 16367491, - 46903439, 20363143, 11659921, 22439314 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 672095903120153, 1675918957959872, 636236529315028, - 1569297300327696, 2164144194785875 -#else - 26180377, 10015009, 36264640, 24973138, 5418196, 9480663, - 2231568, 23384352, 33100371, 32248261 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1902708175321798, 1035343530915438, 1178560808893263, - 301095684058146, 1280977479761118 -#else - 15121094, 28352561, 56718958, 15427820, 39598927, 17561924, - 21670946, 4486675, 61177054, 19088051 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1615357281742403, 404257611616381, 2160201349780978, - 1160947379188955, 1578038619549541 -#else - 16166467, 24070699, 56004733, 6023907, 35182066, 32189508, - 2340059, 17299464, 56373093, 23514607 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2013087639791217, 822734930507457, 1785668418619014, - 1668650702946164, 389450875221715 -#else - 28042865, 29997343, 54982337, 12259705, 63391366, 26608532, - 6766452, 24864833, 18036435, 5803270 -#endif - }}, - }, - }, - { - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 453918449698368, 106406819929001, 2072540975937135, - 308588860670238, 1304394580755385 -#else - 66291264, 6763911, 11803561, 1585585, 10958447, 30883267, - 23855390, 4598332, 60949433, 19436993 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1295082798350326, 2091844511495996, 1851348972587817, - 3375039684596, 789440738712837 -#else - 36077558, 19298237, 17332028, 31170912, 31312681, 27587249, - 696308, 50292, 47013125, 11763583 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2083069137186154, 848523102004566, 
993982213589257, - 1405313299916317, 1532824818698468 -#else - 66514282, 31040148, 34874710, 12643979, 12650761, 14811489, - 665117, 20940800, 47335652, 22840869 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1495961298852430, 1397203457344779, 1774950217066942, - 139302743555696, 66603584342787 -#else - 30464590, 22291560, 62981387, 20819953, 19835326, 26448819, - 42712688, 2075772, 50088707, 992470 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1782411379088302, 1096724939964781, 27593390721418, - 542241850291353, 1540337798439873 -#else - 18357166, 26559999, 7766381, 16342475, 37783946, 411173, - 14578841, 8080033, 55534529, 22952821 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 693543956581437, 171507720360750, 1557908942697227, - 1074697073443438, 1104093109037196 -#else - 19598397, 10334610, 12555054, 2555664, 18821899, 23214652, - 21873262, 16014234, 26224780, 16452269 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 345288228393419, 1099643569747172, 134881908403743, - 1740551994106740, 248212179299770 -#else - 36884939, 5145195, 5944548, 16385966, 3976735, 2009897, - 55731060, 25936245, 46575034, 3698649 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 231429562203065, 1526290236421172, 2021375064026423, - 1520954495658041, 806337791525116 -#else - 14187449, 3448569, 56472628, 22743496, 44444983, 30120835, - 7268409, 22663988, 27394300, 12015369 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1079623667189886, 872403650198613, 766894200588288, - 2163700860774109, 2023464507911816 -#else - 19695742, 16087646, 28032085, 12999827, 6817792, 11427614, - 20244189, 32241655, 53849736, 30151970 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 854645372543796, 1936406001954827, 151460662541253, - 825325739271555, 1554306377287556 -#else - 30860084, 12735208, 65220619, 28854697, 50133957, 2256939, - 58942851, 12298311, 58558340, 23160969 
-#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1497138821904622, 1044820250515590, 1742593886423484, - 1237204112746837, 849047450816987 -#else - 61389038, 22309106, 65198214, 15569034, 26642876, 25966672, - 61319509, 18435777, 62132699, 12651792 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 667962773375330, 1897271816877105, 1399712621683474, - 1143302161683099, 2081798441209593 -#else - 64260450, 9953420, 11531313, 28271553, 26895122, 20857343, - 53990043, 17036529, 9768697, 31021214 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 127147851567005, 1936114012888110, 1704424366552046, - 856674880716312, 716603621335359 -#else - 42389405, 1894650, 66821166, 28850346, 15348718, 25397902, - 32767512, 12765450, 4940095, 10678226 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1072409664800960, 2146937497077528, 1508780108920651, - 935767602384853, 1112800433544068 -#else - 18860224, 15980149, 48121624, 31991861, 40875851, 22482575, - 59264981, 13944023, 42736516, 16582018 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 333549023751292, 280219272863308, 2104176666454852, - 1036466864875785, 536135186520207 -#else - 51604604, 4970267, 37215820, 4175592, 46115652, 31354675, - 55404809, 15444559, 56105103, 7989036 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 373666279883137, 146457241530109, 304116267127857, - 416088749147715, 1258577131183391 -#else - 31490433, 5568061, 64696061, 2182382, 34772017, 4531685, - 35030595, 6200205, 47422751, 18754260 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1186115062588401, 2251609796968486, 1098944457878953, - 1153112761201374, 1791625503417267 -#else - 49800177, 17674491, 35586086, 33551600, 34221481, 16375548, - 8680158, 17182719, 28550067, 26697300 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1870078460219737, 2129630962183380, 852283639691142, - 292865602592851, 401904317342226 -#else - 
38981977, 27866340, 16837844, 31733974, 60258182, 12700015, - 37068883, 4364037, 1155602, 5988841 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1361070124828035, 815664541425524, 1026798897364671, - 1951790935390647, 555874891834790 -#else - 21890435, 20281525, 54484852, 12154348, 59276991, 15300495, - 23148983, 29083951, 24618406, 8283181 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1546301003424277, 459094500062839, 1097668518375311, - 1780297770129643, 720763293687608 -#else - 33972757, 23041680, 9975415, 6841041, 35549071, 16356535, - 3070187, 26528504, 1466168, 10740210 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1212405311403990, 1536693382542438, 61028431067459, - 1863929423417129, 1223219538638038 -#else - 65599446, 18066246, 53605478, 22898515, 32799043, 909394, - 53169961, 27774712, 34944214, 18227391 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1294303766540260, 1183557465955093, 882271357233093, - 63854569425375, 2213283684565087 -#else - 3960804, 19286629, 39082773, 17636380, 47704005, 13146867, - 15567327, 951507, 63848543, 32980496 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 339050984211414, 601386726509773, 413735232134068, - 966191255137228, 1839475899458159 -#else - 24740822, 5052253, 37014733, 8961360, 25877428, 6165135, - 42740684, 14397371, 59728495, 27410326 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 235605972169408, 2174055643032978, 1538335001838863, - 1281866796917192, 1815940222628465 -#else - 38220480, 3510802, 39005586, 32395953, 55870735, 22922977, - 51667400, 19101303, 65483377, 27059617 -#endif - }}, - }, - }, - { - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1632352921721536, 1833328609514701, 2092779091951987, - 1923956201873226, 2210068022482919 -#else - 793280, 24323954, 8836301, 27318725, 39747955, 31184838, - 33152842, 28669181, 57202663, 32932579 -#endif - }}, - {{ -#if 
defined(BORINGSSL_CURVE25519_64BIT) - 35271216625062, 1712350667021807, 983664255668860, - 98571260373038, 1232645608559836 -#else - 5666214, 525582, 20782575, 25516013, 42570364, 14657739, - 16099374, 1468826, 60937436, 18367850 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1998172393429622, 1798947921427073, 784387737563581, - 1589352214827263, 1589861734168180 -#else - 62249590, 29775088, 64191105, 26806412, 7778749, 11688288, - 36704511, 23683193, 65549940, 23690785 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1733739258725305, 31715717059538, 201969945218860, - 992093044556990, 1194308773174556 -#else - 10896313, 25834728, 824274, 472601, 47648556, 3009586, 25248958, - 14783338, 36527388, 17796587 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 846415389605137, 746163495539180, 829658752826080, - 592067705956946, 957242537821393 -#else - 10566929, 12612572, 35164652, 11118702, 54475488, 12362878, - 21752402, 8822496, 24003793, 14264025 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1758148849754419, 619249044817679, 168089007997045, - 1371497636330523, 1867101418880350 -#else - 27713843, 26198459, 56100623, 9227529, 27050101, 2504721, - 23886875, 20436907, 13958494, 27821979 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 326633984209635, 261759506071016, 1700682323676193, - 1577907266349064, 1217647663383016 -#else - 43627235, 4867225, 39861736, 3900520, 29838369, 25342141, - 35219464, 23512650, 7340520, 18144364 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1714182387328607, 1477856482074168, 574895689942184, - 2159118410227270, 1555532449716575 -#else - 4646495, 25543308, 44342840, 22021777, 23184552, 8566613, - 31366726, 32173371, 52042079, 23179239 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 853828206885131, 998498946036955, 1835887550391235, - 207627336608048, 258363815956050 -#else - 49838347, 12723031, 50115803, 14878793, 
21619651, 27356856, - 27584816, 3093888, 58265170, 3849920 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 141141474651677, 1236728744905256, 643101419899887, - 1646615130509173, 1208239602291765 -#else - 58043933, 2103171, 25561640, 18428694, 61869039, 9582957, - 32477045, 24536477, 5002293, 18004173 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1501663228068911, 1354879465566912, 1444432675498247, - 897812463852601, 855062598754348 -#else - 55051311, 22376525, 21115584, 20189277, 8808711, 21523724, - 16489529, 13378448, 41263148, 12741425 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 714380763546606, 1032824444965790, 1774073483745338, - 1063840874947367, 1738680636537158 -#else - 61162478, 10645102, 36197278, 15390283, 63821882, 26435754, - 24306471, 15852464, 28834118, 25908360 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1640635546696252, 633168953192112, 2212651044092396, - 30590958583852, 368515260889378 -#else - 49773116, 24447374, 42577584, 9434952, 58636780, 32971069, - 54018092, 455840, 20461858, 5491305 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1171650314802029, 1567085444565577, 1453660792008405, - 757914533009261, 1619511342778196 -#else - 13669229, 17458950, 54626889, 23351392, 52539093, 21661233, - 42112877, 11293806, 38520660, 24132599 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 420958967093237, 971103481109486, 2169549185607107, - 1301191633558497, 1661514101014240 -#else - 28497909, 6272777, 34085870, 14470569, 8906179, 32328802, - 18504673, 19389266, 29867744, 24758489 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 907123651818302, 1332556122804146, 1824055253424487, - 1367614217442959, 1982558335973172 -#else - 50901822, 13517195, 39309234, 19856633, 24009063, 27180541, - 60741263, 20379039, 22853428, 29542421 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1121533090144639, 
1021251337022187, 110469995947421, - 1511059774758394, 2110035908131662 -#else - 24191359, 16712145, 53177067, 15217830, 14542237, 1646131, - 18603514, 22516545, 12876622, 31441985 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 303213233384524, 2061932261128138, 352862124777736, - 40828818670255, 249879468482660 -#else - 17902668, 4518229, 66697162, 30725184, 26878216, 5258055, - 54248111, 608396, 16031844, 3723494 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 856559257852200, 508517664949010, 1378193767894916, - 1723459126947129, 1962275756614521 -#else - 38476072, 12763727, 46662418, 7577503, 33001348, 20536687, - 17558841, 25681542, 23896953, 29240187 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1445691340537320, 40614383122127, 402104303144865, - 485134269878232, 1659439323587426 -#else - 47103464, 21542479, 31520463, 605201, 2543521, 5991821, - 64163800, 7229063, 57189218, 24727572 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 20057458979482, 1183363722525800, 2140003847237215, - 2053873950687614, 2112017736174909 -#else - 28816026, 298879, 38943848, 17633493, 19000927, 31888542, - 54428030, 30605106, 49057085, 31471516 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2228654250927986, 1483591363415267, 1368661293910956, - 1076511285177291, 526650682059608 -#else - 16000882, 33209536, 3493091, 22107234, 37604268, 20394642, - 12577739, 16041268, 47393624, 7847706 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 709481497028540, 531682216165724, 316963769431931, - 1814315888453765, 258560242424104 -#else - 10151868, 10572098, 27312476, 7922682, 14825339, 4723128, - 34252933, 27035413, 57088296, 3852847 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1053447823660455, 1955135194248683, 1010900954918985, - 1182614026976701, 1240051576966610 -#else - 55678375, 15697595, 45987307, 29133784, 5386313, 15063598, - 16514493, 17622322, 29330898, 
18478208 -#endif - }}, - }, - }, - { - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1957943897155497, 1788667368028035, 137692910029106, - 1039519607062, 826404763313028 -#else - 41609129, 29175637, 51885955, 26653220, 16615730, 2051784, - 3303702, 15490, 39560068, 12314390 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1848942433095597, 1582009882530495, 1849292741020143, - 1068498323302788, 2001402229799484 -#else - 15683501, 27551389, 18109119, 23573784, 15337967, 27556609, - 50391428, 15921865, 16103996, 29823217 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1528282417624269, 2142492439828191, 2179662545816034, - 362568973150328, 1591374675250271 -#else - 43939021, 22773182, 13588191, 31925625, 63310306, 32479502, - 47835256, 5402698, 37293151, 23713330 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 160026679434388, 232341189218716, 2149181472355545, - 598041771119831, 183859001910173 -#else - 23190676, 2384583, 34394524, 3462153, 37205209, 32025299, - 55842007, 8911516, 41903005, 2739712 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2013278155187349, 662660471354454, 793981225706267, - 411706605985744, 804490933124791 -#else - 21374101, 30000182, 33584214, 9874410, 15377179, 11831242, - 33578960, 6134906, 4931255, 11987849 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2051892037280204, 488391251096321, 2230187337030708, - 930221970662692, 679002758255210 -#else - 67101132, 30575573, 50885377, 7277596, 105524, 33232381, - 35628324, 13861387, 37032554, 10117929 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1530723630438670, 875873929577927, 341560134269988, - 449903119530753, 1055551308214179 -#else - 37607694, 22809559, 40945095, 13051538, 41483300, 5089642, - 60783361, 6704078, 12890019, 15728940 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1461835919309432, 1955256480136428, 180866187813063, - 1551979252664528, 557743861963950 
-#else - 45136504, 21783052, 66157804, 29135591, 14704839, 2695116, - 903376, 23126293, 12885166, 8311031 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 359179641731115, 1324915145732949, 902828372691474, - 294254275669987, 1887036027752957 -#else - 49592363, 5352193, 10384213, 19742774, 7506450, 13453191, - 26423267, 4384730, 1888765, 28119028 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2043271609454323, 2038225437857464, 1317528426475850, - 1398989128982787, 2027639881006861 -#else - 41291507, 30447119, 53614264, 30371925, 30896458, 19632703, - 34857219, 20846562, 47644429, 30214188 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2072902725256516, 312132452743412, 309930885642209, - 996244312618453, 1590501300352303 -#else - 43500868, 30888657, 66582772, 4651135, 5765089, 4618330, - 6092245, 14845197, 17151279, 23700316 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1397254305160710, 695734355138021, 2233992044438756, - 1776180593969996, 1085588199351115 -#else - 42278406, 20820711, 51942885, 10367249, 37577956, 33289075, - 22825804, 26467153, 50242379, 16176524 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 440567051331029, 254894786356681, 493869224930222, - 1556322069683366, 1567456540319218 -#else - 43525589, 6564960, 20063689, 3798228, 62368686, 7359224, - 2006182, 23191006, 38362610, 23356922 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1950722461391320, 1907845598854797, 1822757481635527, - 2121567704750244, 73811931471221 -#else - 56482264, 29068029, 53788301, 28429114, 3432135, 27161203, - 23632036, 31613822, 32808309, 1099883 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 387139307395758, 2058036430315676, 1220915649965325, - 1794832055328951, 1230009312169328 -#else - 15030958, 5768825, 39657628, 30667132, 60681485, 18193060, - 51830967, 26745081, 2051440, 18328567 -#endif - }}, - }, - { - {{ -#if 
defined(BORINGSSL_CURVE25519_64BIT) - 1765973779329517, 659344059446977, 19821901606666, - 1301928341311214, 1116266004075885 -#else - 63746541, 26315059, 7517889, 9824992, 23555850, 295369, 5148398, - 19400244, 44422509, 16633659 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1127572801181483, 1224743760571696, 1276219889847274, - 1529738721702581, 1589819666871853 -#else - 4577067, 16802144, 13249840, 18250104, 19958762, 19017158, - 18559669, 22794883, 8402477, 23690159 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2181229378964934, 2190885205260020, 1511536077659137, - 1246504208580490, 668883326494241 -#else - 38702534, 32502850, 40318708, 32646733, 49896449, 22523642, - 9453450, 18574360, 17983009, 9967138 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 437866655573314, 669026411194768, 81896997980338, - 523874406393178, 245052060935236 -#else - 41346370, 6524721, 26585488, 9969270, 24709298, 1220360, - 65430874, 7806336, 17507396, 3651560 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1975438052228868, 1071801519999806, 594652299224319, - 1877697652668809, 1489635366987285 -#else - 56688388, 29436320, 14584638, 15971087, 51340543, 8861009, - 26556809, 27979875, 48555541, 22197296 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 958592545673770, 233048016518599, 851568750216589, - 567703851596087, 1740300006094761 -#else - 2839082, 14284142, 4029895, 3472686, 14402957, 12689363, - 40466743, 8459446, 61503401, 25932490 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2014540178270324, 192672779514432, 213877182641530, - 2194819933853411, 1716422829364835 -#else - 62269556, 30018987, 9744960, 2871048, 25113978, 3187018, - 41998051, 32705365, 17258083, 25576693 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1540769606609725, 2148289943846077, 1597804156127445, - 1230603716683868, 815423458809453 -#else - 18164541, 22959256, 49953981, 32012014, 
19237077, 23809137, - 23357532, 18337424, 26908269, 12150756 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1738560251245018, 1779576754536888, 1783765347671392, - 1880170990446751, 1088225159617541 -#else - 36843994, 25906566, 5112248, 26517760, 65609056, 26580174, - 43167, 28016731, 34806789, 16215818 -#endif - }}, - }, - }, - { - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 659303913929492, 1956447718227573, 1830568515922666, - 841069049744408, 1669607124206368 -#else - 60209940, 9824393, 54804085, 29153342, 35711722, 27277596, - 32574488, 12532905, 59605792, 24879084 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1143465490433355, 1532194726196059, 1093276745494697, - 481041706116088, 2121405433561163 -#else - 39765323, 17038963, 39957339, 22831480, 946345, 16291093, - 254968, 7168080, 21676107, 31611404 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1686424298744462, 1451806974487153, 266296068846582, - 1834686947542675, 1720762336132256 -#else - 21260942, 25129680, 50276977, 21633609, 43430902, 3968120, - 63456915, 27338965, 63552672, 25641356 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 889217026388959, 1043290623284660, 856125087551909, - 1669272323124636, 1603340330827879 -#else - 16544735, 13250366, 50304436, 15546241, 62525861, 12757257, - 64646556, 24874095, 48201831, 23891632 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1206396181488998, 333158148435054, 1402633492821422, - 1120091191722026, 1945474114550509 -#else - 64693606, 17976703, 18312302, 4964443, 51836334, 20900867, - 26820650, 16690659, 25459437, 28989823 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 766720088232571, 1512222781191002, 1189719893490790, - 2091302129467914, 2141418006894941 -#else - 41964155, 11425019, 28423002, 22533875, 60963942, 17728207, - 9142794, 31162830, 60676445, 31909614 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 419663647306612, 
1998875112167987, 1426599870253707, - 1154928355379510, 486538532138187 -#else - 44004212, 6253475, 16964147, 29785560, 41994891, 21257994, - 39651638, 17209773, 6335691, 7249989 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 938160078005954, 1421776319053174, 1941643234741774, - 180002183320818, 1414380336750546 -#else - 36775618, 13979674, 7503222, 21186118, 55152142, 28932738, - 36836594, 2682241, 25993170, 21075909 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 398001940109652, 1577721237663248, 1012748649830402, - 1540516006905144, 1011684812884559 -#else - 4364628, 5930691, 32304656, 23509878, 59054082, 15091130, - 22857016, 22955477, 31820367, 15075278 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1653276489969630, 6081825167624, 1921777941170836, - 1604139841794531, 861211053640641 -#else - 31879134, 24635739, 17258760, 90626, 59067028, 28636722, - 24162787, 23903546, 49138625, 12833044 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 996661541407379, 1455877387952927, 744312806857277, - 139213896196746, 1000282908547789 -#else - 19073683, 14851414, 42705695, 21694263, 7625277, 11091125, - 47489674, 2074448, 57694925, 14905376 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1450817495603008, 1476865707053229, 1030490562252053, - 620966950353376, 1744760161539058 -#else - 24483648, 21618865, 64589997, 22007013, 65555733, 15355505, - 41826784, 9253128, 27628530, 25998952 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 559728410002599, 37056661641185, 2038622963352006, - 1637244893271723, 1026565352238948 -#else - 17597607, 8340603, 19355617, 552187, 26198470, 30377849, - 4593323, 24396850, 52997988, 15297015 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 962165956135846, 1116599660248791, 182090178006815, - 1455605467021751, 196053588803284 -#else - 510886, 14337390, 35323607, 16638631, 6328095, 2713355, - 46891447, 21690211, 8683220, 
2921426 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 796863823080135, 1897365583584155, 420466939481601, - 2165972651724672, 932177357788289 -#else - 18606791, 11874196, 27155355, 28272950, 43077121, 6265445, - 41930624, 32275507, 4674689, 13890525 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 877047233620632, 1375632631944375, 643773611882121, - 660022738847877, 19353932331831 -#else - 13609624, 13069022, 39736503, 20498523, 24360585, 9592974, - 14977157, 9835105, 4389687, 288396 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2216943882299338, 394841323190322, 2222656898319671, - 558186553950529, 1077236877025190 -#else - 9922506, 33035038, 13613106, 5883594, 48350519, 33120168, - 54804801, 8317627, 23388070, 16052080 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 801118384953213, 1914330175515892, 574541023311511, - 1471123787903705, 1526158900256288 -#else - 12719997, 11937594, 35138804, 28525742, 26900119, 8561328, - 46953177, 21921452, 52354592, 22741539 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 949617889087234, 2207116611267331, 912920039141287, - 501158539198789, 62362560771472 -#else - 15961858, 14150409, 26716931, 32888600, 44314535, 13603568, - 11829573, 7467844, 38286736, 929274 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1474518386765335, 1760793622169197, 1157399790472736, - 1622864308058898, 165428294422792 -#else - 11038231, 21972036, 39798381, 26237869, 56610336, 17246600, - 43629330, 24182562, 45715720, 2465073 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1961673048027128, 102619413083113, 1051982726768458, - 1603657989805485, 1941613251499678 -#else - 20017144, 29231206, 27915241, 1529148, 12396362, 15675764, - 13817261, 23896366, 2463390, 28932292 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1401939116319266, 335306339903072, 72046196085786, - 862423201496006, 850518754531384 -#else - 
50749986, 20890520, 55043680, 4996453, 65852442, 1073571, - 9583558, 12851107, 4003896, 12673717 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1234706593321979, 1083343891215917, 898273974314935, - 1640859118399498, 157578398571149 -#else - 65377275, 18398561, 63845933, 16143081, 19294135, 13385325, - 14741514, 24450706, 7903885, 2348101 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1143483057726416, 1992614991758919, 674268662140796, - 1773370048077526, 674318359920189 -#else - 24536016, 17039225, 12715591, 29692277, 1511292, 10047386, - 63266518, 26425272, 38731325, 10048126 -#endif - }}, - }, - }, - { - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1835401379538542, 173900035308392, 818247630716732, - 1762100412152786, 1021506399448291 -#else - 54486638, 27349611, 30718824, 2591312, 56491836, 12192839, - 18873298, 26257342, 34811107, 15221631 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1506632088156630, 2127481795522179, 513812919490255, - 140643715928370, 442476620300318 -#else - 40630742, 22450567, 11546243, 31701949, 9180879, 7656409, - 45764914, 2095754, 29769758, 6593415 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2056683376856736, 219094741662735, 2193541883188309, - 1841182310235800, 556477468664293 -#else - 35114656, 30646970, 4176911, 3264766, 12538965, 32686321, - 26312344, 27435754, 30958053, 8292160 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1315019427910827, 1049075855992603, 2066573052986543, - 266904467185534, 2040482348591520 -#else - 31429803, 19595316, 29173531, 15632448, 12174511, 30794338, - 32808830, 3977186, 26143136, 30405556 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 94096246544434, 922482381166992, 24517828745563, - 2139430508542503, 2097139044231004 -#else - 22648882, 1402143, 44308880, 13746058, 7936347, 365344, - 58440231, 31879998, 63350620, 31249806 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 
537697207950515, 1399352016347350, 1563663552106345, - 2148749520888918, 549922092988516 -#else - 51616947, 8012312, 64594134, 20851969, 43143017, 23300402, - 65496150, 32018862, 50444388, 8194477 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1747985413252434, 680511052635695, 1809559829982725, - 594274250930054, 201673170745982 -#else - 27338066, 26047012, 59694639, 10140404, 48082437, 26964542, - 27277190, 8855376, 28572286, 3005164 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 323583936109569, 1973572998577657, 1192219029966558, - 79354804385273, 1374043025560347 -#else - 26287105, 4821776, 25476601, 29408529, 63344350, 17765447, - 49100281, 1182478, 41014043, 20474836 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 213277331329947, 416202017849623, 1950535221091783, - 1313441578103244, 2171386783823658 -#else - 59937691, 3178079, 23970071, 6201893, 49913287, 29065239, - 45232588, 19571804, 32208682, 32356184 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 189088804229831, 993969372859110, 895870121536987, - 1547301535298256, 1477373024911350 -#else - 50451143, 2817642, 56822502, 14811297, 6024667, 13349505, - 39793360, 23056589, 39436278, 22014573 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1620578418245010, 541035331188469, 2235785724453865, - 2154865809088198, 1974627268751826 -#else - 15941010, 24148500, 45741813, 8062054, 31876073, 33315803, - 51830470, 32110002, 15397330, 29424239 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1346805451740245, 1350981335690626, 942744349501813, - 2155094562545502, 1012483751693409 -#else - 8934485, 20068965, 43822466, 20131190, 34662773, 14047985, - 31170398, 32113411, 39603297, 15087183 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2107080134091762, 1132567062788208, 1824935377687210, - 769194804343737, 1857941799971888 -#else - 48751602, 31397940, 24524912, 16876564, 15520426, 27193656, 
- 51606457, 11461895, 16788528, 27685490 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1074666112436467, 249279386739593, 1174337926625354, - 1559013532006480, 1472287775519121 -#else - 65161459, 16013772, 21750665, 3714552, 49707082, 17498998, - 63338576, 23231111, 31322513, 21938797 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1872620123779532, 1892932666768992, 1921559078394978, - 1270573311796160, 1438913646755037 -#else - 21426636, 27904214, 53460576, 28206894, 38296674, 28633461, - 48833472, 18933017, 13040861, 21441484 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 837390187648199, 1012253300223599, 989780015893987, - 1351393287739814, 328627746545550 -#else - 11293895, 12478086, 39972463, 15083749, 37801443, 14748871, - 14555558, 20137329, 1613710, 4896935 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1028328827183114, 1711043289969857, 1350832470374933, - 1923164689604327, 1495656368846911 -#else - 41213962, 15323293, 58619073, 25496531, 25967125, 20128972, - 2825959, 28657387, 43137087, 22287016 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1900828492104143, 430212361082163, 687437570852799, - 832514536673512, 1685641495940794 -#else - 51184079, 28324551, 49665331, 6410663, 3622847, 10243618, - 20615400, 12405433, 43355834, 25118015 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 842632847936398, 605670026766216, 290836444839585, - 163210774892356, 2213815011799645 -#else - 60017550, 12556207, 46917512, 9025186, 50036385, 4333800, - 4378436, 2432030, 23097949, 32988414 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1176336383453996, 1725477294339771, 12700622672454, - 678015708818208, 162724078519879 -#else - 4565804, 17528778, 20084411, 25711615, 1724998, 189254, - 24767264, 10103221, 48596551, 2424777 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1448049969043497, 1789411762943521, 385587766217753, - 
90201620913498, 832999441066823 -#else - 366633, 21577626, 8173089, 26664313, 30788633, 5745705, - 59940186, 1344108, 63466311, 12412658 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 516086333293313, 2240508292484616, 1351669528166508, - 1223255565316488, 750235824427138 -#else - 43107073, 7690285, 14929416, 33386175, 34898028, 20141445, - 24162696, 18227928, 63967362, 11179384 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1263624896582495, 1102602401673328, 526302183714372, - 2152015839128799, 1483839308490010 -#else - 18289503, 18829478, 8056944, 16430056, 45379140, 7842513, - 61107423, 32067534, 48424218, 22110928 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 442991718646863, 1599275157036458, 1925389027579192, - 899514691371390, 350263251085160 -#else - 476239, 6601091, 60956074, 23831056, 17503544, 28690532, - 27672958, 13403813, 11052904, 5219329 -#endif - }}, - }, - }, - { - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1689713572022143, 593854559254373, 978095044791970, - 1985127338729499, 1676069120347625 -#else - 20678527, 25178694, 34436965, 8849122, 62099106, 14574751, - 31186971, 29580702, 9014761, 24975376 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1557207018622683, 340631692799603, 1477725909476187, - 614735951619419, 2033237123746766 -#else - 53464795, 23204192, 51146355, 5075807, 65594203, 22019831, - 34006363, 9160279, 8473550, 30297594 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 968764929340557, 1225534776710944, 662967304013036, - 1155521416178595, 791142883466590 -#else - 24900749, 14435722, 17209120, 18261891, 44516588, 9878982, - 59419555, 17218610, 42540382, 11788947 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1487081286167458, 993039441814934, 1792378982844640, - 698652444999874, 2153908693179754 -#else - 63990690, 22159237, 53306774, 14797440, 9652448, 26708528, - 47071426, 10410732, 42540394, 32095740 -#endif - }}, 
- {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1123181311102823, 685575944875442, 507605465509927, - 1412590462117473, 568017325228626 -#else - 51449703, 16736705, 44641714, 10215877, 58011687, 7563910, - 11871841, 21049238, 48595538, 8464117 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 560258797465417, 2193971151466401, 1824086900849026, - 579056363542056, 1690063960036441 -#else - 43708233, 8348506, 52522913, 32692717, 63158658, 27181012, - 14325288, 8628612, 33313881, 25183915 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1918407319222416, 353767553059963, 1930426334528099, - 1564816146005724, 1861342381708096 -#else - 46921872, 28586496, 22367355, 5271547, 66011747, 28765593, - 42303196, 23317577, 58168128, 27736162 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2131325168777276, 1176636658428908, 1756922641512981, - 1390243617176012, 1966325177038383 -#else - 60160060, 31759219, 34483180, 17533252, 32635413, 26180187, - 15989196, 20716244, 28358191, 29300528 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2063958120364491, 2140267332393533, 699896251574968, - 273268351312140, 375580724713232 -#else - 43547083, 30755372, 34757181, 31892468, 57961144, 10429266, - 50471180, 4072015, 61757200, 5596588 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2024297515263178, 416959329722687, 1079014235017302, - 171612225573183, 1031677520051053 -#else - 38872266, 30164383, 12312895, 6213178, 3117142, 16078565, - 29266239, 2557221, 1768301, 15373193 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2033900009388450, 1744902869870788, 2190580087917640, - 1949474984254121, 231049754293748 -#else - 59865506, 30307471, 62515396, 26001078, 66980936, 32642186, - 66017961, 29049440, 42448372, 3442909 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 343868674606581, 550155864008088, 1450580864229630, - 481603765195050, 896972360018042 -#else - 36898293, 5124042, 
14181784, 8197961, 18964734, 21615339, - 22597930, 7176455, 48523386, 13365929 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2151139328380127, 314745882084928, 59756825775204, - 1676664391494651, 2048348075599360 -#else - 59231455, 32054473, 8324672, 4690079, 6261860, 890446, 24538107, - 24984246, 57419264, 30522764 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1528930066340597, 1605003907059576, 1055061081337675, - 1458319101947665, 1234195845213142 -#else - 25008885, 22782833, 62803832, 23916421, 16265035, 15721635, - 683793, 21730648, 15723478, 18390951 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 830430507734812, 1780282976102377, 1425386760709037, - 362399353095425, 2168861579799910 -#else - 57448220, 12374378, 40101865, 26528283, 59384749, 21239917, - 11879681, 5400171, 519526, 32318556 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1155762232730333, 980662895504006, 2053766700883521, - 490966214077606, 510405877041357 -#else - 22258397, 17222199, 59239046, 14613015, 44588609, 30603508, - 46754982, 7315966, 16648397, 7605640 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1683750316716132, 652278688286128, 1221798761193539, - 1897360681476669, 319658166027343 -#else - 59027556, 25089834, 58885552, 9719709, 19259459, 18206220, - 23994941, 28272877, 57640015, 4763277 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 618808732869972, 72755186759744, 2060379135624181, - 1730731526741822, 48862757828238 -#else - 45409620, 9220968, 51378240, 1084136, 41632757, 30702041, - 31088446, 25789909, 55752334, 728111 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1463171970593505, 1143040711767452, 614590986558883, - 1409210575145591, 1882816996436803 -#else - 26047201, 21802961, 60208540, 17032633, 24092067, 9158119, - 62835319, 20998873, 37743427, 28056159 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2230133264691131, 
563950955091024, 2042915975426398, - 827314356293472, 672028980152815 -#else - 17510331, 33231575, 5854288, 8403524, 17133918, 30441820, - 38997856, 12327944, 10750447, 10014012 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 264204366029760, 1654686424479449, 2185050199932931, - 2207056159091748, 506015669043634 -#else - 56796096, 3936951, 9156313, 24656749, 16498691, 32559785, - 39627812, 32887699, 3424690, 7540221 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1784446333136569, 1973746527984364, 334856327359575, - 1156769775884610, 1023950124675478 -#else - 30322361, 26590322, 11361004, 29411115, 7433303, 4989748, - 60037442, 17237212, 57864598, 15258045 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2065270940578383, 31477096270353, 306421879113491, - 181958643936686, 1907105536686083 -#else - 13054543, 30774935, 19155473, 469045, 54626067, 4566041, - 5631406, 2711395, 1062915, 28418087 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1496516440779464, 1748485652986458, 872778352227340, - 818358834654919, 97932669284220 -#else - 47868616, 22299832, 37599834, 26054466, 61273100, 13005410, - 61042375, 12194496, 32960380, 1459310 -#endif - }}, - }, - }, - { - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 471636015770351, 672455402793577, 1804995246884103, - 1842309243470804, 1501862504981682 -#else - 19852015, 7027924, 23669353, 10020366, 8586503, 26896525, - 394196, 27452547, 18638002, 22379495 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1013216974933691, 538921919682598, 1915776722521558, - 1742822441583877, 1886550687916656 -#else - 31395515, 15098109, 26581030, 8030562, 50580950, 28547297, - 9012485, 25970078, 60465776, 28111795 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2094270000643336, 303971879192276, 40801275554748, - 649448917027930, 1818544418535447 -#else - 57916680, 31207054, 65111764, 4529533, 25766844, 607986, - 67095642, 9677542, 34813975, 
27098423 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2241737709499165, 549397817447461, 838180519319392, - 1725686958520781, 1705639080897747 -#else - 64664349, 33404494, 29348901, 8186665, 1873760, 12489863, - 36174285, 25714739, 59256019, 25416002 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1216074541925116, 50120933933509, 1565829004133810, - 721728156134580, 349206064666188 -#else - 51872508, 18120922, 7766469, 746860, 26346930, 23332670, - 39775412, 10754587, 57677388, 5203575 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 948617110470858, 346222547451945, 1126511960599975, - 1759386906004538, 493053284802266 -#else - 31834314, 14135496, 66338857, 5159117, 20917671, 16786336, - 59640890, 26216907, 31809242, 7347066 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1454933046815146, 874696014266362, 1467170975468588, - 1432316382418897, 2111710746366763 -#else - 57502122, 21680191, 20414458, 13033986, 13716524, 21862551, - 19797969, 21343177, 15192875, 31466942 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2105387117364450, 1996463405126433, 1303008614294500, - 851908115948209, 1353742049788635 -#else - 54445282, 31372712, 1168161, 29749623, 26747876, 19416341, - 10609329, 12694420, 33473243, 20172328 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 750300956351719, 1487736556065813, 15158817002104, - 1511998221598392, 971739901354129 -#else - 33184999, 11180355, 15832085, 22169002, 65475192, 225883, - 15089336, 22530529, 60973201, 14480052 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1874648163531693, 2124487685930551, 1810030029384882, - 918400043048335, 586348627300650 -#else - 31308717, 27934434, 31030839, 31657333, 15674546, 26971549, - 5496207, 13685227, 27595050, 8737275 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1235084464747900, 1166111146432082, 1745394857881591, - 1405516473883040, 4463504151617 
-#else - 46790012, 18404192, 10933842, 17376410, 8335351, 26008410, - 36100512, 20943827, 26498113, 66511 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1663810156463827, 327797390285791, 1341846161759410, - 1964121122800605, 1747470312055380 -#else - 22644435, 24792703, 50437087, 4884561, 64003250, 19995065, - 30540765, 29267685, 53781076, 26039336 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 660005247548233, 2071860029952887, 1358748199950107, - 911703252219107, 1014379923023831 -#else - 39091017, 9834844, 18617207, 30873120, 63706907, 20246925, - 8205539, 13585437, 49981399, 15115438 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2206641276178231, 1690587809721504, 1600173622825126, - 2156096097634421, 1106822408548216 -#else - 23711543, 32881517, 31206560, 25191721, 6164646, 23844445, - 33572981, 32128335, 8236920, 16492939 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1344788193552206, 1949552134239140, 1735915881729557, - 675891104100469, 1834220014427292 -#else - 43198286, 20038905, 40809380, 29050590, 25005589, 25867162, - 19574901, 10071562, 6708380, 27332008 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1920949492387964, 158885288387530, 70308263664033, - 626038464897817, 1468081726101009 -#else - 2101372, 28624378, 19702730, 2367575, 51681697, 1047674, - 5301017, 9328700, 29955601, 21876122 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 622221042073383, 1210146474039168, 1742246422343683, - 1403839361379025, 417189490895736 -#else - 3096359, 9271816, 45488000, 18032587, 52260867, 25961494, - 41216721, 20918836, 57191288, 6216607 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 22727256592983, 168471543384997, 1324340989803650, - 1839310709638189, 504999476432775 -#else - 34493015, 338662, 41913253, 2510421, 37895298, 19734218, - 24822829, 27407865, 40341383, 7525078 -#endif - }}, - }, - { - {{ -#if 
defined(BORINGSSL_CURVE25519_64BIT) - 1313240518756327, 1721896294296942, 52263574587266, - 2065069734239232, 804910473424630 -#else - 44042215, 19568808, 16133486, 25658254, 63719298, 778787, - 66198528, 30771936, 47722230, 11994100 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1337466662091884, 1287645354669772, 2018019646776184, - 652181229374245, 898011753211715 -#else - 21691500, 19929806, 66467532, 19187410, 3285880, 30070836, - 42044197, 9718257, 59631427, 13381417 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1969792547910734, 779969968247557, 2011350094423418, - 1823964252907487, 1058949448296945 -#else - 18445390, 29352196, 14979845, 11622458, 65381754, 29971451, - 23111647, 27179185, 28535281, 15779576 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 207343737062002, 1118176942430253, 758894594548164, - 806764629546266, 1157700123092949 -#else - 30098034, 3089662, 57874477, 16662134, 45801924, 11308410, - 53040410, 12021729, 9955285, 17251076 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1273565321399022, 1638509681964574, 759235866488935, - 666015124346707, 897983460943405 -#else - 9734894, 18977602, 59635230, 24415696, 2060391, 11313496, - 48682835, 9924398, 20194861, 13380996 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1717263794012298, 1059601762860786, 1837819172257618, - 1054130665797229, 680893204263559 -#else - 40730762, 25589224, 44941042, 15789296, 49053522, 27385639, - 65123949, 15707770, 26342023, 10146099 -#endif - }}, - }, - }, - { - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2237039662793603, 2249022333361206, 2058613546633703, - 149454094845279, 2215176649164582 -#else - 41091971, 33334488, 21339190, 33513044, 19745255, 30675732, - 37471583, 2227039, 21612326, 33008704 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 79472182719605, 1851130257050174, 1825744808933107, - 821667333481068, 781795293511946 -#else - 54031477, 1184227, 
23562814, 27583990, 46757619, 27205717, - 25764460, 12243797, 46252298, 11649657 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 755822026485370, 152464789723500, 1178207602290608, - 410307889503239, 156581253571278 -#else - 57077370, 11262625, 27384172, 2271902, 26947504, 17556661, - 39943, 6114064, 33514190, 2333242 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1418185496130297, 484520167728613, 1646737281442950, - 1401487684670265, 1349185550126961 -#else - 45675257, 21132610, 8119781, 7219913, 45278342, 24538297, - 60429113, 20883793, 24350577, 20104431 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1495380034400429, 325049476417173, 46346894893933, - 1553408840354856, 828980101835683 -#else - 62992557, 22282898, 43222677, 4843614, 37020525, 690622, - 35572776, 23147595, 8317859, 12352766 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1280337889310282, 2070832742866672, 1640940617225222, - 2098284908289951, 450929509534434 -#else - 18200138, 19078521, 34021104, 30857812, 43406342, 24451920, - 43556767, 31266881, 20712162, 6719373 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 407703353998781, 126572141483652, 286039827513621, - 1999255076709338, 2030511179441770 -#else - 26656189, 6075253, 59250308, 1886071, 38764821, 4262325, - 11117530, 29791222, 26224234, 30256974 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1254958221100483, 1153235960999843, 942907704968834, - 637105404087392, 1149293270147267 -#else - 49939907, 18700334, 63713187, 17184554, 47154818, 14050419, - 21728352, 9493610, 18620611, 17125804 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 894249020470196, 400291701616810, 406878712230981, - 1599128793487393, 1145868722604026 -#else - 53785524, 13325348, 11432106, 5964811, 18609221, 6062965, - 61839393, 23828875, 36407290, 17074774 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1497955250203334, 
110116344653260, 1128535642171976, - 1900106496009660, 129792717460909 -#else - 43248326, 22321272, 26961356, 1640861, 34695752, 16816491, - 12248508, 28313793, 13735341, 1934062 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 452487513298665, 1352120549024569, 1173495883910956, - 1999111705922009, 367328130454226 -#else - 25089769, 6742589, 17081145, 20148166, 21909292, 17486451, - 51972569, 29789085, 45830866, 5473615 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1717539401269642, 1475188995688487, 891921989653942, - 836824441505699, 1885988485608364 -#else - 31883658, 25593331, 1083431, 21982029, 22828470, 13290673, - 59983779, 12469655, 29111212, 28103418 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1241784121422547, 187337051947583, 1118481812236193, - 428747751936362, 30358898927325 -#else - 24244947, 18504025, 40845887, 2791539, 52111265, 16666677, - 24367466, 6388839, 56813277, 452382 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2022432361201842, 1088816090685051, 1977843398539868, - 1854834215890724, 564238862029357 -#else - 41468082, 30136590, 5217915, 16224624, 19987036, 29472163, - 42872612, 27639183, 15766061, 8407814 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 938868489100585, 1100285072929025, 1017806255688848, - 1957262154788833, 152787950560442 -#else - 46701865, 13990230, 15495425, 16395525, 5377168, 15166495, - 58191841, 29165478, 59040954, 2276717 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 867319417678923, 620471962942542, 226032203305716, - 342001443957629, 1761675818237336 -#else - 30157899, 12924066, 49396814, 9245752, 19895028, 3368142, - 43281277, 5096218, 22740376, 26251015 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1295072362439987, 931227904689414, 1355731432641687, - 922235735834035, 892227229410209 -#else - 2041139, 19298082, 7783686, 13876377, 41161879, 20201972, - 24051123, 13742383, 51471265, 
13295221 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1680989767906154, 535362787031440, 2136691276706570, - 1942228485381244, 1267350086882274 -#else - 33338218, 25048699, 12532112, 7977527, 9106186, 31839181, - 49388668, 28941459, 62657506, 18884987 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 366018233770527, 432660629755596, 126409707644535, - 1973842949591662, 645627343442376 -#else - 47063583, 5454096, 52762316, 6447145, 28862071, 1883651, - 64639598, 29412551, 7770568, 9620597 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 535509430575217, 546885533737322, 1524675609547799, - 2138095752851703, 1260738089896827 -#else - 23208049, 7979712, 33071466, 8149229, 1758231, 22719437, - 30945527, 31860109, 33606523, 18786461 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1159906385590467, 2198530004321610, 714559485023225, - 81880727882151, 1484020820037082 -#else - 1439939, 17283952, 66028874, 32760649, 4625401, 10647766, - 62065063, 1220117, 30494170, 22113633 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1377485731340769, 2046328105512000, 1802058637158797, - 62146136768173, 1356993908853901 -#else - 62071265, 20526136, 64138304, 30492664, 15640973, 26852766, - 40369837, 926049, 65424525, 20220784 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2013612215646735, 1830770575920375, 536135310219832, - 609272325580394, 270684344495013 -#else - 13908495, 30005160, 30919927, 27280607, 45587000, 7989038, - 9021034, 9078865, 3353509, 4033511 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1237542585982777, 2228682050256790, 1385281931622824, - 593183794882890, 493654978552689 -#else - 37445433, 18440821, 32259990, 33209950, 24295848, 20642309, - 23161162, 8839127, 27485041, 7356032 -#endif - }}, - }, - }, - { - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 47341488007760, 1891414891220257, 983894663308928, - 176161768286818, 1126261115179708 
-#else - 9661008, 705443, 11980065, 28184278, 65480320, 14661172, - 60762722, 2625014, 28431036, 16782598 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1694030170963455, 502038567066200, 1691160065225467, - 949628319562187, 275110186693066 -#else - 43269631, 25243016, 41163352, 7480957, 49427195, 25200248, - 44562891, 14150564, 15970762, 4099461 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1124515748676336, 1661673816593408, 1499640319059718, - 1584929449166988, 558148594103306 -#else - 29262576, 16756590, 26350592, 24760869, 8529670, 22346382, - 13617292, 23617289, 11465738, 8317062 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1784525599998356, 1619698033617383, 2097300287550715, - 258265458103756, 1905684794832758 -#else - 41615764, 26591503, 32500199, 24135381, 44070139, 31252209, - 14898636, 3848455, 20969334, 28396916 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1288941072872766, 931787902039402, 190731008859042, - 2006859954667190, 1005931482221702 -#else - 46724414, 19206718, 48772458, 13884721, 34069410, 2842113, - 45498038, 29904543, 11177094, 14989547 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1465551264822703, 152905080555927, 680334307368453, - 173227184634745, 666407097159852 -#else - 42612143, 21838415, 16959895, 2278463, 12066309, 10137771, - 13515641, 2581286, 38621356, 9930239 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2111017076203943, 1378760485794347, 1248583954016456, - 1352289194864422, 1895180776543896 -#else - 49357223, 31456605, 16544299, 20545132, 51194056, 18605350, - 18345766, 20150679, 16291480, 28240394 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 171348223915638, 662766099800389, 462338943760497, - 466917763340314, 656911292869115 -#else - 33879670, 2553287, 32678213, 9875984, 8534129, 6889387, - 57432090, 6957616, 4368891, 9788741 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 
488623681976577, 866497561541722, 1708105560937768, - 1673781214218839, 1506146329818807 -#else - 16660737, 7281060, 56278106, 12911819, 20108584, 25452756, - 45386327, 24941283, 16250551, 22443329 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 160425464456957, 950394373239689, 430497123340934, - 711676555398832, 320964687779005 -#else - 47343357, 2390525, 50557833, 14161979, 1905286, 6414907, - 4689584, 10604807, 36918461, 4782746 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 988979367990485, 1359729327576302, 1301834257246029, - 294141160829308, 29348272277475 -#else - 65754325, 14736940, 59741422, 20261545, 7710541, 19398842, - 57127292, 4383044, 22546403, 437323 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1434382743317910, 100082049942065, 221102347892623, - 186982837860588, 1305765053501834 -#else - 31665558, 21373968, 50922033, 1491338, 48740239, 3294681, - 27343084, 2786261, 36475274, 19457415 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 2205916462268190, 499863829790820, 961960554686616, - 158062762756985, 1841471168298305 -#else - 52641566, 32870716, 33734756, 7448551, 19294360, 14334329, - 47418233, 2355318, 47824193, 27440058 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1191737341426592, 1847042034978363, 1382213545049056, - 1039952395710448, 788812858896859 -#else - 15121312, 17758270, 6377019, 27523071, 56310752, 20596586, - 18952176, 15496498, 37728731, 11754227 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1346965964571152, 1291881610839830, 2142916164336056, - 786821641205979, 1571709146321039 -#else - 64471568, 20071356, 8488726, 19250536, 12728760, 31931939, - 7141595, 11724556, 22761615, 23420291 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 787164375951248, 202869205373189, 1356590421032140, - 1431233331032510, 786341368775957 -#else - 16918416, 11729663, 49025285, 3022986, 36093132, 20214772, - 38367678, 
21327038, 32851221, 11717399 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 492448143532951, 304105152670757, 1761767168301056, - 233782684697790, 1981295323106089 -#else - 11166615, 7338049, 60386341, 4531519, 37640192, 26252376, - 31474878, 3483633, 65915689, 29523600 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 665807507761866, 1343384868355425, 895831046139653, - 439338948736892, 1986828765695105 -#else - 66923210, 9921304, 31456609, 20017994, 55095045, 13348922, - 33142652, 6546660, 47123585, 29606055 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 756096210874553, 1721699973539149, 258765301727885, - 1390588532210645, 1212530909934781 -#else - 34648249, 11266711, 55911757, 25655328, 31703693, 3855903, - 58571733, 20721383, 36336829, 18068118 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 852891097972275, 1816988871354562, 1543772755726524, - 1174710635522444, 202129090724628 -#else - 49102387, 12709067, 3991746, 27075244, 45617340, 23004006, - 35973516, 17504552, 10928916, 3011958 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1205281565824323, 22430498399418, 992947814485516, - 1392458699738672, 688441466734558 -#else - 60151107, 17960094, 31696058, 334240, 29576716, 14796075, - 36277808, 20749251, 18008030, 10258577 -#endif - }}, - }, - { - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1050627428414972, 1955849529137135, 2171162376368357, - 91745868298214, 447733118757826 -#else - 44660220, 15655568, 7018479, 29144429, 36794597, 32352840, - 65255398, 1367119, 25127874, 6671743 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1287181461435438, 622722465530711, 880952150571872, - 741035693459198, 311565274989772 -#else - 29701166, 19180498, 56230743, 9279287, 67091296, 13127209, - 21382910, 11042292, 25838796, 4642684 -#endif - }}, - {{ -#if defined(BORINGSSL_CURVE25519_64BIT) - 1003649078149734, 545233927396469, 1849786171789880, - 1318943684880434, 
280345687170552 -#else - 46678630, 14955536, 42982517, 8124618, 61739576, 27563961, - 30468146, 19653792, 18423288, 4177476 -#endif - }}, + {0x50, 0x6a, 0x93, 0x8c, 0xe, 0x2b, 0x8, 0x69, 0xb6, 0xc5, 0xda, + 0xc1, 0x35, 0xa0, 0xc9, 0xf9, 0x34, 0xb6, 0xdf, 0xc4, 0x54, 0x3e, + 0xb7, 0x6f, 0x40, 0xc1, 0x2b, 0x1d, 0x9b, 0x41, 0x5, 0x40}, + {0xff, 0x3d, 0x94, 0x22, 0xb6, 0x4, 0xc6, 0xd2, 0xa0, 0xb3, 0xcf, + 0x44, 0xce, 0xbe, 0x8c, 0xbc, 0x78, 0x86, 0x80, 0x97, 0xf3, 0x4f, + 0x25, 0x5d, 0xbf, 0xa6, 0x1c, 0x3b, 0x4f, 0x61, 0xa3, 0xf}, + {0xf0, 0x82, 0xbe, 0xb9, 0xbd, 0xfe, 0x3, 0xa0, 0x90, 0xac, 0x44, + 0x3a, 0xaf, 0xc1, 0x89, 0x20, 0x8e, 0xfa, 0x54, 0x19, 0x91, 0x9f, + 0x49, 0xf8, 0x42, 0xab, 0x40, 0xef, 0x8a, 0x21, 0xba, 0x1f}, + }, + { + {0x94, 0x1, 0x7b, 0x3e, 0x4, 0x57, 0x3e, 0x4f, 0x7f, 0xaf, 0xda, + 0x8, 0xee, 0x3e, 0x1d, 0xa8, 0xf1, 0xde, 0xdc, 0x99, 0xab, 0xc6, + 0x39, 0xc8, 0xd5, 0x61, 0x77, 0xff, 0x13, 0x5d, 0x53, 0x6c}, + {0x3e, 0xf5, 0xc8, 0xfa, 0x48, 0x94, 0x54, 0xab, 0x41, 0x37, 0xa6, + 0x7b, 0x9a, 0xe8, 0xf6, 0x81, 0x1, 0x5e, 0x2b, 0x6c, 0x7d, 0x6c, + 0xfd, 0x74, 0x42, 0x6e, 0xc8, 0xa8, 0xca, 0x3a, 0x2e, 0x39}, + {0xaf, 0x35, 0x8a, 0x3e, 0xe9, 0x34, 0xbd, 0x4c, 0x16, 0xe8, 0x87, + 0x58, 0x44, 0x81, 0x7, 0x2e, 0xab, 0xb0, 0x9a, 0xf2, 0x76, 0x9c, + 0x31, 0x19, 0x3b, 0xc1, 0xa, 0xd5, 0xe4, 0x7f, 0xe1, 0x25}, + }, + { + {0xa7, 0x21, 0xf1, 0x76, 0xf5, 0x7f, 0x5f, 0x91, 0xe3, 0x87, 0xcd, + 0x2f, 0x27, 0x32, 0x4a, 0xc3, 0x26, 0xe5, 0x1b, 0x4d, 0xde, 0x2f, + 0xba, 0xcc, 0x9b, 0x89, 0x69, 0x89, 0x8f, 0x82, 0xba, 0x6b}, + {0x76, 0xf6, 0x4, 0x1e, 0xd7, 0x9b, 0x28, 0xa, 0x95, 0xf, 0x42, + 0xd6, 0x52, 0x1c, 0x8e, 0x20, 0xab, 0x1f, 0x69, 0x34, 0xb0, 0xd8, + 0x86, 0x51, 0x51, 0xb3, 0x9f, 0x2a, 0x44, 0x51, 0x57, 0x25}, + {0x1, 0x39, 0xfe, 0x90, 0x66, 0xbc, 0xd1, 0xe2, 0xd5, 0x7a, 0x99, + 0xa0, 0x18, 0x4a, 0xb5, 0x4c, 0xd4, 0x60, 0x84, 0xaf, 0x14, 0x69, + 0x1d, 0x97, 0xe4, 0x7b, 0x6b, 0x7f, 0x4f, 0x50, 0x9d, 0x55}, + }, + { + {0xfd, 0x66, 0xd2, 0xf6, 0xe7, 0x91, 0x48, 0x9c, 
0x1b, 0x78, 0x7, + 0x3, 0x9b, 0xa1, 0x44, 0x7, 0x3b, 0xe2, 0x61, 0x60, 0x1d, 0x8f, + 0x38, 0x88, 0xe, 0xd5, 0x4b, 0x35, 0xa3, 0xa6, 0x3e, 0x12}, + {0xd5, 0x54, 0xeb, 0xb3, 0x78, 0x83, 0x73, 0xa7, 0x7c, 0x3c, 0x55, + 0xa5, 0x66, 0xd3, 0x69, 0x1d, 0xba, 0x0, 0x28, 0xf9, 0x62, 0xcf, + 0x26, 0xa, 0x17, 0x32, 0x7e, 0x80, 0xd5, 0x12, 0xab, 0x1}, + {0x96, 0x2d, 0xe3, 0x41, 0x90, 0x18, 0x8d, 0x11, 0x48, 0x58, 0x31, + 0xd8, 0xc2, 0xe3, 0xed, 0xb9, 0xd9, 0x45, 0x32, 0xd8, 0x71, 0x42, + 0xab, 0x1e, 0x54, 0xa1, 0x18, 0xc9, 0xe2, 0x61, 0x39, 0x4a}, + }, + { + {0x1e, 0x3f, 0x23, 0xf3, 0x44, 0xd6, 0x27, 0x3, 0x16, 0xf0, 0xfc, + 0x34, 0xe, 0x26, 0x9a, 0x49, 0x79, 0xb9, 0xda, 0xf2, 0x16, 0xa7, + 0xb5, 0x83, 0x1f, 0x11, 0xd4, 0x9b, 0xad, 0xee, 0xac, 0x68}, + {0xa0, 0xbb, 0xe6, 0xf8, 0xe0, 0x3b, 0xdc, 0x71, 0xa, 0xe3, 0xff, + 0x7e, 0x34, 0xf8, 0xce, 0xd6, 0x6a, 0x47, 0x3a, 0xe1, 0x5f, 0x42, + 0x92, 0xa9, 0x63, 0xb7, 0x1d, 0xfb, 0xe3, 0xbc, 0xd6, 0x2c}, + {0x10, 0xc2, 0xd7, 0xf3, 0xe, 0xc9, 0xb4, 0x38, 0xc, 0x4, 0xad, + 0xb7, 0x24, 0x6e, 0x8e, 0x30, 0x23, 0x3e, 0xe7, 0xb7, 0xf1, 0xd9, + 0x60, 0x38, 0x97, 0xf5, 0x8, 0xb5, 0xd5, 0x60, 0x57, 0x59}, + }, + { + {0x90, 0x27, 0x2, 0xfd, 0xeb, 0xcb, 0x2a, 0x88, 0x60, 0x57, 0x11, + 0xc4, 0x5, 0x33, 0xaf, 0x89, 0xf4, 0x73, 0x34, 0x7d, 0xe3, 0x92, + 0xf4, 0x65, 0x2b, 0x5a, 0x51, 0x54, 0xdf, 0xc5, 0xb2, 0x2c}, + {0x97, 0x63, 0xaa, 0x4, 0xe1, 0xbf, 0x29, 0x61, 0xcb, 0xfc, 0xa7, + 0xa4, 0x8, 0x0, 0x96, 0x8f, 0x58, 0x94, 0x90, 0x7d, 0x89, 0xc0, + 0x8b, 0x3f, 0xa9, 0x91, 0xb2, 0xdc, 0x3e, 0xa4, 0x9f, 0x70}, + {0xca, 0x2a, 0xfd, 0x63, 0x8c, 0x5d, 0xa, 0xeb, 0xff, 0x4e, 0x69, + 0x2e, 0x66, 0xc1, 0x2b, 0xd2, 0x3a, 0xb0, 0xcb, 0xf8, 0x6e, 0xf3, + 0x23, 0x27, 0x1f, 0x13, 0xc8, 0xf0, 0xec, 0x29, 0xf0, 0x70}, + }, + { + {0xb9, 0xb0, 0x10, 0x5e, 0xaa, 0xaf, 0x6a, 0x2a, 0xa9, 0x1a, 0x4, + 0xef, 0x70, 0xa3, 0xf0, 0x78, 0x1f, 0xd6, 0x3a, 0xaa, 0x77, 0xfb, + 0x3e, 0x77, 0xe1, 0xd9, 0x4b, 0xa7, 0xa2, 0xa5, 0xec, 0x44}, + {0x33, 0x3e, 0xed, 0x2e, 0xb3, 0x7, 0x13, 
0x46, 0xe7, 0x81, 0x55, + 0xa4, 0x33, 0x2f, 0x4, 0xae, 0x66, 0x3, 0x5f, 0x19, 0xd3, 0x49, + 0x44, 0xc9, 0x58, 0x48, 0x31, 0x6c, 0x8a, 0x5d, 0x7d, 0xb}, + {0x43, 0xd5, 0x95, 0x7b, 0x32, 0x48, 0xd4, 0x25, 0x1d, 0xf, 0x34, + 0xa3, 0x0, 0x83, 0xd3, 0x70, 0x2b, 0xc5, 0xe1, 0x60, 0x1c, 0x53, + 0x1c, 0xde, 0xe4, 0xe9, 0x7d, 0x2c, 0x51, 0x24, 0x22, 0x27}, + }, + { + {0xfc, 0x75, 0xa9, 0x42, 0x8a, 0xbb, 0x7b, 0xbf, 0x58, 0xa3, 0xad, + 0x96, 0x77, 0x39, 0x5c, 0x8c, 0x48, 0xaa, 0xed, 0xcd, 0x6f, 0xc7, + 0x7f, 0xe2, 0xa6, 0x20, 0xbc, 0xf6, 0xd7, 0x5f, 0x73, 0x19}, + {0x2e, 0x34, 0xc5, 0x49, 0xaf, 0x92, 0xbc, 0x1a, 0xd0, 0xfa, 0xe6, + 0xb2, 0x11, 0xd8, 0xee, 0xff, 0x29, 0x4e, 0xc8, 0xfc, 0x8d, 0x8c, + 0xa2, 0xef, 0x43, 0xc5, 0x4c, 0xa4, 0x18, 0xdf, 0xb5, 0x11}, + {0x66, 0x42, 0xc8, 0x42, 0xd0, 0x90, 0xab, 0xe3, 0x7e, 0x54, 0x19, + 0x7f, 0xf, 0x8e, 0x84, 0xeb, 0xb9, 0x97, 0xa4, 0x65, 0xd0, 0xa1, + 0x3, 0x25, 0x5f, 0x89, 0xdf, 0x91, 0x11, 0x91, 0xef, 0xf}, }, }, }; @@ -7636,10 +3028,10 @@ static const ge_precomp k25519Precomp[32][8] = { #endif // OPENSSL_SMALL // Bi[i] = (2*i+1)*B -static const ge_precomp Bi[8] = { +static const bssl::ge_precomp Bi[8] = { { {{ -#if defined(BORINGSSL_CURVE25519_64BIT) +#if defined(OPENSSL_64_BIT) 1288382639258501, 245678601348599, 269427782077623, 1462984067271730, 137412439391563 #else @@ -7648,7 +3040,7 @@ static const ge_precomp Bi[8] = { #endif }}, {{ -#if defined(BORINGSSL_CURVE25519_64BIT) +#if defined(OPENSSL_64_BIT) 62697248952638, 204681361388450, 631292143396476, 338455783676468, 1213667448819585 #else @@ -7657,7 +3049,7 @@ static const ge_precomp Bi[8] = { #endif }}, {{ -#if defined(BORINGSSL_CURVE25519_64BIT) +#if defined(OPENSSL_64_BIT) 301289933810280, 1259582250014073, 1422107436869536, 796239922652654, 1953934009299142 #else @@ -7668,7 +3060,7 @@ static const ge_precomp Bi[8] = { }, { {{ -#if defined(BORINGSSL_CURVE25519_64BIT) +#if defined(OPENSSL_64_BIT) 1601611775252272, 1720807796594148, 1132070835939856, 1260455018889551, 
2147779492816911 #else @@ -7677,7 +3069,7 @@ static const ge_precomp Bi[8] = { #endif }}, {{ -#if defined(BORINGSSL_CURVE25519_64BIT) +#if defined(OPENSSL_64_BIT) 316559037616741, 2177824224946892, 1459442586438991, 1461528397712656, 751590696113597 #else @@ -7686,7 +3078,7 @@ static const ge_precomp Bi[8] = { #endif }}, {{ -#if defined(BORINGSSL_CURVE25519_64BIT) +#if defined(OPENSSL_64_BIT) 1850748884277385, 1200145853858453, 1068094770532492, 672251375690438, 1586055907191707 #else @@ -7697,7 +3089,7 @@ static const ge_precomp Bi[8] = { }, { {{ -#if defined(BORINGSSL_CURVE25519_64BIT) +#if defined(OPENSSL_64_BIT) 769950342298419, 132954430919746, 844085933195555, 974092374476333, 726076285546016 #else @@ -7706,7 +3098,7 @@ static const ge_precomp Bi[8] = { #endif }}, {{ -#if defined(BORINGSSL_CURVE25519_64BIT) +#if defined(OPENSSL_64_BIT) 425251763115706, 608463272472562, 442562545713235, 837766094556764, 374555092627893 #else @@ -7715,7 +3107,7 @@ static const ge_precomp Bi[8] = { #endif }}, {{ -#if defined(BORINGSSL_CURVE25519_64BIT) +#if defined(OPENSSL_64_BIT) 1086255230780037, 274979815921559, 1960002765731872, 929474102396301, 1190409889297339 #else @@ -7726,7 +3118,7 @@ static const ge_precomp Bi[8] = { }, { {{ -#if defined(BORINGSSL_CURVE25519_64BIT) +#if defined(OPENSSL_64_BIT) 665000864555967, 2065379846933859, 370231110385876, 350988370788628, 1233371373142985 #else @@ -7735,7 +3127,7 @@ static const ge_precomp Bi[8] = { #endif }}, {{ -#if defined(BORINGSSL_CURVE25519_64BIT) +#if defined(OPENSSL_64_BIT) 2019367628972465, 676711900706637, 110710997811333, 1108646842542025, 517791959672113 #else @@ -7744,7 +3136,7 @@ static const ge_precomp Bi[8] = { #endif }}, {{ -#if defined(BORINGSSL_CURVE25519_64BIT) +#if defined(OPENSSL_64_BIT) 965130719900578, 247011430587952, 526356006571389, 91986625355052, 2157223321444601 #else @@ -7755,7 +3147,7 @@ static const ge_precomp Bi[8] = { }, { {{ -#if defined(BORINGSSL_CURVE25519_64BIT) +#if defined(OPENSSL_64_BIT) 
1802695059465007, 1664899123557221, 593559490740857, 2160434469266659, 927570450755031 #else @@ -7764,7 +3156,7 @@ static const ge_precomp Bi[8] = { #endif }}, {{ -#if defined(BORINGSSL_CURVE25519_64BIT) +#if defined(OPENSSL_64_BIT) 1725674970513508, 1933645953859181, 1542344539275782, 1767788773573747, 1297447965928905 #else @@ -7773,7 +3165,7 @@ static const ge_precomp Bi[8] = { #endif }}, {{ -#if defined(BORINGSSL_CURVE25519_64BIT) +#if defined(OPENSSL_64_BIT) 1381809363726107, 1430341051343062, 2061843536018959, 1551778050872521, 2036394857967624 #else @@ -7784,7 +3176,7 @@ static const ge_precomp Bi[8] = { }, { {{ -#if defined(BORINGSSL_CURVE25519_64BIT) +#if defined(OPENSSL_64_BIT) 1970894096313054, 528066325833207, 1619374932191227, 2207306624415883, 1169170329061080 #else @@ -7793,7 +3185,7 @@ static const ge_precomp Bi[8] = { #endif }}, {{ -#if defined(BORINGSSL_CURVE25519_64BIT) +#if defined(OPENSSL_64_BIT) 2070390218572616, 1458919061857835, 624171843017421, 1055332792707765, 433987520732508 #else @@ -7802,7 +3194,7 @@ static const ge_precomp Bi[8] = { #endif }}, {{ -#if defined(BORINGSSL_CURVE25519_64BIT) +#if defined(OPENSSL_64_BIT) 893653801273833, 1168026499324677, 1242553501121234, 1306366254304474, 1086752658510815 #else @@ -7813,7 +3205,7 @@ static const ge_precomp Bi[8] = { }, { {{ -#if defined(BORINGSSL_CURVE25519_64BIT) +#if defined(OPENSSL_64_BIT) 213454002618221, 939771523987438, 1159882208056014, 317388369627517, 621213314200687 #else @@ -7822,7 +3214,7 @@ static const ge_precomp Bi[8] = { #endif }}, {{ -#if defined(BORINGSSL_CURVE25519_64BIT) +#if defined(OPENSSL_64_BIT) 1971678598905747, 338026507889165, 762398079972271, 655096486107477, 42299032696322 #else @@ -7831,7 +3223,7 @@ static const ge_precomp Bi[8] = { #endif }}, {{ -#if defined(BORINGSSL_CURVE25519_64BIT) +#if defined(OPENSSL_64_BIT) 177130678690680, 1754759263300204, 1864311296286618, 1180675631479880, 1292726903152791 #else @@ -7842,7 +3234,7 @@ static const ge_precomp Bi[8] 
= { }, { {{ -#if defined(BORINGSSL_CURVE25519_64BIT) +#if defined(OPENSSL_64_BIT) 1913163449625248, 460779200291993, 2193883288642314, 1008900146920800, 1721983679009502 #else @@ -7851,7 +3243,7 @@ static const ge_precomp Bi[8] = { #endif }}, {{ -#if defined(BORINGSSL_CURVE25519_64BIT) +#if defined(OPENSSL_64_BIT) 1070401523076875, 1272492007800961, 1910153608563310, 2075579521696771, 1191169788841221 #else @@ -7860,7 +3252,7 @@ static const ge_precomp Bi[8] = { #endif }}, {{ -#if defined(BORINGSSL_CURVE25519_64BIT) +#if defined(OPENSSL_64_BIT) 692896803108118, 500174642072499, 2068223309439677, 1162190621851337, 1426986007309901 #else diff --git a/third_party/boringssl/src/crypto/curve25519/internal.h b/third_party/boringssl/src/crypto/curve25519/internal.h index 76ff78fa..c3bc6fe0 100644 --- a/third_party/boringssl/src/crypto/curve25519/internal.h +++ b/third_party/boringssl/src/crypto/curve25519/internal.h @@ -1,42 +1,56 @@ -/* Copyright (c) 2020, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ - -#ifndef OPENSSL_HEADER_CURVE25519_INTERNAL_H -#define OPENSSL_HEADER_CURVE25519_INTERNAL_H - -#if defined(__cplusplus) -extern "C" { -#endif +// Copyright 2020 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_CRYPTO_CURVE25519_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_CURVE25519_INTERNAL_H -#include +#include #include "../internal.h" +BSSL_NAMESPACE_BEGIN #if defined(OPENSSL_ARM) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_APPLE) #define BORINGSSL_X25519_NEON // x25519_NEON is defined in asm/x25519-arm.S. 
-void x25519_NEON(uint8_t out[32], const uint8_t scalar[32], - const uint8_t point[32]); +extern "C" void x25519_NEON(uint8_t out[32], const uint8_t scalar[32], + const uint8_t point[32]); #endif -#if defined(BORINGSSL_HAS_UINT128) -#define BORINGSSL_CURVE25519_64BIT +#if !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_SMALL) && \ + defined(__GNUC__) && defined(__x86_64__) && !defined(OPENSSL_WINDOWS) +#define BORINGSSL_FE25519_ADX + +// fiat_curve25519_adx_mul is defined in +// third_party/fiat/asm/fiat_curve25519_adx_mul.S +extern "C" void __attribute__((sysv_abi)) fiat_curve25519_adx_mul( + uint64_t out[4], const uint64_t in1[4], const uint64_t in2[4]); + +// fiat_curve25519_adx_square is defined in +// third_party/fiat/asm/fiat_curve25519_adx_square.S +extern "C" void __attribute__((sysv_abi)) fiat_curve25519_adx_square( + uint64_t out[4], const uint64_t in[4]); + +// x25519_scalar_mult_adx is defined in third_party/fiat/curve25519_64_adx.h +void x25519_scalar_mult_adx(uint8_t out[32], const uint8_t scalar[32], + const uint8_t point[32]); +void x25519_ge_scalarmult_base_adx(uint8_t h[4][32], const uint8_t a[32]); + #endif -#if defined(BORINGSSL_CURVE25519_64BIT) +#if defined(OPENSSL_64_BIT) // fe means field element. Here the field is \Z/(2^255-19). An element t, // entries t[0]...t[4], represents the integer t[0]+2^51 t[1]+2^102 t[2]+2^153 // t[3]+2^204 t[4]. 
@@ -124,6 +138,8 @@ enum spake2_state_t { spake2_state_key_generated, }; +BSSL_NAMESPACE_END + struct spake2_ctx_st { uint8_t private_key[32]; uint8_t my_msg[32]; @@ -134,13 +150,14 @@ struct spake2_ctx_st { uint8_t *their_name; size_t their_name_len; enum spake2_role_t my_role; - enum spake2_state_t state; + enum bssl::spake2_state_t state; char disable_password_scalar_hack; }; +BSSL_NAMESPACE_BEGIN -#if defined(__cplusplus) -} // extern C -#endif +extern const uint8_t k25519Precomp[32][8][3][32]; + +BSSL_NAMESPACE_END -#endif // OPENSSL_HEADER_CURVE25519_INTERNAL_H +#endif // OPENSSL_HEADER_CRYPTO_CURVE25519_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/curve25519/spake25519.c b/third_party/boringssl/src/crypto/curve25519/spake25519.c deleted file mode 100644 index c45d15a5..00000000 --- a/third_party/boringssl/src/crypto/curve25519/spake25519.c +++ /dev/null @@ -1,524 +0,0 @@ -/* Copyright (c) 2016, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include - -#include -#include - -#include -#include -#include -#include - -#include "../fipsmodule/bn/internal.h" -#include "../internal.h" -#include "./internal.h" - - -// The following precomputation tables are for the following -// points used in the SPAKE2 protocol. 
-// -// N: -// x: 49918732221787544735331783592030787422991506689877079631459872391322455579424 -// y: 54629554431565467720832445949441049581317094546788069926228343916274969994000 -// encoded: 10e3df0ae37d8e7a99b5fe74b44672103dbddcbd06af680d71329a11693bc778 -// -// M: -// x: 31406539342727633121250288103050113562375374900226415211311216773867585644232 -// y: 21177308356423958466833845032658859666296341766942662650232962324899758529114 -// encoded: 5ada7e4bf6ddd9adb6626d32131c6b5c51a1e347a3478f53cfcf441b88eed12e -// -// These points and their precomputation tables are generated with the -// following Python code. For a description of the precomputation table, -// see curve25519.c in this directory. -// -// Exact copies of the source code are kept in bug 27296743. - -/* -import hashlib -import ed25519 as E # http://ed25519.cr.yp.to/python/ed25519.py - -SEED_N = 'edwards25519 point generation seed (N)' -SEED_M = 'edwards25519 point generation seed (M)' - -def genpoint(seed): - v = hashlib.sha256(seed).digest() - it = 1 - while True: - try: - x,y = E.decodepoint(v) - except Exception, e: - print e - it += 1 - v = hashlib.sha256(v).digest() - continue - print "Found in %d iterations:" % it - print " x = %d" % x - print " y = %d" % y - print " Encoded (hex)" - print E.encodepoint((x,y)).encode('hex') - return (x,y) - -def gentable(P): - t = [] - for i in range(1,16): - k = ((i >> 3 & 1) * (1 << 192) + - (i >> 2 & 1) * (1 << 128) + - (i >> 1 & 1) * (1 << 64) + - (i & 1)) - t.append(E.scalarmult(P, k)) - return ''.join(E.encodeint(x) + E.encodeint(y) for (x,y) in t) - -def printtable(table, name): - print "static const uint8_t %s[15 * 2 * 32] = {" % name, - for i in range(15 * 2 * 32): - if i % 12 == 0: - print "\n ", - print " 0x%02x," % ord(table[i]), - print "\n};" - -if __name__ == "__main__": - print "Searching for N" - N = genpoint(SEED_N) - print "Generating precomputation table for N" - Ntable = gentable(N) - printtable(Ntable, "kSpakeNSmallPrecomp") - - print 
"Searching for M" - M = genpoint(SEED_M) - print "Generating precomputation table for M" - Mtable = gentable(M) - printtable(Mtable, "kSpakeMSmallPrecomp") -*/ - -static const uint8_t kSpakeNSmallPrecomp[15 * 2 * 32] = { - 0x20, 0x1b, 0xc5, 0xb3, 0x43, 0x17, 0x71, 0x10, 0x44, 0x1e, 0x73, 0xb3, - 0xae, 0x3f, 0xbf, 0x9f, 0xf5, 0x44, 0xc8, 0x13, 0x8f, 0xd1, 0x01, 0xc2, - 0x8a, 0x1a, 0x6d, 0xea, 0x4d, 0x00, 0x5d, 0x6e, 0x10, 0xe3, 0xdf, 0x0a, - 0xe3, 0x7d, 0x8e, 0x7a, 0x99, 0xb5, 0xfe, 0x74, 0xb4, 0x46, 0x72, 0x10, - 0x3d, 0xbd, 0xdc, 0xbd, 0x06, 0xaf, 0x68, 0x0d, 0x71, 0x32, 0x9a, 0x11, - 0x69, 0x3b, 0xc7, 0x78, 0x93, 0xf1, 0x57, 0x97, 0x6e, 0xf0, 0x6e, 0x45, - 0x37, 0x4a, 0xf4, 0x0b, 0x18, 0x51, 0xf5, 0x4f, 0x67, 0x3c, 0xdc, 0xec, - 0x84, 0xed, 0xd0, 0xeb, 0xca, 0xfb, 0xdb, 0xff, 0x7f, 0xeb, 0xa8, 0x23, - 0x68, 0x87, 0x13, 0x64, 0x6a, 0x10, 0xf7, 0x45, 0xe0, 0x0f, 0x32, 0x21, - 0x59, 0x7c, 0x0e, 0x50, 0xad, 0x56, 0xd7, 0x12, 0x69, 0x7b, 0x58, 0xf8, - 0xb9, 0x3b, 0xa5, 0xbb, 0x4d, 0x1b, 0x87, 0x1c, 0x46, 0xa7, 0x17, 0x9d, - 0x6d, 0x84, 0x45, 0xbe, 0x7f, 0x95, 0xd2, 0x34, 0xcd, 0x89, 0x95, 0xc0, - 0xf0, 0xd3, 0xdf, 0x6e, 0x10, 0x4a, 0xe3, 0x7b, 0xce, 0x7f, 0x40, 0x27, - 0xc7, 0x2b, 0xab, 0x66, 0x03, 0x59, 0xb4, 0x7b, 0xc7, 0xc7, 0xf0, 0x39, - 0x9a, 0x33, 0x35, 0xbf, 0xcc, 0x2f, 0xf3, 0x2e, 0x68, 0x9d, 0x53, 0x5c, - 0x88, 0x52, 0xe3, 0x77, 0x90, 0xa1, 0x27, 0x85, 0xc5, 0x74, 0x7f, 0x23, - 0x0e, 0x93, 0x01, 0x3e, 0xe7, 0x2e, 0x2e, 0x95, 0xf3, 0x0d, 0xc2, 0x25, - 0x25, 0x39, 0x39, 0x3d, 0x6e, 0x8e, 0x89, 0xbd, 0xe8, 0xbb, 0x67, 0x5e, - 0x8c, 0x66, 0x8b, 0x63, 0x28, 0x1e, 0x4e, 0x74, 0x85, 0xa8, 0xaf, 0x0f, - 0x12, 0x5d, 0xb6, 0x8a, 0x83, 0x1a, 0x77, 0x76, 0x5e, 0x62, 0x8a, 0xa7, - 0x3c, 0xb8, 0x05, 0x57, 0x2b, 0xaf, 0x36, 0x2e, 0x10, 0x90, 0xb2, 0x39, - 0xb4, 0x3e, 0x75, 0x6d, 0x3a, 0xa8, 0x31, 0x35, 0xc2, 0x1e, 0x8f, 0xc2, - 0x79, 0x89, 0x35, 0x16, 0x26, 0xd1, 0xc7, 0x0b, 0x04, 0x1f, 0x1d, 0xf9, - 0x9c, 0x05, 0xa6, 0x6b, 0xb5, 0x19, 0x5a, 0x24, 0x6d, 0x91, 0xc5, 0x31, - 
0xfd, 0xc5, 0xfa, 0xe7, 0xa6, 0xcb, 0x0e, 0x4b, 0x18, 0x0d, 0x94, 0xc7, - 0xee, 0x1d, 0x46, 0x1f, 0x92, 0xb1, 0xb2, 0x4a, 0x2b, 0x43, 0x37, 0xfe, - 0xc2, 0x15, 0x11, 0x89, 0xef, 0x59, 0x73, 0x3c, 0x06, 0x76, 0x78, 0xcb, - 0xa6, 0x0d, 0x79, 0x5f, 0x28, 0x0b, 0x5b, 0x8c, 0x9e, 0xe4, 0xaa, 0x51, - 0x9a, 0x42, 0x6f, 0x11, 0x50, 0x3d, 0x01, 0xd6, 0x21, 0xc0, 0x99, 0x5e, - 0x1a, 0xe8, 0x81, 0x25, 0x80, 0xeb, 0xed, 0x5d, 0x37, 0x47, 0x30, 0x70, - 0xa0, 0x4e, 0x0b, 0x43, 0x17, 0xbe, 0xb6, 0x47, 0xe7, 0x2a, 0x62, 0x9d, - 0x5d, 0xa6, 0xc5, 0x33, 0x62, 0x9d, 0x56, 0x24, 0x9d, 0x1d, 0xb2, 0x13, - 0xbc, 0x17, 0x66, 0x43, 0xd1, 0x68, 0xd5, 0x3b, 0x17, 0x69, 0x17, 0xa6, - 0x06, 0x9e, 0x12, 0xb8, 0x7c, 0xd5, 0xaf, 0x3e, 0x21, 0x1b, 0x31, 0xeb, - 0x0b, 0xa4, 0x98, 0x1c, 0xf2, 0x6a, 0x5e, 0x7c, 0x9b, 0x45, 0x8f, 0xb2, - 0x12, 0x06, 0xd5, 0x8c, 0x1d, 0xb2, 0xa7, 0x57, 0x5f, 0x2f, 0x4f, 0xdb, - 0x52, 0x99, 0x7c, 0x58, 0x01, 0x5f, 0xf2, 0xa5, 0xf6, 0x51, 0x86, 0x21, - 0x2f, 0x5b, 0x8d, 0x6a, 0xae, 0x83, 0x34, 0x6d, 0x58, 0x4b, 0xef, 0xfe, - 0xbf, 0x73, 0x5d, 0xdb, 0xc4, 0x97, 0x2a, 0x85, 0xf3, 0x6c, 0x46, 0x42, - 0xb3, 0x90, 0xc1, 0x57, 0x97, 0x50, 0x35, 0xb1, 0x9d, 0xb7, 0xc7, 0x3c, - 0x85, 0x6d, 0x6c, 0xfd, 0xce, 0xb0, 0xc9, 0xa2, 0x77, 0xee, 0xc3, 0x6b, - 0x0c, 0x37, 0xfa, 0x30, 0x91, 0xd1, 0x2c, 0xb8, 0x5e, 0x7f, 0x81, 0x5f, - 0x87, 0xfd, 0x18, 0x02, 0x5a, 0x30, 0x4e, 0x62, 0xbc, 0x65, 0xc6, 0xce, - 0x1a, 0xcf, 0x2b, 0xaa, 0x56, 0x3e, 0x4d, 0xcf, 0xba, 0x62, 0x5f, 0x9a, - 0xd0, 0x72, 0xff, 0xef, 0x28, 0xbd, 0xbe, 0xd8, 0x57, 0x3d, 0xf5, 0x57, - 0x7d, 0xe9, 0x71, 0x31, 0xec, 0x98, 0x90, 0x94, 0xd9, 0x54, 0xbf, 0x84, - 0x0b, 0xe3, 0x06, 0x47, 0x19, 0x9a, 0x13, 0x1d, 0xef, 0x9d, 0x13, 0xf3, - 0xdb, 0xc3, 0x5c, 0x72, 0x9e, 0xed, 0x24, 0xaa, 0x64, 0xed, 0xe7, 0x0d, - 0xa0, 0x7c, 0x73, 0xba, 0x9b, 0x86, 0xa7, 0x3b, 0x55, 0xab, 0x58, 0x30, - 0xf1, 0x15, 0x81, 0x83, 0x2f, 0xf9, 0x62, 0x84, 0x98, 0x66, 0xf6, 0x55, - 0x21, 0xd8, 0xf2, 0x25, 0x64, 0x71, 0x4b, 0x12, 0x76, 0x59, 0xc5, 0xaa, - 
0x93, 0x67, 0xc3, 0x86, 0x25, 0xab, 0x4e, 0x4b, 0xf6, 0xd8, 0x3f, 0x44, - 0x2e, 0x11, 0xe0, 0xbd, 0x6a, 0xf2, 0x5d, 0xf5, 0xf9, 0x53, 0xea, 0xa4, - 0xc8, 0xd9, 0x50, 0x33, 0x81, 0xd9, 0xa8, 0x2d, 0x91, 0x7d, 0x13, 0x2a, - 0x11, 0xcf, 0xde, 0x3f, 0x0a, 0xd2, 0xbc, 0x33, 0xb2, 0x62, 0x53, 0xea, - 0x77, 0x88, 0x43, 0x66, 0x27, 0x43, 0x85, 0xe9, 0x5f, 0x55, 0xf5, 0x2a, - 0x8a, 0xac, 0xdf, 0xff, 0x9b, 0x4c, 0x96, 0x9c, 0xa5, 0x7a, 0xce, 0xd5, - 0x79, 0x18, 0xf1, 0x0b, 0x58, 0x95, 0x7a, 0xe7, 0xd3, 0x74, 0x65, 0x0b, - 0xa4, 0x64, 0x30, 0xe8, 0x5c, 0xfc, 0x55, 0x56, 0xee, 0x14, 0x14, 0xd3, - 0x45, 0x3b, 0xf8, 0xde, 0x05, 0x3e, 0xb9, 0x3c, 0xd7, 0x6a, 0x52, 0x72, - 0x5b, 0x39, 0x09, 0xbe, 0x82, 0x23, 0x10, 0x4a, 0xb7, 0xc3, 0xdc, 0x4c, - 0x5d, 0xc9, 0xf1, 0x14, 0x83, 0xf9, 0x0b, 0x9b, 0xe9, 0x23, 0x84, 0x6a, - 0xc4, 0x08, 0x3d, 0xda, 0x3d, 0x12, 0x95, 0x87, 0x18, 0xa4, 0x7d, 0x3f, - 0x23, 0xde, 0xd4, 0x1e, 0xa8, 0x47, 0xc3, 0x71, 0xdb, 0xf5, 0x03, 0x6c, - 0x57, 0xe7, 0xa4, 0x43, 0x82, 0x33, 0x7b, 0x62, 0x46, 0x7d, 0xf7, 0x10, - 0x69, 0x18, 0x38, 0x27, 0x9a, 0x6f, 0x38, 0xac, 0xfa, 0x92, 0xc5, 0xae, - 0x66, 0xa6, 0x73, 0x95, 0x15, 0x0e, 0x4c, 0x04, 0xb6, 0xfc, 0xf5, 0xc7, - 0x21, 0x3a, 0x99, 0xdb, 0x0e, 0x36, 0xf0, 0x56, 0xbc, 0x75, 0xf9, 0x87, - 0x9b, 0x11, 0x18, 0x92, 0x64, 0x1a, 0xe7, 0xc7, 0xab, 0x5a, 0xc7, 0x26, - 0x7f, 0x13, 0x98, 0x42, 0x52, 0x43, 0xdb, 0xc8, 0x6d, 0x0b, 0xb7, 0x31, - 0x93, 0x24, 0xd6, 0xe8, 0x24, 0x1f, 0x6f, 0x21, 0xa7, 0x8c, 0xeb, 0xdb, - 0x83, 0xb8, 0x89, 0xe3, 0xc1, 0xd7, 0x69, 0x3b, 0x02, 0x6b, 0x54, 0x0f, - 0x84, 0x2f, 0xb5, 0x5c, 0x17, 0x77, 0xbe, 0xe5, 0x61, 0x0d, 0xc5, 0xdf, - 0x3b, 0xcf, 0x3e, 0x93, 0x4f, 0xf5, 0x89, 0xb9, 0x5a, 0xc5, 0x29, 0x31, - 0xc0, 0xc2, 0xff, 0xe5, 0x3f, 0xa6, 0xac, 0x03, 0xca, 0xf5, 0xff, 0xe0, - 0x36, 0xce, 0xf3, 0xe2, 0xb7, 0x9c, 0x02, 0xe9, 0x9e, 0xd2, 0xbc, 0x87, - 0x2f, 0x3d, 0x9a, 0x1d, 0x8f, 0xc5, 0x72, 0xb8, 0xa2, 0x01, 0xd4, 0x68, - 0xb1, 0x84, 0x16, 0x10, 0xf6, 0xf3, 0x52, 0x25, 0xd9, 0xdc, 0x4c, 0xdd, - 
0x0f, 0xd6, 0x4a, 0xcf, 0x60, 0x96, 0x7e, 0xcc, 0x42, 0x0f, 0x64, 0x9d, - 0x72, 0x46, 0x04, 0x07, 0xf2, 0x5b, 0xf4, 0x07, 0xd1, 0xf4, 0x59, 0x71, -}; - -static const uint8_t kSpakeMSmallPrecomp[15 * 2 * 32] = { - 0xc8, 0xa6, 0x63, 0xc5, 0x97, 0xf1, 0xee, 0x40, 0xab, 0x62, 0x42, 0xee, - 0x25, 0x6f, 0x32, 0x6c, 0x75, 0x2c, 0xa7, 0xd3, 0xbd, 0x32, 0x3b, 0x1e, - 0x11, 0x9c, 0xbd, 0x04, 0xa9, 0x78, 0x6f, 0x45, 0x5a, 0xda, 0x7e, 0x4b, - 0xf6, 0xdd, 0xd9, 0xad, 0xb6, 0x62, 0x6d, 0x32, 0x13, 0x1c, 0x6b, 0x5c, - 0x51, 0xa1, 0xe3, 0x47, 0xa3, 0x47, 0x8f, 0x53, 0xcf, 0xcf, 0x44, 0x1b, - 0x88, 0xee, 0xd1, 0x2e, 0x03, 0x89, 0xaf, 0xc0, 0x61, 0x2d, 0x9e, 0x35, - 0xeb, 0x0e, 0x03, 0xe0, 0xb7, 0xfb, 0xa5, 0xbc, 0x44, 0xbe, 0x0c, 0x89, - 0x0a, 0x0f, 0xd6, 0x59, 0x47, 0x9e, 0xe6, 0x3d, 0x36, 0x9d, 0xff, 0x44, - 0x5e, 0xac, 0xab, 0xe5, 0x3a, 0xd5, 0xb0, 0x35, 0x9f, 0x6d, 0x7f, 0xba, - 0xc0, 0x85, 0x0e, 0xf4, 0x70, 0x3f, 0x13, 0x90, 0x4c, 0x50, 0x1a, 0xee, - 0xc5, 0xeb, 0x69, 0xfe, 0x98, 0x42, 0x87, 0x1d, 0xce, 0x6c, 0x29, 0xaa, - 0x2b, 0x31, 0xc2, 0x38, 0x7b, 0x6b, 0xee, 0x88, 0x0b, 0xba, 0xce, 0xa8, - 0xca, 0x19, 0x60, 0x1b, 0x16, 0xf1, 0x25, 0x1e, 0xcf, 0x63, 0x66, 0x1e, - 0xbb, 0x63, 0xeb, 0x7d, 0xca, 0xd2, 0xb4, 0x23, 0x5a, 0x01, 0x6f, 0x05, - 0xd1, 0xdc, 0x41, 0x73, 0x75, 0xc0, 0xfd, 0x30, 0x91, 0x52, 0x68, 0x96, - 0x45, 0xb3, 0x66, 0x01, 0x3b, 0x53, 0x89, 0x3c, 0x69, 0xbc, 0x6c, 0x69, - 0xe3, 0x51, 0x8f, 0xe3, 0xd2, 0x84, 0xd5, 0x28, 0x66, 0xb5, 0xe6, 0x06, - 0x09, 0xfe, 0x6d, 0xb0, 0x72, 0x16, 0xe0, 0x8a, 0xce, 0x61, 0x65, 0xa9, - 0x21, 0x32, 0x48, 0xdc, 0x7a, 0x1d, 0xe1, 0x38, 0x7f, 0x8c, 0x75, 0x88, - 0x3d, 0x08, 0xa9, 0x4a, 0x6f, 0x3d, 0x9f, 0x7f, 0x3f, 0xbd, 0x57, 0x6b, - 0x19, 0xce, 0x3f, 0x4a, 0xc9, 0xd3, 0xf9, 0x6e, 0x72, 0x7b, 0x5b, 0x74, - 0xea, 0xbe, 0x9c, 0x7a, 0x6d, 0x9c, 0x40, 0x49, 0xe6, 0xfb, 0x2a, 0x1a, - 0x75, 0x70, 0xe5, 0x4e, 0xed, 0x74, 0xe0, 0x75, 0xac, 0xc0, 0xb1, 0x11, - 0x3e, 0xf2, 0xaf, 0x88, 0x4d, 0x66, 0xb6, 0xf6, 0x15, 0x4f, 0x3c, 0x6c, - 0x77, 
0xae, 0x47, 0x51, 0x63, 0x9a, 0xfe, 0xe1, 0xb4, 0x1a, 0x12, 0xdf, - 0xe9, 0x54, 0x8d, 0x3b, 0x30, 0x2a, 0x75, 0xe3, 0xe5, 0x29, 0xb1, 0x4c, - 0xb0, 0x7c, 0x6d, 0xb5, 0xae, 0x85, 0xdb, 0x1e, 0x38, 0x55, 0x96, 0xa5, - 0x5b, 0x9f, 0x15, 0x23, 0x28, 0x36, 0xb8, 0xa2, 0x41, 0xb4, 0xd7, 0x19, - 0x91, 0x8d, 0x26, 0x3e, 0xca, 0x9c, 0x05, 0x7a, 0x2b, 0x60, 0x45, 0x86, - 0x8b, 0xee, 0x64, 0x6f, 0x5c, 0x09, 0x4d, 0x4b, 0x5a, 0x7f, 0xb0, 0xc3, - 0x26, 0x9d, 0x8b, 0xb8, 0x83, 0x69, 0xcf, 0x16, 0x72, 0x62, 0x3e, 0x5e, - 0x53, 0x4f, 0x9c, 0x73, 0x76, 0xfc, 0x19, 0xef, 0xa0, 0x74, 0x3a, 0x11, - 0x1e, 0xd0, 0x4d, 0xb7, 0x87, 0xa1, 0xd6, 0x87, 0x6c, 0x0e, 0x6c, 0x8c, - 0xe9, 0xa0, 0x44, 0xc4, 0x72, 0x3e, 0x73, 0x17, 0x13, 0xd1, 0x4e, 0x3d, - 0x8e, 0x1d, 0x5a, 0x8b, 0x75, 0xcb, 0x59, 0x2c, 0x47, 0x87, 0x15, 0x41, - 0xfe, 0x08, 0xe9, 0xa6, 0x97, 0x17, 0x08, 0x26, 0x6a, 0xb5, 0xbb, 0x73, - 0xaa, 0xb8, 0x5b, 0x65, 0x65, 0x5b, 0x30, 0x9e, 0x62, 0x59, 0x02, 0xf8, - 0xb8, 0x0f, 0x32, 0x10, 0xc1, 0x36, 0x08, 0x52, 0x98, 0x4a, 0x1e, 0xf0, - 0xab, 0x21, 0x5e, 0xde, 0x16, 0x0c, 0xda, 0x09, 0x99, 0x6b, 0x9e, 0xc0, - 0x90, 0xa5, 0x5a, 0xcc, 0xb0, 0xb7, 0xbb, 0xd2, 0x8b, 0x5f, 0xd3, 0x3b, - 0x3e, 0x8c, 0xa5, 0x71, 0x66, 0x06, 0xe3, 0x28, 0xd4, 0xf8, 0x3f, 0xe5, - 0x27, 0xdf, 0xfe, 0x0f, 0x09, 0xb2, 0x8a, 0x09, 0x5a, 0x23, 0x61, 0x0d, - 0x2d, 0xf5, 0x44, 0xf1, 0x5c, 0xf8, 0x82, 0x4e, 0xdc, 0x78, 0x7a, 0xab, - 0xc3, 0x57, 0x91, 0xaf, 0x65, 0x6e, 0x71, 0xf1, 0x44, 0xbf, 0xed, 0x43, - 0x50, 0xb4, 0x67, 0x48, 0xef, 0x5a, 0x10, 0x46, 0x81, 0xb4, 0x0c, 0xc8, - 0x48, 0xed, 0x99, 0x7a, 0x45, 0xa5, 0x92, 0xc3, 0x69, 0xd6, 0xd7, 0x8a, - 0x20, 0x1b, 0xeb, 0x8f, 0xb2, 0xff, 0xec, 0x6d, 0x76, 0x04, 0xf8, 0xc2, - 0x58, 0x9b, 0xf2, 0x20, 0x53, 0xc4, 0x74, 0x91, 0x19, 0xdd, 0x2d, 0x12, - 0x53, 0xc7, 0x6e, 0xd0, 0x02, 0x51, 0x3c, 0xa6, 0x7d, 0x80, 0x75, 0x6b, - 0x1d, 0xdf, 0xf8, 0x6a, 0x52, 0xbb, 0x81, 0xf8, 0x30, 0x45, 0xef, 0x51, - 0x85, 0x36, 0xbe, 0x8e, 0xcf, 0x0b, 0x9a, 0x46, 0xe8, 0x3f, 0x99, 0xfd, - 0xf7, 
0xd9, 0x3e, 0x84, 0xe5, 0xe3, 0x37, 0xcf, 0x98, 0x7f, 0xeb, 0x5e, - 0x5a, 0x53, 0x77, 0x1c, 0x20, 0xdc, 0xf1, 0x20, 0x99, 0xec, 0x60, 0x40, - 0x93, 0xef, 0x5c, 0x1c, 0x81, 0xe2, 0xa5, 0xad, 0x2a, 0xc2, 0xdb, 0x6b, - 0xc1, 0x7e, 0x8f, 0xa9, 0x23, 0x5b, 0xd9, 0x0d, 0xfe, 0xa0, 0xac, 0x11, - 0x28, 0xba, 0x8e, 0x92, 0x07, 0x2d, 0x07, 0x40, 0x83, 0x14, 0x4c, 0x35, - 0x8d, 0xd0, 0x11, 0xff, 0x98, 0xdb, 0x00, 0x30, 0x6f, 0x65, 0xb6, 0xa0, - 0x7f, 0x9c, 0x08, 0xb8, 0xce, 0xb3, 0xa8, 0x42, 0xd3, 0x84, 0x45, 0xe1, - 0xe3, 0x8f, 0xa6, 0x89, 0x21, 0xd7, 0x74, 0x02, 0x4d, 0x64, 0xdf, 0x54, - 0x15, 0x9e, 0xba, 0x12, 0x49, 0x09, 0x41, 0xf6, 0x10, 0x24, 0xa1, 0x84, - 0x15, 0xfd, 0x68, 0x6a, 0x57, 0x66, 0xb3, 0x6d, 0x4c, 0xea, 0xbf, 0xbc, - 0x60, 0x3f, 0x52, 0x1c, 0x44, 0x1b, 0xc0, 0x4a, 0x25, 0xe3, 0xd9, 0x4c, - 0x9a, 0x74, 0xad, 0xfc, 0x9e, 0x8d, 0x0b, 0x18, 0x66, 0x24, 0xd1, 0x06, - 0xac, 0x68, 0xc1, 0xae, 0x14, 0xce, 0xb1, 0xf3, 0x86, 0x9f, 0x87, 0x11, - 0xd7, 0x9f, 0x30, 0x92, 0xdb, 0xec, 0x0b, 0x4a, 0xe8, 0xf6, 0x53, 0x36, - 0x68, 0x12, 0x11, 0x5e, 0xe0, 0x34, 0xa4, 0xff, 0x00, 0x0a, 0x26, 0xb8, - 0x62, 0x79, 0x9c, 0x0c, 0xd5, 0xe5, 0xf5, 0x1c, 0x1a, 0x16, 0x84, 0x4d, - 0x8e, 0x5d, 0x31, 0x7e, 0xf7, 0xe2, 0xd3, 0xa1, 0x41, 0x90, 0x61, 0x5d, - 0x04, 0xb2, 0x9a, 0x18, 0x9e, 0x54, 0xfb, 0xd1, 0x61, 0x95, 0x1b, 0x08, - 0xca, 0x7c, 0x49, 0x44, 0x74, 0x1d, 0x2f, 0xca, 0xc4, 0x7a, 0xe1, 0x8b, - 0x2f, 0xbb, 0x96, 0xee, 0x19, 0x8a, 0x5d, 0xfb, 0x3e, 0x82, 0xe7, 0x15, - 0xdb, 0x29, 0x14, 0xee, 0xc9, 0x4d, 0x9a, 0xfb, 0x9f, 0x8a, 0xbb, 0x17, - 0x37, 0x1b, 0x6e, 0x28, 0x6c, 0xf9, 0xff, 0xb5, 0xb5, 0x8b, 0x9d, 0x88, - 0x20, 0x08, 0x10, 0xd7, 0xca, 0x58, 0xf6, 0xe1, 0x32, 0x91, 0x6f, 0x36, - 0xc0, 0xad, 0xc1, 0x57, 0x5d, 0x76, 0x31, 0x43, 0xf3, 0xdd, 0xec, 0xf1, - 0xa9, 0x79, 0xe9, 0xe9, 0x85, 0xd7, 0x91, 0xc7, 0x31, 0x62, 0x3c, 0xd2, - 0x90, 0x2c, 0x9c, 0xa4, 0x56, 0x37, 0x7b, 0xbe, 0x40, 0x58, 0xc0, 0x81, - 0x83, 0x22, 0xe8, 0x13, 0x79, 0x18, 0xdb, 0x3a, 0x1b, 0x31, 0x0d, 0x00, - 0x6c, 
0x22, 0x62, 0x75, 0x70, 0xd8, 0x96, 0x59, 0x99, 0x44, 0x79, 0x71, - 0xa6, 0x76, 0x81, 0x28, 0xb2, 0x65, 0xe8, 0x47, 0x14, 0xc6, 0x39, 0x06, -}; - -SPAKE2_CTX *SPAKE2_CTX_new(enum spake2_role_t my_role, - const uint8_t *my_name, size_t my_name_len, - const uint8_t *their_name, size_t their_name_len) { - SPAKE2_CTX *ctx = OPENSSL_malloc(sizeof(SPAKE2_CTX)); - if (ctx == NULL) { - return NULL; - } - - OPENSSL_memset(ctx, 0, sizeof(SPAKE2_CTX)); - ctx->my_role = my_role; - - CBS my_name_cbs, their_name_cbs; - CBS_init(&my_name_cbs, my_name, my_name_len); - CBS_init(&their_name_cbs, their_name, their_name_len); - if (!CBS_stow(&my_name_cbs, &ctx->my_name, &ctx->my_name_len) || - !CBS_stow(&their_name_cbs, &ctx->their_name, &ctx->their_name_len)) { - SPAKE2_CTX_free(ctx); - return NULL; - } - - return ctx; -} - -void SPAKE2_CTX_free(SPAKE2_CTX *ctx) { - if (ctx == NULL) { - return; - } - - OPENSSL_free(ctx->my_name); - OPENSSL_free(ctx->their_name); - OPENSSL_free(ctx); -} - -// left_shift_3 sets |n| to |n|*8, where |n| is represented in little-endian -// order. -static void left_shift_3(uint8_t n[32]) { - uint8_t carry = 0; - unsigned i; - - for (i = 0; i < 32; i++) { - const uint8_t next_carry = n[i] >> 5; - n[i] = (n[i] << 3) | carry; - carry = next_carry; - } -} - -typedef struct { - BN_ULONG words[32 / sizeof(BN_ULONG)]; -} scalar; - -// kOrder is the order of the prime-order subgroup of curve25519. -static const scalar kOrder = { - {TOBN(0x5812631a, 0x5cf5d3ed), TOBN(0x14def9de, 0xa2f79cd6), - TOBN(0x00000000, 0x00000000), TOBN(0x10000000, 0x00000000)}}; - -// scalar_cmov copies |src| to |dest| if |mask| is all ones. -static void scalar_cmov(scalar *dest, const scalar *src, crypto_word_t mask) { - bn_select_words(dest->words, mask, src->words, dest->words, - OPENSSL_ARRAY_SIZE(dest->words)); -} - -// scalar_double sets |s| to |2×s|. 
-static void scalar_double(scalar *s) { - bn_add_words(s->words, s->words, s->words, OPENSSL_ARRAY_SIZE(s->words)); -} - -// scalar_add sets |dest| to |dest| plus |src|. -static void scalar_add(scalar *dest, const scalar *src) { - bn_add_words(dest->words, dest->words, src->words, - OPENSSL_ARRAY_SIZE(dest->words)); -} - -int SPAKE2_generate_msg(SPAKE2_CTX *ctx, uint8_t *out, size_t *out_len, - size_t max_out_len, const uint8_t *password, - size_t password_len) { - if (ctx->state != spake2_state_init) { - return 0; - } - - if (max_out_len < sizeof(ctx->my_msg)) { - return 0; - } - - uint8_t private_tmp[64]; - RAND_bytes(private_tmp, sizeof(private_tmp)); - x25519_sc_reduce(private_tmp); - // Multiply by the cofactor (eight) so that we'll clear it when operating on - // the peer's point later in the protocol. - left_shift_3(private_tmp); - OPENSSL_memcpy(ctx->private_key, private_tmp, sizeof(ctx->private_key)); - - ge_p3 P; - x25519_ge_scalarmult_base(&P, ctx->private_key); - - // mask = h(password) * . - uint8_t password_tmp[SHA512_DIGEST_LENGTH]; - SHA512(password, password_len, password_tmp); - OPENSSL_memcpy(ctx->password_hash, password_tmp, sizeof(ctx->password_hash)); - x25519_sc_reduce(password_tmp); - - // Due to a copy-paste error, the call to |left_shift_3| was omitted after - // the |x25519_sc_reduce|, just above. This meant that |ctx->password_scalar| - // was not a multiple of eight to clear the cofactor and thus three bits of - // the password hash would leak. In order to fix this in a unilateral way, - // points of small order are added to the mask point such that it is in the - // prime-order subgroup. Since the ephemeral scalar is a multiple of eight, - // these points will cancel out when calculating the shared secret. - // - // Adding points of small order is the same as adding multiples of the prime - // order to the password scalar. Since that's faster, that is what is done - // below. 
The prime order (kOrder) is a large prime, thus odd, thus the LSB - // is one. So adding it will flip the LSB. Adding twice it will flip the next - // bit and so one for all the bottom three bits. - - scalar password_scalar; - OPENSSL_memcpy(&password_scalar, password_tmp, sizeof(password_scalar)); - - // |password_scalar| is the result of |x25519_sc_reduce| and thus is, at - // most, $l-1$ (where $l$ is |kOrder|, the order of the prime-order subgroup - // of Ed25519). In the following, we may add $l + 2×l + 4×l$ for a max value - // of $8×l-1$. That is < 2**256, as required. - - if (!ctx->disable_password_scalar_hack) { - scalar order = kOrder; - scalar tmp; - - OPENSSL_memset(&tmp, 0, sizeof(tmp)); - scalar_cmov(&tmp, &order, - constant_time_eq_w(password_scalar.words[0] & 1, 1)); - scalar_add(&password_scalar, &tmp); - - scalar_double(&order); - OPENSSL_memset(&tmp, 0, sizeof(tmp)); - scalar_cmov(&tmp, &order, - constant_time_eq_w(password_scalar.words[0] & 2, 2)); - scalar_add(&password_scalar, &tmp); - - scalar_double(&order); - OPENSSL_memset(&tmp, 0, sizeof(tmp)); - scalar_cmov(&tmp, &order, - constant_time_eq_w(password_scalar.words[0] & 4, 4)); - scalar_add(&password_scalar, &tmp); - - assert((password_scalar.words[0] & 7) == 0); - } - - OPENSSL_memcpy(ctx->password_scalar, password_scalar.words, - sizeof(ctx->password_scalar)); - - ge_p3 mask; - x25519_ge_scalarmult_small_precomp(&mask, ctx->password_scalar, - ctx->my_role == spake2_role_alice - ? kSpakeMSmallPrecomp - : kSpakeNSmallPrecomp); - - // P* = P + mask. 
- ge_cached mask_cached; - x25519_ge_p3_to_cached(&mask_cached, &mask); - ge_p1p1 Pstar; - x25519_ge_add(&Pstar, &P, &mask_cached); - - // Encode P* - ge_p2 Pstar_proj; - x25519_ge_p1p1_to_p2(&Pstar_proj, &Pstar); - x25519_ge_tobytes(ctx->my_msg, &Pstar_proj); - - OPENSSL_memcpy(out, ctx->my_msg, sizeof(ctx->my_msg)); - *out_len = sizeof(ctx->my_msg); - ctx->state = spake2_state_msg_generated; - - return 1; -} - -static void update_with_length_prefix(SHA512_CTX *sha, const uint8_t *data, - const size_t len) { - uint8_t len_le[8]; - size_t l = len; - unsigned i; - - for (i = 0; i < 8; i++) { - len_le[i] = l & 0xff; - l >>= 8; - } - - SHA512_Update(sha, len_le, sizeof(len_le)); - SHA512_Update(sha, data, len); -} - -int SPAKE2_process_msg(SPAKE2_CTX *ctx, uint8_t *out_key, size_t *out_key_len, - size_t max_out_key_len, const uint8_t *their_msg, - size_t their_msg_len) { - if (ctx->state != spake2_state_msg_generated || - their_msg_len != 32) { - return 0; - } - - ge_p3 Qstar; - if (!x25519_ge_frombytes_vartime(&Qstar, their_msg)) { - // Point received from peer was not on the curve. - return 0; - } - - // Unmask peer's value. - ge_p3 peers_mask; - x25519_ge_scalarmult_small_precomp(&peers_mask, ctx->password_scalar, - ctx->my_role == spake2_role_alice - ? 
kSpakeNSmallPrecomp - : kSpakeMSmallPrecomp); - - ge_cached peers_mask_cached; - x25519_ge_p3_to_cached(&peers_mask_cached, &peers_mask); - - ge_p1p1 Q_compl; - ge_p3 Q_ext; - x25519_ge_sub(&Q_compl, &Qstar, &peers_mask_cached); - x25519_ge_p1p1_to_p3(&Q_ext, &Q_compl); - - ge_p2 dh_shared; - x25519_ge_scalarmult(&dh_shared, ctx->private_key, &Q_ext); - - uint8_t dh_shared_encoded[32]; - x25519_ge_tobytes(dh_shared_encoded, &dh_shared); - - SHA512_CTX sha; - SHA512_Init(&sha); - if (ctx->my_role == spake2_role_alice) { - update_with_length_prefix(&sha, ctx->my_name, ctx->my_name_len); - update_with_length_prefix(&sha, ctx->their_name, ctx->their_name_len); - update_with_length_prefix(&sha, ctx->my_msg, sizeof(ctx->my_msg)); - update_with_length_prefix(&sha, their_msg, 32); - } else { - update_with_length_prefix(&sha, ctx->their_name, ctx->their_name_len); - update_with_length_prefix(&sha, ctx->my_name, ctx->my_name_len); - update_with_length_prefix(&sha, their_msg, 32); - update_with_length_prefix(&sha, ctx->my_msg, sizeof(ctx->my_msg)); - } - update_with_length_prefix(&sha, dh_shared_encoded, sizeof(dh_shared_encoded)); - update_with_length_prefix(&sha, ctx->password_hash, - sizeof(ctx->password_hash)); - - uint8_t key[SHA512_DIGEST_LENGTH]; - SHA512_Final(key, &sha); - - size_t to_copy = max_out_key_len; - if (to_copy > sizeof(key)) { - to_copy = sizeof(key); - } - OPENSSL_memcpy(out_key, key, to_copy); - *out_key_len = to_copy; - ctx->state = spake2_state_key_generated; - - return 1; -} diff --git a/third_party/boringssl/src/crypto/curve25519/spake25519.cc b/third_party/boringssl/src/crypto/curve25519/spake25519.cc new file mode 100644 index 00000000..0f6cf04e --- /dev/null +++ b/third_party/boringssl/src/crypto/curve25519/spake25519.cc @@ -0,0 +1,532 @@ +// Copyright 2016 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + +#include + +#include +#include +#include +#include + +#include "../fipsmodule/bn/internal.h" +#include "../internal.h" +#include "../mem_internal.h" +#include "./internal.h" + + +using namespace bssl; + +// The following precomputation tables are for the following +// points used in the SPAKE2 protocol. +// +// N: +// x: +// 49918732221787544735331783592030787422991506689877079631459872391322455579424 +// y: +// 54629554431565467720832445949441049581317094546788069926228343916274969994000 +// encoded: 10e3df0ae37d8e7a99b5fe74b44672103dbddcbd06af680d71329a11693bc778 +// +// M: +// x: +// 31406539342727633121250288103050113562375374900226415211311216773867585644232 +// y: +// 21177308356423958466833845032658859666296341766942662650232962324899758529114 +// encoded: 5ada7e4bf6ddd9adb6626d32131c6b5c51a1e347a3478f53cfcf441b88eed12e +// +// These points and their precomputation tables are generated with the +// following Python code. For a description of the precomputation table, +// see curve25519.c in this directory. +// +// Exact copies of the source code are kept in bug 27296743. 
+ +/* +import hashlib +import ed25519 as E # http://ed25519.cr.yp.to/python/ed25519.py + +SEED_N = 'edwards25519 point generation seed (N)' +SEED_M = 'edwards25519 point generation seed (M)' + +def genpoint(seed): + v = hashlib.sha256(seed).digest() + it = 1 + while True: + try: + x,y = E.decodepoint(v) + except Exception, e: + print e + it += 1 + v = hashlib.sha256(v).digest() + continue + print "Found in %d iterations:" % it + print " x = %d" % x + print " y = %d" % y + print " Encoded (hex)" + print E.encodepoint((x,y)).encode('hex') + return (x,y) + +def gentable(P): + t = [] + for i in range(1,16): + k = ((i >> 3 & 1) * (1 << 192) + + (i >> 2 & 1) * (1 << 128) + + (i >> 1 & 1) * (1 << 64) + + (i & 1)) + t.append(E.scalarmult(P, k)) + return ''.join(E.encodeint(x) + E.encodeint(y) for (x,y) in t) + +def printtable(table, name): + print "static const uint8_t %s[15 * 2 * 32] = {" % name, + for i in range(15 * 2 * 32): + if i % 12 == 0: + print "\n ", + print " 0x%02x," % ord(table[i]), + print "\n};" + +if __name__ == "__main__": + print "Searching for N" + N = genpoint(SEED_N) + print "Generating precomputation table for N" + Ntable = gentable(N) + printtable(Ntable, "kSpakeNSmallPrecomp") + + print "Searching for M" + M = genpoint(SEED_M) + print "Generating precomputation table for M" + Mtable = gentable(M) + printtable(Mtable, "kSpakeMSmallPrecomp") +*/ + +static const uint8_t kSpakeNSmallPrecomp[15 * 2 * 32] = { + 0x20, 0x1b, 0xc5, 0xb3, 0x43, 0x17, 0x71, 0x10, 0x44, 0x1e, 0x73, 0xb3, + 0xae, 0x3f, 0xbf, 0x9f, 0xf5, 0x44, 0xc8, 0x13, 0x8f, 0xd1, 0x01, 0xc2, + 0x8a, 0x1a, 0x6d, 0xea, 0x4d, 0x00, 0x5d, 0x6e, 0x10, 0xe3, 0xdf, 0x0a, + 0xe3, 0x7d, 0x8e, 0x7a, 0x99, 0xb5, 0xfe, 0x74, 0xb4, 0x46, 0x72, 0x10, + 0x3d, 0xbd, 0xdc, 0xbd, 0x06, 0xaf, 0x68, 0x0d, 0x71, 0x32, 0x9a, 0x11, + 0x69, 0x3b, 0xc7, 0x78, 0x93, 0xf1, 0x57, 0x97, 0x6e, 0xf0, 0x6e, 0x45, + 0x37, 0x4a, 0xf4, 0x0b, 0x18, 0x51, 0xf5, 0x4f, 0x67, 0x3c, 0xdc, 0xec, + 0x84, 0xed, 0xd0, 0xeb, 0xca, 0xfb, 
0xdb, 0xff, 0x7f, 0xeb, 0xa8, 0x23, + 0x68, 0x87, 0x13, 0x64, 0x6a, 0x10, 0xf7, 0x45, 0xe0, 0x0f, 0x32, 0x21, + 0x59, 0x7c, 0x0e, 0x50, 0xad, 0x56, 0xd7, 0x12, 0x69, 0x7b, 0x58, 0xf8, + 0xb9, 0x3b, 0xa5, 0xbb, 0x4d, 0x1b, 0x87, 0x1c, 0x46, 0xa7, 0x17, 0x9d, + 0x6d, 0x84, 0x45, 0xbe, 0x7f, 0x95, 0xd2, 0x34, 0xcd, 0x89, 0x95, 0xc0, + 0xf0, 0xd3, 0xdf, 0x6e, 0x10, 0x4a, 0xe3, 0x7b, 0xce, 0x7f, 0x40, 0x27, + 0xc7, 0x2b, 0xab, 0x66, 0x03, 0x59, 0xb4, 0x7b, 0xc7, 0xc7, 0xf0, 0x39, + 0x9a, 0x33, 0x35, 0xbf, 0xcc, 0x2f, 0xf3, 0x2e, 0x68, 0x9d, 0x53, 0x5c, + 0x88, 0x52, 0xe3, 0x77, 0x90, 0xa1, 0x27, 0x85, 0xc5, 0x74, 0x7f, 0x23, + 0x0e, 0x93, 0x01, 0x3e, 0xe7, 0x2e, 0x2e, 0x95, 0xf3, 0x0d, 0xc2, 0x25, + 0x25, 0x39, 0x39, 0x3d, 0x6e, 0x8e, 0x89, 0xbd, 0xe8, 0xbb, 0x67, 0x5e, + 0x8c, 0x66, 0x8b, 0x63, 0x28, 0x1e, 0x4e, 0x74, 0x85, 0xa8, 0xaf, 0x0f, + 0x12, 0x5d, 0xb6, 0x8a, 0x83, 0x1a, 0x77, 0x76, 0x5e, 0x62, 0x8a, 0xa7, + 0x3c, 0xb8, 0x05, 0x57, 0x2b, 0xaf, 0x36, 0x2e, 0x10, 0x90, 0xb2, 0x39, + 0xb4, 0x3e, 0x75, 0x6d, 0x3a, 0xa8, 0x31, 0x35, 0xc2, 0x1e, 0x8f, 0xc2, + 0x79, 0x89, 0x35, 0x16, 0x26, 0xd1, 0xc7, 0x0b, 0x04, 0x1f, 0x1d, 0xf9, + 0x9c, 0x05, 0xa6, 0x6b, 0xb5, 0x19, 0x5a, 0x24, 0x6d, 0x91, 0xc5, 0x31, + 0xfd, 0xc5, 0xfa, 0xe7, 0xa6, 0xcb, 0x0e, 0x4b, 0x18, 0x0d, 0x94, 0xc7, + 0xee, 0x1d, 0x46, 0x1f, 0x92, 0xb1, 0xb2, 0x4a, 0x2b, 0x43, 0x37, 0xfe, + 0xc2, 0x15, 0x11, 0x89, 0xef, 0x59, 0x73, 0x3c, 0x06, 0x76, 0x78, 0xcb, + 0xa6, 0x0d, 0x79, 0x5f, 0x28, 0x0b, 0x5b, 0x8c, 0x9e, 0xe4, 0xaa, 0x51, + 0x9a, 0x42, 0x6f, 0x11, 0x50, 0x3d, 0x01, 0xd6, 0x21, 0xc0, 0x99, 0x5e, + 0x1a, 0xe8, 0x81, 0x25, 0x80, 0xeb, 0xed, 0x5d, 0x37, 0x47, 0x30, 0x70, + 0xa0, 0x4e, 0x0b, 0x43, 0x17, 0xbe, 0xb6, 0x47, 0xe7, 0x2a, 0x62, 0x9d, + 0x5d, 0xa6, 0xc5, 0x33, 0x62, 0x9d, 0x56, 0x24, 0x9d, 0x1d, 0xb2, 0x13, + 0xbc, 0x17, 0x66, 0x43, 0xd1, 0x68, 0xd5, 0x3b, 0x17, 0x69, 0x17, 0xa6, + 0x06, 0x9e, 0x12, 0xb8, 0x7c, 0xd5, 0xaf, 0x3e, 0x21, 0x1b, 0x31, 0xeb, + 0x0b, 0xa4, 0x98, 0x1c, 0xf2, 0x6a, 
0x5e, 0x7c, 0x9b, 0x45, 0x8f, 0xb2, + 0x12, 0x06, 0xd5, 0x8c, 0x1d, 0xb2, 0xa7, 0x57, 0x5f, 0x2f, 0x4f, 0xdb, + 0x52, 0x99, 0x7c, 0x58, 0x01, 0x5f, 0xf2, 0xa5, 0xf6, 0x51, 0x86, 0x21, + 0x2f, 0x5b, 0x8d, 0x6a, 0xae, 0x83, 0x34, 0x6d, 0x58, 0x4b, 0xef, 0xfe, + 0xbf, 0x73, 0x5d, 0xdb, 0xc4, 0x97, 0x2a, 0x85, 0xf3, 0x6c, 0x46, 0x42, + 0xb3, 0x90, 0xc1, 0x57, 0x97, 0x50, 0x35, 0xb1, 0x9d, 0xb7, 0xc7, 0x3c, + 0x85, 0x6d, 0x6c, 0xfd, 0xce, 0xb0, 0xc9, 0xa2, 0x77, 0xee, 0xc3, 0x6b, + 0x0c, 0x37, 0xfa, 0x30, 0x91, 0xd1, 0x2c, 0xb8, 0x5e, 0x7f, 0x81, 0x5f, + 0x87, 0xfd, 0x18, 0x02, 0x5a, 0x30, 0x4e, 0x62, 0xbc, 0x65, 0xc6, 0xce, + 0x1a, 0xcf, 0x2b, 0xaa, 0x56, 0x3e, 0x4d, 0xcf, 0xba, 0x62, 0x5f, 0x9a, + 0xd0, 0x72, 0xff, 0xef, 0x28, 0xbd, 0xbe, 0xd8, 0x57, 0x3d, 0xf5, 0x57, + 0x7d, 0xe9, 0x71, 0x31, 0xec, 0x98, 0x90, 0x94, 0xd9, 0x54, 0xbf, 0x84, + 0x0b, 0xe3, 0x06, 0x47, 0x19, 0x9a, 0x13, 0x1d, 0xef, 0x9d, 0x13, 0xf3, + 0xdb, 0xc3, 0x5c, 0x72, 0x9e, 0xed, 0x24, 0xaa, 0x64, 0xed, 0xe7, 0x0d, + 0xa0, 0x7c, 0x73, 0xba, 0x9b, 0x86, 0xa7, 0x3b, 0x55, 0xab, 0x58, 0x30, + 0xf1, 0x15, 0x81, 0x83, 0x2f, 0xf9, 0x62, 0x84, 0x98, 0x66, 0xf6, 0x55, + 0x21, 0xd8, 0xf2, 0x25, 0x64, 0x71, 0x4b, 0x12, 0x76, 0x59, 0xc5, 0xaa, + 0x93, 0x67, 0xc3, 0x86, 0x25, 0xab, 0x4e, 0x4b, 0xf6, 0xd8, 0x3f, 0x44, + 0x2e, 0x11, 0xe0, 0xbd, 0x6a, 0xf2, 0x5d, 0xf5, 0xf9, 0x53, 0xea, 0xa4, + 0xc8, 0xd9, 0x50, 0x33, 0x81, 0xd9, 0xa8, 0x2d, 0x91, 0x7d, 0x13, 0x2a, + 0x11, 0xcf, 0xde, 0x3f, 0x0a, 0xd2, 0xbc, 0x33, 0xb2, 0x62, 0x53, 0xea, + 0x77, 0x88, 0x43, 0x66, 0x27, 0x43, 0x85, 0xe9, 0x5f, 0x55, 0xf5, 0x2a, + 0x8a, 0xac, 0xdf, 0xff, 0x9b, 0x4c, 0x96, 0x9c, 0xa5, 0x7a, 0xce, 0xd5, + 0x79, 0x18, 0xf1, 0x0b, 0x58, 0x95, 0x7a, 0xe7, 0xd3, 0x74, 0x65, 0x0b, + 0xa4, 0x64, 0x30, 0xe8, 0x5c, 0xfc, 0x55, 0x56, 0xee, 0x14, 0x14, 0xd3, + 0x45, 0x3b, 0xf8, 0xde, 0x05, 0x3e, 0xb9, 0x3c, 0xd7, 0x6a, 0x52, 0x72, + 0x5b, 0x39, 0x09, 0xbe, 0x82, 0x23, 0x10, 0x4a, 0xb7, 0xc3, 0xdc, 0x4c, + 0x5d, 0xc9, 0xf1, 0x14, 0x83, 0xf9, 
0x0b, 0x9b, 0xe9, 0x23, 0x84, 0x6a, + 0xc4, 0x08, 0x3d, 0xda, 0x3d, 0x12, 0x95, 0x87, 0x18, 0xa4, 0x7d, 0x3f, + 0x23, 0xde, 0xd4, 0x1e, 0xa8, 0x47, 0xc3, 0x71, 0xdb, 0xf5, 0x03, 0x6c, + 0x57, 0xe7, 0xa4, 0x43, 0x82, 0x33, 0x7b, 0x62, 0x46, 0x7d, 0xf7, 0x10, + 0x69, 0x18, 0x38, 0x27, 0x9a, 0x6f, 0x38, 0xac, 0xfa, 0x92, 0xc5, 0xae, + 0x66, 0xa6, 0x73, 0x95, 0x15, 0x0e, 0x4c, 0x04, 0xb6, 0xfc, 0xf5, 0xc7, + 0x21, 0x3a, 0x99, 0xdb, 0x0e, 0x36, 0xf0, 0x56, 0xbc, 0x75, 0xf9, 0x87, + 0x9b, 0x11, 0x18, 0x92, 0x64, 0x1a, 0xe7, 0xc7, 0xab, 0x5a, 0xc7, 0x26, + 0x7f, 0x13, 0x98, 0x42, 0x52, 0x43, 0xdb, 0xc8, 0x6d, 0x0b, 0xb7, 0x31, + 0x93, 0x24, 0xd6, 0xe8, 0x24, 0x1f, 0x6f, 0x21, 0xa7, 0x8c, 0xeb, 0xdb, + 0x83, 0xb8, 0x89, 0xe3, 0xc1, 0xd7, 0x69, 0x3b, 0x02, 0x6b, 0x54, 0x0f, + 0x84, 0x2f, 0xb5, 0x5c, 0x17, 0x77, 0xbe, 0xe5, 0x61, 0x0d, 0xc5, 0xdf, + 0x3b, 0xcf, 0x3e, 0x93, 0x4f, 0xf5, 0x89, 0xb9, 0x5a, 0xc5, 0x29, 0x31, + 0xc0, 0xc2, 0xff, 0xe5, 0x3f, 0xa6, 0xac, 0x03, 0xca, 0xf5, 0xff, 0xe0, + 0x36, 0xce, 0xf3, 0xe2, 0xb7, 0x9c, 0x02, 0xe9, 0x9e, 0xd2, 0xbc, 0x87, + 0x2f, 0x3d, 0x9a, 0x1d, 0x8f, 0xc5, 0x72, 0xb8, 0xa2, 0x01, 0xd4, 0x68, + 0xb1, 0x84, 0x16, 0x10, 0xf6, 0xf3, 0x52, 0x25, 0xd9, 0xdc, 0x4c, 0xdd, + 0x0f, 0xd6, 0x4a, 0xcf, 0x60, 0x96, 0x7e, 0xcc, 0x42, 0x0f, 0x64, 0x9d, + 0x72, 0x46, 0x04, 0x07, 0xf2, 0x5b, 0xf4, 0x07, 0xd1, 0xf4, 0x59, 0x71, +}; + +static const uint8_t kSpakeMSmallPrecomp[15 * 2 * 32] = { + 0xc8, 0xa6, 0x63, 0xc5, 0x97, 0xf1, 0xee, 0x40, 0xab, 0x62, 0x42, 0xee, + 0x25, 0x6f, 0x32, 0x6c, 0x75, 0x2c, 0xa7, 0xd3, 0xbd, 0x32, 0x3b, 0x1e, + 0x11, 0x9c, 0xbd, 0x04, 0xa9, 0x78, 0x6f, 0x45, 0x5a, 0xda, 0x7e, 0x4b, + 0xf6, 0xdd, 0xd9, 0xad, 0xb6, 0x62, 0x6d, 0x32, 0x13, 0x1c, 0x6b, 0x5c, + 0x51, 0xa1, 0xe3, 0x47, 0xa3, 0x47, 0x8f, 0x53, 0xcf, 0xcf, 0x44, 0x1b, + 0x88, 0xee, 0xd1, 0x2e, 0x03, 0x89, 0xaf, 0xc0, 0x61, 0x2d, 0x9e, 0x35, + 0xeb, 0x0e, 0x03, 0xe0, 0xb7, 0xfb, 0xa5, 0xbc, 0x44, 0xbe, 0x0c, 0x89, + 0x0a, 0x0f, 0xd6, 0x59, 0x47, 0x9e, 0xe6, 
0x3d, 0x36, 0x9d, 0xff, 0x44, + 0x5e, 0xac, 0xab, 0xe5, 0x3a, 0xd5, 0xb0, 0x35, 0x9f, 0x6d, 0x7f, 0xba, + 0xc0, 0x85, 0x0e, 0xf4, 0x70, 0x3f, 0x13, 0x90, 0x4c, 0x50, 0x1a, 0xee, + 0xc5, 0xeb, 0x69, 0xfe, 0x98, 0x42, 0x87, 0x1d, 0xce, 0x6c, 0x29, 0xaa, + 0x2b, 0x31, 0xc2, 0x38, 0x7b, 0x6b, 0xee, 0x88, 0x0b, 0xba, 0xce, 0xa8, + 0xca, 0x19, 0x60, 0x1b, 0x16, 0xf1, 0x25, 0x1e, 0xcf, 0x63, 0x66, 0x1e, + 0xbb, 0x63, 0xeb, 0x7d, 0xca, 0xd2, 0xb4, 0x23, 0x5a, 0x01, 0x6f, 0x05, + 0xd1, 0xdc, 0x41, 0x73, 0x75, 0xc0, 0xfd, 0x30, 0x91, 0x52, 0x68, 0x96, + 0x45, 0xb3, 0x66, 0x01, 0x3b, 0x53, 0x89, 0x3c, 0x69, 0xbc, 0x6c, 0x69, + 0xe3, 0x51, 0x8f, 0xe3, 0xd2, 0x84, 0xd5, 0x28, 0x66, 0xb5, 0xe6, 0x06, + 0x09, 0xfe, 0x6d, 0xb0, 0x72, 0x16, 0xe0, 0x8a, 0xce, 0x61, 0x65, 0xa9, + 0x21, 0x32, 0x48, 0xdc, 0x7a, 0x1d, 0xe1, 0x38, 0x7f, 0x8c, 0x75, 0x88, + 0x3d, 0x08, 0xa9, 0x4a, 0x6f, 0x3d, 0x9f, 0x7f, 0x3f, 0xbd, 0x57, 0x6b, + 0x19, 0xce, 0x3f, 0x4a, 0xc9, 0xd3, 0xf9, 0x6e, 0x72, 0x7b, 0x5b, 0x74, + 0xea, 0xbe, 0x9c, 0x7a, 0x6d, 0x9c, 0x40, 0x49, 0xe6, 0xfb, 0x2a, 0x1a, + 0x75, 0x70, 0xe5, 0x4e, 0xed, 0x74, 0xe0, 0x75, 0xac, 0xc0, 0xb1, 0x11, + 0x3e, 0xf2, 0xaf, 0x88, 0x4d, 0x66, 0xb6, 0xf6, 0x15, 0x4f, 0x3c, 0x6c, + 0x77, 0xae, 0x47, 0x51, 0x63, 0x9a, 0xfe, 0xe1, 0xb4, 0x1a, 0x12, 0xdf, + 0xe9, 0x54, 0x8d, 0x3b, 0x30, 0x2a, 0x75, 0xe3, 0xe5, 0x29, 0xb1, 0x4c, + 0xb0, 0x7c, 0x6d, 0xb5, 0xae, 0x85, 0xdb, 0x1e, 0x38, 0x55, 0x96, 0xa5, + 0x5b, 0x9f, 0x15, 0x23, 0x28, 0x36, 0xb8, 0xa2, 0x41, 0xb4, 0xd7, 0x19, + 0x91, 0x8d, 0x26, 0x3e, 0xca, 0x9c, 0x05, 0x7a, 0x2b, 0x60, 0x45, 0x86, + 0x8b, 0xee, 0x64, 0x6f, 0x5c, 0x09, 0x4d, 0x4b, 0x5a, 0x7f, 0xb0, 0xc3, + 0x26, 0x9d, 0x8b, 0xb8, 0x83, 0x69, 0xcf, 0x16, 0x72, 0x62, 0x3e, 0x5e, + 0x53, 0x4f, 0x9c, 0x73, 0x76, 0xfc, 0x19, 0xef, 0xa0, 0x74, 0x3a, 0x11, + 0x1e, 0xd0, 0x4d, 0xb7, 0x87, 0xa1, 0xd6, 0x87, 0x6c, 0x0e, 0x6c, 0x8c, + 0xe9, 0xa0, 0x44, 0xc4, 0x72, 0x3e, 0x73, 0x17, 0x13, 0xd1, 0x4e, 0x3d, + 0x8e, 0x1d, 0x5a, 0x8b, 0x75, 0xcb, 0x59, 
0x2c, 0x47, 0x87, 0x15, 0x41, + 0xfe, 0x08, 0xe9, 0xa6, 0x97, 0x17, 0x08, 0x26, 0x6a, 0xb5, 0xbb, 0x73, + 0xaa, 0xb8, 0x5b, 0x65, 0x65, 0x5b, 0x30, 0x9e, 0x62, 0x59, 0x02, 0xf8, + 0xb8, 0x0f, 0x32, 0x10, 0xc1, 0x36, 0x08, 0x52, 0x98, 0x4a, 0x1e, 0xf0, + 0xab, 0x21, 0x5e, 0xde, 0x16, 0x0c, 0xda, 0x09, 0x99, 0x6b, 0x9e, 0xc0, + 0x90, 0xa5, 0x5a, 0xcc, 0xb0, 0xb7, 0xbb, 0xd2, 0x8b, 0x5f, 0xd3, 0x3b, + 0x3e, 0x8c, 0xa5, 0x71, 0x66, 0x06, 0xe3, 0x28, 0xd4, 0xf8, 0x3f, 0xe5, + 0x27, 0xdf, 0xfe, 0x0f, 0x09, 0xb2, 0x8a, 0x09, 0x5a, 0x23, 0x61, 0x0d, + 0x2d, 0xf5, 0x44, 0xf1, 0x5c, 0xf8, 0x82, 0x4e, 0xdc, 0x78, 0x7a, 0xab, + 0xc3, 0x57, 0x91, 0xaf, 0x65, 0x6e, 0x71, 0xf1, 0x44, 0xbf, 0xed, 0x43, + 0x50, 0xb4, 0x67, 0x48, 0xef, 0x5a, 0x10, 0x46, 0x81, 0xb4, 0x0c, 0xc8, + 0x48, 0xed, 0x99, 0x7a, 0x45, 0xa5, 0x92, 0xc3, 0x69, 0xd6, 0xd7, 0x8a, + 0x20, 0x1b, 0xeb, 0x8f, 0xb2, 0xff, 0xec, 0x6d, 0x76, 0x04, 0xf8, 0xc2, + 0x58, 0x9b, 0xf2, 0x20, 0x53, 0xc4, 0x74, 0x91, 0x19, 0xdd, 0x2d, 0x12, + 0x53, 0xc7, 0x6e, 0xd0, 0x02, 0x51, 0x3c, 0xa6, 0x7d, 0x80, 0x75, 0x6b, + 0x1d, 0xdf, 0xf8, 0x6a, 0x52, 0xbb, 0x81, 0xf8, 0x30, 0x45, 0xef, 0x51, + 0x85, 0x36, 0xbe, 0x8e, 0xcf, 0x0b, 0x9a, 0x46, 0xe8, 0x3f, 0x99, 0xfd, + 0xf7, 0xd9, 0x3e, 0x84, 0xe5, 0xe3, 0x37, 0xcf, 0x98, 0x7f, 0xeb, 0x5e, + 0x5a, 0x53, 0x77, 0x1c, 0x20, 0xdc, 0xf1, 0x20, 0x99, 0xec, 0x60, 0x40, + 0x93, 0xef, 0x5c, 0x1c, 0x81, 0xe2, 0xa5, 0xad, 0x2a, 0xc2, 0xdb, 0x6b, + 0xc1, 0x7e, 0x8f, 0xa9, 0x23, 0x5b, 0xd9, 0x0d, 0xfe, 0xa0, 0xac, 0x11, + 0x28, 0xba, 0x8e, 0x92, 0x07, 0x2d, 0x07, 0x40, 0x83, 0x14, 0x4c, 0x35, + 0x8d, 0xd0, 0x11, 0xff, 0x98, 0xdb, 0x00, 0x30, 0x6f, 0x65, 0xb6, 0xa0, + 0x7f, 0x9c, 0x08, 0xb8, 0xce, 0xb3, 0xa8, 0x42, 0xd3, 0x84, 0x45, 0xe1, + 0xe3, 0x8f, 0xa6, 0x89, 0x21, 0xd7, 0x74, 0x02, 0x4d, 0x64, 0xdf, 0x54, + 0x15, 0x9e, 0xba, 0x12, 0x49, 0x09, 0x41, 0xf6, 0x10, 0x24, 0xa1, 0x84, + 0x15, 0xfd, 0x68, 0x6a, 0x57, 0x66, 0xb3, 0x6d, 0x4c, 0xea, 0xbf, 0xbc, + 0x60, 0x3f, 0x52, 0x1c, 0x44, 0x1b, 0xc0, 
0x4a, 0x25, 0xe3, 0xd9, 0x4c, + 0x9a, 0x74, 0xad, 0xfc, 0x9e, 0x8d, 0x0b, 0x18, 0x66, 0x24, 0xd1, 0x06, + 0xac, 0x68, 0xc1, 0xae, 0x14, 0xce, 0xb1, 0xf3, 0x86, 0x9f, 0x87, 0x11, + 0xd7, 0x9f, 0x30, 0x92, 0xdb, 0xec, 0x0b, 0x4a, 0xe8, 0xf6, 0x53, 0x36, + 0x68, 0x12, 0x11, 0x5e, 0xe0, 0x34, 0xa4, 0xff, 0x00, 0x0a, 0x26, 0xb8, + 0x62, 0x79, 0x9c, 0x0c, 0xd5, 0xe5, 0xf5, 0x1c, 0x1a, 0x16, 0x84, 0x4d, + 0x8e, 0x5d, 0x31, 0x7e, 0xf7, 0xe2, 0xd3, 0xa1, 0x41, 0x90, 0x61, 0x5d, + 0x04, 0xb2, 0x9a, 0x18, 0x9e, 0x54, 0xfb, 0xd1, 0x61, 0x95, 0x1b, 0x08, + 0xca, 0x7c, 0x49, 0x44, 0x74, 0x1d, 0x2f, 0xca, 0xc4, 0x7a, 0xe1, 0x8b, + 0x2f, 0xbb, 0x96, 0xee, 0x19, 0x8a, 0x5d, 0xfb, 0x3e, 0x82, 0xe7, 0x15, + 0xdb, 0x29, 0x14, 0xee, 0xc9, 0x4d, 0x9a, 0xfb, 0x9f, 0x8a, 0xbb, 0x17, + 0x37, 0x1b, 0x6e, 0x28, 0x6c, 0xf9, 0xff, 0xb5, 0xb5, 0x8b, 0x9d, 0x88, + 0x20, 0x08, 0x10, 0xd7, 0xca, 0x58, 0xf6, 0xe1, 0x32, 0x91, 0x6f, 0x36, + 0xc0, 0xad, 0xc1, 0x57, 0x5d, 0x76, 0x31, 0x43, 0xf3, 0xdd, 0xec, 0xf1, + 0xa9, 0x79, 0xe9, 0xe9, 0x85, 0xd7, 0x91, 0xc7, 0x31, 0x62, 0x3c, 0xd2, + 0x90, 0x2c, 0x9c, 0xa4, 0x56, 0x37, 0x7b, 0xbe, 0x40, 0x58, 0xc0, 0x81, + 0x83, 0x22, 0xe8, 0x13, 0x79, 0x18, 0xdb, 0x3a, 0x1b, 0x31, 0x0d, 0x00, + 0x6c, 0x22, 0x62, 0x75, 0x70, 0xd8, 0x96, 0x59, 0x99, 0x44, 0x79, 0x71, + 0xa6, 0x76, 0x81, 0x28, 0xb2, 0x65, 0xe8, 0x47, 0x14, 0xc6, 0x39, 0x06, +}; + +SPAKE2_CTX *SPAKE2_CTX_new(enum spake2_role_t my_role, const uint8_t *my_name, + size_t my_name_len, const uint8_t *their_name, + size_t their_name_len) { + SPAKE2_CTX *ctx = New(); + if (ctx == nullptr) { + return nullptr; + } + + ctx->my_role = my_role; + + CBS my_name_cbs, their_name_cbs; + CBS_init(&my_name_cbs, my_name, my_name_len); + CBS_init(&their_name_cbs, their_name, their_name_len); + if (!CBS_stow(&my_name_cbs, &ctx->my_name, &ctx->my_name_len) || + !CBS_stow(&their_name_cbs, &ctx->their_name, &ctx->their_name_len)) { + SPAKE2_CTX_free(ctx); + return nullptr; + } + + return ctx; +} + +void 
SPAKE2_CTX_free(SPAKE2_CTX *ctx) { + if (ctx == nullptr) { + return; + } + + OPENSSL_free(ctx->my_name); + OPENSSL_free(ctx->their_name); + Delete(ctx); +} + +// left_shift_3 sets |n| to |n|*8, where |n| is represented in little-endian +// order. +static void left_shift_3(uint8_t n[32]) { + uint8_t carry = 0; + unsigned i; + + for (i = 0; i < 32; i++) { + const uint8_t next_carry = n[i] >> 5; + n[i] = (n[i] << 3) | carry; + carry = next_carry; + } +} + +namespace { +typedef struct { + BN_ULONG words[32 / sizeof(BN_ULONG)]; +} scalar; +} // namespace + +// kOrder is the order of the prime-order subgroup of curve25519. +static const scalar kOrder = { + {TOBN(0x5812631a, 0x5cf5d3ed), TOBN(0x14def9de, 0xa2f79cd6), + TOBN(0x00000000, 0x00000000), TOBN(0x10000000, 0x00000000)}}; + +// scalar_cmov copies |src| to |dest| if |mask| is all ones. +static void scalar_cmov(scalar *dest, const scalar *src, crypto_word_t mask) { + bn_select_words(dest->words, mask, src->words, dest->words, + std::size(dest->words)); +} + +// scalar_double sets |s| to |2×s|. +static void scalar_double(scalar *s) { + bn_add_words(s->words, s->words, s->words, std::size(s->words)); +} + +// scalar_add sets |dest| to |dest| plus |src|. +static void scalar_add(scalar *dest, const scalar *src) { + bn_add_words(dest->words, dest->words, src->words, std::size(dest->words)); +} + +int SPAKE2_generate_msg(SPAKE2_CTX *ctx, uint8_t *out, size_t *out_len, + size_t max_out_len, const uint8_t *password, + size_t password_len) { + if (ctx->state != spake2_state_init) { + return 0; + } + + if (max_out_len < sizeof(ctx->my_msg)) { + return 0; + } + + uint8_t private_tmp[64]; + RAND_bytes(private_tmp, sizeof(private_tmp)); + x25519_sc_reduce(private_tmp); + // Multiply by the cofactor (eight) so that we'll clear it when operating on + // the peer's point later in the protocol. 
+ left_shift_3(private_tmp); + OPENSSL_memcpy(ctx->private_key, private_tmp, sizeof(ctx->private_key)); + + ge_p3 P; + x25519_ge_scalarmult_base(&P, ctx->private_key); + + // mask = h(password) * . + uint8_t password_tmp[SHA512_DIGEST_LENGTH]; + SHA512(password, password_len, password_tmp); + OPENSSL_memcpy(ctx->password_hash, password_tmp, sizeof(ctx->password_hash)); + x25519_sc_reduce(password_tmp); + + // Due to a copy-paste error, the call to |left_shift_3| was omitted after + // the |x25519_sc_reduce|, just above. This meant that |ctx->password_scalar| + // was not a multiple of eight to clear the cofactor and thus three bits of + // the password hash would leak. In order to fix this in a unilateral way, + // points of small order are added to the mask point such that it is in the + // prime-order subgroup. Since the ephemeral scalar is a multiple of eight, + // these points will cancel out when calculating the shared secret. + // + // Adding points of small order is the same as adding multiples of the prime + // order to the password scalar. Since that's faster, that is what is done + // below. The prime order (kOrder) is a large prime, thus odd, thus the LSB + // is one. So adding it will flip the LSB. Adding twice it will flip the next + // bit and so one for all the bottom three bits. + + scalar password_scalar; + OPENSSL_memcpy(&password_scalar, password_tmp, sizeof(password_scalar)); + + // |password_scalar| is the result of |x25519_sc_reduce| and thus is, at + // most, $l-1$ (where $l$ is |kOrder|, the order of the prime-order subgroup + // of Ed25519). In the following, we may add $l + 2×l + 4×l$ for a max value + // of $8×l-1$. That is < 2**256, as required. 
+ + if (!ctx->disable_password_scalar_hack) { + scalar order = kOrder; + scalar tmp; + + OPENSSL_memset(&tmp, 0, sizeof(tmp)); + scalar_cmov(&tmp, &order, + constant_time_eq_w(password_scalar.words[0] & 1, 1)); + scalar_add(&password_scalar, &tmp); + + scalar_double(&order); + OPENSSL_memset(&tmp, 0, sizeof(tmp)); + scalar_cmov(&tmp, &order, + constant_time_eq_w(password_scalar.words[0] & 2, 2)); + scalar_add(&password_scalar, &tmp); + + scalar_double(&order); + OPENSSL_memset(&tmp, 0, sizeof(tmp)); + scalar_cmov(&tmp, &order, + constant_time_eq_w(password_scalar.words[0] & 4, 4)); + scalar_add(&password_scalar, &tmp); + + assert((password_scalar.words[0] & 7) == 0); + } + + OPENSSL_memcpy(ctx->password_scalar, password_scalar.words, + sizeof(ctx->password_scalar)); + + ge_p3 mask; + x25519_ge_scalarmult_small_precomp(&mask, ctx->password_scalar, + ctx->my_role == spake2_role_alice + ? kSpakeMSmallPrecomp + : kSpakeNSmallPrecomp); + + // P* = P + mask. + ge_cached mask_cached; + x25519_ge_p3_to_cached(&mask_cached, &mask); + ge_p1p1 Pstar; + x25519_ge_add(&Pstar, &P, &mask_cached); + + // Encode P* + ge_p2 Pstar_proj; + x25519_ge_p1p1_to_p2(&Pstar_proj, &Pstar); + x25519_ge_tobytes(ctx->my_msg, &Pstar_proj); + + OPENSSL_memcpy(out, ctx->my_msg, sizeof(ctx->my_msg)); + *out_len = sizeof(ctx->my_msg); + ctx->state = spake2_state_msg_generated; + + return 1; +} + +static void update_with_length_prefix(SHA512_CTX *sha, const uint8_t *data, + const size_t len) { + uint8_t len_le[8]; + size_t l = len; + unsigned i; + + for (i = 0; i < 8; i++) { + len_le[i] = l & 0xff; + l >>= 8; + } + + SHA512_Update(sha, len_le, sizeof(len_le)); + SHA512_Update(sha, data, len); +} + +int SPAKE2_process_msg(SPAKE2_CTX *ctx, uint8_t *out_key, size_t *out_key_len, + size_t max_out_key_len, const uint8_t *their_msg, + size_t their_msg_len) { + if (ctx->state != spake2_state_msg_generated || their_msg_len != 32) { + return 0; + } + + ge_p3 Qstar; + if (!x25519_ge_frombytes_vartime(&Qstar, 
their_msg)) { + // Point received from peer was not on the curve. + return 0; + } + + // Unmask peer's value. + ge_p3 peers_mask; + x25519_ge_scalarmult_small_precomp(&peers_mask, ctx->password_scalar, + ctx->my_role == spake2_role_alice + ? kSpakeNSmallPrecomp + : kSpakeMSmallPrecomp); + + ge_cached peers_mask_cached; + x25519_ge_p3_to_cached(&peers_mask_cached, &peers_mask); + + ge_p1p1 Q_compl; + ge_p3 Q_ext; + x25519_ge_sub(&Q_compl, &Qstar, &peers_mask_cached); + x25519_ge_p1p1_to_p3(&Q_ext, &Q_compl); + + ge_p2 dh_shared; + x25519_ge_scalarmult(&dh_shared, ctx->private_key, &Q_ext); + + uint8_t dh_shared_encoded[32]; + x25519_ge_tobytes(dh_shared_encoded, &dh_shared); + + SHA512_CTX sha; + SHA512_Init(&sha); + if (ctx->my_role == spake2_role_alice) { + update_with_length_prefix(&sha, ctx->my_name, ctx->my_name_len); + update_with_length_prefix(&sha, ctx->their_name, ctx->their_name_len); + update_with_length_prefix(&sha, ctx->my_msg, sizeof(ctx->my_msg)); + update_with_length_prefix(&sha, their_msg, 32); + } else { + update_with_length_prefix(&sha, ctx->their_name, ctx->their_name_len); + update_with_length_prefix(&sha, ctx->my_name, ctx->my_name_len); + update_with_length_prefix(&sha, their_msg, 32); + update_with_length_prefix(&sha, ctx->my_msg, sizeof(ctx->my_msg)); + } + update_with_length_prefix(&sha, dh_shared_encoded, sizeof(dh_shared_encoded)); + update_with_length_prefix(&sha, ctx->password_hash, + sizeof(ctx->password_hash)); + + uint8_t key[SHA512_DIGEST_LENGTH]; + SHA512_Final(key, &sha); + + size_t to_copy = max_out_key_len; + if (to_copy > sizeof(key)) { + to_copy = sizeof(key); + } + OPENSSL_memcpy(out_key, key, to_copy); + *out_key_len = to_copy; + ctx->state = spake2_state_key_generated; + + return 1; +} diff --git a/third_party/boringssl/src/crypto/des/des.c b/third_party/boringssl/src/crypto/des/des.c deleted file mode 100644 index 95c430ca..00000000 --- a/third_party/boringssl/src/crypto/des/des.c +++ /dev/null @@ -1,784 +0,0 @@ -/* 
Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. 
If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
*/ - -#include - -#include - -#include "internal.h" - - -static const uint32_t des_skb[8][64] = { - { // for C bits (numbered as per FIPS 46) 1 2 3 4 5 6 - 0x00000000L, 0x00000010L, 0x20000000L, 0x20000010L, 0x00010000L, - 0x00010010L, 0x20010000L, 0x20010010L, 0x00000800L, 0x00000810L, - 0x20000800L, 0x20000810L, 0x00010800L, 0x00010810L, 0x20010800L, - 0x20010810L, 0x00000020L, 0x00000030L, 0x20000020L, 0x20000030L, - 0x00010020L, 0x00010030L, 0x20010020L, 0x20010030L, 0x00000820L, - 0x00000830L, 0x20000820L, 0x20000830L, 0x00010820L, 0x00010830L, - 0x20010820L, 0x20010830L, 0x00080000L, 0x00080010L, 0x20080000L, - 0x20080010L, 0x00090000L, 0x00090010L, 0x20090000L, 0x20090010L, - 0x00080800L, 0x00080810L, 0x20080800L, 0x20080810L, 0x00090800L, - 0x00090810L, 0x20090800L, 0x20090810L, 0x00080020L, 0x00080030L, - 0x20080020L, 0x20080030L, 0x00090020L, 0x00090030L, 0x20090020L, - 0x20090030L, 0x00080820L, 0x00080830L, 0x20080820L, 0x20080830L, - 0x00090820L, 0x00090830L, 0x20090820L, 0x20090830L, }, - { // for C bits (numbered as per FIPS 46) 7 8 10 11 12 13 - 0x00000000L, 0x02000000L, 0x00002000L, 0x02002000L, 0x00200000L, - 0x02200000L, 0x00202000L, 0x02202000L, 0x00000004L, 0x02000004L, - 0x00002004L, 0x02002004L, 0x00200004L, 0x02200004L, 0x00202004L, - 0x02202004L, 0x00000400L, 0x02000400L, 0x00002400L, 0x02002400L, - 0x00200400L, 0x02200400L, 0x00202400L, 0x02202400L, 0x00000404L, - 0x02000404L, 0x00002404L, 0x02002404L, 0x00200404L, 0x02200404L, - 0x00202404L, 0x02202404L, 0x10000000L, 0x12000000L, 0x10002000L, - 0x12002000L, 0x10200000L, 0x12200000L, 0x10202000L, 0x12202000L, - 0x10000004L, 0x12000004L, 0x10002004L, 0x12002004L, 0x10200004L, - 0x12200004L, 0x10202004L, 0x12202004L, 0x10000400L, 0x12000400L, - 0x10002400L, 0x12002400L, 0x10200400L, 0x12200400L, 0x10202400L, - 0x12202400L, 0x10000404L, 0x12000404L, 0x10002404L, 0x12002404L, - 0x10200404L, 0x12200404L, 0x10202404L, 0x12202404L, }, - { // for C bits (numbered as per FIPS 46) 14 15 16 17 19 20 - 
0x00000000L, 0x00000001L, 0x00040000L, 0x00040001L, 0x01000000L, - 0x01000001L, 0x01040000L, 0x01040001L, 0x00000002L, 0x00000003L, - 0x00040002L, 0x00040003L, 0x01000002L, 0x01000003L, 0x01040002L, - 0x01040003L, 0x00000200L, 0x00000201L, 0x00040200L, 0x00040201L, - 0x01000200L, 0x01000201L, 0x01040200L, 0x01040201L, 0x00000202L, - 0x00000203L, 0x00040202L, 0x00040203L, 0x01000202L, 0x01000203L, - 0x01040202L, 0x01040203L, 0x08000000L, 0x08000001L, 0x08040000L, - 0x08040001L, 0x09000000L, 0x09000001L, 0x09040000L, 0x09040001L, - 0x08000002L, 0x08000003L, 0x08040002L, 0x08040003L, 0x09000002L, - 0x09000003L, 0x09040002L, 0x09040003L, 0x08000200L, 0x08000201L, - 0x08040200L, 0x08040201L, 0x09000200L, 0x09000201L, 0x09040200L, - 0x09040201L, 0x08000202L, 0x08000203L, 0x08040202L, 0x08040203L, - 0x09000202L, 0x09000203L, 0x09040202L, 0x09040203L, }, - { // for C bits (numbered as per FIPS 46) 21 23 24 26 27 28 - 0x00000000L, 0x00100000L, 0x00000100L, 0x00100100L, 0x00000008L, - 0x00100008L, 0x00000108L, 0x00100108L, 0x00001000L, 0x00101000L, - 0x00001100L, 0x00101100L, 0x00001008L, 0x00101008L, 0x00001108L, - 0x00101108L, 0x04000000L, 0x04100000L, 0x04000100L, 0x04100100L, - 0x04000008L, 0x04100008L, 0x04000108L, 0x04100108L, 0x04001000L, - 0x04101000L, 0x04001100L, 0x04101100L, 0x04001008L, 0x04101008L, - 0x04001108L, 0x04101108L, 0x00020000L, 0x00120000L, 0x00020100L, - 0x00120100L, 0x00020008L, 0x00120008L, 0x00020108L, 0x00120108L, - 0x00021000L, 0x00121000L, 0x00021100L, 0x00121100L, 0x00021008L, - 0x00121008L, 0x00021108L, 0x00121108L, 0x04020000L, 0x04120000L, - 0x04020100L, 0x04120100L, 0x04020008L, 0x04120008L, 0x04020108L, - 0x04120108L, 0x04021000L, 0x04121000L, 0x04021100L, 0x04121100L, - 0x04021008L, 0x04121008L, 0x04021108L, 0x04121108L, }, - { // for D bits (numbered as per FIPS 46) 1 2 3 4 5 6 - 0x00000000L, 0x10000000L, 0x00010000L, 0x10010000L, 0x00000004L, - 0x10000004L, 0x00010004L, 0x10010004L, 0x20000000L, 0x30000000L, - 0x20010000L, 0x30010000L, 
0x20000004L, 0x30000004L, 0x20010004L, - 0x30010004L, 0x00100000L, 0x10100000L, 0x00110000L, 0x10110000L, - 0x00100004L, 0x10100004L, 0x00110004L, 0x10110004L, 0x20100000L, - 0x30100000L, 0x20110000L, 0x30110000L, 0x20100004L, 0x30100004L, - 0x20110004L, 0x30110004L, 0x00001000L, 0x10001000L, 0x00011000L, - 0x10011000L, 0x00001004L, 0x10001004L, 0x00011004L, 0x10011004L, - 0x20001000L, 0x30001000L, 0x20011000L, 0x30011000L, 0x20001004L, - 0x30001004L, 0x20011004L, 0x30011004L, 0x00101000L, 0x10101000L, - 0x00111000L, 0x10111000L, 0x00101004L, 0x10101004L, 0x00111004L, - 0x10111004L, 0x20101000L, 0x30101000L, 0x20111000L, 0x30111000L, - 0x20101004L, 0x30101004L, 0x20111004L, 0x30111004L, }, - { // for D bits (numbered as per FIPS 46) 8 9 11 12 13 14 - 0x00000000L, 0x08000000L, 0x00000008L, 0x08000008L, 0x00000400L, - 0x08000400L, 0x00000408L, 0x08000408L, 0x00020000L, 0x08020000L, - 0x00020008L, 0x08020008L, 0x00020400L, 0x08020400L, 0x00020408L, - 0x08020408L, 0x00000001L, 0x08000001L, 0x00000009L, 0x08000009L, - 0x00000401L, 0x08000401L, 0x00000409L, 0x08000409L, 0x00020001L, - 0x08020001L, 0x00020009L, 0x08020009L, 0x00020401L, 0x08020401L, - 0x00020409L, 0x08020409L, 0x02000000L, 0x0A000000L, 0x02000008L, - 0x0A000008L, 0x02000400L, 0x0A000400L, 0x02000408L, 0x0A000408L, - 0x02020000L, 0x0A020000L, 0x02020008L, 0x0A020008L, 0x02020400L, - 0x0A020400L, 0x02020408L, 0x0A020408L, 0x02000001L, 0x0A000001L, - 0x02000009L, 0x0A000009L, 0x02000401L, 0x0A000401L, 0x02000409L, - 0x0A000409L, 0x02020001L, 0x0A020001L, 0x02020009L, 0x0A020009L, - 0x02020401L, 0x0A020401L, 0x02020409L, 0x0A020409L, }, - { // for D bits (numbered as per FIPS 46) 16 17 18 19 20 21 - 0x00000000L, 0x00000100L, 0x00080000L, 0x00080100L, 0x01000000L, - 0x01000100L, 0x01080000L, 0x01080100L, 0x00000010L, 0x00000110L, - 0x00080010L, 0x00080110L, 0x01000010L, 0x01000110L, 0x01080010L, - 0x01080110L, 0x00200000L, 0x00200100L, 0x00280000L, 0x00280100L, - 0x01200000L, 0x01200100L, 0x01280000L, 
0x01280100L, 0x00200010L, - 0x00200110L, 0x00280010L, 0x00280110L, 0x01200010L, 0x01200110L, - 0x01280010L, 0x01280110L, 0x00000200L, 0x00000300L, 0x00080200L, - 0x00080300L, 0x01000200L, 0x01000300L, 0x01080200L, 0x01080300L, - 0x00000210L, 0x00000310L, 0x00080210L, 0x00080310L, 0x01000210L, - 0x01000310L, 0x01080210L, 0x01080310L, 0x00200200L, 0x00200300L, - 0x00280200L, 0x00280300L, 0x01200200L, 0x01200300L, 0x01280200L, - 0x01280300L, 0x00200210L, 0x00200310L, 0x00280210L, 0x00280310L, - 0x01200210L, 0x01200310L, 0x01280210L, 0x01280310L, }, - { // for D bits (numbered as per FIPS 46) 22 23 24 25 27 28 - 0x00000000L, 0x04000000L, 0x00040000L, 0x04040000L, 0x00000002L, - 0x04000002L, 0x00040002L, 0x04040002L, 0x00002000L, 0x04002000L, - 0x00042000L, 0x04042000L, 0x00002002L, 0x04002002L, 0x00042002L, - 0x04042002L, 0x00000020L, 0x04000020L, 0x00040020L, 0x04040020L, - 0x00000022L, 0x04000022L, 0x00040022L, 0x04040022L, 0x00002020L, - 0x04002020L, 0x00042020L, 0x04042020L, 0x00002022L, 0x04002022L, - 0x00042022L, 0x04042022L, 0x00000800L, 0x04000800L, 0x00040800L, - 0x04040800L, 0x00000802L, 0x04000802L, 0x00040802L, 0x04040802L, - 0x00002800L, 0x04002800L, 0x00042800L, 0x04042800L, 0x00002802L, - 0x04002802L, 0x00042802L, 0x04042802L, 0x00000820L, 0x04000820L, - 0x00040820L, 0x04040820L, 0x00000822L, 0x04000822L, 0x00040822L, - 0x04040822L, 0x00002820L, 0x04002820L, 0x00042820L, 0x04042820L, - 0x00002822L, 0x04002822L, 0x00042822L, 0x04042822L, }}; - -static const uint32_t DES_SPtrans[8][64] = { - { // nibble 0 - 0x02080800L, 0x00080000L, 0x02000002L, 0x02080802L, 0x02000000L, - 0x00080802L, 0x00080002L, 0x02000002L, 0x00080802L, 0x02080800L, - 0x02080000L, 0x00000802L, 0x02000802L, 0x02000000L, 0x00000000L, - 0x00080002L, 0x00080000L, 0x00000002L, 0x02000800L, 0x00080800L, - 0x02080802L, 0x02080000L, 0x00000802L, 0x02000800L, 0x00000002L, - 0x00000800L, 0x00080800L, 0x02080002L, 0x00000800L, 0x02000802L, - 0x02080002L, 0x00000000L, 0x00000000L, 0x02080802L, 
0x02000800L, - 0x00080002L, 0x02080800L, 0x00080000L, 0x00000802L, 0x02000800L, - 0x02080002L, 0x00000800L, 0x00080800L, 0x02000002L, 0x00080802L, - 0x00000002L, 0x02000002L, 0x02080000L, 0x02080802L, 0x00080800L, - 0x02080000L, 0x02000802L, 0x02000000L, 0x00000802L, 0x00080002L, - 0x00000000L, 0x00080000L, 0x02000000L, 0x02000802L, 0x02080800L, - 0x00000002L, 0x02080002L, 0x00000800L, 0x00080802L, }, - { // nibble 1 - 0x40108010L, 0x00000000L, 0x00108000L, 0x40100000L, 0x40000010L, - 0x00008010L, 0x40008000L, 0x00108000L, 0x00008000L, 0x40100010L, - 0x00000010L, 0x40008000L, 0x00100010L, 0x40108000L, 0x40100000L, - 0x00000010L, 0x00100000L, 0x40008010L, 0x40100010L, 0x00008000L, - 0x00108010L, 0x40000000L, 0x00000000L, 0x00100010L, 0x40008010L, - 0x00108010L, 0x40108000L, 0x40000010L, 0x40000000L, 0x00100000L, - 0x00008010L, 0x40108010L, 0x00100010L, 0x40108000L, 0x40008000L, - 0x00108010L, 0x40108010L, 0x00100010L, 0x40000010L, 0x00000000L, - 0x40000000L, 0x00008010L, 0x00100000L, 0x40100010L, 0x00008000L, - 0x40000000L, 0x00108010L, 0x40008010L, 0x40108000L, 0x00008000L, - 0x00000000L, 0x40000010L, 0x00000010L, 0x40108010L, 0x00108000L, - 0x40100000L, 0x40100010L, 0x00100000L, 0x00008010L, 0x40008000L, - 0x40008010L, 0x00000010L, 0x40100000L, 0x00108000L, }, - { // nibble 2 - 0x04000001L, 0x04040100L, 0x00000100L, 0x04000101L, 0x00040001L, - 0x04000000L, 0x04000101L, 0x00040100L, 0x04000100L, 0x00040000L, - 0x04040000L, 0x00000001L, 0x04040101L, 0x00000101L, 0x00000001L, - 0x04040001L, 0x00000000L, 0x00040001L, 0x04040100L, 0x00000100L, - 0x00000101L, 0x04040101L, 0x00040000L, 0x04000001L, 0x04040001L, - 0x04000100L, 0x00040101L, 0x04040000L, 0x00040100L, 0x00000000L, - 0x04000000L, 0x00040101L, 0x04040100L, 0x00000100L, 0x00000001L, - 0x00040000L, 0x00000101L, 0x00040001L, 0x04040000L, 0x04000101L, - 0x00000000L, 0x04040100L, 0x00040100L, 0x04040001L, 0x00040001L, - 0x04000000L, 0x04040101L, 0x00000001L, 0x00040101L, 0x04000001L, - 0x04000000L, 0x04040101L, 
0x00040000L, 0x04000100L, 0x04000101L, - 0x00040100L, 0x04000100L, 0x00000000L, 0x04040001L, 0x00000101L, - 0x04000001L, 0x00040101L, 0x00000100L, 0x04040000L, }, - { // nibble 3 - 0x00401008L, 0x10001000L, 0x00000008L, 0x10401008L, 0x00000000L, - 0x10400000L, 0x10001008L, 0x00400008L, 0x10401000L, 0x10000008L, - 0x10000000L, 0x00001008L, 0x10000008L, 0x00401008L, 0x00400000L, - 0x10000000L, 0x10400008L, 0x00401000L, 0x00001000L, 0x00000008L, - 0x00401000L, 0x10001008L, 0x10400000L, 0x00001000L, 0x00001008L, - 0x00000000L, 0x00400008L, 0x10401000L, 0x10001000L, 0x10400008L, - 0x10401008L, 0x00400000L, 0x10400008L, 0x00001008L, 0x00400000L, - 0x10000008L, 0x00401000L, 0x10001000L, 0x00000008L, 0x10400000L, - 0x10001008L, 0x00000000L, 0x00001000L, 0x00400008L, 0x00000000L, - 0x10400008L, 0x10401000L, 0x00001000L, 0x10000000L, 0x10401008L, - 0x00401008L, 0x00400000L, 0x10401008L, 0x00000008L, 0x10001000L, - 0x00401008L, 0x00400008L, 0x00401000L, 0x10400000L, 0x10001008L, - 0x00001008L, 0x10000000L, 0x10000008L, 0x10401000L, }, - { // nibble 4 - 0x08000000L, 0x00010000L, 0x00000400L, 0x08010420L, 0x08010020L, - 0x08000400L, 0x00010420L, 0x08010000L, 0x00010000L, 0x00000020L, - 0x08000020L, 0x00010400L, 0x08000420L, 0x08010020L, 0x08010400L, - 0x00000000L, 0x00010400L, 0x08000000L, 0x00010020L, 0x00000420L, - 0x08000400L, 0x00010420L, 0x00000000L, 0x08000020L, 0x00000020L, - 0x08000420L, 0x08010420L, 0x00010020L, 0x08010000L, 0x00000400L, - 0x00000420L, 0x08010400L, 0x08010400L, 0x08000420L, 0x00010020L, - 0x08010000L, 0x00010000L, 0x00000020L, 0x08000020L, 0x08000400L, - 0x08000000L, 0x00010400L, 0x08010420L, 0x00000000L, 0x00010420L, - 0x08000000L, 0x00000400L, 0x00010020L, 0x08000420L, 0x00000400L, - 0x00000000L, 0x08010420L, 0x08010020L, 0x08010400L, 0x00000420L, - 0x00010000L, 0x00010400L, 0x08010020L, 0x08000400L, 0x00000420L, - 0x00000020L, 0x00010420L, 0x08010000L, 0x08000020L, }, - { // nibble 5 - 0x80000040L, 0x00200040L, 0x00000000L, 0x80202000L, 0x00200040L, 
- 0x00002000L, 0x80002040L, 0x00200000L, 0x00002040L, 0x80202040L, - 0x00202000L, 0x80000000L, 0x80002000L, 0x80000040L, 0x80200000L, - 0x00202040L, 0x00200000L, 0x80002040L, 0x80200040L, 0x00000000L, - 0x00002000L, 0x00000040L, 0x80202000L, 0x80200040L, 0x80202040L, - 0x80200000L, 0x80000000L, 0x00002040L, 0x00000040L, 0x00202000L, - 0x00202040L, 0x80002000L, 0x00002040L, 0x80000000L, 0x80002000L, - 0x00202040L, 0x80202000L, 0x00200040L, 0x00000000L, 0x80002000L, - 0x80000000L, 0x00002000L, 0x80200040L, 0x00200000L, 0x00200040L, - 0x80202040L, 0x00202000L, 0x00000040L, 0x80202040L, 0x00202000L, - 0x00200000L, 0x80002040L, 0x80000040L, 0x80200000L, 0x00202040L, - 0x00000000L, 0x00002000L, 0x80000040L, 0x80002040L, 0x80202000L, - 0x80200000L, 0x00002040L, 0x00000040L, 0x80200040L, }, - { // nibble 6 - 0x00004000L, 0x00000200L, 0x01000200L, 0x01000004L, 0x01004204L, - 0x00004004L, 0x00004200L, 0x00000000L, 0x01000000L, 0x01000204L, - 0x00000204L, 0x01004000L, 0x00000004L, 0x01004200L, 0x01004000L, - 0x00000204L, 0x01000204L, 0x00004000L, 0x00004004L, 0x01004204L, - 0x00000000L, 0x01000200L, 0x01000004L, 0x00004200L, 0x01004004L, - 0x00004204L, 0x01004200L, 0x00000004L, 0x00004204L, 0x01004004L, - 0x00000200L, 0x01000000L, 0x00004204L, 0x01004000L, 0x01004004L, - 0x00000204L, 0x00004000L, 0x00000200L, 0x01000000L, 0x01004004L, - 0x01000204L, 0x00004204L, 0x00004200L, 0x00000000L, 0x00000200L, - 0x01000004L, 0x00000004L, 0x01000200L, 0x00000000L, 0x01000204L, - 0x01000200L, 0x00004200L, 0x00000204L, 0x00004000L, 0x01004204L, - 0x01000000L, 0x01004200L, 0x00000004L, 0x00004004L, 0x01004204L, - 0x01000004L, 0x01004200L, 0x01004000L, 0x00004004L, }, - { // nibble 7 - 0x20800080L, 0x20820000L, 0x00020080L, 0x00000000L, 0x20020000L, - 0x00800080L, 0x20800000L, 0x20820080L, 0x00000080L, 0x20000000L, - 0x00820000L, 0x00020080L, 0x00820080L, 0x20020080L, 0x20000080L, - 0x20800000L, 0x00020000L, 0x00820080L, 0x00800080L, 0x20020000L, - 0x20820080L, 0x20000080L, 0x00000000L, 
0x00820000L, 0x20000000L, - 0x00800000L, 0x20020080L, 0x20800080L, 0x00800000L, 0x00020000L, - 0x20820000L, 0x00000080L, 0x00800000L, 0x00020000L, 0x20000080L, - 0x20820080L, 0x00020080L, 0x20000000L, 0x00000000L, 0x00820000L, - 0x20800080L, 0x20020080L, 0x20020000L, 0x00800080L, 0x20820000L, - 0x00000080L, 0x00800080L, 0x20020000L, 0x20820080L, 0x00800000L, - 0x20800000L, 0x20000080L, 0x00820000L, 0x00020080L, 0x20020080L, - 0x20800000L, 0x00000080L, 0x20820000L, 0x00820080L, 0x00000000L, - 0x20000000L, 0x20800080L, 0x00020000L, 0x00820080L, }}; - -#define HPERM_OP(a, t, n, m) \ - ((t) = ((((a) << (16 - (n))) ^ (a)) & (m)), \ - (a) = (a) ^ (t) ^ ((t) >> (16 - (n)))) - -void DES_set_key(const DES_cblock *key, DES_key_schedule *schedule) { - static const int shifts2[16] = {0, 0, 1, 1, 1, 1, 1, 1, - 0, 1, 1, 1, 1, 1, 1, 0}; - uint32_t c, d, t, s, t2; - const uint8_t *in; - int i; - - in = key->bytes; - - c2l(in, c); - c2l(in, d); - - // do PC1 in 47 simple operations :-) - // Thanks to John Fletcher (john_fletcher@lccmail.ocf.llnl.gov) - // for the inspiration. 
:-) - PERM_OP(d, c, t, 4, 0x0f0f0f0fL); - HPERM_OP(c, t, -2, 0xcccc0000L); - HPERM_OP(d, t, -2, 0xcccc0000L); - PERM_OP(d, c, t, 1, 0x55555555L); - PERM_OP(c, d, t, 8, 0x00ff00ffL); - PERM_OP(d, c, t, 1, 0x55555555L); - d = (((d & 0x000000ffL) << 16L) | (d & 0x0000ff00L) | - ((d & 0x00ff0000L) >> 16L) | ((c & 0xf0000000L) >> 4L)); - c &= 0x0fffffffL; - - for (i = 0; i < ITERATIONS; i++) { - if (shifts2[i]) { - c = ((c >> 2L) | (c << 26L)); - d = ((d >> 2L) | (d << 26L)); - } else { - c = ((c >> 1L) | (c << 27L)); - d = ((d >> 1L) | (d << 27L)); - } - c &= 0x0fffffffL; - d &= 0x0fffffffL; - // could be a few less shifts but I am to lazy at this - // point in time to investigate - s = des_skb[0][(c) & 0x3f] | - des_skb[1][((c >> 6L) & 0x03) | ((c >> 7L) & 0x3c)] | - des_skb[2][((c >> 13L) & 0x0f) | ((c >> 14L) & 0x30)] | - des_skb[3][((c >> 20L) & 0x01) | ((c >> 21L) & 0x06) | - ((c >> 22L) & 0x38)]; - t = des_skb[4][(d) & 0x3f] | - des_skb[5][((d >> 7L) & 0x03) | ((d >> 8L) & 0x3c)] | - des_skb[6][(d >> 15L) & 0x3f] | - des_skb[7][((d >> 21L) & 0x0f) | ((d >> 22L) & 0x30)]; - - // table contained 0213 4657 - t2 = ((t << 16L) | (s & 0x0000ffffL)) & 0xffffffffL; - schedule->subkeys[i][0] = CRYPTO_rotr_u32(t2, 30); - - t2 = ((s >> 16L) | (t & 0xffff0000L)); - schedule->subkeys[i][1] = CRYPTO_rotr_u32(t2, 26); - } -} - -static const uint8_t kOddParity[256] = { - 1, 1, 2, 2, 4, 4, 7, 7, 8, 8, 11, 11, 13, 13, 14, - 14, 16, 16, 19, 19, 21, 21, 22, 22, 25, 25, 26, 26, 28, 28, - 31, 31, 32, 32, 35, 35, 37, 37, 38, 38, 41, 41, 42, 42, 44, - 44, 47, 47, 49, 49, 50, 50, 52, 52, 55, 55, 56, 56, 59, 59, - 61, 61, 62, 62, 64, 64, 67, 67, 69, 69, 70, 70, 73, 73, 74, - 74, 76, 76, 79, 79, 81, 81, 82, 82, 84, 84, 87, 87, 88, 88, - 91, 91, 93, 93, 94, 94, 97, 97, 98, 98, 100, 100, 103, 103, 104, - 104, 107, 107, 109, 109, 110, 110, 112, 112, 115, 115, 117, 117, 118, 118, - 121, 121, 122, 122, 124, 124, 127, 127, 128, 128, 131, 131, 133, 133, 134, - 134, 137, 137, 138, 138, 140, 140, 
143, 143, 145, 145, 146, 146, 148, 148, - 151, 151, 152, 152, 155, 155, 157, 157, 158, 158, 161, 161, 162, 162, 164, - 164, 167, 167, 168, 168, 171, 171, 173, 173, 174, 174, 176, 176, 179, 179, - 181, 181, 182, 182, 185, 185, 186, 186, 188, 188, 191, 191, 193, 193, 194, - 194, 196, 196, 199, 199, 200, 200, 203, 203, 205, 205, 206, 206, 208, 208, - 211, 211, 213, 213, 214, 214, 217, 217, 218, 218, 220, 220, 223, 223, 224, - 224, 227, 227, 229, 229, 230, 230, 233, 233, 234, 234, 236, 236, 239, 239, - 241, 241, 242, 242, 244, 244, 247, 247, 248, 248, 251, 251, 253, 253, 254, - 254 -}; - -void DES_set_odd_parity(DES_cblock *key) { - unsigned i; - - for (i = 0; i < DES_KEY_SZ; i++) { - key->bytes[i] = kOddParity[key->bytes[i]]; - } -} - -static void DES_encrypt1(uint32_t *data, const DES_key_schedule *ks, int enc) { - uint32_t l, r, t, u; - - r = data[0]; - l = data[1]; - - IP(r, l); - // Things have been modified so that the initial rotate is done outside - // the loop. This required the DES_SPtrans values in sp.h to be - // rotated 1 bit to the right. One perl script later and things have a - // 5% speed up on a sparc2. Thanks to Richard Outerbridge - // <71755.204@CompuServe.COM> for pointing this out. 
- // clear the top bits on machines with 8byte longs - // shift left by 2 - r = CRYPTO_rotr_u32(r, 29); - l = CRYPTO_rotr_u32(l, 29); - - // I don't know if it is worth the effort of loop unrolling the - // inner loop - if (enc) { - D_ENCRYPT(ks, l, r, 0); - D_ENCRYPT(ks, r, l, 1); - D_ENCRYPT(ks, l, r, 2); - D_ENCRYPT(ks, r, l, 3); - D_ENCRYPT(ks, l, r, 4); - D_ENCRYPT(ks, r, l, 5); - D_ENCRYPT(ks, l, r, 6); - D_ENCRYPT(ks, r, l, 7); - D_ENCRYPT(ks, l, r, 8); - D_ENCRYPT(ks, r, l, 9); - D_ENCRYPT(ks, l, r, 10); - D_ENCRYPT(ks, r, l, 11); - D_ENCRYPT(ks, l, r, 12); - D_ENCRYPT(ks, r, l, 13); - D_ENCRYPT(ks, l, r, 14); - D_ENCRYPT(ks, r, l, 15); - } else { - D_ENCRYPT(ks, l, r, 15); - D_ENCRYPT(ks, r, l, 14); - D_ENCRYPT(ks, l, r, 13); - D_ENCRYPT(ks, r, l, 12); - D_ENCRYPT(ks, l, r, 11); - D_ENCRYPT(ks, r, l, 10); - D_ENCRYPT(ks, l, r, 9); - D_ENCRYPT(ks, r, l, 8); - D_ENCRYPT(ks, l, r, 7); - D_ENCRYPT(ks, r, l, 6); - D_ENCRYPT(ks, l, r, 5); - D_ENCRYPT(ks, r, l, 4); - D_ENCRYPT(ks, l, r, 3); - D_ENCRYPT(ks, r, l, 2); - D_ENCRYPT(ks, l, r, 1); - D_ENCRYPT(ks, r, l, 0); - } - - // rotate and clear the top bits on machines with 8byte longs - l = CRYPTO_rotr_u32(l, 3); - r = CRYPTO_rotr_u32(r, 3); - - FP(r, l); - data[0] = l; - data[1] = r; -} - -static void DES_encrypt2(uint32_t *data, const DES_key_schedule *ks, int enc) { - uint32_t l, r, t, u; - - r = data[0]; - l = data[1]; - - // Things have been modified so that the initial rotate is done outside the - // loop. This required the DES_SPtrans values in sp.h to be rotated 1 bit to - // the right. One perl script later and things have a 5% speed up on a - // sparc2. Thanks to Richard Outerbridge <71755.204@CompuServe.COM> for - // pointing this out. 
- // clear the top bits on machines with 8byte longs - r = CRYPTO_rotr_u32(r, 29); - l = CRYPTO_rotr_u32(l, 29); - - // I don't know if it is worth the effort of loop unrolling the - // inner loop - if (enc) { - D_ENCRYPT(ks, l, r, 0); - D_ENCRYPT(ks, r, l, 1); - D_ENCRYPT(ks, l, r, 2); - D_ENCRYPT(ks, r, l, 3); - D_ENCRYPT(ks, l, r, 4); - D_ENCRYPT(ks, r, l, 5); - D_ENCRYPT(ks, l, r, 6); - D_ENCRYPT(ks, r, l, 7); - D_ENCRYPT(ks, l, r, 8); - D_ENCRYPT(ks, r, l, 9); - D_ENCRYPT(ks, l, r, 10); - D_ENCRYPT(ks, r, l, 11); - D_ENCRYPT(ks, l, r, 12); - D_ENCRYPT(ks, r, l, 13); - D_ENCRYPT(ks, l, r, 14); - D_ENCRYPT(ks, r, l, 15); - } else { - D_ENCRYPT(ks, l, r, 15); - D_ENCRYPT(ks, r, l, 14); - D_ENCRYPT(ks, l, r, 13); - D_ENCRYPT(ks, r, l, 12); - D_ENCRYPT(ks, l, r, 11); - D_ENCRYPT(ks, r, l, 10); - D_ENCRYPT(ks, l, r, 9); - D_ENCRYPT(ks, r, l, 8); - D_ENCRYPT(ks, l, r, 7); - D_ENCRYPT(ks, r, l, 6); - D_ENCRYPT(ks, l, r, 5); - D_ENCRYPT(ks, r, l, 4); - D_ENCRYPT(ks, l, r, 3); - D_ENCRYPT(ks, r, l, 2); - D_ENCRYPT(ks, l, r, 1); - D_ENCRYPT(ks, r, l, 0); - } - // rotate and clear the top bits on machines with 8byte longs - data[0] = CRYPTO_rotr_u32(l, 3); - data[1] = CRYPTO_rotr_u32(r, 3); -} - -void DES_encrypt3(uint32_t *data, const DES_key_schedule *ks1, - const DES_key_schedule *ks2, const DES_key_schedule *ks3) { - uint32_t l, r; - - l = data[0]; - r = data[1]; - IP(l, r); - data[0] = l; - data[1] = r; - DES_encrypt2((uint32_t *)data, ks1, DES_ENCRYPT); - DES_encrypt2((uint32_t *)data, ks2, DES_DECRYPT); - DES_encrypt2((uint32_t *)data, ks3, DES_ENCRYPT); - l = data[0]; - r = data[1]; - FP(r, l); - data[0] = l; - data[1] = r; -} - -void DES_decrypt3(uint32_t *data, const DES_key_schedule *ks1, - const DES_key_schedule *ks2, const DES_key_schedule *ks3) { - uint32_t l, r; - - l = data[0]; - r = data[1]; - IP(l, r); - data[0] = l; - data[1] = r; - DES_encrypt2((uint32_t *)data, ks3, DES_DECRYPT); - DES_encrypt2((uint32_t *)data, ks2, DES_ENCRYPT); - 
DES_encrypt2((uint32_t *)data, ks1, DES_DECRYPT); - l = data[0]; - r = data[1]; - FP(r, l); - data[0] = l; - data[1] = r; -} - -void DES_ecb_encrypt(const DES_cblock *in_block, DES_cblock *out_block, - const DES_key_schedule *schedule, int is_encrypt) { - uint32_t l; - uint32_t ll[2]; - const uint8_t *in = in_block->bytes; - uint8_t *out = out_block->bytes; - - c2l(in, l); - ll[0] = l; - c2l(in, l); - ll[1] = l; - DES_encrypt1(ll, schedule, is_encrypt); - l = ll[0]; - l2c(l, out); - l = ll[1]; - l2c(l, out); - ll[0] = ll[1] = 0; -} - -void DES_ncbc_encrypt(const uint8_t *in, uint8_t *out, size_t len, - const DES_key_schedule *schedule, DES_cblock *ivec, - int enc) { - uint32_t tin0, tin1; - uint32_t tout0, tout1, xor0, xor1; - uint32_t tin[2]; - unsigned char *iv; - - iv = ivec->bytes; - - if (enc) { - c2l(iv, tout0); - c2l(iv, tout1); - for (; len >= 8; len -= 8) { - c2l(in, tin0); - c2l(in, tin1); - tin0 ^= tout0; - tin[0] = tin0; - tin1 ^= tout1; - tin[1] = tin1; - DES_encrypt1((uint32_t *)tin, schedule, DES_ENCRYPT); - tout0 = tin[0]; - l2c(tout0, out); - tout1 = tin[1]; - l2c(tout1, out); - } - if (len != 0) { - c2ln(in, tin0, tin1, len); - tin0 ^= tout0; - tin[0] = tin0; - tin1 ^= tout1; - tin[1] = tin1; - DES_encrypt1((uint32_t *)tin, schedule, DES_ENCRYPT); - tout0 = tin[0]; - l2c(tout0, out); - tout1 = tin[1]; - l2c(tout1, out); - } - iv = ivec->bytes; - l2c(tout0, iv); - l2c(tout1, iv); - } else { - c2l(iv, xor0); - c2l(iv, xor1); - for (; len >= 8; len -= 8) { - c2l(in, tin0); - tin[0] = tin0; - c2l(in, tin1); - tin[1] = tin1; - DES_encrypt1((uint32_t *)tin, schedule, DES_DECRYPT); - tout0 = tin[0] ^ xor0; - tout1 = tin[1] ^ xor1; - l2c(tout0, out); - l2c(tout1, out); - xor0 = tin0; - xor1 = tin1; - } - if (len != 0) { - c2l(in, tin0); - tin[0] = tin0; - c2l(in, tin1); - tin[1] = tin1; - DES_encrypt1((uint32_t *)tin, schedule, DES_DECRYPT); - tout0 = tin[0] ^ xor0; - tout1 = tin[1] ^ xor1; - l2cn(tout0, tout1, out, len); - xor0 = tin0; - xor1 = tin1; - } 
- iv = ivec->bytes; - l2c(xor0, iv); - l2c(xor1, iv); - } - tin[0] = tin[1] = 0; -} - -void DES_ecb3_encrypt(const DES_cblock *input, DES_cblock *output, - const DES_key_schedule *ks1, const DES_key_schedule *ks2, - const DES_key_schedule *ks3, int enc) { - uint32_t l0, l1; - uint32_t ll[2]; - const uint8_t *in = input->bytes; - uint8_t *out = output->bytes; - - c2l(in, l0); - c2l(in, l1); - ll[0] = l0; - ll[1] = l1; - if (enc) { - DES_encrypt3(ll, ks1, ks2, ks3); - } else { - DES_decrypt3(ll, ks1, ks2, ks3); - } - l0 = ll[0]; - l1 = ll[1]; - l2c(l0, out); - l2c(l1, out); -} - -void DES_ede3_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t len, - const DES_key_schedule *ks1, - const DES_key_schedule *ks2, - const DES_key_schedule *ks3, DES_cblock *ivec, - int enc) { - uint32_t tin0, tin1; - uint32_t tout0, tout1, xor0, xor1; - uint32_t tin[2]; - uint8_t *iv; - - iv = ivec->bytes; - - if (enc) { - c2l(iv, tout0); - c2l(iv, tout1); - for (; len >= 8; len -= 8) { - c2l(in, tin0); - c2l(in, tin1); - tin0 ^= tout0; - tin1 ^= tout1; - - tin[0] = tin0; - tin[1] = tin1; - DES_encrypt3((uint32_t *)tin, ks1, ks2, ks3); - tout0 = tin[0]; - tout1 = tin[1]; - - l2c(tout0, out); - l2c(tout1, out); - } - if (len != 0) { - c2ln(in, tin0, tin1, len); - tin0 ^= tout0; - tin1 ^= tout1; - - tin[0] = tin0; - tin[1] = tin1; - DES_encrypt3((uint32_t *)tin, ks1, ks2, ks3); - tout0 = tin[0]; - tout1 = tin[1]; - - l2c(tout0, out); - l2c(tout1, out); - } - iv = ivec->bytes; - l2c(tout0, iv); - l2c(tout1, iv); - } else { - uint32_t t0, t1; - - c2l(iv, xor0); - c2l(iv, xor1); - for (; len >= 8; len -= 8) { - c2l(in, tin0); - c2l(in, tin1); - - t0 = tin0; - t1 = tin1; - - tin[0] = tin0; - tin[1] = tin1; - DES_decrypt3((uint32_t *)tin, ks1, ks2, ks3); - tout0 = tin[0]; - tout1 = tin[1]; - - tout0 ^= xor0; - tout1 ^= xor1; - l2c(tout0, out); - l2c(tout1, out); - xor0 = t0; - xor1 = t1; - } - if (len != 0) { - c2l(in, tin0); - c2l(in, tin1); - - t0 = tin0; - t1 = tin1; - - tin[0] = tin0; - 
tin[1] = tin1; - DES_decrypt3((uint32_t *)tin, ks1, ks2, ks3); - tout0 = tin[0]; - tout1 = tin[1]; - - tout0 ^= xor0; - tout1 ^= xor1; - l2cn(tout0, tout1, out, len); - xor0 = t0; - xor1 = t1; - } - - iv = ivec->bytes; - l2c(xor0, iv); - l2c(xor1, iv); - } - - tin[0] = tin[1] = 0; -} - -void DES_ede2_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t len, - const DES_key_schedule *ks1, - const DES_key_schedule *ks2, - DES_cblock *ivec, - int enc) { - DES_ede3_cbc_encrypt(in, out, len, ks1, ks2, ks1, ivec, enc); -} - - -// Deprecated functions. - -void DES_set_key_unchecked(const DES_cblock *key, DES_key_schedule *schedule) { - DES_set_key(key, schedule); -} - -#undef HPERM_OP -#undef c2l -#undef l2c -#undef c2ln -#undef l2cn -#undef PERM_OP -#undef IP -#undef FP -#undef LOAD_DATA -#undef D_ENCRYPT -#undef ITERATIONS -#undef HALF_ITERATIONS diff --git a/third_party/boringssl/src/crypto/des/des.cc b/third_party/boringssl/src/crypto/des/des.cc new file mode 100644 index 00000000..4620b8c2 --- /dev/null +++ b/third_party/boringssl/src/crypto/des/des.cc @@ -0,0 +1,837 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + +#include "internal.h" + + +using namespace bssl; + +/* IP and FP + * The problem is more of a geometric problem that random bit fiddling. 
+ 0 1 2 3 4 5 6 7 62 54 46 38 30 22 14 6 + 8 9 10 11 12 13 14 15 60 52 44 36 28 20 12 4 +16 17 18 19 20 21 22 23 58 50 42 34 26 18 10 2 +24 25 26 27 28 29 30 31 to 56 48 40 32 24 16 8 0 + +32 33 34 35 36 37 38 39 63 55 47 39 31 23 15 7 +40 41 42 43 44 45 46 47 61 53 45 37 29 21 13 5 +48 49 50 51 52 53 54 55 59 51 43 35 27 19 11 3 +56 57 58 59 60 61 62 63 57 49 41 33 25 17 9 1 + +The output has been subject to swaps of the form +0 1 -> 3 1 but the odd and even bits have been put into +2 3 2 0 +different words. The main trick is to remember that +t=((l>>size)^r)&(mask); +r^=t; +l^=(t<> (n)) ^ (b)) & (m)); \ + (b) ^= (t); \ + (a) ^= ((t) << (n)); \ + } while (0) + +#define IP(l, r) \ + do { \ + uint32_t tt; \ + PERM_OP(r, l, tt, 4, 0x0f0f0f0fL); \ + PERM_OP(l, r, tt, 16, 0x0000ffffL); \ + PERM_OP(r, l, tt, 2, 0x33333333L); \ + PERM_OP(l, r, tt, 8, 0x00ff00ffL); \ + PERM_OP(r, l, tt, 1, 0x55555555L); \ + } while (0) + +#define FP(l, r) \ + do { \ + uint32_t tt; \ + PERM_OP(l, r, tt, 1, 0x55555555L); \ + PERM_OP(r, l, tt, 8, 0x00ff00ffL); \ + PERM_OP(l, r, tt, 2, 0x33333333L); \ + PERM_OP(r, l, tt, 16, 0x0000ffffL); \ + PERM_OP(l, r, tt, 4, 0x0f0f0f0fL); \ + } while (0) + +#define LOAD_DATA(ks, R, S, u, t, E0, E1) \ + do { \ + (u) = (R) ^ (ks)->subkeys[S][0]; \ + (t) = (R) ^ (ks)->subkeys[S][1]; \ + } while (0) + +#define D_ENCRYPT(ks, LL, R, S) \ + do { \ + LOAD_DATA(ks, R, S, u, t, E0, E1); \ + t = CRYPTO_rotr_u32(t, 4); \ + (LL) ^= \ + DES_SPtrans[0][(u >> 2L) & 0x3f] ^ DES_SPtrans[2][(u >> 10L) & 0x3f] ^ \ + DES_SPtrans[4][(u >> 18L) & 0x3f] ^ \ + DES_SPtrans[6][(u >> 26L) & 0x3f] ^ DES_SPtrans[1][(t >> 2L) & 0x3f] ^ \ + DES_SPtrans[3][(t >> 10L) & 0x3f] ^ \ + DES_SPtrans[5][(t >> 18L) & 0x3f] ^ DES_SPtrans[7][(t >> 26L) & 0x3f]; \ + } while (0) + +#define ITERATIONS 16 +#define HALF_ITERATIONS 8 + +static const uint32_t des_skb[8][64] = { + { // for C bits (numbered as per FIPS 46) 1 2 3 4 5 6 + 0x00000000, 0x00000010, 0x20000000, 0x20000010, 0x00010000, + 
0x00010010, 0x20010000, 0x20010010, 0x00000800, 0x00000810, + 0x20000800, 0x20000810, 0x00010800, 0x00010810, 0x20010800, + 0x20010810, 0x00000020, 0x00000030, 0x20000020, 0x20000030, + 0x00010020, 0x00010030, 0x20010020, 0x20010030, 0x00000820, + 0x00000830, 0x20000820, 0x20000830, 0x00010820, 0x00010830, + 0x20010820, 0x20010830, 0x00080000, 0x00080010, 0x20080000, + 0x20080010, 0x00090000, 0x00090010, 0x20090000, 0x20090010, + 0x00080800, 0x00080810, 0x20080800, 0x20080810, 0x00090800, + 0x00090810, 0x20090800, 0x20090810, 0x00080020, 0x00080030, + 0x20080020, 0x20080030, 0x00090020, 0x00090030, 0x20090020, + 0x20090030, 0x00080820, 0x00080830, 0x20080820, 0x20080830, + 0x00090820, 0x00090830, 0x20090820, 0x20090830, }, + { // for C bits (numbered as per FIPS 46) 7 8 10 11 12 13 + 0x00000000, 0x02000000, 0x00002000, 0x02002000, 0x00200000, + 0x02200000, 0x00202000, 0x02202000, 0x00000004, 0x02000004, + 0x00002004, 0x02002004, 0x00200004, 0x02200004, 0x00202004, + 0x02202004, 0x00000400, 0x02000400, 0x00002400, 0x02002400, + 0x00200400, 0x02200400, 0x00202400, 0x02202400, 0x00000404, + 0x02000404, 0x00002404, 0x02002404, 0x00200404, 0x02200404, + 0x00202404, 0x02202404, 0x10000000, 0x12000000, 0x10002000, + 0x12002000, 0x10200000, 0x12200000, 0x10202000, 0x12202000, + 0x10000004, 0x12000004, 0x10002004, 0x12002004, 0x10200004, + 0x12200004, 0x10202004, 0x12202004, 0x10000400, 0x12000400, + 0x10002400, 0x12002400, 0x10200400, 0x12200400, 0x10202400, + 0x12202400, 0x10000404, 0x12000404, 0x10002404, 0x12002404, + 0x10200404, 0x12200404, 0x10202404, 0x12202404, }, + { // for C bits (numbered as per FIPS 46) 14 15 16 17 19 20 + 0x00000000, 0x00000001, 0x00040000, 0x00040001, 0x01000000, + 0x01000001, 0x01040000, 0x01040001, 0x00000002, 0x00000003, + 0x00040002, 0x00040003, 0x01000002, 0x01000003, 0x01040002, + 0x01040003, 0x00000200, 0x00000201, 0x00040200, 0x00040201, + 0x01000200, 0x01000201, 0x01040200, 0x01040201, 0x00000202, + 0x00000203, 0x00040202, 0x00040203, 
0x01000202, 0x01000203, + 0x01040202, 0x01040203, 0x08000000, 0x08000001, 0x08040000, + 0x08040001, 0x09000000, 0x09000001, 0x09040000, 0x09040001, + 0x08000002, 0x08000003, 0x08040002, 0x08040003, 0x09000002, + 0x09000003, 0x09040002, 0x09040003, 0x08000200, 0x08000201, + 0x08040200, 0x08040201, 0x09000200, 0x09000201, 0x09040200, + 0x09040201, 0x08000202, 0x08000203, 0x08040202, 0x08040203, + 0x09000202, 0x09000203, 0x09040202, 0x09040203, }, + { // for C bits (numbered as per FIPS 46) 21 23 24 26 27 28 + 0x00000000, 0x00100000, 0x00000100, 0x00100100, 0x00000008, + 0x00100008, 0x00000108, 0x00100108, 0x00001000, 0x00101000, + 0x00001100, 0x00101100, 0x00001008, 0x00101008, 0x00001108, + 0x00101108, 0x04000000, 0x04100000, 0x04000100, 0x04100100, + 0x04000008, 0x04100008, 0x04000108, 0x04100108, 0x04001000, + 0x04101000, 0x04001100, 0x04101100, 0x04001008, 0x04101008, + 0x04001108, 0x04101108, 0x00020000, 0x00120000, 0x00020100, + 0x00120100, 0x00020008, 0x00120008, 0x00020108, 0x00120108, + 0x00021000, 0x00121000, 0x00021100, 0x00121100, 0x00021008, + 0x00121008, 0x00021108, 0x00121108, 0x04020000, 0x04120000, + 0x04020100, 0x04120100, 0x04020008, 0x04120008, 0x04020108, + 0x04120108, 0x04021000, 0x04121000, 0x04021100, 0x04121100, + 0x04021008, 0x04121008, 0x04021108, 0x04121108, }, + { // for D bits (numbered as per FIPS 46) 1 2 3 4 5 6 + 0x00000000, 0x10000000, 0x00010000, 0x10010000, 0x00000004, + 0x10000004, 0x00010004, 0x10010004, 0x20000000, 0x30000000, + 0x20010000, 0x30010000, 0x20000004, 0x30000004, 0x20010004, + 0x30010004, 0x00100000, 0x10100000, 0x00110000, 0x10110000, + 0x00100004, 0x10100004, 0x00110004, 0x10110004, 0x20100000, + 0x30100000, 0x20110000, 0x30110000, 0x20100004, 0x30100004, + 0x20110004, 0x30110004, 0x00001000, 0x10001000, 0x00011000, + 0x10011000, 0x00001004, 0x10001004, 0x00011004, 0x10011004, + 0x20001000, 0x30001000, 0x20011000, 0x30011000, 0x20001004, + 0x30001004, 0x20011004, 0x30011004, 0x00101000, 0x10101000, + 0x00111000, 
0x10111000, 0x00101004, 0x10101004, 0x00111004, + 0x10111004, 0x20101000, 0x30101000, 0x20111000, 0x30111000, + 0x20101004, 0x30101004, 0x20111004, 0x30111004, }, + { // for D bits (numbered as per FIPS 46) 8 9 11 12 13 14 + 0x00000000, 0x08000000, 0x00000008, 0x08000008, 0x00000400, + 0x08000400, 0x00000408, 0x08000408, 0x00020000, 0x08020000, + 0x00020008, 0x08020008, 0x00020400, 0x08020400, 0x00020408, + 0x08020408, 0x00000001, 0x08000001, 0x00000009, 0x08000009, + 0x00000401, 0x08000401, 0x00000409, 0x08000409, 0x00020001, + 0x08020001, 0x00020009, 0x08020009, 0x00020401, 0x08020401, + 0x00020409, 0x08020409, 0x02000000, 0x0A000000, 0x02000008, + 0x0A000008, 0x02000400, 0x0A000400, 0x02000408, 0x0A000408, + 0x02020000, 0x0A020000, 0x02020008, 0x0A020008, 0x02020400, + 0x0A020400, 0x02020408, 0x0A020408, 0x02000001, 0x0A000001, + 0x02000009, 0x0A000009, 0x02000401, 0x0A000401, 0x02000409, + 0x0A000409, 0x02020001, 0x0A020001, 0x02020009, 0x0A020009, + 0x02020401, 0x0A020401, 0x02020409, 0x0A020409, }, + { // for D bits (numbered as per FIPS 46) 16 17 18 19 20 21 + 0x00000000, 0x00000100, 0x00080000, 0x00080100, 0x01000000, + 0x01000100, 0x01080000, 0x01080100, 0x00000010, 0x00000110, + 0x00080010, 0x00080110, 0x01000010, 0x01000110, 0x01080010, + 0x01080110, 0x00200000, 0x00200100, 0x00280000, 0x00280100, + 0x01200000, 0x01200100, 0x01280000, 0x01280100, 0x00200010, + 0x00200110, 0x00280010, 0x00280110, 0x01200010, 0x01200110, + 0x01280010, 0x01280110, 0x00000200, 0x00000300, 0x00080200, + 0x00080300, 0x01000200, 0x01000300, 0x01080200, 0x01080300, + 0x00000210, 0x00000310, 0x00080210, 0x00080310, 0x01000210, + 0x01000310, 0x01080210, 0x01080310, 0x00200200, 0x00200300, + 0x00280200, 0x00280300, 0x01200200, 0x01200300, 0x01280200, + 0x01280300, 0x00200210, 0x00200310, 0x00280210, 0x00280310, + 0x01200210, 0x01200310, 0x01280210, 0x01280310, }, + { // for D bits (numbered as per FIPS 46) 22 23 24 25 27 28 + 0x00000000, 0x04000000, 0x00040000, 0x04040000, 
0x00000002, + 0x04000002, 0x00040002, 0x04040002, 0x00002000, 0x04002000, + 0x00042000, 0x04042000, 0x00002002, 0x04002002, 0x00042002, + 0x04042002, 0x00000020, 0x04000020, 0x00040020, 0x04040020, + 0x00000022, 0x04000022, 0x00040022, 0x04040022, 0x00002020, + 0x04002020, 0x00042020, 0x04042020, 0x00002022, 0x04002022, + 0x00042022, 0x04042022, 0x00000800, 0x04000800, 0x00040800, + 0x04040800, 0x00000802, 0x04000802, 0x00040802, 0x04040802, + 0x00002800, 0x04002800, 0x00042800, 0x04042800, 0x00002802, + 0x04002802, 0x00042802, 0x04042802, 0x00000820, 0x04000820, + 0x00040820, 0x04040820, 0x00000822, 0x04000822, 0x00040822, + 0x04040822, 0x00002820, 0x04002820, 0x00042820, 0x04042820, + 0x00002822, 0x04002822, 0x00042822, 0x04042822, }}; + +static const uint32_t DES_SPtrans[8][64] = { + { // nibble 0 + 0x02080800, 0x00080000, 0x02000002, 0x02080802, 0x02000000, + 0x00080802, 0x00080002, 0x02000002, 0x00080802, 0x02080800, + 0x02080000, 0x00000802, 0x02000802, 0x02000000, 0x00000000, + 0x00080002, 0x00080000, 0x00000002, 0x02000800, 0x00080800, + 0x02080802, 0x02080000, 0x00000802, 0x02000800, 0x00000002, + 0x00000800, 0x00080800, 0x02080002, 0x00000800, 0x02000802, + 0x02080002, 0x00000000, 0x00000000, 0x02080802, 0x02000800, + 0x00080002, 0x02080800, 0x00080000, 0x00000802, 0x02000800, + 0x02080002, 0x00000800, 0x00080800, 0x02000002, 0x00080802, + 0x00000002, 0x02000002, 0x02080000, 0x02080802, 0x00080800, + 0x02080000, 0x02000802, 0x02000000, 0x00000802, 0x00080002, + 0x00000000, 0x00080000, 0x02000000, 0x02000802, 0x02080800, + 0x00000002, 0x02080002, 0x00000800, 0x00080802, }, + { // nibble 1 + 0x40108010, 0x00000000, 0x00108000, 0x40100000, 0x40000010, + 0x00008010, 0x40008000, 0x00108000, 0x00008000, 0x40100010, + 0x00000010, 0x40008000, 0x00100010, 0x40108000, 0x40100000, + 0x00000010, 0x00100000, 0x40008010, 0x40100010, 0x00008000, + 0x00108010, 0x40000000, 0x00000000, 0x00100010, 0x40008010, + 0x00108010, 0x40108000, 0x40000010, 0x40000000, 0x00100000, + 
0x00008010, 0x40108010, 0x00100010, 0x40108000, 0x40008000, + 0x00108010, 0x40108010, 0x00100010, 0x40000010, 0x00000000, + 0x40000000, 0x00008010, 0x00100000, 0x40100010, 0x00008000, + 0x40000000, 0x00108010, 0x40008010, 0x40108000, 0x00008000, + 0x00000000, 0x40000010, 0x00000010, 0x40108010, 0x00108000, + 0x40100000, 0x40100010, 0x00100000, 0x00008010, 0x40008000, + 0x40008010, 0x00000010, 0x40100000, 0x00108000, }, + { // nibble 2 + 0x04000001, 0x04040100, 0x00000100, 0x04000101, 0x00040001, + 0x04000000, 0x04000101, 0x00040100, 0x04000100, 0x00040000, + 0x04040000, 0x00000001, 0x04040101, 0x00000101, 0x00000001, + 0x04040001, 0x00000000, 0x00040001, 0x04040100, 0x00000100, + 0x00000101, 0x04040101, 0x00040000, 0x04000001, 0x04040001, + 0x04000100, 0x00040101, 0x04040000, 0x00040100, 0x00000000, + 0x04000000, 0x00040101, 0x04040100, 0x00000100, 0x00000001, + 0x00040000, 0x00000101, 0x00040001, 0x04040000, 0x04000101, + 0x00000000, 0x04040100, 0x00040100, 0x04040001, 0x00040001, + 0x04000000, 0x04040101, 0x00000001, 0x00040101, 0x04000001, + 0x04000000, 0x04040101, 0x00040000, 0x04000100, 0x04000101, + 0x00040100, 0x04000100, 0x00000000, 0x04040001, 0x00000101, + 0x04000001, 0x00040101, 0x00000100, 0x04040000, }, + { // nibble 3 + 0x00401008, 0x10001000, 0x00000008, 0x10401008, 0x00000000, + 0x10400000, 0x10001008, 0x00400008, 0x10401000, 0x10000008, + 0x10000000, 0x00001008, 0x10000008, 0x00401008, 0x00400000, + 0x10000000, 0x10400008, 0x00401000, 0x00001000, 0x00000008, + 0x00401000, 0x10001008, 0x10400000, 0x00001000, 0x00001008, + 0x00000000, 0x00400008, 0x10401000, 0x10001000, 0x10400008, + 0x10401008, 0x00400000, 0x10400008, 0x00001008, 0x00400000, + 0x10000008, 0x00401000, 0x10001000, 0x00000008, 0x10400000, + 0x10001008, 0x00000000, 0x00001000, 0x00400008, 0x00000000, + 0x10400008, 0x10401000, 0x00001000, 0x10000000, 0x10401008, + 0x00401008, 0x00400000, 0x10401008, 0x00000008, 0x10001000, + 0x00401008, 0x00400008, 0x00401000, 0x10400000, 0x10001008, + 
0x00001008, 0x10000000, 0x10000008, 0x10401000, }, + { // nibble 4 + 0x08000000, 0x00010000, 0x00000400, 0x08010420, 0x08010020, + 0x08000400, 0x00010420, 0x08010000, 0x00010000, 0x00000020, + 0x08000020, 0x00010400, 0x08000420, 0x08010020, 0x08010400, + 0x00000000, 0x00010400, 0x08000000, 0x00010020, 0x00000420, + 0x08000400, 0x00010420, 0x00000000, 0x08000020, 0x00000020, + 0x08000420, 0x08010420, 0x00010020, 0x08010000, 0x00000400, + 0x00000420, 0x08010400, 0x08010400, 0x08000420, 0x00010020, + 0x08010000, 0x00010000, 0x00000020, 0x08000020, 0x08000400, + 0x08000000, 0x00010400, 0x08010420, 0x00000000, 0x00010420, + 0x08000000, 0x00000400, 0x00010020, 0x08000420, 0x00000400, + 0x00000000, 0x08010420, 0x08010020, 0x08010400, 0x00000420, + 0x00010000, 0x00010400, 0x08010020, 0x08000400, 0x00000420, + 0x00000020, 0x00010420, 0x08010000, 0x08000020, }, + { // nibble 5 + 0x80000040, 0x00200040, 0x00000000, 0x80202000, 0x00200040, + 0x00002000, 0x80002040, 0x00200000, 0x00002040, 0x80202040, + 0x00202000, 0x80000000, 0x80002000, 0x80000040, 0x80200000, + 0x00202040, 0x00200000, 0x80002040, 0x80200040, 0x00000000, + 0x00002000, 0x00000040, 0x80202000, 0x80200040, 0x80202040, + 0x80200000, 0x80000000, 0x00002040, 0x00000040, 0x00202000, + 0x00202040, 0x80002000, 0x00002040, 0x80000000, 0x80002000, + 0x00202040, 0x80202000, 0x00200040, 0x00000000, 0x80002000, + 0x80000000, 0x00002000, 0x80200040, 0x00200000, 0x00200040, + 0x80202040, 0x00202000, 0x00000040, 0x80202040, 0x00202000, + 0x00200000, 0x80002040, 0x80000040, 0x80200000, 0x00202040, + 0x00000000, 0x00002000, 0x80000040, 0x80002040, 0x80202000, + 0x80200000, 0x00002040, 0x00000040, 0x80200040, }, + { // nibble 6 + 0x00004000, 0x00000200, 0x01000200, 0x01000004, 0x01004204, + 0x00004004, 0x00004200, 0x00000000, 0x01000000, 0x01000204, + 0x00000204, 0x01004000, 0x00000004, 0x01004200, 0x01004000, + 0x00000204, 0x01000204, 0x00004000, 0x00004004, 0x01004204, + 0x00000000, 0x01000200, 0x01000004, 0x00004200, 
0x01004004, + 0x00004204, 0x01004200, 0x00000004, 0x00004204, 0x01004004, + 0x00000200, 0x01000000, 0x00004204, 0x01004000, 0x01004004, + 0x00000204, 0x00004000, 0x00000200, 0x01000000, 0x01004004, + 0x01000204, 0x00004204, 0x00004200, 0x00000000, 0x00000200, + 0x01000004, 0x00000004, 0x01000200, 0x00000000, 0x01000204, + 0x01000200, 0x00004200, 0x00000204, 0x00004000, 0x01004204, + 0x01000000, 0x01004200, 0x00000004, 0x00004004, 0x01004204, + 0x01000004, 0x01004200, 0x01004000, 0x00004004, }, + { // nibble 7 + 0x20800080, 0x20820000, 0x00020080, 0x00000000, 0x20020000, + 0x00800080, 0x20800000, 0x20820080, 0x00000080, 0x20000000, + 0x00820000, 0x00020080, 0x00820080, 0x20020080, 0x20000080, + 0x20800000, 0x00020000, 0x00820080, 0x00800080, 0x20020000, + 0x20820080, 0x20000080, 0x00000000, 0x00820000, 0x20000000, + 0x00800000, 0x20020080, 0x20800080, 0x00800000, 0x00020000, + 0x20820000, 0x00000080, 0x00800000, 0x00020000, 0x20000080, + 0x20820080, 0x00020080, 0x20000000, 0x00000000, 0x00820000, + 0x20800080, 0x20020080, 0x20020000, 0x00800080, 0x20820000, + 0x00000080, 0x00800080, 0x20020000, 0x20820080, 0x00800000, + 0x20800000, 0x20000080, 0x00820000, 0x00020080, 0x20020080, + 0x20800000, 0x00000080, 0x20820000, 0x00820080, 0x00000000, + 0x20000000, 0x20800080, 0x00020000, 0x00820080, }}; + +#define HPERM_OP(a, t, n, m) \ + ((t) = ((((a) << (16 - (n))) ^ (a)) & (m)), \ + (a) = (a) ^ (t) ^ ((t) >> (16 - (n)))) + +void DES_set_key(const DES_cblock *key, DES_key_schedule *schedule) { + DES_set_key_ex(key->bytes, schedule); +} + +void bssl::DES_set_key_ex(const uint8_t key[8], DES_key_schedule *schedule) { + static const int shifts2[16] = {0, 0, 1, 1, 1, 1, 1, 1, + 0, 1, 1, 1, 1, 1, 1, 0}; + uint32_t c, d, t, s, t2; + const uint8_t *in; + int i; + + in = key; + + c2l(in, c); + c2l(in, d); + + // do PC1 in 47 simple operations :-) + // Thanks to John Fletcher (john_fletcher@lccmail.ocf.llnl.gov) + // for the inspiration. 
:-) + PERM_OP(d, c, t, 4, 0x0f0f0f0f); + HPERM_OP(c, t, -2, 0xcccc0000); + HPERM_OP(d, t, -2, 0xcccc0000); + PERM_OP(d, c, t, 1, 0x55555555); + PERM_OP(c, d, t, 8, 0x00ff00ff); + PERM_OP(d, c, t, 1, 0x55555555); + d = (((d & 0x000000ff) << 16) | (d & 0x0000ff00) | + ((d & 0x00ff0000) >> 16) | ((c & 0xf0000000) >> 4)); + c &= 0x0fffffff; + + for (i = 0; i < ITERATIONS; i++) { + if (shifts2[i]) { + c = ((c >> 2) | (c << 26)); + d = ((d >> 2) | (d << 26)); + } else { + c = ((c >> 1) | (c << 27)); + d = ((d >> 1) | (d << 27)); + } + c &= 0x0fffffff; + d &= 0x0fffffff; + // could be a few less shifts but I am to lazy at this + // point in time to investigate + s = des_skb[0][(c) & 0x3f] | + des_skb[1][((c >> 6) & 0x03) | ((c >> 7) & 0x3c)] | + des_skb[2][((c >> 13) & 0x0f) | ((c >> 14) & 0x30)] | + des_skb[3][((c >> 20) & 0x01) | ((c >> 21) & 0x06) | + ((c >> 22) & 0x38)]; + t = des_skb[4][(d) & 0x3f] | + des_skb[5][((d >> 7) & 0x03) | ((d >> 8) & 0x3c)] | + des_skb[6][(d >> 15) & 0x3f] | + des_skb[7][((d >> 21) & 0x0f) | ((d >> 22) & 0x30)]; + + // table contained 0213 4657 + t2 = ((t << 16) | (s & 0x0000ffff)) & 0xffffffff; + schedule->subkeys[i][0] = CRYPTO_rotr_u32(t2, 30); + + t2 = ((s >> 16) | (t & 0xffff0000)); + schedule->subkeys[i][1] = CRYPTO_rotr_u32(t2, 26); + } +} + +static const uint8_t kOddParity[256] = { + 1, 1, 2, 2, 4, 4, 7, 7, 8, 8, 11, 11, 13, 13, 14, + 14, 16, 16, 19, 19, 21, 21, 22, 22, 25, 25, 26, 26, 28, 28, + 31, 31, 32, 32, 35, 35, 37, 37, 38, 38, 41, 41, 42, 42, 44, + 44, 47, 47, 49, 49, 50, 50, 52, 52, 55, 55, 56, 56, 59, 59, + 61, 61, 62, 62, 64, 64, 67, 67, 69, 69, 70, 70, 73, 73, 74, + 74, 76, 76, 79, 79, 81, 81, 82, 82, 84, 84, 87, 87, 88, 88, + 91, 91, 93, 93, 94, 94, 97, 97, 98, 98, 100, 100, 103, 103, 104, + 104, 107, 107, 109, 109, 110, 110, 112, 112, 115, 115, 117, 117, 118, 118, + 121, 121, 122, 122, 124, 124, 127, 127, 128, 128, 131, 131, 133, 133, 134, + 134, 137, 137, 138, 138, 140, 140, 143, 143, 145, 145, 146, 146, 148, 148, + 
151, 151, 152, 152, 155, 155, 157, 157, 158, 158, 161, 161, 162, 162, 164, + 164, 167, 167, 168, 168, 171, 171, 173, 173, 174, 174, 176, 176, 179, 179, + 181, 181, 182, 182, 185, 185, 186, 186, 188, 188, 191, 191, 193, 193, 194, + 194, 196, 196, 199, 199, 200, 200, 203, 203, 205, 205, 206, 206, 208, 208, + 211, 211, 213, 213, 214, 214, 217, 217, 218, 218, 220, 220, 223, 223, 224, + 224, 227, 227, 229, 229, 230, 230, 233, 233, 234, 234, 236, 236, 239, 239, + 241, 241, 242, 242, 244, 244, 247, 247, 248, 248, 251, 251, 253, 253, 254, + 254 +}; + +void DES_set_odd_parity(DES_cblock *key) { + unsigned i; + + for (i = 0; i < DES_KEY_SZ; i++) { + key->bytes[i] = kOddParity[key->bytes[i]]; + } +} + +static void DES_encrypt1(uint32_t data[2], const DES_key_schedule *ks, + int enc) { + uint32_t l, r, t, u; + + r = data[0]; + l = data[1]; + + IP(r, l); + // Things have been modified so that the initial rotate is done outside + // the loop. This required the DES_SPtrans values in sp.h to be + // rotated 1 bit to the right. One perl script later and things have a + // 5% speed up on a sparc2. Thanks to Richard Outerbridge + // <71755.204@CompuServe.COM> for pointing this out. 
+ // clear the top bits on machines with 8byte longs + // shift left by 2 + r = CRYPTO_rotr_u32(r, 29); + l = CRYPTO_rotr_u32(l, 29); + + // I don't know if it is worth the effort of loop unrolling the + // inner loop + if (enc) { + D_ENCRYPT(ks, l, r, 0); + D_ENCRYPT(ks, r, l, 1); + D_ENCRYPT(ks, l, r, 2); + D_ENCRYPT(ks, r, l, 3); + D_ENCRYPT(ks, l, r, 4); + D_ENCRYPT(ks, r, l, 5); + D_ENCRYPT(ks, l, r, 6); + D_ENCRYPT(ks, r, l, 7); + D_ENCRYPT(ks, l, r, 8); + D_ENCRYPT(ks, r, l, 9); + D_ENCRYPT(ks, l, r, 10); + D_ENCRYPT(ks, r, l, 11); + D_ENCRYPT(ks, l, r, 12); + D_ENCRYPT(ks, r, l, 13); + D_ENCRYPT(ks, l, r, 14); + D_ENCRYPT(ks, r, l, 15); + } else { + D_ENCRYPT(ks, l, r, 15); + D_ENCRYPT(ks, r, l, 14); + D_ENCRYPT(ks, l, r, 13); + D_ENCRYPT(ks, r, l, 12); + D_ENCRYPT(ks, l, r, 11); + D_ENCRYPT(ks, r, l, 10); + D_ENCRYPT(ks, l, r, 9); + D_ENCRYPT(ks, r, l, 8); + D_ENCRYPT(ks, l, r, 7); + D_ENCRYPT(ks, r, l, 6); + D_ENCRYPT(ks, l, r, 5); + D_ENCRYPT(ks, r, l, 4); + D_ENCRYPT(ks, l, r, 3); + D_ENCRYPT(ks, r, l, 2); + D_ENCRYPT(ks, l, r, 1); + D_ENCRYPT(ks, r, l, 0); + } + + // rotate and clear the top bits on machines with 8byte longs + l = CRYPTO_rotr_u32(l, 3); + r = CRYPTO_rotr_u32(r, 3); + + FP(r, l); + data[0] = l; + data[1] = r; +} + +static void DES_encrypt2(uint32_t data[2], const DES_key_schedule *ks, + int enc) { + uint32_t l, r, t, u; + + r = data[0]; + l = data[1]; + + // Things have been modified so that the initial rotate is done outside the + // loop. This required the DES_SPtrans values in sp.h to be rotated 1 bit to + // the right. One perl script later and things have a 5% speed up on a + // sparc2. Thanks to Richard Outerbridge <71755.204@CompuServe.COM> for + // pointing this out. 
+ // clear the top bits on machines with 8byte longs + r = CRYPTO_rotr_u32(r, 29); + l = CRYPTO_rotr_u32(l, 29); + + // I don't know if it is worth the effort of loop unrolling the + // inner loop + if (enc) { + D_ENCRYPT(ks, l, r, 0); + D_ENCRYPT(ks, r, l, 1); + D_ENCRYPT(ks, l, r, 2); + D_ENCRYPT(ks, r, l, 3); + D_ENCRYPT(ks, l, r, 4); + D_ENCRYPT(ks, r, l, 5); + D_ENCRYPT(ks, l, r, 6); + D_ENCRYPT(ks, r, l, 7); + D_ENCRYPT(ks, l, r, 8); + D_ENCRYPT(ks, r, l, 9); + D_ENCRYPT(ks, l, r, 10); + D_ENCRYPT(ks, r, l, 11); + D_ENCRYPT(ks, l, r, 12); + D_ENCRYPT(ks, r, l, 13); + D_ENCRYPT(ks, l, r, 14); + D_ENCRYPT(ks, r, l, 15); + } else { + D_ENCRYPT(ks, l, r, 15); + D_ENCRYPT(ks, r, l, 14); + D_ENCRYPT(ks, l, r, 13); + D_ENCRYPT(ks, r, l, 12); + D_ENCRYPT(ks, l, r, 11); + D_ENCRYPT(ks, r, l, 10); + D_ENCRYPT(ks, l, r, 9); + D_ENCRYPT(ks, r, l, 8); + D_ENCRYPT(ks, l, r, 7); + D_ENCRYPT(ks, r, l, 6); + D_ENCRYPT(ks, l, r, 5); + D_ENCRYPT(ks, r, l, 4); + D_ENCRYPT(ks, l, r, 3); + D_ENCRYPT(ks, r, l, 2); + D_ENCRYPT(ks, l, r, 1); + D_ENCRYPT(ks, r, l, 0); + } + // rotate and clear the top bits on machines with 8byte longs + data[0] = CRYPTO_rotr_u32(l, 3); + data[1] = CRYPTO_rotr_u32(r, 3); +} + +void bssl::DES_encrypt3(uint32_t data[2], const DES_key_schedule *ks1, + const DES_key_schedule *ks2, + const DES_key_schedule *ks3) { + uint32_t l, r; + + l = data[0]; + r = data[1]; + IP(l, r); + data[0] = l; + data[1] = r; + DES_encrypt2(data, ks1, DES_ENCRYPT); + DES_encrypt2(data, ks2, DES_DECRYPT); + DES_encrypt2(data, ks3, DES_ENCRYPT); + l = data[0]; + r = data[1]; + FP(r, l); + data[0] = l; + data[1] = r; +} + +void bssl::DES_decrypt3(uint32_t data[2], const DES_key_schedule *ks1, + const DES_key_schedule *ks2, + const DES_key_schedule *ks3) { + uint32_t l, r; + + l = data[0]; + r = data[1]; + IP(l, r); + data[0] = l; + data[1] = r; + DES_encrypt2(data, ks3, DES_DECRYPT); + DES_encrypt2(data, ks2, DES_ENCRYPT); + DES_encrypt2(data, ks1, DES_DECRYPT); + l = data[0]; + r = 
data[1]; + FP(r, l); + data[0] = l; + data[1] = r; +} + +void DES_ecb_encrypt(const DES_cblock *in_block, DES_cblock *out_block, + const DES_key_schedule *schedule, int is_encrypt) { + DES_ecb_encrypt_ex(in_block->bytes, out_block->bytes, schedule, is_encrypt); +} + +void bssl::DES_ecb_encrypt_ex(const uint8_t in[8], uint8_t out[8], + const DES_key_schedule *schedule, + int is_encrypt) { + uint32_t ll[2]; + ll[0] = CRYPTO_load_u32_le(in); + ll[1] = CRYPTO_load_u32_le(in + 4); + DES_encrypt1(ll, schedule, is_encrypt); + CRYPTO_store_u32_le(out, ll[0]); + CRYPTO_store_u32_le(out + 4, ll[1]); +} + +void DES_ncbc_encrypt(const uint8_t *in, uint8_t *out, size_t len, + const DES_key_schedule *schedule, DES_cblock *ivec, + int enc) { + DES_ncbc_encrypt_ex(in, out, len, schedule, ivec->bytes, enc); +} + +void bssl::DES_ncbc_encrypt_ex(const uint8_t *in, uint8_t *out, size_t len, + const DES_key_schedule *schedule, + uint8_t ivec[8], int enc) { + uint32_t tin0, tin1; + uint32_t tout0, tout1, xor0, xor1; + uint32_t tin[2]; + unsigned char *iv; + assert(len % 8 == 0); + + iv = ivec; + + if (enc) { + c2l(iv, tout0); + c2l(iv, tout1); + for (; len >= 8; len -= 8) { + c2l(in, tin0); + c2l(in, tin1); + tin0 ^= tout0; + tin[0] = tin0; + tin1 ^= tout1; + tin[1] = tin1; + DES_encrypt1(tin, schedule, DES_ENCRYPT); + tout0 = tin[0]; + l2c(tout0, out); + tout1 = tin[1]; + l2c(tout1, out); + } + if (len != 0) { + c2ln(in, tin0, tin1, len); + tin0 ^= tout0; + tin[0] = tin0; + tin1 ^= tout1; + tin[1] = tin1; + DES_encrypt1(tin, schedule, DES_ENCRYPT); + tout0 = tin[0]; + l2c(tout0, out); + tout1 = tin[1]; + l2c(tout1, out); + } + iv = ivec; + l2c(tout0, iv); + l2c(tout1, iv); + } else { + c2l(iv, xor0); + c2l(iv, xor1); + for (; len >= 8; len -= 8) { + c2l(in, tin0); + tin[0] = tin0; + c2l(in, tin1); + tin[1] = tin1; + DES_encrypt1(tin, schedule, DES_DECRYPT); + tout0 = tin[0] ^ xor0; + tout1 = tin[1] ^ xor1; + l2c(tout0, out); + l2c(tout1, out); + xor0 = tin0; + xor1 = tin1; + } + if 
(len != 0) { + c2l(in, tin0); + tin[0] = tin0; + c2l(in, tin1); + tin[1] = tin1; + DES_encrypt1(tin, schedule, DES_DECRYPT); + tout0 = tin[0] ^ xor0; + tout1 = tin[1] ^ xor1; + l2cn(tout0, tout1, out, len); + xor0 = tin0; + xor1 = tin1; + } + iv = ivec; + l2c(xor0, iv); + l2c(xor1, iv); + } + tin[0] = tin[1] = 0; +} + +void DES_ecb3_encrypt(const DES_cblock *input, DES_cblock *output, + const DES_key_schedule *ks1, const DES_key_schedule *ks2, + const DES_key_schedule *ks3, int enc) { + DES_ecb3_encrypt_ex(input->bytes, output->bytes, ks1, ks2, ks3, enc); +} + +void bssl::DES_ecb3_encrypt_ex(const uint8_t in[8], uint8_t out[8], + const DES_key_schedule *ks1, + const DES_key_schedule *ks2, + const DES_key_schedule *ks3, int enc) { + uint32_t ll[2]; + ll[0] = CRYPTO_load_u32_le(in); + ll[1] = CRYPTO_load_u32_le(in + 4); + if (enc) { + DES_encrypt3(ll, ks1, ks2, ks3); + } else { + DES_decrypt3(ll, ks1, ks2, ks3); + } + CRYPTO_store_u32_le(out, ll[0]); + CRYPTO_store_u32_le(out + 4, ll[1]); +} + +void DES_ede3_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t len, + const DES_key_schedule *ks1, + const DES_key_schedule *ks2, + const DES_key_schedule *ks3, DES_cblock *ivec, + int enc) { + DES_ede3_cbc_encrypt_ex(in, out, len, ks1, ks2, ks3, ivec->bytes, enc); +} + +void bssl::DES_ede3_cbc_encrypt_ex(const uint8_t *in, uint8_t *out, size_t len, + const DES_key_schedule *ks1, + const DES_key_schedule *ks2, + const DES_key_schedule *ks3, uint8_t ivec[8], + int enc) { + uint32_t tin0, tin1; + uint32_t tout0, tout1, xor0, xor1; + uint32_t tin[2]; + uint8_t *iv; + assert(len % 8 == 0); + + iv = ivec; + + if (enc) { + c2l(iv, tout0); + c2l(iv, tout1); + for (; len >= 8; len -= 8) { + c2l(in, tin0); + c2l(in, tin1); + tin0 ^= tout0; + tin1 ^= tout1; + + tin[0] = tin0; + tin[1] = tin1; + DES_encrypt3(tin, ks1, ks2, ks3); + tout0 = tin[0]; + tout1 = tin[1]; + + l2c(tout0, out); + l2c(tout1, out); + } + if (len != 0) { + c2ln(in, tin0, tin1, len); + tin0 ^= tout0; + tin1 ^= 
tout1; + + tin[0] = tin0; + tin[1] = tin1; + DES_encrypt3(tin, ks1, ks2, ks3); + tout0 = tin[0]; + tout1 = tin[1]; + + l2c(tout0, out); + l2c(tout1, out); + } + iv = ivec; + l2c(tout0, iv); + l2c(tout1, iv); + } else { + uint32_t t0, t1; + + c2l(iv, xor0); + c2l(iv, xor1); + for (; len >= 8; len -= 8) { + c2l(in, tin0); + c2l(in, tin1); + + t0 = tin0; + t1 = tin1; + + tin[0] = tin0; + tin[1] = tin1; + DES_decrypt3(tin, ks1, ks2, ks3); + tout0 = tin[0]; + tout1 = tin[1]; + + tout0 ^= xor0; + tout1 ^= xor1; + l2c(tout0, out); + l2c(tout1, out); + xor0 = t0; + xor1 = t1; + } + if (len != 0) { + c2l(in, tin0); + c2l(in, tin1); + + t0 = tin0; + t1 = tin1; + + tin[0] = tin0; + tin[1] = tin1; + DES_decrypt3(tin, ks1, ks2, ks3); + tout0 = tin[0]; + tout1 = tin[1]; + + tout0 ^= xor0; + tout1 ^= xor1; + l2cn(tout0, tout1, out, len); + xor0 = t0; + xor1 = t1; + } + + iv = ivec; + l2c(xor0, iv); + l2c(xor1, iv); + } + + tin[0] = tin[1] = 0; +} + +void DES_ede2_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t len, + const DES_key_schedule *ks1, + const DES_key_schedule *ks2, + DES_cblock *ivec, + int enc) { + DES_ede3_cbc_encrypt(in, out, len, ks1, ks2, ks1, ivec, enc); +} + + +// Deprecated functions. + +void DES_set_key_unchecked(const DES_cblock *key, DES_key_schedule *schedule) { + DES_set_key(key, schedule); +} diff --git a/third_party/boringssl/src/crypto/des/internal.h b/third_party/boringssl/src/crypto/des/internal.h index 2124fd58..6bb46ff5 100644 --- a/third_party/boringssl/src/crypto/des/internal.h +++ b/third_party/boringssl/src/crypto/des/internal.h @@ -1,70 +1,30 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. 
The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. 
If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#ifndef OPENSSL_HEADER_DES_INTERNAL_H -#define OPENSSL_HEADER_DES_INTERNAL_H +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef OPENSSL_HEADER_CRYPTO_DES_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_DES_INTERNAL_H #include +#include #include "../internal.h" -#if defined(__cplusplus) -extern "C" { -#endif +BSSL_NAMESPACE_BEGIN + +// TODO(davidben): Ideally these macros would be replaced with +// |CRYPTO_load_u32_le| and |CRYPTO_store_u32_le|. #define c2l(c, l) \ do { \ @@ -90,25 +50,25 @@ extern "C" { switch (n) { \ case 8: \ (l2) = ((uint32_t)(*(--(c)))) << 24L; \ - OPENSSL_FALLTHROUGH; \ + [[fallthrough]]; \ case 7: \ (l2) |= ((uint32_t)(*(--(c)))) << 16L; \ - OPENSSL_FALLTHROUGH; \ + [[fallthrough]]; \ case 6: \ (l2) |= ((uint32_t)(*(--(c)))) << 8L; \ - OPENSSL_FALLTHROUGH; \ + [[fallthrough]]; \ case 5: \ (l2) |= ((uint32_t)(*(--(c)))); \ - OPENSSL_FALLTHROUGH; \ + [[fallthrough]]; \ case 4: \ (l1) = ((uint32_t)(*(--(c)))) << 24L; \ - OPENSSL_FALLTHROUGH; \ + [[fallthrough]]; \ case 3: \ (l1) |= ((uint32_t)(*(--(c)))) << 16L; \ - OPENSSL_FALLTHROUGH; \ + [[fallthrough]]; \ case 2: \ (l1) |= ((uint32_t)(*(--(c)))) << 8L; \ - OPENSSL_FALLTHROUGH; \ + [[fallthrough]]; \ case 1: \ (l1) |= ((uint32_t)(*(--(c)))); \ } \ @@ -121,118 +81,64 @@ extern "C" { switch (n) { \ case 8: \ *(--(c)) = (unsigned char)(((l2) >> 24L) & 0xff); \ - OPENSSL_FALLTHROUGH; \ + [[fallthrough]]; \ case 7: \ *(--(c)) = (unsigned char)(((l2) >> 16L) & 0xff); \ - OPENSSL_FALLTHROUGH; \ + [[fallthrough]]; \ case 6: \ *(--(c)) = (unsigned char)(((l2) >> 8L) & 0xff); \ - OPENSSL_FALLTHROUGH; \ + [[fallthrough]]; \ case 5: \ *(--(c)) = (unsigned char)(((l2)) & 0xff); \ - OPENSSL_FALLTHROUGH; \ + [[fallthrough]]; \ case 4: \ *(--(c)) = (unsigned char)(((l1) >> 24L) & 0xff); \ - OPENSSL_FALLTHROUGH; \ + [[fallthrough]]; \ case 3: \ *(--(c)) = (unsigned char)(((l1) >> 16L) & 0xff); \ - OPENSSL_FALLTHROUGH; \ + [[fallthrough]]; \ case 2: \ *(--(c)) = (unsigned char)(((l1) >> 8L) & 0xff); \ - OPENSSL_FALLTHROUGH; \ + [[fallthrough]]; \ case 1: \ *(--(c)) = (unsigned char)(((l1)) & 0xff); \ } \ } while (0) -/* IP and FP 
- * The problem is more of a geometric problem that random bit fiddling. - 0 1 2 3 4 5 6 7 62 54 46 38 30 22 14 6 - 8 9 10 11 12 13 14 15 60 52 44 36 28 20 12 4 -16 17 18 19 20 21 22 23 58 50 42 34 26 18 10 2 -24 25 26 27 28 29 30 31 to 56 48 40 32 24 16 8 0 - -32 33 34 35 36 37 38 39 63 55 47 39 31 23 15 7 -40 41 42 43 44 45 46 47 61 53 45 37 29 21 13 5 -48 49 50 51 52 53 54 55 59 51 43 35 27 19 11 3 -56 57 58 59 60 61 62 63 57 49 41 33 25 17 9 1 - -The output has been subject to swaps of the form -0 1 -> 3 1 but the odd and even bits have been put into -2 3 2 0 -different words. The main trick is to remember that -t=((l>>size)^r)&(mask); -r^=t; -l^=(t<> (n)) ^ (b)) & (m)); \ - (b) ^= (t); \ - (a) ^= ((t) << (n)); \ - } while (0) -#define IP(l, r) \ - do { \ - uint32_t tt; \ - PERM_OP(r, l, tt, 4, 0x0f0f0f0fL); \ - PERM_OP(l, r, tt, 16, 0x0000ffffL); \ - PERM_OP(r, l, tt, 2, 0x33333333L); \ - PERM_OP(l, r, tt, 8, 0x00ff00ffL); \ - PERM_OP(r, l, tt, 1, 0x55555555L); \ - } while (0) +// Correctly-typed versions of DES functions. +// +// See https://crbug.com/boringssl/683. 
-#define FP(l, r) \ - do { \ - uint32_t tt; \ - PERM_OP(l, r, tt, 1, 0x55555555L); \ - PERM_OP(r, l, tt, 8, 0x00ff00ffL); \ - PERM_OP(l, r, tt, 2, 0x33333333L); \ - PERM_OP(r, l, tt, 16, 0x0000ffffL); \ - PERM_OP(l, r, tt, 4, 0x0f0f0f0fL); \ - } while (0) +void DES_set_key_ex(const uint8_t key[8], DES_key_schedule *schedule); +void DES_ecb_encrypt_ex(const uint8_t in[8], uint8_t out[8], + const DES_key_schedule *schedule, int is_encrypt); +void DES_ncbc_encrypt_ex(const uint8_t *in, uint8_t *out, size_t len, + const DES_key_schedule *schedule, uint8_t ivec[8], + int enc); +void DES_ecb3_encrypt_ex(const uint8_t input[8], uint8_t output[8], + const DES_key_schedule *ks1, + const DES_key_schedule *ks2, + const DES_key_schedule *ks3, int enc); +void DES_ede3_cbc_encrypt_ex(const uint8_t *in, uint8_t *out, size_t len, + const DES_key_schedule *ks1, + const DES_key_schedule *ks2, + const DES_key_schedule *ks3, uint8_t ivec[8], + int enc); -#define LOAD_DATA(ks, R, S, u, t, E0, E1) \ - do { \ - (u) = (R) ^ (ks)->subkeys[S][0]; \ - (t) = (R) ^ (ks)->subkeys[S][1]; \ - } while (0) -#define D_ENCRYPT(ks, LL, R, S) \ - do { \ - LOAD_DATA(ks, R, S, u, t, E0, E1); \ - t = CRYPTO_rotr_u32(t, 4); \ - (LL) ^= \ - DES_SPtrans[0][(u >> 2L) & 0x3f] ^ DES_SPtrans[2][(u >> 10L) & 0x3f] ^ \ - DES_SPtrans[4][(u >> 18L) & 0x3f] ^ \ - DES_SPtrans[6][(u >> 26L) & 0x3f] ^ DES_SPtrans[1][(t >> 2L) & 0x3f] ^ \ - DES_SPtrans[3][(t >> 10L) & 0x3f] ^ \ - DES_SPtrans[5][(t >> 18L) & 0x3f] ^ DES_SPtrans[7][(t >> 26L) & 0x3f]; \ - } while (0) +// Private functions. +// +// These functions are only exported for use in |decrepit|. 
-#define ITERATIONS 16 -#define HALF_ITERATIONS 8 +OPENSSL_EXPORT void DES_decrypt3(uint32_t data[2], const DES_key_schedule *ks1, + const DES_key_schedule *ks2, + const DES_key_schedule *ks3); +OPENSSL_EXPORT void DES_encrypt3(uint32_t data[2], const DES_key_schedule *ks1, + const DES_key_schedule *ks2, + const DES_key_schedule *ks3); -#if defined(__cplusplus) -} // extern C -#endif +BSSL_NAMESPACE_END -#endif // OPENSSL_HEADER_DES_INTERNAL_H +#endif // OPENSSL_HEADER_CRYPTO_DES_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/dh/dh_asn1.cc b/third_party/boringssl/src/crypto/dh/dh_asn1.cc new file mode 100644 index 00000000..01241db9 --- /dev/null +++ b/third_party/boringssl/src/crypto/dh/dh_asn1.cc @@ -0,0 +1,109 @@ +// Copyright 2000-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + +#include +#include +#include + +#include "../bytestring/internal.h" +#include "../fipsmodule/dh/internal.h" + + +using namespace bssl; + +static int parse_integer(CBS *cbs, UniquePtr *out) { + assert(*out == nullptr); + out->reset(BN_new()); + if (*out == nullptr) { + return 0; + } + return BN_parse_asn1_unsigned(cbs, out->get()); +} + +static int marshal_integer(CBB *cbb, BIGNUM *bn) { + if (bn == nullptr) { + // A DH object may be missing some components. 
+ OPENSSL_PUT_ERROR(DH, ERR_R_PASSED_NULL_PARAMETER); + return 0; + } + return BN_marshal_asn1(cbb, bn); +} + +DH *DH_parse_parameters(CBS *cbs) { + UniquePtr ret(DH_new()); + if (ret == nullptr) { + return nullptr; + } + + CBS child; + auto *impl = FromOpaque(ret.get()); + if (!CBS_get_asn1(cbs, &child, CBS_ASN1_SEQUENCE) || + !parse_integer(&child, &impl->p) || // + !parse_integer(&child, &impl->g)) { + OPENSSL_PUT_ERROR(DH, DH_R_DECODE_ERROR); + return nullptr; + } + + uint64_t priv_length; + if (CBS_len(&child) != 0) { + if (!CBS_get_asn1_uint64(&child, &priv_length) || + priv_length > UINT_MAX) { + OPENSSL_PUT_ERROR(DH, DH_R_DECODE_ERROR); + return nullptr; + } + impl->priv_length = (unsigned)priv_length; + } + + if (CBS_len(&child) != 0) { + OPENSSL_PUT_ERROR(DH, DH_R_DECODE_ERROR); + return nullptr; + } + + if (!dh_check_params_fast(ret.get())) { + OPENSSL_PUT_ERROR(DH, DH_R_DECODE_ERROR); + return nullptr; + } + + return ret.release(); +} + +int DH_marshal_parameters(CBB *cbb, const DH *dh) { + CBB child; + auto *impl = FromOpaque(dh); + if (!CBB_add_asn1(cbb, &child, CBS_ASN1_SEQUENCE) || + !marshal_integer(&child, impl->p.get()) || + !marshal_integer(&child, impl->g.get()) || + (impl->priv_length != 0 && + !CBB_add_asn1_uint64(&child, impl->priv_length)) || + !CBB_flush(cbb)) { + OPENSSL_PUT_ERROR(DH, DH_R_ENCODE_ERROR); + return 0; + } + return 1; +} + +DH *d2i_DHparams(DH **out, const uint8_t **inp, long len) { + return D2IFromCBS(out, inp, len, DH_parse_parameters); +} + +int i2d_DHparams(const DH *in, uint8_t **outp) { + return I2DFromCBB( + /*initial_capacity=*/256, outp, + [&](CBB *cbb) -> bool { return DH_marshal_parameters(cbb, in); }); +} diff --git a/third_party/boringssl/src/crypto/dh/params.cc b/third_party/boringssl/src/crypto/dh/params.cc new file mode 100644 index 00000000..406a069d --- /dev/null +++ b/third_party/boringssl/src/crypto/dh/params.cc @@ -0,0 +1,411 @@ +// Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include +#include + +#include "../fipsmodule/bn/internal.h" +#include "../fipsmodule/dh/internal.h" + + +using namespace bssl; + +static BIGNUM *get_params(BIGNUM *ret, Span words) { + BIGNUM *alloc = nullptr; + if (ret == nullptr) { + alloc = BN_new(); + if (alloc == nullptr) { + return nullptr; + } + ret = alloc; + } + + if (!bn_set_words(ret, words.data(), words.size())) { + BN_free(alloc); + return nullptr; + } + + return ret; +} + +BIGNUM *BN_get_rfc3526_prime_1536(BIGNUM *ret) { + static const BN_ULONG kWords[] = { + TOBN(0xffffffff, 0xffffffff), TOBN(0xf1746c08, 0xca237327), + TOBN(0x670c354e, 0x4abc9804), TOBN(0x9ed52907, 0x7096966d), + TOBN(0x1c62f356, 0x208552bb), TOBN(0x83655d23, 0xdca3ad96), + TOBN(0x69163fa8, 0xfd24cf5f), TOBN(0x98da4836, 0x1c55d39a), + TOBN(0xc2007cb8, 0xa163bf05), TOBN(0x49286651, 0xece45b3d), + TOBN(0xae9f2411, 0x7c4b1fe6), TOBN(0xee386bfb, 0x5a899fa5), + TOBN(0x0bff5cb6, 0xf406b7ed), TOBN(0xf44c42e9, 0xa637ed6b), + TOBN(0xe485b576, 0x625e7ec6), TOBN(0x4fe1356d, 0x6d51c245), + TOBN(0x302b0a6d, 0xf25f1437), TOBN(0xef9519b3, 0xcd3a431b), + TOBN(0x514a0879, 0x8e3404dd), TOBN(0x020bbea6, 0x3b139b22), + TOBN(0x29024e08, 0x8a67cc74), TOBN(0xc4c6628b, 0x80dc1cd1), + TOBN(0xc90fdaa2, 0x2168c234), TOBN(0xffffffff, 0xffffffff), + }; + return get_params(ret, kWords); +} + +BIGNUM *BN_get_rfc3526_prime_2048(BIGNUM *ret) { + static const BN_ULONG 
kWords[] = { + TOBN(0xffffffff, 0xffffffff), TOBN(0x15728e5a, 0x8aacaa68), + TOBN(0x15d22618, 0x98fa0510), TOBN(0x3995497c, 0xea956ae5), + TOBN(0xde2bcbf6, 0x95581718), TOBN(0xb5c55df0, 0x6f4c52c9), + TOBN(0x9b2783a2, 0xec07a28f), TOBN(0xe39e772c, 0x180e8603), + TOBN(0x32905e46, 0x2e36ce3b), TOBN(0xf1746c08, 0xca18217c), + TOBN(0x670c354e, 0x4abc9804), TOBN(0x9ed52907, 0x7096966d), + TOBN(0x1c62f356, 0x208552bb), TOBN(0x83655d23, 0xdca3ad96), + TOBN(0x69163fa8, 0xfd24cf5f), TOBN(0x98da4836, 0x1c55d39a), + TOBN(0xc2007cb8, 0xa163bf05), TOBN(0x49286651, 0xece45b3d), + TOBN(0xae9f2411, 0x7c4b1fe6), TOBN(0xee386bfb, 0x5a899fa5), + TOBN(0x0bff5cb6, 0xf406b7ed), TOBN(0xf44c42e9, 0xa637ed6b), + TOBN(0xe485b576, 0x625e7ec6), TOBN(0x4fe1356d, 0x6d51c245), + TOBN(0x302b0a6d, 0xf25f1437), TOBN(0xef9519b3, 0xcd3a431b), + TOBN(0x514a0879, 0x8e3404dd), TOBN(0x020bbea6, 0x3b139b22), + TOBN(0x29024e08, 0x8a67cc74), TOBN(0xc4c6628b, 0x80dc1cd1), + TOBN(0xc90fdaa2, 0x2168c234), TOBN(0xffffffff, 0xffffffff), + }; + return get_params(ret, kWords); +} + +BIGNUM *BN_get_rfc3526_prime_3072(BIGNUM *ret) { + static const BN_ULONG kWords[] = { + TOBN(0xffffffff, 0xffffffff), TOBN(0x4b82d120, 0xa93ad2ca), + TOBN(0x43db5bfc, 0xe0fd108e), TOBN(0x08e24fa0, 0x74e5ab31), + TOBN(0x770988c0, 0xbad946e2), TOBN(0xbbe11757, 0x7a615d6c), + TOBN(0x521f2b18, 0x177b200c), TOBN(0xd8760273, 0x3ec86a64), + TOBN(0xf12ffa06, 0xd98a0864), TOBN(0xcee3d226, 0x1ad2ee6b), + TOBN(0x1e8c94e0, 0x4a25619d), TOBN(0xabf5ae8c, 0xdb0933d7), + TOBN(0xb3970f85, 0xa6e1e4c7), TOBN(0x8aea7157, 0x5d060c7d), + TOBN(0xecfb8504, 0x58dbef0a), TOBN(0xa85521ab, 0xdf1cba64), + TOBN(0xad33170d, 0x04507a33), TOBN(0x15728e5a, 0x8aaac42d), + TOBN(0x15d22618, 0x98fa0510), TOBN(0x3995497c, 0xea956ae5), + TOBN(0xde2bcbf6, 0x95581718), TOBN(0xb5c55df0, 0x6f4c52c9), + TOBN(0x9b2783a2, 0xec07a28f), TOBN(0xe39e772c, 0x180e8603), + TOBN(0x32905e46, 0x2e36ce3b), TOBN(0xf1746c08, 0xca18217c), + TOBN(0x670c354e, 0x4abc9804), TOBN(0x9ed52907, 
0x7096966d), + TOBN(0x1c62f356, 0x208552bb), TOBN(0x83655d23, 0xdca3ad96), + TOBN(0x69163fa8, 0xfd24cf5f), TOBN(0x98da4836, 0x1c55d39a), + TOBN(0xc2007cb8, 0xa163bf05), TOBN(0x49286651, 0xece45b3d), + TOBN(0xae9f2411, 0x7c4b1fe6), TOBN(0xee386bfb, 0x5a899fa5), + TOBN(0x0bff5cb6, 0xf406b7ed), TOBN(0xf44c42e9, 0xa637ed6b), + TOBN(0xe485b576, 0x625e7ec6), TOBN(0x4fe1356d, 0x6d51c245), + TOBN(0x302b0a6d, 0xf25f1437), TOBN(0xef9519b3, 0xcd3a431b), + TOBN(0x514a0879, 0x8e3404dd), TOBN(0x020bbea6, 0x3b139b22), + TOBN(0x29024e08, 0x8a67cc74), TOBN(0xc4c6628b, 0x80dc1cd1), + TOBN(0xc90fdaa2, 0x2168c234), TOBN(0xffffffff, 0xffffffff), + }; + return get_params(ret, kWords); +} + +BIGNUM *BN_get_rfc3526_prime_4096(BIGNUM *ret) { + static const BN_ULONG kWords[] = { + TOBN(0xffffffff, 0xffffffff), TOBN(0x4df435c9, 0x34063199), + TOBN(0x86ffb7dc, 0x90a6c08f), TOBN(0x93b4ea98, 0x8d8fddc1), + TOBN(0xd0069127, 0xd5b05aa9), TOBN(0xb81bdd76, 0x2170481c), + TOBN(0x1f612970, 0xcee2d7af), TOBN(0x233ba186, 0x515be7ed), + TOBN(0x99b2964f, 0xa090c3a2), TOBN(0x287c5947, 0x4e6bc05d), + TOBN(0x2e8efc14, 0x1fbecaa6), TOBN(0xdbbbc2db, 0x04de8ef9), + TOBN(0x2583e9ca, 0x2ad44ce8), TOBN(0x1a946834, 0xb6150bda), + TOBN(0x99c32718, 0x6af4e23c), TOBN(0x88719a10, 0xbdba5b26), + TOBN(0x1a723c12, 0xa787e6d7), TOBN(0x4b82d120, 0xa9210801), + TOBN(0x43db5bfc, 0xe0fd108e), TOBN(0x08e24fa0, 0x74e5ab31), + TOBN(0x770988c0, 0xbad946e2), TOBN(0xbbe11757, 0x7a615d6c), + TOBN(0x521f2b18, 0x177b200c), TOBN(0xd8760273, 0x3ec86a64), + TOBN(0xf12ffa06, 0xd98a0864), TOBN(0xcee3d226, 0x1ad2ee6b), + TOBN(0x1e8c94e0, 0x4a25619d), TOBN(0xabf5ae8c, 0xdb0933d7), + TOBN(0xb3970f85, 0xa6e1e4c7), TOBN(0x8aea7157, 0x5d060c7d), + TOBN(0xecfb8504, 0x58dbef0a), TOBN(0xa85521ab, 0xdf1cba64), + TOBN(0xad33170d, 0x04507a33), TOBN(0x15728e5a, 0x8aaac42d), + TOBN(0x15d22618, 0x98fa0510), TOBN(0x3995497c, 0xea956ae5), + TOBN(0xde2bcbf6, 0x95581718), TOBN(0xb5c55df0, 0x6f4c52c9), + TOBN(0x9b2783a2, 0xec07a28f), TOBN(0xe39e772c, 
0x180e8603), + TOBN(0x32905e46, 0x2e36ce3b), TOBN(0xf1746c08, 0xca18217c), + TOBN(0x670c354e, 0x4abc9804), TOBN(0x9ed52907, 0x7096966d), + TOBN(0x1c62f356, 0x208552bb), TOBN(0x83655d23, 0xdca3ad96), + TOBN(0x69163fa8, 0xfd24cf5f), TOBN(0x98da4836, 0x1c55d39a), + TOBN(0xc2007cb8, 0xa163bf05), TOBN(0x49286651, 0xece45b3d), + TOBN(0xae9f2411, 0x7c4b1fe6), TOBN(0xee386bfb, 0x5a899fa5), + TOBN(0x0bff5cb6, 0xf406b7ed), TOBN(0xf44c42e9, 0xa637ed6b), + TOBN(0xe485b576, 0x625e7ec6), TOBN(0x4fe1356d, 0x6d51c245), + TOBN(0x302b0a6d, 0xf25f1437), TOBN(0xef9519b3, 0xcd3a431b), + TOBN(0x514a0879, 0x8e3404dd), TOBN(0x020bbea6, 0x3b139b22), + TOBN(0x29024e08, 0x8a67cc74), TOBN(0xc4c6628b, 0x80dc1cd1), + TOBN(0xc90fdaa2, 0x2168c234), TOBN(0xffffffff, 0xffffffff), + }; + return get_params(ret, kWords); +} + +BIGNUM *BN_get_rfc3526_prime_6144(BIGNUM *ret) { + static const BN_ULONG kWords[] = { + TOBN(0xffffffff, 0xffffffff), TOBN(0xe694f91e, 0x6dcc4024), + TOBN(0x12bf2d5b, 0x0b7474d6), TOBN(0x043e8f66, 0x3f4860ee), + TOBN(0x387fe8d7, 0x6e3c0468), TOBN(0xda56c9ec, 0x2ef29632), + TOBN(0xeb19ccb1, 0xa313d55c), TOBN(0xf550aa3d, 0x8a1fbff0), + TOBN(0x06a1d58b, 0xb7c5da76), TOBN(0xa79715ee, 0xf29be328), + TOBN(0x14cc5ed2, 0x0f8037e0), TOBN(0xcc8f6d7e, 0xbf48e1d8), + TOBN(0x4bd407b2, 0x2b4154aa), TOBN(0x0f1d45b7, 0xff585ac5), + TOBN(0x23a97a7e, 0x36cc88be), TOBN(0x59e7c97f, 0xbec7e8f3), + TOBN(0xb5a84031, 0x900b1c9e), TOBN(0xd55e702f, 0x46980c82), + TOBN(0xf482d7ce, 0x6e74fef6), TOBN(0xf032ea15, 0xd1721d03), + TOBN(0x5983ca01, 0xc64b92ec), TOBN(0x6fb8f401, 0x378cd2bf), + TOBN(0x33205151, 0x2bd7af42), TOBN(0xdb7f1447, 0xe6cc254b), + TOBN(0x44ce6cba, 0xced4bb1b), TOBN(0xda3edbeb, 0xcf9b14ed), + TOBN(0x179727b0, 0x865a8918), TOBN(0xb06a53ed, 0x9027d831), + TOBN(0xe5db382f, 0x413001ae), TOBN(0xf8ff9406, 0xad9e530e), + TOBN(0xc9751e76, 0x3dba37bd), TOBN(0xc1d4dcb2, 0x602646de), + TOBN(0x36c3fab4, 0xd27c7026), TOBN(0x4df435c9, 0x34028492), + TOBN(0x86ffb7dc, 0x90a6c08f), TOBN(0x93b4ea98, 
0x8d8fddc1), + TOBN(0xd0069127, 0xd5b05aa9), TOBN(0xb81bdd76, 0x2170481c), + TOBN(0x1f612970, 0xcee2d7af), TOBN(0x233ba186, 0x515be7ed), + TOBN(0x99b2964f, 0xa090c3a2), TOBN(0x287c5947, 0x4e6bc05d), + TOBN(0x2e8efc14, 0x1fbecaa6), TOBN(0xdbbbc2db, 0x04de8ef9), + TOBN(0x2583e9ca, 0x2ad44ce8), TOBN(0x1a946834, 0xb6150bda), + TOBN(0x99c32718, 0x6af4e23c), TOBN(0x88719a10, 0xbdba5b26), + TOBN(0x1a723c12, 0xa787e6d7), TOBN(0x4b82d120, 0xa9210801), + TOBN(0x43db5bfc, 0xe0fd108e), TOBN(0x08e24fa0, 0x74e5ab31), + TOBN(0x770988c0, 0xbad946e2), TOBN(0xbbe11757, 0x7a615d6c), + TOBN(0x521f2b18, 0x177b200c), TOBN(0xd8760273, 0x3ec86a64), + TOBN(0xf12ffa06, 0xd98a0864), TOBN(0xcee3d226, 0x1ad2ee6b), + TOBN(0x1e8c94e0, 0x4a25619d), TOBN(0xabf5ae8c, 0xdb0933d7), + TOBN(0xb3970f85, 0xa6e1e4c7), TOBN(0x8aea7157, 0x5d060c7d), + TOBN(0xecfb8504, 0x58dbef0a), TOBN(0xa85521ab, 0xdf1cba64), + TOBN(0xad33170d, 0x04507a33), TOBN(0x15728e5a, 0x8aaac42d), + TOBN(0x15d22618, 0x98fa0510), TOBN(0x3995497c, 0xea956ae5), + TOBN(0xde2bcbf6, 0x95581718), TOBN(0xb5c55df0, 0x6f4c52c9), + TOBN(0x9b2783a2, 0xec07a28f), TOBN(0xe39e772c, 0x180e8603), + TOBN(0x32905e46, 0x2e36ce3b), TOBN(0xf1746c08, 0xca18217c), + TOBN(0x670c354e, 0x4abc9804), TOBN(0x9ed52907, 0x7096966d), + TOBN(0x1c62f356, 0x208552bb), TOBN(0x83655d23, 0xdca3ad96), + TOBN(0x69163fa8, 0xfd24cf5f), TOBN(0x98da4836, 0x1c55d39a), + TOBN(0xc2007cb8, 0xa163bf05), TOBN(0x49286651, 0xece45b3d), + TOBN(0xae9f2411, 0x7c4b1fe6), TOBN(0xee386bfb, 0x5a899fa5), + TOBN(0x0bff5cb6, 0xf406b7ed), TOBN(0xf44c42e9, 0xa637ed6b), + TOBN(0xe485b576, 0x625e7ec6), TOBN(0x4fe1356d, 0x6d51c245), + TOBN(0x302b0a6d, 0xf25f1437), TOBN(0xef9519b3, 0xcd3a431b), + TOBN(0x514a0879, 0x8e3404dd), TOBN(0x020bbea6, 0x3b139b22), + TOBN(0x29024e08, 0x8a67cc74), TOBN(0xc4c6628b, 0x80dc1cd1), + TOBN(0xc90fdaa2, 0x2168c234), TOBN(0xffffffff, 0xffffffff), + }; + return get_params(ret, kWords); +} + +BIGNUM *BN_get_rfc3526_prime_8192(BIGNUM *ret) { + static const BN_ULONG kWords[] 
= { + TOBN(0xffffffff, 0xffffffff), TOBN(0x60c980dd, 0x98edd3df), + TOBN(0xc81f56e8, 0x80b96e71), TOBN(0x9e3050e2, 0x765694df), + TOBN(0x9558e447, 0x5677e9aa), TOBN(0xc9190da6, 0xfc026e47), + TOBN(0x889a002e, 0xd5ee382b), TOBN(0x4009438b, 0x481c6cd7), + TOBN(0x359046f4, 0xeb879f92), TOBN(0xfaf36bc3, 0x1ecfa268), + TOBN(0xb1d510bd, 0x7ee74d73), TOBN(0xf9ab4819, 0x5ded7ea1), + TOBN(0x64f31cc5, 0x0846851d), TOBN(0x4597e899, 0xa0255dc1), + TOBN(0xdf310ee0, 0x74ab6a36), TOBN(0x6d2a13f8, 0x3f44f82d), + TOBN(0x062b3cf5, 0xb3a278a6), TOBN(0x79683303, 0xed5bdd3a), + TOBN(0xfa9d4b7f, 0xa2c087e8), TOBN(0x4bcbc886, 0x2f8385dd), + TOBN(0x3473fc64, 0x6cea306b), TOBN(0x13eb57a8, 0x1a23f0c7), + TOBN(0x22222e04, 0xa4037c07), TOBN(0xe3fdb8be, 0xfc848ad9), + TOBN(0x238f16cb, 0xe39d652d), TOBN(0x3423b474, 0x2bf1c978), + TOBN(0x3aab639c, 0x5ae4f568), TOBN(0x2576f693, 0x6ba42466), + TOBN(0x741fa7bf, 0x8afc47ed), TOBN(0x3bc832b6, 0x8d9dd300), + TOBN(0xd8bec4d0, 0x73b931ba), TOBN(0x38777cb6, 0xa932df8c), + TOBN(0x74a3926f, 0x12fee5e4), TOBN(0xe694f91e, 0x6dbe1159), + TOBN(0x12bf2d5b, 0x0b7474d6), TOBN(0x043e8f66, 0x3f4860ee), + TOBN(0x387fe8d7, 0x6e3c0468), TOBN(0xda56c9ec, 0x2ef29632), + TOBN(0xeb19ccb1, 0xa313d55c), TOBN(0xf550aa3d, 0x8a1fbff0), + TOBN(0x06a1d58b, 0xb7c5da76), TOBN(0xa79715ee, 0xf29be328), + TOBN(0x14cc5ed2, 0x0f8037e0), TOBN(0xcc8f6d7e, 0xbf48e1d8), + TOBN(0x4bd407b2, 0x2b4154aa), TOBN(0x0f1d45b7, 0xff585ac5), + TOBN(0x23a97a7e, 0x36cc88be), TOBN(0x59e7c97f, 0xbec7e8f3), + TOBN(0xb5a84031, 0x900b1c9e), TOBN(0xd55e702f, 0x46980c82), + TOBN(0xf482d7ce, 0x6e74fef6), TOBN(0xf032ea15, 0xd1721d03), + TOBN(0x5983ca01, 0xc64b92ec), TOBN(0x6fb8f401, 0x378cd2bf), + TOBN(0x33205151, 0x2bd7af42), TOBN(0xdb7f1447, 0xe6cc254b), + TOBN(0x44ce6cba, 0xced4bb1b), TOBN(0xda3edbeb, 0xcf9b14ed), + TOBN(0x179727b0, 0x865a8918), TOBN(0xb06a53ed, 0x9027d831), + TOBN(0xe5db382f, 0x413001ae), TOBN(0xf8ff9406, 0xad9e530e), + TOBN(0xc9751e76, 0x3dba37bd), TOBN(0xc1d4dcb2, 0x602646de), + 
TOBN(0x36c3fab4, 0xd27c7026), TOBN(0x4df435c9, 0x34028492), + TOBN(0x86ffb7dc, 0x90a6c08f), TOBN(0x93b4ea98, 0x8d8fddc1), + TOBN(0xd0069127, 0xd5b05aa9), TOBN(0xb81bdd76, 0x2170481c), + TOBN(0x1f612970, 0xcee2d7af), TOBN(0x233ba186, 0x515be7ed), + TOBN(0x99b2964f, 0xa090c3a2), TOBN(0x287c5947, 0x4e6bc05d), + TOBN(0x2e8efc14, 0x1fbecaa6), TOBN(0xdbbbc2db, 0x04de8ef9), + TOBN(0x2583e9ca, 0x2ad44ce8), TOBN(0x1a946834, 0xb6150bda), + TOBN(0x99c32718, 0x6af4e23c), TOBN(0x88719a10, 0xbdba5b26), + TOBN(0x1a723c12, 0xa787e6d7), TOBN(0x4b82d120, 0xa9210801), + TOBN(0x43db5bfc, 0xe0fd108e), TOBN(0x08e24fa0, 0x74e5ab31), + TOBN(0x770988c0, 0xbad946e2), TOBN(0xbbe11757, 0x7a615d6c), + TOBN(0x521f2b18, 0x177b200c), TOBN(0xd8760273, 0x3ec86a64), + TOBN(0xf12ffa06, 0xd98a0864), TOBN(0xcee3d226, 0x1ad2ee6b), + TOBN(0x1e8c94e0, 0x4a25619d), TOBN(0xabf5ae8c, 0xdb0933d7), + TOBN(0xb3970f85, 0xa6e1e4c7), TOBN(0x8aea7157, 0x5d060c7d), + TOBN(0xecfb8504, 0x58dbef0a), TOBN(0xa85521ab, 0xdf1cba64), + TOBN(0xad33170d, 0x04507a33), TOBN(0x15728e5a, 0x8aaac42d), + TOBN(0x15d22618, 0x98fa0510), TOBN(0x3995497c, 0xea956ae5), + TOBN(0xde2bcbf6, 0x95581718), TOBN(0xb5c55df0, 0x6f4c52c9), + TOBN(0x9b2783a2, 0xec07a28f), TOBN(0xe39e772c, 0x180e8603), + TOBN(0x32905e46, 0x2e36ce3b), TOBN(0xf1746c08, 0xca18217c), + TOBN(0x670c354e, 0x4abc9804), TOBN(0x9ed52907, 0x7096966d), + TOBN(0x1c62f356, 0x208552bb), TOBN(0x83655d23, 0xdca3ad96), + TOBN(0x69163fa8, 0xfd24cf5f), TOBN(0x98da4836, 0x1c55d39a), + TOBN(0xc2007cb8, 0xa163bf05), TOBN(0x49286651, 0xece45b3d), + TOBN(0xae9f2411, 0x7c4b1fe6), TOBN(0xee386bfb, 0x5a899fa5), + TOBN(0x0bff5cb6, 0xf406b7ed), TOBN(0xf44c42e9, 0xa637ed6b), + TOBN(0xe485b576, 0x625e7ec6), TOBN(0x4fe1356d, 0x6d51c245), + TOBN(0x302b0a6d, 0xf25f1437), TOBN(0xef9519b3, 0xcd3a431b), + TOBN(0x514a0879, 0x8e3404dd), TOBN(0x020bbea6, 0x3b139b22), + TOBN(0x29024e08, 0x8a67cc74), TOBN(0xc4c6628b, 0x80dc1cd1), + TOBN(0xc90fdaa2, 0x2168c234), TOBN(0xffffffff, 0xffffffff), + }; + return 
get_params(ret, kWords); +} + +int DH_generate_parameters_ex(DH *dh, int prime_bits, int generator, + BN_GENCB *cb) { + // We generate DH parameters as follows + // find a prime q which is prime_bits/2 bits long. + // p=(2*q)+1 or (p-1)/2 = q + // For this case, g is a generator if + // g^((p-1)/q) mod p != 1 for values of q which are the factors of p-1. + // Since the factors of p-1 are q and 2, we just need to check + // g^2 mod p != 1 and g^q mod p != 1. + // + // Having said all that, + // there is another special case method for the generators 2, 3 and 5. + // for 2, p mod 24 == 11 + // for 3, p mod 12 == 5 <<<<< does not work for safe primes. + // for 5, p mod 10 == 3 or 7 + // + // Thanks to Phil Karn for the pointers about the + // special generators and for answering some of my questions. + // + // I've implemented the second simple method :-). + // Since DH should be using a safe prime (both p and q are prime), + // this generator function can take a very very long time to run. + + // Actually there is no reason to insist that 'generator' be a generator. + // It's just as OK (and in some sense better) to use a generator of the + // order-q subgroup. 
+ + if (prime_bits <= 0 || prime_bits > OPENSSL_DH_MAX_MODULUS_BITS) { + OPENSSL_PUT_ERROR(DH, DH_R_MODULUS_TOO_LARGE); + return 0; + } + + // Make sure |dh| has the necessary elements + auto *impl = FromOpaque(dh); + if (impl->p == nullptr) { + impl->p.reset(BN_new()); + if (impl->p == nullptr) { + OPENSSL_PUT_ERROR(DH, ERR_R_BN_LIB); + return 0; + } + } + if (impl->g == nullptr) { + impl->g.reset(BN_new()); + if (impl->g == nullptr) { + OPENSSL_PUT_ERROR(DH, ERR_R_BN_LIB); + return 0; + } + } + + BN_ULONG t1, t2, g; + if (generator <= 1) { + OPENSSL_PUT_ERROR(DH, DH_R_BAD_GENERATOR); + return 0; + } + if (generator == DH_GENERATOR_2) { + t1 = 24; + t2 = 11; + g = 2; + } else if (generator == DH_GENERATOR_5) { + t1 = 10; + t2 = 3; + g = 5; + } else { + // In the general case, don't worry if 'generator' is a generator or not: + // since we are using safe primes, it will generate either an order-q or an + // order-2q group, which both is OK. + t1 = 2; + t2 = 1; + g = generator; + } + + UniquePtr t1_bn(BN_new()), t2_bn(BN_new()); + if (t1_bn == nullptr || t2_bn == nullptr || + !BN_set_word(t1_bn.get(), t1) || // + !BN_set_word(t2_bn.get(), t2) || // + !BN_generate_prime_ex(impl->p.get(), prime_bits, 1, t1_bn.get(), + t2_bn.get(), cb) || + !BN_GENCB_call(cb, 3, 0) || // + !BN_set_word(impl->g.get(), g)) { + OPENSSL_PUT_ERROR(DH, ERR_R_BN_LIB); + return 0; + } + + return 1; +} + +static bool copy_bn(UniquePtr *dst, const BIGNUM *src) { + UniquePtr copy; + if (src) { + copy.reset(BN_dup(src)); + if (!copy) { + return false; + } + } + *dst = std::move(copy); + return true; +} + +static int int_dh_param_copy(DH *to, const DH *from, int is_x942) { + auto *to_impl = FromOpaque(to); + const auto *from_impl = FromOpaque(from); + + if (is_x942 == -1) { + is_x942 = !!from_impl->q; + } + if (!copy_bn(&to_impl->p, from_impl->p.get()) || + !copy_bn(&to_impl->g, from_impl->g.get())) { + return 0; + } + + if (!is_x942) { + return 1; + } + + if (!copy_bn(&to_impl->q, 
from_impl->q.get())) { + return 0; + } + + return 1; +} + +DH *DHparams_dup(const DH *dh) { + DH *ret = DH_new(); + if (!ret) { + return nullptr; + } + + if (!int_dh_param_copy(ret, dh, -1)) { + DH_free(ret); + return nullptr; + } + + return ret; +} diff --git a/third_party/boringssl/src/crypto/dh_extra/dh_asn1.c b/third_party/boringssl/src/crypto/dh_extra/dh_asn1.c deleted file mode 100644 index 9d321807..00000000 --- a/third_party/boringssl/src/crypto/dh_extra/dh_asn1.c +++ /dev/null @@ -1,160 +0,0 @@ -/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL - * project 2000. - */ -/* ==================================================================== - * Copyright (c) 2000-2005 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * licensing@OpenSSL.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. 
Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). */ - -#include - -#include -#include - -#include -#include -#include - -#include "../bytestring/internal.h" - - -static int parse_integer(CBS *cbs, BIGNUM **out) { - assert(*out == NULL); - *out = BN_new(); - if (*out == NULL) { - return 0; - } - return BN_parse_asn1_unsigned(cbs, *out); -} - -static int marshal_integer(CBB *cbb, BIGNUM *bn) { - if (bn == NULL) { - // A DH object may be missing some components. 
- OPENSSL_PUT_ERROR(DH, ERR_R_PASSED_NULL_PARAMETER); - return 0; - } - return BN_marshal_asn1(cbb, bn); -} - -DH *DH_parse_parameters(CBS *cbs) { - DH *ret = DH_new(); - if (ret == NULL) { - return NULL; - } - - CBS child; - if (!CBS_get_asn1(cbs, &child, CBS_ASN1_SEQUENCE) || - !parse_integer(&child, &ret->p) || - !parse_integer(&child, &ret->g)) { - goto err; - } - - uint64_t priv_length; - if (CBS_len(&child) != 0) { - if (!CBS_get_asn1_uint64(&child, &priv_length) || - priv_length > UINT_MAX) { - goto err; - } - ret->priv_length = (unsigned)priv_length; - } - - if (CBS_len(&child) != 0) { - goto err; - } - - return ret; - -err: - OPENSSL_PUT_ERROR(DH, DH_R_DECODE_ERROR); - DH_free(ret); - return NULL; -} - -int DH_marshal_parameters(CBB *cbb, const DH *dh) { - CBB child; - if (!CBB_add_asn1(cbb, &child, CBS_ASN1_SEQUENCE) || - !marshal_integer(&child, dh->p) || - !marshal_integer(&child, dh->g) || - (dh->priv_length != 0 && - !CBB_add_asn1_uint64(&child, dh->priv_length)) || - !CBB_flush(cbb)) { - OPENSSL_PUT_ERROR(DH, DH_R_ENCODE_ERROR); - return 0; - } - return 1; -} - -DH *d2i_DHparams(DH **out, const uint8_t **inp, long len) { - if (len < 0) { - return NULL; - } - CBS cbs; - CBS_init(&cbs, *inp, (size_t)len); - DH *ret = DH_parse_parameters(&cbs); - if (ret == NULL) { - return NULL; - } - if (out != NULL) { - DH_free(*out); - *out = ret; - } - *inp = CBS_data(&cbs); - return ret; -} - -int i2d_DHparams(const DH *in, uint8_t **outp) { - CBB cbb; - if (!CBB_init(&cbb, 0) || - !DH_marshal_parameters(&cbb, in)) { - CBB_cleanup(&cbb); - return -1; - } - return CBB_finish_i2d(&cbb, outp); -} diff --git a/third_party/boringssl/src/crypto/dh_extra/params.c b/third_party/boringssl/src/crypto/dh_extra/params.c deleted file mode 100644 index 6023ab12..00000000 --- a/third_party/boringssl/src/crypto/dh_extra/params.c +++ /dev/null @@ -1,272 +0,0 @@ -/* ==================================================================== - * Copyright (c) 2011 The OpenSSL Project. 
All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * licensing@OpenSSL.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). */ - -#include - -#include -#include -#include - -#include "../fipsmodule/bn/internal.h" - - -BIGNUM *BN_get_rfc3526_prime_1536(BIGNUM *ret) { - static const BN_ULONG kPrime1536Data[] = { - TOBN(0xffffffff, 0xffffffff), TOBN(0xf1746c08, 0xca237327), - TOBN(0x670c354e, 0x4abc9804), TOBN(0x9ed52907, 0x7096966d), - TOBN(0x1c62f356, 0x208552bb), TOBN(0x83655d23, 0xdca3ad96), - TOBN(0x69163fa8, 0xfd24cf5f), TOBN(0x98da4836, 0x1c55d39a), - TOBN(0xc2007cb8, 0xa163bf05), TOBN(0x49286651, 0xece45b3d), - TOBN(0xae9f2411, 0x7c4b1fe6), TOBN(0xee386bfb, 0x5a899fa5), - TOBN(0x0bff5cb6, 0xf406b7ed), TOBN(0xf44c42e9, 0xa637ed6b), - TOBN(0xe485b576, 0x625e7ec6), TOBN(0x4fe1356d, 0x6d51c245), - TOBN(0x302b0a6d, 0xf25f1437), TOBN(0xef9519b3, 0xcd3a431b), - TOBN(0x514a0879, 0x8e3404dd), TOBN(0x020bbea6, 0x3b139b22), - TOBN(0x29024e08, 0x8a67cc74), TOBN(0xc4c6628b, 0x80dc1cd1), - TOBN(0xc90fdaa2, 0x2168c234), TOBN(0xffffffff, 0xffffffff), - }; - - static const BIGNUM kPrime1536BN = STATIC_BIGNUM(kPrime1536Data); - - BIGNUM *alloc = NULL; - if (ret == NULL) { - alloc = BN_new(); - if (alloc == NULL) { - return NULL; - } - ret = alloc; - } - - if (!BN_copy(ret, &kPrime1536BN)) { - BN_free(alloc); - 
return NULL; - } - - return ret; -} - -int DH_generate_parameters_ex(DH *dh, int prime_bits, int generator, - BN_GENCB *cb) { - // We generate DH parameters as follows - // find a prime q which is prime_bits/2 bits long. - // p=(2*q)+1 or (p-1)/2 = q - // For this case, g is a generator if - // g^((p-1)/q) mod p != 1 for values of q which are the factors of p-1. - // Since the factors of p-1 are q and 2, we just need to check - // g^2 mod p != 1 and g^q mod p != 1. - // - // Having said all that, - // there is another special case method for the generators 2, 3 and 5. - // for 2, p mod 24 == 11 - // for 3, p mod 12 == 5 <<<<< does not work for safe primes. - // for 5, p mod 10 == 3 or 7 - // - // Thanks to Phil Karn for the pointers about the - // special generators and for answering some of my questions. - // - // I've implemented the second simple method :-). - // Since DH should be using a safe prime (both p and q are prime), - // this generator function can take a very very long time to run. - - // Actually there is no reason to insist that 'generator' be a generator. - // It's just as OK (and in some sense better) to use a generator of the - // order-q subgroup. 
- - BIGNUM *t1, *t2; - int g, ok = 0; - BN_CTX *ctx = NULL; - - ctx = BN_CTX_new(); - if (ctx == NULL) { - goto err; - } - BN_CTX_start(ctx); - t1 = BN_CTX_get(ctx); - t2 = BN_CTX_get(ctx); - if (t1 == NULL || t2 == NULL) { - goto err; - } - - // Make sure |dh| has the necessary elements - if (dh->p == NULL) { - dh->p = BN_new(); - if (dh->p == NULL) { - goto err; - } - } - if (dh->g == NULL) { - dh->g = BN_new(); - if (dh->g == NULL) { - goto err; - } - } - - if (generator <= 1) { - OPENSSL_PUT_ERROR(DH, DH_R_BAD_GENERATOR); - goto err; - } - if (generator == DH_GENERATOR_2) { - if (!BN_set_word(t1, 24)) { - goto err; - } - if (!BN_set_word(t2, 11)) { - goto err; - } - g = 2; - } else if (generator == DH_GENERATOR_5) { - if (!BN_set_word(t1, 10)) { - goto err; - } - if (!BN_set_word(t2, 3)) { - goto err; - } - // BN_set_word(t3,7); just have to miss - // out on these ones :-( - g = 5; - } else { - // in the general case, don't worry if 'generator' is a - // generator or not: since we are using safe primes, - // it will generate either an order-q or an order-2q group, - // which both is OK - if (!BN_set_word(t1, 2)) { - goto err; - } - if (!BN_set_word(t2, 1)) { - goto err; - } - g = generator; - } - - if (!BN_generate_prime_ex(dh->p, prime_bits, 1, t1, t2, cb)) { - goto err; - } - if (!BN_GENCB_call(cb, 3, 0)) { - goto err; - } - if (!BN_set_word(dh->g, g)) { - goto err; - } - ok = 1; - -err: - if (!ok) { - OPENSSL_PUT_ERROR(DH, ERR_R_BN_LIB); - } - - if (ctx != NULL) { - BN_CTX_end(ctx); - BN_CTX_free(ctx); - } - return ok; -} - -static int int_dh_bn_cpy(BIGNUM **dst, const BIGNUM *src) { - BIGNUM *a = NULL; - - if (src) { - a = BN_dup(src); - if (!a) { - return 0; - } - } - - BN_free(*dst); - *dst = a; - return 1; -} - -static int int_dh_param_copy(DH *to, const DH *from, int is_x942) { - if (is_x942 == -1) { - is_x942 = !!from->q; - } - if (!int_dh_bn_cpy(&to->p, from->p) || - !int_dh_bn_cpy(&to->g, from->g)) { - return 0; - } - - if (!is_x942) { - return 1; - 
} - - if (!int_dh_bn_cpy(&to->q, from->q) || - !int_dh_bn_cpy(&to->j, from->j)) { - return 0; - } - - OPENSSL_free(to->seed); - to->seed = NULL; - to->seedlen = 0; - - if (from->seed) { - to->seed = OPENSSL_memdup(from->seed, from->seedlen); - if (!to->seed) { - return 0; - } - to->seedlen = from->seedlen; - } - - return 1; -} - -DH *DHparams_dup(const DH *dh) { - DH *ret = DH_new(); - if (!ret) { - return NULL; - } - - if (!int_dh_param_copy(ret, dh, -1)) { - DH_free(ret); - return NULL; - } - - return ret; -} diff --git a/third_party/boringssl/src/crypto/digest/digest_extra.cc b/third_party/boringssl/src/crypto/digest/digest_extra.cc new file mode 100644 index 00000000..d38e0c11 --- /dev/null +++ b/third_party/boringssl/src/crypto/digest/digest_extra.cc @@ -0,0 +1,361 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../asn1/internal.h" +#include "../fipsmodule/digest/internal.h" +#include "../internal.h" + + +struct nid_to_digest { + int nid; + const EVP_MD *(*md_func)(); + const char *short_name; + const char *long_name; +}; + +static const struct nid_to_digest nid_to_digest_mapping[] = { + {NID_md4, EVP_md4, SN_md4, LN_md4}, + {NID_md5, EVP_md5, SN_md5, LN_md5}, + {NID_sha1, EVP_sha1, SN_sha1, LN_sha1}, + {NID_sha224, EVP_sha224, SN_sha224, LN_sha224}, + {NID_sha256, EVP_sha256, SN_sha256, LN_sha256}, + {NID_sha384, EVP_sha384, SN_sha384, LN_sha384}, + {NID_sha512, EVP_sha512, SN_sha512, LN_sha512}, + {NID_sha512_256, EVP_sha512_256, SN_sha512_256, LN_sha512_256}, + {NID_md5_sha1, EVP_md5_sha1, SN_md5_sha1, LN_md5_sha1}, + // As a remnant of signing |EVP_MD|s, OpenSSL returned the corresponding + // hash function when given a signature OID. To avoid unintended lax parsing + // of hash OIDs, this is no longer supported for lookup by OID or NID. + // Node.js, however, exposes |EVP_get_digestbyname|'s full behavior to + // consumers so we retain it there. + {NID_undef, EVP_sha1, SN_dsaWithSHA, LN_dsaWithSHA}, + {NID_undef, EVP_sha1, SN_dsaWithSHA1, LN_dsaWithSHA1}, + {NID_undef, EVP_sha1, SN_ecdsa_with_SHA1, nullptr}, + {NID_undef, EVP_md5, SN_md5WithRSAEncryption, LN_md5WithRSAEncryption}, + {NID_undef, EVP_sha1, SN_sha1WithRSAEncryption, LN_sha1WithRSAEncryption}, + {NID_undef, EVP_sha224, SN_sha224WithRSAEncryption, + LN_sha224WithRSAEncryption}, + {NID_undef, EVP_sha256, SN_sha256WithRSAEncryption, + LN_sha256WithRSAEncryption}, + {NID_undef, EVP_sha384, SN_sha384WithRSAEncryption, + LN_sha384WithRSAEncryption}, + {NID_undef, EVP_sha512, SN_sha512WithRSAEncryption, + LN_sha512WithRSAEncryption}, +}; + +const EVP_MD *EVP_get_digestbynid(int nid) { + if (nid == NID_undef) { + // Skip the |NID_undef| entries in |nid_to_digest_mapping|. 
+ return nullptr; + } + + for (const auto &mapping : nid_to_digest_mapping) { + if (mapping.nid == nid) { + return mapping.md_func(); + } + } + + return nullptr; +} + +static const struct { + uint8_t oid[9]; + uint8_t oid_len; + int nid; +} kMDOIDs[] = { + // 1.2.840.113549.2.4 + {{0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x02, 0x04}, 8, NID_md4}, + // 1.2.840.113549.2.5 + {{0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x02, 0x05}, 8, NID_md5}, + // 1.3.14.3.2.26 + {{0x2b, 0x0e, 0x03, 0x02, 0x1a}, 5, NID_sha1}, + // 2.16.840.1.101.3.4.2.1 + {{0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x01}, 9, NID_sha256}, + // 2.16.840.1.101.3.4.2.2 + {{0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x02}, 9, NID_sha384}, + // 2.16.840.1.101.3.4.2.3 + {{0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x03}, 9, NID_sha512}, + // 2.16.840.1.101.3.4.2.4 + {{0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x04}, 9, NID_sha224}, +}; + +static int cbs_to_digest_nid(const CBS *cbs) { + for (const auto &md : kMDOIDs) { + if (bssl::Span(*cbs) == + bssl::Span(md.oid).first(md.oid_len)) { + return md.nid; + } + } + return NID_undef; +} + +const EVP_MD *EVP_get_digestbyobj(const ASN1_OBJECT *obj) { + int nid = obj->nid; + if (nid == NID_undef) { + // Handle objects with no saved NID. Note we don't use |OBJ_obj2nid| here to + // avoid pulling in the OID table. + CBS cbs; + CBS_init(&cbs, OBJ_get0_data(obj), OBJ_length(obj)); + nid = cbs_to_digest_nid(&cbs); + } + + return nid == NID_undef ? nullptr : EVP_get_digestbynid(nid); +} + +int EVP_parse_digest_algorithm_nid(CBS *cbs) { + CBS algorithm, oid; + if (!CBS_get_asn1(cbs, &algorithm, CBS_ASN1_SEQUENCE) || + !CBS_get_asn1(&algorithm, &oid, CBS_ASN1_OBJECT)) { + OPENSSL_PUT_ERROR(DIGEST, DIGEST_R_DECODE_ERROR); + return NID_undef; + } + + int ret = cbs_to_digest_nid(&oid); + if (ret == NID_undef) { + OPENSSL_PUT_ERROR(DIGEST, DIGEST_R_UNKNOWN_HASH); + return NID_undef; + } + + // The parameters, if present, must be NULL. 
Historically, whether the NULL + // was included or omitted was not well-specified. When parsing an + // AlgorithmIdentifier, we allow both. (Note this code is not used when + // verifying RSASSA-PKCS1-v1_5 signatures.) + if (CBS_len(&algorithm) > 0) { + CBS param; + if (!CBS_get_asn1(&algorithm, ¶m, CBS_ASN1_NULL) || + CBS_len(¶m) != 0 || // + CBS_len(&algorithm) != 0) { + OPENSSL_PUT_ERROR(DIGEST, DIGEST_R_DECODE_ERROR); + return NID_undef; + } + } + + return ret; +} + +const EVP_MD *EVP_parse_digest_algorithm(CBS *cbs) { + int nid = EVP_parse_digest_algorithm_nid(cbs); + if (nid == NID_undef) { + return nullptr; + } + return EVP_get_digestbynid(nid); +} + +static int marshal_digest_algorithm(CBB *cbb, const EVP_MD *md, + bool with_null) { + CBB algorithm, oid, null; + if (!CBB_add_asn1(cbb, &algorithm, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1(&algorithm, &oid, CBS_ASN1_OBJECT)) { + return 0; + } + + bool found = false; + int nid = EVP_MD_type(md); + for (const auto &mdoid : kMDOIDs) { + if (nid == mdoid.nid) { + if (!CBB_add_bytes(&oid, mdoid.oid, mdoid.oid_len)) { + return 0; + } + found = true; + break; + } + } + + if (!found) { + OPENSSL_PUT_ERROR(DIGEST, DIGEST_R_UNKNOWN_HASH); + return 0; + } + + if ((with_null && !CBB_add_asn1(&algorithm, &null, CBS_ASN1_NULL)) || // + !CBB_flush(cbb)) { + return 0; + } + + return 1; +} + +int EVP_marshal_digest_algorithm(CBB *cbb, const EVP_MD *md) { + return marshal_digest_algorithm(cbb, md, /*with_null=*/true); +} + +int EVP_marshal_digest_algorithm_no_params(CBB *cbb, const EVP_MD *md) { + return marshal_digest_algorithm(cbb, md, /*with_null=*/false); +} + +const EVP_MD *EVP_get_digestbyname(const char *name) { + for (const auto &mapping : nid_to_digest_mapping) { + const char *short_name = mapping.short_name; + const char *long_name = mapping.long_name; + if ((short_name && strcmp(short_name, name) == 0) || + (long_name && strcmp(long_name, name) == 0)) { + return mapping.md_func(); + } + } + + return nullptr; +} + 
+EVP_MD *EVP_MD_fetch(OSSL_LIB_CTX *libctx, const char *name, + const char *propq) { + EVP_MD *ret = const_cast(EVP_get_digestbyname(name)); + if (ret == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_ALGORITHM); + } + return ret; +} + +int EVP_MD_up_ref(EVP_MD *md) { return 1; } + +void EVP_MD_free(EVP_MD *md) {} + +int EVP_Q_digest(OSSL_LIB_CTX *libctx, const char *name, const char *propq, + const void *in, size_t in_len, uint8_t *out, size_t *out_len) { + const EVP_MD *md = EVP_MD_fetch(libctx, name, propq); + if (md == nullptr) { + return 0; + } + unsigned len_u; + if (!EVP_Digest(in, in_len, out, &len_u, md, nullptr)) { + return 0; + } + *out_len = len_u; + return 1; +} + +static void blake2b256_init(EVP_MD_CTX *ctx) { + BLAKE2B256_Init(reinterpret_cast(ctx->md_data)); +} + +static void blake2b256_update(EVP_MD_CTX *ctx, const void *data, size_t len) { + BLAKE2B256_Update(reinterpret_cast(ctx->md_data), data, len); +} + +static void blake2b256_final(EVP_MD_CTX *ctx, uint8_t *md) { + BLAKE2B256_Final(md, reinterpret_cast(ctx->md_data)); +} + +static const EVP_MD evp_md_blake2b256 = { + NID_undef, BLAKE2B256_DIGEST_LENGTH, 0, + blake2b256_init, blake2b256_update, blake2b256_final, + BLAKE2B_CBLOCK, sizeof(BLAKE2B_CTX), +}; + +const EVP_MD *EVP_blake2b256() { return &evp_md_blake2b256; } + +static_assert(sizeof(BLAKE2B_CTX) <= EVP_MAX_MD_DATA_SIZE); + + +static void md4_init(EVP_MD_CTX *ctx) { + BSSL_CHECK(MD4_Init(reinterpret_cast(ctx->md_data))); +} + +static void md4_update(EVP_MD_CTX *ctx, const void *data, size_t count) { + BSSL_CHECK( + MD4_Update(reinterpret_cast(ctx->md_data), data, count)); +} + +static void md4_final(EVP_MD_CTX *ctx, uint8_t *out) { + BSSL_CHECK(MD4_Final(out, reinterpret_cast(ctx->md_data))); +} + +static const EVP_MD evp_md_md4 = { + NID_md4, // + MD4_DIGEST_LENGTH, // + 0, + md4_init, + md4_update, + md4_final, + 64, + sizeof(MD4_CTX), +}; + +const EVP_MD *EVP_md4() { return &evp_md_md4; } + +static_assert(sizeof(MD4_CTX) <= 
EVP_MAX_MD_DATA_SIZE); + + +static void md5_init(EVP_MD_CTX *ctx) { + BSSL_CHECK(MD5_Init(reinterpret_cast(ctx->md_data))); +} + +static void md5_update(EVP_MD_CTX *ctx, const void *data, size_t count) { + BSSL_CHECK( + MD5_Update(reinterpret_cast(ctx->md_data), data, count)); +} + +static void md5_final(EVP_MD_CTX *ctx, uint8_t *out) { + BSSL_CHECK(MD5_Final(out, reinterpret_cast(ctx->md_data))); +} + +static const EVP_MD evp_md_md5 = { + NID_md5, MD5_DIGEST_LENGTH, 0, md5_init, + md5_update, md5_final, 64, sizeof(MD5_CTX), +}; + +const EVP_MD *EVP_md5() { return &evp_md_md5; } + +static_assert(sizeof(MD5_CTX) <= EVP_MAX_MD_DATA_SIZE); + + +typedef struct { + MD5_CTX md5; + SHA_CTX sha1; +} MD5_SHA1_CTX; + +static void md5_sha1_init(EVP_MD_CTX *md_ctx) { + MD5_SHA1_CTX *ctx = reinterpret_cast(md_ctx->md_data); + BSSL_CHECK(MD5_Init(&ctx->md5) && SHA1_Init(&ctx->sha1)); +} + +static void md5_sha1_update(EVP_MD_CTX *md_ctx, const void *data, + size_t count) { + MD5_SHA1_CTX *ctx = reinterpret_cast(md_ctx->md_data); + BSSL_CHECK(MD5_Update(&ctx->md5, data, count) && + SHA1_Update(&ctx->sha1, data, count)); +} + +static void md5_sha1_final(EVP_MD_CTX *md_ctx, uint8_t *out) { + MD5_SHA1_CTX *ctx = reinterpret_cast(md_ctx->md_data); + BSSL_CHECK(MD5_Final(out, &ctx->md5) && + SHA1_Final(out + MD5_DIGEST_LENGTH, &ctx->sha1)); +} + +const EVP_MD evp_md_md5_sha1 = { + NID_md5_sha1, + MD5_DIGEST_LENGTH + SHA_DIGEST_LENGTH, + 0, + md5_sha1_init, + md5_sha1_update, + md5_sha1_final, + 64, + sizeof(MD5_SHA1_CTX), +}; + +const EVP_MD *EVP_md5_sha1() { return &evp_md_md5_sha1; } + +static_assert(sizeof(MD5_SHA1_CTX) <= EVP_MAX_MD_DATA_SIZE); diff --git a/third_party/boringssl/src/crypto/digest_extra/digest_extra.c b/third_party/boringssl/src/crypto/digest_extra/digest_extra.c deleted file mode 100644 index 8cbb28e3..00000000 --- a/third_party/boringssl/src/crypto/digest_extra/digest_extra.c +++ /dev/null @@ -1,268 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young 
(eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. 
If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
*/ - -#include - -#include - -#include -#include -#include -#include - -#include "../asn1/internal.h" -#include "../internal.h" -#include "../fipsmodule/digest/internal.h" - - -struct nid_to_digest { - int nid; - const EVP_MD* (*md_func)(void); - const char *short_name; - const char *long_name; -}; - -static const struct nid_to_digest nid_to_digest_mapping[] = { - {NID_md4, EVP_md4, SN_md4, LN_md4}, - {NID_md5, EVP_md5, SN_md5, LN_md5}, - {NID_sha1, EVP_sha1, SN_sha1, LN_sha1}, - {NID_sha224, EVP_sha224, SN_sha224, LN_sha224}, - {NID_sha256, EVP_sha256, SN_sha256, LN_sha256}, - {NID_sha384, EVP_sha384, SN_sha384, LN_sha384}, - {NID_sha512, EVP_sha512, SN_sha512, LN_sha512}, - {NID_sha512_256, EVP_sha512_256, SN_sha512_256, LN_sha512_256}, - {NID_md5_sha1, EVP_md5_sha1, SN_md5_sha1, LN_md5_sha1}, - // As a remnant of signing |EVP_MD|s, OpenSSL returned the corresponding - // hash function when given a signature OID. To avoid unintended lax parsing - // of hash OIDs, this is no longer supported for lookup by OID or NID. - // Node.js, however, exposes |EVP_get_digestbyname|'s full behavior to - // consumers so we retain it there. - {NID_undef, EVP_sha1, SN_dsaWithSHA, LN_dsaWithSHA}, - {NID_undef, EVP_sha1, SN_dsaWithSHA1, LN_dsaWithSHA1}, - {NID_undef, EVP_sha1, SN_ecdsa_with_SHA1, NULL}, - {NID_undef, EVP_md5, SN_md5WithRSAEncryption, LN_md5WithRSAEncryption}, - {NID_undef, EVP_sha1, SN_sha1WithRSAEncryption, LN_sha1WithRSAEncryption}, - {NID_undef, EVP_sha224, SN_sha224WithRSAEncryption, - LN_sha224WithRSAEncryption}, - {NID_undef, EVP_sha256, SN_sha256WithRSAEncryption, - LN_sha256WithRSAEncryption}, - {NID_undef, EVP_sha384, SN_sha384WithRSAEncryption, - LN_sha384WithRSAEncryption}, - {NID_undef, EVP_sha512, SN_sha512WithRSAEncryption, - LN_sha512WithRSAEncryption}, -}; - -const EVP_MD* EVP_get_digestbynid(int nid) { - if (nid == NID_undef) { - // Skip the |NID_undef| entries in |nid_to_digest_mapping|. 
- return NULL; - } - - for (unsigned i = 0; i < OPENSSL_ARRAY_SIZE(nid_to_digest_mapping); i++) { - if (nid_to_digest_mapping[i].nid == nid) { - return nid_to_digest_mapping[i].md_func(); - } - } - - return NULL; -} - -static const struct { - uint8_t oid[9]; - uint8_t oid_len; - int nid; -} kMDOIDs[] = { - // 1.2.840.113549.2.4 - { {0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x02, 0x04}, 8, NID_md4 }, - // 1.2.840.113549.2.5 - { {0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x02, 0x05}, 8, NID_md5 }, - // 1.3.14.3.2.26 - { {0x2b, 0x0e, 0x03, 0x02, 0x1a}, 5, NID_sha1 }, - // 2.16.840.1.101.3.4.2.1 - { {0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x01}, 9, NID_sha256 }, - // 2.16.840.1.101.3.4.2.2 - { {0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x02}, 9, NID_sha384 }, - // 2.16.840.1.101.3.4.2.3 - { {0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x03}, 9, NID_sha512 }, - // 2.16.840.1.101.3.4.2.4 - { {0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x02, 0x04}, 9, NID_sha224 }, -}; - -static const EVP_MD *cbs_to_md(const CBS *cbs) { - for (size_t i = 0; i < OPENSSL_ARRAY_SIZE(kMDOIDs); i++) { - if (CBS_len(cbs) == kMDOIDs[i].oid_len && - OPENSSL_memcmp(CBS_data(cbs), kMDOIDs[i].oid, kMDOIDs[i].oid_len) == - 0) { - return EVP_get_digestbynid(kMDOIDs[i].nid); - } - } - - return NULL; -} - -const EVP_MD *EVP_get_digestbyobj(const ASN1_OBJECT *obj) { - // Handle objects with no corresponding OID. Note we don't use |OBJ_obj2nid| - // here to avoid pulling in the OID table. 
- if (obj->nid != NID_undef) { - return EVP_get_digestbynid(obj->nid); - } - - CBS cbs; - CBS_init(&cbs, OBJ_get0_data(obj), OBJ_length(obj)); - return cbs_to_md(&cbs); -} - -const EVP_MD *EVP_parse_digest_algorithm(CBS *cbs) { - CBS algorithm, oid; - if (!CBS_get_asn1(cbs, &algorithm, CBS_ASN1_SEQUENCE) || - !CBS_get_asn1(&algorithm, &oid, CBS_ASN1_OBJECT)) { - OPENSSL_PUT_ERROR(DIGEST, DIGEST_R_DECODE_ERROR); - return NULL; - } - - const EVP_MD *ret = cbs_to_md(&oid); - if (ret == NULL) { - OPENSSL_PUT_ERROR(DIGEST, DIGEST_R_UNKNOWN_HASH); - return NULL; - } - - // The parameters, if present, must be NULL. Historically, whether the NULL - // was included or omitted was not well-specified. When parsing an - // AlgorithmIdentifier, we allow both. (Note this code is not used when - // verifying RSASSA-PKCS1-v1_5 signatures.) - if (CBS_len(&algorithm) > 0) { - CBS param; - if (!CBS_get_asn1(&algorithm, ¶m, CBS_ASN1_NULL) || - CBS_len(¶m) != 0 || - CBS_len(&algorithm) != 0) { - OPENSSL_PUT_ERROR(DIGEST, DIGEST_R_DECODE_ERROR); - return NULL; - } - } - - return ret; -} - -int EVP_marshal_digest_algorithm(CBB *cbb, const EVP_MD *md) { - CBB algorithm, oid, null; - if (!CBB_add_asn1(cbb, &algorithm, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1(&algorithm, &oid, CBS_ASN1_OBJECT)) { - OPENSSL_PUT_ERROR(DIGEST, ERR_R_MALLOC_FAILURE); - return 0; - } - - int found = 0; - int nid = EVP_MD_type(md); - for (size_t i = 0; i < OPENSSL_ARRAY_SIZE(kMDOIDs); i++) { - if (nid == kMDOIDs[i].nid) { - if (!CBB_add_bytes(&oid, kMDOIDs[i].oid, kMDOIDs[i].oid_len)) { - OPENSSL_PUT_ERROR(DIGEST, ERR_R_MALLOC_FAILURE); - return 0; - } - found = 1; - break; - } - } - - if (!found) { - OPENSSL_PUT_ERROR(DIGEST, DIGEST_R_UNKNOWN_HASH); - return 0; - } - - if (!CBB_add_asn1(&algorithm, &null, CBS_ASN1_NULL) || - !CBB_flush(cbb)) { - OPENSSL_PUT_ERROR(DIGEST, ERR_R_MALLOC_FAILURE); - return 0; - } - - return 1; -} - -const EVP_MD *EVP_get_digestbyname(const char *name) { - for (unsigned i = 0; i < 
OPENSSL_ARRAY_SIZE(nid_to_digest_mapping); i++) { - const char *short_name = nid_to_digest_mapping[i].short_name; - const char *long_name = nid_to_digest_mapping[i].long_name; - if ((short_name && strcmp(short_name, name) == 0) || - (long_name && strcmp(long_name, name) == 0)) { - return nid_to_digest_mapping[i].md_func(); - } - } - - return NULL; -} - -static void blake2b256_init(EVP_MD_CTX *ctx) { BLAKE2B256_Init(ctx->md_data); } - -static void blake2b256_update(EVP_MD_CTX *ctx, const void *data, size_t len) { - BLAKE2B256_Update(ctx->md_data, data, len); -} - -static void blake2b256_final(EVP_MD_CTX *ctx, uint8_t *md) { - BLAKE2B256_Final(md, ctx->md_data); -} - -static const EVP_MD evp_md_blake2b256 = { - NID_undef, - BLAKE2B256_DIGEST_LENGTH, - 0, - blake2b256_init, - blake2b256_update, - blake2b256_final, - BLAKE2B_CBLOCK, - sizeof(BLAKE2B_CTX), -}; - -const EVP_MD *EVP_blake2b256(void) { return &evp_md_blake2b256; } diff --git a/third_party/boringssl/src/crypto/dsa/dsa.c b/third_party/boringssl/src/crypto/dsa/dsa.c deleted file mode 100644 index b8e46531..00000000 --- a/third_party/boringssl/src/crypto/dsa/dsa.c +++ /dev/null @@ -1,969 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. 
- * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] - * - * The DSS routines are based on patches supplied by - * Steven Schoch . */ - -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "internal.h" -#include "../fipsmodule/bn/internal.h" -#include "../internal.h" - - -// Primality test according to FIPS PUB 186[-1], Appendix 2.1: 50 rounds of -// Miller-Rabin. 
-#define DSS_prime_checks 50 - -static int dsa_sign_setup(const DSA *dsa, BN_CTX *ctx_in, BIGNUM **out_kinv, - BIGNUM **out_r); - -static CRYPTO_EX_DATA_CLASS g_ex_data_class = CRYPTO_EX_DATA_CLASS_INIT; - -DSA *DSA_new(void) { - DSA *dsa = OPENSSL_malloc(sizeof(DSA)); - if (dsa == NULL) { - OPENSSL_PUT_ERROR(DSA, ERR_R_MALLOC_FAILURE); - return NULL; - } - - OPENSSL_memset(dsa, 0, sizeof(DSA)); - - dsa->references = 1; - - CRYPTO_MUTEX_init(&dsa->method_mont_lock); - CRYPTO_new_ex_data(&dsa->ex_data); - - return dsa; -} - -void DSA_free(DSA *dsa) { - if (dsa == NULL) { - return; - } - - if (!CRYPTO_refcount_dec_and_test_zero(&dsa->references)) { - return; - } - - CRYPTO_free_ex_data(&g_ex_data_class, dsa, &dsa->ex_data); - - BN_clear_free(dsa->p); - BN_clear_free(dsa->q); - BN_clear_free(dsa->g); - BN_clear_free(dsa->pub_key); - BN_clear_free(dsa->priv_key); - BN_MONT_CTX_free(dsa->method_mont_p); - BN_MONT_CTX_free(dsa->method_mont_q); - CRYPTO_MUTEX_cleanup(&dsa->method_mont_lock); - OPENSSL_free(dsa); -} - -int DSA_up_ref(DSA *dsa) { - CRYPTO_refcount_inc(&dsa->references); - return 1; -} - -unsigned DSA_bits(const DSA *dsa) { return BN_num_bits(dsa->p); } - -const BIGNUM *DSA_get0_pub_key(const DSA *dsa) { return dsa->pub_key; } - -const BIGNUM *DSA_get0_priv_key(const DSA *dsa) { return dsa->priv_key; } - -const BIGNUM *DSA_get0_p(const DSA *dsa) { return dsa->p; } - -const BIGNUM *DSA_get0_q(const DSA *dsa) { return dsa->q; } - -const BIGNUM *DSA_get0_g(const DSA *dsa) { return dsa->g; } - -void DSA_get0_key(const DSA *dsa, const BIGNUM **out_pub_key, - const BIGNUM **out_priv_key) { - if (out_pub_key != NULL) { - *out_pub_key = dsa->pub_key; - } - if (out_priv_key != NULL) { - *out_priv_key = dsa->priv_key; - } -} - -void DSA_get0_pqg(const DSA *dsa, const BIGNUM **out_p, const BIGNUM **out_q, - const BIGNUM **out_g) { - if (out_p != NULL) { - *out_p = dsa->p; - } - if (out_q != NULL) { - *out_q = dsa->q; - } - if (out_g != NULL) { - *out_g = dsa->g; - } -} 
- -int DSA_set0_key(DSA *dsa, BIGNUM *pub_key, BIGNUM *priv_key) { - if (dsa->pub_key == NULL && pub_key == NULL) { - return 0; - } - - if (pub_key != NULL) { - BN_free(dsa->pub_key); - dsa->pub_key = pub_key; - } - if (priv_key != NULL) { - BN_free(dsa->priv_key); - dsa->priv_key = priv_key; - } - - return 1; -} - -int DSA_set0_pqg(DSA *dsa, BIGNUM *p, BIGNUM *q, BIGNUM *g) { - if ((dsa->p == NULL && p == NULL) || - (dsa->q == NULL && q == NULL) || - (dsa->g == NULL && g == NULL)) { - return 0; - } - - if (p != NULL) { - BN_free(dsa->p); - dsa->p = p; - } - if (q != NULL) { - BN_free(dsa->q); - dsa->q = q; - } - if (g != NULL) { - BN_free(dsa->g); - dsa->g = g; - } - - return 1; -} - -int DSA_generate_parameters_ex(DSA *dsa, unsigned bits, const uint8_t *seed_in, - size_t seed_len, int *out_counter, - unsigned long *out_h, BN_GENCB *cb) { - int ok = 0; - unsigned char seed[SHA256_DIGEST_LENGTH]; - unsigned char md[SHA256_DIGEST_LENGTH]; - unsigned char buf[SHA256_DIGEST_LENGTH], buf2[SHA256_DIGEST_LENGTH]; - BIGNUM *r0, *W, *X, *c, *test; - BIGNUM *g = NULL, *q = NULL, *p = NULL; - BN_MONT_CTX *mont = NULL; - int k, n = 0, m = 0; - unsigned i; - int counter = 0; - int r = 0; - BN_CTX *ctx = NULL; - unsigned int h = 2; - unsigned qsize; - const EVP_MD *evpmd; - - evpmd = (bits >= 2048) ? EVP_sha256() : EVP_sha1(); - qsize = EVP_MD_size(evpmd); - - if (bits < 512) { - bits = 512; - } - - bits = (bits + 63) / 64 * 64; - - if (seed_in != NULL) { - if (seed_len < (size_t)qsize) { - return 0; - } - if (seed_len > (size_t)qsize) { - // Only consume as much seed as is expected. 
- seed_len = qsize; - } - OPENSSL_memcpy(seed, seed_in, seed_len); - } - - ctx = BN_CTX_new(); - if (ctx == NULL) { - goto err; - } - BN_CTX_start(ctx); - - r0 = BN_CTX_get(ctx); - g = BN_CTX_get(ctx); - W = BN_CTX_get(ctx); - q = BN_CTX_get(ctx); - X = BN_CTX_get(ctx); - c = BN_CTX_get(ctx); - p = BN_CTX_get(ctx); - test = BN_CTX_get(ctx); - - if (test == NULL || !BN_lshift(test, BN_value_one(), bits - 1)) { - goto err; - } - - for (;;) { - // Find q. - for (;;) { - // step 1 - if (!BN_GENCB_call(cb, BN_GENCB_GENERATED, m++)) { - goto err; - } - - int use_random_seed = (seed_in == NULL); - if (use_random_seed) { - if (!RAND_bytes(seed, qsize)) { - goto err; - } - } else { - // If we come back through, use random seed next time. - seed_in = NULL; - } - OPENSSL_memcpy(buf, seed, qsize); - OPENSSL_memcpy(buf2, seed, qsize); - // precompute "SEED + 1" for step 7: - for (i = qsize - 1; i < qsize; i--) { - buf[i]++; - if (buf[i] != 0) { - break; - } - } - - // step 2 - if (!EVP_Digest(seed, qsize, md, NULL, evpmd, NULL) || - !EVP_Digest(buf, qsize, buf2, NULL, evpmd, NULL)) { - goto err; - } - for (i = 0; i < qsize; i++) { - md[i] ^= buf2[i]; - } - - // step 3 - md[0] |= 0x80; - md[qsize - 1] |= 0x01; - if (!BN_bin2bn(md, qsize, q)) { - goto err; - } - - // step 4 - r = BN_is_prime_fasttest_ex(q, DSS_prime_checks, ctx, use_random_seed, cb); - if (r > 0) { - break; - } - if (r != 0) { - goto err; - } - - // do a callback call - // step 5 - } - - if (!BN_GENCB_call(cb, 2, 0) || !BN_GENCB_call(cb, 3, 0)) { - goto err; - } - - // step 6 - counter = 0; - // "offset = 2" - - n = (bits - 1) / 160; - - for (;;) { - if ((counter != 0) && !BN_GENCB_call(cb, BN_GENCB_GENERATED, counter)) { - goto err; - } - - // step 7 - BN_zero(W); - // now 'buf' contains "SEED + offset - 1" - for (k = 0; k <= n; k++) { - // obtain "SEED + offset + k" by incrementing: - for (i = qsize - 1; i < qsize; i--) { - buf[i]++; - if (buf[i] != 0) { - break; - } - } - - if (!EVP_Digest(buf, qsize, md, 
NULL, evpmd, NULL)) { - goto err; - } - - // step 8 - if (!BN_bin2bn(md, qsize, r0) || - !BN_lshift(r0, r0, (qsize << 3) * k) || - !BN_add(W, W, r0)) { - goto err; - } - } - - // more of step 8 - if (!BN_mask_bits(W, bits - 1) || - !BN_copy(X, W) || - !BN_add(X, X, test)) { - goto err; - } - - // step 9 - if (!BN_lshift1(r0, q) || - !BN_mod(c, X, r0, ctx) || - !BN_sub(r0, c, BN_value_one()) || - !BN_sub(p, X, r0)) { - goto err; - } - - // step 10 - if (BN_cmp(p, test) >= 0) { - // step 11 - r = BN_is_prime_fasttest_ex(p, DSS_prime_checks, ctx, 1, cb); - if (r > 0) { - goto end; // found it - } - if (r != 0) { - goto err; - } - } - - // step 13 - counter++; - // "offset = offset + n + 1" - - // step 14 - if (counter >= 4096) { - break; - } - } - } -end: - if (!BN_GENCB_call(cb, 2, 1)) { - goto err; - } - - // We now need to generate g - // Set r0=(p-1)/q - if (!BN_sub(test, p, BN_value_one()) || - !BN_div(r0, NULL, test, q, ctx)) { - goto err; - } - - mont = BN_MONT_CTX_new_for_modulus(p, ctx); - if (mont == NULL || - !BN_set_word(test, h)) { - goto err; - } - - for (;;) { - // g=test^r0%p - if (!BN_mod_exp_mont(g, test, r0, p, ctx, mont)) { - goto err; - } - if (!BN_is_one(g)) { - break; - } - if (!BN_add(test, test, BN_value_one())) { - goto err; - } - h++; - } - - if (!BN_GENCB_call(cb, 3, 1)) { - goto err; - } - - ok = 1; - -err: - if (ok) { - BN_free(dsa->p); - BN_free(dsa->q); - BN_free(dsa->g); - dsa->p = BN_dup(p); - dsa->q = BN_dup(q); - dsa->g = BN_dup(g); - if (dsa->p == NULL || dsa->q == NULL || dsa->g == NULL) { - ok = 0; - goto err; - } - if (out_counter != NULL) { - *out_counter = counter; - } - if (out_h != NULL) { - *out_h = h; - } - } - - if (ctx) { - BN_CTX_end(ctx); - BN_CTX_free(ctx); - } - - BN_MONT_CTX_free(mont); - - return ok; -} - -DSA *DSAparams_dup(const DSA *dsa) { - DSA *ret = DSA_new(); - if (ret == NULL) { - return NULL; - } - ret->p = BN_dup(dsa->p); - ret->q = BN_dup(dsa->q); - ret->g = BN_dup(dsa->g); - if (ret->p == NULL || ret->q 
== NULL || ret->g == NULL) { - DSA_free(ret); - return NULL; - } - return ret; -} - -int DSA_generate_key(DSA *dsa) { - int ok = 0; - BN_CTX *ctx = NULL; - BIGNUM *pub_key = NULL, *priv_key = NULL; - - ctx = BN_CTX_new(); - if (ctx == NULL) { - goto err; - } - - priv_key = dsa->priv_key; - if (priv_key == NULL) { - priv_key = BN_new(); - if (priv_key == NULL) { - goto err; - } - } - - if (!BN_rand_range_ex(priv_key, 1, dsa->q)) { - goto err; - } - - pub_key = dsa->pub_key; - if (pub_key == NULL) { - pub_key = BN_new(); - if (pub_key == NULL) { - goto err; - } - } - - if (!BN_MONT_CTX_set_locked(&dsa->method_mont_p, &dsa->method_mont_lock, - dsa->p, ctx) || - !BN_mod_exp_mont_consttime(pub_key, dsa->g, priv_key, dsa->p, ctx, - dsa->method_mont_p)) { - goto err; - } - - dsa->priv_key = priv_key; - dsa->pub_key = pub_key; - ok = 1; - -err: - if (dsa->pub_key == NULL) { - BN_free(pub_key); - } - if (dsa->priv_key == NULL) { - BN_free(priv_key); - } - BN_CTX_free(ctx); - - return ok; -} - -DSA_SIG *DSA_SIG_new(void) { - DSA_SIG *sig; - sig = OPENSSL_malloc(sizeof(DSA_SIG)); - if (!sig) { - return NULL; - } - sig->r = NULL; - sig->s = NULL; - return sig; -} - -void DSA_SIG_free(DSA_SIG *sig) { - if (!sig) { - return; - } - - BN_free(sig->r); - BN_free(sig->s); - OPENSSL_free(sig); -} - -void DSA_SIG_get0(const DSA_SIG *sig, const BIGNUM **out_r, - const BIGNUM **out_s) { - if (out_r != NULL) { - *out_r = sig->r; - } - if (out_s != NULL) { - *out_s = sig->s; - } -} - -int DSA_SIG_set0(DSA_SIG *sig, BIGNUM *r, BIGNUM *s) { - if (r == NULL || s == NULL) { - return 0; - } - BN_free(sig->r); - BN_free(sig->s); - sig->r = r; - sig->s = s; - return 1; -} - -// mod_mul_consttime sets |r| to |a| * |b| modulo |mont->N|, treating |a| and -// |b| as secret. This function internally uses Montgomery reduction, but -// neither inputs nor outputs are in Montgomery form. 
-static int mod_mul_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, - const BN_MONT_CTX *mont, BN_CTX *ctx) { - BN_CTX_start(ctx); - BIGNUM *tmp = BN_CTX_get(ctx); - // |BN_mod_mul_montgomery| removes a factor of R, so we cancel it with a - // single |BN_to_montgomery| which adds one factor of R. - int ok = tmp != NULL && - BN_to_montgomery(tmp, a, mont, ctx) && - BN_mod_mul_montgomery(r, tmp, b, mont, ctx); - BN_CTX_end(ctx); - return ok; -} - -DSA_SIG *DSA_do_sign(const uint8_t *digest, size_t digest_len, const DSA *dsa) { - if (!dsa_check_parameters(dsa)) { - return NULL; - } - - BIGNUM *kinv = NULL, *r = NULL, *s = NULL; - BIGNUM m; - BIGNUM xr; - BN_CTX *ctx = NULL; - DSA_SIG *ret = NULL; - - BN_init(&m); - BN_init(&xr); - s = BN_new(); - if (s == NULL) { - goto err; - } - ctx = BN_CTX_new(); - if (ctx == NULL) { - goto err; - } - -redo: - if (!dsa_sign_setup(dsa, ctx, &kinv, &r)) { - goto err; - } - - if (digest_len > BN_num_bytes(dsa->q)) { - // If the digest length is greater than the size of |dsa->q| use the - // BN_num_bits(dsa->q) leftmost bits of the digest, see FIPS 186-3, 4.2. - // Note the above check that |dsa->q| is a multiple of 8 bits. - digest_len = BN_num_bytes(dsa->q); - } - - if (BN_bin2bn(digest, digest_len, &m) == NULL) { - goto err; - } - - // |m| is bounded by 2^(num_bits(q)), which is slightly looser than q. This - // violates |bn_mod_add_consttime| and |mod_mul_consttime|'s preconditions. - // (The underlying algorithms could accept looser bounds, but we reduce for - // simplicity.) - size_t q_width = bn_minimal_width(dsa->q); - if (!bn_resize_words(&m, q_width) || - !bn_resize_words(&xr, q_width)) { - goto err; - } - bn_reduce_once_in_place(m.d, 0 /* no carry word */, dsa->q->d, - xr.d /* scratch space */, q_width); - - // Compute s = inv(k) (m + xr) mod q. Note |dsa->method_mont_q| is - // initialized by |dsa_sign_setup|. 
- if (!mod_mul_consttime(&xr, dsa->priv_key, r, dsa->method_mont_q, ctx) || - !bn_mod_add_consttime(s, &xr, &m, dsa->q, ctx) || - !mod_mul_consttime(s, s, kinv, dsa->method_mont_q, ctx)) { - goto err; - } - - // Redo if r or s is zero as required by FIPS 186-3: this is - // very unlikely. - if (BN_is_zero(r) || BN_is_zero(s)) { - goto redo; - } - ret = DSA_SIG_new(); - if (ret == NULL) { - goto err; - } - ret->r = r; - ret->s = s; - -err: - if (ret == NULL) { - OPENSSL_PUT_ERROR(DSA, ERR_R_BN_LIB); - BN_free(r); - BN_free(s); - } - BN_CTX_free(ctx); - BN_clear_free(&m); - BN_clear_free(&xr); - BN_clear_free(kinv); - - return ret; -} - -int DSA_do_verify(const uint8_t *digest, size_t digest_len, DSA_SIG *sig, - const DSA *dsa) { - int valid; - if (!DSA_do_check_signature(&valid, digest, digest_len, sig, dsa)) { - return -1; - } - return valid; -} - -int DSA_do_check_signature(int *out_valid, const uint8_t *digest, - size_t digest_len, DSA_SIG *sig, const DSA *dsa) { - *out_valid = 0; - if (!dsa_check_parameters(dsa)) { - return 0; - } - - int ret = 0; - BIGNUM u1, u2, t1; - BN_init(&u1); - BN_init(&u2); - BN_init(&t1); - BN_CTX *ctx = BN_CTX_new(); - if (ctx == NULL) { - goto err; - } - - if (BN_is_zero(sig->r) || BN_is_negative(sig->r) || - BN_ucmp(sig->r, dsa->q) >= 0) { - ret = 1; - goto err; - } - if (BN_is_zero(sig->s) || BN_is_negative(sig->s) || - BN_ucmp(sig->s, dsa->q) >= 0) { - ret = 1; - goto err; - } - - // Calculate W = inv(S) mod Q - // save W in u2 - if (BN_mod_inverse(&u2, sig->s, dsa->q, ctx) == NULL) { - goto err; - } - - // save M in u1 - unsigned q_bits = BN_num_bits(dsa->q); - if (digest_len > (q_bits >> 3)) { - // if the digest length is greater than the size of q use the - // BN_num_bits(dsa->q) leftmost bits of the digest, see - // fips 186-3, 4.2 - digest_len = (q_bits >> 3); - } - - if (BN_bin2bn(digest, digest_len, &u1) == NULL) { - goto err; - } - - // u1 = M * w mod q - if (!BN_mod_mul(&u1, &u1, &u2, dsa->q, ctx)) { - goto err; - } - - 
// u2 = r * w mod q - if (!BN_mod_mul(&u2, sig->r, &u2, dsa->q, ctx)) { - goto err; - } - - if (!BN_MONT_CTX_set_locked((BN_MONT_CTX **)&dsa->method_mont_p, - (CRYPTO_MUTEX *)&dsa->method_mont_lock, dsa->p, - ctx)) { - goto err; - } - - if (!BN_mod_exp2_mont(&t1, dsa->g, &u1, dsa->pub_key, &u2, dsa->p, ctx, - dsa->method_mont_p)) { - goto err; - } - - // BN_copy(&u1,&t1); - // let u1 = u1 mod q - if (!BN_mod(&u1, &t1, dsa->q, ctx)) { - goto err; - } - - // V is now in u1. If the signature is correct, it will be - // equal to R. - *out_valid = BN_ucmp(&u1, sig->r) == 0; - ret = 1; - -err: - if (ret != 1) { - OPENSSL_PUT_ERROR(DSA, ERR_R_BN_LIB); - } - BN_CTX_free(ctx); - BN_free(&u1); - BN_free(&u2); - BN_free(&t1); - - return ret; -} - -int DSA_sign(int type, const uint8_t *digest, size_t digest_len, - uint8_t *out_sig, unsigned int *out_siglen, const DSA *dsa) { - DSA_SIG *s; - - s = DSA_do_sign(digest, digest_len, dsa); - if (s == NULL) { - *out_siglen = 0; - return 0; - } - - *out_siglen = i2d_DSA_SIG(s, &out_sig); - DSA_SIG_free(s); - return 1; -} - -int DSA_verify(int type, const uint8_t *digest, size_t digest_len, - const uint8_t *sig, size_t sig_len, const DSA *dsa) { - int valid; - if (!DSA_check_signature(&valid, digest, digest_len, sig, sig_len, dsa)) { - return -1; - } - return valid; -} - -int DSA_check_signature(int *out_valid, const uint8_t *digest, - size_t digest_len, const uint8_t *sig, size_t sig_len, - const DSA *dsa) { - DSA_SIG *s = NULL; - int ret = 0; - uint8_t *der = NULL; - - s = DSA_SIG_new(); - if (s == NULL) { - goto err; - } - - const uint8_t *sigp = sig; - if (d2i_DSA_SIG(&s, &sigp, sig_len) == NULL || sigp != sig + sig_len) { - goto err; - } - - // Ensure that the signature uses DER and doesn't have trailing garbage. 
- int der_len = i2d_DSA_SIG(s, &der); - if (der_len < 0 || (size_t)der_len != sig_len || - OPENSSL_memcmp(sig, der, sig_len)) { - goto err; - } - - ret = DSA_do_check_signature(out_valid, digest, digest_len, s, dsa); - -err: - OPENSSL_free(der); - DSA_SIG_free(s); - return ret; -} - -// der_len_len returns the number of bytes needed to represent a length of |len| -// in DER. -static size_t der_len_len(size_t len) { - if (len < 0x80) { - return 1; - } - size_t ret = 1; - while (len > 0) { - ret++; - len >>= 8; - } - return ret; -} - -int DSA_size(const DSA *dsa) { - size_t order_len = BN_num_bytes(dsa->q); - // Compute the maximum length of an |order_len| byte integer. Defensively - // assume that the leading 0x00 is included. - size_t integer_len = 1 /* tag */ + der_len_len(order_len + 1) + 1 + order_len; - if (integer_len < order_len) { - return 0; - } - // A DSA signature is two INTEGERs. - size_t value_len = 2 * integer_len; - if (value_len < integer_len) { - return 0; - } - // Add the header. - size_t ret = 1 /* tag */ + der_len_len(value_len) + value_len; - if (ret < value_len) { - return 0; - } - return ret; -} - -static int dsa_sign_setup(const DSA *dsa, BN_CTX *ctx, BIGNUM **out_kinv, - BIGNUM **out_r) { - if (!dsa->p || !dsa->q || !dsa->g) { - OPENSSL_PUT_ERROR(DSA, DSA_R_MISSING_PARAMETERS); - return 0; - } - - int ret = 0; - BIGNUM k; - BN_init(&k); - BIGNUM *r = BN_new(); - BIGNUM *kinv = BN_new(); - if (r == NULL || kinv == NULL || - // Get random k - !BN_rand_range_ex(&k, 1, dsa->q) || - !BN_MONT_CTX_set_locked((BN_MONT_CTX **)&dsa->method_mont_p, - (CRYPTO_MUTEX *)&dsa->method_mont_lock, dsa->p, - ctx) || - !BN_MONT_CTX_set_locked((BN_MONT_CTX **)&dsa->method_mont_q, - (CRYPTO_MUTEX *)&dsa->method_mont_lock, dsa->q, - ctx) || - // Compute r = (g^k mod p) mod q - !BN_mod_exp_mont_consttime(r, dsa->g, &k, dsa->p, ctx, - dsa->method_mont_p) || - // Note |BN_mod| below is not constant-time and may leak information about - // |r|. 
|dsa->p| may be significantly larger than |dsa->q|, so this is not - // easily performed in constant-time with Montgomery reduction. - // - // However, |r| at this point is g^k (mod p). It is almost the value of - // |r| revealed in the signature anyway (g^k (mod p) (mod q)), going from - // it to |k| would require computing a discrete log. - !BN_mod(r, r, dsa->q, ctx) || - // Compute part of 's = inv(k) (m + xr) mod q' using Fermat's Little - // Theorem. - !bn_mod_inverse_prime(kinv, &k, dsa->q, ctx, dsa->method_mont_q)) { - OPENSSL_PUT_ERROR(DSA, ERR_R_BN_LIB); - goto err; - } - - BN_clear_free(*out_kinv); - *out_kinv = kinv; - kinv = NULL; - - BN_clear_free(*out_r); - *out_r = r; - r = NULL; - - ret = 1; - -err: - BN_clear_free(&k); - BN_clear_free(r); - BN_clear_free(kinv); - return ret; -} - -int DSA_get_ex_new_index(long argl, void *argp, CRYPTO_EX_unused *unused, - CRYPTO_EX_dup *dup_unused, CRYPTO_EX_free *free_func) { - int index; - if (!CRYPTO_get_ex_new_index(&g_ex_data_class, &index, argl, argp, - free_func)) { - return -1; - } - return index; -} - -int DSA_set_ex_data(DSA *dsa, int idx, void *arg) { - return CRYPTO_set_ex_data(&dsa->ex_data, idx, arg); -} - -void *DSA_get_ex_data(const DSA *dsa, int idx) { - return CRYPTO_get_ex_data(&dsa->ex_data, idx); -} - -DH *DSA_dup_DH(const DSA *dsa) { - if (dsa == NULL) { - return NULL; - } - - DH *ret = DH_new(); - if (ret == NULL) { - goto err; - } - if (dsa->q != NULL) { - ret->priv_length = BN_num_bits(dsa->q); - if ((ret->q = BN_dup(dsa->q)) == NULL) { - goto err; - } - } - if ((dsa->p != NULL && (ret->p = BN_dup(dsa->p)) == NULL) || - (dsa->g != NULL && (ret->g = BN_dup(dsa->g)) == NULL) || - (dsa->pub_key != NULL && (ret->pub_key = BN_dup(dsa->pub_key)) == NULL) || - (dsa->priv_key != NULL && - (ret->priv_key = BN_dup(dsa->priv_key)) == NULL)) { - goto err; - } - - return ret; - -err: - DH_free(ret); - return NULL; -} diff --git a/third_party/boringssl/src/crypto/dsa/dsa.cc 
b/third_party/boringssl/src/crypto/dsa/dsa.cc new file mode 100644 index 00000000..e346758b --- /dev/null +++ b/third_party/boringssl/src/crypto/dsa/dsa.cc @@ -0,0 +1,921 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../fipsmodule/bn/internal.h" +#include "../fipsmodule/dh/internal.h" +#include "../internal.h" +#include "../mem_internal.h" +#include "internal.h" + + +using namespace bssl; + +static_assert(OPENSSL_DSA_MAX_MODULUS_BITS <= + BN_MONTGOMERY_MAX_WORDS * BN_BITS2, + "Max DSA size too big for Montgomery arithmetic"); + +// Primality test according to FIPS PUB 186[-1], Appendix 2.1: 50 rounds of +// Miller-Rabin. 
+#define DSS_prime_checks 50 + +static int dsa_sign_setup(const DSAImpl *dsa, BN_CTX *ctx_in, BIGNUM **out_kinv, + BIGNUM **out_r); + +static ExDataClass g_ex_data_class; + +DSA *DSA_new() { return New(); } + +DSAImpl::DSAImpl() : RefCounted(CheckSubClass()) { + CRYPTO_new_ex_data(&ex_data); +} + +DSAImpl::~DSAImpl() { CRYPTO_free_ex_data(&g_ex_data_class, &ex_data); } + +void DSA_free(DSA *dsa) { + if (dsa == nullptr) { + return; + } + auto *impl = FromOpaque(dsa); + impl->DecRefInternal(); +} + +int DSA_up_ref(DSA *dsa) { + auto *impl = FromOpaque(dsa); + impl->UpRefInternal(); + return 1; +} + +unsigned DSA_bits(const DSA *dsa) { + return BN_num_bits(FromOpaque(dsa)->p.get()); +} + +const BIGNUM *DSA_get0_pub_key(const DSA *dsa) { + return FromOpaque(dsa)->pub_key.get(); +} + +const BIGNUM *DSA_get0_priv_key(const DSA *dsa) { + return FromOpaque(dsa)->priv_key.get(); +} + +const BIGNUM *DSA_get0_p(const DSA *dsa) { return FromOpaque(dsa)->p.get(); } + +const BIGNUM *DSA_get0_q(const DSA *dsa) { return FromOpaque(dsa)->q.get(); } + +const BIGNUM *DSA_get0_g(const DSA *dsa) { return FromOpaque(dsa)->g.get(); } + +void DSA_get0_key(const DSA *dsa, const BIGNUM **out_pub_key, + const BIGNUM **out_priv_key) { + auto *impl = FromOpaque(dsa); + if (out_pub_key != nullptr) { + *out_pub_key = impl->pub_key.get(); + } + if (out_priv_key != nullptr) { + *out_priv_key = impl->priv_key.get(); + } +} + +void DSA_get0_pqg(const DSA *dsa, const BIGNUM **out_p, const BIGNUM **out_q, + const BIGNUM **out_g) { + auto *impl = FromOpaque(dsa); + if (out_p != nullptr) { + *out_p = impl->p.get(); + } + if (out_q != nullptr) { + *out_q = impl->q.get(); + } + if (out_g != nullptr) { + *out_g = impl->g.get(); + } +} + +int DSA_set0_key(DSA *dsa, BIGNUM *pub_key, BIGNUM *priv_key) { + auto *impl = FromOpaque(dsa); + + if (impl->pub_key == nullptr && pub_key == nullptr) { + return 0; + } + + if (pub_key != nullptr) { + impl->pub_key.reset(pub_key); + } + if (priv_key != nullptr) { + 
impl->priv_key.reset(priv_key); + } + + return 1; +} + +int DSA_set0_pqg(DSA *dsa, BIGNUM *p, BIGNUM *q, BIGNUM *g) { + auto *impl = FromOpaque(dsa); + + if ((impl->p == nullptr && p == nullptr) || + (impl->q == nullptr && q == nullptr) || + (impl->g == nullptr && g == nullptr)) { + return 0; + } + + if (p != nullptr) { + impl->p.reset(p); + } + if (q != nullptr) { + impl->q.reset(q); + } + if (g != nullptr) { + impl->g.reset(g); + } + + impl->method_mont_p = nullptr; + impl->method_mont_q = nullptr; + return 1; +} + +int DSA_generate_parameters_ex(DSA *dsa, unsigned bits, const uint8_t *seed_in, + size_t seed_len, int *out_counter, + unsigned long *out_h, BN_GENCB *cb) { + auto *impl = FromOpaque(dsa); + + if (bits > OPENSSL_DSA_MAX_MODULUS_BITS) { + OPENSSL_PUT_ERROR(DSA, DSA_R_INVALID_PARAMETERS); + return 0; + } + + unsigned char seed[SHA256_DIGEST_LENGTH]; + unsigned char md[SHA256_DIGEST_LENGTH]; + unsigned char buf[SHA256_DIGEST_LENGTH], buf2[SHA256_DIGEST_LENGTH]; + BIGNUM *r0, *W, *X, *c, *test; + BIGNUM *g = nullptr, *q = nullptr, *p = nullptr; + int k, n = 0, m = 0; + int counter = 0; + int r = 0; + unsigned int h = 2; + const EVP_MD *evpmd; + + evpmd = (bits >= 2048) ? EVP_sha256() : EVP_sha1(); + size_t qsize = EVP_MD_size(evpmd); + + if (bits < 512) { + bits = 512; + } + + bits = (bits + 63) / 64 * 64; + + if (seed_in != nullptr) { + if (seed_len < qsize) { + return 0; + } + if (seed_len > qsize) { + // Only consume as much seed as is expected. 
+ seed_len = qsize; + } + OPENSSL_memcpy(seed, seed_in, seed_len); + } + + UniquePtr ctx(BN_CTX_new()); + if (ctx == nullptr) { + return 0; + } + BN_CTXScope scope(ctx.get()); + + r0 = BN_CTX_get(ctx.get()); + g = BN_CTX_get(ctx.get()); + W = BN_CTX_get(ctx.get()); + q = BN_CTX_get(ctx.get()); + X = BN_CTX_get(ctx.get()); + c = BN_CTX_get(ctx.get()); + p = BN_CTX_get(ctx.get()); + test = BN_CTX_get(ctx.get()); + + if (test == nullptr || !BN_lshift(test, BN_value_one(), bits - 1)) { + return 0; + } + + for (;;) { + // Find q. + for (;;) { + // step 1 + if (!BN_GENCB_call(cb, BN_GENCB_GENERATED, m++)) { + return 0; + } + + int use_random_seed = (seed_in == nullptr); + if (use_random_seed) { + if (!RAND_bytes(seed, qsize)) { + return 0; + } + // DSA parameters are public. + CONSTTIME_DECLASSIFY(seed, qsize); + } else { + // If we come back through, use random seed next time. + seed_in = nullptr; + } + OPENSSL_memcpy(buf, seed, qsize); + OPENSSL_memcpy(buf2, seed, qsize); + // precompute "SEED + 1" for step 7: + for (size_t i = qsize - 1; i < qsize; i--) { + buf[i]++; + if (buf[i] != 0) { + break; + } + } + + // step 2 + if (!EVP_Digest(seed, qsize, md, nullptr, evpmd, nullptr) || + !EVP_Digest(buf, qsize, buf2, nullptr, evpmd, nullptr)) { + return 0; + } + for (size_t i = 0; i < qsize; i++) { + md[i] ^= buf2[i]; + } + + // step 3 + md[0] |= 0x80; + md[qsize - 1] |= 0x01; + if (!BN_bin2bn(md, qsize, q)) { + return 0; + } + + // step 4 + r = BN_is_prime_fasttest_ex(q, DSS_prime_checks, ctx.get(), + use_random_seed, cb); + if (r > 0) { + break; + } + if (r != 0) { + return 0; + } + + // do a callback call + // step 5 + } + + if (!BN_GENCB_call(cb, 2, 0) || !BN_GENCB_call(cb, 3, 0)) { + return 0; + } + + // step 6 + counter = 0; + // "offset = 2" + + n = (bits - 1) / 160; + + for (;;) { + if ((counter != 0) && !BN_GENCB_call(cb, BN_GENCB_GENERATED, counter)) { + return 0; + } + + // step 7 + BN_zero(W); + // now 'buf' contains "SEED + offset - 1" + for (k = 0; k <= n; 
k++) { + // obtain "SEED + offset + k" by incrementing: + for (size_t i = qsize - 1; i < qsize; i--) { + buf[i]++; + if (buf[i] != 0) { + break; + } + } + + if (!EVP_Digest(buf, qsize, md, nullptr, evpmd, nullptr)) { + return 0; + } + + // step 8 + if (!BN_bin2bn(md, qsize, r0) || !BN_lshift(r0, r0, (qsize << 3) * k) || + !BN_add(W, W, r0)) { + return 0; + } + } + + // more of step 8 + if (!BN_mask_bits(W, bits - 1) || !BN_copy(X, W) || !BN_add(X, X, test)) { + return 0; + } + + // step 9 + if (!BN_lshift1(r0, q) || !BN_mod(c, X, r0, ctx.get()) || + !BN_sub(r0, c, BN_value_one()) || !BN_sub(p, X, r0)) { + return 0; + } + + // step 10 + if (BN_cmp(p, test) >= 0) { + // step 11 + r = BN_is_prime_fasttest_ex(p, DSS_prime_checks, ctx.get(), 1, cb); + if (r > 0) { + goto end; // found it + } + if (r != 0) { + return 0; + } + } + + // step 13 + counter++; + // "offset = offset + n + 1" + + // step 14 + if (counter >= 4096) { + break; + } + } + } +end: + if (!BN_GENCB_call(cb, 2, 1)) { + return 0; + } + + // We now need to generate g + // Set r0=(p-1)/q + if (!BN_sub(test, p, BN_value_one()) || + !BN_div(r0, nullptr, test, q, ctx.get())) { + return 0; + } + + UniquePtr mont(BN_MONT_CTX_new_for_modulus(p, ctx.get())); + if (mont == nullptr || !BN_set_word(test, h)) { + return 0; + } + + for (;;) { + // g=test^r0%p + if (!BN_mod_exp_mont(g, test, r0, p, ctx.get(), mont.get())) { + return 0; + } + if (!BN_is_one(g)) { + break; + } + if (!BN_add(test, test, BN_value_one())) { + return 0; + } + h++; + } + + if (!BN_GENCB_call(cb, 3, 1)) { + return 0; + } + + impl->p.reset(BN_dup(p)); + impl->q.reset(BN_dup(q)); + impl->g.reset(BN_dup(g)); + if (impl->p == nullptr || impl->q == nullptr || impl->g == nullptr) { + return 0; + } + if (out_counter != nullptr) { + *out_counter = counter; + } + if (out_h != nullptr) { + *out_h = h; + } + + return 1; +} + +DSA *DSAparams_dup(const DSA *dsa) { + auto *impl = FromOpaque(dsa); + DSAImpl *ret = FromOpaque(DSA_new()); + if (ret == nullptr) 
{ + return nullptr; + } + ret->p.reset(BN_dup(impl->p.get())); + ret->q.reset(BN_dup(impl->q.get())); + ret->g.reset(BN_dup(impl->g.get())); + if (ret->p == nullptr || ret->q == nullptr || ret->g == nullptr) { + DSA_free(ret); + return nullptr; + } + return ret; +} + +int DSA_generate_key(DSA *dsa) { + auto *impl = FromOpaque(dsa); + + if (!dsa_check_key(impl)) { + return 0; + } + + UniquePtr ctx(BN_CTX_new()); + UniquePtr pub_key(BN_new()), priv_key(BN_new()); + if (ctx == nullptr || pub_key == nullptr || priv_key == nullptr) { + return 0; + } + + if (!BN_rand_range_ex(priv_key.get(), 1, impl->q.get()) || + !BN_MONT_CTX_set_locked(&impl->method_mont_p, &impl->method_mont_lock, + impl->p.get(), ctx.get()) || + !BN_mod_exp_mont_consttime(pub_key.get(), impl->g.get(), priv_key.get(), + impl->p.get(), ctx.get(), + impl->method_mont_p.get())) { + return 0; + } + + // The public key is computed from the private key, but is public. + bn_declassify(pub_key.get()); + + impl->priv_key = std::move(priv_key); + impl->pub_key = std::move(pub_key); + return 1; +} + +DSA_SIG *DSA_SIG_new() { return New(); } + +void DSA_SIG_free(DSA_SIG *sig) { + if (!sig) { + return; + } + + BN_free(sig->r); + BN_free(sig->s); + Delete(sig); +} + +void DSA_SIG_get0(const DSA_SIG *sig, const BIGNUM **out_r, + const BIGNUM **out_s) { + if (out_r != nullptr) { + *out_r = sig->r; + } + if (out_s != nullptr) { + *out_s = sig->s; + } +} + +int DSA_SIG_set0(DSA_SIG *sig, BIGNUM *r, BIGNUM *s) { + if (r == nullptr || s == nullptr) { + return 0; + } + BN_free(sig->r); + BN_free(sig->s); + sig->r = r; + sig->s = s; + return 1; +} + +// mod_mul_consttime sets |r| to |a| * |b| modulo |mont->N|, treating |a| and +// |b| as secret. This function internally uses Montgomery reduction, but +// neither inputs nor outputs are in Montgomery form. 
+static int mod_mul_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, + const BN_MONT_CTX *mont, BN_CTX *ctx) { + BN_CTXScope scope(ctx); + BIGNUM *tmp = BN_CTX_get(ctx); + // |BN_mod_mul_montgomery| removes a factor of R, so we cancel it with a + // single |BN_to_montgomery| which adds one factor of R. + return tmp != nullptr && // + BN_to_montgomery(tmp, a, mont, ctx) && + BN_mod_mul_montgomery(r, tmp, b, mont, ctx); +} + +DSA_SIG *DSA_do_sign(const uint8_t *digest, size_t digest_len, const DSA *dsa) { + auto *impl = FromOpaque(dsa); + + if (!dsa_check_key(impl)) { + return nullptr; + } + + if (impl->priv_key == nullptr) { + OPENSSL_PUT_ERROR(DSA, DSA_R_MISSING_PARAMETERS); + return nullptr; + } + + BIGNUM *kinv = nullptr, *r = nullptr, *s = nullptr; + BIGNUM m; + BIGNUM xr; + BN_CTX *ctx = nullptr; + DSA_SIG *ret = nullptr; + + BN_init(&m); + BN_init(&xr); + s = BN_new(); + { + if (s == nullptr) { + goto err; + } + ctx = BN_CTX_new(); + if (ctx == nullptr) { + goto err; + } + + // Cap iterations so that invalid parameters do not infinite loop. This does + // not impact valid parameters because the probability of requiring even one + // retry is negligible, let alone 32. Unfortunately, DSA was mis-specified, + // so invalid parameters are reachable from most callers handling untrusted + // private keys. (The |dsa_check_key| call above is not sufficient. Checking + // whether arbitrary parameters form a valid DSA group is expensive.) + static const int kMaxIterations = 32; + int iters = 0; + redo: + if (!dsa_sign_setup(impl, ctx, &kinv, &r)) { + goto err; + } + + if (digest_len > BN_num_bytes(impl->q.get())) { + // If the digest length is greater than the size of |impl->q| use the + // BN_num_bits(impl->q) leftmost bits of the digest, see FIPS 186-3, 4.2. + // Note the above check that |impl->q| is a multiple of 8 bits. 
+ digest_len = BN_num_bytes(impl->q.get()); + } + + if (BN_bin2bn(digest, digest_len, &m) == nullptr) { + goto err; + } + + // |m| is bounded by 2^(num_bits(q)), which is slightly looser than q. This + // violates |bn_mod_add_consttime| and |mod_mul_consttime|'s preconditions. + // (The underlying algorithms could accept looser bounds, but we reduce for + // simplicity.) + size_t q_width = bn_minimal_width(impl->q.get()); + if (!bn_resize_words(&m, q_width) || !bn_resize_words(&xr, q_width)) { + goto err; + } + bn_reduce_once_in_place(m.d, 0 /* no carry word */, impl->q->d, + xr.d /* scratch space */, q_width); + + // Compute s = inv(k) (m + xr) mod q. Note |impl->method_mont_q| is + // initialized by |dsa_sign_setup|. + if (!mod_mul_consttime(&xr, impl->priv_key.get(), r, + impl->method_mont_q.get(), ctx) || + !bn_mod_add_consttime(s, &xr, &m, impl->q.get(), ctx) || + !mod_mul_consttime(s, s, kinv, impl->method_mont_q.get(), ctx)) { + goto err; + } + + // The signature is computed from the private key, but is public. + bn_declassify(r); + bn_declassify(s); + + // Redo if r or s is zero as required by FIPS 186-3: this is + // very unlikely. 
+ if (BN_is_zero(r) || BN_is_zero(s)) { + iters++; + if (iters > kMaxIterations) { + OPENSSL_PUT_ERROR(DSA, DSA_R_TOO_MANY_ITERATIONS); + goto err; + } + goto redo; + } + + ret = DSA_SIG_new(); + if (ret == nullptr) { + goto err; + } + ret->r = r; + ret->s = s; + } + +err: + if (ret == nullptr) { + OPENSSL_PUT_ERROR(DSA, ERR_R_BN_LIB); + BN_free(r); + BN_free(s); + } + BN_CTX_free(ctx); + BN_clear_free(&m); + BN_clear_free(&xr); + BN_clear_free(kinv); + + return ret; +} + +int DSA_do_verify(const uint8_t *digest, size_t digest_len, const DSA_SIG *sig, + const DSA *dsa) { + int valid; + if (!DSA_do_check_signature(&valid, digest, digest_len, sig, dsa)) { + return -1; + } + return valid; +} + +int DSA_do_check_signature(int *out_valid, const uint8_t *digest, + size_t digest_len, const DSA_SIG *sig, + const DSA *dsa) { + auto *impl = FromOpaque(dsa); + + *out_valid = 0; + if (!dsa_check_key(impl)) { + return 0; + } + + if (impl->pub_key == nullptr) { + OPENSSL_PUT_ERROR(DSA, DSA_R_MISSING_PARAMETERS); + return 0; + } + + int ret = 0; + BIGNUM u1, u2, t1; + BN_init(&u1); + BN_init(&u2); + BN_init(&t1); + BN_CTX *ctx = BN_CTX_new(); + { + if (ctx == nullptr) { + goto err; + } + + if (BN_is_zero(sig->r) || BN_is_negative(sig->r) || + BN_ucmp(sig->r, impl->q.get()) >= 0) { + ret = 1; + goto err; + } + if (BN_is_zero(sig->s) || BN_is_negative(sig->s) || + BN_ucmp(sig->s, impl->q.get()) >= 0) { + ret = 1; + goto err; + } + + if (!BN_MONT_CTX_set_locked(&impl->method_mont_p, &impl->method_mont_lock, + impl->p.get(), ctx) || + !BN_MONT_CTX_set_locked(&impl->method_mont_q, &impl->method_mont_lock, + impl->q.get(), ctx)) { + goto err; + } + + // Calculate W = inv(S) mod Q, in the Montgomery domain. This is slightly + // more efficiently computed as FromMont(s)^-1 = (s * R^-1)^-1 = s^-1 * R, + // instead of ToMont(s^-1) = s^-1 * R. 
+ if (!BN_from_montgomery(&u2, sig->s, impl->method_mont_q.get(), ctx) || + !BN_mod_inverse(&u2, &u2, impl->q.get(), ctx)) { + goto err; + } + + // save M in u1 + unsigned q_bits = BN_num_bits(impl->q.get()); + if (digest_len > (q_bits >> 3)) { + // if the digest length is greater than the size of q use the + // BN_num_bits(impl->q) leftmost bits of the digest, see + // fips 186-3, 4.2 + digest_len = (q_bits >> 3); + } + + if (BN_bin2bn(digest, digest_len, &u1) == nullptr) { + goto err; + } + + // u1 = M * w mod q. w was stored in the Montgomery domain while M was not, + // so the result will already be out of the Montgomery domain. + if (!BN_mod_mul_montgomery(&u1, &u1, &u2, impl->method_mont_q.get(), ctx)) { + goto err; + } + + // u2 = r * w mod q. w was stored in the Montgomery domain while r was not, + // so the result will already be out of the Montgomery domain. + if (!BN_mod_mul_montgomery(&u2, sig->r, &u2, impl->method_mont_q.get(), + ctx)) { + goto err; + } + + if (!BN_mod_exp2_mont(&t1, impl->g.get(), &u1, impl->pub_key.get(), &u2, + impl->p.get(), ctx, impl->method_mont_p.get())) { + goto err; + } + + // let u1 = u1 mod q + if (!BN_mod(&u1, &t1, impl->q.get(), ctx)) { + goto err; + } + + // V is now in u1. If the signature is correct, it will be + // equal to R. 
+ *out_valid = BN_ucmp(&u1, sig->r) == 0; + ret = 1; + } + +err: + if (ret != 1) { + OPENSSL_PUT_ERROR(DSA, ERR_R_BN_LIB); + } + BN_CTX_free(ctx); + BN_free(&u1); + BN_free(&u2); + BN_free(&t1); + + return ret; +} + +int DSA_sign(int type, const uint8_t *digest, size_t digest_len, + uint8_t *out_sig, unsigned int *out_siglen, const DSA *dsa) { + DSA_SIG *s = DSA_do_sign(digest, digest_len, dsa); + if (s == nullptr) { + *out_siglen = 0; + return 0; + } + + *out_siglen = i2d_DSA_SIG(s, &out_sig); + DSA_SIG_free(s); + return 1; +} + +int DSA_verify(int type, const uint8_t *digest, size_t digest_len, + const uint8_t *sig, size_t sig_len, const DSA *dsa) { + int valid; + if (!DSA_check_signature(&valid, digest, digest_len, sig, sig_len, dsa)) { + return -1; + } + return valid; +} + +int DSA_check_signature(int *out_valid, const uint8_t *digest, + size_t digest_len, const uint8_t *sig, size_t sig_len, + const DSA *dsa) { + DSA_SIG *s = nullptr; + int ret = 0; + uint8_t *der = nullptr; + + s = DSA_SIG_new(); + { + if (s == nullptr) { + goto err; + } + + const uint8_t *sigp = sig; + if (d2i_DSA_SIG(&s, &sigp, sig_len) == nullptr || sigp != sig + sig_len) { + goto err; + } + + // Ensure that the signature uses DER and doesn't have trailing garbage. + int der_len = i2d_DSA_SIG(s, &der); + if (der_len < 0 || (size_t)der_len != sig_len || + OPENSSL_memcmp(sig, der, sig_len)) { + goto err; + } + + ret = DSA_do_check_signature(out_valid, digest, digest_len, s, dsa); + } + +err: + OPENSSL_free(der); + DSA_SIG_free(s); + return ret; +} + +// der_len_len returns the number of bytes needed to represent a length of |len| +// in DER. 
+static size_t der_len_len(size_t len) { + if (len < 0x80) { + return 1; + } + size_t ret = 1; + while (len > 0) { + ret++; + len >>= 8; + } + return ret; +} + +int DSA_size(const DSA *dsa) { + auto *impl = FromOpaque(dsa); + + if (impl->q == nullptr) { + return 0; + } + + size_t order_len = BN_num_bytes(impl->q.get()); + // Compute the maximum length of an |order_len| byte integer. Defensively + // assume that the leading 0x00 is included. + size_t integer_len = 1 /* tag */ + der_len_len(order_len + 1) + 1 + order_len; + if (integer_len < order_len) { + return 0; + } + // A DSA signature is two INTEGERs. + size_t value_len = 2 * integer_len; + if (value_len < integer_len) { + return 0; + } + // Add the header. + size_t ret = 1 /* tag */ + der_len_len(value_len) + value_len; + if (ret < value_len) { + return 0; + } + return ret; +} + +static int dsa_sign_setup(const DSAImpl *dsa, BN_CTX *ctx, BIGNUM **out_kinv, + BIGNUM **out_r) { + int ret = 0; + BIGNUM k; + BN_init(&k); + BIGNUM *r = BN_new(); + BIGNUM *kinv = BN_new(); + if (r == nullptr || kinv == nullptr || + // Get random k + !BN_rand_range_ex(&k, 1, dsa->q.get()) || + !BN_MONT_CTX_set_locked(&dsa->method_mont_p, &dsa->method_mont_lock, + dsa->p.get(), ctx) || + !BN_MONT_CTX_set_locked(&dsa->method_mont_q, &dsa->method_mont_lock, + dsa->q.get(), ctx) || + // Compute r = (g^k mod p) mod q + !BN_mod_exp_mont_consttime(r, dsa->g.get(), &k, dsa->p.get(), ctx, + dsa->method_mont_p.get())) { + OPENSSL_PUT_ERROR(DSA, ERR_R_BN_LIB); + goto err; + } + // Note |BN_mod| below is not constant-time and may leak information about + // |r|. |dsa->p| may be significantly larger than |dsa->q|, so this is not + // easily performed in constant-time with Montgomery reduction. + // + // However, |r| at this point is g^k (mod p). It is almost the value of |r| + // revealed in the signature anyway (g^k (mod p) (mod q)), going from it to + // |k| would require computing a discrete log. 
+ bn_declassify(r); + if (!BN_mod(r, r, dsa->q.get(), ctx) || + // Compute part of 's = inv(k) (m + xr) mod q' using Fermat's Little + // Theorem. + !bn_mod_inverse_prime(kinv, &k, dsa->q.get(), ctx, dsa->method_mont_q.get())) { + OPENSSL_PUT_ERROR(DSA, ERR_R_BN_LIB); + goto err; + } + + BN_clear_free(*out_kinv); + *out_kinv = kinv; + kinv = nullptr; + + BN_clear_free(*out_r); + *out_r = r; + r = nullptr; + + ret = 1; + +err: + BN_clear_free(&k); + BN_clear_free(r); + BN_clear_free(kinv); + return ret; +} + +int DSA_get_ex_new_index(long argl, void *argp, CRYPTO_EX_unused *unused, + CRYPTO_EX_dup *dup_unused, CRYPTO_EX_free *free_func) { + return CRYPTO_get_ex_new_index_ex(&g_ex_data_class, argl, argp, free_func); +} + +int DSA_set_ex_data(DSA *dsa, int idx, void *arg) { + auto *impl = FromOpaque(dsa); + return CRYPTO_set_ex_data(&impl->ex_data, idx, arg); +} + +void *DSA_get_ex_data(const DSA *dsa, int idx) { + auto *impl = FromOpaque(dsa); + return CRYPTO_get_ex_data(&impl->ex_data, idx); +} + +static bool copy_bn(UniquePtr *dst, const BIGNUM *src) { + UniquePtr copy; + if (src) { + copy.reset(BN_dup(src)); + if (!copy) { + return false; + } + } + *dst = std::move(copy); + return true; +} + +DH *DSA_dup_DH(const DSA *dsa) { + auto *impl = FromOpaque(dsa); + if (dsa == nullptr) { + return nullptr; + } + + UniquePtr ret(DH_new()); + auto *dh = FromOpaque(ret.get()); + if (ret == nullptr) { + return nullptr; + } + if (impl->q != nullptr) { + dh->priv_length = BN_num_bits(impl->q.get()); + if (!copy_bn(&dh->q, impl->q.get())) { + return nullptr; + } + } + if (!copy_bn(&dh->p, impl->p.get()) || // + !copy_bn(&dh->g, impl->g.get()) || // + !copy_bn(&dh->pub_key, impl->pub_key.get()) || // + !copy_bn(&dh->priv_key, impl->priv_key.get())) { + return nullptr; + } + + return ret.release(); +} diff --git a/third_party/boringssl/src/crypto/dsa/dsa_asn1.c b/third_party/boringssl/src/crypto/dsa/dsa_asn1.c deleted file mode 100644 index 3f3bd488..00000000 --- 
a/third_party/boringssl/src/crypto/dsa/dsa_asn1.c +++ /dev/null @@ -1,390 +0,0 @@ -/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL - * project 2000. */ -/* ==================================================================== - * Copyright (c) 2000-2005 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * licensing@OpenSSL.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). */ - -#include - -#include - -#include -#include -#include -#include - -#include "internal.h" -#include "../bytestring/internal.h" - - -#define OPENSSL_DSA_MAX_MODULUS_BITS 10000 - -// This function is in dsa_asn1.c rather than dsa.c because it is reachable from -// |EVP_PKEY| parsers. This makes it easier for the static linker to drop most -// of the DSA implementation. -int dsa_check_parameters(const DSA *dsa) { - if (!dsa->p || !dsa->q || !dsa->g) { - OPENSSL_PUT_ERROR(DSA, DSA_R_MISSING_PARAMETERS); - return 0; - } - - // Reject invalid parameters. In particular, signing will infinite loop if |g| - // is zero. - if (BN_is_zero(dsa->p) || BN_is_zero(dsa->q) || BN_is_zero(dsa->g)) { - OPENSSL_PUT_ERROR(DSA, DSA_R_INVALID_PARAMETERS); - return 0; - } - - // FIPS 186-4 allows only three different sizes for q. - unsigned q_bits = BN_num_bits(dsa->q); - if (q_bits != 160 && q_bits != 224 && q_bits != 256) { - OPENSSL_PUT_ERROR(DSA, DSA_R_BAD_Q_VALUE); - return 0; - } - - // Bound |dsa->p| to avoid a DoS vector. Note this limit is much larger than - // the one in FIPS 186-4, which only allows L = 1024, 2048, and 3072. 
- if (BN_num_bits(dsa->p) > OPENSSL_DSA_MAX_MODULUS_BITS) { - OPENSSL_PUT_ERROR(DSA, DSA_R_MODULUS_TOO_LARGE); - return 0; - } - - return 1; -} - -static int parse_integer(CBS *cbs, BIGNUM **out) { - assert(*out == NULL); - *out = BN_new(); - if (*out == NULL) { - return 0; - } - return BN_parse_asn1_unsigned(cbs, *out); -} - -static int marshal_integer(CBB *cbb, BIGNUM *bn) { - if (bn == NULL) { - // A DSA object may be missing some components. - OPENSSL_PUT_ERROR(DSA, ERR_R_PASSED_NULL_PARAMETER); - return 0; - } - return BN_marshal_asn1(cbb, bn); -} - -DSA_SIG *DSA_SIG_parse(CBS *cbs) { - DSA_SIG *ret = DSA_SIG_new(); - if (ret == NULL) { - return NULL; - } - CBS child; - if (!CBS_get_asn1(cbs, &child, CBS_ASN1_SEQUENCE) || - !parse_integer(&child, &ret->r) || - !parse_integer(&child, &ret->s) || - CBS_len(&child) != 0) { - OPENSSL_PUT_ERROR(DSA, DSA_R_DECODE_ERROR); - DSA_SIG_free(ret); - return NULL; - } - return ret; -} - -int DSA_SIG_marshal(CBB *cbb, const DSA_SIG *sig) { - CBB child; - if (!CBB_add_asn1(cbb, &child, CBS_ASN1_SEQUENCE) || - !marshal_integer(&child, sig->r) || - !marshal_integer(&child, sig->s) || - !CBB_flush(cbb)) { - OPENSSL_PUT_ERROR(DSA, DSA_R_ENCODE_ERROR); - return 0; - } - return 1; -} - -DSA *DSA_parse_public_key(CBS *cbs) { - DSA *ret = DSA_new(); - if (ret == NULL) { - return NULL; - } - CBS child; - if (!CBS_get_asn1(cbs, &child, CBS_ASN1_SEQUENCE) || - !parse_integer(&child, &ret->pub_key) || - !parse_integer(&child, &ret->p) || - !parse_integer(&child, &ret->q) || - !parse_integer(&child, &ret->g) || - CBS_len(&child) != 0) { - OPENSSL_PUT_ERROR(DSA, DSA_R_DECODE_ERROR); - goto err; - } - if (!dsa_check_parameters(ret)) { - goto err; - } - return ret; - -err: - DSA_free(ret); - return NULL; -} - -int DSA_marshal_public_key(CBB *cbb, const DSA *dsa) { - CBB child; - if (!CBB_add_asn1(cbb, &child, CBS_ASN1_SEQUENCE) || - !marshal_integer(&child, dsa->pub_key) || - !marshal_integer(&child, dsa->p) || - !marshal_integer(&child, 
dsa->q) || - !marshal_integer(&child, dsa->g) || - !CBB_flush(cbb)) { - OPENSSL_PUT_ERROR(DSA, DSA_R_ENCODE_ERROR); - return 0; - } - return 1; -} - -DSA *DSA_parse_parameters(CBS *cbs) { - DSA *ret = DSA_new(); - if (ret == NULL) { - return NULL; - } - CBS child; - if (!CBS_get_asn1(cbs, &child, CBS_ASN1_SEQUENCE) || - !parse_integer(&child, &ret->p) || - !parse_integer(&child, &ret->q) || - !parse_integer(&child, &ret->g) || - CBS_len(&child) != 0) { - OPENSSL_PUT_ERROR(DSA, DSA_R_DECODE_ERROR); - goto err; - } - if (!dsa_check_parameters(ret)) { - goto err; - } - return ret; - -err: - DSA_free(ret); - return NULL; -} - -int DSA_marshal_parameters(CBB *cbb, const DSA *dsa) { - CBB child; - if (!CBB_add_asn1(cbb, &child, CBS_ASN1_SEQUENCE) || - !marshal_integer(&child, dsa->p) || - !marshal_integer(&child, dsa->q) || - !marshal_integer(&child, dsa->g) || - !CBB_flush(cbb)) { - OPENSSL_PUT_ERROR(DSA, DSA_R_ENCODE_ERROR); - return 0; - } - return 1; -} - -DSA *DSA_parse_private_key(CBS *cbs) { - DSA *ret = DSA_new(); - if (ret == NULL) { - return NULL; - } - - CBS child; - uint64_t version; - if (!CBS_get_asn1(cbs, &child, CBS_ASN1_SEQUENCE) || - !CBS_get_asn1_uint64(&child, &version)) { - OPENSSL_PUT_ERROR(DSA, DSA_R_DECODE_ERROR); - goto err; - } - - if (version != 0) { - OPENSSL_PUT_ERROR(DSA, DSA_R_BAD_VERSION); - goto err; - } - - if (!parse_integer(&child, &ret->p) || - !parse_integer(&child, &ret->q) || - !parse_integer(&child, &ret->g) || - !parse_integer(&child, &ret->pub_key) || - !parse_integer(&child, &ret->priv_key) || - CBS_len(&child) != 0) { - OPENSSL_PUT_ERROR(DSA, DSA_R_DECODE_ERROR); - goto err; - } - if (!dsa_check_parameters(ret)) { - goto err; - } - return ret; - -err: - DSA_free(ret); - return NULL; -} - -int DSA_marshal_private_key(CBB *cbb, const DSA *dsa) { - CBB child; - if (!CBB_add_asn1(cbb, &child, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1_uint64(&child, 0 /* version */) || - !marshal_integer(&child, dsa->p) || - !marshal_integer(&child, 
dsa->q) || - !marshal_integer(&child, dsa->g) || - !marshal_integer(&child, dsa->pub_key) || - !marshal_integer(&child, dsa->priv_key) || - !CBB_flush(cbb)) { - OPENSSL_PUT_ERROR(DSA, DSA_R_ENCODE_ERROR); - return 0; - } - return 1; -} - -DSA_SIG *d2i_DSA_SIG(DSA_SIG **out_sig, const uint8_t **inp, long len) { - if (len < 0) { - return NULL; - } - CBS cbs; - CBS_init(&cbs, *inp, (size_t)len); - DSA_SIG *ret = DSA_SIG_parse(&cbs); - if (ret == NULL) { - return NULL; - } - if (out_sig != NULL) { - DSA_SIG_free(*out_sig); - *out_sig = ret; - } - *inp = CBS_data(&cbs); - return ret; -} - -int i2d_DSA_SIG(const DSA_SIG *in, uint8_t **outp) { - CBB cbb; - if (!CBB_init(&cbb, 0) || - !DSA_SIG_marshal(&cbb, in)) { - CBB_cleanup(&cbb); - return -1; - } - return CBB_finish_i2d(&cbb, outp); -} - -DSA *d2i_DSAPublicKey(DSA **out, const uint8_t **inp, long len) { - if (len < 0) { - return NULL; - } - CBS cbs; - CBS_init(&cbs, *inp, (size_t)len); - DSA *ret = DSA_parse_public_key(&cbs); - if (ret == NULL) { - return NULL; - } - if (out != NULL) { - DSA_free(*out); - *out = ret; - } - *inp = CBS_data(&cbs); - return ret; -} - -int i2d_DSAPublicKey(const DSA *in, uint8_t **outp) { - CBB cbb; - if (!CBB_init(&cbb, 0) || - !DSA_marshal_public_key(&cbb, in)) { - CBB_cleanup(&cbb); - return -1; - } - return CBB_finish_i2d(&cbb, outp); -} - -DSA *d2i_DSAPrivateKey(DSA **out, const uint8_t **inp, long len) { - if (len < 0) { - return NULL; - } - CBS cbs; - CBS_init(&cbs, *inp, (size_t)len); - DSA *ret = DSA_parse_private_key(&cbs); - if (ret == NULL) { - return NULL; - } - if (out != NULL) { - DSA_free(*out); - *out = ret; - } - *inp = CBS_data(&cbs); - return ret; -} - -int i2d_DSAPrivateKey(const DSA *in, uint8_t **outp) { - CBB cbb; - if (!CBB_init(&cbb, 0) || - !DSA_marshal_private_key(&cbb, in)) { - CBB_cleanup(&cbb); - return -1; - } - return CBB_finish_i2d(&cbb, outp); -} - -DSA *d2i_DSAparams(DSA **out, const uint8_t **inp, long len) { - if (len < 0) { - return NULL; - } - CBS 
cbs; - CBS_init(&cbs, *inp, (size_t)len); - DSA *ret = DSA_parse_parameters(&cbs); - if (ret == NULL) { - return NULL; - } - if (out != NULL) { - DSA_free(*out); - *out = ret; - } - *inp = CBS_data(&cbs); - return ret; -} - -int i2d_DSAparams(const DSA *in, uint8_t **outp) { - CBB cbb; - if (!CBB_init(&cbb, 0) || - !DSA_marshal_parameters(&cbb, in)) { - CBB_cleanup(&cbb); - return -1; - } - return CBB_finish_i2d(&cbb, outp); -} diff --git a/third_party/boringssl/src/crypto/dsa/dsa_asn1.cc b/third_party/boringssl/src/crypto/dsa/dsa_asn1.cc new file mode 100644 index 00000000..a1338681 --- /dev/null +++ b/third_party/boringssl/src/crypto/dsa/dsa_asn1.cc @@ -0,0 +1,306 @@ +// Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include +#include +#include +#include + +#include "../bytestring/internal.h" +#include "../mem_internal.h" +#include "internal.h" + + +using namespace bssl; + +// This function is in dsa_asn1.c rather than dsa.c because it is reachable from +// |EVP_PKEY| parsers. This makes it easier for the static linker to drop most +// of the DSA implementation. 
+int bssl::dsa_check_key(const DSAImpl *dsa) { + if (!dsa->p || !dsa->q || !dsa->g) { + OPENSSL_PUT_ERROR(DSA, DSA_R_MISSING_PARAMETERS); + return 0; + } + + // Fully checking for invalid DSA groups is expensive, so security and + // correctness of the signature scheme depend on how |dsa| was computed. I.e. + // we leave "assurance of domain parameter validity" from FIPS 186-4 to the + // caller. However, we check bounds on all values to avoid DoS vectors even + // when domain parameters are invalid. In particular, signing will infinite + // loop if |g| is zero. + if (BN_is_negative(dsa->p.get()) || BN_is_negative(dsa->q.get()) || + BN_is_zero(dsa->p.get()) || BN_is_zero(dsa->q.get()) || + !BN_is_odd(dsa->p.get()) || !BN_is_odd(dsa->q.get()) || + // |q| must be a prime divisor of |p - 1|, which implies |q < p|. + BN_cmp(dsa->q.get(), dsa->p.get()) >= 0 || + // |g| is in the multiplicative group of |p|. + BN_is_negative(dsa->g.get()) || BN_is_zero(dsa->g.get()) || + BN_cmp(dsa->g.get(), dsa->p.get()) >= 0) { + OPENSSL_PUT_ERROR(DSA, DSA_R_INVALID_PARAMETERS); + return 0; + } + + // FIPS 186-4 allows only three different sizes for q. + unsigned q_bits = BN_num_bits(dsa->q.get()); + if (q_bits != 160 && q_bits != 224 && q_bits != 256) { + OPENSSL_PUT_ERROR(DSA, DSA_R_BAD_Q_VALUE); + return 0; + } + + // Bound |dsa->p| to avoid a DoS vector. Note this limit is much larger than + // the one in FIPS 186-4, which only allows L = 1024, 2048, and 3072. + if (BN_num_bits(dsa->p.get()) > OPENSSL_DSA_MAX_MODULUS_BITS) { + OPENSSL_PUT_ERROR(DSA, DSA_R_MODULUS_TOO_LARGE); + return 0; + } + + if (dsa->pub_key != nullptr) { + // The public key is also in the multiplicative group of |p|. 
+ if (BN_is_negative(dsa->pub_key.get()) || BN_is_zero(dsa->pub_key.get()) || + BN_cmp(dsa->pub_key.get(), dsa->p.get()) >= 0) { + OPENSSL_PUT_ERROR(DSA, DSA_R_INVALID_PARAMETERS); + return 0; + } + } + + if (dsa->priv_key != nullptr) { + // The private key is a non-zero element of the scalar field, determined by + // |q|. + if (BN_is_negative(dsa->priv_key.get()) || + constant_time_declassify_int(BN_is_zero(dsa->priv_key.get())) || + constant_time_declassify_int( + BN_cmp(dsa->priv_key.get(), dsa->q.get()) >= 0)) { + OPENSSL_PUT_ERROR(DSA, DSA_R_INVALID_PARAMETERS); + return 0; + } + } + + return 1; +} + +static int parse_integer(CBS *cbs, UniquePtr *out) { + assert(*out == nullptr); + out->reset(BN_new()); + if (*out == nullptr) { + return 0; + } + return BN_parse_asn1_unsigned(cbs, out->get()); +} + +static int marshal_integer(CBB *cbb, BIGNUM *bn) { + if (bn == nullptr) { + // A DSA object may be missing some components. + OPENSSL_PUT_ERROR(DSA, ERR_R_PASSED_NULL_PARAMETER); + return 0; + } + return BN_marshal_asn1(cbb, bn); +} + +DSA_SIG *DSA_SIG_parse(CBS *cbs) { + DSA_SIG *ret = DSA_SIG_new(); + if (ret == nullptr) { + return nullptr; + } + CBS child; + UniquePtr r, s; + if (!CBS_get_asn1(cbs, &child, CBS_ASN1_SEQUENCE) || + !parse_integer(&child, &r) || + !parse_integer(&child, &s) || + CBS_len(&child) != 0) { + OPENSSL_PUT_ERROR(DSA, DSA_R_DECODE_ERROR); + DSA_SIG_free(ret); + return nullptr; + } + ret->r = r.release(); + ret->s = s.release(); + return ret; +} + +int DSA_SIG_marshal(CBB *cbb, const DSA_SIG *sig) { + CBB child; + if (!CBB_add_asn1(cbb, &child, CBS_ASN1_SEQUENCE) || + !marshal_integer(&child, sig->r) || + !marshal_integer(&child, sig->s) || + !CBB_flush(cbb)) { + OPENSSL_PUT_ERROR(DSA, DSA_R_ENCODE_ERROR); + return 0; + } + return 1; +} + +DSA *DSA_parse_public_key(CBS *cbs) { + UniquePtr ret(FromOpaque(DSA_new())); + if (ret == nullptr) { + return nullptr; + } + CBS child; + if (!CBS_get_asn1(cbs, &child, CBS_ASN1_SEQUENCE) || + 
!parse_integer(&child, &ret->pub_key) || + !parse_integer(&child, &ret->p) || + !parse_integer(&child, &ret->q) || + !parse_integer(&child, &ret->g) || + CBS_len(&child) != 0) { + OPENSSL_PUT_ERROR(DSA, DSA_R_DECODE_ERROR); + return nullptr; + } + if (!dsa_check_key(ret.get())) { + return nullptr; + } + return ret.release(); +} + +int DSA_marshal_public_key(CBB *cbb, const DSA *dsa) { + const auto *impl = FromOpaque(dsa); + + CBB child; + if (!CBB_add_asn1(cbb, &child, CBS_ASN1_SEQUENCE) || + !marshal_integer(&child, impl->pub_key.get()) || + !marshal_integer(&child, impl->p.get()) || + !marshal_integer(&child, impl->q.get()) || + !marshal_integer(&child, impl->g.get()) || !CBB_flush(cbb)) { + OPENSSL_PUT_ERROR(DSA, DSA_R_ENCODE_ERROR); + return 0; + } + return 1; +} + +DSA *DSA_parse_parameters(CBS *cbs) { + UniquePtr ret(FromOpaque(DSA_new())); + if (ret == nullptr) { + return nullptr; + } + CBS child; + if (!CBS_get_asn1(cbs, &child, CBS_ASN1_SEQUENCE) || + !parse_integer(&child, &ret->p) || + !parse_integer(&child, &ret->q) || + !parse_integer(&child, &ret->g) || + CBS_len(&child) != 0) { + OPENSSL_PUT_ERROR(DSA, DSA_R_DECODE_ERROR); + return nullptr; + } + if (!dsa_check_key(ret.get())) { + return nullptr; + } + return ret.release(); +} + +int DSA_marshal_parameters(CBB *cbb, const DSA *dsa) { + const auto *impl = FromOpaque(dsa); + + CBB child; + if (!CBB_add_asn1(cbb, &child, CBS_ASN1_SEQUENCE) || + !marshal_integer(&child, impl->p.get()) || + !marshal_integer(&child, impl->q.get()) || + !marshal_integer(&child, impl->g.get()) || !CBB_flush(cbb)) { + OPENSSL_PUT_ERROR(DSA, DSA_R_ENCODE_ERROR); + return 0; + } + return 1; +} + +DSA *DSA_parse_private_key(CBS *cbs) { + UniquePtr ret(FromOpaque(DSA_new())); + if (ret == nullptr) { + return nullptr; + } + + CBS child; + uint64_t version; + if (!CBS_get_asn1(cbs, &child, CBS_ASN1_SEQUENCE) || + !CBS_get_asn1_uint64(&child, &version)) { + OPENSSL_PUT_ERROR(DSA, DSA_R_DECODE_ERROR); + return nullptr; + } + + if 
(version != 0) { + OPENSSL_PUT_ERROR(DSA, DSA_R_BAD_VERSION); + return nullptr; + } + + if (!parse_integer(&child, &ret->p) || + !parse_integer(&child, &ret->q) || + !parse_integer(&child, &ret->g) || + !parse_integer(&child, &ret->pub_key) || + !parse_integer(&child, &ret->priv_key) || + CBS_len(&child) != 0) { + OPENSSL_PUT_ERROR(DSA, DSA_R_DECODE_ERROR); + return nullptr; + } + if (!dsa_check_key(ret.get())) { + return nullptr; + } + + return ret.release(); +} + +int DSA_marshal_private_key(CBB *cbb, const DSA *dsa) { + const auto *impl = FromOpaque(dsa); + + CBB child; + if (!CBB_add_asn1(cbb, &child, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_uint64(&child, 0 /* version */) || + !marshal_integer(&child, impl->p.get()) || + !marshal_integer(&child, impl->q.get()) || + !marshal_integer(&child, impl->g.get()) || + !marshal_integer(&child, impl->pub_key.get()) || + !marshal_integer(&child, impl->priv_key.get()) || !CBB_flush(cbb)) { + OPENSSL_PUT_ERROR(DSA, DSA_R_ENCODE_ERROR); + return 0; + } + return 1; +} + +DSA_SIG *d2i_DSA_SIG(DSA_SIG **out_sig, const uint8_t **inp, long len) { + return D2IFromCBS(out_sig, inp, len, DSA_SIG_parse); +} + +int i2d_DSA_SIG(const DSA_SIG *in, uint8_t **outp) { + return I2DFromCBB( + /*initial_capacity=*/256, outp, + [&](CBB *cbb) -> bool { return DSA_SIG_marshal(cbb, in); }); +} + +DSA *d2i_DSAPublicKey(DSA **out, const uint8_t **inp, long len) { + return D2IFromCBS(out, inp, len, DSA_parse_public_key); +} + +int i2d_DSAPublicKey(const DSA *in, uint8_t **outp) { + return I2DFromCBB( + /*initial_capacity=*/256, outp, + [&](CBB *cbb) -> bool { return DSA_marshal_public_key(cbb, in); }); +} + +DSA *d2i_DSAPrivateKey(DSA **out, const uint8_t **inp, long len) { + return D2IFromCBS(out, inp, len, DSA_parse_private_key); +} + +int i2d_DSAPrivateKey(const DSA *in, uint8_t **outp) { + return I2DFromCBB( + /*initial_capacity=*/256, outp, + [&](CBB *cbb) -> bool { return DSA_marshal_private_key(cbb, in); }); +} + +DSA *d2i_DSAparams(DSA **out, 
const uint8_t **inp, long len) { + return D2IFromCBS(out, inp, len, DSA_parse_parameters); +} + +int i2d_DSAparams(const DSA *in, uint8_t **outp) { + return I2DFromCBB( + /*initial_capacity=*/256, outp, + [&](CBB *cbb) -> bool { return DSA_marshal_parameters(cbb, in); }); +} diff --git a/third_party/boringssl/src/crypto/dsa/internal.h b/third_party/boringssl/src/crypto/dsa/internal.h index 2d86edb2..70f77e76 100644 --- a/third_party/boringssl/src/crypto/dsa/internal.h +++ b/third_party/boringssl/src/crypto/dsa/internal.h @@ -1,34 +1,56 @@ -/* Copyright (c) 2020, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#ifndef OPENSSL_HEADER_DSA_INTERNAL_H -#define OPENSSL_HEADER_DSA_INTERNAL_H +// Copyright 2020 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_CRYPTO_DSA_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_DSA_INTERNAL_H #include -#if defined(__cplusplus) -extern "C" { -#endif +#include "../internal.h" +#include "../mem_internal.h" -// dsa_check_parameters checks that |dsa|'s group is within DoS bounds. It -// returns one on success and zero on error. -int dsa_check_parameters(const DSA *dsa); +DECLARE_OPAQUE_STRUCT(dsa_st, DSAImpl) +BSSL_NAMESPACE_BEGIN -#if defined(__cplusplus) -} // extern C -#endif +class DSAImpl : public dsa_st, public RefCounted { + public: + DSAImpl(); -#endif // OPENSSL_HEADER_DSA_INTERNAL_H + UniquePtr p; + UniquePtr q; + UniquePtr g; + + UniquePtr pub_key; + UniquePtr priv_key; + + // Normally used to cache montgomery values + mutable Mutex method_mont_lock; + mutable UniquePtr method_mont_p; + mutable UniquePtr method_mont_q; + CRYPTO_EX_DATA ex_data; + + private: + friend RefCounted; + ~DSAImpl(); +}; + +// dsa_check_key performs cheap self-checks on |dsa|, and ensures it is within +// DoS bounds. It returns one on success and zero on error. +int dsa_check_key(const DSAImpl *dsa); + +BSSL_NAMESPACE_END + +#endif // OPENSSL_HEADER_CRYPTO_DSA_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/ec/ec_asn1.cc b/third_party/boringssl/src/crypto/ec/ec_asn1.cc new file mode 100644 index 00000000..a34b4107 --- /dev/null +++ b/third_party/boringssl/src/crypto/ec/ec_asn1.cc @@ -0,0 +1,542 @@ +// Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "../bytestring/internal.h" +#include "../fipsmodule/ec/internal.h" +#include "../internal.h" +#include "../mem_internal.h" +#include "internal.h" + + +using namespace bssl; + +static const CBS_ASN1_TAG kParametersTag = + CBS_ASN1_CONSTRUCTED | CBS_ASN1_CONTEXT_SPECIFIC | 0; +static const CBS_ASN1_TAG kPublicKeyTag = + CBS_ASN1_CONSTRUCTED | CBS_ASN1_CONTEXT_SPECIFIC | 1; + +static auto get_all_groups() { + return std::array{ + EC_group_p224(), + EC_group_p256(), + EC_group_p384(), + EC_group_p521(), + }; +} + +EC_KEY *bssl::ec_key_parse_private_key( + CBS *cbs, const EC_GROUP *group, + Span allowed_groups) { + // If a group was supplied externally, no other groups can be parsed. + if (group != nullptr) { + allowed_groups = Span(&group, 1); + } + + CBS ec_private_key, private_key; + uint64_t version; + if (!CBS_get_asn1(cbs, &ec_private_key, CBS_ASN1_SEQUENCE) || + !CBS_get_asn1_uint64(&ec_private_key, &version) || // + version != 1 || + !CBS_get_asn1(&ec_private_key, &private_key, CBS_ASN1_OCTETSTRING)) { + OPENSSL_PUT_ERROR(EC, EC_R_DECODE_ERROR); + return nullptr; + } + + // Parse the optional parameters field. + if (CBS_peek_asn1_tag(&ec_private_key, kParametersTag)) { + // Per SEC 1, as an alternative to omitting it, one is allowed to specify + // this field and put in a NULL to mean inheriting this value. This was + // omitted in a previous version of this logic without problems, so leave it + // unimplemented. 
+ CBS child; + if (!CBS_get_asn1(&ec_private_key, &child, kParametersTag)) { + OPENSSL_PUT_ERROR(EC, EC_R_DECODE_ERROR); + return nullptr; + } + const EC_GROUP *inner_group = + ec_key_parse_parameters(&child, allowed_groups); + if (inner_group == nullptr) { + // If the caller already supplied a group, any explicit group is required + // to match. On mismatch, |ec_key_parse_parameters| will fail to recognize + // any other groups, so remap the error. + if (group != nullptr && + ERR_equals(ERR_peek_last_error(), ERR_LIB_EC, EC_R_UNKNOWN_GROUP)) { + ERR_clear_error(); + OPENSSL_PUT_ERROR(EC, EC_R_GROUP_MISMATCH); + } + return nullptr; + } + // Overriding |allowed_groups| above ensures the only returned group will be + // the matching one. + assert(group == nullptr || inner_group == group); + group = inner_group; + if (CBS_len(&child) != 0) { + OPENSSL_PUT_ERROR(EC, EC_R_DECODE_ERROR); + return nullptr; + } + } + + // The group must have been specified either externally, or explicitly in the + // structure. + if (group == nullptr) { + OPENSSL_PUT_ERROR(EC, EC_R_MISSING_PARAMETERS); + return nullptr; + } + + UniquePtr ret(FromOpaque(EC_KEY_new())); + if (ret == nullptr || !EC_KEY_set_group(ret.get(), group)) { + return nullptr; + } + + // Although RFC 5915 specifies the length of the key, OpenSSL historically + // got this wrong, so accept any length. See upstream's + // 30cd4ff294252c4b6a4b69cbef6a5b4117705d22. 
+ UniquePtr priv_key( + BN_bin2bn(CBS_data(&private_key), CBS_len(&private_key), nullptr)); + ret->pub_key = EC_POINT_new(group); + if (priv_key == nullptr || ret->pub_key == nullptr || + !EC_KEY_set_private_key(ret.get(), priv_key.get())) { + return nullptr; + } + + if (CBS_peek_asn1_tag(&ec_private_key, kPublicKeyTag)) { + CBS child, public_key; + uint8_t padding; + if (!CBS_get_asn1(&ec_private_key, &child, kPublicKeyTag) || + !CBS_get_asn1(&child, &public_key, CBS_ASN1_BITSTRING) || + // As in a SubjectPublicKeyInfo, the byte-encoded public key is then + // encoded as a BIT STRING with bits ordered as in the DER encoding. + !CBS_get_u8(&public_key, &padding) || // + padding != 0 || + // Explicitly check |public_key| is non-empty to save the conversion + // form later. + CBS_len(&public_key) == 0 || + !EC_POINT_oct2point(group, ret->pub_key, CBS_data(&public_key), + CBS_len(&public_key), nullptr) || + CBS_len(&child) != 0) { + OPENSSL_PUT_ERROR(EC, EC_R_DECODE_ERROR); + return nullptr; + } + + // Save the point conversion form. + // TODO(davidben): Consider removing this. + ret->conv_form = + (point_conversion_form_t)(CBS_data(&public_key)[0] & ~0x01); + } else { + // Compute the public key instead. + if (!ec_point_mul_scalar_base(group, &ret->pub_key->raw, + &ret->priv_key->scalar)) { + return nullptr; + } + // Remember the original private-key-only encoding. + // TODO(davidben): Consider removing this. + ret->enc_flag |= EC_PKEY_NO_PUBKEY; + } + + if (CBS_len(&ec_private_key) != 0) { + OPENSSL_PUT_ERROR(EC, EC_R_DECODE_ERROR); + return nullptr; + } + + // Ensure the resulting key is valid. 
+ if (!EC_KEY_check_key(ret.get())) { + return nullptr; + } + + return ret.release(); +} + +EC_KEY *EC_KEY_parse_private_key(CBS *cbs, const EC_GROUP *group) { + return ec_key_parse_private_key(cbs, group, get_all_groups()); +} + +int EC_KEY_marshal_private_key(CBB *cbb, const EC_KEY *key, + unsigned enc_flags) { + const ECKey *key_impl = FromOpaque(key); + if (key_impl == nullptr || key_impl->group == nullptr || + key_impl->priv_key == nullptr) { + OPENSSL_PUT_ERROR(EC, ERR_R_PASSED_NULL_PARAMETER); + return 0; + } + + CBB ec_private_key, private_key; + if (!CBB_add_asn1(cbb, &ec_private_key, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_uint64(&ec_private_key, 1 /* version */) || + !CBB_add_asn1(&ec_private_key, &private_key, CBS_ASN1_OCTETSTRING) || + !BN_bn2cbb_padded(&private_key, + BN_num_bytes(EC_GROUP_get0_order(key_impl->group)), + EC_KEY_get0_private_key(key_impl))) { + OPENSSL_PUT_ERROR(EC, EC_R_ENCODE_ERROR); + return 0; + } + + if (!(enc_flags & EC_PKEY_NO_PARAMETERS)) { + CBB child; + if (!CBB_add_asn1(&ec_private_key, &child, kParametersTag) || + !EC_KEY_marshal_curve_name(&child, key_impl->group) || + !CBB_flush(&ec_private_key)) { + OPENSSL_PUT_ERROR(EC, EC_R_ENCODE_ERROR); + return 0; + } + } + + // TODO(fork): replace this flexibility with sensible default? + if (!(enc_flags & EC_PKEY_NO_PUBKEY) && key_impl->pub_key != nullptr) { + CBB child, public_key; + if (!CBB_add_asn1(&ec_private_key, &child, kPublicKeyTag) || + !CBB_add_asn1(&child, &public_key, CBS_ASN1_BITSTRING) || + // As in a SubjectPublicKeyInfo, the byte-encoded public key is then + // encoded as a BIT STRING with bits ordered as in the DER encoding. 
+ !CBB_add_u8(&public_key, 0 /* padding */) || + !EC_POINT_point2cbb(&public_key, key_impl->group, key_impl->pub_key, + key_impl->conv_form, nullptr) || + !CBB_flush(&ec_private_key)) { + OPENSSL_PUT_ERROR(EC, EC_R_ENCODE_ERROR); + return 0; + } + } + + if (!CBB_flush(cbb)) { + OPENSSL_PUT_ERROR(EC, EC_R_ENCODE_ERROR); + return 0; + } + + return 1; +} + +// kPrimeFieldOID is the encoding of 1.2.840.10045.1.1. +static const uint8_t kPrimeField[] = {0x2a, 0x86, 0x48, 0xce, 0x3d, 0x01, 0x01}; + +namespace { +struct explicit_prime_curve { + CBS prime, a, b, base_x, base_y, order; +}; +} // namespace + +static int parse_explicit_prime_curve(CBS *in, + struct explicit_prime_curve *out) { + // See RFC 3279, section 2.3.5. Note that RFC 3279 calls this structure an + // ECParameters while RFC 5480 calls it a SpecifiedECDomain. + CBS params, field_id, field_type, curve, base, cofactor; + int has_cofactor; + uint64_t version; + if (!CBS_get_asn1(in, &params, CBS_ASN1_SEQUENCE) || + !CBS_get_asn1_uint64(&params, &version) || // + version != 1 || // + !CBS_get_asn1(&params, &field_id, CBS_ASN1_SEQUENCE) || + !CBS_get_asn1(&field_id, &field_type, CBS_ASN1_OBJECT) || + CBS_len(&field_type) != sizeof(kPrimeField) || + OPENSSL_memcmp(CBS_data(&field_type), kPrimeField, sizeof(kPrimeField)) != + 0 || + !CBS_get_asn1(&field_id, &out->prime, CBS_ASN1_INTEGER) || + !CBS_is_unsigned_asn1_integer(&out->prime) || // + CBS_len(&field_id) != 0 || + !CBS_get_asn1(&params, &curve, CBS_ASN1_SEQUENCE) || + !CBS_get_asn1(&curve, &out->a, CBS_ASN1_OCTETSTRING) || + !CBS_get_asn1(&curve, &out->b, CBS_ASN1_OCTETSTRING) || + // |curve| has an optional BIT STRING seed which we ignore.
+ !CBS_get_optional_asn1(&curve, nullptr, nullptr, CBS_ASN1_BITSTRING) || + CBS_len(&curve) != 0 || + !CBS_get_asn1(&params, &base, CBS_ASN1_OCTETSTRING) || + !CBS_get_asn1(&params, &out->order, CBS_ASN1_INTEGER) || + !CBS_is_unsigned_asn1_integer(&out->order) || + !CBS_get_optional_asn1(&params, &cofactor, &has_cofactor, + CBS_ASN1_INTEGER) || + CBS_len(&params) != 0) { + OPENSSL_PUT_ERROR(EC, EC_R_DECODE_ERROR); + return 0; + } + + if (has_cofactor) { + // We only support prime-order curves so the cofactor must be one. + if (CBS_len(&cofactor) != 1 || // + CBS_data(&cofactor)[0] != 1) { + OPENSSL_PUT_ERROR(EC, EC_R_UNKNOWN_GROUP); + return 0; + } + } + + // Require that the base point use uncompressed form. + uint8_t form; + if (!CBS_get_u8(&base, &form) || form != POINT_CONVERSION_UNCOMPRESSED) { + OPENSSL_PUT_ERROR(EC, EC_R_INVALID_FORM); + return 0; + } + + if (CBS_len(&base) % 2 != 0) { + OPENSSL_PUT_ERROR(EC, EC_R_DECODE_ERROR); + return 0; + } + size_t field_len = CBS_len(&base) / 2; + CBS_init(&out->base_x, CBS_data(&base), field_len); + CBS_init(&out->base_y, CBS_data(&base) + field_len, field_len); + + return 1; +} + +// integers_equal returns one if |bytes| is a big-endian encoding of |bn|, and +// zero otherwise. +static int integers_equal(const CBS *bytes, const BIGNUM *bn) { + // Although, in SEC 1, Field-Element-to-Octet-String has a fixed width, + // OpenSSL mis-encodes the |a| and |b|, so we tolerate any number of leading + // zeros. (This matters for P-521 whose |b| has a leading 0.)
+ CBS copy = *bytes; + while (CBS_len(&copy) > 0 && CBS_data(&copy)[0] == 0) { + CBS_skip(&copy, 1); + } + + if (CBS_len(&copy) > EC_MAX_BYTES) { + return 0; + } + uint8_t buf[EC_MAX_BYTES]; + if (!BN_bn2bin_padded(buf, CBS_len(&copy), bn)) { + ERR_clear_error(); + return 0; + } + + return CBS_mem_equal(&copy, buf, CBS_len(&copy)); +} + +const EC_GROUP *bssl::ec_key_parse_curve_name( + CBS *cbs, Span allowed_groups) { + CBS named_curve; + if (!CBS_get_asn1(cbs, &named_curve, CBS_ASN1_OBJECT)) { + OPENSSL_PUT_ERROR(EC, EC_R_DECODE_ERROR); + return nullptr; + } + + // Look for a matching curve. + for (const EC_GROUP *group : allowed_groups) { + if (named_curve == Span(group->oid, group->oid_len)) { + return group; + } + } + + OPENSSL_PUT_ERROR(EC, EC_R_UNKNOWN_GROUP); + return nullptr; +} + +EC_GROUP *EC_KEY_parse_curve_name(CBS *cbs) { + // This function only ever returns a static |EC_GROUP|, but currently returns + // a non-const pointer for historical reasons. + return const_cast(ec_key_parse_curve_name(cbs, get_all_groups())); +} + +int EC_KEY_marshal_curve_name(CBB *cbb, const EC_GROUP *group) { + if (group->oid_len == 0) { + OPENSSL_PUT_ERROR(EC, EC_R_UNKNOWN_GROUP); + return 0; + } + + return CBB_add_asn1_element(cbb, CBS_ASN1_OBJECT, group->oid, group->oid_len); +} + +const EC_GROUP *bssl::ec_key_parse_parameters( + CBS *cbs, Span allowed_groups) { + if (!CBS_peek_asn1_tag(cbs, CBS_ASN1_SEQUENCE)) { + return ec_key_parse_curve_name(cbs, allowed_groups); + } + + // OpenSSL sometimes produces ECPrivateKeys with explicitly-encoded versions + // of named curves. + // + // TODO(davidben): Remove support for this.
+ struct explicit_prime_curve curve; + if (!parse_explicit_prime_curve(cbs, &curve)) { + return nullptr; + } + + UniquePtr p(BN_new()); + UniquePtr a(BN_new()); + UniquePtr b(BN_new()); + UniquePtr x(BN_new()); + UniquePtr y(BN_new()); + if (p == nullptr || a == nullptr || b == nullptr || x == nullptr || + y == nullptr) { + return nullptr; + } + + for (const EC_GROUP *group : allowed_groups) { + if (!integers_equal(&curve.order, EC_GROUP_get0_order(group))) { + continue; + } + + // The order alone uniquely identifies the group, but we check the other + // parameters to avoid misinterpreting the group. + if (!EC_GROUP_get_curve_GFp(group, p.get(), a.get(), b.get(), nullptr)) { + return nullptr; + } + if (!integers_equal(&curve.prime, p.get()) || + !integers_equal(&curve.a, a.get()) || + !integers_equal(&curve.b, b.get())) { + break; + } + if (!EC_POINT_get_affine_coordinates_GFp( + group, EC_GROUP_get0_generator(group), x.get(), y.get(), nullptr)) { + return nullptr; + } + if (!integers_equal(&curve.base_x, x.get()) || + !integers_equal(&curve.base_y, y.get())) { + break; + } + return group; + } + + OPENSSL_PUT_ERROR(EC, EC_R_UNKNOWN_GROUP); + return nullptr; +} + +EC_GROUP *EC_KEY_parse_parameters(CBS *cbs) { + // This function only ever returns a static |EC_GROUP|, but currently returns + // a non-const pointer for historical reasons. + return const_cast(ec_key_parse_parameters(cbs, get_all_groups())); +} + +int EC_POINT_point2cbb(CBB *out, const EC_GROUP *group, const EC_POINT *point, + point_conversion_form_t form, BN_CTX *ctx) { + size_t len = EC_POINT_point2oct(group, point, form, nullptr, 0, ctx); + if (len == 0) { + return 0; + } + uint8_t *p; + return CBB_add_space(out, &p, len) && + EC_POINT_point2oct(group, point, form, p, len, ctx) == len; +} + +EC_KEY *d2i_ECPrivateKey(EC_KEY **out, const uint8_t **inp, long len) { + // This function treats its |out| parameter differently from other |d2i| + // functions. If supplied, take the group from |*out|. 
+ const EC_GROUP *group = nullptr; + if (out != nullptr && *out != nullptr) { + group = EC_KEY_get0_group(*out); + } + + return D2IFromCBS(out, inp, len, [&](CBS *cbs) { + return EC_KEY_parse_private_key(cbs, group); + }); +} + +int i2d_ECPrivateKey(const EC_KEY *key, uint8_t **outp) { + return I2DFromCBB( + /*initial_capacity=*/64, outp, [&](CBB *cbb) -> bool { + return EC_KEY_marshal_private_key(cbb, key, EC_KEY_get_enc_flags(key)); + }); +} + +EC_GROUP *d2i_ECPKParameters(EC_GROUP **out, const uint8_t **inp, long len) { + return D2IFromCBS(out, inp, len, EC_KEY_parse_parameters); +} + +int i2d_ECPKParameters(const EC_GROUP *group, uint8_t **outp) { + if (group == nullptr) { + OPENSSL_PUT_ERROR(EC, ERR_R_PASSED_NULL_PARAMETER); + return -1; + } + return I2DFromCBB( + /*initial_capacity=*/16, outp, + [&](CBB *cbb) -> bool { return EC_KEY_marshal_curve_name(cbb, group); }); +} + +EC_KEY *d2i_ECParameters(EC_KEY **out_key, const uint8_t **inp, long len) { + return D2IFromCBS(out_key, inp, len, [](CBS *cbs) -> UniquePtr { + const EC_GROUP *group = EC_KEY_parse_parameters(cbs); + if (group == nullptr) { + return nullptr; + } + UniquePtr ret(EC_KEY_new()); + if (ret == nullptr || !EC_KEY_set_group(ret.get(), group)) { + return nullptr; + } + return ret; + }); +} + +int i2d_ECParameters(const EC_KEY *key, uint8_t **outp) { + const ECKey *key_impl = FromOpaque(key); + if (key_impl == nullptr || key_impl->group == nullptr) { + OPENSSL_PUT_ERROR(EC, ERR_R_PASSED_NULL_PARAMETER); + return -1; + } + return I2DFromCBB( + /*initial_capacity=*/16, outp, [&](CBB *cbb) -> bool { + return EC_KEY_marshal_curve_name(cbb, key_impl->group); + }); +} + +EC_KEY *o2i_ECPublicKey(EC_KEY **keyp, const uint8_t **inp, long len) { + ECKey *ret = nullptr; + + if (keyp == nullptr || *keyp == nullptr || + FromOpaque(*keyp)->group == nullptr) { + OPENSSL_PUT_ERROR(EC, ERR_R_PASSED_NULL_PARAMETER); + return nullptr; + } + ret = FromOpaque(*keyp); + if (ret->pub_key == nullptr && + (ret->pub_key = 
EC_POINT_new(ret->group)) == nullptr) { + return nullptr; + } + if (!EC_POINT_oct2point(ret->group, ret->pub_key, *inp, len, nullptr)) { + OPENSSL_PUT_ERROR(EC, ERR_R_EC_LIB); + return nullptr; + } + // save the point conversion form + ret->conv_form = (point_conversion_form_t)(*inp[0] & ~0x01); + *inp += len; + return ret; +} + +int i2o_ECPublicKey(const EC_KEY *key, uint8_t **outp) { + if (key == nullptr) { + OPENSSL_PUT_ERROR(EC, ERR_R_PASSED_NULL_PARAMETER); + return 0; + } + const ECKey *key_impl = FromOpaque(key); + // No initial capacity because |EC_POINT_point2cbb| will internally reserve + // the right size in one shot, so it's best to leave this at zero. + int ret = I2DFromCBB( + /*initial_capacity=*/0, outp, [&](CBB *cbb) -> bool { + return EC_POINT_point2cbb(cbb, key_impl->group, key_impl->pub_key, + key_impl->conv_form, nullptr); + }); + // Historically, this function used the wrong return value on error. + return ret > 0 ? ret : 0; +} + +size_t EC_get_builtin_curves(EC_builtin_curve *out_curves, + size_t max_num_curves) { + auto all = get_all_groups(); + max_num_curves = std::min(all.size(), max_num_curves); + for (size_t i = 0; i < max_num_curves; i++) { + const EC_GROUP *group = all[i]; + out_curves[i].nid = group->curve_name; + out_curves[i].comment = group->comment; + } + return all.size(); +} diff --git a/third_party/boringssl/src/crypto/ec/ec_derive.cc b/third_party/boringssl/src/crypto/ec/ec_derive.cc new file mode 100644 index 00000000..6f6f7865 --- /dev/null +++ b/third_party/boringssl/src/crypto/ec/ec_derive.cc @@ -0,0 +1,94 @@ +// Copyright 2019 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include "../fipsmodule/ec/internal.h" + + +EC_KEY *EC_KEY_derive_from_secret(const EC_GROUP *group, const uint8_t *secret, + size_t secret_len) { +#define EC_KEY_DERIVE_MAX_NAME_LEN 16 + const char *name = EC_curve_nid2nist(EC_GROUP_get_curve_name(group)); + if (name == nullptr || strlen(name) > EC_KEY_DERIVE_MAX_NAME_LEN) { + OPENSSL_PUT_ERROR(EC, EC_R_UNKNOWN_GROUP); + return nullptr; + } + + // Assemble a label string to provide some key separation in case |secret| is + // misused, but ultimately it's on the caller to ensure |secret| is suitably + // separated. + static const char kLabel[] = "derive EC key "; + char info[sizeof(kLabel) + EC_KEY_DERIVE_MAX_NAME_LEN]; + OPENSSL_strlcpy(info, kLabel, sizeof(info)); + OPENSSL_strlcat(info, name, sizeof(info)); + + // Generate 128 bits beyond the group order so the bias is at most 2^-128. +#define EC_KEY_DERIVE_EXTRA_BITS 128 +#define EC_KEY_DERIVE_EXTRA_BYTES (EC_KEY_DERIVE_EXTRA_BITS / 8) + + if (EC_GROUP_order_bits(group) <= EC_KEY_DERIVE_EXTRA_BITS + 8) { + // The reduction strategy below requires the group order be large enough. + // (The actual bound is a bit tighter, but our curves are much larger than + // 128-bit.) 
+ OPENSSL_PUT_ERROR(EC, ERR_R_INTERNAL_ERROR); + return nullptr; + } + + uint8_t derived[EC_KEY_DERIVE_EXTRA_BYTES + EC_MAX_BYTES]; + size_t derived_len = + BN_num_bytes(EC_GROUP_get0_order(group)) + EC_KEY_DERIVE_EXTRA_BYTES; + assert(derived_len <= sizeof(derived)); + if (!HKDF(derived, derived_len, EVP_sha256(), secret, secret_len, + /*salt=*/nullptr, /*salt_len=*/0, (const uint8_t *)info, + strlen(info))) { + return nullptr; + } + + bssl::UniquePtr key(EC_KEY_new()); + bssl::UniquePtr ctx(BN_CTX_new()); + bssl::UniquePtr priv(BN_bin2bn(derived, derived_len, nullptr)); + bssl::UniquePtr pub(EC_POINT_new(group)); + if (key == nullptr || ctx == nullptr || priv == nullptr || pub == nullptr || + // Reduce |priv| with Montgomery reduction. First, convert "from" + // Montgomery form to compute |priv| * R^-1 mod |order|. This requires + // |priv| be under order * R, which is true if the group order is large + // enough. 2^(num_bytes(order)) < 2^8 * order, so: + // + // priv < 2^8 * order * 2^128 < order * order < order * R + !BN_from_montgomery(priv.get(), priv.get(), &group->order, ctx.get()) || + // Multiply by R^2 and do another Montgomery reduction to compute + // priv * R^-1 * R^2 * R^-1 = priv mod order. 
+ !BN_to_montgomery(priv.get(), priv.get(), &group->order, ctx.get()) || + !EC_POINT_mul(group, pub.get(), priv.get(), nullptr, nullptr, + ctx.get()) || + !EC_KEY_set_group(key.get(), group) || + !EC_KEY_set_public_key(key.get(), pub.get()) || + !EC_KEY_set_private_key(key.get(), priv.get())) { + OPENSSL_cleanse(derived, sizeof(derived)); + return nullptr; + } + + OPENSSL_cleanse(derived, sizeof(derived)); + return key.release(); +} diff --git a/third_party/boringssl/src/crypto/ec/hash_to_curve.cc b/third_party/boringssl/src/crypto/ec/hash_to_curve.cc new file mode 100644 index 00000000..6ce88ee1 --- /dev/null +++ b/third_party/boringssl/src/crypto/ec/hash_to_curve.cc @@ -0,0 +1,601 @@ +// Copyright 2020 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include + +#include + +#include "../fipsmodule/bn/internal.h" +#include "../fipsmodule/ec/internal.h" +#include "../internal.h" +#include "internal.h" + + +using namespace bssl; + +namespace { + +// This file implements hash-to-curve, as described in RFC 9380. +// +// This hash-to-curve implementation is written generically with the +// expectation that we will eventually wish to support other curves. If it +// becomes a performance bottleneck, some possible optimizations by +// specializing it to the curve: +// +// - Rather than using a generic |ec_felem_exp|, specialize the exponentiation +// to c2 with a faster addition chain. 
+// +// - |ec_felem_mul| and |ec_felem_sqr| are generic Montgomery code. Given the +// few curves, we could specialize |map_to_curve_simple_swu|. But doing this +// reasonably without duplicating code in C is difficult. (C++ templates +// would be useful here.) +// +// - P-521's Z and c2 have small power-of-two absolute values. We could save +// two multiplications in SSWU. (Other curves have reasonable values of Z +// and inconvenient c2.) This is unlikely to be worthwhile without C++ +// templates to make specializing more convenient. + +// expand_message_xmd implements the operation described in section 5.3.1 of +// RFC 9380. It returns one on success and zero on error. +int expand_message_xmd(const EVP_MD *md, uint8_t *out, size_t out_len, + const uint8_t *msg, size_t msg_len, const uint8_t *dst, + size_t dst_len) { + // See https://github.com/cfrg/draft-irtf-cfrg-hash-to-curve/issues/352 + if (dst_len == 0) { + OPENSSL_PUT_ERROR(EC, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); + return 0; + } + + const size_t block_size = EVP_MD_block_size(md); + const size_t md_size = EVP_MD_size(md); + ScopedEVP_MD_CTX ctx; + + // Long DSTs are hashed down to size. See section 5.3.3. + static_assert(EVP_MAX_MD_SIZE < 256, "hashed DST still too large"); + uint8_t dst_buf[EVP_MAX_MD_SIZE]; + if (dst_len >= 256) { + static const char kPrefix[] = "H2C-OVERSIZE-DST-"; + if (!EVP_DigestInit_ex(ctx.get(), md, nullptr) || + !EVP_DigestUpdate(ctx.get(), kPrefix, sizeof(kPrefix) - 1) || + !EVP_DigestUpdate(ctx.get(), dst, dst_len) || + !EVP_DigestFinal_ex(ctx.get(), dst_buf, nullptr)) { + return 0; + } + dst = dst_buf; + dst_len = md_size; + } + uint8_t dst_len_u8 = (uint8_t)dst_len; + + // Compute b_0. + static const uint8_t kZeros[EVP_MAX_MD_BLOCK_SIZE] = {0}; + // If |out_len| exceeds 16 bits then |i| will wrap below causing an error to + // be returned. This depends on the static assert above. 
+ uint8_t l_i_b_str_zero[3] = {static_cast(out_len >> 8), + static_cast(out_len), 0}; + uint8_t b_0[EVP_MAX_MD_SIZE]; + if (!EVP_DigestInit_ex(ctx.get(), md, nullptr) || + !EVP_DigestUpdate(ctx.get(), kZeros, block_size) || + !EVP_DigestUpdate(ctx.get(), msg, msg_len) || + !EVP_DigestUpdate(ctx.get(), l_i_b_str_zero, sizeof(l_i_b_str_zero)) || + !EVP_DigestUpdate(ctx.get(), dst, dst_len) || + !EVP_DigestUpdate(ctx.get(), &dst_len_u8, 1) || + !EVP_DigestFinal_ex(ctx.get(), b_0, nullptr)) { + return 0; + } + + uint8_t b_i[EVP_MAX_MD_SIZE]; + uint8_t i = 1; + while (out_len > 0) { + if (i == 0) { + // Input was too large. + OPENSSL_PUT_ERROR(EC, ERR_R_INTERNAL_ERROR); + return 0; + } + if (i > 1) { + for (size_t j = 0; j < md_size; j++) { + b_i[j] ^= b_0[j]; + } + } else { + OPENSSL_memcpy(b_i, b_0, md_size); + } + + if (!EVP_DigestInit_ex(ctx.get(), md, nullptr) || + !EVP_DigestUpdate(ctx.get(), b_i, md_size) || + !EVP_DigestUpdate(ctx.get(), &i, 1) || + !EVP_DigestUpdate(ctx.get(), dst, dst_len) || + !EVP_DigestUpdate(ctx.get(), &dst_len_u8, 1) || + !EVP_DigestFinal_ex(ctx.get(), b_i, nullptr)) { + return 0; + } + + size_t todo = out_len >= md_size ? md_size : out_len; + OPENSSL_memcpy(out, b_i, todo); + out += todo; + out_len -= todo; + i++; + } + + return 1; +} + +// num_bytes_to_derive determines the number of bytes to derive when hashing to +// a number modulo |modulus|. See the hash_to_field operation defined in +// section 5.2 of RFC 9380. +int num_bytes_to_derive(size_t *out, const BIGNUM *modulus, unsigned k) { + size_t bits = BN_num_bits(modulus); + size_t L = (bits + k + 7) / 8; + // We require 2^(8*L) < 2^(2*bits - 2) <= n^2 so to fit in bounds for + // |felem_reduce| and |ec_scalar_reduce|. All defined hash-to-curve suites + // define |k| to be well under this bound. (|k| is usually around half of + // |p_bits|.) 
+ if (L * 8 >= 2 * bits - 2 || L > 2 * EC_MAX_BYTES) { + assert(0); + OPENSSL_PUT_ERROR(EC, ERR_R_INTERNAL_ERROR); + return 0; + } + + *out = L; + return 1; +} + +// big_endian_to_words decodes |in| as a big-endian integer and writes the +// result to |out|. |num_words| must be large enough to contain the output. +void big_endian_to_words(BN_ULONG *out, size_t num_words, const uint8_t *in, + size_t len) { + assert(len <= num_words * sizeof(BN_ULONG)); + // Ensure any excess bytes are zeroed. + OPENSSL_memset(out, 0, num_words * sizeof(BN_ULONG)); + uint8_t *out_u8 = (uint8_t *)out; + for (size_t i = 0; i < len; i++) { + out_u8[len - 1 - i] = in[i]; + } +} + +// hash_to_field implements the operation described in section 5.2 +// of RFC 9380, with count = 2. |k| is the security factor. +int hash_to_field2(const EC_GROUP *group, const EVP_MD *md, EC_FELEM *out1, + EC_FELEM *out2, const uint8_t *dst, size_t dst_len, + unsigned k, const uint8_t *msg, size_t msg_len) { + size_t L; + uint8_t buf[4 * EC_MAX_BYTES]; + if (!num_bytes_to_derive(&L, &group->field.N, k) || + !expand_message_xmd(md, buf, 2 * L, msg, msg_len, dst, dst_len)) { + return 0; + } + BN_ULONG words[2 * EC_MAX_WORDS]; + size_t num_words = 2 * group->field.N.width; + big_endian_to_words(words, num_words, buf, L); + ec_felem_reduce(group, out1, words, num_words); + big_endian_to_words(words, num_words, buf + L, L); + ec_felem_reduce(group, out2, words, num_words); + return 1; +} + +// hash_to_field1 implements the operation described in section 5.2 +// of RFC 9380, with count = 1. |k| is the security factor. 
+int hash_to_field1(const EC_GROUP *group, const EVP_MD *md, EC_FELEM *out, + const uint8_t *dst, size_t dst_len, unsigned k, + const uint8_t *msg, size_t msg_len) { + size_t L; + uint8_t buf[2 * EC_MAX_BYTES]; + if (!num_bytes_to_derive(&L, &group->field.N, k) || + !expand_message_xmd(md, buf, L, msg, msg_len, dst, dst_len)) { + return 0; + } + BN_ULONG words[2 * EC_MAX_WORDS]; + size_t num_words = 2 * group->field.N.width; + big_endian_to_words(words, num_words, buf, L); + ec_felem_reduce(group, out, words, num_words); + return 1; +} + +// hash_to_scalar behaves like |hash_to_field2| but returns a value modulo the +// group order rather than a field element. |k| is the security factor. +int hash_to_scalar(const EC_GROUP *group, const EVP_MD *md, EC_SCALAR *out, + const uint8_t *dst, size_t dst_len, unsigned k, + const uint8_t *msg, size_t msg_len) { + const BIGNUM *order = EC_GROUP_get0_order(group); + size_t L; + uint8_t buf[EC_MAX_BYTES * 2]; + if (!num_bytes_to_derive(&L, order, k) || + !expand_message_xmd(md, buf, L, msg, msg_len, dst, dst_len)) { + return 0; + } + + BN_ULONG words[2 * EC_MAX_WORDS]; + size_t num_words = 2 * order->width; + big_endian_to_words(words, num_words, buf, L); + ec_scalar_reduce(group, out, words, num_words); + return 1; +} + +void mul_A(const EC_GROUP *group, EC_FELEM *out, const EC_FELEM *in) { + assert(group->a_is_minus3); + EC_FELEM tmp; + ec_felem_add(group, &tmp, in, in); // tmp = 2*in + ec_felem_add(group, &tmp, &tmp, &tmp); // tmp = 4*in + ec_felem_sub(group, out, in, &tmp); // out = -3*in +} + +// sgn0 implements the operation described in section 4.1.2 of RFC 9380. 
+BN_ULONG sgn0(const EC_GROUP *group, const EC_FELEM *a) { + uint8_t buf[EC_MAX_BYTES]; + size_t len; + ec_felem_to_bytes(group, buf, &len, a); + return buf[len - 1] & 1; +} + +[[maybe_unused]] static int is_3mod4(const EC_GROUP *group) { + return group->field.N.width > 0 && (group->field.N.d[0] & 3) == 3; +} + +// sqrt_ratio_3mod4 implements the operation described in appendix F.2.1.2 +// of RFC 9380. +BN_ULONG sqrt_ratio_3mod4(const EC_GROUP *group, const EC_FELEM *Z, + const BN_ULONG *c1, size_t num_c1, const EC_FELEM *c2, + EC_FELEM *out_y, const EC_FELEM *u, + const EC_FELEM *v) { + assert(is_3mod4(group)); + + EC_FELEM tv1, tv2, tv3, y1, y2; + ec_felem_sqr(group, &tv1, v); // 1. tv1 = v^2 + ec_felem_mul(group, &tv2, u, v); // 2. tv2 = u * v + ec_felem_mul(group, &tv1, &tv1, &tv2); // 3. tv1 = tv1 * tv2 + ec_felem_exp(group, &y1, &tv1, c1, num_c1); // 4. y1 = tv1^c1 + ec_felem_mul(group, &y1, &y1, &tv2); // 5. y1 = y1 * tv2 + ec_felem_mul(group, &y2, &y1, c2); // 6. y2 = y1 * c2 + ec_felem_sqr(group, &tv3, &y1); // 7. tv3 = y1^2 + ec_felem_mul(group, &tv3, &tv3, v); // 8. tv3 = tv3 * v + + // 9. isQR = tv3 == u + // 10. y = CMOV(y2, y1, isQR) + // 11. return (isQR, y) + // + // Note the specification's CMOV function and our |ec_felem_select| have the + // opposite argument order. + ec_felem_sub(group, &tv1, &tv3, u); + const BN_ULONG isQR = ~ec_felem_non_zero_mask(group, &tv1); + ec_felem_select(group, out_y, isQR, &y1, &y2); + return isQR; +} + +// map_to_curve_simple_swu implements the operation described in section 6.6.2 +// of RFC 9380, using the straight-line implementation in appendix F.2. +void map_to_curve_simple_swu(const EC_GROUP *group, const EC_FELEM *Z, + const BN_ULONG *c1, size_t num_c1, + const EC_FELEM *c2, EC_JACOBIAN *out, + const EC_FELEM *u) { + // This function requires the prime be 3 mod 4, and that A = -3. 
+ assert(is_3mod4(group)); + assert(group->a_is_minus3); + + EC_FELEM tv1, tv2, tv3, tv4, tv5, tv6, x, y, y1; + ec_felem_sqr(group, &tv1, u); // 1. tv1 = u^2 + ec_felem_mul(group, &tv1, Z, &tv1); // 2. tv1 = Z * tv1 + ec_felem_sqr(group, &tv2, &tv1); // 3. tv2 = tv1^2 + ec_felem_add(group, &tv2, &tv2, &tv1); // 4. tv2 = tv2 + tv1 + ec_felem_add(group, &tv3, &tv2, ec_felem_one(group)); // 5. tv3 = tv2 + 1 + ec_felem_mul(group, &tv3, &group->b, &tv3); // 6. tv3 = B * tv3 + + // 7. tv4 = CMOV(Z, -tv2, tv2 != 0) + const BN_ULONG tv2_non_zero = ec_felem_non_zero_mask(group, &tv2); + ec_felem_neg(group, &tv4, &tv2); + ec_felem_select(group, &tv4, tv2_non_zero, &tv4, Z); + + mul_A(group, &tv4, &tv4); // 8. tv4 = A * tv4 + ec_felem_sqr(group, &tv2, &tv3); // 9. tv2 = tv3^2 + ec_felem_sqr(group, &tv6, &tv4); // 10. tv6 = tv4^2 + mul_A(group, &tv5, &tv6); // 11. tv5 = A * tv6 + ec_felem_add(group, &tv2, &tv2, &tv5); // 12. tv2 = tv2 + tv5 + ec_felem_mul(group, &tv2, &tv2, &tv3); // 13. tv2 = tv2 * tv3 + ec_felem_mul(group, &tv6, &tv6, &tv4); // 14. tv6 = tv6 * tv4 + ec_felem_mul(group, &tv5, &group->b, &tv6); // 15. tv5 = B * tv6 + ec_felem_add(group, &tv2, &tv2, &tv5); // 16. tv2 = tv2 + tv5 + ec_felem_mul(group, &x, &tv1, &tv3); // 17. x = tv1 * tv3 + + // 18. (is_gx1_square, y1) = sqrt_ratio(tv2, tv6) + const BN_ULONG is_gx1_square = + sqrt_ratio_3mod4(group, Z, c1, num_c1, c2, &y1, &tv2, &tv6); + + ec_felem_mul(group, &y, &tv1, u); // 19. y = tv1 * u + ec_felem_mul(group, &y, &y, &y1); // 20. y = y * y1 + + // 21. x = CMOV(x, tv3, is_gx1_square) + ec_felem_select(group, &x, is_gx1_square, &tv3, &x); + // 22. y = CMOV(y, y1, is_gx1_square) + ec_felem_select(group, &y, is_gx1_square, &y1, &y); + + // 23. e1 = sgn0(u) == sgn0(y) + BN_ULONG sgn0_u = sgn0(group, u); + BN_ULONG sgn0_y = sgn0(group, &y); + BN_ULONG not_e1 = sgn0_u ^ sgn0_y; + not_e1 = ((BN_ULONG)0) - not_e1; + + // 24. 
y = CMOV(-y, y, e1) + ec_felem_neg(group, &tv1, &y); + ec_felem_select(group, &y, not_e1, &tv1, &y); + + // 25. x = x / tv4 + // + // Our output is in projective coordinates, so rather than inverting |tv4| + // now, represent (x / tv4, y) as (x * tv4, y * tv4^3, tv4). This is much more + // efficient if the caller will do further computation on the output. (If the + // caller will immediately convert to affine coordinates, it is slightly less + // efficient, but only by a few field multiplications.) + ec_felem_mul(group, &out->X, &x, &tv4); + ec_felem_mul(group, &out->Y, &y, &tv6); + out->Z = tv4; +} + +int hash_to_curve(const EC_GROUP *group, const EVP_MD *md, const EC_FELEM *Z, + const EC_FELEM *c2, unsigned k, EC_JACOBIAN *out, + const uint8_t *dst, size_t dst_len, const uint8_t *msg, + size_t msg_len) { + EC_FELEM u0, u1; + if (!hash_to_field2(group, md, &u0, &u1, dst, dst_len, k, msg, msg_len)) { + return 0; + } + + // Compute |c1| = (p - 3) / 4. + BN_ULONG c1[EC_MAX_WORDS]; + size_t num_c1 = group->field.N.width; + if (!bn_copy_words(c1, num_c1, &group->field.N)) { + return 0; + } + bn_rshift_words(c1, c1, /*shift=*/2, /*num=*/num_c1); + + EC_JACOBIAN Q0, Q1; + map_to_curve_simple_swu(group, Z, c1, num_c1, c2, &Q0, &u0); + map_to_curve_simple_swu(group, Z, c1, num_c1, c2, &Q1, &u1); + + group->meth->add(group, out, &Q0, &Q1); // R = Q0 + Q1 + // All our curves have cofactor one, so |clear_cofactor| is a no-op. + return 1; +} + +int encode_to_curve(const EC_GROUP *group, const EVP_MD *md, const EC_FELEM *Z, + const EC_FELEM *c2, unsigned k, EC_JACOBIAN *out, + const uint8_t *dst, size_t dst_len, const uint8_t *msg, + size_t msg_len) { + EC_FELEM u; + if (!hash_to_field1(group, md, &u, dst, dst_len, k, msg, msg_len)) { + return 0; + } + + // Compute |c1| = (p - 3) / 4. 
+ BN_ULONG c1[EC_MAX_WORDS]; + size_t num_c1 = group->field.N.width; + if (!bn_copy_words(c1, num_c1, &group->field.N)) { + return 0; + } + bn_rshift_words(c1, c1, /*shift=*/2, /*num=*/num_c1); + + map_to_curve_simple_swu(group, Z, c1, num_c1, c2, out, &u); + // All our curves have cofactor one, so |clear_cofactor| is a no-op. + return 1; +} + +int felem_from_u8(const EC_GROUP *group, EC_FELEM *out, uint8_t a) { + uint8_t bytes[EC_MAX_BYTES] = {0}; + size_t len = BN_num_bytes(&group->field.N); + bytes[len - 1] = a; + return ec_felem_from_bytes(group, out, bytes, len); +} + +// kP256Sqrt10 is sqrt(10) in P-256's field. It was computed as follows in +// python3: +// +// p = 2**256 - 2**224 + 2**192 + 2**96 - 1 +// c2 = pow(10, (p+1)//4, p) +// assert pow(c2, 2, p) == 10 +// ", ".join("0x%02x" % b for b in c2.to_bytes(256//8, 'big')) +const uint8_t kP256Sqrt10[] = {0xda, 0x53, 0x8e, 0x3b, 0xe1, 0xd8, 0x9b, 0x99, + 0xc9, 0x78, 0xfc, 0x67, 0x51, 0x80, 0xaa, 0xb2, + 0x7b, 0x8d, 0x1f, 0xf8, 0x4c, 0x55, 0xd5, 0xb6, + 0x2c, 0xcd, 0x34, 0x27, 0xe4, 0x33, 0xc4, 0x7f}; + +// kP384Sqrt12 is sqrt(12) in P-384's field. It was computed as follows in +// python3: +// +// p = 2**384 - 2**128 - 2**96 + 2**32 - 1 +// c2 = pow(12, (p+1)//4, p) +// assert pow(c2, 2, p) == 12 +// ", ".join("0x%02x" % b for b in c2.to_bytes(384//8, 'big')) +const uint8_t kP384Sqrt12[] = { + 0x2a, 0xcc, 0xb4, 0xa6, 0x56, 0xb0, 0x24, 0x9c, 0x71, 0xf0, 0x50, 0x0e, + 0x83, 0xda, 0x2f, 0xdd, 0x7f, 0x98, 0xe3, 0x83, 0xd6, 0x8b, 0x53, 0x87, + 0x1f, 0x87, 0x2f, 0xcb, 0x9c, 0xcb, 0x80, 0xc5, 0x3c, 0x0d, 0xe1, 0xf8, + 0xa8, 0x0f, 0x7e, 0x19, 0x14, 0xe2, 0xec, 0x69, 0xf5, 0xa6, 0x26, 0xb3}; + +} // namespace + +int bssl::ec_hash_to_curve_p256_xmd_sha256_sswu( + const EC_GROUP *group, EC_JACOBIAN *out, const uint8_t *dst, size_t dst_len, + const uint8_t *msg, size_t msg_len) { + // See section 8.3 of RFC 9380. 
+ if (EC_GROUP_get_curve_name(group) != NID_X9_62_prime256v1) { + OPENSSL_PUT_ERROR(EC, EC_R_GROUP_MISMATCH); + return 0; + } + + // Z = -10, c2 = sqrt(10) + EC_FELEM Z, c2; + if (!felem_from_u8(group, &Z, 10) || + !ec_felem_from_bytes(group, &c2, kP256Sqrt10, sizeof(kP256Sqrt10))) { + return 0; + } + ec_felem_neg(group, &Z, &Z); + + return hash_to_curve(group, EVP_sha256(), &Z, &c2, /*k=*/128, out, dst, + dst_len, msg, msg_len); +} + +int EC_hash_to_curve_p256_xmd_sha256_sswu(const EC_GROUP *group, EC_POINT *out, + const uint8_t *dst, size_t dst_len, + const uint8_t *msg, size_t msg_len) { + if (EC_GROUP_cmp(group, out->group, nullptr) != 0) { + OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS); + return 0; + } + return ec_hash_to_curve_p256_xmd_sha256_sswu(group, &out->raw, dst, dst_len, + msg, msg_len); +} + +int bssl::ec_hash_to_curve_p384_xmd_sha384_sswu( + const EC_GROUP *group, EC_JACOBIAN *out, const uint8_t *dst, size_t dst_len, + const uint8_t *msg, size_t msg_len) { + // See section 8.3 of RFC 9380. 
+ if (EC_GROUP_get_curve_name(group) != NID_secp384r1) { + OPENSSL_PUT_ERROR(EC, EC_R_GROUP_MISMATCH); + return 0; + } + + // Z = -12, c2 = sqrt(12) + EC_FELEM Z, c2; + if (!felem_from_u8(group, &Z, 12) || + !ec_felem_from_bytes(group, &c2, kP384Sqrt12, sizeof(kP384Sqrt12))) { + return 0; + } + ec_felem_neg(group, &Z, &Z); + + return hash_to_curve(group, EVP_sha384(), &Z, &c2, /*k=*/192, out, dst, + dst_len, msg, msg_len); +} + +int EC_hash_to_curve_p384_xmd_sha384_sswu(const EC_GROUP *group, EC_POINT *out, + const uint8_t *dst, size_t dst_len, + const uint8_t *msg, size_t msg_len) { + if (EC_GROUP_cmp(group, out->group, nullptr) != 0) { + OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS); + return 0; + } + return ec_hash_to_curve_p384_xmd_sha384_sswu(group, &out->raw, dst, dst_len, + msg, msg_len); +} + +int bssl::ec_encode_to_curve_p256_xmd_sha256_sswu( + const EC_GROUP *group, EC_JACOBIAN *out, const uint8_t *dst, size_t dst_len, + const uint8_t *msg, size_t msg_len) { + // See section 8.3 of RFC 9380. 
+ if (EC_GROUP_get_curve_name(group) != NID_X9_62_prime256v1) { + OPENSSL_PUT_ERROR(EC, EC_R_GROUP_MISMATCH); + return 0; + } + + // Z = -10, c2 = sqrt(10) + EC_FELEM Z, c2; + if (!felem_from_u8(group, &Z, 10) || + !ec_felem_from_bytes(group, &c2, kP256Sqrt10, sizeof(kP256Sqrt10))) { + return 0; + } + ec_felem_neg(group, &Z, &Z); + + return encode_to_curve(group, EVP_sha256(), &Z, &c2, /*k=*/128, out, dst, + dst_len, msg, msg_len); +} + +int EC_encode_to_curve_p256_xmd_sha256_sswu(const EC_GROUP *group, + EC_POINT *out, const uint8_t *dst, + size_t dst_len, const uint8_t *msg, + size_t msg_len) { + if (EC_GROUP_cmp(group, out->group, nullptr) != 0) { + OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS); + return 0; + } + return ec_encode_to_curve_p256_xmd_sha256_sswu(group, &out->raw, dst, dst_len, + msg, msg_len); +} + +int bssl::ec_encode_to_curve_p384_xmd_sha384_sswu( + const EC_GROUP *group, EC_JACOBIAN *out, const uint8_t *dst, size_t dst_len, + const uint8_t *msg, size_t msg_len) { + // See section 8.3 of RFC 9380. 
+ if (EC_GROUP_get_curve_name(group) != NID_secp384r1) { + OPENSSL_PUT_ERROR(EC, EC_R_GROUP_MISMATCH); + return 0; + } + + // Z = -12, c2 = sqrt(12) + EC_FELEM Z, c2; + if (!felem_from_u8(group, &Z, 12) || + !ec_felem_from_bytes(group, &c2, kP384Sqrt12, sizeof(kP384Sqrt12))) { + return 0; + } + ec_felem_neg(group, &Z, &Z); + + return encode_to_curve(group, EVP_sha384(), &Z, &c2, /*k=*/192, out, dst, + dst_len, msg, msg_len); +} + +int EC_encode_to_curve_p384_xmd_sha384_sswu(const EC_GROUP *group, + EC_POINT *out, const uint8_t *dst, + size_t dst_len, const uint8_t *msg, + size_t msg_len) { + if (EC_GROUP_cmp(group, out->group, nullptr) != 0) { + OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS); + return 0; + } + return ec_encode_to_curve_p384_xmd_sha384_sswu(group, &out->raw, dst, dst_len, + msg, msg_len); +} + +int bssl::ec_hash_to_scalar_p384_xmd_sha384(const EC_GROUP *group, + EC_SCALAR *out, const uint8_t *dst, + size_t dst_len, const uint8_t *msg, + size_t msg_len) { + if (EC_GROUP_get_curve_name(group) != NID_secp384r1) { + OPENSSL_PUT_ERROR(EC, EC_R_GROUP_MISMATCH); + return 0; + } + + return hash_to_scalar(group, EVP_sha384(), out, dst, dst_len, /*k=*/192, msg, + msg_len); +} + +int bssl::ec_hash_to_curve_p384_xmd_sha512_sswu_draft07( + const EC_GROUP *group, EC_JACOBIAN *out, const uint8_t *dst, size_t dst_len, + const uint8_t *msg, size_t msg_len) { + // See section 8.3 of draft-irtf-cfrg-hash-to-curve-07. 
+ if (EC_GROUP_get_curve_name(group) != NID_secp384r1) { + OPENSSL_PUT_ERROR(EC, EC_R_GROUP_MISMATCH); + return 0; + } + + // Z = -12, c2 = sqrt(12) + EC_FELEM Z, c2; + if (!felem_from_u8(group, &Z, 12) || + !ec_felem_from_bytes(group, &c2, kP384Sqrt12, sizeof(kP384Sqrt12))) { + return 0; + } + ec_felem_neg(group, &Z, &Z); + + return hash_to_curve(group, EVP_sha512(), &Z, &c2, /*k=*/192, out, dst, + dst_len, msg, msg_len); +} + +int bssl::ec_hash_to_scalar_p384_xmd_sha512_draft07( + const EC_GROUP *group, EC_SCALAR *out, const uint8_t *dst, size_t dst_len, + const uint8_t *msg, size_t msg_len) { + if (EC_GROUP_get_curve_name(group) != NID_secp384r1) { + OPENSSL_PUT_ERROR(EC, EC_R_GROUP_MISMATCH); + return 0; + } + + return hash_to_scalar(group, EVP_sha512(), out, dst, dst_len, /*k=*/192, msg, + msg_len); +} diff --git a/third_party/boringssl/src/crypto/ec/internal.h b/third_party/boringssl/src/crypto/ec/internal.h new file mode 100644 index 00000000..6e464854 --- /dev/null +++ b/third_party/boringssl/src/crypto/ec/internal.h @@ -0,0 +1,114 @@ +// Copyright 2020 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_CRYPTO_EC_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_EC_INTERNAL_H + +#include + +#include + +#include "../fipsmodule/ec/internal.h" + + +BSSL_NAMESPACE_BEGIN + +// Parsing functions. + +// ec_key_parse_curve_name behaves like |EC_KEY_parse_curve_name| but only +// supports the groups in |allowed_groups|. 
If no syntax errors were found but +// the group is unknown, it will fail with an error of |EC_R_UNKNOWN_GROUP|. +const EC_GROUP *ec_key_parse_curve_name( + CBS *cbs, bssl::Span allowed_groups); + +// ec_key_parse_parameters behaves like |EC_KEY_parse_parameters| but only +// supports the groups in |allowed_groups|. If no syntax errors were found but +// the group is unknown, it will fail with an error of |EC_R_UNKNOWN_GROUP|. +const EC_GROUP *ec_key_parse_parameters( + CBS *cbs, bssl::Span allowed_groups); + +// ec_key_parse_private_key behaves like |EC_KEY_parse_private_key| but only +// supports the groups in |allowed_groups|. If |group| is non-NULL, +// |allowed_groups| is ignored and instead only |group| is supported. +// +// TODO(crbug.com/boringssl/414361735): This should return a bssl::UniquePtr, +// but cannot until it is made C++ linkage. +EC_KEY *ec_key_parse_private_key( + CBS *cbs, const EC_GROUP *group, + bssl::Span allowed_groups); + + +// Hash-to-curve. +// +// Internal |EC_JACOBIAN| versions of the corresponding public APIs. + +// ec_hash_to_curve_p256_xmd_sha256_sswu hashes |msg| to a point on |group| and +// writes the result to |out|, implementing the P256_XMD:SHA-256_SSWU_RO_ suite +// from RFC 9380. It returns one on success and zero on error. +OPENSSL_EXPORT int ec_hash_to_curve_p256_xmd_sha256_sswu( + const EC_GROUP *group, EC_JACOBIAN *out, const uint8_t *dst, size_t dst_len, + const uint8_t *msg, size_t msg_len); + +// ec_hash_to_curve_p384_xmd_sha384_sswu hashes |msg| to a point on |group| and +// writes the result to |out|, implementing the P384_XMD:SHA-384_SSWU_RO_ suite +// from RFC 9380. It returns one on success and zero on error. 
+OPENSSL_EXPORT int ec_hash_to_curve_p384_xmd_sha384_sswu( + const EC_GROUP *group, EC_JACOBIAN *out, const uint8_t *dst, size_t dst_len, + const uint8_t *msg, size_t msg_len); + +// ec_encode_to_curve_p256_xmd_sha256_sswu hashes |msg| to a point on |group| +// and writes the result to |out|, implementing the P256_XMD:SHA-256_SSWU_NU_ +// suite from RFC 9380. It returns one on success and zero on error. +OPENSSL_EXPORT int ec_encode_to_curve_p256_xmd_sha256_sswu( + const EC_GROUP *group, EC_JACOBIAN *out, const uint8_t *dst, size_t dst_len, + const uint8_t *msg, size_t msg_len); + +// ec_encode_to_curve_p384_xmd_sha384_sswu hashes |msg| to a point on |group| +// and writes the result to |out|, implementing the P384_XMD:SHA-384_SSWU_NU_ +// suite from RFC 9380. It returns one on success and zero on error. +OPENSSL_EXPORT int ec_encode_to_curve_p384_xmd_sha384_sswu( + const EC_GROUP *group, EC_JACOBIAN *out, const uint8_t *dst, size_t dst_len, + const uint8_t *msg, size_t msg_len); + +// ec_hash_to_scalar_p384_xmd_sha384 hashes |msg| to a scalar on |group| +// and writes the result to |out|, using the hash_to_field operation from the +// P384_XMD:SHA-384_SSWU_RO_ suite from RFC 9380, but generating a value modulo +// the group order rather than a field element. +OPENSSL_EXPORT int ec_hash_to_scalar_p384_xmd_sha384( + const EC_GROUP *group, EC_SCALAR *out, const uint8_t *dst, size_t dst_len, + const uint8_t *msg, size_t msg_len); + +// ec_hash_to_curve_p384_xmd_sha512_sswu_draft07 hashes |msg| to a point on +// |group| and writes the result to |out|, implementing the +// P384_XMD:SHA-512_SSWU_RO_ suite from draft-irtf-cfrg-hash-to-curve-07. It +// returns one on success and zero on error. +// +// TODO(https://crbug.com/1414562): Migrate this to the final version. 
+OPENSSL_EXPORT int ec_hash_to_curve_p384_xmd_sha512_sswu_draft07( + const EC_GROUP *group, EC_JACOBIAN *out, const uint8_t *dst, size_t dst_len, + const uint8_t *msg, size_t msg_len); + +// ec_hash_to_scalar_p384_xmd_sha512_draft07 hashes |msg| to a scalar on |group| +// and writes the result to |out|, using the hash_to_field operation from the +// P384_XMD:SHA-512_SSWU_RO_ suite from draft-irtf-cfrg-hash-to-curve-07, but +// generating a value modulo the group order rather than a field element. +// +// TODO(https://crbug.com/1414562): Migrate this to the final version. +OPENSSL_EXPORT int ec_hash_to_scalar_p384_xmd_sha512_draft07( + const EC_GROUP *group, EC_SCALAR *out, const uint8_t *dst, size_t dst_len, + const uint8_t *msg, size_t msg_len); + +BSSL_NAMESPACE_END + +#endif // OPENSSL_HEADER_CRYPTO_EC_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/ec_extra/ec_asn1.c b/third_party/boringssl/src/crypto/ec_extra/ec_asn1.c deleted file mode 100644 index 56cbbed1..00000000 --- a/third_party/boringssl/src/crypto/ec_extra/ec_asn1.c +++ /dev/null @@ -1,559 +0,0 @@ -/* Written by Nils Larsch for the OpenSSL project. */ -/* ==================================================================== - * Copyright (c) 2000-2003 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. 
All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * licensing@OpenSSL.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). 
*/ - -#include - -#include -#include - -#include -#include -#include -#include -#include - -#include "../fipsmodule/ec/internal.h" -#include "../bytestring/internal.h" -#include "../internal.h" - - -static const unsigned kParametersTag = - CBS_ASN1_CONSTRUCTED | CBS_ASN1_CONTEXT_SPECIFIC | 0; -static const unsigned kPublicKeyTag = - CBS_ASN1_CONSTRUCTED | CBS_ASN1_CONTEXT_SPECIFIC | 1; - -EC_KEY *EC_KEY_parse_private_key(CBS *cbs, const EC_GROUP *group) { - CBS ec_private_key, private_key; - uint64_t version; - if (!CBS_get_asn1(cbs, &ec_private_key, CBS_ASN1_SEQUENCE) || - !CBS_get_asn1_uint64(&ec_private_key, &version) || - version != 1 || - !CBS_get_asn1(&ec_private_key, &private_key, CBS_ASN1_OCTETSTRING)) { - OPENSSL_PUT_ERROR(EC, EC_R_DECODE_ERROR); - return NULL; - } - - // Parse the optional parameters field. - EC_GROUP *inner_group = NULL; - EC_KEY *ret = NULL; - BIGNUM *priv_key = NULL; - if (CBS_peek_asn1_tag(&ec_private_key, kParametersTag)) { - // Per SEC 1, as an alternative to omitting it, one is allowed to specify - // this field and put in a NULL to mean inheriting this value. This was - // omitted in a previous version of this logic without problems, so leave it - // unimplemented. - CBS child; - if (!CBS_get_asn1(&ec_private_key, &child, kParametersTag)) { - OPENSSL_PUT_ERROR(EC, EC_R_DECODE_ERROR); - goto err; - } - inner_group = EC_KEY_parse_parameters(&child); - if (inner_group == NULL) { - goto err; - } - if (group == NULL) { - group = inner_group; - } else if (EC_GROUP_cmp(group, inner_group, NULL) != 0) { - // If a group was supplied externally, it must match. 
- OPENSSL_PUT_ERROR(EC, EC_R_GROUP_MISMATCH); - goto err; - } - if (CBS_len(&child) != 0) { - OPENSSL_PUT_ERROR(EC, EC_R_DECODE_ERROR); - goto err; - } - } - - if (group == NULL) { - OPENSSL_PUT_ERROR(EC, EC_R_MISSING_PARAMETERS); - goto err; - } - - ret = EC_KEY_new(); - if (ret == NULL || !EC_KEY_set_group(ret, group)) { - goto err; - } - - // Although RFC 5915 specifies the length of the key, OpenSSL historically - // got this wrong, so accept any length. See upstream's - // 30cd4ff294252c4b6a4b69cbef6a5b4117705d22. - priv_key = BN_bin2bn(CBS_data(&private_key), CBS_len(&private_key), NULL); - ret->pub_key = EC_POINT_new(group); - if (priv_key == NULL || ret->pub_key == NULL || - !EC_KEY_set_private_key(ret, priv_key)) { - goto err; - } - - if (CBS_peek_asn1_tag(&ec_private_key, kPublicKeyTag)) { - CBS child, public_key; - uint8_t padding; - if (!CBS_get_asn1(&ec_private_key, &child, kPublicKeyTag) || - !CBS_get_asn1(&child, &public_key, CBS_ASN1_BITSTRING) || - // As in a SubjectPublicKeyInfo, the byte-encoded public key is then - // encoded as a BIT STRING with bits ordered as in the DER encoding. - !CBS_get_u8(&public_key, &padding) || - padding != 0 || - // Explicitly check |public_key| is non-empty to save the conversion - // form later. - CBS_len(&public_key) == 0 || - !EC_POINT_oct2point(group, ret->pub_key, CBS_data(&public_key), - CBS_len(&public_key), NULL) || - CBS_len(&child) != 0) { - OPENSSL_PUT_ERROR(EC, EC_R_DECODE_ERROR); - goto err; - } - - // Save the point conversion form. - // TODO(davidben): Consider removing this. - ret->conv_form = - (point_conversion_form_t)(CBS_data(&public_key)[0] & ~0x01); - } else { - // Compute the public key instead. - if (!ec_point_mul_scalar_base(group, &ret->pub_key->raw, - &ret->priv_key->scalar)) { - goto err; - } - // Remember the original private-key-only encoding. - // TODO(davidben): Consider removing this. 
- ret->enc_flag |= EC_PKEY_NO_PUBKEY; - } - - if (CBS_len(&ec_private_key) != 0) { - OPENSSL_PUT_ERROR(EC, EC_R_DECODE_ERROR); - goto err; - } - - // Ensure the resulting key is valid. - if (!EC_KEY_check_key(ret)) { - goto err; - } - - BN_free(priv_key); - EC_GROUP_free(inner_group); - return ret; - -err: - EC_KEY_free(ret); - BN_free(priv_key); - EC_GROUP_free(inner_group); - return NULL; -} - -int EC_KEY_marshal_private_key(CBB *cbb, const EC_KEY *key, - unsigned enc_flags) { - if (key == NULL || key->group == NULL || key->priv_key == NULL) { - OPENSSL_PUT_ERROR(EC, ERR_R_PASSED_NULL_PARAMETER); - return 0; - } - - CBB ec_private_key, private_key; - if (!CBB_add_asn1(cbb, &ec_private_key, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1_uint64(&ec_private_key, 1 /* version */) || - !CBB_add_asn1(&ec_private_key, &private_key, CBS_ASN1_OCTETSTRING) || - !BN_bn2cbb_padded(&private_key, - BN_num_bytes(EC_GROUP_get0_order(key->group)), - EC_KEY_get0_private_key(key))) { - OPENSSL_PUT_ERROR(EC, EC_R_ENCODE_ERROR); - return 0; - } - - if (!(enc_flags & EC_PKEY_NO_PARAMETERS)) { - CBB child; - if (!CBB_add_asn1(&ec_private_key, &child, kParametersTag) || - !EC_KEY_marshal_curve_name(&child, key->group) || - !CBB_flush(&ec_private_key)) { - OPENSSL_PUT_ERROR(EC, EC_R_ENCODE_ERROR); - return 0; - } - } - - // TODO(fork): replace this flexibility with sensible default? - if (!(enc_flags & EC_PKEY_NO_PUBKEY) && key->pub_key != NULL) { - CBB child, public_key; - if (!CBB_add_asn1(&ec_private_key, &child, kPublicKeyTag) || - !CBB_add_asn1(&child, &public_key, CBS_ASN1_BITSTRING) || - // As in a SubjectPublicKeyInfo, the byte-encoded public key is then - // encoded as a BIT STRING with bits ordered as in the DER encoding. 
- !CBB_add_u8(&public_key, 0 /* padding */) || - !EC_POINT_point2cbb(&public_key, key->group, key->pub_key, - key->conv_form, NULL) || - !CBB_flush(&ec_private_key)) { - OPENSSL_PUT_ERROR(EC, EC_R_ENCODE_ERROR); - return 0; - } - } - - if (!CBB_flush(cbb)) { - OPENSSL_PUT_ERROR(EC, EC_R_ENCODE_ERROR); - return 0; - } - - return 1; -} - -// kPrimeFieldOID is the encoding of 1.2.840.10045.1.1. -static const uint8_t kPrimeField[] = {0x2a, 0x86, 0x48, 0xce, 0x3d, 0x01, 0x01}; - -static int parse_explicit_prime_curve(CBS *in, CBS *out_prime, CBS *out_a, - CBS *out_b, CBS *out_base_x, - CBS *out_base_y, CBS *out_order) { - // See RFC 3279, section 2.3.5. Note that RFC 3279 calls this structure an - // ECParameters while RFC 5480 calls it a SpecifiedECDomain. - CBS params, field_id, field_type, curve, base, cofactor; - int has_cofactor; - uint64_t version; - if (!CBS_get_asn1(in, ¶ms, CBS_ASN1_SEQUENCE) || - !CBS_get_asn1_uint64(¶ms, &version) || - version != 1 || - !CBS_get_asn1(¶ms, &field_id, CBS_ASN1_SEQUENCE) || - !CBS_get_asn1(&field_id, &field_type, CBS_ASN1_OBJECT) || - CBS_len(&field_type) != sizeof(kPrimeField) || - OPENSSL_memcmp(CBS_data(&field_type), kPrimeField, sizeof(kPrimeField)) != - 0 || - !CBS_get_asn1(&field_id, out_prime, CBS_ASN1_INTEGER) || - !CBS_is_unsigned_asn1_integer(out_prime) || - CBS_len(&field_id) != 0 || - !CBS_get_asn1(¶ms, &curve, CBS_ASN1_SEQUENCE) || - !CBS_get_asn1(&curve, out_a, CBS_ASN1_OCTETSTRING) || - !CBS_get_asn1(&curve, out_b, CBS_ASN1_OCTETSTRING) || - // |curve| has an optional BIT STRING seed which we ignore. 
- !CBS_get_optional_asn1(&curve, NULL, NULL, CBS_ASN1_BITSTRING) || - CBS_len(&curve) != 0 || - !CBS_get_asn1(¶ms, &base, CBS_ASN1_OCTETSTRING) || - !CBS_get_asn1(¶ms, out_order, CBS_ASN1_INTEGER) || - !CBS_is_unsigned_asn1_integer(out_order) || - !CBS_get_optional_asn1(¶ms, &cofactor, &has_cofactor, - CBS_ASN1_INTEGER) || - CBS_len(¶ms) != 0) { - OPENSSL_PUT_ERROR(EC, EC_R_DECODE_ERROR); - return 0; - } - - if (has_cofactor) { - // We only support prime-order curves so the cofactor must be one. - if (CBS_len(&cofactor) != 1 || - CBS_data(&cofactor)[0] != 1) { - OPENSSL_PUT_ERROR(EC, EC_R_UNKNOWN_GROUP); - return 0; - } - } - - // Require that the base point use uncompressed form. - uint8_t form; - if (!CBS_get_u8(&base, &form) || form != POINT_CONVERSION_UNCOMPRESSED) { - OPENSSL_PUT_ERROR(EC, EC_R_INVALID_FORM); - return 0; - } - - if (CBS_len(&base) % 2 != 0) { - OPENSSL_PUT_ERROR(EC, EC_R_DECODE_ERROR); - return 0; - } - size_t field_len = CBS_len(&base) / 2; - CBS_init(out_base_x, CBS_data(&base), field_len); - CBS_init(out_base_y, CBS_data(&base) + field_len, field_len); - - return 1; -} - -// integers_equal returns one if |a| and |b| are equal, up to leading zeros, and -// zero otherwise. -static int integers_equal(const CBS *a, const uint8_t *b, size_t b_len) { - // Remove leading zeros from |a| and |b|. - CBS a_copy = *a; - while (CBS_len(&a_copy) > 0 && CBS_data(&a_copy)[0] == 0) { - CBS_skip(&a_copy, 1); - } - while (b_len > 0 && b[0] == 0) { - b++; - b_len--; - } - return CBS_mem_equal(&a_copy, b, b_len); -} - -EC_GROUP *EC_KEY_parse_curve_name(CBS *cbs) { - CBS named_curve; - if (!CBS_get_asn1(cbs, &named_curve, CBS_ASN1_OBJECT)) { - OPENSSL_PUT_ERROR(EC, EC_R_DECODE_ERROR); - return NULL; - } - - // Look for a matching curve. 
- const struct built_in_curves *const curves = OPENSSL_built_in_curves(); - for (size_t i = 0; i < OPENSSL_NUM_BUILT_IN_CURVES; i++) { - const struct built_in_curve *curve = &curves->curves[i]; - if (CBS_len(&named_curve) == curve->oid_len && - OPENSSL_memcmp(CBS_data(&named_curve), curve->oid, curve->oid_len) == - 0) { - return EC_GROUP_new_by_curve_name(curve->nid); - } - } - - OPENSSL_PUT_ERROR(EC, EC_R_UNKNOWN_GROUP); - return NULL; -} - -int EC_KEY_marshal_curve_name(CBB *cbb, const EC_GROUP *group) { - int nid = EC_GROUP_get_curve_name(group); - if (nid == NID_undef) { - OPENSSL_PUT_ERROR(EC, EC_R_UNKNOWN_GROUP); - return 0; - } - - const struct built_in_curves *const curves = OPENSSL_built_in_curves(); - for (size_t i = 0; i < OPENSSL_NUM_BUILT_IN_CURVES; i++) { - const struct built_in_curve *curve = &curves->curves[i]; - if (curve->nid == nid) { - CBB child; - return CBB_add_asn1(cbb, &child, CBS_ASN1_OBJECT) && - CBB_add_bytes(&child, curve->oid, curve->oid_len) && - CBB_flush(cbb); - } - } - - OPENSSL_PUT_ERROR(EC, EC_R_UNKNOWN_GROUP); - return 0; -} - -EC_GROUP *EC_KEY_parse_parameters(CBS *cbs) { - if (!CBS_peek_asn1_tag(cbs, CBS_ASN1_SEQUENCE)) { - return EC_KEY_parse_curve_name(cbs); - } - - // OpenSSL sometimes produces ECPrivateKeys with explicitly-encoded versions - // of named curves. - // - // TODO(davidben): Remove support for this. - CBS prime, a, b, base_x, base_y, order; - if (!parse_explicit_prime_curve(cbs, &prime, &a, &b, &base_x, &base_y, - &order)) { - return NULL; - } - - // Look for a matching prime curve. - const struct built_in_curves *const curves = OPENSSL_built_in_curves(); - for (size_t i = 0; i < OPENSSL_NUM_BUILT_IN_CURVES; i++) { - const struct built_in_curve *curve = &curves->curves[i]; - const unsigned param_len = curve->param_len; - // |curve->params| is ordered p, a, b, x, y, order, each component - // zero-padded up to the field length. 
Although SEC 1 states that the - // Field-Element-to-Octet-String conversion also pads, OpenSSL mis-encodes - // |a| and |b|, so this comparison must allow omitting leading zeros. (This - // is relevant for P-521 whose |b| has a leading 0.) - if (integers_equal(&prime, curve->params, param_len) && - integers_equal(&a, curve->params + param_len, param_len) && - integers_equal(&b, curve->params + param_len * 2, param_len) && - integers_equal(&base_x, curve->params + param_len * 3, param_len) && - integers_equal(&base_y, curve->params + param_len * 4, param_len) && - integers_equal(&order, curve->params + param_len * 5, param_len)) { - return EC_GROUP_new_by_curve_name(curve->nid); - } - } - - OPENSSL_PUT_ERROR(EC, EC_R_UNKNOWN_GROUP); - return NULL; -} - -int EC_POINT_point2cbb(CBB *out, const EC_GROUP *group, const EC_POINT *point, - point_conversion_form_t form, BN_CTX *ctx) { - size_t len = EC_POINT_point2oct(group, point, form, NULL, 0, ctx); - if (len == 0) { - return 0; - } - uint8_t *p; - return CBB_add_space(out, &p, len) && - EC_POINT_point2oct(group, point, form, p, len, ctx) == len; -} - -EC_KEY *d2i_ECPrivateKey(EC_KEY **out, const uint8_t **inp, long len) { - // This function treats its |out| parameter differently from other |d2i| - // functions. If supplied, take the group from |*out|. 
- const EC_GROUP *group = NULL; - if (out != NULL && *out != NULL) { - group = EC_KEY_get0_group(*out); - } - - if (len < 0) { - OPENSSL_PUT_ERROR(EC, EC_R_DECODE_ERROR); - return NULL; - } - CBS cbs; - CBS_init(&cbs, *inp, (size_t)len); - EC_KEY *ret = EC_KEY_parse_private_key(&cbs, group); - if (ret == NULL) { - return NULL; - } - if (out != NULL) { - EC_KEY_free(*out); - *out = ret; - } - *inp = CBS_data(&cbs); - return ret; -} - -int i2d_ECPrivateKey(const EC_KEY *key, uint8_t **outp) { - CBB cbb; - if (!CBB_init(&cbb, 0) || - !EC_KEY_marshal_private_key(&cbb, key, EC_KEY_get_enc_flags(key))) { - CBB_cleanup(&cbb); - return -1; - } - return CBB_finish_i2d(&cbb, outp); -} - -EC_KEY *d2i_ECParameters(EC_KEY **out_key, const uint8_t **inp, long len) { - if (len < 0) { - return NULL; - } - - CBS cbs; - CBS_init(&cbs, *inp, (size_t)len); - EC_GROUP *group = EC_KEY_parse_parameters(&cbs); - if (group == NULL) { - return NULL; - } - - EC_KEY *ret = EC_KEY_new(); - if (ret == NULL || !EC_KEY_set_group(ret, group)) { - EC_GROUP_free(group); - EC_KEY_free(ret); - return NULL; - } - EC_GROUP_free(group); - - if (out_key != NULL) { - EC_KEY_free(*out_key); - *out_key = ret; - } - *inp = CBS_data(&cbs); - return ret; -} - -int i2d_ECParameters(const EC_KEY *key, uint8_t **outp) { - if (key == NULL || key->group == NULL) { - OPENSSL_PUT_ERROR(EC, ERR_R_PASSED_NULL_PARAMETER); - return -1; - } - - CBB cbb; - if (!CBB_init(&cbb, 0) || - !EC_KEY_marshal_curve_name(&cbb, key->group)) { - CBB_cleanup(&cbb); - return -1; - } - return CBB_finish_i2d(&cbb, outp); -} - -EC_KEY *o2i_ECPublicKey(EC_KEY **keyp, const uint8_t **inp, long len) { - EC_KEY *ret = NULL; - - if (keyp == NULL || *keyp == NULL || (*keyp)->group == NULL) { - OPENSSL_PUT_ERROR(EC, ERR_R_PASSED_NULL_PARAMETER); - return NULL; - } - ret = *keyp; - if (ret->pub_key == NULL && - (ret->pub_key = EC_POINT_new(ret->group)) == NULL) { - OPENSSL_PUT_ERROR(EC, ERR_R_MALLOC_FAILURE); - return NULL; - } - if 
(!EC_POINT_oct2point(ret->group, ret->pub_key, *inp, len, NULL)) { - OPENSSL_PUT_ERROR(EC, ERR_R_EC_LIB); - return NULL; - } - // save the point conversion form - ret->conv_form = (point_conversion_form_t)(*inp[0] & ~0x01); - *inp += len; - return ret; -} - -int i2o_ECPublicKey(const EC_KEY *key, uint8_t **outp) { - size_t buf_len = 0; - int new_buffer = 0; - - if (key == NULL) { - OPENSSL_PUT_ERROR(EC, ERR_R_PASSED_NULL_PARAMETER); - return 0; - } - - buf_len = EC_POINT_point2oct(key->group, key->pub_key, key->conv_form, NULL, - 0, NULL); - - if (outp == NULL || buf_len == 0) { - // out == NULL => just return the length of the octet string - return buf_len; - } - - if (*outp == NULL) { - *outp = OPENSSL_malloc(buf_len); - if (*outp == NULL) { - OPENSSL_PUT_ERROR(EC, ERR_R_MALLOC_FAILURE); - return 0; - } - new_buffer = 1; - } - if (!EC_POINT_point2oct(key->group, key->pub_key, key->conv_form, *outp, - buf_len, NULL)) { - OPENSSL_PUT_ERROR(EC, ERR_R_EC_LIB); - if (new_buffer) { - OPENSSL_free(*outp); - *outp = NULL; - } - return 0; - } - - if (!new_buffer) { - *outp += buf_len; - } - return buf_len; -} diff --git a/third_party/boringssl/src/crypto/ec_extra/ec_derive.c b/third_party/boringssl/src/crypto/ec_extra/ec_derive.c deleted file mode 100644 index 6904d7bd..00000000 --- a/third_party/boringssl/src/crypto/ec_extra/ec_derive.c +++ /dev/null @@ -1,95 +0,0 @@ -/* Copyright (c) 2019, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include - -#include - -#include -#include -#include -#include -#include - -#include "../fipsmodule/ec/internal.h" - - -EC_KEY *EC_KEY_derive_from_secret(const EC_GROUP *group, const uint8_t *secret, - size_t secret_len) { -#define EC_KEY_DERIVE_MAX_NAME_LEN 16 - const char *name = EC_curve_nid2nist(EC_GROUP_get_curve_name(group)); - if (name == NULL || strlen(name) > EC_KEY_DERIVE_MAX_NAME_LEN) { - OPENSSL_PUT_ERROR(EC, EC_R_UNKNOWN_GROUP); - return NULL; - } - - // Assemble a label string to provide some key separation in case |secret| is - // misused, but ultimately it's on the caller to ensure |secret| is suitably - // separated. - static const char kLabel[] = "derive EC key "; - char info[sizeof(kLabel) + EC_KEY_DERIVE_MAX_NAME_LEN]; - OPENSSL_strlcpy(info, kLabel, sizeof(info)); - OPENSSL_strlcat(info, name, sizeof(info)); - - // Generate 128 bits beyond the group order so the bias is at most 2^-128. -#define EC_KEY_DERIVE_EXTRA_BITS 128 -#define EC_KEY_DERIVE_EXTRA_BYTES (EC_KEY_DERIVE_EXTRA_BITS / 8) - - if (EC_GROUP_order_bits(group) <= EC_KEY_DERIVE_EXTRA_BITS + 8) { - // The reduction strategy below requires the group order be large enough. - // (The actual bound is a bit tighter, but our curves are much larger than - // 128-bit.) 
- OPENSSL_PUT_ERROR(EC, ERR_R_INTERNAL_ERROR); - return NULL; - } - - uint8_t derived[EC_KEY_DERIVE_EXTRA_BYTES + EC_MAX_BYTES]; - size_t derived_len = BN_num_bytes(&group->order) + EC_KEY_DERIVE_EXTRA_BYTES; - assert(derived_len <= sizeof(derived)); - if (!HKDF(derived, derived_len, EVP_sha256(), secret, secret_len, - /*salt=*/NULL, /*salt_len=*/0, (const uint8_t *)info, - strlen(info))) { - return NULL; - } - - EC_KEY *key = EC_KEY_new(); - BN_CTX *ctx = BN_CTX_new(); - BIGNUM *priv = BN_bin2bn(derived, derived_len, NULL); - EC_POINT *pub = EC_POINT_new(group); - if (key == NULL || ctx == NULL || priv == NULL || pub == NULL || - // Reduce |priv| with Montgomery reduction. First, convert "from" - // Montgomery form to compute |priv| * R^-1 mod |order|. This requires - // |priv| be under order * R, which is true if the group order is large - // enough. 2^(num_bytes(order)) < 2^8 * order, so: - // - // priv < 2^8 * order * 2^128 < order * order < order * R - !BN_from_montgomery(priv, priv, group->order_mont, ctx) || - // Multiply by R^2 and do another Montgomery reduction to compute - // priv * R^-1 * R^2 * R^-1 = priv mod order. - !BN_to_montgomery(priv, priv, group->order_mont, ctx) || - !EC_POINT_mul(group, pub, priv, NULL, NULL, ctx) || - !EC_KEY_set_group(key, group) || !EC_KEY_set_public_key(key, pub) || - !EC_KEY_set_private_key(key, priv)) { - EC_KEY_free(key); - key = NULL; - goto err; - } - -err: - OPENSSL_cleanse(derived, sizeof(derived)); - BN_CTX_free(ctx); - BN_free(priv); - EC_POINT_free(pub); - return key; -} diff --git a/third_party/boringssl/src/crypto/ec_extra/hash_to_curve.c b/third_party/boringssl/src/crypto/ec_extra/hash_to_curve.c deleted file mode 100644 index fa7ff590..00000000 --- a/third_party/boringssl/src/crypto/ec_extra/hash_to_curve.c +++ /dev/null @@ -1,384 +0,0 @@ -/* Copyright (c) 2020, Google Inc. 
- * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include - -#include -#include -#include - -#include - -#include "internal.h" -#include "../fipsmodule/bn/internal.h" -#include "../fipsmodule/ec/internal.h" -#include "../internal.h" - - -// This file implements hash-to-curve, as described in -// draft-irtf-cfrg-hash-to-curve-07. -// -// This hash-to-curve implementation is written generically with the -// expectation that we will eventually wish to support other curves. If it -// becomes a performance bottleneck, some possible optimizations by -// specializing it to the curve: -// -// - Rather than using a generic |felem_exp|, specialize the exponentation to -// c2 with a faster addition chain. -// -// - |felem_mul| and |felem_sqr| are indirect calls to generic Montgomery -// code. Given the few curves, we could specialize -// |map_to_curve_simple_swu|. But doing this reasonably without duplicating -// code in C is difficult. (C++ templates would be useful here.) -// -// - P-521's Z and c2 have small power-of-two absolute values. We could save -// two multiplications in SSWU. (Other curves have reasonable values of Z -// and inconvenient c2.) This is unlikely to be worthwhile without C++ -// templates to make specializing more convenient. 
- -// expand_message_xmd implements the operation described in section 5.3.1 of -// draft-irtf-cfrg-hash-to-curve-07. It returns one on success and zero on -// allocation failure or if |out_len| was too large. -static int expand_message_xmd(const EVP_MD *md, uint8_t *out, size_t out_len, - const uint8_t *msg, size_t msg_len, - const uint8_t *dst, size_t dst_len) { - int ret = 0; - const size_t block_size = EVP_MD_block_size(md); - const size_t md_size = EVP_MD_size(md); - EVP_MD_CTX ctx; - EVP_MD_CTX_init(&ctx); - - // Long DSTs are hashed down to size. See section 5.3.3. - static_assert(EVP_MAX_MD_SIZE < 256, "hashed DST still too large"); - uint8_t dst_buf[EVP_MAX_MD_SIZE]; - if (dst_len >= 256) { - static const char kPrefix[] = "H2C-OVERSIZE-DST-"; - if (!EVP_DigestInit_ex(&ctx, md, NULL) || - !EVP_DigestUpdate(&ctx, kPrefix, sizeof(kPrefix) - 1) || - !EVP_DigestUpdate(&ctx, dst, dst_len) || - !EVP_DigestFinal_ex(&ctx, dst_buf, NULL)) { - goto err; - } - dst = dst_buf; - dst_len = md_size; - } - uint8_t dst_len_u8 = (uint8_t)dst_len; - - // Compute b_0. - static const uint8_t kZeros[EVP_MAX_MD_BLOCK_SIZE] = {0}; - // If |out_len| exceeds 16 bits then |i| will wrap below causing an error to - // be returned. This depends on the static assert above. - uint8_t l_i_b_str_zero[3] = {out_len >> 8, out_len, 0}; - uint8_t b_0[EVP_MAX_MD_SIZE]; - if (!EVP_DigestInit_ex(&ctx, md, NULL) || - !EVP_DigestUpdate(&ctx, kZeros, block_size) || - !EVP_DigestUpdate(&ctx, msg, msg_len) || - !EVP_DigestUpdate(&ctx, l_i_b_str_zero, sizeof(l_i_b_str_zero)) || - !EVP_DigestUpdate(&ctx, dst, dst_len) || - !EVP_DigestUpdate(&ctx, &dst_len_u8, 1) || - !EVP_DigestFinal_ex(&ctx, b_0, NULL)) { - goto err; - } - - uint8_t b_i[EVP_MAX_MD_SIZE]; - uint8_t i = 1; - while (out_len > 0) { - if (i == 0) { - // Input was too large. 
- OPENSSL_PUT_ERROR(EC, ERR_R_INTERNAL_ERROR); - goto err; - } - if (i > 1) { - for (size_t j = 0; j < md_size; j++) { - b_i[j] ^= b_0[j]; - } - } else { - OPENSSL_memcpy(b_i, b_0, md_size); - } - - if (!EVP_DigestInit_ex(&ctx, md, NULL) || - !EVP_DigestUpdate(&ctx, b_i, md_size) || - !EVP_DigestUpdate(&ctx, &i, 1) || - !EVP_DigestUpdate(&ctx, dst, dst_len) || - !EVP_DigestUpdate(&ctx, &dst_len_u8, 1) || - !EVP_DigestFinal_ex(&ctx, b_i, NULL)) { - goto err; - } - - size_t todo = out_len >= md_size ? md_size : out_len; - OPENSSL_memcpy(out, b_i, todo); - out += todo; - out_len -= todo; - i++; - } - - ret = 1; - -err: - EVP_MD_CTX_cleanup(&ctx); - return ret; -} - -// num_bytes_to_derive determines the number of bytes to derive when hashing to -// a number modulo |modulus|. See the hash_to_field operation defined in -// section 5.2 of draft-irtf-cfrg-hash-to-curve-07. -static int num_bytes_to_derive(size_t *out, const BIGNUM *modulus, unsigned k) { - size_t bits = BN_num_bits(modulus); - size_t L = (bits + k + 7) / 8; - // We require 2^(8*L) < 2^(2*bits - 2) <= n^2 so to fit in bounds for - // |felem_reduce| and |ec_scalar_reduce|. All defined hash-to-curve suites - // define |k| to be well under this bound. (|k| is usually around half of - // |p_bits|.) - if (L * 8 >= 2 * bits - 2 || - L > 2 * EC_MAX_BYTES) { - assert(0); - OPENSSL_PUT_ERROR(EC, ERR_R_INTERNAL_ERROR); - return 0; - } - - *out = L; - return 1; -} - -// big_endian_to_words decodes |in| as a big-endian integer and writes the -// result to |out|. |num_words| must be large enough to contain the output. -static void big_endian_to_words(BN_ULONG *out, size_t num_words, - const uint8_t *in, size_t len) { - assert(len <= num_words * sizeof(BN_ULONG)); - // Ensure any excess bytes are zeroed. 
- OPENSSL_memset(out, 0, num_words * sizeof(BN_ULONG)); - uint8_t *out_u8 = (uint8_t *)out; - for (size_t i = 0; i < len; i++) { - out_u8[len - 1 - i] = in[i]; - } -} - -// hash_to_field implements the operation described in section 5.2 -// of draft-irtf-cfrg-hash-to-curve-07, with count = 2. |k| is the security -// factor. -static int hash_to_field2(const EC_GROUP *group, const EVP_MD *md, - EC_FELEM *out1, EC_FELEM *out2, const uint8_t *dst, - size_t dst_len, unsigned k, const uint8_t *msg, - size_t msg_len) { - size_t L; - uint8_t buf[4 * EC_MAX_BYTES]; - if (!num_bytes_to_derive(&L, &group->field, k) || - !expand_message_xmd(md, buf, 2 * L, msg, msg_len, dst, dst_len)) { - return 0; - } - BN_ULONG words[2 * EC_MAX_WORDS]; - size_t num_words = 2 * group->field.width; - big_endian_to_words(words, num_words, buf, L); - group->meth->felem_reduce(group, out1, words, num_words); - big_endian_to_words(words, num_words, buf + L, L); - group->meth->felem_reduce(group, out2, words, num_words); - return 1; -} - -// hash_to_scalar behaves like |hash_to_field2| but returns a value modulo the -// group order rather than a field element. |k| is the security factor. 
-static int hash_to_scalar(const EC_GROUP *group, const EVP_MD *md, - EC_SCALAR *out, const uint8_t *dst, size_t dst_len, - unsigned k, const uint8_t *msg, size_t msg_len) { - size_t L; - uint8_t buf[EC_MAX_BYTES * 2]; - if (!num_bytes_to_derive(&L, &group->order, k) || - !expand_message_xmd(md, buf, L, msg, msg_len, dst, dst_len)) { - return 0; - } - - BN_ULONG words[2 * EC_MAX_WORDS]; - size_t num_words = 2 * group->order.width; - big_endian_to_words(words, num_words, buf, L); - ec_scalar_reduce(group, out, words, num_words); - return 1; -} - -static inline void mul_A(const EC_GROUP *group, EC_FELEM *out, - const EC_FELEM *in) { - assert(group->a_is_minus3); - EC_FELEM tmp; - ec_felem_add(group, &tmp, in, in); // tmp = 2*in - ec_felem_add(group, &tmp, &tmp, &tmp); // tmp = 4*in - ec_felem_sub(group, out, in, &tmp); // out = -3*in -} - -static inline void mul_minus_A(const EC_GROUP *group, EC_FELEM *out, - const EC_FELEM *in) { - assert(group->a_is_minus3); - EC_FELEM tmp; - ec_felem_add(group, &tmp, in, in); // tmp = 2*in - ec_felem_add(group, out, &tmp, in); // out = 3*in -} - -// sgn0_le implements the operation described in section 4.1.2 of -// draft-irtf-cfrg-hash-to-curve-07. -static BN_ULONG sgn0_le(const EC_GROUP *group, const EC_FELEM *a) { - uint8_t buf[EC_MAX_BYTES]; - size_t len; - ec_felem_to_bytes(group, buf, &len, a); - return buf[len - 1] & 1; -} - -// map_to_curve_simple_swu implements the operation described in section 6.6.2 -// of draft-irtf-cfrg-hash-to-curve-07, using the optimization in appendix -// D.2.1. It returns one on success and zero on error. 
-static int map_to_curve_simple_swu(const EC_GROUP *group, const EC_FELEM *Z, - const BN_ULONG *c1, size_t num_c1, - const EC_FELEM *c2, EC_RAW_POINT *out, - const EC_FELEM *u) { - void (*const felem_mul)(const EC_GROUP *, EC_FELEM *r, const EC_FELEM *a, - const EC_FELEM *b) = group->meth->felem_mul; - void (*const felem_sqr)(const EC_GROUP *, EC_FELEM *r, const EC_FELEM *a) = - group->meth->felem_sqr; - - // This function requires the prime be 3 mod 4, and that A = -3. - if (group->field.width == 0 || (group->field.d[0] & 3) != 3 || - !group->a_is_minus3) { - OPENSSL_PUT_ERROR(EC, ERR_R_INTERNAL_ERROR); - return 0; - } - - EC_FELEM tv1, tv2, tv3, tv4, xd, x1n, x2n, tmp, gxd, gx1, y1, y2; - felem_sqr(group, &tv1, u); // tv1 = u^2 - felem_mul(group, &tv3, Z, &tv1); // tv3 = Z * tv1 - felem_sqr(group, &tv2, &tv3); // tv2 = tv3^2 - ec_felem_add(group, &xd, &tv2, &tv3); // xd = tv2 + tv3 - ec_felem_add(group, &x1n, &xd, &group->one); // x1n = xd + 1 - felem_mul(group, &x1n, &x1n, &group->b); // x1n = x1n * B - mul_minus_A(group, &xd, &xd); // xd = -A * xd - BN_ULONG e1 = ec_felem_non_zero_mask(group, &xd); // e1 = xd == 0 [flipped] - mul_A(group, &tmp, Z); - ec_felem_select(group, &xd, e1, &xd, &tmp); // xd = CMOV(xd, Z * A, e1) - felem_sqr(group, &tv2, &xd); // tv2 = xd^2 - felem_mul(group, &gxd, &tv2, &xd); // gxd = tv2 * xd = xd^3 - mul_A(group, &tv2, &tv2); // tv2 = A * tv2 - felem_sqr(group, &gx1, &x1n); // gx1 = x1n^2 - ec_felem_add(group, &gx1, &gx1, &tv2); // gx1 = gx1 + tv2 - felem_mul(group, &gx1, &gx1, &x1n); // gx1 = gx1 * x1n - felem_mul(group, &tv2, &group->b, &gxd); // tv2 = B * gxd - ec_felem_add(group, &gx1, &gx1, &tv2); // gx1 = gx1 + tv2 - felem_sqr(group, &tv4, &gxd); // tv4 = gxd^2 - felem_mul(group, &tv2, &gx1, &gxd); // tv2 = gx1 * gxd - felem_mul(group, &tv4, &tv4, &tv2); // tv4 = tv4 * tv2 - group->meth->felem_exp(group, &y1, &tv4, c1, num_c1); // y1 = tv4^c1 - felem_mul(group, &y1, &y1, &tv2); // y1 = y1 * tv2 - felem_mul(group, &x2n, &tv3, 
&x1n); // x2n = tv3 * x1n - felem_mul(group, &y2, &y1, c2); // y2 = y1 * c2 - felem_mul(group, &y2, &y2, &tv1); // y2 = y2 * tv1 - felem_mul(group, &y2, &y2, u); // y2 = y2 * u - felem_sqr(group, &tv2, &y1); // tv2 = y1^2 - felem_mul(group, &tv2, &tv2, &gxd); // tv2 = tv2 * gxd - ec_felem_sub(group, &tv3, &tv2, &gx1); - BN_ULONG e2 = - ec_felem_non_zero_mask(group, &tv3); // e2 = tv2 == gx1 [flipped] - ec_felem_select(group, &x1n, e2, &x2n, &x1n); // xn = CMOV(x2n, x1n, e2) - ec_felem_select(group, &y1, e2, &y2, &y1); // y = CMOV(y2, y1, e2) - BN_ULONG sgn0_u = sgn0_le(group, u); - BN_ULONG sgn0_y = sgn0_le(group, &y1); - BN_ULONG e3 = sgn0_u ^ sgn0_y; - e3 = ((BN_ULONG)0) - e3; // e3 = sgn0(u) == sgn0(y) [flipped] - ec_felem_neg(group, &y2, &y1); - ec_felem_select(group, &y1, e3, &y2, &y1); // y = CMOV(-y, y, e3) - - // Appendix D.1 describes how to convert (x1n, xd, y1, 1) to Jacobian - // coordinates. Note yd = 1. Also note that gxd computed above is xd^3. - felem_mul(group, &out->X, &x1n, &xd); // X = xn * xd - felem_mul(group, &out->Y, &y1, &gxd); // Y = yn * gxd = yn * xd^3 - out->Z = xd; // Z = xd - return 1; -} - -static int hash_to_curve(const EC_GROUP *group, const EVP_MD *md, - const EC_FELEM *Z, const EC_FELEM *c2, unsigned k, - EC_RAW_POINT *out, const uint8_t *dst, size_t dst_len, - const uint8_t *msg, size_t msg_len) { - EC_FELEM u0, u1; - if (!hash_to_field2(group, md, &u0, &u1, dst, dst_len, k, msg, msg_len)) { - return 0; - } - - // Compute |c1| = (p - 3) / 4. - BN_ULONG c1[EC_MAX_WORDS]; - size_t num_c1 = group->field.width; - if (!bn_copy_words(c1, num_c1, &group->field)) { - return 0; - } - bn_rshift_words(c1, c1, /*shift=*/2, /*num=*/num_c1); - - EC_RAW_POINT Q0, Q1; - if (!map_to_curve_simple_swu(group, Z, c1, num_c1, c2, &Q0, &u0) || - !map_to_curve_simple_swu(group, Z, c1, num_c1, c2, &Q1, &u1)) { - return 0; - } - - group->meth->add(group, out, &Q0, &Q1); // R = Q0 + Q1 - // All our curves have cofactor one, so |clear_cofactor| is a no-op. 
- return 1; -} - -static int felem_from_u8(const EC_GROUP *group, EC_FELEM *out, uint8_t a) { - uint8_t bytes[EC_MAX_BYTES] = {0}; - size_t len = BN_num_bytes(&group->field); - bytes[len - 1] = a; - return ec_felem_from_bytes(group, out, bytes, len); -} - -int ec_hash_to_curve_p384_xmd_sha512_sswu_draft07( - const EC_GROUP *group, EC_RAW_POINT *out, const uint8_t *dst, - size_t dst_len, const uint8_t *msg, size_t msg_len) { - // See section 8.3 of draft-irtf-cfrg-hash-to-curve-07. - if (EC_GROUP_get_curve_name(group) != NID_secp384r1) { - OPENSSL_PUT_ERROR(EC, EC_R_GROUP_MISMATCH); - return 0; - } - - // kSqrt1728 was computed as follows in python3: - // - // p = 2**384 - 2**128 - 2**96 + 2**32 - 1 - // z3 = 12**3 - // c2 = pow(z3, (p+1)//4, p) - // assert z3 == pow(c2, 2, p) - // ", ".join("0x%02x" % b for b in c2.to_bytes(384//8, 'big') - - static const uint8_t kSqrt1728[] = { - 0x01, 0x98, 0x77, 0xcc, 0x10, 0x41, 0xb7, 0x55, 0x57, 0x43, 0xc0, 0xae, - 0x2e, 0x3a, 0x3e, 0x61, 0xfb, 0x2a, 0xaa, 0x2e, 0x0e, 0x87, 0xea, 0x55, - 0x7a, 0x56, 0x3d, 0x8b, 0x59, 0x8a, 0x09, 0x40, 0xd0, 0xa6, 0x97, 0xa9, - 0xe0, 0xb9, 0xe9, 0x2c, 0xfa, 0xa3, 0x14, 0xf5, 0x83, 0xc9, 0xd0, 0x66 - }; - - // Z = -12, c2 = sqrt(1728) - EC_FELEM Z, c2; - if (!felem_from_u8(group, &Z, 12) || - !ec_felem_from_bytes(group, &c2, kSqrt1728, sizeof(kSqrt1728))) { - return 0; - } - ec_felem_neg(group, &Z, &Z); - - return hash_to_curve(group, EVP_sha512(), &Z, &c2, /*k=*/192, out, dst, - dst_len, msg, msg_len); -} - -int ec_hash_to_scalar_p384_xmd_sha512_draft07( - const EC_GROUP *group, EC_SCALAR *out, const uint8_t *dst, size_t dst_len, - const uint8_t *msg, size_t msg_len) { - if (EC_GROUP_get_curve_name(group) != NID_secp384r1) { - OPENSSL_PUT_ERROR(EC, EC_R_GROUP_MISMATCH); - return 0; - } - - return hash_to_scalar(group, EVP_sha512(), out, dst, dst_len, /*k=*/192, msg, - msg_len); -} diff --git a/third_party/boringssl/src/crypto/ec_extra/internal.h 
b/third_party/boringssl/src/crypto/ec_extra/internal.h deleted file mode 100644 index 55314ac7..00000000 --- a/third_party/boringssl/src/crypto/ec_extra/internal.h +++ /dev/null @@ -1,56 +0,0 @@ -/* Copyright (c) 2020, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#ifndef OPENSSL_HEADER_EC_EXTRA_INTERNAL_H -#define OPENSSL_HEADER_EC_EXTRA_INTERNAL_H - -#include - -#include "../fipsmodule/ec/internal.h" - -#if defined(__cplusplus) -extern "C" { -#endif - - -// Hash-to-curve. -// -// The following functions implement primitives from -// draft-irtf-cfrg-hash-to-curve. The |dst| parameter in each function is the -// domain separation tag and must be unique for each protocol and between the -// |hash_to_curve| and |hash_to_scalar| variants. See section 3.1 of the spec -// for additional guidance on this parameter. - -// ec_hash_to_curve_p384_xmd_sha512_sswu_draft07 hashes |msg| to a point on -// |group| and writes the result to |out|, implementing the -// P384_XMD:SHA-512_SSWU_RO_ suite from draft-irtf-cfrg-hash-to-curve-07. It -// returns one on success and zero on error. 
-OPENSSL_EXPORT int ec_hash_to_curve_p384_xmd_sha512_sswu_draft07( - const EC_GROUP *group, EC_RAW_POINT *out, const uint8_t *dst, - size_t dst_len, const uint8_t *msg, size_t msg_len); - -// ec_hash_to_scalar_p384_xmd_sha512_draft07 hashes |msg| to a scalar on |group| -// and writes the result to |out|, using the hash_to_field operation from the -// P384_XMD:SHA-512_SSWU_RO_ suite from draft-irtf-cfrg-hash-to-curve-07, but -// generating a value modulo the group order rather than a field element. -OPENSSL_EXPORT int ec_hash_to_scalar_p384_xmd_sha512_draft07( - const EC_GROUP *group, EC_SCALAR *out, const uint8_t *dst, size_t dst_len, - const uint8_t *msg, size_t msg_len); - - -#if defined(__cplusplus) -} // extern C -#endif - -#endif // OPENSSL_HEADER_EC_EXTRA_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/ecdh/ecdh.cc b/third_party/boringssl/src/crypto/ecdh/ecdh.cc new file mode 100644 index 00000000..cbd363df --- /dev/null +++ b/third_party/boringssl/src/crypto/ecdh/ecdh.cc @@ -0,0 +1,75 @@ +// Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved. +// Copyright (c) 2002, Oracle and/or its affiliates. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include + +#include +#include +#include + +#include "../fipsmodule/ec/internal.h" +#include "../internal.h" + + +using namespace bssl; + +int ECDH_compute_key(void *out, size_t out_len, const EC_POINT *pub_key, + const EC_KEY *priv_key, + void *(*kdf)(const void *in, size_t inlen, void *out, + size_t *out_len)) { + if (FromOpaque(priv_key)->priv_key == nullptr) { + OPENSSL_PUT_ERROR(ECDH, ECDH_R_NO_PRIVATE_VALUE); + return -1; + } + const EC_SCALAR *const priv = &FromOpaque(priv_key)->priv_key->scalar; + const EC_GROUP *const group = EC_KEY_get0_group(priv_key); + if (EC_GROUP_cmp(group, pub_key->group, nullptr) != 0) { + OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS); + return -1; + } + + EC_JACOBIAN shared_point; + uint8_t buf[EC_MAX_BYTES]; + size_t buf_len; + if (!ec_point_mul_scalar(group, &shared_point, &pub_key->raw, priv) || + !ec_get_x_coordinate_as_bytes(group, buf, &buf_len, sizeof(buf), + &shared_point)) { + OPENSSL_PUT_ERROR(ECDH, ECDH_R_POINT_ARITHMETIC_FAILURE); + return -1; + } + + if (kdf != nullptr) { + if (kdf(buf, buf_len, out, &out_len) == nullptr) { + OPENSSL_PUT_ERROR(ECDH, ECDH_R_KDF_FAILED); + return -1; + } + } else { + // no KDF, just copy as much as we can + if (buf_len < out_len) { + out_len = buf_len; + } + OPENSSL_memcpy(out, buf, out_len); + } + + if (out_len > INT_MAX) { + OPENSSL_PUT_ERROR(ECDH, ERR_R_OVERFLOW); + return -1; + } + + return (int)out_len; +} diff --git a/third_party/boringssl/src/crypto/ecdh_extra/ecdh_extra.c b/third_party/boringssl/src/crypto/ecdh_extra/ecdh_extra.c deleted file mode 100644 index 237d973a..00000000 --- a/third_party/boringssl/src/crypto/ecdh_extra/ecdh_extra.c +++ /dev/null @@ -1,124 +0,0 @@ -/* ==================================================================== - * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED. 
- * - * The Elliptic Curve Public-Key Crypto Library (ECC Code) included - * herein is developed by SUN MICROSYSTEMS, INC., and is contributed - * to the OpenSSL project. - * - * The ECC Code is licensed pursuant to the OpenSSL open source - * license provided below. - * - * The ECDH software is originally written by Douglas Stebila of - * Sun Microsystems Laboratories. - * - */ -/* ==================================================================== - * Copyright (c) 2000-2002 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * licensing@OpenSSL.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. 
Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). 
*/ - -#include - -#include -#include - -#include -#include -#include - -#include "../fipsmodule/ec/internal.h" -#include "../internal.h" - - -int ECDH_compute_key(void *out, size_t out_len, const EC_POINT *pub_key, - const EC_KEY *priv_key, - void *(*kdf)(const void *in, size_t inlen, void *out, - size_t *out_len)) { - if (priv_key->priv_key == NULL) { - OPENSSL_PUT_ERROR(ECDH, ECDH_R_NO_PRIVATE_VALUE); - return -1; - } - const EC_SCALAR *const priv = &priv_key->priv_key->scalar; - const EC_GROUP *const group = EC_KEY_get0_group(priv_key); - if (EC_GROUP_cmp(group, pub_key->group, NULL) != 0) { - OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS); - return -1; - } - - EC_RAW_POINT shared_point; - uint8_t buf[EC_MAX_BYTES]; - size_t buf_len; - if (!ec_point_mul_scalar(group, &shared_point, &pub_key->raw, priv) || - !ec_get_x_coordinate_as_bytes(group, buf, &buf_len, sizeof(buf), - &shared_point)) { - OPENSSL_PUT_ERROR(ECDH, ECDH_R_POINT_ARITHMETIC_FAILURE); - return -1; - } - - if (kdf != NULL) { - if (kdf(buf, buf_len, out, &out_len) == NULL) { - OPENSSL_PUT_ERROR(ECDH, ECDH_R_KDF_FAILED); - return -1; - } - } else { - // no KDF, just copy as much as we can - if (buf_len < out_len) { - out_len = buf_len; - } - OPENSSL_memcpy(out, buf, out_len); - } - - if (out_len > INT_MAX) { - OPENSSL_PUT_ERROR(ECDH, ERR_R_OVERFLOW); - return -1; - } - - return (int)out_len; -} diff --git a/third_party/boringssl/src/crypto/ecdsa/ecdsa_asn1.cc b/third_party/boringssl/src/crypto/ecdsa/ecdsa_asn1.cc new file mode 100644 index 00000000..d4d0b4b6 --- /dev/null +++ b/third_party/boringssl/src/crypto/ecdsa/ecdsa_asn1.cc @@ -0,0 +1,337 @@ +// Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include "../bytestring/internal.h" +#include "../fipsmodule/ecdsa/internal.h" +#include "../internal.h" +#include "../mem_internal.h" + + +using namespace bssl; + +static ECDSA_SIG *ecdsa_sig_from_fixed(const EC_KEY *key, const uint8_t *in, + size_t len) { + const EC_GROUP *group = EC_KEY_get0_group(key); + if (group == nullptr) { + OPENSSL_PUT_ERROR(ECDSA, ERR_R_PASSED_NULL_PARAMETER); + return nullptr; + } + size_t scalar_len = BN_num_bytes(EC_GROUP_get0_order(group)); + if (len != 2 * scalar_len) { + OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_BAD_SIGNATURE); + return nullptr; + } + ECDSA_SIG *ret = ECDSA_SIG_new(); + if (ret == nullptr || !BN_bin2bn(in, scalar_len, ret->r) || + !BN_bin2bn(in + scalar_len, scalar_len, ret->s)) { + ECDSA_SIG_free(ret); + return nullptr; + } + return ret; +} + +static int ecdsa_sig_to_fixed(const EC_KEY *key, uint8_t *out, size_t *out_len, + size_t max_out, const ECDSA_SIG *sig) { + const EC_GROUP *group = EC_KEY_get0_group(key); + if (group == nullptr) { + OPENSSL_PUT_ERROR(ECDSA, ERR_R_PASSED_NULL_PARAMETER); + return 0; + } + size_t scalar_len = BN_num_bytes(EC_GROUP_get0_order(group)); + if (max_out < 2 * scalar_len) { + OPENSSL_PUT_ERROR(EC, EC_R_BUFFER_TOO_SMALL); + return 0; + } + if (BN_is_negative(sig->r) || !BN_bn2bin_padded(out, scalar_len, sig->r) || + BN_is_negative(sig->s) || + !BN_bn2bin_padded(out + scalar_len, scalar_len, sig->s)) { + OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_BAD_SIGNATURE); + return 0; + } + *out_len = 2 * 
scalar_len; + return 1; +} + +int ECDSA_sign(int type, const uint8_t *digest, size_t digest_len, uint8_t *sig, + unsigned int *out_sig_len, const EC_KEY *eckey) { + const ECKey *eckey_impl = FromOpaque(eckey); + if (eckey_impl->ecdsa_meth && eckey_impl->ecdsa_meth->sign) { + return eckey_impl->ecdsa_meth->sign(digest, digest_len, sig, out_sig_len, + (EC_KEY *)eckey /* cast away const */); + } + + *out_sig_len = 0; + uint8_t fixed[ECDSA_MAX_FIXED_LEN]; + size_t fixed_len; + if (!ecdsa_sign_fixed(digest, digest_len, fixed, &fixed_len, sizeof(fixed), + eckey)) { + return 0; + } + + // TODO(davidben): We can actually do better and go straight from the DER + // format to the fixed-width format without a malloc. + UniquePtr s(ecdsa_sig_from_fixed(eckey, fixed, fixed_len)); + if (s == nullptr) { + return 0; + } + + CBB cbb; + CBB_init_fixed(&cbb, sig, ECDSA_size(eckey)); + size_t len; + if (!ECDSA_SIG_marshal(&cbb, s.get()) || !CBB_finish(&cbb, nullptr, &len)) { + OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_ENCODE_ERROR); + return 0; + } + *out_sig_len = static_cast(len); + return 1; +} + +int ECDSA_verify(int type, const uint8_t *digest, size_t digest_len, + const uint8_t *sig, size_t sig_len, const EC_KEY *eckey) { + // Decode the ECDSA signature. + // + // TODO(davidben): We can actually do better and go straight from the DER + // format to the fixed-width format without a malloc. + int ret = 0; + uint8_t *der = nullptr; + UniquePtr s(ECDSA_SIG_from_bytes(sig, sig_len)); + if (s == nullptr) { + goto err; + } + + // Defend against potential laxness in the DER parser. + size_t der_len; + if (!ECDSA_SIG_to_bytes(&der, &der_len, s.get()) || der_len != sig_len || + OPENSSL_memcmp(sig, der, sig_len) != 0) { + // This should never happen. crypto/bytestring is strictly DER. 
+ OPENSSL_PUT_ERROR(ECDSA, ERR_R_INTERNAL_ERROR); + goto err; + } + + uint8_t fixed[ECDSA_MAX_FIXED_LEN]; + size_t fixed_len; + ret = ecdsa_sig_to_fixed(eckey, fixed, &fixed_len, sizeof(fixed), s.get()) && + ecdsa_verify_fixed(digest, digest_len, fixed, fixed_len, eckey); + +err: + OPENSSL_free(der); + return ret; +} + + +size_t ECDSA_size(const EC_KEY *key) { + if (key == nullptr) { + return 0; + } + + const EC_GROUP *group = EC_KEY_get0_group(key); + if (group == nullptr) { + return 0; + } + + size_t group_order_size = BN_num_bytes(EC_GROUP_get0_order(group)); + return ECDSA_SIG_max_len(group_order_size); +} + +ECDSA_SIG *ECDSA_SIG_new() { + ECDSA_SIG *sig = New(); + if (sig == nullptr) { + return nullptr; + } + sig->r = BN_new(); + sig->s = BN_new(); + if (sig->r == nullptr || sig->s == nullptr) { + ECDSA_SIG_free(sig); + return nullptr; + } + return sig; +} + +void ECDSA_SIG_free(ECDSA_SIG *sig) { + if (sig == nullptr) { + return; + } + + BN_free(sig->r); + BN_free(sig->s); + Delete(sig); +} + +const BIGNUM *ECDSA_SIG_get0_r(const ECDSA_SIG *sig) { return sig->r; } + +const BIGNUM *ECDSA_SIG_get0_s(const ECDSA_SIG *sig) { return sig->s; } + +void ECDSA_SIG_get0(const ECDSA_SIG *sig, const BIGNUM **out_r, + const BIGNUM **out_s) { + if (out_r != nullptr) { + *out_r = sig->r; + } + if (out_s != nullptr) { + *out_s = sig->s; + } +} + +int ECDSA_SIG_set0(ECDSA_SIG *sig, BIGNUM *r, BIGNUM *s) { + if (r == nullptr || s == nullptr) { + return 0; + } + BN_free(sig->r); + BN_free(sig->s); + sig->r = r; + sig->s = s; + return 1; +} + +int ECDSA_do_verify(const uint8_t *digest, size_t digest_len, + const ECDSA_SIG *sig, const EC_KEY *eckey) { + uint8_t fixed[ECDSA_MAX_FIXED_LEN]; + size_t fixed_len; + return ecdsa_sig_to_fixed(eckey, fixed, &fixed_len, sizeof(fixed), sig) && + ecdsa_verify_fixed(digest, digest_len, fixed, fixed_len, eckey); +} + +// This function is only exported for testing and is not called in production +// code. 
+ECDSA_SIG *ECDSA_sign_with_nonce_and_leak_private_key_for_testing( + const uint8_t *digest, size_t digest_len, const EC_KEY *eckey, + const uint8_t *nonce, size_t nonce_len) { + uint8_t sig[ECDSA_MAX_FIXED_LEN]; + size_t sig_len; + if (!ecdsa_sign_fixed_with_nonce_for_known_answer_test( + digest, digest_len, sig, &sig_len, sizeof(sig), eckey, nonce, + nonce_len)) { + return nullptr; + } + + return ecdsa_sig_from_fixed(eckey, sig, sig_len); +} + +ECDSA_SIG *ECDSA_do_sign(const uint8_t *digest, size_t digest_len, + const EC_KEY *eckey) { + uint8_t sig[ECDSA_MAX_FIXED_LEN]; + size_t sig_len; + if (!ecdsa_sign_fixed(digest, digest_len, sig, &sig_len, sizeof(sig), + eckey)) { + return nullptr; + } + + return ecdsa_sig_from_fixed(eckey, sig, sig_len); +} + +ECDSA_SIG *ECDSA_SIG_parse(CBS *cbs) { + ECDSA_SIG *ret = ECDSA_SIG_new(); + if (ret == nullptr) { + return nullptr; + } + CBS child; + if (!CBS_get_asn1(cbs, &child, CBS_ASN1_SEQUENCE) || + !BN_parse_asn1_unsigned(&child, ret->r) || + !BN_parse_asn1_unsigned(&child, ret->s) || CBS_len(&child) != 0) { + OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_BAD_SIGNATURE); + ECDSA_SIG_free(ret); + return nullptr; + } + return ret; +} + +ECDSA_SIG *ECDSA_SIG_from_bytes(const uint8_t *in, size_t in_len) { + CBS cbs; + CBS_init(&cbs, in, in_len); + ECDSA_SIG *ret = ECDSA_SIG_parse(&cbs); + if (ret == nullptr || CBS_len(&cbs) != 0) { + OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_BAD_SIGNATURE); + ECDSA_SIG_free(ret); + return nullptr; + } + return ret; +} + +int ECDSA_SIG_marshal(CBB *cbb, const ECDSA_SIG *sig) { + CBB child; + if (!CBB_add_asn1(cbb, &child, CBS_ASN1_SEQUENCE) || + !BN_marshal_asn1(&child, sig->r) || !BN_marshal_asn1(&child, sig->s) || + !CBB_flush(cbb)) { + OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_ENCODE_ERROR); + return 0; + } + return 1; +} + +int ECDSA_SIG_to_bytes(uint8_t **out_bytes, size_t *out_len, + const ECDSA_SIG *sig) { + CBB cbb; + CBB_zero(&cbb); + if (!CBB_init(&cbb, 0) || !ECDSA_SIG_marshal(&cbb, sig) || + !CBB_finish(&cbb, 
out_bytes, out_len)) { + OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_ENCODE_ERROR); + CBB_cleanup(&cbb); + return 0; + } + return 1; +} + +// der_len_len returns the number of bytes needed to represent a length of |len| +// in DER. +static size_t der_len_len(size_t len) { + if (len < 0x80) { + return 1; + } + size_t ret = 1; + while (len > 0) { + ret++; + len >>= 8; + } + return ret; +} + +size_t ECDSA_SIG_max_len(size_t order_len) { + // Compute the maximum length of an |order_len| byte integer. Defensively + // assume that the leading 0x00 is included. + size_t integer_len = 1 /* tag */ + der_len_len(order_len + 1) + 1 + order_len; + if (integer_len < order_len) { + return 0; + } + // An ECDSA signature is two INTEGERs. + size_t value_len = 2 * integer_len; + if (value_len < integer_len) { + return 0; + } + // Add the header. + size_t ret = 1 /* tag */ + der_len_len(value_len) + value_len; + if (ret < value_len) { + return 0; + } + return ret; +} + +ECDSA_SIG *d2i_ECDSA_SIG(ECDSA_SIG **out, const uint8_t **inp, long len) { + return D2IFromCBS(out, inp, len, ECDSA_SIG_parse); +} + +int i2d_ECDSA_SIG(const ECDSA_SIG *sig, uint8_t **outp) { + return I2DFromCBB( + /*initial_capacity=*/64, outp, + [&](CBB *cbb) -> bool { return ECDSA_SIG_marshal(cbb, sig); }); +} diff --git a/third_party/boringssl/src/crypto/ecdsa/ecdsa_p1363.cc b/third_party/boringssl/src/crypto/ecdsa/ecdsa_p1363.cc new file mode 100644 index 00000000..91e40acf --- /dev/null +++ b/third_party/boringssl/src/crypto/ecdsa/ecdsa_p1363.cc @@ -0,0 +1,54 @@ +// Copyright 2025 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + +#include +#include +#include + +#include "../fipsmodule/ecdsa/internal.h" + + +using namespace bssl; + +int ECDSA_sign_p1363(const uint8_t *digest, size_t digest_len, uint8_t *sig, + size_t *out_sig_len, size_t max_sig_len, + const EC_KEY *eckey) { + return ecdsa_sign_fixed(digest, digest_len, sig, out_sig_len, max_sig_len, + eckey); +} + +int ECDSA_verify_p1363(const uint8_t *digest, size_t digest_len, + const uint8_t *sig, size_t sig_len, + const EC_KEY *eckey) { + return ecdsa_verify_fixed(digest, digest_len, sig, sig_len, eckey); +} + +size_t ECDSA_size_p1363(const EC_KEY *key) { + if (key == nullptr) { + return 0; + } + + const EC_GROUP *group = EC_KEY_get0_group(key); + if (group == nullptr) { + return 0; + } + + size_t group_order_size = BN_num_bytes(EC_GROUP_get0_order(group)); + return 2 * group_order_size; +} diff --git a/third_party/boringssl/src/crypto/ecdsa_extra/ecdsa_asn1.c b/third_party/boringssl/src/crypto/ecdsa_extra/ecdsa_asn1.c deleted file mode 100644 index e6212cc3..00000000 --- a/third_party/boringssl/src/crypto/ecdsa_extra/ecdsa_asn1.c +++ /dev/null @@ -1,267 +0,0 @@ -/* ==================================================================== - * Copyright (c) 1998-2005 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. 
Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@OpenSSL.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). */ - -#include - -#include -#include - -#include -#include -#include -#include -#include - -#include "../bytestring/internal.h" -#include "../fipsmodule/ec/internal.h" -#include "../internal.h" - - -int ECDSA_sign(int type, const uint8_t *digest, size_t digest_len, uint8_t *sig, - unsigned int *sig_len, const EC_KEY *eckey) { - if (eckey->ecdsa_meth && eckey->ecdsa_meth->sign) { - return eckey->ecdsa_meth->sign(digest, digest_len, sig, sig_len, - (EC_KEY*) eckey /* cast away const */); - } - - int ret = 0; - ECDSA_SIG *s = ECDSA_do_sign(digest, digest_len, eckey); - if (s == NULL) { - *sig_len = 0; - goto err; - } - - CBB cbb; - CBB_zero(&cbb); - size_t len; - if (!CBB_init_fixed(&cbb, sig, ECDSA_size(eckey)) || - !ECDSA_SIG_marshal(&cbb, s) || - !CBB_finish(&cbb, NULL, &len)) { - OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_ENCODE_ERROR); - CBB_cleanup(&cbb); - *sig_len = 0; - goto err; - } - *sig_len = (unsigned)len; - ret = 1; - -err: - ECDSA_SIG_free(s); - return ret; -} - -int ECDSA_verify(int type, const uint8_t *digest, size_t digest_len, - const uint8_t *sig, size_t sig_len, const EC_KEY *eckey) { - ECDSA_SIG *s; - int ret = 0; - uint8_t *der = NULL; - - // Decode the ECDSA 
signature. - s = ECDSA_SIG_from_bytes(sig, sig_len); - if (s == NULL) { - goto err; - } - - // Defend against potential laxness in the DER parser. - size_t der_len; - if (!ECDSA_SIG_to_bytes(&der, &der_len, s) || - der_len != sig_len || OPENSSL_memcmp(sig, der, sig_len) != 0) { - // This should never happen. crypto/bytestring is strictly DER. - OPENSSL_PUT_ERROR(ECDSA, ERR_R_INTERNAL_ERROR); - goto err; - } - - ret = ECDSA_do_verify(digest, digest_len, s, eckey); - -err: - OPENSSL_free(der); - ECDSA_SIG_free(s); - return ret; -} - - -size_t ECDSA_size(const EC_KEY *key) { - if (key == NULL) { - return 0; - } - - size_t group_order_size; - if (key->ecdsa_meth && key->ecdsa_meth->group_order_size) { - group_order_size = key->ecdsa_meth->group_order_size(key); - } else { - const EC_GROUP *group = EC_KEY_get0_group(key); - if (group == NULL) { - return 0; - } - - group_order_size = BN_num_bytes(EC_GROUP_get0_order(group)); - } - - return ECDSA_SIG_max_len(group_order_size); -} - -ECDSA_SIG *ECDSA_SIG_parse(CBS *cbs) { - ECDSA_SIG *ret = ECDSA_SIG_new(); - if (ret == NULL) { - return NULL; - } - CBS child; - if (!CBS_get_asn1(cbs, &child, CBS_ASN1_SEQUENCE) || - !BN_parse_asn1_unsigned(&child, ret->r) || - !BN_parse_asn1_unsigned(&child, ret->s) || - CBS_len(&child) != 0) { - OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_BAD_SIGNATURE); - ECDSA_SIG_free(ret); - return NULL; - } - return ret; -} - -ECDSA_SIG *ECDSA_SIG_from_bytes(const uint8_t *in, size_t in_len) { - CBS cbs; - CBS_init(&cbs, in, in_len); - ECDSA_SIG *ret = ECDSA_SIG_parse(&cbs); - if (ret == NULL || CBS_len(&cbs) != 0) { - OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_BAD_SIGNATURE); - ECDSA_SIG_free(ret); - return NULL; - } - return ret; -} - -int ECDSA_SIG_marshal(CBB *cbb, const ECDSA_SIG *sig) { - CBB child; - if (!CBB_add_asn1(cbb, &child, CBS_ASN1_SEQUENCE) || - !BN_marshal_asn1(&child, sig->r) || - !BN_marshal_asn1(&child, sig->s) || - !CBB_flush(cbb)) { - OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_ENCODE_ERROR); - return 0; - } - 
return 1; -} - -int ECDSA_SIG_to_bytes(uint8_t **out_bytes, size_t *out_len, - const ECDSA_SIG *sig) { - CBB cbb; - CBB_zero(&cbb); - if (!CBB_init(&cbb, 0) || - !ECDSA_SIG_marshal(&cbb, sig) || - !CBB_finish(&cbb, out_bytes, out_len)) { - OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_ENCODE_ERROR); - CBB_cleanup(&cbb); - return 0; - } - return 1; -} - -// der_len_len returns the number of bytes needed to represent a length of |len| -// in DER. -static size_t der_len_len(size_t len) { - if (len < 0x80) { - return 1; - } - size_t ret = 1; - while (len > 0) { - ret++; - len >>= 8; - } - return ret; -} - -size_t ECDSA_SIG_max_len(size_t order_len) { - // Compute the maximum length of an |order_len| byte integer. Defensively - // assume that the leading 0x00 is included. - size_t integer_len = 1 /* tag */ + der_len_len(order_len + 1) + 1 + order_len; - if (integer_len < order_len) { - return 0; - } - // An ECDSA signature is two INTEGERs. - size_t value_len = 2 * integer_len; - if (value_len < integer_len) { - return 0; - } - // Add the header. 
- size_t ret = 1 /* tag */ + der_len_len(value_len) + value_len; - if (ret < value_len) { - return 0; - } - return ret; -} - -ECDSA_SIG *d2i_ECDSA_SIG(ECDSA_SIG **out, const uint8_t **inp, long len) { - if (len < 0) { - return NULL; - } - CBS cbs; - CBS_init(&cbs, *inp, (size_t)len); - ECDSA_SIG *ret = ECDSA_SIG_parse(&cbs); - if (ret == NULL) { - return NULL; - } - if (out != NULL) { - ECDSA_SIG_free(*out); - *out = ret; - } - *inp = CBS_data(&cbs); - return ret; -} - -int i2d_ECDSA_SIG(const ECDSA_SIG *sig, uint8_t **outp) { - CBB cbb; - if (!CBB_init(&cbb, 0) || - !ECDSA_SIG_marshal(&cbb, sig)) { - CBB_cleanup(&cbb); - return -1; - } - return CBB_finish_i2d(&cbb, outp); -} diff --git a/third_party/boringssl/src/crypto/engine/engine.c b/third_party/boringssl/src/crypto/engine/engine.c deleted file mode 100644 index 973a57c8..00000000 --- a/third_party/boringssl/src/crypto/engine/engine.c +++ /dev/null @@ -1,99 +0,0 @@ -/* Copyright (c) 2014, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ - -#include - -#include -#include - -#include -#include -#include -#include -#include - -#include "../internal.h" - - -struct engine_st { - RSA_METHOD *rsa_method; - ECDSA_METHOD *ecdsa_method; -}; - -ENGINE *ENGINE_new(void) { - ENGINE *engine = OPENSSL_malloc(sizeof(ENGINE)); - if (engine == NULL) { - return NULL; - } - - OPENSSL_memset(engine, 0, sizeof(ENGINE)); - return engine; -} - -int ENGINE_free(ENGINE *engine) { - // Methods are currently required to be static so are not unref'ed. - OPENSSL_free(engine); - return 1; -} - -// set_method takes a pointer to a method and its given size and sets -// |*out_member| to point to it. This function might want to be extended in the -// future to support making a copy of the method so that a stable ABI for -// ENGINEs can be supported. But, for the moment, all *_METHODS must be -// static. -static int set_method(void **out_member, const void *method, size_t method_size, - size_t compiled_size) { - const struct openssl_method_common_st *common = method; - if (method_size != compiled_size || !common->is_static) { - return 0; - } - - *out_member = (void*) method; - return 1; -} - -int ENGINE_set_RSA_method(ENGINE *engine, const RSA_METHOD *method, - size_t method_size) { - return set_method((void **)&engine->rsa_method, method, method_size, - sizeof(RSA_METHOD)); -} - -RSA_METHOD *ENGINE_get_RSA_method(const ENGINE *engine) { - return engine->rsa_method; -} - -int ENGINE_set_ECDSA_method(ENGINE *engine, const ECDSA_METHOD *method, - size_t method_size) { - return set_method((void **)&engine->ecdsa_method, method, method_size, - sizeof(ECDSA_METHOD)); -} - -ECDSA_METHOD *ENGINE_get_ECDSA_method(const ENGINE *engine) { - return engine->ecdsa_method; -} - -void METHOD_ref(void *method_in) { - assert(((struct openssl_method_common_st*) method_in)->is_static); -} - -void METHOD_unref(void *method_in) { - struct openssl_method_common_st *method = method_in; - - if (method == NULL) { - return; - } - 
assert(method->is_static); -} - -OPENSSL_DECLARE_ERROR_REASON(ENGINE, OPERATION_NOT_SUPPORTED) diff --git a/third_party/boringssl/src/crypto/engine/engine.cc b/third_party/boringssl/src/crypto/engine/engine.cc new file mode 100644 index 00000000..4d39cd83 --- /dev/null +++ b/third_party/boringssl/src/crypto/engine/engine.cc @@ -0,0 +1,95 @@ +// Copyright 2014 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + +#include +#include +#include +#include + +#include "../internal.h" +#include "../mem_internal.h" + + +using namespace bssl; + +struct engine_st { + RSA_METHOD *rsa_method; + ECDSA_METHOD *ecdsa_method; +}; + +ENGINE *ENGINE_new() { return New(); } + +int ENGINE_free(ENGINE *engine) { + // Methods are currently required to be static so are not unref'ed. + Delete(engine); + return 1; +} + +// set_method takes a pointer to a method and its given size and sets +// |*out_member| to point to it. This function might want to be extended in the +// future to support making a copy of the method so that a stable ABI for +// ENGINEs can be supported. But, for the moment, all *_METHODS must be +// static. 
+static int set_method(void **out_member, const void *method, size_t method_size, + size_t compiled_size) { + const struct openssl_method_common_st *common = + reinterpret_cast(method); + if (method_size != compiled_size || !common->is_static) { + return 0; + } + + *out_member = (void *)method; + return 1; +} + +int ENGINE_set_RSA_method(ENGINE *engine, const RSA_METHOD *method, + size_t method_size) { + return set_method((void **)&engine->rsa_method, method, method_size, + sizeof(RSA_METHOD)); +} + +RSA_METHOD *ENGINE_get_RSA_method(const ENGINE *engine) { + return engine->rsa_method; +} + +int ENGINE_set_ECDSA_method(ENGINE *engine, const ECDSA_METHOD *method, + size_t method_size) { + return set_method((void **)&engine->ecdsa_method, method, method_size, + sizeof(ECDSA_METHOD)); +} + +ECDSA_METHOD *ENGINE_get_ECDSA_method(const ENGINE *engine) { + return engine->ecdsa_method; +} + +void METHOD_ref(void *method_in) { + assert(((struct openssl_method_common_st *)method_in)->is_static); +} + +void METHOD_unref(void *method_in) { + struct openssl_method_common_st *method = + reinterpret_cast(method_in); + + if (method == nullptr) { + return; + } + assert(method->is_static); +} + +OPENSSL_DECLARE_ERROR_REASON(ENGINE, OPERATION_NOT_SUPPORTED) diff --git a/third_party/boringssl/src/crypto/err/err.c b/third_party/boringssl/src/crypto/err/err.c deleted file mode 100644 index 9b6d2381..00000000 --- a/third_party/boringssl/src/crypto/err/err.c +++ /dev/null @@ -1,873 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. 
The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. 
If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] - */ -/* ==================================================================== - * Copyright (c) 1998-2006 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. 
All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). 
*/ - -#include - -#include -#include -#include -#include - -#if defined(OPENSSL_WINDOWS) -OPENSSL_MSVC_PRAGMA(warning(push, 3)) -#include -OPENSSL_MSVC_PRAGMA(warning(pop)) -#endif - -#include -#include - -#include "../internal.h" -#include "./internal.h" - - -struct err_error_st { - // file contains the filename where the error occurred. - const char *file; - // data contains a NUL-terminated string with optional data. It must be freed - // with |OPENSSL_free|. - char *data; - // packed contains the error library and reason, as packed by ERR_PACK. - uint32_t packed; - // line contains the line number where the error occurred. - uint16_t line; - // mark indicates a reversion point in the queue. See |ERR_pop_to_mark|. - unsigned mark : 1; -}; - -// ERR_STATE contains the per-thread, error queue. -typedef struct err_state_st { - // errors contains the ERR_NUM_ERRORS most recent errors, organised as a ring - // buffer. - struct err_error_st errors[ERR_NUM_ERRORS]; - // top contains the index one past the most recent error. If |top| equals - // |bottom| then the queue is empty. - unsigned top; - // bottom contains the index of the last error in the queue. - unsigned bottom; - - // to_free, if not NULL, contains a pointer owned by this structure that was - // previously a |data| pointer of one of the elements of |errors|. - void *to_free; -} ERR_STATE; - -extern const uint32_t kOpenSSLReasonValues[]; -extern const size_t kOpenSSLReasonValuesLen; -extern const char kOpenSSLReasonStringData[]; - -// err_clear clears the given queued error. 
-static void err_clear(struct err_error_st *error) { - OPENSSL_free(error->data); - OPENSSL_memset(error, 0, sizeof(struct err_error_st)); -} - -static void err_copy(struct err_error_st *dst, const struct err_error_st *src) { - err_clear(dst); - dst->file = src->file; - if (src->data != NULL) { - dst->data = OPENSSL_strdup(src->data); - } - dst->packed = src->packed; - dst->line = src->line; -} - -// global_next_library contains the next custom library value to return. -static int global_next_library = ERR_NUM_LIBS; - -// global_next_library_mutex protects |global_next_library| from concurrent -// updates. -static struct CRYPTO_STATIC_MUTEX global_next_library_mutex = - CRYPTO_STATIC_MUTEX_INIT; - -static void err_state_free(void *statep) { - ERR_STATE *state = statep; - - if (state == NULL) { - return; - } - - for (unsigned i = 0; i < ERR_NUM_ERRORS; i++) { - err_clear(&state->errors[i]); - } - OPENSSL_free(state->to_free); - OPENSSL_free(state); -} - -// err_get_state gets the ERR_STATE object for the current thread. 
-static ERR_STATE *err_get_state(void) { - ERR_STATE *state = CRYPTO_get_thread_local(OPENSSL_THREAD_LOCAL_ERR); - if (state == NULL) { - state = OPENSSL_malloc(sizeof(ERR_STATE)); - if (state == NULL) { - return NULL; - } - OPENSSL_memset(state, 0, sizeof(ERR_STATE)); - if (!CRYPTO_set_thread_local(OPENSSL_THREAD_LOCAL_ERR, state, - err_state_free)) { - return NULL; - } - } - - return state; -} - -static uint32_t get_error_values(int inc, int top, const char **file, int *line, - const char **data, int *flags) { - unsigned i = 0; - ERR_STATE *state; - struct err_error_st *error; - uint32_t ret; - - state = err_get_state(); - if (state == NULL || state->bottom == state->top) { - return 0; - } - - if (top) { - assert(!inc); - // last error - i = state->top; - } else { - i = (state->bottom + 1) % ERR_NUM_ERRORS; - } - - error = &state->errors[i]; - ret = error->packed; - - if (file != NULL && line != NULL) { - if (error->file == NULL) { - *file = "NA"; - *line = 0; - } else { - *file = error->file; - *line = error->line; - } - } - - if (data != NULL) { - if (error->data == NULL) { - *data = ""; - if (flags != NULL) { - *flags = 0; - } - } else { - *data = error->data; - if (flags != NULL) { - *flags = ERR_FLAG_STRING; - } - // If this error is being removed, take ownership of data from - // the error. The semantics are such that the caller doesn't - // take ownership either. Instead the error system takes - // ownership and retains it until the next call that affects the - // error queue. 
- if (inc) { - if (error->data != NULL) { - OPENSSL_free(state->to_free); - state->to_free = error->data; - } - error->data = NULL; - } - } - } - - if (inc) { - assert(!top); - err_clear(error); - state->bottom = i; - } - - return ret; -} - -uint32_t ERR_get_error(void) { - return get_error_values(1 /* inc */, 0 /* bottom */, NULL, NULL, NULL, NULL); -} - -uint32_t ERR_get_error_line(const char **file, int *line) { - return get_error_values(1 /* inc */, 0 /* bottom */, file, line, NULL, NULL); -} - -uint32_t ERR_get_error_line_data(const char **file, int *line, - const char **data, int *flags) { - return get_error_values(1 /* inc */, 0 /* bottom */, file, line, data, flags); -} - -uint32_t ERR_peek_error(void) { - return get_error_values(0 /* peek */, 0 /* bottom */, NULL, NULL, NULL, NULL); -} - -uint32_t ERR_peek_error_line(const char **file, int *line) { - return get_error_values(0 /* peek */, 0 /* bottom */, file, line, NULL, NULL); -} - -uint32_t ERR_peek_error_line_data(const char **file, int *line, - const char **data, int *flags) { - return get_error_values(0 /* peek */, 0 /* bottom */, file, line, data, - flags); -} - -uint32_t ERR_peek_last_error(void) { - return get_error_values(0 /* peek */, 1 /* top */, NULL, NULL, NULL, NULL); -} - -uint32_t ERR_peek_last_error_line(const char **file, int *line) { - return get_error_values(0 /* peek */, 1 /* top */, file, line, NULL, NULL); -} - -uint32_t ERR_peek_last_error_line_data(const char **file, int *line, - const char **data, int *flags) { - return get_error_values(0 /* peek */, 1 /* top */, file, line, data, flags); -} - -void ERR_clear_error(void) { - ERR_STATE *const state = err_get_state(); - unsigned i; - - if (state == NULL) { - return; - } - - for (i = 0; i < ERR_NUM_ERRORS; i++) { - err_clear(&state->errors[i]); - } - OPENSSL_free(state->to_free); - state->to_free = NULL; - - state->top = state->bottom = 0; -} - -void ERR_remove_thread_state(const CRYPTO_THREADID *tid) { - if (tid != NULL) { - 
assert(0); - return; - } - - ERR_clear_error(); -} - -int ERR_get_next_error_library(void) { - int ret; - - CRYPTO_STATIC_MUTEX_lock_write(&global_next_library_mutex); - ret = global_next_library++; - CRYPTO_STATIC_MUTEX_unlock_write(&global_next_library_mutex); - - return ret; -} - -void ERR_remove_state(unsigned long pid) { - ERR_clear_error(); -} - -void ERR_clear_system_error(void) { - errno = 0; -} - -// err_string_cmp is a compare function for searching error values with -// |bsearch| in |err_string_lookup|. -static int err_string_cmp(const void *a, const void *b) { - const uint32_t a_key = *((const uint32_t*) a) >> 15; - const uint32_t b_key = *((const uint32_t*) b) >> 15; - - if (a_key < b_key) { - return -1; - } else if (a_key > b_key) { - return 1; - } else { - return 0; - } -} - -// err_string_lookup looks up the string associated with |lib| and |key| in -// |values| and |string_data|. It returns the string or NULL if not found. -static const char *err_string_lookup(uint32_t lib, uint32_t key, - const uint32_t *values, - size_t num_values, - const char *string_data) { - // |values| points to data in err_data.h, which is generated by - // err_data_generate.go. It's an array of uint32_t values. Each value has the - // following structure: - // | lib | key | offset | - // |6 bits| 11 bits | 15 bits | - // - // The |lib| value is a library identifier: one of the |ERR_LIB_*| values. - // The |key| is a reason code, depending on the context. - // The |offset| is the number of bytes from the start of |string_data| where - // the (NUL terminated) string for this value can be found. - // - // Values are sorted based on treating the |lib| and |key| part as an - // unsigned integer. 
- if (lib >= (1 << 6) || key >= (1 << 11)) { - return NULL; - } - uint32_t search_key = lib << 26 | key << 15; - const uint32_t *result = bsearch(&search_key, values, num_values, - sizeof(uint32_t), err_string_cmp); - if (result == NULL) { - return NULL; - } - - return &string_data[(*result) & 0x7fff]; -} - -static const char *const kLibraryNames[ERR_NUM_LIBS] = { - "invalid library (0)", - "unknown library", // ERR_LIB_NONE - "system library", // ERR_LIB_SYS - "bignum routines", // ERR_LIB_BN - "RSA routines", // ERR_LIB_RSA - "Diffie-Hellman routines", // ERR_LIB_DH - "public key routines", // ERR_LIB_EVP - "memory buffer routines", // ERR_LIB_BUF - "object identifier routines", // ERR_LIB_OBJ - "PEM routines", // ERR_LIB_PEM - "DSA routines", // ERR_LIB_DSA - "X.509 certificate routines", // ERR_LIB_X509 - "ASN.1 encoding routines", // ERR_LIB_ASN1 - "configuration file routines", // ERR_LIB_CONF - "common libcrypto routines", // ERR_LIB_CRYPTO - "elliptic curve routines", // ERR_LIB_EC - "SSL routines", // ERR_LIB_SSL - "BIO routines", // ERR_LIB_BIO - "PKCS7 routines", // ERR_LIB_PKCS7 - "PKCS8 routines", // ERR_LIB_PKCS8 - "X509 V3 routines", // ERR_LIB_X509V3 - "random number generator", // ERR_LIB_RAND - "ENGINE routines", // ERR_LIB_ENGINE - "OCSP routines", // ERR_LIB_OCSP - "UI routines", // ERR_LIB_UI - "COMP routines", // ERR_LIB_COMP - "ECDSA routines", // ERR_LIB_ECDSA - "ECDH routines", // ERR_LIB_ECDH - "HMAC routines", // ERR_LIB_HMAC - "Digest functions", // ERR_LIB_DIGEST - "Cipher functions", // ERR_LIB_CIPHER - "HKDF functions", // ERR_LIB_HKDF - "Trust Token functions", // ERR_LIB_TRUST_TOKEN - "User defined functions", // ERR_LIB_USER -}; - -static const char *err_lib_error_string(uint32_t packed_error) { - const uint32_t lib = ERR_GET_LIB(packed_error); - - if (lib >= ERR_NUM_LIBS) { - return NULL; - } - return kLibraryNames[lib]; -} - -const char *ERR_lib_error_string(uint32_t packed_error) { - const char *ret = 
err_lib_error_string(packed_error); - return ret == NULL ? "unknown library" : ret; -} - -const char *ERR_func_error_string(uint32_t packed_error) { - return "OPENSSL_internal"; -} - -static const char *err_reason_error_string(uint32_t packed_error) { - const uint32_t lib = ERR_GET_LIB(packed_error); - const uint32_t reason = ERR_GET_REASON(packed_error); - - if (lib == ERR_LIB_SYS) { - if (reason < 127) { - return strerror(reason); - } - return NULL; - } - - if (reason < ERR_NUM_LIBS) { - return kLibraryNames[reason]; - } - - if (reason < 100) { - switch (reason) { - case ERR_R_MALLOC_FAILURE: - return "malloc failure"; - case ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED: - return "function should not have been called"; - case ERR_R_PASSED_NULL_PARAMETER: - return "passed a null parameter"; - case ERR_R_INTERNAL_ERROR: - return "internal error"; - case ERR_R_OVERFLOW: - return "overflow"; - default: - return NULL; - } - } - - return err_string_lookup(lib, reason, kOpenSSLReasonValues, - kOpenSSLReasonValuesLen, kOpenSSLReasonStringData); -} - -const char *ERR_reason_error_string(uint32_t packed_error) { - const char *ret = err_reason_error_string(packed_error); - return ret == NULL ? "unknown error" : ret; -} - -char *ERR_error_string(uint32_t packed_error, char *ret) { - static char buf[ERR_ERROR_STRING_BUF_LEN]; - - if (ret == NULL) { - // TODO(fork): remove this. - ret = buf; - } - -#if !defined(NDEBUG) - // This is aimed to help catch callers who don't provide - // |ERR_ERROR_STRING_BUF_LEN| bytes of space. 
- OPENSSL_memset(ret, 0, ERR_ERROR_STRING_BUF_LEN); -#endif - - return ERR_error_string_n(packed_error, ret, ERR_ERROR_STRING_BUF_LEN); -} - -char *ERR_error_string_n(uint32_t packed_error, char *buf, size_t len) { - if (len == 0) { - return NULL; - } - - unsigned lib = ERR_GET_LIB(packed_error); - unsigned reason = ERR_GET_REASON(packed_error); - - const char *lib_str = err_lib_error_string(packed_error); - const char *reason_str = err_reason_error_string(packed_error); - - char lib_buf[64], reason_buf[64]; - if (lib_str == NULL) { - BIO_snprintf(lib_buf, sizeof(lib_buf), "lib(%u)", lib); - lib_str = lib_buf; - } - - if (reason_str == NULL) { - BIO_snprintf(reason_buf, sizeof(reason_buf), "reason(%u)", reason); - reason_str = reason_buf; - } - - BIO_snprintf(buf, len, "error:%08" PRIx32 ":%s:OPENSSL_internal:%s", - packed_error, lib_str, reason_str); - - if (strlen(buf) == len - 1) { - // output may be truncated; make sure we always have 5 colon-separated - // fields, i.e. 4 colons. - static const unsigned num_colons = 4; - unsigned i; - char *s = buf; - - if (len <= num_colons) { - // In this situation it's not possible to ensure that the correct number - // of colons are included in the output. - return buf; - } - - for (i = 0; i < num_colons; i++) { - char *colon = strchr(s, ':'); - char *last_pos = &buf[len - 1] - num_colons + i; - - if (colon == NULL || colon > last_pos) { - // set colon |i| at last possible position (buf[len-1] is the - // terminating 0). If we're setting this colon, then all whole of the - // rest of the string must be colons in order to have the correct - // number. 
- OPENSSL_memset(last_pos, ':', num_colons - i); - break; - } - - s = colon + 1; - } - } - - return buf; -} - -void ERR_print_errors_cb(ERR_print_errors_callback_t callback, void *ctx) { - char buf[ERR_ERROR_STRING_BUF_LEN]; - char buf2[1024]; - const char *file, *data; - int line, flags; - uint32_t packed_error; - - // thread_hash is the least-significant bits of the |ERR_STATE| pointer value - // for this thread. - const unsigned long thread_hash = (uintptr_t) err_get_state(); - - for (;;) { - packed_error = ERR_get_error_line_data(&file, &line, &data, &flags); - if (packed_error == 0) { - break; - } - - ERR_error_string_n(packed_error, buf, sizeof(buf)); - BIO_snprintf(buf2, sizeof(buf2), "%lu:%s:%s:%d:%s\n", thread_hash, buf, - file, line, (flags & ERR_FLAG_STRING) ? data : ""); - if (callback(buf2, strlen(buf2), ctx) <= 0) { - break; - } - } -} - -static int print_errors_to_file(const char* msg, size_t msg_len, void* ctx) { - assert(msg[msg_len] == '\0'); - FILE* fp = ctx; - int res = fputs(msg, fp); - return res < 0 ? 0 : 1; -} - -void ERR_print_errors_fp(FILE *file) { - ERR_print_errors_cb(print_errors_to_file, file); -} - -// err_set_error_data sets the data on the most recent error. 
-static void err_set_error_data(char *data) { - ERR_STATE *const state = err_get_state(); - struct err_error_st *error; - - if (state == NULL || state->top == state->bottom) { - OPENSSL_free(data); - return; - } - - error = &state->errors[state->top]; - - OPENSSL_free(error->data); - error->data = data; -} - -void ERR_put_error(int library, int unused, int reason, const char *file, - unsigned line) { - ERR_STATE *const state = err_get_state(); - struct err_error_st *error; - - if (state == NULL) { - return; - } - - if (library == ERR_LIB_SYS && reason == 0) { -#if defined(OPENSSL_WINDOWS) - reason = GetLastError(); -#else - reason = errno; -#endif - } - - state->top = (state->top + 1) % ERR_NUM_ERRORS; - if (state->top == state->bottom) { - state->bottom = (state->bottom + 1) % ERR_NUM_ERRORS; - } - - error = &state->errors[state->top]; - err_clear(error); - error->file = file; - error->line = line; - error->packed = ERR_PACK(library, reason); -} - -// ERR_add_error_data_vdata takes a variable number of const char* pointers, -// concatenates them and sets the result as the data on the most recent -// error. -static void err_add_error_vdata(unsigned num, va_list args) { - size_t alloced, new_len, len = 0, substr_len; - char *buf; - const char *substr; - unsigned i; - - alloced = 80; - buf = OPENSSL_malloc(alloced + 1); - if (buf == NULL) { - return; - } - - for (i = 0; i < num; i++) { - substr = va_arg(args, const char *); - if (substr == NULL) { - continue; - } - - substr_len = strlen(substr); - new_len = len + substr_len; - if (new_len > alloced) { - char *new_buf; - - if (alloced + 20 + 1 < alloced) { - // overflow. 
- OPENSSL_free(buf); - return; - } - - alloced = new_len + 20; - new_buf = OPENSSL_realloc(buf, alloced + 1); - if (new_buf == NULL) { - OPENSSL_free(buf); - return; - } - buf = new_buf; - } - - OPENSSL_memcpy(buf + len, substr, substr_len); - len = new_len; - } - - buf[len] = 0; - err_set_error_data(buf); -} - -void ERR_add_error_data(unsigned count, ...) { - va_list args; - va_start(args, count); - err_add_error_vdata(count, args); - va_end(args); -} - -void ERR_add_error_dataf(const char *format, ...) { - va_list ap; - char *buf; - static const unsigned buf_len = 256; - - // A fixed-size buffer is used because va_copy (which would be needed in - // order to call vsnprintf twice and measure the buffer) wasn't defined until - // C99. - buf = OPENSSL_malloc(buf_len + 1); - if (buf == NULL) { - return; - } - - va_start(ap, format); - BIO_vsnprintf(buf, buf_len, format, ap); - buf[buf_len] = 0; - va_end(ap); - - err_set_error_data(buf); -} - -void ERR_set_error_data(char *data, int flags) { - if (!(flags & ERR_FLAG_STRING)) { - // We do not support non-string error data. 
- assert(0); - return; - } - if (flags & ERR_FLAG_MALLOCED) { - err_set_error_data(data); - } else { - char *copy = OPENSSL_strdup(data); - if (copy != NULL) { - err_set_error_data(copy); - } - } -} - -int ERR_set_mark(void) { - ERR_STATE *const state = err_get_state(); - - if (state == NULL || state->bottom == state->top) { - return 0; - } - state->errors[state->top].mark = 1; - return 1; -} - -int ERR_pop_to_mark(void) { - ERR_STATE *const state = err_get_state(); - - if (state == NULL) { - return 0; - } - - while (state->bottom != state->top) { - struct err_error_st *error = &state->errors[state->top]; - - if (error->mark) { - error->mark = 0; - return 1; - } - - err_clear(error); - if (state->top == 0) { - state->top = ERR_NUM_ERRORS - 1; - } else { - state->top--; - } - } - - return 0; -} - -void ERR_load_crypto_strings(void) {} - -void ERR_free_strings(void) {} - -void ERR_load_BIO_strings(void) {} - -void ERR_load_ERR_strings(void) {} - -void ERR_load_RAND_strings(void) {} - -struct err_save_state_st { - struct err_error_st *errors; - size_t num_errors; -}; - -void ERR_SAVE_STATE_free(ERR_SAVE_STATE *state) { - if (state == NULL) { - return; - } - for (size_t i = 0; i < state->num_errors; i++) { - err_clear(&state->errors[i]); - } - OPENSSL_free(state->errors); - OPENSSL_free(state); -} - -ERR_SAVE_STATE *ERR_save_state(void) { - ERR_STATE *const state = err_get_state(); - if (state == NULL || state->top == state->bottom) { - return NULL; - } - - ERR_SAVE_STATE *ret = OPENSSL_malloc(sizeof(ERR_SAVE_STATE)); - if (ret == NULL) { - return NULL; - } - - // Errors are stored in the range (bottom, top]. - size_t num_errors = state->top >= state->bottom - ? 
state->top - state->bottom - : ERR_NUM_ERRORS + state->top - state->bottom; - assert(num_errors < ERR_NUM_ERRORS); - ret->errors = OPENSSL_malloc(num_errors * sizeof(struct err_error_st)); - if (ret->errors == NULL) { - OPENSSL_free(ret); - return NULL; - } - OPENSSL_memset(ret->errors, 0, num_errors * sizeof(struct err_error_st)); - ret->num_errors = num_errors; - - for (size_t i = 0; i < num_errors; i++) { - size_t j = (state->bottom + i + 1) % ERR_NUM_ERRORS; - err_copy(&ret->errors[i], &state->errors[j]); - } - return ret; -} - -void ERR_restore_state(const ERR_SAVE_STATE *state) { - if (state == NULL || state->num_errors == 0) { - ERR_clear_error(); - return; - } - - ERR_STATE *const dst = err_get_state(); - if (dst == NULL) { - return; - } - - for (size_t i = 0; i < state->num_errors; i++) { - err_copy(&dst->errors[i], &state->errors[i]); - } - dst->top = state->num_errors - 1; - dst->bottom = ERR_NUM_ERRORS - 1; -} diff --git a/third_party/boringssl/src/crypto/err/err.cc b/third_party/boringssl/src/crypto/err/err.cc new file mode 100644 index 00000000..b3fe3164 --- /dev/null +++ b/third_party/boringssl/src/crypto/err/err.cc @@ -0,0 +1,818 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Ensure we can't call OPENSSL_malloc circularly. 
+#define _BORINGSSL_PROHIBIT_OPENSSL_MALLOC +#include + +#include +#include +#include +#include +#include +#include + +#if defined(OPENSSL_WINDOWS) +#include +#endif + +#include + +#include "../internal.h" +#include "./internal.h" + + +using namespace bssl; + +namespace { +struct err_error_st { + // file contains the filename where the error occurred. + const char *file; + // data contains a NUL-terminated string with optional data. It is allocated + // with system |malloc| and must be freed with |free| (not |OPENSSL_free|) + char *data; + // packed contains the error library and reason, as packed by ERR_PACK. + uint32_t packed; + // line contains the line number where the error occurred. + uint16_t line; + // mark indicates a reversion point in the queue. See |ERR_pop_to_mark|. + unsigned mark : 1; +}; + +// ERR_STATE contains the per-thread, error queue. +typedef struct err_state_st { + // errors contains up to ERR_NUM_ERRORS - 1 most recent errors, organised as a + // ring buffer. + struct err_error_st errors[ERR_NUM_ERRORS]; + // top contains the index of the most recent error. If |top| equals |bottom| + // then the queue is empty. + unsigned top; + // bottom contains the index before the least recent error in the queue. + unsigned bottom; + + // to_free, if not NULL, contains a pointer owned by this structure that was + // previously a |data| pointer of one of the elements of |errors|. + void *to_free; +} ERR_STATE; +} // namespace + +BSSL_NAMESPACE_BEGIN + +extern const uint32_t kOpenSSLReasonValues[]; +extern const size_t kOpenSSLReasonValuesLen; +extern const char kOpenSSLReasonStringData[]; + +BSSL_NAMESPACE_END + +static char *strdup_libc_malloc(const char *str) { + // |strdup| is not in C until C23, so MSVC triggers deprecation warnings, and + // glibc and musl gate it on a feature macro. Reimplementing it is easier. 
+ size_t len = strlen(str); + char *ret = reinterpret_cast(malloc(len + 1)); + if (ret != nullptr) { + memcpy(ret, str, len + 1); + } + return ret; +} + +// err_clear clears the given queued error. +static void err_clear(struct err_error_st *error) { + free(error->data); + OPENSSL_memset(error, 0, sizeof(struct err_error_st)); +} + +static void err_copy(struct err_error_st *dst, const struct err_error_st *src) { + err_clear(dst); + dst->file = src->file; + if (src->data != nullptr) { + // We can't use OPENSSL_strdup because we don't want to call OPENSSL_malloc, + // which can affect the error stack. + dst->data = strdup_libc_malloc(src->data); + } + dst->packed = src->packed; + dst->line = src->line; +} + + +// global_next_library contains the next custom library value to return. +static int global_next_library = ERR_NUM_LIBS; + +// global_next_library_mutex protects |global_next_library| from concurrent +// updates. +static StaticMutex global_next_library_mutex; + +static void err_state_free(void *statep) { + ERR_STATE *state = reinterpret_cast(statep); + + if (state == nullptr) { + return; + } + + for (unsigned i = 0; i < ERR_NUM_ERRORS; i++) { + err_clear(&state->errors[i]); + } + free(state->to_free); + free(state); +} + +// err_get_state gets the ERR_STATE object for the current thread. 
+static ERR_STATE *err_get_state() { + ERR_STATE *state = reinterpret_cast( + CRYPTO_get_thread_local(OPENSSL_THREAD_LOCAL_ERR)); + if (state == nullptr) { + state = reinterpret_cast(malloc(sizeof(ERR_STATE))); + if (state == nullptr) { + return nullptr; + } + OPENSSL_memset(state, 0, sizeof(ERR_STATE)); + if (!CRYPTO_set_thread_local(OPENSSL_THREAD_LOCAL_ERR, state, + err_state_free)) { + return nullptr; + } + } + + return state; +} + +static uint32_t get_error_values(int inc, int top, const char **file, int *line, + const char **data, int *flags) { + unsigned i = 0; + ERR_STATE *state; + struct err_error_st *error; + uint32_t ret; + + state = err_get_state(); + if (state == nullptr || state->bottom == state->top) { + return 0; + } + + if (top) { + assert(!inc); + // last error + i = state->top; + } else { + i = (state->bottom + 1) % ERR_NUM_ERRORS; + } + + error = &state->errors[i]; + ret = error->packed; + + if (file != nullptr && line != nullptr) { + if (error->file == nullptr) { + *file = "NA"; + *line = 0; + } else { + *file = error->file; + *line = error->line; + } + } + + if (data != nullptr) { + if (error->data == nullptr) { + *data = ""; + if (flags != nullptr) { + *flags = 0; + } + } else { + *data = error->data; + if (flags != nullptr) { + // Without |ERR_FLAG_MALLOCED|, rust-openssl assumes the string has a + // static lifetime. In both cases, we retain ownership of the string, + // and the caller is not expected to free it. + *flags = ERR_FLAG_STRING | ERR_FLAG_MALLOCED; + } + // If this error is being removed, take ownership of data from + // the error. The semantics are such that the caller doesn't + // take ownership either. Instead the error system takes + // ownership and retains it until the next call that affects the + // error queue. 
+ if (inc) { + if (error->data != nullptr) { + free(state->to_free); + state->to_free = error->data; + } + error->data = nullptr; + } + } + } + + if (inc) { + assert(!top); + err_clear(error); + state->bottom = i; + } + + return ret; +} + +uint32_t ERR_get_error() { + return get_error_values(1 /* inc */, 0 /* bottom */, nullptr, nullptr, + nullptr, nullptr); +} + +uint32_t ERR_get_error_line(const char **file, int *line) { + return get_error_values(1 /* inc */, 0 /* bottom */, file, line, nullptr, + nullptr); +} + +uint32_t ERR_get_error_line_data(const char **file, int *line, + const char **data, int *flags) { + return get_error_values(1 /* inc */, 0 /* bottom */, file, line, data, flags); +} + +uint32_t ERR_peek_error() { + return get_error_values(0 /* peek */, 0 /* bottom */, nullptr, nullptr, + nullptr, nullptr); +} + +uint32_t ERR_peek_error_line(const char **file, int *line) { + return get_error_values(0 /* peek */, 0 /* bottom */, file, line, nullptr, + nullptr); +} + +uint32_t ERR_peek_error_line_data(const char **file, int *line, + const char **data, int *flags) { + return get_error_values(0 /* peek */, 0 /* bottom */, file, line, data, + flags); +} + +uint32_t ERR_peek_last_error() { + return get_error_values(0 /* peek */, 1 /* top */, nullptr, nullptr, nullptr, + nullptr); +} + +uint32_t ERR_peek_last_error_line(const char **file, int *line) { + return get_error_values(0 /* peek */, 1 /* top */, file, line, nullptr, + nullptr); +} + +uint32_t ERR_peek_last_error_line_data(const char **file, int *line, + const char **data, int *flags) { + return get_error_values(0 /* peek */, 1 /* top */, file, line, data, flags); +} + +void ERR_clear_error() { + ERR_STATE *const state = err_get_state(); + unsigned i; + + if (state == nullptr) { + return; + } + + for (i = 0; i < ERR_NUM_ERRORS; i++) { + err_clear(&state->errors[i]); + } + free(state->to_free); + state->to_free = nullptr; + + state->top = state->bottom = 0; +} + +void ERR_remove_thread_state(const 
CRYPTO_THREADID *tid) { + if (tid != nullptr) { + assert(0); + return; + } + + ERR_clear_error(); +} + +int ERR_get_next_error_library() { + MutexWriteLock lock(&global_next_library_mutex); + return global_next_library++; +} + +void ERR_remove_state(unsigned long pid) { ERR_clear_error(); } + +void ERR_clear_system_error() { errno = 0; } + +// err_string_cmp is a compare function for searching error values with +// |bsearch| in |err_string_lookup|. +static int err_string_cmp(const void *a, const void *b) { + const uint32_t a_key = *((const uint32_t *)a) >> 15; + const uint32_t b_key = *((const uint32_t *)b) >> 15; + + if (a_key < b_key) { + return -1; + } else if (a_key > b_key) { + return 1; + } else { + return 0; + } +} + +// err_string_lookup looks up the string associated with |lib| and |key| in +// |values| and |string_data|. It returns the string or NULL if not found. +static const char *err_string_lookup(uint32_t lib, uint32_t key, + const uint32_t *values, size_t num_values, + const char *string_data) { + // |values| points to data in err_data.h, which is generated by + // err_data_generate.go. It's an array of uint32_t values. Each value has the + // following structure: + // | lib | key | offset | + // |6 bits| 11 bits | 15 bits | + // + // The |lib| value is a library identifier: one of the |ERR_LIB_*| values. + // The |key| is a reason code, depending on the context. + // The |offset| is the number of bytes from the start of |string_data| where + // the (NUL terminated) string for this value can be found. + // + // Values are sorted based on treating the |lib| and |key| part as an + // unsigned integer. 
+ if (lib >= (1 << 6) || key >= (1 << 11)) { + return nullptr; + } + uint32_t search_key = lib << 26 | key << 15; + const uint32_t *result = reinterpret_cast(bsearch( + &search_key, values, num_values, sizeof(uint32_t), err_string_cmp)); + if (result == nullptr) { + return nullptr; + } + + return &string_data[(*result) & 0x7fff]; +} + +namespace { +typedef struct library_name_st { + const char *str; + const char *symbol; + const char *reason_symbol; +} LIBRARY_NAME; +} // namespace + +static const LIBRARY_NAME kLibraryNames[ERR_NUM_LIBS] = { + {"invalid library (0)", nullptr, nullptr}, + {"unknown library", "NONE", "NONE_LIB"}, + {"system library", "SYS", "SYS_LIB"}, + {"bignum routines", "BN", "BN_LIB"}, + {"RSA routines", "RSA", "RSA_LIB"}, + {"Diffie-Hellman routines", "DH", "DH_LIB"}, + {"public key routines", "EVP", "EVP_LIB"}, + {"memory buffer routines", "BUF", "BUF_LIB"}, + {"object identifier routines", "OBJ", "OBJ_LIB"}, + {"PEM routines", "PEM", "PEM_LIB"}, + {"DSA routines", "DSA", "DSA_LIB"}, + {"X.509 certificate routines", "X509", "X509_LIB"}, + {"ASN.1 encoding routines", "ASN1", "ASN1_LIB"}, + {"configuration file routines", "CONF", "CONF_LIB"}, + {"common libcrypto routines", "CRYPTO", "CRYPTO_LIB"}, + {"elliptic curve routines", "EC", "EC_LIB"}, + {"SSL routines", "SSL", "SSL_LIB"}, + {"BIO routines", "BIO", "BIO_LIB"}, + {"PKCS7 routines", "PKCS7", "PKCS7_LIB"}, + {"PKCS8 routines", "PKCS8", "PKCS8_LIB"}, + {"X509 V3 routines", "X509V3", "X509V3_LIB"}, + {"random number generator", "RAND", "RAND_LIB"}, + {"ENGINE routines", "ENGINE", "ENGINE_LIB"}, + {"OCSP routines", "OCSP", "OCSP_LIB"}, + {"UI routines", "UI", "UI_LIB"}, + {"COMP routines", "COMP", "COMP_LIB"}, + {"ECDSA routines", "ECDSA", "ECDSA_LIB"}, + {"ECDH routines", "ECDH", "ECDH_LIB"}, + {"HMAC routines", "HMAC", "HMAC_LIB"}, + {"Digest functions", "DIGEST", "DIGEST_LIB"}, + {"Cipher functions", "CIPHER", "CIPHER_LIB"}, + {"HKDF functions", "HKDF", "HKDF_LIB"}, + {"Trust Token 
functions", "TRUST_TOKEN", "TRUST_TOKEN_LIB"}, + {"User defined functions", "USER", "USER_LIB"}, +}; + +static const char *err_lib_error_string(uint32_t packed_error) { + const uint32_t lib = ERR_GET_LIB(packed_error); + return lib >= ERR_NUM_LIBS ? nullptr : kLibraryNames[lib].str; +} + +const char *ERR_lib_error_string(uint32_t packed_error) { + const char *ret = err_lib_error_string(packed_error); + return ret == nullptr ? "unknown library" : ret; +} + +const char *ERR_lib_symbol_name(uint32_t packed_error) { + const uint32_t lib = ERR_GET_LIB(packed_error); + return lib >= ERR_NUM_LIBS ? nullptr : kLibraryNames[lib].symbol; +} + +const char *ERR_func_error_string(uint32_t packed_error) { + return "OPENSSL_internal"; +} + +static const char *err_reason_error_string(uint32_t packed_error, int symbol) { + const uint32_t lib = ERR_GET_LIB(packed_error); + const uint32_t reason = ERR_GET_REASON(packed_error); + + if (lib == ERR_LIB_SYS) { + if (!symbol && reason < 127) { + return strerror(reason); + } + return nullptr; + } + + if (reason < ERR_NUM_LIBS) { + return symbol ? kLibraryNames[reason].reason_symbol + : kLibraryNames[reason].str; + } + + if (reason < 100) { + // TODO(davidben): All our other reason strings match the symbol name. Only + // the common ones differ. Should we just consistently return the symbol + // name? + switch (reason) { + case ERR_R_MALLOC_FAILURE: + return symbol ? "MALLOC_FAILURE" : "malloc failure"; + case ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED: + return symbol ? "SHOULD_NOT_HAVE_BEEN_CALLED" + : "function should not have been called"; + case ERR_R_PASSED_NULL_PARAMETER: + return symbol ? "PASSED_NULL_PARAMETER" : "passed a null parameter"; + case ERR_R_INTERNAL_ERROR: + return symbol ? "INTERNAL_ERROR" : "internal error"; + case ERR_R_OVERFLOW: + return symbol ? "OVERFLOW" : "overflow"; + default: + return nullptr; + } + } + + // Unlike OpenSSL, BoringSSL's reason strings already match symbol name, so we + // do not need to check |symbol|. 
+ return err_string_lookup(lib, reason, kOpenSSLReasonValues, + kOpenSSLReasonValuesLen, kOpenSSLReasonStringData); +} + +const char *ERR_reason_error_string(uint32_t packed_error) { + const char *ret = err_reason_error_string(packed_error, /*symbol=*/0); + return ret == nullptr ? "unknown error" : ret; +} + +const char *ERR_reason_symbol_name(uint32_t packed_error) { + return err_reason_error_string(packed_error, /*symbol=*/1); +} + +char *ERR_error_string(uint32_t packed_error, char *ret) { + static char buf[ERR_ERROR_STRING_BUF_LEN]; + + if (ret == nullptr) { + // TODO(fork): remove this. + ret = buf; + } + +#if !defined(NDEBUG) + // This is aimed to help catch callers who don't provide + // |ERR_ERROR_STRING_BUF_LEN| bytes of space. + OPENSSL_memset(ret, 0, ERR_ERROR_STRING_BUF_LEN); +#endif + + return ERR_error_string_n(packed_error, ret, ERR_ERROR_STRING_BUF_LEN); +} + +char *ERR_error_string_n(uint32_t packed_error, char *buf, size_t len) { + if (len == 0) { + return nullptr; + } + + unsigned lib = ERR_GET_LIB(packed_error); + unsigned reason = ERR_GET_REASON(packed_error); + + const char *lib_str = err_lib_error_string(packed_error); + const char *reason_str = err_reason_error_string(packed_error, /*symbol=*/0); + + char lib_buf[32], reason_buf[32]; + if (lib_str == nullptr) { + snprintf(lib_buf, sizeof(lib_buf), "lib(%u)", lib); + lib_str = lib_buf; + } + + if (reason_str == nullptr) { + snprintf(reason_buf, sizeof(reason_buf), "reason(%u)", reason); + reason_str = reason_buf; + } + + int ret = snprintf(buf, len, "error:%08" PRIx32 ":%s:OPENSSL_internal:%s", + packed_error, lib_str, reason_str); + if (ret >= 0 && (size_t)ret >= len) { + // The output was truncated; make sure we always have 5 colon-separated + // fields, i.e. 4 colons. + static const unsigned num_colons = 4; + unsigned i; + char *s = buf; + + if (len <= num_colons) { + // In this situation it's not possible to ensure that the correct number + // of colons are included in the output. 
+ return buf; + } + + for (i = 0; i < num_colons; i++) { + char *colon = strchr(s, ':'); + char *last_pos = &buf[len - 1] - num_colons + i; + + if (colon == nullptr || colon > last_pos) { + // set colon |i| at last possible position (buf[len-1] is the + // terminating 0). If we're setting this colon, then all whole of the + // rest of the string must be colons in order to have the correct + // number. + OPENSSL_memset(last_pos, ':', num_colons - i); + break; + } + + s = colon + 1; + } + } + + return buf; +} + +void ERR_print_errors_cb(ERR_print_errors_callback_t callback, void *ctx) { + char buf[ERR_ERROR_STRING_BUF_LEN]; + char buf2[1024]; + const char *file, *data; + int line, flags; + uint32_t packed_error; + + // thread_hash is the least-significant bits of the |ERR_STATE| pointer value + // for this thread. + const unsigned long thread_hash = (uintptr_t)err_get_state(); + + for (;;) { + packed_error = ERR_get_error_line_data(&file, &line, &data, &flags); + if (packed_error == 0) { + break; + } + + ERR_error_string_n(packed_error, buf, sizeof(buf)); + snprintf(buf2, sizeof(buf2), "%lu:%s:%s:%d:%s\n", thread_hash, buf, file, + line, (flags & ERR_FLAG_STRING) ? data : ""); + if (callback(buf2, strlen(buf2), ctx) <= 0) { + break; + } + } +} + +static int print_errors_to_file(const char *msg, size_t msg_len, void *ctx) { + assert(msg[msg_len] == '\0'); + FILE *fp = reinterpret_cast(ctx); + int res = fputs(msg, fp); + return res < 0 ? 0 : 1; +} + +void ERR_print_errors_fp(FILE *file) { + ERR_print_errors_cb(print_errors_to_file, file); +} + +// err_set_error_data sets the data on the most recent error. 
+static void err_set_error_data(char *data) { + ERR_STATE *const state = err_get_state(); + struct err_error_st *error; + + if (state == nullptr || state->top == state->bottom) { + free(data); + return; + } + + error = &state->errors[state->top]; + + free(error->data); + error->data = data; +} + +void ERR_put_error(int library, int unused, int reason, const char *file, + unsigned line) { + ERR_STATE *const state = err_get_state(); + struct err_error_st *error; + + if (state == nullptr) { + return; + } + + if (library == ERR_LIB_SYS && reason == 0) { +#if defined(OPENSSL_WINDOWS) + reason = GetLastError(); +#else + reason = errno; +#endif + } + + state->top = (state->top + 1) % ERR_NUM_ERRORS; + if (state->top == state->bottom) { + state->bottom = (state->bottom + 1) % ERR_NUM_ERRORS; + } + + error = &state->errors[state->top]; + err_clear(error); + error->file = file; + error->line = line; + error->packed = ERR_PACK(library, reason); +} + +// ERR_add_error_data_vdata takes a variable number of const char* pointers, +// concatenates them and sets the result as the data on the most recent +// error. +static void err_add_error_vdata(unsigned num, va_list args) { + size_t total_size = 0; + const char *substr; + char *buf; + + va_list args_copy; + va_copy(args_copy, args); + for (size_t i = 0; i < num; i++) { + substr = va_arg(args_copy, const char *); + if (substr == nullptr) { + continue; + } + size_t substr_len = strlen(substr); + if (SIZE_MAX - total_size < substr_len) { + return; // Would overflow. + } + total_size += substr_len; + } + va_end(args_copy); + if (total_size == SIZE_MAX) { + return; // Would overflow. + } + total_size += 1; // NUL terminator. 
+ if ((buf = reinterpret_cast(malloc(total_size))) == nullptr) { + return; + } + buf[0] = '\0'; + for (size_t i = 0; i < num; i++) { + substr = va_arg(args, const char *); + if (substr == nullptr) { + continue; + } + if (OPENSSL_strlcat(buf, substr, total_size) >= total_size) { + assert(0); // should not be possible. + } + } + err_set_error_data(buf); +} + +void ERR_add_error_data(unsigned count, ...) { + va_list args; + va_start(args, count); + err_add_error_vdata(count, args); + va_end(args); +} + +void ERR_add_error_dataf(const char *format, ...) { + char *buf = nullptr; + va_list ap; + + va_start(ap, format); + if (OPENSSL_vasprintf_internal(&buf, format, ap, /*system_malloc=*/1) == -1) { + return; + } + va_end(ap); + + err_set_error_data(buf); +} + +void ERR_set_error_data(char *data, int flags) { + if (!(flags & ERR_FLAG_STRING)) { + // We do not support non-string error data. + assert(0); + return; + } + // We can not use OPENSSL_strdup because we don't want to call OPENSSL_malloc, + // which can affect the error stack. + char *copy = strdup_libc_malloc(data); + if (copy != nullptr) { + err_set_error_data(copy); + } + if (flags & ERR_FLAG_MALLOCED) { + // We can not take ownership of |data| directly because it is allocated with + // |OPENSSL_malloc| and we will free it with system |free| later. 
+ OPENSSL_free(data); + } +} + +int ERR_set_mark() { + ERR_STATE *const state = err_get_state(); + + if (state == nullptr || state->bottom == state->top) { + return 0; + } + state->errors[state->top].mark = 1; + return 1; +} + +int ERR_pop_to_mark() { + ERR_STATE *const state = err_get_state(); + + if (state == nullptr) { + return 0; + } + + while (state->bottom != state->top) { + struct err_error_st *error = &state->errors[state->top]; + + if (error->mark) { + error->mark = 0; + return 1; + } + + err_clear(error); + if (state->top == 0) { + state->top = ERR_NUM_ERRORS - 1; + } else { + state->top--; + } + } + + return 0; +} + +void ERR_load_crypto_strings() {} + +void ERR_free_strings() {} + +void ERR_load_BIO_strings() {} + +void ERR_load_ERR_strings() {} + +void ERR_load_RAND_strings() {} + +BSSL_NAMESPACE_BEGIN + +struct err_save_state_st { + struct err_error_st *errors; + size_t num_errors; +}; + +BSSL_NAMESPACE_END + +void bssl::ERR_SAVE_STATE_free(ERR_SAVE_STATE *state) { + if (state == nullptr) { + return; + } + for (size_t i = 0; i < state->num_errors; i++) { + err_clear(&state->errors[i]); + } + free(state->errors); + free(state); +} + +ERR_SAVE_STATE *bssl::ERR_save_state() { + ERR_STATE *const state = err_get_state(); + if (state == nullptr || state->top == state->bottom) { + return nullptr; + } + + ERR_SAVE_STATE *ret = + reinterpret_cast(malloc(sizeof(ERR_SAVE_STATE))); + if (ret == nullptr) { + return nullptr; + } + + // Errors are stored in the range (bottom, top]. + size_t num_errors = state->top >= state->bottom + ? 
state->top - state->bottom + : ERR_NUM_ERRORS + state->top - state->bottom; + assert(num_errors < ERR_NUM_ERRORS); + ret->errors = reinterpret_cast( + malloc(num_errors * sizeof(struct err_error_st))); + if (ret->errors == nullptr) { + free(ret); + return nullptr; + } + OPENSSL_memset(ret->errors, 0, num_errors * sizeof(struct err_error_st)); + ret->num_errors = num_errors; + + for (size_t i = 0; i < num_errors; i++) { + size_t j = (state->bottom + i + 1) % ERR_NUM_ERRORS; + err_copy(&ret->errors[i], &state->errors[j]); + } + return ret; +} + +void bssl::ERR_restore_state(const ERR_SAVE_STATE *state) { + if (state == nullptr || state->num_errors == 0) { + ERR_clear_error(); + return; + } + + if (state->num_errors >= ERR_NUM_ERRORS) { + abort(); + } + + ERR_STATE *const dst = err_get_state(); + if (dst == nullptr) { + return; + } + + for (size_t i = 0; i < state->num_errors; i++) { + err_copy(&dst->errors[i], &state->errors[i]); + } + dst->top = (unsigned)(state->num_errors - 1); + dst->bottom = ERR_NUM_ERRORS - 1; +} diff --git a/third_party/boringssl/src/crypto/err/internal.h b/third_party/boringssl/src/crypto/err/internal.h index 179f756b..e81768c7 100644 --- a/third_party/boringssl/src/crypto/err/internal.h +++ b/third_party/boringssl/src/crypto/err/internal.h @@ -1,26 +1,24 @@ -/* Copyright (c) 2017, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ +// Copyright 2017 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. #ifndef OPENSSL_HEADER_CRYPTO_ERR_INTERNAL_H #define OPENSSL_HEADER_CRYPTO_ERR_INTERNAL_H #include -#if defined(__cplusplus) -extern "C" { -#endif +BSSL_NAMESPACE_BEGIN // Private error queue functions. @@ -35,24 +33,13 @@ OPENSSL_EXPORT void ERR_SAVE_STATE_free(ERR_SAVE_STATE *state); // ERR_save_state returns a newly-allocated |ERR_SAVE_STATE| structure // containing the current state of the error queue or NULL on allocation // error. It should be released with |ERR_SAVE_STATE_free|. -OPENSSL_EXPORT ERR_SAVE_STATE *ERR_save_state(void); +OPENSSL_EXPORT ERR_SAVE_STATE *ERR_save_state(); // ERR_restore_state clears the error queue and replaces it with |state|. 
OPENSSL_EXPORT void ERR_restore_state(const ERR_SAVE_STATE *state); - -#if defined(__cplusplus) -} // extern C - -extern "C++" { - -BSSL_NAMESPACE_BEGIN - BORINGSSL_MAKE_DELETER(ERR_SAVE_STATE, ERR_SAVE_STATE_free) BSSL_NAMESPACE_END -} // extern C++ -#endif - #endif // OPENSSL_HEADER_CRYPTO_ERR_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/evp/evp.c b/third_party/boringssl/src/crypto/evp/evp.c deleted file mode 100644 index bb316450..00000000 --- a/third_party/boringssl/src/crypto/evp/evp.c +++ /dev/null @@ -1,456 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
*/ - -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include "internal.h" -#include "../internal.h" - - -// Node depends on |EVP_R_NOT_XOF_OR_INVALID_LENGTH|. -// -// TODO(davidben): Fix Node to not touch the error queue itself and remove this. -OPENSSL_DECLARE_ERROR_REASON(EVP, NOT_XOF_OR_INVALID_LENGTH) - -// The HPKE module uses the EVP error namespace, but it lives in another -// directory. -OPENSSL_DECLARE_ERROR_REASON(EVP, EMPTY_PSK) - -EVP_PKEY *EVP_PKEY_new(void) { - EVP_PKEY *ret; - - ret = OPENSSL_malloc(sizeof(EVP_PKEY)); - if (ret == NULL) { - OPENSSL_PUT_ERROR(EVP, ERR_R_MALLOC_FAILURE); - return NULL; - } - - OPENSSL_memset(ret, 0, sizeof(EVP_PKEY)); - ret->type = EVP_PKEY_NONE; - ret->references = 1; - - return ret; -} - -static void free_it(EVP_PKEY *pkey) { - if (pkey->ameth && pkey->ameth->pkey_free) { - pkey->ameth->pkey_free(pkey); - pkey->pkey.ptr = NULL; - pkey->type = EVP_PKEY_NONE; - } -} - -void EVP_PKEY_free(EVP_PKEY *pkey) { - if (pkey == NULL) { - return; - } - - if (!CRYPTO_refcount_dec_and_test_zero(&pkey->references)) { - return; - } - - free_it(pkey); - OPENSSL_free(pkey); -} - -int EVP_PKEY_up_ref(EVP_PKEY *pkey) { - CRYPTO_refcount_inc(&pkey->references); - return 1; -} - -int EVP_PKEY_is_opaque(const EVP_PKEY *pkey) { - if (pkey->ameth && pkey->ameth->pkey_opaque) { - return pkey->ameth->pkey_opaque(pkey); - } - return 0; -} - -int EVP_PKEY_cmp(const EVP_PKEY *a, const EVP_PKEY *b) { - if (a->type != b->type) { - return -1; - } - - if (a->ameth) { - int ret; - // Compare parameters if the algorithm has them - if (a->ameth->param_cmp) { - ret = a->ameth->param_cmp(a, b); - if (ret <= 0) { - return ret; - } - } - - if (a->ameth->pub_cmp) { - return a->ameth->pub_cmp(a, b); - } - } - - return -2; -} - -int EVP_PKEY_copy_parameters(EVP_PKEY *to, const EVP_PKEY *from) { - if (to->type != from->type) { - OPENSSL_PUT_ERROR(EVP, EVP_R_DIFFERENT_KEY_TYPES); - goto err; - } - - if 
(EVP_PKEY_missing_parameters(from)) { - OPENSSL_PUT_ERROR(EVP, EVP_R_MISSING_PARAMETERS); - goto err; - } - - if (from->ameth && from->ameth->param_copy) { - return from->ameth->param_copy(to, from); - } - -err: - return 0; -} - -int EVP_PKEY_missing_parameters(const EVP_PKEY *pkey) { - if (pkey->ameth && pkey->ameth->param_missing) { - return pkey->ameth->param_missing(pkey); - } - return 0; -} - -int EVP_PKEY_size(const EVP_PKEY *pkey) { - if (pkey && pkey->ameth && pkey->ameth->pkey_size) { - return pkey->ameth->pkey_size(pkey); - } - return 0; -} - -int EVP_PKEY_bits(const EVP_PKEY *pkey) { - if (pkey && pkey->ameth && pkey->ameth->pkey_bits) { - return pkey->ameth->pkey_bits(pkey); - } - return 0; -} - -int EVP_PKEY_id(const EVP_PKEY *pkey) { - return pkey->type; -} - -// evp_pkey_asn1_find returns the ASN.1 method table for the given |nid|, which -// should be one of the |EVP_PKEY_*| values. It returns NULL if |nid| is -// unknown. -static const EVP_PKEY_ASN1_METHOD *evp_pkey_asn1_find(int nid) { - switch (nid) { - case EVP_PKEY_RSA: - return &rsa_asn1_meth; - case EVP_PKEY_EC: - return &ec_asn1_meth; - case EVP_PKEY_DSA: - return &dsa_asn1_meth; - case EVP_PKEY_ED25519: - return &ed25519_asn1_meth; - case EVP_PKEY_X25519: - return &x25519_asn1_meth; - default: - return NULL; - } -} - -int EVP_PKEY_type(int nid) { - const EVP_PKEY_ASN1_METHOD *meth = evp_pkey_asn1_find(nid); - if (meth == NULL) { - return NID_undef; - } - return meth->pkey_id; -} - -int EVP_PKEY_set1_RSA(EVP_PKEY *pkey, RSA *key) { - if (EVP_PKEY_assign_RSA(pkey, key)) { - RSA_up_ref(key); - return 1; - } - return 0; -} - -int EVP_PKEY_assign_RSA(EVP_PKEY *pkey, RSA *key) { - return EVP_PKEY_assign(pkey, EVP_PKEY_RSA, key); -} - -RSA *EVP_PKEY_get0_RSA(const EVP_PKEY *pkey) { - if (pkey->type != EVP_PKEY_RSA) { - OPENSSL_PUT_ERROR(EVP, EVP_R_EXPECTING_AN_RSA_KEY); - return NULL; - } - return pkey->pkey.rsa; -} - -RSA *EVP_PKEY_get1_RSA(const EVP_PKEY *pkey) { - RSA *rsa = 
EVP_PKEY_get0_RSA(pkey); - if (rsa != NULL) { - RSA_up_ref(rsa); - } - return rsa; -} - -int EVP_PKEY_set1_DSA(EVP_PKEY *pkey, DSA *key) { - if (EVP_PKEY_assign_DSA(pkey, key)) { - DSA_up_ref(key); - return 1; - } - return 0; -} - -int EVP_PKEY_assign_DSA(EVP_PKEY *pkey, DSA *key) { - return EVP_PKEY_assign(pkey, EVP_PKEY_DSA, key); -} - -DSA *EVP_PKEY_get0_DSA(const EVP_PKEY *pkey) { - if (pkey->type != EVP_PKEY_DSA) { - OPENSSL_PUT_ERROR(EVP, EVP_R_EXPECTING_A_DSA_KEY); - return NULL; - } - return pkey->pkey.dsa; -} - -DSA *EVP_PKEY_get1_DSA(const EVP_PKEY *pkey) { - DSA *dsa = EVP_PKEY_get0_DSA(pkey); - if (dsa != NULL) { - DSA_up_ref(dsa); - } - return dsa; -} - -int EVP_PKEY_set1_EC_KEY(EVP_PKEY *pkey, EC_KEY *key) { - if (EVP_PKEY_assign_EC_KEY(pkey, key)) { - EC_KEY_up_ref(key); - return 1; - } - return 0; -} - -int EVP_PKEY_assign_EC_KEY(EVP_PKEY *pkey, EC_KEY *key) { - return EVP_PKEY_assign(pkey, EVP_PKEY_EC, key); -} - -EC_KEY *EVP_PKEY_get0_EC_KEY(const EVP_PKEY *pkey) { - if (pkey->type != EVP_PKEY_EC) { - OPENSSL_PUT_ERROR(EVP, EVP_R_EXPECTING_AN_EC_KEY_KEY); - return NULL; - } - return pkey->pkey.ec; -} - -EC_KEY *EVP_PKEY_get1_EC_KEY(const EVP_PKEY *pkey) { - EC_KEY *ec_key = EVP_PKEY_get0_EC_KEY(pkey); - if (ec_key != NULL) { - EC_KEY_up_ref(ec_key); - } - return ec_key; -} - -DH *EVP_PKEY_get0_DH(const EVP_PKEY *pkey) { return NULL; } -DH *EVP_PKEY_get1_DH(const EVP_PKEY *pkey) { return NULL; } - -int EVP_PKEY_assign(EVP_PKEY *pkey, int type, void *key) { - if (!EVP_PKEY_set_type(pkey, type)) { - return 0; - } - pkey->pkey.ptr = key; - return key != NULL; -} - -int EVP_PKEY_set_type(EVP_PKEY *pkey, int type) { - const EVP_PKEY_ASN1_METHOD *ameth; - - if (pkey && pkey->pkey.ptr) { - free_it(pkey); - } - - ameth = evp_pkey_asn1_find(type); - if (ameth == NULL) { - OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_ALGORITHM); - ERR_add_error_dataf("algorithm %d", type); - return 0; - } - - if (pkey) { - pkey->ameth = ameth; - pkey->type = 
pkey->ameth->pkey_id; - } - - return 1; -} - -EVP_PKEY *EVP_PKEY_new_raw_private_key(int type, ENGINE *unused, - const uint8_t *in, size_t len) { - EVP_PKEY *ret = EVP_PKEY_new(); - if (ret == NULL || - !EVP_PKEY_set_type(ret, type)) { - goto err; - } - - if (ret->ameth->set_priv_raw == NULL) { - OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); - goto err; - } - - if (!ret->ameth->set_priv_raw(ret, in, len)) { - goto err; - } - - return ret; - -err: - EVP_PKEY_free(ret); - return NULL; -} - -EVP_PKEY *EVP_PKEY_new_raw_public_key(int type, ENGINE *unused, - const uint8_t *in, size_t len) { - EVP_PKEY *ret = EVP_PKEY_new(); - if (ret == NULL || - !EVP_PKEY_set_type(ret, type)) { - goto err; - } - - if (ret->ameth->set_pub_raw == NULL) { - OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); - goto err; - } - - if (!ret->ameth->set_pub_raw(ret, in, len)) { - goto err; - } - - return ret; - -err: - EVP_PKEY_free(ret); - return NULL; -} - -int EVP_PKEY_get_raw_private_key(const EVP_PKEY *pkey, uint8_t *out, - size_t *out_len) { - if (pkey->ameth->get_priv_raw == NULL) { - OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); - return 0; - } - - return pkey->ameth->get_priv_raw(pkey, out, out_len); -} - -int EVP_PKEY_get_raw_public_key(const EVP_PKEY *pkey, uint8_t *out, - size_t *out_len) { - if (pkey->ameth->get_pub_raw == NULL) { - OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); - return 0; - } - - return pkey->ameth->get_pub_raw(pkey, out, out_len); -} - -int EVP_PKEY_cmp_parameters(const EVP_PKEY *a, const EVP_PKEY *b) { - if (a->type != b->type) { - return -1; - } - if (a->ameth && a->ameth->param_cmp) { - return a->ameth->param_cmp(a, b); - } - return -2; -} - -int EVP_PKEY_CTX_set_signature_md(EVP_PKEY_CTX *ctx, const EVP_MD *md) { - return EVP_PKEY_CTX_ctrl(ctx, -1, EVP_PKEY_OP_TYPE_SIG, EVP_PKEY_CTRL_MD, 0, - (void *)md); -} - -int EVP_PKEY_CTX_get_signature_md(EVP_PKEY_CTX 
*ctx, const EVP_MD **out_md) { - return EVP_PKEY_CTX_ctrl(ctx, -1, EVP_PKEY_OP_TYPE_SIG, EVP_PKEY_CTRL_GET_MD, - 0, (void *)out_md); -} - -void *EVP_PKEY_get0(const EVP_PKEY *pkey) { - // Node references, but never calls this function, so for now we return NULL. - // If other projects require complete support, call |EVP_PKEY_get0_RSA|, etc., - // rather than reading |pkey->pkey.ptr| directly. This avoids problems if our - // internal representation does not match the type the caller expects from - // OpenSSL. - return NULL; -} - -void OpenSSL_add_all_algorithms(void) {} - -void OPENSSL_add_all_algorithms_conf(void) {} - -void OpenSSL_add_all_ciphers(void) {} - -void OpenSSL_add_all_digests(void) {} - -void EVP_cleanup(void) {} - -int EVP_PKEY_base_id(const EVP_PKEY *pkey) { - // OpenSSL has two notions of key type because it supports multiple OIDs for - // the same algorithm: NID_rsa vs NID_rsaEncryption and five distinct spelling - // of DSA. We do not support these, so the base ID is simply the ID. - return EVP_PKEY_id(pkey); -} diff --git a/third_party/boringssl/src/crypto/evp/evp.cc b/third_party/boringssl/src/crypto/evp/evp.cc new file mode 100644 index 00000000..1b182592 --- /dev/null +++ b/third_party/boringssl/src/crypto/evp/evp.cc @@ -0,0 +1,451 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include + +#include +#include +#include +#include + +#include "../internal.h" +#include "../mem_internal.h" +#include "internal.h" + + +using namespace bssl; + +// Node depends on |EVP_R_NOT_XOF_OR_INVALID_LENGTH|. +// +// TODO(davidben): Fix Node to not touch the error queue itself and remove this. +OPENSSL_DECLARE_ERROR_REASON(EVP, NOT_XOF_OR_INVALID_LENGTH) + +// The HPKE module uses the EVP error namespace, but it lives in another +// directory. +OPENSSL_DECLARE_ERROR_REASON(EVP, EMPTY_PSK) + +EVP_PKEY *EVP_PKEY_new() { return New(); } + +EvpPkey::EvpPkey() : RefCounted(CheckSubClass()) {} + +EvpPkey::~EvpPkey() { evp_pkey_set0(this, nullptr, nullptr); } + +void EVP_PKEY_free(EVP_PKEY *pkey) { + if (pkey == nullptr) { + return; + } + + auto *impl = FromOpaque(pkey); + impl->DecRefInternal(); +} + +int EVP_PKEY_up_ref(EVP_PKEY *pkey) { + auto *impl = FromOpaque(pkey); + impl->UpRefInternal(); + return 1; +} + +EVP_PKEY *EVP_PKEY_dup_ref(const EVP_PKEY *pkey) { + auto pkey_ref = const_cast(pkey); + // We know that this call always returns one. + EVP_PKEY_up_ref(pkey_ref); + return pkey_ref; +} + +int EVP_PKEY_is_opaque(const EVP_PKEY *pkey) { + auto *impl = FromOpaque(pkey); + if (impl->ameth && impl->ameth->pkey_opaque) { + return impl->ameth->pkey_opaque(impl); + } + return 0; +} + +int EVP_PKEY_eq(const EVP_PKEY *a, const EVP_PKEY *b) { + // This also checks that |EVP_PKEY_id| matches. 
+ if (!EVP_PKEY_parameters_eq(a, b)) { + return 0; + } + + auto *a_impl = FromOpaque(a); + auto *b_impl = FromOpaque(b); + return a_impl->ameth != nullptr && a_impl->ameth->pub_equal != nullptr && + a_impl->pkey != nullptr && b_impl->pkey != nullptr && + a_impl->ameth->pub_equal(a_impl, b_impl); +} + +int EVP_PKEY_cmp(const EVP_PKEY *a, const EVP_PKEY *b) { + return EVP_PKEY_eq(a, b); +} + +int EVP_PKEY_copy_parameters(EVP_PKEY *to, const EVP_PKEY *from) { + auto *to_impl = FromOpaque(to); + auto *from_impl = FromOpaque(from); + + if (EVP_PKEY_id(to_impl) == EVP_PKEY_NONE) { + // TODO(crbug.com/42290409): This shouldn't leave |to| in a half-empty state + // on error. The complexity here largely comes from parameterless DSA keys, + // which we no longer support, so this function can probably be trimmed + // down. + evp_pkey_set0(to_impl, from_impl->ameth, nullptr); + } else if (EVP_PKEY_id(to_impl) != EVP_PKEY_id(from_impl)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DIFFERENT_KEY_TYPES); + return 0; + } + + if (EVP_PKEY_missing_parameters(from_impl)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_MISSING_PARAMETERS); + return 0; + } + + // Once set, parameters may not change. + if (!EVP_PKEY_missing_parameters(to_impl)) { + if (EVP_PKEY_parameters_eq(to_impl, from_impl) == 1) { + return 1; + } + OPENSSL_PUT_ERROR(EVP, EVP_R_DIFFERENT_PARAMETERS); + return 0; + } + + if (from_impl->ameth && from_impl->ameth->param_copy) { + return from_impl->ameth->param_copy(to_impl, from_impl); + } + + // TODO(https://crbug.com/42290406): If the algorithm takes no parameters, + // copying them should vacuously succeed. Better yet, simplify this whole + // notion of parameter copying above. + return 0; +} + +int EVP_PKEY_missing_parameters(const EVP_PKEY *pkey) { + auto *impl = FromOpaque(pkey); + if (impl->ameth == nullptr) { + return 0; // EVP_PKEY_NONE is not parameterized, so nothing is missing. + } + if (impl->pkey == nullptr) { + // This is an invalid, half-empty object. 
Report something is missing to + // stop other parameter-based functions. + return 1; + } + if (impl->ameth->param_missing) { + return impl->ameth->param_missing(impl); + } + return 0; // Not parameterized, so nothing is missing. +} + +int EVP_PKEY_size(const EVP_PKEY *pkey) { + auto *impl = FromOpaque(pkey); + if (impl && impl->ameth && impl->ameth->pkey_size) { + return impl->ameth->pkey_size(impl); + } + return 0; +} + +int EVP_PKEY_bits(const EVP_PKEY *pkey) { + auto *impl = FromOpaque(pkey); + if (impl && impl->ameth && impl->ameth->pkey_bits) { + return impl->ameth->pkey_bits(impl); + } + return 0; +} + +int EVP_PKEY_id(const EVP_PKEY *pkey) { + auto *impl = FromOpaque(pkey); + return impl->ameth != nullptr ? impl->ameth->pkey_id : EVP_PKEY_NONE; +} + +void bssl::evp_pkey_set0(EvpPkey *pkey, const EVP_PKEY_ASN1_METHOD *method, + void *pkey_data) { + if (pkey->ameth && pkey->ameth->pkey_free) { + pkey->ameth->pkey_free(pkey); + } + pkey->ameth = method; + pkey->pkey = pkey_data; +} + +int EVP_PKEY_type(int nid) { + // In OpenSSL, this was used to map between type aliases. BoringSSL supports + // no type aliases, so this function is just the identity. + return nid; +} + +int EVP_PKEY_assign(EVP_PKEY *pkey, int type, void *key) { + // This function can only be used to assign RSA, DSA, EC, and DH keys. Other + // key types have internal representations which are not exposed through the + // public API. 
+ switch (type) { + case EVP_PKEY_RSA: + return EVP_PKEY_assign_RSA(pkey, reinterpret_cast(key)); + case EVP_PKEY_DSA: + return EVP_PKEY_assign_DSA(pkey, reinterpret_cast(key)); + case EVP_PKEY_EC: + return EVP_PKEY_assign_EC_KEY(pkey, reinterpret_cast(key)); + case EVP_PKEY_DH: + return EVP_PKEY_assign_DH(pkey, reinterpret_cast(key)); + } + + OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_ALGORITHM); + ERR_add_error_dataf("algorithm %d", type); + return 0; +} + +int EVP_PKEY_set_type(EVP_PKEY *pkey, int type) { + auto *impl = FromOpaque(pkey); + if (impl && impl->pkey) { + // Some callers rely on |pkey| getting cleared even if |type| is + // unsupported, usually setting |type| to |EVP_PKEY_NONE|. + evp_pkey_set0(impl, nullptr, nullptr); + } + + // This function broadly isn't useful. It initializes |EVP_PKEY| for a type, + // but forgets to put anything in the |pkey|. The one pattern where it does + // anything is |EVP_PKEY_X25519|, where it's needed to make + // |EVP_PKEY_set1_tls_encodedpoint| work, so we support only that. 
+ const EVP_PKEY_ALG *alg; + if (type == EVP_PKEY_X25519) { + alg = EVP_pkey_x25519(); + } else { + OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_ALGORITHM); + ERR_add_error_dataf("algorithm %d", type); + return 0; + } + + if (impl) { + evp_pkey_set0(impl, alg->method, nullptr); + } + + return 1; +} + +EVP_PKEY *EVP_PKEY_from_raw_private_key(const EVP_PKEY_ALG *alg, + const uint8_t *in, size_t len) { + if (alg->method->set_priv_raw == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_ALGORITHM); + return nullptr; + } + UniquePtr ret(FromOpaque(EVP_PKEY_new())); + if (ret == nullptr || !alg->method->set_priv_raw(ret.get(), in, len)) { + return nullptr; + } + return ret.release(); +} + +EVP_PKEY *EVP_PKEY_from_private_seed(const EVP_PKEY_ALG *alg, const uint8_t *in, + size_t len) { + if (alg->method->set_priv_seed == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_ALGORITHM); + return nullptr; + } + UniquePtr ret(FromOpaque(EVP_PKEY_new())); + if (ret == nullptr || !alg->method->set_priv_seed(ret.get(), in, len)) { + return nullptr; + } + return ret.release(); +} + +EVP_PKEY *EVP_PKEY_from_raw_public_key(const EVP_PKEY_ALG *alg, + const uint8_t *in, size_t len) { + if (alg->method->set_pub_raw == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_ALGORITHM); + return nullptr; + } + UniquePtr ret(FromOpaque(EVP_PKEY_new())); + if (ret == nullptr || !alg->method->set_pub_raw(ret.get(), in, len)) { + return nullptr; + } + return ret.release(); +} + +EVP_PKEY *EVP_PKEY_new_raw_private_key(int type, ENGINE *unused, + const uint8_t *in, size_t len) { + for (const EVP_PKEY_ALG *alg : GetDefaultEVPAlgorithms()) { + if (alg->method && alg->method->pkey_id == type) { + return EVP_PKEY_from_raw_private_key(alg, in, len); + } + } + OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_ALGORITHM); + return nullptr; +} + +EVP_PKEY *EVP_PKEY_new_raw_public_key(int type, ENGINE *unused, + const uint8_t *in, size_t len) { + for (const EVP_PKEY_ALG *alg : GetDefaultEVPAlgorithms()) { + if 
(alg->method && alg->method->pkey_id == type) { + return EVP_PKEY_from_raw_public_key(alg, in, len); + } + } + OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_ALGORITHM); + return nullptr; +} + +int EVP_PKEY_get_raw_private_key(const EVP_PKEY *pkey, uint8_t *out, + size_t *out_len) { + auto *impl = FromOpaque(pkey); + + if (impl->ameth == nullptr || impl->ameth->get_priv_raw == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); + return 0; + } + + return impl->ameth->get_priv_raw(impl, out, out_len); +} + +int EVP_PKEY_get_private_seed(const EVP_PKEY *pkey, uint8_t *out, + size_t *out_len) { + auto *impl = FromOpaque(pkey); + + if (impl->ameth == nullptr || impl->ameth->get_priv_seed == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); + return 0; + } + + return impl->ameth->get_priv_seed(impl, out, out_len); +} + +int EVP_PKEY_get_raw_public_key(const EVP_PKEY *pkey, uint8_t *out, + size_t *out_len) { + auto *impl = FromOpaque(pkey); + + if (impl->ameth == nullptr || impl->ameth->get_pub_raw == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); + return 0; + } + + return impl->ameth->get_pub_raw(impl, out, out_len); +} + +int EVP_PKEY_parameters_eq(const EVP_PKEY *a, const EVP_PKEY *b) { + if (EVP_PKEY_id(a) != EVP_PKEY_id(b)) { + return 0; + } + + auto *a_impl = FromOpaque(a); + auto *b_impl = FromOpaque(b); + if (a_impl->ameth && a_impl->ameth->param_equal) { + return a_impl->ameth->param_equal(a_impl, b_impl); + } + // If the algorithm does not use parameters, the two null value compare as + // vacuously equal. 
+ return 1; +} + +int EVP_PKEY_cmp_parameters(const EVP_PKEY *a, const EVP_PKEY *b) { + return EVP_PKEY_parameters_eq(a, b); +} + +int EVP_PKEY_CTX_set_signature_md(EVP_PKEY_CTX *ctx, const EVP_MD *md) { + return EVP_PKEY_CTX_ctrl(ctx, -1, EVP_PKEY_OP_TYPE_SIG, EVP_PKEY_CTRL_MD, 0, + (void *)md); +} + +int EVP_PKEY_CTX_get_signature_md(EVP_PKEY_CTX *ctx, const EVP_MD **out_md) { + return EVP_PKEY_CTX_ctrl(ctx, -1, EVP_PKEY_OP_TYPE_SIG, EVP_PKEY_CTRL_GET_MD, + 0, (void *)out_md); +} + +int EVP_PKEY_CTX_set1_signature_context_string(EVP_PKEY_CTX *ctx, + const uint8_t *context, + size_t context_len) { + Span context_string(context, context_len); + return EVP_PKEY_CTX_ctrl(ctx, -1, EVP_PKEY_OP_TYPE_SIG, + EVP_PKEY_CTRL_SIGNATURE_CONTEXT_STRING, 0, + &context_string); +} + +void *EVP_PKEY_get0(const EVP_PKEY *pkey) { + // Node references, but never calls this function, so for now we return NULL. + // If other projects require complete support, call |EVP_PKEY_get0_RSA|, etc., + // rather than reading |pkey->pkey| directly. This avoids problems if our + // internal representation does not match the type the caller expects from + // OpenSSL. 
+ return nullptr; +} + +void OpenSSL_add_all_algorithms() {} + +void OPENSSL_add_all_algorithms_conf() {} + +void OpenSSL_add_all_ciphers() {} + +void OpenSSL_add_all_digests() {} + +void EVP_cleanup() {} + +int EVP_default_properties_is_fips_enabled(OSSL_LIB_CTX *libctx) { + return FIPS_mode(); +} + +int EVP_PKEY_set1_tls_encodedpoint(EVP_PKEY *pkey, const uint8_t *in, + size_t len) { + auto *impl = FromOpaque(pkey); + + if (impl->ameth == nullptr || impl->ameth->set1_tls_encodedpoint == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); + return 0; + } + + return impl->ameth->set1_tls_encodedpoint(impl, in, len); +} + +size_t EVP_PKEY_get1_tls_encodedpoint(const EVP_PKEY *pkey, uint8_t **out_ptr) { + auto *impl = FromOpaque(pkey); + + if (impl->ameth == nullptr || impl->ameth->get1_tls_encodedpoint == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); + return 0; + } + + return impl->ameth->get1_tls_encodedpoint(impl, out_ptr); +} + +int EVP_PKEY_base_id(const EVP_PKEY *pkey) { + // OpenSSL has two notions of key type because it supports multiple OIDs for + // the same algorithm: NID_rsa vs NID_rsaEncryption and five distinct spelling + // of DSA. We do not support these, so the base ID is simply the ID. 
+ return EVP_PKEY_id(pkey); +} + +int EVP_PKEY_has_public(const EVP_PKEY *pkey) { + auto *impl = FromOpaque(pkey); + if (impl == nullptr || impl->ameth == nullptr || impl->pkey == nullptr || + impl->ameth->pub_present == nullptr) { + return 0; + } + return impl->ameth->pub_present(impl); +} + +int EVP_PKEY_has_private(const EVP_PKEY *pkey) { + auto *impl = FromOpaque(pkey); + if (impl == nullptr || impl->ameth == nullptr || impl->pkey == nullptr || + impl->ameth->priv_present == nullptr) { + return 0; + } + return impl->ameth->priv_present(impl); +} + +EVP_PKEY *EVP_PKEY_copy_public(const EVP_PKEY *pkey) { + auto *impl = FromOpaque(pkey); + if (impl == nullptr || impl->ameth == nullptr || impl->pkey == nullptr || + impl->ameth->pub_copy == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); + return nullptr; + } + UniquePtr ret(FromOpaque(EVP_PKEY_new())); + if (ret == nullptr || !impl->ameth->pub_copy(ret.get(), impl)) { + return nullptr; + } + return ret.release(); +} diff --git a/third_party/boringssl/src/crypto/evp/evp_asn1.c b/third_party/boringssl/src/crypto/evp/evp_asn1.c deleted file mode 100644 index da099816..00000000 --- a/third_party/boringssl/src/crypto/evp/evp_asn1.c +++ /dev/null @@ -1,547 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). 
- * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include - -#include -#include -#include -#include -#include - -#include "internal.h" -#include "../bytestring/internal.h" -#include "../internal.h" - - -static const EVP_PKEY_ASN1_METHOD *const kASN1Methods[] = { - &rsa_asn1_meth, - &ec_asn1_meth, - &dsa_asn1_meth, - &ed25519_asn1_meth, - &x25519_asn1_meth, -}; - -static int parse_key_type(CBS *cbs, int *out_type) { - CBS oid; - if (!CBS_get_asn1(cbs, &oid, CBS_ASN1_OBJECT)) { - return 0; - } - - for (unsigned i = 0; i < OPENSSL_ARRAY_SIZE(kASN1Methods); i++) { - const EVP_PKEY_ASN1_METHOD *method = kASN1Methods[i]; - if (CBS_len(&oid) == method->oid_len && - OPENSSL_memcmp(CBS_data(&oid), method->oid, method->oid_len) == 0) { - *out_type = method->pkey_id; - return 1; - } - } - - return 0; -} - -EVP_PKEY *EVP_parse_public_key(CBS *cbs) { - // Parse the SubjectPublicKeyInfo. 
- CBS spki, algorithm, key; - int type; - uint8_t padding; - if (!CBS_get_asn1(cbs, &spki, CBS_ASN1_SEQUENCE) || - !CBS_get_asn1(&spki, &algorithm, CBS_ASN1_SEQUENCE) || - !CBS_get_asn1(&spki, &key, CBS_ASN1_BITSTRING) || - CBS_len(&spki) != 0) { - OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); - return NULL; - } - if (!parse_key_type(&algorithm, &type)) { - OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_ALGORITHM); - return NULL; - } - if (// Every key type defined encodes the key as a byte string with the same - // conversion to BIT STRING. - !CBS_get_u8(&key, &padding) || - padding != 0) { - OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); - return NULL; - } - - // Set up an |EVP_PKEY| of the appropriate type. - EVP_PKEY *ret = EVP_PKEY_new(); - if (ret == NULL || - !EVP_PKEY_set_type(ret, type)) { - goto err; - } - - // Call into the type-specific SPKI decoding function. - if (ret->ameth->pub_decode == NULL) { - OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_ALGORITHM); - goto err; - } - if (!ret->ameth->pub_decode(ret, &algorithm, &key)) { - goto err; - } - - return ret; - -err: - EVP_PKEY_free(ret); - return NULL; -} - -int EVP_marshal_public_key(CBB *cbb, const EVP_PKEY *key) { - if (key->ameth == NULL || key->ameth->pub_encode == NULL) { - OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_ALGORITHM); - return 0; - } - - return key->ameth->pub_encode(cbb, key); -} - -EVP_PKEY *EVP_parse_private_key(CBS *cbs) { - // Parse the PrivateKeyInfo. - CBS pkcs8, algorithm, key; - uint64_t version; - int type; - if (!CBS_get_asn1(cbs, &pkcs8, CBS_ASN1_SEQUENCE) || - !CBS_get_asn1_uint64(&pkcs8, &version) || - version != 0 || - !CBS_get_asn1(&pkcs8, &algorithm, CBS_ASN1_SEQUENCE) || - !CBS_get_asn1(&pkcs8, &key, CBS_ASN1_OCTETSTRING)) { - OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); - return NULL; - } - if (!parse_key_type(&algorithm, &type)) { - OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_ALGORITHM); - return NULL; - } - - // A PrivateKeyInfo ends with a SET of Attributes which we ignore. 
- - // Set up an |EVP_PKEY| of the appropriate type. - EVP_PKEY *ret = EVP_PKEY_new(); - if (ret == NULL || - !EVP_PKEY_set_type(ret, type)) { - goto err; - } - - // Call into the type-specific PrivateKeyInfo decoding function. - if (ret->ameth->priv_decode == NULL) { - OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_ALGORITHM); - goto err; - } - if (!ret->ameth->priv_decode(ret, &algorithm, &key)) { - goto err; - } - - return ret; - -err: - EVP_PKEY_free(ret); - return NULL; -} - -int EVP_marshal_private_key(CBB *cbb, const EVP_PKEY *key) { - if (key->ameth == NULL || key->ameth->priv_encode == NULL) { - OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_ALGORITHM); - return 0; - } - - return key->ameth->priv_encode(cbb, key); -} - -static EVP_PKEY *old_priv_decode(CBS *cbs, int type) { - EVP_PKEY *ret = EVP_PKEY_new(); - if (ret == NULL) { - return NULL; - } - - switch (type) { - case EVP_PKEY_EC: { - EC_KEY *ec_key = EC_KEY_parse_private_key(cbs, NULL); - if (ec_key == NULL || !EVP_PKEY_assign_EC_KEY(ret, ec_key)) { - EC_KEY_free(ec_key); - goto err; - } - return ret; - } - case EVP_PKEY_DSA: { - DSA *dsa = DSA_parse_private_key(cbs); - if (dsa == NULL || !EVP_PKEY_assign_DSA(ret, dsa)) { - DSA_free(dsa); - goto err; - } - return ret; - } - case EVP_PKEY_RSA: { - RSA *rsa = RSA_parse_private_key(cbs); - if (rsa == NULL || !EVP_PKEY_assign_RSA(ret, rsa)) { - RSA_free(rsa); - goto err; - } - return ret; - } - default: - OPENSSL_PUT_ERROR(EVP, EVP_R_UNKNOWN_PUBLIC_KEY_TYPE); - goto err; - } - -err: - EVP_PKEY_free(ret); - return NULL; -} - -EVP_PKEY *d2i_PrivateKey(int type, EVP_PKEY **out, const uint8_t **inp, - long len) { - if (len < 0) { - OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); - return NULL; - } - - // Parse with the legacy format. - CBS cbs; - CBS_init(&cbs, *inp, (size_t)len); - EVP_PKEY *ret = old_priv_decode(&cbs, type); - if (ret == NULL) { - // Try again with PKCS#8. 
- ERR_clear_error(); - CBS_init(&cbs, *inp, (size_t)len); - ret = EVP_parse_private_key(&cbs); - if (ret == NULL) { - return NULL; - } - if (ret->type != type) { - OPENSSL_PUT_ERROR(EVP, EVP_R_DIFFERENT_KEY_TYPES); - EVP_PKEY_free(ret); - return NULL; - } - } - - if (out != NULL) { - EVP_PKEY_free(*out); - *out = ret; - } - *inp = CBS_data(&cbs); - return ret; -} - -// num_elements parses one SEQUENCE from |in| and returns the number of elements -// in it. On parse error, it returns zero. -static size_t num_elements(const uint8_t *in, size_t in_len) { - CBS cbs, sequence; - CBS_init(&cbs, in, (size_t)in_len); - - if (!CBS_get_asn1(&cbs, &sequence, CBS_ASN1_SEQUENCE)) { - return 0; - } - - size_t count = 0; - while (CBS_len(&sequence) > 0) { - if (!CBS_get_any_asn1_element(&sequence, NULL, NULL, NULL)) { - return 0; - } - - count++; - } - - return count; -} - -EVP_PKEY *d2i_AutoPrivateKey(EVP_PKEY **out, const uint8_t **inp, long len) { - if (len < 0) { - OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); - return NULL; - } - - // Parse the input as a PKCS#8 PrivateKeyInfo. - CBS cbs; - CBS_init(&cbs, *inp, (size_t)len); - EVP_PKEY *ret = EVP_parse_private_key(&cbs); - if (ret != NULL) { - if (out != NULL) { - EVP_PKEY_free(*out); - *out = ret; - } - *inp = CBS_data(&cbs); - return ret; - } - ERR_clear_error(); - - // Count the elements to determine the legacy key format. 
- switch (num_elements(*inp, (size_t)len)) { - case 4: - return d2i_PrivateKey(EVP_PKEY_EC, out, inp, len); - - case 6: - return d2i_PrivateKey(EVP_PKEY_DSA, out, inp, len); - - default: - return d2i_PrivateKey(EVP_PKEY_RSA, out, inp, len); - } -} - -int i2d_PublicKey(const EVP_PKEY *key, uint8_t **outp) { - switch (key->type) { - case EVP_PKEY_RSA: - return i2d_RSAPublicKey(key->pkey.rsa, outp); - case EVP_PKEY_DSA: - return i2d_DSAPublicKey(key->pkey.dsa, outp); - case EVP_PKEY_EC: - return i2o_ECPublicKey(key->pkey.ec, outp); - default: - OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_PUBLIC_KEY_TYPE); - return -1; - } -} - -EVP_PKEY *d2i_PublicKey(int type, EVP_PKEY **out, const uint8_t **inp, - long len) { - EVP_PKEY *ret = EVP_PKEY_new(); - if (ret == NULL) { - return NULL; - } - - CBS cbs; - CBS_init(&cbs, *inp, len < 0 ? 0 : (size_t)len); - switch (type) { - case EVP_PKEY_RSA: { - RSA *rsa = RSA_parse_public_key(&cbs); - if (rsa == NULL || !EVP_PKEY_assign_RSA(ret, rsa)) { - RSA_free(rsa); - goto err; - } - break; - } - - // Unlike OpenSSL, we do not support EC keys with this API. The raw EC - // public key serialization requires knowing the group. In OpenSSL, calling - // this function with |EVP_PKEY_EC| and setting |out| to NULL does not work. - // It requires |*out| to include a partially-initialized |EVP_PKEY| to - // extract the group. 
- default: - OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_PUBLIC_KEY_TYPE); - goto err; - } - - *inp = CBS_data(&cbs); - if (out != NULL) { - EVP_PKEY_free(*out); - *out = ret; - } - return ret; - -err: - EVP_PKEY_free(ret); - return NULL; -} - -EVP_PKEY *d2i_PUBKEY(EVP_PKEY **out, const uint8_t **inp, long len) { - if (len < 0) { - return NULL; - } - CBS cbs; - CBS_init(&cbs, *inp, (size_t)len); - EVP_PKEY *ret = EVP_parse_public_key(&cbs); - if (ret == NULL) { - return NULL; - } - if (out != NULL) { - EVP_PKEY_free(*out); - *out = ret; - } - *inp = CBS_data(&cbs); - return ret; -} - -int i2d_PUBKEY(const EVP_PKEY *pkey, uint8_t **outp) { - if (pkey == NULL) { - return 0; - } - - CBB cbb; - if (!CBB_init(&cbb, 128) || - !EVP_marshal_public_key(&cbb, pkey)) { - CBB_cleanup(&cbb); - return -1; - } - return CBB_finish_i2d(&cbb, outp); -} - -RSA *d2i_RSA_PUBKEY(RSA **out, const uint8_t **inp, long len) { - if (len < 0) { - return NULL; - } - CBS cbs; - CBS_init(&cbs, *inp, (size_t)len); - EVP_PKEY *pkey = EVP_parse_public_key(&cbs); - if (pkey == NULL) { - return NULL; - } - RSA *rsa = EVP_PKEY_get1_RSA(pkey); - EVP_PKEY_free(pkey); - if (rsa == NULL) { - return NULL; - } - if (out != NULL) { - RSA_free(*out); - *out = rsa; - } - *inp = CBS_data(&cbs); - return rsa; -} - -int i2d_RSA_PUBKEY(const RSA *rsa, uint8_t **outp) { - if (rsa == NULL) { - return 0; - } - - int ret = -1; - EVP_PKEY *pkey = EVP_PKEY_new(); - if (pkey == NULL || - !EVP_PKEY_set1_RSA(pkey, (RSA *)rsa)) { - goto err; - } - - ret = i2d_PUBKEY(pkey, outp); - -err: - EVP_PKEY_free(pkey); - return ret; -} - -DSA *d2i_DSA_PUBKEY(DSA **out, const uint8_t **inp, long len) { - if (len < 0) { - return NULL; - } - CBS cbs; - CBS_init(&cbs, *inp, (size_t)len); - EVP_PKEY *pkey = EVP_parse_public_key(&cbs); - if (pkey == NULL) { - return NULL; - } - DSA *dsa = EVP_PKEY_get1_DSA(pkey); - EVP_PKEY_free(pkey); - if (dsa == NULL) { - return NULL; - } - if (out != NULL) { - DSA_free(*out); - *out = dsa; - } - *inp = 
CBS_data(&cbs); - return dsa; -} - -int i2d_DSA_PUBKEY(const DSA *dsa, uint8_t **outp) { - if (dsa == NULL) { - return 0; - } - - int ret = -1; - EVP_PKEY *pkey = EVP_PKEY_new(); - if (pkey == NULL || - !EVP_PKEY_set1_DSA(pkey, (DSA *)dsa)) { - goto err; - } - - ret = i2d_PUBKEY(pkey, outp); - -err: - EVP_PKEY_free(pkey); - return ret; -} - -EC_KEY *d2i_EC_PUBKEY(EC_KEY **out, const uint8_t **inp, long len) { - if (len < 0) { - return NULL; - } - CBS cbs; - CBS_init(&cbs, *inp, (size_t)len); - EVP_PKEY *pkey = EVP_parse_public_key(&cbs); - if (pkey == NULL) { - return NULL; - } - EC_KEY *ec_key = EVP_PKEY_get1_EC_KEY(pkey); - EVP_PKEY_free(pkey); - if (ec_key == NULL) { - return NULL; - } - if (out != NULL) { - EC_KEY_free(*out); - *out = ec_key; - } - *inp = CBS_data(&cbs); - return ec_key; -} - -int i2d_EC_PUBKEY(const EC_KEY *ec_key, uint8_t **outp) { - if (ec_key == NULL) { - return 0; - } - - int ret = -1; - EVP_PKEY *pkey = EVP_PKEY_new(); - if (pkey == NULL || - !EVP_PKEY_set1_EC_KEY(pkey, (EC_KEY *)ec_key)) { - goto err; - } - - ret = i2d_PUBKEY(pkey, outp); - -err: - EVP_PKEY_free(pkey); - return ret; -} diff --git a/third_party/boringssl/src/crypto/evp/evp_asn1.cc b/third_party/boringssl/src/crypto/evp/evp_asn1.cc new file mode 100644 index 00000000..e32a6f71 --- /dev/null +++ b/third_party/boringssl/src/crypto/evp/evp_asn1.cc @@ -0,0 +1,439 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include "../bytestring/internal.h" +#include "../internal.h" +#include "../mem_internal.h" +#include "internal.h" + + +using namespace bssl; + +EVP_PKEY *EVP_PKEY_from_subject_public_key_info(const uint8_t *in, size_t len, + const EVP_PKEY_ALG *const *algs, + size_t num_algs) { + // Parse the SubjectPublicKeyInfo. + CBS cbs, spki, algorithm, oid, key; + CBS_init(&cbs, in, len); + if (!CBS_get_asn1(&cbs, &spki, CBS_ASN1_SEQUENCE) || + !CBS_get_asn1(&spki, &algorithm, CBS_ASN1_SEQUENCE) || + !CBS_get_asn1(&algorithm, &oid, CBS_ASN1_OBJECT) || + !CBS_get_asn1(&spki, &key, CBS_ASN1_BITSTRING) || + CBS_len(&spki) != 0 || // + CBS_len(&cbs) != 0) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return nullptr; + } + + UniquePtr ret(FromOpaque(EVP_PKEY_new())); + if (ret == nullptr) { + return nullptr; + } + for (const EVP_PKEY_ALG *alg : Span(algs, num_algs)) { + if (alg->method->pub_decode == nullptr || + Span(alg->method->oid, alg->method->oid_len) != oid) { + continue; + } + // Every key type we support encodes the key as a byte string with the same + // conversion to BIT STRING, so perform that common conversion ahead of + // time, but only after the OID is recognized as supported. + CBS key_bytes = key; + uint8_t padding; + if (!CBS_get_u8(&key_bytes, &padding) || padding != 0) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return nullptr; + } + CBS params = algorithm; + switch (alg->method->pub_decode(alg, ret.get(), ¶ms, &key_bytes)) { + case evp_decode_error: + return nullptr; + case evp_decode_ok: + return ret.release(); + case evp_decode_unsupported: + // Continue trying other algorithms. 
+ break; + } + } + + OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_ALGORITHM); + return nullptr; +} + +int EVP_marshal_public_key(CBB *cbb, const EVP_PKEY *key) { + auto *impl = FromOpaque(key); + if (impl->ameth == nullptr || impl->ameth->pub_encode == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_ALGORITHM); + return 0; + } + if (impl->pkey == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_NO_KEY_SET); + return 0; + } + + return impl->ameth->pub_encode(cbb, impl); +} + +EVP_PKEY *EVP_PKEY_from_private_key_info(const uint8_t *in, size_t len, + const EVP_PKEY_ALG *const *algs, + size_t num_algs) { + // Parse the PrivateKeyInfo. + CBS cbs, pkcs8, oid, algorithm, key; + uint64_t version; + CBS_init(&cbs, in, len); + if (!CBS_get_asn1(&cbs, &pkcs8, CBS_ASN1_SEQUENCE) || + !CBS_get_asn1_uint64(&pkcs8, &version) || version != 0 || + !CBS_get_asn1(&pkcs8, &algorithm, CBS_ASN1_SEQUENCE) || + !CBS_get_asn1(&algorithm, &oid, CBS_ASN1_OBJECT) || + !CBS_get_asn1(&pkcs8, &key, CBS_ASN1_OCTETSTRING) || + // A PrivateKeyInfo ends with a SET of Attributes which we ignore. + CBS_len(&cbs) != 0) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return nullptr; + } + + UniquePtr ret(FromOpaque(EVP_PKEY_new())); + if (ret == nullptr) { + return nullptr; + } + for (const EVP_PKEY_ALG *alg : Span(algs, num_algs)) { + if (alg->method->priv_decode == nullptr || + Span(alg->method->oid, alg->method->oid_len) != oid) { + continue; + } + CBS params = algorithm, key_copy = key; + switch (alg->method->priv_decode(alg, ret.get(), ¶ms, &key_copy)) { + case evp_decode_error: + return nullptr; + case evp_decode_ok: + return ret.release(); + case evp_decode_unsupported: + // Continue trying other algorithms. 
+ break; + } + } + + OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_ALGORITHM); + return nullptr; +} + +int EVP_marshal_private_key(CBB *cbb, const EVP_PKEY *key) { + auto *impl = FromOpaque(key); + if (impl->ameth == nullptr || impl->ameth->priv_encode == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_ALGORITHM); + return 0; + } + if (impl->pkey == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_NO_KEY_SET); + return 0; + } + + return impl->ameth->priv_encode(cbb, impl); +} + +EVP_PKEY *EVP_parse_public_key(CBS *cbs) { + CBS elem; + if (!CBS_get_asn1_element(cbs, &elem, CBS_ASN1_SEQUENCE)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return nullptr; + } + + auto algs = GetDefaultEVPAlgorithms(); + return EVP_PKEY_from_subject_public_key_info(CBS_data(&elem), CBS_len(&elem), + algs.data(), algs.size()); +} + +EVP_PKEY *EVP_parse_private_key(CBS *cbs) { + CBS elem; + if (!CBS_get_asn1_element(cbs, &elem, CBS_ASN1_SEQUENCE)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return nullptr; + } + + auto algs = GetDefaultEVPAlgorithms(); + return EVP_PKEY_from_private_key_info(CBS_data(&elem), CBS_len(&elem), + algs.data(), algs.size()); +} + +static bssl::UniquePtr old_priv_decode(CBS *cbs, int type) { + UniquePtr ret(EVP_PKEY_new()); + if (ret == nullptr) { + return nullptr; + } + + switch (type) { + case EVP_PKEY_EC: { + UniquePtr ec_key(EC_KEY_parse_private_key(cbs, nullptr)); + if (ec_key == nullptr) { + return nullptr; + } + EVP_PKEY_assign_EC_KEY(ret.get(), ec_key.release()); + return ret; + } + case EVP_PKEY_DSA: { + UniquePtr dsa(DSA_parse_private_key(cbs)); + if (dsa == nullptr) { + return nullptr; + } + EVP_PKEY_assign_DSA(ret.get(), dsa.release()); + return ret; + } + case EVP_PKEY_RSA: { + UniquePtr rsa(RSA_parse_private_key(cbs)); + if (rsa == nullptr) { + return nullptr; + } + EVP_PKEY_assign_RSA(ret.get(), rsa.release()); + return ret; + } + default: + OPENSSL_PUT_ERROR(EVP, EVP_R_UNKNOWN_PUBLIC_KEY_TYPE); + return nullptr; + } +} + +EVP_PKEY 
*d2i_PrivateKey(int type, EVP_PKEY **out, const uint8_t **inp, + long len) { + return D2IFromCBS(out, inp, len, [&](CBS *cbs) -> UniquePtr { + // Parse with the legacy format. + CBS copy = *cbs; + UniquePtr ret = old_priv_decode(cbs, type); + if (ret == nullptr) { + // Try again with PKCS#8. + ERR_clear_error(); + *cbs = copy; + ret.reset(EVP_parse_private_key(cbs)); + if (ret == nullptr) { + return nullptr; + } + if (EVP_PKEY_id(ret.get()) != type) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DIFFERENT_KEY_TYPES); + return nullptr; + } + } + return ret; + }); +} + +// num_elements parses one SEQUENCE from |in| and returns the number of elements +// in it. On parse error, it returns zero. +static size_t num_elements(const uint8_t *in, size_t in_len) { + CBS cbs, sequence; + CBS_init(&cbs, in, (size_t)in_len); + + if (!CBS_get_asn1(&cbs, &sequence, CBS_ASN1_SEQUENCE)) { + return 0; + } + + size_t count = 0; + while (CBS_len(&sequence) > 0) { + if (!CBS_get_any_asn1_element(&sequence, nullptr, nullptr, nullptr)) { + return 0; + } + + count++; + } + + return count; +} + +EVP_PKEY *d2i_AutoPrivateKey(EVP_PKEY **out, const uint8_t **inp, long len) { + if (len < 0) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return nullptr; + } + + // Parse the input as a PKCS#8 PrivateKeyInfo. + CBS cbs; + CBS_init(&cbs, *inp, (size_t)len); + EVP_PKEY *ret = EVP_parse_private_key(&cbs); + if (ret != nullptr) { + if (out != nullptr) { + EVP_PKEY_free(*out); + *out = ret; + } + *inp = CBS_data(&cbs); + return ret; + } + ERR_clear_error(); + + // Count the elements to determine the legacy key format. 
+ switch (num_elements(*inp, (size_t)len)) { + case 4: + return d2i_PrivateKey(EVP_PKEY_EC, out, inp, len); + + case 6: + return d2i_PrivateKey(EVP_PKEY_DSA, out, inp, len); + + default: + return d2i_PrivateKey(EVP_PKEY_RSA, out, inp, len); + } +} + +int i2d_PublicKey(const EVP_PKEY *key, uint8_t **outp) { + switch (EVP_PKEY_id(key)) { + case EVP_PKEY_RSA: + return i2d_RSAPublicKey(EVP_PKEY_get0_RSA(key), outp); + case EVP_PKEY_DSA: + return i2d_DSAPublicKey(EVP_PKEY_get0_DSA(key), outp); + case EVP_PKEY_EC: + return i2o_ECPublicKey(EVP_PKEY_get0_EC_KEY(key), outp); + default: + OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_PUBLIC_KEY_TYPE); + return -1; + } +} + +EVP_PKEY *d2i_PublicKey(int type, EVP_PKEY **out, const uint8_t **inp, + long len) { + return D2IFromCBS(out, inp, len, [&](CBS *cbs) -> UniquePtr { + UniquePtr ret(EVP_PKEY_new()); + if (ret == nullptr) { + return nullptr; + } + switch (type) { + case EVP_PKEY_RSA: { + UniquePtr rsa(RSA_parse_public_key(cbs)); + if (rsa == nullptr) { + return nullptr; + } + EVP_PKEY_assign_RSA(ret.get(), rsa.release()); + return ret; + } + + // Unlike OpenSSL, we do not support EC keys with this API. The raw EC + // public key serialization requires knowing the group. In OpenSSL, + // calling this function with |EVP_PKEY_EC| and setting |out| to + // nullptr does not work. It requires |*out| to include a + // partially-initialized |EVP_PKEY| to extract the group. 
+ default: + OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_PUBLIC_KEY_TYPE); + return nullptr; + } + }); +} + +EVP_PKEY *d2i_PUBKEY(EVP_PKEY **out, const uint8_t **inp, long len) { + return D2IFromCBS(out, inp, len, EVP_parse_public_key); +} + +int i2d_PUBKEY(const EVP_PKEY *pkey, uint8_t **outp) { + if (pkey == nullptr) { + return 0; + } + return I2DFromCBB( + /*initial_capacity=*/128, outp, + [&](CBB *cbb) -> bool { return EVP_marshal_public_key(cbb, pkey); }); +} + +static bssl::UniquePtr parse_spki( + CBS *cbs, Span algs) { + CBS spki; + if (!CBS_get_asn1_element(cbs, &spki, CBS_ASN1_SEQUENCE)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return nullptr; + } + return UniquePtr(EVP_PKEY_from_subject_public_key_info( + CBS_data(&spki), CBS_len(&spki), algs.data(), algs.size())); +} + +static bssl::UniquePtr parse_spki(CBS *cbs, const EVP_PKEY_ALG *alg) { + return parse_spki(cbs, Span(&alg, 1)); +} + +RSA *d2i_RSA_PUBKEY(RSA **out, const uint8_t **inp, long len) { + return D2IFromCBS(out, inp, len, [](CBS *cbs) -> UniquePtr { + UniquePtr pkey = parse_spki(cbs, EVP_pkey_rsa()); + if (pkey == nullptr) { + return nullptr; + } + return UniquePtr(EVP_PKEY_get1_RSA(pkey.get())); + }); +} + +int i2d_RSA_PUBKEY(const RSA *rsa, uint8_t **outp) { + if (rsa == nullptr) { + return 0; + } + + UniquePtr pkey(EVP_PKEY_new()); + if (pkey == nullptr || + !EVP_PKEY_set1_RSA(pkey.get(), const_cast(rsa))) { + return -1; + } + + return i2d_PUBKEY(pkey.get(), outp); +} + +DSA *d2i_DSA_PUBKEY(DSA **out, const uint8_t **inp, long len) { + return D2IFromCBS(out, inp, len, [](CBS *cbs) -> UniquePtr { + UniquePtr pkey = parse_spki(cbs, EVP_pkey_dsa()); + if (pkey == nullptr) { + return nullptr; + } + return UniquePtr(EVP_PKEY_get1_DSA(pkey.get())); + }); +} + +int i2d_DSA_PUBKEY(const DSA *dsa, uint8_t **outp) { + if (dsa == nullptr) { + return 0; + } + + UniquePtr pkey(EVP_PKEY_new()); + if (pkey == nullptr || + !EVP_PKEY_set1_DSA(pkey.get(), const_cast(dsa))) { + return -1; + } + + return 
i2d_PUBKEY(pkey.get(), outp); +} + +EC_KEY *d2i_EC_PUBKEY(EC_KEY **out, const uint8_t **inp, long len) { + return D2IFromCBS(out, inp, len, [](CBS *cbs) -> UniquePtr { + const EVP_PKEY_ALG *const algs[] = {EVP_pkey_ec_p224(), EVP_pkey_ec_p256(), + EVP_pkey_ec_p384(), EVP_pkey_ec_p521()}; + UniquePtr pkey = parse_spki(cbs, algs); + if (pkey == nullptr) { + return nullptr; + } + return UniquePtr(EVP_PKEY_get1_EC_KEY(pkey.get())); + }); +} + +int i2d_EC_PUBKEY(const EC_KEY *ec_key, uint8_t **outp) { + if (ec_key == nullptr) { + return 0; + } + + UniquePtr pkey(EVP_PKEY_new()); + if (pkey == nullptr || + !EVP_PKEY_set1_EC_KEY(pkey.get(), const_cast(ec_key))) { + return -1; + } + + return i2d_PUBKEY(pkey.get(), outp); +} diff --git a/third_party/boringssl/src/crypto/evp/evp_ctx.c b/third_party/boringssl/src/crypto/evp/evp_ctx.c deleted file mode 100644 index 5f31ddb3..00000000 --- a/third_party/boringssl/src/crypto/evp/evp_ctx.c +++ /dev/null @@ -1,487 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. 
- * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include - -#include -#include -#include - -#include "../internal.h" -#include "internal.h" - - -static const EVP_PKEY_METHOD *const evp_methods[] = { - &rsa_pkey_meth, - &ec_pkey_meth, - &ed25519_pkey_meth, - &x25519_pkey_meth, - &hkdf_pkey_meth, -}; - -static const EVP_PKEY_METHOD *evp_pkey_meth_find(int type) { - for (size_t i = 0; i < sizeof(evp_methods)/sizeof(EVP_PKEY_METHOD*); i++) { - if (evp_methods[i]->pkey_id == type) { - return evp_methods[i]; - } - } - - return NULL; -} - -static EVP_PKEY_CTX *evp_pkey_ctx_new(EVP_PKEY *pkey, ENGINE *e, - const EVP_PKEY_METHOD *pmeth) { - EVP_PKEY_CTX *ret = OPENSSL_malloc(sizeof(EVP_PKEY_CTX)); - if (!ret) { - OPENSSL_PUT_ERROR(EVP, ERR_R_MALLOC_FAILURE); - return NULL; - } - OPENSSL_memset(ret, 0, sizeof(EVP_PKEY_CTX)); - - ret->engine = e; - ret->pmeth = pmeth; - ret->operation = EVP_PKEY_OP_UNDEFINED; - - if (pkey) { - EVP_PKEY_up_ref(pkey); - ret->pkey = pkey; - } - - if (pmeth->init) { - if (pmeth->init(ret) <= 0) { - EVP_PKEY_free(ret->pkey); - OPENSSL_free(ret); - return NULL; - } - } - - return ret; -} - -EVP_PKEY_CTX *EVP_PKEY_CTX_new(EVP_PKEY *pkey, ENGINE *e) { - if (pkey == NULL || pkey->ameth == NULL) { - OPENSSL_PUT_ERROR(EVP, ERR_R_PASSED_NULL_PARAMETER); - return NULL; - } - - const EVP_PKEY_METHOD *pkey_method = pkey->ameth->pkey_method; - if (pkey_method == NULL) { - OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_ALGORITHM); - ERR_add_error_dataf("algorithm %d", pkey->ameth->pkey_id); - return NULL; - } - - return evp_pkey_ctx_new(pkey, e, pkey_method); -} - -EVP_PKEY_CTX *EVP_PKEY_CTX_new_id(int id, ENGINE *e) { - const EVP_PKEY_METHOD *pkey_method = evp_pkey_meth_find(id); - if (pkey_method == NULL) { - OPENSSL_PUT_ERROR(EVP, 
EVP_R_UNSUPPORTED_ALGORITHM); - ERR_add_error_dataf("algorithm %d", id); - return NULL; - } - - return evp_pkey_ctx_new(NULL, e, pkey_method); -} - -void EVP_PKEY_CTX_free(EVP_PKEY_CTX *ctx) { - if (ctx == NULL) { - return; - } - if (ctx->pmeth && ctx->pmeth->cleanup) { - ctx->pmeth->cleanup(ctx); - } - EVP_PKEY_free(ctx->pkey); - EVP_PKEY_free(ctx->peerkey); - OPENSSL_free(ctx); -} - -EVP_PKEY_CTX *EVP_PKEY_CTX_dup(EVP_PKEY_CTX *ctx) { - if (!ctx->pmeth || !ctx->pmeth->copy) { - return NULL; - } - - EVP_PKEY_CTX *ret = OPENSSL_malloc(sizeof(EVP_PKEY_CTX)); - if (!ret) { - return NULL; - } - - OPENSSL_memset(ret, 0, sizeof(EVP_PKEY_CTX)); - - ret->pmeth = ctx->pmeth; - ret->engine = ctx->engine; - ret->operation = ctx->operation; - - if (ctx->pkey != NULL) { - EVP_PKEY_up_ref(ctx->pkey); - ret->pkey = ctx->pkey; - } - - if (ctx->peerkey != NULL) { - EVP_PKEY_up_ref(ctx->peerkey); - ret->peerkey = ctx->peerkey; - } - - if (ctx->pmeth->copy(ret, ctx) <= 0) { - ret->pmeth = NULL; - EVP_PKEY_CTX_free(ret); - OPENSSL_PUT_ERROR(EVP, ERR_LIB_EVP); - return NULL; - } - - return ret; -} - -EVP_PKEY *EVP_PKEY_CTX_get0_pkey(EVP_PKEY_CTX *ctx) { return ctx->pkey; } - -int EVP_PKEY_CTX_ctrl(EVP_PKEY_CTX *ctx, int keytype, int optype, int cmd, - int p1, void *p2) { - if (!ctx || !ctx->pmeth || !ctx->pmeth->ctrl) { - OPENSSL_PUT_ERROR(EVP, EVP_R_COMMAND_NOT_SUPPORTED); - return 0; - } - if (keytype != -1 && ctx->pmeth->pkey_id != keytype) { - OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); - return 0; - } - - if (ctx->operation == EVP_PKEY_OP_UNDEFINED) { - OPENSSL_PUT_ERROR(EVP, EVP_R_NO_OPERATION_SET); - return 0; - } - - if (optype != -1 && !(ctx->operation & optype)) { - OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_OPERATION); - return 0; - } - - return ctx->pmeth->ctrl(ctx, cmd, p1, p2); -} - -int EVP_PKEY_sign_init(EVP_PKEY_CTX *ctx) { - if (ctx == NULL || ctx->pmeth == NULL || - (ctx->pmeth->sign == NULL && ctx->pmeth->sign_message == NULL)) { - 
OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); - return 0; - } - - ctx->operation = EVP_PKEY_OP_SIGN; - return 1; -} - -int EVP_PKEY_sign(EVP_PKEY_CTX *ctx, uint8_t *sig, size_t *sig_len, - const uint8_t *digest, size_t digest_len) { - if (!ctx || !ctx->pmeth || !ctx->pmeth->sign) { - OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); - return 0; - } - if (ctx->operation != EVP_PKEY_OP_SIGN) { - OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATON_NOT_INITIALIZED); - return 0; - } - return ctx->pmeth->sign(ctx, sig, sig_len, digest, digest_len); -} - -int EVP_PKEY_verify_init(EVP_PKEY_CTX *ctx) { - if (ctx == NULL || ctx->pmeth == NULL || - (ctx->pmeth->verify == NULL && ctx->pmeth->verify_message == NULL)) { - OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); - return 0; - } - ctx->operation = EVP_PKEY_OP_VERIFY; - return 1; -} - -int EVP_PKEY_verify(EVP_PKEY_CTX *ctx, const uint8_t *sig, size_t sig_len, - const uint8_t *digest, size_t digest_len) { - if (!ctx || !ctx->pmeth || !ctx->pmeth->verify) { - OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); - return 0; - } - if (ctx->operation != EVP_PKEY_OP_VERIFY) { - OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATON_NOT_INITIALIZED); - return 0; - } - return ctx->pmeth->verify(ctx, sig, sig_len, digest, digest_len); -} - -int EVP_PKEY_encrypt_init(EVP_PKEY_CTX *ctx) { - if (!ctx || !ctx->pmeth || !ctx->pmeth->encrypt) { - OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); - return 0; - } - ctx->operation = EVP_PKEY_OP_ENCRYPT; - return 1; -} - -int EVP_PKEY_encrypt(EVP_PKEY_CTX *ctx, uint8_t *out, size_t *outlen, - const uint8_t *in, size_t inlen) { - if (!ctx || !ctx->pmeth || !ctx->pmeth->encrypt) { - OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); - return 0; - } - if (ctx->operation != EVP_PKEY_OP_ENCRYPT) { - OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATON_NOT_INITIALIZED); - return 0; - } - return 
ctx->pmeth->encrypt(ctx, out, outlen, in, inlen); -} - -int EVP_PKEY_decrypt_init(EVP_PKEY_CTX *ctx) { - if (!ctx || !ctx->pmeth || !ctx->pmeth->decrypt) { - OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); - return 0; - } - ctx->operation = EVP_PKEY_OP_DECRYPT; - return 1; -} - -int EVP_PKEY_decrypt(EVP_PKEY_CTX *ctx, uint8_t *out, size_t *outlen, - const uint8_t *in, size_t inlen) { - if (!ctx || !ctx->pmeth || !ctx->pmeth->decrypt) { - OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); - return 0; - } - if (ctx->operation != EVP_PKEY_OP_DECRYPT) { - OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATON_NOT_INITIALIZED); - return 0; - } - return ctx->pmeth->decrypt(ctx, out, outlen, in, inlen); -} - -int EVP_PKEY_verify_recover_init(EVP_PKEY_CTX *ctx) { - if (!ctx || !ctx->pmeth || !ctx->pmeth->verify_recover) { - OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); - return 0; - } - ctx->operation = EVP_PKEY_OP_VERIFYRECOVER; - return 1; -} - -int EVP_PKEY_verify_recover(EVP_PKEY_CTX *ctx, uint8_t *out, size_t *out_len, - const uint8_t *sig, size_t sig_len) { - if (!ctx || !ctx->pmeth || !ctx->pmeth->verify_recover) { - OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); - return 0; - } - if (ctx->operation != EVP_PKEY_OP_VERIFYRECOVER) { - OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATON_NOT_INITIALIZED); - return 0; - } - return ctx->pmeth->verify_recover(ctx, out, out_len, sig, sig_len); -} - -int EVP_PKEY_derive_init(EVP_PKEY_CTX *ctx) { - if (!ctx || !ctx->pmeth || !ctx->pmeth->derive) { - OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); - return 0; - } - ctx->operation = EVP_PKEY_OP_DERIVE; - return 1; -} - -int EVP_PKEY_derive_set_peer(EVP_PKEY_CTX *ctx, EVP_PKEY *peer) { - int ret; - if (!ctx || !ctx->pmeth || - !(ctx->pmeth->derive || ctx->pmeth->encrypt || ctx->pmeth->decrypt) || - !ctx->pmeth->ctrl) { - OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); 
- return 0; - } - if (ctx->operation != EVP_PKEY_OP_DERIVE && - ctx->operation != EVP_PKEY_OP_ENCRYPT && - ctx->operation != EVP_PKEY_OP_DECRYPT) { - OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATON_NOT_INITIALIZED); - return 0; - } - - ret = ctx->pmeth->ctrl(ctx, EVP_PKEY_CTRL_PEER_KEY, 0, peer); - - if (ret <= 0) { - return 0; - } - - if (ret == 2) { - return 1; - } - - if (!ctx->pkey) { - OPENSSL_PUT_ERROR(EVP, EVP_R_NO_KEY_SET); - return 0; - } - - if (ctx->pkey->type != peer->type) { - OPENSSL_PUT_ERROR(EVP, EVP_R_DIFFERENT_KEY_TYPES); - return 0; - } - - // ran@cryptocom.ru: For clarity. The error is if parameters in peer are - // present (!missing) but don't match. EVP_PKEY_cmp_parameters may return - // 1 (match), 0 (don't match) and -2 (comparison is not defined). -1 - // (different key types) is impossible here because it is checked earlier. - // -2 is OK for us here, as well as 1, so we can check for 0 only. - if (!EVP_PKEY_missing_parameters(peer) && - !EVP_PKEY_cmp_parameters(ctx->pkey, peer)) { - OPENSSL_PUT_ERROR(EVP, EVP_R_DIFFERENT_PARAMETERS); - return 0; - } - - EVP_PKEY_free(ctx->peerkey); - ctx->peerkey = peer; - - ret = ctx->pmeth->ctrl(ctx, EVP_PKEY_CTRL_PEER_KEY, 1, peer); - - if (ret <= 0) { - ctx->peerkey = NULL; - return 0; - } - - EVP_PKEY_up_ref(peer); - return 1; -} - -int EVP_PKEY_derive(EVP_PKEY_CTX *ctx, uint8_t *key, size_t *out_key_len) { - if (!ctx || !ctx->pmeth || !ctx->pmeth->derive) { - OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); - return 0; - } - if (ctx->operation != EVP_PKEY_OP_DERIVE) { - OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATON_NOT_INITIALIZED); - return 0; - } - return ctx->pmeth->derive(ctx, key, out_key_len); -} - -int EVP_PKEY_keygen_init(EVP_PKEY_CTX *ctx) { - if (!ctx || !ctx->pmeth || !ctx->pmeth->keygen) { - OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); - return 0; - } - ctx->operation = EVP_PKEY_OP_KEYGEN; - return 1; -} - -int EVP_PKEY_keygen(EVP_PKEY_CTX *ctx, EVP_PKEY 
**out_pkey) { - if (!ctx || !ctx->pmeth || !ctx->pmeth->keygen) { - OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); - return 0; - } - if (ctx->operation != EVP_PKEY_OP_KEYGEN) { - OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATON_NOT_INITIALIZED); - return 0; - } - - if (!out_pkey) { - return 0; - } - - if (!*out_pkey) { - *out_pkey = EVP_PKEY_new(); - if (!*out_pkey) { - OPENSSL_PUT_ERROR(EVP, ERR_LIB_EVP); - return 0; - } - } - - if (!ctx->pmeth->keygen(ctx, *out_pkey)) { - EVP_PKEY_free(*out_pkey); - *out_pkey = NULL; - return 0; - } - return 1; -} - -int EVP_PKEY_paramgen_init(EVP_PKEY_CTX *ctx) { - if (!ctx || !ctx->pmeth || !ctx->pmeth->paramgen) { - OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); - return 0; - } - ctx->operation = EVP_PKEY_OP_PARAMGEN; - return 1; -} - -int EVP_PKEY_paramgen(EVP_PKEY_CTX *ctx, EVP_PKEY **out_pkey) { - if (!ctx || !ctx->pmeth || !ctx->pmeth->paramgen) { - OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); - return 0; - } - if (ctx->operation != EVP_PKEY_OP_PARAMGEN) { - OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATON_NOT_INITIALIZED); - return 0; - } - - if (!out_pkey) { - return 0; - } - - if (!*out_pkey) { - *out_pkey = EVP_PKEY_new(); - if (!*out_pkey) { - OPENSSL_PUT_ERROR(EVP, ERR_LIB_EVP); - return 0; - } - } - - if (!ctx->pmeth->paramgen(ctx, *out_pkey)) { - EVP_PKEY_free(*out_pkey); - *out_pkey = NULL; - return 0; - } - return 1; -} diff --git a/third_party/boringssl/src/crypto/evp/evp_ctx.cc b/third_party/boringssl/src/crypto/evp/evp_ctx.cc new file mode 100644 index 00000000..ffb9b6f2 --- /dev/null +++ b/third_party/boringssl/src/crypto/evp/evp_ctx.cc @@ -0,0 +1,541 @@ +// Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include +#include +#include +#include + +#include "../internal.h" +#include "../mem_internal.h" +#include "../params_internal.h" +#include "internal.h" + + +using namespace bssl; + +static UniquePtr evp_pkey_ctx_new( + EvpPkey *pkey, const EVP_PKEY_ALG *alg, const EVP_PKEY_CTX_METHOD *pmeth) { + assert(pkey != nullptr || alg != nullptr); + UniquePtr ret = MakeUnique(); + if (!ret) { + return nullptr; + } + + ret->pmeth = pmeth; + ret->operation = EVP_PKEY_OP_UNDEFINED; + ret->pkey = UpRef(pkey); + + if (pmeth->init && pmeth->init(ret.get(), alg) <= 0) { + ret->pmeth = nullptr; // Don't call |pmeth->cleanup|. + return nullptr; + } + + return ret; +} + +EVP_PKEY_CTX *EVP_PKEY_CTX_new(EVP_PKEY *pkey, ENGINE *e) { + auto *pkey_impl = FromOpaque(pkey); + if (pkey_impl == nullptr || pkey_impl->ameth == nullptr) { + OPENSSL_PUT_ERROR(EVP, ERR_R_PASSED_NULL_PARAMETER); + return nullptr; + } + if (pkey_impl->pkey == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_NO_KEY_SET); + return nullptr; + } + + const EVP_PKEY_CTX_METHOD *pkey_method = pkey_impl->ameth->pkey_method; + if (pkey_method == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_ALGORITHM); + ERR_add_error_dataf("algorithm %d", pkey_impl->ameth->pkey_id); + return nullptr; + } + + return evp_pkey_ctx_new(pkey_impl, nullptr, pkey_method).release(); +} + +EVP_PKEY_CTX *EVP_PKEY_CTX_new_id(int id, ENGINE *e) { + // |EVP_PKEY_RSA_PSS| is intentionally omitted from this list. 
These are types + // that can be created without an |EVP_PKEY|, and we do not support + // |EVP_PKEY_RSA_PSS| keygen. + const EVP_PKEY_ALG *alg = nullptr; + switch (id) { + case EVP_PKEY_RSA: + alg = EVP_pkey_rsa(); + break; + case EVP_PKEY_EC: + alg = evp_pkey_ec_no_curve(); + break; + case EVP_PKEY_ED25519: + alg = EVP_pkey_ed25519(); + break; + case EVP_PKEY_X25519: + alg = EVP_pkey_x25519(); + break; + case EVP_PKEY_HKDF: + alg = evp_pkey_hkdf(); + break; + case EVP_PKEY_ML_DSA_44: + alg = EVP_pkey_ml_dsa_44(); + break; + case EVP_PKEY_ML_DSA_65: + alg = EVP_pkey_ml_dsa_65(); + break; + case EVP_PKEY_ML_DSA_87: + alg = EVP_pkey_ml_dsa_87(); + break; + case EVP_PKEY_ML_KEM_768: + alg = EVP_pkey_ml_kem_768(); + break; + case EVP_PKEY_ML_KEM_1024: + alg = EVP_pkey_ml_kem_1024(); + break; + } + if (alg == nullptr || alg->pkey_method == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_ALGORITHM); + ERR_add_error_dataf("algorithm %d", id); + return nullptr; + } + return evp_pkey_ctx_new_alg(alg).release(); +} + +UniquePtr bssl::evp_pkey_ctx_new_alg(const EVP_PKEY_ALG *alg) { + return evp_pkey_ctx_new(nullptr, alg, alg->pkey_method); +} + +EvpPkeyCtx::~EvpPkeyCtx() { + if (pmeth && pmeth->cleanup) { + pmeth->cleanup(this); + } +} + +void EVP_PKEY_CTX_free(EVP_PKEY_CTX *ctx) { Delete(FromOpaque(ctx)); } + +EVP_PKEY_CTX *EVP_PKEY_CTX_dup(EVP_PKEY_CTX *ctx) { + auto *impl = FromOpaque(ctx); + + if (!impl->pmeth || !impl->pmeth->copy) { + return nullptr; + } + + UniquePtr ret = MakeUnique(); + if (!ret) { + return nullptr; + } + + ret->pmeth = impl->pmeth; + ret->operation = impl->operation; + ret->pkey = UpRef(impl->pkey); + ret->peerkey = UpRef(impl->peerkey); + if (impl->pmeth->copy(ret.get(), impl) <= 0) { + OPENSSL_PUT_ERROR(EVP, ERR_LIB_EVP); + return nullptr; + } + + return ret.release(); +} + +EVP_PKEY *EVP_PKEY_CTX_get0_pkey(EVP_PKEY_CTX *ctx) { + auto *impl = FromOpaque(ctx); + return impl->pkey.get(); +} + +int bssl::EVP_PKEY_CTX_ctrl(EVP_PKEY_CTX *ctx, 
int keytype, int optype, int cmd, + int p1, void *p2) { + auto *impl = FromOpaque(ctx); + if (!impl || !impl->pmeth || !impl->pmeth->ctrl) { + OPENSSL_PUT_ERROR(EVP, EVP_R_COMMAND_NOT_SUPPORTED); + return 0; + } + if (keytype != -1 && impl->pmeth->pkey_id != keytype) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); + return 0; + } + + if (impl->operation == EVP_PKEY_OP_UNDEFINED) { + OPENSSL_PUT_ERROR(EVP, EVP_R_NO_OPERATION_SET); + return 0; + } + + if (optype != -1 && !(impl->operation & optype)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_OPERATION); + return 0; + } + + return impl->pmeth->ctrl(impl, cmd, p1, p2); +} + +int EVP_PKEY_sign_init(EVP_PKEY_CTX *ctx) { + auto *impl = FromOpaque(ctx); + if (!ctx || impl->pmeth == nullptr || + (impl->pmeth->sign == nullptr && impl->pmeth->sign_message == nullptr)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); + return 0; + } + + impl->operation = EVP_PKEY_OP_SIGN; + return 1; +} + +int EVP_PKEY_sign(EVP_PKEY_CTX *ctx, uint8_t *sig, size_t *sig_len, + const uint8_t *digest, size_t digest_len) { + auto *impl = FromOpaque(ctx); + if (!impl || !impl->pmeth || !impl->pmeth->sign) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); + return 0; + } + if (impl->operation != EVP_PKEY_OP_SIGN) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_INITIALIZED); + return 0; + } + return impl->pmeth->sign(impl, sig, sig_len, digest, digest_len); +} + +int EVP_PKEY_verify_init(EVP_PKEY_CTX *ctx) { + auto *impl = FromOpaque(ctx); + if (!impl || impl->pmeth == nullptr || + (impl->pmeth->verify == nullptr && + impl->pmeth->verify_message == nullptr)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); + return 0; + } + impl->operation = EVP_PKEY_OP_VERIFY; + return 1; +} + +int EVP_PKEY_verify(EVP_PKEY_CTX *ctx, const uint8_t *sig, size_t sig_len, + const uint8_t *digest, size_t digest_len) { + auto *impl = FromOpaque(ctx); + if (!impl || 
!impl->pmeth || !impl->pmeth->verify) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); + return 0; + } + if (impl->operation != EVP_PKEY_OP_VERIFY) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_INITIALIZED); + return 0; + } + return impl->pmeth->verify(impl, sig, sig_len, digest, digest_len); +} + +int EVP_PKEY_encrypt_init(EVP_PKEY_CTX *ctx) { + auto *impl = FromOpaque(ctx); + if (!impl || !impl->pmeth || !impl->pmeth->encrypt) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); + return 0; + } + impl->operation = EVP_PKEY_OP_ENCRYPT; + return 1; +} + +int EVP_PKEY_encrypt(EVP_PKEY_CTX *ctx, uint8_t *out, size_t *outlen, + const uint8_t *in, size_t inlen) { + auto *impl = FromOpaque(ctx); + if (!impl || !impl->pmeth || !impl->pmeth->encrypt) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); + return 0; + } + if (impl->operation != EVP_PKEY_OP_ENCRYPT) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_INITIALIZED); + return 0; + } + return impl->pmeth->encrypt(impl, out, outlen, in, inlen); +} + +int EVP_PKEY_decrypt_init(EVP_PKEY_CTX *ctx) { + auto *impl = FromOpaque(ctx); + if (!impl || !impl->pmeth || !impl->pmeth->decrypt) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); + return 0; + } + impl->operation = EVP_PKEY_OP_DECRYPT; + return 1; +} + +int EVP_PKEY_decrypt(EVP_PKEY_CTX *ctx, uint8_t *out, size_t *outlen, + const uint8_t *in, size_t inlen) { + auto *impl = FromOpaque(ctx); + if (!impl || !impl->pmeth || !impl->pmeth->decrypt) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); + return 0; + } + if (impl->operation != EVP_PKEY_OP_DECRYPT) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_INITIALIZED); + return 0; + } + return impl->pmeth->decrypt(impl, out, outlen, in, inlen); +} + +int EVP_PKEY_verify_recover_init(EVP_PKEY_CTX *ctx) { + auto *impl = FromOpaque(ctx); + if (!impl || !impl->pmeth || !impl->pmeth->verify_recover) { + 
OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); + return 0; + } + impl->operation = EVP_PKEY_OP_VERIFYRECOVER; + return 1; +} + +int EVP_PKEY_verify_recover(EVP_PKEY_CTX *ctx, uint8_t *out, size_t *out_len, + const uint8_t *sig, size_t sig_len) { + auto *impl = FromOpaque(ctx); + if (!impl || !impl->pmeth || !impl->pmeth->verify_recover) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); + return 0; + } + if (impl->operation != EVP_PKEY_OP_VERIFYRECOVER) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_INITIALIZED); + return 0; + } + return impl->pmeth->verify_recover(impl, out, out_len, sig, sig_len); +} + +int EVP_PKEY_derive_init(EVP_PKEY_CTX *ctx) { + auto *impl = FromOpaque(ctx); + if (!impl || !impl->pmeth || !impl->pmeth->derive) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); + return 0; + } + impl->operation = EVP_PKEY_OP_DERIVE; + return 1; +} + +int EVP_PKEY_derive_set_peer(EVP_PKEY_CTX *ctx, EVP_PKEY *peer) { + auto *impl = FromOpaque(ctx); + if (!impl || !impl->pmeth || + !(impl->pmeth->derive || impl->pmeth->encrypt || impl->pmeth->decrypt) || + !impl->pmeth->ctrl) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); + return 0; + } + if (impl->operation != EVP_PKEY_OP_DERIVE && + impl->operation != EVP_PKEY_OP_ENCRYPT && + impl->operation != EVP_PKEY_OP_DECRYPT) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_INITIALIZED); + return 0; + } + + int ret = impl->pmeth->ctrl(impl, EVP_PKEY_CTRL_PEER_KEY, 0, peer); + + if (ret <= 0) { + return 0; + } + + if (ret == 2) { + return 1; + } + + if (!impl->pkey || !FromOpaque(peer)->pkey) { + OPENSSL_PUT_ERROR(EVP, EVP_R_NO_KEY_SET); + return 0; + } + + if (EVP_PKEY_id(impl->pkey.get()) != EVP_PKEY_id(peer)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DIFFERENT_KEY_TYPES); + return 0; + } + + if (!EVP_PKEY_missing_parameters(peer) && + !EVP_PKEY_parameters_eq(impl->pkey.get(), peer)) { + OPENSSL_PUT_ERROR(EVP, 
EVP_R_DIFFERENT_PARAMETERS); + return 0; + } + + impl->peerkey = UpRef(FromOpaque(peer)); + ret = impl->pmeth->ctrl(impl, EVP_PKEY_CTRL_PEER_KEY, 1, peer); + if (ret <= 0) { + impl->peerkey = nullptr; + return 0; + } + + return 1; +} + +int EVP_PKEY_derive(EVP_PKEY_CTX *ctx, uint8_t *key, size_t *out_key_len) { + auto *impl = FromOpaque(ctx); + if (!impl || !impl->pmeth || !impl->pmeth->derive) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); + return 0; + } + if (impl->operation != EVP_PKEY_OP_DERIVE) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_INITIALIZED); + return 0; + } + return impl->pmeth->derive(impl, key, out_key_len); +} + +EVP_PKEY *EVP_PKEY_generate_from_alg(const EVP_PKEY_ALG *alg) { + UniquePtr ctx = evp_pkey_ctx_new_alg(alg); + EVP_PKEY *pkey = nullptr; + if (ctx == nullptr || // + !EVP_PKEY_keygen_init(ctx.get()) || // + !EVP_PKEY_keygen(ctx.get(), &pkey)) { + return nullptr; + } + return pkey; +} + +int EVP_PKEY_keygen_init(EVP_PKEY_CTX *ctx) { + auto *impl = FromOpaque(ctx); + if (!impl || !impl->pmeth || !impl->pmeth->keygen) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); + return 0; + } + impl->operation = EVP_PKEY_OP_KEYGEN; + return 1; +} + +int EVP_PKEY_keygen(EVP_PKEY_CTX *ctx, EVP_PKEY **out_pkey) { + auto *impl = FromOpaque(ctx); + if (!impl || !impl->pmeth || !impl->pmeth->keygen) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); + return 0; + } + if (impl->operation != EVP_PKEY_OP_KEYGEN) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_INITIALIZED); + return 0; + } + + if (!out_pkey) { + return 0; + } + + if (!*out_pkey) { + *out_pkey = EVP_PKEY_new(); + if (!*out_pkey) { + OPENSSL_PUT_ERROR(EVP, ERR_LIB_EVP); + return 0; + } + } + + if (!impl->pmeth->keygen(impl, FromOpaque(*out_pkey))) { + EVP_PKEY_free(*out_pkey); + *out_pkey = nullptr; + return 0; + } + return 1; +} + +int EVP_PKEY_paramgen_init(EVP_PKEY_CTX *ctx) { + auto *impl = FromOpaque(ctx); + 
if (!impl || !impl->pmeth || !impl->pmeth->paramgen) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); + return 0; + } + impl->operation = EVP_PKEY_OP_PARAMGEN; + return 1; +} + +int EVP_PKEY_paramgen(EVP_PKEY_CTX *ctx, EVP_PKEY **out_pkey) { + auto *impl = FromOpaque(ctx); + if (!impl || !impl->pmeth || !impl->pmeth->paramgen) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); + return 0; + } + if (impl->operation != EVP_PKEY_OP_PARAMGEN) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_INITIALIZED); + return 0; + } + + if (!out_pkey) { + return 0; + } + + if (!*out_pkey) { + *out_pkey = EVP_PKEY_new(); + if (!*out_pkey) { + OPENSSL_PUT_ERROR(EVP, ERR_LIB_EVP); + return 0; + } + } + + if (!impl->pmeth->paramgen(impl, FromOpaque(*out_pkey))) { + EVP_PKEY_free(*out_pkey); + *out_pkey = nullptr; + return 0; + } + return 1; +} + +int EVP_PKEY_encapsulate_init(EVP_PKEY_CTX *ctx, const OSSL_PARAM *params) { + if (params != nullptr && !IsEndParam(*params)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_PARAMETERS); + return 0; + } + auto *impl = FromOpaque(ctx); + if (!impl || !impl->pmeth || !impl->pmeth->encap) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); + return 0; + } + impl->operation = EVP_PKEY_OP_ENCAPSULATE; + return 1; +} + +int EVP_PKEY_encapsulate(EVP_PKEY_CTX *ctx, uint8_t *out_ciphertext, + size_t *out_ciphertext_len, uint8_t *out_secret, + size_t *out_secret_len) { + auto *impl = FromOpaque(ctx); + if (!impl || !impl->pmeth || !impl->pmeth->encap) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); + return 0; + } + if (impl->operation != EVP_PKEY_OP_ENCAPSULATE) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_INITIALIZED); + return 0; + } + if (!impl->pkey) { + OPENSSL_PUT_ERROR(EVP, EVP_R_NO_KEY_SET); + return 0; + } + return impl->pmeth->encap(impl, out_ciphertext, out_ciphertext_len, + out_secret, out_secret_len); +} + +int 
EVP_PKEY_decapsulate_init(EVP_PKEY_CTX *ctx, const OSSL_PARAM *params) { + if (params != nullptr && !IsEndParam(*params)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_PARAMETERS); + return 0; + } + auto *impl = FromOpaque(ctx); + if (!impl || !impl->pmeth || !impl->pmeth->decap) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); + return 0; + } + impl->operation = EVP_PKEY_OP_DECAPSULATE; + return 1; +} + +int EVP_PKEY_decapsulate(EVP_PKEY_CTX *ctx, uint8_t *out_secret, + size_t *out_secret_len, const uint8_t *ciphertext, + size_t ciphertext_len) { + auto *impl = FromOpaque(ctx); + if (!impl || !impl->pmeth || !impl->pmeth->decap) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); + return 0; + } + if (impl->operation != EVP_PKEY_OP_DECAPSULATE) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_INITIALIZED); + return 0; + } + if (!impl->pkey) { + OPENSSL_PUT_ERROR(EVP, EVP_R_NO_KEY_SET); + return 0; + } + return impl->pmeth->decap(impl, out_secret, out_secret_len, ciphertext, + ciphertext_len); +} diff --git a/third_party/boringssl/src/crypto/evp/evp_kem.cc b/third_party/boringssl/src/crypto/evp/evp_kem.cc new file mode 100644 index 00000000..e3504330 --- /dev/null +++ b/third_party/boringssl/src/crypto/evp/evp_kem.cc @@ -0,0 +1,82 @@ +// Copyright 2026 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include + +#include + +#include "../internal.h" +#include "internal.h" + +using namespace bssl; + +// Checks whether KEM function invocation is valid. +// `ciphertext_len` may be nullptr if it is not required to match. +// `secret_len` must always match. +static bool check_kem_invocation(const EVP_KEM *kem, + const size_t *ciphertext_len, + size_t secret_len, + const bssl::EvpPkey *pkey_impl) { + if (pkey_impl == nullptr || pkey_impl->pkey == nullptr || + pkey_impl->ameth == nullptr) { + OPENSSL_PUT_ERROR(EVP, ERR_R_PASSED_NULL_PARAMETER); + return false; + } + if (kem->pkey_id != EVP_PKEY_id(pkey_impl)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_ALGORITHM); + return false; + } + if (ciphertext_len && *ciphertext_len != kem->ciphertext_len) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_CIPHERTEXT_LENGTH); + return false; + } + if (secret_len != kem->secret_len) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_SECRET_LENGTH); + return false; + } + return true; +} + +size_t EVP_KEM_ciphertext_len(const EVP_KEM *kem) { + return kem->ciphertext_len; +} + +size_t EVP_KEM_secret_len(const EVP_KEM *kem) { + return kem->secret_len; +} + +int EVP_KEM_encap(const EVP_KEM *kem, uint8_t *out_ciphertext, + size_t ciphertext_len, uint8_t *out_secret, size_t secret_len, + const EVP_PKEY *peer_key) { + auto *pkey_impl = FromOpaque(peer_key); + if (!check_kem_invocation(kem, &ciphertext_len, secret_len, pkey_impl)) { + return 0; + } + return kem->encap(out_ciphertext, ciphertext_len, out_secret, secret_len, + pkey_impl); +} + +int EVP_KEM_decap(const EVP_KEM *kem, uint8_t *out_secret, size_t secret_len, + const uint8_t *ciphertext, size_t ciphertext_len, + const EVP_PKEY *key) { + auto *pkey_impl = FromOpaque(key); + if (!check_kem_invocation(kem, nullptr, secret_len, pkey_impl)) { + return 0; + } + return kem->decap(out_secret, secret_len, ciphertext, ciphertext_len, + pkey_impl); +} diff --git a/third_party/boringssl/src/crypto/evp/internal.h 
b/third_party/boringssl/src/crypto/evp/internal.h index 0037de82..10485b64 100644 --- a/third_party/boringssl/src/crypto/evp/internal.h +++ b/third_party/boringssl/src/crypto/evp/internal.h @@ -1,122 +1,160 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. 
All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#ifndef OPENSSL_HEADER_EVP_INTERNAL_H -#define OPENSSL_HEADER_EVP_INTERNAL_H - -#include - -#include - -#if defined(__cplusplus) -extern "C" { -#endif +// Copyright 2000-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_CRYPTO_EVP_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_EVP_INTERNAL_H + +#include + +#include + +#include + +#include "../internal.h" +#include "../mem_internal.h" + + +DECLARE_OPAQUE_STRUCT(evp_pkey_st, EvpPkey) +DECLARE_OPAQUE_STRUCT(evp_pkey_ctx_st, EvpPkeyCtx) + +BSSL_NAMESPACE_BEGIN + +typedef struct evp_pkey_asn1_method_st EVP_PKEY_ASN1_METHOD; +typedef struct evp_pkey_ctx_method_st EVP_PKEY_CTX_METHOD; + +BSSL_NAMESPACE_END + +struct evp_pkey_alg_st { + // method and pkey_method implement operations for this |EVP_PKEY_ALG|. + const bssl::EVP_PKEY_ASN1_METHOD *method; + const bssl::EVP_PKEY_CTX_METHOD *pkey_method; +}; + +BSSL_NAMESPACE_BEGIN +enum evp_decode_result_t { + evp_decode_error = 0, + evp_decode_ok = 1, + evp_decode_unsupported = 2, +}; struct evp_pkey_asn1_method_st { + // pkey_id contains one of the |EVP_PKEY_*| values and corresponds to the OID + // in the key type's AlgorithmIdentifier. int pkey_id; uint8_t oid[9]; uint8_t oid_len; - const EVP_PKEY_METHOD *pkey_method; + const EVP_PKEY_CTX_METHOD *pkey_method; // pub_decode decodes |params| and |key| as a SubjectPublicKeyInfo - // and writes the result into |out|. It returns one on success and zero on - // error. |params| is the AlgorithmIdentifier after the OBJECT IDENTIFIER - // type field, and |key| is the contents of the subjectPublicKey with the - // leading padding byte checked and removed. 
Although X.509 uses BIT STRINGs - // to represent SubjectPublicKeyInfo, every key type defined encodes the key - // as a byte string with the same conversion to BIT STRING. - int (*pub_decode)(EVP_PKEY *out, CBS *params, CBS *key); + // and writes the result into |out|. It returns |evp_decode_ok| on success, + // and |evp_decode_error| on error, and |evp_decode_unsupported| if the input + // was not supported by this |EVP_PKEY_ALG|. In case of + // |evp_decode_unsupported|, it does not add an error to the error queue. May + // modify |params| and |key|. Callers must make a copy if calling in a loop. + // + // |params| is the AlgorithmIdentifier after the OBJECT IDENTIFIER type field, + // and |key| is the contents of the subjectPublicKey with the leading padding + // byte checked and removed. Although X.509 uses BIT STRINGs to represent + // SubjectPublicKeyInfo, every key type defined encodes the key as a byte + // string with the same conversion to BIT STRING. + evp_decode_result_t (*pub_decode)(const EVP_PKEY_ALG *alg, EvpPkey *out, + CBS *params, CBS *key); // pub_encode encodes |key| as a SubjectPublicKeyInfo and appends the result // to |out|. It returns one on success and zero on error. - int (*pub_encode)(CBB *out, const EVP_PKEY *key); + int (*pub_encode)(CBB *out, const EvpPkey *key); - int (*pub_cmp)(const EVP_PKEY *a, const EVP_PKEY *b); + bool (*pub_equal)(const EvpPkey *a, const EvpPkey *b); + + // pub_present returns true iff the |pk| has a public key. (If so, validity + // is not guaranteed and should be checked separately.) + bool (*pub_present)(const EvpPkey *pk); + + // pub_copy sets the key data of |out| to a newly allocated key data structure + // which contains a copy of only the public key of |pk|, freeing any key + // previously in |out|. Returns true on success or false on failure. + bool (*pub_copy)(EvpPkey *out, const EvpPkey *pk); // priv_decode decodes |params| and |key| as a PrivateKeyInfo and writes the - // result into |out|. 
It returns one on success and zero on error. |params| is - // the AlgorithmIdentifier after the OBJECT IDENTIFIER type field, and |key| - // is the contents of the OCTET STRING privateKey field. - int (*priv_decode)(EVP_PKEY *out, CBS *params, CBS *key); + // result into |out|. It returns |evp_decode_ok| on success, and + // |evp_decode_error| on error, and |evp_decode_unsupported| if the key type + // was not supported by this |EVP_PKEY_ALG|. In case of + // |evp_decode_unsupported|, it does not add an error to the error queue. May + // modify |params| and |key|. Callers must make a copy if calling in a loop. + // + // |params| is the AlgorithmIdentifier after the OBJECT IDENTIFIER type field, + // and |key| is the contents of the OCTET STRING privateKey field. + evp_decode_result_t (*priv_decode)(const EVP_PKEY_ALG *alg, EvpPkey *out, + CBS *params, CBS *key); // priv_encode encodes |key| as a PrivateKeyInfo and appends the result to // |out|. It returns one on success and zero on error. - int (*priv_encode)(CBB *out, const EVP_PKEY *key); - - int (*set_priv_raw)(EVP_PKEY *pkey, const uint8_t *in, size_t len); - int (*set_pub_raw)(EVP_PKEY *pkey, const uint8_t *in, size_t len); - int (*get_priv_raw)(const EVP_PKEY *pkey, uint8_t *out, size_t *out_len); - int (*get_pub_raw)(const EVP_PKEY *pkey, uint8_t *out, size_t *out_len); + int (*priv_encode)(CBB *out, const EvpPkey *key); + + // priv_present returns true iff the |pk| has a private key. (If so, validity + // is not guaranteed and should be checked separately.) 
+ bool (*priv_present)(const EvpPkey *pk); + + int (*set_priv_raw)(EvpPkey *pkey, const uint8_t *in, size_t len); + int (*set_priv_seed)(EvpPkey *pkey, const uint8_t *in, size_t len); + int (*set_pub_raw)(EvpPkey *pkey, const uint8_t *in, size_t len); + int (*get_priv_raw)(const EvpPkey *pkey, uint8_t *out, size_t *out_len); + int (*get_priv_seed)(const EvpPkey *pkey, uint8_t *out, size_t *out_len); + int (*get_pub_raw)(const EvpPkey *pkey, uint8_t *out, size_t *out_len); + + // TODO(davidben): Can these be merged with the functions above? OpenSSL does + // not implement |EVP_PKEY_get_raw_public_key|, etc., for |EVP_PKEY_EC|, but + // the distinction seems unimportant. OpenSSL 3.0 has since renamed + // |EVP_PKEY_get1_tls_encodedpoint| to |EVP_PKEY_get1_encoded_public_key|, and + // what is the difference between "raw" and an "encoded" public key. + // + // One nuisance is the notion of "raw" is slightly ambiguous for EC keys. Is + // it a DER ECPrivateKey or just the scalar? + int (*set1_tls_encodedpoint)(EvpPkey *pkey, const uint8_t *in, size_t len); + size_t (*get1_tls_encodedpoint)(const EvpPkey *pkey, uint8_t **out_ptr); // pkey_opaque returns 1 if the |pk| is opaque. Opaque keys are backed by // custom implementations which do not expose key material and parameters. 
- int (*pkey_opaque)(const EVP_PKEY *pk); + int (*pkey_opaque)(const EvpPkey *pk); - int (*pkey_size)(const EVP_PKEY *pk); - int (*pkey_bits)(const EVP_PKEY *pk); + int (*pkey_size)(const EvpPkey *pk); + int (*pkey_bits)(const EvpPkey *pk); - int (*param_missing)(const EVP_PKEY *pk); - int (*param_copy)(EVP_PKEY *to, const EVP_PKEY *from); - int (*param_cmp)(const EVP_PKEY *a, const EVP_PKEY *b); + int (*param_missing)(const EvpPkey *pk); + int (*param_copy)(EvpPkey *to, const EvpPkey *from); + bool (*param_equal)(const EvpPkey *a, const EvpPkey *b); - void (*pkey_free)(EVP_PKEY *pkey); + void (*pkey_free)(EvpPkey *pkey); } /* EVP_PKEY_ASN1_METHOD */; +class EvpPkey : public evp_pkey_st, public RefCounted { + public: + EvpPkey(); + + // pkey contains a pointer to a structure dependent on |ameth|. + void *pkey = nullptr; + + // ameth contains a pointer to a method table that determines the key type, or + // nullptr if the key is empty. + const bssl::EVP_PKEY_ASN1_METHOD *ameth = nullptr; + + private: + ~EvpPkey(); + friend RefCounted; +} /* EVP_PKEY */; #define EVP_PKEY_OP_UNDEFINED 0 #define EVP_PKEY_OP_KEYGEN (1 << 2) @@ -127,15 +165,14 @@ struct evp_pkey_asn1_method_st { #define EVP_PKEY_OP_DECRYPT (1 << 7) #define EVP_PKEY_OP_DERIVE (1 << 8) #define EVP_PKEY_OP_PARAMGEN (1 << 9) +#define EVP_PKEY_OP_ENCAPSULATE (1 << 10) +#define EVP_PKEY_OP_DECAPSULATE (1 << 11) #define EVP_PKEY_OP_TYPE_SIG \ (EVP_PKEY_OP_SIGN | EVP_PKEY_OP_VERIFY | EVP_PKEY_OP_VERIFYRECOVER) #define EVP_PKEY_OP_TYPE_CRYPT (EVP_PKEY_OP_ENCRYPT | EVP_PKEY_OP_DECRYPT) -#define EVP_PKEY_OP_TYPE_NOGEN \ - (EVP_PKEY_OP_SIG | EVP_PKEY_OP_CRYPT | EVP_PKEY_OP_DERIVE) - #define EVP_PKEY_OP_TYPE_GEN (EVP_PKEY_OP_KEYGEN | EVP_PKEY_OP_PARAMGEN) // EVP_PKEY_CTX_ctrl performs |cmd| on |ctx|. 
The |keytype| and |optype| @@ -179,97 +216,198 @@ OPENSSL_EXPORT int EVP_PKEY_CTX_ctrl(EVP_PKEY_CTX *ctx, int keytype, int optype, #define EVP_PKEY_CTRL_GET_RSA_MGF1_MD (EVP_PKEY_ALG_CTRL + 10) #define EVP_PKEY_CTRL_RSA_OAEP_LABEL (EVP_PKEY_ALG_CTRL + 11) #define EVP_PKEY_CTRL_GET_RSA_OAEP_LABEL (EVP_PKEY_ALG_CTRL + 12) -#define EVP_PKEY_CTRL_EC_PARAMGEN_CURVE_NID (EVP_PKEY_ALG_CTRL + 13) +#define EVP_PKEY_CTRL_EC_PARAMGEN_GROUP (EVP_PKEY_ALG_CTRL + 13) #define EVP_PKEY_CTRL_HKDF_MODE (EVP_PKEY_ALG_CTRL + 14) #define EVP_PKEY_CTRL_HKDF_MD (EVP_PKEY_ALG_CTRL + 15) #define EVP_PKEY_CTRL_HKDF_KEY (EVP_PKEY_ALG_CTRL + 16) #define EVP_PKEY_CTRL_HKDF_SALT (EVP_PKEY_ALG_CTRL + 17) #define EVP_PKEY_CTRL_HKDF_INFO (EVP_PKEY_ALG_CTRL + 18) +#define EVP_PKEY_CTRL_DH_PAD (EVP_PKEY_ALG_CTRL + 19) +#define EVP_PKEY_CTRL_SIGNATURE_CONTEXT_STRING (EVP_PKEY_ALG_CTRL + 20) + +class EvpPkeyCtx : public evp_pkey_ctx_st { + public: + static constexpr bool kAllowUniquePtr = true; + + // TODO(crbug.com/487376811): Ideally this destructor should be virtual so + // that we can emit vtables in libcrypto. In that case we would be able to + // replace |pmeth| with virtual methods and subclassing. + ~EvpPkeyCtx(); -struct evp_pkey_ctx_st { // Method associated with this operation - const EVP_PKEY_METHOD *pmeth; - // Engine that implements this method or NULL if builtin - ENGINE *engine; - // Key: may be NULL - EVP_PKEY *pkey; - // Peer key for key agreement, may be NULL - EVP_PKEY *peerkey; + const bssl::EVP_PKEY_CTX_METHOD *pmeth = nullptr; + // Key: may be nullptr + bssl::UniquePtr pkey; + // Peer key for key agreement, may be nullptr + bssl::UniquePtr peerkey; // operation contains one of the |EVP_PKEY_OP_*| values. - int operation; - // Algorithm specific data - void *data; -} /* EVP_PKEY_CTX */; - -struct evp_pkey_method_st { + int operation = EVP_PKEY_OP_UNDEFINED; + // Algorithm specific data. 
+ // TODO(crbug.com/487376811): Since a |EVP_PKEY_CTX| never has its type change + // after creation, this should instead be a base class, with the + // algorithm-specific data on the subclass, coming from the same allocation. + void *data = nullptr; +}; + +struct evp_pkey_ctx_method_st { int pkey_id; - int (*init)(EVP_PKEY_CTX *ctx); - int (*copy)(EVP_PKEY_CTX *dst, EVP_PKEY_CTX *src); - void (*cleanup)(EVP_PKEY_CTX *ctx); + // |alg| may be nullptr. If non-null, |ctx| will have a key set. + int (*init)(EvpPkeyCtx *ctx, const EVP_PKEY_ALG *alg); + int (*copy)(EvpPkeyCtx *dst, EvpPkeyCtx *src); + void (*cleanup)(EvpPkeyCtx *ctx); - int (*keygen)(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey); + int (*keygen)(EvpPkeyCtx *ctx, EvpPkey *pkey); - int (*sign)(EVP_PKEY_CTX *ctx, uint8_t *sig, size_t *siglen, - const uint8_t *tbs, size_t tbslen); + int (*sign)(EvpPkeyCtx *ctx, uint8_t *sig, size_t *siglen, const uint8_t *tbs, + size_t tbslen); - int (*sign_message)(EVP_PKEY_CTX *ctx, uint8_t *sig, size_t *siglen, + int (*sign_message)(EvpPkeyCtx *ctx, uint8_t *sig, size_t *siglen, const uint8_t *tbs, size_t tbslen); - int (*verify)(EVP_PKEY_CTX *ctx, const uint8_t *sig, size_t siglen, + int (*verify)(EvpPkeyCtx *ctx, const uint8_t *sig, size_t siglen, const uint8_t *tbs, size_t tbslen); - int (*verify_message)(EVP_PKEY_CTX *ctx, const uint8_t *sig, size_t siglen, + int (*verify_message)(EvpPkeyCtx *ctx, const uint8_t *sig, size_t siglen, const uint8_t *tbs, size_t tbslen); - int (*verify_recover)(EVP_PKEY_CTX *ctx, uint8_t *out, size_t *out_len, + int (*verify_recover)(EvpPkeyCtx *ctx, uint8_t *out, size_t *out_len, const uint8_t *sig, size_t sig_len); - int (*encrypt)(EVP_PKEY_CTX *ctx, uint8_t *out, size_t *outlen, + int (*encrypt)(EvpPkeyCtx *ctx, uint8_t *out, size_t *outlen, const uint8_t *in, size_t inlen); - int (*decrypt)(EVP_PKEY_CTX *ctx, uint8_t *out, size_t *outlen, + int (*decrypt)(EvpPkeyCtx *ctx, uint8_t *out, size_t *outlen, const uint8_t *in, size_t inlen); - int 
(*derive)(EVP_PKEY_CTX *ctx, uint8_t *key, size_t *keylen); - - int (*paramgen)(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey); + int (*derive)(EvpPkeyCtx *ctx, uint8_t *key, size_t *keylen); - int (*ctrl)(EVP_PKEY_CTX *ctx, int type, int p1, void *p2); -} /* EVP_PKEY_METHOD */; + int (*paramgen)(EvpPkeyCtx *ctx, EvpPkey *pkey); -typedef struct { - // key is the concatenation of the private seed and public key. It is stored - // as a single 64-bit array to allow passing to |ED25519_sign|. If - // |has_private| is false, the first 32 bytes are uninitialized and the public - // key is in the last 32 bytes. - uint8_t key[64]; - char has_private; -} ED25519_KEY; + int (*encap)(EvpPkeyCtx *ctx, uint8_t *out_ciphertext, + size_t *out_ciphertext_len, uint8_t *out_secret, + size_t *out_secret_len); -#define ED25519_PUBLIC_KEY_OFFSET 32 + int (*decap)(EvpPkeyCtx *ctx, uint8_t *out_secret, size_t *out_secret_len, + const uint8_t *ciphertext, size_t ciphertext_len); -typedef struct { - uint8_t pub[32]; - uint8_t priv[32]; - char has_private; -} X25519_KEY; + int (*ctrl)(EvpPkeyCtx *ctx, int type, int p1, void *p2); +} /* EVP_PKEY_CTX_METHOD */; -extern const EVP_PKEY_ASN1_METHOD dsa_asn1_meth; -extern const EVP_PKEY_ASN1_METHOD ec_asn1_meth; -extern const EVP_PKEY_ASN1_METHOD rsa_asn1_meth; -extern const EVP_PKEY_ASN1_METHOD ed25519_asn1_meth; -extern const EVP_PKEY_ASN1_METHOD x25519_asn1_meth; +BSSL_NAMESPACE_END -extern const EVP_PKEY_METHOD rsa_pkey_meth; -extern const EVP_PKEY_METHOD ec_pkey_meth; -extern const EVP_PKEY_METHOD ed25519_pkey_meth; -extern const EVP_PKEY_METHOD x25519_pkey_meth; -extern const EVP_PKEY_METHOD hkdf_pkey_meth; - - -#if defined(__cplusplus) -} // extern C -#endif +// TODO(chlily): Make compatible with `EVP_HPKE_KEM`. +struct evp_kem_st { + // Identifies the type of EVP_PKEYs compatible with this KEM. + int pkey_id; -#endif // OPENSSL_HEADER_EVP_INTERNAL_H + // Constant lengths of ciphertexts and secrets produced/consumed by this KEM. 
+ size_t ciphertext_len; + size_t secret_len; + + int (*encap)(uint8_t *out_ciphertext, size_t ciphertext_len, + uint8_t *out_secret, size_t secret_len, + const EVP_PKEY *peer_key); + int (*decap)(uint8_t *out_secret, size_t secret_len, + const uint8_t *ciphertext, size_t ciphertext_len, + const EVP_PKEY *key); +} /* EVP_KEM */; + +BSSL_NAMESPACE_BEGIN + +// KemAdapter is templated on an instance of EVP_KEM, and generates static +// methods matching the behavior and function signatures for `encap` and `decap` +// in EVP_PKEY_CTX_METHOD. +template +struct KemAdapter { + KemAdapter() = delete; + + static int EncapMethod(EvpPkeyCtx *ctx, uint8_t *out_ciphertext, + size_t *out_ciphertext_len, uint8_t *out_secret, + size_t *out_secret_len) { + if (out_ciphertext == nullptr) { + if (out_ciphertext_len != nullptr) { + *out_ciphertext_len = KEM.ciphertext_len; + } + if (out_secret_len != nullptr) { + *out_secret_len = KEM.secret_len; + } + return 1; + } + if (*out_ciphertext_len < KEM.ciphertext_len || + *out_secret_len < KEM.secret_len) { + OPENSSL_PUT_ERROR(EVP, EVP_R_BUFFER_TOO_SMALL); + return 0; + } + if (KEM.encap(out_ciphertext, KEM.ciphertext_len, out_secret, + KEM.secret_len, ctx->pkey.get())) { + *out_ciphertext_len = KEM.ciphertext_len; + *out_secret_len = KEM.secret_len; + return 1; + } + return 0; + } + + static int DecapMethod(EvpPkeyCtx *ctx, uint8_t *out_secret, + size_t *out_secret_len, const uint8_t *ciphertext, + size_t ciphertext_len) { + if (out_secret == nullptr) { + *out_secret_len = KEM.secret_len; + return 1; + } + if (*out_secret_len < KEM.secret_len) { + OPENSSL_PUT_ERROR(EVP, EVP_R_BUFFER_TOO_SMALL); + return 0; + } + if (KEM.decap(out_secret, KEM.secret_len, ciphertext, ciphertext_len, + ctx->pkey.get())) { + *out_secret_len = KEM.secret_len; + return 1; + } + return 0; + } +}; + +// evp_pkey_ec_no_curve returns an internal curveless EC |EVP_PKEY_ALG|. This +// cannot be used to parse anything and is only useful for key generation. 
+const EVP_PKEY_ALG *evp_pkey_ec_no_curve(); + +// evp_pkey_hkdf returns an internal |EVP_PKEY_ALG| used to implement +// |EVP_PKEY_HKDF|. It has no associated key type. +const EVP_PKEY_ALG *evp_pkey_hkdf(); + +// evp_pkey_ctx_new_alg behaves like |EVP_PKEY_CTX_new_id| but takes an +// |EVP_PKEY_ALG|. +UniquePtr evp_pkey_ctx_new_alg(const EVP_PKEY_ALG *alg); + +// evp_pkey_set0 sets |pkey|'s method to |method| and data to |pkey_data|, +// freeing any key that may previously have been configured. This function takes +// ownership of |pkey_data|, which must be of the type expected by |method|. +void evp_pkey_set0(EvpPkey *pkey, const EVP_PKEY_ASN1_METHOD *method, + void *pkey_data); + +inline auto GetDefaultEVPAlgorithms() { + // A set of algorithms to use by default in |EVP_parse_public_key| and + // |EVP_parse_private_key|. + return std::array{ + EVP_pkey_ec_p224(), + EVP_pkey_ec_p256(), + EVP_pkey_ec_p384(), + EVP_pkey_ec_p521(), + EVP_pkey_ed25519(), + EVP_pkey_rsa(), + EVP_pkey_x25519(), + EVP_pkey_ml_dsa_44(), + EVP_pkey_ml_dsa_65(), + EVP_pkey_ml_dsa_87(), + EVP_pkey_ml_kem_768(), + EVP_pkey_ml_kem_1024(), + // TODO(crbug.com/438761503): Remove DSA from this set, after callers that + // need DSA pass in |EVP_pkey_dsa| explicitly. + EVP_pkey_dsa(), + }; +} + +BSSL_NAMESPACE_END + +#endif // OPENSSL_HEADER_CRYPTO_EVP_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/evp/p_dh.cc b/third_party/boringssl/src/crypto/evp/p_dh.cc new file mode 100644 index 00000000..eaf7fadc --- /dev/null +++ b/third_party/boringssl/src/crypto/evp/p_dh.cc @@ -0,0 +1,326 @@ +// Copyright 2006-2019 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include "../internal.h" +#include "../mem_internal.h" +#include "internal.h" + + +using namespace bssl; + +namespace { + +extern const EVP_PKEY_CTX_METHOD dh_pkey_meth; + +static void dh_free(EvpPkey *pkey) { + DH_free(reinterpret_cast(pkey->pkey)); + pkey->pkey = nullptr; +} + +static int dh_size(const EvpPkey *pkey) { + return DH_size(reinterpret_cast(pkey->pkey)); +} + +static int dh_bits(const EvpPkey *pkey) { + return DH_bits(reinterpret_cast(pkey->pkey)); +} + +static int dh_param_missing(const EvpPkey *pkey) { + const DH *dh = reinterpret_cast(pkey->pkey); + return dh == nullptr || DH_get0_p(dh) == nullptr || DH_get0_g(dh) == nullptr; +} + +static int dh_param_copy(EvpPkey *to, const EvpPkey *from) { + if (dh_param_missing(from)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_MISSING_PARAMETERS); + return 0; + } + if (to->pkey == nullptr) { + to->pkey = DH_new(); + if (to->pkey == nullptr) { + return 0; + } + } + + const DH *dh = reinterpret_cast(from->pkey); + const BIGNUM *q_old = DH_get0_q(dh); + BIGNUM *p = BN_dup(DH_get0_p(dh)); + BIGNUM *q = q_old == nullptr ? nullptr : BN_dup(q_old); + BIGNUM *g = BN_dup(DH_get0_g(dh)); + if (p == nullptr || (q_old != nullptr && q == nullptr) || g == nullptr || + !DH_set0_pqg(reinterpret_cast(to->pkey), p, q, g)) { + BN_free(p); + BN_free(q); + BN_free(g); + return 0; + } + + // |DH_set0_pqg| took ownership of |p|, |q|, and |g|. 
+ return 1; +} + +static bool dh_param_equal(const EvpPkey *a, const EvpPkey *b) { + if (dh_param_missing(a) || dh_param_missing(b)) { + return false; + } + + // Matching OpenSSL, only compare p and g for PKCS#3-style Diffie-Hellman. + // OpenSSL only checks q in X9.42-style Diffie-Hellman ("DHX"). + const DH *a_dh = reinterpret_cast(a->pkey); + const DH *b_dh = reinterpret_cast(b->pkey); + return BN_cmp(DH_get0_p(a_dh), DH_get0_p(b_dh)) == 0 && + BN_cmp(DH_get0_g(a_dh), DH_get0_g(b_dh)) == 0; +} + +static bool dh_pub_equal(const EvpPkey *a, const EvpPkey *b) { + if (!dh_param_equal(a, b)) { + return false; + } + + const DH *a_dh = reinterpret_cast(a->pkey); + const DH *b_dh = reinterpret_cast(b->pkey); + return BN_cmp(DH_get0_pub_key(a_dh), DH_get0_pub_key(b_dh)) == 0; +} + +static bool dh_has_pub(const EvpPkey *pk) { + const DH *pk_dh = reinterpret_cast(pk->pkey); + return DH_get0_pub_key(pk_dh) != nullptr; +} + +static bool dh_pub_copy(EvpPkey *out, const EvpPkey *pk) { + const DH *pk_dh = reinterpret_cast(pk->pkey); + const BIGNUM *public_key = DH_get0_pub_key(pk_dh); + if (public_key == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_MISSING_PUBLIC_KEY); + return false; + } + UniquePtr public_copy_dh(DHparams_dup(pk_dh)); + BIGNUM *public_key_copy = BN_dup(public_key); + if (public_copy_dh == nullptr || public_key_copy == nullptr || + !DH_set0_key(public_copy_dh.get(), public_key_copy, nullptr)) { + BN_free(public_key_copy); + OPENSSL_PUT_ERROR(EVP, ERR_R_INTERNAL_ERROR); + return false; + } + evp_pkey_set0(out, pk->ameth, public_copy_dh.release()); + return true; +} + +static bool dh_has_priv(const EvpPkey *pk) { + const DH *pk_dh = reinterpret_cast(pk->pkey); + return DH_get0_priv_key(pk_dh) != nullptr; +} + +static const EVP_PKEY_ASN1_METHOD dh_asn1_meth = { + /*pkey_id=*/EVP_PKEY_DH, + /*oid=*/{0}, + /*oid_len=*/0, + /*pkey_method=*/&dh_pkey_meth, + /*pub_decode=*/nullptr, + /*pub_encode=*/nullptr, + /*pub_equal=*/dh_pub_equal, + /*pub_present=*/dh_has_pub, + 
/*pub_dup=*/dh_pub_copy, + /*priv_decode=*/nullptr, + /*priv_encode=*/nullptr, + /*priv_present=*/dh_has_priv, + /*set_priv_raw=*/nullptr, + /*set_priv_seed=*/nullptr, + /*set_pub_raw=*/nullptr, + /*get_priv_raw=*/nullptr, + /*get_priv_seed=*/nullptr, + /*get_pub_raw=*/nullptr, + /*set1_tls_encodedpoint=*/nullptr, + /*get1_tls_encodedpoint=*/nullptr, + /*pkey_opaque=*/nullptr, + /*pkey_size=*/dh_size, + /*pkey_bits=*/dh_bits, + /*param_missing=*/dh_param_missing, + /*param_copy=*/dh_param_copy, + /*param_equal=*/dh_param_equal, + /*pkey_free=*/dh_free, +}; + +struct DH_PKEY_CTX { + bool pad = false; +}; + +static int pkey_dh_init(EvpPkeyCtx *ctx, const EVP_PKEY_ALG *) { + DH_PKEY_CTX *dctx = New(); + if (dctx == nullptr) { + return 0; + } + + ctx->data = dctx; + return 1; +} + +static int pkey_dh_copy(EvpPkeyCtx *dst, EvpPkeyCtx *src) { + if (!pkey_dh_init(dst, nullptr)) { + return 0; + } + + const DH_PKEY_CTX *sctx = reinterpret_cast(src->data); + DH_PKEY_CTX *dctx = reinterpret_cast(dst->data); + dctx->pad = sctx->pad; + return 1; +} + +static void pkey_dh_cleanup(EvpPkeyCtx *ctx) { + Delete(reinterpret_cast(ctx->data)); +} + +static int pkey_dh_keygen(EvpPkeyCtx *ctx, EvpPkey *pkey) { + DH *dh = DH_new(); + if (dh == nullptr || !EVP_PKEY_assign_DH(pkey, dh)) { + DH_free(dh); + return 0; + } + + if (ctx->pkey != nullptr && + !EVP_PKEY_copy_parameters(pkey, ctx->pkey.get())) { + return 0; + } + + return DH_generate_key(dh); +} + +static int pkey_dh_derive(EvpPkeyCtx *ctx, uint8_t *out, size_t *out_len) { + DH_PKEY_CTX *dctx = reinterpret_cast(ctx->data); + if (ctx->pkey == nullptr || ctx->peerkey == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_KEYS_NOT_SET); + return 0; + } + + DH *our_key = reinterpret_cast(ctx->pkey->pkey); + DH *peer_key = reinterpret_cast(ctx->peerkey->pkey); + if (our_key == nullptr || peer_key == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_KEYS_NOT_SET); + return 0; + } + + const BIGNUM *pub_key = DH_get0_pub_key(peer_key); + if (pub_key == 
nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_KEYS_NOT_SET); + return 0; + } + + if (out == nullptr) { + *out_len = DH_size(our_key); + return 1; + } + + if (*out_len < (size_t)DH_size(our_key)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_BUFFER_TOO_SMALL); + return 0; + } + + int ret = dctx->pad ? DH_compute_key_padded(out, pub_key, our_key) + : DH_compute_key(out, pub_key, our_key); + if (ret < 0) { + return 0; + } + + assert(ret <= DH_size(our_key)); + *out_len = (size_t)ret; + return 1; +} + +static int pkey_dh_ctrl(EvpPkeyCtx *ctx, int type, int p1, void *p2) { + DH_PKEY_CTX *dctx = reinterpret_cast(ctx->data); + switch (type) { + case EVP_PKEY_CTRL_PEER_KEY: + // |EVP_PKEY_derive_set_peer| requires the key implement this command, + // even if it is a no-op. + return 1; + + case EVP_PKEY_CTRL_DH_PAD: + dctx->pad = p1; + return 1; + + default: + OPENSSL_PUT_ERROR(EVP, EVP_R_COMMAND_NOT_SUPPORTED); + return 0; + } +} + +const EVP_PKEY_CTX_METHOD dh_pkey_meth = { + /*pkey_id=*/EVP_PKEY_DH, + /*init=*/pkey_dh_init, + /*copy=*/pkey_dh_copy, + /*cleanup=*/pkey_dh_cleanup, + /*keygen=*/pkey_dh_keygen, + /*sign=*/nullptr, + /*sign_message=*/nullptr, + /*verify=*/nullptr, + /*verify_message=*/nullptr, + /*verify_recover=*/nullptr, + /*encrypt=*/nullptr, + /*decrypt=*/nullptr, + /*derive=*/pkey_dh_derive, + /*paramgen=*/nullptr, + /*encap=*/nullptr, + /*decap=*/nullptr, + /*ctrl=*/pkey_dh_ctrl, +}; + +} // namespace + +int EVP_PKEY_set1_DH(EVP_PKEY *pkey, DH *key) { + if (EVP_PKEY_assign_DH(pkey, key)) { + DH_up_ref(key); + return 1; + } + return 0; +} + +int EVP_PKEY_assign_DH(EVP_PKEY *pkey, DH *key) { + if (key == nullptr) { + return 0; + } + evp_pkey_set0(FromOpaque(pkey), &dh_asn1_meth, key); + return 1; +} + +DH *EVP_PKEY_get0_DH(const EVP_PKEY *pkey) { + if (EVP_PKEY_id(pkey) != EVP_PKEY_DH) { + OPENSSL_PUT_ERROR(EVP, EVP_R_EXPECTING_A_DH_KEY); + return nullptr; + } + return reinterpret_cast(const_cast(FromOpaque(pkey))->pkey); +} + +DH *EVP_PKEY_get1_DH(const EVP_PKEY *pkey) { 
+ DH *dh = EVP_PKEY_get0_DH(pkey); + if (dh != nullptr) { + DH_up_ref(dh); + } + return dh; +} + +int EVP_PKEY_CTX_set_dh_pad(EVP_PKEY_CTX *ctx, int pad) { + return EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_DH, EVP_PKEY_OP_DERIVE, + EVP_PKEY_CTRL_DH_PAD, pad, nullptr); +} diff --git a/third_party/boringssl/src/crypto/evp/p_dsa.cc b/third_party/boringssl/src/crypto/evp/p_dsa.cc new file mode 100644 index 00000000..6b6201c4 --- /dev/null +++ b/third_party/boringssl/src/crypto/evp/p_dsa.cc @@ -0,0 +1,333 @@ +// Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include +#include +#include + +#include "../dsa/internal.h" +#include "../mem_internal.h" +#include "internal.h" + + +using namespace bssl; + +namespace { + +extern const EVP_PKEY_ASN1_METHOD dsa_asn1_meth; + +static bssl::evp_decode_result_t dsa_pub_decode(const EVP_PKEY_ALG *alg, + EvpPkey *out, CBS *params, + CBS *key) { + // See RFC 3279, section 2.3.2. + + // Decode parameters. RFC 3279 permits DSA parameters to be omitted, in which + // case they are implicitly determined from the issuing certificate, or + // somewhere unspecified and out-of-band. We do not support this mode. 
+ UniquePtr dsa(FromOpaque(DSA_parse_parameters(params))); + if (dsa == nullptr || CBS_len(params) != 0) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return evp_decode_error; + } + + dsa->pub_key.reset(BN_new()); + if (dsa->pub_key == nullptr) { + return evp_decode_error; + } + + if (!BN_parse_asn1_unsigned(key, dsa->pub_key.get()) || CBS_len(key) != 0) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return evp_decode_error; + } + + EVP_PKEY_assign_DSA(out, dsa.release()); + return evp_decode_ok; +} + +static int dsa_pub_encode(CBB *out, const EvpPkey *key) { + const DSAImpl *dsa = reinterpret_cast(key->pkey); + const int has_params = + dsa->p != nullptr && dsa->q != nullptr && dsa->g != nullptr; + + // See RFC 5480, section 2. + CBB spki, algorithm, key_bitstring; + if (!CBB_add_asn1(out, &spki, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1(&spki, &algorithm, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_element(&algorithm, CBS_ASN1_OBJECT, dsa_asn1_meth.oid, + dsa_asn1_meth.oid_len) || + (has_params && !DSA_marshal_parameters(&algorithm, dsa)) || + !CBB_add_asn1(&spki, &key_bitstring, CBS_ASN1_BITSTRING) || + !CBB_add_u8(&key_bitstring, 0 /* padding */) || + !BN_marshal_asn1(&key_bitstring, dsa->pub_key.get()) || !CBB_flush(out)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_ENCODE_ERROR); + return 0; + } + + return 1; +} + +static bssl::evp_decode_result_t dsa_priv_decode(const EVP_PKEY_ALG *alg, + EvpPkey *out, CBS *params, + CBS *key) { + // See PKCS#11, v2.40, section 2.5. + + // Decode parameters. 
+ UniquePtr dsa(FromOpaque(DSA_parse_parameters(params))); + if (dsa == nullptr || CBS_len(params) != 0) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return evp_decode_error; + } + + dsa->priv_key.reset(BN_new()); + if (dsa->priv_key == nullptr) { + return evp_decode_error; + } + if (!BN_parse_asn1_unsigned(key, dsa->priv_key.get()) || CBS_len(key) != 0) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return evp_decode_error; + } + + // To avoid DoS attacks when importing private keys, check bounds on |dsa|. + // This bounds |dsa->priv_key| against |dsa->q| and bounds |dsa->q|'s bit + // width. + if (!dsa_check_key(dsa.get())) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return evp_decode_error; + } + + // Calculate the public key. + UniquePtr ctx(BN_CTX_new()); + dsa->pub_key.reset(BN_new()); + if (ctx == nullptr || dsa->pub_key == nullptr || + !BN_mod_exp_mont_consttime(dsa->pub_key.get(), dsa->g.get(), + dsa->priv_key.get(), dsa->p.get(), ctx.get(), + nullptr)) { + return evp_decode_error; + } + + EVP_PKEY_assign_DSA(out, dsa.release()); + return evp_decode_ok; +} + +static int dsa_priv_encode(CBB *out, const EvpPkey *key) { + const DSAImpl *dsa = reinterpret_cast(key->pkey); + if (dsa == nullptr || dsa->priv_key == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_MISSING_PARAMETERS); + return 0; + } + + // See PKCS#11, v2.40, section 2.5. 
+ CBB pkcs8, algorithm, private_key; + if (!CBB_add_asn1(out, &pkcs8, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_uint64(&pkcs8, 0 /* version */) || + !CBB_add_asn1(&pkcs8, &algorithm, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_element(&algorithm, CBS_ASN1_OBJECT, dsa_asn1_meth.oid, + dsa_asn1_meth.oid_len) || + !DSA_marshal_parameters(&algorithm, dsa) || + !CBB_add_asn1(&pkcs8, &private_key, CBS_ASN1_OCTETSTRING) || + !BN_marshal_asn1(&private_key, dsa->priv_key.get()) || !CBB_flush(out)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_ENCODE_ERROR); + return 0; + } + + return 1; +} + +static int int_dsa_size(const EvpPkey *pkey) { + const DSAImpl *dsa = reinterpret_cast(pkey->pkey); + return DSA_size(dsa); +} + +static int dsa_bits(const EvpPkey *pkey) { + const DSAImpl *dsa = reinterpret_cast(pkey->pkey); + return BN_num_bits(DSA_get0_p(dsa)); +} + +static int dsa_missing_parameters(const EvpPkey *pkey) { + const DSAImpl *dsa = reinterpret_cast(pkey->pkey); + if (DSA_get0_p(dsa) == nullptr || DSA_get0_q(dsa) == nullptr || + DSA_get0_g(dsa) == nullptr) { + return 1; + } + return 0; +} + +static int dup_bn_into(UniquePtr *out, const BIGNUM *src) { + UniquePtr copy(BN_dup(src)); + if (copy == nullptr) { + return 0; + } + *out = std::move(copy); + return 1; +} + +static int dsa_copy_parameters(EvpPkey *to, const EvpPkey *from) { + if (to->pkey == nullptr) { + to->pkey = DSA_new(); + if (to->pkey == nullptr) { + return 0; + } + } + DSAImpl *to_dsa = reinterpret_cast(to->pkey); + const DSAImpl *from_dsa = reinterpret_cast(from->pkey); + if (!dup_bn_into(&to_dsa->p, from_dsa->p.get()) || + !dup_bn_into(&to_dsa->q, from_dsa->q.get()) || + !dup_bn_into(&to_dsa->g, from_dsa->g.get())) { + return 0; + } + + return 1; +} + +static bool dsa_equal_parameters(const EvpPkey *a, const EvpPkey *b) { + const DSAImpl *a_dsa = reinterpret_cast(a->pkey); + const DSAImpl *b_dsa = reinterpret_cast(b->pkey); + return BN_cmp(DSA_get0_p(a_dsa), DSA_get0_p(b_dsa)) == 0 && + BN_cmp(DSA_get0_q(a_dsa), 
DSA_get0_q(b_dsa)) == 0 && + BN_cmp(DSA_get0_g(a_dsa), DSA_get0_g(b_dsa)) == 0; +} + +static bool dsa_pub_equal(const EvpPkey *a, const EvpPkey *b) { + const DSAImpl *a_dsa = reinterpret_cast(a->pkey); + const DSAImpl *b_dsa = reinterpret_cast(b->pkey); + return BN_cmp(DSA_get0_pub_key(b_dsa), DSA_get0_pub_key(a_dsa)) == 0; +} + +static bool dsa_pub_present(const EvpPkey *pk) { + const DSA *pk_dsa = reinterpret_cast(pk->pkey); + return DSA_get0_pub_key(pk_dsa) != nullptr; +} + +static bool dsa_pub_copy(EvpPkey *out, const EvpPkey *pk) { + const DSA *pk_dsa = reinterpret_cast(pk->pkey); + const BIGNUM *public_key = DSA_get0_pub_key(pk_dsa); + if (public_key == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_MISSING_PUBLIC_KEY); + return false; + } + UniquePtr public_copy_dsa(DSAparams_dup(pk_dsa)); + BIGNUM *public_key_copy = BN_dup(public_key); + if (public_copy_dsa == nullptr || public_key_copy == nullptr || + !DSA_set0_key(public_copy_dsa.get(), public_key_copy, nullptr)) { + BN_free(public_key_copy); + OPENSSL_PUT_ERROR(EVP, ERR_R_INTERNAL_ERROR); + return false; + } + evp_pkey_set0(out, pk->ameth, public_copy_dsa.release()); + return true; +} + +static bool dsa_priv_present(const EvpPkey *pk) { + const DSA *pk_dsa = reinterpret_cast(pk->pkey); + return DSA_get0_priv_key(pk_dsa) != nullptr; +} + +static void int_dsa_free(EvpPkey *pkey) { + DSA_free(reinterpret_cast(pkey->pkey)); + pkey->pkey = nullptr; +} + +const EVP_PKEY_ASN1_METHOD dsa_asn1_meth = { + EVP_PKEY_DSA, + // 1.2.840.10040.4.1 + {0x2a, 0x86, 0x48, 0xce, 0x38, 0x04, 0x01}, + 7, + + /*pkey_method=*/nullptr, + + dsa_pub_decode, + dsa_pub_encode, + dsa_pub_equal, + dsa_pub_present, + dsa_pub_copy, + + dsa_priv_decode, + dsa_priv_encode, + dsa_priv_present, + + /*set_priv_raw=*/nullptr, + /*set_priv_seed=*/nullptr, + /*set_pub_raw=*/nullptr, + /*get_priv_raw=*/nullptr, + /*get_priv_seed=*/nullptr, + /*get_pub_raw=*/nullptr, + /*set1_tls_encodedpoint=*/nullptr, + /*get1_tls_encodedpoint=*/nullptr, + + 
/*pkey_opaque=*/nullptr, + + int_dsa_size, + dsa_bits, + + dsa_missing_parameters, + dsa_copy_parameters, + dsa_equal_parameters, + + int_dsa_free, +}; + +} // namespace + +const EVP_PKEY_ALG *EVP_pkey_dsa() { + static const EVP_PKEY_ALG kAlg = {&dsa_asn1_meth, nullptr}; + return &kAlg; +} + +int EVP_PKEY_CTX_set_dsa_paramgen_bits(EVP_PKEY_CTX *ctx, int nbits) { + // BoringSSL does not support DSA in |EVP_PKEY_CTX|. + OPENSSL_PUT_ERROR(EVP, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); + return 0; +} + +int EVP_PKEY_CTX_set_dsa_paramgen_q_bits(EVP_PKEY_CTX *ctx, int qbits) { + // BoringSSL does not support DSA in |EVP_PKEY_CTX|. + OPENSSL_PUT_ERROR(EVP, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); + return 0; +} + +int EVP_PKEY_set1_DSA(EVP_PKEY *pkey, DSA *key) { + if (EVP_PKEY_assign_DSA(pkey, key)) { + DSA_up_ref(key); + return 1; + } + return 0; +} + +int EVP_PKEY_assign_DSA(EVP_PKEY *pkey, DSA *key) { + if (key == nullptr) { + return 0; + } + evp_pkey_set0(FromOpaque(pkey), &dsa_asn1_meth, key); + return 1; +} + +DSA *EVP_PKEY_get0_DSA(const EVP_PKEY *pkey) { + if (EVP_PKEY_id(pkey) != EVP_PKEY_DSA) { + OPENSSL_PUT_ERROR(EVP, EVP_R_EXPECTING_A_DSA_KEY); + return nullptr; + } + return reinterpret_cast(FromOpaque(pkey)->pkey); +} + +DSA *EVP_PKEY_get1_DSA(const EVP_PKEY *pkey) { + DSA *dsa = EVP_PKEY_get0_DSA(pkey); + if (dsa != nullptr) { + DSA_up_ref(dsa); + } + return dsa; +} diff --git a/third_party/boringssl/src/crypto/evp/p_dsa_asn1.c b/third_party/boringssl/src/crypto/evp/p_dsa_asn1.c deleted file mode 100644 index ebc50c85..00000000 --- a/third_party/boringssl/src/crypto/evp/p_dsa_asn1.c +++ /dev/null @@ -1,279 +0,0 @@ -/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL project - * 2006. - */ -/* ==================================================================== - * Copyright (c) 2006 The OpenSSL Project. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * licensing@OpenSSL.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). */ - -#include - -#include -#include -#include -#include -#include - -#include "internal.h" - - -static int dsa_pub_decode(EVP_PKEY *out, CBS *params, CBS *key) { - // See RFC 3279, section 2.3.2. - - // Parameters may or may not be present. - DSA *dsa; - if (CBS_len(params) == 0) { - dsa = DSA_new(); - if (dsa == NULL) { - return 0; - } - } else { - dsa = DSA_parse_parameters(params); - if (dsa == NULL || CBS_len(params) != 0) { - OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); - goto err; - } - } - - dsa->pub_key = BN_new(); - if (dsa->pub_key == NULL) { - goto err; - } - - if (!BN_parse_asn1_unsigned(key, dsa->pub_key) || - CBS_len(key) != 0) { - OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); - goto err; - } - - EVP_PKEY_assign_DSA(out, dsa); - return 1; - -err: - DSA_free(dsa); - return 0; -} - -static int dsa_pub_encode(CBB *out, const EVP_PKEY *key) { - const DSA *dsa = key->pkey.dsa; - const int has_params = dsa->p != NULL && dsa->q != NULL && dsa->g != NULL; - - // See RFC 5480, section 2. 
- CBB spki, algorithm, oid, key_bitstring; - if (!CBB_add_asn1(out, &spki, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1(&spki, &algorithm, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1(&algorithm, &oid, CBS_ASN1_OBJECT) || - !CBB_add_bytes(&oid, dsa_asn1_meth.oid, dsa_asn1_meth.oid_len) || - (has_params && - !DSA_marshal_parameters(&algorithm, dsa)) || - !CBB_add_asn1(&spki, &key_bitstring, CBS_ASN1_BITSTRING) || - !CBB_add_u8(&key_bitstring, 0 /* padding */) || - !BN_marshal_asn1(&key_bitstring, dsa->pub_key) || - !CBB_flush(out)) { - OPENSSL_PUT_ERROR(EVP, EVP_R_ENCODE_ERROR); - return 0; - } - - return 1; -} - -static int dsa_priv_decode(EVP_PKEY *out, CBS *params, CBS *key) { - // See PKCS#11, v2.40, section 2.5. - - // Decode parameters. - BN_CTX *ctx = NULL; - DSA *dsa = DSA_parse_parameters(params); - if (dsa == NULL || CBS_len(params) != 0) { - OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); - goto err; - } - - dsa->priv_key = BN_new(); - dsa->pub_key = BN_new(); - if (dsa->priv_key == NULL || dsa->pub_key == NULL) { - goto err; - } - - // Decode the key. To avoid DoS attacks when importing private keys, we bound - // |dsa->priv_key| against |dsa->q|, which itself bound by - // |DSA_parse_parameters|. (We cannot call |BN_num_bits| on |dsa->priv_key|. - // That would leak a secret bit width.) - if (!BN_parse_asn1_unsigned(key, dsa->priv_key) || - CBS_len(key) != 0 || - BN_cmp(dsa->priv_key, dsa->q) >= 0) { - OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); - goto err; - } - - // Calculate the public key. 
- ctx = BN_CTX_new(); - if (ctx == NULL || - !BN_mod_exp_mont_consttime(dsa->pub_key, dsa->g, dsa->priv_key, dsa->p, - ctx, NULL)) { - goto err; - } - - BN_CTX_free(ctx); - EVP_PKEY_assign_DSA(out, dsa); - return 1; - -err: - BN_CTX_free(ctx); - DSA_free(dsa); - return 0; -} - -static int dsa_priv_encode(CBB *out, const EVP_PKEY *key) { - const DSA *dsa = key->pkey.dsa; - if (dsa == NULL || dsa->priv_key == NULL) { - OPENSSL_PUT_ERROR(EVP, EVP_R_MISSING_PARAMETERS); - return 0; - } - - // See PKCS#11, v2.40, section 2.5. - CBB pkcs8, algorithm, oid, private_key; - if (!CBB_add_asn1(out, &pkcs8, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1_uint64(&pkcs8, 0 /* version */) || - !CBB_add_asn1(&pkcs8, &algorithm, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1(&algorithm, &oid, CBS_ASN1_OBJECT) || - !CBB_add_bytes(&oid, dsa_asn1_meth.oid, dsa_asn1_meth.oid_len) || - !DSA_marshal_parameters(&algorithm, dsa) || - !CBB_add_asn1(&pkcs8, &private_key, CBS_ASN1_OCTETSTRING) || - !BN_marshal_asn1(&private_key, dsa->priv_key) || - !CBB_flush(out)) { - OPENSSL_PUT_ERROR(EVP, EVP_R_ENCODE_ERROR); - return 0; - } - - return 1; -} - -static int int_dsa_size(const EVP_PKEY *pkey) { - return DSA_size(pkey->pkey.dsa); -} - -static int dsa_bits(const EVP_PKEY *pkey) { - return BN_num_bits(pkey->pkey.dsa->p); -} - -static int dsa_missing_parameters(const EVP_PKEY *pkey) { - DSA *dsa; - dsa = pkey->pkey.dsa; - if (dsa->p == NULL || dsa->q == NULL || dsa->g == NULL) { - return 1; - } - return 0; -} - -static int dup_bn_into(BIGNUM **out, BIGNUM *src) { - BIGNUM *a; - - a = BN_dup(src); - if (a == NULL) { - return 0; - } - BN_free(*out); - *out = a; - - return 1; -} - -static int dsa_copy_parameters(EVP_PKEY *to, const EVP_PKEY *from) { - if (!dup_bn_into(&to->pkey.dsa->p, from->pkey.dsa->p) || - !dup_bn_into(&to->pkey.dsa->q, from->pkey.dsa->q) || - !dup_bn_into(&to->pkey.dsa->g, from->pkey.dsa->g)) { - return 0; - } - - return 1; -} - -static int dsa_cmp_parameters(const EVP_PKEY *a, const EVP_PKEY *b) { 
- return BN_cmp(a->pkey.dsa->p, b->pkey.dsa->p) == 0 && - BN_cmp(a->pkey.dsa->q, b->pkey.dsa->q) == 0 && - BN_cmp(a->pkey.dsa->g, b->pkey.dsa->g) == 0; -} - -static int dsa_pub_cmp(const EVP_PKEY *a, const EVP_PKEY *b) { - return BN_cmp(b->pkey.dsa->pub_key, a->pkey.dsa->pub_key) == 0; -} - -static void int_dsa_free(EVP_PKEY *pkey) { DSA_free(pkey->pkey.dsa); } - -const EVP_PKEY_ASN1_METHOD dsa_asn1_meth = { - EVP_PKEY_DSA, - // 1.2.840.10040.4.1 - {0x2a, 0x86, 0x48, 0xce, 0x38, 0x04, 0x01}, 7, - - NULL /* pkey_method */, - - dsa_pub_decode, - dsa_pub_encode, - dsa_pub_cmp, - - dsa_priv_decode, - dsa_priv_encode, - - NULL /* set_priv_raw */, - NULL /* set_pub_raw */, - NULL /* get_priv_raw */, - NULL /* get_pub_raw */, - - NULL /* pkey_opaque */, - - int_dsa_size, - dsa_bits, - - dsa_missing_parameters, - dsa_copy_parameters, - dsa_cmp_parameters, - - int_dsa_free, -}; diff --git a/third_party/boringssl/src/crypto/evp/p_ec.c b/third_party/boringssl/src/crypto/evp/p_ec.c deleted file mode 100644 index ddb64a42..00000000 --- a/third_party/boringssl/src/crypto/evp/p_ec.c +++ /dev/null @@ -1,286 +0,0 @@ -/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL - * project 2006. - */ -/* ==================================================================== - * Copyright (c) 2006 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. 
All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * licensing@OpenSSL.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). 
*/ - -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "internal.h" -#include "../fipsmodule/ec/internal.h" -#include "../internal.h" - - -typedef struct { - // message digest - const EVP_MD *md; - EC_GROUP *gen_group; -} EC_PKEY_CTX; - - -static int pkey_ec_init(EVP_PKEY_CTX *ctx) { - EC_PKEY_CTX *dctx; - dctx = OPENSSL_malloc(sizeof(EC_PKEY_CTX)); - if (!dctx) { - return 0; - } - OPENSSL_memset(dctx, 0, sizeof(EC_PKEY_CTX)); - - ctx->data = dctx; - - return 1; -} - -static int pkey_ec_copy(EVP_PKEY_CTX *dst, EVP_PKEY_CTX *src) { - EC_PKEY_CTX *dctx, *sctx; - if (!pkey_ec_init(dst)) { - return 0; - } - sctx = src->data; - dctx = dst->data; - - dctx->md = sctx->md; - - return 1; -} - -static void pkey_ec_cleanup(EVP_PKEY_CTX *ctx) { - EC_PKEY_CTX *dctx = ctx->data; - if (!dctx) { - return; - } - - EC_GROUP_free(dctx->gen_group); - OPENSSL_free(dctx); -} - -static int pkey_ec_sign(EVP_PKEY_CTX *ctx, uint8_t *sig, size_t *siglen, - const uint8_t *tbs, size_t tbslen) { - unsigned int sltmp; - EC_KEY *ec = ctx->pkey->pkey.ec; - - if (!sig) { - *siglen = ECDSA_size(ec); - return 1; - } else if (*siglen < (size_t)ECDSA_size(ec)) { - OPENSSL_PUT_ERROR(EVP, EVP_R_BUFFER_TOO_SMALL); - return 0; - } - - if (!ECDSA_sign(0, tbs, tbslen, sig, &sltmp, ec)) { - return 0; - } - *siglen = (size_t)sltmp; - return 1; -} - -static int pkey_ec_verify(EVP_PKEY_CTX *ctx, const uint8_t *sig, size_t siglen, - const uint8_t *tbs, size_t tbslen) { - return ECDSA_verify(0, tbs, tbslen, sig, siglen, ctx->pkey->pkey.ec); -} - -static int pkey_ec_derive(EVP_PKEY_CTX *ctx, uint8_t *key, - size_t *keylen) { - int ret; - size_t outlen; - const EC_POINT *pubkey = NULL; - EC_KEY *eckey; - - if (!ctx->pkey || !ctx->peerkey) { - OPENSSL_PUT_ERROR(EVP, EVP_R_KEYS_NOT_SET); - return 0; - } - - eckey = ctx->pkey->pkey.ec; - - if (!key) { - const EC_GROUP *group; - group = EC_KEY_get0_group(eckey); - *keylen = 
(EC_GROUP_get_degree(group) + 7) / 8; - return 1; - } - pubkey = EC_KEY_get0_public_key(ctx->peerkey->pkey.ec); - - // NB: unlike PKCS#3 DH, if *outlen is less than maximum size this is - // not an error, the result is truncated. - - outlen = *keylen; - - ret = ECDH_compute_key(key, outlen, pubkey, eckey, 0); - if (ret < 0) { - return 0; - } - *keylen = ret; - return 1; -} - -static int pkey_ec_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2) { - EC_PKEY_CTX *dctx = ctx->data; - - switch (type) { - case EVP_PKEY_CTRL_MD: { - const EVP_MD *md = p2; - int md_type = EVP_MD_type(md); - if (md_type != NID_sha1 && md_type != NID_sha224 && - md_type != NID_sha256 && md_type != NID_sha384 && - md_type != NID_sha512) { - OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_DIGEST_TYPE); - return 0; - } - dctx->md = md; - return 1; - } - - case EVP_PKEY_CTRL_GET_MD: - *(const EVP_MD **)p2 = dctx->md; - return 1; - - case EVP_PKEY_CTRL_PEER_KEY: - // Default behaviour is OK - return 1; - - case EVP_PKEY_CTRL_EC_PARAMGEN_CURVE_NID: { - EC_GROUP *group = EC_GROUP_new_by_curve_name(p1); - if (group == NULL) { - return 0; - } - EC_GROUP_free(dctx->gen_group); - dctx->gen_group = group; - return 1; - } - - default: - OPENSSL_PUT_ERROR(EVP, EVP_R_COMMAND_NOT_SUPPORTED); - return 0; - } -} - -static int pkey_ec_keygen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey) { - EC_PKEY_CTX *dctx = ctx->data; - const EC_GROUP *group = dctx->gen_group; - if (group == NULL) { - if (ctx->pkey == NULL) { - OPENSSL_PUT_ERROR(EVP, EVP_R_NO_PARAMETERS_SET); - return 0; - } - group = EC_KEY_get0_group(ctx->pkey->pkey.ec); - } - EC_KEY *ec = EC_KEY_new(); - if (ec == NULL || - !EC_KEY_set_group(ec, group) || - !EC_KEY_generate_key(ec)) { - EC_KEY_free(ec); - return 0; - } - EVP_PKEY_assign_EC_KEY(pkey, ec); - return 1; -} - -static int pkey_ec_paramgen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey) { - EC_PKEY_CTX *dctx = ctx->data; - if (dctx->gen_group == NULL) { - OPENSSL_PUT_ERROR(EVP, EVP_R_NO_PARAMETERS_SET); - return 0; - } - 
EC_KEY *ec = EC_KEY_new(); - if (ec == NULL || - !EC_KEY_set_group(ec, dctx->gen_group)) { - EC_KEY_free(ec); - return 0; - } - EVP_PKEY_assign_EC_KEY(pkey, ec); - return 1; -} - -const EVP_PKEY_METHOD ec_pkey_meth = { - EVP_PKEY_EC, - pkey_ec_init, - pkey_ec_copy, - pkey_ec_cleanup, - pkey_ec_keygen, - pkey_ec_sign, - NULL /* sign_message */, - pkey_ec_verify, - NULL /* verify_message */, - NULL /* verify_recover */, - NULL /* encrypt */, - NULL /* decrypt */, - pkey_ec_derive, - pkey_ec_paramgen, - pkey_ec_ctrl, -}; - -int EVP_PKEY_CTX_set_ec_paramgen_curve_nid(EVP_PKEY_CTX *ctx, int nid) { - return EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_EC, EVP_PKEY_OP_TYPE_GEN, - EVP_PKEY_CTRL_EC_PARAMGEN_CURVE_NID, nid, NULL); -} - -int EVP_PKEY_CTX_set_ec_param_enc(EVP_PKEY_CTX *ctx, int encoding) { - // BoringSSL only supports named curve syntax. - if (encoding != OPENSSL_EC_NAMED_CURVE) { - OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_PARAMETERS); - return 0; - } - return 1; -} diff --git a/third_party/boringssl/src/crypto/evp/p_ec.cc b/third_party/boringssl/src/crypto/evp/p_ec.cc new file mode 100644 index 00000000..ac9a489d --- /dev/null +++ b/third_party/boringssl/src/crypto/evp/p_ec.cc @@ -0,0 +1,615 @@ +// Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../ec/internal.h" +#include "../fipsmodule/ec/internal.h" +#include "../internal.h" +#include "../mem_internal.h" +#include "internal.h" + + +using namespace bssl; + +namespace { + +struct EVP_PKEY_ALG_EC : public EVP_PKEY_ALG { + // ec_group returns the |EC_GROUP| for this algorithm. + const EC_GROUP *(*ec_group)(); +}; + +extern const EVP_PKEY_ASN1_METHOD ec_asn1_meth; +extern const EVP_PKEY_CTX_METHOD ec_pkey_meth; + +static int eckey_pub_encode(CBB *out, const EvpPkey *key) { + const EC_KEY *ec_key = reinterpret_cast(key->pkey); + const EC_GROUP *group = EC_KEY_get0_group(ec_key); + const EC_POINT *public_key = EC_KEY_get0_public_key(ec_key); + + // See RFC 5480, section 2. + CBB spki, algorithm, key_bitstring; + if (!CBB_add_asn1(out, &spki, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1(&spki, &algorithm, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_element(&algorithm, CBS_ASN1_OBJECT, ec_asn1_meth.oid, + ec_asn1_meth.oid_len) || + !EC_KEY_marshal_curve_name(&algorithm, group) || + !CBB_add_asn1(&spki, &key_bitstring, CBS_ASN1_BITSTRING) || + !CBB_add_u8(&key_bitstring, 0 /* padding */) || + !EC_POINT_point2cbb(&key_bitstring, group, public_key, + POINT_CONVERSION_UNCOMPRESSED, nullptr) || + !CBB_flush(out)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_ENCODE_ERROR); + return 0; + } + + return 1; +} + +static bssl::evp_decode_result_t eckey_pub_decode(const EVP_PKEY_ALG *alg, + EvpPkey *out, CBS *params, + CBS *key) { + const auto *ec_alg = static_cast(alg); + if (ec_alg->ec_group == nullptr) { + return evp_decode_unsupported; + } + + // See RFC 5480, section 2. + + // Check that |params| matches |alg|. Only the namedCurve form is allowed. 
+ const EC_GROUP *group = ec_alg->ec_group(); + if (ec_key_parse_curve_name(params, Span(&group, 1)) == nullptr) { + if (ERR_equals(ERR_peek_last_error(), ERR_LIB_EC, EC_R_UNKNOWN_GROUP)) { + ERR_clear_error(); + return evp_decode_unsupported; + } + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return evp_decode_error; + } + if (CBS_len(params) != 0) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return evp_decode_error; + } + + UniquePtr eckey(EC_KEY_new()); + if (eckey == nullptr || // + !EC_KEY_set_group(eckey.get(), group) || + !EC_KEY_oct2key(eckey.get(), CBS_data(key), CBS_len(key), nullptr)) { + return evp_decode_error; + } + + EVP_PKEY_assign_EC_KEY(out, eckey.release()); + return evp_decode_ok; +} + +static bool eckey_pub_equal(const EvpPkey *a, const EvpPkey *b) { + const EC_KEY *a_ec = reinterpret_cast(a->pkey); + const EC_KEY *b_ec = reinterpret_cast(b->pkey); + const EC_GROUP *group = EC_KEY_get0_group(b_ec); + const EC_POINT *pa = EC_KEY_get0_public_key(a_ec), + *pb = EC_KEY_get0_public_key(b_ec); + return EC_POINT_cmp(group, pa, pb, nullptr) == 0; +} + +static bssl::evp_decode_result_t eckey_priv_decode(const EVP_PKEY_ALG *alg, + EvpPkey *out, CBS *params, + CBS *key) { + const auto *ec_alg = static_cast(alg); + if (ec_alg->ec_group == nullptr) { + return evp_decode_unsupported; + } + + // See RFC 5915. 
+ const EC_GROUP *group = ec_alg->ec_group(); + if (ec_key_parse_parameters(params, Span(&group, 1)) == nullptr) { + if (ERR_equals(ERR_peek_last_error(), ERR_LIB_EC, EC_R_UNKNOWN_GROUP)) { + ERR_clear_error(); + return evp_decode_unsupported; + } + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return evp_decode_error; + } + if (CBS_len(params) != 0) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return evp_decode_error; + } + + UniquePtr ec_key(ec_key_parse_private_key(key, group, {})); + if (ec_key == nullptr || CBS_len(key) != 0) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return evp_decode_error; + } + + EVP_PKEY_assign_EC_KEY(out, ec_key.release()); + return evp_decode_ok; +} + +static int eckey_priv_encode(CBB *out, const EvpPkey *key) { + const EC_KEY *ec_key = reinterpret_cast(key->pkey); + + // Omit the redundant copy of the curve name. This contradicts RFC 5915 but + // aligns with PKCS #11. SEC 1 only says they may be omitted if known by other + // means. Both OpenSSL and NSS omit the redundant parameters, so we omit them + // as well. + unsigned enc_flags = EC_KEY_get_enc_flags(ec_key) | EC_PKEY_NO_PARAMETERS; + + // See RFC 5915. 
+ CBB pkcs8, algorithm, private_key; + if (!CBB_add_asn1(out, &pkcs8, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_uint64(&pkcs8, 0 /* version */) || + !CBB_add_asn1(&pkcs8, &algorithm, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_element(&algorithm, CBS_ASN1_OBJECT, ec_asn1_meth.oid, + ec_asn1_meth.oid_len) || + !EC_KEY_marshal_curve_name(&algorithm, EC_KEY_get0_group(ec_key)) || + !CBB_add_asn1(&pkcs8, &private_key, CBS_ASN1_OCTETSTRING) || + !EC_KEY_marshal_private_key(&private_key, ec_key, enc_flags) || + !CBB_flush(out)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_ENCODE_ERROR); + return 0; + } + + return 1; +} + +static int eckey_set1_tls_encodedpoint(EvpPkey *pkey, const uint8_t *in, + size_t len) { + EC_KEY *ec_key = reinterpret_cast(pkey->pkey); + if (ec_key == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_NO_KEY_SET); + return 0; + } + + return EC_KEY_oct2key(ec_key, in, len, nullptr); +} + +static size_t eckey_get1_tls_encodedpoint(const EvpPkey *pkey, + uint8_t **out_ptr) { + const EC_KEY *ec_key = reinterpret_cast(pkey->pkey); + if (ec_key == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_NO_KEY_SET); + return 0; + } + + return EC_KEY_key2buf(ec_key, POINT_CONVERSION_UNCOMPRESSED, out_ptr, + nullptr); +} + +static int int_ec_size(const EvpPkey *pkey) { + const EC_KEY *ec_key = reinterpret_cast(pkey->pkey); + return ECDSA_size(ec_key); +} + +static int ec_bits(const EvpPkey *pkey) { + const EC_KEY *ec_key = reinterpret_cast(pkey->pkey); + const EC_GROUP *group = EC_KEY_get0_group(ec_key); + if (group == nullptr) { + ERR_clear_error(); + return 0; + } + return EC_GROUP_order_bits(group); +} + +static int ec_missing_parameters(const EvpPkey *pkey) { + const EC_KEY *ec_key = reinterpret_cast(pkey->pkey); + return ec_key == nullptr || EC_KEY_get0_group(ec_key) == nullptr; +} + +static int ec_copy_parameters(EvpPkey *to, const EvpPkey *from) { + const EC_KEY *from_key = reinterpret_cast(from->pkey); + if (from_key == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_NO_KEY_SET); + return 0; + } + 
const EC_GROUP *group = EC_KEY_get0_group(from_key); + if (group == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_MISSING_PARAMETERS); + return 0; + } + if (to->pkey == nullptr) { + to->pkey = EC_KEY_new(); + if (to->pkey == nullptr) { + return 0; + } + } + return EC_KEY_set_group(reinterpret_cast(to->pkey), group); +} + +static bool ec_equal_parameters(const EvpPkey *a, const EvpPkey *b) { + const EC_KEY *a_ec = reinterpret_cast(a->pkey); + const EC_KEY *b_ec = reinterpret_cast(b->pkey); + if (a_ec == nullptr || b_ec == nullptr) { + return false; + } + const EC_GROUP *group_a = EC_KEY_get0_group(a_ec), + *group_b = EC_KEY_get0_group(b_ec); + if (group_a == nullptr || group_b == nullptr) { + return false; + } + // EC_GROUP_cmp returns zero on equality. + return EC_GROUP_cmp(group_a, group_b, nullptr) == 0; +} + +static void int_ec_free(EvpPkey *pkey) { + EC_KEY_free(reinterpret_cast(pkey->pkey)); + pkey->pkey = nullptr; +} + +static int eckey_opaque(const EvpPkey *pkey) { + const EC_KEY *ec_key = reinterpret_cast(pkey->pkey); + return EC_KEY_is_opaque(ec_key); +} + +static bool eckey_pub_present(const EvpPkey *pkey) { + const EC_KEY *ec_key = reinterpret_cast(pkey->pkey); + return EC_KEY_get0_public_key(ec_key) != nullptr; +} + +static bool eckey_pub_copy(EvpPkey *out, const EvpPkey *pkey) { + const EC_KEY *ec_key = reinterpret_cast(pkey->pkey); + const EC_POINT *public_key = EC_KEY_get0_public_key(ec_key); + if (public_key == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_MISSING_PUBLIC_KEY); + return false; + } + UniquePtr public_copy_ec_key(EC_KEY_new()); + if (public_copy_ec_key == nullptr || + !EC_KEY_set_group(public_copy_ec_key.get(), public_key->group) || + !EC_KEY_set_public_key(public_copy_ec_key.get(), public_key)) { + OPENSSL_PUT_ERROR(EVP, ERR_R_INTERNAL_ERROR); + return false; + } + evp_pkey_set0(out, pkey->ameth, public_copy_ec_key.release()); + return true; +} + +static bool eckey_priv_present(const EvpPkey *pkey) { + const EC_KEY *ec_key = 
reinterpret_cast(pkey->pkey); + return EC_KEY_get0_private_key(ec_key) != nullptr; +} + +const EVP_PKEY_ASN1_METHOD ec_asn1_meth = { + EVP_PKEY_EC, + // 1.2.840.10045.2.1 + {0x2a, 0x86, 0x48, 0xce, 0x3d, 0x02, 0x01}, + 7, + + &ec_pkey_meth, + + eckey_pub_decode, + eckey_pub_encode, + eckey_pub_equal, + eckey_pub_present, + eckey_pub_copy, + + eckey_priv_decode, + eckey_priv_encode, + eckey_priv_present, + + /*set_priv_raw=*/nullptr, + /*set_priv_seed=*/nullptr, + /*set_pub_raw=*/nullptr, + /*get_priv_raw=*/nullptr, + /*get_priv_seed=*/nullptr, + /*get_pub_raw=*/nullptr, + eckey_set1_tls_encodedpoint, + eckey_get1_tls_encodedpoint, + + eckey_opaque, + + int_ec_size, + ec_bits, + + ec_missing_parameters, + ec_copy_parameters, + ec_equal_parameters, + + int_ec_free, +}; + +struct EC_PKEY_CTX { + // message digest + const EVP_MD *md = nullptr; + const EC_GROUP *gen_group = nullptr; +}; + +static int pkey_ec_init(EvpPkeyCtx *ctx, const EVP_PKEY_ALG *alg) { + EC_PKEY_CTX *dctx = New(); + if (!dctx) { + return 0; + } + + const auto *ec_alg = static_cast(alg); + if (ec_alg != nullptr && ec_alg->ec_group != nullptr) { + dctx->gen_group = ec_alg->ec_group(); + } + + ctx->data = dctx; + return 1; +} + +static int pkey_ec_copy(EvpPkeyCtx *dst, EvpPkeyCtx *src) { + if (!pkey_ec_init(dst, nullptr)) { + return 0; + } + + const EC_PKEY_CTX *sctx = reinterpret_cast(src->data); + EC_PKEY_CTX *dctx = reinterpret_cast(dst->data); + dctx->md = sctx->md; + dctx->gen_group = sctx->gen_group; + return 1; +} + +static void pkey_ec_cleanup(EvpPkeyCtx *ctx) { + Delete(reinterpret_cast(ctx->data)); +} + +static int pkey_ec_sign(EvpPkeyCtx *ctx, uint8_t *sig, size_t *siglen, + const uint8_t *tbs, size_t tbslen) { + const EC_KEY *ec = reinterpret_cast(ctx->pkey->pkey); + if (!sig) { + *siglen = ECDSA_size(ec); + return 1; + } else if (*siglen < (size_t)ECDSA_size(ec)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_BUFFER_TOO_SMALL); + return 0; + } + + unsigned int sltmp; + if (!ECDSA_sign(0, tbs, tbslen, 
sig, &sltmp, ec)) { + return 0; + } + *siglen = (size_t)sltmp; + return 1; +} + +static int pkey_ec_verify(EvpPkeyCtx *ctx, const uint8_t *sig, size_t siglen, + const uint8_t *tbs, size_t tbslen) { + const EC_KEY *ec_key = reinterpret_cast(ctx->pkey->pkey); + return ECDSA_verify(0, tbs, tbslen, sig, siglen, ec_key); +} + +static int pkey_ec_derive(EvpPkeyCtx *ctx, uint8_t *key, size_t *keylen) { + if (!ctx->pkey || !ctx->peerkey) { + OPENSSL_PUT_ERROR(EVP, EVP_R_KEYS_NOT_SET); + return 0; + } + + const EC_KEY *eckey = reinterpret_cast(ctx->pkey->pkey); + if (!key) { + const EC_GROUP *group; + group = EC_KEY_get0_group(eckey); + *keylen = (EC_GROUP_get_degree(group) + 7) / 8; + return 1; + } + + const EC_KEY *eckey_peer = reinterpret_cast(ctx->peerkey->pkey); + const EC_POINT *pubkey = EC_KEY_get0_public_key(eckey_peer); + + // NB: unlike PKCS#3 DH, if *outlen is less than maximum size this is + // not an error, the result is truncated. + size_t outlen = *keylen; + int ret = ECDH_compute_key(key, outlen, pubkey, eckey, nullptr); + if (ret < 0) { + return 0; + } + *keylen = ret; + return 1; +} + +static int pkey_ec_ctrl(EvpPkeyCtx *ctx, int type, int p1, void *p2) { + EC_PKEY_CTX *dctx = reinterpret_cast(ctx->data); + + switch (type) { + case EVP_PKEY_CTRL_MD: { + const EVP_MD *md = reinterpret_cast(p2); + int md_type = EVP_MD_type(md); + if (md_type != NID_sha1 && md_type != NID_sha224 && + md_type != NID_sha256 && md_type != NID_sha384 && + md_type != NID_sha512) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_DIGEST_TYPE); + return 0; + } + dctx->md = md; + return 1; + } + + case EVP_PKEY_CTRL_GET_MD: + *(const EVP_MD **)p2 = dctx->md; + return 1; + + case EVP_PKEY_CTRL_PEER_KEY: + // Default behaviour is OK + return 1; + + case EVP_PKEY_CTRL_EC_PARAMGEN_GROUP: { + dctx->gen_group = static_cast(p2); + return 1; + } + + default: + OPENSSL_PUT_ERROR(EVP, EVP_R_COMMAND_NOT_SUPPORTED); + return 0; + } +} + +static int pkey_ec_keygen(EvpPkeyCtx *ctx, EvpPkey *pkey) { + 
EC_PKEY_CTX *dctx = reinterpret_cast(ctx->data); + const EC_GROUP *group = dctx->gen_group; + if (group == nullptr) { + if (ctx->pkey == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_NO_PARAMETERS_SET); + return 0; + } + group = EC_KEY_get0_group(reinterpret_cast(ctx->pkey->pkey)); + } + EC_KEY *ec = EC_KEY_new(); + if (ec == nullptr || !EC_KEY_set_group(ec, group) || + !EC_KEY_generate_key(ec)) { + EC_KEY_free(ec); + return 0; + } + EVP_PKEY_assign_EC_KEY(pkey, ec); + return 1; +} + +static int pkey_ec_paramgen(EvpPkeyCtx *ctx, EvpPkey *pkey) { + EC_PKEY_CTX *dctx = reinterpret_cast(ctx->data); + if (dctx->gen_group == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_NO_PARAMETERS_SET); + return 0; + } + EC_KEY *ec = EC_KEY_new(); + if (ec == nullptr || !EC_KEY_set_group(ec, dctx->gen_group)) { + EC_KEY_free(ec); + return 0; + } + EVP_PKEY_assign_EC_KEY(pkey, ec); + return 1; +} + +const EVP_PKEY_CTX_METHOD ec_pkey_meth = { + EVP_PKEY_EC, + pkey_ec_init, + pkey_ec_copy, + pkey_ec_cleanup, + pkey_ec_keygen, + pkey_ec_sign, + nullptr /* sign_message */, + pkey_ec_verify, + nullptr /* verify_message */, + nullptr /* verify_recover */, + nullptr /* encrypt */, + nullptr /* decrypt */, + pkey_ec_derive, + pkey_ec_paramgen, + /*encap=*/nullptr, + /*decap=*/nullptr, + pkey_ec_ctrl, +}; + +} // namespace + +const EVP_PKEY_ALG *EVP_pkey_ec_p224() { + static const EVP_PKEY_ALG_EC kAlg = {{&ec_asn1_meth, &ec_pkey_meth}, + &EC_group_p224}; + return &kAlg; +} + +const EVP_PKEY_ALG *EVP_pkey_ec_p256() { + static const EVP_PKEY_ALG_EC kAlg = {{&ec_asn1_meth, &ec_pkey_meth}, + &EC_group_p256}; + return &kAlg; +} + +const EVP_PKEY_ALG *EVP_pkey_ec_p384() { + static const EVP_PKEY_ALG_EC kAlg = {{&ec_asn1_meth, &ec_pkey_meth}, + &EC_group_p384}; + return &kAlg; +} + +const EVP_PKEY_ALG *EVP_pkey_ec_p521() { + static const EVP_PKEY_ALG_EC kAlg = {{&ec_asn1_meth, &ec_pkey_meth}, + &EC_group_p521}; + return &kAlg; +} + +const EVP_PKEY_ALG *bssl::evp_pkey_ec_no_curve() { + static const 
EVP_PKEY_ALG_EC kAlg = {{&ec_asn1_meth, &ec_pkey_meth}, nullptr}; + return &kAlg; +} + +int EVP_PKEY_set1_EC_KEY(EVP_PKEY *pkey, EC_KEY *key) { + if (EVP_PKEY_assign_EC_KEY(pkey, key)) { + EC_KEY_up_ref(key); + return 1; + } + return 0; +} + +int EVP_PKEY_assign_EC_KEY(EVP_PKEY *pkey, EC_KEY *key) { + if (key == nullptr) { + return 0; + } + evp_pkey_set0(FromOpaque(pkey), &ec_asn1_meth, key); + return 1; +} + +EC_KEY *EVP_PKEY_get0_EC_KEY(const EVP_PKEY *pkey) { + if (EVP_PKEY_id(pkey) != EVP_PKEY_EC) { + OPENSSL_PUT_ERROR(EVP, EVP_R_EXPECTING_A_EC_KEY); + return nullptr; + } + return reinterpret_cast(FromOpaque(pkey)->pkey); +} + +EC_KEY *EVP_PKEY_get1_EC_KEY(const EVP_PKEY *pkey) { + EC_KEY *ec_key = EVP_PKEY_get0_EC_KEY(pkey); + if (ec_key != nullptr) { + EC_KEY_up_ref(ec_key); + } + return ec_key; +} + +int EVP_PKEY_get_ec_curve_nid(const EVP_PKEY *pkey) { + const EC_KEY *ec_key = EVP_PKEY_get0_EC_KEY(pkey); + if (ec_key == nullptr) { + return NID_undef; + } + const EC_GROUP *group = EC_KEY_get0_group(ec_key); + if (group == nullptr) { + return NID_undef; + } + return EC_GROUP_get_curve_name(group); +} + +int EVP_PKEY_get_ec_point_conv_form(const EVP_PKEY *pkey) { + const EC_KEY *ec_key = EVP_PKEY_get0_EC_KEY(pkey); + if (ec_key == nullptr) { + return 0; + } + return EC_KEY_get_conv_form(ec_key); +} + +int EVP_PKEY_CTX_set_ec_paramgen_curve_nid(EVP_PKEY_CTX *ctx, int nid) { + const EC_GROUP *group = EC_GROUP_new_by_curve_name(nid); + if (group == nullptr) { + return 0; + } + return EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_EC, EVP_PKEY_OP_TYPE_GEN, + EVP_PKEY_CTRL_EC_PARAMGEN_GROUP, 0, + const_cast(group)); +} + +int EVP_PKEY_CTX_set_ec_param_enc(EVP_PKEY_CTX *ctx, int encoding) { + // BoringSSL only supports named curve syntax. 
+ if (encoding != OPENSSL_EC_NAMED_CURVE) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_PARAMETERS); + return 0; + } + return 1; +} diff --git a/third_party/boringssl/src/crypto/evp/p_ec_asn1.c b/third_party/boringssl/src/crypto/evp/p_ec_asn1.c deleted file mode 100644 index dd421217..00000000 --- a/third_party/boringssl/src/crypto/evp/p_ec_asn1.c +++ /dev/null @@ -1,257 +0,0 @@ -/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL - * project 2006. - */ -/* ==================================================================== - * Copyright (c) 2006 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * licensing@OpenSSL.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. 
Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). */ - -#include - -#include -#include -#include -#include -#include -#include - -#include "internal.h" - - -static int eckey_pub_encode(CBB *out, const EVP_PKEY *key) { - const EC_KEY *ec_key = key->pkey.ec; - const EC_GROUP *group = EC_KEY_get0_group(ec_key); - const EC_POINT *public_key = EC_KEY_get0_public_key(ec_key); - - // See RFC 5480, section 2. 
- CBB spki, algorithm, oid, key_bitstring; - if (!CBB_add_asn1(out, &spki, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1(&spki, &algorithm, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1(&algorithm, &oid, CBS_ASN1_OBJECT) || - !CBB_add_bytes(&oid, ec_asn1_meth.oid, ec_asn1_meth.oid_len) || - !EC_KEY_marshal_curve_name(&algorithm, group) || - !CBB_add_asn1(&spki, &key_bitstring, CBS_ASN1_BITSTRING) || - !CBB_add_u8(&key_bitstring, 0 /* padding */) || - !EC_POINT_point2cbb(&key_bitstring, group, public_key, - POINT_CONVERSION_UNCOMPRESSED, NULL) || - !CBB_flush(out)) { - OPENSSL_PUT_ERROR(EVP, EVP_R_ENCODE_ERROR); - return 0; - } - - return 1; -} - -static int eckey_pub_decode(EVP_PKEY *out, CBS *params, CBS *key) { - // See RFC 5480, section 2. - - // The parameters are a named curve. - EC_POINT *point = NULL; - EC_KEY *eckey = NULL; - EC_GROUP *group = EC_KEY_parse_curve_name(params); - if (group == NULL || CBS_len(params) != 0) { - OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); - goto err; - } - - eckey = EC_KEY_new(); - if (eckey == NULL || !EC_KEY_set_group(eckey, group)) { - goto err; - } - - point = EC_POINT_new(group); - if (point == NULL || - !EC_POINT_oct2point(group, point, CBS_data(key), CBS_len(key), NULL) || - !EC_KEY_set_public_key(eckey, point)) { - goto err; - } - - EC_GROUP_free(group); - EC_POINT_free(point); - EVP_PKEY_assign_EC_KEY(out, eckey); - return 1; - -err: - EC_GROUP_free(group); - EC_POINT_free(point); - EC_KEY_free(eckey); - return 0; -} - -static int eckey_pub_cmp(const EVP_PKEY *a, const EVP_PKEY *b) { - int r; - const EC_GROUP *group = EC_KEY_get0_group(b->pkey.ec); - const EC_POINT *pa = EC_KEY_get0_public_key(a->pkey.ec), - *pb = EC_KEY_get0_public_key(b->pkey.ec); - r = EC_POINT_cmp(group, pa, pb, NULL); - if (r == 0) { - return 1; - } else if (r == 1) { - return 0; - } else { - return -2; - } -} - -static int eckey_priv_decode(EVP_PKEY *out, CBS *params, CBS *key) { - // See RFC 5915. 
- EC_GROUP *group = EC_KEY_parse_parameters(params); - if (group == NULL || CBS_len(params) != 0) { - OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); - EC_GROUP_free(group); - return 0; - } - - EC_KEY *ec_key = EC_KEY_parse_private_key(key, group); - EC_GROUP_free(group); - if (ec_key == NULL || CBS_len(key) != 0) { - OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); - EC_KEY_free(ec_key); - return 0; - } - - EVP_PKEY_assign_EC_KEY(out, ec_key); - return 1; -} - -static int eckey_priv_encode(CBB *out, const EVP_PKEY *key) { - const EC_KEY *ec_key = key->pkey.ec; - - // Omit the redundant copy of the curve name. This contradicts RFC 5915 but - // aligns with PKCS #11. SEC 1 only says they may be omitted if known by other - // means. Both OpenSSL and NSS omit the redundant parameters, so we omit them - // as well. - unsigned enc_flags = EC_KEY_get_enc_flags(ec_key) | EC_PKEY_NO_PARAMETERS; - - // See RFC 5915. - CBB pkcs8, algorithm, oid, private_key; - if (!CBB_add_asn1(out, &pkcs8, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1_uint64(&pkcs8, 0 /* version */) || - !CBB_add_asn1(&pkcs8, &algorithm, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1(&algorithm, &oid, CBS_ASN1_OBJECT) || - !CBB_add_bytes(&oid, ec_asn1_meth.oid, ec_asn1_meth.oid_len) || - !EC_KEY_marshal_curve_name(&algorithm, EC_KEY_get0_group(ec_key)) || - !CBB_add_asn1(&pkcs8, &private_key, CBS_ASN1_OCTETSTRING) || - !EC_KEY_marshal_private_key(&private_key, ec_key, enc_flags) || - !CBB_flush(out)) { - OPENSSL_PUT_ERROR(EVP, EVP_R_ENCODE_ERROR); - return 0; - } - - return 1; -} - -static int int_ec_size(const EVP_PKEY *pkey) { - return ECDSA_size(pkey->pkey.ec); -} - -static int ec_bits(const EVP_PKEY *pkey) { - const EC_GROUP *group = EC_KEY_get0_group(pkey->pkey.ec); - if (group == NULL) { - ERR_clear_error(); - return 0; - } - return BN_num_bits(EC_GROUP_get0_order(group)); -} - -static int ec_missing_parameters(const EVP_PKEY *pkey) { - return EC_KEY_get0_group(pkey->pkey.ec) == NULL; -} - -static int 
ec_copy_parameters(EVP_PKEY *to, const EVP_PKEY *from) { - return EC_KEY_set_group(to->pkey.ec, EC_KEY_get0_group(from->pkey.ec)); -} - -static int ec_cmp_parameters(const EVP_PKEY *a, const EVP_PKEY *b) { - const EC_GROUP *group_a = EC_KEY_get0_group(a->pkey.ec), - *group_b = EC_KEY_get0_group(b->pkey.ec); - if (EC_GROUP_cmp(group_a, group_b, NULL) != 0) { - // mismatch - return 0; - } - return 1; -} - -static void int_ec_free(EVP_PKEY *pkey) { EC_KEY_free(pkey->pkey.ec); } - -static int eckey_opaque(const EVP_PKEY *pkey) { - return EC_KEY_is_opaque(pkey->pkey.ec); -} - -const EVP_PKEY_ASN1_METHOD ec_asn1_meth = { - EVP_PKEY_EC, - // 1.2.840.10045.2.1 - {0x2a, 0x86, 0x48, 0xce, 0x3d, 0x02, 0x01}, 7, - - &ec_pkey_meth, - - eckey_pub_decode, - eckey_pub_encode, - eckey_pub_cmp, - - eckey_priv_decode, - eckey_priv_encode, - - NULL /* set_priv_raw */, - NULL /* set_pub_raw */, - NULL /* get_priv_raw */, - NULL /* get_pub_raw */, - - eckey_opaque, - - int_ec_size, - ec_bits, - - ec_missing_parameters, - ec_copy_parameters, - ec_cmp_parameters, - - int_ec_free, -}; diff --git a/third_party/boringssl/src/crypto/evp/p_ed25519.c b/third_party/boringssl/src/crypto/evp/p_ed25519.c deleted file mode 100644 index b3d8cc98..00000000 --- a/third_party/boringssl/src/crypto/evp/p_ed25519.c +++ /dev/null @@ -1,104 +0,0 @@ -/* Copyright (c) 2017, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include - -#include -#include -#include - -#include "internal.h" - - -// Ed25519 has no parameters to copy. -static int pkey_ed25519_copy(EVP_PKEY_CTX *dst, EVP_PKEY_CTX *src) { return 1; } - -static int pkey_ed25519_keygen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey) { - ED25519_KEY *key = OPENSSL_malloc(sizeof(ED25519_KEY)); - if (key == NULL) { - OPENSSL_PUT_ERROR(EVP, ERR_R_MALLOC_FAILURE); - return 0; - } - - if (!EVP_PKEY_set_type(pkey, EVP_PKEY_ED25519)) { - OPENSSL_free(key); - return 0; - } - - uint8_t pubkey_unused[32]; - ED25519_keypair(pubkey_unused, key->key); - key->has_private = 1; - - OPENSSL_free(pkey->pkey.ptr); - pkey->pkey.ptr = key; - return 1; -} - -static int pkey_ed25519_sign_message(EVP_PKEY_CTX *ctx, uint8_t *sig, - size_t *siglen, const uint8_t *tbs, - size_t tbslen) { - ED25519_KEY *key = ctx->pkey->pkey.ptr; - if (!key->has_private) { - OPENSSL_PUT_ERROR(EVP, EVP_R_NOT_A_PRIVATE_KEY); - return 0; - } - - if (sig == NULL) { - *siglen = 64; - return 1; - } - - if (*siglen < 64) { - OPENSSL_PUT_ERROR(EVP, EVP_R_BUFFER_TOO_SMALL); - return 0; - } - - if (!ED25519_sign(sig, tbs, tbslen, key->key)) { - return 0; - } - - *siglen = 64; - return 1; -} - -static int pkey_ed25519_verify_message(EVP_PKEY_CTX *ctx, const uint8_t *sig, - size_t siglen, const uint8_t *tbs, - size_t tbslen) { - ED25519_KEY *key = ctx->pkey->pkey.ptr; - if (siglen != 64 || - !ED25519_verify(tbs, tbslen, sig, key->key + ED25519_PUBLIC_KEY_OFFSET)) { - OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_SIGNATURE); - return 0; - } - - return 1; -} - -const EVP_PKEY_METHOD ed25519_pkey_meth = { - EVP_PKEY_ED25519, - NULL /* init */, - pkey_ed25519_copy, - NULL 
/* cleanup */, - pkey_ed25519_keygen, - NULL /* sign */, - pkey_ed25519_sign_message, - NULL /* verify */, - pkey_ed25519_verify_message, - NULL /* verify_recover */, - NULL /* encrypt */, - NULL /* decrypt */, - NULL /* derive */, - NULL /* paramgen */, - NULL /* ctrl */, -}; diff --git a/third_party/boringssl/src/crypto/evp/p_ed25519.cc b/third_party/boringssl/src/crypto/evp/p_ed25519.cc new file mode 100644 index 00000000..3ca66b7c --- /dev/null +++ b/third_party/boringssl/src/crypto/evp/p_ed25519.cc @@ -0,0 +1,359 @@ +// Copyright 2017 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include +#include + +#include "../internal.h" +#include "../mem_internal.h" +#include "internal.h" + +using namespace bssl; + +namespace { + +struct ED25519_KEY { + // key is the concatenation of the private seed and public key. It is stored + // as a single 64-bit array to allow passing to |ED25519_sign|. If + // |has_private| is false, the first 32 bytes are uninitialized and the public + // key is in the last 32 bytes. 
+ uint8_t key[64]; + bool has_private; +}; + +extern const EVP_PKEY_ASN1_METHOD ed25519_asn1_meth; +extern const EVP_PKEY_CTX_METHOD ed25519_pkey_meth; + +#define ED25519_PUBLIC_KEY_OFFSET 32 + +static void ed25519_free(EvpPkey *pkey) { + ED25519_KEY *key = reinterpret_cast(pkey->pkey); + Delete(key); + pkey->pkey = nullptr; +} + +static int ed25519_set_priv_raw(EvpPkey *pkey, const uint8_t *in, size_t len) { + if (len != 32) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return 0; + } + + ED25519_KEY *key = New(); + if (key == nullptr) { + return 0; + } + + // The RFC 8032 encoding stores only the 32-byte seed, so we must recover the + // full representation which we use from it. + uint8_t pubkey_unused[32]; + ED25519_keypair_from_seed(pubkey_unused, key->key, in); + key->has_private = true; + evp_pkey_set0(pkey, &ed25519_asn1_meth, key); + return 1; +} + +static int ed25519_set_pub_raw(EvpPkey *pkey, const uint8_t *in, size_t len) { + if (len != 32) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return 0; + } + + ED25519_KEY *key = New(); + if (key == nullptr) { + return 0; + } + + OPENSSL_memcpy(key->key + ED25519_PUBLIC_KEY_OFFSET, in, 32); + key->has_private = false; + evp_pkey_set0(pkey, &ed25519_asn1_meth, key); + return 1; +} + +static int ed25519_get_priv_raw(const EvpPkey *pkey, uint8_t *out, + size_t *out_len) { + const ED25519_KEY *key = reinterpret_cast(pkey->pkey); + if (!key->has_private) { + OPENSSL_PUT_ERROR(EVP, EVP_R_NOT_A_PRIVATE_KEY); + return 0; + } + + if (out == nullptr) { + *out_len = 32; + return 1; + } + + if (*out_len < 32) { + OPENSSL_PUT_ERROR(EVP, EVP_R_BUFFER_TOO_SMALL); + return 0; + } + + // The raw private key format is the first 32 bytes of the private key. 
+ OPENSSL_memcpy(out, key->key, 32); + *out_len = 32; + return 1; +} + +static int ed25519_get_pub_raw(const EvpPkey *pkey, uint8_t *out, + size_t *out_len) { + const ED25519_KEY *key = reinterpret_cast(pkey->pkey); + if (out == nullptr) { + *out_len = 32; + return 1; + } + + if (*out_len < 32) { + OPENSSL_PUT_ERROR(EVP, EVP_R_BUFFER_TOO_SMALL); + return 0; + } + + OPENSSL_memcpy(out, key->key + ED25519_PUBLIC_KEY_OFFSET, 32); + *out_len = 32; + return 1; +} + +static bssl::evp_decode_result_t ed25519_pub_decode(const EVP_PKEY_ALG *alg, + EvpPkey *out, CBS *params, + CBS *key) { + // See RFC 8410, section 4. + + // The parameters must be omitted. Public keys have length 32. + if (CBS_len(params) != 0) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return evp_decode_error; + } + + return ed25519_set_pub_raw(out, CBS_data(key), CBS_len(key)) + ? evp_decode_ok + : evp_decode_error; +} + +static int ed25519_pub_encode(CBB *out, const EvpPkey *pkey) { + const ED25519_KEY *key = reinterpret_cast(pkey->pkey); + + // See RFC 8410, section 4. 
+ CBB spki, algorithm, key_bitstring; + if (!CBB_add_asn1(out, &spki, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1(&spki, &algorithm, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_element(&algorithm, CBS_ASN1_OBJECT, ed25519_asn1_meth.oid, + ed25519_asn1_meth.oid_len) || + !CBB_add_asn1(&spki, &key_bitstring, CBS_ASN1_BITSTRING) || + !CBB_add_u8(&key_bitstring, 0 /* padding */) || + !CBB_add_bytes(&key_bitstring, key->key + ED25519_PUBLIC_KEY_OFFSET, + 32) || + !CBB_flush(out)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_ENCODE_ERROR); + return 0; + } + + return 1; +} + +static bool ed25519_pub_equal(const EvpPkey *a, const EvpPkey *b) { + const ED25519_KEY *a_key = reinterpret_cast(a->pkey); + const ED25519_KEY *b_key = reinterpret_cast(b->pkey); + return OPENSSL_memcmp(a_key->key + ED25519_PUBLIC_KEY_OFFSET, + b_key->key + ED25519_PUBLIC_KEY_OFFSET, 32) == 0; +} + +static bssl::evp_decode_result_t ed25519_priv_decode(const EVP_PKEY_ALG *alg, + EvpPkey *out, CBS *params, + CBS *key) { + // See RFC 8410, section 7. + + // Parameters must be empty. The key is a 32-byte value wrapped in an extra + // OCTET STRING layer. + CBS inner; + if (CBS_len(params) != 0 || + !CBS_get_asn1(key, &inner, CBS_ASN1_OCTETSTRING) || CBS_len(key) != 0) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return evp_decode_error; + } + + return ed25519_set_priv_raw(out, CBS_data(&inner), CBS_len(&inner)) + ? evp_decode_ok + : evp_decode_error; +} + +static int ed25519_priv_encode(CBB *out, const EvpPkey *pkey) { + const ED25519_KEY *key = reinterpret_cast(pkey->pkey); + if (!key->has_private) { + OPENSSL_PUT_ERROR(EVP, EVP_R_NOT_A_PRIVATE_KEY); + return 0; + } + + // See RFC 8410, section 7. 
+ CBB pkcs8, algorithm, private_key, inner; + if (!CBB_add_asn1(out, &pkcs8, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_uint64(&pkcs8, 0 /* version */) || + !CBB_add_asn1(&pkcs8, &algorithm, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_element(&algorithm, CBS_ASN1_OBJECT, ed25519_asn1_meth.oid, + ed25519_asn1_meth.oid_len) || + !CBB_add_asn1(&pkcs8, &private_key, CBS_ASN1_OCTETSTRING) || + !CBB_add_asn1(&private_key, &inner, CBS_ASN1_OCTETSTRING) || + // The PKCS#8 encoding stores only the 32-byte seed which is the first 32 + // bytes of the private key. + !CBB_add_bytes(&inner, key->key, 32) || // + !CBB_flush(out)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_ENCODE_ERROR); + return 0; + } + + return 1; +} + +static bool ed25519_pub_present(const EvpPkey *) { return true; } + +static bool ed25519_pub_copy(EvpPkey *out, const EvpPkey *pkey) { + const ED25519_KEY *pkey_ed25519 = + reinterpret_cast(pkey->pkey); + ED25519_KEY *public_copy = New(); + if (public_copy == nullptr) { + return false; + } + OPENSSL_memcpy(public_copy->key + ED25519_PUBLIC_KEY_OFFSET, + pkey_ed25519->key + ED25519_PUBLIC_KEY_OFFSET, + ED25519_PUBLIC_KEY_LEN); + public_copy->has_private = false; + evp_pkey_set0(out, pkey->ameth, public_copy); + return true; +} + +static bool ed25519_priv_present(const EvpPkey *pkey) { + const ED25519_KEY *key = reinterpret_cast(pkey->pkey); + return key->has_private; +} + +static int ed25519_size(const EvpPkey *pkey) { return 64; } + +static int ed25519_bits(const EvpPkey *pkey) { return 253; } + +const EVP_PKEY_ASN1_METHOD ed25519_asn1_meth = { + EVP_PKEY_ED25519, + {0x2b, 0x65, 0x70}, + 3, + &ed25519_pkey_meth, + ed25519_pub_decode, + ed25519_pub_encode, + ed25519_pub_equal, + ed25519_pub_present, + ed25519_pub_copy, + ed25519_priv_decode, + ed25519_priv_encode, + ed25519_priv_present, + ed25519_set_priv_raw, + /*set_priv_seed=*/nullptr, + ed25519_set_pub_raw, + ed25519_get_priv_raw, + /*get_priv_seed=*/nullptr, + ed25519_get_pub_raw, + /*set1_tls_encodedpoint=*/nullptr, + 
/*get1_tls_encodedpoint=*/nullptr, + /*pkey_opaque=*/nullptr, + ed25519_size, + ed25519_bits, + /*param_missing=*/nullptr, + /*param_copy=*/nullptr, + /*param_equal=*/nullptr, + ed25519_free, +}; + +// Ed25519 has no parameters to copy. +static int pkey_ed25519_copy(EvpPkeyCtx *dst, EvpPkeyCtx *src) { return 1; } + +static int pkey_ed25519_keygen(EvpPkeyCtx *ctx, EvpPkey *pkey) { + ED25519_KEY *key = New(); + if (key == nullptr) { + return 0; + } + + uint8_t pubkey_unused[32]; + ED25519_keypair(pubkey_unused, key->key); + key->has_private = true; + + evp_pkey_set0(pkey, &ed25519_asn1_meth, key); + return 1; +} + +static int pkey_ed25519_sign_message(EvpPkeyCtx *ctx, uint8_t *sig, + size_t *siglen, const uint8_t *tbs, + size_t tbslen) { + const ED25519_KEY *key = + reinterpret_cast(ctx->pkey->pkey); + if (!key->has_private) { + OPENSSL_PUT_ERROR(EVP, EVP_R_NOT_A_PRIVATE_KEY); + return 0; + } + + if (sig == nullptr) { + *siglen = 64; + return 1; + } + + if (*siglen < 64) { + OPENSSL_PUT_ERROR(EVP, EVP_R_BUFFER_TOO_SMALL); + return 0; + } + + if (!ED25519_sign(sig, tbs, tbslen, key->key)) { + return 0; + } + + *siglen = 64; + return 1; +} + +static int pkey_ed25519_verify_message(EvpPkeyCtx *ctx, const uint8_t *sig, + size_t siglen, const uint8_t *tbs, + size_t tbslen) { + const ED25519_KEY *key = + reinterpret_cast(ctx->pkey->pkey); + if (siglen != 64 || + !ED25519_verify(tbs, tbslen, sig, key->key + ED25519_PUBLIC_KEY_OFFSET)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_SIGNATURE); + return 0; + } + + return 1; +} + +const EVP_PKEY_CTX_METHOD ed25519_pkey_meth = { + /*pkey_id=*/EVP_PKEY_ED25519, + /*init=*/nullptr, + /*copy=*/pkey_ed25519_copy, + /*cleanup=*/nullptr, + /*keygen=*/pkey_ed25519_keygen, + /*sign=*/nullptr, + /*sign_message=*/pkey_ed25519_sign_message, + /*verify=*/nullptr, + /*verify_message=*/pkey_ed25519_verify_message, + /*verify_recover=*/nullptr, + /*encrypt=*/nullptr, + /*decrypt=*/nullptr, + /*derive=*/nullptr, + /*paramgen=*/nullptr, + 
/*encap=*/nullptr, + /*decap=*/nullptr, + /*ctrl=*/nullptr, +}; + +} // namespace + +const EVP_PKEY_ALG *EVP_pkey_ed25519() { + static const EVP_PKEY_ALG kAlg = {&ed25519_asn1_meth, &ed25519_pkey_meth}; + return &kAlg; +} diff --git a/third_party/boringssl/src/crypto/evp/p_ed25519_asn1.c b/third_party/boringssl/src/crypto/evp/p_ed25519_asn1.c deleted file mode 100644 index f823c0db..00000000 --- a/third_party/boringssl/src/crypto/evp/p_ed25519_asn1.c +++ /dev/null @@ -1,224 +0,0 @@ -/* Copyright (c) 2017, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include - -#include -#include -#include -#include - -#include "internal.h" -#include "../internal.h" - - -static void ed25519_free(EVP_PKEY *pkey) { - OPENSSL_free(pkey->pkey.ptr); - pkey->pkey.ptr = NULL; -} - -static int ed25519_set_priv_raw(EVP_PKEY *pkey, const uint8_t *in, size_t len) { - if (len != 32) { - OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); - return 0; - } - - ED25519_KEY *key = OPENSSL_malloc(sizeof(ED25519_KEY)); - if (key == NULL) { - OPENSSL_PUT_ERROR(EVP, ERR_R_MALLOC_FAILURE); - return 0; - } - - // The RFC 8032 encoding stores only the 32-byte seed, so we must recover the - // full representation which we use from it. 
- uint8_t pubkey_unused[32]; - ED25519_keypair_from_seed(pubkey_unused, key->key, in); - key->has_private = 1; - - ed25519_free(pkey); - pkey->pkey.ptr = key; - return 1; -} - -static int ed25519_set_pub_raw(EVP_PKEY *pkey, const uint8_t *in, size_t len) { - if (len != 32) { - OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); - return 0; - } - - ED25519_KEY *key = OPENSSL_malloc(sizeof(ED25519_KEY)); - if (key == NULL) { - OPENSSL_PUT_ERROR(EVP, ERR_R_MALLOC_FAILURE); - return 0; - } - - OPENSSL_memcpy(key->key + ED25519_PUBLIC_KEY_OFFSET, in, 32); - key->has_private = 0; - - ed25519_free(pkey); - pkey->pkey.ptr = key; - return 1; -} - -static int ed25519_get_priv_raw(const EVP_PKEY *pkey, uint8_t *out, - size_t *out_len) { - const ED25519_KEY *key = pkey->pkey.ptr; - if (!key->has_private) { - OPENSSL_PUT_ERROR(EVP, EVP_R_NOT_A_PRIVATE_KEY); - return 0; - } - - if (out == NULL) { - *out_len = 32; - return 1; - } - - if (*out_len < 32) { - OPENSSL_PUT_ERROR(EVP, EVP_R_BUFFER_TOO_SMALL); - return 0; - } - - // The raw private key format is the first 32 bytes of the private key. - OPENSSL_memcpy(out, key->key, 32); - *out_len = 32; - return 1; -} - -static int ed25519_get_pub_raw(const EVP_PKEY *pkey, uint8_t *out, - size_t *out_len) { - const ED25519_KEY *key = pkey->pkey.ptr; - if (out == NULL) { - *out_len = 32; - return 1; - } - - if (*out_len < 32) { - OPENSSL_PUT_ERROR(EVP, EVP_R_BUFFER_TOO_SMALL); - return 0; - } - - OPENSSL_memcpy(out, key->key + ED25519_PUBLIC_KEY_OFFSET, 32); - *out_len = 32; - return 1; -} - -static int ed25519_pub_decode(EVP_PKEY *out, CBS *params, CBS *key) { - // See RFC 8410, section 4. - - // The parameters must be omitted. Public keys have length 32. 
- if (CBS_len(params) != 0) { - OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); - return 0; - } - - return ed25519_set_pub_raw(out, CBS_data(key), CBS_len(key)); -} - -static int ed25519_pub_encode(CBB *out, const EVP_PKEY *pkey) { - const ED25519_KEY *key = pkey->pkey.ptr; - - // See RFC 8410, section 4. - CBB spki, algorithm, oid, key_bitstring; - if (!CBB_add_asn1(out, &spki, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1(&spki, &algorithm, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1(&algorithm, &oid, CBS_ASN1_OBJECT) || - !CBB_add_bytes(&oid, ed25519_asn1_meth.oid, ed25519_asn1_meth.oid_len) || - !CBB_add_asn1(&spki, &key_bitstring, CBS_ASN1_BITSTRING) || - !CBB_add_u8(&key_bitstring, 0 /* padding */) || - !CBB_add_bytes(&key_bitstring, key->key + ED25519_PUBLIC_KEY_OFFSET, - 32) || - !CBB_flush(out)) { - OPENSSL_PUT_ERROR(EVP, EVP_R_ENCODE_ERROR); - return 0; - } - - return 1; -} - -static int ed25519_pub_cmp(const EVP_PKEY *a, const EVP_PKEY *b) { - const ED25519_KEY *a_key = a->pkey.ptr; - const ED25519_KEY *b_key = b->pkey.ptr; - return OPENSSL_memcmp(a_key->key + ED25519_PUBLIC_KEY_OFFSET, - b_key->key + ED25519_PUBLIC_KEY_OFFSET, 32) == 0; -} - -static int ed25519_priv_decode(EVP_PKEY *out, CBS *params, CBS *key) { - // See RFC 8410, section 7. - - // Parameters must be empty. The key is a 32-byte value wrapped in an extra - // OCTET STRING layer. - CBS inner; - if (CBS_len(params) != 0 || - !CBS_get_asn1(key, &inner, CBS_ASN1_OCTETSTRING) || - CBS_len(key) != 0) { - OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); - return 0; - } - - return ed25519_set_priv_raw(out, CBS_data(&inner), CBS_len(&inner)); -} - -static int ed25519_priv_encode(CBB *out, const EVP_PKEY *pkey) { - ED25519_KEY *key = pkey->pkey.ptr; - if (!key->has_private) { - OPENSSL_PUT_ERROR(EVP, EVP_R_NOT_A_PRIVATE_KEY); - return 0; - } - - // See RFC 8410, section 7. 
- CBB pkcs8, algorithm, oid, private_key, inner; - if (!CBB_add_asn1(out, &pkcs8, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1_uint64(&pkcs8, 0 /* version */) || - !CBB_add_asn1(&pkcs8, &algorithm, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1(&algorithm, &oid, CBS_ASN1_OBJECT) || - !CBB_add_bytes(&oid, ed25519_asn1_meth.oid, ed25519_asn1_meth.oid_len) || - !CBB_add_asn1(&pkcs8, &private_key, CBS_ASN1_OCTETSTRING) || - !CBB_add_asn1(&private_key, &inner, CBS_ASN1_OCTETSTRING) || - // The PKCS#8 encoding stores only the 32-byte seed which is the first 32 - // bytes of the private key. - !CBB_add_bytes(&inner, key->key, 32) || - !CBB_flush(out)) { - OPENSSL_PUT_ERROR(EVP, EVP_R_ENCODE_ERROR); - return 0; - } - - return 1; -} - -static int ed25519_size(const EVP_PKEY *pkey) { return 64; } - -static int ed25519_bits(const EVP_PKEY *pkey) { return 253; } - -const EVP_PKEY_ASN1_METHOD ed25519_asn1_meth = { - EVP_PKEY_ED25519, - {0x2b, 0x65, 0x70}, - 3, - &ed25519_pkey_meth, - ed25519_pub_decode, - ed25519_pub_encode, - ed25519_pub_cmp, - ed25519_priv_decode, - ed25519_priv_encode, - ed25519_set_priv_raw, - ed25519_set_pub_raw, - ed25519_get_priv_raw, - ed25519_get_pub_raw, - NULL /* pkey_opaque */, - ed25519_size, - ed25519_bits, - NULL /* param_missing */, - NULL /* param_copy */, - NULL /* param_cmp */, - ed25519_free, -}; diff --git a/third_party/boringssl/src/crypto/evp/p_hkdf.c b/third_party/boringssl/src/crypto/evp/p_hkdf.c deleted file mode 100644 index 932372df..00000000 --- a/third_party/boringssl/src/crypto/evp/p_hkdf.c +++ /dev/null @@ -1,241 +0,0 @@ -/* Copyright (c) 2022, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. 
- * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include - -#include -#include -#include -#include -#include - -#include "../internal.h" -#include "internal.h" - - -typedef struct { - int mode; - const EVP_MD *md; - uint8_t *key; - size_t key_len; - uint8_t *salt; - size_t salt_len; - CBB info; -} HKDF_PKEY_CTX; - -static int pkey_hkdf_init(EVP_PKEY_CTX *ctx) { - HKDF_PKEY_CTX *hctx = OPENSSL_malloc(sizeof(HKDF_PKEY_CTX)); - if (hctx == NULL) { - OPENSSL_PUT_ERROR(EVP, ERR_R_MALLOC_FAILURE); - return 0; - } - - OPENSSL_memset(hctx, 0, sizeof(HKDF_PKEY_CTX)); - if (!CBB_init(&hctx->info, 0)) { - OPENSSL_PUT_ERROR(EVP, ERR_R_MALLOC_FAILURE); - OPENSSL_free(hctx); - return 0; - } - - ctx->data = hctx; - return 1; -} - -static int pkey_hkdf_copy(EVP_PKEY_CTX *dst, EVP_PKEY_CTX *src) { - if (!pkey_hkdf_init(dst)) { - return 0; - } - - HKDF_PKEY_CTX *hctx_dst = dst->data; - const HKDF_PKEY_CTX *hctx_src = src->data; - hctx_dst->mode = hctx_src->mode; - hctx_dst->md = hctx_src->md; - - if (hctx_src->key_len != 0) { - hctx_dst->key = OPENSSL_memdup(hctx_src->key, hctx_src->key_len); - if (hctx_src->key == NULL) { - OPENSSL_PUT_ERROR(EVP, ERR_R_MALLOC_FAILURE); - return 0; - } - hctx_dst->key_len = hctx_src->key_len; - } - - if (hctx_src->salt_len != 0) { - hctx_dst->salt = OPENSSL_memdup(hctx_src->salt, hctx_src->salt_len); - if (hctx_src->salt == NULL) { - OPENSSL_PUT_ERROR(EVP, ERR_R_MALLOC_FAILURE); - return 0; - } - hctx_dst->salt_len = hctx_src->salt_len; - } - - if (!CBB_add_bytes(&hctx_dst->info, 
CBB_data(&hctx_src->info), - CBB_len(&hctx_src->info))) { - OPENSSL_PUT_ERROR(EVP, ERR_R_MALLOC_FAILURE); - return 0; - } - - return 1; -} - -static void pkey_hkdf_cleanup(EVP_PKEY_CTX *ctx) { - HKDF_PKEY_CTX *hctx = ctx->data; - if (hctx != NULL) { - OPENSSL_free(hctx->key); - OPENSSL_free(hctx->salt); - CBB_cleanup(&hctx->info); - OPENSSL_free(hctx); - ctx->data = NULL; - } -} - -static int pkey_hkdf_derive(EVP_PKEY_CTX *ctx, uint8_t *out, size_t *out_len) { - HKDF_PKEY_CTX *hctx = ctx->data; - if (hctx->md == NULL) { - OPENSSL_PUT_ERROR(EVP, EVP_R_MISSING_PARAMETERS); - return 0; - } - if (hctx->key_len == 0) { - OPENSSL_PUT_ERROR(EVP, EVP_R_NO_KEY_SET); - return 0; - } - - if (out == NULL) { - if (hctx->mode == EVP_PKEY_HKDEF_MODE_EXTRACT_ONLY) { - *out_len = EVP_MD_size(hctx->md); - } - // HKDF-Expand is variable-length and returns |*out_len| bytes. "Output" the - // input length by leaving it alone. - return 1; - } - - switch (hctx->mode) { - case EVP_PKEY_HKDEF_MODE_EXTRACT_AND_EXPAND: - return HKDF(out, *out_len, hctx->md, hctx->key, hctx->key_len, hctx->salt, - hctx->salt_len, CBB_data(&hctx->info), CBB_len(&hctx->info)); - - case EVP_PKEY_HKDEF_MODE_EXTRACT_ONLY: - if (*out_len < EVP_MD_size(hctx->md)) { - OPENSSL_PUT_ERROR(EVP, EVP_R_BUFFER_TOO_SMALL); - return 0; - } - return HKDF_extract(out, out_len, hctx->md, hctx->key, hctx->key_len, - hctx->salt, hctx->salt_len); - - case EVP_PKEY_HKDEF_MODE_EXPAND_ONLY: - return HKDF_expand(out, *out_len, hctx->md, hctx->key, hctx->key_len, - CBB_data(&hctx->info), CBB_len(&hctx->info)); - } - OPENSSL_PUT_ERROR(EVP, ERR_R_INTERNAL_ERROR); - return 0; -} - -static int pkey_hkdf_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2) { - HKDF_PKEY_CTX *hctx = ctx->data; - switch (type) { - case EVP_PKEY_CTRL_HKDF_MODE: - if (p1 != EVP_PKEY_HKDEF_MODE_EXTRACT_AND_EXPAND && - p1 != EVP_PKEY_HKDEF_MODE_EXTRACT_ONLY && - p1 != EVP_PKEY_HKDEF_MODE_EXPAND_ONLY) { - OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_OPERATION); - return 0; 
- } - hctx->mode = p1; - return 1; - case EVP_PKEY_CTRL_HKDF_MD: - hctx->md = p2; - return 1; - case EVP_PKEY_CTRL_HKDF_KEY: { - const CBS *key = p2; - if (!CBS_stow(key, &hctx->key, &hctx->key_len)) { - OPENSSL_PUT_ERROR(EVP, ERR_R_MALLOC_FAILURE); - return 0; - } - return 1; - } - case EVP_PKEY_CTRL_HKDF_SALT: { - const CBS *salt = p2; - if (!CBS_stow(salt, &hctx->salt, &hctx->salt_len)) { - OPENSSL_PUT_ERROR(EVP, ERR_R_MALLOC_FAILURE); - return 0; - } - return 1; - } - case EVP_PKEY_CTRL_HKDF_INFO: { - const CBS *info = p2; - // |EVP_PKEY_CTX_add1_hkdf_info| appends to the info string, rather than - // replacing it. - if (!CBB_add_bytes(&hctx->info, CBS_data(info), CBS_len(info))) { - OPENSSL_PUT_ERROR(EVP, ERR_R_MALLOC_FAILURE); - return 0; - } - return 1; - } - default: - OPENSSL_PUT_ERROR(EVP, EVP_R_COMMAND_NOT_SUPPORTED); - return 0; - } -} - -const EVP_PKEY_METHOD hkdf_pkey_meth = { - EVP_PKEY_HKDF, - pkey_hkdf_init, - pkey_hkdf_copy, - pkey_hkdf_cleanup, - /*keygen=*/NULL, - /*sign=*/NULL, - /*sign_message=*/NULL, - /*verify=*/NULL, - /*verify_message=*/NULL, - /*verify_recover=*/NULL, - /*encrypt=*/NULL, - /*decrypt=*/NULL, - pkey_hkdf_derive, - /*paramgen=*/NULL, - pkey_hkdf_ctrl, -}; - -int EVP_PKEY_CTX_hkdf_mode(EVP_PKEY_CTX *ctx, int mode) { - return EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_HKDF, EVP_PKEY_OP_DERIVE, - EVP_PKEY_CTRL_HKDF_MODE, mode, NULL); -} - -int EVP_PKEY_CTX_set_hkdf_md(EVP_PKEY_CTX *ctx, const EVP_MD *md) { - return EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_HKDF, EVP_PKEY_OP_DERIVE, - EVP_PKEY_CTRL_HKDF_MD, 0, (void *)md); -} - -int EVP_PKEY_CTX_set1_hkdf_key(EVP_PKEY_CTX *ctx, const uint8_t *key, - size_t key_len) { - CBS cbs; - CBS_init(&cbs, key, key_len); - return EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_HKDF, EVP_PKEY_OP_DERIVE, - EVP_PKEY_CTRL_HKDF_KEY, 0, &cbs); -} - -int EVP_PKEY_CTX_set1_hkdf_salt(EVP_PKEY_CTX *ctx, const uint8_t *salt, - size_t salt_len) { - CBS cbs; - CBS_init(&cbs, salt, salt_len); - return EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_HKDF, 
EVP_PKEY_OP_DERIVE, - EVP_PKEY_CTRL_HKDF_SALT, 0, &cbs); -} - -int EVP_PKEY_CTX_add1_hkdf_info(EVP_PKEY_CTX *ctx, const uint8_t *info, - size_t info_len) { - CBS cbs; - CBS_init(&cbs, info, info_len); - return EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_HKDF, EVP_PKEY_OP_DERIVE, - EVP_PKEY_CTRL_HKDF_INFO, 0, &cbs); -} diff --git a/third_party/boringssl/src/crypto/evp/p_hkdf.cc b/third_party/boringssl/src/crypto/evp/p_hkdf.cc new file mode 100644 index 00000000..14cb76cb --- /dev/null +++ b/third_party/boringssl/src/crypto/evp/p_hkdf.cc @@ -0,0 +1,204 @@ +// Copyright 2022 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include +#include +#include +#include + +#include "../internal.h" +#include "../mem_internal.h" +#include "internal.h" + + +using namespace bssl; + +namespace { + +struct HKDF_PKEY_CTX { + int mode = 0; + const EVP_MD *md = nullptr; + Array key; + Array salt; + Vector info; +}; + +static int pkey_hkdf_init(EvpPkeyCtx *ctx, const EVP_PKEY_ALG *) { + ctx->data = New(); + return 1; +} + +static int pkey_hkdf_copy(EvpPkeyCtx *dst, EvpPkeyCtx *src) { + if (!pkey_hkdf_init(dst, nullptr)) { + return 0; + } + + HKDF_PKEY_CTX *hctx_dst = reinterpret_cast(dst->data); + const HKDF_PKEY_CTX *hctx_src = + reinterpret_cast(src->data); + hctx_dst->mode = hctx_src->mode; + hctx_dst->md = hctx_src->md; + + if (!hctx_dst->key.CopyFrom(hctx_src->key) || + !hctx_dst->salt.CopyFrom(hctx_src->salt) || + !hctx_dst->info.CopyFrom(hctx_src->info)) { + return 0; + } + + return 1; +} + +static void pkey_hkdf_cleanup(EvpPkeyCtx *ctx) { + Delete(reinterpret_cast(ctx->data)); +} + +static int pkey_hkdf_derive(EvpPkeyCtx *ctx, uint8_t *out, size_t *out_len) { + HKDF_PKEY_CTX *hctx = reinterpret_cast(ctx->data); + if (hctx->md == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_MISSING_PARAMETERS); + return 0; + } + if (hctx->key.empty()) { + OPENSSL_PUT_ERROR(EVP, EVP_R_NO_KEY_SET); + return 0; + } + + if (out == nullptr) { + if (hctx->mode == EVP_PKEY_HKDEF_MODE_EXTRACT_ONLY) { + *out_len = EVP_MD_size(hctx->md); + } + // HKDF-Expand is variable-length and returns |*out_len| bytes. "Output" the + // input length by leaving it alone. 
+ return 1; + } + + switch (hctx->mode) { + case EVP_PKEY_HKDEF_MODE_EXTRACT_AND_EXPAND: + return HKDF(out, *out_len, hctx->md, hctx->key.data(), hctx->key.size(), + hctx->salt.data(), hctx->salt.size(), hctx->info.data(), + hctx->info.size()); + + case EVP_PKEY_HKDEF_MODE_EXTRACT_ONLY: + if (*out_len < EVP_MD_size(hctx->md)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_BUFFER_TOO_SMALL); + return 0; + } + return HKDF_extract(out, out_len, hctx->md, hctx->key.data(), + hctx->key.size(), hctx->salt.data(), + hctx->salt.size()); + + case EVP_PKEY_HKDEF_MODE_EXPAND_ONLY: + return HKDF_expand(out, *out_len, hctx->md, hctx->key.data(), + hctx->key.size(), hctx->info.data(), + hctx->info.size()); + } + OPENSSL_PUT_ERROR(EVP, ERR_R_INTERNAL_ERROR); + return 0; +} + +static int pkey_hkdf_ctrl(EvpPkeyCtx *ctx, int type, int p1, void *p2) { + HKDF_PKEY_CTX *hctx = reinterpret_cast(ctx->data); + switch (type) { + case EVP_PKEY_CTRL_HKDF_MODE: + if (p1 != EVP_PKEY_HKDEF_MODE_EXTRACT_AND_EXPAND && + p1 != EVP_PKEY_HKDEF_MODE_EXTRACT_ONLY && + p1 != EVP_PKEY_HKDEF_MODE_EXPAND_ONLY) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_OPERATION); + return 0; + } + hctx->mode = p1; + return 1; + case EVP_PKEY_CTRL_HKDF_MD: + hctx->md = reinterpret_cast(p2); + return 1; + case EVP_PKEY_CTRL_HKDF_KEY: { + const auto *key = reinterpret_cast *>(p2); + return hctx->key.CopyFrom(*key); + } + case EVP_PKEY_CTRL_HKDF_SALT: { + const auto *salt = reinterpret_cast *>(p2); + return hctx->salt.CopyFrom(*salt); + } + case EVP_PKEY_CTRL_HKDF_INFO: { + const auto *info = reinterpret_cast *>(p2); + // |EVP_PKEY_CTX_add1_hkdf_info| appends to the info string, rather than + // replacing it. 
+ return hctx->info.Append(*info); + } + default: + OPENSSL_PUT_ERROR(EVP, EVP_R_COMMAND_NOT_SUPPORTED); + return 0; + } +} + +const EVP_PKEY_CTX_METHOD hkdf_pkey_meth = { + EVP_PKEY_HKDF, + pkey_hkdf_init, + pkey_hkdf_copy, + pkey_hkdf_cleanup, + /*keygen=*/nullptr, + /*sign=*/nullptr, + /*sign_message=*/nullptr, + /*verify=*/nullptr, + /*verify_message=*/nullptr, + /*verify_recover=*/nullptr, + /*encrypt=*/nullptr, + /*decrypt=*/nullptr, + pkey_hkdf_derive, + /*paramgen=*/nullptr, + /*encap=*/nullptr, + /*decap=*/nullptr, + pkey_hkdf_ctrl, +}; + +} // namespace + +const EVP_PKEY_ALG *bssl::evp_pkey_hkdf() { + static const EVP_PKEY_ALG kAlg = {nullptr, &hkdf_pkey_meth}; + return &kAlg; +} + +int EVP_PKEY_CTX_hkdf_mode(EVP_PKEY_CTX *ctx, int mode) { + return EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_HKDF, EVP_PKEY_OP_DERIVE, + EVP_PKEY_CTRL_HKDF_MODE, mode, nullptr); +} + +int EVP_PKEY_CTX_set_hkdf_md(EVP_PKEY_CTX *ctx, const EVP_MD *md) { + return EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_HKDF, EVP_PKEY_OP_DERIVE, + EVP_PKEY_CTRL_HKDF_MD, 0, (void *)md); +} + +int EVP_PKEY_CTX_set1_hkdf_key(EVP_PKEY_CTX *ctx, const uint8_t *key, + size_t key_len) { + Span span(key, key_len); + return EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_HKDF, EVP_PKEY_OP_DERIVE, + EVP_PKEY_CTRL_HKDF_KEY, 0, &span); +} + +int EVP_PKEY_CTX_set1_hkdf_salt(EVP_PKEY_CTX *ctx, const uint8_t *salt, + size_t salt_len) { + Span span(salt, salt_len); + return EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_HKDF, EVP_PKEY_OP_DERIVE, + EVP_PKEY_CTRL_HKDF_SALT, 0, &span); +} + +int EVP_PKEY_CTX_add1_hkdf_info(EVP_PKEY_CTX *ctx, const uint8_t *info, + size_t info_len) { + Span span(info, info_len); + return EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_HKDF, EVP_PKEY_OP_DERIVE, + EVP_PKEY_CTRL_HKDF_INFO, 0, &span); +} diff --git a/third_party/boringssl/src/crypto/evp/p_mldsa.cc b/third_party/boringssl/src/crypto/evp/p_mldsa.cc new file mode 100644 index 00000000..3e4c6627 --- /dev/null +++ b/third_party/boringssl/src/crypto/evp/p_mldsa.cc @@ -0,0 +1,552 @@ +// 
Copyright 2025 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include + +#include +#include +#include +#include +#include + +#include "../fipsmodule/bcm_interface.h" +#include "../mem_internal.h" +#include "internal.h" + +using namespace bssl; + +namespace { + +constexpr CBS_ASN1_TAG kSeedTag = CBS_ASN1_CONTEXT_SPECIFIC | 0; + +constexpr uint8_t kMLDSA44OID[] = {OBJ_ENC_ML_DSA_44}; +constexpr uint8_t kMLDSA65OID[] = {OBJ_ENC_ML_DSA_65}; +constexpr uint8_t kMLDSA87OID[] = {OBJ_ENC_ML_DSA_87}; + +constexpr int kMaxContextLength = 255; + +// We must generate EVP bindings for three ML-DSA algorithms. Define a traits +// type that captures the functions and other parameters of an ML-DSA algorithm. 
+#define MAKE_MLDSA_TRAITS(kl) \ + struct MLDSA##kl##Traits { \ + using PublicKey = MLDSA##kl##_public_key; \ + using PrivateKey = MLDSA##kl##_private_key; \ + static constexpr size_t kPublicKeyBytes = MLDSA##kl##_PUBLIC_KEY_BYTES; \ + static constexpr size_t kSignatureBytes = MLDSA##kl##_SIGNATURE_BYTES; \ + static constexpr int kType = EVP_PKEY_ML_DSA_##kl; \ + static constexpr Span kOID = kMLDSA##kl##OID; \ + static constexpr auto PrivateKeyFromSeed = \ + &MLDSA##kl##_private_key_from_seed; \ + static constexpr auto GenerateKey = &MLDSA##kl##_generate_key; \ + static constexpr auto Sign = &MLDSA##kl##_sign; \ + static constexpr auto ParsePublicKey = &MLDSA##kl##_parse_public_key; \ + static constexpr auto PublicOfPrivate = \ + &BCM_mldsa##kl##_public_of_private; \ + static constexpr auto MarshalPublicKey = &MLDSA##kl##_marshal_public_key; \ + static constexpr auto PublicKeysEqual = \ + &BCM_mldsa##kl##_public_keys_equal; \ + static constexpr auto Verify = &MLDSA##kl##_verify; \ + static_assert(std::is_trivially_copyable_v, \ + "PublicKey type must be trivially copyable."); \ + }; + +MAKE_MLDSA_TRAITS(44) +MAKE_MLDSA_TRAITS(65) +MAKE_MLDSA_TRAITS(87) + +// For each ML-DSA variant, the |EvpPkey| must hold a public or private key. +// EVP uses the same type for public and private keys, so the representation +// must support both. The private key type contains the public key struct in it, +// so we use a pointer to either a PrivateKeyData or +// PublicKeyData, with a common base class to dispatch between them. +// +// TODO(crbug.com/404286922): In C++20, we need fewer |typename|s in front of +// dependent type names. + +template +class PrivateKeyData; + +template +class KeyData { + public: + // Returns the underlying public key for the key. + const typename Traits::PublicKey *GetPublicKey() const; + + // Returns the PrivateKeyData struct for the key, or nullptr if this is a + // public key. 
+ PrivateKeyData *AsPrivateKeyData(); + const PrivateKeyData *AsPrivateKeyData() const { + return const_cast(this)->AsPrivateKeyData(); + } + + // A KeyData cannot be freed directly. Rather, it must use this wrapper which + // calls the correct subclass's destructor. + static void Free(KeyData *data); + + protected: + explicit KeyData(bool is_private) : is_private_(is_private) {} + ~KeyData() = default; + bool is_private_; +}; + +template +class PublicKeyData : public KeyData { + public: + enum { kAllowUniquePtr = true }; + PublicKeyData() : KeyData(/*is_private=*/false) {} + + // Allows copying the PublicKey. + explicit PublicKeyData(const typename Traits::PublicKey &key) + : KeyData(/*is_private=*/false), pub(key) {} + + typename Traits::PublicKey pub; +}; + +template +class PrivateKeyData : public KeyData { + public: + enum { kAllowUniquePtr = true }; + PrivateKeyData() : KeyData(/*is_private=*/true) {} + typename Traits::PrivateKey priv; + uint8_t seed[MLDSA_SEED_BYTES]; +}; + +template +const typename Traits::PublicKey *KeyData::GetPublicKey() const { + auto *priv_data = AsPrivateKeyData(); + if (priv_data != nullptr) { + return Traits::PublicOfPrivate(&priv_data->priv); + } + return &static_cast *>(this)->pub; +} + +template +PrivateKeyData *KeyData::AsPrivateKeyData() { + if (is_private_) { + return static_cast *>(this); + } + return nullptr; +} + +template +void KeyData::Free(KeyData *data) { + if (data == nullptr) { + return; + } + // Delete the more specific subclass. This is moot for now, because neither + // type has a non-trivial destructor. + auto *priv_data = data->AsPrivateKeyData(); + if (priv_data) { + Delete(priv_data); + } else { + Delete(static_cast *>(data)); + } +} + +struct MldsaPkeyCtx { + bssl::Array context; +}; + + +// Finally, MLDSAImplementation instantiates the methods themselves. 
+ +template +struct MLDSAImplementation { + static KeyData *GetKeyData(EvpPkey *pkey) { + assert(pkey->ameth == &asn1_method); + return static_cast *>(pkey->pkey); + } + + static const KeyData *GetKeyData(const EvpPkey *pkey) { + return GetKeyData(const_cast(pkey)); + } + + static void PkeyFree(EvpPkey *pkey) { + KeyData::Free(GetKeyData(pkey)); + pkey->pkey = nullptr; + } + + static int SetPrivateSeed(EvpPkey *pkey, const uint8_t *in, size_t len) { + auto priv = MakeUnique>(); + if (priv == nullptr) { + return 0; + } + + if (len != MLDSA_SEED_BYTES || + !Traits::PrivateKeyFromSeed(&priv->priv, in, len)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return 0; + } + OPENSSL_memcpy(priv->seed, in, len); + evp_pkey_set0(pkey, &asn1_method, priv.release()); + return 1; + } + + static int SetRawPublic(EvpPkey *pkey, const uint8_t *in, size_t len) { + auto pub = MakeUnique>(); + if (pub == nullptr) { + return 0; + } + CBS cbs; + CBS_init(&cbs, in, len); + if (!Traits::ParsePublicKey(&pub->pub, &cbs) || CBS_len(&cbs) != 0) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return 0; + } + evp_pkey_set0(pkey, &asn1_method, pub.release()); + return 1; + } + + static int GetPrivateSeed(const EvpPkey *pkey, uint8_t *out, + size_t *out_len) { + const auto *priv = GetKeyData(pkey)->AsPrivateKeyData(); + if (priv == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_NOT_A_PRIVATE_KEY); + return 0; + } + if (out == nullptr) { + *out_len = MLDSA_SEED_BYTES; + return 1; + } + if (*out_len < MLDSA_SEED_BYTES) { + OPENSSL_PUT_ERROR(EVP, EVP_R_BUFFER_TOO_SMALL); + return 0; + } + OPENSSL_memcpy(out, priv->seed, MLDSA_SEED_BYTES); + *out_len = MLDSA_SEED_BYTES; + return 1; + } + + static int GetRawPublic(const EvpPkey *pkey, uint8_t *out, size_t *out_len) { + const auto *pub = GetKeyData(pkey)->GetPublicKey(); + if (out == nullptr) { + *out_len = Traits::kPublicKeyBytes; + return 1; + } + if (*out_len < Traits::kPublicKeyBytes) { + OPENSSL_PUT_ERROR(EVP, EVP_R_BUFFER_TOO_SMALL); + return 0; 
+ } + CBB cbb; + CBB_init_fixed(&cbb, out, Traits::kPublicKeyBytes); + BSSL_CHECK(Traits::MarshalPublicKey(&cbb, pub)); + BSSL_CHECK(CBB_len(&cbb) == Traits::kPublicKeyBytes); + *out_len = Traits::kPublicKeyBytes; + return 1; + } + + static evp_decode_result_t DecodePublic(const EVP_PKEY_ALG *alg, EvpPkey *out, + CBS *params, CBS *key) { + // The parameters must be omitted. See + // draft-ietf-lamps-dilithium-certificates-13, Section 2. + if (CBS_len(params) != 0) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return evp_decode_error; + } + return SetRawPublic(out, CBS_data(key), CBS_len(key)) ? evp_decode_ok + : evp_decode_error; + } + + static int EncodePublic(CBB *out, const EvpPkey *pkey) { + const auto *pub = GetKeyData(pkey)->GetPublicKey(); + // See draft-ietf-lamps-dilithium-certificates-13, Sections 2 and 4. + CBB spki, algorithm, key_bitstring; + if (!CBB_add_asn1(out, &spki, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1(&spki, &algorithm, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_element(&algorithm, CBS_ASN1_OBJECT, Traits::kOID.data(), + Traits::kOID.size()) || + !CBB_add_asn1(&spki, &key_bitstring, CBS_ASN1_BITSTRING) || + !CBB_add_u8(&key_bitstring, 0 /* no unused bits */) || + !Traits::MarshalPublicKey(&key_bitstring, pub) || + !CBB_flush(out)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_ENCODE_ERROR); + return 0; + } + return 1; + } + + static bool EqualPublic(const EvpPkey *a, const EvpPkey *b) { + const auto *a_pub = GetKeyData(a)->GetPublicKey(); + const auto *b_pub = GetKeyData(b)->GetPublicKey(); + return Traits::PublicKeysEqual(a_pub, b_pub); + } + + static bool HasPublic(const EvpPkey *pk) { return true; } + + static bool CopyPublic(EvpPkey *out, const EvpPkey *pk) { + auto *public_copy = + New>(*GetKeyData(pk)->GetPublicKey()); + if (public_copy == nullptr) { + OPENSSL_PUT_ERROR(EVP, ERR_R_INTERNAL_ERROR); + return false; + } + evp_pkey_set0(out, pk->ameth, public_copy); + return true; + } + + static evp_decode_result_t DecodePrivate(const EVP_PKEY_ALG *alg, + 
EvpPkey *out, CBS *params, + CBS *key) { + // The parameters must be omitted. See + // draft-ietf-lamps-dilithium-certificates-13, Section 2. + if (CBS_len(params) != 0) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return evp_decode_error; + } + + // See draft-ietf-lamps-dilithium-certificates-13, Section 6. Three + // different encodings were specified, adding complexity to the question of + // whether a private key is valid. We only implement the "seed" + // representation. Give this case a different error for easier diagnostics. + // + // The "expandedKey" representation was a last-minute accommodation for + // legacy hardware, which should be updated to use seeds. Supporting it + // complicates the notion of a private key with both seedful and seedless + // variants. + // + // The "both" representation is technically unsound and + // dangerous, so we do not implement it. Systems composed of components, + // some of which look at one half of the "both" representation, and half of + // the other, will appear to interop, but break when an input is + // inconsistent. The expanded key can be computed from the seed, so there is + // no purpose in this form. + CBS seed; + if (!CBS_get_asn1(key, &seed, kSeedTag)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_PRIVATE_KEY_WAS_NOT_SEED); + return evp_decode_error; + } + if (CBS_len(key) != 0) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return evp_decode_error; + } + return SetPrivateSeed(out, CBS_data(&seed), CBS_len(&seed)) + ? evp_decode_ok + : evp_decode_error; + } + + static int EncodePrivate(CBB *out, const EvpPkey *pkey) { + const auto *priv = GetKeyData(pkey)->AsPrivateKeyData(); + if (priv == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_NOT_A_PRIVATE_KEY); + return 0; + } + // See draft-ietf-lamps-dilithium-certificates-13, Sections 2 and 6. We + // encode only the seed representation. 
+ CBB pkcs8, algorithm, private_key; + if (!CBB_add_asn1(out, &pkcs8, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_uint64(&pkcs8, 0 /* version */) || + !CBB_add_asn1(&pkcs8, &algorithm, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_element(&algorithm, CBS_ASN1_OBJECT, Traits::kOID.data(), + Traits::kOID.size()) || + !CBB_add_asn1(&pkcs8, &private_key, CBS_ASN1_OCTETSTRING) || + !CBB_add_asn1_element(&private_key, kSeedTag, priv->seed, + sizeof(priv->seed)) || + !CBB_flush(out)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_ENCODE_ERROR); + return 0; + } + return 1; + } + + static bool HasPrivate(const EvpPkey *pk) { + return GetKeyData(pk)->AsPrivateKeyData() != nullptr; + } + + static int PkeySize(const EvpPkey *pkey) { return Traits::kSignatureBytes; } + static int PkeyBits(const EvpPkey *pkey) { + // OpenSSL counts the bits in the public key serialization. + return Traits::kPublicKeyBytes * 8; + } + + static int Init(EvpPkeyCtx *ctx, const EVP_PKEY_ALG *) { + MldsaPkeyCtx *mctx = New(); + if (mctx == nullptr) { + return 0; + } + ctx->data = mctx; + return 1; + } + + static void Cleanup(EvpPkeyCtx *ctx) { + Delete(static_cast(ctx->data)); + } + + static int CopyContext(EvpPkeyCtx *dst, EvpPkeyCtx *src) { + if (!Init(dst, nullptr)) { + return 0; + } + MldsaPkeyCtx *sctx = static_cast(src->data); + MldsaPkeyCtx *dctx = static_cast(dst->data); + if (!dctx->context.CopyFrom(sctx->context)) { + return 0; + } + return 1; + } + + static int KeyGen(EvpPkeyCtx *ctx, EvpPkey *pkey) { + auto priv = MakeUnique>(); + if (priv == nullptr) { + return 0; + } + uint8_t unused_public[Traits::kPublicKeyBytes]; + if (!Traits::GenerateKey(unused_public, priv->seed, &priv->priv)) { + return 0; + } + evp_pkey_set0(pkey, &asn1_method, priv.release()); + return 1; + } + + static int SignMessage(EvpPkeyCtx *ctx, uint8_t *sig, size_t *siglen, + const uint8_t *tbs, size_t tbslen) { + const auto *priv_data = GetKeyData(ctx->pkey.get())->AsPrivateKeyData(); + if (priv_data == nullptr) { + OPENSSL_PUT_ERROR(EVP, 
EVP_R_NOT_A_PRIVATE_KEY); + return 0; + } + if (sig == nullptr) { + *siglen = Traits::kSignatureBytes; + return 1; + } + if (*siglen < Traits::kSignatureBytes) { + OPENSSL_PUT_ERROR(EVP, EVP_R_BUFFER_TOO_SMALL); + return 0; + } + MldsaPkeyCtx *mctx = static_cast(ctx->data); + if (!Traits::Sign(sig, &priv_data->priv, tbs, tbslen, mctx->context.data(), + mctx->context.size())) { + return 0; + } + *siglen = Traits::kSignatureBytes; + return 1; + } + + static int VerifyMessage(EvpPkeyCtx *ctx, const uint8_t *sig, size_t siglen, + const uint8_t *tbs, size_t tbslen) { + const auto *pub = GetKeyData(ctx->pkey.get())->GetPublicKey(); + MldsaPkeyCtx *mctx = static_cast(ctx->data); + if (!Traits::Verify(pub, sig, siglen, tbs, tbslen, mctx->context.data(), + mctx->context.size())) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_SIGNATURE); + return 0; + } + return 1; + } + + static int Ctrl(EvpPkeyCtx *ctx, int type, int p1, void *p2) { + MldsaPkeyCtx *mctx = static_cast(ctx->data); + switch (type) { + case EVP_PKEY_CTRL_SIGNATURE_CONTEXT_STRING: { + const auto *context_string = + reinterpret_cast *>(p2); + if (context_string == nullptr || + context_string->size() > kMaxContextLength) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_PARAMETERS); + return 0; + } + return mctx->context.CopyFrom(*context_string); + } + + default: + OPENSSL_PUT_ERROR(EVP, EVP_R_COMMAND_NOT_SUPPORTED); + return 0; + } + } + + static constexpr EVP_PKEY_CTX_METHOD pkey_method = { + Traits::kType, + &Init, + &CopyContext, + &Cleanup, + &KeyGen, + /*sign=*/nullptr, + &SignMessage, + /*verify=*/nullptr, + &VerifyMessage, + /*verify_recover=*/nullptr, + /*encrypt=*/nullptr, + /*decrypt=*/nullptr, + /*derive=*/nullptr, + /*paramgen=*/nullptr, + /*encap=*/nullptr, + /*decap=*/nullptr, + &Ctrl, + }; + + static constexpr EVP_PKEY_ASN1_METHOD BuildASN1Method() { + EVP_PKEY_ASN1_METHOD ret = { + Traits::kType, + // The OID is filled in below. 
+ /*oid=*/{}, + /*oid_len=*/0, + &pkey_method, + &DecodePublic, + &EncodePublic, + &EqualPublic, + &HasPublic, + &CopyPublic, + &DecodePrivate, + &EncodePrivate, + &HasPrivate, + // While exporting the seed as the "raw" private key would be natural, + // OpenSSL connected these APIs to the "raw private key", so we export + // the seed separately. + /*set_priv_raw=*/nullptr, + &SetPrivateSeed, + &SetRawPublic, + /*get_priv_raw=*/nullptr, + &GetPrivateSeed, + &GetRawPublic, + /*set1_tls_encodedpoint=*/nullptr, + /*get1_tls_encodedpoint=*/nullptr, + /*pkey_opaque=*/nullptr, + &PkeySize, + &PkeyBits, + /*param_missing=*/nullptr, + /*param_copy=*/nullptr, + /*param_equal=*/nullptr, + &PkeyFree, + }; + // TODO(crbug.com/404286922): Use std::copy in C++20, when it's constexpr. + // TODO(crbug.com/450823446): Better yet, make this field an InplaceVector + // and give it a suitable constructor. + constexpr auto oid = Traits::kOID; + static_assert(oid.size() <= sizeof(ret.oid)); + for (size_t i = 0; i < oid.size(); i++) { + ret.oid[i] = oid[i]; + } + ret.oid_len = oid.size(); + return ret; + } + + static constexpr EVP_PKEY_ASN1_METHOD asn1_method = BuildASN1Method(); + static constexpr EVP_PKEY_ALG pkey_alg = {&asn1_method, &pkey_method}; +}; + +} // namespace + +const EVP_PKEY_ALG *EVP_pkey_ml_dsa_44() { + return &MLDSAImplementation::pkey_alg; +} + +const EVP_PKEY_ALG *EVP_pkey_ml_dsa_65() { + return &MLDSAImplementation::pkey_alg; +} + +const EVP_PKEY_ALG *EVP_pkey_ml_dsa_87() { + return &MLDSAImplementation::pkey_alg; +} diff --git a/third_party/boringssl/src/crypto/evp/p_mlkem.cc b/third_party/boringssl/src/crypto/evp/p_mlkem.cc new file mode 100644 index 00000000..2687ba84 --- /dev/null +++ b/third_party/boringssl/src/crypto/evp/p_mlkem.cc @@ -0,0 +1,485 @@ +// Copyright 2026 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include "../fipsmodule/bcm_interface.h" +#include "../internal.h" +#include "../mem_internal.h" +#include "internal.h" + +using namespace bssl; + +namespace { + +constexpr CBS_ASN1_TAG kSeedTag = CBS_ASN1_CONTEXT_SPECIFIC | 0; + +constexpr uint8_t kMLKEM768OID[] = {OBJ_ENC_ML_KEM_768}; +constexpr uint8_t kMLKEM1024OID[] = {OBJ_ENC_ML_KEM_1024}; + +// Generate EVP bindings for multiple ML-KEM algorithms. +#define MAKE_MLKEM_TRAITS(x) \ + struct MLKEM##x##Traits { \ + using PublicKey = MLKEM##x##_public_key; \ + using PrivateKey = MLKEM##x##_private_key; \ + static constexpr size_t kPublicKeyBytes = MLKEM##x##_PUBLIC_KEY_BYTES; \ + static constexpr size_t kCiphertextBytes = MLKEM##x##_CIPHERTEXT_BYTES; \ + static constexpr int kType = EVP_PKEY_ML_KEM_##x; \ + static constexpr Span kOID = kMLKEM##x##OID; \ + static constexpr auto GenerateKey = &MLKEM##x##_generate_key; \ + static constexpr auto PrivateKeyFromSeed = \ + &MLKEM##x##_private_key_from_seed; \ + static constexpr auto PublicOfPrivate = &BCM_mlkem##x##_public_of_private; \ + static constexpr auto PublicKeysEqual = &BCM_mlkem##x##_public_keys_equal; \ + static constexpr auto Encap = &MLKEM##x##_encap; \ + static constexpr auto Decap = &MLKEM##x##_decap; \ + static constexpr auto MarshalPublicKey = &MLKEM##x##_marshal_public_key; \ + static constexpr auto ParsePublicKey = &MLKEM##x##_parse_public_key; \ + static_assert(std::is_trivially_copyable_v, \ + "PublicKey type must 
be trivially copyable."); \ + }; + +MAKE_MLKEM_TRAITS(768) +MAKE_MLKEM_TRAITS(1024) + +template +class PrivateKeyData; + +// The private key type contains the public key struct in it, so we use a +// pointer to either a PrivateKeyData or PublicKeyData, with a +// common base class to dispatch between them. +template +class KeyData { + public: + // Returns the underlying public key for the key. + const typename Traits::PublicKey *GetPublicKey() const; + + // Returns the PrivateKeyData struct for the key, or nullptr if this is a + // public key. + PrivateKeyData *AsPrivateKeyData(); + const PrivateKeyData *AsPrivateKeyData() const { + return const_cast(this)->AsPrivateKeyData(); + } + + // A KeyData cannot be freed directly. Rather, it must use this wrapper which + // calls the correct subclass's destructor. + static void Free(KeyData *data); + + protected: + explicit KeyData(bool is_private) : is_private_(is_private) {} + ~KeyData() = default; + bool is_private_; +}; + +template +class PublicKeyData : public KeyData { + public: + enum { kAllowUniquePtr = true }; + PublicKeyData() : KeyData(/*is_private=*/false) {} + + // Allows copying the PublicKey. 
+ explicit PublicKeyData(const typename Traits::PublicKey &key) + : KeyData(/*is_private=*/false), pub(key) {} + + typename Traits::PublicKey pub; +}; + +template +class PrivateKeyData : public KeyData { + public: + enum { kAllowUniquePtr = true }; + PrivateKeyData() : KeyData(/*is_private=*/true) {} + typename Traits::PrivateKey priv; + uint8_t seed[MLKEM_SEED_BYTES]; +}; + +template +const typename Traits::PublicKey *KeyData::GetPublicKey() const { + auto *priv_data = AsPrivateKeyData(); + if (priv_data != nullptr) { + return Traits::PublicOfPrivate(&priv_data->priv); + } + return &static_cast *>(this)->pub; +} + +template +PrivateKeyData *KeyData::AsPrivateKeyData() { + if (is_private_) { + return static_cast *>(this); + } + return nullptr; +} + +template +void KeyData::Free(KeyData *data) { + if (data == nullptr) { + return; + } + // Delete the more specific subclass. This is moot for now, because neither + // type has a non-trivial destructor. + auto *priv_data = data->AsPrivateKeyData(); + if (priv_data) { + Delete(priv_data); + } else { + Delete(static_cast *>(data)); + } +} + +template +struct MLKEMImplementation { + static KeyData *GetKeyData(EvpPkey *pkey) { + assert(pkey->ameth == &asn1_method); + return static_cast *>(pkey->pkey); + } + + static const KeyData *GetKeyData(const EvpPkey *pkey) { + return GetKeyData(const_cast(pkey)); + } + + static void PkeyFree(EvpPkey *pkey) { + KeyData::Free(GetKeyData(pkey)); + pkey->pkey = nullptr; + } + + static int SetPrivateSeed(EvpPkey *pkey, const uint8_t *in, size_t len) { + auto priv = MakeUnique>(); + if (priv == nullptr) { + return 0; + } + + if (len != MLKEM_SEED_BYTES || + !Traits::PrivateKeyFromSeed(&priv->priv, in, len)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return 0; + } + OPENSSL_memcpy(priv->seed, in, len); + evp_pkey_set0(pkey, &asn1_method, priv.release()); + return 1; + } + + static int SetRawPublic(EvpPkey *pkey, const uint8_t *in, size_t len) { + auto pub = MakeUnique>(); + if (pub == 
nullptr) { + return 0; + } + CBS cbs; + CBS_init(&cbs, in, len); + if (!Traits::ParsePublicKey(&pub->pub, &cbs) || CBS_len(&cbs) != 0) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return 0; + } + evp_pkey_set0(pkey, &asn1_method, pub.release()); + return 1; + } + + static int GetPrivateSeed(const EvpPkey *pkey, uint8_t *out, + size_t *out_len) { + const auto *priv = GetKeyData(pkey)->AsPrivateKeyData(); + if (priv == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_NOT_A_PRIVATE_KEY); + return 0; + } + if (out == nullptr) { + *out_len = MLKEM_SEED_BYTES; + return 1; + } + if (*out_len < MLKEM_SEED_BYTES) { + OPENSSL_PUT_ERROR(EVP, EVP_R_BUFFER_TOO_SMALL); + return 0; + } + OPENSSL_memcpy(out, priv->seed, MLKEM_SEED_BYTES); + *out_len = MLKEM_SEED_BYTES; + return 1; + } + + static int GetRawPublic(const EvpPkey *pkey, uint8_t *out, size_t *out_len) { + const auto *pub = GetKeyData(pkey)->GetPublicKey(); + if (out == nullptr) { + *out_len = Traits::kPublicKeyBytes; + return 1; + } + if (*out_len < Traits::kPublicKeyBytes) { + OPENSSL_PUT_ERROR(EVP, EVP_R_BUFFER_TOO_SMALL); + return 0; + } + CBB cbb; + CBB_init_fixed(&cbb, out, Traits::kPublicKeyBytes); + BSSL_CHECK(Traits::MarshalPublicKey(&cbb, pub)); + BSSL_CHECK(CBB_len(&cbb) == Traits::kPublicKeyBytes); + *out_len = Traits::kPublicKeyBytes; + return 1; + } + + static evp_decode_result_t DecodePublic(const EVP_PKEY_ALG *alg, EvpPkey *out, + CBS *params, CBS *key) { + // Parameters must be absent. See RFC 9935, section 3. + if (CBS_len(params) != 0) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return evp_decode_error; + } + return SetRawPublic(out, CBS_data(key), CBS_len(key)) ? evp_decode_ok + : evp_decode_error; + } + + static int EncodePublic(CBB *out, const EvpPkey *pkey) { + const auto *pub = GetKeyData(pkey)->GetPublicKey(); + // See RFC 9935, section 4. 
+ CBB spki, algorithm, key_bitstring; + if (!CBB_add_asn1(out, &spki, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1(&spki, &algorithm, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_element(&algorithm, CBS_ASN1_OBJECT, Traits::kOID.data(), + Traits::kOID.size()) || + !CBB_add_asn1(&spki, &key_bitstring, CBS_ASN1_BITSTRING) || + !CBB_add_u8(&key_bitstring, 0 /* no unused bits */) || + !Traits::MarshalPublicKey(&key_bitstring, pub) || // + !CBB_flush(out)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_ENCODE_ERROR); + return 0; + } + return 1; + } + + static bool EqualPublic(const EvpPkey *a, const EvpPkey *b) { + const auto *a_pub = GetKeyData(a)->GetPublicKey(); + const auto *b_pub = GetKeyData(b)->GetPublicKey(); + return Traits::PublicKeysEqual(a_pub, b_pub); + } + + static bool HasPublic(const EvpPkey *pk) { return true; } + + static bool CopyPublic(EvpPkey *out, const EvpPkey *pk) { + auto *public_copy = + New>(*GetKeyData(pk)->GetPublicKey()); + if (public_copy == nullptr) { + return false; + } + evp_pkey_set0(out, pk->ameth, public_copy); + return true; + } + + static evp_decode_result_t DecodePrivate(const EVP_PKEY_ALG *alg, + EvpPkey *out, CBS *params, + CBS *key) { + // Parameters must be absent. See RFC 9935, section 3. + if (CBS_len(params) != 0) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return evp_decode_error; + } + + // See RFC 9935, section 6. Three different encodings are specified. We only + // implement the "seed" representation. + CBS seed; + if (!CBS_get_asn1(key, &seed, kSeedTag)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_PRIVATE_KEY_WAS_NOT_SEED); + return evp_decode_error; + } + if (CBS_len(key) != 0) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return evp_decode_error; + } + return SetPrivateSeed(out, CBS_data(&seed), CBS_len(&seed)) + ? 
evp_decode_ok + : evp_decode_error; + } + + static int EncodePrivate(CBB *out, const EvpPkey *pkey) { + const auto *priv = GetKeyData(pkey)->AsPrivateKeyData(); + if (priv == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_NOT_A_PRIVATE_KEY); + return 0; + } + + // See RFC 9935, section 6. Three different encodings are specified. We only + // implement the "seed" representation. + CBB pkcs8, algorithm, private_key; + if (!CBB_add_asn1(out, &pkcs8, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_uint64(&pkcs8, 0 /* version */) || + !CBB_add_asn1(&pkcs8, &algorithm, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_element(&algorithm, CBS_ASN1_OBJECT, Traits::kOID.data(), + Traits::kOID.size()) || + !CBB_add_asn1(&pkcs8, &private_key, CBS_ASN1_OCTETSTRING) || + !CBB_add_asn1_element(&private_key, kSeedTag, priv->seed, + sizeof(priv->seed)) || + !CBB_flush(out)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_ENCODE_ERROR); + return 0; + } + return 1; + } + + static bool HasPrivate(const EvpPkey *pk) { + return GetKeyData(pk)->AsPrivateKeyData() != nullptr; + } + + static int PkeySize(const EvpPkey *pkey) { return Traits::kCiphertextBytes; } + static int PkeyBits(const EvpPkey *pkey) { + return Traits::kPublicKeyBytes * 8; + } + + static int CopyCtx(EvpPkeyCtx *dst, EvpPkeyCtx *src) { return 1; } + + static int KeyGen(EvpPkeyCtx *ctx, EvpPkey *pkey) { + auto priv = MakeUnique>(); + if (priv == nullptr) { + OPENSSL_PUT_ERROR(EVP, ERR_R_INTERNAL_ERROR); + return 0; + } + uint8_t unused_public[Traits::kPublicKeyBytes]; + Traits::GenerateKey(unused_public, priv->seed, &priv->priv); + evp_pkey_set0(pkey, &asn1_method, priv.release()); + return 1; + } + + static int KemEncap(uint8_t *out_ciphertext, size_t ciphertext_len, + uint8_t *out_secret, size_t secret_len, + const EVP_PKEY *peer_key) { + const auto *peer_pubkey = GetKeyData(FromOpaque(peer_key))->GetPublicKey(); + if (ciphertext_len != Traits::kCiphertextBytes) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_CIPHERTEXT_LENGTH); + return 0; + } + if (secret_len != 
MLKEM_SHARED_SECRET_BYTES) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_SECRET_LENGTH); + return 0; + } + Traits::Encap(out_ciphertext, out_secret, peer_pubkey); + return 1; + } + + static int KemDecap(uint8_t *out_secret, size_t secret_len, + const uint8_t *ciphertext, size_t ciphertext_len, + const EVP_PKEY *key) { + const auto *priv = GetKeyData(FromOpaque(key))->AsPrivateKeyData(); + if (priv == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_NOT_A_PRIVATE_KEY); + return 0; + } + if (secret_len != MLKEM_SHARED_SECRET_BYTES) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_SECRET_LENGTH); + return 0; + } + return Traits::Decap(out_secret, ciphertext, ciphertext_len, &priv->priv); + } + + static constexpr EVP_KEM evp_kem = { + /*pkey_id=*/Traits::kType, + /*ciphertext_len=*/Traits::kCiphertextBytes, + /*secret_len=*/MLKEM_SHARED_SECRET_BYTES, + &KemEncap, + &KemDecap, + }; + + static constexpr EVP_PKEY_CTX_METHOD pkey_method = { + Traits::kType, + /*init=*/nullptr, + &CopyCtx, + /*cleanup=*/nullptr, + &KeyGen, + /*sign=*/nullptr, + /*sign_message=*/nullptr, + /*verify=*/nullptr, + /*verify_message=*/nullptr, + /*verify_recover=*/nullptr, + /*encrypt=*/nullptr, + /*decrypt=*/nullptr, + /*derive=*/nullptr, + /*paramgen=*/nullptr, + &KemAdapter::EncapMethod, + &KemAdapter::DecapMethod, + /*ctrl=*/nullptr, + }; + + static constexpr EVP_PKEY_ASN1_METHOD BuildASN1Method() { + EVP_PKEY_ASN1_METHOD ret = { + Traits::kType, + // The OID is filled in below. 
+ /*oid=*/{}, + /*oid_len=*/0, + + &pkey_method, + + &DecodePublic, + &EncodePublic, + &EqualPublic, + &HasPublic, + &CopyPublic, + &DecodePrivate, + &EncodePrivate, + &HasPrivate, + + /*set_priv_raw=*/nullptr, + &SetPrivateSeed, + &SetRawPublic, + /*get_priv_raw=*/nullptr, + &GetPrivateSeed, + &GetRawPublic, + + /*set1_tls_encodedpoint=*/nullptr, + /*get1_tls_encodedpoint=*/nullptr, + /*pkey_opaque=*/nullptr, + &PkeySize, + &PkeyBits, + + /*param_missing=*/nullptr, + /*param_copy=*/nullptr, + /*param_equal=*/nullptr, + + &PkeyFree, + }; + // TODO(crbug.com/404286922): Use std::copy in C++20, when it's constexpr. + // TODO(crbug.com/450823446): Better yet, make this field an InplaceVector + // and give it a suitable constructor. + constexpr auto oid = Traits::kOID; + static_assert(oid.size() <= sizeof(ret.oid)); + for (size_t i = 0; i < oid.size(); i++) { + ret.oid[i] = oid[i]; + } + ret.oid_len = oid.size(); + return ret; + } + + static constexpr EVP_PKEY_ASN1_METHOD asn1_method = BuildASN1Method(); + static constexpr EVP_PKEY_ALG pkey_alg = {&asn1_method, &pkey_method}; +}; + +} // namespace + +const EVP_PKEY_ALG *EVP_pkey_ml_kem_768() { + return &MLKEMImplementation::pkey_alg; +} + +const EVP_PKEY_ALG *EVP_pkey_ml_kem_1024() { + return &MLKEMImplementation::pkey_alg; +} + +const EVP_KEM *EVP_kem_ml_kem_768() { + return &MLKEMImplementation::evp_kem; +} + +const EVP_KEM *EVP_kem_ml_kem_1024() { + return &MLKEMImplementation::evp_kem; +} diff --git a/third_party/boringssl/src/crypto/evp/p_rsa.c b/third_party/boringssl/src/crypto/evp/p_rsa.c deleted file mode 100644 index 7872a922..00000000 --- a/third_party/boringssl/src/crypto/evp/p_rsa.c +++ /dev/null @@ -1,648 +0,0 @@ -/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL - * project 2006. - */ -/* ==================================================================== - * Copyright (c) 2006 The OpenSSL Project. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * licensing@OpenSSL.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). */ - -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include "../internal.h" -#include "../fipsmodule/rsa/internal.h" -#include "internal.h" - - -typedef struct { - // Key gen parameters - int nbits; - BIGNUM *pub_exp; - // RSA padding mode - int pad_mode; - // message digest - const EVP_MD *md; - // message digest for MGF1 - const EVP_MD *mgf1md; - // PSS salt length - int saltlen; - // tbuf is a buffer which is either NULL, or is the size of the RSA modulus. - // It's used to store the output of RSA operations. 
- uint8_t *tbuf; - // OAEP label - uint8_t *oaep_label; - size_t oaep_labellen; -} RSA_PKEY_CTX; - -typedef struct { - uint8_t *data; - size_t len; -} RSA_OAEP_LABEL_PARAMS; - -static int pkey_rsa_init(EVP_PKEY_CTX *ctx) { - RSA_PKEY_CTX *rctx; - rctx = OPENSSL_malloc(sizeof(RSA_PKEY_CTX)); - if (!rctx) { - return 0; - } - OPENSSL_memset(rctx, 0, sizeof(RSA_PKEY_CTX)); - - rctx->nbits = 2048; - rctx->pad_mode = RSA_PKCS1_PADDING; - rctx->saltlen = -2; - - ctx->data = rctx; - - return 1; -} - -static int pkey_rsa_copy(EVP_PKEY_CTX *dst, EVP_PKEY_CTX *src) { - RSA_PKEY_CTX *dctx, *sctx; - if (!pkey_rsa_init(dst)) { - return 0; - } - sctx = src->data; - dctx = dst->data; - dctx->nbits = sctx->nbits; - if (sctx->pub_exp) { - dctx->pub_exp = BN_dup(sctx->pub_exp); - if (!dctx->pub_exp) { - return 0; - } - } - - dctx->pad_mode = sctx->pad_mode; - dctx->md = sctx->md; - dctx->mgf1md = sctx->mgf1md; - dctx->saltlen = sctx->saltlen; - if (sctx->oaep_label) { - OPENSSL_free(dctx->oaep_label); - dctx->oaep_label = OPENSSL_memdup(sctx->oaep_label, sctx->oaep_labellen); - if (!dctx->oaep_label) { - return 0; - } - dctx->oaep_labellen = sctx->oaep_labellen; - } - - return 1; -} - -static void pkey_rsa_cleanup(EVP_PKEY_CTX *ctx) { - RSA_PKEY_CTX *rctx = ctx->data; - - if (rctx == NULL) { - return; - } - - BN_free(rctx->pub_exp); - OPENSSL_free(rctx->tbuf); - OPENSSL_free(rctx->oaep_label); - OPENSSL_free(rctx); -} - -static int setup_tbuf(RSA_PKEY_CTX *ctx, EVP_PKEY_CTX *pk) { - if (ctx->tbuf) { - return 1; - } - ctx->tbuf = OPENSSL_malloc(EVP_PKEY_size(pk->pkey)); - if (!ctx->tbuf) { - return 0; - } - return 1; -} - -static int pkey_rsa_sign(EVP_PKEY_CTX *ctx, uint8_t *sig, size_t *siglen, - const uint8_t *tbs, size_t tbslen) { - RSA_PKEY_CTX *rctx = ctx->data; - RSA *rsa = ctx->pkey->pkey.rsa; - const size_t key_len = EVP_PKEY_size(ctx->pkey); - - if (!sig) { - *siglen = key_len; - return 1; - } - - if (*siglen < key_len) { - OPENSSL_PUT_ERROR(EVP, EVP_R_BUFFER_TOO_SMALL); - 
return 0; - } - - if (rctx->md) { - unsigned out_len; - switch (rctx->pad_mode) { - case RSA_PKCS1_PADDING: - if (!RSA_sign(EVP_MD_type(rctx->md), tbs, tbslen, sig, &out_len, rsa)) { - return 0; - } - *siglen = out_len; - return 1; - - case RSA_PKCS1_PSS_PADDING: - return RSA_sign_pss_mgf1(rsa, siglen, sig, *siglen, tbs, tbslen, - rctx->md, rctx->mgf1md, rctx->saltlen); - - default: - return 0; - } - } - - return RSA_sign_raw(rsa, siglen, sig, *siglen, tbs, tbslen, rctx->pad_mode); -} - -static int pkey_rsa_verify(EVP_PKEY_CTX *ctx, const uint8_t *sig, - size_t siglen, const uint8_t *tbs, - size_t tbslen) { - RSA_PKEY_CTX *rctx = ctx->data; - RSA *rsa = ctx->pkey->pkey.rsa; - - if (rctx->md) { - switch (rctx->pad_mode) { - case RSA_PKCS1_PADDING: - return RSA_verify(EVP_MD_type(rctx->md), tbs, tbslen, sig, siglen, rsa); - - case RSA_PKCS1_PSS_PADDING: - return RSA_verify_pss_mgf1(rsa, tbs, tbslen, rctx->md, rctx->mgf1md, - rctx->saltlen, sig, siglen); - - default: - return 0; - } - } - - size_t rslen; - const size_t key_len = EVP_PKEY_size(ctx->pkey); - if (!setup_tbuf(rctx, ctx) || - !RSA_verify_raw(rsa, &rslen, rctx->tbuf, key_len, sig, siglen, - rctx->pad_mode) || - rslen != tbslen || - CRYPTO_memcmp(tbs, rctx->tbuf, rslen) != 0) { - return 0; - } - - return 1; -} - -static int pkey_rsa_verify_recover(EVP_PKEY_CTX *ctx, uint8_t *out, - size_t *out_len, const uint8_t *sig, - size_t sig_len) { - RSA_PKEY_CTX *rctx = ctx->data; - RSA *rsa = ctx->pkey->pkey.rsa; - const size_t key_len = EVP_PKEY_size(ctx->pkey); - - if (out == NULL) { - *out_len = key_len; - return 1; - } - - if (*out_len < key_len) { - OPENSSL_PUT_ERROR(EVP, EVP_R_BUFFER_TOO_SMALL); - return 0; - } - - if (rctx->md == NULL) { - return RSA_verify_raw(rsa, out_len, out, *out_len, sig, sig_len, - rctx->pad_mode); - } - - if (rctx->pad_mode != RSA_PKCS1_PADDING) { - return 0; - } - - // Assemble the encoded hash, using a placeholder hash value. 
- static const uint8_t kDummyHash[EVP_MAX_MD_SIZE] = {0}; - const size_t hash_len = EVP_MD_size(rctx->md); - uint8_t *asn1_prefix; - size_t asn1_prefix_len; - int asn1_prefix_allocated; - if (!setup_tbuf(rctx, ctx) || - !RSA_add_pkcs1_prefix(&asn1_prefix, &asn1_prefix_len, - &asn1_prefix_allocated, EVP_MD_type(rctx->md), - kDummyHash, hash_len)) { - return 0; - } - - size_t rslen; - int ok = 1; - if (!RSA_verify_raw(rsa, &rslen, rctx->tbuf, key_len, sig, sig_len, - RSA_PKCS1_PADDING) || - rslen != asn1_prefix_len || - // Compare all but the hash suffix. - CRYPTO_memcmp(rctx->tbuf, asn1_prefix, asn1_prefix_len - hash_len) != 0) { - ok = 0; - } - - if (asn1_prefix_allocated) { - OPENSSL_free(asn1_prefix); - } - - if (!ok) { - return 0; - } - - if (out != NULL) { - OPENSSL_memcpy(out, rctx->tbuf + rslen - hash_len, hash_len); - } - *out_len = hash_len; - - return 1; -} - -static int pkey_rsa_encrypt(EVP_PKEY_CTX *ctx, uint8_t *out, size_t *outlen, - const uint8_t *in, size_t inlen) { - RSA_PKEY_CTX *rctx = ctx->data; - RSA *rsa = ctx->pkey->pkey.rsa; - const size_t key_len = EVP_PKEY_size(ctx->pkey); - - if (!out) { - *outlen = key_len; - return 1; - } - - if (*outlen < key_len) { - OPENSSL_PUT_ERROR(EVP, EVP_R_BUFFER_TOO_SMALL); - return 0; - } - - if (rctx->pad_mode == RSA_PKCS1_OAEP_PADDING) { - if (!setup_tbuf(rctx, ctx) || - !RSA_padding_add_PKCS1_OAEP_mgf1(rctx->tbuf, key_len, in, inlen, - rctx->oaep_label, rctx->oaep_labellen, - rctx->md, rctx->mgf1md) || - !RSA_encrypt(rsa, outlen, out, *outlen, rctx->tbuf, key_len, - RSA_NO_PADDING)) { - return 0; - } - return 1; - } - - return RSA_encrypt(rsa, outlen, out, *outlen, in, inlen, rctx->pad_mode); -} - -static int pkey_rsa_decrypt(EVP_PKEY_CTX *ctx, uint8_t *out, - size_t *outlen, const uint8_t *in, - size_t inlen) { - RSA_PKEY_CTX *rctx = ctx->data; - RSA *rsa = ctx->pkey->pkey.rsa; - const size_t key_len = EVP_PKEY_size(ctx->pkey); - - if (!out) { - *outlen = key_len; - return 1; - } - - if (*outlen < key_len) 
{ - OPENSSL_PUT_ERROR(EVP, EVP_R_BUFFER_TOO_SMALL); - return 0; - } - - if (rctx->pad_mode == RSA_PKCS1_OAEP_PADDING) { - size_t padded_len; - if (!setup_tbuf(rctx, ctx) || - !RSA_decrypt(rsa, &padded_len, rctx->tbuf, key_len, in, inlen, - RSA_NO_PADDING) || - !RSA_padding_check_PKCS1_OAEP_mgf1( - out, outlen, key_len, rctx->tbuf, padded_len, rctx->oaep_label, - rctx->oaep_labellen, rctx->md, rctx->mgf1md)) { - return 0; - } - return 1; - } - - return RSA_decrypt(rsa, outlen, out, key_len, in, inlen, rctx->pad_mode); -} - -static int check_padding_md(const EVP_MD *md, int padding) { - if (!md) { - return 1; - } - - if (padding == RSA_NO_PADDING) { - OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_PADDING_MODE); - return 0; - } - - return 1; -} - -static int is_known_padding(int padding_mode) { - switch (padding_mode) { - case RSA_PKCS1_PADDING: - case RSA_NO_PADDING: - case RSA_PKCS1_OAEP_PADDING: - case RSA_PKCS1_PSS_PADDING: - return 1; - default: - return 0; - } -} - -static int pkey_rsa_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2) { - RSA_PKEY_CTX *rctx = ctx->data; - switch (type) { - case EVP_PKEY_CTRL_RSA_PADDING: - if (!is_known_padding(p1) || !check_padding_md(rctx->md, p1) || - (p1 == RSA_PKCS1_PSS_PADDING && - 0 == (ctx->operation & (EVP_PKEY_OP_SIGN | EVP_PKEY_OP_VERIFY))) || - (p1 == RSA_PKCS1_OAEP_PADDING && - 0 == (ctx->operation & EVP_PKEY_OP_TYPE_CRYPT))) { - OPENSSL_PUT_ERROR(EVP, EVP_R_ILLEGAL_OR_UNSUPPORTED_PADDING_MODE); - return 0; - } - if ((p1 == RSA_PKCS1_PSS_PADDING || p1 == RSA_PKCS1_OAEP_PADDING) && - rctx->md == NULL) { - rctx->md = EVP_sha1(); - } - rctx->pad_mode = p1; - return 1; - - case EVP_PKEY_CTRL_GET_RSA_PADDING: - *(int *)p2 = rctx->pad_mode; - return 1; - - case EVP_PKEY_CTRL_RSA_PSS_SALTLEN: - case EVP_PKEY_CTRL_GET_RSA_PSS_SALTLEN: - if (rctx->pad_mode != RSA_PKCS1_PSS_PADDING) { - OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_PSS_SALTLEN); - return 0; - } - if (type == EVP_PKEY_CTRL_GET_RSA_PSS_SALTLEN) { - *(int *)p2 = rctx->saltlen; 
- } else { - if (p1 < -2) { - return 0; - } - rctx->saltlen = p1; - } - return 1; - - case EVP_PKEY_CTRL_RSA_KEYGEN_BITS: - if (p1 < 256) { - OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_KEYBITS); - return 0; - } - rctx->nbits = p1; - return 1; - - case EVP_PKEY_CTRL_RSA_KEYGEN_PUBEXP: - if (!p2) { - return 0; - } - BN_free(rctx->pub_exp); - rctx->pub_exp = p2; - return 1; - - case EVP_PKEY_CTRL_RSA_OAEP_MD: - case EVP_PKEY_CTRL_GET_RSA_OAEP_MD: - if (rctx->pad_mode != RSA_PKCS1_OAEP_PADDING) { - OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_PADDING_MODE); - return 0; - } - if (type == EVP_PKEY_CTRL_GET_RSA_OAEP_MD) { - *(const EVP_MD **)p2 = rctx->md; - } else { - rctx->md = p2; - } - return 1; - - case EVP_PKEY_CTRL_MD: - if (!check_padding_md(p2, rctx->pad_mode)) { - return 0; - } - rctx->md = p2; - return 1; - - case EVP_PKEY_CTRL_GET_MD: - *(const EVP_MD **)p2 = rctx->md; - return 1; - - case EVP_PKEY_CTRL_RSA_MGF1_MD: - case EVP_PKEY_CTRL_GET_RSA_MGF1_MD: - if (rctx->pad_mode != RSA_PKCS1_PSS_PADDING && - rctx->pad_mode != RSA_PKCS1_OAEP_PADDING) { - OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_MGF1_MD); - return 0; - } - if (type == EVP_PKEY_CTRL_GET_RSA_MGF1_MD) { - if (rctx->mgf1md) { - *(const EVP_MD **)p2 = rctx->mgf1md; - } else { - *(const EVP_MD **)p2 = rctx->md; - } - } else { - rctx->mgf1md = p2; - } - return 1; - - case EVP_PKEY_CTRL_RSA_OAEP_LABEL: { - if (rctx->pad_mode != RSA_PKCS1_OAEP_PADDING) { - OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_PADDING_MODE); - return 0; - } - OPENSSL_free(rctx->oaep_label); - RSA_OAEP_LABEL_PARAMS *params = p2; - rctx->oaep_label = params->data; - rctx->oaep_labellen = params->len; - return 1; - } - - case EVP_PKEY_CTRL_GET_RSA_OAEP_LABEL: - if (rctx->pad_mode != RSA_PKCS1_OAEP_PADDING) { - OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_PADDING_MODE); - return 0; - } - CBS_init((CBS *)p2, rctx->oaep_label, rctx->oaep_labellen); - return 1; - - default: - OPENSSL_PUT_ERROR(EVP, EVP_R_COMMAND_NOT_SUPPORTED); - return 0; - } -} - -static int 
pkey_rsa_keygen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey) { - RSA *rsa = NULL; - RSA_PKEY_CTX *rctx = ctx->data; - - if (!rctx->pub_exp) { - rctx->pub_exp = BN_new(); - if (!rctx->pub_exp || !BN_set_word(rctx->pub_exp, RSA_F4)) { - return 0; - } - } - rsa = RSA_new(); - if (!rsa) { - return 0; - } - - if (!RSA_generate_key_ex(rsa, rctx->nbits, rctx->pub_exp, NULL)) { - RSA_free(rsa); - return 0; - } - - EVP_PKEY_assign_RSA(pkey, rsa); - return 1; -} - -const EVP_PKEY_METHOD rsa_pkey_meth = { - EVP_PKEY_RSA, - pkey_rsa_init, - pkey_rsa_copy, - pkey_rsa_cleanup, - pkey_rsa_keygen, - pkey_rsa_sign, - NULL /* sign_message */, - pkey_rsa_verify, - NULL /* verify_message */, - pkey_rsa_verify_recover, - pkey_rsa_encrypt, - pkey_rsa_decrypt, - NULL /* derive */, - NULL /* paramgen */, - pkey_rsa_ctrl, -}; - -int EVP_PKEY_CTX_set_rsa_padding(EVP_PKEY_CTX *ctx, int padding) { - return EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, -1, EVP_PKEY_CTRL_RSA_PADDING, - padding, NULL); -} - -int EVP_PKEY_CTX_get_rsa_padding(EVP_PKEY_CTX *ctx, int *out_padding) { - return EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, -1, EVP_PKEY_CTRL_GET_RSA_PADDING, - 0, out_padding); -} - -int EVP_PKEY_CTX_set_rsa_pss_keygen_md(EVP_PKEY_CTX *ctx, const EVP_MD *md) { - return 0; -} - -int EVP_PKEY_CTX_set_rsa_pss_keygen_saltlen(EVP_PKEY_CTX *ctx, int salt_len) { - return 0; -} - -int EVP_PKEY_CTX_set_rsa_pss_keygen_mgf1_md(EVP_PKEY_CTX *ctx, - const EVP_MD *md) { - return 0; -} - -int EVP_PKEY_CTX_set_rsa_pss_saltlen(EVP_PKEY_CTX *ctx, int salt_len) { - return EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, - (EVP_PKEY_OP_SIGN | EVP_PKEY_OP_VERIFY), - EVP_PKEY_CTRL_RSA_PSS_SALTLEN, salt_len, NULL); -} - -int EVP_PKEY_CTX_get_rsa_pss_saltlen(EVP_PKEY_CTX *ctx, int *out_salt_len) { - return EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, - (EVP_PKEY_OP_SIGN | EVP_PKEY_OP_VERIFY), - EVP_PKEY_CTRL_GET_RSA_PSS_SALTLEN, 0, out_salt_len); -} - -int EVP_PKEY_CTX_set_rsa_keygen_bits(EVP_PKEY_CTX *ctx, int bits) { - return EVP_PKEY_CTX_ctrl(ctx, 
EVP_PKEY_RSA, EVP_PKEY_OP_KEYGEN, - EVP_PKEY_CTRL_RSA_KEYGEN_BITS, bits, NULL); -} - -int EVP_PKEY_CTX_set_rsa_keygen_pubexp(EVP_PKEY_CTX *ctx, BIGNUM *e) { - return EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, EVP_PKEY_OP_KEYGEN, - EVP_PKEY_CTRL_RSA_KEYGEN_PUBEXP, 0, e); -} - -int EVP_PKEY_CTX_set_rsa_oaep_md(EVP_PKEY_CTX *ctx, const EVP_MD *md) { - return EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, EVP_PKEY_OP_TYPE_CRYPT, - EVP_PKEY_CTRL_RSA_OAEP_MD, 0, (void *)md); -} - -int EVP_PKEY_CTX_get_rsa_oaep_md(EVP_PKEY_CTX *ctx, const EVP_MD **out_md) { - return EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, EVP_PKEY_OP_TYPE_CRYPT, - EVP_PKEY_CTRL_GET_RSA_OAEP_MD, 0, (void*) out_md); -} - -int EVP_PKEY_CTX_set_rsa_mgf1_md(EVP_PKEY_CTX *ctx, const EVP_MD *md) { - return EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, - EVP_PKEY_OP_TYPE_SIG | EVP_PKEY_OP_TYPE_CRYPT, - EVP_PKEY_CTRL_RSA_MGF1_MD, 0, (void*) md); -} - -int EVP_PKEY_CTX_get_rsa_mgf1_md(EVP_PKEY_CTX *ctx, const EVP_MD **out_md) { - return EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, - EVP_PKEY_OP_TYPE_SIG | EVP_PKEY_OP_TYPE_CRYPT, - EVP_PKEY_CTRL_GET_RSA_MGF1_MD, 0, (void*) out_md); -} - -int EVP_PKEY_CTX_set0_rsa_oaep_label(EVP_PKEY_CTX *ctx, uint8_t *label, - size_t label_len) { - RSA_OAEP_LABEL_PARAMS params = {label, label_len}; - return EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, EVP_PKEY_OP_TYPE_CRYPT, - EVP_PKEY_CTRL_RSA_OAEP_LABEL, 0, ¶ms); -} - -int EVP_PKEY_CTX_get0_rsa_oaep_label(EVP_PKEY_CTX *ctx, - const uint8_t **out_label) { - CBS label; - if (!EVP_PKEY_CTX_ctrl(ctx, EVP_PKEY_RSA, EVP_PKEY_OP_TYPE_CRYPT, - EVP_PKEY_CTRL_GET_RSA_OAEP_LABEL, 0, &label)) { - return -1; - } - if (CBS_len(&label) > INT_MAX) { - OPENSSL_PUT_ERROR(EVP, ERR_R_OVERFLOW); - return -1; - } - *out_label = CBS_data(&label); - return (int)CBS_len(&label); -} diff --git a/third_party/boringssl/src/crypto/evp/p_rsa.cc b/third_party/boringssl/src/crypto/evp/p_rsa.cc new file mode 100644 index 00000000..90038538 --- /dev/null +++ b/third_party/boringssl/src/crypto/evp/p_rsa.cc 
@@ -0,0 +1,1117 @@ +// Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../fipsmodule/rsa/internal.h" +#include "../internal.h" +#include "../mem_internal.h" +#include "../rsa/internal.h" +#include "internal.h" + + +using namespace bssl; + +namespace { + +struct EVP_PKEY_ALG_RSA_PSS : public EVP_PKEY_ALG { + rsa_pss_params_t pss_params; +}; + +extern const EVP_PKEY_ASN1_METHOD rsa_asn1_meth; +extern const EVP_PKEY_ASN1_METHOD rsa_pss_asn1_meth; +extern const EVP_PKEY_CTX_METHOD rsa_pkey_meth; +extern const EVP_PKEY_CTX_METHOD rsa_pss_pkey_meth; + +static int rsa_pub_encode(CBB *out, const EvpPkey *key) { + // See RFC 3279, section 2.3.1. 
+ const RSA *rsa = reinterpret_cast(key->pkey); + CBB spki, algorithm, null, key_bitstring; + if (!CBB_add_asn1(out, &spki, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1(&spki, &algorithm, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_element(&algorithm, CBS_ASN1_OBJECT, rsa_asn1_meth.oid, + rsa_asn1_meth.oid_len) || + !CBB_add_asn1(&algorithm, &null, CBS_ASN1_NULL) || + !CBB_add_asn1(&spki, &key_bitstring, CBS_ASN1_BITSTRING) || + !CBB_add_u8(&key_bitstring, 0 /* padding */) || + !RSA_marshal_public_key(&key_bitstring, rsa) || // + !CBB_flush(out)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_ENCODE_ERROR); + return 0; + } + + return 1; +} + +static bssl::evp_decode_result_t rsa_pub_decode(const EVP_PKEY_ALG *alg, + EvpPkey *out, CBS *params, + CBS *key) { + // See RFC 3279, section 2.3.1. + + // The parameters must be NULL. + CBS null; + if (!CBS_get_asn1(params, &null, CBS_ASN1_NULL) || CBS_len(&null) != 0 || + CBS_len(params) != 0) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return evp_decode_error; + } + + UniquePtr rsa(RSA_public_key_from_bytes(CBS_data(key), CBS_len(key))); + if (rsa == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return evp_decode_error; + } + + EVP_PKEY_assign_RSA(out, rsa.release()); + return evp_decode_ok; +} + +static bool rsa_pub_equal(const EvpPkey *a, const EvpPkey *b) { + // We currently assume that all |EVP_PKEY_RSA_PSS| keys have the same + // parameters, so this vacuously compares parameters. If we ever support + // multiple PSS parameter sets, we probably should compare them too. Note, + // however, that OpenSSL does not compare parameters here. + const RSA *a_rsa = reinterpret_cast(a->pkey); + const RSA *b_rsa = reinterpret_cast(b->pkey); + return BN_cmp(RSA_get0_n(b_rsa), RSA_get0_n(a_rsa)) == 0 && + BN_cmp(RSA_get0_e(b_rsa), RSA_get0_e(a_rsa)) == 0; +} + +static bool rsa_pub_present(const EvpPkey *pk) { + const RSA *pk_rsa = reinterpret_cast(pk->pkey); + // An RSA public key should always have n and e. 
It's possible for a (private) + // key to have n and d, but not e, so we must explicitly check for the + // presence of e. + return RSA_get0_n(pk_rsa) != nullptr && RSA_get0_e(pk_rsa) != nullptr; +} + +static bool rsa_pub_copy(EvpPkey *out, const EvpPkey *pkey) { + const RSAImpl *pk_rsa = reinterpret_cast(pkey->pkey); + const BIGNUM *pk_n = RSA_get0_n(pk_rsa); + const BIGNUM *pk_e = RSA_get0_e(pk_rsa); + if (pk_n == nullptr || pk_e == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_MISSING_PUBLIC_KEY); + return false; + } + UniquePtr public_copy_rsa(RSA_new_public_key(pk_n, pk_e)); + if (!public_copy_rsa) { + OPENSSL_PUT_ERROR(EVP, ERR_R_INTERNAL_ERROR); + return false; + } + FromOpaque(public_copy_rsa.get())->pss_params = pk_rsa->pss_params; + evp_pkey_set0(out, pkey->ameth, public_copy_rsa.release()); + return true; +} + +static int rsa_priv_encode(CBB *out, const EvpPkey *key) { + const RSA *rsa = reinterpret_cast(key->pkey); + CBB pkcs8, algorithm, null, private_key; + if (!CBB_add_asn1(out, &pkcs8, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_uint64(&pkcs8, 0 /* version */) || + !CBB_add_asn1(&pkcs8, &algorithm, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_element(&algorithm, CBS_ASN1_OBJECT, rsa_asn1_meth.oid, + rsa_asn1_meth.oid_len) || + !CBB_add_asn1(&algorithm, &null, CBS_ASN1_NULL) || + !CBB_add_asn1(&pkcs8, &private_key, CBS_ASN1_OCTETSTRING) || + !RSA_marshal_private_key(&private_key, rsa) || // + !CBB_flush(out)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_ENCODE_ERROR); + return 0; + } + + return 1; +} + +static bssl::evp_decode_result_t rsa_priv_decode(const EVP_PKEY_ALG *alg, + EvpPkey *out, CBS *params, + CBS *key) { + // Per RFC 8017, A.1, the parameters have type NULL. 
+ CBS null; + if (!CBS_get_asn1(params, &null, CBS_ASN1_NULL) || CBS_len(&null) != 0 || + CBS_len(params) != 0) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return evp_decode_error; + } + + UniquePtr rsa(RSA_private_key_from_bytes(CBS_data(key), CBS_len(key))); + if (rsa == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return evp_decode_error; + } + + EVP_PKEY_assign_RSA(out, rsa.release()); + return evp_decode_ok; +} + +static bool rsa_priv_present(const EvpPkey *pk) { + const RSA *pk_rsa = reinterpret_cast(pk->pkey); + return RSA_get0_n(pk_rsa) != nullptr && RSA_get0_d(pk_rsa) != nullptr; +} + +static bssl::evp_decode_result_t rsa_decode_pss_params( + rsa_pss_params_t expected, CBS *params) { + if (CBS_len(params) == 0) { + return evp_decode_unsupported; + } + rsa_pss_params_t pss_params; + if (!rsa_parse_pss_params(params, &pss_params, + /*allow_explicit_trailer=*/false) || + CBS_len(params) != 0) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return evp_decode_error; + } + return pss_params == expected ? 
evp_decode_ok : evp_decode_unsupported; +} + +static int rsa_pub_encode_pss(CBB *out, const EvpPkey *key) { + const RSAImpl *rsa = reinterpret_cast(key->pkey); + CBB spki, algorithm, key_bitstring; + if (!CBB_add_asn1(out, &spki, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1(&spki, &algorithm, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_element(&algorithm, CBS_ASN1_OBJECT, rsa_pss_asn1_meth.oid, + rsa_pss_asn1_meth.oid_len) || + !rsa_marshal_pss_params(&algorithm, rsa->pss_params) || + !CBB_add_asn1(&spki, &key_bitstring, CBS_ASN1_BITSTRING) || + !CBB_add_u8(&key_bitstring, 0 /* padding */) || + !RSA_marshal_public_key(&key_bitstring, rsa) || // + !CBB_flush(out)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_ENCODE_ERROR); + return 0; + } + + return 1; +} + +static bssl::evp_decode_result_t rsa_pub_decode_pss(const EVP_PKEY_ALG *alg, + EvpPkey *out, CBS *params, + CBS *key) { + const auto *alg_pss = static_cast(alg); + evp_decode_result_t ret = rsa_decode_pss_params(alg_pss->pss_params, params); + if (ret != evp_decode_ok) { + return ret; + } + + UniquePtr rsa( + FromOpaque(RSA_public_key_from_bytes(CBS_data(key), CBS_len(key)))); + if (rsa == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return evp_decode_error; + } + + rsa->pss_params = alg_pss->pss_params; + evp_pkey_set0(out, &rsa_pss_asn1_meth, rsa.release()); + return evp_decode_ok; +} + +static int rsa_priv_encode_pss(CBB *out, const EvpPkey *key) { + const RSAImpl *rsa = reinterpret_cast(key->pkey); + CBB pkcs8, algorithm, private_key; + if (!CBB_add_asn1(out, &pkcs8, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_uint64(&pkcs8, 0 /* version */) || + !CBB_add_asn1(&pkcs8, &algorithm, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_element(&algorithm, CBS_ASN1_OBJECT, rsa_pss_asn1_meth.oid, + rsa_pss_asn1_meth.oid_len) || + !rsa_marshal_pss_params(&algorithm, rsa->pss_params) || + !CBB_add_asn1(&pkcs8, &private_key, CBS_ASN1_OCTETSTRING) || + !RSA_marshal_private_key(&private_key, rsa) || // + !CBB_flush(out)) { + OPENSSL_PUT_ERROR(EVP, 
EVP_R_ENCODE_ERROR); + return 0; + } + + return 1; +} + +static bssl::evp_decode_result_t rsa_priv_decode_pss(const EVP_PKEY_ALG *alg, + EvpPkey *out, CBS *params, + CBS *key) { + const auto *alg_pss = static_cast(alg); + evp_decode_result_t ret = rsa_decode_pss_params(alg_pss->pss_params, params); + if (ret != evp_decode_ok) { + return ret; + } + + UniquePtr rsa( + FromOpaque(RSA_private_key_from_bytes(CBS_data(key), CBS_len(key)))); + if (rsa == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return evp_decode_error; + } + + rsa->pss_params = alg_pss->pss_params; + evp_pkey_set0(out, &rsa_pss_asn1_meth, rsa.release()); + return evp_decode_ok; +} + +static int rsa_opaque(const EvpPkey *pkey) { + const RSA *rsa = reinterpret_cast(pkey->pkey); + return RSA_is_opaque(rsa); +} + +static int int_rsa_size(const EvpPkey *pkey) { + const RSA *rsa = reinterpret_cast(pkey->pkey); + return RSA_size(rsa); +} + +static int rsa_bits(const EvpPkey *pkey) { + const RSA *rsa = reinterpret_cast(pkey->pkey); + return RSA_bits(rsa); +} + +static void int_rsa_free(EvpPkey *pkey) { + RSA_free(reinterpret_cast(pkey->pkey)); + pkey->pkey = nullptr; +} + +static int rsa_pss_params_missing(const EvpPkey *pkey) { + const RSA *rsa = reinterpret_cast(pkey->pkey); + return rsa == nullptr || FromOpaque(rsa)->pss_params == rsa_pss_none; +} + +static int rsa_pss_params_copy(EvpPkey *to, const EvpPkey *from) { + const RSA *from_key = reinterpret_cast(from->pkey); + if (from_key == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_NO_KEY_SET); + return 0; + } + rsa_pss_params_t pss_params = FromOpaque(from_key)->pss_params; + if (pss_params == rsa_pss_none) { + OPENSSL_PUT_ERROR(EVP, EVP_R_MISSING_PARAMETERS); + return 0; + } + if (to->pkey == nullptr) { + to->pkey = RSA_new(); + if (to->pkey == nullptr) { + return 0; + } + } + FromOpaque(reinterpret_cast(to->pkey))->pss_params = pss_params; + return 1; +} + +static bool rsa_pss_params_equal(const EvpPkey *a, const EvpPkey *b) { + const RSA 
*a_rsa = reinterpret_cast(a->pkey); + const RSA *b_rsa = reinterpret_cast(b->pkey); + if (a_rsa == nullptr || b_rsa == nullptr) { + return false; + } + rsa_pss_params_t a_pss_params = FromOpaque(a_rsa)->pss_params; + rsa_pss_params_t b_pss_params = FromOpaque(b_rsa)->pss_params; + if (a_pss_params == rsa_pss_none || b_pss_params == rsa_pss_none) { + return false; + } + return a_pss_params == b_pss_params; +} + +const EVP_PKEY_ASN1_METHOD rsa_asn1_meth = { + EVP_PKEY_RSA, + // 1.2.840.113549.1.1.1 + {0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x01, 0x01, 0x01}, + 9, + + &rsa_pkey_meth, + + rsa_pub_decode, + rsa_pub_encode, + rsa_pub_equal, + rsa_pub_present, + rsa_pub_copy, + + rsa_priv_decode, + rsa_priv_encode, + rsa_priv_present, + + /*set_priv_raw=*/nullptr, + /*set_priv_seed=*/nullptr, + /*set_pub_raw=*/nullptr, + /*get_priv_raw=*/nullptr, + /*get_priv_seed=*/nullptr, + /*get_pub_raw=*/nullptr, + /*set1_tls_encodedpoint=*/nullptr, + /*get1_tls_encodedpoint=*/nullptr, + + rsa_opaque, + + int_rsa_size, + rsa_bits, + + /*param_missing=*/nullptr, + /*param_copy=*/nullptr, + /*param_equal=*/nullptr, + + int_rsa_free, +}; + +const EVP_PKEY_ASN1_METHOD rsa_pss_asn1_meth = { + EVP_PKEY_RSA_PSS, + // 1.2.840.113549.1.1.10 + {0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x01, 0x01, 0x0a}, + 9, + + &rsa_pss_pkey_meth, + + rsa_pub_decode_pss, + rsa_pub_encode_pss, + rsa_pub_equal, + rsa_pub_present, + rsa_pub_copy, + + rsa_priv_decode_pss, + rsa_priv_encode_pss, + rsa_priv_present, + + /*set_priv_raw=*/nullptr, + /*set_priv_seed=*/nullptr, + /*set_pub_raw=*/nullptr, + /*get_priv_raw=*/nullptr, + /*get_priv_seed=*/nullptr, + /*get_pub_raw=*/nullptr, + /*set1_tls_encodedpoint=*/nullptr, + /*get1_tls_encodedpoint=*/nullptr, + + rsa_opaque, + + int_rsa_size, + rsa_bits, + + rsa_pss_params_missing, + rsa_pss_params_copy, + rsa_pss_params_equal, + + int_rsa_free, +}; + + +struct RSA_PKEY_CTX { + // Key gen parameters + int nbits = 2048; + UniquePtr pub_exp; + // RSA padding mode + int pad_mode 
= RSA_PKCS1_PADDING; + // message digest + const EVP_MD *md = nullptr; + // message digest for MGF1 + const EVP_MD *mgf1md = nullptr; + // PSS salt length + int saltlen = RSA_PSS_SALTLEN_DIGEST; + // restrict_pss_params, if true, indicates that the PSS signing/verifying + // parameters are restricted by the key's parameters. |md| and |mgf1md| may + // not change, and |saltlen| must be at least |md|'s hash length. + bool restrict_pss_params = false; + Array oaep_label; +}; + +static bool is_pss_only(const EvpPkeyCtx *ctx) { + return ctx->pmeth->pkey_id == EVP_PKEY_RSA_PSS; +} + +static int pkey_rsa_init(EvpPkeyCtx *ctx, const EVP_PKEY_ALG *alg) { + RSA_PKEY_CTX *rctx = New(); + if (!rctx) { + return 0; + } + + if (is_pss_only(ctx)) { + rctx->pad_mode = RSA_PKCS1_PSS_PADDING; + // Pick up PSS parameters from the key or algorithm. We don't currently + // support keygen from PSS, so the algorithm does not currently do anything. + rsa_pss_params_t pss_params = rsa_pss_none; + const auto *alg_pss = static_cast(alg); + if (alg_pss != nullptr) { + pss_params = alg_pss->pss_params; + } else if (ctx->pkey != nullptr && ctx->pkey->pkey != nullptr) { + pss_params = static_cast(ctx->pkey->pkey)->pss_params; + } + const EVP_MD *md = rsa_pss_params_get_md(pss_params); + if (md != nullptr) { + rctx->md = rctx->mgf1md = md; + // All our supported modes use the digest length as the salt length. 
+ rctx->saltlen = EVP_MD_size(rctx->md); + rctx->restrict_pss_params = true; + } + } + + ctx->data = rctx; + return 1; +} + +static int pkey_rsa_copy(EvpPkeyCtx *dst, EvpPkeyCtx *src) { + RSA_PKEY_CTX *dctx, *sctx; + if (!pkey_rsa_init(dst, nullptr)) { + return 0; + } + sctx = reinterpret_cast(src->data); + dctx = reinterpret_cast(dst->data); + dctx->nbits = sctx->nbits; + if (sctx->pub_exp) { + dctx->pub_exp.reset(BN_dup(sctx->pub_exp.get())); + if (!dctx->pub_exp) { + return 0; + } + } + + dctx->pad_mode = sctx->pad_mode; + dctx->md = sctx->md; + dctx->mgf1md = sctx->mgf1md; + dctx->saltlen = sctx->saltlen; + dctx->restrict_pss_params = sctx->restrict_pss_params; + if (!dctx->oaep_label.CopyFrom(sctx->oaep_label)) { + return 0; + } + + return 1; +} + +static void pkey_rsa_cleanup(EvpPkeyCtx *ctx) { + Delete(reinterpret_cast(ctx->data)); +} + +static int pkey_rsa_sign(EvpPkeyCtx *ctx, uint8_t *sig, size_t *siglen, + const uint8_t *tbs, size_t tbslen) { + RSA_PKEY_CTX *rctx = reinterpret_cast(ctx->data); + RSA *rsa = reinterpret_cast(ctx->pkey->pkey); + const size_t key_len = EVP_PKEY_size(ctx->pkey.get()); + + if (!sig) { + *siglen = key_len; + return 1; + } + + if (*siglen < key_len) { + OPENSSL_PUT_ERROR(EVP, EVP_R_BUFFER_TOO_SMALL); + return 0; + } + + if (rctx->md) { + unsigned out_len; + switch (rctx->pad_mode) { + case RSA_PKCS1_PADDING: + if (!RSA_sign(EVP_MD_type(rctx->md), tbs, tbslen, sig, &out_len, rsa)) { + return 0; + } + *siglen = out_len; + return 1; + + case RSA_PKCS1_PSS_PADDING: + return RSA_sign_pss_mgf1(rsa, siglen, sig, *siglen, tbs, tbslen, + rctx->md, rctx->mgf1md, rctx->saltlen); + + default: + return 0; + } + } + + return RSA_sign_raw(rsa, siglen, sig, *siglen, tbs, tbslen, rctx->pad_mode); +} + +static int pkey_rsa_verify(EvpPkeyCtx *ctx, const uint8_t *sig, size_t siglen, + const uint8_t *tbs, size_t tbslen) { + RSA_PKEY_CTX *rctx = reinterpret_cast(ctx->data); + RSA *rsa = reinterpret_cast(ctx->pkey->pkey); + + if (rctx->md) { + switch 
(rctx->pad_mode) { + case RSA_PKCS1_PADDING: + return RSA_verify(EVP_MD_type(rctx->md), tbs, tbslen, sig, siglen, rsa); + + case RSA_PKCS1_PSS_PADDING: + return RSA_verify_pss_mgf1(rsa, tbs, tbslen, rctx->md, rctx->mgf1md, + rctx->saltlen, sig, siglen); + + default: + return 0; + } + } + + size_t rslen; + const size_t key_len = EVP_PKEY_size(ctx->pkey.get()); + Array tbuf; + if (!tbuf.InitForOverwrite(key_len) || + !RSA_verify_raw(rsa, &rslen, tbuf.data(), tbuf.size(), sig, siglen, + rctx->pad_mode)) { + return 0; + } + if (rslen != tbslen || CRYPTO_memcmp(tbs, tbuf.data(), rslen) != 0) { + OPENSSL_PUT_ERROR(RSA, RSA_R_BAD_SIGNATURE); + return 0; + } + + return 1; +} + +static int pkey_rsa_verify_recover(EvpPkeyCtx *ctx, uint8_t *out, + size_t *out_len, const uint8_t *sig, + size_t sig_len) { + RSA_PKEY_CTX *rctx = reinterpret_cast(ctx->data); + RSA *rsa = reinterpret_cast(ctx->pkey->pkey); + const size_t key_len = EVP_PKEY_size(ctx->pkey.get()); + + if (out == nullptr) { + *out_len = key_len; + return 1; + } + + if (*out_len < key_len) { + OPENSSL_PUT_ERROR(EVP, EVP_R_BUFFER_TOO_SMALL); + return 0; + } + + if (rctx->md == nullptr) { + return RSA_verify_raw(rsa, out_len, out, *out_len, sig, sig_len, + rctx->pad_mode); + } + + if (rctx->pad_mode != RSA_PKCS1_PADDING) { + return 0; + } + + // Assemble the encoded hash, using a placeholder hash value. + static const uint8_t kDummyHash[EVP_MAX_MD_SIZE] = {0}; + const size_t hash_len = EVP_MD_size(rctx->md); + uint8_t *asn1_prefix; + size_t asn1_prefix_len; + int asn1_prefix_allocated; + if (!RSA_add_pkcs1_prefix(&asn1_prefix, &asn1_prefix_len, + &asn1_prefix_allocated, EVP_MD_type(rctx->md), + kDummyHash, hash_len)) { + return 0; + } + UniquePtr free_asn1_prefix(asn1_prefix_allocated ? 
asn1_prefix + : nullptr); + + Array tbuf; + size_t rslen; + if (!tbuf.InitForOverwrite(key_len) || + !RSA_verify_raw(rsa, &rslen, tbuf.data(), tbuf.size(), sig, sig_len, + RSA_PKCS1_PADDING) || + rslen != asn1_prefix_len || + // Compare all but the hash suffix. + CRYPTO_memcmp(tbuf.data(), asn1_prefix, asn1_prefix_len - hash_len) != + 0) { + return 0; + } + + if (out != nullptr) { + OPENSSL_memcpy(out, tbuf.data() + rslen - hash_len, hash_len); + } + *out_len = hash_len; + + return 1; +} + +static int pkey_rsa_encrypt(EvpPkeyCtx *ctx, uint8_t *out, size_t *outlen, + const uint8_t *in, size_t inlen) { + RSA_PKEY_CTX *rctx = reinterpret_cast(ctx->data); + RSA *rsa = reinterpret_cast(ctx->pkey->pkey); + const size_t key_len = EVP_PKEY_size(ctx->pkey.get()); + + if (!out) { + *outlen = key_len; + return 1; + } + + if (*outlen < key_len) { + OPENSSL_PUT_ERROR(EVP, EVP_R_BUFFER_TOO_SMALL); + return 0; + } + + if (rctx->pad_mode == RSA_PKCS1_OAEP_PADDING) { + Array tbuf; + if (!tbuf.InitForOverwrite(key_len) || + !RSA_padding_add_PKCS1_OAEP_mgf1( + tbuf.data(), tbuf.size(), in, inlen, rctx->oaep_label.data(), + rctx->oaep_label.size(), rctx->md, rctx->mgf1md) || + !RSA_encrypt(rsa, outlen, out, *outlen, tbuf.data(), tbuf.size(), + RSA_NO_PADDING)) { + return 0; + } + return 1; + } + + return RSA_encrypt(rsa, outlen, out, *outlen, in, inlen, rctx->pad_mode); +} + +static int pkey_rsa_decrypt(EvpPkeyCtx *ctx, uint8_t *out, size_t *outlen, + const uint8_t *in, size_t inlen) { + RSA_PKEY_CTX *rctx = reinterpret_cast(ctx->data); + RSA *rsa = reinterpret_cast(ctx->pkey->pkey); + const size_t key_len = EVP_PKEY_size(ctx->pkey.get()); + + if (!out) { + *outlen = key_len; + return 1; + } + + if (*outlen < key_len) { + OPENSSL_PUT_ERROR(EVP, EVP_R_BUFFER_TOO_SMALL); + return 0; + } + + if (rctx->pad_mode == RSA_PKCS1_OAEP_PADDING) { + Array tbuf; + size_t padded_len; + if (!tbuf.InitForOverwrite(key_len) || + !RSA_decrypt(rsa, &padded_len, tbuf.data(), tbuf.size(), in, inlen, + 
RSA_NO_PADDING) || + !RSA_padding_check_PKCS1_OAEP_mgf1(out, outlen, key_len, tbuf.data(), + padded_len, rctx->oaep_label.data(), + rctx->oaep_label.size(), rctx->md, + rctx->mgf1md)) { + return 0; + } + return 1; + } + + return RSA_decrypt(rsa, outlen, out, key_len, in, inlen, rctx->pad_mode); +} + +static int check_padding_md(const EVP_MD *md, int padding) { + if (!md) { + return 1; + } + + if (padding == RSA_NO_PADDING) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_PADDING_MODE); + return 0; + } + + return 1; +} + +static int is_known_padding(int padding_mode) { + switch (padding_mode) { + case RSA_PKCS1_PADDING: + case RSA_NO_PADDING: + case RSA_PKCS1_OAEP_PADDING: + case RSA_PKCS1_PSS_PADDING: + return 1; + default: + return 0; + } +} + +static int pkey_rsa_ctrl(EvpPkeyCtx *ctx, int type, int p1, void *p2) { + RSA_PKEY_CTX *rctx = reinterpret_cast(ctx->data); + switch (type) { + case EVP_PKEY_CTRL_RSA_PADDING: + // PSS keys cannot be switched to other padding types. + if (is_pss_only(ctx) && p1 != RSA_PKCS1_PSS_PADDING) { + OPENSSL_PUT_ERROR(EVP, EVP_R_ILLEGAL_OR_UNSUPPORTED_PADDING_MODE); + return 0; + } + if (!is_known_padding(p1) || !check_padding_md(rctx->md, p1) || + (p1 == RSA_PKCS1_PSS_PADDING && + 0 == (ctx->operation & (EVP_PKEY_OP_SIGN | EVP_PKEY_OP_VERIFY))) || + (p1 == RSA_PKCS1_OAEP_PADDING && + 0 == (ctx->operation & EVP_PKEY_OP_TYPE_CRYPT))) { + OPENSSL_PUT_ERROR(EVP, EVP_R_ILLEGAL_OR_UNSUPPORTED_PADDING_MODE); + return 0; + } + if (p1 == RSA_PKCS1_OAEP_PADDING && rctx->md == nullptr) { + rctx->md = EVP_sha1(); + } + rctx->pad_mode = p1; + return 1; + + case EVP_PKEY_CTRL_GET_RSA_PADDING: + *(int *)p2 = rctx->pad_mode; + return 1; + + case EVP_PKEY_CTRL_RSA_PSS_SALTLEN: + case EVP_PKEY_CTRL_GET_RSA_PSS_SALTLEN: + if (rctx->pad_mode != RSA_PKCS1_PSS_PADDING) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_PSS_SALTLEN); + return 0; + } + if (type == EVP_PKEY_CTRL_GET_RSA_PSS_SALTLEN) { + *(int *)p2 = rctx->saltlen; + } else { + // Negative salt lengths are 
special values. + if (p1 < 0) { + if (p1 != RSA_PSS_SALTLEN_DIGEST && p1 != RSA_PSS_SALTLEN_AUTO) { + return 0; + } + // All our PSS restrictions accept saltlen == hashlen, so allow + // |RSA_PSS_SALTLEN_DIGEST|. Reject |RSA_PSS_SALTLEN_AUTO| for + // simplicity. + if (rctx->restrict_pss_params && p1 != RSA_PSS_SALTLEN_DIGEST) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_PSS_SALTLEN); + return 0; + } + } else if (rctx->restrict_pss_params && + static_cast(p1) < EVP_MD_size(rctx->md)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_PSS_SALTLEN); + return 0; + } + rctx->saltlen = p1; + } + return 1; + + case EVP_PKEY_CTRL_RSA_KEYGEN_BITS: + if (p1 < 256) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_KEYBITS); + return 0; + } + rctx->nbits = p1; + return 1; + + case EVP_PKEY_CTRL_RSA_KEYGEN_PUBEXP: + if (!p2) { + return 0; + } + rctx->pub_exp.reset(reinterpret_cast(p2)); + return 1; + + case EVP_PKEY_CTRL_RSA_OAEP_MD: + case EVP_PKEY_CTRL_GET_RSA_OAEP_MD: + if (rctx->pad_mode != RSA_PKCS1_OAEP_PADDING) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_PADDING_MODE); + return 0; + } + if (type == EVP_PKEY_CTRL_GET_RSA_OAEP_MD) { + *(const EVP_MD **)p2 = rctx->md; + } else { + rctx->md = reinterpret_cast(p2); + } + return 1; + + case EVP_PKEY_CTRL_MD: { + const EVP_MD *md = reinterpret_cast(p2); + if (!check_padding_md(md, rctx->pad_mode)) { + return 0; + } + if (rctx->restrict_pss_params && + EVP_MD_type(rctx->md) != EVP_MD_type(md)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_DIGEST_TYPE); + return 0; + } + rctx->md = md; + return 1; + } + + case EVP_PKEY_CTRL_GET_MD: + *(const EVP_MD **)p2 = rctx->md; + return 1; + + case EVP_PKEY_CTRL_RSA_MGF1_MD: + case EVP_PKEY_CTRL_GET_RSA_MGF1_MD: + if (rctx->pad_mode != RSA_PKCS1_PSS_PADDING && + rctx->pad_mode != RSA_PKCS1_OAEP_PADDING) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_MGF1_MD); + return 0; + } + if (type == EVP_PKEY_CTRL_GET_RSA_MGF1_MD) { + if (rctx->mgf1md) { + *(const EVP_MD **)p2 = rctx->mgf1md; + } else { + *(const EVP_MD **)p2 = rctx->md; 
+ } + } else { + const EVP_MD *md = reinterpret_cast(p2); + if (rctx->restrict_pss_params && + EVP_MD_type(rctx->mgf1md) != EVP_MD_type(md)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_MGF1_MD); + return 0; + } + rctx->mgf1md = md; + } + return 1; + + case EVP_PKEY_CTRL_RSA_OAEP_LABEL: { + if (rctx->pad_mode != RSA_PKCS1_OAEP_PADDING) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_PADDING_MODE); + return 0; + } + // |EVP_PKEY_CTRL_RSA_OAEP_LABEL| takes ownership of |label|'s underlying + // buffer (via |Reset|), but only on success. + auto *label = reinterpret_cast *>(p2); + rctx->oaep_label.Reset(label->data(), label->size()); + return 1; + } + + case EVP_PKEY_CTRL_GET_RSA_OAEP_LABEL: + if (rctx->pad_mode != RSA_PKCS1_OAEP_PADDING) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_PADDING_MODE); + return 0; + } + *reinterpret_cast(p2) = CBS(rctx->oaep_label); + return 1; + + default: + OPENSSL_PUT_ERROR(EVP, EVP_R_COMMAND_NOT_SUPPORTED); + return 0; + } +} + +static int pkey_rsa_keygen(EvpPkeyCtx *ctx, EvpPkey *pkey) { + RSA_PKEY_CTX *rctx = reinterpret_cast(ctx->data); + if (!rctx->pub_exp) { + rctx->pub_exp.reset(BN_new()); + if (!rctx->pub_exp || !BN_set_word(rctx->pub_exp.get(), RSA_F4)) { + return 0; + } + } + UniquePtr rsa(RSA_new()); + if (!rsa) { + return 0; + } + + if (!RSA_generate_key_ex(rsa.get(), rctx->nbits, rctx->pub_exp.get(), + nullptr)) { + return 0; + } + + EVP_PKEY_assign_RSA(pkey, rsa.release()); + return 1; +} + +const EVP_PKEY_CTX_METHOD rsa_pkey_meth = { + EVP_PKEY_RSA, + pkey_rsa_init, + pkey_rsa_copy, + pkey_rsa_cleanup, + pkey_rsa_keygen, + pkey_rsa_sign, + /*sign_message=*/nullptr, + pkey_rsa_verify, + /*verify_message=*/nullptr, + pkey_rsa_verify_recover, + pkey_rsa_encrypt, + pkey_rsa_decrypt, + /*derive=*/nullptr, + /*paramgen=*/nullptr, + /*encap=*/nullptr, + /*decap=*/nullptr, + pkey_rsa_ctrl, +}; + +const EVP_PKEY_CTX_METHOD rsa_pss_pkey_meth = { + EVP_PKEY_RSA_PSS, + pkey_rsa_init, + pkey_rsa_copy, + pkey_rsa_cleanup, + // In OpenSSL, 
|EVP_PKEY_RSA_PSS| supports key generation and fills in PSS + // parameters based on a separate set of keygen-targetted setters: + // |EVP_PKEY_CTX_set_rsa_pss_keygen_saltlen|, + // |EVP_PKEY_CTX_set_rsa_pss_keygen_mgf1_md|, and + // |EVP_PKEY_CTX_rsa_pss_key_digest|. We do not currently implement this + // because we only support one parameter set. + /*keygen=*/nullptr, + pkey_rsa_sign, + /*sign_message=*/nullptr, + pkey_rsa_verify, + /*verify_message=*/nullptr, + /*verify_recover=*/nullptr, + /*encrypt=*/nullptr, + /*decrypt=*/nullptr, + /*derive=*/nullptr, + /*paramgen=*/nullptr, + /*encap=*/nullptr, + /*decap=*/nullptr, + pkey_rsa_ctrl, +}; + +} // namespace + +const EVP_PKEY_ALG *EVP_pkey_rsa() { + static const EVP_PKEY_ALG kAlg = {&rsa_asn1_meth, &rsa_pkey_meth}; + return &kAlg; +} + +const EVP_PKEY_ALG *EVP_pkey_rsa_pss_sha256() { + static const EVP_PKEY_ALG_RSA_PSS kAlg = { + {&rsa_pss_asn1_meth, &rsa_pss_pkey_meth}, rsa_pss_sha256}; + return &kAlg; +} + +const EVP_PKEY_ALG *EVP_pkey_rsa_pss_sha384() { + static const EVP_PKEY_ALG_RSA_PSS kAlg = { + {&rsa_pss_asn1_meth, &rsa_pss_pkey_meth}, rsa_pss_sha384}; + return &kAlg; +} + +const EVP_PKEY_ALG *EVP_pkey_rsa_pss_sha512() { + static const EVP_PKEY_ALG_RSA_PSS kAlg = { + {&rsa_pss_asn1_meth, &rsa_pss_pkey_meth}, rsa_pss_sha512}; + return &kAlg; +} + +EVP_PKEY *EVP_RSA_gen(unsigned bits) { + // TODO(crbug.com/487376811): After EVP_PKEY_CTX is switched to C++ + // subclassing, it should be possible to stack-allocate enough the + // RSA-specific subclass. 
+ UniquePtr ctx = evp_pkey_ctx_new_alg(EVP_pkey_rsa()); + EVP_PKEY *pkey = nullptr; + if (ctx == nullptr || // + !EVP_PKEY_keygen_init(ctx.get()) || + !EVP_PKEY_CTX_set_rsa_keygen_bits(ctx.get(), bits) || + !EVP_PKEY_keygen(ctx.get(), &pkey)) { + return nullptr; + } + return pkey; +} + +int EVP_PKEY_set1_RSA(EVP_PKEY *pkey, RSA *key) { + if (EVP_PKEY_assign_RSA(pkey, key)) { + RSA_up_ref(key); + return 1; + } + return 0; +} + +int EVP_PKEY_assign_RSA(EVP_PKEY *pkey, RSA *key) { + if (key == nullptr) { + return 0; + } + evp_pkey_set0(FromOpaque(pkey), &rsa_asn1_meth, key); + return 1; +} + +RSA *EVP_PKEY_get0_RSA(const EVP_PKEY *pkey) { + int pkey_id = EVP_PKEY_id(pkey); + if (pkey_id != EVP_PKEY_RSA && pkey_id != EVP_PKEY_RSA_PSS) { + OPENSSL_PUT_ERROR(EVP, EVP_R_EXPECTING_AN_RSA_KEY); + return nullptr; + } + return reinterpret_cast(FromOpaque(pkey)->pkey); +} + +RSA *EVP_PKEY_get1_RSA(const EVP_PKEY *pkey) { + RSA *rsa = EVP_PKEY_get0_RSA(pkey); + if (rsa != nullptr) { + RSA_up_ref(rsa); + } + return rsa; +} + +static int rsa_or_rsa_pss_ctrl(EvpPkeyCtx *ctx, int optype, int cmd, int p1, + void *p2) { + if (!ctx || !ctx->pmeth || !ctx->pmeth->ctrl) { + OPENSSL_PUT_ERROR(EVP, EVP_R_COMMAND_NOT_SUPPORTED); + return 0; + } + if (ctx->pmeth->pkey_id != EVP_PKEY_RSA && + ctx->pmeth->pkey_id != EVP_PKEY_RSA_PSS) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); + return 0; + } + return EVP_PKEY_CTX_ctrl(ctx, /*keytype=*/-1, optype, cmd, p1, p2); +} + +int EVP_PKEY_CTX_set_rsa_padding(EVP_PKEY_CTX *ctx, int padding) { + return rsa_or_rsa_pss_ctrl(FromOpaque(ctx), -1, EVP_PKEY_CTRL_RSA_PADDING, + padding, nullptr); +} + +int EVP_PKEY_CTX_get_rsa_padding(EVP_PKEY_CTX *ctx, int *out_padding) { + return rsa_or_rsa_pss_ctrl(FromOpaque(ctx), -1, EVP_PKEY_CTRL_GET_RSA_PADDING, + 0, out_padding); +} + +int EVP_PKEY_CTX_set_rsa_pss_keygen_md(EVP_PKEY_CTX *ctx, const EVP_MD *md) { + // We currently do not support keygen with |EVP_PKEY_RSA_PSS|. 
+ return 0; +} + +int EVP_PKEY_CTX_set_rsa_pss_keygen_saltlen(EVP_PKEY_CTX *ctx, int salt_len) { + // We currently do not support keygen with |EVP_PKEY_RSA_PSS|. + return 0; +} + +int EVP_PKEY_CTX_set_rsa_pss_keygen_mgf1_md(EVP_PKEY_CTX *ctx, + const EVP_MD *md) { + // We currently do not support keygen with |EVP_PKEY_RSA_PSS|. + return 0; +} + +int EVP_PKEY_CTX_set_rsa_pss_saltlen(EVP_PKEY_CTX *ctx, int salt_len) { + return rsa_or_rsa_pss_ctrl(FromOpaque(ctx), + (EVP_PKEY_OP_SIGN | EVP_PKEY_OP_VERIFY), + EVP_PKEY_CTRL_RSA_PSS_SALTLEN, salt_len, nullptr); +} + +int EVP_PKEY_CTX_get_rsa_pss_saltlen(EVP_PKEY_CTX *ctx, int *out_salt_len) { + return rsa_or_rsa_pss_ctrl( + FromOpaque(ctx), (EVP_PKEY_OP_SIGN | EVP_PKEY_OP_VERIFY), + EVP_PKEY_CTRL_GET_RSA_PSS_SALTLEN, 0, out_salt_len); +} + +int EVP_PKEY_CTX_set_rsa_keygen_bits(EVP_PKEY_CTX *ctx, int bits) { + return rsa_or_rsa_pss_ctrl(FromOpaque(ctx), EVP_PKEY_OP_KEYGEN, + EVP_PKEY_CTRL_RSA_KEYGEN_BITS, bits, nullptr); +} + +int EVP_PKEY_CTX_set_rsa_keygen_pubexp(EVP_PKEY_CTX *ctx, BIGNUM *e) { + return rsa_or_rsa_pss_ctrl(FromOpaque(ctx), EVP_PKEY_OP_KEYGEN, + EVP_PKEY_CTRL_RSA_KEYGEN_PUBEXP, 0, e); +} + +int EVP_PKEY_CTX_set_rsa_oaep_md(EVP_PKEY_CTX *ctx, const EVP_MD *md) { + return EVP_PKEY_CTX_ctrl(FromOpaque(ctx), EVP_PKEY_RSA, + EVP_PKEY_OP_TYPE_CRYPT, EVP_PKEY_CTRL_RSA_OAEP_MD, 0, + (void *)md); +} + +int EVP_PKEY_CTX_get_rsa_oaep_md(EVP_PKEY_CTX *ctx, const EVP_MD **out_md) { + return EVP_PKEY_CTX_ctrl(FromOpaque(ctx), EVP_PKEY_RSA, + EVP_PKEY_OP_TYPE_CRYPT, + EVP_PKEY_CTRL_GET_RSA_OAEP_MD, 0, (void *)out_md); +} + +int EVP_PKEY_CTX_set_rsa_mgf1_md(EVP_PKEY_CTX *ctx, const EVP_MD *md) { + return rsa_or_rsa_pss_ctrl(FromOpaque(ctx), + EVP_PKEY_OP_TYPE_SIG | EVP_PKEY_OP_TYPE_CRYPT, + EVP_PKEY_CTRL_RSA_MGF1_MD, 0, (void *)md); +} + +int EVP_PKEY_CTX_get_rsa_mgf1_md(EVP_PKEY_CTX *ctx, const EVP_MD **out_md) { + return rsa_or_rsa_pss_ctrl(FromOpaque(ctx), + EVP_PKEY_OP_TYPE_SIG | EVP_PKEY_OP_TYPE_CRYPT, + 
EVP_PKEY_CTRL_GET_RSA_MGF1_MD, 0, (void *)out_md); +} + +int EVP_PKEY_CTX_set0_rsa_oaep_label(EVP_PKEY_CTX *ctx, uint8_t *label, + size_t label_len) { + Span span(label, label_len); + return EVP_PKEY_CTX_ctrl(FromOpaque(ctx), EVP_PKEY_RSA, + EVP_PKEY_OP_TYPE_CRYPT, EVP_PKEY_CTRL_RSA_OAEP_LABEL, + 0, &span); +} + +int EVP_PKEY_CTX_get0_rsa_oaep_label(EVP_PKEY_CTX *ctx, + const uint8_t **out_label) { + CBS label; + if (!EVP_PKEY_CTX_ctrl(FromOpaque(ctx), EVP_PKEY_RSA, EVP_PKEY_OP_TYPE_CRYPT, + EVP_PKEY_CTRL_GET_RSA_OAEP_LABEL, 0, &label)) { + return -1; + } + if (CBS_len(&label) > INT_MAX) { + OPENSSL_PUT_ERROR(EVP, ERR_R_OVERFLOW); + return -1; + } + *out_label = CBS_data(&label); + return (int)CBS_len(&label); +} diff --git a/third_party/boringssl/src/crypto/evp/p_rsa_asn1.c b/third_party/boringssl/src/crypto/evp/p_rsa_asn1.c deleted file mode 100644 index 2e4942a1..00000000 --- a/third_party/boringssl/src/crypto/evp/p_rsa_asn1.c +++ /dev/null @@ -1,196 +0,0 @@ -/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL - * project 2006. - */ -/* ==================================================================== - * Copyright (c) 2006 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. 
(http://www.OpenSSL.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * licensing@OpenSSL.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). */ - -#include - -#include -#include -#include -#include -#include -#include - -#include "../fipsmodule/rsa/internal.h" -#include "internal.h" - - -static int rsa_pub_encode(CBB *out, const EVP_PKEY *key) { - // See RFC 3279, section 2.3.1. 
- CBB spki, algorithm, oid, null, key_bitstring; - if (!CBB_add_asn1(out, &spki, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1(&spki, &algorithm, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1(&algorithm, &oid, CBS_ASN1_OBJECT) || - !CBB_add_bytes(&oid, rsa_asn1_meth.oid, rsa_asn1_meth.oid_len) || - !CBB_add_asn1(&algorithm, &null, CBS_ASN1_NULL) || - !CBB_add_asn1(&spki, &key_bitstring, CBS_ASN1_BITSTRING) || - !CBB_add_u8(&key_bitstring, 0 /* padding */) || - !RSA_marshal_public_key(&key_bitstring, key->pkey.rsa) || - !CBB_flush(out)) { - OPENSSL_PUT_ERROR(EVP, EVP_R_ENCODE_ERROR); - return 0; - } - - return 1; -} - -static int rsa_pub_decode(EVP_PKEY *out, CBS *params, CBS *key) { - // See RFC 3279, section 2.3.1. - - // The parameters must be NULL. - CBS null; - if (!CBS_get_asn1(params, &null, CBS_ASN1_NULL) || - CBS_len(&null) != 0 || - CBS_len(params) != 0) { - OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); - return 0; - } - - RSA *rsa = RSA_parse_public_key(key); - if (rsa == NULL || CBS_len(key) != 0) { - OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); - RSA_free(rsa); - return 0; - } - - EVP_PKEY_assign_RSA(out, rsa); - return 1; -} - -static int rsa_pub_cmp(const EVP_PKEY *a, const EVP_PKEY *b) { - return BN_cmp(b->pkey.rsa->n, a->pkey.rsa->n) == 0 && - BN_cmp(b->pkey.rsa->e, a->pkey.rsa->e) == 0; -} - -static int rsa_priv_encode(CBB *out, const EVP_PKEY *key) { - CBB pkcs8, algorithm, oid, null, private_key; - if (!CBB_add_asn1(out, &pkcs8, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1_uint64(&pkcs8, 0 /* version */) || - !CBB_add_asn1(&pkcs8, &algorithm, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1(&algorithm, &oid, CBS_ASN1_OBJECT) || - !CBB_add_bytes(&oid, rsa_asn1_meth.oid, rsa_asn1_meth.oid_len) || - !CBB_add_asn1(&algorithm, &null, CBS_ASN1_NULL) || - !CBB_add_asn1(&pkcs8, &private_key, CBS_ASN1_OCTETSTRING) || - !RSA_marshal_private_key(&private_key, key->pkey.rsa) || - !CBB_flush(out)) { - OPENSSL_PUT_ERROR(EVP, EVP_R_ENCODE_ERROR); - return 0; - } - - return 1; -} - -static int 
rsa_priv_decode(EVP_PKEY *out, CBS *params, CBS *key) { - // Per RFC 3447, A.1, the parameters have type NULL. - CBS null; - if (!CBS_get_asn1(params, &null, CBS_ASN1_NULL) || - CBS_len(&null) != 0 || - CBS_len(params) != 0) { - OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); - return 0; - } - - RSA *rsa = RSA_parse_private_key(key); - if (rsa == NULL || CBS_len(key) != 0) { - OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); - RSA_free(rsa); - return 0; - } - - EVP_PKEY_assign_RSA(out, rsa); - return 1; -} - -static int rsa_opaque(const EVP_PKEY *pkey) { - return RSA_is_opaque(pkey->pkey.rsa); -} - -static int int_rsa_size(const EVP_PKEY *pkey) { - return RSA_size(pkey->pkey.rsa); -} - -static int rsa_bits(const EVP_PKEY *pkey) { - return RSA_bits(pkey->pkey.rsa); -} - -static void int_rsa_free(EVP_PKEY *pkey) { RSA_free(pkey->pkey.rsa); } - -const EVP_PKEY_ASN1_METHOD rsa_asn1_meth = { - EVP_PKEY_RSA, - // 1.2.840.113549.1.1.1 - {0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x01, 0x01, 0x01}, 9, - - &rsa_pkey_meth, - - rsa_pub_decode, - rsa_pub_encode, - rsa_pub_cmp, - - rsa_priv_decode, - rsa_priv_encode, - - NULL /* set_priv_raw */, - NULL /* set_pub_raw */, - NULL /* get_priv_raw */, - NULL /* get_pub_raw */, - - rsa_opaque, - - int_rsa_size, - rsa_bits, - - 0,0,0, - - int_rsa_free, -}; diff --git a/third_party/boringssl/src/crypto/evp/p_x25519.c b/third_party/boringssl/src/crypto/evp/p_x25519.c deleted file mode 100644 index ed7df39a..00000000 --- a/third_party/boringssl/src/crypto/evp/p_x25519.c +++ /dev/null @@ -1,110 +0,0 @@ -/* Copyright (c) 2019, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include - -#include -#include -#include - -#include "internal.h" - - -// X25519 has no parameters to copy. -static int pkey_x25519_copy(EVP_PKEY_CTX *dst, EVP_PKEY_CTX *src) { return 1; } - -static int pkey_x25519_keygen(EVP_PKEY_CTX *ctx, EVP_PKEY *pkey) { - X25519_KEY *key = OPENSSL_malloc(sizeof(X25519_KEY)); - if (key == NULL) { - OPENSSL_PUT_ERROR(EVP, ERR_R_MALLOC_FAILURE); - return 0; - } - - if (!EVP_PKEY_set_type(pkey, EVP_PKEY_X25519)) { - OPENSSL_free(key); - return 0; - } - - X25519_keypair(key->pub, key->priv); - key->has_private = 1; - - OPENSSL_free(pkey->pkey.ptr); - pkey->pkey.ptr = key; - return 1; -} - -static int pkey_x25519_derive(EVP_PKEY_CTX *ctx, uint8_t *out, - size_t *out_len) { - if (ctx->pkey == NULL || ctx->peerkey == NULL) { - OPENSSL_PUT_ERROR(EVP, EVP_R_KEYS_NOT_SET); - return 0; - } - - const X25519_KEY *our_key = ctx->pkey->pkey.ptr; - const X25519_KEY *peer_key = ctx->peerkey->pkey.ptr; - if (our_key == NULL || peer_key == NULL) { - OPENSSL_PUT_ERROR(EVP, EVP_R_KEYS_NOT_SET); - return 0; - } - - if (!our_key->has_private) { - OPENSSL_PUT_ERROR(EVP, EVP_R_NOT_A_PRIVATE_KEY); - return 0; - } - - if (out != NULL) { - if (*out_len < 32) { - OPENSSL_PUT_ERROR(EVP, EVP_R_BUFFER_TOO_SMALL); - return 0; - } - if (!X25519(out, our_key->priv, peer_key->pub)) { - OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_PEER_KEY); - return 0; - } - } - - *out_len = 32; - return 1; -} - -static int pkey_x25519_ctrl(EVP_PKEY_CTX *ctx, int type, int p1, void *p2) { - switch (type) { - case EVP_PKEY_CTRL_PEER_KEY: - // |EVP_PKEY_derive_set_peer| requires the key implement this command, - // even if it is a no-op. 
- return 1; - - default: - OPENSSL_PUT_ERROR(EVP, EVP_R_COMMAND_NOT_SUPPORTED); - return 0; - } -} - -const EVP_PKEY_METHOD x25519_pkey_meth = { - EVP_PKEY_X25519, - NULL /* init */, - pkey_x25519_copy, - NULL /* cleanup */, - pkey_x25519_keygen, - NULL /* sign */, - NULL /* sign_message */, - NULL /* verify */, - NULL /* verify_message */, - NULL /* verify_recover */, - NULL /* encrypt */, - NULL /* decrypt */, - pkey_x25519_derive, - NULL /* paramgen */, - pkey_x25519_ctrl, -}; diff --git a/third_party/boringssl/src/crypto/evp/p_x25519.cc b/third_party/boringssl/src/crypto/evp/p_x25519.cc new file mode 100644 index 00000000..db38d26f --- /dev/null +++ b/third_party/boringssl/src/crypto/evp/p_x25519.cc @@ -0,0 +1,371 @@ +// Copyright 2019 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include +#include +#include + +#include "../internal.h" +#include "../mem_internal.h" +#include "internal.h" + + +using namespace bssl; + +namespace { + +struct X25519_KEY { + uint8_t pub[32]; + uint8_t priv[32]; + bool has_private; +}; + +extern const EVP_PKEY_ASN1_METHOD x25519_asn1_meth; +extern const EVP_PKEY_CTX_METHOD x25519_pkey_meth; + +static void x25519_free(EvpPkey *pkey) { + X25519_KEY *key = reinterpret_cast(pkey->pkey); + OPENSSL_free(key); + pkey->pkey = nullptr; +} + +static int x25519_set_priv_raw(EvpPkey *pkey, const uint8_t *in, size_t len) { + if (len != 32) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return 0; + } + + X25519_KEY *key = New(); + if (key == nullptr) { + return 0; + } + + OPENSSL_memcpy(key->priv, in, 32); + X25519_public_from_private(key->pub, key->priv); + key->has_private = true; + + evp_pkey_set0(pkey, &x25519_asn1_meth, key); + return 1; +} + +static int x25519_set_pub_raw(EvpPkey *pkey, const uint8_t *in, size_t len) { + if (len != 32) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return 0; + } + + X25519_KEY *key = New(); + if (key == nullptr) { + return 0; + } + + OPENSSL_memcpy(key->pub, in, 32); + key->has_private = false; + + evp_pkey_set0(pkey, &x25519_asn1_meth, key); + return 1; +} + +static int x25519_get_priv_raw(const EvpPkey *pkey, uint8_t *out, + size_t *out_len) { + const X25519_KEY *key = reinterpret_cast(pkey->pkey); + if (!key->has_private) { + OPENSSL_PUT_ERROR(EVP, EVP_R_NOT_A_PRIVATE_KEY); + return 0; + } + + if (out == nullptr) { + *out_len = 32; + return 1; + } + + if (*out_len < 32) { + OPENSSL_PUT_ERROR(EVP, EVP_R_BUFFER_TOO_SMALL); + return 0; + } + + OPENSSL_memcpy(out, key->priv, 32); + *out_len = 32; + return 1; +} + +static int x25519_get_pub_raw(const EvpPkey *pkey, uint8_t *out, + size_t *out_len) { + const X25519_KEY *key = reinterpret_cast(pkey->pkey); + if (out == nullptr) { + *out_len = 32; + return 1; + } + + if (*out_len < 32) { + OPENSSL_PUT_ERROR(EVP, 
EVP_R_BUFFER_TOO_SMALL); + return 0; + } + + OPENSSL_memcpy(out, key->pub, 32); + *out_len = 32; + return 1; +} + +static int x25519_set1_tls_encodedpoint(EvpPkey *pkey, const uint8_t *in, + size_t len) { + return x25519_set_pub_raw(pkey, in, len); +} + +static size_t x25519_get1_tls_encodedpoint(const EvpPkey *pkey, + uint8_t **out_ptr) { + const X25519_KEY *key = reinterpret_cast(pkey->pkey); + if (key == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_NO_KEY_SET); + return 0; + } + + *out_ptr = reinterpret_cast(OPENSSL_memdup(key->pub, 32)); + return *out_ptr == nullptr ? 0 : 32; +} + +static bssl::evp_decode_result_t x25519_pub_decode(const EVP_PKEY_ALG *alg, + EvpPkey *out, CBS *params, + CBS *key) { + // See RFC 8410, section 4. + + // The parameters must be omitted. Public keys have length 32. + if (CBS_len(params) != 0) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return evp_decode_error; + } + + return x25519_set_pub_raw(out, CBS_data(key), CBS_len(key)) + ? evp_decode_ok + : evp_decode_error; +} + +static int x25519_pub_encode(CBB *out, const EvpPkey *pkey) { + const X25519_KEY *key = reinterpret_cast(pkey->pkey); + + // See RFC 8410, section 4. 
+ CBB spki, algorithm, key_bitstring; + if (!CBB_add_asn1(out, &spki, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1(&spki, &algorithm, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_element(&algorithm, CBS_ASN1_OBJECT, x25519_asn1_meth.oid, + x25519_asn1_meth.oid_len) || + !CBB_add_asn1(&spki, &key_bitstring, CBS_ASN1_BITSTRING) || + !CBB_add_u8(&key_bitstring, 0 /* padding */) || + !CBB_add_bytes(&key_bitstring, key->pub, 32) || // + !CBB_flush(out)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_ENCODE_ERROR); + return 0; + } + + return 1; +} + +static bool x25519_pub_equal(const EvpPkey *a, const EvpPkey *b) { + const X25519_KEY *a_key = reinterpret_cast(a->pkey); + const X25519_KEY *b_key = reinterpret_cast(b->pkey); + return OPENSSL_memcmp(a_key->pub, b_key->pub, 32) == 0; +} + +static bssl::evp_decode_result_t x25519_priv_decode(const EVP_PKEY_ALG *alg, + EvpPkey *out, CBS *params, + CBS *key) { + // See RFC 8410, section 7. + + // Parameters must be empty. The key is a 32-byte value wrapped in an extra + // OCTET STRING layer. + CBS inner; + if (CBS_len(params) != 0 || + !CBS_get_asn1(key, &inner, CBS_ASN1_OCTETSTRING) || CBS_len(key) != 0) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return evp_decode_error; + } + + return x25519_set_priv_raw(out, CBS_data(&inner), CBS_len(&inner)) + ? evp_decode_ok + : evp_decode_error; +} + +static int x25519_priv_encode(CBB *out, const EvpPkey *pkey) { + const X25519_KEY *key = reinterpret_cast(pkey->pkey); + if (!key->has_private) { + OPENSSL_PUT_ERROR(EVP, EVP_R_NOT_A_PRIVATE_KEY); + return 0; + } + + // See RFC 8410, section 7. 
+ CBB pkcs8, algorithm, private_key, inner; + if (!CBB_add_asn1(out, &pkcs8, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_uint64(&pkcs8, 0 /* version */) || + !CBB_add_asn1(&pkcs8, &algorithm, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_element(&algorithm, CBS_ASN1_OBJECT, x25519_asn1_meth.oid, + x25519_asn1_meth.oid_len) || + !CBB_add_asn1(&pkcs8, &private_key, CBS_ASN1_OCTETSTRING) || + !CBB_add_asn1(&private_key, &inner, CBS_ASN1_OCTETSTRING) || + // The PKCS#8 encoding stores only the 32-byte seed which is the first 32 + // bytes of the private key. + !CBB_add_bytes(&inner, key->priv, 32) || // + !CBB_flush(out)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_ENCODE_ERROR); + return 0; + } + + return 1; +} + +static bool x25519_pub_present(const EvpPkey *) { return true; } + +static bool x25519_pub_copy(EvpPkey *out, const EvpPkey *pkey) { + const X25519_KEY *pkey_x25519 = + reinterpret_cast(pkey->pkey); + X25519_KEY *public_copy = New(); + if (public_copy == nullptr) { + return false; + } + OPENSSL_memcpy(public_copy->pub, pkey_x25519->pub, 32); + public_copy->has_private = false; + evp_pkey_set0(out, pkey->ameth, public_copy); + return true; +} + +static bool x25519_priv_present(const EvpPkey *pk) { + const X25519_KEY *key = reinterpret_cast(pk->pkey); + return key->has_private; +} + +static int x25519_size(const EvpPkey *pkey) { return 32; } + +static int x25519_bits(const EvpPkey *pkey) { return 253; } + +const EVP_PKEY_ASN1_METHOD x25519_asn1_meth = { + EVP_PKEY_X25519, + {0x2b, 0x65, 0x6e}, + 3, + &x25519_pkey_meth, + x25519_pub_decode, + x25519_pub_encode, + x25519_pub_equal, + x25519_pub_present, + x25519_pub_copy, + x25519_priv_decode, + x25519_priv_encode, + x25519_priv_present, + x25519_set_priv_raw, + /*set_priv_seed=*/nullptr, + x25519_set_pub_raw, + x25519_get_priv_raw, + /*get_priv_seed=*/nullptr, + x25519_get_pub_raw, + x25519_set1_tls_encodedpoint, + x25519_get1_tls_encodedpoint, + /*pkey_opaque=*/nullptr, + x25519_size, + x25519_bits, + /*param_missing=*/nullptr, + 
/*param_copy=*/nullptr, + /*param_equal=*/nullptr, + x25519_free, +}; + +// X25519 has no parameters to copy. +static int pkey_x25519_copy(EvpPkeyCtx *dst, EvpPkeyCtx *src) { return 1; } + +static int pkey_x25519_keygen(EvpPkeyCtx *ctx, EvpPkey *pkey) { + X25519_KEY *key = New(); + if (key == nullptr) { + return 0; + } + + X25519_keypair(key->pub, key->priv); + key->has_private = true; + evp_pkey_set0(pkey, &x25519_asn1_meth, key); + return 1; +} + +static int pkey_x25519_derive(EvpPkeyCtx *ctx, uint8_t *out, size_t *out_len) { + if (ctx->pkey == nullptr || ctx->peerkey == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_KEYS_NOT_SET); + return 0; + } + + const X25519_KEY *our_key = + reinterpret_cast(ctx->pkey->pkey); + const X25519_KEY *peer_key = + reinterpret_cast(ctx->peerkey->pkey); + if (our_key == nullptr || peer_key == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_KEYS_NOT_SET); + return 0; + } + + if (!our_key->has_private) { + OPENSSL_PUT_ERROR(EVP, EVP_R_NOT_A_PRIVATE_KEY); + return 0; + } + + if (out != nullptr) { + if (*out_len < 32) { + OPENSSL_PUT_ERROR(EVP, EVP_R_BUFFER_TOO_SMALL); + return 0; + } + if (!X25519(out, our_key->priv, peer_key->pub)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_PEER_KEY); + return 0; + } + } + + *out_len = 32; + return 1; +} + +static int pkey_x25519_ctrl(EvpPkeyCtx *ctx, int type, int p1, void *p2) { + switch (type) { + case EVP_PKEY_CTRL_PEER_KEY: + // |EVP_PKEY_derive_set_peer| requires the key implement this command, + // even if it is a no-op. 
+ return 1; + + default: + OPENSSL_PUT_ERROR(EVP, EVP_R_COMMAND_NOT_SUPPORTED); + return 0; + } +} + +const EVP_PKEY_CTX_METHOD x25519_pkey_meth = { + /*pkey_id=*/EVP_PKEY_X25519, + /*init=*/nullptr, + /*copy=*/pkey_x25519_copy, + /*cleanup=*/nullptr, + /*keygen=*/pkey_x25519_keygen, + /*sign=*/nullptr, + /*sign_message=*/nullptr, + /*verify=*/nullptr, + /*verify_message=*/nullptr, + /*verify_recover=*/nullptr, + /*encrypt=*/nullptr, + /*decrypt=*/nullptr, + /*derive=*/pkey_x25519_derive, + /*paramgen=*/nullptr, + /*encap=*/nullptr, + /*decap=*/nullptr, + /*ctrl=*/pkey_x25519_ctrl, +}; + +} // namespace + +const EVP_PKEY_ALG *EVP_pkey_x25519() { + static const EVP_PKEY_ALG kAlg = {&x25519_asn1_meth, &x25519_pkey_meth}; + return &kAlg; +} diff --git a/third_party/boringssl/src/crypto/evp/p_x25519_asn1.c b/third_party/boringssl/src/crypto/evp/p_x25519_asn1.c deleted file mode 100644 index 182f6a2d..00000000 --- a/third_party/boringssl/src/crypto/evp/p_x25519_asn1.c +++ /dev/null @@ -1,249 +0,0 @@ -/* Copyright (c) 2019, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ - -#include - -#include -#include -#include -#include - -#include "internal.h" -#include "../internal.h" - - -static void x25519_free(EVP_PKEY *pkey) { - OPENSSL_free(pkey->pkey.ptr); - pkey->pkey.ptr = NULL; -} - -static int x25519_set_priv_raw(EVP_PKEY *pkey, const uint8_t *in, size_t len) { - if (len != 32) { - OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); - return 0; - } - - X25519_KEY *key = OPENSSL_malloc(sizeof(X25519_KEY)); - if (key == NULL) { - OPENSSL_PUT_ERROR(EVP, ERR_R_MALLOC_FAILURE); - return 0; - } - - OPENSSL_memcpy(key->priv, in, 32); - X25519_public_from_private(key->pub, key->priv); - key->has_private = 1; - - x25519_free(pkey); - pkey->pkey.ptr = key; - return 1; -} - -static int x25519_set_pub_raw(EVP_PKEY *pkey, const uint8_t *in, size_t len) { - if (len != 32) { - OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); - return 0; - } - - X25519_KEY *key = OPENSSL_malloc(sizeof(X25519_KEY)); - if (key == NULL) { - OPENSSL_PUT_ERROR(EVP, ERR_R_MALLOC_FAILURE); - return 0; - } - - OPENSSL_memcpy(key->pub, in, 32); - key->has_private = 0; - - x25519_free(pkey); - pkey->pkey.ptr = key; - return 1; -} - -static int x25519_get_priv_raw(const EVP_PKEY *pkey, uint8_t *out, - size_t *out_len) { - const X25519_KEY *key = pkey->pkey.ptr; - if (!key->has_private) { - OPENSSL_PUT_ERROR(EVP, EVP_R_NOT_A_PRIVATE_KEY); - return 0; - } - - if (out == NULL) { - *out_len = 32; - return 1; - } - - if (*out_len < 32) { - OPENSSL_PUT_ERROR(EVP, EVP_R_BUFFER_TOO_SMALL); - return 0; - } - - OPENSSL_memcpy(out, key->priv, 32); - *out_len = 32; - return 1; -} - -static int x25519_get_pub_raw(const EVP_PKEY *pkey, uint8_t *out, - size_t *out_len) { - const X25519_KEY *key = pkey->pkey.ptr; - if (out == NULL) { - *out_len = 32; - return 1; - } - - if (*out_len < 32) { - OPENSSL_PUT_ERROR(EVP, EVP_R_BUFFER_TOO_SMALL); - return 0; - } - - OPENSSL_memcpy(out, key->pub, 32); - *out_len = 32; - return 1; -} - -static int x25519_pub_decode(EVP_PKEY *out, CBS *params, CBS *key) { - // 
See RFC 8410, section 4. - - // The parameters must be omitted. Public keys have length 32. - if (CBS_len(params) != 0) { - OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); - return 0; - } - - return x25519_set_pub_raw(out, CBS_data(key), CBS_len(key)); -} - -static int x25519_pub_encode(CBB *out, const EVP_PKEY *pkey) { - const X25519_KEY *key = pkey->pkey.ptr; - - // See RFC 8410, section 4. - CBB spki, algorithm, oid, key_bitstring; - if (!CBB_add_asn1(out, &spki, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1(&spki, &algorithm, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1(&algorithm, &oid, CBS_ASN1_OBJECT) || - !CBB_add_bytes(&oid, x25519_asn1_meth.oid, x25519_asn1_meth.oid_len) || - !CBB_add_asn1(&spki, &key_bitstring, CBS_ASN1_BITSTRING) || - !CBB_add_u8(&key_bitstring, 0 /* padding */) || - !CBB_add_bytes(&key_bitstring, key->pub, 32) || - !CBB_flush(out)) { - OPENSSL_PUT_ERROR(EVP, EVP_R_ENCODE_ERROR); - return 0; - } - - return 1; -} - -static int x25519_pub_cmp(const EVP_PKEY *a, const EVP_PKEY *b) { - const X25519_KEY *a_key = a->pkey.ptr; - const X25519_KEY *b_key = b->pkey.ptr; - return OPENSSL_memcmp(a_key->pub, b_key->pub, 32) == 0; -} - -static int x25519_priv_decode(EVP_PKEY *out, CBS *params, CBS *key) { - // See RFC 8410, section 7. - - // Parameters must be empty. The key is a 32-byte value wrapped in an extra - // OCTET STRING layer. - CBS inner; - if (CBS_len(params) != 0 || - !CBS_get_asn1(key, &inner, CBS_ASN1_OCTETSTRING) || - CBS_len(key) != 0) { - OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); - return 0; - } - - return x25519_set_priv_raw(out, CBS_data(&inner), CBS_len(&inner)); -} - -static int x25519_priv_encode(CBB *out, const EVP_PKEY *pkey) { - X25519_KEY *key = pkey->pkey.ptr; - if (!key->has_private) { - OPENSSL_PUT_ERROR(EVP, EVP_R_NOT_A_PRIVATE_KEY); - return 0; - } - - // See RFC 8410, section 7. 
- CBB pkcs8, algorithm, oid, private_key, inner; - if (!CBB_add_asn1(out, &pkcs8, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1_uint64(&pkcs8, 0 /* version */) || - !CBB_add_asn1(&pkcs8, &algorithm, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1(&algorithm, &oid, CBS_ASN1_OBJECT) || - !CBB_add_bytes(&oid, x25519_asn1_meth.oid, x25519_asn1_meth.oid_len) || - !CBB_add_asn1(&pkcs8, &private_key, CBS_ASN1_OCTETSTRING) || - !CBB_add_asn1(&private_key, &inner, CBS_ASN1_OCTETSTRING) || - // The PKCS#8 encoding stores only the 32-byte seed which is the first 32 - // bytes of the private key. - !CBB_add_bytes(&inner, key->priv, 32) || - !CBB_flush(out)) { - OPENSSL_PUT_ERROR(EVP, EVP_R_ENCODE_ERROR); - return 0; - } - - return 1; -} - -static int x25519_size(const EVP_PKEY *pkey) { return 32; } - -static int x25519_bits(const EVP_PKEY *pkey) { return 253; } - -const EVP_PKEY_ASN1_METHOD x25519_asn1_meth = { - EVP_PKEY_X25519, - {0x2b, 0x65, 0x6e}, - 3, - &x25519_pkey_meth, - x25519_pub_decode, - x25519_pub_encode, - x25519_pub_cmp, - x25519_priv_decode, - x25519_priv_encode, - x25519_set_priv_raw, - x25519_set_pub_raw, - x25519_get_priv_raw, - x25519_get_pub_raw, - NULL /* pkey_opaque */, - x25519_size, - x25519_bits, - NULL /* param_missing */, - NULL /* param_copy */, - NULL /* param_cmp */, - x25519_free, -}; - -int EVP_PKEY_set1_tls_encodedpoint(EVP_PKEY *pkey, const uint8_t *in, - size_t len) { - // TODO(davidben): In OpenSSL, this function also works for |EVP_PKEY_EC| - // keys. Add support if it ever comes up. - if (pkey->type != EVP_PKEY_X25519) { - OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_PUBLIC_KEY_TYPE); - return 0; - } - - return x25519_set_pub_raw(pkey, in, len); -} - -size_t EVP_PKEY_get1_tls_encodedpoint(const EVP_PKEY *pkey, uint8_t **out_ptr) { - // TODO(davidben): In OpenSSL, this function also works for |EVP_PKEY_EC| - // keys. Add support if it ever comes up. 
- if (pkey->type != EVP_PKEY_X25519) { - OPENSSL_PUT_ERROR(EVP, EVP_R_UNSUPPORTED_PUBLIC_KEY_TYPE); - return 0; - } - - const X25519_KEY *key = pkey->pkey.ptr; - if (key == NULL) { - OPENSSL_PUT_ERROR(EVP, EVP_R_NO_KEY_SET); - return 0; - } - - *out_ptr = OPENSSL_memdup(key->pub, 32); - return *out_ptr == NULL ? 0 : 32; -} diff --git a/third_party/boringssl/src/crypto/evp/p_xwing.cc b/third_party/boringssl/src/crypto/evp/p_xwing.cc new file mode 100644 index 00000000..2e009682 --- /dev/null +++ b/third_party/boringssl/src/crypto/evp/p_xwing.cc @@ -0,0 +1,273 @@ +// Copyright 2026 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include +#include +#include +#include + +#include "../mem_internal.h" +#include "internal.h" + + +using namespace bssl; + +namespace { + +struct XWING_KEY { + static constexpr bool kAllowUniquePtr = true; + + uint8_t pub[XWING_PUBLIC_KEY_BYTES]; + XWING_private_key priv; + bool has_private; +}; + +extern const EVP_PKEY_ASN1_METHOD xwing_asn1_meth; +extern const EVP_PKEY_CTX_METHOD xwing_pkey_meth; + +static void xwing_free(EvpPkey *pkey) { + Delete(reinterpret_cast(pkey->pkey)); +} + +static bool xwing_pub_equal(const EvpPkey *a, const EvpPkey *b) { + const XWING_KEY *a_key = reinterpret_cast(a->pkey); + const XWING_KEY *b_key = reinterpret_cast(b->pkey); + return OPENSSL_memcmp(a_key->pub, b_key->pub, XWING_PUBLIC_KEY_BYTES) == 0; +} + +static bool xwing_pub_present(const EvpPkey *) { return true; } + +static bool xwing_pub_copy(EvpPkey *out, const EvpPkey *pkey) { + const XWING_KEY *pkey_xwing = reinterpret_cast(pkey->pkey); + auto public_copy = MakeUnique(); + if (public_copy == nullptr) { + return false; + } + OPENSSL_memcpy(public_copy->pub, pkey_xwing->pub, XWING_PUBLIC_KEY_BYTES); + public_copy->has_private = false; + evp_pkey_set0(out, pkey->ameth, public_copy.release()); + return true; +} + +static bool xwing_priv_present(const EvpPkey *pk) { + const XWING_KEY *key = reinterpret_cast(pk->pkey); + return key->has_private; +} + +static int xwing_set_priv_seed(EvpPkey *pkey, const uint8_t *in, size_t len) { + auto key = MakeUnique(); + if (key == nullptr) { + return 0; + } + CBS cbs; + CBS_init(&cbs, in, len); + if (!XWING_parse_private_key(&key->priv, &cbs) || CBS_len(&cbs) != 0 || + !XWING_public_from_private(key->pub, &key->priv)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return 0; + } + key->has_private = true; + evp_pkey_set0(pkey, &xwing_asn1_meth, key.release()); + return 1; +} + +static int xwing_get_priv_seed(const EvpPkey *pkey, uint8_t *out, + size_t *out_len) { + if (out == nullptr) { + *out_len = 
XWING_PRIVATE_KEY_BYTES; + return 1; + } + const XWING_KEY *key = reinterpret_cast(pkey->pkey); + if (key == nullptr || !key->has_private) { + OPENSSL_PUT_ERROR(EVP, EVP_R_NOT_A_PRIVATE_KEY); + return 0; + } + if (*out_len < XWING_PRIVATE_KEY_BYTES) { + OPENSSL_PUT_ERROR(EVP, EVP_R_BUFFER_TOO_SMALL); + return 0; + } + CBB cbb; + CBB_init_fixed(&cbb, out, XWING_PRIVATE_KEY_BYTES); + if (!XWING_marshal_private_key(&cbb, &key->priv)) { + return 0; + } + *out_len = CBB_len(&cbb); + assert(*out_len == XWING_PRIVATE_KEY_BYTES); + return 1; +} + +static int xwing_set_pub_raw(EvpPkey *pkey, const uint8_t *in, size_t len) { + if (len != XWING_PUBLIC_KEY_BYTES) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return 0; + } + auto key = MakeUnique(); + if (key == nullptr) { + return 0; + } + OPENSSL_memcpy(key->pub, in, len); + key->has_private = false; + evp_pkey_set0(pkey, &xwing_asn1_meth, key.release()); + return 1; +} + +static int xwing_get_pub_raw(const EvpPkey *pkey, uint8_t *out, + size_t *out_len) { + if (out == nullptr) { + *out_len = XWING_PUBLIC_KEY_BYTES; + return 1; + } + if (*out_len < XWING_PUBLIC_KEY_BYTES) { + OPENSSL_PUT_ERROR(EVP, EVP_R_BUFFER_TOO_SMALL); + return 0; + } + const XWING_KEY *key = reinterpret_cast(pkey->pkey); + OPENSSL_memcpy(out, key->pub, XWING_PUBLIC_KEY_BYTES); + *out_len = XWING_PUBLIC_KEY_BYTES; + return 1; +} + +static int xwing_size(const EvpPkey *pkey) { return XWING_CIPHERTEXT_BYTES; } + +static int xwing_bits(const EvpPkey *pkey) { + return XWING_PUBLIC_KEY_BYTES * 8; +} + +// X-Wing has no parameters to copy. 
+static int pkey_xwing_copy_ctx(EvpPkeyCtx *dst, EvpPkeyCtx *src) { return 1; } + +static int pkey_xwing_keygen(EvpPkeyCtx *ctx, EvpPkey *pkey) { + auto key = MakeUnique(); + if (key == nullptr || !XWING_generate_key(key->pub, &key->priv)) { + OPENSSL_PUT_ERROR(EVP, ERR_R_INTERNAL_ERROR); + return 0; + } + key->has_private = true; + evp_pkey_set0(pkey, &xwing_asn1_meth, key.release()); + return 1; +} + +static int xwing_kem_encap(uint8_t *out_ciphertext, size_t ciphertext_len, + uint8_t *out_secret, size_t secret_len, + const EVP_PKEY *peer_key) { + if (ciphertext_len != XWING_CIPHERTEXT_BYTES) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_CIPHERTEXT_LENGTH); + return 0; + } + if (secret_len != XWING_SHARED_SECRET_BYTES) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_SECRET_LENGTH); + return 0; + } + const XWING_KEY *peer_pubkey = + reinterpret_cast(FromOpaque(peer_key)->pkey); + return XWING_encap(out_ciphertext, out_secret, peer_pubkey->pub); +} + +static int xwing_kem_decap(uint8_t *out_secret, size_t secret_len, + const uint8_t *ciphertext, size_t ciphertext_len, + const EVP_PKEY *key) { + const XWING_KEY *priv = reinterpret_cast(FromOpaque(key)->pkey); + if (priv == nullptr || !priv->has_private) { + OPENSSL_PUT_ERROR(EVP, EVP_R_NOT_A_PRIVATE_KEY); + return 0; + } + if (secret_len != XWING_SHARED_SECRET_BYTES) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_SECRET_LENGTH); + return 0; + } + // XWING_decap does not accept wrong ciphertext lengths, so we must check for + // the proper length here. For consistency, we don't add an error to the error + // queue when a KEM decap fails due to incorrect ciphertext length. 
+ if (ciphertext_len != XWING_CIPHERTEXT_BYTES) { + return 0; + } + return XWING_decap(out_secret, ciphertext, &priv->priv); +} + +static const EVP_KEM xwing_evp_kem = { + EVP_PKEY_XWING, // + XWING_CIPHERTEXT_BYTES, // + XWING_SHARED_SECRET_BYTES, // + &xwing_kem_encap, // + &xwing_kem_decap, // +}; + +const EVP_PKEY_CTX_METHOD xwing_pkey_meth = { + /*pkey_id=*/EVP_PKEY_XWING, + /*init=*/nullptr, + &pkey_xwing_copy_ctx, + /*cleanup=*/nullptr, + &pkey_xwing_keygen, + /*sign=*/nullptr, + /*sign_message=*/nullptr, + /*verify=*/nullptr, + /*verify_message=*/nullptr, + /*verify_recover=*/nullptr, + /*encrypt=*/nullptr, + /*decrypt=*/nullptr, + /*derive=*/nullptr, + /*paramgen=*/nullptr, + &KemAdapter::EncapMethod, + &KemAdapter::DecapMethod, + /*ctrl=*/nullptr, +}; + +const EVP_PKEY_ASN1_METHOD xwing_asn1_meth = { + EVP_PKEY_XWING, + /*oid=*/{}, + /*oid_len=*/0, + &xwing_pkey_meth, + + /*pub_decode=*/nullptr, + /*pub_encode=*/nullptr, + &xwing_pub_equal, + &xwing_pub_present, + &xwing_pub_copy, + + /*priv_decode=*/nullptr, + /*priv_encode=*/nullptr, + &xwing_priv_present, + + /*set_priv_raw=*/nullptr, + &xwing_set_priv_seed, + &xwing_set_pub_raw, + /*get_priv_raw=*/nullptr, + &xwing_get_priv_seed, + &xwing_get_pub_raw, + + /*set1_tls_encodedpoint=*/nullptr, + /*get1_tls_encodedpoint=*/nullptr, + /*pkey_opaque=*/nullptr, + + &xwing_size, + &xwing_bits, + + /*param_missing=*/nullptr, + /*param_copy=*/nullptr, + /*param_equal=*/nullptr, + + /*pkey_free=*/&xwing_free, +}; + +} // namespace + +const EVP_PKEY_ALG *EVP_pkey_xwing() { + static const EVP_PKEY_ALG kAlg = {&xwing_asn1_meth, &xwing_pkey_meth}; + return &kAlg; +} + +const EVP_KEM *EVP_kem_xwing() { return &xwing_evp_kem; } diff --git a/third_party/boringssl/src/crypto/evp/pbkdf.c b/third_party/boringssl/src/crypto/evp/pbkdf.c deleted file mode 100644 index f23a74bd..00000000 --- a/third_party/boringssl/src/crypto/evp/pbkdf.c +++ /dev/null @@ -1,146 +0,0 @@ -/* Written by Dr Stephen N Henson (steve@openssl.org) for 
the OpenSSL - * project 1999. - */ -/* ==================================================================== - * Copyright (c) 1999 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * licensing@OpenSSL.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). */ - -#include - -#include - -#include - -#include "../internal.h" - - -int PKCS5_PBKDF2_HMAC(const char *password, size_t password_len, - const uint8_t *salt, size_t salt_len, unsigned iterations, - const EVP_MD *digest, size_t key_len, uint8_t *out_key) { - // See RFC 8018, section 5.2. - int ret = 0; - size_t md_len = EVP_MD_size(digest); - uint32_t i = 1; - HMAC_CTX hctx; - HMAC_CTX_init(&hctx); - - if (!HMAC_Init_ex(&hctx, password, password_len, digest, NULL)) { - goto err; - } - - while (key_len > 0) { - size_t todo = md_len; - if (todo > key_len) { - todo = key_len; - } - - uint8_t i_buf[4]; - i_buf[0] = (uint8_t)((i >> 24) & 0xff); - i_buf[1] = (uint8_t)((i >> 16) & 0xff); - i_buf[2] = (uint8_t)((i >> 8) & 0xff); - i_buf[3] = (uint8_t)(i & 0xff); - - // Compute U_1. - uint8_t digest_tmp[EVP_MAX_MD_SIZE]; - if (!HMAC_Init_ex(&hctx, NULL, 0, NULL, NULL) || - !HMAC_Update(&hctx, salt, salt_len) || - !HMAC_Update(&hctx, i_buf, 4) || - !HMAC_Final(&hctx, digest_tmp, NULL)) { - goto err; - } - - OPENSSL_memcpy(out_key, digest_tmp, todo); - for (unsigned j = 1; j < iterations; j++) { - // Compute the remaining U_* values and XOR. 
- if (!HMAC_Init_ex(&hctx, NULL, 0, NULL, NULL) || - !HMAC_Update(&hctx, digest_tmp, md_len) || - !HMAC_Final(&hctx, digest_tmp, NULL)) { - goto err; - } - for (size_t k = 0; k < todo; k++) { - out_key[k] ^= digest_tmp[k]; - } - } - - key_len -= todo; - out_key += todo; - i++; - } - - // RFC 8018 describes iterations (c) as being a "positive integer", so a - // value of 0 is an error. - // - // Unfortunately not all consumers of PKCS5_PBKDF2_HMAC() check their return - // value, expecting it to succeed and unconditionally using |out_key|. As a - // precaution for such callsites in external code, the old behavior of - // iterations < 1 being treated as iterations == 1 is preserved, but - // additionally an error result is returned. - // - // TODO(eroman): Figure out how to remove this compatibility hack, or change - // the default to something more sensible like 2048. - if (iterations == 0) { - goto err; - } - - ret = 1; - -err: - HMAC_CTX_cleanup(&hctx); - return ret; -} - -int PKCS5_PBKDF2_HMAC_SHA1(const char *password, size_t password_len, - const uint8_t *salt, size_t salt_len, - unsigned iterations, size_t key_len, - uint8_t *out_key) { - return PKCS5_PBKDF2_HMAC(password, password_len, salt, salt_len, iterations, - EVP_sha1(), key_len, out_key); -} diff --git a/third_party/boringssl/src/crypto/evp/pbkdf.cc b/third_party/boringssl/src/crypto/evp/pbkdf.cc new file mode 100644 index 00000000..6388876e --- /dev/null +++ b/third_party/boringssl/src/crypto/evp/pbkdf.cc @@ -0,0 +1,100 @@ +// Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include + +#include "../internal.h" + + +using namespace bssl; + +int PKCS5_PBKDF2_HMAC(const char *password, size_t password_len, + const uint8_t *salt, size_t salt_len, uint32_t iterations, + const EVP_MD *digest, size_t key_len, uint8_t *out_key) { + // See RFC 8018, section 5.2. + ScopedHMAC_CTX hctx; + if (!HMAC_Init_ex(hctx.get(), password, password_len, digest, nullptr)) { + return 0; + } + + uint32_t i = 1; + size_t md_len = EVP_MD_size(digest); + while (key_len > 0) { + size_t todo = md_len; + if (todo > key_len) { + todo = key_len; + } + + uint8_t i_buf[4]; + i_buf[0] = (uint8_t)((i >> 24) & 0xff); + i_buf[1] = (uint8_t)((i >> 16) & 0xff); + i_buf[2] = (uint8_t)((i >> 8) & 0xff); + i_buf[3] = (uint8_t)(i & 0xff); + + // Compute U_1. + uint8_t digest_tmp[EVP_MAX_MD_SIZE]; + if (!HMAC_Init_ex(hctx.get(), nullptr, 0, nullptr, nullptr) || + !HMAC_Update(hctx.get(), salt, salt_len) || + !HMAC_Update(hctx.get(), i_buf, 4) || + !HMAC_Final(hctx.get(), digest_tmp, nullptr)) { + return 0; + } + + OPENSSL_memcpy(out_key, digest_tmp, todo); + for (uint32_t j = 1; j < iterations; j++) { + // Compute the remaining U_* values and XOR. 
+ if (!HMAC_Init_ex(hctx.get(), nullptr, 0, nullptr, nullptr) || + !HMAC_Update(hctx.get(), digest_tmp, md_len) || + !HMAC_Final(hctx.get(), digest_tmp, nullptr)) { + return 0; + } + for (size_t k = 0; k < todo; k++) { + out_key[k] ^= digest_tmp[k]; + } + } + + key_len -= todo; + out_key += todo; + i++; + } + + // RFC 8018 describes iterations (c) as being a "positive integer", so a + // value of 0 is an error. + // + // Unfortunately not all consumers of PKCS5_PBKDF2_HMAC() check their return + // value, expecting it to succeed and unconditionally using |out_key|. As a + // precaution for such callsites in external code, the old behavior of + // iterations < 1 being treated as iterations == 1 is preserved, but + // additionally an error result is returned. + // + // TODO(eroman): Figure out how to remove this compatibility hack, or change + // the default to something more sensible like 2048. + if (iterations == 0) { + return 0; + } + + return 1; +} + +int PKCS5_PBKDF2_HMAC_SHA1(const char *password, size_t password_len, + const uint8_t *salt, size_t salt_len, + uint32_t iterations, size_t key_len, + uint8_t *out_key) { + return PKCS5_PBKDF2_HMAC(password, password_len, salt, salt_len, iterations, + EVP_sha1(), key_len, out_key); +} diff --git a/third_party/boringssl/src/crypto/evp/print.c b/third_party/boringssl/src/crypto/evp/print.c deleted file mode 100644 index 0f4b65ea..00000000 --- a/third_party/boringssl/src/crypto/evp/print.c +++ /dev/null @@ -1,489 +0,0 @@ -/* ==================================================================== - * Copyright (c) 2006 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * licensing@OpenSSL.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. 
- * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). */ - -#include - -#include -#include -#include -#include -#include -#include -#include - -#include "../internal.h" -#include "../fipsmodule/rsa/internal.h" - - -static int bn_print(BIO *bp, const char *number, const BIGNUM *num, - uint8_t *buf, int off) { - if (num == NULL) { - return 1; - } - - if (!BIO_indent(bp, off, 128)) { - return 0; - } - if (BN_is_zero(num)) { - if (BIO_printf(bp, "%s 0\n", number) <= 0) { - return 0; - } - return 1; - } - - if (BN_num_bytes(num) <= sizeof(long)) { - const char *neg = BN_is_negative(num) ? "-" : ""; - if (BIO_printf(bp, "%s %s%lu (%s0x%lx)\n", number, neg, - (unsigned long)num->d[0], neg, - (unsigned long)num->d[0]) <= 0) { - return 0; - } - } else { - buf[0] = 0; - if (BIO_printf(bp, "%s%s", number, - (BN_is_negative(num)) ? " (Negative)" : "") <= 0) { - return 0; - } - int n = BN_bn2bin(num, &buf[1]); - - if (buf[1] & 0x80) { - n++; - } else { - buf++; - } - - int i; - for (i = 0; i < n; i++) { - if ((i % 15) == 0) { - if (BIO_puts(bp, "\n") <= 0 || - !BIO_indent(bp, off + 4, 128)) { - return 0; - } - } - if (BIO_printf(bp, "%02x%s", buf[i], ((i + 1) == n) ? "" : ":") <= 0) { - return 0; - } - } - if (BIO_write(bp, "\n", 1) <= 0) { - return 0; - } - } - return 1; -} - -static void update_buflen(const BIGNUM *b, size_t *pbuflen) { - if (!b) { - return; - } - - size_t len = BN_num_bytes(b); - if (*pbuflen < len) { - *pbuflen = len; - } -} - -// RSA keys. 
- -static int do_rsa_print(BIO *out, const RSA *rsa, int off, - int include_private) { - const char *s, *str; - uint8_t *m = NULL; - int ret = 0, mod_len = 0; - size_t buf_len = 0; - - update_buflen(rsa->n, &buf_len); - update_buflen(rsa->e, &buf_len); - - if (include_private) { - update_buflen(rsa->d, &buf_len); - update_buflen(rsa->p, &buf_len); - update_buflen(rsa->q, &buf_len); - update_buflen(rsa->dmp1, &buf_len); - update_buflen(rsa->dmq1, &buf_len); - update_buflen(rsa->iqmp, &buf_len); - } - - m = (uint8_t *)OPENSSL_malloc(buf_len + 10); - if (m == NULL) { - OPENSSL_PUT_ERROR(EVP, ERR_R_MALLOC_FAILURE); - goto err; - } - - if (rsa->n != NULL) { - mod_len = BN_num_bits(rsa->n); - } - - if (!BIO_indent(out, off, 128)) { - goto err; - } - - if (include_private && rsa->d) { - if (BIO_printf(out, "Private-Key: (%d bit)\n", mod_len) <= 0) { - goto err; - } - str = "modulus:"; - s = "publicExponent:"; - } else { - if (BIO_printf(out, "Public-Key: (%d bit)\n", mod_len) <= 0) { - goto err; - } - str = "Modulus:"; - s = "Exponent:"; - } - if (!bn_print(out, str, rsa->n, m, off) || - !bn_print(out, s, rsa->e, m, off)) { - goto err; - } - - if (include_private) { - if (!bn_print(out, "privateExponent:", rsa->d, m, off) || - !bn_print(out, "prime1:", rsa->p, m, off) || - !bn_print(out, "prime2:", rsa->q, m, off) || - !bn_print(out, "exponent1:", rsa->dmp1, m, off) || - !bn_print(out, "exponent2:", rsa->dmq1, m, off) || - !bn_print(out, "coefficient:", rsa->iqmp, m, off)) { - goto err; - } - } - ret = 1; - -err: - OPENSSL_free(m); - return ret; -} - -static int rsa_pub_print(BIO *bp, const EVP_PKEY *pkey, int indent, - ASN1_PCTX *ctx) { - return do_rsa_print(bp, pkey->pkey.rsa, indent, 0); -} - -static int rsa_priv_print(BIO *bp, const EVP_PKEY *pkey, int indent, - ASN1_PCTX *ctx) { - return do_rsa_print(bp, pkey->pkey.rsa, indent, 1); -} - - -// DSA keys. 
- -static int do_dsa_print(BIO *bp, const DSA *x, int off, int ptype) { - uint8_t *m = NULL; - int ret = 0; - size_t buf_len = 0; - const char *ktype = NULL; - - const BIGNUM *priv_key, *pub_key; - - priv_key = NULL; - if (ptype == 2) { - priv_key = x->priv_key; - } - - pub_key = NULL; - if (ptype > 0) { - pub_key = x->pub_key; - } - - ktype = "DSA-Parameters"; - if (ptype == 2) { - ktype = "Private-Key"; - } else if (ptype == 1) { - ktype = "Public-Key"; - } - - update_buflen(x->p, &buf_len); - update_buflen(x->q, &buf_len); - update_buflen(x->g, &buf_len); - update_buflen(priv_key, &buf_len); - update_buflen(pub_key, &buf_len); - - m = (uint8_t *)OPENSSL_malloc(buf_len + 10); - if (m == NULL) { - OPENSSL_PUT_ERROR(EVP, ERR_R_MALLOC_FAILURE); - goto err; - } - - if (priv_key) { - if (!BIO_indent(bp, off, 128) || - BIO_printf(bp, "%s: (%u bit)\n", ktype, BN_num_bits(x->p)) <= 0) { - goto err; - } - } - - if (!bn_print(bp, "priv:", priv_key, m, off) || - !bn_print(bp, "pub: ", pub_key, m, off) || - !bn_print(bp, "P: ", x->p, m, off) || - !bn_print(bp, "Q: ", x->q, m, off) || - !bn_print(bp, "G: ", x->g, m, off)) { - goto err; - } - ret = 1; - -err: - OPENSSL_free(m); - return ret; -} - -static int dsa_param_print(BIO *bp, const EVP_PKEY *pkey, int indent, - ASN1_PCTX *ctx) { - return do_dsa_print(bp, pkey->pkey.dsa, indent, 0); -} - -static int dsa_pub_print(BIO *bp, const EVP_PKEY *pkey, int indent, - ASN1_PCTX *ctx) { - return do_dsa_print(bp, pkey->pkey.dsa, indent, 1); -} - -static int dsa_priv_print(BIO *bp, const EVP_PKEY *pkey, int indent, - ASN1_PCTX *ctx) { - return do_dsa_print(bp, pkey->pkey.dsa, indent, 2); -} - - -// EC keys. 
- -static int do_EC_KEY_print(BIO *bp, const EC_KEY *x, int off, int ktype) { - uint8_t *buffer = NULL; - const char *ecstr; - size_t buf_len = 0, i; - int ret = 0, reason = ERR_R_BIO_LIB; - BIGNUM *order = NULL; - BN_CTX *ctx = NULL; - const EC_GROUP *group; - const EC_POINT *public_key; - const BIGNUM *priv_key; - uint8_t *pub_key_bytes = NULL; - size_t pub_key_bytes_len = 0; - - if (x == NULL || (group = EC_KEY_get0_group(x)) == NULL) { - reason = ERR_R_PASSED_NULL_PARAMETER; - goto err; - } - - ctx = BN_CTX_new(); - if (ctx == NULL) { - reason = ERR_R_MALLOC_FAILURE; - goto err; - } - - if (ktype > 0) { - public_key = EC_KEY_get0_public_key(x); - if (public_key != NULL) { - pub_key_bytes_len = EC_POINT_point2oct( - group, public_key, EC_KEY_get_conv_form(x), NULL, 0, ctx); - if (pub_key_bytes_len == 0) { - reason = ERR_R_MALLOC_FAILURE; - goto err; - } - pub_key_bytes = OPENSSL_malloc(pub_key_bytes_len); - if (pub_key_bytes == NULL) { - reason = ERR_R_MALLOC_FAILURE; - goto err; - } - pub_key_bytes_len = - EC_POINT_point2oct(group, public_key, EC_KEY_get_conv_form(x), - pub_key_bytes, pub_key_bytes_len, ctx); - if (pub_key_bytes_len == 0) { - reason = ERR_R_MALLOC_FAILURE; - goto err; - } - buf_len = pub_key_bytes_len; - } - } - - if (ktype == 2) { - priv_key = EC_KEY_get0_private_key(x); - if (priv_key && (i = (size_t)BN_num_bytes(priv_key)) > buf_len) { - buf_len = i; - } - } else { - priv_key = NULL; - } - - if (ktype > 0) { - buf_len += 10; - if ((buffer = OPENSSL_malloc(buf_len)) == NULL) { - reason = ERR_R_MALLOC_FAILURE; - goto err; - } - } - if (ktype == 2) { - ecstr = "Private-Key"; - } else if (ktype == 1) { - ecstr = "Public-Key"; - } else { - ecstr = "ECDSA-Parameters"; - } - - if (!BIO_indent(bp, off, 128)) { - goto err; - } - order = BN_new(); - if (order == NULL || !EC_GROUP_get_order(group, order, NULL) || - BIO_printf(bp, "%s: (%u bit)\n", ecstr, BN_num_bits(order)) <= 0) { - goto err; - } - - if ((priv_key != NULL) && - !bn_print(bp, "priv:", 
priv_key, buffer, off)) { - goto err; - } - if (pub_key_bytes != NULL) { - BIO_hexdump(bp, pub_key_bytes, pub_key_bytes_len, off); - } - // TODO(fork): implement - /* - if (!ECPKParameters_print(bp, group, off)) - goto err; */ - ret = 1; - -err: - if (!ret) { - OPENSSL_PUT_ERROR(EVP, reason); - } - OPENSSL_free(pub_key_bytes); - BN_free(order); - BN_CTX_free(ctx); - OPENSSL_free(buffer); - return ret; -} - -static int eckey_param_print(BIO *bp, const EVP_PKEY *pkey, int indent, - ASN1_PCTX *ctx) { - return do_EC_KEY_print(bp, pkey->pkey.ec, indent, 0); -} - -static int eckey_pub_print(BIO *bp, const EVP_PKEY *pkey, int indent, - ASN1_PCTX *ctx) { - return do_EC_KEY_print(bp, pkey->pkey.ec, indent, 1); -} - - -static int eckey_priv_print(BIO *bp, const EVP_PKEY *pkey, int indent, - ASN1_PCTX *ctx) { - return do_EC_KEY_print(bp, pkey->pkey.ec, indent, 2); -} - - -typedef struct { - int type; - int (*pub_print)(BIO *out, const EVP_PKEY *pkey, int indent, ASN1_PCTX *pctx); - int (*priv_print)(BIO *out, const EVP_PKEY *pkey, int indent, - ASN1_PCTX *pctx); - int (*param_print)(BIO *out, const EVP_PKEY *pkey, int indent, - ASN1_PCTX *pctx); -} EVP_PKEY_PRINT_METHOD; - -static EVP_PKEY_PRINT_METHOD kPrintMethods[] = { - { - EVP_PKEY_RSA, - rsa_pub_print, - rsa_priv_print, - NULL /* param_print */, - }, - { - EVP_PKEY_DSA, - dsa_pub_print, - dsa_priv_print, - dsa_param_print, - }, - { - EVP_PKEY_EC, - eckey_pub_print, - eckey_priv_print, - eckey_param_print, - }, -}; - -static size_t kPrintMethodsLen = OPENSSL_ARRAY_SIZE(kPrintMethods); - -static EVP_PKEY_PRINT_METHOD *find_method(int type) { - for (size_t i = 0; i < kPrintMethodsLen; i++) { - if (kPrintMethods[i].type == type) { - return &kPrintMethods[i]; - } - } - return NULL; -} - -static int print_unsupported(BIO *out, const EVP_PKEY *pkey, int indent, - const char *kstr) { - BIO_indent(out, indent, 128); - BIO_printf(out, "%s algorithm unsupported\n", kstr); - return 1; -} - -int EVP_PKEY_print_public(BIO *out, const 
EVP_PKEY *pkey, int indent, - ASN1_PCTX *pctx) { - EVP_PKEY_PRINT_METHOD *method = find_method(pkey->type); - if (method != NULL && method->pub_print != NULL) { - return method->pub_print(out, pkey, indent, pctx); - } - return print_unsupported(out, pkey, indent, "Public Key"); -} - -int EVP_PKEY_print_private(BIO *out, const EVP_PKEY *pkey, int indent, - ASN1_PCTX *pctx) { - EVP_PKEY_PRINT_METHOD *method = find_method(pkey->type); - if (method != NULL && method->priv_print != NULL) { - return method->priv_print(out, pkey, indent, pctx); - } - return print_unsupported(out, pkey, indent, "Private Key"); -} - -int EVP_PKEY_print_params(BIO *out, const EVP_PKEY *pkey, int indent, - ASN1_PCTX *pctx) { - EVP_PKEY_PRINT_METHOD *method = find_method(pkey->type); - if (method != NULL && method->param_print != NULL) { - return method->param_print(out, pkey, indent, pctx); - } - return print_unsupported(out, pkey, indent, "Parameters"); -} diff --git a/third_party/boringssl/src/crypto/evp/print.cc b/third_party/boringssl/src/crypto/evp/print.cc new file mode 100644 index 00000000..2dce12fe --- /dev/null +++ b/third_party/boringssl/src/crypto/evp/print.cc @@ -0,0 +1,287 @@ +// Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "../internal.h" + + +using namespace bssl; + +static int print_hex(BIO *bp, const uint8_t *data, size_t len, int off) { + for (size_t i = 0; i < len; i++) { + if ((i % 15) == 0) { + if (BIO_puts(bp, "\n") <= 0 || // + !BIO_indent(bp, off + 4, 128)) { + return 0; + } + } + if (BIO_printf(bp, "%02x%s", data[i], (i + 1 == len) ? "" : ":") <= 0) { + return 0; + } + } + if (BIO_write(bp, "\n", 1) <= 0) { + return 0; + } + return 1; +} + +static int bn_print(BIO *bp, const char *name, const BIGNUM *num, int off) { + if (num == nullptr) { + return 1; + } + + if (!BIO_indent(bp, off, 128)) { + return 0; + } + if (BN_is_zero(num)) { + if (BIO_printf(bp, "%s 0\n", name) <= 0) { + return 0; + } + return 1; + } + + uint64_t u64; + if (BN_get_u64(num, &u64)) { + const char *neg = BN_is_negative(num) ? "-" : ""; + return BIO_printf(bp, "%s %s%" PRIu64 " (%s0x%" PRIx64 ")\n", name, neg, + u64, neg, u64) > 0; + } + + if (BIO_printf(bp, "%s%s", name, + (BN_is_negative(num)) ? " (Negative)" : "") <= 0) { + return 0; + } + + // Print |num| in hex, adding a leading zero, as in ASN.1, if the high bit + // is set. + // + // TODO(davidben): Do we need to do this? We already print "(Negative)" above + // and negative values are never valid in keys anyway. + size_t len = BN_num_bytes(num); + uint8_t *buf = reinterpret_cast(OPENSSL_malloc(len + 1)); + if (buf == nullptr) { + return 0; + } + + buf[0] = 0; + BN_bn2bin(num, buf + 1); + int ret; + if (len > 0 && (buf[1] & 0x80) != 0) { + // Print the whole buffer. + ret = print_hex(bp, buf, len + 1, off); + } else { + // Skip the leading zero. + ret = print_hex(bp, buf + 1, len, off); + } + OPENSSL_free(buf); + return ret; +} + +// RSA keys. 
+ +static int do_rsa_print(BIO *out, const RSA *rsa, int off, + int include_private) { + int mod_len = 0; + if (RSA_get0_n(rsa) != nullptr) { + mod_len = RSA_bits(rsa); + } + + if (!BIO_indent(out, off, 128)) { + return 0; + } + + const char *s, *str; + if (include_private && RSA_get0_d(rsa) != nullptr) { + if (BIO_printf(out, "Private-Key: (%d bit)\n", mod_len) <= 0) { + return 0; + } + str = "modulus:"; + s = "publicExponent:"; + } else { + if (BIO_printf(out, "Public-Key: (%d bit)\n", mod_len) <= 0) { + return 0; + } + str = "Modulus:"; + s = "Exponent:"; + } + if (!bn_print(out, str, RSA_get0_n(rsa), off) || + !bn_print(out, s, RSA_get0_e(rsa), off)) { + return 0; + } + + if (include_private) { + if (!bn_print(out, "privateExponent:", RSA_get0_d(rsa), off) || + !bn_print(out, "prime1:", RSA_get0_p(rsa), off) || + !bn_print(out, "prime2:", RSA_get0_q(rsa), off) || + !bn_print(out, "exponent1:", RSA_get0_dmp1(rsa), off) || + !bn_print(out, "exponent2:", RSA_get0_dmq1(rsa), off) || + !bn_print(out, "coefficient:", RSA_get0_iqmp(rsa), off)) { + return 0; + } + } + + return 1; +} + +static int rsa_pub_print(BIO *bp, const EVP_PKEY *pkey, int indent) { + return do_rsa_print(bp, EVP_PKEY_get0_RSA(pkey), indent, 0); +} + +static int rsa_priv_print(BIO *bp, const EVP_PKEY *pkey, int indent) { + return do_rsa_print(bp, EVP_PKEY_get0_RSA(pkey), indent, 1); +} + + +// EC keys. + +static int do_EC_KEY_print(BIO *bp, const EC_KEY *x, int off, int ktype) { + const EC_GROUP *group; + if (x == nullptr || (group = EC_KEY_get0_group(x)) == nullptr) { + OPENSSL_PUT_ERROR(EVP, ERR_R_PASSED_NULL_PARAMETER); + return 0; + } + + const char *ecstr; + if (ktype == 2) { + ecstr = "Private-Key"; + } else if (ktype == 1) { + ecstr = "Public-Key"; + } else { + ecstr = "ECDSA-Parameters"; + } + + if (!BIO_indent(bp, off, 128)) { + return 0; + } + int curve_name = EC_GROUP_get_curve_name(group); + if (BIO_printf(bp, "%s: (%s)\n", ecstr, + curve_name == NID_undef + ? 
"unknown curve" + : EC_curve_nid2nist(curve_name)) <= 0) { + return 0; + } + + if (ktype == 2) { + const BIGNUM *priv_key = EC_KEY_get0_private_key(x); + if (priv_key != nullptr && // + !bn_print(bp, "priv:", priv_key, off)) { + return 0; + } + } + + if (ktype > 0 && EC_KEY_get0_public_key(x) != nullptr) { + uint8_t *pub = nullptr; + size_t pub_len = EC_KEY_key2buf(x, EC_KEY_get_conv_form(x), &pub, nullptr); + if (pub_len == 0) { + return 0; + } + int ret = BIO_indent(bp, off, 128) && // + BIO_puts(bp, "pub:") > 0 && // + print_hex(bp, pub, pub_len, off); + OPENSSL_free(pub); + if (!ret) { + return 0; + } + } + + return 1; +} + +static int eckey_param_print(BIO *bp, const EVP_PKEY *pkey, int indent) { + return do_EC_KEY_print(bp, EVP_PKEY_get0_EC_KEY(pkey), indent, 0); +} + +static int eckey_pub_print(BIO *bp, const EVP_PKEY *pkey, int indent) { + return do_EC_KEY_print(bp, EVP_PKEY_get0_EC_KEY(pkey), indent, 1); +} + + +static int eckey_priv_print(BIO *bp, const EVP_PKEY *pkey, int indent) { + return do_EC_KEY_print(bp, EVP_PKEY_get0_EC_KEY(pkey), indent, 2); +} + + +typedef struct { + int type; + int (*pub_print)(BIO *out, const EVP_PKEY *pkey, int indent); + int (*priv_print)(BIO *out, const EVP_PKEY *pkey, int indent); + int (*param_print)(BIO *out, const EVP_PKEY *pkey, int indent); +} EVP_PKEY_PRINT_METHOD; + +static const EVP_PKEY_PRINT_METHOD kPrintMethods[] = { + { + EVP_PKEY_RSA, + rsa_pub_print, + rsa_priv_print, + /*param_print=*/nullptr, + }, + { + EVP_PKEY_EC, + eckey_pub_print, + eckey_priv_print, + eckey_param_print, + }, +}; + +static const EVP_PKEY_PRINT_METHOD *find_method(int type) { + for (const auto &p : kPrintMethods) { + if (p.type == type) { + return &p; + } + } + return nullptr; +} + +static int print_unsupported(BIO *out, const EVP_PKEY *pkey, int indent, + const char *kstr) { + BIO_indent(out, indent, 128); + BIO_printf(out, "%s algorithm unsupported\n", kstr); + return 1; +} + +int EVP_PKEY_print_public(BIO *out, const EVP_PKEY *pkey, 
int indent, + ASN1_PCTX *pctx) { + const EVP_PKEY_PRINT_METHOD *method = find_method(EVP_PKEY_id(pkey)); + if (method != nullptr && method->pub_print != nullptr) { + return method->pub_print(out, pkey, indent); + } + return print_unsupported(out, pkey, indent, "Public Key"); +} + +int EVP_PKEY_print_private(BIO *out, const EVP_PKEY *pkey, int indent, + ASN1_PCTX *pctx) { + const EVP_PKEY_PRINT_METHOD *method = find_method(EVP_PKEY_id(pkey)); + if (method != nullptr && method->priv_print != nullptr) { + return method->priv_print(out, pkey, indent); + } + return print_unsupported(out, pkey, indent, "Private Key"); +} + +int EVP_PKEY_print_params(BIO *out, const EVP_PKEY *pkey, int indent, + ASN1_PCTX *pctx) { + const EVP_PKEY_PRINT_METHOD *method = find_method(EVP_PKEY_id(pkey)); + if (method != nullptr && method->param_print != nullptr) { + return method->param_print(out, pkey, indent); + } + return print_unsupported(out, pkey, indent, "Parameters"); +} diff --git a/third_party/boringssl/src/crypto/evp/scrypt.c b/third_party/boringssl/src/crypto/evp/scrypt.c deleted file mode 100644 index 14a5e02c..00000000 --- a/third_party/boringssl/src/crypto/evp/scrypt.c +++ /dev/null @@ -1,210 +0,0 @@ -/* - * Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved. - * - * Licensed under the OpenSSL license (the "License"). You may not use - * this file except in compliance with the License. You can obtain a copy - * in the file LICENSE in the source distribution or at - * https://www.openssl.org/source/license.html - */ - -#include - -#include - -#include -#include - -#include "../internal.h" - - -// This file implements scrypt, described in RFC 7914. -// -// Note scrypt refers to both "blocks" and a "block size" parameter, r. These -// are two different notions of blocks. A Salsa20 block is 64 bytes long, -// represented in this implementation by 16 |uint32_t|s. 
|r| determines the -// number of 64-byte Salsa20 blocks in a scryptBlockMix block, which is 2 * |r| -// Salsa20 blocks. This implementation refers to them as Salsa20 blocks and -// scrypt blocks, respectively. - -// A block_t is a Salsa20 block. -typedef struct { uint32_t words[16]; } block_t; - -static_assert(sizeof(block_t) == 64, "block_t has padding"); - -// salsa208_word_specification implements the Salsa20/8 core function, also -// described in RFC 7914, section 3. It modifies the block at |inout| -// in-place. -static void salsa208_word_specification(block_t *inout) { - block_t x; - OPENSSL_memcpy(&x, inout, sizeof(x)); - - for (int i = 8; i > 0; i -= 2) { - x.words[4] ^= CRYPTO_rotl_u32(x.words[0] + x.words[12], 7); - x.words[8] ^= CRYPTO_rotl_u32(x.words[4] + x.words[0], 9); - x.words[12] ^= CRYPTO_rotl_u32(x.words[8] + x.words[4], 13); - x.words[0] ^= CRYPTO_rotl_u32(x.words[12] + x.words[8], 18); - x.words[9] ^= CRYPTO_rotl_u32(x.words[5] + x.words[1], 7); - x.words[13] ^= CRYPTO_rotl_u32(x.words[9] + x.words[5], 9); - x.words[1] ^= CRYPTO_rotl_u32(x.words[13] + x.words[9], 13); - x.words[5] ^= CRYPTO_rotl_u32(x.words[1] + x.words[13], 18); - x.words[14] ^= CRYPTO_rotl_u32(x.words[10] + x.words[6], 7); - x.words[2] ^= CRYPTO_rotl_u32(x.words[14] + x.words[10], 9); - x.words[6] ^= CRYPTO_rotl_u32(x.words[2] + x.words[14], 13); - x.words[10] ^= CRYPTO_rotl_u32(x.words[6] + x.words[2], 18); - x.words[3] ^= CRYPTO_rotl_u32(x.words[15] + x.words[11], 7); - x.words[7] ^= CRYPTO_rotl_u32(x.words[3] + x.words[15], 9); - x.words[11] ^= CRYPTO_rotl_u32(x.words[7] + x.words[3], 13); - x.words[15] ^= CRYPTO_rotl_u32(x.words[11] + x.words[7], 18); - x.words[1] ^= CRYPTO_rotl_u32(x.words[0] + x.words[3], 7); - x.words[2] ^= CRYPTO_rotl_u32(x.words[1] + x.words[0], 9); - x.words[3] ^= CRYPTO_rotl_u32(x.words[2] + x.words[1], 13); - x.words[0] ^= CRYPTO_rotl_u32(x.words[3] + x.words[2], 18); - x.words[6] ^= CRYPTO_rotl_u32(x.words[5] + x.words[4], 7); - x.words[7] ^= 
CRYPTO_rotl_u32(x.words[6] + x.words[5], 9); - x.words[4] ^= CRYPTO_rotl_u32(x.words[7] + x.words[6], 13); - x.words[5] ^= CRYPTO_rotl_u32(x.words[4] + x.words[7], 18); - x.words[11] ^= CRYPTO_rotl_u32(x.words[10] + x.words[9], 7); - x.words[8] ^= CRYPTO_rotl_u32(x.words[11] + x.words[10], 9); - x.words[9] ^= CRYPTO_rotl_u32(x.words[8] + x.words[11], 13); - x.words[10] ^= CRYPTO_rotl_u32(x.words[9] + x.words[8], 18); - x.words[12] ^= CRYPTO_rotl_u32(x.words[15] + x.words[14], 7); - x.words[13] ^= CRYPTO_rotl_u32(x.words[12] + x.words[15], 9); - x.words[14] ^= CRYPTO_rotl_u32(x.words[13] + x.words[12], 13); - x.words[15] ^= CRYPTO_rotl_u32(x.words[14] + x.words[13], 18); - } - - for (int i = 0; i < 16; ++i) { - inout->words[i] += x.words[i]; - } -} - -// xor_block sets |*out| to be |*a| XOR |*b|. -static void xor_block(block_t *out, const block_t *a, const block_t *b) { - for (size_t i = 0; i < 16; i++) { - out->words[i] = a->words[i] ^ b->words[i]; - } -} - -// scryptBlockMix implements the function described in RFC 7914, section 4. B' -// is written to |out|. |out| and |B| may not alias and must be each one scrypt -// block (2 * |r| Salsa20 blocks) long. -static void scryptBlockMix(block_t *out, const block_t *B, uint64_t r) { - assert(out != B); - - block_t X; - OPENSSL_memcpy(&X, &B[r * 2 - 1], sizeof(X)); - for (uint64_t i = 0; i < r * 2; i++) { - xor_block(&X, &X, &B[i]); - salsa208_word_specification(&X); - - // This implements the permutation in step 3. - OPENSSL_memcpy(&out[i / 2 + (i & 1) * r], &X, sizeof(X)); - } -} - -// scryptROMix implements the function described in RFC 7914, section 5. |B| is -// an scrypt block (2 * |r| Salsa20 blocks) and is modified in-place. |T| and -// |V| are scratch space allocated by the caller. |T| must have space for one -// scrypt block (2 * |r| Salsa20 blocks). |V| must have space for |N| scrypt -// blocks (2 * |r| * |N| Salsa20 blocks). 
-static void scryptROMix(block_t *B, uint64_t r, uint64_t N, block_t *T, - block_t *V) { - // Steps 1 and 2. - OPENSSL_memcpy(V, B, 2 * r * sizeof(block_t)); - for (uint64_t i = 1; i < N; i++) { - scryptBlockMix(&V[2 * r * i /* scrypt block i */], - &V[2 * r * (i - 1) /* scrypt block i-1 */], r); - } - scryptBlockMix(B, &V[2 * r * (N - 1) /* scrypt block N-1 */], r); - - // Step 3. - for (uint64_t i = 0; i < N; i++) { - // Note this assumes |N| <= 2^32 and is a power of 2. - uint32_t j = B[2 * r - 1].words[0] & (N - 1); - for (size_t k = 0; k < 2 * r; k++) { - xor_block(&T[k], &B[k], &V[2 * r * j + k]); - } - scryptBlockMix(B, T, r); - } -} - -// SCRYPT_PR_MAX is the maximum value of p * r. This is equivalent to the -// bounds on p in section 6: -// -// p <= ((2^32-1) * hLen) / MFLen iff -// p <= ((2^32-1) * 32) / (128 * r) iff -// p * r <= (2^30-1) -#define SCRYPT_PR_MAX ((1 << 30) - 1) - -// SCRYPT_MAX_MEM is the default maximum memory that may be allocated by -// |EVP_PBE_scrypt|. -#define SCRYPT_MAX_MEM (1024 * 1024 * 32) - -int EVP_PBE_scrypt(const char *password, size_t password_len, - const uint8_t *salt, size_t salt_len, uint64_t N, uint64_t r, - uint64_t p, size_t max_mem, uint8_t *out_key, - size_t key_len) { - if (r == 0 || p == 0 || p > SCRYPT_PR_MAX / r || - // |N| must be a power of two. - N < 2 || (N & (N - 1)) || - // We only support |N| <= 2^32 in |scryptROMix|. - N > UINT64_C(1) << 32 || - // Check that |N| < 2^(128×r / 8). - (16 * r <= 63 && N >= UINT64_C(1) << (16 * r))) { - OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_PARAMETERS); - return 0; - } - - // Determine the amount of memory needed. B, T, and V are |p|, 1, and |N| - // scrypt blocks, respectively. Each scrypt block is 2*|r| |block_t|s. 
- if (max_mem == 0) { - max_mem = SCRYPT_MAX_MEM; - } - - size_t max_scrypt_blocks = max_mem / (2 * r * sizeof(block_t)); - if (max_scrypt_blocks < p + 1 || - max_scrypt_blocks - p - 1 < N) { - OPENSSL_PUT_ERROR(EVP, EVP_R_MEMORY_LIMIT_EXCEEDED); - return 0; - } - - // Allocate and divide up the scratch space. |max_mem| fits in a size_t, which - // is no bigger than uint64_t, so none of these operations may overflow. - static_assert(UINT64_MAX >= ((size_t)-1), "size_t exceeds uint64_t"); - size_t B_blocks = p * 2 * r; - size_t B_bytes = B_blocks * sizeof(block_t); - size_t T_blocks = 2 * r; - size_t V_blocks = N * 2 * r; - block_t *B = OPENSSL_malloc((B_blocks + T_blocks + V_blocks) * sizeof(block_t)); - if (B == NULL) { - OPENSSL_PUT_ERROR(EVP, ERR_R_MALLOC_FAILURE); - return 0; - } - - int ret = 0; - block_t *T = B + B_blocks; - block_t *V = T + T_blocks; - - // NOTE: PKCS5_PBKDF2_HMAC can only fail due to allocation failure - // or |iterations| of 0 (we pass 1 here). This is consistent with - // the documented failure conditions of EVP_PBE_scrypt. - if (!PKCS5_PBKDF2_HMAC(password, password_len, salt, salt_len, 1, - EVP_sha256(), B_bytes, (uint8_t *)B)) { - goto err; - } - - for (uint64_t i = 0; i < p; i++) { - scryptROMix(B + 2 * r * i, r, N, T, V); - } - - if (!PKCS5_PBKDF2_HMAC(password, password_len, (const uint8_t *)B, B_bytes, 1, - EVP_sha256(), key_len, out_key)) { - goto err; - } - - ret = 1; - -err: - OPENSSL_free(B); - return ret; -} diff --git a/third_party/boringssl/src/crypto/evp/scrypt.cc b/third_party/boringssl/src/crypto/evp/scrypt.cc new file mode 100644 index 00000000..8c229b2e --- /dev/null +++ b/third_party/boringssl/src/crypto/evp/scrypt.cc @@ -0,0 +1,218 @@ +// Copyright 2015-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include +#include + +#include "../internal.h" + + +// This file implements scrypt, described in RFC 7914. +// +// Note scrypt refers to both "blocks" and a "block size" parameter, r. These +// are two different notions of blocks. A Salsa20 block is 64 bytes long, +// represented in this implementation by 16 |uint32_t|s. |r| determines the +// number of 64-byte Salsa20 blocks in a scryptBlockMix block, which is 2 * |r| +// Salsa20 blocks. This implementation refers to them as Salsa20 blocks and +// scrypt blocks, respectively. + +using namespace bssl; + +// A block_t is a Salsa20 block. +typedef struct { + uint32_t words[16]; +} block_t; + +static_assert(sizeof(block_t) == 64, "block_t has padding"); + +// salsa208_word_specification implements the Salsa20/8 core function, also +// described in RFC 7914, section 3. It modifies the block at |inout| +// in-place. 
+static void salsa208_word_specification(block_t *inout) { + block_t x; + OPENSSL_memcpy(&x, inout, sizeof(x)); + + for (int i = 8; i > 0; i -= 2) { + x.words[4] ^= CRYPTO_rotl_u32(x.words[0] + x.words[12], 7); + x.words[8] ^= CRYPTO_rotl_u32(x.words[4] + x.words[0], 9); + x.words[12] ^= CRYPTO_rotl_u32(x.words[8] + x.words[4], 13); + x.words[0] ^= CRYPTO_rotl_u32(x.words[12] + x.words[8], 18); + x.words[9] ^= CRYPTO_rotl_u32(x.words[5] + x.words[1], 7); + x.words[13] ^= CRYPTO_rotl_u32(x.words[9] + x.words[5], 9); + x.words[1] ^= CRYPTO_rotl_u32(x.words[13] + x.words[9], 13); + x.words[5] ^= CRYPTO_rotl_u32(x.words[1] + x.words[13], 18); + x.words[14] ^= CRYPTO_rotl_u32(x.words[10] + x.words[6], 7); + x.words[2] ^= CRYPTO_rotl_u32(x.words[14] + x.words[10], 9); + x.words[6] ^= CRYPTO_rotl_u32(x.words[2] + x.words[14], 13); + x.words[10] ^= CRYPTO_rotl_u32(x.words[6] + x.words[2], 18); + x.words[3] ^= CRYPTO_rotl_u32(x.words[15] + x.words[11], 7); + x.words[7] ^= CRYPTO_rotl_u32(x.words[3] + x.words[15], 9); + x.words[11] ^= CRYPTO_rotl_u32(x.words[7] + x.words[3], 13); + x.words[15] ^= CRYPTO_rotl_u32(x.words[11] + x.words[7], 18); + x.words[1] ^= CRYPTO_rotl_u32(x.words[0] + x.words[3], 7); + x.words[2] ^= CRYPTO_rotl_u32(x.words[1] + x.words[0], 9); + x.words[3] ^= CRYPTO_rotl_u32(x.words[2] + x.words[1], 13); + x.words[0] ^= CRYPTO_rotl_u32(x.words[3] + x.words[2], 18); + x.words[6] ^= CRYPTO_rotl_u32(x.words[5] + x.words[4], 7); + x.words[7] ^= CRYPTO_rotl_u32(x.words[6] + x.words[5], 9); + x.words[4] ^= CRYPTO_rotl_u32(x.words[7] + x.words[6], 13); + x.words[5] ^= CRYPTO_rotl_u32(x.words[4] + x.words[7], 18); + x.words[11] ^= CRYPTO_rotl_u32(x.words[10] + x.words[9], 7); + x.words[8] ^= CRYPTO_rotl_u32(x.words[11] + x.words[10], 9); + x.words[9] ^= CRYPTO_rotl_u32(x.words[8] + x.words[11], 13); + x.words[10] ^= CRYPTO_rotl_u32(x.words[9] + x.words[8], 18); + x.words[12] ^= CRYPTO_rotl_u32(x.words[15] + x.words[14], 7); + x.words[13] ^= 
CRYPTO_rotl_u32(x.words[12] + x.words[15], 9); + x.words[14] ^= CRYPTO_rotl_u32(x.words[13] + x.words[12], 13); + x.words[15] ^= CRYPTO_rotl_u32(x.words[14] + x.words[13], 18); + } + + for (int i = 0; i < 16; ++i) { + inout->words[i] += x.words[i]; + } +} + +// xor_block sets |*out| to be |*a| XOR |*b|. +static void xor_block(block_t *out, const block_t *a, const block_t *b) { + for (size_t i = 0; i < 16; i++) { + out->words[i] = a->words[i] ^ b->words[i]; + } +} + +// scryptBlockMix implements the function described in RFC 7914, section 4. B' +// is written to |out|. |out| and |B| may not alias and must be each one scrypt +// block (2 * |r| Salsa20 blocks) long. +static void scryptBlockMix(block_t *out, const block_t *B, uint64_t r) { + assert(out != B); + + block_t X; + OPENSSL_memcpy(&X, &B[r * 2 - 1], sizeof(X)); + for (uint64_t i = 0; i < r * 2; i++) { + xor_block(&X, &X, &B[i]); + salsa208_word_specification(&X); + + // This implements the permutation in step 3. + OPENSSL_memcpy(&out[i / 2 + (i & 1) * r], &X, sizeof(X)); + } +} + +// scryptROMix implements the function described in RFC 7914, section 5. |B| is +// an scrypt block (2 * |r| Salsa20 blocks) and is modified in-place. |T| and +// |V| are scratch space allocated by the caller. |T| must have space for one +// scrypt block (2 * |r| Salsa20 blocks). |V| must have space for |N| scrypt +// blocks (2 * |r| * |N| Salsa20 blocks). +static void scryptROMix(block_t *B, uint64_t r, uint64_t N, block_t *T, + block_t *V) { + // Steps 1 and 2. + OPENSSL_memcpy(V, B, 2 * r * sizeof(block_t)); + for (uint64_t i = 1; i < N; i++) { + scryptBlockMix(&V[2 * r * i /* scrypt block i */], + &V[2 * r * (i - 1) /* scrypt block i-1 */], r); + } + scryptBlockMix(B, &V[2 * r * (N - 1) /* scrypt block N-1 */], r); + + // Step 3. + for (uint64_t i = 0; i < N; i++) { + // Note this assumes |N| <= 2^32 and is a power of 2. 
+ uint32_t j = B[2 * r - 1].words[0] & (N - 1); + for (size_t k = 0; k < 2 * r; k++) { + xor_block(&T[k], &B[k], &V[2 * r * j + k]); + } + scryptBlockMix(B, T, r); + } +} + +// SCRYPT_PR_MAX is the maximum value of p * r. This is equivalent to the +// bounds on p in section 6: +// +// p <= ((2^32-1) * hLen) / MFLen iff +// p <= ((2^32-1) * 32) / (128 * r) iff +// p * r <= (2^30-1) +#define SCRYPT_PR_MAX ((1 << 30) - 1) + +// SCRYPT_MAX_MEM is the default maximum memory that may be allocated by +// |EVP_PBE_scrypt|. +#define SCRYPT_MAX_MEM (1024 * 1024 * 65) + +int EVP_PBE_scrypt(const char *password, size_t password_len, + const uint8_t *salt, size_t salt_len, uint64_t N, uint64_t r, + uint64_t p, size_t max_mem, uint8_t *out_key, + size_t key_len) { + if (r == 0 || p == 0 || p > SCRYPT_PR_MAX / r || + // |N| must be a power of two. + N < 2 || (N & (N - 1)) || + // We only support |N| <= 2^32 in |scryptROMix|. + N > UINT64_C(1) << 32 || + // Check that |N| < 2^(128×r / 8). + (16 * r <= 63 && N >= UINT64_C(1) << (16 * r))) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_PARAMETERS); + return 0; + } + + // Determine the amount of memory needed. B, T, and V are |p|, 1, and |N| + // scrypt blocks, respectively. Each scrypt block is 2*|r| |block_t|s. + if (max_mem == 0) { + max_mem = SCRYPT_MAX_MEM; + } + + size_t max_scrypt_blocks = max_mem / (2 * r * sizeof(block_t)); + if (max_scrypt_blocks < p + 1 || max_scrypt_blocks - p - 1 < N) { + OPENSSL_PUT_ERROR(EVP, EVP_R_MEMORY_LIMIT_EXCEEDED); + return 0; + } + + // Allocate and divide up the scratch space. |max_mem| fits in a size_t, which + // is no bigger than uint64_t, so none of these operations may overflow. 
+ static_assert(UINT64_MAX >= SIZE_MAX, "size_t exceeds uint64_t"); + size_t B_blocks = p * 2 * r; + size_t B_bytes = B_blocks * sizeof(block_t); + size_t T_blocks = 2 * r; + size_t V_blocks = N * 2 * r; + block_t *B = reinterpret_cast( + OPENSSL_calloc(B_blocks + T_blocks + V_blocks, sizeof(block_t))); + if (B == nullptr) { + return 0; + } + + int ret = 0; + block_t *T = B + B_blocks; + block_t *V = T + T_blocks; + + // NOTE: PKCS5_PBKDF2_HMAC can only fail due to allocation failure + // or |iterations| of 0 (we pass 1 here). This is consistent with + // the documented failure conditions of EVP_PBE_scrypt. + if (!PKCS5_PBKDF2_HMAC(password, password_len, salt, salt_len, 1, + EVP_sha256(), B_bytes, (uint8_t *)B)) { + goto err; + } + + for (uint64_t i = 0; i < p; i++) { + scryptROMix(B + 2 * r * i, r, N, T, V); + } + + if (!PKCS5_PBKDF2_HMAC(password, password_len, (const uint8_t *)B, B_bytes, 1, + EVP_sha256(), key_len, out_key)) { + goto err; + } + + ret = 1; + +err: + OPENSSL_free(B); + return ret; +} diff --git a/third_party/boringssl/src/crypto/evp/sign.c b/third_party/boringssl/src/crypto/evp/sign.c deleted file mode 100644 index ced86bdf..00000000 --- a/third_party/boringssl/src/crypto/evp/sign.c +++ /dev/null @@ -1,151 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). 
- * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include -#include - -#include "internal.h" - - -int EVP_SignInit_ex(EVP_MD_CTX *ctx, const EVP_MD *type, ENGINE *impl) { - return EVP_DigestInit_ex(ctx, type, impl); -} - -int EVP_SignInit(EVP_MD_CTX *ctx, const EVP_MD *type) { - return EVP_DigestInit(ctx, type); -} - -int EVP_SignUpdate(EVP_MD_CTX *ctx, const void *data, size_t len) { - return EVP_DigestUpdate(ctx, data, len); -} - -int EVP_SignFinal(const EVP_MD_CTX *ctx, uint8_t *sig, - unsigned int *out_sig_len, EVP_PKEY *pkey) { - uint8_t m[EVP_MAX_MD_SIZE]; - unsigned int m_len; - int ret = 0; - EVP_MD_CTX tmp_ctx; - EVP_PKEY_CTX *pkctx = NULL; - size_t sig_len = EVP_PKEY_size(pkey); - - *out_sig_len = 0; - EVP_MD_CTX_init(&tmp_ctx); - if (!EVP_MD_CTX_copy_ex(&tmp_ctx, ctx) || - !EVP_DigestFinal_ex(&tmp_ctx, m, &m_len)) { - goto out; - } - EVP_MD_CTX_cleanup(&tmp_ctx); - - pkctx = EVP_PKEY_CTX_new(pkey, NULL); - if (!pkctx || !EVP_PKEY_sign_init(pkctx) || - !EVP_PKEY_CTX_set_signature_md(pkctx, ctx->digest) || - !EVP_PKEY_sign(pkctx, sig, &sig_len, m, m_len)) { - goto out; - } - *out_sig_len = sig_len; - ret = 1; - -out: - if (pkctx) { - EVP_PKEY_CTX_free(pkctx); - } - - return ret; -} - -int 
EVP_VerifyInit_ex(EVP_MD_CTX *ctx, const EVP_MD *type, ENGINE *impl) { - return EVP_DigestInit_ex(ctx, type, impl); -} - -int EVP_VerifyInit(EVP_MD_CTX *ctx, const EVP_MD *type) { - return EVP_DigestInit(ctx, type); -} - -int EVP_VerifyUpdate(EVP_MD_CTX *ctx, const void *data, size_t len) { - return EVP_DigestUpdate(ctx, data, len); -} - -int EVP_VerifyFinal(EVP_MD_CTX *ctx, const uint8_t *sig, size_t sig_len, - EVP_PKEY *pkey) { - uint8_t m[EVP_MAX_MD_SIZE]; - unsigned int m_len; - int ret = 0; - EVP_MD_CTX tmp_ctx; - EVP_PKEY_CTX *pkctx = NULL; - - EVP_MD_CTX_init(&tmp_ctx); - if (!EVP_MD_CTX_copy_ex(&tmp_ctx, ctx) || - !EVP_DigestFinal_ex(&tmp_ctx, m, &m_len)) { - EVP_MD_CTX_cleanup(&tmp_ctx); - goto out; - } - EVP_MD_CTX_cleanup(&tmp_ctx); - - pkctx = EVP_PKEY_CTX_new(pkey, NULL); - if (!pkctx || - !EVP_PKEY_verify_init(pkctx) || - !EVP_PKEY_CTX_set_signature_md(pkctx, ctx->digest)) { - goto out; - } - ret = EVP_PKEY_verify(pkctx, sig, sig_len, m, m_len); - -out: - EVP_PKEY_CTX_free(pkctx); - return ret; -} - diff --git a/third_party/boringssl/src/crypto/evp/sign.cc b/third_party/boringssl/src/crypto/evp/sign.cc new file mode 100644 index 00000000..e14acc4e --- /dev/null +++ b/third_party/boringssl/src/crypto/evp/sign.cc @@ -0,0 +1,95 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include + +#include +#include + +#include "internal.h" + + +int EVP_SignInit_ex(EVP_MD_CTX *ctx, const EVP_MD *type, ENGINE *impl) { + return EVP_DigestInit_ex(ctx, type, impl); +} + +int EVP_SignInit(EVP_MD_CTX *ctx, const EVP_MD *type) { + return EVP_DigestInit(ctx, type); +} + +int EVP_SignUpdate(EVP_MD_CTX *ctx, const void *data, size_t len) { + return EVP_DigestUpdate(ctx, data, len); +} + +int EVP_SignFinal(const EVP_MD_CTX *ctx, uint8_t *sig, unsigned *out_sig_len, + EVP_PKEY *pkey) { + // Ensure the final result will fit in |unsigned|. + size_t sig_len = EVP_PKEY_size(pkey); + if (sig_len > UINT_MAX) { + sig_len = UINT_MAX; + } + + *out_sig_len = 0; + uint8_t m[EVP_MAX_MD_SIZE]; + unsigned m_len; + bssl::ScopedEVP_MD_CTX tmp_ctx; + if (!EVP_MD_CTX_copy_ex(tmp_ctx.get(), ctx) || + !EVP_DigestFinal_ex(tmp_ctx.get(), m, &m_len)) { + return 0; + } + + bssl::UniquePtr pkctx(EVP_PKEY_CTX_new(pkey, nullptr)); + if (!pkctx || // + !EVP_PKEY_sign_init(pkctx.get()) || + !EVP_PKEY_CTX_set_signature_md(pkctx.get(), ctx->digest) || + !EVP_PKEY_sign(pkctx.get(), sig, &sig_len, m, m_len)) { + return 0; + } + *out_sig_len = static_cast(sig_len); + return 1; +} + +int EVP_VerifyInit_ex(EVP_MD_CTX *ctx, const EVP_MD *type, ENGINE *impl) { + return EVP_DigestInit_ex(ctx, type, impl); +} + +int EVP_VerifyInit(EVP_MD_CTX *ctx, const EVP_MD *type) { + return EVP_DigestInit(ctx, type); +} + +int EVP_VerifyUpdate(EVP_MD_CTX *ctx, const void *data, size_t len) { + return EVP_DigestUpdate(ctx, data, len); +} + +int EVP_VerifyFinal(EVP_MD_CTX *ctx, const uint8_t *sig, size_t sig_len, + EVP_PKEY *pkey) { + uint8_t m[EVP_MAX_MD_SIZE]; + unsigned m_len; + bssl::ScopedEVP_MD_CTX tmp_ctx; + if (!EVP_MD_CTX_copy_ex(tmp_ctx.get(), ctx) || + !EVP_DigestFinal_ex(tmp_ctx.get(), m, &m_len)) { + return 0; + } + + bssl::UniquePtr pkctx(EVP_PKEY_CTX_new(pkey, nullptr)); + if (!pkctx || + !EVP_PKEY_verify_init(pkctx.get()) || + !EVP_PKEY_CTX_set_signature_md(pkctx.get(), 
ctx->digest)) { + return 0; + } + return EVP_PKEY_verify(pkctx.get(), sig, sig_len, m, m_len); +} + diff --git a/third_party/boringssl/src/crypto/ex_data.c b/third_party/boringssl/src/crypto/ex_data.c deleted file mode 100644 index 71d60a52..00000000 --- a/third_party/boringssl/src/crypto/ex_data.c +++ /dev/null @@ -1,261 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. 
All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] - */ -/* ==================================================================== - * Copyright (c) 1998-2001 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. 
Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). */ - -#include - -#include -#include - -#include -#include -#include -#include -#include - -#include "internal.h" - - -DEFINE_STACK_OF(CRYPTO_EX_DATA_FUNCS) - -struct crypto_ex_data_func_st { - long argl; // Arbitary long - void *argp; // Arbitary void pointer - CRYPTO_EX_free *free_func; -}; - -int CRYPTO_get_ex_new_index(CRYPTO_EX_DATA_CLASS *ex_data_class, int *out_index, - long argl, void *argp, CRYPTO_EX_free *free_func) { - CRYPTO_EX_DATA_FUNCS *funcs; - int ret = 0; - - funcs = OPENSSL_malloc(sizeof(CRYPTO_EX_DATA_FUNCS)); - if (funcs == NULL) { - OPENSSL_PUT_ERROR(CRYPTO, ERR_R_MALLOC_FAILURE); - return 0; - } - - funcs->argl = argl; - funcs->argp = argp; - funcs->free_func = free_func; - - CRYPTO_STATIC_MUTEX_lock_write(&ex_data_class->lock); - - if (ex_data_class->meth == NULL) { - ex_data_class->meth = sk_CRYPTO_EX_DATA_FUNCS_new_null(); - } - - if (ex_data_class->meth == NULL || - !sk_CRYPTO_EX_DATA_FUNCS_push(ex_data_class->meth, funcs)) { - OPENSSL_PUT_ERROR(CRYPTO, ERR_R_MALLOC_FAILURE); - OPENSSL_free(funcs); - goto err; - } - - *out_index = sk_CRYPTO_EX_DATA_FUNCS_num(ex_data_class->meth) - 1 + - ex_data_class->num_reserved; - ret = 1; - -err: - 
CRYPTO_STATIC_MUTEX_unlock_write(&ex_data_class->lock); - return ret; -} - -int CRYPTO_set_ex_data(CRYPTO_EX_DATA *ad, int index, void *val) { - int n, i; - - if (ad->sk == NULL) { - ad->sk = sk_void_new_null(); - if (ad->sk == NULL) { - OPENSSL_PUT_ERROR(CRYPTO, ERR_R_MALLOC_FAILURE); - return 0; - } - } - - n = sk_void_num(ad->sk); - - // Add NULL values until the stack is long enough. - for (i = n; i <= index; i++) { - if (!sk_void_push(ad->sk, NULL)) { - OPENSSL_PUT_ERROR(CRYPTO, ERR_R_MALLOC_FAILURE); - return 0; - } - } - - sk_void_set(ad->sk, index, val); - return 1; -} - -void *CRYPTO_get_ex_data(const CRYPTO_EX_DATA *ad, int idx) { - if (ad->sk == NULL || idx < 0 || (size_t)idx >= sk_void_num(ad->sk)) { - return NULL; - } - return sk_void_value(ad->sk, idx); -} - -// get_func_pointers takes a copy of the CRYPTO_EX_DATA_FUNCS pointers, if any, -// for the given class. If there are some pointers, it sets |*out| to point to -// a fresh stack of them. Otherwise it sets |*out| to NULL. It returns one on -// success or zero on error. -static int get_func_pointers(STACK_OF(CRYPTO_EX_DATA_FUNCS) **out, - CRYPTO_EX_DATA_CLASS *ex_data_class) { - size_t n; - - *out = NULL; - - // CRYPTO_EX_DATA_FUNCS structures are static once set, so we can take a - // shallow copy of the list under lock and then use the structures without - // the lock held. - CRYPTO_STATIC_MUTEX_lock_read(&ex_data_class->lock); - n = sk_CRYPTO_EX_DATA_FUNCS_num(ex_data_class->meth); - if (n > 0) { - *out = sk_CRYPTO_EX_DATA_FUNCS_dup(ex_data_class->meth); - } - CRYPTO_STATIC_MUTEX_unlock_read(&ex_data_class->lock); - - if (n > 0 && *out == NULL) { - OPENSSL_PUT_ERROR(CRYPTO, ERR_R_MALLOC_FAILURE); - return 0; - } - - return 1; -} - -void CRYPTO_new_ex_data(CRYPTO_EX_DATA *ad) { - ad->sk = NULL; -} - -void CRYPTO_free_ex_data(CRYPTO_EX_DATA_CLASS *ex_data_class, void *obj, - CRYPTO_EX_DATA *ad) { - if (ad->sk == NULL) { - // Nothing to do. 
- return; - } - - STACK_OF(CRYPTO_EX_DATA_FUNCS) *func_pointers; - if (!get_func_pointers(&func_pointers, ex_data_class)) { - // TODO(davidben): This leaks memory on malloc error. - return; - } - - for (size_t i = 0; i < sk_CRYPTO_EX_DATA_FUNCS_num(func_pointers); i++) { - CRYPTO_EX_DATA_FUNCS *func_pointer = - sk_CRYPTO_EX_DATA_FUNCS_value(func_pointers, i); - if (func_pointer->free_func) { - void *ptr = CRYPTO_get_ex_data(ad, i + ex_data_class->num_reserved); - func_pointer->free_func(obj, ptr, ad, i + ex_data_class->num_reserved, - func_pointer->argl, func_pointer->argp); - } - } - - sk_CRYPTO_EX_DATA_FUNCS_free(func_pointers); - - sk_void_free(ad->sk); - ad->sk = NULL; -} - -void CRYPTO_cleanup_all_ex_data(void) {} diff --git a/third_party/boringssl/src/crypto/ex_data.cc b/third_party/boringssl/src/crypto/ex_data.cc new file mode 100644 index 00000000..ea66bc6d --- /dev/null +++ b/third_party/boringssl/src/crypto/ex_data.cc @@ -0,0 +1,140 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include "internal.h" +#include "mem_internal.h" + + +BSSL_NAMESPACE_BEGIN + +struct ExDataFuncs { + long argl; // Arbitrary long + void *argp; // Arbitrary void pointer + CRYPTO_EX_free *free_func; + // next points to the next |ExDataFuncs| or NULL if this is the last + // one. 
It may only be read if synchronized with a read from |num_funcs|. + ExDataFuncs *next; +}; + +int CRYPTO_get_ex_new_index_ex(ExDataClass *ex_data_class, long argl, + void *argp, CRYPTO_EX_free *free_func) { + ExDataFuncs *funcs = New(); + if (funcs == nullptr) { + return -1; + } + + funcs->argl = argl; + funcs->argp = argp; + funcs->free_func = free_func; + funcs->next = nullptr; + + MutexWriteLock lock(&ex_data_class->lock); + + uint32_t num_funcs = ex_data_class->num_funcs.load(); + // The index must fit in |int|. + if (num_funcs > (size_t)(INT_MAX - ex_data_class->num_reserved)) { + OPENSSL_PUT_ERROR(CRYPTO, ERR_R_OVERFLOW); + return -1; + } + + // Append |funcs| to the linked list. + if (ex_data_class->last == nullptr) { + assert(num_funcs == 0); + ex_data_class->funcs = funcs; + ex_data_class->last = funcs; + } else { + ex_data_class->last->next = funcs; + ex_data_class->last = funcs; + } + + ex_data_class->num_funcs.store(num_funcs + 1); + return (int)num_funcs + ex_data_class->num_reserved; +} + +int CRYPTO_set_ex_data(CRYPTO_EX_DATA *ad, int index, void *val) { + if (index < 0) { + // A caller that can accidentally pass in an invalid index into this + // function will hit an memory error if |index| happened to be valid, and + // expected |val| to be of a different type. + abort(); + } + + if (ad->sk == nullptr) { + ad->sk = sk_void_new_null(); + if (ad->sk == nullptr) { + return 0; + } + } + + // Add NULL values until the stack is long enough. 
+ for (size_t i = sk_void_num(ad->sk); i <= (size_t)index; i++) { + if (!sk_void_push(ad->sk, nullptr)) { + return 0; + } + } + + sk_void_set(ad->sk, (size_t)index, val); + return 1; +} + +void *CRYPTO_get_ex_data(const CRYPTO_EX_DATA *ad, int idx) { + if (ad->sk == nullptr || idx < 0 || (size_t)idx >= sk_void_num(ad->sk)) { + return nullptr; + } + return sk_void_value(ad->sk, idx); +} + +void CRYPTO_new_ex_data(CRYPTO_EX_DATA *ad) { ad->sk = nullptr; } + +void CRYPTO_free_ex_data(ExDataClass *ex_data_class, CRYPTO_EX_DATA *ad) { + if (ad->sk == nullptr) { + // Nothing to do. + return; + } + + uint32_t num_funcs = ex_data_class->num_funcs.load(); + // |CRYPTO_get_ex_new_index_ex| will not allocate indices beyond |INT_MAX|. + assert(num_funcs <= (size_t)(INT_MAX - ex_data_class->num_reserved)); + + // Defer dereferencing |ex_data_class->funcs| and |funcs->next|. It must come + // after the |num_funcs| comparison to be correctly synchronized. + ExDataFuncs *const *funcs = &ex_data_class->funcs; + for (uint32_t i = 0; i < num_funcs; i++) { + if ((*funcs)->free_func != nullptr) { + int index = (int)i + ex_data_class->num_reserved; + void *ptr = CRYPTO_get_ex_data(ad, index); + (*funcs)->free_func(/*parent=*/nullptr, ptr, /*ad*/ nullptr, index, + (*funcs)->argl, (*funcs)->argp); + } + funcs = &(*funcs)->next; + } + + sk_void_free(ad->sk); + ad->sk = nullptr; +} + +BSSL_NAMESPACE_END + +void CRYPTO_cleanup_all_ex_data() {} diff --git a/third_party/boringssl/src/crypto/fipsmodule/aes/aes.c b/third_party/boringssl/src/crypto/fipsmodule/aes/aes.c deleted file mode 100644 index 60f35457..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/aes/aes.c +++ /dev/null @@ -1,106 +0,0 @@ -/* ==================================================================== - * Copyright (c) 2002-2006 The OpenSSL Project. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== */ - -#include - -#include - -#include "internal.h" -#include "../modes/internal.h" - - -// Be aware that different sets of AES functions use incompatible key -// representations, varying in format of the key schedule, the |AES_KEY.rounds| -// value, or both. Therefore they cannot mix. Also, on AArch64, the plain-C -// code, above, is incompatible with the |aes_hw_*| functions. 
- -void AES_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) { - if (hwaes_capable()) { - aes_hw_encrypt(in, out, key); - } else if (vpaes_capable()) { - vpaes_encrypt(in, out, key); - } else { - aes_nohw_encrypt(in, out, key); - } -} - -void AES_decrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) { - if (hwaes_capable()) { - aes_hw_decrypt(in, out, key); - } else if (vpaes_capable()) { - vpaes_decrypt(in, out, key); - } else { - aes_nohw_decrypt(in, out, key); - } -} - -int AES_set_encrypt_key(const uint8_t *key, unsigned bits, AES_KEY *aeskey) { - if (bits != 128 && bits != 192 && bits != 256) { - return -2; - } - if (hwaes_capable()) { - return aes_hw_set_encrypt_key(key, bits, aeskey); - } else if (vpaes_capable()) { - return vpaes_set_encrypt_key(key, bits, aeskey); - } else { - return aes_nohw_set_encrypt_key(key, bits, aeskey); - } -} - -int AES_set_decrypt_key(const uint8_t *key, unsigned bits, AES_KEY *aeskey) { - if (bits != 128 && bits != 192 && bits != 256) { - return -2; - } - if (hwaes_capable()) { - return aes_hw_set_decrypt_key(key, bits, aeskey); - } else if (vpaes_capable()) { - return vpaes_set_decrypt_key(key, bits, aeskey); - } else { - return aes_nohw_set_decrypt_key(key, bits, aeskey); - } -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/aes/aes.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/aes/aes.cc.inc new file mode 100644 index 00000000..56a110f3 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/aes/aes.cc.inc @@ -0,0 +1,208 @@ +// Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "../bcm_interface.h" +#include "internal.h" + + +using namespace bssl; + +// Be aware that different sets of AES functions use incompatible key +// representations, varying in format of the key schedule, the |AES_KEY.rounds| +// value, or both. Therefore they cannot mix. Also, on AArch64, the plain-C +// code, above, is incompatible with the |aes_hw_*| functions. + +bcm_infallible bssl::BCM_aes_encrypt(const uint8_t *in, uint8_t *out, + const AES_KEY *key) { + if (hwaes_capable()) { + aes_hw_encrypt(in, out, key); + } else if (vpaes_capable()) { + vpaes_encrypt(in, out, key); + } else { + aes_nohw_encrypt(in, out, key); + } + return bcm_infallible::not_approved; +} + +bcm_infallible bssl::BCM_aes_decrypt(const uint8_t *in, uint8_t *out, + const AES_KEY *key) { + if (hwaes_capable()) { + aes_hw_decrypt(in, out, key); + } else if (vpaes_capable()) { + vpaes_decrypt(in, out, key); + } else { + aes_nohw_decrypt(in, out, key); + } + return bcm_infallible::not_approved; +} + +bcm_status bssl::BCM_aes_set_encrypt_key(const uint8_t *key, unsigned bits, + AES_KEY *aeskey) { + int ret = -1; + if (hwaes_capable()) { + ret = aes_hw_set_encrypt_key(key, bits, aeskey); + } else if (vpaes_capable()) { + ret = vpaes_set_encrypt_key(key, bits, aeskey); + } else { + ret = aes_nohw_set_encrypt_key(key, bits, aeskey); + } + if (ret < 0) { + return bcm_status::failure; + } + BSSL_CHECK(ret == 0); + return bcm_status::not_approved; +} + +bcm_status bssl::BCM_aes_set_decrypt_key(const uint8_t *key, unsigned bits, + AES_KEY *aeskey) { + int ret = 
-1; + if (hwaes_capable()) { + ret = aes_hw_set_decrypt_key(key, bits, aeskey); + } else if (vpaes_capable()) { + ret = vpaes_set_decrypt_key(key, bits, aeskey); + } else { + ret = aes_nohw_set_decrypt_key(key, bits, aeskey); + } + if (ret < 0) { + return bcm_status::failure; + } + BSSL_CHECK(ret == 0); + return bcm_status::not_approved; +} + +#if defined(HWAES) && (defined(OPENSSL_X86) || defined(OPENSSL_X86_64)) +// On x86 and x86_64, |aes_hw_set_decrypt_key|, we implement +// |aes_hw_encrypt_key_to_decrypt_key| in assembly and rely on C code to combine +// the operations. +int bssl::aes_hw_set_decrypt_key(const uint8_t *user_key, int bits, + AES_KEY *key) { + int ret = aes_hw_set_encrypt_key(user_key, bits, key); + if (ret == 0) { + aes_hw_encrypt_key_to_decrypt_key(key); + } + return ret; +} + +int bssl::aes_hw_set_encrypt_key(const uint8_t *user_key, int bits, + AES_KEY *key) { + if (aes_hw_set_encrypt_key_alt_preferred()) { + return aes_hw_set_encrypt_key_alt(user_key, bits, key); + } else { + return aes_hw_set_encrypt_key_base(user_key, bits, key); + } +} +#endif + +#if defined(VPAES) && defined(OPENSSL_X86) +// On x86, there is no |vpaes_ctr32_encrypt_blocks|, so we implement it +// ourselves. This avoids all callers needing to account for a missing function. 
+void bssl::vpaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, + size_t blocks, const AES_KEY *key, + const uint8_t iv[16]) { + uint32_t ctr = CRYPTO_load_u32_be(iv + 12); + uint8_t iv_buf[16], enc[16]; + OPENSSL_memcpy(iv_buf, iv, 12); + for (size_t i = 0; i < blocks; i++) { + CRYPTO_store_u32_be(iv_buf + 12, ctr); + vpaes_encrypt(iv_buf, enc, key); + CRYPTO_xor16(out, in, enc); + ctr++; + in += 16; + out += 16; + } +} +#endif + +#if defined(BSAES) +void bssl::vpaes_ctr32_encrypt_blocks_with_bsaes(const uint8_t *in, + uint8_t *out, size_t blocks, + const AES_KEY *key, + const uint8_t ivec[16]) { + // |bsaes_ctr32_encrypt_blocks| is faster than |vpaes_ctr32_encrypt_blocks|, + // but it takes at least one full 8-block batch to amortize the conversion. + if (blocks < 8) { + vpaes_ctr32_encrypt_blocks(in, out, blocks, key, ivec); + return; + } + + size_t bsaes_blocks = blocks; + if (bsaes_blocks % 8 < 6) { + // |bsaes_ctr32_encrypt_blocks| internally works in 8-block batches. If the + // final batch is too small (under six blocks), it is faster to loop over + // |vpaes_encrypt|. Round |bsaes_blocks| down to a multiple of 8. + bsaes_blocks -= bsaes_blocks % 8; + } + + AES_KEY bsaes; + vpaes_encrypt_key_to_bsaes(&bsaes, key); + bsaes_ctr32_encrypt_blocks(in, out, bsaes_blocks, &bsaes, ivec); + OPENSSL_cleanse(&bsaes, sizeof(bsaes)); + + in += 16 * bsaes_blocks; + out += 16 * bsaes_blocks; + blocks -= bsaes_blocks; + + uint8_t new_ivec[16]; + memcpy(new_ivec, ivec, 12); + uint32_t ctr = CRYPTO_load_u32_be(ivec + 12) + bsaes_blocks; + CRYPTO_store_u32_be(new_ivec + 12, ctr); + + // Finish any remaining blocks with |vpaes_ctr32_encrypt_blocks|. + vpaes_ctr32_encrypt_blocks(in, out, blocks, key, new_ivec); +} +#endif // BSAES + +ctr128_f bssl::aes_ctr_set_key(AES_KEY *aes_key, int *out_is_hwaes, + block128_f *out_block, const uint8_t *key, + size_t key_bytes) { + // This function assumes the key length was previously validated. 
+ assert(key_bytes == 128 / 8 || key_bytes == 192 / 8 || key_bytes == 256 / 8); + if (hwaes_capable()) { + aes_hw_set_encrypt_key(key, (int)key_bytes * 8, aes_key); + if (out_is_hwaes) { + *out_is_hwaes = 1; + } + if (out_block) { + *out_block = aes_hw_encrypt; + } + return aes_hw_ctr32_encrypt_blocks; + } + + if (vpaes_capable()) { + vpaes_set_encrypt_key(key, (int)key_bytes * 8, aes_key); + if (out_block) { + *out_block = vpaes_encrypt; + } + if (out_is_hwaes) { + *out_is_hwaes = 0; + } +#if defined(BSAES) + assert(bsaes_capable()); + return vpaes_ctr32_encrypt_blocks_with_bsaes; +#else + return vpaes_ctr32_encrypt_blocks; +#endif + } + + aes_nohw_set_encrypt_key(key, (int)key_bytes * 8, aes_key); + if (out_is_hwaes) { + *out_is_hwaes = 0; + } + if (out_block) { + *out_block = aes_nohw_encrypt; + } + return aes_nohw_ctr32_encrypt_blocks; +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/aes/aes_nohw.c b/third_party/boringssl/src/crypto/fipsmodule/aes/aes_nohw.c deleted file mode 100644 index b5990b84..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/aes/aes_nohw.c +++ /dev/null @@ -1,1281 +0,0 @@ -/* Copyright (c) 2019, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ - -#include - -#include -#include - -#include "../../internal.h" -#include "internal.h" - -#if defined(OPENSSL_SSE2) -#include -#endif - - -// This file contains a constant-time implementation of AES, bitsliced with -// 32-bit, 64-bit, or 128-bit words, operating on two-, four-, and eight-block -// batches, respectively. The 128-bit implementation requires SSE2 intrinsics. -// -// This implementation is based on the algorithms described in the following -// references: -// - https://bearssl.org/constanttime.html#aes -// - https://eprint.iacr.org/2009/129.pdf -// - https://eprint.iacr.org/2009/191.pdf - - -// Word operations. -// -// An aes_word_t is the word used for this AES implementation. Throughout this -// file, bits and bytes are ordered little-endian, though "left" and "right" -// shifts match the operations themselves, which makes them reversed in a -// little-endian, left-to-right reading. -// -// Eight |aes_word_t|s contain |AES_NOHW_BATCH_SIZE| blocks. The bits in an -// |aes_word_t| are divided into 16 consecutive groups of |AES_NOHW_BATCH_SIZE| -// bits each, each corresponding to a byte in an AES block in column-major -// order (AES's byte order). We refer to these as "logical bytes". Note, in the -// 32-bit and 64-bit implementations, they are smaller than a byte. (The -// contents of a logical byte will be described later.) -// -// MSVC does not support C bit operators on |__m128i|, so the wrapper functions -// |aes_nohw_and|, etc., should be used instead. Note |aes_nohw_shift_left| and -// |aes_nohw_shift_right| measure the shift in logical bytes. That is, the shift -// value ranges from 0 to 15 independent of |aes_word_t| and -// |AES_NOHW_BATCH_SIZE|. -// -// This ordering is different from https://eprint.iacr.org/2009/129.pdf, which -// uses row-major order. Matching the AES order was easier to reason about, and -// we do not have PSHUFB available to arbitrarily permute bytes. 
- -#if defined(OPENSSL_SSE2) -typedef __m128i aes_word_t; -// AES_NOHW_WORD_SIZE is sizeof(aes_word_t). alignas(sizeof(T)) does not work in -// MSVC, so we define a constant. -#define AES_NOHW_WORD_SIZE 16 -#define AES_NOHW_BATCH_SIZE 8 -#define AES_NOHW_ROW0_MASK \ - _mm_set_epi32(0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff) -#define AES_NOHW_ROW1_MASK \ - _mm_set_epi32(0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00) -#define AES_NOHW_ROW2_MASK \ - _mm_set_epi32(0x00ff0000, 0x00ff0000, 0x00ff0000, 0x00ff0000) -#define AES_NOHW_ROW3_MASK \ - _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000) -#define AES_NOHW_COL01_MASK \ - _mm_set_epi32(0x00000000, 0x00000000, 0xffffffff, 0xffffffff) -#define AES_NOHW_COL2_MASK \ - _mm_set_epi32(0x00000000, 0xffffffff, 0x00000000, 0x00000000) -#define AES_NOHW_COL3_MASK \ - _mm_set_epi32(0xffffffff, 0x00000000, 0x00000000, 0x00000000) - -static inline aes_word_t aes_nohw_and(aes_word_t a, aes_word_t b) { - return _mm_and_si128(a, b); -} - -static inline aes_word_t aes_nohw_or(aes_word_t a, aes_word_t b) { - return _mm_or_si128(a, b); -} - -static inline aes_word_t aes_nohw_xor(aes_word_t a, aes_word_t b) { - return _mm_xor_si128(a, b); -} - -static inline aes_word_t aes_nohw_not(aes_word_t a) { - return _mm_xor_si128( - a, _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff)); -} - -// These are macros because parameters to |_mm_slli_si128| and |_mm_srli_si128| -// must be constants. 
-#define aes_nohw_shift_left(/* aes_word_t */ a, /* const */ i) \ - _mm_slli_si128((a), (i)) -#define aes_nohw_shift_right(/* aes_word_t */ a, /* const */ i) \ - _mm_srli_si128((a), (i)) -#else // !OPENSSL_SSE2 -#if defined(OPENSSL_64_BIT) -typedef uint64_t aes_word_t; -#define AES_NOHW_WORD_SIZE 8 -#define AES_NOHW_BATCH_SIZE 4 -#define AES_NOHW_ROW0_MASK UINT64_C(0x000f000f000f000f) -#define AES_NOHW_ROW1_MASK UINT64_C(0x00f000f000f000f0) -#define AES_NOHW_ROW2_MASK UINT64_C(0x0f000f000f000f00) -#define AES_NOHW_ROW3_MASK UINT64_C(0xf000f000f000f000) -#define AES_NOHW_COL01_MASK UINT64_C(0x00000000ffffffff) -#define AES_NOHW_COL2_MASK UINT64_C(0x0000ffff00000000) -#define AES_NOHW_COL3_MASK UINT64_C(0xffff000000000000) -#else // !OPENSSL_64_BIT -typedef uint32_t aes_word_t; -#define AES_NOHW_WORD_SIZE 4 -#define AES_NOHW_BATCH_SIZE 2 -#define AES_NOHW_ROW0_MASK 0x03030303 -#define AES_NOHW_ROW1_MASK 0x0c0c0c0c -#define AES_NOHW_ROW2_MASK 0x30303030 -#define AES_NOHW_ROW3_MASK 0xc0c0c0c0 -#define AES_NOHW_COL01_MASK 0x0000ffff -#define AES_NOHW_COL2_MASK 0x00ff0000 -#define AES_NOHW_COL3_MASK 0xff000000 -#endif // OPENSSL_64_BIT - -static inline aes_word_t aes_nohw_and(aes_word_t a, aes_word_t b) { - return a & b; -} - -static inline aes_word_t aes_nohw_or(aes_word_t a, aes_word_t b) { - return a | b; -} - -static inline aes_word_t aes_nohw_xor(aes_word_t a, aes_word_t b) { - return a ^ b; -} - -static inline aes_word_t aes_nohw_not(aes_word_t a) { return ~a; } - -static inline aes_word_t aes_nohw_shift_left(aes_word_t a, aes_word_t i) { - return a << (i * AES_NOHW_BATCH_SIZE); -} - -static inline aes_word_t aes_nohw_shift_right(aes_word_t a, aes_word_t i) { - return a >> (i * AES_NOHW_BATCH_SIZE); -} -#endif // OPENSSL_SSE2 - -static_assert(AES_NOHW_BATCH_SIZE * 128 == 8 * 8 * sizeof(aes_word_t), - "batch size does not match word size"); -static_assert(AES_NOHW_WORD_SIZE == sizeof(aes_word_t), - "AES_NOHW_WORD_SIZE is incorrect"); - - -// Block representations. 
-// -// This implementation uses three representations for AES blocks. First, the -// public API represents blocks as uint8_t[16] in the usual way. Second, most -// AES steps are evaluated in bitsliced form, stored in an |AES_NOHW_BATCH|. -// This stores |AES_NOHW_BATCH_SIZE| blocks in bitsliced order. For 64-bit words -// containing bitsliced blocks a, b, c, d, this would be as follows (vertical -// bars divide logical bytes): -// -// batch.w[0] = a0 b0 c0 d0 | a8 b8 c8 d8 | a16 b16 c16 d16 ... -// batch.w[1] = a1 b1 c1 d1 | a9 b9 c9 d9 | a17 b17 c17 d17 ... -// batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ... -// batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ... -// ... -// -// Finally, an individual block may be stored as an intermediate form in an -// aes_word_t[AES_NOHW_BLOCK_WORDS]. In this form, we permute the bits in each -// block, so that block[0]'s ith logical byte contains least-significant -// |AES_NOHW_BATCH_SIZE| bits of byte i, block[1] contains the next group of -// |AES_NOHW_BATCH_SIZE| bits, and so on. We refer to this transformation as -// "compacting" the block. Note this is no-op with 128-bit words because then -// |AES_NOHW_BLOCK_WORDS| is one and |AES_NOHW_BATCH_SIZE| is eight. For 64-bit -// words, one block would be stored in two words: -// -// block[0] = a0 a1 a2 a3 | a8 a9 a10 a11 | a16 a17 a18 a19 ... -// block[1] = a4 a5 a6 a7 | a12 a13 a14 a15 | a20 a21 a22 a23 ... -// -// Observe that the distances between corresponding bits in bitsliced and -// compact bit orders match. If we line up corresponding words of each block, -// the bitsliced and compact representations may be converted by tranposing bits -// in corresponding logical bytes. Continuing the 64-bit example: -// -// block_a[0] = a0 a1 a2 a3 | a8 a9 a10 a11 | a16 a17 a18 a19 ... -// block_b[0] = b0 b1 b2 b3 | b8 b9 b10 b11 | b16 b17 b18 b19 ... -// block_c[0] = c0 c1 c2 c3 | c8 c9 c10 c11 | c16 c17 c18 c19 ... 
-// block_d[0] = d0 d1 d2 d3 | d8 d9 d10 d11 | d16 d17 d18 d19 ... -// -// batch.w[0] = a0 b0 c0 d0 | a8 b8 c8 d8 | a16 b16 c16 d16 ... -// batch.w[1] = a1 b1 c1 d1 | a9 b9 c9 d9 | a17 b17 c17 d17 ... -// batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ... -// batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ... -// -// Note also that bitwise operations and (logical) byte permutations on an -// |aes_word_t| work equally for the bitsliced and compact words. -// -// We use the compact form in the |AES_KEY| representation to save work -// inflating round keys into |AES_NOHW_BATCH|. The compact form also exists -// temporarily while moving blocks in or out of an |AES_NOHW_BATCH|, immediately -// before or after |aes_nohw_transpose|. - -#define AES_NOHW_BLOCK_WORDS (16 / sizeof(aes_word_t)) - -// An AES_NOHW_BATCH stores |AES_NOHW_BATCH_SIZE| blocks. Unless otherwise -// specified, it is in bitsliced form. -typedef struct { - aes_word_t w[8]; -} AES_NOHW_BATCH; - -// An AES_NOHW_SCHEDULE is an expanded bitsliced AES key schedule. It is -// suitable for encryption or decryption. It is as large as |AES_NOHW_BATCH| -// |AES_KEY|s so it should not be used as a long-term key representation. -typedef struct { - // keys is an array of batches, one for each round key. Each batch stores - // |AES_NOHW_BATCH_SIZE| copies of the round key in bitsliced form. - AES_NOHW_BATCH keys[AES_MAXNR + 1]; -} AES_NOHW_SCHEDULE; - -// aes_nohw_batch_set sets the |i|th block of |batch| to |in|. |batch| is in -// compact form. -static inline void aes_nohw_batch_set(AES_NOHW_BATCH *batch, - const aes_word_t in[AES_NOHW_BLOCK_WORDS], - size_t i) { - // Note the words are interleaved. The order comes from |aes_nohw_transpose|. - // If |i| is zero and this is the 64-bit implementation, in[0] contains bits - // 0-3 and in[1] contains bits 4-7. We place in[0] at w[0] and in[1] at - // w[4] so that bits 0 and 4 are in the correct position. 
(In general, bits - // along diagonals of |AES_NOHW_BATCH_SIZE| by |AES_NOHW_BATCH_SIZE| squares - // will be correctly placed.) - assert(i < AES_NOHW_BATCH_SIZE); -#if defined(OPENSSL_SSE2) - batch->w[i] = in[0]; -#elif defined(OPENSSL_64_BIT) - batch->w[i] = in[0]; - batch->w[i + 4] = in[1]; -#else - batch->w[i] = in[0]; - batch->w[i + 2] = in[1]; - batch->w[i + 4] = in[2]; - batch->w[i + 6] = in[3]; -#endif -} - -// aes_nohw_batch_get writes the |i|th block of |batch| to |out|. |batch| is in -// compact form. -static inline void aes_nohw_batch_get(const AES_NOHW_BATCH *batch, - aes_word_t out[AES_NOHW_BLOCK_WORDS], - size_t i) { - assert(i < AES_NOHW_BATCH_SIZE); -#if defined(OPENSSL_SSE2) - out[0] = batch->w[i]; -#elif defined(OPENSSL_64_BIT) - out[0] = batch->w[i]; - out[1] = batch->w[i + 4]; -#else - out[0] = batch->w[i]; - out[1] = batch->w[i + 2]; - out[2] = batch->w[i + 4]; - out[3] = batch->w[i + 6]; -#endif -} - -#if !defined(OPENSSL_SSE2) -// aes_nohw_delta_swap returns |a| with bits |a & mask| and -// |a & (mask << shift)| swapped. |mask| and |mask << shift| may not overlap. -static inline aes_word_t aes_nohw_delta_swap(aes_word_t a, aes_word_t mask, - aes_word_t shift) { - // See - // https://reflectionsonsecurity.wordpress.com/2014/05/11/efficient-bit-permutation-using-delta-swaps/ - aes_word_t b = (a ^ (a >> shift)) & mask; - return a ^ b ^ (b << shift); -} - -// In the 32-bit and 64-bit implementations, a block spans multiple words. -// |aes_nohw_compact_block| must permute bits across different words. First we -// implement |aes_nohw_compact_word| which performs a smaller version of the -// transformation which stays within a single word. -// -// These transformations are generalizations of the output of -// http://programming.sirrida.de/calcperm.php on smaller inputs. 
-#if defined(OPENSSL_64_BIT) -static inline uint64_t aes_nohw_compact_word(uint64_t a) { - // Numbering the 64/2 = 16 4-bit chunks, least to most significant, we swap - // quartets of those chunks: - // 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 => - // 0 2 1 3 | 4 6 5 7 | 8 10 9 11 | 12 14 13 15 - a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4); - // Swap quartets of 8-bit chunks (still numbering by 4-bit chunks): - // 0 2 1 3 | 4 6 5 7 | 8 10 9 11 | 12 14 13 15 => - // 0 2 4 6 | 1 3 5 7 | 8 10 12 14 | 9 11 13 15 - a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8); - // Swap quartets of 16-bit chunks (still numbering by 4-bit chunks): - // 0 2 4 6 | 1 3 5 7 | 8 10 12 14 | 9 11 13 15 => - // 0 2 4 6 | 8 10 12 14 | 1 3 5 7 | 9 11 13 15 - a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16); - return a; -} - -static inline uint64_t aes_nohw_uncompact_word(uint64_t a) { - // Reverse the steps of |aes_nohw_uncompact_word|. - a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16); - a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8); - a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4); - return a; -} -#else // !OPENSSL_64_BIT -static inline uint32_t aes_nohw_compact_word(uint32_t a) { - // Numbering the 32/2 = 16 pairs of bits, least to most significant, we swap: - // 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 => - // 0 4 2 6 | 1 5 3 7 | 8 12 10 14 | 9 13 11 15 - // Note: 0x00cc = 0b0000_0000_1100_1100 - // 0x00cc << 6 = 0b0011_0011_0000_0000 - a = aes_nohw_delta_swap(a, 0x00cc00cc, 6); - // Now we swap groups of four bits (still numbering by pairs): - // 0 4 2 6 | 1 5 3 7 | 8 12 10 14 | 9 13 11 15 => - // 0 4 8 12 | 1 5 9 13 | 2 6 10 14 | 3 7 11 15 - // Note: 0x0000_f0f0 << 12 = 0x0f0f_0000 - a = aes_nohw_delta_swap(a, 0x0000f0f0, 12); - return a; -} - -static inline uint32_t aes_nohw_uncompact_word(uint32_t a) { - // Reverse the steps of |aes_nohw_uncompact_word|. 
- a = aes_nohw_delta_swap(a, 0x0000f0f0, 12); - a = aes_nohw_delta_swap(a, 0x00cc00cc, 6); - return a; -} - -static inline uint32_t aes_nohw_word_from_bytes(uint8_t a0, uint8_t a1, - uint8_t a2, uint8_t a3) { - return (uint32_t)a0 | ((uint32_t)a1 << 8) | ((uint32_t)a2 << 16) | - ((uint32_t)a3 << 24); -} -#endif // OPENSSL_64_BIT -#endif // !OPENSSL_SSE2 - -static inline void aes_nohw_compact_block(aes_word_t out[AES_NOHW_BLOCK_WORDS], - const uint8_t in[16]) { - memcpy(out, in, 16); -#if defined(OPENSSL_SSE2) - // No conversions needed. -#elif defined(OPENSSL_64_BIT) - uint64_t a0 = aes_nohw_compact_word(out[0]); - uint64_t a1 = aes_nohw_compact_word(out[1]); - out[0] = (a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32); - out[1] = (a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32); -#else - uint32_t a0 = aes_nohw_compact_word(out[0]); - uint32_t a1 = aes_nohw_compact_word(out[1]); - uint32_t a2 = aes_nohw_compact_word(out[2]); - uint32_t a3 = aes_nohw_compact_word(out[3]); - // Note clang, when building for ARM Thumb2, will sometimes miscompile - // expressions such as (a0 & 0x0000ff00) << 8, particularly when building - // without optimizations. This bug was introduced in - // https://reviews.llvm.org/rL340261 and fixed in - // https://reviews.llvm.org/rL351310. The following is written to avoid this. - out[0] = aes_nohw_word_from_bytes(a0, a1, a2, a3); - out[1] = aes_nohw_word_from_bytes(a0 >> 8, a1 >> 8, a2 >> 8, a3 >> 8); - out[2] = aes_nohw_word_from_bytes(a0 >> 16, a1 >> 16, a2 >> 16, a3 >> 16); - out[3] = aes_nohw_word_from_bytes(a0 >> 24, a1 >> 24, a2 >> 24, a3 >> 24); -#endif -} - -static inline void aes_nohw_uncompact_block( - uint8_t out[16], const aes_word_t in[AES_NOHW_BLOCK_WORDS]) { -#if defined(OPENSSL_SSE2) - memcpy(out, in, 16); // No conversions needed. 
-#elif defined(OPENSSL_64_BIT) - uint64_t a0 = in[0]; - uint64_t a1 = in[1]; - uint64_t b0 = - aes_nohw_uncompact_word((a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32)); - uint64_t b1 = - aes_nohw_uncompact_word((a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32)); - memcpy(out, &b0, 8); - memcpy(out + 8, &b1, 8); -#else - uint32_t a0 = in[0]; - uint32_t a1 = in[1]; - uint32_t a2 = in[2]; - uint32_t a3 = in[3]; - // Note clang, when building for ARM Thumb2, will sometimes miscompile - // expressions such as (a0 & 0x0000ff00) << 8, particularly when building - // without optimizations. This bug was introduced in - // https://reviews.llvm.org/rL340261 and fixed in - // https://reviews.llvm.org/rL351310. The following is written to avoid this. - uint32_t b0 = aes_nohw_word_from_bytes(a0, a1, a2, a3); - uint32_t b1 = aes_nohw_word_from_bytes(a0 >> 8, a1 >> 8, a2 >> 8, a3 >> 8); - uint32_t b2 = - aes_nohw_word_from_bytes(a0 >> 16, a1 >> 16, a2 >> 16, a3 >> 16); - uint32_t b3 = - aes_nohw_word_from_bytes(a0 >> 24, a1 >> 24, a2 >> 24, a3 >> 24); - b0 = aes_nohw_uncompact_word(b0); - b1 = aes_nohw_uncompact_word(b1); - b2 = aes_nohw_uncompact_word(b2); - b3 = aes_nohw_uncompact_word(b3); - memcpy(out, &b0, 4); - memcpy(out + 4, &b1, 4); - memcpy(out + 8, &b2, 4); - memcpy(out + 12, &b3, 4); -#endif -} - -// aes_nohw_swap_bits is a variation on a delta swap. It swaps the bits in -// |*a & (mask << shift)| with the bits in |*b & mask|. |mask| and -// |mask << shift| must not overlap. |mask| is specified as a |uint32_t|, but it -// is repeated to the full width of |aes_word_t|. -#if defined(OPENSSL_SSE2) -// This must be a macro because |_mm_srli_epi32| and |_mm_slli_epi32| require -// constant shift values. 
-#define aes_nohw_swap_bits(/*__m128i* */ a, /*__m128i* */ b, \ - /* uint32_t */ mask, /* const */ shift) \ - do { \ - __m128i swap = \ - _mm_and_si128(_mm_xor_si128(_mm_srli_epi32(*(a), (shift)), *(b)), \ - _mm_set_epi32((mask), (mask), (mask), (mask))); \ - *(a) = _mm_xor_si128(*(a), _mm_slli_epi32(swap, (shift))); \ - *(b) = _mm_xor_si128(*(b), swap); \ - \ - } while (0) -#else -static inline void aes_nohw_swap_bits(aes_word_t *a, aes_word_t *b, - uint32_t mask, aes_word_t shift) { -#if defined(OPENSSL_64_BIT) - aes_word_t mask_w = (((uint64_t)mask) << 32) | mask; -#else - aes_word_t mask_w = mask; -#endif - // This is a variation on a delta swap. - aes_word_t swap = ((*a >> shift) ^ *b) & mask_w; - *a ^= swap << shift; - *b ^= swap; -} -#endif // OPENSSL_SSE2 - -// aes_nohw_transpose converts |batch| to and from bitsliced form. It divides -// the 8 × word_size bits into AES_NOHW_BATCH_SIZE × AES_NOHW_BATCH_SIZE squares -// and transposes each square. -static void aes_nohw_transpose(AES_NOHW_BATCH *batch) { - // Swap bits with index 0 and 1 mod 2 (0x55 = 0b01010101). - aes_nohw_swap_bits(&batch->w[0], &batch->w[1], 0x55555555, 1); - aes_nohw_swap_bits(&batch->w[2], &batch->w[3], 0x55555555, 1); - aes_nohw_swap_bits(&batch->w[4], &batch->w[5], 0x55555555, 1); - aes_nohw_swap_bits(&batch->w[6], &batch->w[7], 0x55555555, 1); - -#if AES_NOHW_BATCH_SIZE >= 4 - // Swap bits with index 0-1 and 2-3 mod 4 (0x33 = 0b00110011). - aes_nohw_swap_bits(&batch->w[0], &batch->w[2], 0x33333333, 2); - aes_nohw_swap_bits(&batch->w[1], &batch->w[3], 0x33333333, 2); - aes_nohw_swap_bits(&batch->w[4], &batch->w[6], 0x33333333, 2); - aes_nohw_swap_bits(&batch->w[5], &batch->w[7], 0x33333333, 2); -#endif - -#if AES_NOHW_BATCH_SIZE >= 8 - // Swap bits with index 0-3 and 4-7 mod 8 (0x0f = 0b00001111). 
- aes_nohw_swap_bits(&batch->w[0], &batch->w[4], 0x0f0f0f0f, 4); - aes_nohw_swap_bits(&batch->w[1], &batch->w[5], 0x0f0f0f0f, 4); - aes_nohw_swap_bits(&batch->w[2], &batch->w[6], 0x0f0f0f0f, 4); - aes_nohw_swap_bits(&batch->w[3], &batch->w[7], 0x0f0f0f0f, 4); -#endif -} - -// aes_nohw_to_batch initializes |out| with the |num_blocks| blocks from |in|. -// |num_blocks| must be at most |AES_NOHW_BATCH|. -static void aes_nohw_to_batch(AES_NOHW_BATCH *out, const uint8_t *in, - size_t num_blocks) { - // Don't leave unused blocks uninitialized. - memset(out, 0, sizeof(AES_NOHW_BATCH)); - assert(num_blocks <= AES_NOHW_BATCH_SIZE); - for (size_t i = 0; i < num_blocks; i++) { - aes_word_t block[AES_NOHW_BLOCK_WORDS]; - aes_nohw_compact_block(block, in + 16 * i); - aes_nohw_batch_set(out, block, i); - } - - aes_nohw_transpose(out); -} - -// aes_nohw_to_batch writes the first |num_blocks| blocks in |batch| to |out|. -// |num_blocks| must be at most |AES_NOHW_BATCH|. -static void aes_nohw_from_batch(uint8_t *out, size_t num_blocks, - const AES_NOHW_BATCH *batch) { - AES_NOHW_BATCH copy = *batch; - aes_nohw_transpose(©); - - assert(num_blocks <= AES_NOHW_BATCH_SIZE); - for (size_t i = 0; i < num_blocks; i++) { - aes_word_t block[AES_NOHW_BLOCK_WORDS]; - aes_nohw_batch_get(©, block, i); - aes_nohw_uncompact_block(out + 16 * i, block); - } -} - - -// AES round steps. - -static void aes_nohw_add_round_key(AES_NOHW_BATCH *batch, - const AES_NOHW_BATCH *key) { - for (size_t i = 0; i < 8; i++) { - batch->w[i] = aes_nohw_xor(batch->w[i], key->w[i]); - } -} - -static void aes_nohw_sub_bytes(AES_NOHW_BATCH *batch) { - // See https://eprint.iacr.org/2009/191.pdf, Appendix C. - aes_word_t x0 = batch->w[7]; - aes_word_t x1 = batch->w[6]; - aes_word_t x2 = batch->w[5]; - aes_word_t x3 = batch->w[4]; - aes_word_t x4 = batch->w[3]; - aes_word_t x5 = batch->w[2]; - aes_word_t x6 = batch->w[1]; - aes_word_t x7 = batch->w[0]; - - // Figure 2, the top linear transformation. 
- aes_word_t y14 = aes_nohw_xor(x3, x5); - aes_word_t y13 = aes_nohw_xor(x0, x6); - aes_word_t y9 = aes_nohw_xor(x0, x3); - aes_word_t y8 = aes_nohw_xor(x0, x5); - aes_word_t t0 = aes_nohw_xor(x1, x2); - aes_word_t y1 = aes_nohw_xor(t0, x7); - aes_word_t y4 = aes_nohw_xor(y1, x3); - aes_word_t y12 = aes_nohw_xor(y13, y14); - aes_word_t y2 = aes_nohw_xor(y1, x0); - aes_word_t y5 = aes_nohw_xor(y1, x6); - aes_word_t y3 = aes_nohw_xor(y5, y8); - aes_word_t t1 = aes_nohw_xor(x4, y12); - aes_word_t y15 = aes_nohw_xor(t1, x5); - aes_word_t y20 = aes_nohw_xor(t1, x1); - aes_word_t y6 = aes_nohw_xor(y15, x7); - aes_word_t y10 = aes_nohw_xor(y15, t0); - aes_word_t y11 = aes_nohw_xor(y20, y9); - aes_word_t y7 = aes_nohw_xor(x7, y11); - aes_word_t y17 = aes_nohw_xor(y10, y11); - aes_word_t y19 = aes_nohw_xor(y10, y8); - aes_word_t y16 = aes_nohw_xor(t0, y11); - aes_word_t y21 = aes_nohw_xor(y13, y16); - aes_word_t y18 = aes_nohw_xor(x0, y16); - - // Figure 3, the middle non-linear section. - aes_word_t t2 = aes_nohw_and(y12, y15); - aes_word_t t3 = aes_nohw_and(y3, y6); - aes_word_t t4 = aes_nohw_xor(t3, t2); - aes_word_t t5 = aes_nohw_and(y4, x7); - aes_word_t t6 = aes_nohw_xor(t5, t2); - aes_word_t t7 = aes_nohw_and(y13, y16); - aes_word_t t8 = aes_nohw_and(y5, y1); - aes_word_t t9 = aes_nohw_xor(t8, t7); - aes_word_t t10 = aes_nohw_and(y2, y7); - aes_word_t t11 = aes_nohw_xor(t10, t7); - aes_word_t t12 = aes_nohw_and(y9, y11); - aes_word_t t13 = aes_nohw_and(y14, y17); - aes_word_t t14 = aes_nohw_xor(t13, t12); - aes_word_t t15 = aes_nohw_and(y8, y10); - aes_word_t t16 = aes_nohw_xor(t15, t12); - aes_word_t t17 = aes_nohw_xor(t4, t14); - aes_word_t t18 = aes_nohw_xor(t6, t16); - aes_word_t t19 = aes_nohw_xor(t9, t14); - aes_word_t t20 = aes_nohw_xor(t11, t16); - aes_word_t t21 = aes_nohw_xor(t17, y20); - aes_word_t t22 = aes_nohw_xor(t18, y19); - aes_word_t t23 = aes_nohw_xor(t19, y21); - aes_word_t t24 = aes_nohw_xor(t20, y18); - aes_word_t t25 = aes_nohw_xor(t21, t22); - 
aes_word_t t26 = aes_nohw_and(t21, t23); - aes_word_t t27 = aes_nohw_xor(t24, t26); - aes_word_t t28 = aes_nohw_and(t25, t27); - aes_word_t t29 = aes_nohw_xor(t28, t22); - aes_word_t t30 = aes_nohw_xor(t23, t24); - aes_word_t t31 = aes_nohw_xor(t22, t26); - aes_word_t t32 = aes_nohw_and(t31, t30); - aes_word_t t33 = aes_nohw_xor(t32, t24); - aes_word_t t34 = aes_nohw_xor(t23, t33); - aes_word_t t35 = aes_nohw_xor(t27, t33); - aes_word_t t36 = aes_nohw_and(t24, t35); - aes_word_t t37 = aes_nohw_xor(t36, t34); - aes_word_t t38 = aes_nohw_xor(t27, t36); - aes_word_t t39 = aes_nohw_and(t29, t38); - aes_word_t t40 = aes_nohw_xor(t25, t39); - aes_word_t t41 = aes_nohw_xor(t40, t37); - aes_word_t t42 = aes_nohw_xor(t29, t33); - aes_word_t t43 = aes_nohw_xor(t29, t40); - aes_word_t t44 = aes_nohw_xor(t33, t37); - aes_word_t t45 = aes_nohw_xor(t42, t41); - aes_word_t z0 = aes_nohw_and(t44, y15); - aes_word_t z1 = aes_nohw_and(t37, y6); - aes_word_t z2 = aes_nohw_and(t33, x7); - aes_word_t z3 = aes_nohw_and(t43, y16); - aes_word_t z4 = aes_nohw_and(t40, y1); - aes_word_t z5 = aes_nohw_and(t29, y7); - aes_word_t z6 = aes_nohw_and(t42, y11); - aes_word_t z7 = aes_nohw_and(t45, y17); - aes_word_t z8 = aes_nohw_and(t41, y10); - aes_word_t z9 = aes_nohw_and(t44, y12); - aes_word_t z10 = aes_nohw_and(t37, y3); - aes_word_t z11 = aes_nohw_and(t33, y4); - aes_word_t z12 = aes_nohw_and(t43, y13); - aes_word_t z13 = aes_nohw_and(t40, y5); - aes_word_t z14 = aes_nohw_and(t29, y2); - aes_word_t z15 = aes_nohw_and(t42, y9); - aes_word_t z16 = aes_nohw_and(t45, y14); - aes_word_t z17 = aes_nohw_and(t41, y8); - - // Figure 4, bottom linear transformation. 
- aes_word_t t46 = aes_nohw_xor(z15, z16); - aes_word_t t47 = aes_nohw_xor(z10, z11); - aes_word_t t48 = aes_nohw_xor(z5, z13); - aes_word_t t49 = aes_nohw_xor(z9, z10); - aes_word_t t50 = aes_nohw_xor(z2, z12); - aes_word_t t51 = aes_nohw_xor(z2, z5); - aes_word_t t52 = aes_nohw_xor(z7, z8); - aes_word_t t53 = aes_nohw_xor(z0, z3); - aes_word_t t54 = aes_nohw_xor(z6, z7); - aes_word_t t55 = aes_nohw_xor(z16, z17); - aes_word_t t56 = aes_nohw_xor(z12, t48); - aes_word_t t57 = aes_nohw_xor(t50, t53); - aes_word_t t58 = aes_nohw_xor(z4, t46); - aes_word_t t59 = aes_nohw_xor(z3, t54); - aes_word_t t60 = aes_nohw_xor(t46, t57); - aes_word_t t61 = aes_nohw_xor(z14, t57); - aes_word_t t62 = aes_nohw_xor(t52, t58); - aes_word_t t63 = aes_nohw_xor(t49, t58); - aes_word_t t64 = aes_nohw_xor(z4, t59); - aes_word_t t65 = aes_nohw_xor(t61, t62); - aes_word_t t66 = aes_nohw_xor(z1, t63); - aes_word_t s0 = aes_nohw_xor(t59, t63); - aes_word_t s6 = aes_nohw_xor(t56, aes_nohw_not(t62)); - aes_word_t s7 = aes_nohw_xor(t48, aes_nohw_not(t60)); - aes_word_t t67 = aes_nohw_xor(t64, t65); - aes_word_t s3 = aes_nohw_xor(t53, t66); - aes_word_t s4 = aes_nohw_xor(t51, t66); - aes_word_t s5 = aes_nohw_xor(t47, t65); - aes_word_t s1 = aes_nohw_xor(t64, aes_nohw_not(s3)); - aes_word_t s2 = aes_nohw_xor(t55, aes_nohw_not(t67)); - - batch->w[0] = s7; - batch->w[1] = s6; - batch->w[2] = s5; - batch->w[3] = s4; - batch->w[4] = s3; - batch->w[5] = s2; - batch->w[6] = s1; - batch->w[7] = s0; -} - -// aes_nohw_sub_bytes_inv_affine inverts the affine transform portion of the AES -// S-box, defined in FIPS PUB 197, section 5.1.1, step 2. -static void aes_nohw_sub_bytes_inv_affine(AES_NOHW_BATCH *batch) { - aes_word_t a0 = batch->w[0]; - aes_word_t a1 = batch->w[1]; - aes_word_t a2 = batch->w[2]; - aes_word_t a3 = batch->w[3]; - aes_word_t a4 = batch->w[4]; - aes_word_t a5 = batch->w[5]; - aes_word_t a6 = batch->w[6]; - aes_word_t a7 = batch->w[7]; - - // Apply the circulant [0 0 1 0 0 1 0 1]. 
This is the inverse of the circulant - // [1 0 0 0 1 1 1 1]. - aes_word_t b0 = aes_nohw_xor(a2, aes_nohw_xor(a5, a7)); - aes_word_t b1 = aes_nohw_xor(a3, aes_nohw_xor(a6, a0)); - aes_word_t b2 = aes_nohw_xor(a4, aes_nohw_xor(a7, a1)); - aes_word_t b3 = aes_nohw_xor(a5, aes_nohw_xor(a0, a2)); - aes_word_t b4 = aes_nohw_xor(a6, aes_nohw_xor(a1, a3)); - aes_word_t b5 = aes_nohw_xor(a7, aes_nohw_xor(a2, a4)); - aes_word_t b6 = aes_nohw_xor(a0, aes_nohw_xor(a3, a5)); - aes_word_t b7 = aes_nohw_xor(a1, aes_nohw_xor(a4, a6)); - - // XOR 0x05. Equivalently, we could XOR 0x63 before applying the circulant, - // but 0x05 has lower Hamming weight. (0x05 is the circulant applied to 0x63.) - batch->w[0] = aes_nohw_not(b0); - batch->w[1] = b1; - batch->w[2] = aes_nohw_not(b2); - batch->w[3] = b3; - batch->w[4] = b4; - batch->w[5] = b5; - batch->w[6] = b6; - batch->w[7] = b7; -} - -static void aes_nohw_inv_sub_bytes(AES_NOHW_BATCH *batch) { - // We implement the inverse S-box using the forwards implementation with the - // technique described in https://www.bearssl.org/constanttime.html#aes. - // - // The forwards S-box inverts its input and applies an affine transformation: - // S(x) = A(Inv(x)). Thus Inv(x) = InvA(S(x)). The inverse S-box is then: - // - // InvS(x) = Inv(InvA(x)). - // = InvA(S(InvA(x))) - aes_nohw_sub_bytes_inv_affine(batch); - aes_nohw_sub_bytes(batch); - aes_nohw_sub_bytes_inv_affine(batch); -} - -// aes_nohw_rotate_cols_right returns |v| with the columns in each row rotated -// to the right by |n|. This is a macro because |aes_nohw_shift_*| require -// constant shift counts in the SSE2 implementation. 
-#define aes_nohw_rotate_cols_right(/* aes_word_t */ v, /* const */ n) \ - (aes_nohw_or(aes_nohw_shift_right((v), (n)*4), \ - aes_nohw_shift_left((v), 16 - (n)*4))) - -static void aes_nohw_shift_rows(AES_NOHW_BATCH *batch) { - for (size_t i = 0; i < 8; i++) { - aes_word_t row0 = aes_nohw_and(batch->w[i], AES_NOHW_ROW0_MASK); - aes_word_t row1 = aes_nohw_and(batch->w[i], AES_NOHW_ROW1_MASK); - aes_word_t row2 = aes_nohw_and(batch->w[i], AES_NOHW_ROW2_MASK); - aes_word_t row3 = aes_nohw_and(batch->w[i], AES_NOHW_ROW3_MASK); - row1 = aes_nohw_rotate_cols_right(row1, 1); - row2 = aes_nohw_rotate_cols_right(row2, 2); - row3 = aes_nohw_rotate_cols_right(row3, 3); - batch->w[i] = aes_nohw_or(aes_nohw_or(row0, row1), aes_nohw_or(row2, row3)); - } -} - -static void aes_nohw_inv_shift_rows(AES_NOHW_BATCH *batch) { - for (size_t i = 0; i < 8; i++) { - aes_word_t row0 = aes_nohw_and(batch->w[i], AES_NOHW_ROW0_MASK); - aes_word_t row1 = aes_nohw_and(batch->w[i], AES_NOHW_ROW1_MASK); - aes_word_t row2 = aes_nohw_and(batch->w[i], AES_NOHW_ROW2_MASK); - aes_word_t row3 = aes_nohw_and(batch->w[i], AES_NOHW_ROW3_MASK); - row1 = aes_nohw_rotate_cols_right(row1, 3); - row2 = aes_nohw_rotate_cols_right(row2, 2); - row3 = aes_nohw_rotate_cols_right(row3, 1); - batch->w[i] = aes_nohw_or(aes_nohw_or(row0, row1), aes_nohw_or(row2, row3)); - } -} - -// aes_nohw_rotate_rows_down returns |v| with the rows in each column rotated -// down by one. -static inline aes_word_t aes_nohw_rotate_rows_down(aes_word_t v) { -#if defined(OPENSSL_SSE2) - return _mm_or_si128(_mm_srli_epi32(v, 8), _mm_slli_epi32(v, 24)); -#elif defined(OPENSSL_64_BIT) - return ((v >> 4) & UINT64_C(0x0fff0fff0fff0fff)) | - ((v << 12) & UINT64_C(0xf000f000f000f000)); -#else - return ((v >> 2) & 0x3f3f3f3f) | ((v << 6) & 0xc0c0c0c0); -#endif -} - -// aes_nohw_rotate_rows_twice returns |v| with the rows in each column rotated -// by two. 
-static inline aes_word_t aes_nohw_rotate_rows_twice(aes_word_t v) { -#if defined(OPENSSL_SSE2) - return _mm_or_si128(_mm_srli_epi32(v, 16), _mm_slli_epi32(v, 16)); -#elif defined(OPENSSL_64_BIT) - return ((v >> 8) & UINT64_C(0x00ff00ff00ff00ff)) | - ((v << 8) & UINT64_C(0xff00ff00ff00ff00)); -#else - return ((v >> 4) & 0x0f0f0f0f) | ((v << 4) & 0xf0f0f0f0); -#endif -} - -static void aes_nohw_mix_columns(AES_NOHW_BATCH *batch) { - // See https://eprint.iacr.org/2009/129.pdf, section 4.4 and appendix A. - aes_word_t a0 = batch->w[0]; - aes_word_t a1 = batch->w[1]; - aes_word_t a2 = batch->w[2]; - aes_word_t a3 = batch->w[3]; - aes_word_t a4 = batch->w[4]; - aes_word_t a5 = batch->w[5]; - aes_word_t a6 = batch->w[6]; - aes_word_t a7 = batch->w[7]; - - aes_word_t r0 = aes_nohw_rotate_rows_down(a0); - aes_word_t a0_r0 = aes_nohw_xor(a0, r0); - aes_word_t r1 = aes_nohw_rotate_rows_down(a1); - aes_word_t a1_r1 = aes_nohw_xor(a1, r1); - aes_word_t r2 = aes_nohw_rotate_rows_down(a2); - aes_word_t a2_r2 = aes_nohw_xor(a2, r2); - aes_word_t r3 = aes_nohw_rotate_rows_down(a3); - aes_word_t a3_r3 = aes_nohw_xor(a3, r3); - aes_word_t r4 = aes_nohw_rotate_rows_down(a4); - aes_word_t a4_r4 = aes_nohw_xor(a4, r4); - aes_word_t r5 = aes_nohw_rotate_rows_down(a5); - aes_word_t a5_r5 = aes_nohw_xor(a5, r5); - aes_word_t r6 = aes_nohw_rotate_rows_down(a6); - aes_word_t a6_r6 = aes_nohw_xor(a6, r6); - aes_word_t r7 = aes_nohw_rotate_rows_down(a7); - aes_word_t a7_r7 = aes_nohw_xor(a7, r7); - - batch->w[0] = - aes_nohw_xor(aes_nohw_xor(a7_r7, r0), aes_nohw_rotate_rows_twice(a0_r0)); - batch->w[1] = - aes_nohw_xor(aes_nohw_xor(a0_r0, a7_r7), - aes_nohw_xor(r1, aes_nohw_rotate_rows_twice(a1_r1))); - batch->w[2] = - aes_nohw_xor(aes_nohw_xor(a1_r1, r2), aes_nohw_rotate_rows_twice(a2_r2)); - batch->w[3] = - aes_nohw_xor(aes_nohw_xor(a2_r2, a7_r7), - aes_nohw_xor(r3, aes_nohw_rotate_rows_twice(a3_r3))); - batch->w[4] = - aes_nohw_xor(aes_nohw_xor(a3_r3, a7_r7), - aes_nohw_xor(r4, 
aes_nohw_rotate_rows_twice(a4_r4))); - batch->w[5] = - aes_nohw_xor(aes_nohw_xor(a4_r4, r5), aes_nohw_rotate_rows_twice(a5_r5)); - batch->w[6] = - aes_nohw_xor(aes_nohw_xor(a5_r5, r6), aes_nohw_rotate_rows_twice(a6_r6)); - batch->w[7] = - aes_nohw_xor(aes_nohw_xor(a6_r6, r7), aes_nohw_rotate_rows_twice(a7_r7)); -} - -static void aes_nohw_inv_mix_columns(AES_NOHW_BATCH *batch) { - aes_word_t a0 = batch->w[0]; - aes_word_t a1 = batch->w[1]; - aes_word_t a2 = batch->w[2]; - aes_word_t a3 = batch->w[3]; - aes_word_t a4 = batch->w[4]; - aes_word_t a5 = batch->w[5]; - aes_word_t a6 = batch->w[6]; - aes_word_t a7 = batch->w[7]; - - // bsaes-x86_64.pl describes the following decomposition of the inverse - // MixColumns matrix, credited to Jussi Kivilinna. This gives a much simpler - // multiplication. - // - // | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 | - // | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 | - // | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 | - // | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 | - // - // First, apply the [5 0 4 0] matrix. Multiplying by 4 in F_(2^8) is described - // by the following bit equations: - // - // b0 = a6 - // b1 = a6 ^ a7 - // b2 = a0 ^ a7 - // b3 = a1 ^ a6 - // b4 = a2 ^ a6 ^ a7 - // b5 = a3 ^ a7 - // b6 = a4 - // b7 = a5 - // - // Each coefficient is given by: - // - // b_ij = 05·a_ij ⊕ 04·a_i(j+2) = 04·(a_ij ⊕ a_i(j+2)) ⊕ a_ij - // - // We combine the two equations below. Note a_i(j+2) is a row rotation. 
- aes_word_t a0_r0 = aes_nohw_xor(a0, aes_nohw_rotate_rows_twice(a0)); - aes_word_t a1_r1 = aes_nohw_xor(a1, aes_nohw_rotate_rows_twice(a1)); - aes_word_t a2_r2 = aes_nohw_xor(a2, aes_nohw_rotate_rows_twice(a2)); - aes_word_t a3_r3 = aes_nohw_xor(a3, aes_nohw_rotate_rows_twice(a3)); - aes_word_t a4_r4 = aes_nohw_xor(a4, aes_nohw_rotate_rows_twice(a4)); - aes_word_t a5_r5 = aes_nohw_xor(a5, aes_nohw_rotate_rows_twice(a5)); - aes_word_t a6_r6 = aes_nohw_xor(a6, aes_nohw_rotate_rows_twice(a6)); - aes_word_t a7_r7 = aes_nohw_xor(a7, aes_nohw_rotate_rows_twice(a7)); - - batch->w[0] = aes_nohw_xor(a0, a6_r6); - batch->w[1] = aes_nohw_xor(a1, aes_nohw_xor(a6_r6, a7_r7)); - batch->w[2] = aes_nohw_xor(a2, aes_nohw_xor(a0_r0, a7_r7)); - batch->w[3] = aes_nohw_xor(a3, aes_nohw_xor(a1_r1, a6_r6)); - batch->w[4] = - aes_nohw_xor(aes_nohw_xor(a4, a2_r2), aes_nohw_xor(a6_r6, a7_r7)); - batch->w[5] = aes_nohw_xor(a5, aes_nohw_xor(a3_r3, a7_r7)); - batch->w[6] = aes_nohw_xor(a6, a4_r4); - batch->w[7] = aes_nohw_xor(a7, a5_r5); - - // Apply the [02 03 01 01] matrix, which is just MixColumns. 
- aes_nohw_mix_columns(batch); -} - -static void aes_nohw_encrypt_batch(const AES_NOHW_SCHEDULE *key, - size_t num_rounds, AES_NOHW_BATCH *batch) { - aes_nohw_add_round_key(batch, &key->keys[0]); - for (size_t i = 1; i < num_rounds; i++) { - aes_nohw_sub_bytes(batch); - aes_nohw_shift_rows(batch); - aes_nohw_mix_columns(batch); - aes_nohw_add_round_key(batch, &key->keys[i]); - } - aes_nohw_sub_bytes(batch); - aes_nohw_shift_rows(batch); - aes_nohw_add_round_key(batch, &key->keys[num_rounds]); -} - -static void aes_nohw_decrypt_batch(const AES_NOHW_SCHEDULE *key, - size_t num_rounds, AES_NOHW_BATCH *batch) { - aes_nohw_add_round_key(batch, &key->keys[num_rounds]); - aes_nohw_inv_shift_rows(batch); - aes_nohw_inv_sub_bytes(batch); - for (size_t i = num_rounds - 1; i > 0; i--) { - aes_nohw_add_round_key(batch, &key->keys[i]); - aes_nohw_inv_mix_columns(batch); - aes_nohw_inv_shift_rows(batch); - aes_nohw_inv_sub_bytes(batch); - } - aes_nohw_add_round_key(batch, &key->keys[0]); -} - - -// Key schedule. - -static void aes_nohw_expand_round_keys(AES_NOHW_SCHEDULE *out, - const AES_KEY *key) { - for (size_t i = 0; i <= key->rounds; i++) { - // Copy the round key into each block in the batch. - for (size_t j = 0; j < AES_NOHW_BATCH_SIZE; j++) { - aes_word_t tmp[AES_NOHW_BLOCK_WORDS]; - memcpy(tmp, key->rd_key + 4 * i, 16); - aes_nohw_batch_set(&out->keys[i], tmp, j); - } - aes_nohw_transpose(&out->keys[i]); - } -} - -static const uint8_t aes_nohw_rcon[10] = {0x01, 0x02, 0x04, 0x08, 0x10, - 0x20, 0x40, 0x80, 0x1b, 0x36}; - -// aes_nohw_rcon_slice returns the |i|th group of |AES_NOHW_BATCH_SIZE| bits in -// |rcon|, stored in a |aes_word_t|. 
-static inline aes_word_t aes_nohw_rcon_slice(uint8_t rcon, size_t i) { - rcon = (rcon >> (i * AES_NOHW_BATCH_SIZE)) & ((1 << AES_NOHW_BATCH_SIZE) - 1); -#if defined(OPENSSL_SSE2) - return _mm_set_epi32(0, 0, 0, rcon); -#else - return ((aes_word_t)rcon); -#endif -} - -static void aes_nohw_sub_block(aes_word_t out[AES_NOHW_BLOCK_WORDS], - const aes_word_t in[AES_NOHW_BLOCK_WORDS]) { - AES_NOHW_BATCH batch; - memset(&batch, 0, sizeof(batch)); - aes_nohw_batch_set(&batch, in, 0); - aes_nohw_transpose(&batch); - aes_nohw_sub_bytes(&batch); - aes_nohw_transpose(&batch); - aes_nohw_batch_get(&batch, out, 0); -} - -static void aes_nohw_setup_key_128(AES_KEY *key, const uint8_t in[16]) { - key->rounds = 10; - - aes_word_t block[AES_NOHW_BLOCK_WORDS]; - aes_nohw_compact_block(block, in); - memcpy(key->rd_key, block, 16); - - for (size_t i = 1; i <= 10; i++) { - aes_word_t sub[AES_NOHW_BLOCK_WORDS]; - aes_nohw_sub_block(sub, block); - uint8_t rcon = aes_nohw_rcon[i - 1]; - for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) { - // Incorporate |rcon| and the transformed word into the first word. - block[j] = aes_nohw_xor(block[j], aes_nohw_rcon_slice(rcon, j)); - block[j] = aes_nohw_xor( - block[j], - aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12)); - // Propagate to the remaining words. Note this is reordered from the usual - // formulation to avoid needing masks. 
- aes_word_t v = block[j]; - block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 4)); - block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 8)); - block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 12)); - } - memcpy(key->rd_key + 4 * i, block, 16); - } -} - -static void aes_nohw_setup_key_192(AES_KEY *key, const uint8_t in[24]) { - key->rounds = 12; - - aes_word_t storage1[AES_NOHW_BLOCK_WORDS], storage2[AES_NOHW_BLOCK_WORDS]; - aes_word_t *block1 = storage1, *block2 = storage2; - - // AES-192's key schedule is complex because each key schedule iteration - // produces six words, but we compute on blocks and each block is four words. - // We maintain a sliding window of two blocks, filled to 1.5 blocks at a time. - // We loop below every three blocks or two key schedule iterations. - // - // On entry to the loop, |block1| and the first half of |block2| contain the - // previous key schedule iteration. |block1| has been written to |key|, but - // |block2| has not as it is incomplete. - aes_nohw_compact_block(block1, in); - memcpy(key->rd_key, block1, 16); - - uint8_t half_block[16] = {0}; - memcpy(half_block, in + 16, 8); - aes_nohw_compact_block(block2, half_block); - - for (size_t i = 0; i < 4; i++) { - aes_word_t sub[AES_NOHW_BLOCK_WORDS]; - aes_nohw_sub_block(sub, block2); - uint8_t rcon = aes_nohw_rcon[2 * i]; - for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) { - // Compute the first two words of the next key schedule iteration, which - // go in the second half of |block2|. The first two words of the previous - // iteration are in the first half of |block1|. Apply |rcon| here too - // because the shifts match. - block2[j] = aes_nohw_or( - block2[j], - aes_nohw_shift_left( - aes_nohw_xor(block1[j], aes_nohw_rcon_slice(rcon, j)), 8)); - // Incorporate the transformed word and propagate. Note the last word of - // the previous iteration corresponds to the second word of |copy|. 
This - // is incorporated into the first word of the next iteration, or the third - // word of |block2|. - block2[j] = aes_nohw_xor( - block2[j], aes_nohw_and(aes_nohw_shift_left( - aes_nohw_rotate_rows_down(sub[j]), 4), - AES_NOHW_COL2_MASK)); - block2[j] = aes_nohw_xor( - block2[j], - aes_nohw_and(aes_nohw_shift_left(block2[j], 4), AES_NOHW_COL3_MASK)); - - // Compute the remaining four words, which fill |block1|. Begin by moving - // the corresponding words of the previous iteration: the second half of - // |block1| and the first half of |block2|. - block1[j] = aes_nohw_shift_right(block1[j], 8); - block1[j] = aes_nohw_or(block1[j], aes_nohw_shift_left(block2[j], 8)); - // Incorporate the second word, computed previously in |block2|, and - // propagate. - block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_right(block2[j], 12)); - aes_word_t v = block1[j]; - block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 4)); - block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 8)); - block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 12)); - } - - // This completes two round keys. Note half of |block2| was computed in the - // previous loop iteration but was not yet output. - memcpy(key->rd_key + 4 * (3 * i + 1), block2, 16); - memcpy(key->rd_key + 4 * (3 * i + 2), block1, 16); - - aes_nohw_sub_block(sub, block1); - rcon = aes_nohw_rcon[2 * i + 1]; - for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) { - // Compute the first four words of the next key schedule iteration in - // |block2|. Begin by moving the corresponding words of the previous - // iteration: the second half of |block2| and the first half of |block1|. - block2[j] = aes_nohw_shift_right(block2[j], 8); - block2[j] = aes_nohw_or(block2[j], aes_nohw_shift_left(block1[j], 8)); - // Incorporate rcon and the transformed word. Note the last word of the - // previous iteration corresponds to the last word of |copy|. 
- block2[j] = aes_nohw_xor(block2[j], aes_nohw_rcon_slice(rcon, j)); - block2[j] = aes_nohw_xor( - block2[j], - aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12)); - // Propagate to the remaining words. - aes_word_t v = block2[j]; - block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 4)); - block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 8)); - block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 12)); - - // Compute the last two words, which go in the first half of |block1|. The - // last two words of the previous iteration are in the second half of - // |block1|. - block1[j] = aes_nohw_shift_right(block1[j], 8); - // Propagate blocks and mask off the excess. - block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_right(block2[j], 12)); - block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(block1[j], 4)); - block1[j] = aes_nohw_and(block1[j], AES_NOHW_COL01_MASK); - } - - // |block2| has a complete round key. |block1| will be completed in the next - // iteration. - memcpy(key->rd_key + 4 * (3 * i + 3), block2, 16); - - // Swap blocks to restore the invariant. - aes_word_t *tmp = block1; - block1 = block2; - block2 = tmp; - } -} - -static void aes_nohw_setup_key_256(AES_KEY *key, const uint8_t in[32]) { - key->rounds = 14; - - // Each key schedule iteration produces two round keys. - aes_word_t block1[AES_NOHW_BLOCK_WORDS], block2[AES_NOHW_BLOCK_WORDS]; - aes_nohw_compact_block(block1, in); - memcpy(key->rd_key, block1, 16); - - aes_nohw_compact_block(block2, in + 16); - memcpy(key->rd_key + 4, block2, 16); - - for (size_t i = 2; i <= 14; i += 2) { - aes_word_t sub[AES_NOHW_BLOCK_WORDS]; - aes_nohw_sub_block(sub, block2); - uint8_t rcon = aes_nohw_rcon[i / 2 - 1]; - for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) { - // Incorporate |rcon| and the transformed word into the first word. 
- block1[j] = aes_nohw_xor(block1[j], aes_nohw_rcon_slice(rcon, j)); - block1[j] = aes_nohw_xor( - block1[j], - aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12)); - // Propagate to the remaining words. - aes_word_t v = block1[j]; - block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 4)); - block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 8)); - block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 12)); - } - memcpy(key->rd_key + 4 * i, block1, 16); - - if (i == 14) { - break; - } - - aes_nohw_sub_block(sub, block1); - for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) { - // Incorporate the transformed word into the first word. - block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_right(sub[j], 12)); - // Propagate to the remaining words. - aes_word_t v = block2[j]; - block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 4)); - block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 8)); - block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 12)); - } - memcpy(key->rd_key + 4 * (i + 1), block2, 16); - } -} - - -// External API. 
- -int aes_nohw_set_encrypt_key(const uint8_t *key, unsigned bits, - AES_KEY *aeskey) { - switch (bits) { - case 128: - aes_nohw_setup_key_128(aeskey, key); - return 0; - case 192: - aes_nohw_setup_key_192(aeskey, key); - return 0; - case 256: - aes_nohw_setup_key_256(aeskey, key); - return 0; - } - return 1; -} - -int aes_nohw_set_decrypt_key(const uint8_t *key, unsigned bits, - AES_KEY *aeskey) { - return aes_nohw_set_encrypt_key(key, bits, aeskey); -} - -void aes_nohw_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) { - AES_NOHW_SCHEDULE sched; - aes_nohw_expand_round_keys(&sched, key); - AES_NOHW_BATCH batch; - aes_nohw_to_batch(&batch, in, /*num_blocks=*/1); - aes_nohw_encrypt_batch(&sched, key->rounds, &batch); - aes_nohw_from_batch(out, /*num_blocks=*/1, &batch); -} - -void aes_nohw_decrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) { - AES_NOHW_SCHEDULE sched; - aes_nohw_expand_round_keys(&sched, key); - AES_NOHW_BATCH batch; - aes_nohw_to_batch(&batch, in, /*num_blocks=*/1); - aes_nohw_decrypt_batch(&sched, key->rounds, &batch); - aes_nohw_from_batch(out, /*num_blocks=*/1, &batch); -} - -static inline void aes_nohw_xor_block(uint8_t out[16], const uint8_t a[16], - const uint8_t b[16]) { - for (size_t i = 0; i < 16; i += sizeof(aes_word_t)) { - aes_word_t x, y; - memcpy(&x, a + i, sizeof(aes_word_t)); - memcpy(&y, b + i, sizeof(aes_word_t)); - x = aes_nohw_xor(x, y); - memcpy(out + i, &x, sizeof(aes_word_t)); - } -} - -void aes_nohw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, - size_t blocks, const AES_KEY *key, - const uint8_t ivec[16]) { - if (blocks == 0) { - return; - } - - AES_NOHW_SCHEDULE sched; - aes_nohw_expand_round_keys(&sched, key); - - // Make |AES_NOHW_BATCH_SIZE| copies of |ivec|. 
- alignas(AES_NOHW_WORD_SIZE) uint8_t ivs[AES_NOHW_BATCH_SIZE * 16]; - alignas(AES_NOHW_WORD_SIZE) uint8_t enc_ivs[AES_NOHW_BATCH_SIZE * 16]; - for (size_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) { - memcpy(ivs + 16 * i, ivec, 16); - } - - uint32_t ctr = CRYPTO_load_u32_be(ivs + 12); - for (;;) { - // Update counters. - for (size_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) { - CRYPTO_store_u32_be(ivs + 16 * i + 12, ctr + i); - } - - size_t todo = blocks >= AES_NOHW_BATCH_SIZE ? AES_NOHW_BATCH_SIZE : blocks; - AES_NOHW_BATCH batch; - aes_nohw_to_batch(&batch, ivs, todo); - aes_nohw_encrypt_batch(&sched, key->rounds, &batch); - aes_nohw_from_batch(enc_ivs, todo, &batch); - - for (size_t i = 0; i < todo; i++) { - aes_nohw_xor_block(out + 16 * i, in + 16 * i, enc_ivs + 16 * i); - } - - blocks -= todo; - if (blocks == 0) { - break; - } - - in += 16 * AES_NOHW_BATCH_SIZE; - out += 16 * AES_NOHW_BATCH_SIZE; - ctr += AES_NOHW_BATCH_SIZE; - } -} - -void aes_nohw_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t len, - const AES_KEY *key, uint8_t *ivec, const int enc) { - assert(len % 16 == 0); - size_t blocks = len / 16; - if (blocks == 0) { - return; - } - - AES_NOHW_SCHEDULE sched; - aes_nohw_expand_round_keys(&sched, key); - alignas(AES_NOHW_WORD_SIZE) uint8_t iv[16]; - memcpy(iv, ivec, 16); - - if (enc) { - // CBC encryption is not parallelizable. - while (blocks > 0) { - aes_nohw_xor_block(iv, iv, in); - - AES_NOHW_BATCH batch; - aes_nohw_to_batch(&batch, iv, /*num_blocks=*/1); - aes_nohw_encrypt_batch(&sched, key->rounds, &batch); - aes_nohw_from_batch(out, /*num_blocks=*/1, &batch); - - memcpy(iv, out, 16); - - in += 16; - out += 16; - blocks--; - } - memcpy(ivec, iv, 16); - return; - } - - for (;;) { - size_t todo = blocks >= AES_NOHW_BATCH_SIZE ? AES_NOHW_BATCH_SIZE : blocks; - // Make a copy of the input so we can decrypt in-place. 
- alignas(AES_NOHW_WORD_SIZE) uint8_t copy[AES_NOHW_BATCH_SIZE * 16]; - memcpy(copy, in, todo * 16); - - AES_NOHW_BATCH batch; - aes_nohw_to_batch(&batch, in, todo); - aes_nohw_decrypt_batch(&sched, key->rounds, &batch); - aes_nohw_from_batch(out, todo, &batch); - - aes_nohw_xor_block(out, out, iv); - for (size_t i = 1; i < todo; i++) { - aes_nohw_xor_block(out + 16 * i, out + 16 * i, copy + 16 * (i - 1)); - } - - // Save the last block as the IV. - memcpy(iv, copy + 16 * (todo - 1), 16); - - blocks -= todo; - if (blocks == 0) { - break; - } - - in += 16 * AES_NOHW_BATCH_SIZE; - out += 16 * AES_NOHW_BATCH_SIZE; - } - - memcpy(ivec, iv, 16); -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/aes/aes_nohw.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/aes/aes_nohw.cc.inc new file mode 100644 index 00000000..20f6a6ba --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/aes/aes_nohw.cc.inc @@ -0,0 +1,1279 @@ +// Copyright 2019 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + +#include "../../internal.h" +#include "internal.h" + +#if defined(OPENSSL_SSE2) +#include +#endif + + +using namespace bssl; + +// This file contains a constant-time implementation of AES, bitsliced with +// 32-bit, 64-bit, or 128-bit words, operating on two-, four-, and eight-block +// batches, respectively. The 128-bit implementation requires SSE2 intrinsics. 
+// +// This implementation is based on the algorithms described in the following +// references: +// - https://bearssl.org/constanttime.html#aes +// - https://eprint.iacr.org/2009/129.pdf +// - https://eprint.iacr.org/2009/191.pdf + + +// Word operations. +// +// An aes_word_t is the word used for this AES implementation. Throughout this +// file, bits and bytes are ordered little-endian, though "left" and "right" +// shifts match the operations themselves, which makes them reversed in a +// little-endian, left-to-right reading. +// +// Eight |aes_word_t|s contain |AES_NOHW_BATCH_SIZE| blocks. The bits in an +// |aes_word_t| are divided into 16 consecutive groups of |AES_NOHW_BATCH_SIZE| +// bits each, each corresponding to a byte in an AES block in column-major +// order (AES's byte order). We refer to these as "logical bytes". Note, in the +// 32-bit and 64-bit implementations, they are smaller than a byte. (The +// contents of a logical byte will be described later.) +// +// MSVC does not support C bit operators on |__m128i|, so the wrapper functions +// |aes_nohw_and|, etc., should be used instead. Note |aes_nohw_shift_left| and +// |aes_nohw_shift_right| measure the shift in logical bytes. That is, the shift +// value ranges from 0 to 15 independent of |aes_word_t| and +// |AES_NOHW_BATCH_SIZE|. +// +// This ordering is different from https://eprint.iacr.org/2009/129.pdf, which +// uses row-major order. Matching the AES order was easier to reason about, and +// we do not have PSHUFB available to arbitrarily permute bytes. + +#if defined(OPENSSL_SSE2) +typedef __m128i aes_word_t; +// AES_NOHW_WORD_SIZE is sizeof(aes_word_t). alignas(sizeof(T)) does not work in +// MSVC, so we define a constant. 
+#define AES_NOHW_WORD_SIZE 16 +#define AES_NOHW_BATCH_SIZE 8 +#define AES_NOHW_ROW0_MASK \ + _mm_set_epi32(0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff) +#define AES_NOHW_ROW1_MASK \ + _mm_set_epi32(0x0000ff00, 0x0000ff00, 0x0000ff00, 0x0000ff00) +#define AES_NOHW_ROW2_MASK \ + _mm_set_epi32(0x00ff0000, 0x00ff0000, 0x00ff0000, 0x00ff0000) +#define AES_NOHW_ROW3_MASK \ + _mm_set_epi32(0xff000000, 0xff000000, 0xff000000, 0xff000000) +#define AES_NOHW_COL01_MASK \ + _mm_set_epi32(0x00000000, 0x00000000, 0xffffffff, 0xffffffff) +#define AES_NOHW_COL2_MASK \ + _mm_set_epi32(0x00000000, 0xffffffff, 0x00000000, 0x00000000) +#define AES_NOHW_COL3_MASK \ + _mm_set_epi32(0xffffffff, 0x00000000, 0x00000000, 0x00000000) + +static aes_word_t aes_nohw_and(aes_word_t a, aes_word_t b) { + return _mm_and_si128(a, b); +} + +static aes_word_t aes_nohw_or(aes_word_t a, aes_word_t b) { + return _mm_or_si128(a, b); +} + +static aes_word_t aes_nohw_xor(aes_word_t a, aes_word_t b) { + return _mm_xor_si128(a, b); +} + +static aes_word_t aes_nohw_not(aes_word_t a) { + return _mm_xor_si128( + a, _mm_set_epi32(0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff)); +} + +// These are macros because parameters to |_mm_slli_si128| and |_mm_srli_si128| +// must be constants. 
+#define aes_nohw_shift_left(/* aes_word_t */ a, /* const */ i) \ + _mm_slli_si128((a), (i)) +#define aes_nohw_shift_right(/* aes_word_t */ a, /* const */ i) \ + _mm_srli_si128((a), (i)) +#else // !OPENSSL_SSE2 +#if defined(OPENSSL_64_BIT) +typedef uint64_t aes_word_t; +#define AES_NOHW_WORD_SIZE 8 +#define AES_NOHW_BATCH_SIZE 4 +#define AES_NOHW_ROW0_MASK UINT64_C(0x000f000f000f000f) +#define AES_NOHW_ROW1_MASK UINT64_C(0x00f000f000f000f0) +#define AES_NOHW_ROW2_MASK UINT64_C(0x0f000f000f000f00) +#define AES_NOHW_ROW3_MASK UINT64_C(0xf000f000f000f000) +#define AES_NOHW_COL01_MASK UINT64_C(0x00000000ffffffff) +#define AES_NOHW_COL2_MASK UINT64_C(0x0000ffff00000000) +#define AES_NOHW_COL3_MASK UINT64_C(0xffff000000000000) +#else // !OPENSSL_64_BIT +typedef uint32_t aes_word_t; +#define AES_NOHW_WORD_SIZE 4 +#define AES_NOHW_BATCH_SIZE 2 +#define AES_NOHW_ROW0_MASK 0x03030303 +#define AES_NOHW_ROW1_MASK 0x0c0c0c0c +#define AES_NOHW_ROW2_MASK 0x30303030 +#define AES_NOHW_ROW3_MASK 0xc0c0c0c0 +#define AES_NOHW_COL01_MASK 0x0000ffff +#define AES_NOHW_COL2_MASK 0x00ff0000 +#define AES_NOHW_COL3_MASK 0xff000000 +#endif // OPENSSL_64_BIT + +static aes_word_t aes_nohw_and(aes_word_t a, aes_word_t b) { return a & b; } + +static aes_word_t aes_nohw_or(aes_word_t a, aes_word_t b) { return a | b; } + +static aes_word_t aes_nohw_xor(aes_word_t a, aes_word_t b) { return a ^ b; } + +static aes_word_t aes_nohw_not(aes_word_t a) { return ~a; } + +static aes_word_t aes_nohw_shift_left(aes_word_t a, aes_word_t i) { + return a << (i * AES_NOHW_BATCH_SIZE); +} + +static aes_word_t aes_nohw_shift_right(aes_word_t a, aes_word_t i) { + return a >> (i * AES_NOHW_BATCH_SIZE); +} +#endif // OPENSSL_SSE2 + +static_assert(AES_NOHW_BATCH_SIZE * 128 == 8 * 8 * sizeof(aes_word_t), + "batch size does not match word size"); +static_assert(AES_NOHW_WORD_SIZE == sizeof(aes_word_t), + "AES_NOHW_WORD_SIZE is incorrect"); + + +// Block representations. 
+// +// This implementation uses three representations for AES blocks. First, the +// public API represents blocks as uint8_t[16] in the usual way. Second, most +// AES steps are evaluated in bitsliced form, stored in an |AES_NOHW_BATCH|. +// This stores |AES_NOHW_BATCH_SIZE| blocks in bitsliced order. For 64-bit words +// containing bitsliced blocks a, b, c, d, this would be as follows (vertical +// bars divide logical bytes): +// +// batch.w[0] = a0 b0 c0 d0 | a8 b8 c8 d8 | a16 b16 c16 d16 ... +// batch.w[1] = a1 b1 c1 d1 | a9 b9 c9 d9 | a17 b17 c17 d17 ... +// batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ... +// batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ... +// ... +// +// Finally, an individual block may be stored as an intermediate form in an +// aes_word_t[AES_NOHW_BLOCK_WORDS]. In this form, we permute the bits in each +// block, so that block[0]'s ith logical byte contains least-significant +// |AES_NOHW_BATCH_SIZE| bits of byte i, block[1] contains the next group of +// |AES_NOHW_BATCH_SIZE| bits, and so on. We refer to this transformation as +// "compacting" the block. Note this is no-op with 128-bit words because then +// |AES_NOHW_BLOCK_WORDS| is one and |AES_NOHW_BATCH_SIZE| is eight. For 64-bit +// words, one block would be stored in two words: +// +// block[0] = a0 a1 a2 a3 | a8 a9 a10 a11 | a16 a17 a18 a19 ... +// block[1] = a4 a5 a6 a7 | a12 a13 a14 a15 | a20 a21 a22 a23 ... +// +// Observe that the distances between corresponding bits in bitsliced and +// compact bit orders match. If we line up corresponding words of each block, +// the bitsliced and compact representations may be converted by transposing +// bits in corresponding logical bytes. Continuing the 64-bit example: +// +// block_a[0] = a0 a1 a2 a3 | a8 a9 a10 a11 | a16 a17 a18 a19 ... +// block_b[0] = b0 b1 b2 b3 | b8 b9 b10 b11 | b16 b17 b18 b19 ... +// block_c[0] = c0 c1 c2 c3 | c8 c9 c10 c11 | c16 c17 c18 c19 ... 
+// block_d[0] = d0 d1 d2 d3 | d8 d9 d10 d11 | d16 d17 d18 d19 ... +// +// batch.w[0] = a0 b0 c0 d0 | a8 b8 c8 d8 | a16 b16 c16 d16 ... +// batch.w[1] = a1 b1 c1 d1 | a9 b9 c9 d9 | a17 b17 c17 d17 ... +// batch.w[2] = a2 b2 c2 d2 | a10 b10 c10 d10 | a18 b18 c18 d18 ... +// batch.w[3] = a3 b3 c3 d3 | a11 b11 c11 d11 | a19 b19 c19 d19 ... +// +// Note also that bitwise operations and (logical) byte permutations on an +// |aes_word_t| work equally for the bitsliced and compact words. +// +// We use the compact form in the |AES_KEY| representation to save work +// inflating round keys into |AES_NOHW_BATCH|. The compact form also exists +// temporarily while moving blocks in or out of an |AES_NOHW_BATCH|, immediately +// before or after |aes_nohw_transpose|. + +#define AES_NOHW_BLOCK_WORDS (16 / sizeof(aes_word_t)) + +// An AES_NOHW_BATCH stores |AES_NOHW_BATCH_SIZE| blocks. Unless otherwise +// specified, it is in bitsliced form. +typedef struct { + aes_word_t w[8]; +} AES_NOHW_BATCH; + +// An AES_NOHW_SCHEDULE is an expanded bitsliced AES key schedule. It is +// suitable for encryption or decryption. It is as large as |AES_NOHW_BATCH| +// |AES_KEY|s so it should not be used as a long-term key representation. +typedef struct { + // keys is an array of batches, one for each round key. Each batch stores + // |AES_NOHW_BATCH_SIZE| copies of the round key in bitsliced form. + AES_NOHW_BATCH keys[AES_MAXNR + 1]; +} AES_NOHW_SCHEDULE; + +// aes_nohw_batch_set sets the |i|th block of |batch| to |in|. |batch| is in +// compact form. +static void aes_nohw_batch_set(AES_NOHW_BATCH *batch, + const aes_word_t in[AES_NOHW_BLOCK_WORDS], + size_t i) { + // Note the words are interleaved. The order comes from |aes_nohw_transpose|. + // If |i| is zero and this is the 64-bit implementation, in[0] contains bits + // 0-3 and in[1] contains bits 4-7. We place in[0] at w[0] and in[1] at + // w[4] so that bits 0 and 4 are in the correct position. 
(In general, bits + // along diagonals of |AES_NOHW_BATCH_SIZE| by |AES_NOHW_BATCH_SIZE| squares + // will be correctly placed.) + assert(i < AES_NOHW_BATCH_SIZE); +#if defined(OPENSSL_SSE2) + batch->w[i] = in[0]; +#elif defined(OPENSSL_64_BIT) + batch->w[i] = in[0]; + batch->w[i + 4] = in[1]; +#else + batch->w[i] = in[0]; + batch->w[i + 2] = in[1]; + batch->w[i + 4] = in[2]; + batch->w[i + 6] = in[3]; +#endif +} + +// aes_nohw_batch_get writes the |i|th block of |batch| to |out|. |batch| is in +// compact form. +static void aes_nohw_batch_get(const AES_NOHW_BATCH *batch, + aes_word_t out[AES_NOHW_BLOCK_WORDS], size_t i) { + assert(i < AES_NOHW_BATCH_SIZE); +#if defined(OPENSSL_SSE2) + out[0] = batch->w[i]; +#elif defined(OPENSSL_64_BIT) + out[0] = batch->w[i]; + out[1] = batch->w[i + 4]; +#else + out[0] = batch->w[i]; + out[1] = batch->w[i + 2]; + out[2] = batch->w[i + 4]; + out[3] = batch->w[i + 6]; +#endif +} + +#if !defined(OPENSSL_SSE2) +// aes_nohw_delta_swap returns |a| with bits |a & mask| and +// |a & (mask << shift)| swapped. |mask| and |mask << shift| may not overlap. +static aes_word_t aes_nohw_delta_swap(aes_word_t a, aes_word_t mask, + aes_word_t shift) { + // See + // https://reflectionsonsecurity.wordpress.com/2014/05/11/efficient-bit-permutation-using-delta-swaps/ + aes_word_t b = (a ^ (a >> shift)) & mask; + return a ^ b ^ (b << shift); +} + +// In the 32-bit and 64-bit implementations, a block spans multiple words. +// |aes_nohw_compact_block| must permute bits across different words. First we +// implement |aes_nohw_compact_word| which performs a smaller version of the +// transformation which stays within a single word. +// +// These transformations are generalizations of the output of +// http://programming.sirrida.de/calcperm.php on smaller inputs. 
+#if defined(OPENSSL_64_BIT) +static uint64_t aes_nohw_compact_word(uint64_t a) { + // Numbering the 64/2 = 16 4-bit chunks, least to most significant, we swap + // quartets of those chunks: + // 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 => + // 0 2 1 3 | 4 6 5 7 | 8 10 9 11 | 12 14 13 15 + a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4); + // Swap quartets of 8-bit chunks (still numbering by 4-bit chunks): + // 0 2 1 3 | 4 6 5 7 | 8 10 9 11 | 12 14 13 15 => + // 0 2 4 6 | 1 3 5 7 | 8 10 12 14 | 9 11 13 15 + a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8); + // Swap quartets of 16-bit chunks (still numbering by 4-bit chunks): + // 0 2 4 6 | 1 3 5 7 | 8 10 12 14 | 9 11 13 15 => + // 0 2 4 6 | 8 10 12 14 | 1 3 5 7 | 9 11 13 15 + a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16); + return a; +} + +static uint64_t aes_nohw_uncompact_word(uint64_t a) { + // Reverse the steps of |aes_nohw_uncompact_word|. + a = aes_nohw_delta_swap(a, UINT64_C(0x00000000ffff0000), 16); + a = aes_nohw_delta_swap(a, UINT64_C(0x0000ff000000ff00), 8); + a = aes_nohw_delta_swap(a, UINT64_C(0x00f000f000f000f0), 4); + return a; +} +#else // !OPENSSL_64_BIT +static uint32_t aes_nohw_compact_word(uint32_t a) { + // Numbering the 32/2 = 16 pairs of bits, least to most significant, we swap: + // 0 1 2 3 | 4 5 6 7 | 8 9 10 11 | 12 13 14 15 => + // 0 4 2 6 | 1 5 3 7 | 8 12 10 14 | 9 13 11 15 + // Note: 0x00cc = 0b0000_0000_1100_1100 + // 0x00cc << 6 = 0b0011_0011_0000_0000 + a = aes_nohw_delta_swap(a, 0x00cc00cc, 6); + // Now we swap groups of four bits (still numbering by pairs): + // 0 4 2 6 | 1 5 3 7 | 8 12 10 14 | 9 13 11 15 => + // 0 4 8 12 | 1 5 9 13 | 2 6 10 14 | 3 7 11 15 + // Note: 0x0000_f0f0 << 12 = 0x0f0f_0000 + a = aes_nohw_delta_swap(a, 0x0000f0f0, 12); + return a; +} + +static uint32_t aes_nohw_uncompact_word(uint32_t a) { + // Reverse the steps of |aes_nohw_uncompact_word|. 
+ a = aes_nohw_delta_swap(a, 0x0000f0f0, 12); + a = aes_nohw_delta_swap(a, 0x00cc00cc, 6); + return a; +} + +static uint32_t aes_nohw_word_from_bytes(uint8_t a0, uint8_t a1, uint8_t a2, + uint8_t a3) { + return (uint32_t)a0 | ((uint32_t)a1 << 8) | ((uint32_t)a2 << 16) | + ((uint32_t)a3 << 24); +} +#endif // OPENSSL_64_BIT +#endif // !OPENSSL_SSE2 + +static void aes_nohw_compact_block(aes_word_t out[AES_NOHW_BLOCK_WORDS], + const uint8_t in[16]) { + memcpy(out, in, 16); +#if defined(OPENSSL_SSE2) + // No conversions needed. +#elif defined(OPENSSL_64_BIT) + uint64_t a0 = aes_nohw_compact_word(out[0]); + uint64_t a1 = aes_nohw_compact_word(out[1]); + out[0] = (a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32); + out[1] = (a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32); +#else + uint32_t a0 = aes_nohw_compact_word(out[0]); + uint32_t a1 = aes_nohw_compact_word(out[1]); + uint32_t a2 = aes_nohw_compact_word(out[2]); + uint32_t a3 = aes_nohw_compact_word(out[3]); + // Note clang, when building for ARM Thumb2, will sometimes miscompile + // expressions such as (a0 & 0x0000ff00) << 8, particularly when building + // without optimizations. This bug was introduced in + // https://reviews.llvm.org/rL340261 and fixed in + // https://reviews.llvm.org/rL351310. The following is written to avoid this. + out[0] = aes_nohw_word_from_bytes(a0, a1, a2, a3); + out[1] = aes_nohw_word_from_bytes(a0 >> 8, a1 >> 8, a2 >> 8, a3 >> 8); + out[2] = aes_nohw_word_from_bytes(a0 >> 16, a1 >> 16, a2 >> 16, a3 >> 16); + out[3] = aes_nohw_word_from_bytes(a0 >> 24, a1 >> 24, a2 >> 24, a3 >> 24); +#endif +} + +static void aes_nohw_uncompact_block( + uint8_t out[16], const aes_word_t in[AES_NOHW_BLOCK_WORDS]) { +#if defined(OPENSSL_SSE2) + memcpy(out, in, 16); // No conversions needed. 
+#elif defined(OPENSSL_64_BIT) + uint64_t a0 = in[0]; + uint64_t a1 = in[1]; + uint64_t b0 = + aes_nohw_uncompact_word((a0 & UINT64_C(0x00000000ffffffff)) | (a1 << 32)); + uint64_t b1 = + aes_nohw_uncompact_word((a1 & UINT64_C(0xffffffff00000000)) | (a0 >> 32)); + memcpy(out, &b0, 8); + memcpy(out + 8, &b1, 8); +#else + uint32_t a0 = in[0]; + uint32_t a1 = in[1]; + uint32_t a2 = in[2]; + uint32_t a3 = in[3]; + // Note clang, when building for ARM Thumb2, will sometimes miscompile + // expressions such as (a0 & 0x0000ff00) << 8, particularly when building + // without optimizations. This bug was introduced in + // https://reviews.llvm.org/rL340261 and fixed in + // https://reviews.llvm.org/rL351310. The following is written to avoid this. + uint32_t b0 = aes_nohw_word_from_bytes(a0, a1, a2, a3); + uint32_t b1 = aes_nohw_word_from_bytes(a0 >> 8, a1 >> 8, a2 >> 8, a3 >> 8); + uint32_t b2 = + aes_nohw_word_from_bytes(a0 >> 16, a1 >> 16, a2 >> 16, a3 >> 16); + uint32_t b3 = + aes_nohw_word_from_bytes(a0 >> 24, a1 >> 24, a2 >> 24, a3 >> 24); + b0 = aes_nohw_uncompact_word(b0); + b1 = aes_nohw_uncompact_word(b1); + b2 = aes_nohw_uncompact_word(b2); + b3 = aes_nohw_uncompact_word(b3); + memcpy(out, &b0, 4); + memcpy(out + 4, &b1, 4); + memcpy(out + 8, &b2, 4); + memcpy(out + 12, &b3, 4); +#endif +} + +// aes_nohw_swap_bits is a variation on a delta swap. It swaps the bits in +// |*a & (mask << shift)| with the bits in |*b & mask|. |mask| and +// |mask << shift| must not overlap. |mask| is specified as a |uint32_t|, but it +// is repeated to the full width of |aes_word_t|. +#if defined(OPENSSL_SSE2) +// This must be a macro because |_mm_srli_epi32| and |_mm_slli_epi32| require +// constant shift values. 
+#define aes_nohw_swap_bits(/*__m128i* */ a, /*__m128i* */ b, \ + /* uint32_t */ mask, /* const */ shift) \ + do { \ + __m128i swap = \ + _mm_and_si128(_mm_xor_si128(_mm_srli_epi32(*(a), (shift)), *(b)), \ + _mm_set_epi32((mask), (mask), (mask), (mask))); \ + *(a) = _mm_xor_si128(*(a), _mm_slli_epi32(swap, (shift))); \ + *(b) = _mm_xor_si128(*(b), swap); \ + \ + } while (0) +#else +static void aes_nohw_swap_bits(aes_word_t *a, aes_word_t *b, uint32_t mask, + aes_word_t shift) { +#if defined(OPENSSL_64_BIT) + aes_word_t mask_w = (((uint64_t)mask) << 32) | mask; +#else + aes_word_t mask_w = mask; +#endif + // This is a variation on a delta swap. + aes_word_t swap = ((*a >> shift) ^ *b) & mask_w; + *a ^= swap << shift; + *b ^= swap; +} +#endif // OPENSSL_SSE2 + +// aes_nohw_transpose converts |batch| to and from bitsliced form. It divides +// the 8 × word_size bits into AES_NOHW_BATCH_SIZE × AES_NOHW_BATCH_SIZE squares +// and transposes each square. +static void aes_nohw_transpose(AES_NOHW_BATCH *batch) { + // Swap bits with index 0 and 1 mod 2 (0x55 = 0b01010101). + aes_nohw_swap_bits(&batch->w[0], &batch->w[1], 0x55555555, 1); + aes_nohw_swap_bits(&batch->w[2], &batch->w[3], 0x55555555, 1); + aes_nohw_swap_bits(&batch->w[4], &batch->w[5], 0x55555555, 1); + aes_nohw_swap_bits(&batch->w[6], &batch->w[7], 0x55555555, 1); + +#if AES_NOHW_BATCH_SIZE >= 4 + // Swap bits with index 0-1 and 2-3 mod 4 (0x33 = 0b00110011). + aes_nohw_swap_bits(&batch->w[0], &batch->w[2], 0x33333333, 2); + aes_nohw_swap_bits(&batch->w[1], &batch->w[3], 0x33333333, 2); + aes_nohw_swap_bits(&batch->w[4], &batch->w[6], 0x33333333, 2); + aes_nohw_swap_bits(&batch->w[5], &batch->w[7], 0x33333333, 2); +#endif + +#if AES_NOHW_BATCH_SIZE >= 8 + // Swap bits with index 0-3 and 4-7 mod 8 (0x0f = 0b00001111). 
+ aes_nohw_swap_bits(&batch->w[0], &batch->w[4], 0x0f0f0f0f, 4); + aes_nohw_swap_bits(&batch->w[1], &batch->w[5], 0x0f0f0f0f, 4); + aes_nohw_swap_bits(&batch->w[2], &batch->w[6], 0x0f0f0f0f, 4); + aes_nohw_swap_bits(&batch->w[3], &batch->w[7], 0x0f0f0f0f, 4); +#endif +} + +// aes_nohw_to_batch initializes |out| with the |num_blocks| blocks from |in|. +// |num_blocks| must be at most |AES_NOHW_BATCH|. +static void aes_nohw_to_batch(AES_NOHW_BATCH *out, const uint8_t *in, + size_t num_blocks) { + // Don't leave unused blocks uninitialized. + memset(out, 0, sizeof(AES_NOHW_BATCH)); + assert(num_blocks <= AES_NOHW_BATCH_SIZE); + for (size_t i = 0; i < num_blocks; i++) { + aes_word_t block[AES_NOHW_BLOCK_WORDS]; + aes_nohw_compact_block(block, in + 16 * i); + aes_nohw_batch_set(out, block, i); + } + + aes_nohw_transpose(out); +} + +// aes_nohw_to_batch writes the first |num_blocks| blocks in |batch| to |out|. +// |num_blocks| must be at most |AES_NOHW_BATCH|. +static void aes_nohw_from_batch(uint8_t *out, size_t num_blocks, + const AES_NOHW_BATCH *batch) { + AES_NOHW_BATCH copy = *batch; + aes_nohw_transpose(©); + + assert(num_blocks <= AES_NOHW_BATCH_SIZE); + for (size_t i = 0; i < num_blocks; i++) { + aes_word_t block[AES_NOHW_BLOCK_WORDS]; + aes_nohw_batch_get(©, block, i); + aes_nohw_uncompact_block(out + 16 * i, block); + } +} + + +// AES round steps. + +static void aes_nohw_add_round_key(AES_NOHW_BATCH *batch, + const AES_NOHW_BATCH *key) { + for (size_t i = 0; i < 8; i++) { + batch->w[i] = aes_nohw_xor(batch->w[i], key->w[i]); + } +} + +static void aes_nohw_sub_bytes(AES_NOHW_BATCH *batch) { + // See https://eprint.iacr.org/2009/191.pdf, Appendix C. + aes_word_t x0 = batch->w[7]; + aes_word_t x1 = batch->w[6]; + aes_word_t x2 = batch->w[5]; + aes_word_t x3 = batch->w[4]; + aes_word_t x4 = batch->w[3]; + aes_word_t x5 = batch->w[2]; + aes_word_t x6 = batch->w[1]; + aes_word_t x7 = batch->w[0]; + + // Figure 2, the top linear transformation. 
+ aes_word_t y14 = aes_nohw_xor(x3, x5); + aes_word_t y13 = aes_nohw_xor(x0, x6); + aes_word_t y9 = aes_nohw_xor(x0, x3); + aes_word_t y8 = aes_nohw_xor(x0, x5); + aes_word_t t0 = aes_nohw_xor(x1, x2); + aes_word_t y1 = aes_nohw_xor(t0, x7); + aes_word_t y4 = aes_nohw_xor(y1, x3); + aes_word_t y12 = aes_nohw_xor(y13, y14); + aes_word_t y2 = aes_nohw_xor(y1, x0); + aes_word_t y5 = aes_nohw_xor(y1, x6); + aes_word_t y3 = aes_nohw_xor(y5, y8); + aes_word_t t1 = aes_nohw_xor(x4, y12); + aes_word_t y15 = aes_nohw_xor(t1, x5); + aes_word_t y20 = aes_nohw_xor(t1, x1); + aes_word_t y6 = aes_nohw_xor(y15, x7); + aes_word_t y10 = aes_nohw_xor(y15, t0); + aes_word_t y11 = aes_nohw_xor(y20, y9); + aes_word_t y7 = aes_nohw_xor(x7, y11); + aes_word_t y17 = aes_nohw_xor(y10, y11); + aes_word_t y19 = aes_nohw_xor(y10, y8); + aes_word_t y16 = aes_nohw_xor(t0, y11); + aes_word_t y21 = aes_nohw_xor(y13, y16); + aes_word_t y18 = aes_nohw_xor(x0, y16); + + // Figure 3, the middle non-linear section. + aes_word_t t2 = aes_nohw_and(y12, y15); + aes_word_t t3 = aes_nohw_and(y3, y6); + aes_word_t t4 = aes_nohw_xor(t3, t2); + aes_word_t t5 = aes_nohw_and(y4, x7); + aes_word_t t6 = aes_nohw_xor(t5, t2); + aes_word_t t7 = aes_nohw_and(y13, y16); + aes_word_t t8 = aes_nohw_and(y5, y1); + aes_word_t t9 = aes_nohw_xor(t8, t7); + aes_word_t t10 = aes_nohw_and(y2, y7); + aes_word_t t11 = aes_nohw_xor(t10, t7); + aes_word_t t12 = aes_nohw_and(y9, y11); + aes_word_t t13 = aes_nohw_and(y14, y17); + aes_word_t t14 = aes_nohw_xor(t13, t12); + aes_word_t t15 = aes_nohw_and(y8, y10); + aes_word_t t16 = aes_nohw_xor(t15, t12); + aes_word_t t17 = aes_nohw_xor(t4, t14); + aes_word_t t18 = aes_nohw_xor(t6, t16); + aes_word_t t19 = aes_nohw_xor(t9, t14); + aes_word_t t20 = aes_nohw_xor(t11, t16); + aes_word_t t21 = aes_nohw_xor(t17, y20); + aes_word_t t22 = aes_nohw_xor(t18, y19); + aes_word_t t23 = aes_nohw_xor(t19, y21); + aes_word_t t24 = aes_nohw_xor(t20, y18); + aes_word_t t25 = aes_nohw_xor(t21, t22); + 
aes_word_t t26 = aes_nohw_and(t21, t23); + aes_word_t t27 = aes_nohw_xor(t24, t26); + aes_word_t t28 = aes_nohw_and(t25, t27); + aes_word_t t29 = aes_nohw_xor(t28, t22); + aes_word_t t30 = aes_nohw_xor(t23, t24); + aes_word_t t31 = aes_nohw_xor(t22, t26); + aes_word_t t32 = aes_nohw_and(t31, t30); + aes_word_t t33 = aes_nohw_xor(t32, t24); + aes_word_t t34 = aes_nohw_xor(t23, t33); + aes_word_t t35 = aes_nohw_xor(t27, t33); + aes_word_t t36 = aes_nohw_and(t24, t35); + aes_word_t t37 = aes_nohw_xor(t36, t34); + aes_word_t t38 = aes_nohw_xor(t27, t36); + aes_word_t t39 = aes_nohw_and(t29, t38); + aes_word_t t40 = aes_nohw_xor(t25, t39); + aes_word_t t41 = aes_nohw_xor(t40, t37); + aes_word_t t42 = aes_nohw_xor(t29, t33); + aes_word_t t43 = aes_nohw_xor(t29, t40); + aes_word_t t44 = aes_nohw_xor(t33, t37); + aes_word_t t45 = aes_nohw_xor(t42, t41); + aes_word_t z0 = aes_nohw_and(t44, y15); + aes_word_t z1 = aes_nohw_and(t37, y6); + aes_word_t z2 = aes_nohw_and(t33, x7); + aes_word_t z3 = aes_nohw_and(t43, y16); + aes_word_t z4 = aes_nohw_and(t40, y1); + aes_word_t z5 = aes_nohw_and(t29, y7); + aes_word_t z6 = aes_nohw_and(t42, y11); + aes_word_t z7 = aes_nohw_and(t45, y17); + aes_word_t z8 = aes_nohw_and(t41, y10); + aes_word_t z9 = aes_nohw_and(t44, y12); + aes_word_t z10 = aes_nohw_and(t37, y3); + aes_word_t z11 = aes_nohw_and(t33, y4); + aes_word_t z12 = aes_nohw_and(t43, y13); + aes_word_t z13 = aes_nohw_and(t40, y5); + aes_word_t z14 = aes_nohw_and(t29, y2); + aes_word_t z15 = aes_nohw_and(t42, y9); + aes_word_t z16 = aes_nohw_and(t45, y14); + aes_word_t z17 = aes_nohw_and(t41, y8); + + // Figure 4, bottom linear transformation. 
+ aes_word_t t46 = aes_nohw_xor(z15, z16); + aes_word_t t47 = aes_nohw_xor(z10, z11); + aes_word_t t48 = aes_nohw_xor(z5, z13); + aes_word_t t49 = aes_nohw_xor(z9, z10); + aes_word_t t50 = aes_nohw_xor(z2, z12); + aes_word_t t51 = aes_nohw_xor(z2, z5); + aes_word_t t52 = aes_nohw_xor(z7, z8); + aes_word_t t53 = aes_nohw_xor(z0, z3); + aes_word_t t54 = aes_nohw_xor(z6, z7); + aes_word_t t55 = aes_nohw_xor(z16, z17); + aes_word_t t56 = aes_nohw_xor(z12, t48); + aes_word_t t57 = aes_nohw_xor(t50, t53); + aes_word_t t58 = aes_nohw_xor(z4, t46); + aes_word_t t59 = aes_nohw_xor(z3, t54); + aes_word_t t60 = aes_nohw_xor(t46, t57); + aes_word_t t61 = aes_nohw_xor(z14, t57); + aes_word_t t62 = aes_nohw_xor(t52, t58); + aes_word_t t63 = aes_nohw_xor(t49, t58); + aes_word_t t64 = aes_nohw_xor(z4, t59); + aes_word_t t65 = aes_nohw_xor(t61, t62); + aes_word_t t66 = aes_nohw_xor(z1, t63); + aes_word_t s0 = aes_nohw_xor(t59, t63); + aes_word_t s6 = aes_nohw_xor(t56, aes_nohw_not(t62)); + aes_word_t s7 = aes_nohw_xor(t48, aes_nohw_not(t60)); + aes_word_t t67 = aes_nohw_xor(t64, t65); + aes_word_t s3 = aes_nohw_xor(t53, t66); + aes_word_t s4 = aes_nohw_xor(t51, t66); + aes_word_t s5 = aes_nohw_xor(t47, t65); + aes_word_t s1 = aes_nohw_xor(t64, aes_nohw_not(s3)); + aes_word_t s2 = aes_nohw_xor(t55, aes_nohw_not(t67)); + + batch->w[0] = s7; + batch->w[1] = s6; + batch->w[2] = s5; + batch->w[3] = s4; + batch->w[4] = s3; + batch->w[5] = s2; + batch->w[6] = s1; + batch->w[7] = s0; +} + +// aes_nohw_sub_bytes_inv_affine inverts the affine transform portion of the AES +// S-box, defined in FIPS PUB 197, section 5.1.1, step 2. +static void aes_nohw_sub_bytes_inv_affine(AES_NOHW_BATCH *batch) { + aes_word_t a0 = batch->w[0]; + aes_word_t a1 = batch->w[1]; + aes_word_t a2 = batch->w[2]; + aes_word_t a3 = batch->w[3]; + aes_word_t a4 = batch->w[4]; + aes_word_t a5 = batch->w[5]; + aes_word_t a6 = batch->w[6]; + aes_word_t a7 = batch->w[7]; + + // Apply the circulant [0 0 1 0 0 1 0 1]. 
This is the inverse of the circulant + // [1 0 0 0 1 1 1 1]. + aes_word_t b0 = aes_nohw_xor(a2, aes_nohw_xor(a5, a7)); + aes_word_t b1 = aes_nohw_xor(a3, aes_nohw_xor(a6, a0)); + aes_word_t b2 = aes_nohw_xor(a4, aes_nohw_xor(a7, a1)); + aes_word_t b3 = aes_nohw_xor(a5, aes_nohw_xor(a0, a2)); + aes_word_t b4 = aes_nohw_xor(a6, aes_nohw_xor(a1, a3)); + aes_word_t b5 = aes_nohw_xor(a7, aes_nohw_xor(a2, a4)); + aes_word_t b6 = aes_nohw_xor(a0, aes_nohw_xor(a3, a5)); + aes_word_t b7 = aes_nohw_xor(a1, aes_nohw_xor(a4, a6)); + + // XOR 0x05. Equivalently, we could XOR 0x63 before applying the circulant, + // but 0x05 has lower Hamming weight. (0x05 is the circulant applied to 0x63.) + batch->w[0] = aes_nohw_not(b0); + batch->w[1] = b1; + batch->w[2] = aes_nohw_not(b2); + batch->w[3] = b3; + batch->w[4] = b4; + batch->w[5] = b5; + batch->w[6] = b6; + batch->w[7] = b7; +} + +static void aes_nohw_inv_sub_bytes(AES_NOHW_BATCH *batch) { + // We implement the inverse S-box using the forwards implementation with the + // technique described in https://www.bearssl.org/constanttime.html#aes. + // + // The forwards S-box inverts its input and applies an affine transformation: + // S(x) = A(Inv(x)). Thus Inv(x) = InvA(S(x)). The inverse S-box is then: + // + // InvS(x) = Inv(InvA(x)). + // = InvA(S(InvA(x))) + aes_nohw_sub_bytes_inv_affine(batch); + aes_nohw_sub_bytes(batch); + aes_nohw_sub_bytes_inv_affine(batch); +} + +// aes_nohw_rotate_cols_right returns |v| with the columns in each row rotated +// to the right by |n|. This is a macro because |aes_nohw_shift_*| require +// constant shift counts in the SSE2 implementation. 
+#define aes_nohw_rotate_cols_right(/* aes_word_t */ v, /* const */ n) \ + (aes_nohw_or(aes_nohw_shift_right((v), (n)*4), \ + aes_nohw_shift_left((v), 16 - (n)*4))) + +static void aes_nohw_shift_rows(AES_NOHW_BATCH *batch) { + for (size_t i = 0; i < 8; i++) { + aes_word_t row0 = aes_nohw_and(batch->w[i], AES_NOHW_ROW0_MASK); + aes_word_t row1 = aes_nohw_and(batch->w[i], AES_NOHW_ROW1_MASK); + aes_word_t row2 = aes_nohw_and(batch->w[i], AES_NOHW_ROW2_MASK); + aes_word_t row3 = aes_nohw_and(batch->w[i], AES_NOHW_ROW3_MASK); + row1 = aes_nohw_rotate_cols_right(row1, 1); + row2 = aes_nohw_rotate_cols_right(row2, 2); + row3 = aes_nohw_rotate_cols_right(row3, 3); + batch->w[i] = aes_nohw_or(aes_nohw_or(row0, row1), aes_nohw_or(row2, row3)); + } +} + +static void aes_nohw_inv_shift_rows(AES_NOHW_BATCH *batch) { + for (size_t i = 0; i < 8; i++) { + aes_word_t row0 = aes_nohw_and(batch->w[i], AES_NOHW_ROW0_MASK); + aes_word_t row1 = aes_nohw_and(batch->w[i], AES_NOHW_ROW1_MASK); + aes_word_t row2 = aes_nohw_and(batch->w[i], AES_NOHW_ROW2_MASK); + aes_word_t row3 = aes_nohw_and(batch->w[i], AES_NOHW_ROW3_MASK); + row1 = aes_nohw_rotate_cols_right(row1, 3); + row2 = aes_nohw_rotate_cols_right(row2, 2); + row3 = aes_nohw_rotate_cols_right(row3, 1); + batch->w[i] = aes_nohw_or(aes_nohw_or(row0, row1), aes_nohw_or(row2, row3)); + } +} + +// aes_nohw_rotate_rows_down returns |v| with the rows in each column rotated +// down by one. +static aes_word_t aes_nohw_rotate_rows_down(aes_word_t v) { +#if defined(OPENSSL_SSE2) + return _mm_or_si128(_mm_srli_epi32(v, 8), _mm_slli_epi32(v, 24)); +#elif defined(OPENSSL_64_BIT) + return ((v >> 4) & UINT64_C(0x0fff0fff0fff0fff)) | + ((v << 12) & UINT64_C(0xf000f000f000f000)); +#else + return ((v >> 2) & 0x3f3f3f3f) | ((v << 6) & 0xc0c0c0c0); +#endif +} + +// aes_nohw_rotate_rows_twice returns |v| with the rows in each column rotated +// by two. 
+static aes_word_t aes_nohw_rotate_rows_twice(aes_word_t v) { +#if defined(OPENSSL_SSE2) + return _mm_or_si128(_mm_srli_epi32(v, 16), _mm_slli_epi32(v, 16)); +#elif defined(OPENSSL_64_BIT) + return ((v >> 8) & UINT64_C(0x00ff00ff00ff00ff)) | + ((v << 8) & UINT64_C(0xff00ff00ff00ff00)); +#else + return ((v >> 4) & 0x0f0f0f0f) | ((v << 4) & 0xf0f0f0f0); +#endif +} + +static void aes_nohw_mix_columns(AES_NOHW_BATCH *batch) { + // See https://eprint.iacr.org/2009/129.pdf, section 4.4 and appendix A. + aes_word_t a0 = batch->w[0]; + aes_word_t a1 = batch->w[1]; + aes_word_t a2 = batch->w[2]; + aes_word_t a3 = batch->w[3]; + aes_word_t a4 = batch->w[4]; + aes_word_t a5 = batch->w[5]; + aes_word_t a6 = batch->w[6]; + aes_word_t a7 = batch->w[7]; + + aes_word_t r0 = aes_nohw_rotate_rows_down(a0); + aes_word_t a0_r0 = aes_nohw_xor(a0, r0); + aes_word_t r1 = aes_nohw_rotate_rows_down(a1); + aes_word_t a1_r1 = aes_nohw_xor(a1, r1); + aes_word_t r2 = aes_nohw_rotate_rows_down(a2); + aes_word_t a2_r2 = aes_nohw_xor(a2, r2); + aes_word_t r3 = aes_nohw_rotate_rows_down(a3); + aes_word_t a3_r3 = aes_nohw_xor(a3, r3); + aes_word_t r4 = aes_nohw_rotate_rows_down(a4); + aes_word_t a4_r4 = aes_nohw_xor(a4, r4); + aes_word_t r5 = aes_nohw_rotate_rows_down(a5); + aes_word_t a5_r5 = aes_nohw_xor(a5, r5); + aes_word_t r6 = aes_nohw_rotate_rows_down(a6); + aes_word_t a6_r6 = aes_nohw_xor(a6, r6); + aes_word_t r7 = aes_nohw_rotate_rows_down(a7); + aes_word_t a7_r7 = aes_nohw_xor(a7, r7); + + batch->w[0] = + aes_nohw_xor(aes_nohw_xor(a7_r7, r0), aes_nohw_rotate_rows_twice(a0_r0)); + batch->w[1] = + aes_nohw_xor(aes_nohw_xor(a0_r0, a7_r7), + aes_nohw_xor(r1, aes_nohw_rotate_rows_twice(a1_r1))); + batch->w[2] = + aes_nohw_xor(aes_nohw_xor(a1_r1, r2), aes_nohw_rotate_rows_twice(a2_r2)); + batch->w[3] = + aes_nohw_xor(aes_nohw_xor(a2_r2, a7_r7), + aes_nohw_xor(r3, aes_nohw_rotate_rows_twice(a3_r3))); + batch->w[4] = + aes_nohw_xor(aes_nohw_xor(a3_r3, a7_r7), + aes_nohw_xor(r4, 
aes_nohw_rotate_rows_twice(a4_r4))); + batch->w[5] = + aes_nohw_xor(aes_nohw_xor(a4_r4, r5), aes_nohw_rotate_rows_twice(a5_r5)); + batch->w[6] = + aes_nohw_xor(aes_nohw_xor(a5_r5, r6), aes_nohw_rotate_rows_twice(a6_r6)); + batch->w[7] = + aes_nohw_xor(aes_nohw_xor(a6_r6, r7), aes_nohw_rotate_rows_twice(a7_r7)); +} + +static void aes_nohw_inv_mix_columns(AES_NOHW_BATCH *batch) { + aes_word_t a0 = batch->w[0]; + aes_word_t a1 = batch->w[1]; + aes_word_t a2 = batch->w[2]; + aes_word_t a3 = batch->w[3]; + aes_word_t a4 = batch->w[4]; + aes_word_t a5 = batch->w[5]; + aes_word_t a6 = batch->w[6]; + aes_word_t a7 = batch->w[7]; + + // bsaes-x86_64.pl describes the following decomposition of the inverse + // MixColumns matrix, credited to Jussi Kivilinna. This gives a much simpler + // multiplication. + // + // | 0e 0b 0d 09 | | 02 03 01 01 | | 05 00 04 00 | + // | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 | + // | 0d 09 0e 0b | | 01 01 02 03 | | 04 00 05 00 | + // | 0b 0d 09 0e | | 03 01 01 02 | | 00 04 00 05 | + // + // First, apply the [5 0 4 0] matrix. Multiplying by 4 in F_(2^8) is described + // by the following bit equations: + // + // b0 = a6 + // b1 = a6 ^ a7 + // b2 = a0 ^ a7 + // b3 = a1 ^ a6 + // b4 = a2 ^ a6 ^ a7 + // b5 = a3 ^ a7 + // b6 = a4 + // b7 = a5 + // + // Each coefficient is given by: + // + // b_ij = 05·a_ij ⊕ 04·a_i(j+2) = 04·(a_ij ⊕ a_i(j+2)) ⊕ a_ij + // + // We combine the two equations below. Note a_i(j+2) is a row rotation. 
+ aes_word_t a0_r0 = aes_nohw_xor(a0, aes_nohw_rotate_rows_twice(a0)); + aes_word_t a1_r1 = aes_nohw_xor(a1, aes_nohw_rotate_rows_twice(a1)); + aes_word_t a2_r2 = aes_nohw_xor(a2, aes_nohw_rotate_rows_twice(a2)); + aes_word_t a3_r3 = aes_nohw_xor(a3, aes_nohw_rotate_rows_twice(a3)); + aes_word_t a4_r4 = aes_nohw_xor(a4, aes_nohw_rotate_rows_twice(a4)); + aes_word_t a5_r5 = aes_nohw_xor(a5, aes_nohw_rotate_rows_twice(a5)); + aes_word_t a6_r6 = aes_nohw_xor(a6, aes_nohw_rotate_rows_twice(a6)); + aes_word_t a7_r7 = aes_nohw_xor(a7, aes_nohw_rotate_rows_twice(a7)); + + batch->w[0] = aes_nohw_xor(a0, a6_r6); + batch->w[1] = aes_nohw_xor(a1, aes_nohw_xor(a6_r6, a7_r7)); + batch->w[2] = aes_nohw_xor(a2, aes_nohw_xor(a0_r0, a7_r7)); + batch->w[3] = aes_nohw_xor(a3, aes_nohw_xor(a1_r1, a6_r6)); + batch->w[4] = + aes_nohw_xor(aes_nohw_xor(a4, a2_r2), aes_nohw_xor(a6_r6, a7_r7)); + batch->w[5] = aes_nohw_xor(a5, aes_nohw_xor(a3_r3, a7_r7)); + batch->w[6] = aes_nohw_xor(a6, a4_r4); + batch->w[7] = aes_nohw_xor(a7, a5_r5); + + // Apply the [02 03 01 01] matrix, which is just MixColumns. 
+ aes_nohw_mix_columns(batch); +} + +static void aes_nohw_encrypt_batch(const AES_NOHW_SCHEDULE *key, + size_t num_rounds, AES_NOHW_BATCH *batch) { + aes_nohw_add_round_key(batch, &key->keys[0]); + for (size_t i = 1; i < num_rounds; i++) { + aes_nohw_sub_bytes(batch); + aes_nohw_shift_rows(batch); + aes_nohw_mix_columns(batch); + aes_nohw_add_round_key(batch, &key->keys[i]); + } + aes_nohw_sub_bytes(batch); + aes_nohw_shift_rows(batch); + aes_nohw_add_round_key(batch, &key->keys[num_rounds]); +} + +static void aes_nohw_decrypt_batch(const AES_NOHW_SCHEDULE *key, + size_t num_rounds, AES_NOHW_BATCH *batch) { + aes_nohw_add_round_key(batch, &key->keys[num_rounds]); + aes_nohw_inv_shift_rows(batch); + aes_nohw_inv_sub_bytes(batch); + for (size_t i = num_rounds - 1; i > 0; i--) { + aes_nohw_add_round_key(batch, &key->keys[i]); + aes_nohw_inv_mix_columns(batch); + aes_nohw_inv_shift_rows(batch); + aes_nohw_inv_sub_bytes(batch); + } + aes_nohw_add_round_key(batch, &key->keys[0]); +} + + +// Key schedule. + +static void aes_nohw_expand_round_keys(AES_NOHW_SCHEDULE *out, + const AES_KEY *key) { + for (size_t i = 0; i <= key->rounds; i++) { + // Copy the round key into each block in the batch. + for (size_t j = 0; j < AES_NOHW_BATCH_SIZE; j++) { + aes_word_t tmp[AES_NOHW_BLOCK_WORDS]; + memcpy(tmp, key->rd_key + 4 * i, 16); + aes_nohw_batch_set(&out->keys[i], tmp, j); + } + aes_nohw_transpose(&out->keys[i]); + } +} + +static const uint8_t aes_nohw_rcon[10] = {0x01, 0x02, 0x04, 0x08, 0x10, + 0x20, 0x40, 0x80, 0x1b, 0x36}; + +// aes_nohw_rcon_slice returns the |i|th group of |AES_NOHW_BATCH_SIZE| bits in +// |rcon|, stored in a |aes_word_t|. 
+static aes_word_t aes_nohw_rcon_slice(uint8_t rcon, size_t i) { + rcon = (rcon >> (i * AES_NOHW_BATCH_SIZE)) & ((1 << AES_NOHW_BATCH_SIZE) - 1); +#if defined(OPENSSL_SSE2) + return _mm_set_epi32(0, 0, 0, rcon); +#else + return ((aes_word_t)rcon); +#endif +} + +static void aes_nohw_sub_block(aes_word_t out[AES_NOHW_BLOCK_WORDS], + const aes_word_t in[AES_NOHW_BLOCK_WORDS]) { + AES_NOHW_BATCH batch; + memset(&batch, 0, sizeof(batch)); + aes_nohw_batch_set(&batch, in, 0); + aes_nohw_transpose(&batch); + aes_nohw_sub_bytes(&batch); + aes_nohw_transpose(&batch); + aes_nohw_batch_get(&batch, out, 0); +} + +static void aes_nohw_setup_key_128(AES_KEY *key, const uint8_t in[16]) { + key->rounds = 10; + + aes_word_t block[AES_NOHW_BLOCK_WORDS]; + aes_nohw_compact_block(block, in); + memcpy(key->rd_key, block, 16); + + for (size_t i = 1; i <= 10; i++) { + aes_word_t sub[AES_NOHW_BLOCK_WORDS]; + aes_nohw_sub_block(sub, block); + uint8_t rcon = aes_nohw_rcon[i - 1]; + for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) { + // Incorporate |rcon| and the transformed word into the first word. + block[j] = aes_nohw_xor(block[j], aes_nohw_rcon_slice(rcon, j)); + block[j] = aes_nohw_xor( + block[j], + aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12)); + // Propagate to the remaining words. Note this is reordered from the usual + // formulation to avoid needing masks. 
+ aes_word_t v = block[j]; + block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 4)); + block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 8)); + block[j] = aes_nohw_xor(block[j], aes_nohw_shift_left(v, 12)); + } + memcpy(key->rd_key + 4 * i, block, 16); + } +} + +static void aes_nohw_setup_key_192(AES_KEY *key, const uint8_t in[24]) { + key->rounds = 12; + + aes_word_t storage1[AES_NOHW_BLOCK_WORDS], storage2[AES_NOHW_BLOCK_WORDS]; + aes_word_t *block1 = storage1, *block2 = storage2; + + // AES-192's key schedule is complex because each key schedule iteration + // produces six words, but we compute on blocks and each block is four words. + // We maintain a sliding window of two blocks, filled to 1.5 blocks at a time. + // We loop below every three blocks or two key schedule iterations. + // + // On entry to the loop, |block1| and the first half of |block2| contain the + // previous key schedule iteration. |block1| has been written to |key|, but + // |block2| has not as it is incomplete. + aes_nohw_compact_block(block1, in); + memcpy(key->rd_key, block1, 16); + + uint8_t half_block[16] = {0}; + memcpy(half_block, in + 16, 8); + aes_nohw_compact_block(block2, half_block); + + for (size_t i = 0; i < 4; i++) { + aes_word_t sub[AES_NOHW_BLOCK_WORDS]; + aes_nohw_sub_block(sub, block2); + uint8_t rcon = aes_nohw_rcon[2 * i]; + for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) { + // Compute the first two words of the next key schedule iteration, which + // go in the second half of |block2|. The first two words of the previous + // iteration are in the first half of |block1|. Apply |rcon| here too + // because the shifts match. + block2[j] = aes_nohw_or( + block2[j], + aes_nohw_shift_left( + aes_nohw_xor(block1[j], aes_nohw_rcon_slice(rcon, j)), 8)); + // Incorporate the transformed word and propagate. Note the last word of + // the previous iteration corresponds to the second word of |sub|. 
This + // is incorporated into the first word of the next iteration, or the third + // word of |block2|. + block2[j] = aes_nohw_xor( + block2[j], aes_nohw_and(aes_nohw_shift_left( + aes_nohw_rotate_rows_down(sub[j]), 4), + AES_NOHW_COL2_MASK)); + block2[j] = aes_nohw_xor( + block2[j], + aes_nohw_and(aes_nohw_shift_left(block2[j], 4), AES_NOHW_COL3_MASK)); + + // Compute the remaining four words, which fill |block1|. Begin by moving + // the corresponding words of the previous iteration: the second half of + // |block1| and the first half of |block2|. + block1[j] = aes_nohw_shift_right(block1[j], 8); + block1[j] = aes_nohw_or(block1[j], aes_nohw_shift_left(block2[j], 8)); + // Incorporate the second word, computed previously in |block2|, and + // propagate. + block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_right(block2[j], 12)); + aes_word_t v = block1[j]; + block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 4)); + block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 8)); + block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 12)); + } + + // This completes two round keys. Note half of |block2| was computed in the + // previous loop iteration but was not yet output. + memcpy(key->rd_key + 4 * (3 * i + 1), block2, 16); + memcpy(key->rd_key + 4 * (3 * i + 2), block1, 16); + + aes_nohw_sub_block(sub, block1); + rcon = aes_nohw_rcon[2 * i + 1]; + for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) { + // Compute the first four words of the next key schedule iteration in + // |block2|. Begin by moving the corresponding words of the previous + // iteration: the second half of |block2| and the first half of |block1|. + block2[j] = aes_nohw_shift_right(block2[j], 8); + block2[j] = aes_nohw_or(block2[j], aes_nohw_shift_left(block1[j], 8)); + // Incorporate rcon and the transformed word. Note the last word of the + // previous iteration corresponds to the last word of |sub|. 
+ block2[j] = aes_nohw_xor(block2[j], aes_nohw_rcon_slice(rcon, j)); + block2[j] = aes_nohw_xor( + block2[j], + aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12)); + // Propagate to the remaining words. + aes_word_t v = block2[j]; + block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 4)); + block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 8)); + block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 12)); + + // Compute the last two words, which go in the first half of |block1|. The + // last two words of the previous iteration are in the second half of + // |block1|. + block1[j] = aes_nohw_shift_right(block1[j], 8); + // Propagate blocks and mask off the excess. + block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_right(block2[j], 12)); + block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(block1[j], 4)); + block1[j] = aes_nohw_and(block1[j], AES_NOHW_COL01_MASK); + } + + // |block2| has a complete round key. |block1| will be completed in the next + // iteration. + memcpy(key->rd_key + 4 * (3 * i + 3), block2, 16); + + // Swap blocks to restore the invariant. + aes_word_t *tmp = block1; + block1 = block2; + block2 = tmp; + } +} + +static void aes_nohw_setup_key_256(AES_KEY *key, const uint8_t in[32]) { + key->rounds = 14; + + // Each key schedule iteration produces two round keys. + aes_word_t block1[AES_NOHW_BLOCK_WORDS], block2[AES_NOHW_BLOCK_WORDS]; + aes_nohw_compact_block(block1, in); + memcpy(key->rd_key, block1, 16); + + aes_nohw_compact_block(block2, in + 16); + memcpy(key->rd_key + 4, block2, 16); + + for (size_t i = 2; i <= 14; i += 2) { + aes_word_t sub[AES_NOHW_BLOCK_WORDS]; + aes_nohw_sub_block(sub, block2); + uint8_t rcon = aes_nohw_rcon[i / 2 - 1]; + for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) { + // Incorporate |rcon| and the transformed word into the first word. 
+ block1[j] = aes_nohw_xor(block1[j], aes_nohw_rcon_slice(rcon, j)); + block1[j] = aes_nohw_xor( + block1[j], + aes_nohw_shift_right(aes_nohw_rotate_rows_down(sub[j]), 12)); + // Propagate to the remaining words. + aes_word_t v = block1[j]; + block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 4)); + block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 8)); + block1[j] = aes_nohw_xor(block1[j], aes_nohw_shift_left(v, 12)); + } + memcpy(key->rd_key + 4 * i, block1, 16); + + if (i == 14) { + break; + } + + aes_nohw_sub_block(sub, block1); + for (size_t j = 0; j < AES_NOHW_BLOCK_WORDS; j++) { + // Incorporate the transformed word into the first word. + block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_right(sub[j], 12)); + // Propagate to the remaining words. + aes_word_t v = block2[j]; + block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 4)); + block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 8)); + block2[j] = aes_nohw_xor(block2[j], aes_nohw_shift_left(v, 12)); + } + memcpy(key->rd_key + 4 * (i + 1), block2, 16); + } +} + + +// External API. 
+ +int bssl::aes_nohw_set_encrypt_key(const uint8_t *key, unsigned bits, + AES_KEY *aeskey) { + switch (bits) { + case 128: + aes_nohw_setup_key_128(aeskey, key); + return 0; + case 192: + aes_nohw_setup_key_192(aeskey, key); + return 0; + case 256: + aes_nohw_setup_key_256(aeskey, key); + return 0; + } + return 1; +} + +int bssl::aes_nohw_set_decrypt_key(const uint8_t *key, unsigned bits, + AES_KEY *aeskey) { + return aes_nohw_set_encrypt_key(key, bits, aeskey); +} + +void bssl::aes_nohw_encrypt(const uint8_t *in, uint8_t *out, + const AES_KEY *key) { + AES_NOHW_SCHEDULE sched; + aes_nohw_expand_round_keys(&sched, key); + AES_NOHW_BATCH batch; + aes_nohw_to_batch(&batch, in, /*num_blocks=*/1); + aes_nohw_encrypt_batch(&sched, key->rounds, &batch); + aes_nohw_from_batch(out, /*num_blocks=*/1, &batch); +} + +void bssl::aes_nohw_decrypt(const uint8_t *in, uint8_t *out, + const AES_KEY *key) { + AES_NOHW_SCHEDULE sched; + aes_nohw_expand_round_keys(&sched, key); + AES_NOHW_BATCH batch; + aes_nohw_to_batch(&batch, in, /*num_blocks=*/1); + aes_nohw_decrypt_batch(&sched, key->rounds, &batch); + aes_nohw_from_batch(out, /*num_blocks=*/1, &batch); +} + +static void aes_nohw_xor_block(uint8_t out[16], const uint8_t a[16], + const uint8_t b[16]) { + for (size_t i = 0; i < 16; i += sizeof(aes_word_t)) { + aes_word_t x, y; + memcpy(&x, a + i, sizeof(aes_word_t)); + memcpy(&y, b + i, sizeof(aes_word_t)); + x = aes_nohw_xor(x, y); + memcpy(out + i, &x, sizeof(aes_word_t)); + } +} + +void bssl::aes_nohw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, + size_t blocks, const AES_KEY *key, + const uint8_t ivec[16]) { + if (blocks == 0) { + return; + } + + AES_NOHW_SCHEDULE sched; + aes_nohw_expand_round_keys(&sched, key); + + // Make |AES_NOHW_BATCH_SIZE| copies of |ivec|. 
+ alignas(AES_NOHW_WORD_SIZE) uint8_t ivs[AES_NOHW_BATCH_SIZE * 16]; + alignas(AES_NOHW_WORD_SIZE) uint8_t enc_ivs[AES_NOHW_BATCH_SIZE * 16]; + for (size_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) { + memcpy(ivs + 16 * i, ivec, 16); + } + + uint32_t ctr = CRYPTO_load_u32_be(ivs + 12); + for (;;) { + // Update counters. + for (size_t i = 0; i < AES_NOHW_BATCH_SIZE; i++) { + CRYPTO_store_u32_be(ivs + 16 * i + 12, ctr + (uint32_t)i); + } + + size_t todo = blocks >= AES_NOHW_BATCH_SIZE ? AES_NOHW_BATCH_SIZE : blocks; + AES_NOHW_BATCH batch; + aes_nohw_to_batch(&batch, ivs, todo); + aes_nohw_encrypt_batch(&sched, key->rounds, &batch); + aes_nohw_from_batch(enc_ivs, todo, &batch); + + for (size_t i = 0; i < todo; i++) { + aes_nohw_xor_block(out + 16 * i, in + 16 * i, enc_ivs + 16 * i); + } + + blocks -= todo; + if (blocks == 0) { + break; + } + + in += 16 * AES_NOHW_BATCH_SIZE; + out += 16 * AES_NOHW_BATCH_SIZE; + ctr += AES_NOHW_BATCH_SIZE; + } +} + +void bssl::aes_nohw_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t len, + const AES_KEY *key, uint8_t *ivec, + const int enc) { + assert(len % 16 == 0); + size_t blocks = len / 16; + if (blocks == 0) { + return; + } + + AES_NOHW_SCHEDULE sched; + aes_nohw_expand_round_keys(&sched, key); + alignas(AES_NOHW_WORD_SIZE) uint8_t iv[16]; + memcpy(iv, ivec, 16); + + if (enc) { + // CBC encryption is not parallelizable. + while (blocks > 0) { + aes_nohw_xor_block(iv, iv, in); + + AES_NOHW_BATCH batch; + aes_nohw_to_batch(&batch, iv, /*num_blocks=*/1); + aes_nohw_encrypt_batch(&sched, key->rounds, &batch); + aes_nohw_from_batch(out, /*num_blocks=*/1, &batch); + + memcpy(iv, out, 16); + + in += 16; + out += 16; + blocks--; + } + memcpy(ivec, iv, 16); + return; + } + + for (;;) { + size_t todo = blocks >= AES_NOHW_BATCH_SIZE ? AES_NOHW_BATCH_SIZE : blocks; + // Make a copy of the input so we can decrypt in-place. 
+ alignas(AES_NOHW_WORD_SIZE) uint8_t copy[AES_NOHW_BATCH_SIZE * 16]; + memcpy(copy, in, todo * 16); + + AES_NOHW_BATCH batch; + aes_nohw_to_batch(&batch, in, todo); + aes_nohw_decrypt_batch(&sched, key->rounds, &batch); + aes_nohw_from_batch(out, todo, &batch); + + aes_nohw_xor_block(out, out, iv); + for (size_t i = 1; i < todo; i++) { + aes_nohw_xor_block(out + 16 * i, out + 16 * i, copy + 16 * (i - 1)); + } + + // Save the last block as the IV. + memcpy(iv, copy + 16 * (todo - 1), 16); + + blocks -= todo; + if (blocks == 0) { + break; + } + + in += 16 * AES_NOHW_BATCH_SIZE; + out += 16 * AES_NOHW_BATCH_SIZE; + } + + memcpy(ivec, iv, 16); +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/aes/cbc.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/aes/cbc.cc.inc new file mode 100644 index 00000000..aa9b759a --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/aes/cbc.cc.inc @@ -0,0 +1,132 @@ +// Copyright 2008-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "internal.h" +#include "../../internal.h" + + +using namespace bssl; + +void bssl::CRYPTO_cbc128_encrypt(const uint8_t *in, uint8_t *out, size_t len, + const AES_KEY *key, uint8_t ivec[16], + block128_f block) { + assert(key != nullptr && ivec != nullptr); + if (len == 0) { + // Avoid |ivec| == |iv| in the |memcpy| below, which is not legal in C. 
+ return; + } + + assert(in != nullptr && out != nullptr); + size_t n; + const uint8_t *iv = ivec; + while (len >= 16) { + CRYPTO_xor16(out, in, iv); + (*block)(out, out, key); + iv = out; + len -= 16; + in += 16; + out += 16; + } + + while (len) { + for (n = 0; n < 16 && n < len; ++n) { + out[n] = in[n] ^ iv[n]; + } + for (; n < 16; ++n) { + out[n] = iv[n]; + } + (*block)(out, out, key); + iv = out; + if (len <= 16) { + break; + } + len -= 16; + in += 16; + out += 16; + } + + OPENSSL_memcpy(ivec, iv, 16); +} + +void bssl::CRYPTO_cbc128_decrypt(const uint8_t *in, uint8_t *out, size_t len, + const AES_KEY *key, uint8_t ivec[16], + block128_f block) { + assert(key != nullptr && ivec != nullptr); + if (len == 0) { + // Avoid |ivec| == |iv| in the |memcpy| below, which is not legal in C. + return; + } + + assert(in != nullptr && out != nullptr); + + const uintptr_t inptr = (uintptr_t) in; + const uintptr_t outptr = (uintptr_t) out; + // If |in| and |out| alias, |in| must be ahead. + assert(inptr >= outptr || inptr + len <= outptr); + + size_t n; + alignas(16) uint8_t tmp[16]; + if ((inptr >= 32 && outptr <= inptr - 32) || inptr < outptr) { + // If |out| is at least two blocks behind |in| or completely disjoint, there + // is no need to decrypt to a temporary block. 
+ const uint8_t *iv = ivec; + while (len >= 16) { + (*block)(in, out, key); + CRYPTO_xor16(out, out, iv); + iv = in; + len -= 16; + in += 16; + out += 16; + } + OPENSSL_memcpy(ivec, iv, 16); + } else { + static_assert(16 % sizeof(crypto_word_t) == 0, + "block cannot be evenly divided into words"); + + while (len >= 16) { + (*block)(in, tmp, key); + for (n = 0; n < 16; n += sizeof(crypto_word_t)) { + crypto_word_t c = CRYPTO_load_word_le(in + n); + CRYPTO_store_word_le(out + n, CRYPTO_load_word_le(tmp + n) ^ + CRYPTO_load_word_le(ivec + n)); + CRYPTO_store_word_le(ivec + n, c); + } + len -= 16; + in += 16; + out += 16; + } + } + + while (len) { + uint8_t c; + (*block)(in, tmp, key); + for (n = 0; n < 16 && n < len; ++n) { + c = in[n]; + out[n] = tmp[n] ^ ivec[n]; + ivec[n] = c; + } + if (len <= 16) { + for (; n < 16; ++n) { + ivec[n] = in[n]; + } + break; + } + len -= 16; + in += 16; + out += 16; + } +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/aes/cfb.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/aes/cfb.cc.inc new file mode 100644 index 00000000..3db78b15 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/aes/cfb.cc.inc @@ -0,0 +1,168 @@ +// Copyright 2008-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include + +#include "internal.h" + + +using namespace bssl; + +static_assert(16 % sizeof(size_t) == 0, "block cannot be divided into size_t"); + +void bssl::CRYPTO_cfb128_encrypt(const uint8_t *in, uint8_t *out, size_t len, + const AES_KEY *key, uint8_t ivec[16], + unsigned *num, int enc, block128_f block) { + assert(in && out && key && ivec && num); + + unsigned n = *num; + + if (enc) { + while (n && len) { + *(out++) = ivec[n] ^= *(in++); + --len; + n = (n + 1) % 16; + } + while (len >= 16) { + (*block)(ivec, ivec, key); + for (; n < 16; n += sizeof(crypto_word_t)) { + crypto_word_t tmp = + CRYPTO_load_word_le(ivec + n) ^ CRYPTO_load_word_le(in + n); + CRYPTO_store_word_le(ivec + n, tmp); + CRYPTO_store_word_le(out + n, tmp); + } + len -= 16; + out += 16; + in += 16; + n = 0; + } + if (len) { + (*block)(ivec, ivec, key); + while (len--) { + out[n] = ivec[n] ^= in[n]; + ++n; + } + } + *num = n; + return; + } else { + while (n && len) { + uint8_t c; + *(out++) = ivec[n] ^ (c = *(in++)); + ivec[n] = c; + --len; + n = (n + 1) % 16; + } + while (len >= 16) { + (*block)(ivec, ivec, key); + for (; n < 16; n += sizeof(crypto_word_t)) { + crypto_word_t t = CRYPTO_load_word_le(in + n); + CRYPTO_store_word_le(out + n, CRYPTO_load_word_le(ivec + n) ^ t); + CRYPTO_store_word_le(ivec + n, t); + } + len -= 16; + out += 16; + in += 16; + n = 0; + } + if (len) { + (*block)(ivec, ivec, key); + while (len--) { + uint8_t c; + out[n] = ivec[n] ^ (c = in[n]); + ivec[n] = c; + ++n; + } + } + *num = n; + return; + } +} + + +/* This expects a single block of size nbits for both in and out. 
Note that + it corrupts any extra bits in the last byte of out */ +static void cfbr_encrypt_block(const uint8_t *in, uint8_t *out, unsigned nbits, + const AES_KEY *key, uint8_t ivec[16], int enc, + block128_f block) { + int n, rem, num; + uint8_t ovec[16 * 2 + 1]; /* +1 because we dereference (but don't use) one + byte off the end */ + + if (nbits <= 0 || nbits > 128) { + return; + } + + // fill in the first half of the new IV with the current IV + OPENSSL_memcpy(ovec, ivec, 16); + // construct the new IV + (*block)(ivec, ivec, key); + num = (nbits + 7) / 8; + if (enc) { + // encrypt the input + for (n = 0; n < num; ++n) { + out[n] = (ovec[16 + n] = in[n] ^ ivec[n]); + } + } else { + // decrypt the input + for (n = 0; n < num; ++n) { + out[n] = (ovec[16 + n] = in[n]) ^ ivec[n]; + } + } + // shift ovec left... + rem = nbits % 8; + num = nbits / 8; + if (rem == 0) { + OPENSSL_memcpy(ivec, ovec + num, 16); + } else { + for (n = 0; n < 16; ++n) { + ivec[n] = ovec[n + num] << rem | ovec[n + num + 1] >> (8 - rem); + } + } + + // it is not necessary to cleanse ovec, since the IV is not secret +} + +// N.B. This expects the input to be packed, MS bit first +void bssl::CRYPTO_cfb128_1_encrypt(const uint8_t *in, uint8_t *out, size_t bits, + const AES_KEY *key, uint8_t ivec[16], + unsigned *num, int enc, block128_f block) { + size_t n; + uint8_t c[1], d[1]; + + assert(in && out && key && ivec && num); + assert(*num == 0); + + for (n = 0; n < bits; ++n) { + c[0] = (in[n / 8] & (1 << (7 - n % 8))) ? 
0x80 : 0; + cfbr_encrypt_block(c, d, 1, key, ivec, enc, block); + out[n / 8] = (out[n / 8] & ~(1 << (unsigned int)(7 - n % 8))) | + ((d[0] & 0x80) >> (unsigned int)(n % 8)); + } +} + +void bssl::CRYPTO_cfb128_8_encrypt(const unsigned char *in, unsigned char *out, + size_t length, const AES_KEY *key, + unsigned char ivec[16], unsigned *num, + int enc, block128_f block) { + size_t n; + + assert(in && out && key && ivec && num); + assert(*num == 0); + + for (n = 0; n < length; ++n) { + cfbr_encrypt_block(&in[n], &out[n], 8, key, ivec, enc, block); + } +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/aes/ctr.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/aes/ctr.cc.inc new file mode 100644 index 00000000..1a333305 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/aes/ctr.cc.inc @@ -0,0 +1,102 @@ +// Copyright 2008-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include + +#include "internal.h" +#include "../../internal.h" + + +using namespace bssl; + +static_assert(16 % sizeof(crypto_word_t) == 0, + "block cannot be divided into crypto_word_t"); + +// increment upper 96 bits of 128-bit counter by 1 +static void ctr96_inc(uint8_t *counter) { + uint32_t n = 12, c = 1; + + do { + --n; + c += counter[n]; + counter[n] = (uint8_t) c; + c >>= 8; + } while (n); +} + +void bssl::CRYPTO_ctr128_encrypt_ctr32(const uint8_t *in, uint8_t *out, + size_t len, const AES_KEY *key, + uint8_t ivec[16], uint8_t ecount_buf[16], + unsigned int *num, ctr128_f func) { + unsigned int n, ctr32; + + assert(key && ecount_buf && num); + assert(len == 0 || (in && out)); + assert(*num < 16); + + n = *num; + + while (n && len) { + *(out++) = *(in++) ^ ecount_buf[n]; + --len; + n = (n + 1) % 16; + } + + ctr32 = CRYPTO_load_u32_be(ivec + 12); + while (len >= 16) { + size_t blocks = len / 16; + // 1<<28 is just a not-so-small yet not-so-large number... + // Below condition is practically never met, but it has to + // be checked for code correctness. + if (sizeof(size_t) > sizeof(unsigned int) && blocks > (1U << 28)) { + blocks = (1U << 28); + } + // As (*func) operates on 32-bit counter, caller + // has to handle overflow. 'if' below detects the + // overflow, which is then handled by limiting the + // amount of blocks to the exact overflow point... + ctr32 += (uint32_t)blocks; + if (ctr32 < blocks) { + blocks -= ctr32; + ctr32 = 0; + } + (*func)(in, out, blocks, key, ivec); + // (*func) does not update ivec, caller does: + CRYPTO_store_u32_be(ivec + 12, ctr32); + // ... overflow was detected, propagate carry. 
+ if (ctr32 == 0) { + ctr96_inc(ivec); + } + blocks *= 16; + len -= blocks; + out += blocks; + in += blocks; + } + if (len) { + OPENSSL_memset(ecount_buf, 0, 16); + (*func)(ecount_buf, ecount_buf, 1, key, ivec); + ++ctr32; + CRYPTO_store_u32_be(ivec + 12, ctr32); + if (ctr32 == 0) { + ctr96_inc(ivec); + } + while (len--) { + out[n] = in[n] ^ ecount_buf[n]; + ++n; + } + } + + *num = n; +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/aes/gcm.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/aes/gcm.cc.inc new file mode 100644 index 00000000..73beb8b3 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/aes/gcm.cc.inc @@ -0,0 +1,614 @@ +// Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include + +#include "../../internal.h" +#include "../aes/internal.h" +#include "internal.h" + + +using namespace bssl; + +// kSizeTWithoutLower4Bits is a mask that can be used to zero the lower four +// bits of a |size_t|. +static const size_t kSizeTWithoutLower4Bits = (size_t) -16; + + +#define GCM_MUL(key, ctx, Xi) bssl::gcm_gmult_nohw((ctx)->Xi, (key)->Htable) +#define GHASH(key, ctx, in, len) \ + gcm_ghash_nohw((ctx)->Xi, (key)->Htable, in, len) +// GHASH_CHUNK is a "stride parameter" intended to mitigate the cache +// thrashing effect. In other words, the idea is to hash data while it's +// still in L1 cache after the encryption pass... 
+#define GHASH_CHUNK (3 * 1024) + +#if defined(GHASH_ASM_X86_64) || defined(GHASH_ASM_X86) +static void gcm_reduce_1bit(u128 *V) { + if (sizeof(crypto_word_t) == 8) { + uint64_t T = UINT64_C(0xe100000000000000) & (0 - (V->hi & 1)); + V->hi = (V->lo << 63) | (V->hi >> 1); + V->lo = (V->lo >> 1) ^ T; + } else { + uint32_t T = 0xe1000000U & (0 - (uint32_t)(V->hi & 1)); + V->hi = (V->lo << 63) | (V->hi >> 1); + V->lo = (V->lo >> 1) ^ ((uint64_t)T << 32); + } +} + +void bssl::gcm_init_ssse3(u128 Htable[16], const uint64_t H[2]) { + Htable[0].hi = 0; + Htable[0].lo = 0; + u128 V; + V.hi = H[1]; + V.lo = H[0]; + + Htable[8] = V; + gcm_reduce_1bit(&V); + Htable[4] = V; + gcm_reduce_1bit(&V); + Htable[2] = V; + gcm_reduce_1bit(&V); + Htable[1] = V; + Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo; + V = Htable[4]; + Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo; + Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo; + Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo; + V = Htable[8]; + Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo; + Htable[10].hi = V.hi ^ Htable[2].hi, Htable[10].lo = V.lo ^ Htable[2].lo; + Htable[11].hi = V.hi ^ Htable[3].hi, Htable[11].lo = V.lo ^ Htable[3].lo; + Htable[12].hi = V.hi ^ Htable[4].hi, Htable[12].lo = V.lo ^ Htable[4].lo; + Htable[13].hi = V.hi ^ Htable[5].hi, Htable[13].lo = V.lo ^ Htable[5].lo; + Htable[14].hi = V.hi ^ Htable[6].hi, Htable[14].lo = V.lo ^ Htable[6].lo; + Htable[15].hi = V.hi ^ Htable[7].hi, Htable[15].lo = V.lo ^ Htable[7].lo; + + // Treat |Htable| as a 16x16 byte table and transpose it. Thus, Htable[i] + // contains the i'th byte of j*H for all j. 
+ uint8_t *Hbytes = (uint8_t *)Htable; + for (int i = 0; i < 16; i++) { + for (int j = 0; j < i; j++) { + uint8_t tmp = Hbytes[16*i + j]; + Hbytes[16*i + j] = Hbytes[16*j + i]; + Hbytes[16*j + i] = tmp; + } + } +} +#endif // GHASH_ASM_X86_64 || GHASH_ASM_X86 + +#ifdef GCM_FUNCREF +#undef GCM_MUL +#define GCM_MUL(key, ctx, Xi) (*gcm_gmult_p)((ctx)->Xi, (key)->Htable) +#undef GHASH +#define GHASH(key, ctx, in, len) \ + (*gcm_ghash_p)((ctx)->Xi, (key)->Htable, in, len) +#endif // GCM_FUNCREF + +#if defined(HW_GCM) && defined(OPENSSL_X86_64) +static size_t hw_gcm_encrypt(const uint8_t *in, uint8_t *out, size_t len, + const AES_KEY *key, uint8_t ivec[16], + uint8_t Xi[16], const u128 Htable[16], + enum gcm_impl_t impl) { + switch (impl) { + case gcm_x86_vaes_avx2: + len &= kSizeTWithoutLower4Bits; + aes_gcm_enc_update_vaes_avx2(in, out, len, key, ivec, Htable, Xi); + CRYPTO_store_u32_be(&ivec[12], CRYPTO_load_u32_be(&ivec[12]) + len / 16); + return len; + case gcm_x86_vaes_avx512: + len &= kSizeTWithoutLower4Bits; + aes_gcm_enc_update_vaes_avx512(in, out, len, key, ivec, Htable, Xi); + CRYPTO_store_u32_be(&ivec[12], CRYPTO_load_u32_be(&ivec[12]) + len / 16); + return len; + default: + return aesni_gcm_encrypt(in, out, len, key, ivec, Htable, Xi); + } +} + +static size_t hw_gcm_decrypt(const uint8_t *in, uint8_t *out, size_t len, + const AES_KEY *key, uint8_t ivec[16], + uint8_t Xi[16], const u128 Htable[16], + enum gcm_impl_t impl) { + switch (impl) { + case gcm_x86_vaes_avx2: + len &= kSizeTWithoutLower4Bits; + aes_gcm_dec_update_vaes_avx2(in, out, len, key, ivec, Htable, Xi); + CRYPTO_store_u32_be(&ivec[12], CRYPTO_load_u32_be(&ivec[12]) + len / 16); + return len; + case gcm_x86_vaes_avx512: + len &= kSizeTWithoutLower4Bits; + aes_gcm_dec_update_vaes_avx512(in, out, len, key, ivec, Htable, Xi); + CRYPTO_store_u32_be(&ivec[12], CRYPTO_load_u32_be(&ivec[12]) + len / 16); + return len; + default: + return aesni_gcm_decrypt(in, out, len, key, ivec, Htable, Xi); + } +} 
+#endif // HW_GCM && X86_64 + +#if defined(HW_GCM) && defined(OPENSSL_AARCH64) + +static size_t hw_gcm_encrypt(const uint8_t *in, uint8_t *out, size_t len, + const AES_KEY *key, uint8_t ivec[16], + uint8_t Xi[16], const u128 Htable[16], + enum gcm_impl_t impl) { + const size_t len_blocks = len & kSizeTWithoutLower4Bits; + if (!len_blocks) { + return 0; + } + if (impl == gcm_arm64_aes_eor3) { + aes_gcm_enc_kernel_eor3(in, len_blocks * 8, out, Xi, ivec, key, Htable); + return len_blocks; + } + aes_gcm_enc_kernel(in, len_blocks * 8, out, Xi, ivec, key, Htable); + return len_blocks; +} + +static size_t hw_gcm_decrypt(const uint8_t *in, uint8_t *out, size_t len, + const AES_KEY *key, uint8_t ivec[16], + uint8_t Xi[16], const u128 Htable[16], + enum gcm_impl_t impl) { + const size_t len_blocks = len & kSizeTWithoutLower4Bits; + if (!len_blocks) { + return 0; + } + if (impl == gcm_arm64_aes_eor3) { + aes_gcm_dec_kernel_eor3(in, len_blocks * 8, out, Xi, ivec, key, Htable); + return len_blocks; + } + aes_gcm_dec_kernel(in, len_blocks * 8, out, Xi, ivec, key, Htable); + return len_blocks; +} + +#endif // HW_GCM && AARCH64 + +void bssl::CRYPTO_ghash_init(gmult_func *out_mult, ghash_func *out_hash, + u128 out_table[16], const uint8_t gcm_key[16]) { + // H is passed to |gcm_init_*| as a pair of byte-swapped, 64-bit values. 
+ uint64_t H[2] = {CRYPTO_load_u64_be(gcm_key), + CRYPTO_load_u64_be(gcm_key + 8)}; + +#if defined(GHASH_ASM_X86_64) + if (crypto_gcm_clmul_enabled()) { + if (CRYPTO_is_VPCLMULQDQ_capable() && CRYPTO_is_AVX2_capable()) { + if (CRYPTO_is_AVX512BW_capable() && CRYPTO_is_AVX512VL_capable() && + CRYPTO_is_BMI2_capable() && !CRYPTO_cpu_avoid_zmm_registers()) { + gcm_init_vpclmulqdq_avx512(out_table, H); + *out_mult = gcm_gmult_vpclmulqdq_avx512; + *out_hash = gcm_ghash_vpclmulqdq_avx512; + return; + } + gcm_init_vpclmulqdq_avx2(out_table, H); + *out_mult = gcm_gmult_vpclmulqdq_avx2; + *out_hash = gcm_ghash_vpclmulqdq_avx2; + return; + } + if (CRYPTO_is_AVX_capable() && CRYPTO_is_MOVBE_capable()) { + gcm_init_avx(out_table, H); + *out_mult = gcm_gmult_avx; + *out_hash = gcm_ghash_avx; + return; + } + gcm_init_clmul(out_table, H); + *out_mult = gcm_gmult_clmul; + *out_hash = gcm_ghash_clmul; + return; + } + if (CRYPTO_is_SSSE3_capable()) { + gcm_init_ssse3(out_table, H); + *out_mult = gcm_gmult_ssse3; + *out_hash = gcm_ghash_ssse3; + return; + } +#elif defined(GHASH_ASM_X86) + if (crypto_gcm_clmul_enabled()) { + gcm_init_clmul(out_table, H); + *out_mult = gcm_gmult_clmul; + *out_hash = gcm_ghash_clmul; + return; + } + if (CRYPTO_is_SSSE3_capable()) { + gcm_init_ssse3(out_table, H); + *out_mult = gcm_gmult_ssse3; + *out_hash = gcm_ghash_ssse3; + return; + } +#elif defined(GHASH_ASM_ARM) + if (gcm_pmull_capable()) { + gcm_init_v8(out_table, H); + *out_mult = gcm_gmult_v8; + *out_hash = gcm_ghash_v8; + return; + } + + if (gcm_neon_capable()) { + gcm_init_neon(out_table, H); + *out_mult = gcm_gmult_neon; + *out_hash = gcm_ghash_neon; + return; + } +#endif + + gcm_init_nohw(out_table, H); + *out_mult = gcm_gmult_nohw; + *out_hash = gcm_ghash_nohw; +} + +void bssl::CRYPTO_gcm128_init_aes_key(GCM128_KEY *gcm_key, const uint8_t *key, + size_t key_bytes) { + switch (key_bytes) { + case 16: + boringssl_fips_inc_counter(fips_counter_evp_aes_128_gcm); + break; + + case 32: + 
boringssl_fips_inc_counter(fips_counter_evp_aes_256_gcm); + break; + } + + OPENSSL_memset(gcm_key, 0, sizeof(*gcm_key)); + int is_hwaes; + gcm_key->ctr = aes_ctr_set_key(&gcm_key->aes, &is_hwaes, &gcm_key->block, key, + key_bytes); + + uint8_t ghash_key[16]; + OPENSSL_memset(ghash_key, 0, sizeof(ghash_key)); + gcm_key->block(ghash_key, ghash_key, &gcm_key->aes); + + CRYPTO_ghash_init(&gcm_key->gmult, &gcm_key->ghash, gcm_key->Htable, + ghash_key); + +#if !defined(OPENSSL_NO_ASM) +#if defined(OPENSSL_X86_64) + if (gcm_key->ghash == gcm_ghash_vpclmulqdq_avx512 && + CRYPTO_is_VAES_capable()) { + gcm_key->impl = gcm_x86_vaes_avx512; + } else if (gcm_key->ghash == gcm_ghash_vpclmulqdq_avx2 && + CRYPTO_is_VAES_capable()) { + gcm_key->impl = gcm_x86_vaes_avx2; + } else if (gcm_key->ghash == gcm_ghash_avx && is_hwaes) { + gcm_key->impl = gcm_x86_aesni; + } +#elif defined(OPENSSL_AARCH64) + if (gcm_eor3_capable() && is_hwaes) { + gcm_key->impl = gcm_arm64_aes_eor3; + } else if (gcm_pmull_capable() && is_hwaes) { + gcm_key->impl = gcm_arm64_aes; + } +#endif +#endif +} + +void bssl::CRYPTO_gcm128_init_ctx(const GCM128_KEY *key, GCM128_CONTEXT *ctx, + const uint8_t *iv, size_t iv_len) { +#ifdef GCM_FUNCREF + void (*gcm_gmult_p)(uint8_t Xi[16], const u128 Htable[16]) = key->gmult; +#endif + + OPENSSL_memset(&ctx->Yi, 0, sizeof(ctx->Yi)); + OPENSSL_memset(&ctx->Xi, 0, sizeof(ctx->Xi)); + ctx->len.aad = 0; + ctx->len.msg = 0; + ctx->ares = 0; + ctx->mres = 0; + + uint32_t ctr; + if (iv_len == 12) { + OPENSSL_memcpy(ctx->Yi, iv, 12); + ctx->Yi[15] = 1; + ctr = 1; + } else { + uint64_t len0 = iv_len; + + while (iv_len >= 16) { + CRYPTO_xor16(ctx->Yi, ctx->Yi, iv); + GCM_MUL(key, ctx, Yi); + iv += 16; + iv_len -= 16; + } + if (iv_len) { + for (size_t i = 0; i < iv_len; ++i) { + ctx->Yi[i] ^= iv[i]; + } + GCM_MUL(key, ctx, Yi); + } + + uint8_t len_block[16]; + OPENSSL_memset(len_block, 0, 8); + CRYPTO_store_u64_be(len_block + 8, len0 << 3); + CRYPTO_xor16(ctx->Yi, ctx->Yi, 
len_block); + + GCM_MUL(key, ctx, Yi); + ctr = CRYPTO_load_u32_be(ctx->Yi + 12); + } + + key->block(ctx->Yi, ctx->EK0, &key->aes); + ++ctr; + CRYPTO_store_u32_be(ctx->Yi + 12, ctr); +} + +int bssl::CRYPTO_gcm128_aad(const GCM128_KEY *key, GCM128_CONTEXT *ctx, + const uint8_t *aad, size_t aad_len) { +#ifdef GCM_FUNCREF + void (*gcm_gmult_p)(uint8_t Xi[16], const u128 Htable[16]) = key->gmult; + void (*gcm_ghash_p)(uint8_t Xi[16], const u128 Htable[16], const uint8_t *inp, + size_t len) = key->ghash; +#endif + + if (ctx->len.msg != 0) { + // The caller must have finished the AAD before providing other input. + return 0; + } + + uint64_t alen = ctx->len.aad + aad_len; + if (alen > (UINT64_C(1) << 61) || (sizeof(aad_len) == 8 && alen < aad_len)) { + return 0; + } + ctx->len.aad = alen; + + unsigned n = ctx->ares; + if (n) { + while (n && aad_len) { + ctx->Xi[n] ^= *(aad++); + --aad_len; + n = (n + 1) % 16; + } + if (n == 0) { + GCM_MUL(key, ctx, Xi); + } else { + ctx->ares = n; + return 1; + } + } + + // Process a whole number of blocks. + size_t len_blocks = aad_len & kSizeTWithoutLower4Bits; + if (len_blocks != 0) { + GHASH(key, ctx, aad, len_blocks); + aad += len_blocks; + aad_len -= len_blocks; + } + + // Process the remainder. 
+ if (aad_len != 0) { + n = (unsigned int)aad_len; + for (size_t i = 0; i < aad_len; ++i) { + ctx->Xi[i] ^= aad[i]; + } + } + + ctx->ares = n; + return 1; +} + +int bssl::CRYPTO_gcm128_encrypt(const GCM128_KEY *key, GCM128_CONTEXT *ctx, + const uint8_t *in, uint8_t *out, size_t len) { +#ifdef GCM_FUNCREF + void (*gcm_gmult_p)(uint8_t Xi[16], const u128 Htable[16]) = key->gmult; + void (*gcm_ghash_p)(uint8_t Xi[16], const u128 Htable[16], const uint8_t *inp, + size_t len) = key->ghash; +#endif + + uint64_t mlen = ctx->len.msg + len; + if (mlen > ((UINT64_C(1) << 36) - 32) || + (sizeof(len) == 8 && mlen < len)) { + return 0; + } + ctx->len.msg = mlen; + + if (ctx->ares) { + // First call to encrypt finalizes GHASH(AAD) + GCM_MUL(key, ctx, Xi); + ctx->ares = 0; + } + + unsigned n = ctx->mres; + if (n) { + while (n && len) { + ctx->Xi[n] ^= *(out++) = *(in++) ^ ctx->EKi[n]; + --len; + n = (n + 1) % 16; + } + if (n == 0) { + GCM_MUL(key, ctx, Xi); + } else { + ctx->mres = n; + return 1; + } + } + +#if defined(HW_GCM) + if (key->impl != gcm_separate && len > 0) { + // |hw_gcm_encrypt| may not process all the input given to it. It may + // not process *any* of its input if it is deemed too small. 
+ size_t bulk = hw_gcm_encrypt(in, out, len, &key->aes, ctx->Yi, ctx->Xi, + key->Htable, key->impl); + in += bulk; + out += bulk; + len -= bulk; + } +#endif + + uint32_t ctr = CRYPTO_load_u32_be(ctx->Yi + 12); + ctr128_f stream = key->ctr; + while (len >= GHASH_CHUNK) { + (*stream)(in, out, GHASH_CHUNK / 16, &key->aes, ctx->Yi); + ctr += GHASH_CHUNK / 16; + CRYPTO_store_u32_be(ctx->Yi + 12, ctr); + GHASH(key, ctx, out, GHASH_CHUNK); + out += GHASH_CHUNK; + in += GHASH_CHUNK; + len -= GHASH_CHUNK; + } + + size_t len_blocks = len & kSizeTWithoutLower4Bits; + if (len_blocks != 0) { + size_t j = len_blocks / 16; + (*stream)(in, out, j, &key->aes, ctx->Yi); + ctr += (uint32_t)j; + CRYPTO_store_u32_be(ctx->Yi + 12, ctr); + in += len_blocks; + len -= len_blocks; + GHASH(key, ctx, out, len_blocks); + out += len_blocks; + } + + if (len) { + key->block(ctx->Yi, ctx->EKi, &key->aes); + ++ctr; + CRYPTO_store_u32_be(ctx->Yi + 12, ctr); + while (len--) { + ctx->Xi[n] ^= out[n] = in[n] ^ ctx->EKi[n]; + ++n; + } + } + + ctx->mres = n; + return 1; +} + +int bssl::CRYPTO_gcm128_decrypt(const GCM128_KEY *key, GCM128_CONTEXT *ctx, + const uint8_t *in, uint8_t *out, size_t len) { +#ifdef GCM_FUNCREF + void (*gcm_gmult_p)(uint8_t Xi[16], const u128 Htable[16]) = key->gmult; + void (*gcm_ghash_p)(uint8_t Xi[16], const u128 Htable[16], const uint8_t *inp, + size_t len) = key->ghash; +#endif + + uint64_t mlen = ctx->len.msg + len; + if (mlen > ((UINT64_C(1) << 36) - 32) || + (sizeof(len) == 8 && mlen < len)) { + return 0; + } + ctx->len.msg = mlen; + + if (ctx->ares) { + // First call to decrypt finalizes GHASH(AAD) + GCM_MUL(key, ctx, Xi); + ctx->ares = 0; + } + + unsigned n = ctx->mres; + if (n) { + while (n && len) { + uint8_t c = *(in++); + *(out++) = c ^ ctx->EKi[n]; + ctx->Xi[n] ^= c; + --len; + n = (n + 1) % 16; + } + if (n == 0) { + GCM_MUL(key, ctx, Xi); + } else { + ctx->mres = n; + return 1; + } + } + +#if defined(HW_GCM) + if (key->impl != gcm_separate && len > 0) { + // 
|hw_gcm_decrypt| may not process all the input given to it. It may + // not process *any* of its input if it is deemed too small. + size_t bulk = hw_gcm_decrypt(in, out, len, &key->aes, ctx->Yi, ctx->Xi, + key->Htable, key->impl); + in += bulk; + out += bulk; + len -= bulk; + } +#endif + + uint32_t ctr = CRYPTO_load_u32_be(ctx->Yi + 12); + ctr128_f stream = key->ctr; + while (len >= GHASH_CHUNK) { + GHASH(key, ctx, in, GHASH_CHUNK); + (*stream)(in, out, GHASH_CHUNK / 16, &key->aes, ctx->Yi); + ctr += GHASH_CHUNK / 16; + CRYPTO_store_u32_be(ctx->Yi + 12, ctr); + out += GHASH_CHUNK; + in += GHASH_CHUNK; + len -= GHASH_CHUNK; + } + + size_t len_blocks = len & kSizeTWithoutLower4Bits; + if (len_blocks != 0) { + size_t j = len_blocks / 16; + GHASH(key, ctx, in, len_blocks); + (*stream)(in, out, j, &key->aes, ctx->Yi); + ctr += (uint32_t)j; + CRYPTO_store_u32_be(ctx->Yi + 12, ctr); + out += len_blocks; + in += len_blocks; + len -= len_blocks; + } + + if (len) { + key->block(ctx->Yi, ctx->EKi, &key->aes); + ++ctr; + CRYPTO_store_u32_be(ctx->Yi + 12, ctr); + while (len--) { + uint8_t c = in[n]; + ctx->Xi[n] ^= c; + out[n] = c ^ ctx->EKi[n]; + ++n; + } + } + + ctx->mres = n; + return 1; +} + +int bssl::CRYPTO_gcm128_finish(const GCM128_KEY *key, GCM128_CONTEXT *ctx, + const uint8_t *tag, size_t len) { +#ifdef GCM_FUNCREF + void (*gcm_gmult_p)(uint8_t Xi[16], const u128 Htable[16]) = key->gmult; +#endif + + if (ctx->mres || ctx->ares) { + GCM_MUL(key, ctx, Xi); + } + + uint8_t len_block[16]; + CRYPTO_store_u64_be(len_block, ctx->len.aad << 3); + CRYPTO_store_u64_be(len_block + 8, ctx->len.msg << 3); + CRYPTO_xor16(ctx->Xi, ctx->Xi, len_block); + GCM_MUL(key, ctx, Xi); + CRYPTO_xor16(ctx->Xi, ctx->Xi, ctx->EK0); + + if (tag && len <= sizeof(ctx->Xi)) { + return CRYPTO_memcmp(ctx->Xi, tag, len) == 0; + } else { + return 0; + } +} + +void bssl::CRYPTO_gcm128_tag(const GCM128_KEY *key, GCM128_CONTEXT *ctx, + uint8_t *tag, size_t len) { + CRYPTO_gcm128_finish(key, ctx, nullptr, 
0); + OPENSSL_memcpy(tag, ctx->Xi, len <= sizeof(ctx->Xi) ? len : sizeof(ctx->Xi)); +} + +#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64) +int bssl::crypto_gcm_clmul_enabled() { +#if defined(GHASH_ASM_X86) || defined(GHASH_ASM_X86_64) + return CRYPTO_is_PCLMUL_capable() && CRYPTO_is_SSSE3_capable(); +#else + return 0; +#endif +} +#endif diff --git a/third_party/boringssl/src/crypto/fipsmodule/aes/gcm_nohw.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/aes/gcm_nohw.cc.inc new file mode 100644 index 00000000..a4c6d1c0 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/aes/gcm_nohw.cc.inc @@ -0,0 +1,304 @@ +// Copyright 2019 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "../../internal.h" +#include "internal.h" + +#if !defined(BORINGSSL_HAS_UINT128) && defined(OPENSSL_SSE2) +#include +#endif + + +using namespace bssl; + +// This file contains a constant-time implementation of GHASH based on the notes +// in https://bearssl.org/constanttime.html#ghash-for-gcm and the reduction +// algorithm described in +// https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf. +// +// Unlike the BearSSL notes, we use uint128_t in the 64-bit implementation. Our +// primary compilers (clang, clang-cl, and gcc) all support it. MSVC will run +// the 32-bit implementation, but we can use its intrinsics if necessary. 
+ +#if defined(BORINGSSL_HAS_UINT128) + +static void gcm_mul64_nohw(uint64_t *out_lo, uint64_t *out_hi, uint64_t a, + uint64_t b) { + // One term every four bits means the largest term is 64/4 = 16, which barely + // overflows into the next term. Using one term every five bits would cost 25 + // multiplications instead of 16. It is faster to mask off the bottom four + // bits of |a|, giving a largest term of 60/4 = 15, and apply the bottom bits + // separately. + uint64_t a0 = a & UINT64_C(0x1111111111111110); + uint64_t a1 = a & UINT64_C(0x2222222222222220); + uint64_t a2 = a & UINT64_C(0x4444444444444440); + uint64_t a3 = a & UINT64_C(0x8888888888888880); + + uint64_t b0 = b & UINT64_C(0x1111111111111111); + uint64_t b1 = b & UINT64_C(0x2222222222222222); + uint64_t b2 = b & UINT64_C(0x4444444444444444); + uint64_t b3 = b & UINT64_C(0x8888888888888888); + + uint128_t c0 = (a0 * (uint128_t)b0) ^ (a1 * (uint128_t)b3) ^ + (a2 * (uint128_t)b2) ^ (a3 * (uint128_t)b1); + uint128_t c1 = (a0 * (uint128_t)b1) ^ (a1 * (uint128_t)b0) ^ + (a2 * (uint128_t)b3) ^ (a3 * (uint128_t)b2); + uint128_t c2 = (a0 * (uint128_t)b2) ^ (a1 * (uint128_t)b1) ^ + (a2 * (uint128_t)b0) ^ (a3 * (uint128_t)b3); + uint128_t c3 = (a0 * (uint128_t)b3) ^ (a1 * (uint128_t)b2) ^ + (a2 * (uint128_t)b1) ^ (a3 * (uint128_t)b0); + + // Multiply the bottom four bits of |a| with |b|. 
+ uint64_t a0_mask = UINT64_C(0) - (a & 1); + uint64_t a1_mask = UINT64_C(0) - ((a >> 1) & 1); + uint64_t a2_mask = UINT64_C(0) - ((a >> 2) & 1); + uint64_t a3_mask = UINT64_C(0) - ((a >> 3) & 1); + uint128_t extra = (a0_mask & b) ^ ((uint128_t)(a1_mask & b) << 1) ^ + ((uint128_t)(a2_mask & b) << 2) ^ + ((uint128_t)(a3_mask & b) << 3); + + *out_lo = (((uint64_t)c0) & UINT64_C(0x1111111111111111)) ^ + (((uint64_t)c1) & UINT64_C(0x2222222222222222)) ^ + (((uint64_t)c2) & UINT64_C(0x4444444444444444)) ^ + (((uint64_t)c3) & UINT64_C(0x8888888888888888)) ^ ((uint64_t)extra); + *out_hi = (((uint64_t)(c0 >> 64)) & UINT64_C(0x1111111111111111)) ^ + (((uint64_t)(c1 >> 64)) & UINT64_C(0x2222222222222222)) ^ + (((uint64_t)(c2 >> 64)) & UINT64_C(0x4444444444444444)) ^ + (((uint64_t)(c3 >> 64)) & UINT64_C(0x8888888888888888)) ^ + ((uint64_t)(extra >> 64)); +} + +#elif defined(OPENSSL_SSE2) + +static __m128i gcm_mul32_nohw(uint32_t a, uint32_t b) { + // One term every four bits means the largest term is 32/4 = 8, which does not + // overflow into the next term. 
+ __m128i aa = _mm_setr_epi32(a, 0, a, 0); + __m128i bb = _mm_setr_epi32(b, 0, b, 0); + + __m128i a0a0 = + _mm_and_si128(aa, _mm_setr_epi32(0x11111111, 0, 0x11111111, 0)); + __m128i a2a2 = + _mm_and_si128(aa, _mm_setr_epi32(0x44444444, 0, 0x44444444, 0)); + __m128i b0b1 = + _mm_and_si128(bb, _mm_setr_epi32(0x11111111, 0, 0x22222222, 0)); + __m128i b2b3 = + _mm_and_si128(bb, _mm_setr_epi32(0x44444444, 0, 0x88888888, 0)); + + __m128i c0c1 = + _mm_xor_si128(_mm_mul_epu32(a0a0, b0b1), _mm_mul_epu32(a2a2, b2b3)); + __m128i c2c3 = + _mm_xor_si128(_mm_mul_epu32(a2a2, b0b1), _mm_mul_epu32(a0a0, b2b3)); + + __m128i a1a1 = + _mm_and_si128(aa, _mm_setr_epi32(0x22222222, 0, 0x22222222, 0)); + __m128i a3a3 = + _mm_and_si128(aa, _mm_setr_epi32(0x88888888, 0, 0x88888888, 0)); + __m128i b3b0 = + _mm_and_si128(bb, _mm_setr_epi32(0x88888888, 0, 0x11111111, 0)); + __m128i b1b2 = + _mm_and_si128(bb, _mm_setr_epi32(0x22222222, 0, 0x44444444, 0)); + + c0c1 = _mm_xor_si128(c0c1, _mm_mul_epu32(a1a1, b3b0)); + c0c1 = _mm_xor_si128(c0c1, _mm_mul_epu32(a3a3, b1b2)); + c2c3 = _mm_xor_si128(c2c3, _mm_mul_epu32(a3a3, b3b0)); + c2c3 = _mm_xor_si128(c2c3, _mm_mul_epu32(a1a1, b1b2)); + + c0c1 = _mm_and_si128( + c0c1, _mm_setr_epi32(0x11111111, 0x11111111, 0x22222222, 0x22222222)); + c2c3 = _mm_and_si128( + c2c3, _mm_setr_epi32(0x44444444, 0x44444444, 0x88888888, 0x88888888)); + + c0c1 = _mm_xor_si128(c0c1, c2c3); + // c0 ^= c1 + c0c1 = _mm_xor_si128(c0c1, _mm_srli_si128(c0c1, 8)); + return c0c1; +} + +static void gcm_mul64_nohw(uint64_t *out_lo, uint64_t *out_hi, uint64_t a, + uint64_t b) { + uint32_t a0 = a & 0xffffffff; + uint32_t a1 = a >> 32; + uint32_t b0 = b & 0xffffffff; + uint32_t b1 = b >> 32; + // Karatsuba multiplication. 
+ __m128i lo = gcm_mul32_nohw(a0, b0); + __m128i hi = gcm_mul32_nohw(a1, b1); + __m128i mid = gcm_mul32_nohw(a0 ^ a1, b0 ^ b1); + mid = _mm_xor_si128(mid, lo); + mid = _mm_xor_si128(mid, hi); + __m128i ret = _mm_unpacklo_epi64(lo, hi); + mid = _mm_slli_si128(mid, 4); + mid = _mm_and_si128(mid, _mm_setr_epi32(0, 0xffffffff, 0xffffffff, 0)); + ret = _mm_xor_si128(ret, mid); + memcpy(out_lo, &ret, 8); + memcpy(out_hi, ((char*)&ret) + 8, 8); +} + +#else // !BORINGSSL_HAS_UINT128 && !OPENSSL_SSE2 + +static uint64_t gcm_mul32_nohw(uint32_t a, uint32_t b) { + // One term every four bits means the largest term is 32/4 = 8, which does not + // overflow into the next term. + uint32_t a0 = a & 0x11111111; + uint32_t a1 = a & 0x22222222; + uint32_t a2 = a & 0x44444444; + uint32_t a3 = a & 0x88888888; + + uint32_t b0 = b & 0x11111111; + uint32_t b1 = b & 0x22222222; + uint32_t b2 = b & 0x44444444; + uint32_t b3 = b & 0x88888888; + + uint64_t c0 = (a0 * (uint64_t)b0) ^ (a1 * (uint64_t)b3) ^ + (a2 * (uint64_t)b2) ^ (a3 * (uint64_t)b1); + uint64_t c1 = (a0 * (uint64_t)b1) ^ (a1 * (uint64_t)b0) ^ + (a2 * (uint64_t)b3) ^ (a3 * (uint64_t)b2); + uint64_t c2 = (a0 * (uint64_t)b2) ^ (a1 * (uint64_t)b1) ^ + (a2 * (uint64_t)b0) ^ (a3 * (uint64_t)b3); + uint64_t c3 = (a0 * (uint64_t)b3) ^ (a1 * (uint64_t)b2) ^ + (a2 * (uint64_t)b1) ^ (a3 * (uint64_t)b0); + + return (c0 & UINT64_C(0x1111111111111111)) | + (c1 & UINT64_C(0x2222222222222222)) | + (c2 & UINT64_C(0x4444444444444444)) | + (c3 & UINT64_C(0x8888888888888888)); +} + +static void gcm_mul64_nohw(uint64_t *out_lo, uint64_t *out_hi, uint64_t a, + uint64_t b) { + uint32_t a0 = a & 0xffffffff; + uint32_t a1 = a >> 32; + uint32_t b0 = b & 0xffffffff; + uint32_t b1 = b >> 32; + // Karatsuba multiplication. 
+ uint64_t lo = gcm_mul32_nohw(a0, b0); + uint64_t hi = gcm_mul32_nohw(a1, b1); + uint64_t mid = gcm_mul32_nohw(a0 ^ a1, b0 ^ b1) ^ lo ^ hi; + *out_lo = lo ^ (mid << 32); + *out_hi = hi ^ (mid >> 32); +} + +#endif // BORINGSSL_HAS_UINT128 + +void bssl::gcm_init_nohw(u128 Htable[16], const uint64_t Xi[2]) { + // We implement GHASH in terms of POLYVAL, as described in RFC 8452. This + // avoids a shift by 1 in the multiplication, needed to account for bit + // reversal losing a bit after multiplication, that is, + // rev128(X) * rev128(Y) = rev255(X*Y). + // + // Per Appendix A, we run mulX_POLYVAL. Note this is the same transformation + // applied by |gcm_init_clmul|, etc. Note |Xi| has already been byteswapped. + // + // See also slide 16 of + // https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf + Htable[0].lo = Xi[1]; + Htable[0].hi = Xi[0]; + + uint64_t carry = Htable[0].hi >> 63; + carry = 0u - carry; + + Htable[0].hi <<= 1; + Htable[0].hi |= Htable[0].lo >> 63; + Htable[0].lo <<= 1; + + // The irreducible polynomial is 1 + x^121 + x^126 + x^127 + x^128, so we + // conditionally add 0xc200...0001. + Htable[0].lo ^= carry & 1; + Htable[0].hi ^= carry & UINT64_C(0xc200000000000000); + + // This implementation does not use the rest of |Htable|. +} + +static void gcm_polyval_nohw(uint64_t Xi[2], const u128 *H) { + // Karatsuba multiplication. The product of |Xi| and |H| is stored in |r0| + // through |r3|. Note there is no byte or bit reversal because we are + // evaluating POLYVAL. + uint64_t r0, r1; + gcm_mul64_nohw(&r0, &r1, Xi[0], H->lo); + uint64_t r2, r3; + gcm_mul64_nohw(&r2, &r3, Xi[1], H->hi); + uint64_t mid0, mid1; + gcm_mul64_nohw(&mid0, &mid1, Xi[0] ^ Xi[1], H->hi ^ H->lo); + mid0 ^= r0 ^ r2; + mid1 ^= r1 ^ r3; + r2 ^= mid1; + r1 ^= mid0; + + // Now we multiply our 256-bit result by x^-128 and reduce. |r2| and + // |r3| shifts into position and we must multiply |r0| and |r1| by x^-128. 
We + // have: + // + // 1 = x^121 + x^126 + x^127 + x^128 + // x^-128 = x^-7 + x^-2 + x^-1 + 1 + // + // This is the GHASH reduction step, but with bits flowing in reverse. + + // The x^-7, x^-2, and x^-1 terms shift bits past x^0, which would require + // another reduction steps. Instead, we gather the excess bits, incorporate + // them into |r0| and |r1| and reduce once. See slides 17-19 + // of https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf. + r1 ^= (r0 << 63) ^ (r0 << 62) ^ (r0 << 57); + + // 1 + r2 ^= r0; + r3 ^= r1; + + // x^-1 + r2 ^= r0 >> 1; + r2 ^= r1 << 63; + r3 ^= r1 >> 1; + + // x^-2 + r2 ^= r0 >> 2; + r2 ^= r1 << 62; + r3 ^= r1 >> 2; + + // x^-7 + r2 ^= r0 >> 7; + r2 ^= r1 << 57; + r3 ^= r1 >> 7; + + Xi[0] = r2; + Xi[1] = r3; +} + +void bssl::gcm_gmult_nohw(uint8_t Xi[16], const u128 Htable[16]) { + uint64_t swapped[2]; + swapped[0] = CRYPTO_load_u64_be(Xi + 8); + swapped[1] = CRYPTO_load_u64_be(Xi); + gcm_polyval_nohw(swapped, &Htable[0]); + CRYPTO_store_u64_be(Xi, swapped[1]); + CRYPTO_store_u64_be(Xi + 8, swapped[0]); +} + +void bssl::gcm_ghash_nohw(uint8_t Xi[16], const u128 Htable[16], + const uint8_t *inp, size_t len) { + uint64_t swapped[2]; + swapped[0] = CRYPTO_load_u64_be(Xi + 8); + swapped[1] = CRYPTO_load_u64_be(Xi); + + while (len >= 16) { + swapped[0] ^= CRYPTO_load_u64_be(inp + 8); + swapped[1] ^= CRYPTO_load_u64_be(inp); + gcm_polyval_nohw(swapped, &Htable[0]); + inp += 16; + len -= 16; + } + + CRYPTO_store_u64_be(Xi, swapped[1]); + CRYPTO_store_u64_be(Xi + 8, swapped[0]); +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/aes/internal.h b/third_party/boringssl/src/crypto/fipsmodule/aes/internal.h index 0685bc41..db627037 100644 --- a/third_party/boringssl/src/crypto/fipsmodule/aes/internal.h +++ b/third_party/boringssl/src/crypto/fipsmodule/aes/internal.h @@ -1,28 +1,55 @@ -/* Copyright (c) 2017, Google Inc. 
- * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#ifndef OPENSSL_HEADER_AES_INTERNAL_H -#define OPENSSL_HEADER_AES_INTERNAL_H +// Copyright 2017 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_CRYPTO_FIPSMODULE_AES_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_FIPSMODULE_AES_INTERNAL_H #include #include "../../internal.h" +#include "../bcm_interface.h" -#if defined(__cplusplus) -extern "C" { -#endif +BSSL_NAMESPACE_BEGIN + +// block128_f is the type of an AES block cipher implementation. +// +// Unlike upstream OpenSSL, it and the other functions in this file hard-code +// |AES_KEY|. It is undefined in C to call a function pointer with anything +// other than the original type. 
Thus we either must match |block128_f| to the +// type signature of |BCM_aes_encrypt| and friends or pass in |void*| wrapper +// functions. +// +// These functions are called exclusively with AES, so we use the former. +typedef void (*block128_f)(const uint8_t in[16], uint8_t out[16], + const AES_KEY *key); + +// ctr128_f is the type of a function that performs CTR-mode encryption. +typedef void (*ctr128_f)(const uint8_t *in, uint8_t *out, size_t blocks, + const AES_KEY *key, const uint8_t ivec[16]); + +// aes_ctr_set_key initialises |*aes_key| using |key_bytes| bytes from |key|, +// where |key_bytes| must either be 16, 24 or 32. If not NULL, |*out_block| is +// set to a function that encrypts single blocks. If not NULL, |*out_is_hwaes| +// is set to whether the hardware AES implementation was used. It returns a +// function for optimised CTR-mode. +ctr128_f aes_ctr_set_key(AES_KEY *aes_key, int *out_is_hwaes, + block128_f *out_block, const uint8_t *key, + size_t key_bytes); + + +// AES implementations. 
#if !defined(OPENSSL_NO_ASM) @@ -30,41 +57,30 @@ extern "C" { #define HWAES #define HWAES_ECB -OPENSSL_INLINE int hwaes_capable(void) { return CRYPTO_is_AESNI_capable(); } +inline int hwaes_capable() { return CRYPTO_is_AESNI_capable(); } #define VPAES -#if defined(OPENSSL_X86_64) -#define VPAES_CTR32 -#endif #define VPAES_CBC -OPENSSL_INLINE int vpaes_capable(void) { return CRYPTO_is_SSSE3_capable(); } +inline int vpaes_capable() { return CRYPTO_is_SSSE3_capable(); } #elif defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64) #define HWAES -OPENSSL_INLINE int hwaes_capable(void) { return CRYPTO_is_ARMv8_AES_capable(); } +inline int hwaes_capable() { return CRYPTO_is_ARMv8_AES_capable(); } #if defined(OPENSSL_ARM) #define BSAES #define VPAES -#define VPAES_CTR32 -OPENSSL_INLINE int bsaes_capable(void) { return CRYPTO_is_NEON_capable(); } -OPENSSL_INLINE int vpaes_capable(void) { return CRYPTO_is_NEON_capable(); } +inline int bsaes_capable() { return CRYPTO_is_NEON_capable(); } +inline int vpaes_capable() { return CRYPTO_is_NEON_capable(); } #endif #if defined(OPENSSL_AARCH64) #define VPAES #define VPAES_CBC -#define VPAES_CTR32 -OPENSSL_INLINE int vpaes_capable(void) { return CRYPTO_is_NEON_capable(); } +inline int vpaes_capable() { return CRYPTO_is_NEON_capable(); } #endif -#elif defined(OPENSSL_PPC64LE) -#define HWAES - -OPENSSL_INLINE int hwaes_capable(void) { - return CRYPTO_is_PPC64LE_vcrypto_capable(); -} #endif #endif // !NO_ASM @@ -72,52 +88,82 @@ OPENSSL_INLINE int hwaes_capable(void) { #if defined(HWAES) -int aes_hw_set_encrypt_key(const uint8_t *user_key, const int bits, - AES_KEY *key); -int aes_hw_set_decrypt_key(const uint8_t *user_key, const int bits, - AES_KEY *key); -void aes_hw_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key); -void aes_hw_decrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key); -void aes_hw_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t length, - const AES_KEY *key, uint8_t *ivec, const int enc); -void 
aes_hw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len, - const AES_KEY *key, const uint8_t ivec[16]); +extern "C" int aes_hw_set_encrypt_key(const uint8_t *user_key, int bits, + AES_KEY *key); +extern "C" int aes_hw_set_decrypt_key(const uint8_t *user_key, int bits, + AES_KEY *key); +extern "C" void aes_hw_encrypt(const uint8_t *in, uint8_t *out, + const AES_KEY *key); +extern "C" void aes_hw_decrypt(const uint8_t *in, uint8_t *out, + const AES_KEY *key); +extern "C" void aes_hw_cbc_encrypt(const uint8_t *in, uint8_t *out, + size_t length, const AES_KEY *key, + uint8_t *ivec, int enc); +extern "C" void aes_hw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, + size_t len, const AES_KEY *key, + const uint8_t ivec[16]); + +#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64) +// On x86 and x86_64, |aes_hw_set_decrypt_key| is implemented in terms of +// |aes_hw_set_encrypt_key| and a conversion function. +extern "C" void aes_hw_encrypt_key_to_decrypt_key(AES_KEY *key); + +// There are two variants of this function, one which uses aeskeygenassist +// ("base") and one which uses aesenclast + pshufb ("alt"). aesenclast is +// overall faster but is slower on some older processors. It doesn't use AVX, +// but AVX is used as a proxy to detecting this. See +// https://groups.google.com/g/mailing.openssl.dev/c/OuFXwW4NfO8/m/7d2ZXVjkxVkJ +// +// TODO(davidben): It is unclear if the aeskeygenassist version is still +// worthwhile. However, the aesenclast version requires SSSE3. SSSE3 long +// predates AES-NI, but it's not clear if AES-NI implies SSSE3. In OpenSSL, the +// CCM AES-NI assembly seems to assume it does. 
+inline int aes_hw_set_encrypt_key_alt_capable() { + return hwaes_capable() && CRYPTO_is_SSSE3_capable(); +} +inline int aes_hw_set_encrypt_key_alt_preferred() { + return hwaes_capable() && CRYPTO_is_AVX_capable(); +} +extern "C" int aes_hw_set_encrypt_key_base(const uint8_t *user_key, int bits, + AES_KEY *key); +extern "C" int aes_hw_set_encrypt_key_alt(const uint8_t *user_key, int bits, + AES_KEY *key); +#endif // OPENSSL_X86 || OPENSSL_X86_64 #else // If HWAES isn't defined then we provide dummy functions for each of the hwaes // functions. -OPENSSL_INLINE int hwaes_capable(void) { return 0; } +inline int hwaes_capable() { return 0; } -OPENSSL_INLINE int aes_hw_set_encrypt_key(const uint8_t *user_key, int bits, - AES_KEY *key) { +inline int aes_hw_set_encrypt_key(const uint8_t *user_key, int bits, + AES_KEY *key) { abort(); } -OPENSSL_INLINE int aes_hw_set_decrypt_key(const uint8_t *user_key, int bits, - AES_KEY *key) { +inline int aes_hw_set_decrypt_key(const uint8_t *user_key, int bits, + AES_KEY *key) { abort(); } -OPENSSL_INLINE void aes_hw_encrypt(const uint8_t *in, uint8_t *out, - const AES_KEY *key) { +inline void aes_hw_encrypt(const uint8_t *in, uint8_t *out, + const AES_KEY *key) { abort(); } -OPENSSL_INLINE void aes_hw_decrypt(const uint8_t *in, uint8_t *out, - const AES_KEY *key) { +inline void aes_hw_decrypt(const uint8_t *in, uint8_t *out, + const AES_KEY *key) { abort(); } -OPENSSL_INLINE void aes_hw_cbc_encrypt(const uint8_t *in, uint8_t *out, - size_t length, const AES_KEY *key, - uint8_t *ivec, int enc) { +inline void aes_hw_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t length, + const AES_KEY *key, uint8_t *ivec, int enc) { abort(); } -OPENSSL_INLINE void aes_hw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, - size_t len, const AES_KEY *key, - const uint8_t ivec[16]) { +inline void aes_hw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, + size_t len, const AES_KEY *key, + const uint8_t ivec[16]) { abort(); } @@ -125,90 
+171,77 @@ OPENSSL_INLINE void aes_hw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, #if defined(HWAES_ECB) -void aes_hw_ecb_encrypt(const uint8_t *in, uint8_t *out, size_t length, - const AES_KEY *key, const int enc); +extern "C" void aes_hw_ecb_encrypt(const uint8_t *in, uint8_t *out, + size_t length, const AES_KEY *key, int enc); #endif // HWAES_ECB #if defined(BSAES) // Note |bsaes_cbc_encrypt| requires |enc| to be zero. -void bsaes_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t length, - const AES_KEY *key, uint8_t ivec[16], int enc); -void bsaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len, - const AES_KEY *key, const uint8_t ivec[16]); +extern "C" void bsaes_cbc_encrypt(const uint8_t *in, uint8_t *out, + size_t length, const AES_KEY *key, + uint8_t ivec[16], int enc); +extern "C" void bsaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, + size_t len, const AES_KEY *key, + const uint8_t ivec[16]); // VPAES to BSAES conversions are available on all BSAES platforms. -void vpaes_encrypt_key_to_bsaes(AES_KEY *out_bsaes, const AES_KEY *vpaes); -void vpaes_decrypt_key_to_bsaes(AES_KEY *out_bsaes, const AES_KEY *vpaes); -#else -OPENSSL_INLINE char bsaes_capable(void) { return 0; } - -// On other platforms, bsaes_capable() will always return false and so the -// following will never be called. 
-OPENSSL_INLINE void bsaes_cbc_encrypt(const uint8_t *in, uint8_t *out, - size_t length, const AES_KEY *key, - uint8_t ivec[16], int enc) { - abort(); -} - -OPENSSL_INLINE void bsaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, - size_t len, const AES_KEY *key, - const uint8_t ivec[16]) { - abort(); -} - -OPENSSL_INLINE void vpaes_encrypt_key_to_bsaes(AES_KEY *out_bsaes, - const AES_KEY *vpaes) { - abort(); -} - -OPENSSL_INLINE void vpaes_decrypt_key_to_bsaes(AES_KEY *out_bsaes, - const AES_KEY *vpaes) { - abort(); -} +extern "C" void vpaes_encrypt_key_to_bsaes(AES_KEY *out_bsaes, + const AES_KEY *vpaes); +extern "C" void vpaes_decrypt_key_to_bsaes(AES_KEY *out_bsaes, + const AES_KEY *vpaes); +void vpaes_ctr32_encrypt_blocks_with_bsaes(const uint8_t *in, uint8_t *out, + size_t blocks, const AES_KEY *key, + const uint8_t ivec[16]); #endif // !BSAES #if defined(VPAES) // On platforms where VPAES gets defined (just above), then these functions are // provided by asm. -int vpaes_set_encrypt_key(const uint8_t *userKey, int bits, AES_KEY *key); -int vpaes_set_decrypt_key(const uint8_t *userKey, int bits, AES_KEY *key); +extern "C" int vpaes_set_encrypt_key(const uint8_t *userKey, int bits, + AES_KEY *key); +extern "C" int vpaes_set_decrypt_key(const uint8_t *userKey, int bits, + AES_KEY *key); -void vpaes_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key); -void vpaes_decrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key); +extern "C" void vpaes_encrypt(const uint8_t *in, uint8_t *out, + const AES_KEY *key); +extern "C" void vpaes_decrypt(const uint8_t *in, uint8_t *out, + const AES_KEY *key); #if defined(VPAES_CBC) -void vpaes_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t length, - const AES_KEY *key, uint8_t *ivec, int enc); -#endif -#if defined(VPAES_CTR32) -void vpaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t len, - const AES_KEY *key, const uint8_t ivec[16]); +extern "C" void vpaes_cbc_encrypt(const uint8_t *in, 
uint8_t *out, + size_t length, const AES_KEY *key, + uint8_t *ivec, int enc); #endif +extern "C" void vpaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, + size_t len, const AES_KEY *key, + const uint8_t ivec[16]); #else -OPENSSL_INLINE char vpaes_capable(void) { return 0; } +inline int vpaes_capable() { return 0; } // On other platforms, vpaes_capable() will always return false and so the // following will never be called. -OPENSSL_INLINE int vpaes_set_encrypt_key(const uint8_t *userKey, int bits, - AES_KEY *key) { +inline int vpaes_set_encrypt_key(const uint8_t *userKey, int bits, + AES_KEY *key) { + abort(); +} +inline int vpaes_set_decrypt_key(const uint8_t *userKey, int bits, + AES_KEY *key) { abort(); } -OPENSSL_INLINE int vpaes_set_decrypt_key(const uint8_t *userKey, int bits, - AES_KEY *key) { +inline void vpaes_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) { abort(); } -OPENSSL_INLINE void vpaes_encrypt(const uint8_t *in, uint8_t *out, - const AES_KEY *key) { +inline void vpaes_decrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) { abort(); } -OPENSSL_INLINE void vpaes_decrypt(const uint8_t *in, uint8_t *out, - const AES_KEY *key) { +inline void vpaes_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t length, + const AES_KEY *key, uint8_t *ivec, int enc) { abort(); } -OPENSSL_INLINE void vpaes_cbc_encrypt(const uint8_t *in, uint8_t *out, - size_t length, const AES_KEY *key, - uint8_t *ivec, int enc) { +inline void vpaes_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, + size_t len, const AES_KEY *key, + const uint8_t ivec[16]) { abort(); } #endif // !VPAES @@ -224,11 +257,334 @@ void aes_nohw_ctr32_encrypt_blocks(const uint8_t *in, uint8_t *out, size_t blocks, const AES_KEY *key, const uint8_t ivec[16]); void aes_nohw_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t len, - const AES_KEY *key, uint8_t *ivec, const int enc); + const AES_KEY *key, uint8_t *ivec, int enc); + +// Modes + +inline void CRYPTO_xor16(uint8_t 
out[16], const uint8_t a[16], + const uint8_t b[16]) { + // TODO(davidben): Ideally we'd leave this to the compiler, which could use + // vector registers, etc. But the compiler doesn't know that |in| and |out| + // cannot partially alias. |restrict| is slightly two strict (we allow exact + // aliasing), but perhaps in-place could be a separate function? + static_assert(16 % sizeof(crypto_word_t) == 0, + "block cannot be evenly divided into words"); + for (size_t i = 0; i < 16; i += sizeof(crypto_word_t)) { + CRYPTO_store_word_le( + out + i, CRYPTO_load_word_le(a + i) ^ CRYPTO_load_word_le(b + i)); + } +} + + +// CTR. + +// CRYPTO_ctr128_encrypt_ctr32 encrypts (or decrypts, it's the same in CTR mode) +// |len| bytes from |in| to |out| using |block| in counter mode. There's no +// requirement that |len| be a multiple of any value and any partial blocks are +// stored in |ecount_buf| and |*num|, which must be zeroed before the initial +// call. The counter is a 128-bit, big-endian value in |ivec| and is +// incremented by this function. If the counter overflows, it wraps around. +// |ctr| must be a function that performs CTR mode but only deals with the lower +// 32 bits of the counter. +void CRYPTO_ctr128_encrypt_ctr32(const uint8_t *in, uint8_t *out, size_t len, + const AES_KEY *key, uint8_t ivec[16], + uint8_t ecount_buf[16], unsigned *num, + ctr128_f ctr); + + +// GCM. +// +// This API differs from the upstream API slightly. The |GCM128_CONTEXT| does +// not have a |key| pointer that points to the key as upstream's version does. +// Instead, every function takes a |key| parameter. This way |GCM128_CONTEXT| +// can be safely copied. Additionally, |gcm_key| is split into a separate +// struct. + +// gcm_impl_t specifies an assembly implementation of AES-GCM. +enum gcm_impl_t { + gcm_separate = 0, // No combined AES-GCM, but may have AES-CTR and GHASH. 
+ gcm_x86_aesni, + gcm_x86_vaes_avx2, + gcm_x86_vaes_avx512, + gcm_arm64_aes, + gcm_arm64_aes_eor3, +}; + +typedef struct { uint64_t hi,lo; } u128; + +// gmult_func multiplies |Xi| by the GCM key and writes the result back to +// |Xi|. +typedef void (*gmult_func)(uint8_t Xi[16], const u128 Htable[16]); + +// ghash_func repeatedly multiplies |Xi| by the GCM key and adds in blocks from +// |inp|. The result is written back to |Xi| and the |len| argument must be a +// multiple of 16. +typedef void (*ghash_func)(uint8_t Xi[16], const u128 Htable[16], + const uint8_t *inp, size_t len); + +typedef struct gcm128_key_st { + u128 Htable[16]; + gmult_func gmult; + ghash_func ghash; + AES_KEY aes; + + ctr128_f ctr; + block128_f block; + enum gcm_impl_t impl; +} GCM128_KEY; + +// GCM128_CONTEXT contains state for a single GCM operation. The structure +// should be zero-initialized before use. +typedef struct { + // The following 5 names follow names in GCM specification + uint8_t Yi[16]; + uint8_t EKi[16]; + uint8_t EK0[16]; + struct { + uint64_t aad; + uint64_t msg; + } len; + uint8_t Xi[16]; + unsigned mres, ares; +} GCM128_CONTEXT; + +#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64) +// crypto_gcm_clmul_enabled returns one if the CLMUL implementation of GCM is +// used. +int crypto_gcm_clmul_enabled(); +#endif + +// CRYPTO_ghash_init writes a precomputed table of powers of |gcm_key| to +// |out_table| and sets |*out_mult| and |*out_hash| to (potentially hardware +// accelerated) functions for performing operations in the GHASH field. +void CRYPTO_ghash_init(gmult_func *out_mult, ghash_func *out_hash, + u128 out_table[16], const uint8_t gcm_key[16]); + +// CRYPTO_gcm128_init_aes_key initialises |gcm_key| to with AES key |key|. +void CRYPTO_gcm128_init_aes_key(GCM128_KEY *gcm_key, const uint8_t *key, + size_t key_bytes); + +// CRYPTO_gcm128_init_ctx initializes |ctx| to encrypt with |key| and |iv|. 
+void CRYPTO_gcm128_init_ctx(const GCM128_KEY *key, GCM128_CONTEXT *ctx, + const uint8_t *iv, size_t iv_len); + +// CRYPTO_gcm128_aad adds to the authenticated data for an instance of GCM. +// This must be called before and data is encrypted. |key| must be the same +// value that was passed to |CRYPTO_gcm128_init_ctx|. It returns one on success +// and zero otherwise. +int CRYPTO_gcm128_aad(const GCM128_KEY *key, GCM128_CONTEXT *ctx, + const uint8_t *aad, size_t aad_len); + +// CRYPTO_gcm128_encrypt encrypts |len| bytes from |in| to |out|. |key| must be +// the same value that was passed to |CRYPTO_gcm128_init_ctx|. It returns one on +// success and zero otherwise. +int CRYPTO_gcm128_encrypt(const GCM128_KEY *key, GCM128_CONTEXT *ctx, + const uint8_t *in, uint8_t *out, size_t len); + +// CRYPTO_gcm128_decrypt decrypts |len| bytes from |in| to |out|. |key| must be +// the same value that was passed to |CRYPTO_gcm128_init_ctx|. It returns one on +// success and zero otherwise. +int CRYPTO_gcm128_decrypt(const GCM128_KEY *key, GCM128_CONTEXT *ctx, + const uint8_t *in, uint8_t *out, size_t len); + +// CRYPTO_gcm128_finish calculates the authenticator and compares it against +// |len| bytes of |tag|. |key| must be the same value that was passed to +// |CRYPTO_gcm128_init_ctx|. It returns one on success and zero otherwise. +int CRYPTO_gcm128_finish(const GCM128_KEY *key, GCM128_CONTEXT *ctx, + const uint8_t *tag, size_t len); + +// CRYPTO_gcm128_tag calculates the authenticator and copies it into |tag|. +// The minimum of |len| and 16 bytes are copied into |tag|. |key| must be the +// same value that was passed to |CRYPTO_gcm128_init_ctx|. +void CRYPTO_gcm128_tag(const GCM128_KEY *key, GCM128_CONTEXT *ctx, uint8_t *tag, + size_t len); + + +// GCM assembly. 
+ +void gcm_init_nohw(u128 Htable[16], const uint64_t H[2]); +void gcm_gmult_nohw(uint8_t Xi[16], const u128 Htable[16]); +void gcm_ghash_nohw(uint8_t Xi[16], const u128 Htable[16], const uint8_t *inp, + size_t len); + +#if !defined(OPENSSL_NO_ASM) + +#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64) +#define GCM_FUNCREF +extern "C" void gcm_init_clmul(u128 Htable[16], const uint64_t Xi[2]); +extern "C" void gcm_gmult_clmul(uint8_t Xi[16], const u128 Htable[16]); +extern "C" void gcm_ghash_clmul(uint8_t Xi[16], const u128 Htable[16], + const uint8_t *inp, size_t len); + +void gcm_init_ssse3(u128 Htable[16], const uint64_t Xi[2]); +extern "C" void gcm_gmult_ssse3(uint8_t Xi[16], const u128 Htable[16]); +extern "C" void gcm_ghash_ssse3(uint8_t Xi[16], const u128 Htable[16], + const uint8_t *in, size_t len); +#if defined(OPENSSL_X86_64) +#define GHASH_ASM_X86_64 +extern "C" void gcm_init_avx(u128 Htable[16], const uint64_t Xi[2]); +extern "C" void gcm_gmult_avx(uint8_t Xi[16], const u128 Htable[16]); +extern "C" void gcm_ghash_avx(uint8_t Xi[16], const u128 Htable[16], + const uint8_t *in, size_t len); + +#define HW_GCM +extern "C" size_t aesni_gcm_encrypt(const uint8_t *in, uint8_t *out, size_t len, + const AES_KEY *key, uint8_t ivec[16], + const u128 Htable[16], uint8_t Xi[16]); +extern "C" size_t aesni_gcm_decrypt(const uint8_t *in, uint8_t *out, size_t len, + const AES_KEY *key, uint8_t ivec[16], + const u128 Htable[16], uint8_t Xi[16]); + +extern "C" void gcm_init_vpclmulqdq_avx2(u128 Htable[16], const uint64_t H[2]); +extern "C" void gcm_gmult_vpclmulqdq_avx2(uint8_t Xi[16], + const u128 Htable[16]); +extern "C" void gcm_ghash_vpclmulqdq_avx2(uint8_t Xi[16], const u128 Htable[16], + const uint8_t *in, size_t len); +extern "C" void aes_gcm_enc_update_vaes_avx2(const uint8_t *in, uint8_t *out, + size_t len, const AES_KEY *key, + const uint8_t ivec[16], + const u128 Htable[16], + uint8_t Xi[16]); +extern "C" void aes_gcm_dec_update_vaes_avx2(const uint8_t *in, 
uint8_t *out, + size_t len, const AES_KEY *key, + const uint8_t ivec[16], + const u128 Htable[16], + uint8_t Xi[16]); + +extern "C" void gcm_init_vpclmulqdq_avx512(u128 Htable[16], + const uint64_t H[2]); +extern "C" void gcm_gmult_vpclmulqdq_avx512(uint8_t Xi[16], + const u128 Htable[16]); +extern "C" void gcm_ghash_vpclmulqdq_avx512(uint8_t Xi[16], + const u128 Htable[16], + const uint8_t *in, size_t len); +extern "C" void aes_gcm_enc_update_vaes_avx512(const uint8_t *in, uint8_t *out, + size_t len, const AES_KEY *key, + const uint8_t ivec[16], + const u128 Htable[16], + uint8_t Xi[16]); +extern "C" void aes_gcm_dec_update_vaes_avx512(const uint8_t *in, uint8_t *out, + size_t len, const AES_KEY *key, + const uint8_t ivec[16], + const u128 Htable[16], + uint8_t Xi[16]); + +#endif // OPENSSL_X86_64 + +#if defined(OPENSSL_X86) +#define GHASH_ASM_X86 +#endif // OPENSSL_X86 + +#elif defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64) + +#define GHASH_ASM_ARM +#define GCM_FUNCREF -#if defined(__cplusplus) -} // extern C +inline int gcm_pmull_capable() { return CRYPTO_is_ARMv8_PMULL_capable(); } +inline int gcm_eor3_capable() { + // SHA3 and EOR3 belong to the same ISA extension. + return CRYPTO_is_ARMv8_PMULL_capable() && CRYPTO_is_ARMv8_SHA3_capable(); +} + +extern "C" void gcm_init_v8(u128 Htable[16], const uint64_t H[2]); +extern "C" void gcm_gmult_v8(uint8_t Xi[16], const u128 Htable[16]); +extern "C" void gcm_ghash_v8(uint8_t Xi[16], const u128 Htable[16], + const uint8_t *inp, size_t len); + +inline int gcm_neon_capable() { return CRYPTO_is_NEON_capable(); } + +extern "C" void gcm_init_neon(u128 Htable[16], const uint64_t H[2]); +extern "C" void gcm_gmult_neon(uint8_t Xi[16], const u128 Htable[16]); +extern "C" void gcm_ghash_neon(uint8_t Xi[16], const u128 Htable[16], + const uint8_t *inp, size_t len); + +#if defined(OPENSSL_AARCH64) +#define HW_GCM +// These functions are defined in aesv8-gcm-armv8.pl. 
+extern "C" void aes_gcm_enc_kernel(const uint8_t *in, uint64_t in_bits, + void *out, void *Xi, uint8_t *ivec, + const AES_KEY *key, const u128 Htable[16]); +extern "C" void aes_gcm_dec_kernel(const uint8_t *in, uint64_t in_bits, + void *out, void *Xi, uint8_t *ivec, + const AES_KEY *key, const u128 Htable[16]); +extern "C" void aes_gcm_enc_kernel_eor3(const uint8_t *in, uint64_t in_bits, + void *out, void *Xi, uint8_t *ivec, + const AES_KEY *key, + const u128 Htable[16]); +extern "C" void aes_gcm_dec_kernel_eor3(const uint8_t *in, uint64_t in_bits, + void *out, void *Xi, uint8_t *ivec, + const AES_KEY *key, + const u128 Htable[16]); #endif -#endif // OPENSSL_HEADER_AES_INTERNAL_H +#endif +#endif // OPENSSL_NO_ASM + + +// CBC. + +// cbc128_f is the type of a function that performs CBC-mode encryption. +typedef void (*cbc128_f)(const uint8_t *in, uint8_t *out, size_t len, + const AES_KEY *key, uint8_t ivec[16], int enc); + +// CRYPTO_cbc128_encrypt encrypts |len| bytes from |in| to |out| using the +// given IV and block cipher in CBC mode. The input need not be a multiple of +// 128 bits long, but the output will round up to the nearest 128 bit multiple, +// zero padding the input if needed. The IV will be updated on return. +void CRYPTO_cbc128_encrypt(const uint8_t *in, uint8_t *out, size_t len, + const AES_KEY *key, uint8_t ivec[16], + block128_f block); + +// CRYPTO_cbc128_decrypt decrypts |len| bytes from |in| to |out| using the +// given IV and block cipher in CBC mode. If |len| is not a multiple of 128 +// bits then only that many bytes will be written, but a multiple of 128 bits +// is always read from |in|. The IV will be updated on return. +void CRYPTO_cbc128_decrypt(const uint8_t *in, uint8_t *out, size_t len, + const AES_KEY *key, uint8_t ivec[16], + block128_f block); + + +// OFB. + +// CRYPTO_ofb128_encrypt encrypts (or decrypts, it's the same with OFB mode) +// |len| bytes from |in| to |out| using |block| in OFB mode. 
There's no +// requirement that |len| be a multiple of any value and any partial blocks are +// stored in |ivec| and |*num|, the latter must be zero before the initial +// call. +void CRYPTO_ofb128_encrypt(const uint8_t *in, uint8_t *out, size_t len, + const AES_KEY *key, uint8_t ivec[16], unsigned *num, + block128_f block); + + +// CFB. + +// CRYPTO_cfb128_encrypt encrypts (or decrypts, if |enc| is zero) |len| bytes +// from |in| to |out| using |block| in CFB mode. There's no requirement that +// |len| be a multiple of any value and any partial blocks are stored in |ivec| +// and |*num|, the latter must be zero before the initial call. +void CRYPTO_cfb128_encrypt(const uint8_t *in, uint8_t *out, size_t len, + const AES_KEY *key, uint8_t ivec[16], unsigned *num, + int enc, block128_f block); + +// CRYPTO_cfb128_8_encrypt encrypts (or decrypts, if |enc| is zero) |len| bytes +// from |in| to |out| using |block| in CFB-8 mode. Prior to the first call +// |num| should be set to zero. +void CRYPTO_cfb128_8_encrypt(const uint8_t *in, uint8_t *out, size_t len, + const AES_KEY *key, uint8_t ivec[16], + unsigned *num, int enc, block128_f block); + +// CRYPTO_cfb128_1_encrypt encrypts (or decrypts, if |enc| is zero) |len| bytes +// from |in| to |out| using |block| in CFB-1 mode. Prior to the first call +// |num| should be set to zero. 
+void CRYPTO_cfb128_1_encrypt(const uint8_t *in, uint8_t *out, size_t bits, + const AES_KEY *key, uint8_t ivec[16], + unsigned *num, int enc, block128_f block); + +size_t CRYPTO_cts128_encrypt_block(const uint8_t *in, uint8_t *out, size_t len, + const AES_KEY *key, uint8_t ivec[16], + block128_f block); + +BSSL_NAMESPACE_END + +#endif // OPENSSL_HEADER_CRYPTO_FIPSMODULE_AES_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/fipsmodule/aes/key_wrap.c b/third_party/boringssl/src/crypto/fipsmodule/aes/key_wrap.c deleted file mode 100644 index 95b12d28..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/aes/key_wrap.c +++ /dev/null @@ -1,242 +0,0 @@ -/* ==================================================================== - * Copyright (c) 2001-2011 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. 
Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== */ - -#include - -#include -#include -#include - -#include - -#include "../../internal.h" -#include "../service_indicator/internal.h" - - -// kDefaultIV is the default IV value given in RFC 3394, 2.2.3.1. -static const uint8_t kDefaultIV[] = { - 0xa6, 0xa6, 0xa6, 0xa6, 0xa6, 0xa6, 0xa6, 0xa6, -}; - -static const unsigned kBound = 6; - -int AES_wrap_key(const AES_KEY *key, const uint8_t *iv, uint8_t *out, - const uint8_t *in, size_t in_len) { - // See RFC 3394, section 2.2.1. Additionally, note that section 2 requires the - // plaintext be at least two 8-byte blocks. 
- - if (in_len > INT_MAX - 8 || in_len < 16 || in_len % 8 != 0) { - return -1; - } - - if (iv == NULL) { - iv = kDefaultIV; - } - - OPENSSL_memmove(out + 8, in, in_len); - uint8_t A[AES_BLOCK_SIZE]; - OPENSSL_memcpy(A, iv, 8); - - size_t n = in_len / 8; - - for (unsigned j = 0; j < kBound; j++) { - for (size_t i = 1; i <= n; i++) { - OPENSSL_memcpy(A + 8, out + 8 * i, 8); - AES_encrypt(A, A, key); - - uint32_t t = (uint32_t)(n * j + i); - A[7] ^= t & 0xff; - A[6] ^= (t >> 8) & 0xff; - A[5] ^= (t >> 16) & 0xff; - A[4] ^= (t >> 24) & 0xff; - OPENSSL_memcpy(out + 8 * i, A + 8, 8); - } - } - - OPENSSL_memcpy(out, A, 8); - FIPS_service_indicator_update_state(); - return (int)in_len + 8; -} - -// aes_unwrap_key_inner performs steps one and two from -// https://tools.ietf.org/html/rfc3394#section-2.2.2 -static int aes_unwrap_key_inner(const AES_KEY *key, uint8_t *out, - uint8_t out_iv[8], const uint8_t *in, - size_t in_len) { - // See RFC 3394, section 2.2.2. Additionally, note that section 2 requires the - // plaintext be at least two 8-byte blocks, so the ciphertext must be at least - // three blocks. 
- - if (in_len > INT_MAX || in_len < 24 || in_len % 8 != 0) { - return 0; - } - - uint8_t A[AES_BLOCK_SIZE]; - OPENSSL_memcpy(A, in, 8); - OPENSSL_memmove(out, in + 8, in_len - 8); - - size_t n = (in_len / 8) - 1; - - for (unsigned j = kBound - 1; j < kBound; j--) { - for (size_t i = n; i > 0; i--) { - uint32_t t = (uint32_t)(n * j + i); - A[7] ^= t & 0xff; - A[6] ^= (t >> 8) & 0xff; - A[5] ^= (t >> 16) & 0xff; - A[4] ^= (t >> 24) & 0xff; - OPENSSL_memcpy(A + 8, out + 8 * (i - 1), 8); - AES_decrypt(A, A, key); - OPENSSL_memcpy(out + 8 * (i - 1), A + 8, 8); - } - } - - memcpy(out_iv, A, 8); - return 1; -} - -int AES_unwrap_key(const AES_KEY *key, const uint8_t *iv, uint8_t *out, - const uint8_t *in, size_t in_len) { - uint8_t calculated_iv[8]; - if (!aes_unwrap_key_inner(key, out, calculated_iv, in, in_len)) { - return -1; - } - - if (iv == NULL) { - iv = kDefaultIV; - } - if (CRYPTO_memcmp(calculated_iv, iv, 8) != 0) { - return -1; - } - - FIPS_service_indicator_update_state(); - return (int)in_len - 8; -} - -// kPaddingConstant is used in Key Wrap with Padding. 
See -// https://tools.ietf.org/html/rfc5649#section-3 -static const uint8_t kPaddingConstant[4] = {0xa6, 0x59, 0x59, 0xa6}; - -int AES_wrap_key_padded(const AES_KEY *key, uint8_t *out, size_t *out_len, - size_t max_out, const uint8_t *in, size_t in_len) { - // See https://tools.ietf.org/html/rfc5649#section-4.1 - const uint64_t in_len64 = in_len; - const size_t padded_len = (in_len + 7) & ~7; - *out_len = 0; - if (in_len == 0 || in_len64 > 0xffffffffu || in_len + 7 < in_len || - padded_len + 8 < padded_len || max_out < padded_len + 8) { - return 0; - } - - uint8_t block[AES_BLOCK_SIZE]; - memcpy(block, kPaddingConstant, sizeof(kPaddingConstant)); - CRYPTO_store_u32_be(block + 4, (uint32_t)in_len); - - if (in_len <= 8) { - memset(block + 8, 0, 8); - memcpy(block + 8, in, in_len); - AES_encrypt(block, out, key); - *out_len = AES_BLOCK_SIZE; - return 1; - } - - uint8_t *padded_in = OPENSSL_malloc(padded_len); - if (padded_in == NULL) { - return 0; - } - assert(padded_len >= 8); - memset(padded_in + padded_len - 8, 0, 8); - memcpy(padded_in, in, in_len); - FIPS_service_indicator_lock_state(); - const int ret = AES_wrap_key(key, block, out, padded_in, padded_len); - FIPS_service_indicator_unlock_state(); - OPENSSL_free(padded_in); - if (ret < 0) { - return 0; - } - *out_len = ret; - FIPS_service_indicator_update_state(); - return 1; -} - -int AES_unwrap_key_padded(const AES_KEY *key, uint8_t *out, size_t *out_len, - size_t max_out, const uint8_t *in, size_t in_len) { - *out_len = 0; - if (in_len < AES_BLOCK_SIZE || max_out < in_len - 8) { - return 0; - } - - uint8_t iv[8]; - if (in_len == AES_BLOCK_SIZE) { - uint8_t block[AES_BLOCK_SIZE]; - AES_decrypt(in, block, key); - memcpy(iv, block, sizeof(iv)); - memcpy(out, block + 8, 8); - } else if (!aes_unwrap_key_inner(key, out, iv, in, in_len)) { - return 0; - } - assert(in_len % 8 == 0); - - crypto_word_t ok = constant_time_eq_int( - CRYPTO_memcmp(iv, kPaddingConstant, sizeof(kPaddingConstant)), 0); - - const size_t 
claimed_len = CRYPTO_load_u32_be(iv + 4); - ok &= ~constant_time_is_zero_w(claimed_len); - ok &= constant_time_eq_w((claimed_len - 1) >> 3, (in_len - 9) >> 3); - - // Check that padding bytes are all zero. - for (size_t i = in_len - 15; i < in_len - 8; i++) { - ok &= constant_time_is_zero_w(constant_time_ge_8(i, claimed_len) & out[i]); - } - - *out_len = constant_time_select_w(ok, claimed_len, 0); - const int ret = ok & 1; - if (ret) { - FIPS_service_indicator_update_state(); - } - return ret; -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/aes/key_wrap.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/aes/key_wrap.cc.inc new file mode 100644 index 00000000..5d1908a9 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/aes/key_wrap.cc.inc @@ -0,0 +1,211 @@ +// Copyright 2008-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include + +#include + +#include "../bcm_interface.h" +#include "../../internal.h" +#include "../service_indicator/internal.h" + + +using namespace bssl; + +// kDefaultIV is the default IV value given in RFC 3394, 2.2.3.1. +static const uint8_t kDefaultIV[] = { + 0xa6, 0xa6, 0xa6, 0xa6, 0xa6, 0xa6, 0xa6, 0xa6, +}; + +static const unsigned kBound = 6; + +int AES_wrap_key(const AES_KEY *key, const uint8_t *iv, uint8_t *out, + const uint8_t *in, size_t in_len) { + // See RFC 3394, section 2.2.1. 
Additionally, note that section 2 requires the + // plaintext be at least two 8-byte blocks. + + if (in_len > INT_MAX - 8 || in_len < 16 || in_len % 8 != 0) { + return -1; + } + + if (iv == nullptr) { + iv = kDefaultIV; + } + + OPENSSL_memmove(out + 8, in, in_len); + uint8_t A[AES_BLOCK_SIZE]; + OPENSSL_memcpy(A, iv, 8); + + size_t n = in_len / 8; + + for (unsigned j = 0; j < kBound; j++) { + for (size_t i = 1; i <= n; i++) { + OPENSSL_memcpy(A + 8, out + 8 * i, 8); + BCM_aes_encrypt(A, A, key); + + uint32_t t = (uint32_t)(n * j + i); + A[7] ^= t & 0xff; + A[6] ^= (t >> 8) & 0xff; + A[5] ^= (t >> 16) & 0xff; + A[4] ^= (t >> 24) & 0xff; + OPENSSL_memcpy(out + 8 * i, A + 8, 8); + } + } + + OPENSSL_memcpy(out, A, 8); + FIPS_service_indicator_update_state(); + return (int)in_len + 8; +} + +// aes_unwrap_key_inner performs steps one and two from +// https://tools.ietf.org/html/rfc3394#section-2.2.2 +static int aes_unwrap_key_inner(const AES_KEY *key, uint8_t *out, + uint8_t out_iv[8], const uint8_t *in, + size_t in_len) { + // See RFC 3394, section 2.2.2. Additionally, note that section 2 requires the + // plaintext be at least two 8-byte blocks, so the ciphertext must be at least + // three blocks. 
+ + if (in_len > INT_MAX || in_len < 24 || in_len % 8 != 0) { + return 0; + } + + uint8_t A[AES_BLOCK_SIZE]; + OPENSSL_memcpy(A, in, 8); + OPENSSL_memmove(out, in + 8, in_len - 8); + + size_t n = (in_len / 8) - 1; + + for (unsigned j = kBound - 1; j < kBound; j--) { + for (size_t i = n; i > 0; i--) { + uint32_t t = (uint32_t)(n * j + i); + A[7] ^= t & 0xff; + A[6] ^= (t >> 8) & 0xff; + A[5] ^= (t >> 16) & 0xff; + A[4] ^= (t >> 24) & 0xff; + OPENSSL_memcpy(A + 8, out + 8 * (i - 1), 8); + BCM_aes_decrypt(A, A, key); + OPENSSL_memcpy(out + 8 * (i - 1), A + 8, 8); + } + } + + memcpy(out_iv, A, 8); + return 1; +} + +int AES_unwrap_key(const AES_KEY *key, const uint8_t *iv, uint8_t *out, + const uint8_t *in, size_t in_len) { + uint8_t calculated_iv[8]; + if (!aes_unwrap_key_inner(key, out, calculated_iv, in, in_len)) { + return -1; + } + + if (iv == nullptr) { + iv = kDefaultIV; + } + if (CRYPTO_memcmp(calculated_iv, iv, 8) != 0) { + return -1; + } + + FIPS_service_indicator_update_state(); + return (int)in_len - 8; +} + +// kPaddingConstant is used in Key Wrap with Padding. 
See +// https://tools.ietf.org/html/rfc5649#section-3 +static const uint8_t kPaddingConstant[4] = {0xa6, 0x59, 0x59, 0xa6}; + +int AES_wrap_key_padded(const AES_KEY *key, uint8_t *out, size_t *out_len, + size_t max_out, const uint8_t *in, size_t in_len) { + // See https://tools.ietf.org/html/rfc5649#section-4.1 + const uint64_t in_len64 = in_len; + const size_t padded_len = (in_len + 7) & ~7; + *out_len = 0; + if (in_len == 0 || in_len64 > 0xffffffffu || in_len + 7 < in_len || + padded_len + 8 < padded_len || max_out < padded_len + 8) { + return 0; + } + + uint8_t block[AES_BLOCK_SIZE]; + memcpy(block, kPaddingConstant, sizeof(kPaddingConstant)); + CRYPTO_store_u32_be(block + 4, (uint32_t)in_len); + + if (in_len <= 8) { + memset(block + 8, 0, 8); + memcpy(block + 8, in, in_len); + BCM_aes_encrypt(block, out, key); + *out_len = AES_BLOCK_SIZE; + return 1; + } + + uint8_t *padded_in = reinterpret_cast(OPENSSL_malloc(padded_len)); + if (padded_in == nullptr) { + return 0; + } + assert(padded_len >= 8); + memset(padded_in + padded_len - 8, 0, 8); + memcpy(padded_in, in, in_len); + FIPS_service_indicator_lock_state(); + const int ret = AES_wrap_key(key, block, out, padded_in, padded_len); + FIPS_service_indicator_unlock_state(); + OPENSSL_free(padded_in); + if (ret < 0) { + return 0; + } + *out_len = ret; + FIPS_service_indicator_update_state(); + return 1; +} + +int AES_unwrap_key_padded(const AES_KEY *key, uint8_t *out, size_t *out_len, + size_t max_out, const uint8_t *in, size_t in_len) { + *out_len = 0; + if (in_len < AES_BLOCK_SIZE || max_out < in_len - 8) { + return 0; + } + + uint8_t iv[8]; + if (in_len == AES_BLOCK_SIZE) { + uint8_t block[AES_BLOCK_SIZE]; + BCM_aes_decrypt(in, block, key); + memcpy(iv, block, sizeof(iv)); + memcpy(out, block + 8, 8); + } else if (!aes_unwrap_key_inner(key, out, iv, in, in_len)) { + return 0; + } + assert(in_len % 8 == 0); + + crypto_word_t ok = constant_time_eq_int( + CRYPTO_memcmp(iv, kPaddingConstant, 
sizeof(kPaddingConstant)), 0); + + const size_t claimed_len = CRYPTO_load_u32_be(iv + 4); + ok &= ~constant_time_is_zero_w(claimed_len); + ok &= constant_time_eq_w((claimed_len - 1) >> 3, (in_len - 9) >> 3); + + // Check that padding bytes are all zero. + for (size_t i = in_len - 15; i < in_len - 8; i++) { + ok &= constant_time_is_zero_w(constant_time_ge_8(i, claimed_len) & out[i]); + } + + *out_len = constant_time_select_w(ok, claimed_len, 0); + const int ret = ok & 1; + if (ret) { + FIPS_service_indicator_update_state(); + } + return ret; +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/aes/mode_wrappers.c b/third_party/boringssl/src/crypto/fipsmodule/aes/mode_wrappers.c deleted file mode 100644 index 10d98a6a..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/aes/mode_wrappers.c +++ /dev/null @@ -1,124 +0,0 @@ -/* ==================================================================== - * Copyright (c) 2002-2006 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. 
For written permission, please contact - * openssl-core@openssl.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== */ - -#include - -#include - -#include "../aes/internal.h" -#include "../modes/internal.h" -#include "../service_indicator/internal.h" - - -void AES_ctr128_encrypt(const uint8_t *in, uint8_t *out, size_t len, - const AES_KEY *key, uint8_t ivec[AES_BLOCK_SIZE], - uint8_t ecount_buf[AES_BLOCK_SIZE], unsigned int *num) { - if (hwaes_capable()) { - CRYPTO_ctr128_encrypt_ctr32(in, out, len, key, ivec, ecount_buf, num, - aes_hw_ctr32_encrypt_blocks); - } else if (vpaes_capable()) { -#if defined(VPAES_CTR32) - // TODO(davidben): On ARM, where |BSAES| is additionally defined, this could - // use |vpaes_ctr32_encrypt_blocks_with_bsaes|. 
- CRYPTO_ctr128_encrypt_ctr32(in, out, len, key, ivec, ecount_buf, num, - vpaes_ctr32_encrypt_blocks); -#else - CRYPTO_ctr128_encrypt(in, out, len, key, ivec, ecount_buf, num, - vpaes_encrypt); -#endif - } else { - CRYPTO_ctr128_encrypt_ctr32(in, out, len, key, ivec, ecount_buf, num, - aes_nohw_ctr32_encrypt_blocks); - } - - FIPS_service_indicator_update_state(); -} - -void AES_ecb_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key, - const int enc) { - assert(in && out && key); - assert((AES_ENCRYPT == enc) || (AES_DECRYPT == enc)); - - if (AES_ENCRYPT == enc) { - AES_encrypt(in, out, key); - } else { - AES_decrypt(in, out, key); - } - - FIPS_service_indicator_update_state(); -} - -void AES_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t len, - const AES_KEY *key, uint8_t *ivec, const int enc) { - if (hwaes_capable()) { - aes_hw_cbc_encrypt(in, out, len, key, ivec, enc); - } else if (!vpaes_capable()) { - aes_nohw_cbc_encrypt(in, out, len, key, ivec, enc); - } else if (enc) { - CRYPTO_cbc128_encrypt(in, out, len, key, ivec, AES_encrypt); - } else { - CRYPTO_cbc128_decrypt(in, out, len, key, ivec, AES_decrypt); - } - - FIPS_service_indicator_update_state(); -} - -void AES_ofb128_encrypt(const uint8_t *in, uint8_t *out, size_t length, - const AES_KEY *key, uint8_t *ivec, int *num) { - unsigned num_u = (unsigned)(*num); - CRYPTO_ofb128_encrypt(in, out, length, key, ivec, &num_u, AES_encrypt); - *num = (int)num_u; -} - -void AES_cfb128_encrypt(const uint8_t *in, uint8_t *out, size_t length, - const AES_KEY *key, uint8_t *ivec, int *num, - int enc) { - unsigned num_u = (unsigned)(*num); - CRYPTO_cfb128_encrypt(in, out, length, key, ivec, &num_u, enc, AES_encrypt); - *num = (int)num_u; -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/aes/mode_wrappers.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/aes/mode_wrappers.cc.inc new file mode 100644 index 00000000..34f14b00 --- /dev/null +++ 
b/third_party/boringssl/src/crypto/fipsmodule/aes/mode_wrappers.cc.inc @@ -0,0 +1,97 @@ +// Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include "../aes/internal.h" +#include "../service_indicator/internal.h" + + +using namespace bssl; + +namespace { +void aes_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) { + BCM_aes_encrypt(in, out, key); +} + +void aes_decrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key) { + BCM_aes_decrypt(in, out, key); +} +} // namespace + +void AES_ctr128_encrypt(const uint8_t *in, uint8_t *out, size_t len, + const AES_KEY *key, uint8_t ivec[AES_BLOCK_SIZE], + uint8_t ecount_buf[AES_BLOCK_SIZE], unsigned int *num) { + if (hwaes_capable()) { + CRYPTO_ctr128_encrypt_ctr32(in, out, len, key, ivec, ecount_buf, num, + aes_hw_ctr32_encrypt_blocks); + } else if (vpaes_capable()) { + // TODO(davidben): On ARM, where |BSAES| is additionally defined, this could + // use |vpaes_ctr32_encrypt_blocks_with_bsaes|. 
+ CRYPTO_ctr128_encrypt_ctr32(in, out, len, key, ivec, ecount_buf, num, + vpaes_ctr32_encrypt_blocks); + } else { + CRYPTO_ctr128_encrypt_ctr32(in, out, len, key, ivec, ecount_buf, num, + aes_nohw_ctr32_encrypt_blocks); + } + + FIPS_service_indicator_update_state(); +} + +void AES_ecb_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY *key, + const int enc) { + assert(in && out && key); + assert((AES_ENCRYPT == enc) || (AES_DECRYPT == enc)); + + if (AES_ENCRYPT == enc) { + BCM_aes_encrypt(in, out, key); + } else { + BCM_aes_decrypt(in, out, key); + } + + FIPS_service_indicator_update_state(); +} + +void AES_cbc_encrypt(const uint8_t *in, uint8_t *out, size_t len, + const AES_KEY *key, uint8_t *ivec, const int enc) { + assert(len % AES_BLOCK_SIZE == 0); + if (hwaes_capable()) { + aes_hw_cbc_encrypt(in, out, len, key, ivec, enc); + } else if (!vpaes_capable()) { + aes_nohw_cbc_encrypt(in, out, len, key, ivec, enc); + } else if (enc) { + CRYPTO_cbc128_encrypt(in, out, len, key, ivec, aes_encrypt); + } else { + CRYPTO_cbc128_decrypt(in, out, len, key, ivec, aes_decrypt); + } + + FIPS_service_indicator_update_state(); +} + +void AES_ofb128_encrypt(const uint8_t *in, uint8_t *out, size_t length, + const AES_KEY *key, uint8_t *ivec, int *num) { + unsigned num_u = (unsigned)(*num); + CRYPTO_ofb128_encrypt(in, out, length, key, ivec, &num_u, aes_encrypt); + *num = (int)num_u; +} + +void AES_cfb128_encrypt(const uint8_t *in, uint8_t *out, size_t length, + const AES_KEY *key, uint8_t *ivec, int *num, + int enc) { + unsigned num_u = (unsigned)(*num); + CRYPTO_cfb128_encrypt(in, out, length, key, ivec, &num_u, enc, aes_encrypt); + *num = (int)num_u; +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/aes/ofb.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/aes/ofb.cc.inc new file mode 100644 index 00000000..23e0ac66 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/aes/ofb.cc.inc @@ -0,0 +1,55 @@ +// Copyright 2008-2016 The OpenSSL Project Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include "internal.h" + + +using namespace bssl; + +static_assert(16 % sizeof(size_t) == 0, "block cannot be divided into size_t"); + +void bssl::CRYPTO_ofb128_encrypt(const uint8_t *in, uint8_t *out, size_t len, + const AES_KEY *key, uint8_t ivec[16], + unsigned *num, block128_f block) { + assert(key != nullptr && ivec != nullptr && num != nullptr); + assert(len == 0 || (in != nullptr && out != nullptr)); + + unsigned n = *num; + + while (n && len) { + *(out++) = *(in++) ^ ivec[n]; + --len; + n = (n + 1) % 16; + } + + while (len >= 16) { + (*block)(ivec, ivec, key); + CRYPTO_xor16(out, in, ivec); + len -= 16; + out += 16; + in += 16; + n = 0; + } + if (len) { + (*block)(ivec, ivec, key); + while (len--) { + out[n] = in[n] ^ ivec[n]; + ++n; + } + } + *num = n; +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/bcm.c b/third_party/boringssl/src/crypto/fipsmodule/bcm.c deleted file mode 100644 index 17921347..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/bcm.c +++ /dev/null @@ -1,275 +0,0 @@ -/* Copyright (c) 2017, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. 
- * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#if !defined(_GNU_SOURCE) -#define _GNU_SOURCE // needed for syscall() on Linux. -#endif - -#include - -#include -#if defined(BORINGSSL_FIPS) -#include -#include -#endif - -#include -#include -#include - -#include "../internal.h" - -#include "aes/aes.c" -#include "aes/aes_nohw.c" -#include "aes/key_wrap.c" -#include "aes/mode_wrappers.c" -#include "bn/add.c" -#include "bn/asm/x86_64-gcc.c" -#include "bn/bn.c" -#include "bn/bytes.c" -#include "bn/cmp.c" -#include "bn/ctx.c" -#include "bn/div.c" -#include "bn/div_extra.c" -#include "bn/exponentiation.c" -#include "bn/gcd.c" -#include "bn/gcd_extra.c" -#include "bn/generic.c" -#include "bn/jacobi.c" -#include "bn/montgomery.c" -#include "bn/montgomery_inv.c" -#include "bn/mul.c" -#include "bn/prime.c" -#include "bn/random.c" -#include "bn/rsaz_exp.c" -#include "bn/shift.c" -#include "bn/sqrt.c" -#include "cipher/aead.c" -#include "cipher/cipher.c" -#include "cipher/e_aes.c" -#include "cipher/e_aesccm.c" -#include "cmac/cmac.c" -#include "dh/check.c" -#include "dh/dh.c" -#include "digest/digest.c" -#include "digest/digests.c" -#include "digestsign/digestsign.c" -#include "ecdh/ecdh.c" -#include "ecdsa/ecdsa.c" -#include "ec/ec.c" -#include "ec/ec_key.c" -#include "ec/ec_montgomery.c" -#include "ec/felem.c" -#include "ec/oct.c" -#include "ec/p224-64.c" -#include "ec/p256.c" -#include "ec/p256-nistz.c" -#include "ec/scalar.c" -#include "ec/simple.c" -#include "ec/simple_mul.c" -#include "ec/util.c" 
-#include "ec/wnaf.c" -#include "hmac/hmac.c" -#include "md4/md4.c" -#include "md5/md5.c" -#include "modes/cbc.c" -#include "modes/cfb.c" -#include "modes/ctr.c" -#include "modes/gcm.c" -#include "modes/gcm_nohw.c" -#include "modes/ofb.c" -#include "modes/polyval.c" -#include "rand/ctrdrbg.c" -#include "rand/fork_detect.c" -#include "rand/rand.c" -#include "rand/urandom.c" -#include "rsa/blinding.c" -#include "rsa/padding.c" -#include "rsa/rsa.c" -#include "rsa/rsa_impl.c" -#include "self_check/fips.c" -#include "self_check/self_check.c" -#include "service_indicator/service_indicator.c" -#include "sha/sha1-altivec.c" -#include "sha/sha1.c" -#include "sha/sha256.c" -#include "sha/sha512.c" -#include "tls/kdf.c" - - -#if defined(BORINGSSL_FIPS) - -#if !defined(OPENSSL_ASAN) - -// These symbols are filled in by delocate.go (in static builds) or a linker -// script (in shared builds). They point to the start and end of the module, and -// the location of the integrity hash, respectively. -extern const uint8_t BORINGSSL_bcm_text_start[]; -extern const uint8_t BORINGSSL_bcm_text_end[]; -extern const uint8_t BORINGSSL_bcm_text_hash[]; -#if defined(BORINGSSL_SHARED_LIBRARY) -extern const uint8_t BORINGSSL_bcm_rodata_start[]; -extern const uint8_t BORINGSSL_bcm_rodata_end[]; -#endif - -// assert_within is used to sanity check that certain symbols are within the -// bounds of the integrity check. It checks that start <= symbol < end and -// aborts otherwise. -static void assert_within(const void *start, const void *symbol, - const void *end) { - const uintptr_t start_val = (uintptr_t) start; - const uintptr_t symbol_val = (uintptr_t) symbol; - const uintptr_t end_val = (uintptr_t) end; - - if (start_val <= symbol_val && symbol_val < end_val) { - return; - } - - fprintf( - stderr, - "FIPS module doesn't span expected symbol. 
Expected %p <= %p < %p\n", - start, symbol, end); - BORINGSSL_FIPS_abort(); -} - -#if defined(OPENSSL_ANDROID) && defined(OPENSSL_AARCH64) -static void BORINGSSL_maybe_set_module_text_permissions(int permission) { - // Android may be compiled in execute-only-memory mode, in which case the - // .text segment cannot be read. That conflicts with the need for a FIPS - // module to hash its own contents, therefore |mprotect| is used to make - // the module's .text readable for the duration of the hashing process. In - // other build configurations this is a no-op. - const uintptr_t page_size = getpagesize(); - const uintptr_t page_start = - ((uintptr_t)BORINGSSL_bcm_text_start) & ~(page_size - 1); - - if (mprotect((void *)page_start, - ((uintptr_t)BORINGSSL_bcm_text_end) - page_start, - permission) != 0) { - perror("BoringSSL: mprotect"); - } -} -#else -static void BORINGSSL_maybe_set_module_text_permissions(int permission) {} -#endif // !ANDROID - -#endif // !ASAN - -static void __attribute__((constructor)) -BORINGSSL_bcm_power_on_self_test(void) { - CRYPTO_library_init(); - -#if !defined(OPENSSL_ASAN) - // Integrity tests cannot run under ASAN because it involves reading the full - // .text section, which triggers the global-buffer overflow detection. 
- if (!BORINGSSL_integrity_test()) { - goto err; - } -#endif // OPENSSL_ASAN - - if (!boringssl_self_test_startup()) { - goto err; - } - - return; - -err: - BORINGSSL_FIPS_abort(); -} - -#if !defined(OPENSSL_ASAN) -int BORINGSSL_integrity_test(void) { - const uint8_t *const start = BORINGSSL_bcm_text_start; - const uint8_t *const end = BORINGSSL_bcm_text_end; - - assert_within(start, AES_encrypt, end); - assert_within(start, RSA_sign, end); - assert_within(start, RAND_bytes, end); - assert_within(start, EC_GROUP_cmp, end); - assert_within(start, SHA256_Update, end); - assert_within(start, ECDSA_do_verify, end); - assert_within(start, EVP_AEAD_CTX_seal, end); - -#if defined(BORINGSSL_SHARED_LIBRARY) - const uint8_t *const rodata_start = BORINGSSL_bcm_rodata_start; - const uint8_t *const rodata_end = BORINGSSL_bcm_rodata_end; -#else - // In the static build, read-only data is placed within the .text segment. - const uint8_t *const rodata_start = BORINGSSL_bcm_text_start; - const uint8_t *const rodata_end = BORINGSSL_bcm_text_end; -#endif - - assert_within(rodata_start, kPrimes, rodata_end); - assert_within(rodata_start, kP256Params, rodata_end); - assert_within(rodata_start, kPKCS1SigPrefixes, rodata_end); - - uint8_t result[SHA256_DIGEST_LENGTH]; - const EVP_MD *const kHashFunction = EVP_sha256(); - if (!boringssl_self_test_sha256() || - !boringssl_self_test_hmac_sha256()) { - return 0; - } - - static const uint8_t kHMACKey[64] = {0}; - unsigned result_len; - HMAC_CTX hmac_ctx; - HMAC_CTX_init(&hmac_ctx); - if (!HMAC_Init_ex(&hmac_ctx, kHMACKey, sizeof(kHMACKey), kHashFunction, - NULL /* no ENGINE */)) { - fprintf(stderr, "HMAC_Init_ex failed.\n"); - return 0; - } - - BORINGSSL_maybe_set_module_text_permissions(PROT_READ | PROT_EXEC); -#if defined(BORINGSSL_SHARED_LIBRARY) - uint64_t length = end - start; - HMAC_Update(&hmac_ctx, (const uint8_t *) &length, sizeof(length)); - HMAC_Update(&hmac_ctx, start, length); - - length = rodata_end - rodata_start; - 
HMAC_Update(&hmac_ctx, (const uint8_t *) &length, sizeof(length)); - HMAC_Update(&hmac_ctx, rodata_start, length); -#else - HMAC_Update(&hmac_ctx, start, end - start); -#endif - BORINGSSL_maybe_set_module_text_permissions(PROT_EXEC); - - if (!HMAC_Final(&hmac_ctx, result, &result_len) || - result_len != sizeof(result)) { - fprintf(stderr, "HMAC failed.\n"); - return 0; - } - HMAC_CTX_cleanse(&hmac_ctx); // FIPS 140-3, AS05.10. - - const uint8_t *expected = BORINGSSL_bcm_text_hash; - - if (!check_test(expected, result, sizeof(result), "FIPS integrity test")) { -#if !defined(BORINGSSL_FIPS_BREAK_TESTS) - return 0; -#endif - } - - OPENSSL_cleanse(result, sizeof(result)); // FIPS 140-3, AS05.10. - return 1; -} -#endif // OPENSSL_ASAN - -void BORINGSSL_FIPS_abort(void) { - for (;;) { - abort(); - exit(1); - } -} - -#endif // BORINGSSL_FIPS diff --git a/third_party/boringssl/src/crypto/fipsmodule/bcm.cc b/third_party/boringssl/src/crypto/fipsmodule/bcm.cc new file mode 100644 index 00000000..1a5a6fd6 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/bcm.cc @@ -0,0 +1,287 @@ +// Copyright 2017 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#if !defined(_GNU_SOURCE) +#define _GNU_SOURCE // needed for syscall() on Linux. 
+#endif + +#include + +#include +#if defined(BORINGSSL_FIPS) +#include +#include +#endif + +#include +#include +#include +#include + +#include "../bcm_support.h" +#include "../internal.h" +#include "bcm_interface.h" + +// The .cc.inc files are not written as headers, but .cc files which we +// currently need to combine together in the style of a unity or jumbo build. +// -Wheader-hygiene interprets them as headers. +// +// TODO(crbug.com/362530616): When delocate is removed, build these files as +// separate compilation units again. +OPENSSL_CLANG_PRAGMA("clang diagnostic push") +OPENSSL_CLANG_PRAGMA("clang diagnostic ignored \"-Wheader-hygiene\"") +#include "aes/aes.cc.inc" +#include "aes/aes_nohw.cc.inc" +#include "aes/cbc.cc.inc" +#include "aes/cfb.cc.inc" +#include "aes/ctr.cc.inc" +#include "aes/gcm.cc.inc" +#include "aes/gcm_nohw.cc.inc" +#include "aes/key_wrap.cc.inc" +#include "aes/mode_wrappers.cc.inc" +#include "aes/ofb.cc.inc" +#include "bn/add.cc.inc" +#include "bn/asm/x86_64-gcc.cc.inc" +#include "bn/bn.cc.inc" +#include "bn/bytes.cc.inc" +#include "bn/cmp.cc.inc" +#include "bn/ctx.cc.inc" +#include "bn/div.cc.inc" +#include "bn/div_extra.cc.inc" +#include "bn/exponentiation.cc.inc" +#include "bn/gcd.cc.inc" +#include "bn/gcd_extra.cc.inc" +#include "bn/generic.cc.inc" +#include "bn/jacobi.cc.inc" +#include "bn/montgomery.cc.inc" +#include "bn/montgomery_inv.cc.inc" +#include "bn/mul.cc.inc" +#include "bn/prime.cc.inc" +#include "bn/random.cc.inc" +#include "bn/rsaz_exp.cc.inc" +#include "bn/shift.cc.inc" +#include "bn/sqrt.cc.inc" +#include "cipher/aead.cc.inc" +#include "cipher/cipher.cc.inc" +#include "cipher/e_aes.cc.inc" +#include "cipher/e_aesccm.cc.inc" +#include "cmac/cmac.cc.inc" +#include "dh/check.cc.inc" +#include "dh/dh.cc.inc" +#include "digest/digest.cc.inc" +#include "digest/digests.cc.inc" +#include "digestsign/digestsign.cc.inc" +#include "ec/ec.cc.inc" +#include "ec/ec_key.cc.inc" +#include "ec/ec_montgomery.cc.inc" +#include 
"ec/felem.cc.inc" +#include "ec/oct.cc.inc" +#include "ec/p256-nistz.cc.inc" +#include "ec/p256.cc.inc" +#include "ec/scalar.cc.inc" +#include "ec/simple.cc.inc" +#include "ec/simple_mul.cc.inc" +#include "ec/util.cc.inc" +#include "ec/wnaf.cc.inc" +#include "ecdh/ecdh.cc.inc" +#include "ecdsa/ecdsa.cc.inc" +#include "entropy/jitter.cc.inc" +#include "hkdf/hkdf.cc.inc" +#include "hmac/hmac.cc.inc" +#include "keccak/keccak.cc.inc" +#include "mldsa/mldsa.cc.inc" +#include "mlkem/mlkem.cc.inc" +#include "rand/ctrdrbg.cc.inc" +#include "rand/rand.cc.inc" +#include "rsa/padding.cc.inc" +#include "rsa/rsa.cc.inc" +#include "rsa/rsa_impl.cc.inc" +#include "self_check/fips.cc.inc" +#include "self_check/self_check.cc.inc" +#include "service_indicator/service_indicator.cc.inc" +#include "sha/sha1.cc.inc" +#include "sha/sha256.cc.inc" +#include "sha/sha512.cc.inc" +#include "slhdsa/fors.cc.inc" +#include "slhdsa/merkle.cc.inc" +#include "slhdsa/slhdsa.cc.inc" +#include "slhdsa/thash.cc.inc" +#include "slhdsa/wots.cc.inc" +#include "tls/kdf.cc.inc" +OPENSSL_CLANG_PRAGMA("clang diagnostic pop") + + +using namespace bssl; + +#if defined(BORINGSSL_FIPS) + +#if !defined(OPENSSL_ASAN) + +// These symbols are filled in by delocate.go (in static builds) or a linker +// script (in shared builds). They point to the start and end of the module, and +// the location of the integrity hash, respectively. +extern const uint8_t BORINGSSL_bcm_text_start[]; +extern const uint8_t BORINGSSL_bcm_text_end[]; +extern const uint8_t BORINGSSL_bcm_text_hash[SHA256_DIGEST_LENGTH]; +#if defined(BORINGSSL_SHARED_LIBRARY) +extern const uint8_t BORINGSSL_bcm_rodata_start[]; +extern const uint8_t BORINGSSL_bcm_rodata_end[]; +#endif + +// assert_within is used to sanity check that certain symbols are within the +// bounds of the integrity check. It checks that start <= symbol < end and +// aborts otherwise. 
+static void assert_within(const void *start, const void *symbol, + const void *end) { + const uintptr_t start_val = (uintptr_t)start; + const uintptr_t symbol_val = (uintptr_t)symbol; + const uintptr_t end_val = (uintptr_t)end; + + if (start_val <= symbol_val && symbol_val < end_val) { + return; + } + + fprintf(CRYPTO_get_stderr(), + "FIPS module doesn't span expected symbol. Expected %p <= %p < %p\n", + start, symbol, end); + BORINGSSL_FIPS_abort(); +} + +#if defined(OPENSSL_ANDROID) && defined(OPENSSL_AARCH64) +static void BORINGSSL_maybe_set_module_text_permissions(int permission) { + // Android may be compiled in execute-only-memory mode, in which case the + // .text segment cannot be read. That conflicts with the need for a FIPS + // module to hash its own contents, therefore |mprotect| is used to make + // the module's .text readable for the duration of the hashing process. In + // other build configurations this is a no-op. + const uintptr_t page_size = getpagesize(); + const uintptr_t page_start = + ((uintptr_t)BORINGSSL_bcm_text_start) & ~(page_size - 1); + + if (mprotect((void *)page_start, + ((uintptr_t)BORINGSSL_bcm_text_end) - page_start, + permission) != 0) { + perror("BoringSSL: mprotect"); + } +} +#else +static void BORINGSSL_maybe_set_module_text_permissions(int permission) {} +#endif // !ANDROID + +#endif // !ASAN + +static void + __attribute__((constructor)) BORINGSSL_bcm_power_on_self_test(void) { +#if !defined(OPENSSL_ASAN) + // Integrity tests cannot run under ASAN because it involves reading the full + // .text section, which triggers the global-buffer overflow detection. 
+ if (!BORINGSSL_integrity_test()) { + goto err; + } +#endif // OPENSSL_ASAN + + if (!boringssl_self_test_startup()) { + goto err; + } + + return; + +err: + BORINGSSL_FIPS_abort(); +} + +#if !defined(OPENSSL_ASAN) +// BORINGSSL_integrity_test passes a fixed set of module symbols to +// |assert_within|, runs the SHA-256 and HMAC-SHA-256 self tests, and then +// computes HMAC-SHA-256 with an all-zero 64-byte key over the module. In the +// shared build the text and rodata regions are each hashed after their 64-bit +// length; in the static build only the text region is hashed, with no length +// prefix. The digest is handed to |BORINGSSL_check_test| together with +// |BORINGSSL_bcm_text_hash|. It returns one on success and zero on failure, +// except that a failed check is ignored when |BORINGSSL_FIPS_BREAK_TESTS| is +// defined. +int BORINGSSL_integrity_test() { + const uint8_t *const start = BORINGSSL_bcm_text_start; + const uint8_t *const end = BORINGSSL_bcm_text_end; + + assert_within(start, reinterpret_cast(BCM_aes_encrypt), end); + assert_within(start, reinterpret_cast(RSA_sign), end); + assert_within(start, reinterpret_cast(BCM_rand_bytes), end); + assert_within(start, reinterpret_cast(EC_GROUP_cmp), end); + assert_within(start, reinterpret_cast(BCM_sha256_update), end); + assert_within(start, reinterpret_cast(ecdsa_verify_fixed), end); + assert_within(start, reinterpret_cast(EVP_AEAD_CTX_seal), end); + +#if defined(BORINGSSL_SHARED_LIBRARY) + const uint8_t *const rodata_start = BORINGSSL_bcm_rodata_start; + const uint8_t *const rodata_end = BORINGSSL_bcm_rodata_end; +#else + // In the static build, read-only data is placed within the .text segment. 
+ const uint8_t *const rodata_start = BORINGSSL_bcm_text_start; + const uint8_t *const rodata_end = BORINGSSL_bcm_text_end; +#endif + + assert_within(rodata_start, kPrimes, rodata_end); + assert_within(rodata_start, kP256Field, rodata_end); + assert_within(rodata_start, kPKCS1SigPrefixes, rodata_end); + + uint8_t result[SHA256_DIGEST_LENGTH]; + if (!boringssl_self_test_sha256() || !boringssl_self_test_hmac_sha256()) { + return 0; + } + + static const uint8_t kHMACKey[64] = {0}; + unsigned result_len; + HMAC_CTX hmac_ctx; + HMAC_CTX_init(&hmac_ctx); + if (!HMAC_Init_ex(&hmac_ctx, kHMACKey, sizeof(kHMACKey), EVP_sha256(), + nullptr /* no ENGINE */)) { + fprintf(CRYPTO_get_stderr(), "HMAC_Init_ex failed.\n"); + return 0; + } + + BORINGSSL_maybe_set_module_text_permissions(PROT_READ | PROT_EXEC); +#if defined(BORINGSSL_SHARED_LIBRARY) + uint64_t length = end - start; + HMAC_Update(&hmac_ctx, (const uint8_t *)&length, sizeof(length)); + HMAC_Update(&hmac_ctx, start, length); + + length = rodata_end - rodata_start; + HMAC_Update(&hmac_ctx, (const uint8_t *)&length, sizeof(length)); + HMAC_Update(&hmac_ctx, rodata_start, length); +#else + HMAC_Update(&hmac_ctx, start, end - start); +#endif + BORINGSSL_maybe_set_module_text_permissions(PROT_EXEC); + + if (!HMAC_Final(&hmac_ctx, result, &result_len)) { + fprintf(CRYPTO_get_stderr(), "HMAC failed.\n"); + return 0; + } + HMAC_CTX_cleanse(&hmac_ctx); // FIPS 140-3, AS05.10. + + if (!BORINGSSL_check_test(BORINGSSL_bcm_text_hash, Span(result, result_len), + "FIPS integrity test")) { +#if !defined(BORINGSSL_FIPS_BREAK_TESTS) + return 0; +#endif + } + + OPENSSL_cleanse(result, sizeof(result)); // FIPS 140-3, AS05.10. 
+ return 1; +} + +// FIPS_module_hash returns |BORINGSSL_bcm_text_hash|, the value that +// |BORINGSSL_integrity_test| passes to |BORINGSSL_check_test|. +const uint8_t *FIPS_module_hash() { return BORINGSSL_bcm_text_hash; } + +#endif // OPENSSL_ASAN + +// BORINGSSL_FIPS_abort calls |abort|, followed by |exit|, inside an endless +// loop. +void bssl::BORINGSSL_FIPS_abort() { + for (;;) { + abort(); + exit(1); + } +} + +#endif // BORINGSSL_FIPS diff --git a/third_party/boringssl/src/crypto/fipsmodule/bcm_interface.h b/third_party/boringssl/src/crypto/fipsmodule/bcm_interface.h new file mode 100644 index 00000000..d4ea95d5 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/bcm_interface.h @@ -0,0 +1,959 @@ +// Copyright 2024 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_CRYPTO_FIPSMODULE_BCM_INTERFACE_H +#define OPENSSL_HEADER_CRYPTO_FIPSMODULE_BCM_INTERFACE_H + +#include +#include +#include +#include +#include + + +// This header will eventually become the interface between BCM and the +// rest of libcrypto. More cleanly separating the two is still a work in +// progress (see https://crbug.com/boringssl/722) so, at the moment, we +// consider this no different from any other header in BCM. +// +// Over time, calls from libcrypto to BCM will all move to this header +// and the separation will become more meaningful. 
+ +BSSL_NAMESPACE_BEGIN + +// Enumerated types for return values from bcm functions, both infallible +// and fallible functions. Two success values are used to correspond to the +// FIPS service indicator. For the moment, the official service indicator +// remains the counter, not these values. 
Once we fully transition to +// these return values from bcm we will change that. +enum class bcm_infallible_t { + approved, + not_approved, +}; + +enum class bcm_status_t { + approved, + not_approved, + failure, +}; +typedef enum bcm_status_t bcm_status; +typedef enum bcm_infallible_t bcm_infallible; + +// bcm_success returns one if |status| is |bcm_status::approved| or +// |bcm_status::not_approved|, and zero otherwise. +inline int bcm_success(bcm_status status) { + return status == bcm_status::approved || status == bcm_status::not_approved; +} + +// bcm_as_approved_status maps a non-zero |result| to |bcm_status::approved| and +// a zero |result| to |bcm_status::failure|. +inline bcm_status_t bcm_as_approved_status(int result) { + return result ? bcm_status::approved : bcm_status::failure; +} + +// bcm_as_not_approved_status maps a non-zero |result| to +// |bcm_status::not_approved| and a zero |result| to |bcm_status::failure|. +inline bcm_status_t bcm_as_not_approved_status(int result) { + return result ? bcm_status::not_approved : bcm_status::failure; +} + +// Random number generator. + +#if defined(BORINGSSL_FIPS) + +// We overread from /dev/urandom or RDRAND by a factor of 10 and XOR to whiten. +// TODO(bbe): disentangle this value which is used to calculate the size of the +// stack buffer in RAND_need entropy based on a calculation. +#define BORINGSSL_FIPS_OVERREAD 10 + +#endif // BORINGSSL_FIPS + +// BCM_rand_load_entropy supplies |entropy_len| bytes of entropy to the BCM +// module. The |want_additional_input| parameter is true iff the entropy was +// obtained from a source other than the system, e.g. directly from the CPU. +bcm_infallible BCM_rand_load_entropy(const uint8_t *entropy, size_t entropy_len, + int want_additional_input); + +// BCM_rand_bytes is the same as the public |RAND_bytes| function, other +// than returning a bcm_infallible status indicator. +bcm_infallible BCM_rand_bytes(uint8_t *out, size_t out_len); + +// BCM_rand_bytes_hwrng attempts to fill |out| with |len| bytes of entropy from +// the CPU hardware random number generator if one is present. +// |bcm_status::approved| is returned on success, and a failure status is +// returned otherwise. 
+bcm_status BCM_rand_bytes_hwrng(uint8_t *out, size_t len); + +// BCM_rand_bytes_with_additional_data samples from the RNG after mixing 32 +// bytes from |user_additional_data| in. +bcm_infallible BCM_rand_bytes_with_additional_data( + uint8_t *out, size_t out_len, const uint8_t user_additional_data[32]); + + +// SHA-1 + +// BCM_sha1_init initialises |sha|. +bcm_infallible BCM_sha1_init(SHA_CTX *sha); + +// BCM_sha1_transform is a low-level function that performs a single, SHA-1 +// block transformation using the state from |c| and |SHA_CBLOCK| bytes from +// |data|. +bcm_infallible BCM_sha1_transform(SHA_CTX *c, const uint8_t data[SHA_CBLOCK]); + +// BCM_sha1_update adds |len| bytes from |data| to |c|. +bcm_infallible BCM_sha1_update(SHA_CTX *c, const void *data, size_t len); + +// BCM_sha1_final adds the final padding to |c| and writes the resulting +// digest to |out|, which must have at least |SHA_DIGEST_LENGTH| bytes of space. +bcm_infallible BCM_sha1_final(uint8_t out[SHA_DIGEST_LENGTH], SHA_CTX *c); + + +// BCM_fips_186_2_prf derives |out_len| bytes from |xkey| using the PRF +// defined in FIPS 186-2, Appendix 3.1, with change notice 1 applied. The b +// parameter is 160 and seed, XKEY, is also 160 bits. The optional XSEED user +// input is all zeros. +// +// The PRF generates a sequence of 320-bit numbers. Each number is encoded as a +// 40-byte string in big-endian and then concatenated to form |out|. If +// |out_len| is not a multiple of 40, the result is truncated. This matches the +// construction used in Section 7 of RFC 4186 and Section 7 of RFC 4187. +// +// This PRF is based on SHA-1, a weak hash function, and should not be used +// in new protocols. It is provided for compatibility with some legacy EAP +// methods. +bcm_infallible BCM_fips_186_2_prf(uint8_t *out, size_t out_len, + const uint8_t xkey[SHA_DIGEST_LENGTH]); + + +// SHA-224 + +// BCM_sha224_init initialises |sha|. 
+bcm_infallible BCM_sha224_init(SHA256_CTX *sha); + +// BCM_sha224_update adds |len| bytes from |data| to |sha|. +bcm_infallible BCM_sha224_update(SHA256_CTX *sha, const void *data, size_t len); + +// BCM_sha224_final adds the final padding to |sha| and writes the resulting +// digest to |out|, which must have at least |SHA224_DIGEST_LENGTH| bytes of +// space. It aborts on programmer error. +bcm_infallible BCM_sha224_final(uint8_t out[SHA224_DIGEST_LENGTH], + SHA256_CTX *sha); + + +// SHA-256 + +// BCM_sha256_init initialises |sha|. +bcm_infallible BCM_sha256_init(SHA256_CTX *sha); + +// BCM_sha256_update adds |len| bytes from |data| to |sha|. +bcm_infallible BCM_sha256_update(SHA256_CTX *sha, const void *data, size_t len); + +// BCM_sha256_final adds the final padding to |sha| and writes the resulting +// digest to |out|, which must have at least |SHA256_DIGEST_LENGTH| bytes of +// space. It aborts on programmer error. +bcm_infallible BCM_sha256_final(uint8_t out[SHA256_DIGEST_LENGTH], + SHA256_CTX *sha); + +// BCM_sha256_transform is a low-level function that performs a single, SHA-256 +// block transformation using the state from |sha| and |SHA256_CBLOCK| bytes +// from |block|. +bcm_infallible BCM_sha256_transform(SHA256_CTX *sha, + const uint8_t block[SHA256_CBLOCK]); + +// BCM_sha256_transform_blocks is a low-level function that takes |num_blocks| * +// |SHA256_CBLOCK| bytes of data and performs SHA-256 transforms on it to update +// |state|. +bcm_infallible BCM_sha256_transform_blocks(uint32_t state[8], + const uint8_t *data, + size_t num_blocks); + + +// SHA-384. + +// BCM_sha384_init initialises |sha|. +bcm_infallible BCM_sha384_init(SHA512_CTX *sha); + +// BCM_sha384_update adds |len| bytes from |data| to |sha|. 
+bcm_infallible BCM_sha384_update(SHA512_CTX *sha, const void *data, size_t len); + +// BCM_sha384_final adds the final padding to |sha| and writes the resulting +// digest to |out|, which must have at least |SHA384_DIGEST_LENGTH| bytes of +// space. It may abort on programmer error. +bcm_infallible BCM_sha384_final(uint8_t out[SHA384_DIGEST_LENGTH], + SHA512_CTX *sha); + + +// SHA-512. + +// BCM_sha512_init initialises |sha|. +bcm_infallible BCM_sha512_init(SHA512_CTX *sha); + +// BCM_sha512_update adds |len| bytes from |data| to |sha|. +bcm_infallible BCM_sha512_update(SHA512_CTX *sha, const void *data, size_t len); + +// BCM_sha512_final adds the final padding to |sha| and writes the resulting +// digest to |out|, which must have at least |SHA512_DIGEST_LENGTH| bytes of +// space. +bcm_infallible BCM_sha512_final(uint8_t out[SHA512_DIGEST_LENGTH], + SHA512_CTX *sha); + +// BCM_sha512_transform is a low-level function that performs a single, SHA-512 +// block transformation using the state from |sha| and |SHA512_CBLOCK| bytes +// from |block|. +bcm_infallible BCM_sha512_transform(SHA512_CTX *sha, + const uint8_t block[SHA512_CBLOCK]); + + +// SHA-512-256 +// +// See https://nvlpubs.nist.gov/nistpubs/FIPS/NIST.FIPS.180-4.pdf section 5.3.6 + +// BCM_sha512_256_init initialises |sha|. +bcm_infallible BCM_sha512_256_init(SHA512_CTX *sha); + +// BCM_sha512_256_update adds |len| bytes from |data| to |sha|. +bcm_infallible BCM_sha512_256_update(SHA512_CTX *sha, const void *data, + size_t len); + +// BCM_sha512_256_final adds the final padding to |sha| and writes the resulting +// digest to |out|, which must have at least |SHA512_256_DIGEST_LENGTH| bytes of +// space. It may abort on programmer error. +bcm_infallible BCM_sha512_256_final(uint8_t out[SHA512_256_DIGEST_LENGTH], + SHA512_CTX *sha); + + +// ML-DSA +// +// Where not commented, these functions have the same signature as the +// corresponding public function. 
+ +// BCM_MLDSA_SIGNATURE_RANDOMIZER_BYTES is the number of bytes of uniformly +// random entropy necessary to generate a signature in randomized mode. +#define BCM_MLDSA_SIGNATURE_RANDOMIZER_BYTES 32 + +// BCM_MLDSA65_PRIVATE_KEY_BYTES is the number of bytes in an encoded ML-DSA-65 +// private key. +#define BCM_MLDSA65_PRIVATE_KEY_BYTES 4032 + +OPENSSL_EXPORT bcm_status BCM_mldsa65_generate_key( + uint8_t out_encoded_public_key[MLDSA65_PUBLIC_KEY_BYTES], + uint8_t out_seed[MLDSA_SEED_BYTES], MLDSA65_private_key *out_private_key); + +OPENSSL_EXPORT bcm_status BCM_mldsa65_private_key_from_seed( + MLDSA65_private_key *out_private_key, const uint8_t seed[MLDSA_SEED_BYTES]); + +OPENSSL_EXPORT bcm_status BCM_mldsa65_public_from_private( + MLDSA65_public_key *out_public_key, const MLDSA65_private_key *private_key); + +// BCM_mldsa65_public_of_private returns the public half of |private_key|. +const MLDSA65_public_key *BCM_mldsa65_public_of_private( + const MLDSA65_private_key *private_key); + +OPENSSL_EXPORT bcm_status +BCM_mldsa65_check_key_fips(MLDSA65_private_key *private_key); + +OPENSSL_EXPORT bcm_status BCM_mldsa65_generate_key_fips( + uint8_t out_encoded_public_key[MLDSA65_PUBLIC_KEY_BYTES], + uint8_t out_seed[MLDSA_SEED_BYTES], MLDSA65_private_key *out_private_key); + +OPENSSL_EXPORT bcm_status BCM_mldsa65_private_key_from_seed_fips( + MLDSA65_private_key *out_private_key, const uint8_t seed[MLDSA_SEED_BYTES]); + +OPENSSL_EXPORT bcm_status +BCM_mldsa65_sign(uint8_t out_encoded_signature[MLDSA65_SIGNATURE_BYTES], + const MLDSA65_private_key *private_key, const uint8_t *msg, + size_t msg_len, const uint8_t *context, size_t context_len); + +OPENSSL_EXPORT bcm_status BCM_mldsa65_verify( + const MLDSA65_public_key *public_key, + const uint8_t signature[MLDSA65_SIGNATURE_BYTES], const uint8_t *msg, + size_t msg_len, const uint8_t *context, size_t context_len); + +OPENSSL_EXPORT void BCM_mldsa65_prehash_init( + MLDSA65_prehash *out_prehash_ctx, const MLDSA65_public_key 
*public_key, + const uint8_t *context, size_t context_len); + +OPENSSL_EXPORT void BCM_mldsa65_prehash_update( + MLDSA65_prehash *inout_prehash_ctx, const uint8_t *msg, size_t msg_len); + +OPENSSL_EXPORT void BCM_mldsa65_prehash_finalize( + uint8_t out_msg_rep[MLDSA_MU_BYTES], MLDSA65_prehash *inout_prehash_ctx); + +OPENSSL_EXPORT bcm_status BCM_mldsa65_sign_message_representative( + uint8_t out_encoded_signature[MLDSA65_SIGNATURE_BYTES], + const MLDSA65_private_key *private_key, + const uint8_t msg_rep[MLDSA_MU_BYTES]); + +OPENSSL_EXPORT bcm_status BCM_mldsa65_verify_message_representative( + const MLDSA65_public_key *public_key, + const uint8_t signature[MLDSA65_SIGNATURE_BYTES], + const uint8_t msg_rep[MLDSA_MU_BYTES]); + +OPENSSL_EXPORT bcm_status +BCM_mldsa65_marshal_public_key(CBB *out, const MLDSA65_public_key *public_key); + +OPENSSL_EXPORT bcm_status +BCM_mldsa65_parse_public_key(MLDSA65_public_key *public_key, CBS *in); + +OPENSSL_EXPORT bcm_status +BCM_mldsa65_parse_private_key(MLDSA65_private_key *private_key, CBS *in); + +// BCM_mldsa65_generate_key_external_entropy generates a public/private key pair +// using the given seed, writes the encoded public key to +// |out_encoded_public_key| and sets |out_private_key| to the private key. +OPENSSL_EXPORT bcm_status BCM_mldsa65_generate_key_external_entropy( + uint8_t out_encoded_public_key[MLDSA65_PUBLIC_KEY_BYTES], + MLDSA65_private_key *out_private_key, + const uint8_t entropy[MLDSA_SEED_BYTES]); + +OPENSSL_EXPORT bcm_status BCM_mldsa65_generate_key_external_entropy_fips( + uint8_t out_encoded_public_key[MLDSA65_PUBLIC_KEY_BYTES], + MLDSA65_private_key *out_private_key, + const uint8_t entropy[MLDSA_SEED_BYTES]); + +// BCM_mldsa65_sign_internal signs |msg| using |private_key| and writes the +// signature to |out_encoded_signature|. The |context_prefix| and |context| are +// prefixed to the message, in that order, before signing. 
The |randomizer| +// value can be set to zero bytes in order to make a deterministic signature, or +// else filled with entropy for the usual |MLDSA_sign| behavior. +OPENSSL_EXPORT bcm_status BCM_mldsa65_sign_internal( + uint8_t out_encoded_signature[MLDSA65_SIGNATURE_BYTES], + const MLDSA65_private_key *private_key, const uint8_t *msg, size_t msg_len, + const uint8_t *context_prefix, size_t context_prefix_len, + const uint8_t *context, size_t context_len, + const uint8_t randomizer[BCM_MLDSA_SIGNATURE_RANDOMIZER_BYTES]); + +OPENSSL_EXPORT bcm_status BCM_mldsa65_sign_mu_internal( + uint8_t out_encoded_signature[MLDSA65_SIGNATURE_BYTES], + const MLDSA65_private_key *private_key, + const uint8_t msg_rep[MLDSA_MU_BYTES], + const uint8_t randomizer[BCM_MLDSA_SIGNATURE_RANDOMIZER_BYTES]); + +// BCM_mldsa65_verify_internal verifies that |encoded_signature| is a valid +// signature of |msg| by |public_key|. The |context_prefix| and |context| are +// prefixed to the message before verification, in that order. +OPENSSL_EXPORT bcm_status BCM_mldsa65_verify_internal( + const MLDSA65_public_key *public_key, + const uint8_t encoded_signature[MLDSA65_SIGNATURE_BYTES], + const uint8_t *msg, size_t msg_len, const uint8_t *context_prefix, + size_t context_prefix_len, const uint8_t *context, size_t context_len); + +// BCM_mldsa65_marshal_private_key serializes |private_key| to |out| in the +// NIST format for ML-DSA-65 private keys. +OPENSSL_EXPORT bcm_status BCM_mldsa65_marshal_private_key( + CBB *out, const MLDSA65_private_key *private_key); + +// BCM_mldsa65_public_keys_equal returns one if |a| and |b| are equal and zero +// otherwise. +int BCM_mldsa65_public_keys_equal(const MLDSA65_public_key *a, + const MLDSA65_public_key *b); + + +// BCM_MLDSA87_PRIVATE_KEY_BYTES is the number of bytes in an encoded ML-DSA-87 +// private key. 
+#define BCM_MLDSA87_PRIVATE_KEY_BYTES 4896 + +OPENSSL_EXPORT bcm_status BCM_mldsa87_generate_key( + uint8_t out_encoded_public_key[MLDSA87_PUBLIC_KEY_BYTES], + uint8_t out_seed[MLDSA_SEED_BYTES], MLDSA87_private_key *out_private_key); + +OPENSSL_EXPORT bcm_status BCM_mldsa87_private_key_from_seed( + MLDSA87_private_key *out_private_key, const uint8_t seed[MLDSA_SEED_BYTES]); + +OPENSSL_EXPORT bcm_status BCM_mldsa87_public_from_private( + MLDSA87_public_key *out_public_key, const MLDSA87_private_key *private_key); + +// BCM_mldsa87_public_of_private returns the public half of |private_key|. +const MLDSA87_public_key *BCM_mldsa87_public_of_private( + const MLDSA87_private_key *private_key); + +OPENSSL_EXPORT bcm_status +BCM_mldsa87_check_key_fips(MLDSA87_private_key *private_key); + +OPENSSL_EXPORT bcm_status BCM_mldsa87_generate_key_fips( + uint8_t out_encoded_public_key[MLDSA87_PUBLIC_KEY_BYTES], + uint8_t out_seed[MLDSA_SEED_BYTES], MLDSA87_private_key *out_private_key); + +OPENSSL_EXPORT bcm_status BCM_mldsa87_private_key_from_seed_fips( + MLDSA87_private_key *out_private_key, const uint8_t seed[MLDSA_SEED_BYTES]); + +OPENSSL_EXPORT bcm_status +BCM_mldsa87_sign(uint8_t out_encoded_signature[MLDSA87_SIGNATURE_BYTES], + const MLDSA87_private_key *private_key, const uint8_t *msg, + size_t msg_len, const uint8_t *context, size_t context_len); + +OPENSSL_EXPORT bcm_status +BCM_mldsa87_verify(const MLDSA87_public_key *public_key, + const uint8_t *signature, const uint8_t *msg, size_t msg_len, + const uint8_t *context, size_t context_len); + +OPENSSL_EXPORT void BCM_mldsa87_prehash_init( + MLDSA87_prehash *out_prehash_ctx, const MLDSA87_public_key *public_key, + const uint8_t *context, size_t context_len); + +OPENSSL_EXPORT void BCM_mldsa87_prehash_update( + MLDSA87_prehash *inout_prehash_ctx, const uint8_t *msg, size_t msg_len); + +OPENSSL_EXPORT void BCM_mldsa87_prehash_finalize( + uint8_t out_msg_rep[MLDSA_MU_BYTES], MLDSA87_prehash *inout_prehash_ctx); + 
+OPENSSL_EXPORT bcm_status BCM_mldsa87_sign_message_representative( + uint8_t out_encoded_signature[MLDSA87_SIGNATURE_BYTES], + const MLDSA87_private_key *private_key, + const uint8_t msg_rep[MLDSA_MU_BYTES]); + +OPENSSL_EXPORT bcm_status BCM_mldsa87_verify_message_representative( + const MLDSA87_public_key *public_key, + const uint8_t signature[MLDSA87_SIGNATURE_BYTES], + const uint8_t msg_rep[MLDSA_MU_BYTES]); + +OPENSSL_EXPORT bcm_status +BCM_mldsa87_marshal_public_key(CBB *out, const MLDSA87_public_key *public_key); + +OPENSSL_EXPORT bcm_status +BCM_mldsa87_parse_public_key(MLDSA87_public_key *public_key, CBS *in); + +OPENSSL_EXPORT bcm_status +BCM_mldsa87_parse_private_key(MLDSA87_private_key *private_key, CBS *in); + +// BCM_mldsa87_generate_key_external_entropy generates a public/private key pair +// using the given seed, writes the encoded public key to +// |out_encoded_public_key| and sets |out_private_key| to the private key. +OPENSSL_EXPORT bcm_status BCM_mldsa87_generate_key_external_entropy( + uint8_t out_encoded_public_key[MLDSA87_PUBLIC_KEY_BYTES], + MLDSA87_private_key *out_private_key, + const uint8_t entropy[MLDSA_SEED_BYTES]); + +OPENSSL_EXPORT bcm_status BCM_mldsa87_generate_key_external_entropy_fips( + uint8_t out_encoded_public_key[MLDSA87_PUBLIC_KEY_BYTES], + MLDSA87_private_key *out_private_key, + const uint8_t entropy[MLDSA_SEED_BYTES]); + +// BCM_mldsa87_sign_internal signs |msg| using |private_key| and writes the +// signature to |out_encoded_signature|. The |context_prefix| and |context| are +// prefixed to the message, in that order, before signing. The |randomizer| +// value can be set to zero bytes in order to make a deterministic signature, or +// else filled with entropy for the usual |MLDSA_sign| behavior. 
+OPENSSL_EXPORT bcm_status BCM_mldsa87_sign_internal( + uint8_t out_encoded_signature[MLDSA87_SIGNATURE_BYTES], + const MLDSA87_private_key *private_key, const uint8_t *msg, size_t msg_len, + const uint8_t *context_prefix, size_t context_prefix_len, + const uint8_t *context, size_t context_len, + const uint8_t randomizer[BCM_MLDSA_SIGNATURE_RANDOMIZER_BYTES]); + +OPENSSL_EXPORT bcm_status BCM_mldsa87_sign_mu_internal( + uint8_t out_encoded_signature[MLDSA87_SIGNATURE_BYTES], + const MLDSA87_private_key *private_key, + const uint8_t msg_rep[MLDSA_MU_BYTES], + const uint8_t randomizer[BCM_MLDSA_SIGNATURE_RANDOMIZER_BYTES]); + +// BCM_mldsa87_verify_internal verifies that |encoded_signature| is a valid +// signature of |msg| by |public_key|. The |context_prefix| and |context| are +// prefixed to the message before verification, in that order. +OPENSSL_EXPORT bcm_status BCM_mldsa87_verify_internal( + const MLDSA87_public_key *public_key, + const uint8_t encoded_signature[MLDSA87_SIGNATURE_BYTES], + const uint8_t *msg, size_t msg_len, const uint8_t *context_prefix, + size_t context_prefix_len, const uint8_t *context, size_t context_len); + +// BCM_mldsa87_marshal_private_key serializes |private_key| to |out| in the +// NIST format for ML-DSA-87 private keys. +OPENSSL_EXPORT bcm_status BCM_mldsa87_marshal_private_key( + CBB *out, const MLDSA87_private_key *private_key); + +// BCM_mldsa87_public_keys_equal returns one if |a| and |b| are equal and zero +// otherwise. +int BCM_mldsa87_public_keys_equal(const MLDSA87_public_key *a, + const MLDSA87_public_key *b); + +// BCM_MLDSA44_PRIVATE_KEY_BYTES is the number of bytes in an encoded ML-DSA-44 +// private key. 
+#define BCM_MLDSA44_PRIVATE_KEY_BYTES 2560 + +OPENSSL_EXPORT bcm_status BCM_mldsa44_generate_key( + uint8_t out_encoded_public_key[MLDSA44_PUBLIC_KEY_BYTES], + uint8_t out_seed[MLDSA_SEED_BYTES], MLDSA44_private_key *out_private_key); + +OPENSSL_EXPORT bcm_status BCM_mldsa44_private_key_from_seed( + MLDSA44_private_key *out_private_key, const uint8_t seed[MLDSA_SEED_BYTES]); + +OPENSSL_EXPORT bcm_status BCM_mldsa44_public_from_private( + MLDSA44_public_key *out_public_key, const MLDSA44_private_key *private_key); + +// BCM_mldsa44_public_of_private returns the public half of |private_key|. +const MLDSA44_public_key *BCM_mldsa44_public_of_private( + const MLDSA44_private_key *private_key); + +OPENSSL_EXPORT bcm_status +BCM_mldsa44_check_key_fips(MLDSA44_private_key *private_key); + +OPENSSL_EXPORT bcm_status BCM_mldsa44_generate_key_fips( + uint8_t out_encoded_public_key[MLDSA44_PUBLIC_KEY_BYTES], + uint8_t out_seed[MLDSA_SEED_BYTES], MLDSA44_private_key *out_private_key); + +OPENSSL_EXPORT bcm_status BCM_mldsa44_private_key_from_seed_fips( + MLDSA44_private_key *out_private_key, const uint8_t seed[MLDSA_SEED_BYTES]); + +OPENSSL_EXPORT bcm_status +BCM_mldsa44_sign(uint8_t out_encoded_signature[MLDSA44_SIGNATURE_BYTES], + const MLDSA44_private_key *private_key, const uint8_t *msg, + size_t msg_len, const uint8_t *context, size_t context_len); + +OPENSSL_EXPORT bcm_status +BCM_mldsa44_verify(const MLDSA44_public_key *public_key, + const uint8_t *signature, const uint8_t *msg, size_t msg_len, + const uint8_t *context, size_t context_len); + +OPENSSL_EXPORT void BCM_mldsa44_prehash_init( + MLDSA44_prehash *out_prehash_ctx, const MLDSA44_public_key *public_key, + const uint8_t *context, size_t context_len); + +OPENSSL_EXPORT void BCM_mldsa44_prehash_update( + MLDSA44_prehash *inout_prehash_ctx, const uint8_t *msg, size_t msg_len); + +OPENSSL_EXPORT void BCM_mldsa44_prehash_finalize( + uint8_t out_msg_rep[MLDSA_MU_BYTES], MLDSA44_prehash *inout_prehash_ctx); + 
+OPENSSL_EXPORT bcm_status BCM_mldsa44_sign_message_representative( + uint8_t out_encoded_signature[MLDSA44_SIGNATURE_BYTES], + const MLDSA44_private_key *private_key, + const uint8_t msg_rep[MLDSA_MU_BYTES]); + +OPENSSL_EXPORT bcm_status BCM_mldsa44_verify_message_representative( + const MLDSA44_public_key *public_key, + const uint8_t signature[MLDSA44_SIGNATURE_BYTES], + const uint8_t msg_rep[MLDSA_MU_BYTES]); + +OPENSSL_EXPORT bcm_status +BCM_mldsa44_marshal_public_key(CBB *out, const MLDSA44_public_key *public_key); + +OPENSSL_EXPORT bcm_status +BCM_mldsa44_parse_public_key(MLDSA44_public_key *public_key, CBS *in); + +OPENSSL_EXPORT bcm_status +BCM_mldsa44_parse_private_key(MLDSA44_private_key *private_key, CBS *in); + +// BCM_mldsa44_generate_key_external_entropy generates a public/private key pair +// using the given seed, writes the encoded public key to +// |out_encoded_public_key| and sets |out_private_key| to the private key. +OPENSSL_EXPORT bcm_status BCM_mldsa44_generate_key_external_entropy( + uint8_t out_encoded_public_key[MLDSA44_PUBLIC_KEY_BYTES], + MLDSA44_private_key *out_private_key, + const uint8_t entropy[MLDSA_SEED_BYTES]); + +OPENSSL_EXPORT bcm_status BCM_mldsa44_generate_key_external_entropy_fips( + uint8_t out_encoded_public_key[MLDSA44_PUBLIC_KEY_BYTES], + MLDSA44_private_key *out_private_key, + const uint8_t entropy[MLDSA_SEED_BYTES]); + +// BCM_mldsa44_sign_internal signs |msg| using |private_key| and writes the +// signature to |out_encoded_signature|. The |context_prefix| and |context| are +// prefixed to the message, in that order, before signing. The |randomizer| +// value can be set to zero bytes in order to make a deterministic signature, or +// else filled with entropy for the usual |MLDSA_sign| behavior. 
+OPENSSL_EXPORT bcm_status BCM_mldsa44_sign_internal( + uint8_t out_encoded_signature[MLDSA44_SIGNATURE_BYTES], + const MLDSA44_private_key *private_key, const uint8_t *msg, size_t msg_len, + const uint8_t *context_prefix, size_t context_prefix_len, + const uint8_t *context, size_t context_len, + const uint8_t randomizer[BCM_MLDSA_SIGNATURE_RANDOMIZER_BYTES]); + +OPENSSL_EXPORT bcm_status BCM_mldsa44_sign_mu_internal( + uint8_t out_encoded_signature[MLDSA44_SIGNATURE_BYTES], + const MLDSA44_private_key *private_key, + const uint8_t msg_rep[MLDSA_MU_BYTES], + const uint8_t randomizer[BCM_MLDSA_SIGNATURE_RANDOMIZER_BYTES]); + +// BCM_mldsa44_verify_internal verifies that |encoded_signature| is a valid +// signature of |msg| by |public_key|. The |context_prefix| and |context| are +// prefixed to the message before verification, in that order. +OPENSSL_EXPORT bcm_status BCM_mldsa44_verify_internal( + const MLDSA44_public_key *public_key, + const uint8_t encoded_signature[MLDSA44_SIGNATURE_BYTES], + const uint8_t *msg, size_t msg_len, const uint8_t *context_prefix, + size_t context_prefix_len, const uint8_t *context, size_t context_len); + +// BCM_mldsa44_marshal_private_key serializes |private_key| to |out| in the +// NIST format for ML-DSA-44 private keys. +OPENSSL_EXPORT bcm_status BCM_mldsa44_marshal_private_key( + CBB *out, const MLDSA44_private_key *private_key); + +// BCM_mldsa44_public_keys_equal returns one if |a| and |b| are equal and zero +// otherwise. +int BCM_mldsa44_public_keys_equal(const MLDSA44_public_key *a, + const MLDSA44_public_key *b); + + +// ML-KEM +// +// Where not commented, these functions have the same signature as the +// corresponding public function. + +// BCM_MLKEM_ENCAP_ENTROPY is the number of bytes of uniformly random entropy +// necessary to encapsulate a secret. The entropy will be leaked to the +// decapsulating party. 
+#define BCM_MLKEM_ENCAP_ENTROPY 32 + +// BCM_MLKEM768_PRIVATE_KEY_BYTES is the length of the data produced by +// |BCM_mlkem768_marshal_private_key|. +#define BCM_MLKEM768_PRIVATE_KEY_BYTES 2400 + +// BCM_MLKEM1024_PRIVATE_KEY_BYTES is the length of the data produced by +// |BCM_mlkem1024_marshal_private_key|. +#define BCM_MLKEM1024_PRIVATE_KEY_BYTES 3168 + +OPENSSL_EXPORT bcm_infallible BCM_mlkem768_generate_key( + uint8_t out_encoded_public_key[MLKEM768_PUBLIC_KEY_BYTES], + uint8_t optional_out_seed[MLKEM_SEED_BYTES], + MLKEM768_private_key *out_private_key); + +OPENSSL_EXPORT bcm_status +BCM_mlkem768_private_key_from_seed(MLKEM768_private_key *out_private_key, + const uint8_t *seed, size_t seed_len); + +OPENSSL_EXPORT bcm_status BCM_mlkem768_generate_key_fips( + uint8_t out_encoded_public_key[MLKEM768_PUBLIC_KEY_BYTES], + uint8_t optional_out_seed[MLKEM_SEED_BYTES], + MLKEM768_private_key *out_private_key); + +OPENSSL_EXPORT bcm_status +BCM_mlkem768_check_fips(const MLKEM768_private_key *private_key); + +OPENSSL_EXPORT bcm_infallible +BCM_mlkem768_public_from_private(MLKEM768_public_key *out_public_key, + const MLKEM768_private_key *private_key); + +OPENSSL_EXPORT const MLKEM768_public_key *BCM_mlkem768_public_of_private( + const MLKEM768_private_key *private_key); + +OPENSSL_EXPORT bcm_infallible +BCM_mlkem768_encap(uint8_t out_ciphertext[MLKEM768_CIPHERTEXT_BYTES], + uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES], + const MLKEM768_public_key *public_key); + +OPENSSL_EXPORT bcm_status +BCM_mlkem768_decap(uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES], + const uint8_t *ciphertext, size_t ciphertext_len, + const MLKEM768_private_key *private_key); + +OPENSSL_EXPORT bcm_status BCM_mlkem768_marshal_public_key( + CBB *out, const MLKEM768_public_key *public_key); + +// BCM_mlkem768_public_keys_equal returns one if |a| and |b| are equal and zero +// otherwise. 
+int BCM_mlkem768_public_keys_equal(const MLKEM768_public_key *a, + const MLKEM768_public_key *b); + +OPENSSL_EXPORT bcm_status +BCM_mlkem768_parse_public_key(MLKEM768_public_key *out_public_key, CBS *in); + +// BCM_mlkem768_parse_private_key parses a private key, in NIST's format for +// private keys, from |in| and writes the result to |out_private_key|. It +// returns one on success or zero on parse error or if there are trailing bytes +// in |in|. This format is verbose and should be avoided. Private keys should be +// stored as seeds and parsed using |BCM_mlkem768_private_key_from_seed|. +OPENSSL_EXPORT bcm_status +BCM_mlkem768_parse_private_key(MLKEM768_private_key *out_private_key, CBS *in); + +// BCM_mlkem768_generate_key_external_seed is a deterministic function to create +// a pair of ML-KEM-768 keys, using the supplied seed. The seed needs to be +// uniformly random. This function should only be used for tests; regular +// callers should use the non-deterministic |BCM_mlkem768_generate_key| +// directly. +OPENSSL_EXPORT bcm_infallible BCM_mlkem768_generate_key_external_seed( + uint8_t out_encoded_public_key[MLKEM768_PUBLIC_KEY_BYTES], + MLKEM768_private_key *out_private_key, + const uint8_t seed[MLKEM_SEED_BYTES]); + +// BCM_mlkem768_encap_external_entropy behaves like |MLKEM768_encap|, but uses +// |MLKEM_ENCAP_ENTROPY| bytes of |entropy| for randomization. The decapsulating +// side will be able to recover |entropy| in full. This function should only be +// used for tests, regular callers should use the non-deterministic +// |BCM_mlkem768_encap| directly. +OPENSSL_EXPORT bcm_infallible BCM_mlkem768_encap_external_entropy( + uint8_t out_ciphertext[MLKEM768_CIPHERTEXT_BYTES], + uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES], + const MLKEM768_public_key *public_key, + const uint8_t entropy[BCM_MLKEM_ENCAP_ENTROPY]); + +// BCM_mlkem768_marshal_private_key serializes |private_key| to |out| in the +// NIST format for ML-KEM-768 private keys. 
(Note that one can also save just +// the seed value produced by |BCM_mlkem768_generate_key|, which is +// significantly smaller.) +OPENSSL_EXPORT bcm_status BCM_mlkem768_marshal_private_key( + CBB *out, const MLKEM768_private_key *private_key); + +OPENSSL_EXPORT bcm_infallible BCM_mlkem1024_generate_key( + uint8_t out_encoded_public_key[MLKEM1024_PUBLIC_KEY_BYTES], + uint8_t optional_out_seed[MLKEM_SEED_BYTES], + MLKEM1024_private_key *out_private_key); + +OPENSSL_EXPORT bcm_status BCM_mlkem1024_generate_key_fips( + uint8_t out_encoded_public_key[MLKEM1024_PUBLIC_KEY_BYTES], + uint8_t optional_out_seed[MLKEM_SEED_BYTES], + MLKEM1024_private_key *out_private_key); + +OPENSSL_EXPORT bcm_status +BCM_mlkem1024_check_fips(const MLKEM1024_private_key *private_key); + +OPENSSL_EXPORT bcm_status +BCM_mlkem1024_private_key_from_seed(MLKEM1024_private_key *out_private_key, + const uint8_t *seed, size_t seed_len); + +OPENSSL_EXPORT bcm_infallible +BCM_mlkem1024_public_from_private(MLKEM1024_public_key *out_public_key, + const MLKEM1024_private_key *private_key); + +OPENSSL_EXPORT const MLKEM1024_public_key *BCM_mlkem1024_public_of_private( + const MLKEM1024_private_key *private_key); + +OPENSSL_EXPORT bcm_infallible +BCM_mlkem1024_encap(uint8_t out_ciphertext[MLKEM1024_CIPHERTEXT_BYTES], + uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES], + const MLKEM1024_public_key *public_key); + +OPENSSL_EXPORT bcm_status +BCM_mlkem1024_decap(uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES], + const uint8_t *ciphertext, size_t ciphertext_len, + const MLKEM1024_private_key *private_key); + +OPENSSL_EXPORT bcm_status BCM_mlkem1024_marshal_public_key( + CBB *out, const MLKEM1024_public_key *public_key); + +// BCM_mlkem1024_public_keys_equal returns one if |a| and |b| are equal and zero +// otherwise. 
+int BCM_mlkem1024_public_keys_equal(const MLKEM1024_public_key *a, + const MLKEM1024_public_key *b); + +OPENSSL_EXPORT bcm_status +BCM_mlkem1024_parse_public_key(MLKEM1024_public_key *out_public_key, CBS *in); + +// BCM_mlkem1024_parse_private_key parses a private key, in NIST's format for +// private keys, from |in| and writes the result to |out_private_key|. It +// returns one on success or zero on parse error or if there are trailing bytes +// in |in|. This format is verbose and should be avoided. Private keys should be +// stored as seeds and parsed using |BCM_mlkem1024_private_key_from_seed|. +OPENSSL_EXPORT bcm_status BCM_mlkem1024_parse_private_key( + MLKEM1024_private_key *out_private_key, CBS *in); + +// BCM_mlkem1024_generate_key_external_seed is a deterministic function to +// create a pair of ML-KEM-1024 keys, using the supplied seed. The seed needs to +// be uniformly random. This function should only be used for tests, regular +// callers should use the non-deterministic |BCM_mlkem1024_generate_key| +// directly. +OPENSSL_EXPORT bcm_infallible BCM_mlkem1024_generate_key_external_seed( + uint8_t out_encoded_public_key[MLKEM1024_PUBLIC_KEY_BYTES], + MLKEM1024_private_key *out_private_key, + const uint8_t seed[MLKEM_SEED_BYTES]); + +// BCM_mlkem1024_encap_external_entropy behaves like |MLKEM1024_encap|, but uses +// |MLKEM_ENCAP_ENTROPY| bytes of |entropy| for randomization. The +// decapsulating side will be able to recover |entropy| in full. This function +// should only be used for tests, regular callers should use the +// non-deterministic |BCM_mlkem1024_encap| directly. 
+OPENSSL_EXPORT bcm_infallible BCM_mlkem1024_encap_external_entropy( + uint8_t out_ciphertext[MLKEM1024_CIPHERTEXT_BYTES], + uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES], + const MLKEM1024_public_key *public_key, + const uint8_t entropy[BCM_MLKEM_ENCAP_ENTROPY]); + +// BCM_mlkem1024_marshal_private_key serializes |private_key| to |out| in the +// NIST format for ML-KEM-1024 private keys. (Note that one can also save just +// the seed value produced by |BCM_mlkem1024_generate_key|, which is +// significantly smaller.) +OPENSSL_EXPORT bcm_status BCM_mlkem1024_marshal_private_key( + CBB *out, const MLKEM1024_private_key *private_key); + + +// SLH-DSA + +// Output length of the hash function. +#define BCM_SLHDSA_SHA2_128S_N 16 +#define BCM_SLHDSA_SHAKE_256F_N 32 + +// The number of bytes at the beginning of M', the augmented message, before the +// context. +#define BCM_SLHDSA_M_PRIME_HEADER_LEN 2 + +// SLHDSA_SHA2_128S_PUBLIC_KEY_BYTES is the number of bytes in an +// SLH-DSA-SHA2-128s public key. +#define BCM_SLHDSA_SHA2_128S_PUBLIC_KEY_BYTES 32 +#define BCM_SLHDSA_SHAKE_256F_PUBLIC_KEY_BYTES 64 + +// BCM_SLHDSA_SHA2_128S_PRIVATE_KEY_BYTES is the number of bytes in an +// SLH-DSA-SHA2-128s private key. +#define BCM_SLHDSA_SHA2_128S_PRIVATE_KEY_BYTES 64 +#define BCM_SLHDSA_SHAKE_256F_PRIVATE_KEY_BYTES 128 + +// BCM_SLHDSA_SHA2_128S_SIGNATURE_BYTES is the number of bytes in an +// SLH-DSA-SHA2-128s signature. +#define BCM_SLHDSA_SHA2_128S_SIGNATURE_BYTES 7856 +#define BCM_SLHDSA_SHAKE_256F_SIGNATURE_BYTES 49856 + +// BCM_slhdsa_sha2_128s_generate_key_from_seed generates an SLH-DSA-SHA2-128s +// key pair from a 48-byte seed and writes the result to |out_public_key| and +// |out_secret_key|. 
+OPENSSL_EXPORT bcm_infallible BCM_slhdsa_sha2_128s_generate_key_from_seed( + uint8_t out_public_key[BCM_SLHDSA_SHA2_128S_PUBLIC_KEY_BYTES], + uint8_t out_secret_key[BCM_SLHDSA_SHA2_128S_PRIVATE_KEY_BYTES], + const uint8_t seed[3 * BCM_SLHDSA_SHA2_128S_N]); + +OPENSSL_EXPORT bcm_infallible BCM_slhdsa_shake_256f_generate_key_from_seed( + uint8_t out_public_key[BCM_SLHDSA_SHAKE_256F_PUBLIC_KEY_BYTES], + uint8_t out_secret_key[BCM_SLHDSA_SHAKE_256F_PRIVATE_KEY_BYTES], + const uint8_t seed[3 * BCM_SLHDSA_SHAKE_256F_N]); + +// BCM_slhdsa_sha2_128s_generate_key_from_seed_fips does the same thing as +// `BCM_slhdsa_sha2_128s_generate_key_from_seed` but implements the required +// second check before generating a key by testing for nullptr arguments. +OPENSSL_EXPORT bcm_status BCM_slhdsa_sha2_128s_generate_key_from_seed_fips( + uint8_t out_public_key[BCM_SLHDSA_SHA2_128S_PUBLIC_KEY_BYTES], + uint8_t out_secret_key[BCM_SLHDSA_SHA2_128S_PRIVATE_KEY_BYTES], + const uint8_t seed[3 * BCM_SLHDSA_SHA2_128S_N]); + +OPENSSL_EXPORT bcm_status BCM_slhdsa_shake_256f_generate_key_from_seed_fips( + uint8_t out_public_key[BCM_SLHDSA_SHAKE_256F_PUBLIC_KEY_BYTES], + uint8_t out_secret_key[BCM_SLHDSA_SHAKE_256F_PRIVATE_KEY_BYTES], + const uint8_t seed[3 * BCM_SLHDSA_SHAKE_256F_N]); + +// BCM_slhdsa_sha2_128s_sign_internal acts like |SLHDSA_SHA2_128S_sign| but +// accepts an explicit entropy input, which can be PK.seed (bytes 32..48 of +// the private key) to generate deterministic signatures. It also takes the +// input message in three parts so that the "internal" version of the signing +// function, from section 9.2, can be implemented. The |header| argument may be +// NULL to omit it. 
+OPENSSL_EXPORT bcm_infallible BCM_slhdsa_sha2_128s_sign_internal( + uint8_t out_signature[BCM_SLHDSA_SHA2_128S_SIGNATURE_BYTES], + const uint8_t secret_key[BCM_SLHDSA_SHA2_128S_PRIVATE_KEY_BYTES], + const uint8_t header[BCM_SLHDSA_M_PRIME_HEADER_LEN], const uint8_t *context, + size_t context_len, const uint8_t *msg, size_t msg_len, + const uint8_t entropy[BCM_SLHDSA_SHA2_128S_N]); + +OPENSSL_EXPORT bcm_infallible BCM_slhdsa_shake_256f_sign_internal( + uint8_t out_signature[BCM_SLHDSA_SHAKE_256F_SIGNATURE_BYTES], + const uint8_t secret_key[BCM_SLHDSA_SHAKE_256F_PRIVATE_KEY_BYTES], + const uint8_t header[BCM_SLHDSA_M_PRIME_HEADER_LEN], const uint8_t *context, + size_t context_len, const uint8_t *msg, size_t msg_len, + const uint8_t entropy[BCM_SLHDSA_SHAKE_256F_N]); + +// BCM_slhdsa_sha2_128s_verify_internal acts like |SLHDSA_SHA2_128S_verify| but +// takes the input message in three parts so that the "internal" version of the +// verification function, from section 9.3, can be implemented. The |header| +// argument may be NULL to omit it. 
+OPENSSL_EXPORT bcm_status BCM_slhdsa_sha2_128s_verify_internal( + const uint8_t *signature, size_t signature_len, + const uint8_t public_key[BCM_SLHDSA_SHA2_128S_PUBLIC_KEY_BYTES], + const uint8_t header[BCM_SLHDSA_M_PRIME_HEADER_LEN], const uint8_t *context, + size_t context_len, const uint8_t *msg, size_t msg_len); + +OPENSSL_EXPORT bcm_status BCM_slhdsa_shake_256f_verify_internal( + const uint8_t *signature, size_t signature_len, + const uint8_t public_key[BCM_SLHDSA_SHAKE_256F_PUBLIC_KEY_BYTES], + const uint8_t header[BCM_SLHDSA_M_PRIME_HEADER_LEN], const uint8_t *context, + size_t context_len, const uint8_t *msg, size_t msg_len); + +OPENSSL_EXPORT bcm_infallible BCM_slhdsa_sha2_128s_generate_key( + uint8_t out_public_key[BCM_SLHDSA_SHA2_128S_PUBLIC_KEY_BYTES], + uint8_t out_private_key[BCM_SLHDSA_SHA2_128S_PRIVATE_KEY_BYTES]); + +OPENSSL_EXPORT bcm_infallible BCM_slhdsa_shake_256f_generate_key( + uint8_t out_public_key[BCM_SLHDSA_SHAKE_256F_PUBLIC_KEY_BYTES], + uint8_t out_private_key[BCM_SLHDSA_SHAKE_256F_PRIVATE_KEY_BYTES]); + +OPENSSL_EXPORT bcm_status BCM_slhdsa_sha2_128s_generate_key_fips( + uint8_t out_public_key[BCM_SLHDSA_SHA2_128S_PUBLIC_KEY_BYTES], + uint8_t out_private_key[BCM_SLHDSA_SHA2_128S_PRIVATE_KEY_BYTES]); + +OPENSSL_EXPORT bcm_status BCM_slhdsa_shake_256f_generate_key_fips( + uint8_t out_public_key[BCM_SLHDSA_SHAKE_256F_PUBLIC_KEY_BYTES], + uint8_t out_private_key[BCM_SLHDSA_SHAKE_256F_PRIVATE_KEY_BYTES]); + +OPENSSL_EXPORT bcm_infallible BCM_slhdsa_sha2_128s_public_from_private( + uint8_t out_public_key[BCM_SLHDSA_SHA2_128S_PUBLIC_KEY_BYTES], + const uint8_t private_key[BCM_SLHDSA_SHA2_128S_PRIVATE_KEY_BYTES]); + +OPENSSL_EXPORT bcm_infallible BCM_slhdsa_shake_256f_public_from_private( + uint8_t out_public_key[BCM_SLHDSA_SHAKE_256F_PUBLIC_KEY_BYTES], + const uint8_t private_key[BCM_SLHDSA_SHAKE_256F_PRIVATE_KEY_BYTES]); + +OPENSSL_EXPORT bcm_status BCM_slhdsa_sha2_128s_sign( + uint8_t out_signature[BCM_SLHDSA_SHA2_128S_SIGNATURE_BYTES], + 
const uint8_t private_key[BCM_SLHDSA_SHA2_128S_PRIVATE_KEY_BYTES], + const uint8_t *msg, size_t msg_len, const uint8_t *context, + size_t context_len); + +OPENSSL_EXPORT bcm_status BCM_slhdsa_shake_256f_sign( + uint8_t out_signature[BCM_SLHDSA_SHAKE_256F_SIGNATURE_BYTES], + const uint8_t private_key[BCM_SLHDSA_SHAKE_256F_PRIVATE_KEY_BYTES], + const uint8_t *msg, size_t msg_len, const uint8_t *context, + size_t context_len); + +OPENSSL_EXPORT bcm_status BCM_slhdsa_sha2_128s_verify( + const uint8_t *signature, size_t signature_len, + const uint8_t public_key[BCM_SLHDSA_SHA2_128S_PUBLIC_KEY_BYTES], + const uint8_t *msg, size_t msg_len, const uint8_t *context, + size_t context_len); + +OPENSSL_EXPORT bcm_status BCM_slhdsa_shake_256f_verify( + const uint8_t *signature, size_t signature_len, + const uint8_t public_key[BCM_SLHDSA_SHAKE_256F_PUBLIC_KEY_BYTES], + const uint8_t *msg, size_t msg_len, const uint8_t *context, + size_t context_len); + +OPENSSL_EXPORT bcm_status BCM_slhdsa_sha2_128s_prehash_sign( + uint8_t out_signature[BCM_SLHDSA_SHA2_128S_SIGNATURE_BYTES], + const uint8_t private_key[BCM_SLHDSA_SHA2_128S_PRIVATE_KEY_BYTES], + const uint8_t *hashed_msg, size_t hashed_msg_len, int hash_nid, + const uint8_t *context, size_t context_len); + +OPENSSL_EXPORT bcm_status BCM_slhdsa_shake_256f_prehash_sign( + uint8_t out_signature[BCM_SLHDSA_SHAKE_256F_SIGNATURE_BYTES], + const uint8_t private_key[BCM_SLHDSA_SHAKE_256F_PRIVATE_KEY_BYTES], + const uint8_t *hashed_msg, size_t hashed_msg_len, int hash_nid, + const uint8_t *context, size_t context_len); + +OPENSSL_EXPORT bcm_status BCM_slhdsa_sha2_128s_prehash_verify( + const uint8_t *signature, size_t signature_len, + const uint8_t public_key[BCM_SLHDSA_SHA2_128S_PUBLIC_KEY_BYTES], + const uint8_t *hashed_msg, size_t hashed_msg_len, int hash_nid, + const uint8_t *context, size_t context_len); + +OPENSSL_EXPORT bcm_status BCM_slhdsa_shake_256f_prehash_verify( + const uint8_t *signature, size_t signature_len, + const uint8_t 
public_key[BCM_SLHDSA_SHAKE_256F_PUBLIC_KEY_BYTES], + const uint8_t *hashed_msg, size_t hashed_msg_len, int hash_nid, + const uint8_t *context, size_t context_len); + + +// AES + +// BCM_aes_encrypt encrypts a single block from |in| to |out| with |key|. The +// |in| and |out| pointers may overlap. +bcm_infallible BCM_aes_encrypt(const uint8_t *in, uint8_t *out, + const AES_KEY *key); +// BCM_aes_decrypt decrypts a single block from |in| to |out| with |key|. The +// |in| and |out| pointers may overlap. +bcm_infallible BCM_aes_decrypt(const uint8_t *in, uint8_t *out, + const AES_KEY *key); + +// BCM_aes_set_encrypt_key configures |aeskey| to encrypt with the |bits|-bit +// key, |key|. |key| must point to |bits|/8 bytes. It will return failure if +// |bits| is an invalid AES key size. +bcm_status BCM_aes_set_encrypt_key(const uint8_t *key, unsigned bits, + AES_KEY *aeskey); + +// BCM_aes_set_decrypt_key configures |aeskey| to decrypt with the |bits|-bit +// key, |key|. |key| must point to |bits|/8 bytes. It will return failure if +// |bits| is an invalid AES key size. +bcm_status BCM_aes_set_decrypt_key(const uint8_t *key, unsigned bits, + AES_KEY *aeskey); + +BSSL_NAMESPACE_END + + +#endif // OPENSSL_HEADER_CRYPTO_FIPSMODULE_BCM_INTERFACE_H diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/add.c b/third_party/boringssl/src/crypto/fipsmodule/bn/add.c deleted file mode 100644 index 38a84506..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/bn/add.c +++ /dev/null @@ -1,316 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. 
The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. 
If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
*/ - -#include - -#include - -#include -#include - -#include "internal.h" - - -int BN_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b) { - const BIGNUM *tmp; - int a_neg = a->neg, ret; - - // a + b a+b - // a + -b a-b - // -a + b b-a - // -a + -b -(a+b) - if (a_neg ^ b->neg) { - // only one is negative - if (a_neg) { - tmp = a; - a = b; - b = tmp; - } - - // we are now a - b - if (BN_ucmp(a, b) < 0) { - if (!BN_usub(r, b, a)) { - return 0; - } - r->neg = 1; - } else { - if (!BN_usub(r, a, b)) { - return 0; - } - r->neg = 0; - } - return 1; - } - - ret = BN_uadd(r, a, b); - r->neg = a_neg; - return ret; -} - -int bn_uadd_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b) { - // Widths are public, so we normalize to make |a| the larger one. - if (a->width < b->width) { - const BIGNUM *tmp = a; - a = b; - b = tmp; - } - - int max = a->width; - int min = b->width; - if (!bn_wexpand(r, max + 1)) { - return 0; - } - r->width = max + 1; - - BN_ULONG carry = bn_add_words(r->d, a->d, b->d, min); - for (int i = min; i < max; i++) { - // |r| and |a| may alias, so use a temporary. - BN_ULONG tmp = carry + a->d[i]; - carry = tmp < a->d[i]; - r->d[i] = tmp; - } - - r->d[max] = carry; - return 1; -} - -int BN_uadd(BIGNUM *r, const BIGNUM *a, const BIGNUM *b) { - if (!bn_uadd_consttime(r, a, b)) { - return 0; - } - bn_set_minimal_width(r); - return 1; -} - -int BN_add_word(BIGNUM *a, BN_ULONG w) { - BN_ULONG l; - int i; - - // degenerate case: w is zero - if (!w) { - return 1; - } - - // degenerate case: a is zero - if (BN_is_zero(a)) { - return BN_set_word(a, w); - } - - // handle 'a' when negative - if (a->neg) { - a->neg = 0; - i = BN_sub_word(a, w); - if (!BN_is_zero(a)) { - a->neg = !(a->neg); - } - return i; - } - - for (i = 0; w != 0 && i < a->width; i++) { - a->d[i] = l = a->d[i] + w; - w = (w > l) ? 
1 : 0; - } - - if (w && i == a->width) { - if (!bn_wexpand(a, a->width + 1)) { - return 0; - } - a->width++; - a->d[i] = w; - } - - return 1; -} - -int BN_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b) { - int add = 0, neg = 0; - const BIGNUM *tmp; - - // a - b a-b - // a - -b a+b - // -a - b -(a+b) - // -a - -b b-a - if (a->neg) { - if (b->neg) { - tmp = a; - a = b; - b = tmp; - } else { - add = 1; - neg = 1; - } - } else { - if (b->neg) { - add = 1; - neg = 0; - } - } - - if (add) { - if (!BN_uadd(r, a, b)) { - return 0; - } - - r->neg = neg; - return 1; - } - - if (BN_ucmp(a, b) < 0) { - if (!BN_usub(r, b, a)) { - return 0; - } - r->neg = 1; - } else { - if (!BN_usub(r, a, b)) { - return 0; - } - r->neg = 0; - } - - return 1; -} - -int bn_usub_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b) { - // |b| may have more words than |a| given non-minimal inputs, but all words - // beyond |a->width| must then be zero. - int b_width = b->width; - if (b_width > a->width) { - if (!bn_fits_in_words(b, a->width)) { - OPENSSL_PUT_ERROR(BN, BN_R_ARG2_LT_ARG3); - return 0; - } - b_width = a->width; - } - - if (!bn_wexpand(r, a->width)) { - return 0; - } - - BN_ULONG borrow = bn_sub_words(r->d, a->d, b->d, b_width); - for (int i = b_width; i < a->width; i++) { - // |r| and |a| may alias, so use a temporary. 
- BN_ULONG tmp = a->d[i]; - r->d[i] = a->d[i] - borrow; - borrow = tmp < r->d[i]; - } - - if (borrow) { - OPENSSL_PUT_ERROR(BN, BN_R_ARG2_LT_ARG3); - return 0; - } - - r->width = a->width; - r->neg = 0; - return 1; -} - -int BN_usub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b) { - if (!bn_usub_consttime(r, a, b)) { - return 0; - } - bn_set_minimal_width(r); - return 1; -} - -int BN_sub_word(BIGNUM *a, BN_ULONG w) { - int i; - - // degenerate case: w is zero - if (!w) { - return 1; - } - - // degenerate case: a is zero - if (BN_is_zero(a)) { - i = BN_set_word(a, w); - if (i != 0) { - BN_set_negative(a, 1); - } - return i; - } - - // handle 'a' when negative - if (a->neg) { - a->neg = 0; - i = BN_add_word(a, w); - a->neg = 1; - return i; - } - - if ((bn_minimal_width(a) == 1) && (a->d[0] < w)) { - a->d[0] = w - a->d[0]; - a->neg = 1; - return 1; - } - - i = 0; - for (;;) { - if (a->d[i] >= w) { - a->d[i] -= w; - break; - } else { - a->d[i] -= w; - i++; - w = 1; - } - } - - if ((a->d[i] == 0) && (i == (a->width - 1))) { - a->width--; - } - - return 1; -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/add.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/bn/add.cc.inc new file mode 100644 index 00000000..38c7d5b3 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/bn/add.cc.inc @@ -0,0 +1,271 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include + +#include +#include + +#include "../../internal.h" +#include "internal.h" + + +using namespace bssl; + +int BN_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b) { + const BIGNUM *tmp; + int a_neg = a->neg, ret; + + // a + b a+b + // a + -b a-b + // -a + b b-a + // -a + -b -(a+b) + if (a_neg ^ b->neg) { + // only one is negative + if (a_neg) { + tmp = a; + a = b; + b = tmp; + } + + // we are now a - b + if (BN_ucmp(a, b) < 0) { + if (!BN_usub(r, b, a)) { + return 0; + } + r->neg = 1; + } else { + if (!BN_usub(r, a, b)) { + return 0; + } + r->neg = 0; + } + return 1; + } + + ret = BN_uadd(r, a, b); + r->neg = a_neg; + return ret; +} + +int bssl::bn_uadd_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b) { + // Widths are public, so we normalize to make |a| the larger one. + if (a->width < b->width) { + const BIGNUM *tmp = a; + a = b; + b = tmp; + } + + int max = a->width; + int min = b->width; + if (!bn_wexpand(r, max + 1)) { + return 0; + } + r->width = max + 1; + + BN_ULONG carry = bn_add_words(r->d, a->d, b->d, min); + for (int i = min; i < max; i++) { + r->d[i] = CRYPTO_addc_w(a->d[i], 0, carry, &carry); + } + + r->d[max] = carry; + return 1; +} + +int BN_uadd(BIGNUM *r, const BIGNUM *a, const BIGNUM *b) { + if (!bn_uadd_consttime(r, a, b)) { + return 0; + } + bn_set_minimal_width(r); + return 1; +} + +int BN_add_word(BIGNUM *a, BN_ULONG w) { + BN_ULONG l; + int i; + + // degenerate case: w is zero + if (!w) { + return 1; + } + + // degenerate case: a is zero + if (BN_is_zero(a)) { + return BN_set_word(a, w); + } + + // handle 'a' when negative + if (a->neg) { + a->neg = 0; + i = BN_sub_word(a, w); + if (!BN_is_zero(a)) { + a->neg = !(a->neg); + } + return i; + } + + for (i = 0; w != 0 && i < a->width; i++) { + a->d[i] = l = a->d[i] + w; + w = (w > l) ? 
1 : 0; + } + + if (w && i == a->width) { + if (!bn_wexpand(a, a->width + 1)) { + return 0; + } + a->width++; + a->d[i] = w; + } + + return 1; +} + +int BN_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b) { + int add = 0, neg = 0; + const BIGNUM *tmp; + + // a - b a-b + // a - -b a+b + // -a - b -(a+b) + // -a - -b b-a + if (a->neg) { + if (b->neg) { + tmp = a; + a = b; + b = tmp; + } else { + add = 1; + neg = 1; + } + } else { + if (b->neg) { + add = 1; + neg = 0; + } + } + + if (add) { + if (!BN_uadd(r, a, b)) { + return 0; + } + + r->neg = neg; + return 1; + } + + if (BN_ucmp(a, b) < 0) { + if (!BN_usub(r, b, a)) { + return 0; + } + r->neg = 1; + } else { + if (!BN_usub(r, a, b)) { + return 0; + } + r->neg = 0; + } + + return 1; +} + +int bssl::bn_usub_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b) { + // |b| may have more words than |a| given non-minimal inputs, but all words + // beyond |a->width| must then be zero. + int b_width = b->width; + if (b_width > a->width) { + if (!bn_fits_in_words(b, a->width)) { + OPENSSL_PUT_ERROR(BN, BN_R_ARG2_LT_ARG3); + return 0; + } + b_width = a->width; + } + + if (!bn_wexpand(r, a->width)) { + return 0; + } + + BN_ULONG borrow = bn_sub_words(r->d, a->d, b->d, b_width); + for (int i = b_width; i < a->width; i++) { + r->d[i] = CRYPTO_subc_w(a->d[i], 0, borrow, &borrow); + } + + if (borrow) { + OPENSSL_PUT_ERROR(BN, BN_R_ARG2_LT_ARG3); + return 0; + } + + r->width = a->width; + r->neg = 0; + return 1; +} + +int BN_usub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b) { + if (!bn_usub_consttime(r, a, b)) { + return 0; + } + bn_set_minimal_width(r); + return 1; +} + +int BN_sub_word(BIGNUM *a, BN_ULONG w) { + int i; + + // degenerate case: w is zero + if (!w) { + return 1; + } + + // degenerate case: a is zero + if (BN_is_zero(a)) { + i = BN_set_word(a, w); + if (i != 0) { + BN_set_negative(a, 1); + } + return i; + } + + // handle 'a' when negative + if (a->neg) { + a->neg = 0; + i = BN_add_word(a, w); + a->neg = 1; + return 
i; + } + + if ((bn_minimal_width(a) == 1) && (a->d[0] < w)) { + a->d[0] = w - a->d[0]; + a->neg = 1; + return 1; + } + + i = 0; + for (;;) { + if (a->d[i] >= w) { + a->d[i] -= w; + break; + } else { + a->d[i] -= w; + i++; + w = 1; + } + } + + if ((a->d[i] == 0) && (i == (a->width - 1))) { + a->width--; + } + + return 1; +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/asm/x86_64-gcc.c b/third_party/boringssl/src/crypto/fipsmodule/bn/asm/x86_64-gcc.c deleted file mode 100644 index 30fff217..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/bn/asm/x86_64-gcc.c +++ /dev/null @@ -1,541 +0,0 @@ -/* x86_64 BIGNUM accelerator version 0.1, December 2002. - * - * Implemented by Andy Polyakov for the OpenSSL - * project. - * - * Rights for redistribution and usage in source and binary forms are - * granted according to the OpenSSL license. Warranty of any kind is - * disclaimed. - * - * Q. Version 0.1? It doesn't sound like Andy, he used to assign real - * versions, like 1.0... - * A. Well, that's because this code is basically a quick-n-dirty - * proof-of-concept hack. As you can see it's implemented with - * inline assembler, which means that you're bound to GCC and that - * there might be enough room for further improvement. - * - * Q. Why inline assembler? - * A. x86_64 features own ABI which I'm not familiar with. This is - * why I decided to let the compiler take care of subroutine - * prologue/epilogue as well as register allocation. For reference. - * Win64 implements different ABI for AMD64, different from Linux. - * - * Q. How much faster does it get? - * A. 
'apps/openssl speed rsa dsa' output with no-asm: - * - * sign verify sign/s verify/s - * rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2 - * rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0 - * rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8 - * rsa 4096 bits 0.1155s 0.0018s 8.7 555.6 - * sign verify sign/s verify/s - * dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3 - * dsa 1024 bits 0.0014s 0.0018s 692.3 559.2 - * dsa 2048 bits 0.0049s 0.0061s 204.7 165.0 - * - * 'apps/openssl speed rsa dsa' output with this module: - * - * sign verify sign/s verify/s - * rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9 - * rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7 - * rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0 - * rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8 - * sign verify sign/s verify/s - * dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3 - * dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4 - * dsa 2048 bits 0.0016s 0.0020s 620.4 504.6 - * - * For the reference. IA-32 assembler implementation performs - * very much like 64-bit code compiled with no-asm on the same - * machine. - */ - -#include - -// TODO(davidben): Get this file working on MSVC x64. 
-#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \ - (defined(__GNUC__) || defined(__clang__)) - -#include "../internal.h" - - -#undef mul -#undef mul_add - -// "m"(a), "+m"(r) is the way to favor DirectPath µ-code; -// "g"(0) let the compiler to decide where does it -// want to keep the value of zero; -#define mul_add(r, a, word, carry) \ - do { \ - register BN_ULONG high, low; \ - __asm__("mulq %3" : "=a"(low), "=d"(high) : "a"(word), "m"(a) : "cc"); \ - __asm__("addq %2,%0; adcq %3,%1" \ - : "+r"(carry), "+d"(high) \ - : "a"(low), "g"(0) \ - : "cc"); \ - __asm__("addq %2,%0; adcq %3,%1" \ - : "+m"(r), "+d"(high) \ - : "r"(carry), "g"(0) \ - : "cc"); \ - (carry) = high; \ - } while (0) - -#define mul(r, a, word, carry) \ - do { \ - register BN_ULONG high, low; \ - __asm__("mulq %3" : "=a"(low), "=d"(high) : "a"(word), "g"(a) : "cc"); \ - __asm__("addq %2,%0; adcq %3,%1" \ - : "+r"(carry), "+d"(high) \ - : "a"(low), "g"(0) \ - : "cc"); \ - (r) = (carry); \ - (carry) = high; \ - } while (0) -#undef sqr -#define sqr(r0, r1, a) __asm__("mulq %2" : "=a"(r0), "=d"(r1) : "a"(a) : "cc"); - -BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, size_t num, - BN_ULONG w) { - BN_ULONG c1 = 0; - - if (num == 0) { - return (c1); - } - - while (num & ~3) { - mul_add(rp[0], ap[0], w, c1); - mul_add(rp[1], ap[1], w, c1); - mul_add(rp[2], ap[2], w, c1); - mul_add(rp[3], ap[3], w, c1); - ap += 4; - rp += 4; - num -= 4; - } - if (num) { - mul_add(rp[0], ap[0], w, c1); - if (--num == 0) { - return c1; - } - mul_add(rp[1], ap[1], w, c1); - if (--num == 0) { - return c1; - } - mul_add(rp[2], ap[2], w, c1); - return c1; - } - - return c1; -} - -BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, size_t num, - BN_ULONG w) { - BN_ULONG c1 = 0; - - if (num == 0) { - return c1; - } - - while (num & ~3) { - mul(rp[0], ap[0], w, c1); - mul(rp[1], ap[1], w, c1); - mul(rp[2], ap[2], w, c1); - mul(rp[3], ap[3], w, c1); - ap += 4; - rp += 4; - num -= 4; - } - if (num) { - 
mul(rp[0], ap[0], w, c1); - if (--num == 0) { - return c1; - } - mul(rp[1], ap[1], w, c1); - if (--num == 0) { - return c1; - } - mul(rp[2], ap[2], w, c1); - } - return c1; -} - -void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, size_t n) { - if (n == 0) { - return; - } - - while (n & ~3) { - sqr(r[0], r[1], a[0]); - sqr(r[2], r[3], a[1]); - sqr(r[4], r[5], a[2]); - sqr(r[6], r[7], a[3]); - a += 4; - r += 8; - n -= 4; - } - if (n) { - sqr(r[0], r[1], a[0]); - if (--n == 0) { - return; - } - sqr(r[2], r[3], a[1]); - if (--n == 0) { - return; - } - sqr(r[4], r[5], a[2]); - } -} - -BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, - size_t n) { - BN_ULONG ret; - size_t i = 0; - - if (n == 0) { - return 0; - } - - __asm__ volatile ( - " subq %0,%0 \n" // clear carry - " jmp 1f \n" - ".p2align 4 \n" - "1:" - " movq (%4,%2,8),%0 \n" - " adcq (%5,%2,8),%0 \n" - " movq %0,(%3,%2,8) \n" - " lea 1(%2),%2 \n" - " dec %1 \n" - " jnz 1b \n" - " sbbq %0,%0 \n" - : "=&r"(ret), "+c"(n), "+r"(i) - : "r"(rp), "r"(ap), "r"(bp) - : "cc", "memory"); - - return ret & 1; -} - -BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, - size_t n) { - BN_ULONG ret; - size_t i = 0; - - if (n == 0) { - return 0; - } - - __asm__ volatile ( - " subq %0,%0 \n" // clear borrow - " jmp 1f \n" - ".p2align 4 \n" - "1:" - " movq (%4,%2,8),%0 \n" - " sbbq (%5,%2,8),%0 \n" - " movq %0,(%3,%2,8) \n" - " lea 1(%2),%2 \n" - " dec %1 \n" - " jnz 1b \n" - " sbbq %0,%0 \n" - : "=&r"(ret), "+c"(n), "+r"(i) - : "r"(rp), "r"(ap), "r"(bp) - : "cc", "memory"); - - return ret & 1; -} - -// mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) -// mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) -// sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) -// sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) - -// Keep in mind that carrying into high part of multiplication result can not -// 
overflow, because it cannot be all-ones. -#define mul_add_c(a, b, c0, c1, c2) \ - do { \ - BN_ULONG t1, t2; \ - __asm__("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc"); \ - __asm__("addq %3,%0; adcq %4,%1; adcq %5,%2" \ - : "+r"(c0), "+r"(c1), "+r"(c2) \ - : "r"(t1), "r"(t2), "g"(0) \ - : "cc"); \ - } while (0) - -#define sqr_add_c(a, i, c0, c1, c2) \ - do { \ - BN_ULONG t1, t2; \ - __asm__("mulq %2" : "=a"(t1), "=d"(t2) : "a"((a)[i]) : "cc"); \ - __asm__("addq %3,%0; adcq %4,%1; adcq %5,%2" \ - : "+r"(c0), "+r"(c1), "+r"(c2) \ - : "r"(t1), "r"(t2), "g"(0) \ - : "cc"); \ - } while (0) - -#define mul_add_c2(a, b, c0, c1, c2) \ - do { \ - BN_ULONG t1, t2; \ - __asm__("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc"); \ - __asm__("addq %3,%0; adcq %4,%1; adcq %5,%2" \ - : "+r"(c0), "+r"(c1), "+r"(c2) \ - : "r"(t1), "r"(t2), "g"(0) \ - : "cc"); \ - __asm__("addq %3,%0; adcq %4,%1; adcq %5,%2" \ - : "+r"(c0), "+r"(c1), "+r"(c2) \ - : "r"(t1), "r"(t2), "g"(0) \ - : "cc"); \ - } while (0) - -#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2) - -void bn_mul_comba8(BN_ULONG r[16], const BN_ULONG a[8], const BN_ULONG b[8]) { - BN_ULONG c1, c2, c3; - - c1 = 0; - c2 = 0; - c3 = 0; - mul_add_c(a[0], b[0], c1, c2, c3); - r[0] = c1; - c1 = 0; - mul_add_c(a[0], b[1], c2, c3, c1); - mul_add_c(a[1], b[0], c2, c3, c1); - r[1] = c2; - c2 = 0; - mul_add_c(a[2], b[0], c3, c1, c2); - mul_add_c(a[1], b[1], c3, c1, c2); - mul_add_c(a[0], b[2], c3, c1, c2); - r[2] = c3; - c3 = 0; - mul_add_c(a[0], b[3], c1, c2, c3); - mul_add_c(a[1], b[2], c1, c2, c3); - mul_add_c(a[2], b[1], c1, c2, c3); - mul_add_c(a[3], b[0], c1, c2, c3); - r[3] = c1; - c1 = 0; - mul_add_c(a[4], b[0], c2, c3, c1); - mul_add_c(a[3], b[1], c2, c3, c1); - mul_add_c(a[2], b[2], c2, c3, c1); - mul_add_c(a[1], b[3], c2, c3, c1); - mul_add_c(a[0], b[4], c2, c3, c1); - r[4] = c2; - c2 = 0; - mul_add_c(a[0], b[5], c3, c1, c2); - mul_add_c(a[1], b[4], c3, c1, c2); - mul_add_c(a[2], b[3], 
c3, c1, c2); - mul_add_c(a[3], b[2], c3, c1, c2); - mul_add_c(a[4], b[1], c3, c1, c2); - mul_add_c(a[5], b[0], c3, c1, c2); - r[5] = c3; - c3 = 0; - mul_add_c(a[6], b[0], c1, c2, c3); - mul_add_c(a[5], b[1], c1, c2, c3); - mul_add_c(a[4], b[2], c1, c2, c3); - mul_add_c(a[3], b[3], c1, c2, c3); - mul_add_c(a[2], b[4], c1, c2, c3); - mul_add_c(a[1], b[5], c1, c2, c3); - mul_add_c(a[0], b[6], c1, c2, c3); - r[6] = c1; - c1 = 0; - mul_add_c(a[0], b[7], c2, c3, c1); - mul_add_c(a[1], b[6], c2, c3, c1); - mul_add_c(a[2], b[5], c2, c3, c1); - mul_add_c(a[3], b[4], c2, c3, c1); - mul_add_c(a[4], b[3], c2, c3, c1); - mul_add_c(a[5], b[2], c2, c3, c1); - mul_add_c(a[6], b[1], c2, c3, c1); - mul_add_c(a[7], b[0], c2, c3, c1); - r[7] = c2; - c2 = 0; - mul_add_c(a[7], b[1], c3, c1, c2); - mul_add_c(a[6], b[2], c3, c1, c2); - mul_add_c(a[5], b[3], c3, c1, c2); - mul_add_c(a[4], b[4], c3, c1, c2); - mul_add_c(a[3], b[5], c3, c1, c2); - mul_add_c(a[2], b[6], c3, c1, c2); - mul_add_c(a[1], b[7], c3, c1, c2); - r[8] = c3; - c3 = 0; - mul_add_c(a[2], b[7], c1, c2, c3); - mul_add_c(a[3], b[6], c1, c2, c3); - mul_add_c(a[4], b[5], c1, c2, c3); - mul_add_c(a[5], b[4], c1, c2, c3); - mul_add_c(a[6], b[3], c1, c2, c3); - mul_add_c(a[7], b[2], c1, c2, c3); - r[9] = c1; - c1 = 0; - mul_add_c(a[7], b[3], c2, c3, c1); - mul_add_c(a[6], b[4], c2, c3, c1); - mul_add_c(a[5], b[5], c2, c3, c1); - mul_add_c(a[4], b[6], c2, c3, c1); - mul_add_c(a[3], b[7], c2, c3, c1); - r[10] = c2; - c2 = 0; - mul_add_c(a[4], b[7], c3, c1, c2); - mul_add_c(a[5], b[6], c3, c1, c2); - mul_add_c(a[6], b[5], c3, c1, c2); - mul_add_c(a[7], b[4], c3, c1, c2); - r[11] = c3; - c3 = 0; - mul_add_c(a[7], b[5], c1, c2, c3); - mul_add_c(a[6], b[6], c1, c2, c3); - mul_add_c(a[5], b[7], c1, c2, c3); - r[12] = c1; - c1 = 0; - mul_add_c(a[6], b[7], c2, c3, c1); - mul_add_c(a[7], b[6], c2, c3, c1); - r[13] = c2; - c2 = 0; - mul_add_c(a[7], b[7], c3, c1, c2); - r[14] = c3; - r[15] = c1; -} - -void bn_mul_comba4(BN_ULONG r[8], const 
BN_ULONG a[4], const BN_ULONG b[4]) { - BN_ULONG c1, c2, c3; - - c1 = 0; - c2 = 0; - c3 = 0; - mul_add_c(a[0], b[0], c1, c2, c3); - r[0] = c1; - c1 = 0; - mul_add_c(a[0], b[1], c2, c3, c1); - mul_add_c(a[1], b[0], c2, c3, c1); - r[1] = c2; - c2 = 0; - mul_add_c(a[2], b[0], c3, c1, c2); - mul_add_c(a[1], b[1], c3, c1, c2); - mul_add_c(a[0], b[2], c3, c1, c2); - r[2] = c3; - c3 = 0; - mul_add_c(a[0], b[3], c1, c2, c3); - mul_add_c(a[1], b[2], c1, c2, c3); - mul_add_c(a[2], b[1], c1, c2, c3); - mul_add_c(a[3], b[0], c1, c2, c3); - r[3] = c1; - c1 = 0; - mul_add_c(a[3], b[1], c2, c3, c1); - mul_add_c(a[2], b[2], c2, c3, c1); - mul_add_c(a[1], b[3], c2, c3, c1); - r[4] = c2; - c2 = 0; - mul_add_c(a[2], b[3], c3, c1, c2); - mul_add_c(a[3], b[2], c3, c1, c2); - r[5] = c3; - c3 = 0; - mul_add_c(a[3], b[3], c1, c2, c3); - r[6] = c1; - r[7] = c2; -} - -void bn_sqr_comba8(BN_ULONG r[16], const BN_ULONG a[8]) { - BN_ULONG c1, c2, c3; - - c1 = 0; - c2 = 0; - c3 = 0; - sqr_add_c(a, 0, c1, c2, c3); - r[0] = c1; - c1 = 0; - sqr_add_c2(a, 1, 0, c2, c3, c1); - r[1] = c2; - c2 = 0; - sqr_add_c(a, 1, c3, c1, c2); - sqr_add_c2(a, 2, 0, c3, c1, c2); - r[2] = c3; - c3 = 0; - sqr_add_c2(a, 3, 0, c1, c2, c3); - sqr_add_c2(a, 2, 1, c1, c2, c3); - r[3] = c1; - c1 = 0; - sqr_add_c(a, 2, c2, c3, c1); - sqr_add_c2(a, 3, 1, c2, c3, c1); - sqr_add_c2(a, 4, 0, c2, c3, c1); - r[4] = c2; - c2 = 0; - sqr_add_c2(a, 5, 0, c3, c1, c2); - sqr_add_c2(a, 4, 1, c3, c1, c2); - sqr_add_c2(a, 3, 2, c3, c1, c2); - r[5] = c3; - c3 = 0; - sqr_add_c(a, 3, c1, c2, c3); - sqr_add_c2(a, 4, 2, c1, c2, c3); - sqr_add_c2(a, 5, 1, c1, c2, c3); - sqr_add_c2(a, 6, 0, c1, c2, c3); - r[6] = c1; - c1 = 0; - sqr_add_c2(a, 7, 0, c2, c3, c1); - sqr_add_c2(a, 6, 1, c2, c3, c1); - sqr_add_c2(a, 5, 2, c2, c3, c1); - sqr_add_c2(a, 4, 3, c2, c3, c1); - r[7] = c2; - c2 = 0; - sqr_add_c(a, 4, c3, c1, c2); - sqr_add_c2(a, 5, 3, c3, c1, c2); - sqr_add_c2(a, 6, 2, c3, c1, c2); - sqr_add_c2(a, 7, 1, c3, c1, c2); - r[8] = c3; - c3 = 0; - 
sqr_add_c2(a, 7, 2, c1, c2, c3); - sqr_add_c2(a, 6, 3, c1, c2, c3); - sqr_add_c2(a, 5, 4, c1, c2, c3); - r[9] = c1; - c1 = 0; - sqr_add_c(a, 5, c2, c3, c1); - sqr_add_c2(a, 6, 4, c2, c3, c1); - sqr_add_c2(a, 7, 3, c2, c3, c1); - r[10] = c2; - c2 = 0; - sqr_add_c2(a, 7, 4, c3, c1, c2); - sqr_add_c2(a, 6, 5, c3, c1, c2); - r[11] = c3; - c3 = 0; - sqr_add_c(a, 6, c1, c2, c3); - sqr_add_c2(a, 7, 5, c1, c2, c3); - r[12] = c1; - c1 = 0; - sqr_add_c2(a, 7, 6, c2, c3, c1); - r[13] = c2; - c2 = 0; - sqr_add_c(a, 7, c3, c1, c2); - r[14] = c3; - r[15] = c1; -} - -void bn_sqr_comba4(BN_ULONG r[8], const BN_ULONG a[4]) { - BN_ULONG c1, c2, c3; - - c1 = 0; - c2 = 0; - c3 = 0; - sqr_add_c(a, 0, c1, c2, c3); - r[0] = c1; - c1 = 0; - sqr_add_c2(a, 1, 0, c2, c3, c1); - r[1] = c2; - c2 = 0; - sqr_add_c(a, 1, c3, c1, c2); - sqr_add_c2(a, 2, 0, c3, c1, c2); - r[2] = c3; - c3 = 0; - sqr_add_c2(a, 3, 0, c1, c2, c3); - sqr_add_c2(a, 2, 1, c1, c2, c3); - r[3] = c1; - c1 = 0; - sqr_add_c(a, 2, c2, c3, c1); - sqr_add_c2(a, 3, 1, c2, c3, c1); - r[4] = c2; - c2 = 0; - sqr_add_c2(a, 3, 2, c3, c1, c2); - r[5] = c3; - c3 = 0; - sqr_add_c(a, 3, c1, c2, c3); - r[6] = c1; - r[7] = c2; -} - -#undef mul_add -#undef mul -#undef sqr -#undef mul_add_c -#undef sqr_add_c -#undef mul_add_c2 -#undef sqr_add_c2 - -#endif // !NO_ASM && X86_64 && (__GNUC__ || __clang__) diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/asm/x86_64-gcc.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/bn/asm/x86_64-gcc.cc.inc new file mode 100644 index 00000000..920cba2d --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/bn/asm/x86_64-gcc.cc.inc @@ -0,0 +1,573 @@ +// Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* x86_64 BIGNUM accelerator version 0.1, December 2002. + * + * Implemented by Andy Polyakov for the OpenSSL + * project. + * + * Rights for redistribution and usage in source and binary forms are + * granted according to the License. Warranty of any kind is disclaimed. + * + * Q. Version 0.1? It doesn't sound like Andy, he used to assign real + * versions, like 1.0... + * A. Well, that's because this code is basically a quick-n-dirty + * proof-of-concept hack. As you can see it's implemented with + * inline assembler, which means that you're bound to GCC and that + * there might be enough room for further improvement. + * + * Q. Why inline assembler? + * A. x86_64 features own ABI which I'm not familiar with. This is + * why I decided to let the compiler take care of subroutine + * prologue/epilogue as well as register allocation. For reference. + * Win64 implements different ABI for AMD64, different from Linux. + * + * Q. How much faster does it get? + * A. 
'apps/openssl speed rsa dsa' output with no-asm: + * + * sign verify sign/s verify/s + * rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2 + * rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0 + * rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8 + * rsa 4096 bits 0.1155s 0.0018s 8.7 555.6 + * sign verify sign/s verify/s + * dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3 + * dsa 1024 bits 0.0014s 0.0018s 692.3 559.2 + * dsa 2048 bits 0.0049s 0.0061s 204.7 165.0 + * + * 'apps/openssl speed rsa dsa' output with this module: + * + * sign verify sign/s verify/s + * rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9 + * rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7 + * rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0 + * rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8 + * sign verify sign/s verify/s + * dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3 + * dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4 + * dsa 2048 bits 0.0016s 0.0020s 620.4 504.6 + * + * For the reference. IA-32 assembler implementation performs + * very much like 64-bit code compiled with no-asm on the same + * machine. + */ + +#include + +// TODO(davidben): Get this file working on MSVC x64. 
+#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \ + (defined(__GNUC__) || defined(__clang__)) + +#include "../internal.h" + + +using namespace bssl; + +#undef mul +#undef mul_add + +// "m"(a), "+m"(r) is the way to favor DirectPath µ-code; +#define mul_add(r, a, word, carry) \ + do { \ + BN_ULONG high, low; \ + __asm__("mulq %3" : "=a"(low), "=d"(high) : "a"(word), "m"(a) : "cc"); \ + __asm__("addq %2,%0; adcq $0,%1" \ + : "+r"(carry), "+d"(high) \ + : "a"(low) \ + : "cc"); \ + __asm__("addq %2,%0; adcq $0,%1" \ + : "+m"(r), "+d"(high) \ + : "r"(carry) \ + : "cc"); \ + (carry) = high; \ + } while (0) + +#define mul(r, a, word, carry) \ + do { \ + BN_ULONG high, low; \ + __asm__("mulq %3" : "=a"(low), "=d"(high) : "a"(word), "g"(a) : "cc"); \ + __asm__("addq %2,%0; adcq $0,%1" \ + : "+r"(carry), "+d"(high) \ + : "a"(low) \ + : "cc"); \ + (r) = (carry); \ + (carry) = high; \ + } while (0) + +// r0:r1:carry = r0:r1 + a^2 + carry:0 +#define sqr_add(r0, r1, a, carry) \ + do { \ + BN_ULONG high, low; \ + /* lo:hi = a^2 */ \ + __asm__("mulq %2" : "=a"(low), "=d"(high) : "a"(a) : "cc"); \ + /* carry:hi = lo:hi + carry:0 = a^2 + carry */ \ + __asm__("addq %2,%0; adcq $0,%1" \ + : "+r"(carry), "+d"(high) \ + : "a"(low) \ + : "cc"); \ + /* r0:r1:carry = carry:hi + r0:r1 */ \ + __asm__("addq %2,%0; adcq %3,%1; movq $0, %2; adcq $0, %2" \ + : "+m"(r0), "+m"(r1), "+r"(carry) \ + : "d"(high) \ + : "cc"); \ + } while (0) + +BN_ULONG bssl::bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, size_t num, + BN_ULONG w) { + BN_ULONG c1 = 0; + + if (num == 0) { + return (c1); + } + + while (num & ~3) { + mul_add(rp[0], ap[0], w, c1); + mul_add(rp[1], ap[1], w, c1); + mul_add(rp[2], ap[2], w, c1); + mul_add(rp[3], ap[3], w, c1); + ap += 4; + rp += 4; + num -= 4; + } + if (num) { + mul_add(rp[0], ap[0], w, c1); + if (--num == 0) { + return c1; + } + mul_add(rp[1], ap[1], w, c1); + if (--num == 0) { + return c1; + } + mul_add(rp[2], ap[2], w, c1); + return c1; + } + + return c1; 
+} + +BN_ULONG bssl::bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, size_t num, + BN_ULONG w) { + BN_ULONG c1 = 0; + + if (num == 0) { + return c1; + } + + while (num & ~3) { + mul(rp[0], ap[0], w, c1); + mul(rp[1], ap[1], w, c1); + mul(rp[2], ap[2], w, c1); + mul(rp[3], ap[3], w, c1); + ap += 4; + rp += 4; + num -= 4; + } + if (num) { + mul(rp[0], ap[0], w, c1); + if (--num == 0) { + return c1; + } + mul(rp[1], ap[1], w, c1); + if (--num == 0) { + return c1; + } + mul(rp[2], ap[2], w, c1); + } + return c1; +} + +void bssl::bn_sqr_add_words(BN_ULONG *r, const BN_ULONG *a, size_t n) { + if (n == 0) { + return; + } + + BN_ULONG carry = 0; + while (n & ~3) { + sqr_add(r[0], r[1], a[0], carry); + sqr_add(r[2], r[3], a[1], carry); + sqr_add(r[4], r[5], a[2], carry); + sqr_add(r[6], r[7], a[3], carry); + a += 4; + r += 8; + n -= 4; + } + if (n) { + sqr_add(r[0], r[1], a[0], carry); + if (--n == 0) { + return; + } + sqr_add(r[2], r[3], a[1], carry); + if (--n == 0) { + return; + } + sqr_add(r[4], r[5], a[2], carry); + } +} + +BN_ULONG bssl::bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, + const BN_ULONG *bp, size_t n) { + BN_ULONG ret; + size_t i = 0; + + if (n == 0) { + return 0; + } + + __asm__ volatile( + " subq %0,%0 \n" // clear carry + " jmp 1f \n" + ".p2align 4 \n" + "1:" + " movq (%4,%2,8),%0 \n" + " adcq (%5,%2,8),%0 \n" + " movq %0,(%3,%2,8) \n" + " lea 1(%2),%2 \n" + " dec %1 \n" + " jnz 1b \n" + " sbbq %0,%0 \n" + : "=&r"(ret), "+&c"(n), "+&r"(i) + : "r"(rp), "r"(ap), "r"(bp) + : "cc", "memory"); + + return ret & 1; +} + +BN_ULONG bssl::bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, + const BN_ULONG *bp, size_t n) { + BN_ULONG ret; + size_t i = 0; + + if (n == 0) { + return 0; + } + + __asm__ volatile( + " subq %0,%0 \n" // clear borrow + " jmp 1f \n" + ".p2align 4 \n" + "1:" + " movq (%4,%2,8),%0 \n" + " sbbq (%5,%2,8),%0 \n" + " movq %0,(%3,%2,8) \n" + " lea 1(%2),%2 \n" + " dec %1 \n" + " jnz 1b \n" + " sbbq %0,%0 \n" + : "=&r"(ret), "+&c"(n), "+&r"(i) + : 
"r"(rp), "r"(ap), "r"(bp) + : "cc", "memory"); + + return ret & 1; +} + +// mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) +// mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) +// sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) +// sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) + +// Keep in mind that carrying into high part of multiplication result can not +// overflow, because it cannot be all-ones. +#define mul_add_c(a, b, c0, c1, c2) \ + do { \ + BN_ULONG t1, t2; \ + __asm__("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc"); \ + __asm__("addq %3,%0; adcq %4,%1; adcq $0,%2" \ + : "+&r"(c0), "+r"(c1), "+r"(c2) \ + : "r"(t1), "r"(t2) \ + : "cc"); \ + } while (0) + +#define sqr_add_c(a, i, c0, c1, c2) \ + do { \ + BN_ULONG t1, t2; \ + __asm__("mulq %2" : "=a"(t1), "=d"(t2) : "a"((a)[i]) : "cc"); \ + __asm__("addq %3,%0; adcq %4,%1; adcq $0,%2" \ + : "+&r"(c0), "+r"(c1), "+r"(c2) \ + : "r"(t1), "r"(t2) \ + : "cc"); \ + } while (0) + +#define mul_add_c2(a, b, c0, c1, c2) \ + do { \ + BN_ULONG t1, t2; \ + __asm__("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc"); \ + __asm__("addq %3,%0; adcq %4,%1; adcq $0,%2" \ + : "+&r"(c0), "+r"(c1), "+r"(c2) \ + : "r"(t1), "r"(t2) \ + : "cc"); \ + __asm__("addq %3,%0; adcq %4,%1; adcq $0,%2" \ + : "+&r"(c0), "+r"(c1), "+r"(c2) \ + : "r"(t1), "r"(t2) \ + : "cc"); \ + } while (0) + +#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2) + +void bssl::bn_mul_comba8(BN_ULONG r[16], const BN_ULONG a[8], + const BN_ULONG b[8]) { + BN_ULONG c1, c2, c3; + + c1 = 0; + c2 = 0; + c3 = 0; + mul_add_c(a[0], b[0], c1, c2, c3); + r[0] = c1; + c1 = 0; + mul_add_c(a[0], b[1], c2, c3, c1); + mul_add_c(a[1], b[0], c2, c3, c1); + r[1] = c2; + c2 = 0; + mul_add_c(a[2], b[0], c3, c1, c2); + mul_add_c(a[1], b[1], c3, c1, c2); + mul_add_c(a[0], b[2], c3, c1, c2); + r[2] = c3; + c3 = 0; + mul_add_c(a[0], b[3], c1, c2, 
c3); + mul_add_c(a[1], b[2], c1, c2, c3); + mul_add_c(a[2], b[1], c1, c2, c3); + mul_add_c(a[3], b[0], c1, c2, c3); + r[3] = c1; + c1 = 0; + mul_add_c(a[4], b[0], c2, c3, c1); + mul_add_c(a[3], b[1], c2, c3, c1); + mul_add_c(a[2], b[2], c2, c3, c1); + mul_add_c(a[1], b[3], c2, c3, c1); + mul_add_c(a[0], b[4], c2, c3, c1); + r[4] = c2; + c2 = 0; + mul_add_c(a[0], b[5], c3, c1, c2); + mul_add_c(a[1], b[4], c3, c1, c2); + mul_add_c(a[2], b[3], c3, c1, c2); + mul_add_c(a[3], b[2], c3, c1, c2); + mul_add_c(a[4], b[1], c3, c1, c2); + mul_add_c(a[5], b[0], c3, c1, c2); + r[5] = c3; + c3 = 0; + mul_add_c(a[6], b[0], c1, c2, c3); + mul_add_c(a[5], b[1], c1, c2, c3); + mul_add_c(a[4], b[2], c1, c2, c3); + mul_add_c(a[3], b[3], c1, c2, c3); + mul_add_c(a[2], b[4], c1, c2, c3); + mul_add_c(a[1], b[5], c1, c2, c3); + mul_add_c(a[0], b[6], c1, c2, c3); + r[6] = c1; + c1 = 0; + mul_add_c(a[0], b[7], c2, c3, c1); + mul_add_c(a[1], b[6], c2, c3, c1); + mul_add_c(a[2], b[5], c2, c3, c1); + mul_add_c(a[3], b[4], c2, c3, c1); + mul_add_c(a[4], b[3], c2, c3, c1); + mul_add_c(a[5], b[2], c2, c3, c1); + mul_add_c(a[6], b[1], c2, c3, c1); + mul_add_c(a[7], b[0], c2, c3, c1); + r[7] = c2; + c2 = 0; + mul_add_c(a[7], b[1], c3, c1, c2); + mul_add_c(a[6], b[2], c3, c1, c2); + mul_add_c(a[5], b[3], c3, c1, c2); + mul_add_c(a[4], b[4], c3, c1, c2); + mul_add_c(a[3], b[5], c3, c1, c2); + mul_add_c(a[2], b[6], c3, c1, c2); + mul_add_c(a[1], b[7], c3, c1, c2); + r[8] = c3; + c3 = 0; + mul_add_c(a[2], b[7], c1, c2, c3); + mul_add_c(a[3], b[6], c1, c2, c3); + mul_add_c(a[4], b[5], c1, c2, c3); + mul_add_c(a[5], b[4], c1, c2, c3); + mul_add_c(a[6], b[3], c1, c2, c3); + mul_add_c(a[7], b[2], c1, c2, c3); + r[9] = c1; + c1 = 0; + mul_add_c(a[7], b[3], c2, c3, c1); + mul_add_c(a[6], b[4], c2, c3, c1); + mul_add_c(a[5], b[5], c2, c3, c1); + mul_add_c(a[4], b[6], c2, c3, c1); + mul_add_c(a[3], b[7], c2, c3, c1); + r[10] = c2; + c2 = 0; + mul_add_c(a[4], b[7], c3, c1, c2); + mul_add_c(a[5], b[6], c3, c1, 
c2); + mul_add_c(a[6], b[5], c3, c1, c2); + mul_add_c(a[7], b[4], c3, c1, c2); + r[11] = c3; + c3 = 0; + mul_add_c(a[7], b[5], c1, c2, c3); + mul_add_c(a[6], b[6], c1, c2, c3); + mul_add_c(a[5], b[7], c1, c2, c3); + r[12] = c1; + c1 = 0; + mul_add_c(a[6], b[7], c2, c3, c1); + mul_add_c(a[7], b[6], c2, c3, c1); + r[13] = c2; + c2 = 0; + mul_add_c(a[7], b[7], c3, c1, c2); + r[14] = c3; + r[15] = c1; +} + +void bssl::bn_mul_comba4(BN_ULONG r[8], const BN_ULONG a[4], + const BN_ULONG b[4]) { + BN_ULONG c1, c2, c3; + + c1 = 0; + c2 = 0; + c3 = 0; + mul_add_c(a[0], b[0], c1, c2, c3); + r[0] = c1; + c1 = 0; + mul_add_c(a[0], b[1], c2, c3, c1); + mul_add_c(a[1], b[0], c2, c3, c1); + r[1] = c2; + c2 = 0; + mul_add_c(a[2], b[0], c3, c1, c2); + mul_add_c(a[1], b[1], c3, c1, c2); + mul_add_c(a[0], b[2], c3, c1, c2); + r[2] = c3; + c3 = 0; + mul_add_c(a[0], b[3], c1, c2, c3); + mul_add_c(a[1], b[2], c1, c2, c3); + mul_add_c(a[2], b[1], c1, c2, c3); + mul_add_c(a[3], b[0], c1, c2, c3); + r[3] = c1; + c1 = 0; + mul_add_c(a[3], b[1], c2, c3, c1); + mul_add_c(a[2], b[2], c2, c3, c1); + mul_add_c(a[1], b[3], c2, c3, c1); + r[4] = c2; + c2 = 0; + mul_add_c(a[2], b[3], c3, c1, c2); + mul_add_c(a[3], b[2], c3, c1, c2); + r[5] = c3; + c3 = 0; + mul_add_c(a[3], b[3], c1, c2, c3); + r[6] = c1; + r[7] = c2; +} + +void bssl::bn_sqr_comba8(BN_ULONG r[16], const BN_ULONG a[8]) { + BN_ULONG c1, c2, c3; + + c1 = 0; + c2 = 0; + c3 = 0; + sqr_add_c(a, 0, c1, c2, c3); + r[0] = c1; + c1 = 0; + sqr_add_c2(a, 1, 0, c2, c3, c1); + r[1] = c2; + c2 = 0; + sqr_add_c(a, 1, c3, c1, c2); + sqr_add_c2(a, 2, 0, c3, c1, c2); + r[2] = c3; + c3 = 0; + sqr_add_c2(a, 3, 0, c1, c2, c3); + sqr_add_c2(a, 2, 1, c1, c2, c3); + r[3] = c1; + c1 = 0; + sqr_add_c(a, 2, c2, c3, c1); + sqr_add_c2(a, 3, 1, c2, c3, c1); + sqr_add_c2(a, 4, 0, c2, c3, c1); + r[4] = c2; + c2 = 0; + sqr_add_c2(a, 5, 0, c3, c1, c2); + sqr_add_c2(a, 4, 1, c3, c1, c2); + sqr_add_c2(a, 3, 2, c3, c1, c2); + r[5] = c3; + c3 = 0; + sqr_add_c(a, 3, c1, 
c2, c3); + sqr_add_c2(a, 4, 2, c1, c2, c3); + sqr_add_c2(a, 5, 1, c1, c2, c3); + sqr_add_c2(a, 6, 0, c1, c2, c3); + r[6] = c1; + c1 = 0; + sqr_add_c2(a, 7, 0, c2, c3, c1); + sqr_add_c2(a, 6, 1, c2, c3, c1); + sqr_add_c2(a, 5, 2, c2, c3, c1); + sqr_add_c2(a, 4, 3, c2, c3, c1); + r[7] = c2; + c2 = 0; + sqr_add_c(a, 4, c3, c1, c2); + sqr_add_c2(a, 5, 3, c3, c1, c2); + sqr_add_c2(a, 6, 2, c3, c1, c2); + sqr_add_c2(a, 7, 1, c3, c1, c2); + r[8] = c3; + c3 = 0; + sqr_add_c2(a, 7, 2, c1, c2, c3); + sqr_add_c2(a, 6, 3, c1, c2, c3); + sqr_add_c2(a, 5, 4, c1, c2, c3); + r[9] = c1; + c1 = 0; + sqr_add_c(a, 5, c2, c3, c1); + sqr_add_c2(a, 6, 4, c2, c3, c1); + sqr_add_c2(a, 7, 3, c2, c3, c1); + r[10] = c2; + c2 = 0; + sqr_add_c2(a, 7, 4, c3, c1, c2); + sqr_add_c2(a, 6, 5, c3, c1, c2); + r[11] = c3; + c3 = 0; + sqr_add_c(a, 6, c1, c2, c3); + sqr_add_c2(a, 7, 5, c1, c2, c3); + r[12] = c1; + c1 = 0; + sqr_add_c2(a, 7, 6, c2, c3, c1); + r[13] = c2; + c2 = 0; + sqr_add_c(a, 7, c3, c1, c2); + r[14] = c3; + r[15] = c1; +} + +void bssl::bn_sqr_comba4(BN_ULONG r[8], const BN_ULONG a[4]) { + BN_ULONG c1, c2, c3; + + c1 = 0; + c2 = 0; + c3 = 0; + sqr_add_c(a, 0, c1, c2, c3); + r[0] = c1; + c1 = 0; + sqr_add_c2(a, 1, 0, c2, c3, c1); + r[1] = c2; + c2 = 0; + sqr_add_c(a, 1, c3, c1, c2); + sqr_add_c2(a, 2, 0, c3, c1, c2); + r[2] = c3; + c3 = 0; + sqr_add_c2(a, 3, 0, c1, c2, c3); + sqr_add_c2(a, 2, 1, c1, c2, c3); + r[3] = c1; + c1 = 0; + sqr_add_c(a, 2, c2, c3, c1); + sqr_add_c2(a, 3, 1, c2, c3, c1); + r[4] = c2; + c2 = 0; + sqr_add_c2(a, 3, 2, c3, c1, c2); + r[5] = c3; + c3 = 0; + sqr_add_c(a, 3, c1, c2, c3); + r[6] = c1; + r[7] = c2; +} + +#undef mul_add +#undef mul +#undef sqr +#undef mul_add_c +#undef sqr_add_c +#undef mul_add_c2 +#undef sqr_add_c2 + +#endif // !NO_ASM && X86_64 && (__GNUC__ || __clang__) diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/bn.c b/third_party/boringssl/src/crypto/fipsmodule/bn/bn.c deleted file mode 100644 index 006e3eb7..00000000 --- 
a/third_party/boringssl/src/crypto/fipsmodule/bn/bn.c +++ /dev/null @@ -1,439 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. 
All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
*/ - -#include - -#include -#include -#include - -#include -#include - -#include "internal.h" -#include "../delocate.h" - - -BIGNUM *BN_new(void) { - BIGNUM *bn = OPENSSL_malloc(sizeof(BIGNUM)); - - if (bn == NULL) { - OPENSSL_PUT_ERROR(BN, ERR_R_MALLOC_FAILURE); - return NULL; - } - - OPENSSL_memset(bn, 0, sizeof(BIGNUM)); - bn->flags = BN_FLG_MALLOCED; - - return bn; -} - -void BN_init(BIGNUM *bn) { - OPENSSL_memset(bn, 0, sizeof(BIGNUM)); -} - -void BN_free(BIGNUM *bn) { - if (bn == NULL) { - return; - } - - if ((bn->flags & BN_FLG_STATIC_DATA) == 0) { - OPENSSL_free(bn->d); - } - - if (bn->flags & BN_FLG_MALLOCED) { - OPENSSL_free(bn); - } else { - bn->d = NULL; - } -} - -void BN_clear_free(BIGNUM *bn) { - BN_free(bn); -} - -BIGNUM *BN_dup(const BIGNUM *src) { - BIGNUM *copy; - - if (src == NULL) { - return NULL; - } - - copy = BN_new(); - if (copy == NULL) { - return NULL; - } - - if (!BN_copy(copy, src)) { - BN_free(copy); - return NULL; - } - - return copy; -} - -BIGNUM *BN_copy(BIGNUM *dest, const BIGNUM *src) { - if (src == dest) { - return dest; - } - - if (!bn_wexpand(dest, src->width)) { - return NULL; - } - - OPENSSL_memcpy(dest->d, src->d, sizeof(src->d[0]) * src->width); - - dest->width = src->width; - dest->neg = src->neg; - return dest; -} - -void BN_clear(BIGNUM *bn) { - if (bn->d != NULL) { - OPENSSL_memset(bn->d, 0, bn->dmax * sizeof(bn->d[0])); - } - - bn->width = 0; - bn->neg = 0; -} - -DEFINE_METHOD_FUNCTION(BIGNUM, BN_value_one) { - static const BN_ULONG kOneLimbs[1] = { 1 }; - out->d = (BN_ULONG*) kOneLimbs; - out->width = 1; - out->dmax = 1; - out->neg = 0; - out->flags = BN_FLG_STATIC_DATA; -} - -// BN_num_bits_word returns the minimum number of bits needed to represent the -// value in |l|. -unsigned BN_num_bits_word(BN_ULONG l) { - // |BN_num_bits| is often called on RSA prime factors. These have public bit - // lengths, but all bits beyond the high bit are secret, so count bits in - // constant time. 
- BN_ULONG x, mask; - int bits = (l != 0); - -#if BN_BITS2 > 32 - // Look at the upper half of |x|. |x| is at most 64 bits long. - x = l >> 32; - // Set |mask| to all ones if |x| (the top 32 bits of |l|) is non-zero and all - // all zeros otherwise. - mask = 0u - x; - mask = (0u - (mask >> (BN_BITS2 - 1))); - // If |x| is non-zero, the lower half is included in the bit count in full, - // and we count the upper half. Otherwise, we count the lower half. - bits += 32 & mask; - l ^= (x ^ l) & mask; // |l| is |x| if |mask| and remains |l| otherwise. -#endif - - // The remaining blocks are analogous iterations at lower powers of two. - x = l >> 16; - mask = 0u - x; - mask = (0u - (mask >> (BN_BITS2 - 1))); - bits += 16 & mask; - l ^= (x ^ l) & mask; - - x = l >> 8; - mask = 0u - x; - mask = (0u - (mask >> (BN_BITS2 - 1))); - bits += 8 & mask; - l ^= (x ^ l) & mask; - - x = l >> 4; - mask = 0u - x; - mask = (0u - (mask >> (BN_BITS2 - 1))); - bits += 4 & mask; - l ^= (x ^ l) & mask; - - x = l >> 2; - mask = 0u - x; - mask = (0u - (mask >> (BN_BITS2 - 1))); - bits += 2 & mask; - l ^= (x ^ l) & mask; - - x = l >> 1; - mask = 0u - x; - mask = (0u - (mask >> (BN_BITS2 - 1))); - bits += 1 & mask; - - return bits; -} - -unsigned BN_num_bits(const BIGNUM *bn) { - const int width = bn_minimal_width(bn); - if (width == 0) { - return 0; - } - - return (width - 1) * BN_BITS2 + BN_num_bits_word(bn->d[width - 1]); -} - -unsigned BN_num_bytes(const BIGNUM *bn) { - return (BN_num_bits(bn) + 7) / 8; -} - -void BN_zero(BIGNUM *bn) { - bn->width = bn->neg = 0; -} - -int BN_one(BIGNUM *bn) { - return BN_set_word(bn, 1); -} - -int BN_set_word(BIGNUM *bn, BN_ULONG value) { - if (value == 0) { - BN_zero(bn); - return 1; - } - - if (!bn_wexpand(bn, 1)) { - return 0; - } - - bn->neg = 0; - bn->d[0] = value; - bn->width = 1; - return 1; -} - -int BN_set_u64(BIGNUM *bn, uint64_t value) { -#if BN_BITS2 == 64 - return BN_set_word(bn, value); -#elif BN_BITS2 == 32 - if (value <= BN_MASK2) { - return 
BN_set_word(bn, (BN_ULONG)value); - } - - if (!bn_wexpand(bn, 2)) { - return 0; - } - - bn->neg = 0; - bn->d[0] = (BN_ULONG)value; - bn->d[1] = (BN_ULONG)(value >> 32); - bn->width = 2; - return 1; -#else -#error "BN_BITS2 must be 32 or 64." -#endif -} - -int bn_set_words(BIGNUM *bn, const BN_ULONG *words, size_t num) { - if (!bn_wexpand(bn, num)) { - return 0; - } - OPENSSL_memmove(bn->d, words, num * sizeof(BN_ULONG)); - // |bn_wexpand| verified that |num| isn't too large. - bn->width = (int)num; - bn->neg = 0; - return 1; -} - -void bn_set_static_words(BIGNUM *bn, const BN_ULONG *words, size_t num) { - if ((bn->flags & BN_FLG_STATIC_DATA) == 0) { - OPENSSL_free(bn->d); - } - bn->d = (BN_ULONG *)words; - - bn->width = num; - bn->dmax = num; - bn->neg = 0; - bn->flags |= BN_FLG_STATIC_DATA; -} - -int bn_fits_in_words(const BIGNUM *bn, size_t num) { - // All words beyond |num| must be zero. - BN_ULONG mask = 0; - for (size_t i = num; i < (size_t)bn->width; i++) { - mask |= bn->d[i]; - } - return mask == 0; -} - -int bn_copy_words(BN_ULONG *out, size_t num, const BIGNUM *bn) { - if (bn->neg) { - OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER); - return 0; - } - - size_t width = (size_t)bn->width; - if (width > num) { - if (!bn_fits_in_words(bn, num)) { - OPENSSL_PUT_ERROR(BN, BN_R_BIGNUM_TOO_LONG); - return 0; - } - width = num; - } - - OPENSSL_memset(out, 0, sizeof(BN_ULONG) * num); - OPENSSL_memcpy(out, bn->d, sizeof(BN_ULONG) * width); - return 1; -} - -int BN_is_negative(const BIGNUM *bn) { - return bn->neg != 0; -} - -void BN_set_negative(BIGNUM *bn, int sign) { - if (sign && !BN_is_zero(bn)) { - bn->neg = 1; - } else { - bn->neg = 0; - } -} - -int bn_wexpand(BIGNUM *bn, size_t words) { - BN_ULONG *a; - - if (words <= (size_t)bn->dmax) { - return 1; - } - - if (words > (INT_MAX / (4 * BN_BITS2))) { - OPENSSL_PUT_ERROR(BN, BN_R_BIGNUM_TOO_LONG); - return 0; - } - - if (bn->flags & BN_FLG_STATIC_DATA) { - OPENSSL_PUT_ERROR(BN, BN_R_EXPAND_ON_STATIC_BIGNUM_DATA); - 
return 0; - } - - a = OPENSSL_malloc(sizeof(BN_ULONG) * words); - if (a == NULL) { - OPENSSL_PUT_ERROR(BN, ERR_R_MALLOC_FAILURE); - return 0; - } - - OPENSSL_memcpy(a, bn->d, sizeof(BN_ULONG) * bn->width); - - OPENSSL_free(bn->d); - bn->d = a; - bn->dmax = (int)words; - - return 1; -} - -int bn_expand(BIGNUM *bn, size_t bits) { - if (bits + BN_BITS2 - 1 < bits) { - OPENSSL_PUT_ERROR(BN, BN_R_BIGNUM_TOO_LONG); - return 0; - } - return bn_wexpand(bn, (bits+BN_BITS2-1)/BN_BITS2); -} - -int bn_resize_words(BIGNUM *bn, size_t words) { -#if defined(OPENSSL_PPC64LE) - // This is a workaround for a miscompilation bug in Clang 7.0.1 on POWER. - // The unittests catch the miscompilation, if it occurs, and it manifests - // as a crash in |bn_fits_in_words|. - // - // The bug only triggers if building in FIPS mode and with -O3. Clang 8.0.1 - // has the same bug but this workaround is not effective there---I've not - // been able to find a workaround for 8.0.1. - // - // At the time of writing (2019-08-08), Clang git does *not* have this bug - // and does not need this workaroud. The current git version should go on to - // be Clang 10 thus, once we can depend on that, this can be removed. - if (value_barrier_w((size_t)bn->width == words)) { - return 1; - } -#endif - - if ((size_t)bn->width <= words) { - if (!bn_wexpand(bn, words)) { - return 0; - } - OPENSSL_memset(bn->d + bn->width, 0, - (words - bn->width) * sizeof(BN_ULONG)); - bn->width = words; - return 1; - } - - // All words beyond the new width must be zero. 
- if (!bn_fits_in_words(bn, words)) { - OPENSSL_PUT_ERROR(BN, BN_R_BIGNUM_TOO_LONG); - return 0; - } - bn->width = words; - return 1; -} - -void bn_select_words(BN_ULONG *r, BN_ULONG mask, const BN_ULONG *a, - const BN_ULONG *b, size_t num) { - for (size_t i = 0; i < num; i++) { - static_assert(sizeof(BN_ULONG) <= sizeof(crypto_word_t), - "crypto_word_t is too small"); - r[i] = constant_time_select_w(mask, a[i], b[i]); - } -} - -int bn_minimal_width(const BIGNUM *bn) { - int ret = bn->width; - while (ret > 0 && bn->d[ret - 1] == 0) { - ret--; - } - return ret; -} - -void bn_set_minimal_width(BIGNUM *bn) { - bn->width = bn_minimal_width(bn); - if (bn->width == 0) { - bn->neg = 0; - } -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/bn.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/bn/bn.cc.inc new file mode 100644 index 00000000..38dea250 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/bn/bn.cc.inc @@ -0,0 +1,377 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include + +#include +#include + +#include "../../mem_internal.h" +#include "../delocate.h" +#include "internal.h" + + +using namespace bssl; + +// BN_MAX_WORDS is the maximum number of words allowed in a |BIGNUM|. It is +// sized so byte and bit counts of a |BIGNUM| always fit in |int|, with room to +// spare. 
+#define BN_MAX_WORDS (INT_MAX / (4 * BN_BITS2)) + +BIGNUM *BN_new() { + BIGNUM *bn = New(); + + if (bn == nullptr) { + return nullptr; + } + + OPENSSL_memset(bn, 0, sizeof(BIGNUM)); + bn->flags = BN_FLG_MALLOCED; + + return bn; +} + +BIGNUM *BN_secure_new() { return BN_new(); } + +void BN_init(BIGNUM *bn) { OPENSSL_memset(bn, 0, sizeof(BIGNUM)); } + +void BN_free(BIGNUM *bn) { + if (bn == nullptr) { + return; + } + + if ((bn->flags & BN_FLG_STATIC_DATA) == 0) { + OPENSSL_free(bn->d); + } + + if (bn->flags & BN_FLG_MALLOCED) { + Delete(bn); + } else { + bn->d = nullptr; + } +} + +void BN_clear_free(BIGNUM *bn) { BN_free(bn); } + +BIGNUM *BN_dup(const BIGNUM *src) { + BIGNUM *copy; + + if (src == nullptr) { + return nullptr; + } + + copy = BN_new(); + if (copy == nullptr) { + return nullptr; + } + + if (!BN_copy(copy, src)) { + BN_free(copy); + return nullptr; + } + + return copy; +} + +BIGNUM *BN_copy(BIGNUM *dest, const BIGNUM *src) { + if (src == dest) { + return dest; + } + + if (!bn_wexpand(dest, src->width)) { + return nullptr; + } + + OPENSSL_memcpy(dest->d, src->d, sizeof(src->d[0]) * src->width); + + dest->width = src->width; + dest->neg = src->neg; + return dest; +} + +void BN_clear(BIGNUM *bn) { + if (bn->d != nullptr) { + OPENSSL_memset(bn->d, 0, bn->dmax * sizeof(bn->d[0])); + } + + bn->width = 0; + bn->neg = 0; +} + +DEFINE_METHOD_FUNCTION(BIGNUM, BN_value_one) { + static const BN_ULONG kOneLimbs[1] = {1}; + out->d = (BN_ULONG *)kOneLimbs; + out->width = 1; + out->dmax = 1; + out->neg = 0; + out->flags = BN_FLG_STATIC_DATA; +} + +// BN_num_bits_word returns the minimum number of bits needed to represent the +// value in |l|. +unsigned BN_num_bits_word(BN_ULONG l) { + // |BN_num_bits| is often called on RSA prime factors. These have public bit + // lengths, but all bits beyond the high bit are secret, so count bits in + // constant time. + BN_ULONG x, mask; + int bits = (l != 0); + +#if BN_BITS2 > 32 + // Look at the upper half of |x|. 
|x| is at most 64 bits long. + x = l >> 32; + // Set |mask| to all ones if |x| (the top 32 bits of |l|) is non-zero and all + // all zeros otherwise. + mask = 0u - x; + mask = (0u - (mask >> (BN_BITS2 - 1))); + // If |x| is non-zero, the lower half is included in the bit count in full, + // and we count the upper half. Otherwise, we count the lower half. + bits += 32 & mask; + l ^= (x ^ l) & mask; // |l| is |x| if |mask| and remains |l| otherwise. +#endif + + // The remaining blocks are analogous iterations at lower powers of two. + x = l >> 16; + mask = 0u - x; + mask = (0u - (mask >> (BN_BITS2 - 1))); + bits += 16 & mask; + l ^= (x ^ l) & mask; + + x = l >> 8; + mask = 0u - x; + mask = (0u - (mask >> (BN_BITS2 - 1))); + bits += 8 & mask; + l ^= (x ^ l) & mask; + + x = l >> 4; + mask = 0u - x; + mask = (0u - (mask >> (BN_BITS2 - 1))); + bits += 4 & mask; + l ^= (x ^ l) & mask; + + x = l >> 2; + mask = 0u - x; + mask = (0u - (mask >> (BN_BITS2 - 1))); + bits += 2 & mask; + l ^= (x ^ l) & mask; + + x = l >> 1; + mask = 0u - x; + mask = (0u - (mask >> (BN_BITS2 - 1))); + bits += 1 & mask; + + return bits; +} + +unsigned BN_num_bits(const BIGNUM *bn) { + const int width = bn_minimal_width(bn); + if (width == 0) { + return 0; + } + + return (width - 1) * BN_BITS2 + BN_num_bits_word(bn->d[width - 1]); +} + +unsigned BN_num_bytes(const BIGNUM *bn) { return (BN_num_bits(bn) + 7) / 8; } + +void BN_zero(BIGNUM *bn) { bn->width = bn->neg = 0; } + +int BN_one(BIGNUM *bn) { return BN_set_word(bn, 1); } + +int BN_set_word(BIGNUM *bn, BN_ULONG value) { + if (value == 0) { + BN_zero(bn); + return 1; + } + + if (!bn_wexpand(bn, 1)) { + return 0; + } + + bn->neg = 0; + bn->d[0] = value; + bn->width = 1; + return 1; +} + +int BN_set_u64(BIGNUM *bn, uint64_t value) { +#if BN_BITS2 == 64 + return BN_set_word(bn, value); +#elif BN_BITS2 == 32 + if (value <= BN_MASK2) { + return BN_set_word(bn, (BN_ULONG)value); + } + + if (!bn_wexpand(bn, 2)) { + return 0; + } + + bn->neg = 0; + 
bn->d[0] = (BN_ULONG)value; + bn->d[1] = (BN_ULONG)(value >> 32); + bn->width = 2; + return 1; +#else +#error "BN_BITS2 must be 32 or 64." +#endif +} + +int bssl::bn_set_words(BIGNUM *bn, const BN_ULONG *words, size_t num) { + if (!bn_wexpand(bn, num)) { + return 0; + } + OPENSSL_memmove(bn->d, words, num * sizeof(BN_ULONG)); + // |bn_wexpand| verified that |num| isn't too large. + bn->width = (int)num; + bn->neg = 0; + return 1; +} + +void bssl::bn_set_static_words(BIGNUM *bn, const BN_ULONG *words, size_t num) { + if ((bn->flags & BN_FLG_STATIC_DATA) == 0) { + OPENSSL_free(bn->d); + } + bn->d = (BN_ULONG *)words; + + assert(num <= BN_MAX_WORDS); + bn->width = (int)num; + bn->dmax = (int)num; + bn->neg = 0; + bn->flags |= BN_FLG_STATIC_DATA; +} + +int bssl::bn_fits_in_words(const BIGNUM *bn, size_t num) { + // All words beyond |num| must be zero. + BN_ULONG mask = 0; + for (size_t i = num; i < (size_t)bn->width; i++) { + mask |= bn->d[i]; + } + return mask == 0; +} + +int bssl::bn_copy_words(BN_ULONG *out, size_t num, const BIGNUM *bn) { + if (bn->neg) { + OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER); + return 0; + } + + size_t width = (size_t)bn->width; + if (width > num) { + if (!bn_fits_in_words(bn, num)) { + OPENSSL_PUT_ERROR(BN, BN_R_BIGNUM_TOO_LONG); + return 0; + } + width = num; + } + + OPENSSL_memset(out, 0, sizeof(BN_ULONG) * num); + OPENSSL_memcpy(out, bn->d, sizeof(BN_ULONG) * width); + return 1; +} + +int BN_is_negative(const BIGNUM *bn) { return bn->neg != 0; } + +void BN_set_negative(BIGNUM *bn, int sign) { + if (sign && !BN_is_zero(bn)) { + bn->neg = 1; + } else { + bn->neg = 0; + } +} + +int bssl::bn_wexpand(BIGNUM *bn, size_t words) { + BN_ULONG *a; + + if (words <= (size_t)bn->dmax) { + return 1; + } + + if (words > BN_MAX_WORDS) { + OPENSSL_PUT_ERROR(BN, BN_R_BIGNUM_TOO_LONG); + return 0; + } + + if (bn->flags & BN_FLG_STATIC_DATA) { + OPENSSL_PUT_ERROR(BN, BN_R_EXPAND_ON_STATIC_BIGNUM_DATA); + return 0; + } + + a = 
reinterpret_cast(OPENSSL_calloc(words, sizeof(BN_ULONG))); + if (a == nullptr) { + return 0; + } + + OPENSSL_memcpy(a, bn->d, sizeof(BN_ULONG) * bn->width); + + OPENSSL_free(bn->d); + bn->d = a; + bn->dmax = (int)words; + + return 1; +} + +int bssl::bn_expand(BIGNUM *bn, size_t bits) { + if (bits + BN_BITS2 - 1 < bits) { + OPENSSL_PUT_ERROR(BN, BN_R_BIGNUM_TOO_LONG); + return 0; + } + return bn_wexpand(bn, (bits + BN_BITS2 - 1) / BN_BITS2); +} + +int bssl::bn_resize_words(BIGNUM *bn, size_t words) { + if ((size_t)bn->width <= words) { + if (!bn_wexpand(bn, words)) { + return 0; + } + OPENSSL_memset(bn->d + bn->width, 0, + (words - bn->width) * sizeof(BN_ULONG)); + bn->width = (int)words; + return 1; + } + + // All words beyond the new width must be zero. + if (!bn_fits_in_words(bn, words)) { + OPENSSL_PUT_ERROR(BN, BN_R_BIGNUM_TOO_LONG); + return 0; + } + bn->width = (int)words; + return 1; +} + +void bssl::bn_select_words(BN_ULONG *r, BN_ULONG mask, const BN_ULONG *a, + const BN_ULONG *b, size_t num) { + for (size_t i = 0; i < num; i++) { + static_assert(sizeof(BN_ULONG) <= sizeof(crypto_word_t), + "crypto_word_t is too small"); + r[i] = constant_time_select_w(mask, a[i], b[i]); + } +} + +int bssl::bn_minimal_width(const BIGNUM *bn) { + int ret = bn->width; + while (ret > 0 && bn->d[ret - 1] == 0) { + ret--; + } + return ret; +} + +void bssl::bn_set_minimal_width(BIGNUM *bn) { + bn->width = bn_minimal_width(bn); + if (bn->width == 0) { + bn->neg = 0; + } +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/bytes.c b/third_party/boringssl/src/crypto/fipsmodule/bn/bytes.c deleted file mode 100644 index 38d71a3c..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/bn/bytes.c +++ /dev/null @@ -1,246 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. 
- * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. 
If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include -#include - -#include "internal.h" - -void bn_big_endian_to_words(BN_ULONG *out, size_t out_len, const uint8_t *in, - size_t in_len) { - for (size_t i = 0; i < out_len; i++) { - if (in_len < sizeof(BN_ULONG)) { - // Load the last partial word. - BN_ULONG word = 0; - for (size_t j = 0; j < in_len; j++) { - word = (word << 8) | in[j]; - } - in_len = 0; - out[i] = word; - // Fill the remainder with zeros. - OPENSSL_memset(out + i + 1, 0, (out_len - i - 1) * sizeof(BN_ULONG)); - break; - } - - in_len -= sizeof(BN_ULONG); - out[i] = CRYPTO_load_word_be(in + in_len); - } - - // The caller should have sized the output to avoid truncation. 
- assert(in_len == 0); -} - -BIGNUM *BN_bin2bn(const uint8_t *in, size_t len, BIGNUM *ret) { - BIGNUM *bn = NULL; - if (ret == NULL) { - bn = BN_new(); - if (bn == NULL) { - return NULL; - } - ret = bn; - } - - if (len == 0) { - ret->width = 0; - return ret; - } - - size_t num_words = ((len - 1) / BN_BYTES) + 1; - if (!bn_wexpand(ret, num_words)) { - BN_free(bn); - return NULL; - } - - // |bn_wexpand| must check bounds on |num_words| to write it into - // |ret->dmax|. - assert(num_words <= INT_MAX); - ret->width = (int)num_words; - ret->neg = 0; - - bn_big_endian_to_words(ret->d, ret->width, in, len); - return ret; -} - -BIGNUM *BN_le2bn(const uint8_t *in, size_t len, BIGNUM *ret) { - BIGNUM *bn = NULL; - if (ret == NULL) { - bn = BN_new(); - if (bn == NULL) { - return NULL; - } - ret = bn; - } - - if (len == 0) { - ret->width = 0; - ret->neg = 0; - return ret; - } - - // Reserve enough space in |ret|. - size_t num_words = ((len - 1) / BN_BYTES) + 1; - if (!bn_wexpand(ret, num_words)) { - BN_free(bn); - return NULL; - } - ret->width = num_words; - - // Make sure the top bytes will be zeroed. - ret->d[num_words - 1] = 0; - - // We only support little-endian platforms, so we can simply memcpy the - // internal representation. - OPENSSL_memcpy(ret->d, in, len); - return ret; -} - -// fits_in_bytes returns one if the |num_words| words in |words| can be -// represented in |num_bytes| bytes. -static int fits_in_bytes(const BN_ULONG *words, size_t num_words, - size_t num_bytes) { - const uint8_t *bytes = (const uint8_t *)words; - size_t tot_bytes = num_words * sizeof(BN_ULONG); - uint8_t mask = 0; - for (size_t i = num_bytes; i < tot_bytes; i++) { - mask |= bytes[i]; - } - return mask == 0; -} - -void bn_words_to_big_endian(uint8_t *out, size_t out_len, const BN_ULONG *in, - size_t in_len) { - // The caller should have selected an output length without truncation. 
- assert(fits_in_bytes(in, in_len, out_len)); - - // We only support little-endian platforms, so the internal representation is - // also little-endian as bytes. We can simply copy it in reverse. - const uint8_t *bytes = (const uint8_t *)in; - size_t num_bytes = in_len * sizeof(BN_ULONG); - if (out_len < num_bytes) { - num_bytes = out_len; - } - - for (size_t i = 0; i < num_bytes; i++) { - out[out_len - i - 1] = bytes[i]; - } - // Pad out the rest of the buffer with zeroes. - OPENSSL_memset(out, 0, out_len - num_bytes); -} - -size_t BN_bn2bin(const BIGNUM *in, uint8_t *out) { - size_t n = BN_num_bytes(in); - bn_words_to_big_endian(out, n, in->d, in->width); - return n; -} - -int BN_bn2le_padded(uint8_t *out, size_t len, const BIGNUM *in) { - if (!fits_in_bytes(in->d, in->width, len)) { - return 0; - } - - // We only support little-endian platforms, so we can simply memcpy into the - // internal representation. - const uint8_t *bytes = (const uint8_t *)in->d; - size_t num_bytes = in->width * BN_BYTES; - if (len < num_bytes) { - num_bytes = len; - } - - OPENSSL_memcpy(out, bytes, num_bytes); - // Pad out the rest of the buffer with zeroes. 
- OPENSSL_memset(out + num_bytes, 0, len - num_bytes); - return 1; -} - -int BN_bn2bin_padded(uint8_t *out, size_t len, const BIGNUM *in) { - if (!fits_in_bytes(in->d, in->width, len)) { - return 0; - } - - bn_words_to_big_endian(out, len, in->d, in->width); - return 1; -} - -BN_ULONG BN_get_word(const BIGNUM *bn) { - switch (bn_minimal_width(bn)) { - case 0: - return 0; - case 1: - return bn->d[0]; - default: - return BN_MASK2; - } -} - -int BN_get_u64(const BIGNUM *bn, uint64_t *out) { - switch (bn_minimal_width(bn)) { - case 0: - *out = 0; - return 1; - case 1: - *out = bn->d[0]; - return 1; -#if defined(OPENSSL_32_BIT) - case 2: - *out = (uint64_t) bn->d[0] | (((uint64_t) bn->d[1]) << 32); - return 1; -#endif - default: - return 0; - } -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/bytes.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/bn/bytes.cc.inc new file mode 100644 index 00000000..4d3fadaf --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/bn/bytes.cc.inc @@ -0,0 +1,230 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + +#include "internal.h" + + +using namespace bssl; + +void bssl::bn_big_endian_to_words(BN_ULONG *out, size_t out_len, + const uint8_t *in, size_t in_len) { + // The caller should have sized |out| to fit |in| without truncating. 
This + // condition ensures we do not overflow |out|, so use a runtime check. + BSSL_CHECK(in_len <= out_len * sizeof(BN_ULONG)); + + // Load whole words. + while (in_len >= sizeof(BN_ULONG)) { + in_len -= sizeof(BN_ULONG); + out[0] = CRYPTO_load_word_be(in + in_len); + out++; + out_len--; + } + + // Load the last partial word. + if (in_len != 0) { + BN_ULONG word = 0; + for (size_t i = 0; i < in_len; i++) { + word = (word << 8) | in[i]; + } + out[0] = word; + out++; + out_len--; + } + + // Fill the remainder with zeros. + OPENSSL_memset(out, 0, out_len * sizeof(BN_ULONG)); +} + +BIGNUM *BN_bin2bn(const uint8_t *in, size_t len, BIGNUM *ret) { + BIGNUM *bn = nullptr; + if (ret == nullptr) { + bn = BN_new(); + if (bn == nullptr) { + return nullptr; + } + ret = bn; + } + + if (len == 0) { + ret->width = 0; + ret->neg = 0; + return ret; + } + + size_t num_words = ((len - 1) / BN_BYTES) + 1; + if (!bn_wexpand(ret, num_words)) { + BN_free(bn); + return nullptr; + } + + // |bn_wexpand| must check bounds on |num_words| to write it into + // |ret->dmax|. + assert(num_words <= INT_MAX); + ret->width = (int)num_words; + ret->neg = 0; + + bn_big_endian_to_words(ret->d, ret->width, in, len); + return ret; +} + +BIGNUM *BN_lebin2bn(const uint8_t *in, size_t len, BIGNUM *ret) { + BIGNUM *bn = nullptr; + if (ret == nullptr) { + bn = BN_new(); + if (bn == nullptr) { + return nullptr; + } + ret = bn; + } + + if (len == 0) { + ret->width = 0; + ret->neg = 0; + return ret; + } + + // Reserve enough space in |ret|. + size_t num_words = ((len - 1) / BN_BYTES) + 1; + if (!bn_wexpand(ret, num_words)) { + BN_free(bn); + return nullptr; + } + ret->width = (int)num_words; + ret->neg = 0; + + // Make sure the top bytes will be zeroed. + ret->d[num_words - 1] = 0; + + // We only support little-endian platforms, so we can simply memcpy the + // internal representation. 
+ OPENSSL_memcpy(ret->d, in, len); + return ret; +} + +BIGNUM *BN_le2bn(const uint8_t *in, size_t len, BIGNUM *ret) { + return BN_lebin2bn(in, len, ret); +} + +// fits_in_bytes returns one if the |num_words| words in |words| can be +// represented in |num_bytes| bytes. +static int fits_in_bytes(const BN_ULONG *words, size_t num_words, + size_t num_bytes) { + const uint8_t *bytes = (const uint8_t *)words; + size_t tot_bytes = num_words * sizeof(BN_ULONG); + uint8_t mask = 0; + for (size_t i = num_bytes; i < tot_bytes; i++) { + mask |= bytes[i]; + } + return mask == 0; +} + +void bssl::bn_assert_fits_in_bytes(const BIGNUM *bn, size_t num) { + const uint8_t *bytes = (const uint8_t *)bn->d; + size_t tot_bytes = bn->width * sizeof(BN_ULONG); + if (tot_bytes > num) { + CONSTTIME_DECLASSIFY(bytes + num, tot_bytes - num); + for (size_t i = num; i < tot_bytes; i++) { + assert(bytes[i] == 0); + } + (void)bytes; + } +} + +void bssl::bn_words_to_big_endian(uint8_t *out, size_t out_len, + const BN_ULONG *in, size_t in_len) { + // The caller should have selected an output length without truncation. + declassify_assert(fits_in_bytes(in, in_len, out_len)); + + // We only support little-endian platforms, so the internal representation is + // also little-endian as bytes. We can simply copy it in reverse. + const uint8_t *bytes = (const uint8_t *)in; + size_t num_bytes = in_len * sizeof(BN_ULONG); + if (out_len < num_bytes) { + num_bytes = out_len; + } + + for (size_t i = 0; i < num_bytes; i++) { + out[out_len - i - 1] = bytes[i]; + } + // Pad out the rest of the buffer with zeroes. 
+ OPENSSL_memset(out, 0, out_len - num_bytes); +} + +size_t BN_bn2bin(const BIGNUM *in, uint8_t *out) { + size_t n = BN_num_bytes(in); + bn_words_to_big_endian(out, n, in->d, in->width); + return n; +} + +int BN_bn2le_padded(uint8_t *out, size_t len, const BIGNUM *in) { + if (!fits_in_bytes(in->d, in->width, len)) { + return 0; + } + + // We only support little-endian platforms, so we can simply memcpy into the + // internal representation. + const uint8_t *bytes = (const uint8_t *)in->d; + size_t num_bytes = in->width * BN_BYTES; + if (len < num_bytes) { + num_bytes = len; + } + + OPENSSL_memcpy(out, bytes, num_bytes); + // Pad out the rest of the buffer with zeroes. + OPENSSL_memset(out + num_bytes, 0, len - num_bytes); + return 1; +} + +int BN_bn2bin_padded(uint8_t *out, size_t len, const BIGNUM *in) { + if (!fits_in_bytes(in->d, in->width, len)) { + return 0; + } + + bn_words_to_big_endian(out, len, in->d, in->width); + return 1; +} + +BN_ULONG BN_get_word(const BIGNUM *bn) { + switch (bn_minimal_width(bn)) { + case 0: + return 0; + case 1: + return bn->d[0]; + default: + return BN_MASK2; + } +} + +int BN_get_u64(const BIGNUM *bn, uint64_t *out) { + switch (bn_minimal_width(bn)) { + case 0: + *out = 0; + return 1; + case 1: + *out = bn->d[0]; + return 1; +#if defined(OPENSSL_32_BIT) + case 2: + *out = (uint64_t) bn->d[0] | (((uint64_t) bn->d[1]) << 32); + return 1; +#endif + default: + return 0; + } +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/cmp.c b/third_party/boringssl/src/crypto/fipsmodule/bn/cmp.c deleted file mode 100644 index 84456a21..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/bn/cmp.c +++ /dev/null @@ -1,201 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. 
- * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. 
If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include - -#include - -#include "internal.h" -#include "../../internal.h" - - -static int bn_cmp_words_consttime(const BN_ULONG *a, size_t a_len, - const BN_ULONG *b, size_t b_len) { - static_assert(sizeof(BN_ULONG) <= sizeof(crypto_word_t), - "crypto_word_t is too small"); - int ret = 0; - // Process the common words in little-endian order. - size_t min = a_len < b_len ? a_len : b_len; - for (size_t i = 0; i < min; i++) { - crypto_word_t eq = constant_time_eq_w(a[i], b[i]); - crypto_word_t lt = constant_time_lt_w(a[i], b[i]); - ret = - constant_time_select_int(eq, ret, constant_time_select_int(lt, -1, 1)); - } - - // If |a| or |b| has non-zero words beyond |min|, they take precedence. 
- if (a_len < b_len) { - crypto_word_t mask = 0; - for (size_t i = a_len; i < b_len; i++) { - mask |= b[i]; - } - ret = constant_time_select_int(constant_time_is_zero_w(mask), ret, -1); - } else if (b_len < a_len) { - crypto_word_t mask = 0; - for (size_t i = b_len; i < a_len; i++) { - mask |= a[i]; - } - ret = constant_time_select_int(constant_time_is_zero_w(mask), ret, 1); - } - - return ret; -} - -int BN_ucmp(const BIGNUM *a, const BIGNUM *b) { - return bn_cmp_words_consttime(a->d, a->width, b->d, b->width); -} - -int BN_cmp(const BIGNUM *a, const BIGNUM *b) { - if ((a == NULL) || (b == NULL)) { - if (a != NULL) { - return -1; - } else if (b != NULL) { - return 1; - } else { - return 0; - } - } - - // We do not attempt to process the sign bit in constant time. Negative - // |BIGNUM|s should never occur in crypto, only calculators. - if (a->neg != b->neg) { - if (a->neg) { - return -1; - } - return 1; - } - - int ret = BN_ucmp(a, b); - return a->neg ? -ret : ret; -} - -int bn_less_than_words(const BN_ULONG *a, const BN_ULONG *b, size_t len) { - return bn_cmp_words_consttime(a, len, b, len) < 0; -} - -int BN_abs_is_word(const BIGNUM *bn, BN_ULONG w) { - if (bn->width == 0) { - return w == 0; - } - BN_ULONG mask = bn->d[0] ^ w; - for (int i = 1; i < bn->width; i++) { - mask |= bn->d[i]; - } - return mask == 0; -} - -int BN_cmp_word(const BIGNUM *a, BN_ULONG b) { - BIGNUM b_bn; - BN_init(&b_bn); - - b_bn.d = &b; - b_bn.width = b > 0; - b_bn.dmax = 1; - b_bn.flags = BN_FLG_STATIC_DATA; - return BN_cmp(a, &b_bn); -} - -int BN_is_zero(const BIGNUM *bn) { - return bn_fits_in_words(bn, 0); -} - -int BN_is_one(const BIGNUM *bn) { - return bn->neg == 0 && BN_abs_is_word(bn, 1); -} - -int BN_is_word(const BIGNUM *bn, BN_ULONG w) { - return BN_abs_is_word(bn, w) && (w == 0 || bn->neg == 0); -} - -int BN_is_odd(const BIGNUM *bn) { - return bn->width > 0 && (bn->d[0] & 1) == 1; -} - -int BN_is_pow2(const BIGNUM *bn) { - int width = bn_minimal_width(bn); - if (width == 0 || 
bn->neg) { - return 0; - } - - for (int i = 0; i < width - 1; i++) { - if (bn->d[i] != 0) { - return 0; - } - } - - return 0 == (bn->d[width-1] & (bn->d[width-1] - 1)); -} - -int BN_equal_consttime(const BIGNUM *a, const BIGNUM *b) { - BN_ULONG mask = 0; - // If |a| or |b| has more words than the other, all those words must be zero. - for (int i = a->width; i < b->width; i++) { - mask |= b->d[i]; - } - for (int i = b->width; i < a->width; i++) { - mask |= a->d[i]; - } - // Common words must match. - int min = a->width < b->width ? a->width : b->width; - for (int i = 0; i < min; i++) { - mask |= (a->d[i] ^ b->d[i]); - } - // The sign bit must match. - mask |= (a->neg ^ b->neg); - return mask == 0; -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/cmp.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/bn/cmp.cc.inc new file mode 100644 index 00000000..aed2de17 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/bn/cmp.cc.inc @@ -0,0 +1,161 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include + +#include + +#include "internal.h" +#include "../../internal.h" + + +using namespace bssl; + +static int bn_cmp_words_consttime(const BN_ULONG *a, size_t a_len, + const BN_ULONG *b, size_t b_len) { + static_assert(sizeof(BN_ULONG) <= sizeof(crypto_word_t), + "crypto_word_t is too small"); + int ret = 0; + // Process the common words in little-endian order. + size_t min = a_len < b_len ? a_len : b_len; + for (size_t i = 0; i < min; i++) { + crypto_word_t eq = constant_time_eq_w(a[i], b[i]); + crypto_word_t lt = constant_time_lt_w(a[i], b[i]); + ret = + constant_time_select_int(eq, ret, constant_time_select_int(lt, -1, 1)); + } + + // If |a| or |b| has non-zero words beyond |min|, they take precedence. + if (a_len < b_len) { + crypto_word_t mask = 0; + for (size_t i = a_len; i < b_len; i++) { + mask |= b[i]; + } + ret = constant_time_select_int(constant_time_is_zero_w(mask), ret, -1); + } else if (b_len < a_len) { + crypto_word_t mask = 0; + for (size_t i = b_len; i < a_len; i++) { + mask |= a[i]; + } + ret = constant_time_select_int(constant_time_is_zero_w(mask), ret, 1); + } + + return ret; +} + +int BN_ucmp(const BIGNUM *a, const BIGNUM *b) { + return bn_cmp_words_consttime(a->d, a->width, b->d, b->width); +} + +int BN_cmp(const BIGNUM *a, const BIGNUM *b) { + if ((a == nullptr) || (b == nullptr)) { + if (a != nullptr) { + return -1; + } else if (b != nullptr) { + return 1; + } else { + return 0; + } + } + + // We do not attempt to process the sign bit in constant time. Negative + // |BIGNUM|s should never occur in crypto, only calculators. + if (a->neg != b->neg) { + if (a->neg) { + return -1; + } + return 1; + } + + int ret = BN_ucmp(a, b); + return a->neg ? 
-ret : ret; +} + +int bssl::bn_less_than_words(const BN_ULONG *a, const BN_ULONG *b, size_t len) { + return bn_cmp_words_consttime(a, len, b, len) < 0; +} + +int BN_abs_is_word(const BIGNUM *bn, BN_ULONG w) { + if (bn->width == 0) { + return w == 0; + } + BN_ULONG mask = bn->d[0] ^ w; + for (int i = 1; i < bn->width; i++) { + mask |= bn->d[i]; + } + return mask == 0; +} + +int BN_cmp_word(const BIGNUM *a, BN_ULONG b) { + BIGNUM b_bn; + BN_init(&b_bn); + + b_bn.d = &b; + b_bn.width = b > 0; + b_bn.dmax = 1; + b_bn.flags = BN_FLG_STATIC_DATA; + return BN_cmp(a, &b_bn); +} + +int BN_is_zero(const BIGNUM *bn) { + return bn_fits_in_words(bn, 0); +} + +int BN_is_one(const BIGNUM *bn) { + return bn->neg == 0 && BN_abs_is_word(bn, 1); +} + +int BN_is_word(const BIGNUM *bn, BN_ULONG w) { + return BN_abs_is_word(bn, w) && (w == 0 || bn->neg == 0); +} + +int BN_is_odd(const BIGNUM *bn) { + return bn->width > 0 && (bn->d[0] & 1) == 1; +} + +int BN_is_pow2(const BIGNUM *bn) { + int width = bn_minimal_width(bn); + if (width == 0 || bn->neg) { + return 0; + } + + for (int i = 0; i < width - 1; i++) { + if (bn->d[i] != 0) { + return 0; + } + } + + return 0 == (bn->d[width-1] & (bn->d[width-1] - 1)); +} + +int BN_equal_consttime(const BIGNUM *a, const BIGNUM *b) { + BN_ULONG mask = 0; + // If |a| or |b| has more words than the other, all those words must be zero. + for (int i = a->width; i < b->width; i++) { + mask |= b->d[i]; + } + for (int i = b->width; i < a->width; i++) { + mask |= a->d[i]; + } + // Common words must match. + int min = a->width < b->width ? a->width : b->width; + for (int i = 0; i < min; i++) { + mask |= (a->d[i] ^ b->d[i]); + } + // The sign bit must match. 
+ mask |= (a->neg ^ b->neg); + return mask == 0; +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/ctx.c b/third_party/boringssl/src/crypto/fipsmodule/bn/ctx.c deleted file mode 100644 index f8c7ebfa..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/bn/ctx.c +++ /dev/null @@ -1,236 +0,0 @@ -/* Written by Ulf Moeller for the OpenSSL project. */ -/* ==================================================================== - * Copyright (c) 1998-2004 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. 
Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). */ - - -#include - -#include -#include - -#include -#include - -#include "../../internal.h" - - -// The stack frame info is resizing, set a first-time expansion size; -#define BN_CTX_START_FRAMES 32 - - -// BN_STACK - -// A |BN_STACK| is a stack of |size_t| values. -typedef struct { - // Array of indexes into |ctx->bignums|. 
- size_t *indexes; - // Number of stack frames, and the size of the allocated array - size_t depth, size; -} BN_STACK; - -static void BN_STACK_init(BN_STACK *); -static void BN_STACK_cleanup(BN_STACK *); -static int BN_STACK_push(BN_STACK *, size_t idx); -static size_t BN_STACK_pop(BN_STACK *); - - -// BN_CTX - -DEFINE_STACK_OF(BIGNUM) - -// The opaque BN_CTX type -struct bignum_ctx { - // bignums is the stack of |BIGNUM|s managed by this |BN_CTX|. - STACK_OF(BIGNUM) *bignums; - // stack is the stack of |BN_CTX_start| frames. It is the value of |used| at - // the time |BN_CTX_start| was called. - BN_STACK stack; - // used is the number of |BIGNUM|s from |bignums| that have been used. - size_t used; - // error is one if any operation on this |BN_CTX| failed. All subsequent - // operations will fail. - char error; - // defer_error is one if an operation on this |BN_CTX| has failed, but no - // error has been pushed to the queue yet. This is used to defer errors from - // |BN_CTX_start| to |BN_CTX_get|. - char defer_error; -}; - -BN_CTX *BN_CTX_new(void) { - BN_CTX *ret = OPENSSL_malloc(sizeof(BN_CTX)); - if (!ret) { - OPENSSL_PUT_ERROR(BN, ERR_R_MALLOC_FAILURE); - return NULL; - } - - // Initialise the structure - ret->bignums = NULL; - BN_STACK_init(&ret->stack); - ret->used = 0; - ret->error = 0; - ret->defer_error = 0; - return ret; -} - -void BN_CTX_free(BN_CTX *ctx) { - if (ctx == NULL) { - return; - } - - // All |BN_CTX_start| calls must be matched with |BN_CTX_end|, otherwise the - // function may use more memory than expected, potentially without bound if - // done in a loop. Assert that all |BIGNUM|s have been released. - assert(ctx->used == 0 || ctx->error); - sk_BIGNUM_pop_free(ctx->bignums, BN_free); - BN_STACK_cleanup(&ctx->stack); - OPENSSL_free(ctx); -} - -void BN_CTX_start(BN_CTX *ctx) { - if (ctx->error) { - // Once an operation has failed, |ctx->stack| no longer matches the number - // of |BN_CTX_end| calls to come. Do nothing. 
- return; - } - - if (!BN_STACK_push(&ctx->stack, ctx->used)) { - ctx->error = 1; - // |BN_CTX_start| cannot fail, so defer the error to |BN_CTX_get|. - ctx->defer_error = 1; - } -} - -BIGNUM *BN_CTX_get(BN_CTX *ctx) { - // Once any operation has failed, they all do. - if (ctx->error) { - if (ctx->defer_error) { - OPENSSL_PUT_ERROR(BN, BN_R_TOO_MANY_TEMPORARY_VARIABLES); - ctx->defer_error = 0; - } - return NULL; - } - - if (ctx->bignums == NULL) { - ctx->bignums = sk_BIGNUM_new_null(); - if (ctx->bignums == NULL) { - OPENSSL_PUT_ERROR(BN, ERR_R_MALLOC_FAILURE); - ctx->error = 1; - return NULL; - } - } - - if (ctx->used == sk_BIGNUM_num(ctx->bignums)) { - BIGNUM *bn = BN_new(); - if (bn == NULL || !sk_BIGNUM_push(ctx->bignums, bn)) { - OPENSSL_PUT_ERROR(BN, BN_R_TOO_MANY_TEMPORARY_VARIABLES); - BN_free(bn); - ctx->error = 1; - return NULL; - } - } - - BIGNUM *ret = sk_BIGNUM_value(ctx->bignums, ctx->used); - BN_zero(ret); - // This is bounded by |sk_BIGNUM_num|, so it cannot overflow. - ctx->used++; - return ret; -} - -void BN_CTX_end(BN_CTX *ctx) { - if (ctx->error) { - // Once an operation has failed, |ctx->stack| no longer matches the number - // of |BN_CTX_end| calls to come. Do nothing. - return; - } - - ctx->used = BN_STACK_pop(&ctx->stack); -} - - -// BN_STACK - -static void BN_STACK_init(BN_STACK *st) { - st->indexes = NULL; - st->depth = st->size = 0; -} - -static void BN_STACK_cleanup(BN_STACK *st) { - OPENSSL_free(st->indexes); -} - -static int BN_STACK_push(BN_STACK *st, size_t idx) { - if (st->depth == st->size) { - // This function intentionally does not push to the error queue on error. - // Error-reporting is deferred to |BN_CTX_get|. - size_t new_size = st->size != 0 ? 
st->size * 3 / 2 : BN_CTX_START_FRAMES; - if (new_size <= st->size || new_size > ((size_t)-1) / sizeof(size_t)) { - return 0; - } - size_t *new_indexes = - OPENSSL_realloc(st->indexes, new_size * sizeof(size_t)); - if (new_indexes == NULL) { - return 0; - } - st->indexes = new_indexes; - st->size = new_size; - } - - st->indexes[st->depth] = idx; - st->depth++; - return 1; -} - -static size_t BN_STACK_pop(BN_STACK *st) { - assert(st->depth > 0); - st->depth--; - return st->indexes[st->depth]; -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/ctx.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/bn/ctx.cc.inc new file mode 100644 index 00000000..1d67281c --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/bn/ctx.cc.inc @@ -0,0 +1,127 @@ +// Copyright 2000-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + +#include + +#include +#include + +#include "../../internal.h" +#include "../../mem_internal.h" + + +using namespace bssl; + +DECLARE_OPAQUE_STRUCT(bignum_ctx, BignumCtx) + +BSSL_NAMESPACE_BEGIN + +class BignumCtx : public bignum_ctx { + public: + ~BignumCtx() { + // All |BN_CTX_start| calls must be matched with |BN_CTX_end|, otherwise the + // function may use more memory than expected, potentially without bound if + // done in a loop. Assert that all |BIGNUM|s have been released. 
+ assert(used_ == 0 || error_); + } + + // bignums_ is the stack of |BIGNUM|s managed by this |BN_CTX|. + Vector> bignums_; + // stack_ is the stack of |BN_CTX_start| frames. It is the value of |used_| at + // the time |BN_CTX_start| was called. + Vector stack_; + // used_ is the number of |BIGNUM|s from |bignums_| that have been used. + size_t used_ = 0; + // error_ is whether any operation on this |BN_CTX| failed. All subsequent + // operations will fail. + bool error_ = false; + // defer_error_ is whether an operation on this |BN_CTX| has failed, but no + // error has been pushed to the queue yet. This is used to defer errors from + // |BN_CTX_start| to |BN_CTX_get|. + bool defer_error_ = false; +}; + +BSSL_NAMESPACE_END + +BN_CTX *BN_CTX_new() { return New(); } + +void BN_CTX_free(BN_CTX *ctx) { + if (ctx != nullptr) { + Delete(FromOpaque(ctx)); + } +} + +void BN_CTX_start(BN_CTX *ctx) { + auto *impl = FromOpaque(ctx); + + if (impl->error_) { + // Once an operation has failed, |impl->stack| no longer matches the number + // of |BN_CTX_end| calls to come. Do nothing. + return; + } + + if (!impl->stack_.Push(impl->used_)) { + impl->error_ = true; + // |BN_CTX_start| cannot fail, so defer the error to |BN_CTX_get|. + impl->defer_error_ = true; + ERR_clear_error(); + } +} + +BIGNUM *BN_CTX_get(BN_CTX *ctx) { + auto *impl = FromOpaque(ctx); + + // Once any operation has failed, they all do. + if (impl->error_) { + if (impl->defer_error_) { + OPENSSL_PUT_ERROR(BN, BN_R_TOO_MANY_TEMPORARY_VARIABLES); + impl->defer_error_ = false; + } + return nullptr; + } + + if (impl->used_ == impl->bignums_.size()) { + UniquePtr bn(BN_new()); + if (bn == nullptr || !impl->bignums_.Push(std::move(bn))) { + OPENSSL_PUT_ERROR(BN, BN_R_TOO_MANY_TEMPORARY_VARIABLES); + impl->error_ = true; + return nullptr; + } + } + + BIGNUM *ret = impl->bignums_[impl->used_].get(); + BN_zero(ret); + // This is bounded by |impl->bignums_.size()|, so it cannot overflow. 
+ impl->used_++; + return ret; +} + +void BN_CTX_end(BN_CTX *ctx) { + auto *impl = FromOpaque(ctx); + + if (impl->error_) { + // Once an operation has failed, |impl->stack_| no longer matches the number + // of |BN_CTX_end| calls to come. Do nothing. + return; + } + + assert(!impl->stack_.empty()); + impl->used_ = impl->stack_.back(); + impl->stack_.pop_back(); +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/div.c b/third_party/boringssl/src/crypto/fipsmodule/bn/div.c deleted file mode 100644 index 02b9931c..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/bn/div.c +++ /dev/null @@ -1,902 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
*/ - -#include - -#include -#include - -#include - -#include "internal.h" - - -// bn_div_words divides a double-width |h|,|l| by |d| and returns the result, -// which must fit in a |BN_ULONG|. -OPENSSL_UNUSED static BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, - BN_ULONG d) { - BN_ULONG dh, dl, q, ret = 0, th, tl, t; - int i, count = 2; - - if (d == 0) { - return BN_MASK2; - } - - i = BN_num_bits_word(d); - assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i)); - - i = BN_BITS2 - i; - if (h >= d) { - h -= d; - } - - if (i) { - d <<= i; - h = (h << i) | (l >> (BN_BITS2 - i)); - l <<= i; - } - dh = (d & BN_MASK2h) >> BN_BITS4; - dl = (d & BN_MASK2l); - for (;;) { - if ((h >> BN_BITS4) == dh) { - q = BN_MASK2l; - } else { - q = h / dh; - } - - th = q * dh; - tl = dl * q; - for (;;) { - t = h - th; - if ((t & BN_MASK2h) || - ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4)))) { - break; - } - q--; - th -= dh; - tl -= dl; - } - t = (tl >> BN_BITS4); - tl = (tl << BN_BITS4) & BN_MASK2h; - th += t; - - if (l < tl) { - th++; - } - l -= tl; - if (h < th) { - h += d; - q--; - } - h -= th; - - if (--count == 0) { - break; - } - - ret = q << BN_BITS4; - h = (h << BN_BITS4) | (l >> BN_BITS4); - l = (l & BN_MASK2l) << BN_BITS4; - } - - ret |= q; - return ret; -} - -static inline void bn_div_rem_words(BN_ULONG *quotient_out, BN_ULONG *rem_out, - BN_ULONG n0, BN_ULONG n1, BN_ULONG d0) { - // GCC and Clang generate function calls to |__udivdi3| and |__umoddi3| when - // the |BN_ULLONG|-based C code is used. 
- // - // GCC bugs: - // * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=14224 - // * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=43721 - // * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=54183 - // * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58897 - // * https://gcc.gnu.org/bugzilla/show_bug.cgi?id=65668 - // - // Clang bugs: - // * https://llvm.org/bugs/show_bug.cgi?id=6397 - // * https://llvm.org/bugs/show_bug.cgi?id=12418 - // - // These issues aren't specific to x86 and x86_64, so it might be worthwhile - // to add more assembly language implementations. -#if defined(BN_CAN_USE_INLINE_ASM) && defined(OPENSSL_X86) - __asm__ volatile("divl %4" - : "=a"(*quotient_out), "=d"(*rem_out) - : "a"(n1), "d"(n0), "rm"(d0) - : "cc"); -#elif defined(BN_CAN_USE_INLINE_ASM) && defined(OPENSSL_X86_64) - __asm__ volatile("divq %4" - : "=a"(*quotient_out), "=d"(*rem_out) - : "a"(n1), "d"(n0), "rm"(d0) - : "cc"); -#else -#if defined(BN_CAN_DIVIDE_ULLONG) - BN_ULLONG n = (((BN_ULLONG)n0) << BN_BITS2) | n1; - *quotient_out = (BN_ULONG)(n / d0); -#else - *quotient_out = bn_div_words(n0, n1, d0); -#endif - *rem_out = n1 - (*quotient_out * d0); -#endif -} - -// BN_div computes "quotient := numerator / divisor", rounding towards zero, -// and sets up |rem| such that "quotient * divisor + rem = numerator" holds. -// -// Thus: -// -// quotient->neg == numerator->neg ^ divisor->neg -// (unless the result is zero) -// rem->neg == numerator->neg -// (unless the remainder is zero) -// -// If |quotient| or |rem| is NULL, the respective value is not returned. -// -// This was specifically designed to contain fewer branches that may leak -// sensitive information; see "New Branch Prediction Vulnerabilities in OpenSSL -// and Necessary Software Countermeasures" by Onur Acıçmez, Shay Gueron, and -// Jean-Pierre Seifert. 
-int BN_div(BIGNUM *quotient, BIGNUM *rem, const BIGNUM *numerator, - const BIGNUM *divisor, BN_CTX *ctx) { - int norm_shift, loop; - BIGNUM wnum; - BN_ULONG *resp, *wnump; - BN_ULONG d0, d1; - int num_n, div_n; - - // This function relies on the historical minimal-width |BIGNUM| invariant. - // It is already not constant-time (constant-time reductions should use - // Montgomery logic), so we shrink all inputs and intermediate values to - // retain the previous behavior. - - // Invalid zero-padding would have particularly bad consequences. - int numerator_width = bn_minimal_width(numerator); - int divisor_width = bn_minimal_width(divisor); - if ((numerator_width > 0 && numerator->d[numerator_width - 1] == 0) || - (divisor_width > 0 && divisor->d[divisor_width - 1] == 0)) { - OPENSSL_PUT_ERROR(BN, BN_R_NOT_INITIALIZED); - return 0; - } - - if (BN_is_zero(divisor)) { - OPENSSL_PUT_ERROR(BN, BN_R_DIV_BY_ZERO); - return 0; - } - - BN_CTX_start(ctx); - BIGNUM *tmp = BN_CTX_get(ctx); - BIGNUM *snum = BN_CTX_get(ctx); - BIGNUM *sdiv = BN_CTX_get(ctx); - BIGNUM *res = NULL; - if (quotient == NULL) { - res = BN_CTX_get(ctx); - } else { - res = quotient; - } - if (sdiv == NULL || res == NULL) { - goto err; - } - - // First we normalise the numbers - norm_shift = BN_BITS2 - (BN_num_bits(divisor) % BN_BITS2); - if (!BN_lshift(sdiv, divisor, norm_shift)) { - goto err; - } - bn_set_minimal_width(sdiv); - sdiv->neg = 0; - norm_shift += BN_BITS2; - if (!BN_lshift(snum, numerator, norm_shift)) { - goto err; - } - bn_set_minimal_width(snum); - snum->neg = 0; - - // Since we don't want to have special-case logic for the case where snum is - // larger than sdiv, we pad snum with enough zeroes without changing its - // value. 
- if (snum->width <= sdiv->width + 1) { - if (!bn_wexpand(snum, sdiv->width + 2)) { - goto err; - } - for (int i = snum->width; i < sdiv->width + 2; i++) { - snum->d[i] = 0; - } - snum->width = sdiv->width + 2; - } else { - if (!bn_wexpand(snum, snum->width + 1)) { - goto err; - } - snum->d[snum->width] = 0; - snum->width++; - } - - div_n = sdiv->width; - num_n = snum->width; - loop = num_n - div_n; - // Lets setup a 'window' into snum - // This is the part that corresponds to the current - // 'area' being divided - wnum.neg = 0; - wnum.d = &(snum->d[loop]); - wnum.width = div_n; - // only needed when BN_ucmp messes up the values between width and max - wnum.dmax = snum->dmax - loop; // so we don't step out of bounds - - // Get the top 2 words of sdiv - // div_n=sdiv->width; - d0 = sdiv->d[div_n - 1]; - d1 = (div_n == 1) ? 0 : sdiv->d[div_n - 2]; - - // pointer to the 'top' of snum - wnump = &(snum->d[num_n - 1]); - - // Setup |res|. |numerator| and |res| may alias, so we save |numerator->neg| - // for later. 
- const int numerator_neg = numerator->neg; - res->neg = (numerator_neg ^ divisor->neg); - if (!bn_wexpand(res, loop + 1)) { - goto err; - } - res->width = loop - 1; - resp = &(res->d[loop - 1]); - - // space for temp - if (!bn_wexpand(tmp, div_n + 1)) { - goto err; - } - - // if res->width == 0 then clear the neg value otherwise decrease - // the resp pointer - if (res->width == 0) { - res->neg = 0; - } else { - resp--; - } - - for (int i = 0; i < loop - 1; i++, wnump--, resp--) { - BN_ULONG q, l0; - // the first part of the loop uses the top two words of snum and sdiv to - // calculate a BN_ULONG q such that | wnum - sdiv * q | < sdiv - BN_ULONG n0, n1, rm = 0; - - n0 = wnump[0]; - n1 = wnump[-1]; - if (n0 == d0) { - q = BN_MASK2; - } else { - // n0 < d0 - bn_div_rem_words(&q, &rm, n0, n1, d0); - -#ifdef BN_ULLONG - BN_ULLONG t2 = (BN_ULLONG)d1 * q; - for (;;) { - if (t2 <= ((((BN_ULLONG)rm) << BN_BITS2) | wnump[-2])) { - break; - } - q--; - rm += d0; - if (rm < d0) { - break; // don't let rm overflow - } - t2 -= d1; - } -#else // !BN_ULLONG - BN_ULONG t2l, t2h; - BN_UMULT_LOHI(t2l, t2h, d1, q); - for (;;) { - if (t2h < rm || - (t2h == rm && t2l <= wnump[-2])) { - break; - } - q--; - rm += d0; - if (rm < d0) { - break; // don't let rm overflow - } - if (t2l < d1) { - t2h--; - } - t2l -= d1; - } -#endif // !BN_ULLONG - } - - l0 = bn_mul_words(tmp->d, sdiv->d, div_n, q); - tmp->d[div_n] = l0; - wnum.d--; - // ingore top values of the bignums just sub the two - // BN_ULONG arrays with bn_sub_words - if (bn_sub_words(wnum.d, wnum.d, tmp->d, div_n + 1)) { - // Note: As we have considered only the leading - // two BN_ULONGs in the calculation of q, sdiv * q - // might be greater than wnum (but then (q-1) * sdiv - // is less or equal than wnum) - q--; - if (bn_add_words(wnum.d, wnum.d, sdiv->d, div_n)) { - // we can't have an overflow here (assuming - // that q != 0, but if q == 0 then tmp is - // zero anyway) - (*wnump)++; - } - } - // store part of the result - *resp 
= q; - } - - bn_set_minimal_width(snum); - - if (rem != NULL) { - if (!BN_rshift(rem, snum, norm_shift)) { - goto err; - } - if (!BN_is_zero(rem)) { - rem->neg = numerator_neg; - } - } - - bn_set_minimal_width(res); - BN_CTX_end(ctx); - return 1; - -err: - BN_CTX_end(ctx); - return 0; -} - -int BN_nnmod(BIGNUM *r, const BIGNUM *m, const BIGNUM *d, BN_CTX *ctx) { - if (!(BN_mod(r, m, d, ctx))) { - return 0; - } - if (!r->neg) { - return 1; - } - - // now -|d| < r < 0, so we have to set r := r + |d|. - return (d->neg ? BN_sub : BN_add)(r, r, d); -} - -BN_ULONG bn_reduce_once(BN_ULONG *r, const BN_ULONG *a, BN_ULONG carry, - const BN_ULONG *m, size_t num) { - assert(r != a); - // |r| = |a| - |m|. |bn_sub_words| performs the bulk of the subtraction, and - // then we apply the borrow to |carry|. - carry -= bn_sub_words(r, a, m, num); - // We know 0 <= |a| < 2*|m|, so -|m| <= |r| < |m|. - // - // If 0 <= |r| < |m|, |r| fits in |num| words and |carry| is zero. We then - // wish to select |r| as the answer. Otherwise -m <= r < 0 and we wish to - // return |r| + |m|, or |a|. |carry| must then be -1 or all ones. In both - // cases, |carry| is a suitable input to |bn_select_words|. - // - // Although |carry| may be one if it was one on input and |bn_sub_words| - // returns zero, this would give |r| > |m|, violating our input assumptions. - assert(carry == 0 || carry == (BN_ULONG)-1); - bn_select_words(r, carry, a /* r < 0 */, r /* r >= 0 */, num); - return carry; -} - -BN_ULONG bn_reduce_once_in_place(BN_ULONG *r, BN_ULONG carry, const BN_ULONG *m, - BN_ULONG *tmp, size_t num) { - // See |bn_reduce_once| for why this logic works. 
- carry -= bn_sub_words(tmp, r, m, num); - assert(carry == 0 || carry == (BN_ULONG)-1); - bn_select_words(r, carry, r /* tmp < 0 */, tmp /* tmp >= 0 */, num); - return carry; -} - -void bn_mod_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, - const BN_ULONG *m, BN_ULONG *tmp, size_t num) { - // r = a - b - BN_ULONG borrow = bn_sub_words(r, a, b, num); - // tmp = a - b + m - bn_add_words(tmp, r, m, num); - bn_select_words(r, 0 - borrow, tmp /* r < 0 */, r /* r >= 0 */, num); -} - -void bn_mod_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, - const BN_ULONG *m, BN_ULONG *tmp, size_t num) { - BN_ULONG carry = bn_add_words(r, a, b, num); - bn_reduce_once_in_place(r, carry, m, tmp, num); -} - -int bn_div_consttime(BIGNUM *quotient, BIGNUM *remainder, - const BIGNUM *numerator, const BIGNUM *divisor, - unsigned divisor_min_bits, BN_CTX *ctx) { - if (BN_is_negative(numerator) || BN_is_negative(divisor)) { - OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER); - return 0; - } - if (BN_is_zero(divisor)) { - OPENSSL_PUT_ERROR(BN, BN_R_DIV_BY_ZERO); - return 0; - } - - // This function implements long division in binary. It is not very efficient, - // but it is simple, easy to make constant-time, and performant enough for RSA - // key generation. 
- - int ret = 0; - BN_CTX_start(ctx); - BIGNUM *q = quotient, *r = remainder; - if (quotient == NULL || quotient == numerator || quotient == divisor) { - q = BN_CTX_get(ctx); - } - if (remainder == NULL || remainder == numerator || remainder == divisor) { - r = BN_CTX_get(ctx); - } - BIGNUM *tmp = BN_CTX_get(ctx); - if (q == NULL || r == NULL || tmp == NULL || - !bn_wexpand(q, numerator->width) || - !bn_wexpand(r, divisor->width) || - !bn_wexpand(tmp, divisor->width)) { - goto err; - } - - OPENSSL_memset(q->d, 0, numerator->width * sizeof(BN_ULONG)); - q->width = numerator->width; - q->neg = 0; - - OPENSSL_memset(r->d, 0, divisor->width * sizeof(BN_ULONG)); - r->width = divisor->width; - r->neg = 0; - - // Incorporate |numerator| into |r|, one bit at a time, reducing after each - // step. We maintain the invariant that |0 <= r < divisor| and - // |q * divisor + r = n| where |n| is the portion of |numerator| incorporated - // so far. - // - // First, we short-circuit the loop: if we know |divisor| has at least - // |divisor_min_bits| bits, the top |divisor_min_bits - 1| can be incorporated - // without reductions. This significantly speeds up |RSA_check_key|. For - // simplicity, we round down to a whole number of words. - assert(divisor_min_bits <= BN_num_bits(divisor)); - int initial_words = 0; - if (divisor_min_bits > 0) { - initial_words = (divisor_min_bits - 1) / BN_BITS2; - if (initial_words > numerator->width) { - initial_words = numerator->width; - } - OPENSSL_memcpy(r->d, numerator->d + numerator->width - initial_words, - initial_words * sizeof(BN_ULONG)); - } - - for (int i = numerator->width - initial_words - 1; i >= 0; i--) { - for (int bit = BN_BITS2 - 1; bit >= 0; bit--) { - // Incorporate the next bit of the numerator, by computing - // r = 2*r or 2*r + 1. Note the result fits in one more word. We store the - // extra word in |carry|. 
- BN_ULONG carry = bn_add_words(r->d, r->d, r->d, divisor->width); - r->d[0] |= (numerator->d[i] >> bit) & 1; - // |r| was previously fully-reduced, so we know: - // 2*0 <= r <= 2*(divisor-1) + 1 - // 0 <= r <= 2*divisor - 1 < 2*divisor. - // Thus |r| satisfies the preconditions for |bn_reduce_once_in_place|. - BN_ULONG subtracted = bn_reduce_once_in_place(r->d, carry, divisor->d, - tmp->d, divisor->width); - // The corresponding bit of the quotient is set iff we needed to subtract. - q->d[i] |= (~subtracted & 1) << bit; - } - } - - if ((quotient != NULL && !BN_copy(quotient, q)) || - (remainder != NULL && !BN_copy(remainder, r))) { - goto err; - } - - ret = 1; - -err: - BN_CTX_end(ctx); - return ret; -} - -static BIGNUM *bn_scratch_space_from_ctx(size_t width, BN_CTX *ctx) { - BIGNUM *ret = BN_CTX_get(ctx); - if (ret == NULL || - !bn_wexpand(ret, width)) { - return NULL; - } - ret->neg = 0; - ret->width = width; - return ret; -} - -// bn_resized_from_ctx returns |bn| with width at least |width| or NULL on -// error. This is so it may be used with low-level "words" functions. If -// necessary, it allocates a new |BIGNUM| with a lifetime of the current scope -// in |ctx|, so the caller does not need to explicitly free it. |bn| must fit in -// |width| words. -static const BIGNUM *bn_resized_from_ctx(const BIGNUM *bn, size_t width, - BN_CTX *ctx) { - if ((size_t)bn->width >= width) { - // Any excess words must be zero. 
- assert(bn_fits_in_words(bn, width)); - return bn; - } - BIGNUM *ret = bn_scratch_space_from_ctx(width, ctx); - if (ret == NULL || - !BN_copy(ret, bn) || - !bn_resize_words(ret, width)) { - return NULL; - } - return ret; -} - -int BN_mod_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m, - BN_CTX *ctx) { - if (!BN_add(r, a, b)) { - return 0; - } - return BN_nnmod(r, r, m, ctx); -} - -int BN_mod_add_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, - const BIGNUM *m) { - BN_CTX *ctx = BN_CTX_new(); - int ok = ctx != NULL && - bn_mod_add_consttime(r, a, b, m, ctx); - BN_CTX_free(ctx); - return ok; -} - -int bn_mod_add_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, - const BIGNUM *m, BN_CTX *ctx) { - BN_CTX_start(ctx); - a = bn_resized_from_ctx(a, m->width, ctx); - b = bn_resized_from_ctx(b, m->width, ctx); - BIGNUM *tmp = bn_scratch_space_from_ctx(m->width, ctx); - int ok = a != NULL && b != NULL && tmp != NULL && - bn_wexpand(r, m->width); - if (ok) { - bn_mod_add_words(r->d, a->d, b->d, m->d, tmp->d, m->width); - r->width = m->width; - r->neg = 0; - } - BN_CTX_end(ctx); - return ok; -} - -int BN_mod_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m, - BN_CTX *ctx) { - if (!BN_sub(r, a, b)) { - return 0; - } - return BN_nnmod(r, r, m, ctx); -} - -int bn_mod_sub_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, - const BIGNUM *m, BN_CTX *ctx) { - BN_CTX_start(ctx); - a = bn_resized_from_ctx(a, m->width, ctx); - b = bn_resized_from_ctx(b, m->width, ctx); - BIGNUM *tmp = bn_scratch_space_from_ctx(m->width, ctx); - int ok = a != NULL && b != NULL && tmp != NULL && - bn_wexpand(r, m->width); - if (ok) { - bn_mod_sub_words(r->d, a->d, b->d, m->d, tmp->d, m->width); - r->width = m->width; - r->neg = 0; - } - BN_CTX_end(ctx); - return ok; -} - -int BN_mod_sub_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, - const BIGNUM *m) { - BN_CTX *ctx = BN_CTX_new(); - int ok = ctx != NULL && - bn_mod_sub_consttime(r, a, b, m, ctx); - 
BN_CTX_free(ctx); - return ok; -} - -int BN_mod_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m, - BN_CTX *ctx) { - BIGNUM *t; - int ret = 0; - - BN_CTX_start(ctx); - t = BN_CTX_get(ctx); - if (t == NULL) { - goto err; - } - - if (a == b) { - if (!BN_sqr(t, a, ctx)) { - goto err; - } - } else { - if (!BN_mul(t, a, b, ctx)) { - goto err; - } - } - - if (!BN_nnmod(r, t, m, ctx)) { - goto err; - } - - ret = 1; - -err: - BN_CTX_end(ctx); - return ret; -} - -int BN_mod_sqr(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, BN_CTX *ctx) { - if (!BN_sqr(r, a, ctx)) { - return 0; - } - - // r->neg == 0, thus we don't need BN_nnmod - return BN_mod(r, r, m, ctx); -} - -int BN_mod_lshift(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m, - BN_CTX *ctx) { - BIGNUM *abs_m = NULL; - int ret; - - if (!BN_nnmod(r, a, m, ctx)) { - return 0; - } - - if (m->neg) { - abs_m = BN_dup(m); - if (abs_m == NULL) { - return 0; - } - abs_m->neg = 0; - } - - ret = bn_mod_lshift_consttime(r, r, n, (abs_m ? 
abs_m : m), ctx); - - BN_free(abs_m); - return ret; -} - -int bn_mod_lshift_consttime(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m, - BN_CTX *ctx) { - if (!BN_copy(r, a)) { - return 0; - } - for (int i = 0; i < n; i++) { - if (!bn_mod_lshift1_consttime(r, r, m, ctx)) { - return 0; - } - } - return 1; -} - -int BN_mod_lshift_quick(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m) { - BN_CTX *ctx = BN_CTX_new(); - int ok = ctx != NULL && - bn_mod_lshift_consttime(r, a, n, m, ctx); - BN_CTX_free(ctx); - return ok; -} - -int BN_mod_lshift1(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, BN_CTX *ctx) { - if (!BN_lshift1(r, a)) { - return 0; - } - - return BN_nnmod(r, r, m, ctx); -} - -int bn_mod_lshift1_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, - BN_CTX *ctx) { - return bn_mod_add_consttime(r, a, a, m, ctx); -} - -int BN_mod_lshift1_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *m) { - BN_CTX *ctx = BN_CTX_new(); - int ok = ctx != NULL && - bn_mod_lshift1_consttime(r, a, m, ctx); - BN_CTX_free(ctx); - return ok; -} - -BN_ULONG BN_div_word(BIGNUM *a, BN_ULONG w) { - BN_ULONG ret = 0; - int i, j; - - if (!w) { - // actually this an error (division by zero) - return (BN_ULONG) - 1; - } - - if (a->width == 0) { - return 0; - } - - // normalize input for |bn_div_rem_words|. 
- j = BN_BITS2 - BN_num_bits_word(w); - w <<= j; - if (!BN_lshift(a, a, j)) { - return (BN_ULONG) - 1; - } - - for (i = a->width - 1; i >= 0; i--) { - BN_ULONG l = a->d[i]; - BN_ULONG d; - BN_ULONG unused_rem; - bn_div_rem_words(&d, &unused_rem, ret, l, w); - ret = l - (d * w); - a->d[i] = d; - } - - bn_set_minimal_width(a); - ret >>= j; - return ret; -} - -BN_ULONG BN_mod_word(const BIGNUM *a, BN_ULONG w) { -#ifndef BN_CAN_DIVIDE_ULLONG - BN_ULONG ret = 0; -#else - BN_ULLONG ret = 0; -#endif - int i; - - if (w == 0) { - return (BN_ULONG) -1; - } - -#ifndef BN_CAN_DIVIDE_ULLONG - // If |w| is too long and we don't have |BN_ULLONG| division then we need to - // fall back to using |BN_div_word|. - if (w > ((BN_ULONG)1 << BN_BITS4)) { - BIGNUM *tmp = BN_dup(a); - if (tmp == NULL) { - return (BN_ULONG)-1; - } - ret = BN_div_word(tmp, w); - BN_free(tmp); - return ret; - } -#endif - - for (i = a->width - 1; i >= 0; i--) { -#ifndef BN_CAN_DIVIDE_ULLONG - ret = ((ret << BN_BITS4) | ((a->d[i] >> BN_BITS4) & BN_MASK2l)) % w; - ret = ((ret << BN_BITS4) | (a->d[i] & BN_MASK2l)) % w; -#else - ret = (BN_ULLONG)(((ret << (BN_ULLONG)BN_BITS2) | a->d[i]) % (BN_ULLONG)w); -#endif - } - return (BN_ULONG)ret; -} - -int BN_mod_pow2(BIGNUM *r, const BIGNUM *a, size_t e) { - if (e == 0 || a->width == 0) { - BN_zero(r); - return 1; - } - - size_t num_words = 1 + ((e - 1) / BN_BITS2); - - // If |a| definitely has less than |e| bits, just BN_copy. - if ((size_t) a->width < num_words) { - return BN_copy(r, a) != NULL; - } - - // Otherwise, first make sure we have enough space in |r|. - // Note that this will fail if num_words > INT_MAX. - if (!bn_wexpand(r, num_words)) { - return 0; - } - - // Copy the content of |a| into |r|. - OPENSSL_memcpy(r->d, a->d, num_words * sizeof(BN_ULONG)); - - // If |e| isn't word-aligned, we have to mask off some of our bits. 
- size_t top_word_exponent = e % (sizeof(BN_ULONG) * 8); - if (top_word_exponent != 0) { - r->d[num_words - 1] &= (((BN_ULONG) 1) << top_word_exponent) - 1; - } - - // Fill in the remaining fields of |r|. - r->neg = a->neg; - r->width = (int) num_words; - bn_set_minimal_width(r); - return 1; -} - -int BN_nnmod_pow2(BIGNUM *r, const BIGNUM *a, size_t e) { - if (!BN_mod_pow2(r, a, e)) { - return 0; - } - - // If the returned value was non-negative, we're done. - if (BN_is_zero(r) || !r->neg) { - return 1; - } - - size_t num_words = 1 + (e - 1) / BN_BITS2; - - // Expand |r| to the size of our modulus. - if (!bn_wexpand(r, num_words)) { - return 0; - } - - // Clear the upper words of |r|. - OPENSSL_memset(&r->d[r->width], 0, (num_words - r->width) * BN_BYTES); - - // Set parameters of |r|. - r->neg = 0; - r->width = (int) num_words; - - // Now, invert every word. The idea here is that we want to compute 2^e-|x|, - // which is actually equivalent to the twos-complement representation of |x| - // in |e| bits, which is -x = ~x + 1. - for (int i = 0; i < r->width; i++) { - r->d[i] = ~r->d[i]; - } - - // If our exponent doesn't span the top word, we have to mask the rest. - size_t top_word_exponent = e % BN_BITS2; - if (top_word_exponent != 0) { - r->d[r->width - 1] &= (((BN_ULONG) 1) << top_word_exponent) - 1; - } - - // Keep the minimal-width invariant for |BIGNUM|. - bn_set_minimal_width(r); - - // Finally, add one, for the reason described above. - return BN_add(r, r, BN_value_one()); -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/div.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/bn/div.cc.inc new file mode 100644 index 00000000..af522dfc --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/bn/div.cc.inc @@ -0,0 +1,738 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + +#include + +#include "internal.h" + +#if (defined(OPENSSL_X86) || defined(OPENSSL_X86_64)) && defined(_MSC_VER) && \ + !defined(__clang__) +#define HAVE_MSVC_DIV_INTRINSICS +#include +#if defined(OPENSSL_X86) +#pragma intrinsic(_udiv64) +#else +#pragma intrinsic(_udiv128) +#endif +#endif + + +using namespace bssl; + +// bn_div_words divides a double-width |h|,|l| by |d| and returns the result, +// which must fit in a |BN_ULONG|, i.e. |h < d|. +[[maybe_unused]] +static BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) { + assert(h < d); + BN_ULONG dh, dl, q, ret = 0, th, tl, t; + int i, count = 2; + + if (d == 0) { + return BN_MASK2; + } + + i = BN_num_bits_word(d); + assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i)); + + i = BN_BITS2 - i; + if (h >= d) { + h -= d; + } + + if (i) { + d <<= i; + h = (h << i) | (l >> (BN_BITS2 - i)); + l <<= i; + } + dh = (d & BN_MASK2h) >> BN_BITS4; + dl = (d & BN_MASK2l); + for (;;) { + if ((h >> BN_BITS4) == dh) { + q = BN_MASK2l; + } else { + q = h / dh; + } + + th = q * dh; + tl = dl * q; + for (;;) { + t = h - th; + if ((t & BN_MASK2h) || + ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4)))) { + break; + } + q--; + th -= dh; + tl -= dl; + } + t = (tl >> BN_BITS4); + tl = (tl << BN_BITS4) & BN_MASK2h; + th += t; + + if (l < tl) { + th++; + } + l -= tl; + if (h < th) { + h += d; + q--; + } + h -= th; + + if (--count == 0) { + break; + } + + ret = q << BN_BITS4; + h = (h << BN_BITS4) | (l >> BN_BITS4); + l = (l & BN_MASK2l) << BN_BITS4; + } + + ret |= q; + 
return ret; +} + +// bn_div_rem_words divides a double-width numerator (high half |nh| and low +// half |nl|) with a single-width divisor. It sets |*quotient_out| and +// |*rem_out| to be the quotient and numerator, respectively. The quotient must +// fit in a |BN_ULONG|, i.e. |nh < d|. +static void bn_div_rem_words(BN_ULONG *quotient_out, BN_ULONG *rem_out, + BN_ULONG nh, BN_ULONG nl, BN_ULONG d) { + assert(nh < d); + // This operation is the x86 and x86_64 DIV instruction, but it is difficult + // for the compiler to emit it. Dividing a |BN_ULLONG| by a |BN_ULONG| does + // not work because, a priori, the quotient may not fit in |BN_ULONG| and DIV + // will trap on overflow, not truncate. The compiler will instead emit a call + // to a more expensive support function (e.g. |__udivdi3|). Thus we use inline + // assembly or intrinsics to get the instruction. + // + // These is specific to x86 and x86_64; Arm and RISC-V do not have double-wide + // division instructions. +#if defined(BN_CAN_USE_INLINE_ASM) && defined(OPENSSL_X86) + __asm__ volatile("divl %4" + : "=a"(*quotient_out), "=d"(*rem_out) + : "a"(nl), "d"(nh), "rm"(d) + : "cc"); +#elif defined(BN_CAN_USE_INLINE_ASM) && defined(OPENSSL_X86_64) + __asm__ volatile("divq %4" + : "=a"(*quotient_out), "=d"(*rem_out) + : "a"(nl), "d"(nh), "rm"(d) + : "cc"); +#elif defined(HAVE_MSVC_DIV_INTRINSICS) && defined(OPENSSL_X86) + BN_ULLONG n = (((BN_ULLONG)nh) << BN_BITS2) | nl; + unsigned rem; + *quotient_out = _udiv64(n, d, &rem); + *rem_out = rem; +#elif defined(HAVE_MSVC_DIV_INTRINSICS) && defined(OPENSSL_X86_64) + unsigned __int64 rem; + *quotient_out = _udiv128(nh, nl, d, &rem); + *rem_out = rem; +#else +#if defined(BN_CAN_DIVIDE_ULLONG) + BN_ULLONG n = (((BN_ULLONG)nh) << BN_BITS2) | nl; + *quotient_out = (BN_ULONG)(n / d); +#else + *quotient_out = bn_div_words(nh, nl, d); +#endif // BN_CAN_DIVIDE_ULLONG + *rem_out = nl - (*quotient_out * d); +#endif +} + +int BN_div(BIGNUM *quotient, BIGNUM *rem, const BIGNUM 
*numerator, + const BIGNUM *divisor, BN_CTX *ctx) { + // This function implements long division, per Knuth, The Art of Computer + // Programming, Volume 2, Chapter 4.3.1, Algorithm D. This algorithm only + // divides non-negative integers, but we round towards zero, so we divide + // absolute values and adjust the signs separately. + // + // Inputs to this function are assumed public and may be leaked by timing and + // cache side channels. Division with secret inputs should use other + // implementation strategies such as Montgomery reduction. + if (BN_is_zero(divisor)) { + OPENSSL_PUT_ERROR(BN, BN_R_DIV_BY_ZERO); + return 0; + } + + BN_CTXScope scope(ctx); + BIGNUM *tmp = BN_CTX_get(ctx); + BIGNUM *snum = BN_CTX_get(ctx); + BIGNUM *sdiv = BN_CTX_get(ctx); + BIGNUM *res = quotient == nullptr ? BN_CTX_get(ctx) : quotient; + int norm_shift, num_n, loop, div_n; + BN_ULONG d0, d1; + if (tmp == nullptr || snum == nullptr || sdiv == nullptr || res == nullptr) { + return 0; + } + + // Knuth step D1: Normalise the numbers such that the divisor's MSB is set. + // This ensures, in Knuth's terminology, that v1 >= b/2, needed for the + // quotient estimation step. + norm_shift = BN_BITS2 - (BN_num_bits(divisor) % BN_BITS2); + if (!BN_lshift(sdiv, divisor, norm_shift) || + !BN_lshift(snum, numerator, norm_shift)) { + return 0; + } + + // This algorithm relies on |sdiv| being minimal width. We do not use this + // function on secret inputs, so leaking this is fine. Also minimize |snum| to + // avoid looping on leading zeros, as we're not trying to be leak-free. + bn_set_minimal_width(sdiv); + bn_set_minimal_width(snum); + div_n = sdiv->width; + d0 = sdiv->d[div_n - 1]; + d1 = (div_n == 1) ? 0 : sdiv->d[div_n - 2]; + assert(d0 & (((BN_ULONG)1) << (BN_BITS2 - 1))); + + // Extend |snum| with zeros to satisfy the long division invariants: + // - |snum| must have at least |div_n| + 1 words. 
+ // - |snum|'s most significant word must be zero to guarantee the first loop + // iteration works with a prefix greater than |sdiv|. (This is the extra u0 + // digit in Knuth step D1.) + num_n = snum->width <= div_n ? div_n + 1 : snum->width + 1; + if (!bn_resize_words(snum, num_n)) { + return 0; + } + + // Knuth step D2: The quotient's width is the difference between numerator and + // denominator. Also set up its sign and size a temporary for the loop. + loop = num_n - div_n; + res->neg = snum->neg ^ sdiv->neg; + if (!bn_wexpand(res, loop) || // + !bn_wexpand(tmp, div_n + 1)) { + return 0; + } + res->width = loop; + + // Knuth steps D2 through D7: Compute the quotient with a word-by-word long + // division. Note that Knuth indexes words from most to least significant, so + // our index is reversed. Each loop iteration computes res->d[i] of the + // quotient and updates snum with the running remainder. Before each loop + // iteration, the div_n words beginning at snum->d[i+1] must be less than + // snum. + for (int i = loop - 1; i >= 0; i--) { + // The next word of the quotient, q, is floor(wnum / sdiv), where wnum is + // the div_n + 1 words beginning at snum->d[i]. i starts at + // num_n - div_n - 1, so there are at least div_n + 1 words available. + // + // Knuth step D3: Compute q', an estimate of q by looking at the top words + // of wnum and sdiv. We must estimate such that q' = q or q' = q + 1. + BN_ULONG q, rm = 0; + BN_ULONG *wnum = snum->d + i; + BN_ULONG n0 = wnum[div_n]; + BN_ULONG n1 = wnum[div_n - 1]; + if (n0 == d0) { + // Estimate q' = b - 1, where b is the base. + q = BN_MASK2; + // Knuth also runs the fixup routine in this case, but this would require + // computing rm and is unnecessary. q' is already close enough. That is, + // the true quotient, q is either b - 1 or b - 2. + // + // By the loop invariant, q <= b - 1, so we must show that q >= b - 2. We + // do this by showing wnum / sdiv >= b - 2. Suppose wnum / sdiv < b - 2. 
+ // wnum and sdiv have the same most significant word, so: + // + // wnum >= n0 * b^div_n + // sdiv < (n0 + 1) * b^(d_div - 1) + // + // Thus: + // + // b - 2 > wnum / sdiv + // > (n0 * b^div_n) / (n0 + 1) * b^(div_n - 1) + // = (n0 * b) / (n0 + 1) + // + // (n0 + 1) * (b - 2) > n0 * b + // n0 * b + b - 2 * n0 - 2 > n0 * b + // b - 2 > 2 * n0 + // b/2 - 1 > n0 + // + // This contradicts the normalization condition, so q >= b - 2 and our + // estimate is close enough. + } else { + // Estimate q' = floor(n0n1 / d0). Per Theorem B, q' - 2 <= q <= q', which + // is slightly outside of our bounds. + assert(n0 < d0); + bn_div_rem_words(&q, &rm, n0, n1, d0); + + // Fix the estimate by examining one more word and adjusting q' as needed. + // This is the second half of step D3 and is sufficient per exercises 19, + // 20, and 21. Although only one iteration is needed to correct q + 2 to + // q + 1, Knuth uses a loop. A loop will often also correct q + 1 to q, + // saving the slightly more expensive underflow handling below. + if (div_n > 1) { + BN_ULONG n2 = wnum[div_n - 2]; +#ifdef BN_ULLONG + BN_ULLONG t2 = (BN_ULLONG)d1 * q; + for (;;) { + if (t2 <= ((((BN_ULLONG)rm) << BN_BITS2) | n2)) { + break; + } + q--; + rm += d0; + if (rm < d0) { + // If rm overflows, the true value exceeds BN_ULONG and the next + // t2 comparison should exit the loop. + break; + } + t2 -= d1; + } +#else // !BN_ULLONG + BN_ULONG t2l, t2h; + BN_UMULT_LOHI(t2l, t2h, d1, q); + for (;;) { + if (t2h < rm || (t2h == rm && t2l <= n2)) { + break; + } + q--; + rm += d0; + if (rm < d0) { + // If rm overflows, the true value exceeds BN_ULONG and the next + // t2 comparison should exit the loop. + break; + } + if (t2l < d1) { + t2h--; + } + t2l -= d1; + } +#endif // !BN_ULLONG + } + } + + // Knuth step D4 through D6: Now q' = q or q' = q + 1, and + // -sdiv < wnum - sdiv * q < sdiv. If q' = q + 1, the subtraction will + // underflow, and we fix it up below. 
+ tmp->d[div_n] = bn_mul_words(tmp->d, sdiv->d, div_n, q); + if (bn_sub_words(wnum, wnum, tmp->d, div_n + 1)) { + q--; + // The final addition is expected to overflow, canceling the underflow. + wnum[div_n] += bn_add_words(wnum, wnum, sdiv->d, div_n); + } + + // q is now correct, and wnum has been updated to the running remainder. + res->d[i] = q; + } + + // Trim leading zeros and correct any negative zeros. + bn_set_minimal_width(snum); + bn_set_minimal_width(res); + + // Knuth step D8: Unnormalize. snum now contains the remainder. + if (rem != nullptr && !BN_rshift(rem, snum, norm_shift)) { + return 0; + } + + return 1; +} + +int BN_nnmod(BIGNUM *r, const BIGNUM *m, const BIGNUM *d, BN_CTX *ctx) { + if (!(BN_mod(r, m, d, ctx))) { + return 0; + } + if (!r->neg) { + return 1; + } + + // now -d < r < 0, so we have to set r := r + d. Ignoring the sign bits, this + // is r = d - r. + return BN_usub(r, d, r); +} + +BN_ULONG bssl::bn_reduce_once(BN_ULONG *r, const BN_ULONG *a, BN_ULONG carry, + const BN_ULONG *m, size_t num) { + assert(r != a); + // |r| = |a| - |m|. |bn_sub_words| performs the bulk of the subtraction, and + // then we apply the borrow to |carry|. + carry -= bn_sub_words(r, a, m, num); + // We know 0 <= |a| < 2*|m|, so -|m| <= |r| < |m|. + // + // If 0 <= |r| < |m|, |r| fits in |num| words and |carry| is zero. We then + // wish to select |r| as the answer. Otherwise -m <= r < 0 and we wish to + // return |r| + |m|, or |a|. |carry| must then be -1 or all ones. In both + // cases, |carry| is a suitable input to |bn_select_words|. + // + // Although |carry| may be one if it was one on input and |bn_sub_words| + // returns zero, this would give |r| > |m|, violating our input assumptions. 
+ declassify_assert(carry + 1 <= 1); + bn_select_words(r, carry, a /* r < 0 */, r /* r >= 0 */, num); + return carry; +} + +BN_ULONG bssl::bn_reduce_once_in_place(BN_ULONG *r, BN_ULONG carry, + const BN_ULONG *m, BN_ULONG *tmp, + size_t num) { + // See |bn_reduce_once| for why this logic works. + carry -= bn_sub_words(tmp, r, m, num); + declassify_assert(carry + 1 <= 1); + bn_select_words(r, carry, r /* tmp < 0 */, tmp /* tmp >= 0 */, num); + return carry; +} + +void bssl::bn_mod_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, + const BN_ULONG *m, BN_ULONG *tmp, size_t num) { + // r = a - b + BN_ULONG borrow = bn_sub_words(r, a, b, num); + // tmp = a - b + m + bn_add_words(tmp, r, m, num); + bn_select_words(r, 0 - borrow, tmp /* r < 0 */, r /* r >= 0 */, num); +} + +void bssl::bn_mod_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, + const BN_ULONG *m, BN_ULONG *tmp, size_t num) { + BN_ULONG carry = bn_add_words(r, a, b, num); + bn_reduce_once_in_place(r, carry, m, tmp, num); +} + +int bssl::bn_div_consttime(BIGNUM *quotient, BIGNUM *remainder, + const BIGNUM *numerator, const BIGNUM *divisor, + unsigned divisor_min_bits, BN_CTX *ctx) { + if (BN_is_negative(numerator) || BN_is_negative(divisor)) { + OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER); + return 0; + } + if (BN_is_zero(divisor)) { + OPENSSL_PUT_ERROR(BN, BN_R_DIV_BY_ZERO); + return 0; + } + + // This function implements long division in binary. It is not very efficient, + // but it is simple, easy to make constant-time, and performant enough for RSA + // key generation. 
+ + BN_CTXScope scope(ctx); + BIGNUM *q = quotient, *r = remainder; + if (quotient == nullptr || quotient == numerator || quotient == divisor) { + q = BN_CTX_get(ctx); + } + if (remainder == nullptr || remainder == numerator || remainder == divisor) { + r = BN_CTX_get(ctx); + } + BIGNUM *tmp = BN_CTX_get(ctx); + int initial_words; + if (q == nullptr || r == nullptr || tmp == nullptr || + !bn_wexpand(q, numerator->width) || !bn_wexpand(r, divisor->width) || + !bn_wexpand(tmp, divisor->width)) { + return 0; + } + + OPENSSL_memset(q->d, 0, numerator->width * sizeof(BN_ULONG)); + q->width = numerator->width; + q->neg = 0; + + OPENSSL_memset(r->d, 0, divisor->width * sizeof(BN_ULONG)); + r->width = divisor->width; + r->neg = 0; + + // Incorporate |numerator| into |r|, one bit at a time, reducing after each + // step. We maintain the invariant that |0 <= r < divisor| and + // |q * divisor + r = n| where |n| is the portion of |numerator| incorporated + // so far. + // + // First, we short-circuit the loop: if we know |divisor| has at least + // |divisor_min_bits| bits, the top |divisor_min_bits - 1| can be incorporated + // without reductions. This significantly speeds up |RSA_check_key|. For + // simplicity, we round down to a whole number of words. + declassify_assert(divisor_min_bits <= BN_num_bits(divisor)); + initial_words = 0; + if (divisor_min_bits > 0) { + initial_words = (divisor_min_bits - 1) / BN_BITS2; + if (initial_words > numerator->width) { + initial_words = numerator->width; + } + OPENSSL_memcpy(r->d, numerator->d + numerator->width - initial_words, + initial_words * sizeof(BN_ULONG)); + } + + for (int i = numerator->width - initial_words - 1; i >= 0; i--) { + for (int bit = BN_BITS2 - 1; bit >= 0; bit--) { + // Incorporate the next bit of the numerator, by computing + // r = 2*r or 2*r + 1. Note the result fits in one more word. We store the + // extra word in |carry|. 
+ BN_ULONG carry = bn_add_words(r->d, r->d, r->d, divisor->width); + r->d[0] |= (numerator->d[i] >> bit) & 1; + // |r| was previously fully-reduced, so we know: + // 2*0 <= r <= 2*(divisor-1) + 1 + // 0 <= r <= 2*divisor - 1 < 2*divisor. + // Thus |r| satisfies the preconditions for |bn_reduce_once_in_place|. + BN_ULONG subtracted = bn_reduce_once_in_place(r->d, carry, divisor->d, + tmp->d, divisor->width); + // The corresponding bit of the quotient is set iff we needed to subtract. + q->d[i] |= (~subtracted & 1) << bit; + } + } + + if ((quotient != nullptr && !BN_copy(quotient, q)) || + (remainder != nullptr && !BN_copy(remainder, r))) { + return 0; + } + + return 1; +} + +static BIGNUM *bn_scratch_space_from_ctx(size_t width, BN_CTX *ctx) { + BIGNUM *ret = BN_CTX_get(ctx); + if (ret == nullptr || !bn_wexpand(ret, width)) { + return nullptr; + } + ret->neg = 0; + ret->width = (int)width; + return ret; +} + +// bn_resized_from_ctx returns |bn| with width at least |width| or NULL on +// error. This is so it may be used with low-level "words" functions. If +// necessary, it allocates a new |BIGNUM| with a lifetime of the current scope +// in |ctx|, so the caller does not need to explicitly free it. |bn| must fit in +// |width| words. +static const BIGNUM *bn_resized_from_ctx(const BIGNUM *bn, size_t width, + BN_CTX *ctx) { + if ((size_t)bn->width >= width) { + // Any excess words must be zero. 
+ assert(bn_fits_in_words(bn, width)); + return bn; + } + BIGNUM *ret = bn_scratch_space_from_ctx(width, ctx); + if (ret == nullptr || !BN_copy(ret, bn) || !bn_resize_words(ret, width)) { + return nullptr; + } + return ret; +} + +int BN_mod_add(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m, + BN_CTX *ctx) { + if (!BN_add(r, a, b)) { + return 0; + } + return BN_nnmod(r, r, m, ctx); +} + +int BN_mod_add_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, + const BIGNUM *m) { + UniquePtr ctx(BN_CTX_new()); + return ctx != nullptr && bn_mod_add_consttime(r, a, b, m, ctx.get()); +} + +int bssl::bn_mod_add_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, + const BIGNUM *m, BN_CTX *ctx) { + BN_CTXScope scope(ctx); + a = bn_resized_from_ctx(a, m->width, ctx); + b = bn_resized_from_ctx(b, m->width, ctx); + BIGNUM *tmp = bn_scratch_space_from_ctx(m->width, ctx); + if (a == nullptr || b == nullptr || tmp == nullptr || + !bn_wexpand(r, m->width)) { + return 0; + } + bn_mod_add_words(r->d, a->d, b->d, m->d, tmp->d, m->width); + r->width = m->width; + r->neg = 0; + return 1; +} + +int BN_mod_sub(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m, + BN_CTX *ctx) { + if (!BN_sub(r, a, b)) { + return 0; + } + return BN_nnmod(r, r, m, ctx); +} + +int bssl::bn_mod_sub_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, + const BIGNUM *m, BN_CTX *ctx) { + BN_CTXScope scope(ctx); + a = bn_resized_from_ctx(a, m->width, ctx); + b = bn_resized_from_ctx(b, m->width, ctx); + BIGNUM *tmp = bn_scratch_space_from_ctx(m->width, ctx); + if (a == nullptr || b == nullptr || tmp == nullptr || + !bn_wexpand(r, m->width)) { + return 0; + } + bn_mod_sub_words(r->d, a->d, b->d, m->d, tmp->d, m->width); + r->width = m->width; + r->neg = 0; + return 1; +} + +int BN_mod_sub_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, + const BIGNUM *m) { + UniquePtr ctx(BN_CTX_new()); + return ctx != nullptr && bn_mod_sub_consttime(r, a, b, m, ctx.get()); +} + +int BN_mod_mul(BIGNUM 
*r, const BIGNUM *a, const BIGNUM *b, const BIGNUM *m, + BN_CTX *ctx) { + BN_CTXScope scope(ctx); + BIGNUM *t = BN_CTX_get(ctx); + if (t == nullptr) { + return 0; + } + + if (a == b) { + if (!BN_sqr(t, a, ctx)) { + return 0; + } + } else { + if (!BN_mul(t, a, b, ctx)) { + return 0; + } + } + + if (!BN_nnmod(r, t, m, ctx)) { + return 0; + } + + return 1; +} + +int BN_mod_sqr(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, BN_CTX *ctx) { + if (!BN_sqr(r, a, ctx)) { + return 0; + } + + // r->neg == 0, thus we don't need BN_nnmod + return BN_mod(r, r, m, ctx); +} + +int BN_mod_lshift(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m, + BN_CTX *ctx) { + if (!BN_nnmod(r, a, m, ctx)) { + return 0; + } + + UniquePtr abs_m; + if (m->neg) { + abs_m.reset(BN_dup(m)); + if (abs_m == nullptr) { + return 0; + } + abs_m->neg = 0; + } + + return bn_mod_lshift_consttime(r, r, n, (abs_m ? abs_m.get() : m), ctx); +} + +int bssl::bn_mod_lshift_consttime(BIGNUM *r, const BIGNUM *a, int n, + const BIGNUM *m, BN_CTX *ctx) { + if (!BN_copy(r, a) || !bn_resize_words(r, m->width)) { + return 0; + } + + BN_CTXScope scope(ctx); + BIGNUM *tmp = bn_scratch_space_from_ctx(m->width, ctx); + if (tmp == nullptr) { + return 0; + } + for (int i = 0; i < n; i++) { + bn_mod_add_words(r->d, r->d, r->d, m->d, tmp->d, m->width); + } + r->neg = 0; + return 1; +} + +int BN_mod_lshift_quick(BIGNUM *r, const BIGNUM *a, int n, const BIGNUM *m) { + UniquePtr ctx(BN_CTX_new()); + return ctx != nullptr && bn_mod_lshift_consttime(r, a, n, m, ctx.get()); +} + +int BN_mod_lshift1(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, BN_CTX *ctx) { + if (!BN_lshift1(r, a)) { + return 0; + } + + return BN_nnmod(r, r, m, ctx); +} + +int bssl::bn_mod_lshift1_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *m, + BN_CTX *ctx) { + return bn_mod_add_consttime(r, a, a, m, ctx); +} + +int BN_mod_lshift1_quick(BIGNUM *r, const BIGNUM *a, const BIGNUM *m) { + UniquePtr ctx(BN_CTX_new()); + return ctx != nullptr && 
bn_mod_lshift1_consttime(r, a, m, ctx.get()); +} + +BN_ULONG BN_div_word(BIGNUM *a, BN_ULONG w) { + BN_ULONG ret = 0; + int i, j; + + if (!w) { + // actually this an error (division by zero) + return (BN_ULONG)-1; + } + + if (a->width == 0) { + return 0; + } + + // normalize input for |bn_div_rem_words|. + j = BN_BITS2 - BN_num_bits_word(w); + w <<= j; + if (!BN_lshift(a, a, j)) { + return (BN_ULONG)-1; + } + + for (i = a->width - 1; i >= 0; i--) { + BN_ULONG l = a->d[i]; + BN_ULONG d; + BN_ULONG unused_rem; + bn_div_rem_words(&d, &unused_rem, ret, l, w); + ret = l - (d * w); + a->d[i] = d; + } + + bn_set_minimal_width(a); + ret >>= j; + return ret; +} + +BN_ULONG BN_mod_word(const BIGNUM *a, BN_ULONG w) { +#ifndef BN_CAN_DIVIDE_ULLONG + BN_ULONG ret = 0; +#else + BN_ULLONG ret = 0; +#endif + int i; + + if (w == 0) { + return (BN_ULONG)-1; + } + +#ifndef BN_CAN_DIVIDE_ULLONG + // If |w| is too long and we don't have |BN_ULLONG| division then we need to + // fall back to using |BN_div_word|. + if (w > ((BN_ULONG)1 << BN_BITS4)) { + BIGNUM *tmp = BN_dup(a); + if (tmp == nullptr) { + return (BN_ULONG)-1; + } + ret = BN_div_word(tmp, w); + BN_free(tmp); + return ret; + } +#endif + + for (i = a->width - 1; i >= 0; i--) { +#ifndef BN_CAN_DIVIDE_ULLONG + ret = ((ret << BN_BITS4) | ((a->d[i] >> BN_BITS4) & BN_MASK2l)) % w; + ret = ((ret << BN_BITS4) | (a->d[i] & BN_MASK2l)) % w; +#else + ret = (BN_ULLONG)(((ret << (BN_ULLONG)BN_BITS2) | a->d[i]) % (BN_ULLONG)w); +#endif + } + return (BN_ULONG)ret; +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/div_extra.c b/third_party/boringssl/src/crypto/fipsmodule/bn/div_extra.c deleted file mode 100644 index 7f03f28d..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/bn/div_extra.c +++ /dev/null @@ -1,87 +0,0 @@ -/* Copyright (c) 2018, Google Inc. 
- * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include - -#include - -#include "internal.h" - - -// The following functions use a Barrett reduction variant to avoid leaking the -// numerator. See http://ridiculousfish.com/blog/posts/labor-of-division-episode-i.html -// -// We use 32-bit numerator and 16-bit divisor for simplicity. This allows -// computing |m| and |q| without architecture-specific code. - -// mod_u16 returns |n| mod |d|. |p| and |m| are the "magic numbers" for |d| (see -// reference). For proof of correctness in Coq, see -// https://github.com/davidben/fiat-crypto/blob/barrett/src/Arithmetic/BarrettReduction/RidiculousFish.v -// Note the Coq version of |mod_u16| additionally includes the computation of -// |p| and |m| from |bn_mod_u16_consttime| below. -static uint16_t mod_u16(uint32_t n, uint16_t d, uint32_t p, uint32_t m) { - // Compute floor(n/d) per steps 3 through 5. - uint32_t q = ((uint64_t)m * n) >> 32; - // Note there is a typo in the reference. We right-shift by one, not two. - uint32_t t = ((n - q) >> 1) + q; - t = t >> (p - 1); - - // Multiply and subtract to get the remainder. - n -= d * t; - assert(n < d); - return n; -} - -// shift_and_add_mod_u16 returns |r| * 2^32 + |a| mod |d|. 
|p| and |m| are the -// "magic numbers" for |d| (see reference). -static uint16_t shift_and_add_mod_u16(uint16_t r, uint32_t a, uint16_t d, - uint32_t p, uint32_t m) { - // Incorporate |a| in two 16-bit chunks. - uint32_t t = r; - t <<= 16; - t |= a >> 16; - t = mod_u16(t, d, p, m); - - t <<= 16; - t |= a & 0xffff; - t = mod_u16(t, d, p, m); - return t; -} - -uint16_t bn_mod_u16_consttime(const BIGNUM *bn, uint16_t d) { - if (d <= 1) { - return 0; - } - - // Compute the "magic numbers" for |d|. See steps 1 and 2. - // This computes p = ceil(log_2(d)). - uint32_t p = BN_num_bits_word(d - 1); - // This operation is not constant-time, but |p| and |d| are public values. - // Note that |p| is at most 16, so the computation fits in |uint64_t|. - assert(p <= 16); - uint32_t m = ((UINT64_C(1) << (32 + p)) + d - 1) / d; - - uint16_t ret = 0; - for (int i = bn->width - 1; i >= 0; i--) { -#if BN_BITS2 == 32 - ret = shift_and_add_mod_u16(ret, bn->d[i], d, p, m); -#elif BN_BITS2 == 64 - ret = shift_and_add_mod_u16(ret, bn->d[i] >> 32, d, p, m); - ret = shift_and_add_mod_u16(ret, bn->d[i] & 0xffffffff, d, p, m); -#else -#error "Unknown BN_ULONG size" -#endif - } - return ret; -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/div_extra.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/bn/div_extra.cc.inc new file mode 100644 index 00000000..52e1be63 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/bn/div_extra.cc.inc @@ -0,0 +1,90 @@ +// Copyright 2018 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include "internal.h" + + +using namespace bssl; + +// The following functions use a Barrett reduction variant to avoid leaking the +// numerator. See +// http://ridiculousfish.com/blog/posts/labor-of-division-episode-i.html +// +// We use 32-bit numerator and 16-bit divisor for simplicity. This allows +// computing |m| and |q| without architecture-specific code. + +// mod_u16 returns |n| mod |d|. |p| and |m| are the "magic numbers" for |d| (see +// reference). For proof of correctness in Coq, see +// https://github.com/davidben/fiat-crypto/blob/barrett/src/Arithmetic/BarrettReduction/RidiculousFish.v +// Note the Coq version of |mod_u16| additionally includes the computation of +// |p| and |m| from |bn_mod_u16_consttime| below. +static uint16_t mod_u16(uint32_t n, uint16_t d, uint32_t p, uint32_t m) { + // Compute floor(n/d) per steps 3 through 5. + uint32_t q = ((uint64_t)m * n) >> 32; + // Note there is a typo in the reference. We right-shift by one, not two. + uint32_t t = ((n - q) >> 1) + q; + t = t >> (p - 1); + + // Multiply and subtract to get the remainder. + n -= d * t; + declassify_assert(n < d); + return n; +} + +// shift_and_add_mod_u16 returns |r| * 2^32 + |a| mod |d|. |p| and |m| are the +// "magic numbers" for |d| (see reference). +static uint16_t shift_and_add_mod_u16(uint16_t r, uint32_t a, uint16_t d, + uint32_t p, uint32_t m) { + // Incorporate |a| in two 16-bit chunks. + uint32_t t = r; + t <<= 16; + t |= a >> 16; + t = mod_u16(t, d, p, m); + + t <<= 16; + t |= a & 0xffff; + t = mod_u16(t, d, p, m); + return t; +} + +uint16_t bssl::bn_mod_u16_consttime(const BIGNUM *bn, uint16_t d) { + if (d <= 1) { + return 0; + } + + // Compute the "magic numbers" for |d|. See steps 1 and 2. + // This computes p = ceil(log_2(d)). 
+ uint32_t p = BN_num_bits_word(d - 1); + // This operation is not constant-time, but |p| and |d| are public values. + // Note that |p| is at most 16, so the computation fits in |uint64_t|. + assert(p <= 16); + uint32_t m = (uint32_t)(((UINT64_C(1) << (32 + p)) + d - 1) / d); + + uint16_t ret = 0; + for (int i = bn->width - 1; i >= 0; i--) { +#if BN_BITS2 == 32 + ret = shift_and_add_mod_u16(ret, bn->d[i], d, p, m); +#elif BN_BITS2 == 64 + ret = shift_and_add_mod_u16(ret, bn->d[i] >> 32, d, p, m); + ret = shift_and_add_mod_u16(ret, bn->d[i] & 0xffffffff, d, p, m); +#else +#error "Unknown BN_ULONG size" +#endif + } + return ret; +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/exponentiation.c b/third_party/boringssl/src/crypto/fipsmodule/bn/exponentiation.c deleted file mode 100644 index 9b609b3a..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/bn/exponentiation.c +++ /dev/null @@ -1,1286 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. 
this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] - */ -/* ==================================================================== - * Copyright (c) 1998-2005 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). */ - -#include - -#include -#include -#include - -#include -#include - -#include "internal.h" -#include "rsaz_exp.h" - - -int BN_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx) { - int i, bits, ret = 0; - BIGNUM *v, *rr; - - BN_CTX_start(ctx); - if (r == a || r == p) { - rr = BN_CTX_get(ctx); - } else { - rr = r; - } - - v = BN_CTX_get(ctx); - if (rr == NULL || v == NULL) { - goto err; - } - - if (BN_copy(v, a) == NULL) { - goto err; - } - bits = BN_num_bits(p); - - if (BN_is_odd(p)) { - if (BN_copy(rr, a) == NULL) { - goto err; - } - } else { - if (!BN_one(rr)) { - goto err; - } - } - - for (i = 1; i < bits; i++) { - if (!BN_sqr(v, v, ctx)) { - goto err; - } - if (BN_is_bit_set(p, i)) { - if (!BN_mul(rr, rr, v, ctx)) { - goto err; - } - } - } - - if (r != rr && !BN_copy(r, rr)) { - goto err; - } - ret = 1; - -err: - BN_CTX_end(ctx); - return ret; -} - -typedef struct bn_recp_ctx_st { - BIGNUM N; // the divisor - BIGNUM Nr; // the reciprocal - int num_bits; - int shift; - int flags; -} BN_RECP_CTX; - -static void BN_RECP_CTX_init(BN_RECP_CTX *recp) { - BN_init(&recp->N); - BN_init(&recp->Nr); - recp->num_bits = 0; - recp->shift = 0; - recp->flags = 0; -} - -static 
void BN_RECP_CTX_free(BN_RECP_CTX *recp) { - if (recp == NULL) { - return; - } - - BN_free(&recp->N); - BN_free(&recp->Nr); -} - -static int BN_RECP_CTX_set(BN_RECP_CTX *recp, const BIGNUM *d, BN_CTX *ctx) { - if (!BN_copy(&(recp->N), d)) { - return 0; - } - BN_zero(&recp->Nr); - recp->num_bits = BN_num_bits(d); - recp->shift = 0; - - return 1; -} - -// len is the expected size of the result We actually calculate with an extra -// word of precision, so we can do faster division if the remainder is not -// required. -// r := 2^len / m -static int BN_reciprocal(BIGNUM *r, const BIGNUM *m, int len, BN_CTX *ctx) { - int ret = -1; - BIGNUM *t; - - BN_CTX_start(ctx); - t = BN_CTX_get(ctx); - if (t == NULL) { - goto err; - } - - if (!BN_set_bit(t, len)) { - goto err; - } - - if (!BN_div(r, NULL, t, m, ctx)) { - goto err; - } - - ret = len; - -err: - BN_CTX_end(ctx); - return ret; -} - -static int BN_div_recp(BIGNUM *dv, BIGNUM *rem, const BIGNUM *m, - BN_RECP_CTX *recp, BN_CTX *ctx) { - int i, j, ret = 0; - BIGNUM *a, *b, *d, *r; - - BN_CTX_start(ctx); - a = BN_CTX_get(ctx); - b = BN_CTX_get(ctx); - if (dv != NULL) { - d = dv; - } else { - d = BN_CTX_get(ctx); - } - - if (rem != NULL) { - r = rem; - } else { - r = BN_CTX_get(ctx); - } - - if (a == NULL || b == NULL || d == NULL || r == NULL) { - goto err; - } - - if (BN_ucmp(m, &recp->N) < 0) { - BN_zero(d); - if (!BN_copy(r, m)) { - goto err; - } - BN_CTX_end(ctx); - return 1; - } - - // We want the remainder - // Given input of ABCDEF / ab - // we need multiply ABCDEF by 3 digests of the reciprocal of ab - - // i := max(BN_num_bits(m), 2*BN_num_bits(N)) - i = BN_num_bits(m); - j = recp->num_bits << 1; - if (j > i) { - i = j; - } - - // Nr := round(2^i / N) - if (i != recp->shift) { - recp->shift = - BN_reciprocal(&(recp->Nr), &(recp->N), i, - ctx); // BN_reciprocal returns i, or -1 for an error - } - - if (recp->shift == -1) { - goto err; - } - - // d := |round(round(m / 2^BN_num_bits(N)) * recp->Nr / 2^(i - - // 
BN_num_bits(N)))| - // = |round(round(m / 2^BN_num_bits(N)) * round(2^i / N) / 2^(i - - // BN_num_bits(N)))| - // <= |(m / 2^BN_num_bits(N)) * (2^i / N) * (2^BN_num_bits(N) / 2^i)| - // = |m/N| - if (!BN_rshift(a, m, recp->num_bits)) { - goto err; - } - if (!BN_mul(b, a, &(recp->Nr), ctx)) { - goto err; - } - if (!BN_rshift(d, b, i - recp->num_bits)) { - goto err; - } - d->neg = 0; - - if (!BN_mul(b, &(recp->N), d, ctx)) { - goto err; - } - if (!BN_usub(r, m, b)) { - goto err; - } - r->neg = 0; - - j = 0; - while (BN_ucmp(r, &(recp->N)) >= 0) { - if (j++ > 2) { - OPENSSL_PUT_ERROR(BN, BN_R_BAD_RECIPROCAL); - goto err; - } - if (!BN_usub(r, r, &(recp->N))) { - goto err; - } - if (!BN_add_word(d, 1)) { - goto err; - } - } - - r->neg = BN_is_zero(r) ? 0 : m->neg; - d->neg = m->neg ^ recp->N.neg; - ret = 1; - -err: - BN_CTX_end(ctx); - return ret; -} - -static int BN_mod_mul_reciprocal(BIGNUM *r, const BIGNUM *x, const BIGNUM *y, - BN_RECP_CTX *recp, BN_CTX *ctx) { - int ret = 0; - BIGNUM *a; - const BIGNUM *ca; - - BN_CTX_start(ctx); - a = BN_CTX_get(ctx); - if (a == NULL) { - goto err; - } - - if (y != NULL) { - if (x == y) { - if (!BN_sqr(a, x, ctx)) { - goto err; - } - } else { - if (!BN_mul(a, x, y, ctx)) { - goto err; - } - } - ca = a; - } else { - ca = x; // Just do the mod - } - - ret = BN_div_recp(NULL, r, ca, recp, ctx); - -err: - BN_CTX_end(ctx); - return ret; -} - -// BN_window_bits_for_exponent_size returns sliding window size for mod_exp with -// a |b| bit exponent. -// -// For window size 'w' (w >= 2) and a random 'b' bits exponent, the number of -// multiplications is a constant plus on average -// -// 2^(w-1) + (b-w)/(w+1); -// -// here 2^(w-1) is for precomputing the table (we actually need entries only -// for windows that have the lowest bit set), and (b-w)/(w+1) is an -// approximation for the expected number of w-bit windows, not counting the -// first one. 
-// -// Thus we should use -// -// w >= 6 if b > 671 -// w = 5 if 671 > b > 239 -// w = 4 if 239 > b > 79 -// w = 3 if 79 > b > 23 -// w <= 2 if 23 > b -// -// (with draws in between). Very small exponents are often selected -// with low Hamming weight, so we use w = 1 for b <= 23. -static int BN_window_bits_for_exponent_size(int b) { - if (b > 671) { - return 6; - } - if (b > 239) { - return 5; - } - if (b > 79) { - return 4; - } - if (b > 23) { - return 3; - } - return 1; -} - -// TABLE_SIZE is the maximum precomputation table size for *variable* sliding -// windows. This must be 2^(max_window - 1), where max_window is the largest -// value returned from |BN_window_bits_for_exponent_size|. -#define TABLE_SIZE 32 - -// TABLE_BITS_SMALL is the smallest value returned from -// |BN_window_bits_for_exponent_size| when |b| is at most |BN_BITS2| * -// |BN_SMALL_MAX_WORDS| words. -#define TABLE_BITS_SMALL 5 - -// TABLE_SIZE_SMALL is the same as |TABLE_SIZE|, but when |b| is at most -// |BN_BITS2| * |BN_SMALL_MAX_WORDS|. -#define TABLE_SIZE_SMALL (1 << (TABLE_BITS_SMALL - 1)) - -static int mod_exp_recp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, - const BIGNUM *m, BN_CTX *ctx) { - int i, j, ret = 0, wstart, window; - int start = 1; - BIGNUM *aa; - // Table of variables obtained from 'ctx' - BIGNUM *val[TABLE_SIZE]; - BN_RECP_CTX recp; - - // This function is only called on even moduli. 
- assert(!BN_is_odd(m)); - - int bits = BN_num_bits(p); - if (bits == 0) { - return BN_one(r); - } - - BN_CTX_start(ctx); - aa = BN_CTX_get(ctx); - val[0] = BN_CTX_get(ctx); - if (!aa || !val[0]) { - goto err; - } - - BN_RECP_CTX_init(&recp); - if (m->neg) { - // ignore sign of 'm' - if (!BN_copy(aa, m)) { - goto err; - } - aa->neg = 0; - if (BN_RECP_CTX_set(&recp, aa, ctx) <= 0) { - goto err; - } - } else { - if (BN_RECP_CTX_set(&recp, m, ctx) <= 0) { - goto err; - } - } - - if (!BN_nnmod(val[0], a, m, ctx)) { - goto err; // 1 - } - if (BN_is_zero(val[0])) { - BN_zero(r); - ret = 1; - goto err; - } - - window = BN_window_bits_for_exponent_size(bits); - if (window > 1) { - if (!BN_mod_mul_reciprocal(aa, val[0], val[0], &recp, ctx)) { - goto err; // 2 - } - j = 1 << (window - 1); - for (i = 1; i < j; i++) { - if (((val[i] = BN_CTX_get(ctx)) == NULL) || - !BN_mod_mul_reciprocal(val[i], val[i - 1], aa, &recp, ctx)) { - goto err; - } - } - } - - start = 1; // This is used to avoid multiplication etc - // when there is only the value '1' in the - // buffer. - wstart = bits - 1; // The top bit of the window - - if (!BN_one(r)) { - goto err; - } - - for (;;) { - int wvalue; // The 'value' of the window - int wend; // The bottom bit of the window - - if (!BN_is_bit_set(p, wstart)) { - if (!start) { - if (!BN_mod_mul_reciprocal(r, r, r, &recp, ctx)) { - goto err; - } - } - if (wstart == 0) { - break; - } - wstart--; - continue; - } - - // We now have wstart on a 'set' bit, we now need to work out - // how bit a window to do. 
To do this we need to scan - // forward until the last set bit before the end of the - // window - wvalue = 1; - wend = 0; - for (i = 1; i < window; i++) { - if (wstart - i < 0) { - break; - } - if (BN_is_bit_set(p, wstart - i)) { - wvalue <<= (i - wend); - wvalue |= 1; - wend = i; - } - } - - // wend is the size of the current window - j = wend + 1; - // add the 'bytes above' - if (!start) { - for (i = 0; i < j; i++) { - if (!BN_mod_mul_reciprocal(r, r, r, &recp, ctx)) { - goto err; - } - } - } - - // wvalue will be an odd number < 2^window - if (!BN_mod_mul_reciprocal(r, r, val[wvalue >> 1], &recp, ctx)) { - goto err; - } - - // move the 'window' down further - wstart -= wend + 1; - start = 0; - if (wstart < 0) { - break; - } - } - ret = 1; - -err: - BN_CTX_end(ctx); - BN_RECP_CTX_free(&recp); - return ret; -} - -int BN_mod_exp(BIGNUM *r, const BIGNUM *a, const BIGNUM *p, const BIGNUM *m, - BN_CTX *ctx) { - if (m->neg) { - OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER); - return 0; - } - if (a->neg || BN_ucmp(a, m) >= 0) { - if (!BN_nnmod(r, a, m, ctx)) { - return 0; - } - a = r; - } - - if (BN_is_odd(m)) { - return BN_mod_exp_mont(r, a, p, m, ctx, NULL); - } - - return mod_exp_recp(r, a, p, m, ctx); -} - -int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, - const BIGNUM *m, BN_CTX *ctx, const BN_MONT_CTX *mont) { - if (!BN_is_odd(m)) { - OPENSSL_PUT_ERROR(BN, BN_R_CALLED_WITH_EVEN_MODULUS); - return 0; - } - if (m->neg) { - OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER); - return 0; - } - if (a->neg || BN_ucmp(a, m) >= 0) { - OPENSSL_PUT_ERROR(BN, BN_R_INPUT_NOT_REDUCED); - return 0; - } - - int bits = BN_num_bits(p); - if (bits == 0) { - // x**0 mod 1 is still zero. 
- if (BN_abs_is_word(m, 1)) { - BN_zero(rr); - return 1; - } - return BN_one(rr); - } - - int ret = 0; - BIGNUM *val[TABLE_SIZE]; - BN_MONT_CTX *new_mont = NULL; - - BN_CTX_start(ctx); - BIGNUM *r = BN_CTX_get(ctx); - val[0] = BN_CTX_get(ctx); - if (r == NULL || val[0] == NULL) { - goto err; - } - - // Allocate a montgomery context if it was not supplied by the caller. - if (mont == NULL) { - new_mont = BN_MONT_CTX_new_consttime(m, ctx); - if (new_mont == NULL) { - goto err; - } - mont = new_mont; - } - - // We exponentiate by looking at sliding windows of the exponent and - // precomputing powers of |a|. Windows may be shifted so they always end on a - // set bit, so only precompute odd powers. We compute val[i] = a^(2*i + 1) - // for i = 0 to 2^(window-1), all in Montgomery form. - int window = BN_window_bits_for_exponent_size(bits); - if (!BN_to_montgomery(val[0], a, mont, ctx)) { - goto err; - } - if (window > 1) { - BIGNUM *d = BN_CTX_get(ctx); - if (d == NULL || - !BN_mod_mul_montgomery(d, val[0], val[0], mont, ctx)) { - goto err; - } - for (int i = 1; i < 1 << (window - 1); i++) { - val[i] = BN_CTX_get(ctx); - if (val[i] == NULL || - !BN_mod_mul_montgomery(val[i], val[i - 1], d, mont, ctx)) { - goto err; - } - } - } - - // |p| is non-zero, so at least one window is non-zero. To save some - // multiplications, defer initializing |r| until then. - int r_is_one = 1; - int wstart = bits - 1; // The top bit of the window. - for (;;) { - if (!BN_is_bit_set(p, wstart)) { - if (!r_is_one && !BN_mod_mul_montgomery(r, r, r, mont, ctx)) { - goto err; - } - if (wstart == 0) { - break; - } - wstart--; - continue; - } - - // We now have wstart on a set bit. Find the largest window we can use. - int wvalue = 1; - int wsize = 0; - for (int i = 1; i < window && i <= wstart; i++) { - if (BN_is_bit_set(p, wstart - i)) { - wvalue <<= (i - wsize); - wvalue |= 1; - wsize = i; - } - } - - // Shift |r| to the end of the window. 
- if (!r_is_one) { - for (int i = 0; i < wsize + 1; i++) { - if (!BN_mod_mul_montgomery(r, r, r, mont, ctx)) { - goto err; - } - } - } - - assert(wvalue & 1); - assert(wvalue < (1 << window)); - if (r_is_one) { - if (!BN_copy(r, val[wvalue >> 1])) { - goto err; - } - } else if (!BN_mod_mul_montgomery(r, r, val[wvalue >> 1], mont, ctx)) { - goto err; - } - - r_is_one = 0; - if (wstart == wsize) { - break; - } - wstart -= wsize + 1; - } - - // |p| is non-zero, so |r_is_one| must be cleared at some point. - assert(!r_is_one); - - if (!BN_from_montgomery(rr, r, mont, ctx)) { - goto err; - } - ret = 1; - -err: - BN_MONT_CTX_free(new_mont); - BN_CTX_end(ctx); - return ret; -} - -void bn_mod_exp_mont_small(BN_ULONG *r, const BN_ULONG *a, size_t num, - const BN_ULONG *p, size_t num_p, - const BN_MONT_CTX *mont) { - if (num != (size_t)mont->N.width || num > BN_SMALL_MAX_WORDS) { - abort(); - } - assert(BN_is_odd(&mont->N)); - - // Count the number of bits in |p|. Note this function treats |p| as public. - while (num_p != 0 && p[num_p - 1] == 0) { - num_p--; - } - if (num_p == 0) { - bn_from_montgomery_small(r, num, mont->RR.d, num, mont); - return; - } - unsigned bits = BN_num_bits_word(p[num_p - 1]) + (num_p - 1) * BN_BITS2; - assert(bits != 0); - - // We exponentiate by looking at sliding windows of the exponent and - // precomputing powers of |a|. Windows may be shifted so they always end on a - // set bit, so only precompute odd powers. We compute val[i] = a^(2*i + 1) for - // i = 0 to 2^(window-1), all in Montgomery form. - unsigned window = BN_window_bits_for_exponent_size(bits); - if (window > TABLE_BITS_SMALL) { - window = TABLE_BITS_SMALL; // Tolerate excessively large |p|. 
- } - BN_ULONG val[TABLE_SIZE_SMALL][BN_SMALL_MAX_WORDS]; - OPENSSL_memcpy(val[0], a, num * sizeof(BN_ULONG)); - if (window > 1) { - BN_ULONG d[BN_SMALL_MAX_WORDS]; - bn_mod_mul_montgomery_small(d, val[0], val[0], num, mont); - for (unsigned i = 1; i < 1u << (window - 1); i++) { - bn_mod_mul_montgomery_small(val[i], val[i - 1], d, num, mont); - } - } - - // |p| is non-zero, so at least one window is non-zero. To save some - // multiplications, defer initializing |r| until then. - int r_is_one = 1; - unsigned wstart = bits - 1; // The top bit of the window. - for (;;) { - if (!bn_is_bit_set_words(p, num_p, wstart)) { - if (!r_is_one) { - bn_mod_mul_montgomery_small(r, r, r, num, mont); - } - if (wstart == 0) { - break; - } - wstart--; - continue; - } - - // We now have wstart on a set bit. Find the largest window we can use. - unsigned wvalue = 1; - unsigned wsize = 0; - for (unsigned i = 1; i < window && i <= wstart; i++) { - if (bn_is_bit_set_words(p, num_p, wstart - i)) { - wvalue <<= (i - wsize); - wvalue |= 1; - wsize = i; - } - } - - // Shift |r| to the end of the window. - if (!r_is_one) { - for (unsigned i = 0; i < wsize + 1; i++) { - bn_mod_mul_montgomery_small(r, r, r, num, mont); - } - } - - assert(wvalue & 1); - assert(wvalue < (1u << window)); - if (r_is_one) { - OPENSSL_memcpy(r, val[wvalue >> 1], num * sizeof(BN_ULONG)); - } else { - bn_mod_mul_montgomery_small(r, r, val[wvalue >> 1], num, mont); - } - r_is_one = 0; - if (wstart == wsize) { - break; - } - wstart -= wsize + 1; - } - - // |p| is non-zero, so |r_is_one| must be cleared at some point. - assert(!r_is_one); - OPENSSL_cleanse(val, sizeof(val)); -} - -void bn_mod_inverse0_prime_mont_small(BN_ULONG *r, const BN_ULONG *a, - size_t num, const BN_MONT_CTX *mont) { - if (num != (size_t)mont->N.width || num > BN_SMALL_MAX_WORDS) { - abort(); - } - - // Per Fermat's Little Theorem, a^-1 = a^(p-2) (mod p) for p prime. 
- BN_ULONG p_minus_two[BN_SMALL_MAX_WORDS]; - const BN_ULONG *p = mont->N.d; - OPENSSL_memcpy(p_minus_two, p, num * sizeof(BN_ULONG)); - if (p_minus_two[0] >= 2) { - p_minus_two[0] -= 2; - } else { - p_minus_two[0] -= 2; - for (size_t i = 1; i < num; i++) { - if (p_minus_two[i]-- != 0) { - break; - } - } - } - - bn_mod_exp_mont_small(r, a, num, p_minus_two, num, mont); -} - -static void copy_to_prebuf(const BIGNUM *b, int top, BN_ULONG *table, int idx, - int window) { - int ret = bn_copy_words(table + idx * top, top, b); - assert(ret); // |b| is guaranteed to fit. - (void)ret; -} - -static int copy_from_prebuf(BIGNUM *b, int top, const BN_ULONG *table, int idx, - int window) { - if (!bn_wexpand(b, top)) { - return 0; - } - - OPENSSL_memset(b->d, 0, sizeof(BN_ULONG) * top); - const int width = 1 << window; - for (int i = 0; i < width; i++, table += top) { - BN_ULONG mask = constant_time_eq_int(i, idx); - for (int j = 0; j < top; j++) { - b->d[j] |= table[j] & mask; - } - } - - b->width = top; - return 1; -} - -#define MOD_EXP_CTIME_MIN_CACHE_LINE_MASK \ - (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - 1) - -// Window sizes optimized for fixed window size modular exponentiation -// algorithm (BN_mod_exp_mont_consttime). -// -// To achieve the security goals of BN_mode_exp_mont_consttime, the maximum -// size of the window must not exceed -// log_2(MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH). -// -// Window size thresholds are defined for cache line sizes of 32 and 64, cache -// line sizes where log_2(32)=5 and log_2(64)=6 respectively. A window size of -// 7 should only be used on processors that have a 128 byte or greater cache -// line size. -#if MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH == 64 - -#define BN_window_bits_for_ctime_exponent_size(b) \ - ((b) > 937 ? 6 : (b) > 306 ? 5 : (b) > 89 ? 4 : (b) > 22 ? 3 : 1) -#define BN_MAX_WINDOW_BITS_FOR_CTIME_EXPONENT_SIZE (6) - -#elif MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH == 32 - -#define BN_window_bits_for_ctime_exponent_size(b) \ - ((b) > 306 ? 
5 : (b) > 89 ? 4 : (b) > 22 ? 3 : 1) -#define BN_MAX_WINDOW_BITS_FOR_CTIME_EXPONENT_SIZE (5) - -#endif - -// Given a pointer value, compute the next address that is a cache line -// multiple. -#define MOD_EXP_CTIME_ALIGN(x_) \ - ((unsigned char *)(x_) + \ - (MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH - \ - (((size_t)(x_)) & (MOD_EXP_CTIME_MIN_CACHE_LINE_MASK)))) - -// This variant of |BN_mod_exp_mont| uses fixed windows and fixed memory access -// patterns to protect secret exponents (cf. the hyper-threading timing attacks -// pointed out by Colin Percival, -// http://www.daemonology.net/hyperthreading-considered-harmful/) -int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, - const BIGNUM *m, BN_CTX *ctx, - const BN_MONT_CTX *mont) { - int i, ret = 0, window, wvalue; - BN_MONT_CTX *new_mont = NULL; - - int numPowers; - unsigned char *powerbufFree = NULL; - int powerbufLen = 0; - BN_ULONG *powerbuf = NULL; - BIGNUM tmp, am; - - if (!BN_is_odd(m)) { - OPENSSL_PUT_ERROR(BN, BN_R_CALLED_WITH_EVEN_MODULUS); - return 0; - } - if (m->neg) { - OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER); - return 0; - } - if (a->neg || BN_ucmp(a, m) >= 0) { - OPENSSL_PUT_ERROR(BN, BN_R_INPUT_NOT_REDUCED); - return 0; - } - - // Use all bits stored in |p|, rather than |BN_num_bits|, so we do not leak - // whether the top bits are zero. - int max_bits = p->width * BN_BITS2; - int bits = max_bits; - if (bits == 0) { - // x**0 mod 1 is still zero. - if (BN_abs_is_word(m, 1)) { - BN_zero(rr); - return 1; - } - return BN_one(rr); - } - - // Allocate a montgomery context if it was not supplied by the caller. - if (mont == NULL) { - new_mont = BN_MONT_CTX_new_consttime(m, ctx); - if (new_mont == NULL) { - goto err; - } - mont = new_mont; - } - - // Use the width in |mont->N|, rather than the copy in |m|. The assembly - // implementation assumes it can use |top| to size R. 
- int top = mont->N.width; - -#if defined(OPENSSL_BN_ASM_MONT5) || defined(RSAZ_ENABLED) - // Share one large stack-allocated buffer between the RSAZ and non-RSAZ code - // paths. If we were to use separate static buffers for each then there is - // some chance that both large buffers would be allocated on the stack, - // causing the stack space requirement to be truly huge (~10KB). - alignas(MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH) BN_ULONG - storage[MOD_EXP_CTIME_STORAGE_LEN]; -#endif -#if defined(RSAZ_ENABLED) - // If the size of the operands allow it, perform the optimized RSAZ - // exponentiation. For further information see crypto/fipsmodule/bn/rsaz_exp.c - // and accompanying assembly modules. - if (a->width == 16 && p->width == 16 && BN_num_bits(m) == 1024 && - rsaz_avx2_preferred()) { - if (!bn_wexpand(rr, 16)) { - goto err; - } - RSAZ_1024_mod_exp_avx2(rr->d, a->d, p->d, m->d, mont->RR.d, mont->n0[0], - storage); - rr->width = 16; - rr->neg = 0; - ret = 1; - goto err; - } -#endif - - // Get the window size to use with size of p. - window = BN_window_bits_for_ctime_exponent_size(bits); -#if defined(OPENSSL_BN_ASM_MONT5) - if (window >= 5) { - window = 5; // ~5% improvement for RSA2048 sign, and even for RSA4096 - // Reserve space for the |mont->N| copy. - powerbufLen += top * sizeof(mont->N.d[0]); - } -#endif - - // Allocate a buffer large enough to hold all of the pre-computed - // powers of |am|, |am| itself, and |tmp|. - numPowers = 1 << window; - powerbufLen += - sizeof(m->d[0]) * - (top * numPowers + ((2 * top) > numPowers ? (2 * top) : numPowers)); - -#if defined(OPENSSL_BN_ASM_MONT5) - if ((size_t)powerbufLen <= sizeof(storage)) { - powerbuf = storage; - } - // |storage| is more than large enough to handle 1024-bit inputs. 
- assert(powerbuf != NULL || top * BN_BITS2 > 1024); -#endif - if (powerbuf == NULL) { - powerbufFree = - OPENSSL_malloc(powerbufLen + MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH); - if (powerbufFree == NULL) { - goto err; - } - powerbuf = (BN_ULONG *)MOD_EXP_CTIME_ALIGN(powerbufFree); - } - OPENSSL_memset(powerbuf, 0, powerbufLen); - - // Place |tmp| and |am| right after powers table. - tmp.d = powerbuf + top * numPowers; - am.d = tmp.d + top; - tmp.width = am.width = 0; - tmp.dmax = am.dmax = top; - tmp.neg = am.neg = 0; - tmp.flags = am.flags = BN_FLG_STATIC_DATA; - - if (!bn_one_to_montgomery(&tmp, mont, ctx) || - !bn_resize_words(&tmp, top)) { - goto err; - } - - // Prepare a^1 in the Montgomery domain. - assert(!a->neg); - assert(BN_ucmp(a, m) < 0); - if (!BN_to_montgomery(&am, a, mont, ctx) || - !bn_resize_words(&am, top)) { - goto err; - } - -#if defined(OPENSSL_BN_ASM_MONT5) - // This optimization uses ideas from https://eprint.iacr.org/2011/239, - // specifically optimization of cache-timing attack countermeasures, - // pre-computation optimization, and Almost Montgomery Multiplication. - // - // The paper discusses a 4-bit window to optimize 512-bit modular - // exponentiation, used in RSA-1024 with CRT, but RSA-1024 is no longer - // important. - // - // |bn_mul_mont_gather5| and |bn_power5| implement the "almost" reduction - // variant, so the values here may not be fully reduced. They are bounded by R - // (i.e. they fit in |top| words), not |m|. Additionally, we pass these - // "almost" reduced inputs into |bn_mul_mont|, which implements the normal - // reduction variant. Given those inputs, |bn_mul_mont| may not give reduced - // output, but it will still produce "almost" reduced output. - // - // TODO(davidben): Using "almost" reduction complicates analysis of this code, - // and its interaction with other parts of the project. Determine whether this - // is actually necessary for performance. 
- if (window == 5 && top > 1) { - // Copy |mont->N| to improve cache locality. - BN_ULONG *np = am.d + top; - for (i = 0; i < top; i++) { - np[i] = mont->N.d[i]; - } - - // Fill |powerbuf| with the first 32 powers of |am|. - const BN_ULONG *n0 = mont->n0; - bn_scatter5(tmp.d, top, powerbuf, 0); - bn_scatter5(am.d, am.width, powerbuf, 1); - bn_mul_mont(tmp.d, am.d, am.d, np, n0, top); - bn_scatter5(tmp.d, top, powerbuf, 2); - - // Square to compute powers of two. - for (i = 4; i < 32; i *= 2) { - bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); - bn_scatter5(tmp.d, top, powerbuf, i); - } - // Compute odd powers |i| based on |i - 1|, then all powers |i * 2^j|. - for (i = 3; i < 32; i += 2) { - bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1); - bn_scatter5(tmp.d, top, powerbuf, i); - for (int j = 2 * i; j < 32; j *= 2) { - bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); - bn_scatter5(tmp.d, top, powerbuf, j); - } - } - - bits--; - for (wvalue = 0, i = bits % 5; i >= 0; i--, bits--) { - wvalue = (wvalue << 1) + BN_is_bit_set(p, bits); - } - bn_gather5(tmp.d, top, powerbuf, wvalue); - - // At this point |bits| is 4 mod 5 and at least -1. (|bits| is the first bit - // that has not been read yet.) - assert(bits >= -1 && (bits == -1 || bits % 5 == 4)); - - // Scan the exponent one window at a time starting from the most - // significant bits. 
- if (top & 7) { - while (bits >= 0) { - for (wvalue = 0, i = 0; i < 5; i++, bits--) { - wvalue = (wvalue << 1) + BN_is_bit_set(p, bits); - } - - bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); - bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); - bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); - bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); - bn_mul_mont(tmp.d, tmp.d, tmp.d, np, n0, top); - bn_mul_mont_gather5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue); - } - } else { - const uint8_t *p_bytes = (const uint8_t *)p->d; - assert(bits < max_bits); - // |p = 0| has been handled as a special case, so |max_bits| is at least - // one word. - assert(max_bits >= 64); - - // If the first bit to be read lands in the last byte, unroll the first - // iteration to avoid reading past the bounds of |p->d|. (After the first - // iteration, we are guaranteed to be past the last byte.) Note |bits| - // here is the top bit, inclusive. - if (bits - 4 >= max_bits - 8) { - // Read five bits from |bits-4| through |bits|, inclusive. - wvalue = p_bytes[p->width * BN_BYTES - 1]; - wvalue >>= (bits - 4) & 7; - wvalue &= 0x1f; - bits -= 5; - bn_power5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue); - } - while (bits >= 0) { - // Read five bits from |bits-4| through |bits|, inclusive. - int first_bit = bits - 4; - uint16_t val; - OPENSSL_memcpy(&val, p_bytes + (first_bit >> 3), sizeof(val)); - val >>= first_bit & 7; - val &= 0x1f; - bits -= 5; - bn_power5(tmp.d, tmp.d, powerbuf, np, n0, top, val); - } - } - // The result is now in |tmp| in Montgomery form, but it may not be fully - // reduced. This is within bounds for |BN_from_montgomery| (tmp < R <= m*R) - // so it will, when converting from Montgomery form, produce a fully reduced - // result. - // - // This differs from Figure 2 of the paper, which uses AMM(h, 1) to convert - // from Montgomery form with unreduced output, followed by an extra - // reduction step. In the paper's terminology, we replace steps 9 and 10 - // with MM(h, 1). 
- } else -#endif - { - copy_to_prebuf(&tmp, top, powerbuf, 0, window); - copy_to_prebuf(&am, top, powerbuf, 1, window); - - // If the window size is greater than 1, then calculate - // val[i=2..2^winsize-1]. Powers are computed as a*a^(i-1) - // (even powers could instead be computed as (a^(i/2))^2 - // to use the slight performance advantage of sqr over mul). - if (window > 1) { - if (!BN_mod_mul_montgomery(&tmp, &am, &am, mont, ctx)) { - goto err; - } - - copy_to_prebuf(&tmp, top, powerbuf, 2, window); - - for (i = 3; i < numPowers; i++) { - // Calculate a^i = a^(i-1) * a - if (!BN_mod_mul_montgomery(&tmp, &am, &tmp, mont, ctx)) { - goto err; - } - - copy_to_prebuf(&tmp, top, powerbuf, i, window); - } - } - - bits--; - for (wvalue = 0, i = bits % window; i >= 0; i--, bits--) { - wvalue = (wvalue << 1) + BN_is_bit_set(p, bits); - } - if (!copy_from_prebuf(&tmp, top, powerbuf, wvalue, window)) { - goto err; - } - - // Scan the exponent one window at a time starting from the most - // significant bits. - while (bits >= 0) { - wvalue = 0; // The 'value' of the window - - // Scan the window, squaring the result as we go - for (i = 0; i < window; i++, bits--) { - if (!BN_mod_mul_montgomery(&tmp, &tmp, &tmp, mont, ctx)) { - goto err; - } - wvalue = (wvalue << 1) + BN_is_bit_set(p, bits); - } - - // Fetch the appropriate pre-computed value from the pre-buf - if (!copy_from_prebuf(&am, top, powerbuf, wvalue, window)) { - goto err; - } - - // Multiply the result into the intermediate result - if (!BN_mod_mul_montgomery(&tmp, &tmp, &am, mont, ctx)) { - goto err; - } - } - } - - // Convert the final result from Montgomery to standard format. If we used the - // |OPENSSL_BN_ASM_MONT5| codepath, |tmp| may not be fully reduced. It is only - // bounded by R rather than |m|. However, that is still within bounds for - // |BN_from_montgomery|, which implements full Montgomery reduction, not - // "almost" Montgomery reduction. 
- if (!BN_from_montgomery(rr, &tmp, mont, ctx)) { - goto err; - } - ret = 1; - -err: - BN_MONT_CTX_free(new_mont); - if (powerbuf != NULL && powerbufFree == NULL) { - OPENSSL_cleanse(powerbuf, powerbufLen); - } - OPENSSL_free(powerbufFree); - return (ret); -} - -int BN_mod_exp_mont_word(BIGNUM *rr, BN_ULONG a, const BIGNUM *p, - const BIGNUM *m, BN_CTX *ctx, - const BN_MONT_CTX *mont) { - BIGNUM a_bignum; - BN_init(&a_bignum); - - int ret = 0; - - // BN_mod_exp_mont requires reduced inputs. - if (bn_minimal_width(m) == 1) { - a %= m->d[0]; - } - - if (!BN_set_word(&a_bignum, a)) { - OPENSSL_PUT_ERROR(BN, ERR_R_INTERNAL_ERROR); - goto err; - } - - ret = BN_mod_exp_mont(rr, &a_bignum, p, m, ctx, mont); - -err: - BN_free(&a_bignum); - - return ret; -} - -#define TABLE_SIZE 32 - -int BN_mod_exp2_mont(BIGNUM *rr, const BIGNUM *a1, const BIGNUM *p1, - const BIGNUM *a2, const BIGNUM *p2, const BIGNUM *m, - BN_CTX *ctx, const BN_MONT_CTX *mont) { - BIGNUM tmp; - BN_init(&tmp); - - int ret = 0; - BN_MONT_CTX *new_mont = NULL; - - // Allocate a montgomery context if it was not supplied by the caller. - if (mont == NULL) { - new_mont = BN_MONT_CTX_new_for_modulus(m, ctx); - if (new_mont == NULL) { - goto err; - } - mont = new_mont; - } - - // BN_mod_mul_montgomery removes one Montgomery factor, so passing one - // Montgomery-encoded and one non-Montgomery-encoded value gives a - // non-Montgomery-encoded result. 
- if (!BN_mod_exp_mont(rr, a1, p1, m, ctx, mont) || - !BN_mod_exp_mont(&tmp, a2, p2, m, ctx, mont) || - !BN_to_montgomery(rr, rr, mont, ctx) || - !BN_mod_mul_montgomery(rr, rr, &tmp, mont, ctx)) { - goto err; - } - - ret = 1; - -err: - BN_MONT_CTX_free(new_mont); - BN_free(&tmp); - - return ret; -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/exponentiation.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/bn/exponentiation.cc.inc new file mode 100644 index 00000000..869c78eb --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/bn/exponentiation.cc.inc @@ -0,0 +1,743 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include +#include + +#include +#include + +#include "internal.h" +#include "rsaz_exp.h" + + +using namespace bssl; + +#if defined(OPENSSL_BN_ASM_MONT5) + +// bn_mul_mont_gather5 multiples loads index |power| of |table|, multiplies it +// by |ap| modulo |np|, and stores the result in |rp|. The values are |num| +// words long and represented in Montgomery form. |n0| is a pointer to the +// corresponding field in |BN_MONT_CTX|. |table| must be aligned to at least +// 16 bytes. |power| must be less than 32 and is treated as secret. +// +// WARNING: This function implements Almost Montgomery Multiplication from +// https://eprint.iacr.org/2011/239. The inputs do not need to be fully reduced. 
+// However, even if they are fully reduced, the output may not be. +static void bn_mul_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap, + const BN_ULONG *table, const BN_ULONG *np, + const BN_ULONG *n0, int num, int power) { + if (bn_mulx4x_mont_gather5_capable(num)) { + bn_mulx4x_mont_gather5(rp, ap, table, np, n0, num, power); + } else if (bn_mul4x_mont_gather5_capable(num)) { + bn_mul4x_mont_gather5(rp, ap, table, np, n0, num, power); + } else { + bn_mul_mont_gather5_nohw(rp, ap, table, np, n0, num, power); + } +} + +// bn_power5 squares |ap| five times and multiplies it by the value stored at +// index |power| of |table|, modulo |np|. It stores the result in |rp|. The +// values are |num| words long and represented in Montgomery form. |n0| is a +// pointer to the corresponding field in |BN_MONT_CTX|. |num| must be divisible +// by 8. |power| must be less than 32 and is treated as secret. +// +// WARNING: This function implements Almost Montgomery Multiplication from +// https://eprint.iacr.org/2011/239. The inputs do not need to be fully reduced. +// However, even if they are fully reduced, the output may not be. +static void bn_power5(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *table, + const BN_ULONG *np, const BN_ULONG *n0, int num, + int power) { + assert(bn_power5_capable(num)); + if (bn_powerx5_capable(num)) { + bn_powerx5(rp, ap, table, np, n0, num, power); + } else { + bn_power5_nohw(rp, ap, table, np, n0, num, power); + } +} + +#endif // defined(OPENSSL_BN_ASM_MONT5) + +// BN_window_bits_for_exponent_size returns sliding window size for mod_exp with +// a |b| bit exponent. 
+// +// For window size 'w' (w >= 2) and a random 'b' bits exponent, the number of +// multiplications is a constant plus on average +// +// 2^(w-1) + (b-w)/(w+1); +// +// here 2^(w-1) is for precomputing the table (we actually need entries only +// for windows that have the lowest bit set), and (b-w)/(w+1) is an +// approximation for the expected number of w-bit windows, not counting the +// first one. +// +// Thus we should use +// +// w >= 6 if b > 671 +// w = 5 if 671 > b > 239 +// w = 4 if 239 > b > 79 +// w = 3 if 79 > b > 23 +// w <= 2 if 23 > b +// +// (with draws in between). Very small exponents are often selected +// with low Hamming weight, so we use w = 1 for b <= 23. +static int BN_window_bits_for_exponent_size(size_t b) { + if (b > 671) { + return 6; + } + if (b > 239) { + return 5; + } + if (b > 79) { + return 4; + } + if (b > 23) { + return 3; + } + return 1; +} + +// TABLE_SIZE is the maximum precomputation table size for *variable* sliding +// windows. This must be 2^(max_window - 1), where max_window is the largest +// value returned from |BN_window_bits_for_exponent_size|. +#define TABLE_SIZE 32 + +// TABLE_BITS_SMALL is the smallest value returned from +// |BN_window_bits_for_exponent_size| when |b| is at most |BN_BITS2| * +// |BN_SMALL_MAX_WORDS| words. +#define TABLE_BITS_SMALL 5 + +// TABLE_SIZE_SMALL is the same as |TABLE_SIZE|, but when |b| is at most +// |BN_BITS2| * |BN_SMALL_MAX_WORDS|. +#define TABLE_SIZE_SMALL (1 << (TABLE_BITS_SMALL - 1)) + +int BN_mod_exp_mont(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, + const BIGNUM *m, BN_CTX *ctx, const BN_MONT_CTX *mont) { + if (!BN_is_odd(m)) { + OPENSSL_PUT_ERROR(BN, BN_R_CALLED_WITH_EVEN_MODULUS); + return 0; + } + if (m->neg) { + OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER); + return 0; + } + // |a| is secret, but |a < m| is not. 
+ if (a->neg || constant_time_declassify_int(BN_ucmp(a, m)) >= 0) { + OPENSSL_PUT_ERROR(BN, BN_R_INPUT_NOT_REDUCED); + return 0; + } + + int bits = BN_num_bits(p); + if (bits == 0) { + // x**0 mod 1 is still zero. + if (BN_abs_is_word(m, 1)) { + BN_zero(rr); + return 1; + } + return BN_one(rr); + } + + BIGNUM *val[TABLE_SIZE]; + + BN_CTXScope scope(ctx); + BIGNUM *r = BN_CTX_get(ctx); + val[0] = BN_CTX_get(ctx); + if (r == nullptr || val[0] == nullptr) { + return 0; + } + + // Allocate a montgomery context if it was not supplied by the caller. + UniquePtr new_mont; + if (mont == nullptr) { + new_mont.reset(BN_MONT_CTX_new_consttime(m, ctx)); + if (new_mont == nullptr) { + return 0; + } + mont = new_mont.get(); + } + + // We exponentiate by looking at sliding windows of the exponent and + // precomputing powers of |a|. Windows may be shifted so they always end on a + // set bit, so only precompute odd powers. We compute val[i] = a^(2*i + 1) + // for i = 0 to 2^(window-1), all in Montgomery form. + int window = BN_window_bits_for_exponent_size(bits); + if (!BN_to_montgomery(val[0], a, mont, ctx)) { + return 0; + } + if (window > 1) { + BIGNUM *d = BN_CTX_get(ctx); + if (d == nullptr || !BN_mod_mul_montgomery(d, val[0], val[0], mont, ctx)) { + return 0; + } + for (int i = 1; i < 1 << (window - 1); i++) { + val[i] = BN_CTX_get(ctx); + if (val[i] == nullptr || + !BN_mod_mul_montgomery(val[i], val[i - 1], d, mont, ctx)) { + return 0; + } + } + } + + // |p| is non-zero, so at least one window is non-zero. To save some + // multiplications, defer initializing |r| until then. + int r_is_one = 1; + int wstart = bits - 1; // The top bit of the window. + for (;;) { + if (!BN_is_bit_set(p, wstart)) { + if (!r_is_one && !BN_mod_mul_montgomery(r, r, r, mont, ctx)) { + return 0; + } + if (wstart == 0) { + break; + } + wstart--; + continue; + } + + // We now have wstart on a set bit. Find the largest window we can use. 
+ int wvalue = 1; + int wsize = 0; + for (int i = 1; i < window && i <= wstart; i++) { + if (BN_is_bit_set(p, wstart - i)) { + wvalue <<= (i - wsize); + wvalue |= 1; + wsize = i; + } + } + + // Shift |r| to the end of the window. + if (!r_is_one) { + for (int i = 0; i < wsize + 1; i++) { + if (!BN_mod_mul_montgomery(r, r, r, mont, ctx)) { + return 0; + } + } + } + + assert(wvalue & 1); + assert(wvalue < (1 << window)); + if (r_is_one) { + if (!BN_copy(r, val[wvalue >> 1])) { + return 0; + } + } else if (!BN_mod_mul_montgomery(r, r, val[wvalue >> 1], mont, ctx)) { + return 0; + } + + r_is_one = 0; + if (wstart == wsize) { + break; + } + wstart -= wsize + 1; + } + + // |p| is non-zero, so |r_is_one| must be cleared at some point. + assert(!r_is_one); + + return BN_from_montgomery(rr, r, mont, ctx); +} + +void bssl::bn_mod_exp_mont_small(BN_ULONG *r, const BN_ULONG *a, size_t num, + const BN_ULONG *p, size_t num_p, + const BN_MONT_CTX *mont) { + if (num != (size_t)mont->N.width || num > BN_SMALL_MAX_WORDS || + num_p > SIZE_MAX / BN_BITS2) { + abort(); + } + assert(BN_is_odd(&mont->N)); + + // Count the number of bits in |p|, skipping leading zeros. Note this function + // treats |p| as public. + while (num_p != 0 && p[num_p - 1] == 0) { + num_p--; + } + if (num_p == 0) { + bn_from_montgomery_small(r, num, mont->RR.d, num, mont); + return; + } + size_t bits = BN_num_bits_word(p[num_p - 1]) + (num_p - 1) * BN_BITS2; + assert(bits != 0); + + // We exponentiate by looking at sliding windows of the exponent and + // precomputing powers of |a|. Windows may be shifted so they always end on a + // set bit, so only precompute odd powers. We compute val[i] = a^(2*i + 1) for + // i = 0 to 2^(window-1), all in Montgomery form. + unsigned window = BN_window_bits_for_exponent_size(bits); + if (window > TABLE_BITS_SMALL) { + window = TABLE_BITS_SMALL; // Tolerate excessively large |p|. 
+ } + BN_ULONG val[TABLE_SIZE_SMALL][BN_SMALL_MAX_WORDS]; + OPENSSL_memcpy(val[0], a, num * sizeof(BN_ULONG)); + if (window > 1) { + BN_ULONG d[BN_SMALL_MAX_WORDS]; + bn_mod_mul_montgomery_small(d, val[0], val[0], num, mont); + for (unsigned i = 1; i < 1u << (window - 1); i++) { + bn_mod_mul_montgomery_small(val[i], val[i - 1], d, num, mont); + } + } + + // |p| is non-zero, so at least one window is non-zero. To save some + // multiplications, defer initializing |r| until then. + int r_is_one = 1; + size_t wstart = bits - 1; // The top bit of the window. + for (;;) { + if (!bn_is_bit_set_words(p, num_p, wstart)) { + if (!r_is_one) { + bn_mod_mul_montgomery_small(r, r, r, num, mont); + } + if (wstart == 0) { + break; + } + wstart--; + continue; + } + + // We now have wstart on a set bit. Find the largest window we can use. + unsigned wvalue = 1; + unsigned wsize = 0; + for (unsigned i = 1; i < window && i <= wstart; i++) { + if (bn_is_bit_set_words(p, num_p, wstart - i)) { + wvalue <<= (i - wsize); + wvalue |= 1; + wsize = i; + } + } + + // Shift |r| to the end of the window. + if (!r_is_one) { + for (unsigned i = 0; i < wsize + 1; i++) { + bn_mod_mul_montgomery_small(r, r, r, num, mont); + } + } + + assert(wvalue & 1); + assert(wvalue < (1u << window)); + if (r_is_one) { + OPENSSL_memcpy(r, val[wvalue >> 1], num * sizeof(BN_ULONG)); + } else { + bn_mod_mul_montgomery_small(r, r, val[wvalue >> 1], num, mont); + } + r_is_one = 0; + if (wstart == wsize) { + break; + } + wstart -= wsize + 1; + } + + // |p| is non-zero, so |r_is_one| must be cleared at some point. + assert(!r_is_one); + OPENSSL_cleanse(val, sizeof(val)); +} + +void bssl::bn_mod_inverse0_prime_mont_small(BN_ULONG *r, const BN_ULONG *a, + size_t num, + const BN_MONT_CTX *mont) { + if (num != (size_t)mont->N.width || num > BN_SMALL_MAX_WORDS || num == 0) { + abort(); + } + + // Per Fermat's Little Theorem, a^-1 = a^(p-2) (mod p) for p prime. 
+ BN_ULONG p_minus_two[BN_SMALL_MAX_WORDS]; + const BN_ULONG *p = mont->N.d; + OPENSSL_memcpy(p_minus_two, p, num * sizeof(BN_ULONG)); + if (p_minus_two[0] >= 2) { + p_minus_two[0] -= 2; + } else { + p_minus_two[0] -= 2; + for (size_t i = 1; i < num; i++) { + if (p_minus_two[i]-- != 0) { + break; + } + } + } + + bn_mod_exp_mont_small(r, a, num, p_minus_two, num, mont); +} + +static void copy_to_prebuf(const BIGNUM *b, int top, BN_ULONG *table, int idx, + int window) { + int ret = bn_copy_words(table + idx * top, top, b); + assert(ret); // |b| is guaranteed to fit. + (void)ret; +} + +static int copy_from_prebuf(BIGNUM *b, int top, const BN_ULONG *table, int idx, + int window) { + if (!bn_wexpand(b, top)) { + return 0; + } + + OPENSSL_memset(b->d, 0, sizeof(BN_ULONG) * top); + const int width = 1 << window; + for (int i = 0; i < width; i++, table += top) { + // Use a value barrier to prevent Clang from adding a branch when |i != idx| + // and making this copy not constant time. Clang is still allowed to learn + // that |mask| is constant across the inner loop, so this won't inhibit any + // vectorization it might do. + BN_ULONG mask = value_barrier_w(constant_time_eq_int(i, idx)); + for (int j = 0; j < top; j++) { + b->d[j] |= table[j] & mask; + } + } + + b->width = top; + return 1; +} + +// Window sizes optimized for fixed window size modular exponentiation +// algorithm (BN_mod_exp_mont_consttime). +// +// TODO(davidben): These window sizes were originally set for 64-byte cache +// lines with a cache-line-dependent constant-time mitigation. They can probably +// be revised now that our implementation is no longer cache-time-dependent. +#define BN_window_bits_for_ctime_exponent_size(b) \ + ((b) > 937 ? 6 : (b) > 306 ? 5 : (b) > 89 ? 4 : (b) > 22 ? 3 : 1) +#define BN_MAX_MOD_EXP_CTIME_WINDOW (6) + +// This variant of |BN_mod_exp_mont| uses fixed windows and fixed memory access +// patterns to protect secret exponents (cf. 
the hyper-threading timing attacks +// pointed out by Colin Percival, +// http://www.daemonology.net/hyperthreading-considered-harmful/) +int BN_mod_exp_mont_consttime(BIGNUM *rr, const BIGNUM *a, const BIGNUM *p, + const BIGNUM *m, BN_CTX *ctx, + const BN_MONT_CTX *mont) { + int i, ret = 0, wvalue; + + void *powerbuf_free = nullptr; + size_t powerbuf_len = 0; + BN_ULONG *powerbuf = nullptr; + + if (!BN_is_odd(m)) { + OPENSSL_PUT_ERROR(BN, BN_R_CALLED_WITH_EVEN_MODULUS); + return 0; + } + if (m->neg) { + OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER); + return 0; + } + // |a| is secret, but it is required to be in range, so these comparisons may + // be leaked. + if (a->neg || constant_time_declassify_int(BN_ucmp(a, m) >= 0)) { + OPENSSL_PUT_ERROR(BN, BN_R_INPUT_NOT_REDUCED); + return 0; + } + + // Use all bits stored in |p|, rather than |BN_num_bits|, so we do not leak + // whether the top bits are zero. + int max_bits = p->width * BN_BITS2; + int bits = max_bits; + if (bits == 0) { + // x**0 mod 1 is still zero. + if (BN_abs_is_word(m, 1)) { + BN_zero(rr); + return 1; + } + return BN_one(rr); + } + + // Allocate a montgomery context if it was not supplied by the caller. + int top, num_powers, window; + UniquePtr new_mont; + if (mont == nullptr) { + new_mont.reset(BN_MONT_CTX_new_consttime(m, ctx)); + if (new_mont == nullptr) { + goto err; + } + mont = new_mont.get(); + } + + // Use the width in |mont->N|, rather than the copy in |m|. The assembly + // implementation assumes it can use |top| to size R. + top = mont->N.width; + +#if defined(OPENSSL_BN_ASM_MONT5) || defined(RSAZ_ENABLED) + // Share one large stack-allocated buffer between the RSAZ and non-RSAZ code + // paths. If we were to use separate static buffers for each then there is + // some chance that both large buffers would be allocated on the stack, + // causing the stack space requirement to be truly huge (~10KB). 
+ alignas(MOD_EXP_CTIME_ALIGN) BN_ULONG storage[MOD_EXP_CTIME_STORAGE_LEN]; +#endif +#if defined(RSAZ_ENABLED) + // If the size of the operands allow it, perform the optimized RSAZ + // exponentiation. For further information see crypto/fipsmodule/bn/rsaz_exp.c + // and accompanying assembly modules. + if (a->width == 16 && p->width == 16 && BN_num_bits(m) == 1024 && + rsaz_avx2_preferred()) { + if (!bn_wexpand(rr, 16)) { + goto err; + } + RSAZ_1024_mod_exp_avx2(rr->d, a->d, p->d, m->d, mont->RR.d, mont->n0[0], + storage); + rr->width = 16; + rr->neg = 0; + ret = 1; + goto err; + } +#endif + + // Get the window size to use with size of p. + window = BN_window_bits_for_ctime_exponent_size(bits); + assert(window <= BN_MAX_MOD_EXP_CTIME_WINDOW); + + // Calculating |powerbuf_len| below cannot overflow because of the bound on + // Montgomery reduction. + assert((size_t)top <= BN_MONTGOMERY_MAX_WORDS); + static_assert( + BN_MONTGOMERY_MAX_WORDS <= + INT_MAX / sizeof(BN_ULONG) / ((1 << BN_MAX_MOD_EXP_CTIME_WINDOW) + 3), + "powerbuf_len may overflow"); + +#if defined(OPENSSL_BN_ASM_MONT5) + if (window >= 5) { + window = 5; // ~5% improvement for RSA2048 sign, and even for RSA4096 + // Reserve space for the |mont->N| copy. + powerbuf_len += top * sizeof(mont->N.d[0]); + } +#endif + + // Allocate a buffer large enough to hold all of the pre-computed + // powers of |am|, |am| itself, and |tmp|. + num_powers = 1 << window; + powerbuf_len += sizeof(m->d[0]) * top * (num_powers + 2); + +#if defined(OPENSSL_BN_ASM_MONT5) + if (powerbuf_len <= sizeof(storage)) { + powerbuf = storage; + } + // |storage| is more than large enough to handle 1024-bit inputs. 
+ assert(powerbuf != nullptr || top * BN_BITS2 > 1024); +#endif + if (powerbuf == nullptr) { + powerbuf_free = OPENSSL_malloc(powerbuf_len + MOD_EXP_CTIME_ALIGN); + if (powerbuf_free == nullptr) { + goto err; + } + powerbuf = reinterpret_cast( + align_pointer(powerbuf_free, MOD_EXP_CTIME_ALIGN)); + } + OPENSSL_memset(powerbuf, 0, powerbuf_len); + + // Place |tmp| and |am| right after powers table. + BIGNUM tmp, am; + tmp.d = powerbuf + top * num_powers; + am.d = tmp.d + top; + tmp.width = am.width = 0; + tmp.dmax = am.dmax = top; + tmp.neg = am.neg = 0; + tmp.flags = am.flags = BN_FLG_STATIC_DATA; + + if (!bn_one_to_montgomery(&tmp, mont, ctx) || !bn_resize_words(&tmp, top)) { + goto err; + } + + // Prepare a^1 in the Montgomery domain. + assert(!a->neg); + declassify_assert(BN_ucmp(a, m) < 0); + if (!BN_to_montgomery(&am, a, mont, ctx) || !bn_resize_words(&am, top)) { + goto err; + } + +#if defined(OPENSSL_BN_ASM_MONT5) + // This optimization uses ideas from https://eprint.iacr.org/2011/239, + // specifically optimization of cache-timing attack countermeasures, + // pre-computation optimization, and Almost Montgomery Multiplication. + // + // The paper discusses a 4-bit window to optimize 512-bit modular + // exponentiation, used in RSA-1024 with CRT, but RSA-1024 is no longer + // important. + // + // |bn_mul_mont_gather5| and |bn_power5| implement the "almost" reduction + // variant, so the values here may not be fully reduced. They are bounded by R + // (i.e. they fit in |top| words), not |m|. Additionally, we pass these + // "almost" reduced inputs into |bn_mul_mont_words|, which implements the + // normal reduction variant. Given those inputs, |bn_mul_mont_words| may not + // give reduced output, but it will still produce "almost" reduced output. + // + // TODO(davidben): Using "almost" reduction complicates analysis of this code, + // and its interaction with other parts of the project. Determine whether this + // is actually necessary for performance. 
+ if (window == 5 && top > 1) { + // Copy |mont->N| to improve cache locality. + BN_ULONG *np = am.d + top; + for (i = 0; i < top; i++) { + np[i] = mont->N.d[i]; + } + + // Fill |powerbuf| with the first 32 powers of |am|. + const BN_ULONG *n0 = mont->n0; + bn_scatter5(tmp.d, top, powerbuf, 0); + bn_scatter5(am.d, am.width, powerbuf, 1); + bn_mul_mont_words(tmp.d, am.d, am.d, np, n0, top); + bn_scatter5(tmp.d, top, powerbuf, 2); + + // Square to compute powers of two. + for (i = 4; i < 32; i *= 2) { + bn_mul_mont_words(tmp.d, tmp.d, tmp.d, np, n0, top); + bn_scatter5(tmp.d, top, powerbuf, i); + } + // Compute odd powers |i| based on |i - 1|, then all powers |i * 2^j|. + for (i = 3; i < 32; i += 2) { + bn_mul_mont_gather5(tmp.d, am.d, powerbuf, np, n0, top, i - 1); + bn_scatter5(tmp.d, top, powerbuf, i); + for (int j = 2 * i; j < 32; j *= 2) { + bn_mul_mont_words(tmp.d, tmp.d, tmp.d, np, n0, top); + bn_scatter5(tmp.d, top, powerbuf, j); + } + } + + bits--; + for (wvalue = 0, i = bits % 5; i >= 0; i--, bits--) { + wvalue = (wvalue << 1) + BN_is_bit_set(p, bits); + } + bn_gather5(tmp.d, top, powerbuf, wvalue); + + // At this point |bits| is 4 mod 5 and at least -1. (|bits| is the first bit + // that has not been read yet.) + assert(bits >= -1 && (bits == -1 || bits % 5 == 4)); + + // Scan the exponent one window at a time starting from the most + // significant bits. 
+ if (!bn_power5_capable(top)) { + while (bits >= 0) { + for (wvalue = 0, i = 0; i < 5; i++, bits--) { + wvalue = (wvalue << 1) + BN_is_bit_set(p, bits); + } + + bn_mul_mont_words(tmp.d, tmp.d, tmp.d, np, n0, top); + bn_mul_mont_words(tmp.d, tmp.d, tmp.d, np, n0, top); + bn_mul_mont_words(tmp.d, tmp.d, tmp.d, np, n0, top); + bn_mul_mont_words(tmp.d, tmp.d, tmp.d, np, n0, top); + bn_mul_mont_words(tmp.d, tmp.d, tmp.d, np, n0, top); + bn_mul_mont_gather5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue); + } + } else { + const uint8_t *p_bytes = (const uint8_t *)p->d; + assert(bits < max_bits); + // |p = 0| has been handled as a special case, so |max_bits| is at least + // one word. + assert(max_bits >= 64); + + // If the first bit to be read lands in the last byte, unroll the first + // iteration to avoid reading past the bounds of |p->d|. (After the first + // iteration, we are guaranteed to be past the last byte.) Note |bits| + // here is the top bit, inclusive. + if (bits - 4 >= max_bits - 8) { + // Read five bits from |bits-4| through |bits|, inclusive. + wvalue = p_bytes[p->width * BN_BYTES - 1]; + wvalue >>= (bits - 4) & 7; + wvalue &= 0x1f; + bits -= 5; + bn_power5(tmp.d, tmp.d, powerbuf, np, n0, top, wvalue); + } + while (bits >= 0) { + // Read five bits from |bits-4| through |bits|, inclusive. + int first_bit = bits - 4; + uint16_t val; + OPENSSL_memcpy(&val, p_bytes + (first_bit >> 3), sizeof(val)); + val >>= first_bit & 7; + val &= 0x1f; + bits -= 5; + bn_power5(tmp.d, tmp.d, powerbuf, np, n0, top, val); + } + } + // The result is now in |tmp| in Montgomery form, but it may not be fully + // reduced. This is within bounds for |BN_from_montgomery| (tmp < R <= m*R) + // so it will, when converting from Montgomery form, produce a fully reduced + // result. + // + // This differs from Figure 2 of the paper, which uses AMM(h, 1) to convert + // from Montgomery form with unreduced output, followed by an extra + // reduction step. 
In the paper's terminology, we replace steps 9 and 10 + // with MM(h, 1). + } else +#endif + { + copy_to_prebuf(&tmp, top, powerbuf, 0, window); + copy_to_prebuf(&am, top, powerbuf, 1, window); + + // If the window size is greater than 1, then calculate + // val[i=2..2^winsize-1]. Powers are computed as a*a^(i-1) + // (even powers could instead be computed as (a^(i/2))^2 + // to use the slight performance advantage of sqr over mul). + if (window > 1) { + if (!BN_mod_mul_montgomery(&tmp, &am, &am, mont, ctx)) { + goto err; + } + + copy_to_prebuf(&tmp, top, powerbuf, 2, window); + + for (i = 3; i < num_powers; i++) { + // Calculate a^i = a^(i-1) * a + if (!BN_mod_mul_montgomery(&tmp, &am, &tmp, mont, ctx)) { + goto err; + } + + copy_to_prebuf(&tmp, top, powerbuf, i, window); + } + } + + bits--; + for (wvalue = 0, i = bits % window; i >= 0; i--, bits--) { + wvalue = (wvalue << 1) + BN_is_bit_set(p, bits); + } + if (!copy_from_prebuf(&tmp, top, powerbuf, wvalue, window)) { + goto err; + } + + // Scan the exponent one window at a time starting from the most + // significant bits. + while (bits >= 0) { + wvalue = 0; // The 'value' of the window + + // Scan the window, squaring the result as we go + for (i = 0; i < window; i++, bits--) { + if (!BN_mod_mul_montgomery(&tmp, &tmp, &tmp, mont, ctx)) { + goto err; + } + wvalue = (wvalue << 1) + BN_is_bit_set(p, bits); + } + + // Fetch the appropriate pre-computed value from the pre-buf + if (!copy_from_prebuf(&am, top, powerbuf, wvalue, window)) { + goto err; + } + + // Multiply the result into the intermediate result + if (!BN_mod_mul_montgomery(&tmp, &tmp, &am, mont, ctx)) { + goto err; + } + } + } + + // Convert the final result from Montgomery to standard format. If we used the + // |OPENSSL_BN_ASM_MONT5| codepath, |tmp| may not be fully reduced. It is only + // bounded by R rather than |m|. 
However, that is still within bounds for + // |BN_from_montgomery|, which implements full Montgomery reduction, not + // "almost" Montgomery reduction. + if (!BN_from_montgomery(rr, &tmp, mont, ctx)) { + goto err; + } + ret = 1; + +err: + if (powerbuf != nullptr && powerbuf_free == nullptr) { + OPENSSL_cleanse(powerbuf, powerbuf_len); + } + OPENSSL_free(powerbuf_free); + return ret; +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/gcd.c b/third_party/boringssl/src/crypto/fipsmodule/bn/gcd.c deleted file mode 100644 index bd0fa6f5..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/bn/gcd.c +++ /dev/null @@ -1,378 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. 
Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
- */ -/* ==================================================================== - * Copyright (c) 1998-2001 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). */ - -#include - -#include - -#include "internal.h" - - -int BN_mod_inverse_odd(BIGNUM *out, int *out_no_inverse, const BIGNUM *a, - const BIGNUM *n, BN_CTX *ctx) { - *out_no_inverse = 0; - - if (!BN_is_odd(n)) { - OPENSSL_PUT_ERROR(BN, BN_R_CALLED_WITH_EVEN_MODULUS); - return 0; - } - - if (BN_is_negative(a) || BN_cmp(a, n) >= 0) { - OPENSSL_PUT_ERROR(BN, BN_R_INPUT_NOT_REDUCED); - return 0; - } - - BIGNUM *A, *B, *X, *Y; - int ret = 0; - int sign; - - BN_CTX_start(ctx); - A = BN_CTX_get(ctx); - B = BN_CTX_get(ctx); - X = BN_CTX_get(ctx); - Y = BN_CTX_get(ctx); - if (Y == NULL) { - goto err; - } - - BIGNUM *R = out; - - BN_zero(Y); - if (!BN_one(X) || BN_copy(B, a) == NULL || BN_copy(A, n) == NULL) { - goto err; - } - A->neg = 0; - sign = -1; - // From B = a mod |n|, A = |n| it follows that - // - // 0 <= B < A, - // -sign*X*a == B (mod |n|), - // sign*Y*a == A (mod |n|). - - // Binary inversion algorithm; requires odd modulus. This is faster than the - // general algorithm if the modulus is sufficiently small (about 400 .. 
500 - // bits on 32-bit systems, but much more on 64-bit systems) - int shift; - - while (!BN_is_zero(B)) { - // 0 < B < |n|, - // 0 < A <= |n|, - // (1) -sign*X*a == B (mod |n|), - // (2) sign*Y*a == A (mod |n|) - - // Now divide B by the maximum possible power of two in the integers, - // and divide X by the same value mod |n|. - // When we're done, (1) still holds. - shift = 0; - while (!BN_is_bit_set(B, shift)) { - // note that 0 < B - shift++; - - if (BN_is_odd(X)) { - if (!BN_uadd(X, X, n)) { - goto err; - } - } - // now X is even, so we can easily divide it by two - if (!BN_rshift1(X, X)) { - goto err; - } - } - if (shift > 0) { - if (!BN_rshift(B, B, shift)) { - goto err; - } - } - - // Same for A and Y. Afterwards, (2) still holds. - shift = 0; - while (!BN_is_bit_set(A, shift)) { - // note that 0 < A - shift++; - - if (BN_is_odd(Y)) { - if (!BN_uadd(Y, Y, n)) { - goto err; - } - } - // now Y is even - if (!BN_rshift1(Y, Y)) { - goto err; - } - } - if (shift > 0) { - if (!BN_rshift(A, A, shift)) { - goto err; - } - } - - // We still have (1) and (2). - // Both A and B are odd. - // The following computations ensure that - // - // 0 <= B < |n|, - // 0 < A < |n|, - // (1) -sign*X*a == B (mod |n|), - // (2) sign*Y*a == A (mod |n|), - // - // and that either A or B is even in the next iteration. 
- if (BN_ucmp(B, A) >= 0) { - // -sign*(X + Y)*a == B - A (mod |n|) - if (!BN_uadd(X, X, Y)) { - goto err; - } - // NB: we could use BN_mod_add_quick(X, X, Y, n), but that - // actually makes the algorithm slower - if (!BN_usub(B, B, A)) { - goto err; - } - } else { - // sign*(X + Y)*a == A - B (mod |n|) - if (!BN_uadd(Y, Y, X)) { - goto err; - } - // as above, BN_mod_add_quick(Y, Y, X, n) would slow things down - if (!BN_usub(A, A, B)) { - goto err; - } - } - } - - if (!BN_is_one(A)) { - *out_no_inverse = 1; - OPENSSL_PUT_ERROR(BN, BN_R_NO_INVERSE); - goto err; - } - - // The while loop (Euclid's algorithm) ends when - // A == gcd(a,n); - // we have - // sign*Y*a == A (mod |n|), - // where Y is non-negative. - - if (sign < 0) { - if (!BN_sub(Y, n, Y)) { - goto err; - } - } - // Now Y*a == A (mod |n|). - - // Y*a == 1 (mod |n|) - if (!Y->neg && BN_ucmp(Y, n) < 0) { - if (!BN_copy(R, Y)) { - goto err; - } - } else { - if (!BN_nnmod(R, Y, n, ctx)) { - goto err; - } - } - - ret = 1; - -err: - BN_CTX_end(ctx); - return ret; -} - -BIGNUM *BN_mod_inverse(BIGNUM *out, const BIGNUM *a, const BIGNUM *n, - BN_CTX *ctx) { - BIGNUM *new_out = NULL; - if (out == NULL) { - new_out = BN_new(); - if (new_out == NULL) { - OPENSSL_PUT_ERROR(BN, ERR_R_MALLOC_FAILURE); - return NULL; - } - out = new_out; - } - - int ok = 0; - BIGNUM *a_reduced = NULL; - if (a->neg || BN_ucmp(a, n) >= 0) { - a_reduced = BN_dup(a); - if (a_reduced == NULL) { - goto err; - } - if (!BN_nnmod(a_reduced, a_reduced, n, ctx)) { - goto err; - } - a = a_reduced; - } - - int no_inverse; - if (!BN_is_odd(n)) { - if (!bn_mod_inverse_consttime(out, &no_inverse, a, n, ctx)) { - goto err; - } - } else if (!BN_mod_inverse_odd(out, &no_inverse, a, n, ctx)) { - goto err; - } - - ok = 1; - -err: - if (!ok) { - BN_free(new_out); - out = NULL; - } - BN_free(a_reduced); - return out; -} - -int BN_mod_inverse_blinded(BIGNUM *out, int *out_no_inverse, const BIGNUM *a, - const BN_MONT_CTX *mont, BN_CTX *ctx) { - 
*out_no_inverse = 0; - - if (BN_is_negative(a) || BN_cmp(a, &mont->N) >= 0) { - OPENSSL_PUT_ERROR(BN, BN_R_INPUT_NOT_REDUCED); - return 0; - } - - int ret = 0; - BIGNUM blinding_factor; - BN_init(&blinding_factor); - - if (!BN_rand_range_ex(&blinding_factor, 1, &mont->N) || - !BN_mod_mul_montgomery(out, &blinding_factor, a, mont, ctx) || - !BN_mod_inverse_odd(out, out_no_inverse, out, &mont->N, ctx) || - !BN_mod_mul_montgomery(out, &blinding_factor, out, mont, ctx)) { - OPENSSL_PUT_ERROR(BN, ERR_R_BN_LIB); - goto err; - } - - ret = 1; - -err: - BN_free(&blinding_factor); - return ret; -} - -int bn_mod_inverse_prime(BIGNUM *out, const BIGNUM *a, const BIGNUM *p, - BN_CTX *ctx, const BN_MONT_CTX *mont_p) { - BN_CTX_start(ctx); - BIGNUM *p_minus_2 = BN_CTX_get(ctx); - int ok = p_minus_2 != NULL && - BN_copy(p_minus_2, p) && - BN_sub_word(p_minus_2, 2) && - BN_mod_exp_mont(out, a, p_minus_2, p, ctx, mont_p); - BN_CTX_end(ctx); - return ok; -} - -int bn_mod_inverse_secret_prime(BIGNUM *out, const BIGNUM *a, const BIGNUM *p, - BN_CTX *ctx, const BN_MONT_CTX *mont_p) { - BN_CTX_start(ctx); - BIGNUM *p_minus_2 = BN_CTX_get(ctx); - int ok = p_minus_2 != NULL && - BN_copy(p_minus_2, p) && - BN_sub_word(p_minus_2, 2) && - BN_mod_exp_mont_consttime(out, a, p_minus_2, p, ctx, mont_p); - BN_CTX_end(ctx); - return ok; -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/gcd.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/bn/gcd.cc.inc new file mode 100644 index 00000000..4fdb2e96 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/bn/gcd.cc.inc @@ -0,0 +1,281 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include "internal.h" + + +using namespace bssl; + +int BN_mod_inverse_odd(BIGNUM *out, int *out_no_inverse, const BIGNUM *a, + const BIGNUM *n, BN_CTX *ctx) { + *out_no_inverse = 0; + + if (!BN_is_odd(n)) { + OPENSSL_PUT_ERROR(BN, BN_R_CALLED_WITH_EVEN_MODULUS); + return 0; + } + + if (BN_is_negative(a) || BN_cmp(a, n) >= 0) { + OPENSSL_PUT_ERROR(BN, BN_R_INPUT_NOT_REDUCED); + return 0; + } + + int sign; + BN_CTXScope scope(ctx); + BIGNUM *A = BN_CTX_get(ctx); + BIGNUM *B = BN_CTX_get(ctx); + BIGNUM *X = BN_CTX_get(ctx); + BIGNUM *Y = BN_CTX_get(ctx); + BIGNUM *R = out; + if (Y == nullptr) { + return 0; + } + + BN_zero(Y); + if (!BN_one(X) || BN_copy(B, a) == nullptr || BN_copy(A, n) == nullptr) { + return 0; + } + A->neg = 0; + sign = -1; + // From B = a mod |n|, A = |n| it follows that + // + // 0 <= B < A, + // -sign*X*a == B (mod |n|), + // sign*Y*a == A (mod |n|). + + // Binary inversion algorithm; requires odd modulus. This is faster than the + // general algorithm if the modulus is sufficiently small (about 400 .. 500 + // bits on 32-bit systems, but much more on 64-bit systems) + int shift; + + while (!BN_is_zero(B)) { + // 0 < B < |n|, + // 0 < A <= |n|, + // (1) -sign*X*a == B (mod |n|), + // (2) sign*Y*a == A (mod |n|) + + // Now divide B by the maximum possible power of two in the integers, + // and divide X by the same value mod |n|. + // When we're done, (1) still holds. 
+ shift = 0; + while (!BN_is_bit_set(B, shift)) { + // note that 0 < B + shift++; + + if (BN_is_odd(X)) { + if (!BN_uadd(X, X, n)) { + return 0; + } + } + // now X is even, so we can easily divide it by two + if (!BN_rshift1(X, X)) { + return 0; + } + } + if (shift > 0) { + if (!BN_rshift(B, B, shift)) { + return 0; + } + } + + // Same for A and Y. Afterwards, (2) still holds. + shift = 0; + while (!BN_is_bit_set(A, shift)) { + // note that 0 < A + shift++; + + if (BN_is_odd(Y)) { + if (!BN_uadd(Y, Y, n)) { + return 0; + } + } + // now Y is even + if (!BN_rshift1(Y, Y)) { + return 0; + } + } + if (shift > 0) { + if (!BN_rshift(A, A, shift)) { + return 0; + } + } + + // We still have (1) and (2). + // Both A and B are odd. + // The following computations ensure that + // + // 0 <= B < |n|, + // 0 < A < |n|, + // (1) -sign*X*a == B (mod |n|), + // (2) sign*Y*a == A (mod |n|), + // + // and that either A or B is even in the next iteration. + if (BN_ucmp(B, A) >= 0) { + // -sign*(X + Y)*a == B - A (mod |n|) + if (!BN_uadd(X, X, Y)) { + return 0; + } + // NB: we could use BN_mod_add_quick(X, X, Y, n), but that + // actually makes the algorithm slower + if (!BN_usub(B, B, A)) { + return 0; + } + } else { + // sign*(X + Y)*a == A - B (mod |n|) + if (!BN_uadd(Y, Y, X)) { + return 0; + } + // as above, BN_mod_add_quick(Y, Y, X, n) would slow things down + if (!BN_usub(A, A, B)) { + return 0; + } + } + } + + if (!BN_is_one(A)) { + *out_no_inverse = 1; + OPENSSL_PUT_ERROR(BN, BN_R_NO_INVERSE); + return 0; + } + + // The while loop (Euclid's algorithm) ends when + // A == gcd(a,n); + // we have + // sign*Y*a == A (mod |n|), + // where Y is non-negative. + + if (sign < 0) { + if (!BN_sub(Y, n, Y)) { + return 0; + } + } + // Now Y*a == A (mod |n|). 
+ + // Y*a == 1 (mod |n|) + if (Y->neg || BN_ucmp(Y, n) >= 0) { + if (!BN_nnmod(Y, Y, n, ctx)) { + return 0; + } + } + if (!BN_copy(R, Y)) { + return 0; + } + + return 1; +} + +BIGNUM *BN_mod_inverse(BIGNUM *out, const BIGNUM *a, const BIGNUM *n, + BN_CTX *ctx) { + UniquePtr new_out; + if (out == nullptr) { + new_out.reset(BN_new()); + if (new_out == nullptr) { + return nullptr; + } + out = new_out.get(); + } + + UniquePtr a_reduced; + if (a->neg || BN_ucmp(a, n) >= 0) { + a_reduced.reset(BN_dup(a)); + if (a_reduced == nullptr) { + return nullptr; + } + if (!BN_nnmod(a_reduced.get(), a_reduced.get(), n, ctx)) { + return nullptr; + } + a = a_reduced.get(); + } + + int no_inverse; + if (!BN_is_odd(n)) { + if (!bn_mod_inverse_consttime(out, &no_inverse, a, n, ctx)) { + return nullptr; + } + } else if (!BN_mod_inverse_odd(out, &no_inverse, a, n, ctx)) { + return nullptr; + } + + new_out.release(); // Passed to the caller via |out|. + return out; +} + +int BN_mod_inverse_blinded(BIGNUM *out, int *out_no_inverse, const BIGNUM *a, + const BN_MONT_CTX *mont, BN_CTX *ctx) { + *out_no_inverse = 0; + + // |a| is secret, but it is required to be in range, so these comparisons may + // be leaked. + if (BN_is_negative(a) || + constant_time_declassify_int(BN_cmp(a, &mont->N) >= 0)) { + OPENSSL_PUT_ERROR(BN, BN_R_INPUT_NOT_REDUCED); + return 0; + } + + UniquePtr blinding_factor(BN_new()); + if (blinding_factor == nullptr) { + return 0; + } + + // |BN_mod_inverse_odd| is leaky, so generate a secret blinding factor and + // blind |a|. This works because (ar)^-1 * r = a^-1, supposing r is + // invertible. If r is not invertible, this function will fail. However, we + // only use this in RSA, where stumbling on an uninvertible element means + // stumbling on the key's factorization. That is, if this function fails, the + // RSA key was not actually a product of two large primes. 
+ // + // TODO(crbug.com/boringssl/677): When the PRNG output is marked secret by + // default, the explicit |bn_secret| call can be removed. + if (!BN_rand_range_ex(blinding_factor.get(), 1, &mont->N)) { + return 0; + } + bn_secret(blinding_factor.get()); + if (!BN_mod_mul_montgomery(out, blinding_factor.get(), a, mont, ctx)) { + return 0; + } + + // Once blinded, |out| is no longer secret, so it may be passed to a leaky + // mod inverse function. Note |blinding_factor| is secret, so |out| will be + // secret again after multiplying. + bn_declassify(out); + if (!BN_mod_inverse_odd(out, out_no_inverse, out, &mont->N, ctx) || + !BN_mod_mul_montgomery(out, blinding_factor.get(), out, mont, ctx)) { + return 0; + } + + return 1; +} + +int bssl::bn_mod_inverse_prime(BIGNUM *out, const BIGNUM *a, const BIGNUM *p, + BN_CTX *ctx, const BN_MONT_CTX *mont_p) { + BN_CTXScope scope(ctx); + BIGNUM *p_minus_2 = BN_CTX_get(ctx); + return p_minus_2 != nullptr && BN_copy(p_minus_2, p) && + BN_sub_word(p_minus_2, 2) && + BN_mod_exp_mont(out, a, p_minus_2, p, ctx, mont_p); +} + +int bssl::bn_mod_inverse_secret_prime(BIGNUM *out, const BIGNUM *a, + const BIGNUM *p, BN_CTX *ctx, + const BN_MONT_CTX *mont_p) { + BN_CTXScope scope(ctx); + BIGNUM *p_minus_2 = BN_CTX_get(ctx); + return p_minus_2 != nullptr && BN_copy(p_minus_2, p) && + BN_sub_word(p_minus_2, 2) && + BN_mod_exp_mont_consttime(out, a, p_minus_2, p, ctx, mont_p); +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/gcd_extra.c b/third_party/boringssl/src/crypto/fipsmodule/bn/gcd_extra.c deleted file mode 100644 index 53ab1705..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/bn/gcd_extra.c +++ /dev/null @@ -1,326 +0,0 @@ -/* Copyright (c) 2018, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. 
- * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include - -#include - -#include - -#include "internal.h" - - -static BN_ULONG word_is_odd_mask(BN_ULONG a) { return (BN_ULONG)0 - (a & 1); } - -static void maybe_rshift1_words(BN_ULONG *a, BN_ULONG mask, BN_ULONG *tmp, - size_t num) { - bn_rshift1_words(tmp, a, num); - bn_select_words(a, mask, tmp, a, num); -} - -static void maybe_rshift1_words_carry(BN_ULONG *a, BN_ULONG carry, - BN_ULONG mask, BN_ULONG *tmp, - size_t num) { - maybe_rshift1_words(a, mask, tmp, num); - if (num != 0) { - carry &= mask; - a[num - 1] |= carry << (BN_BITS2-1); - } -} - -static BN_ULONG maybe_add_words(BN_ULONG *a, BN_ULONG mask, const BN_ULONG *b, - BN_ULONG *tmp, size_t num) { - BN_ULONG carry = bn_add_words(tmp, a, b, num); - bn_select_words(a, mask, tmp, a, num); - return carry & mask; -} - -static int bn_gcd_consttime(BIGNUM *r, unsigned *out_shift, const BIGNUM *x, - const BIGNUM *y, BN_CTX *ctx) { - size_t width = x->width > y->width ? x->width : y->width; - if (width == 0) { - *out_shift = 0; - BN_zero(r); - return 1; - } - - // This is a constant-time implementation of Stein's algorithm (binary GCD). 
- int ret = 0; - BN_CTX_start(ctx); - BIGNUM *u = BN_CTX_get(ctx); - BIGNUM *v = BN_CTX_get(ctx); - BIGNUM *tmp = BN_CTX_get(ctx); - if (u == NULL || v == NULL || tmp == NULL || - !BN_copy(u, x) || - !BN_copy(v, y) || - !bn_resize_words(u, width) || - !bn_resize_words(v, width) || - !bn_resize_words(tmp, width)) { - goto err; - } - - // Each loop iteration halves at least one of |u| and |v|. Thus we need at - // most the combined bit width of inputs for at least one value to be zero. - unsigned x_bits = x->width * BN_BITS2, y_bits = y->width * BN_BITS2; - unsigned num_iters = x_bits + y_bits; - if (num_iters < x_bits) { - OPENSSL_PUT_ERROR(BN, BN_R_BIGNUM_TOO_LONG); - goto err; - } - - unsigned shift = 0; - for (unsigned i = 0; i < num_iters; i++) { - BN_ULONG both_odd = word_is_odd_mask(u->d[0]) & word_is_odd_mask(v->d[0]); - - // If both |u| and |v| are odd, subtract the smaller from the larger. - BN_ULONG u_less_than_v = - (BN_ULONG)0 - bn_sub_words(tmp->d, u->d, v->d, width); - bn_select_words(u->d, both_odd & ~u_less_than_v, tmp->d, u->d, width); - bn_sub_words(tmp->d, v->d, u->d, width); - bn_select_words(v->d, both_odd & u_less_than_v, tmp->d, v->d, width); - - // At least one of |u| and |v| is now even. - BN_ULONG u_is_odd = word_is_odd_mask(u->d[0]); - BN_ULONG v_is_odd = word_is_odd_mask(v->d[0]); - assert(!(u_is_odd & v_is_odd)); - - // If both are even, the final GCD gains a factor of two. - shift += 1 & (~u_is_odd & ~v_is_odd); - - // Halve any which are even. - maybe_rshift1_words(u->d, ~u_is_odd, tmp->d, width); - maybe_rshift1_words(v->d, ~v_is_odd, tmp->d, width); - } - - // One of |u| or |v| is zero at this point. The algorithm usually makes |u| - // zero, unless |y| was already zero on input. Fix this by combining the - // values. 
- assert(BN_is_zero(u) || BN_is_zero(v)); - for (size_t i = 0; i < width; i++) { - v->d[i] |= u->d[i]; - } - - *out_shift = shift; - ret = bn_set_words(r, v->d, width); - -err: - BN_CTX_end(ctx); - return ret; -} - -int BN_gcd(BIGNUM *r, const BIGNUM *x, const BIGNUM *y, BN_CTX *ctx) { - unsigned shift; - return bn_gcd_consttime(r, &shift, x, y, ctx) && - BN_lshift(r, r, shift); -} - -int bn_is_relatively_prime(int *out_relatively_prime, const BIGNUM *x, - const BIGNUM *y, BN_CTX *ctx) { - int ret = 0; - BN_CTX_start(ctx); - unsigned shift; - BIGNUM *gcd = BN_CTX_get(ctx); - if (gcd == NULL || - !bn_gcd_consttime(gcd, &shift, x, y, ctx)) { - goto err; - } - - // Check that 2^|shift| * |gcd| is one. - if (gcd->width == 0) { - *out_relatively_prime = 0; - } else { - BN_ULONG mask = shift | (gcd->d[0] ^ 1); - for (int i = 1; i < gcd->width; i++) { - mask |= gcd->d[i]; - } - *out_relatively_prime = mask == 0; - } - ret = 1; - -err: - BN_CTX_end(ctx); - return ret; -} - -int bn_lcm_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) { - BN_CTX_start(ctx); - unsigned shift; - BIGNUM *gcd = BN_CTX_get(ctx); - int ret = gcd != NULL && // - bn_mul_consttime(r, a, b, ctx) && - bn_gcd_consttime(gcd, &shift, a, b, ctx) && - // |gcd| has a secret bit width. - bn_div_consttime(r, NULL, r, gcd, /*divisor_min_bits=*/0, ctx) && - bn_rshift_secret_shift(r, r, shift, ctx); - BN_CTX_end(ctx); - return ret; -} - -int bn_mod_inverse_consttime(BIGNUM *r, int *out_no_inverse, const BIGNUM *a, - const BIGNUM *n, BN_CTX *ctx) { - *out_no_inverse = 0; - if (BN_is_negative(a) || BN_ucmp(a, n) >= 0) { - OPENSSL_PUT_ERROR(BN, BN_R_INPUT_NOT_REDUCED); - return 0; - } - if (BN_is_zero(a)) { - if (BN_is_one(n)) { - BN_zero(r); - return 1; - } - *out_no_inverse = 1; - OPENSSL_PUT_ERROR(BN, BN_R_NO_INVERSE); - return 0; - } - - // This is a constant-time implementation of the extended binary GCD - // algorithm. 
It is adapted from the Handbook of Applied Cryptography, section - // 14.4.3, algorithm 14.51, and modified to bound coefficients and avoid - // negative numbers. - // - // For more details and proof of correctness, see - // https://github.com/mit-plv/fiat-crypto/pull/333. In particular, see |step| - // and |mod_inverse_consttime| for the algorithm in Gallina and see - // |mod_inverse_consttime_spec| for the correctness result. - - if (!BN_is_odd(a) && !BN_is_odd(n)) { - *out_no_inverse = 1; - OPENSSL_PUT_ERROR(BN, BN_R_NO_INVERSE); - return 0; - } - - // This function exists to compute the RSA private exponent, where |a| is one - // word. We'll thus use |a_width| when available. - size_t n_width = n->width, a_width = a->width; - if (a_width > n_width) { - a_width = n_width; - } - - int ret = 0; - BN_CTX_start(ctx); - BIGNUM *u = BN_CTX_get(ctx); - BIGNUM *v = BN_CTX_get(ctx); - BIGNUM *A = BN_CTX_get(ctx); - BIGNUM *B = BN_CTX_get(ctx); - BIGNUM *C = BN_CTX_get(ctx); - BIGNUM *D = BN_CTX_get(ctx); - BIGNUM *tmp = BN_CTX_get(ctx); - BIGNUM *tmp2 = BN_CTX_get(ctx); - if (u == NULL || v == NULL || A == NULL || B == NULL || C == NULL || - D == NULL || tmp == NULL || tmp2 == NULL || - !BN_copy(u, a) || - !BN_copy(v, n) || - !BN_one(A) || - !BN_one(D) || - // For convenience, size |u| and |v| equivalently. - !bn_resize_words(u, n_width) || - !bn_resize_words(v, n_width) || - // |A| and |C| are bounded by |m|. - !bn_resize_words(A, n_width) || - !bn_resize_words(C, n_width) || - // |B| and |D| are bounded by |a|. - !bn_resize_words(B, a_width) || - !bn_resize_words(D, a_width) || - // |tmp| and |tmp2| may be used at either size. - !bn_resize_words(tmp, n_width) || - !bn_resize_words(tmp2, n_width)) { - goto err; - } - - // Each loop iteration halves at least one of |u| and |v|. Thus we need at - // most the combined bit width of inputs for at least one value to be zero. 
- unsigned a_bits = a_width * BN_BITS2, n_bits = n_width * BN_BITS2; - unsigned num_iters = a_bits + n_bits; - if (num_iters < a_bits) { - OPENSSL_PUT_ERROR(BN, BN_R_BIGNUM_TOO_LONG); - goto err; - } - - // Before and after each loop iteration, the following hold: - // - // u = A*a - B*n - // v = D*n - C*a - // 0 < u <= a - // 0 <= v <= n - // 0 <= A < n - // 0 <= B <= a - // 0 <= C < n - // 0 <= D <= a - // - // After each loop iteration, u and v only get smaller, and at least one of - // them shrinks by at least a factor of two. - for (unsigned i = 0; i < num_iters; i++) { - BN_ULONG both_odd = word_is_odd_mask(u->d[0]) & word_is_odd_mask(v->d[0]); - - // If both |u| and |v| are odd, subtract the smaller from the larger. - BN_ULONG v_less_than_u = - (BN_ULONG)0 - bn_sub_words(tmp->d, v->d, u->d, n_width); - bn_select_words(v->d, both_odd & ~v_less_than_u, tmp->d, v->d, n_width); - bn_sub_words(tmp->d, u->d, v->d, n_width); - bn_select_words(u->d, both_odd & v_less_than_u, tmp->d, u->d, n_width); - - // If we updated one of the values, update the corresponding coefficient. - BN_ULONG carry = bn_add_words(tmp->d, A->d, C->d, n_width); - carry -= bn_sub_words(tmp2->d, tmp->d, n->d, n_width); - bn_select_words(tmp->d, carry, tmp->d, tmp2->d, n_width); - bn_select_words(A->d, both_odd & v_less_than_u, tmp->d, A->d, n_width); - bn_select_words(C->d, both_odd & ~v_less_than_u, tmp->d, C->d, n_width); - - bn_add_words(tmp->d, B->d, D->d, a_width); - bn_sub_words(tmp2->d, tmp->d, a->d, a_width); - bn_select_words(tmp->d, carry, tmp->d, tmp2->d, a_width); - bn_select_words(B->d, both_odd & v_less_than_u, tmp->d, B->d, a_width); - bn_select_words(D->d, both_odd & ~v_less_than_u, tmp->d, D->d, a_width); - - // Our loop invariants hold at this point. Additionally, exactly one of |u| - // and |v| is now even. 
- BN_ULONG u_is_even = ~word_is_odd_mask(u->d[0]); - BN_ULONG v_is_even = ~word_is_odd_mask(v->d[0]); - assert(u_is_even != v_is_even); - - // Halve the even one and adjust the corresponding coefficient. - maybe_rshift1_words(u->d, u_is_even, tmp->d, n_width); - BN_ULONG A_or_B_is_odd = - word_is_odd_mask(A->d[0]) | word_is_odd_mask(B->d[0]); - BN_ULONG A_carry = - maybe_add_words(A->d, A_or_B_is_odd & u_is_even, n->d, tmp->d, n_width); - BN_ULONG B_carry = - maybe_add_words(B->d, A_or_B_is_odd & u_is_even, a->d, tmp->d, a_width); - maybe_rshift1_words_carry(A->d, A_carry, u_is_even, tmp->d, n_width); - maybe_rshift1_words_carry(B->d, B_carry, u_is_even, tmp->d, a_width); - - maybe_rshift1_words(v->d, v_is_even, tmp->d, n_width); - BN_ULONG C_or_D_is_odd = - word_is_odd_mask(C->d[0]) | word_is_odd_mask(D->d[0]); - BN_ULONG C_carry = - maybe_add_words(C->d, C_or_D_is_odd & v_is_even, n->d, tmp->d, n_width); - BN_ULONG D_carry = - maybe_add_words(D->d, C_or_D_is_odd & v_is_even, a->d, tmp->d, a_width); - maybe_rshift1_words_carry(C->d, C_carry, v_is_even, tmp->d, n_width); - maybe_rshift1_words_carry(D->d, D_carry, v_is_even, tmp->d, a_width); - } - - assert(BN_is_zero(v)); - if (!BN_is_one(u)) { - *out_no_inverse = 1; - OPENSSL_PUT_ERROR(BN, BN_R_NO_INVERSE); - goto err; - } - - ret = BN_copy(r, A) != NULL; - -err: - BN_CTX_end(ctx); - return ret; -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/gcd_extra.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/bn/gcd_extra.cc.inc new file mode 100644 index 00000000..a48762fe --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/bn/gcd_extra.cc.inc @@ -0,0 +1,327 @@ +// Copyright 2018 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include + +#include "internal.h" + + +using namespace bssl; + +static BN_ULONG word_is_odd_mask(BN_ULONG a) { return (BN_ULONG)0 - (a & 1); } + +static void maybe_rshift1_words(BN_ULONG *a, BN_ULONG mask, BN_ULONG *tmp, + size_t num) { + bn_rshift1_words(tmp, a, num); + bn_select_words(a, mask, tmp, a, num); +} + +static void maybe_rshift1_words_carry(BN_ULONG *a, BN_ULONG carry, + BN_ULONG mask, BN_ULONG *tmp, + size_t num) { + maybe_rshift1_words(a, mask, tmp, num); + if (num != 0) { + carry &= mask; + a[num - 1] |= carry << (BN_BITS2 - 1); + } +} + +static BN_ULONG maybe_add_words(BN_ULONG *a, BN_ULONG mask, const BN_ULONG *b, + BN_ULONG *tmp, size_t num) { + BN_ULONG carry = bn_add_words(tmp, a, b, num); + bn_select_words(a, mask, tmp, a, num); + return carry & mask; +} + +static int bn_gcd_consttime(BIGNUM *r, unsigned *out_shift, const BIGNUM *x, + const BIGNUM *y, BN_CTX *ctx) { + size_t width = x->width > y->width ? x->width : y->width; + if (width == 0) { + *out_shift = 0; + BN_zero(r); + return 1; + } + + // This is a constant-time implementation of Stein's algorithm (binary GCD). 
+ BN_CTXScope scope(ctx); + BIGNUM *u = BN_CTX_get(ctx); + BIGNUM *v = BN_CTX_get(ctx); + BIGNUM *tmp = BN_CTX_get(ctx); + unsigned x_bits, y_bits, num_iters, shift; + if (u == nullptr || v == nullptr || tmp == nullptr || // + !BN_copy(u, x) || // + !BN_copy(v, y) || // + !bn_resize_words(u, width) || // + !bn_resize_words(v, width) || // + !bn_resize_words(tmp, width)) { + return 0; + } + + // Each loop iteration halves at least one of |u| and |v|. Thus we need at + // most the combined bit width of inputs for at least one value to be zero. + x_bits = x->width * BN_BITS2; + y_bits = y->width * BN_BITS2; + num_iters = x_bits + y_bits; + if (num_iters < x_bits) { + OPENSSL_PUT_ERROR(BN, BN_R_BIGNUM_TOO_LONG); + return 0; + } + + shift = 0; + for (unsigned i = 0; i < num_iters; i++) { + BN_ULONG both_odd = word_is_odd_mask(u->d[0]) & word_is_odd_mask(v->d[0]); + + // If both |u| and |v| are odd, subtract the smaller from the larger. + BN_ULONG u_less_than_v = + (BN_ULONG)0 - bn_sub_words(tmp->d, u->d, v->d, width); + bn_select_words(u->d, both_odd & ~u_less_than_v, tmp->d, u->d, width); + bn_sub_words(tmp->d, v->d, u->d, width); + bn_select_words(v->d, both_odd & u_less_than_v, tmp->d, v->d, width); + + // At least one of |u| and |v| is now even. + BN_ULONG u_is_odd = word_is_odd_mask(u->d[0]); + BN_ULONG v_is_odd = word_is_odd_mask(v->d[0]); + declassify_assert(!(u_is_odd & v_is_odd)); + + // If both are even, the final GCD gains a factor of two. + shift += 1 & (~u_is_odd & ~v_is_odd); + + // Halve any which are even. + maybe_rshift1_words(u->d, ~u_is_odd, tmp->d, width); + maybe_rshift1_words(v->d, ~v_is_odd, tmp->d, width); + } + + // One of |u| or |v| is zero at this point. The algorithm usually makes |u| + // zero, unless |y| was already zero on input. Fix this by combining the + // values. 
+ declassify_assert(BN_is_zero(u) | BN_is_zero(v)); + for (size_t i = 0; i < width; i++) { + v->d[i] |= u->d[i]; + } + + *out_shift = shift; + return bn_set_words(r, v->d, width); +} + +int BN_gcd(BIGNUM *r, const BIGNUM *x, const BIGNUM *y, BN_CTX *ctx) { + unsigned shift; + return bn_gcd_consttime(r, &shift, x, y, ctx) && BN_lshift(r, r, shift); +} + +int bssl::bn_is_relatively_prime(int *out_relatively_prime, const BIGNUM *x, + const BIGNUM *y, BN_CTX *ctx) { + BN_CTXScope scope(ctx); + unsigned shift; + BIGNUM *gcd = BN_CTX_get(ctx); + if (gcd == nullptr || !bn_gcd_consttime(gcd, &shift, x, y, ctx)) { + return 0; + } + + // Check that 2^|shift| * |gcd| is one. + if (gcd->width == 0) { + *out_relatively_prime = 0; + } else { + BN_ULONG mask = shift | (gcd->d[0] ^ 1); + for (int i = 1; i < gcd->width; i++) { + mask |= gcd->d[i]; + } + *out_relatively_prime = mask == 0; + } + + return 1; +} + +int bssl::bn_lcm_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, + BN_CTX *ctx) { + BN_CTXScope scope(ctx); + unsigned shift; + BIGNUM *gcd = BN_CTX_get(ctx); + return gcd != nullptr && // + bn_mul_consttime(r, a, b, ctx) && + bn_gcd_consttime(gcd, &shift, a, b, ctx) && + // |gcd| has a secret bit width. + bn_div_consttime(r, nullptr, r, gcd, /*divisor_min_bits=*/0, ctx) && + bn_rshift_secret_shift(r, r, shift, ctx); +} + +int bssl::bn_mod_inverse_consttime(BIGNUM *r, int *out_no_inverse, + const BIGNUM *a, const BIGNUM *n, + BN_CTX *ctx) { + *out_no_inverse = 0; + if (BN_is_negative(a) || BN_ucmp(a, n) >= 0) { + OPENSSL_PUT_ERROR(BN, BN_R_INPUT_NOT_REDUCED); + return 0; + } + if (BN_is_zero(a)) { + if (BN_is_one(n)) { + BN_zero(r); + return 1; + } + *out_no_inverse = 1; + OPENSSL_PUT_ERROR(BN, BN_R_NO_INVERSE); + return 0; + } + + // This is a constant-time implementation of the extended binary GCD + // algorithm. 
It is adapted from the Handbook of Applied Cryptography, section + // 14.4.3, algorithm 14.51, and modified to bound coefficients and avoid + // negative numbers. + // + // For more details and proof of correctness, see + // https://github.com/mit-plv/fiat-crypto/pull/333. In particular, see |step| + // and |mod_inverse_consttime| for the algorithm in Gallina and see + // |mod_inverse_consttime_spec| for the correctness result. + + if (!BN_is_odd(a) && !BN_is_odd(n)) { + *out_no_inverse = 1; + OPENSSL_PUT_ERROR(BN, BN_R_NO_INVERSE); + return 0; + } + + // This function exists to compute the RSA private exponent, where |a| is one + // word. We'll thus use |a_width| when available. + size_t n_width = n->width, a_width = a->width; + if (a_width > n_width) { + a_width = n_width; + } + + BN_CTXScope scope(ctx); + BIGNUM *u = BN_CTX_get(ctx); + BIGNUM *v = BN_CTX_get(ctx); + BIGNUM *A = BN_CTX_get(ctx); + BIGNUM *B = BN_CTX_get(ctx); + BIGNUM *C = BN_CTX_get(ctx); + BIGNUM *D = BN_CTX_get(ctx); + BIGNUM *tmp = BN_CTX_get(ctx); + BIGNUM *tmp2 = BN_CTX_get(ctx); + size_t a_bits, num_iters, n_bits; + if (u == nullptr || // + v == nullptr || // + A == nullptr || // + B == nullptr || // + C == nullptr || // + D == nullptr || // + tmp == nullptr || // + tmp2 == nullptr || // + !BN_copy(u, a) || // + !BN_copy(v, n) || // + !BN_one(A) || // + !BN_one(D) || + // For convenience, size |u| and |v| equivalently. + !bn_resize_words(u, n_width) || // + !bn_resize_words(v, n_width) || + // |A| and |C| are bounded by |m|. + !bn_resize_words(A, n_width) || // + !bn_resize_words(C, n_width) || + // |B| and |D| are bounded by |a|. + !bn_resize_words(B, a_width) || // + !bn_resize_words(D, a_width) || + // |tmp| and |tmp2| may be used at either size. + !bn_resize_words(tmp, n_width) || // + !bn_resize_words(tmp2, n_width)) { + return 0; + } + + // Each loop iteration halves at least one of |u| and |v|. 
Thus we need at + // most the combined bit width of inputs for at least one value to be zero. + // |a_bits| and |n_bits| cannot overflow because |bn_wexpand| ensures bit + // counts fit in even |int|. + a_bits = a_width * BN_BITS2; + n_bits = n_width * BN_BITS2; + num_iters = a_bits + n_bits; + if (num_iters < a_bits) { + OPENSSL_PUT_ERROR(BN, BN_R_BIGNUM_TOO_LONG); + return 0; + } + + // Before and after each loop iteration, the following hold: + // + // u = A*a - B*n + // v = D*n - C*a + // 0 < u <= a + // 0 <= v <= n + // 0 <= A < n + // 0 <= B <= a + // 0 <= C < n + // 0 <= D <= a + // + // After each loop iteration, u and v only get smaller, and at least one of + // them shrinks by at least a factor of two. + for (size_t i = 0; i < num_iters; i++) { + BN_ULONG both_odd = word_is_odd_mask(u->d[0]) & word_is_odd_mask(v->d[0]); + + // If both |u| and |v| are odd, subtract the smaller from the larger. + BN_ULONG v_less_than_u = + (BN_ULONG)0 - bn_sub_words(tmp->d, v->d, u->d, n_width); + bn_select_words(v->d, both_odd & ~v_less_than_u, tmp->d, v->d, n_width); + bn_sub_words(tmp->d, u->d, v->d, n_width); + bn_select_words(u->d, both_odd & v_less_than_u, tmp->d, u->d, n_width); + + // If we updated one of the values, update the corresponding coefficient. + BN_ULONG carry = bn_add_words(tmp->d, A->d, C->d, n_width); + carry -= bn_sub_words(tmp2->d, tmp->d, n->d, n_width); + bn_select_words(tmp->d, carry, tmp->d, tmp2->d, n_width); + bn_select_words(A->d, both_odd & v_less_than_u, tmp->d, A->d, n_width); + bn_select_words(C->d, both_odd & ~v_less_than_u, tmp->d, C->d, n_width); + + bn_add_words(tmp->d, B->d, D->d, a_width); + bn_sub_words(tmp2->d, tmp->d, a->d, a_width); + bn_select_words(tmp->d, carry, tmp->d, tmp2->d, a_width); + bn_select_words(B->d, both_odd & v_less_than_u, tmp->d, B->d, a_width); + bn_select_words(D->d, both_odd & ~v_less_than_u, tmp->d, D->d, a_width); + + // Our loop invariants hold at this point. 
Additionally, exactly one of |u| + // and |v| is now even. + BN_ULONG u_is_even = ~word_is_odd_mask(u->d[0]); + BN_ULONG v_is_even = ~word_is_odd_mask(v->d[0]); + declassify_assert(u_is_even != v_is_even); + + // Halve the even one and adjust the corresponding coefficient. + maybe_rshift1_words(u->d, u_is_even, tmp->d, n_width); + BN_ULONG A_or_B_is_odd = + word_is_odd_mask(A->d[0]) | word_is_odd_mask(B->d[0]); + BN_ULONG A_carry = + maybe_add_words(A->d, A_or_B_is_odd & u_is_even, n->d, tmp->d, n_width); + BN_ULONG B_carry = + maybe_add_words(B->d, A_or_B_is_odd & u_is_even, a->d, tmp->d, a_width); + maybe_rshift1_words_carry(A->d, A_carry, u_is_even, tmp->d, n_width); + maybe_rshift1_words_carry(B->d, B_carry, u_is_even, tmp->d, a_width); + + maybe_rshift1_words(v->d, v_is_even, tmp->d, n_width); + BN_ULONG C_or_D_is_odd = + word_is_odd_mask(C->d[0]) | word_is_odd_mask(D->d[0]); + BN_ULONG C_carry = + maybe_add_words(C->d, C_or_D_is_odd & v_is_even, n->d, tmp->d, n_width); + BN_ULONG D_carry = + maybe_add_words(D->d, C_or_D_is_odd & v_is_even, a->d, tmp->d, a_width); + maybe_rshift1_words_carry(C->d, C_carry, v_is_even, tmp->d, n_width); + maybe_rshift1_words_carry(D->d, D_carry, v_is_even, tmp->d, a_width); + } + + declassify_assert(BN_is_zero(v)); + // While the inputs and output are secret, this function considers whether the + // input was invertible to be public. It is used as part of RSA key + // generation, where inputs are chosen to already be invertible. 
+ if (constant_time_declassify_int(!BN_is_one(u))) { + *out_no_inverse = 1; + OPENSSL_PUT_ERROR(BN, BN_R_NO_INVERSE); + return 0; + } + + return BN_copy(r, A) != nullptr; +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/generic.c b/third_party/boringssl/src/crypto/fipsmodule/bn/generic.c deleted file mode 100644 index ee80a3ce..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/bn/generic.c +++ /dev/null @@ -1,711 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
*/ - -#include - -#include - -#include "internal.h" - - -// This file has two other implementations: x86 assembly language in -// asm/bn-586.pl and x86_64 inline assembly in asm/x86_64-gcc.c. -#if defined(OPENSSL_NO_ASM) || \ - !(defined(OPENSSL_X86) || \ - (defined(OPENSSL_X86_64) && (defined(__GNUC__) || defined(__clang__)))) - -#ifdef BN_ULLONG -#define mul_add(r, a, w, c) \ - do { \ - BN_ULLONG t; \ - t = (BN_ULLONG)(w) * (a) + (r) + (c); \ - (r) = Lw(t); \ - (c) = Hw(t); \ - } while (0) - -#define mul(r, a, w, c) \ - do { \ - BN_ULLONG t; \ - t = (BN_ULLONG)(w) * (a) + (c); \ - (r) = Lw(t); \ - (c) = Hw(t); \ - } while (0) - -#define sqr(r0, r1, a) \ - do { \ - BN_ULLONG t; \ - t = (BN_ULLONG)(a) * (a); \ - (r0) = Lw(t); \ - (r1) = Hw(t); \ - } while (0) - -#else - -#define mul_add(r, a, w, c) \ - do { \ - BN_ULONG high, low, ret, tmp = (a); \ - ret = (r); \ - BN_UMULT_LOHI(low, high, w, tmp); \ - ret += (c); \ - (c) = (ret < (c)) ? 1 : 0; \ - (c) += high; \ - ret += low; \ - (c) += (ret < low) ? 1 : 0; \ - (r) = ret; \ - } while (0) - -#define mul(r, a, w, c) \ - do { \ - BN_ULONG high, low, ret, ta = (a); \ - BN_UMULT_LOHI(low, high, w, ta); \ - ret = low + (c); \ - (c) = high; \ - (c) += (ret < low) ? 
1 : 0; \ - (r) = ret; \ - } while (0) - -#define sqr(r0, r1, a) \ - do { \ - BN_ULONG tmp = (a); \ - BN_UMULT_LOHI(r0, r1, tmp, tmp); \ - } while (0) - -#endif // !BN_ULLONG - -BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, size_t num, - BN_ULONG w) { - BN_ULONG c1 = 0; - - if (num == 0) { - return c1; - } - - while (num & ~3) { - mul_add(rp[0], ap[0], w, c1); - mul_add(rp[1], ap[1], w, c1); - mul_add(rp[2], ap[2], w, c1); - mul_add(rp[3], ap[3], w, c1); - ap += 4; - rp += 4; - num -= 4; - } - - while (num) { - mul_add(rp[0], ap[0], w, c1); - ap++; - rp++; - num--; - } - - return c1; -} - -BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, size_t num, - BN_ULONG w) { - BN_ULONG c1 = 0; - - if (num == 0) { - return c1; - } - - while (num & ~3) { - mul(rp[0], ap[0], w, c1); - mul(rp[1], ap[1], w, c1); - mul(rp[2], ap[2], w, c1); - mul(rp[3], ap[3], w, c1); - ap += 4; - rp += 4; - num -= 4; - } - while (num) { - mul(rp[0], ap[0], w, c1); - ap++; - rp++; - num--; - } - return c1; -} - -void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, size_t n) { - if (n == 0) { - return; - } - - while (n & ~3) { - sqr(r[0], r[1], a[0]); - sqr(r[2], r[3], a[1]); - sqr(r[4], r[5], a[2]); - sqr(r[6], r[7], a[3]); - a += 4; - r += 8; - n -= 4; - } - while (n) { - sqr(r[0], r[1], a[0]); - a++; - r += 2; - n--; - } -} - -#ifdef BN_ULLONG -BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, - size_t n) { - BN_ULLONG ll = 0; - - if (n == 0) { - return 0; - } - - while (n & ~3) { - ll += (BN_ULLONG)a[0] + b[0]; - r[0] = (BN_ULONG)ll; - ll >>= BN_BITS2; - ll += (BN_ULLONG)a[1] + b[1]; - r[1] = (BN_ULONG)ll; - ll >>= BN_BITS2; - ll += (BN_ULLONG)a[2] + b[2]; - r[2] = (BN_ULONG)ll; - ll >>= BN_BITS2; - ll += (BN_ULLONG)a[3] + b[3]; - r[3] = (BN_ULONG)ll; - ll >>= BN_BITS2; - a += 4; - b += 4; - r += 4; - n -= 4; - } - while (n) { - ll += (BN_ULLONG)a[0] + b[0]; - r[0] = (BN_ULONG)ll; - ll >>= BN_BITS2; - a++; - b++; - r++; - n--; - } - return (BN_ULONG)ll; 
-} - -#else // !BN_ULLONG - -BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, - size_t n) { - BN_ULONG c, l, t; - - if (n == 0) { - return (BN_ULONG)0; - } - - c = 0; - while (n & ~3) { - t = a[0]; - t += c; - c = (t < c); - l = t + b[0]; - c += (l < t); - r[0] = l; - t = a[1]; - t += c; - c = (t < c); - l = t + b[1]; - c += (l < t); - r[1] = l; - t = a[2]; - t += c; - c = (t < c); - l = t + b[2]; - c += (l < t); - r[2] = l; - t = a[3]; - t += c; - c = (t < c); - l = t + b[3]; - c += (l < t); - r[3] = l; - a += 4; - b += 4; - r += 4; - n -= 4; - } - while (n) { - t = a[0]; - t += c; - c = (t < c); - l = t + b[0]; - c += (l < t); - r[0] = l; - a++; - b++; - r++; - n--; - } - return (BN_ULONG)c; -} - -#endif // !BN_ULLONG - -BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, - size_t n) { - BN_ULONG t1, t2; - int c = 0; - - if (n == 0) { - return (BN_ULONG)0; - } - - while (n & ~3) { - t1 = a[0]; - t2 = b[0]; - r[0] = t1 - t2 - c; - if (t1 != t2) { - c = (t1 < t2); - } - t1 = a[1]; - t2 = b[1]; - r[1] = t1 - t2 - c; - if (t1 != t2) { - c = (t1 < t2); - } - t1 = a[2]; - t2 = b[2]; - r[2] = t1 - t2 - c; - if (t1 != t2) { - c = (t1 < t2); - } - t1 = a[3]; - t2 = b[3]; - r[3] = t1 - t2 - c; - if (t1 != t2) { - c = (t1 < t2); - } - a += 4; - b += 4; - r += 4; - n -= 4; - } - while (n) { - t1 = a[0]; - t2 = b[0]; - r[0] = t1 - t2 - c; - if (t1 != t2) { - c = (t1 < t2); - } - a++; - b++; - r++; - n--; - } - return c; -} - -// mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) -// mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) -// sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) -// sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) - -#ifdef BN_ULLONG - -// Keep in mind that additions to multiplication result can not overflow, -// because its high half cannot be all-ones. 
-#define mul_add_c(a, b, c0, c1, c2) \ - do { \ - BN_ULONG hi; \ - BN_ULLONG t = (BN_ULLONG)(a) * (b); \ - t += (c0); /* no carry */ \ - (c0) = (BN_ULONG)Lw(t); \ - hi = (BN_ULONG)Hw(t); \ - (c1) += (hi); \ - if ((c1) < hi) { \ - (c2)++; \ - } \ - } while (0) - -#define mul_add_c2(a, b, c0, c1, c2) \ - do { \ - BN_ULONG hi; \ - BN_ULLONG t = (BN_ULLONG)(a) * (b); \ - BN_ULLONG tt = t + (c0); /* no carry */ \ - (c0) = (BN_ULONG)Lw(tt); \ - hi = (BN_ULONG)Hw(tt); \ - (c1) += hi; \ - if ((c1) < hi) { \ - (c2)++; \ - } \ - t += (c0); /* no carry */ \ - (c0) = (BN_ULONG)Lw(t); \ - hi = (BN_ULONG)Hw(t); \ - (c1) += hi; \ - if ((c1) < hi) { \ - (c2)++; \ - } \ - } while (0) - -#define sqr_add_c(a, i, c0, c1, c2) \ - do { \ - BN_ULONG hi; \ - BN_ULLONG t = (BN_ULLONG)(a)[i] * (a)[i]; \ - t += (c0); /* no carry */ \ - (c0) = (BN_ULONG)Lw(t); \ - hi = (BN_ULONG)Hw(t); \ - (c1) += hi; \ - if ((c1) < hi) { \ - (c2)++; \ - } \ - } while (0) - -#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2) - -#else - -// Keep in mind that additions to hi can not overflow, because the high word of -// a multiplication result cannot be all-ones. -#define mul_add_c(a, b, c0, c1, c2) \ - do { \ - BN_ULONG ta = (a), tb = (b); \ - BN_ULONG lo, hi; \ - BN_UMULT_LOHI(lo, hi, ta, tb); \ - (c0) += lo; \ - hi += ((c0) < lo) ? 1 : 0; \ - (c1) += hi; \ - (c2) += ((c1) < hi) ? 1 : 0; \ - } while (0) - -#define mul_add_c2(a, b, c0, c1, c2) \ - do { \ - BN_ULONG ta = (a), tb = (b); \ - BN_ULONG lo, hi, tt; \ - BN_UMULT_LOHI(lo, hi, ta, tb); \ - (c0) += lo; \ - tt = hi + (((c0) < lo) ? 1 : 0); \ - (c1) += tt; \ - (c2) += ((c1) < tt) ? 1 : 0; \ - (c0) += lo; \ - hi += (c0 < lo) ? 1 : 0; \ - (c1) += hi; \ - (c2) += ((c1) < hi) ? 1 : 0; \ - } while (0) - -#define sqr_add_c(a, i, c0, c1, c2) \ - do { \ - BN_ULONG ta = (a)[i]; \ - BN_ULONG lo, hi; \ - BN_UMULT_LOHI(lo, hi, ta, ta); \ - (c0) += lo; \ - hi += (c0 < lo) ? 1 : 0; \ - (c1) += hi; \ - (c2) += ((c1) < hi) ? 
1 : 0; \ - } while (0) - -#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2) - -#endif // !BN_ULLONG - -void bn_mul_comba8(BN_ULONG r[16], const BN_ULONG a[8], const BN_ULONG b[8]) { - BN_ULONG c1, c2, c3; - - c1 = 0; - c2 = 0; - c3 = 0; - mul_add_c(a[0], b[0], c1, c2, c3); - r[0] = c1; - c1 = 0; - mul_add_c(a[0], b[1], c2, c3, c1); - mul_add_c(a[1], b[0], c2, c3, c1); - r[1] = c2; - c2 = 0; - mul_add_c(a[2], b[0], c3, c1, c2); - mul_add_c(a[1], b[1], c3, c1, c2); - mul_add_c(a[0], b[2], c3, c1, c2); - r[2] = c3; - c3 = 0; - mul_add_c(a[0], b[3], c1, c2, c3); - mul_add_c(a[1], b[2], c1, c2, c3); - mul_add_c(a[2], b[1], c1, c2, c3); - mul_add_c(a[3], b[0], c1, c2, c3); - r[3] = c1; - c1 = 0; - mul_add_c(a[4], b[0], c2, c3, c1); - mul_add_c(a[3], b[1], c2, c3, c1); - mul_add_c(a[2], b[2], c2, c3, c1); - mul_add_c(a[1], b[3], c2, c3, c1); - mul_add_c(a[0], b[4], c2, c3, c1); - r[4] = c2; - c2 = 0; - mul_add_c(a[0], b[5], c3, c1, c2); - mul_add_c(a[1], b[4], c3, c1, c2); - mul_add_c(a[2], b[3], c3, c1, c2); - mul_add_c(a[3], b[2], c3, c1, c2); - mul_add_c(a[4], b[1], c3, c1, c2); - mul_add_c(a[5], b[0], c3, c1, c2); - r[5] = c3; - c3 = 0; - mul_add_c(a[6], b[0], c1, c2, c3); - mul_add_c(a[5], b[1], c1, c2, c3); - mul_add_c(a[4], b[2], c1, c2, c3); - mul_add_c(a[3], b[3], c1, c2, c3); - mul_add_c(a[2], b[4], c1, c2, c3); - mul_add_c(a[1], b[5], c1, c2, c3); - mul_add_c(a[0], b[6], c1, c2, c3); - r[6] = c1; - c1 = 0; - mul_add_c(a[0], b[7], c2, c3, c1); - mul_add_c(a[1], b[6], c2, c3, c1); - mul_add_c(a[2], b[5], c2, c3, c1); - mul_add_c(a[3], b[4], c2, c3, c1); - mul_add_c(a[4], b[3], c2, c3, c1); - mul_add_c(a[5], b[2], c2, c3, c1); - mul_add_c(a[6], b[1], c2, c3, c1); - mul_add_c(a[7], b[0], c2, c3, c1); - r[7] = c2; - c2 = 0; - mul_add_c(a[7], b[1], c3, c1, c2); - mul_add_c(a[6], b[2], c3, c1, c2); - mul_add_c(a[5], b[3], c3, c1, c2); - mul_add_c(a[4], b[4], c3, c1, c2); - mul_add_c(a[3], b[5], c3, c1, c2); - mul_add_c(a[2], b[6], c3, c1, 
c2); - mul_add_c(a[1], b[7], c3, c1, c2); - r[8] = c3; - c3 = 0; - mul_add_c(a[2], b[7], c1, c2, c3); - mul_add_c(a[3], b[6], c1, c2, c3); - mul_add_c(a[4], b[5], c1, c2, c3); - mul_add_c(a[5], b[4], c1, c2, c3); - mul_add_c(a[6], b[3], c1, c2, c3); - mul_add_c(a[7], b[2], c1, c2, c3); - r[9] = c1; - c1 = 0; - mul_add_c(a[7], b[3], c2, c3, c1); - mul_add_c(a[6], b[4], c2, c3, c1); - mul_add_c(a[5], b[5], c2, c3, c1); - mul_add_c(a[4], b[6], c2, c3, c1); - mul_add_c(a[3], b[7], c2, c3, c1); - r[10] = c2; - c2 = 0; - mul_add_c(a[4], b[7], c3, c1, c2); - mul_add_c(a[5], b[6], c3, c1, c2); - mul_add_c(a[6], b[5], c3, c1, c2); - mul_add_c(a[7], b[4], c3, c1, c2); - r[11] = c3; - c3 = 0; - mul_add_c(a[7], b[5], c1, c2, c3); - mul_add_c(a[6], b[6], c1, c2, c3); - mul_add_c(a[5], b[7], c1, c2, c3); - r[12] = c1; - c1 = 0; - mul_add_c(a[6], b[7], c2, c3, c1); - mul_add_c(a[7], b[6], c2, c3, c1); - r[13] = c2; - c2 = 0; - mul_add_c(a[7], b[7], c3, c1, c2); - r[14] = c3; - r[15] = c1; -} - -void bn_mul_comba4(BN_ULONG r[8], const BN_ULONG a[4], const BN_ULONG b[4]) { - BN_ULONG c1, c2, c3; - - c1 = 0; - c2 = 0; - c3 = 0; - mul_add_c(a[0], b[0], c1, c2, c3); - r[0] = c1; - c1 = 0; - mul_add_c(a[0], b[1], c2, c3, c1); - mul_add_c(a[1], b[0], c2, c3, c1); - r[1] = c2; - c2 = 0; - mul_add_c(a[2], b[0], c3, c1, c2); - mul_add_c(a[1], b[1], c3, c1, c2); - mul_add_c(a[0], b[2], c3, c1, c2); - r[2] = c3; - c3 = 0; - mul_add_c(a[0], b[3], c1, c2, c3); - mul_add_c(a[1], b[2], c1, c2, c3); - mul_add_c(a[2], b[1], c1, c2, c3); - mul_add_c(a[3], b[0], c1, c2, c3); - r[3] = c1; - c1 = 0; - mul_add_c(a[3], b[1], c2, c3, c1); - mul_add_c(a[2], b[2], c2, c3, c1); - mul_add_c(a[1], b[3], c2, c3, c1); - r[4] = c2; - c2 = 0; - mul_add_c(a[2], b[3], c3, c1, c2); - mul_add_c(a[3], b[2], c3, c1, c2); - r[5] = c3; - c3 = 0; - mul_add_c(a[3], b[3], c1, c2, c3); - r[6] = c1; - r[7] = c2; -} - -void bn_sqr_comba8(BN_ULONG r[16], const BN_ULONG a[8]) { - BN_ULONG c1, c2, c3; - - c1 = 0; - c2 = 0; - c3 = 
0; - sqr_add_c(a, 0, c1, c2, c3); - r[0] = c1; - c1 = 0; - sqr_add_c2(a, 1, 0, c2, c3, c1); - r[1] = c2; - c2 = 0; - sqr_add_c(a, 1, c3, c1, c2); - sqr_add_c2(a, 2, 0, c3, c1, c2); - r[2] = c3; - c3 = 0; - sqr_add_c2(a, 3, 0, c1, c2, c3); - sqr_add_c2(a, 2, 1, c1, c2, c3); - r[3] = c1; - c1 = 0; - sqr_add_c(a, 2, c2, c3, c1); - sqr_add_c2(a, 3, 1, c2, c3, c1); - sqr_add_c2(a, 4, 0, c2, c3, c1); - r[4] = c2; - c2 = 0; - sqr_add_c2(a, 5, 0, c3, c1, c2); - sqr_add_c2(a, 4, 1, c3, c1, c2); - sqr_add_c2(a, 3, 2, c3, c1, c2); - r[5] = c3; - c3 = 0; - sqr_add_c(a, 3, c1, c2, c3); - sqr_add_c2(a, 4, 2, c1, c2, c3); - sqr_add_c2(a, 5, 1, c1, c2, c3); - sqr_add_c2(a, 6, 0, c1, c2, c3); - r[6] = c1; - c1 = 0; - sqr_add_c2(a, 7, 0, c2, c3, c1); - sqr_add_c2(a, 6, 1, c2, c3, c1); - sqr_add_c2(a, 5, 2, c2, c3, c1); - sqr_add_c2(a, 4, 3, c2, c3, c1); - r[7] = c2; - c2 = 0; - sqr_add_c(a, 4, c3, c1, c2); - sqr_add_c2(a, 5, 3, c3, c1, c2); - sqr_add_c2(a, 6, 2, c3, c1, c2); - sqr_add_c2(a, 7, 1, c3, c1, c2); - r[8] = c3; - c3 = 0; - sqr_add_c2(a, 7, 2, c1, c2, c3); - sqr_add_c2(a, 6, 3, c1, c2, c3); - sqr_add_c2(a, 5, 4, c1, c2, c3); - r[9] = c1; - c1 = 0; - sqr_add_c(a, 5, c2, c3, c1); - sqr_add_c2(a, 6, 4, c2, c3, c1); - sqr_add_c2(a, 7, 3, c2, c3, c1); - r[10] = c2; - c2 = 0; - sqr_add_c2(a, 7, 4, c3, c1, c2); - sqr_add_c2(a, 6, 5, c3, c1, c2); - r[11] = c3; - c3 = 0; - sqr_add_c(a, 6, c1, c2, c3); - sqr_add_c2(a, 7, 5, c1, c2, c3); - r[12] = c1; - c1 = 0; - sqr_add_c2(a, 7, 6, c2, c3, c1); - r[13] = c2; - c2 = 0; - sqr_add_c(a, 7, c3, c1, c2); - r[14] = c3; - r[15] = c1; -} - -void bn_sqr_comba4(BN_ULONG r[8], const BN_ULONG a[4]) { - BN_ULONG c1, c2, c3; - - c1 = 0; - c2 = 0; - c3 = 0; - sqr_add_c(a, 0, c1, c2, c3); - r[0] = c1; - c1 = 0; - sqr_add_c2(a, 1, 0, c2, c3, c1); - r[1] = c2; - c2 = 0; - sqr_add_c(a, 1, c3, c1, c2); - sqr_add_c2(a, 2, 0, c3, c1, c2); - r[2] = c3; - c3 = 0; - sqr_add_c2(a, 3, 0, c1, c2, c3); - sqr_add_c2(a, 2, 1, c1, c2, c3); - r[3] = c1; - c1 = 0; - 
sqr_add_c(a, 2, c2, c3, c1); - sqr_add_c2(a, 3, 1, c2, c3, c1); - r[4] = c2; - c2 = 0; - sqr_add_c2(a, 3, 2, c3, c1, c2); - r[5] = c3; - c3 = 0; - sqr_add_c(a, 3, c1, c2, c3); - r[6] = c1; - r[7] = c2; -} - -#undef mul_add -#undef mul -#undef sqr -#undef mul_add_c -#undef mul_add_c2 -#undef sqr_add_c -#undef sqr_add_c2 - -#endif diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/generic.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/bn/generic.cc.inc new file mode 100644 index 00000000..801df67a --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/bn/generic.cc.inc @@ -0,0 +1,579 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include + +#include "internal.h" + + +using namespace bssl; + +#if !defined(BN_MUL_ASM) + +#ifdef BN_ULLONG +#define mul_add(r, a, w, c) \ + do { \ + BN_ULLONG t; \ + t = (BN_ULLONG)(w) * (a) + (r) + (c); \ + (r) = (BN_ULONG)(t); \ + (c) = (BN_ULONG)((t) >> BN_BITS2); \ + } while (0) + +#define mul(r, a, w, c) \ + do { \ + BN_ULLONG t; \ + t = (BN_ULLONG)(w) * (a) + (c); \ + (r) = (BN_ULONG)(t); \ + (c) = (BN_ULONG)((t) >> BN_BITS2); \ + } while (0) + +#define sqr(r0, r1, a) \ + do { \ + BN_ULLONG t; \ + t = (BN_ULLONG)(a) * (a); \ + (r0) = (BN_ULONG)(t); \ + (r1) = (BN_ULONG)((t) >> BN_BITS2); \ + } while (0) + +#else + +#define mul_add(r, a, w, c) \ + do { \ + BN_ULONG high, low, ret, tmp = (a); \ + ret = (r); \ + BN_UMULT_LOHI(low, high, w, tmp); \ + ret += (c); \ + (c) = (ret < (c)) ? 1 : 0; \ + (c) += high; \ + ret += low; \ + (c) += (ret < low) ? 1 : 0; \ + (r) = ret; \ + } while (0) + +#define mul(r, a, w, c) \ + do { \ + BN_ULONG high, low, ret, ta = (a); \ + BN_UMULT_LOHI(low, high, w, ta); \ + ret = low + (c); \ + (c) = high; \ + (c) += (ret < low) ? 
1 : 0; \ + (r) = ret; \ + } while (0) + +#define sqr(r0, r1, a) \ + do { \ + BN_ULONG tmp = (a); \ + BN_UMULT_LOHI(r0, r1, tmp, tmp); \ + } while (0) + +#endif // !BN_ULLONG + +BN_ULONG bssl::bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, size_t num, + BN_ULONG w) { + BN_ULONG c1 = 0; + + if (num == 0) { + return c1; + } + + while (num & ~3) { + mul_add(rp[0], ap[0], w, c1); + mul_add(rp[1], ap[1], w, c1); + mul_add(rp[2], ap[2], w, c1); + mul_add(rp[3], ap[3], w, c1); + ap += 4; + rp += 4; + num -= 4; + } + + while (num) { + mul_add(rp[0], ap[0], w, c1); + ap++; + rp++; + num--; + } + + return c1; +} + +BN_ULONG bssl::bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, size_t num, + BN_ULONG w) { + BN_ULONG c1 = 0; + + if (num == 0) { + return c1; + } + + while (num & ~3) { + mul(rp[0], ap[0], w, c1); + mul(rp[1], ap[1], w, c1); + mul(rp[2], ap[2], w, c1); + mul(rp[3], ap[3], w, c1); + ap += 4; + rp += 4; + num -= 4; + } + while (num) { + mul(rp[0], ap[0], w, c1); + ap++; + rp++; + num--; + } + return c1; +} + +void bssl::bn_sqr_add_words(BN_ULONG *r, const BN_ULONG *a, size_t n) { + if (n == 0) { + return; + } + + BN_ULONG carry = 0, lo, hi; + while (n & ~3) { + sqr(lo, hi, a[0]); + r[0] = CRYPTO_addc_w(r[0], lo, carry, &carry); + r[1] = CRYPTO_addc_w(r[1], hi, carry, &carry); + sqr(lo, hi, a[1]); + r[2] = CRYPTO_addc_w(r[2], lo, carry, &carry); + r[3] = CRYPTO_addc_w(r[3], hi, carry, &carry); + sqr(lo, hi, a[2]); + r[4] = CRYPTO_addc_w(r[4], lo, carry, &carry); + r[5] = CRYPTO_addc_w(r[5], hi, carry, &carry); + sqr(lo, hi, a[3]); + r[6] = CRYPTO_addc_w(r[6], lo, carry, &carry); + r[7] = CRYPTO_addc_w(r[7], hi, carry, &carry); + a += 4; + r += 8; + n -= 4; + } + while (n) { + sqr(lo, hi, a[0]); + r[0] = CRYPTO_addc_w(r[0], lo, carry, &carry); + r[1] = CRYPTO_addc_w(r[1], hi, carry, &carry); + a++; + r += 2; + n--; + } +} + +// mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) +// mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number 
c=(c2,c1,c0) +// sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) +// sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) + +#ifdef BN_ULLONG + +// Keep in mind that additions to multiplication result can not overflow, +// because its high half cannot be all-ones. +#define mul_add_c(a, b, c0, c1, c2) \ + do { \ + BN_ULONG hi; \ + BN_ULLONG t = (BN_ULLONG)(a) * (b); \ + t += (c0); /* no carry */ \ + (c0) = (BN_ULONG)(t); \ + hi = (BN_ULONG)((t) >> BN_BITS2); \ + (c1) += (hi); \ + (c2) += (c1) < hi; \ + } while (0) + +#define mul_add_c2(a, b, c0, c1, c2) \ + do { \ + BN_ULONG hi; \ + BN_ULLONG t = (BN_ULLONG)(a) * (b); \ + BN_ULLONG tt = t + (c0); /* no carry */ \ + (c0) = (BN_ULONG)(tt); \ + hi = (BN_ULONG)((tt) >> BN_BITS2); \ + (c1) += hi; \ + (c2) += (c1) < hi; \ + t += (c0); /* no carry */ \ + (c0) = (BN_ULONG)(t); \ + hi = (BN_ULONG)((t) >> BN_BITS2); \ + (c1) += hi; \ + (c2) += (c1) < hi; \ + } while (0) + +#define sqr_add_c(a, i, c0, c1, c2) \ + do { \ + BN_ULONG hi; \ + BN_ULLONG t = (BN_ULLONG)(a)[i] * (a)[i]; \ + t += (c0); /* no carry */ \ + (c0) = (BN_ULONG)(t); \ + hi = (BN_ULONG)((t) >> BN_BITS2); \ + (c1) += hi; \ + (c2) += (c1) < hi; \ + } while (0) + +#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2) + +#else + +// Keep in mind that additions to hi can not overflow, because the high word of +// a multiplication result cannot be all-ones. +#define mul_add_c(a, b, c0, c1, c2) \ + do { \ + BN_ULONG ta = (a), tb = (b); \ + BN_ULONG lo, hi; \ + BN_UMULT_LOHI(lo, hi, ta, tb); \ + (c0) += lo; \ + hi += ((c0) < lo) ? 1 : 0; \ + (c1) += hi; \ + (c2) += ((c1) < hi) ? 1 : 0; \ + } while (0) + +#define mul_add_c2(a, b, c0, c1, c2) \ + do { \ + BN_ULONG ta = (a), tb = (b); \ + BN_ULONG lo, hi, tt; \ + BN_UMULT_LOHI(lo, hi, ta, tb); \ + (c0) += lo; \ + tt = hi + (((c0) < lo) ? 1 : 0); \ + (c1) += tt; \ + (c2) += ((c1) < tt) ? 1 : 0; \ + (c0) += lo; \ + hi += (c0 < lo) ? 
1 : 0; \ + (c1) += hi; \ + (c2) += ((c1) < hi) ? 1 : 0; \ + } while (0) + +#define sqr_add_c(a, i, c0, c1, c2) \ + do { \ + BN_ULONG ta = (a)[i]; \ + BN_ULONG lo, hi; \ + BN_UMULT_LOHI(lo, hi, ta, ta); \ + (c0) += lo; \ + hi += (c0 < lo) ? 1 : 0; \ + (c1) += hi; \ + (c2) += ((c1) < hi) ? 1 : 0; \ + } while (0) + +#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2) + +#endif // !BN_ULLONG + +void bssl::bn_mul_comba8(BN_ULONG r[16], const BN_ULONG a[8], + const BN_ULONG b[8]) { + BN_ULONG c1, c2, c3; + + c1 = 0; + c2 = 0; + c3 = 0; + mul_add_c(a[0], b[0], c1, c2, c3); + r[0] = c1; + c1 = 0; + mul_add_c(a[0], b[1], c2, c3, c1); + mul_add_c(a[1], b[0], c2, c3, c1); + r[1] = c2; + c2 = 0; + mul_add_c(a[2], b[0], c3, c1, c2); + mul_add_c(a[1], b[1], c3, c1, c2); + mul_add_c(a[0], b[2], c3, c1, c2); + r[2] = c3; + c3 = 0; + mul_add_c(a[0], b[3], c1, c2, c3); + mul_add_c(a[1], b[2], c1, c2, c3); + mul_add_c(a[2], b[1], c1, c2, c3); + mul_add_c(a[3], b[0], c1, c2, c3); + r[3] = c1; + c1 = 0; + mul_add_c(a[4], b[0], c2, c3, c1); + mul_add_c(a[3], b[1], c2, c3, c1); + mul_add_c(a[2], b[2], c2, c3, c1); + mul_add_c(a[1], b[3], c2, c3, c1); + mul_add_c(a[0], b[4], c2, c3, c1); + r[4] = c2; + c2 = 0; + mul_add_c(a[0], b[5], c3, c1, c2); + mul_add_c(a[1], b[4], c3, c1, c2); + mul_add_c(a[2], b[3], c3, c1, c2); + mul_add_c(a[3], b[2], c3, c1, c2); + mul_add_c(a[4], b[1], c3, c1, c2); + mul_add_c(a[5], b[0], c3, c1, c2); + r[5] = c3; + c3 = 0; + mul_add_c(a[6], b[0], c1, c2, c3); + mul_add_c(a[5], b[1], c1, c2, c3); + mul_add_c(a[4], b[2], c1, c2, c3); + mul_add_c(a[3], b[3], c1, c2, c3); + mul_add_c(a[2], b[4], c1, c2, c3); + mul_add_c(a[1], b[5], c1, c2, c3); + mul_add_c(a[0], b[6], c1, c2, c3); + r[6] = c1; + c1 = 0; + mul_add_c(a[0], b[7], c2, c3, c1); + mul_add_c(a[1], b[6], c2, c3, c1); + mul_add_c(a[2], b[5], c2, c3, c1); + mul_add_c(a[3], b[4], c2, c3, c1); + mul_add_c(a[4], b[3], c2, c3, c1); + mul_add_c(a[5], b[2], c2, c3, c1); + mul_add_c(a[6], 
b[1], c2, c3, c1); + mul_add_c(a[7], b[0], c2, c3, c1); + r[7] = c2; + c2 = 0; + mul_add_c(a[7], b[1], c3, c1, c2); + mul_add_c(a[6], b[2], c3, c1, c2); + mul_add_c(a[5], b[3], c3, c1, c2); + mul_add_c(a[4], b[4], c3, c1, c2); + mul_add_c(a[3], b[5], c3, c1, c2); + mul_add_c(a[2], b[6], c3, c1, c2); + mul_add_c(a[1], b[7], c3, c1, c2); + r[8] = c3; + c3 = 0; + mul_add_c(a[2], b[7], c1, c2, c3); + mul_add_c(a[3], b[6], c1, c2, c3); + mul_add_c(a[4], b[5], c1, c2, c3); + mul_add_c(a[5], b[4], c1, c2, c3); + mul_add_c(a[6], b[3], c1, c2, c3); + mul_add_c(a[7], b[2], c1, c2, c3); + r[9] = c1; + c1 = 0; + mul_add_c(a[7], b[3], c2, c3, c1); + mul_add_c(a[6], b[4], c2, c3, c1); + mul_add_c(a[5], b[5], c2, c3, c1); + mul_add_c(a[4], b[6], c2, c3, c1); + mul_add_c(a[3], b[7], c2, c3, c1); + r[10] = c2; + c2 = 0; + mul_add_c(a[4], b[7], c3, c1, c2); + mul_add_c(a[5], b[6], c3, c1, c2); + mul_add_c(a[6], b[5], c3, c1, c2); + mul_add_c(a[7], b[4], c3, c1, c2); + r[11] = c3; + c3 = 0; + mul_add_c(a[7], b[5], c1, c2, c3); + mul_add_c(a[6], b[6], c1, c2, c3); + mul_add_c(a[5], b[7], c1, c2, c3); + r[12] = c1; + c1 = 0; + mul_add_c(a[6], b[7], c2, c3, c1); + mul_add_c(a[7], b[6], c2, c3, c1); + r[13] = c2; + c2 = 0; + mul_add_c(a[7], b[7], c3, c1, c2); + r[14] = c3; + r[15] = c1; +} + +void bssl::bn_mul_comba4(BN_ULONG r[8], const BN_ULONG a[4], + const BN_ULONG b[4]) { + BN_ULONG c1, c2, c3; + + c1 = 0; + c2 = 0; + c3 = 0; + mul_add_c(a[0], b[0], c1, c2, c3); + r[0] = c1; + c1 = 0; + mul_add_c(a[0], b[1], c2, c3, c1); + mul_add_c(a[1], b[0], c2, c3, c1); + r[1] = c2; + c2 = 0; + mul_add_c(a[2], b[0], c3, c1, c2); + mul_add_c(a[1], b[1], c3, c1, c2); + mul_add_c(a[0], b[2], c3, c1, c2); + r[2] = c3; + c3 = 0; + mul_add_c(a[0], b[3], c1, c2, c3); + mul_add_c(a[1], b[2], c1, c2, c3); + mul_add_c(a[2], b[1], c1, c2, c3); + mul_add_c(a[3], b[0], c1, c2, c3); + r[3] = c1; + c1 = 0; + mul_add_c(a[3], b[1], c2, c3, c1); + mul_add_c(a[2], b[2], c2, c3, c1); + mul_add_c(a[1], b[3], c2, c3, 
c1); + r[4] = c2; + c2 = 0; + mul_add_c(a[2], b[3], c3, c1, c2); + mul_add_c(a[3], b[2], c3, c1, c2); + r[5] = c3; + c3 = 0; + mul_add_c(a[3], b[3], c1, c2, c3); + r[6] = c1; + r[7] = c2; +} + +void bssl::bn_sqr_comba8(BN_ULONG r[16], const BN_ULONG a[8]) { + BN_ULONG c1, c2, c3; + + c1 = 0; + c2 = 0; + c3 = 0; + sqr_add_c(a, 0, c1, c2, c3); + r[0] = c1; + c1 = 0; + sqr_add_c2(a, 1, 0, c2, c3, c1); + r[1] = c2; + c2 = 0; + sqr_add_c(a, 1, c3, c1, c2); + sqr_add_c2(a, 2, 0, c3, c1, c2); + r[2] = c3; + c3 = 0; + sqr_add_c2(a, 3, 0, c1, c2, c3); + sqr_add_c2(a, 2, 1, c1, c2, c3); + r[3] = c1; + c1 = 0; + sqr_add_c(a, 2, c2, c3, c1); + sqr_add_c2(a, 3, 1, c2, c3, c1); + sqr_add_c2(a, 4, 0, c2, c3, c1); + r[4] = c2; + c2 = 0; + sqr_add_c2(a, 5, 0, c3, c1, c2); + sqr_add_c2(a, 4, 1, c3, c1, c2); + sqr_add_c2(a, 3, 2, c3, c1, c2); + r[5] = c3; + c3 = 0; + sqr_add_c(a, 3, c1, c2, c3); + sqr_add_c2(a, 4, 2, c1, c2, c3); + sqr_add_c2(a, 5, 1, c1, c2, c3); + sqr_add_c2(a, 6, 0, c1, c2, c3); + r[6] = c1; + c1 = 0; + sqr_add_c2(a, 7, 0, c2, c3, c1); + sqr_add_c2(a, 6, 1, c2, c3, c1); + sqr_add_c2(a, 5, 2, c2, c3, c1); + sqr_add_c2(a, 4, 3, c2, c3, c1); + r[7] = c2; + c2 = 0; + sqr_add_c(a, 4, c3, c1, c2); + sqr_add_c2(a, 5, 3, c3, c1, c2); + sqr_add_c2(a, 6, 2, c3, c1, c2); + sqr_add_c2(a, 7, 1, c3, c1, c2); + r[8] = c3; + c3 = 0; + sqr_add_c2(a, 7, 2, c1, c2, c3); + sqr_add_c2(a, 6, 3, c1, c2, c3); + sqr_add_c2(a, 5, 4, c1, c2, c3); + r[9] = c1; + c1 = 0; + sqr_add_c(a, 5, c2, c3, c1); + sqr_add_c2(a, 6, 4, c2, c3, c1); + sqr_add_c2(a, 7, 3, c2, c3, c1); + r[10] = c2; + c2 = 0; + sqr_add_c2(a, 7, 4, c3, c1, c2); + sqr_add_c2(a, 6, 5, c3, c1, c2); + r[11] = c3; + c3 = 0; + sqr_add_c(a, 6, c1, c2, c3); + sqr_add_c2(a, 7, 5, c1, c2, c3); + r[12] = c1; + c1 = 0; + sqr_add_c2(a, 7, 6, c2, c3, c1); + r[13] = c2; + c2 = 0; + sqr_add_c(a, 7, c3, c1, c2); + r[14] = c3; + r[15] = c1; +} + +void bssl::bn_sqr_comba4(BN_ULONG r[8], const BN_ULONG a[4]) { + BN_ULONG c1, c2, c3; + + c1 = 0; 
+ c2 = 0; + c3 = 0; + sqr_add_c(a, 0, c1, c2, c3); + r[0] = c1; + c1 = 0; + sqr_add_c2(a, 1, 0, c2, c3, c1); + r[1] = c2; + c2 = 0; + sqr_add_c(a, 1, c3, c1, c2); + sqr_add_c2(a, 2, 0, c3, c1, c2); + r[2] = c3; + c3 = 0; + sqr_add_c2(a, 3, 0, c1, c2, c3); + sqr_add_c2(a, 2, 1, c1, c2, c3); + r[3] = c1; + c1 = 0; + sqr_add_c(a, 2, c2, c3, c1); + sqr_add_c2(a, 3, 1, c2, c3, c1); + r[4] = c2; + c2 = 0; + sqr_add_c2(a, 3, 2, c3, c1, c2); + r[5] = c3; + c3 = 0; + sqr_add_c(a, 3, c1, c2, c3); + r[6] = c1; + r[7] = c2; +} + +#undef mul_add +#undef mul +#undef sqr +#undef mul_add_c +#undef mul_add_c2 +#undef sqr_add_c +#undef sqr_add_c2 + +#endif // !BN_MUL_ASM + +#if !defined(BN_ADD_ASM) + +BN_ULONG bssl::bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, + size_t n) { + if (n == 0) { + return 0; + } + + BN_ULONG carry = 0; + while (n & ~3) { + r[0] = CRYPTO_addc_w(a[0], b[0], carry, &carry); + r[1] = CRYPTO_addc_w(a[1], b[1], carry, &carry); + r[2] = CRYPTO_addc_w(a[2], b[2], carry, &carry); + r[3] = CRYPTO_addc_w(a[3], b[3], carry, &carry); + a += 4; + b += 4; + r += 4; + n -= 4; + } + while (n) { + r[0] = CRYPTO_addc_w(a[0], b[0], carry, &carry); + a++; + b++; + r++; + n--; + } + return carry; +} + +BN_ULONG bssl::bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, + size_t n) { + if (n == 0) { + return (BN_ULONG)0; + } + + BN_ULONG borrow = 0; + while (n & ~3) { + r[0] = CRYPTO_subc_w(a[0], b[0], borrow, &borrow); + r[1] = CRYPTO_subc_w(a[1], b[1], borrow, &borrow); + r[2] = CRYPTO_subc_w(a[2], b[2], borrow, &borrow); + r[3] = CRYPTO_subc_w(a[3], b[3], borrow, &borrow); + a += 4; + b += 4; + r += 4; + n -= 4; + } + while (n) { + r[0] = CRYPTO_subc_w(a[0], b[0], borrow, &borrow); + a++; + b++; + r++; + n--; + } + return borrow; +} + +#endif // !BN_ADD_ASM diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/internal.h b/third_party/boringssl/src/crypto/fipsmodule/bn/internal.h index 50fd362d..486fae10 100644 --- 
a/third_party/boringssl/src/crypto/fipsmodule/bn/internal.h +++ b/third_party/boringssl/src/crypto/fipsmodule/bn/internal.h @@ -1,142 +1,32 @@ -/* Copyright (C) 1995-1997 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. 
All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] - */ -/* ==================================================================== - * Copyright (c) 1998-2006 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. 
Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). - * - */ -/* ==================================================================== - * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED. - * - * Portions of the attached software ("Contribution") are developed by - * SUN MICROSYSTEMS, INC., and are contributed to the OpenSSL project. - * - * The Contribution is licensed pursuant to the Eric Young open source - * license provided above. - * - * The binary polynomial arithmetic software is originally written by - * Sheueling Chang Shantz and Douglas Stebila of Sun Microsystems - * Laboratories. */ - -#ifndef OPENSSL_HEADER_BN_INTERNAL_H -#define OPENSSL_HEADER_BN_INTERNAL_H +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// Copyright (c) 2002, Oracle and/or its affiliates. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_CRYPTO_FIPSMODULE_BN_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_FIPSMODULE_BN_INTERNAL_H #include #if defined(OPENSSL_X86_64) && defined(_MSC_VER) -OPENSSL_MSVC_PRAGMA(warning(push, 3)) #include -OPENSSL_MSVC_PRAGMA(warning(pop)) #pragma intrinsic(__umulh, _umul128) #endif #include "../../internal.h" -#if defined(__cplusplus) -extern "C" { -#endif + +BSSL_NAMESPACE_BEGIN #if defined(OPENSSL_64_BIT) @@ -149,6 +39,7 @@ extern "C" { #endif #define BN_BITS2 64 +#define BN_BITS2_LG 6 #define BN_BYTES 8 #define BN_BITS4 32 #define BN_MASK2 (0xffffffffffffffffUL) @@ -165,6 +56,7 @@ extern "C" { #define BN_ULLONG uint64_t #define BN_CAN_DIVIDE_ULLONG #define BN_BITS2 32 +#define BN_BITS2_LG 5 #define BN_BYTES 4 #define BN_BITS4 16 #define BN_MASK2 (0xffffffffUL) @@ -189,14 +81,20 @@ extern "C" { #define BN_CAN_USE_INLINE_ASM #endif -// |BN_mod_exp_mont_consttime| is based on the assumption that the L1 data -// cache line width of the target processor is at least the following value. -#define MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH 64 - -// The number of |BN_ULONG|s needed for the |BN_mod_exp_mont_consttime| stack- -// allocated storage buffer. The buffer is just the right size for the RSAZ -// and is about ~1KB larger than what's necessary (4480 bytes) for 1024-bit -// inputs. +// MOD_EXP_CTIME_ALIGN is the alignment needed for |BN_mod_exp_mont_consttime|'s +// tables. +// +// TODO(davidben): Historically, this alignment came from cache line +// assumptions, which we've since removed. Is 64-byte alignment still necessary +// or ideal? 
The true alignment requirement seems to now be 32 bytes, coming +// from RSAZ's use of VMOVDQA to a YMM register. Non-x86_64 has even fewer +// requirements. +#define MOD_EXP_CTIME_ALIGN 64 + +// MOD_EXP_CTIME_STORAGE_LEN is the number of |BN_ULONG|s needed for the +// |BN_mod_exp_mont_consttime| stack-allocated storage buffer. The buffer is +// just the right size for the RSAZ and is about ~1KB larger than what's +// necessary (4480 bytes) for 1024-bit inputs. #define MOD_EXP_CTIME_STORAGE_LEN \ (((320u * 3u) + (32u * 9u * 16u)) / sizeof(BN_ULONG)) @@ -206,13 +104,8 @@ extern "C" { sizeof(x) / sizeof(BN_ULONG), 0, BN_FLG_STATIC_DATA \ } -#if defined(BN_ULLONG) -#define Lw(t) ((BN_ULONG)(t)) -#define Hw(t) ((BN_ULONG)((t) >> BN_BITS2)) -#endif - -// bn_minimal_width returns the minimal value of |bn->top| which fits the -// value of |bn|. +// bn_minimal_width returns the minimal number of words needed to represent +// |bn|. int bn_minimal_width(const BIGNUM *bn); // bn_set_minimal_width sets |bn->width| to |bn_minimal_width(bn)|. If |bn| is @@ -228,7 +121,7 @@ int bn_wexpand(BIGNUM *bn, size_t words); // than a number of words. int bn_expand(BIGNUM *bn, size_t bits); -// bn_resize_words adjusts |bn->top| to be |words|. It returns one on success +// bn_resize_words adjusts |bn->width| to be |words|. It returns one on success // and zero on allocation error or if |bn|'s value is too large. OPENSSL_EXPORT int bn_resize_words(BIGNUM *bn, size_t words); @@ -257,50 +150,118 @@ int bn_fits_in_words(const BIGNUM *bn, size_t num); // is representable in |num| words. Otherwise, it returns zero. int bn_copy_words(BN_ULONG *out, size_t num, const BIGNUM *bn); +// bn_assert_fits_in_bytes asserts that |bn| fits in |num| bytes. This is a +// no-op in release builds, but triggers an assert in debug builds, and +// declassifies all bytes which are therefore known to be zero in constant-time +// validation. 
+void bn_assert_fits_in_bytes(const BIGNUM *bn, size_t num); + +// bn_secret marks |bn|'s contents, but not its width or sign, as secret. See +// |CONSTTIME_SECRET| for details. +inline void bn_secret(BIGNUM *bn) { + CONSTTIME_SECRET(bn->d, bn->width * sizeof(BN_ULONG)); +} + +// bn_declassify marks |bn|'s value as public. See |CONSTTIME_DECLASSIFY| for +// details. +inline void bn_declassify(BIGNUM *bn) { + CONSTTIME_DECLASSIFY(bn->d, bn->width * sizeof(BN_ULONG)); +} + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) +// See asm/bn-586.pl. +#define BN_ADD_ASM +#define BN_MUL_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \ + (defined(__GNUC__) || defined(__clang__)) +// See asm/x86_64-gcc.c +#define BN_ADD_ASM +#define BN_MUL_ASM +#endif + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) +// See asm/bn-armv8.pl. +#define BN_ADD_ASM +#endif + // bn_mul_add_words multiples |ap| by |w|, adds the result to |rp|, and places // the result in |rp|. |ap| and |rp| must both be |num| words long. It returns // the carry word of the operation. |ap| and |rp| may be equal but otherwise may // not alias. -BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, size_t num, - BN_ULONG w); +#if defined(BN_MUL_ASM) +extern "C" +#endif + BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, size_t num, + BN_ULONG w); // bn_mul_words multiples |ap| by |w| and places the result in |rp|. |ap| and // |rp| must both be |num| words long. It returns the carry word of the // operation. |ap| and |rp| may be equal but otherwise may not alias. -BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, size_t num, BN_ULONG w); +#if defined(BN_MUL_ASM) +extern "C" +#endif + BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, size_t num, + BN_ULONG w); -// bn_sqr_words sets |rp[2*i]| and |rp[2*i+1]| to |ap[i]|'s square, for all |i| -// up to |num|. |ap| is an array of |num| words and |rp| an array of |2*num| -// words. 
|ap| and |rp| may not alias. +// bn_sqr_add_words computes |tmp| where |tmp[2*i]| and |tmp[2*i+1]| are +// |ap[i]|'s square, for all |i| up to |num|, and adds the result to |rp|. If +// the result does not fit in |2*num| words, the final carry bit is truncated. +// |ap| is an array of |num| words and |rp| an array of |2*num| words. |ap| and +// |rp| may not alias. // // This gives the contribution of the |ap[i]*ap[i]| terms when squaring |ap|. -void bn_sqr_words(BN_ULONG *rp, const BN_ULONG *ap, size_t num); +#if defined(BN_MUL_ASM) +extern "C" +#endif + void bn_sqr_add_words(BN_ULONG *rp, const BN_ULONG *ap, size_t num); // bn_add_words adds |ap| to |bp| and places the result in |rp|, each of which // are |num| words long. It returns the carry bit, which is one if the operation // overflowed and zero otherwise. Any pair of |ap|, |bp|, and |rp| may be equal // to each other but otherwise may not alias. -BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, - size_t num); + +#if defined(BN_ADD_ASM) +extern "C" +#endif + BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, + size_t num); // bn_sub_words subtracts |bp| from |ap| and places the result in |rp|. It // returns the borrow bit, which is one if the computation underflowed and zero // otherwise. Any pair of |ap|, |bp|, and |rp| may be equal to each other but // otherwise may not alias. -BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, - size_t num); +#if defined(BN_ADD_ASM) +extern "C" +#endif + BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, + size_t num); // bn_mul_comba4 sets |r| to the product of |a| and |b|. -void bn_mul_comba4(BN_ULONG r[8], const BN_ULONG a[4], const BN_ULONG b[4]); +#if defined(BN_MUL_ASM) +extern "C" +#endif + void bn_mul_comba4(BN_ULONG r[8], const BN_ULONG a[4], const BN_ULONG b[4]); // bn_mul_comba8 sets |r| to the product of |a| and |b|. 
-void bn_mul_comba8(BN_ULONG r[16], const BN_ULONG a[8], const BN_ULONG b[8]); +#if defined(BN_MUL_ASM) +extern "C" +#endif + void bn_mul_comba8(BN_ULONG r[16], const BN_ULONG a[8], + const BN_ULONG b[8]); // bn_sqr_comba8 sets |r| to |a|^2. -void bn_sqr_comba8(BN_ULONG r[16], const BN_ULONG a[8]); +#if defined(BN_MUL_ASM) +extern "C" +#endif + void bn_sqr_comba8(BN_ULONG r[16], const BN_ULONG a[8]); // bn_sqr_comba4 sets |r| to |a|^2. -void bn_sqr_comba4(BN_ULONG r[8], const BN_ULONG a[4]); +#if defined(BN_MUL_ASM) +extern "C" +#endif + void bn_sqr_comba4(BN_ULONG r[8], const BN_ULONG a[4]); // bn_less_than_words returns one if |a| < |b| and zero otherwise, where |a| // and |b| both are |len| words long. It runs in constant time. @@ -344,76 +305,167 @@ int bn_rand_range_words(BN_ULONG *out, BN_ULONG min_inclusive, int bn_rand_secret_range(BIGNUM *r, int *out_is_uniform, BN_ULONG min_inclusive, const BIGNUM *max_exclusive); +// BN_MONTGOMERY_MAX_WORDS is the maximum number of words allowed in a |BIGNUM| +// used with Montgomery reduction. Ideally this limit would be applied to all +// |BIGNUM|s, in |bn_wexpand|, but the exactfloat library needs to create 8 MiB +// values for other operations. +// +// This limit is set so that one number fits within 2 KiB, giving room to +// allocate a few of them on the stack. It is also set to limit the DoS impact +// of large RSA, DH, and DSA keys, which scale cubically. +#define BN_MONTGOMERY_MAX_WORDS (16384 / BN_BITS2) + +BSSL_NAMESPACE_END + +struct bn_mont_ctx_st { + // RR is R^2, reduced modulo |N|. It is used to convert to Montgomery form. It + // is guaranteed to have the same width as |N|. + BIGNUM RR; + // N is the modulus. It is always stored in minimal form, so |N.width| + // determines R. 
+ BIGNUM N; + BN_ULONG n0[BN_MONT_CTX_N0_LIMBS]; // least significant words of (R*Ri-1)/N +}; + +BSSL_NAMESPACE_BEGIN + #if !defined(OPENSSL_NO_ASM) && \ (defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || \ defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) #define OPENSSL_BN_ASM_MONT -// bn_mul_mont writes |ap| * |bp| mod |np| to |rp|, each |num| words +// bn_mul_mont_words writes |ap| * |bp| mod |np| to |rp|, each |num| words // long. Inputs and outputs are in Montgomery form. |n0| is a pointer to the -// corresponding field in |BN_MONT_CTX|. It returns one if |bn_mul_mont| handles -// inputs of this size and zero otherwise. +// corresponding field in |BN_MONT_CTX|. // // If at least one of |ap| or |bp| is fully reduced, |rp| will be fully reduced. // If neither is fully-reduced, the output may not be either. // +// This function allocates up to 2 * |num| words (plus a constant allocation) on +// the stack, so |num| should be at most |BN_MONTGOMERY_MAX_WORDS|. +// Additionally, |num| must be at least 128 / |BN_BITS2|. +// // TODO(davidben): The x86_64 implementation expects a 32-bit input and masks // off upper bits. The aarch64 implementation expects a 64-bit input and does // not. |size_t| is the safer option but not strictly correct for x86_64. But -// this function implicitly already has a bound on the size of |num| because it -// internally creates |num|-sized stack allocation. +// the |BN_MONTGOMERY_MAX_WORDS| bound makes this moot. // // See also discussion in |ToWord| in abi_test.h for notes on smaller-than-word // inputs. -int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, - const BN_ULONG *np, const BN_ULONG *n0, size_t num); +extern "C" void bn_mul_mont_words(BN_ULONG *rp, const BN_ULONG *ap, + const BN_ULONG *bp, const BN_ULONG *np, + const BN_ULONG n0[BN_MONT_CTX_N0_LIMBS], + size_t num); + +#if defined(OPENSSL_X86_64) +inline int bn_mulx_adx_capable() { + // MULX is in BMI2. 
+ return CRYPTO_is_BMI2_capable() && CRYPTO_is_ADX_capable(); +} +extern "C" void bn_mul_mont_nohw(BN_ULONG *rp, const BN_ULONG *ap, + const BN_ULONG *bp, const BN_ULONG *np, + const BN_ULONG n0[BN_MONT_CTX_N0_LIMBS], + size_t num); +inline int bn_mul4x_mont_capable(size_t num) { + return num >= 8 && (num & 3) == 0; +} +extern "C" void bn_mul4x_mont(BN_ULONG *rp, const BN_ULONG *ap, + const BN_ULONG *bp, const BN_ULONG *np, + const BN_ULONG n0[BN_MONT_CTX_N0_LIMBS], + size_t num); +inline int bn_mulx4x_mont_capable(size_t num) { + return bn_mul4x_mont_capable(num) && bn_mulx_adx_capable(); +} +extern "C" void bn_mulx4x_mont(BN_ULONG *rp, const BN_ULONG *ap, + const BN_ULONG *bp, const BN_ULONG *np, + const BN_ULONG n0[BN_MONT_CTX_N0_LIMBS], + size_t num); +inline int bn_sqr8x_mont_capable(size_t num) { + return num >= 8 && (num & 7) == 0; +} +extern "C" void bn_sqr8x_mont(BN_ULONG *rp, const BN_ULONG *ap, + BN_ULONG mulx_adx_capable, const BN_ULONG *np, + const BN_ULONG n0[BN_MONT_CTX_N0_LIMBS], + size_t num); +#elif defined(OPENSSL_ARM) +inline int bn_mul8x_mont_neon_capable(size_t num) { + return (num & 7) == 0 && CRYPTO_is_NEON_capable(); +} +extern "C" void bn_mul8x_mont_neon(BN_ULONG *rp, const BN_ULONG *ap, + const BN_ULONG *bp, const BN_ULONG *np, + const BN_ULONG n0[BN_MONT_CTX_N0_LIMBS], + size_t num); +extern "C" void bn_mul_mont_nohw(BN_ULONG *rp, const BN_ULONG *ap, + const BN_ULONG *bp, const BN_ULONG *np, + const BN_ULONG n0[BN_MONT_CTX_N0_LIMBS], + size_t num); #endif +#endif // OPENSSL_BN_ASM_MONT + #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) #define OPENSSL_BN_ASM_MONT5 -// bn_mul_mont_gather5 multiples loads index |power| of |table|, multiplies it -// by |ap| modulo |np|, and stores the result in |rp|. The values are |num| -// words long and represented in Montgomery form. |n0| is a pointer to the -// corresponding field in |BN_MONT_CTX|. 
-// -// WARNING: This function implements Almost Montgomery Multiplication from -// https://eprint.iacr.org/2011/239. The inputs do not need to be fully reduced. -// However, even if they are fully reduced, the output may not be. -void bn_mul_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap, - const BN_ULONG *table, const BN_ULONG *np, - const BN_ULONG *n0, int num, int power); +// The following functions implement |bn_mul_mont_gather5|. See +// |bn_mul_mont_gather5| for details. +inline int bn_mul4x_mont_gather5_capable(int num) { return (num & 7) == 0; } +extern "C" void bn_mul4x_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap, + const BN_ULONG *table, const BN_ULONG *np, + const BN_ULONG n0[BN_MONT_CTX_N0_LIMBS], + int num, int power); + +inline int bn_mulx4x_mont_gather5_capable(int num) { + return bn_mul4x_mont_gather5_capable(num) && CRYPTO_is_ADX_capable() && + CRYPTO_is_BMI1_capable() && CRYPTO_is_BMI2_capable(); +} +extern "C" void bn_mulx4x_mont_gather5(BN_ULONG *rp, const BN_ULONG *ap, + const BN_ULONG *table, + const BN_ULONG *np, + const BN_ULONG n0[BN_MONT_CTX_N0_LIMBS], + int num, int power); + +extern "C" void bn_mul_mont_gather5_nohw( + BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *table, const BN_ULONG *np, + const BN_ULONG n0[BN_MONT_CTX_N0_LIMBS], int num, int power); // bn_scatter5 stores |inp| to index |power| of |table|. |inp| and each entry of -// |table| are |num| words long. |power| must be less than 32. |table| must be -// 32*|num| words long. -void bn_scatter5(const BN_ULONG *inp, size_t num, BN_ULONG *table, - size_t power); +// |table| are |num| words long. |power| must be less than 32 and is treated as +// public. |table| must be 32*|num| words long. |table| must be aligned to at +// least 16 bytes. +extern "C" void bn_scatter5(const BN_ULONG *inp, size_t num, BN_ULONG *table, + size_t power); // bn_gather5 loads index |power| of |table| and stores it in |out|. |out| and -// each entry of |table| are |num| words long. 
|power| must be less than 32. -void bn_gather5(BN_ULONG *out, size_t num, const BN_ULONG *table, size_t power); - -// bn_power5 squares |ap| five times and multiplies it by the value stored at -// index |power| of |table|, modulo |np|. It stores the result in |rp|. The -// values are |num| words long and represented in Montgomery form. |n0| is a -// pointer to the corresponding field in |BN_MONT_CTX|. |num| must be divisible -// by 8. -// -// WARNING: This function implements Almost Montgomery Multiplication from -// https://eprint.iacr.org/2011/239. The inputs do not need to be fully reduced. -// However, even if they are fully reduced, the output may not be. -void bn_power5(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *table, - const BN_ULONG *np, const BN_ULONG *n0, int num, int power); +// each entry of |table| are |num| words long. |power| must be less than 32 and +// is treated as secret. |table| must be aligned to at least 16 bytes. +extern "C" void bn_gather5(BN_ULONG *out, size_t num, const BN_ULONG *table, + size_t power); + +// The following functions implement |bn_power5|. See |bn_power5| for details. +extern "C" void bn_power5_nohw(BN_ULONG *rp, const BN_ULONG *ap, + const BN_ULONG *table, const BN_ULONG *np, + const BN_ULONG n0[BN_MONT_CTX_N0_LIMBS], int num, + int power); + +inline int bn_power5_capable(int num) { return (num & 7) == 0; } + +inline int bn_powerx5_capable(int num) { + return bn_power5_capable(num) && CRYPTO_is_ADX_capable() && + CRYPTO_is_BMI1_capable() && CRYPTO_is_BMI2_capable(); +} +extern "C" void bn_powerx5(BN_ULONG *rp, const BN_ULONG *ap, + const BN_ULONG *table, const BN_ULONG *np, + const BN_ULONG n0[BN_MONT_CTX_N0_LIMBS], int num, + int power); + #endif // !OPENSSL_NO_ASM && OPENSSL_X86_64 uint64_t bn_mont_n0(const BIGNUM *n); -// bn_mod_exp_base_2_consttime calculates r = 2**p (mod n). |p| must be larger -// than log_2(n); i.e. 2**p must be larger than |n|. |n| must be positive and -// odd. 
|p| and the bit width of |n| are assumed public, but |n| is otherwise -// treated as secret. -int bn_mod_exp_base_2_consttime(BIGNUM *r, unsigned p, const BIGNUM *n, - BN_CTX *ctx); +// bn_mont_ctx_set_RR_consttime initializes |mont->RR|. It returns one on +// success and zero on error. |mont->N| and |mont->n0| must have been +// initialized already. The bit width of |mont->N| is assumed public, but +// |mont->N| is otherwise treated as secret. +int bn_mont_ctx_set_RR_consttime(BN_MONT_CTX *mont, BN_CTX *ctx); #if defined(_MSC_VER) #if defined(OPENSSL_X86_64) @@ -439,7 +491,7 @@ int bn_jacobi(const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx); // bn_is_bit_set_words returns one if bit |bit| is set in |a| and zero // otherwise. -int bn_is_bit_set_words(const BN_ULONG *a, size_t num, unsigned bit); +int bn_is_bit_set_words(const BN_ULONG *a, size_t num, size_t bit); // bn_one_to_montgomery sets |r| to one in Montgomery form. It returns one on // success and zero on error. This function treats the bit width of the modulus @@ -461,7 +513,7 @@ int bn_odd_number_is_obviously_composite(const BIGNUM *bn); // A BN_MILLER_RABIN stores state common to each Miller-Rabin iteration. It is // initialized within an existing |BN_CTX| scope and may not be used after // that scope is released with |BN_CTX_end|. Field names match those in FIPS -// 186-4, section C.3.1. +// 186-5, section B.3.1. typedef struct { // w1 is w-1. BIGNUM *w1; @@ -577,6 +629,13 @@ OPENSSL_EXPORT int bn_is_relatively_prime(int *out_relatively_prime, OPENSSL_EXPORT int bn_lcm_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx); +// bn_mont_ctx_init zero-initialies |mont|. +void bn_mont_ctx_init(BN_MONT_CTX *mont); + +// bn_mont_ctx_cleanup releases memory associated with |mont|, without freeing +// |mont| itself. +void bn_mont_ctx_cleanup(BN_MONT_CTX *mont); + // Constant-time modular arithmetic. 
// @@ -635,6 +694,15 @@ int bn_mod_inverse_prime(BIGNUM *out, const BIGNUM *a, const BIGNUM *p, int bn_mod_inverse_secret_prime(BIGNUM *out, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx, const BN_MONT_CTX *mont_p); +// BN_MONT_CTX_set_locked takes |lock| and checks whether |*pmont| is NULL. If +// so, it creates a new |BN_MONT_CTX| and sets the modulus for it to |mod|. It +// then stores it as |*pmont|. It returns one on success and zero on error. Note +// this function assumes |mod| is public. +// +// If |*pmont| is already non-NULL then it does nothing and returns one. +int BN_MONT_CTX_set_locked(UniquePtr *pmont, Mutex *lock, + const BIGNUM *mod, BN_CTX *bn_ctx); + // Low-level operations for small numbers. // @@ -657,8 +725,8 @@ int bn_mod_inverse_secret_prime(BIGNUM *out, const BIGNUM *a, const BIGNUM *p, void bn_mul_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a, size_t num_a, const BN_ULONG *b, size_t num_b); -// bn_sqr_small sets |r| to |a|^2. |num_a| must be at most |BN_SMALL_MAX_WORDS|. -// |num_r| must be |num_a|*2. |r| and |a| may not alias. +// bn_sqr_small sets |r| to |a|^2. |num_r| must be |num_a|*2. |r| and |a| may +// not alias. void bn_sqr_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a, size_t num_a); // In the following functions, the modulus must be at most |BN_SMALL_MAX_WORDS| @@ -690,9 +758,10 @@ void bn_mod_mul_montgomery_small(BN_ULONG *r, const BN_ULONG *a, // bn_mod_exp_mont_small sets |r| to |a|^|p| mod |mont->N|. It returns one on // success and zero on programmer or internal error. Both inputs and outputs are // in the Montgomery domain. |r| and |a| are |num| words long, which must be -// |mont->N.width| and at most |BN_SMALL_MAX_WORDS|. |a| must be fully-reduced. -// This function runs in time independent of |a|, but |p| and |mont->N| are -// public values. |a| must be fully-reduced and may alias with |r|. +// |mont->N.width| and at most |BN_SMALL_MAX_WORDS|. |num_p|, measured in bits, +// must fit in |size_t|. 
|a| must be fully-reduced. This function runs in time +// independent of |a|, but |p| and |mont->N| are public values. |a| must be +// fully-reduced and may alias with |r|. // // Note this function differs from |BN_mod_exp_mont| which uses Montgomery // reduction but takes input and output outside the Montgomery domain. Combine @@ -715,8 +784,8 @@ void bn_mod_inverse0_prime_mont_small(BN_ULONG *r, const BN_ULONG *a, // bn_big_endian_to_words interprets |in_len| bytes from |in| as a big-endian, // unsigned integer and writes the result to |out_len| words in |out|. |out_len| -// must be large enough to represent any |in_len|-byte value. That is, |out_len| -// must be at least |BN_BYTES * in_len|. +// must be large enough to represent any |in_len|-byte value. That is, |in_len| +// must be at most |BN_BYTES * out_len|. void bn_big_endian_to_words(BN_ULONG *out, size_t out_len, const uint8_t *in, size_t in_len); @@ -729,9 +798,7 @@ void bn_big_endian_to_words(BN_ULONG *out, size_t out_len, const uint8_t *in, void bn_words_to_big_endian(uint8_t *out, size_t out_len, const BN_ULONG *in, size_t in_len); +BSSL_NAMESPACE_END -#if defined(__cplusplus) -} // extern C -#endif -#endif // OPENSSL_HEADER_BN_INTERNAL_H +#endif // OPENSSL_HEADER_CRYPTO_FIPSMODULE_BN_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/jacobi.c b/third_party/boringssl/src/crypto/fipsmodule/bn/jacobi.c deleted file mode 100644 index d1a9d506..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/bn/jacobi.c +++ /dev/null @@ -1,146 +0,0 @@ -/* ==================================================================== - * Copyright (c) 1998-2000 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. 
- * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. 
- * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). */ - -#include - -#include - -#include "internal.h" - - -// least significant word -#define BN_lsw(n) (((n)->width == 0) ? (BN_ULONG) 0 : (n)->d[0]) - -int bn_jacobi(const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) { - // In 'tab', only odd-indexed entries are relevant: - // For any odd BIGNUM n, - // tab[BN_lsw(n) & 7] - // is $(-1)^{(n^2-1)/8}$ (using TeX notation). - // Note that the sign of n does not matter. - static const int tab[8] = {0, 1, 0, -1, 0, -1, 0, 1}; - - // The Jacobi symbol is only defined for odd modulus. - if (!BN_is_odd(b)) { - OPENSSL_PUT_ERROR(BN, BN_R_CALLED_WITH_EVEN_MODULUS); - return -2; - } - - // Require b be positive. - if (BN_is_negative(b)) { - OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER); - return -2; - } - - int ret = -2; - BN_CTX_start(ctx); - BIGNUM *A = BN_CTX_get(ctx); - BIGNUM *B = BN_CTX_get(ctx); - if (B == NULL) { - goto end; - } - - if (!BN_copy(A, a) || - !BN_copy(B, b)) { - goto end; - } - - // Adapted from logic to compute the Kronecker symbol, originally implemented - // according to Henri Cohen, "A Course in Computational Algebraic Number - // Theory" (algorithm 1.4.10). - - ret = 1; - - while (1) { - // Cohen's step 3: - - // B is positive and odd - if (BN_is_zero(A)) { - ret = BN_is_one(B) ? ret : 0; - goto end; - } - - // now A is non-zero - int i = 0; - while (!BN_is_bit_set(A, i)) { - i++; - } - if (!BN_rshift(A, A, i)) { - ret = -2; - goto end; - } - if (i & 1) { - // i is odd - // multiply 'ret' by $(-1)^{(B^2-1)/8}$ - ret = ret * tab[BN_lsw(B) & 7]; - } - - // Cohen's step 4: - // multiply 'ret' by $(-1)^{(A-1)(B-1)/4}$ - if ((A->neg ? 
~BN_lsw(A) : BN_lsw(A)) & BN_lsw(B) & 2) { - ret = -ret; - } - - // (A, B) := (B mod |A|, |A|) - if (!BN_nnmod(B, B, A, ctx)) { - ret = -2; - goto end; - } - BIGNUM *tmp = A; - A = B; - B = tmp; - tmp->neg = 0; - } - -end: - BN_CTX_end(ctx); - return ret; -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/jacobi.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/bn/jacobi.cc.inc new file mode 100644 index 00000000..f04bcc98 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/bn/jacobi.cc.inc @@ -0,0 +1,101 @@ +// Copyright 2000-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include "internal.h" + + +using namespace bssl; + +// least significant word +#define BN_lsw(n) (((n)->width == 0) ? (BN_ULONG)0 : (n)->d[0]) + +int bssl::bn_jacobi(const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) { + // In 'tab', only odd-indexed entries are relevant: + // For any odd BIGNUM n, + // tab[BN_lsw(n) & 7] + // is $(-1)^{(n^2-1)/8}$ (using TeX notation). + // Note that the sign of n does not matter. + static const int tab[8] = {0, 1, 0, -1, 0, -1, 0, 1}; + + // The Jacobi symbol is only defined for odd modulus. + if (!BN_is_odd(b)) { + OPENSSL_PUT_ERROR(BN, BN_R_CALLED_WITH_EVEN_MODULUS); + return -2; + } + + // Require b be positive. 
+ if (BN_is_negative(b)) { + OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER); + return -2; + } + + BN_CTXScope scope(ctx); + BIGNUM *A = BN_CTX_get(ctx); + BIGNUM *B = BN_CTX_get(ctx); + if (B == nullptr) { + return -2; + } + + if (!BN_copy(A, a) || + !BN_copy(B, b)) { + return -2; + } + + // Adapted from logic to compute the Kronecker symbol, originally implemented + // according to Henri Cohen, "A Course in Computational Algebraic Number + // Theory" (algorithm 1.4.10). + + int ret = 1; + while (1) { + // Cohen's step 3: + + // B is positive and odd + if (BN_is_zero(A)) { + return BN_is_one(B) ? ret : 0; + } + + // now A is non-zero + int i = 0; + while (!BN_is_bit_set(A, i)) { + i++; + } + if (!BN_rshift(A, A, i)) { + return -2; + } + if (i & 1) { + // i is odd + // multiply 'ret' by $(-1)^{(B^2-1)/8}$ + ret = ret * tab[BN_lsw(B) & 7]; + } + + // Cohen's step 4: + // multiply 'ret' by $(-1)^{(A-1)(B-1)/4}$ + if ((A->neg ? ~BN_lsw(A) : BN_lsw(A)) & BN_lsw(B) & 2) { + ret = -ret; + } + + // (A, B) := (B mod |A|, |A|) + if (!BN_nnmod(B, B, A, ctx)) { + return -2; + } + BIGNUM *tmp = A; + A = B; + B = tmp; + tmp->neg = 0; + } +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/montgomery.c b/third_party/boringssl/src/crypto/fipsmodule/bn/montgomery.c deleted file mode 100644 index d04e91a1..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/bn/montgomery.c +++ /dev/null @@ -1,500 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. 
The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] - */ -/* ==================================================================== - * Copyright (c) 1998-2006 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. 
Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). 
*/ - -#include - -#include -#include -#include -#include - -#include -#include -#include - -#include "internal.h" -#include "../../internal.h" - - -BN_MONT_CTX *BN_MONT_CTX_new(void) { - BN_MONT_CTX *ret = OPENSSL_malloc(sizeof(BN_MONT_CTX)); - - if (ret == NULL) { - return NULL; - } - - OPENSSL_memset(ret, 0, sizeof(BN_MONT_CTX)); - BN_init(&ret->RR); - BN_init(&ret->N); - - return ret; -} - -void BN_MONT_CTX_free(BN_MONT_CTX *mont) { - if (mont == NULL) { - return; - } - - BN_free(&mont->RR); - BN_free(&mont->N); - OPENSSL_free(mont); -} - -BN_MONT_CTX *BN_MONT_CTX_copy(BN_MONT_CTX *to, const BN_MONT_CTX *from) { - if (to == from) { - return to; - } - - if (!BN_copy(&to->RR, &from->RR) || - !BN_copy(&to->N, &from->N)) { - return NULL; - } - to->n0[0] = from->n0[0]; - to->n0[1] = from->n0[1]; - return to; -} - -static int bn_mont_ctx_set_N_and_n0(BN_MONT_CTX *mont, const BIGNUM *mod) { - if (BN_is_zero(mod)) { - OPENSSL_PUT_ERROR(BN, BN_R_DIV_BY_ZERO); - return 0; - } - if (!BN_is_odd(mod)) { - OPENSSL_PUT_ERROR(BN, BN_R_CALLED_WITH_EVEN_MODULUS); - return 0; - } - if (BN_is_negative(mod)) { - OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER); - return 0; - } - - // Save the modulus. - if (!BN_copy(&mont->N, mod)) { - OPENSSL_PUT_ERROR(BN, ERR_R_INTERNAL_ERROR); - return 0; - } - // |mont->N| is always stored minimally. Computing RR efficiently leaks the - // size of the modulus. While the modulus may be private in RSA (one of the - // primes), their sizes are public, so this is fine. - bn_set_minimal_width(&mont->N); - - // Find n0 such that n0 * N == -1 (mod r). - // - // Only certain BN_BITS2<=32 platforms actually make use of n0[1]. For the - // others, we could use a shorter R value and use faster |BN_ULONG|-based - // math instead of |uint64_t|-based math, which would be double-precision. - // However, currently only the assembler files know which is which. 
- static_assert(BN_MONT_CTX_N0_LIMBS == 1 || BN_MONT_CTX_N0_LIMBS == 2, - "BN_MONT_CTX_N0_LIMBS value is invalid"); - static_assert(sizeof(BN_ULONG) * BN_MONT_CTX_N0_LIMBS == sizeof(uint64_t), - "uint64_t is insufficient precision for n0"); - uint64_t n0 = bn_mont_n0(&mont->N); - mont->n0[0] = (BN_ULONG)n0; -#if BN_MONT_CTX_N0_LIMBS == 2 - mont->n0[1] = (BN_ULONG)(n0 >> BN_BITS2); -#else - mont->n0[1] = 0; -#endif - return 1; -} - -int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx) { - if (!bn_mont_ctx_set_N_and_n0(mont, mod)) { - return 0; - } - - BN_CTX *new_ctx = NULL; - if (ctx == NULL) { - new_ctx = BN_CTX_new(); - if (new_ctx == NULL) { - return 0; - } - ctx = new_ctx; - } - - // Save RR = R**2 (mod N). R is the smallest power of 2**BN_BITS2 such that R - // > mod. Even though the assembly on some 32-bit platforms works with 64-bit - // values, using |BN_BITS2| here, rather than |BN_MONT_CTX_N0_LIMBS * - // BN_BITS2|, is correct because R**2 will still be a multiple of the latter - // as |BN_MONT_CTX_N0_LIMBS| is either one or two. 
- unsigned lgBigR = mont->N.width * BN_BITS2; - BN_zero(&mont->RR); - int ok = BN_set_bit(&mont->RR, lgBigR * 2) && - BN_mod(&mont->RR, &mont->RR, &mont->N, ctx) && - bn_resize_words(&mont->RR, mont->N.width); - BN_CTX_free(new_ctx); - return ok; -} - -BN_MONT_CTX *BN_MONT_CTX_new_for_modulus(const BIGNUM *mod, BN_CTX *ctx) { - BN_MONT_CTX *mont = BN_MONT_CTX_new(); - if (mont == NULL || - !BN_MONT_CTX_set(mont, mod, ctx)) { - BN_MONT_CTX_free(mont); - return NULL; - } - return mont; -} - -BN_MONT_CTX *BN_MONT_CTX_new_consttime(const BIGNUM *mod, BN_CTX *ctx) { - BN_MONT_CTX *mont = BN_MONT_CTX_new(); - if (mont == NULL || - !bn_mont_ctx_set_N_and_n0(mont, mod)) { - goto err; - } - unsigned lgBigR = mont->N.width * BN_BITS2; - if (!bn_mod_exp_base_2_consttime(&mont->RR, lgBigR * 2, &mont->N, ctx) || - !bn_resize_words(&mont->RR, mont->N.width)) { - goto err; - } - return mont; - -err: - BN_MONT_CTX_free(mont); - return NULL; -} - -int BN_MONT_CTX_set_locked(BN_MONT_CTX **pmont, CRYPTO_MUTEX *lock, - const BIGNUM *mod, BN_CTX *bn_ctx) { - CRYPTO_MUTEX_lock_read(lock); - BN_MONT_CTX *ctx = *pmont; - CRYPTO_MUTEX_unlock_read(lock); - - if (ctx) { - return 1; - } - - CRYPTO_MUTEX_lock_write(lock); - if (*pmont == NULL) { - *pmont = BN_MONT_CTX_new_for_modulus(mod, bn_ctx); - } - const int ok = *pmont != NULL; - CRYPTO_MUTEX_unlock_write(lock); - return ok; -} - -int BN_to_montgomery(BIGNUM *ret, const BIGNUM *a, const BN_MONT_CTX *mont, - BN_CTX *ctx) { - return BN_mod_mul_montgomery(ret, a, &mont->RR, mont, ctx); -} - -static int bn_from_montgomery_in_place(BN_ULONG *r, size_t num_r, BN_ULONG *a, - size_t num_a, const BN_MONT_CTX *mont) { - const BN_ULONG *n = mont->N.d; - size_t num_n = mont->N.width; - if (num_r != num_n || num_a != 2 * num_n) { - OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); - return 0; - } - - // Add multiples of |n| to |r| until R = 2^(nl * BN_BITS2) divides it. On - // input, we had |r| < |n| * R, so now |r| < 2 * |n| * R. 
Note that |r| - // includes |carry| which is stored separately. - BN_ULONG n0 = mont->n0[0]; - BN_ULONG carry = 0; - for (size_t i = 0; i < num_n; i++) { - BN_ULONG v = bn_mul_add_words(a + i, n, num_n, a[i] * n0); - v += carry + a[i + num_n]; - carry |= (v != a[i + num_n]); - carry &= (v <= a[i + num_n]); - a[i + num_n] = v; - } - - // Shift |num_n| words to divide by R. We have |a| < 2 * |n|. Note that |a| - // includes |carry| which is stored separately. - a += num_n; - - // |a| thus requires at most one additional subtraction |n| to be reduced. - bn_reduce_once(r, a, carry, n, num_n); - return 1; -} - -static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, - const BN_MONT_CTX *mont) { - if (r->neg) { - OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER); - return 0; - } - - const BIGNUM *n = &mont->N; - if (n->width == 0) { - ret->width = 0; - return 1; - } - - int max = 2 * n->width; // carry is stored separately - if (!bn_resize_words(r, max) || - !bn_wexpand(ret, n->width)) { - return 0; - } - - ret->width = n->width; - ret->neg = 0; - return bn_from_montgomery_in_place(ret->d, ret->width, r->d, r->width, mont); -} - -int BN_from_montgomery(BIGNUM *r, const BIGNUM *a, const BN_MONT_CTX *mont, - BN_CTX *ctx) { - int ret = 0; - BIGNUM *t; - - BN_CTX_start(ctx); - t = BN_CTX_get(ctx); - if (t == NULL || - !BN_copy(t, a)) { - goto err; - } - - ret = BN_from_montgomery_word(r, t, mont); - -err: - BN_CTX_end(ctx); - - return ret; -} - -int bn_one_to_montgomery(BIGNUM *r, const BN_MONT_CTX *mont, BN_CTX *ctx) { - // If the high bit of |n| is set, R = 2^(width*BN_BITS2) < 2 * |n|, so we - // compute R - |n| rather than perform Montgomery reduction. 
- const BIGNUM *n = &mont->N; - if (n->width > 0 && (n->d[n->width - 1] >> (BN_BITS2 - 1)) != 0) { - if (!bn_wexpand(r, n->width)) { - return 0; - } - r->d[0] = 0 - n->d[0]; - for (int i = 1; i < n->width; i++) { - r->d[i] = ~n->d[i]; - } - r->width = n->width; - r->neg = 0; - return 1; - } - - return BN_from_montgomery(r, &mont->RR, mont, ctx); -} - -static int bn_mod_mul_montgomery_fallback(BIGNUM *r, const BIGNUM *a, - const BIGNUM *b, - const BN_MONT_CTX *mont, - BN_CTX *ctx) { - int ret = 0; - - BN_CTX_start(ctx); - BIGNUM *tmp = BN_CTX_get(ctx); - if (tmp == NULL) { - goto err; - } - - if (a == b) { - if (!bn_sqr_consttime(tmp, a, ctx)) { - goto err; - } - } else { - if (!bn_mul_consttime(tmp, a, b, ctx)) { - goto err; - } - } - - // reduce from aRR to aR - if (!BN_from_montgomery_word(r, tmp, mont)) { - goto err; - } - - ret = 1; - -err: - BN_CTX_end(ctx); - return ret; -} - -int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, - const BN_MONT_CTX *mont, BN_CTX *ctx) { - if (a->neg || b->neg) { - OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER); - return 0; - } - -#if defined(OPENSSL_BN_ASM_MONT) - // |bn_mul_mont| requires at least 128 bits of limbs, at least for x86. - int num = mont->N.width; - if (num >= (128 / BN_BITS2) && - a->width == num && - b->width == num) { - if (!bn_wexpand(r, num)) { - return 0; - } - if (!bn_mul_mont(r->d, a->d, b->d, mont->N.d, mont->n0, num)) { - // The check above ensures this won't happen. 
- assert(0); - OPENSSL_PUT_ERROR(BN, ERR_R_INTERNAL_ERROR); - return 0; - } - r->neg = 0; - r->width = num; - return 1; - } -#endif - - return bn_mod_mul_montgomery_fallback(r, a, b, mont, ctx); -} - -int bn_less_than_montgomery_R(const BIGNUM *bn, const BN_MONT_CTX *mont) { - return !BN_is_negative(bn) && - bn_fits_in_words(bn, mont->N.width); -} - -void bn_to_montgomery_small(BN_ULONG *r, const BN_ULONG *a, size_t num, - const BN_MONT_CTX *mont) { - bn_mod_mul_montgomery_small(r, a, mont->RR.d, num, mont); -} - -void bn_from_montgomery_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a, - size_t num_a, const BN_MONT_CTX *mont) { - if (num_r != (size_t)mont->N.width || num_r > BN_SMALL_MAX_WORDS || - num_a > 2 * num_r) { - abort(); - } - BN_ULONG tmp[BN_SMALL_MAX_WORDS * 2] = {0}; - OPENSSL_memcpy(tmp, a, num_a * sizeof(BN_ULONG)); - if (!bn_from_montgomery_in_place(r, num_r, tmp, 2 * num_r, mont)) { - abort(); - } - OPENSSL_cleanse(tmp, 2 * num_r * sizeof(BN_ULONG)); -} - -void bn_mod_mul_montgomery_small(BN_ULONG *r, const BN_ULONG *a, - const BN_ULONG *b, size_t num, - const BN_MONT_CTX *mont) { - if (num != (size_t)mont->N.width || num > BN_SMALL_MAX_WORDS) { - abort(); - } - -#if defined(OPENSSL_BN_ASM_MONT) - // |bn_mul_mont| requires at least 128 bits of limbs, at least for x86. - if (num >= (128 / BN_BITS2)) { - if (!bn_mul_mont(r, a, b, mont->N.d, mont->n0, num)) { - abort(); // The check above ensures this won't happen. - } - return; - } -#endif - - // Compute the product. - BN_ULONG tmp[2 * BN_SMALL_MAX_WORDS]; - if (a == b) { - bn_sqr_small(tmp, 2 * num, a, num); - } else { - bn_mul_small(tmp, 2 * num, a, num, b, num); - } - - // Reduce. 
- if (!bn_from_montgomery_in_place(r, num, tmp, 2 * num, mont)) { - abort(); - } - OPENSSL_cleanse(tmp, 2 * num * sizeof(BN_ULONG)); -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/montgomery.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/bn/montgomery.cc.inc new file mode 100644 index 00000000..4bc6247c --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/bn/montgomery.cc.inc @@ -0,0 +1,408 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include +#include +#include + +#include +#include + +#include "../../internal.h" +#include "../../mem_internal.h" +#include "internal.h" + + +using namespace bssl; + +void bssl::bn_mont_ctx_init(BN_MONT_CTX *mont) { + OPENSSL_memset(mont, 0, sizeof(BN_MONT_CTX)); + BN_init(&mont->RR); + BN_init(&mont->N); +} + +void bssl::bn_mont_ctx_cleanup(BN_MONT_CTX *mont) { + BN_free(&mont->RR); + BN_free(&mont->N); +} + +BN_MONT_CTX *BN_MONT_CTX_new() { + BN_MONT_CTX *ret = New(); + if (ret == nullptr) { + return nullptr; + } + + bn_mont_ctx_init(ret); + return ret; +} + +void BN_MONT_CTX_free(BN_MONT_CTX *mont) { + if (mont == nullptr) { + return; + } + bn_mont_ctx_cleanup(mont); + Delete(mont); +} + +BN_MONT_CTX *BN_MONT_CTX_copy(BN_MONT_CTX *to, const BN_MONT_CTX *from) { + if (to == from) { + return to; + } + + if (!BN_copy(&to->RR, &from->RR) || !BN_copy(&to->N, &from->N)) { + return nullptr; + } + for (size_t i = 0; i < BN_MONT_CTX_N0_LIMBS; i++) { + to->n0[i] = from->n0[i]; + } + return to; +} + +static int bn_mont_ctx_set_N_and_n0(BN_MONT_CTX *mont, const BIGNUM *mod) { + if (BN_is_zero(mod)) { + OPENSSL_PUT_ERROR(BN, BN_R_DIV_BY_ZERO); + return 0; + } + if (!BN_is_odd(mod)) { + OPENSSL_PUT_ERROR(BN, BN_R_CALLED_WITH_EVEN_MODULUS); + return 0; + } + if (BN_is_negative(mod)) { + OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER); + return 0; + } + if (!bn_fits_in_words(mod, BN_MONTGOMERY_MAX_WORDS)) { + OPENSSL_PUT_ERROR(BN, BN_R_BIGNUM_TOO_LONG); + return 0; + } + + // Save the modulus. + if (!BN_copy(&mont->N, mod)) { + OPENSSL_PUT_ERROR(BN, ERR_R_INTERNAL_ERROR); + return 0; + } + // |mont->N| is always stored minimally. Computing RR efficiently leaks the + // size of the modulus. While the modulus may be private in RSA (one of the + // primes), their sizes are public, so this is fine. + bn_set_minimal_width(&mont->N); + + // Find n0 such that n0 * N == -1 (mod r). + // + // Only certain BN_BITS2<=32 platforms actually make use of n0[1]. 
For the + // others, we could use a shorter R value and use faster |BN_ULONG|-based + // math instead of |uint64_t|-based math, which would be double-precision. + // However, currently only the assembler files know which is which. + static_assert(BN_MONT_CTX_N0_LIMBS == 1 || BN_MONT_CTX_N0_LIMBS == 2, + "BN_MONT_CTX_N0_LIMBS value is invalid"); + static_assert(sizeof(BN_ULONG) * BN_MONT_CTX_N0_LIMBS == sizeof(uint64_t), + "uint64_t is insufficient precision for n0"); + uint64_t n0 = bn_mont_n0(&mont->N); + mont->n0[0] = (BN_ULONG)n0; +#if BN_MONT_CTX_N0_LIMBS == 2 + mont->n0[1] = (BN_ULONG)(n0 >> BN_BITS2); +#endif + return 1; +} + +int BN_MONT_CTX_set(BN_MONT_CTX *mont, const BIGNUM *mod, BN_CTX *ctx) { + if (!bn_mont_ctx_set_N_and_n0(mont, mod)) { + return 0; + } + + BN_CTX *new_ctx = nullptr; + if (ctx == nullptr) { + new_ctx = BN_CTX_new(); + if (new_ctx == nullptr) { + return 0; + } + ctx = new_ctx; + } + + // Save RR = R**2 (mod N). R is the smallest power of 2**BN_BITS2 such that R + // > mod. Even though the assembly on some 32-bit platforms works with 64-bit + // values, using |BN_BITS2| here, rather than |BN_MONT_CTX_N0_LIMBS * + // BN_BITS2|, is correct because R**2 will still be a multiple of the latter + // as |BN_MONT_CTX_N0_LIMBS| is either one or two. 
+ unsigned lgBigR = mont->N.width * BN_BITS2; + BN_zero(&mont->RR); + int ok = BN_set_bit(&mont->RR, lgBigR * 2) && + BN_mod(&mont->RR, &mont->RR, &mont->N, ctx) && + bn_resize_words(&mont->RR, mont->N.width); + BN_CTX_free(new_ctx); + return ok; +} + +BN_MONT_CTX *BN_MONT_CTX_new_for_modulus(const BIGNUM *mod, BN_CTX *ctx) { + BN_MONT_CTX *mont = BN_MONT_CTX_new(); + if (mont == nullptr || !BN_MONT_CTX_set(mont, mod, ctx)) { + BN_MONT_CTX_free(mont); + return nullptr; + } + return mont; +} + +BN_MONT_CTX *BN_MONT_CTX_new_consttime(const BIGNUM *mod, BN_CTX *ctx) { + BN_MONT_CTX *mont = BN_MONT_CTX_new(); + if (mont == nullptr || !bn_mont_ctx_set_N_and_n0(mont, mod) || + !bn_mont_ctx_set_RR_consttime(mont, ctx)) { + BN_MONT_CTX_free(mont); + return nullptr; + } + return mont; +} + +int bssl::BN_MONT_CTX_set_locked(UniquePtr *pmont, Mutex *lock, + const BIGNUM *mod, BN_CTX *bn_ctx) { + lock->LockRead(); + BN_MONT_CTX *ctx = pmont->get(); + lock->UnlockRead(); + + if (ctx) { + return 1; + } + + MutexWriteLock write_lock(lock); + if (*pmont == nullptr) { + pmont->reset(BN_MONT_CTX_new_for_modulus(mod, bn_ctx)); + } + return *pmont != nullptr; +} + +int BN_to_montgomery(BIGNUM *ret, const BIGNUM *a, const BN_MONT_CTX *mont, + BN_CTX *ctx) { + return BN_mod_mul_montgomery(ret, a, &mont->RR, mont, ctx); +} + +static int bn_from_montgomery_in_place(BN_ULONG *r, size_t num_r, BN_ULONG *a, + size_t num_a, const BN_MONT_CTX *mont) { + const BN_ULONG *n = mont->N.d; + size_t num_n = mont->N.width; + if (num_r != num_n || num_a != 2 * num_n) { + OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); + return 0; + } + + // Add multiples of |n| to |r| until R = 2^(nl * BN_BITS2) divides it. On + // input, we had |r| < |n| * R, so now |r| < 2 * |n| * R. Note that |r| + // includes |carry| which is stored separately. 
+ BN_ULONG n0 = mont->n0[0]; + BN_ULONG carry = 0; + for (size_t i = 0; i < num_n; i++) { + BN_ULONG v = bn_mul_add_words(a + i, n, num_n, a[i] * n0); + a[i + num_n] = CRYPTO_addc_w(a[i + num_n], v, carry, &carry); + } + + // Shift |num_n| words to divide by R. We have |a| < 2 * |n|. Note that |a| + // includes |carry| which is stored separately. + a += num_n; + + // |a| thus requires at most one additional subtraction |n| to be reduced. + bn_reduce_once(r, a, carry, n, num_n); + return 1; +} + +static int BN_from_montgomery_word(BIGNUM *ret, BIGNUM *r, + const BN_MONT_CTX *mont) { + if (r->neg) { + OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER); + return 0; + } + + const BIGNUM *n = &mont->N; + if (n->width == 0) { + ret->width = 0; + return 1; + } + + int max = 2 * n->width; // carry is stored separately + if (!bn_resize_words(r, max) || !bn_wexpand(ret, n->width)) { + return 0; + } + + ret->width = n->width; + ret->neg = 0; + return bn_from_montgomery_in_place(ret->d, ret->width, r->d, r->width, mont); +} + +int BN_from_montgomery(BIGNUM *r, const BIGNUM *a, const BN_MONT_CTX *mont, + BN_CTX *ctx) { + BN_CTXScope scope(ctx); + BIGNUM *t = BN_CTX_get(ctx); + if (t == nullptr || !BN_copy(t, a)) { + return 0; + } + + return BN_from_montgomery_word(r, t, mont); +} + +int bssl::bn_one_to_montgomery(BIGNUM *r, const BN_MONT_CTX *mont, + BN_CTX *ctx) { + // If the high bit of |n| is set, R = 2^(width*BN_BITS2) < 2 * |n|, so we + // compute R - |n| rather than perform Montgomery reduction. 
+ const BIGNUM *n = &mont->N; + if (n->width > 0 && (n->d[n->width - 1] >> (BN_BITS2 - 1)) != 0) { + if (!bn_wexpand(r, n->width)) { + return 0; + } + r->d[0] = 0 - n->d[0]; + for (int i = 1; i < n->width; i++) { + r->d[i] = ~n->d[i]; + } + r->width = n->width; + r->neg = 0; + return 1; + } + + return BN_from_montgomery(r, &mont->RR, mont, ctx); +} + +static int bn_mod_mul_montgomery_fallback(BIGNUM *r, const BIGNUM *a, + const BIGNUM *b, + const BN_MONT_CTX *mont, + BN_CTX *ctx) { + BN_CTXScope scope(ctx); + BIGNUM *tmp = BN_CTX_get(ctx); + if (tmp == nullptr) { + return 0; + } + + if (a == b) { + if (!bn_sqr_consttime(tmp, a, ctx)) { + return 0; + } + } else { + if (!bn_mul_consttime(tmp, a, b, ctx)) { + return 0; + } + } + + // reduce from aRR to aR + if (!BN_from_montgomery_word(r, tmp, mont)) { + return 0; + } + + return 1; +} + +int BN_mod_mul_montgomery(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, + const BN_MONT_CTX *mont, BN_CTX *ctx) { + if (a->neg || b->neg) { + OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER); + return 0; + } + +#if defined(OPENSSL_BN_ASM_MONT) + // |bn_mul_mont_words| requires at least 128 bits of limbs. + int num = mont->N.width; + if (num >= (128 / BN_BITS2) && a->width == num && b->width == num) { + if (!bn_wexpand(r, num)) { + return 0; + } + // This bound is implied by |bn_mont_ctx_set_N_and_n0|. |bn_mul_mont_words| + // allocates |num| words on the stack, so |num| cannot be too large. 
+ assert((size_t)num <= BN_MONTGOMERY_MAX_WORDS); + bn_mul_mont_words(r->d, a->d, b->d, mont->N.d, mont->n0, num); + r->neg = 0; + r->width = num; + return 1; + } +#endif + + return bn_mod_mul_montgomery_fallback(r, a, b, mont, ctx); +} + +int bssl::bn_less_than_montgomery_R(const BIGNUM *bn, const BN_MONT_CTX *mont) { + return !BN_is_negative(bn) && bn_fits_in_words(bn, mont->N.width); +} + +void bssl::bn_to_montgomery_small(BN_ULONG *r, const BN_ULONG *a, size_t num, + const BN_MONT_CTX *mont) { + bn_mod_mul_montgomery_small(r, a, mont->RR.d, num, mont); +} + +void bssl::bn_from_montgomery_small(BN_ULONG *r, size_t num_r, + const BN_ULONG *a, size_t num_a, + const BN_MONT_CTX *mont) { + if (num_r != (size_t)mont->N.width || num_r > BN_SMALL_MAX_WORDS || + num_a > 2 * num_r) { + abort(); + } + BN_ULONG tmp[BN_SMALL_MAX_WORDS * 2] = {0}; + OPENSSL_memcpy(tmp, a, num_a * sizeof(BN_ULONG)); + if (!bn_from_montgomery_in_place(r, num_r, tmp, 2 * num_r, mont)) { + abort(); + } + OPENSSL_cleanse(tmp, 2 * num_r * sizeof(BN_ULONG)); +} + +void bssl::bn_mod_mul_montgomery_small(BN_ULONG *r, const BN_ULONG *a, + const BN_ULONG *b, size_t num, + const BN_MONT_CTX *mont) { + if (num != (size_t)mont->N.width || num > BN_SMALL_MAX_WORDS) { + abort(); + } + +#if defined(OPENSSL_BN_ASM_MONT) + // |bn_mul_mont_words| requires at least 128 bits of limbs. + if (num >= (128 / BN_BITS2)) { + bn_mul_mont_words(r, a, b, mont->N.d, mont->n0, num); + return; + } +#endif + + // Compute the product. + BN_ULONG tmp[2 * BN_SMALL_MAX_WORDS]; + if (a == b) { + bn_sqr_small(tmp, 2 * num, a, num); + } else { + bn_mul_small(tmp, 2 * num, a, num, b, num); + } + + // Reduce. 
+ if (!bn_from_montgomery_in_place(r, num, tmp, 2 * num, mont)) { + abort(); + } + OPENSSL_cleanse(tmp, 2 * num * sizeof(BN_ULONG)); +} + +#if defined(OPENSSL_BN_ASM_MONT) && defined(OPENSSL_X86_64) +void bssl::bn_mul_mont_words(BN_ULONG *rp, const BN_ULONG *ap, + const BN_ULONG *bp, const BN_ULONG *np, + const BN_ULONG *n0, size_t num) { + if (ap == bp && bn_sqr8x_mont_capable(num)) { + bn_sqr8x_mont(rp, ap, bn_mulx_adx_capable(), np, n0, num); + } else if (bn_mulx4x_mont_capable(num)) { + bn_mulx4x_mont(rp, ap, bp, np, n0, num); + } else if (bn_mul4x_mont_capable(num)) { + bn_mul4x_mont(rp, ap, bp, np, n0, num); + } else { + bn_mul_mont_nohw(rp, ap, bp, np, n0, num); + } +} +#endif + +#if defined(OPENSSL_BN_ASM_MONT) && defined(OPENSSL_ARM) +void bssl::bn_mul_mont_words(BN_ULONG *rp, const BN_ULONG *ap, + const BN_ULONG *bp, const BN_ULONG *np, + const BN_ULONG *n0, size_t num) { + if (bn_mul8x_mont_neon_capable(num)) { + bn_mul8x_mont_neon(rp, ap, bp, np, n0, num); + } else { + bn_mul_mont_nohw(rp, ap, bp, np, n0, num); + } +} +#endif diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/montgomery_inv.c b/third_party/boringssl/src/crypto/fipsmodule/bn/montgomery_inv.c deleted file mode 100644 index 137af1dd..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/bn/montgomery_inv.c +++ /dev/null @@ -1,185 +0,0 @@ -/* Copyright 2016 Brian Smith. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include - -#include - -#include "internal.h" -#include "../../internal.h" - - -static uint64_t bn_neg_inv_mod_r_u64(uint64_t n); - -static_assert(BN_MONT_CTX_N0_LIMBS == 1 || BN_MONT_CTX_N0_LIMBS == 2, - "BN_MONT_CTX_N0_LIMBS value is invalid"); -static_assert(sizeof(BN_ULONG) * BN_MONT_CTX_N0_LIMBS == sizeof(uint64_t), - "uint64_t is insufficient precision for n0"); - -// LG_LITTLE_R is log_2(r). -#define LG_LITTLE_R (BN_MONT_CTX_N0_LIMBS * BN_BITS2) - -uint64_t bn_mont_n0(const BIGNUM *n) { - // These conditions are checked by the caller, |BN_MONT_CTX_set| or - // |BN_MONT_CTX_new_consttime|. - assert(!BN_is_zero(n)); - assert(!BN_is_negative(n)); - assert(BN_is_odd(n)); - - // r == 2**(BN_MONT_CTX_N0_LIMBS * BN_BITS2) and LG_LITTLE_R == lg(r). This - // ensures that we can do integer division by |r| by simply ignoring - // |BN_MONT_CTX_N0_LIMBS| limbs. Similarly, we can calculate values modulo - // |r| by just looking at the lowest |BN_MONT_CTX_N0_LIMBS| limbs. This is - // what makes Montgomery multiplication efficient. - // - // As shown in Algorithm 1 of "Fast Prime Field Elliptic Curve Cryptography - // with 256 Bit Primes" by Shay Gueron and Vlad Krasnov, in the loop of a - // multi-limb Montgomery multiplication of |a * b (mod n)|, given the - // unreduced product |t == a * b|, we repeatedly calculate: - // - // t1 := t % r |t1| is |t|'s lowest limb (see previous paragraph). - // t2 := t1*n0*n - // t3 := t + t2 - // t := t3 / r copy all limbs of |t3| except the lowest to |t|. - // - // In the last step, it would only make sense to ignore the lowest limb of - // |t3| if it were zero. 
The middle steps ensure that this is the case: - // - // t3 == 0 (mod r) - // t + t2 == 0 (mod r) - // t + t1*n0*n == 0 (mod r) - // t1*n0*n == -t (mod r) - // t*n0*n == -t (mod r) - // n0*n == -1 (mod r) - // n0 == -1/n (mod r) - // - // Thus, in each iteration of the loop, we multiply by the constant factor - // |n0|, the negative inverse of n (mod r). - - // n_mod_r = n % r. As explained above, this is done by taking the lowest - // |BN_MONT_CTX_N0_LIMBS| limbs of |n|. - uint64_t n_mod_r = n->d[0]; -#if BN_MONT_CTX_N0_LIMBS == 2 - if (n->width > 1) { - n_mod_r |= (uint64_t)n->d[1] << BN_BITS2; - } -#endif - - return bn_neg_inv_mod_r_u64(n_mod_r); -} - -// bn_neg_inv_r_mod_n_u64 calculates the -1/n mod r; i.e. it calculates |v| -// such that u*r - v*n == 1. |r| is the constant defined in |bn_mont_n0|. |n| -// must be odd. -// -// This is derived from |xbinGCD| in Henry S. Warren, Jr.'s "Montgomery -// Multiplication" (http://www.hackersdelight.org/MontgomeryMultiplication.pdf). -// It is very similar to the MODULAR-INVERSE function in Stephen R. Dussé's and -// Burton S. Kaliski Jr.'s "A Cryptographic Library for the Motorola DSP56000" -// (http://link.springer.com/chapter/10.1007%2F3-540-46877-3_21). -// -// This is inspired by Joppe W. Bos's "Constant Time Modular Inversion" -// (http://www.joppebos.com/files/CTInversion.pdf) so that the inversion is -// constant-time with respect to |n|. We assume uint64_t additions, -// subtractions, shifts, and bitwise operations are all constant time, which -// may be a large leap of faith on 32-bit targets. We avoid division and -// multiplication, which tend to be the most problematic in terms of timing -// leaks. -// -// Most GCD implementations return values such that |u*r + v*n == 1|, so the -// caller would have to negate the resultant |v| for the purpose of Montgomery -// multiplication. This implementation does the negation implicitly by doing -// the computations as a difference instead of a sum. 
-static uint64_t bn_neg_inv_mod_r_u64(uint64_t n) { - assert(n % 2 == 1); - - // alpha == 2**(lg r - 1) == r / 2. - static const uint64_t alpha = UINT64_C(1) << (LG_LITTLE_R - 1); - - const uint64_t beta = n; - - uint64_t u = 1; - uint64_t v = 0; - - // The invariant maintained from here on is: - // 2**(lg r - i) == u*2*alpha - v*beta. - for (size_t i = 0; i < LG_LITTLE_R; ++i) { -#if BN_BITS2 == 64 && defined(BN_ULLONG) - assert((BN_ULLONG)(1) << (LG_LITTLE_R - i) == - ((BN_ULLONG)u * 2 * alpha) - ((BN_ULLONG)v * beta)); -#endif - - // Delete a common factor of 2 in u and v if |u| is even. Otherwise, set - // |u = (u + beta) / 2| and |v = (v / 2) + alpha|. - - uint64_t u_is_odd = UINT64_C(0) - (u & 1); // Either 0xff..ff or 0. - - // The addition can overflow, so use Dietz's method for it. - // - // Dietz calculates (x+y)/2 by (x⊕y)>>1 + x&y. This is valid for all - // (unsigned) x and y, even when x+y overflows. Evidence for 32-bit values - // (embedded in 64 bits to so that overflow can be ignored): - // - // (declare-fun x () (_ BitVec 64)) - // (declare-fun y () (_ BitVec 64)) - // (assert (let ( - // (one (_ bv1 64)) - // (thirtyTwo (_ bv32 64))) - // (and - // (bvult x (bvshl one thirtyTwo)) - // (bvult y (bvshl one thirtyTwo)) - // (not (= - // (bvadd (bvlshr (bvxor x y) one) (bvand x y)) - // (bvlshr (bvadd x y) one))) - // ))) - // (check-sat) - uint64_t beta_if_u_is_odd = beta & u_is_odd; // Either |beta| or 0. - u = ((u ^ beta_if_u_is_odd) >> 1) + (u & beta_if_u_is_odd); - - uint64_t alpha_if_u_is_odd = alpha & u_is_odd; // Either |alpha| or 0. - v = (v >> 1) + alpha_if_u_is_odd; - } - - // The invariant now shows that u*r - v*n == 1 since r == 2 * alpha. 
-#if BN_BITS2 == 64 && defined(BN_ULLONG) - assert(1 == ((BN_ULLONG)u * 2 * alpha) - ((BN_ULLONG)v * beta)); -#endif - - return v; -} - -int bn_mod_exp_base_2_consttime(BIGNUM *r, unsigned p, const BIGNUM *n, - BN_CTX *ctx) { - assert(!BN_is_zero(n)); - assert(!BN_is_negative(n)); - assert(BN_is_odd(n)); - - BN_zero(r); - - unsigned n_bits = BN_num_bits(n); - assert(n_bits != 0); - assert(p > n_bits); - if (n_bits == 1) { - return 1; - } - - // Set |r| to the larger power of two smaller than |n|, then shift with - // reductions the rest of the way. - if (!BN_set_bit(r, n_bits - 1) || - !bn_mod_lshift_consttime(r, r, p - (n_bits - 1), n, ctx)) { - return 0; - } - - return 1; -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/montgomery_inv.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/bn/montgomery_inv.cc.inc new file mode 100644 index 00000000..e4ee0896 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/bn/montgomery_inv.cc.inc @@ -0,0 +1,179 @@ +// Copyright 2016 Brian Smith. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include + +#include "internal.h" +#include "../../internal.h" + + +using namespace bssl; + +static uint64_t bn_neg_inv_mod_u64(uint64_t n); + +static_assert(BN_MONT_CTX_N0_LIMBS == 1 || BN_MONT_CTX_N0_LIMBS == 2, + "BN_MONT_CTX_N0_LIMBS value is invalid"); +static_assert(sizeof(BN_ULONG) * BN_MONT_CTX_N0_LIMBS == sizeof(uint64_t), + "uint64_t is insufficient precision for n0"); + +uint64_t bssl::bn_mont_n0(const BIGNUM *n) { + // These conditions are checked by the caller, |BN_MONT_CTX_set| or + // |BN_MONT_CTX_new_consttime|. + assert(!BN_is_zero(n)); + assert(!BN_is_negative(n)); + assert(BN_is_odd(n)); + + // r == 2**(BN_MONT_CTX_N0_LIMBS * BN_BITS2) ensures that we can do integer + // division by |r| by simply ignoring |BN_MONT_CTX_N0_LIMBS| limbs. Similarly, + // we can calculate values modulo |r| by just looking at the lowest + // |BN_MONT_CTX_N0_LIMBS| limbs. This is what makes Montgomery multiplication + // efficient. + // + // As shown in Algorithm 1 of "Fast Prime Field Elliptic Curve Cryptography + // with 256 Bit Primes" by Shay Gueron and Vlad Krasnov, in the loop of a + // multi-limb Montgomery multiplication of |a * b (mod n)|, given the + // unreduced product |t == a * b|, we repeatedly calculate: + // + // t1 := t % r |t1| is |t|'s lowest limb (see previous paragraph). + // t2 := t1*n0*n + // t3 := t + t2 + // t := t3 / r copy all limbs of |t3| except the lowest to |t|. + // + // In the last step, it would only make sense to ignore the lowest limb of + // |t3| if it were zero. The middle steps ensure that this is the case: + // + // t3 == 0 (mod r) + // t + t2 == 0 (mod r) + // t + t1*n0*n == 0 (mod r) + // t1*n0*n == -t (mod r) + // t*n0*n == -t (mod r) + // n0*n == -1 (mod r) + // n0 == -1/n (mod r) + // + // Thus, in each iteration of the loop, we multiply by the constant factor + // |n0|, the negative inverse of n (mod r). + + // n_mod_r = n % r. 
As explained above, this is done by taking the lowest + // |BN_MONT_CTX_N0_LIMBS| limbs of |n|. + uint64_t n_mod_r = n->d[0]; +#if BN_MONT_CTX_N0_LIMBS == 2 + if (n->width > 1) { + n_mod_r |= (uint64_t)n->d[1] << BN_BITS2; + } +#endif + + // A 64-bit inverse is enough precision to invert by r. (r is also currently + // always 2^64.) + return bn_neg_inv_mod_u64(n_mod_r); +} + +// bn_neg_inv_mod_u64 calculates -1/n mod 2^64. |n| must be odd. +static uint64_t bn_neg_inv_mod_u64(uint64_t n) { + // This is a modified version of the technique described in + // https://crypto.stackexchange.com/a/47496 and + // https://bearssl.org/bigint.html#montgomery-reduction-and-multiplication. We + // modify it to compute the negative inverse directly so that, on 32-bit, + // negation happens before we go to double-word precision, instead of at the + // end. + // + // If r = -n^-1 (mod m), then r * (r*n + 2) is -n^(-1) (mod m^2). This is + // because, for some k, r*n = k*m - 1. Then: + // + // r*n * (r*n + 2) = (k*m - 1) * (k*m + 1) = k^2*m^2 - 1 = -1 (mod m^2) + // + // We start with the negative inverse mod some small power of 2 and square the + // modulus up to 2^64. n = n^-1 (mod 8) for all odd n, so r = -n (mod 8). From + // there, four iterations are enough for 2^32 and five for 2^64. + assert(n % 2 == 1); +#if defined(OPENSSL_32_BIT) + // Compute the result mod 2^32 first. + uint32_t n32 = static_cast(n); + uint32_t r = 0u - n32; + for (int i = 0; i < 4; i++) { + r *= r * n32 + 2; + } + // Run one more double-word iteration to get the result mod 2^64. 
+ return r * (r * n + 2); +#else + uint64_t r = 0u - n; + for (int i = 0; i < 5; i++) { + r *= r * n + 2; + } + return r; +#endif +} + +int bssl::bn_mont_ctx_set_RR_consttime(BN_MONT_CTX *mont, BN_CTX *ctx) { + assert(!BN_is_zero(&mont->N)); + assert(!BN_is_negative(&mont->N)); + assert(BN_is_odd(&mont->N)); + assert(bn_minimal_width(&mont->N) == mont->N.width); + + unsigned n_bits = BN_num_bits(&mont->N); + assert(n_bits != 0); + if (n_bits == 1) { + BN_zero(&mont->RR); + return bn_resize_words(&mont->RR, mont->N.width); + } + + unsigned lgBigR = mont->N.width * BN_BITS2; + assert(lgBigR >= n_bits); + + // RR is R, or 2^lgBigR, in the Montgomery domain. We can compute 2 in the + // Montgomery domain, 2R or 2^(lgBigR+1), and then use Montgomery + // square-and-multiply to exponentiate. + // + // The square steps take 2^n R to (2^n)*(2^n) R = 2^2n R. This is the same as + // doubling 2^n R, n times (doubling any x, n times, computes 2^n * x). When n + // is below some threshold, doubling is faster; when above, squaring is + // faster. From benchmarking various 32-bit and 64-bit architectures, the word + // count seems to work well as a threshold. (Doubling scales linearly and + // Montgomery reduction scales quadratically, so the threshold should scale + // roughly linearly.) + // + // The multiply steps take 2^n R to 2*2^n R = 2^(n+1) R. It is faster to + // double the value instead, so the square-and-multiply exponentiation would + // become square-and-double. However, when using the word count as the + // threshold, it turns out that no multiply/double steps will be needed at + // all, because squaring any x, i times, computes x^(2^i): + // + // (2^threshold)^(2^BN_BITS2_LG) R + // (2^mont->N.width)^BN_BITS2 R + // = 2^(mont->N.width*BN_BITS2) R + // = 2^lgBigR R + // = RR + int threshold = mont->N.width; + + // Calculate 2^threshold R = 2^(threshold + lgBigR) by doubling. The + // first n_bits - 1 doubles can be skipped because we don't need to reduce. 
+ if (!BN_set_bit(&mont->RR, n_bits - 1) || + !bn_mod_lshift_consttime(&mont->RR, &mont->RR, + threshold + (lgBigR - (n_bits - 1)), + &mont->N, ctx)) { + return 0; + } + + // The above steps are the same regardless of the threshold. The steps below + // need to be modified if the threshold changes. + assert(threshold == mont->N.width); + for (unsigned i = 0; i < BN_BITS2_LG; i++) { + if (!BN_mod_mul_montgomery(&mont->RR, &mont->RR, &mont->RR, mont, ctx)) { + return 0; + } + } + + return bn_resize_words(&mont->RR, mont->N.width); +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/mul.c b/third_party/boringssl/src/crypto/fipsmodule/bn/mul.c deleted file mode 100644 index fe4e4d7a..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/bn/mul.c +++ /dev/null @@ -1,748 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. 
this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include -#include -#include - -#include -#include - -#include "internal.h" -#include "../../internal.h" - - -#define BN_MUL_RECURSIVE_SIZE_NORMAL 16 -#define BN_SQR_RECURSIVE_SIZE_NORMAL BN_MUL_RECURSIVE_SIZE_NORMAL - - -static void bn_abs_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, - size_t num, BN_ULONG *tmp) { - BN_ULONG borrow = bn_sub_words(tmp, a, b, num); - bn_sub_words(r, b, a, num); - bn_select_words(r, 0 - borrow, r /* tmp < 0 */, tmp /* tmp >= 0 */, num); -} - -static void bn_mul_normal(BN_ULONG *r, const BN_ULONG *a, size_t na, - const BN_ULONG *b, size_t nb) { - if (na < nb) { - size_t itmp = na; - na = nb; - nb = itmp; - const BN_ULONG *ltmp = a; - a = b; - b = ltmp; - } - BN_ULONG *rr = &(r[na]); - if (nb == 0) { - OPENSSL_memset(r, 0, na * sizeof(BN_ULONG)); - return; - } - rr[0] = bn_mul_words(r, a, na, b[0]); - - for (;;) { - if (--nb == 0) { - return; - } - rr[1] = bn_mul_add_words(&(r[1]), a, na, b[1]); - if (--nb == 0) { - return; - } - rr[2] = bn_mul_add_words(&(r[2]), a, na, b[2]); - if (--nb == 0) { - return; - } - rr[3] = bn_mul_add_words(&(r[3]), a, na, b[3]); - if (--nb == 0) { - return; - } - rr[4] = bn_mul_add_words(&(r[4]), a, na, b[4]); - rr += 4; - r += 4; - b += 4; - } -} - -// bn_sub_part_words sets |r| to |a| - |b|. It returns the borrow bit, which is -// one if the operation underflowed and zero otherwise. |cl| is the common -// length, that is, the shorter of len(a) or len(b). |dl| is the delta length, -// that is, len(a) - len(b). |r|'s length matches the larger of |a| and |b|, or -// cl + abs(dl). -// -// TODO(davidben): Make this take |size_t|. The |cl| + |dl| calling convention -// is confusing. 
-static BN_ULONG bn_sub_part_words(BN_ULONG *r, const BN_ULONG *a, - const BN_ULONG *b, int cl, int dl) { - assert(cl >= 0); - BN_ULONG borrow = bn_sub_words(r, a, b, cl); - if (dl == 0) { - return borrow; - } - - r += cl; - a += cl; - b += cl; - - if (dl < 0) { - // |a| is shorter than |b|. Complete the subtraction as if the excess words - // in |a| were zeros. - dl = -dl; - for (int i = 0; i < dl; i++) { - r[i] = 0u - b[i] - borrow; - borrow |= r[i] != 0; - } - } else { - // |b| is shorter than |a|. Complete the subtraction as if the excess words - // in |b| were zeros. - for (int i = 0; i < dl; i++) { - // |r| and |a| may alias, so use a temporary. - BN_ULONG tmp = a[i]; - r[i] = a[i] - borrow; - borrow = tmp < r[i]; - } - } - - return borrow; -} - -// bn_abs_sub_part_words computes |r| = |a| - |b|, storing the absolute value -// and returning a mask of all ones if the result was negative and all zeros if -// the result was positive. |cl| and |dl| follow the |bn_sub_part_words| calling -// convention. -// -// TODO(davidben): Make this take |size_t|. The |cl| + |dl| calling convention -// is confusing. -static BN_ULONG bn_abs_sub_part_words(BN_ULONG *r, const BN_ULONG *a, - const BN_ULONG *b, int cl, int dl, - BN_ULONG *tmp) { - BN_ULONG borrow = bn_sub_part_words(tmp, a, b, cl, dl); - bn_sub_part_words(r, b, a, cl, -dl); - int r_len = cl + (dl < 0 ? -dl : dl); - borrow = 0 - borrow; - bn_select_words(r, borrow, r /* tmp < 0 */, tmp /* tmp >= 0 */, r_len); - return borrow; -} - -int bn_abs_sub_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, - BN_CTX *ctx) { - int cl = a->width < b->width ? a->width : b->width; - int dl = a->width - b->width; - int r_len = a->width < b->width ? 
b->width : a->width; - BN_CTX_start(ctx); - BIGNUM *tmp = BN_CTX_get(ctx); - int ok = tmp != NULL && - bn_wexpand(r, r_len) && - bn_wexpand(tmp, r_len); - if (ok) { - bn_abs_sub_part_words(r->d, a->d, b->d, cl, dl, tmp->d); - r->width = r_len; - } - BN_CTX_end(ctx); - return ok; -} - -// Karatsuba recursive multiplication algorithm -// (cf. Knuth, The Art of Computer Programming, Vol. 2) - -// bn_mul_recursive sets |r| to |a| * |b|, using |t| as scratch space. |r| has -// length 2*|n2|, |a| has length |n2| + |dna|, |b| has length |n2| + |dnb|, and -// |t| has length 4*|n2|. |n2| must be a power of two. Finally, we must have -// -|BN_MUL_RECURSIVE_SIZE_NORMAL|/2 <= |dna| <= 0 and -// -|BN_MUL_RECURSIVE_SIZE_NORMAL|/2 <= |dnb| <= 0. -// -// TODO(davidben): Simplify and |size_t| the calling convention around lengths -// here. -static void bn_mul_recursive(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, - int n2, int dna, int dnb, BN_ULONG *t) { - // |n2| is a power of two. - assert(n2 != 0 && (n2 & (n2 - 1)) == 0); - // Check |dna| and |dnb| are in range. - assert(-BN_MUL_RECURSIVE_SIZE_NORMAL/2 <= dna && dna <= 0); - assert(-BN_MUL_RECURSIVE_SIZE_NORMAL/2 <= dnb && dnb <= 0); - - // Only call bn_mul_comba 8 if n2 == 8 and the - // two arrays are complete [steve] - if (n2 == 8 && dna == 0 && dnb == 0) { - bn_mul_comba8(r, a, b); - return; - } - - // Else do normal multiply - if (n2 < BN_MUL_RECURSIVE_SIZE_NORMAL) { - bn_mul_normal(r, a, n2 + dna, b, n2 + dnb); - if (dna + dnb < 0) { - OPENSSL_memset(&r[2 * n2 + dna + dnb], 0, - sizeof(BN_ULONG) * -(dna + dnb)); - } - return; - } - - // Split |a| and |b| into a0,a1 and b0,b1, where a0 and b0 have size |n|. - // Split |t| into t0,t1,t2,t3, each of size |n|, with the remaining 4*|n| used - // for recursive calls. - // Split |r| into r0,r1,r2,r3. We must contribute a0*b0 to r0,r1, a0*a1+b0*b1 - // to r1,r2, and a1*b1 to r2,r3. 
The middle term we will compute as: - // - // a0*a1 + b0*b1 = (a0 - a1)*(b1 - b0) + a1*b1 + a0*b0 - // - // Note that we know |n| >= |BN_MUL_RECURSIVE_SIZE_NORMAL|/2 above, so - // |tna| and |tnb| are non-negative. - int n = n2 / 2, tna = n + dna, tnb = n + dnb; - - // t0 = a0 - a1 and t1 = b1 - b0. The result will be multiplied, so we XOR - // their sign masks, giving the sign of (a0 - a1)*(b1 - b0). t0 and t1 - // themselves store the absolute value. - BN_ULONG neg = bn_abs_sub_part_words(t, a, &a[n], tna, n - tna, &t[n2]); - neg ^= bn_abs_sub_part_words(&t[n], &b[n], b, tnb, tnb - n, &t[n2]); - - // Compute: - // t2,t3 = t0 * t1 = |(a0 - a1)*(b1 - b0)| - // r0,r1 = a0 * b0 - // r2,r3 = a1 * b1 - if (n == 4 && dna == 0 && dnb == 0) { - bn_mul_comba4(&t[n2], t, &t[n]); - - bn_mul_comba4(r, a, b); - bn_mul_comba4(&r[n2], &a[n], &b[n]); - } else if (n == 8 && dna == 0 && dnb == 0) { - bn_mul_comba8(&t[n2], t, &t[n]); - - bn_mul_comba8(r, a, b); - bn_mul_comba8(&r[n2], &a[n], &b[n]); - } else { - BN_ULONG *p = &t[n2 * 2]; - bn_mul_recursive(&t[n2], t, &t[n], n, 0, 0, p); - bn_mul_recursive(r, a, b, n, 0, 0, p); - bn_mul_recursive(&r[n2], &a[n], &b[n], n, dna, dnb, p); - } - - // t0,t1,c = r0,r1 + r2,r3 = a0*b0 + a1*b1 - BN_ULONG c = bn_add_words(t, r, &r[n2], n2); - - // t2,t3,c = t0,t1,c + neg*t2,t3 = (a0 - a1)*(b1 - b0) + a1*b1 + a0*b0. - // The second term is stored as the absolute value, so we do this with a - // constant-time select. - BN_ULONG c_neg = c - bn_sub_words(&t[n2 * 2], t, &t[n2], n2); - BN_ULONG c_pos = c + bn_add_words(&t[n2], t, &t[n2], n2); - bn_select_words(&t[n2], neg, &t[n2 * 2], &t[n2], n2); - static_assert(sizeof(BN_ULONG) <= sizeof(crypto_word_t), - "crypto_word_t is too small"); - c = constant_time_select_w(neg, c_neg, c_pos); - - // We now have our three components. Add them together. - // r1,r2,c = r1,r2 + t2,t3,c - c += bn_add_words(&r[n], &r[n], &t[n2], n2); - - // Propagate the carry bit to the end. 
- for (int i = n + n2; i < n2 + n2; i++) { - BN_ULONG old = r[i]; - r[i] = old + c; - c = r[i] < old; - } - - // The product should fit without carries. - assert(c == 0); -} - -// bn_mul_part_recursive sets |r| to |a| * |b|, using |t| as scratch space. |r| -// has length 4*|n|, |a| has length |n| + |tna|, |b| has length |n| + |tnb|, and -// |t| has length 8*|n|. |n| must be a power of two. Additionally, we must have -// 0 <= tna < n and 0 <= tnb < n, and |tna| and |tnb| must differ by at most -// one. -// -// TODO(davidben): Make this take |size_t| and perhaps the actual lengths of |a| -// and |b|. -static void bn_mul_part_recursive(BN_ULONG *r, const BN_ULONG *a, - const BN_ULONG *b, int n, int tna, int tnb, - BN_ULONG *t) { - // |n| is a power of two. - assert(n != 0 && (n & (n - 1)) == 0); - // Check |tna| and |tnb| are in range. - assert(0 <= tna && tna < n); - assert(0 <= tnb && tnb < n); - assert(-1 <= tna - tnb && tna - tnb <= 1); - - int n2 = n * 2; - if (n < 8) { - bn_mul_normal(r, a, n + tna, b, n + tnb); - OPENSSL_memset(r + n2 + tna + tnb, 0, n2 - tna - tnb); - return; - } - - // Split |a| and |b| into a0,a1 and b0,b1, where a0 and b0 have size |n|. |a1| - // and |b1| have size |tna| and |tnb|, respectively. - // Split |t| into t0,t1,t2,t3, each of size |n|, with the remaining 4*|n| used - // for recursive calls. - // Split |r| into r0,r1,r2,r3. We must contribute a0*b0 to r0,r1, a0*a1+b0*b1 - // to r1,r2, and a1*b1 to r2,r3. The middle term we will compute as: - // - // a0*a1 + b0*b1 = (a0 - a1)*(b1 - b0) + a1*b1 + a0*b0 - - // t0 = a0 - a1 and t1 = b1 - b0. The result will be multiplied, so we XOR - // their sign masks, giving the sign of (a0 - a1)*(b1 - b0). t0 and t1 - // themselves store the absolute value. 
- BN_ULONG neg = bn_abs_sub_part_words(t, a, &a[n], tna, n - tna, &t[n2]); - neg ^= bn_abs_sub_part_words(&t[n], &b[n], b, tnb, tnb - n, &t[n2]); - - // Compute: - // t2,t3 = t0 * t1 = |(a0 - a1)*(b1 - b0)| - // r0,r1 = a0 * b0 - // r2,r3 = a1 * b1 - if (n == 8) { - bn_mul_comba8(&t[n2], t, &t[n]); - bn_mul_comba8(r, a, b); - - bn_mul_normal(&r[n2], &a[n], tna, &b[n], tnb); - // |bn_mul_normal| only writes |tna| + |tna| words. Zero the rest. - OPENSSL_memset(&r[n2 + tna + tnb], 0, sizeof(BN_ULONG) * (n2 - tna - tnb)); - } else { - BN_ULONG *p = &t[n2 * 2]; - bn_mul_recursive(&t[n2], t, &t[n], n, 0, 0, p); - bn_mul_recursive(r, a, b, n, 0, 0, p); - - OPENSSL_memset(&r[n2], 0, sizeof(BN_ULONG) * n2); - if (tna < BN_MUL_RECURSIVE_SIZE_NORMAL && - tnb < BN_MUL_RECURSIVE_SIZE_NORMAL) { - bn_mul_normal(&r[n2], &a[n], tna, &b[n], tnb); - } else { - int i = n; - for (;;) { - i /= 2; - if (i < tna || i < tnb) { - // E.g., n == 16, i == 8 and tna == 11. |tna| and |tnb| are within one - // of each other, so if |tna| is larger and tna > i, then we know - // tnb >= i, and this call is valid. - bn_mul_part_recursive(&r[n2], &a[n], &b[n], i, tna - i, tnb - i, p); - break; - } - if (i == tna || i == tnb) { - // If there is only a bottom half to the number, just do it. We know - // the larger of |tna - i| and |tnb - i| is zero. The other is zero or - // -1 by because of |tna| and |tnb| differ by at most one. - bn_mul_recursive(&r[n2], &a[n], &b[n], i, tna - i, tnb - i, p); - break; - } - - // This loop will eventually terminate when |i| falls below - // |BN_MUL_RECURSIVE_SIZE_NORMAL| because we know one of |tna| and |tnb| - // exceeds that. - } - } - } - - // t0,t1,c = r0,r1 + r2,r3 = a0*b0 + a1*b1 - BN_ULONG c = bn_add_words(t, r, &r[n2], n2); - - // t2,t3,c = t0,t1,c + neg*t2,t3 = (a0 - a1)*(b1 - b0) + a1*b1 + a0*b0. - // The second term is stored as the absolute value, so we do this with a - // constant-time select. 
- BN_ULONG c_neg = c - bn_sub_words(&t[n2 * 2], t, &t[n2], n2); - BN_ULONG c_pos = c + bn_add_words(&t[n2], t, &t[n2], n2); - bn_select_words(&t[n2], neg, &t[n2 * 2], &t[n2], n2); - static_assert(sizeof(BN_ULONG) <= sizeof(crypto_word_t), - "crypto_word_t is too small"); - c = constant_time_select_w(neg, c_neg, c_pos); - - // We now have our three components. Add them together. - // r1,r2,c = r1,r2 + t2,t3,c - c += bn_add_words(&r[n], &r[n], &t[n2], n2); - - // Propagate the carry bit to the end. - for (int i = n + n2; i < n2 + n2; i++) { - BN_ULONG old = r[i]; - r[i] = old + c; - c = r[i] < old; - } - - // The product should fit without carries. - assert(c == 0); -} - -// bn_mul_impl implements |BN_mul| and |bn_mul_consttime|. Note this function -// breaks |BIGNUM| invariants and may return a negative zero. This is handled by -// the callers. -static int bn_mul_impl(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, - BN_CTX *ctx) { - int al = a->width; - int bl = b->width; - if (al == 0 || bl == 0) { - BN_zero(r); - return 1; - } - - int ret = 0; - BIGNUM *rr; - BN_CTX_start(ctx); - if (r == a || r == b) { - rr = BN_CTX_get(ctx); - if (rr == NULL) { - goto err; - } - } else { - rr = r; - } - rr->neg = a->neg ^ b->neg; - - int i = al - bl; - if (i == 0) { - if (al == 8) { - if (!bn_wexpand(rr, 16)) { - goto err; - } - rr->width = 16; - bn_mul_comba8(rr->d, a->d, b->d); - goto end; - } - } - - int top = al + bl; - static const int kMulNormalSize = 16; - if (al >= kMulNormalSize && bl >= kMulNormalSize) { - if (-1 <= i && i <= 1) { - // Find the largest power of two less than or equal to the larger length. - int j; - if (i >= 0) { - j = BN_num_bits_word((BN_ULONG)al); - } else { - j = BN_num_bits_word((BN_ULONG)bl); - } - j = 1 << (j - 1); - assert(j <= al || j <= bl); - BIGNUM *t = BN_CTX_get(ctx); - if (t == NULL) { - goto err; - } - if (al > j || bl > j) { - // We know |al| and |bl| are at most one from each other, so if al > j, - // bl >= j, and vice versa. 
Thus we can use |bn_mul_part_recursive|. - // - // TODO(davidben): This codepath is almost unused in standard - // algorithms. Is this optimization necessary? See notes in - // https://boringssl-review.googlesource.com/q/I0bd604e2cd6a75c266f64476c23a730ca1721ea6 - assert(al >= j && bl >= j); - if (!bn_wexpand(t, j * 8) || - !bn_wexpand(rr, j * 4)) { - goto err; - } - bn_mul_part_recursive(rr->d, a->d, b->d, j, al - j, bl - j, t->d); - } else { - // al <= j && bl <= j. Additionally, we know j <= al or j <= bl, so one - // of al - j or bl - j is zero. The other, by the bound on |i| above, is - // zero or -1. Thus, we can use |bn_mul_recursive|. - if (!bn_wexpand(t, j * 4) || - !bn_wexpand(rr, j * 2)) { - goto err; - } - bn_mul_recursive(rr->d, a->d, b->d, j, al - j, bl - j, t->d); - } - rr->width = top; - goto end; - } - } - - if (!bn_wexpand(rr, top)) { - goto err; - } - rr->width = top; - bn_mul_normal(rr->d, a->d, al, b->d, bl); - -end: - if (r != rr && !BN_copy(r, rr)) { - goto err; - } - ret = 1; - -err: - BN_CTX_end(ctx); - return ret; -} - -int BN_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) { - if (!bn_mul_impl(r, a, b, ctx)) { - return 0; - } - - // This additionally fixes any negative zeros created by |bn_mul_impl|. - bn_set_minimal_width(r); - return 1; -} - -int bn_mul_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) { - // Prevent negative zeros. - if (a->neg || b->neg) { - OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER); - return 0; - } - - return bn_mul_impl(r, a, b, ctx); -} - -void bn_mul_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a, size_t num_a, - const BN_ULONG *b, size_t num_b) { - if (num_r != num_a + num_b) { - abort(); - } - // TODO(davidben): Should this call |bn_mul_comba4| too? |BN_mul| does not - // hit that code. 
- if (num_a == 8 && num_b == 8) { - bn_mul_comba8(r, a, b); - } else { - bn_mul_normal(r, a, num_a, b, num_b); - } -} - -// tmp must have 2*n words -static void bn_sqr_normal(BN_ULONG *r, const BN_ULONG *a, size_t n, - BN_ULONG *tmp) { - if (n == 0) { - return; - } - - size_t max = n * 2; - const BN_ULONG *ap = a; - BN_ULONG *rp = r; - rp[0] = rp[max - 1] = 0; - rp++; - - // Compute the contribution of a[i] * a[j] for all i < j. - if (n > 1) { - ap++; - rp[n - 1] = bn_mul_words(rp, ap, n - 1, ap[-1]); - rp += 2; - } - if (n > 2) { - for (size_t i = n - 2; i > 0; i--) { - ap++; - rp[i] = bn_mul_add_words(rp, ap, i, ap[-1]); - rp += 2; - } - } - - // The final result fits in |max| words, so none of the following operations - // will overflow. - - // Double |r|, giving the contribution of a[i] * a[j] for all i != j. - bn_add_words(r, r, r, max); - - // Add in the contribution of a[i] * a[i] for all i. - bn_sqr_words(tmp, a, n); - bn_add_words(r, r, tmp, max); -} - -// bn_sqr_recursive sets |r| to |a|^2, using |t| as scratch space. |r| has -// length 2*|n2|, |a| has length |n2|, and |t| has length 4*|n2|. |n2| must be -// a power of two. -static void bn_sqr_recursive(BN_ULONG *r, const BN_ULONG *a, size_t n2, - BN_ULONG *t) { - // |n2| is a power of two. - assert(n2 != 0 && (n2 & (n2 - 1)) == 0); - - if (n2 == 4) { - bn_sqr_comba4(r, a); - return; - } - if (n2 == 8) { - bn_sqr_comba8(r, a); - return; - } - if (n2 < BN_SQR_RECURSIVE_SIZE_NORMAL) { - bn_sqr_normal(r, a, n2, t); - return; - } - - // Split |a| into a0,a1, each of size |n|. - // Split |t| into t0,t1,t2,t3, each of size |n|, with the remaining 4*|n| used - // for recursive calls. - // Split |r| into r0,r1,r2,r3. We must contribute a0^2 to r0,r1, 2*a0*a1 to - // r1,r2, and a1^2 to r2,r3. - size_t n = n2 / 2; - BN_ULONG *t_recursive = &t[n2 * 2]; - - // t0 = |a0 - a1|. 
- bn_abs_sub_words(t, a, &a[n], n, &t[n]); - // t2,t3 = t0^2 = |a0 - a1|^2 = a0^2 - 2*a0*a1 + a1^2 - bn_sqr_recursive(&t[n2], t, n, t_recursive); - - // r0,r1 = a0^2 - bn_sqr_recursive(r, a, n, t_recursive); - - // r2,r3 = a1^2 - bn_sqr_recursive(&r[n2], &a[n], n, t_recursive); - - // t0,t1,c = r0,r1 + r2,r3 = a0^2 + a1^2 - BN_ULONG c = bn_add_words(t, r, &r[n2], n2); - // t2,t3,c = t0,t1,c - t2,t3 = 2*a0*a1 - c -= bn_sub_words(&t[n2], t, &t[n2], n2); - - // We now have our three components. Add them together. - // r1,r2,c = r1,r2 + t2,t3,c - c += bn_add_words(&r[n], &r[n], &t[n2], n2); - - // Propagate the carry bit to the end. - for (size_t i = n + n2; i < n2 + n2; i++) { - BN_ULONG old = r[i]; - r[i] = old + c; - c = r[i] < old; - } - - // The square should fit without carries. - assert(c == 0); -} - -int BN_mul_word(BIGNUM *bn, BN_ULONG w) { - if (!bn->width) { - return 1; - } - - if (w == 0) { - BN_zero(bn); - return 1; - } - - BN_ULONG ll = bn_mul_words(bn->d, bn->d, bn->width, w); - if (ll) { - if (!bn_wexpand(bn, bn->width + 1)) { - return 0; - } - bn->d[bn->width++] = ll; - } - - return 1; -} - -int bn_sqr_consttime(BIGNUM *r, const BIGNUM *a, BN_CTX *ctx) { - int al = a->width; - if (al <= 0) { - r->width = 0; - r->neg = 0; - return 1; - } - - int ret = 0; - BN_CTX_start(ctx); - BIGNUM *rr = (a != r) ? r : BN_CTX_get(ctx); - BIGNUM *tmp = BN_CTX_get(ctx); - if (!rr || !tmp) { - goto err; - } - - int max = 2 * al; // Non-zero (from above) - if (!bn_wexpand(rr, max)) { - goto err; - } - - if (al == 4) { - bn_sqr_comba4(rr->d, a->d); - } else if (al == 8) { - bn_sqr_comba8(rr->d, a->d); - } else { - if (al < BN_SQR_RECURSIVE_SIZE_NORMAL) { - BN_ULONG t[BN_SQR_RECURSIVE_SIZE_NORMAL * 2]; - bn_sqr_normal(rr->d, a->d, al, t); - } else { - // If |al| is a power of two, we can use |bn_sqr_recursive|. 
- if (al != 0 && (al & (al - 1)) == 0) { - if (!bn_wexpand(tmp, al * 4)) { - goto err; - } - bn_sqr_recursive(rr->d, a->d, al, tmp->d); - } else { - if (!bn_wexpand(tmp, max)) { - goto err; - } - bn_sqr_normal(rr->d, a->d, al, tmp->d); - } - } - } - - rr->neg = 0; - rr->width = max; - - if (rr != r && !BN_copy(r, rr)) { - goto err; - } - ret = 1; - -err: - BN_CTX_end(ctx); - return ret; -} - -int BN_sqr(BIGNUM *r, const BIGNUM *a, BN_CTX *ctx) { - if (!bn_sqr_consttime(r, a, ctx)) { - return 0; - } - - bn_set_minimal_width(r); - return 1; -} - -void bn_sqr_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a, size_t num_a) { - if (num_r != 2 * num_a || num_a > BN_SMALL_MAX_WORDS) { - abort(); - } - if (num_a == 4) { - bn_sqr_comba4(r, a); - } else if (num_a == 8) { - bn_sqr_comba8(r, a); - } else { - BN_ULONG tmp[2 * BN_SMALL_MAX_WORDS]; - bn_sqr_normal(r, a, num_a, tmp); - OPENSSL_cleanse(tmp, 2 * num_a * sizeof(BN_ULONG)); - } -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/mul.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/bn/mul.cc.inc new file mode 100644 index 00000000..82746ad1 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/bn/mul.cc.inc @@ -0,0 +1,347 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include +#include + +#include +#include + +#include "../../internal.h" +#include "internal.h" + + +using namespace bssl; + +static void bn_mul_normal(BN_ULONG *r, const BN_ULONG *a, size_t na, + const BN_ULONG *b, size_t nb) { + if (na < nb) { + size_t itmp = na; + na = nb; + nb = itmp; + const BN_ULONG *ltmp = a; + a = b; + b = ltmp; + } + BN_ULONG *rr = &(r[na]); + if (nb == 0) { + OPENSSL_memset(r, 0, na * sizeof(BN_ULONG)); + return; + } + rr[0] = bn_mul_words(r, a, na, b[0]); + + for (;;) { + if (--nb == 0) { + return; + } + rr[1] = bn_mul_add_words(&(r[1]), a, na, b[1]); + if (--nb == 0) { + return; + } + rr[2] = bn_mul_add_words(&(r[2]), a, na, b[2]); + if (--nb == 0) { + return; + } + rr[3] = bn_mul_add_words(&(r[3]), a, na, b[3]); + if (--nb == 0) { + return; + } + rr[4] = bn_mul_add_words(&(r[4]), a, na, b[4]); + rr += 4; + r += 4; + b += 4; + } +} + +// bn_sub_part_words sets |r| to |a| - |b|. It returns the borrow bit, which is +// one if the operation underflowed and zero otherwise. |cl| is the common +// length, that is, the shorter of len(a) or len(b). |dl| is the delta length, +// that is, len(a) - len(b). |r|'s length matches the larger of |a| and |b|, or +// cl + abs(dl). +// +// TODO(davidben): Make this take |size_t|. The |cl| + |dl| calling convention +// is confusing. +static BN_ULONG bn_sub_part_words(BN_ULONG *r, const BN_ULONG *a, + const BN_ULONG *b, int cl, int dl) { + assert(cl >= 0); + BN_ULONG borrow = bn_sub_words(r, a, b, cl); + if (dl == 0) { + return borrow; + } + + r += cl; + a += cl; + b += cl; + + if (dl < 0) { + // |a| is shorter than |b|. Complete the subtraction as if the excess words + // in |a| were zeros. + dl = -dl; + for (int i = 0; i < dl; i++) { + r[i] = CRYPTO_subc_w(0, b[i], borrow, &borrow); + } + } else { + // |b| is shorter than |a|. Complete the subtraction as if the excess words + // in |b| were zeros. 
+ for (int i = 0; i < dl; i++) { + r[i] = CRYPTO_subc_w(a[i], 0, borrow, &borrow); + } + } + + return borrow; +} + +// bn_abs_sub_part_words computes |r| = |a| - |b|, storing the absolute value +// and returning a mask of all ones if the result was negative and all zeros if +// the result was positive. |cl| and |dl| follow the |bn_sub_part_words| calling +// convention. +// +// TODO(davidben): Make this take |size_t|. The |cl| + |dl| calling convention +// is confusing. +// +// TODO(davidben): This function used to be used as part of a general Karatsuba +// multiplication implementation, which had to account for differently-sized +// inputs. Now it is only used as part of RSA key generation, which does not +// need all this. +static BN_ULONG bn_abs_sub_part_words(BN_ULONG *r, const BN_ULONG *a, + const BN_ULONG *b, int cl, int dl, + BN_ULONG *tmp) { + BN_ULONG borrow = bn_sub_part_words(tmp, a, b, cl, dl); + bn_sub_part_words(r, b, a, cl, -dl); + int r_len = cl + (dl < 0 ? -dl : dl); + borrow = 0 - borrow; + bn_select_words(r, borrow, r /* tmp < 0 */, tmp /* tmp >= 0 */, r_len); + return borrow; +} + +int bssl::bn_abs_sub_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, + BN_CTX *ctx) { + int cl = a->width < b->width ? a->width : b->width; + int dl = a->width - b->width; + int r_len = a->width < b->width ? b->width : a->width; + BN_CTXScope scope(ctx); + BIGNUM *tmp = BN_CTX_get(ctx); + if (tmp == nullptr || !bn_wexpand(r, r_len) || !bn_wexpand(tmp, r_len)) { + return 0; + } + bn_abs_sub_part_words(r->d, a->d, b->d, cl, dl, tmp->d); + r->width = r_len; + r->neg = 0; + return 1; +} + +// bn_mul_impl implements |BN_mul| and |bn_mul_consttime|. Note this function +// breaks |BIGNUM| invariants and may return a negative zero. This is handled by +// the callers. 
+static int bn_mul_impl(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, + BN_CTX *ctx) { + int al = a->width; + int bl = b->width; + if (al == 0 || bl == 0) { + BN_zero(r); + return 1; + } + + int i, top; + BIGNUM *rr; + BN_CTXScope scope(ctx); + if (r == a || r == b) { + rr = BN_CTX_get(ctx); + if (rr == nullptr) { + return 0; + } + } else { + rr = r; + } + rr->neg = a->neg ^ b->neg; + + i = al - bl; + if (i == 0) { + if (al == 8) { + if (!bn_wexpand(rr, 16)) { + return 0; + } + rr->width = 16; + bn_mul_comba8(rr->d, a->d, b->d); + goto end; + } + } + + top = al + bl; + if (!bn_wexpand(rr, top)) { + return 0; + } + rr->width = top; + bn_mul_normal(rr->d, a->d, al, b->d, bl); + +end: + if (r != rr && !BN_copy(r, rr)) { + return 0; + } + return 1; +} + +int BN_mul(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) { + if (!bn_mul_impl(r, a, b, ctx)) { + return 0; + } + + // This additionally fixes any negative zeros created by |bn_mul_impl|. + bn_set_minimal_width(r); + return 1; +} + +int bssl::bn_mul_consttime(BIGNUM *r, const BIGNUM *a, const BIGNUM *b, + BN_CTX *ctx) { + // Prevent negative zeros. + if (a->neg || b->neg) { + OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER); + return 0; + } + + return bn_mul_impl(r, a, b, ctx); +} + +void bssl::bn_mul_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a, + size_t num_a, const BN_ULONG *b, size_t num_b) { + if (num_r != num_a + num_b) { + abort(); + } + // TODO(davidben): Should this call |bn_mul_comba4| too? |BN_mul| does not + // hit that code. + if (num_a == 8 && num_b == 8) { + bn_mul_comba8(r, a, b); + } else { + bn_mul_normal(r, a, num_a, b, num_b); + } +} + +static void bn_sqr_normal(BN_ULONG *r, const BN_ULONG *a, size_t n) { + if (n == 0) { + return; + } + + size_t max = n * 2; + const BN_ULONG *ap = a; + BN_ULONG *rp = r; + rp[0] = rp[max - 1] = 0; + rp++; + + // Compute the contribution of a[i] * a[j] for all i < j. 
+ if (n > 1) { + ap++; + rp[n - 1] = bn_mul_words(rp, ap, n - 1, ap[-1]); + rp += 2; + } + if (n > 2) { + for (size_t i = n - 2; i > 0; i--) { + ap++; + rp[i] = bn_mul_add_words(rp, ap, i, ap[-1]); + rp += 2; + } + } + + // The final result fits in |max| words, so none of the following operations + // will overflow. + + // Double |r|, giving the contribution of a[i] * a[j] for all i != j. + bn_add_words(r, r, r, max); + + // Add in the contribution of a[i] * a[i] for all i. + bn_sqr_add_words(r, a, n); +} + +int BN_mul_word(BIGNUM *bn, BN_ULONG w) { + if (!bn->width) { + return 1; + } + + if (w == 0) { + BN_zero(bn); + return 1; + } + + BN_ULONG ll = bn_mul_words(bn->d, bn->d, bn->width, w); + if (ll) { + if (!bn_wexpand(bn, bn->width + 1)) { + return 0; + } + bn->d[bn->width++] = ll; + } + + return 1; +} + +int bssl::bn_sqr_consttime(BIGNUM *r, const BIGNUM *a, BN_CTX *ctx) { + int al = a->width; + if (al <= 0) { + r->width = 0; + r->neg = 0; + return 1; + } + + BN_CTXScope scope(ctx); + BIGNUM *rr = (a != r) ? 
r : BN_CTX_get(ctx); + if (!rr) { + return 0; + } + + int max = 2 * al; // Non-zero (from above) + if (!bn_wexpand(rr, max)) { + return 0; + } + + if (al == 4) { + bn_sqr_comba4(rr->d, a->d); + } else if (al == 8) { + bn_sqr_comba8(rr->d, a->d); + } else { + bn_sqr_normal(rr->d, a->d, al); + } + + rr->neg = 0; + rr->width = max; + + if (rr != r && !BN_copy(r, rr)) { + return 0; + } + return 1; +} + +int BN_sqr(BIGNUM *r, const BIGNUM *a, BN_CTX *ctx) { + if (!bn_sqr_consttime(r, a, ctx)) { + return 0; + } + + bn_set_minimal_width(r); + return 1; +} + +void bssl::bn_sqr_small(BN_ULONG *r, size_t num_r, const BN_ULONG *a, + size_t num_a) { + assert(r != a); + if (num_r != 2 * num_a) { + abort(); + } + if (num_a == 4) { + bn_sqr_comba4(r, a); + } else if (num_a == 8) { + bn_sqr_comba8(r, a); + } else { + bn_sqr_normal(r, a, num_a); + } +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/prime.c b/third_party/boringssl/src/crypto/fipsmodule/bn/prime.c deleted file mode 100644 index 05785582..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/bn/prime.c +++ /dev/null @@ -1,1078 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. 
- * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] - */ -/* ==================================================================== - * Copyright (c) 1998-2001 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. 
Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). */ - -#include - -#include -#include - -#include "internal.h" -#include "../../internal.h" - - -// kPrimes contains the first 1024 primes. 
-static const uint16_t kPrimes[] = { - 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, - 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, - 97, 101, 103, 107, 109, 113, 127, 131, 137, 139, 149, 151, - 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, 223, - 227, 229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281, - 283, 293, 307, 311, 313, 317, 331, 337, 347, 349, 353, 359, - 367, 373, 379, 383, 389, 397, 401, 409, 419, 421, 431, 433, - 439, 443, 449, 457, 461, 463, 467, 479, 487, 491, 499, 503, - 509, 521, 523, 541, 547, 557, 563, 569, 571, 577, 587, 593, - 599, 601, 607, 613, 617, 619, 631, 641, 643, 647, 653, 659, - 661, 673, 677, 683, 691, 701, 709, 719, 727, 733, 739, 743, - 751, 757, 761, 769, 773, 787, 797, 809, 811, 821, 823, 827, - 829, 839, 853, 857, 859, 863, 877, 881, 883, 887, 907, 911, - 919, 929, 937, 941, 947, 953, 967, 971, 977, 983, 991, 997, - 1009, 1013, 1019, 1021, 1031, 1033, 1039, 1049, 1051, 1061, 1063, 1069, - 1087, 1091, 1093, 1097, 1103, 1109, 1117, 1123, 1129, 1151, 1153, 1163, - 1171, 1181, 1187, 1193, 1201, 1213, 1217, 1223, 1229, 1231, 1237, 1249, - 1259, 1277, 1279, 1283, 1289, 1291, 1297, 1301, 1303, 1307, 1319, 1321, - 1327, 1361, 1367, 1373, 1381, 1399, 1409, 1423, 1427, 1429, 1433, 1439, - 1447, 1451, 1453, 1459, 1471, 1481, 1483, 1487, 1489, 1493, 1499, 1511, - 1523, 1531, 1543, 1549, 1553, 1559, 1567, 1571, 1579, 1583, 1597, 1601, - 1607, 1609, 1613, 1619, 1621, 1627, 1637, 1657, 1663, 1667, 1669, 1693, - 1697, 1699, 1709, 1721, 1723, 1733, 1741, 1747, 1753, 1759, 1777, 1783, - 1787, 1789, 1801, 1811, 1823, 1831, 1847, 1861, 1867, 1871, 1873, 1877, - 1879, 1889, 1901, 1907, 1913, 1931, 1933, 1949, 1951, 1973, 1979, 1987, - 1993, 1997, 1999, 2003, 2011, 2017, 2027, 2029, 2039, 2053, 2063, 2069, - 2081, 2083, 2087, 2089, 2099, 2111, 2113, 2129, 2131, 2137, 2141, 2143, - 2153, 2161, 2179, 2203, 2207, 2213, 2221, 2237, 2239, 2243, 2251, 2267, - 2269, 2273, 2281, 2287, 2293, 2297, 2309, 2311, 2333, 2339, 2341, 2347, - 2351, 2357, 
2371, 2377, 2381, 2383, 2389, 2393, 2399, 2411, 2417, 2423, - 2437, 2441, 2447, 2459, 2467, 2473, 2477, 2503, 2521, 2531, 2539, 2543, - 2549, 2551, 2557, 2579, 2591, 2593, 2609, 2617, 2621, 2633, 2647, 2657, - 2659, 2663, 2671, 2677, 2683, 2687, 2689, 2693, 2699, 2707, 2711, 2713, - 2719, 2729, 2731, 2741, 2749, 2753, 2767, 2777, 2789, 2791, 2797, 2801, - 2803, 2819, 2833, 2837, 2843, 2851, 2857, 2861, 2879, 2887, 2897, 2903, - 2909, 2917, 2927, 2939, 2953, 2957, 2963, 2969, 2971, 2999, 3001, 3011, - 3019, 3023, 3037, 3041, 3049, 3061, 3067, 3079, 3083, 3089, 3109, 3119, - 3121, 3137, 3163, 3167, 3169, 3181, 3187, 3191, 3203, 3209, 3217, 3221, - 3229, 3251, 3253, 3257, 3259, 3271, 3299, 3301, 3307, 3313, 3319, 3323, - 3329, 3331, 3343, 3347, 3359, 3361, 3371, 3373, 3389, 3391, 3407, 3413, - 3433, 3449, 3457, 3461, 3463, 3467, 3469, 3491, 3499, 3511, 3517, 3527, - 3529, 3533, 3539, 3541, 3547, 3557, 3559, 3571, 3581, 3583, 3593, 3607, - 3613, 3617, 3623, 3631, 3637, 3643, 3659, 3671, 3673, 3677, 3691, 3697, - 3701, 3709, 3719, 3727, 3733, 3739, 3761, 3767, 3769, 3779, 3793, 3797, - 3803, 3821, 3823, 3833, 3847, 3851, 3853, 3863, 3877, 3881, 3889, 3907, - 3911, 3917, 3919, 3923, 3929, 3931, 3943, 3947, 3967, 3989, 4001, 4003, - 4007, 4013, 4019, 4021, 4027, 4049, 4051, 4057, 4073, 4079, 4091, 4093, - 4099, 4111, 4127, 4129, 4133, 4139, 4153, 4157, 4159, 4177, 4201, 4211, - 4217, 4219, 4229, 4231, 4241, 4243, 4253, 4259, 4261, 4271, 4273, 4283, - 4289, 4297, 4327, 4337, 4339, 4349, 4357, 4363, 4373, 4391, 4397, 4409, - 4421, 4423, 4441, 4447, 4451, 4457, 4463, 4481, 4483, 4493, 4507, 4513, - 4517, 4519, 4523, 4547, 4549, 4561, 4567, 4583, 4591, 4597, 4603, 4621, - 4637, 4639, 4643, 4649, 4651, 4657, 4663, 4673, 4679, 4691, 4703, 4721, - 4723, 4729, 4733, 4751, 4759, 4783, 4787, 4789, 4793, 4799, 4801, 4813, - 4817, 4831, 4861, 4871, 4877, 4889, 4903, 4909, 4919, 4931, 4933, 4937, - 4943, 4951, 4957, 4967, 4969, 4973, 4987, 4993, 4999, 5003, 5009, 5011, - 5021, 5023, 
5039, 5051, 5059, 5077, 5081, 5087, 5099, 5101, 5107, 5113, - 5119, 5147, 5153, 5167, 5171, 5179, 5189, 5197, 5209, 5227, 5231, 5233, - 5237, 5261, 5273, 5279, 5281, 5297, 5303, 5309, 5323, 5333, 5347, 5351, - 5381, 5387, 5393, 5399, 5407, 5413, 5417, 5419, 5431, 5437, 5441, 5443, - 5449, 5471, 5477, 5479, 5483, 5501, 5503, 5507, 5519, 5521, 5527, 5531, - 5557, 5563, 5569, 5573, 5581, 5591, 5623, 5639, 5641, 5647, 5651, 5653, - 5657, 5659, 5669, 5683, 5689, 5693, 5701, 5711, 5717, 5737, 5741, 5743, - 5749, 5779, 5783, 5791, 5801, 5807, 5813, 5821, 5827, 5839, 5843, 5849, - 5851, 5857, 5861, 5867, 5869, 5879, 5881, 5897, 5903, 5923, 5927, 5939, - 5953, 5981, 5987, 6007, 6011, 6029, 6037, 6043, 6047, 6053, 6067, 6073, - 6079, 6089, 6091, 6101, 6113, 6121, 6131, 6133, 6143, 6151, 6163, 6173, - 6197, 6199, 6203, 6211, 6217, 6221, 6229, 6247, 6257, 6263, 6269, 6271, - 6277, 6287, 6299, 6301, 6311, 6317, 6323, 6329, 6337, 6343, 6353, 6359, - 6361, 6367, 6373, 6379, 6389, 6397, 6421, 6427, 6449, 6451, 6469, 6473, - 6481, 6491, 6521, 6529, 6547, 6551, 6553, 6563, 6569, 6571, 6577, 6581, - 6599, 6607, 6619, 6637, 6653, 6659, 6661, 6673, 6679, 6689, 6691, 6701, - 6703, 6709, 6719, 6733, 6737, 6761, 6763, 6779, 6781, 6791, 6793, 6803, - 6823, 6827, 6829, 6833, 6841, 6857, 6863, 6869, 6871, 6883, 6899, 6907, - 6911, 6917, 6947, 6949, 6959, 6961, 6967, 6971, 6977, 6983, 6991, 6997, - 7001, 7013, 7019, 7027, 7039, 7043, 7057, 7069, 7079, 7103, 7109, 7121, - 7127, 7129, 7151, 7159, 7177, 7187, 7193, 7207, 7211, 7213, 7219, 7229, - 7237, 7243, 7247, 7253, 7283, 7297, 7307, 7309, 7321, 7331, 7333, 7349, - 7351, 7369, 7393, 7411, 7417, 7433, 7451, 7457, 7459, 7477, 7481, 7487, - 7489, 7499, 7507, 7517, 7523, 7529, 7537, 7541, 7547, 7549, 7559, 7561, - 7573, 7577, 7583, 7589, 7591, 7603, 7607, 7621, 7639, 7643, 7649, 7669, - 7673, 7681, 7687, 7691, 7699, 7703, 7717, 7723, 7727, 7741, 7753, 7757, - 7759, 7789, 7793, 7817, 7823, 7829, 7841, 7853, 7867, 7873, 7877, 7879, - 7883, 7901, 
7907, 7919, 7927, 7933, 7937, 7949, 7951, 7963, 7993, 8009, - 8011, 8017, 8039, 8053, 8059, 8069, 8081, 8087, 8089, 8093, 8101, 8111, - 8117, 8123, 8147, 8161, -}; - -// BN_prime_checks_for_size returns the number of Miller-Rabin iterations -// necessary for generating a 'bits'-bit candidate prime. -// -// -// This table is generated using the algorithm of FIPS PUB 186-4 -// Digital Signature Standard (DSS), section F.1, page 117. -// (https://doi.org/10.6028/NIST.FIPS.186-4) -// The following magma script was used to generate the output: -// securitybits:=125; -// k:=1024; -// for t:=1 to 65 do -// for M:=3 to Floor(2*Sqrt(k-1)-1) do -// S:=0; -// // Sum over m -// for m:=3 to M do -// s:=0; -// // Sum over j -// for j:=2 to m do -// s+:=(RealField(32)!2)^-(j+(k-1)/j); -// end for; -// S+:=2^(m-(m-1)*t)*s; -// end for; -// A:=2^(k-2-M*t); -// B:=8*(Pi(RealField(32))^2-6)/3*2^(k-2)*S; -// pkt:=2.00743*Log(2)*k*2^-k*(A+B); -// seclevel:=Floor(-Log(2,pkt)); -// if seclevel ge securitybits then -// printf "k: %5o, security: %o bits (t: %o, M: %o)\n",k,seclevel,t,M; -// break; -// end if; -// end for; -// if seclevel ge securitybits then break; end if; -// end for; -// -// It can be run online at: http://magma.maths.usyd.edu.au/calc -// And will output: -// k: 1024, security: 129 bits (t: 6, M: 23) -// k is the number of bits of the prime, securitybits is the level we want to -// reach. 
-// prime length | RSA key size | # MR tests | security level -// -------------+--------------|------------+--------------- -// (b) >= 6394 | >= 12788 | 3 | 256 bit -// (b) >= 3747 | >= 7494 | 3 | 192 bit -// (b) >= 1345 | >= 2690 | 4 | 128 bit -// (b) >= 1080 | >= 2160 | 5 | 128 bit -// (b) >= 852 | >= 1704 | 5 | 112 bit -// (b) >= 476 | >= 952 | 5 | 80 bit -// (b) >= 400 | >= 800 | 6 | 80 bit -// (b) >= 347 | >= 694 | 7 | 80 bit -// (b) >= 308 | >= 616 | 8 | 80 bit -// (b) >= 55 | >= 110 | 27 | 64 bit -// (b) >= 6 | >= 12 | 34 | 64 bit -static int BN_prime_checks_for_size(int bits) { - if (bits >= 3747) { - return 3; - } - if (bits >= 1345) { - return 4; - } - if (bits >= 476) { - return 5; - } - if (bits >= 400) { - return 6; - } - if (bits >= 347) { - return 7; - } - if (bits >= 308) { - return 8; - } - if (bits >= 55) { - return 27; - } - return 34; -} - -// num_trial_division_primes returns the number of primes to try with trial -// division before using more expensive checks. For larger numbers, the value -// of excluding a candidate with trial division is larger. -static size_t num_trial_division_primes(const BIGNUM *n) { - if (n->width * BN_BITS2 > 1024) { - return OPENSSL_ARRAY_SIZE(kPrimes); - } - return OPENSSL_ARRAY_SIZE(kPrimes) / 2; -} - -// BN_PRIME_CHECKS_BLINDED is the iteration count for blinding the constant-time -// primality test. See |BN_primality_test| for details. This number is selected -// so that, for a candidate N-bit RSA prime, picking |BN_PRIME_CHECKS_BLINDED| -// random N-bit numbers will have at least |BN_prime_checks_for_size(N)| values -// in range with high probability. -// -// The following Python script computes the blinding factor needed for the -// corresponding iteration count. -/* -import math - -# We choose candidate RSA primes between sqrt(2)/2 * 2^N and 2^N and select -# witnesses by generating random N-bit numbers. Thus the probability of -# selecting one in range is at least sqrt(2)/2. 
-p = math.sqrt(2) / 2 - -# Target around 2^-8 probability of the blinding being insufficient given that -# key generation is a one-time, noisy operation. -epsilon = 2**-8 - -def choose(a, b): - r = 1 - for i in xrange(b): - r *= a - i - r /= (i + 1) - return r - -def failure_rate(min_uniform, iterations): - """ Returns the probability that, for |iterations| candidate witnesses, fewer - than |min_uniform| of them will be uniform. """ - prob = 0.0 - for i in xrange(min_uniform): - prob += (choose(iterations, i) * - p**i * (1-p)**(iterations - i)) - return prob - -for min_uniform in (3, 4, 5, 6, 8, 13, 19, 28): - # Find the smallest number of iterations under the target failure rate. - iterations = min_uniform - while True: - prob = failure_rate(min_uniform, iterations) - if prob < epsilon: - print min_uniform, iterations, prob - break - iterations += 1 - -Output: - 3 9 0.00368894873911 - 4 11 0.00363319494662 - 5 13 0.00336215573898 - 6 15 0.00300145783158 - 8 19 0.00225214119331 - 13 27 0.00385610026955 - 19 38 0.0021410539126 - 28 52 0.00325405801769 - -16 iterations suffices for 400-bit primes and larger (6 uniform samples needed), -which is already well below the minimum acceptable key size for RSA. 
-*/ -#define BN_PRIME_CHECKS_BLINDED 16 - -static int probable_prime(BIGNUM *rnd, int bits); -static int probable_prime_dh(BIGNUM *rnd, int bits, const BIGNUM *add, - const BIGNUM *rem, BN_CTX *ctx); -static int probable_prime_dh_safe(BIGNUM *rnd, int bits, const BIGNUM *add, - const BIGNUM *rem, BN_CTX *ctx); - -BN_GENCB *BN_GENCB_new(void) { - BN_GENCB *callback = OPENSSL_malloc(sizeof(BN_GENCB)); - if (callback == NULL) { - OPENSSL_PUT_ERROR(BN, ERR_R_MALLOC_FAILURE); - return NULL; - } - OPENSSL_memset(callback, 0, sizeof(BN_GENCB)); - return callback; -} - -void BN_GENCB_free(BN_GENCB *callback) { OPENSSL_free(callback); } - -void BN_GENCB_set(BN_GENCB *callback, - int (*f)(int event, int n, struct bn_gencb_st *), - void *arg) { - callback->callback = f; - callback->arg = arg; -} - -int BN_GENCB_call(BN_GENCB *callback, int event, int n) { - if (!callback) { - return 1; - } - - return callback->callback(event, n, callback); -} - -void *BN_GENCB_get_arg(const BN_GENCB *callback) { return callback->arg; } - -int BN_generate_prime_ex(BIGNUM *ret, int bits, int safe, const BIGNUM *add, - const BIGNUM *rem, BN_GENCB *cb) { - BIGNUM *t; - int found = 0; - int i, j, c1 = 0; - BN_CTX *ctx; - int checks = BN_prime_checks_for_size(bits); - - if (bits < 2) { - // There are no prime numbers this small. - OPENSSL_PUT_ERROR(BN, BN_R_BITS_TOO_SMALL); - return 0; - } else if (bits == 2 && safe) { - // The smallest safe prime (7) is three bits. 
- OPENSSL_PUT_ERROR(BN, BN_R_BITS_TOO_SMALL); - return 0; - } - - ctx = BN_CTX_new(); - if (ctx == NULL) { - goto err; - } - BN_CTX_start(ctx); - t = BN_CTX_get(ctx); - if (!t) { - goto err; - } - -loop: - // make a random number and set the top and bottom bits - if (add == NULL) { - if (!probable_prime(ret, bits)) { - goto err; - } - } else { - if (safe) { - if (!probable_prime_dh_safe(ret, bits, add, rem, ctx)) { - goto err; - } - } else { - if (!probable_prime_dh(ret, bits, add, rem, ctx)) { - goto err; - } - } - } - - if (!BN_GENCB_call(cb, BN_GENCB_GENERATED, c1++)) { - // aborted - goto err; - } - - if (!safe) { - i = BN_is_prime_fasttest_ex(ret, checks, ctx, 0, cb); - if (i == -1) { - goto err; - } else if (i == 0) { - goto loop; - } - } else { - // for "safe prime" generation, check that (p-1)/2 is prime. Since a prime - // is odd, We just need to divide by 2 - if (!BN_rshift1(t, ret)) { - goto err; - } - - // Interleave |ret| and |t|'s primality tests to avoid paying the full - // iteration count on |ret| only to quickly discover |t| is composite. - // - // TODO(davidben): This doesn't quite work because an iteration count of 1 - // still runs the blinding mechanism. 
- for (i = 0; i < checks; i++) { - j = BN_is_prime_fasttest_ex(ret, 1, ctx, 0, NULL); - if (j == -1) { - goto err; - } else if (j == 0) { - goto loop; - } - - j = BN_is_prime_fasttest_ex(t, 1, ctx, 0, NULL); - if (j == -1) { - goto err; - } else if (j == 0) { - goto loop; - } - - if (!BN_GENCB_call(cb, BN_GENCB_PRIME_TEST, i)) { - goto err; - } - // We have a safe prime test pass - } - } - - // we have a prime :-) - found = 1; - -err: - if (ctx != NULL) { - BN_CTX_end(ctx); - BN_CTX_free(ctx); - } - - return found; -} - -static int bn_trial_division(uint16_t *out, const BIGNUM *bn) { - const size_t num_primes = num_trial_division_primes(bn); - for (size_t i = 1; i < num_primes; i++) { - if (bn_mod_u16_consttime(bn, kPrimes[i]) == 0) { - *out = kPrimes[i]; - return 1; - } - } - return 0; -} - -int bn_odd_number_is_obviously_composite(const BIGNUM *bn) { - uint16_t prime; - return bn_trial_division(&prime, bn) && !BN_is_word(bn, prime); -} - -int bn_miller_rabin_init(BN_MILLER_RABIN *miller_rabin, const BN_MONT_CTX *mont, - BN_CTX *ctx) { - // This function corresponds to steps 1 through 3 of FIPS 186-4, C.3.1. - const BIGNUM *w = &mont->N; - // Note we do not call |BN_CTX_start| in this function. We intentionally - // allocate values in the containing scope so they outlive this function. - miller_rabin->w1 = BN_CTX_get(ctx); - miller_rabin->m = BN_CTX_get(ctx); - miller_rabin->one_mont = BN_CTX_get(ctx); - miller_rabin->w1_mont = BN_CTX_get(ctx); - if (miller_rabin->w1 == NULL || - miller_rabin->m == NULL || - miller_rabin->one_mont == NULL || - miller_rabin->w1_mont == NULL) { - return 0; - } - - // See FIPS 186-4, C.3.1, steps 1 through 3. 
- if (!bn_usub_consttime(miller_rabin->w1, w, BN_value_one())) { - return 0; - } - miller_rabin->a = BN_count_low_zero_bits(miller_rabin->w1); - if (!bn_rshift_secret_shift(miller_rabin->m, miller_rabin->w1, - miller_rabin->a, ctx)) { - return 0; - } - miller_rabin->w_bits = BN_num_bits(w); - - // Precompute some values in Montgomery form. - if (!bn_one_to_montgomery(miller_rabin->one_mont, mont, ctx) || - // w - 1 is -1 mod w, so we can compute it in the Montgomery domain, -R, - // with a subtraction. (|one_mont| cannot be zero.) - !bn_usub_consttime(miller_rabin->w1_mont, w, miller_rabin->one_mont)) { - return 0; - } - - return 1; -} - -int bn_miller_rabin_iteration(const BN_MILLER_RABIN *miller_rabin, - int *out_is_possibly_prime, const BIGNUM *b, - const BN_MONT_CTX *mont, BN_CTX *ctx) { - // This function corresponds to steps 4.3 through 4.5 of FIPS 186-4, C.3.1. - int ret = 0; - BN_CTX_start(ctx); - - // Step 4.3. We use Montgomery-encoding for better performance and to avoid - // timing leaks. - const BIGNUM *w = &mont->N; - BIGNUM *z = BN_CTX_get(ctx); - if (z == NULL || - !BN_mod_exp_mont_consttime(z, b, miller_rabin->m, w, ctx, mont) || - !BN_to_montgomery(z, z, mont, ctx)) { - goto err; - } - - // is_possibly_prime is all ones if we have determined |b| is not a composite - // witness for |w|. This is equivalent to going to step 4.7 in the original - // algorithm. To avoid timing leaks, we run the algorithm to the end for prime - // inputs. - crypto_word_t is_possibly_prime = 0; - - // Step 4.4. If z = 1 or z = w-1, b is not a composite witness and w is still - // possibly prime. - is_possibly_prime = BN_equal_consttime(z, miller_rabin->one_mont) | - BN_equal_consttime(z, miller_rabin->w1_mont); - is_possibly_prime = 0 - is_possibly_prime; // Make it all zeros or all ones. - - // Step 4.5. - // - // To avoid leaking |a|, we run the loop to |w_bits| and mask off all - // iterations once |j| = |a|. 
- for (int j = 1; j < miller_rabin->w_bits; j++) { - if (constant_time_eq_int(j, miller_rabin->a) & ~is_possibly_prime) { - // If the loop is done and we haven't seen z = 1 or z = w-1 yet, the - // value is composite and we can break in variable time. - break; - } - - // Step 4.5.1. - if (!BN_mod_mul_montgomery(z, z, z, mont, ctx)) { - goto err; - } - - // Step 4.5.2. If z = w-1 and the loop is not done, this is not a composite - // witness. - crypto_word_t z_is_w1_mont = BN_equal_consttime(z, miller_rabin->w1_mont); - z_is_w1_mont = 0 - z_is_w1_mont; // Make it all zeros or all ones. - is_possibly_prime |= z_is_w1_mont; // Go to step 4.7 if |z_is_w1_mont|. - - // Step 4.5.3. If z = 1 and the loop is not done, the previous value of z - // was not -1. There are no non-trivial square roots of 1 modulo a prime, so - // w is composite and we may exit in variable time. - if (BN_equal_consttime(z, miller_rabin->one_mont) & ~is_possibly_prime) { - break; - } - } - - *out_is_possibly_prime = is_possibly_prime & 1; - ret = 1; - -err: - BN_CTX_end(ctx); - return ret; -} - -int BN_primality_test(int *out_is_probably_prime, const BIGNUM *w, int checks, - BN_CTX *ctx, int do_trial_division, BN_GENCB *cb) { - // This function's secrecy and performance requirements come from RSA key - // generation. We generate RSA keys by selecting two large, secret primes with - // rejection sampling. - // - // We thus treat |w| as secret if turns out to be a large prime. However, if - // |w| is composite, we treat this and |w| itself as public. (Conversely, if - // |w| is prime, that it is prime is public. Only the value is secret.) This - // is fine for RSA key generation, but note it is important that we use - // rejection sampling, with each candidate prime chosen independently. This - // would not work for, e.g., an algorithm which looked for primes in - // consecutive integers. These assumptions allow us to discard composites - // quickly. 
We additionally treat |w| as public when it is a small prime to - // simplify trial decryption and some edge cases. - // - // One RSA key generation will call this function on exactly two primes and - // many more composites. The overall cost is a combination of several factors: - // - // 1. Checking if |w| is divisible by a small prime is much faster than - // learning it is composite by Miller-Rabin (see below for details on that - // cost). Trial division by p saves 1/p of Miller-Rabin calls, so this is - // worthwhile until p exceeds the ratio of the two costs. - // - // 2. For a random (i.e. non-adversarial) candidate large prime and candidate - // witness, the probability of false witness is very low. (This is why FIPS - // 186-4 only requires a few iterations.) Thus composites not discarded by - // trial decryption, in practice, cost one Miller-Rabin iteration. Only the - // two actual primes cost the full iteration count. - // - // 3. A Miller-Rabin iteration is a modular exponentiation plus |a| additional - // modular squares, where |a| is the number of factors of two in |w-1|. |a| - // is likely small (the distribution falls exponentially), but it is also - // potentially secret, so we loop up to its log(w) upper bound when |w| is - // prime. When |w| is composite, we break early, so only two calls pay this - // cost. (Note that all calls pay the modular exponentiation which is, - // itself, log(w) modular multiplications and squares.) - // - // 4. While there are only two prime calls, they multiplicatively pay the full - // costs of (2) and (3). - // - // 5. After the primes are chosen, RSA keys derive some values from the - // primes, but this cost is negligible in comparison. - - *out_is_probably_prime = 0; - - if (BN_cmp(w, BN_value_one()) <= 0) { - return 1; - } - - if (!BN_is_odd(w)) { - // The only even prime is two. - *out_is_probably_prime = BN_is_word(w, 2); - return 1; - } - - // Miller-Rabin does not work for three. 
- if (BN_is_word(w, 3)) { - *out_is_probably_prime = 1; - return 1; - } - - if (do_trial_division) { - // Perform additional trial division checks to discard small primes. - uint16_t prime; - if (bn_trial_division(&prime, w)) { - *out_is_probably_prime = BN_is_word(w, prime); - return 1; - } - if (!BN_GENCB_call(cb, BN_GENCB_PRIME_TEST, -1)) { - return 0; - } - } - - if (checks == BN_prime_checks_for_generation) { - checks = BN_prime_checks_for_size(BN_num_bits(w)); - } - - BN_CTX *new_ctx = NULL; - if (ctx == NULL) { - new_ctx = BN_CTX_new(); - if (new_ctx == NULL) { - return 0; - } - ctx = new_ctx; - } - - // See C.3.1 from FIPS 186-4. - int ret = 0; - BN_CTX_start(ctx); - BIGNUM *b = BN_CTX_get(ctx); - BN_MONT_CTX *mont = BN_MONT_CTX_new_consttime(w, ctx); - BN_MILLER_RABIN miller_rabin; - if (b == NULL || mont == NULL || - // Steps 1-3. - !bn_miller_rabin_init(&miller_rabin, mont, ctx)) { - goto err; - } - - // The following loop performs in inner iteration of the Miller-Rabin - // Primality test (Step 4). - // - // The algorithm as specified in FIPS 186-4 leaks information on |w|, the RSA - // private key. Instead, we run through each iteration unconditionally, - // performing modular multiplications, masking off any effects to behave - // equivalently to the specified algorithm. - // - // We also blind the number of values of |b| we try. Steps 4.1–4.2 say to - // discard out-of-range values. To avoid leaking information on |w|, we use - // |bn_rand_secret_range| which, rather than discarding bad values, adjusts - // them to be in range. Though not uniformly selected, these adjusted values - // are still usable as Miller-Rabin checks. - // - // Miller-Rabin is already probabilistic, so we could reach the desired - // confidence levels by just suitably increasing the iteration count. However, - // to align with FIPS 186-4, we use a more pessimal analysis: we do not count - // the non-uniform values towards the iteration count. 
As a result, this - // function is more complex and has more timing risk than necessary. - // - // We count both total iterations and uniform ones and iterate until we've - // reached at least |BN_PRIME_CHECKS_BLINDED| and |iterations|, respectively. - // If the latter is large enough, it will be the limiting factor with high - // probability and we won't leak information. - // - // Note this blinding does not impact most calls when picking primes because - // composites are rejected early. Only the two secret primes see extra work. - - crypto_word_t uniform_iterations = 0; - // Using |constant_time_lt_w| seems to prevent the compiler from optimizing - // this into two jumps. - for (int i = 1; (i <= BN_PRIME_CHECKS_BLINDED) | - constant_time_lt_w(uniform_iterations, checks); - i++) { - // Step 4.1-4.2 - int is_uniform; - if (!bn_rand_secret_range(b, &is_uniform, 2, miller_rabin.w1)) { - goto err; - } - uniform_iterations += is_uniform; - - // Steps 4.3-4.5 - int is_possibly_prime = 0; - if (!bn_miller_rabin_iteration(&miller_rabin, &is_possibly_prime, b, mont, - ctx)) { - goto err; - } - - if (!is_possibly_prime) { - // Step 4.6. We did not see z = w-1 before z = 1, so w must be composite. 
- *out_is_probably_prime = 0; - ret = 1; - goto err; - } - - // Step 4.7 - if (!BN_GENCB_call(cb, BN_GENCB_PRIME_TEST, i - 1)) { - goto err; - } - } - - assert(uniform_iterations >= (crypto_word_t)checks); - *out_is_probably_prime = 1; - ret = 1; - -err: - BN_MONT_CTX_free(mont); - BN_CTX_end(ctx); - BN_CTX_free(new_ctx); - return ret; -} - -int BN_is_prime_ex(const BIGNUM *candidate, int checks, BN_CTX *ctx, - BN_GENCB *cb) { - return BN_is_prime_fasttest_ex(candidate, checks, ctx, 0, cb); -} - -int BN_is_prime_fasttest_ex(const BIGNUM *a, int checks, BN_CTX *ctx, - int do_trial_division, BN_GENCB *cb) { - int is_probably_prime; - if (!BN_primality_test(&is_probably_prime, a, checks, ctx, do_trial_division, - cb)) { - return -1; - } - return is_probably_prime; -} - -int BN_enhanced_miller_rabin_primality_test( - enum bn_primality_result_t *out_result, const BIGNUM *w, int checks, - BN_CTX *ctx, BN_GENCB *cb) { - // Enhanced Miller-Rabin is only valid on odd integers greater than 3. - if (!BN_is_odd(w) || BN_cmp_word(w, 3) <= 0) { - OPENSSL_PUT_ERROR(BN, BN_R_INVALID_INPUT); - return 0; - } - - if (checks == BN_prime_checks_for_generation) { - checks = BN_prime_checks_for_size(BN_num_bits(w)); - } - - int ret = 0; - BN_MONT_CTX *mont = NULL; - - BN_CTX_start(ctx); - - BIGNUM *w1 = BN_CTX_get(ctx); - if (w1 == NULL || - !BN_copy(w1, w) || - !BN_sub_word(w1, 1)) { - goto err; - } - - // Write w1 as m*2^a (Steps 1 and 2). 
- int a = 0; - while (!BN_is_bit_set(w1, a)) { - a++; - } - BIGNUM *m = BN_CTX_get(ctx); - if (m == NULL || - !BN_rshift(m, w1, a)) { - goto err; - } - - BIGNUM *b = BN_CTX_get(ctx); - BIGNUM *g = BN_CTX_get(ctx); - BIGNUM *z = BN_CTX_get(ctx); - BIGNUM *x = BN_CTX_get(ctx); - BIGNUM *x1 = BN_CTX_get(ctx); - if (b == NULL || - g == NULL || - z == NULL || - x == NULL || - x1 == NULL) { - goto err; - } - - // Montgomery setup for computations mod w - mont = BN_MONT_CTX_new_for_modulus(w, ctx); - if (mont == NULL) { - goto err; - } - - // The following loop performs in inner iteration of the Enhanced Miller-Rabin - // Primality test (Step 4). - for (int i = 1; i <= checks; i++) { - // Step 4.1-4.2 - if (!BN_rand_range_ex(b, 2, w1)) { - goto err; - } - - // Step 4.3-4.4 - if (!BN_gcd(g, b, w, ctx)) { - goto err; - } - if (BN_cmp_word(g, 1) > 0) { - *out_result = bn_composite; - ret = 1; - goto err; - } - - // Step 4.5 - if (!BN_mod_exp_mont(z, b, m, w, ctx, mont)) { - goto err; - } - - // Step 4.6 - if (BN_is_one(z) || BN_cmp(z, w1) == 0) { - goto loop; - } - - // Step 4.7 - for (int j = 1; j < a; j++) { - if (!BN_copy(x, z) || !BN_mod_mul(z, x, x, w, ctx)) { - goto err; - } - if (BN_cmp(z, w1) == 0) { - goto loop; - } - if (BN_is_one(z)) { - goto composite; - } - } - - // Step 4.8-4.9 - if (!BN_copy(x, z) || !BN_mod_mul(z, x, x, w, ctx)) { - goto err; - } - - // Step 4.10-4.11 - if (!BN_is_one(z) && !BN_copy(x, z)) { - goto err; - } - - composite: - // Step 4.12-4.14 - if (!BN_copy(x1, x) || - !BN_sub_word(x1, 1) || - !BN_gcd(g, x1, w, ctx)) { - goto err; - } - if (BN_cmp_word(g, 1) > 0) { - *out_result = bn_composite; - } else { - *out_result = bn_non_prime_power_composite; - } - - ret = 1; - goto err; - - loop: - // Step 4.15 - if (!BN_GENCB_call(cb, BN_GENCB_PRIME_TEST, i - 1)) { - goto err; - } - } - - *out_result = bn_probably_prime; - ret = 1; - -err: - BN_MONT_CTX_free(mont); - BN_CTX_end(ctx); - - return ret; -} - -static int probable_prime(BIGNUM *rnd, int 
bits) { - do { - if (!BN_rand(rnd, bits, BN_RAND_TOP_TWO, BN_RAND_BOTTOM_ODD)) { - return 0; - } - } while (bn_odd_number_is_obviously_composite(rnd)); - return 1; -} - -static int probable_prime_dh(BIGNUM *rnd, int bits, const BIGNUM *add, - const BIGNUM *rem, BN_CTX *ctx) { - int ret = 0; - BIGNUM *t1; - - BN_CTX_start(ctx); - if ((t1 = BN_CTX_get(ctx)) == NULL) { - goto err; - } - - if (!BN_rand(rnd, bits, BN_RAND_TOP_ONE, BN_RAND_BOTTOM_ODD)) { - goto err; - } - - // we need ((rnd-rem) % add) == 0 - - if (!BN_mod(t1, rnd, add, ctx)) { - goto err; - } - if (!BN_sub(rnd, rnd, t1)) { - goto err; - } - if (rem == NULL) { - if (!BN_add_word(rnd, 1)) { - goto err; - } - } else { - if (!BN_add(rnd, rnd, rem)) { - goto err; - } - } - // we now have a random number 'rand' to test. - - const size_t num_primes = num_trial_division_primes(rnd); -loop: - for (size_t i = 1; i < num_primes; i++) { - // check that rnd is a prime - if (bn_mod_u16_consttime(rnd, kPrimes[i]) <= 1) { - if (!BN_add(rnd, rnd, add)) { - goto err; - } - goto loop; - } - } - - ret = 1; - -err: - BN_CTX_end(ctx); - return ret; -} - -static int probable_prime_dh_safe(BIGNUM *p, int bits, const BIGNUM *padd, - const BIGNUM *rem, BN_CTX *ctx) { - int ret = 0; - BIGNUM *t1, *qadd, *q; - - bits--; - BN_CTX_start(ctx); - t1 = BN_CTX_get(ctx); - q = BN_CTX_get(ctx); - qadd = BN_CTX_get(ctx); - if (qadd == NULL) { - goto err; - } - - if (!BN_rshift1(qadd, padd)) { - goto err; - } - - if (!BN_rand(q, bits, BN_RAND_TOP_ONE, BN_RAND_BOTTOM_ODD)) { - goto err; - } - - // we need ((rnd-rem) % add) == 0 - if (!BN_mod(t1, q, qadd, ctx)) { - goto err; - } - - if (!BN_sub(q, q, t1)) { - goto err; - } - - if (rem == NULL) { - if (!BN_add_word(q, 1)) { - goto err; - } - } else { - if (!BN_rshift1(t1, rem)) { - goto err; - } - if (!BN_add(q, q, t1)) { - goto err; - } - } - - // we now have a random number 'rand' to test. 
- if (!BN_lshift1(p, q)) { - goto err; - } - if (!BN_add_word(p, 1)) { - goto err; - } - - const size_t num_primes = num_trial_division_primes(p); -loop: - for (size_t i = 1; i < num_primes; i++) { - // check that p and q are prime - // check that for p and q - // gcd(p-1,primes) == 1 (except for 2) - if (bn_mod_u16_consttime(p, kPrimes[i]) == 0 || - bn_mod_u16_consttime(q, kPrimes[i]) == 0) { - if (!BN_add(p, p, padd)) { - goto err; - } - if (!BN_add(q, q, qadd)) { - goto err; - } - goto loop; - } - } - - ret = 1; - -err: - BN_CTX_end(ctx); - return ret; -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/prime.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/bn/prime.cc.inc new file mode 100644 index 00000000..14eac475 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/bn/prime.cc.inc @@ -0,0 +1,931 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include +#include + +#include "../../internal.h" +#include "../../mem_internal.h" +#include "internal.h" + + +using namespace bssl; + +// kPrimes contains the first 1024 primes. 
+static const uint16_t kPrimes[] = { + 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, + 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, + 97, 101, 103, 107, 109, 113, 127, 131, 137, 139, 149, 151, + 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, 223, + 227, 229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281, + 283, 293, 307, 311, 313, 317, 331, 337, 347, 349, 353, 359, + 367, 373, 379, 383, 389, 397, 401, 409, 419, 421, 431, 433, + 439, 443, 449, 457, 461, 463, 467, 479, 487, 491, 499, 503, + 509, 521, 523, 541, 547, 557, 563, 569, 571, 577, 587, 593, + 599, 601, 607, 613, 617, 619, 631, 641, 643, 647, 653, 659, + 661, 673, 677, 683, 691, 701, 709, 719, 727, 733, 739, 743, + 751, 757, 761, 769, 773, 787, 797, 809, 811, 821, 823, 827, + 829, 839, 853, 857, 859, 863, 877, 881, 883, 887, 907, 911, + 919, 929, 937, 941, 947, 953, 967, 971, 977, 983, 991, 997, + 1009, 1013, 1019, 1021, 1031, 1033, 1039, 1049, 1051, 1061, 1063, 1069, + 1087, 1091, 1093, 1097, 1103, 1109, 1117, 1123, 1129, 1151, 1153, 1163, + 1171, 1181, 1187, 1193, 1201, 1213, 1217, 1223, 1229, 1231, 1237, 1249, + 1259, 1277, 1279, 1283, 1289, 1291, 1297, 1301, 1303, 1307, 1319, 1321, + 1327, 1361, 1367, 1373, 1381, 1399, 1409, 1423, 1427, 1429, 1433, 1439, + 1447, 1451, 1453, 1459, 1471, 1481, 1483, 1487, 1489, 1493, 1499, 1511, + 1523, 1531, 1543, 1549, 1553, 1559, 1567, 1571, 1579, 1583, 1597, 1601, + 1607, 1609, 1613, 1619, 1621, 1627, 1637, 1657, 1663, 1667, 1669, 1693, + 1697, 1699, 1709, 1721, 1723, 1733, 1741, 1747, 1753, 1759, 1777, 1783, + 1787, 1789, 1801, 1811, 1823, 1831, 1847, 1861, 1867, 1871, 1873, 1877, + 1879, 1889, 1901, 1907, 1913, 1931, 1933, 1949, 1951, 1973, 1979, 1987, + 1993, 1997, 1999, 2003, 2011, 2017, 2027, 2029, 2039, 2053, 2063, 2069, + 2081, 2083, 2087, 2089, 2099, 2111, 2113, 2129, 2131, 2137, 2141, 2143, + 2153, 2161, 2179, 2203, 2207, 2213, 2221, 2237, 2239, 2243, 2251, 2267, + 2269, 2273, 2281, 2287, 2293, 2297, 2309, 2311, 2333, 2339, 2341, 2347, + 2351, 2357, 
2371, 2377, 2381, 2383, 2389, 2393, 2399, 2411, 2417, 2423, + 2437, 2441, 2447, 2459, 2467, 2473, 2477, 2503, 2521, 2531, 2539, 2543, + 2549, 2551, 2557, 2579, 2591, 2593, 2609, 2617, 2621, 2633, 2647, 2657, + 2659, 2663, 2671, 2677, 2683, 2687, 2689, 2693, 2699, 2707, 2711, 2713, + 2719, 2729, 2731, 2741, 2749, 2753, 2767, 2777, 2789, 2791, 2797, 2801, + 2803, 2819, 2833, 2837, 2843, 2851, 2857, 2861, 2879, 2887, 2897, 2903, + 2909, 2917, 2927, 2939, 2953, 2957, 2963, 2969, 2971, 2999, 3001, 3011, + 3019, 3023, 3037, 3041, 3049, 3061, 3067, 3079, 3083, 3089, 3109, 3119, + 3121, 3137, 3163, 3167, 3169, 3181, 3187, 3191, 3203, 3209, 3217, 3221, + 3229, 3251, 3253, 3257, 3259, 3271, 3299, 3301, 3307, 3313, 3319, 3323, + 3329, 3331, 3343, 3347, 3359, 3361, 3371, 3373, 3389, 3391, 3407, 3413, + 3433, 3449, 3457, 3461, 3463, 3467, 3469, 3491, 3499, 3511, 3517, 3527, + 3529, 3533, 3539, 3541, 3547, 3557, 3559, 3571, 3581, 3583, 3593, 3607, + 3613, 3617, 3623, 3631, 3637, 3643, 3659, 3671, 3673, 3677, 3691, 3697, + 3701, 3709, 3719, 3727, 3733, 3739, 3761, 3767, 3769, 3779, 3793, 3797, + 3803, 3821, 3823, 3833, 3847, 3851, 3853, 3863, 3877, 3881, 3889, 3907, + 3911, 3917, 3919, 3923, 3929, 3931, 3943, 3947, 3967, 3989, 4001, 4003, + 4007, 4013, 4019, 4021, 4027, 4049, 4051, 4057, 4073, 4079, 4091, 4093, + 4099, 4111, 4127, 4129, 4133, 4139, 4153, 4157, 4159, 4177, 4201, 4211, + 4217, 4219, 4229, 4231, 4241, 4243, 4253, 4259, 4261, 4271, 4273, 4283, + 4289, 4297, 4327, 4337, 4339, 4349, 4357, 4363, 4373, 4391, 4397, 4409, + 4421, 4423, 4441, 4447, 4451, 4457, 4463, 4481, 4483, 4493, 4507, 4513, + 4517, 4519, 4523, 4547, 4549, 4561, 4567, 4583, 4591, 4597, 4603, 4621, + 4637, 4639, 4643, 4649, 4651, 4657, 4663, 4673, 4679, 4691, 4703, 4721, + 4723, 4729, 4733, 4751, 4759, 4783, 4787, 4789, 4793, 4799, 4801, 4813, + 4817, 4831, 4861, 4871, 4877, 4889, 4903, 4909, 4919, 4931, 4933, 4937, + 4943, 4951, 4957, 4967, 4969, 4973, 4987, 4993, 4999, 5003, 5009, 5011, + 5021, 5023, 
5039, 5051, 5059, 5077, 5081, 5087, 5099, 5101, 5107, 5113, + 5119, 5147, 5153, 5167, 5171, 5179, 5189, 5197, 5209, 5227, 5231, 5233, + 5237, 5261, 5273, 5279, 5281, 5297, 5303, 5309, 5323, 5333, 5347, 5351, + 5381, 5387, 5393, 5399, 5407, 5413, 5417, 5419, 5431, 5437, 5441, 5443, + 5449, 5471, 5477, 5479, 5483, 5501, 5503, 5507, 5519, 5521, 5527, 5531, + 5557, 5563, 5569, 5573, 5581, 5591, 5623, 5639, 5641, 5647, 5651, 5653, + 5657, 5659, 5669, 5683, 5689, 5693, 5701, 5711, 5717, 5737, 5741, 5743, + 5749, 5779, 5783, 5791, 5801, 5807, 5813, 5821, 5827, 5839, 5843, 5849, + 5851, 5857, 5861, 5867, 5869, 5879, 5881, 5897, 5903, 5923, 5927, 5939, + 5953, 5981, 5987, 6007, 6011, 6029, 6037, 6043, 6047, 6053, 6067, 6073, + 6079, 6089, 6091, 6101, 6113, 6121, 6131, 6133, 6143, 6151, 6163, 6173, + 6197, 6199, 6203, 6211, 6217, 6221, 6229, 6247, 6257, 6263, 6269, 6271, + 6277, 6287, 6299, 6301, 6311, 6317, 6323, 6329, 6337, 6343, 6353, 6359, + 6361, 6367, 6373, 6379, 6389, 6397, 6421, 6427, 6449, 6451, 6469, 6473, + 6481, 6491, 6521, 6529, 6547, 6551, 6553, 6563, 6569, 6571, 6577, 6581, + 6599, 6607, 6619, 6637, 6653, 6659, 6661, 6673, 6679, 6689, 6691, 6701, + 6703, 6709, 6719, 6733, 6737, 6761, 6763, 6779, 6781, 6791, 6793, 6803, + 6823, 6827, 6829, 6833, 6841, 6857, 6863, 6869, 6871, 6883, 6899, 6907, + 6911, 6917, 6947, 6949, 6959, 6961, 6967, 6971, 6977, 6983, 6991, 6997, + 7001, 7013, 7019, 7027, 7039, 7043, 7057, 7069, 7079, 7103, 7109, 7121, + 7127, 7129, 7151, 7159, 7177, 7187, 7193, 7207, 7211, 7213, 7219, 7229, + 7237, 7243, 7247, 7253, 7283, 7297, 7307, 7309, 7321, 7331, 7333, 7349, + 7351, 7369, 7393, 7411, 7417, 7433, 7451, 7457, 7459, 7477, 7481, 7487, + 7489, 7499, 7507, 7517, 7523, 7529, 7537, 7541, 7547, 7549, 7559, 7561, + 7573, 7577, 7583, 7589, 7591, 7603, 7607, 7621, 7639, 7643, 7649, 7669, + 7673, 7681, 7687, 7691, 7699, 7703, 7717, 7723, 7727, 7741, 7753, 7757, + 7759, 7789, 7793, 7817, 7823, 7829, 7841, 7853, 7867, 7873, 7877, 7879, + 7883, 7901, 
7907, 7919, 7927, 7933, 7937, 7949, 7951, 7963, 7993, 8009, + 8011, 8017, 8039, 8053, 8059, 8069, 8081, 8087, 8089, 8093, 8101, 8111, + 8117, 8123, 8147, 8161, +}; + +// BN_prime_checks_for_size returns the number of Miller-Rabin iterations +// necessary for generating a 'bits'-bit candidate prime. +// +// +// This table is generated using the algorithm of FIPS PUB 186-5 +// Digital Signature Standard (DSS), section C.1, page 72. +// (https://doi.org/10.6028/NIST.FIPS.186-5). +// The following magma script was used to generate the output: +// securitybits:=125; +// k:=1024; +// for t:=1 to 65 do +// for M:=3 to Floor(2*Sqrt(k-1)-1) do +// S:=0; +// // Sum over m +// for m:=3 to M do +// s:=0; +// // Sum over j +// for j:=2 to m do +// s+:=(RealField(32)!2)^-(j+(k-1)/j); +// end for; +// S+:=2^(m-(m-1)*t)*s; +// end for; +// A:=2^(k-2-M*t); +// B:=8*(Pi(RealField(32))^2-6)/3*2^(k-2)*S; +// pkt:=2.00743*Log(2)*k*2^-k*(A+B); +// seclevel:=Floor(-Log(2,pkt)); +// if seclevel ge securitybits then +// printf "k: %5o, security: %o bits (t: %o, M: %o)\n",k,seclevel,t,M; +// break; +// end if; +// end for; +// if seclevel ge securitybits then break; end if; +// end for; +// +// It can be run online at: http://magma.maths.usyd.edu.au/calc +// And will output: +// k: 1024, security: 129 bits (t: 6, M: 23) +// k is the number of bits of the prime, securitybits is the level we want to +// reach. 
+// prime length | RSA key size | # MR tests | security level +// -------------+--------------|------------+--------------- +// (b) >= 6394 | >= 12788 | 3 | 256 bit +// (b) >= 3747 | >= 7494 | 3 | 192 bit +// (b) >= 1345 | >= 2690 | 4 | 128 bit +// (b) >= 1080 | >= 2160 | 5 | 128 bit +// (b) >= 852 | >= 1704 | 5 | 112 bit +// (b) >= 476 | >= 952 | 5 | 80 bit +// (b) >= 400 | >= 800 | 6 | 80 bit +// (b) >= 347 | >= 694 | 7 | 80 bit +// (b) >= 308 | >= 616 | 8 | 80 bit +// (b) >= 55 | >= 110 | 27 | 64 bit +// (b) >= 6 | >= 12 | 34 | 64 bit +static int BN_prime_checks_for_size(int bits) { + if (bits >= 3747) { + return 3; + } + if (bits >= 1345) { + return 4; + } + if (bits >= 476) { + return 5; + } + if (bits >= 400) { + return 6; + } + if (bits >= 347) { + return 7; + } + if (bits >= 308) { + return 8; + } + if (bits >= 55) { + return 27; + } + return 34; +} + +// num_trial_division_primes returns the number of primes to try with trial +// division before using more expensive checks. For larger numbers, the value +// of excluding a candidate with trial division is larger. +static size_t num_trial_division_primes(const BIGNUM *n) { + if (n->width * BN_BITS2 > 1024) { + return std::size(kPrimes); + } + return std::size(kPrimes) / 2; +} + +// BN_PRIME_CHECKS_BLINDED is the iteration count for blinding the constant-time +// primality test. See |BN_primality_test| for details. This number is selected +// so that, for a candidate N-bit RSA prime, picking |BN_PRIME_CHECKS_BLINDED| +// random N-bit numbers will have at least |BN_prime_checks_for_size(N)| values +// in range with high probability. +// +// The following Python script computes the blinding factor needed for the +// corresponding iteration count. +/* +import math + +# We choose candidate RSA primes between sqrt(2)/2 * 2^N and 2^N and select +# witnesses by generating random N-bit numbers. Thus the probability of +# selecting one in range is at least sqrt(2)/2. 
+p = math.sqrt(2) / 2 + +# Target around 2^-8 probability of the blinding being insufficient given that +# key generation is a one-time, noisy operation. +epsilon = 2**-8 + +def choose(a, b): + r = 1 + for i in xrange(b): + r *= a - i + r /= (i + 1) + return r + +def failure_rate(min_uniform, iterations): + """ Returns the probability that, for |iterations| candidate witnesses, fewer + than |min_uniform| of them will be uniform. """ + prob = 0.0 + for i in xrange(min_uniform): + prob += (choose(iterations, i) * + p**i * (1-p)**(iterations - i)) + return prob + +for min_uniform in (3, 4, 5, 6, 8, 13, 19, 28): + # Find the smallest number of iterations under the target failure rate. + iterations = min_uniform + while True: + prob = failure_rate(min_uniform, iterations) + if prob < epsilon: + print min_uniform, iterations, prob + break + iterations += 1 + +Output: + 3 9 0.00368894873911 + 4 11 0.00363319494662 + 5 13 0.00336215573898 + 6 15 0.00300145783158 + 8 19 0.00225214119331 + 13 27 0.00385610026955 + 19 38 0.0021410539126 + 28 52 0.00325405801769 + +16 iterations suffices for 400-bit primes and larger (6 uniform samples needed), +which is already well below the minimum acceptable key size for RSA. 
+*/ +#define BN_PRIME_CHECKS_BLINDED 16 + +static int probable_prime(BIGNUM *rnd, int bits); +static int probable_prime_dh(BIGNUM *rnd, int bits, const BIGNUM *add, + const BIGNUM *rem, BN_CTX *ctx); +static int probable_prime_dh_safe(BIGNUM *rnd, int bits, const BIGNUM *add, + const BIGNUM *rem, BN_CTX *ctx); + +BN_GENCB *BN_GENCB_new() { return New(); } + +void BN_GENCB_free(BN_GENCB *callback) { Delete(callback); } + +void BN_GENCB_set(BN_GENCB *callback, + int (*f)(int event, int n, struct bn_gencb_st *), void *arg) { + callback->callback = f; + callback->arg = arg; +} + +int BN_GENCB_call(BN_GENCB *callback, int event, int n) { + if (!callback) { + return 1; + } + + return callback->callback(event, n, callback); +} + +void *BN_GENCB_get_arg(const BN_GENCB *callback) { return callback->arg; } + +int BN_generate_prime_ex(BIGNUM *ret, int bits, int safe, const BIGNUM *add, + const BIGNUM *rem, BN_GENCB *cb) { + BIGNUM *t; + int i, j, c1 = 0; + int checks = BN_prime_checks_for_size(bits); + + if (bits < 2) { + // There are no prime numbers this small. + OPENSSL_PUT_ERROR(BN, BN_R_BITS_TOO_SMALL); + return 0; + } else if (bits == 2 && safe) { + // The smallest safe prime (7) is three bits. 
+ OPENSSL_PUT_ERROR(BN, BN_R_BITS_TOO_SMALL); + return 0; + } + + UniquePtr ctx(BN_CTX_new()); + if (ctx == nullptr) { + return 0; + } + BN_CTXScope scope(ctx.get()); + t = BN_CTX_get(ctx.get()); + if (!t) { + return 0; + } + +loop: + // make a random number and set the top and bottom bits + if (add == nullptr) { + if (!probable_prime(ret, bits)) { + return 0; + } + } else { + if (safe) { + if (!probable_prime_dh_safe(ret, bits, add, rem, ctx.get())) { + return 0; + } + } else { + if (!probable_prime_dh(ret, bits, add, rem, ctx.get())) { + return 0; + } + } + } + + if (!BN_GENCB_call(cb, BN_GENCB_GENERATED, c1++)) { + // aborted + return 0; + } + + if (!safe) { + i = BN_is_prime_fasttest_ex(ret, checks, ctx.get(), 0, cb); + if (i == -1) { + return 0; + } else if (i == 0) { + goto loop; + } + } else { + // for "safe prime" generation, check that (p-1)/2 is prime. Since a prime + // is odd, We just need to divide by 2 + if (!BN_rshift1(t, ret)) { + return 0; + } + + // Interleave |ret| and |t|'s primality tests to avoid paying the full + // iteration count on |ret| only to quickly discover |t| is composite. + // + // TODO(davidben): This doesn't quite work because an iteration count of 1 + // still runs the blinding mechanism. + for (i = 0; i < checks; i++) { + j = BN_is_prime_fasttest_ex(ret, 1, ctx.get(), 0, nullptr); + if (j == -1) { + return 0; + } else if (j == 0) { + goto loop; + } + + j = BN_is_prime_fasttest_ex(t, 1, ctx.get(), 0, nullptr); + if (j == -1) { + return 0; + } else if (j == 0) { + goto loop; + } + + if (!BN_GENCB_call(cb, BN_GENCB_PRIME_TEST, i)) { + return 0; + } + // We have a safe prime test pass + } + } + + // we have a prime :-) + return 1; +} + +static int bn_trial_division(uint16_t *out, const BIGNUM *bn) { + const size_t num_primes = num_trial_division_primes(bn); + for (size_t i = 1; i < num_primes; i++) { + // During RSA key generation, |bn| may be secret, but only if |bn| was + // prime, so it is safe to leak failed trial divisions. 
+ if (constant_time_declassify_int(bn_mod_u16_consttime(bn, kPrimes[i]) == + 0)) { + *out = kPrimes[i]; + return 1; + } + } + return 0; +} + +int bssl::bn_odd_number_is_obviously_composite(const BIGNUM *bn) { + uint16_t prime; + return bn_trial_division(&prime, bn) && !BN_is_word(bn, prime); +} + +int bssl::bn_miller_rabin_init(BN_MILLER_RABIN *miller_rabin, + const BN_MONT_CTX *mont, BN_CTX *ctx) { + // This function corresponds to steps 1 through 3 of FIPS 186-5, B.3.1. + const BIGNUM *w = &mont->N; + // Note we do not call |BN_CTX_start| in this function. We intentionally + // allocate values in the containing scope so they outlive this function. + miller_rabin->w1 = BN_CTX_get(ctx); + miller_rabin->m = BN_CTX_get(ctx); + miller_rabin->one_mont = BN_CTX_get(ctx); + miller_rabin->w1_mont = BN_CTX_get(ctx); + if (miller_rabin->w1 == nullptr || // + miller_rabin->m == nullptr || // + miller_rabin->one_mont == nullptr || // + miller_rabin->w1_mont == nullptr) { + return 0; + } + + // See FIPS 186-5, B.3.1, steps 1 through 3. + if (!bn_usub_consttime(miller_rabin->w1, w, BN_value_one())) { + return 0; + } + miller_rabin->a = BN_count_low_zero_bits(miller_rabin->w1); + if (!bn_rshift_secret_shift(miller_rabin->m, miller_rabin->w1, + miller_rabin->a, ctx)) { + return 0; + } + miller_rabin->w_bits = BN_num_bits(w); + + // Precompute some values in Montgomery form. + if (!bn_one_to_montgomery(miller_rabin->one_mont, mont, ctx) || + // w - 1 is -1 mod w, so we can compute it in the Montgomery domain, -R, + // with a subtraction. (|one_mont| cannot be zero.) + !bn_usub_consttime(miller_rabin->w1_mont, w, miller_rabin->one_mont)) { + return 0; + } + + return 1; +} + +int bssl::bn_miller_rabin_iteration(const BN_MILLER_RABIN *miller_rabin, + int *out_is_possibly_prime, const BIGNUM *b, + const BN_MONT_CTX *mont, BN_CTX *ctx) { + // This function corresponds to steps 4.3 through 4.5 of FIPS 186-5, B.3.1. + BN_CTXScope scope(ctx); + + // Step 4.3. 
We use Montgomery-encoding for better performance and to avoid + // timing leaks. + const BIGNUM *w = &mont->N; + BIGNUM *z = BN_CTX_get(ctx); + crypto_word_t is_possibly_prime; + if (z == nullptr || + !BN_mod_exp_mont_consttime(z, b, miller_rabin->m, w, ctx, mont) || + !BN_to_montgomery(z, z, mont, ctx)) { + return 0; + } + + // is_possibly_prime is all ones if we have determined |b| is not a composite + // witness for |w|. This is equivalent to going to step 4.7 in the original + // algorithm. To avoid timing leaks, we run the algorithm to the end for prime + // inputs. + is_possibly_prime = 0; + + // Step 4.4. If z = 1 or z = w-1, b is not a composite witness and w is still + // possibly prime. + is_possibly_prime = BN_equal_consttime(z, miller_rabin->one_mont) | + BN_equal_consttime(z, miller_rabin->w1_mont); + is_possibly_prime = 0 - is_possibly_prime; // Make it all zeros or all ones. + + // Step 4.5. + // + // To avoid leaking |a|, we run the loop to |w_bits| and mask off all + // iterations once |j| = |a|. + for (int j = 1; j < miller_rabin->w_bits; j++) { + if (constant_time_declassify_w(constant_time_eq_int(j, miller_rabin->a) & + ~is_possibly_prime)) { + // If the loop is done and we haven't seen z = 1 or z = w-1 yet, the + // value is composite and we can break in variable time. + break; + } + + // Step 4.5.1. + if (!BN_mod_mul_montgomery(z, z, z, mont, ctx)) { + return 0; + } + + // Step 4.5.2. If z = w-1 and the loop is not done, this is not a composite + // witness. + crypto_word_t z_is_w1_mont = BN_equal_consttime(z, miller_rabin->w1_mont); + z_is_w1_mont = 0 - z_is_w1_mont; // Make it all zeros or all ones. + is_possibly_prime |= z_is_w1_mont; // Go to step 4.7 if |z_is_w1_mont|. + + // Step 4.5.3. If z = 1 and the loop is not done, the previous value of z + // was not -1. There are no non-trivial square roots of 1 modulo a prime, so + // w is composite and we may exit in variable time. 
+ if (constant_time_declassify_w( + BN_equal_consttime(z, miller_rabin->one_mont) & + ~is_possibly_prime)) { + break; + } + } + + *out_is_possibly_prime = constant_time_declassify_w(is_possibly_prime) & 1; + return 1; +} + +int BN_primality_test(int *out_is_probably_prime, const BIGNUM *w, int checks, + BN_CTX *ctx, int do_trial_division, BN_GENCB *cb) { + // This function's secrecy and performance requirements come from RSA key + // generation. We generate RSA keys by selecting two large, secret primes with + // rejection sampling. + // + // We thus treat |w| as secret if turns out to be a large prime. However, if + // |w| is composite, we treat this and |w| itself as public. (Conversely, if + // |w| is prime, that it is prime is public. Only the value is secret.) This + // is fine for RSA key generation, but note it is important that we use + // rejection sampling, with each candidate prime chosen independently. This + // would not work for, e.g., an algorithm which looked for primes in + // consecutive integers. These assumptions allow us to discard composites + // quickly. We additionally treat |w| as public when it is a small prime to + // simplify trial decryption and some edge cases. + // + // One RSA key generation will call this function on exactly two primes and + // many more composites. The overall cost is a combination of several factors: + // + // 1. Checking if |w| is divisible by a small prime is much faster than + // learning it is composite by Miller-Rabin (see below for details on that + // cost). Trial division by p saves 1/p of Miller-Rabin calls, so this is + // worthwhile until p exceeds the ratio of the two costs. + // + // 2. For a random (i.e. non-adversarial) candidate large prime and candidate + // witness, the probability of false witness is very low. (This is why FIPS + // 186-5 only requires a few iterations.) Thus composites not discarded by + // trial decryption, in practice, cost one Miller-Rabin iteration. 
Only the + // two actual primes cost the full iteration count. + // + // 3. A Miller-Rabin iteration is a modular exponentiation plus |a| additional + // modular squares, where |a| is the number of factors of two in |w-1|. |a| + // is likely small (the distribution falls exponentially), but it is also + // potentially secret, so we loop up to its log(w) upper bound when |w| is + // prime. When |w| is composite, we break early, so only two calls pay this + // cost. (Note that all calls pay the modular exponentiation which is, + // itself, log(w) modular multiplications and squares.) + // + // 4. While there are only two prime calls, they multiplicatively pay the full + // costs of (2) and (3). + // + // 5. After the primes are chosen, RSA keys derive some values from the + // primes, but this cost is negligible in comparison. + + *out_is_probably_prime = 0; + + if (BN_cmp(w, BN_value_one()) <= 0) { + return 1; + } + + if (!BN_is_odd(w)) { + // The only even prime is two. + *out_is_probably_prime = BN_is_word(w, 2); + return 1; + } + + // Miller-Rabin does not work for three. + if (BN_is_word(w, 3)) { + *out_is_probably_prime = 1; + return 1; + } + + if (do_trial_division) { + // Perform additional trial division checks to discard small primes. + uint16_t prime; + if (bn_trial_division(&prime, w)) { + *out_is_probably_prime = BN_is_word(w, prime); + return 1; + } + if (!BN_GENCB_call(cb, BN_GENCB_PRIME_TEST, -1)) { + return 0; + } + } + + if (checks == BN_prime_checks_for_generation) { + checks = BN_prime_checks_for_size(BN_num_bits(w)); + } + + UniquePtr new_ctx; + if (ctx == nullptr) { + new_ctx.reset(BN_CTX_new()); + if (new_ctx == nullptr) { + return 0; + } + ctx = new_ctx.get(); + } + + // See B.3.1 from FIPS 186-5. + BN_CTXScope scope(ctx); + BIGNUM *b = BN_CTX_get(ctx); + UniquePtr mont(BN_MONT_CTX_new_consttime(w, ctx)); + BN_MILLER_RABIN miller_rabin; + crypto_word_t uniform_iterations = 0; + if (b == nullptr || mont == nullptr || + // Steps 1-3. 
+ !bn_miller_rabin_init(&miller_rabin, mont.get(), ctx)) { + return 0; + } + + // The following loop performs in inner iteration of the Miller-Rabin + // Primality test (Step 4). + // + // The algorithm as specified in FIPS 186-5 leaks information on |w|, the RSA + // private key. Instead, we run through each iteration unconditionally, + // performing modular multiplications, masking off any effects to behave + // equivalently to the specified algorithm. + // + // We also blind the number of values of |b| we try. Steps 4.1–4.2 say to + // discard out-of-range values. To avoid leaking information on |w|, we use + // |bn_rand_secret_range| which, rather than discarding bad values, adjusts + // them to be in range. Though not uniformly selected, these adjusted values + // are still usable as Miller-Rabin checks. + // + // Miller-Rabin is already probabilistic, so we could reach the desired + // confidence levels by just suitably increasing the iteration count. However, + // to align with FIPS 186-5, we use a more pessimal analysis: we do not count + // the non-uniform values towards the iteration count. As a result, this + // function is more complex and has more timing risk than necessary. + // + // We count both total iterations and uniform ones and iterate until we've + // reached at least |BN_PRIME_CHECKS_BLINDED| and |iterations|, respectively. + // If the latter is large enough, it will be the limiting factor with high + // probability and we won't leak information. + // + // Note this blinding does not impact most calls when picking primes because + // composites are rejected early. Only the two secret primes see extra work. + + // Using |constant_time_lt_w| seems to prevent the compiler from optimizing + // this into two jumps. 
+ for (int i = 1; constant_time_declassify_w( + (i <= BN_PRIME_CHECKS_BLINDED) | + constant_time_lt_w(uniform_iterations, checks)); + i++) { + // Step 4.1-4.2 + int is_uniform; + if (!bn_rand_secret_range(b, &is_uniform, 2, miller_rabin.w1)) { + return 0; + } + uniform_iterations += is_uniform; + + // Steps 4.3-4.5 + int is_possibly_prime = 0; + if (!bn_miller_rabin_iteration(&miller_rabin, &is_possibly_prime, b, + mont.get(), ctx)) { + return 0; + } + + if (!is_possibly_prime) { + // Step 4.6. We did not see z = w-1 before z = 1, so w must be composite. + *out_is_probably_prime = 0; + return 1; + } + + // Step 4.7 + if (!BN_GENCB_call(cb, BN_GENCB_PRIME_TEST, i - 1)) { + return 0; + } + } + + declassify_assert(uniform_iterations >= (crypto_word_t)checks); + *out_is_probably_prime = 1; + return 1; +} + +int BN_is_prime_ex(const BIGNUM *candidate, int checks, BN_CTX *ctx, + BN_GENCB *cb) { + return BN_is_prime_fasttest_ex(candidate, checks, ctx, 0, cb); +} + +int BN_is_prime_fasttest_ex(const BIGNUM *a, int checks, BN_CTX *ctx, + int do_trial_division, BN_GENCB *cb) { + int is_probably_prime; + if (!BN_primality_test(&is_probably_prime, a, checks, ctx, do_trial_division, + cb)) { + return -1; + } + return is_probably_prime; +} + +int BN_enhanced_miller_rabin_primality_test( + enum bn_primality_result_t *out_result, const BIGNUM *w, int checks, + BN_CTX *ctx, BN_GENCB *cb) { + // Enhanced Miller-Rabin is only valid on odd integers greater than 3. + if (!BN_is_odd(w) || BN_cmp_word(w, 3) <= 0) { + OPENSSL_PUT_ERROR(BN, BN_R_INVALID_INPUT); + return 0; + } + + if (checks == BN_prime_checks_for_generation) { + checks = BN_prime_checks_for_size(BN_num_bits(w)); + } + + BN_CTXScope scope(ctx); + BIGNUM *w1 = BN_CTX_get(ctx); + if (w1 == nullptr || !BN_copy(w1, w) || !BN_sub_word(w1, 1)) { + return 0; + } + + // Write w1 as m*2^a (Steps 1 and 2). 
+ int a = 0; + while (!BN_is_bit_set(w1, a)) { + a++; + } + BIGNUM *m = BN_CTX_get(ctx); + if (m == nullptr || !BN_rshift(m, w1, a)) { + return 0; + } + + BIGNUM *b = BN_CTX_get(ctx); + BIGNUM *g = BN_CTX_get(ctx); + BIGNUM *z = BN_CTX_get(ctx); + BIGNUM *x = BN_CTX_get(ctx); + BIGNUM *x1 = BN_CTX_get(ctx); + if (b == nullptr || g == nullptr || z == nullptr || x == nullptr || + x1 == nullptr) { + return 0; + } + + // Montgomery setup for computations mod w + UniquePtr mont(BN_MONT_CTX_new_for_modulus(w, ctx)); + if (mont == nullptr) { + return 0; + } + + // The following loop performs in inner iteration of the Enhanced Miller-Rabin + // Primality test (Step 4). + for (int i = 1; i <= checks; i++) { + // Step 4.1-4.2 + if (!BN_rand_range_ex(b, 2, w1)) { + return 0; + } + + // Step 4.3-4.4 + if (!BN_gcd(g, b, w, ctx)) { + return 0; + } + if (BN_cmp_word(g, 1) > 0) { + *out_result = bn_composite; + return 1; + } + + // Step 4.5 + if (!BN_mod_exp_mont(z, b, m, w, ctx, mont.get())) { + return 0; + } + + // Step 4.6 + if (BN_is_one(z) || BN_cmp(z, w1) == 0) { + goto loop; + } + + // Step 4.7 + for (int j = 1; j < a; j++) { + if (!BN_copy(x, z) || !BN_mod_mul(z, x, x, w, ctx)) { + return 0; + } + if (BN_cmp(z, w1) == 0) { + goto loop; + } + if (BN_is_one(z)) { + goto composite; + } + } + + // Step 4.8-4.9 + if (!BN_copy(x, z) || !BN_mod_mul(z, x, x, w, ctx)) { + return 0; + } + + // Step 4.10-4.11 + if (!BN_is_one(z) && !BN_copy(x, z)) { + return 0; + } + + composite: + // Step 4.12-4.14 + if (!BN_copy(x1, x) || !BN_sub_word(x1, 1) || !BN_gcd(g, x1, w, ctx)) { + return 0; + } + if (BN_cmp_word(g, 1) > 0) { + *out_result = bn_composite; + } else { + *out_result = bn_non_prime_power_composite; + } + + return 1; + + loop: + // Step 4.15 + if (!BN_GENCB_call(cb, BN_GENCB_PRIME_TEST, i - 1)) { + return 0; + } + } + + *out_result = bn_probably_prime; + return 1; +} + +static int probable_prime(BIGNUM *rnd, int bits) { + do { + if (!BN_rand(rnd, bits, BN_RAND_TOP_TWO, 
BN_RAND_BOTTOM_ODD)) { + return 0; + } + } while (bn_odd_number_is_obviously_composite(rnd)); + return 1; +} + +static int probable_prime_dh(BIGNUM *rnd, int bits, const BIGNUM *add, + const BIGNUM *rem, BN_CTX *ctx) { + BN_CTXScope scope(ctx); + BIGNUM *t1; + if ((t1 = BN_CTX_get(ctx)) == nullptr) { + return 0; + } + + if (!BN_rand(rnd, bits, BN_RAND_TOP_ONE, BN_RAND_BOTTOM_ODD)) { + return 0; + } + + // we need ((rnd-rem) % add) == 0 + if (!BN_mod(t1, rnd, add, ctx)) { + return 0; + } + if (!BN_sub(rnd, rnd, t1)) { + return 0; + } + if (rem == nullptr) { + if (!BN_add_word(rnd, 1)) { + return 0; + } + } else { + if (!BN_add(rnd, rnd, rem)) { + return 0; + } + } + // we now have a random number 'rand' to test. + + size_t num_primes = num_trial_division_primes(rnd); +loop: + for (size_t i = 1; i < num_primes; i++) { + // check that rnd is a prime + if (bn_mod_u16_consttime(rnd, kPrimes[i]) <= 1) { + if (!BN_add(rnd, rnd, add)) { + return 0; + } + goto loop; + } + } + + return 1; +} + +static int probable_prime_dh_safe(BIGNUM *p, int bits, const BIGNUM *padd, + const BIGNUM *rem, BN_CTX *ctx) { + bits--; + BN_CTXScope scope(ctx); + BIGNUM *t1 = BN_CTX_get(ctx); + BIGNUM *q = BN_CTX_get(ctx); + BIGNUM *qadd = BN_CTX_get(ctx); + if (qadd == nullptr) { + return 0; + } + + if (!BN_rshift1(qadd, padd)) { + return 0; + } + + if (!BN_rand(q, bits, BN_RAND_TOP_ONE, BN_RAND_BOTTOM_ODD)) { + return 0; + } + + // we need ((rnd-rem) % add) == 0 + if (!BN_mod(t1, q, qadd, ctx)) { + return 0; + } + + if (!BN_sub(q, q, t1)) { + return 0; + } + + if (rem == nullptr) { + if (!BN_add_word(q, 1)) { + return 0; + } + } else { + if (!BN_rshift1(t1, rem)) { + return 0; + } + if (!BN_add(q, q, t1)) { + return 0; + } + } + + // we now have a random number 'rand' to test. 
+ if (!BN_lshift1(p, q)) { + return 0; + } + if (!BN_add_word(p, 1)) { + return 0; + } + + size_t num_primes = num_trial_division_primes(p); +loop: + for (size_t i = 1; i < num_primes; i++) { + // check that p and q are prime + // check that for p and q + // gcd(p-1,primes) == 1 (except for 2) + if (bn_mod_u16_consttime(p, kPrimes[i]) == 0 || + bn_mod_u16_consttime(q, kPrimes[i]) == 0) { + if (!BN_add(p, p, padd)) { + return 0; + } + if (!BN_add(q, q, qadd)) { + return 0; + } + goto loop; + } + } + + return 1; +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/random.c b/third_party/boringssl/src/crypto/fipsmodule/bn/random.c deleted file mode 100644 index 4966778e..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/bn/random.c +++ /dev/null @@ -1,349 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. 
this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] - */ -/* ==================================================================== - * Copyright (c) 1998-2001 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). */ - -#include - -#include -#include -#include - -#include -#include - -#include "../../internal.h" -#include "../rand/internal.h" -#include "../service_indicator/internal.h" -#include "internal.h" - - -int BN_rand(BIGNUM *rnd, int bits, int top, int bottom) { - if (rnd == NULL) { - return 0; - } - - if (top != BN_RAND_TOP_ANY && top != BN_RAND_TOP_ONE && - top != BN_RAND_TOP_TWO) { - OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); - return 0; - } - - if (bottom != BN_RAND_BOTTOM_ANY && bottom != BN_RAND_BOTTOM_ODD) { - OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); - return 0; - } - - if (bits == 0) { - BN_zero(rnd); - return 1; - } - - if (bits > INT_MAX - (BN_BITS2 - 1)) { - OPENSSL_PUT_ERROR(BN, BN_R_BIGNUM_TOO_LONG); - return 0; - } - - int words = (bits + BN_BITS2 - 1) / BN_BITS2; - int bit = (bits - 1) % BN_BITS2; - const BN_ULONG kOne = 1; - const BN_ULONG kThree = 3; - BN_ULONG mask = bit < BN_BITS2 - 1 ? 
(kOne << (bit + 1)) - 1 : BN_MASK2; - if (!bn_wexpand(rnd, words)) { - return 0; - } - - FIPS_service_indicator_lock_state(); - RAND_bytes((uint8_t *)rnd->d, words * sizeof(BN_ULONG)); - FIPS_service_indicator_unlock_state(); - - rnd->d[words - 1] &= mask; - if (top != BN_RAND_TOP_ANY) { - if (top == BN_RAND_TOP_TWO && bits > 1) { - if (bit == 0) { - rnd->d[words - 1] |= 1; - rnd->d[words - 2] |= kOne << (BN_BITS2 - 1); - } else { - rnd->d[words - 1] |= kThree << (bit - 1); - } - } else { - rnd->d[words - 1] |= kOne << bit; - } - } - if (bottom == BN_RAND_BOTTOM_ODD) { - rnd->d[0] |= 1; - } - - rnd->neg = 0; - rnd->width = words; - return 1; -} - -int BN_pseudo_rand(BIGNUM *rnd, int bits, int top, int bottom) { - return BN_rand(rnd, bits, top, bottom); -} - -// bn_less_than_word_mask returns a mask of all ones if the number represented -// by |len| words at |a| is less than |b| and zero otherwise. It performs this -// computation in time independent of the value of |a|. |b| is assumed public. -static crypto_word_t bn_less_than_word_mask(const BN_ULONG *a, size_t len, - BN_ULONG b) { - if (b == 0) { - return CONSTTIME_FALSE_W; - } - if (len == 0) { - return CONSTTIME_TRUE_W; - } - - // |a| < |b| iff a[1..len-1] are all zero and a[0] < b. - static_assert(sizeof(BN_ULONG) <= sizeof(crypto_word_t), - "crypto_word_t is too small"); - crypto_word_t mask = 0; - for (size_t i = 1; i < len; i++) { - mask |= a[i]; - } - // |mask| is now zero iff a[1..len-1] are all zero. 
- mask = constant_time_is_zero_w(mask); - mask &= constant_time_lt_w(a[0], b); - return mask; -} - -int bn_in_range_words(const BN_ULONG *a, BN_ULONG min_inclusive, - const BN_ULONG *max_exclusive, size_t len) { - crypto_word_t mask = ~bn_less_than_word_mask(a, len, min_inclusive); - return mask & bn_less_than_words(a, max_exclusive, len); -} - -static int bn_range_to_mask(size_t *out_words, BN_ULONG *out_mask, - size_t min_inclusive, const BN_ULONG *max_exclusive, - size_t len) { - // The magnitude of |max_exclusive| is assumed public. - size_t words = len; - while (words > 0 && max_exclusive[words - 1] == 0) { - words--; - } - if (words == 0 || - (words == 1 && max_exclusive[0] <= min_inclusive)) { - OPENSSL_PUT_ERROR(BN, BN_R_INVALID_RANGE); - return 0; - } - BN_ULONG mask = max_exclusive[words - 1]; - // This sets all bits in |mask| below the most significant bit. - mask |= mask >> 1; - mask |= mask >> 2; - mask |= mask >> 4; - mask |= mask >> 8; - mask |= mask >> 16; -#if defined(OPENSSL_64_BIT) - mask |= mask >> 32; -#endif - - *out_words = words; - *out_mask = mask; - return 1; -} - -int bn_rand_range_words(BN_ULONG *out, BN_ULONG min_inclusive, - const BN_ULONG *max_exclusive, size_t len, - const uint8_t additional_data[32]) { - // This function implements the equivalent of steps 4 through 7 of FIPS 186-4 - // appendices B.4.2 and B.5.2. When called in those contexts, |max_exclusive| - // is n and |min_inclusive| is one. - - // Compute the bit length of |max_exclusive| (step 1), in terms of a number of - // |words| worth of entropy to fill and a mask of bits to clear in the top - // word. - size_t words; - BN_ULONG mask; - if (!bn_range_to_mask(&words, &mask, min_inclusive, max_exclusive, len)) { - return 0; - } - - // Fill any unused words with zero. - OPENSSL_memset(out + words, 0, (len - words) * sizeof(BN_ULONG)); - - unsigned count = 100; - do { - if (!--count) { - OPENSSL_PUT_ERROR(BN, BN_R_TOO_MANY_ITERATIONS); - return 0; - } - - // Steps 4 and 5. 
Use |words| and |mask| together to obtain a string of N - // bits, where N is the bit length of |max_exclusive|. - FIPS_service_indicator_lock_state(); - RAND_bytes_with_additional_data((uint8_t *)out, words * sizeof(BN_ULONG), - additional_data); - FIPS_service_indicator_unlock_state(); - out[words - 1] &= mask; - - // If out >= max_exclusive or out < min_inclusive, retry. This implements - // the equivalent of steps 6 and 7 without leaking the value of |out|. - } while (!bn_in_range_words(out, min_inclusive, max_exclusive, words)); - return 1; -} - -int BN_rand_range_ex(BIGNUM *r, BN_ULONG min_inclusive, - const BIGNUM *max_exclusive) { - static const uint8_t kDefaultAdditionalData[32] = {0}; - if (!bn_wexpand(r, max_exclusive->width) || - !bn_rand_range_words(r->d, min_inclusive, max_exclusive->d, - max_exclusive->width, kDefaultAdditionalData)) { - return 0; - } - - r->neg = 0; - r->width = max_exclusive->width; - return 1; -} - -int bn_rand_secret_range(BIGNUM *r, int *out_is_uniform, BN_ULONG min_inclusive, - const BIGNUM *max_exclusive) { - size_t words; - BN_ULONG mask; - if (!bn_range_to_mask(&words, &mask, min_inclusive, max_exclusive->d, - max_exclusive->width) || - !bn_wexpand(r, words)) { - return 0; - } - - assert(words > 0); - assert(mask != 0); - // The range must be large enough for bit tricks to fix invalid values. - if (words == 1 && min_inclusive > mask >> 1) { - OPENSSL_PUT_ERROR(BN, BN_R_INVALID_RANGE); - return 0; - } - - // Select a uniform random number with num_bits(max_exclusive) bits. - FIPS_service_indicator_lock_state(); - RAND_bytes((uint8_t *)r->d, words * sizeof(BN_ULONG)); - FIPS_service_indicator_unlock_state(); - r->d[words - 1] &= mask; - - // Check, in constant-time, if the value is in range. - *out_is_uniform = - bn_in_range_words(r->d, min_inclusive, max_exclusive->d, words); - crypto_word_t in_range = *out_is_uniform; - in_range = 0 - in_range; - - // If the value is not in range, force it to be in range. 
- r->d[0] |= constant_time_select_w(in_range, 0, min_inclusive); - r->d[words - 1] &= constant_time_select_w(in_range, BN_MASK2, mask >> 1); - assert(bn_in_range_words(r->d, min_inclusive, max_exclusive->d, words)); - - r->neg = 0; - r->width = words; - return 1; -} - -int BN_rand_range(BIGNUM *r, const BIGNUM *range) { - return BN_rand_range_ex(r, 0, range); -} - -int BN_pseudo_rand_range(BIGNUM *r, const BIGNUM *range) { - return BN_rand_range(r, range); -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/random.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/bn/random.cc.inc new file mode 100644 index 00000000..591cb8fa --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/bn/random.cc.inc @@ -0,0 +1,261 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include +#include + +#include + +#include "../../internal.h" +#include "../bcm_interface.h" +#include "../service_indicator/internal.h" +#include "internal.h" + + +using namespace bssl; + +int BN_rand(BIGNUM *rnd, int bits, int top, int bottom) { + if (rnd == nullptr) { + return 0; + } + + if (top != BN_RAND_TOP_ANY && top != BN_RAND_TOP_ONE && + top != BN_RAND_TOP_TWO) { + OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); + return 0; + } + + if (bottom != BN_RAND_BOTTOM_ANY && bottom != BN_RAND_BOTTOM_ODD) { + OPENSSL_PUT_ERROR(BN, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); + return 0; + } + + if (bits == 0) { + BN_zero(rnd); + return 1; + } + + if (bits > INT_MAX - (BN_BITS2 - 1)) { + OPENSSL_PUT_ERROR(BN, BN_R_BIGNUM_TOO_LONG); + return 0; + } + + int words = (bits + BN_BITS2 - 1) / BN_BITS2; + int bit = (bits - 1) % BN_BITS2; + const BN_ULONG kOne = 1; + const BN_ULONG kThree = 3; + BN_ULONG mask = bit < BN_BITS2 - 1 ? (kOne << (bit + 1)) - 1 : BN_MASK2; + if (!bn_wexpand(rnd, words)) { + return 0; + } + + FIPS_service_indicator_lock_state(); + BCM_rand_bytes((uint8_t *)rnd->d, words * sizeof(BN_ULONG)); + FIPS_service_indicator_unlock_state(); + + rnd->d[words - 1] &= mask; + if (top != BN_RAND_TOP_ANY) { + if (top == BN_RAND_TOP_TWO && bits > 1) { + if (bit == 0) { + rnd->d[words - 1] |= 1; + rnd->d[words - 2] |= kOne << (BN_BITS2 - 1); + } else { + rnd->d[words - 1] |= kThree << (bit - 1); + } + } else { + rnd->d[words - 1] |= kOne << bit; + } + } + if (bottom == BN_RAND_BOTTOM_ODD) { + rnd->d[0] |= 1; + } + + rnd->neg = 0; + rnd->width = words; + return 1; +} + +int BN_pseudo_rand(BIGNUM *rnd, int bits, int top, int bottom) { + return BN_rand(rnd, bits, top, bottom); +} + +// bn_less_than_word_mask returns a mask of all ones if the number represented +// by |len| words at |a| is less than |b| and zero otherwise. It performs this +// computation in time independent of the value of |a|. |b| is assumed public. 
+static crypto_word_t bn_less_than_word_mask(const BN_ULONG *a, size_t len, + BN_ULONG b) { + if (b == 0) { + return CONSTTIME_FALSE_W; + } + if (len == 0) { + return CONSTTIME_TRUE_W; + } + + // |a| < |b| iff a[1..len-1] are all zero and a[0] < b. + static_assert(sizeof(BN_ULONG) <= sizeof(crypto_word_t), + "crypto_word_t is too small"); + crypto_word_t mask = 0; + for (size_t i = 1; i < len; i++) { + mask |= a[i]; + } + // |mask| is now zero iff a[1..len-1] are all zero. + mask = constant_time_is_zero_w(mask); + mask &= constant_time_lt_w(a[0], b); + return mask; +} + +int bssl::bn_in_range_words(const BN_ULONG *a, BN_ULONG min_inclusive, + const BN_ULONG *max_exclusive, size_t len) { + crypto_word_t mask = ~bn_less_than_word_mask(a, len, min_inclusive); + return mask & bn_less_than_words(a, max_exclusive, len); +} + +static int bn_range_to_mask(size_t *out_words, BN_ULONG *out_mask, + size_t min_inclusive, const BN_ULONG *max_exclusive, + size_t len) { + // The magnitude of |max_exclusive| is assumed public. + size_t words = len; + while (words > 0 && max_exclusive[words - 1] == 0) { + words--; + } + if (words == 0 || (words == 1 && max_exclusive[0] <= min_inclusive)) { + OPENSSL_PUT_ERROR(BN, BN_R_INVALID_RANGE); + return 0; + } + BN_ULONG mask = max_exclusive[words - 1]; + // This sets all bits in |mask| below the most significant bit. + mask |= mask >> 1; + mask |= mask >> 2; + mask |= mask >> 4; + mask |= mask >> 8; + mask |= mask >> 16; +#if defined(OPENSSL_64_BIT) + mask |= mask >> 32; +#endif + + *out_words = words; + *out_mask = mask; + return 1; +} + +int bssl::bn_rand_range_words(BN_ULONG *out, BN_ULONG min_inclusive, + const BN_ULONG *max_exclusive, size_t len, + const uint8_t additional_data[32]) { + // This function implements the equivalent of steps 1 through 4 of FIPS 186-5 + // appendices A.2.2 and A.3.2, repeating the process on failure. When called + // in those contexts, |max_exclusive| is n and |min_inclusive| is one. 
+ + // Compute the bit length of |max_exclusive| (step 1), in terms of a number of + // |words| worth of entropy to fill and a mask of bits to clear in the top + // word. + size_t words; + BN_ULONG mask; + if (!bn_range_to_mask(&words, &mask, min_inclusive, max_exclusive, len)) { + return 0; + } + + // Fill any unused words with zero. + OPENSSL_memset(out + words, 0, (len - words) * sizeof(BN_ULONG)); + + unsigned count = 100; + do { + if (!--count) { + OPENSSL_PUT_ERROR(BN, BN_R_TOO_MANY_ITERATIONS); + return 0; + } + + // Use |words| and |mask| together to obtain a string of N bits, where N is + // the bit length of |max_exclusive|. + FIPS_service_indicator_lock_state(); + BCM_rand_bytes_with_additional_data( + (uint8_t *)out, words * sizeof(BN_ULONG), additional_data); + FIPS_service_indicator_unlock_state(); + out[words - 1] &= mask; + + // If out >= max_exclusive or out < min_inclusive, retry. The result of this + // comparison may be treated as public. It only reveals how many attempts + // were needed before we found a value in range. This is independent of the + // final secret output, and has a distribution that depends only on + // |min_inclusive| and |max_exclusive|, both of which are public. 
+ } while (!constant_time_declassify_int( + bn_in_range_words(out, min_inclusive, max_exclusive, words))); + return 1; +} + +int BN_rand_range_ex(BIGNUM *r, BN_ULONG min_inclusive, + const BIGNUM *max_exclusive) { + static const uint8_t kDefaultAdditionalData[32] = {0}; + if (!bn_wexpand(r, max_exclusive->width) || + !bn_rand_range_words(r->d, min_inclusive, max_exclusive->d, + max_exclusive->width, kDefaultAdditionalData)) { + return 0; + } + + r->neg = 0; + r->width = max_exclusive->width; + return 1; +} + +int bssl::bn_rand_secret_range(BIGNUM *r, int *out_is_uniform, + BN_ULONG min_inclusive, + const BIGNUM *max_exclusive) { + size_t words; + BN_ULONG mask; + if (!bn_range_to_mask(&words, &mask, min_inclusive, max_exclusive->d, + max_exclusive->width) || + !bn_wexpand(r, words)) { + return 0; + } + + assert(words > 0); + assert(mask != 0); + // The range must be large enough for bit tricks to fix invalid values. + if (words == 1 && min_inclusive > mask >> 1) { + OPENSSL_PUT_ERROR(BN, BN_R_INVALID_RANGE); + return 0; + } + + // Select a uniform random number with num_bits(max_exclusive) bits. + FIPS_service_indicator_lock_state(); + BCM_rand_bytes((uint8_t *)r->d, words * sizeof(BN_ULONG)); + FIPS_service_indicator_unlock_state(); + r->d[words - 1] &= mask; + + // Check, in constant-time, if the value is in range. + *out_is_uniform = + bn_in_range_words(r->d, min_inclusive, max_exclusive->d, words); + crypto_word_t in_range = *out_is_uniform; + in_range = 0 - in_range; + + // If the value is not in range, force it to be in range. 
+ r->d[0] |= constant_time_select_w(in_range, 0, min_inclusive); + r->d[words - 1] &= constant_time_select_w(in_range, BN_MASK2, mask >> 1); + declassify_assert( + bn_in_range_words(r->d, min_inclusive, max_exclusive->d, words)); + + r->neg = 0; + r->width = (int)words; + return 1; +} + +int BN_rand_range(BIGNUM *r, const BIGNUM *range) { + return BN_rand_range_ex(r, 0, range); +} + +int BN_pseudo_rand_range(BIGNUM *r, const BIGNUM *range) { + return BN_rand_range(r, range); +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/rsaz_exp.c b/third_party/boringssl/src/crypto/fipsmodule/bn/rsaz_exp.c deleted file mode 100644 index 7b455b55..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/bn/rsaz_exp.c +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved. - * Copyright (c) 2012, Intel Corporation. All Rights Reserved. - * - * Licensed under the OpenSSL license (the "License"). You may not use - * this file except in compliance with the License. You can obtain a copy - * in the file LICENSE in the source distribution or at - * https://www.openssl.org/source/license.html - * - * Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1) - * (1) Intel Corporation, Israel Development Center, Haifa, Israel - * (2) University of Haifa, Israel - */ - -#include "rsaz_exp.h" - -#if defined(RSAZ_ENABLED) - -#include - -#include - -#include "internal.h" -#include "../../internal.h" - - -// one is 1 in RSAZ's representation. -alignas(64) static const BN_ULONG one[40] = { - 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; -// two80 is 2^80 in RSAZ's representation. Note RSAZ uses base 2^29, so this is -// 2^(29*2 + 22) = 2^80, not 2^(64*2 + 22). 
-alignas(64) static const BN_ULONG two80[40] = { - 0, 0, 1 << 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; - -void RSAZ_1024_mod_exp_avx2(BN_ULONG result_norm[16], - const BN_ULONG base_norm[16], - const BN_ULONG exponent[16], - const BN_ULONG m_norm[16], const BN_ULONG RR[16], - BN_ULONG k0, - BN_ULONG storage[MOD_EXP_CTIME_STORAGE_LEN]) { - static_assert(MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH % 64 == 0, - "MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH is too small"); - assert((uintptr_t)storage % 64 == 0); - - BN_ULONG *a_inv, *m, *result, *table_s = storage + 40 * 3, *R2 = table_s; - // Note |R2| aliases |table_s|. - if (((((uintptr_t)storage & 4095) + 320) >> 12) != 0) { - result = storage; - a_inv = storage + 40; - m = storage + 40 * 2; // should not cross page - } else { - m = storage; // should not cross page - result = storage + 40; - a_inv = storage + 40 * 2; - } - - rsaz_1024_norm2red_avx2(m, m_norm); - rsaz_1024_norm2red_avx2(a_inv, base_norm); - rsaz_1024_norm2red_avx2(R2, RR); - - // Convert |R2| from the usual radix, giving R = 2^1024, to RSAZ's radix, - // giving R = 2^(36*29) = 2^1044. 
- rsaz_1024_mul_avx2(R2, R2, R2, m, k0); - // R2 = 2^2048 * 2^2048 / 2^1044 = 2^3052 - rsaz_1024_mul_avx2(R2, R2, two80, m, k0); - // R2 = 2^3052 * 2^80 / 2^1044 = 2^2088 = (2^1044)^2 - - // table[0] = 1 - // table[1] = a_inv^1 - rsaz_1024_mul_avx2(result, R2, one, m, k0); - rsaz_1024_mul_avx2(a_inv, a_inv, R2, m, k0); - rsaz_1024_scatter5_avx2(table_s, result, 0); - rsaz_1024_scatter5_avx2(table_s, a_inv, 1); - // table[2] = a_inv^2 - rsaz_1024_sqr_avx2(result, a_inv, m, k0, 1); - rsaz_1024_scatter5_avx2(table_s, result, 2); - // table[4] = a_inv^4 - rsaz_1024_sqr_avx2(result, result, m, k0, 1); - rsaz_1024_scatter5_avx2(table_s, result, 4); - // table[8] = a_inv^8 - rsaz_1024_sqr_avx2(result, result, m, k0, 1); - rsaz_1024_scatter5_avx2(table_s, result, 8); - // table[16] = a_inv^16 - rsaz_1024_sqr_avx2(result, result, m, k0, 1); - rsaz_1024_scatter5_avx2(table_s, result, 16); - for (int i = 3; i < 32; i += 2) { - // table[i] = table[i-1] * a_inv = a_inv^i - rsaz_1024_gather5_avx2(result, table_s, i - 1); - rsaz_1024_mul_avx2(result, result, a_inv, m, k0); - rsaz_1024_scatter5_avx2(table_s, result, i); - for (int j = 2 * i; j < 32; j *= 2) { - // table[j] = table[j/2]^2 = a_inv^j - rsaz_1024_sqr_avx2(result, result, m, k0, 1); - rsaz_1024_scatter5_avx2(table_s, result, j); - } - } - - // Load the first window. - const uint8_t *p_str = (const uint8_t *)exponent; - int wvalue = p_str[127] >> 3; - rsaz_1024_gather5_avx2(result, table_s, wvalue); - - int index = 1014; - while (index > -1) { // Loop for the remaining 127 windows. - rsaz_1024_sqr_avx2(result, result, m, k0, 5); - - uint16_t wvalue_16; - memcpy(&wvalue_16, &p_str[index / 8], sizeof(wvalue_16)); - wvalue = wvalue_16; - wvalue = (wvalue >> (index % 8)) & 31; - index -= 5; - - rsaz_1024_gather5_avx2(a_inv, table_s, wvalue); // Borrow |a_inv|. - rsaz_1024_mul_avx2(result, result, a_inv, m, k0); - } - - // Square four times. 
- rsaz_1024_sqr_avx2(result, result, m, k0, 4); - - wvalue = p_str[0] & 15; - - rsaz_1024_gather5_avx2(a_inv, table_s, wvalue); // Borrow |a_inv|. - rsaz_1024_mul_avx2(result, result, a_inv, m, k0); - - // Convert from Montgomery. - rsaz_1024_mul_avx2(result, result, one, m, k0); - - rsaz_1024_red2norm_avx2(result_norm, result); - BN_ULONG scratch[16]; - bn_reduce_once_in_place(result_norm, /*carry=*/0, m_norm, scratch, 16); - - OPENSSL_cleanse(storage, MOD_EXP_CTIME_STORAGE_LEN * sizeof(BN_ULONG)); -} - -#endif // RSAZ_ENABLED diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/rsaz_exp.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/bn/rsaz_exp.cc.inc new file mode 100644 index 00000000..105daae3 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/bn/rsaz_exp.cc.inc @@ -0,0 +1,144 @@ +// Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved. +// Copyright (c) 2012, Intel Corporation. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1) +// (1) Intel Corporation, Israel Development Center, Haifa, Israel +// (2) University of Haifa, Israel + +#include "rsaz_exp.h" + +#if defined(RSAZ_ENABLED) + +#include + +#include + +#include "internal.h" +#include "../../internal.h" + + +using namespace bssl; + +// rsaz_one is 1 in RSAZ's representation. 
+alignas(64) static const BN_ULONG rsaz_one[40] = { + 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; +// rsaz_two80 is 2^80 in RSAZ's representation. Note RSAZ uses base 2^29, so this is +// 2^(29*2 + 22) = 2^80, not 2^(64*2 + 22). +alignas(64) static const BN_ULONG rsaz_two80[40] = { + 0, 0, 1 << 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + +void bssl::RSAZ_1024_mod_exp_avx2(BN_ULONG result_norm[16], + const BN_ULONG base_norm[16], + const BN_ULONG exponent[16], + const BN_ULONG m_norm[16], + const BN_ULONG RR[16], BN_ULONG k0, + BN_ULONG storage[MOD_EXP_CTIME_STORAGE_LEN]) { + static_assert(MOD_EXP_CTIME_ALIGN % 64 == 0, + "MOD_EXP_CTIME_ALIGN is too small"); + assert((uintptr_t)storage % 64 == 0); + + BN_ULONG *a_inv, *m, *result, *table_s = storage + 40 * 3, *R2 = table_s; + // Note |R2| aliases |table_s|. + if (((((uintptr_t)storage & 4095) + 320) >> 12) != 0) { + result = storage; + a_inv = storage + 40; + m = storage + 40 * 2; // should not cross page + } else { + m = storage; // should not cross page + result = storage + 40; + a_inv = storage + 40 * 2; + } + + rsaz_1024_norm2red_avx2(m, m_norm); + rsaz_1024_norm2red_avx2(a_inv, base_norm); + rsaz_1024_norm2red_avx2(R2, RR); + + // Convert |R2| from the usual radix, giving R = 2^1024, to RSAZ's radix, + // giving R = 2^(36*29) = 2^1044. 
+ rsaz_1024_mul_avx2(R2, R2, R2, m, k0); + // R2 = 2^2048 * 2^2048 / 2^1044 = 2^3052 + rsaz_1024_mul_avx2(R2, R2, rsaz_two80, m, k0); + // R2 = 2^3052 * 2^80 / 2^1044 = 2^2088 = (2^1044)^2 + + // table[0] = 1 + // table[1] = a_inv^1 + rsaz_1024_mul_avx2(result, R2, rsaz_one, m, k0); + rsaz_1024_mul_avx2(a_inv, a_inv, R2, m, k0); + rsaz_1024_scatter5_avx2(table_s, result, 0); + rsaz_1024_scatter5_avx2(table_s, a_inv, 1); + // table[2] = a_inv^2 + rsaz_1024_sqr_avx2(result, a_inv, m, k0, 1); + rsaz_1024_scatter5_avx2(table_s, result, 2); + // table[4] = a_inv^4 + rsaz_1024_sqr_avx2(result, result, m, k0, 1); + rsaz_1024_scatter5_avx2(table_s, result, 4); + // table[8] = a_inv^8 + rsaz_1024_sqr_avx2(result, result, m, k0, 1); + rsaz_1024_scatter5_avx2(table_s, result, 8); + // table[16] = a_inv^16 + rsaz_1024_sqr_avx2(result, result, m, k0, 1); + rsaz_1024_scatter5_avx2(table_s, result, 16); + for (int i = 3; i < 32; i += 2) { + // table[i] = table[i-1] * a_inv = a_inv^i + rsaz_1024_gather5_avx2(result, table_s, i - 1); + rsaz_1024_mul_avx2(result, result, a_inv, m, k0); + rsaz_1024_scatter5_avx2(table_s, result, i); + for (int j = 2 * i; j < 32; j *= 2) { + // table[j] = table[j/2]^2 = a_inv^j + rsaz_1024_sqr_avx2(result, result, m, k0, 1); + rsaz_1024_scatter5_avx2(table_s, result, j); + } + } + + // Load the first window. + const uint8_t *p_str = (const uint8_t *)exponent; + int wvalue = p_str[127] >> 3; + rsaz_1024_gather5_avx2(result, table_s, wvalue); + + int index = 1014; + while (index > -1) { // Loop for the remaining 127 windows. + rsaz_1024_sqr_avx2(result, result, m, k0, 5); + + uint16_t wvalue_16; + memcpy(&wvalue_16, &p_str[index / 8], sizeof(wvalue_16)); + wvalue = wvalue_16; + wvalue = (wvalue >> (index % 8)) & 31; + index -= 5; + + rsaz_1024_gather5_avx2(a_inv, table_s, wvalue); // Borrow |a_inv|. + rsaz_1024_mul_avx2(result, result, a_inv, m, k0); + } + + // Square four times. 
+ rsaz_1024_sqr_avx2(result, result, m, k0, 4); + + wvalue = p_str[0] & 15; + + rsaz_1024_gather5_avx2(a_inv, table_s, wvalue); // Borrow |a_inv|. + rsaz_1024_mul_avx2(result, result, a_inv, m, k0); + + // Convert from Montgomery. + rsaz_1024_mul_avx2(result, result, rsaz_one, m, k0); + + rsaz_1024_red2norm_avx2(result_norm, result); + BN_ULONG scratch[16]; + bn_reduce_once_in_place(result_norm, /*carry=*/0, m_norm, scratch, 16); + + OPENSSL_cleanse(storage, MOD_EXP_CTIME_STORAGE_LEN * sizeof(BN_ULONG)); +} + +#endif // RSAZ_ENABLED diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/rsaz_exp.h b/third_party/boringssl/src/crypto/fipsmodule/bn/rsaz_exp.h index bc7a439e..9bd3cb9a 100644 --- a/third_party/boringssl/src/crypto/fipsmodule/bn/rsaz_exp.h +++ b/third_party/boringssl/src/crypto/fipsmodule/bn/rsaz_exp.h @@ -1,57 +1,57 @@ -/* - * Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved. - * Copyright (c) 2012, Intel Corporation. All Rights Reserved. - * - * Licensed under the OpenSSL license (the "License"). You may not use - * this file except in compliance with the License. You can obtain a copy - * in the file LICENSE in the source distribution or at - * https://www.openssl.org/source/license.html - * - * Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1) - * (1) Intel Corporation, Israel Development Center, Haifa, Israel - * (2) University of Haifa, Israel - */ - -#ifndef OPENSSL_HEADER_BN_RSAZ_EXP_H -#define OPENSSL_HEADER_BN_RSAZ_EXP_H +// Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved. +// Copyright (c) 2012, Intel Corporation. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1) +// (1) Intel Corporation, Israel Development Center, Haifa, Israel +// (2) University of Haifa, Israel + +#ifndef OPENSSL_HEADER_CRYPTO_FIPSMODULE_BN_RSAZ_EXP_H +#define OPENSSL_HEADER_CRYPTO_FIPSMODULE_BN_RSAZ_EXP_H #include -#include "internal.h" #include "../../internal.h" +#include "internal.h" + -#if defined(__cplusplus) -extern "C" { -#endif +BSSL_NAMESPACE_BEGIN #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) #define RSAZ_ENABLED - // RSAZ_1024_mod_exp_avx2 sets |result| to |base_norm| raised to |exponent| // modulo |m_norm|. |base_norm| must be fully-reduced and |exponent| must have // the high bit set (it is 1024 bits wide). |RR| and |k0| must be |RR| and |n0|, // respectively, extracted from |m_norm|'s |BN_MONT_CTX|. |storage_words| is a -// temporary buffer that must be aligned to |MOD_EXP_CTIME_MIN_CACHE_LINE_WIDTH| -// bytes. +// temporary buffer that must be aligned to |MOD_EXP_CTIME_ALIGN| bytes. 
void RSAZ_1024_mod_exp_avx2(BN_ULONG result[16], const BN_ULONG base_norm[16], const BN_ULONG exponent[16], const BN_ULONG m_norm[16], const BN_ULONG RR[16], BN_ULONG k0, BN_ULONG storage_words[MOD_EXP_CTIME_STORAGE_LEN]); -OPENSSL_INLINE int rsaz_avx2_capable(void) { - return CRYPTO_is_AVX2_capable(); -} +inline int rsaz_avx2_capable() { return bssl::CRYPTO_is_AVX2_capable(); } -OPENSSL_INLINE int rsaz_avx2_preferred(void) { - if (CRYPTO_is_BMI1_capable() && CRYPTO_is_BMI2_capable() && - CRYPTO_is_ADX_capable()) { +inline int rsaz_avx2_preferred() { + if (bssl::CRYPTO_is_BMI1_capable() && bssl::CRYPTO_is_BMI2_capable() && + bssl::CRYPTO_is_ADX_capable()) { // If BMI1, BMI2, and ADX are available, x86_64-mont5.pl is faster. See the // .Lmulx4x_enter and .Lpowerx5_enter branches. return 0; } - return CRYPTO_is_AVX2_capable(); + return bssl::CRYPTO_is_AVX2_capable(); } @@ -64,30 +64,34 @@ OPENSSL_INLINE int rsaz_avx2_preferred(void) { // rsaz_1024_norm2red_avx2 converts |norm| from |BIGNUM| to RSAZ representation // and writes the result to |red|. -void rsaz_1024_norm2red_avx2(BN_ULONG red[40], const BN_ULONG norm[16]); +extern "C" void rsaz_1024_norm2red_avx2(BN_ULONG red[40], + const BN_ULONG norm[16]); // rsaz_1024_mul_avx2 computes |a| * |b| mod |n| and writes the result to |ret|. // Inputs and outputs are in Montgomery form, using RSAZ's representation. |k| // is -|n|^-1 mod 2^64 or |n0| from |BN_MONT_CTX|. -void rsaz_1024_mul_avx2(BN_ULONG ret[40], const BN_ULONG a[40], - const BN_ULONG b[40], const BN_ULONG n[40], BN_ULONG k); +extern "C" void rsaz_1024_mul_avx2(BN_ULONG ret[40], const BN_ULONG a[40], + const BN_ULONG b[40], const BN_ULONG n[40], + BN_ULONG k); // rsaz_1024_mul_avx2 computes |a|^(2*|count|) mod |n| and writes the result to // |ret|. Inputs and outputs are in Montgomery form, using RSAZ's // representation. |k| is -|n|^-1 mod 2^64 or |n0| from |BN_MONT_CTX|. 
-void rsaz_1024_sqr_avx2(BN_ULONG ret[40], const BN_ULONG a[40], - const BN_ULONG n[40], BN_ULONG k, int count); +extern "C" void rsaz_1024_sqr_avx2(BN_ULONG ret[40], const BN_ULONG a[40], + const BN_ULONG n[40], BN_ULONG k, int count); // rsaz_1024_scatter5_avx2 stores |val| at index |i| of |tbl|. |i| must be -// positive and at most 31. Note the table only uses 18 |BN_ULONG|s per entry -// instead of 40. It packs two 29-bit limbs into each |BN_ULONG| and only stores -// 36 limbs rather than the padded 40. -void rsaz_1024_scatter5_avx2(BN_ULONG tbl[32 * 18], const BN_ULONG val[40], - int i); - -// rsaz_1024_gather5_avx2 loads index |i| of |tbl| and writes it to |val|. -void rsaz_1024_gather5_avx2(BN_ULONG val[40], const BN_ULONG tbl[32 * 18], - int i); +// positive and at most 31. It is treated as public. Note the table only uses 18 +// |BN_ULONG|s per entry instead of 40. It packs two 29-bit limbs into each +// |BN_ULONG| and only stores 36 limbs rather than the padded 40. +extern "C" void rsaz_1024_scatter5_avx2(BN_ULONG tbl[32 * 18], + const BN_ULONG val[40], int i); + +// rsaz_1024_gather5_avx2 loads index |i| of |tbl| and writes it to |val|. |i| +// must be positive and at most 31. It is treated as secret. |tbl| must be +// aligned to 32 bytes. +extern "C" void rsaz_1024_gather5_avx2(BN_ULONG val[40], + const BN_ULONG tbl[32 * 18], int i); // rsaz_1024_red2norm_avx2 converts |red| from RSAZ to |BIGNUM| representation // and writes the result to |norm|. The result will be <= the modulus. @@ -95,13 +99,11 @@ void rsaz_1024_gather5_avx2(BN_ULONG val[40], const BN_ULONG tbl[32 * 18], // WARNING: The result of this operation may not be fully reduced. |norm| may be // the modulus instead of zero. This function should be followed by a call to // |bn_reduce_once|. 
-void rsaz_1024_red2norm_avx2(BN_ULONG norm[16], const BN_ULONG red[40]); - +extern "C" void rsaz_1024_red2norm_avx2(BN_ULONG norm[16], + const BN_ULONG red[40]); #endif // !OPENSSL_NO_ASM && OPENSSL_X86_64 -#if defined(__cplusplus) -} // extern "C" -#endif +BSSL_NAMESPACE_END -#endif // OPENSSL_HEADER_BN_RSAZ_EXP_H +#endif // OPENSSL_HEADER_CRYPTO_FIPSMODULE_BN_RSAZ_EXP_H diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/shift.c b/third_party/boringssl/src/crypto/fipsmodule/bn/shift.c deleted file mode 100644 index 55f864ea..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/bn/shift.c +++ /dev/null @@ -1,363 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. 
- * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
*/ - -#include - -#include -#include - -#include - -#include "internal.h" - - -int BN_lshift(BIGNUM *r, const BIGNUM *a, int n) { - int i, nw, lb, rb; - BN_ULONG *t, *f; - BN_ULONG l; - - if (n < 0) { - OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER); - return 0; - } - - r->neg = a->neg; - nw = n / BN_BITS2; - if (!bn_wexpand(r, a->width + nw + 1)) { - return 0; - } - lb = n % BN_BITS2; - rb = BN_BITS2 - lb; - f = a->d; - t = r->d; - t[a->width + nw] = 0; - if (lb == 0) { - for (i = a->width - 1; i >= 0; i--) { - t[nw + i] = f[i]; - } - } else { - for (i = a->width - 1; i >= 0; i--) { - l = f[i]; - t[nw + i + 1] |= l >> rb; - t[nw + i] = l << lb; - } - } - OPENSSL_memset(t, 0, nw * sizeof(t[0])); - r->width = a->width + nw + 1; - bn_set_minimal_width(r); - - return 1; -} - -int BN_lshift1(BIGNUM *r, const BIGNUM *a) { - BN_ULONG *ap, *rp, t, c; - int i; - - if (r != a) { - r->neg = a->neg; - if (!bn_wexpand(r, a->width + 1)) { - return 0; - } - r->width = a->width; - } else { - if (!bn_wexpand(r, a->width + 1)) { - return 0; - } - } - ap = a->d; - rp = r->d; - c = 0; - for (i = 0; i < a->width; i++) { - t = *(ap++); - *(rp++) = (t << 1) | c; - c = t >> (BN_BITS2 - 1); - } - if (c) { - *rp = 1; - r->width++; - } - - return 1; -} - -void bn_rshift_words(BN_ULONG *r, const BN_ULONG *a, unsigned shift, - size_t num) { - unsigned shift_bits = shift % BN_BITS2; - size_t shift_words = shift / BN_BITS2; - if (shift_words >= num) { - OPENSSL_memset(r, 0, num * sizeof(BN_ULONG)); - return; - } - if (shift_bits == 0) { - OPENSSL_memmove(r, a + shift_words, (num - shift_words) * sizeof(BN_ULONG)); - } else { - for (size_t i = shift_words; i < num - 1; i++) { - r[i - shift_words] = - (a[i] >> shift_bits) | (a[i + 1] << (BN_BITS2 - shift_bits)); - } - r[num - 1 - shift_words] = a[num - 1] >> shift_bits; - } - OPENSSL_memset(r + num - shift_words, 0, shift_words * sizeof(BN_ULONG)); -} - -int BN_rshift(BIGNUM *r, const BIGNUM *a, int n) { - if (n < 0) { - OPENSSL_PUT_ERROR(BN, 
BN_R_NEGATIVE_NUMBER); - return 0; - } - - if (!bn_wexpand(r, a->width)) { - return 0; - } - bn_rshift_words(r->d, a->d, n, a->width); - r->neg = a->neg; - r->width = a->width; - bn_set_minimal_width(r); - return 1; -} - -int bn_rshift_secret_shift(BIGNUM *r, const BIGNUM *a, unsigned n, - BN_CTX *ctx) { - int ret = 0; - BN_CTX_start(ctx); - BIGNUM *tmp = BN_CTX_get(ctx); - if (tmp == NULL || - !BN_copy(r, a) || - !bn_wexpand(tmp, r->width)) { - goto err; - } - - // Shift conditionally by powers of two. - unsigned max_bits = BN_BITS2 * r->width; - for (unsigned i = 0; (max_bits >> i) != 0; i++) { - BN_ULONG mask = (n >> i) & 1; - mask = 0 - mask; - bn_rshift_words(tmp->d, r->d, 1u << i, r->width); - bn_select_words(r->d, mask, tmp->d /* apply shift */, - r->d /* ignore shift */, r->width); - } - - ret = 1; - -err: - BN_CTX_end(ctx); - return ret; -} - -void bn_rshift1_words(BN_ULONG *r, const BN_ULONG *a, size_t num) { - if (num == 0) { - return; - } - for (size_t i = 0; i < num - 1; i++) { - r[i] = (a[i] >> 1) | (a[i + 1] << (BN_BITS2 - 1)); - } - r[num - 1] = a[num - 1] >> 1; -} - -int BN_rshift1(BIGNUM *r, const BIGNUM *a) { - if (!bn_wexpand(r, a->width)) { - return 0; - } - bn_rshift1_words(r->d, a->d, a->width); - r->width = a->width; - r->neg = a->neg; - bn_set_minimal_width(r); - return 1; -} - -int BN_set_bit(BIGNUM *a, int n) { - if (n < 0) { - return 0; - } - - int i = n / BN_BITS2; - int j = n % BN_BITS2; - if (a->width <= i) { - if (!bn_wexpand(a, i + 1)) { - return 0; - } - for (int k = a->width; k < i + 1; k++) { - a->d[k] = 0; - } - a->width = i + 1; - } - - a->d[i] |= (((BN_ULONG)1) << j); - - return 1; -} - -int BN_clear_bit(BIGNUM *a, int n) { - int i, j; - - if (n < 0) { - return 0; - } - - i = n / BN_BITS2; - j = n % BN_BITS2; - if (a->width <= i) { - return 0; - } - - a->d[i] &= (~(((BN_ULONG)1) << j)); - bn_set_minimal_width(a); - return 1; -} - -int bn_is_bit_set_words(const BN_ULONG *a, size_t num, unsigned bit) { - unsigned i = bit / 
BN_BITS2; - unsigned j = bit % BN_BITS2; - if (i >= num) { - return 0; - } - return (a[i] >> j) & 1; -} - -int BN_is_bit_set(const BIGNUM *a, int n) { - if (n < 0) { - return 0; - } - return bn_is_bit_set_words(a->d, a->width, n); -} - -int BN_mask_bits(BIGNUM *a, int n) { - if (n < 0) { - return 0; - } - - int w = n / BN_BITS2; - int b = n % BN_BITS2; - if (w >= a->width) { - return 1; - } - if (b == 0) { - a->width = w; - } else { - a->width = w + 1; - a->d[w] &= ~(BN_MASK2 << b); - } - - bn_set_minimal_width(a); - return 1; -} - -static int bn_count_low_zero_bits_word(BN_ULONG l) { - static_assert(sizeof(BN_ULONG) <= sizeof(crypto_word_t), - "crypto_word_t is too small"); - static_assert(sizeof(int) <= sizeof(crypto_word_t), - "crypto_word_t is too small"); - static_assert(BN_BITS2 == sizeof(BN_ULONG) * 8, "BN_ULONG has padding bits"); - // C has very bizarre rules for types smaller than an int. - static_assert(sizeof(BN_ULONG) >= sizeof(int), - "BN_ULONG gets promoted to int"); - - crypto_word_t mask; - int bits = 0; - -#if BN_BITS2 > 32 - // Check if the lower half of |x| are all zero. - mask = constant_time_is_zero_w(l << (BN_BITS2 - 32)); - // If the lower half is all zeros, it is included in the bit count and we - // count the upper half. Otherwise, we count the lower half. - bits += 32 & mask; - l = constant_time_select_w(mask, l >> 32, l); -#endif - - // The remaining blocks are analogous iterations at lower powers of two. 
- mask = constant_time_is_zero_w(l << (BN_BITS2 - 16)); - bits += 16 & mask; - l = constant_time_select_w(mask, l >> 16, l); - - mask = constant_time_is_zero_w(l << (BN_BITS2 - 8)); - bits += 8 & mask; - l = constant_time_select_w(mask, l >> 8, l); - - mask = constant_time_is_zero_w(l << (BN_BITS2 - 4)); - bits += 4 & mask; - l = constant_time_select_w(mask, l >> 4, l); - - mask = constant_time_is_zero_w(l << (BN_BITS2 - 2)); - bits += 2 & mask; - l = constant_time_select_w(mask, l >> 2, l); - - mask = constant_time_is_zero_w(l << (BN_BITS2 - 1)); - bits += 1 & mask; - - return bits; -} - -int BN_count_low_zero_bits(const BIGNUM *bn) { - static_assert(sizeof(BN_ULONG) <= sizeof(crypto_word_t), - "crypto_word_t is too small"); - static_assert(sizeof(int) <= sizeof(crypto_word_t), - "crypto_word_t is too small"); - - int ret = 0; - crypto_word_t saw_nonzero = 0; - for (int i = 0; i < bn->width; i++) { - crypto_word_t nonzero = ~constant_time_is_zero_w(bn->d[i]); - crypto_word_t first_nonzero = ~saw_nonzero & nonzero; - saw_nonzero |= nonzero; - - int bits = bn_count_low_zero_bits_word(bn->d[i]); - ret |= first_nonzero & (i * BN_BITS2 + bits); - } - - // If got to the end of |bn| and saw no non-zero words, |bn| is zero. |ret| - // will then remain zero. - return ret; -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/shift.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/bn/shift.cc.inc new file mode 100644 index 00000000..4799bac9 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/bn/shift.cc.inc @@ -0,0 +1,317 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + +#include + +#include "internal.h" + + +using namespace bssl; + +int BN_lshift(BIGNUM *r, const BIGNUM *a, int n) { + int i, nw, lb, rb; + BN_ULONG *t, *f; + BN_ULONG l; + + if (n < 0) { + OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER); + return 0; + } + + r->neg = a->neg; + nw = n / BN_BITS2; + if (!bn_wexpand(r, a->width + nw + 1)) { + return 0; + } + lb = n % BN_BITS2; + rb = BN_BITS2 - lb; + f = a->d; + t = r->d; + t[a->width + nw] = 0; + if (lb == 0) { + for (i = a->width - 1; i >= 0; i--) { + t[nw + i] = f[i]; + } + } else { + for (i = a->width - 1; i >= 0; i--) { + l = f[i]; + t[nw + i + 1] |= l >> rb; + t[nw + i] = l << lb; + } + } + OPENSSL_memset(t, 0, nw * sizeof(t[0])); + r->width = a->width + nw + 1; + bn_set_minimal_width(r); + + return 1; +} + +int BN_lshift1(BIGNUM *r, const BIGNUM *a) { + BN_ULONG *ap, *rp, t, c; + int i; + + if (r != a) { + r->neg = a->neg; + if (!bn_wexpand(r, a->width + 1)) { + return 0; + } + r->width = a->width; + } else { + if (!bn_wexpand(r, a->width + 1)) { + return 0; + } + } + ap = a->d; + rp = r->d; + c = 0; + for (i = 0; i < a->width; i++) { + t = *(ap++); + *(rp++) = (t << 1) | c; + c = t >> (BN_BITS2 - 1); + } + if (c) { + *rp = 1; + r->width++; + } + + return 1; +} + +void bssl::bn_rshift_words(BN_ULONG *r, const BN_ULONG *a, unsigned shift, + size_t num) { + unsigned shift_bits = shift % BN_BITS2; + size_t shift_words = shift / BN_BITS2; + if (shift_words >= num) { + OPENSSL_memset(r, 0, num * sizeof(BN_ULONG)); + return; + } + if (shift_bits == 0) { + 
OPENSSL_memmove(r, a + shift_words, (num - shift_words) * sizeof(BN_ULONG)); + } else { + for (size_t i = shift_words; i < num - 1; i++) { + r[i - shift_words] = + (a[i] >> shift_bits) | (a[i + 1] << (BN_BITS2 - shift_bits)); + } + r[num - 1 - shift_words] = a[num - 1] >> shift_bits; + } + OPENSSL_memset(r + num - shift_words, 0, shift_words * sizeof(BN_ULONG)); +} + +int BN_rshift(BIGNUM *r, const BIGNUM *a, int n) { + if (n < 0) { + OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER); + return 0; + } + + if (!bn_wexpand(r, a->width)) { + return 0; + } + bn_rshift_words(r->d, a->d, n, a->width); + r->neg = a->neg; + r->width = a->width; + bn_set_minimal_width(r); + return 1; +} + +int bssl::bn_rshift_secret_shift(BIGNUM *r, const BIGNUM *a, unsigned n, + BN_CTX *ctx) { + BN_CTXScope scope(ctx); + BIGNUM *tmp = BN_CTX_get(ctx); + unsigned max_bits; + if (tmp == nullptr || !BN_copy(r, a) || !bn_wexpand(tmp, r->width)) { + return 0; + } + + // Shift conditionally by powers of two. + max_bits = BN_BITS2 * r->width; + for (unsigned i = 0; (max_bits >> i) != 0; i++) { + BN_ULONG mask = (n >> i) & 1; + mask = 0 - mask; + bn_rshift_words(tmp->d, r->d, 1u << i, r->width); + bn_select_words(r->d, mask, tmp->d /* apply shift */, + r->d /* ignore shift */, r->width); + } + + return 1; +} + +void bssl::bn_rshift1_words(BN_ULONG *r, const BN_ULONG *a, size_t num) { + if (num == 0) { + return; + } + for (size_t i = 0; i < num - 1; i++) { + r[i] = (a[i] >> 1) | (a[i + 1] << (BN_BITS2 - 1)); + } + r[num - 1] = a[num - 1] >> 1; +} + +int BN_rshift1(BIGNUM *r, const BIGNUM *a) { + if (!bn_wexpand(r, a->width)) { + return 0; + } + bn_rshift1_words(r->d, a->d, a->width); + r->width = a->width; + r->neg = a->neg; + bn_set_minimal_width(r); + return 1; +} + +int BN_set_bit(BIGNUM *a, int n) { + if (n < 0) { + return 0; + } + + int i = n / BN_BITS2; + int j = n % BN_BITS2; + if (a->width <= i) { + if (!bn_wexpand(a, i + 1)) { + return 0; + } + for (int k = a->width; k < i + 1; k++) { + a->d[k] = 
0; + } + a->width = i + 1; + } + + a->d[i] |= (((BN_ULONG)1) << j); + + return 1; +} + +int BN_clear_bit(BIGNUM *a, int n) { + int i, j; + + if (n < 0) { + return 0; + } + + i = n / BN_BITS2; + j = n % BN_BITS2; + if (a->width <= i) { + return 0; + } + + a->d[i] &= (~(((BN_ULONG)1) << j)); + bn_set_minimal_width(a); + return 1; +} + +int bssl::bn_is_bit_set_words(const BN_ULONG *a, size_t num, size_t bit) { + size_t i = bit / BN_BITS2; + size_t j = bit % BN_BITS2; + if (i >= num) { + return 0; + } + return (a[i] >> j) & 1; +} + +int BN_is_bit_set(const BIGNUM *a, int n) { + if (n < 0) { + return 0; + } + return bn_is_bit_set_words(a->d, a->width, n); +} + +int BN_mask_bits(BIGNUM *a, int n) { + if (n < 0) { + return 0; + } + + int w = n / BN_BITS2; + int b = n % BN_BITS2; + if (w >= a->width) { + return 1; + } + if (b == 0) { + a->width = w; + } else { + a->width = w + 1; + a->d[w] &= ~(BN_MASK2 << b); + } + + bn_set_minimal_width(a); + return 1; +} + +static int bn_count_low_zero_bits_word(BN_ULONG l) { + static_assert(sizeof(BN_ULONG) <= sizeof(crypto_word_t), + "crypto_word_t is too small"); + static_assert(sizeof(int) <= sizeof(crypto_word_t), + "crypto_word_t is too small"); + static_assert(BN_BITS2 == sizeof(BN_ULONG) * 8, "BN_ULONG has padding bits"); + // C has very bizarre rules for types smaller than an int. + static_assert(sizeof(BN_ULONG) >= sizeof(int), + "BN_ULONG gets promoted to int"); + + crypto_word_t mask; + int bits = 0; + +#if BN_BITS2 > 32 + // Check if the lower half of |x| are all zero. + mask = constant_time_is_zero_w(l << (BN_BITS2 - 32)); + // If the lower half is all zeros, it is included in the bit count and we + // count the upper half. Otherwise, we count the lower half. + bits += 32 & mask; + l = constant_time_select_w(mask, l >> 32, l); +#endif + + // The remaining blocks are analogous iterations at lower powers of two. 
+ mask = constant_time_is_zero_w(l << (BN_BITS2 - 16)); + bits += 16 & mask; + l = constant_time_select_w(mask, l >> 16, l); + + mask = constant_time_is_zero_w(l << (BN_BITS2 - 8)); + bits += 8 & mask; + l = constant_time_select_w(mask, l >> 8, l); + + mask = constant_time_is_zero_w(l << (BN_BITS2 - 4)); + bits += 4 & mask; + l = constant_time_select_w(mask, l >> 4, l); + + mask = constant_time_is_zero_w(l << (BN_BITS2 - 2)); + bits += 2 & mask; + l = constant_time_select_w(mask, l >> 2, l); + + mask = constant_time_is_zero_w(l << (BN_BITS2 - 1)); + bits += 1 & mask; + + return bits; +} + +int BN_count_low_zero_bits(const BIGNUM *bn) { + static_assert(sizeof(BN_ULONG) <= sizeof(crypto_word_t), + "crypto_word_t is too small"); + static_assert(sizeof(int) <= sizeof(crypto_word_t), + "crypto_word_t is too small"); + + int ret = 0; + crypto_word_t saw_nonzero = 0; + for (int i = 0; i < bn->width; i++) { + crypto_word_t nonzero = ~constant_time_is_zero_w(bn->d[i]); + crypto_word_t first_nonzero = ~saw_nonzero & nonzero; + saw_nonzero |= nonzero; + + int bits = bn_count_low_zero_bits_word(bn->d[i]); + ret |= first_nonzero & (i * BN_BITS2 + bits); + } + + // If got to the end of |bn| and saw no non-zero words, |bn| is zero. |ret| + // will then remain zero. + return ret; +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/sqrt.c b/third_party/boringssl/src/crypto/fipsmodule/bn/sqrt.c deleted file mode 100644 index 9180d540..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/bn/sqrt.c +++ /dev/null @@ -1,500 +0,0 @@ -/* Written by Lenka Fibikova - * and Bodo Moeller for the OpenSSL project. */ -/* ==================================================================== - * Copyright (c) 1998-2000 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. 
Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). */ - -#include - -#include - -#include "internal.h" - - -BIGNUM *BN_mod_sqrt(BIGNUM *in, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx) { - // Compute a square root of |a| mod |p| using the Tonelli/Shanks algorithm - // (cf. Henri Cohen, "A Course in Algebraic Computational Number Theory", - // algorithm 1.5.1). |p| is assumed to be a prime. 
- - BIGNUM *ret = in; - int err = 1; - int r; - BIGNUM *A, *b, *q, *t, *x, *y; - int e, i, j; - - if (!BN_is_odd(p) || BN_abs_is_word(p, 1)) { - if (BN_abs_is_word(p, 2)) { - if (ret == NULL) { - ret = BN_new(); - } - if (ret == NULL || - !BN_set_word(ret, BN_is_bit_set(a, 0))) { - if (ret != in) { - BN_free(ret); - } - return NULL; - } - return ret; - } - - OPENSSL_PUT_ERROR(BN, BN_R_P_IS_NOT_PRIME); - return NULL; - } - - if (BN_is_zero(a) || BN_is_one(a)) { - if (ret == NULL) { - ret = BN_new(); - } - if (ret == NULL || - !BN_set_word(ret, BN_is_one(a))) { - if (ret != in) { - BN_free(ret); - } - return NULL; - } - return ret; - } - - BN_CTX_start(ctx); - A = BN_CTX_get(ctx); - b = BN_CTX_get(ctx); - q = BN_CTX_get(ctx); - t = BN_CTX_get(ctx); - x = BN_CTX_get(ctx); - y = BN_CTX_get(ctx); - if (y == NULL) { - goto end; - } - - if (ret == NULL) { - ret = BN_new(); - } - if (ret == NULL) { - goto end; - } - - // A = a mod p - if (!BN_nnmod(A, a, p, ctx)) { - goto end; - } - - // now write |p| - 1 as 2^e*q where q is odd - e = 1; - while (!BN_is_bit_set(p, e)) { - e++; - } - // we'll set q later (if needed) - - if (e == 1) { - // The easy case: (|p|-1)/2 is odd, so 2 has an inverse - // modulo (|p|-1)/2, and square roots can be computed - // directly by modular exponentiation. - // We have - // 2 * (|p|+1)/4 == 1 (mod (|p|-1)/2), - // so we can use exponent (|p|+1)/4, i.e. (|p|-3)/4 + 1. - if (!BN_rshift(q, p, 2)) { - goto end; - } - q->neg = 0; - if (!BN_add_word(q, 1) || - !BN_mod_exp_mont(ret, A, q, p, ctx, NULL)) { - goto end; - } - err = 0; - goto vrfy; - } - - if (e == 2) { - // |p| == 5 (mod 8) - // - // In this case 2 is always a non-square since - // Legendre(2,p) = (-1)^((p^2-1)/8) for any odd prime. - // So if a really is a square, then 2*a is a non-square. 
- // Thus for - // b := (2*a)^((|p|-5)/8), - // i := (2*a)*b^2 - // we have - // i^2 = (2*a)^((1 + (|p|-5)/4)*2) - // = (2*a)^((p-1)/2) - // = -1; - // so if we set - // x := a*b*(i-1), - // then - // x^2 = a^2 * b^2 * (i^2 - 2*i + 1) - // = a^2 * b^2 * (-2*i) - // = a*(-i)*(2*a*b^2) - // = a*(-i)*i - // = a. - // - // (This is due to A.O.L. Atkin, - // , - // November 1992.) - - // t := 2*a - if (!bn_mod_lshift1_consttime(t, A, p, ctx)) { - goto end; - } - - // b := (2*a)^((|p|-5)/8) - if (!BN_rshift(q, p, 3)) { - goto end; - } - q->neg = 0; - if (!BN_mod_exp_mont(b, t, q, p, ctx, NULL)) { - goto end; - } - - // y := b^2 - if (!BN_mod_sqr(y, b, p, ctx)) { - goto end; - } - - // t := (2*a)*b^2 - 1 - if (!BN_mod_mul(t, t, y, p, ctx) || - !BN_sub_word(t, 1)) { - goto end; - } - - // x = a*b*t - if (!BN_mod_mul(x, A, b, p, ctx) || - !BN_mod_mul(x, x, t, p, ctx)) { - goto end; - } - - if (!BN_copy(ret, x)) { - goto end; - } - err = 0; - goto vrfy; - } - - // e > 2, so we really have to use the Tonelli/Shanks algorithm. - // First, find some y that is not a square. - if (!BN_copy(q, p)) { - goto end; // use 'q' as temp - } - q->neg = 0; - i = 2; - do { - // For efficiency, try small numbers first; - // if this fails, try random numbers. - if (i < 22) { - if (!BN_set_word(y, i)) { - goto end; - } - } else { - if (!BN_pseudo_rand(y, BN_num_bits(p), 0, 0)) { - goto end; - } - if (BN_ucmp(y, p) >= 0) { - if (!(p->neg ? BN_add : BN_sub)(y, y, p)) { - goto end; - } - } - // now 0 <= y < |p| - if (BN_is_zero(y)) { - if (!BN_set_word(y, i)) { - goto end; - } - } - } - - r = bn_jacobi(y, q, ctx); // here 'q' is |p| - if (r < -1) { - goto end; - } - if (r == 0) { - // m divides p - OPENSSL_PUT_ERROR(BN, BN_R_P_IS_NOT_PRIME); - goto end; - } - } while (r == 1 && ++i < 82); - - if (r != -1) { - // Many rounds and still no non-square -- this is more likely - // a bug than just bad luck. - // Even if p is not prime, we should have found some y - // such that r == -1. 
- OPENSSL_PUT_ERROR(BN, BN_R_TOO_MANY_ITERATIONS); - goto end; - } - - // Here's our actual 'q': - if (!BN_rshift(q, q, e)) { - goto end; - } - - // Now that we have some non-square, we can find an element - // of order 2^e by computing its q'th power. - if (!BN_mod_exp_mont(y, y, q, p, ctx, NULL)) { - goto end; - } - if (BN_is_one(y)) { - OPENSSL_PUT_ERROR(BN, BN_R_P_IS_NOT_PRIME); - goto end; - } - - // Now we know that (if p is indeed prime) there is an integer - // k, 0 <= k < 2^e, such that - // - // a^q * y^k == 1 (mod p). - // - // As a^q is a square and y is not, k must be even. - // q+1 is even, too, so there is an element - // - // X := a^((q+1)/2) * y^(k/2), - // - // and it satisfies - // - // X^2 = a^q * a * y^k - // = a, - // - // so it is the square root that we are looking for. - - // t := (q-1)/2 (note that q is odd) - if (!BN_rshift1(t, q)) { - goto end; - } - - // x := a^((q-1)/2) - if (BN_is_zero(t)) { // special case: p = 2^e + 1 - if (!BN_nnmod(t, A, p, ctx)) { - goto end; - } - if (BN_is_zero(t)) { - // special case: a == 0 (mod p) - BN_zero(ret); - err = 0; - goto end; - } else if (!BN_one(x)) { - goto end; - } - } else { - if (!BN_mod_exp_mont(x, A, t, p, ctx, NULL)) { - goto end; - } - if (BN_is_zero(x)) { - // special case: a == 0 (mod p) - BN_zero(ret); - err = 0; - goto end; - } - } - - // b := a*x^2 (= a^q) - if (!BN_mod_sqr(b, x, p, ctx) || - !BN_mod_mul(b, b, A, p, ctx)) { - goto end; - } - - // x := a*x (= a^((q+1)/2)) - if (!BN_mod_mul(x, x, A, p, ctx)) { - goto end; - } - - while (1) { - // Now b is a^q * y^k for some even k (0 <= k < 2^E - // where E refers to the original value of e, which we - // don't keep in a variable), and x is a^((q+1)/2) * y^(k/2). - // - // We have a*b = x^2, - // y^2^(e-1) = -1, - // b^2^(e-1) = 1. 
- if (BN_is_one(b)) { - if (!BN_copy(ret, x)) { - goto end; - } - err = 0; - goto vrfy; - } - - // Find the smallest i, 0 < i < e, such that b^(2^i) = 1 - for (i = 1; i < e; i++) { - if (i == 1) { - if (!BN_mod_sqr(t, b, p, ctx)) { - goto end; - } - } else { - if (!BN_mod_mul(t, t, t, p, ctx)) { - goto end; - } - } - if (BN_is_one(t)) { - break; - } - } - // If not found, a is not a square or p is not a prime. - if (i >= e) { - OPENSSL_PUT_ERROR(BN, BN_R_NOT_A_SQUARE); - goto end; - } - - // t := y^2^(e - i - 1) - if (!BN_copy(t, y)) { - goto end; - } - for (j = e - i - 1; j > 0; j--) { - if (!BN_mod_sqr(t, t, p, ctx)) { - goto end; - } - } - if (!BN_mod_mul(y, t, t, p, ctx) || - !BN_mod_mul(x, x, t, p, ctx) || - !BN_mod_mul(b, b, y, p, ctx)) { - goto end; - } - - // e decreases each iteration, so this loop will terminate. - assert(i < e); - e = i; - } - -vrfy: - if (!err) { - // Verify the result. The input might have been not a square. - if (!BN_mod_sqr(x, ret, p, ctx)) { - err = 1; - } - - if (!err && 0 != BN_cmp(x, A)) { - OPENSSL_PUT_ERROR(BN, BN_R_NOT_A_SQUARE); - err = 1; - } - } - -end: - if (err) { - if (ret != in) { - BN_clear_free(ret); - } - ret = NULL; - } - BN_CTX_end(ctx); - return ret; -} - -int BN_sqrt(BIGNUM *out_sqrt, const BIGNUM *in, BN_CTX *ctx) { - BIGNUM *estimate, *tmp, *delta, *last_delta, *tmp2; - int ok = 0, last_delta_valid = 0; - - if (in->neg) { - OPENSSL_PUT_ERROR(BN, BN_R_NEGATIVE_NUMBER); - return 0; - } - if (BN_is_zero(in)) { - BN_zero(out_sqrt); - return 1; - } - - BN_CTX_start(ctx); - if (out_sqrt == in) { - estimate = BN_CTX_get(ctx); - } else { - estimate = out_sqrt; - } - tmp = BN_CTX_get(ctx); - last_delta = BN_CTX_get(ctx); - delta = BN_CTX_get(ctx); - if (estimate == NULL || tmp == NULL || last_delta == NULL || delta == NULL) { - OPENSSL_PUT_ERROR(BN, ERR_R_MALLOC_FAILURE); - goto err; - } - - // We estimate that the square root of an n-bit number is 2^{n/2}. 
- if (!BN_lshift(estimate, BN_value_one(), BN_num_bits(in)/2)) { - goto err; - } - - // This is Newton's method for finding a root of the equation |estimate|^2 - - // |in| = 0. - for (;;) { - // |estimate| = 1/2 * (|estimate| + |in|/|estimate|) - if (!BN_div(tmp, NULL, in, estimate, ctx) || - !BN_add(tmp, tmp, estimate) || - !BN_rshift1(estimate, tmp) || - // |tmp| = |estimate|^2 - !BN_sqr(tmp, estimate, ctx) || - // |delta| = |in| - |tmp| - !BN_sub(delta, in, tmp)) { - OPENSSL_PUT_ERROR(BN, ERR_R_BN_LIB); - goto err; - } - - delta->neg = 0; - // The difference between |in| and |estimate| squared is required to always - // decrease. This ensures that the loop always terminates, but I don't have - // a proof that it always finds the square root for a given square. - if (last_delta_valid && BN_cmp(delta, last_delta) >= 0) { - break; - } - - last_delta_valid = 1; - - tmp2 = last_delta; - last_delta = delta; - delta = tmp2; - } - - if (BN_cmp(tmp, in) != 0) { - OPENSSL_PUT_ERROR(BN, BN_R_NOT_A_SQUARE); - goto err; - } - - ok = 1; - -err: - if (ok && out_sqrt == in && !BN_copy(out_sqrt, estimate)) { - ok = 0; - } - BN_CTX_end(ctx); - return ok; -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/bn/sqrt.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/bn/sqrt.cc.inc new file mode 100644 index 00000000..30804403 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/bn/sqrt.cc.inc @@ -0,0 +1,370 @@ +// Copyright 2000-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include "internal.h" + + +using namespace bssl; + +BIGNUM *BN_mod_sqrt(BIGNUM *in, const BIGNUM *a, const BIGNUM *p, BN_CTX *ctx) { + // Compute a square root of |a| mod |p| using the Tonelli/Shanks algorithm + // (cf. Henri Cohen, "A Course in Algebraic Computational Number Theory", + // algorithm 1.5.1). |p| is assumed to be a prime. + + BIGNUM *ret = in; + int err = 1; + int r; + BIGNUM *A, *b, *q, *t, *x, *y; + int e, i, j; + + if (!BN_is_odd(p) || BN_abs_is_word(p, 1)) { + if (BN_abs_is_word(p, 2)) { + if (ret == nullptr) { + ret = BN_new(); + } + if (ret == nullptr || !BN_set_word(ret, BN_is_bit_set(a, 0))) { + if (ret != in) { + BN_free(ret); + } + return nullptr; + } + return ret; + } + + OPENSSL_PUT_ERROR(BN, BN_R_P_IS_NOT_PRIME); + return nullptr; + } + + if (BN_is_zero(a) || BN_is_one(a)) { + if (ret == nullptr) { + ret = BN_new(); + } + if (ret == nullptr || !BN_set_word(ret, BN_is_one(a))) { + if (ret != in) { + BN_free(ret); + } + return nullptr; + } + return ret; + } + + BN_CTXScope scope(ctx); + A = BN_CTX_get(ctx); + b = BN_CTX_get(ctx); + q = BN_CTX_get(ctx); + t = BN_CTX_get(ctx); + x = BN_CTX_get(ctx); + y = BN_CTX_get(ctx); + if (y == nullptr) { + goto end; + } + + if (ret == nullptr) { + ret = BN_new(); + } + if (ret == nullptr) { + goto end; + } + + // A = a mod p + if (!BN_nnmod(A, a, p, ctx)) { + goto end; + } + + // now write |p| - 1 as 2^e*q where q is odd + e = 1; + while (!BN_is_bit_set(p, e)) { + e++; + } + // we'll set q later (if needed) + + if (e == 1) { + // The easy case: (|p|-1)/2 is odd, so 2 has an inverse + // modulo (|p|-1)/2, and square roots can be computed + // directly by modular exponentiation. + // We have + // 2 * (|p|+1)/4 == 1 (mod (|p|-1)/2), + // so we can use exponent (|p|+1)/4, i.e. (|p|-3)/4 + 1. 
+ if (!BN_rshift(q, p, 2)) { + goto end; + } + q->neg = 0; + if (!BN_add_word(q, 1) || !BN_mod_exp_mont(ret, A, q, p, ctx, nullptr)) { + goto end; + } + err = 0; + goto vrfy; + } + + if (e == 2) { + // |p| == 5 (mod 8) + // + // In this case 2 is always a non-square since + // Legendre(2,p) = (-1)^((p^2-1)/8) for any odd prime. + // So if a really is a square, then 2*a is a non-square. + // Thus for + // b := (2*a)^((|p|-5)/8), + // i := (2*a)*b^2 + // we have + // i^2 = (2*a)^((1 + (|p|-5)/4)*2) + // = (2*a)^((p-1)/2) + // = -1; + // so if we set + // x := a*b*(i-1), + // then + // x^2 = a^2 * b^2 * (i^2 - 2*i + 1) + // = a^2 * b^2 * (-2*i) + // = a*(-i)*(2*a*b^2) + // = a*(-i)*i + // = a. + // + // (This is due to A.O.L. Atkin, + // , + // November 1992.) + + // t := 2*a + if (!bn_mod_lshift1_consttime(t, A, p, ctx)) { + goto end; + } + + // b := (2*a)^((|p|-5)/8) + if (!BN_rshift(q, p, 3)) { + goto end; + } + q->neg = 0; + if (!BN_mod_exp_mont(b, t, q, p, ctx, nullptr)) { + goto end; + } + + // y := b^2 + if (!BN_mod_sqr(y, b, p, ctx)) { + goto end; + } + + // t := (2*a)*b^2 - 1 + if (!BN_mod_mul(t, t, y, p, ctx) || + !BN_sub_word(t, 1)) { + goto end; + } + + // x = a*b*t + if (!BN_mod_mul(x, A, b, p, ctx) || + !BN_mod_mul(x, x, t, p, ctx)) { + goto end; + } + + if (!BN_copy(ret, x)) { + goto end; + } + err = 0; + goto vrfy; + } + + // e > 2, so we really have to use the Tonelli/Shanks algorithm. + // First, find some y that is not a square. + if (!BN_copy(q, p)) { + goto end; // use 'q' as temp + } + q->neg = 0; + i = 2; + do { + // For efficiency, try small numbers first; + // if this fails, try random numbers. 
+ if (i < 22) { + if (!BN_set_word(y, i)) { + goto end; + } + } else { + if (!BN_rand_range_ex(y, 22, p)) { + goto end; + } + } + + r = bn_jacobi(y, q, ctx); // here 'q' is |p| + if (r < -1) { + goto end; + } + if (r == 0) { + // m divides p + OPENSSL_PUT_ERROR(BN, BN_R_P_IS_NOT_PRIME); + goto end; + } + } while (r == 1 && ++i < 82); + + if (r != -1) { + // Many rounds and still no non-square -- this is more likely + // a bug than just bad luck. + // Even if p is not prime, we should have found some y + // such that r == -1. + OPENSSL_PUT_ERROR(BN, BN_R_TOO_MANY_ITERATIONS); + goto end; + } + + // Here's our actual 'q': + if (!BN_rshift(q, q, e)) { + goto end; + } + + // Now that we have some non-square, we can find an element + // of order 2^e by computing its q'th power. + if (!BN_mod_exp_mont(y, y, q, p, ctx, nullptr)) { + goto end; + } + if (BN_is_one(y)) { + OPENSSL_PUT_ERROR(BN, BN_R_P_IS_NOT_PRIME); + goto end; + } + + // Now we know that (if p is indeed prime) there is an integer + // k, 0 <= k < 2^e, such that + // + // a^q * y^k == 1 (mod p). + // + // As a^q is a square and y is not, k must be even. + // q+1 is even, too, so there is an element + // + // X := a^((q+1)/2) * y^(k/2), + // + // and it satisfies + // + // X^2 = a^q * a * y^k + // = a, + // + // so it is the square root that we are looking for. 
+ + // t := (q-1)/2 (note that q is odd) + if (!BN_rshift1(t, q)) { + goto end; + } + + // x := a^((q-1)/2) + if (BN_is_zero(t)) { // special case: p = 2^e + 1 + if (!BN_nnmod(t, A, p, ctx)) { + goto end; + } + if (BN_is_zero(t)) { + // special case: a == 0 (mod p) + BN_zero(ret); + err = 0; + goto end; + } else if (!BN_one(x)) { + goto end; + } + } else { + if (!BN_mod_exp_mont(x, A, t, p, ctx, nullptr)) { + goto end; + } + if (BN_is_zero(x)) { + // special case: a == 0 (mod p) + BN_zero(ret); + err = 0; + goto end; + } + } + + // b := a*x^2 (= a^q) + if (!BN_mod_sqr(b, x, p, ctx) || + !BN_mod_mul(b, b, A, p, ctx)) { + goto end; + } + + // x := a*x (= a^((q+1)/2)) + if (!BN_mod_mul(x, x, A, p, ctx)) { + goto end; + } + + while (1) { + // Now b is a^q * y^k for some even k (0 <= k < 2^E + // where E refers to the original value of e, which we + // don't keep in a variable), and x is a^((q+1)/2) * y^(k/2). + // + // We have a*b = x^2, + // y^2^(e-1) = -1, + // b^2^(e-1) = 1. + if (BN_is_one(b)) { + if (!BN_copy(ret, x)) { + goto end; + } + err = 0; + goto vrfy; + } + + // Find the smallest i, 0 < i < e, such that b^(2^i) = 1 + for (i = 1; i < e; i++) { + if (i == 1) { + if (!BN_mod_sqr(t, b, p, ctx)) { + goto end; + } + } else { + if (!BN_mod_mul(t, t, t, p, ctx)) { + goto end; + } + } + if (BN_is_one(t)) { + break; + } + } + // If not found, a is not a square or p is not a prime. + if (i >= e) { + OPENSSL_PUT_ERROR(BN, BN_R_NOT_A_SQUARE); + goto end; + } + + // t := y^2^(e - i - 1) + if (!BN_copy(t, y)) { + goto end; + } + for (j = e - i - 1; j > 0; j--) { + if (!BN_mod_sqr(t, t, p, ctx)) { + goto end; + } + } + if (!BN_mod_mul(y, t, t, p, ctx) || + !BN_mod_mul(x, x, t, p, ctx) || + !BN_mod_mul(b, b, y, p, ctx)) { + goto end; + } + + // e decreases each iteration, so this loop will terminate. + assert(i < e); + e = i; + } + +vrfy: + if (!err) { + // Verify the result. The input might have been not a square. 
+ if (!BN_mod_sqr(x, ret, p, ctx)) { + err = 1; + } + + if (!err && 0 != BN_cmp(x, A)) { + OPENSSL_PUT_ERROR(BN, BN_R_NOT_A_SQUARE); + err = 1; + } + } + +end: + if (err) { + if (ret != in) { + BN_clear_free(ret); + } + ret = nullptr; + } + return ret; +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/cipher/aead.c b/third_party/boringssl/src/crypto/fipsmodule/cipher/aead.c deleted file mode 100644 index 97f0b0df..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/cipher/aead.c +++ /dev/null @@ -1,287 +0,0 @@ -/* Copyright (c) 2014, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ - -#include - -#include -#include - -#include -#include -#include - -#include "internal.h" -#include "../../internal.h" - - -size_t EVP_AEAD_key_length(const EVP_AEAD *aead) { return aead->key_len; } - -size_t EVP_AEAD_nonce_length(const EVP_AEAD *aead) { return aead->nonce_len; } - -size_t EVP_AEAD_max_overhead(const EVP_AEAD *aead) { return aead->overhead; } - -size_t EVP_AEAD_max_tag_len(const EVP_AEAD *aead) { return aead->max_tag_len; } - -void EVP_AEAD_CTX_zero(EVP_AEAD_CTX *ctx) { - OPENSSL_memset(ctx, 0, sizeof(EVP_AEAD_CTX)); -} - -EVP_AEAD_CTX *EVP_AEAD_CTX_new(const EVP_AEAD *aead, const uint8_t *key, - size_t key_len, size_t tag_len) { - EVP_AEAD_CTX *ctx = OPENSSL_malloc(sizeof(EVP_AEAD_CTX)); - EVP_AEAD_CTX_zero(ctx); - - if (EVP_AEAD_CTX_init(ctx, aead, key, key_len, tag_len, NULL)) { - return ctx; - } - - EVP_AEAD_CTX_free(ctx); - return NULL; -} - -void EVP_AEAD_CTX_free(EVP_AEAD_CTX *ctx) { - if (ctx == NULL) { - return; - } - EVP_AEAD_CTX_cleanup(ctx); - OPENSSL_free(ctx); -} - -int EVP_AEAD_CTX_init(EVP_AEAD_CTX *ctx, const EVP_AEAD *aead, - const uint8_t *key, size_t key_len, size_t tag_len, - ENGINE *impl) { - if (!aead->init) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_NO_DIRECTION_SET); - ctx->aead = NULL; - return 0; - } - return EVP_AEAD_CTX_init_with_direction(ctx, aead, key, key_len, tag_len, - evp_aead_open); -} - -int EVP_AEAD_CTX_init_with_direction(EVP_AEAD_CTX *ctx, const EVP_AEAD *aead, - const uint8_t *key, size_t key_len, - size_t tag_len, - enum evp_aead_direction_t dir) { - if (key_len != aead->key_len) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_KEY_SIZE); - ctx->aead = NULL; - return 0; - } - - ctx->aead = aead; - - int ok; - if (aead->init) { - ok = aead->init(ctx, key, key_len, tag_len); - } else { - ok = aead->init_with_direction(ctx, key, key_len, tag_len, dir); - } - - if (!ok) { - ctx->aead = NULL; - } - - return ok; -} - -void EVP_AEAD_CTX_cleanup(EVP_AEAD_CTX *ctx) { - if (ctx->aead == NULL) { - return; - } - 
ctx->aead->cleanup(ctx); - ctx->aead = NULL; -} - -// check_alias returns 1 if |out| is compatible with |in| and 0 otherwise. If -// |in| and |out| alias, we require that |in| == |out|. -static int check_alias(const uint8_t *in, size_t in_len, const uint8_t *out, - size_t out_len) { - if (!buffers_alias(in, in_len, out, out_len)) { - return 1; - } - - return in == out; -} - -int EVP_AEAD_CTX_seal(const EVP_AEAD_CTX *ctx, uint8_t *out, size_t *out_len, - size_t max_out_len, const uint8_t *nonce, - size_t nonce_len, const uint8_t *in, size_t in_len, - const uint8_t *ad, size_t ad_len) { - if (in_len + ctx->aead->overhead < in_len /* overflow */) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TOO_LARGE); - goto error; - } - - if (max_out_len < in_len) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BUFFER_TOO_SMALL); - goto error; - } - - if (!check_alias(in, in_len, out, max_out_len)) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_OUTPUT_ALIASES_INPUT); - goto error; - } - - size_t out_tag_len; - if (ctx->aead->seal_scatter(ctx, out, out + in_len, &out_tag_len, - max_out_len - in_len, nonce, nonce_len, in, - in_len, NULL, 0, ad, ad_len)) { - *out_len = in_len + out_tag_len; - return 1; - } - -error: - // In the event of an error, clear the output buffer so that a caller - // that doesn't check the return value doesn't send raw data. - OPENSSL_memset(out, 0, max_out_len); - *out_len = 0; - return 0; -} - -int EVP_AEAD_CTX_seal_scatter( - const EVP_AEAD_CTX *ctx, uint8_t *out, uint8_t *out_tag, size_t - *out_tag_len, size_t max_out_tag_len, const uint8_t *nonce, size_t - nonce_len, const uint8_t *in, size_t in_len, const uint8_t *extra_in, - size_t extra_in_len, const uint8_t *ad, size_t ad_len) { - // |in| and |out| may alias exactly, |out_tag| may not alias. 
- if (!check_alias(in, in_len, out, in_len) || - buffers_alias(out, in_len, out_tag, max_out_tag_len) || - buffers_alias(in, in_len, out_tag, max_out_tag_len)) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_OUTPUT_ALIASES_INPUT); - goto error; - } - - if (!ctx->aead->seal_scatter_supports_extra_in && extra_in_len) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_OPERATION); - goto error; - } - - if (ctx->aead->seal_scatter(ctx, out, out_tag, out_tag_len, max_out_tag_len, - nonce, nonce_len, in, in_len, extra_in, - extra_in_len, ad, ad_len)) { - return 1; - } - -error: - // In the event of an error, clear the output buffer so that a caller - // that doesn't check the return value doesn't send raw data. - OPENSSL_memset(out, 0, in_len); - OPENSSL_memset(out_tag, 0, max_out_tag_len); - *out_tag_len = 0; - return 0; -} - -int EVP_AEAD_CTX_open(const EVP_AEAD_CTX *ctx, uint8_t *out, size_t *out_len, - size_t max_out_len, const uint8_t *nonce, - size_t nonce_len, const uint8_t *in, size_t in_len, - const uint8_t *ad, size_t ad_len) { - if (!check_alias(in, in_len, out, max_out_len)) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_OUTPUT_ALIASES_INPUT); - goto error; - } - - if (ctx->aead->open) { - if (!ctx->aead->open(ctx, out, out_len, max_out_len, nonce, nonce_len, in, - in_len, ad, ad_len)) { - goto error; - } - return 1; - } - - // AEADs that use the default implementation of open() must set |tag_len| at - // initialization time. 
- assert(ctx->tag_len); - - if (in_len < ctx->tag_len) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); - goto error; - } - - size_t plaintext_len = in_len - ctx->tag_len; - if (max_out_len < plaintext_len) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BUFFER_TOO_SMALL); - goto error; - } - if (EVP_AEAD_CTX_open_gather(ctx, out, nonce, nonce_len, in, plaintext_len, - in + plaintext_len, ctx->tag_len, ad, ad_len)) { - *out_len = plaintext_len; - return 1; - } - -error: - // In the event of an error, clear the output buffer so that a caller - // that doesn't check the return value doesn't try and process bad - // data. - OPENSSL_memset(out, 0, max_out_len); - *out_len = 0; - return 0; -} - -int EVP_AEAD_CTX_open_gather(const EVP_AEAD_CTX *ctx, uint8_t *out, - const uint8_t *nonce, size_t nonce_len, - const uint8_t *in, size_t in_len, - const uint8_t *in_tag, size_t in_tag_len, - const uint8_t *ad, size_t ad_len) { - if (!check_alias(in, in_len, out, in_len)) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_OUTPUT_ALIASES_INPUT); - goto error; - } - - if (!ctx->aead->open_gather) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_CTRL_NOT_IMPLEMENTED); - goto error; - } - - if (ctx->aead->open_gather(ctx, out, nonce, nonce_len, in, in_len, in_tag, - in_tag_len, ad, ad_len)) { - return 1; - } - -error: - // In the event of an error, clear the output buffer so that a caller - // that doesn't check the return value doesn't try and process bad - // data. 
- OPENSSL_memset(out, 0, in_len); - return 0; -} - -const EVP_AEAD *EVP_AEAD_CTX_aead(const EVP_AEAD_CTX *ctx) { return ctx->aead; } - -int EVP_AEAD_CTX_get_iv(const EVP_AEAD_CTX *ctx, const uint8_t **out_iv, - size_t *out_len) { - if (ctx->aead->get_iv == NULL) { - return 0; - } - - return ctx->aead->get_iv(ctx, out_iv, out_len); -} - -int EVP_AEAD_CTX_tag_len(const EVP_AEAD_CTX *ctx, size_t *out_tag_len, - const size_t in_len, const size_t extra_in_len) { - assert(ctx->aead->seal_scatter_supports_extra_in || !extra_in_len); - - if (ctx->aead->tag_len) { - *out_tag_len = ctx->aead->tag_len(ctx, in_len, extra_in_len); - return 1; - } - - if (extra_in_len + ctx->tag_len < extra_in_len) { - OPENSSL_PUT_ERROR(CIPHER, ERR_R_OVERFLOW); - *out_tag_len = 0; - return 0; - } - *out_tag_len = extra_in_len + ctx->tag_len; - return 1; -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/cipher/aead.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/cipher/aead.cc.inc new file mode 100644 index 00000000..0e721ac6 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/cipher/aead.cc.inc @@ -0,0 +1,617 @@ +// Copyright 2014 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include + +#include +#include + +#include +#include +#include +#include + +#include "../../internal.h" +#include "../../mem_internal.h" +#include "internal.h" + + +using namespace bssl; + +size_t EVP_AEAD_key_length(const EVP_AEAD *aead) { return aead->key_len; } + +size_t EVP_AEAD_nonce_length(const EVP_AEAD *aead) { return aead->nonce_len; } + +size_t EVP_AEAD_max_overhead(const EVP_AEAD *aead) { return aead->overhead; } + +size_t EVP_AEAD_max_tag_len(const EVP_AEAD *aead) { return aead->max_tag_len; } + +void EVP_AEAD_CTX_zero(EVP_AEAD_CTX *ctx) { + OPENSSL_memset(ctx, 0, sizeof(EVP_AEAD_CTX)); +} + +EVP_AEAD_CTX *EVP_AEAD_CTX_new(const EVP_AEAD *aead, const uint8_t *key, + size_t key_len, size_t tag_len) { + EVP_AEAD_CTX *ctx = New(); + if (!ctx) { + return nullptr; + } + EVP_AEAD_CTX_zero(ctx); + + if (EVP_AEAD_CTX_init(ctx, aead, key, key_len, tag_len, nullptr)) { + return ctx; + } + + EVP_AEAD_CTX_free(ctx); + return nullptr; +} + +void EVP_AEAD_CTX_free(EVP_AEAD_CTX *ctx) { + if (ctx == nullptr) { + return; + } + EVP_AEAD_CTX_cleanup(ctx); + Delete(ctx); +} + +int EVP_AEAD_CTX_init(EVP_AEAD_CTX *ctx, const EVP_AEAD *aead, + const uint8_t *key, size_t key_len, size_t tag_len, + ENGINE *impl) { + if (!aead->init) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_NO_DIRECTION_SET); + ctx->aead = nullptr; + return 0; + } + return EVP_AEAD_CTX_init_with_direction(ctx, aead, key, key_len, tag_len, + evp_aead_open); +} + +int EVP_AEAD_CTX_init_with_direction(EVP_AEAD_CTX *ctx, const EVP_AEAD *aead, + const uint8_t *key, size_t key_len, + size_t tag_len, + enum evp_aead_direction_t dir) { + if (key_len != aead->key_len) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_KEY_SIZE); + ctx->aead = nullptr; + return 0; + } + + ctx->aead = aead; + + int ok; + if (aead->init) { + ok = aead->init(ctx, key, key_len, tag_len); + } else { + ok = aead->init_with_direction(ctx, key, key_len, tag_len, dir); + } + + if (!ok) { + ctx->aead = nullptr; + } + + return ok; +} + 
+void EVP_AEAD_CTX_cleanup(EVP_AEAD_CTX *ctx) { + if (ctx->aead == nullptr) { + return; + } + ctx->aead->cleanup(ctx); + ctx->aead = nullptr; +} + +// check_alias returns 1 if |out| is compatible with |in| and 0 otherwise. If +// |in| and |out| alias, we require that |in| == |out|. +static int check_alias(const uint8_t *in, size_t in_len, const uint8_t *out, + size_t out_len) { + if (!buffers_alias(in, in_len, out, out_len)) { + return 1; + } + + return in == out; +} + +int EVP_AEAD_CTX_seal(const EVP_AEAD_CTX *ctx, uint8_t *out, size_t *out_len, + size_t max_out_len, const uint8_t *nonce, + size_t nonce_len, const uint8_t *in, size_t in_len, + const uint8_t *ad, size_t ad_len) { + bool ok = false; + Cleanup cleanup([&] { + if (!ok) { + // In the event of an error, clear the output buffer so that a caller + // that doesn't check the return value doesn't send raw data. + OPENSSL_memset(out, 0, max_out_len); + *out_len = 0; + } + }); + + if (max_out_len < in_len) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BUFFER_TOO_SMALL); + return 0; + } + + CRYPTO_IOVEC iovec[1]; + iovec[0].in = in; + iovec[0].out = out; + iovec[0].len = in_len; + CRYPTO_IVEC aadvec[1]; + aadvec[0].in = ad; + aadvec[0].len = ad_len; + if (!EVP_AEAD_CTX_sealv(ctx, iovec, 1, out + in_len, out_len, + max_out_len - in_len, nonce, nonce_len, aadvec, 1)) { + *out_len = 0; + return 0; + } + *out_len += in_len; + ok = true; + return 1; +} + +int EVP_AEAD_CTX_seal_scatter(const EVP_AEAD_CTX *ctx, uint8_t *out, + uint8_t *out_tag, size_t *out_tag_len, + size_t max_out_tag_len, const uint8_t *nonce, + size_t nonce_len, const uint8_t *in, + size_t in_len, const uint8_t *extra_in, + size_t extra_in_len, const uint8_t *ad, + size_t ad_len) { + bool ok = false; + Cleanup cleanup([&] { + if (!ok) { + // In the event of an error, clear the output buffer so that a caller + // that doesn't check the return value doesn't send raw data. 
+ OPENSSL_memset(out, 0, in_len); + OPENSSL_memset(out_tag, 0, max_out_tag_len); + *out_tag_len = 0; + } + }); + + // |out_tag| contains both the encryption of |extra_in| and the tag. + Span out_tag_span(out_tag, max_out_tag_len); + if (out_tag_span.size() < extra_in_len) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BUFFER_TOO_SMALL); + return 0; + } + Span extra_out = out_tag_span.first(extra_in_len); + Span tag_only_out = out_tag_span.subspan(extra_in_len); + + CRYPTO_IOVEC iovec[2]; + iovec[0].in = in; + iovec[0].out = out; + iovec[0].len = in_len; + iovec[1].in = extra_in; + iovec[1].out = extra_out.data(); + iovec[1].len = extra_in_len; + CRYPTO_IVEC aadvec[1]; + aadvec[0].in = ad; + aadvec[0].len = ad_len; + if (!EVP_AEAD_CTX_sealv(ctx, iovec, extra_in_len ? 2 : 1, tag_only_out.data(), + out_tag_len, tag_only_out.size(), nonce, nonce_len, + aadvec, 1)) { + *out_tag_len = 0; + return 0; + } + *out_tag_len += extra_in_len; + ok = true; + return 1; +} + +static bool check_iovec_internal_alias(Span iovecs) { + for (size_t i = 0; i < iovecs.size(); ++i) { + // Same index check. + if (!check_alias(iovecs[i].in, iovecs[i].len, iovecs[i].out, + iovecs[i].len)) { + return false; + } +#if !defined(NDEBUG) + // Unrealistic cases; they'd be harmful but also extremely unlikely anyone + // will ever get those wrong. Thus skip them in release builds. 
+ for (size_t j = i + 1; j < iovecs.size(); ++j) { + if (buffers_alias(iovecs[i].in, iovecs[i].len, // + iovecs[j].out, iovecs[j].len) || + buffers_alias(iovecs[i].out, iovecs[i].len, // + iovecs[j].in, iovecs[j].len) || + buffers_alias(iovecs[i].out, iovecs[i].len, // + iovecs[j].out, iovecs[j].len)) { + return false; + } + } +#endif + } + return true; +} + +#if !defined(NDEBUG) +static bool check_ivec_buf_alias(Span ivecs, + const uint8_t *buf, size_t buf_len) { + for (const CRYPTO_IVEC &ivec : ivecs) { + if (buffers_alias(ivec.in, ivec.len, buf, buf_len)) { + return false; + } + } + return true; +} + +static bool check_iovec_out_ivec_alias(Span iovecs, + Span ivecs) { + for (const CRYPTO_IOVEC &iovec : iovecs) { + if (!check_ivec_buf_alias(ivecs, iovec.out, iovec.len)) { + return false; + } + } + return true; +} + +static bool check_iovec_buf_alias(Span iovecs, + const uint8_t *buf, size_t buf_len) { + for (const CRYPTO_IOVEC &iovec : iovecs) { + if (buffers_alias(iovec.in, iovec.len, buf, buf_len)) { + return false; + } + if (buffers_alias(iovec.out, iovec.len, buf, buf_len)) { + return false; + } + } + return true; +} + +static bool check_iovec_out_buf_alias(Span iovecs, + const uint8_t *buf, size_t buf_len) { + for (const CRYPTO_IOVEC &iovec : iovecs) { + if (buffers_alias(iovec.out, iovec.len, buf, buf_len)) { + return false; + } + } + return true; +} +#endif + +static bool check_iovec_alias(Span iovecs, + Span aadvecs, + const uint8_t *out, size_t out_len, + const uint8_t *in1, size_t in1_len, + const uint8_t *in2, size_t in2_len) { + return +#if !defined(NDEBUG) + // Unrealistic cases; they'd be harmful but also extremely unlikely anyone + // will ever get those wrong. Thus skip them in release builds. + // + // iovec.out <-> aadvec. + check_iovec_out_ivec_alias(iovecs, aadvecs) && + // iovec <-> out. + check_iovec_buf_alias(iovecs, out, out_len) && + // iovec.out <-> in1. + check_iovec_out_buf_alias(iovecs, in1, in1_len) && + // iovec.out <-> in2. 
+ check_iovec_out_buf_alias(iovecs, in2, in2_len) && + // aadvec <-> out. + check_ivec_buf_alias(aadvecs, out, out_len) && + // out <-> in1. + !buffers_alias(out, out_len, in1, in1_len) && + // out <-> in2. + !buffers_alias(out, out_len, in2, in2_len) && +#endif + // iovec <-> iovec. + check_iovec_internal_alias(iovecs); +} + +static void clear_iovec(Span iovecs) { + for (const CRYPTO_IOVEC &iovec : iovecs) { + OPENSSL_memset(iovec.out, 0, iovec.len); + } +} + +int EVP_AEAD_CTX_sealv(const EVP_AEAD_CTX *ctx, const CRYPTO_IOVEC *iovec, + size_t num_iovec, uint8_t *out_tag, size_t *out_tag_len, + size_t max_out_tag_len, const uint8_t *nonce, + size_t nonce_len, const CRYPTO_IVEC *aadvec, + size_t num_aadvec) { + Span iovecs(iovec, num_iovec); + Span aadvecs(aadvec, num_aadvec); + + bool ok = false; + Cleanup cleanup([&] { + if (!ok) { + // In the event of an error, clear the output buffer so that a caller + // that doesn't check the return value doesn't send raw data. + clear_iovec(iovecs); + OPENSSL_memset(out_tag, 0, max_out_tag_len); + *out_tag_len = 0; + } + }); + + if (!bssl::iovec::IsValid(iovecs) || !bssl::iovec::IsValid(aadvecs)) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TOO_LARGE); + return 0; + } + + // Enforce aliasing rules: no output may alias any input, with the one + // exception that an iovec member's |in| and |out| pointers may be identical + // for in-place operation. 
+ if (!check_iovec_alias(iovecs, aadvecs, out_tag, max_out_tag_len, nonce, + nonce_len, nullptr, 0)) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_OUTPUT_ALIASES_INPUT); + return 0; + } + + if (!ctx->aead->sealv) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_CTRL_NOT_IMPLEMENTED); + return 0; + } + + if (ctx->aead->sealv(ctx, iovecs, Span(out_tag, max_out_tag_len), out_tag_len, + Span(nonce, nonce_len), aadvecs)) { + ok = true; + return 1; + } + + return 0; +} + +int EVP_AEAD_CTX_open(const EVP_AEAD_CTX *ctx, uint8_t *out, size_t *out_len, + size_t max_out_len, const uint8_t *nonce, + size_t nonce_len, const uint8_t *in, size_t in_len, + const uint8_t *ad, size_t ad_len) { + bool ok = false; + Cleanup cleanup([&] { + if (!ok) { + // In the event of an error, clear the output buffer so that a caller + // that doesn't check the return value doesn't try and process bad + // data. + OPENSSL_memset(out, 0, max_out_len); + *out_len = 0; + } + }); + + if (ctx->tag_len) { + // If the tag length is known, the caller only needs to provide enough + // space for in_len - tag_len. + if (in_len < ctx->tag_len) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); + return 0; + } + size_t plaintext_len = in_len - ctx->tag_len; + if (max_out_len < plaintext_len) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BUFFER_TOO_SMALL); + return 0; + } + + CRYPTO_IOVEC iovec[1]; + iovec[0].in = in; + iovec[0].out = out; + iovec[0].len = plaintext_len; + CRYPTO_IVEC aadvec[1]; + aadvec[0].in = ad; + aadvec[0].len = ad_len; + if (!EVP_AEAD_CTX_openv_detached(ctx, iovec, 1, nonce, nonce_len, + in + plaintext_len, ctx->tag_len, aadvec, + 1)) { + return 0; + } + *out_len = plaintext_len; + ok = true; + return 1; + } + + if (max_out_len < in_len) { + // Variable tag length AEADs need to be able to decrypt the entire + // plaintext before they can split it up. So the caller has to provide + // sufficient max_out_len for temporary data. 
+ OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BUFFER_TOO_SMALL); + return 0; + } + CRYPTO_IOVEC iovec[1]; + iovec[0].in = in; + iovec[0].out = out; + iovec[0].len = in_len; + CRYPTO_IVEC aadvec[1]; + aadvec[0].in = ad; + aadvec[0].len = ad_len; + if (!EVP_AEAD_CTX_openv(ctx, iovec, 1, out_len, nonce, nonce_len, aadvec, + 1)) { + return 0; + } + ok = true; + return 1; +} + +int EVP_AEAD_CTX_open_gather(const EVP_AEAD_CTX *ctx, uint8_t *out, + const uint8_t *nonce, size_t nonce_len, + const uint8_t *in, size_t in_len, + const uint8_t *in_tag, size_t in_tag_len, + const uint8_t *ad, size_t ad_len) { + bool ok = false; + Cleanup cleanup([&] { + if (!ok) { + // In the event of an error, clear the output buffer so that a caller + // that doesn't check the return value doesn't try and process bad + // data. + OPENSSL_memset(out, 0, in_len); + } + }); + + CRYPTO_IOVEC iovec[1]; + iovec[0].in = in; + iovec[0].out = out; + iovec[0].len = in_len; + CRYPTO_IVEC aadvec[1]; + aadvec[0].in = ad; + aadvec[0].len = ad_len; + if (!EVP_AEAD_CTX_openv_detached(ctx, iovec, 1, nonce, nonce_len, in_tag, + in_tag_len, aadvec, 1)) { + return 0; + } + ok = true; + return 1; +} + +int EVP_AEAD_CTX_openv(const EVP_AEAD_CTX *ctx, const CRYPTO_IOVEC *iovec, + size_t num_iovec, size_t *out_total_bytes, + const uint8_t *nonce, size_t nonce_len, + const CRYPTO_IVEC *aadvec, size_t num_aadvec) { + Span iovecs(iovec, num_iovec); + Span aadvecs(aadvec, num_aadvec); + + bool ok = false; + Cleanup cleanup([&] { + if (!ok) { + // In the event of an error, clear the output buffer so that a caller + // that doesn't check the return value doesn't try and process bad + // data. 
+ clear_iovec(iovecs); + *out_total_bytes = 0; + } + }); + + if (!bssl::iovec::IsValid(iovecs) || !bssl::iovec::IsValid(aadvecs)) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TOO_LARGE); + return 0; + } + + // Enforce aliasing rules: no output may alias any input, with the one + // exception that an iovec member's |in| and |out| pointers may be identical + // for in-place operation. + if (!check_iovec_alias(iovecs, aadvecs, nullptr, 0, nonce, nonce_len, nullptr, + 0)) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_OUTPUT_ALIASES_INPUT); + return 0; + } + + if (!ctx->aead->openv) { + if (ctx->tag_len && ctx->aead->openv_detached) { + // Try with a detached tag. + InplaceVector detached_iovecs; + detached_iovecs.CopyFrom(iovecs); + + uint8_t tagbuf[EVP_AEAD_MAX_OVERHEAD]; + std::optional> tag = bssl::iovec::GetAndRemoveSuffix( + Span(tagbuf).first(ctx->tag_len), Span(detached_iovecs)); + + if (!tag.has_value()) { // I.e. no |ctx->tag_len| bytes available. + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); + return 0; + } + + if (ctx->aead->openv_detached(ctx, detached_iovecs, + Span(nonce, nonce_len), *tag, aadvecs)) { + ok = true; + *out_total_bytes = bssl::iovec::TotalLength(Span(detached_iovecs)); + return 1; + } + return 0; + } + + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_CTRL_NOT_IMPLEMENTED); + return 0; + } + + if (ctx->aead->openv(ctx, iovecs, out_total_bytes, Span(nonce, nonce_len), + aadvecs)) { + ok = true; + return 1; + } + + return 0; +} + +int EVP_AEAD_CTX_openv_detached(const EVP_AEAD_CTX *ctx, + const CRYPTO_IOVEC *iovec, size_t num_iovec, + const uint8_t *nonce, size_t nonce_len, + const uint8_t *in_tag, size_t in_tag_len, + const CRYPTO_IVEC *aadvec, size_t num_aadvec) { + Span iovecs(iovec, num_iovec); + Span aadvecs(aadvec, num_aadvec); + + bool ok = false; + Cleanup cleanup([&] { + if (!ok) { + // In the event of an error, clear the output buffer so that a caller + // that doesn't check the return value doesn't try and process bad + // data. 
+ clear_iovec(iovecs); + } + }); + + if (!bssl::iovec::IsValid(iovecs) || !bssl::iovec::IsValid(aadvecs)) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TOO_LARGE); + return 0; + } + if (in_tag_len > EVP_AEAD_MAX_OPEN_OVERHEAD) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_TAG_SIZE); + return 0; + } + + // Enforce aliasing rules: no output may alias any input, with the one + // exception that an iovec member's |in| and |out| pointers may be identical + // for in-place operation. + if (!check_iovec_alias(iovecs, aadvecs, nullptr, 0, nonce, nonce_len, in_tag, + in_tag_len)) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_OUTPUT_ALIASES_INPUT); + return 0; + } + + if (!ctx->aead->openv_detached) { + // AEADs with variable overhead may provide openv instead of openv_detached. + // While one might call openv and then, on success, discard the result if + // the length was wrong, this requires callers to predict the plaintext + // length first. We do not expect callers to do this, especially in the TLS + // CBC construction, where this length is sensitive to the Lucky 13 attack. 
+ OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_CTRL_NOT_IMPLEMENTED); + return 0; + } + + if (ctx->aead->openv_detached(ctx, iovecs, Span(nonce, nonce_len), + Span(in_tag, in_tag_len), aadvecs)) { + ok = true; + return 1; + } + + return 0; +} + +const EVP_AEAD *EVP_AEAD_CTX_aead(const EVP_AEAD_CTX *ctx) { return ctx->aead; } + +int EVP_AEAD_CTX_get_iv(const EVP_AEAD_CTX *ctx, const uint8_t **out_iv, + size_t *out_len) { + if (ctx->aead->get_iv == nullptr) { + OPENSSL_PUT_ERROR(CIPHER, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); + return 0; + } + + return ctx->aead->get_iv(ctx, out_iv, out_len); +} + +int EVP_AEAD_CTX_tag_len(const EVP_AEAD_CTX *ctx, size_t *out_tag_len, + const size_t in_len, const size_t extra_in_len) { + size_t tag_len; + if (ctx->aead->tag_len) { + if (in_len + extra_in_len < in_len) { + OPENSSL_PUT_ERROR(CIPHER, ERR_R_OVERFLOW); + *out_tag_len = 0; + return 0; + } + tag_len = ctx->aead->tag_len(ctx, in_len + extra_in_len); + } else { + tag_len = ctx->tag_len; + } + + if (extra_in_len + tag_len < extra_in_len) { + OPENSSL_PUT_ERROR(CIPHER, ERR_R_OVERFLOW); + *out_tag_len = 0; + return 0; + } + *out_tag_len = extra_in_len + tag_len; + return 1; +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/cipher/cipher.c b/third_party/boringssl/src/crypto/fipsmodule/cipher/cipher.c deleted file mode 100644 index a1269862..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/cipher/cipher.c +++ /dev/null @@ -1,670 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. 
The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include -#include -#include - -#include -#include -#include - -#include "internal.h" -#include "../service_indicator/internal.h" -#include "../../internal.h" - - -void EVP_CIPHER_CTX_init(EVP_CIPHER_CTX *ctx) { - OPENSSL_memset(ctx, 0, sizeof(EVP_CIPHER_CTX)); -} - -EVP_CIPHER_CTX *EVP_CIPHER_CTX_new(void) { - EVP_CIPHER_CTX *ctx = OPENSSL_malloc(sizeof(EVP_CIPHER_CTX)); - if (ctx) { - EVP_CIPHER_CTX_init(ctx); - } - return ctx; -} - -int EVP_CIPHER_CTX_cleanup(EVP_CIPHER_CTX *c) { - if (c->cipher != NULL && c->cipher->cleanup) { - c->cipher->cleanup(c); - } - OPENSSL_free(c->cipher_data); - - OPENSSL_memset(c, 0, sizeof(EVP_CIPHER_CTX)); - return 1; -} - -void EVP_CIPHER_CTX_free(EVP_CIPHER_CTX *ctx) { - if (ctx) { - EVP_CIPHER_CTX_cleanup(ctx); - OPENSSL_free(ctx); - } -} - -int EVP_CIPHER_CTX_copy(EVP_CIPHER_CTX *out, const EVP_CIPHER_CTX *in) { - if (in == NULL || in->cipher == NULL) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INPUT_NOT_INITIALIZED); - return 0; - } - - EVP_CIPHER_CTX_cleanup(out); - OPENSSL_memcpy(out, in, sizeof(EVP_CIPHER_CTX)); - - if (in->cipher_data && in->cipher->ctx_size) { - out->cipher_data = OPENSSL_malloc(in->cipher->ctx_size); - if 
(!out->cipher_data) { - out->cipher = NULL; - OPENSSL_PUT_ERROR(CIPHER, ERR_R_MALLOC_FAILURE); - return 0; - } - OPENSSL_memcpy(out->cipher_data, in->cipher_data, in->cipher->ctx_size); - } - - if (in->cipher->flags & EVP_CIPH_CUSTOM_COPY) { - if (!in->cipher->ctrl((EVP_CIPHER_CTX *)in, EVP_CTRL_COPY, 0, out)) { - out->cipher = NULL; - return 0; - } - } - - return 1; -} - -int EVP_CIPHER_CTX_reset(EVP_CIPHER_CTX *ctx) { - EVP_CIPHER_CTX_cleanup(ctx); - EVP_CIPHER_CTX_init(ctx); - return 1; -} - -int EVP_CipherInit_ex(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *cipher, - ENGINE *engine, const uint8_t *key, const uint8_t *iv, - int enc) { - if (enc == -1) { - enc = ctx->encrypt; - } else { - if (enc) { - enc = 1; - } - ctx->encrypt = enc; - } - - if (cipher) { - // Ensure a context left from last time is cleared (the previous check - // attempted to avoid this if the same ENGINE and EVP_CIPHER could be - // used). - if (ctx->cipher) { - EVP_CIPHER_CTX_cleanup(ctx); - // Restore encrypt and flags - ctx->encrypt = enc; - } - - ctx->cipher = cipher; - if (ctx->cipher->ctx_size) { - ctx->cipher_data = OPENSSL_malloc(ctx->cipher->ctx_size); - if (!ctx->cipher_data) { - ctx->cipher = NULL; - OPENSSL_PUT_ERROR(CIPHER, ERR_R_MALLOC_FAILURE); - return 0; - } - } else { - ctx->cipher_data = NULL; - } - - ctx->key_len = cipher->key_len; - ctx->flags = 0; - - if (ctx->cipher->flags & EVP_CIPH_CTRL_INIT) { - if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_INIT, 0, NULL)) { - ctx->cipher = NULL; - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INITIALIZATION_ERROR); - return 0; - } - } - } else if (!ctx->cipher) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_NO_CIPHER_SET); - return 0; - } - - // we assume block size is a power of 2 in *cryptUpdate - assert(ctx->cipher->block_size == 1 || ctx->cipher->block_size == 8 || - ctx->cipher->block_size == 16); - - if (!(EVP_CIPHER_CTX_flags(ctx) & EVP_CIPH_CUSTOM_IV)) { - switch (EVP_CIPHER_CTX_mode(ctx)) { - case EVP_CIPH_STREAM_CIPHER: - case EVP_CIPH_ECB_MODE: - 
break; - - case EVP_CIPH_CFB_MODE: - ctx->num = 0; - OPENSSL_FALLTHROUGH; - - case EVP_CIPH_CBC_MODE: - assert(EVP_CIPHER_CTX_iv_length(ctx) <= sizeof(ctx->iv)); - if (iv) { - OPENSSL_memcpy(ctx->oiv, iv, EVP_CIPHER_CTX_iv_length(ctx)); - } - OPENSSL_memcpy(ctx->iv, ctx->oiv, EVP_CIPHER_CTX_iv_length(ctx)); - break; - - case EVP_CIPH_CTR_MODE: - case EVP_CIPH_OFB_MODE: - ctx->num = 0; - // Don't reuse IV for CTR mode - if (iv) { - OPENSSL_memcpy(ctx->iv, iv, EVP_CIPHER_CTX_iv_length(ctx)); - } - break; - - default: - return 0; - } - } - - if (key || (ctx->cipher->flags & EVP_CIPH_ALWAYS_CALL_INIT)) { - if (!ctx->cipher->init(ctx, key, iv, enc)) { - return 0; - } - } - - ctx->buf_len = 0; - ctx->final_used = 0; - return 1; -} - -int EVP_EncryptInit_ex(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *cipher, - ENGINE *impl, const uint8_t *key, const uint8_t *iv) { - return EVP_CipherInit_ex(ctx, cipher, impl, key, iv, 1); -} - -int EVP_DecryptInit_ex(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *cipher, - ENGINE *impl, const uint8_t *key, const uint8_t *iv) { - return EVP_CipherInit_ex(ctx, cipher, impl, key, iv, 0); -} - -// block_remainder returns the number of bytes to remove from |len| to get a -// multiple of |ctx|'s block size. -static int block_remainder(const EVP_CIPHER_CTX *ctx, int len) { - // |block_size| must be a power of two. - assert(ctx->cipher->block_size != 0); - assert((ctx->cipher->block_size & (ctx->cipher->block_size - 1)) == 0); - return len & (ctx->cipher->block_size - 1); -} - -int EVP_EncryptUpdate(EVP_CIPHER_CTX *ctx, uint8_t *out, int *out_len, - const uint8_t *in, int in_len) { - // Ciphers that use blocks may write up to |bl| extra bytes. Ensure the output - // does not overflow |*out_len|. 
- int bl = ctx->cipher->block_size; - if (bl > 1 && in_len > INT_MAX - bl) { - OPENSSL_PUT_ERROR(CIPHER, ERR_R_OVERFLOW); - return 0; - } - - if (ctx->cipher->flags & EVP_CIPH_FLAG_CUSTOM_CIPHER) { - int ret = ctx->cipher->cipher(ctx, out, in, in_len); - if (ret < 0) { - return 0; - } else { - *out_len = ret; - } - return 1; - } - - if (in_len <= 0) { - *out_len = 0; - return in_len == 0; - } - - if (ctx->buf_len == 0 && block_remainder(ctx, in_len) == 0) { - if (ctx->cipher->cipher(ctx, out, in, in_len)) { - *out_len = in_len; - return 1; - } else { - *out_len = 0; - return 0; - } - } - - int i = ctx->buf_len; - assert(bl <= (int)sizeof(ctx->buf)); - if (i != 0) { - if (bl - i > in_len) { - OPENSSL_memcpy(&ctx->buf[i], in, in_len); - ctx->buf_len += in_len; - *out_len = 0; - return 1; - } else { - int j = bl - i; - OPENSSL_memcpy(&ctx->buf[i], in, j); - if (!ctx->cipher->cipher(ctx, out, ctx->buf, bl)) { - return 0; - } - in_len -= j; - in += j; - out += bl; - *out_len = bl; - } - } else { - *out_len = 0; - } - - i = block_remainder(ctx, in_len); - in_len -= i; - if (in_len > 0) { - if (!ctx->cipher->cipher(ctx, out, in, in_len)) { - return 0; - } - *out_len += in_len; - } - - if (i != 0) { - OPENSSL_memcpy(ctx->buf, &in[in_len], i); - } - ctx->buf_len = i; - return 1; -} - -int EVP_EncryptFinal_ex(EVP_CIPHER_CTX *ctx, uint8_t *out, int *out_len) { - int n; - unsigned int i, b, bl; - - if (ctx->cipher->flags & EVP_CIPH_FLAG_CUSTOM_CIPHER) { - // When EVP_CIPH_FLAG_CUSTOM_CIPHER is set, the return value of |cipher| is - // the number of bytes written, or -1 on error. Otherwise the return value - // is one on success and zero on error. 
- const int num_bytes = ctx->cipher->cipher(ctx, out, NULL, 0); - if (num_bytes < 0) { - return 0; - } - *out_len = num_bytes; - goto out; - } - - b = ctx->cipher->block_size; - assert(b <= sizeof(ctx->buf)); - if (b == 1) { - *out_len = 0; - goto out; - } - - bl = ctx->buf_len; - if (ctx->flags & EVP_CIPH_NO_PADDING) { - if (bl) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_DATA_NOT_MULTIPLE_OF_BLOCK_LENGTH); - return 0; - } - *out_len = 0; - goto out; - } - - n = b - bl; - for (i = bl; i < b; i++) { - ctx->buf[i] = n; - } - if (!ctx->cipher->cipher(ctx, out, ctx->buf, b)) { - return 0; - } - *out_len = b; - -out: - EVP_Cipher_verify_service_indicator(ctx); - return 1; -} - -int EVP_DecryptUpdate(EVP_CIPHER_CTX *ctx, uint8_t *out, int *out_len, - const uint8_t *in, int in_len) { - // Ciphers that use blocks may write up to |bl| extra bytes. Ensure the output - // does not overflow |*out_len|. - unsigned int b = ctx->cipher->block_size; - if (b > 1 && in_len > INT_MAX - (int)b) { - OPENSSL_PUT_ERROR(CIPHER, ERR_R_OVERFLOW); - return 0; - } - - if (ctx->cipher->flags & EVP_CIPH_FLAG_CUSTOM_CIPHER) { - int r = ctx->cipher->cipher(ctx, out, in, in_len); - if (r < 0) { - *out_len = 0; - return 0; - } else { - *out_len = r; - } - return 1; - } - - if (in_len <= 0) { - *out_len = 0; - return in_len == 0; - } - - if (ctx->flags & EVP_CIPH_NO_PADDING) { - return EVP_EncryptUpdate(ctx, out, out_len, in, in_len); - } - - assert(b <= sizeof(ctx->final)); - int fix_len = 0; - if (ctx->final_used) { - OPENSSL_memcpy(out, ctx->final, b); - out += b; - fix_len = 1; - } - - if (!EVP_EncryptUpdate(ctx, out, out_len, in, in_len)) { - return 0; - } - - // if we have 'decrypted' a multiple of block size, make sure - // we have a copy of this last block - if (b > 1 && !ctx->buf_len) { - *out_len -= b; - ctx->final_used = 1; - OPENSSL_memcpy(ctx->final, &out[*out_len], b); - } else { - ctx->final_used = 0; - } - - if (fix_len) { - *out_len += b; - } - - return 1; -} - -int 
EVP_DecryptFinal_ex(EVP_CIPHER_CTX *ctx, unsigned char *out, int *out_len) { - int i, n; - unsigned int b; - *out_len = 0; - - if (ctx->cipher->flags & EVP_CIPH_FLAG_CUSTOM_CIPHER) { - i = ctx->cipher->cipher(ctx, out, NULL, 0); - if (i < 0) { - return 0; - } else { - *out_len = i; - } - goto out; - } - - b = ctx->cipher->block_size; - if (ctx->flags & EVP_CIPH_NO_PADDING) { - if (ctx->buf_len) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_DATA_NOT_MULTIPLE_OF_BLOCK_LENGTH); - return 0; - } - *out_len = 0; - goto out; - } - - if (b > 1) { - if (ctx->buf_len || !ctx->final_used) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_WRONG_FINAL_BLOCK_LENGTH); - return 0; - } - assert(b <= sizeof(ctx->final)); - - // The following assumes that the ciphertext has been authenticated. - // Otherwise it provides a padding oracle. - n = ctx->final[b - 1]; - if (n == 0 || n > (int)b) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); - return 0; - } - - for (i = 0; i < n; i++) { - if (ctx->final[--b] != n) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); - return 0; - } - } - - n = ctx->cipher->block_size - n; - for (i = 0; i < n; i++) { - out[i] = ctx->final[i]; - } - *out_len = n; - } else { - *out_len = 0; - } - -out: - EVP_Cipher_verify_service_indicator(ctx); - return 1; -} - -int EVP_Cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, const uint8_t *in, - size_t in_len) { - const int ret = ctx->cipher->cipher(ctx, out, in, in_len); - - // |EVP_CIPH_FLAG_CUSTOM_CIPHER| never sets the FIPS indicator via - // |EVP_Cipher| because it's complicated whether the operation has completed - // or not. E.g. AES-GCM with a non-NULL |in| argument hasn't completed an - // operation. Callers should use the |EVP_AEAD| API or, at least, - // |EVP_CipherUpdate| etc. - // - // This call can't be pushed into |EVP_Cipher_verify_service_indicator| - // because whether |ret| indicates success or not depends on whether - // |EVP_CIPH_FLAG_CUSTOM_CIPHER| is set. (This unreasonable, but matches - // OpenSSL.) 
- if (!(ctx->cipher->flags & EVP_CIPH_FLAG_CUSTOM_CIPHER) && ret) { - EVP_Cipher_verify_service_indicator(ctx); - } - - return ret; -} - -int EVP_CipherUpdate(EVP_CIPHER_CTX *ctx, uint8_t *out, int *out_len, - const uint8_t *in, int in_len) { - if (ctx->encrypt) { - return EVP_EncryptUpdate(ctx, out, out_len, in, in_len); - } else { - return EVP_DecryptUpdate(ctx, out, out_len, in, in_len); - } -} - -int EVP_CipherFinal_ex(EVP_CIPHER_CTX *ctx, uint8_t *out, int *out_len) { - if (ctx->encrypt) { - return EVP_EncryptFinal_ex(ctx, out, out_len); - } else { - return EVP_DecryptFinal_ex(ctx, out, out_len); - } -} - -const EVP_CIPHER *EVP_CIPHER_CTX_cipher(const EVP_CIPHER_CTX *ctx) { - return ctx->cipher; -} - -int EVP_CIPHER_CTX_nid(const EVP_CIPHER_CTX *ctx) { - return ctx->cipher->nid; -} - -int EVP_CIPHER_CTX_encrypting(const EVP_CIPHER_CTX *ctx) { - return ctx->encrypt; -} - -unsigned EVP_CIPHER_CTX_block_size(const EVP_CIPHER_CTX *ctx) { - return ctx->cipher->block_size; -} - -unsigned EVP_CIPHER_CTX_key_length(const EVP_CIPHER_CTX *ctx) { - return ctx->key_len; -} - -unsigned EVP_CIPHER_CTX_iv_length(const EVP_CIPHER_CTX *ctx) { - return ctx->cipher->iv_len; -} - -void *EVP_CIPHER_CTX_get_app_data(const EVP_CIPHER_CTX *ctx) { - return ctx->app_data; -} - -void EVP_CIPHER_CTX_set_app_data(EVP_CIPHER_CTX *ctx, void *data) { - ctx->app_data = data; -} - -uint32_t EVP_CIPHER_CTX_flags(const EVP_CIPHER_CTX *ctx) { - return ctx->cipher->flags & ~EVP_CIPH_MODE_MASK; -} - -uint32_t EVP_CIPHER_CTX_mode(const EVP_CIPHER_CTX *ctx) { - return ctx->cipher->flags & EVP_CIPH_MODE_MASK; -} - -int EVP_CIPHER_CTX_ctrl(EVP_CIPHER_CTX *ctx, int command, int arg, void *ptr) { - int ret; - if (!ctx->cipher) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_NO_CIPHER_SET); - return 0; - } - - if (!ctx->cipher->ctrl) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_CTRL_NOT_IMPLEMENTED); - return 0; - } - - ret = ctx->cipher->ctrl(ctx, command, arg, ptr); - if (ret == -1) { - OPENSSL_PUT_ERROR(CIPHER, 
CIPHER_R_CTRL_OPERATION_NOT_IMPLEMENTED); - return 0; - } - - return ret; -} - -int EVP_CIPHER_CTX_set_padding(EVP_CIPHER_CTX *ctx, int pad) { - if (pad) { - ctx->flags &= ~EVP_CIPH_NO_PADDING; - } else { - ctx->flags |= EVP_CIPH_NO_PADDING; - } - return 1; -} - -int EVP_CIPHER_CTX_set_key_length(EVP_CIPHER_CTX *c, unsigned key_len) { - if (c->key_len == key_len) { - return 1; - } - - if (key_len == 0 || !(c->cipher->flags & EVP_CIPH_VARIABLE_LENGTH)) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_KEY_LENGTH); - return 0; - } - - c->key_len = key_len; - return 1; -} - -int EVP_CIPHER_nid(const EVP_CIPHER *cipher) { return cipher->nid; } - -unsigned EVP_CIPHER_block_size(const EVP_CIPHER *cipher) { - return cipher->block_size; -} - -unsigned EVP_CIPHER_key_length(const EVP_CIPHER *cipher) { - return cipher->key_len; -} - -unsigned EVP_CIPHER_iv_length(const EVP_CIPHER *cipher) { - return cipher->iv_len; -} - -uint32_t EVP_CIPHER_flags(const EVP_CIPHER *cipher) { - return cipher->flags & ~EVP_CIPH_MODE_MASK; -} - -uint32_t EVP_CIPHER_mode(const EVP_CIPHER *cipher) { - return cipher->flags & EVP_CIPH_MODE_MASK; -} - -int EVP_CipherInit(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *cipher, - const uint8_t *key, const uint8_t *iv, int enc) { - if (cipher) { - EVP_CIPHER_CTX_init(ctx); - } - return EVP_CipherInit_ex(ctx, cipher, NULL, key, iv, enc); -} - -int EVP_EncryptInit(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *cipher, - const uint8_t *key, const uint8_t *iv) { - return EVP_CipherInit(ctx, cipher, key, iv, 1); -} - -int EVP_DecryptInit(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *cipher, - const uint8_t *key, const uint8_t *iv) { - return EVP_CipherInit(ctx, cipher, key, iv, 0); -} - -int EVP_CipherFinal(EVP_CIPHER_CTX *ctx, uint8_t *out, int *out_len) { - return EVP_CipherFinal_ex(ctx, out, out_len); -} - -int EVP_EncryptFinal(EVP_CIPHER_CTX *ctx, uint8_t *out, int *out_len) { - return EVP_EncryptFinal_ex(ctx, out, out_len); -} - -int EVP_DecryptFinal(EVP_CIPHER_CTX *ctx, uint8_t 
*out, int *out_len) { - return EVP_DecryptFinal_ex(ctx, out, out_len); -} - -int EVP_add_cipher_alias(const char *a, const char *b) { - return 1; -} - -void EVP_CIPHER_CTX_set_flags(const EVP_CIPHER_CTX *ctx, uint32_t flags) {} diff --git a/third_party/boringssl/src/crypto/fipsmodule/cipher/cipher.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/cipher/cipher.cc.inc new file mode 100644 index 00000000..eed5d0ad --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/cipher/cipher.cc.inc @@ -0,0 +1,829 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include "../../internal.h" +#include "../../mem_internal.h" +#include "../service_indicator/internal.h" +#include "internal.h" + + +using namespace bssl; + +void EVP_CIPHER_CTX_init(EVP_CIPHER_CTX *ctx) { + OPENSSL_memset(ctx, 0, sizeof(EVP_CIPHER_CTX)); +} + +EVP_CIPHER_CTX *EVP_CIPHER_CTX_new() { + EVP_CIPHER_CTX *ctx = New(); + if (ctx) { + EVP_CIPHER_CTX_init(ctx); + } + return ctx; +} + +int EVP_CIPHER_CTX_cleanup(EVP_CIPHER_CTX *c) { + if (c->cipher != nullptr && c->cipher->cleanup) { + c->cipher->cleanup(c); + } + OPENSSL_free(c->cipher_data); + + OPENSSL_memset(c, 0, sizeof(EVP_CIPHER_CTX)); + return 1; +} + +void EVP_CIPHER_CTX_free(EVP_CIPHER_CTX *ctx) { + if (ctx) { + EVP_CIPHER_CTX_cleanup(ctx); + Delete(ctx); + } +} + +int EVP_CIPHER_CTX_copy(EVP_CIPHER_CTX *out, const EVP_CIPHER_CTX *in) { + if (in == nullptr || in->cipher == nullptr) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INPUT_NOT_INITIALIZED); + return 0; + } + + if (in->poisoned) { + OPENSSL_PUT_ERROR(CIPHER, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); + return 0; + } + + EVP_CIPHER_CTX_cleanup(out); + OPENSSL_memcpy(out, in, sizeof(EVP_CIPHER_CTX)); + + if (in->cipher_data && in->cipher->ctx_size) { + out->cipher_data = OPENSSL_memdup(in->cipher_data, in->cipher->ctx_size); + if (!out->cipher_data) { + out->cipher = nullptr; + return 0; + } + } + + if (in->cipher->flags & EVP_CIPH_CUSTOM_COPY) { + if (!in->cipher->ctrl((EVP_CIPHER_CTX *)in, EVP_CTRL_COPY, 0, out)) { + out->cipher = nullptr; + return 0; + } + } + + return 1; +} + +int EVP_CIPHER_CTX_reset(EVP_CIPHER_CTX *ctx) { + EVP_CIPHER_CTX_cleanup(ctx); + EVP_CIPHER_CTX_init(ctx); + return 1; +} + +int EVP_CipherInit_ex(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *cipher, + ENGINE *engine, const uint8_t *key, const uint8_t *iv, + int enc) { + if (enc == -1) { + enc = ctx->encrypt; + } else { + if (enc) { + enc = 1; + } + ctx->encrypt = enc; + } + + if (cipher) { + // 
Ensure a context left from last time is cleared (the previous check + // attempted to avoid this if the same ENGINE and EVP_CIPHER could be + // used). + if (ctx->cipher) { + EVP_CIPHER_CTX_cleanup(ctx); + // Restore encrypt and flags + ctx->encrypt = enc; + } + + ctx->cipher = cipher; + if (ctx->cipher->ctx_size) { + ctx->cipher_data = OPENSSL_malloc(ctx->cipher->ctx_size); + if (!ctx->cipher_data) { + ctx->cipher = nullptr; + return 0; + } + } else { + ctx->cipher_data = nullptr; + } + + ctx->key_len = cipher->key_len; + ctx->flags = 0; + + if (ctx->cipher->flags & EVP_CIPH_CTRL_INIT) { + if (!EVP_CIPHER_CTX_ctrl(ctx, EVP_CTRL_INIT, 0, nullptr)) { + ctx->cipher = nullptr; + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INITIALIZATION_ERROR); + return 0; + } + } + } else if (!ctx->cipher) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_NO_CIPHER_SET); + return 0; + } + + // we assume block size is a power of 2 in *cryptUpdate + assert(ctx->cipher->block_size == 1 || ctx->cipher->block_size == 8 || + ctx->cipher->block_size == 16); + + if (!(EVP_CIPHER_CTX_flags(ctx) & EVP_CIPH_CUSTOM_IV)) { + switch (EVP_CIPHER_CTX_mode(ctx)) { + case EVP_CIPH_STREAM_CIPHER: + case EVP_CIPH_ECB_MODE: + break; + + case EVP_CIPH_CFB_MODE: + ctx->num = 0; + [[fallthrough]]; + + case EVP_CIPH_CBC_MODE: + assert(EVP_CIPHER_CTX_iv_length(ctx) <= sizeof(ctx->iv)); + if (iv) { + OPENSSL_memcpy(ctx->oiv, iv, EVP_CIPHER_CTX_iv_length(ctx)); + } + OPENSSL_memcpy(ctx->iv, ctx->oiv, EVP_CIPHER_CTX_iv_length(ctx)); + break; + + case EVP_CIPH_CTR_MODE: + case EVP_CIPH_OFB_MODE: + ctx->num = 0; + // Don't reuse IV for CTR mode + if (iv) { + OPENSSL_memcpy(ctx->iv, iv, EVP_CIPHER_CTX_iv_length(ctx)); + } + break; + + default: + return 0; + } + } + + if (key || (ctx->cipher->flags & EVP_CIPH_ALWAYS_CALL_INIT)) { + if (!ctx->cipher->init(ctx, key, iv, enc)) { + return 0; + } + } + + ctx->buf_len = 0; + ctx->final_used = 0; + // Clear the poisoned flag to permit reuse of a CTX that previously had a + // failed 
operation. + ctx->poisoned = 0; + return 1; +} + +int EVP_EncryptInit_ex(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *cipher, + ENGINE *impl, const uint8_t *key, const uint8_t *iv) { + return EVP_CipherInit_ex(ctx, cipher, impl, key, iv, 1); +} + +int EVP_DecryptInit_ex(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *cipher, + ENGINE *impl, const uint8_t *key, const uint8_t *iv) { + return EVP_CipherInit_ex(ctx, cipher, impl, key, iv, 0); +} + +// block_remainder returns the number of bytes to remove from |len| to get a +// multiple of |ctx|'s block size. +static size_t block_remainder(const EVP_CIPHER_CTX *ctx, size_t len) { + // |block_size| must be a power of two. + assert(ctx->cipher->block_size != 0); + assert((ctx->cipher->block_size & (ctx->cipher->block_size - 1)) == 0); + return len & (ctx->cipher->block_size - 1); +} + +int EVP_EncryptUpdate(EVP_CIPHER_CTX *ctx, uint8_t *out, int *out_len, + const uint8_t *in, int in_len) { + *out_len = 0; + if (in_len < 0) { + OPENSSL_PUT_ERROR(CIPHER, ERR_R_OVERFLOW); + return 0; + } + size_t in_len_sz = static_cast(in_len); + size_t out_len_sz; + if ((ctx->cipher->flags & EVP_CIPH_FLAG_CUSTOM_CIPHER) && out == nullptr) { + if (!EVP_CipherUpdateAAD(ctx, in, in_len_sz)) { + return 0; + } + out_len_sz = in_len_sz; // Even though no output was written! + } else { + // in_len_sz is < INT_MAX which is no more than half of SIZE_MAX. + size_t max_out_len = + std::min(in_len_sz + ctx->cipher->block_size - 1, size_t{INT_MAX}); + if (!EVP_EncryptUpdate_ex(ctx, out, &out_len_sz, max_out_len, in, + in_len_sz)) { + return 0; + } + } + *out_len = static_cast(out_len_sz); + return 1; +} + +template +static int WrapWithPoison(EVP_CIPHER_CTX *ctx, F f) { + if (ctx->poisoned) { + // |ctx| has been left in an indeterminate state by a previous failed + // operation. Do not allow proceeding. 
+ OPENSSL_PUT_ERROR(CIPHER, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); + return 0; + } + if (!f()) { + // Functions using |WrapWithPoison| may leave |ctx| in an indeterminate + // state. Mark the object as poisoned. + ctx->poisoned = 1; + return 0; + } + return 1; +} + +static int EVP_EncryptUpdate_ex_internal(EVP_CIPHER_CTX *ctx, uint8_t *out, + size_t *out_len, size_t max_out_len, + const uint8_t *in, size_t in_len) { + *out_len = 0; + + // Ciphers that use blocks may write up to |block_size| extra bytes. Ensure + // the output does not overflow |*out_len|. + Span in_span(in, in_len); + size_t block_size = ctx->cipher->block_size; + + if (in_span.empty()) { + return 1; + } + + size_t buf_len = ctx->buf_len; + assert(block_size <= sizeof(ctx->buf)); + Span out_span(out, max_out_len); + if (buf_len != 0) { + if (block_size - buf_len > in_span.size()) { + CopyToPrefix(in_span, Span(ctx->buf).subspan(buf_len)); + ctx->buf_len += in_span.size(); + return 1; + } else { + size_t j = block_size - buf_len; + CopyToPrefix(in_span.first(j), Span(ctx->buf).subspan(buf_len)); + if (out_span.size() < block_size) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BUFFER_TOO_SMALL); + return 0; + } + if (!ctx->cipher->cipher_update(ctx, out_span.data(), ctx->buf, + block_size)) { + return 0; + } + in_span = in_span.subspan(j); + out_span = out_span.subspan(block_size); + } + } + + size_t whole_blocks = in_span.size() - block_remainder(ctx, in_span.size()); + if (whole_blocks > 0) { + if (out_span.size() < whole_blocks) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BUFFER_TOO_SMALL); + return 0; + } + if (!ctx->cipher->cipher_update(ctx, out_span.data(), in_span.data(), + whole_blocks)) { + return 0; + } + in_span = in_span.subspan(whole_blocks); + out_span = out_span.subspan(whole_blocks); + } + + assert(in_span.size() < block_size); + CopyToPrefix(in_span, ctx->buf); + ctx->buf_len = in_span.size(); + + *out_len = max_out_len - out_span.size(); + return 1; +} + +int EVP_EncryptUpdate_ex(EVP_CIPHER_CTX 
*ctx, uint8_t *out, size_t *out_len, + size_t max_out_len, const uint8_t *in, size_t in_len) { + *out_len = 0; + return WrapWithPoison(ctx, [&] { + return EVP_EncryptUpdate_ex_internal(ctx, out, out_len, max_out_len, in, + in_len); + }); +} + +int EVP_EncryptFinal_ex(EVP_CIPHER_CTX *ctx, uint8_t *out, int *out_len) { + size_t out_len_sz; + int ret = + EVP_EncryptFinal_ex2(ctx, out, &out_len_sz, ctx->cipher->block_size); + static_assert(EVP_MAX_BLOCK_LENGTH <= INT_MAX); + *out_len = static_cast(out_len_sz); + return ret; +} + +static int EVP_EncryptFinal_ex2_internal(EVP_CIPHER_CTX *ctx, uint8_t *out, + size_t *out_len, size_t max_out_len) { + *out_len = 0; + + size_t block_size = ctx->cipher->block_size; + assert(block_size <= sizeof(ctx->buf)); + if (block_size == 1) { + if (ctx->cipher->flags & EVP_CIPH_FLAG_CUSTOM_CIPHER) { + return ctx->cipher->cipher_final(ctx); + } + return 1; + } + + size_t buf_len = ctx->buf_len; + if (ctx->flags & EVP_CIPH_NO_PADDING) { + if (buf_len) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_DATA_NOT_MULTIPLE_OF_BLOCK_LENGTH); + return 0; + } + return 1; + } + + size_t padding = block_size - buf_len; + for (size_t i = buf_len; i < block_size; i++) { + ctx->buf[i] = padding; + } + + Span out_span(out, max_out_len); + if (out_span.size() < block_size) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BUFFER_TOO_SMALL); + return 0; + } + if (!ctx->cipher->cipher_update(ctx, out_span.data(), ctx->buf, block_size)) { + return 0; + } + out_span = out_span.subspan(block_size); + + *out_len = max_out_len - out_span.size(); + return 1; +} + +int EVP_EncryptFinal_ex2(EVP_CIPHER_CTX *ctx, uint8_t *out, size_t *out_len, + size_t max_out_len) { + *out_len = 0; + return WrapWithPoison(ctx, [&] { + if (!EVP_EncryptFinal_ex2_internal(ctx, out, out_len, max_out_len)) { + return 0; + } + EVP_Cipher_verify_service_indicator(ctx); + return 1; + }); +} + +int EVP_DecryptUpdate(EVP_CIPHER_CTX *ctx, uint8_t *out, int *out_len, + const uint8_t *in, int in_len) { + *out_len = 
0; + if (in_len < 0) { + OPENSSL_PUT_ERROR(CIPHER, ERR_R_OVERFLOW); + return 0; + } + size_t in_len_sz = static_cast(in_len); + size_t out_len_sz; + if ((ctx->cipher->flags & EVP_CIPH_FLAG_CUSTOM_CIPHER) && out == nullptr) { + if (!EVP_CipherUpdateAAD(ctx, in, in_len_sz)) { + return 0; + } + out_len_sz = in_len_sz; + } else { + // in_len_sz is < INT_MAX which is no more than half of SIZE_MAX. + size_t max_out_len = + std::min(in_len_sz + ctx->cipher->block_size - 1, size_t{INT_MAX}); + if (!EVP_DecryptUpdate_ex(ctx, out, &out_len_sz, max_out_len, in, + in_len_sz)) { + return 0; + } + } + *out_len = static_cast(out_len_sz); + return 1; +} + +static int EVP_DecryptUpdate_ex_internal(EVP_CIPHER_CTX *ctx, uint8_t *out, + size_t *out_len, size_t max_out_len, + const uint8_t *in, size_t in_len) { + *out_len = 0; + + // Ciphers that use blocks may write up to |block_size| extra bytes. Ensure + // the output does not overflow |*out_len|. + Span in_span(in, in_len); + size_t block_size = ctx->cipher->block_size; + + if (in_span.empty()) { + return 1; + } + + Span out_span(out, max_out_len); + if (ctx->flags & EVP_CIPH_NO_PADDING) { + // Use the shared block handling logic from encryption. + return EVP_EncryptUpdate_ex_internal(ctx, out_span.data(), out_len, + out_span.size(), in_span.data(), + in_span.size()); + } + + assert(block_size <= sizeof(ctx->final)); + if (ctx->final_used) { + if (out_span.size() < block_size) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BUFFER_TOO_SMALL); + return 0; + } + CopyToPrefix(Span(ctx->final).first(block_size), out_span); + ctx->final_used = 0; + out_span = out_span.subspan(block_size); + } + + // Use the shared block handling logic from encryption. + if (block_size > 1 && + block_remainder(ctx, ctx->buf_len + in_span.size()) == 0) { + // Decryption would end on a block boundary. In this case, although we + // can decrypt up to the block boundary, we cannot output the final + // plaintext block yet. 
It may be the final block, with padding to + // remove. + // + // Instead, output all but the final block's decryption, then decrypt the + // final block into ctx->final, to be processed later. + + // NOTE: Not _really_ necessary, but let's try aligning the second + // EVP_EncryptUpdate_ex call to a block boundary to mess with the buffer + // less. + size_t head = in_span.size() > block_size ? in_span.size() - block_size : 0; + size_t head_out_len; + if (!EVP_EncryptUpdate_ex_internal(ctx, out_span.data(), &head_out_len, + out_span.size(), in_span.data(), head)) { + return 0; + } + in_span = in_span.subspan(head); + out_span = out_span.subspan(head_out_len); + size_t final_size; + if (!EVP_EncryptUpdate_ex_internal(ctx, ctx->final, &final_size, + sizeof(ctx->final), in_span.data(), + in_span.size())) { + return 0; + } + ctx->final_used = 1; + assert(final_size == block_size); + assert(ctx->buf_len == 0); + } else { + // Buffer will be non-empty. + size_t written_out_len; + if (!EVP_EncryptUpdate_ex_internal(ctx, out_span.data(), &written_out_len, + out_span.size(), in_span.data(), + in_span.size())) { + return 0; + } + assert(block_size == 1 || ctx->buf_len != 0); + out_span = out_span.subspan(written_out_len); + } + + *out_len = max_out_len - out_span.size(); + return 1; +} + +int EVP_DecryptUpdate_ex(EVP_CIPHER_CTX *ctx, uint8_t *out, size_t *out_len, + size_t max_out_len, const uint8_t *in, size_t in_len) { + return WrapWithPoison(ctx, [&] { + return EVP_DecryptUpdate_ex_internal(ctx, out, out_len, max_out_len, in, + in_len); + }); +} + +int EVP_DecryptFinal_ex(EVP_CIPHER_CTX *ctx, uint8_t *out, int *out_len) { + size_t out_len_sz; + int ret = + EVP_DecryptFinal_ex2(ctx, out, &out_len_sz, ctx->cipher->block_size); + static_assert(EVP_MAX_BLOCK_LENGTH <= INT_MAX); + *out_len = static_cast(out_len_sz); + return ret; +} + +static int EVP_DecryptFinal_ex2_internal(EVP_CIPHER_CTX *ctx, + unsigned char *out, size_t *out_len, + size_t max_out_len) { + *out_len = 0; + + 
size_t block_size = ctx->cipher->block_size; + assert(block_size <= sizeof(ctx->buf)); + if (block_size == 1) { + if (ctx->cipher->flags & EVP_CIPH_FLAG_CUSTOM_CIPHER) { + return ctx->cipher->cipher_final(ctx); + } + return 1; + } + + size_t buf_len = ctx->buf_len; + if (ctx->flags & EVP_CIPH_NO_PADDING) { + if (buf_len) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_DATA_NOT_MULTIPLE_OF_BLOCK_LENGTH); + return 0; + } + return 1; + } + + if (buf_len || !ctx->final_used) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_WRONG_FINAL_BLOCK_LENGTH); + return 0; + } + assert(block_size <= sizeof(ctx->final)); + + // The following assumes that the ciphertext has been authenticated. + // Otherwise it provides a padding oracle. + size_t padding = ctx->final[block_size - 1]; + if (padding == 0 || padding > block_size) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); + return 0; + } + + for (size_t i = block_size - padding; i < block_size; i++) { + if (ctx->final[i] != padding) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); + return 0; + } + } + + Span out_span(out, max_out_len); + size_t payload = ctx->cipher->block_size - padding; + if (out_span.size() < payload) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BUFFER_TOO_SMALL); + return 0; + } + CopyToPrefix(Span(ctx->final).first(payload), out_span); + out_span = out_span.subspan(payload); + + *out_len = max_out_len - out_span.size(); + return 1; +} + +int EVP_DecryptFinal_ex2(EVP_CIPHER_CTX *ctx, unsigned char *out, + size_t *out_len, size_t max_out_len) { + *out_len = 0; + return WrapWithPoison(ctx, [&] { + if (!EVP_DecryptFinal_ex2_internal(ctx, out, out_len, max_out_len)) { + return 0; + } + EVP_Cipher_verify_service_indicator(ctx); + return 1; + }); +} + +int EVP_Cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, const uint8_t *in, + size_t in_len) { + const int kError = + (ctx->cipher->flags & EVP_CIPH_FLAG_CUSTOM_CIPHER) ? 
-1 : 0; + + if ((ctx->cipher->flags & EVP_CIPH_FLAG_CUSTOM_CIPHER) && + in_len > size_t{INT_MAX}) { + // Can't represent the return value? That'd be bad. + OPENSSL_PUT_ERROR(CIPHER, ERR_R_OVERFLOW); + return kError; + } + + size_t out_len; + if ((ctx->cipher->flags & EVP_CIPH_FLAG_CUSTOM_CIPHER) && in == nullptr) { + if (!ctx->cipher->cipher_final(ctx)) { + return kError; + } + out_len = 0; + } else if ((ctx->cipher->flags & EVP_CIPH_FLAG_CUSTOM_CIPHER) && + out == nullptr) { + if (!ctx->cipher->update_aad(ctx, in, in_len)) { + return kError; + } + out_len = in_len; // Yes, even though no output was written! + } else { + if (!ctx->cipher->cipher_update(ctx, out, in, in_len)) { + return kError; + } + out_len = in_len; + } + + // |EVP_CIPH_FLAG_CUSTOM_CIPHER| never sets the FIPS indicator via + // |EVP_Cipher| because it's complicated whether the operation has completed + // or not. E.g. AES-GCM with a non-NULL |in| argument hasn't completed an + // operation. Callers should use the |EVP_AEAD| API or, at least, + // |EVP_CipherUpdate| etc. + // + // This call can't be pushed into |EVP_Cipher_verify_service_indicator| + // because whether |ret| indicates success or not depends on whether + // |EVP_CIPH_FLAG_CUSTOM_CIPHER| is set. (This unreasonable, but matches + // OpenSSL.) + if (!(ctx->cipher->flags & EVP_CIPH_FLAG_CUSTOM_CIPHER)) { + EVP_Cipher_verify_service_indicator(ctx); + } + + // Custom ciphers return byte count; regular ciphers return boolean. 
+ if (ctx->cipher->flags & EVP_CIPH_FLAG_CUSTOM_CIPHER) { + return static_cast(out_len); + } + return 1; +} + +int EVP_CipherUpdate(EVP_CIPHER_CTX *ctx, uint8_t *out, int *out_len, + const uint8_t *in, int in_len) { + if (ctx->encrypt) { + return EVP_EncryptUpdate(ctx, out, out_len, in, in_len); + } else { + return EVP_DecryptUpdate(ctx, out, out_len, in, in_len); + } +} + +int EVP_CipherUpdate_ex(EVP_CIPHER_CTX *ctx, uint8_t *out, size_t *out_len, + size_t max_out_len, const uint8_t *in, size_t in_len) { + if (ctx->encrypt) { + return EVP_EncryptUpdate_ex(ctx, out, out_len, max_out_len, in, in_len); + } else { + return EVP_DecryptUpdate_ex(ctx, out, out_len, max_out_len, in, in_len); + } +} + +int EVP_CipherUpdateAAD(EVP_CIPHER_CTX *ctx, const uint8_t *in, size_t in_len) { + return WrapWithPoison(ctx, [&] { + if (!(ctx->cipher->flags & EVP_CIPH_FLAG_CUSTOM_CIPHER)) { + OPENSSL_PUT_ERROR(CIPHER, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); + return 0; + } + return ctx->cipher->update_aad(ctx, in, in_len); + }); +} + +int EVP_CipherFinal_ex(EVP_CIPHER_CTX *ctx, uint8_t *out, int *out_len) { + if (ctx->encrypt) { + return EVP_EncryptFinal_ex(ctx, out, out_len); + } else { + return EVP_DecryptFinal_ex(ctx, out, out_len); + } +} + +int EVP_CipherFinal_ex2(EVP_CIPHER_CTX *ctx, uint8_t *out, size_t *out_len, + size_t max_out_len) { + if (ctx->encrypt) { + return EVP_EncryptFinal_ex2(ctx, out, out_len, max_out_len); + } else { + return EVP_DecryptFinal_ex2(ctx, out, out_len, max_out_len); + } +} + +const EVP_CIPHER *EVP_CIPHER_CTX_cipher(const EVP_CIPHER_CTX *ctx) { + return ctx->cipher; +} + +int EVP_CIPHER_CTX_nid(const EVP_CIPHER_CTX *ctx) { return ctx->cipher->nid; } + +int EVP_CIPHER_CTX_encrypting(const EVP_CIPHER_CTX *ctx) { + return ctx->encrypt; +} + +unsigned EVP_CIPHER_CTX_block_size(const EVP_CIPHER_CTX *ctx) { + return ctx->cipher->block_size; +} + +unsigned EVP_CIPHER_CTX_key_length(const EVP_CIPHER_CTX *ctx) { + return ctx->key_len; +} + +unsigned 
EVP_CIPHER_CTX_iv_length(const EVP_CIPHER_CTX *ctx) { + if (EVP_CIPHER_mode(ctx->cipher) == EVP_CIPH_GCM_MODE) { + int length; + int res = EVP_CIPHER_CTX_ctrl((EVP_CIPHER_CTX *)ctx, EVP_CTRL_GET_IVLEN, 0, + &length); + // EVP_CIPHER_CTX_ctrl returning an error should be impossible under this + // circumstance. If it somehow did, fallback to the static cipher iv_len. + if (res == 1) { + return length; + } + } + return ctx->cipher->iv_len; +} + +void *EVP_CIPHER_CTX_get_app_data(const EVP_CIPHER_CTX *ctx) { + return ctx->app_data; +} + +void EVP_CIPHER_CTX_set_app_data(EVP_CIPHER_CTX *ctx, void *data) { + ctx->app_data = data; +} + +uint32_t EVP_CIPHER_CTX_flags(const EVP_CIPHER_CTX *ctx) { + return ctx->cipher->flags & ~EVP_CIPH_MODE_MASK; +} + +uint32_t EVP_CIPHER_CTX_mode(const EVP_CIPHER_CTX *ctx) { + return ctx->cipher->flags & EVP_CIPH_MODE_MASK; +} + +int EVP_CIPHER_CTX_ctrl(EVP_CIPHER_CTX *ctx, int command, int arg, void *ptr) { + int ret; + if (!ctx->cipher) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_NO_CIPHER_SET); + return 0; + } + + if (!ctx->cipher->ctrl) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_CTRL_NOT_IMPLEMENTED); + return 0; + } + + ret = ctx->cipher->ctrl(ctx, command, arg, ptr); + if (ret == -1) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_CTRL_OPERATION_NOT_IMPLEMENTED); + return 0; + } + + return ret; +} + +int EVP_CIPHER_CTX_set_padding(EVP_CIPHER_CTX *ctx, int pad) { + if (pad) { + ctx->flags &= ~EVP_CIPH_NO_PADDING; + } else { + ctx->flags |= EVP_CIPH_NO_PADDING; + } + return 1; +} + +int EVP_CIPHER_CTX_set_key_length(EVP_CIPHER_CTX *c, unsigned key_len) { + if (c->key_len == key_len) { + return 1; + } + + if (key_len == 0 || !(c->cipher->flags & EVP_CIPH_VARIABLE_LENGTH)) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_KEY_LENGTH); + return 0; + } + + c->key_len = key_len; + return 1; +} + +int EVP_CIPHER_nid(const EVP_CIPHER *cipher) { return cipher->nid; } + +unsigned EVP_CIPHER_block_size(const EVP_CIPHER *cipher) { + return cipher->block_size; +} + 
+unsigned EVP_CIPHER_key_length(const EVP_CIPHER *cipher) { + return cipher->key_len; +} + +unsigned EVP_CIPHER_iv_length(const EVP_CIPHER *cipher) { + return cipher->iv_len; +} + +uint32_t EVP_CIPHER_flags(const EVP_CIPHER *cipher) { + return cipher->flags & ~EVP_CIPH_MODE_MASK; +} + +uint32_t EVP_CIPHER_mode(const EVP_CIPHER *cipher) { + return cipher->flags & EVP_CIPH_MODE_MASK; +} + +int EVP_CipherInit(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *cipher, + const uint8_t *key, const uint8_t *iv, int enc) { + if (cipher) { + EVP_CIPHER_CTX_init(ctx); + } + return EVP_CipherInit_ex(ctx, cipher, nullptr, key, iv, enc); +} + +int EVP_EncryptInit(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *cipher, + const uint8_t *key, const uint8_t *iv) { + return EVP_CipherInit(ctx, cipher, key, iv, 1); +} + +int EVP_DecryptInit(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *cipher, + const uint8_t *key, const uint8_t *iv) { + return EVP_CipherInit(ctx, cipher, key, iv, 0); +} + +int EVP_CipherFinal(EVP_CIPHER_CTX *ctx, uint8_t *out, int *out_len) { + return EVP_CipherFinal_ex(ctx, out, out_len); +} + +int EVP_EncryptFinal(EVP_CIPHER_CTX *ctx, uint8_t *out, int *out_len) { + return EVP_EncryptFinal_ex(ctx, out, out_len); +} + +int EVP_DecryptFinal(EVP_CIPHER_CTX *ctx, uint8_t *out, int *out_len) { + return EVP_DecryptFinal_ex(ctx, out, out_len); +} + +int EVP_add_cipher_alias(const char *a, const char *b) { return 1; } + +void EVP_CIPHER_CTX_set_flags(const EVP_CIPHER_CTX *ctx, uint32_t flags) {} diff --git a/third_party/boringssl/src/crypto/fipsmodule/cipher/e_aes.c b/third_party/boringssl/src/crypto/fipsmodule/cipher/e_aes.c deleted file mode 100644 index 8d5ed4ca..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/cipher/e_aes.c +++ /dev/null @@ -1,1468 +0,0 @@ -/* ==================================================================== - * Copyright (c) 2001-2011 The OpenSSL Project. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== */ - -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include "internal.h" -#include "../../internal.h" -#include "../aes/internal.h" -#include "../modes/internal.h" -#include "../service_indicator/internal.h" -#include "../delocate.h" - - -OPENSSL_MSVC_PRAGMA(warning(push)) -OPENSSL_MSVC_PRAGMA(warning(disable: 4702)) // Unreachable code. - -#define AES_GCM_NONCE_LENGTH 12 - -#if defined(BSAES) -static void vpaes_ctr32_encrypt_blocks_with_bsaes(const uint8_t *in, - uint8_t *out, size_t blocks, - const AES_KEY *key, - const uint8_t ivec[16]) { - // |bsaes_ctr32_encrypt_blocks| is faster than |vpaes_ctr32_encrypt_blocks|, - // but it takes at least one full 8-block batch to amortize the conversion. - if (blocks < 8) { - vpaes_ctr32_encrypt_blocks(in, out, blocks, key, ivec); - return; - } - - size_t bsaes_blocks = blocks; - if (bsaes_blocks % 8 < 6) { - // |bsaes_ctr32_encrypt_blocks| internally works in 8-block batches. If the - // final batch is too small (under six blocks), it is faster to loop over - // |vpaes_encrypt|. Round |bsaes_blocks| down to a multiple of 8. 
- bsaes_blocks -= bsaes_blocks % 8; - } - - AES_KEY bsaes; - vpaes_encrypt_key_to_bsaes(&bsaes, key); - bsaes_ctr32_encrypt_blocks(in, out, bsaes_blocks, &bsaes, ivec); - OPENSSL_cleanse(&bsaes, sizeof(bsaes)); - - in += 16 * bsaes_blocks; - out += 16 * bsaes_blocks; - blocks -= bsaes_blocks; - - uint8_t new_ivec[16]; - memcpy(new_ivec, ivec, 12); - uint32_t ctr = CRYPTO_load_u32_be(ivec + 12) + bsaes_blocks; - CRYPTO_store_u32_be(new_ivec + 12, ctr); - - // Finish any remaining blocks with |vpaes_ctr32_encrypt_blocks|. - vpaes_ctr32_encrypt_blocks(in, out, blocks, key, new_ivec); -} -#endif // BSAES - -typedef struct { - union { - double align; - AES_KEY ks; - } ks; - block128_f block; - union { - cbc128_f cbc; - ctr128_f ctr; - } stream; -} EVP_AES_KEY; - -typedef struct { - GCM128_CONTEXT gcm; - union { - double align; - AES_KEY ks; - } ks; // AES key schedule to use - int key_set; // Set if key initialised - int iv_set; // Set if an iv is set - uint8_t *iv; // Temporary IV store - int ivlen; // IV length - int taglen; - int iv_gen; // It is OK to generate IVs - ctr128_f ctr; -} EVP_AES_GCM_CTX; - -static int aes_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key, - const uint8_t *iv, int enc) { - int ret; - EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data; - const int mode = ctx->cipher->flags & EVP_CIPH_MODE_MASK; - - if (mode == EVP_CIPH_CTR_MODE) { - switch (ctx->key_len) { - case 16: - boringssl_fips_inc_counter(fips_counter_evp_aes_128_ctr); - break; - - case 32: - boringssl_fips_inc_counter(fips_counter_evp_aes_256_ctr); - break; - } - } - - if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE) && !enc) { - if (hwaes_capable()) { - ret = aes_hw_set_decrypt_key(key, ctx->key_len * 8, &dat->ks.ks); - dat->block = aes_hw_decrypt; - dat->stream.cbc = NULL; - if (mode == EVP_CIPH_CBC_MODE) { - dat->stream.cbc = aes_hw_cbc_encrypt; - } - } else if (bsaes_capable() && mode == EVP_CIPH_CBC_MODE) { - assert(vpaes_capable()); - ret = 
vpaes_set_decrypt_key(key, ctx->key_len * 8, &dat->ks.ks); - if (ret == 0) { - vpaes_decrypt_key_to_bsaes(&dat->ks.ks, &dat->ks.ks); - } - // If |dat->stream.cbc| is provided, |dat->block| is never used. - dat->block = NULL; - dat->stream.cbc = bsaes_cbc_encrypt; - } else if (vpaes_capable()) { - ret = vpaes_set_decrypt_key(key, ctx->key_len * 8, &dat->ks.ks); - dat->block = vpaes_decrypt; - dat->stream.cbc = NULL; -#if defined(VPAES_CBC) - if (mode == EVP_CIPH_CBC_MODE) { - dat->stream.cbc = vpaes_cbc_encrypt; - } -#endif - } else { - ret = aes_nohw_set_decrypt_key(key, ctx->key_len * 8, &dat->ks.ks); - dat->block = aes_nohw_decrypt; - dat->stream.cbc = NULL; - if (mode == EVP_CIPH_CBC_MODE) { - dat->stream.cbc = aes_nohw_cbc_encrypt; - } - } - } else if (hwaes_capable()) { - ret = aes_hw_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks); - dat->block = aes_hw_encrypt; - dat->stream.cbc = NULL; - if (mode == EVP_CIPH_CBC_MODE) { - dat->stream.cbc = aes_hw_cbc_encrypt; - } else if (mode == EVP_CIPH_CTR_MODE) { - dat->stream.ctr = aes_hw_ctr32_encrypt_blocks; - } - } else if (vpaes_capable()) { - ret = vpaes_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks); - dat->block = vpaes_encrypt; - dat->stream.cbc = NULL; -#if defined(VPAES_CBC) - if (mode == EVP_CIPH_CBC_MODE) { - dat->stream.cbc = vpaes_cbc_encrypt; - } -#endif - if (mode == EVP_CIPH_CTR_MODE) { -#if defined(BSAES) - assert(bsaes_capable()); - dat->stream.ctr = vpaes_ctr32_encrypt_blocks_with_bsaes; -#elif defined(VPAES_CTR32) - dat->stream.ctr = vpaes_ctr32_encrypt_blocks; -#endif - } - } else { - ret = aes_nohw_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks); - dat->block = aes_nohw_encrypt; - dat->stream.cbc = NULL; - if (mode == EVP_CIPH_CBC_MODE) { - dat->stream.cbc = aes_nohw_cbc_encrypt; - } - } - - if (ret < 0) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_AES_KEY_SETUP_FAILED); - return 0; - } - - return 1; -} - -static int aes_cbc_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, const uint8_t *in, - 
size_t len) { - EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data; - - if (dat->stream.cbc) { - (*dat->stream.cbc)(in, out, len, &dat->ks.ks, ctx->iv, ctx->encrypt); - } else if (ctx->encrypt) { - CRYPTO_cbc128_encrypt(in, out, len, &dat->ks.ks, ctx->iv, dat->block); - } else { - CRYPTO_cbc128_decrypt(in, out, len, &dat->ks.ks, ctx->iv, dat->block); - } - - return 1; -} - -static int aes_ecb_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, const uint8_t *in, - size_t len) { - size_t bl = ctx->cipher->block_size; - EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data; - - if (len < bl) { - return 1; - } - - len -= bl; - for (size_t i = 0; i <= len; i += bl) { - (*dat->block)(in + i, out + i, &dat->ks.ks); - } - - return 1; -} - -static int aes_ctr_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, const uint8_t *in, - size_t len) { - EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data; - - if (dat->stream.ctr) { - CRYPTO_ctr128_encrypt_ctr32(in, out, len, &dat->ks.ks, ctx->iv, ctx->buf, - &ctx->num, dat->stream.ctr); - } else { - CRYPTO_ctr128_encrypt(in, out, len, &dat->ks.ks, ctx->iv, ctx->buf, - &ctx->num, dat->block); - } - return 1; -} - -static int aes_ofb_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, const uint8_t *in, - size_t len) { - EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data; - - CRYPTO_ofb128_encrypt(in, out, len, &dat->ks.ks, ctx->iv, &ctx->num, - dat->block); - return 1; -} - -ctr128_f aes_ctr_set_key(AES_KEY *aes_key, GCM128_KEY *gcm_key, - block128_f *out_block, const uint8_t *key, - size_t key_bytes) { - if (hwaes_capable()) { - aes_hw_set_encrypt_key(key, key_bytes * 8, aes_key); - if (gcm_key != NULL) { - CRYPTO_gcm128_init_key(gcm_key, aes_key, aes_hw_encrypt, 1); - } - if (out_block) { - *out_block = aes_hw_encrypt; - } - return aes_hw_ctr32_encrypt_blocks; - } - - if (vpaes_capable()) { - vpaes_set_encrypt_key(key, key_bytes * 8, aes_key); - if (out_block) { - *out_block = vpaes_encrypt; - } - if (gcm_key != NULL) { - CRYPTO_gcm128_init_key(gcm_key, aes_key, 
vpaes_encrypt, 0); - } -#if defined(BSAES) - assert(bsaes_capable()); - return vpaes_ctr32_encrypt_blocks_with_bsaes; -#elif defined(VPAES_CTR32) - return vpaes_ctr32_encrypt_blocks; -#else - return NULL; -#endif - } - - aes_nohw_set_encrypt_key(key, key_bytes * 8, aes_key); - if (gcm_key != NULL) { - CRYPTO_gcm128_init_key(gcm_key, aes_key, aes_nohw_encrypt, 0); - } - if (out_block) { - *out_block = aes_nohw_encrypt; - } - return aes_nohw_ctr32_encrypt_blocks; -} - -#if defined(OPENSSL_32_BIT) -#define EVP_AES_GCM_CTX_PADDING (4+8) -#else -#define EVP_AES_GCM_CTX_PADDING 8 -#endif - -static EVP_AES_GCM_CTX *aes_gcm_from_cipher_ctx(EVP_CIPHER_CTX *ctx) { - static_assert( - alignof(EVP_AES_GCM_CTX) <= 16, - "EVP_AES_GCM_CTX needs more alignment than this function provides"); - - // |malloc| guarantees up to 4-byte alignment on 32-bit and 8-byte alignment - // on 64-bit systems, so we need to adjust to reach 16-byte alignment. - assert(ctx->cipher->ctx_size == - sizeof(EVP_AES_GCM_CTX) + EVP_AES_GCM_CTX_PADDING); - - char *ptr = ctx->cipher_data; -#if defined(OPENSSL_32_BIT) - assert((uintptr_t)ptr % 4 == 0); - ptr += (uintptr_t)ptr & 4; -#endif - assert((uintptr_t)ptr % 8 == 0); - ptr += (uintptr_t)ptr & 8; - return (EVP_AES_GCM_CTX *)ptr; -} - -static int aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key, - const uint8_t *iv, int enc) { - EVP_AES_GCM_CTX *gctx = aes_gcm_from_cipher_ctx(ctx); - if (!iv && !key) { - return 1; - } - - switch (ctx->key_len) { - case 16: - boringssl_fips_inc_counter(fips_counter_evp_aes_128_gcm); - break; - - case 32: - boringssl_fips_inc_counter(fips_counter_evp_aes_256_gcm); - break; - } - - if (key) { - OPENSSL_memset(&gctx->gcm, 0, sizeof(gctx->gcm)); - gctx->ctr = aes_ctr_set_key(&gctx->ks.ks, &gctx->gcm.gcm_key, NULL, key, - ctx->key_len); - // If we have an iv can set it directly, otherwise use saved IV. 
- if (iv == NULL && gctx->iv_set) { - iv = gctx->iv; - } - if (iv) { - CRYPTO_gcm128_setiv(&gctx->gcm, &gctx->ks.ks, iv, gctx->ivlen); - gctx->iv_set = 1; - } - gctx->key_set = 1; - } else { - // If key set use IV, otherwise copy - if (gctx->key_set) { - CRYPTO_gcm128_setiv(&gctx->gcm, &gctx->ks.ks, iv, gctx->ivlen); - } else { - OPENSSL_memcpy(gctx->iv, iv, gctx->ivlen); - } - gctx->iv_set = 1; - gctx->iv_gen = 0; - } - return 1; -} - -static void aes_gcm_cleanup(EVP_CIPHER_CTX *c) { - EVP_AES_GCM_CTX *gctx = aes_gcm_from_cipher_ctx(c); - OPENSSL_cleanse(&gctx->gcm, sizeof(gctx->gcm)); - if (gctx->iv != c->iv) { - OPENSSL_free(gctx->iv); - } -} - -// increment counter (64-bit int) by 1 -static void ctr64_inc(uint8_t *counter) { - int n = 8; - uint8_t c; - - do { - --n; - c = counter[n]; - ++c; - counter[n] = c; - if (c) { - return; - } - } while (n); -} - -static int aes_gcm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr) { - EVP_AES_GCM_CTX *gctx = aes_gcm_from_cipher_ctx(c); - switch (type) { - case EVP_CTRL_INIT: - gctx->key_set = 0; - gctx->iv_set = 0; - gctx->ivlen = c->cipher->iv_len; - gctx->iv = c->iv; - gctx->taglen = -1; - gctx->iv_gen = 0; - return 1; - - case EVP_CTRL_AEAD_SET_IVLEN: - if (arg <= 0) { - return 0; - } - - // Allocate memory for IV if needed - if (arg > EVP_MAX_IV_LENGTH && arg > gctx->ivlen) { - if (gctx->iv != c->iv) { - OPENSSL_free(gctx->iv); - } - gctx->iv = OPENSSL_malloc(arg); - if (!gctx->iv) { - return 0; - } - } - gctx->ivlen = arg; - return 1; - - case EVP_CTRL_AEAD_SET_TAG: - if (arg <= 0 || arg > 16 || c->encrypt) { - return 0; - } - OPENSSL_memcpy(c->buf, ptr, arg); - gctx->taglen = arg; - return 1; - - case EVP_CTRL_AEAD_GET_TAG: - if (arg <= 0 || arg > 16 || !c->encrypt || gctx->taglen < 0) { - return 0; - } - OPENSSL_memcpy(ptr, c->buf, arg); - return 1; - - case EVP_CTRL_AEAD_SET_IV_FIXED: - // Special case: -1 length restores whole IV - if (arg == -1) { - OPENSSL_memcpy(gctx->iv, ptr, gctx->ivlen); - gctx->iv_gen 
= 1; - return 1; - } - // Fixed field must be at least 4 bytes and invocation field - // at least 8. - if (arg < 4 || (gctx->ivlen - arg) < 8) { - return 0; - } - if (arg) { - OPENSSL_memcpy(gctx->iv, ptr, arg); - } - if (c->encrypt) { - // |RAND_bytes| calls within the fipsmodule should be wrapped with state - // lock functions to avoid updating the service indicator with the DRBG - // functions. - FIPS_service_indicator_lock_state(); - RAND_bytes(gctx->iv + arg, gctx->ivlen - arg); - FIPS_service_indicator_unlock_state(); - } - gctx->iv_gen = 1; - return 1; - - case EVP_CTRL_GCM_IV_GEN: - if (gctx->iv_gen == 0 || gctx->key_set == 0) { - return 0; - } - CRYPTO_gcm128_setiv(&gctx->gcm, &gctx->ks.ks, gctx->iv, gctx->ivlen); - if (arg <= 0 || arg > gctx->ivlen) { - arg = gctx->ivlen; - } - OPENSSL_memcpy(ptr, gctx->iv + gctx->ivlen - arg, arg); - // Invocation field will be at least 8 bytes in size and - // so no need to check wrap around or increment more than - // last 8 bytes. - ctr64_inc(gctx->iv + gctx->ivlen - 8); - gctx->iv_set = 1; - return 1; - - case EVP_CTRL_GCM_SET_IV_INV: - if (gctx->iv_gen == 0 || gctx->key_set == 0 || c->encrypt) { - return 0; - } - OPENSSL_memcpy(gctx->iv + gctx->ivlen - arg, ptr, arg); - CRYPTO_gcm128_setiv(&gctx->gcm, &gctx->ks.ks, gctx->iv, gctx->ivlen); - gctx->iv_set = 1; - return 1; - - case EVP_CTRL_COPY: { - EVP_CIPHER_CTX *out = ptr; - EVP_AES_GCM_CTX *gctx_out = aes_gcm_from_cipher_ctx(out); - // |EVP_CIPHER_CTX_copy| copies this generically, but we must redo it in - // case |out->cipher_data| and |in->cipher_data| are differently aligned. 
- OPENSSL_memcpy(gctx_out, gctx, sizeof(EVP_AES_GCM_CTX)); - if (gctx->iv == c->iv) { - gctx_out->iv = out->iv; - } else { - gctx_out->iv = OPENSSL_malloc(gctx->ivlen); - if (!gctx_out->iv) { - return 0; - } - OPENSSL_memcpy(gctx_out->iv, gctx->iv, gctx->ivlen); - } - return 1; - } - - default: - return -1; - } -} - -static int aes_gcm_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, const uint8_t *in, - size_t len) { - EVP_AES_GCM_CTX *gctx = aes_gcm_from_cipher_ctx(ctx); - - // If not set up, return error - if (!gctx->key_set) { - return -1; - } - if (!gctx->iv_set) { - return -1; - } - - if (in) { - if (out == NULL) { - if (!CRYPTO_gcm128_aad(&gctx->gcm, in, len)) { - return -1; - } - } else if (ctx->encrypt) { - if (gctx->ctr) { - if (!CRYPTO_gcm128_encrypt_ctr32(&gctx->gcm, &gctx->ks.ks, in, out, len, - gctx->ctr)) { - return -1; - } - } else { - if (!CRYPTO_gcm128_encrypt(&gctx->gcm, &gctx->ks.ks, in, out, len)) { - return -1; - } - } - } else { - if (gctx->ctr) { - if (!CRYPTO_gcm128_decrypt_ctr32(&gctx->gcm, &gctx->ks.ks, in, out, len, - gctx->ctr)) { - return -1; - } - } else { - if (!CRYPTO_gcm128_decrypt(&gctx->gcm, &gctx->ks.ks, in, out, len)) { - return -1; - } - } - } - return len; - } else { - if (!ctx->encrypt) { - if (gctx->taglen < 0 || - !CRYPTO_gcm128_finish(&gctx->gcm, ctx->buf, gctx->taglen)) { - return -1; - } - gctx->iv_set = 0; - return 0; - } - CRYPTO_gcm128_tag(&gctx->gcm, ctx->buf, 16); - gctx->taglen = 16; - // Don't reuse the IV - gctx->iv_set = 0; - return 0; - } -} - -DEFINE_METHOD_FUNCTION(EVP_CIPHER, EVP_aes_128_cbc) { - memset(out, 0, sizeof(EVP_CIPHER)); - - out->nid = NID_aes_128_cbc; - out->block_size = 16; - out->key_len = 16; - out->iv_len = 16; - out->ctx_size = sizeof(EVP_AES_KEY); - out->flags = EVP_CIPH_CBC_MODE; - out->init = aes_init_key; - out->cipher = aes_cbc_cipher; -} - -DEFINE_METHOD_FUNCTION(EVP_CIPHER, EVP_aes_128_ctr) { - memset(out, 0, sizeof(EVP_CIPHER)); - - out->nid = NID_aes_128_ctr; - out->block_size = 1; - 
out->key_len = 16; - out->iv_len = 16; - out->ctx_size = sizeof(EVP_AES_KEY); - out->flags = EVP_CIPH_CTR_MODE; - out->init = aes_init_key; - out->cipher = aes_ctr_cipher; -} - -DEFINE_LOCAL_DATA(EVP_CIPHER, aes_128_ecb_generic) { - memset(out, 0, sizeof(EVP_CIPHER)); - - out->nid = NID_aes_128_ecb; - out->block_size = 16; - out->key_len = 16; - out->ctx_size = sizeof(EVP_AES_KEY); - out->flags = EVP_CIPH_ECB_MODE; - out->init = aes_init_key; - out->cipher = aes_ecb_cipher; -} - -DEFINE_METHOD_FUNCTION(EVP_CIPHER, EVP_aes_128_ofb) { - memset(out, 0, sizeof(EVP_CIPHER)); - - out->nid = NID_aes_128_ofb128; - out->block_size = 1; - out->key_len = 16; - out->iv_len = 16; - out->ctx_size = sizeof(EVP_AES_KEY); - out->flags = EVP_CIPH_OFB_MODE; - out->init = aes_init_key; - out->cipher = aes_ofb_cipher; -} - -DEFINE_METHOD_FUNCTION(EVP_CIPHER, EVP_aes_128_gcm) { - memset(out, 0, sizeof(EVP_CIPHER)); - - out->nid = NID_aes_128_gcm; - out->block_size = 1; - out->key_len = 16; - out->iv_len = AES_GCM_NONCE_LENGTH; - out->ctx_size = sizeof(EVP_AES_GCM_CTX) + EVP_AES_GCM_CTX_PADDING; - out->flags = EVP_CIPH_GCM_MODE | EVP_CIPH_CUSTOM_IV | EVP_CIPH_CUSTOM_COPY | - EVP_CIPH_FLAG_CUSTOM_CIPHER | EVP_CIPH_ALWAYS_CALL_INIT | - EVP_CIPH_CTRL_INIT | EVP_CIPH_FLAG_AEAD_CIPHER; - out->init = aes_gcm_init_key; - out->cipher = aes_gcm_cipher; - out->cleanup = aes_gcm_cleanup; - out->ctrl = aes_gcm_ctrl; -} - -DEFINE_METHOD_FUNCTION(EVP_CIPHER, EVP_aes_192_cbc) { - memset(out, 0, sizeof(EVP_CIPHER)); - - out->nid = NID_aes_192_cbc; - out->block_size = 16; - out->key_len = 24; - out->iv_len = 16; - out->ctx_size = sizeof(EVP_AES_KEY); - out->flags = EVP_CIPH_CBC_MODE; - out->init = aes_init_key; - out->cipher = aes_cbc_cipher; -} - -DEFINE_METHOD_FUNCTION(EVP_CIPHER, EVP_aes_192_ctr) { - memset(out, 0, sizeof(EVP_CIPHER)); - - out->nid = NID_aes_192_ctr; - out->block_size = 1; - out->key_len = 24; - out->iv_len = 16; - out->ctx_size = sizeof(EVP_AES_KEY); - out->flags = EVP_CIPH_CTR_MODE; 
- out->init = aes_init_key; - out->cipher = aes_ctr_cipher; -} - -DEFINE_LOCAL_DATA(EVP_CIPHER, aes_192_ecb_generic) { - memset(out, 0, sizeof(EVP_CIPHER)); - - out->nid = NID_aes_192_ecb; - out->block_size = 16; - out->key_len = 24; - out->ctx_size = sizeof(EVP_AES_KEY); - out->flags = EVP_CIPH_ECB_MODE; - out->init = aes_init_key; - out->cipher = aes_ecb_cipher; -} - -DEFINE_METHOD_FUNCTION(EVP_CIPHER, EVP_aes_192_ofb) { - memset(out, 0, sizeof(EVP_CIPHER)); - - out->nid = NID_aes_192_ofb128; - out->block_size = 1; - out->key_len = 24; - out->iv_len = 16; - out->ctx_size = sizeof(EVP_AES_KEY); - out->flags = EVP_CIPH_OFB_MODE; - out->init = aes_init_key; - out->cipher = aes_ofb_cipher; -} - -DEFINE_METHOD_FUNCTION(EVP_CIPHER, EVP_aes_192_gcm) { - memset(out, 0, sizeof(EVP_CIPHER)); - - out->nid = NID_aes_192_gcm; - out->block_size = 1; - out->key_len = 24; - out->iv_len = AES_GCM_NONCE_LENGTH; - out->ctx_size = sizeof(EVP_AES_GCM_CTX) + EVP_AES_GCM_CTX_PADDING; - out->flags = EVP_CIPH_GCM_MODE | EVP_CIPH_CUSTOM_IV | EVP_CIPH_CUSTOM_COPY | - EVP_CIPH_FLAG_CUSTOM_CIPHER | EVP_CIPH_ALWAYS_CALL_INIT | - EVP_CIPH_CTRL_INIT | EVP_CIPH_FLAG_AEAD_CIPHER; - out->init = aes_gcm_init_key; - out->cipher = aes_gcm_cipher; - out->cleanup = aes_gcm_cleanup; - out->ctrl = aes_gcm_ctrl; -} - -DEFINE_METHOD_FUNCTION(EVP_CIPHER, EVP_aes_256_cbc) { - memset(out, 0, sizeof(EVP_CIPHER)); - - out->nid = NID_aes_256_cbc; - out->block_size = 16; - out->key_len = 32; - out->iv_len = 16; - out->ctx_size = sizeof(EVP_AES_KEY); - out->flags = EVP_CIPH_CBC_MODE; - out->init = aes_init_key; - out->cipher = aes_cbc_cipher; -} - -DEFINE_METHOD_FUNCTION(EVP_CIPHER, EVP_aes_256_ctr) { - memset(out, 0, sizeof(EVP_CIPHER)); - - out->nid = NID_aes_256_ctr; - out->block_size = 1; - out->key_len = 32; - out->iv_len = 16; - out->ctx_size = sizeof(EVP_AES_KEY); - out->flags = EVP_CIPH_CTR_MODE; - out->init = aes_init_key; - out->cipher = aes_ctr_cipher; -} - -DEFINE_LOCAL_DATA(EVP_CIPHER, 
aes_256_ecb_generic) { - memset(out, 0, sizeof(EVP_CIPHER)); - - out->nid = NID_aes_256_ecb; - out->block_size = 16; - out->key_len = 32; - out->ctx_size = sizeof(EVP_AES_KEY); - out->flags = EVP_CIPH_ECB_MODE; - out->init = aes_init_key; - out->cipher = aes_ecb_cipher; -} - -DEFINE_METHOD_FUNCTION(EVP_CIPHER, EVP_aes_256_ofb) { - memset(out, 0, sizeof(EVP_CIPHER)); - - out->nid = NID_aes_256_ofb128; - out->block_size = 1; - out->key_len = 32; - out->iv_len = 16; - out->ctx_size = sizeof(EVP_AES_KEY); - out->flags = EVP_CIPH_OFB_MODE; - out->init = aes_init_key; - out->cipher = aes_ofb_cipher; -} - -DEFINE_METHOD_FUNCTION(EVP_CIPHER, EVP_aes_256_gcm) { - memset(out, 0, sizeof(EVP_CIPHER)); - - out->nid = NID_aes_256_gcm; - out->block_size = 1; - out->key_len = 32; - out->iv_len = AES_GCM_NONCE_LENGTH; - out->ctx_size = sizeof(EVP_AES_GCM_CTX) + EVP_AES_GCM_CTX_PADDING; - out->flags = EVP_CIPH_GCM_MODE | EVP_CIPH_CUSTOM_IV | EVP_CIPH_CUSTOM_COPY | - EVP_CIPH_FLAG_CUSTOM_CIPHER | EVP_CIPH_ALWAYS_CALL_INIT | - EVP_CIPH_CTRL_INIT | EVP_CIPH_FLAG_AEAD_CIPHER; - out->init = aes_gcm_init_key; - out->cipher = aes_gcm_cipher; - out->cleanup = aes_gcm_cleanup; - out->ctrl = aes_gcm_ctrl; -} - -#if defined(HWAES_ECB) - -static int aes_hw_ecb_cipher(EVP_CIPHER_CTX *ctx, uint8_t *out, - const uint8_t *in, size_t len) { - size_t bl = ctx->cipher->block_size; - - if (len < bl) { - return 1; - } - - aes_hw_ecb_encrypt(in, out, len, ctx->cipher_data, ctx->encrypt); - - return 1; -} - -DEFINE_LOCAL_DATA(EVP_CIPHER, aes_hw_128_ecb) { - memset(out, 0, sizeof(EVP_CIPHER)); - - out->nid = NID_aes_128_ecb; - out->block_size = 16; - out->key_len = 16; - out->ctx_size = sizeof(EVP_AES_KEY); - out->flags = EVP_CIPH_ECB_MODE; - out->init = aes_init_key; - out->cipher = aes_hw_ecb_cipher; -} - -DEFINE_LOCAL_DATA(EVP_CIPHER, aes_hw_192_ecb) { - memset(out, 0, sizeof(EVP_CIPHER)); - - out->nid = NID_aes_192_ecb; - out->block_size = 16; - out->key_len = 24; - out->ctx_size = sizeof(EVP_AES_KEY); 
- out->flags = EVP_CIPH_ECB_MODE; - out->init = aes_init_key; - out->cipher = aes_hw_ecb_cipher; -} - -DEFINE_LOCAL_DATA(EVP_CIPHER, aes_hw_256_ecb) { - memset(out, 0, sizeof(EVP_CIPHER)); - - out->nid = NID_aes_256_ecb; - out->block_size = 16; - out->key_len = 32; - out->ctx_size = sizeof(EVP_AES_KEY); - out->flags = EVP_CIPH_ECB_MODE; - out->init = aes_init_key; - out->cipher = aes_hw_ecb_cipher; -} - -#define EVP_ECB_CIPHER_FUNCTION(keybits) \ - const EVP_CIPHER *EVP_aes_##keybits##_ecb(void) { \ - if (hwaes_capable()) { \ - return aes_hw_##keybits##_ecb(); \ - } \ - return aes_##keybits##_ecb_generic(); \ - } - -#else - -#define EVP_ECB_CIPHER_FUNCTION(keybits) \ - const EVP_CIPHER *EVP_aes_##keybits##_ecb(void) { \ - return aes_##keybits##_ecb_generic(); \ - } - -#endif // HWAES_ECB - -EVP_ECB_CIPHER_FUNCTION(128) -EVP_ECB_CIPHER_FUNCTION(192) -EVP_ECB_CIPHER_FUNCTION(256) - - -#define EVP_AEAD_AES_GCM_TAG_LEN 16 - -struct aead_aes_gcm_ctx { - union { - double align; - AES_KEY ks; - } ks; - GCM128_KEY gcm_key; - ctr128_f ctr; -}; - -static int aead_aes_gcm_init_impl(struct aead_aes_gcm_ctx *gcm_ctx, - size_t *out_tag_len, const uint8_t *key, - size_t key_len, size_t tag_len) { - const size_t key_bits = key_len * 8; - - switch (key_bits) { - case 128: - boringssl_fips_inc_counter(fips_counter_evp_aes_128_gcm); - break; - - case 256: - boringssl_fips_inc_counter(fips_counter_evp_aes_256_gcm); - break; - } - - if (key_bits != 128 && key_bits != 192 && key_bits != 256) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_KEY_LENGTH); - return 0; // EVP_AEAD_CTX_init should catch this. 
- } - - if (tag_len == EVP_AEAD_DEFAULT_TAG_LENGTH) { - tag_len = EVP_AEAD_AES_GCM_TAG_LEN; - } - - if (tag_len > EVP_AEAD_AES_GCM_TAG_LEN) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TAG_TOO_LARGE); - return 0; - } - - gcm_ctx->ctr = - aes_ctr_set_key(&gcm_ctx->ks.ks, &gcm_ctx->gcm_key, NULL, key, key_len); - *out_tag_len = tag_len; - return 1; -} - -static_assert(sizeof(((EVP_AEAD_CTX *)NULL)->state) >= - sizeof(struct aead_aes_gcm_ctx), - "AEAD state is too small"); -static_assert(alignof(union evp_aead_ctx_st_state) >= - alignof(struct aead_aes_gcm_ctx), - "AEAD state has insufficient alignment"); - -static int aead_aes_gcm_init(EVP_AEAD_CTX *ctx, const uint8_t *key, - size_t key_len, size_t requested_tag_len) { - struct aead_aes_gcm_ctx *gcm_ctx = (struct aead_aes_gcm_ctx *) &ctx->state; - - size_t actual_tag_len; - if (!aead_aes_gcm_init_impl(gcm_ctx, &actual_tag_len, key, key_len, - requested_tag_len)) { - return 0; - } - - ctx->tag_len = actual_tag_len; - return 1; -} - -static void aead_aes_gcm_cleanup(EVP_AEAD_CTX *ctx) {} - -static int aead_aes_gcm_seal_scatter_impl( - const struct aead_aes_gcm_ctx *gcm_ctx, - uint8_t *out, uint8_t *out_tag, size_t *out_tag_len, size_t max_out_tag_len, - const uint8_t *nonce, size_t nonce_len, - const uint8_t *in, size_t in_len, - const uint8_t *extra_in, size_t extra_in_len, - const uint8_t *ad, size_t ad_len, - size_t tag_len) { - if (extra_in_len + tag_len < tag_len) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TOO_LARGE); - return 0; - } - if (max_out_tag_len < extra_in_len + tag_len) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BUFFER_TOO_SMALL); - return 0; - } - if (nonce_len == 0) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_NONCE_SIZE); - return 0; - } - - const AES_KEY *key = &gcm_ctx->ks.ks; - - GCM128_CONTEXT gcm; - OPENSSL_memset(&gcm, 0, sizeof(gcm)); - OPENSSL_memcpy(&gcm.gcm_key, &gcm_ctx->gcm_key, sizeof(gcm.gcm_key)); - CRYPTO_gcm128_setiv(&gcm, key, nonce, nonce_len); - - if (ad_len > 0 && !CRYPTO_gcm128_aad(&gcm, 
ad, ad_len)) { - return 0; - } - - if (gcm_ctx->ctr) { - if (!CRYPTO_gcm128_encrypt_ctr32(&gcm, key, in, out, in_len, - gcm_ctx->ctr)) { - return 0; - } - } else { - if (!CRYPTO_gcm128_encrypt(&gcm, key, in, out, in_len)) { - return 0; - } - } - - if (extra_in_len) { - if (gcm_ctx->ctr) { - if (!CRYPTO_gcm128_encrypt_ctr32(&gcm, key, extra_in, out_tag, - extra_in_len, gcm_ctx->ctr)) { - return 0; - } - } else { - if (!CRYPTO_gcm128_encrypt(&gcm, key, extra_in, out_tag, extra_in_len)) { - return 0; - } - } - } - - CRYPTO_gcm128_tag(&gcm, out_tag + extra_in_len, tag_len); - *out_tag_len = tag_len + extra_in_len; - - return 1; -} - -static int aead_aes_gcm_seal_scatter(const EVP_AEAD_CTX *ctx, uint8_t *out, - uint8_t *out_tag, size_t *out_tag_len, - size_t max_out_tag_len, - const uint8_t *nonce, size_t nonce_len, - const uint8_t *in, size_t in_len, - const uint8_t *extra_in, - size_t extra_in_len, - const uint8_t *ad, size_t ad_len) { - const struct aead_aes_gcm_ctx *gcm_ctx = - (const struct aead_aes_gcm_ctx *)&ctx->state; - return aead_aes_gcm_seal_scatter_impl( - gcm_ctx, out, out_tag, out_tag_len, max_out_tag_len, nonce, nonce_len, in, - in_len, extra_in, extra_in_len, ad, ad_len, ctx->tag_len); -} - -static int aead_aes_gcm_open_gather_impl(const struct aead_aes_gcm_ctx *gcm_ctx, - uint8_t *out, - const uint8_t *nonce, size_t nonce_len, - const uint8_t *in, size_t in_len, - const uint8_t *in_tag, - size_t in_tag_len, - const uint8_t *ad, size_t ad_len, - size_t tag_len) { - uint8_t tag[EVP_AEAD_AES_GCM_TAG_LEN]; - - if (nonce_len == 0) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_NONCE_SIZE); - return 0; - } - - if (in_tag_len != tag_len) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); - return 0; - } - - const AES_KEY *key = &gcm_ctx->ks.ks; - - GCM128_CONTEXT gcm; - OPENSSL_memset(&gcm, 0, sizeof(gcm)); - OPENSSL_memcpy(&gcm.gcm_key, &gcm_ctx->gcm_key, sizeof(gcm.gcm_key)); - CRYPTO_gcm128_setiv(&gcm, key, nonce, nonce_len); - - if 
(!CRYPTO_gcm128_aad(&gcm, ad, ad_len)) { - return 0; - } - - if (gcm_ctx->ctr) { - if (!CRYPTO_gcm128_decrypt_ctr32(&gcm, key, in, out, in_len, - gcm_ctx->ctr)) { - return 0; - } - } else { - if (!CRYPTO_gcm128_decrypt(&gcm, key, in, out, in_len)) { - return 0; - } - } - - CRYPTO_gcm128_tag(&gcm, tag, tag_len); - if (CRYPTO_memcmp(tag, in_tag, tag_len) != 0) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); - return 0; - } - - return 1; -} - -static int aead_aes_gcm_open_gather(const EVP_AEAD_CTX *ctx, uint8_t *out, - const uint8_t *nonce, size_t nonce_len, - const uint8_t *in, size_t in_len, - const uint8_t *in_tag, size_t in_tag_len, - const uint8_t *ad, size_t ad_len) { - struct aead_aes_gcm_ctx *gcm_ctx = (struct aead_aes_gcm_ctx *)&ctx->state; - if (!aead_aes_gcm_open_gather_impl(gcm_ctx, out, nonce, nonce_len, in, in_len, - in_tag, in_tag_len, ad, ad_len, - ctx->tag_len)) { - return 0; - } - - AEAD_GCM_verify_service_indicator(ctx); - return 1; -} - -DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_128_gcm) { - memset(out, 0, sizeof(EVP_AEAD)); - - out->key_len = 16; - out->nonce_len = AES_GCM_NONCE_LENGTH; - out->overhead = EVP_AEAD_AES_GCM_TAG_LEN; - out->max_tag_len = EVP_AEAD_AES_GCM_TAG_LEN; - out->seal_scatter_supports_extra_in = 1; - - out->init = aead_aes_gcm_init; - out->cleanup = aead_aes_gcm_cleanup; - out->seal_scatter = aead_aes_gcm_seal_scatter; - out->open_gather = aead_aes_gcm_open_gather; -} - -DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_192_gcm) { - memset(out, 0, sizeof(EVP_AEAD)); - - out->key_len = 24; - out->nonce_len = AES_GCM_NONCE_LENGTH; - out->overhead = EVP_AEAD_AES_GCM_TAG_LEN; - out->max_tag_len = EVP_AEAD_AES_GCM_TAG_LEN; - out->seal_scatter_supports_extra_in = 1; - - out->init = aead_aes_gcm_init; - out->cleanup = aead_aes_gcm_cleanup; - out->seal_scatter = aead_aes_gcm_seal_scatter; - out->open_gather = aead_aes_gcm_open_gather; -} - -DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_256_gcm) { - memset(out, 0, 
sizeof(EVP_AEAD)); - - out->key_len = 32; - out->nonce_len = AES_GCM_NONCE_LENGTH; - out->overhead = EVP_AEAD_AES_GCM_TAG_LEN; - out->max_tag_len = EVP_AEAD_AES_GCM_TAG_LEN; - out->seal_scatter_supports_extra_in = 1; - - out->init = aead_aes_gcm_init; - out->cleanup = aead_aes_gcm_cleanup; - out->seal_scatter = aead_aes_gcm_seal_scatter; - out->open_gather = aead_aes_gcm_open_gather; -} - -static int aead_aes_gcm_init_randnonce(EVP_AEAD_CTX *ctx, const uint8_t *key, - size_t key_len, - size_t requested_tag_len) { - if (requested_tag_len != EVP_AEAD_DEFAULT_TAG_LENGTH) { - if (requested_tag_len < AES_GCM_NONCE_LENGTH) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BUFFER_TOO_SMALL); - return 0; - } - requested_tag_len -= AES_GCM_NONCE_LENGTH; - } - - if (!aead_aes_gcm_init(ctx, key, key_len, requested_tag_len)) { - return 0; - } - - ctx->tag_len += AES_GCM_NONCE_LENGTH; - return 1; -} - -static int aead_aes_gcm_seal_scatter_randnonce( - const EVP_AEAD_CTX *ctx, - uint8_t *out, uint8_t *out_tag, size_t *out_tag_len, size_t max_out_tag_len, - const uint8_t *external_nonce, size_t external_nonce_len, - const uint8_t *in, size_t in_len, - const uint8_t *extra_in, size_t extra_in_len, - const uint8_t *ad, size_t ad_len) { - if (external_nonce_len != 0) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_NONCE_SIZE); - return 0; - } - - uint8_t nonce[AES_GCM_NONCE_LENGTH]; - if (max_out_tag_len < sizeof(nonce)) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BUFFER_TOO_SMALL); - return 0; - } - - // |RAND_bytes| calls within the fipsmodule should be wrapped with state lock - // functions to avoid updating the service indicator with the DRBG functions. 
- FIPS_service_indicator_lock_state(); - RAND_bytes(nonce, sizeof(nonce)); - FIPS_service_indicator_unlock_state(); - - const struct aead_aes_gcm_ctx *gcm_ctx = - (const struct aead_aes_gcm_ctx *)&ctx->state; - if (!aead_aes_gcm_seal_scatter_impl(gcm_ctx, out, out_tag, out_tag_len, - max_out_tag_len - AES_GCM_NONCE_LENGTH, - nonce, sizeof(nonce), in, in_len, - extra_in, extra_in_len, ad, ad_len, - ctx->tag_len - AES_GCM_NONCE_LENGTH)) { - return 0; - } - - assert(*out_tag_len + sizeof(nonce) <= max_out_tag_len); - memcpy(out_tag + *out_tag_len, nonce, sizeof(nonce)); - *out_tag_len += sizeof(nonce); - - AEAD_GCM_verify_service_indicator(ctx); - return 1; -} - -static int aead_aes_gcm_open_gather_randnonce( - const EVP_AEAD_CTX *ctx, uint8_t *out, - const uint8_t *external_nonce, size_t external_nonce_len, - const uint8_t *in, size_t in_len, - const uint8_t *in_tag, size_t in_tag_len, - const uint8_t *ad, size_t ad_len) { - if (external_nonce_len != 0) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_NONCE_SIZE); - return 0; - } - - if (in_tag_len < AES_GCM_NONCE_LENGTH) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); - return 0; - } - const uint8_t *nonce = in_tag + in_tag_len - AES_GCM_NONCE_LENGTH; - - const struct aead_aes_gcm_ctx *gcm_ctx = - (const struct aead_aes_gcm_ctx *)&ctx->state; - if (!aead_aes_gcm_open_gather_impl( - gcm_ctx, out, nonce, AES_GCM_NONCE_LENGTH, in, in_len, in_tag, - in_tag_len - AES_GCM_NONCE_LENGTH, ad, ad_len, - ctx->tag_len - AES_GCM_NONCE_LENGTH)) { - return 0; - } - - AEAD_GCM_verify_service_indicator(ctx); - return 1; -} - -DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_128_gcm_randnonce) { - memset(out, 0, sizeof(EVP_AEAD)); - - out->key_len = 16; - out->nonce_len = 0; - out->overhead = EVP_AEAD_AES_GCM_TAG_LEN + AES_GCM_NONCE_LENGTH; - out->max_tag_len = EVP_AEAD_AES_GCM_TAG_LEN + AES_GCM_NONCE_LENGTH; - out->seal_scatter_supports_extra_in = 1; - - out->init = aead_aes_gcm_init_randnonce; - out->cleanup = aead_aes_gcm_cleanup; 
- out->seal_scatter = aead_aes_gcm_seal_scatter_randnonce; - out->open_gather = aead_aes_gcm_open_gather_randnonce; -} - -DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_256_gcm_randnonce) { - memset(out, 0, sizeof(EVP_AEAD)); - - out->key_len = 32; - out->nonce_len = 0; - out->overhead = EVP_AEAD_AES_GCM_TAG_LEN + AES_GCM_NONCE_LENGTH; - out->max_tag_len = EVP_AEAD_AES_GCM_TAG_LEN + AES_GCM_NONCE_LENGTH; - out->seal_scatter_supports_extra_in = 1; - - out->init = aead_aes_gcm_init_randnonce; - out->cleanup = aead_aes_gcm_cleanup; - out->seal_scatter = aead_aes_gcm_seal_scatter_randnonce; - out->open_gather = aead_aes_gcm_open_gather_randnonce; -} - -struct aead_aes_gcm_tls12_ctx { - struct aead_aes_gcm_ctx gcm_ctx; - uint64_t min_next_nonce; -}; - -static_assert(sizeof(((EVP_AEAD_CTX *)NULL)->state) >= - sizeof(struct aead_aes_gcm_tls12_ctx), - "AEAD state is too small"); -static_assert(alignof(union evp_aead_ctx_st_state) >= - alignof(struct aead_aes_gcm_tls12_ctx), - "AEAD state has insufficient alignment"); - -static int aead_aes_gcm_tls12_init(EVP_AEAD_CTX *ctx, const uint8_t *key, - size_t key_len, size_t requested_tag_len) { - struct aead_aes_gcm_tls12_ctx *gcm_ctx = - (struct aead_aes_gcm_tls12_ctx *) &ctx->state; - - gcm_ctx->min_next_nonce = 0; - - size_t actual_tag_len; - if (!aead_aes_gcm_init_impl(&gcm_ctx->gcm_ctx, &actual_tag_len, key, key_len, - requested_tag_len)) { - return 0; - } - - ctx->tag_len = actual_tag_len; - return 1; -} - -static int aead_aes_gcm_tls12_seal_scatter( - const EVP_AEAD_CTX *ctx, uint8_t *out, uint8_t *out_tag, - size_t *out_tag_len, size_t max_out_tag_len, const uint8_t *nonce, - size_t nonce_len, const uint8_t *in, size_t in_len, const uint8_t *extra_in, - size_t extra_in_len, const uint8_t *ad, size_t ad_len) { - struct aead_aes_gcm_tls12_ctx *gcm_ctx = - (struct aead_aes_gcm_tls12_ctx *) &ctx->state; - - if (nonce_len != AES_GCM_NONCE_LENGTH) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE); - return 0; - } - 
- // The given nonces must be strictly monotonically increasing. - uint64_t given_counter = - CRYPTO_load_u64_be(nonce + nonce_len - sizeof(uint64_t)); - if (given_counter == UINT64_MAX || given_counter < gcm_ctx->min_next_nonce) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_NONCE); - return 0; - } - - gcm_ctx->min_next_nonce = given_counter + 1; - - if (!aead_aes_gcm_seal_scatter(ctx, out, out_tag, out_tag_len, - max_out_tag_len, nonce, nonce_len, in, in_len, - extra_in, extra_in_len, ad, ad_len)) { - return 0; - } - - AEAD_GCM_verify_service_indicator(ctx); - return 1; -} - -DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_128_gcm_tls12) { - memset(out, 0, sizeof(EVP_AEAD)); - - out->key_len = 16; - out->nonce_len = AES_GCM_NONCE_LENGTH; - out->overhead = EVP_AEAD_AES_GCM_TAG_LEN; - out->max_tag_len = EVP_AEAD_AES_GCM_TAG_LEN; - out->seal_scatter_supports_extra_in = 1; - - out->init = aead_aes_gcm_tls12_init; - out->cleanup = aead_aes_gcm_cleanup; - out->seal_scatter = aead_aes_gcm_tls12_seal_scatter; - out->open_gather = aead_aes_gcm_open_gather; -} - -DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_256_gcm_tls12) { - memset(out, 0, sizeof(EVP_AEAD)); - - out->key_len = 32; - out->nonce_len = AES_GCM_NONCE_LENGTH; - out->overhead = EVP_AEAD_AES_GCM_TAG_LEN; - out->max_tag_len = EVP_AEAD_AES_GCM_TAG_LEN; - out->seal_scatter_supports_extra_in = 1; - - out->init = aead_aes_gcm_tls12_init; - out->cleanup = aead_aes_gcm_cleanup; - out->seal_scatter = aead_aes_gcm_tls12_seal_scatter; - out->open_gather = aead_aes_gcm_open_gather; -} - -struct aead_aes_gcm_tls13_ctx { - struct aead_aes_gcm_ctx gcm_ctx; - uint64_t min_next_nonce; - uint64_t mask; - uint8_t first; -}; - -static_assert(sizeof(((EVP_AEAD_CTX *)NULL)->state) >= - sizeof(struct aead_aes_gcm_tls13_ctx), - "AEAD state is too small"); -static_assert(alignof(union evp_aead_ctx_st_state) >= - alignof(struct aead_aes_gcm_tls13_ctx), - "AEAD state has insufficient alignment"); - -static int 
aead_aes_gcm_tls13_init(EVP_AEAD_CTX *ctx, const uint8_t *key, - size_t key_len, size_t requested_tag_len) { - struct aead_aes_gcm_tls13_ctx *gcm_ctx = - (struct aead_aes_gcm_tls13_ctx *) &ctx->state; - - gcm_ctx->min_next_nonce = 0; - gcm_ctx->first = 1; - - size_t actual_tag_len; - if (!aead_aes_gcm_init_impl(&gcm_ctx->gcm_ctx, &actual_tag_len, key, key_len, - requested_tag_len)) { - return 0; - } - - ctx->tag_len = actual_tag_len; - return 1; -} - -static int aead_aes_gcm_tls13_seal_scatter( - const EVP_AEAD_CTX *ctx, uint8_t *out, uint8_t *out_tag, - size_t *out_tag_len, size_t max_out_tag_len, const uint8_t *nonce, - size_t nonce_len, const uint8_t *in, size_t in_len, const uint8_t *extra_in, - size_t extra_in_len, const uint8_t *ad, size_t ad_len) { - struct aead_aes_gcm_tls13_ctx *gcm_ctx = - (struct aead_aes_gcm_tls13_ctx *) &ctx->state; - - if (nonce_len != AES_GCM_NONCE_LENGTH) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE); - return 0; - } - - // The given nonces must be strictly monotonically increasing. See - // https://tools.ietf.org/html/rfc8446#section-5.3 for details of the TLS 1.3 - // nonce construction. - uint64_t given_counter = - CRYPTO_load_u64_be(nonce + nonce_len - sizeof(uint64_t)); - - if (gcm_ctx->first) { - // In the first call the sequence number will be zero and therefore the - // given nonce will be 0 ^ mask = mask. 
- gcm_ctx->mask = given_counter; - gcm_ctx->first = 0; - } - given_counter ^= gcm_ctx->mask; - - if (given_counter == UINT64_MAX || - given_counter < gcm_ctx->min_next_nonce) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_NONCE); - return 0; - } - - gcm_ctx->min_next_nonce = given_counter + 1; - - if (!aead_aes_gcm_seal_scatter(ctx, out, out_tag, out_tag_len, - max_out_tag_len, nonce, nonce_len, in, in_len, - extra_in, extra_in_len, ad, ad_len)) { - return 0; - } - - AEAD_GCM_verify_service_indicator(ctx); - return 1; -} - -DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_128_gcm_tls13) { - memset(out, 0, sizeof(EVP_AEAD)); - - out->key_len = 16; - out->nonce_len = AES_GCM_NONCE_LENGTH; - out->overhead = EVP_AEAD_AES_GCM_TAG_LEN; - out->max_tag_len = EVP_AEAD_AES_GCM_TAG_LEN; - out->seal_scatter_supports_extra_in = 1; - - out->init = aead_aes_gcm_tls13_init; - out->cleanup = aead_aes_gcm_cleanup; - out->seal_scatter = aead_aes_gcm_tls13_seal_scatter; - out->open_gather = aead_aes_gcm_open_gather; -} - -DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_256_gcm_tls13) { - memset(out, 0, sizeof(EVP_AEAD)); - - out->key_len = 32; - out->nonce_len = AES_GCM_NONCE_LENGTH; - out->overhead = EVP_AEAD_AES_GCM_TAG_LEN; - out->max_tag_len = EVP_AEAD_AES_GCM_TAG_LEN; - out->seal_scatter_supports_extra_in = 1; - - out->init = aead_aes_gcm_tls13_init; - out->cleanup = aead_aes_gcm_cleanup; - out->seal_scatter = aead_aes_gcm_tls13_seal_scatter; - out->open_gather = aead_aes_gcm_open_gather; -} - -int EVP_has_aes_hardware(void) { -#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64) - return hwaes_capable() && crypto_gcm_clmul_enabled(); -#elif defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64) - return hwaes_capable() && CRYPTO_is_ARMv8_PMULL_capable(); -#elif defined(OPENSSL_PPC64LE) - return CRYPTO_is_PPC64LE_vcrypto_capable(); -#else - return 0; -#endif -} - -OPENSSL_MSVC_PRAGMA(warning(pop)) diff --git a/third_party/boringssl/src/crypto/fipsmodule/cipher/e_aes.cc.inc 
b/third_party/boringssl/src/crypto/fipsmodule/cipher/e_aes.cc.inc new file mode 100644 index 00000000..4ddf4b95 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/cipher/e_aes.cc.inc @@ -0,0 +1,1241 @@ +// Copyright 2001-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "../../internal.h" +#include "../aes/internal.h" +#include "../bcm_interface.h" +#include "../delocate.h" +#include "../service_indicator/internal.h" +#include "internal.h" + + +using namespace bssl; + +#define AES_GCM_NONCE_LENGTH 12 + +typedef struct { + union { + double align; + AES_KEY ks; + } ks; + block128_f block; + union { + cbc128_f cbc; + ctr128_f ctr; + } stream; +} EVP_AES_KEY; + +typedef struct { + GCM128_KEY key; + GCM128_CONTEXT gcm; + int key_set; // Set if key initialised + int iv_set; // Set if an iv is set + uint8_t *iv; // Temporary IV store + int ivlen; // IV length + int taglen; + int iv_gen; // It is OK to generate IVs + ctr128_f ctr; +} EVP_AES_GCM_CTX; + +static int aes_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key, + const uint8_t *iv, int enc) { + int ret; + EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data; + const int mode = ctx->cipher->flags & EVP_CIPH_MODE_MASK; + + if (mode == EVP_CIPH_CTR_MODE) { + switch (ctx->key_len) { + case 16: + 
boringssl_fips_inc_counter(fips_counter_evp_aes_128_ctr); + break; + + case 32: + boringssl_fips_inc_counter(fips_counter_evp_aes_256_ctr); + break; + } + } + + if ((mode == EVP_CIPH_ECB_MODE || mode == EVP_CIPH_CBC_MODE) && !enc) { + if (hwaes_capable()) { + ret = aes_hw_set_decrypt_key(key, ctx->key_len * 8, &dat->ks.ks); + dat->block = aes_hw_decrypt; + dat->stream.cbc = nullptr; + if (mode == EVP_CIPH_CBC_MODE) { + dat->stream.cbc = aes_hw_cbc_encrypt; + } +#if defined(BSAES) + } else if (bsaes_capable() && mode == EVP_CIPH_CBC_MODE) { + assert(vpaes_capable()); + ret = vpaes_set_decrypt_key(key, ctx->key_len * 8, &dat->ks.ks); + if (ret == 0) { + vpaes_decrypt_key_to_bsaes(&dat->ks.ks, &dat->ks.ks); + } + // If |dat->stream.cbc| is provided, |dat->block| is never used. + dat->block = nullptr; + dat->stream.cbc = bsaes_cbc_encrypt; +#endif + } else if (vpaes_capable()) { + ret = vpaes_set_decrypt_key(key, ctx->key_len * 8, &dat->ks.ks); + dat->block = vpaes_decrypt; + dat->stream.cbc = nullptr; +#if defined(VPAES_CBC) + if (mode == EVP_CIPH_CBC_MODE) { + dat->stream.cbc = vpaes_cbc_encrypt; + } +#endif + } else { + ret = aes_nohw_set_decrypt_key(key, ctx->key_len * 8, &dat->ks.ks); + dat->block = aes_nohw_decrypt; + dat->stream.cbc = nullptr; + if (mode == EVP_CIPH_CBC_MODE) { + dat->stream.cbc = aes_nohw_cbc_encrypt; + } + } + } else if (hwaes_capable()) { + ret = aes_hw_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks); + dat->block = aes_hw_encrypt; + dat->stream.cbc = nullptr; + if (mode == EVP_CIPH_CBC_MODE) { + dat->stream.cbc = aes_hw_cbc_encrypt; + } else if (mode == EVP_CIPH_CTR_MODE) { + dat->stream.ctr = aes_hw_ctr32_encrypt_blocks; + } + } else if (vpaes_capable()) { + ret = vpaes_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks); + dat->block = vpaes_encrypt; + dat->stream.cbc = nullptr; +#if defined(VPAES_CBC) + if (mode == EVP_CIPH_CBC_MODE) { + dat->stream.cbc = vpaes_cbc_encrypt; + } +#endif + if (mode == EVP_CIPH_CTR_MODE) { +#if 
defined(BSAES) + assert(bsaes_capable()); + dat->stream.ctr = vpaes_ctr32_encrypt_blocks_with_bsaes; +#else + dat->stream.ctr = vpaes_ctr32_encrypt_blocks; +#endif + } + } else { + ret = aes_nohw_set_encrypt_key(key, ctx->key_len * 8, &dat->ks.ks); + dat->block = aes_nohw_encrypt; + dat->stream.cbc = nullptr; + if (mode == EVP_CIPH_CBC_MODE) { + dat->stream.cbc = aes_nohw_cbc_encrypt; + } else if (mode == EVP_CIPH_CTR_MODE) { + dat->stream.ctr = aes_nohw_ctr32_encrypt_blocks; + } + } + + if (ret < 0) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_AES_KEY_SETUP_FAILED); + return 0; + } + + return 1; +} + +static int aes_cbc_cipher_update(EVP_CIPHER_CTX *ctx, uint8_t *out, + const uint8_t *in, size_t len) { + EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data; + + if (dat->stream.cbc) { + (*dat->stream.cbc)(in, out, len, &dat->ks.ks, ctx->iv, ctx->encrypt); + } else if (ctx->encrypt) { + CRYPTO_cbc128_encrypt(in, out, len, &dat->ks.ks, ctx->iv, dat->block); + } else { + CRYPTO_cbc128_decrypt(in, out, len, &dat->ks.ks, ctx->iv, dat->block); + } + + return 1; +} + +static int aes_ecb_cipher_update(EVP_CIPHER_CTX *ctx, uint8_t *out, + const uint8_t *in, size_t len) { + size_t bl = ctx->cipher->block_size; + EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data; + + if (len < bl) { + return 1; + } + + len -= bl; + for (size_t i = 0; i <= len; i += bl) { + (*dat->block)(in + i, out + i, &dat->ks.ks); + } + + return 1; +} + +static int aes_ctr_cipher_update(EVP_CIPHER_CTX *ctx, uint8_t *out, + const uint8_t *in, size_t len) { + EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data; + CRYPTO_ctr128_encrypt_ctr32(in, out, len, &dat->ks.ks, ctx->iv, ctx->buf, + &ctx->num, dat->stream.ctr); + return 1; +} + +static int aes_ofb_cipher_update(EVP_CIPHER_CTX *ctx, uint8_t *out, + const uint8_t *in, size_t len) { + EVP_AES_KEY *dat = (EVP_AES_KEY *)ctx->cipher_data; + + CRYPTO_ofb128_encrypt(in, out, len, &dat->ks.ks, ctx->iv, &ctx->num, + dat->block); + return 1; +} + +static int 
aes_gcm_init_key(EVP_CIPHER_CTX *ctx, const uint8_t *key, + const uint8_t *iv, int enc) { + EVP_AES_GCM_CTX *gctx = reinterpret_cast(ctx->cipher_data); + if (!iv && !key) { + return 1; + } + + // We must configure first the key, then the IV, but the caller may pass both + // together, or separately in either order. + if (key) { + OPENSSL_memset(&gctx->gcm, 0, sizeof(gctx->gcm)); + CRYPTO_gcm128_init_aes_key(&gctx->key, key, ctx->key_len); + // Use the IV if specified. Otherwise, use the saved IV, if any. + if (iv == nullptr && gctx->iv_set) { + iv = gctx->iv; + } + if (iv) { + CRYPTO_gcm128_init_ctx(&gctx->key, &gctx->gcm, iv, gctx->ivlen); + gctx->iv_set = 1; + } + gctx->key_set = 1; + } else { + if (gctx->key_set) { + CRYPTO_gcm128_init_ctx(&gctx->key, &gctx->gcm, iv, gctx->ivlen); + } else { + // The caller specified the IV before the key. Save the IV for later. + OPENSSL_memcpy(gctx->iv, iv, gctx->ivlen); + } + gctx->iv_set = 1; + gctx->iv_gen = 0; + } + return 1; +} + +static void aes_gcm_cleanup(EVP_CIPHER_CTX *c) { + EVP_AES_GCM_CTX *gctx = reinterpret_cast(c->cipher_data); + OPENSSL_cleanse(&gctx->key, sizeof(gctx->key)); + OPENSSL_cleanse(&gctx->gcm, sizeof(gctx->gcm)); + if (gctx->iv != c->iv) { + OPENSSL_free(gctx->iv); + } +} + +static int aes_gcm_ctrl(EVP_CIPHER_CTX *c, int type, int arg, void *ptr) { + EVP_AES_GCM_CTX *gctx = reinterpret_cast(c->cipher_data); + switch (type) { + case EVP_CTRL_INIT: + gctx->key_set = 0; + gctx->iv_set = 0; + gctx->ivlen = c->cipher->iv_len; + gctx->iv = c->iv; + gctx->taglen = -1; + gctx->iv_gen = 0; + return 1; + + case EVP_CTRL_AEAD_SET_IVLEN: + if (arg <= 0) { + return 0; + } + if (arg == gctx->ivlen) { + // Some applications set the IV and then do a no-op length change, + // expecting the IV to remain set. + return 1; + } + + // Clear any existing IV or partial IV that may have previously been set. 
+ gctx->iv_set = 0; + gctx->iv_gen = 0; + + if (arg <= EVP_MAX_IV_LENGTH) { + // IVs less than EVP_MAX_IV_LENGTH are stored in the built-in buffer. + if (gctx->iv != c->iv) { + OPENSSL_free(gctx->iv); + gctx->iv = c->iv; + } + } else if (arg > gctx->ivlen) { + // The existing buffer isn't large enough. + uint8_t *new_iv = static_cast(OPENSSL_malloc(arg)); + if (new_iv == nullptr) { + return 0; + } + if (gctx->iv != c->iv) { + OPENSSL_free(gctx->iv); + } + gctx->iv = new_iv; + } + gctx->ivlen = arg; + return 1; + + case EVP_CTRL_GET_IVLEN: + *(int *)ptr = gctx->ivlen; + return 1; + + case EVP_CTRL_AEAD_SET_TAG: + if (arg <= 0 || arg > 16 || c->encrypt) { + return 0; + } + OPENSSL_memcpy(c->buf, ptr, arg); + gctx->taglen = arg; + return 1; + + case EVP_CTRL_AEAD_GET_TAG: + if (arg <= 0 || arg > 16 || !c->encrypt || gctx->taglen < 0) { + return 0; + } + OPENSSL_memcpy(ptr, c->buf, arg); + return 1; + + case EVP_CTRL_AEAD_SET_IV_FIXED: + // Special case: -1 length restores whole IV + if (arg == -1) { + OPENSSL_memcpy(gctx->iv, ptr, gctx->ivlen); + gctx->iv_gen = 1; + return 1; + } + // Fixed field must be at least 4 bytes and invocation field + // at least 8. + if (arg < 4 || (gctx->ivlen - arg) < 8) { + return 0; + } + OPENSSL_memcpy(gctx->iv, ptr, arg); + if (c->encrypt) { + // |BCM_rand_bytes| calls within the fipsmodule should be wrapped with + // state lock functions to avoid updating the service indicator with the + // DRBG functions. 
+ FIPS_service_indicator_lock_state(); + BCM_rand_bytes(gctx->iv + arg, gctx->ivlen - arg); + FIPS_service_indicator_unlock_state(); + } + gctx->iv_gen = 1; + return 1; + + case EVP_CTRL_GCM_IV_GEN: { + if (gctx->iv_gen == 0 || gctx->key_set == 0) { + return 0; + } + CRYPTO_gcm128_init_ctx(&gctx->key, &gctx->gcm, gctx->iv, gctx->ivlen); + if (arg <= 0 || arg > gctx->ivlen) { + arg = gctx->ivlen; + } + OPENSSL_memcpy(ptr, gctx->iv + gctx->ivlen - arg, arg); + // Invocation field will be at least 8 bytes in size, so no need to check + // wrap around or increment more than last 8 bytes. + uint8_t *ctr = gctx->iv + gctx->ivlen - 8; + CRYPTO_store_u64_be(ctr, CRYPTO_load_u64_be(ctr) + 1); + gctx->iv_set = 1; + return 1; + } + + case EVP_CTRL_GCM_SET_IV_INV: + if (gctx->iv_gen == 0 || gctx->key_set == 0 || c->encrypt) { + return 0; + } + OPENSSL_memcpy(gctx->iv + gctx->ivlen - arg, ptr, arg); + CRYPTO_gcm128_init_ctx(&gctx->key, &gctx->gcm, gctx->iv, gctx->ivlen); + gctx->iv_set = 1; + return 1; + + case EVP_CTRL_COPY: { + EVP_CIPHER_CTX *out = reinterpret_cast(ptr); + EVP_AES_GCM_CTX *gctx_out = + reinterpret_cast(out->cipher_data); + if (gctx->iv == c->iv) { + gctx_out->iv = out->iv; + } else { + gctx_out->iv = + reinterpret_cast(OPENSSL_memdup(gctx->iv, gctx->ivlen)); + if (!gctx_out->iv) { + return 0; + } + } + return 1; + } + + default: + return -1; + } +} + +static int aes_gcm_cipher_update(EVP_CIPHER_CTX *ctx, uint8_t *out, + const uint8_t *in, size_t len) { + EVP_AES_GCM_CTX *gctx = reinterpret_cast(ctx->cipher_data); + + // If not set up, return error + if (!gctx->key_set || !gctx->iv_set) { + return 0; + } + + if (ctx->encrypt) { + return CRYPTO_gcm128_encrypt(&gctx->key, &gctx->gcm, in, out, len); + } else { + return CRYPTO_gcm128_decrypt(&gctx->key, &gctx->gcm, in, out, len); + } +} + +static int aes_gcm_cipher_final(EVP_CIPHER_CTX *ctx) { + EVP_AES_GCM_CTX *gctx = reinterpret_cast(ctx->cipher_data); + + // If not set up, return error + if (!gctx->key_set || 
!gctx->iv_set) { + return 0; + } + + if (!ctx->encrypt) { + if (gctx->taglen < 0 || + !CRYPTO_gcm128_finish(&gctx->key, &gctx->gcm, ctx->buf, gctx->taglen)) { + return 0; + } + gctx->iv_set = 0; + return 1; + } + CRYPTO_gcm128_tag(&gctx->key, &gctx->gcm, ctx->buf, 16); + gctx->taglen = 16; + // Don't reuse the IV + gctx->iv_set = 0; + return 1; +} + +static int aes_gcm_update_aad(EVP_CIPHER_CTX *ctx, const uint8_t *in, + size_t len) { + EVP_AES_GCM_CTX *gctx = reinterpret_cast(ctx->cipher_data); + + // If not set up, return error + if (!gctx->key_set || !gctx->iv_set) { + return 0; + } + + return CRYPTO_gcm128_aad(&gctx->key, &gctx->gcm, in, len); +} + +DEFINE_METHOD_FUNCTION(EVP_CIPHER, EVP_aes_128_cbc) { + memset(out, 0, sizeof(EVP_CIPHER)); + + out->nid = NID_aes_128_cbc; + out->block_size = 16; + out->key_len = 16; + out->iv_len = 16; + out->ctx_size = sizeof(EVP_AES_KEY); + out->flags = EVP_CIPH_CBC_MODE; + out->init = aes_init_key; + out->cipher_update = aes_cbc_cipher_update; +} + +DEFINE_METHOD_FUNCTION(EVP_CIPHER, EVP_aes_128_ctr) { + memset(out, 0, sizeof(EVP_CIPHER)); + + out->nid = NID_aes_128_ctr; + out->block_size = 1; + out->key_len = 16; + out->iv_len = 16; + out->ctx_size = sizeof(EVP_AES_KEY); + out->flags = EVP_CIPH_CTR_MODE; + out->init = aes_init_key; + out->cipher_update = aes_ctr_cipher_update; +} + +DEFINE_LOCAL_DATA(EVP_CIPHER, aes_128_ecb_generic) { + memset(out, 0, sizeof(EVP_CIPHER)); + + out->nid = NID_aes_128_ecb; + out->block_size = 16; + out->key_len = 16; + out->ctx_size = sizeof(EVP_AES_KEY); + out->flags = EVP_CIPH_ECB_MODE; + out->init = aes_init_key; + out->cipher_update = aes_ecb_cipher_update; +} + +DEFINE_METHOD_FUNCTION(EVP_CIPHER, EVP_aes_128_ofb) { + memset(out, 0, sizeof(EVP_CIPHER)); + + out->nid = NID_aes_128_ofb128; + out->block_size = 1; + out->key_len = 16; + out->iv_len = 16; + out->ctx_size = sizeof(EVP_AES_KEY); + out->flags = EVP_CIPH_OFB_MODE; + out->init = aes_init_key; + out->cipher_update = 
aes_ofb_cipher_update; +} + +DEFINE_METHOD_FUNCTION(EVP_CIPHER, EVP_aes_128_gcm) { + memset(out, 0, sizeof(EVP_CIPHER)); + + out->nid = NID_aes_128_gcm; + out->block_size = 1; + out->key_len = 16; + out->iv_len = AES_GCM_NONCE_LENGTH; + out->ctx_size = sizeof(EVP_AES_GCM_CTX); + out->flags = EVP_CIPH_GCM_MODE | EVP_CIPH_CUSTOM_IV | EVP_CIPH_CUSTOM_COPY | + EVP_CIPH_FLAG_CUSTOM_CIPHER | EVP_CIPH_ALWAYS_CALL_INIT | + EVP_CIPH_CTRL_INIT | EVP_CIPH_FLAG_AEAD_CIPHER; + out->init = aes_gcm_init_key; + out->cipher_update = aes_gcm_cipher_update; + out->cipher_final = aes_gcm_cipher_final; + out->update_aad = aes_gcm_update_aad; + out->cleanup = aes_gcm_cleanup; + out->ctrl = aes_gcm_ctrl; +} + +DEFINE_METHOD_FUNCTION(EVP_CIPHER, EVP_aes_192_cbc) { + memset(out, 0, sizeof(EVP_CIPHER)); + + out->nid = NID_aes_192_cbc; + out->block_size = 16; + out->key_len = 24; + out->iv_len = 16; + out->ctx_size = sizeof(EVP_AES_KEY); + out->flags = EVP_CIPH_CBC_MODE; + out->init = aes_init_key; + out->cipher_update = aes_cbc_cipher_update; +} + +DEFINE_METHOD_FUNCTION(EVP_CIPHER, EVP_aes_192_ctr) { + memset(out, 0, sizeof(EVP_CIPHER)); + + out->nid = NID_aes_192_ctr; + out->block_size = 1; + out->key_len = 24; + out->iv_len = 16; + out->ctx_size = sizeof(EVP_AES_KEY); + out->flags = EVP_CIPH_CTR_MODE; + out->init = aes_init_key; + out->cipher_update = aes_ctr_cipher_update; +} + +DEFINE_LOCAL_DATA(EVP_CIPHER, aes_192_ecb_generic) { + memset(out, 0, sizeof(EVP_CIPHER)); + + out->nid = NID_aes_192_ecb; + out->block_size = 16; + out->key_len = 24; + out->ctx_size = sizeof(EVP_AES_KEY); + out->flags = EVP_CIPH_ECB_MODE; + out->init = aes_init_key; + out->cipher_update = aes_ecb_cipher_update; +} + +DEFINE_METHOD_FUNCTION(EVP_CIPHER, EVP_aes_192_ofb) { + memset(out, 0, sizeof(EVP_CIPHER)); + + out->nid = NID_aes_192_ofb128; + out->block_size = 1; + out->key_len = 24; + out->iv_len = 16; + out->ctx_size = sizeof(EVP_AES_KEY); + out->flags = EVP_CIPH_OFB_MODE; + out->init = aes_init_key; + 
out->cipher_update = aes_ofb_cipher_update; +} + +DEFINE_METHOD_FUNCTION(EVP_CIPHER, EVP_aes_192_gcm) { + memset(out, 0, sizeof(EVP_CIPHER)); + + out->nid = NID_aes_192_gcm; + out->block_size = 1; + out->key_len = 24; + out->iv_len = AES_GCM_NONCE_LENGTH; + out->ctx_size = sizeof(EVP_AES_GCM_CTX); + out->flags = EVP_CIPH_GCM_MODE | EVP_CIPH_CUSTOM_IV | EVP_CIPH_CUSTOM_COPY | + EVP_CIPH_FLAG_CUSTOM_CIPHER | EVP_CIPH_ALWAYS_CALL_INIT | + EVP_CIPH_CTRL_INIT | EVP_CIPH_FLAG_AEAD_CIPHER; + out->init = aes_gcm_init_key; + out->cipher_update = aes_gcm_cipher_update; + out->cipher_final = aes_gcm_cipher_final; + out->update_aad = aes_gcm_update_aad; + out->cleanup = aes_gcm_cleanup; + out->ctrl = aes_gcm_ctrl; +} + +DEFINE_METHOD_FUNCTION(EVP_CIPHER, EVP_aes_256_cbc) { + memset(out, 0, sizeof(EVP_CIPHER)); + + out->nid = NID_aes_256_cbc; + out->block_size = 16; + out->key_len = 32; + out->iv_len = 16; + out->ctx_size = sizeof(EVP_AES_KEY); + out->flags = EVP_CIPH_CBC_MODE; + out->init = aes_init_key; + out->cipher_update = aes_cbc_cipher_update; +} + +DEFINE_METHOD_FUNCTION(EVP_CIPHER, EVP_aes_256_ctr) { + memset(out, 0, sizeof(EVP_CIPHER)); + + out->nid = NID_aes_256_ctr; + out->block_size = 1; + out->key_len = 32; + out->iv_len = 16; + out->ctx_size = sizeof(EVP_AES_KEY); + out->flags = EVP_CIPH_CTR_MODE; + out->init = aes_init_key; + out->cipher_update = aes_ctr_cipher_update; +} + +DEFINE_LOCAL_DATA(EVP_CIPHER, aes_256_ecb_generic) { + memset(out, 0, sizeof(EVP_CIPHER)); + + out->nid = NID_aes_256_ecb; + out->block_size = 16; + out->key_len = 32; + out->ctx_size = sizeof(EVP_AES_KEY); + out->flags = EVP_CIPH_ECB_MODE; + out->init = aes_init_key; + out->cipher_update = aes_ecb_cipher_update; +} + +DEFINE_METHOD_FUNCTION(EVP_CIPHER, EVP_aes_256_ofb) { + memset(out, 0, sizeof(EVP_CIPHER)); + + out->nid = NID_aes_256_ofb128; + out->block_size = 1; + out->key_len = 32; + out->iv_len = 16; + out->ctx_size = sizeof(EVP_AES_KEY); + out->flags = EVP_CIPH_OFB_MODE; + out->init = 
aes_init_key; + out->cipher_update = aes_ofb_cipher_update; +} + +DEFINE_METHOD_FUNCTION(EVP_CIPHER, EVP_aes_256_gcm) { + memset(out, 0, sizeof(EVP_CIPHER)); + + out->nid = NID_aes_256_gcm; + out->block_size = 1; + out->key_len = 32; + out->iv_len = AES_GCM_NONCE_LENGTH; + out->ctx_size = sizeof(EVP_AES_GCM_CTX); + out->flags = EVP_CIPH_GCM_MODE | EVP_CIPH_CUSTOM_IV | EVP_CIPH_CUSTOM_COPY | + EVP_CIPH_FLAG_CUSTOM_CIPHER | EVP_CIPH_ALWAYS_CALL_INIT | + EVP_CIPH_CTRL_INIT | EVP_CIPH_FLAG_AEAD_CIPHER; + out->init = aes_gcm_init_key; + out->cipher_update = aes_gcm_cipher_update; + out->cipher_final = aes_gcm_cipher_final; + out->update_aad = aes_gcm_update_aad; + out->cleanup = aes_gcm_cleanup; + out->ctrl = aes_gcm_ctrl; +} + +#if defined(HWAES_ECB) + +static int aes_hw_ecb_cipher_update(EVP_CIPHER_CTX *ctx, uint8_t *out, + const uint8_t *in, size_t len) { + size_t bl = ctx->cipher->block_size; + + if (len < bl) { + return 1; + } + + aes_hw_ecb_encrypt(in, out, len, + reinterpret_cast(ctx->cipher_data), + ctx->encrypt); + + return 1; +} + +DEFINE_LOCAL_DATA(EVP_CIPHER, aes_hw_128_ecb) { + memset(out, 0, sizeof(EVP_CIPHER)); + + out->nid = NID_aes_128_ecb; + out->block_size = 16; + out->key_len = 16; + out->ctx_size = sizeof(EVP_AES_KEY); + out->flags = EVP_CIPH_ECB_MODE; + out->init = aes_init_key; + out->cipher_update = aes_hw_ecb_cipher_update; +} + +DEFINE_LOCAL_DATA(EVP_CIPHER, aes_hw_192_ecb) { + memset(out, 0, sizeof(EVP_CIPHER)); + + out->nid = NID_aes_192_ecb; + out->block_size = 16; + out->key_len = 24; + out->ctx_size = sizeof(EVP_AES_KEY); + out->flags = EVP_CIPH_ECB_MODE; + out->init = aes_init_key; + out->cipher_update = aes_hw_ecb_cipher_update; +} + +DEFINE_LOCAL_DATA(EVP_CIPHER, aes_hw_256_ecb) { + memset(out, 0, sizeof(EVP_CIPHER)); + + out->nid = NID_aes_256_ecb; + out->block_size = 16; + out->key_len = 32; + out->ctx_size = sizeof(EVP_AES_KEY); + out->flags = EVP_CIPH_ECB_MODE; + out->init = aes_init_key; + out->cipher_update = 
aes_hw_ecb_cipher_update; +} + +#define EVP_ECB_CIPHER_FUNCTION(keybits) \ + const EVP_CIPHER *EVP_aes_##keybits##_ecb() { \ + if (hwaes_capable()) { \ + return aes_hw_##keybits##_ecb(); \ + } \ + return aes_##keybits##_ecb_generic(); \ + } + +#else + +#define EVP_ECB_CIPHER_FUNCTION(keybits) \ + const EVP_CIPHER *EVP_aes_##keybits##_ecb() { \ + return aes_##keybits##_ecb_generic(); \ + } + +#endif // HWAES_ECB + +EVP_ECB_CIPHER_FUNCTION(128) +EVP_ECB_CIPHER_FUNCTION(192) +EVP_ECB_CIPHER_FUNCTION(256) + + +#define EVP_AEAD_AES_GCM_TAG_LEN 16 + +namespace { +struct aead_aes_gcm_ctx { + GCM128_KEY key; +}; +} // namespace + +static int aead_aes_gcm_init_impl(struct aead_aes_gcm_ctx *gcm_ctx, + size_t *out_tag_len, const uint8_t *key, + size_t key_len, size_t tag_len) { + const size_t key_bits = key_len * 8; + if (key_bits != 128 && key_bits != 192 && key_bits != 256) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_KEY_LENGTH); + return 0; // EVP_AEAD_CTX_init should catch this. + } + + if (tag_len == EVP_AEAD_DEFAULT_TAG_LENGTH) { + tag_len = EVP_AEAD_AES_GCM_TAG_LEN; + } + + if (tag_len > EVP_AEAD_AES_GCM_TAG_LEN) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TAG_TOO_LARGE); + return 0; + } + + CRYPTO_gcm128_init_aes_key(&gcm_ctx->key, key, key_len); + *out_tag_len = tag_len; + return 1; +} + +static_assert(sizeof(((EVP_AEAD_CTX *)nullptr)->state) >= + sizeof(struct aead_aes_gcm_ctx), + "AEAD state is too small"); +static_assert(alignof(union evp_aead_ctx_st_state) >= + alignof(struct aead_aes_gcm_ctx), + "AEAD state has insufficient alignment"); + +static int aead_aes_gcm_init(EVP_AEAD_CTX *ctx, const uint8_t *key, + size_t key_len, size_t requested_tag_len) { + struct aead_aes_gcm_ctx *gcm_ctx = (struct aead_aes_gcm_ctx *)&ctx->state; + + size_t actual_tag_len; + if (!aead_aes_gcm_init_impl(gcm_ctx, &actual_tag_len, key, key_len, + requested_tag_len)) { + return 0; + } + + ctx->tag_len = actual_tag_len; + return 1; +} + +static void aead_aes_gcm_cleanup(EVP_AEAD_CTX *ctx) {} 
+ +static int aead_aes_gcm_sealv_impl(const struct aead_aes_gcm_ctx *gcm_ctx, + Span iovecs, + Span out_tag, size_t *out_tag_len, + Span nonce, + Span aadvecs, + size_t tag_len) { + if (out_tag.size() < tag_len) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BUFFER_TOO_SMALL); + return 0; + } + + if (nonce.size() == 0) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_NONCE_SIZE); + return 0; + } + + const GCM128_KEY *key = &gcm_ctx->key; + GCM128_CONTEXT gcm; + CRYPTO_gcm128_init_ctx(key, &gcm, nonce.data(), nonce.size()); + + for (const CRYPTO_IVEC &aadvec : aadvecs) { + if (!CRYPTO_gcm128_aad(key, &gcm, aadvec.in, aadvec.len)) { + return 0; + } + } + + for (const CRYPTO_IOVEC &iovec : iovecs) { + if (!CRYPTO_gcm128_encrypt(key, &gcm, iovec.in, iovec.out, iovec.len)) { + return 0; + } + } + + CRYPTO_gcm128_tag(key, &gcm, out_tag.data(), tag_len); + *out_tag_len = tag_len; + + return 1; +} + +static int aead_aes_gcm_sealv(const EVP_AEAD_CTX *ctx, + Span iovecs, + Span out_tag, size_t *out_tag_len, + Span nonce, + Span aadvecs) { + const struct aead_aes_gcm_ctx *gcm_ctx = + (const struct aead_aes_gcm_ctx *)&ctx->state; + return aead_aes_gcm_sealv_impl(gcm_ctx, iovecs, out_tag, out_tag_len, nonce, + aadvecs, ctx->tag_len); +} + +static int aead_aes_gcm_openv_detached_impl( + const struct aead_aes_gcm_ctx *gcm_ctx, Span iovecs, + Span nonce, Span in_tag, + Span aadvecs, size_t tag_len) { + uint8_t tag[EVP_AEAD_AES_GCM_TAG_LEN]; + + if (nonce.size() == 0) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_NONCE_SIZE); + return 0; + } + + if (in_tag.size() != tag_len) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); + return 0; + } + + const GCM128_KEY *key = &gcm_ctx->key; + GCM128_CONTEXT gcm; + CRYPTO_gcm128_init_ctx(key, &gcm, nonce.data(), nonce.size()); + + for (const CRYPTO_IVEC &aadvec : aadvecs) { + if (!CRYPTO_gcm128_aad(key, &gcm, aadvec.in, aadvec.len)) { + return 0; + } + } + + for (const CRYPTO_IOVEC &iovec : iovecs) { + if (!CRYPTO_gcm128_decrypt(key, &gcm, 
iovec.in, iovec.out, iovec.len)) { + return 0; + } + } + + CRYPTO_gcm128_tag(key, &gcm, tag, tag_len); + if (CRYPTO_memcmp(tag, in_tag.data(), tag_len) != 0) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); + return 0; + } + + return 1; +} + +static int aead_aes_gcm_openv_detached(const EVP_AEAD_CTX *ctx, + Span iovecs, + Span nonce, + Span in_tag, + Span aadvecs) { + struct aead_aes_gcm_ctx *gcm_ctx = (struct aead_aes_gcm_ctx *)&ctx->state; + if (!aead_aes_gcm_openv_detached_impl(gcm_ctx, iovecs, nonce, in_tag, aadvecs, + ctx->tag_len)) { + return 0; + } + + AEAD_GCM_verify_service_indicator(ctx); + return 1; +} + +DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_128_gcm) { + memset(out, 0, sizeof(EVP_AEAD)); + + out->key_len = 16; + out->nonce_len = AES_GCM_NONCE_LENGTH; + out->overhead = EVP_AEAD_AES_GCM_TAG_LEN; + out->max_tag_len = EVP_AEAD_AES_GCM_TAG_LEN; + + out->init = aead_aes_gcm_init; + out->cleanup = aead_aes_gcm_cleanup; + out->sealv = aead_aes_gcm_sealv; + out->openv_detached = aead_aes_gcm_openv_detached; +} + +DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_192_gcm) { + memset(out, 0, sizeof(EVP_AEAD)); + + out->key_len = 24; + out->nonce_len = AES_GCM_NONCE_LENGTH; + out->overhead = EVP_AEAD_AES_GCM_TAG_LEN; + out->max_tag_len = EVP_AEAD_AES_GCM_TAG_LEN; + + out->init = aead_aes_gcm_init; + out->cleanup = aead_aes_gcm_cleanup; + out->sealv = aead_aes_gcm_sealv; + out->openv_detached = aead_aes_gcm_openv_detached; +} + +DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_256_gcm) { + memset(out, 0, sizeof(EVP_AEAD)); + + out->key_len = 32; + out->nonce_len = AES_GCM_NONCE_LENGTH; + out->overhead = EVP_AEAD_AES_GCM_TAG_LEN; + out->max_tag_len = EVP_AEAD_AES_GCM_TAG_LEN; + + out->init = aead_aes_gcm_init; + out->cleanup = aead_aes_gcm_cleanup; + out->sealv = aead_aes_gcm_sealv; + out->openv_detached = aead_aes_gcm_openv_detached; +} + +static int aead_aes_gcm_init_randnonce(EVP_AEAD_CTX *ctx, const uint8_t *key, + size_t key_len, + size_t requested_tag_len) 
{ + if (requested_tag_len != EVP_AEAD_DEFAULT_TAG_LENGTH) { + if (requested_tag_len < AES_GCM_NONCE_LENGTH) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BUFFER_TOO_SMALL); + return 0; + } + requested_tag_len -= AES_GCM_NONCE_LENGTH; + } + + if (!aead_aes_gcm_init(ctx, key, key_len, requested_tag_len)) { + return 0; + } + + ctx->tag_len += AES_GCM_NONCE_LENGTH; + return 1; +} + +static int aead_aes_gcm_sealv_randnonce(const EVP_AEAD_CTX *ctx, + Span iovecs, + Span out_tag, + size_t *out_tag_len, + Span external_nonce, + Span aadvecs) { + if (external_nonce.size() != 0) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_NONCE_SIZE); + return 0; + } + + uint8_t nonce[AES_GCM_NONCE_LENGTH]; + if (out_tag.size() < sizeof(nonce)) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BUFFER_TOO_SMALL); + return 0; + } + + // |BCM_rand_bytes| calls within the fipsmodule should be wrapped with state + // lock functions to avoid updating the service indicator with the DRBG + // functions. + FIPS_service_indicator_lock_state(); + BCM_rand_bytes(nonce, sizeof(nonce)); + FIPS_service_indicator_unlock_state(); + + const struct aead_aes_gcm_ctx *gcm_ctx = + (const struct aead_aes_gcm_ctx *)&ctx->state; + if (!aead_aes_gcm_sealv_impl( + gcm_ctx, iovecs, out_tag.first(out_tag.size() - AES_GCM_NONCE_LENGTH), + out_tag_len, nonce, aadvecs, ctx->tag_len - AES_GCM_NONCE_LENGTH)) { + return 0; + } + + assert(*out_tag_len + sizeof(nonce) <= out_tag.size()); + CopyToPrefix(nonce, out_tag.subspan(*out_tag_len)); + *out_tag_len += sizeof(nonce); + + AEAD_GCM_verify_service_indicator(ctx); + return 1; +} + +static int aead_aes_gcm_openv_detached_randnonce( + const EVP_AEAD_CTX *ctx, Span iovecs, + Span external_nonce, Span in_tag, + Span aadvecs) { + if (external_nonce.size() != 0) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_NONCE_SIZE); + return 0; + } + + if (in_tag.size() < AES_GCM_NONCE_LENGTH) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); + return 0; + } + + const struct aead_aes_gcm_ctx *gcm_ctx = + 
(const struct aead_aes_gcm_ctx *)&ctx->state; + if (!aead_aes_gcm_openv_detached_impl( + gcm_ctx, iovecs, in_tag.last(AES_GCM_NONCE_LENGTH), + in_tag.first(in_tag.size() - AES_GCM_NONCE_LENGTH), aadvecs, + ctx->tag_len - AES_GCM_NONCE_LENGTH)) { + return 0; + } + + AEAD_GCM_verify_service_indicator(ctx); + return 1; +} + +DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_128_gcm_randnonce) { + memset(out, 0, sizeof(EVP_AEAD)); + + out->key_len = 16; + out->nonce_len = 0; + out->overhead = EVP_AEAD_AES_GCM_TAG_LEN + AES_GCM_NONCE_LENGTH; + out->max_tag_len = EVP_AEAD_AES_GCM_TAG_LEN + AES_GCM_NONCE_LENGTH; + + out->init = aead_aes_gcm_init_randnonce; + out->cleanup = aead_aes_gcm_cleanup; + out->sealv = aead_aes_gcm_sealv_randnonce; + out->openv_detached = aead_aes_gcm_openv_detached_randnonce; +} + +DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_256_gcm_randnonce) { + memset(out, 0, sizeof(EVP_AEAD)); + + out->key_len = 32; + out->nonce_len = 0; + out->overhead = EVP_AEAD_AES_GCM_TAG_LEN + AES_GCM_NONCE_LENGTH; + out->max_tag_len = EVP_AEAD_AES_GCM_TAG_LEN + AES_GCM_NONCE_LENGTH; + + out->init = aead_aes_gcm_init_randnonce; + out->cleanup = aead_aes_gcm_cleanup; + out->sealv = aead_aes_gcm_sealv_randnonce; + out->openv_detached = aead_aes_gcm_openv_detached_randnonce; +} + +namespace { +struct aead_aes_gcm_tls12_ctx { + struct aead_aes_gcm_ctx gcm_ctx; + uint64_t min_next_nonce; +}; +} // namespace + +static_assert(sizeof(((EVP_AEAD_CTX *)nullptr)->state) >= + sizeof(struct aead_aes_gcm_tls12_ctx), + "AEAD state is too small"); +static_assert(alignof(union evp_aead_ctx_st_state) >= + alignof(struct aead_aes_gcm_tls12_ctx), + "AEAD state has insufficient alignment"); + +static int aead_aes_gcm_tls12_init(EVP_AEAD_CTX *ctx, const uint8_t *key, + size_t key_len, size_t requested_tag_len) { + struct aead_aes_gcm_tls12_ctx *gcm_ctx = + (struct aead_aes_gcm_tls12_ctx *)&ctx->state; + + gcm_ctx->min_next_nonce = 0; + + size_t actual_tag_len; + if 
(!aead_aes_gcm_init_impl(&gcm_ctx->gcm_ctx, &actual_tag_len, key, key_len, + requested_tag_len)) { + return 0; + } + + ctx->tag_len = actual_tag_len; + return 1; +} + +static int aead_aes_gcm_tls12_sealv(const EVP_AEAD_CTX *ctx, + Span iovecs, + Span out_tag, size_t *out_tag_len, + Span nonce, + Span aadvecs) { + struct aead_aes_gcm_tls12_ctx *gcm_ctx = + (struct aead_aes_gcm_tls12_ctx *)&ctx->state; + + if (nonce.size() != AES_GCM_NONCE_LENGTH) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE); + return 0; + } + + // The given nonces must be strictly monotonically increasing. + uint64_t given_counter = + CRYPTO_load_u64_be(nonce.last(sizeof(uint64_t)).data()); + if (given_counter == UINT64_MAX || given_counter < gcm_ctx->min_next_nonce) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_NONCE); + return 0; + } + + gcm_ctx->min_next_nonce = given_counter + 1; + + if (!aead_aes_gcm_sealv(ctx, iovecs, out_tag, out_tag_len, nonce, aadvecs)) { + return 0; + } + + AEAD_GCM_verify_service_indicator(ctx); + return 1; +} + +DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_128_gcm_tls12) { + memset(out, 0, sizeof(EVP_AEAD)); + + out->key_len = 16; + out->nonce_len = AES_GCM_NONCE_LENGTH; + out->overhead = EVP_AEAD_AES_GCM_TAG_LEN; + out->max_tag_len = EVP_AEAD_AES_GCM_TAG_LEN; + + out->init = aead_aes_gcm_tls12_init; + out->cleanup = aead_aes_gcm_cleanup; + out->sealv = aead_aes_gcm_tls12_sealv; + out->openv_detached = aead_aes_gcm_openv_detached; +} + +DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_256_gcm_tls12) { + memset(out, 0, sizeof(EVP_AEAD)); + + out->key_len = 32; + out->nonce_len = AES_GCM_NONCE_LENGTH; + out->overhead = EVP_AEAD_AES_GCM_TAG_LEN; + out->max_tag_len = EVP_AEAD_AES_GCM_TAG_LEN; + + out->init = aead_aes_gcm_tls12_init; + out->cleanup = aead_aes_gcm_cleanup; + out->sealv = aead_aes_gcm_tls12_sealv; + out->openv_detached = aead_aes_gcm_openv_detached; +} + +namespace { +struct aead_aes_gcm_tls13_ctx { + struct aead_aes_gcm_ctx gcm_ctx; + uint64_t 
min_next_nonce; + uint64_t mask; +}; +} // namespace + +static_assert(sizeof(((EVP_AEAD_CTX *)nullptr)->state) >= + sizeof(struct aead_aes_gcm_tls13_ctx), + "AEAD state is too small"); +static_assert(alignof(union evp_aead_ctx_st_state) >= + alignof(struct aead_aes_gcm_tls13_ctx), + "AEAD state has insufficient alignment"); + +static int aead_aes_gcm_tls13_init(EVP_AEAD_CTX *ctx, const uint8_t *key, + size_t key_len, size_t requested_tag_len) { + struct aead_aes_gcm_tls13_ctx *gcm_ctx = + (struct aead_aes_gcm_tls13_ctx *)&ctx->state; + + gcm_ctx->min_next_nonce = 0; + + size_t actual_tag_len; + if (!aead_aes_gcm_init_impl(&gcm_ctx->gcm_ctx, &actual_tag_len, key, key_len, + requested_tag_len)) { + return 0; + } + + ctx->tag_len = actual_tag_len; + return 1; +} + +static int aead_aes_gcm_tls13_sealv(const EVP_AEAD_CTX *ctx, + Span iovecs, + Span out_tag, size_t *out_tag_len, + Span nonce, + Span aadvecs) { + struct aead_aes_gcm_tls13_ctx *gcm_ctx = + (struct aead_aes_gcm_tls13_ctx *)&ctx->state; + + if (nonce.size() != AES_GCM_NONCE_LENGTH) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_UNSUPPORTED_NONCE_SIZE); + return 0; + } + + // The given nonces must be strictly monotonically increasing. See + // https://tools.ietf.org/html/rfc8446#section-5.3 for details of the TLS 1.3 + // nonce construction. + uint64_t given_counter = + CRYPTO_load_u64_be(nonce.last(sizeof(uint64_t)).data()); + + if (gcm_ctx->min_next_nonce == 0) { + // In the first call the sequence number will be zero and therefore the + // given nonce will be 0 ^ mask = mask. 
+ gcm_ctx->mask = given_counter; + gcm_ctx->min_next_nonce = 1; + } else { + given_counter ^= gcm_ctx->mask; + if (given_counter == UINT64_MAX || + given_counter < gcm_ctx->min_next_nonce) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_NONCE); + return 0; + } + gcm_ctx->min_next_nonce = given_counter + 1; + } + + if (!aead_aes_gcm_sealv(ctx, iovecs, out_tag, out_tag_len, nonce, aadvecs)) { + return 0; + } + + AEAD_GCM_verify_service_indicator(ctx); + return 1; +} + +DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_128_gcm_tls13) { + memset(out, 0, sizeof(EVP_AEAD)); + + out->key_len = 16; + out->nonce_len = AES_GCM_NONCE_LENGTH; + out->overhead = EVP_AEAD_AES_GCM_TAG_LEN; + out->max_tag_len = EVP_AEAD_AES_GCM_TAG_LEN; + + out->init = aead_aes_gcm_tls13_init; + out->cleanup = aead_aes_gcm_cleanup; + out->sealv = aead_aes_gcm_tls13_sealv; + out->openv_detached = aead_aes_gcm_openv_detached; +} + +DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_256_gcm_tls13) { + memset(out, 0, sizeof(EVP_AEAD)); + + out->key_len = 32; + out->nonce_len = AES_GCM_NONCE_LENGTH; + out->overhead = EVP_AEAD_AES_GCM_TAG_LEN; + out->max_tag_len = EVP_AEAD_AES_GCM_TAG_LEN; + + out->init = aead_aes_gcm_tls13_init; + out->cleanup = aead_aes_gcm_cleanup; + out->sealv = aead_aes_gcm_tls13_sealv; + out->openv_detached = aead_aes_gcm_openv_detached; +} + +int EVP_has_aes_hardware() { +#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64) + return hwaes_capable() && crypto_gcm_clmul_enabled(); +#elif defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64) + return hwaes_capable() && CRYPTO_is_ARMv8_PMULL_capable(); +#else + return 0; +#endif +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/cipher/e_aesccm.c b/third_party/boringssl/src/crypto/fipsmodule/cipher/e_aesccm.c deleted file mode 100644 index c00bf61e..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/cipher/e_aesccm.c +++ /dev/null @@ -1,453 +0,0 @@ -/* ==================================================================== - * Copyright 
(c) 2008 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== */ - -#include - -#include - -#include -#include -#include - -#include "../delocate.h" -#include "../service_indicator/internal.h" -#include "internal.h" - - -struct ccm128_context { - block128_f block; - ctr128_f ctr; - unsigned M, L; -}; - -struct ccm128_state { - union { - uint64_t u[2]; - uint8_t c[16]; - } nonce, cmac; -}; - -static int CRYPTO_ccm128_init(struct ccm128_context *ctx, const AES_KEY *key, - block128_f block, ctr128_f ctr, unsigned M, - unsigned L) { - if (M < 4 || M > 16 || (M & 1) != 0 || L < 2 || L > 8) { - return 0; - } - ctx->block = block; - ctx->ctr = ctr; - ctx->M = M; - ctx->L = L; - return 1; -} - -static size_t CRYPTO_ccm128_max_input(const struct ccm128_context *ctx) { - return ctx->L >= sizeof(size_t) ? (size_t)-1 - : (((size_t)1) << (ctx->L * 8)) - 1; -} - -static int ccm128_init_state(const struct ccm128_context *ctx, - struct ccm128_state *state, const AES_KEY *key, - const uint8_t *nonce, size_t nonce_len, - const uint8_t *aad, size_t aad_len, - size_t plaintext_len) { - const block128_f block = ctx->block; - const unsigned M = ctx->M; - const unsigned L = ctx->L; - - // |L| determines the expected |nonce_len| and the limit for |plaintext_len|. - if (plaintext_len > CRYPTO_ccm128_max_input(ctx) || - nonce_len != 15 - L) { - return 0; - } - - // Assemble the first block for computing the MAC. 
- OPENSSL_memset(state, 0, sizeof(*state)); - state->nonce.c[0] = (uint8_t)((L - 1) | ((M - 2) / 2) << 3); - if (aad_len != 0) { - state->nonce.c[0] |= 0x40; // Set AAD Flag - } - OPENSSL_memcpy(&state->nonce.c[1], nonce, nonce_len); - for (unsigned i = 0; i < L; i++) { - state->nonce.c[15 - i] = (uint8_t)(plaintext_len >> (8 * i)); - } - - (*block)(state->nonce.c, state->cmac.c, key); - size_t blocks = 1; - - if (aad_len != 0) { - unsigned i; - // Cast to u64 to avoid the compiler complaining about invalid shifts. - uint64_t aad_len_u64 = aad_len; - if (aad_len_u64 < 0x10000 - 0x100) { - state->cmac.c[0] ^= (uint8_t)(aad_len_u64 >> 8); - state->cmac.c[1] ^= (uint8_t)aad_len_u64; - i = 2; - } else if (aad_len_u64 <= 0xffffffff) { - state->cmac.c[0] ^= 0xff; - state->cmac.c[1] ^= 0xfe; - state->cmac.c[2] ^= (uint8_t)(aad_len_u64 >> 24); - state->cmac.c[3] ^= (uint8_t)(aad_len_u64 >> 16); - state->cmac.c[4] ^= (uint8_t)(aad_len_u64 >> 8); - state->cmac.c[5] ^= (uint8_t)aad_len_u64; - i = 6; - } else { - state->cmac.c[0] ^= 0xff; - state->cmac.c[1] ^= 0xff; - state->cmac.c[2] ^= (uint8_t)(aad_len_u64 >> 56); - state->cmac.c[3] ^= (uint8_t)(aad_len_u64 >> 48); - state->cmac.c[4] ^= (uint8_t)(aad_len_u64 >> 40); - state->cmac.c[5] ^= (uint8_t)(aad_len_u64 >> 32); - state->cmac.c[6] ^= (uint8_t)(aad_len_u64 >> 24); - state->cmac.c[7] ^= (uint8_t)(aad_len_u64 >> 16); - state->cmac.c[8] ^= (uint8_t)(aad_len_u64 >> 8); - state->cmac.c[9] ^= (uint8_t)aad_len_u64; - i = 10; - } - - do { - for (; i < 16 && aad_len != 0; i++) { - state->cmac.c[i] ^= *aad; - aad++; - aad_len--; - } - (*block)(state->cmac.c, state->cmac.c, key); - blocks++; - i = 0; - } while (aad_len != 0); - } - - // Per RFC 3610, section 2.6, the total number of block cipher operations done - // must not exceed 2^61. There are two block cipher operations remaining per - // message block, plus one block at the end to encrypt the MAC. 
- size_t remaining_blocks = 2 * ((plaintext_len + 15) / 16) + 1; - if (plaintext_len + 15 < plaintext_len || - remaining_blocks + blocks < blocks || - (uint64_t) remaining_blocks + blocks > UINT64_C(1) << 61) { - return 0; - } - - // Assemble the first block for encrypting and decrypting. The bottom |L| - // bytes are replaced with a counter and all bit the encoding of |L| is - // cleared in the first byte. - state->nonce.c[0] &= 7; - return 1; -} - -static int ccm128_encrypt(const struct ccm128_context *ctx, - struct ccm128_state *state, const AES_KEY *key, - uint8_t *out, const uint8_t *in, size_t len) { - // The counter for encryption begins at one. - for (unsigned i = 0; i < ctx->L; i++) { - state->nonce.c[15 - i] = 0; - } - state->nonce.c[15] = 1; - - uint8_t partial_buf[16]; - unsigned num = 0; - if (ctx->ctr != NULL) { - CRYPTO_ctr128_encrypt_ctr32(in, out, len, key, state->nonce.c, partial_buf, - &num, ctx->ctr); - } else { - CRYPTO_ctr128_encrypt(in, out, len, key, state->nonce.c, partial_buf, &num, - ctx->block); - } - return 1; -} - -static int ccm128_compute_mac(const struct ccm128_context *ctx, - struct ccm128_state *state, const AES_KEY *key, - uint8_t *out_tag, size_t tag_len, - const uint8_t *in, size_t len) { - block128_f block = ctx->block; - if (tag_len != ctx->M) { - return 0; - } - - // Incorporate |in| into the MAC. - union { - uint64_t u[2]; - uint8_t c[16]; - } tmp; - while (len >= 16) { - OPENSSL_memcpy(tmp.c, in, 16); - state->cmac.u[0] ^= tmp.u[0]; - state->cmac.u[1] ^= tmp.u[1]; - (*block)(state->cmac.c, state->cmac.c, key); - in += 16; - len -= 16; - } - if (len > 0) { - for (size_t i = 0; i < len; i++) { - state->cmac.c[i] ^= in[i]; - } - (*block)(state->cmac.c, state->cmac.c, key); - } - - // Encrypt the MAC with counter zero. 
- for (unsigned i = 0; i < ctx->L; i++) { - state->nonce.c[15 - i] = 0; - } - (*block)(state->nonce.c, tmp.c, key); - state->cmac.u[0] ^= tmp.u[0]; - state->cmac.u[1] ^= tmp.u[1]; - - OPENSSL_memcpy(out_tag, state->cmac.c, tag_len); - return 1; -} - -static int CRYPTO_ccm128_encrypt(const struct ccm128_context *ctx, - const AES_KEY *key, uint8_t *out, - uint8_t *out_tag, size_t tag_len, - const uint8_t *nonce, size_t nonce_len, - const uint8_t *in, size_t len, - const uint8_t *aad, size_t aad_len) { - struct ccm128_state state; - return ccm128_init_state(ctx, &state, key, nonce, nonce_len, aad, aad_len, - len) && - ccm128_compute_mac(ctx, &state, key, out_tag, tag_len, in, len) && - ccm128_encrypt(ctx, &state, key, out, in, len); -} - -static int CRYPTO_ccm128_decrypt(const struct ccm128_context *ctx, - const AES_KEY *key, uint8_t *out, - uint8_t *out_tag, size_t tag_len, - const uint8_t *nonce, size_t nonce_len, - const uint8_t *in, size_t len, - const uint8_t *aad, size_t aad_len) { - struct ccm128_state state; - return ccm128_init_state(ctx, &state, key, nonce, nonce_len, aad, aad_len, - len) && - ccm128_encrypt(ctx, &state, key, out, in, len) && - ccm128_compute_mac(ctx, &state, key, out_tag, tag_len, out, len); -} - -#define EVP_AEAD_AES_CCM_MAX_TAG_LEN 16 - -struct aead_aes_ccm_ctx { - union { - double align; - AES_KEY ks; - } ks; - struct ccm128_context ccm; -}; - -static_assert(sizeof(((EVP_AEAD_CTX *)NULL)->state) >= - sizeof(struct aead_aes_ccm_ctx), - "AEAD state is too small"); -static_assert(alignof(union evp_aead_ctx_st_state) >= - alignof(struct aead_aes_ccm_ctx), - "AEAD state has insufficient alignment"); - -static int aead_aes_ccm_init(EVP_AEAD_CTX *ctx, const uint8_t *key, - size_t key_len, size_t tag_len, unsigned M, - unsigned L) { - assert(M == EVP_AEAD_max_overhead(ctx->aead)); - assert(M == EVP_AEAD_max_tag_len(ctx->aead)); - assert(15 - L == EVP_AEAD_nonce_length(ctx->aead)); - - if (key_len != EVP_AEAD_key_length(ctx->aead)) { - 
OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_KEY_LENGTH); - return 0; // EVP_AEAD_CTX_init should catch this. - } - - if (tag_len == EVP_AEAD_DEFAULT_TAG_LENGTH) { - tag_len = M; - } - - if (tag_len != M) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TAG_TOO_LARGE); - return 0; - } - - struct aead_aes_ccm_ctx *ccm_ctx = (struct aead_aes_ccm_ctx *)&ctx->state; - - block128_f block; - ctr128_f ctr = aes_ctr_set_key(&ccm_ctx->ks.ks, NULL, &block, key, key_len); - ctx->tag_len = tag_len; - if (!CRYPTO_ccm128_init(&ccm_ctx->ccm, &ccm_ctx->ks.ks, block, ctr, M, L)) { - OPENSSL_PUT_ERROR(CIPHER, ERR_R_INTERNAL_ERROR); - return 0; - } - - return 1; -} - -static void aead_aes_ccm_cleanup(EVP_AEAD_CTX *ctx) {} - -static int aead_aes_ccm_seal_scatter( - const EVP_AEAD_CTX *ctx, uint8_t *out, uint8_t *out_tag, - size_t *out_tag_len, size_t max_out_tag_len, const uint8_t *nonce, - size_t nonce_len, const uint8_t *in, size_t in_len, const uint8_t *extra_in, - size_t extra_in_len, const uint8_t *ad, size_t ad_len) { - const struct aead_aes_ccm_ctx *ccm_ctx = - (struct aead_aes_ccm_ctx *)&ctx->state; - - if (in_len > CRYPTO_ccm128_max_input(&ccm_ctx->ccm)) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TOO_LARGE); - return 0; - } - - if (max_out_tag_len < ctx->tag_len) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BUFFER_TOO_SMALL); - return 0; - } - - if (nonce_len != EVP_AEAD_nonce_length(ctx->aead)) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_NONCE_SIZE); - return 0; - } - - if (!CRYPTO_ccm128_encrypt(&ccm_ctx->ccm, &ccm_ctx->ks.ks, out, out_tag, - ctx->tag_len, nonce, nonce_len, in, in_len, ad, - ad_len)) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TOO_LARGE); - return 0; - } - - *out_tag_len = ctx->tag_len; - AEAD_CCM_verify_service_indicator(ctx); - return 1; -} - -static int aead_aes_ccm_open_gather(const EVP_AEAD_CTX *ctx, uint8_t *out, - const uint8_t *nonce, size_t nonce_len, - const uint8_t *in, size_t in_len, - const uint8_t *in_tag, size_t in_tag_len, - const uint8_t *ad, size_t ad_len) { - const 
struct aead_aes_ccm_ctx *ccm_ctx = - (struct aead_aes_ccm_ctx *)&ctx->state; - - if (in_len > CRYPTO_ccm128_max_input(&ccm_ctx->ccm)) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TOO_LARGE); - return 0; - } - - if (nonce_len != EVP_AEAD_nonce_length(ctx->aead)) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_NONCE_SIZE); - return 0; - } - - if (in_tag_len != ctx->tag_len) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); - return 0; - } - - uint8_t tag[EVP_AEAD_AES_CCM_MAX_TAG_LEN]; - assert(ctx->tag_len <= EVP_AEAD_AES_CCM_MAX_TAG_LEN); - if (!CRYPTO_ccm128_decrypt(&ccm_ctx->ccm, &ccm_ctx->ks.ks, out, tag, - ctx->tag_len, nonce, nonce_len, in, in_len, ad, - ad_len)) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TOO_LARGE); - return 0; - } - - if (CRYPTO_memcmp(tag, in_tag, ctx->tag_len) != 0) { - OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); - return 0; - } - - AEAD_CCM_verify_service_indicator(ctx); - return 1; -} - -static int aead_aes_ccm_bluetooth_init(EVP_AEAD_CTX *ctx, const uint8_t *key, - size_t key_len, size_t tag_len) { - return aead_aes_ccm_init(ctx, key, key_len, tag_len, 4, 2); -} - -DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_128_ccm_bluetooth) { - memset(out, 0, sizeof(EVP_AEAD)); - - out->key_len = 16; - out->nonce_len = 13; - out->overhead = 4; - out->max_tag_len = 4; - - out->init = aead_aes_ccm_bluetooth_init; - out->cleanup = aead_aes_ccm_cleanup; - out->seal_scatter = aead_aes_ccm_seal_scatter; - out->open_gather = aead_aes_ccm_open_gather; -} - -static int aead_aes_ccm_bluetooth_8_init(EVP_AEAD_CTX *ctx, const uint8_t *key, - size_t key_len, size_t tag_len) { - return aead_aes_ccm_init(ctx, key, key_len, tag_len, 8, 2); -} - -DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_128_ccm_bluetooth_8) { - memset(out, 0, sizeof(EVP_AEAD)); - - out->key_len = 16; - out->nonce_len = 13; - out->overhead = 8; - out->max_tag_len = 8; - - out->init = aead_aes_ccm_bluetooth_8_init; - out->cleanup = aead_aes_ccm_cleanup; - out->seal_scatter = 
aead_aes_ccm_seal_scatter; - out->open_gather = aead_aes_ccm_open_gather; -} - -static int aead_aes_ccm_matter_init(EVP_AEAD_CTX *ctx, const uint8_t *key, - size_t key_len, size_t tag_len) { - return aead_aes_ccm_init(ctx, key, key_len, tag_len, 16, 2); -} - -DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_128_ccm_matter) { - memset(out, 0, sizeof(EVP_AEAD)); - - out->key_len = 16; - out->nonce_len = 13; - out->overhead = 16; - out->max_tag_len = 16; - - out->init = aead_aes_ccm_matter_init; - out->cleanup = aead_aes_ccm_cleanup; - out->seal_scatter = aead_aes_ccm_seal_scatter; - out->open_gather = aead_aes_ccm_open_gather; -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/cipher/e_aesccm.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/cipher/e_aesccm.cc.inc new file mode 100644 index 00000000..f100cf93 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/cipher/e_aesccm.cc.inc @@ -0,0 +1,440 @@ +// Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include + +#include +#include +#include +#include + +#include "../aes/internal.h" +#include "../delocate.h" +#include "../service_indicator/internal.h" +#include "internal.h" + + +using namespace bssl; + +struct ccm128_context { + block128_f block; + ctr128_f ctr; + unsigned M, L; +}; + +struct ccm128_state { + alignas(16) uint8_t nonce[16]; + alignas(16) uint8_t cmac[16]; +}; + +static int CRYPTO_ccm128_init(struct ccm128_context *ctx, const AES_KEY *key, + block128_f block, ctr128_f ctr, unsigned M, + unsigned L) { + if (M < 4 || M > 16 || (M & 1) != 0 || L < 2 || L > 8) { + return 0; + } + ctx->block = block; + ctx->ctr = ctr; + ctx->M = M; + ctx->L = L; + return 1; +} + +static size_t CRYPTO_ccm128_max_input(const struct ccm128_context *ctx) { + return ctx->L >= sizeof(size_t) ? SIZE_MAX + : (((size_t)1) << (ctx->L * 8)) - 1; +} + +static int ccm128_init_state(const struct ccm128_context *ctx, + struct ccm128_state *state, const AES_KEY *key, + Span nonce, + Span aadvecs, + size_t plaintext_len) { + const block128_f block = ctx->block; + const unsigned M = ctx->M; + const unsigned L = ctx->L; + + // |L| determines the expected |nonce_len| and the limit for |plaintext_len|. + if (plaintext_len > CRYPTO_ccm128_max_input(ctx)) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TOO_LARGE); + return 0; + } + if (nonce.size() != 15 - L) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_NONCE_SIZE); + return 0; + } + + // Assemble the first block for computing the MAC. 
+ OPENSSL_memset(state, 0, sizeof(*state)); + state->nonce[0] = (uint8_t)((L - 1) | ((M - 2) / 2) << 3); + size_t aad_len = bssl::iovec::TotalLength(aadvecs); + if (aad_len != 0) { + state->nonce[0] |= 0x40; // Set AAD Flag + } + OPENSSL_memcpy(&state->nonce[1], nonce.data(), nonce.size()); + for (unsigned i = 0; i < L; i++) { + state->nonce[15 - i] = (uint8_t)(plaintext_len >> (8 * i)); + } + + (*block)(state->nonce, state->cmac, key); + size_t blocks = 1; + + if (aad_len != 0) { + unsigned i; + // Cast to u64 to avoid the compiler complaining about invalid shifts. + uint64_t aad_len_u64 = aad_len; + if (aad_len_u64 < 0x10000 - 0x100) { + state->cmac[0] ^= (uint8_t)(aad_len_u64 >> 8); + state->cmac[1] ^= (uint8_t)aad_len_u64; + i = 2; + } else if (aad_len_u64 <= 0xffffffff) { + state->cmac[0] ^= 0xff; + state->cmac[1] ^= 0xfe; + state->cmac[2] ^= (uint8_t)(aad_len_u64 >> 24); + state->cmac[3] ^= (uint8_t)(aad_len_u64 >> 16); + state->cmac[4] ^= (uint8_t)(aad_len_u64 >> 8); + state->cmac[5] ^= (uint8_t)aad_len_u64; + i = 6; + } else { + state->cmac[0] ^= 0xff; + state->cmac[1] ^= 0xff; + state->cmac[2] ^= (uint8_t)(aad_len_u64 >> 56); + state->cmac[3] ^= (uint8_t)(aad_len_u64 >> 48); + state->cmac[4] ^= (uint8_t)(aad_len_u64 >> 40); + state->cmac[5] ^= (uint8_t)(aad_len_u64 >> 32); + state->cmac[6] ^= (uint8_t)(aad_len_u64 >> 24); + state->cmac[7] ^= (uint8_t)(aad_len_u64 >> 16); + state->cmac[8] ^= (uint8_t)(aad_len_u64 >> 8); + state->cmac[9] ^= (uint8_t)aad_len_u64; + i = 10; + } + + while (!aadvecs.empty() && aadvecs.front().len == 0) { + aadvecs = aadvecs.subspan(1); + } + size_t offset = 0; + while (!aadvecs.empty()) { + for (; i < 16 && !aadvecs.empty(); i++) { + state->cmac[i] ^= aadvecs.front().in[offset]; + ++offset; + if (offset >= aadvecs.front().len) { + aadvecs = aadvecs.subspan(1); + while (!aadvecs.empty() && aadvecs.front().len == 0) { + aadvecs = aadvecs.subspan(1); + } + offset = 0; + } + } + (*block)(state->cmac, state->cmac, key); + blocks++; + 
i = 0; + } + } + + // Per RFC 3610, section 2.6, the total number of block cipher operations done + // must not exceed 2^61. There are two block cipher operations remaining per + // message block, plus one block at the end to encrypt the MAC. + size_t remaining_blocks = 2 * ((plaintext_len + 15) / 16) + 1; + if (plaintext_len + 15 < plaintext_len || + remaining_blocks + blocks < blocks || + (uint64_t)remaining_blocks + blocks > UINT64_C(1) << 61) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TOO_LARGE); + return 0; + } + + // Assemble the first block for encrypting and decrypting. The bottom |L| + // bytes are replaced with a counter and all bit the encoding of |L| is + // cleared in the first byte. + state->nonce[0] &= 7; + return 1; +} + +static int ccm128_encrypt(const struct ccm128_context *ctx, + struct ccm128_state *state, const AES_KEY *key, + Span iovecs) { + // The counter for encryption begins at one. + for (unsigned i = 0; i < ctx->L; i++) { + state->nonce[15 - i] = 0; + } + state->nonce[15] = 1; + + uint8_t partial_buf[16]; + unsigned num = 0; + for (const CRYPTO_IOVEC &iovec : iovecs) { + CRYPTO_ctr128_encrypt_ctr32(iovec.in, iovec.out, iovec.len, key, + state->nonce, partial_buf, &num, ctx->ctr); + } + return 1; +} + +static int ccm128_compute_mac(const struct ccm128_context *ctx, + struct ccm128_state *state, const AES_KEY *key, + Span out_tag, + Span iovecs, bool encrypt) { + block128_f block = ctx->block; + if (out_tag.size() != ctx->M) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TAG_TOO_LARGE); + return 0; + } + + auto update_mac = [&](const uint8_t *in, size_t len) { + while (len >= 16) { + CRYPTO_xor16(state->cmac, state->cmac, in); + (*block)(state->cmac, state->cmac, key); + in += 16; + len -= 16; + } + if (len != 0) { + for (size_t i = 0; i < len; i++) { + state->cmac[i] ^= in[i]; + } + (*block)(state->cmac, state->cmac, key); + } + return true; + }; + + // Incorporate the plaintext into the MAC. 
+ if (encrypt) { + bssl::iovec::ForEachBlockRange<16>(iovecs, update_mac, update_mac); + } else { + bssl::iovec::ForEachOutBlockRange<16>(iovecs, update_mac, update_mac); + } + + // Encrypt the MAC with counter zero. + for (unsigned i = 0; i < ctx->L; i++) { + state->nonce[15 - i] = 0; + } + alignas(16) uint8_t tmp[16]; + (*block)(state->nonce, tmp, key); + CRYPTO_xor16(state->cmac, state->cmac, tmp); + + CopySpan(Span(state->cmac).first(out_tag.size()), out_tag); + return 1; +} + +static int CRYPTO_ccm128_encrypt(const struct ccm128_context *ctx, + const AES_KEY *key, + Span iovecs, + Span out_tag, + Span nonce, + Span aadvecs) { + struct ccm128_state state; + return ccm128_init_state(ctx, &state, key, nonce, aadvecs, + bssl::iovec::TotalLength(iovecs)) && + ccm128_compute_mac(ctx, &state, key, out_tag, iovecs, + /*encrypt=*/true) && + ccm128_encrypt(ctx, &state, key, iovecs); +} + +static int CRYPTO_ccm128_decrypt(const struct ccm128_context *ctx, + const AES_KEY *key, + Span iovecs, + Span out_tag, + Span nonce, + Span aadvecs) { + struct ccm128_state state; + return ccm128_init_state(ctx, &state, key, nonce, aadvecs, + bssl::iovec::TotalLength(iovecs)) && + ccm128_encrypt(ctx, &state, key, iovecs) && + ccm128_compute_mac(ctx, &state, key, out_tag, iovecs, + /*encrypt=*/false); +} + +#define EVP_AEAD_AES_CCM_MAX_TAG_LEN 16 + +namespace { +struct aead_aes_ccm_ctx { + union { + double align; + AES_KEY ks; + } ks; + struct ccm128_context ccm; +}; +} // namespace + +static_assert(sizeof(((EVP_AEAD_CTX *)nullptr)->state) >= + sizeof(struct aead_aes_ccm_ctx), + "AEAD state is too small"); +static_assert(alignof(union evp_aead_ctx_st_state) >= + alignof(struct aead_aes_ccm_ctx), + "AEAD state has insufficient alignment"); + +static int aead_aes_ccm_init(EVP_AEAD_CTX *ctx, const uint8_t *key, + size_t key_len, size_t tag_len, unsigned M, + unsigned L) { + assert(M == EVP_AEAD_max_overhead(ctx->aead)); + assert(M == EVP_AEAD_max_tag_len(ctx->aead)); + assert(15 - L == 
EVP_AEAD_nonce_length(ctx->aead)); + + if (key_len != EVP_AEAD_key_length(ctx->aead)) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_KEY_LENGTH); + return 0; // EVP_AEAD_CTX_init should catch this. + } + + if (tag_len == EVP_AEAD_DEFAULT_TAG_LENGTH) { + tag_len = M; + } + + if (tag_len != M) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TAG_TOO_LARGE); + return 0; + } + + struct aead_aes_ccm_ctx *ccm_ctx = (struct aead_aes_ccm_ctx *)&ctx->state; + + block128_f block; + ctr128_f ctr = + aes_ctr_set_key(&ccm_ctx->ks.ks, nullptr, &block, key, key_len); + ctx->tag_len = tag_len; + if (!CRYPTO_ccm128_init(&ccm_ctx->ccm, &ccm_ctx->ks.ks, block, ctr, M, L)) { + OPENSSL_PUT_ERROR(CIPHER, ERR_R_INTERNAL_ERROR); + return 0; + } + + return 1; +} + +static void aead_aes_ccm_cleanup(EVP_AEAD_CTX *ctx) {} + +static int aead_aes_ccm_sealv(const EVP_AEAD_CTX *ctx, + Span iovecs, + Span out_tag, size_t *out_tag_len, + Span nonce, + Span aadvecs) { + const struct aead_aes_ccm_ctx *ccm_ctx = + (struct aead_aes_ccm_ctx *)&ctx->state; + + if (bssl::iovec::TotalLength(iovecs) > + CRYPTO_ccm128_max_input(&ccm_ctx->ccm)) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TOO_LARGE); + return 0; + } + + if (out_tag.size() < ctx->tag_len) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BUFFER_TOO_SMALL); + return 0; + } + + if (nonce.size() != EVP_AEAD_nonce_length(ctx->aead)) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_NONCE_SIZE); + return 0; + } + + if (!CRYPTO_ccm128_encrypt(&ccm_ctx->ccm, &ccm_ctx->ks.ks, iovecs, + out_tag.first(ctx->tag_len), nonce, aadvecs)) { + return 0; + } + + *out_tag_len = ctx->tag_len; + AEAD_CCM_verify_service_indicator(ctx); + return 1; +} + +static int aead_aes_ccm_openv_detached(const EVP_AEAD_CTX *ctx, + Span iovecs, + Span nonce, + Span in_tag, + Span aadvecs) { + const struct aead_aes_ccm_ctx *ccm_ctx = + (struct aead_aes_ccm_ctx *)&ctx->state; + + if (bssl::iovec::TotalLength(iovecs) > + CRYPTO_ccm128_max_input(&ccm_ctx->ccm)) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_TOO_LARGE); + 
return 0; + } + + if (nonce.size() != EVP_AEAD_nonce_length(ctx->aead)) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_INVALID_NONCE_SIZE); + return 0; + } + + if (in_tag.size() != ctx->tag_len) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); + return 0; + } + + uint8_t tag[EVP_AEAD_AES_CCM_MAX_TAG_LEN]; + if (!CRYPTO_ccm128_decrypt(&ccm_ctx->ccm, &ccm_ctx->ks.ks, iovecs, + Span(tag).first(ctx->tag_len), nonce, aadvecs)) { + return 0; + } + + if (CRYPTO_memcmp(tag, in_tag.data(), ctx->tag_len) != 0) { + OPENSSL_PUT_ERROR(CIPHER, CIPHER_R_BAD_DECRYPT); + return 0; + } + + AEAD_CCM_verify_service_indicator(ctx); + return 1; +} + +static int aead_aes_ccm_bluetooth_init(EVP_AEAD_CTX *ctx, const uint8_t *key, + size_t key_len, size_t tag_len) { + return aead_aes_ccm_init(ctx, key, key_len, tag_len, 4, 2); +} + +DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_128_ccm_bluetooth) { + memset(out, 0, sizeof(EVP_AEAD)); + + out->key_len = 16; + out->nonce_len = 13; + out->overhead = 4; + out->max_tag_len = 4; + + out->init = aead_aes_ccm_bluetooth_init; + out->cleanup = aead_aes_ccm_cleanup; + out->sealv = aead_aes_ccm_sealv; + out->openv_detached = aead_aes_ccm_openv_detached; +} + +static int aead_aes_ccm_bluetooth_8_init(EVP_AEAD_CTX *ctx, const uint8_t *key, + size_t key_len, size_t tag_len) { + return aead_aes_ccm_init(ctx, key, key_len, tag_len, 8, 2); +} + +DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_128_ccm_bluetooth_8) { + memset(out, 0, sizeof(EVP_AEAD)); + + out->key_len = 16; + out->nonce_len = 13; + out->overhead = 8; + out->max_tag_len = 8; + + out->init = aead_aes_ccm_bluetooth_8_init; + out->cleanup = aead_aes_ccm_cleanup; + out->sealv = aead_aes_ccm_sealv; + out->openv_detached = aead_aes_ccm_openv_detached; +} + +static int aead_aes_ccm_matter_init(EVP_AEAD_CTX *ctx, const uint8_t *key, + size_t key_len, size_t tag_len) { + return aead_aes_ccm_init(ctx, key, key_len, tag_len, 16, 2); +} + +DEFINE_METHOD_FUNCTION(EVP_AEAD, EVP_aead_aes_128_ccm_matter) { + 
memset(out, 0, sizeof(EVP_AEAD)); + + out->key_len = 16; + out->nonce_len = 13; + out->overhead = 16; + out->max_tag_len = 16; + + out->init = aead_aes_ccm_matter_init; + out->cleanup = aead_aes_ccm_cleanup; + out->sealv = aead_aes_ccm_sealv; + out->openv_detached = aead_aes_ccm_openv_detached; +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/cipher/internal.h b/third_party/boringssl/src/crypto/fipsmodule/cipher/internal.h index 6ec9a3b3..0983b157 100644 --- a/third_party/boringssl/src/crypto/fipsmodule/cipher/internal.h +++ b/third_party/boringssl/src/crypto/fipsmodule/cipher/internal.h @@ -1,73 +1,36 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#ifndef OPENSSL_HEADER_CIPHER_INTERNAL_H -#define OPENSSL_HEADER_CIPHER_INTERNAL_H +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_CRYPTO_FIPSMODULE_CIPHER_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_FIPSMODULE_CIPHER_INTERNAL_H #include #include #include +#include #include "../../internal.h" -#include "../modes/internal.h" +#include "../../mem_internal.h" +#include "../aes/internal.h" + +#include +#include +#include +#include -#if defined(__cplusplus) extern "C" { -#endif // EVP_CIPH_MODE_MASK contains the bits of |flags| that represent the mode. @@ -79,7 +42,6 @@ struct evp_aead_st { uint8_t nonce_len; uint8_t overhead; uint8_t max_tag_len; - int seal_scatter_supports_extra_in; // init initialises an |EVP_AEAD_CTX|. If this call returns zero then // |cleanup| will not be called for that context. @@ -89,27 +51,30 @@ struct evp_aead_st { size_t tag_len, enum evp_aead_direction_t dir); void (*cleanup)(EVP_AEAD_CTX *); - int (*open)(const EVP_AEAD_CTX *ctx, uint8_t *out, size_t *out_len, - size_t max_out_len, const uint8_t *nonce, size_t nonce_len, - const uint8_t *in, size_t in_len, const uint8_t *ad, - size_t ad_len); + // AEADs need to provide one of the following sets of methods: + // + // - openv + sealv: variable tag lenght AEAD. + // - openv_detached + sealv: fixed tag length AEAD. 
- int (*seal_scatter)(const EVP_AEAD_CTX *ctx, uint8_t *out, uint8_t *out_tag, - size_t *out_tag_len, size_t max_out_tag_len, - const uint8_t *nonce, size_t nonce_len, const uint8_t *in, - size_t in_len, const uint8_t *extra_in, - size_t extra_in_len, const uint8_t *ad, size_t ad_len); + int (*openv)(const EVP_AEAD_CTX *ctx, bssl::Span iovecs, + size_t *out_total_bytes, bssl::Span nonce, + bssl::Span aadvecs); - int (*open_gather)(const EVP_AEAD_CTX *ctx, uint8_t *out, - const uint8_t *nonce, size_t nonce_len, const uint8_t *in, - size_t in_len, const uint8_t *in_tag, size_t in_tag_len, - const uint8_t *ad, size_t ad_len); + int (*sealv)(const EVP_AEAD_CTX *ctx, bssl::Span iovecs, + bssl::Span out_tag, size_t *out_tag_len, + bssl::Span nonce, + bssl::Span aadvecs); + + int (*openv_detached)(const EVP_AEAD_CTX *ctx, + bssl::Span iovecs, + bssl::Span nonce, + bssl::Span in_tag, + bssl::Span aadvecs); int (*get_iv)(const EVP_AEAD_CTX *ctx, const uint8_t **out_iv, size_t *out_len); - size_t (*tag_len)(const EVP_AEAD_CTX *ctx, size_t in_Len, - size_t extra_in_len); + size_t (*tag_len)(const EVP_AEAD_CTX *ctx, size_t in_len); }; struct evp_cipher_st { @@ -134,14 +99,35 @@ struct evp_cipher_st { // flags contains the OR of a number of flags. See |EVP_CIPH_*|. uint32_t flags; - // app_data is a pointer to opaque, user data. - void *app_data; - int (*init)(EVP_CIPHER_CTX *ctx, const uint8_t *key, const uint8_t *iv, int enc); - int (*cipher)(EVP_CIPHER_CTX *ctx, uint8_t *out, const uint8_t *in, - size_t inl); + // cipher encrypts/decrypts |in|, write output to |out|. Writes exactly |len| + // bytes, which must be a multiple of the |block_size|. + // + // For ciphers where encryption and decryption operations differ, |init| + // shall set an internal state for this. + // + // Returns 1 on success, or 0 on error. 
+ int (*cipher_update)(EVP_CIPHER_CTX *ctx, uint8_t *out, const uint8_t *in, + size_t len); + + // cipher_final finalizes the cipher, performing possible final + // authentication checks. + // + // Only used for |EVP_CIPH_FLAG_CUSTOM_CIPHER| ciphers. + // + // Returns 1 on success, or 0 on error. When decrypting, if an error is + // returned, the decrypted data must not be used. + int (*cipher_final)(EVP_CIPHER_CTX *ctx); + + // update_aad adds |in| (of length |inl|) to the authenticated data for the + // encryption operation. + // + // Only used for |EVP_CIPH_FLAG_CUSTOM_CIPHER| ciphers. + // + // Returns 1 on success, or 0 on error. + int (*update_aad)(EVP_CIPHER_CTX *ctx, const uint8_t *in, size_t inl); // cleanup, if non-NULL, releases memory associated with the context. It is // called if |EVP_CTRL_INIT| succeeds. Note that |init| may not have been @@ -151,17 +137,324 @@ struct evp_cipher_st { int (*ctrl)(EVP_CIPHER_CTX *, int type, int arg, void *ptr); }; -// aes_ctr_set_key initialises |*aes_key| using |key_bytes| bytes from |key|, -// where |key_bytes| must either be 16, 24 or 32. If not NULL, |*out_block| is -// set to a function that encrypts single blocks. If not NULL, |*gcm_key| is -// initialised to do GHASH with the given key. It returns a function for -// optimised CTR-mode, or NULL if CTR-mode should be built using |*out_block|. -ctr128_f aes_ctr_set_key(AES_KEY *aes_key, GCM128_KEY *gcm_key, - block128_f *out_block, const uint8_t *key, - size_t key_bytes); - -#if defined(__cplusplus) } // extern C -#endif -#endif // OPENSSL_HEADER_CIPHER_INTERNAL_H +BSSL_NAMESPACE_BEGIN + +// CopySpan copies an entire span of bytes from |from| to |to|. +// +// The spans need to have the same length. +inline void CopySpan(Span from, Span to) { + BSSL_CHECK(from.size() == to.size()); + std::copy(from.begin(), from.end(), to.begin()); +} + +// CopyToPrefix copies a span of bytes from |from| into |to|. It aborts +// if there is not enough space. 
+// +// TODO(crbug.com/404286922): Can we simplify this in a C++20 world (e.g. +// std::ranges::copy)? Must preserve range checking on the destination span. +inline void CopyToPrefix(Span from, Span to) { + CopySpan(from, to.first(from.size())); +} + +// Generic CRYPTO_IOVEC/CRYPTO_IVEC helpers. +namespace iovec { + +// IsValid returns whether the given |CRYPTO_IVEC| or |CRYPTO_IOVEC| is +// valid for use with public APIs, i.e. does not contain more than |SIZE_MAX| +// bytes and not more than |CRYPTO_IOVEC_MAX| chunks. Note that the `EVP_AEAD` +// methods need to accept an arbitrary number of chunks. +template +inline bool IsValid(Span ivecs) { + if (ivecs.size() > CRYPTO_IOVEC_MAX) { + return false; + } + size_t allowed = SIZE_MAX; + for (const IVec &ivec : ivecs) { + size_t len = ivec.len; + if (len > allowed) { + return false; + } + allowed -= len; + } + return true; +} + +// Length returns the total length in bytes of a given |CRYPTO_IVEC| or +// |CRYPTO_IOVEC|. +template +inline size_t TotalLength(Span ivecs) { + size_t total = 0; + for (const IVec &ivec : ivecs) { + total += ivec.len; + } + return total; +} + +// GetAndRemoveSuffix takes |suffix_buf.size()| final bytes from the given +// |CRYPTO_IVEC| or |CRYPTO_IOVEC| (mutating said iovec to no longer contain +// those bytes) and returns them. +// +// If the byte range is contained in a single chunk of |ivecs|, it will just +// return that span pointing into |ivecs|; otherwise, it will copy the bytes +// into |out| and return that. +// +// If |ivecs| is too short, returns |nullopt|. +template +inline std::optional> GetAndRemoveSuffix( + Span suffix_buf, Span ivecs) { + // Get the trivial case out. + if (suffix_buf.empty()) { + return suffix_buf; + } + // Strip trailing zero length chunks. + while (!ivecs.empty() && ivecs.back().len == 0) { + ivecs = ivecs.first(ivecs.size() - 1); + } + if (ivecs.empty()) { + return std::nullopt; + } + // Is the requested chunk entirely contained? If so, just return it. 
+ if (ivecs.back().len >= suffix_buf.size()) { + ivecs.back().len -= suffix_buf.size(); + return Span(ivecs.back().*ReadFrom + ivecs.back().len, suffix_buf.size()); + } + // Otherwise, collect it into the buffer while trimming |ivecs|. + Span remaining = suffix_buf; + while (!ivecs.empty()) { + Span src(ivecs.back().*ReadFrom, ivecs.back().len); + if (src.size() >= remaining.size()) { + CopySpan(src.last(remaining.size()), remaining); + ivecs.back().len -= remaining.size(); + return suffix_buf; + } + CopySpan(src, remaining.last(src.size())); + remaining = remaining.first(remaining.size() - src.size()); + ivecs.back().len = 0; + ivecs = ivecs.first(ivecs.size() - 1); + } + return std::nullopt; +} + +// GetAndRemoveOutSuffix is like |GetAndRemoveSuffix| but takes from a +// |CRYPTO_IOVEC|'s |out| member instead. +inline std::optional> GetAndRemoveOutSuffix( + Span out, Span iovecs) { + return GetAndRemoveSuffix(out, iovecs); +} + +namespace internal { +inline void CopySpanToIOVec(Span out, CRYPTO_IOVEC head, + Span rest) { + for (;;) { + size_t to_copy = std::min(head.len, out.size()); + CopySpan(out.first(to_copy), Span(head.out, to_copy)); + out = out.subspan(to_copy); + if (out.empty()) { + break; + } + head = rest.front(); // Checkfails if insufficient space in CRYPTO_IOVEC. + rest = rest.subspan(1); + } +} +} // namespace internal + +// ForEachBlockRange iterates over the |ivecs| as follows: +// +// - |f_whole| gets called on whole blocks crossing |ivecs| chunk boundaries, or +// ranges of whole blocks that are entirely in chunks. +// - |f| gets called exactly once, on the last block range which may +// end up with a partial block. +// - Both functions receive an |in| pointer that either points into |ivecs| or +// into a chunk assembly buffer and a |len| which indicates the number of +// bytes from |in| that can be accessed. If the function returns 0, +// iteration stops. 
+// - If |WriteOut| is set, |f_whole| and |f| receive an extra |out| +// argument to which they can write output. This output will be placed into +// the |ivecs|'s |out| members either during or after the call. If iteration +// was stopped, the contents of |out| are indeterminate. +// - The return value is true if iteration was not stopped by the callbacks. +template < + size_t BlockSize, bool WriteOut = false, typename IVec, + typename ReadFromT = const uint8_t *, ReadFromT IVec::*ReadFrom = &IVec::in, + typename /* bool(const uint8_t *in, [uint8_t *out,] size_t len) */ FWhole, + typename /* bool(const uint8_t *in, [uint8_t *out,] size_t len) */ + FFinal> +inline bool ForEachBlockRange(Span ivecs, const FWhole &f_whole, + const FFinal &f_final) { + using MutableIVec = std::remove_const_t; + // Helper to make the function calls simpler. + auto call_func = [&](const auto &f, const IVec &ivec) { + if constexpr (WriteOut) { + return f(ivec.*ReadFrom, ivec.out, ivec.len); + } else { + return f(ivec.*ReadFrom, ivec.len); + } + }; + // Helper to cut from the start of an IVec. + auto remove_prefix = [&](MutableIVec &ivec, size_t by) { + ivec.*ReadFrom += by; + if constexpr (WriteOut) { + ivec.out += by; + } + ivec.len -= by; + }; + // Helper to copy a range to an iovec list. + auto maybe_copy_to_iovec = [&](Span out, MutableIVec head, + Span rest) { + if constexpr (WriteOut) { + internal::CopySpanToIOVec(out, head, rest); + } + }; + + // Ensure the last item in |ivecs| is nonempty. This is necessary for + // detecting being at the end and calling |f_final| at the appropriate time. + Span ivecs_trimmed = ivecs; + while (!ivecs_trimmed.empty() && ivecs_trimmed.back().len == 0) { + ivecs_trimmed = ivecs_trimmed.first(ivecs_trimmed.size() - 1); + } + if (ivecs_trimmed.empty()) { + return call_func(f_final, IVec{}); + } + + // Now there are at least two non-empty |ivecs|, and neither the first nor the + // last can be empty. 
+ + MutableIVec current_range_head = ivecs_trimmed.front(); + Span current_range_rest = ivecs_trimmed.subspan(1); + while (!current_range_rest.empty()) { + // Process as many whole blocks as possible. + size_t whole_blocks_len = (current_range_head.len / BlockSize) * BlockSize; + if (whole_blocks_len != 0) { + MutableIVec whole_part = current_range_head; + whole_part.len = whole_blocks_len; + if (!call_func(f_whole, whole_part)) { + return false; + } + remove_prefix(current_range_head, whole_blocks_len); + } + + if (current_range_head.len == 0) { + current_range_head = current_range_rest.front(); + current_range_rest = current_range_rest.subspan(1); + continue; + } + + // Collect a whole block. + alignas(BlockSize) InplaceVector in; + alignas(BlockSize) uint8_t out[BlockSize]; + MutableIVec collect_from_head = current_range_head; + Span collect_from_rest = current_range_rest; + while (in.size() <= BlockSize) { + size_t remaining = BlockSize - in.size(); + if (remaining < collect_from_head.len) { + // Got enough to complete the block _and more_. + in.Append(Span(collect_from_head.*ReadFrom, remaining)); + remove_prefix(collect_from_head, remaining); + break; + } + // Consume all of |ivec| and advance. + in.Append(Span(collect_from_head.*ReadFrom, collect_from_head.len)); + if (collect_from_rest.empty()) { + // Nothing left - so this is the final block. 
+ auto finalout = Span(out).first(in.size()); + MutableIVec finalvec = {}; + finalvec.len = in.size(); + finalvec.*ReadFrom = in.data(); + if constexpr (WriteOut) { + finalvec.out = finalout.data(); + } + if (!call_func(f_final, finalvec)) { + return false; + } + maybe_copy_to_iovec(finalout, current_range_head, current_range_rest); + return true; + } + collect_from_head = collect_from_rest.front(); + collect_from_rest = collect_from_rest.subspan(1); + } + assert(in.size() == BlockSize); + + // The above loop ensures this condition by the |break| only happening if + // |collect_from_head| has at least one byte remaining, and the loop + // otherwise ensuring as an invariant that the final chunk - which is + // nonempty - is among |collect_from_head| and |collect_from_rest|. + // + // As such, at least one byte is remaining, and thus calling |f_whole| is + // appropriate. + assert(collect_from_head.len != 0 || !collect_from_rest.empty()); + + // Process the block. + MutableIVec wholevec = {}; + wholevec.len = in.size(); + wholevec.*ReadFrom = in.data(); + if constexpr (WriteOut) { + wholevec.out = out; + } + if (!call_func(f_whole, wholevec)) { + return false; + } + maybe_copy_to_iovec(Span(out).first(in.size()), current_range_head, + current_range_rest); + + // Set the new position. + current_range_head = collect_from_head; + current_range_rest = collect_from_rest; + } + + // If current_range_head.len is zero, then the last item of ivecs is empty. + // That however was excluded at the start of the function to ensure |f_final| + // is always used for the last call. + assert(current_range_head.len != 0); + + return call_func(f_final, current_range_head); +} + +// ForEachOutBlockRange is like |ForEachBlockRange| but reads from a +// |CRYPTO_IOVEC|'s |out| member instead. 
+template < + size_t BlockSize, + typename /* int(const uint8_t *in, [uint8_t *out,] size_t len) */ FWhole, + typename /* int(const uint8_t *in, [uint8_t *out,] size_t len) */ + FFinal> +inline int ForEachOutBlockRange(Span iovecs, + const FWhole &f_whole, const FFinal &f_final) { + return ForEachBlockRange(iovecs, f_whole, + f_final); +} + +// ForEachBlockRange_Dynamic is simply |ForEachBlockRange| with a +// runtime dispatch on the block size. +template < + bool WriteOut = false, typename IVec, typename ReadFromT = const uint8_t *, + ReadFromT IVec::*ReadFrom = &IVec::in, + typename /* int(const uint8_t *in, uint8_t *out, size_t len) */ FWhole, + typename /* int(const uint8_t *in, uint8_t *out, size_t len) */ + FFinal> +inline int ForEachBlockRange_Dynamic(size_t block_size, Span ivecs, + const FWhole &f_whole, + const FFinal &f_final) { + switch (block_size) { + case 8: + return ForEachBlockRange<8, WriteOut, IVec, ReadFromT, ReadFrom>( + ivecs, f_whole, f_final); + break; + case 16: + return ForEachBlockRange<16, WriteOut, IVec, ReadFromT, ReadFrom>( + ivecs, f_whole, f_final); + break; + default: + return 0; + } +} + +} // namespace iovec + +BSSL_NAMESPACE_END + +#endif // OPENSSL_HEADER_CRYPTO_FIPSMODULE_CIPHER_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/fipsmodule/cmac/cmac.c b/third_party/boringssl/src/crypto/fipsmodule/cmac/cmac.c deleted file mode 100644 index f5c805dc..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/cmac/cmac.c +++ /dev/null @@ -1,318 +0,0 @@ -/* ==================================================================== - * Copyright (c) 2010 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * licensing@OpenSSL.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. 
- * ==================================================================== */ - -#include - -#include -#include - -#include -#include -#include - -#include "../../internal.h" -#include "../service_indicator/internal.h" - - -struct cmac_ctx_st { - EVP_CIPHER_CTX cipher_ctx; - // k1 and k2 are the CMAC subkeys. See - // https://tools.ietf.org/html/rfc4493#section-2.3 - uint8_t k1[AES_BLOCK_SIZE]; - uint8_t k2[AES_BLOCK_SIZE]; - // Last (possibly partial) scratch - uint8_t block[AES_BLOCK_SIZE]; - // block_used contains the number of valid bytes in |block|. - unsigned block_used; -}; - -static void CMAC_CTX_init(CMAC_CTX *ctx) { - EVP_CIPHER_CTX_init(&ctx->cipher_ctx); -} - -static void CMAC_CTX_cleanup(CMAC_CTX *ctx) { - EVP_CIPHER_CTX_cleanup(&ctx->cipher_ctx); - OPENSSL_cleanse(ctx->k1, sizeof(ctx->k1)); - OPENSSL_cleanse(ctx->k2, sizeof(ctx->k2)); - OPENSSL_cleanse(ctx->block, sizeof(ctx->block)); -} - -int AES_CMAC(uint8_t out[16], const uint8_t *key, size_t key_len, - const uint8_t *in, size_t in_len) { - const EVP_CIPHER *cipher; - switch (key_len) { - // WARNING: this code assumes that all supported key sizes are FIPS - // Approved. - case 16: - cipher = EVP_aes_128_cbc(); - break; - case 32: - cipher = EVP_aes_256_cbc(); - break; - default: - return 0; - } - - size_t scratch_out_len; - CMAC_CTX ctx; - CMAC_CTX_init(&ctx); - - // We have to verify that all the CMAC services actually succeed before - // updating the indicator state, so we lock the state here. 
- FIPS_service_indicator_lock_state(); - const int ok = CMAC_Init(&ctx, key, key_len, cipher, NULL /* engine */) && - CMAC_Update(&ctx, in, in_len) && - CMAC_Final(&ctx, out, &scratch_out_len); - FIPS_service_indicator_unlock_state(); - - if (ok) { - FIPS_service_indicator_update_state(); - } - CMAC_CTX_cleanup(&ctx); - return ok; -} - -CMAC_CTX *CMAC_CTX_new(void) { - CMAC_CTX *ctx = OPENSSL_malloc(sizeof(*ctx)); - if (ctx != NULL) { - CMAC_CTX_init(ctx); - } - return ctx; -} - -void CMAC_CTX_free(CMAC_CTX *ctx) { - if (ctx == NULL) { - return; - } - - CMAC_CTX_cleanup(ctx); - OPENSSL_free(ctx); -} - -int CMAC_CTX_copy(CMAC_CTX *out, const CMAC_CTX *in) { - if (!EVP_CIPHER_CTX_copy(&out->cipher_ctx, &in->cipher_ctx)) { - return 0; - } - OPENSSL_memcpy(out->k1, in->k1, AES_BLOCK_SIZE); - OPENSSL_memcpy(out->k2, in->k2, AES_BLOCK_SIZE); - OPENSSL_memcpy(out->block, in->block, AES_BLOCK_SIZE); - out->block_used = in->block_used; - return 1; -} - -// binary_field_mul_x_128 treats the 128 bits at |in| as an element of GF(2¹²⁸) -// with a hard-coded reduction polynomial and sets |out| as x times the input. -// -// See https://tools.ietf.org/html/rfc4493#section-2.3 -static void binary_field_mul_x_128(uint8_t out[16], const uint8_t in[16]) { - unsigned i; - - // Shift |in| to left, including carry. - for (i = 0; i < 15; i++) { - out[i] = (in[i] << 1) | (in[i+1] >> 7); - } - - // If MSB set fixup with R. - const uint8_t carry = in[0] >> 7; - out[i] = (in[i] << 1) ^ ((0 - carry) & 0x87); -} - -// binary_field_mul_x_64 behaves like |binary_field_mul_x_128| but acts on an -// element of GF(2⁶⁴). -// -// See https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-38b.pdf -static void binary_field_mul_x_64(uint8_t out[8], const uint8_t in[8]) { - unsigned i; - - // Shift |in| to left, including carry. - for (i = 0; i < 7; i++) { - out[i] = (in[i] << 1) | (in[i+1] >> 7); - } - - // If MSB set fixup with R. 
- const uint8_t carry = in[0] >> 7; - out[i] = (in[i] << 1) ^ ((0 - carry) & 0x1b); -} - -static const uint8_t kZeroIV[AES_BLOCK_SIZE] = {0}; - -int CMAC_Init(CMAC_CTX *ctx, const void *key, size_t key_len, - const EVP_CIPHER *cipher, ENGINE *engine) { - int ret = 0; - uint8_t scratch[AES_BLOCK_SIZE]; - - // We have to avoid the underlying AES-CBC |EVP_CIPHER| services updating the - // indicator state, so we lock the state here. - FIPS_service_indicator_lock_state(); - - size_t block_size = EVP_CIPHER_block_size(cipher); - if ((block_size != AES_BLOCK_SIZE && block_size != 8 /* 3-DES */) || - EVP_CIPHER_key_length(cipher) != key_len || - !EVP_EncryptInit_ex(&ctx->cipher_ctx, cipher, NULL, key, kZeroIV) || - !EVP_Cipher(&ctx->cipher_ctx, scratch, kZeroIV, block_size) || - // Reset context again ready for first data. - !EVP_EncryptInit_ex(&ctx->cipher_ctx, NULL, NULL, NULL, kZeroIV)) { - goto out; - } - - if (block_size == AES_BLOCK_SIZE) { - binary_field_mul_x_128(ctx->k1, scratch); - binary_field_mul_x_128(ctx->k2, ctx->k1); - } else { - binary_field_mul_x_64(ctx->k1, scratch); - binary_field_mul_x_64(ctx->k2, ctx->k1); - } - ctx->block_used = 0; - ret = 1; - -out: - FIPS_service_indicator_unlock_state(); - return ret; -} - -int CMAC_Reset(CMAC_CTX *ctx) { - ctx->block_used = 0; - return EVP_EncryptInit_ex(&ctx->cipher_ctx, NULL, NULL, NULL, kZeroIV); -} - -int CMAC_Update(CMAC_CTX *ctx, const uint8_t *in, size_t in_len) { - int ret = 0; - - // We have to avoid the underlying AES-CBC |EVP_Cipher| services updating the - // indicator state, so we lock the state here. 
- FIPS_service_indicator_lock_state(); - - size_t block_size = EVP_CIPHER_CTX_block_size(&ctx->cipher_ctx); - assert(block_size <= AES_BLOCK_SIZE); - uint8_t scratch[AES_BLOCK_SIZE]; - - if (ctx->block_used > 0) { - size_t todo = block_size - ctx->block_used; - if (in_len < todo) { - todo = in_len; - } - - OPENSSL_memcpy(ctx->block + ctx->block_used, in, todo); - in += todo; - in_len -= todo; - ctx->block_used += todo; - - // If |in_len| is zero then either |ctx->block_used| is less than - // |block_size|, in which case we can stop here, or |ctx->block_used| is - // exactly |block_size| but there's no more data to process. In the latter - // case we don't want to process this block now because it might be the last - // block and that block is treated specially. - if (in_len == 0) { - ret = 1; - goto out; - } - - assert(ctx->block_used == block_size); - - if (!EVP_Cipher(&ctx->cipher_ctx, scratch, ctx->block, block_size)) { - goto out; - } - } - - // Encrypt all but one of the remaining blocks. - while (in_len > block_size) { - if (!EVP_Cipher(&ctx->cipher_ctx, scratch, in, block_size)) { - goto out; - } - in += block_size; - in_len -= block_size; - } - - OPENSSL_memcpy(ctx->block, in, in_len); - ctx->block_used = in_len; - ret = 1; - -out: - FIPS_service_indicator_unlock_state(); - return ret; -} - -int CMAC_Final(CMAC_CTX *ctx, uint8_t *out, size_t *out_len) { - int ret = 0; - size_t block_size = EVP_CIPHER_CTX_block_size(&ctx->cipher_ctx); - assert(block_size <= AES_BLOCK_SIZE); - - // We have to avoid the underlying AES-CBC |EVP_Cipher| services updating the - // indicator state, so we lock the state here. - FIPS_service_indicator_lock_state(); - - *out_len = block_size; - if (out == NULL) { - ret = 1; - goto out; - } - - const uint8_t *mask = ctx->k1; - - if (ctx->block_used != block_size) { - // If the last block is incomplete, terminate it with a single 'one' bit - // followed by zeros. 
- ctx->block[ctx->block_used] = 0x80; - OPENSSL_memset(ctx->block + ctx->block_used + 1, 0, - block_size - (ctx->block_used + 1)); - - mask = ctx->k2; - } - - for (unsigned i = 0; i < block_size; i++) { - out[i] = ctx->block[i] ^ mask[i]; - } - ret = EVP_Cipher(&ctx->cipher_ctx, out, out, block_size); - -out: - FIPS_service_indicator_unlock_state(); - if (ret) { - FIPS_service_indicator_update_state(); - } - return ret; -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/cmac/cmac.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/cmac/cmac.cc.inc new file mode 100644 index 00000000..18d09afd --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/cmac/cmac.cc.inc @@ -0,0 +1,293 @@ +// Copyright 2010-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include + +#include +#include +#include + +#include "../../internal.h" +#include "../../mem_internal.h" +#include "../service_indicator/internal.h" + + +using namespace bssl; + +struct cmac_ctx_st { + EVP_CIPHER_CTX cipher_ctx; + // k1 and k2 are the CMAC subkeys. See + // https://tools.ietf.org/html/rfc4493#section-2.3 + uint8_t k1[AES_BLOCK_SIZE]; + uint8_t k2[AES_BLOCK_SIZE]; + // Last (possibly partial) scratch + uint8_t block[AES_BLOCK_SIZE]; + // block_used contains the number of valid bytes in |block|. 
+ unsigned block_used; +}; + +static void CMAC_CTX_init(CMAC_CTX *ctx) { + EVP_CIPHER_CTX_init(&ctx->cipher_ctx); +} + +static void CMAC_CTX_cleanup(CMAC_CTX *ctx) { + EVP_CIPHER_CTX_cleanup(&ctx->cipher_ctx); + OPENSSL_cleanse(ctx->k1, sizeof(ctx->k1)); + OPENSSL_cleanse(ctx->k2, sizeof(ctx->k2)); + OPENSSL_cleanse(ctx->block, sizeof(ctx->block)); +} + +int AES_CMAC(uint8_t out[16], const uint8_t *key, size_t key_len, + const uint8_t *in, size_t in_len) { + const EVP_CIPHER *cipher; + switch (key_len) { + // WARNING: this code assumes that all supported key sizes are FIPS + // Approved. + case 16: + cipher = EVP_aes_128_cbc(); + break; + case 32: + cipher = EVP_aes_256_cbc(); + break; + default: + return 0; + } + + size_t scratch_out_len; + CMAC_CTX ctx; + CMAC_CTX_init(&ctx); + + // We have to verify that all the CMAC services actually succeed before + // updating the indicator state, so we lock the state here. + FIPS_service_indicator_lock_state(); + const int ok = CMAC_Init(&ctx, key, key_len, cipher, nullptr /* engine */) && + CMAC_Update(&ctx, in, in_len) && + CMAC_Final(&ctx, out, &scratch_out_len); + FIPS_service_indicator_unlock_state(); + + if (ok) { + FIPS_service_indicator_update_state(); + } + CMAC_CTX_cleanup(&ctx); + return ok; +} + +CMAC_CTX *CMAC_CTX_new() { + CMAC_CTX *ctx = New(); + if (ctx != nullptr) { + CMAC_CTX_init(ctx); + } + return ctx; +} + +void CMAC_CTX_free(CMAC_CTX *ctx) { + if (ctx == nullptr) { + return; + } + + CMAC_CTX_cleanup(ctx); + Delete(ctx); +} + +int CMAC_CTX_copy(CMAC_CTX *out, const CMAC_CTX *in) { + if (!EVP_CIPHER_CTX_copy(&out->cipher_ctx, &in->cipher_ctx)) { + return 0; + } + OPENSSL_memcpy(out->k1, in->k1, AES_BLOCK_SIZE); + OPENSSL_memcpy(out->k2, in->k2, AES_BLOCK_SIZE); + OPENSSL_memcpy(out->block, in->block, AES_BLOCK_SIZE); + out->block_used = in->block_used; + return 1; +} + +// binary_field_mul_x_128 treats the 128 bits at |in| as an element of GF(2¹²⁸) +// with a hard-coded reduction polynomial and sets |out| 
as x times the input. +// +// See https://tools.ietf.org/html/rfc4493#section-2.3 +static void binary_field_mul_x_128(uint8_t out[16], const uint8_t in[16]) { + unsigned i; + + // Shift |in| to left, including carry. + for (i = 0; i < 15; i++) { + out[i] = (in[i] << 1) | (in[i + 1] >> 7); + } + + // If MSB set fixup with R. + const uint8_t carry = in[0] >> 7; + out[i] = (in[i] << 1) ^ ((0 - carry) & 0x87); +} + +// binary_field_mul_x_64 behaves like |binary_field_mul_x_128| but acts on an +// element of GF(2⁶⁴). +// +// See https://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-38b.pdf +static void binary_field_mul_x_64(uint8_t out[8], const uint8_t in[8]) { + unsigned i; + + // Shift |in| to left, including carry. + for (i = 0; i < 7; i++) { + out[i] = (in[i] << 1) | (in[i + 1] >> 7); + } + + // If MSB set fixup with R. + const uint8_t carry = in[0] >> 7; + out[i] = (in[i] << 1) ^ ((0 - carry) & 0x1b); +} + +static const uint8_t kZeroIV[AES_BLOCK_SIZE] = {0}; + +int CMAC_Init(CMAC_CTX *ctx, const void *key, size_t key_len, + const EVP_CIPHER *cipher, ENGINE *engine) { + int ret = 0; + uint8_t scratch[AES_BLOCK_SIZE]; + + // We have to avoid the underlying AES-CBC |EVP_CIPHER| services updating the + // indicator state, so we lock the state here. + FIPS_service_indicator_lock_state(); + + size_t block_size = EVP_CIPHER_block_size(cipher); + if ((block_size != AES_BLOCK_SIZE && block_size != 8 /* 3-DES */) || + EVP_CIPHER_key_length(cipher) != key_len || + !EVP_EncryptInit_ex(&ctx->cipher_ctx, cipher, nullptr, + reinterpret_cast(key), kZeroIV) || + !EVP_Cipher(&ctx->cipher_ctx, scratch, kZeroIV, block_size) || + // Reset context again ready for first data. 
+ !EVP_EncryptInit_ex(&ctx->cipher_ctx, nullptr, nullptr, nullptr, + kZeroIV)) { + goto out; + } + + if (block_size == AES_BLOCK_SIZE) { + binary_field_mul_x_128(ctx->k1, scratch); + binary_field_mul_x_128(ctx->k2, ctx->k1); + } else { + binary_field_mul_x_64(ctx->k1, scratch); + binary_field_mul_x_64(ctx->k2, ctx->k1); + } + ctx->block_used = 0; + ret = 1; + +out: + FIPS_service_indicator_unlock_state(); + return ret; +} + +int CMAC_Reset(CMAC_CTX *ctx) { + ctx->block_used = 0; + return EVP_EncryptInit_ex(&ctx->cipher_ctx, nullptr, nullptr, nullptr, + kZeroIV); +} + +int CMAC_Update(CMAC_CTX *ctx, const uint8_t *in, size_t in_len) { + int ret = 0; + + // We have to avoid the underlying AES-CBC |EVP_Cipher| services updating the + // indicator state, so we lock the state here. + FIPS_service_indicator_lock_state(); + + size_t block_size = EVP_CIPHER_CTX_block_size(&ctx->cipher_ctx); + assert(block_size <= AES_BLOCK_SIZE); + uint8_t scratch[AES_BLOCK_SIZE]; + + if (ctx->block_used > 0) { + size_t todo = block_size - ctx->block_used; + if (in_len < todo) { + todo = in_len; + } + + OPENSSL_memcpy(ctx->block + ctx->block_used, in, todo); + in += todo; + in_len -= todo; + ctx->block_used += todo; + + // If |in_len| is zero then either |ctx->block_used| is less than + // |block_size|, in which case we can stop here, or |ctx->block_used| is + // exactly |block_size| but there's no more data to process. In the latter + // case we don't want to process this block now because it might be the last + // block and that block is treated specially. + if (in_len == 0) { + ret = 1; + goto out; + } + + assert(ctx->block_used == block_size); + + if (!EVP_Cipher(&ctx->cipher_ctx, scratch, ctx->block, block_size)) { + goto out; + } + } + + // Encrypt all but one of the remaining blocks. 
+ while (in_len > block_size) { + if (!EVP_Cipher(&ctx->cipher_ctx, scratch, in, block_size)) { + goto out; + } + in += block_size; + in_len -= block_size; + } + + OPENSSL_memcpy(ctx->block, in, in_len); + // |in_len| is bounded by |block_size|, which fits in |unsigned|. + static_assert(EVP_MAX_BLOCK_LENGTH < UINT_MAX, + "EVP_MAX_BLOCK_LENGTH is too large"); + ctx->block_used = (unsigned)in_len; + ret = 1; + +out: + FIPS_service_indicator_unlock_state(); + return ret; +} + +int CMAC_Final(CMAC_CTX *ctx, uint8_t *out, size_t *out_len) { + int ret = 0; + size_t block_size = EVP_CIPHER_CTX_block_size(&ctx->cipher_ctx); + assert(block_size <= AES_BLOCK_SIZE); + + // We have to avoid the underlying AES-CBC |EVP_Cipher| services updating the + // indicator state, so we lock the state here. + FIPS_service_indicator_lock_state(); + + *out_len = block_size; + const uint8_t *mask = ctx->k1; + if (out == nullptr) { + ret = 1; + goto out; + } + + if (ctx->block_used != block_size) { + // If the last block is incomplete, terminate it with a single 'one' bit + // followed by zeros. + ctx->block[ctx->block_used] = 0x80; + OPENSSL_memset(ctx->block + ctx->block_used + 1, 0, + block_size - (ctx->block_used + 1)); + + mask = ctx->k2; + } + + for (unsigned i = 0; i < block_size; i++) { + out[i] = ctx->block[i] ^ mask[i]; + } + ret = EVP_Cipher(&ctx->cipher_ctx, out, out, block_size); + +out: + FIPS_service_indicator_unlock_state(); + if (ret) { + FIPS_service_indicator_update_state(); + } + return ret; +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/delocate.h b/third_party/boringssl/src/crypto/fipsmodule/delocate.h index d6564e48..75015480 100644 --- a/third_party/boringssl/src/crypto/fipsmodule/delocate.h +++ b/third_party/boringssl/src/crypto/fipsmodule/delocate.h @@ -1,64 +1,70 @@ -/* Copyright (c) 2017, Google Inc. 
- * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ +// Copyright 2017 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
-#ifndef OPENSSL_HEADER_FIPSMODULE_DELOCATE_H -#define OPENSSL_HEADER_FIPSMODULE_DELOCATE_H +#ifndef OPENSSL_HEADER_CRYPTO_FIPSMODULE_DELOCATE_H +#define OPENSSL_HEADER_CRYPTO_FIPSMODULE_DELOCATE_H #include #include "../internal.h" +#if defined(BORINGSSL_PREFIX) +#define BCM_ADD_PREFIX(name) BORINGSSL_ADD_PREFIX(bcm_##name) +#else +#define BCM_ADD_PREFIX(name) bcm_##name +#endif + #if !defined(BORINGSSL_SHARED_LIBRARY) && defined(BORINGSSL_FIPS) && \ !defined(OPENSSL_ASAN) && !defined(OPENSSL_MSAN) -#define DEFINE_BSS_GET(type, name) \ - static type name __attribute__((used)); \ - type *name##_bss_get(void) __attribute__((const)); -// For FIPS builds we require that CRYPTO_ONCE_INIT be zero. -#define DEFINE_STATIC_ONCE(name) DEFINE_BSS_GET(CRYPTO_once_t, name) -// For FIPS builds we require that CRYPTO_STATIC_MUTEX_INIT be zero. -#define DEFINE_STATIC_MUTEX(name) \ - DEFINE_BSS_GET(struct CRYPTO_STATIC_MUTEX, name) -// For FIPS builds we require that CRYPTO_EX_DATA_CLASS_INIT be zero. -#define DEFINE_STATIC_EX_DATA_CLASS(name) \ - DEFINE_BSS_GET(CRYPTO_EX_DATA_CLASS, name) +#define DEFINE_BSS_GET(type, name, init_expr) \ + /* delocate needs C linkage and for |name| to be unique across BCM. */ \ + extern "C" { \ + extern type BCM_ADD_PREFIX(name); \ + type BCM_ADD_PREFIX(name) init_expr; \ + type *BCM_ADD_PREFIX(name##_bss_get)() __attribute__((const)); \ + } /* extern "C" */ \ + \ + /* The getter functions are exported, but static variables are usually named \ + * with short names. Define a static wrapper function so the caller can use \ + * a short name, while the symbol itself is prefixed. 
*/ \ + static type *name##_bss_get() { return BCM_ADD_PREFIX(name##_bss_get)(); } #else -#define DEFINE_BSS_GET(type, name) \ - static type name; \ - static type *name##_bss_get(void) { return &name; } -#define DEFINE_STATIC_ONCE(name) \ - static CRYPTO_once_t name = CRYPTO_ONCE_INIT; \ - static CRYPTO_once_t *name##_bss_get(void) { return &name; } -#define DEFINE_STATIC_MUTEX(name) \ - static struct CRYPTO_STATIC_MUTEX name = CRYPTO_STATIC_MUTEX_INIT; \ - static struct CRYPTO_STATIC_MUTEX *name##_bss_get(void) { return &name; } -#define DEFINE_STATIC_EX_DATA_CLASS(name) \ - static CRYPTO_EX_DATA_CLASS name = CRYPTO_EX_DATA_CLASS_INIT; \ - static CRYPTO_EX_DATA_CLASS *name##_bss_get(void) { return &name; } +#define DEFINE_BSS_GET(type, name, init_expr) \ + static type name init_expr; \ + static type *name##_bss_get() { return &name; } #endif -#define DEFINE_DATA(type, name, accessor_decorations) \ - DEFINE_BSS_GET(type, name##_storage) \ - DEFINE_STATIC_ONCE(name##_once) \ - static void name##_do_init(type *out); \ - static void name##_init(void) { name##_do_init(name##_storage_bss_get()); } \ - accessor_decorations type *name(void) { \ - CRYPTO_once(name##_once_bss_get(), name##_init); \ - /* See http://c-faq.com/ansi/constmismatch.html for why the following \ - * cast is needed. */ \ - return (const type *)name##_storage_bss_get(); \ - } \ +// For FIPS builds we require each of these objects be all zero. 
+#define DEFINE_STATIC_ONCE(name) \ + DEFINE_BSS_GET(bssl::CRYPTO_once_t, name, = CRYPTO_ONCE_INIT) +#define DEFINE_STATIC_MUTEX(name) \ + DEFINE_BSS_GET(bssl::StaticMutex, name, /* default ctor */) +#define DEFINE_STATIC_EX_DATA_CLASS(name) \ + DEFINE_BSS_GET(bssl::ExDataClass, name, /* default ctor */) + +#define DEFINE_DATA(type, name, accessor_decorations) \ + DEFINE_BSS_GET(type, name##_storage, {}) \ + DEFINE_STATIC_ONCE(name##_once) \ + static void name##_do_init(type *out); \ + static void name##_init() { name##_do_init(name##_storage_bss_get()); } \ + accessor_decorations type *name() { \ + bssl::CRYPTO_once(name##_once_bss_get(), name##_init); \ + /* See http://c-faq.com/ansi/constmismatch.html for why the following \ + * cast is needed. */ \ + return (const type *)name##_storage_bss_get(); \ + } \ static void name##_do_init(type *out) // DEFINE_METHOD_FUNCTION defines a function named |name| which returns a @@ -86,4 +92,4 @@ #define DEFINE_LOCAL_DATA(type, name) DEFINE_DATA(type, name, static const) -#endif // OPENSSL_HEADER_FIPSMODULE_DELOCATE_H +#endif // OPENSSL_HEADER_CRYPTO_FIPSMODULE_DELOCATE_H diff --git a/third_party/boringssl/src/crypto/fipsmodule/dh/check.c b/third_party/boringssl/src/crypto/fipsmodule/dh/check.c deleted file mode 100644 index 5b6e03a5..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/dh/check.c +++ /dev/null @@ -1,217 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. 
The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include - - -int DH_check_pub_key(const DH *dh, const BIGNUM *pub_key, int *out_flags) { - *out_flags = 0; - - BN_CTX *ctx = BN_CTX_new(); - if (ctx == NULL) { - return 0; - } - BN_CTX_start(ctx); - - int ok = 0; - - // Check |pub_key| is greater than 1. - BIGNUM *tmp = BN_CTX_get(ctx); - if (tmp == NULL || - !BN_set_word(tmp, 1)) { - goto err; - } - if (BN_cmp(pub_key, tmp) <= 0) { - *out_flags |= DH_CHECK_PUBKEY_TOO_SMALL; - } - - // Check |pub_key| is less than |dh->p| - 1. - if (!BN_copy(tmp, dh->p) || - !BN_sub_word(tmp, 1)) { - goto err; - } - if (BN_cmp(pub_key, tmp) >= 0) { - *out_flags |= DH_CHECK_PUBKEY_TOO_LARGE; - } - - if (dh->q != NULL) { - // Check |pub_key|^|dh->q| is 1 mod |dh->p|. This is necessary for RFC 5114 - // groups which are not safe primes but pick a generator on a prime-order - // subgroup of size |dh->q|. 
- if (!BN_mod_exp_mont(tmp, pub_key, dh->q, dh->p, ctx, NULL)) { - goto err; - } - if (!BN_is_one(tmp)) { - *out_flags |= DH_CHECK_PUBKEY_INVALID; - } - } - - ok = 1; - -err: - BN_CTX_end(ctx); - BN_CTX_free(ctx); - return ok; -} - - -int DH_check(const DH *dh, int *out_flags) { - // Check that p is a safe prime and if g is 2, 3 or 5, check that it is a - // suitable generator where: - // for 2, p mod 24 == 11 - // for 3, p mod 12 == 5 - // for 5, p mod 10 == 3 or 7 - // should hold. - int ok = 0, r; - BN_CTX *ctx = NULL; - BN_ULONG l; - BIGNUM *t1 = NULL, *t2 = NULL; - - *out_flags = 0; - ctx = BN_CTX_new(); - if (ctx == NULL) { - goto err; - } - BN_CTX_start(ctx); - t1 = BN_CTX_get(ctx); - if (t1 == NULL) { - goto err; - } - t2 = BN_CTX_get(ctx); - if (t2 == NULL) { - goto err; - } - - if (dh->q) { - if (BN_cmp(dh->g, BN_value_one()) <= 0) { - *out_flags |= DH_CHECK_NOT_SUITABLE_GENERATOR; - } else if (BN_cmp(dh->g, dh->p) >= 0) { - *out_flags |= DH_CHECK_NOT_SUITABLE_GENERATOR; - } else { - // Check g^q == 1 mod p - if (!BN_mod_exp_mont(t1, dh->g, dh->q, dh->p, ctx, NULL)) { - goto err; - } - if (!BN_is_one(t1)) { - *out_flags |= DH_CHECK_NOT_SUITABLE_GENERATOR; - } - } - r = BN_is_prime_ex(dh->q, BN_prime_checks_for_validation, ctx, NULL); - if (r < 0) { - goto err; - } - if (!r) { - *out_flags |= DH_CHECK_Q_NOT_PRIME; - } - // Check p == 1 mod q i.e. 
q divides p - 1 - if (!BN_div(t1, t2, dh->p, dh->q, ctx)) { - goto err; - } - if (!BN_is_one(t2)) { - *out_flags |= DH_CHECK_INVALID_Q_VALUE; - } - if (dh->j && BN_cmp(dh->j, t1)) { - *out_flags |= DH_CHECK_INVALID_J_VALUE; - } - } else if (BN_is_word(dh->g, DH_GENERATOR_2)) { - l = BN_mod_word(dh->p, 24); - if (l == (BN_ULONG)-1) { - goto err; - } - if (l != 11) { - *out_flags |= DH_CHECK_NOT_SUITABLE_GENERATOR; - } - } else if (BN_is_word(dh->g, DH_GENERATOR_5)) { - l = BN_mod_word(dh->p, 10); - if (l == (BN_ULONG)-1) { - goto err; - } - if (l != 3 && l != 7) { - *out_flags |= DH_CHECK_NOT_SUITABLE_GENERATOR; - } - } else { - *out_flags |= DH_CHECK_UNABLE_TO_CHECK_GENERATOR; - } - - r = BN_is_prime_ex(dh->p, BN_prime_checks_for_validation, ctx, NULL); - if (r < 0) { - goto err; - } - if (!r) { - *out_flags |= DH_CHECK_P_NOT_PRIME; - } else if (!dh->q) { - if (!BN_rshift1(t1, dh->p)) { - goto err; - } - r = BN_is_prime_ex(t1, BN_prime_checks_for_validation, ctx, NULL); - if (r < 0) { - goto err; - } - if (!r) { - *out_flags |= DH_CHECK_P_NOT_SAFE_PRIME; - } - } - ok = 1; - -err: - if (ctx != NULL) { - BN_CTX_end(ctx); - BN_CTX_free(ctx); - } - return ok; -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/dh/check.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/dh/check.cc.inc new file mode 100644 index 00000000..ec8e3584 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/dh/check.cc.inc @@ -0,0 +1,198 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + +#include "../bn/internal.h" +#include "internal.h" + + +using namespace bssl; + +static_assert(OPENSSL_DH_MAX_MODULUS_BITS <= BN_MONTGOMERY_MAX_WORDS * BN_BITS2, + "Max DH size too big for Montgomery arithmetic"); + +int bssl::dh_check_params_fast(const DH *dh) { + auto *impl = FromOpaque(dh); + + // Most operations scale with p and q. + if (BN_is_negative(impl->p.get()) || !BN_is_odd(impl->p.get()) || + BN_num_bits(impl->p.get()) > OPENSSL_DH_MAX_MODULUS_BITS) { + OPENSSL_PUT_ERROR(DH, DH_R_INVALID_PARAMETERS); + return 0; + } + + // q must be bounded by p. + if (impl->q != nullptr && (BN_is_negative(impl->q.get()) || + BN_ucmp(impl->q.get(), impl->p.get()) > 0)) { + OPENSSL_PUT_ERROR(DH, DH_R_INVALID_PARAMETERS); + return 0; + } + + // g must be an element of p's multiplicative group. + if (BN_is_negative(impl->g.get()) || BN_is_zero(impl->g.get()) || + BN_ucmp(impl->g.get(), impl->p.get()) >= 0) { + OPENSSL_PUT_ERROR(DH, DH_R_INVALID_PARAMETERS); + return 0; + } + + return 1; +} + +int DH_check_pub_key(const DH *dh, const BIGNUM *pub_key, int *out_flags) { + auto *impl = FromOpaque(dh); + + *out_flags = 0; + if (!dh_check_params_fast(dh)) { + return 0; + } + + UniquePtr ctx(BN_CTX_new()); + if (ctx == nullptr) { + return 0; + } + BN_CTXScope scope(ctx.get()); + + // Check |pub_key| is greater than 1. + if (BN_cmp(pub_key, BN_value_one()) <= 0) { + *out_flags |= DH_CHECK_PUBKEY_TOO_SMALL; + } + + // Check |pub_key| is less than |impl->p| - 1. + BIGNUM *tmp = BN_CTX_get(ctx.get()); + if (tmp == nullptr || !BN_copy(tmp, impl->p.get()) || !BN_sub_word(tmp, 1)) { + return 0; + } + if (BN_cmp(pub_key, tmp) >= 0) { + *out_flags |= DH_CHECK_PUBKEY_TOO_LARGE; + } + + if (impl->q != nullptr) { + // Check |pub_key|^|impl->q| is 1 mod |impl->p|. 
This is necessary for RFC + // 5114 groups which are not safe primes but pick a generator on a + // prime-order subgroup of size |impl->q|. + if (!BN_mod_exp_mont(tmp, pub_key, impl->q.get(), impl->p.get(), ctx.get(), + nullptr)) { + return 0; + } + if (!BN_is_one(tmp)) { + *out_flags |= DH_CHECK_PUBKEY_INVALID; + } + } + + return 1; +} + +int DH_check(const DH *dh, int *out_flags) { + auto *impl = FromOpaque(dh); + + *out_flags = 0; + if (!dh_check_params_fast(dh)) { + return 0; + } + + // Check that p is a safe prime and if g is 2, 3 or 5, check that it is a + // suitable generator where: + // for 2, p mod 24 == 11 + // for 3, p mod 12 == 5 + // for 5, p mod 10 == 3 or 7 + // should hold. + UniquePtr ctx(BN_CTX_new()); + if (ctx == nullptr) { + return 0; + } + BN_CTXScope scope(ctx.get()); + BIGNUM *t1 = BN_CTX_get(ctx.get()); + if (t1 == nullptr) { + return 0; + } + BIGNUM *t2 = BN_CTX_get(ctx.get()); + if (t2 == nullptr) { + return 0; + } + + if (impl->q) { + if (BN_cmp(impl->g.get(), BN_value_one()) <= 0) { + *out_flags |= DH_CHECK_NOT_SUITABLE_GENERATOR; + } else if (BN_cmp(impl->g.get(), impl->p.get()) >= 0) { + *out_flags |= DH_CHECK_NOT_SUITABLE_GENERATOR; + } else { + // Check g^q == 1 mod p + if (!BN_mod_exp_mont(t1, impl->g.get(), impl->q.get(), impl->p.get(), + ctx.get(), nullptr)) { + return 0; + } + if (!BN_is_one(t1)) { + *out_flags |= DH_CHECK_NOT_SUITABLE_GENERATOR; + } + } + int r = BN_is_prime_ex(impl->q.get(), BN_prime_checks_for_validation, + ctx.get(), nullptr); + if (r < 0) { + return 0; + } + if (!r) { + *out_flags |= DH_CHECK_Q_NOT_PRIME; + } + // Check p == 1 mod q i.e. 
q divides p - 1 + if (!BN_div(t1, t2, impl->p.get(), impl->q.get(), ctx.get())) { + return 0; + } + if (!BN_is_one(t2)) { + *out_flags |= DH_CHECK_INVALID_Q_VALUE; + } + } else if (BN_is_word(impl->g.get(), DH_GENERATOR_2)) { + BN_ULONG l = BN_mod_word(impl->p.get(), 24); + if (l == (BN_ULONG)-1) { + return 0; + } + if (l != 11) { + *out_flags |= DH_CHECK_NOT_SUITABLE_GENERATOR; + } + } else if (BN_is_word(impl->g.get(), DH_GENERATOR_5)) { + BN_ULONG l = BN_mod_word(impl->p.get(), 10); + if (l == (BN_ULONG)-1) { + return 0; + } + if (l != 3 && l != 7) { + *out_flags |= DH_CHECK_NOT_SUITABLE_GENERATOR; + } + } else { + *out_flags |= DH_CHECK_UNABLE_TO_CHECK_GENERATOR; + } + + int r = BN_is_prime_ex(impl->p.get(), BN_prime_checks_for_validation, + ctx.get(), nullptr); + if (r < 0) { + return 0; + } + if (!r) { + *out_flags |= DH_CHECK_P_NOT_PRIME; + } else if (!impl->q) { + if (!BN_rshift1(t1, impl->p.get())) { + return 0; + } + r = BN_is_prime_ex(t1, BN_prime_checks_for_validation, ctx.get(), nullptr); + if (r < 0) { + return 0; + } + if (!r) { + *out_flags |= DH_CHECK_P_NOT_SAFE_PRIME; + } + } + return 1; +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/dh/dh.c b/third_party/boringssl/src/crypto/fipsmodule/dh/dh.c deleted file mode 100644 index 31eaff4f..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/dh/dh.c +++ /dev/null @@ -1,474 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. 
The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include - -#include -#include -#include -#include -#include - -#include "../../internal.h" -#include "../bn/internal.h" -#include "../service_indicator/internal.h" -#include "internal.h" - - -#define OPENSSL_DH_MAX_MODULUS_BITS 10000 - -DH *DH_new(void) { - DH *dh = OPENSSL_malloc(sizeof(DH)); - if (dh == NULL) { - OPENSSL_PUT_ERROR(DH, ERR_R_MALLOC_FAILURE); - return NULL; - } - - OPENSSL_memset(dh, 0, sizeof(DH)); - - CRYPTO_MUTEX_init(&dh->method_mont_p_lock); - - dh->references = 1; - - return dh; -} - -void DH_free(DH *dh) { - if (dh == NULL) { - return; - } - - if (!CRYPTO_refcount_dec_and_test_zero(&dh->references)) { - return; - } - - BN_MONT_CTX_free(dh->method_mont_p); - BN_clear_free(dh->p); - BN_clear_free(dh->g); - BN_clear_free(dh->q); - BN_clear_free(dh->j); - OPENSSL_free(dh->seed); - BN_clear_free(dh->counter); - BN_clear_free(dh->pub_key); - BN_clear_free(dh->priv_key); - CRYPTO_MUTEX_cleanup(&dh->method_mont_p_lock); - - OPENSSL_free(dh); -} - -unsigned DH_bits(const DH *dh) { return BN_num_bits(dh->p); } - -const BIGNUM *DH_get0_pub_key(const DH *dh) { return dh->pub_key; } - -const BIGNUM *DH_get0_priv_key(const DH *dh) { return dh->priv_key; } - 
-const BIGNUM *DH_get0_p(const DH *dh) { return dh->p; } - -const BIGNUM *DH_get0_q(const DH *dh) { return dh->q; } - -const BIGNUM *DH_get0_g(const DH *dh) { return dh->g; } - -void DH_get0_key(const DH *dh, const BIGNUM **out_pub_key, - const BIGNUM **out_priv_key) { - if (out_pub_key != NULL) { - *out_pub_key = dh->pub_key; - } - if (out_priv_key != NULL) { - *out_priv_key = dh->priv_key; - } -} - -int DH_set0_key(DH *dh, BIGNUM *pub_key, BIGNUM *priv_key) { - if (pub_key != NULL) { - BN_free(dh->pub_key); - dh->pub_key = pub_key; - } - - if (priv_key != NULL) { - BN_free(dh->priv_key); - dh->priv_key = priv_key; - } - - return 1; -} - -void DH_get0_pqg(const DH *dh, const BIGNUM **out_p, const BIGNUM **out_q, - const BIGNUM **out_g) { - if (out_p != NULL) { - *out_p = dh->p; - } - if (out_q != NULL) { - *out_q = dh->q; - } - if (out_g != NULL) { - *out_g = dh->g; - } -} - -int DH_set0_pqg(DH *dh, BIGNUM *p, BIGNUM *q, BIGNUM *g) { - if ((dh->p == NULL && p == NULL) || - (dh->g == NULL && g == NULL)) { - return 0; - } - - if (p != NULL) { - BN_free(dh->p); - dh->p = p; - } - - if (q != NULL) { - BN_free(dh->q); - dh->q = q; - } - - if (g != NULL) { - BN_free(dh->g); - dh->g = g; - } - - return 1; -} - -int DH_set_length(DH *dh, unsigned priv_length) { - dh->priv_length = priv_length; - return 1; -} - -int DH_generate_key(DH *dh) { - boringssl_ensure_ffdh_self_test(); - - int ok = 0; - int generate_new_key = 0; - BN_CTX *ctx = NULL; - BIGNUM *pub_key = NULL, *priv_key = NULL; - - if (BN_num_bits(dh->p) > OPENSSL_DH_MAX_MODULUS_BITS) { - OPENSSL_PUT_ERROR(DH, DH_R_MODULUS_TOO_LARGE); - goto err; - } - - ctx = BN_CTX_new(); - if (ctx == NULL) { - goto err; - } - - if (dh->priv_key == NULL) { - priv_key = BN_new(); - if (priv_key == NULL) { - goto err; - } - generate_new_key = 1; - } else { - priv_key = dh->priv_key; - } - - if (dh->pub_key == NULL) { - pub_key = BN_new(); - if (pub_key == NULL) { - goto err; - } - } else { - pub_key = dh->pub_key; - } - - if 
(!BN_MONT_CTX_set_locked(&dh->method_mont_p, &dh->method_mont_p_lock, - dh->p, ctx)) { - goto err; - } - - if (generate_new_key) { - if (dh->q) { - if (!BN_rand_range_ex(priv_key, 2, dh->q)) { - goto err; - } - } else { - // secret exponent length - unsigned priv_bits = dh->priv_length; - if (priv_bits == 0) { - const unsigned p_bits = BN_num_bits(dh->p); - if (p_bits == 0) { - goto err; - } - - priv_bits = p_bits - 1; - } - - if (!BN_rand(priv_key, priv_bits, BN_RAND_TOP_ONE, BN_RAND_BOTTOM_ANY)) { - goto err; - } - } - } - - if (!BN_mod_exp_mont_consttime(pub_key, dh->g, priv_key, dh->p, ctx, - dh->method_mont_p)) { - goto err; - } - - dh->pub_key = pub_key; - dh->priv_key = priv_key; - ok = 1; - -err: - if (ok != 1) { - OPENSSL_PUT_ERROR(DH, ERR_R_BN_LIB); - } - - if (dh->pub_key == NULL) { - BN_free(pub_key); - } - if (dh->priv_key == NULL) { - BN_free(priv_key); - } - BN_CTX_free(ctx); - return ok; -} - -static int dh_compute_key(DH *dh, BIGNUM *out_shared_key, - const BIGNUM *peers_key, BN_CTX *ctx) { - if (BN_num_bits(dh->p) > OPENSSL_DH_MAX_MODULUS_BITS) { - OPENSSL_PUT_ERROR(DH, DH_R_MODULUS_TOO_LARGE); - return 0; - } - - if (dh->priv_key == NULL) { - OPENSSL_PUT_ERROR(DH, DH_R_NO_PRIVATE_VALUE); - return 0; - } - - int check_result; - if (!DH_check_pub_key(dh, peers_key, &check_result) || check_result) { - OPENSSL_PUT_ERROR(DH, DH_R_INVALID_PUBKEY); - return 0; - } - - int ret = 0; - BN_CTX_start(ctx); - BIGNUM *p_minus_1 = BN_CTX_get(ctx); - - if (!p_minus_1 || - !BN_MONT_CTX_set_locked(&dh->method_mont_p, &dh->method_mont_p_lock, - dh->p, ctx)) { - goto err; - } - - if (!BN_mod_exp_mont_consttime(out_shared_key, peers_key, dh->priv_key, dh->p, - ctx, dh->method_mont_p) || - !BN_copy(p_minus_1, dh->p) || - !BN_sub_word(p_minus_1, 1)) { - OPENSSL_PUT_ERROR(DH, ERR_R_BN_LIB); - goto err; - } - - // This performs the check required by SP 800-56Ar3 section 5.7.1.1 step two. 
- if (BN_cmp_word(out_shared_key, 1) <= 0 || - BN_cmp(out_shared_key, p_minus_1) == 0) { - OPENSSL_PUT_ERROR(DH, DH_R_INVALID_PUBKEY); - goto err; - } - - ret = 1; - - err: - BN_CTX_end(ctx); - return ret; -} - -int dh_compute_key_padded_no_self_test(unsigned char *out, - const BIGNUM *peers_key, DH *dh) { - BN_CTX *ctx = BN_CTX_new(); - if (ctx == NULL) { - return -1; - } - BN_CTX_start(ctx); - - int dh_size = DH_size(dh); - int ret = -1; - BIGNUM *shared_key = BN_CTX_get(ctx); - if (shared_key && - dh_compute_key(dh, shared_key, peers_key, ctx) && - BN_bn2bin_padded(out, dh_size, shared_key)) { - ret = dh_size; - } - - BN_CTX_end(ctx); - BN_CTX_free(ctx); - return ret; -} - -int DH_compute_key_padded(unsigned char *out, const BIGNUM *peers_key, DH *dh) { - boringssl_ensure_ffdh_self_test(); - - return dh_compute_key_padded_no_self_test(out, peers_key, dh); -} - -int DH_compute_key(unsigned char *out, const BIGNUM *peers_key, DH *dh) { - boringssl_ensure_ffdh_self_test(); - - BN_CTX *ctx = BN_CTX_new(); - if (ctx == NULL) { - return -1; - } - BN_CTX_start(ctx); - - int ret = -1; - BIGNUM *shared_key = BN_CTX_get(ctx); - if (shared_key && dh_compute_key(dh, shared_key, peers_key, ctx)) { - ret = BN_bn2bin(shared_key, out); - } - - BN_CTX_end(ctx); - BN_CTX_free(ctx); - return ret; -} - -int DH_compute_key_hashed(DH *dh, uint8_t *out, size_t *out_len, - size_t max_out_len, const BIGNUM *peers_key, - const EVP_MD *digest) { - *out_len = (size_t)-1; - - const size_t digest_len = EVP_MD_size(digest); - if (digest_len > max_out_len) { - return 0; - } - - FIPS_service_indicator_lock_state(); - - int ret = 0; - const size_t dh_len = DH_size(dh); - uint8_t *shared_bytes = OPENSSL_malloc(dh_len); - unsigned out_len_unsigned; - if (!shared_bytes || - // SP 800-56A is ambiguous about whether the output should be padded prior - // to revision three. But revision three, section C.1, awkwardly specifies - // padding to the length of p. 
- // - // Also, padded output avoids side-channels, so is always strongly - // advisable. - DH_compute_key_padded(shared_bytes, peers_key, dh) != (int)dh_len || - !EVP_Digest(shared_bytes, dh_len, out, &out_len_unsigned, digest, NULL) || - out_len_unsigned != digest_len) { - goto err; - } - - *out_len = digest_len; - ret = 1; - - err: - FIPS_service_indicator_unlock_state(); - OPENSSL_free(shared_bytes); - return ret; -} - -int DH_size(const DH *dh) { return BN_num_bytes(dh->p); } - -unsigned DH_num_bits(const DH *dh) { return BN_num_bits(dh->p); } - -int DH_up_ref(DH *dh) { - CRYPTO_refcount_inc(&dh->references); - return 1; -} - -DH *DH_get_rfc7919_2048(void) { - // This is the prime from https://tools.ietf.org/html/rfc7919#appendix-A.1, - // which is specifically approved for FIPS in appendix D of SP 800-56Ar3. - static const BN_ULONG kFFDHE2048Data[] = { - TOBN(0xffffffff, 0xffffffff), TOBN(0x886b4238, 0x61285c97), - TOBN(0xc6f34a26, 0xc1b2effa), TOBN(0xc58ef183, 0x7d1683b2), - TOBN(0x3bb5fcbc, 0x2ec22005), TOBN(0xc3fe3b1b, 0x4c6fad73), - TOBN(0x8e4f1232, 0xeef28183), TOBN(0x9172fe9c, 0xe98583ff), - TOBN(0xc03404cd, 0x28342f61), TOBN(0x9e02fce1, 0xcdf7e2ec), - TOBN(0x0b07a7c8, 0xee0a6d70), TOBN(0xae56ede7, 0x6372bb19), - TOBN(0x1d4f42a3, 0xde394df4), TOBN(0xb96adab7, 0x60d7f468), - TOBN(0xd108a94b, 0xb2c8e3fb), TOBN(0xbc0ab182, 0xb324fb61), - TOBN(0x30acca4f, 0x483a797a), TOBN(0x1df158a1, 0x36ade735), - TOBN(0xe2a689da, 0xf3efe872), TOBN(0x984f0c70, 0xe0e68b77), - TOBN(0xb557135e, 0x7f57c935), TOBN(0x85636555, 0x3ded1af3), - TOBN(0x2433f51f, 0x5f066ed0), TOBN(0xd3df1ed5, 0xd5fd6561), - TOBN(0xf681b202, 0xaec4617a), TOBN(0x7d2fe363, 0x630c75d8), - TOBN(0xcc939dce, 0x249b3ef9), TOBN(0xa9e13641, 0x146433fb), - TOBN(0xd8b9c583, 0xce2d3695), TOBN(0xafdc5620, 0x273d3cf1), - TOBN(0xadf85458, 0xa2bb4a9a), TOBN(0xffffffff, 0xffffffff), - }; - - BIGNUM *const ffdhe2048_p = BN_new(); - BIGNUM *const ffdhe2048_q = BN_new(); - BIGNUM *const ffdhe2048_g = BN_new(); - DH 
*const dh = DH_new(); - - if (!ffdhe2048_p || !ffdhe2048_q || !ffdhe2048_g || !dh) { - goto err; - } - - bn_set_static_words(ffdhe2048_p, kFFDHE2048Data, - OPENSSL_ARRAY_SIZE(kFFDHE2048Data)); - - if (!BN_rshift1(ffdhe2048_q, ffdhe2048_p) || - !BN_set_word(ffdhe2048_g, 2) || - !DH_set0_pqg(dh, ffdhe2048_p, ffdhe2048_q, ffdhe2048_g)) { - goto err; - } - - return dh; - - err: - BN_free(ffdhe2048_p); - BN_free(ffdhe2048_q); - BN_free(ffdhe2048_g); - DH_free(dh); - return NULL; -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/dh/dh.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/dh/dh.cc.inc new file mode 100644 index 00000000..9837c6c6 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/dh/dh.cc.inc @@ -0,0 +1,401 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include + +#include + +#include +#include +#include +#include + +#include "../../internal.h" +#include "../../mem_internal.h" +#include "../bn/internal.h" +#include "../service_indicator/internal.h" +#include "internal.h" + + +using namespace bssl; + +DH *DH_new() { return New(); } + +void DH_free(DH *dh) { + if (dh != nullptr) { + FromOpaque(dh)->DecRefInternal(); + } +} + +unsigned DH_bits(const DH *dh) { return BN_num_bits(FromOpaque(dh)->p.get()); } + +const BIGNUM *DH_get0_pub_key(const DH *dh) { + return FromOpaque(dh)->pub_key.get(); +} + +const BIGNUM *DH_get0_priv_key(const DH *dh) { + return FromOpaque(dh)->priv_key.get(); +} + +const BIGNUM *DH_get0_p(const DH *dh) { return FromOpaque(dh)->p.get(); } + +const BIGNUM *DH_get0_q(const DH *dh) { return FromOpaque(dh)->q.get(); } + +const BIGNUM *DH_get0_g(const DH *dh) { return FromOpaque(dh)->g.get(); } + +void DH_get0_key(const DH *dh, const BIGNUM **out_pub_key, + const BIGNUM **out_priv_key) { + auto *impl = FromOpaque(dh); + if (out_pub_key != nullptr) { + *out_pub_key = impl->pub_key.get(); + } + if (out_priv_key != nullptr) { + *out_priv_key = impl->priv_key.get(); + } +} + +int DH_set0_key(DH *dh, BIGNUM *pub_key, BIGNUM *priv_key) { + auto *impl = FromOpaque(dh); + if (pub_key != nullptr) { + impl->pub_key.reset(pub_key); + } + + if (priv_key != nullptr) { + impl->priv_key.reset(priv_key); + } + + return 1; +} + +void DH_get0_pqg(const DH *dh, const BIGNUM **out_p, const BIGNUM **out_q, + const BIGNUM **out_g) { + auto *impl = FromOpaque(dh); + if (out_p != nullptr) { + *out_p = impl->p.get(); + } + if (out_q != nullptr) { + *out_q = impl->q.get(); + } + if (out_g != nullptr) { + *out_g = impl->g.get(); + } +} + +int DH_set0_pqg(DH *dh, BIGNUM *p, BIGNUM *q, BIGNUM *g) { + auto *impl = FromOpaque(dh); + if ((impl->p == nullptr && p == nullptr) || + (impl->g == nullptr && g == nullptr)) { + return 0; + } + + if (p != nullptr) { + impl->p.reset(p); + } + + if (q != nullptr) { + 
impl->q.reset(q); + } + + if (g != nullptr) { + impl->g.reset(g); + } + + // Invalidate the cached Montgomery parameters. + impl->method_mont_p = nullptr; + return 1; +} + +int DH_set_length(DH *dh, unsigned priv_length) { + auto *impl = FromOpaque(dh); + impl->priv_length = priv_length; + return 1; +} + +int DH_generate_key(DH *dh) { + boringssl_ensure_ffdh_self_test(); + + if (!dh_check_params_fast(dh)) { + return 0; + } + + auto *impl = FromOpaque(dh); + UniquePtr ctx(BN_CTX_new()); + if (ctx == nullptr) { + OPENSSL_PUT_ERROR(DH, ERR_R_BN_LIB); + return 0; + } + + if (!BN_MONT_CTX_set_locked(&impl->method_mont_p, &impl->method_mont_p_lock, + impl->p.get(), ctx.get())) { + OPENSSL_PUT_ERROR(DH, ERR_R_BN_LIB); + return 0; + } + + // Only generate a private key if there's already one. Otherwise, + // |DH_generate_key| recomputes the public key. + const BIGNUM *priv_key = impl->priv_key.get(); + UniquePtr new_priv_key; + if (priv_key == nullptr) { + new_priv_key.reset(BN_new()); + if (new_priv_key == nullptr) { + OPENSSL_PUT_ERROR(DH, ERR_R_BN_LIB); + return 0; + } + if (impl->q) { + // Section 5.6.1.1.4 of SP 800-56A Rev3 generates a private key uniformly + // from [1, min(2^N-1, q-1)]. + // + // Although SP 800-56A Rev3 now permits a private key length N, + // |impl->priv_length| historically was ignored when q is available. We + // continue to ignore it and interpret such a configuration as N = len(q). + if (!BN_rand_range_ex(new_priv_key.get(), 1, impl->q.get())) { + OPENSSL_PUT_ERROR(DH, ERR_R_BN_LIB); + return 0; + } + } else { + // If q is unspecified, we expect p to be a safe prime, with g generating + // the (p-1)/2 subgroup. So, we use q = (p-1)/2. (If g generates a smaller + // prime-order subgroup, q will still divide (p-1)/2.) + // + // We set N from |impl->priv_length|. Section 5.6.1.1.4 of SP 800-56A Rev3 + // says to reject N > len(q), or N > num_bits(p) - 1. However, this logic + // originally aligned with PKCS#3, which allows num_bits(p). 
Instead, we + // clamp |impl->priv_length| before invoking the algorithm. + + // Compute M = min(2^N, q). + UniquePtr priv_key_limit(BN_new()); + if (priv_key_limit == nullptr) { + OPENSSL_PUT_ERROR(DH, ERR_R_BN_LIB); + return 0; + } + if (impl->priv_length == 0 || + impl->priv_length >= BN_num_bits(impl->p.get()) - 1) { + // M = q = (p - 1) / 2. + if (!BN_rshift1(priv_key_limit.get(), impl->p.get())) { + OPENSSL_PUT_ERROR(DH, ERR_R_BN_LIB); + return 0; + } + } else { + // M = 2^N. + if (!BN_set_bit(priv_key_limit.get(), impl->priv_length)) { + OPENSSL_PUT_ERROR(DH, ERR_R_BN_LIB); + return 0; + } + } + + // Choose a private key uniformly from [1, M-1]. + if (!BN_rand_range_ex(new_priv_key.get(), 1, priv_key_limit.get())) { + OPENSSL_PUT_ERROR(DH, ERR_R_BN_LIB); + return 0; + } + } + priv_key = new_priv_key.get(); + } + + UniquePtr new_pub_key(BN_new()); + if (new_pub_key == nullptr || + !BN_mod_exp_mont_consttime(new_pub_key.get(), impl->g.get(), priv_key, + impl->p.get(), ctx.get(), + impl->method_mont_p.get())) { + OPENSSL_PUT_ERROR(DH, ERR_R_BN_LIB); + return 0; + } + + impl->pub_key = std::move(new_pub_key); + if (new_priv_key != nullptr) { + impl->priv_key = std::move(new_priv_key); + } + return 1; +} + +static int dh_compute_key(DH *dh, BIGNUM *out_shared_key, + const BIGNUM *peers_key, BN_CTX *ctx) { + auto *impl = FromOpaque(dh); + + if (!dh_check_params_fast(dh)) { + return 0; + } + + if (impl->priv_key == nullptr) { + OPENSSL_PUT_ERROR(DH, DH_R_NO_PRIVATE_VALUE); + return 0; + } + + int check_result; + if (!DH_check_pub_key(dh, peers_key, &check_result) || check_result) { + OPENSSL_PUT_ERROR(DH, DH_R_INVALID_PUBKEY); + return 0; + } + + BN_CTXScope scope(ctx); + BIGNUM *p_minus_1 = BN_CTX_get(ctx); + if (!p_minus_1 || + !BN_MONT_CTX_set_locked(&impl->method_mont_p, &impl->method_mont_p_lock, + impl->p.get(), ctx)) { + return 0; + } + + if (!BN_mod_exp_mont_consttime(out_shared_key, peers_key, + impl->priv_key.get(), impl->p.get(), ctx, + 
impl->method_mont_p.get()) || + !BN_copy(p_minus_1, impl->p.get()) || !BN_sub_word(p_minus_1, 1)) { + OPENSSL_PUT_ERROR(DH, ERR_R_BN_LIB); + return 0; + } + + // This performs the check required by SP 800-56Ar3 section 5.7.1.1 step two. + if (BN_cmp_word(out_shared_key, 1) <= 0 || + BN_cmp(out_shared_key, p_minus_1) == 0) { + OPENSSL_PUT_ERROR(DH, DH_R_INVALID_PUBKEY); + return 0; + } + + return 1; +} + +int bssl::dh_compute_key_padded_no_self_test(unsigned char *out, + const BIGNUM *peers_key, DH *dh) { + UniquePtr ctx(BN_CTX_new()); + if (ctx == nullptr) { + return -1; + } + BN_CTXScope scope(ctx.get()); + int dh_size = DH_size(dh); + BIGNUM *shared_key = BN_CTX_get(ctx.get()); + if (shared_key == nullptr || + !dh_compute_key(dh, shared_key, peers_key, ctx.get()) || + !BN_bn2bin_padded(out, dh_size, shared_key)) { + return -1; + } + return dh_size; +} + +int DH_compute_key_padded(unsigned char *out, const BIGNUM *peers_key, DH *dh) { + boringssl_ensure_ffdh_self_test(); + + return dh_compute_key_padded_no_self_test(out, peers_key, dh); +} + +int DH_compute_key(unsigned char *out, const BIGNUM *peers_key, DH *dh) { + boringssl_ensure_ffdh_self_test(); + + UniquePtr ctx(BN_CTX_new()); + if (ctx == nullptr) { + return -1; + } + BN_CTXScope scope(ctx.get()); + BIGNUM *shared_key = BN_CTX_get(ctx.get()); + if (shared_key == nullptr || + !dh_compute_key(dh, shared_key, peers_key, ctx.get())) { + return -1; + } + // A |BIGNUM|'s byte count fits in |int|. 
+ return static_cast(BN_bn2bin(shared_key, out)); +} + +int DH_compute_key_hashed(DH *dh, uint8_t *out, size_t *out_len, + size_t max_out_len, const BIGNUM *peers_key, + const EVP_MD *digest) { + *out_len = SIZE_MAX; + + const size_t digest_len = EVP_MD_size(digest); + if (digest_len > max_out_len) { + return 0; + } + + FIPS_service_indicator_lock_state(); + + int ret = 0; + const size_t dh_len = DH_size(dh); + uint8_t *shared_bytes = reinterpret_cast(OPENSSL_malloc(dh_len)); + unsigned out_len_unsigned; + if (!shared_bytes || + // SP 800-56A is ambiguous about whether the output should be padded prior + // to revision three. But revision three, section C.1, awkwardly specifies + // padding to the length of p. + // + // Also, padded output avoids side-channels, so is always strongly + // advisable. + DH_compute_key_padded(shared_bytes, peers_key, dh) != (int)dh_len || + !EVP_Digest(shared_bytes, dh_len, out, &out_len_unsigned, digest, + nullptr) || + out_len_unsigned != digest_len) { + goto err; + } + + *out_len = digest_len; + ret = 1; + +err: + FIPS_service_indicator_unlock_state(); + OPENSSL_free(shared_bytes); + return ret; +} + +int DH_size(const DH *dh) { return BN_num_bytes(FromOpaque(dh)->p.get()); } + +int DH_up_ref(DH *dh) { + auto *impl = FromOpaque(dh); + impl->UpRefInternal(); + return 1; +} + +DH *DH_get_rfc7919_2048() { + // This is the prime from https://tools.ietf.org/html/rfc7919#appendix-A.1, + // which is specifically approved for FIPS in appendix D of SP 800-56Ar3. 
+ static const BN_ULONG kFFDHE2048Data[] = { + TOBN(0xffffffff, 0xffffffff), TOBN(0x886b4238, 0x61285c97), + TOBN(0xc6f34a26, 0xc1b2effa), TOBN(0xc58ef183, 0x7d1683b2), + TOBN(0x3bb5fcbc, 0x2ec22005), TOBN(0xc3fe3b1b, 0x4c6fad73), + TOBN(0x8e4f1232, 0xeef28183), TOBN(0x9172fe9c, 0xe98583ff), + TOBN(0xc03404cd, 0x28342f61), TOBN(0x9e02fce1, 0xcdf7e2ec), + TOBN(0x0b07a7c8, 0xee0a6d70), TOBN(0xae56ede7, 0x6372bb19), + TOBN(0x1d4f42a3, 0xde394df4), TOBN(0xb96adab7, 0x60d7f468), + TOBN(0xd108a94b, 0xb2c8e3fb), TOBN(0xbc0ab182, 0xb324fb61), + TOBN(0x30acca4f, 0x483a797a), TOBN(0x1df158a1, 0x36ade735), + TOBN(0xe2a689da, 0xf3efe872), TOBN(0x984f0c70, 0xe0e68b77), + TOBN(0xb557135e, 0x7f57c935), TOBN(0x85636555, 0x3ded1af3), + TOBN(0x2433f51f, 0x5f066ed0), TOBN(0xd3df1ed5, 0xd5fd6561), + TOBN(0xf681b202, 0xaec4617a), TOBN(0x7d2fe363, 0x630c75d8), + TOBN(0xcc939dce, 0x249b3ef9), TOBN(0xa9e13641, 0x146433fb), + TOBN(0xd8b9c583, 0xce2d3695), TOBN(0xafdc5620, 0x273d3cf1), + TOBN(0xadf85458, 0xa2bb4a9a), TOBN(0xffffffff, 0xffffffff), + }; + + UniquePtr ffdhe2048_p(BN_new()); + UniquePtr ffdhe2048_q(BN_new()); + UniquePtr ffdhe2048_g(BN_new()); + UniquePtr dh(DH_new()); + if (!ffdhe2048_p || !ffdhe2048_q || !ffdhe2048_g || !dh) { + return nullptr; + } + + bn_set_static_words(ffdhe2048_p.get(), kFFDHE2048Data, + std::size(kFFDHE2048Data)); + + if (!BN_rshift1(ffdhe2048_q.get(), ffdhe2048_p.get()) || + !BN_set_word(ffdhe2048_g.get(), 2) || + !DH_set0_pqg(dh.get(), ffdhe2048_p.get(), ffdhe2048_q.get(), + ffdhe2048_g.get())) { + return nullptr; + } + // |DH_set0_pqg| takes ownership on success. 
+ ffdhe2048_p.release(); + ffdhe2048_q.release(); + ffdhe2048_g.release(); + + return dh.release(); +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/dh/internal.h b/third_party/boringssl/src/crypto/fipsmodule/dh/internal.h index c40172d6..337afa61 100644 --- a/third_party/boringssl/src/crypto/fipsmodule/dh/internal.h +++ b/third_party/boringssl/src/crypto/fipsmodule/dh/internal.h @@ -1,36 +1,63 @@ -/* Copyright (c) 2022, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ +// Copyright 2022 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
#ifndef OPENSSL_HEADER_CRYPTO_FIPSMODULE_DH_INTERNAL_H #define OPENSSL_HEADER_CRYPTO_FIPSMODULE_DH_INTERNAL_H #include -#if defined(__cplusplus) -extern "C" { -#endif +#include "../../internal.h" +#include "../../mem_internal.h" +DECLARE_OPAQUE_STRUCT(dh_st, DHImpl) + +BSSL_NAMESPACE_BEGIN + +class DHImpl : public dh_st, public RefCounted { + public: + DHImpl() : RefCounted(CheckSubClass()) {} + + UniquePtr p; + UniquePtr g; + UniquePtr q; + UniquePtr pub_key; // g^x mod p + UniquePtr priv_key; // x + + // priv_length contains the length, in bits, of the private value. If zero, + // the private value will be the same length as |p|. + unsigned priv_length = 0; + + mutable Mutex method_mont_p_lock; + mutable UniquePtr method_mont_p; + + private: + friend RefCounted; + ~DHImpl() = default; +}; + +// dh_check_params_fast checks basic invariants on |dh|'s domain parameters. It +// does not check that |dh| forms a valid group, only that the sizes are within +// DoS bounds. +int dh_check_params_fast(const DH *dh); + // dh_compute_key_padded_no_self_test does the same as |DH_compute_key_padded|, // but doesn't try to run the self-test first. This is for use in the self tests // themselves, to prevent an infinite loop. int dh_compute_key_padded_no_self_test(unsigned char *out, const BIGNUM *peers_key, DH *dh); - -#if defined(__cplusplus) -} -#endif +BSSL_NAMESPACE_END #endif // OPENSSL_HEADER_CRYPTO_FIPSMODULE_DH_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/fipsmodule/digest/digest.c b/third_party/boringssl/src/crypto/fipsmodule/digest/digest.c deleted file mode 100644 index cb723d6c..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/digest/digest.c +++ /dev/null @@ -1,287 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. 
- * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. 
If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
*/ - -#include - -#include -#include - -#include -#include - -#include "internal.h" -#include "../../internal.h" - - -int EVP_MD_type(const EVP_MD *md) { return md->type; } - -int EVP_MD_nid(const EVP_MD *md) { return EVP_MD_type(md); } - -uint32_t EVP_MD_flags(const EVP_MD *md) { return md->flags; } - -size_t EVP_MD_size(const EVP_MD *md) { return md->md_size; } - -size_t EVP_MD_block_size(const EVP_MD *md) { return md->block_size; } - - -void EVP_MD_CTX_init(EVP_MD_CTX *ctx) { - OPENSSL_memset(ctx, 0, sizeof(EVP_MD_CTX)); -} - -EVP_MD_CTX *EVP_MD_CTX_new(void) { - EVP_MD_CTX *ctx = OPENSSL_malloc(sizeof(EVP_MD_CTX)); - - if (ctx) { - EVP_MD_CTX_init(ctx); - } - - return ctx; -} - -EVP_MD_CTX *EVP_MD_CTX_create(void) { return EVP_MD_CTX_new(); } - -int EVP_MD_CTX_cleanup(EVP_MD_CTX *ctx) { - OPENSSL_free(ctx->md_data); - - assert(ctx->pctx == NULL || ctx->pctx_ops != NULL); - if (ctx->pctx_ops) { - ctx->pctx_ops->free(ctx->pctx); - } - - EVP_MD_CTX_init(ctx); - - return 1; -} - -void EVP_MD_CTX_cleanse(EVP_MD_CTX *ctx) { - OPENSSL_cleanse(ctx->md_data, ctx->digest->ctx_size); - EVP_MD_CTX_cleanup(ctx); -} - -void EVP_MD_CTX_free(EVP_MD_CTX *ctx) { - if (!ctx) { - return; - } - - EVP_MD_CTX_cleanup(ctx); - OPENSSL_free(ctx); -} - -void EVP_MD_CTX_destroy(EVP_MD_CTX *ctx) { EVP_MD_CTX_free(ctx); } - -int EVP_DigestFinalXOF(EVP_MD_CTX *ctx, uint8_t *out, size_t len) { - OPENSSL_PUT_ERROR(DIGEST, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); - return 0; -} - -uint32_t EVP_MD_meth_get_flags(const EVP_MD *md) { return EVP_MD_flags(md); } - -void EVP_MD_CTX_set_flags(EVP_MD_CTX *ctx, int flags) {} - -int EVP_MD_CTX_copy_ex(EVP_MD_CTX *out, const EVP_MD_CTX *in) { - // |in->digest| may be NULL if this is a signing |EVP_MD_CTX| for, e.g., - // Ed25519 which does not hash with |EVP_MD_CTX|. 
- if (in == NULL || (in->pctx == NULL && in->digest == NULL)) { - OPENSSL_PUT_ERROR(DIGEST, DIGEST_R_INPUT_NOT_INITIALIZED); - return 0; - } - - EVP_PKEY_CTX *pctx = NULL; - assert(in->pctx == NULL || in->pctx_ops != NULL); - if (in->pctx) { - pctx = in->pctx_ops->dup(in->pctx); - if (!pctx) { - OPENSSL_PUT_ERROR(DIGEST, ERR_R_MALLOC_FAILURE); - return 0; - } - } - - uint8_t *tmp_buf = NULL; - if (in->digest != NULL) { - if (out->digest != in->digest) { - assert(in->digest->ctx_size != 0); - tmp_buf = OPENSSL_malloc(in->digest->ctx_size); - if (tmp_buf == NULL) { - if (pctx) { - in->pctx_ops->free(pctx); - } - OPENSSL_PUT_ERROR(DIGEST, ERR_R_MALLOC_FAILURE); - return 0; - } - } else { - // |md_data| will be the correct size in this case. It's removed from - // |out| so that |EVP_MD_CTX_cleanup| doesn't free it, and then it's - // reused. - tmp_buf = out->md_data; - out->md_data = NULL; - } - } - - EVP_MD_CTX_cleanup(out); - - out->digest = in->digest; - out->md_data = tmp_buf; - if (in->digest != NULL) { - OPENSSL_memcpy(out->md_data, in->md_data, in->digest->ctx_size); - } - out->pctx = pctx; - out->pctx_ops = in->pctx_ops; - assert(out->pctx == NULL || out->pctx_ops != NULL); - - return 1; -} - -void EVP_MD_CTX_move(EVP_MD_CTX *out, EVP_MD_CTX *in) { - EVP_MD_CTX_cleanup(out); - // While not guaranteed, |EVP_MD_CTX| is currently safe to move with |memcpy|. 
- OPENSSL_memcpy(out, in, sizeof(EVP_MD_CTX)); - EVP_MD_CTX_init(in); -} - -int EVP_MD_CTX_copy(EVP_MD_CTX *out, const EVP_MD_CTX *in) { - EVP_MD_CTX_init(out); - return EVP_MD_CTX_copy_ex(out, in); -} - -int EVP_MD_CTX_reset(EVP_MD_CTX *ctx) { - EVP_MD_CTX_cleanup(ctx); - EVP_MD_CTX_init(ctx); - return 1; -} - -int EVP_DigestInit_ex(EVP_MD_CTX *ctx, const EVP_MD *type, ENGINE *engine) { - if (ctx->digest != type) { - assert(type->ctx_size != 0); - uint8_t *md_data = OPENSSL_malloc(type->ctx_size); - if (md_data == NULL) { - OPENSSL_PUT_ERROR(DIGEST, ERR_R_MALLOC_FAILURE); - return 0; - } - - OPENSSL_free(ctx->md_data); - ctx->md_data = md_data; - ctx->digest = type; - } - - assert(ctx->pctx == NULL || ctx->pctx_ops != NULL); - - ctx->digest->init(ctx); - return 1; -} - -int EVP_DigestInit(EVP_MD_CTX *ctx, const EVP_MD *type) { - EVP_MD_CTX_init(ctx); - return EVP_DigestInit_ex(ctx, type, NULL); -} - -int EVP_DigestUpdate(EVP_MD_CTX *ctx, const void *data, size_t len) { - ctx->digest->update(ctx, data, len); - return 1; -} - -int EVP_DigestFinal_ex(EVP_MD_CTX *ctx, uint8_t *md_out, unsigned int *size) { - assert(ctx->digest->md_size <= EVP_MAX_MD_SIZE); - ctx->digest->final(ctx, md_out); - if (size != NULL) { - *size = ctx->digest->md_size; - } - OPENSSL_cleanse(ctx->md_data, ctx->digest->ctx_size); - return 1; -} - -int EVP_DigestFinal(EVP_MD_CTX *ctx, uint8_t *md, unsigned int *size) { - (void)EVP_DigestFinal_ex(ctx, md, size); - EVP_MD_CTX_cleanup(ctx); - return 1; -} - -int EVP_Digest(const void *data, size_t count, uint8_t *out_md, - unsigned int *out_size, const EVP_MD *type, ENGINE *impl) { - EVP_MD_CTX ctx; - int ret; - - EVP_MD_CTX_init(&ctx); - ret = EVP_DigestInit_ex(&ctx, type, impl) && - EVP_DigestUpdate(&ctx, data, count) && - EVP_DigestFinal_ex(&ctx, out_md, out_size); - EVP_MD_CTX_cleanup(&ctx); - - return ret; -} - - -const EVP_MD *EVP_MD_CTX_md(const EVP_MD_CTX *ctx) { - if (ctx == NULL) { - return NULL; - } - return ctx->digest; -} - -size_t 
EVP_MD_CTX_size(const EVP_MD_CTX *ctx) { - return EVP_MD_size(EVP_MD_CTX_md(ctx)); -} - -size_t EVP_MD_CTX_block_size(const EVP_MD_CTX *ctx) { - return EVP_MD_block_size(EVP_MD_CTX_md(ctx)); -} - -int EVP_MD_CTX_type(const EVP_MD_CTX *ctx) { - return EVP_MD_type(EVP_MD_CTX_md(ctx)); -} - -int EVP_add_digest(const EVP_MD *digest) { - return 1; -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/digest/digest.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/digest/digest.cc.inc new file mode 100644 index 00000000..49b067b7 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/digest/digest.cc.inc @@ -0,0 +1,223 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include + +#include +#include + +#include "../../internal.h" +#include "../../mem_internal.h" +#include "internal.h" + + +using namespace bssl; + +int EVP_MD_type(const EVP_MD *md) { return md->type; } + +int EVP_MD_nid(const EVP_MD *md) { return EVP_MD_type(md); } + +uint32_t EVP_MD_flags(const EVP_MD *md) { return md->flags; } + +size_t EVP_MD_size(const EVP_MD *md) { return md->md_size; } + +size_t EVP_MD_block_size(const EVP_MD *md) { return md->block_size; } + + +void EVP_MD_CTX_init(EVP_MD_CTX *ctx) { + ctx->digest = nullptr; + ctx->pctx = nullptr; + ctx->pctx_ops = nullptr; +} + +EVP_MD_CTX *EVP_MD_CTX_new() { + EVP_MD_CTX *ctx = New(); + + if (ctx) { + EVP_MD_CTX_init(ctx); + } + + return ctx; +} + +EVP_MD_CTX *EVP_MD_CTX_create() { return EVP_MD_CTX_new(); } + +int EVP_MD_CTX_cleanup(EVP_MD_CTX *ctx) { + assert(ctx->pctx == nullptr || ctx->pctx_ops != nullptr); + if (ctx->pctx_ops) { + ctx->pctx_ops->free(ctx->pctx); + } + + EVP_MD_CTX_init(ctx); + + return 1; +} + +void EVP_MD_CTX_cleanse(EVP_MD_CTX *ctx) { + OPENSSL_cleanse(ctx->md_data, sizeof(ctx->md_data)); + EVP_MD_CTX_cleanup(ctx); +} + +void EVP_MD_CTX_free(EVP_MD_CTX *ctx) { + if (!ctx) { + return; + } + + EVP_MD_CTX_cleanup(ctx); + Delete(ctx); +} + +void EVP_MD_CTX_destroy(EVP_MD_CTX *ctx) { EVP_MD_CTX_free(ctx); } + +int EVP_DigestFinalXOF(EVP_MD_CTX *ctx, uint8_t *out, size_t len) { + OPENSSL_PUT_ERROR(DIGEST, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); + return 0; +} + +uint32_t EVP_MD_meth_get_flags(const EVP_MD *md) { return EVP_MD_flags(md); } + +void EVP_MD_CTX_set_flags(EVP_MD_CTX *ctx, int flags) {} + +int EVP_MD_CTX_copy_ex(EVP_MD_CTX *out, const EVP_MD_CTX *in) { + // |in->digest| may be NULL if this is a signing |EVP_MD_CTX| for, e.g., + // Ed25519 which does not hash with |EVP_MD_CTX|. 
+ if (in == nullptr || (in->pctx == nullptr && in->digest == nullptr)) { + OPENSSL_PUT_ERROR(DIGEST, DIGEST_R_INPUT_NOT_INITIALIZED); + return 0; + } + if (out == in) { + OPENSSL_PUT_ERROR(DIGEST, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); + return 0; + } + + EVP_PKEY_CTX *pctx = nullptr; + assert(in->pctx == nullptr || in->pctx_ops != nullptr); + if (in->pctx) { + pctx = in->pctx_ops->dup(in->pctx); + if (!pctx) { + return 0; + } + } + + EVP_MD_CTX_cleanup(out); + + out->digest = in->digest; + if (in->digest != nullptr) { + OPENSSL_memcpy(out->md_data, in->md_data, in->digest->ctx_size); + } + out->pctx = pctx; + out->pctx_ops = in->pctx_ops; + assert(out->pctx == nullptr || out->pctx_ops != nullptr); + + return 1; +} + +void EVP_MD_CTX_move(EVP_MD_CTX *out, EVP_MD_CTX *in) { + EVP_MD_CTX_cleanup(out); + // While not guaranteed, |EVP_MD_CTX| is currently safe to move with |memcpy|. + // bssl-crypto currently relies on this, however, so if we change this, we + // need to box the |HMAC_CTX|. (Relying on this is only fine because we assume + // BoringSSL and bssl-crypto will always be updated atomically. We do not + // allow any version skew between the two.) 
+ OPENSSL_memcpy(out, in, sizeof(EVP_MD_CTX)); + EVP_MD_CTX_init(in); +} + +int EVP_MD_CTX_copy(EVP_MD_CTX *out, const EVP_MD_CTX *in) { + EVP_MD_CTX_init(out); + return EVP_MD_CTX_copy_ex(out, in); +} + +int EVP_MD_CTX_reset(EVP_MD_CTX *ctx) { + EVP_MD_CTX_cleanup(ctx); + EVP_MD_CTX_init(ctx); + return 1; +} + +int EVP_DigestInit_ex(EVP_MD_CTX *ctx, const EVP_MD *type, ENGINE *engine) { + if (ctx->digest != type) { + assert(type->ctx_size != 0); + assert(type->ctx_size <= sizeof(ctx->md_data)); + ctx->digest = type; + } + + assert(ctx->pctx == nullptr || ctx->pctx_ops != nullptr); + + ctx->digest->init(ctx); + return 1; +} + +int EVP_DigestInit(EVP_MD_CTX *ctx, const EVP_MD *type) { + EVP_MD_CTX_init(ctx); + return EVP_DigestInit_ex(ctx, type, nullptr); +} + +int EVP_DigestUpdate(EVP_MD_CTX *ctx, const void *data, size_t len) { + ctx->digest->update(ctx, data, len); + return 1; +} + +int EVP_DigestFinal_ex(EVP_MD_CTX *ctx, uint8_t *md_out, unsigned int *size) { + assert(ctx->digest->md_size <= EVP_MAX_MD_SIZE); + ctx->digest->final(ctx, md_out); + if (size != nullptr) { + *size = ctx->digest->md_size; + } + OPENSSL_cleanse(ctx->md_data, ctx->digest->ctx_size); + return 1; +} + +int EVP_DigestFinal(EVP_MD_CTX *ctx, uint8_t *md, unsigned int *size) { + (void)EVP_DigestFinal_ex(ctx, md, size); + EVP_MD_CTX_cleanup(ctx); + return 1; +} + +int EVP_Digest(const void *data, size_t count, uint8_t *out_md, + unsigned int *out_size, const EVP_MD *type, ENGINE *impl) { + ScopedEVP_MD_CTX ctx; + return EVP_DigestInit_ex(ctx.get(), type, impl) && + EVP_DigestUpdate(ctx.get(), data, count) && + EVP_DigestFinal_ex(ctx.get(), out_md, out_size); +} + +const EVP_MD *EVP_MD_CTX_get0_md(const EVP_MD_CTX *ctx) { + if (ctx == nullptr) { + return nullptr; + } + return ctx->digest; +} + +const EVP_MD *EVP_MD_CTX_md(const EVP_MD_CTX *ctx) { + return EVP_MD_CTX_get0_md(ctx); +} + +size_t EVP_MD_CTX_size(const EVP_MD_CTX *ctx) { + return EVP_MD_size(EVP_MD_CTX_get0_md(ctx)); +} + +size_t 
EVP_MD_CTX_block_size(const EVP_MD_CTX *ctx) { + return EVP_MD_block_size(EVP_MD_CTX_get0_md(ctx)); +} + +int EVP_MD_CTX_type(const EVP_MD_CTX *ctx) { + return EVP_MD_type(EVP_MD_CTX_get0_md(ctx)); +} + +EVP_PKEY_CTX *EVP_MD_CTX_pkey_ctx(const EVP_MD_CTX *ctx) { return ctx->pctx; } + +int EVP_add_digest(const EVP_MD *digest) { return 1; } diff --git a/third_party/boringssl/src/crypto/fipsmodule/digest/digests.c b/third_party/boringssl/src/crypto/fipsmodule/digest/digests.c deleted file mode 100644 index f006ebbc..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/digest/digests.c +++ /dev/null @@ -1,304 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
*/ - -#include - -#include -#include - -#include -#include -#include -#include - -#include "internal.h" -#include "../delocate.h" -#include "../../internal.h" - -#if defined(NDEBUG) -#define CHECK(x) (void) (x) -#else -#define CHECK(x) assert(x) -#endif - - -static void md4_init(EVP_MD_CTX *ctx) { - CHECK(MD4_Init(ctx->md_data)); -} - -static void md4_update(EVP_MD_CTX *ctx, const void *data, size_t count) { - CHECK(MD4_Update(ctx->md_data, data, count)); -} - -static void md4_final(EVP_MD_CTX *ctx, uint8_t *out) { - CHECK(MD4_Final(out, ctx->md_data)); -} - -DEFINE_METHOD_FUNCTION(EVP_MD, EVP_md4) { - out->type = NID_md4; - out->md_size = MD4_DIGEST_LENGTH; - out->flags = 0; - out->init = md4_init; - out->update = md4_update; - out->final = md4_final; - out->block_size = 64; - out->ctx_size = sizeof(MD4_CTX); -} - - -static void md5_init(EVP_MD_CTX *ctx) { - CHECK(MD5_Init(ctx->md_data)); -} - -static void md5_update(EVP_MD_CTX *ctx, const void *data, size_t count) { - CHECK(MD5_Update(ctx->md_data, data, count)); -} - -static void md5_final(EVP_MD_CTX *ctx, uint8_t *out) { - CHECK(MD5_Final(out, ctx->md_data)); -} - -DEFINE_METHOD_FUNCTION(EVP_MD, EVP_md5) { - out->type = NID_md5; - out->md_size = MD5_DIGEST_LENGTH; - out->flags = 0; - out->init = md5_init; - out->update = md5_update; - out->final = md5_final; - out->block_size = 64; - out->ctx_size = sizeof(MD5_CTX); -} - - -static void sha1_init(EVP_MD_CTX *ctx) { - CHECK(SHA1_Init(ctx->md_data)); -} - -static void sha1_update(EVP_MD_CTX *ctx, const void *data, size_t count) { - CHECK(SHA1_Update(ctx->md_data, data, count)); -} - -static void sha1_final(EVP_MD_CTX *ctx, uint8_t *md) { - CHECK(SHA1_Final(md, ctx->md_data)); -} - -DEFINE_METHOD_FUNCTION(EVP_MD, EVP_sha1) { - out->type = NID_sha1; - out->md_size = SHA_DIGEST_LENGTH; - out->flags = 0; - out->init = sha1_init; - out->update = sha1_update; - out->final = sha1_final; - out->block_size = 64; - out->ctx_size = sizeof(SHA_CTX); -} - - -static void 
sha224_init(EVP_MD_CTX *ctx) { - CHECK(SHA224_Init(ctx->md_data)); -} - -static void sha224_update(EVP_MD_CTX *ctx, const void *data, size_t count) { - CHECK(SHA224_Update(ctx->md_data, data, count)); -} - -static void sha224_final(EVP_MD_CTX *ctx, uint8_t *md) { - CHECK(SHA224_Final(md, ctx->md_data)); -} - -DEFINE_METHOD_FUNCTION(EVP_MD, EVP_sha224) { - out->type = NID_sha224; - out->md_size = SHA224_DIGEST_LENGTH; - out->flags = 0; - out->init = sha224_init; - out->update = sha224_update; - out->final = sha224_final; - out->block_size = 64; - out->ctx_size = sizeof(SHA256_CTX); -} - - -static void sha256_init(EVP_MD_CTX *ctx) { - CHECK(SHA256_Init(ctx->md_data)); -} - -static void sha256_update(EVP_MD_CTX *ctx, const void *data, size_t count) { - CHECK(SHA256_Update(ctx->md_data, data, count)); -} - -static void sha256_final(EVP_MD_CTX *ctx, uint8_t *md) { - CHECK(SHA256_Final(md, ctx->md_data)); -} - -DEFINE_METHOD_FUNCTION(EVP_MD, EVP_sha256) { - out->type = NID_sha256; - out->md_size = SHA256_DIGEST_LENGTH; - out->flags = 0; - out->init = sha256_init; - out->update = sha256_update; - out->final = sha256_final; - out->block_size = 64; - out->ctx_size = sizeof(SHA256_CTX); -} - - -static void sha384_init(EVP_MD_CTX *ctx) { - CHECK(SHA384_Init(ctx->md_data)); -} - -static void sha384_update(EVP_MD_CTX *ctx, const void *data, size_t count) { - CHECK(SHA384_Update(ctx->md_data, data, count)); -} - -static void sha384_final(EVP_MD_CTX *ctx, uint8_t *md) { - CHECK(SHA384_Final(md, ctx->md_data)); -} - -DEFINE_METHOD_FUNCTION(EVP_MD, EVP_sha384) { - out->type = NID_sha384; - out->md_size = SHA384_DIGEST_LENGTH; - out->flags = 0; - out->init = sha384_init; - out->update = sha384_update; - out->final = sha384_final; - out->block_size = 128; - out->ctx_size = sizeof(SHA512_CTX); -} - - -static void sha512_init(EVP_MD_CTX *ctx) { - CHECK(SHA512_Init(ctx->md_data)); -} - -static void sha512_update(EVP_MD_CTX *ctx, const void *data, size_t count) { - 
CHECK(SHA512_Update(ctx->md_data, data, count)); -} - -static void sha512_final(EVP_MD_CTX *ctx, uint8_t *md) { - CHECK(SHA512_Final(md, ctx->md_data)); -} - -DEFINE_METHOD_FUNCTION(EVP_MD, EVP_sha512) { - out->type = NID_sha512; - out->md_size = SHA512_DIGEST_LENGTH; - out->flags = 0; - out->init = sha512_init; - out->update = sha512_update; - out->final = sha512_final; - out->block_size = 128; - out->ctx_size = sizeof(SHA512_CTX); -} - - -static void sha512_256_init(EVP_MD_CTX *ctx) { - CHECK(SHA512_256_Init(ctx->md_data)); -} - -static void sha512_256_update(EVP_MD_CTX *ctx, const void *data, size_t count) { - CHECK(SHA512_256_Update(ctx->md_data, data, count)); -} - -static void sha512_256_final(EVP_MD_CTX *ctx, uint8_t *md) { - CHECK(SHA512_256_Final(md, ctx->md_data)); -} - -DEFINE_METHOD_FUNCTION(EVP_MD, EVP_sha512_256) { - out->type = NID_sha512_256; - out->md_size = SHA512_256_DIGEST_LENGTH; - out->flags = 0; - out->init = sha512_256_init; - out->update = sha512_256_update; - out->final = sha512_256_final; - out->block_size = 128; - out->ctx_size = sizeof(SHA512_CTX); -} - - -typedef struct { - MD5_CTX md5; - SHA_CTX sha1; -} MD5_SHA1_CTX; - -static void md5_sha1_init(EVP_MD_CTX *md_ctx) { - MD5_SHA1_CTX *ctx = md_ctx->md_data; - CHECK(MD5_Init(&ctx->md5) && SHA1_Init(&ctx->sha1)); -} - -static void md5_sha1_update(EVP_MD_CTX *md_ctx, const void *data, - size_t count) { - MD5_SHA1_CTX *ctx = md_ctx->md_data; - CHECK(MD5_Update(&ctx->md5, data, count) && - SHA1_Update(&ctx->sha1, data, count)); -} - -static void md5_sha1_final(EVP_MD_CTX *md_ctx, uint8_t *out) { - MD5_SHA1_CTX *ctx = md_ctx->md_data; - CHECK(MD5_Final(out, &ctx->md5) && - SHA1_Final(out + MD5_DIGEST_LENGTH, &ctx->sha1)); -} - -DEFINE_METHOD_FUNCTION(EVP_MD, EVP_md5_sha1) { - out->type = NID_md5_sha1; - out->md_size = MD5_DIGEST_LENGTH + SHA_DIGEST_LENGTH; - out->flags = 0; - out->init = md5_sha1_init; - out->update = md5_sha1_update; - out->final = md5_sha1_final; - out->block_size = 64; - 
out->ctx_size = sizeof(MD5_SHA1_CTX); -} - -#undef CHECK diff --git a/third_party/boringssl/src/crypto/fipsmodule/digest/digests.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/digest/digests.cc.inc new file mode 100644 index 00000000..a246a511 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/digest/digests.cc.inc @@ -0,0 +1,184 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + +#include + +#include "../../internal.h" +#include "../bcm_interface.h" +#include "../delocate.h" +#include "internal.h" + + +using namespace bssl; + +#if defined(NDEBUG) +#define CHECK(x) (void)(x) +#else +#define CHECK(x) assert(x) +#endif + +static void sha1_init(EVP_MD_CTX *ctx) { + BCM_sha1_init(reinterpret_cast(ctx->md_data)); +} + +static void sha1_update(EVP_MD_CTX *ctx, const void *data, size_t count) { + BCM_sha1_update(reinterpret_cast(ctx->md_data), data, count); +} + +static void sha1_final(EVP_MD_CTX *ctx, uint8_t *md) { + BCM_sha1_final(md, reinterpret_cast(ctx->md_data)); +} + +DEFINE_METHOD_FUNCTION(EVP_MD, EVP_sha1) { + out->type = NID_sha1; + out->md_size = SHA_DIGEST_LENGTH; + out->flags = 0; + out->init = sha1_init; + out->update = sha1_update; + out->final = sha1_final; + out->block_size = 64; + out->ctx_size = sizeof(SHA_CTX); +} + +static_assert(sizeof(SHA_CTX) <= EVP_MAX_MD_DATA_SIZE); + + +static void sha224_init(EVP_MD_CTX 
*ctx) { + BCM_sha224_init(reinterpret_cast(ctx->md_data)); +} + +static void sha224_update(EVP_MD_CTX *ctx, const void *data, size_t count) { + BCM_sha224_update(reinterpret_cast(ctx->md_data), data, count); +} + +static void sha224_final(EVP_MD_CTX *ctx, uint8_t *md) { + BCM_sha224_final(md, reinterpret_cast(ctx->md_data)); +} + +DEFINE_METHOD_FUNCTION(EVP_MD, EVP_sha224) { + out->type = NID_sha224; + out->md_size = SHA224_DIGEST_LENGTH; + out->flags = 0; + out->init = sha224_init; + out->update = sha224_update; + out->final = sha224_final; + out->block_size = 64; + out->ctx_size = sizeof(SHA256_CTX); +} + +static_assert(sizeof(SHA256_CTX) <= EVP_MAX_MD_DATA_SIZE); + +static void sha256_init(EVP_MD_CTX *ctx) { + BCM_sha256_init(reinterpret_cast(ctx->md_data)); +} + +static void sha256_update(EVP_MD_CTX *ctx, const void *data, size_t count) { + BCM_sha256_update(reinterpret_cast(ctx->md_data), data, count); +} + +static void sha256_final(EVP_MD_CTX *ctx, uint8_t *md) { + BCM_sha256_final(md, reinterpret_cast(ctx->md_data)); +} + +DEFINE_METHOD_FUNCTION(EVP_MD, EVP_sha256) { + out->type = NID_sha256; + out->md_size = SHA256_DIGEST_LENGTH; + out->flags = 0; + out->init = sha256_init; + out->update = sha256_update; + out->final = sha256_final; + out->block_size = 64; + out->ctx_size = sizeof(SHA256_CTX); +} + + +static void sha384_init(EVP_MD_CTX *ctx) { + BCM_sha384_init(reinterpret_cast(ctx->md_data)); +} + +static void sha384_update(EVP_MD_CTX *ctx, const void *data, size_t count) { + BCM_sha384_update(reinterpret_cast(ctx->md_data), data, count); +} + +static void sha384_final(EVP_MD_CTX *ctx, uint8_t *md) { + BCM_sha384_final(md, reinterpret_cast(ctx->md_data)); +} + +DEFINE_METHOD_FUNCTION(EVP_MD, EVP_sha384) { + out->type = NID_sha384; + out->md_size = SHA384_DIGEST_LENGTH; + out->flags = 0; + out->init = sha384_init; + out->update = sha384_update; + out->final = sha384_final; + out->block_size = 128; + out->ctx_size = sizeof(SHA512_CTX); +} + 
+static_assert(sizeof(SHA512_CTX) <= EVP_MAX_MD_DATA_SIZE); + +static void sha512_init(EVP_MD_CTX *ctx) { + BCM_sha512_init(reinterpret_cast(ctx->md_data)); +} + +static void sha512_update(EVP_MD_CTX *ctx, const void *data, size_t count) { + BCM_sha512_update(reinterpret_cast(ctx->md_data), data, count); +} + +static void sha512_final(EVP_MD_CTX *ctx, uint8_t *md) { + BCM_sha512_final(md, reinterpret_cast(ctx->md_data)); +} + +DEFINE_METHOD_FUNCTION(EVP_MD, EVP_sha512) { + out->type = NID_sha512; + out->md_size = SHA512_DIGEST_LENGTH; + out->flags = 0; + out->init = sha512_init; + out->update = sha512_update; + out->final = sha512_final; + out->block_size = 128; + out->ctx_size = sizeof(SHA512_CTX); +} + + +static void sha512_256_init(EVP_MD_CTX *ctx) { + BCM_sha512_256_init(reinterpret_cast(ctx->md_data)); +} + +static void sha512_256_update(EVP_MD_CTX *ctx, const void *data, size_t count) { + BCM_sha512_256_update(reinterpret_cast(ctx->md_data), data, + count); +} + +static void sha512_256_final(EVP_MD_CTX *ctx, uint8_t *md) { + BCM_sha512_256_final(md, reinterpret_cast(ctx->md_data)); +} + +DEFINE_METHOD_FUNCTION(EVP_MD, EVP_sha512_256) { + out->type = NID_sha512_256; + out->md_size = SHA512_256_DIGEST_LENGTH; + out->flags = 0; + out->init = sha512_256_init; + out->update = sha512_256_update; + out->final = sha512_256_final; + out->block_size = 128; + out->ctx_size = sizeof(SHA512_CTX); +} + +#undef CHECK diff --git a/third_party/boringssl/src/crypto/fipsmodule/digest/internal.h b/third_party/boringssl/src/crypto/fipsmodule/digest/internal.h index 2d06ed07..1174fcc5 100644 --- a/third_party/boringssl/src/crypto/fipsmodule/digest/internal.h +++ b/third_party/boringssl/src/crypto/fipsmodule/digest/internal.h @@ -1,61 +1,19 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). 
- * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. 
If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#ifndef OPENSSL_HEADER_DIGEST_INTERNAL_H -#define OPENSSL_HEADER_DIGEST_INTERNAL_H +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef OPENSSL_HEADER_CRYPTO_FIPSMODULE_DIGEST_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_FIPSMODULE_DIGEST_INTERNAL_H #include @@ -64,8 +22,10 @@ extern "C" { #endif +// env_md_st is typoed ("evp" -> "env"), but the typo comes from OpenSSL and +// some consumers forward-declare these structures so we're leaving it alone. struct env_md_st { - // type contains a NID identifing the digest function. (For example, + // type contains a NID identifying the digest function. (For example, // NID_md5.) int type; @@ -97,11 +57,11 @@ struct env_md_st { struct evp_md_pctx_ops { // free is called when an |EVP_MD_CTX| is being freed and the |pctx| also // needs to be freed. - void (*free) (EVP_PKEY_CTX *pctx); + void (*free)(EVP_PKEY_CTX *pctx); // dup is called when an |EVP_MD_CTX| is copied and so the |pctx| also needs // to be copied. - EVP_PKEY_CTX* (*dup) (EVP_PKEY_CTX *pctx); + EVP_PKEY_CTX *(*dup)(EVP_PKEY_CTX *pctx); }; @@ -109,4 +69,4 @@ struct evp_md_pctx_ops { } // extern C #endif -#endif // OPENSSL_HEADER_DIGEST_INTERNAL +#endif // OPENSSL_HEADER_CRYPTO_FIPSMODULE_DIGEST_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/fipsmodule/digest/md32_common.h b/third_party/boringssl/src/crypto/fipsmodule/digest/md32_common.h index 129ec48f..9e237b2b 100644 --- a/third_party/boringssl/src/crypto/fipsmodule/digest/md32_common.h +++ b/third_party/boringssl/src/crypto/fipsmodule/digest/md32_common.h @@ -1,63 +1,28 @@ -/* ==================================================================== - * Copyright (c) 1999-2007 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * licensing@OpenSSL.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. 
- * ==================================================================== */ - -#ifndef OPENSSL_HEADER_DIGEST_MD32_COMMON_H -#define OPENSSL_HEADER_DIGEST_MD32_COMMON_H +// Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_CRYPTO_FIPSMODULE_DIGEST_MD32_COMMON_H +#define OPENSSL_HEADER_CRYPTO_FIPSMODULE_DIGEST_MD32_COMMON_H #include +#include #include #include "../../internal.h" -#if defined(__cplusplus) -extern "C" { -#endif +BSSL_NAMESPACE_BEGIN // This is a generic 32-bit "collector" for message digest algorithms. It @@ -82,70 +47,80 @@ extern "C" { // |h| is the hash state and is updated by a function of type // |crypto_md32_block_func|. |data| is the partial unprocessed block and has // |num| bytes. |Nl| and |Nh| maintain the number of bits processed so far. - -// A crypto_md32_block_func should incorporate |num_blocks| of input from |data| -// into |state|. It is assumed the caller has sized |state| and |data| for the -// hash function. -typedef void (*crypto_md32_block_func)(uint32_t *state, const uint8_t *data, - size_t num_blocks); - -// crypto_md32_update adds |len| bytes from |in| to the digest. |data| must be a -// buffer of length |block_size| with the first |*num| bytes containing a -// partial block. This function combines the partial block with |in| and -// incorporates any complete blocks into the digest state |h|. 
It then updates -// |data| and |*num| with the new partial block and updates |*Nh| and |*Nl| with -// the data consumed. -static inline void crypto_md32_update(crypto_md32_block_func block_func, - uint32_t *h, uint8_t *data, - size_t block_size, unsigned *num, - uint32_t *Nh, uint32_t *Nl, - const uint8_t *in, size_t len) { - if (len == 0) { +// +// The template parameter is then a traits struct defined as follows: +// +// struct HashTraits { +// // HashContext is the hash type defined above. +// using HashContext = _CTX; +// +// // kBlockSize is the block size of the hash function. +// static constexpr size_t kBlockSize = ; +// +// // kLengthIsBigEndian determines whether the final length is encoded in +// // big or little endian. +// static constexpr bool kLengthIsBigEndian = ...; +// +// // HashBlocks incorporates |num_blocks| blocks of input from |data| +// // into |state|. It is assumed the caller has sized |state| and |data| +// // for the hash function. +// static void HashBlocks(uint32_t *state, const uint8_t *data, +// size_t num_blocks) { +// _block_data_order(state, data, num_blocks); +// } +// }; +// +// The reason for this formulation is to encourage the compiler to specialize +// all the code for the block size and block function. + +// crypto_md32_update hashes |in| to |ctx|. +template +inline void crypto_md32_update(typename Traits::HashContext *ctx, + Span in) { + static_assert(Traits::kBlockSize == sizeof(ctx->data), "block size is wrong"); + if (in.empty()) { return; } - uint32_t l = *Nl + (((uint32_t)len) << 3); - if (l < *Nl) { + uint32_t l = ctx->Nl + ((static_cast(in.size())) << 3); + if (l < ctx->Nl) { // Handle carries. 
- (*Nh)++; + ctx->Nh++; } - *Nh += (uint32_t)(len >> 29); - *Nl = l; + ctx->Nh += static_cast(in.size() >> 29); + ctx->Nl = l; - size_t n = *num; + size_t n = ctx->num; if (n != 0) { - if (len >= block_size || len + n >= block_size) { - OPENSSL_memcpy(data + n, in, block_size - n); - block_func(h, data, 1); - n = block_size - n; - in += n; - len -= n; - *num = 0; + if (in.size() >= Traits::kBlockSize || + in.size() + n >= Traits::kBlockSize) { + OPENSSL_memcpy(ctx->data + n, in.data(), Traits::kBlockSize - n); + Traits::HashBlocks(ctx->h, ctx->data, 1); + in = in.subspan(Traits::kBlockSize - n); + ctx->num = 0; // Keep |data| zeroed when unused. - OPENSSL_memset(data, 0, block_size); + OPENSSL_memset(ctx->data, 0, Traits::kBlockSize); } else { - OPENSSL_memcpy(data + n, in, len); - *num += (unsigned)len; + OPENSSL_memcpy(ctx->data + n, in.data(), in.size()); + ctx->num += static_cast(in.size()); return; } } - n = len / block_size; + n = in.size() / Traits::kBlockSize; if (n > 0) { - block_func(h, in, n); - n *= block_size; - in += n; - len -= n; + Traits::HashBlocks(ctx->h, in.data(), n); + in = in.subspan(n * Traits::kBlockSize); } - if (len != 0) { - *num = (unsigned)len; - OPENSSL_memcpy(data, in, len); + if (!in.empty()) { + ctx->num = static_cast(in.size()); + OPENSSL_memcpy(ctx->data, in.data(), in.size()); } } // crypto_md32_final incorporates the partial block and trailing length into the -// digest state |h|. The trailing length is encoded in little-endian if +// digest state in |ctx|. The trailing length is encoded in little-endian if // |is_big_endian| is zero and big-endian otherwise. |data| must be a buffer of // length |block_size| with the first |*num| bytes containing a partial block. // |Nh| and |Nl| contain the total number of bits processed. On return, this @@ -154,42 +129,38 @@ static inline void crypto_md32_update(crypto_md32_block_func block_func, // // This function does not serialize |h| into a final digest. 
This is the // responsibility of the caller. -static inline void crypto_md32_final(crypto_md32_block_func block_func, - uint32_t *h, uint8_t *data, - size_t block_size, unsigned *num, - uint32_t Nh, uint32_t Nl, - int is_big_endian) { +template +inline void crypto_md32_final(typename Traits::HashContext *ctx) { + static_assert(Traits::kBlockSize == sizeof(ctx->data), "block size is wrong"); // |data| always has room for at least one byte. A full block would have // been consumed. - size_t n = *num; - assert(n < block_size); - data[n] = 0x80; + size_t n = ctx->num; + assert(n < Traits::kBlockSize); + ctx->data[n] = 0x80; n++; // Fill the block with zeros if there isn't room for a 64-bit length. - if (n > block_size - 8) { - OPENSSL_memset(data + n, 0, block_size - n); + if (n > Traits::kBlockSize - 8) { + OPENSSL_memset(ctx->data + n, 0, Traits::kBlockSize - n); n = 0; - block_func(h, data, 1); + Traits::HashBlocks(ctx->h, ctx->data, 1); } - OPENSSL_memset(data + n, 0, block_size - 8 - n); + OPENSSL_memset(ctx->data + n, 0, Traits::kBlockSize - 8 - n); // Append a 64-bit length to the block and process it. 
- if (is_big_endian) { - CRYPTO_store_u32_be(data + block_size - 8, Nh); - CRYPTO_store_u32_be(data + block_size - 4, Nl); + if constexpr (Traits::kLengthIsBigEndian) { + CRYPTO_store_u32_be(ctx->data + Traits::kBlockSize - 8, ctx->Nh); + CRYPTO_store_u32_be(ctx->data + Traits::kBlockSize - 4, ctx->Nl); } else { - CRYPTO_store_u32_le(data + block_size - 8, Nl); - CRYPTO_store_u32_le(data + block_size - 4, Nh); + CRYPTO_store_u32_le(ctx->data + Traits::kBlockSize - 8, ctx->Nl); + CRYPTO_store_u32_le(ctx->data + Traits::kBlockSize - 4, ctx->Nh); } - block_func(h, data, 1); - *num = 0; - OPENSSL_memset(data, 0, block_size); + Traits::HashBlocks(ctx->h, ctx->data, 1); + ctx->num = 0; + OPENSSL_memset(ctx->data, 0, Traits::kBlockSize); } -#if defined(__cplusplus) -} // extern C -#endif +BSSL_NAMESPACE_END -#endif // OPENSSL_HEADER_DIGEST_MD32_COMMON_H +#endif // OPENSSL_HEADER_CRYPTO_FIPSMODULE_DIGEST_MD32_COMMON_H diff --git a/third_party/boringssl/src/crypto/fipsmodule/digestsign/digestsign.c b/third_party/boringssl/src/crypto/fipsmodule/digestsign/digestsign.c deleted file mode 100644 index e5b6bc76..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/digestsign/digestsign.c +++ /dev/null @@ -1,267 +0,0 @@ -/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL - * project 2006. - */ -/* ==================================================================== - * Copyright (c) 2006,2007 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. 
- * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * licensing@OpenSSL.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). 
*/ - -#include - -#include - -#include "../../evp/internal.h" -#include "../delocate.h" -#include "../digest/internal.h" -#include "../service_indicator/internal.h" - - -enum evp_sign_verify_t { - evp_sign, - evp_verify, -}; - -DEFINE_LOCAL_DATA(struct evp_md_pctx_ops, md_pctx_ops) { - out->free = EVP_PKEY_CTX_free; - out->dup = EVP_PKEY_CTX_dup; -}; - -static int uses_prehash(EVP_MD_CTX *ctx, enum evp_sign_verify_t op) { - return (op == evp_sign) ? (ctx->pctx->pmeth->sign != NULL) - : (ctx->pctx->pmeth->verify != NULL); -} - -static int do_sigver_init(EVP_MD_CTX *ctx, EVP_PKEY_CTX **pctx, - const EVP_MD *type, ENGINE *e, EVP_PKEY *pkey, - enum evp_sign_verify_t op) { - if (ctx->pctx == NULL) { - ctx->pctx = EVP_PKEY_CTX_new(pkey, e); - } - if (ctx->pctx == NULL) { - return 0; - } - ctx->pctx_ops = md_pctx_ops(); - - if (op == evp_verify) { - if (!EVP_PKEY_verify_init(ctx->pctx)) { - return 0; - } - } else { - if (!EVP_PKEY_sign_init(ctx->pctx)) { - return 0; - } - } - - if (type != NULL && - !EVP_PKEY_CTX_set_signature_md(ctx->pctx, type)) { - return 0; - } - - if (uses_prehash(ctx, op)) { - if (type == NULL) { - OPENSSL_PUT_ERROR(EVP, EVP_R_NO_DEFAULT_DIGEST); - return 0; - } - if (!EVP_DigestInit_ex(ctx, type, e)) { - return 0; - } - } - - if (pctx) { - *pctx = ctx->pctx; - } - return 1; -} - -int EVP_DigestSignInit(EVP_MD_CTX *ctx, EVP_PKEY_CTX **pctx, const EVP_MD *type, - ENGINE *e, EVP_PKEY *pkey) { - return do_sigver_init(ctx, pctx, type, e, pkey, evp_sign); -} - -int EVP_DigestVerifyInit(EVP_MD_CTX *ctx, EVP_PKEY_CTX **pctx, - const EVP_MD *type, ENGINE *e, EVP_PKEY *pkey) { - return do_sigver_init(ctx, pctx, type, e, pkey, evp_verify); -} - -int EVP_DigestSignUpdate(EVP_MD_CTX *ctx, const void *data, size_t len) { - if (!uses_prehash(ctx, evp_sign)) { - OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); - return 0; - } - - return EVP_DigestUpdate(ctx, data, len); -} - -int EVP_DigestVerifyUpdate(EVP_MD_CTX *ctx, const void *data, 
size_t len) { - if (!uses_prehash(ctx, evp_verify)) { - OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); - return 0; - } - - return EVP_DigestUpdate(ctx, data, len); -} - -int EVP_DigestSignFinal(EVP_MD_CTX *ctx, uint8_t *out_sig, - size_t *out_sig_len) { - if (!uses_prehash(ctx, evp_sign)) { - OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); - return 0; - } - - if (out_sig) { - EVP_MD_CTX tmp_ctx; - int ret; - uint8_t md[EVP_MAX_MD_SIZE]; - unsigned int mdlen; - - FIPS_service_indicator_lock_state(); - EVP_MD_CTX_init(&tmp_ctx); - ret = EVP_MD_CTX_copy_ex(&tmp_ctx, ctx) && - EVP_DigestFinal_ex(&tmp_ctx, md, &mdlen) && - EVP_PKEY_sign(ctx->pctx, out_sig, out_sig_len, md, mdlen); - EVP_MD_CTX_cleanup(&tmp_ctx); - FIPS_service_indicator_unlock_state(); - - if (ret) { - EVP_DigestSign_verify_service_indicator(ctx); - } - - return ret; - } else { - size_t s = EVP_MD_size(ctx->digest); - return EVP_PKEY_sign(ctx->pctx, out_sig, out_sig_len, NULL, s); - } -} - -int EVP_DigestVerifyFinal(EVP_MD_CTX *ctx, const uint8_t *sig, - size_t sig_len) { - if (!uses_prehash(ctx, evp_verify)) { - OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); - return 0; - } - - EVP_MD_CTX tmp_ctx; - int ret; - uint8_t md[EVP_MAX_MD_SIZE]; - unsigned int mdlen; - - FIPS_service_indicator_lock_state(); - EVP_MD_CTX_init(&tmp_ctx); - ret = EVP_MD_CTX_copy_ex(&tmp_ctx, ctx) && - EVP_DigestFinal_ex(&tmp_ctx, md, &mdlen) && - EVP_PKEY_verify(ctx->pctx, sig, sig_len, md, mdlen); - FIPS_service_indicator_unlock_state(); - EVP_MD_CTX_cleanup(&tmp_ctx); - - if (ret) { - EVP_DigestVerify_verify_service_indicator(ctx); - } - - return ret; -} - -int EVP_DigestSign(EVP_MD_CTX *ctx, uint8_t *out_sig, size_t *out_sig_len, - const uint8_t *data, size_t data_len) { - FIPS_service_indicator_lock_state(); - int ret = 0; - - if (uses_prehash(ctx, evp_sign)) { - // If |out_sig| is NULL, the caller is only querying the maximum output - // length. 
|data| should only be incorporated in the final call. - if (out_sig != NULL && - !EVP_DigestSignUpdate(ctx, data, data_len)) { - goto end; - } - - ret = EVP_DigestSignFinal(ctx, out_sig, out_sig_len); - goto end; - } - - if (ctx->pctx->pmeth->sign_message == NULL) { - OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); - goto end; - } - - ret = ctx->pctx->pmeth->sign_message(ctx->pctx, out_sig, out_sig_len, data, - data_len); - -end: - FIPS_service_indicator_unlock_state(); - if (ret) { - EVP_DigestSign_verify_service_indicator(ctx); - } - return ret; -} - -int EVP_DigestVerify(EVP_MD_CTX *ctx, const uint8_t *sig, size_t sig_len, - const uint8_t *data, size_t len) { - FIPS_service_indicator_lock_state(); - int ret = 0; - - if (uses_prehash(ctx, evp_verify)) { - ret = EVP_DigestVerifyUpdate(ctx, data, len) && - EVP_DigestVerifyFinal(ctx, sig, sig_len); - goto end; - } - - if (ctx->pctx->pmeth->verify_message == NULL) { - OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); - goto end; - } - - ret = ctx->pctx->pmeth->verify_message(ctx->pctx, sig, sig_len, data, len); - -end: - FIPS_service_indicator_unlock_state(); - if (ret) { - EVP_DigestVerify_verify_service_indicator(ctx); - } - return ret; -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/digestsign/digestsign.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/digestsign/digestsign.cc.inc new file mode 100644 index 00000000..5324107b --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/digestsign/digestsign.cc.inc @@ -0,0 +1,227 @@ +// Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include "../../evp/internal.h" +#include "../delocate.h" +#include "../digest/internal.h" +#include "../service_indicator/internal.h" + + +using namespace bssl; + +enum evp_sign_verify_t { + evp_sign, + evp_verify, +}; + +DEFINE_LOCAL_DATA(struct evp_md_pctx_ops, md_pctx_ops) { + out->free = EVP_PKEY_CTX_free; + out->dup = EVP_PKEY_CTX_dup; +} + +static int uses_prehash(EVP_MD_CTX *ctx, enum evp_sign_verify_t op) { + return (op == evp_sign) ? (FromOpaque(ctx->pctx)->pmeth->sign != nullptr) + : (FromOpaque(ctx->pctx)->pmeth->verify != nullptr); +} + +static int do_sigver_init(EVP_MD_CTX *ctx, EVP_PKEY_CTX **pctx, + const EVP_MD *type, ENGINE *e, EVP_PKEY *pkey, + enum evp_sign_verify_t op) { + if (ctx->pctx == nullptr) { + ctx->pctx = EVP_PKEY_CTX_new(pkey, e); + } + if (ctx->pctx == nullptr) { + return 0; + } + ctx->pctx_ops = md_pctx_ops(); + + if (op == evp_verify) { + if (!EVP_PKEY_verify_init(ctx->pctx)) { + return 0; + } + } else { + if (!EVP_PKEY_sign_init(ctx->pctx)) { + return 0; + } + } + + if (type != nullptr && !EVP_PKEY_CTX_set_signature_md(ctx->pctx, type)) { + return 0; + } + + if (uses_prehash(ctx, op)) { + if (type == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_NO_DEFAULT_DIGEST); + return 0; + } + if (!EVP_DigestInit_ex(ctx, type, e)) { + return 0; + } + } + + if (pctx) { + *pctx = ctx->pctx; + } + return 1; +} + +int EVP_DigestSignInit(EVP_MD_CTX *ctx, EVP_PKEY_CTX **pctx, const EVP_MD *type, + ENGINE *e, EVP_PKEY *pkey) { + return do_sigver_init(ctx, pctx, type, e, pkey, evp_sign); +} + +int 
EVP_DigestVerifyInit(EVP_MD_CTX *ctx, EVP_PKEY_CTX **pctx, + const EVP_MD *type, ENGINE *e, EVP_PKEY *pkey) { + return do_sigver_init(ctx, pctx, type, e, pkey, evp_verify); +} + +int EVP_DigestSignUpdate(EVP_MD_CTX *ctx, const void *data, size_t len) { + if (!uses_prehash(ctx, evp_sign)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); + return 0; + } + + return EVP_DigestUpdate(ctx, data, len); +} + +int EVP_DigestVerifyUpdate(EVP_MD_CTX *ctx, const void *data, size_t len) { + if (!uses_prehash(ctx, evp_verify)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); + return 0; + } + + return EVP_DigestUpdate(ctx, data, len); +} + +int EVP_DigestSignFinal(EVP_MD_CTX *ctx, uint8_t *out_sig, + size_t *out_sig_len) { + if (!uses_prehash(ctx, evp_sign)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); + return 0; + } + + if (out_sig) { + EVP_MD_CTX tmp_ctx; + int ret; + uint8_t md[EVP_MAX_MD_SIZE]; + unsigned int mdlen; + + FIPS_service_indicator_lock_state(); + EVP_MD_CTX_init(&tmp_ctx); + ret = EVP_MD_CTX_copy_ex(&tmp_ctx, ctx) && + EVP_DigestFinal_ex(&tmp_ctx, md, &mdlen) && + EVP_PKEY_sign(ctx->pctx, out_sig, out_sig_len, md, mdlen); + EVP_MD_CTX_cleanup(&tmp_ctx); + FIPS_service_indicator_unlock_state(); + + if (ret) { + EVP_DigestSign_verify_service_indicator(ctx); + } + + return ret; + } else { + size_t s = EVP_MD_size(ctx->digest); + return EVP_PKEY_sign(ctx->pctx, out_sig, out_sig_len, nullptr, s); + } +} + +int EVP_DigestVerifyFinal(EVP_MD_CTX *ctx, const uint8_t *sig, + size_t sig_len) { + if (!uses_prehash(ctx, evp_verify)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); + return 0; + } + + EVP_MD_CTX tmp_ctx; + int ret; + uint8_t md[EVP_MAX_MD_SIZE]; + unsigned int mdlen; + + FIPS_service_indicator_lock_state(); + EVP_MD_CTX_init(&tmp_ctx); + ret = EVP_MD_CTX_copy_ex(&tmp_ctx, ctx) && + EVP_DigestFinal_ex(&tmp_ctx, md, &mdlen) && + 
EVP_PKEY_verify(ctx->pctx, sig, sig_len, md, mdlen); + FIPS_service_indicator_unlock_state(); + EVP_MD_CTX_cleanup(&tmp_ctx); + + if (ret) { + EVP_DigestVerify_verify_service_indicator(ctx); + } + + return ret; +} + +int EVP_DigestSign(EVP_MD_CTX *ctx, uint8_t *out_sig, size_t *out_sig_len, + const uint8_t *data, size_t data_len) { + FIPS_service_indicator_lock_state(); + int ret = 0; + + if (uses_prehash(ctx, evp_sign)) { + // If |out_sig| is NULL, the caller is only querying the maximum output + // length. |data| should only be incorporated in the final call. + if (out_sig != nullptr && !EVP_DigestSignUpdate(ctx, data, data_len)) { + goto end; + } + + ret = EVP_DigestSignFinal(ctx, out_sig, out_sig_len); + goto end; + } + + if (FromOpaque(ctx->pctx)->pmeth->sign_message == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); + goto end; + } + + ret = FromOpaque(ctx->pctx)->pmeth->sign_message( + FromOpaque(ctx->pctx), out_sig, out_sig_len, data, data_len); + +end: + FIPS_service_indicator_unlock_state(); + if (ret) { + EVP_DigestSign_verify_service_indicator(ctx); + } + return ret; +} + +int EVP_DigestVerify(EVP_MD_CTX *ctx, const uint8_t *sig, size_t sig_len, + const uint8_t *data, size_t len) { + FIPS_service_indicator_lock_state(); + int ret = 0; + + if (uses_prehash(ctx, evp_verify)) { + ret = EVP_DigestVerifyUpdate(ctx, data, len) && + EVP_DigestVerifyFinal(ctx, sig, sig_len); + goto end; + } + + if (FromOpaque(ctx->pctx)->pmeth->verify_message == nullptr) { + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); + goto end; + } + + ret = FromOpaque(ctx->pctx)->pmeth->verify_message(FromOpaque(ctx->pctx), sig, + sig_len, data, len); + +end: + FIPS_service_indicator_unlock_state(); + if (ret) { + EVP_DigestVerify_verify_service_indicator(ctx); + } + return ret; +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/ec/builtin_curves.h b/third_party/boringssl/src/crypto/fipsmodule/ec/builtin_curves.h new 
file mode 100644 index 00000000..bc5cbb47 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/ec/builtin_curves.h @@ -0,0 +1,277 @@ +// Copyright 2023 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This file is generated by make_tables.go. + +// P-224 +[[maybe_unused]] static const uint64_t kP224FieldN0 = 0xffffffffffffffff; +[[maybe_unused]] static const uint64_t kP224OrderN0 = 0xd6e242706a1fc2eb; +#if defined(OPENSSL_64_BIT) +[[maybe_unused]] static const uint64_t kP224Field[] = { + 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, + 0x00000000ffffffff}; +[[maybe_unused]] static const uint64_t kP224Order[] = { + 0x13dd29455c5c2a3d, 0xffff16a2e0b8f03e, 0xffffffffffffffff, + 0x00000000ffffffff}; +[[maybe_unused]] static const uint64_t kP224B[] = { + 0x270b39432355ffb4, 0x5044b0b7d7bfd8ba, 0x0c04b3abf5413256, + 0x00000000b4050a85}; +[[maybe_unused]] static const uint64_t kP224GX[] = { + 0x343280d6115c1d21, 0x4a03c1d356c21122, 0x6bb4bf7f321390b9, + 0x00000000b70e0cbd}; +[[maybe_unused]] static const uint64_t kP224GY[] = { + 0x44d5819985007e34, 0xcd4375a05a074764, 0xb5f723fb4c22dfe6, + 0x00000000bd376388}; +[[maybe_unused]] static const uint64_t kP224FieldR[] = { + 0xffffffff00000000, 0xffffffffffffffff, 0x0000000000000000, + 0x0000000000000000}; +[[maybe_unused]] static const uint64_t kP224FieldRR[] = { + 0xffffffff00000001, 0xffffffff00000000, 0xfffffffe00000000, + 0x00000000ffffffff}; +[[maybe_unused]] 
static const uint64_t kP224OrderRR[] = { + 0x29947a695f517d15, 0xabc8ff5931d63f4b, 0x6ad15f7cd9714856, + 0x00000000b1e97961}; +[[maybe_unused]] static const uint64_t kP224MontB[] = { + 0xe768cdf663c059cd, 0x107ac2f3ccf01310, 0x3dceba98c8528151, + 0x000000007fc02f93}; +[[maybe_unused]] static const uint64_t kP224MontGX[] = { + 0xbc9052266d0a4aea, 0x852597366018bfaa, 0x6dd3af9bf96bec05, + 0x00000000a21b5e60}; +[[maybe_unused]] static const uint64_t kP224MontGY[] = { + 0x2edca1e5eff3ede8, 0xf8cd672b05335a6b, 0xaea9c5ae03dfe878, + 0x00000000614786f1}; +#elif defined(OPENSSL_32_BIT) +[[maybe_unused]] static const uint32_t kP224Field[] = { + 0x00000001, 0x00000000, 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff, + 0xffffffff}; +[[maybe_unused]] static const uint32_t kP224Order[] = { + 0x5c5c2a3d, 0x13dd2945, 0xe0b8f03e, 0xffff16a2, 0xffffffff, 0xffffffff, + 0xffffffff}; +[[maybe_unused]] static const uint32_t kP224B[] = { + 0x2355ffb4, 0x270b3943, 0xd7bfd8ba, 0x5044b0b7, 0xf5413256, 0x0c04b3ab, + 0xb4050a85}; +[[maybe_unused]] static const uint32_t kP224GX[] = { + 0x115c1d21, 0x343280d6, 0x56c21122, 0x4a03c1d3, 0x321390b9, 0x6bb4bf7f, + 0xb70e0cbd}; +[[maybe_unused]] static const uint32_t kP224GY[] = { + 0x85007e34, 0x44d58199, 0x5a074764, 0xcd4375a0, 0x4c22dfe6, 0xb5f723fb, + 0xbd376388}; +[[maybe_unused]] static const uint32_t kP224FieldR[] = { + 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, + 0x00000000}; +[[maybe_unused]] static const uint32_t kP224FieldRR[] = { + 0x00000001, 0x00000000, 0x00000000, 0xfffffffe, 0xffffffff, 0xffffffff, + 0x00000000}; +[[maybe_unused]] static const uint32_t kP224OrderRR[] = { + 0x3ad01289, 0x6bdaae6c, 0x97a54552, 0x6ad09d91, 0xb1e97961, 0x1822bc47, + 0xd4baa4cf}; +[[maybe_unused]] static const uint32_t kP224MontB[] = { + 0xe768cdf7, 0xccf01310, 0x743b1cc0, 0xc8528150, 0x3dceba98, 0x7fc02f93, + 0x9c3fa633}; +[[maybe_unused]] static const uint32_t kP224MontGX[] = { + 0xbc905227, 0x6018bfaa, 0xf22fe220, 
0xf96bec04, 0x6dd3af9b, 0xa21b5e60, + 0x92f5b516}; +[[maybe_unused]] static const uint32_t kP224MontGY[] = { + 0x2edca1e6, 0x05335a6b, 0xe8c15513, 0x03dfe878, 0xaea9c5ae, 0x614786f1, + 0x100c1218}; +#else +#error "unknown word size" +#endif + +// P-256 +[[maybe_unused]] static const uint64_t kP256FieldN0 = 0x0000000000000001; +[[maybe_unused]] static const uint64_t kP256OrderN0 = 0xccd1c8aaee00bc4f; +#if defined(OPENSSL_64_BIT) +[[maybe_unused]] static const uint64_t kP256Field[] = { + 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, + 0xffffffff00000001}; +[[maybe_unused]] static const uint64_t kP256Order[] = { + 0xf3b9cac2fc632551, 0xbce6faada7179e84, 0xffffffffffffffff, + 0xffffffff00000000}; +[[maybe_unused]] static const uint64_t kP256FieldR[] = { + 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, + 0x00000000fffffffe}; +[[maybe_unused]] static const uint64_t kP256FieldRR[] = { + 0x0000000000000003, 0xfffffffbffffffff, 0xfffffffffffffffe, + 0x00000004fffffffd}; +[[maybe_unused]] static const uint64_t kP256OrderRR[] = { + 0x83244c95be79eea2, 0x4699799c49bd6fa6, 0x2845b2392b6bec59, + 0x66e12d94f3d95620}; +[[maybe_unused]] static const uint64_t kP256MontB[] = { + 0xd89cdf6229c4bddf, 0xacf005cd78843090, 0xe5a220abf7212ed6, + 0xdc30061d04874834}; +[[maybe_unused]] static const uint64_t kP256MontGX[] = { + 0x79e730d418a9143c, 0x75ba95fc5fedb601, 0x79fb732b77622510, + 0x18905f76a53755c6}; +[[maybe_unused]] static const uint64_t kP256MontGY[] = { + 0xddf25357ce95560a, 0x8b4ab8e4ba19e45c, 0xd2e88688dd21f325, + 0x8571ff1825885d85}; +#elif defined(OPENSSL_32_BIT) +[[maybe_unused]] static const uint32_t kP256Field[] = { + 0xffffffff, 0xffffffff, 0xffffffff, 0x00000000, 0x00000000, 0x00000000, + 0x00000001, 0xffffffff}; +[[maybe_unused]] static const uint32_t kP256Order[] = { + 0xfc632551, 0xf3b9cac2, 0xa7179e84, 0xbce6faad, 0xffffffff, 0xffffffff, + 0x00000000, 0xffffffff}; +[[maybe_unused]] static const uint32_t kP256FieldR[] = { + 0x00000001, 
0x00000000, 0x00000000, 0xffffffff, 0xffffffff, 0xffffffff, + 0xfffffffe, 0x00000000}; +[[maybe_unused]] static const uint32_t kP256FieldRR[] = { + 0x00000003, 0x00000000, 0xffffffff, 0xfffffffb, 0xfffffffe, 0xffffffff, + 0xfffffffd, 0x00000004}; +[[maybe_unused]] static const uint32_t kP256OrderRR[] = { + 0xbe79eea2, 0x83244c95, 0x49bd6fa6, 0x4699799c, 0x2b6bec59, 0x2845b239, + 0xf3d95620, 0x66e12d94}; +[[maybe_unused]] static const uint32_t kP256MontB[] = { + 0x29c4bddf, 0xd89cdf62, 0x78843090, 0xacf005cd, 0xf7212ed6, 0xe5a220ab, + 0x04874834, 0xdc30061d}; +[[maybe_unused]] static const uint32_t kP256MontGX[] = { + 0x18a9143c, 0x79e730d4, 0x5fedb601, 0x75ba95fc, 0x77622510, 0x79fb732b, + 0xa53755c6, 0x18905f76}; +[[maybe_unused]] static const uint32_t kP256MontGY[] = { + 0xce95560a, 0xddf25357, 0xba19e45c, 0x8b4ab8e4, 0xdd21f325, 0xd2e88688, + 0x25885d85, 0x8571ff18}; +#else +#error "unknown word size" +#endif + +// P-384 +[[maybe_unused]] static const uint64_t kP384FieldN0 = 0x0000000100000001; +[[maybe_unused]] static const uint64_t kP384OrderN0 = 0x6ed46089e88fdc45; +#if defined(OPENSSL_64_BIT) +[[maybe_unused]] static const uint64_t kP384Field[] = { + 0x00000000ffffffff, 0xffffffff00000000, 0xfffffffffffffffe, + 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff}; +[[maybe_unused]] static const uint64_t kP384Order[] = { + 0xecec196accc52973, 0x581a0db248b0a77a, 0xc7634d81f4372ddf, + 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff}; +[[maybe_unused]] static const uint64_t kP384FieldR[] = { + 0xffffffff00000001, 0x00000000ffffffff, 0x0000000000000001, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}; +[[maybe_unused]] static const uint64_t kP384FieldRR[] = { + 0xfffffffe00000001, 0x0000000200000000, 0xfffffffe00000000, + 0x0000000200000000, 0x0000000000000001, 0x0000000000000000}; +[[maybe_unused]] static const uint64_t kP384OrderRR[] = { + 0x2d319b2419b409a9, 0xff3d81e5df1aa419, 0xbc3e483afcb82947, + 0xd40d49174aab1cc5, 
0x3fb05b7a28266895, 0x0c84ee012b39bf21}; +[[maybe_unused]] static const uint64_t kP384MontB[] = { + 0x081188719d412dcc, 0xf729add87a4c32ec, 0x77f2209b1920022e, + 0xe3374bee94938ae2, 0xb62b21f41f022094, 0xcd08114b604fbff9}; +[[maybe_unused]] static const uint64_t kP384MontGX[] = { + 0x3dd0756649c0b528, 0x20e378e2a0d6ce38, 0x879c3afc541b4d6e, + 0x6454868459a30eff, 0x812ff723614ede2b, 0x4d3aadc2299e1513}; +[[maybe_unused]] static const uint64_t kP384MontGY[] = { + 0x23043dad4b03a4fe, 0xa1bfa8bf7bb4a9ac, 0x8bade7562e83b050, + 0xc6c3521968f4ffd9, 0xdd8002263969a840, 0x2b78abc25a15c5e9}; +#elif defined(OPENSSL_32_BIT) +[[maybe_unused]] static const uint32_t kP384Field[] = { + 0xffffffff, 0x00000000, 0x00000000, 0xffffffff, 0xfffffffe, 0xffffffff, + 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff}; +[[maybe_unused]] static const uint32_t kP384Order[] = { + 0xccc52973, 0xecec196a, 0x48b0a77a, 0x581a0db2, 0xf4372ddf, 0xc7634d81, + 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff}; +[[maybe_unused]] static const uint32_t kP384FieldR[] = { + 0x00000001, 0xffffffff, 0xffffffff, 0x00000000, 0x00000001, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; +[[maybe_unused]] static const uint32_t kP384FieldRR[] = { + 0x00000001, 0xfffffffe, 0x00000000, 0x00000002, 0x00000000, 0xfffffffe, + 0x00000000, 0x00000002, 0x00000001, 0x00000000, 0x00000000, 0x00000000}; +[[maybe_unused]] static const uint32_t kP384OrderRR[] = { + 0x19b409a9, 0x2d319b24, 0xdf1aa419, 0xff3d81e5, 0xfcb82947, 0xbc3e483a, + 0x4aab1cc5, 0xd40d4917, 0x28266895, 0x3fb05b7a, 0x2b39bf21, 0x0c84ee01}; +[[maybe_unused]] static const uint32_t kP384MontB[] = { + 0x9d412dcc, 0x08118871, 0x7a4c32ec, 0xf729add8, 0x1920022e, 0x77f2209b, + 0x94938ae2, 0xe3374bee, 0x1f022094, 0xb62b21f4, 0x604fbff9, 0xcd08114b}; +[[maybe_unused]] static const uint32_t kP384MontGX[] = { + 0x49c0b528, 0x3dd07566, 0xa0d6ce38, 0x20e378e2, 0x541b4d6e, 0x879c3afc, + 
0x59a30eff, 0x64548684, 0x614ede2b, 0x812ff723, 0x299e1513, 0x4d3aadc2}; +[[maybe_unused]] static const uint32_t kP384MontGY[] = { + 0x4b03a4fe, 0x23043dad, 0x7bb4a9ac, 0xa1bfa8bf, 0x2e83b050, 0x8bade756, + 0x68f4ffd9, 0xc6c35219, 0x3969a840, 0xdd800226, 0x5a15c5e9, 0x2b78abc2}; +#else +#error "unknown word size" +#endif + +// P-521 +[[maybe_unused]] static const uint64_t kP521FieldN0 = 0x0000000000000001; +[[maybe_unused]] static const uint64_t kP521OrderN0 = 0x1d2f5ccd79a995c7; +#if defined(OPENSSL_64_BIT) +[[maybe_unused]] static const uint64_t kP521Field[] = { + 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, + 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, + 0xffffffffffffffff, 0xffffffffffffffff, 0x00000000000001ff}; +[[maybe_unused]] static const uint64_t kP521Order[] = { + 0xbb6fb71e91386409, 0x3bb5c9b8899c47ae, 0x7fcc0148f709a5d0, + 0x51868783bf2f966b, 0xfffffffffffffffa, 0xffffffffffffffff, + 0xffffffffffffffff, 0xffffffffffffffff, 0x00000000000001ff}; +[[maybe_unused]] static const uint64_t kP521FieldR[] = { + 0x0080000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}; +[[maybe_unused]] static const uint64_t kP521FieldRR[] = { + 0x0000000000000000, 0x0000400000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000}; +[[maybe_unused]] static const uint64_t kP521OrderRR[] = { + 0x137cd04dcf15dd04, 0xf707badce5547ea3, 0x12a78d38794573ff, + 0xd3721ef557f75e06, 0xdd6e23d82e49c7db, 0xcff3d142b7756e3e, + 0x5bcc6d61a8e567bc, 0x2d8e03d1492d0d45, 0x000000000000003d}; +[[maybe_unused]] static const uint64_t kP521MontB[] = { + 0x8014654fae586387, 0x78f7a28fea35a81f, 0x839ab9efc41e961a, + 0xbd8b29605e9dd8df, 0xf0ab0c9ca8f63f49, 0xf9dc5a44c8c77884, + 0x77516d392dccd98a, 0x0fc94d10d05b42a0, 0x000000000000004d}; 
+[[maybe_unused]] static const uint64_t kP521MontGX[] = { + 0xb331a16381adc101, 0x4dfcbf3f18e172de, 0x6f19a459e0c2b521, + 0x947f0ee093d17fd4, 0xdd50a5af3bf7f3ac, 0x90fc1457b035a69e, + 0x214e32409c829fda, 0xe6cf1f65b311cada, 0x0000000000000074}; +[[maybe_unused]] static const uint64_t kP521MontGY[] = { + 0x28460e4a5a9e268e, 0x20445f4a3b4fe8b3, 0xb09a9e3843513961, + 0x2062a85c809fd683, 0x164bf7394caf7a13, 0x340bd7de8b939f33, + 0xeccc7aa224abcda2, 0x022e452fda163e8d, 0x00000000000001e0}; +#elif defined(OPENSSL_32_BIT) +[[maybe_unused]] static const uint32_t kP521Field[] = { + 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, + 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, + 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x000001ff}; +[[maybe_unused]] static const uint32_t kP521Order[] = { + 0x91386409, 0xbb6fb71e, 0x899c47ae, 0x3bb5c9b8, 0xf709a5d0, 0x7fcc0148, + 0xbf2f966b, 0x51868783, 0xfffffffa, 0xffffffff, 0xffffffff, 0xffffffff, + 0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff, 0x000001ff}; +[[maybe_unused]] static const uint32_t kP521FieldR[] = { + 0x00800000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; +[[maybe_unused]] static const uint32_t kP521FieldRR[] = { + 0x00000000, 0x00004000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, + 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000}; +[[maybe_unused]] static const uint32_t kP521OrderRR[] = { + 0x61c64ca7, 0x1163115a, 0x4374a642, 0x18354a56, 0x0791d9dc, 0x5d4dd6d3, + 0xd3402705, 0x4fb35b72, 0xb7756e3a, 0xcff3d142, 0xa8e567bc, 0x5bcc6d61, + 0x492d0d45, 0x2d8e03d1, 0x8c44383d, 0x5b5a3afe, 0x0000019a}; +[[maybe_unused]] static const uint32_t kP521MontB[] = { + 0x8014654f, 0xea35a81f, 0x78f7a28f, 0xc41e961a, 0x839ab9ef, 0x5e9dd8df, + 
0xbd8b2960, 0xa8f63f49, 0xf0ab0c9c, 0xc8c77884, 0xf9dc5a44, 0x2dccd98a, + 0x77516d39, 0xd05b42a0, 0x0fc94d10, 0xb0c70e4d, 0x0000015c}; +[[maybe_unused]] static const uint32_t kP521MontGX[] = { + 0xb331a163, 0x18e172de, 0x4dfcbf3f, 0xe0c2b521, 0x6f19a459, 0x93d17fd4, + 0x947f0ee0, 0x3bf7f3ac, 0xdd50a5af, 0xb035a69e, 0x90fc1457, 0x9c829fda, + 0x214e3240, 0xb311cada, 0xe6cf1f65, 0x5b820274, 0x00000103}; +[[maybe_unused]] static const uint32_t kP521MontGY[] = { + 0x28460e4a, 0x3b4fe8b3, 0x20445f4a, 0x43513961, 0xb09a9e38, 0x809fd683, + 0x2062a85c, 0x4caf7a13, 0x164bf739, 0x8b939f33, 0x340bd7de, 0x24abcda2, + 0xeccc7aa2, 0xda163e8d, 0x022e452f, 0x3c4d1de0, 0x000000b5}; +#else +#error "unknown word size" +#endif diff --git a/third_party/boringssl/src/crypto/fipsmodule/ec/ec.c b/third_party/boringssl/src/crypto/fipsmodule/ec/ec.c deleted file mode 100644 index 4e632e19..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/ec/ec.c +++ /dev/null @@ -1,1271 +0,0 @@ -/* Originally written by Bodo Moeller for the OpenSSL project. - * ==================================================================== - * Copyright (c) 1998-2005 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. 
The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). - * - */ -/* ==================================================================== - * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED. - * - * Portions of the attached software ("Contribution") are developed by - * SUN MICROSYSTEMS, INC., and are contributed to the OpenSSL project. 
- * - * The Contribution is licensed pursuant to the OpenSSL open source - * license provided above. - * - * The elliptic curve binary polynomial software is originally written by - * Sheueling Chang Shantz and Douglas Stebila of Sun Microsystems - * Laboratories. */ - -#include - -#include -#include - -#include -#include -#include -#include - -#include "internal.h" -#include "../../internal.h" -#include "../bn/internal.h" -#include "../delocate.h" - - -static void ec_point_free(EC_POINT *point, int free_group); - -static const uint8_t kP224Params[6 * 28] = { - // p = 2^224 - 2^96 + 1 - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x01, - // a - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFE, - // b - 0xB4, 0x05, 0x0A, 0x85, 0x0C, 0x04, 0xB3, 0xAB, 0xF5, 0x41, 0x32, 0x56, - 0x50, 0x44, 0xB0, 0xB7, 0xD7, 0xBF, 0xD8, 0xBA, 0x27, 0x0B, 0x39, 0x43, - 0x23, 0x55, 0xFF, 0xB4, - // x - 0xB7, 0x0E, 0x0C, 0xBD, 0x6B, 0xB4, 0xBF, 0x7F, 0x32, 0x13, 0x90, 0xB9, - 0x4A, 0x03, 0xC1, 0xD3, 0x56, 0xC2, 0x11, 0x22, 0x34, 0x32, 0x80, 0xD6, - 0x11, 0x5C, 0x1D, 0x21, - // y - 0xbd, 0x37, 0x63, 0x88, 0xb5, 0xf7, 0x23, 0xfb, 0x4c, 0x22, 0xdf, 0xe6, - 0xcd, 0x43, 0x75, 0xa0, 0x5a, 0x07, 0x47, 0x64, 0x44, 0xd5, 0x81, 0x99, - 0x85, 0x00, 0x7e, 0x34, - // order - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0x16, 0xA2, 0xE0, 0xB8, 0xF0, 0x3E, 0x13, 0xDD, 0x29, 0x45, - 0x5C, 0x5C, 0x2A, 0x3D, -}; - -static const uint8_t kP256Params[6 * 32] = { - // p = 2^256 - 2^224 + 2^192 + 2^96 - 1 - 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - // a - 0xFF, 0xFF, 0xFF, 0xFF, 
0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFC, - // b - 0x5A, 0xC6, 0x35, 0xD8, 0xAA, 0x3A, 0x93, 0xE7, 0xB3, 0xEB, 0xBD, 0x55, - 0x76, 0x98, 0x86, 0xBC, 0x65, 0x1D, 0x06, 0xB0, 0xCC, 0x53, 0xB0, 0xF6, - 0x3B, 0xCE, 0x3C, 0x3E, 0x27, 0xD2, 0x60, 0x4B, - // x - 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8, 0xBC, 0xE6, 0xE5, - 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D, 0x81, 0x2D, 0xEB, 0x33, 0xA0, - 0xF4, 0xA1, 0x39, 0x45, 0xD8, 0x98, 0xC2, 0x96, - // y - 0x4f, 0xe3, 0x42, 0xe2, 0xfe, 0x1a, 0x7f, 0x9b, 0x8e, 0xe7, 0xeb, 0x4a, - 0x7c, 0x0f, 0x9e, 0x16, 0x2b, 0xce, 0x33, 0x57, 0x6b, 0x31, 0x5e, 0xce, - 0xcb, 0xb6, 0x40, 0x68, 0x37, 0xbf, 0x51, 0xf5, - // order - 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD, 0xA7, 0x17, 0x9E, 0x84, - 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63, 0x25, 0x51, -}; - -static const uint8_t kP384Params[6 * 48] = { - // p = 2^384 - 2^128 - 2^96 + 2^32 - 1 - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF, - // a - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFC, - // b - 0xB3, 0x31, 0x2F, 0xA7, 0xE2, 0x3E, 0xE7, 0xE4, 0x98, 0x8E, 0x05, 0x6B, - 0xE3, 0xF8, 0x2D, 0x19, 0x18, 0x1D, 0x9C, 0x6E, 0xFE, 0x81, 0x41, 0x12, - 0x03, 0x14, 0x08, 0x8F, 0x50, 0x13, 0x87, 0x5A, 0xC6, 0x56, 0x39, 0x8D, - 0x8A, 0x2E, 0xD1, 0x9D, 0x2A, 0x85, 0xC8, 0xED, 0xD3, 0xEC, 0x2A, 0xEF, - // x - 
0xAA, 0x87, 0xCA, 0x22, 0xBE, 0x8B, 0x05, 0x37, 0x8E, 0xB1, 0xC7, 0x1E, - 0xF3, 0x20, 0xAD, 0x74, 0x6E, 0x1D, 0x3B, 0x62, 0x8B, 0xA7, 0x9B, 0x98, - 0x59, 0xF7, 0x41, 0xE0, 0x82, 0x54, 0x2A, 0x38, 0x55, 0x02, 0xF2, 0x5D, - 0xBF, 0x55, 0x29, 0x6C, 0x3A, 0x54, 0x5E, 0x38, 0x72, 0x76, 0x0A, 0xB7, - // y - 0x36, 0x17, 0xde, 0x4a, 0x96, 0x26, 0x2c, 0x6f, 0x5d, 0x9e, 0x98, 0xbf, - 0x92, 0x92, 0xdc, 0x29, 0xf8, 0xf4, 0x1d, 0xbd, 0x28, 0x9a, 0x14, 0x7c, - 0xe9, 0xda, 0x31, 0x13, 0xb5, 0xf0, 0xb8, 0xc0, 0x0a, 0x60, 0xb1, 0xce, - 0x1d, 0x7e, 0x81, 0x9d, 0x7a, 0x43, 0x1d, 0x7c, 0x90, 0xea, 0x0e, 0x5f, - // order - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xC7, 0x63, 0x4D, 0x81, 0xF4, 0x37, 0x2D, 0xDF, 0x58, 0x1A, 0x0D, 0xB2, - 0x48, 0xB0, 0xA7, 0x7A, 0xEC, 0xEC, 0x19, 0x6A, 0xCC, 0xC5, 0x29, 0x73, -}; - -static const uint8_t kP521Params[6 * 66] = { - // p = 2^521 - 1 - 0x01, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - // a - 0x01, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFC, - // b - 0x00, 0x51, 0x95, 0x3E, 0xB9, 0x61, 0x8E, 0x1C, 0x9A, 0x1F, 0x92, 0x9A, - 0x21, 0xA0, 0xB6, 0x85, 0x40, 0xEE, 0xA2, 0xDA, 0x72, 0x5B, 0x99, 0xB3, - 0x15, 0xF3, 0xB8, 0xB4, 0x89, 0x91, 0x8E, 
0xF1, 0x09, 0xE1, 0x56, 0x19, - 0x39, 0x51, 0xEC, 0x7E, 0x93, 0x7B, 0x16, 0x52, 0xC0, 0xBD, 0x3B, 0xB1, - 0xBF, 0x07, 0x35, 0x73, 0xDF, 0x88, 0x3D, 0x2C, 0x34, 0xF1, 0xEF, 0x45, - 0x1F, 0xD4, 0x6B, 0x50, 0x3F, 0x00, - // x - 0x00, 0xC6, 0x85, 0x8E, 0x06, 0xB7, 0x04, 0x04, 0xE9, 0xCD, 0x9E, 0x3E, - 0xCB, 0x66, 0x23, 0x95, 0xB4, 0x42, 0x9C, 0x64, 0x81, 0x39, 0x05, 0x3F, - 0xB5, 0x21, 0xF8, 0x28, 0xAF, 0x60, 0x6B, 0x4D, 0x3D, 0xBA, 0xA1, 0x4B, - 0x5E, 0x77, 0xEF, 0xE7, 0x59, 0x28, 0xFE, 0x1D, 0xC1, 0x27, 0xA2, 0xFF, - 0xA8, 0xDE, 0x33, 0x48, 0xB3, 0xC1, 0x85, 0x6A, 0x42, 0x9B, 0xF9, 0x7E, - 0x7E, 0x31, 0xC2, 0xE5, 0xBD, 0x66, - // y - 0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, 0xc0, 0x04, 0x5c, 0x8a, - 0x5f, 0xb4, 0x2c, 0x7d, 0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b, - 0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e, 0x66, 0x2c, 0x97, 0xee, - 0x72, 0x99, 0x5e, 0xf4, 0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad, - 0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72, 0xc2, 0x40, 0x88, 0xbe, - 0x94, 0x76, 0x9f, 0xd1, 0x66, 0x50, - // order - 0x01, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, - 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFA, 0x51, 0x86, - 0x87, 0x83, 0xBF, 0x2F, 0x96, 0x6B, 0x7F, 0xCC, 0x01, 0x48, 0xF7, 0x09, - 0xA5, 0xD0, 0x3B, 0xB5, 0xC9, 0xB8, 0x89, 0x9C, 0x47, 0xAE, 0xBB, 0x6F, - 0xB7, 0x1E, 0x91, 0x38, 0x64, 0x09, -}; - -DEFINE_METHOD_FUNCTION(struct built_in_curves, OPENSSL_built_in_curves) { - // 1.3.132.0.35 - static const uint8_t kOIDP521[] = {0x2b, 0x81, 0x04, 0x00, 0x23}; - out->curves[0].nid = NID_secp521r1; - out->curves[0].oid = kOIDP521; - out->curves[0].oid_len = sizeof(kOIDP521); - out->curves[0].comment = "NIST P-521"; - out->curves[0].param_len = 66; - out->curves[0].params = kP521Params; - out->curves[0].method = EC_GFp_mont_method(); - - // 1.3.132.0.34 - static const uint8_t kOIDP384[] = {0x2b, 0x81, 0x04, 0x00, 0x22}; - 
out->curves[1].nid = NID_secp384r1; - out->curves[1].oid = kOIDP384; - out->curves[1].oid_len = sizeof(kOIDP384); - out->curves[1].comment = "NIST P-384"; - out->curves[1].param_len = 48; - out->curves[1].params = kP384Params; - out->curves[1].method = EC_GFp_mont_method(); - - // 1.2.840.10045.3.1.7 - static const uint8_t kOIDP256[] = {0x2a, 0x86, 0x48, 0xce, - 0x3d, 0x03, 0x01, 0x07}; - out->curves[2].nid = NID_X9_62_prime256v1; - out->curves[2].oid = kOIDP256; - out->curves[2].oid_len = sizeof(kOIDP256); - out->curves[2].comment = "NIST P-256"; - out->curves[2].param_len = 32; - out->curves[2].params = kP256Params; - out->curves[2].method = -#if !defined(OPENSSL_NO_ASM) && \ - (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) && \ - !defined(OPENSSL_SMALL) - EC_GFp_nistz256_method(); -#else - EC_GFp_nistp256_method(); -#endif - - // 1.3.132.0.33 - static const uint8_t kOIDP224[] = {0x2b, 0x81, 0x04, 0x00, 0x21}; - out->curves[3].nid = NID_secp224r1; - out->curves[3].oid = kOIDP224; - out->curves[3].oid_len = sizeof(kOIDP224); - out->curves[3].comment = "NIST P-224"; - out->curves[3].param_len = 28; - out->curves[3].params = kP224Params; - out->curves[3].method = -#if defined(BORINGSSL_HAS_UINT128) && !defined(OPENSSL_SMALL) - EC_GFp_nistp224_method(); -#else - EC_GFp_mont_method(); -#endif -} - -EC_GROUP *ec_group_new(const EC_METHOD *meth) { - EC_GROUP *ret; - - if (meth == NULL) { - OPENSSL_PUT_ERROR(EC, EC_R_SLOT_FULL); - return NULL; - } - - if (meth->group_init == 0) { - OPENSSL_PUT_ERROR(EC, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); - return NULL; - } - - ret = OPENSSL_malloc(sizeof(EC_GROUP)); - if (ret == NULL) { - OPENSSL_PUT_ERROR(EC, ERR_R_MALLOC_FAILURE); - return NULL; - } - OPENSSL_memset(ret, 0, sizeof(EC_GROUP)); - - ret->references = 1; - ret->meth = meth; - BN_init(&ret->order); - - if (!meth->group_init(ret)) { - OPENSSL_free(ret); - return NULL; - } - - return ret; -} - -static int ec_group_set_generator(EC_GROUP *group, const EC_AFFINE 
*generator, - const BIGNUM *order) { - assert(group->generator == NULL); - - if (!BN_copy(&group->order, order)) { - return 0; - } - // Store the order in minimal form, so it can be used with |BN_ULONG| arrays. - bn_set_minimal_width(&group->order); - - BN_MONT_CTX_free(group->order_mont); - group->order_mont = BN_MONT_CTX_new_for_modulus(&group->order, NULL); - if (group->order_mont == NULL) { - return 0; - } - - group->field_greater_than_order = BN_cmp(&group->field, order) > 0; - if (group->field_greater_than_order) { - BIGNUM tmp; - BN_init(&tmp); - int ok = - BN_sub(&tmp, &group->field, order) && - bn_copy_words(group->field_minus_order.words, group->field.width, &tmp); - BN_free(&tmp); - if (!ok) { - return 0; - } - } - - group->generator = EC_POINT_new(group); - if (group->generator == NULL) { - return 0; - } - ec_affine_to_jacobian(group, &group->generator->raw, generator); - assert(ec_felem_equal(group, &group->one, &group->generator->raw.Z)); - - // Avoid a reference cycle. |group->generator| does not maintain an owning - // pointer to |group|. - int is_zero = CRYPTO_refcount_dec_and_test_zero(&group->references); - - assert(!is_zero); - (void)is_zero; - return 1; -} - -EC_GROUP *EC_GROUP_new_curve_GFp(const BIGNUM *p, const BIGNUM *a, - const BIGNUM *b, BN_CTX *ctx) { - if (BN_num_bytes(p) > EC_MAX_BYTES) { - OPENSSL_PUT_ERROR(EC, EC_R_INVALID_FIELD); - return NULL; - } - - BN_CTX *new_ctx = NULL; - if (ctx == NULL) { - ctx = new_ctx = BN_CTX_new(); - if (ctx == NULL) { - return NULL; - } - } - - // Historically, |a| and |b| were not required to be fully reduced. - // TODO(davidben): Can this be removed? 
- EC_GROUP *ret = NULL; - BN_CTX_start(ctx); - BIGNUM *a_reduced = BN_CTX_get(ctx); - BIGNUM *b_reduced = BN_CTX_get(ctx); - if (a_reduced == NULL || b_reduced == NULL || - !BN_nnmod(a_reduced, a, p, ctx) || - !BN_nnmod(b_reduced, b, p, ctx)) { - goto err; - } - - ret = ec_group_new(EC_GFp_mont_method()); - if (ret == NULL || - !ret->meth->group_set_curve(ret, p, a_reduced, b_reduced, ctx)) { - EC_GROUP_free(ret); - ret = NULL; - goto err; - } - -err: - BN_CTX_end(ctx); - BN_CTX_free(new_ctx); - return ret; -} - -int EC_GROUP_set_generator(EC_GROUP *group, const EC_POINT *generator, - const BIGNUM *order, const BIGNUM *cofactor) { - if (group->curve_name != NID_undef || group->generator != NULL || - generator->group != group) { - // |EC_GROUP_set_generator| may only be used with |EC_GROUP|s returned by - // |EC_GROUP_new_curve_GFp| and may only used once on each group. - // |generator| must have been created from |EC_GROUP_new_curve_GFp|, not a - // copy, so that |generator->group->generator| is set correctly. - OPENSSL_PUT_ERROR(EC, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); - return 0; - } - - if (BN_num_bytes(order) > EC_MAX_BYTES) { - OPENSSL_PUT_ERROR(EC, EC_R_INVALID_GROUP_ORDER); - return 0; - } - - // Require a cofactor of one for custom curves, which implies prime order. - if (!BN_is_one(cofactor)) { - OPENSSL_PUT_ERROR(EC, EC_R_INVALID_COFACTOR); - return 0; - } - - // Require that p < 2×order. This simplifies some ECDSA operations. - // - // Note any curve which did not satisfy this must have been invalid or use a - // tiny prime (less than 17). See the proof in |field_element_to_scalar| in - // the ECDSA implementation. 
- int ret = 0; - BIGNUM *tmp = BN_new(); - if (tmp == NULL || - !BN_lshift1(tmp, order)) { - goto err; - } - if (BN_cmp(tmp, &group->field) <= 0) { - OPENSSL_PUT_ERROR(EC, EC_R_INVALID_GROUP_ORDER); - goto err; - } - - EC_AFFINE affine; - if (!ec_jacobian_to_affine(group, &affine, &generator->raw) || - !ec_group_set_generator(group, &affine, order)) { - goto err; - } - - ret = 1; - -err: - BN_free(tmp); - return ret; -} - -static EC_GROUP *ec_group_new_from_data(const struct built_in_curve *curve) { - EC_GROUP *group = NULL; - BIGNUM *p = NULL, *a = NULL, *b = NULL, *order = NULL; - int ok = 0; - - BN_CTX *ctx = BN_CTX_new(); - if (ctx == NULL) { - OPENSSL_PUT_ERROR(EC, ERR_R_MALLOC_FAILURE); - goto err; - } - - const unsigned param_len = curve->param_len; - const uint8_t *params = curve->params; - - if (!(p = BN_bin2bn(params + 0 * param_len, param_len, NULL)) || - !(a = BN_bin2bn(params + 1 * param_len, param_len, NULL)) || - !(b = BN_bin2bn(params + 2 * param_len, param_len, NULL)) || - !(order = BN_bin2bn(params + 5 * param_len, param_len, NULL))) { - OPENSSL_PUT_ERROR(EC, ERR_R_BN_LIB); - goto err; - } - - group = ec_group_new(curve->method); - if (group == NULL || - !group->meth->group_set_curve(group, p, a, b, ctx)) { - OPENSSL_PUT_ERROR(EC, ERR_R_EC_LIB); - goto err; - } - - EC_AFFINE G; - EC_FELEM x, y; - if (!ec_felem_from_bytes(group, &x, params + 3 * param_len, param_len) || - !ec_felem_from_bytes(group, &y, params + 4 * param_len, param_len) || - !ec_point_set_affine_coordinates(group, &G, &x, &y)) { - goto err; - } - - if (!ec_group_set_generator(group, &G, order)) { - goto err; - } - - ok = 1; - -err: - if (!ok) { - EC_GROUP_free(group); - group = NULL; - } - BN_CTX_free(ctx); - BN_free(p); - BN_free(a); - BN_free(b); - BN_free(order); - return group; -} - -// Built-in groups are allocated lazily and static once allocated. -// TODO(davidben): Make these actually static. https://crbug.com/boringssl/20. 
-struct built_in_groups_st { - EC_GROUP *groups[OPENSSL_NUM_BUILT_IN_CURVES]; -}; -DEFINE_BSS_GET(struct built_in_groups_st, built_in_groups) -DEFINE_STATIC_MUTEX(built_in_groups_lock) - -EC_GROUP *EC_GROUP_new_by_curve_name(int nid) { - struct built_in_groups_st *groups = built_in_groups_bss_get(); - EC_GROUP **group_ptr = NULL; - const struct built_in_curves *const curves = OPENSSL_built_in_curves(); - const struct built_in_curve *curve = NULL; - for (size_t i = 0; i < OPENSSL_NUM_BUILT_IN_CURVES; i++) { - if (curves->curves[i].nid == nid) { - curve = &curves->curves[i]; - group_ptr = &groups->groups[i]; - break; - } - } - - if (curve == NULL) { - OPENSSL_PUT_ERROR(EC, EC_R_UNKNOWN_GROUP); - return NULL; - } - - CRYPTO_STATIC_MUTEX_lock_read(built_in_groups_lock_bss_get()); - EC_GROUP *ret = *group_ptr; - CRYPTO_STATIC_MUTEX_unlock_read(built_in_groups_lock_bss_get()); - if (ret != NULL) { - return ret; - } - - ret = ec_group_new_from_data(curve); - if (ret == NULL) { - return NULL; - } - - EC_GROUP *to_free = NULL; - CRYPTO_STATIC_MUTEX_lock_write(built_in_groups_lock_bss_get()); - if (*group_ptr == NULL) { - *group_ptr = ret; - // Filling in |ret->curve_name| makes |EC_GROUP_free| and |EC_GROUP_dup| - // into no-ops. At this point, |ret| is considered static. - ret->curve_name = nid; - } else { - to_free = ret; - ret = *group_ptr; - } - CRYPTO_STATIC_MUTEX_unlock_write(built_in_groups_lock_bss_get()); - - EC_GROUP_free(to_free); - return ret; -} - -void EC_GROUP_free(EC_GROUP *group) { - if (group == NULL || - // Built-in curves are static. 
- group->curve_name != NID_undef || - !CRYPTO_refcount_dec_and_test_zero(&group->references)) { - return; - } - - if (group->meth->group_finish != NULL) { - group->meth->group_finish(group); - } - - ec_point_free(group->generator, 0 /* don't free group */); - BN_free(&group->order); - BN_MONT_CTX_free(group->order_mont); - - OPENSSL_free(group); -} - -EC_GROUP *EC_GROUP_dup(const EC_GROUP *a) { - if (a == NULL || - // Built-in curves are static. - a->curve_name != NID_undef) { - return (EC_GROUP *)a; - } - - // Groups are logically immutable (but for |EC_GROUP_set_generator| which must - // be called early on), so we simply take a reference. - EC_GROUP *group = (EC_GROUP *)a; - CRYPTO_refcount_inc(&group->references); - return group; -} - -int EC_GROUP_cmp(const EC_GROUP *a, const EC_GROUP *b, BN_CTX *ignored) { - // Note this function returns 0 if equal and non-zero otherwise. - if (a == b) { - return 0; - } - if (a->curve_name != b->curve_name) { - return 1; - } - if (a->curve_name != NID_undef) { - // Built-in curves may be compared by curve name alone. - return 0; - } - - // |a| and |b| are both custom curves. We compare the entire curve - // structure. If |a| or |b| is incomplete (due to legacy OpenSSL mistakes, - // custom curve construction is sadly done in two parts) but otherwise not the - // same object, we consider them always unequal. 
- return a->meth != b->meth || - a->generator == NULL || - b->generator == NULL || - BN_cmp(&a->order, &b->order) != 0 || - BN_cmp(&a->field, &b->field) != 0 || - !ec_felem_equal(a, &a->a, &b->a) || - !ec_felem_equal(a, &a->b, &b->b) || - !ec_GFp_simple_points_equal(a, &a->generator->raw, &b->generator->raw); -} - -const EC_POINT *EC_GROUP_get0_generator(const EC_GROUP *group) { - return group->generator; -} - -const BIGNUM *EC_GROUP_get0_order(const EC_GROUP *group) { - assert(!BN_is_zero(&group->order)); - return &group->order; -} - -int EC_GROUP_get_order(const EC_GROUP *group, BIGNUM *order, BN_CTX *ctx) { - if (BN_copy(order, EC_GROUP_get0_order(group)) == NULL) { - return 0; - } - return 1; -} - -int EC_GROUP_order_bits(const EC_GROUP *group) { - return BN_num_bits(&group->order); -} - -int EC_GROUP_get_cofactor(const EC_GROUP *group, BIGNUM *cofactor, - BN_CTX *ctx) { - // All |EC_GROUP|s have cofactor 1. - return BN_set_word(cofactor, 1); -} - -int EC_GROUP_get_curve_GFp(const EC_GROUP *group, BIGNUM *out_p, BIGNUM *out_a, - BIGNUM *out_b, BN_CTX *ctx) { - return ec_GFp_simple_group_get_curve(group, out_p, out_a, out_b); -} - -int EC_GROUP_get_curve_name(const EC_GROUP *group) { return group->curve_name; } - -unsigned EC_GROUP_get_degree(const EC_GROUP *group) { - return BN_num_bits(&group->field); -} - -const char *EC_curve_nid2nist(int nid) { - switch (nid) { - case NID_secp224r1: - return "P-224"; - case NID_X9_62_prime256v1: - return "P-256"; - case NID_secp384r1: - return "P-384"; - case NID_secp521r1: - return "P-521"; - } - return NULL; -} - -int EC_curve_nist2nid(const char *name) { - if (strcmp(name, "P-224") == 0) { - return NID_secp224r1; - } - if (strcmp(name, "P-256") == 0) { - return NID_X9_62_prime256v1; - } - if (strcmp(name, "P-384") == 0) { - return NID_secp384r1; - } - if (strcmp(name, "P-521") == 0) { - return NID_secp521r1; - } - return NID_undef; -} - -EC_POINT *EC_POINT_new(const EC_GROUP *group) { - if (group == NULL) { - 
OPENSSL_PUT_ERROR(EC, ERR_R_PASSED_NULL_PARAMETER); - return NULL; - } - - EC_POINT *ret = OPENSSL_malloc(sizeof *ret); - if (ret == NULL) { - OPENSSL_PUT_ERROR(EC, ERR_R_MALLOC_FAILURE); - return NULL; - } - - ret->group = EC_GROUP_dup(group); - ec_GFp_simple_point_init(&ret->raw); - return ret; -} - -static void ec_point_free(EC_POINT *point, int free_group) { - if (!point) { - return; - } - if (free_group) { - EC_GROUP_free(point->group); - } - OPENSSL_free(point); -} - -void EC_POINT_free(EC_POINT *point) { - ec_point_free(point, 1 /* free group */); -} - -void EC_POINT_clear_free(EC_POINT *point) { EC_POINT_free(point); } - -int EC_POINT_copy(EC_POINT *dest, const EC_POINT *src) { - if (EC_GROUP_cmp(dest->group, src->group, NULL) != 0) { - OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS); - return 0; - } - if (dest == src) { - return 1; - } - ec_GFp_simple_point_copy(&dest->raw, &src->raw); - return 1; -} - -EC_POINT *EC_POINT_dup(const EC_POINT *a, const EC_GROUP *group) { - if (a == NULL) { - return NULL; - } - - EC_POINT *ret = EC_POINT_new(group); - if (ret == NULL || - !EC_POINT_copy(ret, a)) { - EC_POINT_free(ret); - return NULL; - } - - return ret; -} - -int EC_POINT_set_to_infinity(const EC_GROUP *group, EC_POINT *point) { - if (EC_GROUP_cmp(group, point->group, NULL) != 0) { - OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS); - return 0; - } - ec_GFp_simple_point_set_to_infinity(group, &point->raw); - return 1; -} - -int EC_POINT_is_at_infinity(const EC_GROUP *group, const EC_POINT *point) { - if (EC_GROUP_cmp(group, point->group, NULL) != 0) { - OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS); - return 0; - } - return ec_GFp_simple_is_at_infinity(group, &point->raw); -} - -int EC_POINT_is_on_curve(const EC_GROUP *group, const EC_POINT *point, - BN_CTX *ctx) { - if (EC_GROUP_cmp(group, point->group, NULL) != 0) { - OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS); - return 0; - } - return ec_GFp_simple_is_on_curve(group, &point->raw); -} - -int 
EC_POINT_cmp(const EC_GROUP *group, const EC_POINT *a, const EC_POINT *b, - BN_CTX *ctx) { - if (EC_GROUP_cmp(group, a->group, NULL) != 0 || - EC_GROUP_cmp(group, b->group, NULL) != 0) { - OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS); - return -1; - } - - // Note |EC_POINT_cmp| returns zero for equality and non-zero for inequality. - return ec_GFp_simple_points_equal(group, &a->raw, &b->raw) ? 0 : 1; -} - -int EC_POINT_get_affine_coordinates_GFp(const EC_GROUP *group, - const EC_POINT *point, BIGNUM *x, - BIGNUM *y, BN_CTX *ctx) { - if (group->meth->point_get_affine_coordinates == 0) { - OPENSSL_PUT_ERROR(EC, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); - return 0; - } - if (EC_GROUP_cmp(group, point->group, NULL) != 0) { - OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS); - return 0; - } - EC_FELEM x_felem, y_felem; - if (!group->meth->point_get_affine_coordinates(group, &point->raw, - x == NULL ? NULL : &x_felem, - y == NULL ? NULL : &y_felem) || - (x != NULL && !ec_felem_to_bignum(group, x, &x_felem)) || - (y != NULL && !ec_felem_to_bignum(group, y, &y_felem))) { - return 0; - } - return 1; -} - -int EC_POINT_get_affine_coordinates(const EC_GROUP *group, - const EC_POINT *point, BIGNUM *x, BIGNUM *y, - BN_CTX *ctx) { - return EC_POINT_get_affine_coordinates_GFp(group, point, x, y, ctx); -} - -void ec_affine_to_jacobian(const EC_GROUP *group, EC_RAW_POINT *out, - const EC_AFFINE *p) { - out->X = p->X; - out->Y = p->Y; - out->Z = group->one; -} - -int ec_jacobian_to_affine(const EC_GROUP *group, EC_AFFINE *out, - const EC_RAW_POINT *p) { - return group->meth->point_get_affine_coordinates(group, p, &out->X, &out->Y); -} - -int ec_jacobian_to_affine_batch(const EC_GROUP *group, EC_AFFINE *out, - const EC_RAW_POINT *in, size_t num) { - if (group->meth->jacobian_to_affine_batch == NULL) { - OPENSSL_PUT_ERROR(EC, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); - return 0; - } - return group->meth->jacobian_to_affine_batch(group, out, in, num); -} - -int 
ec_point_set_affine_coordinates(const EC_GROUP *group, EC_AFFINE *out, - const EC_FELEM *x, const EC_FELEM *y) { - void (*const felem_mul)(const EC_GROUP *, EC_FELEM *r, const EC_FELEM *a, - const EC_FELEM *b) = group->meth->felem_mul; - void (*const felem_sqr)(const EC_GROUP *, EC_FELEM *r, const EC_FELEM *a) = - group->meth->felem_sqr; - - // Check if the point is on the curve. - EC_FELEM lhs, rhs; - felem_sqr(group, &lhs, y); // lhs = y^2 - felem_sqr(group, &rhs, x); // rhs = x^2 - ec_felem_add(group, &rhs, &rhs, &group->a); // rhs = x^2 + a - felem_mul(group, &rhs, &rhs, x); // rhs = x^3 + ax - ec_felem_add(group, &rhs, &rhs, &group->b); // rhs = x^3 + ax + b - if (!ec_felem_equal(group, &lhs, &rhs)) { - OPENSSL_PUT_ERROR(EC, EC_R_POINT_IS_NOT_ON_CURVE); - // In the event of an error, defend against the caller not checking the - // return value by setting a known safe value. Note this may not be possible - // if the caller is in the process of constructing an arbitrary group and - // the generator is missing. - if (group->generator != NULL) { - assert(ec_felem_equal(group, &group->one, &group->generator->raw.Z)); - out->X = group->generator->raw.X; - out->Y = group->generator->raw.Y; - } - return 0; - } - - out->X = *x; - out->Y = *y; - return 1; -} - -int EC_POINT_set_affine_coordinates_GFp(const EC_GROUP *group, EC_POINT *point, - const BIGNUM *x, const BIGNUM *y, - BN_CTX *ctx) { - if (EC_GROUP_cmp(group, point->group, NULL) != 0) { - OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS); - return 0; - } - - if (x == NULL || y == NULL) { - OPENSSL_PUT_ERROR(EC, ERR_R_PASSED_NULL_PARAMETER); - return 0; - } - - EC_FELEM x_felem, y_felem; - EC_AFFINE affine; - if (!ec_bignum_to_felem(group, &x_felem, x) || - !ec_bignum_to_felem(group, &y_felem, y) || - !ec_point_set_affine_coordinates(group, &affine, &x_felem, &y_felem)) { - // In the event of an error, defend against the caller not checking the - // return value by setting a known safe value. 
- ec_set_to_safe_point(group, &point->raw); - return 0; - } - - ec_affine_to_jacobian(group, &point->raw, &affine); - return 1; -} - -int EC_POINT_set_affine_coordinates(const EC_GROUP *group, EC_POINT *point, - const BIGNUM *x, const BIGNUM *y, - BN_CTX *ctx) { - return EC_POINT_set_affine_coordinates_GFp(group, point, x, y, ctx); -} - -int EC_POINT_add(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a, - const EC_POINT *b, BN_CTX *ctx) { - if (EC_GROUP_cmp(group, r->group, NULL) != 0 || - EC_GROUP_cmp(group, a->group, NULL) != 0 || - EC_GROUP_cmp(group, b->group, NULL) != 0) { - OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS); - return 0; - } - group->meth->add(group, &r->raw, &a->raw, &b->raw); - return 1; -} - -int EC_POINT_dbl(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a, - BN_CTX *ctx) { - if (EC_GROUP_cmp(group, r->group, NULL) != 0 || - EC_GROUP_cmp(group, a->group, NULL) != 0) { - OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS); - return 0; - } - group->meth->dbl(group, &r->raw, &a->raw); - return 1; -} - - -int EC_POINT_invert(const EC_GROUP *group, EC_POINT *a, BN_CTX *ctx) { - if (EC_GROUP_cmp(group, a->group, NULL) != 0) { - OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS); - return 0; - } - ec_GFp_simple_invert(group, &a->raw); - return 1; -} - -static int arbitrary_bignum_to_scalar(const EC_GROUP *group, EC_SCALAR *out, - const BIGNUM *in, BN_CTX *ctx) { - if (ec_bignum_to_scalar(group, out, in)) { - return 1; - } - - ERR_clear_error(); - - // This is an unusual input, so we do not guarantee constant-time processing. 
- const BIGNUM *order = &group->order; - BN_CTX_start(ctx); - BIGNUM *tmp = BN_CTX_get(ctx); - int ok = tmp != NULL && - BN_nnmod(tmp, in, order, ctx) && - ec_bignum_to_scalar(group, out, tmp); - BN_CTX_end(ctx); - return ok; -} - -int ec_point_mul_no_self_test(const EC_GROUP *group, EC_POINT *r, - const BIGNUM *g_scalar, const EC_POINT *p, - const BIGNUM *p_scalar, BN_CTX *ctx) { - // Previously, this function set |r| to the point at infinity if there was - // nothing to multiply. But, nobody should be calling this function with - // nothing to multiply in the first place. - if ((g_scalar == NULL && p_scalar == NULL) || - (p == NULL) != (p_scalar == NULL)) { - OPENSSL_PUT_ERROR(EC, ERR_R_PASSED_NULL_PARAMETER); - return 0; - } - - if (EC_GROUP_cmp(group, r->group, NULL) != 0 || - (p != NULL && EC_GROUP_cmp(group, p->group, NULL) != 0)) { - OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS); - return 0; - } - - int ret = 0; - BN_CTX *new_ctx = NULL; - if (ctx == NULL) { - new_ctx = BN_CTX_new(); - if (new_ctx == NULL) { - goto err; - } - ctx = new_ctx; - } - - // If both |g_scalar| and |p_scalar| are non-NULL, - // |ec_point_mul_scalar_public| would share the doublings between the two - // products, which would be more efficient. However, we conservatively assume - // the caller needs a constant-time operation. (ECDSA verification does not - // use this function.) - // - // Previously, the low-level constant-time multiplication function aligned - // with this function's calling convention, but this was misleading. Curves - // which combined the two multiplications did not avoid the doubling case - // in the incomplete addition formula and were not constant-time. 
- - if (g_scalar != NULL) { - EC_SCALAR scalar; - if (!arbitrary_bignum_to_scalar(group, &scalar, g_scalar, ctx) || - !ec_point_mul_scalar_base(group, &r->raw, &scalar)) { - goto err; - } - } - - if (p_scalar != NULL) { - EC_SCALAR scalar; - EC_RAW_POINT tmp; - if (!arbitrary_bignum_to_scalar(group, &scalar, p_scalar, ctx) || - !ec_point_mul_scalar(group, &tmp, &p->raw, &scalar)) { - goto err; - } - if (g_scalar == NULL) { - OPENSSL_memcpy(&r->raw, &tmp, sizeof(EC_RAW_POINT)); - } else { - group->meth->add(group, &r->raw, &r->raw, &tmp); - } - } - - ret = 1; - -err: - BN_CTX_free(new_ctx); - return ret; -} - -int EC_POINT_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *g_scalar, - const EC_POINT *p, const BIGNUM *p_scalar, BN_CTX *ctx) { - boringssl_ensure_ecc_self_test(); - - return ec_point_mul_no_self_test(group, r, g_scalar, p, p_scalar, ctx); -} - -int ec_point_mul_scalar_public(const EC_GROUP *group, EC_RAW_POINT *r, - const EC_SCALAR *g_scalar, const EC_RAW_POINT *p, - const EC_SCALAR *p_scalar) { - if (g_scalar == NULL || p_scalar == NULL || p == NULL) { - OPENSSL_PUT_ERROR(EC, ERR_R_PASSED_NULL_PARAMETER); - return 0; - } - - if (group->meth->mul_public == NULL) { - return group->meth->mul_public_batch(group, r, g_scalar, p, p_scalar, 1); - } - - group->meth->mul_public(group, r, g_scalar, p, p_scalar); - return 1; -} - -int ec_point_mul_scalar_public_batch(const EC_GROUP *group, EC_RAW_POINT *r, - const EC_SCALAR *g_scalar, - const EC_RAW_POINT *points, - const EC_SCALAR *scalars, size_t num) { - if (group->meth->mul_public_batch == NULL) { - OPENSSL_PUT_ERROR(EC, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); - return 0; - } - - return group->meth->mul_public_batch(group, r, g_scalar, points, scalars, - num); -} - -int ec_point_mul_scalar(const EC_GROUP *group, EC_RAW_POINT *r, - const EC_RAW_POINT *p, const EC_SCALAR *scalar) { - if (p == NULL || scalar == NULL) { - OPENSSL_PUT_ERROR(EC, ERR_R_PASSED_NULL_PARAMETER); - return 0; - } - - 
group->meth->mul(group, r, p, scalar); - - // Check the result is on the curve to defend against fault attacks or bugs. - // This has negligible cost compared to the multiplication. - if (!ec_GFp_simple_is_on_curve(group, r)) { - OPENSSL_PUT_ERROR(EC, ERR_R_INTERNAL_ERROR); - return 0; - } - - return 1; -} - -int ec_point_mul_scalar_base(const EC_GROUP *group, EC_RAW_POINT *r, - const EC_SCALAR *scalar) { - if (scalar == NULL) { - OPENSSL_PUT_ERROR(EC, ERR_R_PASSED_NULL_PARAMETER); - return 0; - } - - group->meth->mul_base(group, r, scalar); - - // Check the result is on the curve to defend against fault attacks or bugs. - // This has negligible cost compared to the multiplication. - if (!ec_GFp_simple_is_on_curve(group, r)) { - OPENSSL_PUT_ERROR(EC, ERR_R_INTERNAL_ERROR); - return 0; - } - - return 1; -} - -int ec_point_mul_scalar_batch(const EC_GROUP *group, EC_RAW_POINT *r, - const EC_RAW_POINT *p0, const EC_SCALAR *scalar0, - const EC_RAW_POINT *p1, const EC_SCALAR *scalar1, - const EC_RAW_POINT *p2, - const EC_SCALAR *scalar2) { - if (group->meth->mul_batch == NULL) { - OPENSSL_PUT_ERROR(EC, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); - return 0; - } - - group->meth->mul_batch(group, r, p0, scalar0, p1, scalar1, p2, scalar2); - - // Check the result is on the curve to defend against fault attacks or bugs. - // This has negligible cost compared to the multiplication. 
- if (!ec_GFp_simple_is_on_curve(group, r)) { - OPENSSL_PUT_ERROR(EC, ERR_R_INTERNAL_ERROR); - return 0; - } - - return 1; -} - -int ec_init_precomp(const EC_GROUP *group, EC_PRECOMP *out, - const EC_RAW_POINT *p) { - if (group->meth->init_precomp == NULL) { - OPENSSL_PUT_ERROR(EC, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); - return 0; - } - - return group->meth->init_precomp(group, out, p); -} - -int ec_point_mul_scalar_precomp(const EC_GROUP *group, EC_RAW_POINT *r, - const EC_PRECOMP *p0, const EC_SCALAR *scalar0, - const EC_PRECOMP *p1, const EC_SCALAR *scalar1, - const EC_PRECOMP *p2, - const EC_SCALAR *scalar2) { - if (group->meth->mul_precomp == NULL) { - OPENSSL_PUT_ERROR(EC, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); - return 0; - } - - group->meth->mul_precomp(group, r, p0, scalar0, p1, scalar1, p2, scalar2); - - // Check the result is on the curve to defend against fault attacks or bugs. - // This has negligible cost compared to the multiplication. - if (!ec_GFp_simple_is_on_curve(group, r)) { - OPENSSL_PUT_ERROR(EC, ERR_R_INTERNAL_ERROR); - return 0; - } - - return 1; -} - -void ec_point_select(const EC_GROUP *group, EC_RAW_POINT *out, BN_ULONG mask, - const EC_RAW_POINT *a, const EC_RAW_POINT *b) { - ec_felem_select(group, &out->X, mask, &a->X, &b->X); - ec_felem_select(group, &out->Y, mask, &a->Y, &b->Y); - ec_felem_select(group, &out->Z, mask, &a->Z, &b->Z); -} - -void ec_affine_select(const EC_GROUP *group, EC_AFFINE *out, BN_ULONG mask, - const EC_AFFINE *a, const EC_AFFINE *b) { - ec_felem_select(group, &out->X, mask, &a->X, &b->X); - ec_felem_select(group, &out->Y, mask, &a->Y, &b->Y); -} - -void ec_precomp_select(const EC_GROUP *group, EC_PRECOMP *out, BN_ULONG mask, - const EC_PRECOMP *a, const EC_PRECOMP *b) { - static_assert(sizeof(out->comb) == sizeof(*out), - "out->comb does not span the entire structure"); - for (size_t i = 0; i < OPENSSL_ARRAY_SIZE(out->comb); i++) { - ec_affine_select(group, &out->comb[i], mask, &a->comb[i], &b->comb[i]); - } -} - 
-int ec_cmp_x_coordinate(const EC_GROUP *group, const EC_RAW_POINT *p, - const EC_SCALAR *r) { - return group->meth->cmp_x_coordinate(group, p, r); -} - -int ec_get_x_coordinate_as_scalar(const EC_GROUP *group, EC_SCALAR *out, - const EC_RAW_POINT *p) { - uint8_t bytes[EC_MAX_BYTES]; - size_t len; - if (!ec_get_x_coordinate_as_bytes(group, bytes, &len, sizeof(bytes), p)) { - return 0; - } - - // The x-coordinate is bounded by p, but we need a scalar, bounded by the - // order. These may not have the same size. However, we must have p < 2×order, - // assuming p is not tiny (p >= 17). - // - // Thus |bytes| will fit in |order.width + 1| words, and we can reduce by - // performing at most one subtraction. - // - // Proof: We only work with prime order curves, so the number of points on - // the curve is the order. Thus Hasse's theorem gives: - // - // |order - (p + 1)| <= 2×sqrt(p) - // p + 1 - order <= 2×sqrt(p) - // p + 1 - 2×sqrt(p) <= order - // p + 1 - 2×(p/4) < order (p/4 > sqrt(p) for p >= 17) - // p/2 < p/2 + 1 < order - // p < 2×order - // - // Additionally, one can manually check this property for built-in curves. It - // is enforced for legacy custom curves in |EC_GROUP_set_generator|. 
- const BIGNUM *order = &group->order; - BN_ULONG words[EC_MAX_WORDS + 1]; - bn_big_endian_to_words(words, order->width + 1, bytes, len); - bn_reduce_once(out->words, words, /*carry=*/words[order->width], order->d, - order->width); - return 1; -} - -int ec_get_x_coordinate_as_bytes(const EC_GROUP *group, uint8_t *out, - size_t *out_len, size_t max_out, - const EC_RAW_POINT *p) { - size_t len = BN_num_bytes(&group->field); - assert(len <= EC_MAX_BYTES); - if (max_out < len) { - OPENSSL_PUT_ERROR(EC, EC_R_BUFFER_TOO_SMALL); - return 0; - } - - EC_FELEM x; - if (!group->meth->point_get_affine_coordinates(group, p, &x, NULL)) { - return 0; - } - - ec_felem_to_bytes(group, out, out_len, &x); - *out_len = len; - return 1; -} - -void ec_set_to_safe_point(const EC_GROUP *group, EC_RAW_POINT *out) { - if (group->generator != NULL) { - ec_GFp_simple_point_copy(out, &group->generator->raw); - } else { - // The generator can be missing if the caller is in the process of - // constructing an arbitrary group. In this case, we give up and use the - // point at infinity. - ec_GFp_simple_point_set_to_infinity(group, out); - } -} - -void EC_GROUP_set_asn1_flag(EC_GROUP *group, int flag) {} - -int EC_GROUP_get_asn1_flag(const EC_GROUP *group) { - return OPENSSL_EC_NAMED_CURVE; -} - -const EC_METHOD *EC_GROUP_method_of(const EC_GROUP *group) { - // This function exists purely to give callers a way to call - // |EC_METHOD_get_field_type|. cryptography.io crashes if |EC_GROUP_method_of| - // returns NULL, so return some other garbage pointer. 
- return (const EC_METHOD *)0x12340000; -} - -int EC_METHOD_get_field_type(const EC_METHOD *meth) { - return NID_X9_62_prime_field; -} - -void EC_GROUP_set_point_conversion_form(EC_GROUP *group, - point_conversion_form_t form) { - if (form != POINT_CONVERSION_UNCOMPRESSED) { - abort(); - } -} - -size_t EC_get_builtin_curves(EC_builtin_curve *out_curves, - size_t max_num_curves) { - const struct built_in_curves *const curves = OPENSSL_built_in_curves(); - - for (size_t i = 0; i < max_num_curves && i < OPENSSL_NUM_BUILT_IN_CURVES; - i++) { - out_curves[i].comment = curves->curves[i].comment; - out_curves[i].nid = curves->curves[i].nid; - } - - return OPENSSL_NUM_BUILT_IN_CURVES; -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/ec/ec.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/ec/ec.cc.inc new file mode 100644 index 00000000..6d7507fb --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/ec/ec.cc.inc @@ -0,0 +1,970 @@ +// Copyright 2001-2016 The OpenSSL Project Authors. All Rights Reserved. +// Copyright (c) 2002, Oracle and/or its affiliates. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include + +#include + +#include +#include +#include +#include + +#include "../../internal.h" +#include "../../mem_internal.h" +#include "../bn/internal.h" +#include "../delocate.h" +#include "internal.h" + +#include "builtin_curves.h" + + +using namespace bssl; + +static void ec_point_free(EC_POINT *point, int free_group); + +static void ec_group_init_static_mont(BN_MONT_CTX *mont, size_t num_words, + const BN_ULONG *modulus, + const BN_ULONG *rr, uint64_t n0) { + bn_set_static_words(&mont->N, modulus, num_words); + bn_set_static_words(&mont->RR, rr, num_words); +#if defined(OPENSSL_64_BIT) + mont->n0[0] = n0; +#elif defined(OPENSSL_32_BIT) + mont->n0[0] = (uint32_t)n0; + mont->n0[1] = (uint32_t)(n0 >> 32); +#else +#error "unknown word length" +#endif +} + +static void ec_group_set_a_minus3(EC_GROUP *group) { + const EC_FELEM *one = ec_felem_one(group); + group->a_is_minus3 = 1; + ec_felem_neg(group, &group->a, one); + ec_felem_sub(group, &group->a, &group->a, one); + ec_felem_sub(group, &group->a, &group->a, one); +} + +DEFINE_METHOD_FUNCTION(EC_GROUP, EC_group_p224) { + out->curve_name = NID_secp224r1; + out->comment = "NIST P-224"; + static const uint8_t kOIDP224[] = {OBJ_ENC_secp224r1}; + static_assert(sizeof(kOIDP224) <= sizeof(out->oid)); + OPENSSL_memcpy(out->oid, kOIDP224, sizeof(kOIDP224)); + out->oid_len = sizeof(kOIDP224); + + ec_group_init_static_mont(&out->field, std::size(kP224Field), kP224Field, + kP224FieldRR, kP224FieldN0); + ec_group_init_static_mont(&out->order, std::size(kP224Order), kP224Order, + kP224OrderRR, kP224OrderN0); + + out->meth = EC_GFp_mont_method(); + OPENSSL_memcpy(out->generator.raw.X.words, kP224MontGX, sizeof(kP224MontGX)); + OPENSSL_memcpy(out->generator.raw.Y.words, kP224MontGY, sizeof(kP224MontGY)); + OPENSSL_memcpy(out->generator.raw.Z.words, kP224FieldR, sizeof(kP224FieldR)); + OPENSSL_memcpy(out->b.words, kP224MontB, sizeof(kP224MontB)); + out->generator.group = out; + + 
ec_group_set_a_minus3(out); + out->has_order = 1; + out->field_greater_than_order = 1; +} + +DEFINE_METHOD_FUNCTION(EC_GROUP, EC_group_p256) { + out->curve_name = NID_X9_62_prime256v1; + out->comment = "NIST P-256"; + static const uint8_t kOIDP256[] = {OBJ_ENC_X9_62_prime256v1}; + static_assert(sizeof(kOIDP256) <= sizeof(out->oid)); + OPENSSL_memcpy(out->oid, kOIDP256, sizeof(kOIDP256)); + out->oid_len = sizeof(kOIDP256); + + ec_group_init_static_mont(&out->field, std::size(kP256Field), kP256Field, + kP256FieldRR, kP256FieldN0); + ec_group_init_static_mont(&out->order, std::size(kP256Order), kP256Order, + kP256OrderRR, kP256OrderN0); + +#if !defined(OPENSSL_NO_ASM) && \ + (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) && \ + !defined(OPENSSL_SMALL) + out->meth = EC_GFp_nistz256_method(); +#else + out->meth = EC_GFp_nistp256_method(); +#endif + out->generator.group = out; + OPENSSL_memcpy(out->generator.raw.X.words, kP256MontGX, sizeof(kP256MontGX)); + OPENSSL_memcpy(out->generator.raw.Y.words, kP256MontGY, sizeof(kP256MontGY)); + OPENSSL_memcpy(out->generator.raw.Z.words, kP256FieldR, sizeof(kP256FieldR)); + OPENSSL_memcpy(out->b.words, kP256MontB, sizeof(kP256MontB)); + + ec_group_set_a_minus3(out); + out->has_order = 1; + out->field_greater_than_order = 1; +} + +DEFINE_METHOD_FUNCTION(EC_GROUP, EC_group_p384) { + out->curve_name = NID_secp384r1; + out->comment = "NIST P-384"; + static const uint8_t kOIDP384[] = {OBJ_ENC_secp384r1}; + static_assert(sizeof(kOIDP384) <= sizeof(out->oid)); + OPENSSL_memcpy(out->oid, kOIDP384, sizeof(kOIDP384)); + out->oid_len = sizeof(kOIDP384); + + ec_group_init_static_mont(&out->field, std::size(kP384Field), kP384Field, + kP384FieldRR, kP384FieldN0); + ec_group_init_static_mont(&out->order, std::size(kP384Order), kP384Order, + kP384OrderRR, kP384OrderN0); + + out->meth = EC_GFp_mont_method(); + out->generator.group = out; + OPENSSL_memcpy(out->generator.raw.X.words, kP384MontGX, sizeof(kP384MontGX)); + 
OPENSSL_memcpy(out->generator.raw.Y.words, kP384MontGY, sizeof(kP384MontGY)); + OPENSSL_memcpy(out->generator.raw.Z.words, kP384FieldR, sizeof(kP384FieldR)); + OPENSSL_memcpy(out->b.words, kP384MontB, sizeof(kP384MontB)); + + ec_group_set_a_minus3(out); + out->has_order = 1; + out->field_greater_than_order = 1; +} + +DEFINE_METHOD_FUNCTION(EC_GROUP, EC_group_p521) { + out->curve_name = NID_secp521r1; + out->comment = "NIST P-521"; + static const uint8_t kOIDP521[] = {OBJ_ENC_secp521r1}; + static_assert(sizeof(kOIDP521) <= sizeof(out->oid)); + OPENSSL_memcpy(out->oid, kOIDP521, sizeof(kOIDP521)); + out->oid_len = sizeof(kOIDP521); + + ec_group_init_static_mont(&out->field, std::size(kP521Field), kP521Field, + kP521FieldRR, kP521FieldN0); + ec_group_init_static_mont(&out->order, std::size(kP521Order), kP521Order, + kP521OrderRR, kP521OrderN0); + + out->meth = EC_GFp_mont_method(); + out->generator.group = out; + OPENSSL_memcpy(out->generator.raw.X.words, kP521MontGX, sizeof(kP521MontGX)); + OPENSSL_memcpy(out->generator.raw.Y.words, kP521MontGY, sizeof(kP521MontGY)); + OPENSSL_memcpy(out->generator.raw.Z.words, kP521FieldR, sizeof(kP521FieldR)); + OPENSSL_memcpy(out->b.words, kP521MontB, sizeof(kP521MontB)); + + ec_group_set_a_minus3(out); + out->has_order = 1; + out->field_greater_than_order = 1; +} + +EC_GROUP *EC_GROUP_new_curve_GFp(const BIGNUM *p, const BIGNUM *a, + const BIGNUM *b, BN_CTX *ctx) { + if (BN_num_bytes(p) > EC_MAX_BYTES) { + OPENSSL_PUT_ERROR(EC, EC_R_INVALID_FIELD); + return nullptr; + } + + UniquePtr new_ctx; + if (ctx == nullptr) { + new_ctx.reset(BN_CTX_new()); + if (new_ctx == nullptr) { + return nullptr; + } + ctx = new_ctx.get(); + } + + // Historically, |a| and |b| were not required to be fully reduced. + // TODO(davidben): Can this be removed? 
+ BN_CTXScope scope(ctx); + BIGNUM *a_reduced = BN_CTX_get(ctx); + BIGNUM *b_reduced = BN_CTX_get(ctx); + if (a_reduced == nullptr || b_reduced == nullptr || + !BN_nnmod(a_reduced, a, p, ctx) || // + !BN_nnmod(b_reduced, b, p, ctx)) { + return nullptr; + } + + UniquePtr ret(New(EC_GFp_mont_method())); + if (ret == nullptr) { + return nullptr; + } + if (!ec_GFp_simple_group_set_curve(ret.get(), p, a_reduced, b_reduced, ctx)) { + return nullptr; + } + + return ret.release(); +} + +int EC_GROUP_set_generator(EC_GROUP *group, const EC_POINT *generator, + const BIGNUM *order, const BIGNUM *cofactor) { + if (group->curve_name != NID_undef || group->has_order || + generator->group != group) { + // |EC_GROUP_set_generator| may only be used with |EC_GROUP|s returned by + // |EC_GROUP_new_curve_GFp| and may only used once on each group. + // |generator| must have been created from |EC_GROUP_new_curve_GFp|, not a + // copy, so that |generator->group->generator| is set correctly. + OPENSSL_PUT_ERROR(EC, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); + return 0; + } + + if (BN_num_bytes(order) > EC_MAX_BYTES) { + OPENSSL_PUT_ERROR(EC, EC_R_INVALID_GROUP_ORDER); + return 0; + } + + // Require a cofactor of one for custom curves, which implies prime order. + if (!BN_is_one(cofactor)) { + OPENSSL_PUT_ERROR(EC, EC_R_INVALID_COFACTOR); + return 0; + } + + // Require that p < 2×order. This simplifies some ECDSA operations. + // + // Note any curve which did not satisfy this must have been invalid or use a + // tiny prime (less than 17). See the proof in |field_element_to_scalar| in + // the ECDSA implementation. 
+ UniquePtr tmp(BN_new()); + if (tmp == nullptr || !BN_lshift1(tmp.get(), order)) { + return 0; + } + if (BN_cmp(tmp.get(), &group->field.N) <= 0) { + OPENSSL_PUT_ERROR(EC, EC_R_INVALID_GROUP_ORDER); + return 0; + } + + EC_AFFINE affine; + if (!ec_jacobian_to_affine(group, &affine, &generator->raw) || + !BN_MONT_CTX_set(&group->order, order, nullptr)) { + return 0; + } + + group->field_greater_than_order = BN_cmp(&group->field.N, order) > 0; + group->generator.raw.X = affine.X; + group->generator.raw.Y = affine.Y; + // |raw.Z| was set to 1 by |EC_GROUP_new_curve_GFp|. + group->has_order = 1; + return 1; +} + +EC_GROUP *EC_GROUP_new_by_curve_name(int nid) { + switch (nid) { + case NID_secp224r1: + return (EC_GROUP *)EC_group_p224(); + case NID_X9_62_prime256v1: + return (EC_GROUP *)EC_group_p256(); + case NID_secp384r1: + return (EC_GROUP *)EC_group_p384(); + case NID_secp521r1: + return (EC_GROUP *)EC_group_p521(); + default: + OPENSSL_PUT_ERROR(EC, EC_R_UNKNOWN_GROUP); + return nullptr; + } +} + +ECCustomGroup::ECCustomGroup(const EC_METHOD *m) + : ec_group_st({}), RefCounted(CheckSubClass()) { + meth = m; + bn_mont_ctx_init(&field); + bn_mont_ctx_init(&order); + generator.group = this; +} + +ECCustomGroup::~ECCustomGroup() { + bn_mont_ctx_cleanup(&order); + bn_mont_ctx_cleanup(&field); +} + +void EC_GROUP_free(EC_GROUP *group) { + if (group == nullptr || + // Built-in curves are static. + group->curve_name != NID_undef) { + return; + } + auto *custom = static_cast(group); + custom->DecRefInternal(); +} + +EC_GROUP *EC_GROUP_dup(const EC_GROUP *a) { + if (a == nullptr || + // Built-in curves are static. + a->curve_name != NID_undef) { + return (EC_GROUP *)a; + } + auto *custom = static_cast(a); + + // Groups are logically immutable (but for |EC_GROUP_set_generator| which must + // be called early on), so we simply take a reference. 
+ ECCustomGroup *group = const_cast(custom); + group->UpRefInternal(); + return group; +} + +int EC_GROUP_cmp(const EC_GROUP *a, const EC_GROUP *b, BN_CTX *ignored) { + // Note this function returns 0 if equal and non-zero otherwise. + if (a == b) { + return 0; + } + if (a->curve_name != b->curve_name) { + return 1; + } + if (a->curve_name != NID_undef) { + // Built-in curves may be compared by curve name alone. + return 0; + } + + // |a| and |b| are both custom curves. We compare the entire curve + // structure. If |a| or |b| is incomplete (due to legacy OpenSSL mistakes, + // custom curve construction is sadly done in two parts) but otherwise not the + // same object, we consider them always unequal. + return a->meth != b->meth || // + !a->has_order || !b->has_order || + BN_cmp(&a->order.N, &b->order.N) != 0 || + BN_cmp(&a->field.N, &b->field.N) != 0 || + !ec_felem_equal(a, &a->a, &b->a) || // + !ec_felem_equal(a, &a->b, &b->b) || + !ec_GFp_simple_points_equal(a, &a->generator.raw, &b->generator.raw); +} + +const EC_POINT *EC_GROUP_get0_generator(const EC_GROUP *group) { + return group->has_order ? &group->generator : nullptr; +} + +const BIGNUM *EC_GROUP_get0_order(const EC_GROUP *group) { + assert(group->has_order); + return &group->order.N; +} + +int EC_GROUP_get_order(const EC_GROUP *group, BIGNUM *order, BN_CTX *ctx) { + if (BN_copy(order, EC_GROUP_get0_order(group)) == nullptr) { + return 0; + } + return 1; +} + +int EC_GROUP_order_bits(const EC_GROUP *group) { + return BN_num_bits(&group->order.N); +} + +int EC_GROUP_get_cofactor(const EC_GROUP *group, BIGNUM *cofactor, + BN_CTX *ctx) { + // All |EC_GROUP|s have cofactor 1. 
+ return BN_set_word(cofactor, 1); +} + +int EC_GROUP_get_curve_GFp(const EC_GROUP *group, BIGNUM *out_p, BIGNUM *out_a, + BIGNUM *out_b, BN_CTX *ctx) { + return ec_GFp_simple_group_get_curve(group, out_p, out_a, out_b); +} + +int EC_GROUP_get_curve_name(const EC_GROUP *group) { return group->curve_name; } + +unsigned EC_GROUP_get_degree(const EC_GROUP *group) { + return BN_num_bits(&group->field.N); +} + +const char *EC_curve_nid2nist(int nid) { + switch (nid) { + case NID_secp224r1: + return "P-224"; + case NID_X9_62_prime256v1: + return "P-256"; + case NID_secp384r1: + return "P-384"; + case NID_secp521r1: + return "P-521"; + } + return nullptr; +} + +int EC_curve_nist2nid(const char *name) { + if (strcmp(name, "P-224") == 0) { + return NID_secp224r1; + } + if (strcmp(name, "P-256") == 0) { + return NID_X9_62_prime256v1; + } + if (strcmp(name, "P-384") == 0) { + return NID_secp384r1; + } + if (strcmp(name, "P-521") == 0) { + return NID_secp521r1; + } + return NID_undef; +} + +EC_POINT *EC_POINT_new(const EC_GROUP *group) { + if (group == nullptr) { + OPENSSL_PUT_ERROR(EC, ERR_R_PASSED_NULL_PARAMETER); + return nullptr; + } + + EC_POINT *ret = New(); + if (ret == nullptr) { + return nullptr; + } + + ret->group = EC_GROUP_dup(group); + ec_GFp_simple_point_init(&ret->raw); + return ret; +} + +static void ec_point_free(EC_POINT *point, int free_group) { + if (!point) { + return; + } + if (free_group) { + EC_GROUP_free(point->group); + } + Delete(point); +} + +void EC_POINT_free(EC_POINT *point) { + ec_point_free(point, 1 /* free group */); +} + +void EC_POINT_clear_free(EC_POINT *point) { EC_POINT_free(point); } + +int EC_POINT_copy(EC_POINT *dest, const EC_POINT *src) { + if (EC_GROUP_cmp(dest->group, src->group, nullptr) != 0) { + OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS); + return 0; + } + if (dest == src) { + return 1; + } + ec_GFp_simple_point_copy(&dest->raw, &src->raw); + return 1; +} + +EC_POINT *EC_POINT_dup(const EC_POINT *a, const EC_GROUP *group) 
{ + if (a == nullptr) { + return nullptr; + } + + EC_POINT *ret = EC_POINT_new(group); + if (ret == nullptr || !EC_POINT_copy(ret, a)) { + EC_POINT_free(ret); + return nullptr; + } + + return ret; +} + +int EC_POINT_set_to_infinity(const EC_GROUP *group, EC_POINT *point) { + if (EC_GROUP_cmp(group, point->group, nullptr) != 0) { + OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS); + return 0; + } + ec_GFp_simple_point_set_to_infinity(group, &point->raw); + return 1; +} + +int EC_POINT_is_at_infinity(const EC_GROUP *group, const EC_POINT *point) { + if (EC_GROUP_cmp(group, point->group, nullptr) != 0) { + OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS); + return 0; + } + return ec_GFp_simple_is_at_infinity(group, &point->raw); +} + +int EC_POINT_is_on_curve(const EC_GROUP *group, const EC_POINT *point, + BN_CTX *ctx) { + if (EC_GROUP_cmp(group, point->group, nullptr) != 0) { + OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS); + return 0; + } + return ec_GFp_simple_is_on_curve(group, &point->raw); +} + +int EC_POINT_cmp(const EC_GROUP *group, const EC_POINT *a, const EC_POINT *b, + BN_CTX *ctx) { + if (EC_GROUP_cmp(group, a->group, nullptr) != 0 || + EC_GROUP_cmp(group, b->group, nullptr) != 0) { + OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS); + return -1; + } + + // Note |EC_POINT_cmp| returns zero for equality and non-zero for inequality. + return ec_GFp_simple_points_equal(group, &a->raw, &b->raw) ? 0 : 1; +} + +int EC_POINT_get_affine_coordinates_GFp(const EC_GROUP *group, + const EC_POINT *point, BIGNUM *x, + BIGNUM *y, BN_CTX *ctx) { + if (group->meth->point_get_affine_coordinates == nullptr) { + OPENSSL_PUT_ERROR(EC, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); + return 0; + } + if (EC_GROUP_cmp(group, point->group, nullptr) != 0) { + OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS); + return 0; + } + EC_FELEM x_felem, y_felem; + if (!group->meth->point_get_affine_coordinates( + group, &point->raw, x == nullptr ? nullptr : &x_felem, + y == nullptr ? 
nullptr : &y_felem) || + (x != nullptr && !ec_felem_to_bignum(group, x, &x_felem)) || + (y != nullptr && !ec_felem_to_bignum(group, y, &y_felem))) { + return 0; + } + return 1; +} + +int EC_POINT_get_affine_coordinates(const EC_GROUP *group, + const EC_POINT *point, BIGNUM *x, BIGNUM *y, + BN_CTX *ctx) { + return EC_POINT_get_affine_coordinates_GFp(group, point, x, y, ctx); +} + +void bssl::ec_affine_to_jacobian(const EC_GROUP *group, EC_JACOBIAN *out, + const EC_AFFINE *p) { + out->X = p->X; + out->Y = p->Y; + out->Z = *ec_felem_one(group); +} + +int bssl::ec_jacobian_to_affine(const EC_GROUP *group, EC_AFFINE *out, + const EC_JACOBIAN *p) { + return group->meth->point_get_affine_coordinates(group, p, &out->X, &out->Y); +} + +int bssl::ec_jacobian_to_affine_batch(const EC_GROUP *group, EC_AFFINE *out, + const EC_JACOBIAN *in, size_t num) { + if (group->meth->jacobian_to_affine_batch == nullptr) { + OPENSSL_PUT_ERROR(EC, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); + return 0; + } + return group->meth->jacobian_to_affine_batch(group, out, in, num); +} + +int bssl::ec_point_set_affine_coordinates(const EC_GROUP *group, EC_AFFINE *out, + const EC_FELEM *x, + const EC_FELEM *y) { + // Check if the point is on the curve. + EC_FELEM lhs, rhs; + ec_felem_sqr(group, &lhs, y); // lhs = y^2 + ec_felem_sqr(group, &rhs, x); // rhs = x^2 + ec_felem_add(group, &rhs, &rhs, &group->a); // rhs = x^2 + a + ec_felem_mul(group, &rhs, &rhs, x); // rhs = x^3 + ax + ec_felem_add(group, &rhs, &rhs, &group->b); // rhs = x^3 + ax + b + if (!ec_felem_equal(group, &lhs, &rhs)) { + OPENSSL_PUT_ERROR(EC, EC_R_POINT_IS_NOT_ON_CURVE); + // In the event of an error, defend against the caller not checking the + // return value by setting a known safe value. Note this may not be possible + // if the caller is in the process of constructing an arbitrary group and + // the generator is missing. 
+ if (group->has_order) { + out->X = group->generator.raw.X; + out->Y = group->generator.raw.Y; + } + return 0; + } + + out->X = *x; + out->Y = *y; + return 1; +} + +int EC_POINT_set_affine_coordinates_GFp(const EC_GROUP *group, EC_POINT *point, + const BIGNUM *x, const BIGNUM *y, + BN_CTX *ctx) { + if (EC_GROUP_cmp(group, point->group, nullptr) != 0) { + OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS); + return 0; + } + + if (x == nullptr || y == nullptr) { + OPENSSL_PUT_ERROR(EC, ERR_R_PASSED_NULL_PARAMETER); + return 0; + } + + EC_FELEM x_felem, y_felem; + EC_AFFINE affine; + if (!ec_bignum_to_felem(group, &x_felem, x) || + !ec_bignum_to_felem(group, &y_felem, y) || + !ec_point_set_affine_coordinates(group, &affine, &x_felem, &y_felem)) { + // In the event of an error, defend against the caller not checking the + // return value by setting a known safe value. + ec_set_to_safe_point(group, &point->raw); + return 0; + } + + ec_affine_to_jacobian(group, &point->raw, &affine); + return 1; +} + +int EC_POINT_set_affine_coordinates(const EC_GROUP *group, EC_POINT *point, + const BIGNUM *x, const BIGNUM *y, + BN_CTX *ctx) { + return EC_POINT_set_affine_coordinates_GFp(group, point, x, y, ctx); +} + +int EC_POINT_add(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a, + const EC_POINT *b, BN_CTX *ctx) { + if (EC_GROUP_cmp(group, r->group, nullptr) != 0 || + EC_GROUP_cmp(group, a->group, nullptr) != 0 || + EC_GROUP_cmp(group, b->group, nullptr) != 0) { + OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS); + return 0; + } + group->meth->add(group, &r->raw, &a->raw, &b->raw); + return 1; +} + +int EC_POINT_dbl(const EC_GROUP *group, EC_POINT *r, const EC_POINT *a, + BN_CTX *ctx) { + if (EC_GROUP_cmp(group, r->group, nullptr) != 0 || + EC_GROUP_cmp(group, a->group, nullptr) != 0) { + OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS); + return 0; + } + group->meth->dbl(group, &r->raw, &a->raw); + return 1; +} + + +int EC_POINT_invert(const EC_GROUP *group, EC_POINT *a, 
BN_CTX *ctx) { + if (EC_GROUP_cmp(group, a->group, nullptr) != 0) { + OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS); + return 0; + } + ec_GFp_simple_invert(group, &a->raw); + return 1; +} + +static int arbitrary_bignum_to_scalar(const EC_GROUP *group, EC_SCALAR *out, + const BIGNUM *in, BN_CTX *ctx) { + if (ec_bignum_to_scalar(group, out, in)) { + return 1; + } + + ERR_clear_error(); + + // This is an unusual input, so we do not guarantee constant-time processing. + BN_CTXScope scope(ctx); + BIGNUM *tmp = BN_CTX_get(ctx); + return tmp != nullptr && BN_nnmod(tmp, in, EC_GROUP_get0_order(group), ctx) && + ec_bignum_to_scalar(group, out, tmp); +} + +int bssl::ec_point_mul_no_self_test(const EC_GROUP *group, EC_POINT *r, + const BIGNUM *g_scalar, const EC_POINT *p, + const BIGNUM *p_scalar, BN_CTX *ctx) { + // Previously, this function set |r| to the point at infinity if there was + // nothing to multiply. But, nobody should be calling this function with + // nothing to multiply in the first place. + if ((g_scalar == nullptr && p_scalar == nullptr) || + (p == nullptr) != (p_scalar == nullptr)) { + OPENSSL_PUT_ERROR(EC, ERR_R_PASSED_NULL_PARAMETER); + return 0; + } + + if (EC_GROUP_cmp(group, r->group, nullptr) != 0 || + (p != nullptr && EC_GROUP_cmp(group, p->group, nullptr) != 0)) { + OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS); + return 0; + } + + UniquePtr new_ctx; + if (ctx == nullptr) { + new_ctx.reset(BN_CTX_new()); + if (new_ctx == nullptr) { + return 0; + } + ctx = new_ctx.get(); + } + + // If both |g_scalar| and |p_scalar| are non-NULL, + // |ec_point_mul_scalar_public| would share the doublings between the two + // products, which would be more efficient. However, we conservatively assume + // the caller needs a constant-time operation. (ECDSA verification does not + // use this function.) + // + // Previously, the low-level constant-time multiplication function aligned + // with this function's calling convention, but this was misleading. 
Curves + // which combined the two multiplications did not avoid the doubling case + // in the incomplete addition formula and were not constant-time. + + if (g_scalar != nullptr) { + EC_SCALAR scalar; + if (!arbitrary_bignum_to_scalar(group, &scalar, g_scalar, ctx) || + !ec_point_mul_scalar_base(group, &r->raw, &scalar)) { + return 0; + } + } + + if (p_scalar != nullptr) { + EC_SCALAR scalar; + EC_JACOBIAN tmp; + if (!arbitrary_bignum_to_scalar(group, &scalar, p_scalar, ctx) || + !ec_point_mul_scalar(group, &tmp, &p->raw, &scalar)) { + return 0; + } + if (g_scalar == nullptr) { + OPENSSL_memcpy(&r->raw, &tmp, sizeof(EC_JACOBIAN)); + } else { + group->meth->add(group, &r->raw, &r->raw, &tmp); + } + } + + return 1; +} + +int EC_POINT_mul(const EC_GROUP *group, EC_POINT *r, const BIGNUM *g_scalar, + const EC_POINT *p, const BIGNUM *p_scalar, BN_CTX *ctx) { + boringssl_ensure_ecc_self_test(); + + return ec_point_mul_no_self_test(group, r, g_scalar, p, p_scalar, ctx); +} + +int bssl::ec_point_mul_scalar_public(const EC_GROUP *group, EC_JACOBIAN *r, + const EC_SCALAR *g_scalar, + const EC_JACOBIAN *p, + const EC_SCALAR *p_scalar) { + if (g_scalar == nullptr || p_scalar == nullptr || p == nullptr) { + OPENSSL_PUT_ERROR(EC, ERR_R_PASSED_NULL_PARAMETER); + return 0; + } + + if (group->meth->mul_public == nullptr) { + return group->meth->mul_public_batch(group, r, g_scalar, p, p_scalar, 1); + } + + group->meth->mul_public(group, r, g_scalar, p, p_scalar); + return 1; +} + +int bssl::ec_point_mul_scalar_public_batch( + const EC_GROUP *group, EC_JACOBIAN *r, const EC_SCALAR *g_scalar, + const EC_JACOBIAN *points, const EC_SCALAR *scalars, size_t num) { + if (group->meth->mul_public_batch == nullptr) { + OPENSSL_PUT_ERROR(EC, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); + return 0; + } + + return group->meth->mul_public_batch(group, r, g_scalar, points, scalars, + num); +} + +int bssl::ec_point_mul_scalar(const EC_GROUP *group, EC_JACOBIAN *r, + const EC_JACOBIAN *p, const EC_SCALAR 
*scalar) { + if (p == nullptr || scalar == nullptr) { + OPENSSL_PUT_ERROR(EC, ERR_R_PASSED_NULL_PARAMETER); + return 0; + } + + group->meth->mul(group, r, p, scalar); + + // Check the result is on the curve to defend against fault attacks or bugs. + // This has negligible cost compared to the multiplication. + if (!ec_GFp_simple_is_on_curve(group, r)) { + OPENSSL_PUT_ERROR(EC, ERR_R_INTERNAL_ERROR); + return 0; + } + + return 1; +} + +int bssl::ec_point_mul_scalar_base(const EC_GROUP *group, EC_JACOBIAN *r, + const EC_SCALAR *scalar) { + if (scalar == nullptr) { + OPENSSL_PUT_ERROR(EC, ERR_R_PASSED_NULL_PARAMETER); + return 0; + } + + group->meth->mul_base(group, r, scalar); + + // Check the result is on the curve to defend against fault attacks or bugs. + // This has negligible cost compared to the multiplication. This can only + // happen on bug or CPU fault, so it okay to leak this. The alternative would + // be to proceed with bad data. + if (!constant_time_declassify_int(ec_GFp_simple_is_on_curve(group, r))) { + OPENSSL_PUT_ERROR(EC, ERR_R_INTERNAL_ERROR); + return 0; + } + + return 1; +} + +int bssl::ec_point_mul_scalar_batch( + const EC_GROUP *group, EC_JACOBIAN *r, const EC_JACOBIAN *p0, + const EC_SCALAR *scalar0, const EC_JACOBIAN *p1, const EC_SCALAR *scalar1, + const EC_JACOBIAN *p2, const EC_SCALAR *scalar2) { + if (group->meth->mul_batch == nullptr) { + OPENSSL_PUT_ERROR(EC, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); + return 0; + } + + group->meth->mul_batch(group, r, p0, scalar0, p1, scalar1, p2, scalar2); + + // Check the result is on the curve to defend against fault attacks or bugs. + // This has negligible cost compared to the multiplication. 
+ if (!ec_GFp_simple_is_on_curve(group, r)) { + OPENSSL_PUT_ERROR(EC, ERR_R_INTERNAL_ERROR); + return 0; + } + + return 1; +} + +int bssl::ec_init_precomp(const EC_GROUP *group, EC_PRECOMP *out, + const EC_JACOBIAN *p) { + if (group->meth->init_precomp == nullptr) { + OPENSSL_PUT_ERROR(EC, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); + return 0; + } + + return group->meth->init_precomp(group, out, p); +} + +int bssl::ec_point_mul_scalar_precomp( + const EC_GROUP *group, EC_JACOBIAN *r, const EC_PRECOMP *p0, + const EC_SCALAR *scalar0, const EC_PRECOMP *p1, const EC_SCALAR *scalar1, + const EC_PRECOMP *p2, const EC_SCALAR *scalar2) { + if (group->meth->mul_precomp == nullptr) { + OPENSSL_PUT_ERROR(EC, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); + return 0; + } + + group->meth->mul_precomp(group, r, p0, scalar0, p1, scalar1, p2, scalar2); + + // Check the result is on the curve to defend against fault attacks or bugs. + // This has negligible cost compared to the multiplication. + if (!ec_GFp_simple_is_on_curve(group, r)) { + OPENSSL_PUT_ERROR(EC, ERR_R_INTERNAL_ERROR); + return 0; + } + + return 1; +} + +void bssl::ec_point_select(const EC_GROUP *group, EC_JACOBIAN *out, + BN_ULONG mask, const EC_JACOBIAN *a, + const EC_JACOBIAN *b) { + ec_felem_select(group, &out->X, mask, &a->X, &b->X); + ec_felem_select(group, &out->Y, mask, &a->Y, &b->Y); + ec_felem_select(group, &out->Z, mask, &a->Z, &b->Z); +} + +void bssl::ec_affine_select(const EC_GROUP *group, EC_AFFINE *out, + BN_ULONG mask, const EC_AFFINE *a, + const EC_AFFINE *b) { + ec_felem_select(group, &out->X, mask, &a->X, &b->X); + ec_felem_select(group, &out->Y, mask, &a->Y, &b->Y); +} + +void bssl::ec_precomp_select(const EC_GROUP *group, EC_PRECOMP *out, + BN_ULONG mask, const EC_PRECOMP *a, + const EC_PRECOMP *b) { + static_assert(sizeof(out->comb) == sizeof(*out), + "out->comb does not span the entire structure"); + for (size_t i = 0; i < std::size(out->comb); i++) { + ec_affine_select(group, &out->comb[i], mask, &a->comb[i], 
&b->comb[i]); + } +} + +int bssl::ec_cmp_x_coordinate(const EC_GROUP *group, const EC_JACOBIAN *p, + const EC_SCALAR *r) { + return group->meth->cmp_x_coordinate(group, p, r); +} + +int bssl::ec_get_x_coordinate_as_scalar(const EC_GROUP *group, EC_SCALAR *out, + const EC_JACOBIAN *p) { + uint8_t bytes[EC_MAX_BYTES]; + size_t len; + if (!ec_get_x_coordinate_as_bytes(group, bytes, &len, sizeof(bytes), p)) { + return 0; + } + + // The x-coordinate is bounded by p, but we need a scalar, bounded by the + // order. These may not have the same size. However, we must have p < 2×order, + // assuming p is not tiny (p >= 17). + // + // Thus |bytes| will fit in |order.width + 1| words, and we can reduce by + // performing at most one subtraction. + // + // Proof: We only work with prime order curves, so the number of points on + // the curve is the order. Thus Hasse's theorem gives: + // + // |order - (p + 1)| <= 2×sqrt(p) + // p + 1 - order <= 2×sqrt(p) + // p + 1 - 2×sqrt(p) <= order + // p + 1 - 2×(p/4) < order (p/4 > sqrt(p) for p >= 17) + // p/2 < p/2 + 1 < order + // p < 2×order + // + // Additionally, one can manually check this property for built-in curves. It + // is enforced for legacy custom curves in |EC_GROUP_set_generator|. 
+ const BIGNUM *order = EC_GROUP_get0_order(group); + BN_ULONG words[EC_MAX_WORDS + 1] = {0}; + bn_big_endian_to_words(words, order->width + 1, bytes, len); + bn_reduce_once(out->words, words, /*carry=*/words[order->width], order->d, + order->width); + return 1; +} + +int bssl::ec_get_x_coordinate_as_bytes(const EC_GROUP *group, uint8_t *out, + size_t *out_len, size_t max_out, + const EC_JACOBIAN *p) { + size_t len = BN_num_bytes(&group->field.N); + assert(len <= EC_MAX_BYTES); + if (max_out < len) { + OPENSSL_PUT_ERROR(EC, EC_R_BUFFER_TOO_SMALL); + return 0; + } + + EC_FELEM x; + if (!group->meth->point_get_affine_coordinates(group, p, &x, nullptr)) { + return 0; + } + + ec_felem_to_bytes(group, out, out_len, &x); + *out_len = len; + return 1; +} + +void bssl::ec_set_to_safe_point(const EC_GROUP *group, EC_JACOBIAN *out) { + if (group->has_order) { + ec_GFp_simple_point_copy(out, &group->generator.raw); + } else { + // The generator can be missing if the caller is in the process of + // constructing an arbitrary group. In this case, we give up and use the + // point at infinity. + ec_GFp_simple_point_set_to_infinity(group, out); + } +} + +void EC_GROUP_set_asn1_flag(EC_GROUP *group, int flag) {} + +int EC_GROUP_get_asn1_flag(const EC_GROUP *group) { + return OPENSSL_EC_NAMED_CURVE; +} + +const EC_METHOD *EC_GROUP_method_of(const EC_GROUP *group) { + // This function exists purely to give callers a way to call + // |EC_METHOD_get_field_type|. cryptography.io crashes if |EC_GROUP_method_of| + // returns NULL, so return some other garbage pointer. 
+ return (const EC_METHOD *)0x12340000; +} + +int EC_METHOD_get_field_type(const EC_METHOD *meth) { + return NID_X9_62_prime_field; +} + +void EC_GROUP_set_point_conversion_form(EC_GROUP *group, + point_conversion_form_t form) { + if (form != POINT_CONVERSION_UNCOMPRESSED) { + abort(); + } +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/ec/ec_key.c b/third_party/boringssl/src/crypto/fipsmodule/ec/ec_key.c deleted file mode 100644 index e676e3c0..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/ec/ec_key.c +++ /dev/null @@ -1,489 +0,0 @@ -/* Originally written by Bodo Moeller for the OpenSSL project. - * ==================================================================== - * Copyright (c) 1998-2005 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. 
Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). - * - */ -/* ==================================================================== - * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED. - * - * Portions of the attached software ("Contribution") are developed by - * SUN MICROSYSTEMS, INC., and are contributed to the OpenSSL project. - * - * The Contribution is licensed pursuant to the OpenSSL open source - * license provided above. - * - * The elliptic curve binary polynomial software is originally written by - * Sheueling Chang Shantz and Douglas Stebila of Sun Microsystems - * Laboratories. 
*/ - -#include - -#include - -#include -#include -#include -#include -#include -#include -#include - -#include "internal.h" -#include "../delocate.h" -#include "../service_indicator/internal.h" -#include "../../internal.h" - - -DEFINE_STATIC_EX_DATA_CLASS(g_ec_ex_data_class) - -static EC_WRAPPED_SCALAR *ec_wrapped_scalar_new(const EC_GROUP *group) { - EC_WRAPPED_SCALAR *wrapped = OPENSSL_malloc(sizeof(EC_WRAPPED_SCALAR)); - if (wrapped == NULL) { - OPENSSL_PUT_ERROR(EC, ERR_R_MALLOC_FAILURE); - return NULL; - } - - OPENSSL_memset(wrapped, 0, sizeof(EC_WRAPPED_SCALAR)); - wrapped->bignum.d = wrapped->scalar.words; - wrapped->bignum.width = group->order.width; - wrapped->bignum.dmax = group->order.width; - wrapped->bignum.flags = BN_FLG_STATIC_DATA; - return wrapped; -} - -static void ec_wrapped_scalar_free(EC_WRAPPED_SCALAR *scalar) { - OPENSSL_free(scalar); -} - -EC_KEY *EC_KEY_new(void) { return EC_KEY_new_method(NULL); } - -EC_KEY *EC_KEY_new_method(const ENGINE *engine) { - EC_KEY *ret = OPENSSL_malloc(sizeof(EC_KEY)); - if (ret == NULL) { - OPENSSL_PUT_ERROR(EC, ERR_R_MALLOC_FAILURE); - return NULL; - } - - OPENSSL_memset(ret, 0, sizeof(EC_KEY)); - - if (engine) { - ret->ecdsa_meth = ENGINE_get_ECDSA_method(engine); - } - if (ret->ecdsa_meth) { - METHOD_ref(ret->ecdsa_meth); - } - - ret->conv_form = POINT_CONVERSION_UNCOMPRESSED; - ret->references = 1; - - CRYPTO_new_ex_data(&ret->ex_data); - - if (ret->ecdsa_meth && ret->ecdsa_meth->init && !ret->ecdsa_meth->init(ret)) { - CRYPTO_free_ex_data(g_ec_ex_data_class_bss_get(), ret, &ret->ex_data); - if (ret->ecdsa_meth) { - METHOD_unref(ret->ecdsa_meth); - } - OPENSSL_free(ret); - return NULL; - } - - return ret; -} - -EC_KEY *EC_KEY_new_by_curve_name(int nid) { - EC_KEY *ret = EC_KEY_new(); - if (ret == NULL) { - OPENSSL_PUT_ERROR(EC, ERR_R_MALLOC_FAILURE); - return NULL; - } - ret->group = EC_GROUP_new_by_curve_name(nid); - if (ret->group == NULL) { - EC_KEY_free(ret); - return NULL; - } - return ret; -} - -void 
EC_KEY_free(EC_KEY *r) { - if (r == NULL) { - return; - } - - if (!CRYPTO_refcount_dec_and_test_zero(&r->references)) { - return; - } - - if (r->ecdsa_meth) { - if (r->ecdsa_meth->finish) { - r->ecdsa_meth->finish(r); - } - METHOD_unref(r->ecdsa_meth); - } - - EC_GROUP_free(r->group); - EC_POINT_free(r->pub_key); - ec_wrapped_scalar_free(r->priv_key); - - CRYPTO_free_ex_data(g_ec_ex_data_class_bss_get(), r, &r->ex_data); - - OPENSSL_free(r); -} - -EC_KEY *EC_KEY_dup(const EC_KEY *src) { - if (src == NULL) { - OPENSSL_PUT_ERROR(EC, ERR_R_PASSED_NULL_PARAMETER); - return NULL; - } - - EC_KEY *ret = EC_KEY_new(); - if (ret == NULL) { - return NULL; - } - - if ((src->group != NULL && - !EC_KEY_set_group(ret, src->group)) || - (src->pub_key != NULL && - !EC_KEY_set_public_key(ret, src->pub_key)) || - (src->priv_key != NULL && - !EC_KEY_set_private_key(ret, EC_KEY_get0_private_key(src)))) { - EC_KEY_free(ret); - return NULL; - } - - ret->enc_flag = src->enc_flag; - ret->conv_form = src->conv_form; - return ret; -} - -int EC_KEY_up_ref(EC_KEY *r) { - CRYPTO_refcount_inc(&r->references); - return 1; -} - -int EC_KEY_is_opaque(const EC_KEY *key) { - return key->ecdsa_meth && (key->ecdsa_meth->flags & ECDSA_FLAG_OPAQUE); -} - -const EC_GROUP *EC_KEY_get0_group(const EC_KEY *key) { return key->group; } - -int EC_KEY_set_group(EC_KEY *key, const EC_GROUP *group) { - // If |key| already has a group, it is an error to switch to another one. - if (key->group != NULL) { - if (EC_GROUP_cmp(key->group, group, NULL) != 0) { - OPENSSL_PUT_ERROR(EC, EC_R_GROUP_MISMATCH); - return 0; - } - return 1; - } - - assert(key->priv_key == NULL); - assert(key->pub_key == NULL); - - EC_GROUP_free(key->group); - key->group = EC_GROUP_dup(group); - return key->group != NULL; -} - -const BIGNUM *EC_KEY_get0_private_key(const EC_KEY *key) { - return key->priv_key != NULL ? 
&key->priv_key->bignum : NULL; -} - -int EC_KEY_set_private_key(EC_KEY *key, const BIGNUM *priv_key) { - if (key->group == NULL) { - OPENSSL_PUT_ERROR(EC, EC_R_MISSING_PARAMETERS); - return 0; - } - - EC_WRAPPED_SCALAR *scalar = ec_wrapped_scalar_new(key->group); - if (scalar == NULL) { - return 0; - } - if (!ec_bignum_to_scalar(key->group, &scalar->scalar, priv_key)) { - OPENSSL_PUT_ERROR(EC, EC_R_WRONG_ORDER); - ec_wrapped_scalar_free(scalar); - return 0; - } - ec_wrapped_scalar_free(key->priv_key); - key->priv_key = scalar; - return 1; -} - -const EC_POINT *EC_KEY_get0_public_key(const EC_KEY *key) { - return key->pub_key; -} - -int EC_KEY_set_public_key(EC_KEY *key, const EC_POINT *pub_key) { - if (key->group == NULL) { - OPENSSL_PUT_ERROR(EC, EC_R_MISSING_PARAMETERS); - return 0; - } - - if (pub_key != NULL && EC_GROUP_cmp(key->group, pub_key->group, NULL) != 0) { - OPENSSL_PUT_ERROR(EC, EC_R_GROUP_MISMATCH); - return 0; - } - - EC_POINT_free(key->pub_key); - key->pub_key = EC_POINT_dup(pub_key, key->group); - return (key->pub_key == NULL) ? 0 : 1; -} - -unsigned int EC_KEY_get_enc_flags(const EC_KEY *key) { return key->enc_flag; } - -void EC_KEY_set_enc_flags(EC_KEY *key, unsigned int flags) { - key->enc_flag = flags; -} - -point_conversion_form_t EC_KEY_get_conv_form(const EC_KEY *key) { - return key->conv_form; -} - -void EC_KEY_set_conv_form(EC_KEY *key, point_conversion_form_t cform) { - key->conv_form = cform; -} - -int EC_KEY_check_key(const EC_KEY *eckey) { - if (!eckey || !eckey->group || !eckey->pub_key) { - OPENSSL_PUT_ERROR(EC, ERR_R_PASSED_NULL_PARAMETER); - return 0; - } - - if (EC_POINT_is_at_infinity(eckey->group, eckey->pub_key)) { - OPENSSL_PUT_ERROR(EC, EC_R_POINT_AT_INFINITY); - return 0; - } - - // Test whether the public key is on the elliptic curve. - if (!EC_POINT_is_on_curve(eckey->group, eckey->pub_key, NULL)) { - OPENSSL_PUT_ERROR(EC, EC_R_POINT_IS_NOT_ON_CURVE); - return 0; - } - - // Check the public and private keys match. 
- // - // NOTE: this is a FIPS pair-wise consistency check for the ECDH case. See SP - // 800-56Ar3, page 36. - if (eckey->priv_key != NULL) { - EC_RAW_POINT point; - if (!ec_point_mul_scalar_base(eckey->group, &point, - &eckey->priv_key->scalar)) { - OPENSSL_PUT_ERROR(EC, ERR_R_EC_LIB); - return 0; - } - if (!ec_GFp_simple_points_equal(eckey->group, &point, - &eckey->pub_key->raw)) { - OPENSSL_PUT_ERROR(EC, EC_R_INVALID_PRIVATE_KEY); - return 0; - } - } - - return 1; -} - -int EC_KEY_check_fips(const EC_KEY *key) { - int ret = 0; - FIPS_service_indicator_lock_state(); - - if (EC_KEY_is_opaque(key)) { - // Opaque keys can't be checked. - OPENSSL_PUT_ERROR(EC, EC_R_PUBLIC_KEY_VALIDATION_FAILED); - goto end; - } - - if (!EC_KEY_check_key(key)) { - goto end; - } - - if (key->priv_key) { - uint8_t data[16] = {0}; - ECDSA_SIG *sig = ECDSA_do_sign(data, sizeof(data), key); - if (boringssl_fips_break_test("ECDSA_PWCT")) { - data[0] = ~data[0]; - } - int ok = sig != NULL && - ECDSA_do_verify(data, sizeof(data), sig, key); - ECDSA_SIG_free(sig); - if (!ok) { - OPENSSL_PUT_ERROR(EC, EC_R_PUBLIC_KEY_VALIDATION_FAILED); - goto end; - } - } - - ret = 1; - -end: - FIPS_service_indicator_unlock_state(); - if (ret) { - EC_KEY_keygen_verify_service_indicator(key); - } - - return ret; -} - -int EC_KEY_set_public_key_affine_coordinates(EC_KEY *key, const BIGNUM *x, - const BIGNUM *y) { - EC_POINT *point = NULL; - int ok = 0; - - if (!key || !key->group || !x || !y) { - OPENSSL_PUT_ERROR(EC, ERR_R_PASSED_NULL_PARAMETER); - return 0; - } - - point = EC_POINT_new(key->group); - if (point == NULL || - !EC_POINT_set_affine_coordinates_GFp(key->group, point, x, y, NULL) || - !EC_KEY_set_public_key(key, point) || - !EC_KEY_check_key(key)) { - goto err; - } - - ok = 1; - -err: - EC_POINT_free(point); - return ok; -} - -size_t EC_KEY_key2buf(const EC_KEY *key, point_conversion_form_t form, - unsigned char **out_buf, BN_CTX *ctx) { - if (key == NULL || key->pub_key == NULL || key->group == 
NULL) { - return 0; - } - - const size_t len = - EC_POINT_point2oct(key->group, key->pub_key, form, NULL, 0, ctx); - if (len == 0) { - return 0; - } - - uint8_t *buf = OPENSSL_malloc(len); - if (buf == NULL) { - return 0; - } - - if (EC_POINT_point2oct(key->group, key->pub_key, form, buf, len, ctx) != - len) { - OPENSSL_free(buf); - return 0; - } - - *out_buf = buf; - return len; -} - -int EC_KEY_generate_key(EC_KEY *key) { - if (key == NULL || key->group == NULL) { - OPENSSL_PUT_ERROR(EC, ERR_R_PASSED_NULL_PARAMETER); - return 0; - } - - // Check that the group order is FIPS compliant (FIPS 186-4 B.4.2). - if (BN_num_bits(EC_GROUP_get0_order(key->group)) < 160) { - OPENSSL_PUT_ERROR(EC, EC_R_INVALID_GROUP_ORDER); - return 0; - } - - static const uint8_t kDefaultAdditionalData[32] = {0}; - EC_WRAPPED_SCALAR *priv_key = ec_wrapped_scalar_new(key->group); - EC_POINT *pub_key = EC_POINT_new(key->group); - if (priv_key == NULL || pub_key == NULL || - // Generate the private key by testing candidates (FIPS 186-4 B.4.2). 
- !ec_random_nonzero_scalar(key->group, &priv_key->scalar, - kDefaultAdditionalData) || - !ec_point_mul_scalar_base(key->group, &pub_key->raw, &priv_key->scalar)) { - EC_POINT_free(pub_key); - ec_wrapped_scalar_free(priv_key); - return 0; - } - - ec_wrapped_scalar_free(key->priv_key); - key->priv_key = priv_key; - EC_POINT_free(key->pub_key); - key->pub_key = pub_key; - return 1; -} - -int EC_KEY_generate_key_fips(EC_KEY *eckey) { - boringssl_ensure_ecc_self_test(); - - if (EC_KEY_generate_key(eckey) && EC_KEY_check_fips(eckey)) { - return 1; - } - - EC_POINT_free(eckey->pub_key); - ec_wrapped_scalar_free(eckey->priv_key); - eckey->pub_key = NULL; - eckey->priv_key = NULL; - return 0; -} - -int EC_KEY_get_ex_new_index(long argl, void *argp, CRYPTO_EX_unused *unused, - CRYPTO_EX_dup *dup_unused, - CRYPTO_EX_free *free_func) { - int index; - if (!CRYPTO_get_ex_new_index(g_ec_ex_data_class_bss_get(), &index, argl, argp, - free_func)) { - return -1; - } - return index; -} - -int EC_KEY_set_ex_data(EC_KEY *d, int idx, void *arg) { - return CRYPTO_set_ex_data(&d->ex_data, idx, arg); -} - -void *EC_KEY_get_ex_data(const EC_KEY *d, int idx) { - return CRYPTO_get_ex_data(&d->ex_data, idx); -} - -void EC_KEY_set_asn1_flag(EC_KEY *key, int flag) {} diff --git a/third_party/boringssl/src/crypto/fipsmodule/ec/ec_key.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/ec/ec_key.cc.inc new file mode 100644 index 00000000..e666baf1 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/ec/ec_key.cc.inc @@ -0,0 +1,536 @@ +// Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved. +// Copyright (c) 2002, Oracle and/or its affiliates. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include "../../internal.h" +#include "../../mem_internal.h" +#include "../bcm_interface.h" +#include "../delocate.h" +#include "../ecdsa/internal.h" +#include "../service_indicator/internal.h" +#include "internal.h" + + +using namespace bssl; + +DEFINE_STATIC_EX_DATA_CLASS(g_ec_ex_data_class) + +static bssl::EC_WRAPPED_SCALAR *ec_wrapped_scalar_new(const EC_GROUP *group) { + EC_WRAPPED_SCALAR *wrapped = New(); + if (wrapped == nullptr) { + return nullptr; + } + + wrapped->bignum.d = wrapped->scalar.words; + wrapped->bignum.width = group->order.N.width; + wrapped->bignum.dmax = group->order.N.width; + wrapped->bignum.flags = BN_FLG_STATIC_DATA; + return wrapped; +} + +static void ec_wrapped_scalar_free(EC_WRAPPED_SCALAR *scalar) { + Delete(scalar); +} + +ECKey::ECKey(const ENGINE *engine) + : RefCounted(CheckSubClass()), + ecdsa_meth(engine ? 
ENGINE_get_ECDSA_method(engine) : nullptr) { + if (ecdsa_meth) { + METHOD_ref(ecdsa_meth); + } + CRYPTO_new_ex_data(&ex_data); +} + +EC_KEY *EC_KEY_new() { return EC_KEY_new_method(nullptr); } + +EC_KEY *EC_KEY_new_method(const ENGINE *engine) { + UniquePtr ret(New(engine)); + if (ret == nullptr) { + return nullptr; + } + if (ret->ecdsa_meth && ret->ecdsa_meth->init && + !ret->ecdsa_meth->init(ret.get())) { + METHOD_unref(ret->ecdsa_meth); + ret->ecdsa_meth = nullptr; + return nullptr; + } + return ret.release(); +} + +EC_KEY *EC_KEY_new_by_curve_name(int nid) { + ECKey *ret = FromOpaque(EC_KEY_new()); + if (ret == nullptr) { + return nullptr; + } + ret->group = EC_GROUP_new_by_curve_name(nid); + if (ret->group == nullptr) { + EC_KEY_free(ret); + return nullptr; + } + return ret; +} + +ECKey::~ECKey() { + if (ecdsa_meth) { + if (ecdsa_meth->finish) { + ecdsa_meth->finish(this); + } + METHOD_unref(ecdsa_meth); + } + + CRYPTO_free_ex_data(g_ec_ex_data_class_bss_get(), &ex_data); + + EC_GROUP_free(group); + EC_POINT_free(pub_key); + ec_wrapped_scalar_free(priv_key); +} + +void EC_KEY_free(EC_KEY *r) { + if (r == nullptr) { + return; + } + + auto *impl = FromOpaque(r); + impl->DecRefInternal(); +} + +EC_KEY *EC_KEY_dup(const EC_KEY *src) { + if (src == nullptr) { + OPENSSL_PUT_ERROR(EC, ERR_R_PASSED_NULL_PARAMETER); + return nullptr; + } + + UniquePtr ret(FromOpaque(EC_KEY_new())); + if (ret == nullptr) { + return nullptr; + } + + auto *impl = FromOpaque(src); + if ((impl->group != nullptr && !EC_KEY_set_group(ret.get(), impl->group)) || + (impl->pub_key != nullptr && + !EC_KEY_set_public_key(ret.get(), impl->pub_key)) || + (impl->priv_key != nullptr && + !EC_KEY_set_private_key(ret.get(), EC_KEY_get0_private_key(impl)))) { + return nullptr; + } + + ret->enc_flag = impl->enc_flag; + ret->conv_form = impl->conv_form; + return ret.release(); +} + +int EC_KEY_up_ref(EC_KEY *r) { + auto *impl = FromOpaque(r); + impl->UpRefInternal(); + return 1; +} + +int 
EC_KEY_is_opaque(const EC_KEY *key) { + auto *impl = FromOpaque(key); + return impl->ecdsa_meth && (impl->ecdsa_meth->flags & ECDSA_FLAG_OPAQUE); +} + +const EC_GROUP *EC_KEY_get0_group(const EC_KEY *key) { + auto *impl = FromOpaque(key); + return impl->group; +} + +int EC_KEY_set_group(EC_KEY *key, const EC_GROUP *group) { + auto *impl = FromOpaque(key); + + // If |impl| already has a group, it is an error to switch to another one. + if (impl->group != nullptr) { + if (EC_GROUP_cmp(impl->group, group, nullptr) != 0) { + OPENSSL_PUT_ERROR(EC, EC_R_GROUP_MISMATCH); + return 0; + } + return 1; + } + + assert(impl->priv_key == nullptr); + assert(impl->pub_key == nullptr); + + EC_GROUP_free(impl->group); + impl->group = EC_GROUP_dup(group); + return impl->group != nullptr; +} + +const BIGNUM *EC_KEY_get0_private_key(const EC_KEY *key) { + auto *impl = FromOpaque(key); + return impl->priv_key != nullptr ? &impl->priv_key->bignum : nullptr; +} + +int EC_KEY_set_private_key(EC_KEY *key, const BIGNUM *priv_key) { + auto *impl = FromOpaque(key); + if (impl->group == nullptr) { + OPENSSL_PUT_ERROR(EC, EC_R_MISSING_PARAMETERS); + return 0; + } + + EC_WRAPPED_SCALAR *scalar = ec_wrapped_scalar_new(impl->group); + if (scalar == nullptr) { + return 0; + } + if (!ec_bignum_to_scalar(impl->group, &scalar->scalar, priv_key) || + // Zero is not a valid private key, so it is safe to leak the result of + // this comparison. 
+ constant_time_declassify_int( + ec_scalar_is_zero(impl->group, &scalar->scalar))) { + OPENSSL_PUT_ERROR(EC, EC_R_INVALID_PRIVATE_KEY); + ec_wrapped_scalar_free(scalar); + return 0; + } + ec_wrapped_scalar_free(impl->priv_key); + impl->priv_key = scalar; + return 1; +} + +const EC_POINT *EC_KEY_get0_public_key(const EC_KEY *key) { + auto *impl = FromOpaque(key); + return impl->pub_key; +} + +int EC_KEY_set_public_key(EC_KEY *key, const EC_POINT *pub_key) { + auto *impl = FromOpaque(key); + if (impl->group == nullptr) { + OPENSSL_PUT_ERROR(EC, EC_R_MISSING_PARAMETERS); + return 0; + } + + if (pub_key != nullptr && EC_POINT_is_at_infinity(pub_key->group, pub_key)) { + OPENSSL_PUT_ERROR(EC, EC_R_POINT_AT_INFINITY); + return 0; + } + + if (pub_key != nullptr && + EC_GROUP_cmp(impl->group, pub_key->group, nullptr) != 0) { + OPENSSL_PUT_ERROR(EC, EC_R_GROUP_MISMATCH); + return 0; + } + + EC_POINT_free(impl->pub_key); + impl->pub_key = EC_POINT_dup(pub_key, impl->group); + return (impl->pub_key == nullptr) ? 0 : 1; +} + +unsigned int EC_KEY_get_enc_flags(const EC_KEY *key) { + auto *impl = FromOpaque(key); + return impl->enc_flag; +} + +void EC_KEY_set_enc_flags(EC_KEY *key, unsigned int flags) { + auto *impl = FromOpaque(key); + impl->enc_flag = flags; +} + +point_conversion_form_t EC_KEY_get_conv_form(const EC_KEY *key) { + auto *impl = FromOpaque(key); + return impl->conv_form; +} + +void EC_KEY_set_conv_form(EC_KEY *key, point_conversion_form_t cform) { + auto *impl = FromOpaque(key); + impl->conv_form = cform; +} + +int EC_KEY_check_key(const EC_KEY *eckey) { + auto *impl = FromOpaque(eckey); + + if (!eckey || !impl->group || !impl->pub_key) { + OPENSSL_PUT_ERROR(EC, ERR_R_PASSED_NULL_PARAMETER); + return 0; + } + + if (EC_POINT_is_at_infinity(impl->group, impl->pub_key)) { + OPENSSL_PUT_ERROR(EC, EC_R_POINT_AT_INFINITY); + return 0; + } + + // Test whether the public key is on the elliptic curve. 
+ if (!EC_POINT_is_on_curve(impl->group, impl->pub_key, nullptr)) { + OPENSSL_PUT_ERROR(EC, EC_R_POINT_IS_NOT_ON_CURVE); + return 0; + } + + // Check the public and private keys match. + // + // NOTE: this is a FIPS pair-wise consistency check for the ECDH case. See SP + // 800-56Ar3, page 36. + if (impl->priv_key != nullptr) { + EC_JACOBIAN point; + if (!ec_point_mul_scalar_base(impl->group, &point, + &impl->priv_key->scalar)) { + OPENSSL_PUT_ERROR(EC, ERR_R_EC_LIB); + return 0; + } + // Leaking this comparison only leaks whether |eckey|'s public key was + // correct. + if (!constant_time_declassify_int(ec_GFp_simple_points_equal( + impl->group, &point, &impl->pub_key->raw))) { + OPENSSL_PUT_ERROR(EC, EC_R_INVALID_PRIVATE_KEY); + return 0; + } + } + + return 1; +} + +int EC_KEY_check_fips(const EC_KEY *key) { + auto *impl = FromOpaque(key); + + int ret = 0; + FIPS_service_indicator_lock_state(); + + if (!EC_KEY_check_key(impl)) { + goto end; + } + + if (impl->priv_key) { + uint8_t digest[SHA256_DIGEST_LENGTH] = {0}; + uint8_t sig[ECDSA_MAX_FIXED_LEN]; + size_t sig_len; + if (!ecdsa_sign_fixed(digest, sizeof(digest), sig, &sig_len, sizeof(sig), + impl)) { + goto end; + } + if (boringssl_fips_break_test("ECDSA_PWCT")) { + digest[0] = ~digest[0]; + } + if (!ecdsa_verify_fixed(digest, sizeof(digest), sig, sig_len, impl)) { + OPENSSL_PUT_ERROR(EC, EC_R_PUBLIC_KEY_VALIDATION_FAILED); + goto end; + } + } + + ret = 1; + +end: + FIPS_service_indicator_unlock_state(); + if (ret) { + EC_KEY_keygen_verify_service_indicator(impl); + } + + return ret; +} + +int EC_KEY_set_public_key_affine_coordinates(EC_KEY *key, const BIGNUM *x, + const BIGNUM *y) { + auto *impl = FromOpaque(key); + + if (!key || !impl->group || !x || !y) { + OPENSSL_PUT_ERROR(EC, ERR_R_PASSED_NULL_PARAMETER); + return 0; + } + + UniquePtr point(EC_POINT_new(impl->group)); + if (point == nullptr || + !EC_POINT_set_affine_coordinates_GFp(impl->group, point.get(), x, y, + nullptr) || + 
!EC_KEY_set_public_key(key, point.get()) || // + !EC_KEY_check_key(key)) { + return 0; + } + + return 1; +} + +int EC_KEY_oct2key(EC_KEY *key, const uint8_t *in, size_t len, BN_CTX *ctx) { + auto *impl = FromOpaque(key); + + if (impl->group == nullptr) { + OPENSSL_PUT_ERROR(EC, EC_R_MISSING_PARAMETERS); + return 0; + } + + UniquePtr point(EC_POINT_new(impl->group)); + return point != nullptr && + EC_POINT_oct2point(impl->group, point.get(), in, len, ctx) && + EC_KEY_set_public_key(key, point.get()); +} + +size_t EC_KEY_key2buf(const EC_KEY *key, point_conversion_form_t form, + uint8_t **out_buf, BN_CTX *ctx) { + auto *impl = FromOpaque(key); + + if (impl == nullptr || impl->pub_key == nullptr || impl->group == nullptr) { + OPENSSL_PUT_ERROR(EC, EC_R_MISSING_PARAMETERS); + return 0; + } + + return EC_POINT_point2buf(impl->group, impl->pub_key, form, out_buf, ctx); +} + +int EC_KEY_oct2priv(EC_KEY *key, const uint8_t *in, size_t len) { + auto *impl = FromOpaque(key); + + if (impl->group == nullptr) { + OPENSSL_PUT_ERROR(EC, EC_R_MISSING_PARAMETERS); + return 0; + } + + if (len != BN_num_bytes(EC_GROUP_get0_order(impl->group))) { + OPENSSL_PUT_ERROR(EC, EC_R_DECODE_ERROR); + return 0; + } + + BIGNUM *priv_key = BN_bin2bn(in, len, nullptr); + int ok = priv_key != nullptr && // + EC_KEY_set_private_key(key, priv_key); + BN_free(priv_key); + return ok; +} + +size_t EC_KEY_priv2oct(const EC_KEY *key, uint8_t *out, size_t max_out) { + auto *impl = FromOpaque(key); + + if (impl->group == nullptr || impl->priv_key == nullptr) { + OPENSSL_PUT_ERROR(EC, EC_R_MISSING_PARAMETERS); + return 0; + } + + size_t len = BN_num_bytes(EC_GROUP_get0_order(impl->group)); + if (out == nullptr) { + return len; + } + + if (max_out < len) { + OPENSSL_PUT_ERROR(EC, EC_R_BUFFER_TOO_SMALL); + return 0; + } + + size_t bytes_written; + ec_scalar_to_bytes(impl->group, out, &bytes_written, &impl->priv_key->scalar); + assert(bytes_written == len); + return len; +} + +size_t EC_KEY_priv2buf(const 
EC_KEY *key, uint8_t **out_buf) { + *out_buf = nullptr; + size_t len = EC_KEY_priv2oct(key, nullptr, 0); + if (len == 0) { + return 0; + } + + uint8_t *buf = reinterpret_cast(OPENSSL_malloc(len)); + if (buf == nullptr) { + return 0; + } + + len = EC_KEY_priv2oct(key, buf, len); + if (len == 0) { + OPENSSL_free(buf); + return 0; + } + + *out_buf = buf; + return len; +} + +int EC_KEY_generate_key(EC_KEY *key) { + auto *impl = FromOpaque(key); + + if (impl == nullptr || impl->group == nullptr) { + OPENSSL_PUT_ERROR(EC, ERR_R_PASSED_NULL_PARAMETER); + return 0; + } + + // Generate an ECDSA key pair via rejection sampling. This function implements + // FIPS 186-5, A.2.2, repeating the process on failure. + + // Check the group order is large enough. See step 1 of FIPS 186-5, A.2.2. + if (EC_GROUP_order_bits(impl->group) < 224) { + OPENSSL_PUT_ERROR(EC, EC_R_INVALID_GROUP_ORDER); + return 0; + } + + static const uint8_t kDefaultAdditionalData[32] = {0}; + EC_WRAPPED_SCALAR *priv_key = ec_wrapped_scalar_new(impl->group); + EC_POINT *pub_key = EC_POINT_new(impl->group); + if (priv_key == nullptr || pub_key == nullptr || + !ec_random_nonzero_scalar(impl->group, &priv_key->scalar, + kDefaultAdditionalData) || + !ec_point_mul_scalar_base(impl->group, &pub_key->raw, + &priv_key->scalar)) { + EC_POINT_free(pub_key); + ec_wrapped_scalar_free(priv_key); + return 0; + } + + // The public key is derived from the private key, but it is public. + // + // TODO(crbug.com/boringssl/677): This isn't quite right. While |pub_key| + // represents a public point, it is still in Jacobian form and the exact + // Jacobian representation is secret. We need to make it affine first. See + // discussion in the bug. 
+ CONSTTIME_DECLASSIFY(&pub_key->raw, sizeof(pub_key->raw)); + + ec_wrapped_scalar_free(impl->priv_key); + impl->priv_key = priv_key; + EC_POINT_free(impl->pub_key); + impl->pub_key = pub_key; + return 1; +} + +int EC_KEY_generate_key_fips(EC_KEY *eckey) { + auto *impl = FromOpaque(eckey); + + if (impl == nullptr || impl->group == nullptr) { + OPENSSL_PUT_ERROR(EC, ERR_R_PASSED_NULL_PARAMETER); + return 0; + } + + boringssl_ensure_ecc_self_test(); + + if (EC_KEY_generate_key(impl) && EC_KEY_check_fips(impl)) { + return 1; + } + + EC_POINT_free(impl->pub_key); + ec_wrapped_scalar_free(impl->priv_key); + impl->pub_key = nullptr; + impl->priv_key = nullptr; + return 0; +} + +int EC_KEY_get_ex_new_index(long argl, void *argp, CRYPTO_EX_unused *unused, + CRYPTO_EX_dup *dup_unused, + CRYPTO_EX_free *free_func) { + return CRYPTO_get_ex_new_index_ex(g_ec_ex_data_class_bss_get(), argl, argp, + free_func); +} + +int EC_KEY_set_ex_data(EC_KEY *d, int idx, void *arg) { + auto *impl = FromOpaque(d); + + return CRYPTO_set_ex_data(&impl->ex_data, idx, arg); +} + +void *EC_KEY_get_ex_data(const EC_KEY *d, int idx) { + auto *impl = FromOpaque(d); + + return CRYPTO_get_ex_data(&impl->ex_data, idx); +} + +void EC_KEY_set_asn1_flag(EC_KEY *key, int flag) {} diff --git a/third_party/boringssl/src/crypto/fipsmodule/ec/ec_montgomery.c b/third_party/boringssl/src/crypto/fipsmodule/ec/ec_montgomery.c deleted file mode 100644 index 21d5d40d..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/ec/ec_montgomery.c +++ /dev/null @@ -1,524 +0,0 @@ -/* Originally written by Bodo Moeller and Nils Larsch for the OpenSSL project. - * ==================================================================== - * Copyright (c) 1998-2005 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. 
Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). - * - */ -/* ==================================================================== - * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED. - * - * Portions of the attached software ("Contribution") are developed by - * SUN MICROSYSTEMS, INC., and are contributed to the OpenSSL project. - * - * The Contribution is licensed pursuant to the OpenSSL open source - * license provided above. - * - * The elliptic curve binary polynomial software is originally written by - * Sheueling Chang Shantz and Douglas Stebila of Sun Microsystems - * Laboratories. 
*/ - -#include - -#include -#include -#include - -#include "../bn/internal.h" -#include "../delocate.h" -#include "internal.h" - - -int ec_GFp_mont_group_init(EC_GROUP *group) { - int ok; - - ok = ec_GFp_simple_group_init(group); - group->mont = NULL; - return ok; -} - -void ec_GFp_mont_group_finish(EC_GROUP *group) { - BN_MONT_CTX_free(group->mont); - group->mont = NULL; - ec_GFp_simple_group_finish(group); -} - -int ec_GFp_mont_group_set_curve(EC_GROUP *group, const BIGNUM *p, - const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx) { - BN_MONT_CTX_free(group->mont); - group->mont = BN_MONT_CTX_new_for_modulus(p, ctx); - if (group->mont == NULL) { - OPENSSL_PUT_ERROR(EC, ERR_R_BN_LIB); - return 0; - } - - if (!ec_GFp_simple_group_set_curve(group, p, a, b, ctx)) { - BN_MONT_CTX_free(group->mont); - group->mont = NULL; - return 0; - } - - return 1; -} - -static void ec_GFp_mont_felem_to_montgomery(const EC_GROUP *group, - EC_FELEM *out, const EC_FELEM *in) { - bn_to_montgomery_small(out->words, in->words, group->field.width, - group->mont); -} - -static void ec_GFp_mont_felem_from_montgomery(const EC_GROUP *group, - EC_FELEM *out, - const EC_FELEM *in) { - bn_from_montgomery_small(out->words, group->field.width, in->words, - group->field.width, group->mont); -} - -static void ec_GFp_mont_felem_inv0(const EC_GROUP *group, EC_FELEM *out, - const EC_FELEM *a) { - bn_mod_inverse0_prime_mont_small(out->words, a->words, group->field.width, - group->mont); -} - -void ec_GFp_mont_felem_mul(const EC_GROUP *group, EC_FELEM *r, - const EC_FELEM *a, const EC_FELEM *b) { - bn_mod_mul_montgomery_small(r->words, a->words, b->words, group->field.width, - group->mont); -} - -void ec_GFp_mont_felem_sqr(const EC_GROUP *group, EC_FELEM *r, - const EC_FELEM *a) { - bn_mod_mul_montgomery_small(r->words, a->words, a->words, group->field.width, - group->mont); -} - -void ec_GFp_mont_felem_to_bytes(const EC_GROUP *group, uint8_t *out, - size_t *out_len, const EC_FELEM *in) { - EC_FELEM tmp; - 
ec_GFp_mont_felem_from_montgomery(group, &tmp, in); - ec_GFp_simple_felem_to_bytes(group, out, out_len, &tmp); -} - -int ec_GFp_mont_felem_from_bytes(const EC_GROUP *group, EC_FELEM *out, - const uint8_t *in, size_t len) { - if (!ec_GFp_simple_felem_from_bytes(group, out, in, len)) { - return 0; - } - - ec_GFp_mont_felem_to_montgomery(group, out, out); - return 1; -} - -static void ec_GFp_mont_felem_reduce(const EC_GROUP *group, EC_FELEM *out, - const BN_ULONG *words, size_t num) { - // Convert "from" Montgomery form so the value is reduced mod p. - bn_from_montgomery_small(out->words, group->field.width, words, num, - group->mont); - // Convert "to" Montgomery form to remove the R^-1 factor added. - ec_GFp_mont_felem_to_montgomery(group, out, out); - // Convert to Montgomery form to match this implementation's representation. - ec_GFp_mont_felem_to_montgomery(group, out, out); -} - -static void ec_GFp_mont_felem_exp(const EC_GROUP *group, EC_FELEM *out, - const EC_FELEM *a, const BN_ULONG *exp, - size_t num_exp) { - bn_mod_exp_mont_small(out->words, a->words, group->field.width, exp, num_exp, - group->mont); -} - -static int ec_GFp_mont_point_get_affine_coordinates(const EC_GROUP *group, - const EC_RAW_POINT *point, - EC_FELEM *x, EC_FELEM *y) { - if (ec_GFp_simple_is_at_infinity(group, point)) { - OPENSSL_PUT_ERROR(EC, EC_R_POINT_AT_INFINITY); - return 0; - } - - // Transform (X, Y, Z) into (x, y) := (X/Z^2, Y/Z^3). Note the check above - // ensures |point->Z| is non-zero, so the inverse always exists. 
- EC_FELEM z1, z2; - ec_GFp_mont_felem_inv0(group, &z2, &point->Z); - ec_GFp_mont_felem_sqr(group, &z1, &z2); - - if (x != NULL) { - ec_GFp_mont_felem_mul(group, x, &point->X, &z1); - } - - if (y != NULL) { - ec_GFp_mont_felem_mul(group, &z1, &z1, &z2); - ec_GFp_mont_felem_mul(group, y, &point->Y, &z1); - } - - return 1; -} - -static int ec_GFp_mont_jacobian_to_affine_batch(const EC_GROUP *group, - EC_AFFINE *out, - const EC_RAW_POINT *in, - size_t num) { - if (num == 0) { - return 1; - } - - // Compute prefix products of all Zs. Use |out[i].X| as scratch space - // to store these values. - out[0].X = in[0].Z; - for (size_t i = 1; i < num; i++) { - ec_GFp_mont_felem_mul(group, &out[i].X, &out[i - 1].X, &in[i].Z); - } - - // Some input was infinity iff the product of all Zs is zero. - if (ec_felem_non_zero_mask(group, &out[num - 1].X) == 0) { - OPENSSL_PUT_ERROR(EC, EC_R_POINT_AT_INFINITY); - return 0; - } - - // Invert the product of all Zs. - EC_FELEM zinvprod; - ec_GFp_mont_felem_inv0(group, &zinvprod, &out[num - 1].X); - for (size_t i = num - 1; i < num; i--) { - // Our loop invariant is that |zinvprod| is Z0^-1 * Z1^-1 * ... * Zi^-1. - // Recover Zi^-1 by multiplying by the previous product. - EC_FELEM zinv, zinv2; - if (i == 0) { - zinv = zinvprod; - } else { - ec_GFp_mont_felem_mul(group, &zinv, &zinvprod, &out[i - 1].X); - // Maintain the loop invariant for the next iteration. - ec_GFp_mont_felem_mul(group, &zinvprod, &zinvprod, &in[i].Z); - } - - // Compute affine coordinates: x = X * Z^-2 and y = Y * Z^-3. 
- ec_GFp_mont_felem_sqr(group, &zinv2, &zinv); - ec_GFp_mont_felem_mul(group, &out[i].X, &in[i].X, &zinv2); - ec_GFp_mont_felem_mul(group, &out[i].Y, &in[i].Y, &zinv2); - ec_GFp_mont_felem_mul(group, &out[i].Y, &out[i].Y, &zinv); - } - - return 1; -} - -void ec_GFp_mont_add(const EC_GROUP *group, EC_RAW_POINT *out, - const EC_RAW_POINT *a, const EC_RAW_POINT *b) { - if (a == b) { - ec_GFp_mont_dbl(group, out, a); - return; - } - - // The method is taken from: - // http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#addition-add-2007-bl - // - // Coq transcription and correctness proof: - // - // - EC_FELEM x_out, y_out, z_out; - BN_ULONG z1nz = ec_felem_non_zero_mask(group, &a->Z); - BN_ULONG z2nz = ec_felem_non_zero_mask(group, &b->Z); - - // z1z1 = z1z1 = z1**2 - EC_FELEM z1z1; - ec_GFp_mont_felem_sqr(group, &z1z1, &a->Z); - - // z2z2 = z2**2 - EC_FELEM z2z2; - ec_GFp_mont_felem_sqr(group, &z2z2, &b->Z); - - // u1 = x1*z2z2 - EC_FELEM u1; - ec_GFp_mont_felem_mul(group, &u1, &a->X, &z2z2); - - // two_z1z2 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2 - EC_FELEM two_z1z2; - ec_felem_add(group, &two_z1z2, &a->Z, &b->Z); - ec_GFp_mont_felem_sqr(group, &two_z1z2, &two_z1z2); - ec_felem_sub(group, &two_z1z2, &two_z1z2, &z1z1); - ec_felem_sub(group, &two_z1z2, &two_z1z2, &z2z2); - - // s1 = y1 * z2**3 - EC_FELEM s1; - ec_GFp_mont_felem_mul(group, &s1, &b->Z, &z2z2); - ec_GFp_mont_felem_mul(group, &s1, &s1, &a->Y); - - // u2 = x2*z1z1 - EC_FELEM u2; - ec_GFp_mont_felem_mul(group, &u2, &b->X, &z1z1); - - // h = u2 - u1 - EC_FELEM h; - ec_felem_sub(group, &h, &u2, &u1); - - BN_ULONG xneq = ec_felem_non_zero_mask(group, &h); - - // z_out = two_z1z2 * h - ec_GFp_mont_felem_mul(group, &z_out, &h, &two_z1z2); - - // z1z1z1 = z1 * z1z1 - EC_FELEM z1z1z1; - ec_GFp_mont_felem_mul(group, &z1z1z1, &a->Z, &z1z1); - - // s2 = y2 * z1**3 - EC_FELEM s2; - ec_GFp_mont_felem_mul(group, &s2, &b->Y, &z1z1z1); - - // r = (s2 - s1)*2 - EC_FELEM r; - ec_felem_sub(group, &r, &s2, &s1); - 
ec_felem_add(group, &r, &r, &r); - - BN_ULONG yneq = ec_felem_non_zero_mask(group, &r); - - // This case will never occur in the constant-time |ec_GFp_mont_mul|. - BN_ULONG is_nontrivial_double = ~xneq & ~yneq & z1nz & z2nz; - if (is_nontrivial_double) { - ec_GFp_mont_dbl(group, out, a); - return; - } - - // I = (2h)**2 - EC_FELEM i; - ec_felem_add(group, &i, &h, &h); - ec_GFp_mont_felem_sqr(group, &i, &i); - - // J = h * I - EC_FELEM j; - ec_GFp_mont_felem_mul(group, &j, &h, &i); - - // V = U1 * I - EC_FELEM v; - ec_GFp_mont_felem_mul(group, &v, &u1, &i); - - // x_out = r**2 - J - 2V - ec_GFp_mont_felem_sqr(group, &x_out, &r); - ec_felem_sub(group, &x_out, &x_out, &j); - ec_felem_sub(group, &x_out, &x_out, &v); - ec_felem_sub(group, &x_out, &x_out, &v); - - // y_out = r(V-x_out) - 2 * s1 * J - ec_felem_sub(group, &y_out, &v, &x_out); - ec_GFp_mont_felem_mul(group, &y_out, &y_out, &r); - EC_FELEM s1j; - ec_GFp_mont_felem_mul(group, &s1j, &s1, &j); - ec_felem_sub(group, &y_out, &y_out, &s1j); - ec_felem_sub(group, &y_out, &y_out, &s1j); - - ec_felem_select(group, &x_out, z1nz, &x_out, &b->X); - ec_felem_select(group, &out->X, z2nz, &x_out, &a->X); - ec_felem_select(group, &y_out, z1nz, &y_out, &b->Y); - ec_felem_select(group, &out->Y, z2nz, &y_out, &a->Y); - ec_felem_select(group, &z_out, z1nz, &z_out, &b->Z); - ec_felem_select(group, &out->Z, z2nz, &z_out, &a->Z); -} - -void ec_GFp_mont_dbl(const EC_GROUP *group, EC_RAW_POINT *r, - const EC_RAW_POINT *a) { - if (group->a_is_minus3) { - // The method is taken from: - // http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b - // - // Coq transcription and correctness proof: - // - // - EC_FELEM delta, gamma, beta, ftmp, ftmp2, tmptmp, alpha, fourbeta; - // delta = z^2 - ec_GFp_mont_felem_sqr(group, &delta, &a->Z); - // gamma = y^2 - ec_GFp_mont_felem_sqr(group, &gamma, &a->Y); - // beta = x*gamma - ec_GFp_mont_felem_mul(group, &beta, &a->X, &gamma); - - // alpha = 3*(x-delta)*(x+delta) - 
ec_felem_sub(group, &ftmp, &a->X, &delta); - ec_felem_add(group, &ftmp2, &a->X, &delta); - - ec_felem_add(group, &tmptmp, &ftmp2, &ftmp2); - ec_felem_add(group, &ftmp2, &ftmp2, &tmptmp); - ec_GFp_mont_felem_mul(group, &alpha, &ftmp, &ftmp2); - - // x' = alpha^2 - 8*beta - ec_GFp_mont_felem_sqr(group, &r->X, &alpha); - ec_felem_add(group, &fourbeta, &beta, &beta); - ec_felem_add(group, &fourbeta, &fourbeta, &fourbeta); - ec_felem_add(group, &tmptmp, &fourbeta, &fourbeta); - ec_felem_sub(group, &r->X, &r->X, &tmptmp); - - // z' = (y + z)^2 - gamma - delta - ec_felem_add(group, &delta, &gamma, &delta); - ec_felem_add(group, &ftmp, &a->Y, &a->Z); - ec_GFp_mont_felem_sqr(group, &r->Z, &ftmp); - ec_felem_sub(group, &r->Z, &r->Z, &delta); - - // y' = alpha*(4*beta - x') - 8*gamma^2 - ec_felem_sub(group, &r->Y, &fourbeta, &r->X); - ec_felem_add(group, &gamma, &gamma, &gamma); - ec_GFp_mont_felem_sqr(group, &gamma, &gamma); - ec_GFp_mont_felem_mul(group, &r->Y, &alpha, &r->Y); - ec_felem_add(group, &gamma, &gamma, &gamma); - ec_felem_sub(group, &r->Y, &r->Y, &gamma); - } else { - // The method is taken from: - // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl - // - // Coq transcription and correctness proof: - // - // - EC_FELEM xx, yy, yyyy, zz; - ec_GFp_mont_felem_sqr(group, &xx, &a->X); - ec_GFp_mont_felem_sqr(group, &yy, &a->Y); - ec_GFp_mont_felem_sqr(group, &yyyy, &yy); - ec_GFp_mont_felem_sqr(group, &zz, &a->Z); - - // s = 2*((x_in + yy)^2 - xx - yyyy) - EC_FELEM s; - ec_felem_add(group, &s, &a->X, &yy); - ec_GFp_mont_felem_sqr(group, &s, &s); - ec_felem_sub(group, &s, &s, &xx); - ec_felem_sub(group, &s, &s, &yyyy); - ec_felem_add(group, &s, &s, &s); - - // m = 3*xx + a*zz^2 - EC_FELEM m; - ec_GFp_mont_felem_sqr(group, &m, &zz); - ec_GFp_mont_felem_mul(group, &m, &group->a, &m); - ec_felem_add(group, &m, &m, &xx); - ec_felem_add(group, &m, &m, &xx); - ec_felem_add(group, &m, &m, &xx); - - // x_out = m^2 - 2*s - 
ec_GFp_mont_felem_sqr(group, &r->X, &m); - ec_felem_sub(group, &r->X, &r->X, &s); - ec_felem_sub(group, &r->X, &r->X, &s); - - // z_out = (y_in + z_in)^2 - yy - zz - ec_felem_add(group, &r->Z, &a->Y, &a->Z); - ec_GFp_mont_felem_sqr(group, &r->Z, &r->Z); - ec_felem_sub(group, &r->Z, &r->Z, &yy); - ec_felem_sub(group, &r->Z, &r->Z, &zz); - - // y_out = m*(s-x_out) - 8*yyyy - ec_felem_add(group, &yyyy, &yyyy, &yyyy); - ec_felem_add(group, &yyyy, &yyyy, &yyyy); - ec_felem_add(group, &yyyy, &yyyy, &yyyy); - ec_felem_sub(group, &r->Y, &s, &r->X); - ec_GFp_mont_felem_mul(group, &r->Y, &r->Y, &m); - ec_felem_sub(group, &r->Y, &r->Y, &yyyy); - } -} - -static int ec_GFp_mont_cmp_x_coordinate(const EC_GROUP *group, - const EC_RAW_POINT *p, - const EC_SCALAR *r) { - if (!group->field_greater_than_order || - group->field.width != group->order.width) { - // Do not bother optimizing this case. p > order in all commonly-used - // curves. - return ec_GFp_simple_cmp_x_coordinate(group, p, r); - } - - if (ec_GFp_simple_is_at_infinity(group, p)) { - return 0; - } - - // We wish to compare X/Z^2 with r. This is equivalent to comparing X with - // r*Z^2. Note that X and Z are represented in Montgomery form, while r is - // not. - EC_FELEM r_Z2, Z2_mont, X; - ec_GFp_mont_felem_mul(group, &Z2_mont, &p->Z, &p->Z); - // r < order < p, so this is valid. - OPENSSL_memcpy(r_Z2.words, r->words, group->field.width * sizeof(BN_ULONG)); - ec_GFp_mont_felem_mul(group, &r_Z2, &r_Z2, &Z2_mont); - ec_GFp_mont_felem_from_montgomery(group, &X, &p->X); - - if (ec_felem_equal(group, &r_Z2, &X)) { - return 1; - } - - // During signing the x coefficient is reduced modulo the group order. - // Therefore there is a small possibility, less than 1/2^128, that group_order - // < p.x < P. in that case we need not only to compare against |r| but also to - // compare against r+group_order. 
- if (bn_less_than_words(r->words, group->field_minus_order.words, - group->field.width)) { - // We can ignore the carry because: r + group_order < p < 2^256. - bn_add_words(r_Z2.words, r->words, group->order.d, group->field.width); - ec_GFp_mont_felem_mul(group, &r_Z2, &r_Z2, &Z2_mont); - if (ec_felem_equal(group, &r_Z2, &X)) { - return 1; - } - } - - return 0; -} - -DEFINE_METHOD_FUNCTION(EC_METHOD, EC_GFp_mont_method) { - out->group_init = ec_GFp_mont_group_init; - out->group_finish = ec_GFp_mont_group_finish; - out->group_set_curve = ec_GFp_mont_group_set_curve; - out->point_get_affine_coordinates = ec_GFp_mont_point_get_affine_coordinates; - out->jacobian_to_affine_batch = ec_GFp_mont_jacobian_to_affine_batch; - out->add = ec_GFp_mont_add; - out->dbl = ec_GFp_mont_dbl; - out->mul = ec_GFp_mont_mul; - out->mul_base = ec_GFp_mont_mul_base; - out->mul_batch = ec_GFp_mont_mul_batch; - out->mul_public_batch = ec_GFp_mont_mul_public_batch; - out->init_precomp = ec_GFp_mont_init_precomp; - out->mul_precomp = ec_GFp_mont_mul_precomp; - out->felem_mul = ec_GFp_mont_felem_mul; - out->felem_sqr = ec_GFp_mont_felem_sqr; - out->felem_to_bytes = ec_GFp_mont_felem_to_bytes; - out->felem_from_bytes = ec_GFp_mont_felem_from_bytes; - out->felem_reduce = ec_GFp_mont_felem_reduce; - out->felem_exp = ec_GFp_mont_felem_exp; - out->scalar_inv0_montgomery = ec_simple_scalar_inv0_montgomery; - out->scalar_to_montgomery_inv_vartime = - ec_simple_scalar_to_montgomery_inv_vartime; - out->cmp_x_coordinate = ec_GFp_mont_cmp_x_coordinate; -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/ec/ec_montgomery.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/ec/ec_montgomery.cc.inc new file mode 100644 index 00000000..4f1f1d38 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/ec/ec_montgomery.cc.inc @@ -0,0 +1,379 @@ +// Copyright 2001-2016 The OpenSSL Project Authors. All Rights Reserved. +// Copyright (c) 2002, Oracle and/or its affiliates. All rights reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include + +#include "../bn/internal.h" +#include "../delocate.h" +#include "internal.h" + + +using namespace bssl; + +static void ec_GFp_mont_felem_inv0(const EC_GROUP *group, EC_FELEM *out, + const EC_FELEM *a) { + bn_mod_inverse0_prime_mont_small(out->words, a->words, group->field.N.width, + &group->field); +} + +static int ec_GFp_mont_point_get_affine_coordinates(const EC_GROUP *group, + const EC_JACOBIAN *point, + EC_FELEM *x, EC_FELEM *y) { + if (constant_time_declassify_int( + ec_GFp_simple_is_at_infinity(group, point))) { + OPENSSL_PUT_ERROR(EC, EC_R_POINT_AT_INFINITY); + return 0; + } + + // Transform (X, Y, Z) into (x, y) := (X/Z^2, Y/Z^3). Note the check above + // ensures |point->Z| is non-zero, so the inverse always exists. + EC_FELEM z1, z2; + ec_GFp_mont_felem_inv0(group, &z2, &point->Z); + ec_felem_sqr(group, &z1, &z2); + + if (x != nullptr) { + ec_felem_mul(group, x, &point->X, &z1); + } + + if (y != nullptr) { + ec_felem_mul(group, &z1, &z1, &z2); + ec_felem_mul(group, y, &point->Y, &z1); + } + + return 1; +} + +static int ec_GFp_mont_jacobian_to_affine_batch(const EC_GROUP *group, + EC_AFFINE *out, + const EC_JACOBIAN *in, + size_t num) { + if (num == 0) { + return 1; + } + + // Compute prefix products of all Zs. Use |out[i].X| as scratch space + // to store these values. 
+ out[0].X = in[0].Z; + for (size_t i = 1; i < num; i++) { + ec_felem_mul(group, &out[i].X, &out[i - 1].X, &in[i].Z); + } + + // Some input was infinity iff the product of all Zs is zero. + if (ec_felem_non_zero_mask(group, &out[num - 1].X) == 0) { + OPENSSL_PUT_ERROR(EC, EC_R_POINT_AT_INFINITY); + return 0; + } + + // Invert the product of all Zs. + EC_FELEM zinvprod; + ec_GFp_mont_felem_inv0(group, &zinvprod, &out[num - 1].X); + for (size_t i = num - 1; i < num; i--) { + // Our loop invariant is that |zinvprod| is Z0^-1 * Z1^-1 * ... * Zi^-1. + // Recover Zi^-1 by multiplying by the previous product. + EC_FELEM zinv, zinv2; + if (i == 0) { + zinv = zinvprod; + } else { + ec_felem_mul(group, &zinv, &zinvprod, &out[i - 1].X); + // Maintain the loop invariant for the next iteration. + ec_felem_mul(group, &zinvprod, &zinvprod, &in[i].Z); + } + + // Compute affine coordinates: x = X * Z^-2 and y = Y * Z^-3. + ec_felem_sqr(group, &zinv2, &zinv); + ec_felem_mul(group, &out[i].X, &in[i].X, &zinv2); + ec_felem_mul(group, &out[i].Y, &in[i].Y, &zinv2); + ec_felem_mul(group, &out[i].Y, &out[i].Y, &zinv); + } + + return 1; +} + +void bssl::ec_GFp_mont_add(const EC_GROUP *group, EC_JACOBIAN *out, + const EC_JACOBIAN *a, const EC_JACOBIAN *b) { + if (a == b) { + ec_GFp_mont_dbl(group, out, a); + return; + } + + // The method is taken from: + // http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#addition-add-2007-bl + // + // Coq transcription and correctness proof: + // + // + EC_FELEM x_out, y_out, z_out; + BN_ULONG z1nz = ec_felem_non_zero_mask(group, &a->Z); + BN_ULONG z2nz = ec_felem_non_zero_mask(group, &b->Z); + + // z1z1 = z1z1 = z1**2 + EC_FELEM z1z1; + ec_felem_sqr(group, &z1z1, &a->Z); + + // z2z2 = z2**2 + EC_FELEM z2z2; + ec_felem_sqr(group, &z2z2, &b->Z); + + // u1 = x1*z2z2 + EC_FELEM u1; + ec_felem_mul(group, &u1, &a->X, &z2z2); + + // two_z1z2 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2 + EC_FELEM two_z1z2; + ec_felem_add(group, &two_z1z2, &a->Z, &b->Z); + 
ec_felem_sqr(group, &two_z1z2, &two_z1z2); + ec_felem_sub(group, &two_z1z2, &two_z1z2, &z1z1); + ec_felem_sub(group, &two_z1z2, &two_z1z2, &z2z2); + + // s1 = y1 * z2**3 + EC_FELEM s1; + ec_felem_mul(group, &s1, &b->Z, &z2z2); + ec_felem_mul(group, &s1, &s1, &a->Y); + + // u2 = x2*z1z1 + EC_FELEM u2; + ec_felem_mul(group, &u2, &b->X, &z1z1); + + // h = u2 - u1 + EC_FELEM h; + ec_felem_sub(group, &h, &u2, &u1); + + BN_ULONG xneq = ec_felem_non_zero_mask(group, &h); + + // z_out = two_z1z2 * h + ec_felem_mul(group, &z_out, &h, &two_z1z2); + + // z1z1z1 = z1 * z1z1 + EC_FELEM z1z1z1; + ec_felem_mul(group, &z1z1z1, &a->Z, &z1z1); + + // s2 = y2 * z1**3 + EC_FELEM s2; + ec_felem_mul(group, &s2, &b->Y, &z1z1z1); + + // r = (s2 - s1)*2 + EC_FELEM r; + ec_felem_sub(group, &r, &s2, &s1); + ec_felem_add(group, &r, &r, &r); + + BN_ULONG yneq = ec_felem_non_zero_mask(group, &r); + + // This case will never occur in the constant-time |ec_GFp_mont_mul|. + BN_ULONG is_nontrivial_double = ~xneq & ~yneq & z1nz & z2nz; + if (constant_time_declassify_w(is_nontrivial_double)) { + ec_GFp_mont_dbl(group, out, a); + return; + } + + // I = (2h)**2 + EC_FELEM i; + ec_felem_add(group, &i, &h, &h); + ec_felem_sqr(group, &i, &i); + + // J = h * I + EC_FELEM j; + ec_felem_mul(group, &j, &h, &i); + + // V = U1 * I + EC_FELEM v; + ec_felem_mul(group, &v, &u1, &i); + + // x_out = r**2 - J - 2V + ec_felem_sqr(group, &x_out, &r); + ec_felem_sub(group, &x_out, &x_out, &j); + ec_felem_sub(group, &x_out, &x_out, &v); + ec_felem_sub(group, &x_out, &x_out, &v); + + // y_out = r(V-x_out) - 2 * s1 * J + ec_felem_sub(group, &y_out, &v, &x_out); + ec_felem_mul(group, &y_out, &y_out, &r); + EC_FELEM s1j; + ec_felem_mul(group, &s1j, &s1, &j); + ec_felem_sub(group, &y_out, &y_out, &s1j); + ec_felem_sub(group, &y_out, &y_out, &s1j); + + ec_felem_select(group, &x_out, z1nz, &x_out, &b->X); + ec_felem_select(group, &out->X, z2nz, &x_out, &a->X); + ec_felem_select(group, &y_out, z1nz, &y_out, &b->Y); + 
ec_felem_select(group, &out->Y, z2nz, &y_out, &a->Y); + ec_felem_select(group, &z_out, z1nz, &z_out, &b->Z); + ec_felem_select(group, &out->Z, z2nz, &z_out, &a->Z); +} + +void bssl::ec_GFp_mont_dbl(const EC_GROUP *group, EC_JACOBIAN *r, + const EC_JACOBIAN *a) { + if (group->a_is_minus3) { + // The method is taken from: + // http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b + // + // Coq transcription and correctness proof: + // + // + EC_FELEM delta, gamma, beta, ftmp, ftmp2, tmptmp, alpha, fourbeta; + // delta = z^2 + ec_felem_sqr(group, &delta, &a->Z); + // gamma = y^2 + ec_felem_sqr(group, &gamma, &a->Y); + // beta = x*gamma + ec_felem_mul(group, &beta, &a->X, &gamma); + + // alpha = 3*(x-delta)*(x+delta) + ec_felem_sub(group, &ftmp, &a->X, &delta); + ec_felem_add(group, &ftmp2, &a->X, &delta); + + ec_felem_add(group, &tmptmp, &ftmp2, &ftmp2); + ec_felem_add(group, &ftmp2, &ftmp2, &tmptmp); + ec_felem_mul(group, &alpha, &ftmp, &ftmp2); + + // x' = alpha^2 - 8*beta + ec_felem_sqr(group, &r->X, &alpha); + ec_felem_add(group, &fourbeta, &beta, &beta); + ec_felem_add(group, &fourbeta, &fourbeta, &fourbeta); + ec_felem_add(group, &tmptmp, &fourbeta, &fourbeta); + ec_felem_sub(group, &r->X, &r->X, &tmptmp); + + // z' = (y + z)^2 - gamma - delta + ec_felem_add(group, &delta, &gamma, &delta); + ec_felem_add(group, &ftmp, &a->Y, &a->Z); + ec_felem_sqr(group, &r->Z, &ftmp); + ec_felem_sub(group, &r->Z, &r->Z, &delta); + + // y' = alpha*(4*beta - x') - 8*gamma^2 + ec_felem_sub(group, &r->Y, &fourbeta, &r->X); + ec_felem_add(group, &gamma, &gamma, &gamma); + ec_felem_sqr(group, &gamma, &gamma); + ec_felem_mul(group, &r->Y, &alpha, &r->Y); + ec_felem_add(group, &gamma, &gamma, &gamma); + ec_felem_sub(group, &r->Y, &r->Y, &gamma); + } else { + // The method is taken from: + // http://www.hyperelliptic.org/EFD/g1p/auto-shortw-jacobian.html#doubling-dbl-2007-bl + // + // Coq transcription and correctness proof: + // + // + EC_FELEM xx, yy, yyyy, 
zz; + ec_felem_sqr(group, &xx, &a->X); + ec_felem_sqr(group, &yy, &a->Y); + ec_felem_sqr(group, &yyyy, &yy); + ec_felem_sqr(group, &zz, &a->Z); + + // s = 2*((x_in + yy)^2 - xx - yyyy) + EC_FELEM s; + ec_felem_add(group, &s, &a->X, &yy); + ec_felem_sqr(group, &s, &s); + ec_felem_sub(group, &s, &s, &xx); + ec_felem_sub(group, &s, &s, &yyyy); + ec_felem_add(group, &s, &s, &s); + + // m = 3*xx + a*zz^2 + EC_FELEM m; + ec_felem_sqr(group, &m, &zz); + ec_felem_mul(group, &m, &group->a, &m); + ec_felem_add(group, &m, &m, &xx); + ec_felem_add(group, &m, &m, &xx); + ec_felem_add(group, &m, &m, &xx); + + // x_out = m^2 - 2*s + ec_felem_sqr(group, &r->X, &m); + ec_felem_sub(group, &r->X, &r->X, &s); + ec_felem_sub(group, &r->X, &r->X, &s); + + // z_out = (y_in + z_in)^2 - yy - zz + ec_felem_add(group, &r->Z, &a->Y, &a->Z); + ec_felem_sqr(group, &r->Z, &r->Z); + ec_felem_sub(group, &r->Z, &r->Z, &yy); + ec_felem_sub(group, &r->Z, &r->Z, &zz); + + // y_out = m*(s-x_out) - 8*yyyy + ec_felem_add(group, &yyyy, &yyyy, &yyyy); + ec_felem_add(group, &yyyy, &yyyy, &yyyy); + ec_felem_add(group, &yyyy, &yyyy, &yyyy); + ec_felem_sub(group, &r->Y, &s, &r->X); + ec_felem_mul(group, &r->Y, &r->Y, &m); + ec_felem_sub(group, &r->Y, &r->Y, &yyyy); + } +} + +static int ec_GFp_mont_cmp_x_coordinate(const EC_GROUP *group, + const EC_JACOBIAN *p, + const EC_SCALAR *r) { + if (!group->field_greater_than_order || + group->field.N.width != group->order.N.width) { + // Do not bother optimizing this case. p > order in all commonly-used + // curves. + return ec_GFp_simple_cmp_x_coordinate(group, p, r); + } + + if (ec_GFp_simple_is_at_infinity(group, p)) { + return 0; + } + + // We wish to compare X/Z^2 with r. This is equivalent to comparing X with + // r*Z^2. Note that X and Z are represented in Montgomery form, while r is + // not. + EC_FELEM r_Z2, Z2_mont, X; + ec_felem_sqr(group, &Z2_mont, &p->Z); + // r < order < p, so this is valid. 
+ OPENSSL_memcpy(r_Z2.words, r->words, group->field.N.width * sizeof(BN_ULONG)); + ec_felem_mul(group, &r_Z2, &r_Z2, &Z2_mont); + ec_felem_from_montgomery(group, &X, &p->X); + + if (ec_felem_equal(group, &r_Z2, &X)) { + return 1; + } + + // During signing the x coefficient is reduced modulo the group order. + // Therefore there is a small possibility, less than 1/2^128, that group_order + // < p.x < P. in that case we need not only to compare against |r| but also to + // compare against r+group_order. + BN_ULONG carry = bn_add_words(r_Z2.words, r->words, group->order.N.d, + group->field.N.width); + if (carry == 0 && + bn_less_than_words(r_Z2.words, group->field.N.d, group->field.N.width)) { + // r + group_order < p, so compare (r + group_order) * Z^2 against X. + ec_felem_mul(group, &r_Z2, &r_Z2, &Z2_mont); + if (ec_felem_equal(group, &r_Z2, &X)) { + return 1; + } + } + + return 0; +} + +BSSL_NAMESPACE_BEGIN + +DEFINE_METHOD_FUNCTION(EC_METHOD, EC_GFp_mont_method) { + out->point_get_affine_coordinates = ec_GFp_mont_point_get_affine_coordinates; + out->jacobian_to_affine_batch = ec_GFp_mont_jacobian_to_affine_batch; + out->add = ec_GFp_mont_add; + out->dbl = ec_GFp_mont_dbl; + out->mul = ec_GFp_mont_mul; + out->mul_base = ec_GFp_mont_mul_base; + out->mul_batch = ec_GFp_mont_mul_batch; + out->mul_public_batch = ec_GFp_mont_mul_public_batch; + out->init_precomp = ec_GFp_mont_init_precomp; + out->mul_precomp = ec_GFp_mont_mul_precomp; + out->scalar_inv0_montgomery = ec_simple_scalar_inv0_montgomery; + out->scalar_to_montgomery_inv_vartime = + ec_simple_scalar_to_montgomery_inv_vartime; + out->cmp_x_coordinate = ec_GFp_mont_cmp_x_coordinate; +} + +BSSL_NAMESPACE_END diff --git a/third_party/boringssl/src/crypto/fipsmodule/ec/felem.c b/third_party/boringssl/src/crypto/fipsmodule/ec/felem.c deleted file mode 100644 index e462514c..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/ec/felem.c +++ /dev/null @@ -1,100 +0,0 @@ -/* Copyright (c) 2018, Google Inc. 
- * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include -#include -#include - -#include - -#include "internal.h" -#include "../bn/internal.h" -#include "../../internal.h" - - -int ec_bignum_to_felem(const EC_GROUP *group, EC_FELEM *out, const BIGNUM *in) { - uint8_t bytes[EC_MAX_BYTES]; - size_t len = BN_num_bytes(&group->field); - assert(sizeof(bytes) >= len); - if (BN_is_negative(in) || - BN_cmp(in, &group->field) >= 0 || - !BN_bn2bin_padded(bytes, len, in)) { - OPENSSL_PUT_ERROR(EC, EC_R_COORDINATES_OUT_OF_RANGE); - return 0; - } - - return ec_felem_from_bytes(group, out, bytes, len); -} - -int ec_felem_to_bignum(const EC_GROUP *group, BIGNUM *out, const EC_FELEM *in) { - uint8_t bytes[EC_MAX_BYTES]; - size_t len; - ec_felem_to_bytes(group, bytes, &len, in); - return BN_bin2bn(bytes, len, out) != NULL; -} - -void ec_felem_to_bytes(const EC_GROUP *group, uint8_t *out, size_t *out_len, - const EC_FELEM *in) { - group->meth->felem_to_bytes(group, out, out_len, in); -} - -int ec_felem_from_bytes(const EC_GROUP *group, EC_FELEM *out, const uint8_t *in, - size_t len) { - return group->meth->felem_from_bytes(group, out, in, len); -} - -void ec_felem_neg(const EC_GROUP *group, EC_FELEM *out, const EC_FELEM *a) { - // -a is zero if a is zero and p-a otherwise. 
- BN_ULONG mask = ec_felem_non_zero_mask(group, a); - BN_ULONG borrow = - bn_sub_words(out->words, group->field.d, a->words, group->field.width); - assert(borrow == 0); - (void)borrow; - for (int i = 0; i < group->field.width; i++) { - out->words[i] &= mask; - } -} - -void ec_felem_add(const EC_GROUP *group, EC_FELEM *out, const EC_FELEM *a, - const EC_FELEM *b) { - EC_FELEM tmp; - bn_mod_add_words(out->words, a->words, b->words, group->field.d, tmp.words, - group->field.width); -} - -void ec_felem_sub(const EC_GROUP *group, EC_FELEM *out, const EC_FELEM *a, - const EC_FELEM *b) { - EC_FELEM tmp; - bn_mod_sub_words(out->words, a->words, b->words, group->field.d, tmp.words, - group->field.width); -} - -BN_ULONG ec_felem_non_zero_mask(const EC_GROUP *group, const EC_FELEM *a) { - BN_ULONG mask = 0; - for (int i = 0; i < group->field.width; i++) { - mask |= a->words[i]; - } - return ~constant_time_is_zero_w(mask); -} - -void ec_felem_select(const EC_GROUP *group, EC_FELEM *out, BN_ULONG mask, - const EC_FELEM *a, const EC_FELEM *b) { - bn_select_words(out->words, mask, a->words, b->words, group->field.width); -} - -int ec_felem_equal(const EC_GROUP *group, const EC_FELEM *a, - const EC_FELEM *b) { - return CRYPTO_memcmp(a->words, b->words, - group->field.width * sizeof(BN_ULONG)) == 0; -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/ec/felem.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/ec/felem.cc.inc new file mode 100644 index 00000000..c634cc1d --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/ec/felem.cc.inc @@ -0,0 +1,167 @@ +// Copyright 2018 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include + +#include "../../internal.h" +#include "../bn/internal.h" +#include "internal.h" + + +using namespace bssl; + +const EC_FELEM *bssl::ec_felem_one(const EC_GROUP *group) { + // We reuse generator.Z as a cache for 1 in the field. + return &group->generator.raw.Z; +} + +int bssl::ec_bignum_to_felem(const EC_GROUP *group, EC_FELEM *out, + const BIGNUM *in) { + uint8_t bytes[EC_MAX_BYTES]; + size_t len = BN_num_bytes(&group->field.N); + assert(sizeof(bytes) >= len); + if (BN_is_negative(in) || BN_cmp(in, &group->field.N) >= 0 || + !BN_bn2bin_padded(bytes, len, in)) { + OPENSSL_PUT_ERROR(EC, EC_R_COORDINATES_OUT_OF_RANGE); + return 0; + } + + return ec_felem_from_bytes(group, out, bytes, len); +} + +int bssl::ec_felem_to_bignum(const EC_GROUP *group, BIGNUM *out, + const EC_FELEM *in) { + uint8_t bytes[EC_MAX_BYTES]; + size_t len; + ec_felem_to_bytes(group, bytes, &len, in); + return BN_bin2bn(bytes, len, out) != nullptr; +} + +void bssl::ec_felem_to_bytes(const EC_GROUP *group, uint8_t *out, + size_t *out_len, const EC_FELEM *in) { + EC_FELEM tmp; + ec_felem_from_montgomery(group, &tmp, in); + size_t len = BN_num_bytes(&group->field.N); + bn_words_to_big_endian(out, len, tmp.words, group->field.N.width); + *out_len = len; +} + +int bssl::ec_felem_from_bytes(const EC_GROUP *group, EC_FELEM *out, + const uint8_t *in, size_t len) { + if (len != BN_num_bytes(&group->field.N)) { + OPENSSL_PUT_ERROR(EC, EC_R_DECODE_ERROR); + return 0; + } + + bn_big_endian_to_words(out->words, group->field.N.width, in, len); + if 
(!bn_less_than_words(out->words, group->field.N.d, group->field.N.width)) { + OPENSSL_PUT_ERROR(EC, EC_R_DECODE_ERROR); + return 0; + } + + ec_felem_to_montgomery(group, out, out); + return 1; +} + +void bssl::ec_felem_neg(const EC_GROUP *group, EC_FELEM *out, + const EC_FELEM *a) { + // -a is zero if a is zero and p-a otherwise. + BN_ULONG mask = ec_felem_non_zero_mask(group, a); + BN_ULONG borrow = bn_sub_words(out->words, group->field.N.d, a->words, + group->field.N.width); + assert(borrow == 0); + (void)borrow; + for (int i = 0; i < group->field.N.width; i++) { + out->words[i] &= mask; + } +} + +void bssl::ec_felem_add(const EC_GROUP *group, EC_FELEM *out, const EC_FELEM *a, + const EC_FELEM *b) { + EC_FELEM tmp; + bn_mod_add_words(out->words, a->words, b->words, group->field.N.d, tmp.words, + group->field.N.width); +} + +void bssl::ec_felem_sub(const EC_GROUP *group, EC_FELEM *out, const EC_FELEM *a, + const EC_FELEM *b) { + EC_FELEM tmp; + bn_mod_sub_words(out->words, a->words, b->words, group->field.N.d, tmp.words, + group->field.N.width); +} + +BN_ULONG bssl::ec_felem_non_zero_mask(const EC_GROUP *group, + const EC_FELEM *a) { + BN_ULONG mask = 0; + for (int i = 0; i < group->field.N.width; i++) { + mask |= a->words[i]; + } + return ~constant_time_is_zero_w(mask); +} + +void bssl::ec_felem_select(const EC_GROUP *group, EC_FELEM *out, BN_ULONG mask, + const EC_FELEM *a, const EC_FELEM *b) { + bn_select_words(out->words, mask, a->words, b->words, group->field.N.width); +} + +int bssl::ec_felem_equal(const EC_GROUP *group, const EC_FELEM *a, + const EC_FELEM *b) { + return CRYPTO_memcmp(a->words, b->words, + group->field.N.width * sizeof(BN_ULONG)) == 0; +} + +void bssl::ec_felem_mul(const EC_GROUP *group, EC_FELEM *out, const EC_FELEM *a, + const EC_FELEM *b) { + bn_mod_mul_montgomery_small(out->words, a->words, b->words, + group->field.N.width, &group->field); +} + +void bssl::ec_felem_sqr(const EC_GROUP *group, EC_FELEM *out, + const EC_FELEM *a) { + 
bn_mod_mul_montgomery_small(out->words, a->words, a->words, + group->field.N.width, &group->field); +} + +void bssl::ec_felem_to_montgomery(const EC_GROUP *group, EC_FELEM *out, + const EC_FELEM *a) { + bn_to_montgomery_small(out->words, a->words, group->field.N.width, + &group->field); +} + +void bssl::ec_felem_from_montgomery(const EC_GROUP *group, EC_FELEM *out, + const EC_FELEM *a) { + bn_from_montgomery_small(out->words, group->field.N.width, a->words, + group->field.N.width, &group->field); +} + +void bssl::ec_felem_reduce(const EC_GROUP *group, EC_FELEM *out, + const BN_ULONG *words, size_t num) { + // Convert "from" Montgomery form so the value is reduced mod p. + bn_from_montgomery_small(out->words, group->field.N.width, words, num, + &group->field); + // Convert "to" Montgomery form to remove the R^-1 factor added. + ec_felem_to_montgomery(group, out, out); + // Convert to Montgomery form to match this implementation's representation. + ec_felem_to_montgomery(group, out, out); +} + +void bssl::ec_felem_exp(const EC_GROUP *group, EC_FELEM *out, const EC_FELEM *a, + const BN_ULONG *exp, size_t num_exp) { + bn_mod_exp_mont_small(out->words, a->words, group->field.N.width, exp, + num_exp, &group->field); +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/ec/internal.h b/third_party/boringssl/src/crypto/fipsmodule/ec/internal.h index f6c8e8a6..abcb77e7 100644 --- a/third_party/boringssl/src/crypto/fipsmodule/ec/internal.h +++ b/third_party/boringssl/src/crypto/fipsmodule/ec/internal.h @@ -1,72 +1,20 @@ -/* Originally written by Bodo Moeller for the OpenSSL project. - * ==================================================================== - * Copyright (c) 1998-2005 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. 
Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). - * - */ -/* ==================================================================== - * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED. - * - * Portions of the attached software ("Contribution") are developed by - * SUN MICROSYSTEMS, INC., and are contributed to the OpenSSL project. - * - * The Contribution is licensed pursuant to the OpenSSL open source - * license provided above. - * - * The elliptic curve binary polynomial software is originally written by - * Sheueling Chang Shantz and Douglas Stebila of Sun Microsystems - * Laboratories. */ - -#ifndef OPENSSL_HEADER_EC_INTERNAL_H -#define OPENSSL_HEADER_EC_INTERNAL_H +// Copyright 2001-2016 The OpenSSL Project Authors. All Rights Reserved. +// Copyright (c) 2002, Oracle and/or its affiliates. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_CRYPTO_FIPSMODULE_EC_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_FIPSMODULE_EC_INTERNAL_H #include @@ -76,12 +24,13 @@ #include #include +#include "../../mem_internal.h" #include "../bn/internal.h" -#if defined(__cplusplus) -extern "C" { -#endif +DECLARE_OPAQUE_STRUCT(ec_key_st, ECKey) + +BSSL_NAMESPACE_BEGIN // EC internals. @@ -91,6 +40,8 @@ extern "C" { // be the largest fields anyone plausibly uses. #define EC_MAX_BYTES 66 #define EC_MAX_WORDS ((EC_MAX_BYTES + BN_BYTES - 1) / BN_BYTES) +#define EC_MAX_COMPRESSED (EC_MAX_BYTES + 1) +#define EC_MAX_UNCOMPRESSED (2 * EC_MAX_BYTES + 1) static_assert(EC_MAX_WORDS <= BN_SMALL_MAX_WORDS, "bn_*_small functions not usable"); @@ -107,14 +58,14 @@ typedef struct { // ec_bignum_to_scalar converts |in| to an |EC_SCALAR| and writes it to // |*out|. It returns one on success and zero if |in| is out of range. -OPENSSL_EXPORT int ec_bignum_to_scalar(const EC_GROUP *group, EC_SCALAR *out, - const BIGNUM *in); +int ec_bignum_to_scalar(const EC_GROUP *group, EC_SCALAR *out, + const BIGNUM *in); // ec_scalar_to_bytes serializes |in| as a big-endian bytestring to |out| and // sets |*out_len| to the number of bytes written. The number of bytes written // is |BN_num_bytes(&group->order)|, which is at most |EC_MAX_BYTES|. 
-OPENSSL_EXPORT void ec_scalar_to_bytes(const EC_GROUP *group, uint8_t *out, - size_t *out_len, const EC_SCALAR *in); +void ec_scalar_to_bytes(const EC_GROUP *group, uint8_t *out, size_t *out_len, + const EC_SCALAR *in); // ec_scalar_from_bytes deserializes |in| and stores the resulting scalar over // group |group| to |out|. It returns one on success and zero if |in| is @@ -128,6 +79,11 @@ int ec_scalar_from_bytes(const EC_GROUP *group, EC_SCALAR *out, void ec_scalar_reduce(const EC_GROUP *group, EC_SCALAR *out, const BN_ULONG *words, size_t num); +// ec_random_nonzero_scalar sets |out| to a uniformly selected random value from +// zero to |group->order| - 1. It returns one on success and zero on error. +int ec_random_scalar(const EC_GROUP *group, EC_SCALAR *out, + const uint8_t additional_data[32]); + // ec_random_nonzero_scalar sets |out| to a uniformly selected random value from // 1 to |group->order| - 1. It returns one on success and zero on error. int ec_random_nonzero_scalar(const EC_GROUP *group, EC_SCALAR *out, @@ -189,12 +145,15 @@ void ec_scalar_select(const EC_GROUP *group, EC_SCALAR *out, BN_ULONG mask, // An EC_FELEM represents a field element. Only the first |field->width| words // are used. An |EC_FELEM| is specific to an |EC_GROUP| and must not be mixed -// between groups. Additionally, the representation (whether or not elements are -// represented in Montgomery-form) may vary between |EC_METHOD|s. +// between groups. Unless otherwise stated, all inputs and outputs are in +// Montgomery form. typedef struct { BN_ULONG words[EC_MAX_WORDS]; } EC_FELEM; +// ec_felem_one returns one in |group|'s field. +const EC_FELEM *ec_felem_one(const EC_GROUP *group); + // ec_bignum_to_felem converts |in| to an |EC_FELEM|. It returns one on success // and zero if |in| is out of range. 
int ec_bignum_to_felem(const EC_GROUP *group, EC_FELEM *out, const BIGNUM *in); @@ -237,20 +196,50 @@ void ec_felem_select(const EC_GROUP *group, EC_FELEM *out, BN_ULONG mask, // ec_felem_equal returns one if |a| and |b| are equal and zero otherwise. int ec_felem_equal(const EC_GROUP *group, const EC_FELEM *a, const EC_FELEM *b); +// ec_felem_mul sets |out| to |a| * |b|. +void ec_felem_mul(const EC_GROUP *group, EC_FELEM *out, const EC_FELEM *a, + const EC_FELEM *b); + +// ec_felem_sqr sets |out| to |a|^2. +void ec_felem_sqr(const EC_GROUP *group, EC_FELEM *out, const EC_FELEM *a); + +// ec_felem_to_montgomery sets |out| to |a| converted to Montgomery form. +void ec_felem_to_montgomery(const EC_GROUP *group, EC_FELEM *out, + const EC_FELEM *a); + +// ec_felem_from_montgomery sets |out| to |a| converted from Montgomery form. +void ec_felem_from_montgomery(const EC_GROUP *group, EC_FELEM *out, + const EC_FELEM *a); + +// ec_felem_reduce sets |out| to |words|, reduced modulo the field size, p. +// |words| must be less than p^2. |num| must be at most twice the width of p. +// This function treats |words| as secret. +void ec_felem_reduce(const EC_GROUP *group, EC_FELEM *out, + const BN_ULONG *words, size_t num); + +// ec_felem_exp sets |out| to |a|^|exp|. It treats |a| is secret but |exp| as +// public. +// +// TODO(crbug.com/42290435): hash-to-curve uses this as part of computing a +// square root, which is what compressed coordinates ultimately needs to avoid +// |BIGNUM|. Can we unify this a bit? By generalizing to arbitrary +// exponentiation, we also miss an opportunity to use a specialized addition +// chain. We also miss our specialized field arithmetic for P-256. +void ec_felem_exp(const EC_GROUP *group, EC_FELEM *out, const EC_FELEM *a, + const BN_ULONG *exp, size_t num_exp); + // Points. // // Points may represented in affine coordinates as |EC_AFFINE| or Jacobian -// coordinates as |EC_RAW_POINT|. 
Affine coordinates directly represent a +// coordinates as |EC_JACOBIAN|. Affine coordinates directly represent a // point on the curve, but point addition over affine coordinates requires // costly field inversions, so arithmetic is done in Jacobian coordinates. // Converting from affine to Jacobian is cheap, while converting from Jacobian // to affine costs a field inversion. (Jacobian coordinates amortize the field // inversions needed in a sequence of point operations.) -// -// TODO(davidben): Rename |EC_RAW_POINT| to |EC_JACOBIAN|. -// An EC_RAW_POINT represents an elliptic curve point in Jacobian coordinates. +// An EC_JACOBIAN represents an elliptic curve point in Jacobian coordinates. // Unlike |EC_POINT|, it is a plain struct which can be stack-allocated and // needs no cleanup. It is specific to an |EC_GROUP| and must not be mixed // between groups. @@ -258,7 +247,7 @@ typedef struct { // X, Y, and Z are Jacobian projective coordinates. They represent // (X/Z^2, Y/Z^3) if Z != 0 and the point at infinity otherwise. EC_FELEM X, Y, Z; -} EC_RAW_POINT; +} EC_JACOBIAN; // An EC_AFFINE represents an elliptic curve point in affine coordinates. // coordinates. Note the point at infinity cannot be represented in affine @@ -269,7 +258,7 @@ typedef struct { // ec_affine_to_jacobian converts |p| to Jacobian form and writes the result to // |*out|. This operation is very cheap and only costs a few copies. -void ec_affine_to_jacobian(const EC_GROUP *group, EC_RAW_POINT *out, +void ec_affine_to_jacobian(const EC_GROUP *group, EC_JACOBIAN *out, const EC_AFFINE *p); // ec_jacobian_to_affine converts |p| to affine form and writes the result to @@ -280,7 +269,7 @@ void ec_affine_to_jacobian(const EC_GROUP *group, EC_RAW_POINT *out, // If only extracting the x-coordinate, use |ec_get_x_coordinate_*| which is // slightly faster. 
int ec_jacobian_to_affine(const EC_GROUP *group, EC_AFFINE *out, - const EC_RAW_POINT *p); + const EC_JACOBIAN *p); // ec_jacobian_to_affine_batch converts |num| points in |in| from Jacobian // coordinates to affine coordinates and writes the results to |out|. It returns @@ -289,7 +278,7 @@ int ec_jacobian_to_affine(const EC_GROUP *group, EC_AFFINE *out, // This function is not implemented for all curves. Add implementations as // needed. int ec_jacobian_to_affine_batch(const EC_GROUP *group, EC_AFFINE *out, - const EC_RAW_POINT *in, size_t num); + const EC_JACOBIAN *in, size_t num); // ec_point_set_affine_coordinates sets |out|'s to a point with affine // coordinates |x| and |y|. It returns one if the point is on the curve and @@ -307,12 +296,12 @@ int ec_point_mul_no_self_test(const EC_GROUP *group, EC_POINT *r, // ec_point_mul_scalar sets |r| to |p| * |scalar|. Both inputs are considered // secret. -int ec_point_mul_scalar(const EC_GROUP *group, EC_RAW_POINT *r, - const EC_RAW_POINT *p, const EC_SCALAR *scalar); +int ec_point_mul_scalar(const EC_GROUP *group, EC_JACOBIAN *r, + const EC_JACOBIAN *p, const EC_SCALAR *scalar); // ec_point_mul_scalar_base sets |r| to generator * |scalar|. |scalar| is // treated as secret. -int ec_point_mul_scalar_base(const EC_GROUP *group, EC_RAW_POINT *r, +int ec_point_mul_scalar_base(const EC_GROUP *group, EC_JACOBIAN *r, const EC_SCALAR *scalar); // ec_point_mul_scalar_batch sets |r| to |p0| * |scalar0| + |p1| * |scalar1| + @@ -333,10 +322,10 @@ int ec_point_mul_scalar_base(const EC_GROUP *group, EC_RAW_POINT *r, // none. If generalizing to tuned curves, this may be useful. However, we still // must double up to the least efficient input, so precomputed tables can only // save table setup and allow a wider window size. 
-int ec_point_mul_scalar_batch(const EC_GROUP *group, EC_RAW_POINT *r, - const EC_RAW_POINT *p0, const EC_SCALAR *scalar0, - const EC_RAW_POINT *p1, const EC_SCALAR *scalar1, - const EC_RAW_POINT *p2, const EC_SCALAR *scalar2); +int ec_point_mul_scalar_batch(const EC_GROUP *group, EC_JACOBIAN *r, + const EC_JACOBIAN *p0, const EC_SCALAR *scalar0, + const EC_JACOBIAN *p1, const EC_SCALAR *scalar1, + const EC_JACOBIAN *p2, const EC_SCALAR *scalar2); #define EC_MONT_PRECOMP_COMB_SIZE 5 @@ -355,7 +344,7 @@ typedef union { // This function is not implemented for all curves. Add implementations as // needed. int ec_init_precomp(const EC_GROUP *group, EC_PRECOMP *out, - const EC_RAW_POINT *p); + const EC_JACOBIAN *p); // ec_point_mul_scalar_precomp sets |r| to |p0| * |scalar0| + |p1| * |scalar1| + // |p2| * |scalar2|. |p1| or |p2| may be NULL to skip the corresponding term. @@ -379,7 +368,7 @@ int ec_init_precomp(const EC_GROUP *group, EC_PRECOMP *out, // none. If generalizing to tuned curves, we should add a parameter for the base // point and arrange for the generic implementation to have base point tables // available. -int ec_point_mul_scalar_precomp(const EC_GROUP *group, EC_RAW_POINT *r, +int ec_point_mul_scalar_precomp(const EC_GROUP *group, EC_JACOBIAN *r, const EC_PRECOMP *p0, const EC_SCALAR *scalar0, const EC_PRECOMP *p1, const EC_SCALAR *scalar1, const EC_PRECOMP *p2, const EC_SCALAR *scalar2); @@ -387,11 +376,9 @@ int ec_point_mul_scalar_precomp(const EC_GROUP *group, EC_RAW_POINT *r, // ec_point_mul_scalar_public sets |r| to // generator * |g_scalar| + |p| * |p_scalar|. It assumes that the inputs are // public so there is no concern about leaking their values through timing. 
-OPENSSL_EXPORT int ec_point_mul_scalar_public(const EC_GROUP *group, - EC_RAW_POINT *r, - const EC_SCALAR *g_scalar, - const EC_RAW_POINT *p, - const EC_SCALAR *p_scalar); +int ec_point_mul_scalar_public(const EC_GROUP *group, EC_JACOBIAN *r, + const EC_SCALAR *g_scalar, const EC_JACOBIAN *p, + const EC_SCALAR *p_scalar); // ec_point_mul_scalar_public_batch sets |r| to the sum of generator * // |g_scalar| and |points[i]| * |scalars[i]| where |points| and |scalars| have @@ -401,15 +388,15 @@ OPENSSL_EXPORT int ec_point_mul_scalar_public(const EC_GROUP *group, // // This function is not implemented for all curves. Add implementations as // needed. -int ec_point_mul_scalar_public_batch(const EC_GROUP *group, EC_RAW_POINT *r, +int ec_point_mul_scalar_public_batch(const EC_GROUP *group, EC_JACOBIAN *r, const EC_SCALAR *g_scalar, - const EC_RAW_POINT *points, + const EC_JACOBIAN *points, const EC_SCALAR *scalars, size_t num); // ec_point_select, in constant time, sets |out| to |a| if |mask| is all ones // and |b| if |mask| is all zeros. -void ec_point_select(const EC_GROUP *group, EC_RAW_POINT *out, BN_ULONG mask, - const EC_RAW_POINT *a, const EC_RAW_POINT *b); +void ec_point_select(const EC_GROUP *group, EC_JACOBIAN *out, BN_ULONG mask, + const EC_JACOBIAN *a, const EC_JACOBIAN *b); // ec_affine_select behaves like |ec_point_select| but acts on affine points. void ec_affine_select(const EC_GROUP *group, EC_AFFINE *out, BN_ULONG mask, @@ -421,15 +408,15 @@ void ec_precomp_select(const EC_GROUP *group, EC_PRECOMP *out, BN_ULONG mask, // ec_cmp_x_coordinate compares the x (affine) coordinate of |p|, mod the group // order, with |r|. It returns one if the values match and zero if |p| is the -// point at infinity of the values do not match. -int ec_cmp_x_coordinate(const EC_GROUP *group, const EC_RAW_POINT *p, +// point at infinity of the values do not match. |p| is treated as public. 
+int ec_cmp_x_coordinate(const EC_GROUP *group, const EC_JACOBIAN *p, const EC_SCALAR *r); // ec_get_x_coordinate_as_scalar sets |*out| to |p|'s x-coordinate, modulo // |group->order|. It returns one on success and zero if |p| is the point at // infinity. int ec_get_x_coordinate_as_scalar(const EC_GROUP *group, EC_SCALAR *out, - const EC_RAW_POINT *p); + const EC_JACOBIAN *p); // ec_get_x_coordinate_as_bytes writes |p|'s affine x-coordinate to |out|, which // must have at must |max_out| bytes. It sets |*out_len| to the number of bytes @@ -437,13 +424,20 @@ int ec_get_x_coordinate_as_scalar(const EC_GROUP *group, EC_SCALAR *out, // field. This function returns one on success and zero on failure. int ec_get_x_coordinate_as_bytes(const EC_GROUP *group, uint8_t *out, size_t *out_len, size_t max_out, - const EC_RAW_POINT *p); + const EC_JACOBIAN *p); + +// ec_point_byte_len returns the number of bytes in the byte representation of +// a non-infinity point in |group|, encoded according to |form|, or zero if +// |form| is invalid. +size_t ec_point_byte_len(const EC_GROUP *group, point_conversion_form_t form); -// ec_point_to_bytes behaves like |EC_POINT_point2oct| but takes an -// |EC_AFFINE|. +// ec_point_to_bytes encodes |point| according to |form| and writes the result +// |buf|. It returns the size of the output on success or zero on error. At most +// |max_out| bytes will be written. The buffer should be at least +// |ec_point_byte_len| long to guarantee success. size_t ec_point_to_bytes(const EC_GROUP *group, const EC_AFFINE *point, point_conversion_form_t form, uint8_t *buf, - size_t len); + size_t max_out); // ec_point_from_uncompressed parses |in| as a point in uncompressed form and // sets the result to |out|. 
It returns one on success and zero if the input was @@ -454,192 +448,173 @@ int ec_point_from_uncompressed(const EC_GROUP *group, EC_AFFINE *out, // ec_set_to_safe_point sets |out| to an arbitrary point on |group|, either the // generator or the point at infinity. This is used to guard against callers of // external APIs not checking the return value. -void ec_set_to_safe_point(const EC_GROUP *group, EC_RAW_POINT *out); +void ec_set_to_safe_point(const EC_GROUP *group, EC_JACOBIAN *out); // ec_affine_jacobian_equal returns one if |a| and |b| represent the same point // and zero otherwise. It treats both inputs as secret. int ec_affine_jacobian_equal(const EC_GROUP *group, const EC_AFFINE *a, - const EC_RAW_POINT *b); + const EC_JACOBIAN *b); +BSSL_NAMESPACE_END // Implementation details. struct ec_method_st { - int (*group_init)(EC_GROUP *); - void (*group_finish)(EC_GROUP *); - int (*group_set_curve)(EC_GROUP *, const BIGNUM *p, const BIGNUM *a, - const BIGNUM *b, BN_CTX *); - // point_get_affine_coordinates sets |*x| and |*y| to the affine coordinates // of |p|. Either |x| or |y| may be NULL to omit it. It returns one on success - // and zero if |p| is the point at infinity. - int (*point_get_affine_coordinates)(const EC_GROUP *, const EC_RAW_POINT *p, - EC_FELEM *x, EC_FELEM *y); + // and zero if |p| is the point at infinity. It leaks whether |p| was the + // point at infinity, but otherwise treats |p| as secret. + int (*point_get_affine_coordinates)(const EC_GROUP *, + const bssl::EC_JACOBIAN *p, + bssl::EC_FELEM *x, bssl::EC_FELEM *y); // jacobian_to_affine_batch implements |ec_jacobian_to_affine_batch|. - int (*jacobian_to_affine_batch)(const EC_GROUP *group, EC_AFFINE *out, - const EC_RAW_POINT *in, size_t num); + int (*jacobian_to_affine_batch)(const EC_GROUP *group, bssl::EC_AFFINE *out, + const bssl::EC_JACOBIAN *in, size_t num); // add sets |r| to |a| + |b|. 
- void (*add)(const EC_GROUP *group, EC_RAW_POINT *r, const EC_RAW_POINT *a, - const EC_RAW_POINT *b); + void (*add)(const EC_GROUP *group, bssl::EC_JACOBIAN *r, + const bssl::EC_JACOBIAN *a, const bssl::EC_JACOBIAN *b); // dbl sets |r| to |a| + |a|. - void (*dbl)(const EC_GROUP *group, EC_RAW_POINT *r, const EC_RAW_POINT *a); + void (*dbl)(const EC_GROUP *group, bssl::EC_JACOBIAN *r, + const bssl::EC_JACOBIAN *a); // mul sets |r| to |scalar|*|p|. - void (*mul)(const EC_GROUP *group, EC_RAW_POINT *r, const EC_RAW_POINT *p, - const EC_SCALAR *scalar); + void (*mul)(const EC_GROUP *group, bssl::EC_JACOBIAN *r, + const bssl::EC_JACOBIAN *p, const bssl::EC_SCALAR *scalar); // mul_base sets |r| to |scalar|*generator. - void (*mul_base)(const EC_GROUP *group, EC_RAW_POINT *r, - const EC_SCALAR *scalar); + void (*mul_base)(const EC_GROUP *group, bssl::EC_JACOBIAN *r, + const bssl::EC_SCALAR *scalar); // mul_batch implements |ec_mul_scalar_batch|. - void (*mul_batch)(const EC_GROUP *group, EC_RAW_POINT *r, - const EC_RAW_POINT *p0, const EC_SCALAR *scalar0, - const EC_RAW_POINT *p1, const EC_SCALAR *scalar1, - const EC_RAW_POINT *p2, const EC_SCALAR *scalar2); + void (*mul_batch)(const EC_GROUP *group, bssl::EC_JACOBIAN *r, + const bssl::EC_JACOBIAN *p0, const bssl::EC_SCALAR *scalar0, + const bssl::EC_JACOBIAN *p1, const bssl::EC_SCALAR *scalar1, + const bssl::EC_JACOBIAN *p2, + const bssl::EC_SCALAR *scalar2); // mul_public sets |r| to |g_scalar|*generator + |p_scalar|*|p|. It assumes // that the inputs are public so there is no concern about leaking their // values through timing. // // This function may be omitted if |mul_public_batch| is provided. 
- void (*mul_public)(const EC_GROUP *group, EC_RAW_POINT *r, - const EC_SCALAR *g_scalar, const EC_RAW_POINT *p, - const EC_SCALAR *p_scalar); + void (*mul_public)(const EC_GROUP *group, bssl::EC_JACOBIAN *r, + const bssl::EC_SCALAR *g_scalar, + const bssl::EC_JACOBIAN *p, + const bssl::EC_SCALAR *p_scalar); // mul_public_batch implements |ec_point_mul_scalar_public_batch|. - int (*mul_public_batch)(const EC_GROUP *group, EC_RAW_POINT *r, - const EC_SCALAR *g_scalar, const EC_RAW_POINT *points, - const EC_SCALAR *scalars, size_t num); + int (*mul_public_batch)(const EC_GROUP *group, bssl::EC_JACOBIAN *r, + const bssl::EC_SCALAR *g_scalar, + const bssl::EC_JACOBIAN *points, + const bssl::EC_SCALAR *scalars, size_t num); // init_precomp implements |ec_init_precomp|. - int (*init_precomp)(const EC_GROUP *group, EC_PRECOMP *out, - const EC_RAW_POINT *p); + int (*init_precomp)(const EC_GROUP *group, bssl::EC_PRECOMP *out, + const bssl::EC_JACOBIAN *p); // mul_precomp implements |ec_point_mul_scalar_precomp|. - void (*mul_precomp)(const EC_GROUP *group, EC_RAW_POINT *r, - const EC_PRECOMP *p0, const EC_SCALAR *scalar0, - const EC_PRECOMP *p1, const EC_SCALAR *scalar1, - const EC_PRECOMP *p2, const EC_SCALAR *scalar2); - - // felem_mul and felem_sqr implement multiplication and squaring, - // respectively, so that the generic |EC_POINT_add| and |EC_POINT_dbl| - // implementations can work both with |EC_GFp_mont_method| and the tuned - // operations. - // - // TODO(davidben): This constrains |EC_FELEM|'s internal representation, adds - // many indirect calls in the middle of the generic code, and a bunch of - // conversions. If p224-64.c were easily convertable to Montgomery form, we - // could say |EC_FELEM| is always in Montgomery form. If we routed the rest of - // simple.c to |EC_METHOD|, we could give |EC_POINT| an |EC_METHOD|-specific - // representation and say |EC_FELEM| is purely a |EC_GFp_mont_method| type. 
- void (*felem_mul)(const EC_GROUP *, EC_FELEM *r, const EC_FELEM *a, - const EC_FELEM *b); - void (*felem_sqr)(const EC_GROUP *, EC_FELEM *r, const EC_FELEM *a); - - void (*felem_to_bytes)(const EC_GROUP *group, uint8_t *out, size_t *out_len, - const EC_FELEM *in); - int (*felem_from_bytes)(const EC_GROUP *group, EC_FELEM *out, - const uint8_t *in, size_t len); - - // felem_reduce sets |out| to |words|, reduced modulo the field size, p. - // |words| must be less than p^2. |num| must be at most twice the width of p. - // This function treats |words| as secret. - // - // This function is only used in hash-to-curve and may be omitted in curves - // that do not support it. - void (*felem_reduce)(const EC_GROUP *group, EC_FELEM *out, - const BN_ULONG *words, size_t num); - - // felem_exp sets |out| to |a|^|exp|. It treats |a| is secret but |exp| as - // public. - // - // This function is used in hash-to-curve and may be NULL in curves not used - // with hash-to-curve. - void (*felem_exp)(const EC_GROUP *group, EC_FELEM *out, const EC_FELEM *a, - const BN_ULONG *exp, size_t num_exp); + void (*mul_precomp)(const EC_GROUP *group, bssl::EC_JACOBIAN *r, + const bssl::EC_PRECOMP *p0, + const bssl::EC_SCALAR *scalar0, + const bssl::EC_PRECOMP *p1, + const bssl::EC_SCALAR *scalar1, + const bssl::EC_PRECOMP *p2, + const bssl::EC_SCALAR *scalar2); // scalar_inv0_montgomery implements |ec_scalar_inv0_montgomery|. - void (*scalar_inv0_montgomery)(const EC_GROUP *group, EC_SCALAR *out, - const EC_SCALAR *in); + void (*scalar_inv0_montgomery)(const EC_GROUP *group, bssl::EC_SCALAR *out, + const bssl::EC_SCALAR *in); // scalar_to_montgomery_inv_vartime implements // |ec_scalar_to_montgomery_inv_vartime|. 
- int (*scalar_to_montgomery_inv_vartime)(const EC_GROUP *group, EC_SCALAR *out, - const EC_SCALAR *in); + int (*scalar_to_montgomery_inv_vartime)(const EC_GROUP *group, + bssl::EC_SCALAR *out, + const bssl::EC_SCALAR *in); // cmp_x_coordinate compares the x (affine) coordinate of |p|, mod the group // order, with |r|. It returns one if the values match and zero if |p| is the // point at infinity of the values do not match. - int (*cmp_x_coordinate)(const EC_GROUP *group, const EC_RAW_POINT *p, - const EC_SCALAR *r); + int (*cmp_x_coordinate)(const EC_GROUP *group, const bssl::EC_JACOBIAN *p, + const bssl::EC_SCALAR *r); } /* EC_METHOD */; -const EC_METHOD *EC_GFp_mont_method(void); +BSSL_NAMESPACE_BEGIN + +const EC_METHOD *EC_GFp_mont_method(); + +BSSL_NAMESPACE_END + +struct ec_point_st { + // group is an owning reference to |group|, unless this is + // |group->generator|. + EC_GROUP *group; + // raw is the group-specific point data. Functions that take |EC_POINT| + // typically check consistency with |EC_GROUP| while functions that take + // |EC_JACOBIAN| do not. Thus accesses to this field should be externally + // checked for consistency. + bssl::EC_JACOBIAN raw; +} /* EC_POINT */; struct ec_group_st { const EC_METHOD *meth; // Unlike all other |EC_POINT|s, |generator| does not own |generator->group| // to avoid a reference cycle. Additionally, Z is guaranteed to be one, so X - // and Y are suitable for use as an |EC_AFFINE|. - EC_POINT *generator; - BIGNUM order; + // and Y are suitable for use as an |EC_AFFINE|. Before |has_order| is set, Z + // is one, but X and Y are uninitialized. + EC_POINT generator; - int curve_name; // optional NID for named curve + BN_MONT_CTX order; + BN_MONT_CTX field; - BN_MONT_CTX *order_mont; // data for ECDSA inverse + bssl::EC_FELEM a, b; // Curve coefficients. - // The following members are handled by the method functions, - // even if they appear generic + // comment is a human-readable string describing the curve. 
+ const char *comment; - BIGNUM field; // For curves over GF(p), this is the modulus. + // curve_name is the optional NID for named curve. + // + // If curve_name is NID_undef, the actual type is ECCustomGroup and the + // refcount must be respected when allocating/freeing. + int curve_name; - EC_FELEM a, b; // Curve coefficients. + uint8_t oid[9]; + uint8_t oid_len; // a_is_minus3 is one if |a| is -3 mod |field| and zero otherwise. Point // arithmetic is optimized for -3. int a_is_minus3; - // field_greater_than_order is one if |field| is greate than |order| and zero + // has_order is one if |generator| and |order| have been initialized. + int has_order; + + // field_greater_than_order is one if |field| is greater than |order| and zero // otherwise. int field_greater_than_order; +} /* EC_GROUP */; - // field_minus_order, if |field_greater_than_order| is true, is |field| minus - // |order| represented as an |EC_FELEM|. Otherwise, it is zero. - // - // Note: unlike |EC_FELEM|s used as intermediate values internal to the - // |EC_METHOD|, this value is not encoded in Montgomery form. - EC_FELEM field_minus_order; - - CRYPTO_refcount_t references; - - BN_MONT_CTX *mont; // Montgomery structure. +BSSL_NAMESPACE_BEGIN - EC_FELEM one; // The value one. -} /* EC_GROUP */; +class ECCustomGroup : public ec_group_st, public RefCounted { + public: + explicit ECCustomGroup(const EC_METHOD *meth); -struct ec_point_st { - // group is an owning reference to |group|, unless this is - // |group->generator|. - EC_GROUP *group; - // raw is the group-specific point data. Functions that take |EC_POINT| - // typically check consistency with |EC_GROUP| while functions that take - // |EC_RAW_POINT| do not. Thus accesses to this field should be externally - // checked for consistency. 
- EC_RAW_POINT raw; -} /* EC_POINT */; + private: + ~ECCustomGroup(); + friend RefCounted; +}; -EC_GROUP *ec_group_new(const EC_METHOD *meth); +EC_GROUP *ec_group_new(const EC_METHOD *meth, const BIGNUM *p, const BIGNUM *a, + const BIGNUM *b, BN_CTX *ctx); -void ec_GFp_mont_mul(const EC_GROUP *group, EC_RAW_POINT *r, - const EC_RAW_POINT *p, const EC_SCALAR *scalar); -void ec_GFp_mont_mul_base(const EC_GROUP *group, EC_RAW_POINT *r, +void ec_GFp_mont_mul(const EC_GROUP *group, EC_JACOBIAN *r, + const EC_JACOBIAN *p, const EC_SCALAR *scalar); +void ec_GFp_mont_mul_base(const EC_GROUP *group, EC_JACOBIAN *r, const EC_SCALAR *scalar); -void ec_GFp_mont_mul_batch(const EC_GROUP *group, EC_RAW_POINT *r, - const EC_RAW_POINT *p0, const EC_SCALAR *scalar0, - const EC_RAW_POINT *p1, const EC_SCALAR *scalar1, - const EC_RAW_POINT *p2, const EC_SCALAR *scalar2); +void ec_GFp_mont_mul_batch(const EC_GROUP *group, EC_JACOBIAN *r, + const EC_JACOBIAN *p0, const EC_SCALAR *scalar0, + const EC_JACOBIAN *p1, const EC_SCALAR *scalar1, + const EC_JACOBIAN *p2, const EC_SCALAR *scalar2); int ec_GFp_mont_init_precomp(const EC_GROUP *group, EC_PRECOMP *out, - const EC_RAW_POINT *p); -void ec_GFp_mont_mul_precomp(const EC_GROUP *group, EC_RAW_POINT *r, + const EC_JACOBIAN *p); +void ec_GFp_mont_mul_precomp(const EC_GROUP *group, EC_JACOBIAN *r, const EC_PRECOMP *p0, const EC_SCALAR *scalar0, const EC_PRECOMP *p1, const EC_SCALAR *scalar1, const EC_PRECOMP *p2, const EC_SCALAR *scalar2); @@ -655,29 +630,27 @@ void ec_GFp_mont_mul_precomp(const EC_GROUP *group, EC_RAW_POINT *r, void ec_compute_wNAF(const EC_GROUP *group, int8_t *out, const EC_SCALAR *scalar, size_t bits, int w); -int ec_GFp_mont_mul_public_batch(const EC_GROUP *group, EC_RAW_POINT *r, +int ec_GFp_mont_mul_public_batch(const EC_GROUP *group, EC_JACOBIAN *r, const EC_SCALAR *g_scalar, - const EC_RAW_POINT *points, + const EC_JACOBIAN *points, const EC_SCALAR *scalars, size_t num); // method functions in simple.c -int 
ec_GFp_simple_group_init(EC_GROUP *); -void ec_GFp_simple_group_finish(EC_GROUP *); int ec_GFp_simple_group_set_curve(EC_GROUP *, const BIGNUM *p, const BIGNUM *a, const BIGNUM *b, BN_CTX *); int ec_GFp_simple_group_get_curve(const EC_GROUP *, BIGNUM *p, BIGNUM *a, BIGNUM *b); -void ec_GFp_simple_point_init(EC_RAW_POINT *); -void ec_GFp_simple_point_copy(EC_RAW_POINT *, const EC_RAW_POINT *); -void ec_GFp_simple_point_set_to_infinity(const EC_GROUP *, EC_RAW_POINT *); -void ec_GFp_mont_add(const EC_GROUP *, EC_RAW_POINT *r, const EC_RAW_POINT *a, - const EC_RAW_POINT *b); -void ec_GFp_mont_dbl(const EC_GROUP *, EC_RAW_POINT *r, const EC_RAW_POINT *a); -void ec_GFp_simple_invert(const EC_GROUP *, EC_RAW_POINT *); -int ec_GFp_simple_is_at_infinity(const EC_GROUP *, const EC_RAW_POINT *); -int ec_GFp_simple_is_on_curve(const EC_GROUP *, const EC_RAW_POINT *); -int ec_GFp_simple_points_equal(const EC_GROUP *, const EC_RAW_POINT *a, - const EC_RAW_POINT *b); +void ec_GFp_simple_point_init(EC_JACOBIAN *); +void ec_GFp_simple_point_copy(EC_JACOBIAN *, const EC_JACOBIAN *); +void ec_GFp_simple_point_set_to_infinity(const EC_GROUP *, EC_JACOBIAN *); +void ec_GFp_mont_add(const EC_GROUP *, EC_JACOBIAN *r, const EC_JACOBIAN *a, + const EC_JACOBIAN *b); +void ec_GFp_mont_dbl(const EC_GROUP *, EC_JACOBIAN *r, const EC_JACOBIAN *a); +void ec_GFp_simple_invert(const EC_GROUP *, EC_JACOBIAN *); +int ec_GFp_simple_is_at_infinity(const EC_GROUP *, const EC_JACOBIAN *); +int ec_GFp_simple_is_on_curve(const EC_GROUP *, const EC_JACOBIAN *); +int ec_GFp_simple_points_equal(const EC_GROUP *, const EC_JACOBIAN *a, + const EC_JACOBIAN *b); void ec_simple_scalar_inv0_montgomery(const EC_GROUP *group, EC_SCALAR *r, const EC_SCALAR *a); @@ -685,37 +658,17 @@ int ec_simple_scalar_to_montgomery_inv_vartime(const EC_GROUP *group, EC_SCALAR *r, const EC_SCALAR *a); -int ec_GFp_simple_cmp_x_coordinate(const EC_GROUP *group, const EC_RAW_POINT *p, +int ec_GFp_simple_cmp_x_coordinate(const EC_GROUP 
*group, const EC_JACOBIAN *p, const EC_SCALAR *r); -void ec_GFp_simple_felem_to_bytes(const EC_GROUP *group, uint8_t *out, - size_t *out_len, const EC_FELEM *in); -int ec_GFp_simple_felem_from_bytes(const EC_GROUP *group, EC_FELEM *out, - const uint8_t *in, size_t len); - -// method functions in montgomery.c -int ec_GFp_mont_group_init(EC_GROUP *); -int ec_GFp_mont_group_set_curve(EC_GROUP *, const BIGNUM *p, const BIGNUM *a, - const BIGNUM *b, BN_CTX *); -void ec_GFp_mont_group_finish(EC_GROUP *); -void ec_GFp_mont_felem_mul(const EC_GROUP *, EC_FELEM *r, const EC_FELEM *a, - const EC_FELEM *b); -void ec_GFp_mont_felem_sqr(const EC_GROUP *, EC_FELEM *r, const EC_FELEM *a); - -void ec_GFp_mont_felem_to_bytes(const EC_GROUP *group, uint8_t *out, - size_t *out_len, const EC_FELEM *in); -int ec_GFp_mont_felem_from_bytes(const EC_GROUP *group, EC_FELEM *out, - const uint8_t *in, size_t len); - void ec_GFp_nistp_recode_scalar_bits(crypto_word_t *sign, crypto_word_t *digit, crypto_word_t in); -const EC_METHOD *EC_GFp_nistp224_method(void); -const EC_METHOD *EC_GFp_nistp256_method(void); +const EC_METHOD *EC_GFp_nistp256_method(); // EC_GFp_nistz256_method is a GFp method using montgomery multiplication, with // x86-64 optimized P256. See http://eprint.iacr.org/2013/816. -const EC_METHOD *EC_GFp_nistz256_method(void); +const EC_METHOD *EC_GFp_nistz256_method(); // An EC_WRAPPED_SCALAR is an |EC_SCALAR| with a parallel |BIGNUM| // representation. It exists to support the |EC_KEY_get0_private_key| API. @@ -724,53 +677,31 @@ typedef struct { EC_SCALAR scalar; } EC_WRAPPED_SCALAR; -struct ec_key_st { - EC_GROUP *group; +class ECKey : public ec_key_st, public RefCounted { + public: + explicit ECKey(const ENGINE *engine); + + EC_GROUP *group = nullptr; // Ideally |pub_key| would be an |EC_AFFINE| so serializing it does not pay an // inversion each time, but the |EC_KEY_get0_public_key| API implies public // keys are stored in an |EC_POINT|-compatible form. 
- EC_POINT *pub_key; - EC_WRAPPED_SCALAR *priv_key; + EC_POINT *pub_key = nullptr; + bssl::EC_WRAPPED_SCALAR *priv_key = nullptr; - unsigned int enc_flag; - point_conversion_form_t conv_form; + unsigned int enc_flag = 0; + point_conversion_form_t conv_form = POINT_CONVERSION_UNCOMPRESSED; - CRYPTO_refcount_t references; + ECDSA_METHOD *ecdsa_meth = nullptr; - ECDSA_METHOD *ecdsa_meth; + CRYPTO_EX_DATA ex_data = {}; - CRYPTO_EX_DATA ex_data; + private: + ~ECKey(); + friend RefCounted; } /* EC_KEY */; -struct built_in_curve { - int nid; - const uint8_t *oid; - uint8_t oid_len; - // comment is a human-readable string describing the curve. - const char *comment; - // param_len is the number of bytes needed to store a field element. - uint8_t param_len; - // params points to an array of 6*|param_len| bytes which hold the field - // elements of the following (in big-endian order): prime, a, b, generator x, - // generator y, order. - const uint8_t *params; - const EC_METHOD *method; -}; - -#define OPENSSL_NUM_BUILT_IN_CURVES 4 - -struct built_in_curves { - struct built_in_curve curves[OPENSSL_NUM_BUILT_IN_CURVES]; -}; - -// OPENSSL_built_in_curves returns a pointer to static information about -// standard curves. The array is terminated with an entry where |nid| is -// |NID_undef|. -const struct built_in_curves *OPENSSL_built_in_curves(void); +BSSL_NAMESPACE_END -#if defined(__cplusplus) -} // extern C -#endif -#endif // OPENSSL_HEADER_EC_INTERNAL_H +#endif // OPENSSL_HEADER_CRYPTO_FIPSMODULE_EC_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/fipsmodule/ec/oct.c b/third_party/boringssl/src/crypto/fipsmodule/ec/oct.c deleted file mode 100644 index ddd0f37a..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/ec/oct.c +++ /dev/null @@ -1,328 +0,0 @@ -/* Originally written by Bodo Moeller for the OpenSSL project. - * ==================================================================== - * Copyright (c) 1998-2005 The OpenSSL Project. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). - * - */ -/* ==================================================================== - * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED. - * - * Portions of the attached software ("Contribution") are developed by - * SUN MICROSYSTEMS, INC., and are contributed to the OpenSSL project. - * - * The Contribution is licensed pursuant to the OpenSSL open source - * license provided above. - * - * The elliptic curve binary polynomial software is originally written by - * Sheueling Chang Shantz and Douglas Stebila of Sun Microsystems - * Laboratories. */ - -#include - -#include -#include - -#include "internal.h" - - -size_t ec_point_to_bytes(const EC_GROUP *group, const EC_AFFINE *point, - point_conversion_form_t form, uint8_t *buf, - size_t len) { - if (form != POINT_CONVERSION_COMPRESSED && - form != POINT_CONVERSION_UNCOMPRESSED) { - OPENSSL_PUT_ERROR(EC, EC_R_INVALID_FORM); - return 0; - } - - const size_t field_len = BN_num_bytes(&group->field); - size_t output_len = 1 /* type byte */ + field_len; - if (form == POINT_CONVERSION_UNCOMPRESSED) { - // Uncompressed points have a second coordinate. 
- output_len += field_len; - } - - // if 'buf' is NULL, just return required length - if (buf != NULL) { - if (len < output_len) { - OPENSSL_PUT_ERROR(EC, EC_R_BUFFER_TOO_SMALL); - return 0; - } - - size_t field_len_out; - ec_felem_to_bytes(group, buf + 1, &field_len_out, &point->X); - assert(field_len_out == field_len); - - if (form == POINT_CONVERSION_UNCOMPRESSED) { - ec_felem_to_bytes(group, buf + 1 + field_len, &field_len_out, &point->Y); - assert(field_len_out == field_len); - buf[0] = form; - } else { - uint8_t y_buf[EC_MAX_BYTES]; - ec_felem_to_bytes(group, y_buf, &field_len_out, &point->Y); - buf[0] = form + (y_buf[field_len_out - 1] & 1); - } - } - - return output_len; -} - -int ec_point_from_uncompressed(const EC_GROUP *group, EC_AFFINE *out, - const uint8_t *in, size_t len) { - const size_t field_len = BN_num_bytes(&group->field); - if (len != 1 + 2 * field_len || in[0] != POINT_CONVERSION_UNCOMPRESSED) { - OPENSSL_PUT_ERROR(EC, EC_R_INVALID_ENCODING); - return 0; - } - - EC_FELEM x, y; - if (!ec_felem_from_bytes(group, &x, in + 1, field_len) || - !ec_felem_from_bytes(group, &y, in + 1 + field_len, field_len) || - !ec_point_set_affine_coordinates(group, out, &x, &y)) { - return 0; - } - - return 1; -} - -static int ec_GFp_simple_oct2point(const EC_GROUP *group, EC_POINT *point, - const uint8_t *buf, size_t len, - BN_CTX *ctx) { - if (len == 0) { - OPENSSL_PUT_ERROR(EC, EC_R_BUFFER_TOO_SMALL); - return 0; - } - - point_conversion_form_t form = buf[0]; - if (form == POINT_CONVERSION_UNCOMPRESSED) { - EC_AFFINE affine; - if (!ec_point_from_uncompressed(group, &affine, buf, len)) { - // In the event of an error, defend against the caller not checking the - // return value by setting a known safe value. 
- ec_set_to_safe_point(group, &point->raw); - return 0; - } - ec_affine_to_jacobian(group, &point->raw, &affine); - return 1; - } - - const int y_bit = form & 1; - const size_t field_len = BN_num_bytes(&group->field); - form = form & ~1u; - if (form != POINT_CONVERSION_COMPRESSED || - len != 1 /* type byte */ + field_len) { - OPENSSL_PUT_ERROR(EC, EC_R_INVALID_ENCODING); - return 0; - } - - // TODO(davidben): Integrate compressed coordinates with the lower-level EC - // abstractions. This requires a way to compute square roots, which is tricky - // for primes which are not 3 (mod 4), namely P-224 and custom curves. P-224's - // prime is particularly inconvenient for compressed coordinates. See - // https://cr.yp.to/papers/sqroot.pdf - BN_CTX *new_ctx = NULL; - if (ctx == NULL) { - ctx = new_ctx = BN_CTX_new(); - if (ctx == NULL) { - return 0; - } - } - - int ret = 0; - BN_CTX_start(ctx); - BIGNUM *x = BN_CTX_get(ctx); - if (x == NULL || !BN_bin2bn(buf + 1, field_len, x)) { - goto err; - } - if (BN_ucmp(x, &group->field) >= 0) { - OPENSSL_PUT_ERROR(EC, EC_R_INVALID_ENCODING); - goto err; - } - - if (!EC_POINT_set_compressed_coordinates_GFp(group, point, x, y_bit, ctx)) { - goto err; - } - - ret = 1; - -err: - BN_CTX_end(ctx); - BN_CTX_free(new_ctx); - return ret; -} - -int EC_POINT_oct2point(const EC_GROUP *group, EC_POINT *point, - const uint8_t *buf, size_t len, BN_CTX *ctx) { - if (EC_GROUP_cmp(group, point->group, NULL) != 0) { - OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS); - return 0; - } - return ec_GFp_simple_oct2point(group, point, buf, len, ctx); -} - -size_t EC_POINT_point2oct(const EC_GROUP *group, const EC_POINT *point, - point_conversion_form_t form, uint8_t *buf, - size_t len, BN_CTX *ctx) { - if (EC_GROUP_cmp(group, point->group, NULL) != 0) { - OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS); - return 0; - } - EC_AFFINE affine; - if (!ec_jacobian_to_affine(group, &affine, &point->raw)) { - return 0; - } - return ec_point_to_bytes(group, 
&affine, form, buf, len); -} - -int EC_POINT_set_compressed_coordinates_GFp(const EC_GROUP *group, - EC_POINT *point, const BIGNUM *x, - int y_bit, BN_CTX *ctx) { - if (EC_GROUP_cmp(group, point->group, NULL) != 0) { - OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS); - return 0; - } - - if (BN_is_negative(x) || BN_cmp(x, &group->field) >= 0) { - OPENSSL_PUT_ERROR(EC, EC_R_INVALID_COMPRESSED_POINT); - return 0; - } - - BN_CTX *new_ctx = NULL; - int ret = 0; - - ERR_clear_error(); - - if (ctx == NULL) { - ctx = new_ctx = BN_CTX_new(); - if (ctx == NULL) { - return 0; - } - } - - y_bit = (y_bit != 0); - - BN_CTX_start(ctx); - BIGNUM *tmp1 = BN_CTX_get(ctx); - BIGNUM *tmp2 = BN_CTX_get(ctx); - BIGNUM *a = BN_CTX_get(ctx); - BIGNUM *b = BN_CTX_get(ctx); - BIGNUM *y = BN_CTX_get(ctx); - if (y == NULL || - !EC_GROUP_get_curve_GFp(group, NULL, a, b, ctx)) { - goto err; - } - - // Recover y. We have a Weierstrass equation - // y^2 = x^3 + a*x + b, - // so y is one of the square roots of x^3 + a*x + b. 
- - // tmp1 := x^3 - if (!BN_mod_sqr(tmp2, x, &group->field, ctx) || - !BN_mod_mul(tmp1, tmp2, x, &group->field, ctx)) { - goto err; - } - - // tmp1 := tmp1 + a*x - if (group->a_is_minus3) { - if (!bn_mod_lshift1_consttime(tmp2, x, &group->field, ctx) || - !bn_mod_add_consttime(tmp2, tmp2, x, &group->field, ctx) || - !bn_mod_sub_consttime(tmp1, tmp1, tmp2, &group->field, ctx)) { - goto err; - } - } else { - if (!BN_mod_mul(tmp2, a, x, &group->field, ctx) || - !bn_mod_add_consttime(tmp1, tmp1, tmp2, &group->field, ctx)) { - goto err; - } - } - - // tmp1 := tmp1 + b - if (!bn_mod_add_consttime(tmp1, tmp1, b, &group->field, ctx)) { - goto err; - } - - if (!BN_mod_sqrt(y, tmp1, &group->field, ctx)) { - unsigned long err = ERR_peek_last_error(); - - if (ERR_GET_LIB(err) == ERR_LIB_BN && - ERR_GET_REASON(err) == BN_R_NOT_A_SQUARE) { - ERR_clear_error(); - OPENSSL_PUT_ERROR(EC, EC_R_INVALID_COMPRESSED_POINT); - } else { - OPENSSL_PUT_ERROR(EC, ERR_R_BN_LIB); - } - goto err; - } - - if (y_bit != BN_is_odd(y)) { - if (BN_is_zero(y)) { - OPENSSL_PUT_ERROR(EC, EC_R_INVALID_COMPRESSION_BIT); - goto err; - } - if (!BN_usub(y, &group->field, y)) { - goto err; - } - } - if (y_bit != BN_is_odd(y)) { - OPENSSL_PUT_ERROR(EC, ERR_R_INTERNAL_ERROR); - goto err; - } - - if (!EC_POINT_set_affine_coordinates_GFp(group, point, x, y, ctx)) { - goto err; - } - - ret = 1; - -err: - BN_CTX_end(ctx); - BN_CTX_free(new_ctx); - return ret; -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/ec/oct.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/ec/oct.cc.inc new file mode 100644 index 00000000..1ed7cbd2 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/ec/oct.cc.inc @@ -0,0 +1,296 @@ +// Copyright 2011-2016 The OpenSSL Project Authors. All Rights Reserved. +// Copyright (c) 2002, Oracle and/or its affiliates. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + +#include "internal.h" + + +using namespace bssl; + +size_t bssl::ec_point_byte_len(const EC_GROUP *group, + point_conversion_form_t form) { + if (form != POINT_CONVERSION_COMPRESSED && + form != POINT_CONVERSION_UNCOMPRESSED) { + OPENSSL_PUT_ERROR(EC, EC_R_INVALID_FORM); + return 0; + } + + const size_t field_len = BN_num_bytes(&group->field.N); + size_t output_len = 1 /* type byte */ + field_len; + if (form == POINT_CONVERSION_UNCOMPRESSED) { + // Uncompressed points have a second coordinate. + output_len += field_len; + } + return output_len; +} + +size_t bssl::ec_point_to_bytes(const EC_GROUP *group, const EC_AFFINE *point, + point_conversion_form_t form, uint8_t *buf, + size_t max_out) { + size_t output_len = ec_point_byte_len(group, form); + if (max_out < output_len) { + OPENSSL_PUT_ERROR(EC, EC_R_BUFFER_TOO_SMALL); + return 0; + } + + size_t field_len; + ec_felem_to_bytes(group, buf + 1, &field_len, &point->X); + assert(field_len == BN_num_bytes(&group->field.N)); + + if (form == POINT_CONVERSION_UNCOMPRESSED) { + ec_felem_to_bytes(group, buf + 1 + field_len, &field_len, &point->Y); + assert(field_len == BN_num_bytes(&group->field.N)); + buf[0] = form; + } else { + uint8_t y_buf[EC_MAX_BYTES]; + ec_felem_to_bytes(group, y_buf, &field_len, &point->Y); + buf[0] = form + (y_buf[field_len - 1] & 1); + } + + return output_len; +} + +int bssl::ec_point_from_uncompressed(const EC_GROUP *group, EC_AFFINE *out, + const uint8_t *in, size_t len) { + const size_t field_len = BN_num_bytes(&group->field.N); + if (len 
!= 1 + 2 * field_len || in[0] != POINT_CONVERSION_UNCOMPRESSED) { + OPENSSL_PUT_ERROR(EC, EC_R_INVALID_ENCODING); + return 0; + } + + EC_FELEM x, y; + if (!ec_felem_from_bytes(group, &x, in + 1, field_len) || + !ec_felem_from_bytes(group, &y, in + 1 + field_len, field_len) || + !ec_point_set_affine_coordinates(group, out, &x, &y)) { + return 0; + } + + return 1; +} + +static int ec_GFp_simple_oct2point(const EC_GROUP *group, EC_POINT *point, + const uint8_t *buf, size_t len, + BN_CTX *ctx) { + if (len == 0) { + OPENSSL_PUT_ERROR(EC, EC_R_BUFFER_TOO_SMALL); + return 0; + } + + uint8_t form = buf[0]; + if (form == static_cast(POINT_CONVERSION_UNCOMPRESSED)) { + EC_AFFINE affine; + if (!ec_point_from_uncompressed(group, &affine, buf, len)) { + // In the event of an error, defend against the caller not checking the + // return value by setting a known safe value. + ec_set_to_safe_point(group, &point->raw); + return 0; + } + ec_affine_to_jacobian(group, &point->raw, &affine); + return 1; + } + + const int y_bit = form & 1; + const size_t field_len = BN_num_bytes(&group->field.N); + form = form & ~1u; + if (form != static_cast(POINT_CONVERSION_COMPRESSED) || + len != 1 /* type byte */ + field_len) { + OPENSSL_PUT_ERROR(EC, EC_R_INVALID_ENCODING); + return 0; + } + + // TODO(davidben): Integrate compressed coordinates with the lower-level EC + // abstractions. This requires a way to compute square roots, which is tricky + // for primes which are not 3 (mod 4), namely P-224 and custom curves. P-224's + // prime is particularly inconvenient for compressed coordinates. 
See + // https://cr.yp.to/papers/sqroot.pdf + UniquePtr new_ctx; + if (ctx == nullptr) { + new_ctx.reset(BN_CTX_new()); + if (new_ctx == nullptr) { + return 0; + } + ctx = new_ctx.get(); + } + + BN_CTXScope scope(ctx); + BIGNUM *x = BN_CTX_get(ctx); + if (x == nullptr || !BN_bin2bn(buf + 1, field_len, x)) { + return 0; + } + if (BN_ucmp(x, &group->field.N) >= 0) { + OPENSSL_PUT_ERROR(EC, EC_R_INVALID_ENCODING); + return 0; + } + + if (!EC_POINT_set_compressed_coordinates_GFp(group, point, x, y_bit, ctx)) { + return 0; + } + + return 1; +} + +int EC_POINT_oct2point(const EC_GROUP *group, EC_POINT *point, + const uint8_t *buf, size_t len, BN_CTX *ctx) { + if (EC_GROUP_cmp(group, point->group, nullptr) != 0) { + OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS); + return 0; + } + return ec_GFp_simple_oct2point(group, point, buf, len, ctx); +} + +size_t EC_POINT_point2oct(const EC_GROUP *group, const EC_POINT *point, + point_conversion_form_t form, uint8_t *buf, + size_t max_out, BN_CTX *ctx) { + if (EC_GROUP_cmp(group, point->group, nullptr) != 0) { + OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS); + return 0; + } + if (buf == nullptr) { + // When |buf| is NULL, just return the number of bytes that would be + // written, without doing an expensive Jacobian-to-affine conversion. 
+ if (ec_GFp_simple_is_at_infinity(group, &point->raw)) { + OPENSSL_PUT_ERROR(EC, EC_R_POINT_AT_INFINITY); + return 0; + } + return ec_point_byte_len(group, form); + } + EC_AFFINE affine; + if (!ec_jacobian_to_affine(group, &affine, &point->raw)) { + return 0; + } + return ec_point_to_bytes(group, &affine, form, buf, max_out); +} + +size_t EC_POINT_point2buf(const EC_GROUP *group, const EC_POINT *point, + point_conversion_form_t form, uint8_t **out_buf, + BN_CTX *ctx) { + *out_buf = nullptr; + size_t len = EC_POINT_point2oct(group, point, form, nullptr, 0, ctx); + if (len == 0) { + return 0; + } + uint8_t *buf = reinterpret_cast(OPENSSL_malloc(len)); + if (buf == nullptr) { + return 0; + } + len = EC_POINT_point2oct(group, point, form, buf, len, ctx); + if (len == 0) { + OPENSSL_free(buf); + return 0; + } + *out_buf = buf; + return len; +} + +int EC_POINT_set_compressed_coordinates_GFp(const EC_GROUP *group, + EC_POINT *point, const BIGNUM *x, + int y_bit, BN_CTX *ctx) { + if (EC_GROUP_cmp(group, point->group, nullptr) != 0) { + OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS); + return 0; + } + + const BIGNUM *field = &group->field.N; + if (BN_is_negative(x) || BN_cmp(x, field) >= 0) { + OPENSSL_PUT_ERROR(EC, EC_R_INVALID_COMPRESSED_POINT); + return 0; + } + + ERR_clear_error(); + + UniquePtr new_ctx; + if (ctx == nullptr) { + new_ctx.reset(BN_CTX_new()); + if (new_ctx == nullptr) { + return 0; + } + ctx = new_ctx.get(); + } + + y_bit = (y_bit != 0); + + BN_CTXScope scope(ctx); + BIGNUM *tmp1 = BN_CTX_get(ctx); + BIGNUM *tmp2 = BN_CTX_get(ctx); + BIGNUM *a = BN_CTX_get(ctx); + BIGNUM *b = BN_CTX_get(ctx); + BIGNUM *y = BN_CTX_get(ctx); + if (y == nullptr || !EC_GROUP_get_curve_GFp(group, nullptr, a, b, ctx)) { + return 0; + } + + // Recover y. We have a Weierstrass equation + // y^2 = x^3 + a*x + b, + // so y is one of the square roots of x^3 + a*x + b. 
+ + // tmp1 := x^3 + if (!BN_mod_sqr(tmp2, x, field, ctx) || + !BN_mod_mul(tmp1, tmp2, x, field, ctx)) { + return 0; + } + + // tmp1 := tmp1 + a*x + if (group->a_is_minus3) { + if (!bn_mod_lshift1_consttime(tmp2, x, field, ctx) || + !bn_mod_add_consttime(tmp2, tmp2, x, field, ctx) || + !bn_mod_sub_consttime(tmp1, tmp1, tmp2, field, ctx)) { + return 0; + } + } else { + if (!BN_mod_mul(tmp2, a, x, field, ctx) || + !bn_mod_add_consttime(tmp1, tmp1, tmp2, field, ctx)) { + return 0; + } + } + + // tmp1 := tmp1 + b + if (!bn_mod_add_consttime(tmp1, tmp1, b, field, ctx)) { + return 0; + } + + if (!BN_mod_sqrt(y, tmp1, field, ctx)) { + if (ERR_equals(ERR_peek_last_error(), ERR_LIB_BN, BN_R_NOT_A_SQUARE)) { + ERR_clear_error(); + OPENSSL_PUT_ERROR(EC, EC_R_INVALID_COMPRESSED_POINT); + } else { + OPENSSL_PUT_ERROR(EC, ERR_R_BN_LIB); + } + return 0; + } + + if (y_bit != BN_is_odd(y)) { + if (BN_is_zero(y)) { + OPENSSL_PUT_ERROR(EC, EC_R_INVALID_COMPRESSION_BIT); + return 0; + } + if (!BN_usub(y, field, y)) { + return 0; + } + } + if (y_bit != BN_is_odd(y)) { + OPENSSL_PUT_ERROR(EC, ERR_R_INTERNAL_ERROR); + return 0; + } + + if (!EC_POINT_set_affine_coordinates_GFp(group, point, x, y, ctx)) { + return 0; + } + + return 1; +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/ec/p224-64.c b/third_party/boringssl/src/crypto/fipsmodule/ec/p224-64.c deleted file mode 100644 index 0f51970d..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/ec/p224-64.c +++ /dev/null @@ -1,1164 +0,0 @@ -/* Copyright (c) 2015, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -// A 64-bit implementation of the NIST P-224 elliptic curve point multiplication -// -// Inspired by Daniel J. Bernstein's public domain nistp224 implementation -// and Adam Langley's public domain 64-bit C implementation of curve25519. - -#include - -#include -#include -#include -#include - -#include - -#include "internal.h" -#include "../delocate.h" -#include "../../internal.h" - - -#if defined(BORINGSSL_HAS_UINT128) && !defined(OPENSSL_SMALL) - -// Field elements are represented as a_0 + 2^56*a_1 + 2^112*a_2 + 2^168*a_3 -// using 64-bit coefficients called 'limbs', and sometimes (for multiplication -// results) as b_0 + 2^56*b_1 + 2^112*b_2 + 2^168*b_3 + 2^224*b_4 + 2^280*b_5 + -// 2^336*b_6 using 128-bit coefficients called 'widelimbs'. A 4-p224_limb -// representation is an 'p224_felem'; a 7-p224_widelimb representation is a -// 'p224_widefelem'. Even within felems, bits of adjacent limbs overlap, and we -// don't always reduce the representations: we ensure that inputs to each -// p224_felem multiplication satisfy a_i < 2^60, so outputs satisfy b_i < -// 4*2^60*2^60, and fit into a 128-bit word without overflow. The coefficients -// are then again partially reduced to obtain an p224_felem satisfying a_i < -// 2^57. We only reduce to the unique minimal representation at the end of the -// computation. - -typedef uint64_t p224_limb; -typedef uint128_t p224_widelimb; - -typedef p224_limb p224_felem[4]; -typedef p224_widelimb p224_widefelem[7]; - -// Precomputed multiples of the standard generator -// Points are given in coordinates (X, Y, Z) where Z normally is 1 -// (0 for the point at infinity). 
-// For each field element, slice a_0 is word 0, etc. -// -// The table has 2 * 16 elements, starting with the following: -// index | bits | point -// ------+---------+------------------------------ -// 0 | 0 0 0 0 | 0G -// 1 | 0 0 0 1 | 1G -// 2 | 0 0 1 0 | 2^56G -// 3 | 0 0 1 1 | (2^56 + 1)G -// 4 | 0 1 0 0 | 2^112G -// 5 | 0 1 0 1 | (2^112 + 1)G -// 6 | 0 1 1 0 | (2^112 + 2^56)G -// 7 | 0 1 1 1 | (2^112 + 2^56 + 1)G -// 8 | 1 0 0 0 | 2^168G -// 9 | 1 0 0 1 | (2^168 + 1)G -// 10 | 1 0 1 0 | (2^168 + 2^56)G -// 11 | 1 0 1 1 | (2^168 + 2^56 + 1)G -// 12 | 1 1 0 0 | (2^168 + 2^112)G -// 13 | 1 1 0 1 | (2^168 + 2^112 + 1)G -// 14 | 1 1 1 0 | (2^168 + 2^112 + 2^56)G -// 15 | 1 1 1 1 | (2^168 + 2^112 + 2^56 + 1)G -// followed by a copy of this with each element multiplied by 2^28. -// -// The reason for this is so that we can clock bits into four different -// locations when doing simple scalar multiplies against the base point, -// and then another four locations using the second 16 elements. 
-static const p224_felem g_p224_pre_comp[2][16][3] = { - {{{0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}}, - {{0x3280d6115c1d21, 0xc1d356c2112234, 0x7f321390b94a03, 0xb70e0cbd6bb4bf}, - {0xd5819985007e34, 0x75a05a07476444, 0xfb4c22dfe6cd43, 0xbd376388b5f723}, - {1, 0, 0, 0}}, - {{0xfd9675666ebbe9, 0xbca7664d40ce5e, 0x2242df8d8a2a43, 0x1f49bbb0f99bc5}, - {0x29e0b892dc9c43, 0xece8608436e662, 0xdc858f185310d0, 0x9812dd4eb8d321}, - {1, 0, 0, 0}}, - {{0x6d3e678d5d8eb8, 0x559eed1cb362f1, 0x16e9a3bbce8a3f, 0xeedcccd8c2a748}, - {0xf19f90ed50266d, 0xabf2b4bf65f9df, 0x313865468fafec, 0x5cb379ba910a17}, - {1, 0, 0, 0}}, - {{0x0641966cab26e3, 0x91fb2991fab0a0, 0xefec27a4e13a0b, 0x0499aa8a5f8ebe}, - {0x7510407766af5d, 0x84d929610d5450, 0x81d77aae82f706, 0x6916f6d4338c5b}, - {1, 0, 0, 0}}, - {{0xea95ac3b1f15c6, 0x086000905e82d4, 0xdd323ae4d1c8b1, 0x932b56be7685a3}, - {0x9ef93dea25dbbf, 0x41665960f390f0, 0xfdec76dbe2a8a7, 0x523e80f019062a}, - {1, 0, 0, 0}}, - {{0x822fdd26732c73, 0xa01c83531b5d0f, 0x363f37347c1ba4, 0xc391b45c84725c}, - {0xbbd5e1b2d6ad24, 0xddfbcde19dfaec, 0xc393da7e222a7f, 0x1efb7890ede244}, - {1, 0, 0, 0}}, - {{0x4c9e90ca217da1, 0xd11beca79159bb, 0xff8d33c2c98b7c, 0x2610b39409f849}, - {0x44d1352ac64da0, 0xcdbb7b2c46b4fb, 0x966c079b753c89, 0xfe67e4e820b112}, - {1, 0, 0, 0}}, - {{0xe28cae2df5312d, 0xc71b61d16f5c6e, 0x79b7619a3e7c4c, 0x05c73240899b47}, - {0x9f7f6382c73e3a, 0x18615165c56bda, 0x641fab2116fd56, 0x72855882b08394}, - {1, 0, 0, 0}}, - {{0x0469182f161c09, 0x74a98ca8d00fb5, 0xb89da93489a3e0, 0x41c98768fb0c1d}, - {0xe5ea05fb32da81, 0x3dce9ffbca6855, 0x1cfe2d3fbf59e6, 0x0e5e03408738a7}, - {1, 0, 0, 0}}, - {{0xdab22b2333e87f, 0x4430137a5dd2f6, 0xe03ab9f738beb8, 0xcb0c5d0dc34f24}, - {0x764a7df0c8fda5, 0x185ba5c3fa2044, 0x9281d688bcbe50, 0xc40331df893881}, - {1, 0, 0, 0}}, - {{0xb89530796f0f60, 0xade92bd26909a3, 0x1a0c83fb4884da, 0x1765bf22a5a984}, - {0x772a9ee75db09e, 0x23bc6c67cec16f, 0x4c1edba8b14e2f, 0xe2a215d9611369}, - {1, 0, 0, 0}}, - {{0x571e509fb5efb3, 
0xade88696410552, 0xc8ae85fada74fe, 0x6c7e4be83bbde3}, - {0xff9f51160f4652, 0xb47ce2495a6539, 0xa2946c53b582f4, 0x286d2db3ee9a60}, - {1, 0, 0, 0}}, - {{0x40bbd5081a44af, 0x0995183b13926c, 0xbcefba6f47f6d0, 0x215619e9cc0057}, - {0x8bc94d3b0df45e, 0xf11c54a3694f6f, 0x8631b93cdfe8b5, 0xe7e3f4b0982db9}, - {1, 0, 0, 0}}, - {{0xb17048ab3e1c7b, 0xac38f36ff8a1d8, 0x1c29819435d2c6, 0xc813132f4c07e9}, - {0x2891425503b11f, 0x08781030579fea, 0xf5426ba5cc9674, 0x1e28ebf18562bc}, - {1, 0, 0, 0}}, - {{0x9f31997cc864eb, 0x06cd91d28b5e4c, 0xff17036691a973, 0xf1aef351497c58}, - {0xdd1f2d600564ff, 0xdead073b1402db, 0x74a684435bd693, 0xeea7471f962558}, - {1, 0, 0, 0}}}, - {{{0, 0, 0, 0}, {0, 0, 0, 0}, {0, 0, 0, 0}}, - {{0x9665266dddf554, 0x9613d78b60ef2d, 0xce27a34cdba417, 0xd35ab74d6afc31}, - {0x85ccdd22deb15e, 0x2137e5783a6aab, 0xa141cffd8c93c6, 0x355a1830e90f2d}, - {1, 0, 0, 0}}, - {{0x1a494eadaade65, 0xd6da4da77fe53c, 0xe7992996abec86, 0x65c3553c6090e3}, - {0xfa610b1fb09346, 0xf1c6540b8a4aaf, 0xc51a13ccd3cbab, 0x02995b1b18c28a}, - {1, 0, 0, 0}}, - {{0x7874568e7295ef, 0x86b419fbe38d04, 0xdc0690a7550d9a, 0xd3966a44beac33}, - {0x2b7280ec29132f, 0xbeaa3b6a032df3, 0xdc7dd88ae41200, 0xd25e2513e3a100}, - {1, 0, 0, 0}}, - {{0x924857eb2efafd, 0xac2bce41223190, 0x8edaa1445553fc, 0x825800fd3562d5}, - {0x8d79148ea96621, 0x23a01c3dd9ed8d, 0xaf8b219f9416b5, 0xd8db0cc277daea}, - {1, 0, 0, 0}}, - {{0x76a9c3b1a700f0, 0xe9acd29bc7e691, 0x69212d1a6b0327, 0x6322e97fe154be}, - {0x469fc5465d62aa, 0x8d41ed18883b05, 0x1f8eae66c52b88, 0xe4fcbe9325be51}, - {1, 0, 0, 0}}, - {{0x825fdf583cac16, 0x020b857c7b023a, 0x683c17744b0165, 0x14ffd0a2daf2f1}, - {0x323b36184218f9, 0x4944ec4e3b47d4, 0xc15b3080841acf, 0x0bced4b01a28bb}, - {1, 0, 0, 0}}, - {{0x92ac22230df5c4, 0x52f33b4063eda8, 0xcb3f19870c0c93, 0x40064f2ba65233}, - {0xfe16f0924f8992, 0x012da25af5b517, 0x1a57bb24f723a6, 0x06f8bc76760def}, - {1, 0, 0, 0}}, - {{0x4a7084f7817cb9, 0xbcab0738ee9a78, 0x3ec11e11d9c326, 0xdc0fe90e0f1aae}, - {0xcf639ea5f98390, 
0x5c350aa22ffb74, 0x9afae98a4047b7, 0x956ec2d617fc45}, - {1, 0, 0, 0}}, - {{0x4306d648c1be6a, 0x9247cd8bc9a462, 0xf5595e377d2f2e, 0xbd1c3caff1a52e}, - {0x045e14472409d0, 0x29f3e17078f773, 0x745a602b2d4f7d, 0x191837685cdfbb}, - {1, 0, 0, 0}}, - {{0x5b6ee254a8cb79, 0x4953433f5e7026, 0xe21faeb1d1def4, 0xc4c225785c09de}, - {0x307ce7bba1e518, 0x31b125b1036db8, 0x47e91868839e8f, 0xc765866e33b9f3}, - {1, 0, 0, 0}}, - {{0x3bfece24f96906, 0x4794da641e5093, 0xde5df64f95db26, 0x297ecd89714b05}, - {0x701bd3ebb2c3aa, 0x7073b4f53cb1d5, 0x13c5665658af16, 0x9895089d66fe58}, - {1, 0, 0, 0}}, - {{0x0fef05f78c4790, 0x2d773633b05d2e, 0x94229c3a951c94, 0xbbbd70df4911bb}, - {0xb2c6963d2c1168, 0x105f47a72b0d73, 0x9fdf6111614080, 0x7b7e94b39e67b0}, - {1, 0, 0, 0}}, - {{0xad1a7d6efbe2b3, 0xf012482c0da69d, 0x6b3bdf12438345, 0x40d7558d7aa4d9}, - {0x8a09fffb5c6d3d, 0x9a356e5d9ffd38, 0x5973f15f4f9b1c, 0xdcd5f59f63c3ea}, - {1, 0, 0, 0}}, - {{0xacf39f4c5ca7ab, 0x4c8071cc5fd737, 0xc64e3602cd1184, 0x0acd4644c9abba}, - {0x6c011a36d8bf6e, 0xfecd87ba24e32a, 0x19f6f56574fad8, 0x050b204ced9405}, - {1, 0, 0, 0}}, - {{0xed4f1cae7d9a96, 0x5ceef7ad94c40a, 0x778e4a3bf3ef9b, 0x7405783dc3b55e}, - {0x32477c61b6e8c6, 0xb46a97570f018b, 0x91176d0a7e95d1, 0x3df90fbc4c7d0e}, - {1, 0, 0, 0}}}}; - - -// Helper functions to convert field elements to/from internal representation - -static void p224_generic_to_felem(p224_felem out, const EC_FELEM *in) { - // |p224_felem|'s minimal representation uses four 56-bit words. |EC_FELEM| - // uses four 64-bit words. (The top-most word only has 32 bits.) 
- out[0] = in->words[0] & 0x00ffffffffffffff; - out[1] = ((in->words[0] >> 56) | (in->words[1] << 8)) & 0x00ffffffffffffff; - out[2] = ((in->words[1] >> 48) | (in->words[2] << 16)) & 0x00ffffffffffffff; - out[3] = ((in->words[2] >> 40) | (in->words[3] << 24)) & 0x00ffffffffffffff; -} - -// Requires 0 <= in < 2*p (always call p224_felem_reduce first) -static void p224_felem_to_generic(EC_FELEM *out, const p224_felem in) { - // Reduce to unique minimal representation. - static const int64_t two56 = ((p224_limb)1) << 56; - // 0 <= in < 2*p, p = 2^224 - 2^96 + 1 - // if in > p , reduce in = in - 2^224 + 2^96 - 1 - int64_t tmp[4], a; - tmp[0] = in[0]; - tmp[1] = in[1]; - tmp[2] = in[2]; - tmp[3] = in[3]; - // Case 1: a = 1 iff in >= 2^224 - a = (in[3] >> 56); - tmp[0] -= a; - tmp[1] += a << 40; - tmp[3] &= 0x00ffffffffffffff; - // Case 2: a = 0 iff p <= in < 2^224, i.e., the high 128 bits are all 1 and - // the lower part is non-zero - a = ((in[3] & in[2] & (in[1] | 0x000000ffffffffff)) + 1) | - (((int64_t)(in[0] + (in[1] & 0x000000ffffffffff)) - 1) >> 63); - a &= 0x00ffffffffffffff; - // turn a into an all-one mask (if a = 0) or an all-zero mask - a = (a - 1) >> 63; - // subtract 2^224 - 2^96 + 1 if a is all-one - tmp[3] &= a ^ 0xffffffffffffffff; - tmp[2] &= a ^ 0xffffffffffffffff; - tmp[1] &= (a ^ 0xffffffffffffffff) | 0x000000ffffffffff; - tmp[0] -= 1 & a; - - // eliminate negative coefficients: if tmp[0] is negative, tmp[1] must - // be non-zero, so we only need one step - a = tmp[0] >> 63; - tmp[0] += two56 & a; - tmp[1] -= 1 & a; - - // carry 1 -> 2 -> 3 - tmp[2] += tmp[1] >> 56; - tmp[1] &= 0x00ffffffffffffff; - - tmp[3] += tmp[2] >> 56; - tmp[2] &= 0x00ffffffffffffff; - - // Now 0 <= tmp < p - p224_felem tmp2; - tmp2[0] = tmp[0]; - tmp2[1] = tmp[1]; - tmp2[2] = tmp[2]; - tmp2[3] = tmp[3]; - - // |p224_felem|'s minimal representation uses four 56-bit words. |EC_FELEM| - // uses four 64-bit words. (The top-most word only has 32 bits.) 
- out->words[0] = tmp2[0] | (tmp2[1] << 56); - out->words[1] = (tmp2[1] >> 8) | (tmp2[2] << 48); - out->words[2] = (tmp2[2] >> 16) | (tmp2[3] << 40); - out->words[3] = tmp2[3] >> 24; -} - - -// Field operations, using the internal representation of field elements. -// NB! These operations are specific to our point multiplication and cannot be -// expected to be correct in general - e.g., multiplication with a large scalar -// will cause an overflow. - -static void p224_felem_assign(p224_felem out, const p224_felem in) { - out[0] = in[0]; - out[1] = in[1]; - out[2] = in[2]; - out[3] = in[3]; -} - -// Sum two field elements: out += in -static void p224_felem_sum(p224_felem out, const p224_felem in) { - out[0] += in[0]; - out[1] += in[1]; - out[2] += in[2]; - out[3] += in[3]; -} - -// Subtract field elements: out -= in -// Assumes in[i] < 2^57 -static void p224_felem_diff(p224_felem out, const p224_felem in) { - static const p224_limb two58p2 = - (((p224_limb)1) << 58) + (((p224_limb)1) << 2); - static const p224_limb two58m2 = - (((p224_limb)1) << 58) - (((p224_limb)1) << 2); - static const p224_limb two58m42m2 = - (((p224_limb)1) << 58) - (((p224_limb)1) << 42) - (((p224_limb)1) << 2); - - // Add 0 mod 2^224-2^96+1 to ensure out > in - out[0] += two58p2; - out[1] += two58m42m2; - out[2] += two58m2; - out[3] += two58m2; - - out[0] -= in[0]; - out[1] -= in[1]; - out[2] -= in[2]; - out[3] -= in[3]; -} - -// Subtract in unreduced 128-bit mode: out -= in -// Assumes in[i] < 2^119 -static void p224_widefelem_diff(p224_widefelem out, const p224_widefelem in) { - static const p224_widelimb two120 = ((p224_widelimb)1) << 120; - static const p224_widelimb two120m64 = - (((p224_widelimb)1) << 120) - (((p224_widelimb)1) << 64); - static const p224_widelimb two120m104m64 = (((p224_widelimb)1) << 120) - - (((p224_widelimb)1) << 104) - - (((p224_widelimb)1) << 64); - - // Add 0 mod 2^224-2^96+1 to ensure out > in - out[0] += two120; - out[1] += two120m64; - out[2] += two120m64; - 
out[3] += two120; - out[4] += two120m104m64; - out[5] += two120m64; - out[6] += two120m64; - - out[0] -= in[0]; - out[1] -= in[1]; - out[2] -= in[2]; - out[3] -= in[3]; - out[4] -= in[4]; - out[5] -= in[5]; - out[6] -= in[6]; -} - -// Subtract in mixed mode: out128 -= in64 -// in[i] < 2^63 -static void p224_felem_diff_128_64(p224_widefelem out, const p224_felem in) { - static const p224_widelimb two64p8 = - (((p224_widelimb)1) << 64) + (((p224_widelimb)1) << 8); - static const p224_widelimb two64m8 = - (((p224_widelimb)1) << 64) - (((p224_widelimb)1) << 8); - static const p224_widelimb two64m48m8 = (((p224_widelimb)1) << 64) - - (((p224_widelimb)1) << 48) - - (((p224_widelimb)1) << 8); - - // Add 0 mod 2^224-2^96+1 to ensure out > in - out[0] += two64p8; - out[1] += two64m48m8; - out[2] += two64m8; - out[3] += two64m8; - - out[0] -= in[0]; - out[1] -= in[1]; - out[2] -= in[2]; - out[3] -= in[3]; -} - -// Multiply a field element by a scalar: out = out * scalar -// The scalars we actually use are small, so results fit without overflow -static void p224_felem_scalar(p224_felem out, const p224_limb scalar) { - out[0] *= scalar; - out[1] *= scalar; - out[2] *= scalar; - out[3] *= scalar; -} - -// Multiply an unreduced field element by a scalar: out = out * scalar -// The scalars we actually use are small, so results fit without overflow -static void p224_widefelem_scalar(p224_widefelem out, - const p224_widelimb scalar) { - out[0] *= scalar; - out[1] *= scalar; - out[2] *= scalar; - out[3] *= scalar; - out[4] *= scalar; - out[5] *= scalar; - out[6] *= scalar; -} - -// Square a field element: out = in^2 -static void p224_felem_square(p224_widefelem out, const p224_felem in) { - p224_limb tmp0, tmp1, tmp2; - tmp0 = 2 * in[0]; - tmp1 = 2 * in[1]; - tmp2 = 2 * in[2]; - out[0] = ((p224_widelimb)in[0]) * in[0]; - out[1] = ((p224_widelimb)in[0]) * tmp1; - out[2] = ((p224_widelimb)in[0]) * tmp2 + ((p224_widelimb)in[1]) * in[1]; - out[3] = ((p224_widelimb)in[3]) * tmp0 + 
((p224_widelimb)in[1]) * tmp2; - out[4] = ((p224_widelimb)in[3]) * tmp1 + ((p224_widelimb)in[2]) * in[2]; - out[5] = ((p224_widelimb)in[3]) * tmp2; - out[6] = ((p224_widelimb)in[3]) * in[3]; -} - -// Multiply two field elements: out = in1 * in2 -static void p224_felem_mul(p224_widefelem out, const p224_felem in1, - const p224_felem in2) { - out[0] = ((p224_widelimb)in1[0]) * in2[0]; - out[1] = ((p224_widelimb)in1[0]) * in2[1] + ((p224_widelimb)in1[1]) * in2[0]; - out[2] = ((p224_widelimb)in1[0]) * in2[2] + ((p224_widelimb)in1[1]) * in2[1] + - ((p224_widelimb)in1[2]) * in2[0]; - out[3] = ((p224_widelimb)in1[0]) * in2[3] + ((p224_widelimb)in1[1]) * in2[2] + - ((p224_widelimb)in1[2]) * in2[1] + ((p224_widelimb)in1[3]) * in2[0]; - out[4] = ((p224_widelimb)in1[1]) * in2[3] + ((p224_widelimb)in1[2]) * in2[2] + - ((p224_widelimb)in1[3]) * in2[1]; - out[5] = ((p224_widelimb)in1[2]) * in2[3] + ((p224_widelimb)in1[3]) * in2[2]; - out[6] = ((p224_widelimb)in1[3]) * in2[3]; -} - -// Reduce seven 128-bit coefficients to four 64-bit coefficients. 
-// Requires in[i] < 2^126, -// ensures out[0] < 2^56, out[1] < 2^56, out[2] < 2^56, out[3] <= 2^56 + 2^16 -static void p224_felem_reduce(p224_felem out, const p224_widefelem in) { - static const p224_widelimb two127p15 = - (((p224_widelimb)1) << 127) + (((p224_widelimb)1) << 15); - static const p224_widelimb two127m71 = - (((p224_widelimb)1) << 127) - (((p224_widelimb)1) << 71); - static const p224_widelimb two127m71m55 = (((p224_widelimb)1) << 127) - - (((p224_widelimb)1) << 71) - - (((p224_widelimb)1) << 55); - p224_widelimb output[5]; - - // Add 0 mod 2^224-2^96+1 to ensure all differences are positive - output[0] = in[0] + two127p15; - output[1] = in[1] + two127m71m55; - output[2] = in[2] + two127m71; - output[3] = in[3]; - output[4] = in[4]; - - // Eliminate in[4], in[5], in[6] - output[4] += in[6] >> 16; - output[3] += (in[6] & 0xffff) << 40; - output[2] -= in[6]; - - output[3] += in[5] >> 16; - output[2] += (in[5] & 0xffff) << 40; - output[1] -= in[5]; - - output[2] += output[4] >> 16; - output[1] += (output[4] & 0xffff) << 40; - output[0] -= output[4]; - - // Carry 2 -> 3 -> 4 - output[3] += output[2] >> 56; - output[2] &= 0x00ffffffffffffff; - - output[4] = output[3] >> 56; - output[3] &= 0x00ffffffffffffff; - - // Now output[2] < 2^56, output[3] < 2^56, output[4] < 2^72 - - // Eliminate output[4] - output[2] += output[4] >> 16; - // output[2] < 2^56 + 2^56 = 2^57 - output[1] += (output[4] & 0xffff) << 40; - output[0] -= output[4]; - - // Carry 0 -> 1 -> 2 -> 3 - output[1] += output[0] >> 56; - out[0] = output[0] & 0x00ffffffffffffff; - - output[2] += output[1] >> 56; - // output[2] < 2^57 + 2^72 - out[1] = output[1] & 0x00ffffffffffffff; - output[3] += output[2] >> 56; - // output[3] <= 2^56 + 2^16 - out[2] = output[2] & 0x00ffffffffffffff; - - // out[0] < 2^56, out[1] < 2^56, out[2] < 2^56, - // out[3] <= 2^56 + 2^16 (due to final carry), - // so out < 2*p - out[3] = output[3]; -} - -// Get negative value: out = -in -// Requires in[i] < 2^63, -// 
ensures out[0] < 2^56, out[1] < 2^56, out[2] < 2^56, out[3] <= 2^56 + 2^16 -static void p224_felem_neg(p224_felem out, const p224_felem in) { - p224_widefelem tmp = {0}; - p224_felem_diff_128_64(tmp, in); - p224_felem_reduce(out, tmp); -} - -// Zero-check: returns 1 if input is 0, and 0 otherwise. We know that field -// elements are reduced to in < 2^225, so we only need to check three cases: 0, -// 2^224 - 2^96 + 1, and 2^225 - 2^97 + 2 -static p224_limb p224_felem_is_zero(const p224_felem in) { - p224_limb zero = in[0] | in[1] | in[2] | in[3]; - zero = (((int64_t)(zero)-1) >> 63) & 1; - - p224_limb two224m96p1 = (in[0] ^ 1) | (in[1] ^ 0x00ffff0000000000) | - (in[2] ^ 0x00ffffffffffffff) | - (in[3] ^ 0x00ffffffffffffff); - two224m96p1 = (((int64_t)(two224m96p1)-1) >> 63) & 1; - p224_limb two225m97p2 = (in[0] ^ 2) | (in[1] ^ 0x00fffe0000000000) | - (in[2] ^ 0x00ffffffffffffff) | - (in[3] ^ 0x01ffffffffffffff); - two225m97p2 = (((int64_t)(two225m97p2)-1) >> 63) & 1; - return (zero | two224m96p1 | two225m97p2); -} - -// Invert a field element -// Computation chain copied from djb's code -static void p224_felem_inv(p224_felem out, const p224_felem in) { - p224_felem ftmp, ftmp2, ftmp3, ftmp4; - p224_widefelem tmp; - - p224_felem_square(tmp, in); - p224_felem_reduce(ftmp, tmp); // 2 - p224_felem_mul(tmp, in, ftmp); - p224_felem_reduce(ftmp, tmp); // 2^2 - 1 - p224_felem_square(tmp, ftmp); - p224_felem_reduce(ftmp, tmp); // 2^3 - 2 - p224_felem_mul(tmp, in, ftmp); - p224_felem_reduce(ftmp, tmp); // 2^3 - 1 - p224_felem_square(tmp, ftmp); - p224_felem_reduce(ftmp2, tmp); // 2^4 - 2 - p224_felem_square(tmp, ftmp2); - p224_felem_reduce(ftmp2, tmp); // 2^5 - 4 - p224_felem_square(tmp, ftmp2); - p224_felem_reduce(ftmp2, tmp); // 2^6 - 8 - p224_felem_mul(tmp, ftmp2, ftmp); - p224_felem_reduce(ftmp, tmp); // 2^6 - 1 - p224_felem_square(tmp, ftmp); - p224_felem_reduce(ftmp2, tmp); // 2^7 - 2 - for (size_t i = 0; i < 5; ++i) { // 2^12 - 2^6 - p224_felem_square(tmp, ftmp2); - 
p224_felem_reduce(ftmp2, tmp); - } - p224_felem_mul(tmp, ftmp2, ftmp); - p224_felem_reduce(ftmp2, tmp); // 2^12 - 1 - p224_felem_square(tmp, ftmp2); - p224_felem_reduce(ftmp3, tmp); // 2^13 - 2 - for (size_t i = 0; i < 11; ++i) { // 2^24 - 2^12 - p224_felem_square(tmp, ftmp3); - p224_felem_reduce(ftmp3, tmp); - } - p224_felem_mul(tmp, ftmp3, ftmp2); - p224_felem_reduce(ftmp2, tmp); // 2^24 - 1 - p224_felem_square(tmp, ftmp2); - p224_felem_reduce(ftmp3, tmp); // 2^25 - 2 - for (size_t i = 0; i < 23; ++i) { // 2^48 - 2^24 - p224_felem_square(tmp, ftmp3); - p224_felem_reduce(ftmp3, tmp); - } - p224_felem_mul(tmp, ftmp3, ftmp2); - p224_felem_reduce(ftmp3, tmp); // 2^48 - 1 - p224_felem_square(tmp, ftmp3); - p224_felem_reduce(ftmp4, tmp); // 2^49 - 2 - for (size_t i = 0; i < 47; ++i) { // 2^96 - 2^48 - p224_felem_square(tmp, ftmp4); - p224_felem_reduce(ftmp4, tmp); - } - p224_felem_mul(tmp, ftmp3, ftmp4); - p224_felem_reduce(ftmp3, tmp); // 2^96 - 1 - p224_felem_square(tmp, ftmp3); - p224_felem_reduce(ftmp4, tmp); // 2^97 - 2 - for (size_t i = 0; i < 23; ++i) { // 2^120 - 2^24 - p224_felem_square(tmp, ftmp4); - p224_felem_reduce(ftmp4, tmp); - } - p224_felem_mul(tmp, ftmp2, ftmp4); - p224_felem_reduce(ftmp2, tmp); // 2^120 - 1 - for (size_t i = 0; i < 6; ++i) { // 2^126 - 2^6 - p224_felem_square(tmp, ftmp2); - p224_felem_reduce(ftmp2, tmp); - } - p224_felem_mul(tmp, ftmp2, ftmp); - p224_felem_reduce(ftmp, tmp); // 2^126 - 1 - p224_felem_square(tmp, ftmp); - p224_felem_reduce(ftmp, tmp); // 2^127 - 2 - p224_felem_mul(tmp, ftmp, in); - p224_felem_reduce(ftmp, tmp); // 2^127 - 1 - for (size_t i = 0; i < 97; ++i) { // 2^224 - 2^97 - p224_felem_square(tmp, ftmp); - p224_felem_reduce(ftmp, tmp); - } - p224_felem_mul(tmp, ftmp, ftmp3); - p224_felem_reduce(out, tmp); // 2^224 - 2^96 - 1 -} - -// Copy in constant time: -// if icopy == 1, copy in to out, -// if icopy == 0, copy out to itself. 
-static void p224_copy_conditional(p224_felem out, const p224_felem in, - p224_limb icopy) { - // icopy is a (64-bit) 0 or 1, so copy is either all-zero or all-one - const p224_limb copy = -icopy; - for (size_t i = 0; i < 4; ++i) { - const p224_limb tmp = copy & (in[i] ^ out[i]); - out[i] ^= tmp; - } -} - -// ELLIPTIC CURVE POINT OPERATIONS -// -// Points are represented in Jacobian projective coordinates: -// (X, Y, Z) corresponds to the affine point (X/Z^2, Y/Z^3), -// or to the point at infinity if Z == 0. - -// Double an elliptic curve point: -// (X', Y', Z') = 2 * (X, Y, Z), where -// X' = (3 * (X - Z^2) * (X + Z^2))^2 - 8 * X * Y^2 -// Y' = 3 * (X - Z^2) * (X + Z^2) * (4 * X * Y^2 - X') - 8 * Y^2 -// Z' = (Y + Z)^2 - Y^2 - Z^2 = 2 * Y * Z -// Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed, -// while x_out == y_in is not (maybe this works, but it's not tested). -static void p224_point_double(p224_felem x_out, p224_felem y_out, - p224_felem z_out, const p224_felem x_in, - const p224_felem y_in, const p224_felem z_in) { - p224_widefelem tmp, tmp2; - p224_felem delta, gamma, beta, alpha, ftmp, ftmp2; - - p224_felem_assign(ftmp, x_in); - p224_felem_assign(ftmp2, x_in); - - // delta = z^2 - p224_felem_square(tmp, z_in); - p224_felem_reduce(delta, tmp); - - // gamma = y^2 - p224_felem_square(tmp, y_in); - p224_felem_reduce(gamma, tmp); - - // beta = x*gamma - p224_felem_mul(tmp, x_in, gamma); - p224_felem_reduce(beta, tmp); - - // alpha = 3*(x-delta)*(x+delta) - p224_felem_diff(ftmp, delta); - // ftmp[i] < 2^57 + 2^58 + 2 < 2^59 - p224_felem_sum(ftmp2, delta); - // ftmp2[i] < 2^57 + 2^57 = 2^58 - p224_felem_scalar(ftmp2, 3); - // ftmp2[i] < 3 * 2^58 < 2^60 - p224_felem_mul(tmp, ftmp, ftmp2); - // tmp[i] < 2^60 * 2^59 * 4 = 2^121 - p224_felem_reduce(alpha, tmp); - - // x' = alpha^2 - 8*beta - p224_felem_square(tmp, alpha); - // tmp[i] < 4 * 2^57 * 2^57 = 2^116 - p224_felem_assign(ftmp, beta); - p224_felem_scalar(ftmp, 8); - // ftmp[i] < 8 * 
2^57 = 2^60 - p224_felem_diff_128_64(tmp, ftmp); - // tmp[i] < 2^116 + 2^64 + 8 < 2^117 - p224_felem_reduce(x_out, tmp); - - // z' = (y + z)^2 - gamma - delta - p224_felem_sum(delta, gamma); - // delta[i] < 2^57 + 2^57 = 2^58 - p224_felem_assign(ftmp, y_in); - p224_felem_sum(ftmp, z_in); - // ftmp[i] < 2^57 + 2^57 = 2^58 - p224_felem_square(tmp, ftmp); - // tmp[i] < 4 * 2^58 * 2^58 = 2^118 - p224_felem_diff_128_64(tmp, delta); - // tmp[i] < 2^118 + 2^64 + 8 < 2^119 - p224_felem_reduce(z_out, tmp); - - // y' = alpha*(4*beta - x') - 8*gamma^2 - p224_felem_scalar(beta, 4); - // beta[i] < 4 * 2^57 = 2^59 - p224_felem_diff(beta, x_out); - // beta[i] < 2^59 + 2^58 + 2 < 2^60 - p224_felem_mul(tmp, alpha, beta); - // tmp[i] < 4 * 2^57 * 2^60 = 2^119 - p224_felem_square(tmp2, gamma); - // tmp2[i] < 4 * 2^57 * 2^57 = 2^116 - p224_widefelem_scalar(tmp2, 8); - // tmp2[i] < 8 * 2^116 = 2^119 - p224_widefelem_diff(tmp, tmp2); - // tmp[i] < 2^119 + 2^120 < 2^121 - p224_felem_reduce(y_out, tmp); -} - -// Add two elliptic curve points: -// (X_1, Y_1, Z_1) + (X_2, Y_2, Z_2) = (X_3, Y_3, Z_3), where -// X_3 = (Z_1^3 * Y_2 - Z_2^3 * Y_1)^2 - (Z_1^2 * X_2 - Z_2^2 * X_1)^3 - -// 2 * Z_2^2 * X_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^2 -// Y_3 = (Z_1^3 * Y_2 - Z_2^3 * Y_1) * (Z_2^2 * X_1 * (Z_1^2 * X_2 - Z_2^2 * -// X_1)^2 - X_3) - -// Z_2^3 * Y_1 * (Z_1^2 * X_2 - Z_2^2 * X_1)^3 -// Z_3 = (Z_1^2 * X_2 - Z_2^2 * X_1) * (Z_1 * Z_2) -// -// This runs faster if 'mixed' is set, which requires Z_2 = 1 or Z_2 = 0. - -// This function is not entirely constant-time: it includes a branch for -// checking whether the two input points are equal, (while not equal to the -// point at infinity). This case never happens during single point -// multiplication, so there is no timing leak for ECDH or ECDSA signing. 
-static void p224_point_add(p224_felem x3, p224_felem y3, p224_felem z3, - const p224_felem x1, const p224_felem y1, - const p224_felem z1, const int mixed, - const p224_felem x2, const p224_felem y2, - const p224_felem z2) { - p224_felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, x_out, y_out, z_out; - p224_widefelem tmp, tmp2; - p224_limb z1_is_zero, z2_is_zero, x_equal, y_equal; - - if (!mixed) { - // ftmp2 = z2^2 - p224_felem_square(tmp, z2); - p224_felem_reduce(ftmp2, tmp); - - // ftmp4 = z2^3 - p224_felem_mul(tmp, ftmp2, z2); - p224_felem_reduce(ftmp4, tmp); - - // ftmp4 = z2^3*y1 - p224_felem_mul(tmp2, ftmp4, y1); - p224_felem_reduce(ftmp4, tmp2); - - // ftmp2 = z2^2*x1 - p224_felem_mul(tmp2, ftmp2, x1); - p224_felem_reduce(ftmp2, tmp2); - } else { - // We'll assume z2 = 1 (special case z2 = 0 is handled later) - - // ftmp4 = z2^3*y1 - p224_felem_assign(ftmp4, y1); - - // ftmp2 = z2^2*x1 - p224_felem_assign(ftmp2, x1); - } - - // ftmp = z1^2 - p224_felem_square(tmp, z1); - p224_felem_reduce(ftmp, tmp); - - // ftmp3 = z1^3 - p224_felem_mul(tmp, ftmp, z1); - p224_felem_reduce(ftmp3, tmp); - - // tmp = z1^3*y2 - p224_felem_mul(tmp, ftmp3, y2); - // tmp[i] < 4 * 2^57 * 2^57 = 2^116 - - // ftmp3 = z1^3*y2 - z2^3*y1 - p224_felem_diff_128_64(tmp, ftmp4); - // tmp[i] < 2^116 + 2^64 + 8 < 2^117 - p224_felem_reduce(ftmp3, tmp); - - // tmp = z1^2*x2 - p224_felem_mul(tmp, ftmp, x2); - // tmp[i] < 4 * 2^57 * 2^57 = 2^116 - - // ftmp = z1^2*x2 - z2^2*x1 - p224_felem_diff_128_64(tmp, ftmp2); - // tmp[i] < 2^116 + 2^64 + 8 < 2^117 - p224_felem_reduce(ftmp, tmp); - - // the formulae are incorrect if the points are equal - // so we check for this and do doubling if this happens - x_equal = p224_felem_is_zero(ftmp); - y_equal = p224_felem_is_zero(ftmp3); - z1_is_zero = p224_felem_is_zero(z1); - z2_is_zero = p224_felem_is_zero(z2); - // In affine coordinates, (X_1, Y_1) == (X_2, Y_2) - p224_limb is_nontrivial_double = - x_equal & y_equal & (1 - z1_is_zero) & (1 - z2_is_zero); - if 
(is_nontrivial_double) { - p224_point_double(x3, y3, z3, x1, y1, z1); - return; - } - - // ftmp5 = z1*z2 - if (!mixed) { - p224_felem_mul(tmp, z1, z2); - p224_felem_reduce(ftmp5, tmp); - } else { - // special case z2 = 0 is handled later - p224_felem_assign(ftmp5, z1); - } - - // z_out = (z1^2*x2 - z2^2*x1)*(z1*z2) - p224_felem_mul(tmp, ftmp, ftmp5); - p224_felem_reduce(z_out, tmp); - - // ftmp = (z1^2*x2 - z2^2*x1)^2 - p224_felem_assign(ftmp5, ftmp); - p224_felem_square(tmp, ftmp); - p224_felem_reduce(ftmp, tmp); - - // ftmp5 = (z1^2*x2 - z2^2*x1)^3 - p224_felem_mul(tmp, ftmp, ftmp5); - p224_felem_reduce(ftmp5, tmp); - - // ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - p224_felem_mul(tmp, ftmp2, ftmp); - p224_felem_reduce(ftmp2, tmp); - - // tmp = z2^3*y1*(z1^2*x2 - z2^2*x1)^3 - p224_felem_mul(tmp, ftmp4, ftmp5); - // tmp[i] < 4 * 2^57 * 2^57 = 2^116 - - // tmp2 = (z1^3*y2 - z2^3*y1)^2 - p224_felem_square(tmp2, ftmp3); - // tmp2[i] < 4 * 2^57 * 2^57 < 2^116 - - // tmp2 = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3 - p224_felem_diff_128_64(tmp2, ftmp5); - // tmp2[i] < 2^116 + 2^64 + 8 < 2^117 - - // ftmp5 = 2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - p224_felem_assign(ftmp5, ftmp2); - p224_felem_scalar(ftmp5, 2); - // ftmp5[i] < 2 * 2^57 = 2^58 - - /* x_out = (z1^3*y2 - z2^3*y1)^2 - (z1^2*x2 - z2^2*x1)^3 - - 2*z2^2*x1*(z1^2*x2 - z2^2*x1)^2 */ - p224_felem_diff_128_64(tmp2, ftmp5); - // tmp2[i] < 2^117 + 2^64 + 8 < 2^118 - p224_felem_reduce(x_out, tmp2); - - // ftmp2 = z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out - p224_felem_diff(ftmp2, x_out); - // ftmp2[i] < 2^57 + 2^58 + 2 < 2^59 - - // tmp2 = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out) - p224_felem_mul(tmp2, ftmp3, ftmp2); - // tmp2[i] < 4 * 2^57 * 2^59 = 2^118 - - /* y_out = (z1^3*y2 - z2^3*y1)*(z2^2*x1*(z1^2*x2 - z2^2*x1)^2 - x_out) - - z2^3*y1*(z1^2*x2 - z2^2*x1)^3 */ - p224_widefelem_diff(tmp2, tmp); - // tmp2[i] < 2^118 + 2^120 < 2^121 - p224_felem_reduce(y_out, tmp2); - - // the result (x_out, y_out, z_out) is 
incorrect if one of the inputs is - // the point at infinity, so we need to check for this separately - - // if point 1 is at infinity, copy point 2 to output, and vice versa - p224_copy_conditional(x_out, x2, z1_is_zero); - p224_copy_conditional(x_out, x1, z2_is_zero); - p224_copy_conditional(y_out, y2, z1_is_zero); - p224_copy_conditional(y_out, y1, z2_is_zero); - p224_copy_conditional(z_out, z2, z1_is_zero); - p224_copy_conditional(z_out, z1, z2_is_zero); - p224_felem_assign(x3, x_out); - p224_felem_assign(y3, y_out); - p224_felem_assign(z3, z_out); -} - -// p224_select_point selects the |idx|th point from a precomputation table and -// copies it to out. -static void p224_select_point(const uint64_t idx, size_t size, - const p224_felem pre_comp[/*size*/][3], - p224_felem out[3]) { - p224_limb *outlimbs = &out[0][0]; - OPENSSL_memset(outlimbs, 0, 3 * sizeof(p224_felem)); - - for (size_t i = 0; i < size; i++) { - const p224_limb *inlimbs = &pre_comp[i][0][0]; - uint64_t mask = i ^ idx; - mask |= mask >> 4; - mask |= mask >> 2; - mask |= mask >> 1; - mask &= 1; - mask--; - for (size_t j = 0; j < 4 * 3; j++) { - outlimbs[j] |= inlimbs[j] & mask; - } - } -} - -// p224_get_bit returns the |i|th bit in |in|. 
-static crypto_word_t p224_get_bit(const EC_SCALAR *in, size_t i) { - if (i >= 224) { - return 0; - } - static_assert(sizeof(in->words[0]) == 8, "BN_ULONG is not 64-bit"); - return (in->words[i >> 6] >> (i & 63)) & 1; -} - -// Takes the Jacobian coordinates (X, Y, Z) of a point and returns -// (X', Y') = (X/Z^2, Y/Z^3) -static int ec_GFp_nistp224_point_get_affine_coordinates( - const EC_GROUP *group, const EC_RAW_POINT *point, EC_FELEM *x, - EC_FELEM *y) { - if (ec_GFp_simple_is_at_infinity(group, point)) { - OPENSSL_PUT_ERROR(EC, EC_R_POINT_AT_INFINITY); - return 0; - } - - p224_felem z1, z2; - p224_widefelem tmp; - p224_generic_to_felem(z1, &point->Z); - p224_felem_inv(z2, z1); - p224_felem_square(tmp, z2); - p224_felem_reduce(z1, tmp); - - if (x != NULL) { - p224_felem x_in, x_out; - p224_generic_to_felem(x_in, &point->X); - p224_felem_mul(tmp, x_in, z1); - p224_felem_reduce(x_out, tmp); - p224_felem_to_generic(x, x_out); - } - - if (y != NULL) { - p224_felem y_in, y_out; - p224_generic_to_felem(y_in, &point->Y); - p224_felem_mul(tmp, z1, z2); - p224_felem_reduce(z1, tmp); - p224_felem_mul(tmp, y_in, z1); - p224_felem_reduce(y_out, tmp); - p224_felem_to_generic(y, y_out); - } - - return 1; -} - -static void ec_GFp_nistp224_add(const EC_GROUP *group, EC_RAW_POINT *r, - const EC_RAW_POINT *a, const EC_RAW_POINT *b) { - p224_felem x1, y1, z1, x2, y2, z2; - p224_generic_to_felem(x1, &a->X); - p224_generic_to_felem(y1, &a->Y); - p224_generic_to_felem(z1, &a->Z); - p224_generic_to_felem(x2, &b->X); - p224_generic_to_felem(y2, &b->Y); - p224_generic_to_felem(z2, &b->Z); - p224_point_add(x1, y1, z1, x1, y1, z1, 0 /* both Jacobian */, x2, y2, z2); - // The outputs are already reduced, but still need to be contracted. 
- p224_felem_to_generic(&r->X, x1); - p224_felem_to_generic(&r->Y, y1); - p224_felem_to_generic(&r->Z, z1); -} - -static void ec_GFp_nistp224_dbl(const EC_GROUP *group, EC_RAW_POINT *r, - const EC_RAW_POINT *a) { - p224_felem x, y, z; - p224_generic_to_felem(x, &a->X); - p224_generic_to_felem(y, &a->Y); - p224_generic_to_felem(z, &a->Z); - p224_point_double(x, y, z, x, y, z); - // The outputs are already reduced, but still need to be contracted. - p224_felem_to_generic(&r->X, x); - p224_felem_to_generic(&r->Y, y); - p224_felem_to_generic(&r->Z, z); -} - -static void ec_GFp_nistp224_make_precomp(p224_felem out[17][3], - const EC_RAW_POINT *p) { - OPENSSL_memset(out[0], 0, sizeof(p224_felem) * 3); - - p224_generic_to_felem(out[1][0], &p->X); - p224_generic_to_felem(out[1][1], &p->Y); - p224_generic_to_felem(out[1][2], &p->Z); - - for (size_t j = 2; j <= 16; ++j) { - if (j & 1) { - p224_point_add(out[j][0], out[j][1], out[j][2], out[1][0], out[1][1], - out[1][2], 0, out[j - 1][0], out[j - 1][1], out[j - 1][2]); - } else { - p224_point_double(out[j][0], out[j][1], out[j][2], out[j / 2][0], - out[j / 2][1], out[j / 2][2]); - } - } -} - -static void ec_GFp_nistp224_point_mul(const EC_GROUP *group, EC_RAW_POINT *r, - const EC_RAW_POINT *p, - const EC_SCALAR *scalar) { - p224_felem p_pre_comp[17][3]; - ec_GFp_nistp224_make_precomp(p_pre_comp, p); - - // Set nq to the point at infinity. - p224_felem nq[3], tmp[4]; - OPENSSL_memset(nq, 0, 3 * sizeof(p224_felem)); - - int skip = 1; // Save two point operations in the first round. - for (size_t i = 220; i < 221; i--) { - if (!skip) { - p224_point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]); - } - - // Add every 5 doublings. 
- if (i % 5 == 0) { - crypto_word_t bits = p224_get_bit(scalar, i + 4) << 5; - bits |= p224_get_bit(scalar, i + 3) << 4; - bits |= p224_get_bit(scalar, i + 2) << 3; - bits |= p224_get_bit(scalar, i + 1) << 2; - bits |= p224_get_bit(scalar, i) << 1; - bits |= p224_get_bit(scalar, i - 1); - crypto_word_t sign, digit; - ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits); - - // Select the point to add or subtract. - p224_select_point(digit, 17, (const p224_felem(*)[3])p_pre_comp, tmp); - p224_felem_neg(tmp[3], tmp[1]); // (X, -Y, Z) is the negative point - p224_copy_conditional(tmp[1], tmp[3], sign); - - if (!skip) { - p224_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 0 /* mixed */, - tmp[0], tmp[1], tmp[2]); - } else { - OPENSSL_memcpy(nq, tmp, 3 * sizeof(p224_felem)); - skip = 0; - } - } - } - - // Reduce the output to its unique minimal representation. - p224_felem_to_generic(&r->X, nq[0]); - p224_felem_to_generic(&r->Y, nq[1]); - p224_felem_to_generic(&r->Z, nq[2]); -} - -static void ec_GFp_nistp224_point_mul_base(const EC_GROUP *group, - EC_RAW_POINT *r, - const EC_SCALAR *scalar) { - // Set nq to the point at infinity. - p224_felem nq[3], tmp[3]; - OPENSSL_memset(nq, 0, 3 * sizeof(p224_felem)); - - int skip = 1; // Save two point operations in the first round. - for (size_t i = 27; i < 28; i--) { - // double - if (!skip) { - p224_point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]); - } - - // First, look 28 bits upwards. - crypto_word_t bits = p224_get_bit(scalar, i + 196) << 3; - bits |= p224_get_bit(scalar, i + 140) << 2; - bits |= p224_get_bit(scalar, i + 84) << 1; - bits |= p224_get_bit(scalar, i + 28); - // Select the point to add, in constant time. 
- p224_select_point(bits, 16, g_p224_pre_comp[1], tmp); - - if (!skip) { - p224_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 1 /* mixed */, - tmp[0], tmp[1], tmp[2]); - } else { - OPENSSL_memcpy(nq, tmp, 3 * sizeof(p224_felem)); - skip = 0; - } - - // Second, look at the current position/ - bits = p224_get_bit(scalar, i + 168) << 3; - bits |= p224_get_bit(scalar, i + 112) << 2; - bits |= p224_get_bit(scalar, i + 56) << 1; - bits |= p224_get_bit(scalar, i); - // Select the point to add, in constant time. - p224_select_point(bits, 16, g_p224_pre_comp[0], tmp); - p224_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 1 /* mixed */, - tmp[0], tmp[1], tmp[2]); - } - - // Reduce the output to its unique minimal representation. - p224_felem_to_generic(&r->X, nq[0]); - p224_felem_to_generic(&r->Y, nq[1]); - p224_felem_to_generic(&r->Z, nq[2]); -} - -static void ec_GFp_nistp224_point_mul_public(const EC_GROUP *group, - EC_RAW_POINT *r, - const EC_SCALAR *g_scalar, - const EC_RAW_POINT *p, - const EC_SCALAR *p_scalar) { - // TODO(davidben): If P-224 ECDSA verify performance ever matters, using - // |ec_compute_wNAF| for |p_scalar| would likely be an easy improvement. - p224_felem p_pre_comp[17][3]; - ec_GFp_nistp224_make_precomp(p_pre_comp, p); - - // Set nq to the point at infinity. - p224_felem nq[3], tmp[3]; - OPENSSL_memset(nq, 0, 3 * sizeof(p224_felem)); - - // Loop over both scalars msb-to-lsb, interleaving additions of multiples of - // the generator (two in each of the last 28 rounds) and additions of p (every - // 5th round). - int skip = 1; // Save two point operations in the first round. - for (size_t i = 220; i < 221; i--) { - if (!skip) { - p224_point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]); - } - - // Add multiples of the generator. - if (i <= 27) { - // First, look 28 bits upwards. 
- crypto_word_t bits = p224_get_bit(g_scalar, i + 196) << 3; - bits |= p224_get_bit(g_scalar, i + 140) << 2; - bits |= p224_get_bit(g_scalar, i + 84) << 1; - bits |= p224_get_bit(g_scalar, i + 28); - - size_t index = (size_t)bits; - p224_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 1 /* mixed */, - g_p224_pre_comp[1][index][0], g_p224_pre_comp[1][index][1], - g_p224_pre_comp[1][index][2]); - assert(!skip); - - // Second, look at the current position. - bits = p224_get_bit(g_scalar, i + 168) << 3; - bits |= p224_get_bit(g_scalar, i + 112) << 2; - bits |= p224_get_bit(g_scalar, i + 56) << 1; - bits |= p224_get_bit(g_scalar, i); - index = (size_t)bits; - p224_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 1 /* mixed */, - g_p224_pre_comp[0][index][0], g_p224_pre_comp[0][index][1], - g_p224_pre_comp[0][index][2]); - } - - // Incorporate |p_scalar| every 5 doublings. - if (i % 5 == 0) { - crypto_word_t bits = p224_get_bit(p_scalar, i + 4) << 5; - bits |= p224_get_bit(p_scalar, i + 3) << 4; - bits |= p224_get_bit(p_scalar, i + 2) << 3; - bits |= p224_get_bit(p_scalar, i + 1) << 2; - bits |= p224_get_bit(p_scalar, i) << 1; - bits |= p224_get_bit(p_scalar, i - 1); - crypto_word_t sign, digit; - ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits); - - // Select the point to add or subtract. - OPENSSL_memcpy(tmp, p_pre_comp[digit], 3 * sizeof(p224_felem)); - if (sign) { - p224_felem_neg(tmp[1], tmp[1]); // (X, -Y, Z) is the negative point - } - - if (!skip) { - p224_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 0 /* mixed */, - tmp[0], tmp[1], tmp[2]); - } else { - OPENSSL_memcpy(nq, tmp, 3 * sizeof(p224_felem)); - skip = 0; - } - } - } - - // Reduce the output to its unique minimal representation. 
- p224_felem_to_generic(&r->X, nq[0]); - p224_felem_to_generic(&r->Y, nq[1]); - p224_felem_to_generic(&r->Z, nq[2]); -} - -static void ec_GFp_nistp224_felem_mul(const EC_GROUP *group, EC_FELEM *r, - const EC_FELEM *a, const EC_FELEM *b) { - p224_felem felem1, felem2; - p224_widefelem wide; - p224_generic_to_felem(felem1, a); - p224_generic_to_felem(felem2, b); - p224_felem_mul(wide, felem1, felem2); - p224_felem_reduce(felem1, wide); - p224_felem_to_generic(r, felem1); -} - -static void ec_GFp_nistp224_felem_sqr(const EC_GROUP *group, EC_FELEM *r, - const EC_FELEM *a) { - p224_felem felem; - p224_generic_to_felem(felem, a); - p224_widefelem wide; - p224_felem_square(wide, felem); - p224_felem_reduce(felem, wide); - p224_felem_to_generic(r, felem); -} - -DEFINE_METHOD_FUNCTION(EC_METHOD, EC_GFp_nistp224_method) { - out->group_init = ec_GFp_simple_group_init; - out->group_finish = ec_GFp_simple_group_finish; - out->group_set_curve = ec_GFp_simple_group_set_curve; - out->point_get_affine_coordinates = - ec_GFp_nistp224_point_get_affine_coordinates; - out->add = ec_GFp_nistp224_add; - out->dbl = ec_GFp_nistp224_dbl; - out->mul = ec_GFp_nistp224_point_mul; - out->mul_base = ec_GFp_nistp224_point_mul_base; - out->mul_public = ec_GFp_nistp224_point_mul_public; - out->felem_mul = ec_GFp_nistp224_felem_mul; - out->felem_sqr = ec_GFp_nistp224_felem_sqr; - out->felem_to_bytes = ec_GFp_simple_felem_to_bytes; - out->felem_from_bytes = ec_GFp_simple_felem_from_bytes; - out->scalar_inv0_montgomery = ec_simple_scalar_inv0_montgomery; - out->scalar_to_montgomery_inv_vartime = - ec_simple_scalar_to_montgomery_inv_vartime; - out->cmp_x_coordinate = ec_GFp_simple_cmp_x_coordinate; -} - -#endif // BORINGSSL_HAS_UINT128 && !SMALL diff --git a/third_party/boringssl/src/crypto/fipsmodule/ec/p256-nistz-table.h b/third_party/boringssl/src/crypto/fipsmodule/ec/p256-nistz-table.h index b81480bd..89739db4 100644 --- a/third_party/boringssl/src/crypto/fipsmodule/ec/p256-nistz-table.h +++ 
b/third_party/boringssl/src/crypto/fipsmodule/ec/p256-nistz-table.h @@ -1,12 +1,17 @@ -/* - * Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. - * Copyright (c) 2015, Intel Inc. - * - * Licensed under the OpenSSL license (the "License"). You may not use - * this file except in compliance with the License. You can obtain a copy - * in the file LICENSE in the source distribution or at - * https://www.openssl.org/source/license.html - */ +// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +// Copyright (c) 2015, Intel Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. // This is the precomputed constant time access table for the code in // p256-nistz.c, for the default generator. The table consists of 37 @@ -22,7 +27,7 @@ // This file is generated by make_tables.go. 
-static const alignas(4096) PRECOMP256_ROW ecp_nistz256_precomputed[37] = { +alignas(4096) static const PRECOMP256_ROW ecp_nistz256_precomputed[37] = { {{{TOBN(0x79e730d4, 0x18a9143c), TOBN(0x75ba95fc, 0x5fedb601), TOBN(0x79fb732b, 0x77622510), TOBN(0x18905f76, 0xa53755c6)}, {TOBN(0xddf25357, 0xce95560a), TOBN(0x8b4ab8e4, 0xba19e45c), diff --git a/third_party/boringssl/src/crypto/fipsmodule/ec/p256-nistz.c b/third_party/boringssl/src/crypto/fipsmodule/ec/p256-nistz.c deleted file mode 100644 index 996c2fe3..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/ec/p256-nistz.c +++ /dev/null @@ -1,636 +0,0 @@ -/* - * Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. - * Copyright (c) 2014, Intel Corporation. All Rights Reserved. - * - * Licensed under the OpenSSL license (the "License"). You may not use - * this file except in compliance with the License. You can obtain a copy - * in the file LICENSE in the source distribution or at - * https://www.openssl.org/source/license.html - * - * Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1) - * (1) Intel Corporation, Israel Development Center, Haifa, Israel - * (2) University of Haifa, Israel - * - * Reference: - * S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with - * 256 Bit Primes" - */ - -#include - -#include -#include -#include - -#include -#include -#include - -#include "../bn/internal.h" -#include "../delocate.h" -#include "../../internal.h" -#include "internal.h" -#include "p256-nistz.h" - -#if !defined(OPENSSL_NO_ASM) && \ - (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) && \ - !defined(OPENSSL_SMALL) - -typedef P256_POINT_AFFINE PRECOMP256_ROW[64]; - -// One converted into the Montgomery domain -static const BN_ULONG ONE[P256_LIMBS] = { - TOBN(0x00000000, 0x00000001), TOBN(0xffffffff, 0x00000000), - TOBN(0xffffffff, 0xffffffff), TOBN(0x00000000, 0xfffffffe), -}; - -// Precomputed tables for the default generator -#include "p256-nistz-table.h" 
- -// Recode window to a signed digit, see |ec_GFp_nistp_recode_scalar_bits| in -// util.c for details -static crypto_word_t booth_recode_w5(crypto_word_t in) { - crypto_word_t s, d; - - s = ~((in >> 5) - 1); - d = (1 << 6) - in - 1; - d = (d & s) | (in & ~s); - d = (d >> 1) + (d & 1); - - return (d << 1) + (s & 1); -} - -static crypto_word_t booth_recode_w7(crypto_word_t in) { - crypto_word_t s, d; - - s = ~((in >> 7) - 1); - d = (1 << 8) - in - 1; - d = (d & s) | (in & ~s); - d = (d >> 1) + (d & 1); - - return (d << 1) + (s & 1); -} - -// copy_conditional copies |src| to |dst| if |move| is one and leaves it as-is -// if |move| is zero. -// -// WARNING: this breaks the usual convention of constant-time functions -// returning masks. -static void copy_conditional(BN_ULONG dst[P256_LIMBS], - const BN_ULONG src[P256_LIMBS], BN_ULONG move) { - BN_ULONG mask1 = ((BN_ULONG)0) - move; - BN_ULONG mask2 = ~mask1; - - dst[0] = (src[0] & mask1) ^ (dst[0] & mask2); - dst[1] = (src[1] & mask1) ^ (dst[1] & mask2); - dst[2] = (src[2] & mask1) ^ (dst[2] & mask2); - dst[3] = (src[3] & mask1) ^ (dst[3] & mask2); - if (P256_LIMBS == 8) { - dst[4] = (src[4] & mask1) ^ (dst[4] & mask2); - dst[5] = (src[5] & mask1) ^ (dst[5] & mask2); - dst[6] = (src[6] & mask1) ^ (dst[6] & mask2); - dst[7] = (src[7] & mask1) ^ (dst[7] & mask2); - } -} - -// is_not_zero returns one iff in != 0 and zero otherwise. -// -// WARNING: this breaks the usual convention of constant-time functions -// returning masks. 
-// -// (define-fun is_not_zero ((in (_ BitVec 64))) (_ BitVec 64) -// (bvlshr (bvor in (bvsub #x0000000000000000 in)) #x000000000000003f) -// ) -// -// (declare-fun x () (_ BitVec 64)) -// -// (assert (and (= x #x0000000000000000) (= (is_not_zero x) #x0000000000000001))) -// (check-sat) -// -// (assert (and (not (= x #x0000000000000000)) (= (is_not_zero x) #x0000000000000000))) -// (check-sat) -// -static BN_ULONG is_not_zero(BN_ULONG in) { - in |= (0 - in); - in >>= BN_BITS2 - 1; - return in; -} - -// ecp_nistz256_mod_inverse_sqr_mont sets |r| to (|in| * 2^-256)^-2 * 2^256 mod -// p. That is, |r| is the modular inverse square of |in| for input and output in -// the Montgomery domain. -static void ecp_nistz256_mod_inverse_sqr_mont(BN_ULONG r[P256_LIMBS], - const BN_ULONG in[P256_LIMBS]) { - // This implements the addition chain described in - // https://briansmith.org/ecc-inversion-addition-chains-01#p256_field_inversion - BN_ULONG x2[P256_LIMBS], x3[P256_LIMBS], x6[P256_LIMBS], x12[P256_LIMBS], - x15[P256_LIMBS], x30[P256_LIMBS], x32[P256_LIMBS]; - ecp_nistz256_sqr_mont(x2, in); // 2^2 - 2^1 - ecp_nistz256_mul_mont(x2, x2, in); // 2^2 - 2^0 - - ecp_nistz256_sqr_mont(x3, x2); // 2^3 - 2^1 - ecp_nistz256_mul_mont(x3, x3, in); // 2^3 - 2^0 - - ecp_nistz256_sqr_mont(x6, x3); - for (int i = 1; i < 3; i++) { - ecp_nistz256_sqr_mont(x6, x6); - } // 2^6 - 2^3 - ecp_nistz256_mul_mont(x6, x6, x3); // 2^6 - 2^0 - - ecp_nistz256_sqr_mont(x12, x6); - for (int i = 1; i < 6; i++) { - ecp_nistz256_sqr_mont(x12, x12); - } // 2^12 - 2^6 - ecp_nistz256_mul_mont(x12, x12, x6); // 2^12 - 2^0 - - ecp_nistz256_sqr_mont(x15, x12); - for (int i = 1; i < 3; i++) { - ecp_nistz256_sqr_mont(x15, x15); - } // 2^15 - 2^3 - ecp_nistz256_mul_mont(x15, x15, x3); // 2^15 - 2^0 - - ecp_nistz256_sqr_mont(x30, x15); - for (int i = 1; i < 15; i++) { - ecp_nistz256_sqr_mont(x30, x30); - } // 2^30 - 2^15 - ecp_nistz256_mul_mont(x30, x30, x15); // 2^30 - 2^0 - - ecp_nistz256_sqr_mont(x32, x30); - 
ecp_nistz256_sqr_mont(x32, x32); // 2^32 - 2^2 - ecp_nistz256_mul_mont(x32, x32, x2); // 2^32 - 2^0 - - BN_ULONG ret[P256_LIMBS]; - ecp_nistz256_sqr_mont(ret, x32); - for (int i = 1; i < 31 + 1; i++) { - ecp_nistz256_sqr_mont(ret, ret); - } // 2^64 - 2^32 - ecp_nistz256_mul_mont(ret, ret, in); // 2^64 - 2^32 + 2^0 - - for (int i = 0; i < 96 + 32; i++) { - ecp_nistz256_sqr_mont(ret, ret); - } // 2^192 - 2^160 + 2^128 - ecp_nistz256_mul_mont(ret, ret, x32); // 2^192 - 2^160 + 2^128 + 2^32 - 2^0 - - for (int i = 0; i < 32; i++) { - ecp_nistz256_sqr_mont(ret, ret); - } // 2^224 - 2^192 + 2^160 + 2^64 - 2^32 - ecp_nistz256_mul_mont(ret, ret, x32); // 2^224 - 2^192 + 2^160 + 2^64 - 2^0 - - for (int i = 0; i < 30; i++) { - ecp_nistz256_sqr_mont(ret, ret); - } // 2^254 - 2^222 + 2^190 + 2^94 - 2^30 - ecp_nistz256_mul_mont(ret, ret, x30); // 2^254 - 2^222 + 2^190 + 2^94 - 2^0 - - ecp_nistz256_sqr_mont(ret, ret); - ecp_nistz256_sqr_mont(r, ret); // 2^256 - 2^224 + 2^192 + 2^96 - 2^2 -} - -// r = p * p_scalar -static void ecp_nistz256_windowed_mul(const EC_GROUP *group, P256_POINT *r, - const EC_RAW_POINT *p, - const EC_SCALAR *p_scalar) { - assert(p != NULL); - assert(p_scalar != NULL); - assert(group->field.width == P256_LIMBS); - - static const size_t kWindowSize = 5; - static const crypto_word_t kMask = (1 << (5 /* kWindowSize */ + 1)) - 1; - - // A |P256_POINT| is (3 * 32) = 96 bytes, and the 64-byte alignment should - // add no more than 63 bytes of overhead. Thus, |table| should require - // ~1599 ((96 * 16) + 63) bytes of stack space. - alignas(64) P256_POINT table[16]; - uint8_t p_str[33]; - OPENSSL_memcpy(p_str, p_scalar->words, 32); - p_str[32] = 0; - - // table[0] is implicitly (0,0,0) (the point at infinity), therefore it is - // not stored. All other values are actually stored with an offset of -1 in - // table. 
- P256_POINT *row = table; - assert(group->field.width == P256_LIMBS); - OPENSSL_memcpy(row[1 - 1].X, p->X.words, P256_LIMBS * sizeof(BN_ULONG)); - OPENSSL_memcpy(row[1 - 1].Y, p->Y.words, P256_LIMBS * sizeof(BN_ULONG)); - OPENSSL_memcpy(row[1 - 1].Z, p->Z.words, P256_LIMBS * sizeof(BN_ULONG)); - - ecp_nistz256_point_double(&row[2 - 1], &row[1 - 1]); - ecp_nistz256_point_add(&row[3 - 1], &row[2 - 1], &row[1 - 1]); - ecp_nistz256_point_double(&row[4 - 1], &row[2 - 1]); - ecp_nistz256_point_double(&row[6 - 1], &row[3 - 1]); - ecp_nistz256_point_double(&row[8 - 1], &row[4 - 1]); - ecp_nistz256_point_double(&row[12 - 1], &row[6 - 1]); - ecp_nistz256_point_add(&row[5 - 1], &row[4 - 1], &row[1 - 1]); - ecp_nistz256_point_add(&row[7 - 1], &row[6 - 1], &row[1 - 1]); - ecp_nistz256_point_add(&row[9 - 1], &row[8 - 1], &row[1 - 1]); - ecp_nistz256_point_add(&row[13 - 1], &row[12 - 1], &row[1 - 1]); - ecp_nistz256_point_double(&row[14 - 1], &row[7 - 1]); - ecp_nistz256_point_double(&row[10 - 1], &row[5 - 1]); - ecp_nistz256_point_add(&row[15 - 1], &row[14 - 1], &row[1 - 1]); - ecp_nistz256_point_add(&row[11 - 1], &row[10 - 1], &row[1 - 1]); - ecp_nistz256_point_double(&row[16 - 1], &row[8 - 1]); - - BN_ULONG tmp[P256_LIMBS]; - alignas(32) P256_POINT h; - size_t index = 255; - crypto_word_t wvalue = p_str[(index - 1) / 8]; - wvalue = (wvalue >> ((index - 1) % 8)) & kMask; - - ecp_nistz256_select_w5(r, table, booth_recode_w5(wvalue) >> 1); - - while (index >= 5) { - if (index != 255) { - size_t off = (index - 1) / 8; - - wvalue = (crypto_word_t)p_str[off] | (crypto_word_t)p_str[off + 1] << 8; - wvalue = (wvalue >> ((index - 1) % 8)) & kMask; - - wvalue = booth_recode_w5(wvalue); - - ecp_nistz256_select_w5(&h, table, wvalue >> 1); - - ecp_nistz256_neg(tmp, h.Y); - copy_conditional(h.Y, tmp, (wvalue & 1)); - - ecp_nistz256_point_add(r, r, &h); - } - - index -= kWindowSize; - - ecp_nistz256_point_double(r, r); - ecp_nistz256_point_double(r, r); - ecp_nistz256_point_double(r, r); - 
ecp_nistz256_point_double(r, r); - ecp_nistz256_point_double(r, r); - } - - // Final window - wvalue = p_str[0]; - wvalue = (wvalue << 1) & kMask; - - wvalue = booth_recode_w5(wvalue); - - ecp_nistz256_select_w5(&h, table, wvalue >> 1); - - ecp_nistz256_neg(tmp, h.Y); - copy_conditional(h.Y, tmp, wvalue & 1); - - ecp_nistz256_point_add(r, r, &h); -} - -static crypto_word_t calc_first_wvalue(size_t *index, const uint8_t p_str[33]) { - static const size_t kWindowSize = 7; - static const crypto_word_t kMask = (1 << (7 /* kWindowSize */ + 1)) - 1; - *index = kWindowSize; - - crypto_word_t wvalue = (p_str[0] << 1) & kMask; - return booth_recode_w7(wvalue); -} - -static crypto_word_t calc_wvalue(size_t *index, const uint8_t p_str[33]) { - static const size_t kWindowSize = 7; - static const crypto_word_t kMask = (1 << (7 /* kWindowSize */ + 1)) - 1; - - const size_t off = (*index - 1) / 8; - crypto_word_t wvalue = - (crypto_word_t)p_str[off] | (crypto_word_t)p_str[off + 1] << 8; - wvalue = (wvalue >> ((*index - 1) % 8)) & kMask; - *index += kWindowSize; - - return booth_recode_w7(wvalue); -} - -static void ecp_nistz256_point_mul(const EC_GROUP *group, EC_RAW_POINT *r, - const EC_RAW_POINT *p, - const EC_SCALAR *scalar) { - alignas(32) P256_POINT out; - ecp_nistz256_windowed_mul(group, &out, p, scalar); - - assert(group->field.width == P256_LIMBS); - OPENSSL_memcpy(r->X.words, out.X, P256_LIMBS * sizeof(BN_ULONG)); - OPENSSL_memcpy(r->Y.words, out.Y, P256_LIMBS * sizeof(BN_ULONG)); - OPENSSL_memcpy(r->Z.words, out.Z, P256_LIMBS * sizeof(BN_ULONG)); -} - -static void ecp_nistz256_point_mul_base(const EC_GROUP *group, EC_RAW_POINT *r, - const EC_SCALAR *scalar) { - uint8_t p_str[33]; - OPENSSL_memcpy(p_str, scalar->words, 32); - p_str[32] = 0; - - // First window - size_t index = 0; - crypto_word_t wvalue = calc_first_wvalue(&index, p_str); - - alignas(32) P256_POINT_AFFINE t; - alignas(32) P256_POINT p; - ecp_nistz256_select_w7(&t, ecp_nistz256_precomputed[0], wvalue >> 1); 
- ecp_nistz256_neg(p.Z, t.Y); - copy_conditional(t.Y, p.Z, wvalue & 1); - - // Convert |t| from affine to Jacobian coordinates. We set Z to zero if |t| - // is infinity and |ONE| otherwise. |t| was computed from the table, so it - // is infinity iff |wvalue >> 1| is zero. - OPENSSL_memcpy(p.X, t.X, sizeof(p.X)); - OPENSSL_memcpy(p.Y, t.Y, sizeof(p.Y)); - OPENSSL_memset(p.Z, 0, sizeof(p.Z)); - copy_conditional(p.Z, ONE, is_not_zero(wvalue >> 1)); - - for (int i = 1; i < 37; i++) { - wvalue = calc_wvalue(&index, p_str); - - ecp_nistz256_select_w7(&t, ecp_nistz256_precomputed[i], wvalue >> 1); - - alignas(32) BN_ULONG neg_Y[P256_LIMBS]; - ecp_nistz256_neg(neg_Y, t.Y); - copy_conditional(t.Y, neg_Y, wvalue & 1); - - // Note |ecp_nistz256_point_add_affine| does not work if |p| and |t| are the - // same non-infinity point. - ecp_nistz256_point_add_affine(&p, &p, &t); - } - - assert(group->field.width == P256_LIMBS); - OPENSSL_memcpy(r->X.words, p.X, P256_LIMBS * sizeof(BN_ULONG)); - OPENSSL_memcpy(r->Y.words, p.Y, P256_LIMBS * sizeof(BN_ULONG)); - OPENSSL_memcpy(r->Z.words, p.Z, P256_LIMBS * sizeof(BN_ULONG)); -} - -static void ecp_nistz256_points_mul_public(const EC_GROUP *group, - EC_RAW_POINT *r, - const EC_SCALAR *g_scalar, - const EC_RAW_POINT *p_, - const EC_SCALAR *p_scalar) { - assert(p_ != NULL && p_scalar != NULL && g_scalar != NULL); - - alignas(32) P256_POINT p; - uint8_t p_str[33]; - OPENSSL_memcpy(p_str, g_scalar->words, 32); - p_str[32] = 0; - - // First window - size_t index = 0; - size_t wvalue = calc_first_wvalue(&index, p_str); - - // Convert |p| from affine to Jacobian coordinates. We set Z to zero if |p| - // is infinity and |ONE| otherwise. |p| was computed from the table, so it - // is infinity iff |wvalue >> 1| is zero. 
- if ((wvalue >> 1) != 0) { - OPENSSL_memcpy(p.X, &ecp_nistz256_precomputed[0][(wvalue >> 1) - 1].X, - sizeof(p.X)); - OPENSSL_memcpy(p.Y, &ecp_nistz256_precomputed[0][(wvalue >> 1) - 1].Y, - sizeof(p.Y)); - OPENSSL_memcpy(p.Z, ONE, sizeof(p.Z)); - } else { - OPENSSL_memset(p.X, 0, sizeof(p.X)); - OPENSSL_memset(p.Y, 0, sizeof(p.Y)); - OPENSSL_memset(p.Z, 0, sizeof(p.Z)); - } - - if ((wvalue & 1) == 1) { - ecp_nistz256_neg(p.Y, p.Y); - } - - for (int i = 1; i < 37; i++) { - wvalue = calc_wvalue(&index, p_str); - if ((wvalue >> 1) == 0) { - continue; - } - - alignas(32) P256_POINT_AFFINE t; - OPENSSL_memcpy(&t, &ecp_nistz256_precomputed[i][(wvalue >> 1) - 1], - sizeof(t)); - if ((wvalue & 1) == 1) { - ecp_nistz256_neg(t.Y, t.Y); - } - - // Note |ecp_nistz256_point_add_affine| does not work if |p| and |t| are - // the same non-infinity point, so it is important that we compute the - // |g_scalar| term before the |p_scalar| term. - ecp_nistz256_point_add_affine(&p, &p, &t); - } - - alignas(32) P256_POINT tmp; - ecp_nistz256_windowed_mul(group, &tmp, p_, p_scalar); - ecp_nistz256_point_add(&p, &p, &tmp); - - assert(group->field.width == P256_LIMBS); - OPENSSL_memcpy(r->X.words, p.X, P256_LIMBS * sizeof(BN_ULONG)); - OPENSSL_memcpy(r->Y.words, p.Y, P256_LIMBS * sizeof(BN_ULONG)); - OPENSSL_memcpy(r->Z.words, p.Z, P256_LIMBS * sizeof(BN_ULONG)); -} - -static int ecp_nistz256_get_affine(const EC_GROUP *group, - const EC_RAW_POINT *point, EC_FELEM *x, - EC_FELEM *y) { - if (ec_GFp_simple_is_at_infinity(group, point)) { - OPENSSL_PUT_ERROR(EC, EC_R_POINT_AT_INFINITY); - return 0; - } - - BN_ULONG z_inv2[P256_LIMBS]; - assert(group->field.width == P256_LIMBS); - ecp_nistz256_mod_inverse_sqr_mont(z_inv2, point->Z.words); - - if (x != NULL) { - ecp_nistz256_mul_mont(x->words, z_inv2, point->X.words); - } - - if (y != NULL) { - ecp_nistz256_sqr_mont(z_inv2, z_inv2); // z^-4 - ecp_nistz256_mul_mont(y->words, point->Y.words, point->Z.words); // y * z - 
ecp_nistz256_mul_mont(y->words, y->words, z_inv2); // y * z^-3 - } - - return 1; -} - -static void ecp_nistz256_add(const EC_GROUP *group, EC_RAW_POINT *r, - const EC_RAW_POINT *a_, const EC_RAW_POINT *b_) { - P256_POINT a, b; - OPENSSL_memcpy(a.X, a_->X.words, P256_LIMBS * sizeof(BN_ULONG)); - OPENSSL_memcpy(a.Y, a_->Y.words, P256_LIMBS * sizeof(BN_ULONG)); - OPENSSL_memcpy(a.Z, a_->Z.words, P256_LIMBS * sizeof(BN_ULONG)); - OPENSSL_memcpy(b.X, b_->X.words, P256_LIMBS * sizeof(BN_ULONG)); - OPENSSL_memcpy(b.Y, b_->Y.words, P256_LIMBS * sizeof(BN_ULONG)); - OPENSSL_memcpy(b.Z, b_->Z.words, P256_LIMBS * sizeof(BN_ULONG)); - ecp_nistz256_point_add(&a, &a, &b); - OPENSSL_memcpy(r->X.words, a.X, P256_LIMBS * sizeof(BN_ULONG)); - OPENSSL_memcpy(r->Y.words, a.Y, P256_LIMBS * sizeof(BN_ULONG)); - OPENSSL_memcpy(r->Z.words, a.Z, P256_LIMBS * sizeof(BN_ULONG)); -} - -static void ecp_nistz256_dbl(const EC_GROUP *group, EC_RAW_POINT *r, - const EC_RAW_POINT *a_) { - P256_POINT a; - OPENSSL_memcpy(a.X, a_->X.words, P256_LIMBS * sizeof(BN_ULONG)); - OPENSSL_memcpy(a.Y, a_->Y.words, P256_LIMBS * sizeof(BN_ULONG)); - OPENSSL_memcpy(a.Z, a_->Z.words, P256_LIMBS * sizeof(BN_ULONG)); - ecp_nistz256_point_double(&a, &a); - OPENSSL_memcpy(r->X.words, a.X, P256_LIMBS * sizeof(BN_ULONG)); - OPENSSL_memcpy(r->Y.words, a.Y, P256_LIMBS * sizeof(BN_ULONG)); - OPENSSL_memcpy(r->Z.words, a.Z, P256_LIMBS * sizeof(BN_ULONG)); -} - -static void ecp_nistz256_inv0_mod_ord(const EC_GROUP *group, EC_SCALAR *out, - const EC_SCALAR *in) { - // table[i] stores a power of |in| corresponding to the matching enum value. - enum { - // The following indices specify the power in binary. - i_1 = 0, - i_10, - i_11, - i_101, - i_111, - i_1010, - i_1111, - i_10101, - i_101010, - i_101111, - // The following indices specify 2^N-1, or N ones in a row. 
- i_x6, - i_x8, - i_x16, - i_x32 - }; - BN_ULONG table[15][P256_LIMBS]; - - // https://briansmith.org/ecc-inversion-addition-chains-01#p256_scalar_inversion - // - // Even though this code path spares 12 squarings, 4.5%, and 13 - // multiplications, 25%, the overall sign operation is not that much faster, - // not more that 2%. Most of the performance of this function comes from the - // scalar operations. - - // Pre-calculate powers. - OPENSSL_memcpy(table[i_1], in->words, P256_LIMBS * sizeof(BN_ULONG)); - - ecp_nistz256_ord_sqr_mont(table[i_10], table[i_1], 1); - - ecp_nistz256_ord_mul_mont(table[i_11], table[i_1], table[i_10]); - - ecp_nistz256_ord_mul_mont(table[i_101], table[i_11], table[i_10]); - - ecp_nistz256_ord_mul_mont(table[i_111], table[i_101], table[i_10]); - - ecp_nistz256_ord_sqr_mont(table[i_1010], table[i_101], 1); - - ecp_nistz256_ord_mul_mont(table[i_1111], table[i_1010], table[i_101]); - - ecp_nistz256_ord_sqr_mont(table[i_10101], table[i_1010], 1); - ecp_nistz256_ord_mul_mont(table[i_10101], table[i_10101], table[i_1]); - - ecp_nistz256_ord_sqr_mont(table[i_101010], table[i_10101], 1); - - ecp_nistz256_ord_mul_mont(table[i_101111], table[i_101010], table[i_101]); - - ecp_nistz256_ord_mul_mont(table[i_x6], table[i_101010], table[i_10101]); - - ecp_nistz256_ord_sqr_mont(table[i_x8], table[i_x6], 2); - ecp_nistz256_ord_mul_mont(table[i_x8], table[i_x8], table[i_11]); - - ecp_nistz256_ord_sqr_mont(table[i_x16], table[i_x8], 8); - ecp_nistz256_ord_mul_mont(table[i_x16], table[i_x16], table[i_x8]); - - ecp_nistz256_ord_sqr_mont(table[i_x32], table[i_x16], 16); - ecp_nistz256_ord_mul_mont(table[i_x32], table[i_x32], table[i_x16]); - - // Compute |in| raised to the order-2. 
- ecp_nistz256_ord_sqr_mont(out->words, table[i_x32], 64); - ecp_nistz256_ord_mul_mont(out->words, out->words, table[i_x32]); - static const struct { - uint8_t p, i; - } kChain[27] = {{32, i_x32}, {6, i_101111}, {5, i_111}, {4, i_11}, - {5, i_1111}, {5, i_10101}, {4, i_101}, {3, i_101}, - {3, i_101}, {5, i_111}, {9, i_101111}, {6, i_1111}, - {2, i_1}, {5, i_1}, {6, i_1111}, {5, i_111}, - {4, i_111}, {5, i_111}, {5, i_101}, {3, i_11}, - {10, i_101111}, {2, i_11}, {5, i_11}, {5, i_11}, - {3, i_1}, {7, i_10101}, {6, i_1111}}; - for (size_t i = 0; i < OPENSSL_ARRAY_SIZE(kChain); i++) { - ecp_nistz256_ord_sqr_mont(out->words, out->words, kChain[i].p); - ecp_nistz256_ord_mul_mont(out->words, out->words, table[kChain[i].i]); - } -} - -static int ecp_nistz256_scalar_to_montgomery_inv_vartime(const EC_GROUP *group, - EC_SCALAR *out, - const EC_SCALAR *in) { -#if defined(OPENSSL_X86_64) - if (!CRYPTO_is_AVX_capable()) { - // No AVX support; fallback to generic code. - return ec_simple_scalar_to_montgomery_inv_vartime(group, out, in); - } -#endif - - assert(group->order.width == P256_LIMBS); - if (!beeu_mod_inverse_vartime(out->words, in->words, group->order.d)) { - return 0; - } - - // The result should be returned in the Montgomery domain. - ec_scalar_to_montgomery(group, out, out); - return 1; -} - -static int ecp_nistz256_cmp_x_coordinate(const EC_GROUP *group, - const EC_RAW_POINT *p, - const EC_SCALAR *r) { - if (ec_GFp_simple_is_at_infinity(group, p)) { - return 0; - } - - assert(group->order.width == P256_LIMBS); - assert(group->field.width == P256_LIMBS); - - // We wish to compare X/Z^2 with r. This is equivalent to comparing X with - // r*Z^2. Note that X and Z are represented in Montgomery form, while r is - // not. 
- BN_ULONG r_Z2[P256_LIMBS], Z2_mont[P256_LIMBS], X[P256_LIMBS]; - ecp_nistz256_mul_mont(Z2_mont, p->Z.words, p->Z.words); - ecp_nistz256_mul_mont(r_Z2, r->words, Z2_mont); - ecp_nistz256_from_mont(X, p->X.words); - - if (OPENSSL_memcmp(r_Z2, X, sizeof(r_Z2)) == 0) { - return 1; - } - - // During signing the x coefficient is reduced modulo the group order. - // Therefore there is a small possibility, less than 1/2^128, that group_order - // < p.x < P. in that case we need not only to compare against |r| but also to - // compare against r+group_order. - if (bn_less_than_words(r->words, group->field_minus_order.words, - P256_LIMBS)) { - // We can ignore the carry because: r + group_order < p < 2^256. - bn_add_words(r_Z2, r->words, group->order.d, P256_LIMBS); - ecp_nistz256_mul_mont(r_Z2, r_Z2, Z2_mont); - if (OPENSSL_memcmp(r_Z2, X, sizeof(r_Z2)) == 0) { - return 1; - } - } - - return 0; -} - -DEFINE_METHOD_FUNCTION(EC_METHOD, EC_GFp_nistz256_method) { - out->group_init = ec_GFp_mont_group_init; - out->group_finish = ec_GFp_mont_group_finish; - out->group_set_curve = ec_GFp_mont_group_set_curve; - out->point_get_affine_coordinates = ecp_nistz256_get_affine; - out->add = ecp_nistz256_add; - out->dbl = ecp_nistz256_dbl; - out->mul = ecp_nistz256_point_mul; - out->mul_base = ecp_nistz256_point_mul_base; - out->mul_public = ecp_nistz256_points_mul_public; - out->felem_mul = ec_GFp_mont_felem_mul; - out->felem_sqr = ec_GFp_mont_felem_sqr; - out->felem_to_bytes = ec_GFp_mont_felem_to_bytes; - out->felem_from_bytes = ec_GFp_mont_felem_from_bytes; - out->scalar_inv0_montgomery = ecp_nistz256_inv0_mod_ord; - out->scalar_to_montgomery_inv_vartime = - ecp_nistz256_scalar_to_montgomery_inv_vartime; - out->cmp_x_coordinate = ecp_nistz256_cmp_x_coordinate; -} - -#endif /* !defined(OPENSSL_NO_ASM) && \ - (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) && \ - !defined(OPENSSL_SMALL) */ diff --git a/third_party/boringssl/src/crypto/fipsmodule/ec/p256-nistz.cc.inc 
b/third_party/boringssl/src/crypto/fipsmodule/ec/p256-nistz.cc.inc new file mode 100644 index 00000000..6200d073 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/ec/p256-nistz.cc.inc @@ -0,0 +1,740 @@ +// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +// Copyright (c) 2014, Intel Corporation. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1) +// (1) Intel Corporation, Israel Development Center, Haifa, Israel +// (2) University of Haifa, Israel +// +// Reference: +// S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with +// 256 Bit Primes" + +#include + +#include +#include +#include + +#include +#include +#include + +#include "../../internal.h" +#include "../bn/internal.h" +#include "../delocate.h" +#include "internal.h" +#include "p256-nistz.h" + + +using namespace bssl; + +#if !defined(OPENSSL_NO_ASM) && \ + (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) && \ + !defined(OPENSSL_SMALL) + +typedef P256_POINT_AFFINE PRECOMP256_ROW[64]; + +// One converted into the Montgomery domain +static const BN_ULONG ONE_MONT[P256_LIMBS] = { + TOBN(0x00000000, 0x00000001), + TOBN(0xffffffff, 0x00000000), + TOBN(0xffffffff, 0xffffffff), + TOBN(0x00000000, 0xfffffffe), +}; + +// Precomputed tables for the default generator +#include "p256-nistz-table.h" + +// Recode window to a signed digit, see 
|ec_GFp_nistp_recode_scalar_bits| in +// util.c for details +static crypto_word_t booth_recode_w5(crypto_word_t in) { + crypto_word_t s, d; + + s = ~((in >> 5) - 1); + d = (1 << 6) - in - 1; + d = (d & s) | (in & ~s); + d = (d >> 1) + (d & 1); + + return (d << 1) + (s & 1); +} + +static crypto_word_t booth_recode_w7(crypto_word_t in) { + crypto_word_t s, d; + + s = ~((in >> 7) - 1); + d = (1 << 8) - in - 1; + d = (d & s) | (in & ~s); + d = (d >> 1) + (d & 1); + + return (d << 1) + (s & 1); +} + +// copy_conditional copies |src| to |dst| if |move| is one and leaves it as-is +// if |move| is zero. +// +// WARNING: this breaks the usual convention of constant-time functions +// returning masks. +static void copy_conditional(BN_ULONG dst[P256_LIMBS], + const BN_ULONG src[P256_LIMBS], BN_ULONG move) { + BN_ULONG mask1 = ((BN_ULONG)0) - move; + BN_ULONG mask2 = ~mask1; + + dst[0] = (src[0] & mask1) ^ (dst[0] & mask2); + dst[1] = (src[1] & mask1) ^ (dst[1] & mask2); + dst[2] = (src[2] & mask1) ^ (dst[2] & mask2); + dst[3] = (src[3] & mask1) ^ (dst[3] & mask2); + if (P256_LIMBS == 8) { + dst[4] = (src[4] & mask1) ^ (dst[4] & mask2); + dst[5] = (src[5] & mask1) ^ (dst[5] & mask2); + dst[6] = (src[6] & mask1) ^ (dst[6] & mask2); + dst[7] = (src[7] & mask1) ^ (dst[7] & mask2); + } +} + +// is_not_zero returns one iff in != 0 and zero otherwise. +// +// WARNING: this breaks the usual convention of constant-time functions +// returning masks. 
+// +// (define-fun is_not_zero ((in (_ BitVec 64))) (_ BitVec 64) +// (bvlshr (bvor in (bvsub #x0000000000000000 in)) #x000000000000003f) +// ) +// +// (declare-fun x () (_ BitVec 64)) +// +// (assert (and (= x #x0000000000000000) (= (is_not_zero x) +// #x0000000000000001))) (check-sat) +// +// (assert (and (not (= x #x0000000000000000)) (= (is_not_zero x) +// #x0000000000000000))) (check-sat) +// +static BN_ULONG is_not_zero(BN_ULONG in) { + in |= (0 - in); + in >>= BN_BITS2 - 1; + return in; +} + +#if defined(OPENSSL_X86_64) +// Dispatch between CPU variations. The "_adx" suffixed functions use MULX in +// addition to ADCX/ADOX. MULX is part of BMI2, not ADX, so we must check both +// capabilities. +static void ecp_nistz256_mul_mont(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], + const BN_ULONG b[P256_LIMBS]) { + if (CRYPTO_is_BMI2_capable() && CRYPTO_is_ADX_capable()) { + ecp_nistz256_mul_mont_adx(res, a, b); + } else { + ecp_nistz256_mul_mont_nohw(res, a, b); + } +} + +static void ecp_nistz256_sqr_mont(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS]) { + if (CRYPTO_is_BMI2_capable() && CRYPTO_is_ADX_capable()) { + ecp_nistz256_sqr_mont_adx(res, a); + } else { + ecp_nistz256_sqr_mont_nohw(res, a); + } +} + +static void ecp_nistz256_ord_mul_mont(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], + const BN_ULONG b[P256_LIMBS]) { + if (CRYPTO_is_BMI2_capable() && CRYPTO_is_ADX_capable()) { + ecp_nistz256_ord_mul_mont_adx(res, a, b); + } else { + ecp_nistz256_ord_mul_mont_nohw(res, a, b); + } +} + +static void ecp_nistz256_ord_sqr_mont(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], + BN_ULONG rep) { + if (CRYPTO_is_BMI2_capable() && CRYPTO_is_ADX_capable()) { + ecp_nistz256_ord_sqr_mont_adx(res, a, rep); + } else { + ecp_nistz256_ord_sqr_mont_nohw(res, a, rep); + } +} + +static void ecp_nistz256_select_w5(P256_POINT *val, const P256_POINT in_t[16], + int index) { + if (CRYPTO_is_AVX2_capable()) { + 
ecp_nistz256_select_w5_avx2(val, in_t, index); + } else { + ecp_nistz256_select_w5_nohw(val, in_t, index); + } +} + +static void ecp_nistz256_select_w7(P256_POINT_AFFINE *val, + const P256_POINT_AFFINE in_t[64], + int index) { + if (CRYPTO_is_AVX2_capable()) { + ecp_nistz256_select_w7_avx2(val, in_t, index); + } else { + ecp_nistz256_select_w7_nohw(val, in_t, index); + } +} + +static void ecp_nistz256_point_double(P256_POINT *r, const P256_POINT *a) { + if (CRYPTO_is_BMI2_capable() && CRYPTO_is_ADX_capable()) { + ecp_nistz256_point_double_adx(r, a); + } else { + ecp_nistz256_point_double_nohw(r, a); + } +} + +static void ecp_nistz256_point_add(P256_POINT *r, const P256_POINT *a, + const P256_POINT *b) { + if (CRYPTO_is_BMI2_capable() && CRYPTO_is_ADX_capable()) { + ecp_nistz256_point_add_adx(r, a, b); + } else { + ecp_nistz256_point_add_nohw(r, a, b); + } +} + +static void ecp_nistz256_point_add_affine(P256_POINT *r, const P256_POINT *a, + const P256_POINT_AFFINE *b) { + if (CRYPTO_is_BMI2_capable() && CRYPTO_is_ADX_capable()) { + ecp_nistz256_point_add_affine_adx(r, a, b); + } else { + ecp_nistz256_point_add_affine_nohw(r, a, b); + } +} +#endif // OPENSSL_X86_64 + +// ecp_nistz256_from_mont sets |res| to |in|, converted from Montgomery domain +// by multiplying with 1. +static void ecp_nistz256_from_mont(BN_ULONG res[P256_LIMBS], + const BN_ULONG in[P256_LIMBS]) { + static const BN_ULONG ONE[P256_LIMBS] = {1}; + ecp_nistz256_mul_mont(res, in, ONE); +} + +// ecp_nistz256_mod_inverse_sqr_mont sets |r| to (|in| * 2^-256)^-2 * 2^256 mod +// p. That is, |r| is the modular inverse square of |in| for input and output in +// the Montgomery domain. 
+static void ecp_nistz256_mod_inverse_sqr_mont(BN_ULONG r[P256_LIMBS], + const BN_ULONG in[P256_LIMBS]) { + // This implements the addition chain described in + // https://briansmith.org/ecc-inversion-addition-chains-01#p256_field_inversion + BN_ULONG x2[P256_LIMBS], x3[P256_LIMBS], x6[P256_LIMBS], x12[P256_LIMBS], + x15[P256_LIMBS], x30[P256_LIMBS], x32[P256_LIMBS]; + ecp_nistz256_sqr_mont(x2, in); // 2^2 - 2^1 + ecp_nistz256_mul_mont(x2, x2, in); // 2^2 - 2^0 + + ecp_nistz256_sqr_mont(x3, x2); // 2^3 - 2^1 + ecp_nistz256_mul_mont(x3, x3, in); // 2^3 - 2^0 + + ecp_nistz256_sqr_mont(x6, x3); + for (int i = 1; i < 3; i++) { + ecp_nistz256_sqr_mont(x6, x6); + } // 2^6 - 2^3 + ecp_nistz256_mul_mont(x6, x6, x3); // 2^6 - 2^0 + + ecp_nistz256_sqr_mont(x12, x6); + for (int i = 1; i < 6; i++) { + ecp_nistz256_sqr_mont(x12, x12); + } // 2^12 - 2^6 + ecp_nistz256_mul_mont(x12, x12, x6); // 2^12 - 2^0 + + ecp_nistz256_sqr_mont(x15, x12); + for (int i = 1; i < 3; i++) { + ecp_nistz256_sqr_mont(x15, x15); + } // 2^15 - 2^3 + ecp_nistz256_mul_mont(x15, x15, x3); // 2^15 - 2^0 + + ecp_nistz256_sqr_mont(x30, x15); + for (int i = 1; i < 15; i++) { + ecp_nistz256_sqr_mont(x30, x30); + } // 2^30 - 2^15 + ecp_nistz256_mul_mont(x30, x30, x15); // 2^30 - 2^0 + + ecp_nistz256_sqr_mont(x32, x30); + ecp_nistz256_sqr_mont(x32, x32); // 2^32 - 2^2 + ecp_nistz256_mul_mont(x32, x32, x2); // 2^32 - 2^0 + + BN_ULONG ret[P256_LIMBS]; + ecp_nistz256_sqr_mont(ret, x32); + for (int i = 1; i < 31 + 1; i++) { + ecp_nistz256_sqr_mont(ret, ret); + } // 2^64 - 2^32 + ecp_nistz256_mul_mont(ret, ret, in); // 2^64 - 2^32 + 2^0 + + for (int i = 0; i < 96 + 32; i++) { + ecp_nistz256_sqr_mont(ret, ret); + } // 2^192 - 2^160 + 2^128 + ecp_nistz256_mul_mont(ret, ret, x32); // 2^192 - 2^160 + 2^128 + 2^32 - 2^0 + + for (int i = 0; i < 32; i++) { + ecp_nistz256_sqr_mont(ret, ret); + } // 2^224 - 2^192 + 2^160 + 2^64 - 2^32 + ecp_nistz256_mul_mont(ret, ret, x32); // 2^224 - 2^192 + 2^160 + 2^64 - 2^0 + + for (int 
i = 0; i < 30; i++) { + ecp_nistz256_sqr_mont(ret, ret); + } // 2^254 - 2^222 + 2^190 + 2^94 - 2^30 + ecp_nistz256_mul_mont(ret, ret, x30); // 2^254 - 2^222 + 2^190 + 2^94 - 2^0 + + ecp_nistz256_sqr_mont(ret, ret); + ecp_nistz256_sqr_mont(r, ret); // 2^256 - 2^224 + 2^192 + 2^96 - 2^2 +} + +// r = p * p_scalar +static void ecp_nistz256_windowed_mul(const EC_GROUP *group, P256_POINT *r, + const EC_JACOBIAN *p, + const EC_SCALAR *p_scalar) { + assert(p != nullptr); + assert(p_scalar != nullptr); + assert(group->field.N.width == P256_LIMBS); + + static const size_t kWindowSize = 5; + static const crypto_word_t kMask = (1 << (5 /* kWindowSize */ + 1)) - 1; + + // A |P256_POINT| is (3 * 32) = 96 bytes, and the 64-byte alignment should + // add no more than 63 bytes of overhead. Thus, |table| should require + // ~1599 ((96 * 16) + 63) bytes of stack space. + alignas(64) P256_POINT table[16]; + uint8_t p_str[33]; + OPENSSL_memcpy(p_str, p_scalar->words, 32); + p_str[32] = 0; + + // table[0] is implicitly (0,0,0) (the point at infinity), therefore it is + // not stored. All other values are actually stored with an offset of -1 in + // table. 
+ P256_POINT *row = table; + assert(group->field.N.width == P256_LIMBS); + OPENSSL_memcpy(row[1 - 1].X, p->X.words, P256_LIMBS * sizeof(BN_ULONG)); + OPENSSL_memcpy(row[1 - 1].Y, p->Y.words, P256_LIMBS * sizeof(BN_ULONG)); + OPENSSL_memcpy(row[1 - 1].Z, p->Z.words, P256_LIMBS * sizeof(BN_ULONG)); + + ecp_nistz256_point_double(&row[2 - 1], &row[1 - 1]); + ecp_nistz256_point_add(&row[3 - 1], &row[2 - 1], &row[1 - 1]); + ecp_nistz256_point_double(&row[4 - 1], &row[2 - 1]); + ecp_nistz256_point_double(&row[6 - 1], &row[3 - 1]); + ecp_nistz256_point_double(&row[8 - 1], &row[4 - 1]); + ecp_nistz256_point_double(&row[12 - 1], &row[6 - 1]); + ecp_nistz256_point_add(&row[5 - 1], &row[4 - 1], &row[1 - 1]); + ecp_nistz256_point_add(&row[7 - 1], &row[6 - 1], &row[1 - 1]); + ecp_nistz256_point_add(&row[9 - 1], &row[8 - 1], &row[1 - 1]); + ecp_nistz256_point_add(&row[13 - 1], &row[12 - 1], &row[1 - 1]); + ecp_nistz256_point_double(&row[14 - 1], &row[7 - 1]); + ecp_nistz256_point_double(&row[10 - 1], &row[5 - 1]); + ecp_nistz256_point_add(&row[15 - 1], &row[14 - 1], &row[1 - 1]); + ecp_nistz256_point_add(&row[11 - 1], &row[10 - 1], &row[1 - 1]); + ecp_nistz256_point_double(&row[16 - 1], &row[8 - 1]); + + BN_ULONG tmp[P256_LIMBS]; + alignas(32) P256_POINT h; + size_t index = 255; + crypto_word_t wvalue = p_str[(index - 1) / 8]; + wvalue = (wvalue >> ((index - 1) % 8)) & kMask; + + ecp_nistz256_select_w5(r, table, booth_recode_w5(wvalue) >> 1); + + while (index >= 5) { + if (index != 255) { + size_t off = (index - 1) / 8; + + wvalue = (crypto_word_t)p_str[off] | (crypto_word_t)p_str[off + 1] << 8; + wvalue = (wvalue >> ((index - 1) % 8)) & kMask; + + wvalue = booth_recode_w5(wvalue); + + ecp_nistz256_select_w5(&h, table, wvalue >> 1); + + ecp_nistz256_neg(tmp, h.Y); + copy_conditional(h.Y, tmp, (wvalue & 1)); + + ecp_nistz256_point_add(r, r, &h); + } + + index -= kWindowSize; + + ecp_nistz256_point_double(r, r); + ecp_nistz256_point_double(r, r); + ecp_nistz256_point_double(r, r); 
+ ecp_nistz256_point_double(r, r); + ecp_nistz256_point_double(r, r); + } + + // Final window + wvalue = p_str[0]; + wvalue = (wvalue << 1) & kMask; + + wvalue = booth_recode_w5(wvalue); + + ecp_nistz256_select_w5(&h, table, wvalue >> 1); + + ecp_nistz256_neg(tmp, h.Y); + copy_conditional(h.Y, tmp, wvalue & 1); + + ecp_nistz256_point_add(r, r, &h); +} + +static crypto_word_t calc_first_wvalue(size_t *index, const uint8_t p_str[33]) { + static const size_t kWindowSize = 7; + static const crypto_word_t kMask = (1 << (7 /* kWindowSize */ + 1)) - 1; + *index = kWindowSize; + + crypto_word_t wvalue = (p_str[0] << 1) & kMask; + return booth_recode_w7(wvalue); +} + +static crypto_word_t calc_wvalue(size_t *index, const uint8_t p_str[33]) { + static const size_t kWindowSize = 7; + static const crypto_word_t kMask = (1 << (7 /* kWindowSize */ + 1)) - 1; + + const size_t off = (*index - 1) / 8; + crypto_word_t wvalue = + (crypto_word_t)p_str[off] | (crypto_word_t)p_str[off + 1] << 8; + wvalue = (wvalue >> ((*index - 1) % 8)) & kMask; + *index += kWindowSize; + + return booth_recode_w7(wvalue); +} + +static void ecp_nistz256_point_mul(const EC_GROUP *group, EC_JACOBIAN *r, + const EC_JACOBIAN *p, + const EC_SCALAR *scalar) { + alignas(32) P256_POINT out; + ecp_nistz256_windowed_mul(group, &out, p, scalar); + + assert(group->field.N.width == P256_LIMBS); + OPENSSL_memcpy(r->X.words, out.X, P256_LIMBS * sizeof(BN_ULONG)); + OPENSSL_memcpy(r->Y.words, out.Y, P256_LIMBS * sizeof(BN_ULONG)); + OPENSSL_memcpy(r->Z.words, out.Z, P256_LIMBS * sizeof(BN_ULONG)); +} + +static void ecp_nistz256_point_mul_base(const EC_GROUP *group, EC_JACOBIAN *r, + const EC_SCALAR *scalar) { + uint8_t p_str[33]; + OPENSSL_memcpy(p_str, scalar->words, 32); + p_str[32] = 0; + + // First window + size_t index = 0; + crypto_word_t wvalue = calc_first_wvalue(&index, p_str); + + alignas(32) P256_POINT_AFFINE t; + alignas(32) P256_POINT p; + ecp_nistz256_select_w7(&t, ecp_nistz256_precomputed[0], wvalue >> 
1); + ecp_nistz256_neg(p.Z, t.Y); + copy_conditional(t.Y, p.Z, wvalue & 1); + + // Convert |t| from affine to Jacobian coordinates. We set Z to zero if |t| + // is infinity and |ONE_MONT| otherwise. |t| was computed from the table, so + // it is infinity iff |wvalue >> 1| is zero. + OPENSSL_memcpy(p.X, t.X, sizeof(p.X)); + OPENSSL_memcpy(p.Y, t.Y, sizeof(p.Y)); + OPENSSL_memset(p.Z, 0, sizeof(p.Z)); + copy_conditional(p.Z, ONE_MONT, is_not_zero(wvalue >> 1)); + + for (int i = 1; i < 37; i++) { + wvalue = calc_wvalue(&index, p_str); + + ecp_nistz256_select_w7(&t, ecp_nistz256_precomputed[i], wvalue >> 1); + + alignas(32) BN_ULONG neg_Y[P256_LIMBS]; + ecp_nistz256_neg(neg_Y, t.Y); + copy_conditional(t.Y, neg_Y, wvalue & 1); + + // Note |ecp_nistz256_point_add_affine| does not work if |p| and |t| are the + // same non-infinity point. + ecp_nistz256_point_add_affine(&p, &p, &t); + } + + assert(group->field.N.width == P256_LIMBS); + OPENSSL_memcpy(r->X.words, p.X, P256_LIMBS * sizeof(BN_ULONG)); + OPENSSL_memcpy(r->Y.words, p.Y, P256_LIMBS * sizeof(BN_ULONG)); + OPENSSL_memcpy(r->Z.words, p.Z, P256_LIMBS * sizeof(BN_ULONG)); +} + +static void ecp_nistz256_points_mul_public(const EC_GROUP *group, + EC_JACOBIAN *r, + const EC_SCALAR *g_scalar, + const EC_JACOBIAN *p_, + const EC_SCALAR *p_scalar) { + assert(p_ != nullptr && p_scalar != nullptr && g_scalar != nullptr); + + alignas(32) P256_POINT p; + uint8_t p_str[33]; + OPENSSL_memcpy(p_str, g_scalar->words, 32); + p_str[32] = 0; + + // First window + size_t index = 0; + size_t wvalue = calc_first_wvalue(&index, p_str); + + // Convert |p| from affine to Jacobian coordinates. We set Z to zero if |p| + // is infinity and |ONE_MONT| otherwise. |p| was computed from the table, so + // it is infinity iff |wvalue >> 1| is zero. 
+ if ((wvalue >> 1) != 0) { + OPENSSL_memcpy(p.X, &ecp_nistz256_precomputed[0][(wvalue >> 1) - 1].X, + sizeof(p.X)); + OPENSSL_memcpy(p.Y, &ecp_nistz256_precomputed[0][(wvalue >> 1) - 1].Y, + sizeof(p.Y)); + OPENSSL_memcpy(p.Z, ONE_MONT, sizeof(p.Z)); + } else { + OPENSSL_memset(p.X, 0, sizeof(p.X)); + OPENSSL_memset(p.Y, 0, sizeof(p.Y)); + OPENSSL_memset(p.Z, 0, sizeof(p.Z)); + } + + if ((wvalue & 1) == 1) { + ecp_nistz256_neg(p.Y, p.Y); + } + + for (int i = 1; i < 37; i++) { + wvalue = calc_wvalue(&index, p_str); + if ((wvalue >> 1) == 0) { + continue; + } + + alignas(32) P256_POINT_AFFINE t; + OPENSSL_memcpy(&t, &ecp_nistz256_precomputed[i][(wvalue >> 1) - 1], + sizeof(t)); + if ((wvalue & 1) == 1) { + ecp_nistz256_neg(t.Y, t.Y); + } + + // Note |ecp_nistz256_point_add_affine| does not work if |p| and |t| are + // the same non-infinity point, so it is important that we compute the + // |g_scalar| term before the |p_scalar| term. + ecp_nistz256_point_add_affine(&p, &p, &t); + } + + alignas(32) P256_POINT tmp; + ecp_nistz256_windowed_mul(group, &tmp, p_, p_scalar); + ecp_nistz256_point_add(&p, &p, &tmp); + + assert(group->field.N.width == P256_LIMBS); + OPENSSL_memcpy(r->X.words, p.X, P256_LIMBS * sizeof(BN_ULONG)); + OPENSSL_memcpy(r->Y.words, p.Y, P256_LIMBS * sizeof(BN_ULONG)); + OPENSSL_memcpy(r->Z.words, p.Z, P256_LIMBS * sizeof(BN_ULONG)); +} + +static int ecp_nistz256_get_affine(const EC_GROUP *group, + const EC_JACOBIAN *point, EC_FELEM *x, + EC_FELEM *y) { + if (constant_time_declassify_int( + ec_GFp_simple_is_at_infinity(group, point))) { + OPENSSL_PUT_ERROR(EC, EC_R_POINT_AT_INFINITY); + return 0; + } + + BN_ULONG z_inv2[P256_LIMBS]; + assert(group->field.N.width == P256_LIMBS); + ecp_nistz256_mod_inverse_sqr_mont(z_inv2, point->Z.words); + + if (x != nullptr) { + ecp_nistz256_mul_mont(x->words, z_inv2, point->X.words); + } + + if (y != nullptr) { + ecp_nistz256_sqr_mont(z_inv2, z_inv2); // z^-4 + ecp_nistz256_mul_mont(y->words, point->Y.words, 
point->Z.words); // y * z + ecp_nistz256_mul_mont(y->words, y->words, z_inv2); // y * z^-3 + } + + return 1; +} + +static void ecp_nistz256_add(const EC_GROUP *group, EC_JACOBIAN *r, + const EC_JACOBIAN *a_, const EC_JACOBIAN *b_) { + P256_POINT a, b; + OPENSSL_memcpy(a.X, a_->X.words, P256_LIMBS * sizeof(BN_ULONG)); + OPENSSL_memcpy(a.Y, a_->Y.words, P256_LIMBS * sizeof(BN_ULONG)); + OPENSSL_memcpy(a.Z, a_->Z.words, P256_LIMBS * sizeof(BN_ULONG)); + OPENSSL_memcpy(b.X, b_->X.words, P256_LIMBS * sizeof(BN_ULONG)); + OPENSSL_memcpy(b.Y, b_->Y.words, P256_LIMBS * sizeof(BN_ULONG)); + OPENSSL_memcpy(b.Z, b_->Z.words, P256_LIMBS * sizeof(BN_ULONG)); + ecp_nistz256_point_add(&a, &a, &b); + OPENSSL_memcpy(r->X.words, a.X, P256_LIMBS * sizeof(BN_ULONG)); + OPENSSL_memcpy(r->Y.words, a.Y, P256_LIMBS * sizeof(BN_ULONG)); + OPENSSL_memcpy(r->Z.words, a.Z, P256_LIMBS * sizeof(BN_ULONG)); +} + +static void ecp_nistz256_dbl(const EC_GROUP *group, EC_JACOBIAN *r, + const EC_JACOBIAN *a_) { + P256_POINT a; + OPENSSL_memcpy(a.X, a_->X.words, P256_LIMBS * sizeof(BN_ULONG)); + OPENSSL_memcpy(a.Y, a_->Y.words, P256_LIMBS * sizeof(BN_ULONG)); + OPENSSL_memcpy(a.Z, a_->Z.words, P256_LIMBS * sizeof(BN_ULONG)); + ecp_nistz256_point_double(&a, &a); + OPENSSL_memcpy(r->X.words, a.X, P256_LIMBS * sizeof(BN_ULONG)); + OPENSSL_memcpy(r->Y.words, a.Y, P256_LIMBS * sizeof(BN_ULONG)); + OPENSSL_memcpy(r->Z.words, a.Z, P256_LIMBS * sizeof(BN_ULONG)); +} + +static void ecp_nistz256_inv0_mod_ord(const EC_GROUP *group, EC_SCALAR *out, + const EC_SCALAR *in) { + // table[i] stores a power of |in| corresponding to the matching enum value. + enum { + // The following indices specify the power in binary. + i_1 = 0, + i_10, + i_11, + i_101, + i_111, + i_1010, + i_1111, + i_10101, + i_101010, + i_101111, + // The following indices specify 2^N-1, or N ones in a row. 
+ i_x6, + i_x8, + i_x16, + i_x32 + }; + BN_ULONG table[15][P256_LIMBS]; + + // https://briansmith.org/ecc-inversion-addition-chains-01#p256_scalar_inversion + // + // Even though this code path spares 12 squarings, 4.5%, and 13 + // multiplications, 25%, the overall sign operation is not that much faster, + // not more that 2%. Most of the performance of this function comes from the + // scalar operations. + + // Pre-calculate powers. + OPENSSL_memcpy(table[i_1], in->words, P256_LIMBS * sizeof(BN_ULONG)); + + ecp_nistz256_ord_sqr_mont(table[i_10], table[i_1], 1); + + ecp_nistz256_ord_mul_mont(table[i_11], table[i_1], table[i_10]); + + ecp_nistz256_ord_mul_mont(table[i_101], table[i_11], table[i_10]); + + ecp_nistz256_ord_mul_mont(table[i_111], table[i_101], table[i_10]); + + ecp_nistz256_ord_sqr_mont(table[i_1010], table[i_101], 1); + + ecp_nistz256_ord_mul_mont(table[i_1111], table[i_1010], table[i_101]); + + ecp_nistz256_ord_sqr_mont(table[i_10101], table[i_1010], 1); + ecp_nistz256_ord_mul_mont(table[i_10101], table[i_10101], table[i_1]); + + ecp_nistz256_ord_sqr_mont(table[i_101010], table[i_10101], 1); + + ecp_nistz256_ord_mul_mont(table[i_101111], table[i_101010], table[i_101]); + + ecp_nistz256_ord_mul_mont(table[i_x6], table[i_101010], table[i_10101]); + + ecp_nistz256_ord_sqr_mont(table[i_x8], table[i_x6], 2); + ecp_nistz256_ord_mul_mont(table[i_x8], table[i_x8], table[i_11]); + + ecp_nistz256_ord_sqr_mont(table[i_x16], table[i_x8], 8); + ecp_nistz256_ord_mul_mont(table[i_x16], table[i_x16], table[i_x8]); + + ecp_nistz256_ord_sqr_mont(table[i_x32], table[i_x16], 16); + ecp_nistz256_ord_mul_mont(table[i_x32], table[i_x32], table[i_x16]); + + // Compute |in| raised to the order-2. 
+ ecp_nistz256_ord_sqr_mont(out->words, table[i_x32], 64); + ecp_nistz256_ord_mul_mont(out->words, out->words, table[i_x32]); + static const struct { + uint8_t p, i; + } kChain[27] = {{32, i_x32}, {6, i_101111}, {5, i_111}, {4, i_11}, + {5, i_1111}, {5, i_10101}, {4, i_101}, {3, i_101}, + {3, i_101}, {5, i_111}, {9, i_101111}, {6, i_1111}, + {2, i_1}, {5, i_1}, {6, i_1111}, {5, i_111}, + {4, i_111}, {5, i_111}, {5, i_101}, {3, i_11}, + {10, i_101111}, {2, i_11}, {5, i_11}, {5, i_11}, + {3, i_1}, {7, i_10101}, {6, i_1111}}; + for (const auto &step : kChain) { + ecp_nistz256_ord_sqr_mont(out->words, out->words, step.p); + ecp_nistz256_ord_mul_mont(out->words, out->words, table[step.i]); + } +} + +static int ecp_nistz256_scalar_to_montgomery_inv_vartime(const EC_GROUP *group, + EC_SCALAR *out, + const EC_SCALAR *in) { +#if defined(OPENSSL_X86_64) + if (!CRYPTO_is_AVX_capable()) { + // No AVX support; fallback to generic code. + return ec_simple_scalar_to_montgomery_inv_vartime(group, out, in); + } +#endif + + assert(group->order.N.width == P256_LIMBS); + if (!beeu_mod_inverse_vartime(out->words, in->words, group->order.N.d)) { + return 0; + } + + // The result should be returned in the Montgomery domain. + ec_scalar_to_montgomery(group, out, out); + return 1; +} + +static int ecp_nistz256_cmp_x_coordinate(const EC_GROUP *group, + const EC_JACOBIAN *p, + const EC_SCALAR *r) { + if (ec_GFp_simple_is_at_infinity(group, p)) { + return 0; + } + + assert(group->order.N.width == P256_LIMBS); + assert(group->field.N.width == P256_LIMBS); + + // We wish to compare X/Z^2 with r. This is equivalent to comparing X with + // r*Z^2. Note that X and Z are represented in Montgomery form, while r is + // not. 
+ BN_ULONG r_Z2[P256_LIMBS], Z2_mont[P256_LIMBS], X[P256_LIMBS]; + ecp_nistz256_mul_mont(Z2_mont, p->Z.words, p->Z.words); + ecp_nistz256_mul_mont(r_Z2, r->words, Z2_mont); + ecp_nistz256_from_mont(X, p->X.words); + + if (OPENSSL_memcmp(r_Z2, X, sizeof(r_Z2)) == 0) { + return 1; + } + + // During signing the x coefficient is reduced modulo the group order. + // Therefore there is a small possibility, less than 1/2^128, that group_order + // < p.x < P. in that case we need not only to compare against |r| but also to + // compare against r+group_order. + BN_ULONG carry = bn_add_words(r_Z2, r->words, group->order.N.d, P256_LIMBS); + if (carry == 0 && bn_less_than_words(r_Z2, group->field.N.d, P256_LIMBS)) { + // r + group_order < p, so compare (r + group_order) * Z^2 against X. + ecp_nistz256_mul_mont(r_Z2, r_Z2, Z2_mont); + if (OPENSSL_memcmp(r_Z2, X, sizeof(r_Z2)) == 0) { + return 1; + } + } + + return 0; +} + +BSSL_NAMESPACE_BEGIN + +DEFINE_METHOD_FUNCTION(EC_METHOD, EC_GFp_nistz256_method) { + out->point_get_affine_coordinates = ecp_nistz256_get_affine; + out->add = ecp_nistz256_add; + out->dbl = ecp_nistz256_dbl; + out->mul = ecp_nistz256_point_mul; + out->mul_base = ecp_nistz256_point_mul_base; + out->mul_public = ecp_nistz256_points_mul_public; + out->scalar_inv0_montgomery = ecp_nistz256_inv0_mod_ord; + out->scalar_to_montgomery_inv_vartime = + ecp_nistz256_scalar_to_montgomery_inv_vartime; + out->cmp_x_coordinate = ecp_nistz256_cmp_x_coordinate; +} + +BSSL_NAMESPACE_END + +#endif /* !defined(OPENSSL_NO_ASM) && \ + (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) && \ + !defined(OPENSSL_SMALL) */ diff --git a/third_party/boringssl/src/crypto/fipsmodule/ec/p256-nistz.h b/third_party/boringssl/src/crypto/fipsmodule/ec/p256-nistz.h index 3f5ea021..c6963887 100644 --- a/third_party/boringssl/src/crypto/fipsmodule/ec/p256-nistz.h +++ b/third_party/boringssl/src/crypto/fipsmodule/ec/p256-nistz.h @@ -1,23 +1,28 @@ -/* - * Copyright 2014-2016 The OpenSSL Project 
Authors. All Rights Reserved. - * Copyright (c) 2014, Intel Corporation. All Rights Reserved. - * - * Licensed under the OpenSSL license (the "License"). You may not use - * this file except in compliance with the License. You can obtain a copy - * in the file LICENSE in the source distribution or at - * https://www.openssl.org/source/license.html - * - * Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1) - * (1) Intel Corporation, Israel Development Center, Haifa, Israel - * (2) University of Haifa, Israel - * - * Reference: - * S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with - * 256 Bit Primes" - */ - -#ifndef OPENSSL_HEADER_EC_P256_X86_64_H -#define OPENSSL_HEADER_EC_P256_X86_64_H +// Copyright 2014-2016 The OpenSSL Project Authors. All Rights Reserved. +// Copyright (c) 2014, Intel Corporation. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +// Originally written by Shay Gueron (1, 2), and Vlad Krasnov (1) +// (1) Intel Corporation, Israel Development Center, Haifa, Israel +// (2) University of Haifa, Israel +// +// Reference: +// S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with +// 256 Bit Primes" + +#ifndef OPENSSL_HEADER_CRYPTO_FIPSMODULE_EC_P256_NISTZ_H +#define OPENSSL_HEADER_CRYPTO_FIPSMODULE_EC_P256_NISTZ_H #include @@ -48,21 +53,29 @@ extern "C" { void ecp_nistz256_neg(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS]); // ecp_nistz256_mul_mont sets |res| to |a| * |b| * 2^-256 mod P. +#if defined(OPENSSL_X86_64) +void ecp_nistz256_mul_mont_nohw(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], + const BN_ULONG b[P256_LIMBS]); +void ecp_nistz256_mul_mont_adx(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], + const BN_ULONG b[P256_LIMBS]); +#else void ecp_nistz256_mul_mont(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS], const BN_ULONG b[P256_LIMBS]); +#endif // ecp_nistz256_sqr_mont sets |res| to |a| * |a| * 2^-256 mod P. +#if defined(OPENSSL_X86_64) +void ecp_nistz256_sqr_mont_nohw(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS]); +void ecp_nistz256_sqr_mont_adx(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS]); +#else void ecp_nistz256_sqr_mont(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS]); - -// ecp_nistz256_from_mont sets |res| to |in|, converted from Montgomery domain -// by multiplying with 1. -static inline void ecp_nistz256_from_mont(BN_ULONG res[P256_LIMBS], - const BN_ULONG in[P256_LIMBS]) { - static const BN_ULONG ONE[P256_LIMBS] = { 1 }; - ecp_nistz256_mul_mont(res, in, ONE); -} +#endif // P-256 scalar operations. @@ -72,15 +85,31 @@ static inline void ecp_nistz256_from_mont(BN_ULONG res[P256_LIMBS], // ecp_nistz256_ord_mul_mont sets |res| to |a| * |b| where inputs and outputs // are in Montgomery form. That is, |res| is |a| * |b| * 2^-256 mod N. 
+#if defined(OPENSSL_X86_64) +void ecp_nistz256_ord_mul_mont_nohw(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], + const BN_ULONG b[P256_LIMBS]); +void ecp_nistz256_ord_mul_mont_adx(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], + const BN_ULONG b[P256_LIMBS]); +#else void ecp_nistz256_ord_mul_mont(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS], const BN_ULONG b[P256_LIMBS]); +#endif // ecp_nistz256_ord_sqr_mont sets |res| to |a|^(2*|rep|) where inputs and // outputs are in Montgomery form. That is, |res| is // (|a| * 2^-256)^(2*|rep|) * 2^256 mod N. +#if defined(OPENSSL_X86_64) +void ecp_nistz256_ord_sqr_mont_nohw(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], BN_ULONG rep); +void ecp_nistz256_ord_sqr_mont_adx(BN_ULONG res[P256_LIMBS], + const BN_ULONG a[P256_LIMBS], BN_ULONG rep); +#else void ecp_nistz256_ord_sqr_mont(BN_ULONG res[P256_LIMBS], const BN_ULONG a[P256_LIMBS], BN_ULONG rep); +#endif // beeu_mod_inverse_vartime sets out = a^-1 mod p using a Euclidean algorithm. // Assumption: 0 < a < p < 2^(256) and p is odd. @@ -111,27 +140,60 @@ typedef struct { // ecp_nistz256_select_w5 sets |*val| to |in_t[index-1]| if 1 <= |index| <= 16 // and all zeros (the point at infinity) if |index| is 0. This is done in // constant time. +#if defined(OPENSSL_X86_64) +void ecp_nistz256_select_w5_nohw(P256_POINT *val, const P256_POINT in_t[16], + int index); +void ecp_nistz256_select_w5_avx2(P256_POINT *val, const P256_POINT in_t[16], + int index); +#else void ecp_nistz256_select_w5(P256_POINT *val, const P256_POINT in_t[16], int index); +#endif // ecp_nistz256_select_w7 sets |*val| to |in_t[index-1]| if 1 <= |index| <= 64 // and all zeros (the point at infinity) if |index| is 0. This is done in // constant time. 
+#if defined(OPENSSL_X86_64) +void ecp_nistz256_select_w7_nohw(P256_POINT_AFFINE *val, + const P256_POINT_AFFINE in_t[64], int index); +void ecp_nistz256_select_w7_avx2(P256_POINT_AFFINE *val, + const P256_POINT_AFFINE in_t[64], int index); +#else void ecp_nistz256_select_w7(P256_POINT_AFFINE *val, const P256_POINT_AFFINE in_t[64], int index); +#endif // ecp_nistz256_point_double sets |r| to |a| doubled. +#if defined(OPENSSL_X86_64) +void ecp_nistz256_point_double_nohw(P256_POINT *r, const P256_POINT *a); +void ecp_nistz256_point_double_adx(P256_POINT *r, const P256_POINT *a); +#else void ecp_nistz256_point_double(P256_POINT *r, const P256_POINT *a); +#endif // ecp_nistz256_point_add adds |a| to |b| and places the result in |r|. +#if defined(OPENSSL_X86_64) +void ecp_nistz256_point_add_nohw(P256_POINT *r, const P256_POINT *a, + const P256_POINT *b); +void ecp_nistz256_point_add_adx(P256_POINT *r, const P256_POINT *a, + const P256_POINT *b); +#else void ecp_nistz256_point_add(P256_POINT *r, const P256_POINT *a, const P256_POINT *b); +#endif // ecp_nistz256_point_add_affine adds |a| to |b| and places the result in // |r|. |a| and |b| must not represent the same point unless they are both // infinity. 
+#if defined(OPENSSL_X86_64) +void ecp_nistz256_point_add_affine_adx(P256_POINT *r, const P256_POINT *a, + const P256_POINT_AFFINE *b); +void ecp_nistz256_point_add_affine_nohw(P256_POINT *r, const P256_POINT *a, + const P256_POINT_AFFINE *b); +#else void ecp_nistz256_point_add_affine(P256_POINT *r, const P256_POINT *a, const P256_POINT_AFFINE *b); +#endif #endif /* !defined(OPENSSL_NO_ASM) && \ (defined(OPENSSL_X86_64) || defined(OPENSSL_AARCH64)) && \ @@ -139,7 +201,7 @@ void ecp_nistz256_point_add_affine(P256_POINT *r, const P256_POINT *a, #if defined(__cplusplus) -} // extern C++ +} // extern C #endif -#endif // OPENSSL_HEADER_EC_P256_X86_64_H +#endif // OPENSSL_HEADER_CRYPTO_FIPSMODULE_EC_P256_NISTZ_H diff --git a/third_party/boringssl/src/crypto/fipsmodule/ec/p256.c b/third_party/boringssl/src/crypto/fipsmodule/ec/p256.c deleted file mode 100644 index 816e6f1a..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/ec/p256.c +++ /dev/null @@ -1,752 +0,0 @@ -/* Copyright (c) 2020, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -// An implementation of the NIST P-256 elliptic curve point multiplication. -// 256-bit Montgomery form for 64 and 32-bit. Field operations are generated by -// Fiat, which lives in //third_party/fiat. 
- -#include - -#include -#include -#include -#include - -#include -#include - -#include "../../internal.h" -#include "../delocate.h" -#include "./internal.h" - -#if defined(OPENSSL_NO_ASM) -#define FIAT_P256_NO_ASM -#endif - -#if defined(BORINGSSL_HAS_UINT128) -#define BORINGSSL_NISTP256_64BIT 1 -#include "../../../third_party/fiat/p256_64.h" -#else -#include "../../../third_party/fiat/p256_32.h" -#endif - - -// utility functions, handwritten - -#if defined(BORINGSSL_NISTP256_64BIT) -#define FIAT_P256_NLIMBS 4 -typedef uint64_t fiat_p256_limb_t; -typedef uint64_t fiat_p256_felem[FIAT_P256_NLIMBS]; -static const fiat_p256_felem fiat_p256_one = {0x1, 0xffffffff00000000, - 0xffffffffffffffff, 0xfffffffe}; -#else // 64BIT; else 32BIT -#define FIAT_P256_NLIMBS 8 -typedef uint32_t fiat_p256_limb_t; -typedef uint32_t fiat_p256_felem[FIAT_P256_NLIMBS]; -static const fiat_p256_felem fiat_p256_one = { - 0x1, 0x0, 0x0, 0xffffffff, 0xffffffff, 0xffffffff, 0xfffffffe, 0x0}; -#endif // 64BIT - - -static fiat_p256_limb_t fiat_p256_nz( - const fiat_p256_limb_t in1[FIAT_P256_NLIMBS]) { - fiat_p256_limb_t ret; - fiat_p256_nonzero(&ret, in1); - return ret; -} - -static void fiat_p256_copy(fiat_p256_limb_t out[FIAT_P256_NLIMBS], - const fiat_p256_limb_t in1[FIAT_P256_NLIMBS]) { - for (size_t i = 0; i < FIAT_P256_NLIMBS; i++) { - out[i] = in1[i]; - } -} - -static void fiat_p256_cmovznz(fiat_p256_limb_t out[FIAT_P256_NLIMBS], - fiat_p256_limb_t t, - const fiat_p256_limb_t z[FIAT_P256_NLIMBS], - const fiat_p256_limb_t nz[FIAT_P256_NLIMBS]) { - fiat_p256_selectznz(out, !!t, z, nz); -} - -static void fiat_p256_from_words(fiat_p256_felem out, - const BN_ULONG in[32 / sizeof(BN_ULONG)]) { - // Typically, |BN_ULONG| and |fiat_p256_limb_t| will be the same type, but on - // 64-bit platforms without |uint128_t|, they are different. However, on - // little-endian systems, |uint64_t[4]| and |uint32_t[8]| have the same - // layout. 
- OPENSSL_memcpy(out, in, 32); -} - -static void fiat_p256_from_generic(fiat_p256_felem out, const EC_FELEM *in) { - fiat_p256_from_words(out, in->words); -} - -static void fiat_p256_to_generic(EC_FELEM *out, const fiat_p256_felem in) { - // See |fiat_p256_from_words|. - OPENSSL_memcpy(out->words, in, 32); -} - -// fiat_p256_inv_square calculates |out| = |in|^{-2} -// -// Based on Fermat's Little Theorem: -// a^p = a (mod p) -// a^{p-1} = 1 (mod p) -// a^{p-3} = a^{-2} (mod p) -static void fiat_p256_inv_square(fiat_p256_felem out, - const fiat_p256_felem in) { - // This implements the addition chain described in - // https://briansmith.org/ecc-inversion-addition-chains-01#p256_field_inversion - fiat_p256_felem x2, x3, x6, x12, x15, x30, x32; - fiat_p256_square(x2, in); // 2^2 - 2^1 - fiat_p256_mul(x2, x2, in); // 2^2 - 2^0 - - fiat_p256_square(x3, x2); // 2^3 - 2^1 - fiat_p256_mul(x3, x3, in); // 2^3 - 2^0 - - fiat_p256_square(x6, x3); - for (int i = 1; i < 3; i++) { - fiat_p256_square(x6, x6); - } // 2^6 - 2^3 - fiat_p256_mul(x6, x6, x3); // 2^6 - 2^0 - - fiat_p256_square(x12, x6); - for (int i = 1; i < 6; i++) { - fiat_p256_square(x12, x12); - } // 2^12 - 2^6 - fiat_p256_mul(x12, x12, x6); // 2^12 - 2^0 - - fiat_p256_square(x15, x12); - for (int i = 1; i < 3; i++) { - fiat_p256_square(x15, x15); - } // 2^15 - 2^3 - fiat_p256_mul(x15, x15, x3); // 2^15 - 2^0 - - fiat_p256_square(x30, x15); - for (int i = 1; i < 15; i++) { - fiat_p256_square(x30, x30); - } // 2^30 - 2^15 - fiat_p256_mul(x30, x30, x15); // 2^30 - 2^0 - - fiat_p256_square(x32, x30); - fiat_p256_square(x32, x32); // 2^32 - 2^2 - fiat_p256_mul(x32, x32, x2); // 2^32 - 2^0 - - fiat_p256_felem ret; - fiat_p256_square(ret, x32); - for (int i = 1; i < 31 + 1; i++) { - fiat_p256_square(ret, ret); - } // 2^64 - 2^32 - fiat_p256_mul(ret, ret, in); // 2^64 - 2^32 + 2^0 - - for (int i = 0; i < 96 + 32; i++) { - fiat_p256_square(ret, ret); - } // 2^192 - 2^160 + 2^128 - fiat_p256_mul(ret, ret, x32); // 2^192 - 
2^160 + 2^128 + 2^32 - 2^0 - - for (int i = 0; i < 32; i++) { - fiat_p256_square(ret, ret); - } // 2^224 - 2^192 + 2^160 + 2^64 - 2^32 - fiat_p256_mul(ret, ret, x32); // 2^224 - 2^192 + 2^160 + 2^64 - 2^0 - - for (int i = 0; i < 30; i++) { - fiat_p256_square(ret, ret); - } // 2^254 - 2^222 + 2^190 + 2^94 - 2^30 - fiat_p256_mul(ret, ret, x30); // 2^254 - 2^222 + 2^190 + 2^94 - 2^0 - - fiat_p256_square(ret, ret); - fiat_p256_square(out, ret); // 2^256 - 2^224 + 2^192 + 2^96 - 2^2 -} - -// Group operations -// ---------------- -// -// Building on top of the field operations we have the operations on the -// elliptic curve group itself. Points on the curve are represented in Jacobian -// coordinates. -// -// Both operations were transcribed to Coq and proven to correspond to naive -// implementations using Affine coordinates, for all suitable fields. In the -// Coq proofs, issues of constant-time execution and memory layout (aliasing) -// conventions were not considered. Specification of affine coordinates: -// -// As a sanity check, a proof that these points form a commutative group: -// - -// fiat_p256_point_double calculates 2*(x_in, y_in, z_in) -// -// The method is taken from: -// http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b -// -// Coq transcription and correctness proof: -// -// -// -// Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed. -// while x_out == y_in is not (maybe this works, but it's not tested). 
-static void fiat_p256_point_double(fiat_p256_felem x_out, fiat_p256_felem y_out, - fiat_p256_felem z_out, - const fiat_p256_felem x_in, - const fiat_p256_felem y_in, - const fiat_p256_felem z_in) { - fiat_p256_felem delta, gamma, beta, ftmp, ftmp2, tmptmp, alpha, fourbeta; - // delta = z^2 - fiat_p256_square(delta, z_in); - // gamma = y^2 - fiat_p256_square(gamma, y_in); - // beta = x*gamma - fiat_p256_mul(beta, x_in, gamma); - - // alpha = 3*(x-delta)*(x+delta) - fiat_p256_sub(ftmp, x_in, delta); - fiat_p256_add(ftmp2, x_in, delta); - - fiat_p256_add(tmptmp, ftmp2, ftmp2); - fiat_p256_add(ftmp2, ftmp2, tmptmp); - fiat_p256_mul(alpha, ftmp, ftmp2); - - // x' = alpha^2 - 8*beta - fiat_p256_square(x_out, alpha); - fiat_p256_add(fourbeta, beta, beta); - fiat_p256_add(fourbeta, fourbeta, fourbeta); - fiat_p256_add(tmptmp, fourbeta, fourbeta); - fiat_p256_sub(x_out, x_out, tmptmp); - - // z' = (y + z)^2 - gamma - delta - fiat_p256_add(delta, gamma, delta); - fiat_p256_add(ftmp, y_in, z_in); - fiat_p256_square(z_out, ftmp); - fiat_p256_sub(z_out, z_out, delta); - - // y' = alpha*(4*beta - x') - 8*gamma^2 - fiat_p256_sub(y_out, fourbeta, x_out); - fiat_p256_add(gamma, gamma, gamma); - fiat_p256_square(gamma, gamma); - fiat_p256_mul(y_out, alpha, y_out); - fiat_p256_add(gamma, gamma, gamma); - fiat_p256_sub(y_out, y_out, gamma); -} - -// fiat_p256_point_add calculates (x1, y1, z1) + (x2, y2, z2) -// -// The method is taken from: -// http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl, -// adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity). -// -// Coq transcription and correctness proof: -// -// -// -// This function includes a branch for checking whether the two input points -// are equal, (while not equal to the point at infinity). This case never -// happens during single point multiplication, so there is no timing leak for -// ECDH or ECDSA signing. 
-static void fiat_p256_point_add(fiat_p256_felem x3, fiat_p256_felem y3, - fiat_p256_felem z3, const fiat_p256_felem x1, - const fiat_p256_felem y1, - const fiat_p256_felem z1, const int mixed, - const fiat_p256_felem x2, - const fiat_p256_felem y2, - const fiat_p256_felem z2) { - fiat_p256_felem x_out, y_out, z_out; - fiat_p256_limb_t z1nz = fiat_p256_nz(z1); - fiat_p256_limb_t z2nz = fiat_p256_nz(z2); - - // z1z1 = z1z1 = z1**2 - fiat_p256_felem z1z1; - fiat_p256_square(z1z1, z1); - - fiat_p256_felem u1, s1, two_z1z2; - if (!mixed) { - // z2z2 = z2**2 - fiat_p256_felem z2z2; - fiat_p256_square(z2z2, z2); - - // u1 = x1*z2z2 - fiat_p256_mul(u1, x1, z2z2); - - // two_z1z2 = (z1 + z2)**2 - (z1z1 + z2z2) = 2z1z2 - fiat_p256_add(two_z1z2, z1, z2); - fiat_p256_square(two_z1z2, two_z1z2); - fiat_p256_sub(two_z1z2, two_z1z2, z1z1); - fiat_p256_sub(two_z1z2, two_z1z2, z2z2); - - // s1 = y1 * z2**3 - fiat_p256_mul(s1, z2, z2z2); - fiat_p256_mul(s1, s1, y1); - } else { - // We'll assume z2 = 1 (special case z2 = 0 is handled later). 
- - // u1 = x1*z2z2 - fiat_p256_copy(u1, x1); - // two_z1z2 = 2z1z2 - fiat_p256_add(two_z1z2, z1, z1); - // s1 = y1 * z2**3 - fiat_p256_copy(s1, y1); - } - - // u2 = x2*z1z1 - fiat_p256_felem u2; - fiat_p256_mul(u2, x2, z1z1); - - // h = u2 - u1 - fiat_p256_felem h; - fiat_p256_sub(h, u2, u1); - - fiat_p256_limb_t xneq = fiat_p256_nz(h); - - // z_out = two_z1z2 * h - fiat_p256_mul(z_out, h, two_z1z2); - - // z1z1z1 = z1 * z1z1 - fiat_p256_felem z1z1z1; - fiat_p256_mul(z1z1z1, z1, z1z1); - - // s2 = y2 * z1**3 - fiat_p256_felem s2; - fiat_p256_mul(s2, y2, z1z1z1); - - // r = (s2 - s1)*2 - fiat_p256_felem r; - fiat_p256_sub(r, s2, s1); - fiat_p256_add(r, r, r); - - fiat_p256_limb_t yneq = fiat_p256_nz(r); - - fiat_p256_limb_t is_nontrivial_double = constant_time_is_zero_w(xneq | yneq) & - ~constant_time_is_zero_w(z1nz) & - ~constant_time_is_zero_w(z2nz); - if (is_nontrivial_double) { - fiat_p256_point_double(x3, y3, z3, x1, y1, z1); - return; - } - - // I = (2h)**2 - fiat_p256_felem i; - fiat_p256_add(i, h, h); - fiat_p256_square(i, i); - - // J = h * I - fiat_p256_felem j; - fiat_p256_mul(j, h, i); - - // V = U1 * I - fiat_p256_felem v; - fiat_p256_mul(v, u1, i); - - // x_out = r**2 - J - 2V - fiat_p256_square(x_out, r); - fiat_p256_sub(x_out, x_out, j); - fiat_p256_sub(x_out, x_out, v); - fiat_p256_sub(x_out, x_out, v); - - // y_out = r(V-x_out) - 2 * s1 * J - fiat_p256_sub(y_out, v, x_out); - fiat_p256_mul(y_out, y_out, r); - fiat_p256_felem s1j; - fiat_p256_mul(s1j, s1, j); - fiat_p256_sub(y_out, y_out, s1j); - fiat_p256_sub(y_out, y_out, s1j); - - fiat_p256_cmovznz(x_out, z1nz, x2, x_out); - fiat_p256_cmovznz(x3, z2nz, x1, x_out); - fiat_p256_cmovznz(y_out, z1nz, y2, y_out); - fiat_p256_cmovznz(y3, z2nz, y1, y_out); - fiat_p256_cmovznz(z_out, z1nz, z2, z_out); - fiat_p256_cmovznz(z3, z2nz, z1, z_out); -} - -#include "./p256_table.h" - -// fiat_p256_select_point_affine selects the |idx-1|th point from a -// precomputation table and copies it to out. 
If |idx| is zero, the output is -// the point at infinity. -static void fiat_p256_select_point_affine( - const fiat_p256_limb_t idx, size_t size, - const fiat_p256_felem pre_comp[/*size*/][2], fiat_p256_felem out[3]) { - OPENSSL_memset(out, 0, sizeof(fiat_p256_felem) * 3); - for (size_t i = 0; i < size; i++) { - fiat_p256_limb_t mismatch = i ^ (idx - 1); - fiat_p256_cmovznz(out[0], mismatch, pre_comp[i][0], out[0]); - fiat_p256_cmovznz(out[1], mismatch, pre_comp[i][1], out[1]); - } - fiat_p256_cmovznz(out[2], idx, out[2], fiat_p256_one); -} - -// fiat_p256_select_point selects the |idx|th point from a precomputation table -// and copies it to out. -static void fiat_p256_select_point(const fiat_p256_limb_t idx, size_t size, - const fiat_p256_felem pre_comp[/*size*/][3], - fiat_p256_felem out[3]) { - OPENSSL_memset(out, 0, sizeof(fiat_p256_felem) * 3); - for (size_t i = 0; i < size; i++) { - fiat_p256_limb_t mismatch = i ^ idx; - fiat_p256_cmovznz(out[0], mismatch, pre_comp[i][0], out[0]); - fiat_p256_cmovznz(out[1], mismatch, pre_comp[i][1], out[1]); - fiat_p256_cmovznz(out[2], mismatch, pre_comp[i][2], out[2]); - } -} - -// fiat_p256_get_bit returns the |i|th bit in |in|. -static crypto_word_t fiat_p256_get_bit(const EC_SCALAR *in, int i) { - if (i < 0 || i >= 256) { - return 0; - } -#if defined(OPENSSL_64_BIT) - static_assert(sizeof(BN_ULONG) == 8, "BN_ULONG was not 64-bit"); - return (in->words[i >> 6] >> (i & 63)) & 1; -#else - static_assert(sizeof(BN_ULONG) == 4, "BN_ULONG was not 32-bit"); - return (in->words[i >> 5] >> (i & 31)) & 1; -#endif -} - -// OPENSSL EC_METHOD FUNCTIONS - -// Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') = -// (X/Z^2, Y/Z^3). 
-static int ec_GFp_nistp256_point_get_affine_coordinates( - const EC_GROUP *group, const EC_RAW_POINT *point, EC_FELEM *x_out, - EC_FELEM *y_out) { - if (ec_GFp_simple_is_at_infinity(group, point)) { - OPENSSL_PUT_ERROR(EC, EC_R_POINT_AT_INFINITY); - return 0; - } - - fiat_p256_felem z1, z2; - fiat_p256_from_generic(z1, &point->Z); - fiat_p256_inv_square(z2, z1); - - if (x_out != NULL) { - fiat_p256_felem x; - fiat_p256_from_generic(x, &point->X); - fiat_p256_mul(x, x, z2); - fiat_p256_to_generic(x_out, x); - } - - if (y_out != NULL) { - fiat_p256_felem y; - fiat_p256_from_generic(y, &point->Y); - fiat_p256_square(z2, z2); // z^-4 - fiat_p256_mul(y, y, z1); // y * z - fiat_p256_mul(y, y, z2); // y * z^-3 - fiat_p256_to_generic(y_out, y); - } - - return 1; -} - -static void ec_GFp_nistp256_add(const EC_GROUP *group, EC_RAW_POINT *r, - const EC_RAW_POINT *a, const EC_RAW_POINT *b) { - fiat_p256_felem x1, y1, z1, x2, y2, z2; - fiat_p256_from_generic(x1, &a->X); - fiat_p256_from_generic(y1, &a->Y); - fiat_p256_from_generic(z1, &a->Z); - fiat_p256_from_generic(x2, &b->X); - fiat_p256_from_generic(y2, &b->Y); - fiat_p256_from_generic(z2, &b->Z); - fiat_p256_point_add(x1, y1, z1, x1, y1, z1, 0 /* both Jacobian */, x2, y2, - z2); - fiat_p256_to_generic(&r->X, x1); - fiat_p256_to_generic(&r->Y, y1); - fiat_p256_to_generic(&r->Z, z1); -} - -static void ec_GFp_nistp256_dbl(const EC_GROUP *group, EC_RAW_POINT *r, - const EC_RAW_POINT *a) { - fiat_p256_felem x, y, z; - fiat_p256_from_generic(x, &a->X); - fiat_p256_from_generic(y, &a->Y); - fiat_p256_from_generic(z, &a->Z); - fiat_p256_point_double(x, y, z, x, y, z); - fiat_p256_to_generic(&r->X, x); - fiat_p256_to_generic(&r->Y, y); - fiat_p256_to_generic(&r->Z, z); -} - -static void ec_GFp_nistp256_point_mul(const EC_GROUP *group, EC_RAW_POINT *r, - const EC_RAW_POINT *p, - const EC_SCALAR *scalar) { - fiat_p256_felem p_pre_comp[17][3]; - OPENSSL_memset(&p_pre_comp, 0, sizeof(p_pre_comp)); - // Precompute multiples. 
- fiat_p256_from_generic(p_pre_comp[1][0], &p->X); - fiat_p256_from_generic(p_pre_comp[1][1], &p->Y); - fiat_p256_from_generic(p_pre_comp[1][2], &p->Z); - for (size_t j = 2; j <= 16; ++j) { - if (j & 1) { - fiat_p256_point_add(p_pre_comp[j][0], p_pre_comp[j][1], p_pre_comp[j][2], - p_pre_comp[1][0], p_pre_comp[1][1], p_pre_comp[1][2], - 0, p_pre_comp[j - 1][0], p_pre_comp[j - 1][1], - p_pre_comp[j - 1][2]); - } else { - fiat_p256_point_double(p_pre_comp[j][0], p_pre_comp[j][1], - p_pre_comp[j][2], p_pre_comp[j / 2][0], - p_pre_comp[j / 2][1], p_pre_comp[j / 2][2]); - } - } - - // Set nq to the point at infinity. - fiat_p256_felem nq[3] = {{0}, {0}, {0}}, ftmp, tmp[3]; - - // Loop over |scalar| msb-to-lsb, incorporating |p_pre_comp| every 5th round. - int skip = 1; // Save two point operations in the first round. - for (size_t i = 255; i < 256; i--) { - // double - if (!skip) { - fiat_p256_point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]); - } - - // do other additions every 5 doublings - if (i % 5 == 0) { - crypto_word_t bits = fiat_p256_get_bit(scalar, i + 4) << 5; - bits |= fiat_p256_get_bit(scalar, i + 3) << 4; - bits |= fiat_p256_get_bit(scalar, i + 2) << 3; - bits |= fiat_p256_get_bit(scalar, i + 1) << 2; - bits |= fiat_p256_get_bit(scalar, i) << 1; - bits |= fiat_p256_get_bit(scalar, i - 1); - crypto_word_t sign, digit; - ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits); - - // select the point to add or subtract, in constant time. - fiat_p256_select_point((fiat_p256_limb_t)digit, 17, - (const fiat_p256_felem(*)[3])p_pre_comp, tmp); - fiat_p256_opp(ftmp, tmp[1]); // (X, -Y, Z) is the negative point. 
- fiat_p256_cmovznz(tmp[1], (fiat_p256_limb_t)sign, tmp[1], ftmp); - - if (!skip) { - fiat_p256_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], - 0 /* mixed */, tmp[0], tmp[1], tmp[2]); - } else { - fiat_p256_copy(nq[0], tmp[0]); - fiat_p256_copy(nq[1], tmp[1]); - fiat_p256_copy(nq[2], tmp[2]); - skip = 0; - } - } - } - - fiat_p256_to_generic(&r->X, nq[0]); - fiat_p256_to_generic(&r->Y, nq[1]); - fiat_p256_to_generic(&r->Z, nq[2]); -} - -static void ec_GFp_nistp256_point_mul_base(const EC_GROUP *group, - EC_RAW_POINT *r, - const EC_SCALAR *scalar) { - // Set nq to the point at infinity. - fiat_p256_felem nq[3] = {{0}, {0}, {0}}, tmp[3]; - - int skip = 1; // Save two point operations in the first round. - for (size_t i = 31; i < 32; i--) { - if (!skip) { - fiat_p256_point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]); - } - - // First, look 32 bits upwards. - crypto_word_t bits = fiat_p256_get_bit(scalar, i + 224) << 3; - bits |= fiat_p256_get_bit(scalar, i + 160) << 2; - bits |= fiat_p256_get_bit(scalar, i + 96) << 1; - bits |= fiat_p256_get_bit(scalar, i + 32); - // Select the point to add, in constant time. - fiat_p256_select_point_affine((fiat_p256_limb_t)bits, 15, - fiat_p256_g_pre_comp[1], tmp); - - if (!skip) { - fiat_p256_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], - 1 /* mixed */, tmp[0], tmp[1], tmp[2]); - } else { - fiat_p256_copy(nq[0], tmp[0]); - fiat_p256_copy(nq[1], tmp[1]); - fiat_p256_copy(nq[2], tmp[2]); - skip = 0; - } - - // Second, look at the current position. - bits = fiat_p256_get_bit(scalar, i + 192) << 3; - bits |= fiat_p256_get_bit(scalar, i + 128) << 2; - bits |= fiat_p256_get_bit(scalar, i + 64) << 1; - bits |= fiat_p256_get_bit(scalar, i); - // Select the point to add, in constant time. 
- fiat_p256_select_point_affine((fiat_p256_limb_t)bits, 15, - fiat_p256_g_pre_comp[0], tmp); - fiat_p256_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], 1 /* mixed */, - tmp[0], tmp[1], tmp[2]); - } - - fiat_p256_to_generic(&r->X, nq[0]); - fiat_p256_to_generic(&r->Y, nq[1]); - fiat_p256_to_generic(&r->Z, nq[2]); -} - -static void ec_GFp_nistp256_point_mul_public(const EC_GROUP *group, - EC_RAW_POINT *r, - const EC_SCALAR *g_scalar, - const EC_RAW_POINT *p, - const EC_SCALAR *p_scalar) { -#define P256_WSIZE_PUBLIC 4 - // Precompute multiples of |p|. p_pre_comp[i] is (2*i+1) * |p|. - fiat_p256_felem p_pre_comp[1 << (P256_WSIZE_PUBLIC - 1)][3]; - fiat_p256_from_generic(p_pre_comp[0][0], &p->X); - fiat_p256_from_generic(p_pre_comp[0][1], &p->Y); - fiat_p256_from_generic(p_pre_comp[0][2], &p->Z); - fiat_p256_felem p2[3]; - fiat_p256_point_double(p2[0], p2[1], p2[2], p_pre_comp[0][0], - p_pre_comp[0][1], p_pre_comp[0][2]); - for (size_t i = 1; i < OPENSSL_ARRAY_SIZE(p_pre_comp); i++) { - fiat_p256_point_add(p_pre_comp[i][0], p_pre_comp[i][1], p_pre_comp[i][2], - p_pre_comp[i - 1][0], p_pre_comp[i - 1][1], - p_pre_comp[i - 1][2], 0 /* not mixed */, p2[0], p2[1], - p2[2]); - } - - // Set up the coefficients for |p_scalar|. - int8_t p_wNAF[257]; - ec_compute_wNAF(group, p_wNAF, p_scalar, 256, P256_WSIZE_PUBLIC); - - // Set |ret| to the point at infinity. - int skip = 1; // Save some point operations. - fiat_p256_felem ret[3] = {{0}, {0}, {0}}; - for (int i = 256; i >= 0; i--) { - if (!skip) { - fiat_p256_point_double(ret[0], ret[1], ret[2], ret[0], ret[1], ret[2]); - } - - // For the |g_scalar|, we use the precomputed table without the - // constant-time lookup. - if (i <= 31) { - // First, look 32 bits upwards. 
- crypto_word_t bits = fiat_p256_get_bit(g_scalar, i + 224) << 3; - bits |= fiat_p256_get_bit(g_scalar, i + 160) << 2; - bits |= fiat_p256_get_bit(g_scalar, i + 96) << 1; - bits |= fiat_p256_get_bit(g_scalar, i + 32); - if (bits != 0) { - size_t index = (size_t)(bits - 1); - fiat_p256_point_add(ret[0], ret[1], ret[2], ret[0], ret[1], ret[2], - 1 /* mixed */, fiat_p256_g_pre_comp[1][index][0], - fiat_p256_g_pre_comp[1][index][1], - fiat_p256_one); - skip = 0; - } - - // Second, look at the current position. - bits = fiat_p256_get_bit(g_scalar, i + 192) << 3; - bits |= fiat_p256_get_bit(g_scalar, i + 128) << 2; - bits |= fiat_p256_get_bit(g_scalar, i + 64) << 1; - bits |= fiat_p256_get_bit(g_scalar, i); - if (bits != 0) { - size_t index = (size_t)(bits - 1); - fiat_p256_point_add(ret[0], ret[1], ret[2], ret[0], ret[1], ret[2], - 1 /* mixed */, fiat_p256_g_pre_comp[0][index][0], - fiat_p256_g_pre_comp[0][index][1], - fiat_p256_one); - skip = 0; - } - } - - int digit = p_wNAF[i]; - if (digit != 0) { - assert(digit & 1); - size_t idx = (size_t)(digit < 0 ? (-digit) >> 1 : digit >> 1); - fiat_p256_felem *y = &p_pre_comp[idx][1], tmp; - if (digit < 0) { - fiat_p256_opp(tmp, p_pre_comp[idx][1]); - y = &tmp; - } - if (!skip) { - fiat_p256_point_add(ret[0], ret[1], ret[2], ret[0], ret[1], ret[2], - 0 /* not mixed */, p_pre_comp[idx][0], *y, - p_pre_comp[idx][2]); - } else { - fiat_p256_copy(ret[0], p_pre_comp[idx][0]); - fiat_p256_copy(ret[1], *y); - fiat_p256_copy(ret[2], p_pre_comp[idx][2]); - skip = 0; - } - } - } - - fiat_p256_to_generic(&r->X, ret[0]); - fiat_p256_to_generic(&r->Y, ret[1]); - fiat_p256_to_generic(&r->Z, ret[2]); -} - -static int ec_GFp_nistp256_cmp_x_coordinate(const EC_GROUP *group, - const EC_RAW_POINT *p, - const EC_SCALAR *r) { - if (ec_GFp_simple_is_at_infinity(group, p)) { - return 0; - } - - // We wish to compare X/Z^2 with r. This is equivalent to comparing X with - // r*Z^2. 
Note that X and Z are represented in Montgomery form, while r is - // not. - fiat_p256_felem Z2_mont; - fiat_p256_from_generic(Z2_mont, &p->Z); - fiat_p256_mul(Z2_mont, Z2_mont, Z2_mont); - - fiat_p256_felem r_Z2; - fiat_p256_from_words(r_Z2, r->words); // r < order < p, so this is valid. - fiat_p256_mul(r_Z2, r_Z2, Z2_mont); - - fiat_p256_felem X; - fiat_p256_from_generic(X, &p->X); - fiat_p256_from_montgomery(X, X); - - if (OPENSSL_memcmp(&r_Z2, &X, sizeof(r_Z2)) == 0) { - return 1; - } - - // During signing the x coefficient is reduced modulo the group order. - // Therefore there is a small possibility, less than 1/2^128, that group_order - // < p.x < P. in that case we need not only to compare against |r| but also to - // compare against r+group_order. - assert(group->field.width == group->order.width); - if (bn_less_than_words(r->words, group->field_minus_order.words, - group->field.width)) { - // We can ignore the carry because: r + group_order < p < 2^256. - EC_FELEM tmp; - bn_add_words(tmp.words, r->words, group->order.d, group->order.width); - fiat_p256_from_generic(r_Z2, &tmp); - fiat_p256_mul(r_Z2, r_Z2, Z2_mont); - if (OPENSSL_memcmp(&r_Z2, &X, sizeof(r_Z2)) == 0) { - return 1; - } - } - - return 0; -} - -DEFINE_METHOD_FUNCTION(EC_METHOD, EC_GFp_nistp256_method) { - out->group_init = ec_GFp_mont_group_init; - out->group_finish = ec_GFp_mont_group_finish; - out->group_set_curve = ec_GFp_mont_group_set_curve; - out->point_get_affine_coordinates = - ec_GFp_nistp256_point_get_affine_coordinates; - out->add = ec_GFp_nistp256_add; - out->dbl = ec_GFp_nistp256_dbl; - out->mul = ec_GFp_nistp256_point_mul; - out->mul_base = ec_GFp_nistp256_point_mul_base; - out->mul_public = ec_GFp_nistp256_point_mul_public; - out->felem_mul = ec_GFp_mont_felem_mul; - out->felem_sqr = ec_GFp_mont_felem_sqr; - out->felem_to_bytes = ec_GFp_mont_felem_to_bytes; - out->felem_from_bytes = ec_GFp_mont_felem_from_bytes; - out->scalar_inv0_montgomery = ec_simple_scalar_inv0_montgomery; 
- out->scalar_to_montgomery_inv_vartime = - ec_simple_scalar_to_montgomery_inv_vartime; - out->cmp_x_coordinate = ec_GFp_nistp256_cmp_x_coordinate; -} - -#undef BORINGSSL_NISTP256_64BIT diff --git a/third_party/boringssl/src/crypto/fipsmodule/ec/p256.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/ec/p256.cc.inc new file mode 100644 index 00000000..49235c6b --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/ec/p256.cc.inc @@ -0,0 +1,578 @@ +// Copyright 2020 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// An implementation of the NIST P-256 elliptic curve point multiplication. +// 256-bit Montgomery form for 64 and 32-bit. Field operations are generated by +// Fiat, which lives in //third_party/fiat. 
+ +#include + +#include +#include +#include +#include + +#include +#include + +#include + +#include "../../internal.h" +#include "../delocate.h" +#include "./internal.h" + +#include "../../../third_party/fiat/p256_field.c.inc" +#include "../../../third_party/fiat/p256_point.br.c.inc" + + +using namespace bssl; + +// utility functions, handwritten + +#if defined(OPENSSL_64_BIT) +#define FIAT_P256_NLIMBS 4 +typedef uint64_t fiat_p256_limb_t; +typedef uint64_t fiat_p256_felem[FIAT_P256_NLIMBS]; +static const fiat_p256_felem fiat_p256_one = {0x1, 0xffffffff00000000, + 0xffffffffffffffff, 0xfffffffe}; +#else // 64BIT; else 32BIT +#define FIAT_P256_NLIMBS 8 +typedef uint32_t fiat_p256_limb_t; +typedef uint32_t fiat_p256_felem[FIAT_P256_NLIMBS]; +static const fiat_p256_felem fiat_p256_one = { + 0x1, 0x0, 0x0, 0xffffffff, 0xffffffff, 0xffffffff, 0xfffffffe, 0x0}; +#endif // 64BIT + + +static void fiat_p256_copy(fiat_p256_limb_t out[FIAT_P256_NLIMBS], + const fiat_p256_limb_t in1[FIAT_P256_NLIMBS]) { + for (size_t i = 0; i < FIAT_P256_NLIMBS; i++) { + out[i] = in1[i]; + } +} + +static void fiat_p256_cmovznz(fiat_p256_limb_t out[FIAT_P256_NLIMBS], + fiat_p256_limb_t t, + const fiat_p256_limb_t z[FIAT_P256_NLIMBS], + const fiat_p256_limb_t nz[FIAT_P256_NLIMBS]) { + fiat_p256_selectznz(out, !!t, z, nz); +} + +static void fiat_p256_from_words(fiat_p256_felem out, + const BN_ULONG in[32 / sizeof(BN_ULONG)]) { + // Typically, |BN_ULONG| and |fiat_p256_limb_t| will be the same type, but on + // 64-bit platforms without |uint128_t|, they are different. However, on + // little-endian systems, |uint64_t[4]| and |uint32_t[8]| have the same + // layout. + OPENSSL_memcpy(out, in, 32); +} + +static void fiat_p256_from_generic(fiat_p256_felem out, const EC_FELEM *in) { + fiat_p256_from_words(out, in->words); +} + +static void fiat_p256_to_generic(EC_FELEM *out, const fiat_p256_felem in) { + // See |fiat_p256_from_words|. 
+ OPENSSL_memcpy(out->words, in, 32); +} + +// fiat_p256_inv_square calculates |out| = |in|^{-2} +// +// Based on Fermat's Little Theorem: +// a^p = a (mod p) +// a^{p-1} = 1 (mod p) +// a^{p-3} = a^{-2} (mod p) +static void fiat_p256_inv_square(fiat_p256_felem out, + const fiat_p256_felem in) { + // This implements the addition chain described in + // https://briansmith.org/ecc-inversion-addition-chains-01#p256_field_inversion + fiat_p256_felem x2, x3, x6, x12, x15, x30, x32; + fiat_p256_square(x2, in); // 2^2 - 2^1 + fiat_p256_mul(x2, x2, in); // 2^2 - 2^0 + + fiat_p256_square(x3, x2); // 2^3 - 2^1 + fiat_p256_mul(x3, x3, in); // 2^3 - 2^0 + + fiat_p256_square(x6, x3); + for (int i = 1; i < 3; i++) { + fiat_p256_square(x6, x6); + } // 2^6 - 2^3 + fiat_p256_mul(x6, x6, x3); // 2^6 - 2^0 + + fiat_p256_square(x12, x6); + for (int i = 1; i < 6; i++) { + fiat_p256_square(x12, x12); + } // 2^12 - 2^6 + fiat_p256_mul(x12, x12, x6); // 2^12 - 2^0 + + fiat_p256_square(x15, x12); + for (int i = 1; i < 3; i++) { + fiat_p256_square(x15, x15); + } // 2^15 - 2^3 + fiat_p256_mul(x15, x15, x3); // 2^15 - 2^0 + + fiat_p256_square(x30, x15); + for (int i = 1; i < 15; i++) { + fiat_p256_square(x30, x30); + } // 2^30 - 2^15 + fiat_p256_mul(x30, x30, x15); // 2^30 - 2^0 + + fiat_p256_square(x32, x30); + fiat_p256_square(x32, x32); // 2^32 - 2^2 + fiat_p256_mul(x32, x32, x2); // 2^32 - 2^0 + + fiat_p256_felem ret; + fiat_p256_square(ret, x32); + for (int i = 1; i < 31 + 1; i++) { + fiat_p256_square(ret, ret); + } // 2^64 - 2^32 + fiat_p256_mul(ret, ret, in); // 2^64 - 2^32 + 2^0 + + for (int i = 0; i < 96 + 32; i++) { + fiat_p256_square(ret, ret); + } // 2^192 - 2^160 + 2^128 + fiat_p256_mul(ret, ret, x32); // 2^192 - 2^160 + 2^128 + 2^32 - 2^0 + + for (int i = 0; i < 32; i++) { + fiat_p256_square(ret, ret); + } // 2^224 - 2^192 + 2^160 + 2^64 - 2^32 + fiat_p256_mul(ret, ret, x32); // 2^224 - 2^192 + 2^160 + 2^64 - 2^0 + + for (int i = 0; i < 30; i++) { + fiat_p256_square(ret, ret); + 
} // 2^254 - 2^222 + 2^190 + 2^94 - 2^30 + fiat_p256_mul(ret, ret, x30); // 2^254 - 2^222 + 2^190 + 2^94 - 2^0 + + fiat_p256_square(ret, ret); + fiat_p256_square(out, ret); // 2^256 - 2^224 + 2^192 + 2^96 - 2^2 +} + +// Group operations +// ---------------- +// +// Building on top of the field operations we have the operations on the +// elliptic curve group itself. Points on the curve are represented in Jacobian +// coordinates. + +static void fiat_p256_point_double(fiat_p256_felem x_out, fiat_p256_felem y_out, + fiat_p256_felem z_out, + const fiat_p256_felem x_in, + const fiat_p256_felem y_in, + const fiat_p256_felem z_in) { + uint8_t out[3*32], in[3*32]; + static_assert(sizeof(fiat_p256_felem) == 32); + OPENSSL_memcpy(&in[0], x_in, 32); + OPENSSL_memcpy(&in[32], y_in, 32); + OPENSSL_memcpy(&in[64], z_in, 32); + p256_point_double((br_word_t)out, (br_word_t)in); + OPENSSL_memcpy(x_out, &out[0], 32); + OPENSSL_memcpy(y_out, &out[32], 32); + OPENSSL_memcpy(z_out, &out[64], 32); +} + +static void fiat_p256_point_add(fiat_p256_felem x3, fiat_p256_felem y3, + fiat_p256_felem z3, const fiat_p256_felem x1, + const fiat_p256_felem y1, + const fiat_p256_felem z1, + const fiat_p256_felem x2, + const fiat_p256_felem y2, + const fiat_p256_felem z2) { + uint8_t out[3 * 32], in1[3 * 32], in2[3 * 32]; + static_assert(sizeof(fiat_p256_felem) == 32); + OPENSSL_memcpy(&in1[0], x1, 32); + OPENSSL_memcpy(&in1[32], y1, 32); + OPENSSL_memcpy(&in1[64], z1, 32); + OPENSSL_memcpy(&in2[0], x2, 32); + OPENSSL_memcpy(&in2[32], y2, 32); + OPENSSL_memcpy(&in2[64], z2, 32); + p256_point_add_vartime_if_doubling((br_word_t)out, (br_word_t)in1, + (br_word_t)in2); + OPENSSL_memcpy(x3, &out[0], 32); + OPENSSL_memcpy(y3, &out[32], 32); + OPENSSL_memcpy(z3, &out[64], 32); +} +#include "./p256_table.h" + +// fiat_p256_select_point_affine selects the |idx-1|th point from a +// precomputation table and copies it to out. If |idx| is zero, the output is +// the point at infinity. 
+static void fiat_p256_select_point_affine( + const fiat_p256_limb_t idx, size_t size, + const fiat_p256_felem pre_comp[/*size*/][2], fiat_p256_felem out[3]) { + OPENSSL_memset(out, 0, sizeof(fiat_p256_felem) * 3); + for (size_t i = 0; i < size; i++) { + fiat_p256_limb_t mismatch = i ^ (idx - 1); + fiat_p256_cmovznz(out[0], mismatch, pre_comp[i][0], out[0]); + fiat_p256_cmovznz(out[1], mismatch, pre_comp[i][1], out[1]); + } + fiat_p256_cmovznz(out[2], idx, out[2], fiat_p256_one); +} + +// fiat_p256_select_point selects the |idx|th point from a precomputation table +// and copies it to out. +static void fiat_p256_select_point(const fiat_p256_limb_t idx, size_t size, + const fiat_p256_felem pre_comp[/*size*/][3], + fiat_p256_felem out[3]) { + OPENSSL_memset(out, 0, sizeof(fiat_p256_felem) * 3); + for (size_t i = 0; i < size; i++) { + fiat_p256_limb_t mismatch = i ^ idx; + fiat_p256_cmovznz(out[0], mismatch, pre_comp[i][0], out[0]); + fiat_p256_cmovznz(out[1], mismatch, pre_comp[i][1], out[1]); + fiat_p256_cmovznz(out[2], mismatch, pre_comp[i][2], out[2]); + } +} + +// fiat_p256_get_bit returns the |i|th bit in |in|. +static crypto_word_t fiat_p256_get_bit(const EC_SCALAR *in, int i) { + if (i < 0 || i >= 256) { + return 0; + } +#if defined(OPENSSL_64_BIT) + static_assert(sizeof(BN_ULONG) == 8, "BN_ULONG was not 64-bit"); + return (in->words[i >> 6] >> (i & 63)) & 1; +#else + static_assert(sizeof(BN_ULONG) == 4, "BN_ULONG was not 32-bit"); + return (in->words[i >> 5] >> (i & 31)) & 1; +#endif +} + +// OPENSSL EC_METHOD FUNCTIONS + +// Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') = +// (X/Z^2, Y/Z^3). 
+static int ec_GFp_nistp256_point_get_affine_coordinates( + const EC_GROUP *group, const EC_JACOBIAN *point, EC_FELEM *x_out, + EC_FELEM *y_out) { + if (constant_time_declassify_int( + ec_GFp_simple_is_at_infinity(group, point))) { + OPENSSL_PUT_ERROR(EC, EC_R_POINT_AT_INFINITY); + return 0; + } + + fiat_p256_felem z1, z2; + fiat_p256_from_generic(z1, &point->Z); + fiat_p256_inv_square(z2, z1); + + if (x_out != nullptr) { + fiat_p256_felem x; + fiat_p256_from_generic(x, &point->X); + fiat_p256_mul(x, x, z2); + fiat_p256_to_generic(x_out, x); + } + + if (y_out != nullptr) { + fiat_p256_felem y; + fiat_p256_from_generic(y, &point->Y); + fiat_p256_square(z2, z2); // z^-4 + fiat_p256_mul(y, y, z1); // y * z + fiat_p256_mul(y, y, z2); // y * z^-3 + fiat_p256_to_generic(y_out, y); + } + + return 1; +} + +static void ec_GFp_nistp256_add(const EC_GROUP *group, EC_JACOBIAN *r, + const EC_JACOBIAN *a, const EC_JACOBIAN *b) { + fiat_p256_felem x1, y1, z1, x2, y2, z2; + fiat_p256_from_generic(x1, &a->X); + fiat_p256_from_generic(y1, &a->Y); + fiat_p256_from_generic(z1, &a->Z); + fiat_p256_from_generic(x2, &b->X); + fiat_p256_from_generic(y2, &b->Y); + fiat_p256_from_generic(z2, &b->Z); + fiat_p256_point_add(x1, y1, z1, x1, y1, z1, x2, y2, z2); + fiat_p256_to_generic(&r->X, x1); + fiat_p256_to_generic(&r->Y, y1); + fiat_p256_to_generic(&r->Z, z1); +} + +static void ec_GFp_nistp256_dbl(const EC_GROUP *group, EC_JACOBIAN *r, + const EC_JACOBIAN *a) { + fiat_p256_felem x, y, z; + fiat_p256_from_generic(x, &a->X); + fiat_p256_from_generic(y, &a->Y); + fiat_p256_from_generic(z, &a->Z); + fiat_p256_point_double(x, y, z, x, y, z); + fiat_p256_to_generic(&r->X, x); + fiat_p256_to_generic(&r->Y, y); + fiat_p256_to_generic(&r->Z, z); +} + +static void ec_GFp_nistp256_point_mul(const EC_GROUP *group, EC_JACOBIAN *r, + const EC_JACOBIAN *p, + const EC_SCALAR *scalar) { + fiat_p256_felem p_pre_comp[17][3]; + OPENSSL_memset(&p_pre_comp, 0, sizeof(p_pre_comp)); + // Precompute multiples. 
+ fiat_p256_from_generic(p_pre_comp[1][0], &p->X); + fiat_p256_from_generic(p_pre_comp[1][1], &p->Y); + fiat_p256_from_generic(p_pre_comp[1][2], &p->Z); + for (size_t j = 2; j <= 16; ++j) { + if (j & 1) { + fiat_p256_point_add(p_pre_comp[j][0], p_pre_comp[j][1], p_pre_comp[j][2], + p_pre_comp[1][0], p_pre_comp[1][1], p_pre_comp[1][2], + p_pre_comp[j - 1][0], p_pre_comp[j - 1][1], + p_pre_comp[j - 1][2]); + } else { + fiat_p256_point_double(p_pre_comp[j][0], p_pre_comp[j][1], + p_pre_comp[j][2], p_pre_comp[j / 2][0], + p_pre_comp[j / 2][1], p_pre_comp[j / 2][2]); + } + } + + // Set nq to the point at infinity. + fiat_p256_felem nq[3] = {{0}, {0}, {0}}, ftmp, tmp[3]; + + // Loop over |scalar| msb-to-lsb, incorporating |p_pre_comp| every 5th round. + int skip = 1; // Save two point operations in the first round. + for (size_t i = 255; i < 256; i--) { + // double + if (!skip) { + fiat_p256_point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]); + } + + // do other additions every 5 doublings + if (i % 5 == 0) { + crypto_word_t bits = fiat_p256_get_bit(scalar, i + 4) << 5; + bits |= fiat_p256_get_bit(scalar, i + 3) << 4; + bits |= fiat_p256_get_bit(scalar, i + 2) << 3; + bits |= fiat_p256_get_bit(scalar, i + 1) << 2; + bits |= fiat_p256_get_bit(scalar, i) << 1; + bits |= fiat_p256_get_bit(scalar, i - 1); + crypto_word_t sign, digit; + ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits); + + // select the point to add or subtract, in constant time. + fiat_p256_select_point((fiat_p256_limb_t)digit, 17, + (const fiat_p256_felem(*)[3])p_pre_comp, tmp); + fiat_p256_opp(ftmp, tmp[1]); // (X, -Y, Z) is the negative point. 
+ fiat_p256_cmovznz(tmp[1], (fiat_p256_limb_t)sign, tmp[1], ftmp); + + if (!skip) { + fiat_p256_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], tmp[0], + tmp[1], tmp[2]); + } else { + fiat_p256_copy(nq[0], tmp[0]); + fiat_p256_copy(nq[1], tmp[1]); + fiat_p256_copy(nq[2], tmp[2]); + skip = 0; + } + } + } + + fiat_p256_to_generic(&r->X, nq[0]); + fiat_p256_to_generic(&r->Y, nq[1]); + fiat_p256_to_generic(&r->Z, nq[2]); +} + +static void ec_GFp_nistp256_point_mul_base(const EC_GROUP *group, + EC_JACOBIAN *r, + const EC_SCALAR *scalar) { + // Set nq to the point at infinity. + fiat_p256_felem nq[3] = {{0}, {0}, {0}}, tmp[3]; + + int skip = 1; // Save two point operations in the first round. + for (size_t i = 31; i < 32; i--) { + if (!skip) { + fiat_p256_point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]); + } + + // First, look 32 bits upwards. + crypto_word_t bits = fiat_p256_get_bit(scalar, i + 224) << 3; + bits |= fiat_p256_get_bit(scalar, i + 160) << 2; + bits |= fiat_p256_get_bit(scalar, i + 96) << 1; + bits |= fiat_p256_get_bit(scalar, i + 32); + // Select the point to add, in constant time. + fiat_p256_select_point_affine((fiat_p256_limb_t)bits, 15, + fiat_p256_g_pre_comp[1], tmp); + + if (!skip) { + fiat_p256_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], tmp[0], + tmp[1], tmp[2]); + } else { + fiat_p256_copy(nq[0], tmp[0]); + fiat_p256_copy(nq[1], tmp[1]); + fiat_p256_copy(nq[2], tmp[2]); + skip = 0; + } + + // Second, look at the current position. + bits = fiat_p256_get_bit(scalar, i + 192) << 3; + bits |= fiat_p256_get_bit(scalar, i + 128) << 2; + bits |= fiat_p256_get_bit(scalar, i + 64) << 1; + bits |= fiat_p256_get_bit(scalar, i); + // Select the point to add, in constant time. 
+ fiat_p256_select_point_affine((fiat_p256_limb_t)bits, 15, + fiat_p256_g_pre_comp[0], tmp); + fiat_p256_point_add(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2], tmp[0], + tmp[1], tmp[2]); + } + + fiat_p256_to_generic(&r->X, nq[0]); + fiat_p256_to_generic(&r->Y, nq[1]); + fiat_p256_to_generic(&r->Z, nq[2]); +} + +static void ec_GFp_nistp256_point_mul_public(const EC_GROUP *group, + EC_JACOBIAN *r, + const EC_SCALAR *g_scalar, + const EC_JACOBIAN *p, + const EC_SCALAR *p_scalar) { +#define P256_WSIZE_PUBLIC 4 + // Precompute multiples of |p|. p_pre_comp[i] is (2*i+1) * |p|. + fiat_p256_felem p_pre_comp[1 << (P256_WSIZE_PUBLIC - 1)][3]; + fiat_p256_from_generic(p_pre_comp[0][0], &p->X); + fiat_p256_from_generic(p_pre_comp[0][1], &p->Y); + fiat_p256_from_generic(p_pre_comp[0][2], &p->Z); + fiat_p256_felem p2[3]; + fiat_p256_point_double(p2[0], p2[1], p2[2], p_pre_comp[0][0], + p_pre_comp[0][1], p_pre_comp[0][2]); + for (size_t i = 1; i < std::size(p_pre_comp); i++) { + fiat_p256_point_add(p_pre_comp[i][0], p_pre_comp[i][1], p_pre_comp[i][2], + p_pre_comp[i - 1][0], p_pre_comp[i - 1][1], + p_pre_comp[i - 1][2], p2[0], p2[1], p2[2]); + } + + // Set up the coefficients for |p_scalar|. + int8_t p_wNAF[257]; + ec_compute_wNAF(group, p_wNAF, p_scalar, 256, P256_WSIZE_PUBLIC); + + // Set |ret| to the point at infinity. + int skip = 1; // Save some point operations. + fiat_p256_felem ret[3] = {{0}, {0}, {0}}; + for (int i = 256; i >= 0; i--) { + if (!skip) { + fiat_p256_point_double(ret[0], ret[1], ret[2], ret[0], ret[1], ret[2]); + } + + // For the |g_scalar|, we use the precomputed table without the + // constant-time lookup. + if (i <= 31) { + // First, look 32 bits upwards. 
+ crypto_word_t bits = fiat_p256_get_bit(g_scalar, i + 224) << 3; + bits |= fiat_p256_get_bit(g_scalar, i + 160) << 2; + bits |= fiat_p256_get_bit(g_scalar, i + 96) << 1; + bits |= fiat_p256_get_bit(g_scalar, i + 32); + if (bits != 0) { + size_t index = (size_t)(bits - 1); + fiat_p256_point_add(ret[0], ret[1], ret[2], ret[0], ret[1], ret[2], + fiat_p256_g_pre_comp[1][index][0], + fiat_p256_g_pre_comp[1][index][1], fiat_p256_one); + skip = 0; + } + + // Second, look at the current position. + bits = fiat_p256_get_bit(g_scalar, i + 192) << 3; + bits |= fiat_p256_get_bit(g_scalar, i + 128) << 2; + bits |= fiat_p256_get_bit(g_scalar, i + 64) << 1; + bits |= fiat_p256_get_bit(g_scalar, i); + if (bits != 0) { + size_t index = (size_t)(bits - 1); + fiat_p256_point_add(ret[0], ret[1], ret[2], ret[0], ret[1], ret[2], + fiat_p256_g_pre_comp[0][index][0], + fiat_p256_g_pre_comp[0][index][1], fiat_p256_one); + skip = 0; + } + } + + int digit = p_wNAF[i]; + if (digit != 0) { + assert(digit & 1); + size_t idx = (size_t)(digit < 0 ? (-digit) >> 1 : digit >> 1); + fiat_p256_felem *y = &p_pre_comp[idx][1], tmp; + if (digit < 0) { + fiat_p256_opp(tmp, p_pre_comp[idx][1]); + y = &tmp; + } + if (!skip) { + fiat_p256_point_add(ret[0], ret[1], ret[2], ret[0], ret[1], ret[2], + p_pre_comp[idx][0], *y, p_pre_comp[idx][2]); + } else { + fiat_p256_copy(ret[0], p_pre_comp[idx][0]); + fiat_p256_copy(ret[1], *y); + fiat_p256_copy(ret[2], p_pre_comp[idx][2]); + skip = 0; + } + } + } + + fiat_p256_to_generic(&r->X, ret[0]); + fiat_p256_to_generic(&r->Y, ret[1]); + fiat_p256_to_generic(&r->Z, ret[2]); +} + +static int ec_GFp_nistp256_cmp_x_coordinate(const EC_GROUP *group, + const EC_JACOBIAN *p, + const EC_SCALAR *r) { + if (ec_GFp_simple_is_at_infinity(group, p)) { + return 0; + } + + // We wish to compare X/Z^2 with r. This is equivalent to comparing X with + // r*Z^2. Note that X and Z are represented in Montgomery form, while r is + // not. 
+ fiat_p256_felem Z2_mont; + fiat_p256_from_generic(Z2_mont, &p->Z); + fiat_p256_mul(Z2_mont, Z2_mont, Z2_mont); + + fiat_p256_felem r_Z2; + fiat_p256_from_words(r_Z2, r->words); // r < order < p, so this is valid. + fiat_p256_mul(r_Z2, r_Z2, Z2_mont); + + fiat_p256_felem X; + fiat_p256_from_generic(X, &p->X); + fiat_p256_from_montgomery(X, X); + + if (OPENSSL_memcmp(&r_Z2, &X, sizeof(r_Z2)) == 0) { + return 1; + } + + // During signing the x coefficient is reduced modulo the group order. + // Therefore there is a small possibility, less than 1/2^128, that group_order + // < p.x < P. in that case we need not only to compare against |r| but also to + // compare against r+group_order. + assert(group->field.N.width == group->order.N.width); + EC_FELEM tmp; + BN_ULONG carry = + bn_add_words(tmp.words, r->words, group->order.N.d, group->field.N.width); + if (carry == 0 && + bn_less_than_words(tmp.words, group->field.N.d, group->field.N.width)) { + fiat_p256_from_generic(r_Z2, &tmp); + fiat_p256_mul(r_Z2, r_Z2, Z2_mont); + if (OPENSSL_memcmp(&r_Z2, &X, sizeof(r_Z2)) == 0) { + return 1; + } + } + + return 0; +} + +BSSL_NAMESPACE_BEGIN + +DEFINE_METHOD_FUNCTION(EC_METHOD, EC_GFp_nistp256_method) { + out->point_get_affine_coordinates = + ec_GFp_nistp256_point_get_affine_coordinates; + out->add = ec_GFp_nistp256_add; + out->dbl = ec_GFp_nistp256_dbl; + out->mul = ec_GFp_nistp256_point_mul; + out->mul_base = ec_GFp_nistp256_point_mul_base; + out->mul_public = ec_GFp_nistp256_point_mul_public; + out->scalar_inv0_montgomery = ec_simple_scalar_inv0_montgomery; + out->scalar_to_montgomery_inv_vartime = + ec_simple_scalar_to_montgomery_inv_vartime; + out->cmp_x_coordinate = ec_GFp_nistp256_cmp_x_coordinate; +} + +BSSL_NAMESPACE_END diff --git a/third_party/boringssl/src/crypto/fipsmodule/ec/p256_table.h b/third_party/boringssl/src/crypto/fipsmodule/ec/p256_table.h index 14129a36..e16eabaa 100644 --- a/third_party/boringssl/src/crypto/fipsmodule/ec/p256_table.h +++ 
b/third_party/boringssl/src/crypto/fipsmodule/ec/p256_table.h @@ -1,16 +1,16 @@ -/* Copyright (c) 2020, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ +// Copyright 2020 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. // This file is generated by make_tables.go. @@ -50,7 +50,7 @@ // Tables for other points have table[i] = iG for i in 0 .. 16. 
// fiat_p256_g_pre_comp is the table of precomputed base points -#if defined(BORINGSSL_NISTP256_64BIT) +#if defined(OPENSSL_64_BIT) static const fiat_p256_felem fiat_p256_g_pre_comp[2][15][2] = { {{{0x79e730d418a9143c, 0x75ba95fc5fedb601, 0x79fb732b77622510, 0x18905f76a53755c6}, diff --git a/third_party/boringssl/src/crypto/fipsmodule/ec/scalar.c b/third_party/boringssl/src/crypto/fipsmodule/ec/scalar.c deleted file mode 100644 index 036049e0..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/ec/scalar.c +++ /dev/null @@ -1,169 +0,0 @@ -/* Copyright (c) 2018, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ - -#include -#include -#include - -#include "internal.h" -#include "../bn/internal.h" -#include "../../internal.h" - - -int ec_bignum_to_scalar(const EC_GROUP *group, EC_SCALAR *out, - const BIGNUM *in) { - if (!bn_copy_words(out->words, group->order.width, in) || - !bn_less_than_words(out->words, group->order.d, group->order.width)) { - OPENSSL_PUT_ERROR(EC, EC_R_INVALID_SCALAR); - return 0; - } - return 1; -} - -int ec_scalar_equal_vartime(const EC_GROUP *group, const EC_SCALAR *a, - const EC_SCALAR *b) { - return OPENSSL_memcmp(a->words, b->words, - group->order.width * sizeof(BN_ULONG)) == 0; -} - -int ec_scalar_is_zero(const EC_GROUP *group, const EC_SCALAR *a) { - BN_ULONG mask = 0; - for (int i = 0; i < group->order.width; i++) { - mask |= a->words[i]; - } - return mask == 0; -} - -int ec_random_nonzero_scalar(const EC_GROUP *group, EC_SCALAR *out, - const uint8_t additional_data[32]) { - return bn_rand_range_words(out->words, 1, group->order.d, group->order.width, - additional_data); -} - -void ec_scalar_to_bytes(const EC_GROUP *group, uint8_t *out, size_t *out_len, - const EC_SCALAR *in) { - size_t len = BN_num_bytes(&group->order); - bn_words_to_big_endian(out, len, in->words, group->order.width); - *out_len = len; -} - -int ec_scalar_from_bytes(const EC_GROUP *group, EC_SCALAR *out, - const uint8_t *in, size_t len) { - if (len != BN_num_bytes(&group->order)) { - OPENSSL_PUT_ERROR(EC, EC_R_INVALID_SCALAR); - return 0; - } - - bn_big_endian_to_words(out->words, group->order.width, in, len); - - if (!bn_less_than_words(out->words, group->order.d, group->order.width)) { - OPENSSL_PUT_ERROR(EC, EC_R_INVALID_SCALAR); - return 0; - } - - return 1; -} - -void ec_scalar_reduce(const EC_GROUP *group, EC_SCALAR *out, - const BN_ULONG *words, size_t num) { - // Convert "from" Montgomery form so the value is reduced modulo the order. 
- bn_from_montgomery_small(out->words, group->order.width, words, num, - group->order_mont); - // Convert "to" Montgomery form to remove the R^-1 factor added. - ec_scalar_to_montgomery(group, out, out); -} - -void ec_scalar_add(const EC_GROUP *group, EC_SCALAR *r, const EC_SCALAR *a, - const EC_SCALAR *b) { - const BIGNUM *order = &group->order; - BN_ULONG tmp[EC_MAX_WORDS]; - bn_mod_add_words(r->words, a->words, b->words, order->d, tmp, order->width); - OPENSSL_cleanse(tmp, sizeof(tmp)); -} - -void ec_scalar_sub(const EC_GROUP *group, EC_SCALAR *r, const EC_SCALAR *a, - const EC_SCALAR *b) { - const BIGNUM *order = &group->order; - BN_ULONG tmp[EC_MAX_WORDS]; - bn_mod_sub_words(r->words, a->words, b->words, order->d, tmp, order->width); - OPENSSL_cleanse(tmp, sizeof(tmp)); -} - -void ec_scalar_neg(const EC_GROUP *group, EC_SCALAR *r, const EC_SCALAR *a) { - EC_SCALAR zero; - OPENSSL_memset(&zero, 0, sizeof(EC_SCALAR)); - ec_scalar_sub(group, r, &zero, a); -} - -void ec_scalar_select(const EC_GROUP *group, EC_SCALAR *out, BN_ULONG mask, - const EC_SCALAR *a, const EC_SCALAR *b) { - const BIGNUM *order = &group->order; - bn_select_words(out->words, mask, a->words, b->words, order->width); -} - -void ec_scalar_to_montgomery(const EC_GROUP *group, EC_SCALAR *r, - const EC_SCALAR *a) { - const BIGNUM *order = &group->order; - bn_to_montgomery_small(r->words, a->words, order->width, group->order_mont); -} - -void ec_scalar_from_montgomery(const EC_GROUP *group, EC_SCALAR *r, - const EC_SCALAR *a) { - const BIGNUM *order = &group->order; - bn_from_montgomery_small(r->words, order->width, a->words, order->width, - group->order_mont); -} - -void ec_scalar_mul_montgomery(const EC_GROUP *group, EC_SCALAR *r, - const EC_SCALAR *a, const EC_SCALAR *b) { - const BIGNUM *order = &group->order; - bn_mod_mul_montgomery_small(r->words, a->words, b->words, order->width, - group->order_mont); -} - -void ec_simple_scalar_inv0_montgomery(const EC_GROUP *group, EC_SCALAR *r, - const 
EC_SCALAR *a) { - const BIGNUM *order = &group->order; - bn_mod_inverse0_prime_mont_small(r->words, a->words, order->width, - group->order_mont); -} - -int ec_simple_scalar_to_montgomery_inv_vartime(const EC_GROUP *group, - EC_SCALAR *r, - const EC_SCALAR *a) { - if (ec_scalar_is_zero(group, a)) { - return 0; - } - - // This implementation (in fact) runs in constant time, - // even though for this interface it is not mandatory. - - // r = a^-1 in the Montgomery domain. This is - // |ec_scalar_to_montgomery| followed by |ec_scalar_inv0_montgomery|, but - // |ec_scalar_inv0_montgomery| followed by |ec_scalar_from_montgomery| is - // equivalent and slightly more efficient. - ec_scalar_inv0_montgomery(group, r, a); - ec_scalar_from_montgomery(group, r, r); - return 1; -} - -void ec_scalar_inv0_montgomery(const EC_GROUP *group, EC_SCALAR *r, - const EC_SCALAR *a) { - group->meth->scalar_inv0_montgomery(group, r, a); -} - -int ec_scalar_to_montgomery_inv_vartime(const EC_GROUP *group, EC_SCALAR *r, - const EC_SCALAR *a) { - return group->meth->scalar_to_montgomery_inv_vartime(group, r, a); -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/ec/scalar.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/ec/scalar.cc.inc new file mode 100644 index 00000000..95813258 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/ec/scalar.cc.inc @@ -0,0 +1,184 @@ +// Copyright 2018 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include + +#include "../../internal.h" +#include "../bn/internal.h" +#include "internal.h" + + +using namespace bssl; + +int bssl::ec_bignum_to_scalar(const EC_GROUP *group, EC_SCALAR *out, + const BIGNUM *in) { + // Scalars, which are often secret, must be reduced modulo the order. Those + // that are not will be discarded, so leaking the result of the comparison is + // safe. + if (!bn_copy_words(out->words, group->order.N.width, in) || + !constant_time_declassify_int(bn_less_than_words( + out->words, group->order.N.d, group->order.N.width))) { + OPENSSL_PUT_ERROR(EC, EC_R_INVALID_SCALAR); + return 0; + } + return 1; +} + +int bssl::ec_scalar_equal_vartime(const EC_GROUP *group, const EC_SCALAR *a, + const EC_SCALAR *b) { + return OPENSSL_memcmp(a->words, b->words, + group->order.N.width * sizeof(BN_ULONG)) == 0; +} + +int bssl::ec_scalar_is_zero(const EC_GROUP *group, const EC_SCALAR *a) { + BN_ULONG mask = 0; + for (int i = 0; i < group->order.N.width; i++) { + mask |= a->words[i]; + } + return mask == 0; +} + +int bssl::ec_random_scalar(const EC_GROUP *group, EC_SCALAR *out, + const uint8_t additional_data[32]) { + return bn_rand_range_words(out->words, 0, group->order.N.d, + group->order.N.width, additional_data); +} + +int bssl::ec_random_nonzero_scalar(const EC_GROUP *group, EC_SCALAR *out, + const uint8_t additional_data[32]) { + return bn_rand_range_words(out->words, 1, group->order.N.d, + group->order.N.width, additional_data); +} + +void bssl::ec_scalar_to_bytes(const EC_GROUP *group, uint8_t *out, + size_t *out_len, const EC_SCALAR *in) { + size_t len = BN_num_bytes(&group->order.N); + bn_words_to_big_endian(out, len, in->words, group->order.N.width); + *out_len = len; +} + +int bssl::ec_scalar_from_bytes(const EC_GROUP *group, EC_SCALAR *out, + const uint8_t *in, size_t len) { + if (len != BN_num_bytes(&group->order.N)) { + OPENSSL_PUT_ERROR(EC, EC_R_INVALID_SCALAR); + return 0; + } + + bn_big_endian_to_words(out->words, 
group->order.N.width, in, len); + + if (!bn_less_than_words(out->words, group->order.N.d, group->order.N.width)) { + OPENSSL_PUT_ERROR(EC, EC_R_INVALID_SCALAR); + return 0; + } + + return 1; +} + +void bssl::ec_scalar_reduce(const EC_GROUP *group, EC_SCALAR *out, + const BN_ULONG *words, size_t num) { + // Convert "from" Montgomery form so the value is reduced modulo the order. + bn_from_montgomery_small(out->words, group->order.N.width, words, num, + &group->order); + // Convert "to" Montgomery form to remove the R^-1 factor added. + ec_scalar_to_montgomery(group, out, out); +} + +void bssl::ec_scalar_add(const EC_GROUP *group, EC_SCALAR *r, + const EC_SCALAR *a, const EC_SCALAR *b) { + const BIGNUM *order = &group->order.N; + BN_ULONG tmp[EC_MAX_WORDS]; + bn_mod_add_words(r->words, a->words, b->words, order->d, tmp, order->width); + OPENSSL_cleanse(tmp, sizeof(tmp)); +} + +void bssl::ec_scalar_sub(const EC_GROUP *group, EC_SCALAR *r, + const EC_SCALAR *a, const EC_SCALAR *b) { + const BIGNUM *order = &group->order.N; + BN_ULONG tmp[EC_MAX_WORDS]; + bn_mod_sub_words(r->words, a->words, b->words, order->d, tmp, order->width); + OPENSSL_cleanse(tmp, sizeof(tmp)); +} + +void bssl::ec_scalar_neg(const EC_GROUP *group, EC_SCALAR *r, + const EC_SCALAR *a) { + EC_SCALAR zero; + OPENSSL_memset(&zero, 0, sizeof(EC_SCALAR)); + ec_scalar_sub(group, r, &zero, a); +} + +void bssl::ec_scalar_select(const EC_GROUP *group, EC_SCALAR *out, + BN_ULONG mask, const EC_SCALAR *a, + const EC_SCALAR *b) { + const BIGNUM *order = &group->order.N; + bn_select_words(out->words, mask, a->words, b->words, order->width); +} + +void bssl::ec_scalar_to_montgomery(const EC_GROUP *group, EC_SCALAR *r, + const EC_SCALAR *a) { + const BIGNUM *order = &group->order.N; + bn_to_montgomery_small(r->words, a->words, order->width, &group->order); +} + +void bssl::ec_scalar_from_montgomery(const EC_GROUP *group, EC_SCALAR *r, + const EC_SCALAR *a) { + const BIGNUM *order = &group->order.N; + 
bn_from_montgomery_small(r->words, order->width, a->words, order->width, + &group->order); +} + +void bssl::ec_scalar_mul_montgomery(const EC_GROUP *group, EC_SCALAR *r, + const EC_SCALAR *a, const EC_SCALAR *b) { + const BIGNUM *order = &group->order.N; + bn_mod_mul_montgomery_small(r->words, a->words, b->words, order->width, + &group->order); +} + +void bssl::ec_simple_scalar_inv0_montgomery(const EC_GROUP *group, EC_SCALAR *r, + const EC_SCALAR *a) { + const BIGNUM *order = &group->order.N; + bn_mod_inverse0_prime_mont_small(r->words, a->words, order->width, + &group->order); +} + +int bssl::ec_simple_scalar_to_montgomery_inv_vartime(const EC_GROUP *group, + EC_SCALAR *r, + const EC_SCALAR *a) { + if (ec_scalar_is_zero(group, a)) { + return 0; + } + + // This implementation (in fact) runs in constant time, + // even though for this interface it is not mandatory. + + // r = a^-1 in the Montgomery domain. This is + // |ec_scalar_to_montgomery| followed by |ec_scalar_inv0_montgomery|, but + // |ec_scalar_inv0_montgomery| followed by |ec_scalar_from_montgomery| is + // equivalent and slightly more efficient. + ec_scalar_inv0_montgomery(group, r, a); + ec_scalar_from_montgomery(group, r, r); + return 1; +} + +void bssl::ec_scalar_inv0_montgomery(const EC_GROUP *group, EC_SCALAR *r, + const EC_SCALAR *a) { + group->meth->scalar_inv0_montgomery(group, r, a); +} + +int bssl::ec_scalar_to_montgomery_inv_vartime(const EC_GROUP *group, + EC_SCALAR *r, + const EC_SCALAR *a) { + return group->meth->scalar_to_montgomery_inv_vartime(group, r, a); +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/ec/simple.c b/third_party/boringssl/src/crypto/fipsmodule/ec/simple.c deleted file mode 100644 index 58d8121b..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/ec/simple.c +++ /dev/null @@ -1,352 +0,0 @@ -/* Originally written by Bodo Moeller for the OpenSSL project. 
- * ==================================================================== - * Copyright (c) 1998-2005 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). - * - */ -/* ==================================================================== - * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED. - * - * Portions of the attached software ("Contribution") are developed by - * SUN MICROSYSTEMS, INC., and are contributed to the OpenSSL project. - * - * The Contribution is licensed pursuant to the OpenSSL open source - * license provided above. - * - * The elliptic curve binary polynomial software is originally written by - * Sheueling Chang Shantz and Douglas Stebila of Sun Microsystems - * Laboratories. */ - -#include - -#include - -#include -#include -#include - -#include "internal.h" -#include "../../internal.h" - - -// Most method functions in this file are designed to work with non-trivial -// representations of field elements if necessary (see ecp_mont.c): while -// standard modular addition and subtraction are used, the field_mul and -// field_sqr methods will be used for multiplication, and field_encode and -// field_decode (if defined) will be used for converting between -// representations. -// -// Functions here specifically assume that if a non-trivial representation is -// used, it is a Montgomery representation (i.e. 
'encoding' means multiplying -// by some factor R). - -int ec_GFp_simple_group_init(EC_GROUP *group) { - BN_init(&group->field); - group->a_is_minus3 = 0; - return 1; -} - -void ec_GFp_simple_group_finish(EC_GROUP *group) { - BN_free(&group->field); -} - -int ec_GFp_simple_group_set_curve(EC_GROUP *group, const BIGNUM *p, - const BIGNUM *a, const BIGNUM *b, - BN_CTX *ctx) { - // p must be a prime > 3 - if (BN_num_bits(p) <= 2 || !BN_is_odd(p)) { - OPENSSL_PUT_ERROR(EC, EC_R_INVALID_FIELD); - return 0; - } - - int ret = 0; - BN_CTX_start(ctx); - BIGNUM *tmp = BN_CTX_get(ctx); - if (tmp == NULL) { - goto err; - } - - // group->field - if (!BN_copy(&group->field, p)) { - goto err; - } - BN_set_negative(&group->field, 0); - // Store the field in minimal form, so it can be used with |BN_ULONG| arrays. - bn_set_minimal_width(&group->field); - - if (!ec_bignum_to_felem(group, &group->a, a) || - !ec_bignum_to_felem(group, &group->b, b) || - !ec_bignum_to_felem(group, &group->one, BN_value_one())) { - goto err; - } - - // group->a_is_minus3 - if (!BN_copy(tmp, a) || - !BN_add_word(tmp, 3)) { - goto err; - } - group->a_is_minus3 = (0 == BN_cmp(tmp, &group->field)); - - ret = 1; - -err: - BN_CTX_end(ctx); - return ret; -} - -int ec_GFp_simple_group_get_curve(const EC_GROUP *group, BIGNUM *p, BIGNUM *a, - BIGNUM *b) { - if ((p != NULL && !BN_copy(p, &group->field)) || - (a != NULL && !ec_felem_to_bignum(group, a, &group->a)) || - (b != NULL && !ec_felem_to_bignum(group, b, &group->b))) { - return 0; - } - return 1; -} - -void ec_GFp_simple_point_init(EC_RAW_POINT *point) { - OPENSSL_memset(&point->X, 0, sizeof(EC_FELEM)); - OPENSSL_memset(&point->Y, 0, sizeof(EC_FELEM)); - OPENSSL_memset(&point->Z, 0, sizeof(EC_FELEM)); -} - -void ec_GFp_simple_point_copy(EC_RAW_POINT *dest, const EC_RAW_POINT *src) { - OPENSSL_memcpy(&dest->X, &src->X, sizeof(EC_FELEM)); - OPENSSL_memcpy(&dest->Y, &src->Y, sizeof(EC_FELEM)); - OPENSSL_memcpy(&dest->Z, &src->Z, sizeof(EC_FELEM)); -} - -void 
ec_GFp_simple_point_set_to_infinity(const EC_GROUP *group, - EC_RAW_POINT *point) { - // Although it is strictly only necessary to zero Z, we zero the entire point - // in case |point| was stack-allocated and yet to be initialized. - ec_GFp_simple_point_init(point); -} - -void ec_GFp_simple_invert(const EC_GROUP *group, EC_RAW_POINT *point) { - ec_felem_neg(group, &point->Y, &point->Y); -} - -int ec_GFp_simple_is_at_infinity(const EC_GROUP *group, - const EC_RAW_POINT *point) { - return ec_felem_non_zero_mask(group, &point->Z) == 0; -} - -int ec_GFp_simple_is_on_curve(const EC_GROUP *group, - const EC_RAW_POINT *point) { - // We have a curve defined by a Weierstrass equation - // y^2 = x^3 + a*x + b. - // The point to consider is given in Jacobian projective coordinates - // where (X, Y, Z) represents (x, y) = (X/Z^2, Y/Z^3). - // Substituting this and multiplying by Z^6 transforms the above equation - // into - // Y^2 = X^3 + a*X*Z^4 + b*Z^6. - // To test this, we add up the right-hand side in 'rh'. - // - // This function may be used when double-checking the secret result of a point - // multiplication, so we proceed in constant-time. 
- - void (*const felem_mul)(const EC_GROUP *, EC_FELEM *r, const EC_FELEM *a, - const EC_FELEM *b) = group->meth->felem_mul; - void (*const felem_sqr)(const EC_GROUP *, EC_FELEM *r, const EC_FELEM *a) = - group->meth->felem_sqr; - - // rh := X^2 - EC_FELEM rh; - felem_sqr(group, &rh, &point->X); - - EC_FELEM tmp, Z4, Z6; - felem_sqr(group, &tmp, &point->Z); - felem_sqr(group, &Z4, &tmp); - felem_mul(group, &Z6, &Z4, &tmp); - - // rh := rh + a*Z^4 - if (group->a_is_minus3) { - ec_felem_add(group, &tmp, &Z4, &Z4); - ec_felem_add(group, &tmp, &tmp, &Z4); - ec_felem_sub(group, &rh, &rh, &tmp); - } else { - felem_mul(group, &tmp, &Z4, &group->a); - ec_felem_add(group, &rh, &rh, &tmp); - } - - // rh := (rh + a*Z^4)*X - felem_mul(group, &rh, &rh, &point->X); - - // rh := rh + b*Z^6 - felem_mul(group, &tmp, &group->b, &Z6); - ec_felem_add(group, &rh, &rh, &tmp); - - // 'lh' := Y^2 - felem_sqr(group, &tmp, &point->Y); - - ec_felem_sub(group, &tmp, &tmp, &rh); - BN_ULONG not_equal = ec_felem_non_zero_mask(group, &tmp); - - // If Z = 0, the point is infinity, which is always on the curve. - BN_ULONG not_infinity = ec_felem_non_zero_mask(group, &point->Z); - - return 1 & ~(not_infinity & not_equal); -} - -int ec_GFp_simple_points_equal(const EC_GROUP *group, const EC_RAW_POINT *a, - const EC_RAW_POINT *b) { - // This function is implemented in constant-time for two reasons. First, - // although EC points are usually public, their Jacobian Z coordinates may be - // secret, or at least are not obviously public. Second, more complex - // protocols will sometimes manipulate secret points. - // - // This does mean that we pay a 6M+2S Jacobian comparison when comparing two - // publicly affine points costs no field operations at all. If needed, we can - // restore this optimization by keeping better track of affine vs. Jacobian - // forms. See https://crbug.com/boringssl/326. 
- - // If neither |a| or |b| is infinity, we have to decide whether - // (X_a/Z_a^2, Y_a/Z_a^3) = (X_b/Z_b^2, Y_b/Z_b^3), - // or equivalently, whether - // (X_a*Z_b^2, Y_a*Z_b^3) = (X_b*Z_a^2, Y_b*Z_a^3). - - void (*const felem_mul)(const EC_GROUP *, EC_FELEM *r, const EC_FELEM *a, - const EC_FELEM *b) = group->meth->felem_mul; - void (*const felem_sqr)(const EC_GROUP *, EC_FELEM *r, const EC_FELEM *a) = - group->meth->felem_sqr; - - EC_FELEM tmp1, tmp2, Za23, Zb23; - felem_sqr(group, &Zb23, &b->Z); // Zb23 = Z_b^2 - felem_mul(group, &tmp1, &a->X, &Zb23); // tmp1 = X_a * Z_b^2 - felem_sqr(group, &Za23, &a->Z); // Za23 = Z_a^2 - felem_mul(group, &tmp2, &b->X, &Za23); // tmp2 = X_b * Z_a^2 - ec_felem_sub(group, &tmp1, &tmp1, &tmp2); - const BN_ULONG x_not_equal = ec_felem_non_zero_mask(group, &tmp1); - - felem_mul(group, &Zb23, &Zb23, &b->Z); // Zb23 = Z_b^3 - felem_mul(group, &tmp1, &a->Y, &Zb23); // tmp1 = Y_a * Z_b^3 - felem_mul(group, &Za23, &Za23, &a->Z); // Za23 = Z_a^3 - felem_mul(group, &tmp2, &b->Y, &Za23); // tmp2 = Y_b * Z_a^3 - ec_felem_sub(group, &tmp1, &tmp1, &tmp2); - const BN_ULONG y_not_equal = ec_felem_non_zero_mask(group, &tmp1); - const BN_ULONG x_and_y_equal = ~(x_not_equal | y_not_equal); - - const BN_ULONG a_not_infinity = ec_felem_non_zero_mask(group, &a->Z); - const BN_ULONG b_not_infinity = ec_felem_non_zero_mask(group, &b->Z); - const BN_ULONG a_and_b_infinity = ~(a_not_infinity | b_not_infinity); - - const BN_ULONG equal = - a_and_b_infinity | (a_not_infinity & b_not_infinity & x_and_y_equal); - return equal & 1; -} - -int ec_affine_jacobian_equal(const EC_GROUP *group, const EC_AFFINE *a, - const EC_RAW_POINT *b) { - // If |b| is not infinity, we have to decide whether - // (X_a, Y_a) = (X_b/Z_b^2, Y_b/Z_b^3), - // or equivalently, whether - // (X_a*Z_b^2, Y_a*Z_b^3) = (X_b, Y_b). 
- - void (*const felem_mul)(const EC_GROUP *, EC_FELEM *r, const EC_FELEM *a, - const EC_FELEM *b) = group->meth->felem_mul; - void (*const felem_sqr)(const EC_GROUP *, EC_FELEM *r, const EC_FELEM *a) = - group->meth->felem_sqr; - - EC_FELEM tmp, Zb2; - felem_sqr(group, &Zb2, &b->Z); // Zb2 = Z_b^2 - felem_mul(group, &tmp, &a->X, &Zb2); // tmp = X_a * Z_b^2 - ec_felem_sub(group, &tmp, &tmp, &b->X); - const BN_ULONG x_not_equal = ec_felem_non_zero_mask(group, &tmp); - - felem_mul(group, &tmp, &a->Y, &Zb2); // tmp = Y_a * Z_b^2 - felem_mul(group, &tmp, &tmp, &b->Z); // tmp = Y_a * Z_b^3 - ec_felem_sub(group, &tmp, &tmp, &b->Y); - const BN_ULONG y_not_equal = ec_felem_non_zero_mask(group, &tmp); - const BN_ULONG x_and_y_equal = ~(x_not_equal | y_not_equal); - - const BN_ULONG b_not_infinity = ec_felem_non_zero_mask(group, &b->Z); - - const BN_ULONG equal = b_not_infinity & x_and_y_equal; - return equal & 1; -} - -int ec_GFp_simple_cmp_x_coordinate(const EC_GROUP *group, const EC_RAW_POINT *p, - const EC_SCALAR *r) { - if (ec_GFp_simple_is_at_infinity(group, p)) { - // |ec_get_x_coordinate_as_scalar| will check this internally, but this way - // we do not push to the error queue. 
- return 0; - } - - EC_SCALAR x; - return ec_get_x_coordinate_as_scalar(group, &x, p) && - ec_scalar_equal_vartime(group, &x, r); -} - -void ec_GFp_simple_felem_to_bytes(const EC_GROUP *group, uint8_t *out, - size_t *out_len, const EC_FELEM *in) { - size_t len = BN_num_bytes(&group->field); - bn_words_to_big_endian(out, len, in->words, group->field.width); - *out_len = len; -} - -int ec_GFp_simple_felem_from_bytes(const EC_GROUP *group, EC_FELEM *out, - const uint8_t *in, size_t len) { - if (len != BN_num_bytes(&group->field)) { - OPENSSL_PUT_ERROR(EC, EC_R_DECODE_ERROR); - return 0; - } - - bn_big_endian_to_words(out->words, group->field.width, in, len); - - if (!bn_less_than_words(out->words, group->field.d, group->field.width)) { - OPENSSL_PUT_ERROR(EC, EC_R_DECODE_ERROR); - return 0; - } - - return 1; -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/ec/simple.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/ec/simple.cc.inc new file mode 100644 index 00000000..bd3dd1e4 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/ec/simple.cc.inc @@ -0,0 +1,247 @@ +// Copyright 2001-2016 The OpenSSL Project Authors. All Rights Reserved. +// Copyright (c) 2002, Oracle and/or its affiliates. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include + +#include +#include +#include + +#include "../../internal.h" +#include "internal.h" + + +using namespace bssl; + +// Most method functions in this file are designed to work with non-trivial +// representations of field elements if necessary (see ec_montgomery.cc.inc): +// while standard modular addition and subtraction are used, the field_mul and +// field_sqr methods will be used for multiplication, and field_encode and +// field_decode (if defined) will be used for converting between +// representations. +// +// Functions here specifically assume that if a non-trivial representation is +// used, it is a Montgomery representation (i.e. 'encoding' means multiplying +// by some factor R). +// +// TODO(crbug.com/505908440): ec_montgomery.cc.inc is now the only field element +// representation. Fold these files together. + +int bssl::ec_GFp_simple_group_set_curve(EC_GROUP *group, const BIGNUM *p, + const BIGNUM *a, const BIGNUM *b, + BN_CTX *ctx) { + // p must be a prime > 3 + if (BN_num_bits(p) <= 2 || !BN_is_odd(p)) { + OPENSSL_PUT_ERROR(EC, EC_R_INVALID_FIELD); + return 0; + } + + BN_CTXScope scope(ctx); + BIGNUM *tmp = BN_CTX_get(ctx); + if (tmp == nullptr) { + return 0; + } + + if (!BN_MONT_CTX_set(&group->field, p, ctx) || + !ec_bignum_to_felem(group, &group->a, a) || + !ec_bignum_to_felem(group, &group->b, b) || + // Reuse Z from the generator to cache the value one. 
+ !ec_bignum_to_felem(group, &group->generator.raw.Z, BN_value_one())) { + return 0; + } + + // group->a_is_minus3 + if (!BN_copy(tmp, a) || + !BN_add_word(tmp, 3)) { + return 0; + } + group->a_is_minus3 = (0 == BN_cmp(tmp, &group->field.N)); + + return 1; +} + +int bssl::ec_GFp_simple_group_get_curve(const EC_GROUP *group, BIGNUM *p, + BIGNUM *a, BIGNUM *b) { + if ((p != nullptr && !BN_copy(p, &group->field.N)) || + (a != nullptr && !ec_felem_to_bignum(group, a, &group->a)) || + (b != nullptr && !ec_felem_to_bignum(group, b, &group->b))) { + return 0; + } + return 1; +} + +void bssl::ec_GFp_simple_point_init(EC_JACOBIAN *point) { + OPENSSL_memset(&point->X, 0, sizeof(EC_FELEM)); + OPENSSL_memset(&point->Y, 0, sizeof(EC_FELEM)); + OPENSSL_memset(&point->Z, 0, sizeof(EC_FELEM)); +} + +void bssl::ec_GFp_simple_point_copy(EC_JACOBIAN *dest, const EC_JACOBIAN *src) { + OPENSSL_memcpy(&dest->X, &src->X, sizeof(EC_FELEM)); + OPENSSL_memcpy(&dest->Y, &src->Y, sizeof(EC_FELEM)); + OPENSSL_memcpy(&dest->Z, &src->Z, sizeof(EC_FELEM)); +} + +void bssl::ec_GFp_simple_point_set_to_infinity(const EC_GROUP *group, + EC_JACOBIAN *point) { + // Although it is strictly only necessary to zero Z, we zero the entire point + // in case |point| was stack-allocated and yet to be initialized. + ec_GFp_simple_point_init(point); +} + +void bssl::ec_GFp_simple_invert(const EC_GROUP *group, EC_JACOBIAN *point) { + ec_felem_neg(group, &point->Y, &point->Y); +} + +int bssl::ec_GFp_simple_is_at_infinity(const EC_GROUP *group, + const EC_JACOBIAN *point) { + return ec_felem_non_zero_mask(group, &point->Z) == 0; +} + +int bssl::ec_GFp_simple_is_on_curve(const EC_GROUP *group, + const EC_JACOBIAN *point) { + // We have a curve defined by a Weierstrass equation + // y^2 = x^3 + a*x + b. + // The point to consider is given in Jacobian projective coordinates + // where (X, Y, Z) represents (x, y) = (X/Z^2, Y/Z^3). 
+ // Substituting this and multiplying by Z^6 transforms the above equation + // into + // Y^2 = X^3 + a*X*Z^4 + b*Z^6. + // To test this, we add up the right-hand side in 'rh'. + // + // This function may be used when double-checking the secret result of a point + // multiplication, so we proceed in constant-time. + + // rh := X^2 + EC_FELEM rh; + ec_felem_sqr(group, &rh, &point->X); + + EC_FELEM tmp, Z4, Z6; + ec_felem_sqr(group, &tmp, &point->Z); + ec_felem_sqr(group, &Z4, &tmp); + ec_felem_mul(group, &Z6, &Z4, &tmp); + + // rh := rh + a*Z^4 + if (group->a_is_minus3) { + ec_felem_add(group, &tmp, &Z4, &Z4); + ec_felem_add(group, &tmp, &tmp, &Z4); + ec_felem_sub(group, &rh, &rh, &tmp); + } else { + ec_felem_mul(group, &tmp, &Z4, &group->a); + ec_felem_add(group, &rh, &rh, &tmp); + } + + // rh := (rh + a*Z^4)*X + ec_felem_mul(group, &rh, &rh, &point->X); + + // rh := rh + b*Z^6 + ec_felem_mul(group, &tmp, &group->b, &Z6); + ec_felem_add(group, &rh, &rh, &tmp); + + // 'lh' := Y^2 + ec_felem_sqr(group, &tmp, &point->Y); + + ec_felem_sub(group, &tmp, &tmp, &rh); + BN_ULONG not_equal = ec_felem_non_zero_mask(group, &tmp); + + // If Z = 0, the point is infinity, which is always on the curve. + BN_ULONG not_infinity = ec_felem_non_zero_mask(group, &point->Z); + + return 1 & ~(not_infinity & not_equal); +} + +int bssl::ec_GFp_simple_points_equal(const EC_GROUP *group, + const EC_JACOBIAN *a, + const EC_JACOBIAN *b) { + // This function is implemented in constant-time for two reasons. First, + // although EC points are usually public, their Jacobian Z coordinates may be + // secret, or at least are not obviously public. Second, more complex + // protocols will sometimes manipulate secret points. + // + // This does mean that we pay a 6M+2S Jacobian comparison when comparing two + // publicly affine points costs no field operations at all. If needed, we can + // restore this optimization by keeping better track of affine vs. Jacobian + // forms. 
See https://crbug.com/boringssl/326. + + // If neither |a| or |b| is infinity, we have to decide whether + // (X_a/Z_a^2, Y_a/Z_a^3) = (X_b/Z_b^2, Y_b/Z_b^3), + // or equivalently, whether + // (X_a*Z_b^2, Y_a*Z_b^3) = (X_b*Z_a^2, Y_b*Z_a^3). + + EC_FELEM tmp1, tmp2, Za23, Zb23; + ec_felem_sqr(group, &Zb23, &b->Z); // Zb23 = Z_b^2 + ec_felem_mul(group, &tmp1, &a->X, &Zb23); // tmp1 = X_a * Z_b^2 + ec_felem_sqr(group, &Za23, &a->Z); // Za23 = Z_a^2 + ec_felem_mul(group, &tmp2, &b->X, &Za23); // tmp2 = X_b * Z_a^2 + ec_felem_sub(group, &tmp1, &tmp1, &tmp2); + const BN_ULONG x_not_equal = ec_felem_non_zero_mask(group, &tmp1); + + ec_felem_mul(group, &Zb23, &Zb23, &b->Z); // Zb23 = Z_b^3 + ec_felem_mul(group, &tmp1, &a->Y, &Zb23); // tmp1 = Y_a * Z_b^3 + ec_felem_mul(group, &Za23, &Za23, &a->Z); // Za23 = Z_a^3 + ec_felem_mul(group, &tmp2, &b->Y, &Za23); // tmp2 = Y_b * Z_a^3 + ec_felem_sub(group, &tmp1, &tmp1, &tmp2); + const BN_ULONG y_not_equal = ec_felem_non_zero_mask(group, &tmp1); + const BN_ULONG x_and_y_equal = ~(x_not_equal | y_not_equal); + + const BN_ULONG a_not_infinity = ec_felem_non_zero_mask(group, &a->Z); + const BN_ULONG b_not_infinity = ec_felem_non_zero_mask(group, &b->Z); + const BN_ULONG a_and_b_infinity = ~(a_not_infinity | b_not_infinity); + + const BN_ULONG equal = + a_and_b_infinity | (a_not_infinity & b_not_infinity & x_and_y_equal); + return equal & 1; +} + +int bssl::ec_affine_jacobian_equal(const EC_GROUP *group, const EC_AFFINE *a, + const EC_JACOBIAN *b) { + // If |b| is not infinity, we have to decide whether + // (X_a, Y_a) = (X_b/Z_b^2, Y_b/Z_b^3), + // or equivalently, whether + // (X_a*Z_b^2, Y_a*Z_b^3) = (X_b, Y_b). 
+ + EC_FELEM tmp, Zb2; + ec_felem_sqr(group, &Zb2, &b->Z); // Zb2 = Z_b^2 + ec_felem_mul(group, &tmp, &a->X, &Zb2); // tmp = X_a * Z_b^2 + ec_felem_sub(group, &tmp, &tmp, &b->X); + const BN_ULONG x_not_equal = ec_felem_non_zero_mask(group, &tmp); + + ec_felem_mul(group, &tmp, &a->Y, &Zb2); // tmp = Y_a * Z_b^2 + ec_felem_mul(group, &tmp, &tmp, &b->Z); // tmp = Y_a * Z_b^3 + ec_felem_sub(group, &tmp, &tmp, &b->Y); + const BN_ULONG y_not_equal = ec_felem_non_zero_mask(group, &tmp); + const BN_ULONG x_and_y_equal = ~(x_not_equal | y_not_equal); + + const BN_ULONG b_not_infinity = ec_felem_non_zero_mask(group, &b->Z); + + const BN_ULONG equal = b_not_infinity & x_and_y_equal; + return equal & 1; +} + +int bssl::ec_GFp_simple_cmp_x_coordinate(const EC_GROUP *group, + const EC_JACOBIAN *p, + const EC_SCALAR *r) { + if (ec_GFp_simple_is_at_infinity(group, p)) { + // |ec_get_x_coordinate_as_scalar| will check this internally, but this way + // we do not push to the error queue. + return 0; + } + + EC_SCALAR x; + return ec_get_x_coordinate_as_scalar(group, &x, p) && + ec_scalar_equal_vartime(group, &x, r); +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/ec/simple_mul.c b/third_party/boringssl/src/crypto/fipsmodule/ec/simple_mul.c deleted file mode 100644 index 024155d9..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/ec/simple_mul.c +++ /dev/null @@ -1,269 +0,0 @@ -/* Copyright (c) 2018, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include - -#include - -#include "internal.h" -#include "../bn/internal.h" -#include "../../internal.h" - - -void ec_GFp_mont_mul(const EC_GROUP *group, EC_RAW_POINT *r, - const EC_RAW_POINT *p, const EC_SCALAR *scalar) { - // This is a generic implementation for uncommon curves that not do not - // warrant a tuned one. It uses unsigned digits so that the doubling case in - // |ec_GFp_mont_add| is always unreachable, erring on safety and simplicity. - - // Compute a table of the first 32 multiples of |p| (including infinity). - EC_RAW_POINT precomp[32]; - ec_GFp_simple_point_set_to_infinity(group, &precomp[0]); - ec_GFp_simple_point_copy(&precomp[1], p); - for (size_t j = 2; j < OPENSSL_ARRAY_SIZE(precomp); j++) { - if (j & 1) { - ec_GFp_mont_add(group, &precomp[j], &precomp[1], &precomp[j - 1]); - } else { - ec_GFp_mont_dbl(group, &precomp[j], &precomp[j / 2]); - } - } - - // Divide bits in |scalar| into windows. - unsigned bits = BN_num_bits(&group->order); - int r_is_at_infinity = 1; - for (unsigned i = bits - 1; i < bits; i--) { - if (!r_is_at_infinity) { - ec_GFp_mont_dbl(group, r, r); - } - if (i % 5 == 0) { - // Compute the next window value. - const size_t width = group->order.width; - uint8_t window = bn_is_bit_set_words(scalar->words, width, i + 4) << 4; - window |= bn_is_bit_set_words(scalar->words, width, i + 3) << 3; - window |= bn_is_bit_set_words(scalar->words, width, i + 2) << 2; - window |= bn_is_bit_set_words(scalar->words, width, i + 1) << 1; - window |= bn_is_bit_set_words(scalar->words, width, i); - - // Select the entry in constant-time. 
- EC_RAW_POINT tmp; - OPENSSL_memset(&tmp, 0, sizeof(EC_RAW_POINT)); - for (size_t j = 0; j < OPENSSL_ARRAY_SIZE(precomp); j++) { - BN_ULONG mask = constant_time_eq_w(j, window); - ec_point_select(group, &tmp, mask, &precomp[j], &tmp); - } - - if (r_is_at_infinity) { - ec_GFp_simple_point_copy(r, &tmp); - r_is_at_infinity = 0; - } else { - ec_GFp_mont_add(group, r, r, &tmp); - } - } - } - if (r_is_at_infinity) { - ec_GFp_simple_point_set_to_infinity(group, r); - } -} - -void ec_GFp_mont_mul_base(const EC_GROUP *group, EC_RAW_POINT *r, - const EC_SCALAR *scalar) { - ec_GFp_mont_mul(group, r, &group->generator->raw, scalar); -} - -static void ec_GFp_mont_batch_precomp(const EC_GROUP *group, EC_RAW_POINT *out, - size_t num, const EC_RAW_POINT *p) { - assert(num > 1); - ec_GFp_simple_point_set_to_infinity(group, &out[0]); - ec_GFp_simple_point_copy(&out[1], p); - for (size_t j = 2; j < num; j++) { - if (j & 1) { - ec_GFp_mont_add(group, &out[j], &out[1], &out[j - 1]); - } else { - ec_GFp_mont_dbl(group, &out[j], &out[j / 2]); - } - } -} - -static void ec_GFp_mont_batch_get_window(const EC_GROUP *group, - EC_RAW_POINT *out, - const EC_RAW_POINT precomp[17], - const EC_SCALAR *scalar, unsigned i) { - const size_t width = group->order.width; - uint8_t window = bn_is_bit_set_words(scalar->words, width, i + 4) << 5; - window |= bn_is_bit_set_words(scalar->words, width, i + 3) << 4; - window |= bn_is_bit_set_words(scalar->words, width, i + 2) << 3; - window |= bn_is_bit_set_words(scalar->words, width, i + 1) << 2; - window |= bn_is_bit_set_words(scalar->words, width, i) << 1; - if (i > 0) { - window |= bn_is_bit_set_words(scalar->words, width, i - 1); - } - crypto_word_t sign, digit; - ec_GFp_nistp_recode_scalar_bits(&sign, &digit, window); - - // Select the entry in constant-time. 
- OPENSSL_memset(out, 0, sizeof(EC_RAW_POINT)); - for (size_t j = 0; j < 17; j++) { - BN_ULONG mask = constant_time_eq_w(j, digit); - ec_point_select(group, out, mask, &precomp[j], out); - } - - // Negate if necessary. - EC_FELEM neg_Y; - ec_felem_neg(group, &neg_Y, &out->Y); - crypto_word_t sign_mask = sign; - sign_mask = 0u - sign_mask; - ec_felem_select(group, &out->Y, sign_mask, &neg_Y, &out->Y); -} - -void ec_GFp_mont_mul_batch(const EC_GROUP *group, EC_RAW_POINT *r, - const EC_RAW_POINT *p0, const EC_SCALAR *scalar0, - const EC_RAW_POINT *p1, const EC_SCALAR *scalar1, - const EC_RAW_POINT *p2, const EC_SCALAR *scalar2) { - EC_RAW_POINT precomp[3][17]; - ec_GFp_mont_batch_precomp(group, precomp[0], 17, p0); - ec_GFp_mont_batch_precomp(group, precomp[1], 17, p1); - if (p2 != NULL) { - ec_GFp_mont_batch_precomp(group, precomp[2], 17, p2); - } - - // Divide bits in |scalar| into windows. - unsigned bits = BN_num_bits(&group->order); - int r_is_at_infinity = 1; - for (unsigned i = bits; i <= bits; i--) { - if (!r_is_at_infinity) { - ec_GFp_mont_dbl(group, r, r); - } - if (i % 5 == 0) { - EC_RAW_POINT tmp; - ec_GFp_mont_batch_get_window(group, &tmp, precomp[0], scalar0, i); - if (r_is_at_infinity) { - ec_GFp_simple_point_copy(r, &tmp); - r_is_at_infinity = 0; - } else { - ec_GFp_mont_add(group, r, r, &tmp); - } - - ec_GFp_mont_batch_get_window(group, &tmp, precomp[1], scalar1, i); - ec_GFp_mont_add(group, r, r, &tmp); - - if (p2 != NULL) { - ec_GFp_mont_batch_get_window(group, &tmp, precomp[2], scalar2, i); - ec_GFp_mont_add(group, r, r, &tmp); - } - } - } - if (r_is_at_infinity) { - ec_GFp_simple_point_set_to_infinity(group, r); - } -} - -static unsigned ec_GFp_mont_comb_stride(const EC_GROUP *group) { - return (BN_num_bits(&group->field) + EC_MONT_PRECOMP_COMB_SIZE - 1) / - EC_MONT_PRECOMP_COMB_SIZE; -} - -int ec_GFp_mont_init_precomp(const EC_GROUP *group, EC_PRECOMP *out, - const EC_RAW_POINT *p) { - // comb[i - 1] stores the ith element of the comb. 
That is, if i is - // b4 * 2^4 + b3 * 2^3 + ... + b0 * 2^0, it stores k * |p|, where k is - // b4 * 2^(4*stride) + b3 * 2^(3*stride) + ... + b0 * 2^(0*stride). stride - // here is |ec_GFp_mont_comb_stride|. We store at index i - 1 because the 0th - // comb entry is always infinity. - EC_RAW_POINT comb[(1 << EC_MONT_PRECOMP_COMB_SIZE) - 1]; - unsigned stride = ec_GFp_mont_comb_stride(group); - - // We compute the comb sequentially by the highest set bit. Initially, all - // entries up to 2^0 are filled. - comb[(1 << 0) - 1] = *p; - for (unsigned i = 1; i < EC_MONT_PRECOMP_COMB_SIZE; i++) { - // Compute entry 2^i by doubling the entry for 2^(i-1) |stride| times. - unsigned bit = 1 << i; - ec_GFp_mont_dbl(group, &comb[bit - 1], &comb[bit / 2 - 1]); - for (unsigned j = 1; j < stride; j++) { - ec_GFp_mont_dbl(group, &comb[bit - 1], &comb[bit - 1]); - } - // Compute entries from 2^i + 1 to 2^i + (2^i - 1) by adding entry 2^i to - // a previous entry. - for (unsigned j = 1; j < bit; j++) { - ec_GFp_mont_add(group, &comb[bit + j - 1], &comb[bit - 1], &comb[j - 1]); - } - } - - // Store the comb in affine coordinates to shrink the table. (This reduces - // cache pressure and makes the constant-time selects faster.) - static_assert(OPENSSL_ARRAY_SIZE(comb) == OPENSSL_ARRAY_SIZE(out->comb), - "comb sizes did not match"); - return ec_jacobian_to_affine_batch(group, out->comb, comb, - OPENSSL_ARRAY_SIZE(comb)); -} - -static void ec_GFp_mont_get_comb_window(const EC_GROUP *group, - EC_RAW_POINT *out, - const EC_PRECOMP *precomp, - const EC_SCALAR *scalar, unsigned i) { - const size_t width = group->order.width; - unsigned stride = ec_GFp_mont_comb_stride(group); - // Select the bits corresponding to the comb shifted up by |i|. - unsigned window = 0; - for (unsigned j = 0; j < EC_MONT_PRECOMP_COMB_SIZE; j++) { - window |= bn_is_bit_set_words(scalar->words, width, j * stride + i) - << j; - } - - // Select precomp->comb[window - 1]. 
If |window| is zero, |match| will always - // be zero, which will leave |out| at infinity. - OPENSSL_memset(out, 0, sizeof(EC_RAW_POINT)); - for (unsigned j = 0; j < OPENSSL_ARRAY_SIZE(precomp->comb); j++) { - BN_ULONG match = constant_time_eq_w(window, j + 1); - ec_felem_select(group, &out->X, match, &precomp->comb[j].X, &out->X); - ec_felem_select(group, &out->Y, match, &precomp->comb[j].Y, &out->Y); - } - BN_ULONG is_infinity = constant_time_is_zero_w(window); - ec_felem_select(group, &out->Z, is_infinity, &out->Z, &group->one); -} - -void ec_GFp_mont_mul_precomp(const EC_GROUP *group, EC_RAW_POINT *r, - const EC_PRECOMP *p0, const EC_SCALAR *scalar0, - const EC_PRECOMP *p1, const EC_SCALAR *scalar1, - const EC_PRECOMP *p2, const EC_SCALAR *scalar2) { - unsigned stride = ec_GFp_mont_comb_stride(group); - int r_is_at_infinity = 1; - for (unsigned i = stride - 1; i < stride; i--) { - if (!r_is_at_infinity) { - ec_GFp_mont_dbl(group, r, r); - } - - EC_RAW_POINT tmp; - ec_GFp_mont_get_comb_window(group, &tmp, p0, scalar0, i); - if (r_is_at_infinity) { - ec_GFp_simple_point_copy(r, &tmp); - r_is_at_infinity = 0; - } else { - ec_GFp_mont_add(group, r, r, &tmp); - } - - if (p1 != NULL) { - ec_GFp_mont_get_comb_window(group, &tmp, p1, scalar1, i); - ec_GFp_mont_add(group, r, r, &tmp); - } - - if (p2 != NULL) { - ec_GFp_mont_get_comb_window(group, &tmp, p2, scalar2, i); - ec_GFp_mont_add(group, r, r, &tmp); - } - } - if (r_is_at_infinity) { - ec_GFp_simple_point_set_to_infinity(group, r); - } -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/ec/simple_mul.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/ec/simple_mul.cc.inc new file mode 100644 index 00000000..e4a665f9 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/ec/simple_mul.cc.inc @@ -0,0 +1,274 @@ +// Copyright 2018 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include +#include + +#include "internal.h" +#include "../bn/internal.h" +#include "../../internal.h" + + +using namespace bssl; + +void bssl::ec_GFp_mont_mul(const EC_GROUP *group, EC_JACOBIAN *r, + const EC_JACOBIAN *p, const EC_SCALAR *scalar) { + // This is a generic implementation for uncommon curves that not do not + // warrant a tuned one. It uses unsigned digits so that the doubling case in + // |ec_GFp_mont_add| is always unreachable, erring on safety and simplicity. + + // Compute a table of the first 32 multiples of |p| (including infinity). + EC_JACOBIAN precomp[32]; + ec_GFp_simple_point_set_to_infinity(group, &precomp[0]); + ec_GFp_simple_point_copy(&precomp[1], p); + for (size_t j = 2; j < std::size(precomp); j++) { + if (j & 1) { + ec_GFp_mont_add(group, &precomp[j], &precomp[1], &precomp[j - 1]); + } else { + ec_GFp_mont_dbl(group, &precomp[j], &precomp[j / 2]); + } + } + + // Divide bits in |scalar| into windows. + unsigned bits = EC_GROUP_order_bits(group); + int r_is_at_infinity = 1; + for (unsigned i = bits - 1; i < bits; i--) { + if (!r_is_at_infinity) { + ec_GFp_mont_dbl(group, r, r); + } + if (i % 5 == 0) { + // Compute the next window value. 
+ const size_t width = group->order.N.width; + uint8_t window = bn_is_bit_set_words(scalar->words, width, i + 4) << 4; + window |= bn_is_bit_set_words(scalar->words, width, i + 3) << 3; + window |= bn_is_bit_set_words(scalar->words, width, i + 2) << 2; + window |= bn_is_bit_set_words(scalar->words, width, i + 1) << 1; + window |= bn_is_bit_set_words(scalar->words, width, i); + + // Select the entry in constant-time. + EC_JACOBIAN tmp; + OPENSSL_memset(&tmp, 0, sizeof(EC_JACOBIAN)); + for (size_t j = 0; j < std::size(precomp); j++) { + BN_ULONG mask = constant_time_eq_w(j, window); + ec_point_select(group, &tmp, mask, &precomp[j], &tmp); + } + + if (r_is_at_infinity) { + ec_GFp_simple_point_copy(r, &tmp); + r_is_at_infinity = 0; + } else { + ec_GFp_mont_add(group, r, r, &tmp); + } + } + } + if (r_is_at_infinity) { + ec_GFp_simple_point_set_to_infinity(group, r); + } +} + +void bssl::ec_GFp_mont_mul_base(const EC_GROUP *group, EC_JACOBIAN *r, + const EC_SCALAR *scalar) { + ec_GFp_mont_mul(group, r, &group->generator.raw, scalar); +} + +static void ec_GFp_mont_batch_precomp(const EC_GROUP *group, EC_JACOBIAN *out, + size_t num, const EC_JACOBIAN *p) { + assert(num > 1); + ec_GFp_simple_point_set_to_infinity(group, &out[0]); + ec_GFp_simple_point_copy(&out[1], p); + for (size_t j = 2; j < num; j++) { + if (j & 1) { + ec_GFp_mont_add(group, &out[j], &out[1], &out[j - 1]); + } else { + ec_GFp_mont_dbl(group, &out[j], &out[j / 2]); + } + } +} + +static void ec_GFp_mont_batch_get_window(const EC_GROUP *group, + EC_JACOBIAN *out, + const EC_JACOBIAN precomp[17], + const EC_SCALAR *scalar, unsigned i) { + const size_t width = group->order.N.width; + uint8_t window = bn_is_bit_set_words(scalar->words, width, i + 4) << 5; + window |= bn_is_bit_set_words(scalar->words, width, i + 3) << 4; + window |= bn_is_bit_set_words(scalar->words, width, i + 2) << 3; + window |= bn_is_bit_set_words(scalar->words, width, i + 1) << 2; + window |= bn_is_bit_set_words(scalar->words, width, i) 
<< 1; + if (i > 0) { + window |= bn_is_bit_set_words(scalar->words, width, i - 1); + } + crypto_word_t sign, digit; + ec_GFp_nistp_recode_scalar_bits(&sign, &digit, window); + + // Select the entry in constant-time. + OPENSSL_memset(out, 0, sizeof(EC_JACOBIAN)); + for (size_t j = 0; j < 17; j++) { + BN_ULONG mask = constant_time_eq_w(j, digit); + ec_point_select(group, out, mask, &precomp[j], out); + } + + // Negate if necessary. + EC_FELEM neg_Y; + ec_felem_neg(group, &neg_Y, &out->Y); + crypto_word_t sign_mask = sign; + sign_mask = 0u - sign_mask; + ec_felem_select(group, &out->Y, sign_mask, &neg_Y, &out->Y); +} + +void bssl::ec_GFp_mont_mul_batch( + const EC_GROUP *group, EC_JACOBIAN *r, const EC_JACOBIAN *p0, + const EC_SCALAR *scalar0, const EC_JACOBIAN *p1, const EC_SCALAR *scalar1, + const EC_JACOBIAN *p2, const EC_SCALAR *scalar2) { + EC_JACOBIAN precomp[3][17]; + ec_GFp_mont_batch_precomp(group, precomp[0], 17, p0); + ec_GFp_mont_batch_precomp(group, precomp[1], 17, p1); + if (p2 != nullptr) { + ec_GFp_mont_batch_precomp(group, precomp[2], 17, p2); + } + + // Divide bits in |scalar| into windows. 
+ unsigned bits = EC_GROUP_order_bits(group); + int r_is_at_infinity = 1; + for (unsigned i = bits; i <= bits; i--) { + if (!r_is_at_infinity) { + ec_GFp_mont_dbl(group, r, r); + } + if (i % 5 == 0) { + EC_JACOBIAN tmp; + ec_GFp_mont_batch_get_window(group, &tmp, precomp[0], scalar0, i); + if (r_is_at_infinity) { + ec_GFp_simple_point_copy(r, &tmp); + r_is_at_infinity = 0; + } else { + ec_GFp_mont_add(group, r, r, &tmp); + } + + ec_GFp_mont_batch_get_window(group, &tmp, precomp[1], scalar1, i); + ec_GFp_mont_add(group, r, r, &tmp); + + if (p2 != nullptr) { + ec_GFp_mont_batch_get_window(group, &tmp, precomp[2], scalar2, i); + ec_GFp_mont_add(group, r, r, &tmp); + } + } + } + if (r_is_at_infinity) { + ec_GFp_simple_point_set_to_infinity(group, r); + } +} + +static unsigned ec_GFp_mont_comb_stride(const EC_GROUP *group) { + return (EC_GROUP_get_degree(group) + EC_MONT_PRECOMP_COMB_SIZE - 1) / + EC_MONT_PRECOMP_COMB_SIZE; +} + +int bssl::ec_GFp_mont_init_precomp(const EC_GROUP *group, EC_PRECOMP *out, + const EC_JACOBIAN *p) { + // comb[i - 1] stores the ith element of the comb. That is, if i is + // b4 * 2^4 + b3 * 2^3 + ... + b0 * 2^0, it stores k * |p|, where k is + // b4 * 2^(4*stride) + b3 * 2^(3*stride) + ... + b0 * 2^(0*stride). stride + // here is |ec_GFp_mont_comb_stride|. We store at index i - 1 because the 0th + // comb entry is always infinity. + EC_JACOBIAN comb[(1 << EC_MONT_PRECOMP_COMB_SIZE) - 1]; + unsigned stride = ec_GFp_mont_comb_stride(group); + + // We compute the comb sequentially by the highest set bit. Initially, all + // entries up to 2^0 are filled. + comb[(1 << 0) - 1] = *p; + for (unsigned i = 1; i < EC_MONT_PRECOMP_COMB_SIZE; i++) { + // Compute entry 2^i by doubling the entry for 2^(i-1) |stride| times. 
+ unsigned bit = 1 << i; + ec_GFp_mont_dbl(group, &comb[bit - 1], &comb[bit / 2 - 1]); + for (unsigned j = 1; j < stride; j++) { + ec_GFp_mont_dbl(group, &comb[bit - 1], &comb[bit - 1]); + } + // Compute entries from 2^i + 1 to 2^i + (2^i - 1) by adding entry 2^i to + // a previous entry. + for (unsigned j = 1; j < bit; j++) { + ec_GFp_mont_add(group, &comb[bit + j - 1], &comb[bit - 1], &comb[j - 1]); + } + } + + // Store the comb in affine coordinates to shrink the table. (This reduces + // cache pressure and makes the constant-time selects faster.) + static_assert( + std::extent_v == std::extent_vcomb)>, + "comb sizes did not match"); + return ec_jacobian_to_affine_batch(group, out->comb, comb, std::size(comb)); +} + +static void ec_GFp_mont_get_comb_window(const EC_GROUP *group, + EC_JACOBIAN *out, + const EC_PRECOMP *precomp, + const EC_SCALAR *scalar, unsigned i) { + const size_t width = group->order.N.width; + unsigned stride = ec_GFp_mont_comb_stride(group); + // Select the bits corresponding to the comb shifted up by |i|. + unsigned window = 0; + for (unsigned j = 0; j < EC_MONT_PRECOMP_COMB_SIZE; j++) { + window |= bn_is_bit_set_words(scalar->words, width, j * stride + i) + << j; + } + + // Select precomp->comb[window - 1]. If |window| is zero, |match| will always + // be zero, which will leave |out| at infinity. 
+ OPENSSL_memset(out, 0, sizeof(EC_JACOBIAN)); + for (unsigned j = 0; j < std::size(precomp->comb); j++) { + BN_ULONG match = constant_time_eq_w(window, j + 1); + ec_felem_select(group, &out->X, match, &precomp->comb[j].X, &out->X); + ec_felem_select(group, &out->Y, match, &precomp->comb[j].Y, &out->Y); + } + BN_ULONG is_infinity = constant_time_is_zero_w(window); + ec_felem_select(group, &out->Z, is_infinity, &out->Z, ec_felem_one(group)); +} + +void bssl::ec_GFp_mont_mul_precomp( + const EC_GROUP *group, EC_JACOBIAN *r, const EC_PRECOMP *p0, + const EC_SCALAR *scalar0, const EC_PRECOMP *p1, const EC_SCALAR *scalar1, + const EC_PRECOMP *p2, const EC_SCALAR *scalar2) { + unsigned stride = ec_GFp_mont_comb_stride(group); + int r_is_at_infinity = 1; + for (unsigned i = stride - 1; i < stride; i--) { + if (!r_is_at_infinity) { + ec_GFp_mont_dbl(group, r, r); + } + + EC_JACOBIAN tmp; + ec_GFp_mont_get_comb_window(group, &tmp, p0, scalar0, i); + if (r_is_at_infinity) { + ec_GFp_simple_point_copy(r, &tmp); + r_is_at_infinity = 0; + } else { + ec_GFp_mont_add(group, r, r, &tmp); + } + + if (p1 != nullptr) { + ec_GFp_mont_get_comb_window(group, &tmp, p1, scalar1, i); + ec_GFp_mont_add(group, r, r, &tmp); + } + + if (p2 != nullptr) { + ec_GFp_mont_get_comb_window(group, &tmp, p2, scalar2, i); + ec_GFp_mont_add(group, r, r, &tmp); + } + } + if (r_is_at_infinity) { + ec_GFp_simple_point_set_to_infinity(group, r); + } +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/ec/util.c b/third_party/boringssl/src/crypto/fipsmodule/ec/util.c deleted file mode 100644 index c4323f2f..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/ec/util.c +++ /dev/null @@ -1,255 +0,0 @@ -/* Copyright (c) 2015, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. 
- * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include - -#include - -#include "internal.h" - - -// This function looks at 5+1 scalar bits (5 current, 1 adjacent less -// significant bit), and recodes them into a signed digit for use in fast point -// multiplication: the use of signed rather than unsigned digits means that -// fewer points need to be precomputed, given that point inversion is easy (a -// precomputed point dP makes -dP available as well). -// -// BACKGROUND: -// -// Signed digits for multiplication were introduced by Booth ("A signed binary -// multiplication technique", Quart. Journ. Mech. and Applied Math., vol. IV, -// pt. 2 (1951), pp. 236-240), in that case for multiplication of integers. -// Booth's original encoding did not generally improve the density of nonzero -// digits over the binary representation, and was merely meant to simplify the -// handling of signed factors given in two's complement; but it has since been -// shown to be the basis of various signed-digit representations that do have -// further advantages, including the wNAF, using the following general -// approach: -// -// (1) Given a binary representation -// -// b_k ... b_2 b_1 b_0, -// -// of a nonnegative integer (b_k in {0, 1}), rewrite it in digits 0, 1, -1 -// by using bit-wise subtraction as follows: -// -// b_k b_(k-1) ... b_2 b_1 b_0 -// - b_k ... b_3 b_2 b_1 b_0 -// ----------------------------------------- -// s_(k+1) s_k ... 
s_3 s_2 s_1 s_0 -// -// A left-shift followed by subtraction of the original value yields a new -// representation of the same value, using signed bits s_i = b_(i-1) - b_i. -// This representation from Booth's paper has since appeared in the -// literature under a variety of different names including "reversed binary -// form", "alternating greedy expansion", "mutual opposite form", and -// "sign-alternating {+-1}-representation". -// -// An interesting property is that among the nonzero bits, values 1 and -1 -// strictly alternate. -// -// (2) Various window schemes can be applied to the Booth representation of -// integers: for example, right-to-left sliding windows yield the wNAF -// (a signed-digit encoding independently discovered by various researchers -// in the 1990s), and left-to-right sliding windows yield a left-to-right -// equivalent of the wNAF (independently discovered by various researchers -// around 2004). -// -// To prevent leaking information through side channels in point multiplication, -// we need to recode the given integer into a regular pattern: sliding windows -// as in wNAFs won't do, we need their fixed-window equivalent -- which is a few -// decades older: we'll be using the so-called "modified Booth encoding" due to -// MacSorley ("High-speed arithmetic in binary computers", Proc. IRE, vol. 49 -// (1961), pp. 67-91), in a radix-2^5 setting. That is, we always combine five -// signed bits into a signed digit: -// -// s_(5j + 4) s_(5j + 3) s_(5j + 2) s_(5j + 1) s_(5j) -// -// The sign-alternating property implies that the resulting digit values are -// integers from -16 to 16. -// -// Of course, we don't actually need to compute the signed digits s_i as an -// intermediate step (that's just a nice way to see how this scheme relates -// to the wNAF): a direct computation obtains the recoded digit from the -// six bits b_(5j + 4) ... b_(5j - 1). -// -// This function takes those six bits as an integer (0 .. 
63), writing the -// recoded digit to *sign (0 for positive, 1 for negative) and *digit (absolute -// value, in the range 0 .. 16). Note that this integer essentially provides -// the input bits "shifted to the left" by one position: for example, the input -// to compute the least significant recoded digit, given that there's no bit -// b_-1, has to be b_4 b_3 b_2 b_1 b_0 0. -// -// DOUBLING CASE: -// -// Point addition formulas for short Weierstrass curves are often incomplete. -// Edge cases such as P + P or P + ∞ must be handled separately. This -// complicates constant-time requirements. P + ∞ cannot be avoided (any window -// may be zero) and is handled with constant-time selects. P + P (where P is not -// ∞) usually is not. Instead, windowing strategies are chosen to avoid this -// case. Whether this happens depends on the group order. -// -// Let w be the window width (in this function, w = 5). The non-trivial doubling -// case in single-point scalar multiplication may occur if and only if the -// 2^(w-1) bit of the group order is zero. -// -// Note the above only holds if the scalar is fully reduced and the group order -// is a prime that is much larger than 2^w. It also only holds when windows -// are applied from most significant to least significant, doubling between each -// window. It does not apply to more complex table strategies such as -// |EC_GFp_nistz256_method|. -// -// PROOF: -// -// Let n be the group order. Let l be the number of bits needed to represent n. -// Assume there exists some 0 <= k < n such that signed w-bit windowed -// multiplication hits the doubling case. -// -// Windowed multiplication consists of iterating over groups of s_i (defined -// above based on k's binary representation) from most to least significant. At -// iteration i (for i = ..., 3w, 2w, w, 0, starting from the most significant -// window), we: -// -// 1. Double the accumulator A, w times. Let A_i be the value of A at this -// point. -// -// 2. 
Set A to T_i + A_i, where T_i is a precomputed multiple of P -// corresponding to the window s_(i+w-1) ... s_i. -// -// Let j be the index such that A_j = T_j ≠ ∞. Looking at A_i and T_i as -// multiples of P, define a_i and t_i to be scalar coefficients of A_i and T_i. -// Thus a_j = t_j ≠ 0 (mod n). Note a_i and t_i may not be reduced mod n. t_i is -// the value of the w signed bits s_(i+w-1) ... s_i. a_i is computed as a_i = -// 2^w * (a_(i+w) + t_(i+w)). -// -// t_i is bounded by -2^(w-1) <= t_i <= 2^(w-1). Additionally, we may write it -// in terms of unsigned bits b_i. t_i consists of signed bits s_(i+w-1) ... s_i. -// This is computed as: -// -// b_(i+w-2) b_(i+w-3) ... b_i b_(i-1) -// - b_(i+w-1) b_(i+w-2) ... b_(i+1) b_i -// -------------------------------------------- -// t_i = s_(i+w-1) s_(i+w-2) ... s_(i+1) s_i -// -// Observe that b_(i+w-2) through b_i occur in both terms. Let x be the integer -// represented by that bit string, i.e. 2^(w-2)*b_(i+w-2) + ... + b_i. -// -// t_i = (2*x + b_(i-1)) - (2^(w-1)*b_(i+w-1) + x) -// = x - 2^(w-1)*b_(i+w-1) + b_(i-1) -// -// Or, using C notation for bit operations: -// -// t_i = (k>>i) & ((1<<(w-1)) - 1) - (k>>i) & (1<<(w-1)) + (k>>(i-1)) & 1 -// -// Note b_(i-1) is added in left-shifted by one (or doubled) from its place. -// This is compensated by t_(i-w)'s subtraction term. Thus, a_i may be computed -// by adding b_l b_(l-1) ... b_(i+1) b_i and an extra copy of b_(i-1). In C -// notation, this is: -// -// a_i = (k>>(i+w)) << w + ((k>>(i+w-1)) & 1) << w -// -// Observe that, while t_i may be positive or negative, a_i is bounded by -// 0 <= a_i < n + 2^w. Additionally, a_i can only be zero if b_(i+w-1) and up -// are all zero. (Note this implies a non-trivial P + (-P) is unreachable for -// all groups. That would imply the subsequent a_i is zero, which means all -// terms thus far were zero.) -// -// Returning to our doubling position, we have a_j = t_j (mod n). 
We now -// determine the value of a_j - t_j, which must be divisible by n. Our bounds on -// a_j and t_j imply a_j - t_j is 0 or n. If it is 0, a_j = t_j. However, 2^w -// divides a_j and -2^(w-1) <= t_j <= 2^(w-1), so this can only happen if -// a_j = t_j = 0, which is a trivial doubling. Therefore, a_j - t_j = n. -// -// Now we determine j. Suppose j > 0. w divides j, so j >= w. Then, -// -// n = a_j - t_j = (k>>(j+w)) << w + ((k>>(j+w-1)) & 1) << w - t_j -// <= k/2^j + 2^w - t_j -// < n/2^w + 2^w + 2^(w-1) -// -// n is much larger than 2^w, so this is impossible. Thus, j = 0: only the final -// addition may hit the doubling case. -// -// Finally, we consider bit patterns for n and k. Divide k into k_H + k_M + k_L -// such that k_H is the contribution from b_(l-1) .. b_w, k_M is the -// contribution from b_(w-1), and k_L is the contribution from b_(w-2) ... b_0. -// That is: -// -// - 2^w divides k_H -// - k_M is 0 or 2^(w-1) -// - 0 <= k_L < 2^(w-1) -// -// Divide n into n_H + n_M + n_L similarly. We thus have: -// -// t_0 = (k>>0) & ((1<<(w-1)) - 1) - (k>>0) & (1<<(w-1)) + (k>>(0-1)) & 1 -// = k & ((1<<(w-1)) - 1) - k & (1<<(w-1)) -// = k_L - k_M -// -// a_0 = (k>>(0+w)) << w + ((k>>(0+w-1)) & 1) << w -// = (k>>w) << w + ((k>>(w-1)) & 1) << w -// = k_H + 2*k_M -// -// n = a_0 - t_0 -// n_H + n_M + n_L = (k_H + 2*k_M) - (k_L - k_M) -// = k_H + 3*k_M - k_L -// -// k_H - k_L < k and k < n, so k_H - k_L ≠ n. Therefore k_M is not 0 and must be -// 2^(w-1). Now we consider k_H and n_H. We know k_H <= n_H. Suppose k_H = n_H. -// Then, -// -// n_M + n_L = 3*(2^(w-1)) - k_L -// > 3*(2^(w-1)) - 2^(w-1) -// = 2^w -// -// Contradiction (n_M + n_L is the bottom w bits of n). Thus k_H < n_H. Suppose -// k_H < n_H - 2*2^w. Then, -// -// n_H + n_M + n_L = k_H + 3*(2^(w-1)) - k_L -// < n_H - 2*2^w + 3*(2^(w-1)) - k_L -// n_M + n_L < -2^(w-1) - k_L -// -// Contradiction. Thus, k_H = n_H - 2^w. (Note 2^w divides n_H and k_H.) 
Thus, -// -// n_H + n_M + n_L = k_H + 3*(2^(w-1)) - k_L -// = n_H - 2^w + 3*(2^(w-1)) - k_L -// n_M + n_L = 2^(w-1) - k_L -// <= 2^(w-1) -// -// Equality would mean 2^(w-1) divides n, which is impossible if n is prime. -// Thus n_M + n_L < 2^(w-1), so n_M is zero, proving our condition. -// -// This proof constructs k, so, to show the converse, let k_H = n_H - 2^w, -// k_M = 2^(w-1), k_L = 2^(w-1) - n_L. This will result in a non-trivial point -// doubling in the final addition and is the only such scalar. -// -// COMMON CURVES: -// -// The group orders for common curves end in the following bit patterns: -// -// P-521: ...00001001; w = 4 is okay -// P-384: ...01110011; w = 2, 5, 6, 7 are okay -// P-256: ...01010001; w = 5, 7 are okay -// P-224: ...00111101; w = 3, 4, 5, 6 are okay -void ec_GFp_nistp_recode_scalar_bits(crypto_word_t *sign, crypto_word_t *digit, - crypto_word_t in) { - crypto_word_t s, d; - - s = ~((in >> 5) - 1); /* sets all bits to MSB(in), 'in' seen as - * 6-bit value */ - d = (1 << 6) - in - 1; - d = (d & s) | (in & ~s); - d = (d >> 1) + (d & 1); - - *sign = s & 1; - *digit = d; -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/ec/util.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/ec/util.cc.inc new file mode 100644 index 00000000..5b98ae09 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/ec/util.cc.inc @@ -0,0 +1,258 @@ +// Copyright 2015 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include "internal.h" + + +using namespace bssl; + +// This function looks at 5+1 scalar bits (5 current, 1 adjacent less +// significant bit), and recodes them into a signed digit for use in fast point +// multiplication: the use of signed rather than unsigned digits means that +// fewer points need to be precomputed, given that point inversion is easy (a +// precomputed point dP makes -dP available as well). +// +// BACKGROUND: +// +// Signed digits for multiplication were introduced by Booth ("A signed binary +// multiplication technique", Quart. Journ. Mech. and Applied Math., vol. IV, +// pt. 2 (1951), pp. 236-240), in that case for multiplication of integers. +// Booth's original encoding did not generally improve the density of nonzero +// digits over the binary representation, and was merely meant to simplify the +// handling of signed factors given in two's complement; but it has since been +// shown to be the basis of various signed-digit representations that do have +// further advantages, including the wNAF, using the following general +// approach: +// +// (1) Given a binary representation +// +// b_k ... b_2 b_1 b_0, +// +// of a nonnegative integer (b_k in {0, 1}), rewrite it in digits 0, 1, -1 +// by using bit-wise subtraction as follows: +// +// b_k b_(k-1) ... b_2 b_1 b_0 +// - b_k ... b_3 b_2 b_1 b_0 +// ----------------------------------------- +// s_(k+1) s_k ... s_3 s_2 s_1 s_0 +// +// A left-shift followed by subtraction of the original value yields a new +// representation of the same value, using signed bits s_i = b_(i-1) - b_i. +// This representation from Booth's paper has since appeared in the +// literature under a variety of different names including "reversed binary +// form", "alternating greedy expansion", "mutual opposite form", and +// "sign-alternating {+-1}-representation". 
+// +// An interesting property is that among the nonzero bits, values 1 and -1 +// strictly alternate. +// +// (2) Various window schemes can be applied to the Booth representation of +// integers: for example, right-to-left sliding windows yield the wNAF +// (a signed-digit encoding independently discovered by various researchers +// in the 1990s), and left-to-right sliding windows yield a left-to-right +// equivalent of the wNAF (independently discovered by various researchers +// around 2004). +// +// To prevent leaking information through side channels in point multiplication, +// we need to recode the given integer into a regular pattern: sliding windows +// as in wNAFs won't do, we need their fixed-window equivalent -- which is a few +// decades older: we'll be using the so-called "modified Booth encoding" due to +// MacSorley ("High-speed arithmetic in binary computers", Proc. IRE, vol. 49 +// (1961), pp. 67-91), in a radix-2^5 setting. That is, we always combine five +// signed bits into a signed digit: +// +// s_(5j + 4) s_(5j + 3) s_(5j + 2) s_(5j + 1) s_(5j) +// +// The sign-alternating property implies that the resulting digit values are +// integers from -16 to 16. +// +// Of course, we don't actually need to compute the signed digits s_i as an +// intermediate step (that's just a nice way to see how this scheme relates +// to the wNAF): a direct computation obtains the recoded digit from the +// six bits b_(5j + 4) ... b_(5j - 1). +// +// This function takes those six bits as an integer (0 .. 63), writing the +// recoded digit to *sign (0 for positive, 1 for negative) and *digit (absolute +// value, in the range 0 .. 16). Note that this integer essentially provides +// the input bits "shifted to the left" by one position: for example, the input +// to compute the least significant recoded digit, given that there's no bit +// b_-1, has to be b_4 b_3 b_2 b_1 b_0 0. 
+// +// DOUBLING CASE: +// +// Point addition formulas for short Weierstrass curves are often incomplete. +// Edge cases such as P + P or P + ∞ must be handled separately. This +// complicates constant-time requirements. P + ∞ cannot be avoided (any window +// may be zero) and is handled with constant-time selects. P + P (where P is not +// ∞) usually is not. Instead, windowing strategies are chosen to avoid this +// case. Whether this happens depends on the group order. +// +// Let w be the window width (in this function, w = 5). The non-trivial doubling +// case in single-point scalar multiplication may occur if and only if the +// 2^(w-1) bit of the group order is zero. +// +// Note the above only holds if the scalar is fully reduced and the group order +// is a prime that is much larger than 2^w. It also only holds when windows +// are applied from most significant to least significant, doubling between each +// window. It does not apply to more complex table strategies such as +// |EC_GFp_nistz256_method|. +// +// PROOF: +// +// Let n be the group order. Let l be the number of bits needed to represent n. +// Assume there exists some 0 <= k < n such that signed w-bit windowed +// multiplication hits the doubling case. +// +// Windowed multiplication consists of iterating over groups of s_i (defined +// above based on k's binary representation) from most to least significant. At +// iteration i (for i = ..., 3w, 2w, w, 0, starting from the most significant +// window), we: +// +// 1. Double the accumulator A, w times. Let A_i be the value of A at this +// point. +// +// 2. Set A to T_i + A_i, where T_i is a precomputed multiple of P +// corresponding to the window s_(i+w-1) ... s_i. +// +// Let j be the index such that A_j = T_j ≠ ∞. Looking at A_i and T_i as +// multiples of P, define a_i and t_i to be scalar coefficients of A_i and T_i. +// Thus a_j = t_j ≠ 0 (mod n). Note a_i and t_i may not be reduced mod n. 
t_i is +// the value of the w signed bits s_(i+w-1) ... s_i. a_i is computed as a_i = +// 2^w * (a_(i+w) + t_(i+w)). +// +// t_i is bounded by -2^(w-1) <= t_i <= 2^(w-1). Additionally, we may write it +// in terms of unsigned bits b_i. t_i consists of signed bits s_(i+w-1) ... s_i. +// This is computed as: +// +// b_(i+w-2) b_(i+w-3) ... b_i b_(i-1) +// - b_(i+w-1) b_(i+w-2) ... b_(i+1) b_i +// -------------------------------------------- +// t_i = s_(i+w-1) s_(i+w-2) ... s_(i+1) s_i +// +// Observe that b_(i+w-2) through b_i occur in both terms. Let x be the integer +// represented by that bit string, i.e. 2^(w-2)*b_(i+w-2) + ... + b_i. +// +// t_i = (2*x + b_(i-1)) - (2^(w-1)*b_(i+w-1) + x) +// = x - 2^(w-1)*b_(i+w-1) + b_(i-1) +// +// Or, using C notation for bit operations: +// +// t_i = (k>>i) & ((1<<(w-1)) - 1) - (k>>i) & (1<<(w-1)) + (k>>(i-1)) & 1 +// +// Note b_(i-1) is added in left-shifted by one (or doubled) from its place. +// This is compensated by t_(i-w)'s subtraction term. Thus, a_i may be computed +// by adding b_l b_(l-1) ... b_(i+1) b_i and an extra copy of b_(i-1). In C +// notation, this is: +// +// a_i = (k>>(i+w)) << w + ((k>>(i+w-1)) & 1) << w +// +// Observe that, while t_i may be positive or negative, a_i is bounded by +// 0 <= a_i < n + 2^w. Additionally, a_i can only be zero if b_(i+w-1) and up +// are all zero. (Note this implies a non-trivial P + (-P) is unreachable for +// all groups. That would imply the subsequent a_i is zero, which means all +// terms thus far were zero.) +// +// Returning to our doubling position, we have a_j = t_j (mod n). We now +// determine the value of a_j - t_j, which must be divisible by n. Our bounds on +// a_j and t_j imply a_j - t_j is 0 or n. If it is 0, a_j = t_j. However, 2^w +// divides a_j and -2^(w-1) <= t_j <= 2^(w-1), so this can only happen if +// a_j = t_j = 0, which is a trivial doubling. Therefore, a_j - t_j = n. +// +// Now we determine j. Suppose j > 0. w divides j, so j >= w. 
Then, +// +// n = a_j - t_j = (k>>(j+w)) << w + ((k>>(j+w-1)) & 1) << w - t_j +// <= k/2^j + 2^w - t_j +// < n/2^w + 2^w + 2^(w-1) +// +// n is much larger than 2^w, so this is impossible. Thus, j = 0: only the final +// addition may hit the doubling case. +// +// Finally, we consider bit patterns for n and k. Divide k into k_H + k_M + k_L +// such that k_H is the contribution from b_(l-1) .. b_w, k_M is the +// contribution from b_(w-1), and k_L is the contribution from b_(w-2) ... b_0. +// That is: +// +// - 2^w divides k_H +// - k_M is 0 or 2^(w-1) +// - 0 <= k_L < 2^(w-1) +// +// Divide n into n_H + n_M + n_L similarly. We thus have: +// +// t_0 = (k>>0) & ((1<<(w-1)) - 1) - (k>>0) & (1<<(w-1)) + (k>>(0-1)) & 1 +// = k & ((1<<(w-1)) - 1) - k & (1<<(w-1)) +// = k_L - k_M +// +// a_0 = (k>>(0+w)) << w + ((k>>(0+w-1)) & 1) << w +// = (k>>w) << w + ((k>>(w-1)) & 1) << w +// = k_H + 2*k_M +// +// n = a_0 - t_0 +// n_H + n_M + n_L = (k_H + 2*k_M) - (k_L - k_M) +// = k_H + 3*k_M - k_L +// +// k_H - k_L < k and k < n, so k_H - k_L ≠ n. Therefore k_M is not 0 and must be +// 2^(w-1). Now we consider k_H and n_H. We know k_H <= n_H. Suppose k_H = n_H. +// Then, +// +// n_M + n_L = 3*(2^(w-1)) - k_L +// > 3*(2^(w-1)) - 2^(w-1) +// = 2^w +// +// Contradiction (n_M + n_L is the bottom w bits of n). Thus k_H < n_H. Suppose +// k_H < n_H - 2*2^w. Then, +// +// n_H + n_M + n_L = k_H + 3*(2^(w-1)) - k_L +// < n_H - 2*2^w + 3*(2^(w-1)) - k_L +// n_M + n_L < -2^(w-1) - k_L +// +// Contradiction. Thus, k_H = n_H - 2^w. (Note 2^w divides n_H and k_H.) Thus, +// +// n_H + n_M + n_L = k_H + 3*(2^(w-1)) - k_L +// = n_H - 2^w + 3*(2^(w-1)) - k_L +// n_M + n_L = 2^(w-1) - k_L +// <= 2^(w-1) +// +// Equality would mean 2^(w-1) divides n, which is impossible if n is prime. +// Thus n_M + n_L < 2^(w-1), so n_M is zero, proving our condition. +// +// This proof constructs k, so, to show the converse, let k_H = n_H - 2^w, +// k_M = 2^(w-1), k_L = 2^(w-1) - n_L. 
This will result in a non-trivial point +// doubling in the final addition and is the only such scalar. +// +// COMMON CURVES: +// +// The group orders for common curves end in the following bit patterns: +// +// P-521: ...00001001; w = 4 is okay +// P-384: ...01110011; w = 2, 5, 6, 7 are okay +// P-256: ...01010001; w = 5, 7 are okay +// P-224: ...00111101; w = 3, 4, 5, 6 are okay +void bssl::ec_GFp_nistp_recode_scalar_bits(crypto_word_t *sign, + crypto_word_t *digit, + crypto_word_t in) { + crypto_word_t s, d; + + s = ~((in >> 5) - 1); /* sets all bits to MSB(in), 'in' seen as + * 6-bit value */ + d = (1 << 6) - in - 1; + d = (d & s) | (in & ~s); + d = (d >> 1) + (d & 1); + + *sign = s & 1; + *digit = d; +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/ec/wnaf.c b/third_party/boringssl/src/crypto/fipsmodule/ec/wnaf.c deleted file mode 100644 index 65cc8945..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/ec/wnaf.c +++ /dev/null @@ -1,270 +0,0 @@ -/* Originally written by Bodo Moeller for the OpenSSL project. - * ==================================================================== - * Copyright (c) 1998-2005 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. 
The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). - * - */ -/* ==================================================================== - * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED. - * - * Portions of the attached software ("Contribution") are developed by - * SUN MICROSYSTEMS, INC., and are contributed to the OpenSSL project. 
- * - * The Contribution is licensed pursuant to the OpenSSL open source - * license provided above. - * - * The elliptic curve binary polynomial software is originally written by - * Sheueling Chang Shantz and Douglas Stebila of Sun Microsystems - * Laboratories. */ - -#include - -#include -#include - -#include -#include -#include -#include - -#include "internal.h" -#include "../bn/internal.h" -#include "../../internal.h" - - -// This file implements the wNAF-based interleaving multi-exponentiation method -// at: -// http://link.springer.com/chapter/10.1007%2F3-540-45537-X_13 -// http://www.bmoeller.de/pdf/TI-01-08.multiexp.pdf - -void ec_compute_wNAF(const EC_GROUP *group, int8_t *out, - const EC_SCALAR *scalar, size_t bits, int w) { - // 'int8_t' can represent integers with absolute values less than 2^7. - assert(0 < w && w <= 7); - assert(bits != 0); - int bit = 1 << w; // 2^w, at most 128 - int next_bit = bit << 1; // 2^(w+1), at most 256 - int mask = next_bit - 1; // at most 255 - - int window_val = scalar->words[0] & mask; - for (size_t j = 0; j < bits + 1; j++) { - assert(0 <= window_val && window_val <= next_bit); - int digit = 0; - if (window_val & 1) { - assert(0 < window_val && window_val < next_bit); - if (window_val & bit) { - digit = window_val - next_bit; - // We know -next_bit < digit < 0 and window_val - digit = next_bit. - - // modified wNAF - if (j + w + 1 >= bits) { - // special case for generating modified wNAFs: - // no new bits will be added into window_val, - // so using a positive digit here will decrease - // the total length of the representation - - digit = window_val & (mask >> 1); - // We know 0 < digit < bit and window_val - digit = bit. - } - } else { - digit = window_val; - // We know 0 < digit < bit and window_val - digit = 0. - } - - window_val -= digit; - - // Now window_val is 0 or 2^(w+1) in standard wNAF generation. - // For modified window NAFs, it may also be 2^w. 
- // - // See the comments above for the derivation of each of these bounds. - assert(window_val == 0 || window_val == next_bit || window_val == bit); - assert(-bit < digit && digit < bit); - - // window_val was odd, so digit is also odd. - assert(digit & 1); - } - - out[j] = digit; - - // Incorporate the next bit. Previously, |window_val| <= |next_bit|, so if - // we shift and add at most one copy of |bit|, this will continue to hold - // afterwards. - window_val >>= 1; - window_val += - bit * bn_is_bit_set_words(scalar->words, group->order.width, j + w + 1); - assert(window_val <= next_bit); - } - - // bits + 1 entries should be sufficient to consume all bits. - assert(window_val == 0); -} - -// compute_precomp sets |out[i]| to (2*i+1)*p, for i from 0 to |len|. -static void compute_precomp(const EC_GROUP *group, EC_RAW_POINT *out, - const EC_RAW_POINT *p, size_t len) { - ec_GFp_simple_point_copy(&out[0], p); - EC_RAW_POINT two_p; - ec_GFp_mont_dbl(group, &two_p, p); - for (size_t i = 1; i < len; i++) { - ec_GFp_mont_add(group, &out[i], &out[i - 1], &two_p); - } -} - -static void lookup_precomp(const EC_GROUP *group, EC_RAW_POINT *out, - const EC_RAW_POINT *precomp, int digit) { - if (digit < 0) { - digit = -digit; - ec_GFp_simple_point_copy(out, &precomp[digit >> 1]); - ec_GFp_simple_invert(group, out); - } else { - ec_GFp_simple_point_copy(out, &precomp[digit >> 1]); - } -} - -// EC_WNAF_WINDOW_BITS is the window size to use for |ec_GFp_mont_mul_public|. -#define EC_WNAF_WINDOW_BITS 4 - -// EC_WNAF_TABLE_SIZE is the table size to use for |ec_GFp_mont_mul_public|. -#define EC_WNAF_TABLE_SIZE (1 << (EC_WNAF_WINDOW_BITS - 1)) - -// EC_WNAF_STACK is the number of points worth of data to stack-allocate and -// avoid a malloc. 
-#define EC_WNAF_STACK 3 - -int ec_GFp_mont_mul_public_batch(const EC_GROUP *group, EC_RAW_POINT *r, - const EC_SCALAR *g_scalar, - const EC_RAW_POINT *points, - const EC_SCALAR *scalars, size_t num) { - size_t bits = BN_num_bits(&group->order); - size_t wNAF_len = bits + 1; - - int ret = 0; - int8_t wNAF_stack[EC_WNAF_STACK][EC_MAX_BYTES * 8 + 1]; - int8_t (*wNAF_alloc)[EC_MAX_BYTES * 8 + 1] = NULL; - int8_t (*wNAF)[EC_MAX_BYTES * 8 + 1]; - EC_RAW_POINT precomp_stack[EC_WNAF_STACK][EC_WNAF_TABLE_SIZE]; - EC_RAW_POINT (*precomp_alloc)[EC_WNAF_TABLE_SIZE] = NULL; - EC_RAW_POINT (*precomp)[EC_WNAF_TABLE_SIZE]; - if (num <= EC_WNAF_STACK) { - wNAF = wNAF_stack; - precomp = precomp_stack; - } else { - if (num >= ((size_t)-1) / sizeof(wNAF_alloc[0]) || - num >= ((size_t)-1) / sizeof(precomp_alloc[0])) { - OPENSSL_PUT_ERROR(EC, ERR_R_OVERFLOW); - goto err; - } - wNAF_alloc = OPENSSL_malloc(num * sizeof(wNAF_alloc[0])); - precomp_alloc = OPENSSL_malloc(num * sizeof(precomp_alloc[0])); - if (wNAF_alloc == NULL || precomp_alloc == NULL) { - OPENSSL_PUT_ERROR(EC, ERR_R_MALLOC_FAILURE); - goto err; - } - wNAF = wNAF_alloc; - precomp = precomp_alloc; - } - - int8_t g_wNAF[EC_MAX_BYTES * 8 + 1]; - EC_RAW_POINT g_precomp[EC_WNAF_TABLE_SIZE]; - assert(wNAF_len <= OPENSSL_ARRAY_SIZE(g_wNAF)); - const EC_RAW_POINT *g = &group->generator->raw; - if (g_scalar != NULL) { - ec_compute_wNAF(group, g_wNAF, g_scalar, bits, EC_WNAF_WINDOW_BITS); - compute_precomp(group, g_precomp, g, EC_WNAF_TABLE_SIZE); - } - - for (size_t i = 0; i < num; i++) { - assert(wNAF_len <= OPENSSL_ARRAY_SIZE(wNAF[i])); - ec_compute_wNAF(group, wNAF[i], &scalars[i], bits, EC_WNAF_WINDOW_BITS); - compute_precomp(group, precomp[i], &points[i], EC_WNAF_TABLE_SIZE); - } - - EC_RAW_POINT tmp; - int r_is_at_infinity = 1; - for (size_t k = wNAF_len - 1; k < wNAF_len; k--) { - if (!r_is_at_infinity) { - ec_GFp_mont_dbl(group, r, r); - } - - if (g_scalar != NULL && g_wNAF[k] != 0) { - lookup_precomp(group, &tmp, 
g_precomp, g_wNAF[k]); - if (r_is_at_infinity) { - ec_GFp_simple_point_copy(r, &tmp); - r_is_at_infinity = 0; - } else { - ec_GFp_mont_add(group, r, r, &tmp); - } - } - - for (size_t i = 0; i < num; i++) { - if (wNAF[i][k] != 0) { - lookup_precomp(group, &tmp, precomp[i], wNAF[i][k]); - if (r_is_at_infinity) { - ec_GFp_simple_point_copy(r, &tmp); - r_is_at_infinity = 0; - } else { - ec_GFp_mont_add(group, r, r, &tmp); - } - } - } - } - - if (r_is_at_infinity) { - ec_GFp_simple_point_set_to_infinity(group, r); - } - - ret = 1; - -err: - OPENSSL_free(wNAF_alloc); - OPENSSL_free(precomp_alloc); - return ret; -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/ec/wnaf.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/ec/wnaf.cc.inc new file mode 100644 index 00000000..74def20d --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/ec/wnaf.cc.inc @@ -0,0 +1,224 @@ +// Copyright 2001-2016 The OpenSSL Project Authors. All Rights Reserved. +// Copyright (c) 2002, Oracle and/or its affiliates. All rights reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include + +#include + +#include +#include +#include + +#include "../../internal.h" +#include "../bn/internal.h" +#include "internal.h" + + +using namespace bssl; + +// This file implements the wNAF-based interleaving multi-exponentiation method +// at: +// http://link.springer.com/chapter/10.1007%2F3-540-45537-X_13 +// http://www.bmoeller.de/pdf/TI-01-08.multiexp.pdf + +void bssl::ec_compute_wNAF(const EC_GROUP *group, int8_t *out, + const EC_SCALAR *scalar, size_t bits, int w) { + // 'int8_t' can represent integers with absolute values less than 2^7. + assert(0 < w && w <= 7); + assert(bits != 0); + int bit = 1 << w; // 2^w, at most 128 + int next_bit = bit << 1; // 2^(w+1), at most 256 + int mask = next_bit - 1; // at most 255 + + int window_val = scalar->words[0] & mask; + for (size_t j = 0; j < bits + 1; j++) { + assert(0 <= window_val && window_val <= next_bit); + int digit = 0; + if (window_val & 1) { + assert(0 < window_val && window_val < next_bit); + if (window_val & bit) { + digit = window_val - next_bit; + // We know -next_bit < digit < 0 and window_val - digit = next_bit. + + // modified wNAF + if (j + w + 1 >= bits) { + // special case for generating modified wNAFs: + // no new bits will be added into window_val, + // so using a positive digit here will decrease + // the total length of the representation + + digit = window_val & (mask >> 1); + // We know 0 < digit < bit and window_val - digit = bit. + } + } else { + digit = window_val; + // We know 0 < digit < bit and window_val - digit = 0. + } + + window_val -= digit; + + // Now window_val is 0 or 2^(w+1) in standard wNAF generation. + // For modified window NAFs, it may also be 2^w. + // + // See the comments above for the derivation of each of these bounds. + assert(window_val == 0 || window_val == next_bit || window_val == bit); + assert(-bit < digit && digit < bit); + + // window_val was odd, so digit is also odd. 
+ assert(digit & 1); + } + + out[j] = digit; + + // Incorporate the next bit. Previously, |window_val| <= |next_bit|, so if + // we shift and add at most one copy of |bit|, this will continue to hold + // afterwards. + window_val >>= 1; + window_val += bit * bn_is_bit_set_words(scalar->words, group->order.N.width, + j + w + 1); + assert(window_val <= next_bit); + } + + // bits + 1 entries should be sufficient to consume all bits. + assert(window_val == 0); +} + +// compute_precomp sets |out[i]| to (2*i+1)*p, for i from 0 to |len|. +static void compute_precomp(const EC_GROUP *group, EC_JACOBIAN *out, + const EC_JACOBIAN *p, size_t len) { + ec_GFp_simple_point_copy(&out[0], p); + EC_JACOBIAN two_p; + ec_GFp_mont_dbl(group, &two_p, p); + for (size_t i = 1; i < len; i++) { + ec_GFp_mont_add(group, &out[i], &out[i - 1], &two_p); + } +} + +static void lookup_precomp(const EC_GROUP *group, EC_JACOBIAN *out, + const EC_JACOBIAN *precomp, int digit) { + if (digit < 0) { + digit = -digit; + ec_GFp_simple_point_copy(out, &precomp[digit >> 1]); + ec_GFp_simple_invert(group, out); + } else { + ec_GFp_simple_point_copy(out, &precomp[digit >> 1]); + } +} + +// EC_WNAF_WINDOW_BITS is the window size to use for |ec_GFp_mont_mul_public|. +#define EC_WNAF_WINDOW_BITS 4 + +// EC_WNAF_TABLE_SIZE is the table size to use for |ec_GFp_mont_mul_public|. +#define EC_WNAF_TABLE_SIZE (1 << (EC_WNAF_WINDOW_BITS - 1)) + +// EC_WNAF_STACK is the number of points worth of data to stack-allocate and +// avoid a malloc. +#define EC_WNAF_STACK 3 + +int bssl::ec_GFp_mont_mul_public_batch(const EC_GROUP *group, EC_JACOBIAN *r, + const EC_SCALAR *g_scalar, + const EC_JACOBIAN *points, + const EC_SCALAR *scalars, size_t num) { + size_t bits = EC_GROUP_order_bits(group); + size_t wNAF_len = bits + 1; + + // Stack-allocated space, which will be used if the task is small enough. 
+ int8_t wNAF_stack[EC_WNAF_STACK][EC_MAX_BYTES * 8 + 1]; + EC_JACOBIAN precomp_stack[EC_WNAF_STACK][EC_WNAF_TABLE_SIZE]; + + // Allocated pointers, which will remain NULL unless needed. + EC_JACOBIAN(*precomp_alloc)[EC_WNAF_TABLE_SIZE] = nullptr; + int8_t (*wNAF_alloc)[EC_MAX_BYTES * 8 + 1] = nullptr; + + // These fields point either to the stack or heap buffers of the same name. + int8_t(*wNAF)[EC_MAX_BYTES * 8 + 1]; + EC_JACOBIAN(*precomp)[EC_WNAF_TABLE_SIZE]; + + if (num <= EC_WNAF_STACK) { + wNAF = wNAF_stack; + precomp = precomp_stack; + } else { + wNAF_alloc = reinterpret_cast( + OPENSSL_calloc(num, sizeof(wNAF_alloc[0]))); + if (wNAF_alloc == nullptr) { + return 0; + } + precomp_alloc = reinterpret_cast( + OPENSSL_calloc(num, sizeof(precomp_alloc[0]))); + if (precomp_alloc == nullptr) { + OPENSSL_free(wNAF_alloc); + return 0; + } + + wNAF = wNAF_alloc; + precomp = precomp_alloc; + } + + int8_t g_wNAF[EC_MAX_BYTES * 8 + 1]; + EC_JACOBIAN g_precomp[EC_WNAF_TABLE_SIZE]; + assert(wNAF_len <= std::size(g_wNAF)); + const EC_JACOBIAN *g = &group->generator.raw; + if (g_scalar != nullptr) { + ec_compute_wNAF(group, g_wNAF, g_scalar, bits, EC_WNAF_WINDOW_BITS); + compute_precomp(group, g_precomp, g, EC_WNAF_TABLE_SIZE); + } + + for (size_t i = 0; i < num; i++) { + assert(wNAF_len <= std::size(wNAF[i])); + ec_compute_wNAF(group, wNAF[i], &scalars[i], bits, EC_WNAF_WINDOW_BITS); + compute_precomp(group, precomp[i], &points[i], EC_WNAF_TABLE_SIZE); + } + + EC_JACOBIAN tmp; + int r_is_at_infinity = 1; + for (size_t k = wNAF_len - 1; k < wNAF_len; k--) { + if (!r_is_at_infinity) { + ec_GFp_mont_dbl(group, r, r); + } + + if (g_scalar != nullptr && g_wNAF[k] != 0) { + lookup_precomp(group, &tmp, g_precomp, g_wNAF[k]); + if (r_is_at_infinity) { + ec_GFp_simple_point_copy(r, &tmp); + r_is_at_infinity = 0; + } else { + ec_GFp_mont_add(group, r, r, &tmp); + } + } + + for (size_t i = 0; i < num; i++) { + if (wNAF[i][k] != 0) { + lookup_precomp(group, &tmp, precomp[i], 
wNAF[i][k]); + if (r_is_at_infinity) { + ec_GFp_simple_point_copy(r, &tmp); + r_is_at_infinity = 0; + } else { + ec_GFp_mont_add(group, r, r, &tmp); + } + } + } + } + + if (r_is_at_infinity) { + ec_GFp_simple_point_set_to_infinity(group, r); + } + + OPENSSL_free(wNAF_alloc); + OPENSSL_free(precomp_alloc); + return 1; +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/ecdh/ecdh.c b/third_party/boringssl/src/crypto/fipsmodule/ecdh/ecdh.c deleted file mode 100644 index 25d07023..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/ecdh/ecdh.c +++ /dev/null @@ -1,130 +0,0 @@ -/* ==================================================================== - * Copyright 2002 Sun Microsystems, Inc. ALL RIGHTS RESERVED. - * - * The Elliptic Curve Public-Key Crypto Library (ECC Code) included - * herein is developed by SUN MICROSYSTEMS, INC., and is contributed - * to the OpenSSL project. - * - * The ECC Code is licensed pursuant to the OpenSSL open source - * license provided below. - * - * The ECDH software is originally written by Douglas Stebila of - * Sun Microsystems Laboratories. - * - */ -/* ==================================================================== - * Copyright (c) 2000-2002 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. 
All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * licensing@OpenSSL.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). 
*/ - -#include - -#include - -#include -#include -#include -#include -#include - -#include "../../internal.h" -#include "../ec/internal.h" -#include "../service_indicator/internal.h" - - -int ECDH_compute_key_fips(uint8_t *out, size_t out_len, const EC_POINT *pub_key, - const EC_KEY *priv_key) { - boringssl_ensure_ecc_self_test(); - - if (priv_key->priv_key == NULL) { - OPENSSL_PUT_ERROR(ECDH, ECDH_R_NO_PRIVATE_VALUE); - return 0; - } - const EC_SCALAR *const priv = &priv_key->priv_key->scalar; - const EC_GROUP *const group = EC_KEY_get0_group(priv_key); - if (EC_GROUP_cmp(group, pub_key->group, NULL) != 0) { - OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS); - return 0; - } - - EC_RAW_POINT shared_point; - uint8_t buf[EC_MAX_BYTES]; - size_t buflen; - if (!ec_point_mul_scalar(group, &shared_point, &pub_key->raw, priv) || - !ec_get_x_coordinate_as_bytes(group, buf, &buflen, sizeof(buf), - &shared_point)) { - OPENSSL_PUT_ERROR(ECDH, ECDH_R_POINT_ARITHMETIC_FAILURE); - return 0; - } - - FIPS_service_indicator_lock_state(); - switch (out_len) { - case SHA224_DIGEST_LENGTH: - SHA224(buf, buflen, out); - break; - case SHA256_DIGEST_LENGTH: - SHA256(buf, buflen, out); - break; - case SHA384_DIGEST_LENGTH: - SHA384(buf, buflen, out); - break; - case SHA512_DIGEST_LENGTH: - SHA512(buf, buflen, out); - break; - default: - OPENSSL_PUT_ERROR(ECDH, ECDH_R_UNKNOWN_DIGEST_LENGTH); - FIPS_service_indicator_unlock_state(); - return 0; - } - FIPS_service_indicator_unlock_state(); - - ECDH_verify_service_indicator(priv_key); - return 1; -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/ecdh/ecdh.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/ecdh/ecdh.cc.inc new file mode 100644 index 00000000..81c5482a --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/ecdh/ecdh.cc.inc @@ -0,0 +1,91 @@ +// Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved. +// Copyright (c) 2002, Oracle and/or its affiliates. All rights reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include +#include +#include +#include + +#include "../../internal.h" +#include "../bcm_interface.h" +#include "../ec/internal.h" +#include "../service_indicator/internal.h" + + +using namespace bssl; + +int ECDH_compute_key_fips(uint8_t *out, size_t out_len, const EC_POINT *pub_key, + const EC_KEY *priv_key) { + boringssl_ensure_ecc_self_test(); + + if (FromOpaque(priv_key)->priv_key == nullptr) { + OPENSSL_PUT_ERROR(ECDH, ECDH_R_NO_PRIVATE_VALUE); + return 0; + } + const EC_SCALAR *const priv = &FromOpaque(priv_key)->priv_key->scalar; + const EC_GROUP *const group = EC_KEY_get0_group(priv_key); + if (EC_GROUP_cmp(group, pub_key->group, nullptr) != 0) { + OPENSSL_PUT_ERROR(EC, EC_R_INCOMPATIBLE_OBJECTS); + return 0; + } + + EC_JACOBIAN shared_point; + uint8_t buf[EC_MAX_BYTES]; + size_t buflen; + if (!ec_point_mul_scalar(group, &shared_point, &pub_key->raw, priv) || + !ec_get_x_coordinate_as_bytes(group, buf, &buflen, sizeof(buf), + &shared_point)) { + OPENSSL_PUT_ERROR(ECDH, ECDH_R_POINT_ARITHMETIC_FAILURE); + return 0; + } + + FIPS_service_indicator_lock_state(); + SHA256_CTX ctx; + SHA512_CTX ctx_512; + switch (out_len) { + case SHA224_DIGEST_LENGTH: + BCM_sha224_init(&ctx); + BCM_sha224_update(&ctx, buf, buflen); + BCM_sha224_final(out, &ctx); + break; + case SHA256_DIGEST_LENGTH: + BCM_sha256_init(&ctx); + BCM_sha256_update(&ctx, buf, buflen); + BCM_sha256_final(out, &ctx); + 
break; + case SHA384_DIGEST_LENGTH: + BCM_sha384_init(&ctx_512); + BCM_sha384_update(&ctx_512, buf, buflen); + BCM_sha384_final(out, &ctx_512); + break; + case SHA512_DIGEST_LENGTH: + BCM_sha512_init(&ctx_512); + BCM_sha512_update(&ctx_512, buf, buflen); + BCM_sha512_final(out, &ctx_512); + break; + default: + OPENSSL_PUT_ERROR(ECDH, ECDH_R_UNKNOWN_DIGEST_LENGTH); + FIPS_service_indicator_unlock_state(); + return 0; + } + FIPS_service_indicator_unlock_state(); + + ECDH_verify_service_indicator(priv_key); + return 1; +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/ecdsa/ecdsa.c b/third_party/boringssl/src/crypto/fipsmodule/ecdsa/ecdsa.c deleted file mode 100644 index 95b367f1..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/ecdsa/ecdsa.c +++ /dev/null @@ -1,354 +0,0 @@ -/* ==================================================================== - * Copyright (c) 1998-2005 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@OpenSSL.org. - * - * 5. 
Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). */ - -#include - -#include -#include - -#include -#include -#include -#include - -#include "../../internal.h" -#include "../bn/internal.h" -#include "../ec/internal.h" -#include "../service_indicator/internal.h" -#include "internal.h" - - -// digest_to_scalar interprets |digest_len| bytes from |digest| as a scalar for -// ECDSA. 
-static void digest_to_scalar(const EC_GROUP *group, EC_SCALAR *out, - const uint8_t *digest, size_t digest_len) { - const BIGNUM *order = &group->order; - size_t num_bits = BN_num_bits(order); - // Need to truncate digest if it is too long: first truncate whole bytes. - size_t num_bytes = (num_bits + 7) / 8; - if (digest_len > num_bytes) { - digest_len = num_bytes; - } - bn_big_endian_to_words(out->words, order->width, digest, digest_len); - - // If it is still too long, truncate remaining bits with a shift. - if (8 * digest_len > num_bits) { - bn_rshift_words(out->words, out->words, 8 - (num_bits & 0x7), order->width); - } - - // |out| now has the same bit width as |order|, but this only bounds by - // 2*|order|. Subtract the order if out of range. - // - // Montgomery multiplication accepts the looser bounds, so this isn't strictly - // necessary, but it is a cleaner abstraction and has no performance impact. - BN_ULONG tmp[EC_MAX_WORDS]; - bn_reduce_once_in_place(out->words, 0 /* no carry */, order->d, tmp, - order->width); -} - -ECDSA_SIG *ECDSA_SIG_new(void) { - ECDSA_SIG *sig = OPENSSL_malloc(sizeof(ECDSA_SIG)); - if (sig == NULL) { - return NULL; - } - sig->r = BN_new(); - sig->s = BN_new(); - if (sig->r == NULL || sig->s == NULL) { - ECDSA_SIG_free(sig); - return NULL; - } - return sig; -} - -void ECDSA_SIG_free(ECDSA_SIG *sig) { - if (sig == NULL) { - return; - } - - BN_free(sig->r); - BN_free(sig->s); - OPENSSL_free(sig); -} - -const BIGNUM *ECDSA_SIG_get0_r(const ECDSA_SIG *sig) { - return sig->r; -} - -const BIGNUM *ECDSA_SIG_get0_s(const ECDSA_SIG *sig) { - return sig->s; -} - -void ECDSA_SIG_get0(const ECDSA_SIG *sig, const BIGNUM **out_r, - const BIGNUM **out_s) { - if (out_r != NULL) { - *out_r = sig->r; - } - if (out_s != NULL) { - *out_s = sig->s; - } -} - -int ECDSA_SIG_set0(ECDSA_SIG *sig, BIGNUM *r, BIGNUM *s) { - if (r == NULL || s == NULL) { - return 0; - } - BN_free(sig->r); - BN_free(sig->s); - sig->r = r; - sig->s = s; - return 1; -} - 
-int ecdsa_do_verify_no_self_test(const uint8_t *digest, size_t digest_len, - const ECDSA_SIG *sig, const EC_KEY *eckey) { - const EC_GROUP *group = EC_KEY_get0_group(eckey); - const EC_POINT *pub_key = EC_KEY_get0_public_key(eckey); - if (group == NULL || pub_key == NULL || sig == NULL) { - OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_MISSING_PARAMETERS); - return 0; - } - - EC_SCALAR r, s, u1, u2, s_inv_mont, m; - if (BN_is_zero(sig->r) || - !ec_bignum_to_scalar(group, &r, sig->r) || - BN_is_zero(sig->s) || - !ec_bignum_to_scalar(group, &s, sig->s)) { - OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_BAD_SIGNATURE); - return 0; - } - - // s_inv_mont = s^-1 in the Montgomery domain. - if (!ec_scalar_to_montgomery_inv_vartime(group, &s_inv_mont, &s)) { - OPENSSL_PUT_ERROR(ECDSA, ERR_R_INTERNAL_ERROR); - return 0; - } - - // u1 = m * s^-1 mod order - // u2 = r * s^-1 mod order - // - // |s_inv_mont| is in Montgomery form while |m| and |r| are not, so |u1| and - // |u2| will be taken out of Montgomery form, as desired. - digest_to_scalar(group, &m, digest, digest_len); - ec_scalar_mul_montgomery(group, &u1, &m, &s_inv_mont); - ec_scalar_mul_montgomery(group, &u2, &r, &s_inv_mont); - - EC_RAW_POINT point; - if (!ec_point_mul_scalar_public(group, &point, &u1, &pub_key->raw, &u2)) { - OPENSSL_PUT_ERROR(ECDSA, ERR_R_EC_LIB); - return 0; - } - - if (!ec_cmp_x_coordinate(group, &point, &r)) { - OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_BAD_SIGNATURE); - return 0; - } - - return 1; -} - -int ECDSA_do_verify(const uint8_t *digest, size_t digest_len, - const ECDSA_SIG *sig, const EC_KEY *eckey) { - boringssl_ensure_ecc_self_test(); - - return ecdsa_do_verify_no_self_test(digest, digest_len, sig, eckey); -} - -static ECDSA_SIG *ecdsa_sign_impl(const EC_GROUP *group, int *out_retry, - const EC_SCALAR *priv_key, const EC_SCALAR *k, - const uint8_t *digest, size_t digest_len) { - *out_retry = 0; - - // Check that the size of the group order is FIPS compliant (FIPS 186-4 - // B.5.2). 
- const BIGNUM *order = EC_GROUP_get0_order(group); - if (BN_num_bits(order) < 160) { - OPENSSL_PUT_ERROR(ECDSA, EC_R_INVALID_GROUP_ORDER); - return NULL; - } - - // Compute r, the x-coordinate of k * generator. - EC_RAW_POINT tmp_point; - EC_SCALAR r; - if (!ec_point_mul_scalar_base(group, &tmp_point, k) || - !ec_get_x_coordinate_as_scalar(group, &r, &tmp_point)) { - return NULL; - } - - if (ec_scalar_is_zero(group, &r)) { - *out_retry = 1; - return NULL; - } - - // s = priv_key * r. Note if only one parameter is in the Montgomery domain, - // |ec_scalar_mod_mul_montgomery| will compute the answer in the normal - // domain. - EC_SCALAR s; - ec_scalar_to_montgomery(group, &s, &r); - ec_scalar_mul_montgomery(group, &s, priv_key, &s); - - // s = m + priv_key * r. - EC_SCALAR tmp; - digest_to_scalar(group, &tmp, digest, digest_len); - ec_scalar_add(group, &s, &s, &tmp); - - // s = k^-1 * (m + priv_key * r). First, we compute k^-1 in the Montgomery - // domain. This is |ec_scalar_to_montgomery| followed by - // |ec_scalar_inv0_montgomery|, but |ec_scalar_inv0_montgomery| followed by - // |ec_scalar_from_montgomery| is equivalent and slightly more efficient. - // Then, as above, only one parameter is in the Montgomery domain, so the - // result is in the normal domain. Finally, note k is non-zero (or computing r - // would fail), so the inverse must exist. 
- ec_scalar_inv0_montgomery(group, &tmp, k); // tmp = k^-1 R^2 - ec_scalar_from_montgomery(group, &tmp, &tmp); // tmp = k^-1 R - ec_scalar_mul_montgomery(group, &s, &s, &tmp); - if (ec_scalar_is_zero(group, &s)) { - *out_retry = 1; - return NULL; - } - - ECDSA_SIG *ret = ECDSA_SIG_new(); - if (ret == NULL || // - !bn_set_words(ret->r, r.words, order->width) || - !bn_set_words(ret->s, s.words, order->width)) { - ECDSA_SIG_free(ret); - return NULL; - } - return ret; -} - -ECDSA_SIG *ecdsa_sign_with_nonce_for_known_answer_test(const uint8_t *digest, - size_t digest_len, - const EC_KEY *eckey, - const uint8_t *nonce, - size_t nonce_len) { - if (eckey->ecdsa_meth && eckey->ecdsa_meth->sign) { - OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_NOT_IMPLEMENTED); - return NULL; - } - - const EC_GROUP *group = EC_KEY_get0_group(eckey); - if (group == NULL || eckey->priv_key == NULL) { - OPENSSL_PUT_ERROR(ECDSA, ERR_R_PASSED_NULL_PARAMETER); - return NULL; - } - const EC_SCALAR *priv_key = &eckey->priv_key->scalar; - - EC_SCALAR k; - if (!ec_scalar_from_bytes(group, &k, nonce, nonce_len)) { - return NULL; - } - int retry_ignored; - return ecdsa_sign_impl(group, &retry_ignored, priv_key, &k, digest, - digest_len); -} - -// This function is only exported for testing and is not called in production -// code. 
-ECDSA_SIG *ECDSA_sign_with_nonce_and_leak_private_key_for_testing( - const uint8_t *digest, size_t digest_len, const EC_KEY *eckey, - const uint8_t *nonce, size_t nonce_len) { - boringssl_ensure_ecc_self_test(); - - return ecdsa_sign_with_nonce_for_known_answer_test(digest, digest_len, eckey, - nonce, nonce_len); -} - -ECDSA_SIG *ECDSA_do_sign(const uint8_t *digest, size_t digest_len, - const EC_KEY *eckey) { - boringssl_ensure_ecc_self_test(); - - if (eckey->ecdsa_meth && eckey->ecdsa_meth->sign) { - OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_NOT_IMPLEMENTED); - return NULL; - } - - const EC_GROUP *group = EC_KEY_get0_group(eckey); - if (group == NULL || eckey->priv_key == NULL) { - OPENSSL_PUT_ERROR(ECDSA, ERR_R_PASSED_NULL_PARAMETER); - return NULL; - } - const BIGNUM *order = EC_GROUP_get0_order(group); - const EC_SCALAR *priv_key = &eckey->priv_key->scalar; - - // Pass a SHA512 hash of the private key and digest as additional data - // into the RBG. This is a hardening measure against entropy failure. 
- static_assert(SHA512_DIGEST_LENGTH >= 32, - "additional_data is too large for SHA-512"); - - FIPS_service_indicator_lock_state(); - - SHA512_CTX sha; - uint8_t additional_data[SHA512_DIGEST_LENGTH]; - SHA512_Init(&sha); - SHA512_Update(&sha, priv_key->words, order->width * sizeof(BN_ULONG)); - SHA512_Update(&sha, digest, digest_len); - SHA512_Final(additional_data, &sha); - - ECDSA_SIG *ret = NULL; - for (;;) { - EC_SCALAR k; - if (!ec_random_nonzero_scalar(group, &k, additional_data)) { - ret = NULL; - goto out; - } - - int retry; - ret = ecdsa_sign_impl(group, &retry, priv_key, &k, digest, digest_len); - if (ret != NULL || !retry) { - goto out; - } - } - -out: - FIPS_service_indicator_unlock_state(); - return ret; -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/ecdsa/ecdsa.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/ecdsa/ecdsa.cc.inc new file mode 100644 index 00000000..463fc669 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/ecdsa/ecdsa.cc.inc @@ -0,0 +1,288 @@ +// Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include + +#include +#include +#include + +#include "../../internal.h" +#include "../bcm_interface.h" +#include "../bn/internal.h" +#include "../ec/internal.h" +#include "../service_indicator/internal.h" +#include "internal.h" + + +using namespace bssl; + +// digest_to_scalar interprets |digest_len| bytes from |digest| as a scalar for +// ECDSA. +static void digest_to_scalar(const EC_GROUP *group, EC_SCALAR *out, + const uint8_t *digest, size_t digest_len) { + const BIGNUM *order = EC_GROUP_get0_order(group); + size_t num_bits = BN_num_bits(order); + // Need to truncate digest if it is too long: first truncate whole bytes. + size_t num_bytes = (num_bits + 7) / 8; + if (digest_len > num_bytes) { + digest_len = num_bytes; + } + bn_big_endian_to_words(out->words, order->width, digest, digest_len); + + // If it is still too long, truncate remaining bits with a shift. + if (8 * digest_len > num_bits) { + bn_rshift_words(out->words, out->words, 8 - (num_bits & 0x7), order->width); + } + + // |out| now has the same bit width as |order|, but this only bounds by + // 2*|order|. Subtract the order if out of range. + // + // Montgomery multiplication accepts the looser bounds, so this isn't strictly + // necessary, but it is a cleaner abstraction and has no performance impact. 
+ BN_ULONG tmp[EC_MAX_WORDS]; + bn_reduce_once_in_place(out->words, 0 /* no carry */, order->d, tmp, + order->width); +} + +int bssl::ecdsa_verify_fixed_no_self_test(const uint8_t *digest, + size_t digest_len, const uint8_t *sig, + size_t sig_len, const EC_KEY *eckey) { + const EC_GROUP *group = EC_KEY_get0_group(eckey); + const EC_POINT *pub_key = EC_KEY_get0_public_key(eckey); + if (group == nullptr || pub_key == nullptr || sig == nullptr) { + OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_MISSING_PARAMETERS); + return 0; + } + + size_t scalar_len = BN_num_bytes(EC_GROUP_get0_order(group)); + EC_SCALAR r, s, u1, u2, s_inv_mont, m; + if (sig_len != 2 * scalar_len || + !ec_scalar_from_bytes(group, &r, sig, scalar_len) || + ec_scalar_is_zero(group, &r) || + !ec_scalar_from_bytes(group, &s, sig + scalar_len, scalar_len) || + ec_scalar_is_zero(group, &s)) { + OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_BAD_SIGNATURE); + return 0; + } + + // s_inv_mont = s^-1 in the Montgomery domain. + if (!ec_scalar_to_montgomery_inv_vartime(group, &s_inv_mont, &s)) { + OPENSSL_PUT_ERROR(ECDSA, ERR_R_INTERNAL_ERROR); + return 0; + } + + // u1 = m * s^-1 mod order + // u2 = r * s^-1 mod order + // + // |s_inv_mont| is in Montgomery form while |m| and |r| are not, so |u1| and + // |u2| will be taken out of Montgomery form, as desired. 
+ digest_to_scalar(group, &m, digest, digest_len); + ec_scalar_mul_montgomery(group, &u1, &m, &s_inv_mont); + ec_scalar_mul_montgomery(group, &u2, &r, &s_inv_mont); + + EC_JACOBIAN point; + if (!ec_point_mul_scalar_public(group, &point, &u1, &pub_key->raw, &u2)) { + OPENSSL_PUT_ERROR(ECDSA, ERR_R_EC_LIB); + return 0; + } + + if (!ec_cmp_x_coordinate(group, &point, &r)) { + OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_BAD_SIGNATURE); + return 0; + } + + return 1; +} + +int bssl::ecdsa_verify_fixed(const uint8_t *digest, size_t digest_len, + const uint8_t *sig, size_t sig_len, + const EC_KEY *key) { + boringssl_ensure_ecc_self_test(); + + return ecdsa_verify_fixed_no_self_test(digest, digest_len, sig, sig_len, key); +} + +static int ecdsa_sign_impl(const EC_GROUP *group, int *out_retry, uint8_t *sig, + size_t *out_sig_len, size_t max_sig_len, + const EC_SCALAR *priv_key, const EC_SCALAR *k, + const uint8_t *digest, size_t digest_len) { + *out_retry = 0; + + const BIGNUM *order = EC_GROUP_get0_order(group); + size_t sig_len = 2 * BN_num_bytes(order); + if (sig_len > max_sig_len) { + OPENSSL_PUT_ERROR(EC, EC_R_BUFFER_TOO_SMALL); + return 0; + } + + // Compute r, the x-coordinate of k * generator. + EC_JACOBIAN tmp_point; + EC_SCALAR r; + if (!ec_point_mul_scalar_base(group, &tmp_point, k) || + !ec_get_x_coordinate_as_scalar(group, &r, &tmp_point)) { + return 0; + } + + if (constant_time_declassify_int(ec_scalar_is_zero(group, &r))) { + *out_retry = 1; + return 0; + } + + // s = priv_key * r. Note if only one parameter is in the Montgomery domain, + // |ec_scalar_mod_mul_montgomery| will compute the answer in the normal + // domain. + EC_SCALAR s; + ec_scalar_to_montgomery(group, &s, &r); + ec_scalar_mul_montgomery(group, &s, priv_key, &s); + + // s = m + priv_key * r. + EC_SCALAR tmp; + digest_to_scalar(group, &tmp, digest, digest_len); + ec_scalar_add(group, &s, &s, &tmp); + + // s = k^-1 * (m + priv_key * r). First, we compute k^-1 in the Montgomery + // domain. 
This is |ec_scalar_to_montgomery| followed by + // |ec_scalar_inv0_montgomery|, but |ec_scalar_inv0_montgomery| followed by + // |ec_scalar_from_montgomery| is equivalent and slightly more efficient. + // Then, as above, only one parameter is in the Montgomery domain, so the + // result is in the normal domain. Finally, note k is non-zero (or computing r + // would fail), so the inverse must exist. + ec_scalar_inv0_montgomery(group, &tmp, k); // tmp = k^-1 R^2 + ec_scalar_from_montgomery(group, &tmp, &tmp); // tmp = k^-1 R + ec_scalar_mul_montgomery(group, &s, &s, &tmp); + if (constant_time_declassify_int(ec_scalar_is_zero(group, &s))) { + *out_retry = 1; + return 0; + } + + CONSTTIME_DECLASSIFY(r.words, sizeof(r.words)); + CONSTTIME_DECLASSIFY(s.words, sizeof(r.words)); + size_t len; + ec_scalar_to_bytes(group, sig, &len, &r); + assert(len == sig_len / 2); + ec_scalar_to_bytes(group, sig + len, &len, &s); + assert(len == sig_len / 2); + *out_sig_len = sig_len; + return 1; +} + +int bssl::ecdsa_sign_fixed_with_nonce_for_known_answer_test( + const uint8_t *digest, size_t digest_len, uint8_t *sig, size_t *out_sig_len, + size_t max_sig_len, const EC_KEY *eckey, const uint8_t *nonce, + size_t nonce_len) { + const ECKey *impl = FromOpaque(eckey); + + if (impl->ecdsa_meth && impl->ecdsa_meth->sign) { + OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_NOT_IMPLEMENTED); + return 0; + } + + const EC_GROUP *group = EC_KEY_get0_group(impl); + if (group == nullptr || impl->priv_key == nullptr) { + OPENSSL_PUT_ERROR(ECDSA, ERR_R_PASSED_NULL_PARAMETER); + return 0; + } + const EC_SCALAR *priv_key = &impl->priv_key->scalar; + + EC_SCALAR k; + if (!ec_scalar_from_bytes(group, &k, nonce, nonce_len)) { + return 0; + } + int retry_ignored; + return ecdsa_sign_impl(group, &retry_ignored, sig, out_sig_len, max_sig_len, + priv_key, &k, digest, digest_len); +} + +int bssl::ecdsa_sign_fixed(const uint8_t *digest, size_t digest_len, + uint8_t *sig, size_t *out_sig_len, + size_t max_sig_len, const EC_KEY 
*eckey) { + const ECKey *impl = FromOpaque(eckey); + + boringssl_ensure_ecc_self_test(); + + if (impl->ecdsa_meth && impl->ecdsa_meth->sign) { + OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_NOT_IMPLEMENTED); + return 0; + } + + const EC_GROUP *group = EC_KEY_get0_group(impl); + if (group == nullptr || impl->priv_key == nullptr) { + OPENSSL_PUT_ERROR(ECDSA, ERR_R_PASSED_NULL_PARAMETER); + return 0; + } + const BIGNUM *order = EC_GROUP_get0_order(group); + const EC_SCALAR *priv_key = &impl->priv_key->scalar; + + // Generate the ECDSA per-message secret number by rejection sampling. This + // function implements FIPS 186-5, A.3.2, repeating the process on failure. + + // Check the group order is large enough. See step 1 of FIPS 186-5, A.3.2. + if (BN_num_bits(order) < 224) { + OPENSSL_PUT_ERROR(EC, EC_R_INVALID_GROUP_ORDER); + return 0; + } + + // Pass a SHA512 hash of the private key and digest as additional data + // into the RBG. This is a hardening measure against entropy failure. + static_assert(SHA512_DIGEST_LENGTH >= 32, + "additional_data is too large for SHA-512"); + + FIPS_service_indicator_lock_state(); + + SHA512_CTX sha; + uint8_t additional_data[SHA512_DIGEST_LENGTH]; + BCM_sha512_init(&sha); + BCM_sha512_update(&sha, priv_key->words, order->width * sizeof(BN_ULONG)); + BCM_sha512_update(&sha, digest, digest_len); + BCM_sha512_final(additional_data, &sha); + + // Cap iterations so callers who supply invalid values as custom groups do not + // infinite loop. This does not impact valid parameters (e.g. those covered by + // FIPS) because the probability of requiring even one retry is negligible, + // let alone 32. + static const int kMaxIterations = 32; + int ret = 0; + int iters = 0; + for (;;) { + EC_SCALAR k; + if (!ec_random_nonzero_scalar(group, &k, additional_data)) { + goto out; + } + + // TODO(davidben): Move this inside |ec_random_nonzero_scalar| or lower, so + // that all scalars we generate are, by default, secret. 
+ CONSTTIME_SECRET(k.words, sizeof(k.words)); + + int retry; + ret = ecdsa_sign_impl(group, &retry, sig, out_sig_len, max_sig_len, + priv_key, &k, digest, digest_len); + if (ret || !retry) { + goto out; + } + + iters++; + if (iters > kMaxIterations) { + OPENSSL_PUT_ERROR(ECDSA, ECDSA_R_TOO_MANY_ITERATIONS); + goto out; + } + } + +out: + FIPS_service_indicator_unlock_state(); + return ret; +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/ecdsa/internal.h b/third_party/boringssl/src/crypto/fipsmodule/ecdsa/internal.h index 645959fb..622b688c 100644 --- a/third_party/boringssl/src/crypto/fipsmodule/ecdsa/internal.h +++ b/third_party/boringssl/src/crypto/fipsmodule/ecdsa/internal.h @@ -1,45 +1,57 @@ -/* Copyright (c) 2021, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ +// Copyright 2021 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. #ifndef OPENSSL_HEADER_CRYPTO_FIPSMODULE_ECDSA_INTERNAL_H #define OPENSSL_HEADER_CRYPTO_FIPSMODULE_ECDSA_INTERNAL_H #include -#if defined(__cplusplus) -extern "C" { -#endif +#include "../ec/internal.h" -// ecdsa_sign_with_nonce_for_known_answer_test behaves like |ECDSA_do_sign| but -// takes a fixed nonce. This function is used as part of known-answer tests in -// the FIPS module. -ECDSA_SIG *ecdsa_sign_with_nonce_for_known_answer_test(const uint8_t *digest, - size_t digest_len, - const EC_KEY *eckey, - const uint8_t *nonce, - size_t nonce_len); +BSSL_NAMESPACE_BEGIN -// ecdsa_do_verify_no_self_test does the same as |ECDSA_do_verify|, but doesn't +// ECDSA_MAX_FIXED_LEN is the maximum length of an ECDSA signature in the +// fixed-width, big-endian format from IEEE P1363. +#define ECDSA_MAX_FIXED_LEN (2 * EC_MAX_BYTES) + +// ecdsa_sign_fixed behaves like |ECDSA_sign| but uses the fixed-width, +// big-endian format from IEEE P1363. +int ecdsa_sign_fixed(const uint8_t *digest, size_t digest_len, uint8_t *sig, + size_t *out_sig_len, size_t max_sig_len, + const EC_KEY *key); + +// ecdsa_sign_fixed_with_nonce_for_known_answer_test behaves like +// |ecdsa_sign_fixed| but takes a caller-supplied nonce. This function is used +// as part of known-answer tests in the FIPS module. +int ecdsa_sign_fixed_with_nonce_for_known_answer_test( + const uint8_t *digest, size_t digest_len, uint8_t *sig, size_t *out_sig_len, + size_t max_sig_len, const EC_KEY *key, const uint8_t *nonce, + size_t nonce_len); + +// ecdsa_verify_fixed behaves like |ECDSA_verify| but uses the fixed-width, +// big-endian format from IEEE P1363. +int ecdsa_verify_fixed(const uint8_t *digest, size_t digest_len, + const uint8_t *sig, size_t sig_len, const EC_KEY *key); + +// ecdsa_verify_fixed_no_self_test behaves like ecdsa_verify_fixed, but doesn't // try to run the self-test first. 
This is for use in the self tests themselves, // to prevent an infinite loop. -int ecdsa_do_verify_no_self_test(const uint8_t *digest, size_t digest_len, - const ECDSA_SIG *sig, const EC_KEY *eckey); - +int ecdsa_verify_fixed_no_self_test(const uint8_t *digest, size_t digest_len, + const uint8_t *sig, size_t sig_len, + const EC_KEY *key); -#if defined(__cplusplus) -} -#endif +BSSL_NAMESPACE_END #endif // OPENSSL_HEADER_CRYPTO_FIPSMODULE_ECDSA_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/fipsmodule/entropy/internal.h b/third_party/boringssl/src/crypto/fipsmodule/entropy/internal.h new file mode 100644 index 00000000..a9f2a814 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/entropy/internal.h @@ -0,0 +1,38 @@ +// Copyright 2025 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_CRYPTO_FIPSMODULE_ENTROPY_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_FIPSMODULE_ENTROPY_INTERNAL_H + +#include + +#if defined(OPENSSL_LINUX) || defined(OPENSSL_MACOS) + +BSSL_NAMESPACE_BEGIN +namespace entropy { + +// GetSeed fills `out` with random bytes from the jitter source. +OPENSSL_EXPORT bool GetSeed(uint8_t out[48]); + +// GetSamples fetches `n` raw delta time samples. +OPENSSL_EXPORT bool GetSamples(uint64_t *out, size_t n); + +// GetVersion returns the version of the entropy module. 
+int GetVersion(); + +} // namespace entropy +BSSL_NAMESPACE_END + +#endif // LINUX || MACOS +#endif // OPENSSL_HEADER_CRYPTO_FIPSMODULE_ENTROPY_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/fipsmodule/entropy/jitter.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/entropy/jitter.cc.inc new file mode 100644 index 00000000..77d278bb --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/entropy/jitter.cc.inc @@ -0,0 +1,463 @@ +// Copyright 2025 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// BoringCrypto Jitter Entropy version 20250725. + +#include + +#if defined(OPENSSL_LINUX) || defined(OPENSSL_MACOS) + +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include "internal.h" +#include "sha512.cc.inc" + +#if defined(__x86_64__) +#include +#endif + + +BSSL_NAMESPACE_BEGIN +namespace entropy { +namespace { + +#if defined(__x86_64__) +static uint64_t GetTimestamp() { return _rdtsc(); } +#elif defined(__aarch64__) +static uint64_t GetTimestamp() { + // Ideally this would use __arm_rsr64 from . Clang has supported + // it Clang 3.7 (2016), but GCC did not add it until GCC 14.1.0 (2024). See + // https://crbug.com/440670941. When our minimum GCC is past that point, + // switch this back to __arm_rsr64. 
+ uint64_t ret; + __asm__ volatile("mrs %0, cntvct_el0" : "=r"(ret)); + return ret; +} +#else +static uint64_t GetTimestamp() { + struct timespec ts; + clock_gettime(CLOCK_MONOTONIC_RAW, &ts); + return ts.tv_sec * 1000000000ULL + ts.tv_nsec; +} +#endif + +class MemoryOffsetLCG { + public: + MemoryOffsetLCG() : state(GetTimestamp() & 0xFFFFFFFF) {} + uint32_t Next() { + state = state * 1664525 + 1013904223; + return state; + } + + private: + uint32_t state; +}; + +class MemoryAccessSampler { + public: + MemoryAccessSampler(size_t array_size, unsigned num_samples) + : array_size_(array_size), + num_samples_(num_samples), + array_(reinterpret_cast(OPENSSL_malloc(array_size_))) { + if (array_ == nullptr || // + array_size_ == 0 || // + array_size_ > (1u << 26) || // + array_size_ & (array_size_ - 1)) { + abort(); + } + } + + ~MemoryAccessSampler() { OPENSSL_free(const_cast(array_)); } + + MemoryAccessSampler(const MemoryAccessSampler &) = delete; + MemoryAccessSampler &operator=(const MemoryAccessSampler &) = delete; + + MemoryAccessSampler(MemoryAccessSampler &&other) + : array_size_(other.array_size_), + num_samples_(other.num_samples_), + lcg_(other.lcg_), + array_(other.array_) { + other.array_ = nullptr; + } + + bool Next(uint64_t *out) { + // Perform some memory accesses and measure how long it took. The LCG is + // intended to defeat any CPU predictors and thus expose this code to as + // much system entropy as possible. + for (unsigned i = 0; i < num_samples_; i++) { + // The lower bits of an LCG tend to fall into short cycles and so are + // discarded here. 
+ array_[(lcg_.Next() >> 6) & (array_size_ - 1)] += 1; + } + + *out = GetTimestamp(); + return true; + } + + private: + const size_t array_size_; + const unsigned num_samples_; + MemoryOffsetLCG lcg_; + volatile uint8_t *array_; +}; + +template +class DeltaSampler { + public: + explicit DeltaSampler(T &&sub_sampler) + : sub_sampler_(std::forward(sub_sampler)) {} + + // Next function to return the delta between two subsequent samples + bool Next(uint64_t *out) { + uint64_t sample; + if (!sub_sampler_.Next(&sample)) { + return false; + } + + if (!initialized_) { + last_sample_ = sample; + if (!sub_sampler_.Next(&sample)) { + return false; + } + initialized_ = true; + } + + *out = sample - last_sample_; + last_sample_ = sample; + return true; + } + + private: + bool initialized_ = false; + T sub_sampler_; + uint64_t last_sample_ = 0; +}; + +template +DeltaSampler(U &&sub_sampler) -> DeltaSampler>; + +template +class MaskSampler { + public: + explicit MaskSampler(uint8_t mask, T &&sub_sampler) + : mask_(mask), sub_sampler_(std::forward(sub_sampler)) {} + + bool Next(uint8_t *out) { + uint64_t sample; + if (!sub_sampler_.Next(&sample)) { + return false; + } + + *out = sample & mask_; + return true; + } + + private: + const uint8_t mask_; + T sub_sampler_; +}; + +template +MaskSampler(uint8_t mask, U &&sub_sampler) -> MaskSampler>; + +// The estimated entropy per sample from MaskSampler. +constexpr float kH = 0.8; + +// kAlphaLog2 is log_2(alpha), where alpha is the standard false-positive +// probability from SP 800-90B. +static constexpr float kAlphaLog2 = -20; +// kAlpha is the variable of the same name from section 4.4.1 of SP 800-90B. +constexpr float kAlpha = 1.0 / (1 << static_cast(-kAlphaLog2)); + +// Ceil rounds up its non-negative argument to the next integer. (std::ceil +// isn't constexpr until C++23.) 
+constexpr unsigned Ceil(float val) { + auto truncated = static_cast(val); + if (val == static_cast(truncated)) { + return truncated; + } + if (val > 0) { + return truncated + 1; + } + __builtin_unreachable(); +} + +template +class RepetitionCountTest { + public: + static constexpr unsigned kThreshold = 1 + Ceil(-kAlphaLog2 / kH); + static_assert(kThreshold == 26); + + explicit RepetitionCountTest(T &&sub_sampler) + : sub_sampler_(std::forward(sub_sampler)) {} + + bool Next(uint8_t *out) { + uint8_t sample; + if (!sub_sampler_.Next(&sample)) { + return false; + } + if (sample == last_sample_) { + count_++; + } else { + count_ = 1; + last_sample_ = sample; + } + if (count_ >= kThreshold) { + return false; + } + *out = sample; + return true; + } + + private: + T sub_sampler_; + unsigned count_ = 0; + uint8_t last_sample_ = 0; +}; + +template +RepetitionCountTest(U &&sub_sampler) -> RepetitionCountTest>; + +constexpr double BinomialPMF(int64_t k, int64_t n, double p) { + if (k < 0 || k > n) { + return 0.0; + } + + double result = 1.0; + for (int64_t i = 0; i < k; ++i) { + result *= (n - i); + result /= (i + 1); + } + + for (int64_t i = 0; i < k; ++i) { + result *= p; + } + for (int64_t i = 0; i < n - k; ++i) { + result *= (1 - p); + } + + return result; +} + +// CritBinom implements the Excel function of the same name. +constexpr unsigned CritBinom(unsigned trials, double probability_s, + double alpha) { + if (probability_s < 0.0 || probability_s > 1.0 || alpha < 0.0 || + alpha > 1.0) { + __builtin_unreachable(); + } + + double cumulative = 0.0; + for (unsigned k = 0; k <= trials; ++k) { + cumulative += BinomialPMF(k, trials, probability_s); + if (cumulative >= alpha) { + return k; + } + } + + return trials; +} + +// ExpTaylor calculates e^x using the Taylor series: e^x = 1 + x + x²/2! + +// x³/3! + ... 
+constexpr double ExpTaylor(double x) { + double sum = 1.0; + double term = 1.0; + + for (int i = 1; i < 25; ++i) { + term *= x / i; + sum += term; + } + + return sum; +} + +// Power2 calculates 2^exp by calculating e^(ln(2) * exp) = e^(ln(2)) ^ exp = +// 2^exp. (std::pow isn't constexpr until C++26.) +constexpr double Power2(double exp) { + constexpr double ln2 = 0.693147180559945309417232121458; + return ExpTaylor(exp * ln2); +} + +// AdaptiveProportionTestCutoff implements the function from the footnote on +// page 27 of SP 800-90B. +constexpr unsigned AdaptiveProportionTestCutoff(unsigned W, float H, + float alpha) { + return 1 + CritBinom(W, Power2(-H), 1.0 - alpha); +} + +// These are the example values from table 2 of SP 800-90B, to show that +// we're calculating the values correctly. +static_assert(AdaptiveProportionTestCutoff(512, 0.5, kAlpha) == 410); +static_assert(AdaptiveProportionTestCutoff(512, 1, kAlpha) == 311); +static_assert(AdaptiveProportionTestCutoff(512, 2, kAlpha) == 177); +static_assert(AdaptiveProportionTestCutoff(512, 4, kAlpha) == 62); +static_assert(AdaptiveProportionTestCutoff(512, 8, kAlpha) == 13); + +template +class AdaptiveProportionTest { + public: + // The size of the sliding window, representing the number of recent samples + // to analyze. + static constexpr unsigned kWindowSize = 512; + + // The maximum number of times any single byte value is allowed to appear + // within the sliding window. 
+ static constexpr unsigned kThreshold = + AdaptiveProportionTestCutoff(kWindowSize, kH, kAlpha); + static_assert(kThreshold == 348); + + explicit AdaptiveProportionTest(T &&sub_sampler) + : sub_sampler_(std::forward(sub_sampler)) { + counts_.fill(0); + } + + bool Next(uint8_t *out) { + uint8_t sample; + if (!sub_sampler_.Next(&sample)) { + return false; + } + *out = sample; + + if (samples_processed_ >= kWindowSize) { + const uint8_t evicted_sample = buffer_[buffer_idx_]; + counts_[evicted_sample]--; + } + + buffer_[buffer_idx_] = sample; + const uint16_t new_count = ++counts_[sample]; + + if (new_count > kThreshold) { + return false; + } + + buffer_idx_ = (buffer_idx_ + 1) % kWindowSize; + samples_processed_++; + + return true; + } + + private: + T sub_sampler_; + + // A circular buffer to store the most recent `kWindowSize` samples. + std::array buffer_{}; + + // An array to store the frequency counts of each possible byte value (0-255) + // within the current window. + std::array counts_{}; + + // The current index for writing into the circular buffer. + size_t buffer_idx_ = 0; + + // The total number of samples processed. Used to determine when the buffer + // is full and eviction should begin. + size_t samples_processed_ = 0; +}; + +template +AdaptiveProportionTest(U &&sub_sampler) + -> AdaptiveProportionTest>; + +template +class SeedSampler { + public: + // NIST requires 1024 samples at start-up time. This code is structured so + // that the entropy generator is considered to be starting afresh for each + // seed. 
+ static constexpr unsigned kNumSamples = 1024; + + explicit SeedSampler(T &&sub_sampler) + : sub_sampler_(std::forward(sub_sampler)) {} + + bool Next(uint8_t out_seed[48]) { + // HMAC-SHA384 `kNumSamples` samples with an all-zero key: + SHA512_CTX ctx; + SHA384_Init(&ctx); + + uint8_t block[kSHA384Block]; + memset(block, 0x36, sizeof(block)); + SHA384_Update(&ctx, block, sizeof(block)); + + static_assert(kNumSamples % sizeof(block) == 0); + for (unsigned i = 0; i < kNumSamples / sizeof(block); i++) { + for (unsigned j = 0; j < sizeof(block); j++) { + if (!sub_sampler_.Next(&block[j])) { + return false; + } + } + SHA384_Update(&ctx, block, sizeof(block)); + } + + SHA384_Final(out_seed, &ctx); + + SHA384_Init(&ctx); + memset(block, 0x5c, sizeof(block)); + SHA384_Update(&ctx, block, sizeof(block)); + SHA384_Update(&ctx, out_seed, kSHA384DigestLength); + SHA384_Final(out_seed, &ctx); + + return true; + } + + private: + T sub_sampler_; +}; + +template +SeedSampler(U &&sub_sampler) -> SeedSampler>; + +constexpr size_t kMemorySize = 1u << 25; +constexpr size_t kMemoryAccessesPerSample = 16; +constexpr size_t kBitsPerSample = 8; +constexpr uint8_t kMask = (1u << kBitsPerSample) - 1; + +} // namespace + +int GetVersion() { return 20250725; } + +bool GetSeed(uint8_t out[48]) { + auto sampler(SeedSampler(AdaptiveProportionTest(RepetitionCountTest( + MaskSampler(kMask, DeltaSampler(MemoryAccessSampler( + kMemorySize, kMemoryAccessesPerSample))))))); + return sampler.Next(out); +} + +bool GetSamples(uint64_t *out, size_t n) { + auto sampler( + DeltaSampler(MemoryAccessSampler(kMemorySize, kMemoryAccessesPerSample))); + for (size_t i = 0; i < n; i++) { + if (!sampler.Next(&out[i])) { + return false; + } + } + return true; +} + +} // namespace entropy +BSSL_NAMESPACE_END + +#endif // LINUX || MACOS diff --git a/third_party/boringssl/src/crypto/fipsmodule/entropy/sha512.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/entropy/sha512.cc.inc new file mode 100644 index 
00000000..53512a86 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/entropy/sha512.cc.inc @@ -0,0 +1,329 @@ +// Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include +#include + + +// This is a copy of the SHA-384 code for the purpose of isolating the jitter +// entropy source certification from any changes to the normal implementation. + +BSSL_NAMESPACE_BEGIN +namespace entropy { +namespace { + +constexpr size_t kSHA384Block = 128; +constexpr size_t kSHA384DigestLength = (384 / 8); + +struct SHA512_CTX { + uint64_t h[8]; + uint64_t Nl, Nh; + uint8_t p[kSHA384Block]; + unsigned num, md_len; +}; + +uint64_t CRYPTO_bswap8(uint64_t x) { return __builtin_bswap64(x); } + +uint64_t CRYPTO_load_u64_be(const void *ptr) { + uint64_t ret; + memcpy(&ret, ptr, sizeof(ret)); + return CRYPTO_bswap8(ret); +} + +void CRYPTO_store_u64_be(void *out, uint64_t v) { + v = CRYPTO_bswap8(v); + memcpy(out, &v, sizeof(v)); +} + +uint64_t CRYPTO_rotr_u64(uint64_t value, int shift) { + return (value >> shift) | (value << ((-shift) & 63)); +} + +void sha512_update(SHA512_CTX *c, const void *in_data, size_t len); +void sha512_final_impl(uint8_t *out, size_t md_len, SHA512_CTX *sha); + +void SHA384_Init(SHA512_CTX *sha) { + sha->h[0] = UINT64_C(0xcbbb9d5dc1059ed8); + sha->h[1] = UINT64_C(0x629a292a367cd507); + sha->h[2] = UINT64_C(0x9159015a3070dd17); + sha->h[3] = 
UINT64_C(0x152fecd8f70e5939); + sha->h[4] = UINT64_C(0x67332667ffc00b31); + sha->h[5] = UINT64_C(0x8eb44a8768581511); + sha->h[6] = UINT64_C(0xdb0c2e0d64f98fa7); + sha->h[7] = UINT64_C(0x47b5481dbefa4fa4); + + sha->Nl = 0; + sha->Nh = 0; + sha->num = 0; + sha->md_len = kSHA384DigestLength; + return; +} + +void SHA384_Final(uint8_t out[kSHA384DigestLength], SHA512_CTX *sha) { + // This function must be paired with |SHA384_Init|, which sets + // |sha->md_len| to |kSHA384DigestLength|. + sha512_final_impl(out, kSHA384DigestLength, sha); + return; +} + +void SHA384_Update(SHA512_CTX *sha, const void *data, size_t len) { + return sha512_update(sha, data, len); +} + +void sha512_block_data_order(uint64_t state[8], const uint8_t *in, + size_t num_blocks); + +void sha512_final_impl(uint8_t *out, size_t md_len, SHA512_CTX *sha) { + uint8_t *p = sha->p; + size_t n = sha->num; + + p[n] = 0x80; // There always is a room for one + n++; + if (n > (sizeof(sha->p) - 16)) { + memset(p + n, 0, sizeof(sha->p) - n); + n = 0; + sha512_block_data_order(sha->h, p, 1); + } + + memset(p + n, 0, sizeof(sha->p) - 16 - n); + CRYPTO_store_u64_be(p + sizeof(sha->p) - 16, sha->Nh); + CRYPTO_store_u64_be(p + sizeof(sha->p) - 8, sha->Nl); + + sha512_block_data_order(sha->h, p, 1); + + const size_t out_words = md_len / 8; + for (size_t i = 0; i < out_words; i++) { + CRYPTO_store_u64_be(out, sha->h[i]); + out += 8; + } +} + +const uint64_t K512[80] = { + UINT64_C(0x428a2f98d728ae22), UINT64_C(0x7137449123ef65cd), + UINT64_C(0xb5c0fbcfec4d3b2f), UINT64_C(0xe9b5dba58189dbbc), + UINT64_C(0x3956c25bf348b538), UINT64_C(0x59f111f1b605d019), + UINT64_C(0x923f82a4af194f9b), UINT64_C(0xab1c5ed5da6d8118), + UINT64_C(0xd807aa98a3030242), UINT64_C(0x12835b0145706fbe), + UINT64_C(0x243185be4ee4b28c), UINT64_C(0x550c7dc3d5ffb4e2), + UINT64_C(0x72be5d74f27b896f), UINT64_C(0x80deb1fe3b1696b1), + UINT64_C(0x9bdc06a725c71235), UINT64_C(0xc19bf174cf692694), + UINT64_C(0xe49b69c19ef14ad2), UINT64_C(0xefbe4786384f25e3), 
+ UINT64_C(0x0fc19dc68b8cd5b5), UINT64_C(0x240ca1cc77ac9c65), + UINT64_C(0x2de92c6f592b0275), UINT64_C(0x4a7484aa6ea6e483), + UINT64_C(0x5cb0a9dcbd41fbd4), UINT64_C(0x76f988da831153b5), + UINT64_C(0x983e5152ee66dfab), UINT64_C(0xa831c66d2db43210), + UINT64_C(0xb00327c898fb213f), UINT64_C(0xbf597fc7beef0ee4), + UINT64_C(0xc6e00bf33da88fc2), UINT64_C(0xd5a79147930aa725), + UINT64_C(0x06ca6351e003826f), UINT64_C(0x142929670a0e6e70), + UINT64_C(0x27b70a8546d22ffc), UINT64_C(0x2e1b21385c26c926), + UINT64_C(0x4d2c6dfc5ac42aed), UINT64_C(0x53380d139d95b3df), + UINT64_C(0x650a73548baf63de), UINT64_C(0x766a0abb3c77b2a8), + UINT64_C(0x81c2c92e47edaee6), UINT64_C(0x92722c851482353b), + UINT64_C(0xa2bfe8a14cf10364), UINT64_C(0xa81a664bbc423001), + UINT64_C(0xc24b8b70d0f89791), UINT64_C(0xc76c51a30654be30), + UINT64_C(0xd192e819d6ef5218), UINT64_C(0xd69906245565a910), + UINT64_C(0xf40e35855771202a), UINT64_C(0x106aa07032bbd1b8), + UINT64_C(0x19a4c116b8d2d0c8), UINT64_C(0x1e376c085141ab53), + UINT64_C(0x2748774cdf8eeb99), UINT64_C(0x34b0bcb5e19b48a8), + UINT64_C(0x391c0cb3c5c95a63), UINT64_C(0x4ed8aa4ae3418acb), + UINT64_C(0x5b9cca4f7763e373), UINT64_C(0x682e6ff3d6b2b8a3), + UINT64_C(0x748f82ee5defb2fc), UINT64_C(0x78a5636f43172f60), + UINT64_C(0x84c87814a1f0ab72), UINT64_C(0x8cc702081a6439ec), + UINT64_C(0x90befffa23631e28), UINT64_C(0xa4506cebde82bde9), + UINT64_C(0xbef9a3f7b2c67915), UINT64_C(0xc67178f2e372532b), + UINT64_C(0xca273eceea26619c), UINT64_C(0xd186b8c721c0c207), + UINT64_C(0xeada7dd6cde0eb1e), UINT64_C(0xf57d4f7fee6ed178), + UINT64_C(0x06f067aa72176fba), UINT64_C(0x0a637dc5a2c898a6), + UINT64_C(0x113f9804bef90dae), UINT64_C(0x1b710b35131c471b), + UINT64_C(0x28db77f523047d84), UINT64_C(0x32caab7b40c72493), + UINT64_C(0x3c9ebe0a15c9bebc), UINT64_C(0x431d67c49c100d4c), + UINT64_C(0x4cc5d4becb3e42b6), UINT64_C(0x597f299cfc657e2a), + UINT64_C(0x5fcb6fab3ad6faec), UINT64_C(0x6c44198c4a475817), +}; + +#define Sigma0(x) \ + (CRYPTO_rotr_u64((x), 28) ^ CRYPTO_rotr_u64((x), 
34) ^ \ + CRYPTO_rotr_u64((x), 39)) +#define Sigma1(x) \ + (CRYPTO_rotr_u64((x), 14) ^ CRYPTO_rotr_u64((x), 18) ^ \ + CRYPTO_rotr_u64((x), 41)) +#define sigma0(x) \ + (CRYPTO_rotr_u64((x), 1) ^ CRYPTO_rotr_u64((x), 8) ^ ((x) >> 7)) +#define sigma1(x) \ + (CRYPTO_rotr_u64((x), 19) ^ CRYPTO_rotr_u64((x), 61) ^ ((x) >> 6)) + +#define Ch(x, y, z) (((x) & (y)) ^ ((~(x)) & (z))) +#define Maj(x, y, z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) + +#define ROUND_00_15(i, a, b, c, d, e, f, g, h) \ + do { \ + T1 += h + Sigma1(e) + Ch(e, f, g) + K512[i]; \ + h = Sigma0(a) + Maj(a, b, c); \ + d += T1; \ + h += T1; \ + } while (0) + +#define ROUND_16_80(i, j, a, b, c, d, e, f, g, h, X) \ + do { \ + s0 = X[(j + 1) & 0x0f]; \ + s0 = sigma0(s0); \ + s1 = X[(j + 14) & 0x0f]; \ + s1 = sigma1(s1); \ + T1 = X[(j) & 0x0f] += s0 + s1 + X[(j + 9) & 0x0f]; \ + ROUND_00_15(i + j, a, b, c, d, e, f, g, h); \ + } while (0) + +void sha512_block_data_order(uint64_t state[8], const uint8_t *in, size_t num) { + uint64_t a, b, c, d, e, f, g, h, s0, s1, T1; + uint64_t X[16]; + int i; + + while (num--) { + a = state[0]; + b = state[1]; + c = state[2]; + d = state[3]; + e = state[4]; + f = state[5]; + g = state[6]; + h = state[7]; + + T1 = X[0] = CRYPTO_load_u64_be(in); + ROUND_00_15(0, a, b, c, d, e, f, g, h); + T1 = X[1] = CRYPTO_load_u64_be(in + 8); + ROUND_00_15(1, h, a, b, c, d, e, f, g); + T1 = X[2] = CRYPTO_load_u64_be(in + 2 * 8); + ROUND_00_15(2, g, h, a, b, c, d, e, f); + T1 = X[3] = CRYPTO_load_u64_be(in + 3 * 8); + ROUND_00_15(3, f, g, h, a, b, c, d, e); + T1 = X[4] = CRYPTO_load_u64_be(in + 4 * 8); + ROUND_00_15(4, e, f, g, h, a, b, c, d); + T1 = X[5] = CRYPTO_load_u64_be(in + 5 * 8); + ROUND_00_15(5, d, e, f, g, h, a, b, c); + T1 = X[6] = CRYPTO_load_u64_be(in + 6 * 8); + ROUND_00_15(6, c, d, e, f, g, h, a, b); + T1 = X[7] = CRYPTO_load_u64_be(in + 7 * 8); + ROUND_00_15(7, b, c, d, e, f, g, h, a); + T1 = X[8] = CRYPTO_load_u64_be(in + 8 * 8); + ROUND_00_15(8, a, b, c, d, e, f, g, h); + T1 
= X[9] = CRYPTO_load_u64_be(in + 9 * 8); + ROUND_00_15(9, h, a, b, c, d, e, f, g); + T1 = X[10] = CRYPTO_load_u64_be(in + 10 * 8); + ROUND_00_15(10, g, h, a, b, c, d, e, f); + T1 = X[11] = CRYPTO_load_u64_be(in + 11 * 8); + ROUND_00_15(11, f, g, h, a, b, c, d, e); + T1 = X[12] = CRYPTO_load_u64_be(in + 12 * 8); + ROUND_00_15(12, e, f, g, h, a, b, c, d); + T1 = X[13] = CRYPTO_load_u64_be(in + 13 * 8); + ROUND_00_15(13, d, e, f, g, h, a, b, c); + T1 = X[14] = CRYPTO_load_u64_be(in + 14 * 8); + ROUND_00_15(14, c, d, e, f, g, h, a, b); + T1 = X[15] = CRYPTO_load_u64_be(in + 15 * 8); + ROUND_00_15(15, b, c, d, e, f, g, h, a); + + for (i = 16; i < 80; i += 16) { + ROUND_16_80(i, 0, a, b, c, d, e, f, g, h, X); + ROUND_16_80(i, 1, h, a, b, c, d, e, f, g, X); + ROUND_16_80(i, 2, g, h, a, b, c, d, e, f, X); + ROUND_16_80(i, 3, f, g, h, a, b, c, d, e, X); + ROUND_16_80(i, 4, e, f, g, h, a, b, c, d, X); + ROUND_16_80(i, 5, d, e, f, g, h, a, b, c, X); + ROUND_16_80(i, 6, c, d, e, f, g, h, a, b, X); + ROUND_16_80(i, 7, b, c, d, e, f, g, h, a, X); + ROUND_16_80(i, 8, a, b, c, d, e, f, g, h, X); + ROUND_16_80(i, 9, h, a, b, c, d, e, f, g, X); + ROUND_16_80(i, 10, g, h, a, b, c, d, e, f, X); + ROUND_16_80(i, 11, f, g, h, a, b, c, d, e, X); + ROUND_16_80(i, 12, e, f, g, h, a, b, c, d, X); + ROUND_16_80(i, 13, d, e, f, g, h, a, b, c, X); + ROUND_16_80(i, 14, c, d, e, f, g, h, a, b, X); + ROUND_16_80(i, 15, b, c, d, e, f, g, h, a, X); + } + + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + state[4] += e; + state[5] += f; + state[6] += g; + state[7] += h; + + in += 16 * 8; + } +} + +#undef Sigma0 +#undef Sigma1 +#undef sigma0 +#undef sigma1 +#undef Ch +#undef Maj +#undef ROUND_00_15 +#undef ROUND_16_80 + +void sha512_update(SHA512_CTX *c, const void *in_data, size_t len) { + uint64_t l; + uint8_t *p = c->p; + const uint8_t *data = reinterpret_cast(in_data); + + if (len == 0) { + return; + } + + l = (c->Nl + (((uint64_t)len) << 3)) & UINT64_C(0xffffffffffffffff); + 
if (l < c->Nl) { + c->Nh++; + } + if (sizeof(len) >= 8) { + c->Nh += (((uint64_t)len) >> 61); + } + c->Nl = l; + + if (c->num != 0) { + size_t n = sizeof(c->p) - c->num; + + if (len < n) { + memcpy(p + c->num, data, len); + c->num += (unsigned int)len; + return; + } else { + memcpy(p + c->num, data, n), c->num = 0; + len -= n; + data += n; + sha512_block_data_order(c->h, p, 1); + } + } + + if (len >= sizeof(c->p)) { + sha512_block_data_order(c->h, data, len / sizeof(c->p)); + data += len; + len %= sizeof(c->p); + data -= len; + } + + if (len != 0) { + memcpy(p, data, len); + c->num = (int)len; + } + + return; +} + +} // namespace +} // namespace entropy +BSSL_NAMESPACE_END diff --git a/third_party/boringssl/src/crypto/fipsmodule/fips_shared_support.c b/third_party/boringssl/src/crypto/fipsmodule/fips_shared_support.c deleted file mode 100644 index 2a66a1f0..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/fips_shared_support.c +++ /dev/null @@ -1,32 +0,0 @@ -/* Copyright (c) 2019, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ - -#include - - -#if defined(BORINGSSL_FIPS) && defined(BORINGSSL_SHARED_LIBRARY) -// BORINGSSL_bcm_text_hash is is default hash value for the FIPS integrity check -// that must be replaced with the real value during the build process. This -// value need only be distinct, i.e. so that we can safely search-and-replace it -// in an object file. -const uint8_t BORINGSSL_bcm_text_hash[64]; -const uint8_t BORINGSSL_bcm_text_hash[64] = { - 0xae, 0x2c, 0xea, 0x2a, 0xbd, 0xa6, 0xf3, 0xec, 0x97, 0x7f, 0x9b, - 0xf6, 0x94, 0x9a, 0xfc, 0x83, 0x68, 0x27, 0xcb, 0xa0, 0xa0, 0x9f, - 0x6b, 0x6f, 0xde, 0x52, 0xcd, 0xe2, 0xcd, 0xff, 0x31, 0x80, 0xa2, - 0xd4, 0xc3, 0x66, 0x0f, 0xc2, 0x6a, 0x7b, 0xf4, 0xbe, 0x39, 0xa2, - 0xd7, 0x25, 0xdb, 0x21, 0x98, 0xe9, 0xd5, 0x53, 0xbf, 0x5c, 0x32, - 0x06, 0x83, 0x34, 0x0c, 0x65, 0x89, 0x52, 0xbd, 0x1f, -}; -#endif // FIPS && SHARED_LIBRARY diff --git a/third_party/boringssl/src/crypto/fipsmodule/fips_shared_support.cc b/third_party/boringssl/src/crypto/fipsmodule/fips_shared_support.cc new file mode 100644 index 00000000..f1f9d99f --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/fips_shared_support.cc @@ -0,0 +1,28 @@ +// Copyright 2019 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + + +#if defined(BORINGSSL_FIPS) && defined(BORINGSSL_SHARED_LIBRARY) +// BORINGSSL_bcm_text_hash is is default hash value for the FIPS integrity check +// that must be replaced with the real value during the build process. This +// value need only be distinct, i.e. so that we can safely search-and-replace it +// in an object file. +extern const uint8_t BORINGSSL_bcm_text_hash[32] = { + 0xae, 0x2c, 0xea, 0x2a, 0xbd, 0xa6, 0xf3, 0xec, 0x97, 0x7f, 0x9b, + 0xf6, 0x94, 0x9a, 0xfc, 0x83, 0x68, 0x27, 0xcb, 0xa0, 0xa0, 0x9f, + 0x6b, 0x6f, 0xde, 0x52, 0xcd, 0xe2, 0xcd, 0xff, 0x31, 0x80, +}; +#endif // FIPS && SHARED_LIBRARY diff --git a/third_party/boringssl/src/crypto/fipsmodule/hkdf/hkdf.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/hkdf/hkdf.cc.inc new file mode 100644 index 00000000..dc868c1f --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/hkdf/hkdf.cc.inc @@ -0,0 +1,114 @@ +// Copyright 2014 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include + +#include +#include + +#include "../../internal.h" + + +using namespace bssl; + +int HKDF(uint8_t *out_key, size_t out_len, const EVP_MD *digest, + const uint8_t *secret, size_t secret_len, const uint8_t *salt, + size_t salt_len, const uint8_t *info, size_t info_len) { + // https://tools.ietf.org/html/rfc5869#section-2 + uint8_t prk[EVP_MAX_MD_SIZE]; + size_t prk_len; + + if (!HKDF_extract(prk, &prk_len, digest, secret, secret_len, salt, + salt_len) || + !HKDF_expand(out_key, out_len, digest, prk, prk_len, info, info_len)) { + return 0; + } + + return 1; +} + +int HKDF_extract(uint8_t *out_key, size_t *out_len, const EVP_MD *digest, + const uint8_t *secret, size_t secret_len, const uint8_t *salt, + size_t salt_len) { + // https://tools.ietf.org/html/rfc5869#section-2.2 + + // If salt is not given, HashLength zeros are used. However, HMAC does that + // internally already so we can ignore it. + unsigned len; + if (HMAC(digest, salt, salt_len, secret, secret_len, out_key, &len) == + nullptr) { + OPENSSL_PUT_ERROR(HKDF, ERR_R_HMAC_LIB); + return 0; + } + *out_len = len; + assert(*out_len == EVP_MD_size(digest)); + return 1; +} + +int HKDF_expand(uint8_t *out_key, size_t out_len, const EVP_MD *digest, + const uint8_t *prk, size_t prk_len, const uint8_t *info, + size_t info_len) { + // https://tools.ietf.org/html/rfc5869#section-2.3 + const size_t digest_len = EVP_MD_size(digest); + uint8_t previous[EVP_MAX_MD_SIZE]; + size_t n, done = 0; + unsigned i; + int ret = 0; + HMAC_CTX hmac; + + // Expand key material to desired length. 
+ n = (out_len + digest_len - 1) / digest_len; + if (out_len + digest_len < out_len || n > 255) { + OPENSSL_PUT_ERROR(HKDF, HKDF_R_OUTPUT_TOO_LARGE); + return 0; + } + + HMAC_CTX_init(&hmac); + if (!HMAC_Init_ex(&hmac, prk, prk_len, digest, nullptr)) { + goto out; + } + + for (i = 0; i < n; i++) { + uint8_t ctr = i + 1; + size_t todo; + + if (i != 0 && (!HMAC_Init_ex(&hmac, nullptr, 0, nullptr, nullptr) || + !HMAC_Update(&hmac, previous, digest_len))) { + goto out; + } + if (!HMAC_Update(&hmac, info, info_len) || !HMAC_Update(&hmac, &ctr, 1) || + !HMAC_Final(&hmac, previous, nullptr)) { + goto out; + } + + todo = digest_len; + if (todo > out_len - done) { + todo = out_len - done; + } + OPENSSL_memcpy(out_key + done, previous, todo); + done += todo; + } + + ret = 1; + +out: + HMAC_CTX_cleanup(&hmac); + if (ret != 1) { + OPENSSL_PUT_ERROR(HKDF, ERR_R_HMAC_LIB); + } + return ret; +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/hmac/hmac.c b/third_party/boringssl/src/crypto/fipsmodule/hmac/hmac.c deleted file mode 100644 index ca774bc0..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/hmac/hmac.c +++ /dev/null @@ -1,257 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. 
- * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include -#include - -#include -#include - -#include "../../internal.h" -#include "../service_indicator/internal.h" - - -uint8_t *HMAC(const EVP_MD *evp_md, const void *key, size_t key_len, - const uint8_t *data, size_t data_len, uint8_t *out, - unsigned int *out_len) { - HMAC_CTX ctx; - HMAC_CTX_init(&ctx); - - // The underlying hash functions should not set the FIPS service indicator - // until all operations have completed. 
- FIPS_service_indicator_lock_state(); - const int ok = HMAC_Init_ex(&ctx, key, key_len, evp_md, NULL) && - HMAC_Update(&ctx, data, data_len) && - HMAC_Final(&ctx, out, out_len); - FIPS_service_indicator_unlock_state(); - - HMAC_CTX_cleanup(&ctx); - - if (!ok) { - return NULL; - } - - HMAC_verify_service_indicator(evp_md); - return out; -} - -void HMAC_CTX_init(HMAC_CTX *ctx) { - ctx->md = NULL; - EVP_MD_CTX_init(&ctx->i_ctx); - EVP_MD_CTX_init(&ctx->o_ctx); - EVP_MD_CTX_init(&ctx->md_ctx); -} - -HMAC_CTX *HMAC_CTX_new(void) { - HMAC_CTX *ctx = OPENSSL_malloc(sizeof(HMAC_CTX)); - if (ctx != NULL) { - HMAC_CTX_init(ctx); - } - return ctx; -} - -void HMAC_CTX_cleanup(HMAC_CTX *ctx) { - EVP_MD_CTX_cleanup(&ctx->i_ctx); - EVP_MD_CTX_cleanup(&ctx->o_ctx); - EVP_MD_CTX_cleanup(&ctx->md_ctx); - OPENSSL_cleanse(ctx, sizeof(HMAC_CTX)); -} - -void HMAC_CTX_cleanse(HMAC_CTX *ctx) { - EVP_MD_CTX_cleanse(&ctx->i_ctx); - EVP_MD_CTX_cleanse(&ctx->o_ctx); - EVP_MD_CTX_cleanse(&ctx->md_ctx); - OPENSSL_cleanse(ctx, sizeof(HMAC_CTX)); -} - -void HMAC_CTX_free(HMAC_CTX *ctx) { - if (ctx == NULL) { - return; - } - - HMAC_CTX_cleanup(ctx); - OPENSSL_free(ctx); -} - -int HMAC_Init_ex(HMAC_CTX *ctx, const void *key, size_t key_len, - const EVP_MD *md, ENGINE *impl) { - int ret = 0; - FIPS_service_indicator_lock_state(); - - if (md == NULL) { - md = ctx->md; - } - - // If either |key| is non-NULL or |md| has changed, initialize with a new key - // rather than rewinding the previous one. - // - // TODO(davidben,eroman): Passing the previous |md| with a NULL |key| is - // ambiguous between using the empty key and reusing the previous key. There - // exist callers which intend the latter, but the former is an awkward edge - // case. Fix to API to avoid this. 
- if (md != ctx->md || key != NULL) { - uint8_t pad[EVP_MAX_MD_BLOCK_SIZE]; - uint8_t key_block[EVP_MAX_MD_BLOCK_SIZE]; - unsigned key_block_len; - - size_t block_size = EVP_MD_block_size(md); - assert(block_size <= sizeof(key_block)); - if (block_size < key_len) { - // Long keys are hashed. - if (!EVP_DigestInit_ex(&ctx->md_ctx, md, impl) || - !EVP_DigestUpdate(&ctx->md_ctx, key, key_len) || - !EVP_DigestFinal_ex(&ctx->md_ctx, key_block, &key_block_len)) { - goto out; - } - } else { - assert(key_len <= sizeof(key_block)); - OPENSSL_memcpy(key_block, key, key_len); - key_block_len = (unsigned)key_len; - } - // Keys are then padded with zeros. - if (key_block_len != EVP_MAX_MD_BLOCK_SIZE) { - OPENSSL_memset(&key_block[key_block_len], 0, sizeof(key_block) - key_block_len); - } - - for (size_t i = 0; i < EVP_MAX_MD_BLOCK_SIZE; i++) { - pad[i] = 0x36 ^ key_block[i]; - } - if (!EVP_DigestInit_ex(&ctx->i_ctx, md, impl) || - !EVP_DigestUpdate(&ctx->i_ctx, pad, EVP_MD_block_size(md))) { - goto out; - } - - for (size_t i = 0; i < EVP_MAX_MD_BLOCK_SIZE; i++) { - pad[i] = 0x5c ^ key_block[i]; - } - if (!EVP_DigestInit_ex(&ctx->o_ctx, md, impl) || - !EVP_DigestUpdate(&ctx->o_ctx, pad, EVP_MD_block_size(md))) { - goto out; - } - - ctx->md = md; - } - - ret = EVP_MD_CTX_copy_ex(&ctx->md_ctx, &ctx->i_ctx); - -out: - FIPS_service_indicator_unlock_state(); - return ret; -} - -int HMAC_Update(HMAC_CTX *ctx, const uint8_t *data, size_t data_len) { - return EVP_DigestUpdate(&ctx->md_ctx, data, data_len); -} - -int HMAC_Final(HMAC_CTX *ctx, uint8_t *out, unsigned int *out_len) { - int ret = 0; - unsigned int i; - uint8_t buf[EVP_MAX_MD_SIZE]; - - FIPS_service_indicator_lock_state(); - // TODO(davidben): The only thing that can officially fail here is - // |EVP_MD_CTX_copy_ex|, but even that should be impossible in this case. 
- if (!EVP_DigestFinal_ex(&ctx->md_ctx, buf, &i) || - !EVP_MD_CTX_copy_ex(&ctx->md_ctx, &ctx->o_ctx) || - !EVP_DigestUpdate(&ctx->md_ctx, buf, i) || - !EVP_DigestFinal_ex(&ctx->md_ctx, out, out_len)) { - *out_len = 0; - goto out; - } - - ret = 1; - - out: - FIPS_service_indicator_unlock_state(); - if (ret) { - HMAC_verify_service_indicator(ctx->md); - } - return ret; -} - -size_t HMAC_size(const HMAC_CTX *ctx) { return EVP_MD_size(ctx->md); } - -const EVP_MD *HMAC_CTX_get_md(const HMAC_CTX *ctx) { return ctx->md; } - -int HMAC_CTX_copy_ex(HMAC_CTX *dest, const HMAC_CTX *src) { - if (!EVP_MD_CTX_copy_ex(&dest->i_ctx, &src->i_ctx) || - !EVP_MD_CTX_copy_ex(&dest->o_ctx, &src->o_ctx) || - !EVP_MD_CTX_copy_ex(&dest->md_ctx, &src->md_ctx)) { - return 0; - } - - dest->md = src->md; - return 1; -} - -void HMAC_CTX_reset(HMAC_CTX *ctx) { - HMAC_CTX_cleanup(ctx); - HMAC_CTX_init(ctx); -} - -int HMAC_Init(HMAC_CTX *ctx, const void *key, int key_len, const EVP_MD *md) { - if (key && md) { - HMAC_CTX_init(ctx); - } - return HMAC_Init_ex(ctx, key, key_len, md, NULL); -} - -int HMAC_CTX_copy(HMAC_CTX *dest, const HMAC_CTX *src) { - HMAC_CTX_init(dest); - return HMAC_CTX_copy_ex(dest, src); -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/hmac/hmac.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/hmac/hmac.cc.inc new file mode 100644 index 00000000..62117602 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/hmac/hmac.cc.inc @@ -0,0 +1,218 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + +#include +#include + +#include "../../internal.h" +#include "../../mem_internal.h" +#include "../service_indicator/internal.h" + + +using namespace bssl; + +uint8_t *HMAC(const EVP_MD *evp_md, const void *key, size_t key_len, + const uint8_t *data, size_t data_len, uint8_t *out, + unsigned int *out_len) { + HMAC_CTX ctx; + HMAC_CTX_init(&ctx); + + // The underlying hash functions should not set the FIPS service indicator + // until all operations have completed. + FIPS_service_indicator_lock_state(); + const int ok = HMAC_Init_ex(&ctx, key, key_len, evp_md, nullptr) && + HMAC_Update(&ctx, data, data_len) && + HMAC_Final(&ctx, out, out_len); + FIPS_service_indicator_unlock_state(); + + HMAC_CTX_cleanup(&ctx); + + if (!ok) { + return nullptr; + } + + HMAC_verify_service_indicator(evp_md); + return out; +} + +void HMAC_CTX_init(HMAC_CTX *ctx) { + ctx->md = nullptr; + EVP_MD_CTX_init(&ctx->i_ctx); + EVP_MD_CTX_init(&ctx->o_ctx); + EVP_MD_CTX_init(&ctx->md_ctx); +} + +HMAC_CTX *HMAC_CTX_new() { + HMAC_CTX *ctx = New(); + if (ctx != nullptr) { + HMAC_CTX_init(ctx); + } + return ctx; +} + +void HMAC_CTX_cleanup(HMAC_CTX *ctx) { + EVP_MD_CTX_cleanup(&ctx->i_ctx); + EVP_MD_CTX_cleanup(&ctx->o_ctx); + EVP_MD_CTX_cleanup(&ctx->md_ctx); + OPENSSL_cleanse(ctx, sizeof(HMAC_CTX)); +} + +void HMAC_CTX_cleanse(HMAC_CTX *ctx) { + EVP_MD_CTX_cleanse(&ctx->i_ctx); + EVP_MD_CTX_cleanse(&ctx->o_ctx); + EVP_MD_CTX_cleanse(&ctx->md_ctx); + OPENSSL_cleanse(ctx, sizeof(HMAC_CTX)); +} + +void HMAC_CTX_free(HMAC_CTX *ctx) { + if (ctx == 
nullptr) { + return; + } + + HMAC_CTX_cleanup(ctx); + Delete(ctx); +} + +int HMAC_Init_ex(HMAC_CTX *ctx, const void *key, size_t key_len, + const EVP_MD *md, ENGINE *impl) { + int ret = 0; + FIPS_service_indicator_lock_state(); + + if (md == nullptr) { + md = ctx->md; + } + + // If either |key| is non-NULL or |md| has changed, initialize with a new key + // rather than rewinding the previous one. + // + // TODO(davidben,eroman): Passing the previous |md| with a NULL |key| is + // ambiguous between using the empty key and reusing the previous key. There + // exist callers which intend the latter, but the former is an awkward edge + // case. Fix to API to avoid this. + if (md != ctx->md || key != nullptr) { + uint8_t pad[EVP_MAX_MD_BLOCK_SIZE]; + uint8_t key_block[EVP_MAX_MD_BLOCK_SIZE]; + unsigned key_block_len; + + size_t block_size = EVP_MD_block_size(md); + assert(block_size <= sizeof(key_block)); + assert(EVP_MD_size(md) <= block_size); + if (block_size < key_len) { + // Long keys are hashed. + if (!EVP_DigestInit_ex(&ctx->md_ctx, md, impl) || + !EVP_DigestUpdate(&ctx->md_ctx, key, key_len) || + !EVP_DigestFinal_ex(&ctx->md_ctx, key_block, &key_block_len)) { + goto out; + } + } else { + assert(key_len <= sizeof(key_block)); + OPENSSL_memcpy(key_block, key, key_len); + key_block_len = (unsigned)key_len; + } + // Keys are then padded with zeros. 
+ OPENSSL_memset(key_block + key_block_len, 0, block_size - key_block_len); + + for (size_t i = 0; i < block_size; i++) { + pad[i] = 0x36 ^ key_block[i]; + } + if (!EVP_DigestInit_ex(&ctx->i_ctx, md, impl) || + !EVP_DigestUpdate(&ctx->i_ctx, pad, block_size)) { + goto out; + } + + for (size_t i = 0; i < block_size; i++) { + pad[i] = 0x5c ^ key_block[i]; + } + if (!EVP_DigestInit_ex(&ctx->o_ctx, md, impl) || + !EVP_DigestUpdate(&ctx->o_ctx, pad, block_size)) { + goto out; + } + + ctx->md = md; + } + + ret = EVP_MD_CTX_copy_ex(&ctx->md_ctx, &ctx->i_ctx); + +out: + FIPS_service_indicator_unlock_state(); + return ret; +} + +int HMAC_Update(HMAC_CTX *ctx, const uint8_t *data, size_t data_len) { + return EVP_DigestUpdate(&ctx->md_ctx, data, data_len); +} + +int HMAC_Final(HMAC_CTX *ctx, uint8_t *out, unsigned int *out_len) { + int ret = 0; + unsigned int i; + uint8_t buf[EVP_MAX_MD_SIZE]; + + FIPS_service_indicator_lock_state(); + // TODO(davidben): The only thing that can officially fail here is + // |EVP_MD_CTX_copy_ex|, but even that should be impossible in this case. 
+ if (!EVP_DigestFinal_ex(&ctx->md_ctx, buf, &i) || + !EVP_MD_CTX_copy_ex(&ctx->md_ctx, &ctx->o_ctx) || + !EVP_DigestUpdate(&ctx->md_ctx, buf, i) || + !EVP_DigestFinal_ex(&ctx->md_ctx, out, out_len)) { + goto out; + } + + ret = 1; + +out: + FIPS_service_indicator_unlock_state(); + if (ret) { + HMAC_verify_service_indicator(ctx->md); + } else if (out_len) { + *out_len = 0; + } + return ret; +} + +size_t HMAC_size(const HMAC_CTX *ctx) { return EVP_MD_size(ctx->md); } + +const EVP_MD *HMAC_CTX_get_md(const HMAC_CTX *ctx) { return ctx->md; } + +int HMAC_CTX_copy_ex(HMAC_CTX *dest, const HMAC_CTX *src) { + if (!EVP_MD_CTX_copy_ex(&dest->i_ctx, &src->i_ctx) || + !EVP_MD_CTX_copy_ex(&dest->o_ctx, &src->o_ctx) || + !EVP_MD_CTX_copy_ex(&dest->md_ctx, &src->md_ctx)) { + return 0; + } + + dest->md = src->md; + return 1; +} + +void HMAC_CTX_reset(HMAC_CTX *ctx) { + HMAC_CTX_cleanup(ctx); + HMAC_CTX_init(ctx); +} + +int HMAC_Init(HMAC_CTX *ctx, const void *key, int key_len, const EVP_MD *md) { + if (key && md) { + HMAC_CTX_init(ctx); + } + return HMAC_Init_ex(ctx, key, key_len, md, nullptr); +} + +int HMAC_CTX_copy(HMAC_CTX *dest, const HMAC_CTX *src) { + HMAC_CTX_init(dest); + return HMAC_CTX_copy_ex(dest, src); +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/keccak/internal.h b/third_party/boringssl/src/crypto/fipsmodule/keccak/internal.h new file mode 100644 index 00000000..ff10c444 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/keccak/internal.h @@ -0,0 +1,96 @@ +// Copyright 2023 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_CRYPTO_FIPSMODULE_KECCAK_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_FIPSMODULE_KECCAK_INTERNAL_H + +#include + + +BSSL_NAMESPACE_BEGIN + +enum boringssl_keccak_config_t : int32_t { + boringssl_sha3_256, + boringssl_sha3_512, + boringssl_shake128, + boringssl_shake256, +}; + +enum boringssl_keccak_phase_t : int32_t { + boringssl_keccak_phase_absorb, + boringssl_keccak_phase_squeeze, +}; + +struct BORINGSSL_keccak_st { + // Note: the state with 64-bit integers comes first so that the size of this + // struct is easy to compute on all architectures without padding surprises + // due to alignment. + uint64_t state[25]; + enum boringssl_keccak_config_t config; + enum boringssl_keccak_phase_t phase; + size_t required_out_len; + size_t rate_bytes; + size_t absorb_offset; + size_t squeeze_offset; +}; + +// BORINGSSL_keccak hashes |in_len| bytes from |in| and writes |out_len| bytes +// of output to |out|. If the |config| specifies a fixed-output function, like +// SHA3-256, then |out_len| must be the correct length for that function. +OPENSSL_EXPORT void BORINGSSL_keccak(uint8_t *out, size_t out_len, + const uint8_t *in, size_t in_len, + enum boringssl_keccak_config_t config); + +// BORINGSSL_keccak_init prepares |ctx| for absorbing. If the |config| specifies +// a fixed-output function, like SHA3-256, then the output must be squeezed in a +// single call to |BORINGSSL_keccak_squeeze|. In that case, it is recommended to +// use |BORINGSSL_keccak| if the input can be absorbed in a single call. 
+OPENSSL_EXPORT void BORINGSSL_keccak_init( + struct BORINGSSL_keccak_st *ctx, enum boringssl_keccak_config_t config); + +// BORINGSSL_keccak_absorb absorbs |in_len| bytes from |in|. +OPENSSL_EXPORT void BORINGSSL_keccak_absorb(struct BORINGSSL_keccak_st *ctx, + const uint8_t *in, size_t in_len); + +// BORINGSSL_keccak_squeeze writes |out_len| bytes to |out| from |ctx|. If the +// configuration previously passed in |BORINGSSL_keccak_init| specifies a +// fixed-output function, then a single call to |BORINGSSL_keccak_squeeze| is +// allowed, where |out_len| must be the correct length for that function. +OPENSSL_EXPORT void BORINGSSL_keccak_squeeze(struct BORINGSSL_keccak_st *ctx, + uint8_t *out, size_t out_len); + +#if defined(__has_attribute) +#if __has_attribute(vector_size) +#define HAVE_KECCAK_X2 +#endif // vector_size +#endif // __has_attribute + +#if defined(HAVE_KECCAK_X2) +// BORINGSSL_keccak_squeeze_x2 performs BORINGSSL_keccak_squeeze in parallel +// with two same-length outputs. The contexts must be in equivalent state (i.e. +// same config, same amount of bytes absorbed and squeezed). +OPENSSL_EXPORT void BORINGSSL_keccak_squeeze_x2( + struct BORINGSSL_keccak_st ctx[2], uint8_t *outs[2], size_t out_len); + +// BORINGSSL_keccak_short_x2 performs BORINGSSL_keccak in parallel on two +// same-length strings with same-length outputs. |in_len| must be less than 72 +// (or actually |rate_bytes|). 
+OPENSSL_EXPORT void BORINGSSL_keccak_short_x2( + uint8_t *outs[2], size_t out_len, const uint8_t *ins[2], size_t in_len, + enum boringssl_keccak_config_t config); +#endif + +BSSL_NAMESPACE_END + +#endif // OPENSSL_HEADER_CRYPTO_FIPSMODULE_KECCAK_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/fipsmodule/keccak/keccak.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/keccak/keccak.cc.inc new file mode 100644 index 00000000..5008cd46 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/keccak/keccak.cc.inc @@ -0,0 +1,407 @@ +// Copyright 2023 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + +#include "../../internal.h" +#include "./internal.h" + + +using namespace bssl; + +// keccak_f implements the Keccak-1600 permutation as described at +// https://keccak.team/keccak_specs_summary.html. Each lane is represented as a +// 64-bit value and the 5×5 lanes are stored as an array in row-major order. +// +// To support vectorization, U64 shall either be uint64_t, or an uint64_t-based +// vector type. 
+template +static void keccak_f(U64 state[25]) { + static const int kNumRounds = 24; + for (int round = 0; round < kNumRounds; round++) { + // θ step + U64 c[5]; + for (int x = 0; x < 5; x++) { + c[x] = state[x] ^ state[x + 5] ^ state[x + 10] ^ state[x + 15] ^ + state[x + 20]; + } + + for (int x = 0; x < 5; x++) { + const U64 d = c[(x + 4) % 5] ^ rotl(c[(x + 1) % 5], 1); + for (int y = 0; y < 5; y++) { + state[y * 5 + x] ^= d; + } + } + + // ρ and π steps. + // + // These steps involve a mapping of the state matrix. Each input point, + // (x,y), is rotated and written to the point (y, 2x + 3y). In the Keccak + // pseudo-code a separate array is used because an in-place operation would + // overwrite some values that are subsequently needed. However, the mapping + // forms a trail through 24 of the 25 values so we can do it in place with + // only a single temporary variable. + // + // Start with (1, 0). The value here will be mapped and end up at (0, 2). + // That value will end up at (2, 1), then (1, 2), and so on. After 24 + // steps, 24 of the 25 values have been hit (as this mapping is injective) + // and the sequence will repeat. All that remains is to handle the element + // at (0, 0), but the rotation for that element is zero, and it goes to (0, + // 0), so we can ignore it. 
+ U64 prev_value = state[1]; +#define PI_RHO_STEP(index, rotation) \ + do { \ + const U64 value = rotl(prev_value, rotation); \ + prev_value = state[index]; \ + state[index] = value; \ + } while (0) + + PI_RHO_STEP(10, 1); + PI_RHO_STEP(7, 3); + PI_RHO_STEP(11, 6); + PI_RHO_STEP(17, 10); + PI_RHO_STEP(18, 15); + PI_RHO_STEP(3, 21); + PI_RHO_STEP(5, 28); + PI_RHO_STEP(16, 36); + PI_RHO_STEP(8, 45); + PI_RHO_STEP(21, 55); + PI_RHO_STEP(24, 2); + PI_RHO_STEP(4, 14); + PI_RHO_STEP(15, 27); + PI_RHO_STEP(23, 41); + PI_RHO_STEP(19, 56); + PI_RHO_STEP(13, 8); + PI_RHO_STEP(12, 25); + PI_RHO_STEP(2, 43); + PI_RHO_STEP(20, 62); + PI_RHO_STEP(14, 18); + PI_RHO_STEP(22, 39); + PI_RHO_STEP(9, 61); + PI_RHO_STEP(6, 20); + PI_RHO_STEP(1, 44); + +#undef PI_RHO_STEP + + // χ step + for (int y = 0; y < 5; y++) { + const int row_index = 5 * y; + const U64 orig_x0 = state[row_index]; + const U64 orig_x1 = state[row_index + 1]; + state[row_index] ^= ~orig_x1 & state[row_index + 2]; + state[row_index + 1] ^= ~state[row_index + 2] & state[row_index + 3]; + state[row_index + 2] ^= ~state[row_index + 3] & state[row_index + 4]; + state[row_index + 3] ^= ~state[row_index + 4] & orig_x0; + state[row_index + 4] ^= ~orig_x0 & orig_x1; + } + + // ι step + // + // From https://keccak.team/files/Keccak-reference-3.0.pdf, section + // 1.2, the round constants are based on the output of a LFSR. Thus, as + // suggested in the appendix of of + // https://keccak.team/keccak_specs_summary.html, the values are + // simply encoded here. 
+ static const uint64_t kRoundConstants[24] = { + 0x0000000000000001, 0x0000000000008082, 0x800000000000808a, + 0x8000000080008000, 0x000000000000808b, 0x0000000080000001, + 0x8000000080008081, 0x8000000000008009, 0x000000000000008a, + 0x0000000000000088, 0x0000000080008009, 0x000000008000000a, + 0x000000008000808b, 0x800000000000008b, 0x8000000000008089, + 0x8000000000008003, 0x8000000000008002, 0x8000000000000080, + 0x000000000000800a, 0x800000008000000a, 0x8000000080008081, + 0x8000000000008080, 0x0000000080000001, 0x8000000080008008, + }; + + state[0] ^= kRoundConstants[round]; + } +} + +#if defined(HAVE_KECCAK_X2) +typedef uint64_t v2u64 __attribute__((vector_size(16))); + +static inline v2u64 v_rotl(v2u64 x, int shift) { + return (x << shift) | (x >> ((-shift) & 63)); +} + +static void keccak_f_x2(uint64_t state0[25], uint64_t state1[25]) { + v2u64 s[25]; + for (int i = 0; i < 25; i++) { + s[i] = (v2u64){state0[i], state1[i]}; + } + keccak_f(s); + for (int i = 0; i < 25; i++) { + state0[i] = s[i][0]; + state1[i] = s[i][1]; + } +} +#endif + +static void keccak_init(struct BORINGSSL_keccak_st *ctx, + enum boringssl_keccak_config_t config) { + size_t required_out_len; + size_t capacity_bytes; + switch (config) { + case boringssl_sha3_256: + capacity_bytes = 512 / 8; + required_out_len = 32; + break; + case boringssl_sha3_512: + capacity_bytes = 1024 / 8; + required_out_len = 64; + break; + case boringssl_shake128: + capacity_bytes = 256 / 8; + required_out_len = 0; + break; + case boringssl_shake256: + capacity_bytes = 512 / 8; + required_out_len = 0; + break; + default: + abort(); + } + + OPENSSL_memset(ctx, 0, sizeof(*ctx)); + ctx->config = config; + ctx->phase = boringssl_keccak_phase_absorb; + ctx->required_out_len = required_out_len; + ctx->rate_bytes = 200 - capacity_bytes; + assert(ctx->rate_bytes % 8 == 0); +} + +void bssl::BORINGSSL_keccak(uint8_t *out, size_t out_len, const uint8_t *in, + size_t in_len, + enum boringssl_keccak_config_t config) { + 
struct BORINGSSL_keccak_st ctx; + BORINGSSL_keccak_init(&ctx, config); + if (ctx.required_out_len != 0 && out_len != ctx.required_out_len) { + abort(); + } + BORINGSSL_keccak_absorb(&ctx, in, in_len); + BORINGSSL_keccak_squeeze(&ctx, out, out_len); +} + +#if defined(HAVE_KECCAK_X2) +void bssl::BORINGSSL_keccak_short_x2(uint8_t *outs[2], size_t out_len, + const uint8_t *ins[2], size_t in_len, + enum boringssl_keccak_config_t config) { + struct BORINGSSL_keccak_st ctx[2]; + for (size_t i = 0; i < 2; ++i) { + BORINGSSL_keccak_init(&ctx[i], config); + if (ctx[i].required_out_len != 0 && out_len != ctx[i].required_out_len) { + abort(); + } + + // NOTE: this implementation is only efficient if in_len < ctx->rate_bytes, + // as right now only keccak_f calls in BORINGSSL_keccak_squeeze and + // keccak_finalize are vectorized. So just fail in every other case for + // now. + BSSL_CHECK(in_len < ctx[i].rate_bytes); + + BORINGSSL_keccak_absorb(&ctx[i], ins[i], in_len); + } + BORINGSSL_keccak_squeeze_x2(ctx, outs, out_len); +} +#endif + +void bssl::BORINGSSL_keccak_init(struct BORINGSSL_keccak_st *ctx, + enum boringssl_keccak_config_t config) { + keccak_init(ctx, config); +} + +void bssl::BORINGSSL_keccak_absorb(struct BORINGSSL_keccak_st *ctx, + const uint8_t *in, size_t in_len) { + if (ctx->phase == boringssl_keccak_phase_squeeze) { + // It's illegal to call absorb() again after calling squeeze(). + abort(); + } + + const size_t rate_words = ctx->rate_bytes / 8; + // XOR the input. Accessing |ctx->state| as a |uint8_t*| is allowed by strict + // aliasing because we require |uint8_t| to be a character type. + uint8_t *state_bytes = (uint8_t *)ctx->state; + + // Absorb partial block. 
+ if (ctx->absorb_offset != 0) { + assert(ctx->absorb_offset < ctx->rate_bytes); + size_t first_block_len = ctx->rate_bytes - ctx->absorb_offset; + for (size_t i = 0; i < first_block_len && i < in_len; i++) { + state_bytes[ctx->absorb_offset + i] ^= in[i]; + } + + // This input didn't fill the block. + if (first_block_len > in_len) { + ctx->absorb_offset += in_len; + return; + } + + keccak_f(ctx->state); + in += first_block_len; + in_len -= first_block_len; + } + + // Absorb full blocks. + while (in_len >= ctx->rate_bytes) { + for (size_t i = 0; i < rate_words; i++) { + ctx->state[i] ^= CRYPTO_load_u64_le(in + 8 * i); + } + keccak_f(ctx->state); + in += ctx->rate_bytes; + in_len -= ctx->rate_bytes; + } + + // Absorb partial block. + assert(in_len < ctx->rate_bytes); + for (size_t i = 0; i < in_len; i++) { + state_bytes[i] ^= in[i]; + } + ctx->absorb_offset = in_len; +} + +static uint8_t keccak_terminator(struct BORINGSSL_keccak_st *ctx) { + switch (ctx->config) { + case boringssl_sha3_256: + case boringssl_sha3_512: + return 0x06; + case boringssl_shake128: + case boringssl_shake256: + return 0x1f; + default: + abort(); + } +} + +static void keccak_finalize(struct BORINGSSL_keccak_st *ctx) { + // XOR the terminator. Accessing |ctx->state| as a |uint8_t*| is allowed by + // strict aliasing because we require |uint8_t| to be a character type. + uint8_t *state_bytes = (uint8_t *)ctx->state; + state_bytes[ctx->absorb_offset] ^= keccak_terminator(ctx); + state_bytes[ctx->rate_bytes - 1] ^= 0x80; + keccak_f(ctx->state); +} + +#if defined(HAVE_KECCAK_X2) +static void keccak_finalize_x2(struct BORINGSSL_keccak_st ctx[2]) { + for (size_t i = 0; i < 2; ++i) { + // XOR the terminator. Accessing |ctx->state| as a |uint8_t*| is allowed by + // strict aliasing because we require |uint8_t| to be a character type. 
+ uint8_t *state_bytes = (uint8_t *)ctx[i].state; + state_bytes[ctx[i].absorb_offset] ^= keccak_terminator(&ctx[i]); + state_bytes[ctx[i].rate_bytes - 1] ^= 0x80; + } + keccak_f_x2(ctx[0].state, ctx[1].state); +} +#endif + +void bssl::BORINGSSL_keccak_squeeze(struct BORINGSSL_keccak_st *ctx, + uint8_t *out, size_t out_len) { + if (ctx->required_out_len != 0 && + (ctx->phase == boringssl_keccak_phase_squeeze || + out_len != ctx->required_out_len)) { + // The SHA-3 variants must be squeezed in a single call, to confirm that the + // output length is correct. + abort(); + } + + if (ctx->phase == boringssl_keccak_phase_absorb) { + keccak_finalize(ctx); + ctx->phase = boringssl_keccak_phase_squeeze; + } + + // Accessing |ctx->state| as a |uint8_t*| is allowed by strict aliasing + // because we require |uint8_t| to be a character type. + const uint8_t *state_bytes = (const uint8_t *)ctx->state; + while (out_len) { + if (ctx->squeeze_offset == ctx->rate_bytes) { + keccak_f(ctx->state); + ctx->squeeze_offset = 0; + } + + size_t remaining = ctx->rate_bytes - ctx->squeeze_offset; + size_t todo = out_len; + if (todo > remaining) { + todo = remaining; + } + OPENSSL_memcpy(out, &state_bytes[ctx->squeeze_offset], todo); + out += todo; + out_len -= todo; + ctx->squeeze_offset += todo; + } +} + +#if defined(HAVE_KECCAK_X2) +void bssl::BORINGSSL_keccak_squeeze_x2(struct BORINGSSL_keccak_st ctx[2], + uint8_t *outs[2], size_t out_len) { + for (size_t i = 0; i < 2; ++i) { + if (ctx[i].required_out_len != 0 && + (ctx[i].phase == boringssl_keccak_phase_squeeze || + out_len != ctx[i].required_out_len)) { + // The SHA-3 variants must be squeezed in a single call, to confirm that + // the output length is correct. + abort(); + } + } + + // These fields are processed in parallel. Everything here uses ctx[0]; at the + // end changes are mirrored back to ctx[1] just in case. 
+#define FOR_COMMON_FIELDS(MACRO) \ + MACRO(phase) \ + MACRO(config) \ + MACRO(absorb_offset) \ + MACRO(squeeze_offset) \ + MACRO(rate_bytes) + +#define MUST_BE_EQUAL(field) BSSL_CHECK(ctx[0].field == ctx[1].field); + FOR_COMMON_FIELDS(MUST_BE_EQUAL) +#undef MUST_BE_EQUAL + + if (ctx->phase == boringssl_keccak_phase_absorb) { + keccak_finalize_x2(ctx); + ctx->phase = boringssl_keccak_phase_squeeze; + } + + // Accessing |ctx->state| as a |uint8_t*| is allowed by strict aliasing + // because we require |uint8_t| to be a character type. + uint8_t *optr[2] = {outs[0], outs[1]}; + while (out_len) { + if (ctx->squeeze_offset == ctx->rate_bytes) { + keccak_f_x2(ctx[0].state, ctx[1].state); + ctx->squeeze_offset = 0; + } + + size_t remaining = ctx->rate_bytes - ctx->squeeze_offset; + size_t todo = out_len; + if (todo > remaining) { + todo = remaining; + } + for (size_t i = 0; i < 2; ++i) { + const uint8_t *state_bytes = (const uint8_t *)ctx[i].state; + OPENSSL_memcpy(optr[i], &state_bytes[ctx->squeeze_offset], todo); + optr[i] += todo; + } + out_len -= todo; + ctx->squeeze_offset += todo; + } + +#define COPY_FIELD_VALUE(field) ctx[1].field = ctx[0].field; + FOR_COMMON_FIELDS(COPY_FIELD_VALUE) +#undef COPY_FIELD_VALUE + +#undef FOR_COMMON_FIELDS +} +#endif // HAVE_KECCAK_X2 diff --git a/third_party/boringssl/src/crypto/fipsmodule/md4/md4.c b/third_party/boringssl/src/crypto/fipsmodule/md4/md4.c deleted file mode 100644 index 5b44653b..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/md4/md4.c +++ /dev/null @@ -1,240 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. 
The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. 
If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include -#include - -#include "../../internal.h" -#include "../digest/md32_common.h" - - -uint8_t *MD4(const uint8_t *data, size_t len, uint8_t out[MD4_DIGEST_LENGTH]) { - MD4_CTX ctx; - MD4_Init(&ctx); - MD4_Update(&ctx, data, len); - MD4_Final(out, &ctx); - - return out; -} - -// Implemented from RFC 1186 The MD4 Message-Digest Algorithm. 
- -int MD4_Init(MD4_CTX *md4) { - OPENSSL_memset(md4, 0, sizeof(MD4_CTX)); - md4->h[0] = 0x67452301UL; - md4->h[1] = 0xefcdab89UL; - md4->h[2] = 0x98badcfeUL; - md4->h[3] = 0x10325476UL; - return 1; -} - -void md4_block_data_order(uint32_t *state, const uint8_t *data, size_t num); - -void MD4_Transform(MD4_CTX *c, const uint8_t data[MD4_CBLOCK]) { - md4_block_data_order(c->h, data, 1); -} - -int MD4_Update(MD4_CTX *c, const void *data, size_t len) { - crypto_md32_update(&md4_block_data_order, c->h, c->data, MD4_CBLOCK, &c->num, - &c->Nh, &c->Nl, data, len); - return 1; -} - -int MD4_Final(uint8_t out[MD4_DIGEST_LENGTH], MD4_CTX *c) { - crypto_md32_final(&md4_block_data_order, c->h, c->data, MD4_CBLOCK, &c->num, - c->Nh, c->Nl, /*is_big_endian=*/0); - - CRYPTO_store_u32_le(out, c->h[0]); - CRYPTO_store_u32_le(out + 4, c->h[1]); - CRYPTO_store_u32_le(out + 8, c->h[2]); - CRYPTO_store_u32_le(out + 12, c->h[3]); - return 1; -} - -// As pointed out by Wei Dai , the above can be -// simplified to the code below. Wei attributes these optimizations -// to Peter Gutmann's SHS code, and he attributes it to Rich Schroeppel. 
-#define F(b, c, d) ((((c) ^ (d)) & (b)) ^ (d)) -#define G(b, c, d) (((b) & (c)) | ((b) & (d)) | ((c) & (d))) -#define H(b, c, d) ((b) ^ (c) ^ (d)) - -#define R0(a, b, c, d, k, s, t) \ - do { \ - (a) += ((k) + (t) + F((b), (c), (d))); \ - (a) = CRYPTO_rotl_u32(a, s); \ - } while (0) - -#define R1(a, b, c, d, k, s, t) \ - do { \ - (a) += ((k) + (t) + G((b), (c), (d))); \ - (a) = CRYPTO_rotl_u32(a, s); \ - } while (0) - -#define R2(a, b, c, d, k, s, t) \ - do { \ - (a) += ((k) + (t) + H((b), (c), (d))); \ - (a) = CRYPTO_rotl_u32(a, s); \ - } while (0) - -void md4_block_data_order(uint32_t *state, const uint8_t *data, size_t num) { - uint32_t A, B, C, D; - uint32_t X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X12, X13, X14, X15; - - A = state[0]; - B = state[1]; - C = state[2]; - D = state[3]; - - for (; num--;) { - X0 = CRYPTO_load_u32_le(data); - data += 4; - X1 = CRYPTO_load_u32_le(data); - data += 4; - // Round 0 - R0(A, B, C, D, X0, 3, 0); - X2 = CRYPTO_load_u32_le(data); - data += 4; - R0(D, A, B, C, X1, 7, 0); - X3 = CRYPTO_load_u32_le(data); - data += 4; - R0(C, D, A, B, X2, 11, 0); - X4 = CRYPTO_load_u32_le(data); - data += 4; - R0(B, C, D, A, X3, 19, 0); - X5 = CRYPTO_load_u32_le(data); - data += 4; - R0(A, B, C, D, X4, 3, 0); - X6 = CRYPTO_load_u32_le(data); - data += 4; - R0(D, A, B, C, X5, 7, 0); - X7 = CRYPTO_load_u32_le(data); - data += 4; - R0(C, D, A, B, X6, 11, 0); - X8 = CRYPTO_load_u32_le(data); - data += 4; - R0(B, C, D, A, X7, 19, 0); - X9 = CRYPTO_load_u32_le(data); - data += 4; - R0(A, B, C, D, X8, 3, 0); - X10 = CRYPTO_load_u32_le(data); - data += 4; - R0(D, A, B, C, X9, 7, 0); - X11 = CRYPTO_load_u32_le(data); - data += 4; - R0(C, D, A, B, X10, 11, 0); - X12 = CRYPTO_load_u32_le(data); - data += 4; - R0(B, C, D, A, X11, 19, 0); - X13 = CRYPTO_load_u32_le(data); - data += 4; - R0(A, B, C, D, X12, 3, 0); - X14 = CRYPTO_load_u32_le(data); - data += 4; - R0(D, A, B, C, X13, 7, 0); - X15 = CRYPTO_load_u32_le(data); - data += 4; - R0(C, D, A, 
B, X14, 11, 0); - R0(B, C, D, A, X15, 19, 0); - // Round 1 - R1(A, B, C, D, X0, 3, 0x5A827999L); - R1(D, A, B, C, X4, 5, 0x5A827999L); - R1(C, D, A, B, X8, 9, 0x5A827999L); - R1(B, C, D, A, X12, 13, 0x5A827999L); - R1(A, B, C, D, X1, 3, 0x5A827999L); - R1(D, A, B, C, X5, 5, 0x5A827999L); - R1(C, D, A, B, X9, 9, 0x5A827999L); - R1(B, C, D, A, X13, 13, 0x5A827999L); - R1(A, B, C, D, X2, 3, 0x5A827999L); - R1(D, A, B, C, X6, 5, 0x5A827999L); - R1(C, D, A, B, X10, 9, 0x5A827999L); - R1(B, C, D, A, X14, 13, 0x5A827999L); - R1(A, B, C, D, X3, 3, 0x5A827999L); - R1(D, A, B, C, X7, 5, 0x5A827999L); - R1(C, D, A, B, X11, 9, 0x5A827999L); - R1(B, C, D, A, X15, 13, 0x5A827999L); - // Round 2 - R2(A, B, C, D, X0, 3, 0x6ED9EBA1L); - R2(D, A, B, C, X8, 9, 0x6ED9EBA1L); - R2(C, D, A, B, X4, 11, 0x6ED9EBA1L); - R2(B, C, D, A, X12, 15, 0x6ED9EBA1L); - R2(A, B, C, D, X2, 3, 0x6ED9EBA1L); - R2(D, A, B, C, X10, 9, 0x6ED9EBA1L); - R2(C, D, A, B, X6, 11, 0x6ED9EBA1L); - R2(B, C, D, A, X14, 15, 0x6ED9EBA1L); - R2(A, B, C, D, X1, 3, 0x6ED9EBA1L); - R2(D, A, B, C, X9, 9, 0x6ED9EBA1L); - R2(C, D, A, B, X5, 11, 0x6ED9EBA1L); - R2(B, C, D, A, X13, 15, 0x6ED9EBA1L); - R2(A, B, C, D, X3, 3, 0x6ED9EBA1L); - R2(D, A, B, C, X11, 9, 0x6ED9EBA1L); - R2(C, D, A, B, X7, 11, 0x6ED9EBA1L); - R2(B, C, D, A, X15, 15, 0x6ED9EBA1L); - - A = state[0] += A; - B = state[1] += B; - C = state[2] += C; - D = state[3] += D; - } -} - -#undef F -#undef G -#undef H -#undef R0 -#undef R1 -#undef R2 diff --git a/third_party/boringssl/src/crypto/fipsmodule/md5/internal.h b/third_party/boringssl/src/crypto/fipsmodule/md5/internal.h deleted file mode 100644 index 9ee9f13a..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/md5/internal.h +++ /dev/null @@ -1,37 +0,0 @@ -/* Copyright (c) 2018, Google Inc. 
- * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#ifndef OPENSSL_HEADER_MD5_INTERNAL_H -#define OPENSSL_HEADER_MD5_INTERNAL_H - -#include - -#if defined(__cplusplus) -extern "C" { -#endif - - -#if !defined(OPENSSL_NO_ASM) && \ - (defined(OPENSSL_X86_64) || defined(OPENSSL_X86)) -#define MD5_ASM -extern void md5_block_asm_data_order(uint32_t *state, const uint8_t *data, - size_t num); -#endif - - -#if defined(__cplusplus) -} // extern "C" -#endif - -#endif // OPENSSL_HEADER_MD5_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/fipsmodule/md5/md5.c b/third_party/boringssl/src/crypto/fipsmodule/md5/md5.c deleted file mode 100644 index 16915267..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/md5/md5.c +++ /dev/null @@ -1,284 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. 
The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include - -#include - -#include "../../internal.h" -#include "../digest/md32_common.h" -#include "internal.h" - - -uint8_t *MD5(const uint8_t *data, size_t len, uint8_t out[MD5_DIGEST_LENGTH]) { - MD5_CTX ctx; - MD5_Init(&ctx); - MD5_Update(&ctx, data, len); - MD5_Final(out, &ctx); - - return out; -} - -int MD5_Init(MD5_CTX *md5) { - OPENSSL_memset(md5, 0, sizeof(MD5_CTX)); - md5->h[0] = 0x67452301UL; - md5->h[1] = 0xefcdab89UL; - md5->h[2] = 0x98badcfeUL; - md5->h[3] = 0x10325476UL; - return 1; -} - -#if defined(MD5_ASM) -#define md5_block_data_order md5_block_asm_data_order -#else -static void md5_block_data_order(uint32_t *state, const uint8_t *data, - size_t num); -#endif - -void MD5_Transform(MD5_CTX *c, const uint8_t data[MD5_CBLOCK]) { - md5_block_data_order(c->h, data, 1); -} - -int MD5_Update(MD5_CTX *c, const void *data, size_t len) { - crypto_md32_update(&md5_block_data_order, c->h, c->data, MD5_CBLOCK, &c->num, - &c->Nh, &c->Nl, data, len); - return 1; -} - -int MD5_Final(uint8_t out[MD5_DIGEST_LENGTH], MD5_CTX *c) { - crypto_md32_final(&md5_block_data_order, c->h, c->data, MD5_CBLOCK, &c->num, - c->Nh, c->Nl, /*is_big_endian=*/0); - - CRYPTO_store_u32_le(out, 
c->h[0]); - CRYPTO_store_u32_le(out + 4, c->h[1]); - CRYPTO_store_u32_le(out + 8, c->h[2]); - CRYPTO_store_u32_le(out + 12, c->h[3]); - return 1; -} - -// As pointed out by Wei Dai , the above can be -// simplified to the code below. Wei attributes these optimizations -// to Peter Gutmann's SHS code, and he attributes it to Rich Schroeppel. -#define F(b, c, d) ((((c) ^ (d)) & (b)) ^ (d)) -#define G(b, c, d) ((((b) ^ (c)) & (d)) ^ (c)) -#define H(b, c, d) ((b) ^ (c) ^ (d)) -#define I(b, c, d) (((~(d)) | (b)) ^ (c)) - -#define R0(a, b, c, d, k, s, t) \ - do { \ - (a) += ((k) + (t) + F((b), (c), (d))); \ - (a) = CRYPTO_rotl_u32(a, s); \ - (a) += (b); \ - } while (0) - -#define R1(a, b, c, d, k, s, t) \ - do { \ - (a) += ((k) + (t) + G((b), (c), (d))); \ - (a) = CRYPTO_rotl_u32(a, s); \ - (a) += (b); \ - } while (0) - -#define R2(a, b, c, d, k, s, t) \ - do { \ - (a) += ((k) + (t) + H((b), (c), (d))); \ - (a) = CRYPTO_rotl_u32(a, s); \ - (a) += (b); \ - } while (0) - -#define R3(a, b, c, d, k, s, t) \ - do { \ - (a) += ((k) + (t) + I((b), (c), (d))); \ - (a) = CRYPTO_rotl_u32(a, s); \ - (a) += (b); \ - } while (0) - -#ifndef MD5_ASM -#ifdef X -#undef X -#endif -static void md5_block_data_order(uint32_t *state, const uint8_t *data, - size_t num) { - uint32_t A, B, C, D; - uint32_t XX0, XX1, XX2, XX3, XX4, XX5, XX6, XX7, XX8, XX9, XX10, XX11, XX12, - XX13, XX14, XX15; -#define X(i) XX##i - - A = state[0]; - B = state[1]; - C = state[2]; - D = state[3]; - - for (; num--;) { - X(0) = CRYPTO_load_u32_le(data); - data += 4; - X(1) = CRYPTO_load_u32_le(data); - data += 4; - // Round 0 - R0(A, B, C, D, X(0), 7, 0xd76aa478L); - X(2) = CRYPTO_load_u32_le(data); - data += 4; - R0(D, A, B, C, X(1), 12, 0xe8c7b756L); - X(3) = CRYPTO_load_u32_le(data); - data += 4; - R0(C, D, A, B, X(2), 17, 0x242070dbL); - X(4) = CRYPTO_load_u32_le(data); - data += 4; - R0(B, C, D, A, X(3), 22, 0xc1bdceeeL); - X(5) = CRYPTO_load_u32_le(data); - data += 4; - R0(A, B, C, D, X(4), 7, 0xf57c0fafL); - 
X(6) = CRYPTO_load_u32_le(data); - data += 4; - R0(D, A, B, C, X(5), 12, 0x4787c62aL); - X(7) = CRYPTO_load_u32_le(data); - data += 4; - R0(C, D, A, B, X(6), 17, 0xa8304613L); - X(8) = CRYPTO_load_u32_le(data); - data += 4; - R0(B, C, D, A, X(7), 22, 0xfd469501L); - X(9) = CRYPTO_load_u32_le(data); - data += 4; - R0(A, B, C, D, X(8), 7, 0x698098d8L); - X(10) = CRYPTO_load_u32_le(data); - data += 4; - R0(D, A, B, C, X(9), 12, 0x8b44f7afL); - X(11) = CRYPTO_load_u32_le(data); - data += 4; - R0(C, D, A, B, X(10), 17, 0xffff5bb1L); - X(12) = CRYPTO_load_u32_le(data); - data += 4; - R0(B, C, D, A, X(11), 22, 0x895cd7beL); - X(13) = CRYPTO_load_u32_le(data); - data += 4; - R0(A, B, C, D, X(12), 7, 0x6b901122L); - X(14) = CRYPTO_load_u32_le(data); - data += 4; - R0(D, A, B, C, X(13), 12, 0xfd987193L); - X(15) = CRYPTO_load_u32_le(data); - data += 4; - R0(C, D, A, B, X(14), 17, 0xa679438eL); - R0(B, C, D, A, X(15), 22, 0x49b40821L); - // Round 1 - R1(A, B, C, D, X(1), 5, 0xf61e2562L); - R1(D, A, B, C, X(6), 9, 0xc040b340L); - R1(C, D, A, B, X(11), 14, 0x265e5a51L); - R1(B, C, D, A, X(0), 20, 0xe9b6c7aaL); - R1(A, B, C, D, X(5), 5, 0xd62f105dL); - R1(D, A, B, C, X(10), 9, 0x02441453L); - R1(C, D, A, B, X(15), 14, 0xd8a1e681L); - R1(B, C, D, A, X(4), 20, 0xe7d3fbc8L); - R1(A, B, C, D, X(9), 5, 0x21e1cde6L); - R1(D, A, B, C, X(14), 9, 0xc33707d6L); - R1(C, D, A, B, X(3), 14, 0xf4d50d87L); - R1(B, C, D, A, X(8), 20, 0x455a14edL); - R1(A, B, C, D, X(13), 5, 0xa9e3e905L); - R1(D, A, B, C, X(2), 9, 0xfcefa3f8L); - R1(C, D, A, B, X(7), 14, 0x676f02d9L); - R1(B, C, D, A, X(12), 20, 0x8d2a4c8aL); - // Round 2 - R2(A, B, C, D, X(5), 4, 0xfffa3942L); - R2(D, A, B, C, X(8), 11, 0x8771f681L); - R2(C, D, A, B, X(11), 16, 0x6d9d6122L); - R2(B, C, D, A, X(14), 23, 0xfde5380cL); - R2(A, B, C, D, X(1), 4, 0xa4beea44L); - R2(D, A, B, C, X(4), 11, 0x4bdecfa9L); - R2(C, D, A, B, X(7), 16, 0xf6bb4b60L); - R2(B, C, D, A, X(10), 23, 0xbebfbc70L); - R2(A, B, C, D, X(13), 4, 0x289b7ec6L); - R2(D, A, 
B, C, X(0), 11, 0xeaa127faL); - R2(C, D, A, B, X(3), 16, 0xd4ef3085L); - R2(B, C, D, A, X(6), 23, 0x04881d05L); - R2(A, B, C, D, X(9), 4, 0xd9d4d039L); - R2(D, A, B, C, X(12), 11, 0xe6db99e5L); - R2(C, D, A, B, X(15), 16, 0x1fa27cf8L); - R2(B, C, D, A, X(2), 23, 0xc4ac5665L); - // Round 3 - R3(A, B, C, D, X(0), 6, 0xf4292244L); - R3(D, A, B, C, X(7), 10, 0x432aff97L); - R3(C, D, A, B, X(14), 15, 0xab9423a7L); - R3(B, C, D, A, X(5), 21, 0xfc93a039L); - R3(A, B, C, D, X(12), 6, 0x655b59c3L); - R3(D, A, B, C, X(3), 10, 0x8f0ccc92L); - R3(C, D, A, B, X(10), 15, 0xffeff47dL); - R3(B, C, D, A, X(1), 21, 0x85845dd1L); - R3(A, B, C, D, X(8), 6, 0x6fa87e4fL); - R3(D, A, B, C, X(15), 10, 0xfe2ce6e0L); - R3(C, D, A, B, X(6), 15, 0xa3014314L); - R3(B, C, D, A, X(13), 21, 0x4e0811a1L); - R3(A, B, C, D, X(4), 6, 0xf7537e82L); - R3(D, A, B, C, X(11), 10, 0xbd3af235L); - R3(C, D, A, B, X(2), 15, 0x2ad7d2bbL); - R3(B, C, D, A, X(9), 21, 0xeb86d391L); - - A = state[0] += A; - B = state[1] += B; - C = state[2] += C; - D = state[3] += D; - } -} -#undef X -#endif - -#undef F -#undef G -#undef H -#undef I -#undef R0 -#undef R1 -#undef R2 -#undef R3 diff --git a/third_party/boringssl/src/crypto/fipsmodule/mldsa/fips_known_values.inc b/third_party/boringssl/src/crypto/fipsmodule/mldsa/fips_known_values.inc new file mode 100644 index 00000000..dc639bd2 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/mldsa/fips_known_values.inc @@ -0,0 +1,1345 @@ +const uint8_t kGenerateKeyEntropy[MLDSA_SEED_BYTES] = { + 0x47, 0x90, 0x52, 0x10, 0x30, 0x27, 0xbd, 0xf1, 0x7e, 0xb1, 0x22, + 0xd7, 0x84, 0xfb, 0x3c, 0xf0, 0x40, 0xd2, 0xf4, 0x97, 0x11, 0x7e, + 0x01, 0x38, 0xc4, 0xce, 0x9c, 0xac, 0x94, 0x71, 0xab, 0x39}; + +const uint8_t kExpectedPublicKey[MLDSA65_PUBLIC_KEY_BYTES] = { + 0x0c, 0x6f, 0x38, 0x7d, 0x2a, 0xb4, 0x33, 0x87, 0xf0, 0x21, 0xb0, 0xda, + 0x81, 0x6c, 0x71, 0xf0, 0xbc, 0x81, 0x5e, 0xf0, 0xb1, 0x6a, 0xf1, 0x12, + 0x4f, 0x35, 0x4c, 0x27, 0x3e, 0xed, 0xb4, 0x2f, 0xe5, 0x4a, 0x01, 
0x9a, + 0x97, 0x9c, 0xfa, 0x12, 0xde, 0xca, 0xef, 0xe1, 0x56, 0x9a, 0x90, 0x6d, + 0x54, 0x21, 0x58, 0xa6, 0x59, 0x4c, 0x99, 0x28, 0x53, 0x79, 0xce, 0xb8, + 0x90, 0xc6, 0xed, 0x4a, 0x4c, 0xe5, 0x9b, 0x9c, 0x87, 0x41, 0xae, 0x95, + 0x26, 0x83, 0xca, 0x91, 0x38, 0x62, 0xc0, 0x32, 0x82, 0x42, 0x8e, 0xdf, + 0x92, 0xf5, 0xcf, 0xc7, 0xd5, 0x69, 0xfa, 0xfb, 0x1f, 0x18, 0x96, 0xc5, + 0x9f, 0xd8, 0xbb, 0xd6, 0xb7, 0xf4, 0x4f, 0x20, 0x12, 0x51, 0x2a, 0x08, + 0xba, 0xae, 0x5d, 0x87, 0xf9, 0x14, 0x8f, 0xb0, 0x76, 0xba, 0x1d, 0xae, + 0x38, 0x14, 0xad, 0x4f, 0x68, 0xf0, 0xc4, 0xf6, 0xdb, 0x32, 0x30, 0x9b, + 0xa3, 0x2f, 0xe3, 0x9b, 0x72, 0x5a, 0xee, 0xb0, 0x97, 0x4d, 0x01, 0xa3, + 0x70, 0x29, 0x9a, 0xd6, 0x08, 0xed, 0xbf, 0x0a, 0xa4, 0x17, 0x8d, 0x18, + 0x5f, 0x13, 0xef, 0xd2, 0x59, 0xbc, 0x50, 0x65, 0xf4, 0xef, 0xd1, 0xbb, + 0x74, 0x22, 0x38, 0xfb, 0x5d, 0x75, 0xc4, 0xe5, 0xf9, 0x8f, 0x39, 0xf2, + 0x6f, 0x93, 0xc4, 0x06, 0x1a, 0xff, 0x46, 0x7a, 0xa0, 0xaa, 0xfb, 0x1c, + 0x9c, 0xa2, 0xe0, 0x66, 0x74, 0xa1, 0x0f, 0xd5, 0x87, 0x7f, 0x45, 0x9d, + 0x5e, 0xcb, 0xec, 0xc6, 0xd0, 0x9f, 0x8a, 0x36, 0xd9, 0xce, 0x37, 0xf1, + 0x17, 0x5c, 0x7c, 0x15, 0x1d, 0x46, 0xa3, 0x5d, 0x97, 0x21, 0xbd, 0x93, + 0xf5, 0xe9, 0x8f, 0x77, 0xb4, 0xb2, 0xc2, 0x26, 0x82, 0xcb, 0xd4, 0x46, + 0xc2, 0x18, 0xa2, 0xa8, 0xfe, 0xee, 0xf5, 0x15, 0xc7, 0x03, 0x0c, 0xa3, + 0x46, 0xcf, 0x85, 0x61, 0x68, 0x8c, 0xaa, 0xd6, 0x83, 0x9f, 0xd8, 0xf4, + 0x8c, 0x0c, 0xc6, 0xe7, 0xef, 0x17, 0xe4, 0x47, 0x41, 0xcf, 0x35, 0x2a, + 0x6a, 0x19, 0xa0, 0x55, 0xc0, 0x16, 0x4f, 0xaa, 0xdc, 0x96, 0xc5, 0x34, + 0xfa, 0xee, 0xbf, 0x3f, 0x3d, 0x76, 0x06, 0x82, 0x86, 0x40, 0x7b, 0x85, + 0x7f, 0x76, 0x6d, 0x9f, 0x82, 0x9e, 0x21, 0x99, 0xd3, 0x61, 0x34, 0xf7, + 0x17, 0xf5, 0x46, 0x41, 0xad, 0x6c, 0x32, 0xd1, 0x33, 0xf8, 0x8f, 0x6e, + 0xc9, 0x13, 0xc5, 0xb3, 0x3e, 0xa7, 0x6f, 0xc9, 0x8d, 0xb0, 0x00, 0x9c, + 0x5e, 0x7b, 0xcb, 0x0b, 0x03, 0x85, 0x91, 0x4b, 0x8d, 0x74, 0xd1, 0x32, + 0x0e, 0x0b, 0x37, 0xc5, 0x80, 0x8c, 0x06, 0xd3, 0x4b, 0xab, 0xb3, 
0xd4, + 0x3f, 0xb7, 0x4c, 0x46, 0xcd, 0x28, 0xde, 0x15, 0x9d, 0xf5, 0x95, 0x0c, + 0x57, 0x56, 0x7a, 0xf9, 0xde, 0x97, 0x47, 0x72, 0x95, 0xf7, 0x4a, 0x3a, + 0x1b, 0xbb, 0x51, 0xc6, 0x83, 0x7f, 0x37, 0x18, 0x6a, 0x79, 0x2c, 0xe3, + 0x82, 0x20, 0x53, 0x96, 0x10, 0xe4, 0x45, 0x32, 0xf2, 0x66, 0x35, 0x5e, + 0xb8, 0x2d, 0x8d, 0xf5, 0x90, 0x66, 0x09, 0xb2, 0xfa, 0x0e, 0xdd, 0x0b, + 0x19, 0x41, 0x4a, 0x58, 0xc0, 0x83, 0x9d, 0x6d, 0x05, 0x43, 0x84, 0x6e, + 0x5b, 0x6e, 0x7e, 0xd0, 0xc9, 0xb1, 0x55, 0x7f, 0xd0, 0x59, 0x40, 0x94, + 0x2d, 0x8c, 0xdb, 0xe1, 0xa4, 0xd8, 0x8a, 0x78, 0x3a, 0x0a, 0x50, 0x55, + 0x84, 0xc2, 0xd9, 0xd8, 0x55, 0x3a, 0xef, 0x93, 0xc2, 0xe9, 0x86, 0x43, + 0x30, 0x08, 0xd6, 0x54, 0x1f, 0xfd, 0x04, 0xac, 0x4c, 0x2d, 0x5f, 0xf5, + 0x4d, 0x4f, 0xe9, 0x70, 0x77, 0x0b, 0x9a, 0x07, 0x91, 0x1e, 0xdf, 0x9c, + 0x4e, 0xaa, 0xf5, 0x05, 0x51, 0xc9, 0x7b, 0x48, 0x35, 0xc9, 0x30, 0xd4, + 0x02, 0x14, 0x8b, 0xb4, 0x2c, 0x1f, 0x25, 0x6d, 0xf4, 0xa6, 0x52, 0x1b, + 0x63, 0x48, 0x85, 0x31, 0x82, 0xf8, 0xd5, 0xb3, 0x11, 0x77, 0x5a, 0xa3, + 0xfe, 0x2e, 0x31, 0xc7, 0xdb, 0xda, 0x0b, 0x35, 0x6f, 0x56, 0xb8, 0xb8, + 0x64, 0xb7, 0x8c, 0xde, 0x69, 0x2d, 0x88, 0x83, 0xbe, 0x2c, 0xc1, 0xc2, + 0x90, 0xd4, 0x19, 0xb7, 0x21, 0x3a, 0x71, 0x81, 0x64, 0x88, 0x7e, 0xed, + 0xc2, 0xb8, 0x92, 0xce, 0x36, 0xc0, 0x4d, 0x36, 0x66, 0x75, 0x35, 0xa0, + 0x6e, 0x1e, 0xef, 0xb1, 0x9e, 0xb1, 0x9c, 0x16, 0xf7, 0x3a, 0x7e, 0xf7, + 0x8a, 0x36, 0xd0, 0x8f, 0x9d, 0x13, 0x04, 0x4a, 0xac, 0x0d, 0x71, 0xd9, + 0x1f, 0x52, 0xa6, 0xb4, 0x52, 0xfd, 0x12, 0xd2, 0xa8, 0xc2, 0x38, 0xbf, + 0x14, 0x4b, 0x42, 0xcf, 0xfc, 0x2d, 0x72, 0xa3, 0x5d, 0xdb, 0x4e, 0xfd, + 0xbc, 0xa6, 0xde, 0x41, 0x21, 0x88, 0xe9, 0x05, 0xe5, 0x6a, 0x80, 0xf1, + 0x0e, 0xc6, 0x58, 0x24, 0x0a, 0x0b, 0xdd, 0x32, 0xef, 0x68, 0xf6, 0x5e, + 0x84, 0xba, 0xbb, 0xc4, 0x14, 0x10, 0x58, 0xcd, 0x8a, 0x00, 0xce, 0xd2, + 0x54, 0xfb, 0x9d, 0x75, 0x69, 0x2b, 0x68, 0x89, 0x4e, 0x71, 0xda, 0x6f, + 0x98, 0x88, 0x38, 0x96, 0xe6, 0x34, 0x99, 0x9f, 0x69, 0x2a, 0x9e, 
0xa3, + 0xb8, 0xe7, 0x0c, 0x97, 0xb9, 0x96, 0x58, 0x2a, 0x79, 0x85, 0x8e, 0x4b, + 0xcd, 0x5e, 0xc5, 0x05, 0xbf, 0x44, 0x49, 0x60, 0x9c, 0xa5, 0x49, 0x9f, + 0x2f, 0x30, 0x07, 0x79, 0x54, 0x18, 0xc1, 0x84, 0x79, 0xc6, 0x79, 0xc5, + 0x83, 0x51, 0x44, 0x18, 0x33, 0xcb, 0xf5, 0xe6, 0x72, 0x13, 0x8e, 0x6d, + 0x3d, 0xbe, 0x1d, 0xce, 0x70, 0x31, 0x4b, 0x0d, 0x33, 0x8d, 0x9a, 0xac, + 0xbf, 0xf7, 0x17, 0xd5, 0x62, 0xe3, 0x2f, 0xc0, 0x60, 0x25, 0xf6, 0xc1, + 0x66, 0xd5, 0x49, 0x18, 0x64, 0x4f, 0x34, 0xca, 0xa9, 0xdd, 0x11, 0x81, + 0x36, 0x93, 0xdf, 0xc0, 0xc6, 0x5e, 0x21, 0xc8, 0xbd, 0x15, 0x1e, 0x92, + 0xc7, 0x8d, 0xa0, 0x85, 0xdc, 0x14, 0xb9, 0xa0, 0x6d, 0x93, 0x15, 0x2b, + 0xce, 0x8d, 0x1c, 0x37, 0x75, 0xbb, 0x5a, 0x5b, 0x41, 0x95, 0xb1, 0xff, + 0x82, 0xe9, 0xa0, 0xbb, 0x01, 0x03, 0x15, 0x4b, 0xa1, 0x4e, 0xc6, 0xc2, + 0xb2, 0x77, 0x78, 0x75, 0xd2, 0x13, 0x5e, 0xe0, 0x82, 0xee, 0x6c, 0xd9, + 0x6f, 0x1e, 0x7e, 0xc4, 0x5b, 0x89, 0x10, 0x0e, 0x0b, 0x85, 0x6f, 0xe0, + 0x86, 0xf2, 0x86, 0xc9, 0x20, 0xe0, 0xa5, 0x09, 0x3d, 0x30, 0xed, 0x50, + 0x5e, 0xb5, 0x2b, 0xb9, 0xbe, 0x55, 0x60, 0xa9, 0x25, 0x79, 0xed, 0x3f, + 0xfe, 0xce, 0xc2, 0x63, 0x5a, 0x17, 0xa0, 0xf9, 0x81, 0xe4, 0x79, 0x0b, + 0x0a, 0x8f, 0x32, 0x1f, 0x9b, 0x92, 0xf4, 0x88, 0x68, 0xf4, 0xd8, 0x06, + 0xa9, 0x01, 0x26, 0x6a, 0x5d, 0x67, 0xc7, 0x29, 0x67, 0xd3, 0x47, 0xd0, + 0x69, 0x76, 0xd8, 0x79, 0x43, 0x40, 0xaa, 0xdd, 0x82, 0x3e, 0xc9, 0x2b, + 0x1f, 0x90, 0xb9, 0x27, 0x74, 0x21, 0xb9, 0x29, 0xcd, 0x09, 0x12, 0xb9, + 0x72, 0x2c, 0x73, 0x3c, 0x0e, 0x94, 0x69, 0x37, 0xbd, 0xca, 0x54, 0x31, + 0x93, 0xce, 0x54, 0x21, 0x5b, 0x51, 0xa4, 0xc4, 0x0e, 0xf4, 0x37, 0xa1, + 0x6a, 0x9d, 0x05, 0x6b, 0xec, 0xfb, 0x88, 0x8d, 0xd1, 0x20, 0x33, 0x7d, + 0xbe, 0xf1, 0xda, 0x3b, 0xcf, 0x34, 0x9a, 0x1e, 0xbc, 0x18, 0x3a, 0x44, + 0xd3, 0x62, 0x0e, 0x2e, 0x18, 0x88, 0x05, 0x13, 0x97, 0x7f, 0x09, 0x3a, + 0xb3, 0x18, 0x86, 0x59, 0x1a, 0x9b, 0x15, 0xfc, 0x20, 0xcf, 0xe8, 0x88, + 0xf9, 0x75, 0x06, 0x3b, 0xf5, 0xac, 0x02, 0xd6, 0xee, 0x8c, 0x3c, 
0x5c, + 0xda, 0x5b, 0x9e, 0x9b, 0x81, 0xff, 0x5c, 0x70, 0x1c, 0xb4, 0xe1, 0xb6, + 0x9f, 0xd5, 0x15, 0xe8, 0xc0, 0x24, 0xe7, 0x8d, 0xbd, 0x66, 0xd4, 0x82, + 0x97, 0x93, 0xf8, 0x95, 0x66, 0xc7, 0xe5, 0x3a, 0x02, 0x0d, 0xc5, 0x7d, + 0x0d, 0x4f, 0x9a, 0xb6, 0x41, 0xb8, 0x6f, 0x48, 0x68, 0x01, 0x84, 0xe8, + 0x0b, 0x2c, 0x77, 0x1d, 0x79, 0x16, 0x86, 0xc3, 0xf0, 0xa9, 0x41, 0xf1, + 0xda, 0xd5, 0x6f, 0x49, 0x8d, 0xb6, 0x6b, 0x53, 0xc1, 0x63, 0xff, 0x0a, + 0x95, 0xb8, 0x92, 0xf8, 0x21, 0xc2, 0x17, 0xc5, 0xd6, 0xcf, 0x18, 0xda, + 0x1a, 0xb9, 0x71, 0x7e, 0x03, 0x30, 0xfb, 0x40, 0xca, 0xc6, 0xd4, 0x18, + 0xe8, 0xe4, 0x8f, 0xe9, 0x23, 0xd1, 0x02, 0xb2, 0x8d, 0xa1, 0xe2, 0x0f, + 0x88, 0xc8, 0x88, 0x42, 0xb8, 0x32, 0xf7, 0x2d, 0x94, 0x2a, 0xa8, 0x99, + 0x7a, 0x25, 0x3f, 0x2b, 0xf1, 0xc1, 0xdb, 0x83, 0x53, 0xe6, 0x2f, 0x6f, + 0x58, 0x38, 0x18, 0x53, 0x6c, 0x50, 0x4f, 0xf8, 0xec, 0xe8, 0x19, 0x36, + 0x0f, 0xa8, 0xba, 0x9a, 0x2e, 0xb8, 0x8b, 0xab, 0xd7, 0x59, 0xa6, 0x81, + 0xd2, 0x34, 0x03, 0x57, 0x12, 0xaa, 0x2c, 0x92, 0x52, 0xe8, 0x90, 0x59, + 0x19, 0x20, 0xb5, 0x24, 0x1e, 0x0c, 0xa8, 0x66, 0xf9, 0x00, 0xeb, 0xa4, + 0xc5, 0x45, 0xfe, 0x94, 0x92, 0x92, 0x0e, 0xc9, 0x7b, 0x33, 0x11, 0xb0, + 0x92, 0x8e, 0xac, 0xf6, 0xa5, 0x3f, 0x49, 0xd5, 0x29, 0x3a, 0xcf, 0x4f, + 0xf6, 0x59, 0x68, 0x1d, 0x24, 0x1d, 0x67, 0xa0, 0x80, 0x72, 0x37, 0x6f, + 0xa8, 0x0c, 0x84, 0x09, 0xe0, 0x02, 0xa9, 0xa8, 0xcc, 0xc4, 0x01, 0xa2, + 0x50, 0x02, 0xc7, 0xa3, 0xf9, 0x42, 0xbe, 0x55, 0x40, 0xdb, 0xdf, 0x46, + 0x56, 0x5c, 0xd7, 0x9a, 0xe9, 0x28, 0xd4, 0xc7, 0xc7, 0x4e, 0x6b, 0x8e, + 0x5f, 0x52, 0xd4, 0x8d, 0x1b, 0x97, 0x9d, 0xad, 0x87, 0x1a, 0x5d, 0x79, + 0xfa, 0x47, 0xad, 0x3e, 0x70, 0x82, 0x3a, 0x87, 0x49, 0xd7, 0xc4, 0xb3, + 0xd5, 0xb8, 0x48, 0x9a, 0x30, 0x5e, 0x59, 0x88, 0xb4, 0xbf, 0xf9, 0x5a, + 0x7a, 0x11, 0xb3, 0x57, 0x27, 0x3c, 0x24, 0x22, 0x5a, 0x9d, 0x0c, 0x9a, + 0xd4, 0x0d, 0x26, 0xf6, 0xd2, 0x7b, 0x91, 0x68, 0xfe, 0x44, 0xff, 0x54, + 0xd0, 0xef, 0x87, 0xcb, 0x29, 0x4c, 0x23, 0xe2, 0xce, 0x4b, 0xf6, 
0x36, + 0x10, 0x6d, 0x54, 0x2a, 0xf0, 0x21, 0xf9, 0x51, 0x25, 0x5f, 0x11, 0x93, + 0xc9, 0x3f, 0x2e, 0x40, 0xed, 0x28, 0x5e, 0x1a, 0xb8, 0x98, 0xc9, 0x63, + 0x94, 0xe7, 0x44, 0x11, 0x76, 0xb8, 0x3b, 0xfd, 0xc4, 0x20, 0x47, 0x26, + 0x62, 0x26, 0x86, 0x92, 0x3d, 0x2f, 0xdb, 0xed, 0xd0, 0x60, 0x0b, 0x2d, + 0x4d, 0xb9, 0xae, 0x76, 0x35, 0xb4, 0xfb, 0x22, 0x7f, 0xcb, 0x5d, 0x76, + 0xca, 0xe7, 0x07, 0xd7, 0x1b, 0xd8, 0x2b, 0x20, 0xf4, 0xad, 0xad, 0x8a, + 0x78, 0xc2, 0xb1, 0xc4, 0x9e, 0x08, 0x63, 0x98, 0x0f, 0xd8, 0x3b, 0xfc, + 0xfa, 0x7c, 0x88, 0x99, 0x50, 0x0c, 0x3b, 0x0f, 0x5a, 0x28, 0xd0, 0xa4, + 0x9f, 0x55, 0xc5, 0xf9, 0x18, 0xea, 0x76, 0x3b, 0xf5, 0x28, 0x61, 0x0e, + 0x0c, 0x17, 0x2b, 0x04, 0x19, 0x56, 0xa3, 0x83, 0xf5, 0xc6, 0x09, 0xb9, + 0x7e, 0xad, 0xa6, 0x63, 0xb1, 0x8d, 0x8d, 0x7f, 0x28, 0xdd, 0x10, 0x7a, + 0x38, 0xa9, 0xeb, 0x72, 0xd1, 0x85, 0x15, 0x93, 0xae, 0x86, 0xe6, 0x41, + 0xd8, 0xec, 0x57, 0x5b, 0x4e, 0x2f, 0x59, 0x93, 0x01, 0xac, 0x12, 0xd5, + 0x7f, 0x9d, 0x5d, 0xb3, 0x06, 0x3d, 0x80, 0xd3, 0x91, 0xd5, 0x90, 0x8b, + 0xa5, 0xef, 0x19, 0xb0, 0xf1, 0x33, 0x9e, 0x6d, 0x30, 0xbd, 0x24, 0x21, + 0x65, 0x87, 0x71, 0x0c, 0x8e, 0x27, 0x05, 0xee, 0x7d, 0xd0, 0x50, 0xf4, + 0x09, 0xf0, 0x24, 0x69, 0xf0, 0x15, 0x43, 0xa6, 0x47, 0xdd, 0x0a, 0x26, + 0x17, 0xa8, 0x1c, 0xad, 0x17, 0x9e, 0xbb, 0x3d, 0x84, 0xf3, 0x2f, 0x53, + 0xe8, 0x87, 0x28, 0x2d, 0xb1, 0xf6, 0xa3, 0xe2, 0x24, 0xa1, 0x2c, 0x35, + 0x9d, 0x1d, 0x48, 0xf0, 0xd4, 0x03, 0xde, 0x4a, 0x31, 0xda, 0x94, 0x7b, + 0x67, 0x6f, 0x97, 0x38, 0x59, 0xc6, 0xdc, 0x07, 0xd0, 0xf5, 0xe7, 0xec, + 0x33, 0x95, 0x0c, 0xf4, 0xb3, 0x6f, 0x9e, 0x71, 0x09, 0x59, 0x1d, 0x55, + 0x1b, 0x3d, 0xc8, 0x89, 0x55, 0xae, 0x95, 0x05, 0xe2, 0x09, 0x55, 0xa1, + 0x2d, 0xd3, 0x19, 0xd7, 0x89, 0x88, 0xf5, 0x3b, 0xae, 0xc5, 0xf2, 0x80, + 0x1d, 0xe4, 0x27, 0xcf, 0xbe, 0x90, 0x99, 0x8b, 0x01, 0xeb, 0xb3, 0xc8, + 0xfe, 0xf2, 0xdd, 0xd5, 0x95, 0x95, 0x9a, 0xa5, 0xe9, 0xec, 0x9f, 0x63, + 0x26, 0x20, 0xca, 0x98, 0x37, 0xe9, 0x5e, 0xcd, 0x95, 0x17, 0x4f, 
0xbb, + 0xff, 0xdb, 0x8b, 0xc7, 0x80, 0xee, 0xb1, 0x5d, 0x27, 0x83, 0xd1, 0x9c, + 0x6d, 0xfd, 0x1a, 0x98, 0x99, 0x71, 0x4d, 0x06, 0xbc, 0x47, 0x34, 0xf9, + 0x23, 0x9a, 0xd4, 0x3e, 0x92, 0xd4, 0xd5, 0x6c, 0x6c, 0x5c, 0xab, 0xd0, + 0xaf, 0x19, 0xf9, 0x48, 0xfe, 0xbd, 0x62, 0x72, 0x1d, 0x1e, 0xbc, 0x8a, + 0xa6, 0x21, 0x59, 0xe6, 0xd9, 0xbc, 0x5e, 0xc9, 0x3b, 0x88, 0x38, 0x26, + 0xf4, 0x8e, 0xfc, 0x95, 0xbe, 0x6b, 0xf9, 0x1b, 0x7f, 0x37, 0xd0, 0xfc, + 0x00, 0x4d, 0x08, 0xfd, 0x75, 0xf8, 0x85, 0x48, 0x2e, 0x40, 0x91, 0x9a, + 0xa0, 0xfa, 0x9f, 0x78, 0x39, 0xde, 0x44, 0x73, 0xf9, 0xa4, 0xb7, 0x50, + 0x61, 0x9f, 0x93, 0x0a, 0x09, 0xb5, 0x47, 0x91, 0x0f, 0x21, 0x42, 0x7c, + 0xa5, 0x25, 0xdb, 0xa8, 0x4e, 0xa1, 0x77, 0x35, 0xf7, 0xb6, 0xb3, 0x0b, + 0x36, 0xeb, 0xcc, 0xb4, 0xb8, 0x70, 0xf9, 0xb2, 0xdd, 0x0b, 0x69, 0xaa, + 0xd3, 0x47, 0x03, 0xb9, 0x28, 0xba, 0x80, 0x48, 0xf5, 0xcd, 0x6a, 0x7f, + 0x16, 0x4d, 0xc5, 0xce, 0x8c, 0xbc, 0x95, 0x0f, 0x5a, 0x24, 0xb0, 0x8f, + 0x84, 0xd6, 0xed, 0x79, 0x8e, 0x46, 0x79, 0x82, 0x75, 0x0f, 0x10, 0x3f, + 0x51, 0x8e, 0xb0, 0xc9, 0x40, 0x5b, 0xa3, 0xe0, 0x35, 0xf8, 0x4a, 0xf9, + 0x7e, 0x38, 0x3f, 0xe9, 0x05, 0xb2, 0x44, 0x7d, 0x76, 0xfe, 0xfd, 0x1e, + 0x5b, 0xb7, 0xd6, 0x44, 0x13, 0x53, 0xd7, 0xa2, 0xb0, 0x00, 0xfa, 0xf1, + 0x0b, 0x4d, 0xed, 0x1b, 0x46, 0x97, 0x37, 0xa3, 0xbd, 0x90, 0x0c, 0xed, + 0xc3, 0x0c, 0x40, 0xcd, 0xba, 0x88, 0x09, 0x47, 0x0d, 0x63, 0xc5, 0xa3, + 0x33, 0x2c, 0x9a, 0xef, 0x50, 0xf9, 0xa7, 0xcb, 0x6c, 0xc9, 0xeb, 0xbe, + 0xff, 0x60, 0x88, 0x7e, 0x71, 0xa3, 0xea, 0x0a, 0x6d, 0x54, 0x8d, 0x3d, + 0x54, 0x6e, 0x15, 0xad, 0xaf, 0x4c, 0xbc, 0x62, 0x4b, 0x2e, 0x0e, 0xfb, + 0x07, 0xef, 0xab, 0x56, 0xae, 0xc3, 0xb0, 0xec, 0x11, 0xe5, 0x97, 0xc6, + 0x25, 0xd0, 0x03, 0xd5, 0xc5, 0x0e, 0x35, 0x2c, 0x7a, 0x73, 0xb9, 0x1a, + 0x81, 0xcb, 0x01, 0xf6, 0x01, 0xae, 0x35, 0x7f}; + +const uint8_t kExpectedPrivateKey[BCM_MLDSA65_PRIVATE_KEY_BYTES] = { + 0x0c, 0x6f, 0x38, 0x7d, 0x2a, 0xb4, 0x33, 0x87, 0xf0, 0x21, 0xb0, 0xda, + 0x81, 0x6c, 0x71, 
0xf0, 0xbc, 0x81, 0x5e, 0xf0, 0xb1, 0x6a, 0xf1, 0x12, + 0x4f, 0x35, 0x4c, 0x27, 0x3e, 0xed, 0xb4, 0x2f, 0xbf, 0x0e, 0xfa, 0x56, + 0x57, 0x0f, 0xbe, 0xb6, 0x30, 0x07, 0x57, 0xe6, 0xc2, 0xf4, 0xc3, 0x35, + 0xbe, 0x49, 0x22, 0xba, 0xa2, 0xb7, 0x4f, 0x56, 0x52, 0xb5, 0x7c, 0x3a, + 0x45, 0x78, 0x0e, 0x2b, 0x63, 0xf6, 0xaf, 0x3b, 0x98, 0xd7, 0xa9, 0x79, + 0x3e, 0xa4, 0x2c, 0xb3, 0xf4, 0x21, 0xb4, 0x8f, 0xb8, 0x3e, 0x93, 0x20, + 0x5f, 0x61, 0x51, 0x07, 0x30, 0x25, 0x7f, 0xf9, 0x81, 0x7b, 0x77, 0x6b, + 0x9d, 0xb4, 0x05, 0x26, 0x15, 0x79, 0x8d, 0xda, 0x47, 0x6f, 0xd5, 0x45, + 0x4a, 0xe4, 0xe4, 0xfc, 0xb7, 0xeb, 0xe4, 0x40, 0x2d, 0x6b, 0xe1, 0xeb, + 0x5f, 0xcf, 0x52, 0xea, 0xfe, 0xf4, 0x38, 0x37, 0x36, 0x50, 0x31, 0x60, + 0x03, 0x68, 0x60, 0x26, 0x36, 0x50, 0x45, 0x16, 0x42, 0x22, 0x51, 0x23, + 0x54, 0x28, 0x60, 0x51, 0x32, 0x83, 0x06, 0x01, 0x63, 0x13, 0x86, 0x52, + 0x26, 0x32, 0x05, 0x86, 0x80, 0x01, 0x06, 0x74, 0x06, 0x86, 0x43, 0x87, + 0x28, 0x72, 0x85, 0x32, 0x14, 0x12, 0x71, 0x68, 0x76, 0x10, 0x32, 0x57, + 0x55, 0x27, 0x47, 0x38, 0x20, 0x50, 0x81, 0x22, 0x61, 0x67, 0x62, 0x64, + 0x51, 0x43, 0x01, 0x23, 0x38, 0x73, 0x77, 0x34, 0x02, 0x72, 0x83, 0x86, + 0x34, 0x28, 0x72, 0x64, 0x88, 0x68, 0x15, 0x86, 0x27, 0x16, 0x14, 0x02, + 0x45, 0x10, 0x43, 0x13, 0x41, 0x73, 0x45, 0x02, 0x55, 0x45, 0x06, 0x42, + 0x35, 0x28, 0x70, 0x22, 0x81, 0x48, 0x15, 0x36, 0x31, 0x82, 0x18, 0x33, + 0x46, 0x01, 0x64, 0x70, 0x32, 0x82, 0x14, 0x02, 0x41, 0x21, 0x28, 0x70, + 0x67, 0x55, 0x56, 0x44, 0x57, 0x40, 0x81, 0x05, 0x00, 0x66, 0x56, 0x81, + 0x42, 0x04, 0x34, 0x83, 0x77, 0x55, 0x05, 0x47, 0x64, 0x54, 0x36, 0x53, + 0x63, 0x63, 0x72, 0x46, 0x60, 0x12, 0x25, 0x25, 0x03, 0x25, 0x65, 0x64, + 0x84, 0x64, 0x83, 0x34, 0x30, 0x73, 0x11, 0x68, 0x32, 0x56, 0x06, 0x64, + 0x05, 0x40, 0x52, 0x47, 0x37, 0x51, 0x44, 0x06, 0x75, 0x57, 0x36, 0x22, + 0x63, 0x28, 0x25, 0x48, 0x64, 0x21, 0x38, 0x47, 0x01, 0x18, 0x18, 0x82, + 0x10, 0x00, 0x65, 0x33, 0x65, 0x70, 0x46, 0x76, 0x38, 0x63, 0x15, 0x11, + 0x31, 0x73, 0x08, 
0x00, 0x37, 0x71, 0x41, 0x16, 0x21, 0x44, 0x72, 0x45, + 0x01, 0x71, 0x55, 0x05, 0x01, 0x87, 0x82, 0x14, 0x70, 0x52, 0x32, 0x00, + 0x76, 0x45, 0x74, 0x62, 0x22, 0x77, 0x47, 0x74, 0x56, 0x26, 0x56, 0x73, + 0x42, 0x84, 0x15, 0x03, 0x44, 0x64, 0x23, 0x73, 0x37, 0x28, 0x67, 0x15, + 0x76, 0x28, 0x81, 0x57, 0x52, 0x70, 0x12, 0x31, 0x13, 0x03, 0x32, 0x27, + 0x03, 0x05, 0x86, 0x27, 0x47, 0x68, 0x26, 0x75, 0x22, 0x38, 0x45, 0x04, + 0x87, 0x05, 0x46, 0x35, 0x51, 0x45, 0x37, 0x03, 0x06, 0x58, 0x78, 0x36, + 0x33, 0x07, 0x36, 0x08, 0x23, 0x41, 0x35, 0x30, 0x88, 0x25, 0x03, 0x72, + 0x26, 0x12, 0x60, 0x06, 0x62, 0x74, 0x51, 0x66, 0x41, 0x70, 0x87, 0x82, + 0x55, 0x22, 0x76, 0x26, 0x50, 0x03, 0x54, 0x03, 0x68, 0x70, 0x71, 0x61, + 0x76, 0x00, 0x45, 0x14, 0x64, 0x41, 0x00, 0x46, 0x44, 0x42, 0x62, 0x34, + 0x38, 0x28, 0x80, 0x06, 0x52, 0x20, 0x67, 0x88, 0x62, 0x75, 0x48, 0x07, + 0x87, 0x61, 0x40, 0x85, 0x55, 0x76, 0x48, 0x24, 0x04, 0x52, 0x00, 0x83, + 0x46, 0x54, 0x20, 0x66, 0x52, 0x55, 0x87, 0x50, 0x65, 0x40, 0x43, 0x65, + 0x01, 0x61, 0x87, 0x25, 0x30, 0x17, 0x72, 0x46, 0x73, 0x60, 0x74, 0x46, + 0x22, 0x76, 0x21, 0x51, 0x01, 0x30, 0x87, 0x38, 0x38, 0x57, 0x88, 0x55, + 0x22, 0x64, 0x80, 0x34, 0x83, 0x20, 0x01, 0x42, 0x04, 0x24, 0x08, 0x04, + 0x87, 0x34, 0x18, 0x26, 0x25, 0x03, 0x13, 0x02, 0x42, 0x81, 0x07, 0x47, + 0x55, 0x53, 0x51, 0x73, 0x07, 0x08, 0x72, 0x32, 0x32, 0x06, 0x13, 0x76, + 0x42, 0x61, 0x11, 0x10, 0x34, 0x22, 0x65, 0x80, 0x18, 0x05, 0x01, 0x64, + 0x44, 0x65, 0x13, 0x20, 0x81, 0x26, 0x03, 0x50, 0x15, 0x34, 0x76, 0x71, + 0x86, 0x52, 0x58, 0x87, 0x53, 0x84, 0x53, 0x57, 0x75, 0x43, 0x13, 0x71, + 0x85, 0x66, 0x41, 0x01, 0x52, 0x15, 0x60, 0x43, 0x24, 0x66, 0x42, 0x83, + 0x02, 0x37, 0x53, 0x44, 0x53, 0x88, 0x15, 0x20, 0x25, 0x77, 0x34, 0x66, + 0x23, 0x70, 0x06, 0x62, 0x81, 0x37, 0x32, 0x05, 0x16, 0x05, 0x26, 0x44, + 0x26, 0x21, 0x58, 0x85, 0x17, 0x75, 0x57, 0x70, 0x47, 0x14, 0x61, 0x21, + 0x26, 0x13, 0x00, 0x48, 0x47, 0x46, 0x27, 0x48, 0x52, 0x50, 0x67, 0x62, + 0x07, 0x27, 0x04, 
0x38, 0x72, 0x60, 0x02, 0x74, 0x00, 0x60, 0x80, 0x13, + 0x58, 0x63, 0x74, 0x13, 0x47, 0x57, 0x84, 0x52, 0x18, 0x46, 0x36, 0x18, + 0x72, 0x65, 0x18, 0x20, 0x57, 0x68, 0x76, 0x11, 0x24, 0x02, 0x45, 0x06, + 0x16, 0x46, 0x78, 0x73, 0x86, 0x22, 0x25, 0x16, 0x12, 0x36, 0x66, 0x81, + 0x57, 0x82, 0x80, 0x33, 0x08, 0x74, 0x62, 0x48, 0x85, 0x27, 0x56, 0x01, + 0x88, 0x44, 0x35, 0x44, 0x01, 0x05, 0x38, 0x44, 0x60, 0x46, 0x67, 0x87, + 0x23, 0x43, 0x76, 0x66, 0x52, 0x71, 0x88, 0x87, 0x27, 0x23, 0x14, 0x78, + 0x16, 0x87, 0x80, 0x52, 0x56, 0x13, 0x17, 0x06, 0x07, 0x40, 0x86, 0x62, + 0x12, 0x72, 0x45, 0x25, 0x50, 0x03, 0x32, 0x14, 0x81, 0x70, 0x41, 0x01, + 0x32, 0x81, 0x13, 0x13, 0x13, 0x72, 0x77, 0x00, 0x48, 0x26, 0x64, 0x81, + 0x58, 0x07, 0x23, 0x85, 0x27, 0x13, 0x01, 0x60, 0x26, 0x65, 0x37, 0x61, + 0x84, 0x17, 0x38, 0x08, 0x10, 0x22, 0x01, 0x70, 0x58, 0x65, 0x68, 0x18, + 0x66, 0x40, 0x80, 0x46, 0x50, 0x87, 0x24, 0x70, 0x06, 0x88, 0x37, 0x66, + 0x15, 0x03, 0x31, 0x63, 0x05, 0x73, 0x32, 0x57, 0x72, 0x52, 0x12, 0x76, + 0x72, 0x67, 0x26, 0x13, 0x34, 0x18, 0x75, 0x20, 0x06, 0x83, 0x22, 0x26, + 0x68, 0x84, 0x44, 0x24, 0x47, 0x08, 0x84, 0x44, 0x13, 0x15, 0x86, 0x45, + 0x16, 0x58, 0x63, 0x51, 0x50, 0x34, 0x77, 0x00, 0x58, 0x81, 0x86, 0x32, + 0x78, 0x57, 0x10, 0x25, 0x58, 0x63, 0x48, 0x53, 0x78, 0x75, 0x42, 0x67, + 0x77, 0x52, 0x63, 0x03, 0x33, 0x51, 0x66, 0x11, 0x83, 0x40, 0x76, 0x01, + 0x11, 0x53, 0x64, 0x36, 0x30, 0x11, 0x71, 0x75, 0x76, 0x23, 0x26, 0x81, + 0x32, 0x27, 0x02, 0x56, 0x66, 0x75, 0x07, 0x14, 0x66, 0x35, 0x87, 0x64, + 0x62, 0x56, 0x33, 0x47, 0x82, 0x26, 0x54, 0x73, 0x38, 0x15, 0x41, 0x58, + 0x25, 0x84, 0x80, 0x04, 0x57, 0x83, 0x86, 0x83, 0x77, 0x83, 0x22, 0x83, + 0x65, 0x25, 0x34, 0x58, 0x82, 0x43, 0x33, 0x36, 0x66, 0x68, 0x60, 0x31, + 0x85, 0x48, 0x88, 0x13, 0x57, 0x24, 0x51, 0x34, 0x16, 0x48, 0x60, 0x37, + 0x24, 0x21, 0x62, 0x71, 0x06, 0x25, 0x60, 0x32, 0x82, 0x45, 0x52, 0x43, + 0x15, 0x11, 0x47, 0x77, 0x60, 0x07, 0x15, 0x04, 0x71, 0x53, 0x58, 0x82, + 0x17, 0x68, 0x40, 
0x35, 0x21, 0x13, 0x88, 0x88, 0x61, 0x46, 0x24, 0x41, + 0x80, 0x10, 0x22, 0x80, 0x88, 0x37, 0x06, 0x30, 0x76, 0x67, 0x70, 0x03, + 0x18, 0x43, 0x10, 0x74, 0x46, 0x62, 0x21, 0x25, 0x45, 0x56, 0x63, 0x14, + 0x28, 0x11, 0x37, 0x26, 0x88, 0x86, 0x22, 0x35, 0x51, 0x48, 0x78, 0x78, + 0x48, 0x26, 0x40, 0x54, 0x82, 0x12, 0x70, 0x71, 0x78, 0x06, 0x05, 0x87, + 0x17, 0x41, 0x44, 0x00, 0x27, 0x84, 0x33, 0x81, 0x00, 0x16, 0x70, 0x67, + 0x20, 0x62, 0x78, 0x58, 0x46, 0x25, 0x88, 0x06, 0x50, 0x23, 0x44, 0x04, + 0x28, 0x44, 0x35, 0x17, 0x62, 0x61, 0x40, 0x41, 0x50, 0x75, 0x27, 0x50, + 0x76, 0x13, 0x40, 0x40, 0x50, 0x18, 0x22, 0x32, 0x23, 0x51, 0x34, 0x76, + 0x01, 0x25, 0x81, 0x75, 0x23, 0x66, 0x11, 0x48, 0x41, 0x34, 0x57, 0x36, + 0x21, 0x73, 0x75, 0x87, 0x01, 0x88, 0x75, 0x28, 0x72, 0x46, 0x13, 0x70, + 0x77, 0x14, 0x04, 0x80, 0x73, 0x50, 0x27, 0x56, 0x37, 0x22, 0x65, 0x76, + 0x47, 0x78, 0x41, 0x22, 0x38, 0x24, 0x26, 0x82, 0x18, 0x31, 0x10, 0x71, + 0x88, 0x62, 0x73, 0x05, 0x30, 0x27, 0x31, 0x76, 0x80, 0x78, 0x13, 0x21, + 0x31, 0x35, 0x21, 0x54, 0x64, 0x44, 0x86, 0x36, 0x52, 0x42, 0x02, 0x53, + 0x86, 0x07, 0x72, 0x04, 0x07, 0x70, 0x20, 0x80, 0x20, 0x71, 0x50, 0x03, + 0x52, 0x14, 0x24, 0x55, 0x52, 0x06, 0x20, 0x11, 0x40, 0x41, 0x14, 0x44, + 0x15, 0x23, 0x76, 0x00, 0x35, 0x41, 0x88, 0x11, 0x14, 0x57, 0x14, 0x24, + 0x11, 0x62, 0x18, 0x01, 0x23, 0x87, 0x12, 0x73, 0x54, 0x72, 0x32, 0x13, + 0x55, 0x04, 0x65, 0x26, 0x76, 0x22, 0x06, 0x51, 0x01, 0x53, 0x81, 0x08, + 0x24, 0x51, 0x11, 0x88, 0x14, 0x75, 0x37, 0x52, 0x57, 0x18, 0x34, 0x66, + 0x80, 0x78, 0x41, 0x00, 0x64, 0x57, 0x38, 0x28, 0x07, 0x30, 0x56, 0x13, + 0x37, 0x27, 0x68, 0x75, 0x01, 0x60, 0x47, 0x32, 0x40, 0x38, 0x71, 0x00, + 0x70, 0x83, 0x45, 0x32, 0x36, 0x66, 0x56, 0x61, 0x14, 0x28, 0x23, 0x46, + 0x67, 0x27, 0x41, 0x13, 0x06, 0x38, 0x68, 0x02, 0x08, 0x71, 0x68, 0x75, + 0x24, 0x51, 0x70, 0x16, 0x01, 0x61, 0x54, 0x75, 0x75, 0x76, 0x08, 0x77, + 0x76, 0x42, 0x68, 0x48, 0x31, 0x66, 0x16, 0x74, 0x84, 0x60, 0x52, 0x05, + 0x02, 0x47, 0x40, 
0x26, 0x12, 0x75, 0x28, 0x73, 0x18, 0x86, 0x80, 0x78, + 0x66, 0x54, 0x28, 0x60, 0x30, 0x52, 0x76, 0x45, 0x66, 0x33, 0x52, 0x15, + 0x03, 0x44, 0x46, 0x62, 0x82, 0x67, 0x41, 0x25, 0x57, 0x84, 0x28, 0x67, + 0x62, 0x52, 0x34, 0x74, 0x53, 0x86, 0x40, 0x52, 0x62, 0x40, 0x37, 0x78, + 0x77, 0x45, 0x67, 0x88, 0x08, 0x53, 0x32, 0x21, 0x84, 0x13, 0x34, 0x01, + 0x41, 0x65, 0x62, 0x31, 0x68, 0x30, 0x07, 0x13, 0x04, 0x13, 0x84, 0x40, + 0x30, 0x77, 0x72, 0x73, 0x07, 0x86, 0x73, 0x34, 0x51, 0x52, 0x30, 0x28, + 0x18, 0x02, 0x21, 0x28, 0x61, 0x37, 0x76, 0x60, 0x11, 0x13, 0x01, 0x47, + 0x36, 0x18, 0x08, 0x10, 0x21, 0x17, 0x11, 0x20, 0x87, 0x72, 0x21, 0x04, + 0x42, 0x34, 0x58, 0x58, 0x04, 0x13, 0x58, 0x11, 0x84, 0x86, 0x75, 0x85, + 0x62, 0x73, 0x26, 0x10, 0x70, 0x37, 0x55, 0x45, 0x68, 0x06, 0x25, 0x41, + 0x53, 0x75, 0x81, 0x58, 0x84, 0x27, 0x83, 0x08, 0x37, 0x83, 0x13, 0x38, + 0x86, 0x42, 0x70, 0x82, 0x67, 0x61, 0x48, 0x54, 0x20, 0x41, 0x82, 0x32, + 0x57, 0x81, 0x64, 0x32, 0x12, 0x60, 0x53, 0x75, 0x48, 0x16, 0x22, 0x78, + 0x16, 0x21, 0x25, 0x36, 0x27, 0x37, 0x83, 0x24, 0x52, 0x82, 0x20, 0x74, + 0x74, 0x53, 0x76, 0x58, 0x46, 0x72, 0x88, 0x50, 0x23, 0x83, 0x36, 0x02, + 0x13, 0x12, 0x23, 0x37, 0x08, 0x54, 0x85, 0x11, 0x46, 0x52, 0x74, 0x85, + 0x41, 0x00, 0x06, 0x01, 0x36, 0x70, 0x51, 0x00, 0x16, 0x57, 0x05, 0x10, + 0x00, 0x86, 0x20, 0x06, 0x11, 0x05, 0x33, 0x05, 0x12, 0x08, 0x75, 0x10, + 0x19, 0x3a, 0x4a, 0xa5, 0x53, 0xcb, 0xe3, 0xe7, 0x69, 0xd3, 0x04, 0x28, + 0x29, 0x68, 0xf1, 0x94, 0x49, 0xcd, 0xee, 0xa9, 0x25, 0xe3, 0x95, 0x74, + 0x57, 0xa7, 0x1e, 0xac, 0xf1, 0x2f, 0xc5, 0xa6, 0x63, 0x2a, 0xa6, 0x15, + 0x98, 0x98, 0x6e, 0x15, 0x83, 0xc0, 0xab, 0x5d, 0x8e, 0xbf, 0x69, 0xe4, + 0x50, 0xb7, 0x47, 0x8a, 0x81, 0x63, 0x9b, 0xdf, 0x71, 0x61, 0x59, 0x62, + 0x77, 0xd1, 0x3e, 0xd2, 0xf0, 0xa3, 0xc9, 0xa4, 0xed, 0x0a, 0x28, 0xcf, + 0x5d, 0x8b, 0x3e, 0x29, 0xbc, 0x0e, 0x5c, 0x04, 0x10, 0x8d, 0x13, 0xcd, + 0x58, 0xc2, 0x02, 0x8d, 0xb8, 0x2e, 0xfa, 0x1a, 0x86, 0x12, 0x61, 0x4f, + 0x8b, 0xc7, 0x90, 
0x01, 0x65, 0x61, 0x1c, 0xa5, 0x9a, 0x32, 0x32, 0x59, + 0x90, 0x56, 0xb6, 0x5c, 0xbd, 0x6a, 0x85, 0x8a, 0x14, 0xd6, 0x4d, 0xf2, + 0x68, 0xe9, 0x6b, 0x61, 0x66, 0x3a, 0x9c, 0x79, 0xed, 0xc4, 0x5f, 0x13, + 0xab, 0x8d, 0x71, 0xe4, 0x5d, 0x0d, 0xee, 0xd4, 0x11, 0xdf, 0x53, 0x21, + 0x75, 0x64, 0x61, 0x48, 0x6c, 0xf0, 0x30, 0xca, 0x60, 0x92, 0xf2, 0x02, + 0x5d, 0xc4, 0x11, 0x18, 0x14, 0xfb, 0xb4, 0xf2, 0xc2, 0x58, 0x99, 0xc8, + 0x73, 0x0b, 0xf9, 0xcf, 0x0b, 0xf7, 0x7a, 0x86, 0x36, 0xa0, 0xb5, 0x02, + 0x48, 0x5b, 0x42, 0x21, 0xf7, 0x82, 0x03, 0x16, 0x4d, 0x9f, 0x32, 0x2f, + 0xca, 0xfe, 0x06, 0x5a, 0xd4, 0xc9, 0x47, 0x73, 0x6e, 0xdc, 0x4d, 0x6a, + 0xc7, 0x70, 0x4c, 0xf9, 0x99, 0xba, 0x1c, 0xfe, 0xf4, 0xf3, 0x0d, 0x55, + 0x89, 0x74, 0x73, 0xd9, 0xef, 0x0f, 0x0b, 0xce, 0x3c, 0xc0, 0xd3, 0x50, + 0xf2, 0xc8, 0x1e, 0x64, 0x35, 0xb0, 0x3b, 0x50, 0xb0, 0x10, 0xb5, 0x6b, + 0x5b, 0x4f, 0xdd, 0xad, 0xd3, 0x41, 0x31, 0x40, 0x9b, 0x22, 0xbb, 0x7d, + 0xf5, 0x5f, 0x26, 0x23, 0x4c, 0x58, 0x3d, 0x12, 0x58, 0x5a, 0x60, 0x48, + 0x73, 0xff, 0x81, 0x4f, 0xa0, 0xaf, 0xa5, 0x86, 0xc9, 0xb3, 0xea, 0x33, + 0x00, 0x16, 0x6a, 0x8a, 0xff, 0x64, 0xcc, 0x2e, 0x6c, 0xae, 0x70, 0x2c, + 0x51, 0x4b, 0x7e, 0xea, 0x83, 0x46, 0xe4, 0x2f, 0x01, 0xfb, 0x85, 0x4c, + 0x6e, 0x37, 0x8f, 0x61, 0x1d, 0x73, 0xc9, 0x11, 0xf4, 0x2d, 0xec, 0xac, + 0x6e, 0x76, 0x0b, 0xe1, 0x03, 0xf0, 0xa0, 0x5c, 0x55, 0x81, 0xdc, 0xa7, + 0x27, 0x17, 0xcd, 0xad, 0xc7, 0x9c, 0x1a, 0x99, 0x20, 0x92, 0xb8, 0x25, + 0x05, 0x1b, 0x3f, 0x80, 0xbe, 0x35, 0x28, 0x98, 0x58, 0x47, 0x10, 0xc7, + 0xdc, 0x07, 0x2c, 0xe3, 0x22, 0x55, 0xff, 0xbc, 0x57, 0x23, 0xd5, 0x04, + 0xb0, 0x2c, 0x4e, 0x65, 0x1b, 0x60, 0xc9, 0x9b, 0xb6, 0x19, 0x7b, 0x6a, + 0xf3, 0xf6, 0xb6, 0xa5, 0x98, 0x18, 0xea, 0xe8, 0x47, 0xc5, 0x99, 0xe5, + 0x0f, 0xe5, 0x66, 0x32, 0x8e, 0x55, 0x6b, 0x61, 0x4d, 0xde, 0x30, 0x01, + 0x91, 0xc7, 0x82, 0xac, 0xc0, 0xe0, 0x17, 0x5b, 0x3f, 0xa4, 0xf3, 0x70, + 0x2d, 0x2c, 0xc5, 0x82, 0x1b, 0x6b, 0xae, 0x3e, 0xe4, 0x04, 0xa7, 0xce, + 0x36, 0x27, 0x47, 
0xd1, 0xf2, 0xf8, 0xc3, 0x79, 0xce, 0xa1, 0xdf, 0xdc, + 0xc8, 0x32, 0x80, 0xb1, 0x3d, 0x39, 0xc0, 0xd4, 0x80, 0x9f, 0xcf, 0x7e, + 0x85, 0x76, 0x25, 0xcc, 0x57, 0xb1, 0xf1, 0xcf, 0x6c, 0xc5, 0x41, 0x11, + 0xac, 0x48, 0xfe, 0xd1, 0x52, 0xed, 0xde, 0xb9, 0xa4, 0x73, 0x6d, 0x85, + 0x54, 0x40, 0x5f, 0x6e, 0x5d, 0x7d, 0x1b, 0xc9, 0xcb, 0x5a, 0x1e, 0xc5, + 0xdd, 0x98, 0x94, 0x7d, 0xbe, 0x84, 0x8a, 0x40, 0x6a, 0x27, 0x45, 0x61, + 0x2c, 0x8a, 0x04, 0x9e, 0x0b, 0xa3, 0xc2, 0x95, 0xb0, 0x65, 0xf6, 0xb5, + 0xc8, 0xff, 0x13, 0x47, 0x10, 0xa9, 0xb4, 0xa1, 0x75, 0x94, 0xcd, 0x98, + 0xf4, 0x22, 0x3f, 0xa6, 0x1c, 0x9b, 0xd1, 0xaf, 0x33, 0x69, 0xcb, 0x9a, + 0x88, 0x67, 0x16, 0xf6, 0x90, 0x4e, 0xdd, 0x61, 0x91, 0x84, 0xe3, 0xf0, + 0x74, 0x73, 0x7e, 0x19, 0x4e, 0x75, 0xd7, 0x5a, 0xcb, 0x40, 0x13, 0x12, + 0x5f, 0x6f, 0xe5, 0x77, 0x0c, 0x5e, 0x3c, 0x6b, 0x87, 0xbf, 0xdf, 0x61, + 0xbe, 0x97, 0xbd, 0x38, 0xcd, 0xf7, 0x36, 0xee, 0x23, 0xcc, 0x7e, 0xd0, + 0x0a, 0xc2, 0xfe, 0x4e, 0xec, 0x9c, 0x9a, 0x07, 0xec, 0xd3, 0xb2, 0x1c, + 0x1c, 0x88, 0xfa, 0xfa, 0x26, 0x61, 0x97, 0x93, 0x87, 0xa4, 0xc8, 0xaa, + 0x4e, 0xf6, 0x8c, 0x22, 0xd4, 0xe8, 0x36, 0x97, 0x9c, 0x11, 0xe6, 0xf2, + 0x2b, 0x1c, 0x6c, 0x33, 0xea, 0xc4, 0xdc, 0xd6, 0x7b, 0xe7, 0x96, 0xe6, + 0xbc, 0x14, 0xa6, 0xc4, 0x7c, 0x78, 0x1f, 0xfa, 0x34, 0xd6, 0xaf, 0x28, + 0x29, 0xe6, 0x0a, 0xab, 0xca, 0x87, 0xaf, 0xb8, 0x2f, 0xab, 0x5f, 0x4a, + 0xc3, 0xfa, 0xd2, 0xa7, 0xef, 0xef, 0x21, 0xbd, 0xef, 0x9c, 0xb9, 0x59, + 0x1c, 0xa2, 0x8c, 0x1f, 0x0e, 0x86, 0xa5, 0xc9, 0x8b, 0x3d, 0xbe, 0x6e, + 0xba, 0x8c, 0xef, 0xa8, 0xb6, 0x0d, 0x54, 0x1e, 0x6f, 0x5d, 0x22, 0xf4, + 0x0c, 0x2e, 0x8b, 0x27, 0xaa, 0xc5, 0x9e, 0x43, 0x5a, 0xfd, 0x2f, 0x4b, + 0x49, 0x63, 0x69, 0x57, 0x02, 0x8b, 0xb2, 0xef, 0x21, 0xad, 0xdf, 0x3d, + 0x4a, 0x67, 0x0f, 0xad, 0xf7, 0x45, 0x17, 0xb3, 0x6e, 0xdd, 0x21, 0x9a, + 0xc2, 0x69, 0x3e, 0xc1, 0x9e, 0xe1, 0x96, 0x96, 0xad, 0xeb, 0x16, 0x69, + 0xa7, 0x8b, 0x14, 0x95, 0x6e, 0x47, 0xf9, 0x9f, 0x34, 0x3e, 0x66, 0x9d, + 0xcb, 0x0a, 0xb5, 
0xff, 0xae, 0x2c, 0xff, 0x4e, 0x4d, 0xf5, 0x02, 0xa6, + 0xc9, 0xe8, 0x29, 0x56, 0xb3, 0xa8, 0x8c, 0xf5, 0xe6, 0x97, 0xa7, 0x00, + 0x1b, 0x98, 0x8f, 0xd3, 0xe9, 0x14, 0x0e, 0xf2, 0x85, 0x2f, 0xa9, 0xb7, + 0xb6, 0xca, 0x99, 0x06, 0x17, 0x1b, 0xf6, 0xce, 0x9e, 0x50, 0xf1, 0x33, + 0x62, 0xd0, 0xe5, 0xa0, 0x05, 0x01, 0x0d, 0xf8, 0x56, 0xf0, 0x35, 0x04, + 0xbd, 0xf2, 0x2c, 0x15, 0x75, 0x96, 0x89, 0xd9, 0x3c, 0xca, 0x84, 0x14, + 0x2b, 0x37, 0x64, 0x1b, 0x28, 0x62, 0xda, 0xb3, 0xc5, 0x2d, 0x26, 0x38, + 0x0f, 0x29, 0x39, 0x03, 0x3b, 0x75, 0x82, 0xeb, 0x6a, 0xd7, 0x1f, 0x57, + 0xb0, 0x66, 0x6f, 0xd3, 0xb6, 0xe5, 0xd2, 0x68, 0x14, 0x8f, 0xe8, 0xd7, + 0x09, 0x53, 0x98, 0xb6, 0xff, 0xa4, 0x85, 0x21, 0x7a, 0x07, 0x4a, 0xf3, + 0xe8, 0xe2, 0xc7, 0x20, 0x31, 0x77, 0x79, 0x37, 0xf3, 0x27, 0xe6, 0xd3, + 0xfb, 0x8b, 0xa8, 0xf4, 0x71, 0x27, 0xe4, 0xea, 0x63, 0xaf, 0xce, 0x9f, + 0x0b, 0x25, 0x01, 0x84, 0x31, 0x60, 0x30, 0x06, 0x11, 0x19, 0x31, 0x74, + 0xf2, 0xce, 0x17, 0x6a, 0x22, 0xfe, 0x62, 0x0e, 0xfb, 0x54, 0xdf, 0xe4, + 0xfd, 0x96, 0x69, 0x99, 0x1d, 0x95, 0xed, 0xcb, 0x9f, 0xb1, 0x89, 0xce, + 0x77, 0x84, 0x46, 0x87, 0x7d, 0x71, 0xb0, 0xd6, 0x35, 0xd1, 0x3d, 0xcd, + 0xb5, 0xa1, 0xaa, 0xee, 0xa2, 0x66, 0xb5, 0xf0, 0x1f, 0x4a, 0x12, 0x88, + 0xc8, 0x95, 0x02, 0x2c, 0x1a, 0xe8, 0x49, 0xda, 0x28, 0x10, 0x1a, 0xd0, + 0xc0, 0x33, 0x6c, 0xb6, 0x69, 0xb2, 0x1a, 0xa4, 0xf5, 0x47, 0xc8, 0x05, + 0x12, 0x22, 0xf3, 0xb5, 0xa4, 0x50, 0xa8, 0x1f, 0xab, 0x2c, 0xef, 0x5d, + 0x8d, 0xfe, 0x98, 0x9e, 0xdb, 0x1f, 0x63, 0xf3, 0x34, 0x1a, 0x82, 0xce, + 0xaf, 0x59, 0x61, 0xa5, 0x78, 0xfd, 0x5f, 0xef, 0xd9, 0xfe, 0xab, 0x06, + 0xd0, 0x14, 0xf1, 0x8a, 0x18, 0xbc, 0x32, 0xe2, 0xc0, 0x1b, 0x3d, 0xb1, + 0x72, 0x77, 0xa6, 0x94, 0x8c, 0x6e, 0x05, 0xa2, 0x6b, 0x80, 0xd0, 0x1b, + 0xd7, 0x59, 0xe3, 0x59, 0x5e, 0x6e, 0x10, 0x53, 0xd4, 0x2c, 0x68, 0x82, + 0x9a, 0xfc, 0x39, 0x9d, 0x19, 0xb3, 0x84, 0x3d, 0x4c, 0xaa, 0xe6, 0x71, + 0x59, 0x01, 0x29, 0x96, 0xcb, 0x96, 0xb2, 0x68, 0x5e, 0x8f, 0xdb, 0xc4, + 0x4e, 0xca, 0x20, 
0x4b, 0xe5, 0x52, 0x37, 0xd6, 0xce, 0x36, 0xc0, 0x01, + 0xb7, 0xae, 0x7d, 0xe2, 0x2b, 0xd6, 0xcb, 0xb1, 0x9c, 0x66, 0xd9, 0x67, + 0xf8, 0x54, 0xcc, 0x2e, 0x2b, 0x2c, 0x58, 0xd0, 0x56, 0xba, 0x71, 0x62, + 0x1f, 0xda, 0xfc, 0xd2, 0xe5, 0xf2, 0xf7, 0xbd, 0x49, 0xcf, 0xa4, 0x7f, + 0x91, 0x53, 0xac, 0x08, 0x9e, 0xba, 0xd0, 0x85, 0x4d, 0x51, 0x30, 0xd1, + 0x58, 0x3c, 0x4c, 0xbe, 0x3d, 0x80, 0x7a, 0x5b, 0xe6, 0xb1, 0x49, 0x88, + 0xc6, 0xf8, 0xf7, 0xc6, 0xf4, 0xbc, 0xbb, 0x98, 0x59, 0x88, 0x61, 0xff, + 0xe1, 0x9e, 0x8a, 0xbc, 0x83, 0x11, 0xf0, 0xe0, 0x47, 0xa5, 0xea, 0x02, + 0x2e, 0xc6, 0xdd, 0x5a, 0xf5, 0x0d, 0x24, 0xf9, 0x00, 0x5f, 0xcc, 0xc9, + 0xe7, 0x16, 0x0d, 0xaa, 0x32, 0x34, 0xfe, 0x5c, 0x7b, 0x26, 0x47, 0x0a, + 0xb5, 0x61, 0xa2, 0x6c, 0x39, 0x46, 0x87, 0xca, 0xac, 0x75, 0xfe, 0x34, + 0x38, 0x92, 0x62, 0xbd, 0xfe, 0xa3, 0x75, 0x97, 0x2b, 0xf1, 0xc2, 0xbb, + 0x37, 0xf2, 0x52, 0xad, 0xdb, 0xc0, 0xfe, 0x76, 0x0f, 0x1f, 0x3a, 0xe6, + 0xae, 0xde, 0xb8, 0x67, 0x62, 0x94, 0xc4, 0x2f, 0xc8, 0x6a, 0xc8, 0x49, + 0x11, 0x90, 0x80, 0x0c, 0xca, 0x46, 0x66, 0x52, 0x7e, 0x51, 0x1d, 0x4d, + 0x19, 0x73, 0xf7, 0x10, 0xa0, 0x77, 0xdd, 0x0b, 0x38, 0x2c, 0x79, 0xa4, + 0x6e, 0xe3, 0xce, 0x5f, 0x1c, 0xf8, 0xcd, 0x96, 0x7a, 0xed, 0x48, 0x9e, + 0x14, 0x32, 0x69, 0x8a, 0xb3, 0xaf, 0xe1, 0x22, 0xb0, 0x29, 0xb3, 0x3b, + 0x61, 0x8f, 0xfc, 0xf4, 0xc3, 0xf8, 0x7d, 0x2a, 0x9c, 0x0a, 0x30, 0xe9, + 0x43, 0x56, 0xdc, 0xd8, 0xbf, 0xe3, 0x95, 0x6e, 0xa4, 0x25, 0x9b, 0x92, + 0xae, 0x30, 0xdc, 0x07, 0x76, 0x39, 0x08, 0x22, 0xe2, 0xd0, 0xf0, 0x4c, + 0xb6, 0x0b, 0xa5, 0x69, 0x2b, 0xe9, 0x58, 0x24, 0x85, 0x6b, 0x92, 0xb9, + 0x4b, 0xf7, 0x25, 0xdc, 0xbe, 0x77, 0x6c, 0xc0, 0xf3, 0xa9, 0x81, 0x01, + 0x8d, 0x95, 0x62, 0x5b, 0x8b, 0x78, 0x76, 0xb8, 0x75, 0x1f, 0xbe, 0x4e, + 0x19, 0xb1, 0x69, 0xb5, 0xdf, 0x96, 0x35, 0x51, 0x11, 0x05, 0x81, 0x29, + 0xd5, 0x20, 0x21, 0x48, 0xc2, 0xc5, 0x94, 0xdc, 0xf4, 0x2a, 0xc3, 0x45, + 0xc5, 0x88, 0xc0, 0x5f, 0x16, 0xd4, 0x42, 0x55, 0x84, 0xe9, 0x14, 0x85, + 0xe1, 0x7c, 0x01, 
0x33, 0xd5, 0x6a, 0x9b, 0x56, 0x49, 0x8a, 0xd6, 0x79, + 0xc4, 0x2b, 0x3d, 0x0d, 0xce, 0x83, 0x48, 0xee, 0x92, 0x31, 0xec, 0xc0, + 0x6e, 0xf0, 0x05, 0xa9, 0xe3, 0x02, 0x8f, 0x5b, 0xdb, 0xc4, 0x1b, 0xff, + 0x55, 0x8f, 0x02, 0xbd, 0xfb, 0x37, 0xfa, 0x7c, 0x84, 0x22, 0x36, 0xd6, + 0x45, 0xf9, 0xf1, 0x95, 0xad, 0xe1, 0xb4, 0xee, 0xc2, 0x23, 0xba, 0xa6, + 0x8b, 0xbf, 0xe6, 0xa0, 0x72, 0x1b, 0x3e, 0x98, 0xed, 0x90, 0x4c, 0x33, + 0x2f, 0x90, 0x14, 0x9e, 0x45, 0x2f, 0xf6, 0x42, 0xfc, 0x99, 0xef, 0x8e, + 0xb3, 0x29, 0x37, 0x2a, 0x8a, 0xe3, 0xc0, 0xb9, 0x69, 0xcd, 0x7f, 0x9b, + 0x54, 0x37, 0x91, 0x46, 0x5a, 0xb5, 0x7a, 0x20, 0x9f, 0xff, 0x48, 0x08, + 0xc3, 0xa7, 0xf4, 0x4d, 0xa5, 0x84, 0x4e, 0x14, 0x08, 0x45, 0xa3, 0x09, + 0x03, 0x30, 0x9e, 0x7f, 0x83, 0xf4, 0x30, 0xe7, 0xfb, 0x8b, 0x49, 0xaf, + 0xe2, 0xde, 0x3c, 0x65, 0x16, 0x2d, 0xa1, 0x77, 0x53, 0xd1, 0x4b, 0xda, + 0x43, 0xcb, 0xbe, 0xaf, 0x5c, 0x47, 0x25, 0x47, 0x3a, 0x99, 0xb7, 0x1e, + 0x4e, 0x02, 0xf9, 0xba, 0x34, 0xe2, 0xb9, 0xbf, 0x48, 0x16, 0xba, 0x6f, + 0x42, 0xfa, 0xfa, 0x85, 0x9d, 0x02, 0x64, 0xd1, 0x8e, 0x8a, 0x93, 0x62, + 0x5e, 0x32, 0x35, 0x13, 0xdb, 0xb5, 0x55, 0x36, 0x29, 0x20, 0xc7, 0xf9, + 0x5b, 0x6b, 0xd1, 0x85, 0xc5, 0xcd, 0x10, 0x7a, 0x1d, 0x91, 0x83, 0x3d, + 0xc2, 0x7d, 0x23, 0x24, 0x13, 0xff, 0x9b, 0xa9, 0x00, 0x9d, 0x90, 0x3f, + 0xc6, 0xcc, 0xdf, 0x4b, 0x9f, 0xc8, 0x0f, 0x5a, 0xef, 0x4b, 0x39, 0x3b, + 0x5a, 0x46, 0x8d, 0x60, 0xb1, 0xa1, 0x22, 0xf3, 0xcc, 0x61, 0x3a, 0x77, + 0x69, 0x30, 0x02, 0x14, 0x78, 0xf4, 0xb8, 0x78, 0xce, 0x4e, 0x52, 0xaa, + 0x48, 0x39, 0xb2, 0x0f, 0x52, 0x3b, 0xb7, 0xca, 0x7a, 0x94, 0x50, 0x20, + 0x05, 0x37, 0x65, 0x21, 0x95, 0x7f, 0x99, 0x21, 0xdc, 0x76, 0xd7, 0x4d, + 0xed, 0x38, 0xb5, 0x87, 0x3a, 0xab, 0x53, 0x77, 0xf1, 0xf3, 0xfc, 0x5d, + 0x23, 0xe2, 0x09, 0x14, 0x60, 0x0b, 0x8a, 0xe6, 0x2c, 0xb4, 0x8b, 0x73, + 0x44, 0x82, 0x16, 0x58, 0x20, 0x77, 0x0b, 0xa1, 0xd0, 0x31, 0x23, 0x2b, + 0xf7, 0x5e, 0x9e, 0xc2, 0x50, 0x8e, 0x75, 0x28, 0x6a, 0x63, 0x4e, 0xed, + 0xac, 0x93, 0xdb, 
0x08, 0x27, 0x50, 0x54, 0x47, 0x0d, 0xc4, 0x64, 0xc4, + 0x1d, 0x28, 0x58, 0xfc, 0xd8, 0x0e, 0x05, 0x89, 0xd0, 0x83, 0x11, 0x36, + 0x95, 0xdd, 0xcf, 0xc2, 0x4a, 0x8c, 0x47, 0xc4, 0x8a, 0xf5, 0xe2, 0x20, + 0x4a, 0x15, 0x50, 0xf1, 0xfc, 0xbe, 0xa0, 0x3e, 0xe6, 0xed, 0x7c, 0x33, + 0xd9, 0x93, 0x2b, 0xb8, 0x6b, 0xc9, 0x6c, 0x3a, 0x75, 0xd7, 0xd4, 0xd3, + 0x92, 0x9f, 0xa9, 0x58, 0x77, 0xd1, 0x23, 0x91, 0x6e, 0x5e, 0x21, 0x8b, + 0x05, 0x40, 0x6f, 0x16, 0x25, 0x5d, 0x13, 0xf3, 0x5a, 0xfc, 0x1b, 0xd0, + 0x4e, 0x43, 0x6b, 0x3f, 0x83, 0xfc, 0xb2, 0xe1, 0x8a, 0x50, 0x6a, 0xad, + 0x68, 0xe4, 0x11, 0x15, 0x77, 0x65, 0x47, 0xf8, 0x36, 0xba, 0x3a, 0x0f, + 0x97, 0x16, 0xcc, 0xaf, 0xc1, 0x84, 0x2e, 0x20, 0x40, 0x96, 0x56, 0xb9, + 0x36, 0x13, 0x9e, 0xcb, 0x0d, 0x68, 0xd0, 0xfe, 0x6c, 0x9d, 0xc3, 0x81, + 0xa0, 0xb9, 0x4b, 0x59, 0xfb, 0x13, 0xb9, 0xa4, 0xd5, 0xae, 0x32, 0x0a, + 0x5f, 0xac, 0x53, 0x83, 0x2b, 0x60, 0x57, 0x63, 0x22, 0xa8, 0x57, 0x18, + 0xbd, 0xb9, 0xe5, 0x4c, 0x3c, 0xb0, 0x4a, 0x70, 0x66, 0xc3, 0x90, 0xcf, + 0x46, 0x67, 0xaa, 0x9b, 0xbb, 0x7d, 0x48, 0x58, 0xbf, 0x22, 0x00, 0xf3, + 0x2a, 0xad, 0xe8, 0xb5, 0x80, 0xf7, 0x53, 0xf1, 0xf6, 0xd4, 0xe2, 0x2d, + 0xdf, 0xde, 0x76, 0x91, 0x45, 0x29, 0x65, 0x39, 0xfe, 0xba, 0x5b, 0xd8, + 0x77, 0x18, 0x3b, 0x89, 0xe2, 0x3a, 0xe5, 0x03, 0x79, 0xd5, 0x47, 0xa1, + 0x37, 0xb5, 0x86, 0x55, 0xef, 0x01, 0xcc, 0x35, 0x7f, 0x1b, 0x68, 0xc1, + 0xde, 0x36, 0xc9, 0x72, 0x5d, 0xf6, 0xa2, 0x04, 0x74, 0x62, 0xda, 0x30, + 0xe1, 0x7a, 0x37, 0xdd, 0x34, 0x77, 0x5b, 0x4b, 0xc3, 0xf8, 0xe9, 0x47, + 0xcf, 0x97, 0xd3, 0x4d, 0x30, 0xee, 0x20, 0x6b, 0x29, 0xd8, 0x9c, 0x7f, + 0x26, 0xab, 0x31, 0x57, 0x9d, 0xb9, 0x6a, 0xac, 0x42, 0x54, 0x99, 0x8c, + 0xa6, 0x04, 0x08, 0x73, 0x92, 0x0b, 0xf4, 0x92, 0x3f, 0x8a, 0x2f, 0xea, + 0x03, 0xdf, 0x8e, 0xa0, 0xcb, 0x99, 0x3a, 0x97, 0x18, 0xc5, 0xd2, 0x67, + 0x1b, 0xf0, 0x19, 0x63, 0xbd, 0x12, 0x89, 0x3f, 0x2c, 0xd5, 0xbe, 0x11, + 0x64, 0xdb, 0x1a, 0xe1, 0x23, 0x4a, 0x59, 0x67, 0xf3, 0x2f, 0xe1, 0x8e, + 0x4e, 0x97, 0x26, 
0xbf, 0x04, 0xab, 0xe4, 0x08, 0x4e, 0x6e, 0x77, 0x3c, + 0xe0, 0x09, 0x65, 0x5e, 0xf0, 0xba, 0xac, 0x29, 0x6b, 0xf9, 0x1d, 0xea, + 0xe5, 0x8d, 0x5c, 0x96, 0x2a, 0x34, 0x32, 0x99, 0x59, 0x1e, 0xb6, 0x41, + 0x45, 0xfc, 0x42, 0x9b, 0x4a, 0x27, 0x2c, 0x29, 0x53, 0xc7, 0xa9, 0x85, + 0x5f, 0x07, 0xdd, 0xb0, 0x96, 0xbf, 0x55, 0x86, 0x1d, 0x04, 0xb3, 0x0e, + 0x32, 0x31, 0x34, 0xd6, 0x01, 0x82, 0x79, 0xb4, 0x9d, 0x48, 0xf3, 0x25, + 0x2f, 0xb4, 0xf9, 0x7d, 0xe7, 0xfa, 0x23, 0x92, 0x36, 0x87, 0x2f, 0xa2, + 0xf9, 0x99, 0x5c, 0x7e, 0x7c, 0xdb, 0x5b, 0x64, 0xcf, 0xb7, 0x74, 0x91, + 0xb7, 0x4a, 0x9f, 0xe7, 0x1f, 0x9a, 0x74, 0x1a, 0x6b, 0xdc, 0x7c, 0xa7, + 0x8a, 0x20, 0x94, 0x0a, 0x2d, 0xc7, 0xc6, 0xcf, 0x6e, 0x25, 0xe9, 0xa8, + 0x2c, 0xd9, 0xed, 0x78, 0x05, 0xa4, 0x65, 0x20, 0xa6, 0xa9, 0xed, 0x61, + 0xd9, 0xb7, 0xf8, 0x43, 0xf1, 0xcc, 0xff, 0x59, 0xcd, 0x6b, 0x9e, 0x7b, + 0x4c, 0x91, 0xe9, 0xab, 0x7a, 0x6f, 0x87, 0x7e, 0x66, 0x4a, 0xfb, 0xac, + 0xd6, 0xec, 0x95, 0xab, 0xa4, 0x4c, 0x68, 0x69, 0x4e, 0xb4, 0x80, 0xf5, + 0x32, 0x2b, 0xd4, 0xa5, 0x08, 0x88, 0x9f, 0x19, 0xf2, 0x12, 0x5c, 0xc1, + 0xcb, 0xcc, 0x46, 0xb5, 0xcc, 0x77, 0xbb, 0xc4, 0xc4, 0x01, 0xd9, 0xd6, + 0x49, 0x79, 0xe9, 0x62, 0x96, 0xe6, 0xc0, 0x43, 0xde, 0x00, 0x0d, 0xae, + 0x60, 0x06, 0x47, 0xb9, 0x5c, 0x81, 0x39, 0xa0, 0x06, 0x05, 0x2a, 0x62, + 0xab, 0xe5, 0x15, 0x47, 0xc9, 0x0a, 0xe3, 0x16, 0x6a, 0xfc, 0x4d, 0x73, + 0xbf, 0xee, 0x0b, 0x3f, 0x4d, 0x08, 0xee, 0x1c, 0x66, 0x03, 0x42, 0x56, + 0xd9, 0xfb, 0xe2, 0x8e, 0xd2, 0xf1, 0xc7, 0x41, 0x3d, 0x32, 0xa8, 0x64, + 0xa6, 0x81, 0x28, 0x7d, 0x0e, 0x6e, 0xb3, 0x2c, 0xb5, 0x64, 0xda, 0xda, + 0xf9, 0x0c, 0x4b, 0x77, 0x5a, 0xa2, 0xf3, 0xff, 0x96, 0xed, 0x47, 0x5b, + 0xe8, 0x03, 0xc1, 0x0a, 0x77, 0x25, 0x4b, 0x3c, 0xa1, 0x92, 0x30, 0xf0, + 0xa0, 0xfd, 0x3f, 0xac, 0x2d, 0xbd, 0x33, 0x05, 0x07, 0xfc, 0x48, 0x60, + 0xc2, 0xe7, 0x46, 0xc0, 0xf8, 0xcb, 0x4e, 0xed, 0xf5, 0x1e, 0xec, 0xd1, + 0x56, 0x6a, 0xcc, 0xfd, 0x4c, 0xb6, 0x18, 0xce, 0xa2, 0x79, 0xba, 0x81, + 0x3e, 0x8a, 0x41, 
0x73, 0x76, 0xba, 0xde, 0xed, 0x44, 0x11, 0xd3, 0x62, + 0xa5, 0x99, 0xbc, 0x63, 0x32, 0xca, 0x42, 0xbf, 0xbf, 0x42, 0xc2, 0x70, + 0x64, 0x60, 0xd0, 0x4a, 0x9f, 0x33, 0x06, 0x0f, 0xed, 0x38, 0xa8, 0x3b, + 0xe7, 0xf6, 0x05, 0x46, 0x8d, 0x6f, 0x7b, 0x84, 0xb8, 0x90, 0x98, 0xd8, + 0x91, 0x93, 0xf8, 0x74, 0xee, 0xd6, 0x03, 0x96, 0x02, 0x15, 0x08, 0xcf, + 0x4a, 0xe4, 0xa0, 0x70, 0xa8, 0x5a, 0x3e, 0x77, 0x37, 0x20, 0xd9, 0xae, + 0x5e, 0x2e, 0x6f, 0xf7, 0xf9, 0x1a, 0x2d, 0xa9, 0x07, 0xa5, 0x48, 0x39, + 0x60, 0x95, 0x4e, 0x58, 0x6e, 0xff, 0xaa, 0x14, 0x50, 0x3a, 0x8f, 0x31, + 0x23, 0xb8, 0x54, 0x17, 0x78, 0xf2, 0x65, 0xcc, 0xf0, 0x7a, 0x8e, 0x60, + 0x35, 0x7e, 0x99, 0x89, 0x7f, 0x4f, 0xf7, 0xc2, 0x21, 0xc7, 0x11, 0xbf, + 0xf2, 0x20, 0xa3, 0x56, 0x91, 0xab, 0x26, 0xfe, 0x17, 0xf1, 0xa5, 0xf3}; + +const uint8_t kSignEntropy[MLDSA_SEED_BYTES] = { + 0x7c, 0xf6, 0x8e, 0x63, 0x14, 0x04, 0x0b, 0x08, 0x20, 0x9b, 0x00, + 0x8c, 0x31, 0x48, 0xee, 0xd3, 0xe1, 0x6d, 0x5f, 0x71, 0x3b, 0xc7, + 0x08, 0x78, 0x05, 0x4b, 0x12, 0x4f, 0xf1, 0xf3, 0x50, 0x07}; + +const uint8_t kExpectedCase1Signature[MLDSA65_SIGNATURE_BYTES] = { + 0xf8, 0xc7, 0x25, 0x84, 0x8b, 0x39, 0xd9, 0xd9, 0x80, 0xf0, 0x2f, 0xf7, + 0xa0, 0x24, 0x19, 0x08, 0x70, 0x65, 0xe2, 0xc8, 0x0a, 0xc4, 0xd3, 0xd5, + 0x97, 0x49, 0x31, 0xea, 0x7b, 0xd6, 0x64, 0xb6, 0x6e, 0x6b, 0xf3, 0xc7, + 0x5f, 0xcb, 0xe3, 0x42, 0xd5, 0x5e, 0xa2, 0xa8, 0x8a, 0x01, 0x9a, 0xfe, + 0x44, 0xe2, 0xa7, 0x87, 0xd7, 0x70, 0xab, 0x9d, 0xa8, 0xf0, 0x9c, 0x67, + 0xfb, 0x4c, 0x16, 0xa6, 0x86, 0x91, 0xe2, 0x26, 0x9b, 0xa2, 0xda, 0x94, + 0x96, 0x6d, 0xfc, 0x4e, 0xf2, 0x2f, 0x2b, 0xcf, 0xf3, 0xc2, 0x7e, 0xe5, + 0x73, 0x1a, 0xac, 0xa3, 0x2c, 0x4d, 0x82, 0x28, 0x82, 0xa5, 0x82, 0x40, + 0x3d, 0x9a, 0xf6, 0x4b, 0xee, 0xae, 0xb7, 0x37, 0xd8, 0x2d, 0x80, 0xdd, + 0xcb, 0x52, 0xce, 0xea, 0xde, 0x51, 0xb9, 0x3e, 0x34, 0x7a, 0xfd, 0x3d, + 0x79, 0x38, 0xf6, 0xe9, 0x48, 0x1c, 0x07, 0x5f, 0xb8, 0x02, 0xa5, 0xa4, + 0xab, 0x45, 0xe7, 0x66, 0xf5, 0x9a, 0x23, 0xfe, 0x72, 0xcb, 0xd0, 
0xa0, + 0x18, 0x74, 0x75, 0xc2, 0x1c, 0xcf, 0xd6, 0x69, 0x62, 0xa0, 0xe3, 0xde, + 0xa7, 0x2b, 0x0a, 0xf0, 0x1c, 0xb3, 0x51, 0x41, 0xf9, 0xfe, 0x2c, 0xff, + 0x82, 0xa3, 0x46, 0x6f, 0xb5, 0x55, 0x0d, 0xaa, 0xd1, 0x0e, 0xc8, 0x31, + 0xc6, 0xdf, 0x61, 0xe8, 0x96, 0x78, 0x85, 0x08, 0xf4, 0x62, 0x48, 0xfd, + 0x1d, 0x82, 0xf7, 0x9b, 0x7b, 0xbe, 0xee, 0x47, 0xcf, 0x17, 0xf7, 0x5a, + 0xed, 0xfa, 0x89, 0x94, 0x3e, 0xc3, 0xba, 0x37, 0xa5, 0xe4, 0xfb, 0x72, + 0x32, 0x90, 0xc2, 0x46, 0xd3, 0x02, 0x3f, 0xf5, 0xb1, 0x73, 0xbd, 0x9f, + 0x09, 0xa8, 0x61, 0x13, 0x75, 0xf2, 0xa6, 0xfc, 0x19, 0x44, 0xe7, 0xfd, + 0x1c, 0xcf, 0xa2, 0x24, 0x79, 0x39, 0x3a, 0xf1, 0xe4, 0x96, 0xe0, 0xd8, + 0xcb, 0x9b, 0x1f, 0xab, 0x20, 0x7e, 0x21, 0xbb, 0xf7, 0xe2, 0x7e, 0xd0, + 0xaf, 0x2b, 0x04, 0xbf, 0x35, 0xad, 0x04, 0x09, 0x11, 0xfa, 0x45, 0xaa, + 0xee, 0x30, 0x62, 0xd4, 0x41, 0x39, 0xed, 0x00, 0xe3, 0x06, 0x7d, 0x0c, + 0x91, 0x2d, 0x4b, 0x8d, 0xdb, 0xf6, 0x98, 0x5b, 0x0b, 0x26, 0x9a, 0x79, + 0x19, 0x58, 0x2f, 0x3d, 0x80, 0xb7, 0x7a, 0x49, 0xc7, 0x6d, 0xfb, 0xd6, + 0xa6, 0x86, 0xef, 0x5b, 0xd2, 0x4d, 0xa6, 0x14, 0xb9, 0x01, 0xe9, 0x30, + 0xd1, 0xf1, 0xcf, 0x05, 0x93, 0xb1, 0x35, 0xdd, 0xa2, 0x6c, 0x99, 0x41, + 0x37, 0x61, 0xa4, 0x49, 0x72, 0xb2, 0x75, 0xe7, 0x2c, 0x16, 0xd1, 0x9e, + 0xc3, 0x57, 0x74, 0x23, 0x29, 0x66, 0x3f, 0xd2, 0xa3, 0x0f, 0xfe, 0xda, + 0xd2, 0xa8, 0x7a, 0xce, 0x18, 0x34, 0x1b, 0x15, 0xf8, 0x7d, 0x07, 0x9e, + 0x8e, 0x6b, 0x2c, 0x50, 0xa8, 0xfb, 0x81, 0x8e, 0x2b, 0xe5, 0x54, 0xb7, + 0xc9, 0x17, 0x26, 0xa5, 0xbf, 0x70, 0xf0, 0x45, 0xf0, 0xfb, 0x92, 0x55, + 0x94, 0x90, 0x53, 0x3a, 0xc3, 0x36, 0x4a, 0xe7, 0xa7, 0xae, 0x9f, 0x76, + 0x2b, 0x09, 0x12, 0xf2, 0xe6, 0x10, 0x98, 0xf1, 0x6d, 0x9f, 0x4e, 0x5d, + 0xdf, 0x79, 0x30, 0xf4, 0x1d, 0x6a, 0x21, 0xa0, 0x9b, 0xf7, 0x6c, 0x67, + 0xa5, 0x62, 0x55, 0x61, 0xa2, 0x31, 0xc9, 0xc9, 0x4b, 0x94, 0xd4, 0x91, + 0xc1, 0xba, 0xb0, 0x9d, 0x65, 0x68, 0xb9, 0x92, 0xcb, 0xd4, 0x2a, 0x86, + 0x81, 0x23, 0xff, 0x64, 0x7e, 0x70, 0xce, 0xfc, 0xab, 0x7f, 0x9f, 
0xce, + 0xcb, 0x34, 0xff, 0xe7, 0xbc, 0xbb, 0x65, 0x45, 0x08, 0x7e, 0xc7, 0xa1, + 0x89, 0x89, 0x9c, 0x85, 0xbf, 0x05, 0x2e, 0x68, 0x7e, 0x33, 0x44, 0x7d, + 0x4e, 0xdb, 0xed, 0xf5, 0xe3, 0xee, 0x6a, 0x05, 0x11, 0x40, 0xad, 0xf0, + 0xa4, 0x3a, 0x92, 0xf1, 0x3b, 0x40, 0xa1, 0x61, 0xb9, 0x76, 0xc8, 0xd6, + 0x77, 0xf9, 0x9e, 0xb5, 0x95, 0x97, 0xdb, 0x1f, 0x5c, 0xbc, 0x31, 0x57, + 0x5b, 0xf4, 0x2d, 0x9a, 0x68, 0x2b, 0x1b, 0x44, 0xb3, 0xdc, 0xb6, 0x29, + 0xa5, 0x9a, 0xea, 0xcd, 0x28, 0xb0, 0xcf, 0xbc, 0xbd, 0xaf, 0x68, 0x81, + 0xd5, 0xd1, 0xfe, 0xb2, 0xd7, 0xfa, 0xd7, 0x76, 0xc8, 0xec, 0x01, 0xc4, + 0x7f, 0x73, 0xd4, 0x50, 0x30, 0xc6, 0x12, 0x3c, 0x9f, 0xa9, 0x88, 0xe9, + 0xdb, 0x6e, 0x8d, 0x1e, 0xeb, 0xa0, 0xfa, 0x46, 0x30, 0x2f, 0x84, 0x53, + 0x24, 0xe4, 0xeb, 0x4d, 0xdb, 0x83, 0xf5, 0x5b, 0x53, 0xc4, 0xeb, 0x02, + 0x39, 0xb5, 0x7f, 0x7e, 0x30, 0x53, 0x03, 0x48, 0xd3, 0x2f, 0x85, 0xf9, + 0xf8, 0x25, 0xf4, 0x80, 0x78, 0x7a, 0x7a, 0xf9, 0x95, 0x52, 0xb1, 0xc9, + 0x6b, 0x75, 0xc2, 0x55, 0x9d, 0xd2, 0xa5, 0x03, 0x54, 0x24, 0xec, 0xf3, + 0xef, 0x1b, 0x9c, 0xae, 0xde, 0xad, 0xd1, 0x7e, 0x9b, 0xdc, 0x7b, 0x7c, + 0x1b, 0x9b, 0xae, 0x00, 0x53, 0xba, 0xa0, 0x64, 0xb0, 0x16, 0xa6, 0xc8, + 0xe0, 0x24, 0xb0, 0xa1, 0x0d, 0xbd, 0xa0, 0x88, 0x49, 0x4d, 0x18, 0xd2, + 0x3d, 0xf2, 0x82, 0x3e, 0x5e, 0xbf, 0x08, 0x30, 0xa0, 0x6d, 0xa7, 0xd8, + 0xa5, 0x53, 0xd7, 0xa5, 0xaa, 0x3e, 0x63, 0xcc, 0xeb, 0x16, 0x97, 0xb9, + 0x2a, 0x4f, 0xcc, 0x92, 0x36, 0x70, 0xb9, 0xf2, 0xd3, 0xf6, 0x45, 0x90, + 0x41, 0x9e, 0x88, 0xc4, 0x28, 0x4c, 0x28, 0x72, 0x51, 0xdf, 0xd1, 0x97, + 0xa3, 0xe2, 0xe4, 0xa8, 0x6f, 0x79, 0x38, 0x56, 0x27, 0x17, 0xb5, 0x21, + 0x60, 0x3a, 0xd9, 0x21, 0x5c, 0x69, 0x41, 0xaa, 0x75, 0xa3, 0x98, 0x6c, + 0x2e, 0xb3, 0x79, 0xac, 0xd2, 0x16, 0x26, 0x09, 0x26, 0xc0, 0x05, 0x0a, + 0x6a, 0x60, 0xaf, 0x65, 0x78, 0xb2, 0x3f, 0xcd, 0xa5, 0x3b, 0xda, 0x7b, + 0x2d, 0x94, 0x66, 0xc1, 0x24, 0x4b, 0x52, 0xd8, 0x6b, 0xa5, 0x14, 0x8b, + 0xc3, 0x6d, 0xa6, 0x5c, 0xea, 0x4b, 0x32, 0x22, 0xad, 0x1d, 0xf3, 
0x6b, + 0x7c, 0x89, 0xfa, 0xd6, 0xc4, 0x9e, 0x8a, 0x77, 0xef, 0x79, 0xcb, 0x59, + 0x4f, 0x2c, 0x3b, 0xba, 0xbd, 0xd9, 0xb3, 0x9d, 0x91, 0x29, 0xd5, 0xf6, + 0x57, 0xf8, 0x39, 0x7a, 0xf0, 0x0f, 0x1a, 0x89, 0xcb, 0xcb, 0x22, 0x15, + 0x08, 0x6d, 0xca, 0x01, 0x8a, 0x15, 0xb3, 0x47, 0xe6, 0x23, 0x7c, 0xe6, + 0xec, 0x10, 0xdc, 0x70, 0x16, 0xec, 0x53, 0x39, 0x1d, 0x77, 0x07, 0x8b, + 0xbf, 0x8b, 0x85, 0x92, 0x82, 0x3b, 0xf5, 0xe2, 0x8c, 0x80, 0x28, 0x4c, + 0xb8, 0x1e, 0xca, 0x57, 0x76, 0x95, 0xeb, 0x97, 0xfd, 0x3d, 0x47, 0x9a, + 0xf8, 0x3e, 0x0e, 0xda, 0xb2, 0x41, 0x09, 0xa8, 0x26, 0x1f, 0x11, 0x80, + 0xde, 0x18, 0xa5, 0x5c, 0x07, 0x54, 0x57, 0xe3, 0xd5, 0x3e, 0x3f, 0x78, + 0x3f, 0x5b, 0x2d, 0x77, 0x69, 0x15, 0x0b, 0xd6, 0x8c, 0xbc, 0x98, 0xf2, + 0x86, 0x53, 0x0b, 0x91, 0x46, 0x87, 0x03, 0xfc, 0x05, 0x7f, 0x5b, 0x40, + 0x39, 0xa8, 0xe7, 0xbf, 0xe6, 0x18, 0x09, 0xd5, 0x91, 0xa9, 0xec, 0x06, + 0x1f, 0x85, 0xc5, 0xf7, 0x94, 0xbe, 0x72, 0x03, 0x2e, 0xaa, 0xbc, 0xbb, + 0x4e, 0x8d, 0xfd, 0x27, 0x7c, 0xb4, 0x09, 0x63, 0x8d, 0xca, 0x33, 0x9e, + 0x51, 0xc3, 0xa3, 0x5c, 0xcd, 0x02, 0xe0, 0x62, 0x60, 0x53, 0xf2, 0x7f, + 0x4a, 0xa9, 0xcb, 0x46, 0xc1, 0xaf, 0x0e, 0x5e, 0x3f, 0xa9, 0x72, 0x14, + 0x99, 0x30, 0x0a, 0x81, 0x5c, 0xf3, 0x13, 0xbc, 0x61, 0xf2, 0xf5, 0x6d, + 0x89, 0xc1, 0x6f, 0x48, 0xeb, 0x43, 0xc3, 0x48, 0xcf, 0xb7, 0x18, 0x36, + 0x88, 0x9e, 0x1b, 0xc0, 0x1f, 0xa7, 0xea, 0x95, 0x12, 0xd3, 0x07, 0x4d, + 0xc6, 0xad, 0xb7, 0xef, 0x6c, 0x59, 0xb3, 0xee, 0x04, 0xec, 0x4b, 0xff, + 0x4a, 0xcc, 0x7c, 0x33, 0x7d, 0x8b, 0xd3, 0x29, 0x31, 0x77, 0xbd, 0x73, + 0x1e, 0x10, 0x53, 0x5a, 0x79, 0x70, 0xca, 0xb5, 0x9f, 0x20, 0x0b, 0x0b, + 0x31, 0xd5, 0x5d, 0xd9, 0x2b, 0x4a, 0xc4, 0x87, 0x55, 0xd9, 0xaa, 0x2e, + 0x8b, 0x65, 0xb0, 0xf4, 0x06, 0xc7, 0xf2, 0x9a, 0xa0, 0x65, 0xcc, 0x30, + 0x24, 0x8d, 0x1a, 0x61, 0x3f, 0xc7, 0x72, 0x3c, 0xf3, 0x17, 0xff, 0x0a, + 0x58, 0x82, 0xcf, 0x42, 0x96, 0x49, 0x67, 0x55, 0x79, 0x15, 0x12, 0x96, + 0xfa, 0x28, 0xfe, 0x77, 0xfc, 0x8f, 0x30, 0x96, 0x03, 0xf6, 0x79, 
0x03, + 0x38, 0x49, 0x70, 0x48, 0x5c, 0xb8, 0x8e, 0x65, 0x67, 0xe1, 0x85, 0x6b, + 0x70, 0x3c, 0x69, 0xd0, 0x43, 0x0f, 0x4b, 0xa4, 0x04, 0x06, 0xd8, 0xec, + 0x26, 0xea, 0xac, 0xd5, 0xd9, 0x6c, 0x8b, 0xb4, 0x13, 0xd0, 0x07, 0xd3, + 0x41, 0x28, 0xa7, 0xbf, 0x0c, 0xe4, 0x5b, 0xa9, 0xb7, 0x3d, 0x2d, 0x5d, + 0x10, 0xbc, 0xbe, 0xa8, 0x26, 0xe0, 0x0a, 0x92, 0xae, 0xc2, 0x80, 0xd1, + 0x85, 0x94, 0x0e, 0x78, 0x24, 0xd6, 0xd5, 0x11, 0x87, 0xa3, 0xd4, 0x3c, + 0x63, 0x49, 0x12, 0x64, 0x3e, 0xee, 0xd9, 0x79, 0x2b, 0x84, 0xa1, 0x08, + 0x07, 0xcc, 0xa8, 0xc2, 0x8b, 0x91, 0x4c, 0xad, 0x39, 0x1b, 0xb7, 0x1d, + 0xa1, 0x22, 0x1a, 0x60, 0x5a, 0x5b, 0x9b, 0x5f, 0x5b, 0x10, 0x46, 0x25, + 0x34, 0xe1, 0x3f, 0x6c, 0xc2, 0xd9, 0xab, 0x32, 0x73, 0xa4, 0xde, 0xf1, + 0x34, 0x05, 0x04, 0x0a, 0x4c, 0x05, 0xa8, 0x7e, 0xad, 0x1d, 0x98, 0x50, + 0xf9, 0x11, 0xe1, 0x9a, 0x90, 0x7b, 0x3c, 0x39, 0xf9, 0x2d, 0x98, 0x28, + 0x9e, 0xf5, 0xc4, 0xa5, 0xc9, 0x4d, 0xa5, 0x74, 0xaf, 0x2f, 0x03, 0x97, + 0xd5, 0x20, 0x81, 0x40, 0xf3, 0x3e, 0x67, 0x18, 0xb9, 0x98, 0x57, 0xee, + 0xc0, 0x8e, 0x9b, 0x21, 0x4a, 0x87, 0xc2, 0x21, 0x9b, 0x42, 0xf7, 0xd1, + 0x51, 0x81, 0x23, 0x6c, 0x06, 0xa0, 0xcc, 0x09, 0x15, 0x1a, 0xa6, 0x7b, + 0xe9, 0xbe, 0x9f, 0xac, 0x8b, 0x08, 0x8b, 0xb8, 0xa3, 0x7d, 0x04, 0xb0, + 0xc0, 0x49, 0x62, 0x85, 0x7e, 0xb7, 0xd0, 0x3d, 0x1d, 0x78, 0xfc, 0x6a, + 0x76, 0xfb, 0x5c, 0x60, 0x22, 0xe4, 0xec, 0x09, 0xe0, 0x40, 0xde, 0x03, + 0xe5, 0xb1, 0x5e, 0xd7, 0x06, 0xc7, 0x06, 0xf6, 0x6c, 0x66, 0x4b, 0xb6, + 0x72, 0xaf, 0x59, 0xd7, 0x04, 0x34, 0xee, 0x64, 0xbe, 0x1b, 0xd2, 0xdf, + 0x74, 0x20, 0x05, 0xc8, 0x83, 0xf2, 0xfd, 0x62, 0xb4, 0xb6, 0x85, 0x46, + 0x14, 0x33, 0x23, 0x1d, 0xcc, 0xd3, 0x4b, 0x37, 0x96, 0xe4, 0x04, 0xb5, + 0x92, 0x0c, 0x6f, 0x52, 0xc9, 0xe2, 0xe7, 0xb2, 0xca, 0x01, 0x9e, 0xc6, + 0x51, 0x7a, 0x9e, 0xce, 0xb2, 0x76, 0xf8, 0x74, 0xa2, 0x4c, 0xaa, 0x66, + 0x9a, 0x7d, 0x64, 0x24, 0x4f, 0xe3, 0x33, 0xa4, 0xca, 0x26, 0xdd, 0x24, + 0xf0, 0xd0, 0x38, 0xf3, 0x81, 0x0a, 0xf1, 0x06, 0xd7, 0x07, 0xe9, 
0x1a, + 0x85, 0x98, 0x9d, 0x3e, 0xb3, 0x80, 0x9d, 0xaf, 0xeb, 0x48, 0xee, 0xee, + 0x04, 0x6c, 0xb1, 0x6a, 0x81, 0x97, 0x26, 0x14, 0x60, 0x16, 0xfd, 0x0e, + 0x6a, 0xda, 0xb6, 0x0e, 0xff, 0xd5, 0x6c, 0x65, 0x1c, 0xd1, 0xe2, 0xfa, + 0xcc, 0x75, 0x6c, 0xb6, 0x10, 0x03, 0x5d, 0x41, 0xe6, 0x47, 0x95, 0x43, + 0xa5, 0x7a, 0xd1, 0xc5, 0xc5, 0xd9, 0xdd, 0x7f, 0x6c, 0xcb, 0x4d, 0x9c, + 0x77, 0x04, 0x57, 0x74, 0x90, 0x08, 0x03, 0x5f, 0x3a, 0xb1, 0x08, 0x3f, + 0x64, 0xaa, 0x7c, 0x75, 0x9e, 0xfa, 0x14, 0x8b, 0x3c, 0x76, 0xba, 0x20, + 0x8b, 0x9b, 0xee, 0x4d, 0xf1, 0x3a, 0x79, 0xdc, 0x97, 0xe6, 0x14, 0x28, + 0xee, 0x8b, 0xa4, 0x55, 0x9d, 0xe8, 0x08, 0x83, 0xa9, 0x27, 0x11, 0xd8, + 0x24, 0x5d, 0x0e, 0x0a, 0xab, 0x67, 0xfb, 0xd5, 0xda, 0x38, 0x36, 0x7b, + 0x25, 0x79, 0xef, 0xb1, 0x30, 0x50, 0x36, 0xf7, 0xfa, 0x8d, 0xc0, 0x82, + 0x7a, 0xec, 0x74, 0xde, 0x7a, 0x9d, 0xe4, 0x46, 0xe9, 0x23, 0x9e, 0xa9, + 0x90, 0x18, 0x6e, 0xbe, 0xc8, 0x37, 0xc7, 0x47, 0xb3, 0x17, 0x76, 0x2a, + 0x4a, 0x97, 0x75, 0x42, 0xee, 0x23, 0x7e, 0x98, 0x89, 0xd6, 0x86, 0x09, + 0x28, 0x9c, 0x24, 0xdf, 0x74, 0xa0, 0x2d, 0x3d, 0x17, 0x27, 0x9b, 0x90, + 0x51, 0xdd, 0xce, 0xc2, 0xef, 0x4f, 0xce, 0x38, 0x97, 0x69, 0x4e, 0xe9, + 0xb6, 0xdf, 0x0e, 0x5f, 0xcb, 0x1f, 0x3b, 0x31, 0xaa, 0xbf, 0xc7, 0x10, + 0x29, 0xea, 0xb7, 0x30, 0x9f, 0x43, 0x7d, 0x37, 0xcd, 0x72, 0x1c, 0x27, + 0x2b, 0xf7, 0x56, 0x9c, 0x94, 0xe7, 0xc8, 0x1a, 0x2a, 0xad, 0xa1, 0xfe, + 0x90, 0xe0, 0xdd, 0xb2, 0x75, 0xe4, 0xd9, 0x36, 0x55, 0xed, 0x9f, 0x5d, + 0x07, 0xa8, 0x5c, 0x63, 0x67, 0x8a, 0x12, 0x35, 0xa1, 0xef, 0x78, 0x85, + 0xf0, 0x37, 0x6f, 0x37, 0xce, 0x34, 0x75, 0x9a, 0x3c, 0x54, 0xcb, 0x36, + 0x8f, 0xfc, 0x41, 0x70, 0x23, 0x19, 0xd1, 0x15, 0xb3, 0x6c, 0xc5, 0x0b, + 0x51, 0x9d, 0xf2, 0x83, 0xc9, 0x4b, 0x62, 0x84, 0x12, 0xba, 0x6b, 0xee, + 0xd3, 0x4f, 0x80, 0xa7, 0xe3, 0x30, 0x19, 0xe3, 0x20, 0x44, 0xb1, 0x25, + 0xe2, 0x5a, 0xaa, 0xb0, 0x81, 0x1a, 0x6a, 0x6a, 0xd7, 0xb2, 0xc4, 0x64, + 0x87, 0x25, 0x7e, 0xe1, 0x5f, 0x82, 0x17, 0x1f, 0xce, 0x1a, 0x2d, 
0x04, + 0x58, 0xad, 0x5f, 0x2a, 0x3d, 0xd2, 0xb8, 0xa9, 0xa2, 0x72, 0x19, 0x24, + 0x8d, 0x1b, 0x04, 0x89, 0x62, 0xf5, 0x6b, 0xdd, 0xb0, 0x66, 0x62, 0x59, + 0x0d, 0xb3, 0xa9, 0x03, 0x07, 0x11, 0x55, 0x51, 0xfa, 0x14, 0x49, 0x31, + 0xe6, 0x38, 0x66, 0x79, 0x07, 0xa8, 0xeb, 0xc5, 0x4b, 0x61, 0x8f, 0x59, + 0xb1, 0x18, 0x55, 0x28, 0x5c, 0x82, 0xff, 0x4a, 0xd3, 0x28, 0x47, 0xcc, + 0xc1, 0xba, 0xa4, 0xce, 0x2a, 0x9d, 0x28, 0x24, 0x69, 0x28, 0x13, 0x4f, + 0x06, 0x0f, 0xa3, 0x71, 0x6b, 0x17, 0x7b, 0xf4, 0x2c, 0x6d, 0xdb, 0xcc, + 0x6c, 0x4e, 0x8a, 0x04, 0x58, 0x20, 0x26, 0x54, 0xf9, 0x4d, 0x7c, 0x67, + 0xb8, 0x63, 0x55, 0x23, 0x94, 0xe6, 0x3c, 0xf7, 0x11, 0x61, 0x38, 0x9c, + 0x1a, 0x40, 0x5c, 0x3d, 0xb2, 0x35, 0xde, 0x35, 0x1f, 0x38, 0x48, 0xc5, + 0x15, 0x59, 0x2f, 0xa7, 0x81, 0xe8, 0x53, 0xe8, 0x46, 0x59, 0x3e, 0xf4, + 0xc8, 0x01, 0x33, 0xd6, 0xdd, 0xe6, 0xf3, 0xa5, 0x2a, 0x09, 0x54, 0x1a, + 0x92, 0x37, 0x3e, 0x9f, 0x54, 0xb6, 0x81, 0x5d, 0xf3, 0x74, 0x6b, 0x56, + 0x01, 0xc9, 0xfd, 0x4b, 0xa5, 0x54, 0x08, 0x27, 0x54, 0x21, 0xb2, 0xad, + 0x6c, 0x17, 0xdf, 0x4a, 0x29, 0x1e, 0xf8, 0xde, 0xc7, 0x51, 0x59, 0x55, + 0xa9, 0x3a, 0xf8, 0xe8, 0xa8, 0x56, 0x5c, 0x8a, 0x04, 0xff, 0x51, 0x8c, + 0x10, 0xd2, 0x22, 0xc7, 0x3b, 0x3c, 0xa4, 0xb1, 0x6d, 0x41, 0xb6, 0xaa, + 0x5b, 0x24, 0x17, 0x3b, 0x87, 0xb0, 0xa8, 0xd9, 0x93, 0x3f, 0x6b, 0xff, + 0x56, 0x83, 0xcc, 0x09, 0x5c, 0x15, 0xe8, 0x99, 0xa6, 0x17, 0xc3, 0xef, + 0xa1, 0x4b, 0xf7, 0x81, 0x7b, 0x62, 0x3a, 0x8a, 0x6a, 0x5c, 0x57, 0x4f, + 0x6d, 0xf4, 0x79, 0x7f, 0x69, 0xd9, 0xde, 0x40, 0xb8, 0xb0, 0x96, 0x70, + 0x1c, 0xaf, 0x94, 0xb5, 0x45, 0x02, 0xff, 0xf7, 0x23, 0x04, 0x7e, 0xc0, + 0xa1, 0x93, 0x2f, 0x9e, 0x7d, 0xc0, 0x81, 0x00, 0x4f, 0xed, 0xb5, 0xaa, + 0x7c, 0x25, 0x5f, 0x8b, 0x76, 0xd8, 0x45, 0xee, 0xf2, 0x9f, 0xc6, 0xb4, + 0x0c, 0x08, 0xd8, 0x7d, 0x1f, 0x25, 0xcc, 0x8f, 0x1e, 0x5b, 0xaa, 0x75, + 0x0b, 0xec, 0x61, 0xcf, 0xe2, 0x08, 0xc1, 0x45, 0x9d, 0x08, 0x7e, 0xc0, + 0xa8, 0x73, 0xc8, 0x2e, 0xcb, 0xa7, 0x5a, 0x71, 0xce, 0x8f, 0x7f, 
0xe6, + 0xf6, 0x35, 0xe5, 0xcf, 0xac, 0x24, 0xf7, 0x71, 0x18, 0x27, 0x60, 0x3e, + 0xf8, 0x37, 0x3a, 0x81, 0x66, 0xee, 0x41, 0xf0, 0x72, 0x3b, 0x43, 0x40, + 0xa2, 0xd7, 0x99, 0x34, 0xda, 0xa3, 0xcc, 0xf4, 0xb7, 0x66, 0xd7, 0xad, + 0xab, 0x58, 0xee, 0x52, 0xa0, 0xb1, 0xbb, 0xd4, 0x3d, 0x30, 0x6d, 0x31, + 0xe5, 0xf8, 0x1e, 0xc8, 0x5d, 0x78, 0x35, 0x2a, 0xd5, 0x36, 0xd9, 0x08, + 0x20, 0x5f, 0x51, 0x07, 0x58, 0x02, 0x5f, 0x3a, 0x81, 0x90, 0x03, 0x35, + 0x74, 0xb4, 0x87, 0xae, 0x18, 0xe0, 0xeb, 0x73, 0x05, 0x63, 0xf1, 0x42, + 0x7e, 0x49, 0x26, 0xbd, 0xce, 0x8c, 0xab, 0x18, 0x70, 0xf7, 0x88, 0x22, + 0xc7, 0x6a, 0xe4, 0x55, 0xab, 0x31, 0x7e, 0xaa, 0x0b, 0x22, 0xff, 0xdd, + 0x4c, 0x80, 0x1f, 0x94, 0x64, 0xb0, 0x7d, 0x18, 0x69, 0x33, 0xf2, 0xc8, + 0x0a, 0x2e, 0x33, 0x8a, 0x0f, 0x47, 0x10, 0xeb, 0x6c, 0x5a, 0x6f, 0x4f, + 0x80, 0xb9, 0x29, 0x9f, 0x88, 0x88, 0x23, 0x75, 0x1e, 0xb8, 0xad, 0x76, + 0x75, 0x3a, 0x18, 0xbb, 0x9d, 0x92, 0x76, 0x53, 0x5b, 0x6d, 0x5b, 0x7a, + 0xc4, 0x34, 0x1c, 0x6e, 0xc7, 0xb4, 0x24, 0x7c, 0xd8, 0x27, 0x47, 0x74, + 0xb5, 0xf5, 0x5f, 0xcb, 0x90, 0x54, 0x00, 0x8e, 0xdd, 0xe1, 0x81, 0xf0, + 0xbb, 0xc5, 0xf9, 0x7d, 0x28, 0x07, 0x8d, 0xb2, 0x9b, 0x0c, 0x56, 0x80, + 0xa7, 0xa7, 0x2f, 0x84, 0x9a, 0xef, 0x70, 0xa9, 0x92, 0x40, 0xcc, 0xf9, + 0xd5, 0xee, 0xbe, 0x65, 0x8a, 0x79, 0xc8, 0xfb, 0x31, 0x4b, 0x6f, 0x79, + 0xfc, 0xc9, 0x52, 0x47, 0x38, 0xb9, 0x36, 0x08, 0xe7, 0xab, 0xb4, 0x60, + 0xe6, 0x9d, 0xc8, 0xb6, 0x77, 0x28, 0x53, 0x62, 0x70, 0x8a, 0x9a, 0xe1, + 0xbb, 0xc0, 0xa9, 0xae, 0x4f, 0xd5, 0x1b, 0x96, 0xdc, 0x7e, 0x8c, 0x4d, + 0x96, 0xe2, 0x23, 0x15, 0xaa, 0x6c, 0xf2, 0xca, 0x42, 0xae, 0xb8, 0xfa, + 0xea, 0x69, 0xcf, 0xe2, 0xcb, 0x95, 0x87, 0xa9, 0xfa, 0x27, 0x28, 0xdb, + 0xa4, 0xa2, 0xb2, 0x89, 0x6c, 0x8a, 0x8b, 0x70, 0x9c, 0xae, 0x45, 0xb1, + 0xce, 0x2a, 0xf4, 0x09, 0xeb, 0x52, 0x55, 0x67, 0xb8, 0x48, 0x60, 0xe2, + 0xae, 0x6d, 0x5b, 0x6f, 0xa4, 0x43, 0x08, 0xb6, 0x0b, 0x76, 0xce, 0x3c, + 0xbf, 0xb3, 0xc6, 0x09, 0x67, 0x18, 0xe6, 0x81, 0xea, 0x41, 0x13, 
0x74, + 0x4a, 0x07, 0x95, 0x1f, 0xdb, 0x06, 0xa6, 0x45, 0x28, 0x0a, 0x65, 0x0e, + 0x4a, 0x4a, 0x55, 0x18, 0x9e, 0x0d, 0x22, 0x6c, 0xdb, 0x58, 0xde, 0x8f, + 0x5d, 0x56, 0xd3, 0xdf, 0xdc, 0xb9, 0x3c, 0x9e, 0xc9, 0xe5, 0x9a, 0xb1, + 0x33, 0xca, 0x09, 0x18, 0x89, 0x94, 0x08, 0xad, 0xa8, 0xc4, 0x97, 0x23, + 0x6c, 0xc4, 0x06, 0x44, 0xd7, 0xa2, 0x94, 0x88, 0x60, 0xfe, 0x56, 0x0e, + 0xd4, 0x71, 0xd8, 0xd5, 0xbf, 0x3d, 0x50, 0x92, 0xc4, 0x4a, 0xf0, 0x74, + 0x79, 0x70, 0x11, 0xaa, 0x49, 0xb7, 0x18, 0x83, 0xf7, 0xe6, 0x1b, 0x53, + 0xcd, 0x39, 0xf6, 0xe8, 0xed, 0x93, 0xa4, 0x36, 0xb9, 0x74, 0xee, 0x80, + 0x67, 0x0b, 0xa9, 0xd6, 0x0c, 0x8e, 0xdc, 0x54, 0x32, 0x81, 0x27, 0x95, + 0x9c, 0x5b, 0x5a, 0x44, 0x99, 0x0d, 0xb8, 0x89, 0xb0, 0xef, 0x01, 0x42, + 0x92, 0x6f, 0xb1, 0x28, 0x81, 0x5b, 0xd8, 0xd0, 0x54, 0xcf, 0x90, 0x4c, + 0xc7, 0x6c, 0xd3, 0x8b, 0x0f, 0xf4, 0x34, 0xca, 0x6e, 0xb0, 0x4e, 0xe8, + 0xb8, 0x1c, 0x14, 0x63, 0x91, 0x94, 0x3e, 0xcc, 0xa8, 0xbc, 0xb4, 0x28, + 0xb3, 0x78, 0x6b, 0xe3, 0x98, 0xfe, 0x80, 0xe8, 0xd9, 0xd7, 0x42, 0xd0, + 0x36, 0x1b, 0x3a, 0x22, 0xd3, 0x25, 0x64, 0x37, 0xb2, 0xa7, 0x56, 0x41, + 0xa0, 0xf3, 0xb3, 0x7f, 0x11, 0x0c, 0x0f, 0x41, 0x44, 0x92, 0xa5, 0xeb, + 0xf6, 0xd4, 0x56, 0x19, 0x41, 0xd4, 0x7e, 0x57, 0x57, 0xa3, 0xb6, 0x12, + 0x1d, 0xaf, 0xa6, 0xab, 0x62, 0xc3, 0x5d, 0x74, 0x4f, 0x44, 0xe1, 0xdc, + 0xcc, 0x45, 0xbd, 0x34, 0xce, 0x2e, 0x60, 0xcb, 0x21, 0x7d, 0x8b, 0xd8, + 0xfe, 0xbc, 0xdc, 0x2e, 0x72, 0xb8, 0xa7, 0x0e, 0x2f, 0x79, 0xc9, 0xc2, + 0xf7, 0xf1, 0xd4, 0xf4, 0x36, 0xec, 0x53, 0x31, 0x79, 0x18, 0x85, 0xa3, + 0xb4, 0xb0, 0x0e, 0xc1, 0xc9, 0x41, 0xdb, 0x7a, 0x7f, 0xd2, 0x34, 0x35, + 0x5c, 0x4b, 0x52, 0x83, 0x80, 0xe6, 0xf2, 0x18, 0x6f, 0xae, 0x37, 0x8c, + 0x95, 0x38, 0x8b, 0x44, 0x8d, 0x22, 0x84, 0xd1, 0xd5, 0xc1, 0x01, 0x9c, + 0xf0, 0x56, 0x8e, 0x48, 0x5d, 0xab, 0x2d, 0x70, 0x6a, 0x29, 0xca, 0x91, + 0x0c, 0x03, 0xaa, 0xaf, 0x8c, 0x28, 0xd2, 0x97, 0x4a, 0x60, 0xac, 0xf4, + 0x9c, 0x26, 0xc3, 0xe8, 0x7d, 0xac, 0xc0, 0xc8, 0xe4, 0xf4, 0xad, 
0x94, + 0x01, 0xcc, 0x5f, 0x13, 0x83, 0x3a, 0x2c, 0xf2, 0xc8, 0xc0, 0xc6, 0x7d, + 0x1a, 0x00, 0x54, 0x7e, 0xee, 0xa1, 0xb0, 0x33, 0x21, 0x7f, 0x3f, 0x45, + 0xff, 0xdb, 0x6e, 0x9d, 0xca, 0x65, 0xe0, 0x60, 0x8f, 0x40, 0x56, 0x91, + 0x57, 0x75, 0x83, 0x8d, 0xd4, 0x03, 0x4a, 0x60, 0x0e, 0x46, 0x27, 0x02, + 0x4e, 0x9d, 0xfb, 0xfe, 0xf2, 0xbe, 0x44, 0xa2, 0x8a, 0xc3, 0x2e, 0x46, + 0x29, 0x59, 0x8d, 0xb9, 0x61, 0x5a, 0x18, 0x96, 0x1b, 0xaa, 0x7a, 0xea, + 0x45, 0xfe, 0x93, 0x17, 0xb8, 0xb8, 0x6c, 0x42, 0xb3, 0xc8, 0xf5, 0x70, + 0xe3, 0x98, 0x46, 0xf4, 0x73, 0x61, 0x32, 0xee, 0x8e, 0xb1, 0x9f, 0x12, + 0xf5, 0xeb, 0xf3, 0xb7, 0xbc, 0x94, 0x32, 0xa5, 0xc4, 0xd0, 0x60, 0x95, + 0x0d, 0xd8, 0x11, 0x15, 0xad, 0xde, 0xea, 0xb5, 0xcb, 0xdb, 0x27, 0xe4, + 0x0f, 0x55, 0x1f, 0x65, 0xff, 0x02, 0xa3, 0xb8, 0xfb, 0x38, 0xe9, 0x7e, + 0x3d, 0x1f, 0xb9, 0xce, 0x06, 0xa2, 0x57, 0xca, 0x7a, 0xd9, 0x18, 0x90, + 0xef, 0xc3, 0x52, 0x50, 0xfc, 0xa6, 0xbf, 0x84, 0x7b, 0x8e, 0xd8, 0xf7, + 0x9c, 0x6a, 0x53, 0xc9, 0x10, 0x81, 0x9c, 0xd4, 0x67, 0xef, 0x1c, 0xd2, + 0x58, 0x86, 0x66, 0x45, 0x00, 0x26, 0x78, 0x11, 0x61, 0x53, 0xce, 0x50, + 0xda, 0x0c, 0xd4, 0x1b, 0xf7, 0xa4, 0xda, 0xab, 0xe2, 0x93, 0xed, 0x8b, + 0x34, 0x42, 0x3a, 0xf4, 0xae, 0x3d, 0x46, 0x86, 0xea, 0xae, 0x5f, 0xc2, + 0x5a, 0xd8, 0xed, 0x63, 0xc4, 0xd6, 0x51, 0x57, 0x81, 0xea, 0x59, 0x68, + 0xf4, 0x58, 0x8c, 0x02, 0xfc, 0xe9, 0x0a, 0x96, 0xa7, 0x29, 0xac, 0xaf, + 0xf2, 0x39, 0x17, 0xc8, 0xe0, 0x6e, 0x2a, 0x6d, 0x21, 0xb8, 0xf4, 0xe9, + 0x02, 0xd7, 0xbf, 0xc7, 0xf6, 0x09, 0xa0, 0x67, 0x6c, 0x98, 0x83, 0xba, + 0x7b, 0x71, 0x4a, 0xea, 0x3d, 0x89, 0x0a, 0x0e, 0x86, 0x6b, 0xa0, 0x25, + 0xf1, 0x3a, 0xa3, 0x66, 0x54, 0xff, 0x83, 0x2b, 0xca, 0x1f, 0x74, 0x1d, + 0x31, 0x93, 0xfc, 0x84, 0xcd, 0x3c, 0xc9, 0xf0, 0x7e, 0x39, 0xb7, 0x95, + 0xf7, 0xa9, 0x9c, 0x67, 0xcf, 0x10, 0x0f, 0xef, 0xc7, 0xce, 0x6b, 0x46, + 0x4c, 0x2a, 0x07, 0xb4, 0x24, 0xc4, 0x0a, 0x66, 0xaa, 0x35, 0xbb, 0xe9, + 0xe5, 0xcf, 0xd4, 0x19, 0x6b, 0x37, 0x56, 0x54, 0x34, 0x73, 0xd2, 
0x63, + 0x5f, 0x43, 0xb7, 0x6b, 0xa7, 0x6f, 0x96, 0x2a, 0x58, 0xfc, 0xcd, 0x4d, + 0x8c, 0x15, 0x6c, 0x4c, 0x28, 0x8b, 0x68, 0xd1, 0xa1, 0x73, 0xb2, 0x72, + 0xa8, 0x42, 0x82, 0x2f, 0x2c, 0xa3, 0xff, 0x02, 0x4d, 0x84, 0xce, 0xb6, + 0x13, 0x7b, 0xd3, 0x7a, 0xdc, 0x62, 0xe1, 0xdc, 0xdb, 0x6b, 0xb0, 0x08, + 0x69, 0xe6, 0xbc, 0x53, 0xb2, 0xb0, 0xf7, 0x57, 0xe9, 0x7f, 0x52, 0x97, + 0xcc, 0xbd, 0x65, 0x65, 0x77, 0xe0, 0x0a, 0x5a, 0xb0, 0xbb, 0xc7, 0x6e, + 0x11, 0x4a, 0x0b, 0x62, 0x0c, 0x9d, 0xaa, 0xff, 0x82, 0x1f, 0x0c, 0xd9, + 0xe4, 0xf1, 0x9c, 0x32, 0x9d, 0xdf, 0x48, 0xde, 0x14, 0xe9, 0xb1, 0x00, + 0xd9, 0xb4, 0xd4, 0xb5, 0xd4, 0x45, 0x59, 0xd6, 0x55, 0x8a, 0x28, 0xc6, + 0xf7, 0xdc, 0xe3, 0x73, 0x77, 0xec, 0xc0, 0xfa, 0x02, 0x0b, 0xd6, 0x62, + 0x87, 0x58, 0xfe, 0x3c, 0x2c, 0x8b, 0x2a, 0xaf, 0x58, 0x33, 0x93, 0x91, + 0x61, 0x9e, 0xf6, 0x35, 0x54, 0x70, 0x84, 0x22, 0x96, 0x64, 0x71, 0xd4, + 0xde, 0x8a, 0x18, 0xd5, 0x43, 0xa7, 0x6e, 0x31, 0xb2, 0x9f, 0x88, 0x41, + 0x93, 0x7d, 0x4e, 0x61, 0x17, 0x39, 0xe7, 0x1f, 0x4f, 0xa2, 0x21, 0x40, + 0x2d, 0x2b, 0x56, 0xb4, 0xbd, 0xf7, 0x7c, 0xef, 0x0a, 0x70, 0xa8, 0x56, + 0x71, 0x87, 0x6a, 0x26, 0xb9, 0xfe, 0xba, 0x9c, 0x72, 0x74, 0x7a, 0x8b, + 0xa3, 0xaa, 0x0e, 0x19, 0x29, 0x2d, 0x84, 0x8c, 0xa7, 0xd2, 0xd4, 0x27, + 0x30, 0x3b, 0x67, 0xb5, 0xb7, 0xc3, 0x02, 0x17, 0x30, 0x3b, 0x47, 0x6e, + 0xc2, 0xf3, 0xf7, 0xfd, 0x41, 0x76, 0x99, 0xec, 0x92, 0xb8, 0xbd, 0xf3, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x06, 0x0f, 0x16, 0x20, 0x24, 0x28}; +const uint8_t kExpectedCase2Signature[MLDSA65_SIGNATURE_BYTES] = { + 0xef, 0xbf, 0xd8, 0x29, 0xd3, 0x48, 0xaf, 0x86, 0x0d, 0xe5, 0x31, 0xde, + 0xfc, 0xe8, 0x90, 0xc3, 0x1d, 0x9a, 0xda, 0x49, 0xd4, 0xb2, 0xab, 0x5e, + 0x19, 0x8e, 0x31, 0x6f, 0x73, 0x3f, 0x7f, 0x77, 0xf8, 0xda, 0x71, 0x90, + 0xf9, 0xad, 0x47, 0xdd, 0x9f, 0xed, 0xb9, 0x79, 0xbc, 0x70, 0xf7, 0x80, + 0x43, 0xa1, 0xf1, 0x68, 0x71, 0x25, 0x05, 0x54, 0x1f, 0x91, 0xc3, 0xd9, + 0xc1, 0xad, 
0xbc, 0xb9, 0xa2, 0xd9, 0x80, 0x8d, 0xf4, 0xe4, 0xd2, 0xa6, + 0x63, 0x6b, 0x94, 0x5f, 0x16, 0xf4, 0xe1, 0xe6, 0x28, 0x6d, 0x8a, 0xb9, + 0x58, 0x72, 0xb7, 0xda, 0x42, 0xbd, 0xa7, 0xaa, 0x85, 0x72, 0x1e, 0xa6, + 0xe9, 0xf0, 0x57, 0x13, 0x98, 0x13, 0x88, 0x23, 0x81, 0x41, 0xe4, 0x8b, + 0x8b, 0x77, 0x50, 0x98, 0x20, 0x28, 0xc5, 0x0b, 0x66, 0x16, 0x3b, 0xad, + 0xf5, 0xf4, 0xd5, 0xe7, 0xe5, 0x99, 0xab, 0x21, 0x9b, 0x0f, 0xbd, 0x02, + 0x4c, 0x46, 0x25, 0xc4, 0x2f, 0x14, 0x35, 0x52, 0x12, 0x70, 0xe3, 0x22, + 0xa9, 0xa1, 0x99, 0x44, 0x40, 0xeb, 0xb3, 0x8d, 0xc7, 0xec, 0x42, 0x55, + 0x10, 0xe9, 0x6b, 0xbb, 0x20, 0xb7, 0x1d, 0xc4, 0x23, 0x11, 0x91, 0x2b, + 0x41, 0x3c, 0xc7, 0xe2, 0xda, 0xdf, 0xb6, 0x0a, 0xac, 0x43, 0x2e, 0xfe, + 0x6e, 0x01, 0x8f, 0xb7, 0x75, 0x4f, 0xa6, 0x2e, 0x1e, 0xff, 0x04, 0xfe, + 0x06, 0x73, 0xed, 0xba, 0x96, 0xad, 0x9f, 0xff, 0xb1, 0xea, 0x89, 0xfa, + 0x9f, 0xc9, 0xae, 0x26, 0x68, 0xec, 0xe3, 0x93, 0xaa, 0x35, 0xe6, 0xc6, + 0x1b, 0x99, 0xf3, 0xc1, 0xef, 0x8f, 0x56, 0xf4, 0xd9, 0x1f, 0xf3, 0xe0, + 0x96, 0xe3, 0x77, 0x76, 0x90, 0xbc, 0xba, 0xcf, 0x3f, 0x2b, 0x24, 0xda, + 0x48, 0xf5, 0x7f, 0x1d, 0x58, 0xe9, 0x69, 0x3c, 0xd6, 0x0b, 0x21, 0x7f, + 0xad, 0x7e, 0xc7, 0xce, 0x08, 0x52, 0x6c, 0x7a, 0xb9, 0xaf, 0xdb, 0x4f, + 0xc4, 0x56, 0x05, 0x3a, 0x3d, 0x94, 0xa0, 0xdc, 0x92, 0x14, 0x20, 0x3c, + 0xf1, 0x2a, 0xf7, 0x9f, 0xab, 0x6f, 0x1b, 0x7a, 0xb1, 0x0e, 0x7d, 0x72, + 0xdc, 0x65, 0xb1, 0x10, 0xa5, 0x1f, 0x83, 0x9d, 0x4c, 0x2a, 0x37, 0x4a, + 0x0d, 0xaa, 0xd4, 0x35, 0x13, 0x3e, 0x68, 0x24, 0x12, 0x31, 0x80, 0xbc, + 0x9a, 0x1d, 0x4a, 0xea, 0x31, 0x50, 0x7d, 0x57, 0x95, 0x56, 0x3c, 0x6a, + 0x5e, 0xbf, 0x82, 0x47, 0x53, 0x5d, 0x38, 0xce, 0x2b, 0xac, 0x03, 0xe6, + 0x50, 0xfd, 0x6a, 0x6a, 0xbb, 0x79, 0x83, 0xee, 0xe2, 0x10, 0xef, 0xb1, + 0xc1, 0x2b, 0x01, 0x9d, 0xdc, 0xae, 0x2d, 0xaf, 0x18, 0x48, 0x6d, 0xd2, + 0x00, 0x62, 0x96, 0x9e, 0x9d, 0xf9, 0x1e, 0xa5, 0xbc, 0x8e, 0x5f, 0x10, + 0x08, 0x63, 0x87, 0x8d, 0x18, 0x4f, 0xd3, 0x8f, 0x99, 0xa7, 0xb6, 0x80, + 0x4d, 0xf2, 
0x44, 0x22, 0x0b, 0x2e, 0x5c, 0x78, 0x36, 0x93, 0xcb, 0x68, + 0x17, 0x00, 0xfc, 0x85, 0x89, 0xd4, 0x3b, 0x1e, 0xed, 0x69, 0xa8, 0xa3, + 0x3d, 0xd1, 0x79, 0x7d, 0x18, 0xd8, 0xea, 0xde, 0xf8, 0x4f, 0xd7, 0x82, + 0x84, 0x3e, 0x57, 0xf7, 0x6e, 0x6c, 0x7a, 0xd7, 0xa2, 0x35, 0x9b, 0xe0, + 0x81, 0xfb, 0xcf, 0x3e, 0xd5, 0xbf, 0x13, 0x25, 0x9d, 0x1e, 0x0d, 0x85, + 0x69, 0x17, 0xbc, 0x45, 0xfd, 0xe1, 0xb9, 0x1c, 0xbb, 0x48, 0x46, 0x9d, + 0x4e, 0x99, 0x0c, 0x8c, 0xba, 0xb0, 0x3d, 0x6f, 0x6d, 0xea, 0x62, 0xb4, + 0x70, 0xfb, 0xf9, 0xae, 0x20, 0x44, 0xfc, 0x5d, 0x61, 0xce, 0x7d, 0x1b, + 0x44, 0xae, 0x20, 0xee, 0x0f, 0x9f, 0xaa, 0xa1, 0x3e, 0x6b, 0x53, 0xb4, + 0x2e, 0x44, 0x2b, 0xdc, 0x83, 0xa5, 0x2a, 0xd5, 0xda, 0x23, 0x10, 0xbf, + 0x3f, 0xb1, 0x3b, 0xa8, 0x0f, 0x01, 0x06, 0xcd, 0xff, 0x01, 0x3e, 0x33, + 0x98, 0xb8, 0xf7, 0x26, 0x31, 0x19, 0x07, 0x96, 0x83, 0x94, 0x67, 0x70, + 0xac, 0xeb, 0xb1, 0x2f, 0x42, 0x63, 0x36, 0xf2, 0xf4, 0x13, 0x61, 0x72, + 0x3e, 0x70, 0xbb, 0xfe, 0xe5, 0x11, 0xc5, 0x77, 0x2f, 0xa2, 0x12, 0xa3, + 0x19, 0xff, 0xea, 0x9c, 0xf2, 0xcf, 0x6a, 0x49, 0x09, 0x00, 0x54, 0x1e, + 0xae, 0x8c, 0xb3, 0xfe, 0x12, 0x68, 0x24, 0x04, 0x7c, 0x48, 0x84, 0x29, + 0xb3, 0x5f, 0x71, 0x45, 0xc7, 0x62, 0xcf, 0xaf, 0xca, 0xe2, 0x8c, 0xd5, + 0xb5, 0xc9, 0x82, 0x69, 0x98, 0x0d, 0x16, 0x58, 0x31, 0xb3, 0x38, 0x68, + 0x20, 0x82, 0xd5, 0x51, 0xb6, 0xb3, 0x57, 0xb0, 0xa6, 0x7c, 0x6d, 0xf3, + 0x82, 0x85, 0xc6, 0xf2, 0x06, 0x67, 0x40, 0x4f, 0x0c, 0x58, 0xfb, 0xc2, + 0xa2, 0x73, 0x00, 0x38, 0x00, 0xcc, 0x70, 0x93, 0x6d, 0x6c, 0x3b, 0xd4, + 0x4f, 0xcf, 0xb5, 0x1a, 0x26, 0x4e, 0xdb, 0x43, 0xf1, 0x02, 0x9c, 0xe9, + 0xe6, 0x0d, 0x1b, 0xa2, 0xae, 0xaf, 0xee, 0x48, 0x21, 0x50, 0x9c, 0xfe, + 0xe7, 0xa1, 0x0a, 0x14, 0xf2, 0x59, 0x15, 0x04, 0xd1, 0x8f, 0x37, 0xfa, + 0x22, 0x7c, 0xe0, 0x91, 0x83, 0xc8, 0x5d, 0x10, 0xcc, 0xaa, 0x38, 0x03, + 0x4b, 0xb4, 0xee, 0xea, 0x7b, 0x1b, 0x9a, 0xfd, 0x92, 0x1b, 0x35, 0x5a, + 0x0e, 0xa9, 0xfe, 0xa9, 0x11, 0x42, 0xc1, 0xc9, 0x3d, 0xb8, 0x06, 0xcb, + 0x39, 0x59, 
0xb0, 0xb2, 0xf2, 0x2d, 0x25, 0x98, 0xf5, 0xde, 0x94, 0xd3, + 0x9e, 0xcd, 0x39, 0xbc, 0x6a, 0xae, 0x85, 0x62, 0x36, 0x33, 0x7b, 0xe1, + 0x2f, 0x0f, 0x77, 0x25, 0x7a, 0xe1, 0x88, 0xb4, 0xcb, 0x6a, 0x5f, 0x2a, + 0x15, 0x02, 0x0b, 0xbd, 0xe2, 0x0c, 0x7b, 0xeb, 0x15, 0xe3, 0x02, 0xd9, + 0x56, 0x78, 0xcb, 0x71, 0x07, 0xc9, 0x40, 0xa6, 0xe3, 0x96, 0x79, 0xa2, + 0xb3, 0xe5, 0xe0, 0xfe, 0x78, 0xf8, 0xa1, 0x2a, 0x5d, 0xa8, 0x5d, 0x43, + 0x08, 0x52, 0xf1, 0xcd, 0x4e, 0x13, 0xc5, 0x85, 0x8d, 0x81, 0xa3, 0x18, + 0x1a, 0x23, 0x03, 0x85, 0xf4, 0x13, 0x60, 0xf5, 0x31, 0x02, 0xfb, 0xb9, + 0x6c, 0xe5, 0x06, 0x6b, 0xf3, 0x78, 0xf9, 0x03, 0x9c, 0x62, 0xf1, 0xcc, + 0xeb, 0xe5, 0x97, 0xe8, 0xf7, 0xb2, 0x70, 0xbd, 0xca, 0x37, 0x7c, 0x15, + 0xa3, 0x2f, 0xce, 0x0b, 0xb8, 0xd2, 0xd7, 0x9d, 0xea, 0x96, 0xc7, 0x49, + 0xf6, 0xc1, 0x21, 0x7b, 0x7c, 0x41, 0x48, 0x71, 0x71, 0x21, 0xfa, 0x72, + 0x41, 0x48, 0x79, 0x80, 0x4e, 0x9a, 0xd6, 0xbc, 0x9e, 0x21, 0x76, 0x08, + 0x78, 0x33, 0xe3, 0x84, 0x5a, 0xcb, 0x0d, 0xc6, 0x59, 0x73, 0xc8, 0x9e, + 0x11, 0x55, 0x7b, 0x67, 0xff, 0x1d, 0xa5, 0x2a, 0x1f, 0x8c, 0x1d, 0x58, + 0x82, 0x1f, 0xde, 0x56, 0x79, 0x36, 0xe0, 0x80, 0x8a, 0xe2, 0xc0, 0xbc, + 0x56, 0x7d, 0xdd, 0xc1, 0x58, 0xae, 0x14, 0xe2, 0x50, 0x51, 0x8b, 0xc2, + 0xf8, 0xde, 0x6c, 0x81, 0xb2, 0x20, 0xfe, 0x09, 0x8d, 0x01, 0xb4, 0xf5, + 0xf0, 0xf6, 0x6c, 0xc9, 0x98, 0xcf, 0x86, 0x86, 0xf8, 0x9c, 0x7a, 0x0e, + 0x76, 0x38, 0xf3, 0x84, 0x48, 0xaf, 0x1f, 0x10, 0x08, 0x14, 0x43, 0xbb, + 0x04, 0x34, 0x8b, 0x72, 0x23, 0x9c, 0x81, 0x54, 0x36, 0x0f, 0x72, 0x7e, + 0xae, 0xac, 0xe9, 0x14, 0x11, 0x80, 0x40, 0xbd, 0x5d, 0xe0, 0xcd, 0xe9, + 0xea, 0x91, 0x63, 0x99, 0x22, 0x2e, 0x0a, 0x3a, 0x09, 0x06, 0xb7, 0xe5, + 0x01, 0x3d, 0xc2, 0xb7, 0x8f, 0xaf, 0x6e, 0x6a, 0xd1, 0xd5, 0x64, 0x54, + 0x69, 0x22, 0x9f, 0x4e, 0xb1, 0x7a, 0xd6, 0x7e, 0x2b, 0xb1, 0xac, 0x54, + 0xb6, 0x45, 0x70, 0x22, 0xa0, 0x7c, 0xdf, 0x59, 0xba, 0x9a, 0x80, 0x83, + 0xdd, 0x58, 0x16, 0xbc, 0x13, 0x23, 0xee, 0x14, 0x9d, 0x1c, 0xd0, 0x50, + 0x6d, 0x7f, 
0x16, 0xab, 0x59, 0x54, 0xe4, 0xdb, 0x51, 0xbf, 0x7a, 0x33, + 0xf1, 0x58, 0xad, 0xd8, 0xcb, 0xe3, 0x16, 0xa5, 0x22, 0xc1, 0xe8, 0x5a, + 0x79, 0x09, 0x8a, 0x54, 0x86, 0x53, 0xd7, 0x20, 0x79, 0x30, 0x34, 0x5e, + 0x18, 0x66, 0x8e, 0x3f, 0x25, 0x02, 0x46, 0x31, 0x59, 0x1a, 0x97, 0x04, + 0x1f, 0x81, 0x2d, 0xdf, 0x11, 0xeb, 0xb0, 0xe2, 0x20, 0xcc, 0x24, 0x25, + 0x3a, 0x04, 0x06, 0x6d, 0xf5, 0x6a, 0xf0, 0x5d, 0x6f, 0xb6, 0x77, 0x47, + 0xf2, 0xfb, 0x50, 0x8d, 0xf8, 0xc4, 0xdf, 0x5d, 0x7d, 0xb2, 0x36, 0x95, + 0xe1, 0x63, 0x78, 0x29, 0x09, 0xce, 0xe1, 0x34, 0xaf, 0x8f, 0x87, 0x85, + 0x09, 0xd5, 0x0b, 0xc0, 0x25, 0x8c, 0x5f, 0x9a, 0xc8, 0xf0, 0xf7, 0x32, + 0x08, 0xae, 0x32, 0xbe, 0x9f, 0xf3, 0xc9, 0x64, 0x1f, 0x09, 0xe4, 0x9e, + 0x96, 0xbb, 0xed, 0x32, 0x0c, 0x43, 0xad, 0x86, 0x06, 0xc2, 0xb6, 0x52, + 0xce, 0xf0, 0x60, 0x4e, 0x53, 0xf7, 0x36, 0xa0, 0x37, 0x13, 0xcd, 0x9f, + 0xfc, 0xf2, 0x4b, 0x69, 0xf2, 0x3a, 0xa4, 0xa0, 0x7c, 0x4f, 0xbc, 0x1f, + 0xd0, 0x5a, 0xf6, 0x8e, 0x86, 0xe6, 0x4f, 0xe2, 0xc8, 0xb8, 0x4a, 0xb5, + 0xb5, 0xec, 0x5c, 0x5d, 0x06, 0xd2, 0xd8, 0xf0, 0xb9, 0x22, 0xd3, 0x6d, + 0x6a, 0x26, 0xae, 0x09, 0xbe, 0x1a, 0x9a, 0x80, 0xdf, 0x6c, 0x29, 0xa0, + 0xa2, 0xa2, 0xbc, 0xb9, 0xbf, 0xd9, 0xac, 0x10, 0xc5, 0x5d, 0x3d, 0xe4, + 0x89, 0x12, 0x99, 0x4e, 0xab, 0x7b, 0x6a, 0x1c, 0xd4, 0x60, 0x20, 0x20, + 0x91, 0xfe, 0xf4, 0x2f, 0x0f, 0xfa, 0x5a, 0x77, 0xd0, 0x4c, 0x72, 0x3e, + 0x20, 0x14, 0xbf, 0x40, 0xb4, 0x2d, 0x7f, 0x10, 0x93, 0x77, 0x73, 0xb7, + 0x5d, 0xce, 0x64, 0x01, 0xe8, 0x7c, 0xc3, 0xae, 0xdc, 0xc6, 0x91, 0x11, + 0xb0, 0x4c, 0x00, 0x2a, 0xdb, 0xa8, 0xbb, 0xa9, 0x3e, 0x0d, 0x2a, 0x8b, + 0x75, 0x93, 0x1b, 0xcb, 0xb6, 0xc1, 0xcd, 0x33, 0xf7, 0x5f, 0x64, 0xe7, + 0xb4, 0x07, 0x7d, 0xdf, 0x9d, 0x1b, 0x4e, 0x38, 0xc1, 0x4e, 0xe0, 0xa4, + 0x18, 0xab, 0xdc, 0x7c, 0x33, 0x50, 0xdb, 0xd5, 0x33, 0xbb, 0xb9, 0x74, + 0x6e, 0xa5, 0x9f, 0x93, 0x6a, 0x4b, 0x8a, 0xf6, 0x6f, 0x10, 0xfa, 0x85, + 0xe0, 0x72, 0xfa, 0x58, 0x25, 0x79, 0x38, 0xe0, 0xfa, 0x80, 0xde, 0x35, + 0xe7, 0x4e, 
0x37, 0x54, 0x5a, 0xf8, 0xb9, 0x77, 0x15, 0xc7, 0xa1, 0x6f, + 0x91, 0x98, 0x1d, 0x3d, 0x8c, 0xd4, 0x5b, 0xe9, 0x56, 0x20, 0x87, 0x2a, + 0x6a, 0x6f, 0xd1, 0x88, 0x02, 0x16, 0x46, 0x7b, 0x96, 0x03, 0x99, 0x17, + 0x51, 0x1a, 0x74, 0x9d, 0x13, 0x5f, 0xb2, 0xa6, 0xf2, 0xf6, 0x6f, 0x8c, + 0xb8, 0xd7, 0x3c, 0x41, 0xd8, 0x51, 0xb4, 0x4d, 0x70, 0x22, 0xb6, 0x93, + 0x76, 0xab, 0x82, 0x49, 0x76, 0x8b, 0xe2, 0x99, 0x3d, 0x25, 0x97, 0x74, + 0x8b, 0x8b, 0xd6, 0xdd, 0xab, 0xf7, 0x0d, 0xa1, 0xc9, 0x96, 0xb8, 0xfa, + 0xcb, 0xfb, 0x2c, 0xb6, 0xe5, 0x60, 0x7d, 0x7f, 0x7c, 0x4b, 0x05, 0x5b, + 0xe6, 0xee, 0xc5, 0x7c, 0x60, 0xba, 0x66, 0x7a, 0xc1, 0xc6, 0x9c, 0xce, + 0xae, 0xa7, 0x93, 0xb7, 0x5a, 0xc8, 0x7f, 0x9d, 0xaa, 0xe2, 0xc0, 0x4b, + 0xb8, 0x05, 0x1d, 0x68, 0x00, 0x17, 0x2e, 0x8e, 0xad, 0xe4, 0x01, 0xa7, + 0x82, 0x96, 0xd3, 0x31, 0x0e, 0xd3, 0x8f, 0xae, 0x83, 0xd3, 0xab, 0xa4, + 0xd2, 0x5e, 0x45, 0x99, 0x47, 0x97, 0x8b, 0x88, 0xcf, 0x0f, 0xa2, 0x7b, + 0x11, 0xc3, 0xb5, 0x5a, 0x39, 0x2a, 0x39, 0x7a, 0x57, 0x09, 0x0b, 0x1e, + 0x3e, 0xe7, 0x08, 0x89, 0xee, 0xeb, 0x0c, 0xce, 0x14, 0x60, 0x33, 0x1e, + 0xa1, 0x51, 0xf2, 0x72, 0x3c, 0xe3, 0xb1, 0xd5, 0x2a, 0x11, 0x96, 0xe9, + 0x1c, 0x40, 0xcd, 0x65, 0x01, 0x2a, 0xc5, 0x75, 0xea, 0x28, 0xfd, 0x8b, + 0xf6, 0x45, 0xd6, 0x4c, 0xde, 0x31, 0x7f, 0xa4, 0xfc, 0x8f, 0x9c, 0xd9, + 0x4a, 0xf2, 0xbc, 0xf6, 0x76, 0xdd, 0xef, 0xc4, 0x44, 0xd4, 0x16, 0xac, + 0x79, 0x44, 0x63, 0xb2, 0x0b, 0x9e, 0x73, 0x49, 0x5f, 0x2b, 0xc5, 0x5a, + 0xc7, 0x0a, 0xe3, 0x18, 0xd0, 0x49, 0xf6, 0x4d, 0x8d, 0x23, 0x22, 0xa9, + 0xa7, 0x07, 0x37, 0xa0, 0x1b, 0xf3, 0x81, 0xf2, 0xb1, 0x03, 0x16, 0x5c, + 0x5a, 0x0a, 0xe6, 0xa5, 0x26, 0x5c, 0xaf, 0x03, 0x16, 0x89, 0x4b, 0xe3, + 0x93, 0xcc, 0x1d, 0xc5, 0x67, 0x30, 0x66, 0xbf, 0x3d, 0x1b, 0xc9, 0x80, + 0xdb, 0x7c, 0xfa, 0xeb, 0x79, 0x32, 0x69, 0x61, 0x90, 0x21, 0xbb, 0x35, + 0x2a, 0xb9, 0x94, 0xc2, 0xfa, 0x8a, 0xe2, 0x1c, 0xb2, 0xc5, 0x77, 0x7d, + 0xd0, 0x0e, 0x20, 0xa3, 0x39, 0xd1, 0x2b, 0x5f, 0x97, 0x82, 0xf0, 0x9e, + 0x67, 0xe3, 
0x15, 0xc8, 0x36, 0x2e, 0x75, 0x20, 0x90, 0x24, 0x61, 0xd4, + 0x32, 0xc6, 0x97, 0x41, 0xf0, 0xb4, 0x0e, 0xa2, 0xa2, 0x1e, 0x2b, 0x77, + 0xc2, 0x4c, 0x17, 0x35, 0xe1, 0x1a, 0x15, 0x1e, 0x3f, 0xf7, 0xf1, 0x3b, + 0x91, 0x77, 0xe9, 0x84, 0x73, 0x2c, 0x61, 0xe9, 0x41, 0x45, 0x56, 0xd1, + 0x3b, 0xe4, 0x54, 0x79, 0x15, 0x75, 0x51, 0xe2, 0xbe, 0xed, 0x32, 0xd0, + 0xef, 0x34, 0xf1, 0x1f, 0x2e, 0xa8, 0x8e, 0x6b, 0x59, 0x65, 0xb1, 0x2a, + 0x54, 0xf7, 0x4c, 0x1a, 0x6b, 0x3c, 0x18, 0xeb, 0x97, 0x3b, 0x94, 0xd4, + 0xfe, 0xe4, 0xae, 0x76, 0x2c, 0xf0, 0xbb, 0x4a, 0x63, 0xd6, 0x87, 0xe0, + 0x94, 0xee, 0xc9, 0x5a, 0xba, 0x80, 0x28, 0x5e, 0x6b, 0x14, 0x73, 0x7b, + 0xe6, 0x4b, 0x0a, 0x81, 0x80, 0x87, 0x68, 0xe0, 0xcb, 0xb8, 0x20, 0x8e, + 0x47, 0x69, 0xd2, 0x39, 0x10, 0x8b, 0xc7, 0x29, 0x41, 0x6b, 0x8d, 0xe5, + 0x29, 0xd9, 0xad, 0xb1, 0x69, 0x8c, 0xc2, 0xa1, 0x50, 0xd2, 0xe3, 0x80, + 0xe4, 0x09, 0xa2, 0xd6, 0xae, 0x75, 0x65, 0xca, 0x45, 0x75, 0xe9, 0xab, + 0x2c, 0x99, 0xc6, 0xe9, 0x04, 0xd0, 0xcc, 0x9d, 0xea, 0xc7, 0x85, 0x4c, + 0xb2, 0xb1, 0x6f, 0x49, 0xf1, 0xe5, 0x51, 0xce, 0xa6, 0x7e, 0x68, 0xe5, + 0x2b, 0xbe, 0xed, 0x21, 0x12, 0x9f, 0x3b, 0x03, 0x6f, 0x84, 0xd6, 0x47, + 0x27, 0x76, 0x7f, 0x7b, 0xb8, 0x45, 0x8f, 0xa0, 0xff, 0xb1, 0xa8, 0x8e, + 0x4a, 0x33, 0x3b, 0xfe, 0x5a, 0x37, 0x90, 0xe7, 0x37, 0x7b, 0xfc, 0xf4, + 0xb3, 0x5f, 0x1c, 0xf7, 0x77, 0xb7, 0x09, 0x77, 0xa1, 0x4d, 0x4e, 0x59, + 0xcb, 0x8d, 0x61, 0x8f, 0xee, 0x5a, 0xfb, 0xc0, 0x0b, 0x4e, 0x05, 0x76, + 0xa0, 0xd0, 0xc8, 0x8e, 0x2c, 0xba, 0x11, 0xde, 0xa0, 0xc9, 0x69, 0xf4, + 0x53, 0xda, 0xc7, 0xda, 0x13, 0x62, 0x94, 0xe4, 0x2f, 0x73, 0x21, 0xca, + 0xaf, 0x3f, 0xd2, 0xaf, 0xbe, 0x8e, 0xea, 0x8f, 0xd6, 0x88, 0x74, 0x94, + 0x95, 0x42, 0x04, 0xb4, 0x4c, 0x9a, 0xad, 0xf6, 0x7f, 0xe0, 0xad, 0xf4, + 0x86, 0x52, 0x09, 0x36, 0x02, 0xb8, 0x61, 0x43, 0xe5, 0x2c, 0xd0, 0xd5, + 0x59, 0xc0, 0xf9, 0x99, 0x7c, 0x76, 0xc3, 0xb2, 0xb6, 0x59, 0x9a, 0x4f, + 0xd1, 0x0c, 0xc4, 0x76, 0x86, 0xfc, 0xd5, 0x23, 0x3e, 0x3c, 0x07, 0x9b, + 0x23, 0xac, 
0x64, 0x58, 0x91, 0xb8, 0xd6, 0x96, 0xdc, 0x77, 0xaf, 0x32, + 0x32, 0x38, 0xda, 0xeb, 0xc5, 0x43, 0xb6, 0x96, 0x70, 0xbb, 0x63, 0x46, + 0x0b, 0xfd, 0xe5, 0x93, 0xdc, 0xb2, 0xaf, 0x80, 0x94, 0xb8, 0xd1, 0x94, + 0x59, 0x1e, 0xcb, 0x2d, 0xa5, 0xa2, 0xd6, 0xcd, 0x1a, 0x23, 0xd5, 0x61, + 0x03, 0xa7, 0x49, 0xc7, 0xb5, 0xd5, 0x1a, 0x16, 0x60, 0xc6, 0xc7, 0x94, + 0xbe, 0x79, 0x99, 0xf0, 0x11, 0x4f, 0x4b, 0x03, 0xdc, 0xfc, 0xb2, 0xd4, + 0x7a, 0xf8, 0xe0, 0xa4, 0x77, 0xd6, 0xa4, 0x01, 0xe2, 0x1f, 0xf1, 0x9b, + 0xdb, 0xba, 0x8b, 0x42, 0xe6, 0x5e, 0xa6, 0x89, 0x11, 0xab, 0xf1, 0xe9, + 0xad, 0x7b, 0x58, 0xfa, 0x68, 0x18, 0x17, 0xff, 0xe3, 0xe2, 0xa0, 0x94, + 0xb5, 0xf0, 0x72, 0x40, 0x65, 0x12, 0xfb, 0x11, 0xc9, 0x56, 0x56, 0xce, + 0xb1, 0x86, 0x5c, 0x50, 0x74, 0x49, 0x96, 0xf8, 0x43, 0x52, 0x9c, 0xaa, + 0xd2, 0x39, 0x84, 0xb3, 0x02, 0xc5, 0xa5, 0x8f, 0xbd, 0x6c, 0xc4, 0x74, + 0xf2, 0xc1, 0x0a, 0x08, 0xd0, 0xd9, 0x69, 0xff, 0xf2, 0x80, 0x50, 0x35, + 0x63, 0x88, 0x45, 0x0b, 0x8c, 0xd2, 0x48, 0x55, 0x0d, 0xba, 0xa9, 0x4c, + 0x7e, 0xd2, 0xaa, 0xb0, 0x7d, 0x46, 0x60, 0x15, 0x23, 0x4b, 0xa6, 0x4d, + 0x68, 0x85, 0x31, 0x12, 0xc5, 0x58, 0xdc, 0x2f, 0x26, 0x51, 0xf2, 0x96, + 0x7e, 0x2b, 0x67, 0xad, 0x66, 0x80, 0x33, 0x87, 0x33, 0x71, 0x52, 0x5d, + 0xd9, 0x4a, 0x1e, 0x23, 0x95, 0x70, 0xec, 0x85, 0x2d, 0x1d, 0x94, 0x33, + 0x40, 0xca, 0xef, 0x61, 0x3f, 0x77, 0x38, 0x65, 0x32, 0x9d, 0x94, 0x00, + 0xd6, 0x12, 0x4a, 0x37, 0x20, 0xe3, 0xc8, 0xc7, 0xdc, 0x59, 0x9b, 0x43, + 0xbe, 0x97, 0x33, 0x66, 0xc0, 0xc9, 0xa5, 0xbc, 0xfe, 0xec, 0x02, 0xf6, + 0x52, 0x18, 0x64, 0x24, 0x0c, 0x55, 0xc7, 0x0f, 0x62, 0x56, 0x45, 0xa2, + 0x7c, 0x93, 0xa3, 0xea, 0x0f, 0x87, 0xd1, 0xba, 0x23, 0x42, 0x80, 0xd9, + 0xc2, 0xb3, 0xd6, 0x25, 0x73, 0x10, 0x5b, 0x1a, 0xc1, 0xed, 0x4f, 0xa2, + 0x8d, 0x8e, 0x4d, 0x1b, 0x55, 0x64, 0xfd, 0xe3, 0x89, 0xab, 0xd6, 0xe6, + 0x45, 0x49, 0x6f, 0x91, 0x68, 0xb9, 0x26, 0xa4, 0xdd, 0x9f, 0x55, 0xa0, + 0x5f, 0x35, 0x69, 0x27, 0xd1, 0xb6, 0x5a, 0x3f, 0xcc, 0x52, 0x40, 0xad, + 0xda, 0x10, 
0x8d, 0x2f, 0xd6, 0x8c, 0x01, 0xe1, 0x32, 0x73, 0xe1, 0x32, + 0xc3, 0xe7, 0x7e, 0xac, 0xba, 0x8c, 0xac, 0x38, 0x8f, 0x02, 0x0d, 0x67, + 0xa9, 0xc4, 0xab, 0x4e, 0xb5, 0x31, 0x77, 0xa2, 0x12, 0xd6, 0xaa, 0x58, + 0x57, 0xe7, 0xeb, 0xa6, 0x66, 0xd7, 0x36, 0xdc, 0x5c, 0x21, 0x86, 0xa4, + 0x76, 0x0a, 0x9c, 0x27, 0xb8, 0xeb, 0x0c, 0xc7, 0x2c, 0xf5, 0xe5, 0x49, + 0x8c, 0x62, 0x39, 0x82, 0xc0, 0xc4, 0x73, 0x8c, 0x74, 0x5d, 0xeb, 0xba, + 0xa5, 0x5a, 0xe2, 0x50, 0xdc, 0x06, 0xb7, 0x0b, 0x6c, 0xb9, 0x7e, 0x3d, + 0x01, 0xb7, 0x48, 0x2d, 0x4b, 0x90, 0xe7, 0x6d, 0x99, 0x58, 0xe1, 0xab, + 0x2b, 0xf1, 0x06, 0x5e, 0x25, 0x08, 0x9b, 0xc3, 0x79, 0xaf, 0xe0, 0x5e, + 0x01, 0xbb, 0x61, 0xb7, 0xc5, 0x22, 0x1c, 0xaf, 0x68, 0xef, 0x7f, 0xbb, + 0x41, 0x8b, 0x99, 0x19, 0xf0, 0x68, 0xe9, 0x94, 0x74, 0xe5, 0xda, 0xd4, + 0xf3, 0x8c, 0x4f, 0xdf, 0x09, 0x8f, 0x74, 0xd3, 0xbf, 0xe5, 0xee, 0x58, + 0xc0, 0x22, 0xa0, 0x91, 0x1e, 0xfa, 0x6b, 0x3b, 0x02, 0x2a, 0x03, 0xab, + 0x6a, 0xe2, 0xaa, 0xfe, 0x64, 0xbd, 0x77, 0xd0, 0x77, 0xc8, 0xbb, 0x75, + 0x9d, 0xb3, 0x3e, 0xe7, 0xc3, 0x57, 0x14, 0xb1, 0x75, 0x44, 0x98, 0x15, + 0x97, 0xef, 0x4c, 0xd2, 0x94, 0x50, 0x93, 0x89, 0x3a, 0x6f, 0xfa, 0x10, + 0x78, 0x90, 0xde, 0xef, 0xcb, 0x7d, 0x5d, 0xae, 0xbc, 0xfa, 0x48, 0x08, + 0x81, 0xc9, 0x0b, 0xdd, 0x90, 0x38, 0x34, 0x70, 0x66, 0x1a, 0x0e, 0xb1, + 0xe8, 0x99, 0x4e, 0x05, 0xf6, 0xa9, 0x88, 0x7f, 0xa6, 0x22, 0x19, 0x68, + 0xe1, 0xf5, 0x6b, 0xfc, 0xe7, 0xeb, 0xaa, 0x5b, 0xc4, 0x62, 0x8c, 0x19, + 0x45, 0xb9, 0x77, 0x8e, 0xcc, 0x31, 0xb4, 0xb3, 0x64, 0x43, 0xec, 0x95, + 0xe5, 0xd7, 0x63, 0x06, 0x29, 0xd8, 0x0c, 0xcb, 0xeb, 0x97, 0xa6, 0xad, + 0x37, 0x28, 0x7c, 0x8c, 0x28, 0xd6, 0x97, 0xa9, 0xc6, 0x3f, 0x9e, 0x1c, + 0x26, 0x0d, 0xf1, 0xba, 0xb1, 0x55, 0x96, 0xde, 0x86, 0xbf, 0x4e, 0x36, + 0x1f, 0x6c, 0x1c, 0xc6, 0xf5, 0x35, 0xb3, 0xb3, 0x74, 0xa0, 0x23, 0x86, + 0xda, 0x9e, 0x9a, 0xfd, 0xb7, 0xe8, 0x43, 0x67, 0x51, 0x26, 0x2d, 0xb3, + 0x72, 0x75, 0x6f, 0xd5, 0x2a, 0xae, 0xa3, 0xea, 0x04, 0xed, 0x83, 0x96, + 0xbb, 0xd1, 
0x0d, 0x13, 0xdb, 0xc0, 0xb3, 0x26, 0x93, 0x54, 0x03, 0x2b, + 0x6e, 0x6a, 0x85, 0x8f, 0x50, 0x6f, 0xa3, 0xd3, 0x43, 0xda, 0x05, 0x1a, + 0x67, 0x86, 0x4e, 0xc5, 0x8f, 0xb2, 0x26, 0x71, 0xea, 0x4a, 0xcb, 0x24, + 0x4b, 0x00, 0x2e, 0xf0, 0xac, 0xaa, 0xae, 0x47, 0x0c, 0x01, 0x3b, 0x20, + 0xd0, 0x03, 0x58, 0xff, 0x4f, 0x00, 0xaf, 0xb0, 0xd1, 0x01, 0x41, 0xaf, + 0xfb, 0xdc, 0xcd, 0xa9, 0xe7, 0xa1, 0xeb, 0x81, 0x17, 0x41, 0x74, 0x6c, + 0x9b, 0xfb, 0x0d, 0xeb, 0x0c, 0x7a, 0xd0, 0x85, 0x6b, 0x0a, 0x30, 0x2b, + 0xa5, 0x1a, 0x67, 0xb2, 0x1a, 0x5e, 0x34, 0x89, 0x18, 0x37, 0x3e, 0xbe, + 0xd9, 0x81, 0x06, 0xd6, 0xcf, 0x8f, 0xdf, 0x4d, 0x94, 0x49, 0xc9, 0x12, + 0xe2, 0xb6, 0xc5, 0xc3, 0xdf, 0x05, 0xdb, 0x7c, 0xf0, 0x4a, 0xdf, 0x90, + 0xfa, 0x7e, 0xce, 0x89, 0x93, 0xf7, 0x28, 0x37, 0xd0, 0x1f, 0x3e, 0x7c, + 0x6d, 0x0e, 0x75, 0x5f, 0x55, 0xcb, 0xd1, 0x9e, 0x06, 0xed, 0xdd, 0x27, + 0xae, 0xb3, 0xce, 0xc0, 0x18, 0xce, 0x4a, 0x4f, 0x43, 0x81, 0x0d, 0x4c, + 0x29, 0x53, 0x91, 0xda, 0x5f, 0x22, 0x29, 0xed, 0x49, 0xbe, 0xed, 0x55, + 0x37, 0xf1, 0xa6, 0xfc, 0x7d, 0x10, 0xd7, 0xd0, 0xd3, 0xf2, 0xaf, 0x19, + 0xf3, 0x40, 0x2f, 0xfd, 0xe0, 0x16, 0x7e, 0x00, 0xb5, 0x4d, 0x57, 0x65, + 0xe6, 0x68, 0xef, 0xdb, 0x49, 0xe2, 0xaa, 0x39, 0xfc, 0x8d, 0x19, 0xa2, + 0x61, 0x6f, 0xdf, 0x43, 0xf4, 0x18, 0x5d, 0x16, 0xd9, 0x68, 0x92, 0x45, + 0x0d, 0x7f, 0x8d, 0x6f, 0xfd, 0x63, 0x26, 0x6d, 0x64, 0xfc, 0xf7, 0xa2, + 0x29, 0xda, 0x40, 0x65, 0x98, 0xfa, 0x2e, 0xba, 0x9f, 0x54, 0xfb, 0xc1, + 0xe4, 0x8f, 0xbc, 0x6e, 0x57, 0xe5, 0x9a, 0x9d, 0x1c, 0x63, 0x04, 0xc9, + 0x89, 0xf7, 0xe2, 0xdf, 0x3b, 0x83, 0xc8, 0x99, 0x65, 0xb7, 0x30, 0x7a, + 0x7b, 0x54, 0x95, 0x3f, 0x4e, 0x26, 0x05, 0xa6, 0x86, 0x40, 0x63, 0x4a, + 0xb8, 0x53, 0x33, 0x0c, 0x4e, 0x52, 0x26, 0x50, 0xf2, 0x81, 0x2f, 0x69, + 0xb0, 0x9d, 0x9e, 0x46, 0x2c, 0x1d, 0xfe, 0x79, 0x79, 0xff, 0x60, 0x83, + 0x05, 0x3f, 0xab, 0xcf, 0x04, 0x45, 0xe0, 0x19, 0xb7, 0xb3, 0x50, 0xc7, + 0x5e, 0x49, 0xe7, 0xed, 0x4c, 0x76, 0x60, 0x3e, 0xc9, 0x55, 0x11, 0xdc, + 0x54, 0xc7, 
0x93, 0x13, 0xa8, 0x97, 0x0b, 0x46, 0xfb, 0xad, 0xa5, 0xeb, + 0xfc, 0xaa, 0x4a, 0x41, 0xc1, 0xc2, 0xee, 0xbd, 0x15, 0xb5, 0x91, 0x04, + 0x77, 0x93, 0xf1, 0x72, 0x38, 0x3f, 0x0b, 0x1e, 0x18, 0xdd, 0x93, 0x2c, + 0x1f, 0x32, 0x60, 0x12, 0x4f, 0x1b, 0x06, 0x34, 0xcb, 0xa5, 0x3a, 0x0f, + 0x07, 0xff, 0x9b, 0x39, 0xc0, 0x7c, 0xd8, 0xd7, 0x97, 0xe0, 0x08, 0x3b, + 0x55, 0xc7, 0x80, 0xc4, 0x07, 0xf6, 0x11, 0x9c, 0x5b, 0x25, 0x02, 0xc4, + 0x61, 0xb5, 0xa9, 0x8f, 0xec, 0x01, 0x5c, 0x17, 0xd8, 0x90, 0x48, 0xc5, + 0xd4, 0x2a, 0x0d, 0x8b, 0x40, 0x87, 0x46, 0x5e, 0xa2, 0x83, 0xf2, 0x64, + 0xf5, 0xb1, 0xeb, 0x52, 0xb2, 0x19, 0x9d, 0x2c, 0x29, 0x59, 0x3e, 0xde, + 0x6e, 0x33, 0xa3, 0x17, 0xd7, 0xf8, 0x58, 0x21, 0x42, 0x49, 0x7f, 0x5e, + 0x5c, 0xd9, 0x7f, 0x9f, 0xa0, 0x99, 0xa6, 0xfd, 0x66, 0x0c, 0x15, 0x64, + 0x5f, 0x1e, 0x01, 0x9d, 0x36, 0x20, 0xa9, 0xa4, 0x13, 0x2c, 0xc0, 0x80, + 0x92, 0x44, 0xd5, 0x61, 0xad, 0xd5, 0x9c, 0xf2, 0xdd, 0xc2, 0x0b, 0xc2, + 0x24, 0x1f, 0x69, 0x38, 0x48, 0x2c, 0x68, 0xf8, 0x19, 0xbe, 0x34, 0x21, + 0x38, 0xd0, 0x9c, 0xe5, 0xc0, 0xb8, 0x2b, 0x33, 0x24, 0x4c, 0x83, 0xe4, + 0x7d, 0xdb, 0x75, 0x7e, 0x60, 0xb4, 0x71, 0xaf, 0xb0, 0xc5, 0xb6, 0x4a, + 0xba, 0x9a, 0x83, 0x0b, 0x40, 0xf4, 0x96, 0x5a, 0xe2, 0x78, 0x20, 0x33, + 0xbb, 0x87, 0xba, 0x09, 0xf2, 0xd0, 0x24, 0x81, 0x23, 0xf5, 0x1b, 0x85, + 0x75, 0x12, 0x5d, 0x3e, 0xc5, 0x13, 0xc8, 0x03, 0xb7, 0xd5, 0x45, 0x70, + 0x64, 0x25, 0xfa, 0x58, 0xb5, 0x38, 0x11, 0x1f, 0xe7, 0x9e, 0x89, 0x5a, + 0x90, 0x59, 0xef, 0x7b, 0xd9, 0x14, 0x28, 0x2e, 0xf6, 0x30, 0x04, 0xf4, + 0xd0, 0x13, 0xba, 0xfc, 0x95, 0x39, 0x1a, 0xf4, 0xf4, 0xfd, 0x38, 0x83, + 0x46, 0x48, 0xc1, 0x31, 0x87, 0x0b, 0xa3, 0x1e, 0x67, 0x76, 0x83, 0xff, + 0x00, 0x52, 0x6b, 0xe8, 0xf4, 0x70, 0xa6, 0xba, 0xc0, 0xc4, 0x38, 0x9b, + 0xe0, 0x01, 0x07, 0x23, 0x3f, 0x58, 0x68, 0x7d, 0xbe, 0xdc, 0x0a, 0x4b, + 0xcc, 0xef, 0xf0, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 
0x00, 0x04, 0x09, 0x0e, 0x11, 0x1a, 0x1f}; + +const uint8_t kExpectedVerifySignature[] = { + 0x49, 0x23, 0xce, 0xa1, 0x29, 0x3b, 0x24, 0x00, 0xcc, 0xc3, 0xb1, 0x9f, + 0x1e, 0x80, 0x3e, 0xd8, 0x5a, 0x0d, 0x6e, 0x0b, 0xa6, 0x4f, 0x35, 0xf8, + 0x45, 0xf4, 0x20, 0xd8, 0x48, 0xe1, 0x85, 0x82, 0x05, 0x88, 0x3f, 0xdd, + 0xd4, 0xbd, 0xa0, 0xd9, 0xe3, 0x35, 0xca, 0x31, 0x02, 0x72, 0x95, 0x2c, + 0xd4, 0xfa, 0xf8, 0x76, 0x7a, 0xe1, 0x35, 0xcb, 0x7e, 0x93, 0xdd, 0x27, + 0xb9, 0x5d, 0x71, 0x0a, 0x08, 0xaf, 0xc2, 0x09, 0xbb, 0x74, 0xfa, 0xd9, + 0x46, 0x3b, 0x2f, 0x00, 0x53, 0xee, 0x74, 0x1c, 0x85, 0x15, 0xa8, 0x4b, + 0xa7, 0xf9, 0x0b, 0xbf, 0x9c, 0xde, 0x84, 0x03, 0xe3, 0x4f, 0x12, 0x21, + 0xe9, 0x1e, 0x7a, 0x06, 0x70, 0xd9, 0x68, 0x84, 0x53, 0x83, 0xcd, 0x72, + 0x4c, 0xe5, 0xcd, 0x6c, 0x30, 0x47, 0xd9, 0x28, 0x59, 0x64, 0x46, 0x57, + 0x93, 0xb5, 0x58, 0xe0, 0x8d, 0x92, 0x15, 0x16, 0xd0, 0x05, 0x09, 0x9f, + 0xdc, 0xcc, 0x2e, 0xfd, 0x8e, 0x13, 0x47, 0x41, 0x44, 0x15, 0x6c, 0x32, + 0x9f, 0x37, 0xfe, 0xe0, 0x31, 0xfb, 0xcb, 0x94, 0xec, 0x83, 0x37, 0x3e, + 0x72, 0x91, 0x0c, 0xab, 0xe0, 0x19, 0x57, 0xb8, 0xd0, 0xfb, 0x3f, 0x1c, + 0xe2, 0xb6, 0xf9, 0xc5, 0x9c, 0x91, 0x92, 0x1b, 0x2e, 0x0f, 0x6f, 0xf1, + 0xe6, 0x5a, 0xf1, 0x3c, 0x1a, 0x2c, 0xa6, 0xfb, 0x9d, 0x48, 0x86, 0x33, + 0xab, 0x94, 0xdf, 0x0b, 0x6b, 0xa8, 0xf0, 0x08, 0xb6, 0x59, 0xd4, 0xad, + 0xad, 0x9e, 0xe6, 0x07, 0x26, 0x00, 0x9a, 0x34, 0xc8, 0x2c, 0x7b, 0xd5, + 0x2e, 0xe0, 0x47, 0xd8, 0x0b, 0xd0, 0x34, 0x87, 0x62, 0xc5, 0x4d, 0x00, + 0xbc, 0x18, 0x62, 0x0b, 0x7f, 0xb7, 0xb6, 0x88, 0x43, 0x9f, 0x3f, 0x77, + 0x87, 0xec, 0xad, 0x21, 0xc2, 0xec, 0xfc, 0x99, 0x2f, 0xb6, 0x97, 0xab, + 0x40, 0xb3, 0xc2, 0x57, 0x33, 0x27, 0xde, 0x4b, 0xc6, 0x8e, 0x69, 0x47, + 0x59, 0x16, 0xe6, 0x49, 0xf7, 0x69, 0xe9, 0x3a, 0x08, 0xca, 0xd4, 0xd4, + 0x29, 0x44, 0x6d, 0xf3, 0x10, 0x56, 0x6f, 0x4b, 0x1d, 0x47, 0xb8, 0x9e, + 0x66, 0x5e, 0x11, 0x46, 0xec, 0xa8, 0x45, 0xc9, 0x66, 0xc1, 0x4a, 0x0c, + 0x62, 0x01, 0x2a, 0x05, 0xc6, 0xda, 0xb0, 0xd9, 0x95, 
0x09, 0xc0, 0x97, + 0xe0, 0x33, 0xa2, 0x89, 0x36, 0xf6, 0xab, 0xa0, 0x7f, 0x65, 0xff, 0x2c, + 0xf5, 0xb9, 0x06, 0x74, 0x8b, 0xc2, 0x10, 0xc7, 0x7f, 0xdb, 0xfd, 0x6d, + 0xfe, 0xa8, 0xed, 0xd9, 0x41, 0x3b, 0x1d, 0x40, 0x96, 0xc8, 0xf5, 0x7c, + 0x23, 0x7b, 0xfc, 0x80, 0x5a, 0xb3, 0x1d, 0xfd, 0x29, 0x37, 0xfc, 0x12, + 0xa5, 0x64, 0x18, 0x93, 0xea, 0xc3, 0x15, 0xf6, 0x36, 0x3e, 0x92, 0xd6, + 0x31, 0x1c, 0xe2, 0x70, 0x89, 0xd9, 0x55, 0x4f, 0x42, 0x9e, 0x5e, 0xd2, + 0x80, 0x4a, 0x71, 0x31, 0xfc, 0x8f, 0xc2, 0x7f, 0xfc, 0xed, 0x8e, 0x48, + 0x8c, 0xc1, 0x7b, 0xa8, 0x73, 0xd3, 0x25, 0xd6, 0xe2, 0x1e, 0x14, 0x5e, + 0xbb, 0xd1, 0x3e, 0x44, 0x4c, 0x97, 0x99, 0xc4, 0xb9, 0xce, 0x42, 0xa1, + 0x0c, 0x0d, 0xa2, 0xd1, 0x60, 0xfe, 0x58, 0xf6, 0xf5, 0x4a, 0x27, 0x07, + 0xac, 0x49, 0x10, 0x71, 0x85, 0xc4, 0xdf, 0xab, 0x73, 0xde, 0x82, 0x17, + 0xb6, 0x0c, 0x97, 0x77, 0x8a, 0xd6, 0x88, 0x1e, 0x0b, 0xb5, 0x87, 0xa9, + 0xc5, 0xcb, 0x34, 0x48, 0x19, 0x86, 0xf1, 0x72, 0x68, 0xd9, 0xdc, 0x4b, + 0x6e, 0x06, 0x6c, 0x7d, 0x39, 0x47, 0xfe, 0xf9, 0x78, 0x23, 0x63, 0xa2, + 0x3c, 0xdc, 0xd0, 0x23, 0x44, 0x64, 0x49, 0x06, 0x12, 0x43, 0x35, 0xc6, + 0xcb, 0xd1, 0x87, 0x07, 0xaa, 0x9a, 0x03, 0x30, 0x93, 0xc7, 0xac, 0x92, + 0x28, 0xb5, 0xbd, 0x43, 0xc2, 0x86, 0xda, 0x36, 0xf1, 0xc1, 0x81, 0x68, + 0xd9, 0xcd, 0x00, 0x2d, 0x44, 0x16, 0xa7, 0x21, 0x37, 0x26, 0x87, 0xc8, + 0x0a, 0xf4, 0x8b, 0x40, 0x29, 0x84, 0xe8, 0x70, 0x56, 0xb4, 0x3f, 0xc6, + 0xc8, 0x96, 0xb5, 0xea, 0xc0, 0x7b, 0x4a, 0x67, 0x2b, 0x6d, 0xd0, 0x0d, + 0xd4, 0x79, 0x2d, 0xbd, 0xe1, 0xf1, 0xb3, 0x50, 0x2e, 0xac, 0x22, 0xf4, + 0x40, 0xec, 0x1b, 0xdc, 0xd5, 0x3d, 0xea, 0x7d, 0xa4, 0x53, 0x6a, 0x66, + 0x0b, 0x85, 0xa2, 0x57, 0xb6, 0x0d, 0xfe, 0x8f, 0xc5, 0x55, 0x1d, 0xcc, + 0xe1, 0x39, 0xc0, 0x32, 0xb5, 0x22, 0x35, 0x8b, 0x01, 0x0c, 0xb5, 0x4b, + 0xd5, 0x1a, 0xd7, 0x39, 0xa0, 0x32, 0x47, 0x50, 0x30, 0xef, 0x74, 0xf3, + 0x52, 0x53, 0x29, 0x8c, 0xaa, 0x24, 0x94, 0xb8, 0xc0, 0xb9, 0xc3, 0x18, + 0x5a, 0x49, 0x6d, 0x0e, 0xe4, 0xcc, 0xa0, 0xfb, 0xb9, 
0x45, 0xe2, 0x2f, + 0x89, 0x00, 0x8c, 0x5a, 0xc8, 0x1c, 0x61, 0x21, 0x6c, 0x35, 0x05, 0xc9, + 0x83, 0x07, 0x32, 0x73, 0x73, 0xa6, 0x34, 0x89, 0x83, 0x86, 0xb1, 0xa8, + 0x49, 0xaa, 0xec, 0x0c, 0x9a, 0x32, 0xc6, 0x34, 0xbd, 0x9b, 0xbe, 0x74, + 0x06, 0x3d, 0x6d, 0x83, 0x7d, 0x47, 0x4b, 0xb3, 0x45, 0x78, 0x3d, 0x8b, + 0xbf, 0xca, 0xdc, 0x0c, 0xfb, 0xc5, 0x01, 0xe4, 0x0b, 0xcc, 0x9b, 0x05, + 0xbf, 0x16, 0x05, 0xff, 0x2b, 0xd8, 0x20, 0xbc, 0xe2, 0xd8, 0xa0, 0x4c, + 0xcd, 0x4f, 0xc3, 0xe7, 0x3b, 0xbd, 0x1d, 0x82, 0xf8, 0x6c, 0xce, 0x6f, + 0x62, 0x05, 0x37, 0x9c, 0xb8, 0x26, 0x6a, 0x9f, 0x76, 0xcc, 0x97, 0xac, + 0x1d, 0x8b, 0xde, 0x9b, 0x20, 0x52, 0x29, 0x3d, 0x96, 0x01, 0x31, 0x56, + 0x4a, 0xea, 0x14, 0xf1, 0xdd, 0x2f, 0x6b, 0x91, 0x46, 0x58, 0x0d, 0xa8, + 0xff, 0xcc, 0x4f, 0x95, 0xbc, 0x2e, 0x18, 0x9e, 0x55, 0xfc, 0x27, 0x6a, + 0x15, 0x64, 0x68, 0x33, 0x5f, 0xa9, 0xda, 0xe4, 0x10, 0xe1, 0x41, 0x3b, + 0x59, 0xd7, 0x61, 0x5c, 0xa4, 0x7b, 0x3d, 0x28, 0x09, 0x59, 0x45, 0x65, + 0x68, 0xe9, 0xc8, 0x09, 0x13, 0xa3, 0x61, 0xac, 0xba, 0x98, 0x6d, 0x98, + 0xe3, 0x03, 0xa1, 0xe6, 0xf3, 0x5f, 0xb0, 0x1e, 0x72, 0x0b, 0x46, 0xc8, + 0x51, 0x26, 0xbf, 0xf9, 0x3e, 0x55, 0xc0, 0x9b, 0x7e, 0x09, 0x63, 0x85, + 0x92, 0x18, 0xf2, 0xf4, 0x17, 0x2e, 0x7d, 0x05, 0x99, 0x5e, 0x1d, 0xb2, + 0x3e, 0xad, 0x68, 0x29, 0x6d, 0x21, 0xe9, 0xb1, 0x38, 0xd5, 0x3e, 0xcc, + 0x5f, 0xe5, 0xd1, 0x16, 0xa8, 0x33, 0x4f, 0xbc, 0x28, 0x21, 0x14, 0x38, + 0xdd, 0x45, 0x0d, 0xa7, 0x00, 0x41, 0x5d, 0x5f, 0x98, 0x42, 0x7c, 0x54, + 0x16, 0x4f, 0x25, 0xf1, 0x46, 0x76, 0xe4, 0x67, 0x58, 0x5b, 0x65, 0x73, + 0xdf, 0xfd, 0x1c, 0x52, 0x61, 0x4d, 0x95, 0xb7, 0x78, 0x47, 0x50, 0x2d, + 0xc9, 0xab, 0x14, 0x1c, 0xae, 0x9c, 0xdc, 0x17, 0xdd, 0xcc, 0xe2, 0x5c, + 0x03, 0x0e, 0xe0, 0x2a, 0x81, 0xf0, 0xa7, 0xb9, 0x23, 0x21, 0x04, 0xbe, + 0x5c, 0xc9, 0x55, 0x58, 0x17, 0x26, 0x0c, 0xb4, 0x52, 0x24, 0xb7, 0x5a, + 0xb7, 0x0d, 0x7b, 0xfe, 0xe9, 0xd3, 0xda, 0x16, 0x43, 0xfa, 0x3b, 0xb3, + 0xa1, 0x3e, 0x48, 0xe1, 0x68, 0xf5, 0x51, 0xb1, 0xf3, 
0x62, 0x06, 0x93, + 0x26, 0xe8, 0xa2, 0x9d, 0x9a, 0x7d, 0xf7, 0xa2, 0x8f, 0xe0, 0xe7, 0xa9, + 0x6e, 0xa3, 0xce, 0x7f, 0xfb, 0x94, 0x56, 0xc8, 0x2d, 0x6b, 0xcd, 0x2f, + 0xa0, 0x1f, 0x9b, 0x7b, 0x7a, 0xd4, 0x35, 0xe6, 0xa0, 0xa9, 0x09, 0x1b, + 0x7c, 0x89, 0xc0, 0xd1, 0xd1, 0x34, 0x99, 0x59, 0xb5, 0x6b, 0xb3, 0x29, + 0x54, 0xbf, 0xf0, 0xe7, 0x81, 0xee, 0x86, 0x28, 0x6e, 0x68, 0xb2, 0x22, + 0xc2, 0x08, 0x84, 0xb7, 0x12, 0xfc, 0x14, 0x55, 0x61, 0xea, 0x36, 0x59, + 0x51, 0x14, 0x28, 0x21, 0x30, 0xc3, 0x87, 0x91, 0xad, 0x1b, 0x9d, 0x50, + 0x53, 0xf8, 0x1e, 0x2c, 0x90, 0x14, 0x54, 0x67, 0xcf, 0x39, 0x76, 0x4d, + 0xfb, 0x4c, 0x7e, 0x0c, 0x3d, 0xa5, 0x79, 0x8c, 0x03, 0x72, 0xcb, 0xf3, + 0xd0, 0x2f, 0x20, 0xc3, 0xc0, 0x00, 0x38, 0x90, 0x1d, 0x9f, 0x97, 0xd6, + 0xc2, 0x5c, 0xc8, 0xb5, 0x86, 0x94, 0x30, 0x9d, 0x7a, 0x06, 0xb4, 0x4d, + 0x40, 0x04, 0xa4, 0x67, 0xb1, 0x12, 0x38, 0xf9, 0x77, 0x45, 0xc0, 0x57, + 0x4c, 0xc3, 0x24, 0x96, 0xf6, 0xf0, 0x99, 0x97, 0x50, 0x2e, 0x78, 0xe3, + 0x61, 0x93, 0x32, 0x33, 0x43, 0x6c, 0x2b, 0x38, 0xa2, 0xd3, 0xe3, 0xc5, + 0xe1, 0x8a, 0xe1, 0xa5, 0x9a, 0x59, 0xa0, 0xdc, 0x49, 0x9c, 0x81, 0x69, + 0x24, 0x0b, 0xc6, 0xf7, 0x4e, 0x8c, 0xc7, 0x66, 0xd5, 0x98, 0xc1, 0x88, + 0x63, 0xa9, 0xeb, 0x4d, 0xaa, 0x9c, 0x6c, 0xfe, 0xb1, 0x3b, 0x69, 0x47, + 0x99, 0xc1, 0xbd, 0x8a, 0x4c, 0xf7, 0xad, 0x9c, 0x54, 0x5b, 0xca, 0x99, + 0x23, 0x3b, 0xef, 0x25, 0x77, 0xa1, 0x0f, 0x0a, 0x1c, 0xf9, 0x08, 0x7e, + 0xee, 0x26, 0xd8, 0x4d, 0x14, 0xc7, 0x62, 0xb9, 0x44, 0xb6, 0xbb, 0x3c, + 0xaa, 0x3b, 0x22, 0x48, 0xfb, 0x8e, 0x3b, 0xed, 0x08, 0x3d, 0x89, 0x9b, + 0xf8, 0x44, 0xca, 0xfa, 0x26, 0x78, 0x29, 0xde, 0xd6, 0xf1, 0x7b, 0x59, + 0x5a, 0xe6, 0xd1, 0x24, 0x27, 0x11, 0xd6, 0x7d, 0x07, 0x49, 0xdd, 0xff, + 0x6d, 0x62, 0x5b, 0x9d, 0x12, 0x81, 0x00, 0xd7, 0xda, 0x00, 0xfb, 0xf7, + 0x4f, 0x2d, 0xf4, 0x64, 0xf1, 0xfb, 0x07, 0x23, 0x97, 0xe9, 0x0b, 0x22, + 0x9a, 0x3d, 0xf6, 0x87, 0xd8, 0x23, 0xf8, 0x54, 0x36, 0xa0, 0xab, 0xb8, + 0x99, 0x8b, 0x83, 0x98, 0x67, 0x33, 0x79, 0x2f, 0x60, 
0x99, 0x16, 0x26, + 0x50, 0xfc, 0x9d, 0xea, 0xdc, 0x92, 0xf4, 0x7c, 0x44, 0xef, 0xed, 0x23, + 0x5e, 0x11, 0x0c, 0x6a, 0xc9, 0xf5, 0x9b, 0x64, 0x65, 0x58, 0x34, 0xc0, + 0x18, 0xc2, 0x3c, 0xf4, 0x4f, 0x5c, 0x06, 0x71, 0x59, 0xa9, 0xcd, 0xad, + 0x8b, 0xfd, 0x93, 0x05, 0xfc, 0x48, 0x92, 0xa2, 0xed, 0x9d, 0xa5, 0x89, + 0x29, 0x35, 0x76, 0x16, 0x2a, 0x69, 0xeb, 0x0c, 0xef, 0xe6, 0x90, 0xa8, + 0x18, 0x8b, 0x7e, 0x0e, 0x2c, 0xed, 0x34, 0x37, 0x23, 0xc3, 0x24, 0x7b, + 0x67, 0x53, 0x84, 0xbe, 0x5c, 0xe9, 0x8c, 0x37, 0x8e, 0x0e, 0x53, 0xb9, + 0xaa, 0x60, 0x64, 0x5e, 0x67, 0x88, 0x88, 0x72, 0x27, 0xb6, 0x8a, 0x72, + 0x44, 0xdf, 0xe1, 0x69, 0xb7, 0x1c, 0x35, 0x79, 0xce, 0xab, 0x3a, 0x04, + 0x60, 0xa3, 0x60, 0xd8, 0x84, 0x81, 0x61, 0xbc, 0x95, 0xc6, 0x9d, 0x3f, + 0x7d, 0xbd, 0x4b, 0x1d, 0x40, 0x35, 0x97, 0x6f, 0xd9, 0x0c, 0x51, 0xa7, + 0xac, 0x5b, 0xe8, 0xa1, 0xa8, 0x9b, 0x4e, 0x3c, 0x88, 0x29, 0x23, 0xca, + 0x83, 0x1a, 0xc4, 0x5c, 0xec, 0x13, 0x1a, 0x07, 0x5d, 0xf3, 0x58, 0xa6, + 0x17, 0x1c, 0x09, 0xca, 0x5f, 0x14, 0x47, 0xf6, 0x94, 0x2f, 0xe8, 0x98, + 0x31, 0x51, 0xbe, 0x32, 0xaa, 0x8c, 0xb0, 0x79, 0x88, 0xfa, 0xa3, 0x9c, + 0xcc, 0xc1, 0xf6, 0x25, 0xa9, 0x5d, 0xa1, 0x46, 0xf1, 0x30, 0xd0, 0x41, + 0xf9, 0x5e, 0xca, 0x28, 0x06, 0x3c, 0xce, 0xb2, 0x40, 0xe3, 0xaf, 0x5e, + 0x53, 0x0e, 0xfc, 0x94, 0x9d, 0x6d, 0xc1, 0x90, 0x24, 0x00, 0x31, 0xcf, + 0x8d, 0x4a, 0xce, 0x81, 0xac, 0xe0, 0x88, 0x2e, 0xf0, 0x13, 0xac, 0x3c, + 0x48, 0xde, 0x0f, 0xdd, 0xc3, 0xcd, 0xa2, 0xc0, 0x5d, 0x67, 0x33, 0x2a, + 0xf8, 0x93, 0x25, 0x95, 0x76, 0x5f, 0x0c, 0xfc, 0x88, 0x4d, 0x9d, 0x99, + 0x69, 0x66, 0x89, 0xfe, 0x09, 0xb5, 0x78, 0xae, 0xa0, 0x22, 0x1e, 0x41, + 0xdc, 0xad, 0x8a, 0xef, 0x89, 0x74, 0x96, 0xc5, 0x83, 0x08, 0xe4, 0xec, + 0xb0, 0xaf, 0xd9, 0xdb, 0x83, 0x42, 0xea, 0x96, 0xf4, 0x3b, 0xa2, 0x9a, + 0x73, 0x04, 0x99, 0x59, 0xfe, 0x21, 0x35, 0xdc, 0xb3, 0xe8, 0x81, 0x5a, + 0xe5, 0x4e, 0x51, 0x9c, 0xc6, 0x5d, 0x81, 0xae, 0x00, 0x0a, 0xee, 0xbf, + 0xa2, 0x4d, 0xdf, 0xec, 0xcd, 0x86, 0x62, 0x9b, 0xd3, 
0xee, 0x58, 0x8e, + 0x69, 0x0c, 0x7f, 0x83, 0xfc, 0xf1, 0x19, 0xde, 0x05, 0x81, 0x3d, 0xe0, + 0x46, 0x95, 0xaf, 0x39, 0x13, 0xbf, 0xac, 0xa0, 0x71, 0x00, 0xeb, 0xb8, + 0x92, 0xe7, 0x92, 0x86, 0x29, 0xf8, 0x78, 0x02, 0x19, 0xc5, 0x72, 0xe1, + 0x8f, 0xce, 0x62, 0x2a, 0x7b, 0x0f, 0xef, 0xa1, 0x79, 0xbf, 0xf9, 0x51, + 0x01, 0xc4, 0x3d, 0xbf, 0xbd, 0xcd, 0xf3, 0x62, 0x97, 0xf9, 0x68, 0x9c, + 0xb8, 0x60, 0x6d, 0xb3, 0x75, 0xb3, 0x41, 0xff, 0xb2, 0x94, 0x67, 0x6f, + 0x74, 0xbb, 0xaa, 0x3a, 0x41, 0x56, 0x0a, 0x16, 0x8c, 0xd9, 0xc5, 0xa6, + 0x22, 0x0d, 0xb5, 0x1e, 0x31, 0xf2, 0x87, 0xe3, 0xed, 0x3d, 0x5a, 0x38, + 0x59, 0x4f, 0x35, 0x99, 0xfe, 0x60, 0x94, 0xe2, 0x40, 0x90, 0xe3, 0x08, + 0x16, 0x8a, 0x19, 0x65, 0x3e, 0x1e, 0x16, 0x30, 0xe5, 0xba, 0x5e, 0x84, + 0xef, 0x76, 0x4a, 0x0e, 0x4e, 0x44, 0xd9, 0x3b, 0x6f, 0xb1, 0xe5, 0x89, + 0xf0, 0x7b, 0x09, 0x02, 0x8e, 0x61, 0xe9, 0xfd, 0xcb, 0x8f, 0x24, 0x90, + 0xde, 0x3c, 0x0d, 0xad, 0x1c, 0xa5, 0x28, 0xcf, 0x45, 0xf0, 0x12, 0x3e, + 0x64, 0x7f, 0x55, 0xa4, 0x21, 0x54, 0xff, 0x15, 0x07, 0xfe, 0xd0, 0x98, + 0x24, 0x0f, 0xf6, 0xe2, 0xa4, 0x51, 0x34, 0xdf, 0x14, 0x8e, 0x09, 0xaf, + 0x51, 0xe9, 0xa0, 0xb5, 0x59, 0x98, 0x07, 0x93, 0xfe, 0xb9, 0x57, 0x30, + 0x33, 0x68, 0xc7, 0xe9, 0x38, 0x61, 0xe1, 0x22, 0x3c, 0x1b, 0xac, 0x68, + 0x75, 0x51, 0x34, 0xcc, 0x74, 0x35, 0x1b, 0xc0, 0x2c, 0xaa, 0xf1, 0xd5, + 0xfe, 0x2e, 0x5f, 0x88, 0x59, 0x28, 0x5a, 0xfe, 0x70, 0x22, 0xb4, 0xfc, + 0x73, 0xe2, 0x62, 0x07, 0xd2, 0x8f, 0xbc, 0x5d, 0x1f, 0x61, 0xbb, 0xf2, + 0x2f, 0xc2, 0x21, 0x09, 0x58, 0x60, 0x22, 0x72, 0x92, 0xca, 0x04, 0xf8, + 0x56, 0x4e, 0x76, 0xec, 0xc4, 0x03, 0x33, 0xe1, 0x4b, 0xaa, 0x2e, 0x71, + 0x17, 0x2d, 0x71, 0xd3, 0x29, 0xf2, 0x78, 0x0f, 0xe7, 0xdf, 0x9f, 0xb1, + 0x75, 0x96, 0x17, 0x5f, 0xc0, 0x77, 0x3d, 0x6d, 0x50, 0xd1, 0x50, 0x40, + 0x1a, 0x17, 0xae, 0x93, 0xdd, 0x0f, 0x93, 0xa9, 0xb0, 0x9b, 0xc2, 0xab, + 0x7d, 0xa2, 0x8a, 0xf0, 0xe9, 0xc2, 0x5d, 0xc8, 0x33, 0xe5, 0xd9, 0x51, + 0xfc, 0x51, 0x5f, 0x26, 0x10, 0x15, 0x89, 0x12, 0x35, 
0x42, 0xb3, 0x7a, + 0x10, 0x98, 0x69, 0xfa, 0xa2, 0xa0, 0x86, 0xa7, 0x25, 0x7d, 0x51, 0xcf, + 0xc3, 0xb3, 0x95, 0xfc, 0x96, 0x9f, 0x60, 0x42, 0x8c, 0x6c, 0x97, 0x18, + 0x0c, 0xfb, 0x49, 0xb7, 0x61, 0x25, 0xb8, 0xb1, 0xc9, 0x87, 0xf3, 0xf0, + 0xc5, 0xa1, 0x3d, 0x90, 0x24, 0xe5, 0xca, 0xe1, 0x0f, 0x6a, 0x76, 0x74, + 0xd5, 0x6f, 0x0e, 0x07, 0xdf, 0x0c, 0xeb, 0x0a, 0xb9, 0xfe, 0xd2, 0xa0, + 0xf5, 0x6d, 0x1d, 0xe1, 0xaa, 0x3a, 0xb8, 0x39, 0x30, 0x52, 0x8d, 0xe0, + 0x4e, 0x4e, 0xa0, 0x0e, 0x85, 0x1a, 0xdd, 0x0f, 0x14, 0xff, 0x8a, 0x6c, + 0x7b, 0x9c, 0xf2, 0x58, 0x6a, 0x62, 0x15, 0x39, 0x04, 0xf8, 0x39, 0xf4, + 0xe2, 0xad, 0x12, 0xd7, 0x28, 0x27, 0xc5, 0x49, 0xdf, 0x18, 0x59, 0xc0, + 0xa7, 0xcc, 0xe5, 0x56, 0x61, 0x99, 0xce, 0x31, 0x05, 0x0b, 0x06, 0x3a, + 0xb0, 0x92, 0x19, 0x22, 0x67, 0x77, 0xba, 0x34, 0x97, 0xdf, 0x5f, 0x9b, + 0x6e, 0xac, 0x44, 0xb6, 0xd1, 0x57, 0xac, 0xa1, 0x2b, 0x5e, 0xf1, 0x27, + 0x58, 0xba, 0xfe, 0x05, 0xd7, 0x41, 0xa6, 0x88, 0xc6, 0xbc, 0x93, 0xff, + 0x2a, 0x62, 0x0b, 0x35, 0x1b, 0xb9, 0xd9, 0x2d, 0x02, 0xc2, 0x41, 0xdc, + 0x8c, 0x7b, 0x9e, 0xda, 0xfd, 0xb3, 0x36, 0x1f, 0xc4, 0x26, 0xcb, 0x75, + 0xec, 0xcd, 0xc5, 0xe7, 0xb3, 0x1e, 0x2f, 0x30, 0x17, 0x22, 0x95, 0x7b, + 0xdd, 0xfe, 0xa9, 0xe7, 0x63, 0x3b, 0xb0, 0xcc, 0x4d, 0x8a, 0x46, 0x70, + 0x77, 0xbf, 0xe6, 0x2e, 0x3f, 0xd6, 0x1f, 0xe9, 0x86, 0xf2, 0x63, 0x09, + 0x0b, 0xa0, 0xc4, 0xfa, 0x08, 0x65, 0x99, 0x5d, 0x79, 0x4c, 0x11, 0xe6, + 0x3f, 0xd7, 0x94, 0xc3, 0x41, 0x0c, 0x47, 0xc0, 0x64, 0xe2, 0xdc, 0x88, + 0x20, 0x50, 0x25, 0xc9, 0x4b, 0xae, 0x06, 0x24, 0x4c, 0xaf, 0x33, 0x26, + 0x62, 0xe2, 0xf9, 0xaf, 0xcf, 0x48, 0xe1, 0x16, 0xaa, 0x64, 0x01, 0x70, + 0xd8, 0xc3, 0xef, 0x01, 0x6a, 0x64, 0xf0, 0xf5, 0x4a, 0xd0, 0xdd, 0x21, + 0xd5, 0x57, 0x28, 0x65, 0x3b, 0x04, 0x3e, 0xc9, 0x94, 0x59, 0x42, 0xac, + 0x9b, 0xdf, 0x4e, 0x33, 0xae, 0xd1, 0xda, 0xfc, 0xf3, 0xe6, 0xa1, 0x4a, + 0xb4, 0xa4, 0x24, 0x57, 0x5a, 0x34, 0x6f, 0x6f, 0x94, 0xe9, 0x88, 0x42, + 0xc5, 0x0d, 0xdf, 0x42, 0x44, 0x08, 0x7b, 0x99, 0x69, 
0x59, 0xb6, 0x54, + 0x67, 0x7d, 0x83, 0xb8, 0xea, 0x53, 0x9b, 0x1c, 0xb2, 0xd7, 0x44, 0x20, + 0x30, 0xb3, 0x72, 0x10, 0x86, 0x87, 0x82, 0x58, 0xb4, 0xf4, 0x08, 0xd1, + 0xd9, 0x8a, 0x84, 0xcc, 0xb0, 0x3a, 0xf1, 0xde, 0x24, 0x16, 0x18, 0x7e, + 0xd0, 0x92, 0xb8, 0x72, 0x15, 0x08, 0x0c, 0xac, 0xa1, 0x96, 0x24, 0xcf, + 0x47, 0x31, 0x3b, 0xc7, 0x28, 0x11, 0xf0, 0x42, 0xe6, 0x40, 0x63, 0xa9, + 0xeb, 0xff, 0x80, 0x58, 0x0f, 0xee, 0x4e, 0xc5, 0xc0, 0xe9, 0xb5, 0xc2, + 0x18, 0x71, 0x97, 0xfe, 0x1a, 0x53, 0x93, 0xe4, 0x77, 0x92, 0x25, 0xa4, + 0x27, 0x03, 0xf6, 0x64, 0xf1, 0x63, 0x37, 0x7b, 0xb7, 0x3a, 0xe2, 0xcb, + 0x0c, 0x83, 0x0b, 0x52, 0x3f, 0xee, 0x41, 0x9a, 0x55, 0x8e, 0x85, 0x4a, + 0x23, 0x64, 0xea, 0x68, 0xe8, 0x5e, 0xd7, 0xf3, 0x9e, 0xbc, 0x6d, 0x78, + 0xba, 0xd2, 0xe5, 0xe8, 0x35, 0xfa, 0x74, 0x0f, 0x33, 0x2a, 0x0e, 0xcc, + 0xb2, 0x01, 0xaf, 0x53, 0x04, 0x70, 0x8a, 0xdd, 0x9e, 0x95, 0xcb, 0x3a, + 0x81, 0x25, 0x2f, 0x77, 0xb4, 0x31, 0xc7, 0x0b, 0x14, 0xc4, 0xd6, 0x20, + 0xa7, 0x80, 0xb1, 0xd5, 0x3d, 0x90, 0x04, 0xaf, 0x59, 0x20, 0xac, 0x6c, + 0x5e, 0xfc, 0xda, 0x7f, 0x9e, 0x70, 0x08, 0xe2, 0x5c, 0x02, 0x9c, 0x95, + 0x7d, 0xf9, 0xab, 0x2c, 0x15, 0x90, 0xc6, 0xab, 0x49, 0x21, 0x74, 0xcb, + 0x93, 0xb0, 0x44, 0x03, 0xfa, 0xc5, 0x35, 0x54, 0xb3, 0x4e, 0x10, 0x25, + 0x25, 0x94, 0x7d, 0x45, 0x74, 0xb5, 0x58, 0x85, 0xac, 0xb2, 0xc2, 0xd7, + 0xb5, 0xbe, 0xf0, 0xc8, 0x53, 0xdc, 0x62, 0xf8, 0x9b, 0x88, 0x98, 0xa7, + 0xda, 0x5a, 0x83, 0x22, 0xe4, 0x26, 0x83, 0x45, 0x41, 0x6f, 0x42, 0x61, + 0xcb, 0xc0, 0x89, 0x56, 0x2a, 0xef, 0x1f, 0xbb, 0xb2, 0xc9, 0xa1, 0xc8, + 0x16, 0x2d, 0x43, 0xc1, 0x13, 0x29, 0x76, 0xfa, 0x64, 0x4e, 0xe0, 0x66, + 0x5a, 0x9d, 0x35, 0x8f, 0x42, 0xd4, 0xe6, 0xea, 0x1d, 0xaf, 0x63, 0x73, + 0xb3, 0xff, 0x62, 0xd3, 0xdb, 0x60, 0x01, 0x7b, 0xf9, 0x03, 0xf2, 0x89, + 0x29, 0xe8, 0x48, 0xac, 0x2a, 0x71, 0x71, 0xdc, 0x80, 0xb1, 0x96, 0x5e, + 0xdd, 0x23, 0xea, 0xac, 0x44, 0xbc, 0xf3, 0x70, 0xf4, 0x40, 0x92, 0x94, + 0x8a, 0x89, 0x27, 0x02, 0xe7, 0x7f, 0x59, 0x27, 0x12, 
0x11, 0xf1, 0x14, + 0x2d, 0x1c, 0x35, 0x89, 0xd6, 0xca, 0xa7, 0xe5, 0xba, 0x3c, 0x2b, 0x91, + 0x50, 0x2f, 0x87, 0x1f, 0x89, 0x12, 0x3e, 0x7e, 0x69, 0x47, 0x1e, 0x86, + 0xaf, 0x4d, 0xe6, 0x27, 0x04, 0x11, 0x61, 0x15, 0xdd, 0x0f, 0xe0, 0xe6, + 0x3f, 0x9f, 0x83, 0x13, 0x52, 0x41, 0x72, 0xba, 0x9e, 0xe2, 0x2e, 0x9c, + 0x98, 0xf5, 0x16, 0x17, 0x2a, 0xa7, 0xe5, 0x74, 0x26, 0x93, 0x88, 0xda, + 0x7f, 0x9b, 0x2e, 0x38, 0x4c, 0x73, 0x86, 0x98, 0xc6, 0xef, 0xb6, 0x34, + 0x02, 0x8c, 0x70, 0xbe, 0x81, 0x02, 0x3a, 0xa0, 0xb7, 0x33, 0x40, 0x6a, + 0x7b, 0x2c, 0xe6, 0xdc, 0x00, 0xb6, 0x86, 0x83, 0xbd, 0x8b, 0x75, 0xff, + 0xaa, 0xa0, 0x4a, 0x14, 0xf2, 0xed, 0xd0, 0xa3, 0xe3, 0x46, 0x1f, 0x63, + 0x04, 0xe6, 0xd5, 0x35, 0x14, 0xb2, 0xf4, 0x49, 0x06, 0x92, 0x94, 0x01, + 0x29, 0x32, 0x35, 0x43, 0x98, 0xdd, 0xc9, 0x58, 0x16, 0x19, 0x38, 0x7f, + 0xf1, 0x03, 0x0e, 0x9b, 0xfc, 0xc9, 0xaf, 0x4a, 0xee, 0x58, 0x66, 0xd0, + 0x8b, 0x34, 0x49, 0xfe, 0x4b, 0x08, 0xf2, 0xbe, 0xdb, 0x6e, 0x9c, 0xe8, + 0x1d, 0x99, 0xdd, 0xfc, 0x3b, 0x29, 0x95, 0x90, 0x58, 0xd7, 0x6f, 0x1a, + 0x07, 0x2d, 0x18, 0xee, 0x05, 0x15, 0x23, 0x3b, 0xce, 0x1f, 0xdf, 0x97, + 0x1c, 0x95, 0x29, 0xf2, 0x07, 0x7c, 0x5c, 0x3a, 0x3e, 0x1b, 0x4c, 0x0e, + 0xde, 0x8e, 0x59, 0x72, 0x82, 0x00, 0xd7, 0x26, 0x4b, 0xb7, 0x2f, 0xb0, + 0x8e, 0x19, 0x53, 0xdf, 0xf6, 0x58, 0xbe, 0x0d, 0xd1, 0x0c, 0x59, 0x2c, + 0xf0, 0xe0, 0xb7, 0xdf, 0x82, 0x90, 0x11, 0x26, 0x02, 0x08, 0x3e, 0xb4, + 0x50, 0x48, 0x75, 0x5e, 0x54, 0x29, 0xff, 0x2b, 0x70, 0xa4, 0x16, 0x10, + 0xcc, 0x3e, 0x40, 0x3c, 0xf0, 0xda, 0x54, 0xe6, 0x36, 0x0c, 0x5a, 0xde, + 0x12, 0x82, 0x4f, 0x1d, 0xa8, 0x58, 0x1c, 0xc3, 0x99, 0x6f, 0x1b, 0x53, + 0x91, 0x3e, 0xb8, 0xbf, 0x3b, 0x37, 0x66, 0xa7, 0xff, 0xe6, 0x46, 0x60, + 0x72, 0xc3, 0x6e, 0x56, 0xc0, 0x36, 0x66, 0x2e, 0x6d, 0xf8, 0x33, 0xfa, + 0x23, 0xfa, 0xff, 0x55, 0xa2, 0x4b, 0xb6, 0xc1, 0x2b, 0x8d, 0xc1, 0x9f, + 0x3c, 0xf2, 0xb2, 0x66, 0xd6, 0x66, 0x0f, 0x53, 0x75, 0x5f, 0x69, 0x7f, + 0xc9, 0x33, 0xce, 0x3b, 0x86, 0x43, 0x0f, 0x92, 0xdc, 
0x2d, 0xb6, 0xcc, + 0x3a, 0x5d, 0xbb, 0xe8, 0xc5, 0x0c, 0x41, 0x44, 0x75, 0xb3, 0xdd, 0xf1, + 0x34, 0x8b, 0xc1, 0x37, 0x1f, 0xfc, 0x18, 0x5a, 0x2d, 0x78, 0x13, 0x13, + 0xff, 0x75, 0x70, 0x18, 0x86, 0x5a, 0x60, 0xc3, 0xd2, 0x9d, 0xeb, 0xe7, + 0x59, 0xd8, 0x3b, 0xc9, 0x5d, 0xe7, 0x32, 0xbb, 0x43, 0x59, 0xc8, 0x99, + 0xf4, 0xa3, 0x07, 0x8c, 0x0e, 0xa1, 0x99, 0x57, 0x6a, 0x5c, 0x7f, 0xcd, + 0xa6, 0x63, 0x38, 0x7d, 0xcf, 0xb8, 0x27, 0xf1, 0xc5, 0x3d, 0x38, 0xf9, + 0xc5, 0x6b, 0xba, 0x18, 0xb5, 0xb2, 0x08, 0x70, 0x92, 0xe6, 0xe1, 0xd9, + 0x1b, 0xa5, 0xec, 0x83, 0xb2, 0x12, 0x5f, 0x73, 0x31, 0x08, 0xf2, 0x70, + 0x8f, 0xb2, 0x40, 0xd6, 0xef, 0x19, 0x8f, 0x78, 0xa2, 0x05, 0xaa, 0x48, + 0xad, 0x84, 0xb6, 0x0a, 0x19, 0x2e, 0x6f, 0x22, 0x9a, 0x3d, 0x8a, 0x80, + 0x49, 0xfe, 0xff, 0xa0, 0xec, 0x99, 0xdb, 0x89, 0x38, 0x60, 0xf5, 0x56, + 0x4d, 0x03, 0x20, 0x0c, 0x96, 0x04, 0xfc, 0x63, 0xbd, 0xa4, 0x59, 0xab, + 0x1e, 0x43, 0x84, 0x04, 0xd8, 0x42, 0xe1, 0xc8, 0x03, 0xbd, 0x0e, 0x52, + 0x36, 0xa1, 0x00, 0xfe, 0xc0, 0x42, 0xf9, 0x47, 0x1f, 0x98, 0xde, 0x39, + 0x1e, 0xd0, 0x85, 0xdb, 0x70, 0x8e, 0x06, 0x39, 0xb0, 0x09, 0x2a, 0xb9, + 0xb9, 0xa8, 0x50, 0x28, 0x1a, 0x26, 0x05, 0x7f, 0xdc, 0x56, 0xcf, 0x63, + 0xf6, 0x0b, 0xfb, 0x46, 0x0f, 0x6d, 0x00, 0xec, 0x22, 0xa8, 0xe2, 0x21, + 0x5e, 0xfd, 0x4c, 0xa2, 0xd7, 0x35, 0x37, 0xd5, 0xa7, 0xad, 0xf9, 0xf3, + 0x03, 0xdf, 0xf1, 0x35, 0x63, 0x25, 0xb8, 0xba, 0x23, 0xb7, 0xd2, 0xc3, + 0xaa, 0x4d, 0x59, 0x23, 0xc8, 0xf0, 0x64, 0x15, 0x96, 0xf5, 0xac, 0x7b, + 0x7e, 0x95, 0x2d, 0xf3, 0xfe, 0x9b, 0xe9, 0x5c, 0x82, 0x61, 0xe9, 0xbc, + 0x4f, 0x80, 0x07, 0x07, 0xb5, 0x85, 0xf8, 0x65, 0xac, 0xdf, 0xf5, 0x1b, + 0x8c, 0x49, 0x7d, 0x96, 0xf6, 0xd5, 0xba, 0x6e, 0x3a, 0x3c, 0x18, 0x6c, + 0x18, 0xa1, 0xaa, 0xfd, 0x9f, 0x56, 0x01, 0x4a, 0x96, 0xe3, 0xf7, 0x73, + 0x35, 0x39, 0x62, 0xcb, 0x74, 0x8c, 0x4f, 0x4b, 0x59, 0xfc, 0x42, 0x1a, + 0x7f, 0xe2, 0x12, 0x44, 0x26, 0xbb, 0x4e, 0xd5, 0xef, 0xda, 0x1d, 0xbe, + 0xe5, 0x57, 0x94, 0x01, 0x49, 0xd7, 0xe7, 0xd6, 0x37, 
0xf2, 0x01, 0xaa, + 0xeb, 0x45, 0x52, 0x93, 0xec, 0x0f, 0xd2, 0xa1, 0x10, 0xa1, 0x63, 0x06, + 0xdc, 0xb6, 0xf7, 0xb9, 0x27, 0x5f, 0x34, 0x66, 0xba, 0xfc, 0x49, 0x66, + 0x75, 0x7c, 0xc1, 0xcf, 0xdc, 0xf2, 0x28, 0x5c, 0x62, 0x7e, 0x93, 0x99, + 0xbf, 0x1c, 0x27, 0x50, 0x71, 0x96, 0x9b, 0xbc, 0xd6, 0xea, 0x12, 0x18, + 0x26, 0x70, 0x76, 0x89, 0xa3, 0x05, 0x2b, 0x2f, 0x40, 0x59, 0x5c, 0x64, + 0x6c, 0xad, 0xec, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x02, 0x0a, 0x11, 0x1a, 0x21, 0x2b}; diff --git a/third_party/boringssl/src/crypto/fipsmodule/mldsa/mldsa.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/mldsa/mldsa.cc.inc new file mode 100644 index 00000000..4bd38854 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/mldsa/mldsa.cc.inc @@ -0,0 +1,3024 @@ +// Copyright 2014 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include + +#include +#include + +#include +#include +#include + +#include "../../internal.h" +#include "../../mem_internal.h" +#include "../bcm_interface.h" +#include "../keccak/internal.h" + + +using namespace bssl; + +#if defined(BORINGSSL_FIPS) + +DEFINE_STATIC_ONCE(g_mldsa_keygen_self_test_once) +DEFINE_STATIC_ONCE(g_mldsa_sign_self_test_once) +DEFINE_STATIC_ONCE(g_mldsa_verify_self_test_once) + +#endif + +namespace mldsa { +namespace { + +namespace fips { +void ensure_keygen_self_test(); +void ensure_sign_self_test(); +void ensure_verify_self_test(); +} // namespace fips + +constexpr int kDegree = 256; +constexpr int kRhoBytes = 32; +constexpr int kSigmaBytes = 64; +constexpr int kKBytes = 32; +constexpr int kTrBytes = 64; +constexpr int kMuBytes = 64; +constexpr int kRhoPrimeBytes = 64; + +// 2^23 - 2^13 + 1 +constexpr uint32_t kPrime = 8380417; +// Inverse of -kPrime modulo 2^32 +constexpr uint32_t kPrimeNegInverse = 4236238847; +constexpr int kDroppedBits = 13; +constexpr uint32_t kHalfPrime = (kPrime - 1) / 2; +// 256^-1 mod kPrime, in Montgomery form. +constexpr uint32_t kInverseDegreeMontgomery = 41978; + +// Constants that vary depending on ML-DSA size. +// +// These are implemented as templates which take the K parameter to distinguish +// the ML-DSA sizes. 
+ +template +constexpr size_t public_key_bytes() { + if constexpr (K == 6) { + return MLDSA65_PUBLIC_KEY_BYTES; + } else if constexpr (K == 8) { + return MLDSA87_PUBLIC_KEY_BYTES; + } else if constexpr (K == 4) { + return MLDSA44_PUBLIC_KEY_BYTES; + } +} + +template +constexpr size_t signature_bytes() { + if constexpr (K == 6) { + return MLDSA65_SIGNATURE_BYTES; + } else if constexpr (K == 8) { + return MLDSA87_SIGNATURE_BYTES; + } else if constexpr (K == 4) { + return MLDSA44_SIGNATURE_BYTES; + } +} + +template +constexpr int tau() { + if constexpr (K == 6) { + return 49; + } else if constexpr (K == 8) { + return 60; + } else if constexpr (K == 4) { + return 39; + } +} + +template +constexpr int lambda_bytes() { + if constexpr (K == 6) { + return 192 / 8; + } else if constexpr (K == 8) { + return 256 / 8; + } else if constexpr (K == 4) { + return 128 / 8; + } +} + +template +constexpr int gamma1_bits() { + if constexpr (K == 6 || K == 8) { + return 19; + } else if constexpr (K == 4) { + return 17; + } +} + +template +constexpr int gamma1() { + return 1 << gamma1_bits(); +} + +template +constexpr int scalar_le_gamma1_bytes() { + return ((gamma1_bits() + 1) * kDegree) / 8; +} + +template +constexpr uint32_t w1_coeffs_bits() { + if constexpr (K == 6 || K == 8) { + return 4; + } else if constexpr (K == 4) { + return 6; + } +} + +template +constexpr uint32_t w1_scalar_bytes() { + return (w1_coeffs_bits() * kDegree) / 8; +} + +template +constexpr uint32_t w1_bytes() { + return w1_scalar_bytes() * K; +} + +template +constexpr uint32_t prime_minus_one_over_gamma2() { + if constexpr (K == 6 || K == 8) { + return 32; + } else if constexpr (K == 4) { + return 88; + } +} + +template +constexpr uint32_t gamma2() { + return (kPrime - 1) / prime_minus_one_over_gamma2(); +} + +template +constexpr int beta() { + if constexpr (K == 6) { + return 196; + } else if constexpr (K == 8) { + return 120; + } else if constexpr (K == 4) { + return 78; + } +} + +template +constexpr int 
omega() { + if constexpr (K == 6) { + return 55; + } else if constexpr (K == 8) { + return 75; + } else if constexpr (K == 4) { + return 80; + } +} + +template +constexpr int eta() { + if constexpr (K == 6) { + return 4; + } else if constexpr (K == 8 || K == 4) { + return 2; + } +} + +template +constexpr int plus_minus_eta_bitlen() { + if constexpr (K == 6) { + return 4; + } else if constexpr (K == 8 || K == 4) { + return 3; + } +} + +// Fundamental types. + +struct scalar { + uint32_t c[kDegree]; +}; + +template +struct vector { + scalar v[K]; +}; + +template +struct matrix { + scalar v[K][L]; +}; + +/* Arithmetic */ + +// This bit of Python will be referenced in some of the following comments: +// +// q = 8380417 +// # Inverse of -q modulo 2^32 +// q_neg_inverse = 4236238847 +// # 2^64 modulo q +// montgomery_square = 2365951 +// +// def bitreverse(i): +// ret = 0 +// for n in range(8): +// bit = i & 1 +// ret <<= 1 +// ret |= bit +// i >>= 1 +// return ret +// +// def montgomery_reduce(x): +// a = (x * q_neg_inverse) % 2**32 +// b = x + a * q +// assert b & 0xFFFF_FFFF == 0 +// c = b >> 32 +// assert c < q +// return c +// +// def montgomery_transform(x): +// return montgomery_reduce(x * montgomery_square) + +// kNTTRootsMontgomery = [ +// montgomery_transform(pow(1753, bitreverse(i), q)) for i in range(256) +// ] +static const uint32_t kNTTRootsMontgomery[256] = { + 4193792, 25847, 5771523, 7861508, 237124, 7602457, 7504169, 466468, + 1826347, 2353451, 8021166, 6288512, 3119733, 5495562, 3111497, 2680103, + 2725464, 1024112, 7300517, 3585928, 7830929, 7260833, 2619752, 6271868, + 6262231, 4520680, 6980856, 5102745, 1757237, 8360995, 4010497, 280005, + 2706023, 95776, 3077325, 3530437, 6718724, 4788269, 5842901, 3915439, + 4519302, 5336701, 3574422, 5512770, 3539968, 8079950, 2348700, 7841118, + 6681150, 6736599, 3505694, 4558682, 3507263, 6239768, 6779997, 3699596, + 811944, 531354, 954230, 3881043, 3900724, 5823537, 2071892, 5582638, + 4450022, 6851714, 
4702672, 5339162, 6927966, 3475950, 2176455, 6795196, + 7122806, 1939314, 4296819, 7380215, 5190273, 5223087, 4747489, 126922, + 3412210, 7396998, 2147896, 2715295, 5412772, 4686924, 7969390, 5903370, + 7709315, 7151892, 8357436, 7072248, 7998430, 1349076, 1852771, 6949987, + 5037034, 264944, 508951, 3097992, 44288, 7280319, 904516, 3958618, + 4656075, 8371839, 1653064, 5130689, 2389356, 8169440, 759969, 7063561, + 189548, 4827145, 3159746, 6529015, 5971092, 8202977, 1315589, 1341330, + 1285669, 6795489, 7567685, 6940675, 5361315, 4499357, 4751448, 3839961, + 2091667, 3407706, 2316500, 3817976, 5037939, 2244091, 5933984, 4817955, + 266997, 2434439, 7144689, 3513181, 4860065, 4621053, 7183191, 5187039, + 900702, 1859098, 909542, 819034, 495491, 6767243, 8337157, 7857917, + 7725090, 5257975, 2031748, 3207046, 4823422, 7855319, 7611795, 4784579, + 342297, 286988, 5942594, 4108315, 3437287, 5038140, 1735879, 203044, + 2842341, 2691481, 5790267, 1265009, 4055324, 1247620, 2486353, 1595974, + 4613401, 1250494, 2635921, 4832145, 5386378, 1869119, 1903435, 7329447, + 7047359, 1237275, 5062207, 6950192, 7929317, 1312455, 3306115, 6417775, + 7100756, 1917081, 5834105, 7005614, 1500165, 777191, 2235880, 3406031, + 7838005, 5548557, 6709241, 6533464, 5796124, 4656147, 594136, 4603424, + 6366809, 2432395, 2454455, 8215696, 1957272, 3369112, 185531, 7173032, + 5196991, 162844, 1616392, 3014001, 810149, 1652634, 4686184, 6581310, + 5341501, 3523897, 3866901, 269760, 2213111, 7404533, 1717735, 472078, + 7953734, 1723600, 6577327, 1910376, 6712985, 7276084, 8119771, 4546524, + 5441381, 6144432, 7959518, 6094090, 183443, 7403526, 1612842, 4834730, + 7826001, 3919660, 8332111, 7018208, 3937738, 1400424, 7534263, 1976782}; + +// Reduces x mod kPrime in constant time, where 0 <= x < 2*kPrime. +inline uint32_t reduce_once(uint32_t x) { + declassify_assert(x < 2 * kPrime); + // return x < kPrime ? 
x : x - kPrime; + const uint32_t subtracted = x - kPrime; + uint32_t mask = 0u - (subtracted >> 31); + // Although this is a constant-time select, we omit a value barrier here. + // Value barriers impede auto-vectorization (likely because it forces the + // value to transit through a general-purpose register). This is a difference + // of 1.4x to 1.5x in signing performance. + // + // We usually add value barriers to selects because Clang turns consecutive + // selects with the same condition into a branch instead of CMOV/CSEL. This + // condition does not occur in ML-DSA, so omitting it seems to be generally + // safe. However, see |coefficient_from_nibble|. + return (mask & x) | (~mask & subtracted); +} + +// Returns the absolute value in constant time, interpreting the high bit as a +// sign bit. +inline uint32_t abs_signed(uint32_t x) { + // return is_negative(x) ? -x : x; + uint32_t mask = 0u - (x >> 31); + return constant_time_select_32(mask, 0u - x, x); +} + +// Returns the absolute value modulo kPrime. +inline uint32_t abs_mod_prime(uint32_t x) { + declassify_assert(x < kPrime); + // return x <= kHalfPrime ? x : kPrime - x; + uint32_t mask = x - kHalfPrime - 1; + mask = 0u - (mask >> 31); + return constant_time_select_32(mask, x, kPrime - x); +} + +// Returns the maximum of two values in constant time. Each value must be less +// than kPrime. +inline uint32_t maximum_reduced(uint32_t x, uint32_t y) { + declassify_assert(x < kPrime); + declassify_assert(y < kPrime); + // return x < y ? y : x; + uint32_t mask = x - y; + mask = 0u - (mask >> 31); + return constant_time_select_32(mask, y, x); +} + +inline uint32_t mod_sub(uint32_t a, uint32_t b) { + declassify_assert(a < kPrime); + declassify_assert(b < kPrime); + uint32_t r = a - b; + // return r < 0 ? r + kPrime : r; + uint32_t mask = 0u - (r >> 31); + // See |reduce_once| for why this does not have a value barrier. 
+ return (mask & (r + kPrime)) | (~mask & r); +} + +inline void scalar_add(scalar *out, const scalar *lhs, const scalar *rhs) { + for (int i = 0; i < kDegree; i++) { + out->c[i] = reduce_once(lhs->c[i] + rhs->c[i]); + } +} + +inline void scalar_sub(scalar *out, const scalar *lhs, const scalar *rhs) { + for (int i = 0; i < kDegree; i++) { + out->c[i] = mod_sub(lhs->c[i], rhs->c[i]); + } +} + +inline uint32_t reduce_montgomery(uint64_t x) { + declassify_assert(x <= ((uint64_t)kPrime << 32)); + uint64_t a = (uint32_t)x * kPrimeNegInverse; + uint64_t b = x + a * kPrime; + declassify_assert((b & 0xffffffff) == 0); + uint32_t c = b >> 32; + return reduce_once(c); +} + +// Multiply two scalars in the number theoretically transformed state. +inline void scalar_mult(scalar *out, const scalar *lhs, const scalar *rhs) { + for (int i = 0; i < kDegree; i++) { + out->c[i] = reduce_montgomery((uint64_t)lhs->c[i] * (uint64_t)rhs->c[i]); + } +} + +// In place number theoretic transform of a given scalar. +// +// FIPS 204, Algorithm 41 (`NTT`). +inline void scalar_ntt(scalar *s) { + // Step: 1, 2, 4, 8, ..., 128 + // Offset: 128, 64, 32, 16, ..., 1 + int offset = kDegree; + for (int step = 1; step < kDegree; step <<= 1) { + offset >>= 1; + int k = 0; + for (int i = 0; i < step; i++) { + assert(k == 2 * offset * i); + const uint32_t step_root = kNTTRootsMontgomery[step + i]; + for (int j = k; j < k + offset; j++) { + uint32_t even = s->c[j]; + // |reduce_montgomery| works on values up to kPrime*R and R > 2*kPrime. + // |step_root| < kPrime because it's static data. |s->c[...]| is < + // kPrime by the invariants of that struct. + uint32_t odd = + reduce_montgomery((uint64_t)step_root * (uint64_t)s->c[j + offset]); + s->c[j] = reduce_once(odd + even); + s->c[j + offset] = mod_sub(even, odd); + } + k += 2 * offset; + } + } +} + +// In place inverse number theoretic transform of a given scalar. +// +// FIPS 204, Algorithm 42 (`NTT^-1`). 
+inline void scalar_inverse_ntt(scalar *s) { + // Step: 128, 64, 32, 16, ..., 1 + // Offset: 1, 2, 4, 8, ..., 128 + int step = kDegree; + for (int offset = 1; offset < kDegree; offset <<= 1) { + step >>= 1; + int k = 0; + for (int i = 0; i < step; i++) { + assert(k == 2 * offset * i); + const uint32_t step_root = + kPrime - kNTTRootsMontgomery[step + (step - 1 - i)]; + for (int j = k; j < k + offset; j++) { + uint32_t even = s->c[j]; + uint32_t odd = s->c[j + offset]; + s->c[j] = reduce_once(odd + even); + + // |reduce_montgomery| works on values up to kPrime*R and R > 2*kPrime. + // kPrime + even < 2*kPrime because |even| < kPrime, by the invariants + // of that structure. Thus kPrime + even - odd < 2*kPrime because odd >= + // 0, because it's unsigned and less than kPrime. Lastly step_root < + // kPrime, because |kNTTRootsMontgomery| is static data. + s->c[j + offset] = reduce_montgomery((uint64_t)step_root * + (uint64_t)(kPrime + even - odd)); + } + k += 2 * offset; + } + } + for (int i = 0; i < kDegree; i++) { + s->c[i] = reduce_montgomery((uint64_t)s->c[i] * + (uint64_t)kInverseDegreeMontgomery); + } +} + +template +inline void vector_zero(vector *out) { + OPENSSL_memset(out, 0, sizeof(*out)); +} + +template +inline void vector_add(vector *out, const vector *lhs, + const vector *rhs) { + for (int i = 0; i < X; i++) { + scalar_add(&out->v[i], &lhs->v[i], &rhs->v[i]); + } +} + +template +inline void vector_sub(vector *out, const vector *lhs, + const vector *rhs) { + for (int i = 0; i < X; i++) { + scalar_sub(&out->v[i], &lhs->v[i], &rhs->v[i]); + } +} + +template +inline void vector_mult_scalar(vector *out, const vector *lhs, + const scalar *rhs) { + for (int i = 0; i < X; i++) { + scalar_mult(&out->v[i], &lhs->v[i], rhs); + } +} + +template +inline void vector_ntt(vector *a) { + for (int i = 0; i < X; i++) { + scalar_ntt(&a->v[i]); + } +} + +template +inline void vector_inverse_ntt(vector *a) { + for (int i = 0; i < X; i++) { + scalar_inverse_ntt(&a->v[i]); + } 
+} + +template +inline void matrix_mult(vector *out, const matrix *m, + const vector *a) { + vector_zero(out); + for (int i = 0; i < K; i++) { + for (int j = 0; j < L; j++) { + scalar product; + scalar_mult(&product, &m->v[i][j], &a->v[j]); + scalar_add(&out->v[i], &out->v[i], &product); + } + } +} + +/* Rounding & hints */ + +// FIPS 204, Algorithm 35 (`Power2Round`). +inline void power2_round(uint32_t *r1, uint32_t *r0, uint32_t r) { + *r1 = r >> kDroppedBits; + *r0 = r - (*r1 << kDroppedBits); + + uint32_t r0_adjusted = mod_sub(*r0, 1 << kDroppedBits); + uint32_t r1_adjusted = *r1 + 1; + + // Mask is set iff r0 > 2^(dropped_bits - 1). + crypto_word_t mask = + constant_time_lt_w((uint32_t)(1 << (kDroppedBits - 1)), *r0); + // r0 = mask ? r0_adjusted : r0 + *r0 = constant_time_select_32(mask, r0_adjusted, *r0); + // r1 = mask ? r1_adjusted : r1 + *r1 = constant_time_select_32(mask, r1_adjusted, *r1); +} + +// Scale back previously rounded value. +inline void scale_power2_round(uint32_t *out, uint32_t r1) { + // Pre-condition: 0 <= r1 <= 2^10 - 1 + assert(r1 < (1u << 10)); + + *out = r1 << kDroppedBits; + + // Post-condition: 0 <= out <= 2^23 - 2^13 = kPrime - 1 + assert(*out < kPrime); +} + +// FIPS 204, Algorithm 37 (`HighBits`). +template +inline uint32_t high_bits(uint32_t x) { + // Reference description (given 0 <= x < q): + // + // ``` + // int32_t r0 = x mod+- (2 * gamma2); + // if (x - r0 == q - 1) { + // return 0; + // } else { + // return (x - r0) / (2 * gamma2); + // } + // ``` + // + uint32_t r1 = (x + 127) >> 7; + if constexpr (prime_minus_one_over_gamma2() == 32) { + // Below is the formula taken from the reference implementation. + // + // Here, Gamma2 == 2^18 - 2^8 + // This returns ((ceil(x / 2^7) * (2^10 + 1) + 2^21) / 2^22) mod 2^4 + r1 = (r1 * 1025 + (1 << 21)) >> 22; + r1 &= 15; + } else if constexpr (prime_minus_one_over_gamma2() == 88) { + // 1488/2^24 is close enough to 1/1488 so that r1 becomes x/(2 gamma2) + // rounded down. 
+ r1 = (r1 * 11275 + (1 << 23)) >> 24; + + // For corner-case r1 = (Q-1)/(2 gamma2) = 44, we have to set r1=0. + r1 ^= ((uint32_t)(((int32_t)(43 - r1)) >> 31)) & r1; + } + return r1; +} + +// FIPS 204, Algorithm 36 (`Decompose`). +template +inline void decompose(uint32_t *r1, int32_t *r0, uint32_t r) { + *r1 = high_bits(r); + + *r0 = r; + *r0 -= *r1 * 2 * (int32_t)gamma2(); + *r0 -= (((int32_t)kHalfPrime - *r0) >> 31) & (int32_t)kPrime; +} + +// FIPS 204, Algorithm 38 (`LowBits`). +template +inline int32_t low_bits(uint32_t x) { + uint32_t r1; + int32_t r0; + decompose(&r1, &r0, x); + return r0; +} + +// FIPS 204, Algorithm 39 (`MakeHint`). +// +// In the spec this takes two arguments, z and r, and is called with +// z = -ct0 +// r = w - cs2 + ct0 +// +// It then computes HighBits (algorithm 37) of z and z+r. But z+r is just w - +// cs2, so this takes three arguments and saves an addition. +template +inline int32_t make_hint(uint32_t ct0, uint32_t cs2, uint32_t w) { + uint32_t r_plus_z = mod_sub(w, cs2); + uint32_t r = reduce_once(r_plus_z + ct0); + return high_bits(r) != high_bits(r_plus_z); +} + +// FIPS 204, Algorithm 40 (`UseHint`). +template +inline uint32_t use_hint_vartime(uint32_t h, uint32_t r) { + uint32_t r1; + int32_t r0; + decompose(&r1, &r0, r); + + if (h) { + if constexpr (prime_minus_one_over_gamma2() == 32) { + if (r0 > 0) { + // (Q-1)/(2 gamma2) = m = 16, thus |mod m| in the spec turns into |& + // 15|. 
+ return (r1 + 1) & 15; + } else { + return (r1 - 1) & 15; + } + } else { + // m = 44 + static_assert(prime_minus_one_over_gamma2() == 88); + if (r0 > 0) { + if (r1 == 43) { + return 0; + } else { + return r1 + 1; + } + } else { + if (r1 == 0) { + return 43; + } else { + return r1 - 1; + } + } + } + } + return r1; +} + +inline void scalar_power2_round(scalar *s1, scalar *s0, const scalar *s) { + for (int i = 0; i < kDegree; i++) { + power2_round(&s1->c[i], &s0->c[i], s->c[i]); + } +} + +inline void scalar_scale_power2_round(scalar *out, const scalar *in) { + for (int i = 0; i < kDegree; i++) { + scale_power2_round(&out->c[i], in->c[i]); + } +} + +template +inline void scalar_high_bits(scalar *out, const scalar *in) { + for (int i = 0; i < kDegree; i++) { + out->c[i] = high_bits(in->c[i]); + } +} + +template +inline void scalar_low_bits(scalar *out, const scalar *in) { + for (int i = 0; i < kDegree; i++) { + out->c[i] = low_bits(in->c[i]); + } +} + +inline void scalar_max(uint32_t *max, const scalar *s) { + for (int i = 0; i < kDegree; i++) { + uint32_t abs = abs_mod_prime(s->c[i]); + *max = maximum_reduced(*max, abs); + } +} + +inline void scalar_max_signed(uint32_t *max, const scalar *s) { + for (int i = 0; i < kDegree; i++) { + uint32_t abs = abs_signed(s->c[i]); + *max = maximum_reduced(*max, abs); + } +} + +template +inline void scalar_make_hint(scalar *out, const scalar *ct0, const scalar *cs2, + const scalar *w) { + for (int i = 0; i < kDegree; i++) { + out->c[i] = make_hint(ct0->c[i], cs2->c[i], w->c[i]); + } +} + +template +inline void scalar_use_hint_vartime(scalar *out, const scalar *h, + const scalar *r) { + for (int i = 0; i < kDegree; i++) { + out->c[i] = use_hint_vartime(h->c[i], r->c[i]); + } +} + +template +inline void vector_power2_round(vector *t1, vector *t0, + const vector *t) { + for (int i = 0; i < X; i++) { + scalar_power2_round(&t1->v[i], &t0->v[i], &t->v[i]); + } +} + +template +inline void vector_scale_power2_round(vector *out, const 
vector *in) { + for (int i = 0; i < X; i++) { + scalar_scale_power2_round(&out->v[i], &in->v[i]); + } +} + +template +inline void vector_high_bits(vector *out, const vector *in) { + for (int i = 0; i < K; i++) { + scalar_high_bits(&out->v[i], &in->v[i]); + } +} + +template +inline void vector_low_bits(vector *out, const vector *in) { + for (int i = 0; i < K; i++) { + scalar_low_bits(&out->v[i], &in->v[i]); + } +} + +template +inline uint32_t vector_max(const vector *a) { + uint32_t max = 0; + for (int i = 0; i < X; i++) { + scalar_max(&max, &a->v[i]); + } + return max; +} + +template +inline uint32_t vector_max_signed(const vector *a) { + uint32_t max = 0; + for (int i = 0; i < X; i++) { + scalar_max_signed(&max, &a->v[i]); + } + return max; +} + +// The input vector contains only zeroes and ones. +template +inline size_t vector_count_ones(const vector *a) { + size_t count = 0; + for (int i = 0; i < X; i++) { + for (int j = 0; j < kDegree; j++) { + count += a->v[i].c[j]; + } + } + return count; +} + +template +inline void vector_make_hint(vector *out, const vector *ct0, + const vector *cs2, const vector *w) { + for (int i = 0; i < K; i++) { + scalar_make_hint(&out->v[i], &ct0->v[i], &cs2->v[i], &w->v[i]); + } +} + +template +inline void vector_use_hint_vartime(vector *out, const vector *h, + const vector *r) { + for (int i = 0; i < K; i++) { + scalar_use_hint_vartime(&out->v[i], &h->v[i], &r->v[i]); + } +} + +/* Bit packing */ + +// FIPS 204, Algorithm 16 (`SimpleBitPack`). Specialized to bitlen(b) = 4. +inline void scalar_encode_4(uint8_t out[128], const scalar *s) { + // Every two elements lands on a byte boundary. + static_assert(kDegree % 2 == 0, "kDegree must be a multiple of 2"); + for (int i = 0; i < kDegree / 2; i++) { + uint32_t a = s->c[2 * i]; + uint32_t b = s->c[2 * i + 1]; + declassify_assert(a < 16); + declassify_assert(b < 16); + out[i] = a | (b << 4); + } +} + +// FIPS 204, Algorithm 16 (`SimpleBitPack`). Specialized to bitlen(b) = 6. 
+inline void scalar_encode_6(uint8_t out[192], const scalar *s) { + // Every four elements lands on a byte boundary. + static_assert(kDegree % 4 == 0, "kDegree must be a multiple of 4"); + for (int i = 0; i < kDegree / 4; i++) { + uint32_t a = s->c[4 * i]; + uint32_t b = s->c[4 * i + 1]; + uint32_t c = s->c[4 * i + 2]; + uint32_t d = s->c[4 * i + 3]; + declassify_assert(a < 64); + declassify_assert(b < 64); + declassify_assert(c < 64); + declassify_assert(d < 64); + out[3 * i] = a | (b << 6); + out[3 * i + 1] = (b >> 2) | (c << 4); + out[3 * i + 2] = (c >> 4) | (d << 2); + } +} + +// FIPS 204, Algorithm 16 (`SimpleBitPack`). Specialized to bitlen(b) = 10. +inline void scalar_encode_10(uint8_t out[320], const scalar *s) { + // Every four elements lands on a byte boundary. + static_assert(kDegree % 4 == 0, "kDegree must be a multiple of 4"); + for (int i = 0; i < kDegree / 4; i++) { + uint32_t a = s->c[4 * i]; + uint32_t b = s->c[4 * i + 1]; + uint32_t c = s->c[4 * i + 2]; + uint32_t d = s->c[4 * i + 3]; + declassify_assert(a < 1024); + declassify_assert(b < 1024); + declassify_assert(c < 1024); + declassify_assert(d < 1024); + out[5 * i] = (uint8_t)a; + out[5 * i + 1] = (uint8_t)((a >> 8) | (b << 2)); + out[5 * i + 2] = (uint8_t)((b >> 6) | (c << 4)); + out[5 * i + 3] = (uint8_t)((c >> 4) | (d << 6)); + out[5 * i + 4] = (uint8_t)(d >> 2); + } +} + +// FIPS 204, Algorithm 17 (`BitPack`). Specialized to bitlen(a+b) = 4 and b = 4. +inline void scalar_encode_signed_4_4(uint8_t out[128], const scalar *s) { + // Every two elements lands on a byte boundary. + static_assert(kDegree % 2 == 0, "kDegree must be a multiple of 2"); + for (int i = 0; i < kDegree / 2; i++) { + uint32_t a = mod_sub(4, s->c[2 * i]); + uint32_t b = mod_sub(4, s->c[2 * i + 1]); + declassify_assert(a < 16); + declassify_assert(b < 16); + out[i] = a | (b << 4); + } +} + +// FIPS 204, Algorithm 17 (`BitPack`). Specialized to bitlen(a+b) = 3 and b = 2. 
+inline void scalar_encode_signed_3_2(uint8_t out[96], const scalar *s) { + static_assert(kDegree % 8 == 0, "kDegree must be a multiple of 8"); + for (int i = 0; i < kDegree / 8; i++) { + uint32_t a = mod_sub(2, s->c[8 * i]); + uint32_t b = mod_sub(2, s->c[8 * i + 1]); + uint32_t c = mod_sub(2, s->c[8 * i + 2]); + uint32_t d = mod_sub(2, s->c[8 * i + 3]); + uint32_t e = mod_sub(2, s->c[8 * i + 4]); + uint32_t f = mod_sub(2, s->c[8 * i + 5]); + uint32_t g = mod_sub(2, s->c[8 * i + 6]); + uint32_t h = mod_sub(2, s->c[8 * i + 7]); + uint32_t v = (h << 21) | (g << 18) | (f << 15) | (e << 12) | (d << 9) | + (c << 6) | (b << 3) | a; + uint8_t v_bytes[sizeof(v)]; + CRYPTO_store_u32_le(v_bytes, v); + OPENSSL_memcpy(&out[i * 3], v_bytes, 3); + } +} + +// FIPS 204, Algorithm 17 (`BitPack`). Specialized to bitlen(a+b) = 13 and b = +// 2^12. +inline void scalar_encode_signed_13_12(uint8_t out[416], const scalar *s) { + static const uint32_t kMax = 1u << 12; + // Every two elements lands on a byte boundary. 
+ static_assert(kDegree % 8 == 0, "kDegree must be a multiple of 8"); + for (int i = 0; i < kDegree / 8; i++) { + uint32_t a = mod_sub(kMax, s->c[8 * i]); + uint32_t b = mod_sub(kMax, s->c[8 * i + 1]); + uint32_t c = mod_sub(kMax, s->c[8 * i + 2]); + uint32_t d = mod_sub(kMax, s->c[8 * i + 3]); + uint32_t e = mod_sub(kMax, s->c[8 * i + 4]); + uint32_t f = mod_sub(kMax, s->c[8 * i + 5]); + uint32_t g = mod_sub(kMax, s->c[8 * i + 6]); + uint32_t h = mod_sub(kMax, s->c[8 * i + 7]); + declassify_assert(a < (1u << 13)); + declassify_assert(b < (1u << 13)); + declassify_assert(c < (1u << 13)); + declassify_assert(d < (1u << 13)); + declassify_assert(e < (1u << 13)); + declassify_assert(f < (1u << 13)); + declassify_assert(g < (1u << 13)); + declassify_assert(h < (1u << 13)); + a |= b << 13; + a |= c << 26; + c >>= 6; + c |= d << 7; + c |= e << 20; + e >>= 12; + e |= f << 1; + e |= g << 14; + e |= h << 27; + h >>= 5; + CRYPTO_store_u32_le(&out[13 * i], a); + CRYPTO_store_u32_le(&out[13 * i + 4], c); + CRYPTO_store_u32_le(&out[13 * i + 8], e); + out[13 * i + 12] = static_cast(h); + } +} + +// FIPS 204, Algorithm 17 (`BitPack`). Specialized to bitlen(a+b) = 20 and b = +// 2^19. +inline void scalar_encode_signed_20_19(uint8_t out[640], const scalar *s) { + static const uint32_t kMax = 1u << 19; + // Every two elements lands on a byte boundary. 
+ static_assert(kDegree % 4 == 0, "kDegree must be a multiple of 4"); + for (int i = 0; i < kDegree / 4; i++) { + uint32_t a = mod_sub(kMax, s->c[4 * i]); + uint32_t b = mod_sub(kMax, s->c[4 * i + 1]); + uint32_t c = mod_sub(kMax, s->c[4 * i + 2]); + uint32_t d = mod_sub(kMax, s->c[4 * i + 3]); + declassify_assert(a < (1u << 20)); + declassify_assert(b < (1u << 20)); + declassify_assert(c < (1u << 20)); + declassify_assert(d < (1u << 20)); + a |= b << 20; + b >>= 12; + b |= c << 8; + b |= d << 28; + d >>= 4; + CRYPTO_store_u32_le(&out[10 * i], a); + CRYPTO_store_u32_le(&out[10 * i + 4], b); + CRYPTO_store_u16_le(&out[10 * i + 8], static_cast(d)); + } +} + +// FIPS 204, Algorithm 17 (`BitPack`). Specialized to bitlen(a+b) = 18 and b = +// 2^17. +inline void scalar_encode_signed_18_17(uint8_t out[576], const scalar *s) { + static const uint32_t kMax = 1u << 17; + static_assert(kDegree % 4 == 0, "kDegree must be a multiple of 4"); + for (int i = 0; i < kDegree / 4; i++) { + uint32_t a = mod_sub(kMax, s->c[4 * i]); + uint32_t b = mod_sub(kMax, s->c[4 * i + 1]); + uint32_t c = mod_sub(kMax, s->c[4 * i + 2]); + uint32_t d = mod_sub(kMax, s->c[4 * i + 3]); + declassify_assert(a < (1u << 18)); + declassify_assert(b < (1u << 18)); + declassify_assert(c < (1u << 18)); + declassify_assert(d < (1u << 18)); + out[9 * i] = (uint8_t)a; + out[9 * i + 1] = (uint8_t)(a >> 8); + out[9 * i + 2] = (uint8_t)(a >> 16) | (uint8_t)(b << 2); + out[9 * i + 3] = (uint8_t)(b >> 6); + out[9 * i + 4] = (uint8_t)(b >> 14) | (uint8_t)(c << 4); + out[9 * i + 5] = (uint8_t)(c >> 4); + out[9 * i + 6] = (uint8_t)(c >> 12) | (uint8_t)(d << 6); + out[9 * i + 7] = (uint8_t)(d >> 2); + out[9 * i + 8] = (uint8_t)(d >> 10); + } +} + +// FIPS 204, Algorithm 17 (`BitPack`). 
+inline void scalar_encode_signed(uint8_t *out, const scalar *s, int bits, + uint32_t max) { + if (bits == 3) { + assert(max == 2); + scalar_encode_signed_3_2(out, s); + } else if (bits == 4) { + assert(max == 4); + scalar_encode_signed_4_4(out, s); + } else if (bits == 20) { + assert(max == 1u << 19); + scalar_encode_signed_20_19(out, s); + } else if (bits == 18) { + assert(max == 1u << 17); + scalar_encode_signed_18_17(out, s); + } else { + assert(bits == 13); + assert(max == 1u << 12); + scalar_encode_signed_13_12(out, s); + } +} + +// FIPS 204, Algorithm 18 (`SimpleBitUnpack`). Specialized for bitlen(b) == 10. +inline void scalar_decode_10(scalar *out, const uint8_t in[320]) { + static_assert(kDegree % 4 == 0, "kDegree must be a multiple of 4"); + for (int i = 0; i < kDegree / 4; i++) { + uint32_t v = CRYPTO_load_u32_le(&in[5 * i]); + out->c[4 * i] = v & 0x3ff; + out->c[4 * i + 1] = (v >> 10) & 0x3ff; + out->c[4 * i + 2] = (v >> 20) & 0x3ff; + out->c[4 * i + 3] = (v >> 30) | (((uint32_t)in[5 * i + 4]) << 2); + } +} + +// FIPS 204, Algorithm 19 (`BitUnpack`). Specialized to bitlen(a+b) = 4 and b = +// 4. +inline int scalar_decode_signed_4_4(scalar *out, const uint8_t in[128]) { + static_assert(kDegree % 8 == 0, "kDegree must be a multiple of 8"); + for (int i = 0; i < kDegree / 8; i++) { + uint32_t v = CRYPTO_load_u32_le(&in[4 * i]); + // None of the nibbles may be >= 9. So if the MSB of any nibble is set, none + // of the other bits may be set. First, select all the MSBs. + const uint32_t msbs = v & 0x88888888u; + // For each nibble where the MSB is set, form a mask of all the other bits. + const uint32_t mask = (msbs >> 1) | (msbs >> 2) | (msbs >> 3); + // A nibble is only out of range in the case of invalid input, in which case + // it is okay to leak the value. 
+ if (constant_time_declassify_int((mask & v) != 0)) { + return 0; + } + + out->c[i * 8] = mod_sub(4, v & 15); + out->c[i * 8 + 1] = mod_sub(4, (v >> 4) & 15); + out->c[i * 8 + 2] = mod_sub(4, (v >> 8) & 15); + out->c[i * 8 + 3] = mod_sub(4, (v >> 12) & 15); + out->c[i * 8 + 4] = mod_sub(4, (v >> 16) & 15); + out->c[i * 8 + 5] = mod_sub(4, (v >> 20) & 15); + out->c[i * 8 + 6] = mod_sub(4, (v >> 24) & 15); + out->c[i * 8 + 7] = mod_sub(4, v >> 28); + } + return 1; +} + +// FIPS 204, Algorithm 19 (`BitUnpack`). Specialized to bitlen(a+b) = 3 and b = +// 2. +inline int scalar_decode_signed_3_2(scalar *out, const uint8_t in[96]) { + uint32_t v; + uint8_t v_bytes[sizeof(v)] = {0}; + static_assert(kDegree % 8 == 0, "kDegree must be a multiple of 8"); + for (int i = 0; i < kDegree / 8; i++) { + OPENSSL_memcpy(v_bytes, &in[3 * i], 3); + v = CRYPTO_load_u32_le(v_bytes); + // v contains 8, 3-bit values in the lower 24 bits. None of the values may + // be >= 5. So if the MSB of any triple is set, none of the other bits may + // be set. First, select all the MSBs. + const uint32_t msbs = v & 000044444444u; + // For each triple where the MSB is set, form a mask of all the other bits. + const uint32_t mask = (msbs >> 1) | (msbs >> 2); + // A triple is only out of range in the case of invalid input, in which case + // it is okay to leak the value. + if (constant_time_declassify_int((mask & v) != 0)) { + return 0; + } + + out->c[i * 8 + 0] = mod_sub(2, (v >> 0) & 7); + out->c[i * 8 + 1] = mod_sub(2, (v >> 3) & 7); + out->c[i * 8 + 2] = mod_sub(2, (v >> 6) & 7); + out->c[i * 8 + 3] = mod_sub(2, (v >> 9) & 7); + out->c[i * 8 + 4] = mod_sub(2, (v >> 12) & 7); + out->c[i * 8 + 5] = mod_sub(2, (v >> 15) & 7); + out->c[i * 8 + 6] = mod_sub(2, (v >> 18) & 7); + out->c[i * 8 + 7] = mod_sub(2, v >> 21); + } + return 1; +} + +// FIPS 204, Algorithm 19 (`BitUnpack`). Specialized to bitlen(a+b) = 13 and b = +// 2^12. 
+inline void scalar_decode_signed_13_12(scalar *out, const uint8_t in[416]) { + static const uint32_t kMax = 1u << 12; + static const uint32_t k13Bits = (1u << 13) - 1; + static const uint32_t k7Bits = (1u << 7) - 1; + + static_assert(kDegree % 8 == 0, "kDegree must be a multiple of 8"); + for (int i = 0; i < kDegree / 8; i++) { + uint32_t a = CRYPTO_load_u32_le(&in[13 * i]); + uint32_t b = CRYPTO_load_u32_le(&in[13 * i + 4]); + uint32_t c = CRYPTO_load_u32_le(&in[13 * i + 8]); + uint8_t d = in[13 * i + 12]; + + // It's not possible for a 13-bit number to be out of range when the max is + // 2^12. + out->c[i * 8] = mod_sub(kMax, a & k13Bits); + out->c[i * 8 + 1] = mod_sub(kMax, (a >> 13) & k13Bits); + out->c[i * 8 + 2] = mod_sub(kMax, (a >> 26) | ((b & k7Bits) << 6)); + out->c[i * 8 + 3] = mod_sub(kMax, (b >> 7) & k13Bits); + out->c[i * 8 + 4] = mod_sub(kMax, (b >> 20) | ((c & 1) << 12)); + out->c[i * 8 + 5] = mod_sub(kMax, (c >> 1) & k13Bits); + out->c[i * 8 + 6] = mod_sub(kMax, (c >> 14) & k13Bits); + out->c[i * 8 + 7] = mod_sub(kMax, (c >> 27) | ((uint32_t)d) << 5); + } +} + +// FIPS 204, Algorithm 19 (`BitUnpack`). Specialized to bitlen(a+b) = 18 and b = +// 2^17. 
+inline void scalar_decode_signed_18_17(scalar *out, const uint8_t in[576]) { + static const uint32_t kMax = 1u << 17; + + static_assert(kDegree % 4 == 0, "kDegree must be a multiple of 4"); + for (int i = 0; i < kDegree / 4; i++) { + uint32_t a = uint32_t{in[9 * i]} | (uint32_t{in[9 * i + 1]} << 8) | + ((uint32_t{in[9 * i + 2]} & 0x3) << 16); + uint32_t b = (uint32_t{in[9 * i + 2]} >> 2) | + (uint32_t{in[9 * i + 3]} << 6) | + ((uint32_t{in[9 * i + 4]} & 0xf) << 14); + uint32_t c = (uint32_t{in[9 * i + 4]} >> 4) | + (uint32_t{in[9 * i + 5]} << 4) | + ((uint32_t{in[9 * i + 6]} & 0x3f) << 12); + uint32_t d = (uint32_t{in[9 * i + 6]} >> 6) | + (uint32_t{in[9 * i + 7]} << 2) | + (uint32_t{in[9 * i + 8]} << 10); + + out->c[i * 4] = mod_sub(kMax, a); + out->c[i * 4 + 1] = mod_sub(kMax, b); + out->c[i * 4 + 2] = mod_sub(kMax, c); + out->c[i * 4 + 3] = mod_sub(kMax, d); + } +} + +// FIPS 204, Algorithm 19 (`BitUnpack`). Specialized to bitlen(a+b) = 20 and b = +// 2^19. +inline void scalar_decode_signed_20_19(scalar *out, const uint8_t in[640]) { + static const uint32_t kMax = 1u << 19; + static const uint32_t k20Bits = (1u << 20) - 1; + + static_assert(kDegree % 4 == 0, "kDegree must be a multiple of 4"); + for (int i = 0; i < kDegree / 4; i++) { + uint32_t a = CRYPTO_load_u32_le(&in[10 * i]); + uint32_t b = CRYPTO_load_u32_le(&in[10 * i + 4]); + uint16_t c = CRYPTO_load_u16_le(&in[10 * i + 8]); + + // It's not possible for a 20-bit number to be out of range when the max is + // 2^19. + out->c[i * 4] = mod_sub(kMax, a & k20Bits); + out->c[i * 4 + 1] = mod_sub(kMax, (a >> 20) | ((b & 0xff) << 12)); + out->c[i * 4 + 2] = mod_sub(kMax, (b >> 8) & k20Bits); + out->c[i * 4 + 3] = mod_sub(kMax, (b >> 28) | ((uint32_t)c) << 4); + } +} + +// FIPS 204, Algorithm 19 (`BitUnpack`). 
+inline int scalar_decode_signed(scalar *out, const uint8_t *in, int bits, + uint32_t max) { + if (bits == 3) { + assert(max == 2); + return scalar_decode_signed_3_2(out, in); + } else if (bits == 4) { + assert(max == 4); + return scalar_decode_signed_4_4(out, in); + } else if (bits == 13) { + assert(max == (1u << 12)); + scalar_decode_signed_13_12(out, in); + return 1; + } else if (bits == 18) { + assert(max == (1u << 17)); + scalar_decode_signed_18_17(out, in); + return 1; + } else if (bits == 20) { + assert(max == (1u << 19)); + scalar_decode_signed_20_19(out, in); + return 1; + } else { + abort(); + } +} + +/* Expansion functions */ + +// FIPS 204, Algorithm 30 (`RejNTTPoly`). +// +// Rejection samples a Keccak stream to get uniformly distributed elements. This +// is used for matrix expansion and only operates on public inputs. +inline void scalar_from_keccak_vartime( + scalar *out, const uint8_t derived_seed[kRhoBytes + 2]) { + BORINGSSL_keccak_st keccak_ctx; + BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake128); + BORINGSSL_keccak_absorb(&keccak_ctx, derived_seed, kRhoBytes + 2); + assert(keccak_ctx.squeeze_offset == 0); + assert(keccak_ctx.rate_bytes == 168); + static_assert(168 % 3 == 0, "block and coefficient boundaries do not align"); + + int done = 0; + while (done < kDegree) { + uint8_t block[168]; + BORINGSSL_keccak_squeeze(&keccak_ctx, block, sizeof(block)); + for (size_t i = 0; i < sizeof(block) && done < kDegree; i += 3) { + // FIPS 204, Algorithm 14 (`CoeffFromThreeBytes`). 
+ uint32_t value = (uint32_t)block[i] | ((uint32_t)block[i + 1] << 8) | + (((uint32_t)block[i + 2] & 0x7f) << 16); + if (value < kPrime) { + out->c[done++] = value; + } + } + } +} + +template +inline bool coefficient_from_nibble(uint32_t nibble, uint32_t *result); + +template <> +inline bool coefficient_from_nibble<4>(uint32_t nibble, uint32_t *result) { + if (constant_time_declassify_int(nibble < 9)) { + // Knowing bounds on |nibble| seems to tempt some versions of Clang to emit + // a branch, if we don't have a barrier in |mod_sub|. + *result = mod_sub(4, value_barrier_u32(nibble)); + return true; + } + return false; +} + +template <> +inline bool coefficient_from_nibble<2>(uint32_t nibble, uint32_t *result) { + if (constant_time_declassify_int(nibble < 15)) { + // Knowing bounds on |nibble| seems to tempt some versions of Clang to emit + // a branch, if we don't have a barrier in |mod_sub|. + // Constant time "nibble % 5". + nibble = nibble - 5 * ((205 * nibble) >> 10); + *result = mod_sub(2, value_barrier_u32(nibble)); + return true; + } + return false; +} + +// FIPS 204, Algorithm 31 (`RejBoundedPoly`). +template +inline void scalar_uniform(scalar *out, + const uint8_t derived_seed[kSigmaBytes + 2]) { + BORINGSSL_keccak_st keccak_ctx; + BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256); + BORINGSSL_keccak_absorb(&keccak_ctx, derived_seed, kSigmaBytes + 2); + assert(keccak_ctx.squeeze_offset == 0); + assert(keccak_ctx.rate_bytes == 136); + + int done = 0; + while (done < kDegree) { + uint8_t block[136]; + BORINGSSL_keccak_squeeze(&keccak_ctx, block, sizeof(block)); + for (size_t i = 0; i < sizeof(block) && done < kDegree; ++i) { + uint32_t t0 = block[i] & 0x0F; + uint32_t t1 = block[i] >> 4; + // FIPS 204, Algorithm 15 (`CoefFromHalfByte`). Although both the input + // and output here are secret, it is OK to leak when we rejected a byte. 
+ // Individual bytes of the SHAKE-256 stream are (indistinguishable from) + // independent of each other and the original seed, so leaking information + // about the rejected bytes does not reveal the input or output. + uint32_t v; + if (coefficient_from_nibble(t0, &v)) { + out->c[done++] = v; + } + if (done < kDegree && coefficient_from_nibble(t1, &v)) { + out->c[done++] = v; + } + } + } +} + +// FIPS 204, Algorithm 34 (`ExpandMask`), but just a single step. +template +inline void scalar_sample_mask(scalar *out, + const uint8_t derived_seed[kRhoPrimeBytes + 2]) { + uint8_t buf[scalar_le_gamma1_bytes()]; + BORINGSSL_keccak(buf, sizeof(buf), derived_seed, kRhoPrimeBytes + 2, + boringssl_shake256); + + scalar_decode_signed(out, buf, gamma1_bits() + 1, gamma1()); +} + +// FIPS 204, Algorithm 29 (`SampleInBall`). +inline void scalar_sample_in_ball_vartime(scalar *out, const uint8_t *seed, + int len, int tau) { + BORINGSSL_keccak_st keccak_ctx; + BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256); + BORINGSSL_keccak_absorb(&keccak_ctx, seed, len); + assert(keccak_ctx.squeeze_offset == 0); + assert(keccak_ctx.rate_bytes == 136); + + uint8_t block[136]; + BORINGSSL_keccak_squeeze(&keccak_ctx, block, sizeof(block)); + + uint64_t signs = CRYPTO_load_u64_le(block); + int offset = 8; + // SampleInBall implements a Fisher–Yates shuffle, which unavoidably leaks + // where the zeros are by memory access pattern. Although this leak happens + // before bad signatures are rejected, this is safe. See + // https://boringssl-review.googlesource.com/c/boringssl/+/67747/comment/8d8f01ac_70af3f21/ + CONSTTIME_DECLASSIFY(block + offset, sizeof(block) - offset); + + OPENSSL_memset(out, 0, sizeof(*out)); + for (size_t i = kDegree - tau; i < kDegree; i++) { + size_t byte; + for (;;) { + if (offset == 136) { + BORINGSSL_keccak_squeeze(&keccak_ctx, block, sizeof(block)); + // See above. 
+ CONSTTIME_DECLASSIFY(block, sizeof(block)); + offset = 0; + } + + byte = block[offset++]; + if (byte <= i) { + break; + } + } + + out->c[i] = out->c[byte]; + out->c[byte] = mod_sub(1, 2 * (signs & 1)); + signs >>= 1; + } +} + +// FIPS 204, Algorithm 32 (`ExpandA`). +template +inline void matrix_expand(matrix *out, const uint8_t rho[kRhoBytes]) { + static_assert(K <= 0x100, "K must fit in 8 bits"); + static_assert(L <= 0x100, "L must fit in 8 bits"); + + uint8_t derived_seed[kRhoBytes + 2]; + OPENSSL_memcpy(derived_seed, rho, kRhoBytes); + for (int i = 0; i < K; i++) { + for (int j = 0; j < L; j++) { + derived_seed[kRhoBytes + 1] = (uint8_t)i; + derived_seed[kRhoBytes] = (uint8_t)j; + scalar_from_keccak_vartime(&out->v[i][j], derived_seed); + } + } +} + +// FIPS 204, Algorithm 33 (`ExpandS`). +template +inline void vector_expand_short(vector *s1, vector *s2, + const uint8_t sigma[kSigmaBytes]) { + static_assert(K <= 0x100, "K must fit in 8 bits"); + static_assert(L <= 0x100, "L must fit in 8 bits"); + static_assert(K + L <= 0x100, "K+L must fit in 8 bits"); + + uint8_t derived_seed[kSigmaBytes + 2]; + OPENSSL_memcpy(derived_seed, sigma, kSigmaBytes); + derived_seed[kSigmaBytes] = 0; + derived_seed[kSigmaBytes + 1] = 0; + for (int i = 0; i < L; i++) { + scalar_uniform()>(&s1->v[i], derived_seed); + ++derived_seed[kSigmaBytes]; + } + for (int i = 0; i < K; i++) { + scalar_uniform()>(&s2->v[i], derived_seed); + ++derived_seed[kSigmaBytes]; + } +} + +// FIPS 204, Algorithm 34 (`ExpandMask`). 
+template +inline void vector_expand_mask(vector *out, + const uint8_t seed[kRhoPrimeBytes], + size_t kappa) { + assert(kappa + L <= 0x10000); + + uint8_t derived_seed[kRhoPrimeBytes + 2]; + OPENSSL_memcpy(derived_seed, seed, kRhoPrimeBytes); + for (int i = 0; i < L; i++) { + size_t index = kappa + i; + derived_seed[kRhoPrimeBytes] = index & 0xFF; + derived_seed[kRhoPrimeBytes + 1] = (index >> 8) & 0xFF; + scalar_sample_mask(&out->v[i], derived_seed); + } +} + +/* Encoding */ + +// FIPS 204, Algorithm 16 (`SimpleBitPack`). +// +// Encodes an entire vector into 32*K*|bits| bytes. Note that since 256 +// (kDegree) is divisible by 8, the individual vector entries will always fill a +// whole number of bytes, so we do not need to worry about bit packing here. +template +inline void vector_encode(uint8_t *out, const vector *a, int bits) { + if (bits == 4) { + for (int i = 0; i < K; i++) { + scalar_encode_4(out + i * bits * kDegree / 8, &a->v[i]); + } + } else if (bits == 6) { + for (int i = 0; i < K; i++) { + scalar_encode_6(out + i * bits * kDegree / 8, &a->v[i]); + } + } else { + assert(bits == 10); + for (int i = 0; i < K; i++) { + scalar_encode_10(out + i * bits * kDegree / 8, &a->v[i]); + } + } +} + +// FIPS 204, Algorithm 18 (`SimpleBitUnpack`). +template +inline void vector_decode_10(vector *out, const uint8_t *in) { + for (int i = 0; i < K; i++) { + scalar_decode_10(&out->v[i], in + i * 10 * kDegree / 8); + } +} + +// FIPS 204, Algorithm 17 (`BitPack`). +// +// Encodes an entire vector into 32*L*|bits| bytes. Note that since 256 +// (kDegree) is divisible by 8, the individual vector entries will always fill a +// whole number of bytes, so we do not need to worry about bit packing here. 
+template +inline void vector_encode_signed(uint8_t *out, const vector *a, int bits, + uint32_t max) { + for (int i = 0; i < X; i++) { + scalar_encode_signed(out + i * bits * kDegree / 8, &a->v[i], bits, max); + } +} + +template +inline int vector_decode_signed(vector *out, const uint8_t *in, int bits, + uint32_t max) { + for (int i = 0; i < X; i++) { + if (!scalar_decode_signed(&out->v[i], in + i * bits * kDegree / 8, bits, + max)) { + return 0; + } + } + return 1; +} + +// FIPS 204, Algorithm 28 (`w1Encode`). +template +inline void w1_encode(uint8_t out[w1_bytes()], const vector *w1) { + vector_encode(out, w1, w1_coeffs_bits()); +} + +// FIPS 204, Algorithm 20 (`HintBitPack`). +template +inline void hint_bit_pack(uint8_t out[omega() + K], const vector *h) { + OPENSSL_memset(out, 0, omega() + K); + int index = 0; + for (int i = 0; i < K; i++) { + for (int j = 0; j < kDegree; j++) { + if (h->v[i].c[j]) { + // h must have at most omega() non-zero coefficients. + BSSL_CHECK(index < omega()); + out[index++] = j; + } + } + out[omega() + i] = index; + } +} + +// FIPS 204, Algorithm 21 (`HintBitUnpack`). +template +inline int hint_bit_unpack(vector *h, const uint8_t in[omega() + K]) { + vector_zero(h); + int index = 0; + for (int i = 0; i < K; i++) { + const int limit = in[omega() + i]; + if (limit < index || limit > omega()) { + return 0; + } + + int last = -1; + while (index < limit) { + int byte = in[index++]; + if (last >= 0 && byte <= last) { + return 0; + } + last = byte; + static_assert(kDegree == 256, + "kDegree must be 256 for this write to be in bounds"); + h->v[i].c[byte] = 1; + } + } + for (; index < omega(); index++) { + if (in[index] != 0) { + return 0; + } + } + return 1; +} + +template +struct public_key { + uint8_t rho[kRhoBytes]; + vector t1; + // Pre-cached value(s). 
+ uint8_t public_key_hash[kTrBytes]; +}; + +template +struct private_key { + public_key pub; + uint8_t k[kKBytes]; + vector s1; + vector s2; + vector t0; +}; + +template +struct signature { + uint8_t c_tilde[2 * lambda_bytes()]; + vector z; + vector h; +}; + +// FIPS 204, Algorithm 22 (`pkEncode`). +template +inline int mldsa_marshal_public_key(CBB *out, const public_key *pub) { + if (!CBB_add_bytes(out, pub->rho, sizeof(pub->rho))) { + return 0; + } + + uint8_t *vectork_output; + if (!CBB_add_space(out, &vectork_output, 320 * K)) { + return 0; + } + vector_encode(vectork_output, &pub->t1, 10); + + return 1; +} + +// FIPS 204, Algorithm 23 (`pkDecode`). +template +inline int mldsa_parse_public_key(public_key *pub, CBS *in) { + const CBS orig_in = *in; + + if (!CBS_copy_bytes(in, pub->rho, sizeof(pub->rho))) { + return 0; + } + + CBS t1_bytes; + if (!CBS_get_bytes(in, &t1_bytes, 320 * K) || CBS_len(in) != 0) { + return 0; + } + vector_decode_10(&pub->t1, CBS_data(&t1_bytes)); + + // Compute pre-cached values. + BORINGSSL_keccak(pub->public_key_hash, sizeof(pub->public_key_hash), + CBS_data(&orig_in), CBS_len(&orig_in), boringssl_shake256); + + return 1; +} + +// FIPS 204, Algorithm 24 (`skEncode`). 
+template +inline int mldsa_marshal_private_key(CBB *out, const private_key *priv) { + if (!CBB_add_bytes(out, priv->pub.rho, sizeof(priv->pub.rho)) || + !CBB_add_bytes(out, priv->k, sizeof(priv->k)) || + !CBB_add_bytes(out, priv->pub.public_key_hash, + sizeof(priv->pub.public_key_hash))) { + return 0; + } + + constexpr size_t scalar_bytes = + (kDegree * plus_minus_eta_bitlen() + 7) / 8; + uint8_t *vectorl_output; + if (!CBB_add_space(out, &vectorl_output, scalar_bytes * L)) { + return 0; + } + vector_encode_signed(vectorl_output, &priv->s1, plus_minus_eta_bitlen(), + eta()); + + uint8_t *s2_output; + if (!CBB_add_space(out, &s2_output, scalar_bytes * K)) { + return 0; + } + vector_encode_signed(s2_output, &priv->s2, plus_minus_eta_bitlen(), + eta()); + + uint8_t *t0_output; + if (!CBB_add_space(out, &t0_output, 416 * K)) { + return 0; + } + vector_encode_signed(t0_output, &priv->t0, 13, 1 << 12); + + return 1; +} + +// FIPS 204, Algorithm 25 (`skDecode`). This is only used for testing. The +// supported external way to construct ML-DSA keys is to use the input seed. +template +inline int mldsa_parse_private_key(private_key *priv, CBS *in) { + CBS public_key_hash, s1_bytes, s2_bytes, t0_bytes; + constexpr size_t scalar_bytes = + (kDegree * plus_minus_eta_bitlen() + 7) / 8; + if (!CBS_copy_bytes(in, priv->pub.rho, sizeof(priv->pub.rho)) || + !CBS_copy_bytes(in, priv->k, sizeof(priv->k)) || + !CBS_get_bytes(in, &public_key_hash, kTrBytes) || + !CBS_get_bytes(in, &s1_bytes, scalar_bytes * L) || + !vector_decode_signed(&priv->s1, CBS_data(&s1_bytes), + plus_minus_eta_bitlen(), eta()) || + !CBS_get_bytes(in, &s2_bytes, scalar_bytes * K) || + !vector_decode_signed(&priv->s2, CBS_data(&s2_bytes), + plus_minus_eta_bitlen(), eta()) || + !CBS_get_bytes(in, &t0_bytes, 416 * K) || + // Note: Decoding 13 bits into (-2^12, 2^12] cannot fail. 
+ !vector_decode_signed(&priv->t0, CBS_data(&t0_bytes), 13, 1 << 12)) { + return 0; + } + + // Compute `t1`, which is not in the `skDecode` input. + uint8_t unused[public_key_bytes()]; + if (!mldsa_finish_keygen(unused, priv)) { + return 0; + } + + // As a side effect of computing `t1`, we also compute `t0` and + // `public_key_hash`. Check they match the received bytes. + uint8_t t0_computed[416 * K]; + vector_encode_signed(t0_computed, &priv->t0, 13, 1 << 12); + if (!CBS_mem_equal(&public_key_hash, priv->pub.public_key_hash, + sizeof(priv->pub.public_key_hash)) || + !CBS_mem_equal(&t0_bytes, t0_computed, sizeof(t0_computed))) { + return 0; + } + + return 1; +} + +// FIPS 204, Algorithm 26 (`sigEncode`). +template +inline int mldsa_marshal_signature(CBB *out, const signature *sign) { + if (!CBB_add_bytes(out, sign->c_tilde, sizeof(sign->c_tilde))) { + return 0; + } + + uint8_t *vectorl_output; + if (!CBB_add_space(out, &vectorl_output, scalar_le_gamma1_bytes() * L)) { + return 0; + } + vector_encode_signed(vectorl_output, &sign->z, gamma1_bits() + 1, + gamma1()); + + uint8_t *hint_output; + if (!CBB_add_space(out, &hint_output, omega() + K)) { + return 0; + } + hint_bit_pack(hint_output, &sign->h); + + return 1; +} + +// FIPS 204, Algorithm 27 (`sigDecode`). +template +inline int mldsa_parse_signature(signature *sign, CBS *in) { + CBS z_bytes; + CBS hint_bytes; + if (!CBS_copy_bytes(in, sign->c_tilde, sizeof(sign->c_tilde)) || + !CBS_get_bytes(in, &z_bytes, scalar_le_gamma1_bytes() * L) || + // Note: Decoding b+1 bits into (-2^b, 2^b] cannot fail. + !vector_decode_signed(&sign->z, CBS_data(&z_bytes), gamma1_bits() + 1, + gamma1()) || + !CBS_get_bytes(in, &hint_bytes, omega() + K) || + !hint_bit_unpack(&sign->h, CBS_data(&hint_bytes))) { + return 0; + }; + + return 1; +} + +// FIPS 204, Algorithm 6 (`ML-DSA.KeyGen_internal`), steps 3 and 5–11. +// Returns 1 on success and 0 on failure. 
+template +inline int mldsa_finish_keygen( + uint8_t out_encoded_public_key[public_key_bytes()], + private_key *priv) { + // Intermediate values, allocated on the heap to allow use when there is a + // limited amount of stack. + struct Values { + enum { kAllowUniquePtr = true }; + matrix a_ntt; + vector s1_ntt; + vector t; + }; + auto values = MakeUnique(); + if (values == nullptr) { + return 0; + } + + // Step 3. + matrix_expand(&values->a_ntt, priv->pub.rho); + + // Step 5. + OPENSSL_memcpy(&values->s1_ntt, &priv->s1, sizeof(values->s1_ntt)); + vector_ntt(&values->s1_ntt); + + matrix_mult(&values->t, &values->a_ntt, &values->s1_ntt); + vector_inverse_ntt(&values->t); + vector_add(&values->t, &values->t, &priv->s2); + + // Step 6-7. + vector_power2_round(&priv->pub.t1, &priv->t0, &values->t); + // t1 is public. + CONSTTIME_DECLASSIFY(&priv->pub.t1, sizeof(priv->pub.t1)); + + // Step 8. + CBB cbb; + CBB_init_fixed(&cbb, out_encoded_public_key, public_key_bytes()); + if (!mldsa_marshal_public_key(&cbb, &priv->pub)) { + return 0; + } + assert(CBB_len(&cbb) == public_key_bytes()); + + // Step 9-11. + BORINGSSL_keccak(priv->pub.public_key_hash, sizeof(priv->pub.public_key_hash), + out_encoded_public_key, public_key_bytes(), + boringssl_shake256); + + return 1; +} + +// FIPS 204, Algorithm 6 (`ML-DSA.KeyGen_internal`). Returns 1 on success and 0 +// on failure. +template +inline int mldsa_generate_key_external_entropy_no_self_test( + uint8_t out_encoded_public_key[public_key_bytes()], + private_key *priv, const uint8_t entropy[MLDSA_SEED_BYTES]) { + // Step 1-2. + uint8_t augmented_entropy[MLDSA_SEED_BYTES + 2]; + OPENSSL_memcpy(augmented_entropy, entropy, MLDSA_SEED_BYTES); + // The k and l parameters are appended to the seed. 
+ augmented_entropy[MLDSA_SEED_BYTES] = K; + augmented_entropy[MLDSA_SEED_BYTES + 1] = L; + uint8_t expanded_seed[kRhoBytes + kSigmaBytes + kKBytes]; + BORINGSSL_keccak(expanded_seed, sizeof(expanded_seed), augmented_entropy, + sizeof(augmented_entropy), boringssl_shake256); + const uint8_t *const rho = expanded_seed; + const uint8_t *const sigma = expanded_seed + kRhoBytes; + const uint8_t *const k = expanded_seed + kRhoBytes + kSigmaBytes; + // rho is public. + CONSTTIME_DECLASSIFY(rho, kRhoBytes); + OPENSSL_memcpy(priv->pub.rho, rho, sizeof(priv->pub.rho)); + OPENSSL_memcpy(priv->k, k, sizeof(priv->k)); + // Step 4. This is independent of A (step 3) and can be done first. + vector_expand_short(&priv->s1, &priv->s2, sigma); + // Steps 3 and 5-11. + return mldsa_finish_keygen(out_encoded_public_key, priv); +} + +template +inline int mldsa_generate_key_external_entropy( + uint8_t out_encoded_public_key[public_key_bytes()], + private_key *priv, const uint8_t entropy[MLDSA_SEED_BYTES]) { + fips::ensure_keygen_self_test(); + return mldsa_generate_key_external_entropy_no_self_test( + out_encoded_public_key, priv, entropy); +} + +// FIPS 204, Algorithm 7 (`ML-DSA.Sign_internal`), using a pre-computed mu. +// Returns 1 on success and 0 on failure. 
+template +inline int mldsa_sign_mu_no_self_test( + uint8_t out_encoded_signature[signature_bytes()], + const private_key *priv, const uint8_t mu[kMuBytes], + const uint8_t randomizer[BCM_MLDSA_SIGNATURE_RANDOMIZER_BYTES]) { + uint8_t rho_prime[kRhoPrimeBytes]; + BORINGSSL_keccak_st keccak_ctx; + BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256); + BORINGSSL_keccak_absorb(&keccak_ctx, priv->k, sizeof(priv->k)); + BORINGSSL_keccak_absorb(&keccak_ctx, randomizer, + BCM_MLDSA_SIGNATURE_RANDOMIZER_BYTES); + BORINGSSL_keccak_absorb(&keccak_ctx, mu, kMuBytes); + BORINGSSL_keccak_squeeze(&keccak_ctx, rho_prime, kRhoPrimeBytes); + + // Intermediate values, allocated on the heap to allow use when there is a + // limited amount of stack. + struct Values { + enum { kAllowUniquePtr = true }; + signature sign; + vector s1_ntt; + vector s2_ntt; + vector t0_ntt; + matrix a_ntt; + vector y; + vector w; + vector w1; + vector cs1; + vector cs2; + }; + auto values = MakeUnique(); + if (values == nullptr) { + return 0; + } + OPENSSL_memcpy(&values->s1_ntt, &priv->s1, sizeof(values->s1_ntt)); + vector_ntt(&values->s1_ntt); + + OPENSSL_memcpy(&values->s2_ntt, &priv->s2, sizeof(values->s2_ntt)); + vector_ntt(&values->s2_ntt); + + OPENSSL_memcpy(&values->t0_ntt, &priv->t0, sizeof(values->t0_ntt)); + vector_ntt(&values->t0_ntt); + + matrix_expand(&values->a_ntt, priv->pub.rho); + + // kappa must not exceed 2**16/L = 13107. But the probability of it + // exceeding even 1000 iterations is vanishingly small. 
+ for (size_t kappa = 0;; kappa += L) { + vector_expand_mask(&values->y, rho_prime, kappa); + + vector *y_ntt = &values->cs1; + OPENSSL_memcpy(y_ntt, &values->y, sizeof(*y_ntt)); + vector_ntt(y_ntt); + + matrix_mult(&values->w, &values->a_ntt, y_ntt); + vector_inverse_ntt(&values->w); + + vector_high_bits(&values->w1, &values->w); + uint8_t w1_encoded[w1_bytes()]; + w1_encode(w1_encoded, &values->w1); + + BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256); + BORINGSSL_keccak_absorb(&keccak_ctx, mu, kMuBytes); + BORINGSSL_keccak_absorb(&keccak_ctx, w1_encoded, w1_bytes()); + BORINGSSL_keccak_squeeze(&keccak_ctx, values->sign.c_tilde, + 2 * lambda_bytes()); + + scalar c_ntt; + scalar_sample_in_ball_vartime(&c_ntt, values->sign.c_tilde, + sizeof(values->sign.c_tilde), tau()); + scalar_ntt(&c_ntt); + + vector_mult_scalar(&values->cs1, &values->s1_ntt, &c_ntt); + vector_inverse_ntt(&values->cs1); + vector_mult_scalar(&values->cs2, &values->s2_ntt, &c_ntt); + vector_inverse_ntt(&values->cs2); + + vector_add(&values->sign.z, &values->y, &values->cs1); + + vector *r0 = &values->w1; + vector_sub(r0, &values->w, &values->cs2); + vector_low_bits(r0, r0); + + // Leaking the fact that a signature was rejected is fine as the next + // attempt at a signature will be (indistinguishable from) independent of + // this one. Note, however, that we additionally leak which of the two + // branches rejected the signature. Section 5.5 of + // https://pq-crystals.org/dilithium/data/dilithium-specification-round3.pdf + // describes this leak as OK. Note we leak less than what is described by + // the paper; we do not reveal which coefficient violated the bound, and + // we hide which of the |z_max| or |r0_max| bound failed. 
See also + // https://boringssl-review.googlesource.com/c/boringssl/+/67747/comment/2bbab0fa_d241d35a/ + uint32_t z_max = vector_max(&values->sign.z); + uint32_t r0_max = vector_max_signed(r0); + if (constant_time_declassify_w( + constant_time_ge_w(z_max, gamma1() - beta()) | + constant_time_ge_w(r0_max, gamma2() - beta()))) { +#if defined(BORINGSSL_FIPS_BREAK_TESTS) + // In order to show that our self-tests trigger both restart cases in + // this loop, printf-logging is added when built in break-test mode. + printf("MLDSA signature restart case 1.\n"); +#endif + continue; + } + + vector *ct0 = &values->w1; + vector_mult_scalar(ct0, &values->t0_ntt, &c_ntt); + vector_inverse_ntt(ct0); + vector_make_hint(&values->sign.h, ct0, &values->cs2, &values->w); + + // See above. + uint32_t ct0_max = vector_max(ct0); + size_t h_ones = vector_count_ones(&values->sign.h); + if (constant_time_declassify_w(constant_time_ge_w(ct0_max, gamma2()) | + constant_time_lt_w(omega(), h_ones))) { +#if defined(BORINGSSL_FIPS_BREAK_TESTS) + // In order to show that our self-tests trigger both restart cases in + // this loop, printf-logging is added when built in break-test mode. + printf("MLDSA signature restart case 2.\n"); +#endif + continue; + } + + // Although computed with the private key, the signature is public. + CONSTTIME_DECLASSIFY(values->sign.c_tilde, sizeof(values->sign.c_tilde)); + CONSTTIME_DECLASSIFY(&values->sign.z, sizeof(values->sign.z)); + CONSTTIME_DECLASSIFY(&values->sign.h, sizeof(values->sign.h)); + + CBB cbb; + CBB_init_fixed(&cbb, out_encoded_signature, signature_bytes()); + if (!mldsa_marshal_signature(&cbb, &values->sign)) { + return 0; + } + + BSSL_CHECK(CBB_len(&cbb) == signature_bytes()); + return 1; + } +} + +// FIPS 204, Algorithm 7 (`ML-DSA.Sign_internal`), using a pre-computed mu. +// Returns 1 on success and 0 on failure. 
+template +inline int mldsa_sign_mu( + uint8_t out_encoded_signature[signature_bytes()], + const private_key *priv, const uint8_t mu[kMuBytes], + const uint8_t randomizer[BCM_MLDSA_SIGNATURE_RANDOMIZER_BYTES]) { + fips::ensure_sign_self_test(); + return mldsa_sign_mu_no_self_test(out_encoded_signature, priv, mu, + randomizer); +} + +// FIPS 204, Algorithm 7 (`ML-DSA.Sign_internal`). Returns 1 on success and 0 +// on failure. +template +inline int mldsa_sign_internal_no_self_test( + uint8_t out_encoded_signature[signature_bytes()], + const private_key *priv, const uint8_t *msg, size_t msg_len, + const uint8_t *context_prefix, size_t context_prefix_len, + const uint8_t *context, size_t context_len, + const uint8_t randomizer[BCM_MLDSA_SIGNATURE_RANDOMIZER_BYTES]) { + uint8_t mu[kMuBytes]; + BORINGSSL_keccak_st keccak_ctx; + BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256); + BORINGSSL_keccak_absorb(&keccak_ctx, priv->pub.public_key_hash, + sizeof(priv->pub.public_key_hash)); + BORINGSSL_keccak_absorb(&keccak_ctx, context_prefix, context_prefix_len); + BORINGSSL_keccak_absorb(&keccak_ctx, context, context_len); + BORINGSSL_keccak_absorb(&keccak_ctx, msg, msg_len); + BORINGSSL_keccak_squeeze(&keccak_ctx, mu, kMuBytes); + + return mldsa_sign_mu_no_self_test(out_encoded_signature, priv, mu, + randomizer); +} + +// FIPS 204, Algorithm 7 (`ML-DSA.Sign_internal`). Returns 1 on success and 0 +// on failure. 
+template +inline int mldsa_sign_internal( + uint8_t out_encoded_signature[signature_bytes()], + const private_key *priv, const uint8_t *msg, size_t msg_len, + const uint8_t *context_prefix, size_t context_prefix_len, + const uint8_t *context, size_t context_len, + const uint8_t randomizer[BCM_MLDSA_SIGNATURE_RANDOMIZER_BYTES]) { + fips::ensure_sign_self_test(); + return mldsa_sign_internal_no_self_test( + out_encoded_signature, priv, msg, msg_len, context_prefix, + context_prefix_len, context, context_len, randomizer); +} + +struct prehash_context { + BORINGSSL_keccak_st keccak_ctx; +}; + +template +inline void mldsa_prehash_init(prehash_context *out_prehash_ctx, + const public_key *pub, + const uint8_t *context_prefix, + size_t context_prefix_len, + const uint8_t *context, size_t context_len) { + BORINGSSL_keccak_init(&out_prehash_ctx->keccak_ctx, boringssl_shake256); + BORINGSSL_keccak_absorb(&out_prehash_ctx->keccak_ctx, pub->public_key_hash, + sizeof(pub->public_key_hash)); + BORINGSSL_keccak_absorb(&out_prehash_ctx->keccak_ctx, context_prefix, + context_prefix_len); + BORINGSSL_keccak_absorb(&out_prehash_ctx->keccak_ctx, context, context_len); +} + +inline void mldsa_prehash_update(prehash_context *inout_prehash_ctx, + const uint8_t *msg, size_t msg_len) { + BORINGSSL_keccak_absorb(&inout_prehash_ctx->keccak_ctx, msg, msg_len); +} + +inline void mldsa_prehash_finalize(uint8_t out_msg_rep[kMuBytes], + prehash_context *inout_prehash_ctx) { + BORINGSSL_keccak_squeeze(&inout_prehash_ctx->keccak_ctx, out_msg_rep, + kMuBytes); +} + +// FIPS 204, Algorithm 8 (`ML-DSA.Verify_internal`), using a pre-computed mu. +// Returns 1 on success and 0 on failure. +template +inline int mldsa_verify_mu_no_self_test( + const public_key *pub, + const uint8_t encoded_signature[signature_bytes()], + const uint8_t mu[kMuBytes]) { + // Intermediate values, allocated on the heap to allow use when there is a + // limited amount of stack. 
+ struct Values { + enum { kAllowUniquePtr = true }; + signature sign; + matrix a_ntt; + vector z_ntt; + vector az_ntt; + vector ct1_ntt; + }; + auto values = MakeUnique(); + if (values == nullptr) { + return 0; + } + + CBS cbs; + CBS_init(&cbs, encoded_signature, signature_bytes()); + if (!mldsa_parse_signature(&values->sign, &cbs)) { + return 0; + } + + matrix_expand(&values->a_ntt, pub->rho); + + scalar c_ntt; + scalar_sample_in_ball_vartime(&c_ntt, values->sign.c_tilde, + sizeof(values->sign.c_tilde), tau()); + scalar_ntt(&c_ntt); + + OPENSSL_memcpy(&values->z_ntt, &values->sign.z, sizeof(values->z_ntt)); + vector_ntt(&values->z_ntt); + + matrix_mult(&values->az_ntt, &values->a_ntt, &values->z_ntt); + + vector_scale_power2_round(&values->ct1_ntt, &pub->t1); + vector_ntt(&values->ct1_ntt); + + vector_mult_scalar(&values->ct1_ntt, &values->ct1_ntt, &c_ntt); + + vector *const w1 = &values->az_ntt; + vector_sub(w1, &values->az_ntt, &values->ct1_ntt); + vector_inverse_ntt(w1); + + vector_use_hint_vartime(w1, &values->sign.h, w1); + uint8_t w1_encoded[w1_bytes()]; + w1_encode(w1_encoded, w1); + + uint8_t c_tilde[2 * lambda_bytes()]; + BORINGSSL_keccak_st keccak_ctx; + BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256); + BORINGSSL_keccak_absorb(&keccak_ctx, mu, kMuBytes); + BORINGSSL_keccak_absorb(&keccak_ctx, w1_encoded, w1_bytes()); + BORINGSSL_keccak_squeeze(&keccak_ctx, c_tilde, 2 * lambda_bytes()); + + uint32_t z_max = vector_max(&values->sign.z); + return z_max < static_cast(gamma1() - beta()) && + OPENSSL_memcmp(c_tilde, values->sign.c_tilde, 2 * lambda_bytes()) == + 0; +} + +// FIPS 204, Algorithm 8 (`ML-DSA.Verify_internal`), using a pre-computed mu. +// Returns 1 on success and 0 on failure. 
+template +inline int mldsa_verify_mu( + const public_key *pub, + const uint8_t encoded_signature[signature_bytes()], + const uint8_t mu[kMuBytes]) { + fips::ensure_verify_self_test(); + return mldsa_verify_mu_no_self_test(pub, encoded_signature, mu); +} + +// FIPS 204, Algorithm 8 (`ML-DSA.Verify_internal`). +template +inline int mldsa_verify_internal_no_self_test( + const public_key *pub, + const uint8_t encoded_signature[signature_bytes()], const uint8_t *msg, + size_t msg_len, const uint8_t *context_prefix, size_t context_prefix_len, + const uint8_t *context, size_t context_len) { + uint8_t mu[kMuBytes]; + BORINGSSL_keccak_st keccak_ctx; + BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake256); + BORINGSSL_keccak_absorb(&keccak_ctx, pub->public_key_hash, + sizeof(pub->public_key_hash)); + BORINGSSL_keccak_absorb(&keccak_ctx, context_prefix, context_prefix_len); + BORINGSSL_keccak_absorb(&keccak_ctx, context, context_len); + BORINGSSL_keccak_absorb(&keccak_ctx, msg, msg_len); + BORINGSSL_keccak_squeeze(&keccak_ctx, mu, kMuBytes); + + return mldsa_verify_mu_no_self_test(pub, encoded_signature, mu); +} + +// FIPS 204, Algorithm 8 (`ML-DSA.Verify_internal`). 
+template +inline int mldsa_verify_internal( + const public_key *pub, + const uint8_t encoded_signature[signature_bytes()], const uint8_t *msg, + size_t msg_len, const uint8_t *context_prefix, size_t context_prefix_len, + const uint8_t *context, size_t context_len) { + fips::ensure_verify_self_test(); + return mldsa_verify_internal_no_self_test( + pub, encoded_signature, msg, msg_len, context_prefix, context_prefix_len, + context, context_len); +} + +static_assert(sizeof(MLDSA65_private_key) == sizeof(private_key<6, 5>)); +static_assert(alignof(MLDSA65_private_key) == alignof(private_key<6, 5>)); + +inline const private_key<6, 5> *private_key_from_external_65( + const MLDSA65_private_key *external) { + return reinterpret_cast *>(external); +} +inline private_key<6, 5> *private_key_from_external_65( + MLDSA65_private_key *external) { + return reinterpret_cast *>(external); +} + +static_assert(sizeof(MLDSA65_public_key) == sizeof(public_key<6>)); +static_assert(alignof(MLDSA65_public_key) == alignof(public_key<6>)); + +inline const public_key<6> *public_key_from_external_65( + const MLDSA65_public_key *external) { + return reinterpret_cast *>(external); +} +inline public_key<6> *public_key_from_external_65( + MLDSA65_public_key *external) { + return reinterpret_cast *>(external); +} + +inline prehash_context *prehash_context_from_external_65( + MLDSA65_prehash *external) { + static_assert(sizeof(MLDSA65_prehash) == sizeof(prehash_context)); + static_assert(alignof(MLDSA65_prehash) == alignof(prehash_context)); + return reinterpret_cast(external); +} + +static_assert(sizeof(MLDSA87_private_key) == sizeof(private_key<8, 7>)); +static_assert(alignof(MLDSA87_private_key) == alignof(private_key<8, 7>)); +inline const private_key<8, 7> *private_key_from_external_87( + const MLDSA87_private_key *external) { + return reinterpret_cast *>(external); +} +inline private_key<8, 7> *private_key_from_external_87( + MLDSA87_private_key *external) { + return reinterpret_cast 
*>(external); +} + +static_assert(sizeof(MLDSA87_public_key) == sizeof(public_key<8>)); +static_assert(alignof(MLDSA87_public_key) == alignof(public_key<8>)); + +inline const public_key<8> *public_key_from_external_87( + const MLDSA87_public_key *external) { + return reinterpret_cast *>(external); +} +inline public_key<8> *public_key_from_external_87( + MLDSA87_public_key *external) { + return reinterpret_cast *>(external); +} + +inline prehash_context *prehash_context_from_external_87( + MLDSA87_prehash *external) { + static_assert(sizeof(MLDSA87_prehash) == sizeof(prehash_context)); + static_assert(alignof(MLDSA87_prehash) == alignof(prehash_context)); + return reinterpret_cast(external); +} + +static_assert(sizeof(MLDSA44_private_key) == sizeof(private_key<4, 4>)); +static_assert(alignof(MLDSA44_private_key) == alignof(private_key<4, 4>)); + +inline const private_key<4, 4> *private_key_from_external_44( + const MLDSA44_private_key *external) { + return reinterpret_cast *>(external); +} +inline private_key<4, 4> *private_key_from_external_44( + MLDSA44_private_key *external) { + return reinterpret_cast *>(external); +} + +static_assert(sizeof(MLDSA44_public_key) == sizeof(public_key<4>)); +static_assert(alignof(MLDSA44_public_key) == alignof(public_key<4>)); + +inline const public_key<4> *public_key_from_external_44( + const MLDSA44_public_key *external) { + return reinterpret_cast *>(external); +} +inline public_key<4> *public_key_from_external_44( + MLDSA44_public_key *external) { + return reinterpret_cast *>(external); +} + +inline prehash_context *prehash_context_from_external_44( + MLDSA44_prehash *external) { + static_assert(sizeof(MLDSA44_prehash) == sizeof(prehash_context)); + static_assert(alignof(MLDSA44_prehash) == alignof(prehash_context)); + return reinterpret_cast(external); +} + +namespace fips { + +#include "fips_known_values.inc" + +inline int keygen_self_test() { + struct Values { + enum { kAllowUniquePtr = true }; + private_key<6, 5> priv; + 
uint8_t pub_bytes[MLDSA65_PUBLIC_KEY_BYTES]; + uint8_t priv_bytes[BCM_MLDSA65_PRIVATE_KEY_BYTES]; + }; + auto values = MakeUnique(); + if (values == nullptr || + !mldsa_generate_key_external_entropy_no_self_test( + values->pub_bytes, &values->priv, kGenerateKeyEntropy)) { + return 0; + } + + CBB cbb; + CBB_init_fixed(&cbb, values->priv_bytes, sizeof(values->priv_bytes)); + if (!mldsa_marshal_private_key(&cbb, &values->priv)) { + return 0; + } + + if (!BORINGSSL_check_test(kExpectedPublicKey, values->pub_bytes, + "ML-DSA keygen public key") || + !BORINGSSL_check_test(kExpectedPrivateKey, values->priv_bytes, + "ML-DSA keygen private key")) { + return 0; + } + + return 1; +} + +inline int sign_self_test() { + struct Values { + enum { kAllowUniquePtr = true }; + private_key<6, 5> priv; + uint8_t pub_bytes[MLDSA65_PUBLIC_KEY_BYTES]; + uint8_t sig[MLDSA65_SIGNATURE_BYTES]; + }; + auto values = MakeUnique(); + if (values == nullptr || + !mldsa_generate_key_external_entropy(values->pub_bytes, &values->priv, + kSignEntropy)) { + return 0; + } + + const uint8_t randomizer[BCM_MLDSA_SIGNATURE_RANDOMIZER_BYTES] = {}; + + // This message triggers the first restart case for signing. + uint8_t message[4] = {0}; + if (!mldsa_sign_internal_no_self_test(values->sig, &values->priv, message, + sizeof(message), nullptr, 0, nullptr, 0, + randomizer)) { + return 0; + } + if (!BORINGSSL_check_test(kExpectedCase1Signature, values->sig, + "ML-DSA sign case 1")) { + return 0; + } + + // This message triggers the second restart case for signing. 
+ message[0] = 123; + if (!mldsa_sign_internal_no_self_test(values->sig, &values->priv, message, + sizeof(message), nullptr, 0, nullptr, 0, + randomizer)) { + return 0; + } + if (!BORINGSSL_check_test(kExpectedCase2Signature, values->sig, + "ML-DSA sign case 2")) { + return 0; + } + + return 1; +} + +inline int verify_self_test() { + struct Values { + enum { kAllowUniquePtr = true }; + private_key<6, 5> priv; + uint8_t pub_bytes[MLDSA65_PUBLIC_KEY_BYTES]; + }; + auto values = MakeUnique(); + if (values == nullptr) { + return 0; + } + + if (!mldsa_generate_key_external_entropy(values->pub_bytes, &values->priv, + kSignEntropy)) { + return 0; + } + + const uint8_t message[4] = {1, 0}; + if (!mldsa_verify_internal_no_self_test<6, 5>( + &values->priv.pub, kExpectedVerifySignature, message, sizeof(message), + nullptr, 0, nullptr, 0)) { + return 0; + } + + return 1; +} + +template +inline int check_key(const private_key *priv) { + uint8_t sig[signature_bytes()]; + uint8_t randomizer[BCM_MLDSA_SIGNATURE_RANDOMIZER_BYTES] = {}; + if (!mldsa_sign_internal_no_self_test(sig, priv, nullptr, 0, nullptr, 0, + nullptr, 0, randomizer)) { + return 0; + } + + if (boringssl_fips_break_test("MLDSA_PWCT")) { + sig[0] ^= 1; + } + + if (!mldsa_verify_internal_no_self_test(&priv->pub, sig, nullptr, 0, + nullptr, 0, nullptr, 0)) { + return 0; + } + return 1; +} + +#if defined(BORINGSSL_FIPS) + +void ensure_keygen_self_test() { + CRYPTO_once(g_mldsa_keygen_self_test_once_bss_get(), []() { + if (!keygen_self_test()) { + BORINGSSL_FIPS_abort(); + } + }); +} + +void ensure_sign_self_test() { + CRYPTO_once(g_mldsa_sign_self_test_once_bss_get(), []() { + if (!sign_self_test()) { + BORINGSSL_FIPS_abort(); + } + }); +} + +void ensure_verify_self_test() { + CRYPTO_once(g_mldsa_verify_self_test_once_bss_get(), []() { + if (!verify_self_test()) { + BORINGSSL_FIPS_abort(); + } + }); +} + +#else + +void ensure_keygen_self_test() {} +void ensure_sign_self_test() {} +void ensure_verify_self_test() {} + 
+#endif + +} // namespace fips + +} // namespace +} // namespace mldsa + + +// ML-DSA-65 specific wrappers. + +bcm_status bssl::BCM_mldsa65_parse_public_key(MLDSA65_public_key *public_key, + CBS *in) { + return bcm_as_approved_status(mldsa_parse_public_key( + mldsa::public_key_from_external_65(public_key), in)); +} + +bcm_status bssl::BCM_mldsa65_marshal_private_key( + CBB *out, const MLDSA65_private_key *private_key) { + return bcm_as_approved_status(mldsa_marshal_private_key( + out, mldsa::private_key_from_external_65(private_key))); +} + +bcm_status bssl::BCM_mldsa65_parse_private_key(MLDSA65_private_key *private_key, + CBS *in) { + return bcm_as_approved_status( + mldsa_parse_private_key(mldsa::private_key_from_external_65(private_key), + in) && + CBS_len(in) == 0); +} + +bcm_status bssl::BCM_mldsa65_check_key_fips(MLDSA65_private_key *private_key) { + return bcm_as_approved_status( + mldsa::fips::check_key(mldsa::private_key_from_external_65(private_key))); +} + +// Calls |MLDSA_generate_key_external_entropy| with random bytes from +// |BCM_rand_bytes|. 
+bcm_status bssl::BCM_mldsa65_generate_key(
+    uint8_t out_encoded_public_key[MLDSA65_PUBLIC_KEY_BYTES],
+    uint8_t out_seed[MLDSA_SEED_BYTES], MLDSA65_private_key *out_private_key) {
+  // The freshly drawn seed is handed back through |out_seed| and is also the
+  // entropy the key pair is derived from.
+  BCM_rand_bytes(out_seed, MLDSA_SEED_BYTES);
+  CONSTTIME_SECRET(out_seed, MLDSA_SEED_BYTES);
+  return BCM_mldsa65_generate_key_external_entropy(out_encoded_public_key,
+                                                   out_private_key, out_seed);
+}
+
+// Derives |out_private_key| from |seed|. The encoded public key that key
+// generation also writes goes to a local scratch buffer and is discarded.
+bcm_status bssl::BCM_mldsa65_private_key_from_seed(
+    MLDSA65_private_key *out_private_key,
+    const uint8_t seed[MLDSA_SEED_BYTES]) {
+  uint8_t public_key[MLDSA65_PUBLIC_KEY_BYTES];
+  return BCM_mldsa65_generate_key_external_entropy(public_key, out_private_key,
+                                                   seed);
+}
+
+// Generates a key pair from caller-supplied |entropy| via
+// |mldsa_generate_key_external_entropy|. Note that the result is mapped with
+// |bcm_as_not_approved_status|, not |bcm_as_approved_status|.
+bcm_status bssl::BCM_mldsa65_generate_key_external_entropy(
+    uint8_t out_encoded_public_key[MLDSA65_PUBLIC_KEY_BYTES],
+    MLDSA65_private_key *out_private_key,
+    const uint8_t entropy[MLDSA_SEED_BYTES]) {
+  return bcm_as_not_approved_status(mldsa_generate_key_external_entropy(
+      out_encoded_public_key,
+      mldsa::private_key_from_external_65(out_private_key), entropy));
+}
+
+// Generates a key pair with |BCM_mldsa65_generate_key| and then runs
+// |BCM_mldsa65_check_key_fips| on the new private key. Returns
+// |bcm_status::failure| if |out_encoded_public_key| or |out_private_key| is
+// null or if generation fails; otherwise returns the status of the key check.
+bcm_status bssl::BCM_mldsa65_generate_key_fips(
+    uint8_t out_encoded_public_key[MLDSA65_PUBLIC_KEY_BYTES],
+    uint8_t out_seed[MLDSA_SEED_BYTES], MLDSA65_private_key *out_private_key) {
+  if (out_encoded_public_key == nullptr || out_private_key == nullptr) {
+    return bcm_status::failure;
+  }
+  if (BCM_mldsa65_generate_key(out_encoded_public_key, out_seed,
+                               out_private_key) == bcm_status::failure) {
+    return bcm_status::failure;
+  }
+  return BCM_mldsa65_check_key_fips(out_private_key);
+}
+
+// Same as |BCM_mldsa65_generate_key_fips|, but the key pair is derived from
+// caller-supplied |entropy| instead of fresh random bytes.
+bcm_status bssl::BCM_mldsa65_generate_key_external_entropy_fips(
+    uint8_t out_encoded_public_key[MLDSA65_PUBLIC_KEY_BYTES],
+    MLDSA65_private_key *out_private_key,
+    const uint8_t entropy[MLDSA_SEED_BYTES]) {
+  if (out_encoded_public_key == nullptr || out_private_key == nullptr) {
+    return bcm_status::failure;
+  }
+  if (BCM_mldsa65_generate_key_external_entropy(out_encoded_public_key,
+                                                out_private_key, entropy) ==
+      bcm_status::failure) {
+    return bcm_status::failure;
+  }
+  return BCM_mldsa65_check_key_fips(out_private_key);
+}
+
+// Derives |out_private_key| from |seed| and then runs
+// |BCM_mldsa65_check_key_fips| on it. The encoded public key is written to a
+// local scratch buffer and discarded.
+bcm_status bssl::BCM_mldsa65_private_key_from_seed_fips(
+    MLDSA65_private_key *out_private_key,
+    const uint8_t seed[MLDSA_SEED_BYTES]) {
+  uint8_t public_key[MLDSA65_PUBLIC_KEY_BYTES];
+  if (BCM_mldsa65_generate_key_external_entropy(public_key, out_private_key,
+                                                seed) == bcm_status::failure) {
+    return bcm_status::failure;
+  }
+  return BCM_mldsa65_check_key_fips(out_private_key);
+}
+
+// Copies the public key embedded in |private_key| (its |pub| member) into
+// |out_public_key|. Always returns |bcm_status::approved|.
+bcm_status bssl::BCM_mldsa65_public_from_private(
+    MLDSA65_public_key *out_public_key,
+    const MLDSA65_private_key *private_key) {
+  const auto *priv = mldsa::private_key_from_external_65(private_key);
+  auto *out_pub = mldsa::public_key_from_external_65(out_public_key);
+  *out_pub = priv->pub;
+  return bcm_status::approved;
+}
+
+// Returns a pointer to the public key stored inside |private_key| without
+// copying it; the returned pointer aliases |private_key|.
+const MLDSA65_public_key *bssl::BCM_mldsa65_public_of_private(
+    const MLDSA65_private_key *private_key) {
+  // The internal |pub| member is viewed as the external public-key type.
+  return reinterpret_cast<const MLDSA65_public_key *>(
+      &mldsa::private_key_from_external_65(private_key)->pub);
+}
+
+// Signs with a caller-supplied |randomizer|, forwarding every argument to
+// |mldsa_sign_internal|.
+bcm_status bssl::BCM_mldsa65_sign_internal(
+    uint8_t out_encoded_signature[MLDSA65_SIGNATURE_BYTES],
+    const MLDSA65_private_key *private_key, const uint8_t *msg, size_t msg_len,
+    const uint8_t *context_prefix, size_t context_prefix_len,
+    const uint8_t *context, size_t context_len,
+    const uint8_t randomizer[BCM_MLDSA_SIGNATURE_RANDOMIZER_BYTES]) {
+  return bcm_as_approved_status(mldsa_sign_internal(
+      out_encoded_signature, mldsa::private_key_from_external_65(private_key),
+      msg, msg_len, context_prefix, context_prefix_len, context, context_len,
+      randomizer));
+}
+
+// Signs the message representative |msg_rep| with a caller-supplied
+// |randomizer| via |mldsa_sign_mu|.
+bcm_status bssl::BCM_mldsa65_sign_mu_internal(
+    uint8_t out_encoded_signature[MLDSA65_SIGNATURE_BYTES],
+    const MLDSA65_private_key *private_key,
+    const uint8_t msg_rep[MLDSA_MU_BYTES],
+    const uint8_t randomizer[BCM_MLDSA_SIGNATURE_RANDOMIZER_BYTES]) {
+  return bcm_as_approved_status(mldsa_sign_mu(
+      out_encoded_signature, mldsa::private_key_from_external_65(private_key),
+      msg_rep, randomizer));
+}
+
+// ML-DSA signature in randomized mode, filling the random bytes with
+// |BCM_rand_bytes|. |context_len| must be at most 255; this is enforced with
+// |BSSL_CHECK|.
+bcm_status bssl::BCM_mldsa65_sign(
+    uint8_t out_encoded_signature[MLDSA65_SIGNATURE_BYTES],
+    const MLDSA65_private_key *private_key, const uint8_t *msg, size_t msg_len,
+    const uint8_t *context, size_t context_len) {
+  BSSL_CHECK(context_len <= 255);
+  uint8_t randomizer[BCM_MLDSA_SIGNATURE_RANDOMIZER_BYTES];
+  BCM_rand_bytes(randomizer, sizeof(randomizer));
+  CONSTTIME_SECRET(randomizer, sizeof(randomizer));
+
+  // Two-byte prefix: a zero byte followed by the context length, which the
+  // check above guarantees fits in a single byte.
+  const uint8_t context_prefix[2] = {0, static_cast<uint8_t>(context_len)};
+  return BCM_mldsa65_sign_internal(
+      out_encoded_signature, private_key, msg, msg_len, context_prefix,
+      sizeof(context_prefix), context, context_len, randomizer);
+}
+
+// ML-DSA pre-hashed API: initializing a pre-hashing context. |context_len|
+// must be at most 255; this is enforced with |BSSL_CHECK|.
+void bssl::BCM_mldsa65_prehash_init(MLDSA65_prehash *out_prehash_ctx,
+                                    const MLDSA65_public_key *public_key,
+                                    const uint8_t *context,
+                                    size_t context_len) {
+  BSSL_CHECK(context_len <= 255);
+
+  // Same two-byte prefix as in |BCM_mldsa65_sign|: zero, then the context
+  // length.
+  const uint8_t context_prefix[2] = {0, static_cast<uint8_t>(context_len)};
+  mldsa_prehash_init(mldsa::prehash_context_from_external_65(out_prehash_ctx),
+                     mldsa::public_key_from_external_65(public_key),
+                     context_prefix, sizeof(context_prefix), context,
+                     context_len);
+}
+
+// ML-DSA pre-hashed API: updating a pre-hashing context with a message chunk.
+void bssl::BCM_mldsa65_prehash_update(MLDSA65_prehash *inout_prehash_ctx,
+                                      const uint8_t *msg, size_t msg_len) {
+  mldsa_prehash_update(
+      mldsa::prehash_context_from_external_65(inout_prehash_ctx), msg, msg_len);
+}
+
+// ML-DSA pre-hashed API: obtaining a message representative to sign.
+void bssl::BCM_mldsa65_prehash_finalize(uint8_t out_msg_rep[MLDSA_MU_BYTES],
+                                        MLDSA65_prehash *inout_prehash_ctx) {
+  mldsa_prehash_finalize(
+      out_msg_rep, mldsa::prehash_context_from_external_65(inout_prehash_ctx));
+}
+
+// ML-DSA pre-hashed API: signing a message representative.
+bcm_status bssl::BCM_mldsa65_sign_message_representative(
+    uint8_t out_encoded_signature[MLDSA65_SIGNATURE_BYTES],
+    const MLDSA65_private_key *private_key,
+    const uint8_t msg_rep[MLDSA_MU_BYTES]) {
+  // A fresh randomizer is drawn from |BCM_rand_bytes| for every signature.
+  uint8_t randomizer[BCM_MLDSA_SIGNATURE_RANDOMIZER_BYTES];
+  BCM_rand_bytes(randomizer, sizeof(randomizer));
+  CONSTTIME_SECRET(randomizer, sizeof(randomizer));
+
+  return bcm_as_approved_status(mldsa_sign_mu(
+      out_encoded_signature, mldsa::private_key_from_external_65(private_key),
+      msg_rep, randomizer));
+}
+
+// ML-DSA pre-hashed API: verifying a message representative.
+bcm_status bssl::BCM_mldsa65_verify_message_representative(
+    const MLDSA65_public_key *public_key,
+    const uint8_t signature[MLDSA65_SIGNATURE_BYTES],
+    const uint8_t msg_rep[MLDSA_MU_BYTES]) {
+  return bcm_as_approved_status(mldsa::mldsa_verify_mu<6, 5>(
+      mldsa::public_key_from_external_65(public_key), signature, msg_rep));
+}
+
+// FIPS 204, Algorithm 3 (`ML-DSA.Verify`). |context_len| must be at most 255;
+// this is enforced with |BSSL_CHECK|.
+bcm_status bssl::BCM_mldsa65_verify(
+    const MLDSA65_public_key *public_key,
+    const uint8_t signature[MLDSA65_SIGNATURE_BYTES], const uint8_t *msg,
+    size_t msg_len, const uint8_t *context, size_t context_len) {
+  BSSL_CHECK(context_len <= 255);
+  // Two-byte prefix: a zero byte followed by the context length, which the
+  // check above guarantees fits in a single byte.
+  const uint8_t context_prefix[2] = {0, static_cast<uint8_t>(context_len)};
+  return BCM_mldsa65_verify_internal(public_key, signature, msg, msg_len,
+                                     context_prefix, sizeof(context_prefix),
+                                     context, context_len);
+}
+
+// Verifies |encoded_signature| with a caller-supplied context prefix,
+// forwarding every argument to |mldsa::mldsa_verify_internal<6, 5>|.
+bcm_status bssl::BCM_mldsa65_verify_internal(
+    const MLDSA65_public_key *public_key,
+    const uint8_t encoded_signature[MLDSA65_SIGNATURE_BYTES],
+    const uint8_t *msg, size_t msg_len, const uint8_t *context_prefix,
+    size_t context_prefix_len, const uint8_t *context, size_t context_len) {
+  return bcm_as_approved_status(mldsa::mldsa_verify_internal<6, 5>(
+      mldsa::public_key_from_external_65(public_key), encoded_signature, msg,
+      msg_len, context_prefix, context_prefix_len, context, context_len));
+}
+
+// Hands |out| and the internal view of |public_key| to
+// |mldsa_marshal_public_key| and reports the outcome through
+// |bcm_as_approved_status|.
+bcm_status bssl::BCM_mldsa65_marshal_public_key(
+    CBB *out, const MLDSA65_public_key *public_key) {
+  return bcm_as_approved_status(mldsa_marshal_public_key(
+      out, mldsa::public_key_from_external_65(public_key)));
+}
+
+// Returns one if |a| and |b| carry the same |public_key_hash| and zero
+// otherwise.
+int bssl::BCM_mldsa65_public_keys_equal(const MLDSA65_public_key *a,
+                                        const MLDSA65_public_key *b) {
+  auto *a_pub = mldsa::public_key_from_external_65(a);
+  auto *b_pub = mldsa::public_key_from_external_65(b);
+  // It is sufficient to compare |public_key_hash|. When importing a public key,
+  // the hash must be computed. When importing a private key in expanded form
+  // (an internal testing-only API), the hash is provided, but we recompute it
+  // and check for correctness.
+  return OPENSSL_memcmp(a_pub->public_key_hash, b_pub->public_key_hash,
+                        sizeof(a_pub->public_key_hash)) == 0;
+}
+
+
+// ML-DSA-87 specific wrappers.
+
+// Hands |in| and the internal view of |public_key| to
+// |mldsa_parse_public_key| and reports the outcome through
+// |bcm_as_approved_status|.
+bcm_status bssl::BCM_mldsa87_parse_public_key(MLDSA87_public_key *public_key,
+                                              CBS *in) {
+  return bcm_as_approved_status(mldsa_parse_public_key(
+      mldsa::public_key_from_external_87(public_key), in));
+}
+
+// Hands |out| and the internal view of |private_key| to
+// |mldsa_marshal_private_key| and reports the outcome through
+// |bcm_as_approved_status|.
+bcm_status bssl::BCM_mldsa87_marshal_private_key(
+    CBB *out, const MLDSA87_private_key *private_key) {
+  return bcm_as_approved_status(mldsa_marshal_private_key(
+      out, mldsa::private_key_from_external_87(private_key)));
+}
+
+// Parses |in| into |private_key| with |mldsa_parse_private_key|. The call only
+// counts as successful when nothing is left in |in| afterwards.
+bcm_status bssl::BCM_mldsa87_parse_private_key(MLDSA87_private_key *private_key,
+                                               CBS *in) {
+  return bcm_as_approved_status(
+      mldsa_parse_private_key(mldsa::private_key_from_external_87(private_key),
+                              in) &&
+      CBS_len(in) == 0);
+}
+
+// Runs |mldsa::fips::check_key| on |private_key| and reports the outcome
+// through |bcm_as_approved_status|.
+bcm_status bssl::BCM_mldsa87_check_key_fips(MLDSA87_private_key *private_key) {
+  return bcm_as_approved_status(
+      mldsa::fips::check_key(mldsa::private_key_from_external_87(private_key)));
+}
+
+// Calls |MLDSA_generate_key_external_entropy| with random bytes from
+// |BCM_rand_bytes|.
+bcm_status bssl::BCM_mldsa87_generate_key(
+    uint8_t out_encoded_public_key[MLDSA87_PUBLIC_KEY_BYTES],
+    uint8_t out_seed[MLDSA_SEED_BYTES], MLDSA87_private_key *out_private_key) {
+  // The freshly drawn seed is handed back through |out_seed| and is also the
+  // entropy the key pair is derived from.
+  BCM_rand_bytes(out_seed, MLDSA_SEED_BYTES);
+  CONSTTIME_SECRET(out_seed, MLDSA_SEED_BYTES);
+  return BCM_mldsa87_generate_key_external_entropy(out_encoded_public_key,
+                                                   out_private_key, out_seed);
+}
+
+// Derives |out_private_key| from |seed|. The encoded public key that key
+// generation also writes goes to a local scratch buffer and is discarded.
+bcm_status bssl::BCM_mldsa87_private_key_from_seed(
+    MLDSA87_private_key *out_private_key,
+    const uint8_t seed[MLDSA_SEED_BYTES]) {
+  uint8_t public_key[MLDSA87_PUBLIC_KEY_BYTES];
+  return BCM_mldsa87_generate_key_external_entropy(public_key, out_private_key,
+                                                   seed);
+}
+
+// Generates a key pair from caller-supplied |entropy| via
+// |mldsa_generate_key_external_entropy|. Note that the result is mapped with
+// |bcm_as_not_approved_status|, not |bcm_as_approved_status|.
+bcm_status bssl::BCM_mldsa87_generate_key_external_entropy(
+    uint8_t out_encoded_public_key[MLDSA87_PUBLIC_KEY_BYTES],
+    MLDSA87_private_key *out_private_key,
+    const uint8_t entropy[MLDSA_SEED_BYTES]) {
+  return bcm_as_not_approved_status(mldsa_generate_key_external_entropy(
+      out_encoded_public_key,
+      mldsa::private_key_from_external_87(out_private_key), entropy));
+}
+
+// Generates a key pair with |BCM_mldsa87_generate_key| and then runs
+// |BCM_mldsa87_check_key_fips| on the new private key. Returns
+// |bcm_status::failure| if |out_encoded_public_key| or |out_private_key| is
+// null or if generation fails; otherwise returns the status of the key check.
+bcm_status bssl::BCM_mldsa87_generate_key_fips(
+    uint8_t out_encoded_public_key[MLDSA87_PUBLIC_KEY_BYTES],
+    uint8_t out_seed[MLDSA_SEED_BYTES], MLDSA87_private_key *out_private_key) {
+  if (out_encoded_public_key == nullptr || out_private_key == nullptr) {
+    return bcm_status::failure;
+  }
+  if (BCM_mldsa87_generate_key(out_encoded_public_key, out_seed,
+                               out_private_key) == bcm_status::failure) {
+    return bcm_status::failure;
+  }
+  return BCM_mldsa87_check_key_fips(out_private_key);
+}
+
+// Same as |BCM_mldsa87_generate_key_fips|, but the key pair is derived from
+// caller-supplied |entropy| instead of fresh random bytes.
+bcm_status bssl::BCM_mldsa87_generate_key_external_entropy_fips(
+    uint8_t out_encoded_public_key[MLDSA87_PUBLIC_KEY_BYTES],
+    MLDSA87_private_key *out_private_key,
+    const uint8_t entropy[MLDSA_SEED_BYTES]) {
+  if (out_encoded_public_key == nullptr || out_private_key == nullptr) {
+    return bcm_status::failure;
+  }
+  if (BCM_mldsa87_generate_key_external_entropy(out_encoded_public_key,
+                                                out_private_key, entropy) ==
+      bcm_status::failure) {
+    return bcm_status::failure;
+  }
+  return BCM_mldsa87_check_key_fips(out_private_key);
+}
+
+// Derives |out_private_key| from |seed| and then runs
+// |BCM_mldsa87_check_key_fips| on it. The encoded public key is written to a
+// local scratch buffer and discarded.
+bcm_status bssl::BCM_mldsa87_private_key_from_seed_fips(
+    MLDSA87_private_key *out_private_key,
+    const uint8_t seed[MLDSA_SEED_BYTES]) {
+  uint8_t public_key[MLDSA87_PUBLIC_KEY_BYTES];
+  if (BCM_mldsa87_generate_key_external_entropy(public_key, out_private_key,
+                                                seed) == bcm_status::failure) {
+    return bcm_status::failure;
+  }
+  return BCM_mldsa87_check_key_fips(out_private_key);
+}
+
+// Copies the public key embedded in |private_key| (its |pub| member) into
+// |out_public_key|. Always returns |bcm_status::approved|.
+bcm_status bssl::BCM_mldsa87_public_from_private(
+    MLDSA87_public_key *out_public_key,
+    const MLDSA87_private_key *private_key) {
+  const auto *priv = mldsa::private_key_from_external_87(private_key);
+  auto *out_pub = mldsa::public_key_from_external_87(out_public_key);
+  *out_pub = priv->pub;
+  return bcm_status::approved;
+}
+
+// Returns a pointer to the public key stored inside |private_key| without
+// copying it; the returned pointer aliases |private_key|.
+const MLDSA87_public_key *bssl::BCM_mldsa87_public_of_private(
+    const MLDSA87_private_key *private_key) {
+  // The internal |pub| member is viewed as the external public-key type.
+  return reinterpret_cast<const MLDSA87_public_key *>(
+      &mldsa::private_key_from_external_87(private_key)->pub);
+}
+
+// Signs with a caller-supplied |randomizer|, forwarding every argument to
+// |mldsa_sign_internal|.
+bcm_status bssl::BCM_mldsa87_sign_internal(
+    uint8_t out_encoded_signature[MLDSA87_SIGNATURE_BYTES],
+    const MLDSA87_private_key *private_key, const uint8_t *msg, size_t msg_len,
+    const uint8_t *context_prefix, size_t context_prefix_len,
+    const uint8_t *context, size_t context_len,
+    const uint8_t randomizer[BCM_MLDSA_SIGNATURE_RANDOMIZER_BYTES]) {
+  return bcm_as_approved_status(mldsa_sign_internal(
+      out_encoded_signature, mldsa::private_key_from_external_87(private_key),
+      msg, msg_len, context_prefix, context_prefix_len, context, context_len,
+      randomizer));
+}
+
+// Signs the message representative |msg_rep| with a caller-supplied
+// |randomizer| via |mldsa_sign_mu|.
+bcm_status bssl::BCM_mldsa87_sign_mu_internal(
+    uint8_t out_encoded_signature[MLDSA87_SIGNATURE_BYTES],
+    const MLDSA87_private_key *private_key,
+    const uint8_t msg_rep[MLDSA_MU_BYTES],
+    const uint8_t randomizer[BCM_MLDSA_SIGNATURE_RANDOMIZER_BYTES]) {
+  return bcm_as_approved_status(mldsa_sign_mu(
+      out_encoded_signature, mldsa::private_key_from_external_87(private_key),
+      msg_rep, randomizer));
+}
+
+// ML-DSA signature in randomized mode, filling the random bytes with
+// |BCM_rand_bytes|. |context_len| must be at most 255; this is enforced with
+// |BSSL_CHECK|.
+bcm_status bssl::BCM_mldsa87_sign(
+    uint8_t out_encoded_signature[MLDSA87_SIGNATURE_BYTES],
+    const MLDSA87_private_key *private_key, const uint8_t *msg, size_t msg_len,
+    const uint8_t *context, size_t context_len) {
+  BSSL_CHECK(context_len <= 255);
+  uint8_t randomizer[BCM_MLDSA_SIGNATURE_RANDOMIZER_BYTES];
+  BCM_rand_bytes(randomizer, sizeof(randomizer));
+  CONSTTIME_SECRET(randomizer, sizeof(randomizer));
+
+  // Two-byte prefix: a zero byte followed by the context length, which the
+  // check above guarantees fits in a single byte.
+  const uint8_t context_prefix[2] = {0, static_cast<uint8_t>(context_len)};
+  return BCM_mldsa87_sign_internal(
+      out_encoded_signature, private_key, msg, msg_len, context_prefix,
+      sizeof(context_prefix), context, context_len, randomizer);
+}
+
+// ML-DSA pre-hashed API: initializing a pre-hashing context. |context_len|
+// must be at most 255; this is enforced with |BSSL_CHECK|.
+void bssl::BCM_mldsa87_prehash_init(MLDSA87_prehash *out_prehash_ctx,
+                                    const MLDSA87_public_key *public_key,
+                                    const uint8_t *context,
+                                    size_t context_len) {
+  BSSL_CHECK(context_len <= 255);
+
+  // Same two-byte prefix as in |BCM_mldsa87_sign|: zero, then the context
+  // length.
+  const uint8_t context_prefix[2] = {0, static_cast<uint8_t>(context_len)};
+  mldsa_prehash_init(mldsa::prehash_context_from_external_87(out_prehash_ctx),
+                     mldsa::public_key_from_external_87(public_key),
+                     context_prefix, sizeof(context_prefix), context,
+                     context_len);
+}
+
+// ML-DSA pre-hashed API: updating a pre-hashing context with a message chunk.
+void bssl::BCM_mldsa87_prehash_update(MLDSA87_prehash *inout_prehash_ctx,
+                                      const uint8_t *msg, size_t msg_len) {
+  mldsa_prehash_update(
+      mldsa::prehash_context_from_external_87(inout_prehash_ctx), msg, msg_len);
+}
+
+// ML-DSA pre-hashed API: obtaining a message representative to sign.
+void bssl::BCM_mldsa87_prehash_finalize(uint8_t out_msg_rep[MLDSA_MU_BYTES],
+                                        MLDSA87_prehash *inout_prehash_ctx) {
+  mldsa_prehash_finalize(
+      out_msg_rep, mldsa::prehash_context_from_external_87(inout_prehash_ctx));
+}
+
+// ML-DSA pre-hashed API: signing a message representative.
+bcm_status bssl::BCM_mldsa87_sign_message_representative(
+    uint8_t out_encoded_signature[MLDSA87_SIGNATURE_BYTES],
+    const MLDSA87_private_key *private_key,
+    const uint8_t msg_rep[MLDSA_MU_BYTES]) {
+  // A fresh randomizer is drawn from |BCM_rand_bytes| for every signature.
+  uint8_t randomizer[BCM_MLDSA_SIGNATURE_RANDOMIZER_BYTES];
+  BCM_rand_bytes(randomizer, sizeof(randomizer));
+  CONSTTIME_SECRET(randomizer, sizeof(randomizer));
+
+  return bcm_as_approved_status(mldsa_sign_mu(
+      out_encoded_signature, mldsa::private_key_from_external_87(private_key),
+      msg_rep, randomizer));
+}
+
+// ML-DSA pre-hashed API: verifying a message representative.
+bcm_status bssl::BCM_mldsa87_verify_message_representative(
+    const MLDSA87_public_key *public_key,
+    const uint8_t signature[MLDSA87_SIGNATURE_BYTES],
+    const uint8_t msg_rep[MLDSA_MU_BYTES]) {
+  return bcm_as_approved_status(mldsa::mldsa_verify_mu<8, 7>(
+      mldsa::public_key_from_external_87(public_key), signature, msg_rep));
+}
+
+// FIPS 204, Algorithm 3 (`ML-DSA.Verify`). |context_len| must be at most 255;
+// this is enforced with |BSSL_CHECK|.
+bcm_status bssl::BCM_mldsa87_verify(const MLDSA87_public_key *public_key,
+                                    const uint8_t *signature,
+                                    const uint8_t *msg, size_t msg_len,
+                                    const uint8_t *context,
+                                    size_t context_len) {
+  BSSL_CHECK(context_len <= 255);
+  // Two-byte prefix: a zero byte followed by the context length, which the
+  // check above guarantees fits in a single byte.
+  const uint8_t context_prefix[2] = {0, static_cast<uint8_t>(context_len)};
+  return BCM_mldsa87_verify_internal(public_key, signature, msg, msg_len,
+                                     context_prefix, sizeof(context_prefix),
+                                     context, context_len);
+}
+
+// Verifies |encoded_signature| with a caller-supplied context prefix,
+// forwarding every argument to |mldsa::mldsa_verify_internal<8, 7>|.
+bcm_status bssl::BCM_mldsa87_verify_internal(
+    const MLDSA87_public_key *public_key,
+    const uint8_t encoded_signature[MLDSA87_SIGNATURE_BYTES],
+    const uint8_t *msg, size_t msg_len, const uint8_t *context_prefix,
+    size_t context_prefix_len, const uint8_t *context, size_t context_len) {
+  return bcm_as_approved_status(mldsa::mldsa_verify_internal<8, 7>(
+      mldsa::public_key_from_external_87(public_key), encoded_signature, msg,
+      msg_len, context_prefix, context_prefix_len, context, context_len));
+}
+
+// Hands |out| and the internal view of |public_key| to
+// |mldsa_marshal_public_key| and reports the outcome through
+// |bcm_as_approved_status|.
+bcm_status bssl::BCM_mldsa87_marshal_public_key(
+    CBB *out, const MLDSA87_public_key *public_key) {
+  return bcm_as_approved_status(mldsa_marshal_public_key(
+      out, mldsa::public_key_from_external_87(public_key)));
+}
+
+// Returns one if |a| and |b| carry the same |public_key_hash| and zero
+// otherwise.
+int bssl::BCM_mldsa87_public_keys_equal(const MLDSA87_public_key *a,
+                                        const MLDSA87_public_key *b) {
+  auto *a_pub = mldsa::public_key_from_external_87(a);
+  auto *b_pub = mldsa::public_key_from_external_87(b);
+  // It is sufficient to compare |public_key_hash|. When importing a public key,
+  // the hash must be computed. When importing a private key in expanded form
+  // (an internal testing-only API), the hash is provided, but we recompute it
+  // and check for correctness.
+  return OPENSSL_memcmp(a_pub->public_key_hash, b_pub->public_key_hash,
+                        sizeof(a_pub->public_key_hash)) == 0;
+}
+
+
+// ML-DSA-44 specific wrappers.
+
+// Hands |in| and the internal view of |public_key| to
+// |mldsa_parse_public_key| and reports the outcome through
+// |bcm_as_approved_status|.
+bcm_status bssl::BCM_mldsa44_parse_public_key(MLDSA44_public_key *public_key,
+                                              CBS *in) {
+  return bcm_as_approved_status(mldsa_parse_public_key(
+      mldsa::public_key_from_external_44(public_key), in));
+}
+
+// Hands |out| and the internal view of |private_key| to
+// |mldsa_marshal_private_key| and reports the outcome through
+// |bcm_as_approved_status|.
+bcm_status bssl::BCM_mldsa44_marshal_private_key(
+    CBB *out, const MLDSA44_private_key *private_key) {
+  return bcm_as_approved_status(mldsa_marshal_private_key(
+      out, mldsa::private_key_from_external_44(private_key)));
+}
+
+// Parses |in| into |private_key| with |mldsa_parse_private_key|. The call only
+// counts as successful when nothing is left in |in| afterwards.
+bcm_status bssl::BCM_mldsa44_parse_private_key(MLDSA44_private_key *private_key,
+                                               CBS *in) {
+  return bcm_as_approved_status(
+      mldsa_parse_private_key(mldsa::private_key_from_external_44(private_key),
+                              in) &&
+      CBS_len(in) == 0);
+}
+
+// Runs |mldsa::fips::check_key| on |private_key| and reports the outcome
+// through |bcm_as_approved_status|.
+bcm_status bssl::BCM_mldsa44_check_key_fips(MLDSA44_private_key *private_key) {
+  return bcm_as_approved_status(
+      mldsa::fips::check_key(mldsa::private_key_from_external_44(private_key)));
+}
+
+// Calls |MLDSA_generate_key_external_entropy| with random bytes from
+// |BCM_rand_bytes|.
+bcm_status bssl::BCM_mldsa44_generate_key(
+    uint8_t out_encoded_public_key[MLDSA44_PUBLIC_KEY_BYTES],
+    uint8_t out_seed[MLDSA_SEED_BYTES], MLDSA44_private_key *out_private_key) {
+  // The freshly drawn seed is handed back through |out_seed| and is also the
+  // entropy the key pair is derived from.
+  BCM_rand_bytes(out_seed, MLDSA_SEED_BYTES);
+  CONSTTIME_SECRET(out_seed, MLDSA_SEED_BYTES);
+  return BCM_mldsa44_generate_key_external_entropy(out_encoded_public_key,
+                                                   out_private_key, out_seed);
+}
+
+// Derives |out_private_key| from |seed|. The encoded public key that key
+// generation also writes goes to a local scratch buffer and is discarded.
+bcm_status bssl::BCM_mldsa44_private_key_from_seed(
+    MLDSA44_private_key *out_private_key,
+    const uint8_t seed[MLDSA_SEED_BYTES]) {
+  uint8_t public_key[MLDSA44_PUBLIC_KEY_BYTES];
+  return BCM_mldsa44_generate_key_external_entropy(public_key, out_private_key,
+                                                   seed);
+}
+
+// Generates a key pair from caller-supplied |entropy| via
+// |mldsa_generate_key_external_entropy|. Note that the result is mapped with
+// |bcm_as_not_approved_status|, not |bcm_as_approved_status|.
+bcm_status bssl::BCM_mldsa44_generate_key_external_entropy(
+    uint8_t out_encoded_public_key[MLDSA44_PUBLIC_KEY_BYTES],
+    MLDSA44_private_key *out_private_key,
+    const uint8_t entropy[MLDSA_SEED_BYTES]) {
+  return bcm_as_not_approved_status(mldsa_generate_key_external_entropy(
+      out_encoded_public_key,
+      mldsa::private_key_from_external_44(out_private_key), entropy));
+}
+
+// Generates a key pair with |BCM_mldsa44_generate_key| and then runs
+// |BCM_mldsa44_check_key_fips| on the new private key. Returns
+// |bcm_status::failure| if |out_encoded_public_key| or |out_private_key| is
+// null or if generation fails; otherwise returns the status of the key check.
+bcm_status bssl::BCM_mldsa44_generate_key_fips(
+    uint8_t out_encoded_public_key[MLDSA44_PUBLIC_KEY_BYTES],
+    uint8_t out_seed[MLDSA_SEED_BYTES], MLDSA44_private_key *out_private_key) {
+  if (out_encoded_public_key == nullptr || out_private_key == nullptr) {
+    return bcm_status::failure;
+  }
+  if (BCM_mldsa44_generate_key(out_encoded_public_key, out_seed,
+                               out_private_key) == bcm_status::failure) {
+    return bcm_status::failure;
+  }
+  return BCM_mldsa44_check_key_fips(out_private_key);
+}
+
+// Same as |BCM_mldsa44_generate_key_fips|, but the key pair is derived from
+// caller-supplied |entropy| instead of fresh random bytes.
+bcm_status bssl::BCM_mldsa44_generate_key_external_entropy_fips(
+    uint8_t out_encoded_public_key[MLDSA44_PUBLIC_KEY_BYTES],
+    MLDSA44_private_key *out_private_key,
+    const uint8_t entropy[MLDSA_SEED_BYTES]) {
+  if (out_encoded_public_key == nullptr || out_private_key == nullptr) {
+    return bcm_status::failure;
+  }
+  if (BCM_mldsa44_generate_key_external_entropy(out_encoded_public_key,
+                                                out_private_key, entropy) ==
+      bcm_status::failure) {
+    return bcm_status::failure;
+  }
+  return BCM_mldsa44_check_key_fips(out_private_key);
+}
+
+// Derives |out_private_key| from |seed| and then runs
+// |BCM_mldsa44_check_key_fips| on it. The encoded public key is written to a
+// local scratch buffer and discarded.
+bcm_status bssl::BCM_mldsa44_private_key_from_seed_fips(
+    MLDSA44_private_key *out_private_key,
+    const uint8_t seed[MLDSA_SEED_BYTES]) {
+  uint8_t public_key[MLDSA44_PUBLIC_KEY_BYTES];
+  if (BCM_mldsa44_generate_key_external_entropy(public_key, out_private_key,
+                                                seed) == bcm_status::failure) {
+    return bcm_status::failure;
+  }
+  return BCM_mldsa44_check_key_fips(out_private_key);
+}
+
+// Copies the public key embedded in |private_key| (its |pub| member) into
+// |out_public_key|. Always returns |bcm_status::approved|.
+bcm_status bssl::BCM_mldsa44_public_from_private(
+    MLDSA44_public_key *out_public_key,
+    const MLDSA44_private_key *private_key) {
+  const auto *priv = mldsa::private_key_from_external_44(private_key);
+  auto *out_pub = mldsa::public_key_from_external_44(out_public_key);
+  *out_pub = priv->pub;
+  return bcm_status::approved;
+}
+
+// Returns a pointer to the public key stored inside |private_key| without
+// copying it; the returned pointer aliases |private_key|.
+const MLDSA44_public_key *bssl::BCM_mldsa44_public_of_private(
+    const MLDSA44_private_key *private_key) {
+  // The internal |pub| member is viewed as the external public-key type.
+  return reinterpret_cast<const MLDSA44_public_key *>(
+      &mldsa::private_key_from_external_44(private_key)->pub);
+}
+
+// Signs with a caller-supplied |randomizer|, forwarding every argument to
+// |mldsa_sign_internal|.
+bcm_status bssl::BCM_mldsa44_sign_internal(
+    uint8_t out_encoded_signature[MLDSA44_SIGNATURE_BYTES],
+    const MLDSA44_private_key *private_key, const uint8_t *msg, size_t msg_len,
+    const uint8_t *context_prefix, size_t context_prefix_len,
+    const uint8_t *context, size_t context_len,
+    const uint8_t randomizer[BCM_MLDSA_SIGNATURE_RANDOMIZER_BYTES]) {
+  return bcm_as_approved_status(mldsa_sign_internal(
+      out_encoded_signature, mldsa::private_key_from_external_44(private_key),
+      msg, msg_len, context_prefix, context_prefix_len, context, context_len,
+      randomizer));
+}
+
+// Signs the message representative |msg_rep| with a caller-supplied
+// |randomizer| via |mldsa_sign_mu|.
+bcm_status bssl::BCM_mldsa44_sign_mu_internal(
+    uint8_t out_encoded_signature[MLDSA44_SIGNATURE_BYTES],
+    const MLDSA44_private_key *private_key,
+    const uint8_t msg_rep[MLDSA_MU_BYTES],
+    const uint8_t randomizer[BCM_MLDSA_SIGNATURE_RANDOMIZER_BYTES]) {
+  return bcm_as_approved_status(mldsa_sign_mu(
+      out_encoded_signature, mldsa::private_key_from_external_44(private_key),
+      msg_rep, randomizer));
+}
+
+// ML-DSA signature in randomized mode, filling the random bytes with
+// |BCM_rand_bytes|. |context_len| must be at most 255; this is enforced with
+// |BSSL_CHECK|.
+bcm_status bssl::BCM_mldsa44_sign(
+    uint8_t out_encoded_signature[MLDSA44_SIGNATURE_BYTES],
+    const MLDSA44_private_key *private_key, const uint8_t *msg, size_t msg_len,
+    const uint8_t *context, size_t context_len) {
+  BSSL_CHECK(context_len <= 255);
+  uint8_t randomizer[BCM_MLDSA_SIGNATURE_RANDOMIZER_BYTES];
+  BCM_rand_bytes(randomizer, sizeof(randomizer));
+  CONSTTIME_SECRET(randomizer, sizeof(randomizer));
+
+  // Two-byte prefix: a zero byte followed by the context length, which the
+  // check above guarantees fits in a single byte.
+  const uint8_t context_prefix[2] = {0, static_cast<uint8_t>(context_len)};
+  return BCM_mldsa44_sign_internal(
+      out_encoded_signature, private_key, msg, msg_len, context_prefix,
+      sizeof(context_prefix), context, context_len, randomizer);
+}
+
+// ML-DSA pre-hashed API: initializing a pre-hashing context. |context_len|
+// must be at most 255; this is enforced with |BSSL_CHECK|.
+void bssl::BCM_mldsa44_prehash_init(MLDSA44_prehash *out_prehash_ctx,
+                                    const MLDSA44_public_key *public_key,
+                                    const uint8_t *context,
+                                    size_t context_len) {
+  BSSL_CHECK(context_len <= 255);
+
+  // Same two-byte prefix as in |BCM_mldsa44_sign|: zero, then the context
+  // length.
+  const uint8_t context_prefix[2] = {0, static_cast<uint8_t>(context_len)};
+  mldsa_prehash_init(mldsa::prehash_context_from_external_44(out_prehash_ctx),
+                     mldsa::public_key_from_external_44(public_key),
+                     context_prefix, sizeof(context_prefix), context,
+                     context_len);
+}
+
+// ML-DSA pre-hashed API: updating a pre-hashing context with a message chunk.
+void bssl::BCM_mldsa44_prehash_update(MLDSA44_prehash *inout_prehash_ctx,
+                                      const uint8_t *msg, size_t msg_len) {
+  mldsa_prehash_update(
+      mldsa::prehash_context_from_external_44(inout_prehash_ctx), msg, msg_len);
+}
+
+// ML-DSA pre-hashed API: obtaining a message representative to sign.
+void bssl::BCM_mldsa44_prehash_finalize(uint8_t out_msg_rep[MLDSA_MU_BYTES],
+                                        MLDSA44_prehash *inout_prehash_ctx) {
+  mldsa_prehash_finalize(
+      out_msg_rep, mldsa::prehash_context_from_external_44(inout_prehash_ctx));
+}
+
+// ML-DSA pre-hashed API: signing a message representative.
+bcm_status bssl::BCM_mldsa44_sign_message_representative( + uint8_t out_encoded_signature[MLDSA44_SIGNATURE_BYTES], + const MLDSA44_private_key *private_key, + const uint8_t msg_rep[MLDSA_MU_BYTES]) { + uint8_t randomizer[BCM_MLDSA_SIGNATURE_RANDOMIZER_BYTES]; + BCM_rand_bytes(randomizer, sizeof(randomizer)); + CONSTTIME_SECRET(randomizer, sizeof(randomizer)); + + return bcm_as_approved_status(mldsa_sign_mu( + out_encoded_signature, mldsa::private_key_from_external_44(private_key), + msg_rep, randomizer)); +} + +// ML-DSA pre-hashed API: verifying a message representative. +bcm_status bssl::BCM_mldsa44_verify_message_representative( + const MLDSA44_public_key *public_key, + const uint8_t signature[MLDSA44_SIGNATURE_BYTES], + const uint8_t msg_rep[MLDSA_MU_BYTES]) { + return bcm_as_approved_status(mldsa::mldsa_verify_mu<4, 4>( + mldsa::public_key_from_external_44(public_key), signature, msg_rep)); +} + +// FIPS 204, Algorithm 3 (`ML-DSA.Verify`). +bcm_status bssl::BCM_mldsa44_verify(const MLDSA44_public_key *public_key, + const uint8_t *signature, + const uint8_t *msg, size_t msg_len, + const uint8_t *context, + size_t context_len) { + BSSL_CHECK(context_len <= 255); + const uint8_t context_prefix[2] = {0, static_cast(context_len)}; + return BCM_mldsa44_verify_internal(public_key, signature, msg, msg_len, + context_prefix, sizeof(context_prefix), + context, context_len); +} + +bcm_status bssl::BCM_mldsa44_verify_internal( + const MLDSA44_public_key *public_key, + const uint8_t encoded_signature[MLDSA44_SIGNATURE_BYTES], + const uint8_t *msg, size_t msg_len, const uint8_t *context_prefix, + size_t context_prefix_len, const uint8_t *context, size_t context_len) { + return bcm_as_approved_status(mldsa::mldsa_verify_internal<4, 4>( + mldsa::public_key_from_external_44(public_key), encoded_signature, msg, + msg_len, context_prefix, context_prefix_len, context, context_len)); +} + +bcm_status bssl::BCM_mldsa44_marshal_public_key( + CBB *out, const MLDSA44_public_key 
*public_key) { + return bcm_as_approved_status(mldsa_marshal_public_key( + out, mldsa::public_key_from_external_44(public_key))); +} + +int bssl::BCM_mldsa44_public_keys_equal(const MLDSA44_public_key *a, + const MLDSA44_public_key *b) { + auto *a_pub = mldsa::public_key_from_external_44(a); + auto *b_pub = mldsa::public_key_from_external_44(b); + // It is sufficient to compare |public_key_hash|. When importing a public key, + // the hash must be computed. When importing a private key in expanded form + // (an internal testing-only API), the hash is provided, but we recompute it + // and check for correctness. + return OPENSSL_memcmp(a_pub->public_key_hash, b_pub->public_key_hash, + sizeof(a_pub->public_key_hash)) == 0; +} + +int bssl::boringssl_self_test_mldsa() { + return mldsa::fips::keygen_self_test() && mldsa::fips::sign_self_test() && + mldsa::fips::verify_self_test(); +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/mlkem/fips_known_values.inc b/third_party/boringssl/src/crypto/fipsmodule/mlkem/fips_known_values.inc new file mode 100644 index 00000000..ac5aca6b --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/mlkem/fips_known_values.inc @@ -0,0 +1,411 @@ +const uint8_t kTestEntropy[MLKEM_SEED_BYTES] = { + 0xc8, 0x77, 0x34, 0x04, 0xb3, 0xe5, 0x3e, 0x31, 0x7b, 0xab, 0x41, + 0x08, 0xa8, 0x88, 0x9c, 0x90, 0xbe, 0xfb, 0x38, 0x0f, 0x63, 0x89, + 0x70, 0xbc, 0xdc, 0x33, 0xeb, 0x2d, 0xe2, 0x99, 0x6b, 0x1a}; + +const uint8_t kExpectedPublicKeyBytes[MLKEM768_PUBLIC_KEY_BYTES] = { + 0xcb, 0x0b, 0x33, 0xa2, 0xac, 0xc4, 0x94, 0x56, 0x1a, 0x36, 0x1c, 0x0d, + 0xa3, 0x57, 0x5b, 0x63, 0x6c, 0x0d, 0x1f, 0xe1, 0xa8, 0x36, 0xd5, 0x5d, + 0x38, 0x5c, 0xac, 0xf4, 0x67, 0x39, 0x9a, 0x12, 0x5b, 0xef, 0xa1, 0x0a, + 0x57, 0x76, 0x87, 0x83, 0xaa, 0xaf, 0x1d, 0x9c, 0xce, 0x2b, 0x78, 0x9b, + 0x40, 0x82, 0x1e, 0x34, 0xb3, 0x55, 0x2d, 0xd0, 0x20, 0xcd, 0x5b, 0x46, + 0x8d, 0xda, 0xb8, 0x83, 0xa8, 0xab, 0xfb, 0x36, 0x45, 0xcb, 0x40, 0xae, + 0x69, 0x77, 0x03, 0x22, 
0xa8, 0xb4, 0xaa, 0x17, 0x53, 0x90, 0x05, 0x6a, + 0x69, 0xb9, 0xc5, 0x2d, 0xbc, 0x9a, 0x81, 0xf6, 0x2a, 0x3c, 0x57, 0x96, + 0x8c, 0xb2, 0x72, 0xa7, 0xd4, 0x84, 0x77, 0xf9, 0x0b, 0x80, 0xe7, 0x1e, + 0x75, 0x61, 0x0e, 0xf9, 0xb0, 0xcf, 0x4c, 0x03, 0xca, 0x30, 0x45, 0x50, + 0x25, 0x82, 0x56, 0x02, 0xbb, 0x58, 0xab, 0x99, 0xc7, 0x8d, 0xd5, 0x37, + 0x12, 0xc8, 0x36, 0x5b, 0x00, 0x5f, 0x91, 0xf6, 0x2d, 0x9b, 0x91, 0xc2, + 0x3e, 0x75, 0x56, 0x83, 0xd2, 0xaa, 0xd4, 0x03, 0xcd, 0xe8, 0xe8, 0x3f, + 0x09, 0xc1, 0x8e, 0x84, 0xb1, 0x75, 0x85, 0x06, 0xa7, 0x4a, 0x40, 0xc2, + 0xc9, 0x90, 0xb0, 0x0f, 0x59, 0x84, 0xe5, 0xf7, 0xa8, 0xd8, 0x35, 0x33, + 0xa5, 0x82, 0x4e, 0xf6, 0xb5, 0xaf, 0xd5, 0xaa, 0xbb, 0x52, 0x30, 0x0b, + 0xb1, 0x67, 0x7b, 0xbb, 0xdc, 0x38, 0x58, 0xf4, 0x2f, 0xe4, 0xeb, 0x47, + 0xf0, 0x49, 0x76, 0x45, 0x10, 0x02, 0x19, 0xa7, 0xc6, 0x1f, 0x8a, 0x71, + 0xda, 0x51, 0x61, 0x79, 0xc4, 0x10, 0xeb, 0x3a, 0x11, 0xd9, 0x93, 0x1d, + 0x83, 0x81, 0x48, 0x3d, 0x68, 0x35, 0xff, 0xe3, 0xa6, 0xa8, 0x40, 0x74, + 0xdb, 0x7c, 0xb4, 0xb2, 0x05, 0x89, 0xc8, 0x09, 0x42, 0x27, 0x28, 0x55, + 0x01, 0x57, 0xc9, 0xf9, 0xfa, 0x36, 0xd3, 0x5c, 0x51, 0x40, 0xca, 0x1e, + 0x34, 0xc8, 0xb9, 0x27, 0xf1, 0x15, 0xc9, 0xa4, 0x18, 0x9c, 0xa8, 0x4c, + 0x55, 0xf8, 0x4d, 0x90, 0xbc, 0x88, 0xd1, 0x46, 0x01, 0x80, 0x96, 0x54, + 0x8d, 0x01, 0x6e, 0xe4, 0x26, 0x0c, 0x7a, 0x08, 0x3d, 0x7c, 0x26, 0x66, + 0xb1, 0xd1, 0x15, 0xc7, 0xc0, 0x0b, 0x06, 0x92, 0x9c, 0x26, 0xa6, 0x6c, + 0x3a, 0x46, 0x47, 0x15, 0x37, 0xb0, 0x0a, 0xc1, 0x5d, 0xe0, 0x51, 0xae, + 0x17, 0x87, 0x2f, 0x75, 0x2c, 0x64, 0xd5, 0x39, 0x09, 0xe6, 0xd0, 0x7c, + 0x3d, 0x79, 0x47, 0x78, 0x0a, 0x82, 0xc9, 0xba, 0x37, 0x07, 0x93, 0x94, + 0xa4, 0x45, 0x7c, 0x07, 0x69, 0x24, 0x1d, 0xc9, 0x75, 0x41, 0x6b, 0xc1, + 0x42, 0x28, 0x06, 0xe3, 0xfc, 0x7b, 0xd2, 0xf8, 0xbf, 0xe7, 0x5a, 0xc2, + 0xc0, 0x81, 0x63, 0xf3, 0x84, 0x32, 0x31, 0x4a, 0x8d, 0x6b, 0xe3, 0x3e, + 0x22, 0x67, 0x9a, 0x3a, 0x96, 0x09, 0xa6, 0xf9, 0x53, 0x40, 0x41, 0x2a, + 0x04, 0xa7, 0xc5, 0x7b, 
0x86, 0xcd, 0xed, 0xec, 0xae, 0xfd, 0xc7, 0x11, + 0x48, 0x56, 0x77, 0x0b, 0xb7, 0xbb, 0xaf, 0xfb, 0x6c, 0x9e, 0xd6, 0x5c, + 0xbc, 0xf7, 0x9a, 0x75, 0xb5, 0xb2, 0x20, 0xd9, 0x60, 0x90, 0x65, 0x02, + 0xf9, 0x16, 0x34, 0xdf, 0x0b, 0x16, 0x2a, 0x97, 0x00, 0x5d, 0xc1, 0x65, + 0xc3, 0xb9, 0x1f, 0x2e, 0x85, 0x97, 0xbd, 0xca, 0xaa, 0x1d, 0xf2, 0x4f, + 0x74, 0xe5, 0x56, 0x94, 0xdc, 0x53, 0xbb, 0xc0, 0x75, 0x1c, 0x2b, 0x59, + 0x6b, 0x50, 0x2c, 0x25, 0x4c, 0x78, 0xd7, 0x68, 0x40, 0x0b, 0x91, 0x7a, + 0x93, 0x02, 0x83, 0x5e, 0x6c, 0x03, 0xc7, 0xc4, 0x24, 0x6f, 0x8b, 0xa4, + 0x70, 0x37, 0x1f, 0xe5, 0x33, 0x59, 0xf6, 0xbb, 0x5f, 0xb8, 0x58, 0x50, + 0x62, 0x16, 0x95, 0x2a, 0x88, 0xa3, 0xc6, 0x24, 0x94, 0x4b, 0xb0, 0x03, + 0x8e, 0xe6, 0x50, 0xd1, 0x63, 0xa2, 0xab, 0x00, 0x16, 0xf4, 0x0a, 0x7d, + 0xe6, 0xe0, 0xb9, 0x0c, 0xd8, 0x08, 0xae, 0xf6, 0x80, 0xb7, 0x48, 0xb8, + 0x10, 0xd1, 0x77, 0x57, 0x48, 0x2a, 0xc1, 0xf1, 0x4e, 0x61, 0xb4, 0x06, + 0x5a, 0xa1, 0xba, 0x32, 0x4a, 0x9a, 0xab, 0x8c, 0xa8, 0xe3, 0xa0, 0x68, + 0x39, 0xa3, 0x25, 0x55, 0x48, 0xcd, 0x12, 0x98, 0x14, 0x1b, 0x39, 0x5b, + 0xf4, 0x88, 0xbe, 0x5f, 0xac, 0xba, 0x18, 0x20, 0x74, 0x2c, 0xa1, 0x0b, + 0xfc, 0xf1, 0x00, 0x85, 0x24, 0x27, 0x3d, 0x80, 0xb6, 0x30, 0x42, 0x7b, + 0x74, 0xb0, 0xbf, 0x87, 0x51, 0x9b, 0xa8, 0x1c, 0x79, 0xbd, 0x50, 0x85, + 0x66, 0xe1, 0x93, 0xdd, 0xc5, 0x5d, 0x5d, 0xd0, 0xbe, 0xcb, 0x90, 0x36, + 0xf8, 0xc6, 0xc2, 0xea, 0xea, 0x40, 0xce, 0xa7, 0xb0, 0xae, 0xc5, 0x44, + 0x3a, 0x27, 0xc5, 0x4a, 0xf0, 0x21, 0x4f, 0xf6, 0x63, 0xb1, 0x8c, 0x20, + 0x76, 0x65, 0xb5, 0x79, 0x55, 0x10, 0xa5, 0xa7, 0x4b, 0x57, 0x96, 0xb3, + 0x03, 0xe6, 0xa8, 0x93, 0xf5, 0xc5, 0xca, 0xe0, 0x1a, 0x23, 0x30, 0x6d, + 0x09, 0x07, 0x81, 0x5e, 0x05, 0xc6, 0x1b, 0x7a, 0x0e, 0xde, 0xf3, 0x00, + 0x91, 0xf3, 0xbb, 0x4b, 0xa6, 0x8e, 0x96, 0xc3, 0x3a, 0xc8, 0x72, 0x19, + 0x23, 0x23, 0xc7, 0x94, 0x46, 0x75, 0xa2, 0xc5, 0x9e, 0xa3, 0x15, 0x86, + 0xb4, 0xa7, 0x95, 0xda, 0x16, 0xae, 0x42, 0xe8, 0xcc, 0xf6, 0x41, 0xbc, + 0x9c, 0xe8, 0x07, 0xc9, 
0x05, 0x3c, 0x74, 0xb2, 0xad, 0x77, 0xc4, 0x08, + 0x56, 0x78, 0xb8, 0x65, 0x2a, 0x33, 0x39, 0x82, 0x16, 0x3c, 0x2b, 0x73, + 0x41, 0x2a, 0xbc, 0xc8, 0x36, 0x23, 0x2c, 0x23, 0x29, 0xce, 0x94, 0x8a, + 0xba, 0x13, 0x3f, 0xfa, 0xcc, 0x6e, 0x17, 0x40, 0xb0, 0xb4, 0x9c, 0xae, + 0xf9, 0x72, 0x40, 0xf7, 0x2a, 0xaf, 0x00, 0x17, 0x7e, 0x88, 0x44, 0x4e, + 0x32, 0xa4, 0x80, 0x96, 0x53, 0x20, 0xe4, 0xaa, 0x18, 0x9c, 0x69, 0x63, + 0xdb, 0x81, 0x73, 0x0e, 0x64, 0x40, 0x12, 0xc1, 0x26, 0x0e, 0xac, 0x4d, + 0x38, 0xa6, 0x4c, 0x05, 0xeb, 0x42, 0x1f, 0x88, 0x6e, 0x6c, 0xa3, 0x98, + 0x95, 0xc3, 0x5a, 0x31, 0xfc, 0x8a, 0x7c, 0xd6, 0xa5, 0x6c, 0x5a, 0xc9, + 0xf5, 0xf2, 0xb0, 0xf7, 0x05, 0x8d, 0x3a, 0x0b, 0x3f, 0x4d, 0x1b, 0x0e, + 0x38, 0x7b, 0x1c, 0xe5, 0xd0, 0x75, 0x40, 0x77, 0xc2, 0xe0, 0x8b, 0xa9, + 0xfb, 0xa6, 0xa7, 0x13, 0xf0, 0xbc, 0x0c, 0x13, 0xb0, 0x70, 0x3a, 0x2f, + 0x20, 0xe7, 0x38, 0xd8, 0x82, 0xa4, 0xc0, 0x06, 0x96, 0x95, 0x75, 0x5c, + 0xbc, 0x68, 0xce, 0xee, 0xa6, 0x44, 0xbb, 0xf2, 0x62, 0x14, 0x43, 0x10, + 0xe6, 0x0b, 0x5c, 0x66, 0x02, 0xbf, 0x38, 0xe9, 0x8f, 0x19, 0x69, 0x6c, + 0x08, 0xfa, 0x93, 0x23, 0xf1, 0x35, 0x5f, 0x33, 0xcf, 0xe2, 0x32, 0x11, + 0xda, 0x31, 0xbb, 0xd8, 0xf5, 0x7e, 0x55, 0xc3, 0x6a, 0xa3, 0x04, 0x8c, + 0xef, 0x50, 0x67, 0x0f, 0xe9, 0xbb, 0x11, 0x6b, 0xca, 0x5a, 0x23, 0x15, + 0xf7, 0x52, 0x8c, 0x69, 0xbb, 0x2d, 0x26, 0xd3, 0x78, 0x32, 0x33, 0x9b, + 0x1d, 0x02, 0x7c, 0x22, 0x4b, 0x57, 0x19, 0xec, 0xcd, 0x16, 0x63, 0x5d, + 0x1f, 0x0c, 0x67, 0xda, 0x6a, 0xc7, 0x2c, 0xf1, 0x0b, 0xcf, 0x92, 0x10, + 0xcb, 0x78, 0x57, 0x67, 0x61, 0x0e, 0xb3, 0x55, 0xb0, 0x17, 0x32, 0x4e, + 0x95, 0xf1, 0xcb, 0xb5, 0xdb, 0x5d, 0x8e, 0x38, 0x97, 0x28, 0x46, 0x1e, + 0x37, 0x4b, 0x5c, 0x03, 0xf4, 0x84, 0x0c, 0xfc, 0x49, 0x3b, 0xd2, 0xc6, + 0x97, 0xd4, 0x4f, 0xef, 0xe6, 0x5c, 0xb2, 0x96, 0x8e, 0xbc, 0x18, 0x9f, + 0xec, 0x8a, 0x67, 0x53, 0xe8, 0x2a, 0x41, 0x5c, 0x6e, 0xfe, 0xca, 0x1a, + 0xd0, 0x03, 0x4f, 0xe8, 0xd9, 0xb0, 0xe2, 0xa1, 0x6c, 0x32, 0xd5, 0x37, + 0x68, 0x9c, 0x6b, 0x30, 
0x87, 0xb4, 0x6e, 0xa8, 0xc5, 0x45, 0xe4, 0x74, + 0x4a, 0x80, 0x02, 0x84, 0x78, 0x0e, 0x2c, 0x9c, 0x37, 0x07, 0x26, 0x31, + 0x88, 0x0a, 0x89, 0xd2, 0xba, 0x8b, 0x09, 0xb2, 0x16, 0x7b, 0x23, 0xa8, + 0x0c, 0x82, 0xc4, 0x22, 0xa2, 0x21, 0x2d, 0xa4, 0xa0, 0xeb, 0xe4, 0xa6, + 0xd4, 0x16, 0xa0, 0xa7, 0x19, 0x8b, 0x8f, 0x87, 0x9b, 0x53, 0x6c, 0xba, + 0x14, 0xa7, 0xc4, 0x64, 0x08, 0xb4, 0x68, 0xda, 0x85, 0xa6, 0x47, 0xbc, + 0x58, 0x34, 0xcd, 0x60, 0x43, 0xad, 0x14, 0xd4, 0xca, 0xd7, 0xb1, 0x6c, + 0xc0, 0x05, 0x68, 0xaf, 0xa7, 0x08, 0x9d, 0x6a, 0xb0, 0x70, 0x96, 0x81, + 0xb8, 0x55, 0xbf, 0xa1, 0xcc, 0xb1, 0x37, 0xe0, 0x94, 0x42, 0x05, 0x0c, + 0x83, 0xef, 0x6d, 0xf0, 0x4e, 0xba, 0xe6, 0x78, 0x49, 0xaf, 0x1f, 0x88, + 0xf0, 0x18, 0x0c, 0x17, 0x66, 0x92, 0xd8, 0x50, 0x57, 0x3b, 0x8e, 0xc9, + 0xd5, 0x8c, 0xfb, 0x9b, 0x78, 0x65, 0x01, 0x76}; + +const uint8_t kExpectedPrivateKeyBytes[2400] = { + 0xd8, 0xc9, 0x39, 0x7c, 0x31, 0x30, 0xd8, 0xec, 0xb4, 0x11, 0xa6, 0x8e, + 0xfc, 0xc8, 0x9a, 0x55, 0x3c, 0xb7, 0xe6, 0x81, 0x7e, 0x02, 0x88, 0xbd, + 0x06, 0x91, 0x60, 0x9b, 0xf5, 0x57, 0x6a, 0xf8, 0x73, 0x9e, 0x52, 0xa8, + 0x03, 0xf0, 0x97, 0xb4, 0x22, 0xb7, 0x02, 0x23, 0xcd, 0x3e, 0xa1, 0x03, + 0x20, 0x66, 0x6e, 0x20, 0x20, 0xcb, 0x42, 0xec, 0xb6, 0xc2, 0x14, 0x2c, + 0xe7, 0xa8, 0x88, 0x3d, 0xd2, 0x26, 0x9c, 0xf7, 0xb3, 0x4f, 0x00, 0x27, + 0x61, 0xc6, 0x10, 0xb2, 0x72, 0x34, 0x39, 0xf1, 0x79, 0x03, 0xcc, 0x06, + 0xd2, 0xe4, 0x29, 0x57, 0x91, 0x2f, 0x0d, 0x45, 0x19, 0x30, 0x27, 0x47, + 0xcc, 0xba, 0xbe, 0x19, 0x01, 0x1a, 0xbb, 0xd8, 0xcb, 0x74, 0xca, 0x4a, + 0xfa, 0x32, 0x2f, 0xb8, 0xbb, 0x5c, 0x19, 0x2a, 0xbf, 0x31, 0x0a, 0x82, + 0xf3, 0x29, 0x74, 0x30, 0xa4, 0x95, 0x4c, 0x42, 0x59, 0x8c, 0xb3, 0xcb, + 0xfe, 0xa9, 0xb4, 0x2e, 0x60, 0x71, 0x64, 0x51, 0x6d, 0xf9, 0x73, 0x5b, + 0xf5, 0x59, 0x9b, 0xf5, 0xd9, 0x9d, 0x3f, 0x86, 0x3a, 0x50, 0x6c, 0x23, + 0x8c, 0x96, 0xc0, 0x5e, 0x7c, 0x48, 0x43, 0xec, 0x08, 0xf5, 0xf7, 0x7e, + 0x6b, 0x54, 0x84, 0xb2, 0x6c, 0x52, 0xd5, 0x2b, 0x38, 0x85, 0xf4, 
0x40, + 0x36, 0x09, 0x2f, 0xe6, 0xd0, 0x9e, 0xe6, 0xb6, 0x96, 0x7a, 0xe1, 0xc3, + 0xf7, 0xfa, 0x89, 0xc6, 0x75, 0x6f, 0x65, 0xea, 0x4a, 0x6d, 0x55, 0x33, + 0xe2, 0xc6, 0x30, 0xfb, 0x7b, 0xbd, 0xcb, 0x33, 0x4d, 0xd6, 0x84, 0x1b, + 0x28, 0x77, 0x60, 0xbf, 0x4b, 0x82, 0xd0, 0xb0, 0xc5, 0x88, 0x0c, 0x51, + 0x00, 0x22, 0x5e, 0xd9, 0x41, 0x2e, 0x7e, 0x43, 0xa6, 0x9d, 0x34, 0x5d, + 0x24, 0x80, 0xad, 0x7f, 0xd3, 0xae, 0xc7, 0x5a, 0x51, 0x74, 0x06, 0xc0, + 0x6f, 0x04, 0xae, 0x9b, 0x79, 0x6e, 0x35, 0xfa, 0x0d, 0xf3, 0xda, 0xa4, + 0xfa, 0x38, 0x4a, 0xee, 0xb3, 0xa4, 0x87, 0xa7, 0x27, 0xe2, 0x02, 0x2a, + 0x29, 0x27, 0xcf, 0xca, 0x4a, 0x20, 0x01, 0x15, 0x3d, 0x12, 0xc7, 0xcc, + 0x74, 0x99, 0x17, 0xa6, 0x7b, 0x04, 0xd7, 0xea, 0x64, 0x88, 0xa3, 0x54, + 0xb5, 0xc2, 0xa3, 0xea, 0x69, 0x94, 0xb9, 0xc3, 0x45, 0xf8, 0x2b, 0x3c, + 0xe5, 0xc1, 0x25, 0xf6, 0xc9, 0x82, 0xe3, 0x45, 0x25, 0xca, 0x71, 0x3d, + 0x73, 0x10, 0x99, 0xa5, 0xb5, 0x79, 0xdc, 0x61, 0x8f, 0x04, 0x3c, 0x61, + 0x6b, 0x90, 0x4d, 0x83, 0xa3, 0x0d, 0xb4, 0xba, 0xb7, 0x3b, 0x81, 0xca, + 0xbb, 0x59, 0x16, 0x46, 0xf4, 0x75, 0x7c, 0xaa, 0x0f, 0x90, 0xec, 0x37, + 0xa4, 0x50, 0xa8, 0x41, 0xb4, 0x33, 0xd2, 0xb7, 0x28, 0xe1, 0xda, 0x74, + 0x36, 0x3a, 0xa6, 0x96, 0x76, 0x06, 0x67, 0xd3, 0x8e, 0xda, 0xd5, 0x29, + 0x24, 0x1a, 0xbe, 0x82, 0x83, 0x6b, 0x3a, 0x90, 0xc1, 0x86, 0x34, 0x2d, + 0xd8, 0xa3, 0xa5, 0x7e, 0xe6, 0xca, 0x3f, 0xc5, 0xbf, 0xba, 0x24, 0x14, + 0xd2, 0xfc, 0x3f, 0x97, 0x8c, 0xca, 0x59, 0xa5, 0x52, 0xf1, 0x8c, 0x62, + 0x8d, 0x8a, 0x7d, 0x1b, 0x35, 0x76, 0x79, 0x98, 0x39, 0x57, 0x53, 0x1f, + 0x78, 0x82, 0x7d, 0xcd, 0x9c, 0xab, 0xb3, 0x12, 0x52, 0x86, 0x87, 0xbf, + 0x01, 0x12, 0x58, 0x45, 0x37, 0x75, 0x1a, 0xe0, 0x03, 0xbd, 0x67, 0x4c, + 0x76, 0xfa, 0x06, 0x59, 0xa2, 0x1f, 0x48, 0x73, 0x7f, 0x0c, 0x90, 0xc1, + 0x82, 0xc0, 0x98, 0x2f, 0x6a, 0x1c, 0x6f, 0x6c, 0x98, 0x70, 0x16, 0x8e, + 0x8c, 0x5b, 0x60, 0x92, 0x42, 0x59, 0xd6, 0xb2, 0x13, 0x7a, 0x11, 0xb6, + 0x15, 0x1b, 0x07, 0x50, 0x52, 0x73, 0x86, 0xbc, 0x3a, 0xff, 0x66, 
0x71, + 0x57, 0xf8, 0x89, 0x7a, 0x3a, 0x95, 0xc5, 0xa1, 0x31, 0x48, 0x15, 0xc9, + 0xe0, 0x4b, 0x41, 0xb6, 0x07, 0x5a, 0x70, 0x88, 0x11, 0xce, 0xdc, 0xa7, + 0x84, 0x23, 0x85, 0x96, 0xd8, 0x20, 0x9f, 0x89, 0x42, 0xbc, 0x0b, 0xd0, + 0xd9, 0x11, 0x9f, 0x2f, 0x71, 0x37, 0xa2, 0x10, 0x98, 0x95, 0xa7, 0x1f, + 0x99, 0x48, 0x8e, 0x5b, 0x40, 0x65, 0x9f, 0x08, 0x4e, 0x04, 0x16, 0x35, + 0xc8, 0x56, 0xa1, 0x02, 0xa1, 0xaf, 0x0e, 0x5b, 0xaf, 0x02, 0x66, 0x2e, + 0xc8, 0xa5, 0x49, 0xe7, 0xa4, 0x6b, 0x03, 0xe2, 0x42, 0xb3, 0x37, 0x76, + 0x2c, 0x89, 0x00, 0x76, 0x83, 0xb3, 0x93, 0xdc, 0x78, 0x56, 0xc9, 0x02, + 0xf1, 0xd3, 0x92, 0xe5, 0x14, 0x36, 0xba, 0x37, 0x1d, 0x86, 0x87, 0x81, + 0x5f, 0x30, 0x16, 0x7f, 0xdb, 0x76, 0xe2, 0x46, 0x3d, 0x32, 0xe8, 0x47, + 0x35, 0x5b, 0x68, 0x39, 0x70, 0xa3, 0xc2, 0xb8, 0xb3, 0x6e, 0xd0, 0x22, + 0x6b, 0x39, 0x9c, 0xd9, 0xa3, 0x93, 0x69, 0xbc, 0xcb, 0xe0, 0xbc, 0x82, + 0x1b, 0xfb, 0x9f, 0x72, 0x17, 0x40, 0x72, 0xeb, 0xb2, 0x97, 0x31, 0x66, + 0x18, 0x48, 0x9a, 0xe5, 0x3a, 0x2a, 0x41, 0x13, 0x3e, 0xfc, 0xfb, 0xb4, + 0xb0, 0x96, 0x6e, 0x53, 0x37, 0x18, 0xbb, 0x26, 0x1d, 0xbe, 0x33, 0x0f, + 0x06, 0x8b, 0x8c, 0xb1, 0xfc, 0x8c, 0x24, 0x19, 0x8d, 0x7d, 0x76, 0x52, + 0x35, 0x75, 0x71, 0x36, 0x61, 0x7d, 0x75, 0x3a, 0x31, 0x9a, 0x16, 0x05, + 0x2d, 0x51, 0x63, 0x6f, 0x07, 0x64, 0xfd, 0x80, 0x16, 0x2d, 0xa3, 0x3a, + 0x15, 0xca, 0xaa, 0x76, 0xec, 0x2f, 0x91, 0x47, 0xb2, 0x29, 0x61, 0x04, + 0x80, 0x20, 0x8d, 0x3c, 0x31, 0xa6, 0x4f, 0x8b, 0xc8, 0xc0, 0x71, 0xbb, + 0x0a, 0xec, 0xbc, 0xdf, 0xdc, 0x91, 0x8c, 0xb9, 0x80, 0xb7, 0x67, 0xa9, + 0x7f, 0x30, 0x18, 0x4e, 0xd3, 0x68, 0x27, 0x50, 0x15, 0xad, 0x8c, 0x6a, + 0xab, 0xc9, 0xbe, 0xad, 0x41, 0x4b, 0xcd, 0x23, 0x73, 0x9f, 0x77, 0x58, + 0xe3, 0xd8, 0x2e, 0x16, 0xcb, 0xa0, 0x17, 0x89, 0x51, 0x31, 0x53, 0x01, + 0xa8, 0xf1, 0x9b, 0x61, 0xc3, 0x0a, 0x5b, 0x0c, 0x70, 0x36, 0x48, 0x1b, + 0xae, 0x17, 0x51, 0x1f, 0xc7, 0x7b, 0xce, 0xd4, 0x45, 0xff, 0xea, 0xb0, + 0x5f, 0x48, 0x07, 0x4f, 0x79, 0x26, 0x5c, 0xb6, 0xce, 0x4c, 0xc4, 
0x02, + 0xd1, 0x31, 0x88, 0xa9, 0xc0, 0x53, 0x6a, 0x42, 0x6f, 0x9c, 0x32, 0xaa, + 0x86, 0xec, 0x25, 0xa7, 0x5b, 0x31, 0x68, 0xc9, 0xa9, 0xd8, 0x2c, 0x70, + 0xa0, 0x02, 0x31, 0x52, 0x8a, 0x2c, 0xc0, 0x0a, 0xbe, 0x8c, 0x59, 0x6c, + 0x72, 0x42, 0x21, 0xcb, 0x44, 0x07, 0x02, 0x27, 0xa0, 0x71, 0xa3, 0x7f, + 0xe3, 0x97, 0x24, 0x70, 0x47, 0x4d, 0x10, 0xfc, 0x54, 0x83, 0xdc, 0x44, + 0x09, 0x36, 0x23, 0x4a, 0x26, 0x95, 0x60, 0x96, 0xa4, 0x11, 0x1b, 0x18, + 0x9d, 0x01, 0x9c, 0x8c, 0xd7, 0x37, 0x60, 0x08, 0x02, 0xef, 0xe3, 0x81, + 0xf3, 0x48, 0x5c, 0x9c, 0xac, 0x4f, 0xb6, 0xc7, 0x66, 0xd6, 0xda, 0x25, + 0x78, 0x4a, 0xca, 0x91, 0xf1, 0x16, 0x75, 0x69, 0x8b, 0x09, 0x63, 0x0d, + 0x74, 0x49, 0x57, 0x33, 0xe1, 0x23, 0x87, 0xc5, 0x0e, 0x85, 0x27, 0x1e, + 0x5a, 0xfa, 0x3f, 0xa8, 0x82, 0xbc, 0x49, 0xc7, 0x99, 0x15, 0xe1, 0x2f, + 0x04, 0x36, 0x88, 0xd4, 0xa1, 0x9a, 0xe5, 0xb3, 0x70, 0x95, 0x85, 0x43, + 0x76, 0xb9, 0x7d, 0x3b, 0x9c, 0x61, 0xf3, 0x30, 0x59, 0x4e, 0x2a, 0xc6, + 0x3f, 0x52, 0x35, 0x81, 0x43, 0x61, 0x72, 0xc0, 0x89, 0x98, 0x67, 0x1e, + 0x6f, 0xec, 0xc6, 0x2a, 0x9a, 0x14, 0x36, 0x34, 0x4c, 0x9f, 0xfb, 0x2e, + 0xd1, 0xd2, 0x4e, 0x6b, 0xb9, 0xc9, 0x56, 0x74, 0xbe, 0xab, 0xa4, 0x7c, + 0x48, 0x84, 0x07, 0xf6, 0x04, 0xb1, 0x89, 0x0b, 0x13, 0x79, 0x13, 0xb5, + 0x31, 0xd6, 0x2c, 0x40, 0x47, 0xbd, 0xba, 0x37, 0xb1, 0x65, 0x7c, 0xb3, + 0xa3, 0x45, 0x1e, 0x0f, 0x41, 0x62, 0x07, 0xf9, 0x1e, 0x25, 0xa5, 0x78, + 0x47, 0xd2, 0xa6, 0x4b, 0x20, 0x7b, 0x9b, 0x39, 0x07, 0x09, 0xf3, 0x9e, + 0x09, 0xfb, 0xac, 0x87, 0xb6, 0xb5, 0x9b, 0xd8, 0x09, 0xa3, 0xd6, 0x69, + 0xb3, 0x3a, 0x87, 0x09, 0xb0, 0x19, 0xb3, 0xc1, 0x47, 0x8e, 0xd7, 0x3a, + 0x6e, 0xb5, 0x48, 0x89, 0x56, 0x87, 0x60, 0xb1, 0xba, 0x52, 0xd5, 0x94, + 0x22, 0x54, 0xbd, 0x7c, 0xf1, 0xa4, 0x54, 0xaa, 0x38, 0x5c, 0x76, 0xa8, + 0x52, 0x19, 0x97, 0x9a, 0x71, 0x47, 0x6c, 0xa0, 0x39, 0xd1, 0x0b, 0xd0, + 0xa6, 0xb3, 0x65, 0xdc, 0xbc, 0xa4, 0x4d, 0x62, 0x44, 0xb2, 0x47, 0x5e, + 0x77, 0x77, 0x32, 0x2f, 0x78, 0xa6, 0x97, 0xdb, 0xb5, 0xba, 0x3b, 
0x4a, + 0xcb, 0x0b, 0x33, 0xa2, 0xac, 0xc4, 0x94, 0x56, 0x1a, 0x36, 0x1c, 0x0d, + 0xa3, 0x57, 0x5b, 0x63, 0x6c, 0x0d, 0x1f, 0xe1, 0xa8, 0x36, 0xd5, 0x5d, + 0x38, 0x5c, 0xac, 0xf4, 0x67, 0x39, 0x9a, 0x12, 0x5b, 0xef, 0xa1, 0x0a, + 0x57, 0x76, 0x87, 0x83, 0xaa, 0xaf, 0x1d, 0x9c, 0xce, 0x2b, 0x78, 0x9b, + 0x40, 0x82, 0x1e, 0x34, 0xb3, 0x55, 0x2d, 0xd0, 0x20, 0xcd, 0x5b, 0x46, + 0x8d, 0xda, 0xb8, 0x83, 0xa8, 0xab, 0xfb, 0x36, 0x45, 0xcb, 0x40, 0xae, + 0x69, 0x77, 0x03, 0x22, 0xa8, 0xb4, 0xaa, 0x17, 0x53, 0x90, 0x05, 0x6a, + 0x69, 0xb9, 0xc5, 0x2d, 0xbc, 0x9a, 0x81, 0xf6, 0x2a, 0x3c, 0x57, 0x96, + 0x8c, 0xb2, 0x72, 0xa7, 0xd4, 0x84, 0x77, 0xf9, 0x0b, 0x80, 0xe7, 0x1e, + 0x75, 0x61, 0x0e, 0xf9, 0xb0, 0xcf, 0x4c, 0x03, 0xca, 0x30, 0x45, 0x50, + 0x25, 0x82, 0x56, 0x02, 0xbb, 0x58, 0xab, 0x99, 0xc7, 0x8d, 0xd5, 0x37, + 0x12, 0xc8, 0x36, 0x5b, 0x00, 0x5f, 0x91, 0xf6, 0x2d, 0x9b, 0x91, 0xc2, + 0x3e, 0x75, 0x56, 0x83, 0xd2, 0xaa, 0xd4, 0x03, 0xcd, 0xe8, 0xe8, 0x3f, + 0x09, 0xc1, 0x8e, 0x84, 0xb1, 0x75, 0x85, 0x06, 0xa7, 0x4a, 0x40, 0xc2, + 0xc9, 0x90, 0xb0, 0x0f, 0x59, 0x84, 0xe5, 0xf7, 0xa8, 0xd8, 0x35, 0x33, + 0xa5, 0x82, 0x4e, 0xf6, 0xb5, 0xaf, 0xd5, 0xaa, 0xbb, 0x52, 0x30, 0x0b, + 0xb1, 0x67, 0x7b, 0xbb, 0xdc, 0x38, 0x58, 0xf4, 0x2f, 0xe4, 0xeb, 0x47, + 0xf0, 0x49, 0x76, 0x45, 0x10, 0x02, 0x19, 0xa7, 0xc6, 0x1f, 0x8a, 0x71, + 0xda, 0x51, 0x61, 0x79, 0xc4, 0x10, 0xeb, 0x3a, 0x11, 0xd9, 0x93, 0x1d, + 0x83, 0x81, 0x48, 0x3d, 0x68, 0x35, 0xff, 0xe3, 0xa6, 0xa8, 0x40, 0x74, + 0xdb, 0x7c, 0xb4, 0xb2, 0x05, 0x89, 0xc8, 0x09, 0x42, 0x27, 0x28, 0x55, + 0x01, 0x57, 0xc9, 0xf9, 0xfa, 0x36, 0xd3, 0x5c, 0x51, 0x40, 0xca, 0x1e, + 0x34, 0xc8, 0xb9, 0x27, 0xf1, 0x15, 0xc9, 0xa4, 0x18, 0x9c, 0xa8, 0x4c, + 0x55, 0xf8, 0x4d, 0x90, 0xbc, 0x88, 0xd1, 0x46, 0x01, 0x80, 0x96, 0x54, + 0x8d, 0x01, 0x6e, 0xe4, 0x26, 0x0c, 0x7a, 0x08, 0x3d, 0x7c, 0x26, 0x66, + 0xb1, 0xd1, 0x15, 0xc7, 0xc0, 0x0b, 0x06, 0x92, 0x9c, 0x26, 0xa6, 0x6c, + 0x3a, 0x46, 0x47, 0x15, 0x37, 0xb0, 0x0a, 0xc1, 0x5d, 0xe0, 0x51, 
0xae, + 0x17, 0x87, 0x2f, 0x75, 0x2c, 0x64, 0xd5, 0x39, 0x09, 0xe6, 0xd0, 0x7c, + 0x3d, 0x79, 0x47, 0x78, 0x0a, 0x82, 0xc9, 0xba, 0x37, 0x07, 0x93, 0x94, + 0xa4, 0x45, 0x7c, 0x07, 0x69, 0x24, 0x1d, 0xc9, 0x75, 0x41, 0x6b, 0xc1, + 0x42, 0x28, 0x06, 0xe3, 0xfc, 0x7b, 0xd2, 0xf8, 0xbf, 0xe7, 0x5a, 0xc2, + 0xc0, 0x81, 0x63, 0xf3, 0x84, 0x32, 0x31, 0x4a, 0x8d, 0x6b, 0xe3, 0x3e, + 0x22, 0x67, 0x9a, 0x3a, 0x96, 0x09, 0xa6, 0xf9, 0x53, 0x40, 0x41, 0x2a, + 0x04, 0xa7, 0xc5, 0x7b, 0x86, 0xcd, 0xed, 0xec, 0xae, 0xfd, 0xc7, 0x11, + 0x48, 0x56, 0x77, 0x0b, 0xb7, 0xbb, 0xaf, 0xfb, 0x6c, 0x9e, 0xd6, 0x5c, + 0xbc, 0xf7, 0x9a, 0x75, 0xb5, 0xb2, 0x20, 0xd9, 0x60, 0x90, 0x65, 0x02, + 0xf9, 0x16, 0x34, 0xdf, 0x0b, 0x16, 0x2a, 0x97, 0x00, 0x5d, 0xc1, 0x65, + 0xc3, 0xb9, 0x1f, 0x2e, 0x85, 0x97, 0xbd, 0xca, 0xaa, 0x1d, 0xf2, 0x4f, + 0x74, 0xe5, 0x56, 0x94, 0xdc, 0x53, 0xbb, 0xc0, 0x75, 0x1c, 0x2b, 0x59, + 0x6b, 0x50, 0x2c, 0x25, 0x4c, 0x78, 0xd7, 0x68, 0x40, 0x0b, 0x91, 0x7a, + 0x93, 0x02, 0x83, 0x5e, 0x6c, 0x03, 0xc7, 0xc4, 0x24, 0x6f, 0x8b, 0xa4, + 0x70, 0x37, 0x1f, 0xe5, 0x33, 0x59, 0xf6, 0xbb, 0x5f, 0xb8, 0x58, 0x50, + 0x62, 0x16, 0x95, 0x2a, 0x88, 0xa3, 0xc6, 0x24, 0x94, 0x4b, 0xb0, 0x03, + 0x8e, 0xe6, 0x50, 0xd1, 0x63, 0xa2, 0xab, 0x00, 0x16, 0xf4, 0x0a, 0x7d, + 0xe6, 0xe0, 0xb9, 0x0c, 0xd8, 0x08, 0xae, 0xf6, 0x80, 0xb7, 0x48, 0xb8, + 0x10, 0xd1, 0x77, 0x57, 0x48, 0x2a, 0xc1, 0xf1, 0x4e, 0x61, 0xb4, 0x06, + 0x5a, 0xa1, 0xba, 0x32, 0x4a, 0x9a, 0xab, 0x8c, 0xa8, 0xe3, 0xa0, 0x68, + 0x39, 0xa3, 0x25, 0x55, 0x48, 0xcd, 0x12, 0x98, 0x14, 0x1b, 0x39, 0x5b, + 0xf4, 0x88, 0xbe, 0x5f, 0xac, 0xba, 0x18, 0x20, 0x74, 0x2c, 0xa1, 0x0b, + 0xfc, 0xf1, 0x00, 0x85, 0x24, 0x27, 0x3d, 0x80, 0xb6, 0x30, 0x42, 0x7b, + 0x74, 0xb0, 0xbf, 0x87, 0x51, 0x9b, 0xa8, 0x1c, 0x79, 0xbd, 0x50, 0x85, + 0x66, 0xe1, 0x93, 0xdd, 0xc5, 0x5d, 0x5d, 0xd0, 0xbe, 0xcb, 0x90, 0x36, + 0xf8, 0xc6, 0xc2, 0xea, 0xea, 0x40, 0xce, 0xa7, 0xb0, 0xae, 0xc5, 0x44, + 0x3a, 0x27, 0xc5, 0x4a, 0xf0, 0x21, 0x4f, 0xf6, 0x63, 0xb1, 0x8c, 
0x20, + 0x76, 0x65, 0xb5, 0x79, 0x55, 0x10, 0xa5, 0xa7, 0x4b, 0x57, 0x96, 0xb3, + 0x03, 0xe6, 0xa8, 0x93, 0xf5, 0xc5, 0xca, 0xe0, 0x1a, 0x23, 0x30, 0x6d, + 0x09, 0x07, 0x81, 0x5e, 0x05, 0xc6, 0x1b, 0x7a, 0x0e, 0xde, 0xf3, 0x00, + 0x91, 0xf3, 0xbb, 0x4b, 0xa6, 0x8e, 0x96, 0xc3, 0x3a, 0xc8, 0x72, 0x19, + 0x23, 0x23, 0xc7, 0x94, 0x46, 0x75, 0xa2, 0xc5, 0x9e, 0xa3, 0x15, 0x86, + 0xb4, 0xa7, 0x95, 0xda, 0x16, 0xae, 0x42, 0xe8, 0xcc, 0xf6, 0x41, 0xbc, + 0x9c, 0xe8, 0x07, 0xc9, 0x05, 0x3c, 0x74, 0xb2, 0xad, 0x77, 0xc4, 0x08, + 0x56, 0x78, 0xb8, 0x65, 0x2a, 0x33, 0x39, 0x82, 0x16, 0x3c, 0x2b, 0x73, + 0x41, 0x2a, 0xbc, 0xc8, 0x36, 0x23, 0x2c, 0x23, 0x29, 0xce, 0x94, 0x8a, + 0xba, 0x13, 0x3f, 0xfa, 0xcc, 0x6e, 0x17, 0x40, 0xb0, 0xb4, 0x9c, 0xae, + 0xf9, 0x72, 0x40, 0xf7, 0x2a, 0xaf, 0x00, 0x17, 0x7e, 0x88, 0x44, 0x4e, + 0x32, 0xa4, 0x80, 0x96, 0x53, 0x20, 0xe4, 0xaa, 0x18, 0x9c, 0x69, 0x63, + 0xdb, 0x81, 0x73, 0x0e, 0x64, 0x40, 0x12, 0xc1, 0x26, 0x0e, 0xac, 0x4d, + 0x38, 0xa6, 0x4c, 0x05, 0xeb, 0x42, 0x1f, 0x88, 0x6e, 0x6c, 0xa3, 0x98, + 0x95, 0xc3, 0x5a, 0x31, 0xfc, 0x8a, 0x7c, 0xd6, 0xa5, 0x6c, 0x5a, 0xc9, + 0xf5, 0xf2, 0xb0, 0xf7, 0x05, 0x8d, 0x3a, 0x0b, 0x3f, 0x4d, 0x1b, 0x0e, + 0x38, 0x7b, 0x1c, 0xe5, 0xd0, 0x75, 0x40, 0x77, 0xc2, 0xe0, 0x8b, 0xa9, + 0xfb, 0xa6, 0xa7, 0x13, 0xf0, 0xbc, 0x0c, 0x13, 0xb0, 0x70, 0x3a, 0x2f, + 0x20, 0xe7, 0x38, 0xd8, 0x82, 0xa4, 0xc0, 0x06, 0x96, 0x95, 0x75, 0x5c, + 0xbc, 0x68, 0xce, 0xee, 0xa6, 0x44, 0xbb, 0xf2, 0x62, 0x14, 0x43, 0x10, + 0xe6, 0x0b, 0x5c, 0x66, 0x02, 0xbf, 0x38, 0xe9, 0x8f, 0x19, 0x69, 0x6c, + 0x08, 0xfa, 0x93, 0x23, 0xf1, 0x35, 0x5f, 0x33, 0xcf, 0xe2, 0x32, 0x11, + 0xda, 0x31, 0xbb, 0xd8, 0xf5, 0x7e, 0x55, 0xc3, 0x6a, 0xa3, 0x04, 0x8c, + 0xef, 0x50, 0x67, 0x0f, 0xe9, 0xbb, 0x11, 0x6b, 0xca, 0x5a, 0x23, 0x15, + 0xf7, 0x52, 0x8c, 0x69, 0xbb, 0x2d, 0x26, 0xd3, 0x78, 0x32, 0x33, 0x9b, + 0x1d, 0x02, 0x7c, 0x22, 0x4b, 0x57, 0x19, 0xec, 0xcd, 0x16, 0x63, 0x5d, + 0x1f, 0x0c, 0x67, 0xda, 0x6a, 0xc7, 0x2c, 0xf1, 0x0b, 0xcf, 0x92, 
0x10, + 0xcb, 0x78, 0x57, 0x67, 0x61, 0x0e, 0xb3, 0x55, 0xb0, 0x17, 0x32, 0x4e, + 0x95, 0xf1, 0xcb, 0xb5, 0xdb, 0x5d, 0x8e, 0x38, 0x97, 0x28, 0x46, 0x1e, + 0x37, 0x4b, 0x5c, 0x03, 0xf4, 0x84, 0x0c, 0xfc, 0x49, 0x3b, 0xd2, 0xc6, + 0x97, 0xd4, 0x4f, 0xef, 0xe6, 0x5c, 0xb2, 0x96, 0x8e, 0xbc, 0x18, 0x9f, + 0xec, 0x8a, 0x67, 0x53, 0xe8, 0x2a, 0x41, 0x5c, 0x6e, 0xfe, 0xca, 0x1a, + 0xd0, 0x03, 0x4f, 0xe8, 0xd9, 0xb0, 0xe2, 0xa1, 0x6c, 0x32, 0xd5, 0x37, + 0x68, 0x9c, 0x6b, 0x30, 0x87, 0xb4, 0x6e, 0xa8, 0xc5, 0x45, 0xe4, 0x74, + 0x4a, 0x80, 0x02, 0x84, 0x78, 0x0e, 0x2c, 0x9c, 0x37, 0x07, 0x26, 0x31, + 0x88, 0x0a, 0x89, 0xd2, 0xba, 0x8b, 0x09, 0xb2, 0x16, 0x7b, 0x23, 0xa8, + 0x0c, 0x82, 0xc4, 0x22, 0xa2, 0x21, 0x2d, 0xa4, 0xa0, 0xeb, 0xe4, 0xa6, + 0xd4, 0x16, 0xa0, 0xa7, 0x19, 0x8b, 0x8f, 0x87, 0x9b, 0x53, 0x6c, 0xba, + 0x14, 0xa7, 0xc4, 0x64, 0x08, 0xb4, 0x68, 0xda, 0x85, 0xa6, 0x47, 0xbc, + 0x58, 0x34, 0xcd, 0x60, 0x43, 0xad, 0x14, 0xd4, 0xca, 0xd7, 0xb1, 0x6c, + 0xc0, 0x05, 0x68, 0xaf, 0xa7, 0x08, 0x9d, 0x6a, 0xb0, 0x70, 0x96, 0x81, + 0xb8, 0x55, 0xbf, 0xa1, 0xcc, 0xb1, 0x37, 0xe0, 0x94, 0x42, 0x05, 0x0c, + 0x83, 0xef, 0x6d, 0xf0, 0x4e, 0xba, 0xe6, 0x78, 0x49, 0xaf, 0x1f, 0x88, + 0xf0, 0x18, 0x0c, 0x17, 0x66, 0x92, 0xd8, 0x50, 0x57, 0x3b, 0x8e, 0xc9, + 0xd5, 0x8c, 0xfb, 0x9b, 0x78, 0x65, 0x01, 0x76, 0x7a, 0x0e, 0xf0, 0x7f, + 0x2b, 0xc4, 0x48, 0x81, 0xc2, 0xb7, 0xc1, 0x6e, 0x4b, 0xc7, 0x33, 0xa3, + 0x9b, 0xf1, 0xcf, 0x63, 0x77, 0x52, 0x07, 0xe1, 0x9f, 0xa9, 0x6b, 0xce, + 0x02, 0xbf, 0x06, 0x8d, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00}; + +const uint8_t kExpectedCiphertext[MLKEM768_CIPHERTEXT_BYTES] = { + 0x8c, 0xfa, 0xd2, 0x09, 0x06, 0x1b, 0xff, 0x23, 0x9f, 0x84, 0x61, 0xfd, + 0x69, 0x7a, 0xa6, 0x2c, 0x57, 0x43, 0x64, 0x6b, 0x6b, 0x6b, 0xa3, 0xe1, + 0xa9, 0x4f, 0x3e, 0x29, 0xff, 0x3d, 0xb1, 0x84, 0x51, 0x1d, 0x48, 0xe3, 
+ 0xf2, 0xf9, 0x5c, 0x9b, 0xf8, 0x05, 0xa1, 0x4c, 0xf4, 0x78, 0xce, 0xd8, + 0xc8, 0x0a, 0xc9, 0x71, 0xa7, 0x50, 0x89, 0xde, 0x66, 0x11, 0x68, 0x2c, + 0x70, 0x0f, 0xa3, 0x15, 0xa7, 0x8f, 0xa7, 0xd1, 0x49, 0x32, 0x2b, 0xe7, + 0xe3, 0x63, 0xff, 0x6b, 0x8c, 0x77, 0xd8, 0x97, 0x6f, 0x6c, 0x80, 0xc2, + 0x73, 0x7d, 0xe2, 0x39, 0x6f, 0x61, 0x4f, 0xa3, 0x85, 0x14, 0x8e, 0x68, + 0x58, 0x18, 0xba, 0x1c, 0x19, 0x01, 0x4c, 0x06, 0x73, 0x58, 0x4a, 0x9d, + 0xdd, 0x2e, 0x78, 0x82, 0x62, 0xbf, 0x15, 0x0c, 0x8e, 0xa7, 0xcf, 0xee, + 0xd7, 0x9a, 0x42, 0x30, 0xf5, 0x44, 0xdc, 0x3f, 0xdc, 0x67, 0x5c, 0x06, + 0xf0, 0xcf, 0x3b, 0x24, 0x9e, 0xfa, 0xe8, 0x4b, 0x3b, 0x00, 0x01, 0x7c, + 0x4d, 0x50, 0xa8, 0xac, 0x30, 0x74, 0xf4, 0x73, 0x98, 0x5e, 0x09, 0x92, + 0xbd, 0xe1, 0xc4, 0x3a, 0x9f, 0xd0, 0x62, 0xc8, 0x4e, 0x7f, 0xb0, 0xaa, + 0xb3, 0x8c, 0xb5, 0xf9, 0x57, 0xe3, 0x90, 0x9a, 0x94, 0x0d, 0xdb, 0x9e, + 0xf7, 0x78, 0xbf, 0x18, 0xd0, 0x02, 0x8e, 0x02, 0x04, 0xbe, 0xee, 0x87, + 0x49, 0xb1, 0xfe, 0x28, 0xd2, 0xdb, 0xd0, 0x7a, 0x12, 0x50, 0xa3, 0xc6, + 0x32, 0xd6, 0x06, 0xa6, 0xc5, 0xb0, 0xa8, 0xbe, 0x49, 0x8a, 0x8b, 0xfe, + 0xf6, 0xb1, 0xe5, 0xb6, 0xf0, 0x61, 0xc6, 0x1f, 0xe7, 0xcb, 0x4d, 0x66, + 0xdd, 0xe4, 0xd2, 0x2b, 0x73, 0xbd, 0x03, 0x79, 0x76, 0x05, 0x87, 0x58, + 0x6f, 0x6f, 0x8f, 0x20, 0x4f, 0x1f, 0x81, 0xa9, 0x9c, 0x22, 0x8d, 0xbd, + 0xef, 0x41, 0x5e, 0x39, 0x9e, 0x90, 0x4b, 0x63, 0x8b, 0x50, 0xac, 0x4f, + 0x20, 0x08, 0x82, 0x53, 0x52, 0x86, 0xce, 0x55, 0xf5, 0x21, 0xd7, 0x8c, + 0xdf, 0xf3, 0x54, 0x90, 0xed, 0x27, 0xa9, 0x95, 0xbb, 0xec, 0x7a, 0xe2, + 0xeb, 0x80, 0x29, 0xca, 0xc9, 0x4b, 0x1a, 0xfe, 0xe8, 0x40, 0xcf, 0x70, + 0x37, 0xb3, 0xb8, 0xb3, 0x04, 0xf6, 0xc3, 0x44, 0x1e, 0xf8, 0x40, 0x95, + 0xf9, 0x92, 0x97, 0x63, 0xab, 0x35, 0x11, 0x7b, 0x2d, 0x9e, 0x5b, 0xeb, + 0xfe, 0x80, 0xfe, 0xc3, 0xc6, 0xa4, 0x8c, 0xac, 0xd2, 0xa5, 0x0d, 0x11, + 0xcd, 0xbc, 0xdd, 0x57, 0x9d, 0xf4, 0xbc, 0xdf, 0xbe, 0xa2, 0xfc, 0xcd, + 0x2b, 0x83, 0x0e, 0x4e, 0x77, 0x44, 0x10, 0x93, 0xeb, 0xca, 0xec, 0xb6, + 
0x26, 0x48, 0x45, 0x08, 0x72, 0xd4, 0xfe, 0xa2, 0xe5, 0xc8, 0x12, 0x8f, + 0x38, 0xcd, 0xd6, 0xaa, 0x97, 0xd4, 0xad, 0xb3, 0x0b, 0x19, 0x5b, 0x70, + 0x50, 0xbf, 0x9d, 0x45, 0xd6, 0x0d, 0xf6, 0x72, 0x08, 0x0f, 0x98, 0xbd, + 0x1f, 0x39, 0xea, 0x4c, 0x76, 0xa8, 0x6e, 0xc2, 0x24, 0xe3, 0x3d, 0xf4, + 0x40, 0xe8, 0x68, 0xfe, 0xaf, 0xeb, 0xae, 0x65, 0x84, 0x86, 0x10, 0xb6, + 0x58, 0x75, 0xb9, 0x22, 0x6d, 0x74, 0x5d, 0xa7, 0xc0, 0x17, 0x10, 0x0f, + 0x36, 0x5f, 0x99, 0xf3, 0x60, 0x39, 0xeb, 0x6e, 0x04, 0x4d, 0x29, 0xc7, + 0xec, 0x2a, 0x1d, 0x8f, 0x5c, 0x23, 0x84, 0xba, 0x65, 0xe9, 0xab, 0x32, + 0xd4, 0x62, 0xd1, 0x53, 0xb8, 0x71, 0x40, 0x9a, 0xf0, 0x05, 0xdb, 0xa0, + 0x5a, 0xca, 0xeb, 0xb0, 0xbf, 0xfe, 0x7e, 0x19, 0x32, 0x42, 0xef, 0xab, + 0xf3, 0x49, 0x13, 0x50, 0x08, 0x98, 0xcd, 0xcf, 0x5a, 0x77, 0x5d, 0xe6, + 0x3f, 0x9d, 0xa0, 0x21, 0x98, 0xcb, 0x78, 0x69, 0xd8, 0x22, 0xdb, 0xae, + 0x87, 0x2c, 0x38, 0x0a, 0x96, 0xa2, 0x30, 0x8f, 0x37, 0xe4, 0xc5, 0x94, + 0x57, 0x40, 0x30, 0xa0, 0x4e, 0x7c, 0xfe, 0x6f, 0x2e, 0x15, 0x88, 0x7b, + 0xf2, 0x22, 0x9c, 0x95, 0x7e, 0xd6, 0x2d, 0x37, 0xd4, 0x13, 0x95, 0x31, + 0xf6, 0xbb, 0xf4, 0xaf, 0x33, 0x42, 0xf0, 0x45, 0xb1, 0xa8, 0xb4, 0x09, + 0x9a, 0x6d, 0x7a, 0xdb, 0xb0, 0xf2, 0x00, 0xdc, 0x77, 0x6a, 0x43, 0xf1, + 0xbb, 0x56, 0xc5, 0x7c, 0xee, 0xa6, 0xd6, 0x10, 0xa0, 0x81, 0xca, 0x28, + 0xab, 0x48, 0xb4, 0x72, 0xbc, 0xe6, 0xe2, 0x24, 0xef, 0x49, 0xdc, 0xb5, + 0x1f, 0x83, 0xe7, 0xda, 0xdd, 0x73, 0x5c, 0x7b, 0x85, 0x4a, 0x15, 0x7b, + 0xcb, 0xf1, 0x14, 0x6b, 0x32, 0xef, 0xce, 0xaf, 0x37, 0x04, 0xa4, 0x1e, + 0xcb, 0x1b, 0x84, 0x1d, 0xdb, 0xbf, 0x2f, 0x88, 0x89, 0xf4, 0x5e, 0xb0, + 0x32, 0x99, 0x81, 0x9c, 0x8a, 0xb0, 0xfd, 0x28, 0x6f, 0xc9, 0xe9, 0xaf, + 0x60, 0x11, 0xa5, 0x8d, 0xa4, 0xfb, 0x93, 0x91, 0x7a, 0xa0, 0xd2, 0xcd, + 0xda, 0x4d, 0xf3, 0xfe, 0xc8, 0x55, 0x4f, 0x26, 0x9d, 0x56, 0x87, 0x12, + 0xfe, 0x93, 0x3e, 0x34, 0xf5, 0x3d, 0x4f, 0x6e, 0x26, 0x56, 0xf7, 0x80, + 0xa2, 0xb9, 0x0f, 0xa1, 0xf8, 0x72, 0xb0, 0xaa, 0xec, 0xe2, 0x97, 0xd0, + 
0x3a, 0x6d, 0xe5, 0xe9, 0x12, 0x1b, 0x32, 0x0a, 0xdb, 0x52, 0x8f, 0x9d, + 0xd0, 0xff, 0x67, 0xcc, 0x63, 0x41, 0x32, 0x2f, 0xe7, 0x0c, 0xb0, 0xa5, + 0x73, 0xc5, 0xc3, 0x54, 0x75, 0x06, 0x8e, 0x36, 0x54, 0xea, 0x9c, 0x6b, + 0x60, 0xee, 0x10, 0x06, 0xd2, 0xb3, 0x8c, 0xe3, 0x09, 0xfe, 0x1e, 0x47, + 0xc3, 0xd5, 0x6c, 0x0d, 0x0c, 0x84, 0x5e, 0x4f, 0x01, 0xfa, 0xcf, 0xae, + 0xc1, 0xac, 0xd9, 0xb0, 0x03, 0x74, 0x8e, 0xc5, 0x57, 0x51, 0x96, 0x23, + 0x72, 0xc6, 0x81, 0x7f, 0xf1, 0x6c, 0x29, 0xea, 0x31, 0x2a, 0x23, 0xbb, + 0x88, 0x6f, 0x01, 0xa2, 0x2f, 0x69, 0xd2, 0x1c, 0xe8, 0x1e, 0x63, 0x44, + 0xd9, 0x90, 0x0b, 0x57, 0x78, 0xc6, 0xd1, 0xb3, 0xf6, 0x97, 0x99, 0xd8, + 0x09, 0xe0, 0x9d, 0x69, 0x00, 0xa0, 0xd4, 0xe0, 0x80, 0xfd, 0xdf, 0x23, + 0x3a, 0xf8, 0xc0, 0x97, 0xea, 0xb6, 0xbf, 0x87, 0x75, 0x40, 0x1e, 0x0d, + 0x1e, 0x6c, 0x84, 0xa4, 0x7e, 0xbb, 0xa3, 0x02, 0x76, 0x76, 0xc8, 0x4e, + 0x8b, 0x21, 0x3d, 0x27, 0xbc, 0x38, 0x0f, 0x79, 0xfb, 0xc5, 0xdd, 0x37, + 0x67, 0xcb, 0x61, 0x5a, 0x12, 0x85, 0x31, 0x91, 0x3e, 0x7a, 0x4b, 0x6c, + 0xfd, 0x4e, 0x54, 0x47, 0xfc, 0xc5, 0x2e, 0x34, 0xe4, 0xaa, 0x3d, 0xa2, + 0xd3, 0x1d, 0x15, 0xbc, 0xea, 0xa7, 0xe3, 0xcd, 0xd6, 0x49, 0xee, 0x06, + 0x11, 0x2b, 0xf1, 0x58, 0x81, 0xdd, 0x99, 0xe1, 0xbd, 0x63, 0xa0, 0xd7, + 0xa0, 0x4b, 0x01, 0xab, 0xf3, 0x5c, 0x35, 0xf0, 0xb0, 0xdc, 0xd0, 0x87, + 0x78, 0x02, 0xec, 0x99, 0x0e, 0xb2, 0x23, 0x8b, 0xb1, 0x44, 0x93, 0x65, + 0xd5, 0x1c, 0x00, 0x7e, 0x98, 0xb3, 0x5e, 0xf9, 0xca, 0xbc, 0x26, 0x38, + 0x18, 0x22, 0x14, 0x3c, 0xed, 0x8d, 0x54, 0xa4, 0x05, 0x00, 0x7f, 0xfb, + 0xd5, 0x73, 0x77, 0xf4, 0x98, 0xa0, 0xf7, 0x60, 0xb4, 0x47, 0x10, 0x75, + 0x30, 0x2d, 0xde, 0x9c, 0x3e, 0x08, 0x8f, 0xe5, 0xc9, 0x5a, 0xd9, 0x20, + 0xf3, 0x97, 0xb5, 0xd1, 0xb3, 0x90, 0x23, 0x6f, 0x9f, 0x5e, 0xf1, 0x0f, + 0x76, 0x18, 0xbf, 0x2b, 0x23, 0x8e, 0x45, 0x3f, 0xaf, 0x2b, 0x53, 0x78, + 0x27, 0xa2, 0xf6, 0x07, 0x2b, 0x61, 0x24, 0x5b, 0xc7, 0x2e, 0x25, 0xf1, + 0xb3, 0x4c, 0x50, 0xe8, 0x6d, 0xee, 0x56, 0x52, 0x37, 0xd0, 0x6e, 0xd6, + 
0xcb, 0xc8, 0x2a, 0xb1, 0xba, 0x49, 0xc7, 0x5a, 0x55, 0x3c, 0x6f, 0x16, + 0x64, 0x08, 0xa6, 0x46, 0x09, 0x37, 0x86, 0x0b, 0xe7, 0x7e, 0x2d, 0xf4, + 0x96, 0x80, 0x41, 0x77, 0x1a, 0xf9, 0xd2, 0xe0, 0xf3, 0x64, 0x0e, 0x3f, + 0x3d, 0xe1, 0xec, 0x63, 0x40, 0x10, 0x15, 0xcf, 0x4c, 0xc9, 0x1c, 0x9b, + 0x9f, 0xe8, 0x50, 0x27, 0xc2, 0x54, 0x44, 0x14, 0xb6, 0x2e, 0xe4, 0x53, + 0xf2, 0x60, 0x8a, 0xb6, 0x1c, 0x14, 0xb6, 0x25, 0xf2, 0x44, 0x59, 0xb7, + 0x67, 0x3b, 0x94, 0x88, 0x15, 0x70, 0x6f, 0xa8}; + +const uint8_t kExpectedSharedSecret[MLKEM_SHARED_SECRET_BYTES] = { + 0x7d, 0x9f, 0x1c, 0xb4, 0xae, 0x04, 0xd7, 0x5f, 0xa6, 0x57, 0x5a, + 0xe0, 0xe4, 0x29, 0xb5, 0x73, 0xa9, 0x74, 0xb7, 0xa1, 0x25, 0xbd, + 0xfb, 0x8a, 0x6e, 0x0f, 0x19, 0xba, 0xe1, 0x16, 0xae, 0x81}; + +const uint8_t + kExpectedImplicitRejectionSharedSecret[MLKEM_SHARED_SECRET_BYTES] = { + 0xa3, 0x19, 0x2a, 0x8c, 0x88, 0xfc, 0x99, 0x6d, 0x2d, 0xf9, 0x85, + 0x8d, 0x2c, 0x55, 0x36, 0x39, 0x93, 0xf0, 0x49, 0x4d, 0x7e, 0xc0, + 0xbe, 0x5a, 0x56, 0x7b, 0x8a, 0x42, 0x43, 0xa5, 0x74, 0x5d}; diff --git a/third_party/boringssl/src/crypto/fipsmodule/mlkem/mlkem.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/mlkem/mlkem.cc.inc new file mode 100644 index 00000000..540b5b39 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/mlkem/mlkem.cc.inc @@ -0,0 +1,1701 @@ +// Copyright 2014 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "../../internal.h" +#include "../bcm_interface.h" +#include "../keccak/internal.h" + + +using namespace bssl; + +#if defined(BORINGSSL_FIPS) + +DEFINE_STATIC_ONCE(g_mlkem_keygen_self_test_once) +DEFINE_STATIC_ONCE(g_mlkem_encap_self_test_once) +DEFINE_STATIC_ONCE(g_mlkem_decap_self_test_once) + +#endif + +namespace mlkem { +namespace { + +namespace fips { +void ensure_keygen_self_test(); +void ensure_encap_self_test(); +void ensure_decap_self_test(); +} // namespace fips + +// See +// https://csrc.nist.gov/pubs/fips/203/final + +inline void prf(uint8_t *out, size_t out_len, const uint8_t in[33]) { + BORINGSSL_keccak(out, out_len, in, 33, boringssl_shake256); +} + +#if defined(HAVE_KECCAK_X2) +inline void prf_x2(uint8_t *outs[2], size_t out_len, + const uint8_t *ins[2] /* 33 each */) { + BORINGSSL_keccak_short_x2(outs, out_len, ins, 33, boringssl_shake256); +} +#endif + +// Section 4.1 +void hash_h(uint8_t out[32], const uint8_t *in, size_t len) { + BORINGSSL_keccak(out, 32, in, len, boringssl_sha3_256); +} + +void hash_g(uint8_t out[64], const uint8_t *in, size_t len) { + BORINGSSL_keccak(out, 64, in, len, boringssl_sha3_512); +} + +// This is called `J` in the spec. +void kdf(uint8_t out[MLKEM_SHARED_SECRET_BYTES], + const uint8_t failure_secret[32], const uint8_t *ciphertext, + size_t ciphertext_len) { + BORINGSSL_keccak_st st; + BORINGSSL_keccak_init(&st, boringssl_shake256); + BORINGSSL_keccak_absorb(&st, failure_secret, 32); + BORINGSSL_keccak_absorb(&st, ciphertext, ciphertext_len); + BORINGSSL_keccak_squeeze(&st, out, MLKEM_SHARED_SECRET_BYTES); +} + +// Constants that are common across all sizes. 
+#define DEGREE 256 +const size_t kBarrettMultiplier = 5039; +const unsigned kBarrettShift = 24; +const uint16_t kPrime = 3329; +const int kLog2Prime = 12; +const uint16_t kHalfPrime = (/*kPrime=*/3329 - 1) / 2; +// kInverseDegree is 128^-1 mod 3329; 128 because kPrime does not have a 512th +// root of unity. +const uint16_t kInverseDegree = 3303; + +// Rank-specific constants. +#define RANK768 3 +const int kDU768 = 10; +const int kDV768 = 4; +#define RANK1024 4 +const int kDU1024 = 11; +const int kDV1024 = 5; + +constexpr size_t encoded_vector_size(int rank) { + return (kLog2Prime * DEGREE / 8) * static_cast(rank); +} + +constexpr size_t encoded_public_key_size(int rank) { + return encoded_vector_size(rank) + /*sizeof(rho)=*/32; +} + +static_assert(encoded_public_key_size(RANK768) == MLKEM768_PUBLIC_KEY_BYTES); +static_assert(encoded_public_key_size(RANK1024) == MLKEM1024_PUBLIC_KEY_BYTES); + +constexpr size_t compressed_vector_size(int rank) { + // `if constexpr` isn't available in C++17. + return (rank == RANK768 ? kDU768 : kDU1024) * static_cast(rank) * + DEGREE / 8; +} + +constexpr size_t ciphertext_size(int rank) { + return compressed_vector_size(rank) + + (rank == RANK768 ? kDV768 : kDV1024) * DEGREE / 8; +} + +static_assert(ciphertext_size(RANK768) == MLKEM768_CIPHERTEXT_BYTES); +static_assert(ciphertext_size(RANK1024) == MLKEM1024_CIPHERTEXT_BYTES); + +struct scalar { + // On every function entry and exit, 0 <= c < kPrime. 
+ uint16_t c[DEGREE]; +}; + +template +struct vector { + scalar v[RANK]; +}; + +template +struct matrix { + scalar v[RANK][RANK]; +}; + +// This bit of Python will be referenced in some of the following comments: +// +// p = 3329 +// +// def bitreverse(i): +// ret = 0 +// for n in range(7): +// bit = i & 1 +// ret <<= 1 +// ret |= bit +// i >>= 1 +// return ret + +// kNTTRoots = [pow(17, bitreverse(i), p) for i in range(128)] +const uint16_t kNTTRoots[128] = { + 1, 1729, 2580, 3289, 2642, 630, 1897, 848, 1062, 1919, 193, 797, + 2786, 3260, 569, 1746, 296, 2447, 1339, 1476, 3046, 56, 2240, 1333, + 1426, 2094, 535, 2882, 2393, 2879, 1974, 821, 289, 331, 3253, 1756, + 1197, 2304, 2277, 2055, 650, 1977, 2513, 632, 2865, 33, 1320, 1915, + 2319, 1435, 807, 452, 1438, 2868, 1534, 2402, 2647, 2617, 1481, 648, + 2474, 3110, 1227, 910, 17, 2761, 583, 2649, 1637, 723, 2288, 1100, + 1409, 2662, 3281, 233, 756, 2156, 3015, 3050, 1703, 1651, 2789, 1789, + 1847, 952, 1461, 2687, 939, 2308, 2437, 2388, 733, 2337, 268, 641, + 1584, 2298, 2037, 3220, 375, 2549, 2090, 1645, 1063, 319, 2773, 757, + 2099, 561, 2466, 2594, 2804, 1092, 403, 1026, 1143, 2150, 2775, 886, + 1722, 1212, 1874, 1029, 2110, 2935, 885, 2154, +}; + +// kInverseNTTRoots = [pow(17, -bitreverse(i), p) for i in range(128)] +const uint16_t kInverseNTTRoots[128] = { + 1, 1600, 40, 749, 2481, 1432, 2699, 687, 1583, 2760, 69, 543, + 2532, 3136, 1410, 2267, 2508, 1355, 450, 936, 447, 2794, 1235, 1903, + 1996, 1089, 3273, 283, 1853, 1990, 882, 3033, 2419, 2102, 219, 855, + 2681, 1848, 712, 682, 927, 1795, 461, 1891, 2877, 2522, 1894, 1010, + 1414, 2009, 3296, 464, 2697, 816, 1352, 2679, 1274, 1052, 1025, 2132, + 1573, 76, 2998, 3040, 1175, 2444, 394, 1219, 2300, 1455, 2117, 1607, + 2443, 554, 1179, 2186, 2303, 2926, 2237, 525, 735, 863, 2768, 1230, + 2572, 556, 3010, 2266, 1684, 1239, 780, 2954, 109, 1292, 1031, 1745, + 2688, 3061, 992, 2596, 941, 892, 1021, 2390, 642, 1868, 2377, 1482, + 1540, 540, 1678, 1626, 279, 314, 
1173, 2573, 3096, 48, 667, 1920, + 2229, 1041, 2606, 1692, 680, 2746, 568, 3312, +}; + +// kModRoots = [pow(17, 2*bitreverse(i) + 1, p) for i in range(128)] +const uint16_t kModRoots[128] = { + 17, 3312, 2761, 568, 583, 2746, 2649, 680, 1637, 1692, 723, 2606, + 2288, 1041, 1100, 2229, 1409, 1920, 2662, 667, 3281, 48, 233, 3096, + 756, 2573, 2156, 1173, 3015, 314, 3050, 279, 1703, 1626, 1651, 1678, + 2789, 540, 1789, 1540, 1847, 1482, 952, 2377, 1461, 1868, 2687, 642, + 939, 2390, 2308, 1021, 2437, 892, 2388, 941, 733, 2596, 2337, 992, + 268, 3061, 641, 2688, 1584, 1745, 2298, 1031, 2037, 1292, 3220, 109, + 375, 2954, 2549, 780, 2090, 1239, 1645, 1684, 1063, 2266, 319, 3010, + 2773, 556, 757, 2572, 2099, 1230, 561, 2768, 2466, 863, 2594, 735, + 2804, 525, 1092, 2237, 403, 2926, 1026, 2303, 1143, 2186, 2150, 1179, + 2775, 554, 886, 2443, 1722, 1607, 1212, 2117, 1874, 1455, 1029, 2300, + 2110, 1219, 2935, 394, 885, 2444, 2154, 1175, +}; + +// reduce_once reduces 0 <= x < 2*kPrime, mod kPrime. +inline uint16_t reduce_once(uint16_t x) { + declassify_assert(x < 2 * kPrime); + const uint16_t subtracted = x - kPrime; + uint16_t mask = 0u - (subtracted >> 15); + // Although this is a constant-time select, we omit a value barrier here. + // Value barriers impede auto-vectorization (likely because it forces the + // value to transit through a general-purpose register). On AArch64, this is a + // difference of 2x. + // + // We usually add value barriers to selects because Clang turns consecutive + // selects with the same condition into a branch instead of CMOV/CSEL. This + // condition does not occur in ML-KEM, so omitting it seems to be safe so far, + // but see |scalar_centered_binomial_distribution_eta_2_with_prf|. + return (mask & x) | (~mask & subtracted); +} + +// constant time reduce x mod kPrime using Barrett reduction. x must be less +// than kPrime + 2×kPrime². 
+inline uint16_t reduce(uint32_t x) { + declassify_assert(x < kPrime + 2u * kPrime * kPrime); + uint64_t product = (uint64_t)x * kBarrettMultiplier; + uint32_t quotient = (uint32_t)(product >> kBarrettShift); + uint32_t remainder = x - quotient * kPrime; + return reduce_once(remainder); +} + +inline void scalar_zero(scalar *out) { OPENSSL_memset(out, 0, sizeof(*out)); } + +template +inline void vector_zero(vector *out) { + OPENSSL_memset(out->v, 0, sizeof(scalar) * RANK); +} + +// In place number theoretic transform of a given scalar. +// Note that MLKEM's kPrime 3329 does not have a 512th root of unity, so this +// transform leaves off the last iteration of the usual FFT code, with the 128 +// relevant roots of unity being stored in |kNTTRoots|. This means the output +// should be seen as 128 elements in GF(3329^2), with the coefficients of the +// elements being consecutive entries in |s->c|. +inline void scalar_ntt(scalar *s) { + // Manually unrolled loop to maximize vectorization. +#define ITER(step, offset) \ + { \ + int k = 0; \ + for (int i = 0; i < step; i++) { \ + const uint32_t step_root = kNTTRoots[i + step]; \ + for (int j = k; j < k + offset; j++) { \ + uint16_t odd = reduce(step_root * s->c[j + offset]); \ + uint16_t even = s->c[j]; \ + s->c[j] = reduce_once(odd + even); \ + s->c[j + offset] = reduce_once(even - odd + kPrime); \ + } \ + k += 2 * offset; \ + } \ + } + // for (int step = 1; step < DEGREE / 2; step <<= 1) + ITER(1, 128) + ITER(2, 64) + ITER(4, 32) + ITER(8, 16) + ITER(16, 8) + ITER(32, 4) + ITER(64, 2) + static_assert(DEGREE == 256); +#undef ITER +} + +template +inline void vector_ntt(vector *a) { + for (int i = 0; i < RANK; i++) { + scalar_ntt(&a->v[i]); + } +} + +// In place inverse number theoretic transform of a given scalar, with pairs of +// entries of s->v being interpreted as elements of GF(3329^2). 
Just as with the +// number theoretic transform, this leaves off the first step of the normal iFFT +// to account for the fact that 3329 does not have a 512th root of unity, using +// the precomputed 128 roots of unity stored in |kInverseNTTRoots|. +void scalar_inverse_ntt(scalar *s) { + // Manually unrolled loop to maximize vectorization. +#define ITER(step, offset) \ + { \ + int k = 0; \ + for (int i = 0; i < step; i++) { \ + uint32_t step_root = kInverseNTTRoots[i + step]; \ + for (int j = k; j < k + offset; j++) { \ + uint16_t odd = s->c[j + offset]; \ + uint16_t even = s->c[j]; \ + s->c[j] = reduce_once(odd + even); \ + s->c[j + offset] = reduce(step_root * (even - odd + kPrime)); \ + } \ + k += 2 * offset; \ + } \ + } + // for (int offset = 2; offset < DEGREE; offset <<= 1) + ITER(64, 2) + ITER(32, 4) + ITER(16, 8) + ITER(8, 16) + ITER(4, 32) + ITER(2, 64) + ITER(1, 128) + static_assert(DEGREE == 256); +#undef ITER + + for (int i = 0; i < DEGREE; i++) { + s->c[i] = reduce(s->c[i] * kInverseDegree); + } +} + +template +void vector_inverse_ntt(vector *a) { + for (int i = 0; i < RANK; i++) { + scalar_inverse_ntt(&a->v[i]); + } +} + +inline void scalar_add(scalar *lhs, const scalar *rhs) { + for (int i = 0; i < DEGREE; i++) { + lhs->c[i] = reduce_once(lhs->c[i] + rhs->c[i]); + } +} + +inline void scalar_sub(scalar *lhs, const scalar *rhs) { + for (int i = 0; i < DEGREE; i++) { + lhs->c[i] = reduce_once(lhs->c[i] - rhs->c[i] + kPrime); + } +} + +// Multiplying two scalars in the number theoretically transformed state. Since +// 3329 does not have a 512th root of unity, this means we have to interpret +// the 2*ith and (2*i+1)th entries of the scalar as elements of GF(3329)[X]/(X^2 +// - 17^(2*bitreverse(i)+1)) The value of 17^(2*bitreverse(i)+1) mod 3329 is +// stored in the precomputed |kModRoots| table. 
Note that our Barrett transform +// only allows us to multiply two reduced numbers together, so we need some +// intermediate reduction steps, even if an uint64_t could hold 3 multiplied +// numbers. +inline void scalar_mult(scalar *out, const scalar *lhs, const scalar *rhs) { + for (int i = 0; i < DEGREE / 2; i++) { + uint32_t real_real = (uint32_t)lhs->c[2 * i] * rhs->c[2 * i]; + uint32_t img_img = (uint32_t)lhs->c[2 * i + 1] * rhs->c[2 * i + 1]; + uint32_t real_img = (uint32_t)lhs->c[2 * i] * rhs->c[2 * i + 1]; + uint32_t img_real = (uint32_t)lhs->c[2 * i + 1] * rhs->c[2 * i]; + out->c[2 * i] = + reduce(real_real + (uint32_t)reduce(img_img) * kModRoots[i]); + out->c[2 * i + 1] = reduce(img_real + real_img); + } +} + +template +inline void vector_add(vector *lhs, const vector *rhs) { + for (int i = 0; i < RANK; i++) { + scalar_add(&lhs->v[i], &rhs->v[i]); + } +} + +template +inline void matrix_mult(vector *out, const matrix *m, + const vector *a) { + vector_zero(out); + for (int i = 0; i < RANK; i++) { + for (int j = 0; j < RANK; j++) { + scalar product; + scalar_mult(&product, &m->v[i][j], &a->v[j]); + scalar_add(&out->v[i], &product); + } + } +} + +template +inline void matrix_mult_transpose(vector *out, const matrix *m, + const vector *a) { + vector_zero(out); + for (int i = 0; i < RANK; i++) { + for (int j = 0; j < RANK; j++) { + scalar product; + scalar_mult(&product, &m->v[j][i], &a->v[j]); + scalar_add(&out->v[i], &product); + } + } +} + +template +inline void scalar_inner_product(scalar *out, const vector *lhs, + const vector *rhs) { + scalar_zero(out); + for (int i = 0; i < RANK; i++) { + scalar product; + scalar_mult(&product, &lhs->v[i], &rhs->v[i]); + scalar_add(out, &product); + } +} + +inline void scalar_from_keccak_block_vartime(scalar *out, int *done, + const uint8_t block[168]) { + for (size_t i = 0; i < 168 && *done < DEGREE; i += 3) { + uint16_t d1 = block[i] + 256 * (block[i + 1] % 16); + uint16_t d2 = block[i + 1] / 16 + 16 * block[i + 2]; + 
if (d1 < kPrime) { + out->c[(*done)++] = d1; + } + if (d2 < kPrime && *done < DEGREE) { + out->c[(*done)++] = d2; + } + } +} + +// Algorithm 7 from the spec. Rejection samples a Keccak stream to get +// uniformly distributed elements. This is used for matrix expansion and only +// operates on public inputs. +inline void scalar_from_keccak_vartime(scalar *out, + BORINGSSL_keccak_st *keccak_ctx) { + assert(keccak_ctx->squeeze_offset == 0); + assert(keccak_ctx->rate_bytes == 168); + static_assert(168 % 3 == 0, "block and coefficient boundaries do not align"); + + int done = 0; + while (done < DEGREE) { + uint8_t block[168]; + BORINGSSL_keccak_squeeze(keccak_ctx, block, sizeof(block)); + scalar_from_keccak_block_vartime(out, &done, block); + } +} + +#if defined(HAVE_KECCAK_X2) +inline void scalar_from_keccak_vartime_x2(scalar *out[2], + BORINGSSL_keccak_st keccak_ctx[2]) { + assert(keccak_ctx[0].squeeze_offset == 0); + assert(keccak_ctx[0].rate_bytes == 168); + assert(keccak_ctx[1].squeeze_offset == 0); + assert(keccak_ctx[1].rate_bytes == 168); + static_assert(168 % 3 == 0, "block and coefficient boundaries do not align"); + + int done[2] = {0, 0}; + while (done[0] < DEGREE || done[1] < DEGREE) { + uint8_t block[2][168]; + uint8_t *blocks[] = {block[0], block[1]}; + BORINGSSL_keccak_squeeze_x2(keccak_ctx, blocks, sizeof(block[0])); + scalar_from_keccak_block_vartime(out[0], &done[0], block[0]); + scalar_from_keccak_block_vartime(out[1], &done[1], block[1]); + } +} +#endif + +static void scalar_centered_binomial_distribution_eta_2( + scalar *out, const uint8_t entropy[128]) { + for (int i = 0; i < DEGREE; i += 2) { + uint8_t byte = entropy[i / 2]; + + uint16_t value = (byte & 1) + ((byte >> 1) & 1); + value -= ((byte >> 2) & 1) + ((byte >> 3) & 1); + // Add |kPrime| if |value| underflowed. See |reduce_once| for a discussion + // on why the value barrier is omitted. 
While this could have been written + // reduce_once(value + kPrime), this is one extra addition and small range + // of |value| tempts some versions of Clang to emit a branch. + uint16_t mask = 0u - (value >> 15); + out->c[i] = ((value + kPrime) & mask) | (value & ~mask); + + byte >>= 4; + value = (byte & 1) + ((byte >> 1) & 1); + value -= ((byte >> 2) & 1) + ((byte >> 3) & 1); + // See above. + mask = 0u - (value >> 15); + out->c[i + 1] = ((value + kPrime) & mask) | (value & ~mask); + } +} + +// Algorithm 8 from the spec, with eta fixed to two and the PRF call +// included. Creates binominally distributed elements by sampling 2*|eta| bits, +// and setting the coefficient to the count of the first bits minus the count of +// the second bits, resulting in a centered binomial distribution. Since eta is +// two this gives -2/2 with a probability of 1/16, -1/1 with probability 1/4, +// and 0 with probability 3/8. +void scalar_centered_binomial_distribution_eta_2_with_prf( + scalar *out, const uint8_t input[33]) { + uint8_t entropy[128]; + static_assert(sizeof(entropy) == 2 * /*kEta=*/2 * DEGREE / 8); + prf(entropy, sizeof(entropy), input); + scalar_centered_binomial_distribution_eta_2(out, entropy); +} + +#if defined(HAVE_KECCAK_X2) +// scalar_centered_binomial_distribution_eta_2_with_prf_x2 computes two blocks +// of scalar_centered_binomial_distribution_eta_2_with_prf. +static void scalar_centered_binomial_distribution_eta_2_with_prf_x2( + scalar out[2], uint8_t input[33]) { + uint8_t entropy[2][128]; + static_assert(sizeof(entropy[0]) == 2 * /*kEta=*/2 * DEGREE / 8); + + uint8_t input1[33]; + OPENSSL_memcpy(input1, input, 33); + ++input1[32]; // Counter. 
+ + uint8_t *entropies[] = {entropy[0], entropy[1]}; + const uint8_t *inputs[2] = {input, input1}; + prf_x2(entropies, sizeof(entropy[0]), inputs); + + for (int k = 0; k < 2; k++) { + scalar_centered_binomial_distribution_eta_2(&out[k], entropy[k]); + } +} +#endif + +// Generates a secret vector by using +// |scalar_centered_binomial_distribution_eta_2_with_prf|, using the given seed +// appending and incrementing |counter| for entry of the vector. +template +void vector_generate_secret_eta_2(vector *out, uint8_t *counter, + const uint8_t seed[32]) { + uint8_t input[33]; + OPENSSL_memcpy(input, seed, 32); + + int i = 0; +#if defined(HAVE_KECCAK_X2) + for (; i + 2 <= RANK; i += 2) { + input[32] = *counter; + *counter += 2; + scalar_centered_binomial_distribution_eta_2_with_prf_x2(&out->v[i], input); + } +#endif + + for (; i < RANK; i++) { + input[32] = (*counter)++; + scalar_centered_binomial_distribution_eta_2_with_prf(&out->v[i], input); + } +} + +template +void matrix_expand_step(scalar *out, uint8_t input[34], int i) { + input[32] = i / RANK; + input[33] = i % RANK; + BORINGSSL_keccak_st keccak_ctx; + BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake128); + BORINGSSL_keccak_absorb(&keccak_ctx, input, 34); + scalar_from_keccak_vartime(out, &keccak_ctx); +} + +#if defined(HAVE_KECCAK_X2) +template +void matrix_expand_step_x2(matrix out[2], uint8_t input[34], int i) { + input[32] = i / RANK; + input[33] = i % RANK; + uint8_t input1[34]; + OPENSSL_memcpy(input1, input, 32); + input1[32] = (i + 1) / RANK; + input1[33] = (i + 1) % RANK; + BORINGSSL_keccak_st keccak_ctx[2]; + BORINGSSL_keccak_init(&keccak_ctx[0], boringssl_shake128); + BORINGSSL_keccak_init(&keccak_ctx[1], boringssl_shake128); + BORINGSSL_keccak_absorb(&keccak_ctx[0], input, 34); + BORINGSSL_keccak_absorb(&keccak_ctx[1], input1, 34); + scalar *outs[] = {&out->v[i / RANK][i % RANK], + &out->v[(i + 1) / RANK][(i + 1) % RANK]}; + scalar_from_keccak_vartime_x2(outs, keccak_ctx); +} +#endif + +// Expands 
the matrix of a seed for key generation and for encaps-CPA. +template +void matrix_expand(matrix *out, const uint8_t rho[32]) { + uint8_t input[34]; + OPENSSL_memcpy(input, rho, 32); + + int i = 0; +#if defined(HAVE_KECCAK_X2) + for (; i + 2 <= RANK * RANK; i += 2) { + matrix_expand_step_x2(out, input, i); + } +#endif + + for (; i < RANK * RANK; i++) { + matrix_expand_step(&out->v[i / RANK][i % RANK], input, i); + } +} + +// Encodes a scalar of 256 |BITS|-bit words into 32*|BITS| bytes by splitting +// and joining into bytes using LSB-first bit order (i.e. opposite to standard +// reading order). See below for examples. If an input is >= 1 << |BITS|, the +// result is undefined. +template +void scalar_encode(uint8_t *out, const scalar *s); + +// Encodes a scalar of 256 10-bit words into 320 bytes as follows: +// 000000Aaaaaaaaaa 000000Bbbbbbbbbb 000000Cccccccccc 000000Dddddddddd ... +// -> aaaaaaaa bbbbbbAa ccccBbbb ddCccccc Dddddddd ... +template <> +void scalar_encode<10>(uint8_t out[320], const scalar *s) { + for (int i = 0; i < DEGREE; i += 4) { + uint16_t s0 = s->c[i]; + uint16_t s1 = s->c[i + 1]; + uint16_t s2 = s->c[i + 2]; + uint16_t s3 = s->c[i + 3]; + declassify_assert((s0 | s1 | s2 | s3) < (1 << 10)); + out[0] = (uint8_t)s0; + out[1] = (uint8_t)((s0 >> 8) | (s1 << 2)); + out[2] = (uint8_t)((s1 >> 6) | (s2 << 4)); + out[3] = (uint8_t)((s2 >> 4) | (s3 << 6)); + out[4] = (uint8_t)(s3 >> 2); + out += 5; + } +} + +// Encodes a scalar of 256 12-bit words into 384 bytes as follows: +// 0000Aaaaaaaaaaaa 0000Bbbbbbbbbbbb 0000Cccccccccccc 0000Dddddddddddd ... +// -> aaaaaaaa bbbbAaaa Bbbbbbbb cccccccc ddddCccc Dddddddd .... 
+template <> +void scalar_encode<12>(uint8_t out[384], const scalar *s) { + for (int i = 0; i < DEGREE; i += 2) { + uint16_t s0 = s->c[i]; + uint16_t s1 = s->c[i + 1]; + declassify_assert((s0 | s1) < (1 << 12)); + out[0] = (uint8_t)s0; + out[1] = (uint8_t)((s0 >> 8) | (s1 << 4)); + out[2] = (uint8_t)(s1 >> 4); + out += 3; + } +} + +// Encodes a scalar of 256 4-bit words into 128 bytes as follows: +// 000000000000Aaaa 00000000000Bbbb 000000000000Cccc 000000000000Dddd ... +// -> BbbbAaaa DdddCccc ... +template <> +void scalar_encode<4>(uint8_t out[128], const scalar *s) { + for (int i = 0; i < DEGREE; i += 2) { + uint16_t s0 = s->c[i]; + uint16_t s1 = s->c[i + 1]; + declassify_assert((s0 | s1) < (1 << 4)); + out[0] = (uint8_t)(s0 | (s1 << 4)); + out += 1; + } +} + +// Encodes a scalar of 256 11-bit words into 352 bytes as follows: +// 00000Aaaaaaaaaaa 00000Bbbbbbbbbbb 00000Ccccccccccc 00000Ddddddddddd ... +// -> aaaaaaaa bbbbbAaa ccBbbbbb cccccccc dddddddC eeeeDddd fEeeeeee ... +template <> +void scalar_encode<11>(uint8_t out[352], const scalar *s) { + for (int i = 0; i < DEGREE; i += 8) { + uint16_t s0 = s->c[i]; + uint16_t s1 = s->c[i + 1]; + uint16_t s2 = s->c[i + 2]; + uint16_t s3 = s->c[i + 3]; + uint16_t s4 = s->c[i + 4]; + uint16_t s5 = s->c[i + 5]; + uint16_t s6 = s->c[i + 6]; + uint16_t s7 = s->c[i + 7]; + declassify_assert((s0 | s1 | s2 | s3 | s4 | s5 | s6 | s7) < (1 << 11)); + out[0] = (uint8_t)s0; + out[1] = (uint8_t)((s0 >> 8) | (s1 << 3)); + out[2] = (uint8_t)((s1 >> 5) | (s2 << 6)); + out[3] = (uint8_t)(s2 >> 2); + out[4] = (uint8_t)((s2 >> 10) | (s3 << 1)); + out[5] = (uint8_t)((s3 >> 7) | (s4 << 4)); + out[6] = (uint8_t)((s4 >> 4) | (s5 << 7)); + out[7] = (uint8_t)(s5 >> 1); + out[8] = (uint8_t)((s5 >> 9) | (s6 << 2)); + out[9] = (uint8_t)((s6 >> 6) | (s7 << 5)); + out[10] = (uint8_t)(s7 >> 3); + out += 11; + } +} + +// Encodes a scalar of 256 5-bit words into 160 bytes as follows: +// 00000000000Aaaaa 00000000000Bbbbb 00000000000Ccccc 
00000000000Ddddd ... +// -> bbbAaaaa dCccccBb eeeeDddd ggFffffE HhhhhGgg ... +template <> +void scalar_encode<5>(uint8_t out[160], const scalar *s) { + for (int i = 0; i < DEGREE; i += 8) { + uint16_t s0 = s->c[i]; + uint16_t s1 = s->c[i + 1]; + uint16_t s2 = s->c[i + 2]; + uint16_t s3 = s->c[i + 3]; + uint16_t s4 = s->c[i + 4]; + uint16_t s5 = s->c[i + 5]; + uint16_t s6 = s->c[i + 6]; + uint16_t s7 = s->c[i + 7]; + declassify_assert((s0 | s1 | s2 | s3 | s4 | s5 | s6 | s7) < (1 << 5)); + out[0] = (uint8_t)(s0 | (s1 << 5)); + out[1] = (uint8_t)((s1 >> 3) | (s2 << 2) | (s3 << 7)); + out[2] = (uint8_t)((s3 >> 1) | (s4 << 4)); + out[3] = (uint8_t)((s4 >> 4) | (s5 << 1) | (s6 << 6)); + out[4] = (uint8_t)((s6 >> 2) | (s7 << 3)); + out += 5; + } +} + +// Encodes a scalar of 256 1-bit "words" into 32 bytes as follows: +// 0000000000000A 000000000000000B 000000000000000C 000000000000000D ... +// -> HGFEDCBA PONMLKJI XWVUTSRQ ... +// This order is best understood as the natural way of joining into bytes +// assuming LSB-first bit order. +template <> +void scalar_encode<1>(uint8_t out[32], const scalar *s) { + for (int i = 0; i < DEGREE; i += 8) { + uint8_t out_byte = 0; + for (int j = 0; j < 8; j++) { + declassify_assert(s->c[i + j] <= 1); + out_byte |= s->c[i + j] << j; + } + out[i / 8] = out_byte; + } +} + +// Encodes an entire vector into 32*|RANK|*|bits| bytes. Note that since 256 +// (DEGREE) is divisible by 8, the individual vector entries will always fill a +// whole number of bytes, so we do not need to worry about bit packing here. +template +void vector_encode(uint8_t *out, const vector *a) { + for (int i = 0; i < RANK; i++) { + scalar_encode(out + i * bits * DEGREE / 8, &a->v[i]); + } +} + +// The inverse of |scalar_encode|. Returns 1 iff the encoded scalar is valid, +// i.e. all components are < |kPrime|. Otherwise, returns 0 and the value of +// |out| is undefined. 
+template +int scalar_decode(scalar *out, const uint8_t *in); + +template <> +int scalar_decode<10>(scalar *out, const uint8_t in[320]) { + for (int i = 0; i < DEGREE; i += 4) { + uint16_t s0 = (uint16_t)(in[0] | ((in[1] & 0x03) << 8)); + uint16_t s1 = (uint16_t)((in[1] >> 2) | ((in[2] & 0x0f) << 6)); + uint16_t s2 = (uint16_t)((in[2] >> 4) | ((in[3] & 0x3f) << 4)); + uint16_t s3 = (uint16_t)((in[3] >> 6) | (in[4] << 2)); + out->c[i] = s0; + out->c[i + 1] = s1; + out->c[i + 2] = s2; + out->c[i + 3] = s3; + in += 5; + } + return 1; +} + +template <> +int scalar_decode<12>(scalar *out, const uint8_t in[384]) { + for (int i = 0; i < DEGREE; i += 2) { + uint16_t s0 = (uint16_t)(in[0] | ((in[1] & 0x0f) << 8)); + uint16_t s1 = (uint16_t)((in[1] >> 4) | (in[2] << 4)); + if (constant_time_declassify_int((s0 | s1) >= kPrime)) { + if (s0 >= kPrime || s1 >= kPrime) { + return 0; + } + } + out->c[i] = s0; + out->c[i + 1] = s1; + in += 3; + } + return 1; +} + +template <> +int scalar_decode<4>(scalar *out, const uint8_t in[128]) { + for (int i = 0; i < DEGREE; i += 2) { + uint16_t s0 = (uint16_t)(in[0] & 0x0f); + uint16_t s1 = (uint16_t)(in[0] >> 4); + // kPrime is 3329, so 4-bit values are always < kPrime. + out->c[i] = s0; + out->c[i + 1] = s1; + in += 1; + } + return 1; +} + +// scalar_decode parses |DEGREE * bits| bits from |in| into |DEGREE| values in +// |out|. It returns one on success and zero if any parsed value is >= +// |kPrime|. 
+template <> +int scalar_decode<11>(scalar *out, const uint8_t in[352]) { + for (int i = 0; i < DEGREE; i += 8) { + uint16_t s0 = (uint16_t)(in[0] | ((in[1] & 0x07) << 8)); + uint16_t s1 = (uint16_t)((in[1] >> 3) | ((in[2] & 0x3f) << 5)); + uint16_t s2 = + (uint16_t)((in[2] >> 6) | (in[3] << 2) | ((in[4] & 0x01) << 10)); + uint16_t s3 = (uint16_t)((in[4] >> 1) | ((in[5] & 0x0f) << 7)); + uint16_t s4 = (uint16_t)((in[5] >> 4) | ((in[6] & 0x7f) << 4)); + uint16_t s5 = + (uint16_t)((in[6] >> 7) | (in[7] << 1) | ((in[8] & 0x03) << 9)); + uint16_t s6 = (uint16_t)((in[8] >> 2) | ((in[9] & 0x1f) << 6)); + uint16_t s7 = (uint16_t)((in[9] >> 5) | (in[10] << 3)); + out->c[i] = s0; + out->c[i + 1] = s1; + out->c[i + 2] = s2; + out->c[i + 3] = s3; + out->c[i + 4] = s4; + out->c[i + 5] = s5; + out->c[i + 6] = s6; + out->c[i + 7] = s7; + in += 11; + } + return 1; +} + +template <> +int scalar_decode<5>(scalar *out, const uint8_t in[160]) { + for (int i = 0; i < DEGREE; i += 8) { + uint16_t s0 = (uint16_t)(in[0] & 0x1f); + uint16_t s1 = (uint16_t)((in[0] >> 5) | ((in[1] & 0x03) << 3)); + uint16_t s2 = (uint16_t)((in[1] >> 2) & 0x1f); + uint16_t s3 = (uint16_t)((in[1] >> 7) | ((in[2] & 0x0f) << 1)); + uint16_t s4 = (uint16_t)((in[2] >> 4) | ((in[3] & 0x01) << 4)); + uint16_t s5 = (uint16_t)((in[3] >> 1) & 0x1f); + uint16_t s6 = (uint16_t)((in[3] >> 6) | ((in[4] & 0x07) << 2)); + uint16_t s7 = (uint16_t)(in[4] >> 3); + // kPrime is 3329, so 5-bit values are always < kPrime. + out->c[i] = s0; + out->c[i + 1] = s1; + out->c[i + 2] = s2; + out->c[i + 3] = s3; + out->c[i + 4] = s4; + out->c[i + 5] = s5; + out->c[i + 6] = s6; + out->c[i + 7] = s7; + in += 5; + } + return 1; +} + +template <> +int scalar_decode<1>(scalar *out, const uint8_t in[32]) { + for (int i = 0; i < DEGREE; i += 8) { + uint8_t in_byte = in[i / 8]; + for (int j = 0; j < 8; j++) { + out->c[i + j] = (in_byte >> j) & 1; + } + } + return 1; +} + +// Decodes 32*|RANK|*|bits| bytes from |in| into |out|. 
It returns one on +// success or zero if any parsed value is >= |kPrime|. +template +inline int vector_decode(vector *out, const uint8_t *in) { + for (int i = 0; i < RANK; i++) { + if (!scalar_decode(&out->v[i], in + i * bits * DEGREE / 8)) { + return 0; + } + } + return 1; +} + +// Compresses (lossily) an input |x| mod 3329 into |bits| many bits by grouping +// numbers close to each other together. The formula used is +// round(2^|bits|/kPrime*x) mod 2^|bits|. +// Uses Barrett reduction to achieve constant time. Since we need both the +// remainder (for rounding) and the quotient (as the result), we cannot use +// |reduce| here, but need to do the Barrett reduction directly. +inline uint16_t compress(uint16_t x, int bits) { + uint32_t shifted = (uint32_t)x << bits; + uint64_t product = (uint64_t)shifted * kBarrettMultiplier; + uint32_t quotient = (uint32_t)(product >> kBarrettShift); + uint32_t remainder = shifted - quotient * kPrime; + + // Adjust the quotient to round correctly: + // 0 <= remainder <= kHalfPrime round to 0 + // kHalfPrime < remainder <= kPrime + kHalfPrime round to 1 + // kPrime + kHalfPrime < remainder < 2 * kPrime round to 2 + declassify_assert(remainder < 2u * kPrime); + quotient += 1 & constant_time_lt_w(kHalfPrime, remainder); + quotient += 1 & constant_time_lt_w(kPrime + kHalfPrime, remainder); + return quotient & ((1 << bits) - 1); +} + +// Decompresses |x| by using an equi-distant representative. The formula is +// round(kPrime/2^|bits|*x). Note that 2^|bits| being the divisor allows us to +// implement this logic using only bit operations. +inline uint16_t decompress(uint16_t x, int bits) { + uint32_t product = (uint32_t)x * kPrime; + uint32_t power = 1 << bits; + // This is |product| % power, since |power| is a power of 2. + uint32_t remainder = product & (power - 1); + // This is |product| / power, since |power| is a power of 2. 
+ uint32_t lower = product >> bits; + // The rounding logic works since the first half of numbers mod |power| have a + // 0 as first bit, and the second half has a 1 as first bit, since |power| is + // a power of 2. As a 12 bit number, |remainder| is always positive, so we + // will shift in 0s for a right shift. + return lower + (remainder >> (bits - 1)); +} + +inline void scalar_compress(scalar *s, int bits) { + for (int i = 0; i < DEGREE; i++) { + s->c[i] = compress(s->c[i], bits); + } +} + +inline void scalar_decompress(scalar *s, int bits) { + for (int i = 0; i < DEGREE; i++) { + s->c[i] = decompress(s->c[i], bits); + } +} + +template +void vector_compress(vector *a, int bits) { + for (int i = 0; i < RANK; i++) { + scalar_compress(&a->v[i], bits); + } +} + +template +void vector_decompress(vector *a, int bits) { + for (int i = 0; i < RANK; i++) { + scalar_decompress(&a->v[i], bits); + } +} + +template +struct public_key { + vector t; + uint8_t rho[32]; + uint8_t public_key_hash[32]; + matrix m; +}; + +template +struct private_key { + public_key pub; + vector s; + uint8_t fo_failure_secret[32]; +}; + +template +inline void decrypt_cpa(uint8_t out[32], const private_key *priv, + const uint8_t ciphertext[MLKEM768_CIPHERTEXT_BYTES]) { + constexpr int du = RANK == RANK768 ? kDU768 : kDU1024; + constexpr int dv = RANK == RANK768 ? 
kDV768 : kDV1024; + + vector u; + vector_decode(&u, ciphertext); + vector_decompress(&u, du); + vector_ntt(&u); + scalar v; + scalar_decode(&v, ciphertext + compressed_vector_size(RANK)); + scalar_decompress(&v, dv); + scalar mask; + scalar_inner_product(&mask, &priv->s, &u); + scalar_inverse_ntt(&mask); + scalar_sub(&v, &mask); + scalar_compress(&v, 1); + scalar_encode<1>(out, &v); +} + +template +inline bcm_status mlkem_marshal_public_key(CBB *out, + const public_key *pub) { + uint8_t *vector_output; + if (!CBB_add_space(out, &vector_output, encoded_vector_size(RANK))) { + return bcm_status::failure; + } + vector_encode(vector_output, &pub->t); + if (!CBB_add_bytes(out, pub->rho, sizeof(pub->rho))) { + return bcm_status::failure; + } + return bcm_status::approved; +} + +template +inline bool mlkem_public_keys_equal(const public_key *a, + const public_key *b) { + return OPENSSL_memcmp(a->public_key_hash, b->public_key_hash, 32) == 0; +} + +template +void mlkem_generate_key_external_seed_no_self_test( + uint8_t *out_encoded_public_key, private_key *priv, + const uint8_t seed[MLKEM_SEED_BYTES]) { + uint8_t augmented_seed[33]; + OPENSSL_memcpy(augmented_seed, seed, 32); + augmented_seed[32] = RANK; + + uint8_t hashed[64]; + hash_g(hashed, augmented_seed, sizeof(augmented_seed)); + const uint8_t *const rho = hashed; + const uint8_t *const sigma = hashed + 32; + // rho is public. + CONSTTIME_DECLASSIFY(rho, 32); + OPENSSL_memcpy(priv->pub.rho, hashed, sizeof(priv->pub.rho)); + matrix_expand(&priv->pub.m, rho); + uint8_t counter = 0; + vector_generate_secret_eta_2(&priv->s, &counter, sigma); + vector_ntt(&priv->s); + vector error; + vector_generate_secret_eta_2(&error, &counter, sigma); + vector_ntt(&error); + matrix_mult_transpose(&priv->pub.t, &priv->pub.m, &priv->s); + vector_add(&priv->pub.t, &error); + // t is part of the public key and thus is public. 
+ CONSTTIME_DECLASSIFY(&priv->pub.t, sizeof(priv->pub.t)); + + CBB cbb; + CBB_init_fixed(&cbb, out_encoded_public_key, encoded_public_key_size(RANK)); + if (!bcm_success(mlkem_marshal_public_key(&cbb, &priv->pub))) { + abort(); + } + + hash_h(priv->pub.public_key_hash, out_encoded_public_key, + encoded_public_key_size(RANK)); + OPENSSL_memcpy(priv->fo_failure_secret, seed + 32, 32); +} + +template +void mlkem_generate_key_external_seed(uint8_t *out_encoded_public_key, + private_key *priv, + const uint8_t seed[MLKEM_SEED_BYTES]) { + fips::ensure_keygen_self_test(); + mlkem_generate_key_external_seed_no_self_test(out_encoded_public_key, priv, + seed); +} + +// Encrypts a message with given randomness to +// the ciphertext in |out|. Without applying the Fujisaki-Okamoto transform this +// would not result in a CCA secure scheme, since lattice schemes are vulnerable +// to decryption failure oracles. +template +void encrypt_cpa(uint8_t *out, const mlkem::public_key *pub, + const uint8_t message[32], const uint8_t randomness[32]) { + constexpr int du = RANK == RANK768 ? mlkem::kDU768 : mlkem::kDU1024; + constexpr int dv = RANK == RANK768 ? 
mlkem::kDV768 : mlkem::kDV1024; + + uint8_t counter = 0; + mlkem::vector secret; + vector_generate_secret_eta_2(&secret, &counter, randomness); + vector_ntt(&secret); + mlkem::vector error; + vector_generate_secret_eta_2(&error, &counter, randomness); + uint8_t input[33]; + OPENSSL_memcpy(input, randomness, 32); + input[32] = counter; + mlkem::scalar scalar_error; + scalar_centered_binomial_distribution_eta_2_with_prf(&scalar_error, input); + mlkem::vector u; + matrix_mult(&u, &pub->m, &secret); + vector_inverse_ntt(&u); + vector_add(&u, &error); + mlkem::scalar v; + scalar_inner_product(&v, &pub->t, &secret); + scalar_inverse_ntt(&v); + scalar_add(&v, &scalar_error); + mlkem::scalar expanded_message; + scalar_decode<1>(&expanded_message, message); + scalar_decompress(&expanded_message, 1); + scalar_add(&v, &expanded_message); + vector_compress(&u, du); + vector_encode(out, &u); + scalar_compress(&v, dv); + scalar_encode(out + mlkem::compressed_vector_size(RANK), &v); +} + +// See section 6.3 +template +void mlkem_decap_no_self_test( + uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES], + const uint8_t *ciphertext, const private_key *priv) { + uint8_t decrypted[64]; + decrypt_cpa(decrypted, priv, ciphertext); + OPENSSL_memcpy(decrypted + 32, priv->pub.public_key_hash, + sizeof(decrypted) - 32); + uint8_t key_and_randomness[64]; + hash_g(key_and_randomness, decrypted, sizeof(decrypted)); + constexpr size_t ciphertext_len = ciphertext_size(RANK); + uint8_t expected_ciphertext[MLKEM1024_CIPHERTEXT_BYTES]; + static_assert(ciphertext_len <= sizeof(expected_ciphertext)); + encrypt_cpa(expected_ciphertext, &priv->pub, decrypted, + key_and_randomness + 32); + + uint8_t failure_key[32]; + kdf(failure_key, priv->fo_failure_secret, ciphertext, ciphertext_len); + + uint8_t mask = constant_time_eq_int_8( + CRYPTO_memcmp(ciphertext, expected_ciphertext, ciphertext_len), 0); + for (int i = 0; i < MLKEM_SHARED_SECRET_BYTES; i++) { + out_shared_secret[i] = + 
constant_time_select_8(mask, key_and_randomness[i], failure_key[i]); + } +} + +template +void mlkem_decap(uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES], + const uint8_t *ciphertext, const private_key *priv) { + fips::ensure_decap_self_test(); + mlkem_decap_no_self_test(out_shared_secret, ciphertext, priv); +} + +// mlkem_parse_public_key_with_trailing_data parses |in| into |pub| but leaves +// trailing data in |in| for the caller. +template +int mlkem_parse_public_key_with_trailing_data(public_key *pub, CBS *in) { + CBS orig_in = *in; + CBS t_bytes; + if (!CBS_get_bytes(in, &t_bytes, encoded_vector_size(RANK)) || + !vector_decode(&pub->t, CBS_data(&t_bytes)) || + !CBS_copy_bytes(in, pub->rho, sizeof(pub->rho))) { + return 0; + } + matrix_expand(&pub->m, pub->rho); + size_t pub_key_len = CBS_len(&orig_in) - CBS_len(in); + assert(pub_key_len == encoded_public_key_size(RANK)); + hash_h(pub->public_key_hash, CBS_data(&orig_in), pub_key_len); + return 1; +} + +template +int mlkem_parse_public_key(public_key *pub, CBS *in) { + if (!mlkem_parse_public_key_with_trailing_data(pub, in) || // + CBS_len(in) != 0) { + return 0; + } + return 1; +} + +template +int mlkem_parse_private_key(private_key *priv, CBS *in) { + CBS s_bytes, public_key_hash; + if (!CBS_get_bytes(in, &s_bytes, encoded_vector_size(RANK)) || + !vector_decode(&priv->s, CBS_data(&s_bytes)) || + !mlkem_parse_public_key_with_trailing_data(&priv->pub, in) || + // We compute the public key hash ourselves, but check it matched. 
+ !CBS_get_bytes(in, &public_key_hash, sizeof(priv->pub.public_key_hash)) || + !CBS_mem_equal(&public_key_hash, priv->pub.public_key_hash, + sizeof(priv->pub.public_key_hash)) || + !CBS_copy_bytes(in, priv->fo_failure_secret, + sizeof(priv->fo_failure_secret)) || + CBS_len(in) != 0) { + return 0; + } + return 1; +} + +template +int mlkem_marshal_private_key(CBB *out, const private_key *priv) { + uint8_t *s_output; + if (!CBB_add_space(out, &s_output, encoded_vector_size(RANK))) { + return 0; + } + vector_encode(s_output, &priv->s); + if (!bcm_success(mlkem_marshal_public_key(out, &priv->pub)) || + !CBB_add_bytes(out, priv->pub.public_key_hash, + sizeof(priv->pub.public_key_hash)) || + !CBB_add_bytes(out, priv->fo_failure_secret, + sizeof(priv->fo_failure_secret))) { + return 0; + } + return 1; +} + +static_assert(sizeof(MLKEM768_public_key) == sizeof(public_key)); +static_assert(alignof(MLKEM768_public_key) == alignof(public_key)); + +const public_key *public_key_768_from_external( + const MLKEM768_public_key *external) { + return reinterpret_cast *>(external); +} +public_key *public_key_768_from_external( + MLKEM768_public_key *external) { + return reinterpret_cast *>(external); +} +const MLKEM768_public_key *public_key_768_to_external( + const public_key *public_key) { + return reinterpret_cast(public_key); +} + +static_assert(sizeof(MLKEM1024_public_key) == sizeof(public_key)); +static_assert(alignof(MLKEM1024_public_key) == alignof(public_key)); + +const public_key *public_key_1024_from_external( + const MLKEM1024_public_key *external) { + return reinterpret_cast *>(external); +} +public_key *public_key_1024_from_external( + MLKEM1024_public_key *external) { + return reinterpret_cast *>(external); +} + +const MLKEM1024_public_key *public_key_1024_to_external( + const public_key *public_key) { + return reinterpret_cast(public_key); +} + +static_assert(sizeof(MLKEM768_private_key) >= sizeof(private_key)); +static_assert(alignof(MLKEM768_private_key) >= 
alignof(private_key)); + +const private_key *private_key_768_from_external( + const MLKEM768_private_key *external) { + return reinterpret_cast *>(external); +} +private_key *private_key_768_from_external( + MLKEM768_private_key *external) { + return reinterpret_cast *>(external); +} + +static_assert(sizeof(MLKEM1024_private_key) >= sizeof(private_key)); +static_assert(alignof(MLKEM1024_private_key) >= alignof(private_key)); + +const private_key *private_key_1024_from_external( + const MLKEM1024_private_key *external) { + return reinterpret_cast *>(external); +} +private_key *private_key_1024_from_external( + MLKEM1024_private_key *external) { + return reinterpret_cast *>(external); +} + +// See section 6.2. +template +void mlkem_encap_external_entropy_no_self_test( + uint8_t *out_ciphertext, + uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES], + const mlkem::public_key *pub, + const uint8_t entropy[BCM_MLKEM_ENCAP_ENTROPY]) { + uint8_t input[64]; + OPENSSL_memcpy(input, entropy, BCM_MLKEM_ENCAP_ENTROPY); + OPENSSL_memcpy(input + BCM_MLKEM_ENCAP_ENTROPY, pub->public_key_hash, + sizeof(input) - BCM_MLKEM_ENCAP_ENTROPY); + uint8_t key_and_randomness[64]; + mlkem::hash_g(key_and_randomness, input, sizeof(input)); + encrypt_cpa(out_ciphertext, pub, entropy, key_and_randomness + 32); + // The ciphertext is public. 
+ CONSTTIME_DECLASSIFY(out_ciphertext, mlkem::ciphertext_size(RANK)); + static_assert(MLKEM_SHARED_SECRET_BYTES == 32); + memcpy(out_shared_secret, key_and_randomness, 32); +} + +template +void mlkem_encap_external_entropy( + uint8_t *out_ciphertext, + uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES], + const mlkem::public_key *pub, + const uint8_t entropy[BCM_MLKEM_ENCAP_ENTROPY]) { + fips::ensure_encap_self_test(); + mlkem_encap_external_entropy_no_self_test(out_ciphertext, out_shared_secret, + pub, entropy); +} + +namespace fips { + +#include "fips_known_values.inc" + +inline int keygen_self_test() { + uint8_t pub_key[MLKEM768_PUBLIC_KEY_BYTES]; + private_key priv; + static_assert(sizeof(kTestEntropy) >= MLKEM_SEED_BYTES); + mlkem_generate_key_external_seed_no_self_test(pub_key, &priv, kTestEntropy); + CBB cbb; + constexpr size_t kMarshaledPrivateKeySize = 2400; + uint8_t priv_bytes[kMarshaledPrivateKeySize]; + CBB_init_fixed(&cbb, priv_bytes, sizeof(priv_bytes)); + if (!mlkem_marshal_private_key(&cbb, &priv) || + !BORINGSSL_check_test(kExpectedPrivateKeyBytes, priv_bytes, + "ML-KEM keygen private key") || + !BORINGSSL_check_test(kExpectedPublicKeyBytes, pub_key, + "ML-KEM keygen public key")) { + return 0; + } + return 1; +} + +inline int encap_self_test() { + CBS cbs; + CBS_init(&cbs, kExpectedPublicKeyBytes, sizeof(kExpectedPublicKeyBytes)); + public_key pub; + if (!mlkem_parse_public_key(&pub, &cbs)) { + return 0; + } + uint8_t ciphertext[MLKEM768_CIPHERTEXT_BYTES]; + uint8_t shared_secret[MLKEM_SHARED_SECRET_BYTES]; + static_assert(sizeof(kTestEntropy) >= BCM_MLKEM_ENCAP_ENTROPY); + mlkem_encap_external_entropy_no_self_test(ciphertext, shared_secret, &pub, + kTestEntropy); + if (!BORINGSSL_check_test(ciphertext, kExpectedCiphertext, + "ML-KEM encap ciphertext") || + !BORINGSSL_check_test(kExpectedSharedSecret, shared_secret, + "ML-KEM encap shared secret")) { + return 0; + } + return 1; +} + +inline int decap_self_test() { + CBS cbs; + CBS_init(&cbs, 
kExpectedPrivateKeyBytes, sizeof(kExpectedPrivateKeyBytes)); + private_key priv; + if (!mlkem_parse_private_key(&priv, &cbs)) { + return 0; + } + uint8_t shared_secret[MLKEM_SHARED_SECRET_BYTES]; + mlkem_decap_no_self_test(shared_secret, kExpectedCiphertext, &priv); + if (!BORINGSSL_check_test(kExpectedSharedSecret, shared_secret, + "ML-KEM decap shared secret")) { + return 0; + } + + uint8_t implicit_rejection_shared_secret[MLKEM_SHARED_SECRET_BYTES]; + static_assert(sizeof(kExpectedPrivateKeyBytes) >= + sizeof(kExpectedCiphertext)); + mlkem_decap_no_self_test(implicit_rejection_shared_secret, + kExpectedPrivateKeyBytes, &priv); + if (!BORINGSSL_check_test(kExpectedImplicitRejectionSharedSecret, + implicit_rejection_shared_secret, + "ML-KEM decap implicit rejection shared secret")) { + return 0; + } + return 1; +} + +#if defined(BORINGSSL_FIPS) + +void ensure_keygen_self_test() { + CRYPTO_once(g_mlkem_keygen_self_test_once_bss_get(), []() { + if (!keygen_self_test()) { + BORINGSSL_FIPS_abort(); + } + }); +} + +void ensure_encap_self_test() { + CRYPTO_once(g_mlkem_encap_self_test_once_bss_get(), []() { + if (!encap_self_test()) { + BORINGSSL_FIPS_abort(); + } + }); +} + +void ensure_decap_self_test() { + CRYPTO_once(g_mlkem_decap_self_test_once_bss_get(), []() { + if (!decap_self_test()) { + BORINGSSL_FIPS_abort(); + } + }); +} + +#else + +void ensure_keygen_self_test() {} +void ensure_encap_self_test() {} +void ensure_decap_self_test() {} + +#endif +} // namespace fips + +} // namespace +} // namespace mlkem + +bcm_status bssl::BCM_mlkem768_check_fips( + const MLKEM768_private_key *private_key) { + const mlkem::private_key *priv = + mlkem::private_key_768_from_external(private_key); + + const uint8_t entropy[BCM_MLKEM_ENCAP_ENTROPY] = {1, 2, 3, 4}; + uint8_t ciphertext[MLKEM768_CIPHERTEXT_BYTES]; + uint8_t shared_secret[MLKEM_SHARED_SECRET_BYTES]; + mlkem_encap_external_entropy_no_self_test(ciphertext, shared_secret, + &priv->pub, entropy); + + if 
(boringssl_fips_break_test("MLKEM_PWCT")) { + shared_secret[0] ^= 1; + } + + uint8_t shared_secret2[MLKEM_SHARED_SECRET_BYTES]; + mlkem::mlkem_decap_no_self_test(shared_secret2, ciphertext, priv); + if (CRYPTO_memcmp(shared_secret, shared_secret2, sizeof(shared_secret)) != + 0) { + return bcm_status::failure; + } + return bcm_status::approved; +} + +bcm_status bssl::BCM_mlkem768_generate_key_fips( + uint8_t out_encoded_public_key[MLKEM768_PUBLIC_KEY_BYTES], + uint8_t optional_out_seed[MLKEM_SEED_BYTES], + MLKEM768_private_key *out_private_key) { + if (out_encoded_public_key == nullptr || out_private_key == nullptr) { + return bcm_status::failure; + } + BCM_mlkem768_generate_key(out_encoded_public_key, optional_out_seed, + out_private_key); + return BCM_mlkem768_check_fips(out_private_key); +} + +bcm_infallible bssl::BCM_mlkem768_generate_key( + uint8_t out_encoded_public_key[MLKEM768_PUBLIC_KEY_BYTES], + uint8_t optional_out_seed[MLKEM_SEED_BYTES], + MLKEM768_private_key *out_private_key) { + uint8_t seed[MLKEM_SEED_BYTES]; + BCM_rand_bytes(seed, sizeof(seed)); + CONSTTIME_SECRET(seed, sizeof(seed)); + if (optional_out_seed) { + OPENSSL_memcpy(optional_out_seed, seed, sizeof(seed)); + } + BCM_mlkem768_generate_key_external_seed(out_encoded_public_key, + out_private_key, seed); + return bcm_infallible::not_approved; +} + +bcm_status bssl::BCM_mlkem768_private_key_from_seed( + MLKEM768_private_key *out_private_key, const uint8_t *seed, + size_t seed_len) { + if (seed_len != MLKEM_SEED_BYTES) { + return bcm_status::failure; + } + + uint8_t public_key_bytes[MLKEM768_PUBLIC_KEY_BYTES]; + BCM_mlkem768_generate_key_external_seed(public_key_bytes, out_private_key, + seed); + return bcm_status::not_approved; +} + +bcm_status bssl::BCM_mlkem1024_check_fips( + const MLKEM1024_private_key *private_key) { + const mlkem::private_key *priv = + mlkem::private_key_1024_from_external(private_key); + + const uint8_t entropy[BCM_MLKEM_ENCAP_ENTROPY] = {1, 2, 3, 4}; + uint8_t 
ciphertext[MLKEM1024_CIPHERTEXT_BYTES]; + uint8_t shared_secret[MLKEM_SHARED_SECRET_BYTES]; + mlkem_encap_external_entropy_no_self_test(ciphertext, shared_secret, + &priv->pub, entropy); + + if (boringssl_fips_break_test("MLKEM_PWCT")) { + shared_secret[0] ^= 1; + } + + uint8_t shared_secret2[MLKEM_SHARED_SECRET_BYTES]; + mlkem::mlkem_decap_no_self_test(shared_secret2, ciphertext, priv); + if (CRYPTO_memcmp(shared_secret, shared_secret2, sizeof(shared_secret)) != + 0) { + return bcm_status::failure; + } + return bcm_status::approved; +} + +bcm_status bssl::BCM_mlkem1024_generate_key_fips( + uint8_t out_encoded_public_key[MLKEM1024_PUBLIC_KEY_BYTES], + uint8_t optional_out_seed[MLKEM_SEED_BYTES], + MLKEM1024_private_key *out_private_key) { + if (out_encoded_public_key == nullptr || out_private_key == nullptr) { + return bcm_status::failure; + } + BCM_mlkem1024_generate_key(out_encoded_public_key, optional_out_seed, + out_private_key); + return BCM_mlkem1024_check_fips(out_private_key); +} + +bcm_infallible bssl::BCM_mlkem1024_generate_key( + uint8_t out_encoded_public_key[MLKEM1024_PUBLIC_KEY_BYTES], + uint8_t optional_out_seed[MLKEM_SEED_BYTES], + MLKEM1024_private_key *out_private_key) { + uint8_t seed[MLKEM_SEED_BYTES]; + BCM_rand_bytes(seed, sizeof(seed)); + CONSTTIME_SECRET(seed, sizeof(seed)); + if (optional_out_seed) { + OPENSSL_memcpy(optional_out_seed, seed, sizeof(seed)); + } + BCM_mlkem1024_generate_key_external_seed(out_encoded_public_key, + out_private_key, seed); + return bcm_infallible::not_approved; +} + +bcm_status bssl::BCM_mlkem1024_private_key_from_seed( + MLKEM1024_private_key *out_private_key, const uint8_t *seed, + size_t seed_len) { + if (seed_len != MLKEM_SEED_BYTES) { + return bcm_status::failure; + } + uint8_t public_key_bytes[MLKEM1024_PUBLIC_KEY_BYTES]; + BCM_mlkem1024_generate_key_external_seed(public_key_bytes, out_private_key, + seed); + return bcm_status::not_approved; +} + +bcm_infallible 
bssl::BCM_mlkem768_generate_key_external_seed( + uint8_t out_encoded_public_key[MLKEM768_PUBLIC_KEY_BYTES], + MLKEM768_private_key *out_private_key, + const uint8_t seed[MLKEM_SEED_BYTES]) { + mlkem::private_key *priv = + mlkem::private_key_768_from_external(out_private_key); + mlkem_generate_key_external_seed(out_encoded_public_key, priv, seed); + return bcm_infallible::approved; +} + +bcm_infallible bssl::BCM_mlkem1024_generate_key_external_seed( + uint8_t out_encoded_public_key[MLKEM1024_PUBLIC_KEY_BYTES], + MLKEM1024_private_key *out_private_key, + const uint8_t seed[MLKEM_SEED_BYTES]) { + mlkem::private_key *priv = + mlkem::private_key_1024_from_external(out_private_key); + mlkem_generate_key_external_seed(out_encoded_public_key, priv, seed); + return bcm_infallible::approved; +} + +bcm_infallible bssl::BCM_mlkem768_public_from_private( + MLKEM768_public_key *out_public_key, + const MLKEM768_private_key *private_key) { + mlkem::public_key *const pub = + mlkem::public_key_768_from_external(out_public_key); + const mlkem::private_key *const priv = + mlkem::private_key_768_from_external(private_key); + *pub = priv->pub; + return bcm_infallible::approved; +} + +bcm_infallible bssl::BCM_mlkem1024_public_from_private( + MLKEM1024_public_key *out_public_key, + const MLKEM1024_private_key *private_key) { + mlkem::public_key *const pub = + mlkem::public_key_1024_from_external(out_public_key); + const mlkem::private_key *const priv = + mlkem::private_key_1024_from_external(private_key); + *pub = priv->pub; + return bcm_infallible::approved; +} + +const MLKEM768_public_key *bssl::BCM_mlkem768_public_of_private( + const MLKEM768_private_key *private_key) { + const mlkem::private_key *const priv = + mlkem::private_key_768_from_external(private_key); + return mlkem::public_key_768_to_external(&priv->pub); +} + +const MLKEM1024_public_key *bssl::BCM_mlkem1024_public_of_private( + const MLKEM1024_private_key *private_key) { + const mlkem::private_key *const priv = + 
mlkem::private_key_1024_from_external(private_key); + return mlkem::public_key_1024_to_external(&priv->pub); +} + +// Calls |MLKEM768_encap_external_entropy| with random bytes from +// |BCM_rand_bytes| +bcm_infallible bssl::BCM_mlkem768_encap( + uint8_t out_ciphertext[MLKEM768_CIPHERTEXT_BYTES], + uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES], + const MLKEM768_public_key *public_key) { + uint8_t entropy[BCM_MLKEM_ENCAP_ENTROPY]; + BCM_rand_bytes(entropy, BCM_MLKEM_ENCAP_ENTROPY); + CONSTTIME_SECRET(entropy, BCM_MLKEM_ENCAP_ENTROPY); + BCM_mlkem768_encap_external_entropy(out_ciphertext, out_shared_secret, + public_key, entropy); + return bcm_infallible::approved; +} + +bcm_infallible bssl::BCM_mlkem1024_encap( + uint8_t out_ciphertext[MLKEM1024_CIPHERTEXT_BYTES], + uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES], + const MLKEM1024_public_key *public_key) { + uint8_t entropy[BCM_MLKEM_ENCAP_ENTROPY]; + BCM_rand_bytes(entropy, BCM_MLKEM_ENCAP_ENTROPY); + CONSTTIME_SECRET(entropy, BCM_MLKEM_ENCAP_ENTROPY); + BCM_mlkem1024_encap_external_entropy(out_ciphertext, out_shared_secret, + public_key, entropy); + return bcm_infallible::approved; +} + +bcm_infallible bssl::BCM_mlkem768_encap_external_entropy( + uint8_t out_ciphertext[MLKEM768_CIPHERTEXT_BYTES], + uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES], + const MLKEM768_public_key *public_key, + const uint8_t entropy[BCM_MLKEM_ENCAP_ENTROPY]) { + const mlkem::public_key *pub = + mlkem::public_key_768_from_external(public_key); + mlkem_encap_external_entropy(out_ciphertext, out_shared_secret, pub, entropy); + return bcm_infallible::approved; +} + +bcm_infallible bssl::BCM_mlkem1024_encap_external_entropy( + uint8_t out_ciphertext[MLKEM1024_CIPHERTEXT_BYTES], + uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES], + const MLKEM1024_public_key *public_key, + const uint8_t entropy[BCM_MLKEM_ENCAP_ENTROPY]) { + const mlkem::public_key *pub = + mlkem::public_key_1024_from_external(public_key); + 
mlkem_encap_external_entropy(out_ciphertext, out_shared_secret, pub, entropy); + return bcm_infallible::approved; +} + +bcm_status bssl::BCM_mlkem768_decap( + uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES], + const uint8_t *ciphertext, size_t ciphertext_len, + const MLKEM768_private_key *private_key) { + if (ciphertext_len != MLKEM768_CIPHERTEXT_BYTES) { + BCM_rand_bytes(out_shared_secret, MLKEM_SHARED_SECRET_BYTES); + return bcm_status::failure; + } + const mlkem::private_key *priv = + mlkem::private_key_768_from_external(private_key); + mlkem_decap(out_shared_secret, ciphertext, priv); + return bcm_status::approved; +} + +bcm_status bssl::BCM_mlkem1024_decap( + uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES], + const uint8_t *ciphertext, size_t ciphertext_len, + const MLKEM1024_private_key *private_key) { + if (ciphertext_len != MLKEM1024_CIPHERTEXT_BYTES) { + BCM_rand_bytes(out_shared_secret, MLKEM_SHARED_SECRET_BYTES); + return bcm_status::failure; + } + const mlkem::private_key *priv = + mlkem::private_key_1024_from_external(private_key); + mlkem_decap(out_shared_secret, ciphertext, priv); + return bcm_status::approved; +} + +bcm_status bssl::BCM_mlkem768_marshal_public_key( + CBB *out, const MLKEM768_public_key *public_key) { + return mlkem_marshal_public_key( + out, mlkem::public_key_768_from_external(public_key)); +} + +bcm_status bssl::BCM_mlkem1024_marshal_public_key( + CBB *out, const MLKEM1024_public_key *public_key) { + return mlkem_marshal_public_key( + out, mlkem::public_key_1024_from_external(public_key)); +} + +int bssl::BCM_mlkem768_public_keys_equal(const MLKEM768_public_key *a, + const MLKEM768_public_key *b) { + const auto *pub_a = mlkem::public_key_768_from_external(a); + const auto *pub_b = mlkem::public_key_768_from_external(b); + if (mlkem_public_keys_equal(pub_a, pub_b)) { + return 1; + } + return 0; +} + +int bssl::BCM_mlkem1024_public_keys_equal(const MLKEM1024_public_key *a, + const MLKEM1024_public_key *b) { + const auto 
*pub_a = mlkem::public_key_1024_from_external(a); + const auto *pub_b = mlkem::public_key_1024_from_external(b); + if (mlkem_public_keys_equal(pub_a, pub_b)) { + return 1; + } + return 0; +} + +bcm_status bssl::BCM_mlkem768_parse_public_key(MLKEM768_public_key *public_key, + CBS *in) { + mlkem::public_key *pub = + mlkem::public_key_768_from_external(public_key); + if (!mlkem_parse_public_key(pub, in)) { + return bcm_status::failure; + } + return bcm_status::approved; +} + +bcm_status bssl::BCM_mlkem1024_parse_public_key( + MLKEM1024_public_key *public_key, CBS *in) { + mlkem::public_key *pub = + mlkem::public_key_1024_from_external(public_key); + if (!mlkem_parse_public_key(pub, in)) { + return bcm_status::failure; + } + return bcm_status::approved; +} + +bcm_status bssl::BCM_mlkem768_marshal_private_key( + CBB *out, const MLKEM768_private_key *private_key) { + const mlkem::private_key *const priv = + mlkem::private_key_768_from_external(private_key); + if (!mlkem_marshal_private_key(out, priv)) { + return bcm_status::failure; + } + return bcm_status::approved; +} + +bcm_status bssl::BCM_mlkem1024_marshal_private_key( + CBB *out, const MLKEM1024_private_key *private_key) { + const mlkem::private_key *const priv = + mlkem::private_key_1024_from_external(private_key); + if (!mlkem_marshal_private_key(out, priv)) { + return bcm_status::failure; + } + return bcm_status::approved; +} + +bcm_status bssl::BCM_mlkem768_parse_private_key( + MLKEM768_private_key *out_private_key, CBS *in) { + mlkem::private_key *const priv = + mlkem::private_key_768_from_external(out_private_key); + if (!mlkem_parse_private_key(priv, in)) { + return bcm_status::failure; + } + return bcm_status::approved; +} + +bcm_status bssl::BCM_mlkem1024_parse_private_key( + MLKEM1024_private_key *out_private_key, CBS *in) { + mlkem::private_key *const priv = + mlkem::private_key_1024_from_external(out_private_key); + if (!mlkem_parse_private_key(priv, in)) { + return bcm_status::failure; + } + return 
bcm_status::approved; +} + +int bssl::boringssl_self_test_mlkem() { + return mlkem::fips::keygen_self_test() && mlkem::fips::encap_self_test() && + mlkem::fips::decap_self_test(); +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/modes/cbc.c b/third_party/boringssl/src/crypto/fipsmodule/modes/cbc.c deleted file mode 100644 index df8f9ce8..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/modes/cbc.c +++ /dev/null @@ -1,172 +0,0 @@ -/* ==================================================================== - * Copyright (c) 2008 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. 
Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== */ - -#include -#include - -#include "internal.h" -#include "../../internal.h" - - -void CRYPTO_cbc128_encrypt(const uint8_t *in, uint8_t *out, size_t len, - const AES_KEY *key, uint8_t ivec[16], - block128_f block) { - assert(key != NULL && ivec != NULL); - if (len == 0) { - // Avoid |ivec| == |iv| in the |memcpy| below, which is not legal in C. 
- return; - } - - assert(in != NULL && out != NULL); - size_t n; - const uint8_t *iv = ivec; - while (len >= 16) { - for (n = 0; n < 16; n += sizeof(crypto_word_t)) { - CRYPTO_store_word_le( - out + n, CRYPTO_load_word_le(in + n) ^ CRYPTO_load_word_le(iv + n)); - } - (*block)(out, out, key); - iv = out; - len -= 16; - in += 16; - out += 16; - } - - while (len) { - for (n = 0; n < 16 && n < len; ++n) { - out[n] = in[n] ^ iv[n]; - } - for (; n < 16; ++n) { - out[n] = iv[n]; - } - (*block)(out, out, key); - iv = out; - if (len <= 16) { - break; - } - len -= 16; - in += 16; - out += 16; - } - - OPENSSL_memcpy(ivec, iv, 16); -} - -void CRYPTO_cbc128_decrypt(const uint8_t *in, uint8_t *out, size_t len, - const AES_KEY *key, uint8_t ivec[16], - block128_f block) { - assert(key != NULL && ivec != NULL); - if (len == 0) { - // Avoid |ivec| == |iv| in the |memcpy| below, which is not legal in C. - return; - } - - assert(in != NULL && out != NULL); - - const uintptr_t inptr = (uintptr_t) in; - const uintptr_t outptr = (uintptr_t) out; - // If |in| and |out| alias, |in| must be ahead. - assert(inptr >= outptr || inptr + len <= outptr); - - size_t n; - alignas(16) uint8_t tmp[16]; - if ((inptr >= 32 && outptr <= inptr - 32) || inptr < outptr) { - // If |out| is at least two blocks behind |in| or completely disjoint, there - // is no need to decrypt to a temporary block. 
- static_assert(16 % sizeof(crypto_word_t) == 0, - "block cannot be evenly divided into words"); - const uint8_t *iv = ivec; - while (len >= 16) { - (*block)(in, out, key); - for (n = 0; n < 16; n += sizeof(crypto_word_t)) { - CRYPTO_store_word_le(out + n, CRYPTO_load_word_le(out + n) ^ - CRYPTO_load_word_le(iv + n)); - } - iv = in; - len -= 16; - in += 16; - out += 16; - } - OPENSSL_memcpy(ivec, iv, 16); - } else { - static_assert(16 % sizeof(crypto_word_t) == 0, - "block cannot be evenly divided into words"); - - while (len >= 16) { - (*block)(in, tmp, key); - for (n = 0; n < 16; n += sizeof(crypto_word_t)) { - crypto_word_t c = CRYPTO_load_word_le(in + n); - CRYPTO_store_word_le(out + n, CRYPTO_load_word_le(tmp + n) ^ - CRYPTO_load_word_le(ivec + n)); - CRYPTO_store_word_le(ivec + n, c); - } - len -= 16; - in += 16; - out += 16; - } - } - - while (len) { - uint8_t c; - (*block)(in, tmp, key); - for (n = 0; n < 16 && n < len; ++n) { - c = in[n]; - out[n] = tmp[n] ^ ivec[n]; - ivec[n] = c; - } - if (len <= 16) { - for (; n < 16; ++n) { - ivec[n] = in[n]; - } - break; - } - len -= 16; - in += 16; - out += 16; - } -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/modes/cfb.c b/third_party/boringssl/src/crypto/fipsmodule/modes/cfb.c deleted file mode 100644 index 37a81843..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/modes/cfb.c +++ /dev/null @@ -1,200 +0,0 @@ -/* ==================================================================== - * Copyright (c) 2008 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. 
- * ==================================================================== */ - -#include -#include - -#include "internal.h" - - -static_assert(16 % sizeof(size_t) == 0, "block cannot be divided into size_t"); - -void CRYPTO_cfb128_encrypt(const uint8_t *in, uint8_t *out, size_t len, - const AES_KEY *key, uint8_t ivec[16], unsigned *num, - int enc, block128_f block) { - assert(in && out && key && ivec && num); - - unsigned n = *num; - - if (enc) { - while (n && len) { - *(out++) = ivec[n] ^= *(in++); - --len; - n = (n + 1) % 16; - } - while (len >= 16) { - (*block)(ivec, ivec, key); - for (; n < 16; n += sizeof(crypto_word_t)) { - crypto_word_t tmp = - CRYPTO_load_word_le(ivec + n) ^ CRYPTO_load_word_le(in + n); - CRYPTO_store_word_le(ivec + n, tmp); - CRYPTO_store_word_le(out + n, tmp); - } - len -= 16; - out += 16; - in += 16; - n = 0; - } - if (len) { - (*block)(ivec, ivec, key); - while (len--) { - out[n] = ivec[n] ^= in[n]; - ++n; - } - } - *num = n; - return; - } else { - while (n && len) { - uint8_t c; - *(out++) = ivec[n] ^ (c = *(in++)); - ivec[n] = c; - --len; - n = (n + 1) % 16; - } - while (len >= 16) { - (*block)(ivec, ivec, key); - for (; n < 16; n += sizeof(crypto_word_t)) { - crypto_word_t t = CRYPTO_load_word_le(in + n); - CRYPTO_store_word_le(out + n, CRYPTO_load_word_le(ivec + n) ^ t); - CRYPTO_store_word_le(ivec + n, t); - } - len -= 16; - out += 16; - in += 16; - n = 0; - } - if (len) { - (*block)(ivec, ivec, key); - while (len--) { - uint8_t c; - out[n] = ivec[n] ^ (c = in[n]); - ivec[n] = c; - ++n; - } - } - *num = n; - return; - } -} - - -/* This expects a single block of size nbits for both in and out. 
Note that - it corrupts any extra bits in the last byte of out */ -static void cfbr_encrypt_block(const uint8_t *in, uint8_t *out, unsigned nbits, - const AES_KEY *key, uint8_t ivec[16], int enc, - block128_f block) { - int n, rem, num; - uint8_t ovec[16 * 2 + 1]; /* +1 because we dererefence (but don't use) one - byte off the end */ - - if (nbits <= 0 || nbits > 128) { - return; - } - - // fill in the first half of the new IV with the current IV - OPENSSL_memcpy(ovec, ivec, 16); - // construct the new IV - (*block)(ivec, ivec, key); - num = (nbits + 7) / 8; - if (enc) { - // encrypt the input - for (n = 0; n < num; ++n) { - out[n] = (ovec[16 + n] = in[n] ^ ivec[n]); - } - } else { - // decrypt the input - for (n = 0; n < num; ++n) { - out[n] = (ovec[16 + n] = in[n]) ^ ivec[n]; - } - } - // shift ovec left... - rem = nbits % 8; - num = nbits / 8; - if (rem == 0) { - OPENSSL_memcpy(ivec, ovec + num, 16); - } else { - for (n = 0; n < 16; ++n) { - ivec[n] = ovec[n + num] << rem | ovec[n + num + 1] >> (8 - rem); - } - } - - // it is not necessary to cleanse ovec, since the IV is not secret -} - -// N.B. This expects the input to be packed, MS bit first -void CRYPTO_cfb128_1_encrypt(const uint8_t *in, uint8_t *out, size_t bits, - const AES_KEY *key, uint8_t ivec[16], - unsigned *num, int enc, block128_f block) { - size_t n; - uint8_t c[1], d[1]; - - assert(in && out && key && ivec && num); - assert(*num == 0); - - for (n = 0; n < bits; ++n) { - c[0] = (in[n / 8] & (1 << (7 - n % 8))) ? 
0x80 : 0; - cfbr_encrypt_block(c, d, 1, key, ivec, enc, block); - out[n / 8] = (out[n / 8] & ~(1 << (unsigned int)(7 - n % 8))) | - ((d[0] & 0x80) >> (unsigned int)(n % 8)); - } -} - -void CRYPTO_cfb128_8_encrypt(const unsigned char *in, unsigned char *out, - size_t length, const AES_KEY *key, - unsigned char ivec[16], unsigned *num, int enc, - block128_f block) { - size_t n; - - assert(in && out && key && ivec && num); - assert(*num == 0); - - for (n = 0; n < length; ++n) { - cfbr_encrypt_block(&in[n], &out[n], 8, key, ivec, enc, block); - } -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/modes/ctr.c b/third_party/boringssl/src/crypto/fipsmodule/modes/ctr.c deleted file mode 100644 index 1688f823..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/modes/ctr.c +++ /dev/null @@ -1,199 +0,0 @@ -/* ==================================================================== - * Copyright (c) 2008 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. 
- * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== */ - -#include -#include - -#include "internal.h" -#include "../../internal.h" - - -// NOTE: the IV/counter CTR mode is big-endian. The code itself -// is endian-neutral. - -// increment counter (128-bit int) by 1 -static void ctr128_inc(uint8_t *counter) { - uint32_t n = 16, c = 1; - - do { - --n; - c += counter[n]; - counter[n] = (uint8_t) c; - c >>= 8; - } while (n); -} - -static_assert(16 % sizeof(crypto_word_t) == 0, - "block cannot be divided into crypto_word_t"); - -// The input encrypted as though 128bit counter mode is being used. The extra -// state information to record how much of the 128bit block we have used is -// contained in *num, and the encrypted counter is kept in ecount_buf. 
Both -// *num and ecount_buf must be initialised with zeros before the first call to -// CRYPTO_ctr128_encrypt(). -// -// This algorithm assumes that the counter is in the x lower bits of the IV -// (ivec), and that the application has full control over overflow and the rest -// of the IV. This implementation takes NO responsibility for checking that -// the counter doesn't overflow into the rest of the IV when incremented. -void CRYPTO_ctr128_encrypt(const uint8_t *in, uint8_t *out, size_t len, - const AES_KEY *key, uint8_t ivec[16], - uint8_t ecount_buf[16], unsigned int *num, - block128_f block) { - unsigned int n; - - assert(key && ecount_buf && num); - assert(len == 0 || (in && out)); - assert(*num < 16); - - n = *num; - - while (n && len) { - *(out++) = *(in++) ^ ecount_buf[n]; - --len; - n = (n + 1) % 16; - } - while (len >= 16) { - (*block)(ivec, ecount_buf, key); - ctr128_inc(ivec); - for (n = 0; n < 16; n += sizeof(crypto_word_t)) { - CRYPTO_store_word_le(out + n, CRYPTO_load_word_le(in + n) ^ - CRYPTO_load_word_le(ecount_buf + n)); - } - len -= 16; - out += 16; - in += 16; - n = 0; - } - if (len) { - (*block)(ivec, ecount_buf, key); - ctr128_inc(ivec); - while (len--) { - out[n] = in[n] ^ ecount_buf[n]; - ++n; - } - } - *num = n; -} - -// increment upper 96 bits of 128-bit counter by 1 -static void ctr96_inc(uint8_t *counter) { - uint32_t n = 12, c = 1; - - do { - --n; - c += counter[n]; - counter[n] = (uint8_t) c; - c >>= 8; - } while (n); -} - -void CRYPTO_ctr128_encrypt_ctr32(const uint8_t *in, uint8_t *out, size_t len, - const AES_KEY *key, uint8_t ivec[16], - uint8_t ecount_buf[16], unsigned int *num, - ctr128_f func) { - unsigned int n, ctr32; - - assert(key && ecount_buf && num); - assert(len == 0 || (in && out)); - assert(*num < 16); - - n = *num; - - while (n && len) { - *(out++) = *(in++) ^ ecount_buf[n]; - --len; - n = (n + 1) % 16; - } - - ctr32 = CRYPTO_load_u32_be(ivec + 12); - while (len >= 16) { - size_t blocks = len / 16; - // 1<<28 is 
just a not-so-small yet not-so-large number... - // Below condition is practically never met, but it has to - // be checked for code correctness. - if (sizeof(size_t) > sizeof(unsigned int) && blocks > (1U << 28)) { - blocks = (1U << 28); - } - // As (*func) operates on 32-bit counter, caller - // has to handle overflow. 'if' below detects the - // overflow, which is then handled by limiting the - // amount of blocks to the exact overflow point... - ctr32 += (uint32_t)blocks; - if (ctr32 < blocks) { - blocks -= ctr32; - ctr32 = 0; - } - (*func)(in, out, blocks, key, ivec); - // (*func) does not update ivec, caller does: - CRYPTO_store_u32_be(ivec + 12, ctr32); - // ... overflow was detected, propogate carry. - if (ctr32 == 0) { - ctr96_inc(ivec); - } - blocks *= 16; - len -= blocks; - out += blocks; - in += blocks; - } - if (len) { - OPENSSL_memset(ecount_buf, 0, 16); - (*func)(ecount_buf, ecount_buf, 1, key, ivec); - ++ctr32; - CRYPTO_store_u32_be(ivec + 12, ctr32); - if (ctr32 == 0) { - ctr96_inc(ivec); - } - while (len--) { - out[n] = in[n] ^ ecount_buf[n]; - ++n; - } - } - - *num = n; -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/modes/gcm.c b/third_party/boringssl/src/crypto/fipsmodule/modes/gcm.c deleted file mode 100644 index 1a77ec0e..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/modes/gcm.c +++ /dev/null @@ -1,723 +0,0 @@ -/* ==================================================================== - * Copyright (c) 2008 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. 
- * ==================================================================== */ - -#include - -#include -#include - -#include - -#include "internal.h" -#include "../../internal.h" - - -// kSizeTWithoutLower4Bits is a mask that can be used to zero the lower four -// bits of a |size_t|. -static const size_t kSizeTWithoutLower4Bits = (size_t) -16; - - -#define GCM_MUL(ctx, Xi) gcm_gmult_nohw((ctx)->Xi.u, (ctx)->gcm_key.Htable) -#define GHASH(ctx, in, len) \ - gcm_ghash_nohw((ctx)->Xi.u, (ctx)->gcm_key.Htable, in, len) -// GHASH_CHUNK is "stride parameter" missioned to mitigate cache -// trashing effect. In other words idea is to hash data while it's -// still in L1 cache after encryption pass... -#define GHASH_CHUNK (3 * 1024) - -#if defined(GHASH_ASM_X86_64) || defined(GHASH_ASM_X86) -static inline void gcm_reduce_1bit(u128 *V) { - if (sizeof(crypto_word_t) == 8) { - uint64_t T = UINT64_C(0xe100000000000000) & (0 - (V->hi & 1)); - V->hi = (V->lo << 63) | (V->hi >> 1); - V->lo = (V->lo >> 1) ^ T; - } else { - uint32_t T = 0xe1000000U & (0 - (uint32_t)(V->hi & 1)); - V->hi = (V->lo << 63) | (V->hi >> 1); - V->lo = (V->lo >> 1) ^ ((uint64_t)T << 32); - } -} - -void gcm_init_ssse3(u128 Htable[16], const uint64_t H[2]) { - Htable[0].hi = 0; - Htable[0].lo = 0; - u128 V; - V.hi = H[1]; - V.lo = H[0]; - - Htable[8] = V; - gcm_reduce_1bit(&V); - Htable[4] = V; - gcm_reduce_1bit(&V); - Htable[2] = V; - gcm_reduce_1bit(&V); - Htable[1] = V; - Htable[3].hi = V.hi ^ Htable[2].hi, Htable[3].lo = V.lo ^ Htable[2].lo; - V = Htable[4]; - Htable[5].hi = V.hi ^ Htable[1].hi, Htable[5].lo = V.lo ^ Htable[1].lo; - Htable[6].hi = V.hi ^ Htable[2].hi, Htable[6].lo = V.lo ^ Htable[2].lo; - Htable[7].hi = V.hi ^ Htable[3].hi, Htable[7].lo = V.lo ^ Htable[3].lo; - V = Htable[8]; - Htable[9].hi = V.hi ^ Htable[1].hi, Htable[9].lo = V.lo ^ Htable[1].lo; - Htable[10].hi = V.hi ^ Htable[2].hi, Htable[10].lo = V.lo ^ Htable[2].lo; - Htable[11].hi = V.hi ^ Htable[3].hi, Htable[11].lo = V.lo ^ 
Htable[3].lo; - Htable[12].hi = V.hi ^ Htable[4].hi, Htable[12].lo = V.lo ^ Htable[4].lo; - Htable[13].hi = V.hi ^ Htable[5].hi, Htable[13].lo = V.lo ^ Htable[5].lo; - Htable[14].hi = V.hi ^ Htable[6].hi, Htable[14].lo = V.lo ^ Htable[6].lo; - Htable[15].hi = V.hi ^ Htable[7].hi, Htable[15].lo = V.lo ^ Htable[7].lo; - - // Treat |Htable| as a 16x16 byte table and transpose it. Thus, Htable[i] - // contains the i'th byte of j*H for all j. - uint8_t *Hbytes = (uint8_t *)Htable; - for (int i = 0; i < 16; i++) { - for (int j = 0; j < i; j++) { - uint8_t tmp = Hbytes[16*i + j]; - Hbytes[16*i + j] = Hbytes[16*j + i]; - Hbytes[16*j + i] = tmp; - } - } -} -#endif // GHASH_ASM_X86_64 || GHASH_ASM_X86 - -#ifdef GCM_FUNCREF -#undef GCM_MUL -#define GCM_MUL(ctx, Xi) (*gcm_gmult_p)((ctx)->Xi.u, (ctx)->gcm_key.Htable) -#undef GHASH -#define GHASH(ctx, in, len) \ - (*gcm_ghash_p)((ctx)->Xi.u, (ctx)->gcm_key.Htable, in, len) -#endif // GCM_FUNCREF - -void CRYPTO_ghash_init(gmult_func *out_mult, ghash_func *out_hash, - u128 *out_key, u128 out_table[16], int *out_is_avx, - const uint8_t gcm_key[16]) { - *out_is_avx = 0; - - // H is stored in host byte order. 
- uint64_t H[2] = {CRYPTO_load_u64_be(gcm_key), - CRYPTO_load_u64_be(gcm_key + 8)}; - out_key->hi = H[0]; - out_key->lo = H[1]; - -#if defined(GHASH_ASM_X86_64) - if (crypto_gcm_clmul_enabled()) { - if (CRYPTO_is_AVX_capable() && CRYPTO_is_MOVBE_capable()) { - gcm_init_avx(out_table, H); - *out_mult = gcm_gmult_avx; - *out_hash = gcm_ghash_avx; - *out_is_avx = 1; - return; - } - gcm_init_clmul(out_table, H); - *out_mult = gcm_gmult_clmul; - *out_hash = gcm_ghash_clmul; - return; - } - if (CRYPTO_is_SSSE3_capable()) { - gcm_init_ssse3(out_table, H); - *out_mult = gcm_gmult_ssse3; - *out_hash = gcm_ghash_ssse3; - return; - } -#elif defined(GHASH_ASM_X86) - if (crypto_gcm_clmul_enabled()) { - gcm_init_clmul(out_table, H); - *out_mult = gcm_gmult_clmul; - *out_hash = gcm_ghash_clmul; - return; - } - if (CRYPTO_is_SSSE3_capable()) { - gcm_init_ssse3(out_table, H); - *out_mult = gcm_gmult_ssse3; - *out_hash = gcm_ghash_ssse3; - return; - } -#elif defined(GHASH_ASM_ARM) - if (gcm_pmull_capable()) { - gcm_init_v8(out_table, H); - *out_mult = gcm_gmult_v8; - *out_hash = gcm_ghash_v8; - return; - } - - if (gcm_neon_capable()) { - gcm_init_neon(out_table, H); - *out_mult = gcm_gmult_neon; - *out_hash = gcm_ghash_neon; - return; - } -#elif defined(GHASH_ASM_PPC64LE) - if (CRYPTO_is_PPC64LE_vcrypto_capable()) { - gcm_init_p8(out_table, H); - *out_mult = gcm_gmult_p8; - *out_hash = gcm_ghash_p8; - return; - } -#endif - - gcm_init_nohw(out_table, H); - *out_mult = gcm_gmult_nohw; - *out_hash = gcm_ghash_nohw; -} - -void CRYPTO_gcm128_init_key(GCM128_KEY *gcm_key, const AES_KEY *aes_key, - block128_f block, int block_is_hwaes) { - OPENSSL_memset(gcm_key, 0, sizeof(*gcm_key)); - gcm_key->block = block; - - uint8_t ghash_key[16]; - OPENSSL_memset(ghash_key, 0, sizeof(ghash_key)); - (*block)(ghash_key, ghash_key, aes_key); - - int is_avx; - CRYPTO_ghash_init(&gcm_key->gmult, &gcm_key->ghash, &gcm_key->H, - gcm_key->Htable, &is_avx, ghash_key); - - gcm_key->use_aesni_gcm_crypt = 
(is_avx && block_is_hwaes) ? 1 : 0; -} - -void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const AES_KEY *key, - const uint8_t *iv, size_t len) { -#ifdef GCM_FUNCREF - void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) = - ctx->gcm_key.gmult; -#endif - - ctx->Yi.u[0] = 0; - ctx->Yi.u[1] = 0; - ctx->Xi.u[0] = 0; - ctx->Xi.u[1] = 0; - ctx->len.u[0] = 0; // AAD length - ctx->len.u[1] = 0; // message length - ctx->ares = 0; - ctx->mres = 0; - - uint32_t ctr; - if (len == 12) { - OPENSSL_memcpy(ctx->Yi.c, iv, 12); - ctx->Yi.c[15] = 1; - ctr = 1; - } else { - uint64_t len0 = len; - - while (len >= 16) { - for (size_t i = 0; i < 16; ++i) { - ctx->Yi.c[i] ^= iv[i]; - } - GCM_MUL(ctx, Yi); - iv += 16; - len -= 16; - } - if (len) { - for (size_t i = 0; i < len; ++i) { - ctx->Yi.c[i] ^= iv[i]; - } - GCM_MUL(ctx, Yi); - } - len0 <<= 3; - ctx->Yi.u[1] ^= CRYPTO_bswap8(len0); - - GCM_MUL(ctx, Yi); - ctr = CRYPTO_bswap4(ctx->Yi.d[3]); - } - - (*ctx->gcm_key.block)(ctx->Yi.c, ctx->EK0.c, key); - ++ctr; - ctx->Yi.d[3] = CRYPTO_bswap4(ctr); -} - -int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const uint8_t *aad, size_t len) { -#ifdef GCM_FUNCREF - void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) = - ctx->gcm_key.gmult; - void (*gcm_ghash_p)(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp, - size_t len) = ctx->gcm_key.ghash; -#endif - - if (ctx->len.u[1]) { - return 0; - } - - uint64_t alen = ctx->len.u[0] + len; - if (alen > (UINT64_C(1) << 61) || (sizeof(len) == 8 && alen < len)) { - return 0; - } - ctx->len.u[0] = alen; - - unsigned n = ctx->ares; - if (n) { - while (n && len) { - ctx->Xi.c[n] ^= *(aad++); - --len; - n = (n + 1) % 16; - } - if (n == 0) { - GCM_MUL(ctx, Xi); - } else { - ctx->ares = n; - return 1; - } - } - - // Process a whole number of blocks. - size_t len_blocks = len & kSizeTWithoutLower4Bits; - if (len_blocks != 0) { - GHASH(ctx, aad, len_blocks); - aad += len_blocks; - len -= len_blocks; - } - - // Process the remainder. 
- if (len != 0) { - n = (unsigned int)len; - for (size_t i = 0; i < len; ++i) { - ctx->Xi.c[i] ^= aad[i]; - } - } - - ctx->ares = n; - return 1; -} - -int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx, const AES_KEY *key, - const uint8_t *in, uint8_t *out, size_t len) { - block128_f block = ctx->gcm_key.block; -#ifdef GCM_FUNCREF - void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) = - ctx->gcm_key.gmult; - void (*gcm_ghash_p)(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp, - size_t len) = ctx->gcm_key.ghash; -#endif - - uint64_t mlen = ctx->len.u[1] + len; - if (mlen > ((UINT64_C(1) << 36) - 32) || - (sizeof(len) == 8 && mlen < len)) { - return 0; - } - ctx->len.u[1] = mlen; - - if (ctx->ares) { - // First call to encrypt finalizes GHASH(AAD) - GCM_MUL(ctx, Xi); - ctx->ares = 0; - } - - unsigned n = ctx->mres; - if (n) { - while (n && len) { - ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n]; - --len; - n = (n + 1) % 16; - } - if (n == 0) { - GCM_MUL(ctx, Xi); - } else { - ctx->mres = n; - return 1; - } - } - - uint32_t ctr = CRYPTO_bswap4(ctx->Yi.d[3]); - while (len >= GHASH_CHUNK) { - size_t j = GHASH_CHUNK; - - while (j) { - (*block)(ctx->Yi.c, ctx->EKi.c, key); - ++ctr; - ctx->Yi.d[3] = CRYPTO_bswap4(ctr); - for (size_t i = 0; i < 16; i += sizeof(crypto_word_t)) { - CRYPTO_store_word_le(out + i, - CRYPTO_load_word_le(in + i) ^ - ctx->EKi.t[i / sizeof(crypto_word_t)]); - } - out += 16; - in += 16; - j -= 16; - } - GHASH(ctx, out - GHASH_CHUNK, GHASH_CHUNK); - len -= GHASH_CHUNK; - } - size_t len_blocks = len & kSizeTWithoutLower4Bits; - if (len_blocks != 0) { - while (len >= 16) { - (*block)(ctx->Yi.c, ctx->EKi.c, key); - ++ctr; - ctx->Yi.d[3] = CRYPTO_bswap4(ctr); - for (size_t i = 0; i < 16; i += sizeof(crypto_word_t)) { - CRYPTO_store_word_le(out + i, - CRYPTO_load_word_le(in + i) ^ - ctx->EKi.t[i / sizeof(crypto_word_t)]); - } - out += 16; - in += 16; - len -= 16; - } - GHASH(ctx, out - len_blocks, len_blocks); - } - if (len) { - 
(*block)(ctx->Yi.c, ctx->EKi.c, key); - ++ctr; - ctx->Yi.d[3] = CRYPTO_bswap4(ctr); - while (len--) { - ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n]; - ++n; - } - } - - ctx->mres = n; - return 1; -} - -int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx, const AES_KEY *key, - const unsigned char *in, unsigned char *out, - size_t len) { - block128_f block = ctx->gcm_key.block; -#ifdef GCM_FUNCREF - void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) = - ctx->gcm_key.gmult; - void (*gcm_ghash_p)(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp, - size_t len) = ctx->gcm_key.ghash; -#endif - - uint64_t mlen = ctx->len.u[1] + len; - if (mlen > ((UINT64_C(1) << 36) - 32) || - (sizeof(len) == 8 && mlen < len)) { - return 0; - } - ctx->len.u[1] = mlen; - - if (ctx->ares) { - // First call to decrypt finalizes GHASH(AAD) - GCM_MUL(ctx, Xi); - ctx->ares = 0; - } - - unsigned n = ctx->mres; - if (n) { - while (n && len) { - uint8_t c = *(in++); - *(out++) = c ^ ctx->EKi.c[n]; - ctx->Xi.c[n] ^= c; - --len; - n = (n + 1) % 16; - } - if (n == 0) { - GCM_MUL(ctx, Xi); - } else { - ctx->mres = n; - return 1; - } - } - - uint32_t ctr = CRYPTO_bswap4(ctx->Yi.d[3]); - while (len >= GHASH_CHUNK) { - size_t j = GHASH_CHUNK; - - GHASH(ctx, in, GHASH_CHUNK); - while (j) { - (*block)(ctx->Yi.c, ctx->EKi.c, key); - ++ctr; - ctx->Yi.d[3] = CRYPTO_bswap4(ctr); - for (size_t i = 0; i < 16; i += sizeof(crypto_word_t)) { - CRYPTO_store_word_le(out + i, - CRYPTO_load_word_le(in + i) ^ - ctx->EKi.t[i / sizeof(crypto_word_t)]); - } - out += 16; - in += 16; - j -= 16; - } - len -= GHASH_CHUNK; - } - size_t len_blocks = len & kSizeTWithoutLower4Bits; - if (len_blocks != 0) { - GHASH(ctx, in, len_blocks); - while (len >= 16) { - (*block)(ctx->Yi.c, ctx->EKi.c, key); - ++ctr; - ctx->Yi.d[3] = CRYPTO_bswap4(ctr); - for (size_t i = 0; i < 16; i += sizeof(crypto_word_t)) { - CRYPTO_store_word_le(out + i, - CRYPTO_load_word_le(in + i) ^ - ctx->EKi.t[i / sizeof(crypto_word_t)]); - } - out += 
16; - in += 16; - len -= 16; - } - } - if (len) { - (*block)(ctx->Yi.c, ctx->EKi.c, key); - ++ctr; - ctx->Yi.d[3] = CRYPTO_bswap4(ctr); - while (len--) { - uint8_t c = in[n]; - ctx->Xi.c[n] ^= c; - out[n] = c ^ ctx->EKi.c[n]; - ++n; - } - } - - ctx->mres = n; - return 1; -} - -int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx, const AES_KEY *key, - const uint8_t *in, uint8_t *out, size_t len, - ctr128_f stream) { -#ifdef GCM_FUNCREF - void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) = - ctx->gcm_key.gmult; - void (*gcm_ghash_p)(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp, - size_t len) = ctx->gcm_key.ghash; -#endif - - uint64_t mlen = ctx->len.u[1] + len; - if (mlen > ((UINT64_C(1) << 36) - 32) || - (sizeof(len) == 8 && mlen < len)) { - return 0; - } - ctx->len.u[1] = mlen; - - if (ctx->ares) { - // First call to encrypt finalizes GHASH(AAD) - GCM_MUL(ctx, Xi); - ctx->ares = 0; - } - - unsigned n = ctx->mres; - if (n) { - while (n && len) { - ctx->Xi.c[n] ^= *(out++) = *(in++) ^ ctx->EKi.c[n]; - --len; - n = (n + 1) % 16; - } - if (n == 0) { - GCM_MUL(ctx, Xi); - } else { - ctx->mres = n; - return 1; - } - } - -#if defined(AESNI_GCM) - // Check |len| to work around a C language bug. See https://crbug.com/1019588. - if (ctx->gcm_key.use_aesni_gcm_crypt && len > 0) { - // |aesni_gcm_encrypt| may not process all the input given to it. It may - // not process *any* of its input if it is deemed too small. 
- size_t bulk = aesni_gcm_encrypt(in, out, len, key, ctx->Yi.c, ctx->Xi.u); - in += bulk; - out += bulk; - len -= bulk; - } -#endif - - uint32_t ctr = CRYPTO_bswap4(ctx->Yi.d[3]); - while (len >= GHASH_CHUNK) { - (*stream)(in, out, GHASH_CHUNK / 16, key, ctx->Yi.c); - ctr += GHASH_CHUNK / 16; - ctx->Yi.d[3] = CRYPTO_bswap4(ctr); - GHASH(ctx, out, GHASH_CHUNK); - out += GHASH_CHUNK; - in += GHASH_CHUNK; - len -= GHASH_CHUNK; - } - size_t len_blocks = len & kSizeTWithoutLower4Bits; - if (len_blocks != 0) { - size_t j = len_blocks / 16; - - (*stream)(in, out, j, key, ctx->Yi.c); - ctr += (unsigned int)j; - ctx->Yi.d[3] = CRYPTO_bswap4(ctr); - in += len_blocks; - len -= len_blocks; - GHASH(ctx, out, len_blocks); - out += len_blocks; - } - if (len) { - (*ctx->gcm_key.block)(ctx->Yi.c, ctx->EKi.c, key); - ++ctr; - ctx->Yi.d[3] = CRYPTO_bswap4(ctr); - while (len--) { - ctx->Xi.c[n] ^= out[n] = in[n] ^ ctx->EKi.c[n]; - ++n; - } - } - - ctx->mres = n; - return 1; -} - -int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx, const AES_KEY *key, - const uint8_t *in, uint8_t *out, size_t len, - ctr128_f stream) { -#ifdef GCM_FUNCREF - void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) = - ctx->gcm_key.gmult; - void (*gcm_ghash_p)(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp, - size_t len) = ctx->gcm_key.ghash; -#endif - - uint64_t mlen = ctx->len.u[1] + len; - if (mlen > ((UINT64_C(1) << 36) - 32) || - (sizeof(len) == 8 && mlen < len)) { - return 0; - } - ctx->len.u[1] = mlen; - - if (ctx->ares) { - // First call to decrypt finalizes GHASH(AAD) - GCM_MUL(ctx, Xi); - ctx->ares = 0; - } - - unsigned n = ctx->mres; - if (n) { - while (n && len) { - uint8_t c = *(in++); - *(out++) = c ^ ctx->EKi.c[n]; - ctx->Xi.c[n] ^= c; - --len; - n = (n + 1) % 16; - } - if (n == 0) { - GCM_MUL(ctx, Xi); - } else { - ctx->mres = n; - return 1; - } - } - -#if defined(AESNI_GCM) - // Check |len| to work around a C language bug. See https://crbug.com/1019588. 
- if (ctx->gcm_key.use_aesni_gcm_crypt && len > 0) { - // |aesni_gcm_decrypt| may not process all the input given to it. It may - // not process *any* of its input if it is deemed too small. - size_t bulk = aesni_gcm_decrypt(in, out, len, key, ctx->Yi.c, ctx->Xi.u); - in += bulk; - out += bulk; - len -= bulk; - } -#endif - - uint32_t ctr = CRYPTO_bswap4(ctx->Yi.d[3]); - while (len >= GHASH_CHUNK) { - GHASH(ctx, in, GHASH_CHUNK); - (*stream)(in, out, GHASH_CHUNK / 16, key, ctx->Yi.c); - ctr += GHASH_CHUNK / 16; - ctx->Yi.d[3] = CRYPTO_bswap4(ctr); - out += GHASH_CHUNK; - in += GHASH_CHUNK; - len -= GHASH_CHUNK; - } - size_t len_blocks = len & kSizeTWithoutLower4Bits; - if (len_blocks != 0) { - size_t j = len_blocks / 16; - - GHASH(ctx, in, len_blocks); - (*stream)(in, out, j, key, ctx->Yi.c); - ctr += (unsigned int)j; - ctx->Yi.d[3] = CRYPTO_bswap4(ctr); - out += len_blocks; - in += len_blocks; - len -= len_blocks; - } - if (len) { - (*ctx->gcm_key.block)(ctx->Yi.c, ctx->EKi.c, key); - ++ctr; - ctx->Yi.d[3] = CRYPTO_bswap4(ctr); - while (len--) { - uint8_t c = in[n]; - ctx->Xi.c[n] ^= c; - out[n] = c ^ ctx->EKi.c[n]; - ++n; - } - } - - ctx->mres = n; - return 1; -} - -int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const uint8_t *tag, size_t len) { -#ifdef GCM_FUNCREF - void (*gcm_gmult_p)(uint64_t Xi[2], const u128 Htable[16]) = - ctx->gcm_key.gmult; -#endif - - if (ctx->mres || ctx->ares) { - GCM_MUL(ctx, Xi); - } - - ctx->Xi.u[0] ^= CRYPTO_bswap8(ctx->len.u[0] << 3); - ctx->Xi.u[1] ^= CRYPTO_bswap8(ctx->len.u[1] << 3); - GCM_MUL(ctx, Xi); - - ctx->Xi.u[0] ^= ctx->EK0.u[0]; - ctx->Xi.u[1] ^= ctx->EK0.u[1]; - - if (tag && len <= sizeof(ctx->Xi)) { - return CRYPTO_memcmp(ctx->Xi.c, tag, len) == 0; - } else { - return 0; - } -} - -void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, unsigned char *tag, size_t len) { - CRYPTO_gcm128_finish(ctx, NULL, 0); - OPENSSL_memcpy(tag, ctx->Xi.c, - len <= sizeof(ctx->Xi.c) ? 
len : sizeof(ctx->Xi.c)); -} - -#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64) -int crypto_gcm_clmul_enabled(void) { -#if defined(GHASH_ASM_X86) || defined(GHASH_ASM_X86_64) - return CRYPTO_is_FXSR_capable() && CRYPTO_is_PCLMUL_capable(); -#else - return 0; -#endif -} -#endif diff --git a/third_party/boringssl/src/crypto/fipsmodule/modes/gcm_nohw.c b/third_party/boringssl/src/crypto/fipsmodule/modes/gcm_nohw.c deleted file mode 100644 index 92d54413..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/modes/gcm_nohw.c +++ /dev/null @@ -1,304 +0,0 @@ -/* Copyright (c) 2019, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include - -#include "../../internal.h" -#include "internal.h" - -#if !defined(BORINGSSL_HAS_UINT128) && defined(OPENSSL_SSE2) -#include -#endif - - -// This file contains a constant-time implementation of GHASH based on the notes -// in https://bearssl.org/constanttime.html#ghash-for-gcm and the reduction -// algorithm described in -// https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf. -// -// Unlike the BearSSL notes, we use uint128_t in the 64-bit implementation. Our -// primary compilers (clang, clang-cl, and gcc) all support it. MSVC will run -// the 32-bit implementation, but we can use its intrinsics if necessary. 
- -#if defined(BORINGSSL_HAS_UINT128) - -static void gcm_mul64_nohw(uint64_t *out_lo, uint64_t *out_hi, uint64_t a, - uint64_t b) { - // One term every four bits means the largest term is 64/4 = 16, which barely - // overflows into the next term. Using one term every five bits would cost 25 - // multiplications instead of 16. It is faster to mask off the bottom four - // bits of |a|, giving a largest term of 60/4 = 15, and apply the bottom bits - // separately. - uint64_t a0 = a & UINT64_C(0x1111111111111110); - uint64_t a1 = a & UINT64_C(0x2222222222222220); - uint64_t a2 = a & UINT64_C(0x4444444444444440); - uint64_t a3 = a & UINT64_C(0x8888888888888880); - - uint64_t b0 = b & UINT64_C(0x1111111111111111); - uint64_t b1 = b & UINT64_C(0x2222222222222222); - uint64_t b2 = b & UINT64_C(0x4444444444444444); - uint64_t b3 = b & UINT64_C(0x8888888888888888); - - uint128_t c0 = (a0 * (uint128_t)b0) ^ (a1 * (uint128_t)b3) ^ - (a2 * (uint128_t)b2) ^ (a3 * (uint128_t)b1); - uint128_t c1 = (a0 * (uint128_t)b1) ^ (a1 * (uint128_t)b0) ^ - (a2 * (uint128_t)b3) ^ (a3 * (uint128_t)b2); - uint128_t c2 = (a0 * (uint128_t)b2) ^ (a1 * (uint128_t)b1) ^ - (a2 * (uint128_t)b0) ^ (a3 * (uint128_t)b3); - uint128_t c3 = (a0 * (uint128_t)b3) ^ (a1 * (uint128_t)b2) ^ - (a2 * (uint128_t)b1) ^ (a3 * (uint128_t)b0); - - // Multiply the bottom four bits of |a| with |b|. 
- uint64_t a0_mask = UINT64_C(0) - (a & 1); - uint64_t a1_mask = UINT64_C(0) - ((a >> 1) & 1); - uint64_t a2_mask = UINT64_C(0) - ((a >> 2) & 1); - uint64_t a3_mask = UINT64_C(0) - ((a >> 3) & 1); - uint128_t extra = (a0_mask & b) ^ ((uint128_t)(a1_mask & b) << 1) ^ - ((uint128_t)(a2_mask & b) << 2) ^ - ((uint128_t)(a3_mask & b) << 3); - - *out_lo = (((uint64_t)c0) & UINT64_C(0x1111111111111111)) ^ - (((uint64_t)c1) & UINT64_C(0x2222222222222222)) ^ - (((uint64_t)c2) & UINT64_C(0x4444444444444444)) ^ - (((uint64_t)c3) & UINT64_C(0x8888888888888888)) ^ ((uint64_t)extra); - *out_hi = (((uint64_t)(c0 >> 64)) & UINT64_C(0x1111111111111111)) ^ - (((uint64_t)(c1 >> 64)) & UINT64_C(0x2222222222222222)) ^ - (((uint64_t)(c2 >> 64)) & UINT64_C(0x4444444444444444)) ^ - (((uint64_t)(c3 >> 64)) & UINT64_C(0x8888888888888888)) ^ - ((uint64_t)(extra >> 64)); -} - -#elif defined(OPENSSL_SSE2) - -static __m128i gcm_mul32_nohw(uint32_t a, uint32_t b) { - // One term every four bits means the largest term is 32/4 = 8, which does not - // overflow into the next term. 
- __m128i aa = _mm_setr_epi32(a, 0, a, 0); - __m128i bb = _mm_setr_epi32(b, 0, b, 0); - - __m128i a0a0 = - _mm_and_si128(aa, _mm_setr_epi32(0x11111111, 0, 0x11111111, 0)); - __m128i a2a2 = - _mm_and_si128(aa, _mm_setr_epi32(0x44444444, 0, 0x44444444, 0)); - __m128i b0b1 = - _mm_and_si128(bb, _mm_setr_epi32(0x11111111, 0, 0x22222222, 0)); - __m128i b2b3 = - _mm_and_si128(bb, _mm_setr_epi32(0x44444444, 0, 0x88888888, 0)); - - __m128i c0c1 = - _mm_xor_si128(_mm_mul_epu32(a0a0, b0b1), _mm_mul_epu32(a2a2, b2b3)); - __m128i c2c3 = - _mm_xor_si128(_mm_mul_epu32(a2a2, b0b1), _mm_mul_epu32(a0a0, b2b3)); - - __m128i a1a1 = - _mm_and_si128(aa, _mm_setr_epi32(0x22222222, 0, 0x22222222, 0)); - __m128i a3a3 = - _mm_and_si128(aa, _mm_setr_epi32(0x88888888, 0, 0x88888888, 0)); - __m128i b3b0 = - _mm_and_si128(bb, _mm_setr_epi32(0x88888888, 0, 0x11111111, 0)); - __m128i b1b2 = - _mm_and_si128(bb, _mm_setr_epi32(0x22222222, 0, 0x44444444, 0)); - - c0c1 = _mm_xor_si128(c0c1, _mm_mul_epu32(a1a1, b3b0)); - c0c1 = _mm_xor_si128(c0c1, _mm_mul_epu32(a3a3, b1b2)); - c2c3 = _mm_xor_si128(c2c3, _mm_mul_epu32(a3a3, b3b0)); - c2c3 = _mm_xor_si128(c2c3, _mm_mul_epu32(a1a1, b1b2)); - - c0c1 = _mm_and_si128( - c0c1, _mm_setr_epi32(0x11111111, 0x11111111, 0x22222222, 0x22222222)); - c2c3 = _mm_and_si128( - c2c3, _mm_setr_epi32(0x44444444, 0x44444444, 0x88888888, 0x88888888)); - - c0c1 = _mm_xor_si128(c0c1, c2c3); - // c0 ^= c1 - c0c1 = _mm_xor_si128(c0c1, _mm_srli_si128(c0c1, 8)); - return c0c1; -} - -static void gcm_mul64_nohw(uint64_t *out_lo, uint64_t *out_hi, uint64_t a, - uint64_t b) { - uint32_t a0 = a & 0xffffffff; - uint32_t a1 = a >> 32; - uint32_t b0 = b & 0xffffffff; - uint32_t b1 = b >> 32; - // Karatsuba multiplication. 
- __m128i lo = gcm_mul32_nohw(a0, b0); - __m128i hi = gcm_mul32_nohw(a1, b1); - __m128i mid = gcm_mul32_nohw(a0 ^ a1, b0 ^ b1); - mid = _mm_xor_si128(mid, lo); - mid = _mm_xor_si128(mid, hi); - __m128i ret = _mm_unpacklo_epi64(lo, hi); - mid = _mm_slli_si128(mid, 4); - mid = _mm_and_si128(mid, _mm_setr_epi32(0, 0xffffffff, 0xffffffff, 0)); - ret = _mm_xor_si128(ret, mid); - memcpy(out_lo, &ret, 8); - memcpy(out_hi, ((char*)&ret) + 8, 8); -} - -#else // !BORINGSSL_HAS_UINT128 && !OPENSSL_SSE2 - -static uint64_t gcm_mul32_nohw(uint32_t a, uint32_t b) { - // One term every four bits means the largest term is 32/4 = 8, which does not - // overflow into the next term. - uint32_t a0 = a & 0x11111111; - uint32_t a1 = a & 0x22222222; - uint32_t a2 = a & 0x44444444; - uint32_t a3 = a & 0x88888888; - - uint32_t b0 = b & 0x11111111; - uint32_t b1 = b & 0x22222222; - uint32_t b2 = b & 0x44444444; - uint32_t b3 = b & 0x88888888; - - uint64_t c0 = (a0 * (uint64_t)b0) ^ (a1 * (uint64_t)b3) ^ - (a2 * (uint64_t)b2) ^ (a3 * (uint64_t)b1); - uint64_t c1 = (a0 * (uint64_t)b1) ^ (a1 * (uint64_t)b0) ^ - (a2 * (uint64_t)b3) ^ (a3 * (uint64_t)b2); - uint64_t c2 = (a0 * (uint64_t)b2) ^ (a1 * (uint64_t)b1) ^ - (a2 * (uint64_t)b0) ^ (a3 * (uint64_t)b3); - uint64_t c3 = (a0 * (uint64_t)b3) ^ (a1 * (uint64_t)b2) ^ - (a2 * (uint64_t)b1) ^ (a3 * (uint64_t)b0); - - return (c0 & UINT64_C(0x1111111111111111)) | - (c1 & UINT64_C(0x2222222222222222)) | - (c2 & UINT64_C(0x4444444444444444)) | - (c3 & UINT64_C(0x8888888888888888)); -} - -static void gcm_mul64_nohw(uint64_t *out_lo, uint64_t *out_hi, uint64_t a, - uint64_t b) { - uint32_t a0 = a & 0xffffffff; - uint32_t a1 = a >> 32; - uint32_t b0 = b & 0xffffffff; - uint32_t b1 = b >> 32; - // Karatsuba multiplication. 
- uint64_t lo = gcm_mul32_nohw(a0, b0); - uint64_t hi = gcm_mul32_nohw(a1, b1); - uint64_t mid = gcm_mul32_nohw(a0 ^ a1, b0 ^ b1) ^ lo ^ hi; - *out_lo = lo ^ (mid << 32); - *out_hi = hi ^ (mid >> 32); -} - -#endif // BORINGSSL_HAS_UINT128 - -void gcm_init_nohw(u128 Htable[16], const uint64_t Xi[2]) { - // We implement GHASH in terms of POLYVAL, as described in RFC 8452. This - // avoids a shift by 1 in the multiplication, needed to account for bit - // reversal losing a bit after multiplication, that is, - // rev128(X) * rev128(Y) = rev255(X*Y). - // - // Per Appendix A, we run mulX_POLYVAL. Note this is the same transformation - // applied by |gcm_init_clmul|, etc. Note |Xi| has already been byteswapped. - // - // See also slide 16 of - // https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf - Htable[0].lo = Xi[1]; - Htable[0].hi = Xi[0]; - - uint64_t carry = Htable[0].hi >> 63; - carry = 0u - carry; - - Htable[0].hi <<= 1; - Htable[0].hi |= Htable[0].lo >> 63; - Htable[0].lo <<= 1; - - // The irreducible polynomial is 1 + x^121 + x^126 + x^127 + x^128, so we - // conditionally add 0xc200...0001. - Htable[0].lo ^= carry & 1; - Htable[0].hi ^= carry & UINT64_C(0xc200000000000000); - - // This implementation does not use the rest of |Htable|. -} - -static void gcm_polyval_nohw(uint64_t Xi[2], const u128 *H) { - // Karatsuba multiplication. The product of |Xi| and |H| is stored in |r0| - // through |r3|. Note there is no byte or bit reversal because we are - // evaluating POLYVAL. - uint64_t r0, r1; - gcm_mul64_nohw(&r0, &r1, Xi[0], H->lo); - uint64_t r2, r3; - gcm_mul64_nohw(&r2, &r3, Xi[1], H->hi); - uint64_t mid0, mid1; - gcm_mul64_nohw(&mid0, &mid1, Xi[0] ^ Xi[1], H->hi ^ H->lo); - mid0 ^= r0 ^ r2; - mid1 ^= r1 ^ r3; - r2 ^= mid1; - r1 ^= mid0; - - // Now we multiply our 256-bit result by x^-128 and reduce. |r2| and - // |r3| shifts into position and we must multiply |r0| and |r1| by x^-128. 
We - // have: - // - // 1 = x^121 + x^126 + x^127 + x^128 - // x^-128 = x^-7 + x^-2 + x^-1 + 1 - // - // This is the GHASH reduction step, but with bits flowing in reverse. - - // The x^-7, x^-2, and x^-1 terms shift bits past x^0, which would require - // another reduction steps. Instead, we gather the excess bits, incorporate - // them into |r0| and |r1| and reduce once. See slides 17-19 - // of https://crypto.stanford.edu/RealWorldCrypto/slides/gueron.pdf. - r1 ^= (r0 << 63) ^ (r0 << 62) ^ (r0 << 57); - - // 1 - r2 ^= r0; - r3 ^= r1; - - // x^-1 - r2 ^= r0 >> 1; - r2 ^= r1 << 63; - r3 ^= r1 >> 1; - - // x^-2 - r2 ^= r0 >> 2; - r2 ^= r1 << 62; - r3 ^= r1 >> 2; - - // x^-7 - r2 ^= r0 >> 7; - r2 ^= r1 << 57; - r3 ^= r1 >> 7; - - Xi[0] = r2; - Xi[1] = r3; -} - -void gcm_gmult_nohw(uint64_t Xi[2], const u128 Htable[16]) { - uint64_t swapped[2]; - swapped[0] = CRYPTO_bswap8(Xi[1]); - swapped[1] = CRYPTO_bswap8(Xi[0]); - gcm_polyval_nohw(swapped, &Htable[0]); - Xi[0] = CRYPTO_bswap8(swapped[1]); - Xi[1] = CRYPTO_bswap8(swapped[0]); -} - -void gcm_ghash_nohw(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp, - size_t len) { - uint64_t swapped[2]; - swapped[0] = CRYPTO_bswap8(Xi[1]); - swapped[1] = CRYPTO_bswap8(Xi[0]); - - while (len >= 16) { - uint64_t block[2]; - OPENSSL_memcpy(block, inp, 16); - swapped[0] ^= CRYPTO_bswap8(block[1]); - swapped[1] ^= CRYPTO_bswap8(block[0]); - gcm_polyval_nohw(swapped, &Htable[0]); - inp += 16; - len -= 16; - } - - Xi[0] = CRYPTO_bswap8(swapped[1]); - Xi[1] = CRYPTO_bswap8(swapped[0]); -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/modes/internal.h b/third_party/boringssl/src/crypto/fipsmodule/modes/internal.h deleted file mode 100644 index 0164aace..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/modes/internal.h +++ /dev/null @@ -1,415 +0,0 @@ -/* ==================================================================== - * Copyright (c) 2008 The OpenSSL Project. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== */ - -#ifndef OPENSSL_HEADER_MODES_INTERNAL_H -#define OPENSSL_HEADER_MODES_INTERNAL_H - -#include - -#include - -#include -#include - -#include "../../internal.h" - -#if defined(__cplusplus) -extern "C" { -#endif - - -// block128_f is the type of an AES block cipher implementation. -// -// Unlike upstream OpenSSL, it and the other functions in this file hard-code -// |AES_KEY|. It is undefined in C to call a function pointer with anything -// other than the original type. Thus we either must match |block128_f| to the -// type signature of |AES_encrypt| and friends or pass in |void*| wrapper -// functions. -// -// These functions are called exclusively with AES, so we use the former. -typedef void (*block128_f)(const uint8_t in[16], uint8_t out[16], - const AES_KEY *key); - - -// CTR. - -// ctr128_f is the type of a function that performs CTR-mode encryption. -typedef void (*ctr128_f)(const uint8_t *in, uint8_t *out, size_t blocks, - const AES_KEY *key, const uint8_t ivec[16]); - -// CRYPTO_ctr128_encrypt encrypts (or decrypts, it's the same in CTR mode) -// |len| bytes from |in| to |out| using |block| in counter mode. There's no -// requirement that |len| be a multiple of any value and any partial blocks are -// stored in |ecount_buf| and |*num|, which must be zeroed before the initial -// call. 
The counter is a 128-bit, big-endian value in |ivec| and is -// incremented by this function. -void CRYPTO_ctr128_encrypt(const uint8_t *in, uint8_t *out, size_t len, - const AES_KEY *key, uint8_t ivec[16], - uint8_t ecount_buf[16], unsigned *num, - block128_f block); - -// CRYPTO_ctr128_encrypt_ctr32 acts like |CRYPTO_ctr128_encrypt| but takes -// |ctr|, a function that performs CTR mode but only deals with the lower 32 -// bits of the counter. This is useful when |ctr| can be an optimised -// function. -void CRYPTO_ctr128_encrypt_ctr32(const uint8_t *in, uint8_t *out, size_t len, - const AES_KEY *key, uint8_t ivec[16], - uint8_t ecount_buf[16], unsigned *num, - ctr128_f ctr); - - -// GCM. -// -// This API differs from the upstream API slightly. The |GCM128_CONTEXT| does -// not have a |key| pointer that points to the key as upstream's version does. -// Instead, every function takes a |key| parameter. This way |GCM128_CONTEXT| -// can be safely copied. Additionally, |gcm_key| is split into a separate -// struct. - -typedef struct { uint64_t hi,lo; } u128; - -// gmult_func multiplies |Xi| by the GCM key and writes the result back to -// |Xi|. -typedef void (*gmult_func)(uint64_t Xi[2], const u128 Htable[16]); - -// ghash_func repeatedly multiplies |Xi| by the GCM key and adds in blocks from -// |inp|. The result is written back to |Xi| and the |len| argument must be a -// multiple of 16. -typedef void (*ghash_func)(uint64_t Xi[2], const u128 Htable[16], - const uint8_t *inp, size_t len); - -typedef struct gcm128_key_st { - // Note the MOVBE-based, x86-64, GHASH assembly requires |H| and |Htable| to - // be the first two elements of this struct. Additionally, some assembly - // routines require a 16-byte-aligned |Htable| when hashing data, but not - // initialization. |GCM128_KEY| is not itself aligned to simplify embedding in - // |EVP_AEAD_CTX|, but |Htable|'s offset must be a multiple of 16. 
- u128 H; - u128 Htable[16]; - gmult_func gmult; - ghash_func ghash; - - block128_f block; - - // use_aesni_gcm_crypt is true if this context should use the assembly - // functions |aesni_gcm_encrypt| and |aesni_gcm_decrypt| to process data. - unsigned use_aesni_gcm_crypt:1; -} GCM128_KEY; - -// GCM128_CONTEXT contains state for a single GCM operation. The structure -// should be zero-initialized before use. -typedef struct { - // The following 5 names follow names in GCM specification - union { - uint64_t u[2]; - uint32_t d[4]; - uint8_t c[16]; - crypto_word_t t[16 / sizeof(crypto_word_t)]; - } Yi, EKi, EK0, len, Xi; - - // Note that the order of |Xi| and |gcm_key| is fixed by the MOVBE-based, - // x86-64, GHASH assembly. Additionally, some assembly routines require - // |gcm_key| to be 16-byte aligned. |GCM128_KEY| is not itself aligned to - // simplify embedding in |EVP_AEAD_CTX|. - alignas(16) GCM128_KEY gcm_key; - - unsigned mres, ares; -} GCM128_CONTEXT; - -#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64) -// crypto_gcm_clmul_enabled returns one if the CLMUL implementation of GCM is -// used. -int crypto_gcm_clmul_enabled(void); -#endif - -// CRYPTO_ghash_init writes a precomputed table of powers of |gcm_key| to -// |out_table| and sets |*out_mult| and |*out_hash| to (potentially hardware -// accelerated) functions for performing operations in the GHASH field. If the -// AVX implementation was used |*out_is_avx| will be true. -void CRYPTO_ghash_init(gmult_func *out_mult, ghash_func *out_hash, - u128 *out_key, u128 out_table[16], int *out_is_avx, - const uint8_t gcm_key[16]); - -// CRYPTO_gcm128_init_key initialises |gcm_key| to use |block| (typically AES) -// with the given key. |block_is_hwaes| is one if |block| is |aes_hw_encrypt|. -OPENSSL_EXPORT void CRYPTO_gcm128_init_key(GCM128_KEY *gcm_key, - const AES_KEY *key, block128_f block, - int block_is_hwaes); - -// CRYPTO_gcm128_setiv sets the IV (nonce) for |ctx|. 
The |key| must be the -// same key that was passed to |CRYPTO_gcm128_init|. -OPENSSL_EXPORT void CRYPTO_gcm128_setiv(GCM128_CONTEXT *ctx, const AES_KEY *key, - const uint8_t *iv, size_t iv_len); - -// CRYPTO_gcm128_aad sets the authenticated data for an instance of GCM. -// This must be called before and data is encrypted. It returns one on success -// and zero otherwise. -OPENSSL_EXPORT int CRYPTO_gcm128_aad(GCM128_CONTEXT *ctx, const uint8_t *aad, - size_t len); - -// CRYPTO_gcm128_encrypt encrypts |len| bytes from |in| to |out|. The |key| -// must be the same key that was passed to |CRYPTO_gcm128_init|. It returns one -// on success and zero otherwise. -OPENSSL_EXPORT int CRYPTO_gcm128_encrypt(GCM128_CONTEXT *ctx, - const AES_KEY *key, const uint8_t *in, - uint8_t *out, size_t len); - -// CRYPTO_gcm128_decrypt decrypts |len| bytes from |in| to |out|. The |key| -// must be the same key that was passed to |CRYPTO_gcm128_init|. It returns one -// on success and zero otherwise. -OPENSSL_EXPORT int CRYPTO_gcm128_decrypt(GCM128_CONTEXT *ctx, - const AES_KEY *key, const uint8_t *in, - uint8_t *out, size_t len); - -// CRYPTO_gcm128_encrypt_ctr32 encrypts |len| bytes from |in| to |out| using -// a CTR function that only handles the bottom 32 bits of the nonce, like -// |CRYPTO_ctr128_encrypt_ctr32|. The |key| must be the same key that was -// passed to |CRYPTO_gcm128_init|. It returns one on success and zero -// otherwise. -OPENSSL_EXPORT int CRYPTO_gcm128_encrypt_ctr32(GCM128_CONTEXT *ctx, - const AES_KEY *key, - const uint8_t *in, uint8_t *out, - size_t len, ctr128_f stream); - -// CRYPTO_gcm128_decrypt_ctr32 decrypts |len| bytes from |in| to |out| using -// a CTR function that only handles the bottom 32 bits of the nonce, like -// |CRYPTO_ctr128_encrypt_ctr32|. The |key| must be the same key that was -// passed to |CRYPTO_gcm128_init|. It returns one on success and zero -// otherwise. 
-OPENSSL_EXPORT int CRYPTO_gcm128_decrypt_ctr32(GCM128_CONTEXT *ctx, - const AES_KEY *key, - const uint8_t *in, uint8_t *out, - size_t len, ctr128_f stream); - -// CRYPTO_gcm128_finish calculates the authenticator and compares it against -// |len| bytes of |tag|. It returns one on success and zero otherwise. -OPENSSL_EXPORT int CRYPTO_gcm128_finish(GCM128_CONTEXT *ctx, const uint8_t *tag, - size_t len); - -// CRYPTO_gcm128_tag calculates the authenticator and copies it into |tag|. -// The minimum of |len| and 16 bytes are copied into |tag|. -OPENSSL_EXPORT void CRYPTO_gcm128_tag(GCM128_CONTEXT *ctx, uint8_t *tag, - size_t len); - - -// GCM assembly. - -void gcm_init_nohw(u128 Htable[16], const uint64_t H[2]); -void gcm_gmult_nohw(uint64_t Xi[2], const u128 Htable[16]); -void gcm_ghash_nohw(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp, - size_t len); - -#if !defined(OPENSSL_NO_ASM) - -#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64) -#define GCM_FUNCREF -void gcm_init_clmul(u128 Htable[16], const uint64_t Xi[2]); -void gcm_gmult_clmul(uint64_t Xi[2], const u128 Htable[16]); -void gcm_ghash_clmul(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp, - size_t len); - -// |gcm_gmult_ssse3| and |gcm_ghash_ssse3| require |Htable| to be -// 16-byte-aligned, but |gcm_init_ssse3| does not. 
-void gcm_init_ssse3(u128 Htable[16], const uint64_t Xi[2]); -void gcm_gmult_ssse3(uint64_t Xi[2], const u128 Htable[16]); -void gcm_ghash_ssse3(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in, - size_t len); - -#if defined(OPENSSL_X86_64) -#define GHASH_ASM_X86_64 -void gcm_init_avx(u128 Htable[16], const uint64_t Xi[2]); -void gcm_gmult_avx(uint64_t Xi[2], const u128 Htable[16]); -void gcm_ghash_avx(uint64_t Xi[2], const u128 Htable[16], const uint8_t *in, - size_t len); - -#define AESNI_GCM -size_t aesni_gcm_encrypt(const uint8_t *in, uint8_t *out, size_t len, - const AES_KEY *key, uint8_t ivec[16], uint64_t *Xi); -size_t aesni_gcm_decrypt(const uint8_t *in, uint8_t *out, size_t len, - const AES_KEY *key, uint8_t ivec[16], uint64_t *Xi); -#endif // OPENSSL_X86_64 - -#if defined(OPENSSL_X86) -#define GHASH_ASM_X86 -#endif // OPENSSL_X86 - -#elif defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64) -#define GHASH_ASM_ARM -#define GCM_FUNCREF - -OPENSSL_INLINE int gcm_pmull_capable(void) { - return CRYPTO_is_ARMv8_PMULL_capable(); -} - -void gcm_init_v8(u128 Htable[16], const uint64_t Xi[2]); -void gcm_gmult_v8(uint64_t Xi[2], const u128 Htable[16]); -void gcm_ghash_v8(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp, - size_t len); - -OPENSSL_INLINE int gcm_neon_capable(void) { return CRYPTO_is_NEON_capable(); } - -void gcm_init_neon(u128 Htable[16], const uint64_t Xi[2]); -void gcm_gmult_neon(uint64_t Xi[2], const u128 Htable[16]); -void gcm_ghash_neon(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp, - size_t len); - -#elif defined(OPENSSL_PPC64LE) -#define GHASH_ASM_PPC64LE -#define GCM_FUNCREF -void gcm_init_p8(u128 Htable[16], const uint64_t Xi[2]); -void gcm_gmult_p8(uint64_t Xi[2], const u128 Htable[16]); -void gcm_ghash_p8(uint64_t Xi[2], const u128 Htable[16], const uint8_t *inp, - size_t len); -#endif -#endif // OPENSSL_NO_ASM - - -// CBC. - -// cbc128_f is the type of a function that performs CBC-mode encryption. 
-typedef void (*cbc128_f)(const uint8_t *in, uint8_t *out, size_t len, - const AES_KEY *key, uint8_t ivec[16], int enc); - -// CRYPTO_cbc128_encrypt encrypts |len| bytes from |in| to |out| using the -// given IV and block cipher in CBC mode. The input need not be a multiple of -// 128 bits long, but the output will round up to the nearest 128 bit multiple, -// zero padding the input if needed. The IV will be updated on return. -void CRYPTO_cbc128_encrypt(const uint8_t *in, uint8_t *out, size_t len, - const AES_KEY *key, uint8_t ivec[16], - block128_f block); - -// CRYPTO_cbc128_decrypt decrypts |len| bytes from |in| to |out| using the -// given IV and block cipher in CBC mode. If |len| is not a multiple of 128 -// bits then only that many bytes will be written, but a multiple of 128 bits -// is always read from |in|. The IV will be updated on return. -void CRYPTO_cbc128_decrypt(const uint8_t *in, uint8_t *out, size_t len, - const AES_KEY *key, uint8_t ivec[16], - block128_f block); - - -// OFB. - -// CRYPTO_ofb128_encrypt encrypts (or decrypts, it's the same with OFB mode) -// |len| bytes from |in| to |out| using |block| in OFB mode. There's no -// requirement that |len| be a multiple of any value and any partial blocks are -// stored in |ivec| and |*num|, the latter must be zero before the initial -// call. -void CRYPTO_ofb128_encrypt(const uint8_t *in, uint8_t *out, size_t len, - const AES_KEY *key, uint8_t ivec[16], unsigned *num, - block128_f block); - - -// CFB. - -// CRYPTO_cfb128_encrypt encrypts (or decrypts, if |enc| is zero) |len| bytes -// from |in| to |out| using |block| in CFB mode. There's no requirement that -// |len| be a multiple of any value and any partial blocks are stored in |ivec| -// and |*num|, the latter must be zero before the initial call. 
-void CRYPTO_cfb128_encrypt(const uint8_t *in, uint8_t *out, size_t len, - const AES_KEY *key, uint8_t ivec[16], unsigned *num, - int enc, block128_f block); - -// CRYPTO_cfb128_8_encrypt encrypts (or decrypts, if |enc| is zero) |len| bytes -// from |in| to |out| using |block| in CFB-8 mode. Prior to the first call -// |num| should be set to zero. -void CRYPTO_cfb128_8_encrypt(const uint8_t *in, uint8_t *out, size_t len, - const AES_KEY *key, uint8_t ivec[16], - unsigned *num, int enc, block128_f block); - -// CRYPTO_cfb128_1_encrypt encrypts (or decrypts, if |enc| is zero) |len| bytes -// from |in| to |out| using |block| in CFB-1 mode. Prior to the first call -// |num| should be set to zero. -void CRYPTO_cfb128_1_encrypt(const uint8_t *in, uint8_t *out, size_t bits, - const AES_KEY *key, uint8_t ivec[16], - unsigned *num, int enc, block128_f block); - -size_t CRYPTO_cts128_encrypt_block(const uint8_t *in, uint8_t *out, size_t len, - const AES_KEY *key, uint8_t ivec[16], - block128_f block); - - -// POLYVAL. -// -// POLYVAL is a polynomial authenticator that operates over a field very -// similar to the one that GHASH uses. See -// https://tools.ietf.org/html/draft-irtf-cfrg-gcmsiv-02#section-3. - -typedef union { - uint64_t u[2]; - uint8_t c[16]; -} polyval_block; - -struct polyval_ctx { - // Note that the order of |S|, |H| and |Htable| is fixed by the MOVBE-based, - // x86-64, GHASH assembly. Additionally, some assembly routines require - // |Htable| to be 16-byte aligned. - polyval_block S; - u128 H; - alignas(16) u128 Htable[16]; - gmult_func gmult; - ghash_func ghash; -}; - -// CRYPTO_POLYVAL_init initialises |ctx| using |key|. -void CRYPTO_POLYVAL_init(struct polyval_ctx *ctx, const uint8_t key[16]); - -// CRYPTO_POLYVAL_update_blocks updates the accumulator in |ctx| given the -// blocks from |in|. Only a whole number of blocks can be processed so |in_len| -// must be a multiple of 16. 
-void CRYPTO_POLYVAL_update_blocks(struct polyval_ctx *ctx, const uint8_t *in, - size_t in_len); - -// CRYPTO_POLYVAL_finish writes the accumulator from |ctx| to |out|. -void CRYPTO_POLYVAL_finish(const struct polyval_ctx *ctx, uint8_t out[16]); - - -#if defined(__cplusplus) -} // extern C -#endif - -#endif // OPENSSL_HEADER_MODES_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/fipsmodule/modes/ofb.c b/third_party/boringssl/src/crypto/fipsmodule/modes/ofb.c deleted file mode 100644 index 5effba66..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/modes/ofb.c +++ /dev/null @@ -1,94 +0,0 @@ -/* ==================================================================== - * Copyright (c) 2008 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. 
Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. 
- * ==================================================================== */ - -#include -#include - -#include "internal.h" - - -static_assert(16 % sizeof(size_t) == 0, "block cannot be divided into size_t"); - -void CRYPTO_ofb128_encrypt(const uint8_t *in, uint8_t *out, size_t len, - const AES_KEY *key, uint8_t ivec[16], unsigned *num, - block128_f block) { - assert(key != NULL && ivec != NULL && num != NULL); - assert(len == 0 || (in != NULL && out != NULL)); - - unsigned n = *num; - - while (n && len) { - *(out++) = *(in++) ^ ivec[n]; - --len; - n = (n + 1) % 16; - } - - while (len >= 16) { - (*block)(ivec, ivec, key); - for (; n < 16; n += sizeof(size_t)) { - size_t a, b; - OPENSSL_memcpy(&a, in + n, sizeof(size_t)); - OPENSSL_memcpy(&b, ivec + n, sizeof(size_t)); - - const size_t c = a ^ b; - OPENSSL_memcpy(out + n, &c, sizeof(size_t)); - } - len -= 16; - out += 16; - in += 16; - n = 0; - } - if (len) { - (*block)(ivec, ivec, key); - while (len--) { - out[n] = in[n] ^ ivec[n]; - ++n; - } - } - *num = n; -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/modes/polyval.c b/third_party/boringssl/src/crypto/fipsmodule/modes/polyval.c deleted file mode 100644 index 857dc0e3..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/modes/polyval.c +++ /dev/null @@ -1,91 +0,0 @@ -/* Copyright (c) 2016, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include - -#include -#include - -#include "internal.h" -#include "../../internal.h" - - -// byte_reverse reverses the order of the bytes in |b->c|. -static void byte_reverse(polyval_block *b) { - const uint64_t t = CRYPTO_bswap8(b->u[0]); - b->u[0] = CRYPTO_bswap8(b->u[1]); - b->u[1] = t; -} - -// reverse_and_mulX_ghash interprets the bytes |b->c| as a reversed element of -// the GHASH field, multiplies that by 'x' and serialises the result back into -// |b|, but with GHASH's backwards bit ordering. -static void reverse_and_mulX_ghash(polyval_block *b) { - uint64_t hi = b->u[0]; - uint64_t lo = b->u[1]; - const crypto_word_t carry = constant_time_eq_w(hi & 1, 1); - hi >>= 1; - hi |= lo << 63; - lo >>= 1; - lo ^= ((uint64_t) constant_time_select_w(carry, 0xe1, 0)) << 56; - - b->u[0] = CRYPTO_bswap8(lo); - b->u[1] = CRYPTO_bswap8(hi); -} - -// POLYVAL(H, X_1, ..., X_n) = -// ByteReverse(GHASH(mulX_GHASH(ByteReverse(H)), ByteReverse(X_1), ..., -// ByteReverse(X_n))). -// -// See https://tools.ietf.org/html/draft-irtf-cfrg-gcmsiv-02#appendix-A. 
- -void CRYPTO_POLYVAL_init(struct polyval_ctx *ctx, const uint8_t key[16]) { - polyval_block H; - OPENSSL_memcpy(H.c, key, 16); - reverse_and_mulX_ghash(&H); - - int is_avx; - CRYPTO_ghash_init(&ctx->gmult, &ctx->ghash, &ctx->H, ctx->Htable, &is_avx, - H.c); - OPENSSL_memset(&ctx->S, 0, sizeof(ctx->S)); -} - -void CRYPTO_POLYVAL_update_blocks(struct polyval_ctx *ctx, const uint8_t *in, - size_t in_len) { - assert((in_len & 15) == 0); - polyval_block reversed[32]; - - while (in_len > 0) { - size_t todo = in_len; - if (todo > sizeof(reversed)) { - todo = sizeof(reversed); - } - OPENSSL_memcpy(reversed, in, todo); - in += todo; - in_len -= todo; - - size_t blocks = todo / sizeof(polyval_block); - for (size_t i = 0; i < blocks; i++) { - byte_reverse(&reversed[i]); - } - - ctx->ghash(ctx->S.u, ctx->Htable, (const uint8_t *) reversed, todo); - } -} - -void CRYPTO_POLYVAL_finish(const struct polyval_ctx *ctx, uint8_t out[16]) { - polyval_block S = ctx->S; - byte_reverse(&S); - OPENSSL_memcpy(out, &S.c, sizeof(polyval_block)); -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/rand/ctrdrbg.c b/third_party/boringssl/src/crypto/fipsmodule/rand/ctrdrbg.c deleted file mode 100644 index 0e8995f4..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/rand/ctrdrbg.c +++ /dev/null @@ -1,220 +0,0 @@ -/* Copyright (c) 2017, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include - -#include - -#include - -#include "internal.h" -#include "../cipher/internal.h" -#include "../service_indicator/internal.h" - - -// Section references in this file refer to SP 800-90Ar1: -// http://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-90Ar1.pdf - -// See table 3. -static const uint64_t kMaxReseedCount = UINT64_C(1) << 48; - -CTR_DRBG_STATE *CTR_DRBG_new(const uint8_t entropy[CTR_DRBG_ENTROPY_LEN], - const uint8_t *personalization, - size_t personalization_len) { - CTR_DRBG_STATE *drbg = OPENSSL_malloc(sizeof(CTR_DRBG_STATE)); - if (drbg == NULL || - !CTR_DRBG_init(drbg, entropy, personalization, personalization_len)) { - CTR_DRBG_free(drbg); - return NULL; - } - - return drbg; -} - -void CTR_DRBG_free(CTR_DRBG_STATE *state) { OPENSSL_free(state); } - -int CTR_DRBG_init(CTR_DRBG_STATE *drbg, - const uint8_t entropy[CTR_DRBG_ENTROPY_LEN], - const uint8_t *personalization, size_t personalization_len) { - // Section 10.2.1.3.1 - if (personalization_len > CTR_DRBG_ENTROPY_LEN) { - return 0; - } - - uint8_t seed_material[CTR_DRBG_ENTROPY_LEN]; - OPENSSL_memcpy(seed_material, entropy, CTR_DRBG_ENTROPY_LEN); - - for (size_t i = 0; i < personalization_len; i++) { - seed_material[i] ^= personalization[i]; - } - - // Section 10.2.1.2 - - // kInitMask is the result of encrypting blocks with big-endian value 1, 2 - // and 3 with the all-zero AES-256 key. 
- static const uint8_t kInitMask[CTR_DRBG_ENTROPY_LEN] = { - 0x53, 0x0f, 0x8a, 0xfb, 0xc7, 0x45, 0x36, 0xb9, 0xa9, 0x63, 0xb4, 0xf1, - 0xc4, 0xcb, 0x73, 0x8b, 0xce, 0xa7, 0x40, 0x3d, 0x4d, 0x60, 0x6b, 0x6e, - 0x07, 0x4e, 0xc5, 0xd3, 0xba, 0xf3, 0x9d, 0x18, 0x72, 0x60, 0x03, 0xca, - 0x37, 0xa6, 0x2a, 0x74, 0xd1, 0xa2, 0xf5, 0x8e, 0x75, 0x06, 0x35, 0x8e, - }; - - for (size_t i = 0; i < sizeof(kInitMask); i++) { - seed_material[i] ^= kInitMask[i]; - } - - drbg->ctr = aes_ctr_set_key(&drbg->ks, NULL, &drbg->block, seed_material, 32); - OPENSSL_memcpy(drbg->counter, seed_material + 32, 16); - drbg->reseed_counter = 1; - - return 1; -} - -static_assert(CTR_DRBG_ENTROPY_LEN % AES_BLOCK_SIZE == 0, - "not a multiple of AES block size"); - -// ctr_inc adds |n| to the last four bytes of |drbg->counter|, treated as a -// big-endian number. -static void ctr32_add(CTR_DRBG_STATE *drbg, uint32_t n) { - uint32_t ctr = CRYPTO_load_u32_be(drbg->counter + 12); - CRYPTO_store_u32_be(drbg->counter + 12, ctr + n); -} - -static int ctr_drbg_update(CTR_DRBG_STATE *drbg, const uint8_t *data, - size_t data_len) { - // Per section 10.2.1.2, |data_len| must be |CTR_DRBG_ENTROPY_LEN|. Here, we - // allow shorter inputs and right-pad them with zeros. This is equivalent to - // the specified algorithm but saves a copy in |CTR_DRBG_generate|. 
- if (data_len > CTR_DRBG_ENTROPY_LEN) { - return 0; - } - - uint8_t temp[CTR_DRBG_ENTROPY_LEN]; - for (size_t i = 0; i < CTR_DRBG_ENTROPY_LEN; i += AES_BLOCK_SIZE) { - ctr32_add(drbg, 1); - drbg->block(drbg->counter, temp + i, &drbg->ks); - } - - for (size_t i = 0; i < data_len; i++) { - temp[i] ^= data[i]; - } - - drbg->ctr = aes_ctr_set_key(&drbg->ks, NULL, &drbg->block, temp, 32); - OPENSSL_memcpy(drbg->counter, temp + 32, 16); - - return 1; -} - -int CTR_DRBG_reseed(CTR_DRBG_STATE *drbg, - const uint8_t entropy[CTR_DRBG_ENTROPY_LEN], - const uint8_t *additional_data, - size_t additional_data_len) { - // Section 10.2.1.4 - uint8_t entropy_copy[CTR_DRBG_ENTROPY_LEN]; - - if (additional_data_len > 0) { - if (additional_data_len > CTR_DRBG_ENTROPY_LEN) { - return 0; - } - - OPENSSL_memcpy(entropy_copy, entropy, CTR_DRBG_ENTROPY_LEN); - for (size_t i = 0; i < additional_data_len; i++) { - entropy_copy[i] ^= additional_data[i]; - } - - entropy = entropy_copy; - } - - if (!ctr_drbg_update(drbg, entropy, CTR_DRBG_ENTROPY_LEN)) { - return 0; - } - - drbg->reseed_counter = 1; - - return 1; -} - -int CTR_DRBG_generate(CTR_DRBG_STATE *drbg, uint8_t *out, size_t out_len, - const uint8_t *additional_data, - size_t additional_data_len) { - // See 9.3.1 - if (out_len > CTR_DRBG_MAX_GENERATE_LENGTH) { - return 0; - } - - // See 10.2.1.5.1 - if (drbg->reseed_counter > kMaxReseedCount) { - return 0; - } - - if (additional_data_len != 0 && - !ctr_drbg_update(drbg, additional_data, additional_data_len)) { - return 0; - } - - // kChunkSize is used to interact better with the cache. Since the AES-CTR - // code assumes that it's encrypting rather than just writing keystream, the - // buffer has to be zeroed first. Without chunking, large reads would zero - // the whole buffer, flushing the L1 cache, and then do another pass (missing - // the cache every time) to “encrypt” it. The code can avoid this by - // chunking. 
- static const size_t kChunkSize = 8 * 1024; - - while (out_len >= AES_BLOCK_SIZE) { - size_t todo = kChunkSize; - if (todo > out_len) { - todo = out_len; - } - - todo &= ~(AES_BLOCK_SIZE-1); - const size_t num_blocks = todo / AES_BLOCK_SIZE; - - if (drbg->ctr) { - OPENSSL_memset(out, 0, todo); - ctr32_add(drbg, 1); - drbg->ctr(out, out, num_blocks, &drbg->ks, drbg->counter); - ctr32_add(drbg, num_blocks - 1); - } else { - for (size_t i = 0; i < todo; i += AES_BLOCK_SIZE) { - ctr32_add(drbg, 1); - drbg->block(drbg->counter, out + i, &drbg->ks); - } - } - - out += todo; - out_len -= todo; - } - - if (out_len > 0) { - uint8_t block[AES_BLOCK_SIZE]; - ctr32_add(drbg, 1); - drbg->block(drbg->counter, block, &drbg->ks); - - OPENSSL_memcpy(out, block, out_len); - } - - // Right-padding |additional_data| in step 2.2 is handled implicitly by - // |ctr_drbg_update|, to save a copy. - if (!ctr_drbg_update(drbg, additional_data, additional_data_len)) { - return 0; - } - - drbg->reseed_counter++; - FIPS_service_indicator_update_state(); - return 1; -} - -void CTR_DRBG_clear(CTR_DRBG_STATE *drbg) { - OPENSSL_cleanse(drbg, sizeof(CTR_DRBG_STATE)); -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/rand/ctrdrbg.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/rand/ctrdrbg.cc.inc new file mode 100644 index 00000000..04f51959 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/rand/ctrdrbg.cc.inc @@ -0,0 +1,401 @@ +// Copyright 2017 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include +#include + +#include "../../mem_internal.h" +#include "../aes/internal.h" +#include "../service_indicator/internal.h" +#include "internal.h" + + +using namespace bssl; + +// Section references in this file refer to SP 800-90Ar1: +// http://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-90Ar1.pdf + +// Also see table 3. +constexpr uint64_t kMaxReseedCount = UINT64_C(1) << 48; + +// Implements the BCC function as described in Section 10.3.3. +static void bcc(uint8_t out[AES_BLOCK_SIZE], const AES_KEY *aes_key, + const uint8_t *data, size_t data_len) { + // 1. chaining_value = 0^outlen. + uint8_t *chaining_value = out; + OPENSSL_memset(chaining_value, 0, AES_BLOCK_SIZE); + + // 2. n = len (data)/outlen. + BSSL_CHECK(data_len % AES_BLOCK_SIZE == 0); + const size_t n = data_len / AES_BLOCK_SIZE; + + for (size_t i = 0; i < n; i++) { + const uint8_t *block = data + (i * AES_BLOCK_SIZE); + uint8_t input_block[AES_BLOCK_SIZE]; + + // 4.1: input_block = chaining_value ⊕ block_i. + CRYPTO_xor16(input_block, chaining_value, block); + + // 4.2: chaining_value = Block_Encrypt (Key, input_block). + BCM_aes_encrypt(input_block, chaining_value, aes_key); + } + + // 5. output_block = chaining_value. +} + +// Implements the derivation function as described in Section 10.3.2. +static int block_cipher_df(uint8_t *out, size_t out_len, const uint8_t *input, + size_t input_len) { + // Constants for AES-256 + constexpr size_t kAESKeyLen = 32; + constexpr size_t kAESOutLen = AES_BLOCK_SIZE; + constexpr size_t kMaxNumBits = 512; + + if (out_len > kMaxNumBits / 8 || input_len > (1u << 30)) { + return 0; + } + + // 4. S = L || N || input_string || 0x80. + const size_t s_rawlen = sizeof(uint32_t) + sizeof(uint32_t) + input_len + 1; + // S is padded up to a block size. 
+ const size_t s_len = (s_rawlen + kAESOutLen - 1) & ~(kAESOutLen - 1); + uint8_t iv_plus_s[/* space used below */ kAESOutLen + 4 + 4 + + CTR_DRBG_MAX_ENTROPY_LEN + CTR_DRBG_NONCE_LEN + + CTR_DRBG_SEED_LEN + 1 + + /* padding */ 7]; + if (kAESOutLen + s_len > sizeof(iv_plus_s)) { + return 0; + } + OPENSSL_memset(iv_plus_s, 0, sizeof(iv_plus_s)); + uint8_t *s_ptr = iv_plus_s + kAESOutLen; + // 2. L = len (input_string)/8. + CRYPTO_store_u32_be(s_ptr, (uint32_t)input_len); + s_ptr += sizeof(uint32_t); + // 3. N = number_of_bits_to_return/8. + CRYPTO_store_u32_be(s_ptr, (uint32_t)out_len); + s_ptr += sizeof(uint32_t); + OPENSSL_memcpy(s_ptr, input, input_len); + s_ptr += input_len; + *s_ptr = 0x80; + + uint8_t temp[kAESKeyLen + kAESOutLen]; + size_t temp_len = 0; + + // 8. K = leftmost (0x00010203...1D1E1F, keylen). + static const uint8_t kInitialKey[kAESKeyLen] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, + 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, + 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f}; + AES_KEY aes_key; + bcm_status status = + BCM_aes_set_encrypt_key(kInitialKey, 8 * sizeof(kInitialKey), &aes_key); + BSSL_CHECK(status != bcm_status::failure); + + // 7. i = 0. + uint32_t i = 0; + while (temp_len < sizeof(temp)) { + // 9.1 IV = i || 0^(outlen - len(i)). + CRYPTO_store_u32_be(iv_plus_s, i); + + // 9.2 temp = temp || BCC (K, (IV || S)). + bcc(temp + temp_len, &aes_key, iv_plus_s, kAESOutLen + s_len); + temp_len += kAESOutLen; + + // 9.3 i = i + 1. + i++; + } + + // 10. K = leftmost (temp, keylen). + uint8_t *const k = temp; + + // 11. X = select (temp, keylen+1, keylen+outlen). + uint8_t *const x = temp + kAESKeyLen; + + // 12. temp = the Null string. + temp_len = 0; + + // Create an AES key schedule for the final encryption steps. + status = BCM_aes_set_encrypt_key(k, kAESKeyLen * 8, &aes_key); + BSSL_CHECK(status != bcm_status::failure); + + // 13. 
While len (temp) < number_of_bits_to_return, do: + while (temp_len < out_len) { + // 13.1 X = Block_Encrypt (K, X). + BCM_aes_encrypt(x, x, &aes_key); + + // 13.2 temp = temp || X. + size_t to_copy = std::min(kAESOutLen, out_len - temp_len); + OPENSSL_memcpy(out + temp_len, x, to_copy); + temp_len += to_copy; + } + + return 1; +} + +CTR_DRBG_STATE *CTR_DRBG_new(const uint8_t entropy[CTR_DRBG_ENTROPY_LEN], + const uint8_t *personalization, + size_t personalization_len) { + CTR_DRBG_STATE *drbg = New(); + if (drbg == nullptr || + !CTR_DRBG_init(drbg, /*df=*/false, entropy, CTR_DRBG_ENTROPY_LEN, + /*nonce=*/nullptr, personalization, personalization_len)) { + CTR_DRBG_free(drbg); + return nullptr; + } + + return drbg; +} + +CTR_DRBG_STATE *CTR_DRBG_new_df(const uint8_t *entropy, size_t entropy_len, + const uint8_t nonce[CTR_DRBG_NONCE_LEN], + const uint8_t *personalization, + size_t personalization_len) { + CTR_DRBG_STATE *drbg = New(); + if (drbg == nullptr || + !CTR_DRBG_init(drbg, /*df=*/true, entropy, entropy_len, nonce, + personalization, personalization_len)) { + CTR_DRBG_free(drbg); + return nullptr; + } + + return drbg; +} + +void CTR_DRBG_free(CTR_DRBG_STATE *state) { Delete(state); } + +int bssl::CTR_DRBG_init(CTR_DRBG_STATE *drbg, int df, const uint8_t *entropy, + size_t entropy_len, + const uint8_t nonce[CTR_DRBG_NONCE_LEN], + const uint8_t *personalization, + size_t personalization_len) { + // Section 10.2.1.3.1 and 10.2.1.3.2 + if (personalization_len > CTR_DRBG_SEED_LEN || + (!df && entropy_len != CTR_DRBG_ENTROPY_LEN) || + (df && (entropy_len < CTR_DRBG_MIN_ENTROPY_LEN || + entropy_len > CTR_DRBG_MAX_ENTROPY_LEN)) || // + (df != (nonce != nullptr))) { + return 0; + } + + uint8_t seed_material[CTR_DRBG_SEED_LEN]; + if (df) { + uint8_t pre_seed_material[CTR_DRBG_MAX_ENTROPY_LEN + CTR_DRBG_NONCE_LEN + + CTR_DRBG_SEED_LEN]; + OPENSSL_memcpy(pre_seed_material, entropy, entropy_len); + OPENSSL_memcpy(pre_seed_material + entropy_len, nonce, 
CTR_DRBG_NONCE_LEN); + OPENSSL_memcpy(pre_seed_material + entropy_len + CTR_DRBG_NONCE_LEN, + personalization, personalization_len); + const size_t pre_seed_material_length = + entropy_len + CTR_DRBG_NONCE_LEN + personalization_len; + + if (!block_cipher_df(seed_material, sizeof(seed_material), + pre_seed_material, pre_seed_material_length)) { + return 0; + } + } else { + OPENSSL_memcpy(seed_material, entropy, CTR_DRBG_ENTROPY_LEN); + for (size_t i = 0; i < personalization_len; i++) { + seed_material[i] ^= personalization[i]; + } + } + + // Section 10.2.1.2 + + // kInitMask is the result of encrypting blocks with big-endian value 1, 2 + // and 3 with the all-zero AES-256 key. + static const uint8_t kInitMask[CTR_DRBG_SEED_LEN] = { + 0x53, 0x0f, 0x8a, 0xfb, 0xc7, 0x45, 0x36, 0xb9, 0xa9, 0x63, 0xb4, 0xf1, + 0xc4, 0xcb, 0x73, 0x8b, 0xce, 0xa7, 0x40, 0x3d, 0x4d, 0x60, 0x6b, 0x6e, + 0x07, 0x4e, 0xc5, 0xd3, 0xba, 0xf3, 0x9d, 0x18, 0x72, 0x60, 0x03, 0xca, + 0x37, 0xa6, 0x2a, 0x74, 0xd1, 0xa2, 0xf5, 0x8e, 0x75, 0x06, 0x35, 0x8e, + }; + + for (size_t i = 0; i < sizeof(kInitMask); i++) { + seed_material[i] ^= kInitMask[i]; + } + + drbg->df = df; + drbg->ctr = + aes_ctr_set_key(&drbg->ks, nullptr, &drbg->block, seed_material, 32); + OPENSSL_memcpy(drbg->counter, seed_material + 32, 16); + drbg->reseed_counter = 1; + + return 1; +} + +static_assert(CTR_DRBG_SEED_LEN % AES_BLOCK_SIZE == 0, + "not a multiple of AES block size"); + +// ctr_inc adds |n| to the last four bytes of |drbg->counter|, treated as a +// big-endian number. 
+static void ctr32_add(CTR_DRBG_STATE *drbg, uint32_t n) { + uint32_t ctr = CRYPTO_load_u32_be(drbg->counter + 12); + CRYPTO_store_u32_be(drbg->counter + 12, ctr + n); +} + +static int ctr_drbg_update(CTR_DRBG_STATE *drbg, + const uint8_t data[CTR_DRBG_SEED_LEN]) { + uint8_t temp[CTR_DRBG_SEED_LEN]; + for (size_t i = 0; i < CTR_DRBG_SEED_LEN; i += AES_BLOCK_SIZE) { + ctr32_add(drbg, 1); + drbg->block(drbg->counter, temp + i, &drbg->ks); + } + + for (size_t i = 0; i < CTR_DRBG_SEED_LEN; i++) { + temp[i] ^= data[i]; + } + + drbg->ctr = aes_ctr_set_key(&drbg->ks, nullptr, &drbg->block, temp, 32); + OPENSSL_memcpy(drbg->counter, temp + 32, 16); + + return 1; +} + +int CTR_DRBG_reseed(CTR_DRBG_STATE *drbg, + const uint8_t entropy[CTR_DRBG_ENTROPY_LEN], + const uint8_t *additional_data, + size_t additional_data_len) { + return CTR_DRBG_reseed_ex(drbg, entropy, CTR_DRBG_ENTROPY_LEN, + additional_data, additional_data_len); +} + +int CTR_DRBG_reseed_ex(CTR_DRBG_STATE *drbg, const uint8_t *entropy, + size_t entropy_len, const uint8_t *additional_data, + size_t additional_data_len) { + if (additional_data_len > CTR_DRBG_SEED_LEN || + (drbg->df && (entropy_len > CTR_DRBG_MAX_ENTROPY_LEN || + entropy_len < CTR_DRBG_MIN_ENTROPY_LEN)) || + (!drbg->df && entropy_len != CTR_DRBG_ENTROPY_LEN)) { + return 0; + } + + uint8_t seed_material[CTR_DRBG_SEED_LEN]; + if (drbg->df) { + // Section 10.2.1.4.2 + uint8_t pre_seed_material[CTR_DRBG_MAX_ENTROPY_LEN + CTR_DRBG_SEED_LEN]; + static_assert(CTR_DRBG_MAX_ENTROPY_LEN <= sizeof(pre_seed_material)); + OPENSSL_memcpy(pre_seed_material, entropy, entropy_len); + OPENSSL_memcpy(pre_seed_material + entropy_len, additional_data, + additional_data_len); + const size_t pre_seed_material_len = entropy_len + additional_data_len; + + if (!block_cipher_df(seed_material, sizeof(seed_material), + pre_seed_material, pre_seed_material_len)) { + return 0; + } + } else { + // Section 10.2.1.4 + static_assert(CTR_DRBG_ENTROPY_LEN == sizeof(seed_material)); + 
OPENSSL_memcpy(seed_material, entropy, CTR_DRBG_ENTROPY_LEN); + if (additional_data_len > 0) { + for (size_t i = 0; i < additional_data_len; i++) { + seed_material[i] ^= additional_data[i]; + } + } + } + + if (!ctr_drbg_update(drbg, seed_material)) { + return 0; + } + + drbg->reseed_counter = 1; + + return 1; +} + +int CTR_DRBG_generate(CTR_DRBG_STATE *drbg, uint8_t *out, size_t out_len, + const uint8_t *additional_data, + size_t additional_data_len) { + // See 9.3.1 + if (out_len > CTR_DRBG_MAX_GENERATE_LENGTH) { + return 0; + } + + // See 10.2.1.5.1 + if (drbg->reseed_counter > kMaxReseedCount) { + return 0; + } + + uint8_t processed_additional_data[CTR_DRBG_SEED_LEN]; + OPENSSL_memset(processed_additional_data, 0, + sizeof(processed_additional_data)); + if (additional_data_len != 0) { + if (drbg->df) { + if (!block_cipher_df(processed_additional_data, + sizeof(processed_additional_data), additional_data, + additional_data_len)) { + return 0; + } + } else { + if (additional_data_len > sizeof(processed_additional_data)) { + return 0; + } + OPENSSL_memcpy(processed_additional_data, additional_data, + additional_data_len); + } + if (!ctr_drbg_update(drbg, processed_additional_data)) { + return 0; + } + } + + // kChunkSize is used to interact better with the cache. Since the AES-CTR + // code assumes that it's encrypting rather than just writing keystream, the + // buffer has to be zeroed first. Without chunking, large reads would zero + // the whole buffer, flushing the L1 cache, and then do another pass (missing + // the cache every time) to “encrypt” it. The code can avoid this by + // chunking. 
+ constexpr size_t kChunkSize = 8 * 1024; + + while (out_len >= AES_BLOCK_SIZE) { + size_t todo = kChunkSize; + if (todo > out_len) { + todo = out_len; + } + + todo &= ~(AES_BLOCK_SIZE - 1); + const size_t num_blocks = todo / AES_BLOCK_SIZE; + + OPENSSL_memset(out, 0, todo); + ctr32_add(drbg, 1); + drbg->ctr(out, out, num_blocks, &drbg->ks, drbg->counter); + ctr32_add(drbg, (uint32_t)(num_blocks - 1)); + + out += todo; + out_len -= todo; + } + + if (out_len > 0) { + uint8_t block[AES_BLOCK_SIZE]; + ctr32_add(drbg, 1); + drbg->block(drbg->counter, block, &drbg->ks); + + OPENSSL_memcpy(out, block, out_len); + } + + if (!ctr_drbg_update(drbg, processed_additional_data)) { + return 0; + } + + drbg->reseed_counter++; + FIPS_service_indicator_update_state(); + return 1; +} + +void CTR_DRBG_clear(CTR_DRBG_STATE *drbg) { + OPENSSL_cleanse(drbg, sizeof(CTR_DRBG_STATE)); +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/rand/fork_detect.c b/third_party/boringssl/src/crypto/fipsmodule/rand/fork_detect.c deleted file mode 100644 index 51cf18ab..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/rand/fork_detect.c +++ /dev/null @@ -1,136 +0,0 @@ -/* Copyright (c) 2020, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ - -#if !defined(_GNU_SOURCE) -#define _GNU_SOURCE // needed for madvise() and MAP_ANONYMOUS on Linux. -#endif - -#include - -#include "fork_detect.h" - -#if defined(OPENSSL_LINUX) -#include -#include -#include -#include - -#include "../delocate.h" -#include "../../internal.h" - - -#if defined(MADV_WIPEONFORK) -static_assert(MADV_WIPEONFORK == 18, "MADV_WIPEONFORK is not 18"); -#else -#define MADV_WIPEONFORK 18 -#endif - -DEFINE_STATIC_ONCE(g_fork_detect_once); -DEFINE_STATIC_MUTEX(g_fork_detect_lock); -DEFINE_BSS_GET(volatile char *, g_fork_detect_addr); -DEFINE_BSS_GET(uint64_t, g_fork_generation); -DEFINE_BSS_GET(int, g_ignore_madv_wipeonfork); - -static void init_fork_detect(void) { - if (*g_ignore_madv_wipeonfork_bss_get()) { - return; - } - - long page_size = sysconf(_SC_PAGESIZE); - if (page_size <= 0) { - return; - } - - void *addr = mmap(NULL, (size_t)page_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (addr == MAP_FAILED) { - return; - } - - // Some versions of qemu (up to at least 5.0.0-rc4, see linux-user/syscall.c) - // ignore |madvise| calls and just return zero (i.e. success). But we need to - // know whether MADV_WIPEONFORK actually took effect. Therefore try an invalid - // call to check that the implementation of |madvise| is actually rejecting - // unknown |advice| values. - if (madvise(addr, (size_t)page_size, -1) == 0 || - madvise(addr, (size_t)page_size, MADV_WIPEONFORK) != 0) { - munmap(addr, (size_t)page_size); - return; - } - - *((volatile char *) addr) = 1; - *g_fork_detect_addr_bss_get() = addr; - *g_fork_generation_bss_get() = 1; -} - -uint64_t CRYPTO_get_fork_generation(void) { - // In a single-threaded process, there are obviously no races because there's - // only a single mutator in the address space. - // - // In a multi-threaded environment, |CRYPTO_once| ensures that the flag byte - // is initialised atomically, even if multiple threads enter this function - // concurrently. 
- // - // In the limit, the kernel may clear WIPEONFORK pages while a multi-threaded - // process is running. (For example, because a VM was cloned.) Therefore a - // lock is used below to synchronise the potentially multiple threads that may - // concurrently observe the cleared flag. - - CRYPTO_once(g_fork_detect_once_bss_get(), init_fork_detect); - // This pointer is |volatile| because the value pointed to may be changed by - // external forces (i.e. the kernel wiping the page) thus the compiler must - // not assume that it has exclusive access to it. - volatile char *const flag_ptr = *g_fork_detect_addr_bss_get(); - if (flag_ptr == NULL) { - // Our kernel is too old to support |MADV_WIPEONFORK|. - return 0; - } - - struct CRYPTO_STATIC_MUTEX *const lock = g_fork_detect_lock_bss_get(); - uint64_t *const generation_ptr = g_fork_generation_bss_get(); - - CRYPTO_STATIC_MUTEX_lock_read(lock); - uint64_t current_generation = *generation_ptr; - if (*flag_ptr) { - CRYPTO_STATIC_MUTEX_unlock_read(lock); - return current_generation; - } - - CRYPTO_STATIC_MUTEX_unlock_read(lock); - CRYPTO_STATIC_MUTEX_lock_write(lock); - current_generation = *generation_ptr; - if (*flag_ptr == 0) { - // A fork has occurred. 
- *flag_ptr = 1; - - current_generation++; - if (current_generation == 0) { - current_generation = 1; - } - *generation_ptr = current_generation; - } - CRYPTO_STATIC_MUTEX_unlock_write(lock); - - return current_generation; -} - -void CRYPTO_fork_detect_ignore_madv_wipeonfork_for_testing(void) { - *g_ignore_madv_wipeonfork_bss_get() = 1; -} - -#else // !OPENSSL_LINUX - -uint64_t CRYPTO_get_fork_generation(void) { return 0; } - -#endif // OPENSSL_LINUX diff --git a/third_party/boringssl/src/crypto/fipsmodule/rand/fork_detect.h b/third_party/boringssl/src/crypto/fipsmodule/rand/fork_detect.h deleted file mode 100644 index 8518830c..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/rand/fork_detect.h +++ /dev/null @@ -1,49 +0,0 @@ -/* Copyright (c) 2020, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#ifndef OPENSSL_HEADER_CRYPTO_FORK_DETECT_H -#define OPENSSL_HEADER_CRYPTO_FORK_DETECT_H - -#include - -#if defined(__cplusplus) -extern "C" { -#endif - - -// crypto_get_fork_generation returns the fork generation number for the current -// process, or zero if not supported on the platform. 
The fork generation number -// is a non-zero, strictly-monotonic counter with the property that, if queried -// in an address space and then again in a subsequently forked copy, the forked -// address space will observe a greater value. -// -// This function may be used to clear cached values across a fork. When -// initializing a cache, record the fork generation. Before using the cache, -// check if the fork generation has changed. If so, drop the cache and update -// the save fork generation. Note this logic transparently handles platforms -// which always return zero. -// -// This is not reliably supported on all platforms which implement |fork|, so it -// should only be used as a hardening measure. -OPENSSL_EXPORT uint64_t CRYPTO_get_fork_generation(void); - -// CRYPTO_fork_detect_ignore_madv_wipeonfork_for_testing is an internal detail -// used for testing purposes. -OPENSSL_EXPORT void CRYPTO_fork_detect_ignore_madv_wipeonfork_for_testing(void); - -#if defined(__cplusplus) -} // extern C -#endif - -#endif // OPENSSL_HEADER_CRYPTO_FORK_DETECT_H diff --git a/third_party/boringssl/src/crypto/fipsmodule/rand/getrandom_fillin.h b/third_party/boringssl/src/crypto/fipsmodule/rand/getrandom_fillin.h deleted file mode 100644 index c0dea35b..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/rand/getrandom_fillin.h +++ /dev/null @@ -1,66 +0,0 @@ -/* Copyright (c) 2020, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#ifndef OPENSSL_HEADER_CRYPTO_RAND_GETRANDOM_FILLIN_H -#define OPENSSL_HEADER_CRYPTO_RAND_GETRANDOM_FILLIN_H - -#include - - -#if defined(OPENSSL_LINUX) - -#include - -#if defined(OPENSSL_X86_64) -#define EXPECTED_NR_getrandom 318 -#elif defined(OPENSSL_X86) -#define EXPECTED_NR_getrandom 355 -#elif defined(OPENSSL_AARCH64) -#define EXPECTED_NR_getrandom 278 -#elif defined(OPENSSL_ARM) -#define EXPECTED_NR_getrandom 384 -#elif defined(OPENSSL_PPC64LE) -#define EXPECTED_NR_getrandom 359 -#elif defined(OPENSSL_RISCV64) -#define EXPECTED_NR_getrandom 278 -#endif - -#if defined(EXPECTED_NR_getrandom) -#define USE_NR_getrandom - -#if defined(__NR_getrandom) - -#if __NR_getrandom != EXPECTED_NR_getrandom -#error "system call number for getrandom is not the expected value" -#endif - -#else // __NR_getrandom - -#define __NR_getrandom EXPECTED_NR_getrandom - -#endif // __NR_getrandom - -#endif // EXPECTED_NR_getrandom - -#if !defined(GRND_NONBLOCK) -#define GRND_NONBLOCK 1 -#endif -#if !defined(GRND_RANDOM) -#define GRND_RANDOM 2 -#endif - -#endif // OPENSSL_LINUX - - -#endif // OPENSSL_HEADER_CRYPTO_RAND_GETRANDOM_FILLIN_H diff --git a/third_party/boringssl/src/crypto/fipsmodule/rand/internal.h b/third_party/boringssl/src/crypto/fipsmodule/rand/internal.h index 3c996f17..0913f008 100644 --- a/third_party/boringssl/src/crypto/fipsmodule/rand/internal.h +++ b/third_party/boringssl/src/crypto/fipsmodule/rand/internal.h @@ -1,155 +1,89 @@ -/* Copyright (c) 2015, Google Inc. 
- * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#ifndef OPENSSL_HEADER_CRYPTO_RAND_INTERNAL_H -#define OPENSSL_HEADER_CRYPTO_RAND_INTERNAL_H +// Copyright 2015 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_CRYPTO_FIPSMODULE_RAND_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_FIPSMODULE_RAND_INTERNAL_H #include #include -#include "../../internal.h" -#include "../modes/internal.h" - -#if defined(__cplusplus) -extern "C" { -#endif - - -#if !defined(OPENSSL_WINDOWS) && !defined(OPENSSL_FUCHSIA) && \ - !defined(BORINGSSL_UNSAFE_DETERMINISTIC_MODE) && !defined(OPENSSL_TRUSTY) -#define OPENSSL_URANDOM -#endif - -// RAND_bytes_with_additional_data samples from the RNG after mixing 32 bytes -// from |user_additional_data| in. 
-void RAND_bytes_with_additional_data(uint8_t *out, size_t out_len, - const uint8_t user_additional_data[32]); - -#if defined(BORINGSSL_FIPS) - -// We overread from /dev/urandom or RDRAND by a factor of 10 and XOR to whiten. -#define BORINGSSL_FIPS_OVERREAD 10 - -// CRYPTO_get_seed_entropy writes |out_entropy_len| bytes of entropy, suitable -// for seeding a DRBG, to |out_entropy|. It sets |*out_used_cpu| to one if the -// entropy came directly from the CPU and zero if it came from the OS. It -// actively obtains entropy from the CPU/OS and so should not be called from -// within the FIPS module. -void CRYPTO_get_seed_entropy(uint8_t *out_entropy, size_t out_entropy_len, - int *out_used_cpu); - -// RAND_load_entropy supplies |entropy_len| bytes of entropy to the module. The -// |want_additional_input| parameter is true iff the entropy was obtained from -// a source other than the system, e.g. directly from the CPU. -void RAND_load_entropy(const uint8_t *entropy, size_t entropy_len, - int want_additional_input); - -// RAND_need_entropy is implemented outside of the FIPS module and is called -// when the module has stopped because it has run out of entropy. -void RAND_need_entropy(size_t bytes_needed); - -#endif // BORINGSSL_FIPS - -// CRYPTO_sysrand fills |len| bytes at |buf| with entropy from the operating -// system. -void CRYPTO_sysrand(uint8_t *buf, size_t len); - -// CRYPTO_sysrand_for_seed fills |len| bytes at |buf| with entropy from the -// operating system. It may draw from the |GRND_RANDOM| pool on Android, -// depending on the vendor's configuration. -void CRYPTO_sysrand_for_seed(uint8_t *buf, size_t len); - -#if defined(OPENSSL_URANDOM) -// CRYPTO_init_sysrand initializes long-lived resources needed to draw entropy -// from the operating system. 
-void CRYPTO_init_sysrand(void); - -// CRYPTO_sysrand_if_available fills |len| bytes at |buf| with entropy from the -// operating system, or early /dev/urandom data, and returns 1, _if_ the entropy -// pool is initialized or if getrandom() is not available and not in FIPS mode. -// Otherwise it will not block and will instead fill |buf| with all zeros and -// return 0. -int CRYPTO_sysrand_if_available(uint8_t *buf, size_t len); -#else -OPENSSL_INLINE void CRYPTO_init_sysrand(void) {} - -OPENSSL_INLINE int CRYPTO_sysrand_if_available(uint8_t *buf, size_t len) { - CRYPTO_sysrand(buf, len); - return 1; -} -#endif +#include "../../bcm_support.h" +#include "../aes/internal.h" + + +BSSL_NAMESPACE_BEGIN // rand_fork_unsafe_buffering_enabled returns whether fork-unsafe buffering has // been enabled via |RAND_enable_fork_unsafe_buffering|. -int rand_fork_unsafe_buffering_enabled(void); +int rand_fork_unsafe_buffering_enabled(); + +BSSL_NAMESPACE_END // CTR_DRBG_STATE contains the state of a CTR_DRBG based on AES-256. See SP // 800-90Ar1. struct ctr_drbg_state_st { AES_KEY ks; - block128_f block; - ctr128_f ctr; + bssl::block128_f block; + bssl::ctr128_f ctr; uint8_t counter[16]; uint64_t reseed_counter; + int df; }; -// CTR_DRBG_init initialises |*drbg| given |CTR_DRBG_ENTROPY_LEN| bytes of -// entropy in |entropy| and, optionally, a personalization string up to -// |CTR_DRBG_ENTROPY_LEN| bytes in length. It returns one on success and zero -// on error. -OPENSSL_EXPORT int CTR_DRBG_init(CTR_DRBG_STATE *drbg, - const uint8_t entropy[CTR_DRBG_ENTROPY_LEN], +BSSL_NAMESPACE_BEGIN + +// CTR_DRBG_init initialises |*drbg| given |entropy_len| bytes of entropy in +// |entropy| and, optionally, a personalization string up to +// |CTR_DRBG_SEED_LEN| bytes in length. It returns one on success and zero on +// error. +// +// If `df` is false then `entropy_len` must be |CTR_DRBG_ENTROPY_LEN| and +// |nonce| must be nullptr. 
+OPENSSL_EXPORT int CTR_DRBG_init(CTR_DRBG_STATE *drbg, int df, + const uint8_t *entropy, size_t entropy_len, + const uint8_t nonce[CTR_DRBG_NONCE_LEN], const uint8_t *personalization, size_t personalization_len); #if defined(OPENSSL_X86_64) && !defined(OPENSSL_NO_ASM) -OPENSSL_INLINE int have_rdrand(void) { - return CRYPTO_is_RDRAND_capable(); -} +inline int have_rdrand() { return CRYPTO_is_RDRAND_capable(); } // have_fast_rdrand returns true if RDRAND is supported and it's reasonably // fast. Concretely the latter is defined by whether the chip is Intel (fast) or // not (assumed slow). -OPENSSL_INLINE int have_fast_rdrand(void) { +inline int have_fast_rdrand() { return CRYPTO_is_RDRAND_capable() && CRYPTO_is_intel_cpu(); } // CRYPTO_rdrand writes eight bytes of random data from the hardware RNG to // |out|. It returns one on success or zero on hardware failure. -int CRYPTO_rdrand(uint8_t out[8]); +extern "C" int CRYPTO_rdrand(uint8_t out[8]); // CRYPTO_rdrand_multiple8_buf fills |len| bytes at |buf| with random data from // the hardware RNG. The |len| argument must be a multiple of eight. It returns // one on success and zero on hardware failure. 
-int CRYPTO_rdrand_multiple8_buf(uint8_t *buf, size_t len); +extern "C" int CRYPTO_rdrand_multiple8_buf(uint8_t *buf, size_t len); #else // OPENSSL_X86_64 && !OPENSSL_NO_ASM -OPENSSL_INLINE int have_rdrand(void) { - return 0; -} +inline int have_rdrand() { return 0; } -OPENSSL_INLINE int have_fast_rdrand(void) { - return 0; -} +inline int have_fast_rdrand() { return 0; } #endif // OPENSSL_X86_64 && !OPENSSL_NO_ASM +BSSL_NAMESPACE_END -#if defined(__cplusplus) -} // extern C -#endif - -#endif // OPENSSL_HEADER_CRYPTO_RAND_INTERNAL_H +#endif // OPENSSL_HEADER_CRYPTO_FIPSMODULE_RAND_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/fipsmodule/rand/rand.c b/third_party/boringssl/src/crypto/fipsmodule/rand/rand.c deleted file mode 100644 index bf0f486c..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/rand/rand.c +++ /dev/null @@ -1,471 +0,0 @@ -/* Copyright (c) 2014, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ - -#include - -#include -#include -#include - -#if defined(BORINGSSL_FIPS) -#include -#endif - -#include -#include -#include - -#include "internal.h" -#include "fork_detect.h" -#include "../../internal.h" -#include "../delocate.h" - - -// It's assumed that the operating system always has an unfailing source of -// entropy which is accessed via |CRYPTO_sysrand[_for_seed]|. (If the operating -// system entropy source fails, it's up to |CRYPTO_sysrand| to abort the -// process—we don't try to handle it.) -// -// In addition, the hardware may provide a low-latency RNG. Intel's rdrand -// instruction is the canonical example of this. When a hardware RNG is -// available we don't need to worry about an RNG failure arising from fork()ing -// the process or moving a VM, so we can keep thread-local RNG state and use it -// as an additional-data input to CTR-DRBG. -// -// (We assume that the OS entropy is safe from fork()ing and VM duplication. -// This might be a bit of a leap of faith, esp on Windows, but there's nothing -// that we can do about it.) - -// kReseedInterval is the number of generate calls made to CTR-DRBG before -// reseeding. -static const unsigned kReseedInterval = 4096; - -// CRNGT_BLOCK_SIZE is the number of bytes in a “block” for the purposes of the -// continuous random number generator test in FIPS 140-2, section 4.9.2. -#define CRNGT_BLOCK_SIZE 16 - -// rand_thread_state contains the per-thread state for the RNG. -struct rand_thread_state { - CTR_DRBG_STATE drbg; - uint64_t fork_generation; - // calls is the number of generate calls made on |drbg| since it was last - // (re)seeded. This is bound by |kReseedInterval|. - unsigned calls; - // last_block_valid is non-zero iff |last_block| contains data from - // |get_seed_entropy|. - int last_block_valid; - -#if defined(BORINGSSL_FIPS) - // last_block contains the previous block from |get_seed_entropy|. 
- uint8_t last_block[CRNGT_BLOCK_SIZE]; - // next and prev form a NULL-terminated, double-linked list of all states in - // a process. - struct rand_thread_state *next, *prev; -#endif -}; - -#if defined(BORINGSSL_FIPS) -// thread_states_list is the head of a linked-list of all |rand_thread_state| -// objects in the process, one per thread. This is needed because FIPS requires -// that they be zeroed on process exit, but thread-local destructors aren't -// called when the whole process is exiting. -DEFINE_BSS_GET(struct rand_thread_state *, thread_states_list); -DEFINE_STATIC_MUTEX(thread_states_list_lock); -DEFINE_STATIC_MUTEX(state_clear_all_lock); - -static void rand_thread_state_clear_all(void) __attribute__((destructor)); -static void rand_thread_state_clear_all(void) { - CRYPTO_STATIC_MUTEX_lock_write(thread_states_list_lock_bss_get()); - CRYPTO_STATIC_MUTEX_lock_write(state_clear_all_lock_bss_get()); - for (struct rand_thread_state *cur = *thread_states_list_bss_get(); - cur != NULL; cur = cur->next) { - CTR_DRBG_clear(&cur->drbg); - } - // The locks are deliberately left locked so that any threads that are still - // running will hang if they try to call |RAND_bytes|. -} -#endif - -// rand_thread_state_free frees a |rand_thread_state|. This is called when a -// thread exits. 
-static void rand_thread_state_free(void *state_in) { - struct rand_thread_state *state = state_in; - - if (state_in == NULL) { - return; - } - -#if defined(BORINGSSL_FIPS) - CRYPTO_STATIC_MUTEX_lock_write(thread_states_list_lock_bss_get()); - - if (state->prev != NULL) { - state->prev->next = state->next; - } else { - *thread_states_list_bss_get() = state->next; - } - - if (state->next != NULL) { - state->next->prev = state->prev; - } - - CRYPTO_STATIC_MUTEX_unlock_write(thread_states_list_lock_bss_get()); - - CTR_DRBG_clear(&state->drbg); -#endif - - OPENSSL_free(state); -} - -#if defined(OPENSSL_X86_64) && !defined(OPENSSL_NO_ASM) && \ - !defined(BORINGSSL_UNSAFE_DETERMINISTIC_MODE) -// rdrand should only be called if either |have_rdrand| or |have_fast_rdrand| -// returned true. -static int rdrand(uint8_t *buf, const size_t len) { - const size_t len_multiple8 = len & ~7; - if (!CRYPTO_rdrand_multiple8_buf(buf, len_multiple8)) { - return 0; - } - const size_t remainder = len - len_multiple8; - - if (remainder != 0) { - assert(remainder < 8); - - uint8_t rand_buf[8]; - if (!CRYPTO_rdrand(rand_buf)) { - return 0; - } - OPENSSL_memcpy(buf + len_multiple8, rand_buf, remainder); - } - - return 1; -} - -#else - -static int rdrand(uint8_t *buf, size_t len) { - return 0; -} - -#endif - -#if defined(BORINGSSL_FIPS) - -void CRYPTO_get_seed_entropy(uint8_t *out_entropy, size_t out_entropy_len, - int *out_want_additional_input) { - *out_want_additional_input = 0; - if (have_rdrand() && rdrand(out_entropy, out_entropy_len)) { - *out_want_additional_input = 1; - } else { - CRYPTO_sysrand_for_seed(out_entropy, out_entropy_len); - } - - if (boringssl_fips_break_test("CRNG")) { - // This breaks the "continuous random number generator test" defined in FIPS - // 140-2, section 4.9.2, and implemented in |rand_get_seed|. 
- OPENSSL_memset(out_entropy, 0, out_entropy_len); - } -} - -// In passive entropy mode, entropy is supplied from outside of the module via -// |RAND_load_entropy| and is stored in global instance of the following -// structure. - -struct entropy_buffer { - // bytes contains entropy suitable for seeding a DRBG. - uint8_t - bytes[CRNGT_BLOCK_SIZE + CTR_DRBG_ENTROPY_LEN * BORINGSSL_FIPS_OVERREAD]; - // bytes_valid indicates the number of bytes of |bytes| that contain valid - // data. - size_t bytes_valid; - // want_additional_input is true if any of the contents of |bytes| were - // obtained via a method other than from the kernel. In these cases entropy - // from the kernel is also provided via an additional input to the DRBG. - int want_additional_input; -}; - -DEFINE_BSS_GET(struct entropy_buffer, entropy_buffer); -DEFINE_STATIC_MUTEX(entropy_buffer_lock); - -void RAND_load_entropy(const uint8_t *entropy, size_t entropy_len, - int want_additional_input) { - struct entropy_buffer *const buffer = entropy_buffer_bss_get(); - - CRYPTO_STATIC_MUTEX_lock_write(entropy_buffer_lock_bss_get()); - const size_t space = sizeof(buffer->bytes) - buffer->bytes_valid; - if (entropy_len > space) { - entropy_len = space; - } - - OPENSSL_memcpy(&buffer->bytes[buffer->bytes_valid], entropy, entropy_len); - buffer->bytes_valid += entropy_len; - buffer->want_additional_input |= - want_additional_input && (entropy_len != 0); - CRYPTO_STATIC_MUTEX_unlock_write(entropy_buffer_lock_bss_get()); -} - -// get_seed_entropy fills |out_entropy_len| bytes of |out_entropy| from the -// global |entropy_buffer|. 
-static void get_seed_entropy(uint8_t *out_entropy, size_t out_entropy_len, - int *out_want_additional_input) { - struct entropy_buffer *const buffer = entropy_buffer_bss_get(); - if (out_entropy_len > sizeof(buffer->bytes)) { - abort(); - } - - CRYPTO_STATIC_MUTEX_lock_write(entropy_buffer_lock_bss_get()); - while (buffer->bytes_valid < out_entropy_len) { - CRYPTO_STATIC_MUTEX_unlock_write(entropy_buffer_lock_bss_get()); - RAND_need_entropy(out_entropy_len - buffer->bytes_valid); - CRYPTO_STATIC_MUTEX_lock_write(entropy_buffer_lock_bss_get()); - } - - *out_want_additional_input = buffer->want_additional_input; - OPENSSL_memcpy(out_entropy, buffer->bytes, out_entropy_len); - OPENSSL_memmove(buffer->bytes, &buffer->bytes[out_entropy_len], - buffer->bytes_valid - out_entropy_len); - buffer->bytes_valid -= out_entropy_len; - if (buffer->bytes_valid == 0) { - buffer->want_additional_input = 0; - } - - CRYPTO_STATIC_MUTEX_unlock_write(entropy_buffer_lock_bss_get()); -} - -// rand_get_seed fills |seed| with entropy and sets -// |*out_want_additional_input| to one if that entropy came directly from the -// CPU and zero otherwise. -static void rand_get_seed(struct rand_thread_state *state, - uint8_t seed[CTR_DRBG_ENTROPY_LEN], - int *out_want_additional_input) { - uint8_t entropy_bytes[sizeof(state->last_block) + - CTR_DRBG_ENTROPY_LEN * BORINGSSL_FIPS_OVERREAD]; - uint8_t *entropy = entropy_bytes; - size_t entropy_len = sizeof(entropy_bytes); - - if (state->last_block_valid) { - // No need to fill |state->last_block| with entropy from the read. - entropy += sizeof(state->last_block); - entropy_len -= sizeof(state->last_block); - } - - get_seed_entropy(entropy, entropy_len, out_want_additional_input); - - if (!state->last_block_valid) { - OPENSSL_memcpy(state->last_block, entropy, sizeof(state->last_block)); - entropy += sizeof(state->last_block); - entropy_len -= sizeof(state->last_block); - } - - // See FIPS 140-2, section 4.9.2. 
This is the “continuous random number - // generator test” which causes the program to randomly abort. Hopefully the - // rate of failure is small enough not to be a problem in practice. - if (CRYPTO_memcmp(state->last_block, entropy, sizeof(state->last_block)) == - 0) { - fprintf(stderr, "CRNGT failed.\n"); - BORINGSSL_FIPS_abort(); - } - - assert(entropy_len % CRNGT_BLOCK_SIZE == 0); - for (size_t i = CRNGT_BLOCK_SIZE; i < entropy_len; i += CRNGT_BLOCK_SIZE) { - if (CRYPTO_memcmp(entropy + i - CRNGT_BLOCK_SIZE, entropy + i, - CRNGT_BLOCK_SIZE) == 0) { - fprintf(stderr, "CRNGT failed.\n"); - BORINGSSL_FIPS_abort(); - } - } - OPENSSL_memcpy(state->last_block, entropy + entropy_len - CRNGT_BLOCK_SIZE, - CRNGT_BLOCK_SIZE); - - assert(entropy_len == BORINGSSL_FIPS_OVERREAD * CTR_DRBG_ENTROPY_LEN); - OPENSSL_memcpy(seed, entropy, CTR_DRBG_ENTROPY_LEN); - - for (size_t i = 1; i < BORINGSSL_FIPS_OVERREAD; i++) { - for (size_t j = 0; j < CTR_DRBG_ENTROPY_LEN; j++) { - seed[j] ^= entropy[CTR_DRBG_ENTROPY_LEN * i + j]; - } - } -} - -#else - -// rand_get_seed fills |seed| with entropy and sets -// |*out_want_additional_input| to one if that entropy came directly from the -// CPU and zero otherwise. -static void rand_get_seed(struct rand_thread_state *state, - uint8_t seed[CTR_DRBG_ENTROPY_LEN], - int *out_want_additional_input) { - // If not in FIPS mode, we don't overread from the system entropy source and - // we don't depend only on the hardware RDRAND. - CRYPTO_sysrand_for_seed(seed, CTR_DRBG_ENTROPY_LEN); - *out_want_additional_input = 0; -} - -#endif - -void RAND_bytes_with_additional_data(uint8_t *out, size_t out_len, - const uint8_t user_additional_data[32]) { - if (out_len == 0) { - return; - } - - const uint64_t fork_generation = CRYPTO_get_fork_generation(); - - // Additional data is mixed into every CTR-DRBG call to protect, as best we - // can, against forks & VM clones. 
We do not over-read this information and - // don't reseed with it so, from the point of view of FIPS, this doesn't - // provide “prediction resistance”. But, in practice, it does. - uint8_t additional_data[32]; - // Intel chips have fast RDRAND instructions while, in other cases, RDRAND can - // be _slower_ than a system call. - if (!have_fast_rdrand() || - !rdrand(additional_data, sizeof(additional_data))) { - // Without a hardware RNG to save us from address-space duplication, the OS - // entropy is used. This can be expensive (one read per |RAND_bytes| call) - // and so is disabled when we have fork detection, or if the application has - // promised not to fork. - if (fork_generation != 0 || rand_fork_unsafe_buffering_enabled()) { - OPENSSL_memset(additional_data, 0, sizeof(additional_data)); - } else if (!have_rdrand()) { - // No alternative so block for OS entropy. - CRYPTO_sysrand(additional_data, sizeof(additional_data)); - } else if (!CRYPTO_sysrand_if_available(additional_data, - sizeof(additional_data)) && - !rdrand(additional_data, sizeof(additional_data))) { - // RDRAND failed: block for OS entropy. - CRYPTO_sysrand(additional_data, sizeof(additional_data)); - } - } - - for (size_t i = 0; i < sizeof(additional_data); i++) { - additional_data[i] ^= user_additional_data[i]; - } - - struct rand_thread_state stack_state; - struct rand_thread_state *state = - CRYPTO_get_thread_local(OPENSSL_THREAD_LOCAL_RAND); - - if (state == NULL) { - state = OPENSSL_malloc(sizeof(struct rand_thread_state)); - if (state == NULL || - !CRYPTO_set_thread_local(OPENSSL_THREAD_LOCAL_RAND, state, - rand_thread_state_free)) { - // If the system is out of memory, use an ephemeral state on the - // stack. 
- state = &stack_state; - } - - state->last_block_valid = 0; - uint8_t seed[CTR_DRBG_ENTROPY_LEN]; - int want_additional_input; - rand_get_seed(state, seed, &want_additional_input); - - uint8_t personalization[CTR_DRBG_ENTROPY_LEN] = {0}; - size_t personalization_len = 0; -#if defined(OPENSSL_URANDOM) - // If we used something other than system entropy then also - // opportunistically read from the system. This avoids solely relying on the - // hardware once the entropy pool has been initialized. - if (want_additional_input && - CRYPTO_sysrand_if_available(personalization, sizeof(personalization))) { - personalization_len = sizeof(personalization); - } -#endif - - if (!CTR_DRBG_init(&state->drbg, seed, personalization, - personalization_len)) { - abort(); - } - state->calls = 0; - state->fork_generation = fork_generation; - -#if defined(BORINGSSL_FIPS) - if (state != &stack_state) { - CRYPTO_STATIC_MUTEX_lock_write(thread_states_list_lock_bss_get()); - struct rand_thread_state **states_list = thread_states_list_bss_get(); - state->next = *states_list; - if (state->next != NULL) { - state->next->prev = state; - } - state->prev = NULL; - *states_list = state; - CRYPTO_STATIC_MUTEX_unlock_write(thread_states_list_lock_bss_get()); - } -#endif - } - - if (state->calls >= kReseedInterval || - state->fork_generation != fork_generation) { - uint8_t seed[CTR_DRBG_ENTROPY_LEN]; - int want_additional_input; - rand_get_seed(state, seed, &want_additional_input); -#if defined(BORINGSSL_FIPS) - // Take a read lock around accesses to |state->drbg|. This is needed to - // avoid returning bad entropy if we race with - // |rand_thread_state_clear_all|. - // - // This lock must be taken after any calls to |CRYPTO_sysrand| to avoid a - // bug on ppc64le. glibc may implement pthread locks by wrapping user code - // in a hardware transaction, but, on some older versions of glibc and the - // kernel, syscalls made with |syscall| did not abort the transaction. 
- CRYPTO_STATIC_MUTEX_lock_read(state_clear_all_lock_bss_get()); -#endif - if (!CTR_DRBG_reseed(&state->drbg, seed, NULL, 0)) { - abort(); - } - state->calls = 0; - state->fork_generation = fork_generation; - } else { -#if defined(BORINGSSL_FIPS) - CRYPTO_STATIC_MUTEX_lock_read(state_clear_all_lock_bss_get()); -#endif - } - - int first_call = 1; - while (out_len > 0) { - size_t todo = out_len; - if (todo > CTR_DRBG_MAX_GENERATE_LENGTH) { - todo = CTR_DRBG_MAX_GENERATE_LENGTH; - } - - if (!CTR_DRBG_generate(&state->drbg, out, todo, additional_data, - first_call ? sizeof(additional_data) : 0)) { - abort(); - } - - out += todo; - out_len -= todo; - // Though we only check before entering the loop, this cannot add enough to - // overflow a |size_t|. - state->calls++; - first_call = 0; - } - - if (state == &stack_state) { - CTR_DRBG_clear(&state->drbg); - } - -#if defined(BORINGSSL_FIPS) - CRYPTO_STATIC_MUTEX_unlock_read(state_clear_all_lock_bss_get()); -#endif -} - -int RAND_bytes(uint8_t *out, size_t out_len) { - static const uint8_t kZeroAdditionalData[32] = {0}; - RAND_bytes_with_additional_data(out, out_len, kZeroAdditionalData); - return 1; -} - -int RAND_pseudo_bytes(uint8_t *buf, size_t len) { - return RAND_bytes(buf, len); -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/rand/rand.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/rand/rand.cc.inc new file mode 100644 index 00000000..9b294416 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/rand/rand.cc.inc @@ -0,0 +1,482 @@ +// Copyright 2014 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#if defined(BORINGSSL_FIPS) +#include +#endif + +#include +#include +#include +#include + +#include "../../bcm_support.h" +#include "../../mem_internal.h" +#include "../bcm_interface.h" +#include "../delocate.h" +#include "internal.h" + + +using namespace bssl; + +// It's assumed that the operating system always has an unfailing source of +// entropy which is accessed via |CRYPTO_sysrand|. (If the operating system +// entropy source fails, it's up to |CRYPTO_sysrand| to abort the process—we +// don't try to handle it.) +// +// In addition, the hardware may provide a low-latency RNG. Intel's rdrand +// instruction is the canonical example of this. When a hardware RNG is +// available we don't need to worry about an RNG failure arising from fork()ing +// the process or moving a VM, so we can keep thread-local RNG state and use it +// as an additional-data input to CTR-DRBG. +// +// (We assume that the OS entropy is safe from fork()ing and VM duplication. +// This might be a bit of a leap of faith, esp on Windows, but there's nothing +// that we can do about it.) + +// kReseedInterval is the number of generate calls made to CTR-DRBG before +// reseeding. +static const unsigned kReseedInterval = 4096; + +// CRNGT_BLOCK_SIZE is the number of bytes in a “block” for the purposes of the +// continuous random number generator test in FIPS 140-2, section 4.9.2. +#define CRNGT_BLOCK_SIZE 16 + +namespace { +// rand_thread_state contains the per-thread state for the RNG. 
+struct rand_thread_state { + CTR_DRBG_STATE drbg; + uint64_t fork_generation; + // calls is the number of generate calls made on |drbg| since it was last + // (re)seeded. This is bound by |kReseedInterval|. + unsigned calls; + // last_block_valid is non-zero iff |last_block| contains data from + // |get_seed_entropy|. + int last_block_valid; + // fork_unsafe_buffering is non-zero iff, when |drbg| was last (re)seeded, + // fork-unsafe buffering was enabled. + int fork_unsafe_buffering; + +#if defined(BORINGSSL_FIPS) + // last_block contains the previous block from |get_seed_entropy|. + uint8_t last_block[CRNGT_BLOCK_SIZE]; + // next and prev form a nullptr-terminated, double-linked list of all states + // in a process. + struct rand_thread_state *next, *prev; + // clear_drbg_lock synchronizes between uses of |drbg| and + // |rand_thread_state_clear_all| clearing it. This lock should be uncontended + // in the common case, except on shutdown. + Mutex clear_drbg_lock; +#endif +}; +} // namespace + +#if defined(BORINGSSL_FIPS) +// thread_states_list is the head of a linked-list of all |rand_thread_state| +// objects in the process, one per thread. This is needed because FIPS requires +// that they be zeroed on process exit, but thread-local destructors aren't +// called when the whole process is exiting. +DEFINE_BSS_GET(struct rand_thread_state *, thread_states_list, = nullptr) +DEFINE_STATIC_MUTEX(thread_states_list_lock) + +static void rand_thread_state_clear_all() __attribute__((destructor)); +static void rand_thread_state_clear_all() { + thread_states_list_lock_bss_get()->LockWrite(); + for (struct rand_thread_state *cur = *thread_states_list_bss_get(); + cur != nullptr; cur = cur->next) { + cur->clear_drbg_lock.LockWrite(); + CTR_DRBG_clear(&cur->drbg); + } + // The locks are deliberately left locked so that any threads that are still + // running will hang if they try to call |BCM_rand_bytes|. 
It also ensures + // |rand_thread_state_free| cannot free any thread state while we've taken the + // lock. +} +#endif + +// rand_thread_state_free frees a |rand_thread_state|. This is called when a +// thread exits. +static void rand_thread_state_free(void *state_in) { + struct rand_thread_state *state = + reinterpret_cast(state_in); + + if (state_in == nullptr) { + return; + } + +#if defined(BORINGSSL_FIPS) + thread_states_list_lock_bss_get()->LockWrite(); + + if (state->prev != nullptr) { + state->prev->next = state->next; + } else if (*thread_states_list_bss_get() == state) { + // |state->prev| may be nullptr either if it is the head of the list, + // or if |state| is freed before it was added to the list at all. + // Compare against the head of the list to distinguish these cases. + *thread_states_list_bss_get() = state->next; + } + + if (state->next != nullptr) { + state->next->prev = state->prev; + } + + thread_states_list_lock_bss_get()->UnlockWrite(); + + CTR_DRBG_clear(&state->drbg); +#endif + + Delete(state); +} + +#if defined(OPENSSL_X86_64) && !defined(OPENSSL_NO_ASM) && \ + !defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) +// rdrand should only be called if either |have_rdrand| or |have_fast_rdrand| +// returned true. 
+static int rdrand(uint8_t *buf, const size_t len) { + const size_t len_multiple8 = len & ~7; + if (!CRYPTO_rdrand_multiple8_buf(buf, len_multiple8)) { + return 0; + } + const size_t remainder = len - len_multiple8; + + if (remainder != 0) { + assert(remainder < 8); + + uint8_t rand_buf[8]; + if (!CRYPTO_rdrand(rand_buf)) { + return 0; + } + OPENSSL_memcpy(buf + len_multiple8, rand_buf, remainder); + } + + return 1; +} + +#else + +static int rdrand(uint8_t *buf, size_t len) { return 0; } + +#endif + +bcm_status bssl::BCM_rand_bytes_hwrng(uint8_t *buf, const size_t len) { + if (!have_rdrand()) { + return bcm_status::failure; + } + if (rdrand(buf, len)) { + return bcm_status::not_approved; + } + return bcm_status::failure; +} + +#if defined(BORINGSSL_FIPS) + +// In passive entropy mode, entropy is supplied from outside of the module via +// |BCM_rand_load_entropy| and is stored in global instance of the following +// structure. + +struct entropy_buffer { + // bytes contains entropy suitable for seeding a DRBG. + uint8_t bytes[CRNGT_BLOCK_SIZE + CTR_DRBG_SEED_LEN * BORINGSSL_FIPS_OVERREAD]; + // bytes_valid indicates the number of bytes of |bytes| that contain valid + // data. + size_t bytes_valid; + // want_additional_input is true if any of the contents of |bytes| were + // obtained via a method other than from the kernel. In these cases entropy + // from the kernel is also provided via an additional input to the DRBG. 
+ int want_additional_input; +}; + +DEFINE_BSS_GET(struct entropy_buffer, entropy_buffer, = {}) +DEFINE_STATIC_MUTEX(entropy_buffer_lock) + +bcm_infallible bssl::BCM_rand_load_entropy(const uint8_t *entropy, + size_t entropy_len, + int want_additional_input) { + struct entropy_buffer *const buffer = entropy_buffer_bss_get(); + + MutexWriteLock lock(entropy_buffer_lock_bss_get()); + const size_t space = sizeof(buffer->bytes) - buffer->bytes_valid; + if (entropy_len > space) { + entropy_len = space; + } + + OPENSSL_memcpy(&buffer->bytes[buffer->bytes_valid], entropy, entropy_len); + buffer->bytes_valid += entropy_len; + buffer->want_additional_input |= want_additional_input && (entropy_len != 0); + return bcm_infallible::not_approved; +} + +// get_seed_entropy fills |out_entropy_len| bytes of |out_entropy| from the +// global |entropy_buffer|. +static void get_seed_entropy(uint8_t *out_entropy, size_t out_entropy_len, + int *out_want_additional_input) { + struct entropy_buffer *const buffer = entropy_buffer_bss_get(); + if (out_entropy_len > sizeof(buffer->bytes)) { + abort(); + } + + MutexWriteLock lock(entropy_buffer_lock_bss_get()); + while (buffer->bytes_valid < out_entropy_len) { + MutexWriteUnlock unlock(entropy_buffer_lock_bss_get()); + RAND_need_entropy(out_entropy_len - buffer->bytes_valid); + } + + *out_want_additional_input = buffer->want_additional_input; + OPENSSL_memcpy(out_entropy, buffer->bytes, out_entropy_len); + OPENSSL_memmove(buffer->bytes, &buffer->bytes[out_entropy_len], + buffer->bytes_valid - out_entropy_len); + buffer->bytes_valid -= out_entropy_len; + if (buffer->bytes_valid == 0) { + buffer->want_additional_input = 0; + } +} + +// rand_get_seed fills |seed| with entropy. In some cases, it will additionally +// fill |additional_input| with entropy to supplement |seed|. It sets +// |*out_additional_input_len| to the number of extra bytes. 
+static void rand_get_seed(struct rand_thread_state *state, + uint8_t seed[CTR_DRBG_SEED_LEN], + uint8_t additional_input[CTR_DRBG_SEED_LEN], + size_t *out_additional_input_len) { + uint8_t entropy_bytes[sizeof(state->last_block) + + CTR_DRBG_SEED_LEN * BORINGSSL_FIPS_OVERREAD]; + uint8_t *entropy = entropy_bytes; + size_t entropy_len = sizeof(entropy_bytes); + + if (state->last_block_valid) { + // No need to fill |state->last_block| with entropy from the read. + entropy += sizeof(state->last_block); + entropy_len -= sizeof(state->last_block); + } + + int want_additional_input; + get_seed_entropy(entropy, entropy_len, &want_additional_input); + + if (!state->last_block_valid) { + OPENSSL_memcpy(state->last_block, entropy, sizeof(state->last_block)); + entropy += sizeof(state->last_block); + entropy_len -= sizeof(state->last_block); + } + + // See FIPS 140-2, section 4.9.2. This is the “continuous random number + // generator test” which causes the program to randomly abort. Hopefully the + // rate of failure is small enough not to be a problem in practice. 
+ if (CRYPTO_memcmp(state->last_block, entropy, sizeof(state->last_block)) == + 0) { + fprintf(CRYPTO_get_stderr(), "CRNGT failed.\n"); + BORINGSSL_FIPS_abort(); + } + + assert(entropy_len % CRNGT_BLOCK_SIZE == 0); + for (size_t i = CRNGT_BLOCK_SIZE; i < entropy_len; i += CRNGT_BLOCK_SIZE) { + if (CRYPTO_memcmp(entropy + i - CRNGT_BLOCK_SIZE, entropy + i, + CRNGT_BLOCK_SIZE) == 0) { + fprintf(CRYPTO_get_stderr(), "CRNGT failed.\n"); + BORINGSSL_FIPS_abort(); + } + } + OPENSSL_memcpy(state->last_block, entropy + entropy_len - CRNGT_BLOCK_SIZE, + CRNGT_BLOCK_SIZE); + + assert(entropy_len == BORINGSSL_FIPS_OVERREAD * CTR_DRBG_SEED_LEN); + OPENSSL_memcpy(seed, entropy, CTR_DRBG_SEED_LEN); + + for (size_t i = 1; i < BORINGSSL_FIPS_OVERREAD; i++) { + for (size_t j = 0; j < CTR_DRBG_SEED_LEN; j++) { + seed[j] ^= entropy[CTR_DRBG_SEED_LEN * i + j]; + } + } + + // If we used something other than system entropy then also read from the + // system. This avoids solely relying on the hardware. + // TODO(crbug.com/446280903): Once this change sticks, switch + // |get_seed_entropy| to draw from the OS instead of RDRAND. + *out_additional_input_len = 0; + if (want_additional_input) { + CRYPTO_sysrand(additional_input, CTR_DRBG_SEED_LEN); + *out_additional_input_len = CTR_DRBG_SEED_LEN; + } +} + +#else + +// rand_get_seed fills |seed| with entropy. In some cases, it will additionally +// fill |additional_input| with entropy to supplement |seed|. It sets +// |*out_additional_input_len| to the number of extra bytes. +static void rand_get_seed(struct rand_thread_state *state, + uint8_t seed[CTR_DRBG_SEED_LEN], + uint8_t additional_input[CTR_DRBG_SEED_LEN], + size_t *out_additional_input_len) { + // If not in FIPS mode, we don't overread from the system entropy source and + // we don't depend only on the hardware RDRAND. 
+ CRYPTO_sysrand(seed, CTR_DRBG_SEED_LEN); + *out_additional_input_len = 0; +} + +#endif + +bcm_infallible bssl::BCM_rand_bytes_with_additional_data( + uint8_t *out, size_t out_len, const uint8_t user_additional_data[32]) { + if (out_len == 0) { + return bcm_infallible::approved; + } + + const uint64_t fork_generation = CRYPTO_get_fork_generation(); + const int fork_unsafe_buffering = rand_fork_unsafe_buffering_enabled(); + + // Additional data is mixed into every CTR-DRBG call to protect, as best we + // can, against forks & VM clones. We do not over-read this information and + // don't reseed with it so, from the point of view of FIPS, this doesn't + // provide “prediction resistance”. But, in practice, it does. + uint8_t additional_data[32]; + // Intel chips have fast RDRAND instructions while, in other cases, RDRAND can + // be _slower_ than a system call. + if (!have_fast_rdrand() || + !rdrand(additional_data, sizeof(additional_data))) { + // Without a hardware RNG to save us from address-space duplication, the OS + // entropy is used. This can be expensive (one read per |RAND_bytes| call) + // and so is disabled when we have fork detection, or if the application has + // promised not to fork. + if (fork_generation != 0 || fork_unsafe_buffering) { + OPENSSL_memset(additional_data, 0, sizeof(additional_data)); + } else { + CRYPTO_sysrand(additional_data, sizeof(additional_data)); + } + } + + for (size_t i = 0; i < sizeof(additional_data); i++) { + additional_data[i] ^= user_additional_data[i]; + } + + struct rand_thread_state stack_state; + struct rand_thread_state *state = reinterpret_cast( + CRYPTO_get_thread_local(OPENSSL_THREAD_LOCAL_RAND)); + + if (state == nullptr) { + state = New(); + if (state == nullptr || + !CRYPTO_set_thread_local(OPENSSL_THREAD_LOCAL_RAND, state, + rand_thread_state_free)) { + // If the system is out of memory, use an ephemeral state on the + // stack. 
+ state = &stack_state; + } + + state->last_block_valid = 0; + uint8_t seed[CTR_DRBG_SEED_LEN]; + uint8_t personalization[CTR_DRBG_SEED_LEN] = {0}; + size_t personalization_len = 0; + rand_get_seed(state, seed, personalization, &personalization_len); + + if (!CTR_DRBG_init(&state->drbg, /*df=*/true, seed, 32u, seed + 32, + personalization, personalization_len)) { + abort(); + } + state->calls = 0; + state->fork_generation = fork_generation; + state->fork_unsafe_buffering = fork_unsafe_buffering; + +#if defined(BORINGSSL_FIPS) + if (state != &stack_state) { + MutexWriteLock lock(thread_states_list_lock_bss_get()); + struct rand_thread_state **states_list = thread_states_list_bss_get(); + state->next = *states_list; + if (state->next != nullptr) { + state->next->prev = state; + } + state->prev = nullptr; + *states_list = state; + } +#endif + } + + if (state->calls >= kReseedInterval || + // If we've forked since |state| was last seeded, reseed. + state->fork_generation != fork_generation || + // If |state| was seeded from a state with different fork-safety + // preferences, reseed. Suppose |state| was fork-safe, then forked into + // two children, but each of the children never fork and disable fork + // safety. The children must reseed to avoid working from the same PRNG + // state. + state->fork_unsafe_buffering != fork_unsafe_buffering) { + uint8_t seed[CTR_DRBG_SEED_LEN]; + uint8_t reseed_additional_data[CTR_DRBG_SEED_LEN] = {0}; + size_t reseed_additional_data_len = 0; + rand_get_seed(state, seed, reseed_additional_data, + &reseed_additional_data_len); +#if defined(BORINGSSL_FIPS) + // Take a read lock around accesses to |state->drbg|. This is needed to + // avoid returning bad entropy if we race with + // |rand_thread_state_clear_all|. 
+ state->clear_drbg_lock.LockRead(); +#endif + if (!CTR_DRBG_reseed_ex(&state->drbg, seed, sizeof(seed), + reseed_additional_data, + reseed_additional_data_len)) { + abort(); + } + state->calls = 0; + state->fork_generation = fork_generation; + state->fork_unsafe_buffering = fork_unsafe_buffering; + } else { +#if defined(BORINGSSL_FIPS) + state->clear_drbg_lock.LockRead(); +#endif + } + + int first_call = 1; + while (out_len > 0) { + size_t todo = out_len; + if (todo > CTR_DRBG_MAX_GENERATE_LENGTH) { + todo = CTR_DRBG_MAX_GENERATE_LENGTH; + } + + if (!CTR_DRBG_generate(&state->drbg, out, todo, additional_data, + first_call ? sizeof(additional_data) : 0)) { + abort(); + } + + out += todo; + out_len -= todo; + // Though we only check before entering the loop, this cannot add enough to + // overflow a |size_t|. + state->calls++; + first_call = 0; + } + + if (state == &stack_state) { + CTR_DRBG_clear(&state->drbg); + } + +#if defined(BORINGSSL_FIPS) + state->clear_drbg_lock.UnlockRead(); +#endif + return bcm_infallible::approved; +} + +bcm_infallible bssl::BCM_rand_bytes(uint8_t *out, size_t out_len) { + static const uint8_t kZeroAdditionalData[32] = {0}; + BCM_rand_bytes_with_additional_data(out, out_len, kZeroAdditionalData); + return bcm_infallible::approved; +} + +int RAND_maybe_reseed() { + // Currently does nothing since we don't use jitter entropy yet and so the + // reseeding is quick. + return 0; +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/rand/urandom.c b/third_party/boringssl/src/crypto/fipsmodule/rand/urandom.c deleted file mode 100644 index 508d4412..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/rand/urandom.c +++ /dev/null @@ -1,401 +0,0 @@ -/* Copyright (c) 2014, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. 
- * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#if !defined(_GNU_SOURCE) -#define _GNU_SOURCE // needed for syscall() on Linux. -#endif - -#include - -#include "internal.h" - -#if defined(OPENSSL_URANDOM) - -#include -#include -#include -#include -#include -#include - -#if defined(OPENSSL_LINUX) -#if defined(BORINGSSL_FIPS) -#include -#include -#endif -#include - -#if defined(OPENSSL_ANDROID) -#include -#endif - -#if !defined(OPENSSL_ANDROID) -#define OPENSSL_HAS_GETAUXVAL -#endif -// glibc prior to 2.16 does not have getauxval and sys/auxv.h. Android has some -// host builds (i.e. not building for Android itself, so |OPENSSL_ANDROID| is -// unset) which are still using a 2.15 sysroot. -// -// TODO(davidben): Remove this once Android updates their sysroot. -#if defined(__GLIBC_PREREQ) -#if !__GLIBC_PREREQ(2, 16) -#undef OPENSSL_HAS_GETAUXVAL -#endif -#endif -#if defined(OPENSSL_HAS_GETAUXVAL) -#include -#endif -#endif // OPENSSL_LINUX - -#if defined(OPENSSL_MACOS) -#include -#endif - -#if defined(OPENSSL_FREEBSD) -#define URANDOM_BLOCKS_FOR_ENTROPY -#if __FreeBSD__ >= 12 -// getrandom is supported in FreeBSD 12 and up. 
-#define FREEBSD_GETRANDOM -#include -#endif -#endif - -#include -#include - -#include "getrandom_fillin.h" -#include "../delocate.h" -#include "../../internal.h" - - -#if defined(USE_NR_getrandom) - -#if defined(OPENSSL_MSAN) -void __msan_unpoison(void *, size_t); -#endif - -static ssize_t boringssl_getrandom(void *buf, size_t buf_len, unsigned flags) { - ssize_t ret; - do { - ret = syscall(__NR_getrandom, buf, buf_len, flags); - } while (ret == -1 && errno == EINTR); - -#if defined(OPENSSL_MSAN) - if (ret > 0) { - // MSAN doesn't recognise |syscall| and thus doesn't notice that we have - // initialised the output buffer. - __msan_unpoison(buf, ret); - } -#endif // OPENSSL_MSAN - - return ret; -} - -#endif // USE_NR_getrandom - -// kHaveGetrandom in |urandom_fd| signals that |getrandom| or |getentropy| is -// available and should be used instead. -static const int kHaveGetrandom = -3; - -// urandom_fd is a file descriptor to /dev/urandom. It's protected by |once|. -DEFINE_BSS_GET(int, urandom_fd) - -#if defined(USE_NR_getrandom) - -// getrandom_ready is one if |getrandom| had been initialized by the time -// |init_once| was called and zero otherwise. -DEFINE_BSS_GET(int, getrandom_ready) - -// extra_getrandom_flags_for_seed contains a value that is ORed into the flags -// for getrandom() when reading entropy for a seed. -DEFINE_BSS_GET(int, extra_getrandom_flags_for_seed) - -// On Android, check a system property to decide whether to set -// |extra_getrandom_flags_for_seed| otherwise they will default to zero. If -// ro.oem_boringcrypto_hwrand is true then |extra_getrandom_flags_for_seed| will -// be set to GRND_RANDOM, causing all random data to be drawn from the same -// source as /dev/random. 
-static void maybe_set_extra_getrandom_flags(void) { -#if defined(BORINGSSL_FIPS) && defined(OPENSSL_ANDROID) - char value[PROP_VALUE_MAX + 1]; - int length = __system_property_get("ro.boringcrypto.hwrand", value); - if (length < 0 || length > PROP_VALUE_MAX) { - return; - } - - value[length] = 0; - if (OPENSSL_strcasecmp(value, "true") == 0) { - *extra_getrandom_flags_for_seed_bss_get() = GRND_RANDOM; - } -#endif -} - -#endif // USE_NR_getrandom - -DEFINE_STATIC_ONCE(rand_once) - -// init_once initializes the state of this module to values previously -// requested. This is the only function that modifies |urandom_fd|, which may be -// read safely after calling the once. -static void init_once(void) { -#if defined(USE_NR_getrandom) - int have_getrandom; - uint8_t dummy; - ssize_t getrandom_ret = - boringssl_getrandom(&dummy, sizeof(dummy), GRND_NONBLOCK); - if (getrandom_ret == 1) { - *getrandom_ready_bss_get() = 1; - have_getrandom = 1; - } else if (getrandom_ret == -1 && errno == EAGAIN) { - // We have getrandom, but the entropy pool has not been initialized yet. - have_getrandom = 1; - } else if (getrandom_ret == -1 && errno == ENOSYS) { - // Fallthrough to using /dev/urandom, below. - have_getrandom = 0; - } else { - // Other errors are fatal. - perror("getrandom"); - abort(); - } - - if (have_getrandom) { - *urandom_fd_bss_get() = kHaveGetrandom; - maybe_set_extra_getrandom_flags(); - return; - } -#endif // USE_NR_getrandom - -#if defined(OPENSSL_MACOS) - // getentropy is available in macOS 10.12 and up. iOS 10 and up may also - // support it, but the header is missing. See https://crbug.com/boringssl/287. - if (__builtin_available(macos 10.12, *)) { - *urandom_fd_bss_get() = kHaveGetrandom; - return; - } -#endif - -#if defined(FREEBSD_GETRANDOM) - *urandom_fd_bss_get() = kHaveGetrandom; - return; -#endif - - // Android FIPS builds must support getrandom. 
-#if defined(BORINGSSL_FIPS) && defined(OPENSSL_ANDROID) - perror("getrandom not found"); - abort(); -#endif - - int fd; - do { - fd = open("/dev/urandom", O_RDONLY); - } while (fd == -1 && errno == EINTR); - - if (fd < 0) { - perror("failed to open /dev/urandom"); - abort(); - } - - int flags = fcntl(fd, F_GETFD); - if (flags == -1) { - // Native Client doesn't implement |fcntl|. - if (errno != ENOSYS) { - perror("failed to get flags from urandom fd"); - abort(); - } - } else { - flags |= FD_CLOEXEC; - if (fcntl(fd, F_SETFD, flags) == -1) { - perror("failed to set FD_CLOEXEC on urandom fd"); - abort(); - } - } - *urandom_fd_bss_get() = fd; -} - -DEFINE_STATIC_ONCE(wait_for_entropy_once) - -static void wait_for_entropy(void) { - int fd = *urandom_fd_bss_get(); - if (fd == kHaveGetrandom) { - // |getrandom| and |getentropy| support blocking in |fill_with_entropy| - // directly. For |getrandom|, we first probe with a non-blocking call to aid - // debugging. -#if defined(USE_NR_getrandom) - if (*getrandom_ready_bss_get()) { - // The entropy pool was already initialized in |init_once|. - return; - } - - uint8_t dummy; - ssize_t getrandom_ret = - boringssl_getrandom(&dummy, sizeof(dummy), GRND_NONBLOCK); - if (getrandom_ret == -1 && errno == EAGAIN) { - // Attempt to get the path of the current process to aid in debugging when - // something blocks. - const char *current_process = ""; -#if defined(OPENSSL_HAS_GETAUXVAL) - const unsigned long getauxval_ret = getauxval(AT_EXECFN); - if (getauxval_ret != 0) { - current_process = (const char *)getauxval_ret; - } -#endif - - fprintf( - stderr, - "%s: getrandom indicates that the entropy pool has not been " - "initialized. 
Rather than continue with poor entropy, this process " - "will block until entropy is available.\n", - current_process); - - getrandom_ret = - boringssl_getrandom(&dummy, sizeof(dummy), 0 /* no flags */); - } - - if (getrandom_ret != 1) { - perror("getrandom"); - abort(); - } -#endif // USE_NR_getrandom - return; - } - -#if defined(BORINGSSL_FIPS) && !defined(URANDOM_BLOCKS_FOR_ENTROPY) - // In FIPS mode on platforms where urandom doesn't block at startup, we ensure - // that the kernel has sufficient entropy before continuing. This is - // automatically handled by getrandom, which requires that the entropy pool - // has been initialised, but for urandom we have to poll. - for (;;) { - int entropy_bits; - if (ioctl(fd, RNDGETENTCNT, &entropy_bits)) { - fprintf(stderr, - "RNDGETENTCNT on /dev/urandom failed. We cannot continue in this " - "case when in FIPS mode.\n"); - abort(); - } - - static const int kBitsNeeded = 256; - if (entropy_bits >= kBitsNeeded) { - break; - } - - usleep(250000); - } -#endif // BORINGSSL_FIPS && !URANDOM_BLOCKS_FOR_ENTROPY -} - -// fill_with_entropy writes |len| bytes of entropy into |out|. It returns one -// on success and zero on error. If |block| is one, this function will block -// until the entropy pool is initialized. Otherwise, this function may fail, -// setting |errno| to |EAGAIN| if the entropy pool has not yet been initialized. -// If |seed| is one, this function will OR in the value of -// |*extra_getrandom_flags_for_seed()| when using |getrandom|. 
-static int fill_with_entropy(uint8_t *out, size_t len, int block, int seed) { - if (len == 0) { - return 1; - } - -#if defined(USE_NR_getrandom) || defined(FREEBSD_GETRANDOM) - int getrandom_flags = 0; - if (!block) { - getrandom_flags |= GRND_NONBLOCK; - } -#endif - -#if defined (USE_NR_getrandom) - if (seed) { - getrandom_flags |= *extra_getrandom_flags_for_seed_bss_get(); - } -#endif - - CRYPTO_init_sysrand(); - if (block) { - CRYPTO_once(wait_for_entropy_once_bss_get(), wait_for_entropy); - } - - // Clear |errno| so it has defined value if |read| or |getrandom| - // "successfully" returns zero. - errno = 0; - while (len > 0) { - ssize_t r; - - if (*urandom_fd_bss_get() == kHaveGetrandom) { -#if defined(USE_NR_getrandom) - r = boringssl_getrandom(out, len, getrandom_flags); -#elif defined(FREEBSD_GETRANDOM) - r = getrandom(out, len, getrandom_flags); -#elif defined(OPENSSL_MACOS) - if (__builtin_available(macos 10.12, *)) { - // |getentropy| can only request 256 bytes at a time. - size_t todo = len <= 256 ? len : 256; - if (getentropy(out, todo) != 0) { - r = -1; - } else { - r = (ssize_t)todo; - } - } else { - fprintf(stderr, "urandom fd corrupt.\n"); - abort(); - } -#else // USE_NR_getrandom - fprintf(stderr, "urandom fd corrupt.\n"); - abort(); -#endif - } else { - do { - r = read(*urandom_fd_bss_get(), out, len); - } while (r == -1 && errno == EINTR); - } - - if (r <= 0) { - return 0; - } - out += r; - len -= r; - } - - return 1; -} - -void CRYPTO_init_sysrand(void) { - CRYPTO_once(rand_once_bss_get(), init_once); -} - -// CRYPTO_sysrand puts |requested| random bytes into |out|. 
-void CRYPTO_sysrand(uint8_t *out, size_t requested) { - if (!fill_with_entropy(out, requested, /*block=*/1, /*seed=*/0)) { - perror("entropy fill failed"); - abort(); - } -} - -void CRYPTO_sysrand_for_seed(uint8_t *out, size_t requested) { - if (!fill_with_entropy(out, requested, /*block=*/1, /*seed=*/1)) { - perror("entropy fill failed"); - abort(); - } -} - -int CRYPTO_sysrand_if_available(uint8_t *out, size_t requested) { - if (fill_with_entropy(out, requested, /*block=*/0, /*seed=*/0)) { - return 1; - } else if (errno == EAGAIN) { - OPENSSL_memset(out, 0, requested); - return 0; - } else { - perror("opportunistic entropy fill failed"); - abort(); - } -} - -#endif // OPENSSL_URANDOM diff --git a/third_party/boringssl/src/crypto/fipsmodule/rsa/blinding.c b/third_party/boringssl/src/crypto/fipsmodule/rsa/blinding.c deleted file mode 100644 index 29477bd7..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/rsa/blinding.c +++ /dev/null @@ -1,243 +0,0 @@ -/* ==================================================================== - * Copyright (c) 1998-2006 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. 
The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). - * - * Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. 
- * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. 
If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include - -#include -#include -#include - -#include "internal.h" -#include "../../internal.h" - - -#define BN_BLINDING_COUNTER 32 - -struct bn_blinding_st { - BIGNUM *A; // The base blinding factor, Montgomery-encoded. - BIGNUM *Ai; // The inverse of the blinding factor, Montgomery-encoded. 
- unsigned counter; -}; - -static int bn_blinding_create_param(BN_BLINDING *b, const BIGNUM *e, - const BN_MONT_CTX *mont, BN_CTX *ctx); - -BN_BLINDING *BN_BLINDING_new(void) { - BN_BLINDING *ret = OPENSSL_malloc(sizeof(BN_BLINDING)); - if (ret == NULL) { - OPENSSL_PUT_ERROR(RSA, ERR_R_MALLOC_FAILURE); - return NULL; - } - OPENSSL_memset(ret, 0, sizeof(BN_BLINDING)); - - ret->A = BN_new(); - if (ret->A == NULL) { - goto err; - } - - ret->Ai = BN_new(); - if (ret->Ai == NULL) { - goto err; - } - - // The blinding values need to be created before this blinding can be used. - ret->counter = BN_BLINDING_COUNTER - 1; - - return ret; - -err: - BN_BLINDING_free(ret); - return NULL; -} - -void BN_BLINDING_free(BN_BLINDING *r) { - if (r == NULL) { - return; - } - - BN_free(r->A); - BN_free(r->Ai); - OPENSSL_free(r); -} - -void BN_BLINDING_invalidate(BN_BLINDING *b) { - b->counter = BN_BLINDING_COUNTER - 1; -} - -static int bn_blinding_update(BN_BLINDING *b, const BIGNUM *e, - const BN_MONT_CTX *mont, BN_CTX *ctx) { - if (++b->counter == BN_BLINDING_COUNTER) { - // re-create blinding parameters - if (!bn_blinding_create_param(b, e, mont, ctx)) { - goto err; - } - b->counter = 0; - } else { - if (!BN_mod_mul_montgomery(b->A, b->A, b->A, mont, ctx) || - !BN_mod_mul_montgomery(b->Ai, b->Ai, b->Ai, mont, ctx)) { - goto err; - } - } - - return 1; - -err: - // |A| and |Ai| may be in an inconsistent state so they both need to be - // replaced the next time this blinding is used. Note that this is only - // sufficient because support for |BN_BLINDING_NO_UPDATE| and - // |BN_BLINDING_NO_RECREATE| was previously dropped. - b->counter = BN_BLINDING_COUNTER - 1; - - return 0; -} - -int BN_BLINDING_convert(BIGNUM *n, BN_BLINDING *b, const BIGNUM *e, - const BN_MONT_CTX *mont, BN_CTX *ctx) { - // |n| is not Montgomery-encoded and |b->A| is. |BN_mod_mul_montgomery| - // cancels one Montgomery factor, so the resulting value of |n| is unencoded. 
- if (!bn_blinding_update(b, e, mont, ctx) || - !BN_mod_mul_montgomery(n, n, b->A, mont, ctx)) { - return 0; - } - - return 1; -} - -int BN_BLINDING_invert(BIGNUM *n, const BN_BLINDING *b, BN_MONT_CTX *mont, - BN_CTX *ctx) { - // |n| is not Montgomery-encoded and |b->A| is. |BN_mod_mul_montgomery| - // cancels one Montgomery factor, so the resulting value of |n| is unencoded. - return BN_mod_mul_montgomery(n, n, b->Ai, mont, ctx); -} - -static int bn_blinding_create_param(BN_BLINDING *b, const BIGNUM *e, - const BN_MONT_CTX *mont, BN_CTX *ctx) { - int no_inverse; - if (!BN_rand_range_ex(b->A, 1, &mont->N) || - // Compute |b->A|^-1 in Montgomery form. Note |BN_from_montgomery| + - // |BN_mod_inverse_blinded| is equivalent to, but more efficient than, - // |BN_mod_inverse_blinded| + |BN_to_montgomery|. - // - // We do not retry if |b->A| has no inverse. Finding a non-invertible - // value of |b->A| is equivalent to factoring |mont->N|. There is - // negligible probability of stumbling on one at random. - !BN_from_montgomery(b->Ai, b->A, mont, ctx) || - !BN_mod_inverse_blinded(b->Ai, &no_inverse, b->Ai, mont, ctx) || - // TODO(davidben): |BN_mod_exp_mont| internally computes the result in - // Montgomery form. Save a pair of Montgomery reductions and a - // multiplication by returning that value directly. - !BN_mod_exp_mont(b->A, b->A, e, &mont->N, ctx, mont) || - !BN_to_montgomery(b->A, b->A, mont, ctx)) { - OPENSSL_PUT_ERROR(RSA, ERR_R_INTERNAL_ERROR); - return 0; - } - - return 1; -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/rsa/internal.h b/third_party/boringssl/src/crypto/fipsmodule/rsa/internal.h index 1cb3b5f3..3a0daf56 100644 --- a/third_party/boringssl/src/crypto/fipsmodule/rsa/internal.h +++ b/third_party/boringssl/src/crypto/fipsmodule/rsa/internal.h @@ -1,110 +1,120 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. 
- * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. 
If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#ifndef OPENSSL_HEADER_RSA_INTERNAL_H -#define OPENSSL_HEADER_RSA_INTERNAL_H +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef OPENSSL_HEADER_CRYPTO_FIPSMODULE_RSA_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_FIPSMODULE_RSA_INTERNAL_H #include #include +#include + +#include "../../internal.h" +#include "../../mem_internal.h" + + +DECLARE_OPAQUE_STRUCT(rsa_st, RSAImpl) + +BSSL_NAMESPACE_BEGIN + +// TODO(crbug.com/42290480): Raise this limit. 512-bit RSA was factored in 1999. +#define OPENSSL_RSA_MIN_MODULUS_BITS 512 + +// TODO(davidben): This is inside BCM because |RSA| is inside BCM, but BCM never +// uses this. Split the RSA type in two. +enum rsa_pss_params_t { + // No parameters. + // TODO(davidben): Remove this and use std::optional where appropriate. + rsa_pss_none = 0, + // RSA-PSS using SHA-256, MGF1 with SHA-256, salt length 32. + rsa_pss_sha256, + // RSA-PSS using SHA-384, MGF1 with SHA-384, salt length 48. + rsa_pss_sha384, + // RSA-PSS using SHA-512, MGF1 with SHA-512, salt length 64. + rsa_pss_sha512, +}; + +class RSAImpl : public rsa_st, public RefCounted { + public: + explicit RSAImpl(const ENGINE *engine); + + RSA_METHOD *meth = nullptr; + + UniquePtr n; + UniquePtr e; + UniquePtr d; + UniquePtr p; + UniquePtr q; + UniquePtr dmp1; + UniquePtr dmq1; + UniquePtr iqmp; + + // be careful using this if the RSA structure is shared + CRYPTO_EX_DATA ex_data = {}; + int flags = 0; + Mutex lock; -#if defined(__cplusplus) -extern "C" { -#endif + // Used to cache montgomery values. The creation of these values is protected + // by |lock|. + UniquePtr mont_n; + UniquePtr mont_p; + UniquePtr mont_q; + // The following fields are copies of |d|, |dmp1|, and |dmq1|, respectively, + // but with the correct widths to prevent side channels. These must use + // separate copies due to threading concerns caused by OpenSSL's API + // mistakes. See https://github.com/openssl/openssl/issues/5158 and + // the |freeze_private_key| implementation. + UniquePtr d_fixed, dmp1_fixed, dmq1_fixed; + + // iqmp_mont is q^-1 mod p in Montgomery form, using |mont_p|. 
+ UniquePtr iqmp_mont; + + // pss_params is the RSA-PSS parameters associated with the key. This is not + // used by the low-level RSA implementation, just the EVP layer. + bssl::rsa_pss_params_t pss_params = bssl::rsa_pss_none; + + // private_key_frozen is one if the key has been used for a private key + // operation and may no longer be mutated. + unsigned private_key_frozen = 0; + + private: + friend RefCounted; + ~RSAImpl(); +}; + +#define RSA_PKCS1_PADDING_SIZE 11 // Default implementations of RSA operations. -const RSA_METHOD *RSA_default_method(void); +const RSA_METHOD *RSA_default_method(); -size_t rsa_default_size(const RSA *rsa); int rsa_default_sign_raw(RSA *rsa, size_t *out_len, uint8_t *out, size_t max_out, const uint8_t *in, size_t in_len, int padding); -int rsa_default_decrypt(RSA *rsa, size_t *out_len, uint8_t *out, size_t max_out, - const uint8_t *in, size_t in_len, int padding); int rsa_default_private_transform(RSA *rsa, uint8_t *out, const uint8_t *in, size_t len); -BN_BLINDING *BN_BLINDING_new(void); -void BN_BLINDING_free(BN_BLINDING *b); -void BN_BLINDING_invalidate(BN_BLINDING *b); -int BN_BLINDING_convert(BIGNUM *n, BN_BLINDING *b, const BIGNUM *e, - const BN_MONT_CTX *mont_ctx, BN_CTX *ctx); -int BN_BLINDING_invert(BIGNUM *n, const BN_BLINDING *b, BN_MONT_CTX *mont_ctx, - BN_CTX *ctx); - - +int PKCS1_MGF1(uint8_t *out, size_t len, const uint8_t *seed, size_t seed_len, + const EVP_MD *md); int RSA_padding_add_PKCS1_type_1(uint8_t *to, size_t to_len, const uint8_t *from, size_t from_len); int RSA_padding_check_PKCS1_type_1(uint8_t *out, size_t *out_len, size_t max_out, const uint8_t *from, size_t from_len); -int RSA_padding_add_PKCS1_type_2(uint8_t *to, size_t to_len, - const uint8_t *from, size_t from_len); -int RSA_padding_check_PKCS1_type_2(uint8_t *out, size_t *out_len, - size_t max_out, const uint8_t *from, - size_t from_len); -int RSA_padding_check_PKCS1_OAEP_mgf1(uint8_t *out, size_t *out_len, - size_t max_out, const uint8_t *from, - 
size_t from_len, const uint8_t *param, - size_t param_len, const EVP_MD *md, - const EVP_MD *mgf1md); int RSA_padding_add_none(uint8_t *to, size_t to_len, const uint8_t *from, size_t from_len); @@ -112,16 +122,23 @@ int RSA_padding_add_none(uint8_t *to, size_t to_len, const uint8_t *from, // within DoS bounds. int rsa_check_public_key(const RSA *rsa); -// RSA_private_transform calls either the method-specific |private_transform| -// function (if given) or the generic one. See the comment for -// |private_transform| in |rsa_meth_st|. -int RSA_private_transform(RSA *rsa, uint8_t *out, const uint8_t *in, - size_t len); +// rsa_private_transform_no_self_test calls either the method-specific +// |private_transform| function (if given) or the generic one. See the comment +// for |private_transform| in |rsa_meth_st|. +int rsa_private_transform_no_self_test(RSA *rsa, uint8_t *out, + const uint8_t *in, size_t len); +// rsa_private_transform acts the same as |rsa_private_transform_no_self_test| +// but, in FIPS mode, performs an RSA self test before calling the default RSA +// implementation. +int rsa_private_transform(RSA *rsa, uint8_t *out, const uint8_t *in, + size_t len); -// This constant is exported for test purposes. -extern const BN_ULONG kBoringSSLRSASqrtTwo[]; -extern const size_t kBoringSSLRSASqrtTwoLen; +// rsa_invalidate_key is called after |rsa| has been mutated, to invalidate +// fields derived from the original structure. This function assumes exclusive +// access to |rsa|. In particular, no other thread may be concurrently signing, +// etc., with |rsa|. +void rsa_invalidate_key(RSA *rsa); // Functions that avoid self-tests. 
@@ -142,12 +159,9 @@ int rsa_verify_raw_no_self_test(RSA *rsa, size_t *out_len, uint8_t *out, size_t in_len, int padding); int rsa_sign_no_self_test(int hash_nid, const uint8_t *digest, - unsigned digest_len, uint8_t *out, unsigned *out_len, + size_t digest_len, uint8_t *out, unsigned *out_len, RSA *rsa); +BSSL_NAMESPACE_END -#if defined(__cplusplus) -} // extern C -#endif - -#endif // OPENSSL_HEADER_RSA_INTERNAL_H +#endif // OPENSSL_HEADER_CRYPTO_FIPSMODULE_RSA_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/fipsmodule/rsa/padding.c b/third_party/boringssl/src/crypto/fipsmodule/rsa/padding.c deleted file mode 100644 index 605647a8..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/rsa/padding.c +++ /dev/null @@ -1,702 +0,0 @@ -/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL - * project 2005. - */ -/* ==================================================================== - * Copyright (c) 2005 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. 
For written permission, please contact - * licensing@OpenSSL.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). */ - -#include - -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include "internal.h" -#include "../service_indicator/internal.h" -#include "../../internal.h" - - -#define RSA_PKCS1_PADDING_SIZE 11 - -int RSA_padding_add_PKCS1_type_1(uint8_t *to, size_t to_len, - const uint8_t *from, size_t from_len) { - // See RFC 8017, section 9.2. 
- if (to_len < RSA_PKCS1_PADDING_SIZE) { - OPENSSL_PUT_ERROR(RSA, RSA_R_KEY_SIZE_TOO_SMALL); - return 0; - } - - if (from_len > to_len - RSA_PKCS1_PADDING_SIZE) { - OPENSSL_PUT_ERROR(RSA, RSA_R_DIGEST_TOO_BIG_FOR_RSA_KEY); - return 0; - } - - to[0] = 0; - to[1] = 1; - OPENSSL_memset(to + 2, 0xff, to_len - 3 - from_len); - to[to_len - from_len - 1] = 0; - OPENSSL_memcpy(to + to_len - from_len, from, from_len); - return 1; -} - -int RSA_padding_check_PKCS1_type_1(uint8_t *out, size_t *out_len, - size_t max_out, const uint8_t *from, - size_t from_len) { - // See RFC 8017, section 9.2. This is part of signature verification and thus - // does not need to run in constant-time. - if (from_len < 2) { - OPENSSL_PUT_ERROR(RSA, RSA_R_DATA_TOO_SMALL); - return 0; - } - - // Check the header. - if (from[0] != 0 || from[1] != 1) { - OPENSSL_PUT_ERROR(RSA, RSA_R_BLOCK_TYPE_IS_NOT_01); - return 0; - } - - // Scan over padded data, looking for the 00. - size_t pad; - for (pad = 2 /* header */; pad < from_len; pad++) { - if (from[pad] == 0x00) { - break; - } - - if (from[pad] != 0xff) { - OPENSSL_PUT_ERROR(RSA, RSA_R_BAD_FIXED_HEADER_DECRYPT); - return 0; - } - } - - if (pad == from_len) { - OPENSSL_PUT_ERROR(RSA, RSA_R_NULL_BEFORE_BLOCK_MISSING); - return 0; - } - - if (pad < 2 /* header */ + 8) { - OPENSSL_PUT_ERROR(RSA, RSA_R_BAD_PAD_BYTE_COUNT); - return 0; - } - - // Skip over the 00. 
- pad++; - - if (from_len - pad > max_out) { - OPENSSL_PUT_ERROR(RSA, RSA_R_DATA_TOO_LARGE); - return 0; - } - - OPENSSL_memcpy(out, from + pad, from_len - pad); - *out_len = from_len - pad; - return 1; -} - -static void rand_nonzero(uint8_t *out, size_t len) { - FIPS_service_indicator_lock_state(); - RAND_bytes(out, len); - - for (size_t i = 0; i < len; i++) { - while (out[i] == 0) { - RAND_bytes(out + i, 1); - } - } - - FIPS_service_indicator_unlock_state(); -} - -int RSA_padding_add_PKCS1_type_2(uint8_t *to, size_t to_len, - const uint8_t *from, size_t from_len) { - // See RFC 8017, section 7.2.1. - if (to_len < RSA_PKCS1_PADDING_SIZE) { - OPENSSL_PUT_ERROR(RSA, RSA_R_KEY_SIZE_TOO_SMALL); - return 0; - } - - if (from_len > to_len - RSA_PKCS1_PADDING_SIZE) { - OPENSSL_PUT_ERROR(RSA, RSA_R_DATA_TOO_LARGE_FOR_KEY_SIZE); - return 0; - } - - to[0] = 0; - to[1] = 2; - - size_t padding_len = to_len - 3 - from_len; - rand_nonzero(to + 2, padding_len); - to[2 + padding_len] = 0; - OPENSSL_memcpy(to + to_len - from_len, from, from_len); - return 1; -} - -int RSA_padding_check_PKCS1_type_2(uint8_t *out, size_t *out_len, - size_t max_out, const uint8_t *from, - size_t from_len) { - if (from_len == 0) { - OPENSSL_PUT_ERROR(RSA, RSA_R_EMPTY_PUBLIC_KEY); - return 0; - } - - // PKCS#1 v1.5 decryption. See "PKCS #1 v2.2: RSA Cryptography - // Standard", section 7.2.2. - if (from_len < RSA_PKCS1_PADDING_SIZE) { - // |from| is zero-padded to the size of the RSA modulus, a public value, so - // this can be rejected in non-constant time. 
- OPENSSL_PUT_ERROR(RSA, RSA_R_KEY_SIZE_TOO_SMALL); - return 0; - } - - crypto_word_t first_byte_is_zero = constant_time_eq_w(from[0], 0); - crypto_word_t second_byte_is_two = constant_time_eq_w(from[1], 2); - - crypto_word_t zero_index = 0, looking_for_index = CONSTTIME_TRUE_W; - for (size_t i = 2; i < from_len; i++) { - crypto_word_t equals0 = constant_time_is_zero_w(from[i]); - zero_index = - constant_time_select_w(looking_for_index & equals0, i, zero_index); - looking_for_index = constant_time_select_w(equals0, 0, looking_for_index); - } - - // The input must begin with 00 02. - crypto_word_t valid_index = first_byte_is_zero; - valid_index &= second_byte_is_two; - - // We must have found the end of PS. - valid_index &= ~looking_for_index; - - // PS must be at least 8 bytes long, and it starts two bytes into |from|. - valid_index &= constant_time_ge_w(zero_index, 2 + 8); - - // Skip the zero byte. - zero_index++; - - // NOTE: Although this logic attempts to be constant time, the API contracts - // of this function and |RSA_decrypt| with |RSA_PKCS1_PADDING| make it - // impossible to completely avoid Bleichenbacher's attack. Consumers should - // use |RSA_PADDING_NONE| and perform the padding check in constant-time - // combined with a swap to a random session key or other mitigation. - CONSTTIME_DECLASSIFY(&valid_index, sizeof(valid_index)); - CONSTTIME_DECLASSIFY(&zero_index, sizeof(zero_index)); - - if (!valid_index) { - OPENSSL_PUT_ERROR(RSA, RSA_R_PKCS_DECODING_ERROR); - return 0; - } - - const size_t msg_len = from_len - zero_index; - if (msg_len > max_out) { - // This shouldn't happen because this function is always called with - // |max_out| as the key size and |from_len| is bounded by the key size. 
- OPENSSL_PUT_ERROR(RSA, RSA_R_PKCS_DECODING_ERROR); - return 0; - } - - OPENSSL_memcpy(out, &from[zero_index], msg_len); - *out_len = msg_len; - return 1; -} - -int RSA_padding_add_none(uint8_t *to, size_t to_len, const uint8_t *from, - size_t from_len) { - if (from_len > to_len) { - OPENSSL_PUT_ERROR(RSA, RSA_R_DATA_TOO_LARGE_FOR_KEY_SIZE); - return 0; - } - - if (from_len < to_len) { - OPENSSL_PUT_ERROR(RSA, RSA_R_DATA_TOO_SMALL); - return 0; - } - - OPENSSL_memcpy(to, from, from_len); - return 1; -} - -static int PKCS1_MGF1(uint8_t *out, size_t len, const uint8_t *seed, - size_t seed_len, const EVP_MD *md) { - int ret = 0; - EVP_MD_CTX ctx; - EVP_MD_CTX_init(&ctx); - FIPS_service_indicator_lock_state(); - - size_t md_len = EVP_MD_size(md); - - for (uint32_t i = 0; len > 0; i++) { - uint8_t counter[4]; - counter[0] = (uint8_t)(i >> 24); - counter[1] = (uint8_t)(i >> 16); - counter[2] = (uint8_t)(i >> 8); - counter[3] = (uint8_t)i; - if (!EVP_DigestInit_ex(&ctx, md, NULL) || - !EVP_DigestUpdate(&ctx, seed, seed_len) || - !EVP_DigestUpdate(&ctx, counter, sizeof(counter))) { - goto err; - } - - if (md_len <= len) { - if (!EVP_DigestFinal_ex(&ctx, out, NULL)) { - goto err; - } - out += md_len; - len -= md_len; - } else { - uint8_t digest[EVP_MAX_MD_SIZE]; - if (!EVP_DigestFinal_ex(&ctx, digest, NULL)) { - goto err; - } - OPENSSL_memcpy(out, digest, len); - len = 0; - } - } - - ret = 1; - -err: - EVP_MD_CTX_cleanup(&ctx); - FIPS_service_indicator_unlock_state(); - return ret; -} - -int RSA_padding_add_PKCS1_OAEP_mgf1(uint8_t *to, size_t to_len, - const uint8_t *from, size_t from_len, - const uint8_t *param, size_t param_len, - const EVP_MD *md, const EVP_MD *mgf1md) { - if (md == NULL) { - md = EVP_sha1(); - } - if (mgf1md == NULL) { - mgf1md = md; - } - - size_t mdlen = EVP_MD_size(md); - - if (to_len < 2 * mdlen + 2) { - OPENSSL_PUT_ERROR(RSA, RSA_R_KEY_SIZE_TOO_SMALL); - return 0; - } - - size_t emlen = to_len - 1; - if (from_len > emlen - 2 * mdlen - 1) { - 
OPENSSL_PUT_ERROR(RSA, RSA_R_DATA_TOO_LARGE_FOR_KEY_SIZE); - return 0; - } - - if (emlen < 2 * mdlen + 1) { - OPENSSL_PUT_ERROR(RSA, RSA_R_KEY_SIZE_TOO_SMALL); - return 0; - } - - to[0] = 0; - uint8_t *seed = to + 1; - uint8_t *db = to + mdlen + 1; - - uint8_t *dbmask = NULL; - int ret = 0; - FIPS_service_indicator_lock_state(); - if (!EVP_Digest(param, param_len, db, NULL, md, NULL)) { - goto out; - } - OPENSSL_memset(db + mdlen, 0, emlen - from_len - 2 * mdlen - 1); - db[emlen - from_len - mdlen - 1] = 0x01; - OPENSSL_memcpy(db + emlen - from_len - mdlen, from, from_len); - if (!RAND_bytes(seed, mdlen)) { - goto out; - } - - dbmask = OPENSSL_malloc(emlen - mdlen); - if (dbmask == NULL) { - OPENSSL_PUT_ERROR(RSA, ERR_R_MALLOC_FAILURE); - goto out; - } - - if (!PKCS1_MGF1(dbmask, emlen - mdlen, seed, mdlen, mgf1md)) { - goto out; - } - for (size_t i = 0; i < emlen - mdlen; i++) { - db[i] ^= dbmask[i]; - } - - uint8_t seedmask[EVP_MAX_MD_SIZE]; - if (!PKCS1_MGF1(seedmask, mdlen, db, emlen - mdlen, mgf1md)) { - goto out; - } - for (size_t i = 0; i < mdlen; i++) { - seed[i] ^= seedmask[i]; - } - ret = 1; - -out: - OPENSSL_free(dbmask); - FIPS_service_indicator_unlock_state(); - return ret; -} - -int RSA_padding_check_PKCS1_OAEP_mgf1(uint8_t *out, size_t *out_len, - size_t max_out, const uint8_t *from, - size_t from_len, const uint8_t *param, - size_t param_len, const EVP_MD *md, - const EVP_MD *mgf1md) { - uint8_t *db = NULL; - - if (md == NULL) { - md = EVP_sha1(); - } - if (mgf1md == NULL) { - mgf1md = md; - } - - size_t mdlen = EVP_MD_size(md); - - // The encoded message is one byte smaller than the modulus to ensure that it - // doesn't end up greater than the modulus. Thus there's an extra "+1" here - // compared to https://tools.ietf.org/html/rfc2437#section-9.1.1.2. - if (from_len < 1 + 2*mdlen + 1) { - // 'from_len' is the length of the modulus, i.e. does not depend on the - // particular ciphertext. 
- goto decoding_err; - } - - size_t dblen = from_len - mdlen - 1; - FIPS_service_indicator_lock_state(); - db = OPENSSL_malloc(dblen); - if (db == NULL) { - OPENSSL_PUT_ERROR(RSA, ERR_R_MALLOC_FAILURE); - goto err; - } - - const uint8_t *maskedseed = from + 1; - const uint8_t *maskeddb = from + 1 + mdlen; - - uint8_t seed[EVP_MAX_MD_SIZE]; - if (!PKCS1_MGF1(seed, mdlen, maskeddb, dblen, mgf1md)) { - goto err; - } - for (size_t i = 0; i < mdlen; i++) { - seed[i] ^= maskedseed[i]; - } - - if (!PKCS1_MGF1(db, dblen, seed, mdlen, mgf1md)) { - goto err; - } - for (size_t i = 0; i < dblen; i++) { - db[i] ^= maskeddb[i]; - } - - uint8_t phash[EVP_MAX_MD_SIZE]; - if (!EVP_Digest(param, param_len, phash, NULL, md, NULL)) { - goto err; - } - - crypto_word_t bad = ~constant_time_is_zero_w(CRYPTO_memcmp(db, phash, mdlen)); - bad |= ~constant_time_is_zero_w(from[0]); - - crypto_word_t looking_for_one_byte = CONSTTIME_TRUE_W; - size_t one_index = 0; - for (size_t i = mdlen; i < dblen; i++) { - crypto_word_t equals1 = constant_time_eq_w(db[i], 1); - crypto_word_t equals0 = constant_time_eq_w(db[i], 0); - one_index = - constant_time_select_w(looking_for_one_byte & equals1, i, one_index); - looking_for_one_byte = - constant_time_select_w(equals1, 0, looking_for_one_byte); - bad |= looking_for_one_byte & ~equals0; - } - - bad |= looking_for_one_byte; - - if (bad) { - goto decoding_err; - } - - one_index++; - size_t mlen = dblen - one_index; - if (max_out < mlen) { - OPENSSL_PUT_ERROR(RSA, RSA_R_DATA_TOO_LARGE); - goto err; - } - - OPENSSL_memcpy(out, db + one_index, mlen); - *out_len = mlen; - OPENSSL_free(db); - FIPS_service_indicator_unlock_state(); - return 1; - -decoding_err: - // to avoid chosen ciphertext attacks, the error message should not reveal - // which kind of decoding error happened - OPENSSL_PUT_ERROR(RSA, RSA_R_OAEP_DECODING_ERROR); - err: - OPENSSL_free(db); - FIPS_service_indicator_unlock_state(); - return 0; -} - -static const uint8_t kPSSZeroes[] = {0, 0, 0, 0, 
0, 0, 0, 0}; - -int RSA_verify_PKCS1_PSS_mgf1(const RSA *rsa, const uint8_t *mHash, - const EVP_MD *Hash, const EVP_MD *mgf1Hash, - const uint8_t *EM, int sLen) { - int i; - int ret = 0; - int maskedDBLen, MSBits, emLen; - size_t hLen; - const uint8_t *H; - uint8_t *DB = NULL; - EVP_MD_CTX ctx; - uint8_t H_[EVP_MAX_MD_SIZE]; - EVP_MD_CTX_init(&ctx); - - if (mgf1Hash == NULL) { - mgf1Hash = Hash; - } - - hLen = EVP_MD_size(Hash); - FIPS_service_indicator_lock_state(); - - // Negative sLen has special meanings: - // -1 sLen == hLen - // -2 salt length is autorecovered from signature - // -N reserved - if (sLen == -1) { - sLen = hLen; - } else if (sLen == -2) { - sLen = -2; - } else if (sLen < -2) { - OPENSSL_PUT_ERROR(RSA, RSA_R_SLEN_CHECK_FAILED); - goto err; - } - - MSBits = (BN_num_bits(rsa->n) - 1) & 0x7; - emLen = RSA_size(rsa); - if (EM[0] & (0xFF << MSBits)) { - OPENSSL_PUT_ERROR(RSA, RSA_R_FIRST_OCTET_INVALID); - goto err; - } - if (MSBits == 0) { - EM++; - emLen--; - } - if (emLen < (int)hLen + 2 || emLen < ((int)hLen + sLen + 2)) { - // sLen can be small negative - OPENSSL_PUT_ERROR(RSA, RSA_R_DATA_TOO_LARGE); - goto err; - } - if (EM[emLen - 1] != 0xbc) { - OPENSSL_PUT_ERROR(RSA, RSA_R_LAST_OCTET_INVALID); - goto err; - } - maskedDBLen = emLen - hLen - 1; - H = EM + maskedDBLen; - DB = OPENSSL_malloc(maskedDBLen); - if (!DB) { - OPENSSL_PUT_ERROR(RSA, ERR_R_MALLOC_FAILURE); - goto err; - } - if (!PKCS1_MGF1(DB, maskedDBLen, H, hLen, mgf1Hash)) { - goto err; - } - for (i = 0; i < maskedDBLen; i++) { - DB[i] ^= EM[i]; - } - if (MSBits) { - DB[0] &= 0xFF >> (8 - MSBits); - } - for (i = 0; DB[i] == 0 && i < (maskedDBLen - 1); i++) { - ; - } - if (DB[i++] != 0x1) { - OPENSSL_PUT_ERROR(RSA, RSA_R_SLEN_RECOVERY_FAILED); - goto err; - } - if (sLen >= 0 && (maskedDBLen - i) != sLen) { - OPENSSL_PUT_ERROR(RSA, RSA_R_SLEN_CHECK_FAILED); - goto err; - } - if (!EVP_DigestInit_ex(&ctx, Hash, NULL) || - !EVP_DigestUpdate(&ctx, kPSSZeroes, sizeof(kPSSZeroes)) || - 
!EVP_DigestUpdate(&ctx, mHash, hLen) || - !EVP_DigestUpdate(&ctx, DB + i, maskedDBLen - i) || - !EVP_DigestFinal_ex(&ctx, H_, NULL)) { - goto err; - } - if (OPENSSL_memcmp(H_, H, hLen)) { - OPENSSL_PUT_ERROR(RSA, RSA_R_BAD_SIGNATURE); - ret = 0; - } else { - ret = 1; - } - -err: - OPENSSL_free(DB); - EVP_MD_CTX_cleanup(&ctx); - FIPS_service_indicator_unlock_state(); - - return ret; -} - -int RSA_padding_add_PKCS1_PSS_mgf1(const RSA *rsa, unsigned char *EM, - const unsigned char *mHash, - const EVP_MD *Hash, const EVP_MD *mgf1Hash, - int sLenRequested) { - int ret = 0; - size_t maskedDBLen, MSBits, emLen; - size_t hLen; - unsigned char *H, *salt = NULL, *p; - - if (mgf1Hash == NULL) { - mgf1Hash = Hash; - } - - FIPS_service_indicator_lock_state(); - hLen = EVP_MD_size(Hash); - - if (BN_is_zero(rsa->n)) { - OPENSSL_PUT_ERROR(RSA, RSA_R_EMPTY_PUBLIC_KEY); - goto err; - } - - MSBits = (BN_num_bits(rsa->n) - 1) & 0x7; - emLen = RSA_size(rsa); - if (MSBits == 0) { - assert(emLen >= 1); - *EM++ = 0; - emLen--; - } - - if (emLen < hLen + 2) { - OPENSSL_PUT_ERROR(RSA, RSA_R_DATA_TOO_LARGE_FOR_KEY_SIZE); - goto err; - } - - // Negative sLenRequested has special meanings: - // -1 sLen == hLen - // -2 salt length is maximized - // -N reserved - size_t sLen; - if (sLenRequested == -1) { - sLen = hLen; - } else if (sLenRequested == -2) { - sLen = emLen - hLen - 2; - } else if (sLenRequested < 0) { - OPENSSL_PUT_ERROR(RSA, RSA_R_SLEN_CHECK_FAILED); - goto err; - } else { - sLen = (size_t)sLenRequested; - } - - if (emLen - hLen - 2 < sLen) { - OPENSSL_PUT_ERROR(RSA, RSA_R_DATA_TOO_LARGE_FOR_KEY_SIZE); - goto err; - } - - if (sLen > 0) { - salt = OPENSSL_malloc(sLen); - if (!salt) { - OPENSSL_PUT_ERROR(RSA, ERR_R_MALLOC_FAILURE); - goto err; - } - if (!RAND_bytes(salt, sLen)) { - goto err; - } - } - maskedDBLen = emLen - hLen - 1; - H = EM + maskedDBLen; - - EVP_MD_CTX ctx; - EVP_MD_CTX_init(&ctx); - int digest_ok = EVP_DigestInit_ex(&ctx, Hash, NULL) && - EVP_DigestUpdate(&ctx, 
kPSSZeroes, sizeof(kPSSZeroes)) && - EVP_DigestUpdate(&ctx, mHash, hLen) && - EVP_DigestUpdate(&ctx, salt, sLen) && - EVP_DigestFinal_ex(&ctx, H, NULL); - EVP_MD_CTX_cleanup(&ctx); - if (!digest_ok) { - goto err; - } - - // Generate dbMask in place then perform XOR on it - if (!PKCS1_MGF1(EM, maskedDBLen, H, hLen, mgf1Hash)) { - goto err; - } - - p = EM; - - // Initial PS XORs with all zeroes which is a NOP so just update - // pointer. Note from a test above this value is guaranteed to - // be non-negative. - p += emLen - sLen - hLen - 2; - *p++ ^= 0x1; - if (sLen > 0) { - for (size_t i = 0; i < sLen; i++) { - *p++ ^= salt[i]; - } - } - if (MSBits) { - EM[0] &= 0xFF >> (8 - MSBits); - } - - // H is already in place so just set final 0xbc - - EM[emLen - 1] = 0xbc; - - ret = 1; - -err: - OPENSSL_free(salt); - FIPS_service_indicator_unlock_state(); - - return ret; -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/rsa/padding.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/rsa/padding.cc.inc new file mode 100644 index 00000000..44c8ce45 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/rsa/padding.cc.inc @@ -0,0 +1,371 @@ +// Copyright 2005-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include "../../internal.h" +#include "../bcm_interface.h" +#include "../service_indicator/internal.h" +#include "internal.h" + + +using namespace bssl; + +int bssl::RSA_padding_add_PKCS1_type_1(uint8_t *to, size_t to_len, + const uint8_t *from, size_t from_len) { + // See RFC 8017, section 9.2. + if (to_len < RSA_PKCS1_PADDING_SIZE) { + OPENSSL_PUT_ERROR(RSA, RSA_R_KEY_SIZE_TOO_SMALL); + return 0; + } + + if (from_len > to_len - RSA_PKCS1_PADDING_SIZE) { + OPENSSL_PUT_ERROR(RSA, RSA_R_DIGEST_TOO_BIG_FOR_RSA_KEY); + return 0; + } + + to[0] = 0; + to[1] = 1; + OPENSSL_memset(to + 2, 0xff, to_len - 3 - from_len); + to[to_len - from_len - 1] = 0; + OPENSSL_memcpy(to + to_len - from_len, from, from_len); + return 1; +} + +int bssl::RSA_padding_check_PKCS1_type_1(uint8_t *out, size_t *out_len, + size_t max_out, const uint8_t *from, + size_t from_len) { + // See RFC 8017, section 9.2. This is part of signature verification and thus + // does not need to run in constant-time. + if (from_len < 2) { + OPENSSL_PUT_ERROR(RSA, RSA_R_DATA_TOO_SMALL); + return 0; + } + + // Check the header. + if (from[0] != 0 || from[1] != 1) { + OPENSSL_PUT_ERROR(RSA, RSA_R_BLOCK_TYPE_IS_NOT_01); + return 0; + } + + // Scan over padded data, looking for the 00. + size_t pad; + for (pad = 2 /* header */; pad < from_len; pad++) { + if (from[pad] == 0x00) { + break; + } + + if (from[pad] != 0xff) { + OPENSSL_PUT_ERROR(RSA, RSA_R_BAD_FIXED_HEADER_DECRYPT); + return 0; + } + } + + if (pad == from_len) { + OPENSSL_PUT_ERROR(RSA, RSA_R_NULL_BEFORE_BLOCK_MISSING); + return 0; + } + + if (pad < 2 /* header */ + 8) { + OPENSSL_PUT_ERROR(RSA, RSA_R_BAD_PAD_BYTE_COUNT); + return 0; + } + + // Skip over the 00. 
+ pad++; + + if (from_len - pad > max_out) { + OPENSSL_PUT_ERROR(RSA, RSA_R_DATA_TOO_LARGE); + return 0; + } + + OPENSSL_memcpy(out, from + pad, from_len - pad); + *out_len = from_len - pad; + return 1; +} + +int bssl::RSA_padding_add_none(uint8_t *to, size_t to_len, const uint8_t *from, + size_t from_len) { + if (from_len > to_len) { + OPENSSL_PUT_ERROR(RSA, RSA_R_DATA_TOO_LARGE_FOR_KEY_SIZE); + return 0; + } + + if (from_len < to_len) { + OPENSSL_PUT_ERROR(RSA, RSA_R_DATA_TOO_SMALL); + return 0; + } + + OPENSSL_memcpy(to, from, from_len); + return 1; +} + +int bssl::PKCS1_MGF1(uint8_t *out, size_t len, const uint8_t *seed, + size_t seed_len, const EVP_MD *md) { + int ret = 0; + ScopedEVP_MD_CTX ctx; + FIPS_service_indicator_lock_state(); + + size_t md_len = EVP_MD_size(md); + + for (uint32_t i = 0; len > 0; i++) { + uint8_t counter[4]; + counter[0] = (uint8_t)(i >> 24); + counter[1] = (uint8_t)(i >> 16); + counter[2] = (uint8_t)(i >> 8); + counter[3] = (uint8_t)i; + if (!EVP_DigestInit_ex(ctx.get(), md, nullptr) || + !EVP_DigestUpdate(ctx.get(), seed, seed_len) || + !EVP_DigestUpdate(ctx.get(), counter, sizeof(counter))) { + goto err; + } + + if (md_len <= len) { + if (!EVP_DigestFinal_ex(ctx.get(), out, nullptr)) { + goto err; + } + out += md_len; + len -= md_len; + } else { + uint8_t digest[EVP_MAX_MD_SIZE]; + if (!EVP_DigestFinal_ex(ctx.get(), digest, nullptr)) { + goto err; + } + OPENSSL_memcpy(out, digest, len); + len = 0; + } + } + + ret = 1; + +err: + FIPS_service_indicator_unlock_state(); + return ret; +} + +static const uint8_t kPSSZeroes[] = {0, 0, 0, 0, 0, 0, 0, 0}; + +int RSA_verify_PKCS1_PSS_mgf1(const RSA *rsa, const uint8_t *mHash, + const EVP_MD *Hash, const EVP_MD *mgf1Hash, + const uint8_t *EM, int sLen) { + if (mgf1Hash == nullptr) { + mgf1Hash = Hash; + } + + int ret = 0; + uint8_t *DB = nullptr; + const uint8_t *H; + ScopedEVP_MD_CTX ctx; + unsigned MSBits; + size_t emLen, maskedDBLen, salt_start; + FIPS_service_indicator_lock_state(); + + 
size_t hLen = EVP_MD_size(Hash); + if (sLen == RSA_PSS_SALTLEN_DIGEST) { + sLen = (int)hLen; + } else if (sLen == RSA_PSS_SALTLEN_AUTO) { + // Leave |sLen| negative, which will trigger the logic below to recover and + // allow any salt length. + } else if (sLen < 0) { + // Other negative values are reserved. + OPENSSL_PUT_ERROR(RSA, RSA_R_SLEN_CHECK_FAILED); + goto err; + } + + MSBits = (RSA_bits(rsa) - 1) & 0x7; + emLen = RSA_size(rsa); + if (EM[0] & (0xFF << MSBits)) { + OPENSSL_PUT_ERROR(RSA, RSA_R_FIRST_OCTET_INVALID); + goto err; + } + if (MSBits == 0) { + EM++; + emLen--; + } + // |sLen| may be negative for the non-standard salt length recovery mode. + if (emLen < hLen + 2 || (sLen >= 0 && emLen < hLen + (size_t)sLen + 2)) { + OPENSSL_PUT_ERROR(RSA, RSA_R_DATA_TOO_LARGE); + goto err; + } + if (EM[emLen - 1] != 0xbc) { + OPENSSL_PUT_ERROR(RSA, RSA_R_LAST_OCTET_INVALID); + goto err; + } + maskedDBLen = emLen - hLen - 1; + H = EM + maskedDBLen; + DB = reinterpret_cast(OPENSSL_malloc(maskedDBLen)); + if (!DB) { + goto err; + } + if (!PKCS1_MGF1(DB, maskedDBLen, H, hLen, mgf1Hash)) { + goto err; + } + for (size_t i = 0; i < maskedDBLen; i++) { + DB[i] ^= EM[i]; + } + if (MSBits) { + DB[0] &= 0xFF >> (8 - MSBits); + } + // This step differs slightly from EMSA-PSS-VERIFY (RFC 8017) step 10 because + // it accepts a non-standard salt recovery flow. DB should be some number of + // zeros, a one, then the salt. + for (salt_start = 0; DB[salt_start] == 0 && salt_start < maskedDBLen - 1; + salt_start++) { + ; + } + if (DB[salt_start] != 0x1) { + OPENSSL_PUT_ERROR(RSA, RSA_R_SLEN_RECOVERY_FAILED); + goto err; + } + salt_start++; + // If a salt length was specified, check it matches. 
+ if (sLen >= 0 && maskedDBLen - salt_start != (size_t)sLen) { + OPENSSL_PUT_ERROR(RSA, RSA_R_SLEN_CHECK_FAILED); + goto err; + } + uint8_t H_[EVP_MAX_MD_SIZE]; + if (!EVP_DigestInit_ex(ctx.get(), Hash, nullptr) || + !EVP_DigestUpdate(ctx.get(), kPSSZeroes, sizeof(kPSSZeroes)) || + !EVP_DigestUpdate(ctx.get(), mHash, hLen) || + !EVP_DigestUpdate(ctx.get(), DB + salt_start, maskedDBLen - salt_start) || + !EVP_DigestFinal_ex(ctx.get(), H_, nullptr)) { + goto err; + } + if (OPENSSL_memcmp(H_, H, hLen) != 0) { + OPENSSL_PUT_ERROR(RSA, RSA_R_BAD_SIGNATURE); + goto err; + } + + ret = 1; + +err: + OPENSSL_free(DB); + FIPS_service_indicator_unlock_state(); + return ret; +} + +int RSA_padding_add_PKCS1_PSS_mgf1(const RSA *rsa, unsigned char *EM, + const unsigned char *mHash, + const EVP_MD *Hash, const EVP_MD *mgf1Hash, + int sLenRequested) { + int ret = 0; + ScopedEVP_MD_CTX ctx; + size_t maskedDBLen, MSBits, emLen; + size_t hLen; + unsigned char *H, *salt = nullptr, *p; + + if (mgf1Hash == nullptr) { + mgf1Hash = Hash; + } + + FIPS_service_indicator_lock_state(); + hLen = EVP_MD_size(Hash); + + unsigned rsa_bits = RSA_bits(rsa); + if (rsa_bits == 0) { + OPENSSL_PUT_ERROR(RSA, RSA_R_EMPTY_PUBLIC_KEY); + goto err; + } + + MSBits = (rsa_bits - 1) & 0x7; + emLen = RSA_size(rsa); + if (MSBits == 0) { + assert(emLen >= 1); + *EM++ = 0; + emLen--; + } + + if (emLen < hLen + 2) { + OPENSSL_PUT_ERROR(RSA, RSA_R_DATA_TOO_LARGE_FOR_KEY_SIZE); + goto err; + } + + size_t sLen; + if (sLenRequested == RSA_PSS_SALTLEN_DIGEST) { + sLen = hLen; + } else if (sLenRequested == RSA_PSS_SALTLEN_AUTO) { + // Use the maximum possible salt length. + sLen = emLen - hLen - 2; + } else if (sLenRequested < 0) { + // Other negative values are reserved. 
+ OPENSSL_PUT_ERROR(RSA, RSA_R_SLEN_CHECK_FAILED); + goto err; + } else { + sLen = (size_t)sLenRequested; + } + + if (emLen - hLen - 2 < sLen) { + OPENSSL_PUT_ERROR(RSA, RSA_R_DATA_TOO_LARGE_FOR_KEY_SIZE); + goto err; + } + + if (sLen > 0) { + salt = reinterpret_cast(OPENSSL_malloc(sLen)); + if (!salt) { + goto err; + } + BCM_rand_bytes(salt, sLen); + } + maskedDBLen = emLen - hLen - 1; + H = EM + maskedDBLen; + + if (!EVP_DigestInit_ex(ctx.get(), Hash, nullptr) || + !EVP_DigestUpdate(ctx.get(), kPSSZeroes, sizeof(kPSSZeroes)) || + !EVP_DigestUpdate(ctx.get(), mHash, hLen) || + !EVP_DigestUpdate(ctx.get(), salt, sLen) || + !EVP_DigestFinal_ex(ctx.get(), H, nullptr)) { + goto err; + } + + // Generate dbMask in place then perform XOR on it + if (!PKCS1_MGF1(EM, maskedDBLen, H, hLen, mgf1Hash)) { + goto err; + } + + p = EM; + // Initial PS XORs with all zeroes which is a NOP so just update + // pointer. Note from a test above this value is guaranteed to + // be non-negative. + p += emLen - sLen - hLen - 2; + *p++ ^= 0x1; + if (sLen > 0) { + for (size_t i = 0; i < sLen; i++) { + *p++ ^= salt[i]; + } + } + if (MSBits) { + EM[0] &= 0xFF >> (8 - MSBits); + } + + // H is already in place so just set final 0xbc + + EM[emLen - 1] = 0xbc; + + ret = 1; + +err: + OPENSSL_free(salt); + FIPS_service_indicator_unlock_state(); + + return ret; +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/rsa/rsa.c b/third_party/boringssl/src/crypto/fipsmodule/rsa/rsa.c deleted file mode 100644 index 733e7faa..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/rsa/rsa.c +++ /dev/null @@ -1,959 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. 
The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. 
If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../bn/internal.h" -#include "../delocate.h" -#include "../../internal.h" -#include "internal.h" - - -// RSA_R_BLOCK_TYPE_IS_NOT_02 is part of the legacy SSLv23 padding scheme. -// Cryptography.io depends on this error code. 
-OPENSSL_DECLARE_ERROR_REASON(RSA, BLOCK_TYPE_IS_NOT_02) - -DEFINE_STATIC_EX_DATA_CLASS(g_rsa_ex_data_class) - -RSA *RSA_new(void) { return RSA_new_method(NULL); } - -RSA *RSA_new_method(const ENGINE *engine) { - RSA *rsa = OPENSSL_malloc(sizeof(RSA)); - if (rsa == NULL) { - OPENSSL_PUT_ERROR(RSA, ERR_R_MALLOC_FAILURE); - return NULL; - } - - OPENSSL_memset(rsa, 0, sizeof(RSA)); - - if (engine) { - rsa->meth = ENGINE_get_RSA_method(engine); - } - - if (rsa->meth == NULL) { - rsa->meth = (RSA_METHOD *) RSA_default_method(); - } - METHOD_ref(rsa->meth); - - rsa->references = 1; - rsa->flags = rsa->meth->flags; - CRYPTO_MUTEX_init(&rsa->lock); - CRYPTO_new_ex_data(&rsa->ex_data); - - if (rsa->meth->init && !rsa->meth->init(rsa)) { - CRYPTO_free_ex_data(g_rsa_ex_data_class_bss_get(), rsa, &rsa->ex_data); - CRYPTO_MUTEX_cleanup(&rsa->lock); - METHOD_unref(rsa->meth); - OPENSSL_free(rsa); - return NULL; - } - - return rsa; -} - -void RSA_free(RSA *rsa) { - unsigned u; - - if (rsa == NULL) { - return; - } - - if (!CRYPTO_refcount_dec_and_test_zero(&rsa->references)) { - return; - } - - if (rsa->meth->finish) { - rsa->meth->finish(rsa); - } - METHOD_unref(rsa->meth); - - CRYPTO_free_ex_data(g_rsa_ex_data_class_bss_get(), rsa, &rsa->ex_data); - - BN_free(rsa->n); - BN_free(rsa->e); - BN_free(rsa->d); - BN_free(rsa->p); - BN_free(rsa->q); - BN_free(rsa->dmp1); - BN_free(rsa->dmq1); - BN_free(rsa->iqmp); - BN_MONT_CTX_free(rsa->mont_n); - BN_MONT_CTX_free(rsa->mont_p); - BN_MONT_CTX_free(rsa->mont_q); - BN_free(rsa->d_fixed); - BN_free(rsa->dmp1_fixed); - BN_free(rsa->dmq1_fixed); - BN_free(rsa->inv_small_mod_large_mont); - for (u = 0; u < rsa->num_blindings; u++) { - BN_BLINDING_free(rsa->blindings[u]); - } - OPENSSL_free(rsa->blindings); - OPENSSL_free(rsa->blindings_inuse); - CRYPTO_MUTEX_cleanup(&rsa->lock); - OPENSSL_free(rsa); -} - -int RSA_up_ref(RSA *rsa) { - CRYPTO_refcount_inc(&rsa->references); - return 1; -} - -unsigned RSA_bits(const RSA *rsa) { return 
BN_num_bits(rsa->n); } - -const BIGNUM *RSA_get0_n(const RSA *rsa) { return rsa->n; } - -const BIGNUM *RSA_get0_e(const RSA *rsa) { return rsa->e; } - -const BIGNUM *RSA_get0_d(const RSA *rsa) { return rsa->d; } - -const BIGNUM *RSA_get0_p(const RSA *rsa) { return rsa->p; } - -const BIGNUM *RSA_get0_q(const RSA *rsa) { return rsa->q; } - -const BIGNUM *RSA_get0_dmp1(const RSA *rsa) { return rsa->dmp1; } - -const BIGNUM *RSA_get0_dmq1(const RSA *rsa) { return rsa->dmq1; } - -const BIGNUM *RSA_get0_iqmp(const RSA *rsa) { return rsa->iqmp; } - -void RSA_get0_key(const RSA *rsa, const BIGNUM **out_n, const BIGNUM **out_e, - const BIGNUM **out_d) { - if (out_n != NULL) { - *out_n = rsa->n; - } - if (out_e != NULL) { - *out_e = rsa->e; - } - if (out_d != NULL) { - *out_d = rsa->d; - } -} - -void RSA_get0_factors(const RSA *rsa, const BIGNUM **out_p, - const BIGNUM **out_q) { - if (out_p != NULL) { - *out_p = rsa->p; - } - if (out_q != NULL) { - *out_q = rsa->q; - } -} - -const RSA_PSS_PARAMS *RSA_get0_pss_params(const RSA *rsa) { - // We do not support the id-RSASSA-PSS key encoding. If we add support later, - // the |maskHash| field should be filled in for OpenSSL compatibility. 
- return NULL; -} - -void RSA_get0_crt_params(const RSA *rsa, const BIGNUM **out_dmp1, - const BIGNUM **out_dmq1, const BIGNUM **out_iqmp) { - if (out_dmp1 != NULL) { - *out_dmp1 = rsa->dmp1; - } - if (out_dmq1 != NULL) { - *out_dmq1 = rsa->dmq1; - } - if (out_iqmp != NULL) { - *out_iqmp = rsa->iqmp; - } -} - -int RSA_set0_key(RSA *rsa, BIGNUM *n, BIGNUM *e, BIGNUM *d) { - if ((rsa->n == NULL && n == NULL) || - (rsa->e == NULL && e == NULL)) { - return 0; - } - - if (n != NULL) { - BN_free(rsa->n); - rsa->n = n; - } - if (e != NULL) { - BN_free(rsa->e); - rsa->e = e; - } - if (d != NULL) { - BN_free(rsa->d); - rsa->d = d; - } - - return 1; -} - -int RSA_set0_factors(RSA *rsa, BIGNUM *p, BIGNUM *q) { - if ((rsa->p == NULL && p == NULL) || - (rsa->q == NULL && q == NULL)) { - return 0; - } - - if (p != NULL) { - BN_free(rsa->p); - rsa->p = p; - } - if (q != NULL) { - BN_free(rsa->q); - rsa->q = q; - } - - return 1; -} - -int RSA_set0_crt_params(RSA *rsa, BIGNUM *dmp1, BIGNUM *dmq1, BIGNUM *iqmp) { - if ((rsa->dmp1 == NULL && dmp1 == NULL) || - (rsa->dmq1 == NULL && dmq1 == NULL) || - (rsa->iqmp == NULL && iqmp == NULL)) { - return 0; - } - - if (dmp1 != NULL) { - BN_free(rsa->dmp1); - rsa->dmp1 = dmp1; - } - if (dmq1 != NULL) { - BN_free(rsa->dmq1); - rsa->dmq1 = dmq1; - } - if (iqmp != NULL) { - BN_free(rsa->iqmp); - rsa->iqmp = iqmp; - } - - return 1; -} - -int RSA_public_encrypt(size_t flen, const uint8_t *from, uint8_t *to, RSA *rsa, - int padding) { - size_t out_len; - - if (!RSA_encrypt(rsa, &out_len, to, RSA_size(rsa), from, flen, padding)) { - return -1; - } - - if (out_len > INT_MAX) { - OPENSSL_PUT_ERROR(RSA, ERR_R_OVERFLOW); - return -1; - } - return out_len; -} - -static int rsa_sign_raw_no_self_test(RSA *rsa, size_t *out_len, uint8_t *out, - size_t max_out, const uint8_t *in, - size_t in_len, int padding) { - if (rsa->meth->sign_raw) { - return rsa->meth->sign_raw(rsa, out_len, out, max_out, in, in_len, padding); - } - - return rsa_default_sign_raw(rsa, 
out_len, out, max_out, in, in_len, padding); -} - -int RSA_sign_raw(RSA *rsa, size_t *out_len, uint8_t *out, size_t max_out, - const uint8_t *in, size_t in_len, int padding) { - boringssl_ensure_rsa_self_test(); - return rsa_sign_raw_no_self_test(rsa, out_len, out, max_out, in, in_len, - padding); -} - -int RSA_private_encrypt(size_t flen, const uint8_t *from, uint8_t *to, RSA *rsa, - int padding) { - size_t out_len; - - if (!RSA_sign_raw(rsa, &out_len, to, RSA_size(rsa), from, flen, padding)) { - return -1; - } - - if (out_len > INT_MAX) { - OPENSSL_PUT_ERROR(RSA, ERR_R_OVERFLOW); - return -1; - } - return out_len; -} - -int RSA_decrypt(RSA *rsa, size_t *out_len, uint8_t *out, size_t max_out, - const uint8_t *in, size_t in_len, int padding) { - if (rsa->meth->decrypt) { - return rsa->meth->decrypt(rsa, out_len, out, max_out, in, in_len, padding); - } - - return rsa_default_decrypt(rsa, out_len, out, max_out, in, in_len, padding); -} - -int RSA_private_decrypt(size_t flen, const uint8_t *from, uint8_t *to, RSA *rsa, - int padding) { - size_t out_len; - - if (!RSA_decrypt(rsa, &out_len, to, RSA_size(rsa), from, flen, padding)) { - return -1; - } - - if (out_len > INT_MAX) { - OPENSSL_PUT_ERROR(RSA, ERR_R_OVERFLOW); - return -1; - } - return out_len; -} - -int RSA_public_decrypt(size_t flen, const uint8_t *from, uint8_t *to, RSA *rsa, - int padding) { - size_t out_len; - - if (!RSA_verify_raw(rsa, &out_len, to, RSA_size(rsa), from, flen, padding)) { - return -1; - } - - if (out_len > INT_MAX) { - OPENSSL_PUT_ERROR(RSA, ERR_R_OVERFLOW); - return -1; - } - return out_len; -} - -unsigned RSA_size(const RSA *rsa) { - if (rsa->meth->size) { - return rsa->meth->size(rsa); - } - - return rsa_default_size(rsa); -} - -int RSA_is_opaque(const RSA *rsa) { - return rsa->meth && (rsa->meth->flags & RSA_FLAG_OPAQUE); -} - -int RSA_get_ex_new_index(long argl, void *argp, CRYPTO_EX_unused *unused, - CRYPTO_EX_dup *dup_unused, CRYPTO_EX_free *free_func) { - int index; - if 
(!CRYPTO_get_ex_new_index(g_rsa_ex_data_class_bss_get(), &index, argl, - argp, free_func)) { - return -1; - } - return index; -} - -int RSA_set_ex_data(RSA *rsa, int idx, void *arg) { - return CRYPTO_set_ex_data(&rsa->ex_data, idx, arg); -} - -void *RSA_get_ex_data(const RSA *rsa, int idx) { - return CRYPTO_get_ex_data(&rsa->ex_data, idx); -} - -// SSL_SIG_LENGTH is the size of an SSL/TLS (prior to TLS 1.2) signature: it's -// the length of an MD5 and SHA1 hash. -static const unsigned SSL_SIG_LENGTH = 36; - -// pkcs1_sig_prefix contains the ASN.1, DER encoded prefix for a hash that is -// to be signed with PKCS#1. -struct pkcs1_sig_prefix { - // nid identifies the hash function. - int nid; - // hash_len is the expected length of the hash function. - uint8_t hash_len; - // len is the number of bytes of |bytes| which are valid. - uint8_t len; - // bytes contains the DER bytes. - uint8_t bytes[19]; -}; - -// kPKCS1SigPrefixes contains the ASN.1 prefixes for PKCS#1 signatures with -// different hash functions. 
-static const struct pkcs1_sig_prefix kPKCS1SigPrefixes[] = { - { - NID_md5, - MD5_DIGEST_LENGTH, - 18, - {0x30, 0x20, 0x30, 0x0c, 0x06, 0x08, 0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, - 0x02, 0x05, 0x05, 0x00, 0x04, 0x10}, - }, - { - NID_sha1, - SHA_DIGEST_LENGTH, - 15, - {0x30, 0x21, 0x30, 0x09, 0x06, 0x05, 0x2b, 0x0e, 0x03, 0x02, 0x1a, 0x05, - 0x00, 0x04, 0x14}, - }, - { - NID_sha224, - SHA224_DIGEST_LENGTH, - 19, - {0x30, 0x2d, 0x30, 0x0d, 0x06, 0x09, 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, - 0x04, 0x02, 0x04, 0x05, 0x00, 0x04, 0x1c}, - }, - { - NID_sha256, - SHA256_DIGEST_LENGTH, - 19, - {0x30, 0x31, 0x30, 0x0d, 0x06, 0x09, 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, - 0x04, 0x02, 0x01, 0x05, 0x00, 0x04, 0x20}, - }, - { - NID_sha384, - SHA384_DIGEST_LENGTH, - 19, - {0x30, 0x41, 0x30, 0x0d, 0x06, 0x09, 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, - 0x04, 0x02, 0x02, 0x05, 0x00, 0x04, 0x30}, - }, - { - NID_sha512, - SHA512_DIGEST_LENGTH, - 19, - {0x30, 0x51, 0x30, 0x0d, 0x06, 0x09, 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, - 0x04, 0x02, 0x03, 0x05, 0x00, 0x04, 0x40}, - }, - { - NID_undef, 0, 0, {0}, - }, -}; - -int RSA_add_pkcs1_prefix(uint8_t **out_msg, size_t *out_msg_len, - int *is_alloced, int hash_nid, const uint8_t *digest, - size_t digest_len) { - unsigned i; - - if (hash_nid == NID_md5_sha1) { - // Special case: SSL signature, just check the length. 
- if (digest_len != SSL_SIG_LENGTH) { - OPENSSL_PUT_ERROR(RSA, RSA_R_INVALID_MESSAGE_LENGTH); - return 0; - } - - *out_msg = (uint8_t *)digest; - *out_msg_len = SSL_SIG_LENGTH; - *is_alloced = 0; - return 1; - } - - for (i = 0; kPKCS1SigPrefixes[i].nid != NID_undef; i++) { - const struct pkcs1_sig_prefix *sig_prefix = &kPKCS1SigPrefixes[i]; - if (sig_prefix->nid != hash_nid) { - continue; - } - - if (digest_len != sig_prefix->hash_len) { - OPENSSL_PUT_ERROR(RSA, RSA_R_INVALID_MESSAGE_LENGTH); - return 0; - } - - const uint8_t* prefix = sig_prefix->bytes; - unsigned prefix_len = sig_prefix->len; - unsigned signed_msg_len; - uint8_t *signed_msg; - - signed_msg_len = prefix_len + digest_len; - if (signed_msg_len < prefix_len) { - OPENSSL_PUT_ERROR(RSA, RSA_R_TOO_LONG); - return 0; - } - - signed_msg = OPENSSL_malloc(signed_msg_len); - if (!signed_msg) { - OPENSSL_PUT_ERROR(RSA, ERR_R_MALLOC_FAILURE); - return 0; - } - - OPENSSL_memcpy(signed_msg, prefix, prefix_len); - OPENSSL_memcpy(signed_msg + prefix_len, digest, digest_len); - - *out_msg = signed_msg; - *out_msg_len = signed_msg_len; - *is_alloced = 1; - - return 1; - } - - OPENSSL_PUT_ERROR(RSA, RSA_R_UNKNOWN_ALGORITHM_TYPE); - return 0; -} - -int rsa_sign_no_self_test(int hash_nid, const uint8_t *digest, - unsigned digest_len, uint8_t *out, unsigned *out_len, - RSA *rsa) { - const unsigned rsa_size = RSA_size(rsa); - int ret = 0; - uint8_t *signed_msg = NULL; - size_t signed_msg_len = 0; - int signed_msg_is_alloced = 0; - size_t size_t_out_len; - - if (rsa->meth->sign) { - return rsa->meth->sign(hash_nid, digest, digest_len, out, out_len, rsa); - } - - if (!RSA_add_pkcs1_prefix(&signed_msg, &signed_msg_len, - &signed_msg_is_alloced, hash_nid, digest, - digest_len) || - !rsa_sign_raw_no_self_test(rsa, &size_t_out_len, out, rsa_size, - signed_msg, signed_msg_len, - RSA_PKCS1_PADDING)) { - goto err; - } - - *out_len = size_t_out_len; - ret = 1; - -err: - if (signed_msg_is_alloced) { - OPENSSL_free(signed_msg); - } 
- return ret; -} - -int RSA_sign(int hash_nid, const uint8_t *digest, unsigned digest_len, - uint8_t *out, unsigned *out_len, RSA *rsa) { - boringssl_ensure_rsa_self_test(); - - return rsa_sign_no_self_test(hash_nid, digest, digest_len, out, out_len, rsa); -} - -int RSA_sign_pss_mgf1(RSA *rsa, size_t *out_len, uint8_t *out, size_t max_out, - const uint8_t *digest, size_t digest_len, - const EVP_MD *md, const EVP_MD *mgf1_md, int salt_len) { - if (digest_len != EVP_MD_size(md)) { - OPENSSL_PUT_ERROR(RSA, RSA_R_INVALID_MESSAGE_LENGTH); - return 0; - } - - size_t padded_len = RSA_size(rsa); - uint8_t *padded = OPENSSL_malloc(padded_len); - if (padded == NULL) { - OPENSSL_PUT_ERROR(RSA, ERR_R_MALLOC_FAILURE); - return 0; - } - - int ret = RSA_padding_add_PKCS1_PSS_mgf1(rsa, padded, digest, md, mgf1_md, - salt_len) && - RSA_sign_raw(rsa, out_len, out, max_out, padded, padded_len, - RSA_NO_PADDING); - OPENSSL_free(padded); - return ret; -} - -int rsa_verify_no_self_test(int hash_nid, const uint8_t *digest, - size_t digest_len, const uint8_t *sig, - size_t sig_len, RSA *rsa) { - if (rsa->n == NULL || rsa->e == NULL) { - OPENSSL_PUT_ERROR(RSA, RSA_R_VALUE_MISSING); - return 0; - } - - const size_t rsa_size = RSA_size(rsa); - uint8_t *buf = NULL; - int ret = 0; - uint8_t *signed_msg = NULL; - size_t signed_msg_len = 0, len; - int signed_msg_is_alloced = 0; - - if (hash_nid == NID_md5_sha1 && digest_len != SSL_SIG_LENGTH) { - OPENSSL_PUT_ERROR(RSA, RSA_R_INVALID_MESSAGE_LENGTH); - return 0; - } - - buf = OPENSSL_malloc(rsa_size); - if (!buf) { - OPENSSL_PUT_ERROR(RSA, ERR_R_MALLOC_FAILURE); - return 0; - } - - if (!rsa_verify_raw_no_self_test(rsa, &len, buf, rsa_size, sig, sig_len, - RSA_PKCS1_PADDING) || - !RSA_add_pkcs1_prefix(&signed_msg, &signed_msg_len, - &signed_msg_is_alloced, hash_nid, digest, - digest_len)) { - goto out; - } - - // Check that no other information follows the hash value (FIPS 186-4 Section - // 5.5) and it matches the expected hash. 
- if (len != signed_msg_len || OPENSSL_memcmp(buf, signed_msg, len) != 0) { - OPENSSL_PUT_ERROR(RSA, RSA_R_BAD_SIGNATURE); - goto out; - } - - ret = 1; - -out: - OPENSSL_free(buf); - if (signed_msg_is_alloced) { - OPENSSL_free(signed_msg); - } - return ret; -} - -int RSA_verify(int hash_nid, const uint8_t *digest, size_t digest_len, - const uint8_t *sig, size_t sig_len, RSA *rsa) { - boringssl_ensure_rsa_self_test(); - return rsa_verify_no_self_test(hash_nid, digest, digest_len, sig, sig_len, - rsa); -} - -int RSA_verify_pss_mgf1(RSA *rsa, const uint8_t *digest, size_t digest_len, - const EVP_MD *md, const EVP_MD *mgf1_md, int salt_len, - const uint8_t *sig, size_t sig_len) { - if (digest_len != EVP_MD_size(md)) { - OPENSSL_PUT_ERROR(RSA, RSA_R_INVALID_MESSAGE_LENGTH); - return 0; - } - - size_t em_len = RSA_size(rsa); - uint8_t *em = OPENSSL_malloc(em_len); - if (em == NULL) { - OPENSSL_PUT_ERROR(RSA, ERR_R_MALLOC_FAILURE); - return 0; - } - - int ret = 0; - if (!RSA_verify_raw(rsa, &em_len, em, em_len, sig, sig_len, RSA_NO_PADDING)) { - goto err; - } - - if (em_len != RSA_size(rsa)) { - OPENSSL_PUT_ERROR(RSA, ERR_R_INTERNAL_ERROR); - goto err; - } - - ret = RSA_verify_PKCS1_PSS_mgf1(rsa, digest, md, mgf1_md, em, salt_len); - -err: - OPENSSL_free(em); - return ret; -} - -static int check_mod_inverse(int *out_ok, const BIGNUM *a, const BIGNUM *ainv, - const BIGNUM *m, unsigned m_min_bits, - BN_CTX *ctx) { - if (BN_is_negative(ainv) || BN_cmp(ainv, m) >= 0) { - *out_ok = 0; - return 1; - } - - // Note |bn_mul_consttime| and |bn_div_consttime| do not scale linearly, but - // checking |ainv| is in range bounds the running time, assuming |m|'s bounds - // were checked by the caller. 
- BN_CTX_start(ctx); - BIGNUM *tmp = BN_CTX_get(ctx); - int ret = tmp != NULL && - bn_mul_consttime(tmp, a, ainv, ctx) && - bn_div_consttime(NULL, tmp, tmp, m, m_min_bits, ctx); - if (ret) { - *out_ok = BN_is_one(tmp); - } - BN_CTX_end(ctx); - return ret; -} - -int RSA_check_key(const RSA *key) { - // TODO(davidben): RSA key initialization is spread across - // |rsa_check_public_key|, |RSA_check_key|, |freeze_private_key|, and - // |BN_MONT_CTX_set_locked| as a result of API issues. See - // https://crbug.com/boringssl/316. As a result, we inconsistently check RSA - // invariants. We should fix this and integrate that logic. - - if (RSA_is_opaque(key)) { - // Opaque keys can't be checked. - return 1; - } - - if (!rsa_check_public_key(key)) { - return 0; - } - - if ((key->p != NULL) != (key->q != NULL)) { - OPENSSL_PUT_ERROR(RSA, RSA_R_ONLY_ONE_OF_P_Q_GIVEN); - return 0; - } - - // |key->d| must be bounded by |key->n|. This ensures bounds on |RSA_bits| - // translate to bounds on the running time of private key operations. - if (key->d != NULL && - (BN_is_negative(key->d) || BN_cmp(key->d, key->n) >= 0)) { - OPENSSL_PUT_ERROR(RSA, RSA_R_D_OUT_OF_RANGE); - return 0; - } - - if (key->d == NULL || key->p == NULL) { - // For a public key, or without p and q, there's nothing that can be - // checked. - return 1; - } - - BN_CTX *ctx = BN_CTX_new(); - if (ctx == NULL) { - OPENSSL_PUT_ERROR(RSA, ERR_R_MALLOC_FAILURE); - return 0; - } - - BIGNUM tmp, de, pm1, qm1, dmp1, dmq1; - int ok = 0; - BN_init(&tmp); - BN_init(&de); - BN_init(&pm1); - BN_init(&qm1); - BN_init(&dmp1); - BN_init(&dmq1); - - // Check that p * q == n. Before we multiply, we check that p and q are in - // bounds, to avoid a DoS vector in |bn_mul_consttime| below. Note that - // n was bound by |rsa_check_public_key|. 
- if (BN_is_negative(key->p) || BN_cmp(key->p, key->n) >= 0 || - BN_is_negative(key->q) || BN_cmp(key->q, key->n) >= 0) { - OPENSSL_PUT_ERROR(RSA, RSA_R_N_NOT_EQUAL_P_Q); - goto out; - } - if (!bn_mul_consttime(&tmp, key->p, key->q, ctx)) { - OPENSSL_PUT_ERROR(RSA, ERR_LIB_BN); - goto out; - } - if (BN_cmp(&tmp, key->n) != 0) { - OPENSSL_PUT_ERROR(RSA, RSA_R_N_NOT_EQUAL_P_Q); - goto out; - } - - // d must be an inverse of e mod the Carmichael totient, lcm(p-1, q-1), but it - // may be unreduced because other implementations use the Euler totient. We - // simply check that d * e is one mod p-1 and mod q-1. Note d and e were bound - // by earlier checks in this function. - if (!bn_usub_consttime(&pm1, key->p, BN_value_one()) || - !bn_usub_consttime(&qm1, key->q, BN_value_one())) { - OPENSSL_PUT_ERROR(RSA, ERR_LIB_BN); - goto out; - } - const unsigned pm1_bits = BN_num_bits(&pm1); - const unsigned qm1_bits = BN_num_bits(&qm1); - if (!bn_mul_consttime(&de, key->d, key->e, ctx) || - !bn_div_consttime(NULL, &tmp, &de, &pm1, pm1_bits, ctx) || - !bn_div_consttime(NULL, &de, &de, &qm1, qm1_bits, ctx)) { - OPENSSL_PUT_ERROR(RSA, ERR_LIB_BN); - goto out; - } - - if (!BN_is_one(&tmp) || !BN_is_one(&de)) { - OPENSSL_PUT_ERROR(RSA, RSA_R_D_E_NOT_CONGRUENT_TO_1); - goto out; - } - - int has_crt_values = key->dmp1 != NULL; - if (has_crt_values != (key->dmq1 != NULL) || - has_crt_values != (key->iqmp != NULL)) { - OPENSSL_PUT_ERROR(RSA, RSA_R_INCONSISTENT_SET_OF_CRT_VALUES); - goto out; - } - - if (has_crt_values) { - int dmp1_ok, dmq1_ok, iqmp_ok; - if (!check_mod_inverse(&dmp1_ok, key->e, key->dmp1, &pm1, pm1_bits, ctx) || - !check_mod_inverse(&dmq1_ok, key->e, key->dmq1, &qm1, qm1_bits, ctx) || - // |p| is odd, so |pm1| and |p| have the same bit width. If they didn't, - // we only need a lower bound anyway. 
- !check_mod_inverse(&iqmp_ok, key->q, key->iqmp, key->p, pm1_bits, - ctx)) { - OPENSSL_PUT_ERROR(RSA, ERR_LIB_BN); - goto out; - } - - if (!dmp1_ok || !dmq1_ok || !iqmp_ok) { - OPENSSL_PUT_ERROR(RSA, RSA_R_CRT_VALUES_INCORRECT); - goto out; - } - } - - ok = 1; - -out: - BN_free(&tmp); - BN_free(&de); - BN_free(&pm1); - BN_free(&qm1); - BN_free(&dmp1); - BN_free(&dmq1); - BN_CTX_free(ctx); - - return ok; -} - - -// This is the product of the 132 smallest odd primes, from 3 to 751. -static const BN_ULONG kSmallFactorsLimbs[] = { - TOBN(0xc4309333, 0x3ef4e3e1), TOBN(0x71161eb6, 0xcd2d655f), - TOBN(0x95e2238c, 0x0bf94862), TOBN(0x3eb233d3, 0x24f7912b), - TOBN(0x6b55514b, 0xbf26c483), TOBN(0x0a84d817, 0x5a144871), - TOBN(0x77d12fee, 0x9b82210a), TOBN(0xdb5b93c2, 0x97f050b3), - TOBN(0x4acad6b9, 0x4d6c026b), TOBN(0xeb7751f3, 0x54aec893), - TOBN(0xdba53368, 0x36bc85c4), TOBN(0xd85a1b28, 0x7f5ec78e), - TOBN(0x2eb072d8, 0x6b322244), TOBN(0xbba51112, 0x5e2b3aea), - TOBN(0x36ed1a6c, 0x0e2486bf), TOBN(0x5f270460, 0xec0c5727), - 0x000017b1 -}; - -DEFINE_LOCAL_DATA(BIGNUM, g_small_factors) { - out->d = (BN_ULONG *) kSmallFactorsLimbs; - out->width = OPENSSL_ARRAY_SIZE(kSmallFactorsLimbs); - out->dmax = out->width; - out->neg = 0; - out->flags = BN_FLG_STATIC_DATA; -} - -int RSA_check_fips(RSA *key) { - if (RSA_is_opaque(key)) { - // Opaque keys can't be checked. - OPENSSL_PUT_ERROR(RSA, RSA_R_PUBLIC_KEY_VALIDATION_FAILED); - return 0; - } - - if (!RSA_check_key(key)) { - return 0; - } - - BN_CTX *ctx = BN_CTX_new(); - if (ctx == NULL) { - OPENSSL_PUT_ERROR(RSA, ERR_R_MALLOC_FAILURE); - return 0; - } - - BIGNUM small_gcd; - BN_init(&small_gcd); - - int ret = 1; - - // Perform partial public key validation of RSA keys (SP 800-89 5.3.3). - // Although this is not for primality testing, SP 800-89 cites an RSA - // primality testing algorithm, so we use |BN_prime_checks_for_generation| to - // match. 
This is only a plausibility test and we expect the value to be - // composite, so too few iterations will cause us to reject the key, not use - // an implausible one. - enum bn_primality_result_t primality_result; - if (BN_num_bits(key->e) <= 16 || - BN_num_bits(key->e) > 256 || - !BN_is_odd(key->n) || - !BN_is_odd(key->e) || - !BN_gcd(&small_gcd, key->n, g_small_factors(), ctx) || - !BN_is_one(&small_gcd) || - !BN_enhanced_miller_rabin_primality_test(&primality_result, key->n, - BN_prime_checks_for_generation, - ctx, NULL) || - primality_result != bn_non_prime_power_composite) { - OPENSSL_PUT_ERROR(RSA, RSA_R_PUBLIC_KEY_VALIDATION_FAILED); - ret = 0; - } - - BN_free(&small_gcd); - BN_CTX_free(ctx); - - if (!ret || key->d == NULL || key->p == NULL) { - // On a failure or on only a public key, there's nothing else can be - // checked. - return ret; - } - - // FIPS pairwise consistency test (FIPS 140-2 4.9.2). Per FIPS 140-2 IG, - // section 9.9, it is not known whether |rsa| will be used for signing or - // encryption, so either pair-wise consistency self-test is acceptable. We - // perform a signing test. 
- uint8_t data[32] = {0}; - unsigned sig_len = RSA_size(key); - uint8_t *sig = OPENSSL_malloc(sig_len); - if (sig == NULL) { - OPENSSL_PUT_ERROR(RSA, ERR_R_MALLOC_FAILURE); - return 0; - } - - if (!RSA_sign(NID_sha256, data, sizeof(data), sig, &sig_len, key)) { - OPENSSL_PUT_ERROR(RSA, ERR_R_INTERNAL_ERROR); - ret = 0; - goto cleanup; - } - if (boringssl_fips_break_test("RSA_PWCT")) { - data[0] = ~data[0]; - } - if (!RSA_verify(NID_sha256, data, sizeof(data), sig, sig_len, key)) { - OPENSSL_PUT_ERROR(RSA, ERR_R_INTERNAL_ERROR); - ret = 0; - } - -cleanup: - OPENSSL_free(sig); - - return ret; -} - -int RSA_private_transform(RSA *rsa, uint8_t *out, const uint8_t *in, - size_t len) { - if (rsa->meth->private_transform) { - return rsa->meth->private_transform(rsa, out, in, len); - } - - return rsa_default_private_transform(rsa, out, in, len); -} - -int RSA_flags(const RSA *rsa) { return rsa->flags; } - -int RSA_test_flags(const RSA *rsa, int flags) { return rsa->flags & flags; } - -int RSA_blinding_on(RSA *rsa, BN_CTX *ctx) { - return 1; -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/rsa/rsa.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/rsa/rsa.cc.inc new file mode 100644 index 00000000..65109588 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/rsa/rsa.cc.inc @@ -0,0 +1,996 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../internal.h" +#include "../../mem_internal.h" +#include "../bcm_interface.h" +#include "../bn/internal.h" +#include "../delocate.h" +#include "internal.h" + + +using namespace bssl; + +// RSA_R_BLOCK_TYPE_IS_NOT_02 is part of the legacy SSLv23 padding scheme. +// Cryptography.io depends on this error code. +OPENSSL_DECLARE_ERROR_REASON(RSA, BLOCK_TYPE_IS_NOT_02) + +DEFINE_STATIC_EX_DATA_CLASS(g_rsa_ex_data_class) + +static int bn_dup_into(UniquePtr *dst, const BIGNUM *src) { + if (src == nullptr) { + OPENSSL_PUT_ERROR(RSA, ERR_R_PASSED_NULL_PARAMETER); + return 0; + } + + dst->reset(BN_dup(src)); + return *dst != nullptr; +} + +RSA *RSA_new_public_key(const BIGNUM *n, const BIGNUM *e) { + RSAImpl *rsa = FromOpaque(RSA_new()); + if (rsa == nullptr || // + !bn_dup_into(&rsa->n, n) || // + !bn_dup_into(&rsa->e, e) || // + !RSA_check_key(rsa)) { + RSA_free(rsa); + return nullptr; + } + + return rsa; +} + +RSA *RSA_new_private_key(const BIGNUM *n, const BIGNUM *e, const BIGNUM *d, + const BIGNUM *p, const BIGNUM *q, const BIGNUM *dmp1, + const BIGNUM *dmq1, const BIGNUM *iqmp) { + RSAImpl *rsa = FromOpaque(RSA_new()); + if (rsa == nullptr || // + !bn_dup_into(&rsa->n, n) || // + !bn_dup_into(&rsa->e, e) || // + !bn_dup_into(&rsa->d, d) || // + !bn_dup_into(&rsa->p, p) || // + !bn_dup_into(&rsa->q, q) || // + !bn_dup_into(&rsa->dmp1, dmp1) || // + !bn_dup_into(&rsa->dmq1, dmq1) || // + !bn_dup_into(&rsa->iqmp, iqmp) || // + !RSA_check_key(rsa)) { + RSA_free(rsa); + return nullptr; + } + + return rsa; +} + +RSA *RSA_new_private_key_no_crt(const BIGNUM *n, const BIGNUM *e, + const BIGNUM *d) { + RSAImpl *rsa = FromOpaque(RSA_new()); + if (rsa == nullptr || // + !bn_dup_into(&rsa->n, n) || // + !bn_dup_into(&rsa->e, e) || // + !bn_dup_into(&rsa->d, d) || // + !RSA_check_key(rsa)) { + RSA_free(rsa); + return nullptr; + } 
+ + return rsa; +} + +RSA *RSA_new_private_key_no_e(const BIGNUM *n, const BIGNUM *d) { + RSAImpl *rsa = FromOpaque(RSA_new()); + if (rsa == nullptr) { + return nullptr; + } + + rsa->flags |= RSA_FLAG_NO_PUBLIC_EXPONENT; + if (!bn_dup_into(&rsa->n, n) || // + !bn_dup_into(&rsa->d, d) || // + !RSA_check_key(rsa)) { + RSA_free(rsa); + return nullptr; + } + + return rsa; +} + +RSA *RSA_new_public_key_large_e(const BIGNUM *n, const BIGNUM *e) { + RSAImpl *rsa = FromOpaque(RSA_new()); + if (rsa == nullptr) { + return nullptr; + } + + rsa->flags |= RSA_FLAG_LARGE_PUBLIC_EXPONENT; + if (!bn_dup_into(&rsa->n, n) || // + !bn_dup_into(&rsa->e, e) || // + !RSA_check_key(rsa)) { + RSA_free(rsa); + return nullptr; + } + + return rsa; +} + +RSA *RSA_new_private_key_large_e(const BIGNUM *n, const BIGNUM *e, + const BIGNUM *d, const BIGNUM *p, + const BIGNUM *q, const BIGNUM *dmp1, + const BIGNUM *dmq1, const BIGNUM *iqmp) { + RSAImpl *rsa = FromOpaque(RSA_new()); + if (rsa == nullptr) { + return nullptr; + } + + rsa->flags |= RSA_FLAG_LARGE_PUBLIC_EXPONENT; + if (!bn_dup_into(&rsa->n, n) || // + !bn_dup_into(&rsa->e, e) || // + !bn_dup_into(&rsa->d, d) || // + !bn_dup_into(&rsa->p, p) || // + !bn_dup_into(&rsa->q, q) || // + !bn_dup_into(&rsa->dmp1, dmp1) || // + !bn_dup_into(&rsa->dmq1, dmq1) || // + !bn_dup_into(&rsa->iqmp, iqmp) || // + !RSA_check_key(rsa)) { + RSA_free(rsa); + return nullptr; + } + + return rsa; +} + +RSAImpl::RSAImpl(const ENGINE *engine) + : RefCounted(CheckSubClass()), + meth(engine ? 
ENGINE_get_RSA_method(engine) : nullptr) { + if (meth == nullptr) { + meth = const_cast(RSA_default_method()); + } + METHOD_ref(meth); + flags = meth->flags; + CRYPTO_new_ex_data(&ex_data); +} + +RSA *RSA_new() { return RSA_new_method(nullptr); } + +RSA *RSA_new_method(const ENGINE *engine) { + UniquePtr rsa(New(engine)); + if (rsa == nullptr) { + return nullptr; + } + + if (rsa->meth->init && !rsa->meth->init(rsa.get())) { + METHOD_unref(rsa->meth); + rsa->meth = nullptr; + return nullptr; + } + + return rsa.release(); +} + +RSA *RSA_new_method_no_e(const ENGINE *engine, const BIGNUM *n) { + RSAImpl *rsa = FromOpaque(RSA_new_method(engine)); + if (rsa == nullptr || !bn_dup_into(&rsa->n, n)) { + RSA_free(rsa); + return nullptr; + } + rsa->flags |= RSA_FLAG_NO_PUBLIC_EXPONENT; + return rsa; +} + +RSAImpl::~RSAImpl() { + if (meth != nullptr && meth->finish != nullptr) { + meth->finish(this); + } + METHOD_unref(meth); + + CRYPTO_free_ex_data(g_rsa_ex_data_class_bss_get(), &ex_data); +} + +void RSA_free(RSA *rsa) { + if (rsa != nullptr) { + FromOpaque(rsa)->DecRefInternal(); + } +} + +int RSA_up_ref(RSA *rsa) { + FromOpaque(rsa)->UpRefInternal(); + return 1; +} + +unsigned RSA_bits(const RSA *rsa) { + return BN_num_bits(FromOpaque(rsa)->n.get()); +} + +const BIGNUM *RSA_get0_n(const RSA *rsa) { return FromOpaque(rsa)->n.get(); } + +const BIGNUM *RSA_get0_e(const RSA *rsa) { return FromOpaque(rsa)->e.get(); } + +const BIGNUM *RSA_get0_d(const RSA *rsa) { return FromOpaque(rsa)->d.get(); } + +const BIGNUM *RSA_get0_p(const RSA *rsa) { return FromOpaque(rsa)->p.get(); } + +const BIGNUM *RSA_get0_q(const RSA *rsa) { return FromOpaque(rsa)->q.get(); } + +const BIGNUM *RSA_get0_dmp1(const RSA *rsa) { + return FromOpaque(rsa)->dmp1.get(); +} + +const BIGNUM *RSA_get0_dmq1(const RSA *rsa) { + return FromOpaque(rsa)->dmq1.get(); +} + +const BIGNUM *RSA_get0_iqmp(const RSA *rsa) { + return FromOpaque(rsa)->iqmp.get(); +} + +void RSA_get0_key(const RSA *rsa, const BIGNUM **out_n, 
const BIGNUM **out_e, + const BIGNUM **out_d) { + auto *impl = FromOpaque(rsa); + if (out_n != nullptr) { + *out_n = impl->n.get(); + } + if (out_e != nullptr) { + *out_e = impl->e.get(); + } + if (out_d != nullptr) { + *out_d = impl->d.get(); + } +} + +void RSA_get0_factors(const RSA *rsa, const BIGNUM **out_p, + const BIGNUM **out_q) { + auto *impl = FromOpaque(rsa); + if (out_p != nullptr) { + *out_p = impl->p.get(); + } + if (out_q != nullptr) { + *out_q = impl->q.get(); + } +} + +void RSA_get0_crt_params(const RSA *rsa, const BIGNUM **out_dmp1, + const BIGNUM **out_dmq1, const BIGNUM **out_iqmp) { + auto *impl = FromOpaque(rsa); + if (out_dmp1 != nullptr) { + *out_dmp1 = impl->dmp1.get(); + } + if (out_dmq1 != nullptr) { + *out_dmq1 = impl->dmq1.get(); + } + if (out_iqmp != nullptr) { + *out_iqmp = impl->iqmp.get(); + } +} + +int RSA_set0_key(RSA *rsa, BIGNUM *n, BIGNUM *e, BIGNUM *d) { + auto *impl = FromOpaque(rsa); + + if ((impl->n == nullptr && n == nullptr) || + (impl->e == nullptr && e == nullptr)) { + return 0; + } + + if (n != nullptr) { + impl->n.reset(n); + } + if (e != nullptr) { + impl->e.reset(e); + } + if (d != nullptr) { + impl->d.reset(d); + } + + rsa_invalidate_key(rsa); + return 1; +} + +int RSA_set0_factors(RSA *rsa, BIGNUM *p, BIGNUM *q) { + auto *impl = FromOpaque(rsa); + + if ((impl->p == nullptr && p == nullptr) || + (impl->q == nullptr && q == nullptr)) { + return 0; + } + + if (p != nullptr) { + impl->p.reset(p); + } + if (q != nullptr) { + impl->q.reset(q); + } + + rsa_invalidate_key(rsa); + return 1; +} + +int RSA_set0_crt_params(RSA *rsa, BIGNUM *dmp1, BIGNUM *dmq1, BIGNUM *iqmp) { + auto *impl = FromOpaque(rsa); + + if ((impl->dmp1 == nullptr && dmp1 == nullptr) || + (impl->dmq1 == nullptr && dmq1 == nullptr) || + (impl->iqmp == nullptr && iqmp == nullptr)) { + return 0; + } + + if (dmp1 != nullptr) { + impl->dmp1.reset(dmp1); + } + if (dmq1 != nullptr) { + impl->dmq1.reset(dmq1); + } + if (iqmp != nullptr) { + 
impl->iqmp.reset(iqmp); + } + + rsa_invalidate_key(rsa); + return 1; +} + +static int rsa_sign_raw_no_self_test(RSA *rsa, size_t *out_len, uint8_t *out, + size_t max_out, const uint8_t *in, + size_t in_len, int padding) { + auto *impl = FromOpaque(rsa); + + if (impl->meth->sign_raw) { + return impl->meth->sign_raw(rsa, out_len, out, max_out, in, in_len, + padding); + } + + return rsa_default_sign_raw(rsa, out_len, out, max_out, in, in_len, padding); +} + +int RSA_sign_raw(RSA *rsa, size_t *out_len, uint8_t *out, size_t max_out, + const uint8_t *in, size_t in_len, int padding) { + boringssl_ensure_rsa_sign_self_test(); + return rsa_sign_raw_no_self_test(rsa, out_len, out, max_out, in, in_len, + padding); +} + +unsigned RSA_size(const RSA *rsa) { + return BN_num_bytes(FromOpaque(rsa)->n.get()); +} + +int RSA_is_opaque(const RSA *rsa) { + auto *impl = FromOpaque(rsa); + return impl->meth && (impl->meth->flags & RSA_FLAG_OPAQUE); +} + +int RSA_get_ex_new_index(long argl, void *argp, CRYPTO_EX_unused *unused, + CRYPTO_EX_dup *dup_unused, CRYPTO_EX_free *free_func) { + return CRYPTO_get_ex_new_index_ex(g_rsa_ex_data_class_bss_get(), argl, argp, + free_func); +} + +int RSA_set_ex_data(RSA *rsa, int idx, void *arg) { + auto *impl = FromOpaque(rsa); + return CRYPTO_set_ex_data(&impl->ex_data, idx, arg); +} + +void *RSA_get_ex_data(const RSA *rsa, int idx) { + auto *impl = FromOpaque(rsa); + return CRYPTO_get_ex_data(&impl->ex_data, idx); +} + +// SSL_SIG_LENGTH is the size of an SSL/TLS (prior to TLS 1.2) signature: it's +// the length of an MD5 and SHA1 hash. +static const unsigned SSL_SIG_LENGTH = 36; + +// pkcs1_sig_prefix contains the ASN.1, DER encoded prefix for a hash that is +// to be signed with PKCS#1. +struct pkcs1_sig_prefix { + // nid identifies the hash function. + int nid; + // hash_len is the expected length of the hash function. + uint8_t hash_len; + // len is the number of bytes of |bytes| which are valid. + uint8_t len; + // bytes contains the DER bytes. 
+ uint8_t bytes[19]; +}; + +// kPKCS1SigPrefixes contains the ASN.1 prefixes for PKCS#1 signatures with +// different hash functions. +static const struct pkcs1_sig_prefix kPKCS1SigPrefixes[] = { + { + NID_md5, + MD5_DIGEST_LENGTH, + 18, + {0x30, 0x20, 0x30, 0x0c, 0x06, 0x08, 0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, + 0x02, 0x05, 0x05, 0x00, 0x04, 0x10}, + }, + { + NID_sha1, + SHA_DIGEST_LENGTH, + 15, + {0x30, 0x21, 0x30, 0x09, 0x06, 0x05, 0x2b, 0x0e, 0x03, 0x02, 0x1a, 0x05, + 0x00, 0x04, 0x14}, + }, + { + NID_sha224, + SHA224_DIGEST_LENGTH, + 19, + {0x30, 0x2d, 0x30, 0x0d, 0x06, 0x09, 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, + 0x04, 0x02, 0x04, 0x05, 0x00, 0x04, 0x1c}, + }, + { + NID_sha256, + SHA256_DIGEST_LENGTH, + 19, + {0x30, 0x31, 0x30, 0x0d, 0x06, 0x09, 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, + 0x04, 0x02, 0x01, 0x05, 0x00, 0x04, 0x20}, + }, + { + NID_sha384, + SHA384_DIGEST_LENGTH, + 19, + {0x30, 0x41, 0x30, 0x0d, 0x06, 0x09, 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, + 0x04, 0x02, 0x02, 0x05, 0x00, 0x04, 0x30}, + }, + { + NID_sha512, + SHA512_DIGEST_LENGTH, + 19, + {0x30, 0x51, 0x30, 0x0d, 0x06, 0x09, 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, + 0x04, 0x02, 0x03, 0x05, 0x00, 0x04, 0x40}, + }, + { + NID_undef, + 0, + 0, + {0}, + }, +}; + +static int rsa_check_digest_size(int hash_nid, size_t digest_len) { + if (hash_nid == NID_md5_sha1) { + if (digest_len != SSL_SIG_LENGTH) { + OPENSSL_PUT_ERROR(RSA, RSA_R_INVALID_MESSAGE_LENGTH); + return 0; + } + return 1; + } + + for (size_t i = 0; kPKCS1SigPrefixes[i].nid != NID_undef; i++) { + const struct pkcs1_sig_prefix *sig_prefix = &kPKCS1SigPrefixes[i]; + if (sig_prefix->nid == hash_nid) { + if (digest_len != sig_prefix->hash_len) { + OPENSSL_PUT_ERROR(RSA, RSA_R_INVALID_MESSAGE_LENGTH); + return 0; + } + return 1; + } + } + + OPENSSL_PUT_ERROR(RSA, RSA_R_UNKNOWN_ALGORITHM_TYPE); + return 0; +} + +int RSA_add_pkcs1_prefix(uint8_t **out_msg, size_t *out_msg_len, + int *is_alloced, int hash_nid, const uint8_t *digest, + size_t digest_len) { + 
if (!rsa_check_digest_size(hash_nid, digest_len)) { + return 0; + } + + if (hash_nid == NID_md5_sha1) { + // The length should already have been checked. + assert(digest_len == SSL_SIG_LENGTH); + *out_msg = (uint8_t *)digest; + *out_msg_len = digest_len; + *is_alloced = 0; + return 1; + } + + for (size_t i = 0; kPKCS1SigPrefixes[i].nid != NID_undef; i++) { + const struct pkcs1_sig_prefix *sig_prefix = &kPKCS1SigPrefixes[i]; + if (sig_prefix->nid != hash_nid) { + continue; + } + + // The length should already have been checked. + assert(digest_len == sig_prefix->hash_len); + const uint8_t *prefix = sig_prefix->bytes; + size_t prefix_len = sig_prefix->len; + size_t signed_msg_len = prefix_len + digest_len; + if (signed_msg_len < prefix_len) { + OPENSSL_PUT_ERROR(RSA, RSA_R_TOO_LONG); + return 0; + } + + uint8_t *signed_msg = + reinterpret_cast(OPENSSL_malloc(signed_msg_len)); + if (!signed_msg) { + return 0; + } + + OPENSSL_memcpy(signed_msg, prefix, prefix_len); + OPENSSL_memcpy(signed_msg + prefix_len, digest, digest_len); + + *out_msg = signed_msg; + *out_msg_len = signed_msg_len; + *is_alloced = 1; + + return 1; + } + + OPENSSL_PUT_ERROR(RSA, RSA_R_UNKNOWN_ALGORITHM_TYPE); + return 0; +} + +int bssl::rsa_sign_no_self_test(int hash_nid, const uint8_t *digest, + size_t digest_len, uint8_t *out, + unsigned *out_len, RSA *rsa) { + auto *impl = FromOpaque(rsa); + + if (impl->meth->sign) { + if (!rsa_check_digest_size(hash_nid, digest_len)) { + return 0; + } + // All supported digest lengths fit in |unsigned|. 
+ assert(digest_len <= EVP_MAX_MD_SIZE); + static_assert(EVP_MAX_MD_SIZE <= UINT_MAX, "digest too long"); + return impl->meth->sign(hash_nid, digest, (unsigned)digest_len, out, + out_len, rsa); + } + + const unsigned rsa_size = RSA_size(rsa); + int ret = 0; + uint8_t *signed_msg = nullptr; + size_t signed_msg_len = 0; + int signed_msg_is_alloced = 0; + size_t size_t_out_len; + if (!RSA_add_pkcs1_prefix(&signed_msg, &signed_msg_len, + &signed_msg_is_alloced, hash_nid, digest, + digest_len) || + !rsa_sign_raw_no_self_test(rsa, &size_t_out_len, out, rsa_size, + signed_msg, signed_msg_len, + RSA_PKCS1_PADDING)) { + goto err; + } + + if (size_t_out_len > UINT_MAX) { + OPENSSL_PUT_ERROR(RSA, ERR_R_OVERFLOW); + goto err; + } + + *out_len = (unsigned)size_t_out_len; + ret = 1; + +err: + if (signed_msg_is_alloced) { + OPENSSL_free(signed_msg); + } + return ret; +} + +int RSA_sign(int hash_nid, const uint8_t *digest, size_t digest_len, + uint8_t *out, unsigned *out_len, RSA *rsa) { + boringssl_ensure_rsa_sign_self_test(); + + return rsa_sign_no_self_test(hash_nid, digest, digest_len, out, out_len, rsa); +} + +int RSA_sign_pss_mgf1(RSA *rsa, size_t *out_len, uint8_t *out, size_t max_out, + const uint8_t *digest, size_t digest_len, + const EVP_MD *md, const EVP_MD *mgf1_md, int salt_len) { + if (digest_len != EVP_MD_size(md)) { + OPENSSL_PUT_ERROR(RSA, RSA_R_INVALID_MESSAGE_LENGTH); + return 0; + } + + size_t padded_len = RSA_size(rsa); + uint8_t *padded = reinterpret_cast(OPENSSL_malloc(padded_len)); + if (padded == nullptr) { + return 0; + } + + int ret = RSA_padding_add_PKCS1_PSS_mgf1(rsa, padded, digest, md, mgf1_md, + salt_len) && + RSA_sign_raw(rsa, out_len, out, max_out, padded, padded_len, + RSA_NO_PADDING); + OPENSSL_free(padded); + return ret; +} + +int bssl::rsa_verify_no_self_test(int hash_nid, const uint8_t *digest, + size_t digest_len, const uint8_t *sig, + size_t sig_len, RSA *rsa) { + auto *impl = FromOpaque(rsa); + if (impl->n == nullptr || impl->e == nullptr) 
{ + OPENSSL_PUT_ERROR(RSA, RSA_R_VALUE_MISSING); + return 0; + } + + const size_t rsa_size = RSA_size(rsa); + uint8_t *buf = nullptr; + int ret = 0; + uint8_t *signed_msg = nullptr; + size_t signed_msg_len = 0, len; + int signed_msg_is_alloced = 0; + + if (hash_nid == NID_md5_sha1 && digest_len != SSL_SIG_LENGTH) { + OPENSSL_PUT_ERROR(RSA, RSA_R_INVALID_MESSAGE_LENGTH); + return 0; + } + + buf = reinterpret_cast(OPENSSL_malloc(rsa_size)); + if (!buf) { + return 0; + } + + if (!rsa_verify_raw_no_self_test(rsa, &len, buf, rsa_size, sig, sig_len, + RSA_PKCS1_PADDING) || + !RSA_add_pkcs1_prefix(&signed_msg, &signed_msg_len, + &signed_msg_is_alloced, hash_nid, digest, + digest_len)) { + goto out; + } + + // Check that no other information follows the hash value (FIPS 186-5 Section + // 5.4) and it matches the expected hash. + if (len != signed_msg_len || OPENSSL_memcmp(buf, signed_msg, len) != 0) { + OPENSSL_PUT_ERROR(RSA, RSA_R_BAD_SIGNATURE); + goto out; + } + + ret = 1; + +out: + OPENSSL_free(buf); + if (signed_msg_is_alloced) { + OPENSSL_free(signed_msg); + } + return ret; +} + +int RSA_verify(int hash_nid, const uint8_t *digest, size_t digest_len, + const uint8_t *sig, size_t sig_len, RSA *rsa) { + boringssl_ensure_rsa_verify_self_test(); + return rsa_verify_no_self_test(hash_nid, digest, digest_len, sig, sig_len, + rsa); +} + +int RSA_verify_pss_mgf1(RSA *rsa, const uint8_t *digest, size_t digest_len, + const EVP_MD *md, const EVP_MD *mgf1_md, int salt_len, + const uint8_t *sig, size_t sig_len) { + if (digest_len != EVP_MD_size(md)) { + OPENSSL_PUT_ERROR(RSA, RSA_R_INVALID_MESSAGE_LENGTH); + return 0; + } + + size_t em_len = RSA_size(rsa); + uint8_t *em = reinterpret_cast(OPENSSL_malloc(em_len)); + if (em == nullptr) { + return 0; + } + + int ret = 0; + if (!RSA_verify_raw(rsa, &em_len, em, em_len, sig, sig_len, RSA_NO_PADDING)) { + goto err; + } + + if (em_len != RSA_size(rsa)) { + OPENSSL_PUT_ERROR(RSA, ERR_R_INTERNAL_ERROR); + goto err; + } + + ret = 
RSA_verify_PKCS1_PSS_mgf1(rsa, digest, md, mgf1_md, em, salt_len); + +err: + OPENSSL_free(em); + return ret; +} + +static int check_mod_inverse(int *out_ok, const BIGNUM *a, const BIGNUM *ainv, + const BIGNUM *m, unsigned m_min_bits, + BN_CTX *ctx) { + if (BN_is_negative(ainv) || + constant_time_declassify_int(BN_cmp(ainv, m) >= 0)) { + *out_ok = 0; + return 1; + } + + // Note |bn_mul_consttime| and |bn_div_consttime| do not scale linearly, but + // checking |ainv| is in range bounds the running time, assuming |m|'s bounds + // were checked by the caller. + BN_CTXScope scope(ctx); + BIGNUM *tmp = BN_CTX_get(ctx); + if (tmp == nullptr || // + !bn_mul_consttime(tmp, a, ainv, ctx) || + !bn_div_consttime(nullptr, tmp, tmp, m, m_min_bits, ctx)) { + return 0; + } + *out_ok = constant_time_declassify_int(BN_is_one(tmp)); + return 1; +} + +int RSA_check_key(const RSA *key) { + // TODO(davidben): RSA key initialization is spread across + // |rsa_check_public_key|, |RSA_check_key|, |freeze_private_key|, and + // |BN_MONT_CTX_set_locked| as a result of API issues. See + // https://crbug.com/boringssl/316. As a result, we inconsistently check RSA + // invariants. We should fix this and integrate that logic. + + if (!rsa_check_public_key(key)) { + return 0; + } + + auto *impl = FromOpaque(key); + if ((impl->p != nullptr) != (impl->q != nullptr)) { + OPENSSL_PUT_ERROR(RSA, RSA_R_ONLY_ONE_OF_P_Q_GIVEN); + return 0; + } + + // |impl->d| must be bounded by |impl->n|. This ensures bounds on |RSA_bits| + // translate to bounds on the running time of private key operations. + if (impl->d != nullptr && (BN_is_negative(impl->d.get()) || + BN_cmp(impl->d.get(), impl->n.get()) >= 0)) { + OPENSSL_PUT_ERROR(RSA, RSA_R_D_OUT_OF_RANGE); + return 0; + } + + if (impl->d == nullptr || impl->p == nullptr) { + // For a public key, or without p and q, there's nothing that can be + // checked. 
+ return 1; + } + + BN_CTX *ctx = BN_CTX_new(); + if (ctx == nullptr) { + return 0; + } + + BIGNUM tmp, de, pm1, qm1, dmp1, dmq1; + int ok = 0; + bool has_crt_values; + unsigned pm1_bits, qm1_bits; + BN_init(&tmp); + BN_init(&de); + BN_init(&pm1); + BN_init(&qm1); + BN_init(&dmp1); + BN_init(&dmq1); + + // Check that p * q == n. Before we multiply, we check that p and q are in + // bounds, to avoid a DoS vector in |bn_mul_consttime| below. Note that + // n was bound by |rsa_check_public_key|. This also implicitly checks p and q + // are odd, which is a necessary condition for Montgomery reduction. + if (BN_is_negative(impl->p.get()) || + constant_time_declassify_int(BN_cmp(impl->p.get(), impl->n.get()) >= 0) || + BN_is_negative(impl->q.get()) || + constant_time_declassify_int(BN_cmp(impl->q.get(), impl->n.get()) >= 0)) { + OPENSSL_PUT_ERROR(RSA, RSA_R_N_NOT_EQUAL_P_Q); + goto out; + } + if (!bn_mul_consttime(&tmp, impl->p.get(), impl->q.get(), ctx)) { + OPENSSL_PUT_ERROR(RSA, ERR_LIB_BN); + goto out; + } + if (BN_cmp(&tmp, impl->n.get()) != 0) { + OPENSSL_PUT_ERROR(RSA, RSA_R_N_NOT_EQUAL_P_Q); + goto out; + } + + // d must be an inverse of e mod the Carmichael totient, lcm(p-1, q-1), but it + // may be unreduced because other implementations use the Euler totient. We + // simply check that d * e is one mod p-1 and mod q-1. Note d and e were bound + // by earlier checks in this function. 
+ if (!bn_usub_consttime(&pm1, impl->p.get(), BN_value_one()) || + !bn_usub_consttime(&qm1, impl->q.get(), BN_value_one())) { + OPENSSL_PUT_ERROR(RSA, ERR_LIB_BN); + goto out; + } + pm1_bits = BN_num_bits(&pm1); + qm1_bits = BN_num_bits(&qm1); + if (!bn_mul_consttime(&de, impl->d.get(), impl->e.get(), ctx) || + !bn_div_consttime(nullptr, &tmp, &de, &pm1, pm1_bits, ctx) || + !bn_div_consttime(nullptr, &de, &de, &qm1, qm1_bits, ctx)) { + OPENSSL_PUT_ERROR(RSA, ERR_LIB_BN); + goto out; + } + + if (constant_time_declassify_int(!BN_is_one(&tmp)) || + constant_time_declassify_int(!BN_is_one(&de))) { + OPENSSL_PUT_ERROR(RSA, RSA_R_D_E_NOT_CONGRUENT_TO_1); + goto out; + } + + has_crt_values = impl->dmp1 != nullptr; + if (has_crt_values != (impl->dmq1 != nullptr) || + has_crt_values != (impl->iqmp != nullptr)) { + OPENSSL_PUT_ERROR(RSA, RSA_R_INCONSISTENT_SET_OF_CRT_VALUES); + goto out; + } + + if (has_crt_values) { + int dmp1_ok, dmq1_ok, iqmp_ok; + if (!check_mod_inverse(&dmp1_ok, impl->e.get(), impl->dmp1.get(), &pm1, + pm1_bits, ctx) || + !check_mod_inverse(&dmq1_ok, impl->e.get(), impl->dmq1.get(), &qm1, + qm1_bits, ctx) || + // |p| is odd, so |pm1| and |p| have the same bit width. If they didn't, + // we only need a lower bound anyway. + !check_mod_inverse(&iqmp_ok, impl->q.get(), impl->iqmp.get(), + impl->p.get(), pm1_bits, ctx)) { + OPENSSL_PUT_ERROR(RSA, ERR_LIB_BN); + goto out; + } + + if (!dmp1_ok || !dmq1_ok || !iqmp_ok) { + OPENSSL_PUT_ERROR(RSA, RSA_R_CRT_VALUES_INCORRECT); + goto out; + } + } + + ok = 1; + +out: + BN_free(&tmp); + BN_free(&de); + BN_free(&pm1); + BN_free(&qm1); + BN_free(&dmp1); + BN_free(&dmq1); + BN_CTX_free(ctx); + + return ok; +} + + +// This is the product of the 132 smallest odd primes, from 3 to 751. 
+static const BN_ULONG kSmallFactorsLimbs[] = {TOBN(0xc4309333, 0x3ef4e3e1), + TOBN(0x71161eb6, 0xcd2d655f), + TOBN(0x95e2238c, 0x0bf94862), + TOBN(0x3eb233d3, 0x24f7912b), + TOBN(0x6b55514b, 0xbf26c483), + TOBN(0x0a84d817, 0x5a144871), + TOBN(0x77d12fee, 0x9b82210a), + TOBN(0xdb5b93c2, 0x97f050b3), + TOBN(0x4acad6b9, 0x4d6c026b), + TOBN(0xeb7751f3, 0x54aec893), + TOBN(0xdba53368, 0x36bc85c4), + TOBN(0xd85a1b28, 0x7f5ec78e), + TOBN(0x2eb072d8, 0x6b322244), + TOBN(0xbba51112, 0x5e2b3aea), + TOBN(0x36ed1a6c, 0x0e2486bf), + TOBN(0x5f270460, 0xec0c5727), + 0x000017b1}; + +DEFINE_LOCAL_DATA(BIGNUM, g_small_factors) { + out->d = const_cast(kSmallFactorsLimbs); + out->width = std::size(kSmallFactorsLimbs); + out->dmax = out->width; + out->neg = 0; + out->flags = BN_FLG_STATIC_DATA; +} + +int RSA_check_fips(RSA *key) { + if (!RSA_check_key(key)) { + return 0; + } + + BN_CTX *ctx = BN_CTX_new(); + if (ctx == nullptr) { + return 0; + } + + BIGNUM small_gcd; + BN_init(&small_gcd); + + int ret = 1; + + // Perform partial public key validation of RSA keys (SP 800-89 5.3.3). + // Although this is not for primality testing, SP 800-89 cites an RSA + // primality testing algorithm, so we use |BN_prime_checks_for_generation| to + // match. This is only a plausibility test and we expect the value to be + // composite, so too few iterations will cause us to reject the key, not use + // an implausible one. + // + // |key->e| may be nullptr if created with |RSA_new_private_key_no_e|. 
+ enum bn_primality_result_t primality_result; + auto *impl = FromOpaque(key); + if (impl->e.get() == nullptr || // + BN_num_bits(impl->e.get()) <= 16 || // + BN_num_bits(impl->e.get()) > 256 || // + !BN_is_odd(impl->n.get()) || // + !BN_is_odd(impl->e.get()) || + !BN_gcd(&small_gcd, impl->n.get(), g_small_factors(), ctx) || + !BN_is_one(&small_gcd) || + !BN_enhanced_miller_rabin_primality_test(&primality_result, impl->n.get(), + BN_prime_checks_for_generation, + ctx, nullptr) || + primality_result != bn_non_prime_power_composite) { + OPENSSL_PUT_ERROR(RSA, RSA_R_PUBLIC_KEY_VALIDATION_FAILED); + ret = 0; + } + + BN_free(&small_gcd); + BN_CTX_free(ctx); + + if (!ret || impl->d == nullptr || impl->p == nullptr) { + // On a failure or on only a public key, there's nothing else can be + // checked. + return ret; + } + + // FIPS pairwise consistency test (FIPS 140-2 4.9.2). Per FIPS 140-2 IG, + // section 9.9, it is not known whether |rsa| will be used for signing or + // encryption, so either pair-wise consistency self-test is acceptable. We + // perform a signing test. 
+ uint8_t data[32] = {0}; + unsigned sig_len = RSA_size(key); + uint8_t *sig = reinterpret_cast(OPENSSL_malloc(sig_len)); + if (sig == nullptr) { + return 0; + } + + if (!RSA_sign(NID_sha256, data, sizeof(data), sig, &sig_len, key)) { + OPENSSL_PUT_ERROR(RSA, ERR_R_INTERNAL_ERROR); + ret = 0; + goto cleanup; + } + if (boringssl_fips_break_test("RSA_PWCT")) { + data[0] = ~data[0]; + } + if (!RSA_verify(NID_sha256, data, sizeof(data), sig, sig_len, key)) { + OPENSSL_PUT_ERROR(RSA, ERR_R_INTERNAL_ERROR); + ret = 0; + } + +cleanup: + OPENSSL_free(sig); + + return ret; +} + +int bssl::rsa_private_transform_no_self_test(RSA *rsa, uint8_t *out, + const uint8_t *in, size_t len) { + auto *impl = FromOpaque(rsa); + + if (impl->meth->private_transform) { + return impl->meth->private_transform(rsa, out, in, len); + } + + return rsa_default_private_transform(rsa, out, in, len); +} + +int bssl::rsa_private_transform(RSA *rsa, uint8_t *out, const uint8_t *in, + size_t len) { + boringssl_ensure_rsa_sign_self_test(); + return rsa_private_transform_no_self_test(rsa, out, in, len); +} + +int RSA_flags(const RSA *rsa) { + auto *impl = FromOpaque(rsa); + return impl->flags; +} + +int RSA_test_flags(const RSA *rsa, int flags) { + auto *impl = FromOpaque(rsa); + return impl->flags & flags; +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/rsa/rsa_impl.c b/third_party/boringssl/src/crypto/fipsmodule/rsa/rsa_impl.c deleted file mode 100644 index 2bef70da..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/rsa/rsa_impl.c +++ /dev/null @@ -1,1434 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. 
The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. 
If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include -#include -#include - -#include -#include -#include -#include - -#include "../../internal.h" -#include "../bn/internal.h" -#include "../delocate.h" -#include "../rand/fork_detect.h" -#include "../service_indicator/internal.h" -#include "internal.h" - - -int rsa_check_public_key(const RSA *rsa) { - if (rsa->n == NULL || rsa->e == NULL) { - OPENSSL_PUT_ERROR(RSA, RSA_R_VALUE_MISSING); - return 0; - } - - unsigned n_bits = BN_num_bits(rsa->n); - if (n_bits > 16 * 1024) { - OPENSSL_PUT_ERROR(RSA, RSA_R_MODULUS_TOO_LARGE); - return 0; - } - - // Mitigate DoS attacks by limiting the exponent size. 33 bits was chosen as - // the limit based on the recommendations in [1] and [2]. 
Windows CryptoAPI - // doesn't support values larger than 32 bits [3], so it is unlikely that - // exponents larger than 32 bits are being used for anything Windows commonly - // does. - // - // [1] https://www.imperialviolet.org/2012/03/16/rsae.html - // [2] https://www.imperialviolet.org/2012/03/17/rsados.html - // [3] https://msdn.microsoft.com/en-us/library/aa387685(VS.85).aspx - static const unsigned kMaxExponentBits = 33; - unsigned e_bits = BN_num_bits(rsa->e); - if (e_bits > kMaxExponentBits || - // Additionally reject e = 1 or even e. e must be odd to be relatively - // prime with phi(n). - e_bits < 2 || - !BN_is_odd(rsa->e)) { - OPENSSL_PUT_ERROR(RSA, RSA_R_BAD_E_VALUE); - return 0; - } - - // Verify |n > e|. Comparing |n_bits| to |kMaxExponentBits| is a small - // shortcut to comparing |n| and |e| directly. In reality, |kMaxExponentBits| - // is much smaller than the minimum RSA key size that any application should - // accept. - if (n_bits <= kMaxExponentBits) { - OPENSSL_PUT_ERROR(RSA, RSA_R_KEY_SIZE_TOO_SMALL); - return 0; - } - assert(BN_ucmp(rsa->n, rsa->e) > 0); - - return 1; -} - -static int ensure_fixed_copy(BIGNUM **out, const BIGNUM *in, int width) { - if (*out != NULL) { - return 1; - } - BIGNUM *copy = BN_dup(in); - if (copy == NULL || - !bn_resize_words(copy, width)) { - BN_free(copy); - return 0; - } - *out = copy; - CONSTTIME_SECRET(copy->d, sizeof(BN_ULONG) * width); - - return 1; -} - -// freeze_private_key finishes initializing |rsa|'s private key components. -// After this function has returned, |rsa| may not be changed. This is needed -// because |RSA| is a public struct and, additionally, OpenSSL 1.1.0 opaquified -// it wrong (see https://github.com/openssl/openssl/issues/5158). 
-static int freeze_private_key(RSA *rsa, BN_CTX *ctx) { - CRYPTO_MUTEX_lock_read(&rsa->lock); - int frozen = rsa->private_key_frozen; - CRYPTO_MUTEX_unlock_read(&rsa->lock); - if (frozen) { - return 1; - } - - int ret = 0; - CRYPTO_MUTEX_lock_write(&rsa->lock); - if (rsa->private_key_frozen) { - ret = 1; - goto err; - } - - // Pre-compute various intermediate values, as well as copies of private - // exponents with correct widths. Note that other threads may concurrently - // read from |rsa->n|, |rsa->e|, etc., so any fixes must be in separate - // copies. We use |mont_n->N|, |mont_p->N|, and |mont_q->N| as copies of |n|, - // |p|, and |q| with the correct minimal widths. - - if (rsa->mont_n == NULL) { - rsa->mont_n = BN_MONT_CTX_new_for_modulus(rsa->n, ctx); - if (rsa->mont_n == NULL) { - goto err; - } - } - const BIGNUM *n_fixed = &rsa->mont_n->N; - - // The only public upper-bound of |rsa->d| is the bit length of |rsa->n|. The - // ASN.1 serialization of RSA private keys unfortunately leaks the byte length - // of |rsa->d|, but normalize it so we only leak it once, rather than per - // operation. - if (rsa->d != NULL && - !ensure_fixed_copy(&rsa->d_fixed, rsa->d, n_fixed->width)) { - goto err; - } - - if (rsa->p != NULL && rsa->q != NULL) { - // TODO: p and q are also CONSTTIME_SECRET but not yet marked as such - // because the Montgomery code does things like test whether or not values - // are zero. So the secret marking probably needs to happen inside that - // code. - - if (rsa->mont_p == NULL) { - rsa->mont_p = BN_MONT_CTX_new_consttime(rsa->p, ctx); - if (rsa->mont_p == NULL) { - goto err; - } - } - const BIGNUM *p_fixed = &rsa->mont_p->N; - - if (rsa->mont_q == NULL) { - rsa->mont_q = BN_MONT_CTX_new_consttime(rsa->q, ctx); - if (rsa->mont_q == NULL) { - goto err; - } - } - const BIGNUM *q_fixed = &rsa->mont_q->N; - - if (rsa->dmp1 != NULL && rsa->dmq1 != NULL) { - // Key generation relies on this function to compute |iqmp|. 
- if (rsa->iqmp == NULL) { - BIGNUM *iqmp = BN_new(); - if (iqmp == NULL || - !bn_mod_inverse_secret_prime(iqmp, rsa->q, rsa->p, ctx, - rsa->mont_p)) { - BN_free(iqmp); - goto err; - } - rsa->iqmp = iqmp; - } - - // CRT components are only publicly bounded by their corresponding - // moduli's bit lengths. |rsa->iqmp| is unused outside of this one-time - // setup, so we do not compute a fixed-width version of it. - if (!ensure_fixed_copy(&rsa->dmp1_fixed, rsa->dmp1, p_fixed->width) || - !ensure_fixed_copy(&rsa->dmq1_fixed, rsa->dmq1, q_fixed->width)) { - goto err; - } - - // Compute |inv_small_mod_large_mont|. Note that it is always modulo the - // larger prime, independent of what is stored in |rsa->iqmp|. - if (rsa->inv_small_mod_large_mont == NULL) { - BIGNUM *inv_small_mod_large_mont = BN_new(); - int ok; - if (BN_cmp(rsa->p, rsa->q) < 0) { - ok = inv_small_mod_large_mont != NULL && - bn_mod_inverse_secret_prime(inv_small_mod_large_mont, rsa->p, - rsa->q, ctx, rsa->mont_q) && - BN_to_montgomery(inv_small_mod_large_mont, - inv_small_mod_large_mont, rsa->mont_q, ctx); - } else { - ok = inv_small_mod_large_mont != NULL && - BN_to_montgomery(inv_small_mod_large_mont, rsa->iqmp, - rsa->mont_p, ctx); - } - if (!ok) { - BN_free(inv_small_mod_large_mont); - goto err; - } - rsa->inv_small_mod_large_mont = inv_small_mod_large_mont; - CONSTTIME_SECRET( - rsa->inv_small_mod_large_mont->d, - sizeof(BN_ULONG) * rsa->inv_small_mod_large_mont->width); - } - } - } - - rsa->private_key_frozen = 1; - ret = 1; - -err: - CRYPTO_MUTEX_unlock_write(&rsa->lock); - return ret; -} - -size_t rsa_default_size(const RSA *rsa) { - return BN_num_bytes(rsa->n); -} - -int RSA_encrypt(RSA *rsa, size_t *out_len, uint8_t *out, size_t max_out, - const uint8_t *in, size_t in_len, int padding) { - boringssl_ensure_rsa_self_test(); - - if (!rsa_check_public_key(rsa)) { - return 0; - } - - const unsigned rsa_size = RSA_size(rsa); - BIGNUM *f, *result; - uint8_t *buf = NULL; - BN_CTX *ctx = NULL; - int 
i, ret = 0; - - if (max_out < rsa_size) { - OPENSSL_PUT_ERROR(RSA, RSA_R_OUTPUT_BUFFER_TOO_SMALL); - return 0; - } - - ctx = BN_CTX_new(); - if (ctx == NULL) { - goto err; - } - - BN_CTX_start(ctx); - f = BN_CTX_get(ctx); - result = BN_CTX_get(ctx); - buf = OPENSSL_malloc(rsa_size); - if (!f || !result || !buf) { - OPENSSL_PUT_ERROR(RSA, ERR_R_MALLOC_FAILURE); - goto err; - } - - switch (padding) { - case RSA_PKCS1_PADDING: - i = RSA_padding_add_PKCS1_type_2(buf, rsa_size, in, in_len); - break; - case RSA_PKCS1_OAEP_PADDING: - // Use the default parameters: SHA-1 for both hashes and no label. - i = RSA_padding_add_PKCS1_OAEP_mgf1(buf, rsa_size, in, in_len, - NULL, 0, NULL, NULL); - break; - case RSA_NO_PADDING: - i = RSA_padding_add_none(buf, rsa_size, in, in_len); - break; - default: - OPENSSL_PUT_ERROR(RSA, RSA_R_UNKNOWN_PADDING_TYPE); - goto err; - } - - if (i <= 0) { - goto err; - } - - if (BN_bin2bn(buf, rsa_size, f) == NULL) { - goto err; - } - - if (BN_ucmp(f, rsa->n) >= 0) { - // usually the padding functions would catch this - OPENSSL_PUT_ERROR(RSA, RSA_R_DATA_TOO_LARGE_FOR_MODULUS); - goto err; - } - - if (!BN_MONT_CTX_set_locked(&rsa->mont_n, &rsa->lock, rsa->n, ctx) || - !BN_mod_exp_mont(result, f, rsa->e, &rsa->mont_n->N, ctx, rsa->mont_n)) { - goto err; - } - - // put in leading 0 bytes if the number is less than the length of the - // modulus - if (!BN_bn2bin_padded(out, rsa_size, result)) { - OPENSSL_PUT_ERROR(RSA, ERR_R_INTERNAL_ERROR); - goto err; - } - - *out_len = rsa_size; - ret = 1; - -err: - if (ctx != NULL) { - BN_CTX_end(ctx); - BN_CTX_free(ctx); - } - OPENSSL_free(buf); - - return ret; -} - -// MAX_BLINDINGS_PER_RSA defines the maximum number of cached BN_BLINDINGs per -// RSA*. Then this limit is exceeded, BN_BLINDING objects will be created and -// destroyed as needed. -#if defined(OPENSSL_TSAN) -// Smaller under TSAN so that the edge case can be hit with fewer threads. 
-#define MAX_BLINDINGS_PER_RSA 2 -#else -#define MAX_BLINDINGS_PER_RSA 1024 -#endif - -// rsa_blinding_get returns a BN_BLINDING to use with |rsa|. It does this by -// allocating one of the cached BN_BLINDING objects in |rsa->blindings|. If -// none are free, the cache will be extended by a extra element and the new -// BN_BLINDING is returned. -// -// On success, the index of the assigned BN_BLINDING is written to -// |*index_used| and must be passed to |rsa_blinding_release| when finished. -static BN_BLINDING *rsa_blinding_get(RSA *rsa, unsigned *index_used, - BN_CTX *ctx) { - assert(ctx != NULL); - assert(rsa->mont_n != NULL); - - BN_BLINDING *ret = NULL; - const uint64_t fork_generation = CRYPTO_get_fork_generation(); - CRYPTO_MUTEX_lock_write(&rsa->lock); - - // Wipe the blinding cache on |fork|. - if (rsa->blinding_fork_generation != fork_generation) { - for (unsigned i = 0; i < rsa->num_blindings; i++) { - // The inuse flag must be zero unless we were forked from a - // multi-threaded process, in which case calling back into BoringSSL is - // forbidden. - assert(rsa->blindings_inuse[i] == 0); - BN_BLINDING_invalidate(rsa->blindings[i]); - } - rsa->blinding_fork_generation = fork_generation; - } - - uint8_t *const free_inuse_flag = - OPENSSL_memchr(rsa->blindings_inuse, 0, rsa->num_blindings); - if (free_inuse_flag != NULL) { - *free_inuse_flag = 1; - *index_used = free_inuse_flag - rsa->blindings_inuse; - ret = rsa->blindings[*index_used]; - goto out; - } - - if (rsa->num_blindings >= MAX_BLINDINGS_PER_RSA) { - // No |BN_BLINDING| is free and nor can the cache be extended. This index - // value is magic and indicates to |rsa_blinding_release| that a - // |BN_BLINDING| was not inserted into the array. - *index_used = MAX_BLINDINGS_PER_RSA; - ret = BN_BLINDING_new(); - goto out; - } - - // Double the length of the cache. 
- static_assert(MAX_BLINDINGS_PER_RSA < UINT_MAX / 2, - "MAX_BLINDINGS_PER_RSA too large"); - unsigned new_num_blindings = rsa->num_blindings * 2; - if (new_num_blindings == 0) { - new_num_blindings = 1; - } - if (new_num_blindings > MAX_BLINDINGS_PER_RSA) { - new_num_blindings = MAX_BLINDINGS_PER_RSA; - } - assert(new_num_blindings > rsa->num_blindings); - - static_assert(MAX_BLINDINGS_PER_RSA < UINT_MAX / sizeof(BN_BLINDING *), - "MAX_BLINDINGS_PER_RSA too large"); - BN_BLINDING **new_blindings = - OPENSSL_malloc(sizeof(BN_BLINDING *) * new_num_blindings); - uint8_t *new_blindings_inuse = OPENSSL_malloc(new_num_blindings); - if (new_blindings == NULL || new_blindings_inuse == NULL) { - goto err; - } - - OPENSSL_memcpy(new_blindings, rsa->blindings, - sizeof(BN_BLINDING *) * rsa->num_blindings); - OPENSSL_memcpy(new_blindings_inuse, rsa->blindings_inuse, rsa->num_blindings); - - for (unsigned i = rsa->num_blindings; i < new_num_blindings; i++) { - new_blindings[i] = BN_BLINDING_new(); - if (new_blindings[i] == NULL) { - for (unsigned j = rsa->num_blindings; j < i; j++) { - BN_BLINDING_free(new_blindings[j]); - } - goto err; - } - } - memset(&new_blindings_inuse[rsa->num_blindings], 0, - new_num_blindings - rsa->num_blindings); - - new_blindings_inuse[rsa->num_blindings] = 1; - *index_used = rsa->num_blindings; - assert(*index_used != MAX_BLINDINGS_PER_RSA); - ret = new_blindings[rsa->num_blindings]; - - OPENSSL_free(rsa->blindings); - rsa->blindings = new_blindings; - OPENSSL_free(rsa->blindings_inuse); - rsa->blindings_inuse = new_blindings_inuse; - rsa->num_blindings = new_num_blindings; - - goto out; - -err: - OPENSSL_free(new_blindings_inuse); - OPENSSL_free(new_blindings); - -out: - CRYPTO_MUTEX_unlock_write(&rsa->lock); - return ret; -} - -// rsa_blinding_release marks the cached BN_BLINDING at the given index as free -// for other threads to use. 
-static void rsa_blinding_release(RSA *rsa, BN_BLINDING *blinding, - unsigned blinding_index) { - if (blinding_index == MAX_BLINDINGS_PER_RSA) { - // This blinding wasn't cached. - BN_BLINDING_free(blinding); - return; - } - - CRYPTO_MUTEX_lock_write(&rsa->lock); - rsa->blindings_inuse[blinding_index] = 0; - CRYPTO_MUTEX_unlock_write(&rsa->lock); -} - -// signing -int rsa_default_sign_raw(RSA *rsa, size_t *out_len, uint8_t *out, - size_t max_out, const uint8_t *in, size_t in_len, - int padding) { - const unsigned rsa_size = RSA_size(rsa); - uint8_t *buf = NULL; - int i, ret = 0; - - if (max_out < rsa_size) { - OPENSSL_PUT_ERROR(RSA, RSA_R_OUTPUT_BUFFER_TOO_SMALL); - return 0; - } - - buf = OPENSSL_malloc(rsa_size); - if (buf == NULL) { - OPENSSL_PUT_ERROR(RSA, ERR_R_MALLOC_FAILURE); - goto err; - } - - switch (padding) { - case RSA_PKCS1_PADDING: - i = RSA_padding_add_PKCS1_type_1(buf, rsa_size, in, in_len); - break; - case RSA_NO_PADDING: - i = RSA_padding_add_none(buf, rsa_size, in, in_len); - break; - default: - OPENSSL_PUT_ERROR(RSA, RSA_R_UNKNOWN_PADDING_TYPE); - goto err; - } - - if (i <= 0) { - goto err; - } - - if (!RSA_private_transform(rsa, out, buf, rsa_size)) { - goto err; - } - - CONSTTIME_DECLASSIFY(out, rsa_size); - *out_len = rsa_size; - ret = 1; - -err: - OPENSSL_free(buf); - - return ret; -} - -int rsa_default_decrypt(RSA *rsa, size_t *out_len, uint8_t *out, size_t max_out, - const uint8_t *in, size_t in_len, int padding) { - boringssl_ensure_rsa_self_test(); - - const unsigned rsa_size = RSA_size(rsa); - uint8_t *buf = NULL; - int ret = 0; - - if (max_out < rsa_size) { - OPENSSL_PUT_ERROR(RSA, RSA_R_OUTPUT_BUFFER_TOO_SMALL); - return 0; - } - - if (padding == RSA_NO_PADDING) { - buf = out; - } else { - // Allocate a temporary buffer to hold the padded plaintext. 
- buf = OPENSSL_malloc(rsa_size); - if (buf == NULL) { - OPENSSL_PUT_ERROR(RSA, ERR_R_MALLOC_FAILURE); - goto err; - } - } - - if (in_len != rsa_size) { - OPENSSL_PUT_ERROR(RSA, RSA_R_DATA_LEN_NOT_EQUAL_TO_MOD_LEN); - goto err; - } - - if (!RSA_private_transform(rsa, buf, in, rsa_size)) { - goto err; - } - - switch (padding) { - case RSA_PKCS1_PADDING: - ret = - RSA_padding_check_PKCS1_type_2(out, out_len, rsa_size, buf, rsa_size); - break; - case RSA_PKCS1_OAEP_PADDING: - // Use the default parameters: SHA-1 for both hashes and no label. - ret = RSA_padding_check_PKCS1_OAEP_mgf1(out, out_len, rsa_size, buf, - rsa_size, NULL, 0, NULL, NULL); - break; - case RSA_NO_PADDING: - *out_len = rsa_size; - ret = 1; - break; - default: - OPENSSL_PUT_ERROR(RSA, RSA_R_UNKNOWN_PADDING_TYPE); - goto err; - } - - CONSTTIME_DECLASSIFY(&ret, sizeof(ret)); - if (!ret) { - OPENSSL_PUT_ERROR(RSA, RSA_R_PADDING_CHECK_FAILED); - } else { - CONSTTIME_DECLASSIFY(out, *out_len); - } - -err: - if (padding != RSA_NO_PADDING) { - OPENSSL_free(buf); - } - - return ret; -} - -static int mod_exp(BIGNUM *r0, const BIGNUM *I, RSA *rsa, BN_CTX *ctx); - -int rsa_verify_raw_no_self_test(RSA *rsa, size_t *out_len, uint8_t *out, - size_t max_out, const uint8_t *in, - size_t in_len, int padding) { - if (!rsa_check_public_key(rsa)) { - return 0; - } - - const unsigned rsa_size = RSA_size(rsa); - BIGNUM *f, *result; - - if (max_out < rsa_size) { - OPENSSL_PUT_ERROR(RSA, RSA_R_OUTPUT_BUFFER_TOO_SMALL); - return 0; - } - - if (in_len != rsa_size) { - OPENSSL_PUT_ERROR(RSA, RSA_R_DATA_LEN_NOT_EQUAL_TO_MOD_LEN); - return 0; - } - - BN_CTX *ctx = BN_CTX_new(); - if (ctx == NULL) { - return 0; - } - - int ret = 0; - uint8_t *buf = NULL; - - BN_CTX_start(ctx); - f = BN_CTX_get(ctx); - result = BN_CTX_get(ctx); - if (f == NULL || result == NULL) { - OPENSSL_PUT_ERROR(RSA, ERR_R_MALLOC_FAILURE); - goto err; - } - - if (padding == RSA_NO_PADDING) { - buf = out; - } else { - // Allocate a temporary buffer to hold 
the padded plaintext. - buf = OPENSSL_malloc(rsa_size); - if (buf == NULL) { - OPENSSL_PUT_ERROR(RSA, ERR_R_MALLOC_FAILURE); - goto err; - } - } - - if (BN_bin2bn(in, in_len, f) == NULL) { - goto err; - } - - if (BN_ucmp(f, rsa->n) >= 0) { - OPENSSL_PUT_ERROR(RSA, RSA_R_DATA_TOO_LARGE_FOR_MODULUS); - goto err; - } - - if (!BN_MONT_CTX_set_locked(&rsa->mont_n, &rsa->lock, rsa->n, ctx) || - !BN_mod_exp_mont(result, f, rsa->e, &rsa->mont_n->N, ctx, rsa->mont_n)) { - goto err; - } - - if (!BN_bn2bin_padded(buf, rsa_size, result)) { - OPENSSL_PUT_ERROR(RSA, ERR_R_INTERNAL_ERROR); - goto err; - } - - switch (padding) { - case RSA_PKCS1_PADDING: - ret = - RSA_padding_check_PKCS1_type_1(out, out_len, rsa_size, buf, rsa_size); - break; - case RSA_NO_PADDING: - ret = 1; - *out_len = rsa_size; - break; - default: - OPENSSL_PUT_ERROR(RSA, RSA_R_UNKNOWN_PADDING_TYPE); - goto err; - } - - if (!ret) { - OPENSSL_PUT_ERROR(RSA, RSA_R_PADDING_CHECK_FAILED); - goto err; - } - -err: - BN_CTX_end(ctx); - BN_CTX_free(ctx); - if (buf != out) { - OPENSSL_free(buf); - } - return ret; -} - -int RSA_verify_raw(RSA *rsa, size_t *out_len, uint8_t *out, - size_t max_out, const uint8_t *in, - size_t in_len, int padding) { - boringssl_ensure_rsa_self_test(); - return rsa_verify_raw_no_self_test(rsa, out_len, out, max_out, in, in_len, - padding); -} - -int rsa_default_private_transform(RSA *rsa, uint8_t *out, const uint8_t *in, - size_t len) { - if (rsa->n == NULL || rsa->d == NULL) { - OPENSSL_PUT_ERROR(RSA, RSA_R_VALUE_MISSING); - return 0; - } - - BIGNUM *f, *result; - BN_CTX *ctx = NULL; - unsigned blinding_index = 0; - BN_BLINDING *blinding = NULL; - int ret = 0; - - ctx = BN_CTX_new(); - if (ctx == NULL) { - goto err; - } - BN_CTX_start(ctx); - f = BN_CTX_get(ctx); - result = BN_CTX_get(ctx); - - if (f == NULL || result == NULL) { - OPENSSL_PUT_ERROR(RSA, ERR_R_MALLOC_FAILURE); - goto err; - } - - if (BN_bin2bn(in, len, f) == NULL) { - goto err; - } - - if (BN_ucmp(f, rsa->n) >= 0) { - // 
Usually the padding functions would catch this. - OPENSSL_PUT_ERROR(RSA, RSA_R_DATA_TOO_LARGE_FOR_MODULUS); - goto err; - } - - if (!freeze_private_key(rsa, ctx)) { - OPENSSL_PUT_ERROR(RSA, ERR_R_INTERNAL_ERROR); - goto err; - } - - const int do_blinding = (rsa->flags & RSA_FLAG_NO_BLINDING) == 0; - - if (rsa->e == NULL && do_blinding) { - // We cannot do blinding or verification without |e|, and continuing without - // those countermeasures is dangerous. However, the Java/Android RSA API - // requires support for keys where only |d| and |n| (and not |e|) are known. - // The callers that require that bad behavior set |RSA_FLAG_NO_BLINDING|. - OPENSSL_PUT_ERROR(RSA, RSA_R_NO_PUBLIC_EXPONENT); - goto err; - } - - if (do_blinding) { - blinding = rsa_blinding_get(rsa, &blinding_index, ctx); - if (blinding == NULL) { - OPENSSL_PUT_ERROR(RSA, ERR_R_INTERNAL_ERROR); - goto err; - } - if (!BN_BLINDING_convert(f, blinding, rsa->e, rsa->mont_n, ctx)) { - goto err; - } - } - - if (rsa->p != NULL && rsa->q != NULL && rsa->e != NULL && rsa->dmp1 != NULL && - rsa->dmq1 != NULL && rsa->iqmp != NULL && - // Require that we can reduce |f| by |rsa->p| and |rsa->q| in constant - // time, which requires primes be the same size, rounded to the Montgomery - // coefficient. (See |mod_montgomery|.) This is not required by RFC 8017, - // but it is true for keys generated by us and all common implementations. - bn_less_than_montgomery_R(rsa->q, rsa->mont_p) && - bn_less_than_montgomery_R(rsa->p, rsa->mont_q)) { - if (!mod_exp(result, f, rsa, ctx)) { - goto err; - } - } else if (!BN_mod_exp_mont_consttime(result, f, rsa->d_fixed, rsa->n, ctx, - rsa->mont_n)) { - goto err; - } - - // Verify the result to protect against fault attacks as described in the - // 1997 paper "On the Importance of Checking Cryptographic Protocols for - // Faults" by Dan Boneh, Richard A. DeMillo, and Richard J. Lipton. Some - // implementations do this only when the CRT is used, but we do it in all - // cases. 
Section 6 of the aforementioned paper describes an attack that - // works when the CRT isn't used. That attack is much less likely to succeed - // than the CRT attack, but there have likely been improvements since 1997. - // - // This check is cheap assuming |e| is small; it almost always is. - if (rsa->e != NULL) { - BIGNUM *vrfy = BN_CTX_get(ctx); - if (vrfy == NULL || - !BN_mod_exp_mont(vrfy, result, rsa->e, rsa->n, ctx, rsa->mont_n) || - !BN_equal_consttime(vrfy, f)) { - OPENSSL_PUT_ERROR(RSA, ERR_R_INTERNAL_ERROR); - goto err; - } - - } - - if (do_blinding && - !BN_BLINDING_invert(result, blinding, rsa->mont_n, ctx)) { - goto err; - } - - // The computation should have left |result| as a maximally-wide number, so - // that it and serializing does not leak information about the magnitude of - // the result. - // - // See Falko Strenzke, "Manger's Attack revisited", ICICS 2010. - assert(result->width == rsa->mont_n->N.width); - if (!BN_bn2bin_padded(out, len, result)) { - OPENSSL_PUT_ERROR(RSA, ERR_R_INTERNAL_ERROR); - goto err; - } - - ret = 1; - -err: - if (ctx != NULL) { - BN_CTX_end(ctx); - BN_CTX_free(ctx); - } - if (blinding != NULL) { - rsa_blinding_release(rsa, blinding, blinding_index); - } - - return ret; -} - -// mod_montgomery sets |r| to |I| mod |p|. |I| must already be fully reduced -// modulo |p| times |q|. It returns one on success and zero on error. -static int mod_montgomery(BIGNUM *r, const BIGNUM *I, const BIGNUM *p, - const BN_MONT_CTX *mont_p, const BIGNUM *q, - BN_CTX *ctx) { - // Reducing in constant-time with Montgomery reduction requires I <= p * R. We - // have I < p * q, so this follows if q < R. The caller should have checked - // this already. - if (!bn_less_than_montgomery_R(q, mont_p)) { - OPENSSL_PUT_ERROR(RSA, ERR_R_INTERNAL_ERROR); - return 0; - } - - if (// Reduce mod p with Montgomery reduction. This computes I * R^-1 mod p. 
- !BN_from_montgomery(r, I, mont_p, ctx) || - // Multiply by R^2 and do another Montgomery reduction to compute - // I * R^-1 * R^2 * R^-1 = I mod p. - !BN_to_montgomery(r, r, mont_p, ctx)) { - return 0; - } - - // By precomputing R^3 mod p (normally |BN_MONT_CTX| only uses R^2 mod p) and - // adjusting the API for |BN_mod_exp_mont_consttime|, we could instead compute - // I * R mod p here and save a reduction per prime. But this would require - // changing the RSAZ code and may not be worth it. Note that the RSAZ code - // uses a different radix, so it uses R' = 2^1044. There we'd actually want - // R^2 * R', and would futher benefit from a precomputed R'^2. It currently - // converts |mont_p->RR| to R'^2. - return 1; -} - -static int mod_exp(BIGNUM *r0, const BIGNUM *I, RSA *rsa, BN_CTX *ctx) { - assert(ctx != NULL); - - assert(rsa->n != NULL); - assert(rsa->e != NULL); - assert(rsa->d != NULL); - assert(rsa->p != NULL); - assert(rsa->q != NULL); - assert(rsa->dmp1 != NULL); - assert(rsa->dmq1 != NULL); - assert(rsa->iqmp != NULL); - - BIGNUM *r1, *m1; - int ret = 0; - - BN_CTX_start(ctx); - r1 = BN_CTX_get(ctx); - m1 = BN_CTX_get(ctx); - if (r1 == NULL || - m1 == NULL) { - goto err; - } - - if (!freeze_private_key(rsa, ctx)) { - goto err; - } - - // Implementing RSA with CRT in constant-time is sensitive to which prime is - // larger. Canonicalize fields so that |p| is the larger prime. - const BIGNUM *dmp1 = rsa->dmp1_fixed, *dmq1 = rsa->dmq1_fixed; - const BN_MONT_CTX *mont_p = rsa->mont_p, *mont_q = rsa->mont_q; - if (BN_cmp(rsa->p, rsa->q) < 0) { - mont_p = rsa->mont_q; - mont_q = rsa->mont_p; - dmp1 = rsa->dmq1_fixed; - dmq1 = rsa->dmp1_fixed; - } - - // Use the minimal-width versions of |n|, |p|, and |q|. Either works, but if - // someone gives us non-minimal values, these will be slightly more efficient - // on the non-Montgomery operations. 
- const BIGNUM *n = &rsa->mont_n->N; - const BIGNUM *p = &mont_p->N; - const BIGNUM *q = &mont_q->N; - - // This is a pre-condition for |mod_montgomery|. It was already checked by the - // caller. - assert(BN_ucmp(I, n) < 0); - - if (// |m1| is the result modulo |q|. - !mod_montgomery(r1, I, q, mont_q, p, ctx) || - !BN_mod_exp_mont_consttime(m1, r1, dmq1, q, ctx, mont_q) || - // |r0| is the result modulo |p|. - !mod_montgomery(r1, I, p, mont_p, q, ctx) || - !BN_mod_exp_mont_consttime(r0, r1, dmp1, p, ctx, mont_p) || - // Compute r0 = r0 - m1 mod p. |p| is the larger prime, so |m1| is already - // fully reduced mod |p|. - !bn_mod_sub_consttime(r0, r0, m1, p, ctx) || - // r0 = r0 * iqmp mod p. We use Montgomery multiplication to compute this - // in constant time. |inv_small_mod_large_mont| is in Montgomery form and - // r0 is not, so the result is taken out of Montgomery form. - !BN_mod_mul_montgomery(r0, r0, rsa->inv_small_mod_large_mont, mont_p, - ctx) || - // r0 = r0 * q + m1 gives the final result. Reducing modulo q gives m1, so - // it is correct mod p. Reducing modulo p gives (r0-m1)*iqmp*q + m1 = r0, - // so it is correct mod q. Finally, the result is bounded by [m1, n + m1), - // and the result is at least |m1|, so this must be the unique answer in - // [0, n). - !bn_mul_consttime(r0, r0, q, ctx) || - !bn_uadd_consttime(r0, r0, m1) || - // The result should be bounded by |n|, but fixed-width operations may - // bound the width slightly higher, so fix it. - !bn_resize_words(r0, n->width)) { - goto err; - } - - ret = 1; - -err: - BN_CTX_end(ctx); - return ret; -} - -static int ensure_bignum(BIGNUM **out) { - if (*out == NULL) { - *out = BN_new(); - } - return *out != NULL; -} - -// kBoringSSLRSASqrtTwo is the BIGNUM representation of ⌊2²⁰⁴⁷×√2⌋. This is -// chosen to give enough precision for 4096-bit RSA, the largest key size FIPS -// specifies. Key sizes beyond this will round up. 
-// -// To calculate, use the following Haskell code: -// -// import Text.Printf (printf) -// import Data.List (intercalate) -// -// pow2 = 4095 -// target = 2^pow2 -// -// f x = x*x - (toRational target) -// -// fprime x = 2*x -// -// newtonIteration x = x - (f x) / (fprime x) -// -// converge x = -// let n = floor x in -// if n*n - target < 0 && (n+1)*(n+1) - target > 0 -// then n -// else converge (newtonIteration x) -// -// divrem bits x = (x `div` (2^bits), x `rem` (2^bits)) -// -// bnWords :: Integer -> [Integer] -// bnWords x = -// if x == 0 -// then [] -// else let (high, low) = divrem 64 x in low : bnWords high -// -// showWord x = let (high, low) = divrem 32 x in printf "TOBN(0x%08x, 0x%08x)" high low -// -// output :: String -// output = intercalate ", " $ map showWord $ bnWords $ converge (2 ^ (pow2 `div` 2)) -// -// To verify this number, check that n² < 2⁴⁰⁹⁵ < (n+1)², where n is value -// represented here. Note the components are listed in little-endian order. Here -// is some sample Python code to check: -// -// >>> TOBN = lambda a, b: a << 32 | b -// >>> l = [ ] -// >>> n = sum(a * 2**(64*i) for i, a in enumerate(l)) -// >>> n**2 < 2**4095 < (n+1)**2 -// True -const BN_ULONG kBoringSSLRSASqrtTwo[] = { - TOBN(0x4d7c60a5, 0xe633e3e1), TOBN(0x5fcf8f7b, 0xca3ea33b), - TOBN(0xc246785e, 0x92957023), TOBN(0xf9acce41, 0x797f2805), - TOBN(0xfdfe170f, 0xd3b1f780), TOBN(0xd24f4a76, 0x3facb882), - TOBN(0x18838a2e, 0xaff5f3b2), TOBN(0xc1fcbdde, 0xa2f7dc33), - TOBN(0xdea06241, 0xf7aa81c2), TOBN(0xf6a1be3f, 0xca221307), - TOBN(0x332a5e9f, 0x7bda1ebf), TOBN(0x0104dc01, 0xfe32352f), - TOBN(0xb8cf341b, 0x6f8236c7), TOBN(0x4264dabc, 0xd528b651), - TOBN(0xf4d3a02c, 0xebc93e0c), TOBN(0x81394ab6, 0xd8fd0efd), - TOBN(0xeaa4a089, 0x9040ca4a), TOBN(0xf52f120f, 0x836e582e), - TOBN(0xcb2a6343, 0x31f3c84d), TOBN(0xc6d5a8a3, 0x8bb7e9dc), - TOBN(0x460abc72, 0x2f7c4e33), TOBN(0xcab1bc91, 0x1688458a), - TOBN(0x53059c60, 0x11bc337b), TOBN(0xd2202e87, 0x42af1f4e), - 
TOBN(0x78048736, 0x3dfa2768), TOBN(0x0f74a85e, 0x439c7b4a), - TOBN(0xa8b1fe6f, 0xdc83db39), TOBN(0x4afc8304, 0x3ab8a2c3), - TOBN(0xed17ac85, 0x83339915), TOBN(0x1d6f60ba, 0x893ba84c), - TOBN(0x597d89b3, 0x754abe9f), TOBN(0xb504f333, 0xf9de6484), -}; -const size_t kBoringSSLRSASqrtTwoLen = OPENSSL_ARRAY_SIZE(kBoringSSLRSASqrtTwo); - -// generate_prime sets |out| to a prime with length |bits| such that |out|-1 is -// relatively prime to |e|. If |p| is non-NULL, |out| will also not be close to -// |p|. |sqrt2| must be ⌊2^(bits-1)×√2⌋ (or a slightly overestimate for large -// sizes), and |pow2_bits_100| must be 2^(bits-100). -// -// This function fails with probability around 2^-21. -static int generate_prime(BIGNUM *out, int bits, const BIGNUM *e, - const BIGNUM *p, const BIGNUM *sqrt2, - const BIGNUM *pow2_bits_100, BN_CTX *ctx, - BN_GENCB *cb) { - if (bits < 128 || (bits % BN_BITS2) != 0) { - OPENSSL_PUT_ERROR(RSA, ERR_R_INTERNAL_ERROR); - return 0; - } - assert(BN_is_pow2(pow2_bits_100)); - assert(BN_is_bit_set(pow2_bits_100, bits - 100)); - - // See FIPS 186-4 appendix B.3.3, steps 4 and 5. Note |bits| here is nlen/2. - - // Use the limit from steps 4.7 and 5.8 for most values of |e|. When |e| is 3, - // the 186-4 limit is too low, so we use a higher one. Note this case is not - // reachable from |RSA_generate_key_fips|. - // - // |limit| determines the failure probability. We must find a prime that is - // not 1 mod |e|. By the prime number theorem, we'll find one with probability - // p = (e-1)/e * 2/(ln(2)*bits). Note the second term is doubled because we - // discard even numbers. - // - // The failure probability is thus (1-p)^limit. To convert that to a power of - // two, we take logs. -log_2((1-p)^limit) = -limit * ln(1-p) / ln(2). - // - // >>> def f(bits, e, limit): - // ... p = (e-1.0)/e * 2.0/(math.log(2)*bits) - // ... return -limit * math.log(1 - p) / math.log(2) - // ... 
- // >>> f(1024, 65537, 5*1024) - // 20.842750558272634 - // >>> f(1536, 65537, 5*1536) - // 20.83294549602474 - // >>> f(2048, 65537, 5*2048) - // 20.828047576234948 - // >>> f(1024, 3, 8*1024) - // 22.222147925962307 - // >>> f(1536, 3, 8*1536) - // 22.21518251065506 - // >>> f(2048, 3, 8*2048) - // 22.211701985875937 - if (bits >= INT_MAX/32) { - OPENSSL_PUT_ERROR(RSA, RSA_R_MODULUS_TOO_LARGE); - return 0; - } - int limit = BN_is_word(e, 3) ? bits * 8 : bits * 5; - - int ret = 0, tries = 0, rand_tries = 0; - BN_CTX_start(ctx); - BIGNUM *tmp = BN_CTX_get(ctx); - if (tmp == NULL) { - goto err; - } - - for (;;) { - // Generate a random number of length |bits| where the bottom bit is set - // (steps 4.2, 4.3, 5.2 and 5.3) and the top bit is set (implied by the - // bound checked below in steps 4.4 and 5.5). - if (!BN_rand(out, bits, BN_RAND_TOP_ONE, BN_RAND_BOTTOM_ODD) || - !BN_GENCB_call(cb, BN_GENCB_GENERATED, rand_tries++)) { - goto err; - } - - if (p != NULL) { - // If |p| and |out| are too close, try again (step 5.4). - if (!bn_abs_sub_consttime(tmp, out, p, ctx)) { - goto err; - } - if (BN_cmp(tmp, pow2_bits_100) <= 0) { - continue; - } - } - - // If out < 2^(bits-1)×√2, try again (steps 4.4 and 5.5). This is equivalent - // to out <= ⌊2^(bits-1)×√2⌋, or out <= sqrt2 for FIPS key sizes. - // - // For larger keys, the comparison is approximate, leaning towards - // retrying. That is, we reject a negligible fraction of primes that are - // within the FIPS bound, but we will never accept a prime outside the - // bound, ensuring the resulting RSA key is the right size. - if (BN_cmp(out, sqrt2) <= 0) { - continue; - } - - // RSA key generation's bottleneck is discarding composites. If it fails - // trial division, do not bother computing a GCD or performing Miller-Rabin. - if (!bn_odd_number_is_obviously_composite(out)) { - // Check gcd(out-1, e) is one (steps 4.5 and 5.6). 
- int relatively_prime; - if (!BN_sub(tmp, out, BN_value_one()) || - !bn_is_relatively_prime(&relatively_prime, tmp, e, ctx)) { - goto err; - } - if (relatively_prime) { - // Test |out| for primality (steps 4.5.1 and 5.6.1). - int is_probable_prime; - if (!BN_primality_test(&is_probable_prime, out, - BN_prime_checks_for_generation, ctx, 0, cb)) { - goto err; - } - if (is_probable_prime) { - ret = 1; - goto err; - } - } - } - - // If we've tried too many times to find a prime, abort (steps 4.7 and - // 5.8). - tries++; - if (tries >= limit) { - OPENSSL_PUT_ERROR(RSA, RSA_R_TOO_MANY_ITERATIONS); - goto err; - } - if (!BN_GENCB_call(cb, 2, tries)) { - goto err; - } - } - -err: - BN_CTX_end(ctx); - return ret; -} - -// rsa_generate_key_impl generates an RSA key using a generalized version of -// FIPS 186-4 appendix B.3. |RSA_generate_key_fips| performs additional checks -// for FIPS-compliant key generation. -// -// This function returns one on success and zero on failure. It has a failure -// probability of about 2^-20. -static int rsa_generate_key_impl(RSA *rsa, int bits, const BIGNUM *e_value, - BN_GENCB *cb) { - // See FIPS 186-4 appendix B.3. This function implements a generalized version - // of the FIPS algorithm. |RSA_generate_key_fips| performs additional checks - // for FIPS-compliant key generation. - - // Always generate RSA keys which are a multiple of 128 bits. Round |bits| - // down as needed. - bits &= ~127; - - // Reject excessively small keys. - if (bits < 256) { - OPENSSL_PUT_ERROR(RSA, RSA_R_KEY_SIZE_TOO_SMALL); - return 0; - } - - // Reject excessively large public exponents. Windows CryptoAPI and Go don't - // support values larger than 32 bits, so match their limits for generating - // keys. (|rsa_check_public_key| uses a slightly more conservative value, but - // we don't need to support generating such keys.) 
- // https://github.com/golang/go/issues/3161 - // https://msdn.microsoft.com/en-us/library/aa387685(VS.85).aspx - if (BN_num_bits(e_value) > 32) { - OPENSSL_PUT_ERROR(RSA, RSA_R_BAD_E_VALUE); - return 0; - } - - int ret = 0; - int prime_bits = bits / 2; - BN_CTX *ctx = BN_CTX_new(); - if (ctx == NULL) { - goto bn_err; - } - BN_CTX_start(ctx); - BIGNUM *totient = BN_CTX_get(ctx); - BIGNUM *pm1 = BN_CTX_get(ctx); - BIGNUM *qm1 = BN_CTX_get(ctx); - BIGNUM *sqrt2 = BN_CTX_get(ctx); - BIGNUM *pow2_prime_bits_100 = BN_CTX_get(ctx); - BIGNUM *pow2_prime_bits = BN_CTX_get(ctx); - if (totient == NULL || pm1 == NULL || qm1 == NULL || sqrt2 == NULL || - pow2_prime_bits_100 == NULL || pow2_prime_bits == NULL || - !BN_set_bit(pow2_prime_bits_100, prime_bits - 100) || - !BN_set_bit(pow2_prime_bits, prime_bits)) { - goto bn_err; - } - - // We need the RSA components non-NULL. - if (!ensure_bignum(&rsa->n) || - !ensure_bignum(&rsa->d) || - !ensure_bignum(&rsa->e) || - !ensure_bignum(&rsa->p) || - !ensure_bignum(&rsa->q) || - !ensure_bignum(&rsa->dmp1) || - !ensure_bignum(&rsa->dmq1)) { - goto bn_err; - } - - if (!BN_copy(rsa->e, e_value)) { - goto bn_err; - } - - // Compute sqrt2 >= ⌊2^(prime_bits-1)×√2⌋. - if (!bn_set_words(sqrt2, kBoringSSLRSASqrtTwo, kBoringSSLRSASqrtTwoLen)) { - goto bn_err; - } - int sqrt2_bits = kBoringSSLRSASqrtTwoLen * BN_BITS2; - assert(sqrt2_bits == (int)BN_num_bits(sqrt2)); - if (sqrt2_bits > prime_bits) { - // For key sizes up to 4096 (prime_bits = 2048), this is exactly - // ⌊2^(prime_bits-1)×√2⌋. - if (!BN_rshift(sqrt2, sqrt2, sqrt2_bits - prime_bits)) { - goto bn_err; - } - } else if (prime_bits > sqrt2_bits) { - // For key sizes beyond 4096, this is approximate. We err towards retrying - // to ensure our key is the right size and round up. 
- if (!BN_add_word(sqrt2, 1) || - !BN_lshift(sqrt2, sqrt2, prime_bits - sqrt2_bits)) { - goto bn_err; - } - } - assert(prime_bits == (int)BN_num_bits(sqrt2)); - - do { - // Generate p and q, each of size |prime_bits|, using the steps outlined in - // appendix FIPS 186-4 appendix B.3.3. - // - // Each call to |generate_prime| fails with probability p = 2^-21. The - // probability that either call fails is 1 - (1-p)^2, which is around 2^-20. - if (!generate_prime(rsa->p, prime_bits, rsa->e, NULL, sqrt2, - pow2_prime_bits_100, ctx, cb) || - !BN_GENCB_call(cb, 3, 0) || - !generate_prime(rsa->q, prime_bits, rsa->e, rsa->p, sqrt2, - pow2_prime_bits_100, ctx, cb) || - !BN_GENCB_call(cb, 3, 1)) { - goto bn_err; - } - - if (BN_cmp(rsa->p, rsa->q) < 0) { - BIGNUM *tmp = rsa->p; - rsa->p = rsa->q; - rsa->q = tmp; - } - - // Calculate d = e^(-1) (mod lcm(p-1, q-1)), per FIPS 186-4. This differs - // from typical RSA implementations which use (p-1)*(q-1). - // - // Note this means the size of d might reveal information about p-1 and - // q-1. However, we do operations with Chinese Remainder Theorem, so we only - // use d (mod p-1) and d (mod q-1) as exponents. Using a minimal totient - // does not affect those two values. - int no_inverse; - if (!bn_usub_consttime(pm1, rsa->p, BN_value_one()) || - !bn_usub_consttime(qm1, rsa->q, BN_value_one()) || - !bn_lcm_consttime(totient, pm1, qm1, ctx) || - !bn_mod_inverse_consttime(rsa->d, &no_inverse, rsa->e, totient, ctx)) { - goto bn_err; - } - - // Retry if |rsa->d| <= 2^|prime_bits|. See appendix B.3.1's guidance on - // values for d. - } while (BN_cmp(rsa->d, pow2_prime_bits) <= 0); - - assert(BN_num_bits(pm1) == (unsigned)prime_bits); - assert(BN_num_bits(qm1) == (unsigned)prime_bits); - if (// Calculate n. - !bn_mul_consttime(rsa->n, rsa->p, rsa->q, ctx) || - // Calculate d mod (p-1). 
- !bn_div_consttime(NULL, rsa->dmp1, rsa->d, pm1, prime_bits, ctx) || - // Calculate d mod (q-1) - !bn_div_consttime(NULL, rsa->dmq1, rsa->d, qm1, prime_bits, ctx)) { - goto bn_err; - } - bn_set_minimal_width(rsa->n); - - // Sanity-check that |rsa->n| has the specified size. This is implied by - // |generate_prime|'s bounds. - if (BN_num_bits(rsa->n) != (unsigned)bits) { - OPENSSL_PUT_ERROR(RSA, ERR_R_INTERNAL_ERROR); - goto err; - } - - // Call |freeze_private_key| to compute the inverse of q mod p, by way of - // |rsa->mont_p|. - if (!freeze_private_key(rsa, ctx)) { - goto bn_err; - } - - // The key generation process is complex and thus error-prone. It could be - // disastrous to generate and then use a bad key so double-check that the key - // makes sense. - if (!RSA_check_key(rsa)) { - OPENSSL_PUT_ERROR(RSA, RSA_R_INTERNAL_ERROR); - goto err; - } - - ret = 1; - -bn_err: - if (!ret) { - OPENSSL_PUT_ERROR(RSA, ERR_LIB_BN); - } -err: - if (ctx != NULL) { - BN_CTX_end(ctx); - BN_CTX_free(ctx); - } - return ret; -} - -static void replace_bignum(BIGNUM **out, BIGNUM **in) { - BN_free(*out); - *out = *in; - *in = NULL; -} - -static void replace_bn_mont_ctx(BN_MONT_CTX **out, BN_MONT_CTX **in) { - BN_MONT_CTX_free(*out); - *out = *in; - *in = NULL; -} - -static int RSA_generate_key_ex_maybe_fips(RSA *rsa, int bits, - const BIGNUM *e_value, BN_GENCB *cb, - int check_fips) { - boringssl_ensure_rsa_self_test(); - - RSA *tmp = NULL; - uint32_t err; - int ret = 0; - - // |rsa_generate_key_impl|'s 2^-20 failure probability is too high at scale, - // so we run the FIPS algorithm four times, bringing it down to 2^-80. We - // should just adjust the retry limit, but FIPS 186-4 prescribes that value - // and thus results in unnecessary complexity. - int failures = 0; - do { - ERR_clear_error(); - // Generate into scratch space, to avoid leaving partial work on failure. 
- tmp = RSA_new(); - if (tmp == NULL) { - goto out; - } - - if (rsa_generate_key_impl(tmp, bits, e_value, cb)) { - break; - } - - err = ERR_peek_error(); - RSA_free(tmp); - tmp = NULL; - failures++; - - // Only retry on |RSA_R_TOO_MANY_ITERATIONS|. This is so a caller-induced - // failure in |BN_GENCB_call| is still fatal. - } while (failures < 4 && ERR_GET_LIB(err) == ERR_LIB_RSA && - ERR_GET_REASON(err) == RSA_R_TOO_MANY_ITERATIONS); - - if (tmp == NULL || (check_fips && !RSA_check_fips(tmp))) { - goto out; - } - - replace_bignum(&rsa->n, &tmp->n); - replace_bignum(&rsa->e, &tmp->e); - replace_bignum(&rsa->d, &tmp->d); - replace_bignum(&rsa->p, &tmp->p); - replace_bignum(&rsa->q, &tmp->q); - replace_bignum(&rsa->dmp1, &tmp->dmp1); - replace_bignum(&rsa->dmq1, &tmp->dmq1); - replace_bignum(&rsa->iqmp, &tmp->iqmp); - replace_bn_mont_ctx(&rsa->mont_n, &tmp->mont_n); - replace_bn_mont_ctx(&rsa->mont_p, &tmp->mont_p); - replace_bn_mont_ctx(&rsa->mont_q, &tmp->mont_q); - replace_bignum(&rsa->d_fixed, &tmp->d_fixed); - replace_bignum(&rsa->dmp1_fixed, &tmp->dmp1_fixed); - replace_bignum(&rsa->dmq1_fixed, &tmp->dmq1_fixed); - replace_bignum(&rsa->inv_small_mod_large_mont, - &tmp->inv_small_mod_large_mont); - rsa->private_key_frozen = tmp->private_key_frozen; - ret = 1; - -out: - RSA_free(tmp); - return ret; -} - -int RSA_generate_key_ex(RSA *rsa, int bits, const BIGNUM *e_value, - BN_GENCB *cb) { - return RSA_generate_key_ex_maybe_fips(rsa, bits, e_value, cb, - /*check_fips=*/0); -} - -int RSA_generate_key_fips(RSA *rsa, int bits, BN_GENCB *cb) { - // FIPS 186-4 allows 2048-bit and 3072-bit RSA keys (1024-bit and 1536-bit - // primes, respectively) with the prime generation method we use. - // Subsequently, IG A.14 stated that larger modulus sizes can be used and ACVP - // testing supports 4096 bits. 
- if (bits != 2048 && bits != 3072 && bits != 4096) { - OPENSSL_PUT_ERROR(RSA, RSA_R_BAD_RSA_PARAMETERS); - return 0; - } - - BIGNUM *e = BN_new(); - int ret = e != NULL && - BN_set_word(e, RSA_F4) && - RSA_generate_key_ex_maybe_fips(rsa, bits, e, cb, /*check_fips=*/1); - BN_free(e); - - if (ret) { - FIPS_service_indicator_update_state(); - } - return ret; -} - -DEFINE_METHOD_FUNCTION(RSA_METHOD, RSA_default_method) { - // All of the methods are NULL to make it easier for the compiler/linker to - // drop unused functions. The wrapper functions will select the appropriate - // |rsa_default_*| implementation. - OPENSSL_memset(out, 0, sizeof(RSA_METHOD)); - out->common.is_static = 1; -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/rsa/rsa_impl.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/rsa/rsa_impl.cc.inc new file mode 100644 index 00000000..ba08ae77 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/rsa/rsa_impl.cc.inc @@ -0,0 +1,1000 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include +#include + +#include + +#include +#include +#include + +#include "../../bcm_support.h" +#include "../../internal.h" +#include "../../mem_internal.h" +#include "../bn/internal.h" +#include "../delocate.h" +#include "../service_indicator/internal.h" +#include "internal.h" + + +using namespace bssl; + +static_assert(OPENSSL_RSA_MAX_MODULUS_BITS <= + BN_MONTGOMERY_MAX_WORDS * BN_BITS2, + "Max RSA size too big for Montgomery arithmetic"); + +int bssl::rsa_check_public_key(const RSA *rsa) { + auto *impl = FromOpaque(rsa); + + if (impl->n == nullptr) { + OPENSSL_PUT_ERROR(RSA, RSA_R_VALUE_MISSING); + return 0; + } + + unsigned n_bits = BN_num_bits(impl->n.get()); + if (n_bits > OPENSSL_RSA_MAX_MODULUS_BITS) { + OPENSSL_PUT_ERROR(RSA, RSA_R_MODULUS_TOO_LARGE); + return 0; + } + + if (n_bits < OPENSSL_RSA_MIN_MODULUS_BITS) { + OPENSSL_PUT_ERROR(RSA, RSA_R_KEY_SIZE_TOO_SMALL); + return 0; + } + + // RSA moduli must be positive and odd. In addition to being necessary for RSA + // in general, we cannot setup Montgomery reduction with even moduli. + if (!BN_is_odd(impl->n.get()) || BN_is_negative(impl->n.get())) { + OPENSSL_PUT_ERROR(RSA, RSA_R_BAD_RSA_PARAMETERS); + return 0; + } + + static const unsigned kMaxExponentBits = 33; + if (impl->e != nullptr) { + // Reject e = 1, negative e, and even e. e must be odd to be relatively + // prime with phi(n). + unsigned e_bits = BN_num_bits(impl->e.get()); + if (e_bits < 2 || BN_is_negative(impl->e.get()) || + !BN_is_odd(impl->e.get())) { + OPENSSL_PUT_ERROR(RSA, RSA_R_BAD_E_VALUE); + return 0; + } + if (impl->flags & RSA_FLAG_LARGE_PUBLIC_EXPONENT) { + // The caller has requested disabling DoS protections. Still, e must be + // less than n. + if (BN_ucmp(impl->n.get(), impl->e.get()) <= 0) { + OPENSSL_PUT_ERROR(RSA, RSA_R_BAD_E_VALUE); + return 0; + } + } else { + // Mitigate DoS attacks by limiting the exponent size. 33 bits was chosen + // as the limit based on the recommendations in [1] and [2]. 
Windows + // CryptoAPI doesn't support values larger than 32 bits [3], so it is + // unlikely that exponents larger than 32 bits are being used for anything + // Windows commonly does. + // + // [1] https://www.imperialviolet.org/2012/03/16/rsae.html + // [2] https://www.imperialviolet.org/2012/03/17/rsados.html + // [3] https://msdn.microsoft.com/en-us/library/aa387685(VS.85).aspx + if (e_bits > kMaxExponentBits) { + OPENSSL_PUT_ERROR(RSA, RSA_R_BAD_E_VALUE); + return 0; + } + + // The upper bound on |e_bits| and lower bound on |n_bits| imply e is + // bounded by n. + assert(BN_ucmp(impl->n.get(), impl->e.get()) > 0); + } + } else if (!(impl->flags & RSA_FLAG_NO_PUBLIC_EXPONENT)) { + OPENSSL_PUT_ERROR(RSA, RSA_R_VALUE_MISSING); + return 0; + } + + return 1; +} + +static int ensure_fixed_copy(UniquePtr *out, const BIGNUM *in, + int width) { + if (*out != nullptr) { + return 1; + } + UniquePtr copy(BN_dup(in)); + if (copy == nullptr || !bn_resize_words(copy.get(), width)) { + return 0; + } + bn_secret(copy.get()); + *out = std::move(copy); + return 1; +} + +// freeze_private_key finishes initializing |rsa|'s private key components. +// After this function has returned, |rsa| may not be changed. This is needed +// because |RSA| is a public struct and, additionally, OpenSSL 1.1.0 opaquified +// it wrong (see https://github.com/openssl/openssl/issues/5158). +static int freeze_private_key(RSAImpl *rsa, BN_CTX *ctx) { + rsa->lock.LockRead(); + int frozen = rsa->private_key_frozen; + rsa->lock.UnlockRead(); + if (frozen) { + return 1; + } + + const BIGNUM *n_fixed; + MutexWriteLock lock(&rsa->lock); + if (rsa->private_key_frozen) { + return 1; + } + + // Check the public components are within DoS bounds. + if (!rsa_check_public_key(rsa)) { + return 0; + } + + // Pre-compute various intermediate values, as well as copies of private + // exponents with correct widths. 
Note that other threads may concurrently + // read from |rsa->n|, |rsa->e|, etc., so any fixes must be in separate + // copies. We use |mont_n->N|, |mont_p->N|, and |mont_q->N| as copies of |n|, + // |p|, and |q| with the correct minimal widths. + + if (rsa->mont_n == nullptr) { + rsa->mont_n.reset(BN_MONT_CTX_new_for_modulus(rsa->n.get(), ctx)); + if (rsa->mont_n == nullptr) { + return 0; + } + } + n_fixed = &rsa->mont_n->N; + + // The only public upper-bound of |rsa->d| is the bit length of |rsa->n|. The + // ASN.1 serialization of RSA private keys unfortunately leaks the byte length + // of |rsa->d|, but normalize it so we only leak it once, rather than per + // operation. + if (rsa->d != nullptr && + !ensure_fixed_copy(&rsa->d_fixed, rsa->d.get(), n_fixed->width)) { + return 0; + } + + if (rsa->e != nullptr && rsa->p != nullptr && rsa->q != nullptr) { + // TODO: p and q are also CONSTTIME_SECRET but not yet marked as such + // because the Montgomery code does things like test whether or not values + // are zero. So the secret marking probably needs to happen inside that + // code. + + if (rsa->mont_p == nullptr) { + rsa->mont_p.reset(BN_MONT_CTX_new_consttime(rsa->p.get(), ctx)); + if (rsa->mont_p == nullptr) { + return 0; + } + } + + if (rsa->mont_q == nullptr) { + rsa->mont_q.reset(BN_MONT_CTX_new_consttime(rsa->q.get(), ctx)); + if (rsa->mont_q == nullptr) { + return 0; + } + } + + if (rsa->dmp1 != nullptr && rsa->dmq1 != nullptr && rsa->iqmp != nullptr) { + // CRT components are only publicly bounded by their corresponding + // moduli's bit lengths. + const BIGNUM *p_fixed = &rsa->mont_p->N; + const BIGNUM *q_fixed = &rsa->mont_q->N; + if (!ensure_fixed_copy(&rsa->dmp1_fixed, rsa->dmp1.get(), + p_fixed->width) || + !ensure_fixed_copy(&rsa->dmq1_fixed, rsa->dmq1.get(), + q_fixed->width)) { + return 0; + } + + // Compute |iqmp_mont|, which is |iqmp| in Montgomery form and with the + // correct bit width. 
+ if (rsa->iqmp_mont == nullptr) { + UniquePtr iqmp_mont(BN_new()); + if (iqmp_mont == nullptr || + !BN_to_montgomery(iqmp_mont.get(), rsa->iqmp.get(), + rsa->mont_p.get(), ctx)) { + return 0; + } + rsa->iqmp_mont = std::move(iqmp_mont); + bn_secret(rsa->iqmp_mont.get()); + } + } + } + + rsa->private_key_frozen = 1; + return 1; +} + +void bssl::rsa_invalidate_key(RSA *rsa) { + auto *impl = FromOpaque(rsa); + impl->private_key_frozen = 0; + impl->mont_n = nullptr; + impl->mont_p = nullptr; + impl->mont_q = nullptr; + impl->d_fixed = nullptr; + impl->dmp1_fixed = nullptr; + impl->dmq1_fixed = nullptr; + impl->iqmp_mont = nullptr; +} + +int bssl::rsa_default_sign_raw(RSA *rsa, size_t *out_len, uint8_t *out, + size_t max_out, const uint8_t *in, size_t in_len, + int padding) { + const unsigned rsa_size = RSA_size(rsa); + uint8_t *buf = nullptr; + int i, ret = 0; + + if (max_out < rsa_size) { + OPENSSL_PUT_ERROR(RSA, RSA_R_OUTPUT_BUFFER_TOO_SMALL); + return 0; + } + + buf = reinterpret_cast(OPENSSL_malloc(rsa_size)); + if (buf == nullptr) { + goto err; + } + + switch (padding) { + case RSA_PKCS1_PADDING: + i = RSA_padding_add_PKCS1_type_1(buf, rsa_size, in, in_len); + break; + case RSA_NO_PADDING: + i = RSA_padding_add_none(buf, rsa_size, in, in_len); + break; + default: + OPENSSL_PUT_ERROR(RSA, RSA_R_UNKNOWN_PADDING_TYPE); + goto err; + } + + if (i <= 0) { + goto err; + } + + if (!rsa_private_transform_no_self_test(rsa, out, buf, rsa_size)) { + goto err; + } + + CONSTTIME_DECLASSIFY(out, rsa_size); + *out_len = rsa_size; + ret = 1; + +err: + OPENSSL_free(buf); + + return ret; +} + + +static int rsa_mod_exp_crt(BIGNUM *r0, const BIGNUM *I, RSAImpl *rsa, + BN_CTX *ctx); + +int bssl::rsa_verify_raw_no_self_test(RSA *rsa, size_t *out_len, uint8_t *out, + size_t max_out, const uint8_t *in, + size_t in_len, int padding) { + auto *impl = FromOpaque(rsa); + + if (impl->n == nullptr || impl->e == nullptr) { + OPENSSL_PUT_ERROR(RSA, RSA_R_VALUE_MISSING); + return 0; + } + + if 
(!rsa_check_public_key(rsa)) { + return 0; + } + + const unsigned rsa_size = RSA_size(rsa); + if (max_out < rsa_size) { + OPENSSL_PUT_ERROR(RSA, RSA_R_OUTPUT_BUFFER_TOO_SMALL); + return 0; + } + + if (in_len != rsa_size) { + OPENSSL_PUT_ERROR(RSA, RSA_R_DATA_LEN_NOT_EQUAL_TO_MOD_LEN); + return 0; + } + + UniquePtr ctx(BN_CTX_new()); + if (ctx == nullptr) { + return 0; + } + + int ret = 0; + uint8_t *buf = nullptr; + BN_CTXScope scope(ctx.get()); + BIGNUM *f = BN_CTX_get(ctx.get()); + BIGNUM *result = BN_CTX_get(ctx.get()); + if (f == nullptr || result == nullptr) { + goto err; + } + + if (padding == RSA_NO_PADDING) { + buf = out; + } else { + // Allocate a temporary buffer to hold the padded plaintext. + buf = reinterpret_cast(OPENSSL_malloc(rsa_size)); + if (buf == nullptr) { + goto err; + } + } + + if (BN_bin2bn(in, in_len, f) == nullptr) { + goto err; + } + + if (BN_ucmp(f, impl->n.get()) >= 0) { + OPENSSL_PUT_ERROR(RSA, RSA_R_DATA_TOO_LARGE_FOR_MODULUS); + goto err; + } + + if (!BN_MONT_CTX_set_locked(&impl->mont_n, &impl->lock, impl->n.get(), + ctx.get()) || + !BN_mod_exp_mont(result, f, impl->e.get(), &impl->mont_n->N, ctx.get(), + impl->mont_n.get())) { + goto err; + } + + if (!BN_bn2bin_padded(buf, rsa_size, result)) { + OPENSSL_PUT_ERROR(RSA, ERR_R_INTERNAL_ERROR); + goto err; + } + + switch (padding) { + case RSA_PKCS1_PADDING: + ret = + RSA_padding_check_PKCS1_type_1(out, out_len, rsa_size, buf, rsa_size); + break; + case RSA_NO_PADDING: + ret = 1; + *out_len = rsa_size; + break; + default: + OPENSSL_PUT_ERROR(RSA, RSA_R_UNKNOWN_PADDING_TYPE); + goto err; + } + + if (!ret) { + OPENSSL_PUT_ERROR(RSA, RSA_R_PADDING_CHECK_FAILED); + goto err; + } + +err: + if (buf != out) { + OPENSSL_free(buf); + } + return ret; +} + +int RSA_verify_raw(RSA *rsa, size_t *out_len, uint8_t *out, size_t max_out, + const uint8_t *in, size_t in_len, int padding) { + boringssl_ensure_rsa_verify_self_test(); + return rsa_verify_raw_no_self_test(rsa, out_len, out, max_out, in, 
in_len, + padding); +} + +int bssl::rsa_default_private_transform(RSA *rsa, uint8_t *out, + const uint8_t *in, size_t len) { + auto *impl = FromOpaque(rsa); + + if (impl->n == nullptr || impl->d == nullptr) { + OPENSSL_PUT_ERROR(RSA, RSA_R_VALUE_MISSING); + return 0; + } + + UniquePtr ctx(BN_CTX_new()); + if (ctx == nullptr) { + return 0; + } + BN_CTXScope scope(ctx.get()); + BIGNUM *f = BN_CTX_get(ctx.get()); + BIGNUM *result = BN_CTX_get(ctx.get()); + if (f == nullptr || result == nullptr) { + return 0; + } + + // The caller should have ensured this. + assert(len == BN_num_bytes(impl->n.get())); + if (BN_bin2bn(in, len, f) == nullptr) { + return 0; + } + + // The input to the RSA private transform may be secret, but padding is + // expected to construct a value within range, so we can leak this comparison. + if (constant_time_declassify_int(BN_ucmp(f, impl->n.get()) >= 0)) { + // Usually the padding functions would catch this. + OPENSSL_PUT_ERROR(RSA, RSA_R_DATA_TOO_LARGE_FOR_MODULUS); + return 0; + } + + if (!freeze_private_key(impl, ctx.get())) { + OPENSSL_PUT_ERROR(RSA, ERR_R_INTERNAL_ERROR); + return 0; + } + + if (impl->e == nullptr && (impl->flags & RSA_FLAG_NO_PUBLIC_EXPONENT) == 0) { + // Unless the private key was specifically created with an API like + // |RSA_new_private_key_no_e|, don't allow RSA keys to be missing the public + // exponent, which disables some fault attack mitigations. (It should not be + // possible to construct such an |RSA| object in the public API.) + OPENSSL_PUT_ERROR(RSA, RSA_R_NO_PUBLIC_EXPONENT); + return 0; + } + + if (impl->p != nullptr && impl->q != nullptr && impl->e != nullptr && + impl->dmp1 != nullptr && impl->dmq1 != nullptr && impl->iqmp != nullptr && + // Require that we can reduce |f| by |impl->p| and |impl->q| in constant + // time, which requires primes be the same size, rounded to the Montgomery + // coefficient. (See |mod_montgomery|.) 
This is not required by RFC 8017, + // but it is true for keys generated by us and all common implementations. + bn_less_than_montgomery_R(impl->q.get(), impl->mont_p.get()) && + bn_less_than_montgomery_R(impl->p.get(), impl->mont_q.get())) { + if (!rsa_mod_exp_crt(result, f, impl, ctx.get())) { + return 0; + } + } else if (!BN_mod_exp_mont_consttime(result, f, impl->d_fixed.get(), + impl->n.get(), ctx.get(), + impl->mont_n.get())) { + return 0; + } + + // Verify the result to protect against fault attacks as described in the + // 1997 paper "On the Importance of Checking Cryptographic Protocols for + // Faults" by Dan Boneh, Richard A. DeMillo, and Richard J. Lipton. Some + // implementations do this only when the CRT is used, but we do it in all + // cases. Section 6 of the aforementioned paper describes an attack that + // works when the CRT isn't used. That attack is much less likely to succeed + // than the CRT attack, but there have likely been improvements since 1997. + // + // This check is cheap assuming |e| is small, which we require in + // |rsa_check_public_key|. + if (impl->e != nullptr) { + BIGNUM *vrfy = BN_CTX_get(ctx.get()); + if (vrfy == nullptr || + !BN_mod_exp_mont(vrfy, result, impl->e.get(), impl->n.get(), ctx.get(), + impl->mont_n.get()) || + !constant_time_declassify_int(BN_equal_consttime(vrfy, f))) { + OPENSSL_PUT_ERROR(RSA, ERR_R_INTERNAL_ERROR); + return 0; + } + } + + // The computation should have left |result| as a maximally-wide number, so + // that it and serializing does not leak information about the magnitude of + // the result. + // + // See Falko Strenzke, "Manger's Attack revisited", ICICS 2010. + assert(result->width == impl->mont_n->N.width); + bn_assert_fits_in_bytes(result, len); + if (!BN_bn2bin_padded(out, len, result)) { + OPENSSL_PUT_ERROR(RSA, ERR_R_INTERNAL_ERROR); + return 0; + } + + return 1; +} + +// mod_montgomery sets |r| to |I| mod |p|. |I| must already be fully reduced +// modulo |p| times |q|. 
It returns one on success and zero on error. +static int mod_montgomery(BIGNUM *r, const BIGNUM *I, const BIGNUM *p, + const BN_MONT_CTX *mont_p, const BIGNUM *q, + BN_CTX *ctx) { + // Reducing in constant-time with Montgomery reduction requires I <= p * R. We + // have I < p * q, so this follows if q < R. The caller should have checked + // this already. + if (!bn_less_than_montgomery_R(q, mont_p)) { + OPENSSL_PUT_ERROR(RSA, ERR_R_INTERNAL_ERROR); + return 0; + } + + if ( // Reduce mod p with Montgomery reduction. This computes I * R^-1 mod p. + !BN_from_montgomery(r, I, mont_p, ctx) || + // Multiply by R^2 and do another Montgomery reduction to compute + // I * R^-1 * R^2 * R^-1 = I mod p. + !BN_to_montgomery(r, r, mont_p, ctx)) { + return 0; + } + + // By precomputing R^3 mod p (normally |BN_MONT_CTX| only uses R^2 mod p) and + // adjusting the API for |BN_mod_exp_mont_consttime|, we could instead compute + // I * R mod p here and save a reduction per prime. But this would require + // changing the RSAZ code and may not be worth it. Note that the RSAZ code + // uses a different radix, so it uses R' = 2^1044. There we'd actually want + // R^2 * R', and would further benefit from a precomputed R'^2. It currently + // converts |mont_p->RR| to R'^2. + return 1; +} + +static int rsa_mod_exp_crt(BIGNUM *r0, const BIGNUM *I, RSAImpl *rsa, + BN_CTX *ctx) { + assert(ctx != nullptr); + + assert(rsa->n != nullptr); + assert(rsa->e != nullptr); + assert(rsa->d != nullptr); + assert(rsa->p != nullptr); + assert(rsa->q != nullptr); + assert(rsa->dmp1 != nullptr); + assert(rsa->dmq1 != nullptr); + assert(rsa->iqmp != nullptr); + + BN_CTXScope scope(ctx); + BIGNUM *r1 = BN_CTX_get(ctx); + BIGNUM *m1 = BN_CTX_get(ctx); + if (r1 == nullptr || m1 == nullptr) { + return 0; + } + + // Use the minimal-width versions of |n|, |p|, and |q|. Either works, but if + // someone gives us non-minimal values, these will be slightly more efficient + // on the non-Montgomery operations. 
+ BIGNUM *n = &rsa->mont_n->N; + BIGNUM *p = &rsa->mont_p->N; + BIGNUM *q = &rsa->mont_q->N; + + // This is a pre-condition for |mod_montgomery|. It was already checked by the + // caller. + declassify_assert(BN_ucmp(I, n) < 0); + + if ( // |m1| is the result modulo |q|. + !mod_montgomery(r1, I, q, rsa->mont_q.get(), p, ctx) || + !BN_mod_exp_mont_consttime(m1, r1, rsa->dmq1_fixed.get(), q, ctx, + rsa->mont_q.get()) || + // |r0| is the result modulo |p|. + !mod_montgomery(r1, I, p, rsa->mont_p.get(), q, ctx) || + !BN_mod_exp_mont_consttime(r0, r1, rsa->dmp1_fixed.get(), p, ctx, + rsa->mont_p.get()) || + // Compute r0 = r0 - m1 mod p. |m1| is reduced mod |q|, not |p|, so we + // just run |mod_montgomery| again for simplicity. This could be more + // efficient with more cases: if |p > q|, |m1| is already reduced. If + // |p < q| but they have the same bit width, |bn_reduce_once| suffices. + // However, compared to over 2048 Montgomery multiplications above, this + // difference is not measurable. + !mod_montgomery(r1, m1, p, rsa->mont_p.get(), q, ctx) || + !bn_mod_sub_consttime(r0, r0, r1, p, ctx) || + // r0 = r0 * iqmp mod p. We use Montgomery multiplication to compute this + // in constant time. |iqmp_mont| is in Montgomery form and r0 is not, so + // the result is taken out of Montgomery form. + !BN_mod_mul_montgomery(r0, r0, rsa->iqmp_mont.get(), rsa->mont_p.get(), + ctx) || + // r0 = r0 * q + m1 gives the final result. Reducing modulo q gives m1, so + // it is correct mod q. Reducing modulo p gives (r0-m1)*iqmp*q + m1 = r0, + // so it is correct mod p. Finally, the result is bounded by [m1, n + m1), + // and the result is at least |m1|, so this must be the unique answer in + // [0, n). + !bn_mul_consttime(r0, r0, q, ctx) || // + !bn_uadd_consttime(r0, r0, m1)) { + return 0; + } + + // The result should be bounded by |n|, but fixed-width operations may + // bound the width slightly higher, so fix it. 
This trips constant-time checks + // because a naive data flow analysis does not realize the excess words are + // publicly zero. + declassify_assert(BN_cmp(r0, n) < 0); + bn_assert_fits_in_bytes(r0, BN_num_bytes(n)); + if (!bn_resize_words(r0, n->width)) { + return 0; + } + + return 1; +} + +static int ensure_bignum(UniquePtr *out) { + if (*out == nullptr) { + out->reset(BN_new()); + } + return *out != nullptr; +} + +// generate_prime sets |out| to a prime with length |bits| such that |out|-1 is +// relatively prime to |e|. If |p| is non-NULL, |out| will also not be close to +// |p|. |pow2_bits_100| must be 2^(bits-100). +// +// This function fails with probability around 2^-21. +static int generate_prime(BIGNUM *out, int bits, const BIGNUM *e, + const BIGNUM *p, const BIGNUM *pow2_bits_100, + BN_CTX *ctx, BN_GENCB *cb) { + if (bits < 128 || (bits % BN_BITS2) != 0) { + OPENSSL_PUT_ERROR(RSA, ERR_R_INTERNAL_ERROR); + return 0; + } + assert(BN_is_pow2(pow2_bits_100)); + assert(BN_is_bit_set(pow2_bits_100, bits - 100)); + + // See FIPS 186-5 appendix A.1.3, steps 4 and 5. Note |bits| here is nlen/2. + + // Use the limit from steps 4.7 and 5.8 for most values of |e|. When |e| is 3, + // the 186-5 limit is too low, so we use a higher one. Note this case is not + // reachable from |RSA_generate_key_fips|. + // + // |limit| determines the failure probability. We must find a prime that is + // not 1 mod |e|. By the prime number theorem, we'll find one with probability + // p = (e-1)/e * 2/(ln(2)*bits). Note the second term is doubled because we + // discard even numbers. + // + // The failure probability is thus (1-p)^limit. To convert that to a power of + // two, we take logs. -log_2((1-p)^limit) = -limit * ln(1-p) / ln(2). + // + // >>> def f(bits, e, limit): + // ... p = (e-1.0)/e * 2.0/(math.log(2)*bits) + // ... return -limit * math.log(1 - p) / math.log(2) + // ... 
+ // >>> f(1024, 65537, 5*1024) + // 20.842750558272634 + // >>> f(1536, 65537, 5*1536) + // 20.83294549602474 + // >>> f(2048, 65537, 5*2048) + // 20.828047576234948 + // >>> f(1024, 3, 8*1024) + // 22.222147925962307 + // >>> f(1536, 3, 8*1536) + // 22.21518251065506 + // >>> f(2048, 3, 8*2048) + // 22.211701985875937 + if (bits >= INT_MAX / 32) { + OPENSSL_PUT_ERROR(RSA, RSA_R_MODULUS_TOO_LARGE); + return 0; + } + int limit = BN_is_word(e, 3) ? bits * 8 : bits * 5; + + int tries = 0, rand_tries = 0; + BN_CTXScope scope(ctx); + BIGNUM *tmp = BN_CTX_get(ctx); + if (tmp == nullptr) { + return 0; + } + + for (;;) { + // Generate a random number of length |bits| where the bottom bit is set and + // top two bits are set (steps 4.2–4.4 and 5.2–5.4): + // + // - Setting the top two bits is permitted by steps 4.2.1 and 5.2.1. Doing + // so implements steps 4.4 and 5.4 by making this case impossible because + // √2 < 1.5. + // + // - Setting the bottom bit implements steps 4.3 and 5.3. + if (!BN_rand(out, bits, BN_RAND_TOP_TWO, BN_RAND_BOTTOM_ODD) || + !BN_GENCB_call(cb, BN_GENCB_GENERATED, rand_tries++)) { + return 0; + } + + if (p != nullptr) { + // If |p| and |out| are too close, try again (step 5.5). + if (!bn_abs_sub_consttime(tmp, out, p, ctx)) { + return 0; + } + if (BN_cmp(tmp, pow2_bits_100) <= 0) { + continue; + } + } + + // RSA key generation's bottleneck is discarding composites. If it fails + // trial division, do not bother computing a GCD or performing Miller-Rabin. + if (!bn_odd_number_is_obviously_composite(out)) { + // Check gcd(out-1, e) is one (steps 4.5 and 5.6). Leaking the final + // result of this comparison is safe because, if not relatively prime, the + // value will be discarded. + int relatively_prime; + if (!bn_usub_consttime(tmp, out, BN_value_one()) || + !bn_is_relatively_prime(&relatively_prime, tmp, e, ctx)) { + return 0; + } + if (constant_time_declassify_int(relatively_prime)) { + // Test |out| for primality (steps 4.5.1 and 5.6.1). 
+ int is_probable_prime; + if (!BN_primality_test(&is_probable_prime, out, + BN_prime_checks_for_generation, ctx, 0, cb)) { + return 0; + } + if (is_probable_prime) { + return 1; + } + } + } + + // If we've tried too many times to find a prime, abort (steps 4.7 and 5.8). + tries++; + if (tries >= limit) { + OPENSSL_PUT_ERROR(RSA, RSA_R_TOO_MANY_ITERATIONS); + return 0; + } + if (!BN_GENCB_call(cb, 2, tries)) { + return 0; + } + } +} + +// rsa_generate_key_impl generates an RSA key using a generalized version of +// FIPS 186-5 appendix A.1.3. |RSA_generate_key_fips| performs additional checks +// for FIPS-compliant key generation. +// +// This function returns one on success and zero on failure. It has a failure +// probability of about 2^-20. +static int rsa_generate_key_impl(RSAImpl *rsa, int bits, const BIGNUM *e_value, + BN_GENCB *cb) { + if (bits > OPENSSL_RSA_MAX_MODULUS_BITS) { + OPENSSL_PUT_ERROR(RSA, RSA_R_MODULUS_TOO_LARGE); + return 0; + } + + // Always generate RSA keys which are a multiple of 128 bits. Round |bits| + // down as needed. + bits &= ~127; + + // Reject excessively small keys. + if (bits < OPENSSL_RSA_MIN_MODULUS_BITS) { + OPENSSL_PUT_ERROR(RSA, RSA_R_KEY_SIZE_TOO_SMALL); + return 0; + } + + // Reject excessively large public exponents. Windows CryptoAPI and Go don't + // support values larger than 32 bits, so match their limits for generating + // keys. (|rsa_check_public_key| uses a slightly more conservative value, but + // we don't need to support generating such keys.) + // https://github.com/golang/go/issues/3161 + // https://msdn.microsoft.com/en-us/library/aa387685(VS.85).aspx + if (BN_num_bits(e_value) > 32) { + OPENSSL_PUT_ERROR(RSA, RSA_R_BAD_E_VALUE); + return 0; + } + + // The smallest reasonable RSA exponent is 3, and it definitely must be odd. + // Catching these here prevents endless loops or slow computation when trying + // to generate keys later, and results in a better error code. 
+ if ( + // Would fail in |bn_lcm_consttime| as it only allows positive integers. + BN_is_negative(e_value) || + // Would fail in |generate_prime| as only one |rsa->p|-1 is coprime with + // an even |e_value| and that one is a little bit short. (The R in RSA + // doesn't stand for Rabin.) + !BN_is_odd(e_value) || + // Would loop endlessly because it'll always compute an |rsa->d| exponent + // of 1, which is too small. + BN_is_one(e_value)) { + OPENSSL_PUT_ERROR(RSA, RSA_R_BAD_E_VALUE); + return 0; + } + + UniquePtr ctx(BN_CTX_new()); + if (ctx == nullptr) { + OPENSSL_PUT_ERROR(RSA, ERR_LIB_BN); + return 0; + } + + int prime_bits = bits / 2; + BN_CTXScope scope(ctx.get()); + BIGNUM *totient = BN_CTX_get(ctx.get()); + BIGNUM *pm1 = BN_CTX_get(ctx.get()); + BIGNUM *qm1 = BN_CTX_get(ctx.get()); + BIGNUM *pow2_prime_bits_100 = BN_CTX_get(ctx.get()); + BIGNUM *pow2_prime_bits = BN_CTX_get(ctx.get()); + if (totient == nullptr || pm1 == nullptr || qm1 == nullptr || + pow2_prime_bits_100 == nullptr || pow2_prime_bits == nullptr || + !BN_set_bit(pow2_prime_bits_100, prime_bits - 100) || + !BN_set_bit(pow2_prime_bits, prime_bits)) { + OPENSSL_PUT_ERROR(RSA, ERR_LIB_BN); + return 0; + } + + // We need the RSA components non-null. + if (!ensure_bignum(&rsa->n) || // + !ensure_bignum(&rsa->d) || // + !ensure_bignum(&rsa->e) || // + !ensure_bignum(&rsa->p) || // + !ensure_bignum(&rsa->q) || // + !ensure_bignum(&rsa->dmp1) || // + !ensure_bignum(&rsa->dmq1) || // + !ensure_bignum(&rsa->iqmp)) { + OPENSSL_PUT_ERROR(RSA, ERR_LIB_BN); + return 0; + } + + if (!BN_copy(rsa->e.get(), e_value)) { + OPENSSL_PUT_ERROR(RSA, ERR_LIB_BN); + return 0; + } + + do { + // Generate p and q, each of size |prime_bits|, using the steps outlined in + // appendix FIPS 186-5 appendix C.3.3. + // + // Each call to |generate_prime| fails with probability p = 2^-21. The + // probability that either call fails is 1 - (1-p)^2, which is around 2^-20. 
+ if (!generate_prime(rsa->p.get(), prime_bits, rsa->e.get(), nullptr, + pow2_prime_bits_100, ctx.get(), cb) || + !BN_GENCB_call(cb, 3, 0) || + !generate_prime(rsa->q.get(), prime_bits, rsa->e.get(), rsa->p.get(), + pow2_prime_bits_100, ctx.get(), cb) || + !BN_GENCB_call(cb, 3, 1)) { + OPENSSL_PUT_ERROR(RSA, ERR_LIB_BN); + return 0; + } + + if (BN_cmp(rsa->p.get(), rsa->q.get()) < 0) { + std::swap(rsa->p, rsa->q); + } + + // Calculate d = e^(-1) (mod lcm(p-1, q-1)), per FIPS 186-5. This differs + // from typical RSA implementations which use (p-1)*(q-1). + // + // Note this means the size of d might reveal information about p-1 and + // q-1. However, we do operations with Chinese Remainder Theorem, so we only + // use d (mod p-1) and d (mod q-1) as exponents. Using a minimal totient + // does not affect those two values. + int no_inverse; + if (!bn_usub_consttime(pm1, rsa->p.get(), BN_value_one()) || + !bn_usub_consttime(qm1, rsa->q.get(), BN_value_one()) || + !bn_lcm_consttime(totient, pm1, qm1, ctx.get()) || + !bn_mod_inverse_consttime(rsa->d.get(), &no_inverse, rsa->e.get(), + totient, ctx.get())) { + OPENSSL_PUT_ERROR(RSA, ERR_LIB_BN); + return 0; + } + + // Retry if |rsa->d| <= 2^|prime_bits|. See appendix A.3.1's guidance on + // values for d. When we retry, p and q are discarded, so it is safe to leak + // this comparison. + } while ( + constant_time_declassify_int(BN_cmp(rsa->d.get(), pow2_prime_bits) <= 0)); + + assert(BN_num_bits(pm1) == (unsigned)prime_bits); + assert(BN_num_bits(qm1) == (unsigned)prime_bits); + if ( // Calculate n. + !bn_mul_consttime(rsa->n.get(), rsa->p.get(), rsa->q.get(), ctx.get()) || + // Calculate d mod (p-1). 
+ !bn_div_consttime(nullptr, rsa->dmp1.get(), rsa->d.get(), pm1, prime_bits, + ctx.get()) || + // Calculate d mod (q-1) + !bn_div_consttime(nullptr, rsa->dmq1.get(), rsa->d.get(), qm1, prime_bits, + ctx.get())) { + OPENSSL_PUT_ERROR(RSA, ERR_LIB_BN); + return 0; + } + bn_set_minimal_width(rsa->n.get()); + + // |rsa->n| is computed from the private key, but is public. + bn_declassify(rsa->n.get()); + + // Calculate q^-1 mod p. + rsa->mont_p.reset(BN_MONT_CTX_new_consttime(rsa->p.get(), ctx.get())); + if (rsa->mont_p == nullptr || // + !bn_mod_inverse_secret_prime(rsa->iqmp.get(), rsa->q.get(), rsa->p.get(), + ctx.get(), rsa->mont_p.get())) { + OPENSSL_PUT_ERROR(RSA, ERR_LIB_BN); + return 0; + } + + // Sanity-check that |rsa->n| has the specified size. This is implied by + // |generate_prime|'s bounds. + if (BN_num_bits(rsa->n.get()) != (unsigned)bits) { + OPENSSL_PUT_ERROR(RSA, ERR_R_INTERNAL_ERROR); + return 0; + } + + // The key generation process is complex and thus error-prone. It could be + // disastrous to generate and then use a bad key so double-check that the key + // makes sense. Also, while |rsa| is mutable, fill in the cached components. + if (!RSA_check_key(rsa) || !freeze_private_key(rsa, ctx.get())) { + OPENSSL_PUT_ERROR(RSA, RSA_R_INTERNAL_ERROR); + return 0; + } + + return 1; +} + +static int RSA_generate_key_ex_maybe_fips(RSAImpl *rsa, int bits, + const BIGNUM *e_value, BN_GENCB *cb, + int check_fips) { + boringssl_ensure_rsa_sign_self_test(); + boringssl_ensure_rsa_verify_self_test(); + + if (rsa == nullptr) { + OPENSSL_PUT_ERROR(EC, ERR_R_PASSED_NULL_PARAMETER); + return 0; + } + + UniquePtr tmp; + + // |rsa_generate_key_impl|'s 2^-20 failure probability is too high at scale, + // so we run the FIPS algorithm four times, bringing it down to 2^-80. We + // should just adjust the retry limit, but FIPS 186-5 prescribes that value + // and thus results in unnecessary complexity. 
+ int failures = 0; + do { + ERR_clear_error(); + // Generate into scratch space, to avoid leaving partial work on failure. + tmp.reset(FromOpaque(RSA_new())); + if (tmp == nullptr) { + return 0; + } + + if (rsa_generate_key_impl(tmp.get(), bits, e_value, cb)) { + break; + } + + tmp = nullptr; + failures++; + + // Only retry on |RSA_R_TOO_MANY_ITERATIONS|. This is so a caller-induced + // failure in |BN_GENCB_call| is still fatal. + } while (failures < 4 && ERR_equals(ERR_peek_error(), ERR_LIB_RSA, + RSA_R_TOO_MANY_ITERATIONS)); + + if (tmp == nullptr || (check_fips && !RSA_check_fips(tmp.get()))) { + return 0; + } + + rsa_invalidate_key(rsa); + rsa->n = std::move(tmp->n); + rsa->e = std::move(tmp->e); + rsa->d = std::move(tmp->d); + rsa->p = std::move(tmp->p); + rsa->q = std::move(tmp->q); + rsa->dmp1 = std::move(tmp->dmp1); + rsa->dmq1 = std::move(tmp->dmq1); + rsa->iqmp = std::move(tmp->iqmp); + rsa->mont_n = std::move(tmp->mont_n); + rsa->mont_p = std::move(tmp->mont_p); + rsa->mont_q = std::move(tmp->mont_q); + rsa->d_fixed = std::move(tmp->d_fixed); + rsa->dmp1_fixed = std::move(tmp->dmp1_fixed); + rsa->dmq1_fixed = std::move(tmp->dmq1_fixed); + rsa->iqmp_mont = std::move(tmp->iqmp_mont); + rsa->private_key_frozen = tmp->private_key_frozen; + return 1; +} + +int RSA_generate_key_ex(RSA *rsa, int bits, const BIGNUM *e_value, + BN_GENCB *cb) { + return RSA_generate_key_ex_maybe_fips(FromOpaque(rsa), bits, e_value, cb, + /*check_fips=*/0); +} + +int RSA_generate_key_fips(RSA *rsa, int bits, BN_GENCB *cb) { + // FIPS 186-4 allowed 2048-bit and 3072-bit RSA keys (1024-bit and 1536-bit + // primes, respectively) with the prime generation method we use. + // Subsequently, IG A.14 stated that larger modulus sizes can be used and ACVP + // testing supports 4096 bits, and FIPS 186-5 allowed all key sizes at least + // 2048. 
+ if (bits != 2048 && bits != 3072 && bits != 4096) { + OPENSSL_PUT_ERROR(RSA, RSA_R_BAD_RSA_PARAMETERS); + return 0; + } + + BIGNUM *e = BN_new(); + int ret = e != nullptr && BN_set_word(e, RSA_F4) && + RSA_generate_key_ex_maybe_fips(FromOpaque(rsa), bits, e, cb, + /*check_fips=*/1); + BN_free(e); + + if (ret) { + FIPS_service_indicator_update_state(); + } + return ret; +} + +BSSL_NAMESPACE_BEGIN + +DEFINE_METHOD_FUNCTION(RSA_METHOD, RSA_default_method) { + // All of the methods are NULL to make it easier for the compiler/linker to + // drop unused functions. The wrapper functions will select the appropriate + // |rsa_default_*| implementation. + OPENSSL_memset(out, 0, sizeof(RSA_METHOD)); + out->common.is_static = 1; +} + +BSSL_NAMESPACE_END diff --git a/third_party/boringssl/src/crypto/fipsmodule/self_check/fips.c b/third_party/boringssl/src/crypto/fipsmodule/self_check/fips.c deleted file mode 100644 index ce039576..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/self_check/fips.c +++ /dev/null @@ -1,120 +0,0 @@ -/* Copyright (c) 2017, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ - -#include - -#include "../../internal.h" -#include "../delocate.h" - - -int FIPS_mode(void) { -#if defined(BORINGSSL_FIPS) && !defined(OPENSSL_ASAN) - return 1; -#else - return 0; -#endif -} - -int FIPS_mode_set(int on) { return on == FIPS_mode(); } - -const char *FIPS_module_name(void) { return "BoringCrypto"; } - -uint32_t FIPS_version(void) { - return 0; -} - -int FIPS_query_algorithm_status(const char *algorithm) { -#if defined(BORINGSSL_FIPS) - static const char kApprovedAlgorithms[][13] = { - "AES-CBC", - "AES-CCM", - "AES-CTR", - "AES-ECB", - "AES-GCM", - "AES-KW", - "AES-KWP", - "ctrDRBG", - "ECC-SSC", - "ECDSA-sign", - "ECDSA-verify", - "FFC-SSC", - "HMAC", - "RSA-sign", - "RSA-verify", - "SHA-1", - "SHA2-224", - "SHA2-256", - "SHA2-384", - "SHA2-512", - "SHA2-512/256", - }; - for (size_t i = 0; i < OPENSSL_ARRAY_SIZE(kApprovedAlgorithms); i++) { - if (strcmp(algorithm, kApprovedAlgorithms[i]) == 0) { - return 1; - } - } -#endif // BORINGSSL_FIPS - - return 0; -} - -#if defined(BORINGSSL_FIPS_COUNTERS) - -size_t FIPS_read_counter(enum fips_counter_t counter) { - if (counter < 0 || counter > fips_counter_max) { - abort(); - } - - const size_t *array = - CRYPTO_get_thread_local(OPENSSL_THREAD_LOCAL_FIPS_COUNTERS); - if (!array) { - return 0; - } - - return array[counter]; -} - -void boringssl_fips_inc_counter(enum fips_counter_t counter) { - if (counter < 0 || counter > fips_counter_max) { - abort(); - } - - size_t *array = - CRYPTO_get_thread_local(OPENSSL_THREAD_LOCAL_FIPS_COUNTERS); - if (!array) { - const size_t num_bytes = sizeof(size_t) * (fips_counter_max + 1); - array = OPENSSL_malloc(num_bytes); - if (!array) { - return; - } - - OPENSSL_memset(array, 0, num_bytes); - if (!CRYPTO_set_thread_local(OPENSSL_THREAD_LOCAL_FIPS_COUNTERS, array, - OPENSSL_free)) { - // |OPENSSL_free| has already been called by |CRYPTO_set_thread_local|. 
- return; - } - } - - array[counter]++; -} - -#else - -size_t FIPS_read_counter(enum fips_counter_t counter) { return 0; } - -// boringssl_fips_inc_counter is a no-op, inline function in internal.h in this -// case. That should let the compiler optimise away the callsites. - -#endif diff --git a/third_party/boringssl/src/crypto/fipsmodule/self_check/fips.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/self_check/fips.cc.inc new file mode 100644 index 00000000..5ed05cbb --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/self_check/fips.cc.inc @@ -0,0 +1,129 @@ +// Copyright 2017 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include "../../internal.h" +#include "../delocate.h" + + +using namespace bssl; + +int FIPS_mode() { +#if defined(BORINGSSL_FIPS) && !defined(OPENSSL_ASAN) + return 1; +#else + return 0; +#endif +} + +int FIPS_mode_set(int on) { return on == FIPS_mode(); } + +const char *FIPS_module_name() { return "BoringCrypto"; } + +int CRYPTO_has_asm() { +#if defined(OPENSSL_NO_ASM) + return 0; +#else + return 1; +#endif +} + +uint32_t FIPS_version() { return 0; } + +int FIPS_query_algorithm_status(const char *algorithm) { +#if defined(BORINGSSL_FIPS) + static const char kApprovedAlgorithms[][13] = { + "AES-CBC", + "AES-CCM", + "AES-CTR", + "AES-ECB", + "AES-GCM", + "AES-KW", + "AES-KWP", + "ctrDRBG", + "ECC-SSC", + "ECDSA-sign", + "ECDSA-verify", + "FFC-SSC", + "HMAC", + "RSA-sign", + "RSA-verify", + "SHA-1", + "SHA2-224", + "SHA2-256", + "SHA2-384", + "SHA2-512", + "SHA2-512/256", + }; + for (const char *approved : kApprovedAlgorithms) { + if (strcmp(algorithm, approved) == 0) { + return 1; + } + } +#endif // BORINGSSL_FIPS + + return 0; +} + +#if defined(BORINGSSL_FIPS_COUNTERS) + +size_t FIPS_read_counter(enum fips_counter_t counter) { + size_t index = (size_t)counter; + if (index > fips_counter_max) { + abort(); + } + + const size_t *array = reinterpret_cast( + CRYPTO_get_thread_local(OPENSSL_THREAD_LOCAL_FIPS_COUNTERS)); + if (!array) { + return 0; + } + + return array[index]; +} + +void bssl::boringssl_fips_inc_counter(enum fips_counter_t counter) { + size_t index = (size_t)counter; + if (index > fips_counter_max) { + abort(); + } + + size_t *array = reinterpret_cast( + CRYPTO_get_thread_local(OPENSSL_THREAD_LOCAL_FIPS_COUNTERS)); + if (!array) { + const size_t num_bytes = sizeof(size_t) * (fips_counter_max + 1); + array = reinterpret_cast(OPENSSL_zalloc(num_bytes)); + if (!array) { + return; + } + + if (!CRYPTO_set_thread_local(OPENSSL_THREAD_LOCAL_FIPS_COUNTERS, array, + OPENSSL_free)) { + // |OPENSSL_free| has already been called by 
|CRYPTO_set_thread_local|. + return; + } + } + + array[index]++; +} + +#else + +size_t FIPS_read_counter(enum fips_counter_t counter) { return 0; } + +// boringssl_fips_inc_counter is a no-op, inline function in internal.h in this +// case. That should let the compiler optimise away the callsites. + +#endif diff --git a/third_party/boringssl/src/crypto/fipsmodule/self_check/self_check.c b/third_party/boringssl/src/crypto/fipsmodule/self_check/self_check.c deleted file mode 100644 index 19f57434..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/self_check/self_check.c +++ /dev/null @@ -1,972 +0,0 @@ -/* Copyright (c) 2017, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../../internal.h" -#include "../dh/internal.h" -#include "../ec/internal.h" -#include "../ecdsa/internal.h" -#include "../rand/internal.h" -#include "../rsa/internal.h" -#include "../service_indicator/internal.h" -#include "../tls/internal.h" - - -// MSVC wants to put a NUL byte at the end of non-char arrays and so cannot -// compile the real logic. 
-#if defined(_MSC_VER) - -int BORINGSSL_self_test(void) { - return 0; -} - -#else - -static void hexdump(const uint8_t *in, size_t len) { - for (size_t i = 0; i < len; i++) { - fprintf(stderr, "%02x", in[i]); - } -} - -static int check_test(const void *expected, const void *actual, - size_t expected_len, const char *name) { - if (OPENSSL_memcmp(actual, expected, expected_len) != 0) { - fprintf(stderr, "%s failed.\nExpected: ", name); - hexdump(expected, expected_len); - fprintf(stderr, "\nCalculated: "); - hexdump(actual, expected_len); - fprintf(stderr, "\n"); - fflush(stderr); - return 0; - } - return 1; -} - -static int set_bignum(BIGNUM **out, const uint8_t *in, size_t len) { - *out = BN_bin2bn(in, len, NULL); - return *out != NULL; -} - -static int serialize_ecdsa_sig(uint8_t *out, size_t out_len, - const ECDSA_SIG *sig) { - if ((out_len & 1) || // - !BN_bn2bin_padded(out, out_len / 2, sig->r) || - !BN_bn2bin_padded(out + out_len / 2, out_len / 2, sig->s)) { - return 0; - } - return 1; -} - -static ECDSA_SIG *parse_ecdsa_sig(const uint8_t *in, size_t in_len) { - ECDSA_SIG *ret = ECDSA_SIG_new(); - if (!ret || // - (in_len & 1) || - BN_bin2bn(in, in_len/2, ret->r) == NULL || - BN_bin2bn(in + in_len/2, in_len/2, ret->s) == NULL) { - ECDSA_SIG_free(ret); - ret = NULL; - } - return ret; -} - -static RSA *self_test_rsa_key(void) { - static const uint8_t kN[] = { - 0xd3, 0x3a, 0x62, 0x9f, 0x07, 0x77, 0xb0, 0x18, 0xf3, 0xff, 0xfe, 0xcc, - 0xc9, 0xa2, 0xc2, 0x3a, 0xa6, 0x1d, 0xd8, 0xf0, 0x26, 0x5b, 0x38, 0x90, - 0x17, 0x48, 0x15, 0xce, 0x21, 0xcd, 0xd6, 0x62, 0x99, 0xe2, 0xd7, 0xda, - 0x40, 0x80, 0x3c, 0xad, 0x18, 0xb7, 0x26, 0xe9, 0x30, 0x8a, 0x23, 0x3f, - 0x68, 0x9a, 0x9c, 0x31, 0x34, 0x91, 0x99, 0x06, 0x11, 0x36, 0xb2, 0x9e, - 0x3a, 0xd0, 0xbc, 0xb9, 0x93, 0x4e, 0xb8, 0x72, 0xa1, 0x9f, 0xb6, 0x8c, - 0xd5, 0x17, 0x1f, 0x7e, 0xaa, 0x75, 0xbb, 0xdf, 0xa1, 0x70, 0x48, 0xc4, - 0xec, 0x9a, 0x51, 0xed, 0x41, 0xc9, 0x74, 0xc0, 0x3e, 0x1e, 0x85, 0x2f, - 0xbe, 0x34, 0xc7, 
0x65, 0x34, 0x8b, 0x4d, 0x55, 0x4b, 0xe1, 0x45, 0x54, - 0x0d, 0x75, 0x7e, 0x89, 0x4d, 0x0c, 0xf6, 0x33, 0xe5, 0xfc, 0xfb, 0x56, - 0x1b, 0xf2, 0x39, 0x9d, 0xe0, 0xff, 0x55, 0xcf, 0x02, 0x05, 0xb9, 0x74, - 0xd2, 0x91, 0xfc, 0x87, 0xe1, 0xbb, 0x97, 0x2a, 0xe4, 0xdd, 0x20, 0xc0, - 0x38, 0x47, 0xc0, 0x76, 0x3f, 0xa1, 0x9b, 0x5c, 0x20, 0xff, 0xff, 0xc7, - 0x49, 0x3b, 0x4c, 0xaf, 0x99, 0xa6, 0x3e, 0x82, 0x5c, 0x58, 0x27, 0xce, - 0x01, 0x03, 0xc3, 0x16, 0x35, 0x20, 0xe9, 0xf0, 0x15, 0x7a, 0x41, 0xd5, - 0x1f, 0x52, 0xea, 0xdf, 0xad, 0x4c, 0xbb, 0x0d, 0xcb, 0x04, 0x91, 0xb0, - 0x95, 0xa8, 0xce, 0x25, 0xfd, 0xd2, 0x62, 0x47, 0x77, 0xee, 0x13, 0xf1, - 0x48, 0x72, 0x9e, 0xd9, 0x2d, 0xe6, 0x5f, 0xa4, 0xc6, 0x9e, 0x5a, 0xb2, - 0xc6, 0xa2, 0xf7, 0x0a, 0x16, 0x17, 0xae, 0x6b, 0x1c, 0x30, 0x7c, 0x63, - 0x08, 0x83, 0xe7, 0x43, 0xec, 0x54, 0x5e, 0x2c, 0x08, 0x0b, 0x5e, 0x46, - 0xa7, 0x10, 0x93, 0x43, 0x53, 0x4e, 0xe3, 0x16, 0x73, 0x55, 0xce, 0xf2, - 0x94, 0xc0, 0xbe, 0xb3, - }; - static const uint8_t kE[] = {0x01, 0x00, 0x01}; // 65537 - static const uint8_t kD[] = { - 0x2f, 0x2c, 0x1e, 0xd2, 0x3d, 0x2c, 0xb1, 0x9b, 0x21, 0x02, 0xce, 0xb8, - 0x95, 0x5f, 0x4f, 0xd9, 0x21, 0x38, 0x11, 0x36, 0xb0, 0x9a, 0x36, 0xab, - 0x97, 0x47, 0x75, 0xf7, 0x2e, 0xfd, 0x75, 0x1f, 0x58, 0x16, 0x9c, 0xf6, - 0x14, 0xe9, 0x8e, 0xa3, 0x69, 0x9d, 0x9d, 0x86, 0xfe, 0x5c, 0x1b, 0x3b, - 0x11, 0xf5, 0x55, 0x64, 0x77, 0xc4, 0xfc, 0x53, 0xaa, 0x8c, 0x78, 0x9f, - 0x75, 0xab, 0x20, 0x3a, 0xa1, 0x77, 0x37, 0x22, 0x02, 0x8e, 0x54, 0x8a, - 0x67, 0x1c, 0x5e, 0xe0, 0x3e, 0xd9, 0x44, 0x37, 0xd1, 0x29, 0xee, 0x56, - 0x6c, 0x30, 0x9a, 0x93, 0x4d, 0xd9, 0xdb, 0xc5, 0x03, 0x1a, 0x75, 0xcc, - 0x0f, 0xc2, 0x61, 0xb5, 0x6c, 0x62, 0x9f, 0xc6, 0xa8, 0xc7, 0x8a, 0x60, - 0x17, 0x11, 0x62, 0x4c, 0xef, 0x74, 0x31, 0x97, 0xad, 0x89, 0x2d, 0xe8, - 0x31, 0x1d, 0x8b, 0x58, 0x82, 0xe3, 0x03, 0x1a, 0x6b, 0xdf, 0x3f, 0x3e, - 0xa4, 0x27, 0x19, 0xef, 0x46, 0x7a, 0x90, 0xdf, 0xa7, 0xe7, 0xc9, 0x66, - 0xab, 0x41, 0x1d, 0x65, 0x78, 0x1c, 0x18, 
0x40, 0x5c, 0xd6, 0x87, 0xb5, - 0xea, 0x29, 0x44, 0xb3, 0xf5, 0xb3, 0xd2, 0x4f, 0xce, 0x88, 0x78, 0x49, - 0x27, 0x4e, 0x0b, 0x30, 0x85, 0xfb, 0x73, 0xfd, 0x8b, 0x32, 0x15, 0xee, - 0x1f, 0xc9, 0x0e, 0x89, 0xb9, 0x43, 0x2f, 0xe9, 0x60, 0x8d, 0xda, 0xae, - 0x2b, 0x30, 0x99, 0xee, 0x88, 0x81, 0x20, 0x7b, 0x4a, 0xc3, 0x18, 0xf2, - 0x94, 0x02, 0x79, 0x94, 0xaa, 0x65, 0xd9, 0x1b, 0x45, 0x2a, 0xac, 0x6e, - 0x30, 0x48, 0x57, 0xea, 0xbe, 0x79, 0x7d, 0xfc, 0x67, 0xaa, 0x47, 0xc0, - 0xf7, 0x52, 0xfd, 0x0b, 0x63, 0x4e, 0x3d, 0x2e, 0xcc, 0x36, 0xa0, 0xdb, - 0x92, 0x0b, 0xa9, 0x1b, 0xeb, 0xc2, 0xd5, 0x08, 0xd3, 0x85, 0x87, 0xf8, - 0x5d, 0x1a, 0xf6, 0xc1, - }; - static const uint8_t kP[] = { - 0xf7, 0x06, 0xa3, 0x98, 0x8a, 0x52, 0xf8, 0x63, 0x68, 0x27, 0x4f, 0x68, - 0x7f, 0x34, 0xec, 0x8e, 0x5d, 0xf8, 0x30, 0x92, 0xb3, 0x62, 0x4c, 0xeb, - 0xdb, 0x19, 0x6b, 0x09, 0xc5, 0xa3, 0xf0, 0xbb, 0xff, 0x0f, 0xc2, 0xd4, - 0x9b, 0xc9, 0x54, 0x4f, 0xb9, 0xf9, 0xe1, 0x4c, 0xf0, 0xe3, 0x4c, 0x90, - 0xda, 0x7a, 0x01, 0xc2, 0x9f, 0xc4, 0xc8, 0x8e, 0xb1, 0x1e, 0x93, 0x75, - 0x75, 0xc6, 0x13, 0x25, 0xc3, 0xee, 0x3b, 0xcc, 0xb8, 0x72, 0x6c, 0x49, - 0xb0, 0x09, 0xfb, 0xab, 0x44, 0xeb, 0x4d, 0x40, 0xf0, 0x61, 0x6b, 0xe5, - 0xe6, 0xfe, 0x3e, 0x0a, 0x77, 0x26, 0x39, 0x76, 0x3d, 0x4c, 0x3e, 0x9b, - 0x5b, 0xc0, 0xaf, 0xa2, 0x58, 0x76, 0xb0, 0xe9, 0xda, 0x7f, 0x0e, 0x78, - 0xc9, 0x76, 0x49, 0x5c, 0xfa, 0xb3, 0xb0, 0x15, 0x4b, 0x41, 0xc7, 0x27, - 0xa4, 0x75, 0x28, 0x5c, 0x30, 0x69, 0x50, 0x29, - }; - static const uint8_t kQ[] = { - 0xda, 0xe6, 0xd2, 0xbb, 0x44, 0xff, 0x4f, 0xdf, 0x57, 0xc1, 0x11, 0xa3, - 0x51, 0xba, 0x17, 0x89, 0x4c, 0x01, 0xc0, 0x0c, 0x97, 0x34, 0x50, 0xcf, - 0x32, 0x1e, 0xc0, 0xbd, 0x7b, 0x35, 0xb5, 0x6a, 0x26, 0xcc, 0xea, 0x4c, - 0x8e, 0x87, 0x4a, 0x67, 0x8b, 0xd3, 0xe5, 0x4f, 0x3a, 0x60, 0x48, 0x59, - 0x04, 0x93, 0x39, 0xd7, 0x7c, 0xfb, 0x19, 0x1a, 0x34, 0xd5, 0xe8, 0xaf, - 0xe7, 0x22, 0x2c, 0x0d, 0xc2, 0x91, 0x69, 0xb6, 0xe9, 0x2a, 0xe9, 0x1c, - 0x4c, 0x6e, 0x8f, 0x40, 0xf5, 0xa8, 0x3e, 
0x82, 0x69, 0x69, 0xbe, 0x9f, - 0x7d, 0x5c, 0x7f, 0x92, 0x78, 0x17, 0xa3, 0x6d, 0x41, 0x2d, 0x72, 0xed, - 0x3f, 0x71, 0xfa, 0x97, 0xb4, 0x63, 0xe4, 0x4f, 0xd9, 0x46, 0x03, 0xfb, - 0x00, 0xeb, 0x30, 0x70, 0xb9, 0x51, 0xd9, 0x0a, 0xd2, 0xf8, 0x50, 0xd4, - 0xfb, 0x43, 0x84, 0xf8, 0xac, 0x58, 0xc3, 0x7b, - }; - static const uint8_t kDModPMinusOne[] = { - 0xf5, 0x50, 0x8f, 0x88, 0x7d, 0xdd, 0xb5, 0xb4, 0x2a, 0x8b, 0xd7, 0x4d, - 0x23, 0xfe, 0xaf, 0xe9, 0x16, 0x22, 0xd2, 0x41, 0xed, 0x88, 0xf2, 0x70, - 0xcb, 0x4d, 0xeb, 0xc1, 0x71, 0x97, 0xc4, 0x0b, 0x3e, 0x5a, 0x2d, 0x96, - 0xab, 0xfa, 0xfd, 0x12, 0x8b, 0xd3, 0x3e, 0x4e, 0x05, 0x6f, 0x04, 0xeb, - 0x59, 0x3c, 0x0e, 0xa1, 0x73, 0xbe, 0x9d, 0x99, 0x2f, 0x05, 0xf9, 0x54, - 0x8d, 0x98, 0x1e, 0x0d, 0xc4, 0x0c, 0xc3, 0x30, 0x23, 0xff, 0xe5, 0xd0, - 0x2b, 0xd5, 0x4e, 0x2b, 0xa0, 0xae, 0xb8, 0x32, 0x84, 0x45, 0x8b, 0x3c, - 0x6d, 0xf0, 0x10, 0x36, 0x9e, 0x6a, 0xc4, 0x67, 0xca, 0xa9, 0xfc, 0x06, - 0x96, 0xd0, 0xbc, 0xda, 0xd1, 0x55, 0x55, 0x8d, 0x77, 0x21, 0xf4, 0x82, - 0x39, 0x37, 0x91, 0xd5, 0x97, 0x56, 0x78, 0xc8, 0x3c, 0xcb, 0x5e, 0xf6, - 0xdc, 0x58, 0x48, 0xb3, 0x7c, 0x94, 0x29, 0x39, - }; - static const uint8_t kDModQMinusOne[] = { - 0x64, 0x65, 0xbd, 0x7d, 0x1a, 0x96, 0x26, 0xa1, 0xfe, 0xf3, 0x94, 0x0d, - 0x5d, 0xec, 0x85, 0xe2, 0xf8, 0xb3, 0x4c, 0xcb, 0xf9, 0x85, 0x8b, 0x12, - 0x9c, 0xa0, 0x32, 0x32, 0x35, 0x92, 0x5a, 0x94, 0x47, 0x1b, 0x70, 0xd2, - 0x90, 0x04, 0x49, 0x01, 0xd8, 0xc5, 0xe4, 0xc4, 0x43, 0xb7, 0xe9, 0x36, - 0xba, 0xbc, 0x73, 0xa8, 0xfb, 0xaf, 0x86, 0xc1, 0xd8, 0x3d, 0xcb, 0xac, - 0xf1, 0xcb, 0x60, 0x7d, 0x27, 0x21, 0xde, 0x64, 0x7f, 0xe8, 0xa8, 0x65, - 0xcc, 0x40, 0x60, 0xff, 0xa0, 0x2b, 0xfc, 0x0f, 0x80, 0x1d, 0x79, 0xca, - 0x58, 0x8a, 0xd6, 0x0f, 0xed, 0x78, 0x9a, 0x02, 0x00, 0x04, 0xc2, 0x53, - 0x41, 0xe8, 0x1a, 0xd0, 0xfd, 0x71, 0x5b, 0x43, 0xac, 0x19, 0x4a, 0xb6, - 0x12, 0xa3, 0xcb, 0xe1, 0xc7, 0x7d, 0x5c, 0x98, 0x74, 0x4e, 0x63, 0x74, - 0x6b, 0x91, 0x7a, 0x29, 0x3b, 0x92, 0xb2, 0x85, - }; - static const 
uint8_t kQInverseModP[] = { - 0xd0, 0xde, 0x19, 0xda, 0x1e, 0xa2, 0xd8, 0x8f, 0x1c, 0x92, 0x73, 0xb0, - 0xc9, 0x90, 0xc7, 0xf5, 0xec, 0xc5, 0x89, 0x01, 0x05, 0x78, 0x11, 0x2d, - 0x74, 0x34, 0x44, 0xad, 0xd5, 0xf7, 0xa4, 0xfe, 0x9f, 0x25, 0x4d, 0x0b, - 0x92, 0xe3, 0xb8, 0x7d, 0xd3, 0xfd, 0xa5, 0xca, 0x95, 0x60, 0xa3, 0xf9, - 0x55, 0x42, 0x14, 0xb2, 0x45, 0x51, 0x9f, 0x73, 0x88, 0x43, 0x8a, 0xd1, - 0x65, 0x9e, 0xd1, 0xf7, 0x82, 0x2a, 0x2a, 0x8d, 0x70, 0x56, 0xe3, 0xef, - 0xc9, 0x0e, 0x2a, 0x2c, 0x15, 0xaf, 0x7f, 0x97, 0x81, 0x66, 0xf3, 0xb5, - 0x00, 0xa9, 0x26, 0xcc, 0x1e, 0xc2, 0x98, 0xdd, 0xd3, 0x37, 0x06, 0x79, - 0xb3, 0x60, 0x58, 0x79, 0x99, 0x3f, 0xa3, 0x15, 0x1f, 0x31, 0xe3, 0x11, - 0x88, 0x4c, 0x35, 0x57, 0xfa, 0x79, 0xd7, 0xd8, 0x72, 0xee, 0x73, 0x95, - 0x89, 0x29, 0xc7, 0x05, 0x27, 0x68, 0x90, 0x15, - }; - - RSA *rsa = RSA_new(); - if (rsa == NULL || - !set_bignum(&rsa->n, kN, sizeof(kN)) || - !set_bignum(&rsa->e, kE, sizeof(kE)) || - !set_bignum(&rsa->d, kD, sizeof(kD)) || - !set_bignum(&rsa->p, kP, sizeof(kP)) || - !set_bignum(&rsa->q, kQ, sizeof(kQ)) || - !set_bignum(&rsa->dmp1, kDModPMinusOne, sizeof(kDModPMinusOne)) || - !set_bignum(&rsa->dmq1, kDModQMinusOne, sizeof(kDModQMinusOne)) || - !set_bignum(&rsa->iqmp, kQInverseModP, sizeof(kQInverseModP))) { - RSA_free(rsa); - return NULL; - } - - return rsa; -} - -static EC_KEY *self_test_ecdsa_key(void) { - static const uint8_t kQx[] = { - 0xc8, 0x15, 0x61, 0xec, 0xf2, 0xe5, 0x4e, 0xde, 0xfe, 0x66, 0x17, - 0xdb, 0x1c, 0x7a, 0x34, 0xa7, 0x07, 0x44, 0xdd, 0xb2, 0x61, 0xf2, - 0x69, 0xb8, 0x3d, 0xac, 0xfc, 0xd2, 0xad, 0xe5, 0xa6, 0x81, - }; - static const uint8_t kQy[] = { - 0xe0, 0xe2, 0xaf, 0xa3, 0xf9, 0xb6, 0xab, 0xe4, 0xc6, 0x98, 0xef, - 0x64, 0x95, 0xf1, 0xbe, 0x49, 0xa3, 0x19, 0x6c, 0x50, 0x56, 0xac, - 0xb3, 0x76, 0x3f, 0xe4, 0x50, 0x7e, 0xec, 0x59, 0x6e, 0x88, - }; - static const uint8_t kD[] = { - 0xc6, 0xc1, 0xaa, 0xda, 0x15, 0xb0, 0x76, 0x61, 0xf8, 0x14, 0x2c, - 0x6c, 0xaf, 0x0f, 0xdb, 0x24, 0x1a, 
0xff, 0x2e, 0xfe, 0x46, 0xc0, - 0x93, 0x8b, 0x74, 0xf2, 0xbc, 0xc5, 0x30, 0x52, 0xb0, 0x77, - }; - - EC_KEY *ec_key = EC_KEY_new_by_curve_name(NID_X9_62_prime256v1); - BIGNUM *qx = BN_bin2bn(kQx, sizeof(kQx), NULL); - BIGNUM *qy = BN_bin2bn(kQy, sizeof(kQy), NULL); - BIGNUM *d = BN_bin2bn(kD, sizeof(kD), NULL); - if (ec_key == NULL || qx == NULL || qy == NULL || d == NULL || - !EC_KEY_set_public_key_affine_coordinates(ec_key, qx, qy) || - !EC_KEY_set_private_key(ec_key, d)) { - EC_KEY_free(ec_key); - ec_key = NULL; - } - - BN_free(qx); - BN_free(qy); - BN_free(d); - return ec_key; -} - -static DH *self_test_dh(void) { - DH *dh = DH_get_rfc7919_2048(); - if (!dh) { - return NULL; - } - - BIGNUM *priv = BN_new(); - if (!priv) { - goto err; - } - - // kFFDHE2048PrivateKeyData is a 225-bit value. (225 because that's the - // minimum private key size in - // https://tools.ietf.org/html/rfc7919#appendix-A.1.) - static const BN_ULONG kFFDHE2048PrivateKeyData[] = { - TOBN(0x187be36b, 0xd38a4fa1), - TOBN(0x0a152f39, 0x6458f3b8), - TOBN(0x0570187e, 0xc422eeb7), - TOBN(0x00000001, 0x91173f2a), - }; - - bn_set_static_words(priv, kFFDHE2048PrivateKeyData, - OPENSSL_ARRAY_SIZE(kFFDHE2048PrivateKeyData)); - - if (!DH_set0_key(dh, NULL, priv)) { - goto err; - } - return dh; - -err: - BN_free(priv); - DH_free(dh); - return NULL; -} - - -// Lazy self-tests -// -// Self tests that are slow are deferred until the corresponding algorithm is -// actually exercised, in FIPS mode. (In non-FIPS mode these tests are only run -// when requested by |BORINGSSL_self_test|.) 
- -static int boringssl_self_test_rsa(void) { - int ret = 0; - uint8_t output[256]; - - RSA *const rsa_key = self_test_rsa_key(); - if (rsa_key == NULL) { - fprintf(stderr, "RSA key construction failed\n"); - goto err; - } - - // RSA Sign KAT - - static const uint8_t kRSASignDigest[32] = { - 0xd2, 0xb5, 0x6e, 0x53, 0x30, 0x6f, 0x72, 0x0d, 0x79, 0x29, 0xd8, - 0x70, 0x8b, 0xf4, 0x6f, 0x1c, 0x22, 0x30, 0x03, 0x05, 0x58, 0x2b, - 0x11, 0x5b, 0xed, 0xca, 0xc7, 0x22, 0xd8, 0xaa, 0x5a, 0xb2, - }; - static const uint8_t kRSASignSignature[256] = { - 0x64, 0xce, 0xdd, 0x91, 0x27, 0xb0, 0x4f, 0xb9, 0x14, 0xea, 0xc0, 0xb4, - 0xa2, 0x06, 0xc5, 0xd8, 0x40, 0x0f, 0x6c, 0x54, 0xac, 0xf7, 0x02, 0xde, - 0x26, 0xbb, 0xfd, 0x33, 0xe5, 0x2f, 0x4d, 0xb1, 0x53, 0xc4, 0xff, 0xd0, - 0x5f, 0xea, 0x15, 0x89, 0x83, 0x4c, 0xe3, 0x80, 0x0b, 0xe9, 0x13, 0x82, - 0x1d, 0x71, 0x92, 0x1a, 0x03, 0x60, 0x2c, 0xaf, 0xe2, 0x16, 0xc7, 0x43, - 0x3f, 0xde, 0x6b, 0x94, 0xfd, 0x6e, 0x08, 0x7b, 0x11, 0xf1, 0x34, 0x52, - 0xe5, 0xc0, 0x97, 0x66, 0x4a, 0xe0, 0x91, 0x45, 0xc8, 0xb1, 0x3d, 0x6a, - 0x54, 0xc1, 0x32, 0x0f, 0x32, 0xad, 0x25, 0x11, 0x3e, 0x49, 0xad, 0x41, - 0xce, 0x7b, 0xca, 0x95, 0x6b, 0x54, 0x5e, 0x86, 0x1b, 0xce, 0xfa, 0x2a, - 0x60, 0xe8, 0xfa, 0xbb, 0x23, 0xb2, 0x41, 0xbc, 0x7c, 0x98, 0xec, 0x73, - 0x20, 0xed, 0xb3, 0xcf, 0xab, 0x07, 0x24, 0x85, 0x6a, 0x2a, 0x61, 0x76, - 0x28, 0xf8, 0x00, 0x80, 0xeb, 0xd9, 0x3a, 0x63, 0xe2, 0x01, 0xb1, 0xee, - 0x6d, 0xe9, 0x73, 0xe9, 0xb6, 0x75, 0x2e, 0xf9, 0x81, 0xd9, 0xa8, 0x79, - 0xf6, 0x8f, 0xe3, 0x02, 0x7d, 0xf6, 0xea, 0xdc, 0x35, 0xe4, 0x62, 0x0d, - 0x91, 0xba, 0x3e, 0x7d, 0x8b, 0x82, 0xbf, 0x15, 0x74, 0x6a, 0x4e, 0x29, - 0xf8, 0x9b, 0x2c, 0x94, 0x8d, 0xa7, 0x00, 0x4d, 0x7b, 0xbf, 0x35, 0x07, - 0xeb, 0xdd, 0x10, 0xef, 0xd5, 0x2f, 0xe6, 0x98, 0x4b, 0x7e, 0x24, 0x80, - 0xe2, 0x01, 0xf2, 0x66, 0xb7, 0xd3, 0x93, 0xfe, 0x2a, 0xb3, 0x74, 0xed, - 0xec, 0x4b, 0xb1, 0x5f, 0x5f, 0xee, 0x85, 0x44, 0xa7, 0x26, 0xdf, 0xc1, - 0x2e, 0x7a, 0xf3, 0xa5, 0x8f, 0xf8, 0x64, 0xda, 
0x65, 0xad, 0x91, 0xe2, - 0x90, 0x94, 0x20, 0x16, 0xb8, 0x61, 0xa5, 0x0a, 0x7d, 0xb4, 0xbf, 0xc0, - 0x10, 0xaf, 0x72, 0x67, - }; - - unsigned sig_len; - if (!rsa_sign_no_self_test(NID_sha256, kRSASignDigest, sizeof(kRSASignDigest), - output, &sig_len, rsa_key) || - !check_test(kRSASignSignature, output, sizeof(kRSASignSignature), - "RSA-sign KAT")) { - fprintf(stderr, "RSA signing test failed.\n"); - goto err; - } - - // RSA Verify KAT - - static const uint8_t kRSAVerifyDigest[32] = { - 0x09, 0x65, 0x2f, 0xd8, 0xed, 0x9d, 0xc2, 0x6d, 0xbc, 0xbf, 0xf2, - 0xa7, 0xa5, 0xed, 0xe1, 0x37, 0x13, 0x78, 0x21, 0x36, 0xcf, 0x8d, - 0x22, 0x3d, 0xab, 0x93, 0xb4, 0x12, 0xa8, 0xb5, 0x15, 0x53, - }; - static const uint8_t kRSAVerifySignature[256] = { - 0xab, 0xe2, 0xcb, 0xc1, 0x3d, 0x6b, 0xd3, 0x9d, 0x48, 0xdb, 0x53, 0x34, - 0xdd, 0xbf, 0x8d, 0x07, 0x0a, 0x93, 0xbd, 0xcb, 0x10, 0x4e, 0x2c, 0xc5, - 0xd0, 0xee, 0x48, 0x6e, 0xe2, 0x95, 0xf6, 0xb3, 0x1b, 0xda, 0x12, 0x6c, - 0x41, 0x89, 0x0b, 0x98, 0xb7, 0x3e, 0x70, 0xe6, 0xb6, 0x5d, 0x82, 0xf9, - 0x5c, 0x66, 0x31, 0x21, 0x75, 0x5a, 0x90, 0x74, 0x4c, 0x8d, 0x1c, 0x21, - 0x14, 0x8a, 0x19, 0x60, 0xbe, 0x0e, 0xca, 0x44, 0x6e, 0x9f, 0xf4, 0x97, - 0xf1, 0x34, 0x5c, 0x53, 0x7e, 0xf8, 0x11, 0x9b, 0x9a, 0x43, 0x98, 0xe9, - 0x5c, 0x5c, 0x6d, 0xe2, 0xb1, 0xc9, 0x55, 0x90, 0x5c, 0x52, 0x99, 0xd8, - 0xce, 0x7a, 0x3b, 0x6a, 0xb7, 0x63, 0x80, 0xd9, 0xba, 0xbd, 0xd1, 0x5f, - 0x61, 0x02, 0x37, 0xe1, 0xf3, 0xf2, 0xaa, 0x1c, 0x1f, 0x1e, 0x77, 0x0b, - 0x62, 0xfb, 0xb5, 0x96, 0x38, 0x1b, 0x2e, 0xbd, 0xd7, 0x7e, 0xce, 0xf9, - 0xc9, 0x0d, 0x4c, 0x92, 0xf7, 0xb6, 0xb0, 0x5f, 0xed, 0x29, 0x36, 0x28, - 0x5f, 0xa9, 0x48, 0x26, 0xe6, 0x20, 0x55, 0x32, 0x2a, 0x33, 0xb6, 0xf0, - 0x4c, 0x74, 0xce, 0x69, 0xe5, 0xd8, 0xd7, 0x37, 0xfb, 0x83, 0x8b, 0x79, - 0xd2, 0xd4, 0x8e, 0x3d, 0xaf, 0x71, 0x38, 0x75, 0x31, 0x88, 0x25, 0x31, - 0xa9, 0x5a, 0xc9, 0x64, 0xd0, 0x2e, 0xa4, 0x13, 0xbf, 0x85, 0x95, 0x29, - 0x82, 0xbb, 0xc0, 0x89, 0x52, 0x7d, 0xaf, 0xf5, 0xb8, 0x45, 0xc9, 
0xa0, - 0xf4, 0xd1, 0x4e, 0xf1, 0x95, 0x6d, 0x9c, 0x3a, 0xca, 0xe8, 0x82, 0xd1, - 0x2d, 0xa6, 0x6d, 0xa0, 0xf3, 0x57, 0x94, 0xf5, 0xee, 0x32, 0x23, 0x23, - 0x33, 0x51, 0x7d, 0xb9, 0x31, 0x52, 0x32, 0xa1, 0x83, 0xb9, 0x91, 0x65, - 0x4d, 0xbe, 0xa4, 0x16, 0x15, 0x34, 0x5c, 0x88, 0x53, 0x25, 0x92, 0x67, - 0x44, 0xa5, 0x39, 0x15, - }; - if (!rsa_verify_no_self_test(NID_sha256, kRSAVerifyDigest, - sizeof(kRSAVerifyDigest), kRSAVerifySignature, - sizeof(kRSAVerifySignature), rsa_key)) { - fprintf(stderr, "RSA-verify KAT failed.\n"); - goto err; - } - - ret = 1; - -err: - RSA_free(rsa_key); - - return ret; -} - -static int boringssl_self_test_ecc(void) { - int ret = 0; - EC_KEY *ec_key = NULL; - EC_GROUP *ec_group = NULL; - EC_POINT *ec_point_in = NULL; - EC_POINT *ec_point_out = NULL; - BIGNUM *ec_scalar = NULL; - ECDSA_SIG *sig = NULL; - - ec_key = self_test_ecdsa_key(); - if (ec_key == NULL) { - fprintf(stderr, "ECDSA KeyGen failed\n"); - goto err; - } - - // ECDSA Sign/Verify KAT - - static const uint8_t kECDSASignDigest[32] = { - 0x1e, 0x35, 0x93, 0x0b, 0xe8, 0x60, 0xd0, 0x94, 0x2c, 0xa7, 0xbb, - 0xd6, 0xf6, 0xde, 0xd8, 0x7f, 0x15, 0x7e, 0x4d, 0xe2, 0x4f, 0x81, - 0xed, 0x4b, 0x87, 0x5c, 0x0e, 0x01, 0x8e, 0x89, 0xa8, 0x1f, - }; - static const uint8_t kECDSASignSig[64] = { - 0x67, 0x80, 0xc5, 0xfc, 0x70, 0x27, 0x5e, 0x2c, 0x70, 0x61, 0xa0, - 0xe7, 0x87, 0x7b, 0xb1, 0x74, 0xde, 0xad, 0xeb, 0x98, 0x87, 0x02, - 0x7f, 0x3f, 0xa8, 0x36, 0x54, 0x15, 0x8b, 0xa7, 0xf5, 0x0c, 0x68, - 0x04, 0x73, 0x40, 0x94, 0xb2, 0xd1, 0x90, 0xac, 0x2d, 0x0c, 0xd7, - 0xa5, 0x7f, 0x2f, 0x2e, 0xb2, 0x62, 0xb0, 0x09, 0x16, 0xe1, 0xa6, - 0x70, 0xb5, 0xbb, 0x0d, 0xfd, 0x8e, 0x0c, 0x02, 0x3f, - }; - - // The 'k' value for ECDSA is fixed to avoid an entropy draw. 
- uint8_t ecdsa_k[32] = {0}; - ecdsa_k[31] = 42; - - sig = ecdsa_sign_with_nonce_for_known_answer_test( - kECDSASignDigest, sizeof(kECDSASignDigest), ec_key, ecdsa_k, - sizeof(ecdsa_k)); - - uint8_t ecdsa_sign_output[64]; - if (sig == NULL || - !serialize_ecdsa_sig(ecdsa_sign_output, sizeof(ecdsa_sign_output), sig) || - !check_test(kECDSASignSig, ecdsa_sign_output, sizeof(ecdsa_sign_output), - "ECDSA-sign signature")) { - fprintf(stderr, "ECDSA-sign KAT failed.\n"); - goto err; - } - - static const uint8_t kECDSAVerifyDigest[32] = { - 0x78, 0x7c, 0x50, 0x5c, 0x60, 0xc9, 0xe4, 0x13, 0x6c, 0xe4, 0x48, - 0xba, 0x93, 0xff, 0x71, 0xfa, 0x9c, 0x18, 0xf4, 0x17, 0x09, 0x4f, - 0xdf, 0x5a, 0xe2, 0x75, 0xc0, 0xcc, 0xd2, 0x67, 0x97, 0xad, - }; - static const uint8_t kECDSAVerifySig[64] = { - 0x67, 0x80, 0xc5, 0xfc, 0x70, 0x27, 0x5e, 0x2c, 0x70, 0x61, 0xa0, - 0xe7, 0x87, 0x7b, 0xb1, 0x74, 0xde, 0xad, 0xeb, 0x98, 0x87, 0x02, - 0x7f, 0x3f, 0xa8, 0x36, 0x54, 0x15, 0x8b, 0xa7, 0xf5, 0x0c, 0x2d, - 0x36, 0xe5, 0x79, 0x97, 0x90, 0xbf, 0xbe, 0x21, 0x83, 0xd3, 0x3e, - 0x96, 0xf3, 0xc5, 0x1f, 0x6a, 0x23, 0x2f, 0x2a, 0x24, 0x48, 0x8c, - 0x8e, 0x5f, 0x64, 0xc3, 0x7e, 0xa2, 0xcf, 0x05, 0x29, - }; - - ECDSA_SIG_free(sig); - sig = parse_ecdsa_sig(kECDSAVerifySig, sizeof(kECDSAVerifySig)); - if (!sig || - !ecdsa_do_verify_no_self_test(kECDSAVerifyDigest, - sizeof(kECDSAVerifyDigest), sig, ec_key)) { - fprintf(stderr, "ECDSA-verify KAT failed.\n"); - goto err; - } - - // Primitive Z Computation KAT (IG 9.6). - - // kP256Point is SHA256("Primitive Z Computation KAT")×G within P-256. 
- static const uint8_t kP256Point[65] = { - 0x04, 0x4e, 0xc1, 0x94, 0x8c, 0x5c, 0xf4, 0x37, 0x35, 0x0d, 0xa3, - 0xf9, 0x55, 0xf9, 0x8b, 0x26, 0x23, 0x5c, 0x43, 0xe0, 0x83, 0x51, - 0x2b, 0x0d, 0x4b, 0x56, 0x24, 0xc3, 0xe4, 0xa5, 0xa8, 0xe2, 0xe9, - 0x95, 0xf2, 0xc4, 0xb9, 0xb7, 0x48, 0x7d, 0x2a, 0xae, 0xc5, 0xc0, - 0x0a, 0xcc, 0x1b, 0xd0, 0xec, 0xb8, 0xdc, 0xbe, 0x0c, 0xbe, 0x52, - 0x79, 0x93, 0x7c, 0x0b, 0x92, 0x2b, 0x7f, 0x17, 0xa5, 0x80, - }; - // kP256Scalar is SHA256("Primitive Z Computation KAT scalar"). - static const uint8_t kP256Scalar[32] = { - 0xe7, 0x60, 0x44, 0x91, 0x26, 0x9a, 0xfb, 0x5b, 0x10, 0x2d, 0x6e, - 0xa5, 0x2c, 0xb5, 0x9f, 0xeb, 0x70, 0xae, 0xde, 0x6c, 0xe3, 0xbf, - 0xb3, 0xe0, 0x10, 0x54, 0x85, 0xab, 0xd8, 0x61, 0xd7, 0x7b, - }; - // kP256PointResult is |kP256Scalar|×|kP256Point|. - static const uint8_t kP256PointResult[65] = { - 0x04, 0xf1, 0x63, 0x00, 0x88, 0xc5, 0xd5, 0xe9, 0x05, 0x52, 0xac, - 0xb6, 0xec, 0x68, 0x76, 0xb8, 0x73, 0x7f, 0x0f, 0x72, 0x34, 0xe6, - 0xbb, 0x30, 0x32, 0x22, 0x37, 0xb6, 0x2a, 0x80, 0xe8, 0x9e, 0x6e, - 0x6f, 0x36, 0x02, 0xe7, 0x21, 0xd2, 0x31, 0xdb, 0x94, 0x63, 0xb7, - 0xd8, 0x19, 0x0e, 0xc2, 0xc0, 0xa7, 0x2f, 0x15, 0x49, 0x1a, 0xa2, - 0x7c, 0x41, 0x8f, 0xaf, 0x9c, 0x40, 0xaf, 0x2e, 0x4a, 0x0c, - }; - - ec_group = EC_GROUP_new_by_curve_name(NID_X9_62_prime256v1); - if (ec_group == NULL) { - fprintf(stderr, "Failed to create P-256 group.\n"); - goto err; - } - ec_point_in = EC_POINT_new(ec_group); - ec_point_out = EC_POINT_new(ec_group); - ec_scalar = BN_new(); - uint8_t z_comp_result[65]; - if (ec_point_in == NULL || ec_point_out == NULL || ec_scalar == NULL || - !EC_POINT_oct2point(ec_group, ec_point_in, kP256Point, sizeof(kP256Point), - NULL) || - !BN_bin2bn(kP256Scalar, sizeof(kP256Scalar), ec_scalar) || - !ec_point_mul_no_self_test(ec_group, ec_point_out, NULL, ec_point_in, - ec_scalar, NULL) || - !EC_POINT_point2oct(ec_group, ec_point_out, POINT_CONVERSION_UNCOMPRESSED, - z_comp_result, sizeof(z_comp_result), 
NULL) || - !check_test(kP256PointResult, z_comp_result, sizeof(z_comp_result), - "Z Computation Result")) { - fprintf(stderr, "Z-computation KAT failed.\n"); - goto err; - } - - ret = 1; - -err: - EC_KEY_free(ec_key); - EC_POINT_free(ec_point_in); - EC_POINT_free(ec_point_out); - EC_GROUP_free(ec_group); - BN_free(ec_scalar); - ECDSA_SIG_free(sig); - - return ret; -} - -static int boringssl_self_test_ffdh(void) { - int ret = 0; - DH *dh = NULL; - BIGNUM *ffdhe2048_value = NULL; - - // FFC Diffie-Hellman KAT - - // kFFDHE2048PublicValueData is an arbitrary public value, mod - // kFFDHE2048Data. (The private key happens to be 4096.) - static const BN_ULONG kFFDHE2048PublicValueData[] = { - TOBN(0x187be36b, 0xd38a4fa1), TOBN(0x0a152f39, 0x6458f3b8), - TOBN(0x0570187e, 0xc422eeb7), TOBN(0x18af7482, 0x91173f2a), - TOBN(0xe9fdac6a, 0xcff4eaaa), TOBN(0xf6afebb7, 0x6e589d6c), - TOBN(0xf92f8e9a, 0xb7e33fb0), TOBN(0x70acf2aa, 0x4cf36ddd), - TOBN(0x561ab426, 0xd07137fd), TOBN(0x5f57d037, 0x430ee91e), - TOBN(0xe3e768c8, 0x60d10b8a), TOBN(0xb14884d8, 0xa18af8ce), - TOBN(0xf8a98014, 0xa12b74e4), TOBN(0x748d407c, 0x3437b7a8), - TOBN(0x627588c4, 0x9875d5a7), TOBN(0xdd24a127, 0x53c8f09d), - TOBN(0x85a997d5, 0x0cd51aec), TOBN(0x44f0c619, 0xce348458), - TOBN(0x9b894b24, 0x5f6b69a1), TOBN(0xae1302f2, 0xf6d4777e), - TOBN(0xe6678eeb, 0x375db18e), TOBN(0x2674e1d6, 0x4fbcbdc8), - TOBN(0xb297a823, 0x6fa93d28), TOBN(0x6a12fb70, 0x7c8c0510), - TOBN(0x5c6d1aeb, 0xdb06f65b), TOBN(0xe8c2954e, 0x4c1804ca), - TOBN(0x06bdeac1, 0xf5500fa7), TOBN(0x6a315604, 0x189cd76b), - TOBN(0xbae7b0b3, 0x6e362dc0), TOBN(0xa57c73bd, 0xdc70fb82), - TOBN(0xfaff50d2, 0x9d573457), TOBN(0x352bd399, 0xbe84058e), - }; - static const uint8_t kDHOutput[2048 / 8] = { - 0x2a, 0xe6, 0xd3, 0xa6, 0x13, 0x58, 0x8e, 0xce, 0x53, 0xaa, 0xf6, 0x5d, - 0x9a, 0xae, 0x02, 0x12, 0xf5, 0x80, 0x3d, 0x06, 0x09, 0x76, 0xac, 0x57, - 0x37, 0x9e, 0xab, 0x38, 0x62, 0x25, 0x05, 0x1d, 0xf3, 0xa9, 0x39, 0x60, - 0xf6, 0xae, 0x90, 0xed, 0x1e, 0xad, 
0x6e, 0xe9, 0xe3, 0xba, 0x27, 0xf6, - 0xdb, 0x54, 0xdf, 0xe2, 0xbd, 0xbb, 0x7f, 0xf1, 0x81, 0xac, 0x1a, 0xfa, - 0xdb, 0x87, 0x07, 0x98, 0x76, 0x90, 0x21, 0xf2, 0xae, 0xda, 0x0d, 0x84, - 0x97, 0x64, 0x0b, 0xbf, 0xb8, 0x8d, 0x10, 0x46, 0xe2, 0xd5, 0xca, 0x1b, - 0xbb, 0xe5, 0x37, 0xb2, 0x3b, 0x35, 0xd3, 0x1b, 0x65, 0xea, 0xae, 0xf2, - 0x03, 0xe2, 0xb6, 0xde, 0x22, 0xb7, 0x86, 0x49, 0x79, 0xfe, 0xd7, 0x16, - 0xf7, 0xdc, 0x9c, 0x59, 0xf5, 0xb7, 0x70, 0xc0, 0x53, 0x42, 0x6f, 0xb1, - 0xd2, 0x4e, 0x00, 0x25, 0x4b, 0x2d, 0x5a, 0x9b, 0xd0, 0xe9, 0x27, 0x43, - 0xcc, 0x00, 0x66, 0xea, 0x94, 0x7a, 0x0b, 0xb9, 0x89, 0x0c, 0x5e, 0x94, - 0xb8, 0x3a, 0x78, 0x9c, 0x4d, 0x84, 0xe6, 0x32, 0x2c, 0x38, 0x7c, 0xf7, - 0x43, 0x9c, 0xd8, 0xb8, 0x1c, 0xce, 0x24, 0x91, 0x20, 0x67, 0x7a, 0x54, - 0x1f, 0x7e, 0x86, 0x7f, 0xa1, 0xc1, 0x03, 0x4e, 0x2c, 0x26, 0x71, 0xb2, - 0x06, 0x30, 0xb3, 0x6c, 0x15, 0xcc, 0xac, 0x25, 0xe5, 0x37, 0x3f, 0x24, - 0x8f, 0x2a, 0x89, 0x5e, 0x3d, 0x43, 0x94, 0xc9, 0x36, 0xae, 0x40, 0x00, - 0x6a, 0x0d, 0xb0, 0x6e, 0x8b, 0x2e, 0x70, 0x57, 0xe1, 0x88, 0x53, 0xd6, - 0x06, 0x80, 0x2a, 0x4e, 0x5a, 0xf0, 0x1e, 0xaa, 0xcb, 0xab, 0x06, 0x0e, - 0x27, 0x0f, 0xd9, 0x88, 0xd9, 0x01, 0xe3, 0x07, 0xeb, 0xdf, 0xc3, 0x12, - 0xe3, 0x40, 0x88, 0x7b, 0x5f, 0x59, 0x78, 0x6e, 0x26, 0x20, 0xc3, 0xdf, - 0xc8, 0xe4, 0x5e, 0xb8, - }; - - ffdhe2048_value = BN_new(); - if (ffdhe2048_value) { - bn_set_static_words(ffdhe2048_value, kFFDHE2048PublicValueData, - OPENSSL_ARRAY_SIZE(kFFDHE2048PublicValueData)); - } - - dh = self_test_dh(); - uint8_t dh_out[sizeof(kDHOutput)]; - if (dh == NULL || ffdhe2048_value == NULL || sizeof(dh_out) != DH_size(dh) || - dh_compute_key_padded_no_self_test(dh_out, ffdhe2048_value, dh) != - sizeof(dh_out) || - !check_test(kDHOutput, dh_out, sizeof(dh_out), "FFC DH")) { - fprintf(stderr, "FFDH failed.\n"); - goto err; - } - - ret = 1; - -err: - DH_free(dh); - BN_free(ffdhe2048_value); - - return ret; -} - -#if defined(BORINGSSL_FIPS) - -static void 
run_self_test_rsa(void) { - FIPS_service_indicator_lock_state(); - if (!boringssl_self_test_rsa()) { - BORINGSSL_FIPS_abort(); - } - FIPS_service_indicator_unlock_state(); -} - -DEFINE_STATIC_ONCE(g_self_test_once_rsa); - -void boringssl_ensure_rsa_self_test(void) { - CRYPTO_once(g_self_test_once_rsa_bss_get(), run_self_test_rsa); -} - -static void run_self_test_ecc(void) { - FIPS_service_indicator_lock_state(); - if (!boringssl_self_test_ecc()) { - BORINGSSL_FIPS_abort(); - } - FIPS_service_indicator_unlock_state(); -} - -DEFINE_STATIC_ONCE(g_self_test_once_ecc); - -void boringssl_ensure_ecc_self_test(void) { - CRYPTO_once(g_self_test_once_ecc_bss_get(), run_self_test_ecc); -} - -static void run_self_test_ffdh(void) { - FIPS_service_indicator_lock_state(); - if (!boringssl_self_test_ffdh()) { - BORINGSSL_FIPS_abort(); - } - FIPS_service_indicator_unlock_state(); -} - -DEFINE_STATIC_ONCE(g_self_test_once_ffdh); - -void boringssl_ensure_ffdh_self_test(void) { - CRYPTO_once(g_self_test_once_ffdh_bss_get(), run_self_test_ffdh); -} - -#endif // BORINGSSL_FIPS - - -// Startup self tests. -// -// These tests are run at process start when in FIPS mode. 
- -int boringssl_self_test_sha256(void) { - static const uint8_t kInput[16] = { - 0xff, 0x3b, 0x85, 0x7d, 0xa7, 0x23, 0x6a, 0x2b, - 0xaa, 0x0f, 0x39, 0x6b, 0x51, 0x52, 0x22, 0x17, - }; - static const uint8_t kPlaintextSHA256[32] = { - 0x7f, 0xe4, 0xd5, 0xf1, 0xa1, 0xe3, 0x82, 0x87, 0xd9, 0x58, 0xf5, - 0x11, 0xc7, 0x1d, 0x5e, 0x27, 0x5e, 0xcc, 0xd2, 0x66, 0xcf, 0xb9, - 0xc8, 0xc6, 0x60, 0xd8, 0x92, 0x1e, 0x57, 0xfd, 0x46, 0x75, - }; - uint8_t output[SHA256_DIGEST_LENGTH]; - - // SHA-256 KAT - SHA256(kInput, sizeof(kInput), output); - return check_test(kPlaintextSHA256, output, sizeof(kPlaintextSHA256), - "SHA-256 KAT"); -} - -int boringssl_self_test_sha512(void) { - static const uint8_t kInput[16] = { - 0x21, 0x25, 0x12, 0xf8, 0xd2, 0xad, 0x83, 0x22, - 0x78, 0x1c, 0x6c, 0x4d, 0x69, 0xa9, 0xda, 0xa1, - }; - static const uint8_t kPlaintextSHA512[64] = { - 0x29, 0x3c, 0x94, 0x35, 0x4e, 0x98, 0x83, 0xe5, 0xc2, 0x78, 0x36, - 0x7a, 0xe5, 0x18, 0x90, 0xbf, 0x35, 0x41, 0x01, 0x64, 0x19, 0x8d, - 0x26, 0xeb, 0xe1, 0xf8, 0x2f, 0x04, 0x8e, 0xfa, 0x8b, 0x2b, 0xc6, - 0xb2, 0x9d, 0x5d, 0x46, 0x76, 0x5a, 0xc8, 0xb5, 0x25, 0xa3, 0xea, - 0x52, 0x84, 0x47, 0x6d, 0x6d, 0xf4, 0xc9, 0x71, 0xf3, 0x3d, 0x89, - 0x4c, 0x3b, 0x20, 0x8c, 0x5b, 0x75, 0xe8, 0xf8, 0x7c, - }; - uint8_t output[SHA512_DIGEST_LENGTH]; - - // SHA-512 KAT - SHA512(kInput, sizeof(kInput), output); - return check_test(kPlaintextSHA512, output, sizeof(kPlaintextSHA512), - "SHA-512 KAT"); -} - -int boringssl_self_test_hmac_sha256(void) { - static const uint8_t kInput[16] = { - 0xda, 0xd9, 0x12, 0x93, 0xdf, 0xcf, 0x2a, 0x7c, - 0x8e, 0xcd, 0x13, 0xfe, 0x35, 0x3f, 0xa7, 0x5b, - }; - static const uint8_t kPlaintextHMACSHA256[32] = { - 0x36, 0x5f, 0x5b, 0xd5, 0xf5, 0xeb, 0xfd, 0xc7, 0x6e, 0x53, 0xa5, - 0x73, 0x6d, 0x73, 0x20, 0x13, 0xaa, 0xd3, 0xbc, 0x86, 0x4b, 0xb8, - 0x84, 0x94, 0x16, 0x46, 0x88, 0x9c, 0x48, 0xee, 0xa9, 0x0e, - }; - uint8_t output[EVP_MAX_MD_SIZE]; - - unsigned output_len; - HMAC(EVP_sha256(), kInput, 
sizeof(kInput), kInput, sizeof(kInput), output, - &output_len); - return output_len == sizeof(kPlaintextHMACSHA256) && - check_test(kPlaintextHMACSHA256, output, sizeof(kPlaintextHMACSHA256), - "HMAC-SHA-256 KAT"); -} - -static int boringssl_self_test_fast(void) { - static const uint8_t kAESKey[16] = "BoringCrypto Key"; - static const uint8_t kAESIV[16] = {0}; - - EVP_AEAD_CTX aead_ctx; - EVP_AEAD_CTX_zero(&aead_ctx); - int ret = 0; - - AES_KEY aes_key; - uint8_t aes_iv[16]; - uint8_t output[256]; - - // AES-CBC Encryption KAT - static const uint8_t kAESCBCEncPlaintext[32] = { - 0x07, 0x86, 0x09, 0xa6, 0xc5, 0xac, 0x25, 0x44, 0x69, 0x9a, 0xdf, - 0x68, 0x2f, 0xa3, 0x77, 0xf9, 0xbe, 0x8a, 0xb6, 0xae, 0xf5, 0x63, - 0xe8, 0xc5, 0x6a, 0x36, 0xb8, 0x4f, 0x55, 0x7f, 0xad, 0xd3, - }; - static const uint8_t kAESCBCEncCiphertext[sizeof(kAESCBCEncPlaintext)] = { - 0x56, 0x46, 0xc1, 0x41, 0xf4, 0x13, 0xd6, 0xff, 0x62, 0x92, 0x41, - 0x7a, 0x26, 0xc6, 0x86, 0xbd, 0x30, 0x5f, 0xb6, 0x57, 0xa7, 0xd2, - 0x50, 0x3a, 0xc5, 0x5e, 0x8e, 0x93, 0x40, 0xf2, 0x10, 0xd8, - }; - memcpy(aes_iv, kAESIV, sizeof(kAESIV)); - if (AES_set_encrypt_key(kAESKey, 8 * sizeof(kAESKey), &aes_key) != 0) { - fprintf(stderr, "AES_set_encrypt_key failed.\n"); - goto err; - } - AES_cbc_encrypt(kAESCBCEncPlaintext, output, sizeof(kAESCBCEncPlaintext), - &aes_key, aes_iv, AES_ENCRYPT); - if (!check_test(kAESCBCEncCiphertext, output, sizeof(kAESCBCEncCiphertext), - "AES-CBC-encrypt KAT")) { - goto err; - } - - // AES-CBC Decryption KAT - static const uint8_t kAESCBCDecCiphertext[32] = { - 0x34, 0x7a, 0xa5, 0xa0, 0x24, 0xb2, 0x82, 0x57, 0xb3, 0x65, 0x10, - 0xbe, 0x58, 0x3d, 0x4f, 0x47, 0xad, 0xb7, 0xbb, 0xee, 0xdc, 0x60, - 0x05, 0xbb, 0xbd, 0x0d, 0x0a, 0x9f, 0x06, 0xbb, 0x7b, 0x10, - }; - static const uint8_t kAESCBCDecPlaintext[sizeof(kAESCBCDecCiphertext)] = { - 0x51, 0xa7, 0xa0, 0x1f, 0x6b, 0x79, 0x6c, 0xcd, 0x48, 0x03, 0xa1, - 0x41, 0xdc, 0x56, 0xa6, 0xc2, 0x16, 0xb5, 0xd1, 0xd3, 0xb7, 0x06, - 0xb2, 0x25, 
0x6f, 0xa6, 0xd0, 0xd2, 0x0e, 0x6f, 0x19, 0xb5, - }; - memcpy(aes_iv, kAESIV, sizeof(kAESIV)); - if (AES_set_decrypt_key(kAESKey, 8 * sizeof(kAESKey), &aes_key) != 0) { - fprintf(stderr, "AES_set_decrypt_key failed.\n"); - goto err; - } - AES_cbc_encrypt(kAESCBCDecCiphertext, output, sizeof(kAESCBCDecCiphertext), - &aes_key, aes_iv, AES_DECRYPT); - if (!check_test(kAESCBCDecPlaintext, output, sizeof(kAESCBCDecPlaintext), - "AES-CBC-decrypt KAT")) { - goto err; - } - - size_t out_len; - uint8_t nonce[EVP_AEAD_MAX_NONCE_LENGTH]; - OPENSSL_memset(nonce, 0, sizeof(nonce)); - if (!EVP_AEAD_CTX_init(&aead_ctx, EVP_aead_aes_128_gcm(), kAESKey, - sizeof(kAESKey), 0, NULL)) { - fprintf(stderr, "EVP_AEAD_CTX_init for AES-128-GCM failed.\n"); - goto err; - } - - // AES-GCM Encryption KAT - static const uint8_t kAESGCMEncPlaintext[32] = { - 0x8f, 0xcc, 0x40, 0x99, 0x80, 0x8e, 0x75, 0xca, 0xaf, 0xf5, 0x82, - 0x89, 0x88, 0x48, 0xa8, 0x8d, 0x80, 0x8b, 0x55, 0xab, 0x4e, 0x93, - 0x70, 0x79, 0x7d, 0x94, 0x0b, 0xe8, 0xcc, 0x1d, 0x78, 0x84, - }; - static const uint8_t kAESGCMCiphertext[sizeof(kAESGCMEncPlaintext) + 16] = { - 0x87, 0x7b, 0xd5, 0x8d, 0x96, 0x3e, 0x4b, 0xe6, 0x64, 0x94, 0x40, 0x2f, - 0x61, 0x9b, 0x7e, 0x56, 0x52, 0x7d, 0xa4, 0x5a, 0xf9, 0xa6, 0xe2, 0xdb, - 0x1c, 0x63, 0x2e, 0x97, 0x93, 0x0f, 0xfb, 0xed, 0xb5, 0x9e, 0x1c, 0x20, - 0xb2, 0xb0, 0x58, 0xda, 0x48, 0x07, 0x2d, 0xbd, 0x96, 0x0d, 0x34, 0xc6, - }; - if (!EVP_AEAD_CTX_seal(&aead_ctx, output, &out_len, sizeof(output), nonce, - EVP_AEAD_nonce_length(EVP_aead_aes_128_gcm()), - kAESGCMEncPlaintext, sizeof(kAESGCMEncPlaintext), NULL, - 0) || - !check_test(kAESGCMCiphertext, output, sizeof(kAESGCMCiphertext), - "AES-GCM-encrypt KAT")) { - fprintf(stderr, "EVP_AEAD_CTX_seal for AES-128-GCM failed.\n"); - goto err; - } - - // AES-GCM Decryption KAT - static const uint8_t kAESGCMDecCiphertext[48] = { - 0x35, 0xf3, 0x05, 0x8f, 0x87, 0x57, 0x60, 0xff, 0x09, 0xd3, 0x12, 0x0f, - 0x70, 0xc4, 0xbc, 0x9e, 0xd7, 0xa8, 0x68, 0x72, 
0xe1, 0x34, 0x52, 0x20, - 0x21, 0x76, 0xf7, 0x37, 0x1a, 0xe0, 0x4f, 0xaa, 0xe1, 0xdd, 0x39, 0x19, - 0x20, 0xf5, 0xd1, 0x39, 0x53, 0xd8, 0x96, 0x78, 0x59, 0x94, 0x82, 0x3c, - }; - static const uint8_t kAESGCMDecPlaintext[sizeof(kAESGCMDecCiphertext) - 16] = - { - 0x3d, 0x44, 0x90, 0x9b, 0x91, 0xe7, 0x5e, 0xd3, 0xc2, 0xb2, 0xd0, - 0xa9, 0x99, 0x17, 0x6a, 0x45, 0x05, 0x5e, 0x99, 0x83, 0x56, 0x01, - 0xc0, 0x82, 0x40, 0x81, 0xd2, 0x48, 0x45, 0xf2, 0xcc, 0xc3, - }; - if (!EVP_AEAD_CTX_open(&aead_ctx, output, &out_len, sizeof(output), nonce, - EVP_AEAD_nonce_length(EVP_aead_aes_128_gcm()), - kAESGCMDecCiphertext, sizeof(kAESGCMDecCiphertext), - NULL, 0) || - !check_test(kAESGCMDecPlaintext, output, sizeof(kAESGCMDecPlaintext), - "AES-GCM-decrypt KAT")) { - fprintf(stderr, - "AES-GCM-decrypt KAT failed because EVP_AEAD_CTX_open failed.\n"); - goto err; - } - - // SHA-1 KAT - static const uint8_t kSHA1Input[16] = { - 0x13, 0x2f, 0xd9, 0xba, 0xd5, 0xc1, 0x82, 0x62, - 0x63, 0xba, 0xfb, 0xb6, 0x99, 0xf7, 0x07, 0xa5, - }; - static const uint8_t kSHA1Digest[20] = { - 0x94, 0x19, 0x55, 0x93, 0x0a, 0x58, 0x29, 0x38, 0xeb, 0xf5, - 0x09, 0x11, 0x6d, 0x1a, 0xfd, 0x0f, 0x1e, 0x11, 0xe3, 0xcb, - }; - SHA1(kSHA1Input, sizeof(kSHA1Input), output); - if (!check_test(kSHA1Digest, output, sizeof(kSHA1Digest), - "SHA-1 KAT")) { - goto err; - } - - if (!boringssl_self_test_sha256() || - !boringssl_self_test_sha512() || - !boringssl_self_test_hmac_sha256()) { - goto err; - } - - // DBRG KAT - static const uint8_t kDRBGEntropy[48] = { - 0xc4, 0xda, 0x07, 0x40, 0xd5, 0x05, 0xf1, 0xee, 0x28, 0x0b, 0x95, 0xe5, - 0x8c, 0x49, 0x31, 0xac, 0x6d, 0xe8, 0x46, 0xa0, 0x15, 0x2f, 0xbb, 0x4a, - 0x3f, 0x17, 0x4c, 0xf4, 0x78, 0x7a, 0x4f, 0x1a, 0x40, 0xc2, 0xb5, 0x0b, - 0xab, 0xe1, 0x4a, 0xae, 0x53, 0x0b, 0xe5, 0x88, 0x6d, 0x91, 0x0a, 0x27, - }; - static const uint8_t kDRBGPersonalization[18] = "BCMPersonalization"; - static const uint8_t kDRBGAD[16] = "BCM DRBG KAT AD "; - static const uint8_t kDRBGOutput[64] 
= { - 0x19, 0x1f, 0x2b, 0x49, 0x76, 0x85, 0xfd, 0x51, 0xb6, 0x56, 0xbc, - 0x1c, 0x7d, 0xd5, 0xdd, 0x44, 0x76, 0xa3, 0x5e, 0x17, 0x9b, 0x8e, - 0xb8, 0x98, 0x65, 0x12, 0xca, 0x35, 0x6c, 0xa0, 0x6f, 0xa0, 0x22, - 0xe4, 0xf6, 0xd8, 0x43, 0xed, 0x4e, 0x2d, 0x97, 0x39, 0x43, 0x3b, - 0x57, 0xfc, 0x23, 0x3f, 0x71, 0x0a, 0xe0, 0xed, 0xfe, 0xd5, 0xb8, - 0x67, 0x7a, 0x00, 0x39, 0xb2, 0x6e, 0xa9, 0x25, 0x97, - }; - static const uint8_t kDRBGEntropy2[48] = { - 0xc7, 0x16, 0x1c, 0xa3, 0x6c, 0x23, 0x09, 0xb7, 0x16, 0xe9, 0x85, 0x9b, - 0xb9, 0x6c, 0x6d, 0x49, 0xbd, 0xc8, 0x35, 0x21, 0x03, 0xa1, 0x8c, 0xd2, - 0x4e, 0xf4, 0x2e, 0xc9, 0x7e, 0xf4, 0x6b, 0xf4, 0x46, 0xeb, 0x1a, 0x45, - 0x76, 0xc1, 0x86, 0xe9, 0x35, 0x18, 0x03, 0x76, 0x3a, 0x79, 0x12, 0xfe, - }; - static const uint8_t kDRBGReseedOutput[64] = { - 0x00, 0xf2, 0x05, 0xaa, 0xfd, 0x11, 0x6c, 0x77, 0xbc, 0x81, 0x86, - 0x99, 0xca, 0x51, 0xcf, 0x80, 0x15, 0x9f, 0x02, 0x9e, 0x0b, 0xcd, - 0x26, 0xc8, 0x4b, 0x87, 0x8a, 0x15, 0x1a, 0xdd, 0xf2, 0xf3, 0xeb, - 0x94, 0x0b, 0x08, 0xc8, 0xc9, 0x57, 0xa4, 0x0b, 0x4b, 0x0f, 0x13, - 0xde, 0x7c, 0x0c, 0x6a, 0xac, 0x34, 0x4a, 0x9a, 0xf2, 0xd0, 0x83, - 0x02, 0x05, 0x17, 0xc9, 0x81, 0x8f, 0x2a, 0x81, 0x92, - }; - CTR_DRBG_STATE drbg; - if (!CTR_DRBG_init(&drbg, kDRBGEntropy, kDRBGPersonalization, - sizeof(kDRBGPersonalization)) || - !CTR_DRBG_generate(&drbg, output, sizeof(kDRBGOutput), kDRBGAD, - sizeof(kDRBGAD)) || - !check_test(kDRBGOutput, output, sizeof(kDRBGOutput), - "DRBG Generate KAT") || - !CTR_DRBG_reseed(&drbg, kDRBGEntropy2, kDRBGAD, sizeof(kDRBGAD)) || - !CTR_DRBG_generate(&drbg, output, sizeof(kDRBGReseedOutput), kDRBGAD, - sizeof(kDRBGAD)) || - !check_test(kDRBGReseedOutput, output, sizeof(kDRBGReseedOutput), - "DRBG-reseed KAT")) { - fprintf(stderr, "CTR-DRBG failed.\n"); - goto err; - } - CTR_DRBG_clear(&drbg); - - CTR_DRBG_STATE kZeroDRBG; - memset(&kZeroDRBG, 0, sizeof(kZeroDRBG)); - if (!check_test(&kZeroDRBG, &drbg, sizeof(drbg), "DRBG Clear KAT")) { - goto err; - } - - // 
TLS KDF KAT - static const uint8_t kTLSSecret[32] = { - 0xab, 0xc3, 0x65, 0x7b, 0x09, 0x4c, 0x76, 0x28, 0xa0, 0xb2, 0x82, - 0x99, 0x6f, 0xe7, 0x5a, 0x75, 0xf4, 0x98, 0x4f, 0xd9, 0x4d, 0x4e, - 0xcc, 0x2f, 0xcf, 0x53, 0xa2, 0xc4, 0x69, 0xa3, 0xf7, 0x31, - }; - static const char kTLSLabel[] = "FIPS self test"; - static const uint8_t kTLSSeed1[16] = { - 0x8f, 0x0d, 0xe8, 0xb6, 0x90, 0x8f, 0xb1, 0xd2, - 0x6d, 0x51, 0xf4, 0x79, 0x18, 0x63, 0x51, 0x65, - }; - static const uint8_t kTLSSeed2[16] = { - 0x7d, 0x24, 0x1a, 0x9d, 0x3c, 0x59, 0xbf, 0x3c, - 0x31, 0x1e, 0x2b, 0x21, 0x41, 0x8d, 0x32, 0x81, - }; - static const uint8_t kTLSOutput[32] = { - 0xe2, 0x1d, 0xd6, 0xc2, 0x68, 0xc7, 0x57, 0x03, 0x2c, 0x2c, 0xeb, - 0xbb, 0xb8, 0xa9, 0x7d, 0xe9, 0xee, 0xe6, 0xc9, 0x47, 0x83, 0x0a, - 0xbd, 0x11, 0x60, 0x5d, 0xd5, 0x2c, 0x47, 0xb6, 0x05, 0x88, - }; - uint8_t tls_output[sizeof(kTLSOutput)]; - if (!CRYPTO_tls1_prf(EVP_sha256(), tls_output, sizeof(tls_output), kTLSSecret, - sizeof(kTLSSecret), kTLSLabel, sizeof(kTLSLabel), - kTLSSeed1, sizeof(kTLSSeed1), kTLSSeed2, - sizeof(kTLSSeed2)) || - !check_test(kTLSOutput, tls_output, sizeof(kTLSOutput), "TLS-KDF KAT")) { - fprintf(stderr, "TLS KDF failed.\n"); - goto err; - } - - ret = 1; - -err: - EVP_AEAD_CTX_cleanup(&aead_ctx); - - return ret; -} - -int BORINGSSL_self_test(void) { - if (!boringssl_self_test_fast() || - // When requested to run self tests, also run the lazy tests. 
- !boringssl_self_test_rsa() || - !boringssl_self_test_ecc() || - !boringssl_self_test_ffdh()) { - return 0; - } - - return 1; -} - -#if defined(BORINGSSL_FIPS) -int boringssl_self_test_startup(void) { - return boringssl_self_test_fast(); -} -#endif - -#endif // !_MSC_VER diff --git a/third_party/boringssl/src/crypto/fipsmodule/self_check/self_check.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/self_check/self_check.cc.inc new file mode 100644 index 00000000..8ea40a18 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/self_check/self_check.cc.inc @@ -0,0 +1,1093 @@ +// Copyright 2017 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../../bcm_support.h" +#include "../../internal.h" +#include "../delocate.h" +#include "../dh/internal.h" +#include "../ec/internal.h" +#include "../ecdsa/internal.h" +#include "../rand/internal.h" +#include "../rsa/internal.h" +#include "../service_indicator/internal.h" +#include "../tls/internal.h" + + +using namespace bssl; + +static void hexdump(FILE *out, Span in) { + for (uint8_t b : in) { + fprintf(out, "%02x", b); + } +} + +int bssl::BORINGSSL_check_test(Span expected, + Span actual, const char *name) { + if (expected != actual) { + FILE *err = CRYPTO_get_stderr(); + fprintf(err, "%s failed.\nExpected: ", name); + hexdump(err, expected); + fprintf(err, "\nCalculated: "); + hexdump(err, actual); + fprintf(err, "\n"); + fflush(err); + return 0; + } + return 1; +} + +static int set_bignum(UniquePtr *out, Span in) { + out->reset(BN_bin2bn(in.data(), in.size(), nullptr)); + return *out != nullptr; +} + +static const uint8_t kRSASelfTestN[] = { + 0xd3, 0x3a, 0x62, 0x9f, 0x07, 0x77, 0xb0, 0x18, 0xf3, 0xff, 0xfe, 0xcc, + 0xc9, 0xa2, 0xc2, 0x3a, 0xa6, 0x1d, 0xd8, 0xf0, 0x26, 0x5b, 0x38, 0x90, + 0x17, 0x48, 0x15, 0xce, 0x21, 0xcd, 0xd6, 0x62, 0x99, 0xe2, 0xd7, 0xda, + 0x40, 0x80, 0x3c, 0xad, 0x18, 0xb7, 0x26, 0xe9, 0x30, 0x8a, 0x23, 0x3f, + 0x68, 0x9a, 0x9c, 0x31, 0x34, 0x91, 0x99, 0x06, 0x11, 0x36, 0xb2, 0x9e, + 0x3a, 0xd0, 0xbc, 0xb9, 0x93, 0x4e, 0xb8, 0x72, 0xa1, 0x9f, 0xb6, 0x8c, + 0xd5, 0x17, 0x1f, 0x7e, 0xaa, 0x75, 0xbb, 0xdf, 0xa1, 0x70, 0x48, 0xc4, + 0xec, 0x9a, 0x51, 0xed, 0x41, 0xc9, 0x74, 0xc0, 0x3e, 0x1e, 0x85, 0x2f, + 0xbe, 0x34, 0xc7, 0x65, 0x34, 0x8b, 0x4d, 0x55, 0x4b, 0xe1, 0x45, 0x54, + 0x0d, 0x75, 0x7e, 0x89, 0x4d, 0x0c, 0xf6, 0x33, 0xe5, 0xfc, 0xfb, 0x56, + 0x1b, 0xf2, 0x39, 0x9d, 0xe0, 0xff, 0x55, 0xcf, 0x02, 0x05, 0xb9, 0x74, + 
0xd2, 0x91, 0xfc, 0x87, 0xe1, 0xbb, 0x97, 0x2a, 0xe4, 0xdd, 0x20, 0xc0, + 0x38, 0x47, 0xc0, 0x76, 0x3f, 0xa1, 0x9b, 0x5c, 0x20, 0xff, 0xff, 0xc7, + 0x49, 0x3b, 0x4c, 0xaf, 0x99, 0xa6, 0x3e, 0x82, 0x5c, 0x58, 0x27, 0xce, + 0x01, 0x03, 0xc3, 0x16, 0x35, 0x20, 0xe9, 0xf0, 0x15, 0x7a, 0x41, 0xd5, + 0x1f, 0x52, 0xea, 0xdf, 0xad, 0x4c, 0xbb, 0x0d, 0xcb, 0x04, 0x91, 0xb0, + 0x95, 0xa8, 0xce, 0x25, 0xfd, 0xd2, 0x62, 0x47, 0x77, 0xee, 0x13, 0xf1, + 0x48, 0x72, 0x9e, 0xd9, 0x2d, 0xe6, 0x5f, 0xa4, 0xc6, 0x9e, 0x5a, 0xb2, + 0xc6, 0xa2, 0xf7, 0x0a, 0x16, 0x17, 0xae, 0x6b, 0x1c, 0x30, 0x7c, 0x63, + 0x08, 0x83, 0xe7, 0x43, 0xec, 0x54, 0x5e, 0x2c, 0x08, 0x0b, 0x5e, 0x46, + 0xa7, 0x10, 0x93, 0x43, 0x53, 0x4e, 0xe3, 0x16, 0x73, 0x55, 0xce, 0xf2, + 0x94, 0xc0, 0xbe, 0xb3, +}; +static const uint8_t kRSASelfTestE[] = {0x01, 0x00, 0x01}; // 65537 + +static RSA *self_test_rsa_public_key() { + RSAImpl *rsa = FromOpaque(RSA_new()); + if (rsa == nullptr || // + !set_bignum(&rsa->n, kRSASelfTestN) || + !set_bignum(&rsa->e, kRSASelfTestE)) { + RSA_free(rsa); + return nullptr; + } + + return rsa; +} + +static RSA *self_test_rsa_private_key() { + static const uint8_t kD[] = { + 0x2f, 0x2c, 0x1e, 0xd2, 0x3d, 0x2c, 0xb1, 0x9b, 0x21, 0x02, 0xce, 0xb8, + 0x95, 0x5f, 0x4f, 0xd9, 0x21, 0x38, 0x11, 0x36, 0xb0, 0x9a, 0x36, 0xab, + 0x97, 0x47, 0x75, 0xf7, 0x2e, 0xfd, 0x75, 0x1f, 0x58, 0x16, 0x9c, 0xf6, + 0x14, 0xe9, 0x8e, 0xa3, 0x69, 0x9d, 0x9d, 0x86, 0xfe, 0x5c, 0x1b, 0x3b, + 0x11, 0xf5, 0x55, 0x64, 0x77, 0xc4, 0xfc, 0x53, 0xaa, 0x8c, 0x78, 0x9f, + 0x75, 0xab, 0x20, 0x3a, 0xa1, 0x77, 0x37, 0x22, 0x02, 0x8e, 0x54, 0x8a, + 0x67, 0x1c, 0x5e, 0xe0, 0x3e, 0xd9, 0x44, 0x37, 0xd1, 0x29, 0xee, 0x56, + 0x6c, 0x30, 0x9a, 0x93, 0x4d, 0xd9, 0xdb, 0xc5, 0x03, 0x1a, 0x75, 0xcc, + 0x0f, 0xc2, 0x61, 0xb5, 0x6c, 0x62, 0x9f, 0xc6, 0xa8, 0xc7, 0x8a, 0x60, + 0x17, 0x11, 0x62, 0x4c, 0xef, 0x74, 0x31, 0x97, 0xad, 0x89, 0x2d, 0xe8, + 0x31, 0x1d, 0x8b, 0x58, 0x82, 0xe3, 0x03, 0x1a, 0x6b, 0xdf, 0x3f, 0x3e, + 0xa4, 0x27, 0x19, 
0xef, 0x46, 0x7a, 0x90, 0xdf, 0xa7, 0xe7, 0xc9, 0x66, + 0xab, 0x41, 0x1d, 0x65, 0x78, 0x1c, 0x18, 0x40, 0x5c, 0xd6, 0x87, 0xb5, + 0xea, 0x29, 0x44, 0xb3, 0xf5, 0xb3, 0xd2, 0x4f, 0xce, 0x88, 0x78, 0x49, + 0x27, 0x4e, 0x0b, 0x30, 0x85, 0xfb, 0x73, 0xfd, 0x8b, 0x32, 0x15, 0xee, + 0x1f, 0xc9, 0x0e, 0x89, 0xb9, 0x43, 0x2f, 0xe9, 0x60, 0x8d, 0xda, 0xae, + 0x2b, 0x30, 0x99, 0xee, 0x88, 0x81, 0x20, 0x7b, 0x4a, 0xc3, 0x18, 0xf2, + 0x94, 0x02, 0x79, 0x94, 0xaa, 0x65, 0xd9, 0x1b, 0x45, 0x2a, 0xac, 0x6e, + 0x30, 0x48, 0x57, 0xea, 0xbe, 0x79, 0x7d, 0xfc, 0x67, 0xaa, 0x47, 0xc0, + 0xf7, 0x52, 0xfd, 0x0b, 0x63, 0x4e, 0x3d, 0x2e, 0xcc, 0x36, 0xa0, 0xdb, + 0x92, 0x0b, 0xa9, 0x1b, 0xeb, 0xc2, 0xd5, 0x08, 0xd3, 0x85, 0x87, 0xf8, + 0x5d, 0x1a, 0xf6, 0xc1, + }; + static const uint8_t kP[] = { + 0xf7, 0x06, 0xa3, 0x98, 0x8a, 0x52, 0xf8, 0x63, 0x68, 0x27, 0x4f, 0x68, + 0x7f, 0x34, 0xec, 0x8e, 0x5d, 0xf8, 0x30, 0x92, 0xb3, 0x62, 0x4c, 0xeb, + 0xdb, 0x19, 0x6b, 0x09, 0xc5, 0xa3, 0xf0, 0xbb, 0xff, 0x0f, 0xc2, 0xd4, + 0x9b, 0xc9, 0x54, 0x4f, 0xb9, 0xf9, 0xe1, 0x4c, 0xf0, 0xe3, 0x4c, 0x90, + 0xda, 0x7a, 0x01, 0xc2, 0x9f, 0xc4, 0xc8, 0x8e, 0xb1, 0x1e, 0x93, 0x75, + 0x75, 0xc6, 0x13, 0x25, 0xc3, 0xee, 0x3b, 0xcc, 0xb8, 0x72, 0x6c, 0x49, + 0xb0, 0x09, 0xfb, 0xab, 0x44, 0xeb, 0x4d, 0x40, 0xf0, 0x61, 0x6b, 0xe5, + 0xe6, 0xfe, 0x3e, 0x0a, 0x77, 0x26, 0x39, 0x76, 0x3d, 0x4c, 0x3e, 0x9b, + 0x5b, 0xc0, 0xaf, 0xa2, 0x58, 0x76, 0xb0, 0xe9, 0xda, 0x7f, 0x0e, 0x78, + 0xc9, 0x76, 0x49, 0x5c, 0xfa, 0xb3, 0xb0, 0x15, 0x4b, 0x41, 0xc7, 0x27, + 0xa4, 0x75, 0x28, 0x5c, 0x30, 0x69, 0x50, 0x29, + }; + static const uint8_t kQ[] = { + 0xda, 0xe6, 0xd2, 0xbb, 0x44, 0xff, 0x4f, 0xdf, 0x57, 0xc1, 0x11, 0xa3, + 0x51, 0xba, 0x17, 0x89, 0x4c, 0x01, 0xc0, 0x0c, 0x97, 0x34, 0x50, 0xcf, + 0x32, 0x1e, 0xc0, 0xbd, 0x7b, 0x35, 0xb5, 0x6a, 0x26, 0xcc, 0xea, 0x4c, + 0x8e, 0x87, 0x4a, 0x67, 0x8b, 0xd3, 0xe5, 0x4f, 0x3a, 0x60, 0x48, 0x59, + 0x04, 0x93, 0x39, 0xd7, 0x7c, 0xfb, 0x19, 0x1a, 0x34, 0xd5, 0xe8, 0xaf, + 0xe7, 0x22, 0x2c, 
0x0d, 0xc2, 0x91, 0x69, 0xb6, 0xe9, 0x2a, 0xe9, 0x1c, + 0x4c, 0x6e, 0x8f, 0x40, 0xf5, 0xa8, 0x3e, 0x82, 0x69, 0x69, 0xbe, 0x9f, + 0x7d, 0x5c, 0x7f, 0x92, 0x78, 0x17, 0xa3, 0x6d, 0x41, 0x2d, 0x72, 0xed, + 0x3f, 0x71, 0xfa, 0x97, 0xb4, 0x63, 0xe4, 0x4f, 0xd9, 0x46, 0x03, 0xfb, + 0x00, 0xeb, 0x30, 0x70, 0xb9, 0x51, 0xd9, 0x0a, 0xd2, 0xf8, 0x50, 0xd4, + 0xfb, 0x43, 0x84, 0xf8, 0xac, 0x58, 0xc3, 0x7b, + }; + static const uint8_t kDModPMinusOne[] = { + 0xf5, 0x50, 0x8f, 0x88, 0x7d, 0xdd, 0xb5, 0xb4, 0x2a, 0x8b, 0xd7, 0x4d, + 0x23, 0xfe, 0xaf, 0xe9, 0x16, 0x22, 0xd2, 0x41, 0xed, 0x88, 0xf2, 0x70, + 0xcb, 0x4d, 0xeb, 0xc1, 0x71, 0x97, 0xc4, 0x0b, 0x3e, 0x5a, 0x2d, 0x96, + 0xab, 0xfa, 0xfd, 0x12, 0x8b, 0xd3, 0x3e, 0x4e, 0x05, 0x6f, 0x04, 0xeb, + 0x59, 0x3c, 0x0e, 0xa1, 0x73, 0xbe, 0x9d, 0x99, 0x2f, 0x05, 0xf9, 0x54, + 0x8d, 0x98, 0x1e, 0x0d, 0xc4, 0x0c, 0xc3, 0x30, 0x23, 0xff, 0xe5, 0xd0, + 0x2b, 0xd5, 0x4e, 0x2b, 0xa0, 0xae, 0xb8, 0x32, 0x84, 0x45, 0x8b, 0x3c, + 0x6d, 0xf0, 0x10, 0x36, 0x9e, 0x6a, 0xc4, 0x67, 0xca, 0xa9, 0xfc, 0x06, + 0x96, 0xd0, 0xbc, 0xda, 0xd1, 0x55, 0x55, 0x8d, 0x77, 0x21, 0xf4, 0x82, + 0x39, 0x37, 0x91, 0xd5, 0x97, 0x56, 0x78, 0xc8, 0x3c, 0xcb, 0x5e, 0xf6, + 0xdc, 0x58, 0x48, 0xb3, 0x7c, 0x94, 0x29, 0x39, + }; + static const uint8_t kDModQMinusOne[] = { + 0x64, 0x65, 0xbd, 0x7d, 0x1a, 0x96, 0x26, 0xa1, 0xfe, 0xf3, 0x94, 0x0d, + 0x5d, 0xec, 0x85, 0xe2, 0xf8, 0xb3, 0x4c, 0xcb, 0xf9, 0x85, 0x8b, 0x12, + 0x9c, 0xa0, 0x32, 0x32, 0x35, 0x92, 0x5a, 0x94, 0x47, 0x1b, 0x70, 0xd2, + 0x90, 0x04, 0x49, 0x01, 0xd8, 0xc5, 0xe4, 0xc4, 0x43, 0xb7, 0xe9, 0x36, + 0xba, 0xbc, 0x73, 0xa8, 0xfb, 0xaf, 0x86, 0xc1, 0xd8, 0x3d, 0xcb, 0xac, + 0xf1, 0xcb, 0x60, 0x7d, 0x27, 0x21, 0xde, 0x64, 0x7f, 0xe8, 0xa8, 0x65, + 0xcc, 0x40, 0x60, 0xff, 0xa0, 0x2b, 0xfc, 0x0f, 0x80, 0x1d, 0x79, 0xca, + 0x58, 0x8a, 0xd6, 0x0f, 0xed, 0x78, 0x9a, 0x02, 0x00, 0x04, 0xc2, 0x53, + 0x41, 0xe8, 0x1a, 0xd0, 0xfd, 0x71, 0x5b, 0x43, 0xac, 0x19, 0x4a, 0xb6, + 0x12, 0xa3, 0xcb, 0xe1, 0xc7, 0x7d, 0x5c, 
0x98, 0x74, 0x4e, 0x63, 0x74, + 0x6b, 0x91, 0x7a, 0x29, 0x3b, 0x92, 0xb2, 0x85, + }; + static const uint8_t kQInverseModP[] = { + 0xd0, 0xde, 0x19, 0xda, 0x1e, 0xa2, 0xd8, 0x8f, 0x1c, 0x92, 0x73, 0xb0, + 0xc9, 0x90, 0xc7, 0xf5, 0xec, 0xc5, 0x89, 0x01, 0x05, 0x78, 0x11, 0x2d, + 0x74, 0x34, 0x44, 0xad, 0xd5, 0xf7, 0xa4, 0xfe, 0x9f, 0x25, 0x4d, 0x0b, + 0x92, 0xe3, 0xb8, 0x7d, 0xd3, 0xfd, 0xa5, 0xca, 0x95, 0x60, 0xa3, 0xf9, + 0x55, 0x42, 0x14, 0xb2, 0x45, 0x51, 0x9f, 0x73, 0x88, 0x43, 0x8a, 0xd1, + 0x65, 0x9e, 0xd1, 0xf7, 0x82, 0x2a, 0x2a, 0x8d, 0x70, 0x56, 0xe3, 0xef, + 0xc9, 0x0e, 0x2a, 0x2c, 0x15, 0xaf, 0x7f, 0x97, 0x81, 0x66, 0xf3, 0xb5, + 0x00, 0xa9, 0x26, 0xcc, 0x1e, 0xc2, 0x98, 0xdd, 0xd3, 0x37, 0x06, 0x79, + 0xb3, 0x60, 0x58, 0x79, 0x99, 0x3f, 0xa3, 0x15, 0x1f, 0x31, 0xe3, 0x11, + 0x88, 0x4c, 0x35, 0x57, 0xfa, 0x79, 0xd7, 0xd8, 0x72, 0xee, 0x73, 0x95, + 0x89, 0x29, 0xc7, 0x05, 0x27, 0x68, 0x90, 0x15, + }; + + RSAImpl *rsa = FromOpaque(RSA_new()); + if (rsa == nullptr || // + !set_bignum(&rsa->n, kRSASelfTestN) || + !set_bignum(&rsa->e, kRSASelfTestE) || // + !set_bignum(&rsa->d, kD) || // + !set_bignum(&rsa->p, kP) || // + !set_bignum(&rsa->q, kQ) || // + !set_bignum(&rsa->dmp1, kDModPMinusOne) || + !set_bignum(&rsa->dmq1, kDModQMinusOne) || + !set_bignum(&rsa->iqmp, kQInverseModP)) { + RSA_free(rsa); + return nullptr; + } + + return rsa; +} + +static EC_KEY *self_test_ecdsa_key() { + static const uint8_t kQx[] = { + 0xc8, 0x15, 0x61, 0xec, 0xf2, 0xe5, 0x4e, 0xde, 0xfe, 0x66, 0x17, + 0xdb, 0x1c, 0x7a, 0x34, 0xa7, 0x07, 0x44, 0xdd, 0xb2, 0x61, 0xf2, + 0x69, 0xb8, 0x3d, 0xac, 0xfc, 0xd2, 0xad, 0xe5, 0xa6, 0x81, + }; + static const uint8_t kQy[] = { + 0xe0, 0xe2, 0xaf, 0xa3, 0xf9, 0xb6, 0xab, 0xe4, 0xc6, 0x98, 0xef, + 0x64, 0x95, 0xf1, 0xbe, 0x49, 0xa3, 0x19, 0x6c, 0x50, 0x56, 0xac, + 0xb3, 0x76, 0x3f, 0xe4, 0x50, 0x7e, 0xec, 0x59, 0x6e, 0x88, + }; + static const uint8_t kD[] = { + 0xc6, 0xc1, 0xaa, 0xda, 0x15, 0xb0, 0x76, 0x61, 0xf8, 0x14, 0x2c, + 0x6c, 0xaf, 
0x0f, 0xdb, 0x24, 0x1a, 0xff, 0x2e, 0xfe, 0x46, 0xc0, + 0x93, 0x8b, 0x74, 0xf2, 0xbc, 0xc5, 0x30, 0x52, 0xb0, 0x77, + }; + + EC_KEY *ec_key = EC_KEY_new(); + BIGNUM *qx = BN_bin2bn(kQx, sizeof(kQx), nullptr); + BIGNUM *qy = BN_bin2bn(kQy, sizeof(kQy), nullptr); + BIGNUM *d = BN_bin2bn(kD, sizeof(kD), nullptr); + if (ec_key == nullptr || qx == nullptr || qy == nullptr || d == nullptr || + !EC_KEY_set_group(ec_key, EC_group_p256()) || + !EC_KEY_set_public_key_affine_coordinates(ec_key, qx, qy) || + !EC_KEY_set_private_key(ec_key, d)) { + EC_KEY_free(ec_key); + ec_key = nullptr; + } + + BN_free(qx); + BN_free(qy); + BN_free(d); + return ec_key; +} + +static DH *self_test_dh() { + DH *dh = DH_get_rfc7919_2048(); + if (!dh) { + return nullptr; + } + + BIGNUM *priv = BN_new(); + if (!priv) { + goto err; + } + + // kFFDHE2048PrivateKeyData is a 225-bit value. (225 because that's the + // minimum private key size in + // https://tools.ietf.org/html/rfc7919#appendix-A.1.) + static const BN_ULONG kFFDHE2048PrivateKeyData[] = { + TOBN(0x187be36b, 0xd38a4fa1), + TOBN(0x0a152f39, 0x6458f3b8), + TOBN(0x0570187e, 0xc422eeb7), + TOBN(0x00000001, 0x91173f2a), + }; + + bn_set_static_words(priv, kFFDHE2048PrivateKeyData, + std::size(kFFDHE2048PrivateKeyData)); + + if (!DH_set0_key(dh, nullptr, priv)) { + goto err; + } + return dh; + +err: + BN_free(priv); + DH_free(dh); + return nullptr; +} + + +// Lazy self-tests +// +// Self tests that are slow are deferred until the corresponding algorithm is +// actually exercised, in FIPS mode. (In non-FIPS mode these tests are only run +// when requested by |BORINGSSL_self_test|.) 
+ +static int boringssl_self_test_rsa_sign() { + UniquePtr rsa_key(self_test_rsa_private_key()); + if (rsa_key == nullptr) { + fprintf(CRYPTO_get_stderr(), "RSA key construction failed\n"); + return 0; + } + + // RSA Sign KAT + + static const uint8_t kRSASignDigest[32] = { + 0xd2, 0xb5, 0x6e, 0x53, 0x30, 0x6f, 0x72, 0x0d, 0x79, 0x29, 0xd8, + 0x70, 0x8b, 0xf4, 0x6f, 0x1c, 0x22, 0x30, 0x03, 0x05, 0x58, 0x2b, + 0x11, 0x5b, 0xed, 0xca, 0xc7, 0x22, 0xd8, 0xaa, 0x5a, 0xb2, + }; + static const uint8_t kRSASignSignature[256] = { + 0x64, 0xce, 0xdd, 0x91, 0x27, 0xb0, 0x4f, 0xb9, 0x14, 0xea, 0xc0, 0xb4, + 0xa2, 0x06, 0xc5, 0xd8, 0x40, 0x0f, 0x6c, 0x54, 0xac, 0xf7, 0x02, 0xde, + 0x26, 0xbb, 0xfd, 0x33, 0xe5, 0x2f, 0x4d, 0xb1, 0x53, 0xc4, 0xff, 0xd0, + 0x5f, 0xea, 0x15, 0x89, 0x83, 0x4c, 0xe3, 0x80, 0x0b, 0xe9, 0x13, 0x82, + 0x1d, 0x71, 0x92, 0x1a, 0x03, 0x60, 0x2c, 0xaf, 0xe2, 0x16, 0xc7, 0x43, + 0x3f, 0xde, 0x6b, 0x94, 0xfd, 0x6e, 0x08, 0x7b, 0x11, 0xf1, 0x34, 0x52, + 0xe5, 0xc0, 0x97, 0x66, 0x4a, 0xe0, 0x91, 0x45, 0xc8, 0xb1, 0x3d, 0x6a, + 0x54, 0xc1, 0x32, 0x0f, 0x32, 0xad, 0x25, 0x11, 0x3e, 0x49, 0xad, 0x41, + 0xce, 0x7b, 0xca, 0x95, 0x6b, 0x54, 0x5e, 0x86, 0x1b, 0xce, 0xfa, 0x2a, + 0x60, 0xe8, 0xfa, 0xbb, 0x23, 0xb2, 0x41, 0xbc, 0x7c, 0x98, 0xec, 0x73, + 0x20, 0xed, 0xb3, 0xcf, 0xab, 0x07, 0x24, 0x85, 0x6a, 0x2a, 0x61, 0x76, + 0x28, 0xf8, 0x00, 0x80, 0xeb, 0xd9, 0x3a, 0x63, 0xe2, 0x01, 0xb1, 0xee, + 0x6d, 0xe9, 0x73, 0xe9, 0xb6, 0x75, 0x2e, 0xf9, 0x81, 0xd9, 0xa8, 0x79, + 0xf6, 0x8f, 0xe3, 0x02, 0x7d, 0xf6, 0xea, 0xdc, 0x35, 0xe4, 0x62, 0x0d, + 0x91, 0xba, 0x3e, 0x7d, 0x8b, 0x82, 0xbf, 0x15, 0x74, 0x6a, 0x4e, 0x29, + 0xf8, 0x9b, 0x2c, 0x94, 0x8d, 0xa7, 0x00, 0x4d, 0x7b, 0xbf, 0x35, 0x07, + 0xeb, 0xdd, 0x10, 0xef, 0xd5, 0x2f, 0xe6, 0x98, 0x4b, 0x7e, 0x24, 0x80, + 0xe2, 0x01, 0xf2, 0x66, 0xb7, 0xd3, 0x93, 0xfe, 0x2a, 0xb3, 0x74, 0xed, + 0xec, 0x4b, 0xb1, 0x5f, 0x5f, 0xee, 0x85, 0x44, 0xa7, 0x26, 0xdf, 0xc1, + 0x2e, 0x7a, 0xf3, 0xa5, 0x8f, 0xf8, 0x64, 0xda, 0x65, 0xad, 
0x91, 0xe2, + 0x90, 0x94, 0x20, 0x16, 0xb8, 0x61, 0xa5, 0x0a, 0x7d, 0xb4, 0xbf, 0xc0, + 0x10, 0xaf, 0x72, 0x67, + }; + + uint8_t output[256]; + unsigned sig_len; + if (!rsa_sign_no_self_test(NID_sha256, kRSASignDigest, sizeof(kRSASignDigest), + output, &sig_len, rsa_key.get()) || + !BORINGSSL_check_test(kRSASignSignature, Span(output, sig_len), + "RSA-sign KAT")) { + fprintf(CRYPTO_get_stderr(), "RSA signing test failed.\n"); + return 0; + } + + return 1; +} + +static int boringssl_self_test_rsa_verify() { + UniquePtr rsa_key(self_test_rsa_public_key()); + if (rsa_key == nullptr) { + fprintf(CRYPTO_get_stderr(), "RSA key construction failed\n"); + return 0; + } + + // RSA Verify KAT + + static const uint8_t kRSAVerifyDigest[32] = { + 0x09, 0x65, 0x2f, 0xd8, 0xed, 0x9d, 0xc2, 0x6d, 0xbc, 0xbf, 0xf2, + 0xa7, 0xa5, 0xed, 0xe1, 0x37, 0x13, 0x78, 0x21, 0x36, 0xcf, 0x8d, + 0x22, 0x3d, 0xab, 0x93, 0xb4, 0x12, 0xa8, 0xb5, 0x15, 0x53, + }; + static const uint8_t kRSAVerifySignature[256] = { + 0xab, 0xe2, 0xcb, 0xc1, 0x3d, 0x6b, 0xd3, 0x9d, 0x48, 0xdb, 0x53, 0x34, + 0xdd, 0xbf, 0x8d, 0x07, 0x0a, 0x93, 0xbd, 0xcb, 0x10, 0x4e, 0x2c, 0xc5, + 0xd0, 0xee, 0x48, 0x6e, 0xe2, 0x95, 0xf6, 0xb3, 0x1b, 0xda, 0x12, 0x6c, + 0x41, 0x89, 0x0b, 0x98, 0xb7, 0x3e, 0x70, 0xe6, 0xb6, 0x5d, 0x82, 0xf9, + 0x5c, 0x66, 0x31, 0x21, 0x75, 0x5a, 0x90, 0x74, 0x4c, 0x8d, 0x1c, 0x21, + 0x14, 0x8a, 0x19, 0x60, 0xbe, 0x0e, 0xca, 0x44, 0x6e, 0x9f, 0xf4, 0x97, + 0xf1, 0x34, 0x5c, 0x53, 0x7e, 0xf8, 0x11, 0x9b, 0x9a, 0x43, 0x98, 0xe9, + 0x5c, 0x5c, 0x6d, 0xe2, 0xb1, 0xc9, 0x55, 0x90, 0x5c, 0x52, 0x99, 0xd8, + 0xce, 0x7a, 0x3b, 0x6a, 0xb7, 0x63, 0x80, 0xd9, 0xba, 0xbd, 0xd1, 0x5f, + 0x61, 0x02, 0x37, 0xe1, 0xf3, 0xf2, 0xaa, 0x1c, 0x1f, 0x1e, 0x77, 0x0b, + 0x62, 0xfb, 0xb5, 0x96, 0x38, 0x1b, 0x2e, 0xbd, 0xd7, 0x7e, 0xce, 0xf9, + 0xc9, 0x0d, 0x4c, 0x92, 0xf7, 0xb6, 0xb0, 0x5f, 0xed, 0x29, 0x36, 0x28, + 0x5f, 0xa9, 0x48, 0x26, 0xe6, 0x20, 0x55, 0x32, 0x2a, 0x33, 0xb6, 0xf0, + 0x4c, 0x74, 0xce, 0x69, 0xe5, 0xd8, 
0xd7, 0x37, 0xfb, 0x83, 0x8b, 0x79, + 0xd2, 0xd4, 0x8e, 0x3d, 0xaf, 0x71, 0x38, 0x75, 0x31, 0x88, 0x25, 0x31, + 0xa9, 0x5a, 0xc9, 0x64, 0xd0, 0x2e, 0xa4, 0x13, 0xbf, 0x85, 0x95, 0x29, + 0x82, 0xbb, 0xc0, 0x89, 0x52, 0x7d, 0xaf, 0xf5, 0xb8, 0x45, 0xc9, 0xa0, + 0xf4, 0xd1, 0x4e, 0xf1, 0x95, 0x6d, 0x9c, 0x3a, 0xca, 0xe8, 0x82, 0xd1, + 0x2d, 0xa6, 0x6d, 0xa0, 0xf3, 0x57, 0x94, 0xf5, 0xee, 0x32, 0x23, 0x23, + 0x33, 0x51, 0x7d, 0xb9, 0x31, 0x52, 0x32, 0xa1, 0x83, 0xb9, 0x91, 0x65, + 0x4d, 0xbe, 0xa4, 0x16, 0x15, 0x34, 0x5c, 0x88, 0x53, 0x25, 0x92, 0x67, + 0x44, 0xa5, 0x39, 0x15, + }; + if (!rsa_verify_no_self_test(NID_sha256, kRSAVerifyDigest, + sizeof(kRSAVerifyDigest), kRSAVerifySignature, + sizeof(kRSAVerifySignature), rsa_key.get())) { + fprintf(CRYPTO_get_stderr(), "RSA-verify KAT failed.\n"); + return 0; + } + + return 1; +} + +static int boringssl_self_test_ecc() { + int ret = 0; + EC_KEY *ec_key = nullptr; + EC_POINT *ec_point_in = nullptr; + EC_POINT *ec_point_out = nullptr; + BIGNUM *ec_scalar = nullptr; + const EC_GROUP *ec_group = nullptr; + + // The 'k' value for ECDSA is fixed to avoid an entropy draw. 
+ uint8_t ecdsa_k[32] = {0}; + ecdsa_k[31] = 42; + + ec_key = self_test_ecdsa_key(); + if (ec_key == nullptr) { + fprintf(CRYPTO_get_stderr(), "ECDSA KeyGen failed\n"); + goto err; + } + + // ECDSA Sign/Verify KAT + + static const uint8_t kECDSASignDigest[32] = { + 0x1e, 0x35, 0x93, 0x0b, 0xe8, 0x60, 0xd0, 0x94, 0x2c, 0xa7, 0xbb, + 0xd6, 0xf6, 0xde, 0xd8, 0x7f, 0x15, 0x7e, 0x4d, 0xe2, 0x4f, 0x81, + 0xed, 0x4b, 0x87, 0x5c, 0x0e, 0x01, 0x8e, 0x89, 0xa8, 0x1f, + }; + static const uint8_t kECDSASignSig[64] = { + 0x67, 0x80, 0xc5, 0xfc, 0x70, 0x27, 0x5e, 0x2c, 0x70, 0x61, 0xa0, + 0xe7, 0x87, 0x7b, 0xb1, 0x74, 0xde, 0xad, 0xeb, 0x98, 0x87, 0x02, + 0x7f, 0x3f, 0xa8, 0x36, 0x54, 0x15, 0x8b, 0xa7, 0xf5, 0x0c, 0x68, + 0x04, 0x73, 0x40, 0x94, 0xb2, 0xd1, 0x90, 0xac, 0x2d, 0x0c, 0xd7, + 0xa5, 0x7f, 0x2f, 0x2e, 0xb2, 0x62, 0xb0, 0x09, 0x16, 0xe1, 0xa6, + 0x70, 0xb5, 0xbb, 0x0d, 0xfd, 0x8e, 0x0c, 0x02, 0x3f, + }; + + uint8_t ecdsa_sign_output[64]; + size_t ecdsa_sign_output_len; + if (!ecdsa_sign_fixed_with_nonce_for_known_answer_test( + kECDSASignDigest, sizeof(kECDSASignDigest), ecdsa_sign_output, + &ecdsa_sign_output_len, sizeof(ecdsa_sign_output), ec_key, ecdsa_k, + sizeof(ecdsa_k)) || + !BORINGSSL_check_test(kECDSASignSig, + Span(ecdsa_sign_output, ecdsa_sign_output_len), + "ECDSA-sign signature")) { + fprintf(CRYPTO_get_stderr(), "ECDSA-sign KAT failed.\n"); + goto err; + } + + static const uint8_t kECDSAVerifyDigest[32] = { + 0x78, 0x7c, 0x50, 0x5c, 0x60, 0xc9, 0xe4, 0x13, 0x6c, 0xe4, 0x48, + 0xba, 0x93, 0xff, 0x71, 0xfa, 0x9c, 0x18, 0xf4, 0x17, 0x09, 0x4f, + 0xdf, 0x5a, 0xe2, 0x75, 0xc0, 0xcc, 0xd2, 0x67, 0x97, 0xad, + }; + static const uint8_t kECDSAVerifySig[64] = { + 0x67, 0x80, 0xc5, 0xfc, 0x70, 0x27, 0x5e, 0x2c, 0x70, 0x61, 0xa0, + 0xe7, 0x87, 0x7b, 0xb1, 0x74, 0xde, 0xad, 0xeb, 0x98, 0x87, 0x02, + 0x7f, 0x3f, 0xa8, 0x36, 0x54, 0x15, 0x8b, 0xa7, 0xf5, 0x0c, 0x2d, + 0x36, 0xe5, 0x79, 0x97, 0x90, 0xbf, 0xbe, 0x21, 0x83, 0xd3, 0x3e, + 0x96, 0xf3, 0xc5, 0x1f, 0x6a, 
0x23, 0x2f, 0x2a, 0x24, 0x48, 0x8c, + 0x8e, 0x5f, 0x64, 0xc3, 0x7e, 0xa2, 0xcf, 0x05, 0x29, + }; + + if (!ecdsa_verify_fixed_no_self_test( + kECDSAVerifyDigest, sizeof(kECDSAVerifyDigest), kECDSAVerifySig, + sizeof(kECDSAVerifySig), ec_key)) { + fprintf(CRYPTO_get_stderr(), "ECDSA-verify KAT failed.\n"); + goto err; + } + + // Primitive Z Computation KAT (IG 9.6). + + // kP256Point is SHA256("Primitive Z Computation KAT")×G within P-256. + static const uint8_t kP256Point[65] = { + 0x04, 0x4e, 0xc1, 0x94, 0x8c, 0x5c, 0xf4, 0x37, 0x35, 0x0d, 0xa3, + 0xf9, 0x55, 0xf9, 0x8b, 0x26, 0x23, 0x5c, 0x43, 0xe0, 0x83, 0x51, + 0x2b, 0x0d, 0x4b, 0x56, 0x24, 0xc3, 0xe4, 0xa5, 0xa8, 0xe2, 0xe9, + 0x95, 0xf2, 0xc4, 0xb9, 0xb7, 0x48, 0x7d, 0x2a, 0xae, 0xc5, 0xc0, + 0x0a, 0xcc, 0x1b, 0xd0, 0xec, 0xb8, 0xdc, 0xbe, 0x0c, 0xbe, 0x52, + 0x79, 0x93, 0x7c, 0x0b, 0x92, 0x2b, 0x7f, 0x17, 0xa5, 0x80, + }; + // kP256Scalar is SHA256("Primitive Z Computation KAT scalar"). + static const uint8_t kP256Scalar[32] = { + 0xe7, 0x60, 0x44, 0x91, 0x26, 0x9a, 0xfb, 0x5b, 0x10, 0x2d, 0x6e, + 0xa5, 0x2c, 0xb5, 0x9f, 0xeb, 0x70, 0xae, 0xde, 0x6c, 0xe3, 0xbf, + 0xb3, 0xe0, 0x10, 0x54, 0x85, 0xab, 0xd8, 0x61, 0xd7, 0x7b, + }; + // kP256PointResult is |kP256Scalar|×|kP256Point|. 
+ static const uint8_t kP256PointResult[65] = { + 0x04, 0xf1, 0x63, 0x00, 0x88, 0xc5, 0xd5, 0xe9, 0x05, 0x52, 0xac, + 0xb6, 0xec, 0x68, 0x76, 0xb8, 0x73, 0x7f, 0x0f, 0x72, 0x34, 0xe6, + 0xbb, 0x30, 0x32, 0x22, 0x37, 0xb6, 0x2a, 0x80, 0xe8, 0x9e, 0x6e, + 0x6f, 0x36, 0x02, 0xe7, 0x21, 0xd2, 0x31, 0xdb, 0x94, 0x63, 0xb7, + 0xd8, 0x19, 0x0e, 0xc2, 0xc0, 0xa7, 0x2f, 0x15, 0x49, 0x1a, 0xa2, + 0x7c, 0x41, 0x8f, 0xaf, 0x9c, 0x40, 0xaf, 0x2e, 0x4a, 0x0c, + }; + + ec_group = EC_group_p256(); + ec_point_in = EC_POINT_new(ec_group); + ec_point_out = EC_POINT_new(ec_group); + ec_scalar = BN_new(); + uint8_t z_comp_result[65]; + if (ec_point_in == nullptr || ec_point_out == nullptr || + ec_scalar == nullptr || + !EC_POINT_oct2point(ec_group, ec_point_in, kP256Point, sizeof(kP256Point), + nullptr) || + !BN_bin2bn(kP256Scalar, sizeof(kP256Scalar), ec_scalar) || + !ec_point_mul_no_self_test(ec_group, ec_point_out, nullptr, ec_point_in, + ec_scalar, nullptr) || + EC_POINT_point2oct(ec_group, ec_point_out, POINT_CONVERSION_UNCOMPRESSED, + z_comp_result, sizeof(z_comp_result), + nullptr) != sizeof(z_comp_result) || + !BORINGSSL_check_test(kP256PointResult, z_comp_result, + "Z Computation Result")) { + fprintf(CRYPTO_get_stderr(), "Z-computation KAT failed.\n"); + goto err; + } + + ret = 1; + +err: + EC_KEY_free(ec_key); + EC_POINT_free(ec_point_in); + EC_POINT_free(ec_point_out); + BN_free(ec_scalar); + + return ret; +} + +static int boringssl_self_test_ffdh() { + int ret = 0; + DH *dh = nullptr; + BIGNUM *ffdhe2048_value = nullptr; + + // FFC Diffie-Hellman KAT + + // kFFDHE2048PublicValueData is an arbitrary public value, mod + // kFFDHE2048Data. (The private key happens to be 4096.) 
+ static const BN_ULONG kFFDHE2048PublicValueData[] = { + TOBN(0x187be36b, 0xd38a4fa1), TOBN(0x0a152f39, 0x6458f3b8), + TOBN(0x0570187e, 0xc422eeb7), TOBN(0x18af7482, 0x91173f2a), + TOBN(0xe9fdac6a, 0xcff4eaaa), TOBN(0xf6afebb7, 0x6e589d6c), + TOBN(0xf92f8e9a, 0xb7e33fb0), TOBN(0x70acf2aa, 0x4cf36ddd), + TOBN(0x561ab426, 0xd07137fd), TOBN(0x5f57d037, 0x430ee91e), + TOBN(0xe3e768c8, 0x60d10b8a), TOBN(0xb14884d8, 0xa18af8ce), + TOBN(0xf8a98014, 0xa12b74e4), TOBN(0x748d407c, 0x3437b7a8), + TOBN(0x627588c4, 0x9875d5a7), TOBN(0xdd24a127, 0x53c8f09d), + TOBN(0x85a997d5, 0x0cd51aec), TOBN(0x44f0c619, 0xce348458), + TOBN(0x9b894b24, 0x5f6b69a1), TOBN(0xae1302f2, 0xf6d4777e), + TOBN(0xe6678eeb, 0x375db18e), TOBN(0x2674e1d6, 0x4fbcbdc8), + TOBN(0xb297a823, 0x6fa93d28), TOBN(0x6a12fb70, 0x7c8c0510), + TOBN(0x5c6d1aeb, 0xdb06f65b), TOBN(0xe8c2954e, 0x4c1804ca), + TOBN(0x06bdeac1, 0xf5500fa7), TOBN(0x6a315604, 0x189cd76b), + TOBN(0xbae7b0b3, 0x6e362dc0), TOBN(0xa57c73bd, 0xdc70fb82), + TOBN(0xfaff50d2, 0x9d573457), TOBN(0x352bd399, 0xbe84058e), + }; + static const uint8_t kDHOutput[2048 / 8] = { + 0x2a, 0xe6, 0xd3, 0xa6, 0x13, 0x58, 0x8e, 0xce, 0x53, 0xaa, 0xf6, 0x5d, + 0x9a, 0xae, 0x02, 0x12, 0xf5, 0x80, 0x3d, 0x06, 0x09, 0x76, 0xac, 0x57, + 0x37, 0x9e, 0xab, 0x38, 0x62, 0x25, 0x05, 0x1d, 0xf3, 0xa9, 0x39, 0x60, + 0xf6, 0xae, 0x90, 0xed, 0x1e, 0xad, 0x6e, 0xe9, 0xe3, 0xba, 0x27, 0xf6, + 0xdb, 0x54, 0xdf, 0xe2, 0xbd, 0xbb, 0x7f, 0xf1, 0x81, 0xac, 0x1a, 0xfa, + 0xdb, 0x87, 0x07, 0x98, 0x76, 0x90, 0x21, 0xf2, 0xae, 0xda, 0x0d, 0x84, + 0x97, 0x64, 0x0b, 0xbf, 0xb8, 0x8d, 0x10, 0x46, 0xe2, 0xd5, 0xca, 0x1b, + 0xbb, 0xe5, 0x37, 0xb2, 0x3b, 0x35, 0xd3, 0x1b, 0x65, 0xea, 0xae, 0xf2, + 0x03, 0xe2, 0xb6, 0xde, 0x22, 0xb7, 0x86, 0x49, 0x79, 0xfe, 0xd7, 0x16, + 0xf7, 0xdc, 0x9c, 0x59, 0xf5, 0xb7, 0x70, 0xc0, 0x53, 0x42, 0x6f, 0xb1, + 0xd2, 0x4e, 0x00, 0x25, 0x4b, 0x2d, 0x5a, 0x9b, 0xd0, 0xe9, 0x27, 0x43, + 0xcc, 0x00, 0x66, 0xea, 0x94, 0x7a, 0x0b, 0xb9, 0x89, 0x0c, 0x5e, 0x94, + 0xb8, 
0x3a, 0x78, 0x9c, 0x4d, 0x84, 0xe6, 0x32, 0x2c, 0x38, 0x7c, 0xf7, + 0x43, 0x9c, 0xd8, 0xb8, 0x1c, 0xce, 0x24, 0x91, 0x20, 0x67, 0x7a, 0x54, + 0x1f, 0x7e, 0x86, 0x7f, 0xa1, 0xc1, 0x03, 0x4e, 0x2c, 0x26, 0x71, 0xb2, + 0x06, 0x30, 0xb3, 0x6c, 0x15, 0xcc, 0xac, 0x25, 0xe5, 0x37, 0x3f, 0x24, + 0x8f, 0x2a, 0x89, 0x5e, 0x3d, 0x43, 0x94, 0xc9, 0x36, 0xae, 0x40, 0x00, + 0x6a, 0x0d, 0xb0, 0x6e, 0x8b, 0x2e, 0x70, 0x57, 0xe1, 0x88, 0x53, 0xd6, + 0x06, 0x80, 0x2a, 0x4e, 0x5a, 0xf0, 0x1e, 0xaa, 0xcb, 0xab, 0x06, 0x0e, + 0x27, 0x0f, 0xd9, 0x88, 0xd9, 0x01, 0xe3, 0x07, 0xeb, 0xdf, 0xc3, 0x12, + 0xe3, 0x40, 0x88, 0x7b, 0x5f, 0x59, 0x78, 0x6e, 0x26, 0x20, 0xc3, 0xdf, + 0xc8, 0xe4, 0x5e, 0xb8, + }; + + ffdhe2048_value = BN_new(); + if (ffdhe2048_value) { + bn_set_static_words(ffdhe2048_value, kFFDHE2048PublicValueData, + std::size(kFFDHE2048PublicValueData)); + } + + dh = self_test_dh(); + uint8_t dh_out[sizeof(kDHOutput)]; + if (dh == nullptr || ffdhe2048_value == nullptr || + sizeof(dh_out) != DH_size(dh) || + dh_compute_key_padded_no_self_test(dh_out, ffdhe2048_value, dh) != + sizeof(dh_out) || + !BORINGSSL_check_test(kDHOutput, dh_out, "FFC DH")) { + fprintf(CRYPTO_get_stderr(), "FFDH failed.\n"); + goto err; + } + + ret = 1; + +err: + DH_free(dh); + BN_free(ffdhe2048_value); + + return ret; +} + +#if defined(BORINGSSL_FIPS) + +static void run_self_test_rsa_sign() { + FIPS_service_indicator_lock_state(); + if (!boringssl_self_test_rsa_sign()) { + BORINGSSL_FIPS_abort(); + } + FIPS_service_indicator_unlock_state(); +} + +DEFINE_STATIC_ONCE(g_self_test_once_rsa_sign) + +void bssl::boringssl_ensure_rsa_sign_self_test() { + CRYPTO_once(g_self_test_once_rsa_sign_bss_get(), run_self_test_rsa_sign); +} + +static void run_self_test_rsa_verify() { + FIPS_service_indicator_lock_state(); + if (!boringssl_self_test_rsa_verify()) { + BORINGSSL_FIPS_abort(); + } + FIPS_service_indicator_unlock_state(); +} + +DEFINE_STATIC_ONCE(g_self_test_once_rsa_verify) + +void 
bssl::boringssl_ensure_rsa_verify_self_test() { + CRYPTO_once(g_self_test_once_rsa_verify_bss_get(), run_self_test_rsa_verify); +} + +static void run_self_test_ecc() { + FIPS_service_indicator_lock_state(); + if (!boringssl_self_test_ecc()) { + BORINGSSL_FIPS_abort(); + } + FIPS_service_indicator_unlock_state(); +} + +DEFINE_STATIC_ONCE(g_self_test_once_ecc) + +void bssl::boringssl_ensure_ecc_self_test() { + CRYPTO_once(g_self_test_once_ecc_bss_get(), run_self_test_ecc); +} + +static void run_self_test_ffdh() { + FIPS_service_indicator_lock_state(); + if (!boringssl_self_test_ffdh()) { + BORINGSSL_FIPS_abort(); + } + FIPS_service_indicator_unlock_state(); +} + +DEFINE_STATIC_ONCE(g_self_test_once_ffdh) + +void bssl::boringssl_ensure_ffdh_self_test() { + CRYPTO_once(g_self_test_once_ffdh_bss_get(), run_self_test_ffdh); +} + +#endif // BORINGSSL_FIPS + + +// Startup self tests. +// +// These tests are run at process start when in FIPS mode. + +int bssl::boringssl_self_test_sha256() { + static const uint8_t kInput[16] = { + 0xff, 0x3b, 0x85, 0x7d, 0xa7, 0x23, 0x6a, 0x2b, + 0xaa, 0x0f, 0x39, 0x6b, 0x51, 0x52, 0x22, 0x17, + }; + static const uint8_t kPlaintextSHA256[32] = { + 0x7f, 0xe4, 0xd5, 0xf1, 0xa1, 0xe3, 0x82, 0x87, 0xd9, 0x58, 0xf5, + 0x11, 0xc7, 0x1d, 0x5e, 0x27, 0x5e, 0xcc, 0xd2, 0x66, 0xcf, 0xb9, + 0xc8, 0xc6, 0x60, 0xd8, 0x92, 0x1e, 0x57, 0xfd, 0x46, 0x75, + }; + uint8_t output[SHA256_DIGEST_LENGTH]; + + // SHA-256 KAT + SHA256(kInput, sizeof(kInput), output); + return BORINGSSL_check_test(kPlaintextSHA256, output, "SHA-256 KAT"); +} + +int bssl::boringssl_self_test_sha512() { + static const uint8_t kInput[16] = { + 0x21, 0x25, 0x12, 0xf8, 0xd2, 0xad, 0x83, 0x22, + 0x78, 0x1c, 0x6c, 0x4d, 0x69, 0xa9, 0xda, 0xa1, + }; + static const uint8_t kPlaintextSHA512[64] = { + 0x29, 0x3c, 0x94, 0x35, 0x4e, 0x98, 0x83, 0xe5, 0xc2, 0x78, 0x36, + 0x7a, 0xe5, 0x18, 0x90, 0xbf, 0x35, 0x41, 0x01, 0x64, 0x19, 0x8d, + 0x26, 0xeb, 0xe1, 0xf8, 0x2f, 0x04, 0x8e, 0xfa, 0x8b, 0x2b, 
0xc6, + 0xb2, 0x9d, 0x5d, 0x46, 0x76, 0x5a, 0xc8, 0xb5, 0x25, 0xa3, 0xea, + 0x52, 0x84, 0x47, 0x6d, 0x6d, 0xf4, 0xc9, 0x71, 0xf3, 0x3d, 0x89, + 0x4c, 0x3b, 0x20, 0x8c, 0x5b, 0x75, 0xe8, 0xf8, 0x7c, + }; + uint8_t output[SHA512_DIGEST_LENGTH]; + + // SHA-512 KAT + SHA512(kInput, sizeof(kInput), output); + return BORINGSSL_check_test(kPlaintextSHA512, output, "SHA-512 KAT"); +} + +int bssl::boringssl_self_test_hmac_sha256() { + static const uint8_t kInput[16] = { + 0xda, 0xd9, 0x12, 0x93, 0xdf, 0xcf, 0x2a, 0x7c, + 0x8e, 0xcd, 0x13, 0xfe, 0x35, 0x3f, 0xa7, 0x5b, + }; + static const uint8_t kPlaintextHMACSHA256[32] = { + 0x36, 0x5f, 0x5b, 0xd5, 0xf5, 0xeb, 0xfd, 0xc7, 0x6e, 0x53, 0xa5, + 0x73, 0x6d, 0x73, 0x20, 0x13, 0xaa, 0xd3, 0xbc, 0x86, 0x4b, 0xb8, + 0x84, 0x94, 0x16, 0x46, 0x88, 0x9c, 0x48, 0xee, 0xa9, 0x0e, + }; + uint8_t output[EVP_MAX_MD_SIZE]; + + unsigned output_len; + return nullptr != HMAC(EVP_sha256(), kInput, sizeof(kInput), kInput, + sizeof(kInput), output, &output_len) && + BORINGSSL_check_test(kPlaintextHMACSHA256, Span(output, output_len), + "HMAC-SHA-256 KAT"); +} + +static int boringssl_self_test_fast() { + static const uint8_t kAESKey[16] = { + 'B', 'o', 'r', 'i', 'n', 'g', 'C', 'r', + 'y', 'p', 't', 'o', ' ', 'K', 'e', 'y', + }; + + { + // AES-CBC Encryption KAT + static const uint8_t kAESCBCEncPlaintext[32] = { + 0x07, 0x86, 0x09, 0xa6, 0xc5, 0xac, 0x25, 0x44, 0x69, 0x9a, 0xdf, + 0x68, 0x2f, 0xa3, 0x77, 0xf9, 0xbe, 0x8a, 0xb6, 0xae, 0xf5, 0x63, + 0xe8, 0xc5, 0x6a, 0x36, 0xb8, 0x4f, 0x55, 0x7f, 0xad, 0xd3, + }; + static const uint8_t kAESCBCEncCiphertext[sizeof(kAESCBCEncPlaintext)] = { + 0x56, 0x46, 0xc1, 0x41, 0xf4, 0x13, 0xd6, 0xff, 0x62, 0x92, 0x41, + 0x7a, 0x26, 0xc6, 0x86, 0xbd, 0x30, 0x5f, 0xb6, 0x57, 0xa7, 0xd2, + 0x50, 0x3a, 0xc5, 0x5e, 0x8e, 0x93, 0x40, 0xf2, 0x10, 0xd8, + }; + AES_KEY aes_key; + if (!bcm_success( + BCM_aes_set_encrypt_key(kAESKey, 8 * sizeof(kAESKey), &aes_key))) { + fprintf(CRYPTO_get_stderr(), "BCM_aes_set_encrypt_key 
failed.\n"); + return 0; + } + uint8_t aes_iv[16] = {}; + uint8_t output[sizeof(kAESCBCEncPlaintext)]; + AES_cbc_encrypt(kAESCBCEncPlaintext, output, sizeof(kAESCBCEncPlaintext), + &aes_key, aes_iv, AES_ENCRYPT); + if (!BORINGSSL_check_test(kAESCBCEncCiphertext, output, + "AES-CBC-encrypt KAT")) { + return 0; + } + } + + { + // AES-CBC Decryption KAT + static const uint8_t kAESCBCDecCiphertext[32] = { + 0x34, 0x7a, 0xa5, 0xa0, 0x24, 0xb2, 0x82, 0x57, 0xb3, 0x65, 0x10, + 0xbe, 0x58, 0x3d, 0x4f, 0x47, 0xad, 0xb7, 0xbb, 0xee, 0xdc, 0x60, + 0x05, 0xbb, 0xbd, 0x0d, 0x0a, 0x9f, 0x06, 0xbb, 0x7b, 0x10, + }; + static const uint8_t kAESCBCDecPlaintext[sizeof(kAESCBCDecCiphertext)] = { + 0x51, 0xa7, 0xa0, 0x1f, 0x6b, 0x79, 0x6c, 0xcd, 0x48, 0x03, 0xa1, + 0x41, 0xdc, 0x56, 0xa6, 0xc2, 0x16, 0xb5, 0xd1, 0xd3, 0xb7, 0x06, + 0xb2, 0x25, 0x6f, 0xa6, 0xd0, 0xd2, 0x0e, 0x6f, 0x19, 0xb5, + }; + AES_KEY aes_key; + if (!bcm_success( + BCM_aes_set_decrypt_key(kAESKey, 8 * sizeof(kAESKey), &aes_key))) { + fprintf(CRYPTO_get_stderr(), "BCM_aes_set_decrypt_key failed.\n"); + return 0; + } + uint8_t aes_iv[16] = {}; + uint8_t output[sizeof(kAESCBCDecCiphertext)]; + AES_cbc_encrypt(kAESCBCDecCiphertext, output, sizeof(kAESCBCDecCiphertext), + &aes_key, aes_iv, AES_DECRYPT); + if (!BORINGSSL_check_test(kAESCBCDecPlaintext, output, + "AES-CBC-decrypt KAT")) { + return 0; + } + } + + { + size_t out_len; + ScopedEVP_AEAD_CTX aead_ctx; + if (!EVP_AEAD_CTX_init(aead_ctx.get(), EVP_aead_aes_128_gcm(), kAESKey, + sizeof(kAESKey), 0, nullptr)) { + fprintf(CRYPTO_get_stderr(), + "EVP_AEAD_CTX_init for AES-128-GCM failed.\n"); + return 0; + } + + // AES-GCM Encryption KAT + static const uint8_t kAESGCMEncPlaintext[32] = { + 0x8f, 0xcc, 0x40, 0x99, 0x80, 0x8e, 0x75, 0xca, 0xaf, 0xf5, 0x82, + 0x89, 0x88, 0x48, 0xa8, 0x8d, 0x80, 0x8b, 0x55, 0xab, 0x4e, 0x93, + 0x70, 0x79, 0x7d, 0x94, 0x0b, 0xe8, 0xcc, 0x1d, 0x78, 0x84, + }; + static const uint8_t kAESGCMCiphertext[sizeof(kAESGCMEncPlaintext) + 16] = { + 
0x87, 0x7b, 0xd5, 0x8d, 0x96, 0x3e, 0x4b, 0xe6, 0x64, 0x94, 0x40, 0x2f, + 0x61, 0x9b, 0x7e, 0x56, 0x52, 0x7d, 0xa4, 0x5a, 0xf9, 0xa6, 0xe2, 0xdb, + 0x1c, 0x63, 0x2e, 0x97, 0x93, 0x0f, 0xfb, 0xed, 0xb5, 0x9e, 0x1c, 0x20, + 0xb2, 0xb0, 0x58, 0xda, 0x48, 0x07, 0x2d, 0xbd, 0x96, 0x0d, 0x34, 0xc6, + }; + uint8_t nonce[12] = {}; + uint8_t output[sizeof(kAESGCMCiphertext)]; + if (!EVP_AEAD_CTX_seal(aead_ctx.get(), output, &out_len, sizeof(output), + nonce, sizeof(nonce), kAESGCMEncPlaintext, + sizeof(kAESGCMEncPlaintext), nullptr, 0) || + !BORINGSSL_check_test(kAESGCMCiphertext, Span(output, out_len), + "AES-GCM-encrypt KAT")) { + fprintf(CRYPTO_get_stderr(), + "EVP_AEAD_CTX_seal for AES-128-GCM failed.\n"); + return 0; + } + + // AES-GCM Decryption KAT + static const uint8_t kAESGCMDecCiphertext[48] = { + 0x35, 0xf3, 0x05, 0x8f, 0x87, 0x57, 0x60, 0xff, 0x09, 0xd3, 0x12, 0x0f, + 0x70, 0xc4, 0xbc, 0x9e, 0xd7, 0xa8, 0x68, 0x72, 0xe1, 0x34, 0x52, 0x20, + 0x21, 0x76, 0xf7, 0x37, 0x1a, 0xe0, 0x4f, 0xaa, 0xe1, 0xdd, 0x39, 0x19, + 0x20, 0xf5, 0xd1, 0x39, 0x53, 0xd8, 0x96, 0x78, 0x59, 0x94, 0x82, 0x3c, + }; + static const uint8_t + kAESGCMDecPlaintext[sizeof(kAESGCMDecCiphertext) - 16] = { + 0x3d, 0x44, 0x90, 0x9b, 0x91, 0xe7, 0x5e, 0xd3, 0xc2, 0xb2, 0xd0, + 0xa9, 0x99, 0x17, 0x6a, 0x45, 0x05, 0x5e, 0x99, 0x83, 0x56, 0x01, + 0xc0, 0x82, 0x40, 0x81, 0xd2, 0x48, 0x45, 0xf2, 0xcc, 0xc3, + }; + if (!EVP_AEAD_CTX_open(aead_ctx.get(), output, &out_len, sizeof(output), + nonce, sizeof(nonce), + kAESGCMDecCiphertext, sizeof(kAESGCMDecCiphertext), + nullptr, 0) || + !BORINGSSL_check_test(kAESGCMDecPlaintext, Span(output, out_len), + "AES-GCM-decrypt KAT")) { + fprintf(CRYPTO_get_stderr(), + "AES-GCM-decrypt KAT failed because EVP_AEAD_CTX_open failed.\n"); + return 0; + } + } + + { + // SHA-1 KAT + static const uint8_t kSHA1Input[16] = { + 0x13, 0x2f, 0xd9, 0xba, 0xd5, 0xc1, 0x82, 0x62, + 0x63, 0xba, 0xfb, 0xb6, 0x99, 0xf7, 0x07, 0xa5, + }; + static const uint8_t kSHA1Digest[20] = { + 
0x94, 0x19, 0x55, 0x93, 0x0a, 0x58, 0x29, 0x38, 0xeb, 0xf5, + 0x09, 0x11, 0x6d, 0x1a, 0xfd, 0x0f, 0x1e, 0x11, 0xe3, 0xcb, + }; + uint8_t output[SHA_DIGEST_LENGTH]; + SHA1(kSHA1Input, sizeof(kSHA1Input), output); + if (!BORINGSSL_check_test(kSHA1Digest, output, "SHA-1 KAT")) { + return 0; + } + } + + if (!boringssl_self_test_sha256() || // + !boringssl_self_test_sha512() || // + !boringssl_self_test_hmac_sha256()) { + return 0; + } + + { + // DBRG KAT + static const uint8_t kDRBGEntropy[32] = { + 0xc4, 0xda, 0x07, 0x40, 0xd5, 0x05, 0xf1, 0xee, 0x28, 0x0b, 0x95, + 0xe5, 0x8c, 0x49, 0x31, 0xac, 0x6d, 0xe8, 0x46, 0xa0, 0x15, 0x2f, + 0xbb, 0x4a, 0x3f, 0x17, 0x4c, 0xf4, 0x78, 0x7a, 0x4f, 0x1a, + }; + static const uint8_t kDRBGNonce[CTR_DRBG_NONCE_LEN] = { + 0x40, 0xc2, 0xb5, 0x0b, 0xab, 0xe1, 0x4a, 0xae, + 0x53, 0x0b, 0xe5, 0x88, 0x6d, 0x91, 0x0a, 0x27, + }; + static const uint8_t kDRBGPersonalization[18] = { + 'B', 'C', 'M', 'P', 'e', 'r', 's', 'o', 'n', + 'a', 'l', 'i', 'z', 'a', 't', 'i', 'o', 'n'}; + static const uint8_t kDRBGAD[16] = {'B', 'C', 'M', ' ', 'D', 'R', 'B', 'G', + ' ', 'K', 'A', 'T', ' ', 'A', 'D', ' '}; + static const uint8_t kDRBGOutput[64] = { + 0x55, 0x88, 0x81, 0x88, 0x16, 0x49, 0x68, 0xd8, 0x23, 0xc8, 0x18, + 0x57, 0x5d, 0x06, 0xc3, 0x5f, 0x60, 0x3a, 0xe8, 0xfe, 0x7c, 0x7e, + 0x1c, 0x4a, 0x6a, 0xa8, 0x91, 0x07, 0xc0, 0x0d, 0x1f, 0x70, 0x4a, + 0xbb, 0x20, 0x42, 0xd3, 0x3f, 0x19, 0xf1, 0xb1, 0xfc, 0xef, 0xa1, + 0x71, 0xfd, 0xf7, 0xaf, 0xc5, 0x12, 0x7a, 0x98, 0xad, 0x42, 0xbc, + 0x01, 0xe6, 0xa2, 0x83, 0xbc, 0x73, 0xb5, 0xba, 0x84, + }; + static const uint8_t kDRBGEntropy2[48] = { + 0xc7, 0x16, 0x1c, 0xa3, 0x6c, 0x23, 0x09, 0xb7, 0x16, 0xe9, 0x85, 0x9b, + 0xb9, 0x6c, 0x6d, 0x49, 0xbd, 0xc8, 0x35, 0x21, 0x03, 0xa1, 0x8c, 0xd2, + 0x4e, 0xf4, 0x2e, 0xc9, 0x7e, 0xf4, 0x6b, 0xf4, 0x46, 0xeb, 0x1a, 0x45, + 0x76, 0xc1, 0x86, 0xe9, 0x35, 0x18, 0x03, 0x76, 0x3a, 0x79, 0x12, 0xfe, + }; + static const uint8_t kDRBGReseedOutput[64] = { + 0xda, 0x49, 0xa1, 0x01, 
0x31, 0x71, 0x77, 0xde, 0xf6, 0x8d, 0xb5, + 0x4f, 0x86, 0x0d, 0xc8, 0xd6, 0x3c, 0xaa, 0xbc, 0x72, 0x0a, 0x9c, + 0x8b, 0x68, 0xa9, 0x70, 0xf1, 0x21, 0x13, 0xce, 0xc6, 0xbc, 0xff, + 0xaf, 0xa8, 0xd5, 0x26, 0x76, 0x26, 0xcc, 0x0d, 0x89, 0x66, 0xab, + 0xc2, 0x11, 0xa8, 0x2f, 0xf1, 0x36, 0xa3, 0x2b, 0x52, 0xcd, 0x1a, + 0x2d, 0xe4, 0x82, 0xac, 0x3c, 0xbb, 0xa9, 0x17, 0x90, + }; + CTR_DRBG_STATE drbg; + uint8_t output[64]; + if (!CTR_DRBG_init(&drbg, /*df=*/true, kDRBGEntropy, sizeof(kDRBGEntropy), + kDRBGNonce, kDRBGPersonalization, + sizeof(kDRBGPersonalization)) || + !CTR_DRBG_generate(&drbg, output, sizeof(output), kDRBGAD, + sizeof(kDRBGAD)) || + !BORINGSSL_check_test(kDRBGOutput, output, "DRBG Generate KAT") || + !CTR_DRBG_reseed_ex(&drbg, kDRBGEntropy2, sizeof(kDRBGEntropy2), + kDRBGAD, sizeof(kDRBGAD)) || + !CTR_DRBG_generate(&drbg, output, sizeof(output), kDRBGAD, + sizeof(kDRBGAD)) || + !BORINGSSL_check_test(kDRBGReseedOutput, output, "DRBG-reseed KAT")) { + fprintf(CRYPTO_get_stderr(), "CTR-DRBG failed.\n"); + return 0; + } + CTR_DRBG_clear(&drbg); + + uint8_t kZeroDRBG[sizeof(drbg)] = {}; + if (!BORINGSSL_check_test( + kZeroDRBG, + Span(reinterpret_cast(&drbg), sizeof(drbg)), + "DRBG Clear KAT")) { + return 0; + } + } + + { + // TLS KDF KAT + static const uint8_t kTLSLabel[] = "FIPS self test"; + static const uint8_t kTLSSeed1[16] = { + 0x8f, 0x0d, 0xe8, 0xb6, 0x90, 0x8f, 0xb1, 0xd2, + 0x6d, 0x51, 0xf4, 0x79, 0x18, 0x63, 0x51, 0x65, + }; + static const uint8_t kTLSSeed2[16] = { + 0x7d, 0x24, 0x1a, 0x9d, 0x3c, 0x59, 0xbf, 0x3c, + 0x31, 0x1e, 0x2b, 0x21, 0x41, 0x8d, 0x32, 0x81, + }; + + static const uint8_t kTLS10Secret[32] = { + 0xab, 0xc3, 0x65, 0x7b, 0x09, 0x4c, 0x76, 0x28, 0xa0, 0xb2, 0x82, + 0x99, 0x6f, 0xe7, 0x5a, 0x75, 0xf4, 0x98, 0x4f, 0xd9, 0x4d, 0x4e, + 0xcc, 0x2f, 0xcf, 0x53, 0xa2, 0xc4, 0x69, 0xa3, 0xf7, 0x31, + }; + static const uint8_t kTLS10Output[32] = { + 0x69, 0x7c, 0x4e, 0x2c, 0xee, 0x82, 0xb1, 0xd2, 0x8b, 0xac, 0x90, + 0x7a, 0xa1, 0x8a, 0x81, 
0xfe, 0xc5, 0x58, 0x45, 0x57, 0x61, 0x2f, + 0x7a, 0x8d, 0x80, 0xfb, 0x44, 0xd8, 0x81, 0x60, 0xe5, 0xf8, + }; + uint8_t tls10_output[sizeof(kTLS10Output)]; + // For this test, labels null-terminated + if (!CRYPTO_tls1_prf(EVP_md5_sha1(), tls10_output, sizeof(tls10_output), + kTLS10Secret, sizeof(kTLS10Secret), kTLSLabel, + sizeof(kTLSLabel), kTLSSeed1, sizeof(kTLSSeed1), + kTLSSeed2, sizeof(kTLSSeed2)) || + !BORINGSSL_check_test(kTLS10Output, tls10_output, "TLS10-KDF KAT")) { + fprintf(CRYPTO_get_stderr(), "TLS KDF failed.\n"); + return 0; + } + + static const uint8_t kTLS12Secret[32] = { + 0xc5, 0x43, 0x8e, 0xe2, 0x6f, 0xd4, 0xac, 0xbd, 0x25, 0x9f, 0xc9, + 0x18, 0x55, 0xdc, 0x69, 0xbf, 0x88, 0x4e, 0xe2, 0x93, 0x22, 0xfc, + 0xbf, 0xd2, 0x96, 0x6a, 0x46, 0x23, 0xd4, 0x2e, 0xc7, 0x81, + }; + static const uint8_t kTLS12Output[32] = { + 0xee, 0x4a, 0xcd, 0x3f, 0xa3, 0xd3, 0x55, 0x89, 0x9e, 0x6f, 0xf1, + 0x38, 0x46, 0x9d, 0x2b, 0x33, 0xaa, 0x7f, 0xc4, 0x7f, 0x51, 0x85, + 0x8a, 0xf3, 0x13, 0x84, 0xbf, 0x53, 0x6a, 0x65, 0x37, 0x51, + }; + uint8_t tls12_output[sizeof(kTLS12Output)]; + if (!CRYPTO_tls1_prf(EVP_sha256(), tls12_output, sizeof(tls12_output), + kTLS12Secret, sizeof(kTLS12Secret), kTLSLabel, + sizeof(kTLSLabel), kTLSSeed1, sizeof(kTLSSeed1), + kTLSSeed2, sizeof(kTLSSeed2)) || + !BORINGSSL_check_test(kTLS12Output, tls12_output, "TLS12-KDF KAT")) { + fprintf(CRYPTO_get_stderr(), "TLS KDF failed.\n"); + return 0; + } + + // TLS v1.3: derives a dummy client-early-traffic secret. 
+ static const uint8_t kTLS13Secret[32] = { + 0x02, 0x4a, 0x0d, 0x80, 0xf3, 0x57, 0xf2, 0x49, 0x9a, 0x12, 0x44, + 0xda, 0xc2, 0x6d, 0xab, 0x66, 0xfc, 0x13, 0xed, 0x85, 0xfc, 0xa7, + 0x1d, 0xac, 0xe1, 0x46, 0x21, 0x11, 0x19, 0x52, 0x58, 0x74, + }; + static const uint8_t kTLS13Salt[16] = { + 0x54, 0x61, 0x11, 0x36, 0x75, 0x91, 0xf0, 0xf8, + 0x92, 0xec, 0x70, 0xbd, 0x78, 0x2a, 0xef, 0x61, + }; + static const uint8_t kTLS13Label[] = "c e traffic"; + static const uint8_t kTLS13ClientHelloHash[32] = { + 0x1d, 0xe8, 0x67, 0xed, 0x93, 0x6a, 0x73, 0x65, 0x9b, 0x05, 0xcf, + 0x8a, 0x22, 0x77, 0xb7, 0x37, 0x29, 0xf2, 0x44, 0x94, 0x81, 0x6a, + 0x83, 0x33, 0x7f, 0x09, 0xbb, 0x6c, 0xc2, 0x6f, 0x48, 0x9c, + }; + static const uint8_t kTLS13ExpandLabelOutput[32] = { + 0x62, 0x91, 0x52, 0x90, 0x2e, 0xc9, 0xcf, 0x9c, 0x5f, 0x1e, 0x0a, + 0xb7, 0x00, 0x33, 0x42, 0x24, 0xc4, 0xe3, 0xba, 0x01, 0x40, 0x32, + 0x06, 0xab, 0x09, 0x23, 0x8a, 0xdd, 0x01, 0xa4, 0x05, 0xcd, + }; + uint8_t tls13_extract_output[32]; + size_t tls13_extract_output_len; + uint8_t tls13_expand_label_output[32]; + if (!HKDF_extract(tls13_extract_output, &tls13_extract_output_len, + EVP_sha256(), kTLS13Secret, sizeof(kTLS13Secret), + kTLS13Salt, sizeof(kTLS13Salt)) || + tls13_extract_output_len != sizeof(tls13_extract_output) || + !CRYPTO_tls13_hkdf_expand_label( + tls13_expand_label_output, sizeof(tls13_expand_label_output), + EVP_sha256(), tls13_extract_output, sizeof(tls13_extract_output), + kTLS13Label, sizeof(kTLS13Label) - 1, kTLS13ClientHelloHash, + sizeof(kTLS13ClientHelloHash)) || + !BORINGSSL_check_test(kTLS13ExpandLabelOutput, + tls13_expand_label_output, + "CRYPTO_tls13_hkdf_expand_label")) { + fprintf(CRYPTO_get_stderr(), "TLS13-KDF failed.\n"); + return 0; + } + } + + { + // HKDF + static const uint8_t kHKDFSecret[32] = { + 0x68, 0x67, 0x85, 0x04, 0xb9, 0xb3, 0xad, 0xd1, 0x7d, 0x59, 0x67, + 0xa1, 0xa7, 0xbd, 0x37, 0x99, 0x3f, 0xd8, 0xa3, 0x3c, 0xe7, 0x30, + 0x30, 0x71, 0xf3, 0x9c, 0x09, 0x6d, 0x16, 0x35, 
0xb3, 0xc9, + }; + static const uint8_t kHKDFSalt[32] = { + 0x8a, 0xab, 0x18, 0xb4, 0x9b, 0x0a, 0x17, 0xf9, 0xe8, 0xe6, 0x97, + 0x1a, 0x3d, 0xff, 0xda, 0x9b, 0x26, 0x8b, 0x3d, 0x17, 0x78, 0x0a, + 0xb3, 0xea, 0x65, 0xdb, 0x2a, 0xc0, 0x29, 0x9c, 0xfa, 0x72, + }; + static const uint8_t kHKDFInfo[32] = { + 0xe5, 0x6f, 0xf9, 0xe1, 0x18, 0x5e, 0x64, 0x8c, 0x6c, 0x8f, 0xee, + 0xc6, 0x93, 0x5a, 0xc5, 0x14, 0x8c, 0xf3, 0xd9, 0x78, 0xd2, 0x3a, + 0x86, 0xdd, 0x01, 0xdf, 0xb9, 0xe9, 0x5e, 0xe5, 0x1a, 0x56, + }; + static const uint8_t kHKDFOutput[32] = { + 0xa6, 0x29, 0xb4, 0xd7, 0xf4, 0xc1, 0x16, 0x64, 0x71, 0x5e, 0xa4, + 0xa8, 0xe6, 0x60, 0x8c, 0xf3, 0xc1, 0xa5, 0x03, 0xe2, 0x22, 0xf9, + 0x89, 0xe2, 0x12, 0x18, 0xbe, 0xef, 0x16, 0x86, 0xe0, 0xec, + }; + uint8_t hkdf_output[sizeof(kHKDFOutput)]; + if (!HKDF(hkdf_output, sizeof(hkdf_output), EVP_sha256(), kHKDFSecret, + sizeof(kHKDFSecret), kHKDFSalt, sizeof(kHKDFSalt), kHKDFInfo, + sizeof(kHKDFInfo)) || + !BORINGSSL_check_test(kHKDFOutput, hkdf_output, "HKDF")) { + fprintf(CRYPTO_get_stderr(), "HKDF failed.\n"); + return 0; + } + } + + return 1; +} + +int BORINGSSL_self_test() { + if (!boringssl_self_test_fast() || + // When requested to run self tests, also run some of the lazy tests. + !boringssl_self_test_rsa_sign() || // + !boringssl_self_test_rsa_verify() || // + !boringssl_self_test_ecc() || // + !boringssl_self_test_ffdh() || // + !boringssl_self_test_mlkem() || // + !boringssl_self_test_mldsa()) { + return 0; + } + + return 1; +} + +int BORINGSSL_self_test_all() { + if (!BORINGSSL_self_test() || + // When requested to run all self tests, add in the really slow tests. 
+ !boringssl_self_test_slhdsa()) { + return 0; + } + + return 1; +} + +#if defined(BORINGSSL_FIPS) +int bssl::boringssl_self_test_startup() { return boringssl_self_test_fast(); } +#endif diff --git a/third_party/boringssl/src/crypto/fipsmodule/service_indicator/internal.h b/third_party/boringssl/src/crypto/fipsmodule/service_indicator/internal.h index bbc4e4ee..064618e6 100644 --- a/third_party/boringssl/src/crypto/fipsmodule/service_indicator/internal.h +++ b/third_party/boringssl/src/crypto/fipsmodule/service_indicator/internal.h @@ -1,40 +1,61 @@ -/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#ifndef OPENSSL_HEADER_SERVICE_INDICATOR_INTERNAL_H -#define OPENSSL_HEADER_SERVICE_INDICATOR_INTERNAL_H +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_CRYPTO_FIPSMODULE_SERVICE_INDICATOR_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_FIPSMODULE_SERVICE_INDICATOR_INTERNAL_H #include -#include + + +BSSL_NAMESPACE_BEGIN + +// FIPS_service_indicator_before_call and |FIPS_service_indicator_after_call| +// both currently return the same local thread counter which is slowly +// incremented whenever approved services are called. The +// |CALL_SERVICE_AND_CHECK_APPROVED| macro is strongly recommended over calling +// these functions directly. +// +// |FIPS_service_indicator_before_call| is intended to be called immediately +// before an approved service, while |FIPS_service_indicator_after_call| should +// be called immediately after. If the values returned from these two functions +// are not equal, this means that the service called in between is deemed to be +// approved. If the values are still the same, this means the counter has not +// been incremented, and the service called is not approved for FIPS. +// +// In non-FIPS builds, |FIPS_service_indicator_before_call| always returns zero +// and |FIPS_service_indicator_after_call| always returns one. Thus calls always +// appear to be approved. This is intended to simplify testing. +OPENSSL_EXPORT uint64_t FIPS_service_indicator_before_call(); +OPENSSL_EXPORT uint64_t FIPS_service_indicator_after_call(); #if defined(BORINGSSL_FIPS) // FIPS_service_indicator_update_state records that an approved service has been // invoked. 
-void FIPS_service_indicator_update_state(void); +void FIPS_service_indicator_update_state(); // FIPS_service_indicator_lock_state and |FIPS_service_indicator_unlock_state| // stop |FIPS_service_indicator_update_state| from actually updating the service // indicator. This is used when a primitive calls a potentially approved // primitive to avoid false positives. For example, just because a key -// generation calls |RAND_bytes| (and thus the approved DRBG) doesn't mean that -// the key generation operation itself is approved. +// generation calls |BCM_rand_bytes| (and thus the approved DRBG) doesn't mean +// that the key generation operation itself is approved. // // This lock nests: i.e. locking twice is fine so long as each lock is paired // with an unlock. If the (64-bit) counter overflows, the process aborts. -void FIPS_service_indicator_lock_state(void); -void FIPS_service_indicator_unlock_state(void); +void FIPS_service_indicator_lock_state(); +void FIPS_service_indicator_unlock_state(); // The following functions may call |FIPS_service_indicator_update_state| if // their parameter specifies an approved operation. @@ -53,37 +74,39 @@ void TLSKDF_verify_service_indicator(const EVP_MD *dgst); // Service indicator functions are no-ops in non-FIPS builds. 
-OPENSSL_INLINE void FIPS_service_indicator_update_state(void) {} -OPENSSL_INLINE void FIPS_service_indicator_lock_state(void) {} -OPENSSL_INLINE void FIPS_service_indicator_unlock_state(void) {} +inline void FIPS_service_indicator_update_state() {} +inline void FIPS_service_indicator_lock_state() {} +inline void FIPS_service_indicator_unlock_state() {} -OPENSSL_INLINE void AEAD_GCM_verify_service_indicator( - OPENSSL_UNUSED const EVP_AEAD_CTX *ctx) {} +inline void AEAD_GCM_verify_service_indicator( + [[maybe_unused]] const EVP_AEAD_CTX *ctx) {} -OPENSSL_INLINE void AEAD_CCM_verify_service_indicator( - OPENSSL_UNUSED const EVP_AEAD_CTX *ctx) {} +inline void AEAD_CCM_verify_service_indicator( + [[maybe_unused]] const EVP_AEAD_CTX *ctx) {} -OPENSSL_INLINE void EC_KEY_keygen_verify_service_indicator( - OPENSSL_UNUSED const EC_KEY *eckey) {} +inline void EC_KEY_keygen_verify_service_indicator( + [[maybe_unused]] const EC_KEY *eckey) {} -OPENSSL_INLINE void ECDH_verify_service_indicator( - OPENSSL_UNUSED const EC_KEY *ec_key) {} +inline void ECDH_verify_service_indicator( + [[maybe_unused]] const EC_KEY *ec_key) {} -OPENSSL_INLINE void EVP_Cipher_verify_service_indicator( - OPENSSL_UNUSED const EVP_CIPHER_CTX *ctx) {} +inline void EVP_Cipher_verify_service_indicator( + [[maybe_unused]] const EVP_CIPHER_CTX *ctx) {} -OPENSSL_INLINE void EVP_DigestSign_verify_service_indicator( - OPENSSL_UNUSED const EVP_MD_CTX *ctx) {} +inline void EVP_DigestSign_verify_service_indicator( + [[maybe_unused]] const EVP_MD_CTX *ctx) {} -OPENSSL_INLINE void EVP_DigestVerify_verify_service_indicator( - OPENSSL_UNUSED const EVP_MD_CTX *ctx) {} +inline void EVP_DigestVerify_verify_service_indicator( + [[maybe_unused]] const EVP_MD_CTX *ctx) {} -OPENSSL_INLINE void HMAC_verify_service_indicator( - OPENSSL_UNUSED const EVP_MD *evp_md) {} +inline void HMAC_verify_service_indicator( + [[maybe_unused]] const EVP_MD *evp_md) {} -OPENSSL_INLINE void TLSKDF_verify_service_indicator( - OPENSSL_UNUSED 
const EVP_MD *dgst) {} +inline void TLSKDF_verify_service_indicator( + [[maybe_unused]] const EVP_MD *dgst) {} #endif // BORINGSSL_FIPS -#endif // OPENSSL_HEADER_SERVICE_INDICATOR_INTERNAL_H +BSSL_NAMESPACE_END + +#endif // OPENSSL_HEADER_CRYPTO_FIPSMODULE_SERVICE_INDICATOR_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/fipsmodule/service_indicator/service_indicator.c b/third_party/boringssl/src/crypto/fipsmodule/service_indicator/service_indicator.c deleted file mode 100644 index febe5344..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/service_indicator/service_indicator.c +++ /dev/null @@ -1,334 +0,0 @@ -/* Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include -#include -#include -#include -#include -#include - -#include "../../evp/internal.h" -#include "../../internal.h" -#include "internal.h" - -#if defined(BORINGSSL_FIPS) - -#define STATE_UNLOCKED 0 - -// fips_service_indicator_state is a thread-local structure that stores the -// state of the FIPS service indicator. -struct fips_service_indicator_state { - // lock_state records the number of times the indicator has been locked. - // When it is zero (i.e. |STATE_UNLOCKED|) then the indicator can be updated. 
- uint64_t lock_state; - // counter is the indicator state. It is incremented when an approved service - // completes. - uint64_t counter; -}; - -// service_indicator_get returns a pointer to the |fips_service_indicator_state| -// for the current thread. It returns NULL on error. -// -// FIPS 140-3 requires that the module should provide the service indicator -// for approved services irrespective of whether the user queries it or not. -// Hence, it is lazily initialized in any call to an approved service. -static struct fips_service_indicator_state *service_indicator_get(void) { - struct fips_service_indicator_state *indicator = CRYPTO_get_thread_local( - OPENSSL_THREAD_LOCAL_FIPS_SERVICE_INDICATOR_STATE); - - if (indicator == NULL) { - indicator = OPENSSL_malloc(sizeof(struct fips_service_indicator_state)); - if (indicator == NULL) { - OPENSSL_PUT_ERROR(CRYPTO, ERR_R_MALLOC_FAILURE); - return NULL; - } - - indicator->lock_state = STATE_UNLOCKED; - indicator->counter = 0; - - if (!CRYPTO_set_thread_local( - OPENSSL_THREAD_LOCAL_FIPS_SERVICE_INDICATOR_STATE, indicator, - OPENSSL_free)) { - OPENSSL_PUT_ERROR(CRYPTO, ERR_R_INTERNAL_ERROR); - return NULL; - } - } - - return indicator; -} - -static uint64_t service_indicator_get_counter(void) { - struct fips_service_indicator_state *indicator = service_indicator_get(); - if (indicator == NULL) { - return 0; - } - return indicator->counter; -} - -uint64_t FIPS_service_indicator_before_call(void) { - return service_indicator_get_counter(); -} - -uint64_t FIPS_service_indicator_after_call(void) { - return service_indicator_get_counter(); -} - -void FIPS_service_indicator_update_state(void) { - struct fips_service_indicator_state *indicator = service_indicator_get(); - if (indicator && indicator->lock_state == STATE_UNLOCKED) { - indicator->counter++; - } -} - -void FIPS_service_indicator_lock_state(void) { - struct fips_service_indicator_state *indicator = service_indicator_get(); - if (indicator == NULL) { - return; - } 
- - // |FIPS_service_indicator_lock_state| and - // |FIPS_service_indicator_unlock_state| should not under/overflow in normal - // operation. They are still checked and errors added to facilitate testing in - // service_indicator_test.cc. This should only happen if lock/unlock are - // called in an incorrect order or multiple times in the same function. - const uint64_t new_state = indicator->lock_state + 1; - if (new_state < indicator->lock_state) { - // Overflow. This would imply that our call stack length has exceeded a - // |uint64_t| which impossible on a 64-bit system. - abort(); - } - - indicator->lock_state = new_state; -} - -void FIPS_service_indicator_unlock_state(void) { - struct fips_service_indicator_state *indicator = service_indicator_get(); - if (indicator == NULL) { - return; - } - - if (indicator->lock_state == 0) { - abort(); - } - - indicator->lock_state--; -} - -void AEAD_GCM_verify_service_indicator(const EVP_AEAD_CTX *ctx) { - const size_t key_len = EVP_AEAD_key_length(ctx->aead); - if (key_len == 16 || key_len == 32) { - FIPS_service_indicator_update_state(); - } -} - -void AEAD_CCM_verify_service_indicator(const EVP_AEAD_CTX *ctx) { - if (EVP_AEAD_key_length(ctx->aead) == 16 && ctx->tag_len == 4) { - FIPS_service_indicator_update_state(); - } -} - -// is_ec_fips_approved returns one if the curve corresponding to the given NID -// is FIPS approved, and zero otherwise. -static int is_ec_fips_approved(int curve_nid) { - switch (curve_nid) { - case NID_secp224r1: - case NID_X9_62_prime256v1: - case NID_secp384r1: - case NID_secp521r1: - return 1; - default: - return 0; - } -} - -// is_md_fips_approved_for_signing returns one if the given message digest type -// is FIPS approved for signing, and zero otherwise. 
-static int is_md_fips_approved_for_signing(int md_type) { - switch (md_type) { - case NID_sha224: - case NID_sha256: - case NID_sha384: - case NID_sha512: - case NID_sha512_256: - return 1; - default: - return 0; - } -} - -// is_md_fips_approved_for_verifying returns one if the given message digest -// type is FIPS approved for verifying, and zero otherwise. -static int is_md_fips_approved_for_verifying(int md_type) { - switch (md_type) { - case NID_sha1: - case NID_sha224: - case NID_sha256: - case NID_sha384: - case NID_sha512: - case NID_sha512_256: - return 1; - default: - return 0; - } -} - -static void evp_md_ctx_verify_service_indicator(const EVP_MD_CTX *ctx, - int rsa_1024_ok, - int (*md_ok)(int md_type)) { - if (EVP_MD_CTX_md(ctx) == NULL) { - // Signature schemes without a prehash are currently never FIPS approved. - goto err; - } - - EVP_PKEY_CTX *const pctx = ctx->pctx; - const EVP_PKEY *const pkey = EVP_PKEY_CTX_get0_pkey(pctx); - const int pkey_type = EVP_PKEY_id(pkey); - const int md_type = EVP_MD_CTX_type(ctx); - - // EVP_PKEY_RSA_PSS SPKIs aren't supported. - if (pkey_type == EVP_PKEY_RSA) { - // Message digest used in the private key should be of the same type - // as the given one, so we extract the MD type from the |EVP_PKEY| - // and compare it with the type in |ctx|. - const EVP_MD *pctx_md; - if (!EVP_PKEY_CTX_get_signature_md(pctx, &pctx_md)) { - goto err; - } - if (EVP_MD_type(pctx_md) != md_type) { - goto err; - } - - int padding; - if (!EVP_PKEY_CTX_get_rsa_padding(pctx, &padding)) { - goto err; - } - if (padding == RSA_PKCS1_PSS_PADDING) { - int salt_len; - const EVP_MD *mgf1_md; - if (!EVP_PKEY_CTX_get_rsa_pss_saltlen(pctx, &salt_len) || - !EVP_PKEY_CTX_get_rsa_mgf1_md(pctx, &mgf1_md) || - (salt_len != -1 && salt_len != (int)EVP_MD_size(pctx_md)) || - EVP_MD_type(mgf1_md) != md_type) { - // Only PSS where saltLen == hashLen is tested with ACVP. Cases with - // non-standard padding functions are also excluded. 
- goto err; - } - } - - // The approved RSA key sizes for signing are 2048, 3072 and 4096 bits. - // Note: |EVP_PKEY_size| returns the size in bytes. - size_t pkey_size = EVP_PKEY_size(ctx->pctx->pkey); - - // Check if the MD type and the RSA key size are approved. - if (md_ok(md_type) && - ((rsa_1024_ok && pkey_size == 128) || pkey_size == 256 || - pkey_size == 384 || pkey_size == 512)) { - FIPS_service_indicator_update_state(); - } - } else if (pkey_type == EVP_PKEY_EC) { - // Check if the MD type and the elliptic curve are approved. - if (md_ok(md_type) && is_ec_fips_approved(EC_GROUP_get_curve_name( - ctx->pctx->pkey->pkey.ec->group))) { - FIPS_service_indicator_update_state(); - } - } - - err: - // Ensure that junk errors aren't left on the queue. - ERR_clear_error(); -} - -void EC_KEY_keygen_verify_service_indicator(const EC_KEY *eckey) { - if (is_ec_fips_approved(EC_GROUP_get_curve_name(eckey->group))) { - FIPS_service_indicator_update_state(); - } -} - -void ECDH_verify_service_indicator(const EC_KEY *ec_key) { - if (is_ec_fips_approved(EC_GROUP_get_curve_name(EC_KEY_get0_group(ec_key)))) { - FIPS_service_indicator_update_state(); - } -} - -void EVP_Cipher_verify_service_indicator(const EVP_CIPHER_CTX *ctx) { - switch (EVP_CIPHER_CTX_nid(ctx)) { - case NID_aes_128_ecb: - case NID_aes_192_ecb: - case NID_aes_256_ecb: - - case NID_aes_128_cbc: - case NID_aes_192_cbc: - case NID_aes_256_cbc: - - case NID_aes_128_ctr: - case NID_aes_192_ctr: - case NID_aes_256_ctr: - FIPS_service_indicator_update_state(); - } -} - -void EVP_DigestVerify_verify_service_indicator(const EVP_MD_CTX *ctx) { - return evp_md_ctx_verify_service_indicator(ctx, /*rsa_1024_ok=*/1, - is_md_fips_approved_for_verifying); -} - -void EVP_DigestSign_verify_service_indicator(const EVP_MD_CTX *ctx) { - return evp_md_ctx_verify_service_indicator(ctx, /*rsa_1024_ok=*/0, - is_md_fips_approved_for_signing); -} - -void HMAC_verify_service_indicator(const EVP_MD *evp_md) { - switch (evp_md->type) { - 
case NID_sha1: - case NID_sha224: - case NID_sha256: - case NID_sha384: - case NID_sha512: - case NID_sha512_256: - FIPS_service_indicator_update_state(); - break; - } -} - -void TLSKDF_verify_service_indicator(const EVP_MD *md) { - // HMAC-MD5, HMAC-SHA1, and HMAC-MD5/HMAC-SHA1 (both used concurrently) are - // approved for use in the KDF in TLS 1.0/1.1. - // HMAC-SHA{256, 384, 512} are approved for use in the KDF in TLS 1.2. - // These Key Derivation functions are to be used in the context of the TLS - // protocol. - switch (EVP_MD_type(md)) { - case NID_md5: - case NID_sha1: - case NID_md5_sha1: - case NID_sha256: - case NID_sha384: - case NID_sha512: - FIPS_service_indicator_update_state(); - break; - } -} - -#else - -uint64_t FIPS_service_indicator_before_call(void) { return 0; } - -uint64_t FIPS_service_indicator_after_call(void) { - // One is returned so that the return value is always greater than zero, the - // return value of |FIPS_service_indicator_before_call|. This makes everything - // report as "approved" in non-FIPS builds. - return 1; -} - -#endif // BORINGSSL_FIPS diff --git a/third_party/boringssl/src/crypto/fipsmodule/service_indicator/service_indicator.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/service_indicator/service_indicator.cc.inc new file mode 100644 index 00000000..ef8fb350 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/service_indicator/service_indicator.cc.inc @@ -0,0 +1,330 @@ +// Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include + +#include "../../evp/internal.h" +#include "../../internal.h" +#include "../../mem_internal.h" +#include "internal.h" + + +using namespace bssl; + +#if defined(BORINGSSL_FIPS) + +#define STATE_UNLOCKED 0 + +// fips_service_indicator_state is a thread-local structure that stores the +// state of the FIPS service indicator. +struct fips_service_indicator_state { + // lock_state records the number of times the indicator has been locked. + // When it is zero (i.e. |STATE_UNLOCKED|) then the indicator can be updated. + uint64_t lock_state; + // counter is the indicator state. It is incremented when an approved service + // completes. + uint64_t counter; +}; + +// service_indicator_get returns a pointer to the |fips_service_indicator_state| +// for the current thread. It returns nullptr on error. +// +// FIPS 140-3 requires that the module should provide the service indicator +// for approved services irrespective of whether the user queries it or not. +// Hence, it is lazily initialized in any call to an approved service. 
+static struct fips_service_indicator_state *service_indicator_get() { + struct fips_service_indicator_state *indicator = + reinterpret_cast(CRYPTO_get_thread_local( + OPENSSL_THREAD_LOCAL_FIPS_SERVICE_INDICATOR_STATE)); + + if (indicator == nullptr) { + indicator = New(); + if (indicator == nullptr) { + return nullptr; + } + + indicator->lock_state = STATE_UNLOCKED; + indicator->counter = 0; + + if (!CRYPTO_set_thread_local( + OPENSSL_THREAD_LOCAL_FIPS_SERVICE_INDICATOR_STATE, indicator, + OPENSSL_free)) { + OPENSSL_PUT_ERROR(CRYPTO, ERR_R_INTERNAL_ERROR); + return nullptr; + } + } + + return indicator; +} + +static uint64_t service_indicator_get_counter() { + struct fips_service_indicator_state *indicator = service_indicator_get(); + if (indicator == nullptr) { + return 0; + } + return indicator->counter; +} + +uint64_t bssl::FIPS_service_indicator_before_call() { + return service_indicator_get_counter(); +} + +uint64_t bssl::FIPS_service_indicator_after_call() { + return service_indicator_get_counter(); +} + +void bssl::FIPS_service_indicator_update_state() { + struct fips_service_indicator_state *indicator = service_indicator_get(); + if (indicator && indicator->lock_state == STATE_UNLOCKED) { + indicator->counter++; + } +} + +void bssl::FIPS_service_indicator_lock_state() { + struct fips_service_indicator_state *indicator = service_indicator_get(); + if (indicator == nullptr) { + return; + } + + // |FIPS_service_indicator_lock_state| and + // |FIPS_service_indicator_unlock_state| should not under/overflow in normal + // operation. They are still checked and errors added to facilitate testing in + // service_indicator_test.cc. This should only happen if lock/unlock are + // called in an incorrect order or multiple times in the same function. + const uint64_t new_state = indicator->lock_state + 1; + if (new_state < indicator->lock_state) { + // Overflow. 
This would imply that our call stack length has exceeded a + // |uint64_t| which impossible on a 64-bit system. + abort(); + } + + indicator->lock_state = new_state; +} + +void bssl::FIPS_service_indicator_unlock_state() { + struct fips_service_indicator_state *indicator = service_indicator_get(); + if (indicator == nullptr) { + return; + } + + if (indicator->lock_state == 0) { + abort(); + } + + indicator->lock_state--; +} + +void bssl::AEAD_GCM_verify_service_indicator(const EVP_AEAD_CTX *ctx) { + const size_t key_len = EVP_AEAD_key_length(ctx->aead); + if (key_len == 16 || key_len == 32) { + FIPS_service_indicator_update_state(); + } +} + +void bssl::AEAD_CCM_verify_service_indicator(const EVP_AEAD_CTX *ctx) { + if (EVP_AEAD_key_length(ctx->aead) == 16 && ctx->tag_len == 4) { + FIPS_service_indicator_update_state(); + } +} + +// is_ec_fips_approved returns one if the curve corresponding to the given NID +// is FIPS approved, and zero otherwise. +static int is_ec_fips_approved(int curve_nid) { + switch (curve_nid) { + case NID_secp224r1: + case NID_X9_62_prime256v1: + case NID_secp384r1: + case NID_secp521r1: + return 1; + default: + return 0; + } +} + +// is_md_fips_approved_for_signing returns one if the given message digest type +// is FIPS approved for signing, and zero otherwise. +static int is_md_fips_approved_for_signing(int md_type) { + switch (md_type) { + case NID_sha224: + case NID_sha256: + case NID_sha384: + case NID_sha512: + case NID_sha512_256: + return 1; + default: + return 0; + } +} + +// is_md_fips_approved_for_verifying returns one if the given message digest +// type is FIPS approved for verifying, and zero otherwise. 
+static int is_md_fips_approved_for_verifying(int md_type) { + switch (md_type) { + case NID_sha224: + case NID_sha256: + case NID_sha384: + case NID_sha512: + case NID_sha512_256: + return 1; + default: + return 0; + } +} + +static void evp_md_ctx_verify_service_indicator(const EVP_MD_CTX *ctx, + int (*md_ok)(int md_type)) { + if (EVP_MD_CTX_get0_md(ctx) == nullptr) { + // Signature schemes without a prehash are currently never FIPS approved. + return; + } + + EVP_PKEY_CTX *const pctx = ctx->pctx; + const EVP_PKEY *const pkey = EVP_PKEY_CTX_get0_pkey(pctx); + const int pkey_type = EVP_PKEY_id(pkey); + const int md_type = EVP_MD_CTX_type(ctx); + + // EVP_PKEY_RSA_PSS SPKIs aren't supported. + if (pkey_type == EVP_PKEY_RSA) { + // Message digest used in the private key should be of the same type + // as the given one, so we extract the MD type from the |EVP_PKEY| + // and compare it with the type in |ctx|. + const EVP_MD *pctx_md; + if (!EVP_PKEY_CTX_get_signature_md(pctx, &pctx_md)) { + goto err; + } + if (EVP_MD_type(pctx_md) != md_type) { + goto err; + } + + int padding; + if (!EVP_PKEY_CTX_get_rsa_padding(pctx, &padding)) { + goto err; + } + if (padding == RSA_PKCS1_PSS_PADDING) { + int salt_len; + const EVP_MD *mgf1_md; + if (!EVP_PKEY_CTX_get_rsa_pss_saltlen(pctx, &salt_len) || + !EVP_PKEY_CTX_get_rsa_mgf1_md(pctx, &mgf1_md) || + (salt_len != RSA_PSS_SALTLEN_DIGEST && + salt_len != (int)EVP_MD_size(pctx_md)) || + EVP_MD_type(mgf1_md) != md_type) { + // Only PSS where saltLen == hashLen is tested with ACVP. Cases with + // non-standard padding functions are also excluded. + goto err; + } + } + + // The approved RSA key sizes for signing are 2048, 3072 and 4096 bits. + // Note: |EVP_PKEY_size| returns the size in bytes. + size_t pkey_size = EVP_PKEY_size(FromOpaque(ctx->pctx)->pkey.get()); + + // Check if the MD type and the RSA key size are approved. 
+ if (md_ok(md_type) && + (pkey_size == 256 || pkey_size == 384 || pkey_size == 512)) { + FIPS_service_indicator_update_state(); + } + } else if (pkey_type == EVP_PKEY_EC) { + // Check if the MD type and the elliptic curve are approved. + if (md_ok(md_type) && + is_ec_fips_approved(EC_GROUP_get_curve_name(EC_KEY_get0_group( + EVP_PKEY_get0_EC_KEY(FromOpaque(ctx->pctx)->pkey.get()))))) { + FIPS_service_indicator_update_state(); + } + } + +err: + // Ensure that junk errors aren't left on the queue. + ERR_clear_error(); +} + +void bssl::EC_KEY_keygen_verify_service_indicator(const EC_KEY *eckey) { + if (is_ec_fips_approved(EC_GROUP_get_curve_name(EC_KEY_get0_group(eckey)))) { + FIPS_service_indicator_update_state(); + } +} + +void bssl::ECDH_verify_service_indicator(const EC_KEY *ec_key) { + if (is_ec_fips_approved(EC_GROUP_get_curve_name(EC_KEY_get0_group(ec_key)))) { + FIPS_service_indicator_update_state(); + } +} + +void bssl::EVP_Cipher_verify_service_indicator(const EVP_CIPHER_CTX *ctx) { + switch (EVP_CIPHER_CTX_nid(ctx)) { + case NID_aes_128_ecb: + case NID_aes_192_ecb: + case NID_aes_256_ecb: + + case NID_aes_128_cbc: + case NID_aes_192_cbc: + case NID_aes_256_cbc: + + case NID_aes_128_ctr: + case NID_aes_192_ctr: + case NID_aes_256_ctr: + FIPS_service_indicator_update_state(); + } +} + +void bssl::EVP_DigestVerify_verify_service_indicator(const EVP_MD_CTX *ctx) { + return evp_md_ctx_verify_service_indicator(ctx, + is_md_fips_approved_for_verifying); +} + +void bssl::EVP_DigestSign_verify_service_indicator(const EVP_MD_CTX *ctx) { + return evp_md_ctx_verify_service_indicator(ctx, + is_md_fips_approved_for_signing); +} + +void bssl::HMAC_verify_service_indicator(const EVP_MD *evp_md) { + switch (EVP_MD_type(evp_md)) { + case NID_sha1: + case NID_sha224: + case NID_sha256: + case NID_sha384: + case NID_sha512: + case NID_sha512_256: + FIPS_service_indicator_update_state(); + break; + } +} + +void bssl::TLSKDF_verify_service_indicator(const EVP_MD *md) { + // 
HMAC-SHA{256, 384, 512} are approved for use in the KDF in TLS 1.2. These + // Key Derivation functions are to be used in the context of the TLS protocol. + switch (EVP_MD_type(md)) { + case NID_sha256: + case NID_sha384: + case NID_sha512: + FIPS_service_indicator_update_state(); + break; + } +} + +#else + +uint64_t bssl::FIPS_service_indicator_before_call() { return 0; } + +uint64_t bssl::FIPS_service_indicator_after_call() { + // One is returned so that the return value is always greater than zero, the + // return value of |FIPS_service_indicator_before_call|. This makes everything + // report as "approved" in non-FIPS builds. + return 1; +} + +#endif // BORINGSSL_FIPS diff --git a/third_party/boringssl/src/crypto/fipsmodule/sha/internal.h b/third_party/boringssl/src/crypto/fipsmodule/sha/internal.h index cc909149..0c8010b3 100644 --- a/third_party/boringssl/src/crypto/fipsmodule/sha/internal.h +++ b/third_party/boringssl/src/crypto/fipsmodule/sha/internal.h @@ -1,53 +1,205 @@ -/* Copyright (c) 2018, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ - -#ifndef OPENSSL_HEADER_SHA_INTERNAL_H -#define OPENSSL_HEADER_SHA_INTERNAL_H +// Copyright 2018 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_CRYPTO_FIPSMODULE_SHA_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_FIPSMODULE_SHA_INTERNAL_H #include -#if defined(__cplusplus) -extern "C" { -#endif +#include "../../internal.h" + + +BSSL_NAMESPACE_BEGIN + +// Define SHA{n}[_{variant}]_ASM if sha{n}_block_data_order[_{variant}] is +// defined in assembly. + +#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_ARM) + +#define SHA1_ASM_NOHW +#define SHA256_ASM_NOHW +#define SHA512_ASM_NOHW + +#define SHA1_ASM_HW +inline int sha1_hw_capable() { return CRYPTO_is_ARMv8_SHA1_capable(); } + +#define SHA1_ASM_NEON +extern "C" void sha1_block_data_order_neon(uint32_t state[5], + const uint8_t *data, size_t num); + +#define SHA256_ASM_HW +inline int sha256_hw_capable() { return CRYPTO_is_ARMv8_SHA256_capable(); } + +#define SHA256_ASM_NEON +extern "C" void sha256_block_data_order_neon(uint32_t state[8], + const uint8_t *data, size_t num); + +// Armv8.2 SHA-512 instructions are not available in 32-bit. 
+#define SHA512_ASM_NEON +extern "C" void sha512_block_data_order_neon(uint64_t state[8], + const uint8_t *data, size_t num); + +#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) + +#define SHA1_ASM_NOHW +#define SHA256_ASM_NOHW +#define SHA512_ASM_NOHW + +#define SHA1_ASM_HW +inline int sha1_hw_capable() { return CRYPTO_is_ARMv8_SHA1_capable(); } + +#define SHA256_ASM_HW +inline int sha256_hw_capable() { return CRYPTO_is_ARMv8_SHA256_capable(); } + +#define SHA512_ASM_HW +inline int sha512_hw_capable() { return CRYPTO_is_ARMv8_SHA512_capable(); } + +#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) + +#define SHA1_ASM_NOHW +#define SHA256_ASM_NOHW +#define SHA512_ASM_NOHW + +#define SHA1_ASM_SSSE3 +inline int sha1_ssse3_capable() { return CRYPTO_is_SSSE3_capable(); } +extern "C" void sha1_block_data_order_ssse3(uint32_t state[5], + const uint8_t *data, size_t num); +#define SHA1_ASM_AVX +inline int sha1_avx_capable() { + // AMD CPUs have slow SHLD/SHRD. See also the discussion in sha1-586.pl. + // + // TODO(crbug.com/42290564): Should we enable SHAEXT on 32-bit x86? + return CRYPTO_is_AVX_capable() && CRYPTO_is_intel_cpu(); +} +extern "C" void sha1_block_data_order_avx(uint32_t state[5], + const uint8_t *data, size_t num); + +#define SHA256_ASM_SSSE3 +inline int sha256_ssse3_capable() { return CRYPTO_is_SSSE3_capable(); } +extern "C" void sha256_block_data_order_ssse3(uint32_t state[8], + const uint8_t *data, size_t num); + +#define SHA256_ASM_AVX +inline int sha256_avx_capable() { + // AMD CPUs have slow SHLD/SHRD. See also the discussion in sha1-586.pl. + // + // TODO(crbug.com/42290564): Should we enable SHAEXT on 32-bit x86? 
+ return CRYPTO_is_AVX_capable() && CRYPTO_is_intel_cpu(); +} +extern "C" void sha256_block_data_order_avx(uint32_t state[8], + const uint8_t *data, size_t num); + +#define SHA512_ASM_SSSE3 +inline int sha512_ssse3_capable() { return CRYPTO_is_SSSE3_capable(); } +extern "C" void sha512_block_data_order_ssse3(uint64_t state[8], + const uint8_t *data, size_t num); + +#elif !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) + +#define SHA1_ASM_NOHW +#define SHA256_ASM_NOHW +#define SHA512_ASM_NOHW + +#define SHA1_ASM_HW +inline int sha1_hw_capable() { + return CRYPTO_is_x86_SHA_capable() && CRYPTO_is_SSSE3_capable(); +} + +#define SHA1_ASM_AVX2 +inline int sha1_avx2_capable() { + return CRYPTO_is_AVX2_capable() && CRYPTO_is_BMI2_capable() && + CRYPTO_is_BMI1_capable(); +} +extern "C" void sha1_block_data_order_avx2(uint32_t state[5], + const uint8_t *data, size_t num); + +#define SHA1_ASM_AVX +inline int sha1_avx_capable() { + // AMD CPUs have slow SHLD/SHRD. See also the discussion in sha1-586.pl. Zen + // added the SHA extension, so this is moot on newer AMD CPUs. + return CRYPTO_is_AVX_capable() && CRYPTO_is_intel_cpu(); +} +extern "C" void sha1_block_data_order_avx(uint32_t state[5], + const uint8_t *data, size_t num); + +#define SHA1_ASM_SSSE3 +inline int sha1_ssse3_capable() { return CRYPTO_is_SSSE3_capable(); } +extern "C" void sha1_block_data_order_ssse3(uint32_t state[5], + const uint8_t *data, size_t num); + +#define SHA256_ASM_HW +inline int sha256_hw_capable() { + // Note that the original assembly did not check SSSE3. + return CRYPTO_is_x86_SHA_capable() && CRYPTO_is_SSSE3_capable(); +} + +#define SHA256_ASM_AVX +inline int sha256_avx_capable() { + // AMD CPUs have slow SHLD/SHRD. See also the discussion in sha1-586.pl. Zen + // added the SHA extension, so this is moot on newer AMD CPUs. 
+ return CRYPTO_is_AVX_capable() && CRYPTO_is_intel_cpu(); +} +extern "C" void sha256_block_data_order_avx(uint32_t state[8], + const uint8_t *data, size_t num); + +#define SHA256_ASM_SSSE3 +inline int sha256_ssse3_capable() { return CRYPTO_is_SSSE3_capable(); } +extern "C" void sha256_block_data_order_ssse3(uint32_t state[8], + const uint8_t *data, size_t num); + +#define SHA512_ASM_AVX +inline int sha512_avx_capable() { + // AMD CPUs have slow SHLD/SHRD. See also the discussion in sha1-586.pl. + // + // TODO(crbug.com/42290564): Fixing and enabling the AVX2 implementation would + // mitigate this on newer AMD CPUs. + return CRYPTO_is_AVX_capable() && CRYPTO_is_intel_cpu(); +} +extern "C" void sha512_block_data_order_avx(uint64_t state[8], + const uint8_t *data, size_t num); -#if defined(OPENSSL_PPC64LE) || \ - (!defined(OPENSSL_NO_ASM) && \ - (defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || \ - defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64))) -// POWER has an intrinsics-based implementation of SHA-1 and thus the functions -// normally defined in assembly are available even with |OPENSSL_NO_ASM| in -// this case. 
-#define SHA1_ASM -void sha1_block_data_order(uint32_t *state, const uint8_t *in, - size_t num_blocks); #endif -#if !defined(OPENSSL_NO_ASM) && \ - (defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || \ - defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) -#define SHA256_ASM -#define SHA512_ASM -void sha256_block_data_order(uint32_t *state, const uint8_t *in, - size_t num_blocks); -void sha512_block_data_order(uint64_t *state, const uint8_t *in, - size_t num_blocks); +#if defined(SHA1_ASM_HW) +extern "C" void sha1_block_data_order_hw(uint32_t state[5], const uint8_t *data, + size_t num); +#endif +#if defined(SHA1_ASM_NOHW) +extern "C" void sha1_block_data_order_nohw(uint32_t state[5], + const uint8_t *data, size_t num); #endif +#if defined(SHA256_ASM_HW) +extern "C" void sha256_block_data_order_hw(uint32_t state[8], + const uint8_t *data, size_t num); +#endif +#if defined(SHA256_ASM_NOHW) +extern "C" void sha256_block_data_order_nohw(uint32_t state[8], + const uint8_t *data, size_t num); +#endif + +#if defined(SHA512_ASM_HW) +extern "C" void sha512_block_data_order_hw(uint64_t state[8], + const uint8_t *data, size_t num); +#endif -#if defined(__cplusplus) -} // extern "C" +#if defined(SHA512_ASM_NOHW) +extern "C" void sha512_block_data_order_nohw(uint64_t state[8], + const uint8_t *data, size_t num); #endif -#endif // OPENSSL_HEADER_SHA_INTERNAL_H +BSSL_NAMESPACE_END + +#endif // OPENSSL_HEADER_CRYPTO_FIPSMODULE_SHA_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/fipsmodule/sha/sha1-altivec.c b/third_party/boringssl/src/crypto/fipsmodule/sha/sha1-altivec.c deleted file mode 100644 index 3152827a..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/sha/sha1-altivec.c +++ /dev/null @@ -1,361 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. 
- * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. 
If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -// Altivec-optimized SHA1 in C. This is tested on ppc64le only. -// -// References: -// https://software.intel.com/en-us/articles/improving-the-performance-of-the-secure-hash-algorithm-1 -// http://arctic.org/~dean/crypto/sha1.html -// -// This code used the generic SHA-1 from OpenSSL as a basis and AltiVec -// optimisations were added on top. 
- -#include - -#if defined(OPENSSL_PPC64LE) - -#include - -void sha1_block_data_order(uint32_t *state, const uint8_t *data, size_t num); - -static uint32_t rotate(uint32_t a, int n) { return (a << n) | (a >> (32 - n)); } - -typedef vector unsigned int vec_uint32_t; -typedef vector unsigned char vec_uint8_t; - -// Vector constants -static const vec_uint8_t k_swap_endianness = {3, 2, 1, 0, 7, 6, 5, 4, - 11, 10, 9, 8, 15, 14, 13, 12}; - -// Shift amounts for byte and bit shifts and rotations -static const vec_uint8_t k_4_bytes = {32, 32, 32, 32, 32, 32, 32, 32, - 32, 32, 32, 32, 32, 32, 32, 32}; -static const vec_uint8_t k_12_bytes = {96, 96, 96, 96, 96, 96, 96, 96, - 96, 96, 96, 96, 96, 96, 96, 96}; - -#define K_00_19 0x5a827999UL -#define K_20_39 0x6ed9eba1UL -#define K_40_59 0x8f1bbcdcUL -#define K_60_79 0xca62c1d6UL - -// Vector versions of the above. -static const vec_uint32_t K_00_19_x_4 = {K_00_19, K_00_19, K_00_19, K_00_19}; -static const vec_uint32_t K_20_39_x_4 = {K_20_39, K_20_39, K_20_39, K_20_39}; -static const vec_uint32_t K_40_59_x_4 = {K_40_59, K_40_59, K_40_59, K_40_59}; -static const vec_uint32_t K_60_79_x_4 = {K_60_79, K_60_79, K_60_79, K_60_79}; - -// vector message scheduling: compute message schedule for round i..i+3 where i -// is divisible by 4. We return the schedule w[i..i+3] as a vector. In -// addition, we also precompute sum w[i..+3] and an additive constant K. This -// is done to offload some computation of f() in the integer execution units. -// -// Byte shifting code below may not be correct for big-endian systems. 
-static vec_uint32_t sched_00_15(vec_uint32_t *pre_added, const void *data, - vec_uint32_t k) { - const vector unsigned char unaligned_data = - vec_vsx_ld(0, (const unsigned char*) data); - const vec_uint32_t v = (vec_uint32_t) unaligned_data; - const vec_uint32_t w = vec_perm(v, v, k_swap_endianness); - vec_st(w + k, 0, pre_added); - return w; -} - -// Compute w[i..i+3] using these steps for i in [16, 20, 24, 28] -// -// w'[i ] = (w[i-3] ^ w[i-8] ^ w[i-14] ^ w[i-16]) <<< 1 -// w'[i+1] = (w[i-2] ^ w[i-7] ^ w[i-13] ^ w[i-15]) <<< 1 -// w'[i+2] = (w[i-1] ^ w[i-6] ^ w[i-12] ^ w[i-14]) <<< 1 -// w'[i+3] = ( 0 ^ w[i-5] ^ w[i-11] ^ w[i-13]) <<< 1 -// -// w[ i] = w'[ i] -// w[i+1] = w'[i+1] -// w[i+2] = w'[i+2] -// w[i+3] = w'[i+3] ^ (w'[i] <<< 1) -static vec_uint32_t sched_16_31(vec_uint32_t *pre_added, vec_uint32_t minus_4, - vec_uint32_t minus_8, vec_uint32_t minus_12, - vec_uint32_t minus_16, vec_uint32_t k) { - const vec_uint32_t minus_3 = vec_sro(minus_4, k_4_bytes); - const vec_uint32_t minus_14 = vec_sld((minus_12), (minus_16), 8); - const vec_uint32_t k_1_bit = vec_splat_u32(1); - const vec_uint32_t w_prime = - vec_rl(minus_3 ^ minus_8 ^ minus_14 ^ minus_16, k_1_bit); - const vec_uint32_t w = - w_prime ^ vec_rl(vec_slo(w_prime, k_12_bytes), k_1_bit); - vec_st(w + k, 0, pre_added); - return w; -} - -// Compute w[i..i+3] using this relation for i in [32, 36, 40 ... 76] -// w[i] = (w[i-6] ^ w[i-16] ^ w[i-28] ^ w[i-32]), 2) <<< 2 -static vec_uint32_t sched_32_79(vec_uint32_t *pre_added, vec_uint32_t minus_4, - vec_uint32_t minus_8, vec_uint32_t minus_16, - vec_uint32_t minus_28, vec_uint32_t minus_32, - vec_uint32_t k) { - const vec_uint32_t minus_6 = vec_sld(minus_4, minus_8, 8); - const vec_uint32_t k_2_bits = vec_splat_u32(2); - const vec_uint32_t w = - vec_rl(minus_6 ^ minus_16 ^ minus_28 ^ minus_32, k_2_bits); - vec_st(w + k, 0, pre_added); - return w; -} - -// As pointed out by Wei Dai , F() below can be simplified -// to the code in F_00_19. 
Wei attributes these optimisations to Peter -// Gutmann's SHS code, and he attributes it to Rich Schroeppel. #define -// F(x,y,z) (((x) & (y)) | ((~(x)) & (z))) I've just become aware of another -// tweak to be made, again from Wei Dai, in F_40_59, (x&a)|(y&a) -> (x|y)&a -#define F_00_19(b, c, d) ((((c) ^ (d)) & (b)) ^ (d)) -#define F_20_39(b, c, d) ((b) ^ (c) ^ (d)) -#define F_40_59(b, c, d) (((b) & (c)) | (((b) | (c)) & (d))) -#define F_60_79(b, c, d) F_20_39(b, c, d) - -// We pre-added the K constants during message scheduling. -#define BODY_00_19(i, a, b, c, d, e, f) \ - do { \ - (f) = w[i] + (e) + rotate((a), 5) + F_00_19((b), (c), (d)); \ - (b) = rotate((b), 30); \ - } while (0) - -#define BODY_20_39(i, a, b, c, d, e, f) \ - do { \ - (f) = w[i] + (e) + rotate((a), 5) + F_20_39((b), (c), (d)); \ - (b) = rotate((b), 30); \ - } while (0) - -#define BODY_40_59(i, a, b, c, d, e, f) \ - do { \ - (f) = w[i] + (e) + rotate((a), 5) + F_40_59((b), (c), (d)); \ - (b) = rotate((b), 30); \ - } while (0) - -#define BODY_60_79(i, a, b, c, d, e, f) \ - do { \ - (f) = w[i] + (e) + rotate((a), 5) + F_60_79((b), (c), (d)); \ - (b) = rotate((b), 30); \ - } while (0) - -void sha1_block_data_order(uint32_t *state, const uint8_t *data, size_t num) { - uint32_t A, B, C, D, E, T; - - A = state[0]; - B = state[1]; - C = state[2]; - D = state[3]; - E = state[4]; - - for (;;) { - vec_uint32_t vw[20]; - const uint32_t *w = (const uint32_t *)&vw; - - vec_uint32_t k = K_00_19_x_4; - const vec_uint32_t w0 = sched_00_15(vw + 0, data + 0, k); - BODY_00_19(0, A, B, C, D, E, T); - BODY_00_19(1, T, A, B, C, D, E); - BODY_00_19(2, E, T, A, B, C, D); - BODY_00_19(3, D, E, T, A, B, C); - - const vec_uint32_t w4 = sched_00_15(vw + 1, data + 16, k); - BODY_00_19(4, C, D, E, T, A, B); - BODY_00_19(5, B, C, D, E, T, A); - BODY_00_19(6, A, B, C, D, E, T); - BODY_00_19(7, T, A, B, C, D, E); - - const vec_uint32_t w8 = sched_00_15(vw + 2, data + 32, k); - BODY_00_19(8, E, T, A, B, C, D); - BODY_00_19(9, 
D, E, T, A, B, C); - BODY_00_19(10, C, D, E, T, A, B); - BODY_00_19(11, B, C, D, E, T, A); - - const vec_uint32_t w12 = sched_00_15(vw + 3, data + 48, k); - BODY_00_19(12, A, B, C, D, E, T); - BODY_00_19(13, T, A, B, C, D, E); - BODY_00_19(14, E, T, A, B, C, D); - BODY_00_19(15, D, E, T, A, B, C); - - const vec_uint32_t w16 = sched_16_31(vw + 4, w12, w8, w4, w0, k); - BODY_00_19(16, C, D, E, T, A, B); - BODY_00_19(17, B, C, D, E, T, A); - BODY_00_19(18, A, B, C, D, E, T); - BODY_00_19(19, T, A, B, C, D, E); - - k = K_20_39_x_4; - const vec_uint32_t w20 = sched_16_31(vw + 5, w16, w12, w8, w4, k); - BODY_20_39(20, E, T, A, B, C, D); - BODY_20_39(21, D, E, T, A, B, C); - BODY_20_39(22, C, D, E, T, A, B); - BODY_20_39(23, B, C, D, E, T, A); - - const vec_uint32_t w24 = sched_16_31(vw + 6, w20, w16, w12, w8, k); - BODY_20_39(24, A, B, C, D, E, T); - BODY_20_39(25, T, A, B, C, D, E); - BODY_20_39(26, E, T, A, B, C, D); - BODY_20_39(27, D, E, T, A, B, C); - - const vec_uint32_t w28 = sched_16_31(vw + 7, w24, w20, w16, w12, k); - BODY_20_39(28, C, D, E, T, A, B); - BODY_20_39(29, B, C, D, E, T, A); - BODY_20_39(30, A, B, C, D, E, T); - BODY_20_39(31, T, A, B, C, D, E); - - const vec_uint32_t w32 = sched_32_79(vw + 8, w28, w24, w16, w4, w0, k); - BODY_20_39(32, E, T, A, B, C, D); - BODY_20_39(33, D, E, T, A, B, C); - BODY_20_39(34, C, D, E, T, A, B); - BODY_20_39(35, B, C, D, E, T, A); - - const vec_uint32_t w36 = sched_32_79(vw + 9, w32, w28, w20, w8, w4, k); - BODY_20_39(36, A, B, C, D, E, T); - BODY_20_39(37, T, A, B, C, D, E); - BODY_20_39(38, E, T, A, B, C, D); - BODY_20_39(39, D, E, T, A, B, C); - - k = K_40_59_x_4; - const vec_uint32_t w40 = sched_32_79(vw + 10, w36, w32, w24, w12, w8, k); - BODY_40_59(40, C, D, E, T, A, B); - BODY_40_59(41, B, C, D, E, T, A); - BODY_40_59(42, A, B, C, D, E, T); - BODY_40_59(43, T, A, B, C, D, E); - - const vec_uint32_t w44 = sched_32_79(vw + 11, w40, w36, w28, w16, w12, k); - BODY_40_59(44, E, T, A, B, C, D); - BODY_40_59(45, D, E, 
T, A, B, C); - BODY_40_59(46, C, D, E, T, A, B); - BODY_40_59(47, B, C, D, E, T, A); - - const vec_uint32_t w48 = sched_32_79(vw + 12, w44, w40, w32, w20, w16, k); - BODY_40_59(48, A, B, C, D, E, T); - BODY_40_59(49, T, A, B, C, D, E); - BODY_40_59(50, E, T, A, B, C, D); - BODY_40_59(51, D, E, T, A, B, C); - - const vec_uint32_t w52 = sched_32_79(vw + 13, w48, w44, w36, w24, w20, k); - BODY_40_59(52, C, D, E, T, A, B); - BODY_40_59(53, B, C, D, E, T, A); - BODY_40_59(54, A, B, C, D, E, T); - BODY_40_59(55, T, A, B, C, D, E); - - const vec_uint32_t w56 = sched_32_79(vw + 14, w52, w48, w40, w28, w24, k); - BODY_40_59(56, E, T, A, B, C, D); - BODY_40_59(57, D, E, T, A, B, C); - BODY_40_59(58, C, D, E, T, A, B); - BODY_40_59(59, B, C, D, E, T, A); - - k = K_60_79_x_4; - const vec_uint32_t w60 = sched_32_79(vw + 15, w56, w52, w44, w32, w28, k); - BODY_60_79(60, A, B, C, D, E, T); - BODY_60_79(61, T, A, B, C, D, E); - BODY_60_79(62, E, T, A, B, C, D); - BODY_60_79(63, D, E, T, A, B, C); - - const vec_uint32_t w64 = sched_32_79(vw + 16, w60, w56, w48, w36, w32, k); - BODY_60_79(64, C, D, E, T, A, B); - BODY_60_79(65, B, C, D, E, T, A); - BODY_60_79(66, A, B, C, D, E, T); - BODY_60_79(67, T, A, B, C, D, E); - - const vec_uint32_t w68 = sched_32_79(vw + 17, w64, w60, w52, w40, w36, k); - BODY_60_79(68, E, T, A, B, C, D); - BODY_60_79(69, D, E, T, A, B, C); - BODY_60_79(70, C, D, E, T, A, B); - BODY_60_79(71, B, C, D, E, T, A); - - const vec_uint32_t w72 = sched_32_79(vw + 18, w68, w64, w56, w44, w40, k); - BODY_60_79(72, A, B, C, D, E, T); - BODY_60_79(73, T, A, B, C, D, E); - BODY_60_79(74, E, T, A, B, C, D); - BODY_60_79(75, D, E, T, A, B, C); - - // We don't use the last value - (void)sched_32_79(vw + 19, w72, w68, w60, w48, w44, k); - BODY_60_79(76, C, D, E, T, A, B); - BODY_60_79(77, B, C, D, E, T, A); - BODY_60_79(78, A, B, C, D, E, T); - BODY_60_79(79, T, A, B, C, D, E); - - const uint32_t mask = 0xffffffffUL; - state[0] = (state[0] + E) & mask; - state[1] = 
(state[1] + T) & mask; - state[2] = (state[2] + A) & mask; - state[3] = (state[3] + B) & mask; - state[4] = (state[4] + C) & mask; - - data += 64; - if (--num == 0) { - break; - } - - A = state[0]; - B = state[1]; - C = state[2]; - D = state[3]; - E = state[4]; - } -} - -#endif // OPENSSL_PPC64LE - -#undef K_00_19 -#undef K_20_39 -#undef K_40_59 -#undef K_60_79 -#undef F_00_19 -#undef F_20_39 -#undef F_40_59 -#undef F_60_79 -#undef BODY_00_19 -#undef BODY_20_39 -#undef BODY_40_59 -#undef BODY_60_79 diff --git a/third_party/boringssl/src/crypto/fipsmodule/sha/sha1.c b/third_party/boringssl/src/crypto/fipsmodule/sha/sha1.c deleted file mode 100644 index f921e312..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/sha/sha1.c +++ /dev/null @@ -1,359 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. 
Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
*/ - -#include - -#include - -#include - -#include "../../internal.h" -#include "../digest/md32_common.h" -#include "../service_indicator/internal.h" -#include "internal.h" - - -int SHA1_Init(SHA_CTX *sha) { - OPENSSL_memset(sha, 0, sizeof(SHA_CTX)); - sha->h[0] = 0x67452301UL; - sha->h[1] = 0xefcdab89UL; - sha->h[2] = 0x98badcfeUL; - sha->h[3] = 0x10325476UL; - sha->h[4] = 0xc3d2e1f0UL; - return 1; -} - -uint8_t *SHA1(const uint8_t *data, size_t len, uint8_t out[SHA_DIGEST_LENGTH]) { - SHA_CTX ctx; - SHA1_Init(&ctx); - SHA1_Update(&ctx, data, len); - SHA1_Final(out, &ctx); - OPENSSL_cleanse(&ctx, sizeof(ctx)); - return out; -} - -#if !defined(SHA1_ASM) -static void sha1_block_data_order(uint32_t *state, const uint8_t *data, - size_t num); -#endif - -void SHA1_Transform(SHA_CTX *c, const uint8_t data[SHA_CBLOCK]) { - sha1_block_data_order(c->h, data, 1); -} - -int SHA1_Update(SHA_CTX *c, const void *data, size_t len) { - crypto_md32_update(&sha1_block_data_order, c->h, c->data, SHA_CBLOCK, &c->num, - &c->Nh, &c->Nl, data, len); - return 1; -} - -int SHA1_Final(uint8_t out[SHA_DIGEST_LENGTH], SHA_CTX *c) { - crypto_md32_final(&sha1_block_data_order, c->h, c->data, SHA_CBLOCK, &c->num, - c->Nh, c->Nl, /*is_big_endian=*/1); - - CRYPTO_store_u32_be(out, c->h[0]); - CRYPTO_store_u32_be(out + 4, c->h[1]); - CRYPTO_store_u32_be(out + 8, c->h[2]); - CRYPTO_store_u32_be(out + 12, c->h[3]); - CRYPTO_store_u32_be(out + 16, c->h[4]); - FIPS_service_indicator_update_state(); - return 1; -} - -#define Xupdate(a, ix, ia, ib, ic, id) \ - do { \ - (a) = ((ia) ^ (ib) ^ (ic) ^ (id)); \ - (ix) = (a) = CRYPTO_rotl_u32((a), 1); \ - } while (0) - -#define K_00_19 0x5a827999UL -#define K_20_39 0x6ed9eba1UL -#define K_40_59 0x8f1bbcdcUL -#define K_60_79 0xca62c1d6UL - -// As pointed out by Wei Dai , F() below can be simplified -// to the code in F_00_19. Wei attributes these optimisations to Peter -// Gutmann's SHS code, and he attributes it to Rich Schroeppel. 
#define -// F(x,y,z) (((x) & (y)) | ((~(x)) & (z))) I've just become aware of another -// tweak to be made, again from Wei Dai, in F_40_59, (x&a)|(y&a) -> (x|y)&a -#define F_00_19(b, c, d) ((((c) ^ (d)) & (b)) ^ (d)) -#define F_20_39(b, c, d) ((b) ^ (c) ^ (d)) -#define F_40_59(b, c, d) (((b) & (c)) | (((b) | (c)) & (d))) -#define F_60_79(b, c, d) F_20_39(b, c, d) - -#define BODY_00_15(i, a, b, c, d, e, f, xi) \ - do { \ - (f) = (xi) + (e) + K_00_19 + CRYPTO_rotl_u32((a), 5) + \ - F_00_19((b), (c), (d)); \ - (b) = CRYPTO_rotl_u32((b), 30); \ - } while (0) - -#define BODY_16_19(i, a, b, c, d, e, f, xi, xa, xb, xc, xd) \ - do { \ - Xupdate(f, xi, xa, xb, xc, xd); \ - (f) += (e) + K_00_19 + CRYPTO_rotl_u32((a), 5) + F_00_19((b), (c), (d)); \ - (b) = CRYPTO_rotl_u32((b), 30); \ - } while (0) - -#define BODY_20_31(i, a, b, c, d, e, f, xi, xa, xb, xc, xd) \ - do { \ - Xupdate(f, xi, xa, xb, xc, xd); \ - (f) += (e) + K_20_39 + CRYPTO_rotl_u32((a), 5) + F_20_39((b), (c), (d)); \ - (b) = CRYPTO_rotl_u32((b), 30); \ - } while (0) - -#define BODY_32_39(i, a, b, c, d, e, f, xa, xb, xc, xd) \ - do { \ - Xupdate(f, xa, xa, xb, xc, xd); \ - (f) += (e) + K_20_39 + CRYPTO_rotl_u32((a), 5) + F_20_39((b), (c), (d)); \ - (b) = CRYPTO_rotl_u32((b), 30); \ - } while (0) - -#define BODY_40_59(i, a, b, c, d, e, f, xa, xb, xc, xd) \ - do { \ - Xupdate(f, xa, xa, xb, xc, xd); \ - (f) += (e) + K_40_59 + CRYPTO_rotl_u32((a), 5) + F_40_59((b), (c), (d)); \ - (b) = CRYPTO_rotl_u32((b), 30); \ - } while (0) - -#define BODY_60_79(i, a, b, c, d, e, f, xa, xb, xc, xd) \ - do { \ - Xupdate(f, xa, xa, xb, xc, xd); \ - (f) = (xa) + (e) + K_60_79 + CRYPTO_rotl_u32((a), 5) + \ - F_60_79((b), (c), (d)); \ - (b) = CRYPTO_rotl_u32((b), 30); \ - } while (0) - -#ifdef X -#undef X -#endif - -/* Originally X was an array. As it's automatic it's natural -* to expect RISC compiler to accomodate at least part of it in -* the register bank, isn't it? 
Unfortunately not all compilers -* "find" this expectation reasonable:-( On order to make such -* compilers generate better code I replace X[] with a bunch of -* X0, X1, etc. See the function body below... -* */ -#define X(i) XX##i - -#if !defined(SHA1_ASM) -static void sha1_block_data_order(uint32_t *state, const uint8_t *data, - size_t num) { - register uint32_t A, B, C, D, E, T; - uint32_t XX0, XX1, XX2, XX3, XX4, XX5, XX6, XX7, XX8, XX9, XX10, - XX11, XX12, XX13, XX14, XX15; - - A = state[0]; - B = state[1]; - C = state[2]; - D = state[3]; - E = state[4]; - - for (;;) { - X(0) = CRYPTO_load_u32_be(data); - data += 4; - X(1) = CRYPTO_load_u32_be(data); - data += 4; - BODY_00_15(0, A, B, C, D, E, T, X(0)); - X(2) = CRYPTO_load_u32_be(data); - data += 4; - BODY_00_15(1, T, A, B, C, D, E, X(1)); - X(3) = CRYPTO_load_u32_be(data); - data += 4; - BODY_00_15(2, E, T, A, B, C, D, X(2)); - X(4) = CRYPTO_load_u32_be(data); - data += 4; - BODY_00_15(3, D, E, T, A, B, C, X(3)); - X(5) = CRYPTO_load_u32_be(data); - data += 4; - BODY_00_15(4, C, D, E, T, A, B, X(4)); - X(6) = CRYPTO_load_u32_be(data); - data += 4; - BODY_00_15(5, B, C, D, E, T, A, X(5)); - X(7) = CRYPTO_load_u32_be(data); - data += 4; - BODY_00_15(6, A, B, C, D, E, T, X(6)); - X(8) = CRYPTO_load_u32_be(data); - data += 4; - BODY_00_15(7, T, A, B, C, D, E, X(7)); - X(9) = CRYPTO_load_u32_be(data); - data += 4; - BODY_00_15(8, E, T, A, B, C, D, X(8)); - X(10) = CRYPTO_load_u32_be(data); - data += 4; - BODY_00_15(9, D, E, T, A, B, C, X(9)); - X(11) = CRYPTO_load_u32_be(data); - data += 4; - BODY_00_15(10, C, D, E, T, A, B, X(10)); - X(12) = CRYPTO_load_u32_be(data); - data += 4; - BODY_00_15(11, B, C, D, E, T, A, X(11)); - X(13) = CRYPTO_load_u32_be(data); - data += 4; - BODY_00_15(12, A, B, C, D, E, T, X(12)); - X(14) = CRYPTO_load_u32_be(data); - data += 4; - BODY_00_15(13, T, A, B, C, D, E, X(13)); - X(15) = CRYPTO_load_u32_be(data); - data += 4; - BODY_00_15(14, E, T, A, B, C, D, X(14)); - BODY_00_15(15, D, 
E, T, A, B, C, X(15)); - - BODY_16_19(16, C, D, E, T, A, B, X(0), X(0), X(2), X(8), X(13)); - BODY_16_19(17, B, C, D, E, T, A, X(1), X(1), X(3), X(9), X(14)); - BODY_16_19(18, A, B, C, D, E, T, X(2), X(2), X(4), X(10), X(15)); - BODY_16_19(19, T, A, B, C, D, E, X(3), X(3), X(5), X(11), X(0)); - - BODY_20_31(20, E, T, A, B, C, D, X(4), X(4), X(6), X(12), X(1)); - BODY_20_31(21, D, E, T, A, B, C, X(5), X(5), X(7), X(13), X(2)); - BODY_20_31(22, C, D, E, T, A, B, X(6), X(6), X(8), X(14), X(3)); - BODY_20_31(23, B, C, D, E, T, A, X(7), X(7), X(9), X(15), X(4)); - BODY_20_31(24, A, B, C, D, E, T, X(8), X(8), X(10), X(0), X(5)); - BODY_20_31(25, T, A, B, C, D, E, X(9), X(9), X(11), X(1), X(6)); - BODY_20_31(26, E, T, A, B, C, D, X(10), X(10), X(12), X(2), X(7)); - BODY_20_31(27, D, E, T, A, B, C, X(11), X(11), X(13), X(3), X(8)); - BODY_20_31(28, C, D, E, T, A, B, X(12), X(12), X(14), X(4), X(9)); - BODY_20_31(29, B, C, D, E, T, A, X(13), X(13), X(15), X(5), X(10)); - BODY_20_31(30, A, B, C, D, E, T, X(14), X(14), X(0), X(6), X(11)); - BODY_20_31(31, T, A, B, C, D, E, X(15), X(15), X(1), X(7), X(12)); - - BODY_32_39(32, E, T, A, B, C, D, X(0), X(2), X(8), X(13)); - BODY_32_39(33, D, E, T, A, B, C, X(1), X(3), X(9), X(14)); - BODY_32_39(34, C, D, E, T, A, B, X(2), X(4), X(10), X(15)); - BODY_32_39(35, B, C, D, E, T, A, X(3), X(5), X(11), X(0)); - BODY_32_39(36, A, B, C, D, E, T, X(4), X(6), X(12), X(1)); - BODY_32_39(37, T, A, B, C, D, E, X(5), X(7), X(13), X(2)); - BODY_32_39(38, E, T, A, B, C, D, X(6), X(8), X(14), X(3)); - BODY_32_39(39, D, E, T, A, B, C, X(7), X(9), X(15), X(4)); - - BODY_40_59(40, C, D, E, T, A, B, X(8), X(10), X(0), X(5)); - BODY_40_59(41, B, C, D, E, T, A, X(9), X(11), X(1), X(6)); - BODY_40_59(42, A, B, C, D, E, T, X(10), X(12), X(2), X(7)); - BODY_40_59(43, T, A, B, C, D, E, X(11), X(13), X(3), X(8)); - BODY_40_59(44, E, T, A, B, C, D, X(12), X(14), X(4), X(9)); - BODY_40_59(45, D, E, T, A, B, C, X(13), X(15), X(5), X(10)); - BODY_40_59(46, C, D, 
E, T, A, B, X(14), X(0), X(6), X(11)); - BODY_40_59(47, B, C, D, E, T, A, X(15), X(1), X(7), X(12)); - BODY_40_59(48, A, B, C, D, E, T, X(0), X(2), X(8), X(13)); - BODY_40_59(49, T, A, B, C, D, E, X(1), X(3), X(9), X(14)); - BODY_40_59(50, E, T, A, B, C, D, X(2), X(4), X(10), X(15)); - BODY_40_59(51, D, E, T, A, B, C, X(3), X(5), X(11), X(0)); - BODY_40_59(52, C, D, E, T, A, B, X(4), X(6), X(12), X(1)); - BODY_40_59(53, B, C, D, E, T, A, X(5), X(7), X(13), X(2)); - BODY_40_59(54, A, B, C, D, E, T, X(6), X(8), X(14), X(3)); - BODY_40_59(55, T, A, B, C, D, E, X(7), X(9), X(15), X(4)); - BODY_40_59(56, E, T, A, B, C, D, X(8), X(10), X(0), X(5)); - BODY_40_59(57, D, E, T, A, B, C, X(9), X(11), X(1), X(6)); - BODY_40_59(58, C, D, E, T, A, B, X(10), X(12), X(2), X(7)); - BODY_40_59(59, B, C, D, E, T, A, X(11), X(13), X(3), X(8)); - - BODY_60_79(60, A, B, C, D, E, T, X(12), X(14), X(4), X(9)); - BODY_60_79(61, T, A, B, C, D, E, X(13), X(15), X(5), X(10)); - BODY_60_79(62, E, T, A, B, C, D, X(14), X(0), X(6), X(11)); - BODY_60_79(63, D, E, T, A, B, C, X(15), X(1), X(7), X(12)); - BODY_60_79(64, C, D, E, T, A, B, X(0), X(2), X(8), X(13)); - BODY_60_79(65, B, C, D, E, T, A, X(1), X(3), X(9), X(14)); - BODY_60_79(66, A, B, C, D, E, T, X(2), X(4), X(10), X(15)); - BODY_60_79(67, T, A, B, C, D, E, X(3), X(5), X(11), X(0)); - BODY_60_79(68, E, T, A, B, C, D, X(4), X(6), X(12), X(1)); - BODY_60_79(69, D, E, T, A, B, C, X(5), X(7), X(13), X(2)); - BODY_60_79(70, C, D, E, T, A, B, X(6), X(8), X(14), X(3)); - BODY_60_79(71, B, C, D, E, T, A, X(7), X(9), X(15), X(4)); - BODY_60_79(72, A, B, C, D, E, T, X(8), X(10), X(0), X(5)); - BODY_60_79(73, T, A, B, C, D, E, X(9), X(11), X(1), X(6)); - BODY_60_79(74, E, T, A, B, C, D, X(10), X(12), X(2), X(7)); - BODY_60_79(75, D, E, T, A, B, C, X(11), X(13), X(3), X(8)); - BODY_60_79(76, C, D, E, T, A, B, X(12), X(14), X(4), X(9)); - BODY_60_79(77, B, C, D, E, T, A, X(13), X(15), X(5), X(10)); - BODY_60_79(78, A, B, C, D, E, T, X(14), X(0), 
X(6), X(11)); - BODY_60_79(79, T, A, B, C, D, E, X(15), X(1), X(7), X(12)); - - state[0] = (state[0] + E) & 0xffffffffL; - state[1] = (state[1] + T) & 0xffffffffL; - state[2] = (state[2] + A) & 0xffffffffL; - state[3] = (state[3] + B) & 0xffffffffL; - state[4] = (state[4] + C) & 0xffffffffL; - - if (--num == 0) { - break; - } - - A = state[0]; - B = state[1]; - C = state[2]; - D = state[3]; - E = state[4]; - } -} -#endif - -#undef Xupdate -#undef K_00_19 -#undef K_20_39 -#undef K_40_59 -#undef K_60_79 -#undef F_00_19 -#undef F_20_39 -#undef F_40_59 -#undef F_60_79 -#undef BODY_00_15 -#undef BODY_16_19 -#undef BODY_20_31 -#undef BODY_32_39 -#undef BODY_40_59 -#undef BODY_60_79 -#undef X diff --git a/third_party/boringssl/src/crypto/fipsmodule/sha/sha1.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/sha/sha1.cc.inc new file mode 100644 index 00000000..ce962d35 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/sha/sha1.cc.inc @@ -0,0 +1,404 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include + +#include "../../internal.h" +#include "../bcm_interface.h" +#include "../digest/md32_common.h" +#include "../service_indicator/internal.h" +#include "internal.h" + + +using namespace bssl; + +bcm_infallible bssl::BCM_sha1_init(SHA_CTX *sha) { + OPENSSL_memset(sha, 0, sizeof(SHA_CTX)); + sha->h[0] = 0x67452301UL; + sha->h[1] = 0xefcdab89UL; + sha->h[2] = 0x98badcfeUL; + sha->h[3] = 0x10325476UL; + sha->h[4] = 0xc3d2e1f0UL; + return bcm_infallible::approved; +} + +#if !defined(SHA1_ASM) +static void sha1_block_data_order(uint32_t state[5], const uint8_t *data, + size_t num); +#endif + +bcm_infallible bssl::BCM_sha1_transform(SHA_CTX *c, + const uint8_t data[SHA_CBLOCK]) { + sha1_block_data_order(c->h, data, 1); + return bcm_infallible::approved; +} + +namespace { +struct SHA1Traits { + using HashContext = SHA_CTX; + static constexpr size_t kBlockSize = SHA_CBLOCK; + static constexpr bool kLengthIsBigEndian = true; + static void HashBlocks(uint32_t *state, const uint8_t *data, + size_t num_blocks) { + sha1_block_data_order(state, data, num_blocks); + } +}; +} // namespace + +bcm_infallible bssl::BCM_sha1_update(SHA_CTX *c, const void *data, size_t len) { + crypto_md32_update(c, + Span(static_cast(data), len)); + return bcm_infallible::approved; +} + +static void sha1_output_state(uint8_t out[SHA_DIGEST_LENGTH], + const SHA_CTX *ctx) { + CRYPTO_store_u32_be(out, ctx->h[0]); + CRYPTO_store_u32_be(out + 4, ctx->h[1]); + CRYPTO_store_u32_be(out + 8, ctx->h[2]); + CRYPTO_store_u32_be(out + 12, ctx->h[3]); + CRYPTO_store_u32_be(out + 16, ctx->h[4]); +} + +bcm_infallible bssl::BCM_sha1_final(uint8_t out[SHA_DIGEST_LENGTH], + SHA_CTX *c) { + crypto_md32_final(c); + sha1_output_state(out, c); + FIPS_service_indicator_update_state(); + return bcm_infallible::approved; +} + +bcm_infallible bssl::BCM_fips_186_2_prf(uint8_t *out, size_t out_len, + const uint8_t xkey[SHA_DIGEST_LENGTH]) { + // XKEY and XVAL are 160-bit values, but are internally 
right-padded up to + // block size. See FIPS 186-2, Appendix 3.3. This buffer maintains both the + // current value of XKEY and the padding. + uint8_t block[SHA_CBLOCK] = {0}; + OPENSSL_memcpy(block, xkey, SHA_DIGEST_LENGTH); + + while (out_len != 0) { + // We always use a zero XSEED, so we can merge the inner and outer loops. + // XVAL is also always equal to XKEY. + SHA_CTX ctx; + BCM_sha1_init(&ctx); + BCM_sha1_transform(&ctx, block); + + // XKEY = (1 + XKEY + w_i) mod 2^b + uint32_t carry = 1; + for (int i = 4; i >= 0; i--) { + uint32_t tmp = CRYPTO_load_u32_be(block + i * 4); + tmp = CRYPTO_addc_u32(tmp, ctx.h[i], carry, &carry); + CRYPTO_store_u32_be(block + i * 4, tmp); + } + + // Output w_i. + if (out_len < SHA_DIGEST_LENGTH) { + uint8_t buf[SHA_DIGEST_LENGTH]; + sha1_output_state(buf, &ctx); + OPENSSL_memcpy(out, buf, out_len); + break; + } + sha1_output_state(out, &ctx); + out += SHA_DIGEST_LENGTH; + out_len -= SHA_DIGEST_LENGTH; + } + return bcm_infallible::not_approved; +} + +#define Xupdate(a, ix, ia, ib, ic, id) \ + do { \ + (a) = ((ia) ^ (ib) ^ (ic) ^ (id)); \ + (ix) = (a) = CRYPTO_rotl_u32((a), 1); \ + } while (0) + +#define K_00_19 0x5a827999UL +#define K_20_39 0x6ed9eba1UL +#define K_40_59 0x8f1bbcdcUL +#define K_60_79 0xca62c1d6UL + +// As pointed out by Wei Dai , F() below can be simplified +// to the code in F_00_19. Wei attributes these optimisations to Peter +// Gutmann's SHS code, and he attributes it to Rich Schroeppel. 
#define +// F(x,y,z) (((x) & (y)) | ((~(x)) & (z))) I've just become aware of another +// tweak to be made, again from Wei Dai, in F_40_59, (x&a)|(y&a) -> (x|y)&a +#define F_00_19(b, c, d) ((((c) ^ (d)) & (b)) ^ (d)) +#define F_20_39(b, c, d) ((b) ^ (c) ^ (d)) +#define F_40_59(b, c, d) (((b) & (c)) | (((b) | (c)) & (d))) +#define F_60_79(b, c, d) F_20_39(b, c, d) + +#define BODY_00_15(i, a, b, c, d, e, f, xi) \ + do { \ + (f) = (xi) + (e) + K_00_19 + CRYPTO_rotl_u32((a), 5) + \ + F_00_19((b), (c), (d)); \ + (b) = CRYPTO_rotl_u32((b), 30); \ + } while (0) + +#define BODY_16_19(i, a, b, c, d, e, f, xi, xa, xb, xc, xd) \ + do { \ + Xupdate(f, xi, xa, xb, xc, xd); \ + (f) += (e) + K_00_19 + CRYPTO_rotl_u32((a), 5) + F_00_19((b), (c), (d)); \ + (b) = CRYPTO_rotl_u32((b), 30); \ + } while (0) + +#define BODY_20_31(i, a, b, c, d, e, f, xi, xa, xb, xc, xd) \ + do { \ + Xupdate(f, xi, xa, xb, xc, xd); \ + (f) += (e) + K_20_39 + CRYPTO_rotl_u32((a), 5) + F_20_39((b), (c), (d)); \ + (b) = CRYPTO_rotl_u32((b), 30); \ + } while (0) + +#define BODY_32_39(i, a, b, c, d, e, f, xa, xb, xc, xd) \ + do { \ + Xupdate(f, xa, xa, xb, xc, xd); \ + (f) += (e) + K_20_39 + CRYPTO_rotl_u32((a), 5) + F_20_39((b), (c), (d)); \ + (b) = CRYPTO_rotl_u32((b), 30); \ + } while (0) + +#define BODY_40_59(i, a, b, c, d, e, f, xa, xb, xc, xd) \ + do { \ + Xupdate(f, xa, xa, xb, xc, xd); \ + (f) += (e) + K_40_59 + CRYPTO_rotl_u32((a), 5) + F_40_59((b), (c), (d)); \ + (b) = CRYPTO_rotl_u32((b), 30); \ + } while (0) + +#define BODY_60_79(i, a, b, c, d, e, f, xa, xb, xc, xd) \ + do { \ + Xupdate(f, xa, xa, xb, xc, xd); \ + (f) = (xa) + (e) + K_60_79 + CRYPTO_rotl_u32((a), 5) + \ + F_60_79((b), (c), (d)); \ + (b) = CRYPTO_rotl_u32((b), 30); \ + } while (0) + +#ifdef X +#undef X +#endif + +/* Originally X was an array. As it's automatic it's natural + * to expect RISC compiler to accommodate at least part of it in + * the register bank, isn't it? 
Unfortunately not all compilers + * "find" this expectation reasonable:-( On order to make such + * compilers generate better code I replace X[] with a bunch of + * X0, X1, etc. See the function body below... + * */ +#define X(i) XX##i + +#if !defined(SHA1_ASM) + +#if !defined(SHA1_ASM_NOHW) +static void sha1_block_data_order_nohw(uint32_t state[5], const uint8_t *data, + size_t num) { + uint32_t A, B, C, D, E, T; + uint32_t XX0, XX1, XX2, XX3, XX4, XX5, XX6, XX7, XX8, XX9, XX10, XX11, XX12, + XX13, XX14, XX15; + + A = state[0]; + B = state[1]; + C = state[2]; + D = state[3]; + E = state[4]; + + for (;;) { + X(0) = CRYPTO_load_u32_be(data); + data += 4; + X(1) = CRYPTO_load_u32_be(data); + data += 4; + BODY_00_15(0, A, B, C, D, E, T, X(0)); + X(2) = CRYPTO_load_u32_be(data); + data += 4; + BODY_00_15(1, T, A, B, C, D, E, X(1)); + X(3) = CRYPTO_load_u32_be(data); + data += 4; + BODY_00_15(2, E, T, A, B, C, D, X(2)); + X(4) = CRYPTO_load_u32_be(data); + data += 4; + BODY_00_15(3, D, E, T, A, B, C, X(3)); + X(5) = CRYPTO_load_u32_be(data); + data += 4; + BODY_00_15(4, C, D, E, T, A, B, X(4)); + X(6) = CRYPTO_load_u32_be(data); + data += 4; + BODY_00_15(5, B, C, D, E, T, A, X(5)); + X(7) = CRYPTO_load_u32_be(data); + data += 4; + BODY_00_15(6, A, B, C, D, E, T, X(6)); + X(8) = CRYPTO_load_u32_be(data); + data += 4; + BODY_00_15(7, T, A, B, C, D, E, X(7)); + X(9) = CRYPTO_load_u32_be(data); + data += 4; + BODY_00_15(8, E, T, A, B, C, D, X(8)); + X(10) = CRYPTO_load_u32_be(data); + data += 4; + BODY_00_15(9, D, E, T, A, B, C, X(9)); + X(11) = CRYPTO_load_u32_be(data); + data += 4; + BODY_00_15(10, C, D, E, T, A, B, X(10)); + X(12) = CRYPTO_load_u32_be(data); + data += 4; + BODY_00_15(11, B, C, D, E, T, A, X(11)); + X(13) = CRYPTO_load_u32_be(data); + data += 4; + BODY_00_15(12, A, B, C, D, E, T, X(12)); + X(14) = CRYPTO_load_u32_be(data); + data += 4; + BODY_00_15(13, T, A, B, C, D, E, X(13)); + X(15) = CRYPTO_load_u32_be(data); + data += 4; + BODY_00_15(14, E, T, A, B, 
C, D, X(14)); + BODY_00_15(15, D, E, T, A, B, C, X(15)); + + BODY_16_19(16, C, D, E, T, A, B, X(0), X(0), X(2), X(8), X(13)); + BODY_16_19(17, B, C, D, E, T, A, X(1), X(1), X(3), X(9), X(14)); + BODY_16_19(18, A, B, C, D, E, T, X(2), X(2), X(4), X(10), X(15)); + BODY_16_19(19, T, A, B, C, D, E, X(3), X(3), X(5), X(11), X(0)); + + BODY_20_31(20, E, T, A, B, C, D, X(4), X(4), X(6), X(12), X(1)); + BODY_20_31(21, D, E, T, A, B, C, X(5), X(5), X(7), X(13), X(2)); + BODY_20_31(22, C, D, E, T, A, B, X(6), X(6), X(8), X(14), X(3)); + BODY_20_31(23, B, C, D, E, T, A, X(7), X(7), X(9), X(15), X(4)); + BODY_20_31(24, A, B, C, D, E, T, X(8), X(8), X(10), X(0), X(5)); + BODY_20_31(25, T, A, B, C, D, E, X(9), X(9), X(11), X(1), X(6)); + BODY_20_31(26, E, T, A, B, C, D, X(10), X(10), X(12), X(2), X(7)); + BODY_20_31(27, D, E, T, A, B, C, X(11), X(11), X(13), X(3), X(8)); + BODY_20_31(28, C, D, E, T, A, B, X(12), X(12), X(14), X(4), X(9)); + BODY_20_31(29, B, C, D, E, T, A, X(13), X(13), X(15), X(5), X(10)); + BODY_20_31(30, A, B, C, D, E, T, X(14), X(14), X(0), X(6), X(11)); + BODY_20_31(31, T, A, B, C, D, E, X(15), X(15), X(1), X(7), X(12)); + + BODY_32_39(32, E, T, A, B, C, D, X(0), X(2), X(8), X(13)); + BODY_32_39(33, D, E, T, A, B, C, X(1), X(3), X(9), X(14)); + BODY_32_39(34, C, D, E, T, A, B, X(2), X(4), X(10), X(15)); + BODY_32_39(35, B, C, D, E, T, A, X(3), X(5), X(11), X(0)); + BODY_32_39(36, A, B, C, D, E, T, X(4), X(6), X(12), X(1)); + BODY_32_39(37, T, A, B, C, D, E, X(5), X(7), X(13), X(2)); + BODY_32_39(38, E, T, A, B, C, D, X(6), X(8), X(14), X(3)); + BODY_32_39(39, D, E, T, A, B, C, X(7), X(9), X(15), X(4)); + + BODY_40_59(40, C, D, E, T, A, B, X(8), X(10), X(0), X(5)); + BODY_40_59(41, B, C, D, E, T, A, X(9), X(11), X(1), X(6)); + BODY_40_59(42, A, B, C, D, E, T, X(10), X(12), X(2), X(7)); + BODY_40_59(43, T, A, B, C, D, E, X(11), X(13), X(3), X(8)); + BODY_40_59(44, E, T, A, B, C, D, X(12), X(14), X(4), X(9)); + BODY_40_59(45, D, E, T, A, B, C, X(13), X(15), 
X(5), X(10)); + BODY_40_59(46, C, D, E, T, A, B, X(14), X(0), X(6), X(11)); + BODY_40_59(47, B, C, D, E, T, A, X(15), X(1), X(7), X(12)); + BODY_40_59(48, A, B, C, D, E, T, X(0), X(2), X(8), X(13)); + BODY_40_59(49, T, A, B, C, D, E, X(1), X(3), X(9), X(14)); + BODY_40_59(50, E, T, A, B, C, D, X(2), X(4), X(10), X(15)); + BODY_40_59(51, D, E, T, A, B, C, X(3), X(5), X(11), X(0)); + BODY_40_59(52, C, D, E, T, A, B, X(4), X(6), X(12), X(1)); + BODY_40_59(53, B, C, D, E, T, A, X(5), X(7), X(13), X(2)); + BODY_40_59(54, A, B, C, D, E, T, X(6), X(8), X(14), X(3)); + BODY_40_59(55, T, A, B, C, D, E, X(7), X(9), X(15), X(4)); + BODY_40_59(56, E, T, A, B, C, D, X(8), X(10), X(0), X(5)); + BODY_40_59(57, D, E, T, A, B, C, X(9), X(11), X(1), X(6)); + BODY_40_59(58, C, D, E, T, A, B, X(10), X(12), X(2), X(7)); + BODY_40_59(59, B, C, D, E, T, A, X(11), X(13), X(3), X(8)); + + BODY_60_79(60, A, B, C, D, E, T, X(12), X(14), X(4), X(9)); + BODY_60_79(61, T, A, B, C, D, E, X(13), X(15), X(5), X(10)); + BODY_60_79(62, E, T, A, B, C, D, X(14), X(0), X(6), X(11)); + BODY_60_79(63, D, E, T, A, B, C, X(15), X(1), X(7), X(12)); + BODY_60_79(64, C, D, E, T, A, B, X(0), X(2), X(8), X(13)); + BODY_60_79(65, B, C, D, E, T, A, X(1), X(3), X(9), X(14)); + BODY_60_79(66, A, B, C, D, E, T, X(2), X(4), X(10), X(15)); + BODY_60_79(67, T, A, B, C, D, E, X(3), X(5), X(11), X(0)); + BODY_60_79(68, E, T, A, B, C, D, X(4), X(6), X(12), X(1)); + BODY_60_79(69, D, E, T, A, B, C, X(5), X(7), X(13), X(2)); + BODY_60_79(70, C, D, E, T, A, B, X(6), X(8), X(14), X(3)); + BODY_60_79(71, B, C, D, E, T, A, X(7), X(9), X(15), X(4)); + BODY_60_79(72, A, B, C, D, E, T, X(8), X(10), X(0), X(5)); + BODY_60_79(73, T, A, B, C, D, E, X(9), X(11), X(1), X(6)); + BODY_60_79(74, E, T, A, B, C, D, X(10), X(12), X(2), X(7)); + BODY_60_79(75, D, E, T, A, B, C, X(11), X(13), X(3), X(8)); + BODY_60_79(76, C, D, E, T, A, B, X(12), X(14), X(4), X(9)); + BODY_60_79(77, B, C, D, E, T, A, X(13), X(15), X(5), X(10)); + 
BODY_60_79(78, A, B, C, D, E, T, X(14), X(0), X(6), X(11)); + BODY_60_79(79, T, A, B, C, D, E, X(15), X(1), X(7), X(12)); + + state[0] = (state[0] + E) & 0xffffffffL; + state[1] = (state[1] + T) & 0xffffffffL; + state[2] = (state[2] + A) & 0xffffffffL; + state[3] = (state[3] + B) & 0xffffffffL; + state[4] = (state[4] + C) & 0xffffffffL; + + if (--num == 0) { + break; + } + + A = state[0]; + B = state[1]; + C = state[2]; + D = state[3]; + E = state[4]; + } +} +#endif // !SHA1_ASM_NOHW + +static void sha1_block_data_order(uint32_t state[5], const uint8_t *data, + size_t num) { +#if defined(SHA1_ASM_HW) + if (sha1_hw_capable()) { + sha1_block_data_order_hw(state, data, num); + return; + } +#endif +#if defined(SHA1_ASM_AVX2) + if (sha1_avx2_capable()) { + sha1_block_data_order_avx2(state, data, num); + return; + } +#endif +#if defined(SHA1_ASM_AVX) + if (sha1_avx_capable()) { + sha1_block_data_order_avx(state, data, num); + return; + } +#endif +#if defined(SHA1_ASM_SSSE3) + if (sha1_ssse3_capable()) { + sha1_block_data_order_ssse3(state, data, num); + return; + } +#endif +#if defined(SHA1_ASM_NEON) + if (CRYPTO_is_NEON_capable()) { + sha1_block_data_order_neon(state, data, num); + return; + } +#endif + sha1_block_data_order_nohw(state, data, num); +} + +#endif // !SHA1_ASM + +#undef Xupdate +#undef K_00_19 +#undef K_20_39 +#undef K_40_59 +#undef K_60_79 +#undef F_00_19 +#undef F_20_39 +#undef F_40_59 +#undef F_60_79 +#undef BODY_00_15 +#undef BODY_16_19 +#undef BODY_20_31 +#undef BODY_32_39 +#undef BODY_40_59 +#undef BODY_60_79 +#undef X diff --git a/third_party/boringssl/src/crypto/fipsmodule/sha/sha256.c b/third_party/boringssl/src/crypto/fipsmodule/sha/sha256.c deleted file mode 100644 index 454b9479..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/sha/sha256.c +++ /dev/null @@ -1,325 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. 
- * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. 
If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
*/ - -#include - -#include - -#include - -#include "../../internal.h" -#include "../digest/md32_common.h" -#include "../service_indicator/internal.h" -#include "internal.h" - - -int SHA224_Init(SHA256_CTX *sha) { - OPENSSL_memset(sha, 0, sizeof(SHA256_CTX)); - sha->h[0] = 0xc1059ed8UL; - sha->h[1] = 0x367cd507UL; - sha->h[2] = 0x3070dd17UL; - sha->h[3] = 0xf70e5939UL; - sha->h[4] = 0xffc00b31UL; - sha->h[5] = 0x68581511UL; - sha->h[6] = 0x64f98fa7UL; - sha->h[7] = 0xbefa4fa4UL; - sha->md_len = SHA224_DIGEST_LENGTH; - return 1; -} - -int SHA256_Init(SHA256_CTX *sha) { - OPENSSL_memset(sha, 0, sizeof(SHA256_CTX)); - sha->h[0] = 0x6a09e667UL; - sha->h[1] = 0xbb67ae85UL; - sha->h[2] = 0x3c6ef372UL; - sha->h[3] = 0xa54ff53aUL; - sha->h[4] = 0x510e527fUL; - sha->h[5] = 0x9b05688cUL; - sha->h[6] = 0x1f83d9abUL; - sha->h[7] = 0x5be0cd19UL; - sha->md_len = SHA256_DIGEST_LENGTH; - return 1; -} - -uint8_t *SHA224(const uint8_t *data, size_t len, - uint8_t out[SHA224_DIGEST_LENGTH]) { - SHA256_CTX ctx; - SHA224_Init(&ctx); - SHA224_Update(&ctx, data, len); - SHA224_Final(out, &ctx); - OPENSSL_cleanse(&ctx, sizeof(ctx)); - return out; -} - -uint8_t *SHA256(const uint8_t *data, size_t len, - uint8_t out[SHA256_DIGEST_LENGTH]) { - SHA256_CTX ctx; - SHA256_Init(&ctx); - SHA256_Update(&ctx, data, len); - SHA256_Final(out, &ctx); - OPENSSL_cleanse(&ctx, sizeof(ctx)); - return out; -} - -#ifndef SHA256_ASM -static void sha256_block_data_order(uint32_t *state, const uint8_t *in, - size_t num); -#endif - -void SHA256_Transform(SHA256_CTX *c, const uint8_t data[SHA256_CBLOCK]) { - sha256_block_data_order(c->h, data, 1); -} - -int SHA256_Update(SHA256_CTX *c, const void *data, size_t len) { - crypto_md32_update(&sha256_block_data_order, c->h, c->data, SHA256_CBLOCK, - &c->num, &c->Nh, &c->Nl, data, len); - return 1; -} - -int SHA224_Update(SHA256_CTX *ctx, const void *data, size_t len) { - return SHA256_Update(ctx, data, len); -} - -static int sha256_final_impl(uint8_t *out, SHA256_CTX 
*c) { - crypto_md32_final(&sha256_block_data_order, c->h, c->data, SHA256_CBLOCK, - &c->num, c->Nh, c->Nl, /*is_big_endian=*/1); - - // TODO(davidben): This overflow check one of the few places a low-level hash - // 'final' function can fail. SHA-512 does not have a corresponding check. - // These functions already misbehave if the caller arbitrarily mutates |c|, so - // can we assume one of |SHA256_Init| or |SHA224_Init| was used? - if (c->md_len > SHA256_DIGEST_LENGTH) { - return 0; - } - - assert(c->md_len % 4 == 0); - const size_t out_words = c->md_len / 4; - for (size_t i = 0; i < out_words; i++) { - CRYPTO_store_u32_be(out, c->h[i]); - out += 4; - } - - FIPS_service_indicator_update_state(); - return 1; -} - -int SHA256_Final(uint8_t out[SHA256_DIGEST_LENGTH], SHA256_CTX *c) { - // Ideally we would assert |sha->md_len| is |SHA256_DIGEST_LENGTH| to match - // the size hint, but calling code often pairs |SHA224_Init| with - // |SHA256_Final| and expects |sha->md_len| to carry the size over. - // - // TODO(davidben): Add an assert and fix code to match them up. - return sha256_final_impl(out, c); -} - -int SHA224_Final(uint8_t out[SHA224_DIGEST_LENGTH], SHA256_CTX *ctx) { - // SHA224_Init sets |ctx->md_len| to |SHA224_DIGEST_LENGTH|, so this has a - // smaller output. 
- assert(ctx->md_len == SHA224_DIGEST_LENGTH); - return sha256_final_impl(out, ctx); -} - -#ifndef SHA256_ASM -static const uint32_t K256[64] = { - 0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL, 0x3956c25bUL, - 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL, 0xd807aa98UL, 0x12835b01UL, - 0x243185beUL, 0x550c7dc3UL, 0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, - 0xc19bf174UL, 0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL, - 0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL, 0x983e5152UL, - 0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL, 0xc6e00bf3UL, 0xd5a79147UL, - 0x06ca6351UL, 0x14292967UL, 0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL, - 0x53380d13UL, 0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL, - 0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL, 0xd192e819UL, - 0xd6990624UL, 0xf40e3585UL, 0x106aa070UL, 0x19a4c116UL, 0x1e376c08UL, - 0x2748774cUL, 0x34b0bcb5UL, 0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL, - 0x682e6ff3UL, 0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL, - 0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL}; - -// See FIPS 180-4, section 4.1.2. 
-#define Sigma0(x) \ - (CRYPTO_rotr_u32((x), 2) ^ CRYPTO_rotr_u32((x), 13) ^ \ - CRYPTO_rotr_u32((x), 22)) -#define Sigma1(x) \ - (CRYPTO_rotr_u32((x), 6) ^ CRYPTO_rotr_u32((x), 11) ^ \ - CRYPTO_rotr_u32((x), 25)) -#define sigma0(x) \ - (CRYPTO_rotr_u32((x), 7) ^ CRYPTO_rotr_u32((x), 18) ^ ((x) >> 3)) -#define sigma1(x) \ - (CRYPTO_rotr_u32((x), 17) ^ CRYPTO_rotr_u32((x), 19) ^ ((x) >> 10)) - -#define Ch(x, y, z) (((x) & (y)) ^ ((~(x)) & (z))) -#define Maj(x, y, z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) - -#define ROUND_00_15(i, a, b, c, d, e, f, g, h) \ - do { \ - T1 += h + Sigma1(e) + Ch(e, f, g) + K256[i]; \ - h = Sigma0(a) + Maj(a, b, c); \ - d += T1; \ - h += T1; \ - } while (0) - -#define ROUND_16_63(i, a, b, c, d, e, f, g, h, X) \ - do { \ - s0 = X[(i + 1) & 0x0f]; \ - s0 = sigma0(s0); \ - s1 = X[(i + 14) & 0x0f]; \ - s1 = sigma1(s1); \ - T1 = X[(i) & 0x0f] += s0 + s1 + X[(i + 9) & 0x0f]; \ - ROUND_00_15(i, a, b, c, d, e, f, g, h); \ - } while (0) - -static void sha256_block_data_order(uint32_t *state, const uint8_t *data, - size_t num) { - uint32_t a, b, c, d, e, f, g, h, s0, s1, T1; - uint32_t X[16]; - int i; - - while (num--) { - a = state[0]; - b = state[1]; - c = state[2]; - d = state[3]; - e = state[4]; - f = state[5]; - g = state[6]; - h = state[7]; - - T1 = X[0] = CRYPTO_load_u32_be(data); - data += 4; - ROUND_00_15(0, a, b, c, d, e, f, g, h); - T1 = X[1] = CRYPTO_load_u32_be(data); - data += 4; - ROUND_00_15(1, h, a, b, c, d, e, f, g); - T1 = X[2] = CRYPTO_load_u32_be(data); - data += 4; - ROUND_00_15(2, g, h, a, b, c, d, e, f); - T1 = X[3] = CRYPTO_load_u32_be(data); - data += 4; - ROUND_00_15(3, f, g, h, a, b, c, d, e); - T1 = X[4] = CRYPTO_load_u32_be(data); - data += 4; - ROUND_00_15(4, e, f, g, h, a, b, c, d); - T1 = X[5] = CRYPTO_load_u32_be(data); - data += 4; - ROUND_00_15(5, d, e, f, g, h, a, b, c); - T1 = X[6] = CRYPTO_load_u32_be(data); - data += 4; - ROUND_00_15(6, c, d, e, f, g, h, a, b); - T1 = X[7] = CRYPTO_load_u32_be(data); - 
data += 4; - ROUND_00_15(7, b, c, d, e, f, g, h, a); - T1 = X[8] = CRYPTO_load_u32_be(data); - data += 4; - ROUND_00_15(8, a, b, c, d, e, f, g, h); - T1 = X[9] = CRYPTO_load_u32_be(data); - data += 4; - ROUND_00_15(9, h, a, b, c, d, e, f, g); - T1 = X[10] = CRYPTO_load_u32_be(data); - data += 4; - ROUND_00_15(10, g, h, a, b, c, d, e, f); - T1 = X[11] = CRYPTO_load_u32_be(data); - data += 4; - ROUND_00_15(11, f, g, h, a, b, c, d, e); - T1 = X[12] = CRYPTO_load_u32_be(data); - data += 4; - ROUND_00_15(12, e, f, g, h, a, b, c, d); - T1 = X[13] = CRYPTO_load_u32_be(data); - data += 4; - ROUND_00_15(13, d, e, f, g, h, a, b, c); - T1 = X[14] = CRYPTO_load_u32_be(data); - data += 4; - ROUND_00_15(14, c, d, e, f, g, h, a, b); - T1 = X[15] = CRYPTO_load_u32_be(data); - data += 4; - ROUND_00_15(15, b, c, d, e, f, g, h, a); - - for (i = 16; i < 64; i += 8) { - ROUND_16_63(i + 0, a, b, c, d, e, f, g, h, X); - ROUND_16_63(i + 1, h, a, b, c, d, e, f, g, X); - ROUND_16_63(i + 2, g, h, a, b, c, d, e, f, X); - ROUND_16_63(i + 3, f, g, h, a, b, c, d, e, X); - ROUND_16_63(i + 4, e, f, g, h, a, b, c, d, X); - ROUND_16_63(i + 5, d, e, f, g, h, a, b, c, X); - ROUND_16_63(i + 6, c, d, e, f, g, h, a, b, X); - ROUND_16_63(i + 7, b, c, d, e, f, g, h, a, X); - } - - state[0] += a; - state[1] += b; - state[2] += c; - state[3] += d; - state[4] += e; - state[5] += f; - state[6] += g; - state[7] += h; - } -} - -#endif // !SHA256_ASM - -void SHA256_TransformBlocks(uint32_t state[8], const uint8_t *data, - size_t num_blocks) { - sha256_block_data_order(state, data, num_blocks); -} - -#undef Sigma0 -#undef Sigma1 -#undef sigma0 -#undef sigma1 -#undef Ch -#undef Maj -#undef ROUND_00_15 -#undef ROUND_16_63 diff --git a/third_party/boringssl/src/crypto/fipsmodule/sha/sha256.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/sha/sha256.cc.inc new file mode 100644 index 00000000..20cfda9a --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/sha/sha256.cc.inc @@ -0,0 +1,315 @@ +// Copyright 
2004-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + +#include "../../internal.h" +#include "../bcm_interface.h" +#include "../digest/md32_common.h" +#include "../service_indicator/internal.h" +#include "internal.h" + + +using namespace bssl; + +bcm_infallible bssl::BCM_sha224_init(SHA256_CTX *sha) { + OPENSSL_memset(sha, 0, sizeof(SHA256_CTX)); + sha->h[0] = 0xc1059ed8UL; + sha->h[1] = 0x367cd507UL; + sha->h[2] = 0x3070dd17UL; + sha->h[3] = 0xf70e5939UL; + sha->h[4] = 0xffc00b31UL; + sha->h[5] = 0x68581511UL; + sha->h[6] = 0x64f98fa7UL; + sha->h[7] = 0xbefa4fa4UL; + sha->md_len = SHA224_DIGEST_LENGTH; + return bcm_infallible::approved; +} + +bcm_infallible bssl::BCM_sha256_init(SHA256_CTX *sha) { + OPENSSL_memset(sha, 0, sizeof(SHA256_CTX)); + sha->h[0] = 0x6a09e667UL; + sha->h[1] = 0xbb67ae85UL; + sha->h[2] = 0x3c6ef372UL; + sha->h[3] = 0xa54ff53aUL; + sha->h[4] = 0x510e527fUL; + sha->h[5] = 0x9b05688cUL; + sha->h[6] = 0x1f83d9abUL; + sha->h[7] = 0x5be0cd19UL; + sha->md_len = SHA256_DIGEST_LENGTH; + return bcm_infallible::approved; +} + +#if !defined(SHA256_ASM) +static void sha256_block_data_order(uint32_t state[8], const uint8_t *in, + size_t num); +#endif + +bcm_infallible bssl::BCM_sha256_transform(SHA256_CTX *c, + const uint8_t data[SHA256_CBLOCK]) { + sha256_block_data_order(c->h, data, 1); + return bcm_infallible::approved; +} + +namespace { +struct 
SHA256Traits { + using HashContext = SHA256_CTX; + static constexpr size_t kBlockSize = SHA256_CBLOCK; + static constexpr bool kLengthIsBigEndian = true; + static void HashBlocks(uint32_t *state, const uint8_t *data, + size_t num_blocks) { + sha256_block_data_order(state, data, num_blocks); + } +}; +} // namespace + +bcm_infallible bssl::BCM_sha256_update(SHA256_CTX *c, const void *data, + size_t len) { + crypto_md32_update( + c, Span(static_cast(data), len)); + return bcm_infallible::approved; +} + +bcm_infallible bssl::BCM_sha224_update(SHA256_CTX *ctx, const void *data, + size_t len) { + return BCM_sha256_update(ctx, data, len); +} + +static void sha256_final_impl(uint8_t *out, size_t md_len, SHA256_CTX *c) { + crypto_md32_final(c); + + BSSL_CHECK(md_len <= SHA256_DIGEST_LENGTH); + + assert(md_len % 4 == 0); + const size_t out_words = md_len / 4; + for (size_t i = 0; i < out_words; i++) { + CRYPTO_store_u32_be(out, c->h[i]); + out += 4; + } + + FIPS_service_indicator_update_state(); +} + +bcm_infallible bssl::BCM_sha256_final(uint8_t out[SHA256_DIGEST_LENGTH], + SHA256_CTX *c) { + // Ideally we would assert |sha->md_len| is |SHA256_DIGEST_LENGTH| to match the + // size hint, but calling code often pairs |SHA224_Init| with |SHA256_Final| + // and expects |sha->md_len| to carry the size over. + // + // TODO(davidben): Add an assert and fix code to match them up. + sha256_final_impl(out, c->md_len, c); + return bcm_infallible::approved; +} + +bcm_infallible bssl::BCM_sha224_final(uint8_t out[SHA224_DIGEST_LENGTH], + SHA256_CTX *ctx) { + // This function must be paired with |SHA224_Init|, which sets |ctx->md_len| + // to |SHA224_DIGEST_LENGTH|. 
+ assert(ctx->md_len == SHA224_DIGEST_LENGTH); + sha256_final_impl(out, SHA224_DIGEST_LENGTH, ctx); + return bcm_infallible::approved; +} + +#if !defined(SHA256_ASM) + +#if !defined(SHA256_ASM_NOHW) +static const uint32_t K256[64] = { + 0x428a2f98UL, 0x71374491UL, 0xb5c0fbcfUL, 0xe9b5dba5UL, 0x3956c25bUL, + 0x59f111f1UL, 0x923f82a4UL, 0xab1c5ed5UL, 0xd807aa98UL, 0x12835b01UL, + 0x243185beUL, 0x550c7dc3UL, 0x72be5d74UL, 0x80deb1feUL, 0x9bdc06a7UL, + 0xc19bf174UL, 0xe49b69c1UL, 0xefbe4786UL, 0x0fc19dc6UL, 0x240ca1ccUL, + 0x2de92c6fUL, 0x4a7484aaUL, 0x5cb0a9dcUL, 0x76f988daUL, 0x983e5152UL, + 0xa831c66dUL, 0xb00327c8UL, 0xbf597fc7UL, 0xc6e00bf3UL, 0xd5a79147UL, + 0x06ca6351UL, 0x14292967UL, 0x27b70a85UL, 0x2e1b2138UL, 0x4d2c6dfcUL, + 0x53380d13UL, 0x650a7354UL, 0x766a0abbUL, 0x81c2c92eUL, 0x92722c85UL, + 0xa2bfe8a1UL, 0xa81a664bUL, 0xc24b8b70UL, 0xc76c51a3UL, 0xd192e819UL, + 0xd6990624UL, 0xf40e3585UL, 0x106aa070UL, 0x19a4c116UL, 0x1e376c08UL, + 0x2748774cUL, 0x34b0bcb5UL, 0x391c0cb3UL, 0x4ed8aa4aUL, 0x5b9cca4fUL, + 0x682e6ff3UL, 0x748f82eeUL, 0x78a5636fUL, 0x84c87814UL, 0x8cc70208UL, + 0x90befffaUL, 0xa4506cebUL, 0xbef9a3f7UL, 0xc67178f2UL}; + +// See FIPS 180-4, section 4.1.2. 
+#define Sigma0(x) \ + (CRYPTO_rotr_u32((x), 2) ^ CRYPTO_rotr_u32((x), 13) ^ \ + CRYPTO_rotr_u32((x), 22)) +#define Sigma1(x) \ + (CRYPTO_rotr_u32((x), 6) ^ CRYPTO_rotr_u32((x), 11) ^ \ + CRYPTO_rotr_u32((x), 25)) +#define sigma0(x) \ + (CRYPTO_rotr_u32((x), 7) ^ CRYPTO_rotr_u32((x), 18) ^ ((x) >> 3)) +#define sigma1(x) \ + (CRYPTO_rotr_u32((x), 17) ^ CRYPTO_rotr_u32((x), 19) ^ ((x) >> 10)) + +#define Ch(x, y, z) (((x) & (y)) ^ ((~(x)) & (z))) +#define Maj(x, y, z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) + +#define ROUND_00_15(i, a, b, c, d, e, f, g, h) \ + do { \ + T1 += h + Sigma1(e) + Ch(e, f, g) + K256[i]; \ + h = Sigma0(a) + Maj(a, b, c); \ + d += T1; \ + h += T1; \ + } while (0) + +#define ROUND_16_63(i, a, b, c, d, e, f, g, h, X) \ + do { \ + s0 = X[(i + 1) & 0x0f]; \ + s0 = sigma0(s0); \ + s1 = X[(i + 14) & 0x0f]; \ + s1 = sigma1(s1); \ + T1 = X[(i) & 0x0f] += s0 + s1 + X[(i + 9) & 0x0f]; \ + ROUND_00_15(i, a, b, c, d, e, f, g, h); \ + } while (0) + +static void sha256_block_data_order_nohw(uint32_t state[8], const uint8_t *data, + size_t num) { + uint32_t a, b, c, d, e, f, g, h, s0, s1, T1; + uint32_t X[16]; + int i; + + while (num--) { + a = state[0]; + b = state[1]; + c = state[2]; + d = state[3]; + e = state[4]; + f = state[5]; + g = state[6]; + h = state[7]; + + T1 = X[0] = CRYPTO_load_u32_be(data); + data += 4; + ROUND_00_15(0, a, b, c, d, e, f, g, h); + T1 = X[1] = CRYPTO_load_u32_be(data); + data += 4; + ROUND_00_15(1, h, a, b, c, d, e, f, g); + T1 = X[2] = CRYPTO_load_u32_be(data); + data += 4; + ROUND_00_15(2, g, h, a, b, c, d, e, f); + T1 = X[3] = CRYPTO_load_u32_be(data); + data += 4; + ROUND_00_15(3, f, g, h, a, b, c, d, e); + T1 = X[4] = CRYPTO_load_u32_be(data); + data += 4; + ROUND_00_15(4, e, f, g, h, a, b, c, d); + T1 = X[5] = CRYPTO_load_u32_be(data); + data += 4; + ROUND_00_15(5, d, e, f, g, h, a, b, c); + T1 = X[6] = CRYPTO_load_u32_be(data); + data += 4; + ROUND_00_15(6, c, d, e, f, g, h, a, b); + T1 = X[7] = 
CRYPTO_load_u32_be(data); + data += 4; + ROUND_00_15(7, b, c, d, e, f, g, h, a); + T1 = X[8] = CRYPTO_load_u32_be(data); + data += 4; + ROUND_00_15(8, a, b, c, d, e, f, g, h); + T1 = X[9] = CRYPTO_load_u32_be(data); + data += 4; + ROUND_00_15(9, h, a, b, c, d, e, f, g); + T1 = X[10] = CRYPTO_load_u32_be(data); + data += 4; + ROUND_00_15(10, g, h, a, b, c, d, e, f); + T1 = X[11] = CRYPTO_load_u32_be(data); + data += 4; + ROUND_00_15(11, f, g, h, a, b, c, d, e); + T1 = X[12] = CRYPTO_load_u32_be(data); + data += 4; + ROUND_00_15(12, e, f, g, h, a, b, c, d); + T1 = X[13] = CRYPTO_load_u32_be(data); + data += 4; + ROUND_00_15(13, d, e, f, g, h, a, b, c); + T1 = X[14] = CRYPTO_load_u32_be(data); + data += 4; + ROUND_00_15(14, c, d, e, f, g, h, a, b); + T1 = X[15] = CRYPTO_load_u32_be(data); + data += 4; + ROUND_00_15(15, b, c, d, e, f, g, h, a); + + for (i = 16; i < 64; i += 8) { + ROUND_16_63(i + 0, a, b, c, d, e, f, g, h, X); + ROUND_16_63(i + 1, h, a, b, c, d, e, f, g, X); + ROUND_16_63(i + 2, g, h, a, b, c, d, e, f, X); + ROUND_16_63(i + 3, f, g, h, a, b, c, d, e, X); + ROUND_16_63(i + 4, e, f, g, h, a, b, c, d, X); + ROUND_16_63(i + 5, d, e, f, g, h, a, b, c, X); + ROUND_16_63(i + 6, c, d, e, f, g, h, a, b, X); + ROUND_16_63(i + 7, b, c, d, e, f, g, h, a, X); + } + + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + state[4] += e; + state[5] += f; + state[6] += g; + state[7] += h; + } +} + +#endif // !defined(SHA256_ASM_NOHW) + +static void sha256_block_data_order(uint32_t state[8], const uint8_t *data, + size_t num) { +#if defined(SHA256_ASM_HW) + if (sha256_hw_capable()) { + sha256_block_data_order_hw(state, data, num); + return; + } +#endif +#if defined(SHA256_ASM_AVX) + if (sha256_avx_capable()) { + sha256_block_data_order_avx(state, data, num); + return; + } +#endif +#if defined(SHA256_ASM_SSSE3) + if (sha256_ssse3_capable()) { + sha256_block_data_order_ssse3(state, data, num); + return; + } +#endif +#if defined(SHA256_ASM_NEON) + if 
(CRYPTO_is_NEON_capable()) { + sha256_block_data_order_neon(state, data, num); + return; + } +#endif + sha256_block_data_order_nohw(state, data, num); +} + +#endif // !defined(SHA256_ASM) + + +bcm_infallible bssl::BCM_sha256_transform_blocks(uint32_t state[8], + const uint8_t *data, + size_t num_blocks) { + if (num_blocks > 0) { + sha256_block_data_order(state, data, num_blocks); + } + return bcm_infallible::approved; +} + +#undef Sigma0 +#undef Sigma1 +#undef sigma0 +#undef sigma1 +#undef Ch +#undef Maj +#undef ROUND_00_15 +#undef ROUND_16_63 diff --git a/third_party/boringssl/src/crypto/fipsmodule/sha/sha512.c b/third_party/boringssl/src/crypto/fipsmodule/sha/sha512.c deleted file mode 100644 index 708358e5..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/sha/sha512.c +++ /dev/null @@ -1,510 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. 
this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include - -#include - -#include "../../internal.h" -#include "../service_indicator/internal.h" -#include "internal.h" - - -// The 32-bit hash algorithms share a common byte-order neutral collector and -// padding function implementations that operate on unaligned data, -// ../digest/md32_common.h. SHA-512 is the only 64-bit hash algorithm, as of -// this writing, so there is no need for a common collector/padding -// implementation yet. - -static int sha512_final_impl(uint8_t *out, SHA512_CTX *sha); - -int SHA384_Init(SHA512_CTX *sha) { - sha->h[0] = UINT64_C(0xcbbb9d5dc1059ed8); - sha->h[1] = UINT64_C(0x629a292a367cd507); - sha->h[2] = UINT64_C(0x9159015a3070dd17); - sha->h[3] = UINT64_C(0x152fecd8f70e5939); - sha->h[4] = UINT64_C(0x67332667ffc00b31); - sha->h[5] = UINT64_C(0x8eb44a8768581511); - sha->h[6] = UINT64_C(0xdb0c2e0d64f98fa7); - sha->h[7] = UINT64_C(0x47b5481dbefa4fa4); - - sha->Nl = 0; - sha->Nh = 0; - sha->num = 0; - sha->md_len = SHA384_DIGEST_LENGTH; - return 1; -} - - -int SHA512_Init(SHA512_CTX *sha) { - sha->h[0] = UINT64_C(0x6a09e667f3bcc908); - sha->h[1] = UINT64_C(0xbb67ae8584caa73b); - sha->h[2] = UINT64_C(0x3c6ef372fe94f82b); - sha->h[3] = UINT64_C(0xa54ff53a5f1d36f1); - sha->h[4] = UINT64_C(0x510e527fade682d1); - sha->h[5] = UINT64_C(0x9b05688c2b3e6c1f); - sha->h[6] = UINT64_C(0x1f83d9abfb41bd6b); - sha->h[7] = UINT64_C(0x5be0cd19137e2179); - - sha->Nl = 0; - sha->Nh = 0; - sha->num = 0; - sha->md_len = SHA512_DIGEST_LENGTH; - return 1; -} - -int SHA512_256_Init(SHA512_CTX *sha) { - sha->h[0] = UINT64_C(0x22312194fc2bf72c); - sha->h[1] = UINT64_C(0x9f555fa3c84c64c2); - sha->h[2] = UINT64_C(0x2393b86b6f53b151); - sha->h[3] = UINT64_C(0x963877195940eabd); - sha->h[4] = UINT64_C(0x96283ee2a88effe3); - sha->h[5] = UINT64_C(0xbe5e1e2553863992); - sha->h[6] = UINT64_C(0x2b0199fc2c85b8aa); - sha->h[7] = 
UINT64_C(0x0eb72ddc81c52ca2); - - sha->Nl = 0; - sha->Nh = 0; - sha->num = 0; - sha->md_len = SHA512_256_DIGEST_LENGTH; - return 1; -} - -uint8_t *SHA384(const uint8_t *data, size_t len, - uint8_t out[SHA384_DIGEST_LENGTH]) { - SHA512_CTX ctx; - SHA384_Init(&ctx); - SHA384_Update(&ctx, data, len); - SHA384_Final(out, &ctx); - OPENSSL_cleanse(&ctx, sizeof(ctx)); - return out; -} - -uint8_t *SHA512(const uint8_t *data, size_t len, - uint8_t out[SHA512_DIGEST_LENGTH]) { - SHA512_CTX ctx; - SHA512_Init(&ctx); - SHA512_Update(&ctx, data, len); - SHA512_Final(out, &ctx); - OPENSSL_cleanse(&ctx, sizeof(ctx)); - return out; -} - -uint8_t *SHA512_256(const uint8_t *data, size_t len, - uint8_t out[SHA512_256_DIGEST_LENGTH]) { - SHA512_CTX ctx; - SHA512_256_Init(&ctx); - SHA512_256_Update(&ctx, data, len); - SHA512_256_Final(out, &ctx); - OPENSSL_cleanse(&ctx, sizeof(ctx)); - return out; -} - -#if !defined(SHA512_ASM) -static void sha512_block_data_order(uint64_t *state, const uint8_t *in, - size_t num_blocks); -#endif - - -int SHA384_Final(uint8_t out[SHA384_DIGEST_LENGTH], SHA512_CTX *sha) { - // |SHA384_Init| sets |sha->md_len| to |SHA384_DIGEST_LENGTH|, so this has a - // smaller output. - assert(sha->md_len == SHA384_DIGEST_LENGTH); - return sha512_final_impl(out, sha); -} - -int SHA384_Update(SHA512_CTX *sha, const void *data, size_t len) { - return SHA512_Update(sha, data, len); -} - -int SHA512_256_Update(SHA512_CTX *sha, const void *data, size_t len) { - return SHA512_Update(sha, data, len); -} - -int SHA512_256_Final(uint8_t out[SHA512_256_DIGEST_LENGTH], SHA512_CTX *sha) { - // |SHA512_256_Init| sets |sha->md_len| to |SHA512_256_DIGEST_LENGTH|, so this - // has a |smaller output. 
- assert(sha->md_len == SHA512_256_DIGEST_LENGTH); - return sha512_final_impl(out, sha); -} - -void SHA512_Transform(SHA512_CTX *c, const uint8_t block[SHA512_CBLOCK]) { - sha512_block_data_order(c->h, block, 1); -} - -int SHA512_Update(SHA512_CTX *c, const void *in_data, size_t len) { - uint64_t l; - uint8_t *p = c->p; - const uint8_t *data = in_data; - - if (len == 0) { - return 1; - } - - l = (c->Nl + (((uint64_t)len) << 3)) & UINT64_C(0xffffffffffffffff); - if (l < c->Nl) { - c->Nh++; - } - if (sizeof(len) >= 8) { - c->Nh += (((uint64_t)len) >> 61); - } - c->Nl = l; - - if (c->num != 0) { - size_t n = sizeof(c->p) - c->num; - - if (len < n) { - OPENSSL_memcpy(p + c->num, data, len); - c->num += (unsigned int)len; - return 1; - } else { - OPENSSL_memcpy(p + c->num, data, n), c->num = 0; - len -= n; - data += n; - sha512_block_data_order(c->h, p, 1); - } - } - - if (len >= sizeof(c->p)) { - sha512_block_data_order(c->h, data, len / sizeof(c->p)); - data += len; - len %= sizeof(c->p); - data -= len; - } - - if (len != 0) { - OPENSSL_memcpy(p, data, len); - c->num = (int)len; - } - - return 1; -} - -int SHA512_Final(uint8_t out[SHA512_DIGEST_LENGTH], SHA512_CTX *sha) { - // Ideally we would assert |sha->md_len| is |SHA512_DIGEST_LENGTH| to match - // the size hint, but calling code often pairs |SHA384_Init| with - // |SHA512_Final| and expects |sha->md_len| to carry the size over. - // - // TODO(davidben): Add an assert and fix code to match them up. 
- return sha512_final_impl(out, sha); -} - -static int sha512_final_impl(uint8_t *out, SHA512_CTX *sha) { - uint8_t *p = sha->p; - size_t n = sha->num; - - p[n] = 0x80; // There always is a room for one - n++; - if (n > (sizeof(sha->p) - 16)) { - OPENSSL_memset(p + n, 0, sizeof(sha->p) - n); - n = 0; - sha512_block_data_order(sha->h, p, 1); - } - - OPENSSL_memset(p + n, 0, sizeof(sha->p) - 16 - n); - CRYPTO_store_u64_be(p + sizeof(sha->p) - 16, sha->Nh); - CRYPTO_store_u64_be(p + sizeof(sha->p) - 8, sha->Nl); - - sha512_block_data_order(sha->h, p, 1); - - if (out == NULL) { - // TODO(davidben): This NULL check is absent in other low-level hash 'final' - // functions and is one of the few places one can fail. - return 0; - } - - assert(sha->md_len % 8 == 0); - const size_t out_words = sha->md_len / 8; - for (size_t i = 0; i < out_words; i++) { - CRYPTO_store_u64_be(out, sha->h[i]); - out += 8; - } - - FIPS_service_indicator_update_state(); - return 1; -} - -#ifndef SHA512_ASM -static const uint64_t K512[80] = { - UINT64_C(0x428a2f98d728ae22), UINT64_C(0x7137449123ef65cd), - UINT64_C(0xb5c0fbcfec4d3b2f), UINT64_C(0xe9b5dba58189dbbc), - UINT64_C(0x3956c25bf348b538), UINT64_C(0x59f111f1b605d019), - UINT64_C(0x923f82a4af194f9b), UINT64_C(0xab1c5ed5da6d8118), - UINT64_C(0xd807aa98a3030242), UINT64_C(0x12835b0145706fbe), - UINT64_C(0x243185be4ee4b28c), UINT64_C(0x550c7dc3d5ffb4e2), - UINT64_C(0x72be5d74f27b896f), UINT64_C(0x80deb1fe3b1696b1), - UINT64_C(0x9bdc06a725c71235), UINT64_C(0xc19bf174cf692694), - UINT64_C(0xe49b69c19ef14ad2), UINT64_C(0xefbe4786384f25e3), - UINT64_C(0x0fc19dc68b8cd5b5), UINT64_C(0x240ca1cc77ac9c65), - UINT64_C(0x2de92c6f592b0275), UINT64_C(0x4a7484aa6ea6e483), - UINT64_C(0x5cb0a9dcbd41fbd4), UINT64_C(0x76f988da831153b5), - UINT64_C(0x983e5152ee66dfab), UINT64_C(0xa831c66d2db43210), - UINT64_C(0xb00327c898fb213f), UINT64_C(0xbf597fc7beef0ee4), - UINT64_C(0xc6e00bf33da88fc2), UINT64_C(0xd5a79147930aa725), - UINT64_C(0x06ca6351e003826f), 
UINT64_C(0x142929670a0e6e70), - UINT64_C(0x27b70a8546d22ffc), UINT64_C(0x2e1b21385c26c926), - UINT64_C(0x4d2c6dfc5ac42aed), UINT64_C(0x53380d139d95b3df), - UINT64_C(0x650a73548baf63de), UINT64_C(0x766a0abb3c77b2a8), - UINT64_C(0x81c2c92e47edaee6), UINT64_C(0x92722c851482353b), - UINT64_C(0xa2bfe8a14cf10364), UINT64_C(0xa81a664bbc423001), - UINT64_C(0xc24b8b70d0f89791), UINT64_C(0xc76c51a30654be30), - UINT64_C(0xd192e819d6ef5218), UINT64_C(0xd69906245565a910), - UINT64_C(0xf40e35855771202a), UINT64_C(0x106aa07032bbd1b8), - UINT64_C(0x19a4c116b8d2d0c8), UINT64_C(0x1e376c085141ab53), - UINT64_C(0x2748774cdf8eeb99), UINT64_C(0x34b0bcb5e19b48a8), - UINT64_C(0x391c0cb3c5c95a63), UINT64_C(0x4ed8aa4ae3418acb), - UINT64_C(0x5b9cca4f7763e373), UINT64_C(0x682e6ff3d6b2b8a3), - UINT64_C(0x748f82ee5defb2fc), UINT64_C(0x78a5636f43172f60), - UINT64_C(0x84c87814a1f0ab72), UINT64_C(0x8cc702081a6439ec), - UINT64_C(0x90befffa23631e28), UINT64_C(0xa4506cebde82bde9), - UINT64_C(0xbef9a3f7b2c67915), UINT64_C(0xc67178f2e372532b), - UINT64_C(0xca273eceea26619c), UINT64_C(0xd186b8c721c0c207), - UINT64_C(0xeada7dd6cde0eb1e), UINT64_C(0xf57d4f7fee6ed178), - UINT64_C(0x06f067aa72176fba), UINT64_C(0x0a637dc5a2c898a6), - UINT64_C(0x113f9804bef90dae), UINT64_C(0x1b710b35131c471b), - UINT64_C(0x28db77f523047d84), UINT64_C(0x32caab7b40c72493), - UINT64_C(0x3c9ebe0a15c9bebc), UINT64_C(0x431d67c49c100d4c), - UINT64_C(0x4cc5d4becb3e42b6), UINT64_C(0x597f299cfc657e2a), - UINT64_C(0x5fcb6fab3ad6faec), UINT64_C(0x6c44198c4a475817), -}; - -#define Sigma0(x) \ - (CRYPTO_rotr_u64((x), 28) ^ CRYPTO_rotr_u64((x), 34) ^ \ - CRYPTO_rotr_u64((x), 39)) -#define Sigma1(x) \ - (CRYPTO_rotr_u64((x), 14) ^ CRYPTO_rotr_u64((x), 18) ^ \ - CRYPTO_rotr_u64((x), 41)) -#define sigma0(x) \ - (CRYPTO_rotr_u64((x), 1) ^ CRYPTO_rotr_u64((x), 8) ^ ((x) >> 7)) -#define sigma1(x) \ - (CRYPTO_rotr_u64((x), 19) ^ CRYPTO_rotr_u64((x), 61) ^ ((x) >> 6)) - -#define Ch(x, y, z) (((x) & (y)) ^ ((~(x)) & (z))) -#define Maj(x, y, z) (((x) 
& (y)) ^ ((x) & (z)) ^ ((y) & (z))) - - -#if defined(__i386) || defined(__i386__) || defined(_M_IX86) -// This code should give better results on 32-bit CPU with less than -// ~24 registers, both size and performance wise... -static void sha512_block_data_order(uint64_t *state, const uint8_t *in, - size_t num) { - uint64_t A, E, T; - uint64_t X[9 + 80], *F; - int i; - - while (num--) { - F = X + 80; - A = state[0]; - F[1] = state[1]; - F[2] = state[2]; - F[3] = state[3]; - E = state[4]; - F[5] = state[5]; - F[6] = state[6]; - F[7] = state[7]; - - for (i = 0; i < 16; i++, F--) { - T = CRYPTO_load_u64_be(in + i * 8); - F[0] = A; - F[4] = E; - F[8] = T; - T += F[7] + Sigma1(E) + Ch(E, F[5], F[6]) + K512[i]; - E = F[3] + T; - A = T + Sigma0(A) + Maj(A, F[1], F[2]); - } - - for (; i < 80; i++, F--) { - T = sigma0(F[8 + 16 - 1]); - T += sigma1(F[8 + 16 - 14]); - T += F[8 + 16] + F[8 + 16 - 9]; - - F[0] = A; - F[4] = E; - F[8] = T; - T += F[7] + Sigma1(E) + Ch(E, F[5], F[6]) + K512[i]; - E = F[3] + T; - A = T + Sigma0(A) + Maj(A, F[1], F[2]); - } - - state[0] += A; - state[1] += F[1]; - state[2] += F[2]; - state[3] += F[3]; - state[4] += E; - state[5] += F[5]; - state[6] += F[6]; - state[7] += F[7]; - - in += 16 * 8; - } -} - -#else - -#define ROUND_00_15(i, a, b, c, d, e, f, g, h) \ - do { \ - T1 += h + Sigma1(e) + Ch(e, f, g) + K512[i]; \ - h = Sigma0(a) + Maj(a, b, c); \ - d += T1; \ - h += T1; \ - } while (0) - -#define ROUND_16_80(i, j, a, b, c, d, e, f, g, h, X) \ - do { \ - s0 = X[(j + 1) & 0x0f]; \ - s0 = sigma0(s0); \ - s1 = X[(j + 14) & 0x0f]; \ - s1 = sigma1(s1); \ - T1 = X[(j) & 0x0f] += s0 + s1 + X[(j + 9) & 0x0f]; \ - ROUND_00_15(i + j, a, b, c, d, e, f, g, h); \ - } while (0) - -static void sha512_block_data_order(uint64_t *state, const uint8_t *in, - size_t num) { - uint64_t a, b, c, d, e, f, g, h, s0, s1, T1; - uint64_t X[16]; - int i; - - while (num--) { - - a = state[0]; - b = state[1]; - c = state[2]; - d = state[3]; - e = state[4]; - f = state[5]; - g 
= state[6]; - h = state[7]; - - T1 = X[0] = CRYPTO_load_u64_be(in); - ROUND_00_15(0, a, b, c, d, e, f, g, h); - T1 = X[1] = CRYPTO_load_u64_be(in + 8); - ROUND_00_15(1, h, a, b, c, d, e, f, g); - T1 = X[2] = CRYPTO_load_u64_be(in + 2 * 8); - ROUND_00_15(2, g, h, a, b, c, d, e, f); - T1 = X[3] = CRYPTO_load_u64_be(in + 3 * 8); - ROUND_00_15(3, f, g, h, a, b, c, d, e); - T1 = X[4] = CRYPTO_load_u64_be(in + 4 * 8); - ROUND_00_15(4, e, f, g, h, a, b, c, d); - T1 = X[5] = CRYPTO_load_u64_be(in + 5 * 8); - ROUND_00_15(5, d, e, f, g, h, a, b, c); - T1 = X[6] = CRYPTO_load_u64_be(in + 6 * 8); - ROUND_00_15(6, c, d, e, f, g, h, a, b); - T1 = X[7] = CRYPTO_load_u64_be(in + 7 * 8); - ROUND_00_15(7, b, c, d, e, f, g, h, a); - T1 = X[8] = CRYPTO_load_u64_be(in + 8 * 8); - ROUND_00_15(8, a, b, c, d, e, f, g, h); - T1 = X[9] = CRYPTO_load_u64_be(in + 9 * 8); - ROUND_00_15(9, h, a, b, c, d, e, f, g); - T1 = X[10] = CRYPTO_load_u64_be(in + 10 * 8); - ROUND_00_15(10, g, h, a, b, c, d, e, f); - T1 = X[11] = CRYPTO_load_u64_be(in + 11 * 8); - ROUND_00_15(11, f, g, h, a, b, c, d, e); - T1 = X[12] = CRYPTO_load_u64_be(in + 12 * 8); - ROUND_00_15(12, e, f, g, h, a, b, c, d); - T1 = X[13] = CRYPTO_load_u64_be(in + 13 * 8); - ROUND_00_15(13, d, e, f, g, h, a, b, c); - T1 = X[14] = CRYPTO_load_u64_be(in + 14 * 8); - ROUND_00_15(14, c, d, e, f, g, h, a, b); - T1 = X[15] = CRYPTO_load_u64_be(in + 15 * 8); - ROUND_00_15(15, b, c, d, e, f, g, h, a); - - for (i = 16; i < 80; i += 16) { - ROUND_16_80(i, 0, a, b, c, d, e, f, g, h, X); - ROUND_16_80(i, 1, h, a, b, c, d, e, f, g, X); - ROUND_16_80(i, 2, g, h, a, b, c, d, e, f, X); - ROUND_16_80(i, 3, f, g, h, a, b, c, d, e, X); - ROUND_16_80(i, 4, e, f, g, h, a, b, c, d, X); - ROUND_16_80(i, 5, d, e, f, g, h, a, b, c, X); - ROUND_16_80(i, 6, c, d, e, f, g, h, a, b, X); - ROUND_16_80(i, 7, b, c, d, e, f, g, h, a, X); - ROUND_16_80(i, 8, a, b, c, d, e, f, g, h, X); - ROUND_16_80(i, 9, h, a, b, c, d, e, f, g, X); - ROUND_16_80(i, 10, g, h, a, b, c, d, 
e, f, X); - ROUND_16_80(i, 11, f, g, h, a, b, c, d, e, X); - ROUND_16_80(i, 12, e, f, g, h, a, b, c, d, X); - ROUND_16_80(i, 13, d, e, f, g, h, a, b, c, X); - ROUND_16_80(i, 14, c, d, e, f, g, h, a, b, X); - ROUND_16_80(i, 15, b, c, d, e, f, g, h, a, X); - } - - state[0] += a; - state[1] += b; - state[2] += c; - state[3] += d; - state[4] += e; - state[5] += f; - state[6] += g; - state[7] += h; - - in += 16 * 8; - } -} - -#endif - -#endif // !SHA512_ASM - -#undef Sigma0 -#undef Sigma1 -#undef sigma0 -#undef sigma1 -#undef Ch -#undef Maj -#undef ROUND_00_15 -#undef ROUND_16_80 diff --git a/third_party/boringssl/src/crypto/fipsmodule/sha/sha512.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/sha/sha512.cc.inc new file mode 100644 index 00000000..f2282f4d --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/sha/sha512.cc.inc @@ -0,0 +1,473 @@ +// Copyright 2004-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include "../../internal.h" +#include "../bcm_interface.h" +#include "../service_indicator/internal.h" +#include "internal.h" + + +using namespace bssl; + +// The 32-bit hash algorithms share a common byte-order neutral collector and +// padding function implementations that operate on unaligned data, +// ../digest/md32_common.h. 
SHA-512 is the only 64-bit hash algorithm, as of +// this writing, so there is no need for a common collector/padding +// implementation yet. + +static void sha512_final_impl(uint8_t *out, size_t md_len, SHA512_CTX *sha); + +bcm_infallible bssl::BCM_sha384_init(SHA512_CTX *sha) { + sha->h[0] = UINT64_C(0xcbbb9d5dc1059ed8); + sha->h[1] = UINT64_C(0x629a292a367cd507); + sha->h[2] = UINT64_C(0x9159015a3070dd17); + sha->h[3] = UINT64_C(0x152fecd8f70e5939); + sha->h[4] = UINT64_C(0x67332667ffc00b31); + sha->h[5] = UINT64_C(0x8eb44a8768581511); + sha->h[6] = UINT64_C(0xdb0c2e0d64f98fa7); + sha->h[7] = UINT64_C(0x47b5481dbefa4fa4); + + sha->bytes_so_far_low = 0; + sha->bytes_so_far_high = 0; + sha->num = 0; + sha->md_len = SHA384_DIGEST_LENGTH; + return bcm_infallible::approved; +} + + +bcm_infallible bssl::BCM_sha512_init(SHA512_CTX *sha) { + sha->h[0] = UINT64_C(0x6a09e667f3bcc908); + sha->h[1] = UINT64_C(0xbb67ae8584caa73b); + sha->h[2] = UINT64_C(0x3c6ef372fe94f82b); + sha->h[3] = UINT64_C(0xa54ff53a5f1d36f1); + sha->h[4] = UINT64_C(0x510e527fade682d1); + sha->h[5] = UINT64_C(0x9b05688c2b3e6c1f); + sha->h[6] = UINT64_C(0x1f83d9abfb41bd6b); + sha->h[7] = UINT64_C(0x5be0cd19137e2179); + + sha->bytes_so_far_low = 0; + sha->bytes_so_far_high = 0; + sha->num = 0; + sha->md_len = SHA512_DIGEST_LENGTH; + return bcm_infallible::approved; +} + +bcm_infallible bssl::BCM_sha512_256_init(SHA512_CTX *sha) { + sha->h[0] = UINT64_C(0x22312194fc2bf72c); + sha->h[1] = UINT64_C(0x9f555fa3c84c64c2); + sha->h[2] = UINT64_C(0x2393b86b6f53b151); + sha->h[3] = UINT64_C(0x963877195940eabd); + sha->h[4] = UINT64_C(0x96283ee2a88effe3); + sha->h[5] = UINT64_C(0xbe5e1e2553863992); + sha->h[6] = UINT64_C(0x2b0199fc2c85b8aa); + sha->h[7] = UINT64_C(0x0eb72ddc81c52ca2); + + sha->bytes_so_far_low = 0; + sha->bytes_so_far_high = 0; + sha->num = 0; + sha->md_len = SHA512_256_DIGEST_LENGTH; + return bcm_infallible::approved; +} + +#if !defined(SHA512_ASM) +static void sha512_block_data_order(uint64_t 
state[8], const uint8_t *in, + size_t num_blocks); +#endif + + +bcm_infallible bssl::BCM_sha384_final(uint8_t out[SHA384_DIGEST_LENGTH], + SHA512_CTX *sha) { + // This function must be paired with |BCM_sha384_init|, which sets + // |sha->md_len| to |SHA384_DIGEST_LENGTH|. + assert(sha->md_len == SHA384_DIGEST_LENGTH); + sha512_final_impl(out, SHA384_DIGEST_LENGTH, sha); + return bcm_infallible::approved; +} + +bcm_infallible bssl::BCM_sha384_update(SHA512_CTX *sha, const void *data, + size_t len) { + return BCM_sha512_update(sha, data, len); +} + +bcm_infallible bssl::BCM_sha512_256_update(SHA512_CTX *sha, const void *data, + size_t len) { + return BCM_sha512_update(sha, data, len); +} + +bcm_infallible bssl::BCM_sha512_256_final(uint8_t out[SHA512_256_DIGEST_LENGTH], + SHA512_CTX *sha) { + // This function must be paired with |BCM_sha512_256_init|, which sets + // |sha->md_len| to |SHA512_256_DIGEST_LENGTH|. + assert(sha->md_len == SHA512_256_DIGEST_LENGTH); + sha512_final_impl(out, SHA512_256_DIGEST_LENGTH, sha); + return bcm_infallible::approved; +} + +bcm_infallible bssl::BCM_sha512_transform(SHA512_CTX *c, + const uint8_t block[SHA512_CBLOCK]) { + sha512_block_data_order(c->h, block, 1); + return bcm_infallible::approved; +} + +bcm_infallible bssl::BCM_sha512_update(SHA512_CTX *c, const void *in_data, + size_t len) { + uint8_t *p = c->p; + const uint8_t *data = reinterpret_cast(in_data); + + if (len == 0) { + return bcm_infallible::approved; + } + + c->bytes_so_far_low += len; + if (c->bytes_so_far_low < len) { + c->bytes_so_far_high++; + } + + if (c->num != 0) { + size_t n = sizeof(c->p) - c->num; + + if (len < n) { + OPENSSL_memcpy(p + c->num, data, len); + c->num += (unsigned int)len; + return bcm_infallible::approved; + } else { + OPENSSL_memcpy(p + c->num, data, n), c->num = 0; + len -= n; + data += n; + sha512_block_data_order(c->h, p, 1); + } + } + + if (len >= sizeof(c->p)) { + sha512_block_data_order(c->h, data, len / sizeof(c->p)); + data += len; + 
len %= sizeof(c->p); + data -= len; + } + + if (len != 0) { + OPENSSL_memcpy(p, data, len); + c->num = (int)len; + } + + return bcm_infallible::approved; +} + +bcm_infallible bssl::BCM_sha512_final(uint8_t out[SHA512_DIGEST_LENGTH], + SHA512_CTX *sha) { + // Ideally we would assert |sha->md_len| is |SHA512_DIGEST_LENGTH| to match + // the size hint, but calling code often pairs |BCM_sha384_init| with + // |BCM_sha512_final| and expects |sha->md_len| to carry the size over. + // + // TODO(davidben): Add an assert and fix code to match them up. + sha512_final_impl(out, sha->md_len, sha); + return bcm_infallible::approved; +} + +static void sha512_final_impl(uint8_t *out, size_t md_len, SHA512_CTX *sha) { + uint8_t *p = sha->p; + size_t n = sha->num; + + p[n] = 0x80; // There always is a room for one + n++; + if (n > (sizeof(sha->p) - 16)) { + OPENSSL_memset(p + n, 0, sizeof(sha->p) - n); + n = 0; + sha512_block_data_order(sha->h, p, 1); + } + + OPENSSL_memset(p + n, 0, sizeof(sha->p) - 16 - n); + const uint64_t Nh = (uint64_t{sha->bytes_so_far_high} << 3) | + (sha->bytes_so_far_low >> (64 - 3)); + const uint64_t Nl = sha->bytes_so_far_low << 3; + CRYPTO_store_u64_be(p + sizeof(sha->p) - 16, Nh); + CRYPTO_store_u64_be(p + sizeof(sha->p) - 8, Nl); + + sha512_block_data_order(sha->h, p, 1); + + assert(md_len % 8 == 0); + const size_t out_words = md_len / 8; + for (size_t i = 0; i < out_words; i++) { + CRYPTO_store_u64_be(out, sha->h[i]); + out += 8; + } + + FIPS_service_indicator_update_state(); +} + +#if !defined(SHA512_ASM) + +#if !defined(SHA512_ASM_NOHW) +static const uint64_t K512[80] = { + UINT64_C(0x428a2f98d728ae22), UINT64_C(0x7137449123ef65cd), + UINT64_C(0xb5c0fbcfec4d3b2f), UINT64_C(0xe9b5dba58189dbbc), + UINT64_C(0x3956c25bf348b538), UINT64_C(0x59f111f1b605d019), + UINT64_C(0x923f82a4af194f9b), UINT64_C(0xab1c5ed5da6d8118), + UINT64_C(0xd807aa98a3030242), UINT64_C(0x12835b0145706fbe), + UINT64_C(0x243185be4ee4b28c), UINT64_C(0x550c7dc3d5ffb4e2), + 
UINT64_C(0x72be5d74f27b896f), UINT64_C(0x80deb1fe3b1696b1), + UINT64_C(0x9bdc06a725c71235), UINT64_C(0xc19bf174cf692694), + UINT64_C(0xe49b69c19ef14ad2), UINT64_C(0xefbe4786384f25e3), + UINT64_C(0x0fc19dc68b8cd5b5), UINT64_C(0x240ca1cc77ac9c65), + UINT64_C(0x2de92c6f592b0275), UINT64_C(0x4a7484aa6ea6e483), + UINT64_C(0x5cb0a9dcbd41fbd4), UINT64_C(0x76f988da831153b5), + UINT64_C(0x983e5152ee66dfab), UINT64_C(0xa831c66d2db43210), + UINT64_C(0xb00327c898fb213f), UINT64_C(0xbf597fc7beef0ee4), + UINT64_C(0xc6e00bf33da88fc2), UINT64_C(0xd5a79147930aa725), + UINT64_C(0x06ca6351e003826f), UINT64_C(0x142929670a0e6e70), + UINT64_C(0x27b70a8546d22ffc), UINT64_C(0x2e1b21385c26c926), + UINT64_C(0x4d2c6dfc5ac42aed), UINT64_C(0x53380d139d95b3df), + UINT64_C(0x650a73548baf63de), UINT64_C(0x766a0abb3c77b2a8), + UINT64_C(0x81c2c92e47edaee6), UINT64_C(0x92722c851482353b), + UINT64_C(0xa2bfe8a14cf10364), UINT64_C(0xa81a664bbc423001), + UINT64_C(0xc24b8b70d0f89791), UINT64_C(0xc76c51a30654be30), + UINT64_C(0xd192e819d6ef5218), UINT64_C(0xd69906245565a910), + UINT64_C(0xf40e35855771202a), UINT64_C(0x106aa07032bbd1b8), + UINT64_C(0x19a4c116b8d2d0c8), UINT64_C(0x1e376c085141ab53), + UINT64_C(0x2748774cdf8eeb99), UINT64_C(0x34b0bcb5e19b48a8), + UINT64_C(0x391c0cb3c5c95a63), UINT64_C(0x4ed8aa4ae3418acb), + UINT64_C(0x5b9cca4f7763e373), UINT64_C(0x682e6ff3d6b2b8a3), + UINT64_C(0x748f82ee5defb2fc), UINT64_C(0x78a5636f43172f60), + UINT64_C(0x84c87814a1f0ab72), UINT64_C(0x8cc702081a6439ec), + UINT64_C(0x90befffa23631e28), UINT64_C(0xa4506cebde82bde9), + UINT64_C(0xbef9a3f7b2c67915), UINT64_C(0xc67178f2e372532b), + UINT64_C(0xca273eceea26619c), UINT64_C(0xd186b8c721c0c207), + UINT64_C(0xeada7dd6cde0eb1e), UINT64_C(0xf57d4f7fee6ed178), + UINT64_C(0x06f067aa72176fba), UINT64_C(0x0a637dc5a2c898a6), + UINT64_C(0x113f9804bef90dae), UINT64_C(0x1b710b35131c471b), + UINT64_C(0x28db77f523047d84), UINT64_C(0x32caab7b40c72493), + UINT64_C(0x3c9ebe0a15c9bebc), UINT64_C(0x431d67c49c100d4c), + 
UINT64_C(0x4cc5d4becb3e42b6), UINT64_C(0x597f299cfc657e2a), + UINT64_C(0x5fcb6fab3ad6faec), UINT64_C(0x6c44198c4a475817), +}; + +#define Sigma0(x) \ + (CRYPTO_rotr_u64((x), 28) ^ CRYPTO_rotr_u64((x), 34) ^ \ + CRYPTO_rotr_u64((x), 39)) +#define Sigma1(x) \ + (CRYPTO_rotr_u64((x), 14) ^ CRYPTO_rotr_u64((x), 18) ^ \ + CRYPTO_rotr_u64((x), 41)) +#define sigma0(x) \ + (CRYPTO_rotr_u64((x), 1) ^ CRYPTO_rotr_u64((x), 8) ^ ((x) >> 7)) +#define sigma1(x) \ + (CRYPTO_rotr_u64((x), 19) ^ CRYPTO_rotr_u64((x), 61) ^ ((x) >> 6)) + +#define Ch(x, y, z) (((x) & (y)) ^ ((~(x)) & (z))) +#define Maj(x, y, z) (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z))) + + +#if defined(__i386) || defined(__i386__) || defined(_M_IX86) +// This code should give better results on 32-bit CPU with less than +// ~24 registers, both size and performance wise... +static void sha512_block_data_order_nohw(uint64_t state[8], const uint8_t *in, + size_t num) { + uint64_t A, E, T; + uint64_t X[9 + 80], *F; + int i; + + while (num--) { + F = X + 80; + A = state[0]; + F[1] = state[1]; + F[2] = state[2]; + F[3] = state[3]; + E = state[4]; + F[5] = state[5]; + F[6] = state[6]; + F[7] = state[7]; + + for (i = 0; i < 16; i++, F--) { + T = CRYPTO_load_u64_be(in + i * 8); + F[0] = A; + F[4] = E; + F[8] = T; + T += F[7] + Sigma1(E) + Ch(E, F[5], F[6]) + K512[i]; + E = F[3] + T; + A = T + Sigma0(A) + Maj(A, F[1], F[2]); + } + + for (; i < 80; i++, F--) { + T = sigma0(F[8 + 16 - 1]); + T += sigma1(F[8 + 16 - 14]); + T += F[8 + 16] + F[8 + 16 - 9]; + + F[0] = A; + F[4] = E; + F[8] = T; + T += F[7] + Sigma1(E) + Ch(E, F[5], F[6]) + K512[i]; + E = F[3] + T; + A = T + Sigma0(A) + Maj(A, F[1], F[2]); + } + + state[0] += A; + state[1] += F[1]; + state[2] += F[2]; + state[3] += F[3]; + state[4] += E; + state[5] += F[5]; + state[6] += F[6]; + state[7] += F[7]; + + in += 16 * 8; + } +} + +#else + +#define ROUND_00_15(i, a, b, c, d, e, f, g, h) \ + do { \ + T1 += h + Sigma1(e) + Ch(e, f, g) + K512[i]; \ + h = Sigma0(a) + Maj(a, b, c); 
\ + d += T1; \ + h += T1; \ + } while (0) + +#define ROUND_16_80(i, j, a, b, c, d, e, f, g, h, X) \ + do { \ + s0 = X[(j + 1) & 0x0f]; \ + s0 = sigma0(s0); \ + s1 = X[(j + 14) & 0x0f]; \ + s1 = sigma1(s1); \ + T1 = X[(j) & 0x0f] += s0 + s1 + X[(j + 9) & 0x0f]; \ + ROUND_00_15(i + j, a, b, c, d, e, f, g, h); \ + } while (0) + +static void sha512_block_data_order_nohw(uint64_t state[8], const uint8_t *in, + size_t num) { + uint64_t a, b, c, d, e, f, g, h, s0, s1, T1; + uint64_t X[16]; + int i; + + while (num--) { + a = state[0]; + b = state[1]; + c = state[2]; + d = state[3]; + e = state[4]; + f = state[5]; + g = state[6]; + h = state[7]; + + T1 = X[0] = CRYPTO_load_u64_be(in); + ROUND_00_15(0, a, b, c, d, e, f, g, h); + T1 = X[1] = CRYPTO_load_u64_be(in + 8); + ROUND_00_15(1, h, a, b, c, d, e, f, g); + T1 = X[2] = CRYPTO_load_u64_be(in + 2 * 8); + ROUND_00_15(2, g, h, a, b, c, d, e, f); + T1 = X[3] = CRYPTO_load_u64_be(in + 3 * 8); + ROUND_00_15(3, f, g, h, a, b, c, d, e); + T1 = X[4] = CRYPTO_load_u64_be(in + 4 * 8); + ROUND_00_15(4, e, f, g, h, a, b, c, d); + T1 = X[5] = CRYPTO_load_u64_be(in + 5 * 8); + ROUND_00_15(5, d, e, f, g, h, a, b, c); + T1 = X[6] = CRYPTO_load_u64_be(in + 6 * 8); + ROUND_00_15(6, c, d, e, f, g, h, a, b); + T1 = X[7] = CRYPTO_load_u64_be(in + 7 * 8); + ROUND_00_15(7, b, c, d, e, f, g, h, a); + T1 = X[8] = CRYPTO_load_u64_be(in + 8 * 8); + ROUND_00_15(8, a, b, c, d, e, f, g, h); + T1 = X[9] = CRYPTO_load_u64_be(in + 9 * 8); + ROUND_00_15(9, h, a, b, c, d, e, f, g); + T1 = X[10] = CRYPTO_load_u64_be(in + 10 * 8); + ROUND_00_15(10, g, h, a, b, c, d, e, f); + T1 = X[11] = CRYPTO_load_u64_be(in + 11 * 8); + ROUND_00_15(11, f, g, h, a, b, c, d, e); + T1 = X[12] = CRYPTO_load_u64_be(in + 12 * 8); + ROUND_00_15(12, e, f, g, h, a, b, c, d); + T1 = X[13] = CRYPTO_load_u64_be(in + 13 * 8); + ROUND_00_15(13, d, e, f, g, h, a, b, c); + T1 = X[14] = CRYPTO_load_u64_be(in + 14 * 8); + ROUND_00_15(14, c, d, e, f, g, h, a, b); + T1 = X[15] = 
CRYPTO_load_u64_be(in + 15 * 8); + ROUND_00_15(15, b, c, d, e, f, g, h, a); + + for (i = 16; i < 80; i += 16) { + ROUND_16_80(i, 0, a, b, c, d, e, f, g, h, X); + ROUND_16_80(i, 1, h, a, b, c, d, e, f, g, X); + ROUND_16_80(i, 2, g, h, a, b, c, d, e, f, X); + ROUND_16_80(i, 3, f, g, h, a, b, c, d, e, X); + ROUND_16_80(i, 4, e, f, g, h, a, b, c, d, X); + ROUND_16_80(i, 5, d, e, f, g, h, a, b, c, X); + ROUND_16_80(i, 6, c, d, e, f, g, h, a, b, X); + ROUND_16_80(i, 7, b, c, d, e, f, g, h, a, X); + ROUND_16_80(i, 8, a, b, c, d, e, f, g, h, X); + ROUND_16_80(i, 9, h, a, b, c, d, e, f, g, X); + ROUND_16_80(i, 10, g, h, a, b, c, d, e, f, X); + ROUND_16_80(i, 11, f, g, h, a, b, c, d, e, X); + ROUND_16_80(i, 12, e, f, g, h, a, b, c, d, X); + ROUND_16_80(i, 13, d, e, f, g, h, a, b, c, X); + ROUND_16_80(i, 14, c, d, e, f, g, h, a, b, X); + ROUND_16_80(i, 15, b, c, d, e, f, g, h, a, X); + } + + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + state[4] += e; + state[5] += f; + state[6] += g; + state[7] += h; + + in += 16 * 8; + } +} + +#endif + +#endif // !SHA512_ASM_NOHW + +static void sha512_block_data_order(uint64_t state[8], const uint8_t *data, + size_t num) { +#if defined(SHA512_ASM_HW) + if (sha512_hw_capable()) { + sha512_block_data_order_hw(state, data, num); + return; + } +#endif +#if defined(SHA512_ASM_AVX) + if (sha512_avx_capable()) { + sha512_block_data_order_avx(state, data, num); + return; + } +#endif +#if defined(SHA512_ASM_SSSE3) + if (sha512_ssse3_capable()) { + sha512_block_data_order_ssse3(state, data, num); + return; + } +#endif +#if defined(SHA512_ASM_NEON) + if (CRYPTO_is_NEON_capable()) { + sha512_block_data_order_neon(state, data, num); + return; + } +#endif + sha512_block_data_order_nohw(state, data, num); +} + +#endif // !SHA512_ASM + +#undef Sigma0 +#undef Sigma1 +#undef sigma0 +#undef sigma1 +#undef Ch +#undef Maj +#undef ROUND_00_15 +#undef ROUND_16_80 diff --git a/third_party/boringssl/src/crypto/fipsmodule/slhdsa/address.h 
b/third_party/boringssl/src/crypto/fipsmodule/slhdsa/address.h new file mode 100644 index 00000000..4e092b91 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/slhdsa/address.h @@ -0,0 +1,144 @@ +// Copyright 2024 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_CRYPTO_FIPSMODULE_SLHDSA_ADDRESS_H +#define OPENSSL_HEADER_CRYPTO_FIPSMODULE_SLHDSA_ADDRESS_H + +#include + +#include "../../internal.h" +#include "./params.h" + + +BSSL_NAMESPACE_BEGIN + +#define SLHDSA_ADDR_TYPE_WOTS 0 +#define SLHDSA_ADDR_TYPE_WOTSPK 1 +#define SLHDSA_ADDR_TYPE_HASHTREE 2 +#define SLHDSA_ADDR_TYPE_FORSTREE 3 +#define SLHDSA_ADDR_TYPE_FORSPK 4 +#define SLHDSA_ADDR_TYPE_WOTSPRF 5 +#define SLHDSA_ADDR_TYPE_FORSPRF 6 + +inline void slhdsa_set_chain_addr(const slh_dsa_config *config, + uint8_t addr[32], uint32_t chain) { + if (config->compressed_addresses) { + bssl::CRYPTO_store_u32_be(addr + SLHDSA_ADDR_COMP_OFFSET_CHAIN, chain); + } else { + bssl::CRYPTO_store_u32_be(addr + SLHDSA_ADDR_FULL_OFFSET_CHAIN, chain); + } +} + +inline void slhdsa_set_hash_addr(const slh_dsa_config *config, uint8_t addr[32], + uint32_t hash) { + if (config->compressed_addresses) { + bssl::CRYPTO_store_u32_be(addr + SLHDSA_ADDR_COMP_OFFSET_HASH, hash); + } else { + bssl::CRYPTO_store_u32_be(addr + SLHDSA_ADDR_FULL_OFFSET_HASH, hash); + } +} + +inline void slhdsa_set_keypair_addr(const slh_dsa_config *config, + uint8_t addr[32], uint32_t keypair) { 
+ if (config->compressed_addresses) { + bssl::CRYPTO_store_u32_be(addr + SLHDSA_ADDR_COMP_OFFSET_KEYPAIR, keypair); + } else { + bssl::CRYPTO_store_u32_be(addr + SLHDSA_ADDR_FULL_OFFSET_KEYPAIR, keypair); + } +} + +inline void slhdsa_copy_keypair_addr(const slh_dsa_config *config, + uint8_t out[32], const uint8_t in[32]) { + if (config->compressed_addresses) { + bssl::OPENSSL_memcpy(out, in, SLHDSA_ADDR_COMP_OFFSET_TYPE); + bssl::OPENSSL_memcpy(out + SLHDSA_ADDR_COMP_OFFSET_KEYPAIR, + in + SLHDSA_ADDR_COMP_OFFSET_KEYPAIR, 4); + } else { + bssl::OPENSSL_memcpy(out, in, SLHDSA_ADDR_FULL_OFFSET_TYPE); + bssl::OPENSSL_memcpy(out + SLHDSA_ADDR_FULL_OFFSET_KEYPAIR, + in + SLHDSA_ADDR_FULL_OFFSET_KEYPAIR, 4); + } +} + +inline void slhdsa_set_layer_addr(const slh_dsa_config *config, + uint8_t addr[32], uint32_t layer) { + if (config->compressed_addresses) { + addr[SLHDSA_ADDR_COMP_OFFSET_LAYER] = (uint8_t)layer; + } else { + bssl::CRYPTO_store_u32_be(addr + SLHDSA_ADDR_FULL_OFFSET_LAYER, layer); + } +} + +inline void slhdsa_set_tree_addr(const slh_dsa_config *config, uint8_t addr[32], + uint64_t tree) { + if (config->compressed_addresses) { + bssl::CRYPTO_store_u64_be(addr + SLHDSA_ADDR_COMP_OFFSET_TREE, tree); + } else { + // The tree address is 12 bytes in this configuration. Just zero the top + // four bytes. + bssl::OPENSSL_memset(addr + SLHDSA_ADDR_FULL_OFFSET_TREE, 0, 4); + bssl::CRYPTO_store_u64_be(addr + SLHDSA_ADDR_FULL_OFFSET_TREE + 4, tree); + } +} + +inline void slhdsa_set_type(const slh_dsa_config *config, uint8_t addr[32], + uint32_t type) { + // FIPS 205 relies on this setting parts of the address to 0, so we do it + // here to avoid confusion. + // + // The behavior here is only correct for the SHA-2 instantiations. 
+ if (config->compressed_addresses) { + bssl::OPENSSL_memset(addr + SLHDSA_ADDR_COMP_ZERO_START, 0, + SLHDSA_ADDR_COMP_ZERO_LEN); + addr[SLHDSA_ADDR_COMP_OFFSET_TYPE] = (uint8_t)type; + } else { + bssl::OPENSSL_memset(addr + SLHDSA_ADDR_FULL_ZERO_START, 0, + SLHDSA_ADDR_FULL_ZERO_LEN); + bssl::CRYPTO_store_u32_be(addr + SLHDSA_ADDR_FULL_OFFSET_TYPE, type); + } +} + +inline void slhdsa_set_tree_height(const slh_dsa_config *config, + uint8_t addr[32], uint32_t tree_height) { + if (config->compressed_addresses) { + bssl::CRYPTO_store_u32_be(addr + SLHDSA_ADDR_COMP_OFFSET_TREE_HEIGHT, + tree_height); + } else { + bssl::CRYPTO_store_u32_be(addr + SLHDSA_ADDR_FULL_OFFSET_TREE_HEIGHT, + tree_height); + } +} + +inline void slhdsa_set_tree_index(const slh_dsa_config *config, + uint8_t addr[32], uint32_t tree_index) { + if (config->compressed_addresses) { + bssl::CRYPTO_store_u32_be(addr + SLHDSA_ADDR_COMP_OFFSET_TREE_INDEX, + tree_index); + } else { + bssl::CRYPTO_store_u32_be(addr + SLHDSA_ADDR_FULL_OFFSET_TREE_INDEX, + tree_index); + } +} + +inline uint32_t slhdsa_get_tree_index(const slh_dsa_config *config, + uint8_t addr[32]) { + if (config->compressed_addresses) { + return bssl::CRYPTO_load_u32_be(addr + SLHDSA_ADDR_COMP_OFFSET_TREE_INDEX); + } + return bssl::CRYPTO_load_u32_be(addr + SLHDSA_ADDR_FULL_OFFSET_TREE_INDEX); +} + +BSSL_NAMESPACE_END + +#endif // OPENSSL_HEADER_CRYPTO_FIPSMODULE_SLHDSA_ADDRESS_H diff --git a/third_party/boringssl/src/crypto/fipsmodule/slhdsa/fips_known_values.inc b/third_party/boringssl/src/crypto/fipsmodule/slhdsa/fips_known_values.inc new file mode 100644 index 00000000..5bff695a --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/slhdsa/fips_known_values.inc @@ -0,0 +1,674 @@ +const uint8_t kExpectedPublicKey[BCM_SLHDSA_SHA2_128S_PUBLIC_KEY_BYTES] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0xbe, 0x6b, 0xd7, 0xe8, 0xe1, 0x98, + 0xea, 0xf6, 0x2d, 0x57, 0x2f, 0x13, 
0xfc, 0x79, 0xf2, 0x6f}; + +const uint8_t kExpectedPrivateKey[BCM_SLHDSA_SHA2_128S_PRIVATE_KEY_BYTES] = { + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0xbe, 0x6b, 0xd7, 0xe8, 0xe1, 0x98, 0xea, + 0xf6, 0x2d, 0x57, 0x2f, 0x13, 0xfc, 0x79, 0xf2, 0x6f}; + +const uint8_t kExpectedSignatureSHA256[32] = { + 0x82, 0xd4, 0x09, 0x74, 0x4d, 0x97, 0xae, 0x30, 0x53, 0x18, 0x46, + 0x9f, 0x7b, 0x85, 0x7b, 0x91, 0xd4, 0xe3, 0x33, 0x10, 0xb7, 0x09, + 0xb5, 0x50, 0xa7, 0xc4, 0x8a, 0x46, 0x09, 0x4e, 0xc9, 0xd4}; + +const uint8_t kExpectedSignature[BCM_SLHDSA_SHA2_128S_SIGNATURE_BYTES] = { + 0x3f, 0xd6, 0x91, 0x93, 0xee, 0x97, 0x08, 0xbd, 0xea, 0x11, 0x0b, 0xa2, + 0x9f, 0x23, 0x5f, 0xf2, 0xec, 0x98, 0x88, 0xd1, 0x27, 0x61, 0xf8, 0x4d, + 0xc6, 0xe3, 0xf0, 0xd7, 0xeb, 0x48, 0xd0, 0x5c, 0xac, 0xf6, 0xe8, 0x7f, + 0xb7, 0xe9, 0x58, 0xf2, 0x21, 0x47, 0x21, 0x64, 0x03, 0xf1, 0xcd, 0x17, + 0xab, 0x8d, 0xfb, 0x3d, 0xf1, 0x60, 0xd8, 0xc5, 0xaa, 0x43, 0xcf, 0x56, + 0x2d, 0x82, 0x61, 0x99, 0xcc, 0x72, 0x56, 0x12, 0x3f, 0x06, 0xdb, 0x84, + 0x2d, 0xd8, 0x31, 0x7e, 0x60, 0x1e, 0x4d, 0x8c, 0xf2, 0x15, 0x57, 0xa3, + 0x78, 0x90, 0x59, 0xbe, 0x6a, 0x7e, 0xc7, 0xc7, 0xe2, 0x6d, 0x6c, 0x81, + 0xf1, 0xc7, 0xc0, 0x2a, 0x98, 0xdd, 0x64, 0x05, 0x86, 0x99, 0x1f, 0x09, + 0x16, 0xef, 0xad, 0xd6, 0x13, 0x7a, 0xf7, 0x8c, 0x38, 0xd3, 0x2e, 0xde, + 0x10, 0xd6, 0x6e, 0x38, 0x24, 0xa2, 0x24, 0x40, 0xf5, 0x7c, 0x63, 0x42, + 0xf1, 0xab, 0x72, 0x01, 0xc4, 0x54, 0xf6, 0x66, 0x34, 0x32, 0x39, 0xbb, + 0x35, 0x13, 0x45, 0xd1, 0xea, 0x6d, 0x87, 0xff, 0x8b, 0x4a, 0x65, 0x7f, + 0x84, 0xbd, 0xb1, 0x03, 0xe1, 0xd7, 0x43, 0xea, 0xbe, 0xe3, 0x78, 0xa5, + 0xc5, 0xf5, 0xf9, 0xb2, 0x00, 0xe3, 0x13, 0x06, 0x36, 0xf3, 0xe8, 0xee, + 0x07, 0xf2, 0x03, 0x48, 0xfd, 0x51, 0x1b, 0xb5, 0x10, 
0xaa, 0x1d, 0xa9, + 0x99, 0xad, 0x10, 0xc4, 0x59, 0xbf, 0x66, 0xdd, 0x27, 0x8d, 0xd8, 0x6a, + 0x53, 0xe5, 0x9d, 0x1a, 0xc1, 0x17, 0x36, 0x2e, 0x2e, 0xaf, 0xc0, 0xc8, + 0x96, 0x89, 0xbc, 0x4e, 0x7c, 0xa5, 0xcc, 0xa9, 0x2a, 0x76, 0xc3, 0x1d, + 0xa0, 0xa1, 0xa0, 0x27, 0xbf, 0x93, 0x21, 0xf0, 0x67, 0x7c, 0x10, 0x5f, + 0x7f, 0x3c, 0x68, 0xbd, 0x7b, 0x2d, 0x0a, 0x4a, 0xde, 0x1b, 0xd4, 0xaa, + 0x85, 0x5c, 0xa1, 0x56, 0x9e, 0x61, 0x6c, 0x97, 0x65, 0xae, 0xc4, 0x13, + 0xa8, 0x1a, 0x82, 0x87, 0x66, 0x1b, 0x72, 0x27, 0x43, 0xca, 0xb8, 0xc6, + 0x4e, 0xe9, 0xbe, 0x4a, 0x77, 0x66, 0xea, 0xf4, 0xa0, 0x71, 0x94, 0x52, + 0xee, 0xc6, 0xef, 0x38, 0x7b, 0x24, 0x94, 0xc3, 0x7a, 0xc9, 0x35, 0x93, + 0x5b, 0x19, 0xc5, 0x4d, 0xf1, 0x7d, 0xb0, 0xb9, 0x60, 0xaf, 0x1e, 0xa9, + 0x24, 0x85, 0x92, 0x50, 0x78, 0x13, 0x1d, 0xb6, 0x6d, 0x4a, 0x4c, 0x1b, + 0xa0, 0x51, 0x86, 0xc9, 0xb4, 0x79, 0xe5, 0x09, 0x4e, 0x61, 0xf4, 0x80, + 0x07, 0x04, 0xd6, 0x44, 0x7b, 0xf6, 0xc0, 0x90, 0xc0, 0x46, 0xce, 0xa3, + 0x92, 0x1a, 0xfd, 0x85, 0xd7, 0xbc, 0xd5, 0x60, 0x0d, 0xcb, 0xc3, 0x02, + 0xbc, 0x82, 0x75, 0xe4, 0x1f, 0xc4, 0xce, 0x5f, 0x35, 0x6b, 0xdf, 0x31, + 0x1e, 0x39, 0x18, 0xcd, 0x4c, 0xc7, 0xcc, 0x1e, 0x20, 0x19, 0x4a, 0x87, + 0xf4, 0x07, 0xb3, 0x15, 0x02, 0xe4, 0x1a, 0x63, 0x1c, 0xa1, 0x3c, 0x0b, + 0xae, 0x33, 0x2c, 0x9a, 0xc4, 0x23, 0x02, 0x44, 0x96, 0x14, 0x09, 0x64, + 0xea, 0x84, 0x08, 0x52, 0xed, 0x26, 0x2c, 0x29, 0xc0, 0x56, 0x34, 0xaa, + 0x20, 0xba, 0x44, 0x39, 0x8d, 0x3e, 0xce, 0xed, 0x6a, 0x3f, 0xca, 0xcd, + 0xd3, 0xbe, 0xa0, 0xee, 0x49, 0xb4, 0xdc, 0x24, 0xa7, 0xe1, 0x7a, 0x89, + 0x8f, 0x75, 0xb7, 0x19, 0x71, 0xc6, 0x5e, 0x14, 0x9c, 0xbc, 0xde, 0xd7, + 0x52, 0xd9, 0x32, 0x6a, 0xd3, 0xd4, 0xc5, 0x5f, 0x39, 0xf0, 0xd4, 0xda, + 0xe2, 0x50, 0xfc, 0x14, 0x73, 0xd2, 0xc2, 0x28, 0xd0, 0x84, 0x46, 0xed, + 0xe9, 0x7c, 0x40, 0x16, 0xec, 0x72, 0xbf, 0x77, 0xef, 0xac, 0xe8, 0xce, + 0xbf, 0x58, 0x65, 0x29, 0x9d, 0x0d, 0xe2, 0xc9, 0xbf, 0x19, 0xdd, 0x9b, + 0x8c, 0xb9, 0x1c, 0x32, 0x24, 0x3b, 0xa5, 0x7a, 0x1f, 
0x78, 0x85, 0x77, + 0xb0, 0x6c, 0x42, 0xb8, 0xc0, 0x75, 0xf5, 0xc0, 0x45, 0xe3, 0x44, 0x7c, + 0xbd, 0x75, 0x6c, 0x5b, 0xe4, 0x91, 0x72, 0x9a, 0x5f, 0xe7, 0x1e, 0xe9, + 0xa0, 0x6f, 0x26, 0xbb, 0x31, 0x9d, 0xe4, 0xec, 0x32, 0xae, 0x8d, 0xae, + 0x7d, 0x14, 0x33, 0x04, 0x19, 0x26, 0x42, 0x09, 0x91, 0x0c, 0x5f, 0xaa, + 0x1a, 0x74, 0xfa, 0x31, 0x67, 0xad, 0x71, 0x49, 0xd8, 0xf3, 0xe7, 0x8c, + 0x91, 0x38, 0x0e, 0x10, 0x5b, 0xe5, 0x1e, 0xd2, 0x9a, 0x0c, 0x4d, 0x7b, + 0x60, 0xac, 0x6a, 0x94, 0x9d, 0x4f, 0x8c, 0xd0, 0x1d, 0x2f, 0xf7, 0x3d, + 0x06, 0x46, 0xc8, 0xd6, 0x25, 0x86, 0xfe, 0x67, 0xf2, 0x01, 0xe1, 0x73, + 0x02, 0xa1, 0xdd, 0xf2, 0x6d, 0xe6, 0x9d, 0xb9, 0x83, 0x21, 0x38, 0x74, + 0x04, 0xec, 0xe3, 0x9e, 0x21, 0xc0, 0xed, 0x91, 0x47, 0x46, 0x44, 0x6e, + 0xf7, 0x76, 0xe7, 0xad, 0xbe, 0xef, 0xfe, 0xcd, 0xda, 0x1d, 0x60, 0xc8, + 0x5f, 0x92, 0xd7, 0x21, 0xb4, 0x3c, 0x11, 0x5d, 0x24, 0x50, 0x44, 0xb2, + 0xf8, 0x59, 0x7e, 0xc4, 0x7a, 0x25, 0xea, 0x19, 0x4f, 0x1c, 0x8e, 0xb5, + 0x0d, 0xd6, 0x7a, 0xe6, 0x0a, 0x11, 0x1f, 0x6b, 0x28, 0x9f, 0xc0, 0x02, + 0xc6, 0x1b, 0xca, 0x06, 0xdd, 0x59, 0xa3, 0xe0, 0xd0, 0x25, 0x3e, 0x10, + 0x0a, 0x49, 0x54, 0xdc, 0xe5, 0x07, 0x8b, 0xcc, 0xa6, 0x3d, 0x41, 0xe5, + 0x00, 0x70, 0xf7, 0x13, 0xed, 0x63, 0x7a, 0x4b, 0x69, 0x21, 0xaa, 0xf2, + 0xd5, 0xf7, 0xa4, 0x9c, 0x52, 0x51, 0xef, 0xb9, 0xef, 0xee, 0x3b, 0xa4, + 0xdf, 0x16, 0x29, 0x08, 0x0f, 0x6f, 0xbd, 0x4e, 0x53, 0xa2, 0x5c, 0x75, + 0x5f, 0x04, 0x5a, 0xdf, 0x48, 0xd6, 0xeb, 0xf9, 0x80, 0x85, 0xd3, 0x1d, + 0x7f, 0x23, 0x01, 0x04, 0x8b, 0x78, 0xba, 0xba, 0xc3, 0x74, 0xbe, 0x7d, + 0x5b, 0x3e, 0x65, 0x3d, 0xcd, 0x5d, 0x96, 0xe3, 0x40, 0x40, 0x7f, 0x50, + 0xfe, 0xa0, 0x53, 0xa6, 0xd9, 0x71, 0x15, 0x33, 0x39, 0x24, 0x83, 0xee, + 0x69, 0x6f, 0x3c, 0x0a, 0x5e, 0xdc, 0x83, 0xb9, 0x8a, 0x1f, 0x7f, 0x67, + 0x39, 0xb2, 0x58, 0xe4, 0x48, 0xd0, 0xac, 0x75, 0xba, 0x57, 0x13, 0x37, + 0xaa, 0x0b, 0xd5, 0x02, 0x89, 0x01, 0x5f, 0xe8, 0x75, 0xcc, 0x2d, 0x76, + 0x05, 0xde, 0x57, 0xa2, 0xac, 0x48, 0x95, 0xce, 0xa7, 
0x1d, 0xf6, 0xdb, + 0x64, 0xff, 0xde, 0x85, 0x7e, 0x13, 0xb9, 0x58, 0x4f, 0xdd, 0xf0, 0xcb, + 0x9a, 0x31, 0x7e, 0xc5, 0x61, 0x41, 0x97, 0x56, 0xe4, 0xfa, 0xb1, 0xcb, + 0x1e, 0xb3, 0xfd, 0x9c, 0xfd, 0xfd, 0x81, 0x84, 0x78, 0xca, 0xbd, 0xf1, + 0x17, 0xec, 0xcb, 0x49, 0xa7, 0x8e, 0xe8, 0xc6, 0xc5, 0x9c, 0x30, 0xf4, + 0xc5, 0xac, 0x2a, 0x0e, 0x36, 0x72, 0x03, 0x7d, 0xf2, 0xac, 0xf9, 0x88, + 0x84, 0xb6, 0xae, 0x16, 0x77, 0x96, 0x81, 0x7b, 0xfc, 0x17, 0x07, 0x8f, + 0xc2, 0x74, 0xd9, 0xce, 0x57, 0xfc, 0x33, 0x53, 0xec, 0x85, 0x0e, 0x55, + 0x2e, 0xe3, 0xad, 0x5b, 0xeb, 0x47, 0x8d, 0xf6, 0xb4, 0xb9, 0xc9, 0xb7, + 0x9a, 0x63, 0x0e, 0xd9, 0x56, 0xa1, 0x2b, 0x05, 0x39, 0x02, 0xf8, 0x28, + 0xdf, 0x67, 0xa7, 0xce, 0x57, 0xfd, 0xd1, 0xb4, 0x55, 0x26, 0x85, 0xe7, + 0xd2, 0x5b, 0x90, 0x26, 0xe1, 0x25, 0x5e, 0x73, 0x12, 0xcc, 0x65, 0x8e, + 0xdc, 0x1c, 0x64, 0xe9, 0x2e, 0x58, 0xde, 0x70, 0xc3, 0xc6, 0xc9, 0x75, + 0x35, 0x0a, 0x0f, 0x60, 0x35, 0xb4, 0x9f, 0xcb, 0x52, 0x20, 0xc9, 0x36, + 0xb2, 0x52, 0xbb, 0x35, 0x61, 0xf9, 0xf2, 0xc0, 0xd3, 0x50, 0x08, 0xf6, + 0x73, 0x7d, 0xcb, 0x12, 0xd2, 0xd7, 0x4c, 0x56, 0x59, 0xb9, 0x74, 0xec, + 0xee, 0x4c, 0x93, 0x1d, 0x9d, 0x73, 0x79, 0xe6, 0xa9, 0x2a, 0x98, 0x22, + 0x77, 0x80, 0x84, 0x73, 0x78, 0xe2, 0x8a, 0xec, 0x1e, 0x08, 0x6f, 0x49, + 0x64, 0xea, 0x79, 0x99, 0xb1, 0xec, 0x19, 0x6e, 0x65, 0xbd, 0xaa, 0x28, + 0x81, 0x4a, 0xe7, 0x70, 0x44, 0xc6, 0xa1, 0xbc, 0xe3, 0x72, 0x8c, 0xa1, + 0x13, 0x8a, 0x17, 0x16, 0xcc, 0xfb, 0x03, 0x20, 0x36, 0xb6, 0x6f, 0x4c, + 0x52, 0xc4, 0xc6, 0x7e, 0x61, 0x08, 0x56, 0x27, 0xb0, 0xf9, 0x96, 0x44, + 0xd9, 0x62, 0x4a, 0x29, 0xea, 0xf2, 0x33, 0x2b, 0x68, 0x45, 0xe0, 0x50, + 0x97, 0x6a, 0x1c, 0x59, 0x89, 0x43, 0x60, 0xc2, 0x5f, 0x31, 0xfe, 0xc7, + 0x00, 0xcb, 0x6a, 0xe0, 0xcb, 0x1b, 0x50, 0x43, 0x18, 0xd1, 0x67, 0xbc, + 0x62, 0xcf, 0xb8, 0xe9, 0x77, 0x81, 0x1d, 0x12, 0x97, 0xa4, 0x87, 0xbe, + 0xa2, 0x16, 0x08, 0xf6, 0x1f, 0x42, 0x40, 0x51, 0x65, 0xe3, 0x72, 0x21, + 0xef, 0x72, 0xba, 0xff, 0xa7, 0xfa, 0x3b, 0xac, 0xd7, 
0xc9, 0x60, 0xa1, + 0x5e, 0xae, 0xa5, 0x19, 0x80, 0xfa, 0x5c, 0xd8, 0xf0, 0x1a, 0xf4, 0x0f, + 0xb2, 0x5f, 0x57, 0x9a, 0xd2, 0x1b, 0x7c, 0xef, 0xb9, 0x24, 0xd0, 0x30, + 0x0e, 0x92, 0xc9, 0x44, 0x05, 0xcf, 0x16, 0x31, 0xc4, 0x3e, 0xa2, 0x94, + 0x1f, 0x69, 0xad, 0x0e, 0xcf, 0x7f, 0xce, 0x41, 0x9a, 0xaf, 0x89, 0xc6, + 0xbd, 0x65, 0x8c, 0x65, 0xca, 0x36, 0xea, 0xd4, 0x78, 0x04, 0x1b, 0x62, + 0x14, 0x2f, 0x37, 0xbe, 0x9b, 0xbe, 0xfd, 0xf4, 0x9d, 0x3c, 0x48, 0xe4, + 0x46, 0x71, 0x0d, 0x07, 0x93, 0xfa, 0x86, 0x6c, 0xa7, 0xce, 0xe9, 0x6a, + 0xf3, 0xa1, 0x72, 0x19, 0x0a, 0xc9, 0xb9, 0xc4, 0x3b, 0x76, 0x2f, 0x16, + 0x82, 0x85, 0xbc, 0x4c, 0x46, 0x32, 0x37, 0xc7, 0x19, 0xc5, 0x24, 0xe1, + 0x0f, 0xe2, 0x8f, 0x6d, 0x62, 0x6b, 0x6c, 0x9e, 0xc8, 0x15, 0x01, 0x93, + 0x5b, 0x03, 0x44, 0x6e, 0xd0, 0xdc, 0xbd, 0x29, 0x2f, 0x2e, 0x89, 0x11, + 0x24, 0x37, 0xfa, 0xff, 0x83, 0x51, 0x1b, 0x31, 0x5e, 0xc6, 0x71, 0x0b, + 0xbf, 0x64, 0xb0, 0xca, 0x01, 0xfd, 0x49, 0x64, 0x05, 0xe2, 0xe7, 0x9e, + 0x6e, 0xfe, 0xce, 0x98, 0x86, 0xef, 0x16, 0xec, 0x19, 0x66, 0xbe, 0x0d, + 0x8a, 0x99, 0xbb, 0xbf, 0x39, 0xa8, 0x85, 0xeb, 0x14, 0x51, 0x2d, 0x18, + 0xd5, 0x93, 0x6f, 0x78, 0x7a, 0x5e, 0xa1, 0xee, 0xc0, 0xc3, 0x93, 0x68, + 0x04, 0x0c, 0xd4, 0xfe, 0x6a, 0xf7, 0x05, 0x78, 0xac, 0xcf, 0xee, 0x9f, + 0x92, 0x53, 0x8d, 0xd4, 0x99, 0x75, 0x27, 0x7f, 0x79, 0x6c, 0x97, 0x52, + 0x34, 0x08, 0x31, 0xad, 0x4a, 0x54, 0x34, 0xab, 0x33, 0x2e, 0x16, 0x54, + 0xcd, 0x7a, 0x75, 0x96, 0x29, 0x72, 0x3f, 0x40, 0x2e, 0xa4, 0xc8, 0x1c, + 0x67, 0xb6, 0x37, 0x15, 0x51, 0x64, 0x83, 0x69, 0xd7, 0x62, 0xcf, 0x8f, + 0x34, 0x18, 0x46, 0x98, 0x5c, 0x8b, 0x61, 0x5f, 0x73, 0xe2, 0xc6, 0xe2, + 0x86, 0x82, 0xf5, 0xee, 0x1e, 0x2d, 0xb2, 0x73, 0xb0, 0x1b, 0x69, 0xf1, + 0x23, 0x47, 0x0d, 0xe6, 0x30, 0xe6, 0xed, 0x91, 0x69, 0x23, 0x23, 0xdd, + 0x28, 0x64, 0x56, 0xce, 0x98, 0xa1, 0xfd, 0x93, 0x8b, 0xd3, 0xc0, 0xb7, + 0xdd, 0xc8, 0x4a, 0x16, 0xe4, 0x75, 0xa3, 0xef, 0xba, 0x20, 0x92, 0x42, + 0xa8, 0x1b, 0xbc, 0x54, 0x88, 0x3e, 0x1b, 0xf0, 0xa1, 
0x1f, 0xeb, 0x5d, + 0xcb, 0x49, 0x61, 0x19, 0xf3, 0x7f, 0x06, 0xe3, 0xd5, 0x29, 0xbc, 0x1a, + 0x54, 0x53, 0xb6, 0xf3, 0x33, 0x50, 0x01, 0xf5, 0xb2, 0xcf, 0x1a, 0x2a, + 0x17, 0x8f, 0xfa, 0x1f, 0x41, 0x47, 0x6d, 0x61, 0x60, 0x02, 0x19, 0x36, + 0x06, 0xfb, 0x66, 0xfc, 0x42, 0xe3, 0x87, 0xc6, 0x14, 0x82, 0x2e, 0x15, + 0xe6, 0x29, 0xbf, 0x44, 0x4d, 0xb4, 0x85, 0xa9, 0x6e, 0x81, 0x75, 0xb9, + 0x09, 0x8d, 0xcc, 0x57, 0x13, 0x4c, 0x43, 0xca, 0x94, 0x00, 0x5b, 0xb5, + 0x23, 0xbe, 0xf6, 0x3a, 0x1d, 0x42, 0x15, 0xa0, 0xea, 0x2e, 0x89, 0x78, + 0x7f, 0xba, 0x6c, 0xe4, 0x27, 0x4c, 0xb6, 0x86, 0x73, 0x0e, 0x9c, 0x8c, + 0xef, 0x14, 0xe4, 0x35, 0x43, 0x6d, 0x56, 0xcd, 0x4d, 0xab, 0x48, 0x96, + 0x18, 0xb0, 0x4f, 0x03, 0xad, 0xec, 0x62, 0xd3, 0x34, 0x5a, 0xd0, 0xff, + 0x14, 0xb1, 0xaf, 0x92, 0xc4, 0x20, 0xaf, 0x2d, 0x17, 0x5e, 0x22, 0x51, + 0x5c, 0x66, 0x09, 0x20, 0x1f, 0x50, 0x0c, 0xbc, 0xe0, 0xf5, 0x6e, 0x85, + 0x31, 0x48, 0x80, 0xef, 0xf6, 0xef, 0x7c, 0xf5, 0xf3, 0x0e, 0x6b, 0x8d, + 0x0b, 0x94, 0x5d, 0xaa, 0x27, 0x81, 0xd3, 0x9b, 0x07, 0xe0, 0x22, 0x0a, + 0xce, 0x9d, 0xa1, 0x79, 0x38, 0xe2, 0x6d, 0x58, 0xa7, 0xde, 0x9b, 0xb0, + 0x25, 0x0e, 0x6b, 0x25, 0x1e, 0xac, 0x48, 0xb0, 0x34, 0x27, 0x42, 0x30, + 0xf5, 0xdb, 0x35, 0x4d, 0xf3, 0x1f, 0xbc, 0x58, 0x2a, 0xeb, 0x86, 0x82, + 0x6e, 0x9a, 0x0b, 0xf6, 0xdb, 0xa6, 0x18, 0x12, 0x42, 0x17, 0x5c, 0xb3, + 0xe9, 0x49, 0x5f, 0x2a, 0xf9, 0x81, 0x35, 0x2f, 0x2b, 0x92, 0xaa, 0x0d, + 0x6b, 0xa7, 0xe1, 0xed, 0x55, 0xa4, 0xf1, 0xef, 0xa4, 0x6b, 0x50, 0x85, + 0xe3, 0xcd, 0x9a, 0xb2, 0xc7, 0xd9, 0xdd, 0x46, 0x6a, 0xfc, 0x2c, 0xed, + 0xd8, 0xf9, 0x4d, 0x8a, 0xb4, 0x30, 0xa0, 0xd0, 0x08, 0xf8, 0xf0, 0x61, + 0xc5, 0x39, 0xa2, 0xfd, 0x78, 0x65, 0x41, 0x5a, 0x52, 0x06, 0xfe, 0x50, + 0x74, 0xe2, 0x80, 0xb2, 0x22, 0x2d, 0x96, 0xbf, 0xd2, 0x4f, 0xc9, 0x00, + 0x36, 0x58, 0xa7, 0x67, 0x7d, 0xfc, 0xc0, 0xae, 0x34, 0x69, 0xe3, 0x16, + 0xbe, 0x59, 0x9e, 0x01, 0x9e, 0x39, 0x0b, 0x3d, 0x16, 0x80, 0x8b, 0xfb, + 0xe0, 0x58, 0x81, 0xcd, 0xd1, 0x2c, 0x66, 0x9a, 0xe5, 
0x0b, 0x4a, 0xec, + 0xb7, 0x45, 0x7f, 0x98, 0xf2, 0xfb, 0xbe, 0x2b, 0xc8, 0xdd, 0x48, 0x95, + 0x22, 0xde, 0x8e, 0xb8, 0xb8, 0x4c, 0x09, 0xe8, 0x9e, 0x34, 0x4b, 0xab, + 0xe3, 0xa1, 0x19, 0x56, 0xb6, 0x08, 0x67, 0x29, 0x47, 0xba, 0x84, 0xe5, + 0x5c, 0x7d, 0x81, 0x7c, 0x8e, 0x48, 0x0a, 0xce, 0x9f, 0x61, 0xfd, 0x26, + 0x94, 0xe1, 0x35, 0x43, 0x60, 0xe4, 0x5c, 0x11, 0x26, 0x47, 0xab, 0x1b, + 0x0e, 0x24, 0x8b, 0x25, 0x51, 0x6e, 0x21, 0x39, 0x36, 0xa6, 0x46, 0x4c, + 0x93, 0x8d, 0xf8, 0xb1, 0xd6, 0x28, 0xdd, 0x21, 0x43, 0xf0, 0xa2, 0x5d, + 0x2f, 0xbe, 0x67, 0xe3, 0x89, 0x7a, 0xe5, 0xa2, 0x22, 0x3f, 0xf7, 0x0a, + 0xcd, 0xc4, 0xcc, 0xde, 0x81, 0xf1, 0x03, 0xa6, 0xac, 0x29, 0xa3, 0xbe, + 0xdc, 0x3a, 0x84, 0x81, 0x98, 0x33, 0x97, 0xa5, 0xe2, 0x93, 0x98, 0xbd, + 0x28, 0x7c, 0x78, 0xed, 0xdb, 0x30, 0x26, 0x11, 0x87, 0x32, 0x6f, 0xeb, + 0xb0, 0x36, 0x05, 0xcf, 0xe3, 0x28, 0xfc, 0x3e, 0x71, 0xd3, 0x7c, 0x8e, + 0xcd, 0xa9, 0xb4, 0x8b, 0x6e, 0x45, 0xdb, 0xc2, 0x48, 0xdf, 0x7f, 0xb8, + 0x8a, 0x5b, 0x43, 0xc0, 0x41, 0x25, 0xe7, 0x0a, 0xe5, 0x02, 0x56, 0x21, + 0xa1, 0x71, 0xa5, 0xee, 0x13, 0x9f, 0xd7, 0x6b, 0xdb, 0xfa, 0x28, 0x43, + 0x58, 0x76, 0x2a, 0x32, 0x89, 0x9b, 0x11, 0x7c, 0xc8, 0xe5, 0x71, 0xe6, + 0x2e, 0x08, 0xe1, 0x7a, 0xe5, 0x96, 0x30, 0x61, 0xb1, 0xd9, 0x0a, 0xe5, + 0x39, 0x60, 0xaa, 0xf9, 0x3f, 0x7f, 0x88, 0xc7, 0x23, 0xa9, 0x02, 0xe1, + 0xbd, 0xa9, 0xcd, 0xb7, 0xee, 0x6c, 0x67, 0xc7, 0x1b, 0xe0, 0xe1, 0x22, + 0x08, 0xbe, 0xdb, 0xf6, 0x2b, 0x0d, 0x61, 0xc7, 0x62, 0x5c, 0x5a, 0x9c, + 0xb0, 0xd0, 0x6a, 0x7f, 0xd4, 0x4b, 0x43, 0x4e, 0x93, 0x69, 0xf6, 0x43, + 0xc7, 0xbd, 0x78, 0xfe, 0xf3, 0x2f, 0xc8, 0x1f, 0x6c, 0xf0, 0x13, 0x9d, + 0xff, 0x46, 0xfc, 0xe9, 0x09, 0xe3, 0xaf, 0x47, 0x83, 0x71, 0x99, 0x74, + 0xa1, 0xce, 0xa8, 0x35, 0x18, 0x29, 0xa4, 0xbd, 0x4a, 0x84, 0x55, 0x05, + 0x29, 0x53, 0x2f, 0xa5, 0xad, 0xa7, 0xf2, 0x1c, 0x45, 0x13, 0xa3, 0xb0, + 0xa9, 0x11, 0x80, 0xd6, 0x22, 0xff, 0x64, 0x3e, 0x39, 0xa0, 0x15, 0x58, + 0x7f, 0x08, 0x99, 0x08, 0xff, 0x8e, 0x5f, 0x73, 0x98, 
0x17, 0x3d, 0x5e, + 0xe6, 0xcd, 0x2d, 0xdf, 0xd1, 0x32, 0x2b, 0x8c, 0x08, 0xa6, 0xdb, 0x51, + 0x2a, 0x5f, 0xac, 0xad, 0x7a, 0x9b, 0x4c, 0x82, 0x25, 0xad, 0x33, 0x7f, + 0x30, 0xc7, 0x48, 0xbd, 0xb4, 0xf5, 0x0d, 0x22, 0xd3, 0xf8, 0x47, 0x58, + 0xd3, 0x17, 0x65, 0x81, 0xe4, 0x07, 0xa0, 0x32, 0x7e, 0xee, 0x30, 0xe9, + 0x17, 0xaa, 0xa0, 0x0b, 0xd7, 0x97, 0x25, 0xa7, 0xdb, 0xfe, 0xab, 0xd2, + 0x3c, 0x9b, 0x6b, 0xdf, 0xa6, 0x87, 0xa3, 0xc0, 0x59, 0xb4, 0x08, 0x11, + 0xd4, 0x94, 0x49, 0x83, 0x70, 0x5f, 0x5b, 0x2d, 0x09, 0xd8, 0x53, 0x25, + 0xb6, 0xd9, 0x91, 0x32, 0xd3, 0x94, 0x79, 0xbb, 0x4d, 0xf7, 0xb6, 0x92, + 0xd3, 0xf4, 0x08, 0x50, 0xda, 0x7d, 0x91, 0x69, 0xad, 0xc3, 0x59, 0x4d, + 0xc0, 0x1c, 0x14, 0xf2, 0x95, 0xd3, 0xea, 0xa8, 0x85, 0xa3, 0x93, 0x62, + 0x73, 0x68, 0x4c, 0x36, 0x49, 0x35, 0x88, 0xf0, 0xe1, 0xc8, 0x29, 0xab, + 0x59, 0x4f, 0xef, 0x2c, 0x4c, 0x07, 0xbd, 0xf3, 0x05, 0x24, 0x41, 0x46, + 0x92, 0x66, 0x66, 0x79, 0x92, 0x32, 0x87, 0xcb, 0x90, 0x11, 0x2d, 0x54, + 0xda, 0x6d, 0x64, 0x72, 0xae, 0x7d, 0x19, 0x50, 0x61, 0x46, 0xd9, 0x58, + 0x2d, 0x28, 0x0d, 0x33, 0x7b, 0x17, 0x01, 0x61, 0x44, 0x77, 0x7e, 0x06, + 0x2b, 0x41, 0x88, 0xe6, 0x92, 0x9c, 0x69, 0xa8, 0x89, 0x9f, 0x4a, 0xa4, + 0x6f, 0x31, 0xa5, 0x68, 0xff, 0xdb, 0xb6, 0xe3, 0x0c, 0x3f, 0xc6, 0x24, + 0x5e, 0xfc, 0x0e, 0x31, 0x32, 0x64, 0xec, 0x3f, 0xe7, 0x7a, 0x29, 0x16, + 0x8f, 0xed, 0x6a, 0xb7, 0x96, 0x77, 0x29, 0xa3, 0xfe, 0x1e, 0x6b, 0x54, + 0x91, 0xe9, 0xbb, 0x38, 0x65, 0x70, 0x8c, 0x4e, 0x50, 0x39, 0x5e, 0x63, + 0xe4, 0x49, 0x33, 0x61, 0xb9, 0x92, 0x92, 0x51, 0xe9, 0x31, 0xf6, 0x5c, + 0x21, 0x1a, 0x53, 0xbf, 0x4a, 0xe1, 0xbc, 0x1f, 0x61, 0x66, 0x3d, 0x83, + 0x8c, 0x27, 0x17, 0x53, 0xbd, 0xf4, 0x3d, 0x13, 0xe3, 0xfe, 0x58, 0x60, + 0x12, 0x78, 0x3b, 0xb1, 0xd7, 0x94, 0x87, 0xe2, 0x3e, 0xc2, 0x16, 0x43, + 0x4e, 0xc8, 0xce, 0x43, 0xbc, 0x4a, 0xe4, 0xba, 0xd6, 0xfa, 0xfa, 0x7e, + 0x13, 0xd3, 0x28, 0x8f, 0xd2, 0x2b, 0x0b, 0x93, 0x8d, 0x42, 0xe8, 0xd2, + 0x50, 0x34, 0x31, 0xd8, 0xb3, 0xa7, 0x1c, 0x93, 0x3d, 
0x80, 0x8f, 0x3f, + 0xee, 0x7d, 0x3c, 0xd1, 0xcc, 0x2a, 0x99, 0x39, 0xfd, 0x8f, 0xed, 0x8e, + 0x85, 0x51, 0x20, 0x04, 0xa7, 0xd7, 0x34, 0x83, 0x20, 0x7c, 0x91, 0x20, + 0x96, 0xe6, 0x88, 0xd6, 0x29, 0x99, 0xff, 0x18, 0xa9, 0x3b, 0x3e, 0x41, + 0x42, 0x80, 0x14, 0x3b, 0xf2, 0xc4, 0x24, 0x4e, 0x18, 0x04, 0xa0, 0xfb, + 0x17, 0x00, 0x2b, 0xf7, 0x44, 0x22, 0x4b, 0x7c, 0x38, 0xb6, 0x66, 0x4c, + 0x56, 0x85, 0xb2, 0x82, 0x92, 0xe8, 0x7b, 0x76, 0x33, 0x59, 0xe8, 0x09, + 0xd1, 0xef, 0xa0, 0x1b, 0x5b, 0xc2, 0xb1, 0x01, 0x6d, 0x7f, 0xbb, 0x37, + 0x72, 0x3f, 0xcd, 0x20, 0x6f, 0x2d, 0x9c, 0xfc, 0xfc, 0x1c, 0x2c, 0x24, + 0x3f, 0x30, 0xcd, 0xf5, 0x91, 0x0a, 0x09, 0xa5, 0xeb, 0x97, 0xb0, 0xc4, + 0x2c, 0xa3, 0x95, 0x57, 0xea, 0x59, 0x15, 0xc7, 0x81, 0xdf, 0x64, 0x99, + 0x1b, 0x20, 0x8d, 0xe0, 0x29, 0x7c, 0x23, 0x16, 0x3c, 0x8e, 0x2e, 0x0b, + 0xff, 0x02, 0xdc, 0x6f, 0x24, 0x1e, 0xfc, 0xac, 0x0f, 0x16, 0x38, 0xde, + 0x5c, 0x9c, 0x5f, 0x48, 0x13, 0xfb, 0xac, 0xbd, 0x99, 0x5e, 0x7e, 0xe0, + 0xcc, 0x3c, 0x21, 0x17, 0xfa, 0x67, 0x53, 0x31, 0xb3, 0x99, 0x98, 0x05, + 0x2d, 0xa1, 0xb2, 0x72, 0xf6, 0xe0, 0x73, 0xb5, 0x6e, 0xc7, 0xa0, 0x7f, + 0xfa, 0x75, 0x89, 0x20, 0x4c, 0x84, 0xa1, 0x2b, 0x39, 0x4a, 0xd1, 0xf5, + 0x36, 0xfa, 0x5f, 0xb2, 0x7e, 0x75, 0x1b, 0x21, 0xae, 0x30, 0x83, 0x14, + 0x3d, 0xf6, 0x4c, 0x3a, 0xd8, 0x1e, 0x05, 0xed, 0x93, 0xbf, 0xd3, 0xb2, + 0x46, 0x23, 0x43, 0x3d, 0x78, 0xf9, 0x94, 0x78, 0xae, 0x20, 0x8b, 0x49, + 0x8c, 0xac, 0x8f, 0x41, 0xe4, 0x5d, 0x96, 0x25, 0xd6, 0xd9, 0xde, 0x8b, + 0x6a, 0xcd, 0x37, 0x22, 0x1b, 0xd7, 0xca, 0x8a, 0xa3, 0x79, 0x7b, 0x3d, + 0x3e, 0x2d, 0xe3, 0x09, 0xa4, 0xbf, 0x9a, 0x48, 0x63, 0xa5, 0x88, 0xbf, + 0xb6, 0xfb, 0x7b, 0xcc, 0x29, 0xd2, 0x2f, 0xdc, 0x36, 0xab, 0xd6, 0x46, + 0xae, 0xf7, 0xca, 0xa9, 0xb4, 0x14, 0xae, 0xfe, 0x12, 0x62, 0xb9, 0xce, + 0x4b, 0x99, 0xf8, 0xa5, 0x59, 0x81, 0x83, 0x68, 0x48, 0x44, 0x75, 0xab, + 0xb2, 0x12, 0x89, 0x1c, 0xe7, 0x3d, 0x9b, 0x19, 0xa2, 0x6c, 0xa5, 0x62, + 0x48, 0xed, 0x1c, 0xba, 0x89, 0x61, 0xe4, 0x24, 0x8a, 
0xd8, 0x87, 0xb7, + 0x64, 0xc8, 0x09, 0x20, 0x83, 0x50, 0x2b, 0xe7, 0xb9, 0x8c, 0x99, 0x05, + 0x18, 0xd3, 0xf5, 0x19, 0xec, 0x75, 0x83, 0x4c, 0x80, 0x20, 0x2b, 0x6a, + 0xa0, 0x75, 0x6b, 0x62, 0xd1, 0x02, 0x06, 0x16, 0x66, 0x9e, 0x5a, 0x13, + 0xfe, 0x1d, 0xf4, 0x95, 0xff, 0xc2, 0xd7, 0x94, 0xda, 0x70, 0x57, 0xc2, + 0x1d, 0xe8, 0x70, 0x13, 0x13, 0x95, 0x58, 0xdb, 0x0f, 0x1e, 0xab, 0x40, + 0xbc, 0x14, 0xe5, 0xdd, 0x8b, 0xf5, 0x42, 0x21, 0xb4, 0x86, 0xf9, 0x18, + 0x2b, 0xd4, 0x92, 0x39, 0xff, 0x11, 0x87, 0xa5, 0xe6, 0x64, 0xbc, 0x44, + 0xa0, 0x41, 0x38, 0xed, 0xce, 0x3b, 0xce, 0x8c, 0x56, 0xad, 0xa2, 0xd5, + 0xf2, 0x1c, 0x3b, 0xcb, 0x49, 0x27, 0x99, 0xab, 0xb1, 0x4a, 0x1b, 0x24, + 0x8f, 0xb0, 0x17, 0xc0, 0x9d, 0xb3, 0x25, 0xac, 0xa8, 0x71, 0x78, 0x2b, + 0xd3, 0xa0, 0xc9, 0x76, 0xc5, 0x5b, 0x2d, 0xfc, 0x20, 0x5c, 0x48, 0x0f, + 0xfe, 0x19, 0x0b, 0x21, 0xed, 0xf3, 0xed, 0x19, 0xa1, 0xba, 0x27, 0xc3, + 0x91, 0xa1, 0xf4, 0x98, 0x62, 0x14, 0xcf, 0x4a, 0x13, 0x1a, 0x35, 0xda, + 0x09, 0xfe, 0x34, 0x6a, 0x47, 0x2a, 0x47, 0x7e, 0x99, 0x52, 0x03, 0x8b, + 0x47, 0xc6, 0x37, 0x2f, 0xb6, 0xc8, 0x80, 0x7a, 0x14, 0x41, 0x7d, 0x10, + 0x93, 0x48, 0xad, 0x95, 0xb5, 0xbb, 0x72, 0xd0, 0x36, 0xfb, 0x07, 0xfa, + 0x42, 0x2c, 0x5b, 0x0a, 0xea, 0xe4, 0xea, 0x67, 0x05, 0x1c, 0x73, 0x20, + 0xc5, 0xe6, 0x86, 0x87, 0xac, 0xe1, 0xcb, 0x67, 0xae, 0x01, 0x71, 0x05, + 0x1e, 0xe5, 0x13, 0x1a, 0x9e, 0xa8, 0xbc, 0xd0, 0x66, 0x83, 0x30, 0x33, + 0xcb, 0xf8, 0x01, 0x61, 0x60, 0xe3, 0xfc, 0x55, 0x7f, 0x3b, 0xd0, 0x51, + 0x2e, 0x82, 0xb6, 0x11, 0x25, 0xe1, 0x50, 0x77, 0x02, 0x7d, 0x6c, 0x16, + 0x37, 0x02, 0xa6, 0x9f, 0x8c, 0xdd, 0x50, 0xdc, 0x95, 0xb6, 0x4a, 0xc9, + 0x97, 0xee, 0x0a, 0x11, 0x3c, 0x81, 0xbd, 0x08, 0x23, 0xda, 0xbe, 0xa7, + 0x39, 0x8c, 0x24, 0x68, 0x2b, 0x68, 0x7b, 0x9e, 0x1a, 0xf1, 0xb7, 0x58, + 0x2f, 0x2a, 0x3f, 0x90, 0xd8, 0x9a, 0xad, 0x92, 0xae, 0x2b, 0xb2, 0x41, + 0x46, 0x2d, 0x80, 0xac, 0x84, 0x61, 0xe4, 0x7f, 0xb0, 0x59, 0xa6, 0x3e, + 0x4d, 0x61, 0x8a, 0xb7, 0x6b, 0xaa, 0xb4, 0x86, 0x63, 
0xd2, 0xf3, 0x12, + 0xac, 0xe5, 0x97, 0x88, 0xaf, 0x29, 0xdd, 0x7c, 0xa3, 0xcd, 0x8d, 0x9f, + 0x53, 0x22, 0xaf, 0x62, 0xad, 0xf9, 0x96, 0x10, 0x8d, 0x98, 0x99, 0x9a, + 0x39, 0x10, 0x1a, 0x09, 0xad, 0x50, 0x52, 0x73, 0xb5, 0xfd, 0xde, 0xe4, + 0xf6, 0xe9, 0x96, 0x65, 0xc8, 0x42, 0xf0, 0xf8, 0x31, 0x5a, 0xe5, 0x6d, + 0x3b, 0x66, 0x02, 0xc4, 0x76, 0x45, 0xd8, 0xce, 0x97, 0xb9, 0xdd, 0x3b, + 0x00, 0x8a, 0xda, 0x7b, 0x23, 0x7d, 0xc2, 0x4e, 0x18, 0xa9, 0xa0, 0x89, + 0x3a, 0x19, 0xfd, 0xff, 0xcb, 0x3e, 0xd6, 0x8d, 0xcb, 0x40, 0xa7, 0x9f, + 0x3b, 0xe9, 0x9c, 0xe9, 0x4f, 0x91, 0x5a, 0xa0, 0x24, 0x44, 0x4a, 0x37, + 0x2b, 0xd5, 0x2c, 0x39, 0x8c, 0xe6, 0x1a, 0xcc, 0x23, 0x28, 0x88, 0xca, + 0xfc, 0x77, 0xbd, 0xbf, 0x71, 0x2b, 0x0b, 0x2a, 0xb9, 0x36, 0x6a, 0x71, + 0x88, 0x1e, 0x50, 0x76, 0x4c, 0xba, 0xe4, 0x43, 0xd0, 0xf1, 0x08, 0x5f, + 0xe3, 0xb4, 0x18, 0x3e, 0x6c, 0x9d, 0xcd, 0xb7, 0xf4, 0xb7, 0x54, 0xdd, + 0x71, 0xe0, 0xd3, 0xcb, 0x30, 0x3d, 0x74, 0x7c, 0x8a, 0x22, 0xd4, 0x38, + 0x9c, 0x63, 0x24, 0x37, 0x26, 0x0c, 0x32, 0xaa, 0x7b, 0xaa, 0x54, 0x49, + 0x98, 0xc4, 0x5a, 0x4e, 0xb3, 0x9b, 0xa8, 0x8e, 0xb6, 0x61, 0x0c, 0xf6, + 0xe8, 0x1c, 0x1e, 0xb2, 0x4c, 0xd6, 0x2c, 0x03, 0x2d, 0x87, 0xc8, 0x23, + 0x85, 0xfa, 0x06, 0x4e, 0x60, 0x23, 0x88, 0x9b, 0x8c, 0xad, 0x74, 0x99, + 0x95, 0x5d, 0x9b, 0x79, 0x40, 0xad, 0x1e, 0x9c, 0xa3, 0xcf, 0xa4, 0x5e, + 0xf4, 0xc0, 0x2e, 0xe1, 0x1b, 0x22, 0xc3, 0x9d, 0xb0, 0xc7, 0x80, 0xc5, + 0x00, 0x8b, 0xd8, 0x45, 0x01, 0x27, 0xf2, 0x1c, 0x25, 0x9d, 0x73, 0xa3, + 0xf4, 0x1c, 0x19, 0x1d, 0xc4, 0x46, 0xb3, 0x12, 0x9b, 0xdb, 0xa0, 0x31, + 0x46, 0xb6, 0xf2, 0xaa, 0x27, 0xae, 0x80, 0x51, 0xac, 0x2c, 0xca, 0x1a, + 0x60, 0xaa, 0x1a, 0x57, 0xb9, 0x88, 0x2f, 0xbd, 0x60, 0x48, 0xa0, 0xe6, + 0xf3, 0xce, 0x3b, 0x48, 0x80, 0xe4, 0x3a, 0x82, 0x49, 0xcf, 0x68, 0x54, + 0x48, 0xd3, 0xab, 0x70, 0xd8, 0x0a, 0x8c, 0x75, 0x66, 0x9b, 0x8b, 0x71, + 0x65, 0xbf, 0x1b, 0x7c, 0xcf, 0x51, 0x67, 0xce, 0x3c, 0x16, 0x3b, 0x1c, + 0xe8, 0x77, 0x33, 0x49, 0x4d, 0x15, 0x04, 0x2b, 0xfc, 
0x3a, 0xcf, 0x97, + 0x38, 0x97, 0x65, 0x77, 0x23, 0x95, 0xc4, 0x24, 0x7e, 0x44, 0x60, 0x04, + 0x46, 0xa1, 0x41, 0x9a, 0x1a, 0x22, 0xbc, 0x4f, 0x10, 0xad, 0x52, 0xc1, + 0xf2, 0xbf, 0xe8, 0xd6, 0x44, 0x65, 0xed, 0x66, 0x93, 0x77, 0x64, 0xa1, + 0xcf, 0x25, 0xb6, 0x53, 0x32, 0x5a, 0x89, 0xf4, 0xe3, 0x47, 0x7f, 0xeb, + 0x60, 0x68, 0x12, 0x50, 0x47, 0x32, 0x72, 0x31, 0x61, 0x26, 0xf9, 0x20, + 0x52, 0x9c, 0xa6, 0x95, 0x66, 0xb7, 0x28, 0x7b, 0xc7, 0x9b, 0xe0, 0xc6, + 0x19, 0x06, 0x3d, 0xe0, 0x29, 0x59, 0xe6, 0xc1, 0x21, 0x26, 0x5c, 0x72, + 0x98, 0x05, 0xde, 0x5b, 0xae, 0x5a, 0x5c, 0x40, 0xf8, 0x31, 0x9b, 0x5e, + 0xa8, 0x18, 0x75, 0x1e, 0xb4, 0x22, 0x4a, 0xda, 0x41, 0xa8, 0x1e, 0xec, + 0x6a, 0x39, 0x8c, 0xfa, 0x6f, 0xa9, 0xdc, 0xa5, 0x8f, 0xc5, 0x67, 0x8a, + 0x68, 0xde, 0xee, 0xc4, 0x0a, 0xcd, 0x34, 0x6b, 0xf7, 0x57, 0x5f, 0x19, + 0x8c, 0x98, 0xdd, 0xec, 0x4f, 0xd2, 0x28, 0x48, 0x4d, 0x34, 0x86, 0x6a, + 0x2f, 0x5a, 0x36, 0x80, 0x5d, 0x22, 0x68, 0xd2, 0x3e, 0x39, 0x2a, 0x0d, + 0xeb, 0xbf, 0x91, 0x83, 0x80, 0x60, 0xa8, 0x44, 0x67, 0x21, 0x89, 0xca, + 0x8a, 0x52, 0xcd, 0x7e, 0xcf, 0xa5, 0x99, 0xd0, 0x72, 0xa9, 0x62, 0x7b, + 0xfa, 0x38, 0x36, 0x20, 0xe1, 0x19, 0xb5, 0x6d, 0x68, 0x9a, 0x44, 0x7e, + 0x7b, 0x1d, 0x46, 0x11, 0x49, 0x26, 0x9e, 0xbd, 0x06, 0xb2, 0x01, 0x53, + 0x02, 0xf2, 0x47, 0xe6, 0x57, 0x74, 0x09, 0x13, 0x39, 0x38, 0x02, 0x34, + 0x2a, 0x3b, 0x8d, 0xc0, 0x68, 0x2d, 0x2c, 0x7c, 0x1e, 0x38, 0x80, 0xdb, + 0x0a, 0xbf, 0xf7, 0x31, 0x35, 0xec, 0xbb, 0xaa, 0xc9, 0xaa, 0x0e, 0x9c, + 0x33, 0x8e, 0x7f, 0x85, 0x1f, 0x69, 0x9f, 0xf2, 0x48, 0x5d, 0x51, 0x51, + 0xde, 0x33, 0x87, 0x8f, 0x09, 0x02, 0x9e, 0xb7, 0x42, 0xa5, 0x32, 0x1a, + 0xe8, 0xf7, 0xa3, 0x8e, 0xfc, 0x5c, 0xcc, 0x8b, 0x8c, 0xc4, 0x44, 0x66, + 0xca, 0x28, 0xe4, 0x54, 0xc2, 0xfa, 0x9f, 0x69, 0xd8, 0xfe, 0x1e, 0x37, + 0x6b, 0x34, 0x04, 0x94, 0xfe, 0x47, 0xa4, 0x6d, 0x45, 0x28, 0x19, 0xa2, + 0xd8, 0x42, 0x89, 0xc7, 0xa0, 0x05, 0x34, 0x3c, 0x9f, 0xb3, 0x0d, 0xa2, + 0x56, 0xb3, 0x45, 0xc4, 0x8c, 0x0b, 0x60, 0xcf, 0x58, 
0x40, 0xa0, 0x21, + 0x49, 0x3a, 0xac, 0xee, 0x6b, 0xc8, 0x98, 0x1c, 0x7c, 0xf9, 0x9d, 0xac, + 0x7c, 0x9c, 0xc8, 0xd5, 0xcf, 0xf9, 0x2a, 0x83, 0xec, 0x50, 0x0e, 0xf2, + 0x4d, 0x73, 0x05, 0xf2, 0x0f, 0xea, 0xca, 0x45, 0x2b, 0x97, 0x75, 0xbd, + 0x53, 0x63, 0x36, 0x68, 0x9b, 0x14, 0x5c, 0xe0, 0x75, 0xd3, 0xb9, 0xe3, + 0x25, 0x04, 0x75, 0x7a, 0xe7, 0xb7, 0xa5, 0x44, 0x14, 0x0d, 0xc4, 0x4f, + 0xab, 0x86, 0x94, 0xdf, 0x50, 0x76, 0x73, 0xd7, 0x8d, 0x21, 0x2c, 0x77, + 0x51, 0x16, 0xb2, 0x4a, 0x5f, 0xcb, 0x45, 0x6c, 0x19, 0x7a, 0x3b, 0x61, + 0xbb, 0x98, 0xe4, 0xd3, 0x4a, 0x33, 0x1d, 0x88, 0x4c, 0x65, 0x1d, 0xdb, + 0x86, 0xd0, 0x14, 0x2f, 0x3e, 0x27, 0x37, 0x1e, 0xbc, 0x88, 0x84, 0x23, + 0x07, 0xe1, 0xc4, 0xaa, 0x5b, 0xde, 0x03, 0x86, 0x77, 0xfd, 0x53, 0x73, + 0xd0, 0x70, 0xa2, 0xf9, 0xc6, 0x6c, 0xcb, 0x60, 0xd7, 0xe3, 0xa5, 0xd8, + 0x34, 0xf8, 0x35, 0x7d, 0xe1, 0xaf, 0x8b, 0x0c, 0x04, 0x42, 0x70, 0x06, + 0x32, 0xb5, 0x7e, 0x9e, 0xec, 0xda, 0xef, 0xd3, 0xd6, 0x3d, 0xd0, 0x6a, + 0xcb, 0x7b, 0xed, 0xde, 0xb9, 0x0c, 0xf3, 0xb1, 0x88, 0x35, 0x4a, 0x79, + 0x2c, 0x88, 0x63, 0xbc, 0xb7, 0x1a, 0x3e, 0xee, 0x61, 0xc5, 0x1f, 0x1b, + 0x94, 0xbb, 0xc1, 0x6b, 0x61, 0xfa, 0x43, 0x27, 0x9b, 0x39, 0x5d, 0x0f, + 0xb8, 0x0d, 0x25, 0xa7, 0xcc, 0x1c, 0x99, 0x36, 0xa0, 0x07, 0x71, 0xa9, + 0x39, 0x73, 0x3a, 0x5a, 0x12, 0x99, 0xa6, 0x32, 0x30, 0xa5, 0x85, 0x84, + 0xa4, 0xe8, 0x98, 0xb4, 0xd1, 0xd1, 0x5d, 0x39, 0xeb, 0x24, 0xde, 0xf2, + 0xb4, 0xc2, 0x09, 0x90, 0x1a, 0x1b, 0x61, 0x58, 0x51, 0x11, 0x75, 0x7a, + 0x37, 0x0f, 0x69, 0x07, 0x6d, 0xfa, 0x14, 0x08, 0x84, 0xce, 0x4e, 0xa4, + 0x91, 0x96, 0x64, 0xfe, 0x1b, 0x4d, 0xa4, 0x52, 0x4e, 0xa8, 0x25, 0xa8, + 0x70, 0x25, 0xb0, 0x60, 0xed, 0xbe, 0x65, 0x3f, 0x8a, 0xf5, 0x95, 0x5a, + 0xf7, 0xda, 0xe2, 0xd7, 0xc2, 0xcf, 0x3b, 0xdc, 0x44, 0xf6, 0xe7, 0x68, + 0x10, 0x50, 0x29, 0xbe, 0x48, 0x55, 0xcc, 0xac, 0xaf, 0x12, 0xa9, 0x61, + 0x89, 0x89, 0xc9, 0xca, 0xef, 0x37, 0x35, 0x49, 0x0c, 0xc2, 0xb6, 0x36, + 0xbf, 0x6e, 0x24, 0x80, 0x5c, 0x0c, 0x78, 0x33, 0x64, 
0xfe, 0x5b, 0xf3, + 0x75, 0x36, 0x65, 0x28, 0xeb, 0x20, 0xb4, 0xb0, 0x61, 0xcd, 0x7a, 0x0b, + 0x24, 0x1e, 0x46, 0x1d, 0x10, 0xb1, 0x25, 0x6e, 0x3a, 0x25, 0x86, 0xf6, + 0xad, 0x1c, 0x43, 0x0c, 0xf9, 0xcf, 0x42, 0xe0, 0x67, 0x2b, 0x36, 0x51, + 0x66, 0xbb, 0xf7, 0xea, 0x41, 0x69, 0x6e, 0xb8, 0xb7, 0x0e, 0xfc, 0xfd, + 0xaa, 0x1a, 0x95, 0x43, 0x08, 0x4b, 0xb8, 0xfc, 0x3b, 0x99, 0xeb, 0xa0, + 0x7c, 0x8d, 0xbd, 0x69, 0x49, 0xd3, 0xdd, 0x0f, 0x8e, 0xbe, 0x34, 0xd2, + 0x27, 0x87, 0x64, 0x79, 0x0d, 0xa6, 0xf2, 0xe5, 0x2f, 0xe3, 0xad, 0x2f, + 0x21, 0xd6, 0x36, 0x58, 0x2c, 0x6e, 0x6a, 0x43, 0x6b, 0xc7, 0x3d, 0x8b, + 0x8d, 0x24, 0xf0, 0xcb, 0x33, 0xdf, 0xf1, 0xd2, 0xeb, 0xcb, 0x5d, 0x00, + 0xb1, 0x63, 0x08, 0x77, 0x9d, 0xfc, 0xa1, 0x58, 0x7d, 0xe4, 0xb7, 0x95, + 0xc5, 0x99, 0x08, 0x6f, 0xf9, 0xc9, 0x50, 0x4f, 0x8c, 0xa8, 0xc0, 0xe6, + 0xc4, 0xff, 0x00, 0xcd, 0x6f, 0xbe, 0xa3, 0xb6, 0x66, 0x63, 0x5d, 0x4b, + 0x60, 0x4b, 0x1b, 0x3e, 0xc6, 0x70, 0x65, 0xe4, 0x8e, 0x48, 0xb2, 0x68, + 0xdc, 0xd1, 0xac, 0x78, 0x46, 0x3d, 0x32, 0x58, 0x00, 0xfb, 0x28, 0x95, + 0x7a, 0xf7, 0x11, 0x33, 0x5a, 0x5b, 0xb8, 0xfa, 0xe0, 0xa1, 0x84, 0x33, + 0x64, 0x36, 0x37, 0xd4, 0x64, 0x9e, 0xc1, 0x05, 0xca, 0x87, 0x1d, 0xee, + 0xc0, 0x4f, 0xf4, 0xd4, 0x81, 0x78, 0x54, 0x09, 0x93, 0x56, 0xc0, 0xb6, + 0x7e, 0x26, 0xc3, 0xed, 0xec, 0xdb, 0x24, 0x2c, 0x80, 0xee, 0x0a, 0x57, + 0x6c, 0xa9, 0x85, 0x1d, 0x44, 0xdd, 0x61, 0xb6, 0xfc, 0xdb, 0xd1, 0x00, + 0xd0, 0x5f, 0xbb, 0x43, 0xcf, 0x32, 0x41, 0x2a, 0xef, 0xe8, 0x57, 0x89, + 0x6d, 0xf0, 0x55, 0x8c, 0x54, 0x61, 0xee, 0x8a, 0xf7, 0x1d, 0x90, 0x51, + 0x0f, 0x4e, 0x56, 0xd6, 0x9d, 0x71, 0x77, 0x3c, 0xe5, 0x28, 0xee, 0xc1, + 0xd6, 0x8f, 0xef, 0xbd, 0x2c, 0x83, 0xb8, 0xce, 0x24, 0xa4, 0x41, 0x90, + 0x14, 0x59, 0xd2, 0xfc, 0xc3, 0x0e, 0x5d, 0xa9, 0x22, 0x3b, 0xd3, 0xa4, + 0x30, 0x19, 0x05, 0x64, 0x61, 0x1e, 0x86, 0xec, 0x89, 0x19, 0x75, 0xaf, + 0x80, 0xe3, 0xd7, 0xfb, 0x0a, 0x91, 0xe2, 0xb7, 0xd1, 0x6f, 0xe4, 0xcc, + 0x88, 0x92, 0x48, 0xed, 0x1e, 0x3b, 0xec, 0xa3, 0x9d, 
0xeb, 0x9e, 0x78, + 0xb1, 0xa9, 0x34, 0xc1, 0xb6, 0x9b, 0xfc, 0x1e, 0x2d, 0xef, 0xb1, 0x16, + 0x49, 0xd9, 0x60, 0xde, 0xd0, 0xf5, 0x69, 0x12, 0x4d, 0xa1, 0xf7, 0xb4, + 0x58, 0x3d, 0x9e, 0xb7, 0x93, 0xce, 0x37, 0x33, 0xc1, 0x82, 0x00, 0x74, + 0x20, 0xfb, 0x3f, 0x48, 0xb1, 0x52, 0x43, 0x0c, 0x5c, 0x96, 0x82, 0x84, + 0x4b, 0xe3, 0x8e, 0x1a, 0xe4, 0x64, 0x39, 0x6d, 0x98, 0x1c, 0x65, 0x11, + 0xb2, 0x06, 0xc1, 0xae, 0x01, 0x68, 0xd6, 0xfe, 0x57, 0xe7, 0x53, 0xd7, + 0xb1, 0x9f, 0xf9, 0xc8, 0x14, 0x17, 0x85, 0x5c, 0x41, 0x2b, 0x30, 0x60, + 0xbe, 0xfb, 0x61, 0x6b, 0xd9, 0x04, 0x24, 0xa3, 0xec, 0x84, 0x29, 0x20, + 0x6a, 0x70, 0x85, 0x1c, 0xf7, 0xe1, 0xce, 0x37, 0x14, 0x13, 0x6c, 0xfe, + 0xe5, 0x48, 0x6a, 0x33, 0xb9, 0xb9, 0xa2, 0x31, 0x5f, 0xec, 0x3f, 0x8b, + 0x16, 0x63, 0xb1, 0x2a, 0xb3, 0x6b, 0x07, 0x9a, 0x72, 0x62, 0xd2, 0x02, + 0xef, 0x4c, 0x77, 0x82, 0x1b, 0xaa, 0xbf, 0xc4, 0xe2, 0x21, 0x2f, 0xc9, + 0x31, 0x3d, 0x92, 0xf4, 0x73, 0x02, 0xa2, 0x47, 0x90, 0xed, 0xcb, 0xb2, + 0xb9, 0xcd, 0xbf, 0xab, 0x7f, 0x6a, 0xe8, 0xc5, 0x19, 0x9a, 0x6a, 0x02, + 0x98, 0x36, 0xfb, 0xe1, 0xae, 0xac, 0x6b, 0xca, 0x93, 0x59, 0xf1, 0x04, + 0x9e, 0xd8, 0x78, 0x1e, 0x6d, 0x85, 0x3e, 0x2a, 0xe8, 0x79, 0x12, 0x67, + 0x27, 0xb4, 0x9b, 0x66, 0xf7, 0xe2, 0x01, 0xf4, 0x98, 0xbe, 0xb0, 0x74, + 0x6d, 0x91, 0xf8, 0x3c, 0x18, 0x7e, 0xc6, 0x84, 0x10, 0x51, 0xda, 0x14, + 0x69, 0xc0, 0xac, 0xa4, 0xe3, 0xd8, 0x97, 0x73, 0xfe, 0xeb, 0xe0, 0xae, + 0x51, 0xc4, 0x22, 0xa5, 0x91, 0x6f, 0x87, 0x38, 0x39, 0x9e, 0x97, 0x3a, + 0xeb, 0x91, 0xa9, 0xbb, 0xf9, 0xa4, 0xca, 0x3e, 0xef, 0x17, 0x58, 0x79, + 0x45, 0x77, 0x2b, 0xe7, 0x50, 0x92, 0x85, 0x90, 0x12, 0xc5, 0xc3, 0xe3, + 0xcd, 0xc4, 0xbe, 0xbf, 0x6f, 0x47, 0x25, 0x88, 0xcd, 0x6c, 0xcc, 0xee, + 0x9e, 0x61, 0x1b, 0x78, 0x0d, 0xd7, 0xaa, 0x77, 0x1b, 0x05, 0x91, 0xe7, + 0x28, 0x09, 0xfc, 0x66, 0xe5, 0x7e, 0x28, 0xb3, 0x1d, 0x29, 0x33, 0x09, + 0xdd, 0x15, 0x28, 0x17, 0x40, 0xa2, 0xef, 0xf4, 0xb9, 0x98, 0x2d, 0xc4, + 0x22, 0x9b, 0x1e, 0xc1, 0xef, 0x49, 0x5f, 0xe7, 0xb9, 
0xf2, 0x5b, 0x74, + 0x6c, 0xd0, 0x11, 0xca, 0x5a, 0x3c, 0xa5, 0x78, 0xe7, 0xd2, 0xd7, 0x05, + 0x86, 0x06, 0x7d, 0xe9, 0x13, 0x6f, 0xcd, 0x12, 0x26, 0x51, 0x9c, 0xf8, + 0xcb, 0x5f, 0x38, 0x89, 0x32, 0xbb, 0xaa, 0x8c, 0xb9, 0xac, 0x67, 0x19, + 0xee, 0xf8, 0x9f, 0x8b, 0x89, 0xca, 0xbc, 0x89, 0x13, 0xd3, 0x7e, 0x2b, + 0xa7, 0xb0, 0x8c, 0x5f, 0x4e, 0x89, 0xad, 0x75, 0xc7, 0xc1, 0xf1, 0xd3, + 0xa1, 0x0d, 0xc1, 0xbc, 0x7f, 0x60, 0xcf, 0x83, 0x5f, 0x27, 0x43, 0xe1, + 0x2f, 0x45, 0x0c, 0x01, 0x65, 0x73, 0x31, 0x34, 0x63, 0x52, 0x44, 0x0e, + 0x2a, 0x6c, 0x10, 0x29, 0xf4, 0xad, 0x51, 0x07, 0x58, 0x41, 0xc6, 0x74, + 0xac, 0x8c, 0xfc, 0x01, 0xba, 0x80, 0xb9, 0x2d, 0x02, 0xc6, 0x96, 0x1d, + 0xcf, 0x6d, 0xad, 0xb2, 0xc1, 0x8f, 0xec, 0x6d, 0xdd, 0xb6, 0x70, 0x4c, + 0x99, 0x33, 0x16, 0xb2, 0x60, 0x34, 0xc3, 0xaa, 0x15, 0x09, 0xf2, 0x62, + 0xf2, 0x95, 0x80, 0xf2, 0xc1, 0xca, 0x6d, 0x20, 0x67, 0x79, 0x7a, 0x29, + 0xfa, 0xbb, 0x19, 0xd8, 0xd8, 0x20, 0xa7, 0x56, 0x84, 0xce, 0x96, 0x2c, + 0x92, 0x0d, 0xf7, 0x00, 0x66, 0xc5, 0x8e, 0x05, 0x59, 0x3e, 0xf7, 0xa9, + 0x14, 0x6b, 0x50, 0xaf, 0x83, 0xfd, 0x7b, 0x58, 0x5c, 0x2c, 0x6b, 0x7f, + 0x84, 0x60, 0x53, 0xbb, 0x02, 0x3e, 0xad, 0x6c, 0x58, 0xe4, 0x0f, 0x7b, + 0x77, 0x54, 0xd8, 0x11, 0x35, 0x5d, 0x98, 0x63, 0x51, 0x0c, 0x3f, 0x16, + 0x3f, 0x8a, 0x1e, 0xf3, 0x58, 0x59, 0xb1, 0x32, 0x98, 0x10, 0xda, 0x94, + 0x61, 0x81, 0x78, 0xf4, 0xec, 0x3b, 0x88, 0xc6, 0x47, 0xb1, 0xad, 0x01, + 0xe6, 0xdb, 0x70, 0xa8, 0x13, 0x59, 0xbb, 0x67, 0x10, 0xeb, 0x7c, 0x5c, + 0xba, 0x5d, 0xba, 0xdf, 0x81, 0xf0, 0xe9, 0x80, 0x75, 0xe4, 0x61, 0xb0, + 0x72, 0xcf, 0x3b, 0x7a, 0xeb, 0xf0, 0xc1, 0xdf, 0xb9, 0xcc, 0x6f, 0x07, + 0x75, 0xce, 0x22, 0xc8, 0x90, 0x3f, 0x90, 0xc9, 0xf3, 0x68, 0x91, 0xc8, + 0xf2, 0x51, 0x2e, 0x72, 0x54, 0x40, 0x76, 0xdb, 0x9c, 0x75, 0x70, 0x5b, + 0x5f, 0xd4, 0x5a, 0x45, 0x81, 0x89, 0x4a, 0xd8, 0x76, 0x22, 0x3e, 0xba, + 0xaf, 0x80, 0xee, 0xe6, 0x8e, 0xd1, 0x14, 0xad, 0x24, 0x6f, 0x54, 0x71, + 0x7d, 0x9f, 0x94, 0x25, 0x19, 0x39, 0xcc, 0x11, 0x11, 
0x1a, 0x15, 0xcb, + 0xc7, 0x02, 0xcd, 0x82, 0xf5, 0x7c, 0xbf, 0x11, 0x32, 0xd2, 0x33, 0xc6, + 0xee, 0x05, 0x02, 0x46, 0x9d, 0xe6, 0xaf, 0xdd, 0xde, 0x94, 0xac, 0x35, + 0xf1, 0x4f, 0x21, 0xbe, 0xab, 0x05, 0x78, 0xae, 0xd4, 0x6c, 0x64, 0x10, + 0x71, 0xc4, 0x64, 0x11, 0x5d, 0x06, 0xc7, 0xdf, 0xfd, 0x1b, 0x90, 0x81, + 0x75, 0xd2, 0xee, 0x2f, 0x8e, 0x5a, 0xe2, 0xc6, 0x12, 0x95, 0xda, 0xfb, + 0xf1, 0xea, 0xf2, 0xfc, 0xcc, 0x09, 0x8b, 0xd3, 0x0d, 0x76, 0x20, 0xb1, + 0xe6, 0x62, 0xe4, 0xee, 0xfe, 0x1e, 0x45, 0x70, 0x25, 0x91, 0x61, 0x3a, + 0xb5, 0xd5, 0x86, 0xa3, 0xb6, 0x96, 0x9c, 0xc0, 0xa3, 0x61, 0x77, 0x30, + 0xc4, 0x83, 0x1b, 0xeb, 0xa6, 0x46, 0x5c, 0xe7, 0xb6, 0x2b, 0xb2, 0xf6, + 0xce, 0xd8, 0xb0, 0xad, 0xff, 0x8d, 0x70, 0x19, 0x3d, 0x37, 0x6a, 0x77, + 0xc6, 0xac, 0x91, 0x20, 0xdd, 0x8c, 0x4a, 0x9f, 0x99, 0x3e, 0x9f, 0x57, + 0x7e, 0x81, 0x84, 0xd3, 0x4c, 0x56, 0x2e, 0xa4, 0xff, 0xa7, 0xa5, 0x25, + 0x86, 0xbf, 0x80, 0x04, 0x6e, 0x8f, 0xe8, 0xf9, 0x1f, 0x38, 0x60, 0xf6, + 0x2c, 0xae, 0x7f, 0xe0, 0xe5, 0x9f, 0x5e, 0x21, 0x08, 0xbb, 0xe1, 0x41, + 0x42, 0xc2, 0xa5, 0xd9, 0xfd, 0x73, 0x82, 0x86, 0xfa, 0x86, 0xe6, 0x91, + 0x53, 0x6f, 0x99, 0x21, 0x29, 0x09, 0xb9, 0xfe, 0x91, 0x20, 0x99, 0xd8, + 0xf1, 0xe9, 0xfc, 0x63, 0x2f, 0x51, 0x52, 0x7d, 0x3f, 0xe6, 0x88, 0xbd, + 0x31, 0xdf, 0x74, 0xa0, 0xb8, 0x5d, 0xf0, 0x9d, 0x07, 0x20, 0xd3, 0x26, + 0x7b, 0x80, 0xbd, 0x07, 0x6d, 0x32, 0xbb, 0x62, 0xd2, 0xec, 0x35, 0x3f, + 0x7e, 0x5c, 0x10, 0xd0, 0x94, 0xd9, 0x14, 0x09, 0xee, 0xdc, 0x6f, 0xcd, + 0xbb, 0x57, 0x1b, 0xed, 0x9a, 0x29, 0xfe, 0x34, 0x83, 0x3e, 0x08, 0x82, + 0xc1, 0xa1, 0xae, 0x8d, 0xc6, 0x7c, 0xa2, 0x5f, 0x49, 0x1e, 0x9a, 0x55, + 0xfc, 0x88, 0x21, 0x45, 0xde, 0xd1, 0x7e, 0x4c, 0xe0, 0x38, 0x9c, 0xea, + 0xd7, 0x18, 0x0a, 0xba, 0x80, 0x23, 0xd6, 0x3e, 0x2e, 0xe1, 0x3e, 0x34, + 0x83, 0x7b, 0x69, 0x42, 0xfc, 0xf6, 0xf0, 0x17, 0xc1, 0xac, 0x7f, 0x86, + 0xc1, 0x86, 0x81, 0x8c, 0x01, 0xe7, 0x70, 0x02, 0x55, 0x7a, 0x72, 0xf2, + 0xee, 0x4c, 0x96, 0xb9, 0xa1, 0xb9, 0x50, 0x67, 0x95, 
0x74, 0x42, 0xd1, + 0x16, 0x79, 0x90, 0xb2, 0x85, 0x78, 0x5b, 0x91, 0xf8, 0x59, 0x19, 0xe7, + 0x6e, 0xb6, 0xc6, 0xff, 0xe0, 0x27, 0x2a, 0x39, 0x82, 0x91, 0xf2, 0x5b, + 0xb4, 0x4c, 0x56, 0x22, 0x46, 0x06, 0x09, 0xf3, 0x6e, 0x2e, 0x69, 0x3b, + 0x58, 0x7f, 0xb9, 0x8e, 0x40, 0x59, 0x51, 0xb0, 0x2f, 0x4c, 0x0d, 0x7b, + 0x01, 0x9f, 0x0a, 0xf3, 0xdd, 0x38, 0xb3, 0xc9, 0x49, 0x15, 0xea, 0xaf, + 0x6a, 0xf0, 0x2a, 0xe7, 0x3d, 0x23, 0xb2, 0xe7, 0xf7, 0x5d, 0xb9, 0xa0, + 0x40, 0xd0, 0x4b, 0xcd, 0x95, 0xca, 0x54, 0xba, 0x25, 0x82, 0x11, 0xb8, + 0x45, 0x76, 0x4c, 0xa9, 0x6f, 0x46, 0x1e, 0xf0, 0xcb, 0xcb, 0x89, 0x92, + 0x2c, 0x5f, 0xdf, 0xbb, 0x80, 0xcb, 0x30, 0x5b, 0xbe, 0x29, 0x21, 0x23, + 0x2f, 0x87, 0xa0, 0x1e, 0xe8, 0xe4, 0x29, 0x8c, 0x77, 0x78, 0x10, 0xb1, + 0x2c, 0x35, 0x10, 0xa0, 0x17, 0x8c, 0x12, 0x57, 0x2e, 0xca, 0xae, 0xa7, + 0x56, 0x21, 0xde, 0x74, 0x17, 0xf9, 0xa4, 0xdd, 0x38, 0x9c, 0xb9, 0x0f, + 0x2a, 0xa7, 0x33, 0xb7, 0x22, 0xb3, 0x2b, 0xd9, 0xcd, 0x88, 0x3e, 0x86, + 0x85, 0x38, 0xae, 0xb5, 0x88, 0x8d, 0xa4, 0x8f, 0x99, 0x1e, 0x2f, 0x4a, + 0x8c, 0xfe, 0x58, 0x7f, 0xb4, 0x6e, 0xf2, 0x4f, 0x9e, 0x9f, 0x36, 0xec, + 0xf2, 0x5b, 0xd3, 0xa2, 0x76, 0xf5, 0xa2, 0x2f, 0xbb, 0x42, 0x1d, 0x9c, + 0xa5, 0xb9, 0x20, 0xb7, 0xcc, 0xdb, 0xd9, 0x5c, 0xe0, 0x69, 0x10, 0xbb, + 0xae, 0xa3, 0xb4, 0x69, 0x86, 0xa0, 0x7e, 0xfa, 0xb5, 0x6d, 0xf2, 0xac, + 0x3c, 0x96, 0xe5, 0xb6, 0x07, 0xe5, 0x8a, 0x97, 0xb8, 0x90, 0xcb, 0x5a, + 0x23, 0x75, 0x56, 0xba, 0xf1, 0xbb, 0xf6, 0x10, 0x69, 0xc1, 0xf2, 0x57, + 0xec, 0xab, 0x8b, 0xbe, 0x19, 0x0f, 0x72, 0x79, 0x46, 0xcc, 0xcb, 0xac, + 0xab, 0xbe, 0x94, 0xae, 0xa4, 0x52, 0x43, 0xf4, 0x74, 0x05, 0x95, 0x33, + 0x97, 0xd5, 0x7c, 0x41, 0x92, 0xef, 0x33, 0x02, 0x64, 0x57, 0x8e, 0x14, + 0x05, 0x2c, 0xf4, 0x14, 0xc6, 0x62, 0x16, 0xa5, 0xc6, 0xe1, 0x38, 0x19, + 0x01, 0x9f, 0xd2, 0x68, 0x18, 0x20, 0xaf, 0x81, 0xd1, 0x62, 0xd8, 0x32, + 0x7a, 0x0e, 0x75, 0x85, 0x47, 0x43, 0xc2, 0xee, 0x8d, 0x96, 0xc7, 0xae, + 0x00, 0xc3, 0x29, 0xc5, 0xf5, 0xf4, 0xca, 0xdd, 0x05, 
0x1b, 0xfd, 0x82, + 0x26, 0xa0, 0x67, 0xa0, 0x9f, 0xcb, 0x37, 0xe9, 0xe3, 0x3f, 0x79, 0x71, + 0x80, 0x7f, 0x12, 0xc2, 0xe8, 0x54, 0x60, 0x8b, 0x8b, 0xb9, 0x19, 0xe7, + 0x81, 0x70, 0x28, 0xd5, 0xf2, 0xb6, 0x3e, 0xb1, 0xc7, 0x66, 0x44, 0x4f, + 0xe3, 0xa8, 0xe1, 0xa1, 0x97, 0x9a, 0xea, 0x60, 0x6c, 0x1b, 0xdd, 0x93, + 0xd5, 0x76, 0x0e, 0xe8, 0xce, 0x8f, 0x75, 0x3f, 0x08, 0x55, 0x03, 0x55, + 0x2e, 0xd6, 0x93, 0xb8, 0x4a, 0xe4, 0xef, 0x18, 0xbc, 0x7d, 0x7e, 0x4a, + 0xae, 0x90, 0x5d, 0xd6, 0x53, 0x72, 0xff, 0x5b, 0xaa, 0x8b, 0x3b, 0x22, + 0xb4, 0x8d, 0x1c, 0xdc, 0x8d, 0x4d, 0x2b, 0xd6, 0x4a, 0x33, 0xaf, 0xa6, + 0x5c, 0xb3, 0x02, 0xfe, 0x57, 0x6e, 0x1f, 0xe8, 0x39, 0x0e, 0x62, 0x68, + 0x93, 0xa9, 0x7b, 0x64, 0x40, 0x2c, 0xc6, 0xea, 0xd6, 0x26, 0xf0, 0x0e, + 0x40, 0xdd, 0xf2, 0x6e, 0x6a, 0x2b, 0x9e, 0x0e, 0xd7, 0x03, 0xdf, 0x4c, + 0x00, 0x8a, 0x6a, 0xf4, 0x14, 0x4c, 0x5a, 0x61, 0x23, 0x93, 0x12, 0xeb, + 0x97, 0xe9, 0xfb, 0x6b, 0x63, 0xc4, 0x08, 0x54, 0xac, 0x01, 0xd0, 0xe8, + 0xb4, 0x6e, 0xee, 0x58, 0x3c, 0x4b, 0x89, 0xfa, 0xd8, 0x6e, 0x4f, 0x5c, + 0xb3, 0xdb, 0x51, 0x20, 0x63, 0x96, 0xa8, 0xef, 0x68, 0xbe, 0xbf, 0x0a, + 0x69, 0x71, 0x33, 0x2a, 0x57, 0x0c, 0x20, 0x21, 0x5f, 0xe5, 0x45, 0x4d, + 0xa0, 0x95, 0xd2, 0xb9, 0xe8, 0xc3, 0xbf, 0x6c, 0x77, 0xb8, 0xd8, 0x28, + 0x2b, 0xce, 0x68, 0x2c, 0xb5, 0x7b, 0xdd, 0x31, 0xae, 0xd8, 0xa4, 0xee, + 0x66, 0x7b, 0x3c, 0xc6, 0x6d, 0x58, 0xf3, 0xc2, 0xee, 0xff, 0x95, 0x98, + 0x2b, 0xff, 0x1a, 0x52, 0xc1, 0xca, 0xb2, 0xbe, 0x87, 0x79, 0x9b, 0xe1, + 0x19, 0x5c, 0xa5, 0x6c, 0x20, 0x8b, 0x2d, 0xa5, 0x46, 0x9d, 0xa7, 0x8a, + 0xa5, 0x2e, 0xb0, 0xe1, 0x54, 0x1d, 0x74, 0xb8, 0x2e, 0x55, 0x99, 0x74, + 0x7a, 0xe1, 0xe0, 0x81, 0xce, 0x64, 0x85, 0xb2, 0x5a, 0x1e, 0x57, 0xdc, + 0x60, 0xf0, 0xfe, 0x0f, 0xe2, 0x72, 0x96, 0xab, 0x68, 0x6d, 0xcb, 0x5a, + 0x1a, 0x32, 0xf0, 0x77, 0xda, 0x54, 0x36, 0x3d, 0x26, 0x12, 0x23, 0x23, + 0xcf, 0x01, 0xf3, 0x49, 0xe4, 0x68, 0x1f, 0x5b, 0x95, 0x50, 0x04, 0xb5, + 0x67, 0x49, 0xf5, 0x31, 0x0b, 0xb1, 0x2c, 0xf2, 0xb6, 
0x26, 0xb4, 0x5d, + 0x27, 0x9f, 0x3d, 0x7d, 0x2f, 0xf0, 0x06, 0xd4, 0x95, 0xdc, 0x38, 0x0f, + 0xdb, 0xa8, 0xc8, 0x23, 0x9f, 0xe6, 0x6c, 0x4d, 0xe7, 0xdc, 0xeb, 0x30, + 0xdf, 0xc1, 0xfa, 0xeb, 0xa0, 0xa0, 0x08, 0x0f, 0xdc, 0xad, 0xf6, 0xe3, + 0xb3, 0x62, 0x62, 0x5b, 0xe4, 0xf7, 0x14, 0xda, 0x07, 0x2f, 0x75, 0x35, + 0xe4, 0xaf, 0x7c, 0xda, 0x8e, 0x2a, 0xf1, 0x62, 0x60, 0x8d, 0x63, 0xe3, + 0x9a, 0x98, 0x6a, 0xbe, 0x9b, 0xfb, 0x51, 0xd9, 0x40, 0x8f, 0xf5, 0x5c, + 0xb5, 0x32, 0xc0, 0xfd, 0x9e, 0xce, 0xa3, 0x64, 0xe5, 0x38, 0xf9, 0xd6, + 0xef, 0x85, 0x25, 0x70, 0xb8, 0x38, 0xc5, 0x12, 0xb9, 0x5d, 0xd5, 0x57, + 0xf0, 0x29, 0x76, 0xa0, 0x1d, 0xc7, 0x9b, 0xf2, 0xf8, 0xe8, 0xb3, 0xea, + 0x10, 0x28, 0x3d, 0xf2, 0x71, 0xa8, 0x2d, 0x12, 0xfa, 0x6a, 0x74, 0x61, + 0x05, 0x16, 0x98, 0x63, 0xcc, 0x70, 0x78, 0xdb, 0xe6, 0x39, 0x45, 0x83, + 0xd2, 0x16, 0x6d, 0x8e, 0xf5, 0x0e, 0xbc, 0x13, 0xf5, 0x37, 0x0c, 0xb4, + 0x86, 0x10, 0x0b, 0x07, 0xb6, 0xc6, 0x59, 0xbe, 0xfa, 0x70, 0xc9, 0x6a, + 0x29, 0xa7, 0xbe, 0x94, 0x0d, 0xde, 0xb8, 0x01, 0x87, 0xb2, 0xa0, 0xcf, + 0xa0, 0xb7, 0x2d, 0x4d, 0x26, 0x69, 0x15, 0x34, 0x23, 0x52, 0x4a, 0x82, + 0xce, 0xd7, 0xe2, 0x7a, 0xd3, 0x4c, 0x4c, 0x57, 0x37, 0xb3, 0xc9, 0xc4, + 0x4f, 0xec, 0xad, 0xd2, 0x1b, 0x95, 0xef, 0xa8, 0x1f, 0x43, 0xb6, 0x4f, + 0x66, 0x5a, 0xad, 0xf2, 0x64, 0x13, 0x4b, 0x0f, 0x0a, 0x02, 0x00, 0x2a, + 0x11, 0xb8, 0x53, 0x09, 0xaa, 0x08, 0x2f, 0x8c, 0x30, 0x68, 0x8e, 0x9c, + 0xd1, 0xb8, 0x0c, 0xf5, 0xd0, 0xb6, 0x8e, 0x1c, 0xdb, 0x2e, 0x0e, 0xaa, + 0x86, 0xcc, 0x86, 0x16, 0x3b, 0x8f, 0xe0, 0xa3, 0xb0, 0x36, 0x50, 0x25, + 0x11, 0x16, 0xd2, 0xc4, 0xc7, 0x2b, 0xbf, 0xa5, 0x25, 0xc5, 0xcb, 0x6e, + 0xd3, 0x8b, 0xca, 0x24, 0xec, 0x69, 0x1a, 0x75, 0x38, 0x3f, 0x60, 0x9b, + 0x17, 0xf6, 0x9c, 0x9a, 0xc1, 0xb2, 0xeb, 0xac, 0x20, 0x43, 0x73, 0x5f, + 0x6d, 0x6a, 0x53, 0xcf, 0x4a, 0xfd, 0x42, 0xa7, 0x2d, 0x98, 0xb0, 0x53, + 0x44, 0xad, 0xe2, 0x28, 0xf9, 0x61, 0xc3, 0x94, 0xef, 0x7e, 0x70, 0x3c, + 0x09, 0xf5, 0xd0, 0x30, 0x10, 0xc9, 0x15, 0xb2, 0x99, 
0xbc, 0xb9, 0xbb, + 0x58, 0x72, 0x4d, 0x12, 0x80, 0x87, 0x8b, 0xd2, 0x3f, 0xf2, 0xa3, 0xc8, + 0x92, 0xde, 0xb1, 0xf8, 0xf4, 0x2c, 0x3b, 0x1a, 0x37, 0x10, 0xb9, 0xea, + 0x32, 0xc9, 0x34, 0xac, 0x3e, 0x6d, 0xb6, 0xd2, 0x7f, 0x02, 0x27, 0xfa, + 0x36, 0xca, 0x29, 0x18, 0x35, 0x7a, 0x3d, 0x75, 0x36, 0x38, 0x98, 0x8f, + 0x96, 0xa4, 0xaf, 0x17, 0x22, 0x2e, 0xdb, 0xaf, 0xbe, 0xd4, 0x1b, 0x42, + 0x92, 0x37, 0x1b, 0x85, 0x97, 0xd6, 0x98, 0xf6, 0x20, 0xfb, 0x9f, 0xbd, + 0xee, 0x7b, 0xd5, 0xb2, 0xed, 0x26, 0xfd, 0xc3, 0xd7, 0x36, 0x93, 0xb5, + 0xeb, 0xe9, 0xe2, 0xd9, 0xcc, 0x5c, 0x15, 0xf1, 0x63, 0xa2, 0xc8, 0x21, + 0xff, 0x3e, 0x06, 0x2e, 0x09, 0xe5, 0x2c, 0x57, 0xfe, 0x66, 0x36, 0x26, + 0x00, 0xd2, 0x2f, 0xe1, 0x4b, 0xfb, 0x55, 0x38, 0xfa, 0x29, 0xff, 0x3d, + 0x12, 0x28, 0xf8, 0xd8, 0xc0, 0x19, 0x55, 0xd3, 0xc6, 0xde, 0xb6, 0xa4, + 0xf7, 0x1a, 0x8c, 0xd3, 0x2b, 0x32, 0xc3, 0x45, 0x69, 0xd0, 0xc2, 0xf3, + 0x44, 0x07, 0x9a, 0x30, 0x3a, 0x68, 0xab, 0xdb, 0xa3, 0x05, 0x90, 0x57, + 0xef, 0x93, 0x8a, 0x09, 0x3a, 0xdc, 0xc1, 0x14, 0x00, 0x00, 0x3f, 0xd3, + 0x11, 0x16, 0x07, 0x89, 0xe8, 0x62, 0x63, 0x5a, 0x12, 0xba, 0x69, 0x56, + 0x66, 0x7e, 0xa8, 0xac, 0x65, 0xe4, 0xd3, 0xa3, 0xfa, 0x14, 0x63, 0xac, + 0x4b, 0x6e, 0xa5, 0x3d, 0x2e, 0xd7, 0xfa, 0xe0, 0xcf, 0x2f, 0x02, 0x80, + 0x6a, 0xfa, 0x35, 0xfe, 0xac, 0x29, 0x99, 0xa6, 0x07, 0xec, 0x54, 0xcb, + 0x0f, 0x16, 0x84, 0x6e, 0x65, 0x99, 0x52, 0x7c, 0xd7, 0x02, 0xa1, 0xdc, + 0xa8, 0x0d, 0x79, 0x4e, 0x1d, 0x87, 0x6f, 0xad, 0xe2, 0xb3, 0xed, 0x92, + 0xea, 0xe1, 0xcb, 0xa2, 0x50, 0xf0, 0x52, 0x53, 0x00, 0xaf, 0x02, 0xab, + 0xb3, 0xda, 0xed, 0xa8, 0x68, 0xc7, 0xf4, 0xb6, 0xa6, 0x7e, 0x43, 0x9c, + 0x4b, 0x18, 0x23, 0x3d, 0x02, 0x5a, 0xc9, 0x1b, 0x55, 0xda, 0x93, 0xc7, + 0x8d, 0x4d, 0xdd, 0xd3, 0xb2, 0x48, 0xf9, 0x6b, 0x98, 0x98, 0x12, 0x2a, + 0x59, 0x19, 0x7f, 0xfb, 0xc8, 0x2e, 0x08, 0x21, 0xc3, 0x49, 0x9c, 0x86, + 0xf4, 0xf8, 0x32, 0x82, 0x97, 0x49, 0x58, 0x1c, 0x3a, 0x22, 0xd3, 0x24, + 0x13, 0xf2, 0xc7, 0xa5, 0x71, 0x76, 0x40, 0x4c, 0x4a, 
0x21, 0x04, 0x18, + 0x8c, 0xcc, 0x15, 0x37, 0xa9, 0xf6, 0x3b, 0x79, 0xe5, 0xc3, 0x7a, 0xba, + 0x2a, 0x5c, 0xc1, 0x35, 0x14, 0x5b, 0xd1, 0x13, 0x66, 0xaf, 0xe3, 0xc8, + 0xb9, 0x50, 0x82, 0x26, 0x5d, 0x6b, 0xc7, 0x72, 0x19, 0x4b, 0x7c, 0xa9, + 0xd6, 0xa3, 0xf8, 0x5a, 0xd6, 0x0e, 0xc6, 0x4e, 0xa0, 0x5a, 0xe5, 0x59, + 0x84, 0x6b, 0x42, 0x2d, 0x2d, 0x4d, 0x52, 0x62, 0x36, 0x11, 0x05, 0x3f, + 0xc8, 0x0b, 0xcf, 0x53, 0xd7, 0x5e, 0xb7, 0x18, 0x9e, 0xa4, 0xe0, 0xba, + 0xe2, 0x48, 0xb8, 0x9d, 0x97, 0x88, 0xa0, 0xd8, 0x47, 0x97, 0x82, 0x3b, + 0x08, 0x0b, 0x8b, 0x89, 0x6c, 0xaf, 0x95, 0xf2, 0xd7, 0x08, 0x1c, 0x9d, + 0x98, 0x0c, 0x20, 0x37, 0x3e, 0xc8, 0x18, 0xd3, 0x53, 0x9c, 0x4f, 0x4d, + 0x14, 0xd1, 0xac, 0xf9, 0x54, 0xf1, 0x54, 0x66, 0x39, 0x24, 0x22, 0x1a, + 0xfb, 0xf7, 0x2a, 0x1d, 0x13, 0x09, 0x58, 0x31, 0x4a, 0x0f, 0xac, 0x67, + 0xa6, 0xbe, 0xe8, 0x36, 0x1b, 0xd6, 0x05, 0xb3, 0x9a, 0xbb, 0x37, 0xb9, + 0xf5, 0x48, 0xe5, 0x89, 0x0c, 0x89, 0xbb, 0x02, 0x26, 0x86, 0x1d, 0x82, + 0xb9, 0xe7, 0xd7, 0x8e, 0x50, 0x19, 0xfc, 0xac, 0x6f, 0x65, 0xb5, 0xb2, + 0x21, 0xf7, 0xd4, 0x76, 0x92, 0x6e, 0x6f, 0x56, 0x01, 0x94, 0x01, 0xab, + 0x9e, 0x89, 0x05, 0x77, 0xa9, 0x65, 0x12, 0x2c, 0x62, 0xb2, 0xb1, 0xcc, + 0xac, 0x46, 0x02, 0x9f, 0x09, 0x7d, 0xa2, 0xac, 0x9e, 0x40, 0x83, 0x2a, + 0xd7, 0x9e, 0xc3, 0x7c, 0xa4, 0x8c, 0xa6, 0x01, 0xe3, 0x61, 0xc0, 0x09, + 0xb9, 0xdb, 0x8a, 0xfc, 0x11, 0x03, 0xa2, 0xbb, 0x1b, 0x13, 0x59, 0x9e, + 0xb6, 0x9b, 0x4b, 0xe8, 0x30, 0x4e, 0x6b, 0x8d, 0xd7, 0x04, 0x20, 0x3d, + 0x82, 0x53, 0xfb, 0x75, 0x6f, 0x43, 0xdf, 0x05, 0x44, 0xb1, 0x7a, 0x94, + 0x83, 0x9d, 0x55, 0x3d, 0x65, 0xa4, 0xdf, 0x78, 0x4a, 0xf1, 0x15, 0xb3, + 0x6b, 0x4a, 0x23, 0xfa, 0xcf, 0x52, 0xc6, 0xa4, 0x3c, 0x98, 0x54, 0xc6, + 0x3b, 0x5f, 0x90, 0x83, 0x67, 0x70, 0xbe, 0x5d, 0x93, 0x24, 0x32, 0xb2, + 0xbc, 0x56, 0x81, 0x89, 0x91, 0x8d, 0xf9, 0x17, 0x56, 0x2d, 0xd4, 0xec, + 0x0d, 0x17, 0xd9, 0x35, 0x05, 0x46, 0x29, 0x60, 0xf0, 0x6f, 0x3a, 0x9d, + 0x20, 0x5d, 0xee, 0x3c, 0x30, 0x72, 0xee, 0x39, 0xbc, 
0x2b, 0xf8, 0xcb, + 0x94, 0xff, 0x65, 0x8e, 0x1c, 0x51, 0xfa, 0x1e, 0xd9, 0xb9, 0xc2, 0xcb, + 0x29, 0x4b, 0x45, 0x67, 0x70, 0x7c, 0x38, 0xf1, 0xd8, 0xd1, 0xba, 0xa9, + 0xae, 0x1f, 0xff, 0x80, 0x76, 0x8b, 0x5b, 0xe0, 0x2c, 0xcd, 0xe5, 0x9f, + 0x39, 0x38, 0x62, 0x91, 0x62, 0x40, 0x37, 0xf9, 0x87, 0x65, 0xc9, 0x62, + 0x53, 0xe9, 0xbf, 0xb0, 0x22, 0x72, 0x9e, 0x85, 0xf8, 0xa3, 0x9e, 0xa8, + 0x71, 0x46, 0xef, 0xd5, 0xec, 0x5d, 0xa0, 0x47, 0x58, 0x96, 0xc7, 0x53, + 0xdf, 0x69, 0x9e, 0xc5, 0xb0, 0x6c, 0x34, 0x6a, 0x85, 0xb3, 0xa2, 0xa2, + 0x6d, 0x04, 0x0c, 0xd0, 0x31, 0xdd, 0x0c, 0x78, 0x69, 0x9e, 0x4b, 0x79, + 0xde, 0xb6, 0x1d, 0x86, 0x83, 0x04, 0x98, 0x54, 0x97, 0x04, 0xae, 0xa2, + 0xc1, 0xd9, 0xe5, 0x3d, 0xda, 0x98, 0x02, 0xb5, 0xad, 0x67, 0x0b, 0x7c, + 0xb3, 0x6d, 0x0a, 0x40, 0xbe, 0xf2, 0x3a, 0x63, 0x1a, 0x34, 0x64, 0xbe, + 0x76, 0x83, 0x82, 0x39, 0xfa, 0xb6, 0x6f, 0xc1, 0x9a, 0x9f, 0xb1, 0x5c, + 0x8c, 0x86, 0x39, 0xf5, 0xcf, 0x78, 0x5e, 0x9e, 0x07, 0x17, 0xcb, 0x69, + 0x74, 0x19, 0x4a, 0x48, 0x48, 0x50, 0xda, 0xae, 0xfa, 0xa2, 0xd4, 0xb8, + 0x11, 0x00, 0x1a, 0x22, 0x69, 0x9c, 0x83, 0xf8, 0x00, 0x17, 0x65, 0xf9, + 0x18, 0xc9, 0xef, 0x99, 0x0f, 0x91, 0x4b, 0x90, 0x4d, 0xb5, 0x8c, 0x20, + 0x76, 0x7a, 0x2b, 0x7c, 0x21, 0xf6, 0x2e, 0x45, 0x32, 0xd3, 0x53, 0x80, + 0xd0, 0xfc, 0xb1, 0xd2, 0x1c, 0x27, 0x33, 0x75, 0x4c, 0x6a, 0xaa, 0xfb, + 0x68, 0x7b, 0x10, 0x6c, 0x93, 0xca, 0x2f, 0xa6, 0x50, 0x9c, 0x55, 0x63, + 0x65, 0x4f, 0xd9, 0xcf, 0xee, 0x88, 0x8c, 0xf5, 0x96, 0x9a, 0x72, 0x52, + 0x2e, 0xe5, 0xc0, 0xdf, 0xbc, 0x95, 0x68, 0x82, 0x97, 0xe8, 0x4f, 0xe2, + 0x4c, 0x3a, 0x8c, 0xe5, 0xe4, 0x36, 0xd9, 0x7d, 0xd0, 0xd1, 0xce, 0xd6, + 0xd4, 0x50, 0xd3, 0xee, 0x77, 0x14, 0x3e, 0x14, 0x1a, 0x47, 0x1d, 0xa7, + 0x3b, 0x30, 0x1f, 0x99, 0x6b, 0x1e, 0x93, 0x27, 0xbd, 0x62, 0x09, 0x27, + 0xbc, 0x9f, 0xcd, 0x94, 0x3b, 0x97, 0x89, 0x23, 0xc3, 0x56, 0xde, 0x87, + 0x92, 0xa8, 0xc9, 0x3e, 0x37, 0x6a, 0x14, 0xf2, 0x84, 0x79, 0x23, 0x1d, + 0xc0, 0x8f, 0x25, 0xc0, 0xc1, 0x0a, 0x22, 0x45, 0x5f, 
0xf4, 0x4e, 0xbf, + 0x3f, 0x71, 0x88, 0x9b, 0x36, 0x20, 0x5b, 0x96, 0xc0, 0x20, 0xba, 0x15, + 0x8d, 0x7d, 0xef, 0x96, 0x1f, 0x79, 0xb5, 0x8c, 0x4c, 0x51, 0xd9, 0x38, + 0x35, 0x57, 0x24, 0x73, 0xbe, 0x21, 0xb1, 0xf7, 0x23, 0x89, 0x3c, 0x13, + 0x13, 0xd9, 0x70, 0x12, 0x8c, 0x41, 0x18, 0xab, 0xb9, 0xb0, 0x9e, 0x11, + 0x4e, 0x1d, 0xb6, 0x99, 0xd2, 0xb2, 0x9f, 0x14, 0x5b, 0x15, 0xfd, 0xc8, + 0x2f, 0xfb, 0xac, 0x10, 0xcc, 0x37, 0x92, 0xd9, 0x54, 0xab, 0x83, 0x0c, + 0xc8, 0xad, 0x4d, 0xe0, 0x33, 0x41, 0x1e, 0xd8, 0xce, 0xff, 0x44, 0x87, + 0x64, 0x04, 0x30, 0x33, 0x55, 0x3f, 0x1b, 0xe2, 0x3b, 0x42, 0x2b, 0x3c, + 0x52, 0x28, 0xbd, 0x35, 0x44, 0xa1, 0xc9, 0xa4, 0x27, 0x23, 0x52, 0xe9, + 0x74, 0xbc, 0x70, 0x1a, 0x54, 0x65, 0x1c, 0x02, 0xeb, 0xa0, 0x41, 0x8e, + 0x3f, 0x14, 0x92, 0x04, 0x44, 0x1d, 0x30, 0x1e, 0x33, 0x31, 0x17, 0x4f, + 0xed, 0xbc, 0xc5, 0xa1, 0x8f, 0xc7, 0x54, 0x72, 0xee, 0x93, 0x7e, 0x0a, + 0x57, 0x00, 0xdd, 0x93, 0xc5, 0xe5, 0xc7, 0xa1, 0x37, 0x7f, 0x66, 0x07, + 0xa8, 0x94, 0x90, 0x61, 0xd5, 0x08, 0x9d, 0xd7, 0xce, 0x1f, 0x76, 0xff, + 0x6e, 0xaa, 0x50, 0x93, 0x36, 0xda, 0x34, 0xf9, 0x85, 0x7f, 0x0f, 0x87, + 0x90, 0x93, 0x9e, 0x54, 0x5d, 0x48, 0xe7, 0xa3, 0x66, 0xb6, 0xb8, 0xd1, + 0x09, 0x69, 0x55, 0x93, 0xcb, 0x70, 0x2d, 0xea, 0xe6, 0x6c, 0xe5, 0x54, + 0x08, 0x7d, 0x26, 0x38, 0x1c, 0x8c, 0x1d, 0xc1, 0xfd, 0x54, 0xa3, 0x3e, + 0x5a, 0x1f, 0x7a, 0xb9, 0x17, 0xec, 0x14, 0x1a, 0xba, 0x73, 0x97, 0x34, + 0x04, 0x03, 0xc9, 0x90, 0xfc, 0x21, 0xcc, 0xb5, 0xec, 0x9b, 0x25, 0x81, + 0x96, 0x5c, 0xc5, 0x00, 0xa8, 0xdd, 0x32, 0x5b, 0xdc, 0xd1, 0x23, 0xdb, + 0xc2, 0x0c, 0xed, 0x22, 0xb2, 0xd5, 0x49, 0xae, 0xb1, 0x10, 0x10, 0xb8, + 0xdf, 0xb0, 0xf8, 0x51, 0xee, 0x6f, 0x23, 0x85, 0xb8, 0xe7, 0x45, 0x17, + 0x21, 0x32, 0x17, 0x66, 0x73, 0x31, 0x09, 0x02, 0xa6, 0x82, 0x30, 0x7f, + 0x9d, 0xc4, 0x54, 0x41, 0x83, 0x5e, 0xc2, 0x42, 0xff, 0x0f, 0x85, 0x40, + 0xe6, 0x86, 0x6c, 0x31, 0x9c, 0xbb, 0x86, 0xa9, 0xde, 0xf6, 0xda, 0x34, + 0x1b, 0x22, 0x4b, 0xba, 0x65, 0x31, 0x4f, 0xbb, 0x28, 
0x58, 0x9d, 0x96, + 0x73, 0x5f, 0x5c, 0x16, 0x42, 0x15, 0x47, 0x3e, 0xa0, 0xf5, 0xf1, 0x0d, + 0xbe, 0x51, 0x21, 0x64, 0x8d, 0xf1, 0x93, 0x21, 0x62, 0x2c, 0x34, 0x25, + 0x99, 0x64, 0xe2, 0x70, 0xb4, 0x34, 0x17, 0x6f, 0x92, 0x38, 0x31, 0x20, + 0x45, 0x6d, 0x84, 0x5e, 0xe4, 0xd0, 0x3f, 0x78, 0xab, 0xa9, 0x8f, 0x33, + 0xa3, 0xc7, 0x74, 0x6f, 0x33, 0x38, 0x17, 0xe8, 0xa9, 0x55, 0xa7, 0x5a, + 0x51, 0xf1, 0x94, 0xe2, 0xa7, 0x25, 0x06, 0x6d, 0x1c, 0x2b, 0xf8, 0x1c, + 0xcf, 0xff, 0xab, 0xc0, 0xdc, 0x64, 0xc5, 0x77, 0xcb, 0x3c, 0x8a, 0xac, + 0x4d, 0x07, 0xc5, 0x4f, 0x7d, 0x64, 0xee, 0x20, 0x30, 0xa9, 0x08, 0x88, + 0xf1, 0x6c, 0x53, 0x4f, 0x60, 0xe0, 0x46, 0x1e, 0x2d, 0xf9, 0x59, 0xfd, + 0xd6, 0x52, 0x09, 0x56, 0x19, 0x5c, 0x14, 0x47, 0xd2, 0x9c, 0x8f, 0x59, + 0x64, 0xde, 0x15, 0xeb, 0xec, 0x60, 0xc4, 0x9f, 0xbc, 0xd3, 0xde, 0x37, + 0x0a, 0xc0, 0x68, 0xcb, 0xc5, 0x2f, 0x91, 0xf1, 0xb4, 0x33, 0x82, 0x9d, + 0x1e, 0x33, 0x2e, 0x9d, 0xfe, 0x9a, 0x5f, 0x8a, 0xa9, 0x19, 0x9e, 0x5b, + 0xdb, 0x2c, 0x9a, 0xcf, 0x99, 0x58, 0x7f, 0x00, 0xdd, 0x85, 0x96, 0xff, + 0x3e, 0xd0, 0x0a, 0xd2, 0xa3, 0xc5, 0x93, 0xba, 0xa3, 0x8a, 0x6e, 0x9a, + 0x36, 0xa9, 0x85, 0x89, 0x8a, 0x96, 0x5b, 0x94, 0x0e, 0xc3, 0x02, 0xe2, + 0x5d, 0x62, 0x8b, 0x2c, 0x55, 0x47, 0xfb, 0x50, 0xde, 0x18, 0x01, 0x03, + 0xe4, 0xfb, 0xf7, 0x42, 0x22, 0xfb, 0xd4, 0xfd}; diff --git a/third_party/boringssl/src/crypto/fipsmodule/slhdsa/fors.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/slhdsa/fors.cc.inc new file mode 100644 index 00000000..8ef61c5f --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/slhdsa/fors.cc.inc @@ -0,0 +1,179 @@ +// Copyright 2024 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + +#include "../../internal.h" +#include "./address.h" +#include "./fors.h" +#include "./params.h" +#include "./thash.h" + + +using namespace bssl; + +// Compute the base 2^a representation of `message` (algorithm 4, page 16). +static void fors_base_b(const slh_dsa_config *config, uint32_t *indices, + const uint8_t *message) { + const size_t bits_per_index = config->fors_height; + BSSL_CHECK(bits_per_index <= SLHDSA_MAX_FORS_HEIGHT); + const size_t total_bits = bits_per_index * config->fors_trees; + const size_t msg_bytes = slhdsa_fors_msg_bytes(config); + BSSL_CHECK(total_bits <= msg_bytes * 8); + + size_t bit_offset = 0; + for (size_t i = 0; i < config->fors_trees; ++i) { + uint32_t value = 0; + for (size_t j = 0; j < bits_per_index; ++j) { + const size_t bit_index = bit_offset + j; + const size_t byte_index = bit_index >> 3; + const size_t bit_in_byte = 7 - (bit_index & 7); + const uint8_t bit = (message[byte_index] >> bit_in_byte) & 0x01; + value = (value << 1) | bit; + } + indices[i] = value; + bit_offset += bits_per_index; + } +} + +// Implements Algorithm 14: fors_skGen function (page 29) +void bssl::slhdsa_fors_sk_gen(const slh_dsa_config *config, uint8_t *fors_sk, + uint32_t idx, const uint8_t *sk_seed, + const uint8_t *pk_seed, uint8_t addr[32]) { + uint8_t sk_addr[32]; + OPENSSL_memcpy(sk_addr, addr, sizeof(sk_addr)); + + slhdsa_set_type(config, sk_addr, SLHDSA_ADDR_TYPE_FORSPRF); + slhdsa_copy_keypair_addr(config, sk_addr, addr); + slhdsa_set_tree_index(config, sk_addr, idx); + slhdsa_thash_prf(config, 
fors_sk, pk_seed, sk_seed, sk_addr); +} + +// Implements Algorithm 15: fors_node function (page 30) +void bssl::slhdsa_fors_treehash(const slh_dsa_config *config, + uint8_t *root_node, const uint8_t *sk_seed, + uint32_t i /*target node index*/, + uint32_t z /*target node height*/, + const uint8_t *pk_seed, uint8_t addr[32]) { + BSSL_CHECK(z <= config->fors_height); + const uint32_t nodes_in_tree = 1u << (config->fors_height - z); + BSSL_CHECK(i < (uint32_t)(config->fors_trees * nodes_in_tree)); + + if (z == 0) { + uint8_t sk[SLHDSA_MAX_N]; + slhdsa_set_tree_height(config, addr, 0); + slhdsa_set_tree_index(config, addr, i); + slhdsa_fors_sk_gen(config, sk, i, sk_seed, pk_seed, addr); + slhdsa_thash_f(config, root_node, sk, pk_seed, addr); + } else { + // Stores left node and right node. + uint8_t nodes[2 * SLHDSA_MAX_N]; + slhdsa_fors_treehash(config, nodes, sk_seed, 2 * i, z - 1, pk_seed, addr); + slhdsa_fors_treehash(config, nodes + config->n, sk_seed, 2 * i + 1, z - 1, + pk_seed, addr); + slhdsa_set_tree_height(config, addr, z); + slhdsa_set_tree_index(config, addr, i); + slhdsa_thash_h(config, root_node, nodes, pk_seed, addr); + } +} + +// Implements Algorithm 16: fors_sign function (page 31) +void bssl::slhdsa_fors_sign(const slh_dsa_config *config, uint8_t *fors_sig, + const uint8_t *message, const uint8_t *sk_seed, + const uint8_t *pk_seed, uint8_t addr[32]) { + uint32_t indices[SLHDSA_MAX_FORS_TREES]; + BSSL_CHECK(config->fors_trees <= SLHDSA_MAX_FORS_TREES); + BSSL_CHECK(config->fors_height <= SLHDSA_MAX_FORS_HEIGHT); + + // Derive FORS indices compatible with the NIST changes. + fors_base_b(config, indices, message); + + const size_t n = config->n; + const size_t node_stride = n * (config->fors_height + 1); + + for (uint32_t i = 0; i < config->fors_trees; ++i) { + slhdsa_set_tree_height(config, addr, 0); + // Write the FORS secret key element to the correct position. 
+ uint8_t *tree_sig = fors_sig + i * node_stride; + slhdsa_fors_sk_gen(config, tree_sig, + i * (1u << config->fors_height) + indices[i], sk_seed, + pk_seed, addr); + for (uint32_t j = 0; j < config->fors_height; ++j) { + const size_t sibling = (indices[i] >> j) ^ 1u; + // Write the FORS auth path element to the correct position. + slhdsa_fors_treehash(config, tree_sig + (j + 1) * n, sk_seed, + i * (1u << (config->fors_height - j)) + sibling, j, + pk_seed, addr); + } + } +} + +// Implements Algorithm 17: fors_pkFromSig function (page 32) +void bssl::slhdsa_fors_pk_from_sig(const slh_dsa_config *config, + uint8_t *fors_pk, const uint8_t *fors_sig, + const uint8_t *message, + const uint8_t *pk_seed, uint8_t addr[32]) { + uint32_t indices[SLHDSA_MAX_FORS_TREES]; + uint8_t tmp[2 * SLHDSA_MAX_N]; + uint8_t roots[SLHDSA_MAX_FORS_TREES * SLHDSA_MAX_N]; + BSSL_CHECK(config->fors_trees <= SLHDSA_MAX_FORS_TREES); + BSSL_CHECK(config->fors_height <= SLHDSA_MAX_FORS_HEIGHT); + + // Derive FORS indices compatible with the NIST changes. 
+ fors_base_b(config, indices, message); + + const size_t n = config->n; + const size_t node_stride = n * (config->fors_height + 1); + + for (size_t i = 0; i < config->fors_trees; ++i) { + // Pointer to current sk and authentication path + const uint8_t *tree_sig = fors_sig + i * node_stride; + const uint8_t *auth = tree_sig + n; + uint8_t nodes[2 * SLHDSA_MAX_N]; + + slhdsa_set_tree_height(config, addr, 0); + uint32_t index = i * (1ULL << config->fors_height) + indices[i]; + slhdsa_set_tree_index(config, addr, index); + + slhdsa_thash_f(config, nodes, tree_sig, pk_seed, addr); + + for (uint32_t j = 0; j < config->fors_height; ++j) { + slhdsa_set_tree_height(config, addr, j + 1); + index >>= 1; + slhdsa_set_tree_index(config, addr, index); + + // Even node + if (((indices[i] >> j) & 1) == 0) { + OPENSSL_memcpy(tmp, nodes, n); + OPENSSL_memcpy(tmp + n, auth + j * n, n); + slhdsa_thash_h(config, nodes + n, tmp, pk_seed, addr); + } else { + OPENSSL_memcpy(tmp, auth + j * n, n); + OPENSSL_memcpy(tmp + n, nodes, n); + slhdsa_thash_h(config, nodes + n, tmp, pk_seed, addr); + } + OPENSSL_memcpy(nodes, nodes + n, n); + } + OPENSSL_memcpy(roots + i * n, nodes, n); + } + + uint8_t forspk_addr[32]; + OPENSSL_memcpy(forspk_addr, addr, sizeof(forspk_addr)); + slhdsa_set_type(config, forspk_addr, SLHDSA_ADDR_TYPE_FORSPK); + slhdsa_copy_keypair_addr(config, forspk_addr, addr); + slhdsa_thash_tk(config, fors_pk, roots, pk_seed, forspk_addr); +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/slhdsa/fors.h b/third_party/boringssl/src/crypto/fipsmodule/slhdsa/fors.h new file mode 100644 index 00000000..f123a22f --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/slhdsa/fors.h @@ -0,0 +1,48 @@ +// Copyright 2024 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_CRYPTO_FIPSMODULE_SLHDSA_FORS_H +#define OPENSSL_HEADER_CRYPTO_FIPSMODULE_SLHDSA_FORS_H + +#include "./params.h" + + +BSSL_NAMESPACE_BEGIN + +// Implements Algorithm 14: fors_skGen function (page 29) +void slhdsa_fors_sk_gen(const slh_dsa_config *config, uint8_t *fors_sk, + uint32_t idx, const uint8_t *sk_seed, + const uint8_t *pk_seed, uint8_t addr[32]); + +// Implements Algorithm 15: fors_node function (page 30) +void slhdsa_fors_treehash(const slh_dsa_config *config, uint8_t *root_node, + const uint8_t *sk_seed, + uint32_t i /*target node index*/, + uint32_t z /*target node height*/, + const uint8_t *pk_seed, + uint8_t addr[32]); + +// Implements Algorithm 16: fors_sign function (page 31) +void slhdsa_fors_sign(const slh_dsa_config *config, uint8_t *fors_sig, + const uint8_t *message, const uint8_t *sk_seed, + const uint8_t *pk_seed, uint8_t addr[32]); + +// Implements Algorithm 17: fors_pkFromSig function (page 32) +void slhdsa_fors_pk_from_sig(const slh_dsa_config *config, uint8_t *fors_pk, + const uint8_t *fors_sig, const uint8_t *message, + const uint8_t *pk_seed, uint8_t addr[32]); + +BSSL_NAMESPACE_END + +#endif // OPENSSL_HEADER_CRYPTO_FIPSMODULE_SLHDSA_FORS_H diff --git a/third_party/boringssl/src/crypto/fipsmodule/slhdsa/merkle.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/slhdsa/merkle.cc.inc new file mode 100644 index 00000000..84602c88 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/slhdsa/merkle.cc.inc @@ -0,0 +1,164 @@ +// Copyright 2024 The BoringSSL Authors +// +// Licensed under 
the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include "../../internal.h" +#include "./address.h" +#include "./merkle.h" +#include "./params.h" +#include "./thash.h" +#include "./wots.h" + + +using namespace bssl; + +// Implements Algorithm 9: xmss_node function (page 23) +void bssl::slhdsa_treehash(const slh_dsa_config *config, uint8_t *out_pk, + const uint8_t *sk_seed, uint32_t i, uint32_t z, + const uint8_t *pk_seed, uint8_t addr[32]) { + BSSL_CHECK(z <= config->tree_height); + BSSL_CHECK(i < (uint32_t)(1u << (config->tree_height - z))); + + if (z == 0) { + slhdsa_set_type(config, addr, SLHDSA_ADDR_TYPE_WOTS); + slhdsa_set_keypair_addr(config, addr, i); + slhdsa_wots_pk_gen(config, out_pk, sk_seed, pk_seed, addr); + } else { + // Stores left node and right node. 
+ uint8_t nodes[2 * SLHDSA_MAX_N]; + slhdsa_treehash(config, nodes, sk_seed, 2 * i, z - 1, pk_seed, addr); + slhdsa_treehash(config, nodes + config->n, sk_seed, 2 * i + 1, z - 1, + pk_seed, addr); + slhdsa_set_type(config, addr, SLHDSA_ADDR_TYPE_HASHTREE); + slhdsa_set_tree_height(config, addr, z); + slhdsa_set_tree_index(config, addr, i); + slhdsa_thash_h(config, out_pk, nodes, pk_seed, addr); + } +} + +// Implements Algorithm 10: xmss_sign function (page 24) +void bssl::slhdsa_xmss_sign(const slh_dsa_config *config, uint8_t *sig, + const uint8_t *msg, unsigned int idx, + const uint8_t *sk_seed, const uint8_t *pk_seed, + uint8_t addr[32]) { + // Build authentication path + const size_t wots_bytes = slhdsa_wots_bytes(config); + const size_t n = config->n; + for (size_t j = 0; j < config->tree_height; ++j) { + unsigned int k = (idx >> j) ^ 1; + slhdsa_treehash(config, sig + wots_bytes + j * n, sk_seed, k, j, pk_seed, + addr); + } + + // Compute WOTS+ signature + slhdsa_set_type(config, addr, SLHDSA_ADDR_TYPE_WOTS); + slhdsa_set_keypair_addr(config, addr, idx); + slhdsa_wots_sign(config, sig, msg, sk_seed, pk_seed, addr); +} + +// Implements Algorithm 11: xmss_pkFromSig function (page 25) +void bssl::slhdsa_xmss_pk_from_sig(const slh_dsa_config *config, uint8_t *root, + const uint8_t *xmss_sig, unsigned int idx, + const uint8_t *msg, const uint8_t *pk_seed, + uint8_t addr[32]) { + // Stores node[0] and node[1] from Algorithm 11 + slhdsa_set_type(config, addr, SLHDSA_ADDR_TYPE_WOTS); + slhdsa_set_keypair_addr(config, addr, idx); + uint8_t node[2 * SLHDSA_MAX_N]; + const size_t n = config->n; + slhdsa_wots_pk_from_sig(config, node, xmss_sig, msg, pk_seed, addr); + + slhdsa_set_type(config, addr, SLHDSA_ADDR_TYPE_HASHTREE); + slhdsa_set_tree_index(config, addr, idx); + + uint8_t tmp[2 * SLHDSA_MAX_N]; + const uint8_t *const auth = xmss_sig + slhdsa_wots_bytes(config); + for (uint32_t k = 0; k < config->tree_height; ++k) { + slhdsa_set_tree_height(config, addr, k + 1); + 
if (((idx >> k) & 1) == 0) { + slhdsa_set_tree_index(config, addr, + slhdsa_get_tree_index(config, addr) >> 1); + OPENSSL_memcpy(tmp, node, n); + OPENSSL_memcpy(tmp + n, auth + k * n, n); + slhdsa_thash_h(config, node + n, tmp, pk_seed, addr); + } else { + slhdsa_set_tree_index(config, addr, + (slhdsa_get_tree_index(config, addr) - 1) >> 1); + OPENSSL_memcpy(tmp, auth + k * n, n); + OPENSSL_memcpy(tmp + n, node, n); + slhdsa_thash_h(config, node + n, tmp, pk_seed, addr); + } + OPENSSL_memcpy(node, node + n, n); + } + OPENSSL_memcpy(root, node, n); +} + +// Implements Algorithm 12: ht_sign function (page 27) +void bssl::slhdsa_ht_sign(const slh_dsa_config *config, uint8_t *sig, + const uint8_t *message, uint64_t idx_tree, + uint32_t idx_leaf, const uint8_t *sk_seed, + const uint8_t *pk_seed) { + uint8_t addr[32] = {0}; + slhdsa_set_tree_addr(config, addr, idx_tree); + + // Layer 0 + slhdsa_xmss_sign(config, sig, message, idx_leaf, sk_seed, pk_seed, addr); + uint8_t root[SLHDSA_MAX_N]; + slhdsa_xmss_pk_from_sig(config, root, sig, idx_leaf, message, pk_seed, addr); + sig += slhdsa_xmss_bytes(config); + + // All other layers + BSSL_CHECK(config->tree_height <= SLHDSA_MAX_TREE_HEIGHT); + const uint32_t leaf_mask = (1u << config->tree_height) - 1; + for (uint32_t j = 1; j < config->d; ++j) { + idx_leaf = idx_tree & leaf_mask; + idx_tree = idx_tree >> config->tree_height; + slhdsa_set_layer_addr(config, addr, j); + slhdsa_set_tree_addr(config, addr, idx_tree); + slhdsa_xmss_sign(config, sig, root, idx_leaf, sk_seed, pk_seed, addr); + if (j < (config->d - 1)) { + slhdsa_xmss_pk_from_sig(config, root, sig, idx_leaf, root, pk_seed, addr); + } + + sig += slhdsa_xmss_bytes(config); + } +} + +// Implements Algorithm 13: ht_verify function (page 28) +int bssl::slhdsa_ht_verify(const slh_dsa_config *config, const uint8_t *sig, + const uint8_t *message, uint64_t idx_tree, + uint32_t idx_leaf, const uint8_t *pk_root, + const uint8_t *pk_seed) { + uint8_t addr[32] = {0}; + 
slhdsa_set_tree_addr(config, addr, idx_tree); + + uint8_t node[SLHDSA_MAX_N]; + slhdsa_xmss_pk_from_sig(config, node, sig, idx_leaf, message, pk_seed, addr); + + BSSL_CHECK(config->tree_height <= SLHDSA_MAX_TREE_HEIGHT); + const uint32_t leaf_mask = (1u << config->tree_height) - 1; + for (uint32_t j = 1; j < config->d; ++j) { + idx_leaf = idx_tree & leaf_mask; + idx_tree = idx_tree >> config->tree_height; + slhdsa_set_layer_addr(config, addr, j); + slhdsa_set_tree_addr(config, addr, idx_tree); + + slhdsa_xmss_pk_from_sig(config, node, sig + j * slhdsa_xmss_bytes(config), + idx_leaf, node, pk_seed, addr); + } + return memcmp(node, pk_root, config->n) == 0; +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/slhdsa/merkle.h b/third_party/boringssl/src/crypto/fipsmodule/slhdsa/merkle.h new file mode 100644 index 00000000..c13b6ff3 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/slhdsa/merkle.h @@ -0,0 +1,58 @@ +// Copyright 2024 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef OPENSSL_HEADER_CRYPTO_FIPSMODULE_SLHDSA_MERKLE_H +#define OPENSSL_HEADER_CRYPTO_FIPSMODULE_SLHDSA_MERKLE_H + +#include + +#include + +#include "./params.h" + + +BSSL_NAMESPACE_BEGIN + +// Implements Algorithm 9: xmss_node function (page 23) +void slhdsa_treehash(const slh_dsa_config *config, uint8_t *out_pk, + const uint8_t *sk_seed, uint32_t i /*target node index*/, + uint32_t z /*target node height*/, const uint8_t *pk_seed, + uint8_t addr[32]); + +// Implements Algorithm 10: xmss_sign function (page 24) +void slhdsa_xmss_sign(const slh_dsa_config *config, uint8_t *sig, + const uint8_t *msg, unsigned int idx, + const uint8_t *sk_seed, const uint8_t *pk_seed, + uint8_t addr[32]); + +// Implements Algorithm 11: xmss_pkFromSig function (page 25) +void slhdsa_xmss_pk_from_sig(const slh_dsa_config *config, uint8_t *root, + const uint8_t *xmss_sig, unsigned int idx, + const uint8_t *msg, const uint8_t *pk_seed, + uint8_t addr[32]); + +// Implements Algorithm 12: ht_sign function (page 27) +void slhdsa_ht_sign(const slh_dsa_config *config, uint8_t *sig, + const uint8_t *message, uint64_t idx_tree, uint32_t idx_leaf, + const uint8_t *sk_seed, const uint8_t *pk_seed); + +// Implements Algorithm 13: ht_verify function (page 28) +int slhdsa_ht_verify(const slh_dsa_config *config, const uint8_t *sig, + const uint8_t *message, uint64_t idx_tree, + uint32_t idx_leaf, const uint8_t *pk_root, + const uint8_t *pk_seed); + +BSSL_NAMESPACE_END + +#endif // OPENSSL_HEADER_CRYPTO_FIPSMODULE_SLHDSA_MERKLE_H diff --git a/third_party/boringssl/src/crypto/fipsmodule/slhdsa/params.h b/third_party/boringssl/src/crypto/fipsmodule/slhdsa/params.h new file mode 100644 index 00000000..3d4c1b28 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/slhdsa/params.h @@ -0,0 +1,178 @@ +// Copyright 2024 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_CRYPTO_FIPSMODULE_SLHDSA_PARAMS_H +#define OPENSSL_HEADER_CRYPTO_FIPSMODULE_SLHDSA_PARAMS_H + +#include + +#include + +#include "../bcm_interface.h" + + +BSSL_NAMESPACE_BEGIN + +enum slh_dsa_hash_type { + SLH_DSA_HASH_SHA2_256, + SLH_DSA_HASH_SHAKE_256, +}; + +// Upper bounds for stack allocations across all SLH-DSA parameter sets in +// FIPS 205. These keep the code simple and avoid dynamic allocation while +// still covering larger future parameter sets such as SLH-DSA-SHAKE-256f. +#define SLHDSA_MAX_N 32 +#define SLHDSA_MAX_WOTS_LEN 67 +#define SLHDSA_MAX_WOTS_BYTES (SLHDSA_MAX_N * SLHDSA_MAX_WOTS_LEN) +#define SLHDSA_MAX_FORS_HEIGHT 17 +#define SLHDSA_MAX_FORS_TREES 35 +#define SLHDSA_MAX_FORS_BYTES \ + ((SLHDSA_MAX_FORS_HEIGHT + 1) * SLHDSA_MAX_FORS_TREES * SLHDSA_MAX_N) +#define SLHDSA_MAX_FORS_MSG_BYTES \ + ((SLHDSA_MAX_FORS_HEIGHT * SLHDSA_MAX_FORS_TREES + 7) / 8) +#define SLHDSA_MAX_TREE_HEIGHT 18 +#define SLHDSA_MAX_D 18 +#define SLHDSA_MAX_DIGEST_SIZE 64 +#define SLHDSA_MAX_HASH_BLOCK_BYTES 168 + +// Values bound by these limits are assumed to be valid shifts within a +// uint32_t. 
+static_assert(SLHDSA_MAX_TREE_HEIGHT < 32); +static_assert(SLHDSA_MAX_FORS_HEIGHT < 32); + +#define SLHDSA_ADDR_BYTES 32 +#define SLHDSA_ADDR_COMPRESSED_BYTES 22 +#define SLHDSA_ADDR_COMP_OFFSET_LAYER 0 +#define SLHDSA_ADDR_COMP_OFFSET_TREE 1 +#define SLHDSA_ADDR_COMP_OFFSET_TYPE 9 +#define SLHDSA_ADDR_COMP_OFFSET_KEYPAIR 10 +#define SLHDSA_ADDR_COMP_OFFSET_CHAIN 14 +#define SLHDSA_ADDR_COMP_OFFSET_TREE_HEIGHT 14 +#define SLHDSA_ADDR_COMP_OFFSET_HASH 18 +#define SLHDSA_ADDR_COMP_OFFSET_TREE_INDEX 18 +#define SLHDSA_ADDR_COMP_ZERO_START 10 +#define SLHDSA_ADDR_COMP_ZERO_LEN 12 +#define SLHDSA_ADDR_FULL_OFFSET_LAYER 0 +#define SLHDSA_ADDR_FULL_OFFSET_TREE 4 +#define SLHDSA_ADDR_FULL_OFFSET_TYPE 16 +#define SLHDSA_ADDR_FULL_OFFSET_KEYPAIR 20 +#define SLHDSA_ADDR_FULL_OFFSET_CHAIN 24 +#define SLHDSA_ADDR_FULL_OFFSET_TREE_HEIGHT 24 +#define SLHDSA_ADDR_FULL_OFFSET_HASH 28 +#define SLHDSA_ADDR_FULL_OFFSET_TREE_INDEX 28 +#define SLHDSA_ADDR_FULL_ZERO_START 20 +#define SLHDSA_ADDR_FULL_ZERO_LEN 12 + +typedef struct slh_dsa_config { + uint32_t n; + uint32_t full_height; + uint32_t d; + uint32_t tree_height; + uint32_t fors_height; + uint32_t fors_trees; + uint32_t wots_w; + uint32_t wots_log_w; + uint32_t wots_len1; + uint32_t wots_len2; + uint32_t digest_size; + uint32_t hash_block_bytes; + uint32_t hash_output_bytes; + uint32_t public_key_bytes; + uint32_t private_key_bytes; + uint32_t signature_bytes; + enum slh_dsa_hash_type hash_type; + bool compressed_addresses; +} slh_dsa_config; + +inline uint32_t slhdsa_wots_len(const slh_dsa_config *config) { + return config->wots_len1 + config->wots_len2; +} + +inline uint32_t slhdsa_wots_bytes(const slh_dsa_config *config) { + return config->n * slhdsa_wots_len(config); +} + +inline uint32_t slhdsa_xmss_bytes(const slh_dsa_config *config) { + return slhdsa_wots_bytes(config) + config->n * config->tree_height; +} + +inline uint32_t slhdsa_fors_msg_bytes(const slh_dsa_config *config) { + return (config->fors_height * 
config->fors_trees + 7) / 8; +} + +inline uint32_t slhdsa_fors_bytes(const slh_dsa_config *config) { + return (config->fors_height + 1) * config->fors_trees * config->n; +} + +inline uint32_t slhdsa_tree_bits(const slh_dsa_config *config) { + return config->tree_height * (config->d - 1); +} + +inline uint32_t slhdsa_tree_bytes(const slh_dsa_config *config) { + return (slhdsa_tree_bits(config) + 7) / 8; +} + +inline uint32_t slhdsa_leaf_bits(const slh_dsa_config *config) { + return config->tree_height; +} + +inline uint32_t slhdsa_leaf_bytes(const slh_dsa_config *config) { + return (slhdsa_leaf_bits(config) + 7) / 8; +} + +static const slh_dsa_config kSLHDSAConfigSHA2_128s = { + /*n=*/BCM_SLHDSA_SHA2_128S_N, + /*full_height=*/63, + /*d=*/7, + /*tree_height=*/9, + /*fors_height=*/12, + /*fors_trees=*/14, + /*wots_w=*/16, + /*wots_log_w=*/4, + /*wots_len1=*/32, + /*wots_len2=*/3, + /*digest_size=*/30, + /*hash_block_bytes=*/64, + /*hash_output_bytes=*/32, + /*public_key_bytes=*/BCM_SLHDSA_SHA2_128S_PUBLIC_KEY_BYTES, + /*private_key_bytes=*/BCM_SLHDSA_SHA2_128S_PRIVATE_KEY_BYTES, + /*signature_bytes=*/BCM_SLHDSA_SHA2_128S_SIGNATURE_BYTES, + /*hash_type=*/SLH_DSA_HASH_SHA2_256, + /*compressed_addresses=*/true, +}; + +static const slh_dsa_config kSLHDSAConfigSHAKE_256f = { + /*n=*/BCM_SLHDSA_SHAKE_256F_N, + /*full_height=*/68, + /*d=*/17, + /*tree_height=*/4, + /*fors_height=*/9, + /*fors_trees=*/35, + /*wots_w=*/16, + /*wots_log_w=*/4, + /*wots_len1=*/64, + /*wots_len2=*/3, + /*digest_size=*/49, + /*hash_block_bytes=*/136, + /*hash_output_bytes=*/32, + /*public_key_bytes=*/BCM_SLHDSA_SHAKE_256F_PUBLIC_KEY_BYTES, + /*private_key_bytes=*/BCM_SLHDSA_SHAKE_256F_PRIVATE_KEY_BYTES, + /*signature_bytes=*/BCM_SLHDSA_SHAKE_256F_SIGNATURE_BYTES, + /*hash_type=*/SLH_DSA_HASH_SHAKE_256, + /*compressed_addresses=*/false, +}; + +BSSL_NAMESPACE_END + +#endif // OPENSSL_HEADER_CRYPTO_FIPSMODULE_SLHDSA_PARAMS_H diff --git 
a/third_party/boringssl/src/crypto/fipsmodule/slhdsa/slhdsa.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/slhdsa/slhdsa.cc.inc new file mode 100644 index 00000000..7d56ebe6 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/slhdsa/slhdsa.cc.inc @@ -0,0 +1,690 @@ +// Copyright 2014 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include +#include +#include + +#include "../../internal.h" +#include "../bcm_interface.h" +#include "address.h" +#include "fors.h" +#include "merkle.h" +#include "params.h" +#include "thash.h" + + +using namespace bssl; + +#if defined(BORINGSSL_FIPS) + +DEFINE_STATIC_ONCE(g_slhdsa_keygen_self_test_once) +DEFINE_STATIC_ONCE(g_slhdsa_sign_self_test_once) +DEFINE_STATIC_ONCE(g_slhdsa_verify_self_test_once) + +#endif + +namespace { + +namespace fips { +void ensure_keygen_self_test(); +void ensure_sign_self_test(); +void ensure_verify_self_test(); +} // namespace fips + +// The OBJECT IDENTIFIER header is also included in these values, per the spec. 
+const uint8_t kSHA256OID[] = {0x06, 0x09, 0x60, 0x86, 0x48, 0x01, + 0x65, 0x03, 0x04, 0x02, 0x01}; +const uint8_t kSHA384OID[] = {0x06, 0x09, 0x60, 0x86, 0x48, 0x01, + 0x65, 0x03, 0x04, 0x02, 0x02}; +#define MAX_OID_LENGTH 11 +#define MAX_CONTEXT_LENGTH 255 + +bcm_infallible generate_key_from_seed_no_self_test(const slh_dsa_config *config, + uint8_t *out_public_key, + uint8_t *out_secret_key, + const uint8_t *seed) { + // Initialize SK.seed || SK.prf || PK.seed from seed. + OPENSSL_memcpy(out_secret_key, seed, 3 * config->n); + + // Initialize PK.seed from seed. + OPENSSL_memcpy(out_public_key, seed + 2 * config->n, config->n); + + uint8_t addr[32] = {0}; + slhdsa_set_layer_addr(config, addr, config->d - 1); + + // Set PK.root + slhdsa_treehash(config, out_public_key + config->n, out_secret_key, 0, + config->tree_height, out_public_key, addr); + OPENSSL_memcpy(out_secret_key + 3 * config->n, out_public_key + config->n, + config->n); + + // FIPS 140-3 IG 10.3.A comment 1 says of the pair-wise consistency test for + // SLH-DSA: + // + // "For key pairs generated for use with approved algorithms in SP 800-208 and + // FIPS 205, the PCT (described by the tester in TE10.35.02) may be limited to + // confirming the same key identifier (I in the case of LMS, SEED in the case + // of XMSS and PK.SEED for SLH-DSA) is shared by the resulting public and + // private key following generation." + // + // Since this is cheap, we always do this. 
+ + if (boringssl_fips_break_test("SLHDSA_PWCT")) { + out_public_key[0] ^= 1; + } + if (OPENSSL_memcmp(out_public_key, out_secret_key + 2 * config->n, + config->n) != 0) { + abort(); + } + + return bcm_infallible::not_approved; +} + +uint64_t load_tree_index(const slh_dsa_config *config, const uint8_t *in) { + const size_t tree_bits = slhdsa_tree_bits(config); + const size_t tree_bytes = slhdsa_tree_bytes(config); + BSSL_CHECK(tree_bits <= 64); + BSSL_CHECK(tree_bytes <= 8); + + uint8_t buf[8] = {0}; + OPENSSL_memcpy(buf + (sizeof(buf) - tree_bytes), in, tree_bytes); + uint64_t index = CRYPTO_load_u64_be(buf); + if (tree_bits < 64) { + index &= (~(uint64_t)0) >> (64 - tree_bits); + } + return index; +} + +// Implements Algorithm 22: slh_sign function (Section 10.2.1, page 39) +bcm_infallible sign_internal_no_self_test( + const slh_dsa_config *config, uint8_t *out_signature, + const uint8_t *secret_key, + const uint8_t header[BCM_SLHDSA_M_PRIME_HEADER_LEN], const uint8_t *context, + size_t context_len, const uint8_t *msg, size_t msg_len, + const uint8_t *entropy) { + const size_t n = config->n; + const uint8_t *sk_seed = secret_key; + const uint8_t *sk_prf = secret_key + n; + const uint8_t *pk_seed = secret_key + 2 * n; + const uint8_t *pk_root = secret_key + 3 * n; + + // Derive randomizer R and copy it to signature + uint8_t R[SLHDSA_MAX_N]; + slhdsa_thash_prfmsg(config, R, sk_prf, entropy, header, context, context_len, + msg, msg_len); + OPENSSL_memcpy(out_signature, R, n); + + // Compute message digest + uint8_t digest[SLHDSA_MAX_DIGEST_SIZE]; + slhdsa_thash_hmsg(config, digest, R, pk_seed, pk_root, header, context, + context_len, msg, msg_len); + + uint8_t fors_digest[SLHDSA_MAX_FORS_MSG_BYTES]; + const size_t fors_msg_bytes = slhdsa_fors_msg_bytes(config); + OPENSSL_memcpy(fors_digest, digest, fors_msg_bytes); + + size_t digest_offset = fors_msg_bytes; + const uint64_t idx_tree = load_tree_index(config, digest + digest_offset); + digest_offset += 
slhdsa_tree_bytes(config); + uint32_t idx_leaf = 0; + const size_t leaf_bytes = slhdsa_leaf_bytes(config); + for (size_t i = 0; i < leaf_bytes; ++i) { + idx_leaf = (idx_leaf << 8) | digest[digest_offset + i]; + } + const size_t leaf_bits = slhdsa_leaf_bits(config); + if (leaf_bits < 32) { + idx_leaf &= (~(uint32_t)0) >> (32 - leaf_bits); + } + + uint8_t addr[32] = {0}; + slhdsa_set_tree_addr(config, addr, idx_tree); + slhdsa_set_type(config, addr, SLHDSA_ADDR_TYPE_FORSTREE); + slhdsa_set_keypair_addr(config, addr, idx_leaf); + + slhdsa_fors_sign(config, out_signature + n, fors_digest, sk_seed, pk_seed, + addr); + + uint8_t pk_fors[SLHDSA_MAX_N]; + slhdsa_fors_pk_from_sig(config, pk_fors, out_signature + n, fors_digest, + pk_seed, addr); + + slhdsa_ht_sign(config, out_signature + n + slhdsa_fors_bytes(config), pk_fors, + idx_tree, idx_leaf, sk_seed, pk_seed); + return bcm_infallible::approved; +} + +bcm_status verify_internal(const slh_dsa_config *config, + const uint8_t *signature, size_t signature_len, + const uint8_t *public_key, + const uint8_t header[BCM_SLHDSA_M_PRIME_HEADER_LEN], + const uint8_t *context, size_t context_len, + const uint8_t *msg, size_t msg_len) { + const size_t n = config->n; + if (signature_len != config->signature_bytes) { + return bcm_status::failure; + } + const uint8_t *pk_seed = public_key; + const uint8_t *pk_root = public_key + n; + + const uint8_t *r = signature; + const uint8_t *sig_fors = signature + n; + const uint8_t *sig_ht = sig_fors + slhdsa_fors_bytes(config); + + uint8_t digest[SLHDSA_MAX_DIGEST_SIZE]; + slhdsa_thash_hmsg(config, digest, r, pk_seed, pk_root, header, context, + context_len, msg, msg_len); + + uint8_t fors_digest[SLHDSA_MAX_FORS_MSG_BYTES]; + const size_t fors_msg_bytes = slhdsa_fors_msg_bytes(config); + OPENSSL_memcpy(fors_digest, digest, fors_msg_bytes); + + size_t digest_offset = fors_msg_bytes; + const uint64_t idx_tree = load_tree_index(config, digest + digest_offset); + digest_offset += 
slhdsa_tree_bytes(config); + uint32_t idx_leaf = 0; + const size_t leaf_bytes = slhdsa_leaf_bytes(config); + for (size_t i = 0; i < leaf_bytes; ++i) { + idx_leaf = (idx_leaf << 8) | digest[digest_offset + i]; + } + const size_t leaf_bits = slhdsa_leaf_bits(config); + if (leaf_bits < 32) { + idx_leaf &= (~(uint32_t)0) >> (32 - leaf_bits); + } + + uint8_t addr[32] = {0}; + slhdsa_set_tree_addr(config, addr, idx_tree); + slhdsa_set_type(config, addr, SLHDSA_ADDR_TYPE_FORSTREE); + slhdsa_set_keypair_addr(config, addr, idx_leaf); + + uint8_t pk_fors[SLHDSA_MAX_N]; + slhdsa_fors_pk_from_sig(config, pk_fors, sig_fors, fors_digest, pk_seed, + addr); + + if (!slhdsa_ht_verify(config, sig_ht, pk_fors, idx_tree, idx_leaf, pk_root, + pk_seed)) { + return bcm_status::failure; + } + + return bcm_status::approved; +} + +namespace fips { + +#include "fips_known_values.inc" + +static int keygen_self_test() { + uint8_t seed[3 * BCM_SLHDSA_SHA2_128S_N] = {0}; + uint8_t pub[BCM_SLHDSA_SHA2_128S_PUBLIC_KEY_BYTES]; + uint8_t priv[BCM_SLHDSA_SHA2_128S_PRIVATE_KEY_BYTES]; + generate_key_from_seed_no_self_test(&kSLHDSAConfigSHA2_128s, pub, priv, seed); + + if (!BORINGSSL_check_test(kExpectedPublicKey, pub, "SLH-DSA public key") || + !BORINGSSL_check_test(kExpectedPrivateKey, priv, "SLH-DSA private key")) { + return 0; + } + + return 1; +} + +static int sign_self_test() { + uint8_t header[BCM_SLHDSA_M_PRIME_HEADER_LEN] = {0}; + uint8_t entropy[BCM_SLHDSA_SHA2_128S_N] = {0}; + uint8_t sig[BCM_SLHDSA_SHA2_128S_SIGNATURE_BYTES]; + sign_internal_no_self_test(&kSLHDSAConfigSHA2_128s, sig, kExpectedPrivateKey, + header, nullptr, 0, nullptr, 0, entropy); + uint8_t digest[32]; + SHA256(sig, sizeof(sig), digest); + + if (!BORINGSSL_check_test(kExpectedSignatureSHA256, digest, + "SLH-DSA signature")) { + return 0; + } + + return 1; +} + +static int verify_self_test() { + uint8_t header[BCM_SLHDSA_M_PRIME_HEADER_LEN] = {0}; + return verify_internal(&kSLHDSAConfigSHA2_128s, kExpectedSignature, + 
sizeof(kExpectedSignature), kExpectedPublicKey, header, + nullptr, 0, nullptr, 0) == bcm_status::approved; +} + +#if defined(BORINGSSL_FIPS) + +void ensure_keygen_self_test() { + CRYPTO_once(g_slhdsa_keygen_self_test_once_bss_get(), []() { + if (!keygen_self_test()) { + BORINGSSL_FIPS_abort(); + } + }); +} + +void ensure_sign_self_test() { + CRYPTO_once(g_slhdsa_sign_self_test_once_bss_get(), []() { + if (!sign_self_test()) { + BORINGSSL_FIPS_abort(); + } + }); +} + +void ensure_verify_self_test() { + CRYPTO_once(g_slhdsa_verify_self_test_once_bss_get(), []() { + if (!verify_self_test()) { + BORINGSSL_FIPS_abort(); + } + }); +} + +#else + +void ensure_keygen_self_test() {} +void ensure_sign_self_test() {} +void ensure_verify_self_test() {} + +#endif + +} // namespace fips + +} // namespace + +bcm_infallible bssl::BCM_slhdsa_sha2_128s_generate_key_from_seed( + uint8_t out_public_key[BCM_SLHDSA_SHA2_128S_PUBLIC_KEY_BYTES], + uint8_t out_secret_key[BCM_SLHDSA_SHA2_128S_PRIVATE_KEY_BYTES], + const uint8_t seed[3 * BCM_SLHDSA_SHA2_128S_N]) { + fips::ensure_keygen_self_test(); + return generate_key_from_seed_no_self_test( + &kSLHDSAConfigSHA2_128s, out_public_key, out_secret_key, seed); +} + +bcm_infallible bssl::BCM_slhdsa_shake_256f_generate_key_from_seed( + uint8_t out_public_key[BCM_SLHDSA_SHAKE_256F_PUBLIC_KEY_BYTES], + uint8_t out_secret_key[BCM_SLHDSA_SHAKE_256F_PRIVATE_KEY_BYTES], + const uint8_t seed[3 * BCM_SLHDSA_SHAKE_256F_N]) { + fips::ensure_keygen_self_test(); + return generate_key_from_seed_no_self_test( + &kSLHDSAConfigSHAKE_256f, out_public_key, out_secret_key, seed); +} + +bcm_status bssl::BCM_slhdsa_sha2_128s_generate_key_from_seed_fips( + uint8_t out_public_key[BCM_SLHDSA_SHA2_128S_PUBLIC_KEY_BYTES], + uint8_t out_secret_key[BCM_SLHDSA_SHA2_128S_PRIVATE_KEY_BYTES], + const uint8_t seed[3 * BCM_SLHDSA_SHA2_128S_N]) { + if (out_public_key == nullptr || out_secret_key == nullptr) { + return bcm_status::failure; + } + 
BCM_slhdsa_sha2_128s_generate_key_from_seed(out_public_key, out_secret_key, + seed); + return bcm_status::approved; +} + +bcm_status bssl::BCM_slhdsa_shake_256f_generate_key_from_seed_fips( + uint8_t out_public_key[BCM_SLHDSA_SHAKE_256F_PUBLIC_KEY_BYTES], + uint8_t out_secret_key[BCM_SLHDSA_SHAKE_256F_PRIVATE_KEY_BYTES], + const uint8_t seed[3 * BCM_SLHDSA_SHAKE_256F_N]) { + if (out_public_key == nullptr || out_secret_key == nullptr) { + return bcm_status::failure; + } + BCM_slhdsa_shake_256f_generate_key_from_seed(out_public_key, out_secret_key, + seed); + return bcm_status::approved; +} + +bcm_infallible bssl::BCM_slhdsa_sha2_128s_generate_key( + uint8_t out_public_key[BCM_SLHDSA_SHA2_128S_PUBLIC_KEY_BYTES], + uint8_t out_private_key[BCM_SLHDSA_SHA2_128S_PRIVATE_KEY_BYTES]) { + uint8_t seed[3 * BCM_SLHDSA_SHA2_128S_N]; + RAND_bytes(seed, 3 * BCM_SLHDSA_SHA2_128S_N); + return BCM_slhdsa_sha2_128s_generate_key_from_seed(out_public_key, + out_private_key, seed); +} + +bcm_infallible bssl::BCM_slhdsa_shake_256f_generate_key( + uint8_t out_public_key[BCM_SLHDSA_SHAKE_256F_PUBLIC_KEY_BYTES], + uint8_t out_private_key[BCM_SLHDSA_SHAKE_256F_PRIVATE_KEY_BYTES]) { + uint8_t seed[3 * BCM_SLHDSA_SHAKE_256F_N]; + RAND_bytes(seed, 3 * BCM_SLHDSA_SHAKE_256F_N); + return BCM_slhdsa_shake_256f_generate_key_from_seed(out_public_key, + out_private_key, seed); +} + +bcm_status bssl::BCM_slhdsa_sha2_128s_generate_key_fips( + uint8_t out_public_key[BCM_SLHDSA_SHA2_128S_PUBLIC_KEY_BYTES], + uint8_t out_private_key[BCM_SLHDSA_SHA2_128S_PRIVATE_KEY_BYTES]) { + if (out_public_key == nullptr || out_private_key == nullptr) { + return bcm_status::failure; + } + BCM_slhdsa_sha2_128s_generate_key(out_public_key, out_private_key); + return bcm_status::approved; +} + +bcm_status bssl::BCM_slhdsa_shake_256f_generate_key_fips( + uint8_t out_public_key[BCM_SLHDSA_SHAKE_256F_PUBLIC_KEY_BYTES], + uint8_t out_private_key[BCM_SLHDSA_SHAKE_256F_PRIVATE_KEY_BYTES]) { + if (out_public_key == nullptr || 
out_private_key == nullptr) { + return bcm_status::failure; + } + BCM_slhdsa_shake_256f_generate_key(out_public_key, out_private_key); + return bcm_status::approved; +} + +bcm_infallible bssl::BCM_slhdsa_sha2_128s_public_from_private( + uint8_t out_public_key[BCM_SLHDSA_SHA2_128S_PUBLIC_KEY_BYTES], + const uint8_t private_key[BCM_SLHDSA_SHA2_128S_PRIVATE_KEY_BYTES]) { + OPENSSL_memcpy(out_public_key, private_key + 2 * BCM_SLHDSA_SHA2_128S_N, + BCM_SLHDSA_SHA2_128S_N * 2); + return bcm_infallible::approved; +} + +bcm_infallible bssl::BCM_slhdsa_shake_256f_public_from_private( + uint8_t out_public_key[BCM_SLHDSA_SHAKE_256F_PUBLIC_KEY_BYTES], + const uint8_t private_key[BCM_SLHDSA_SHAKE_256F_PRIVATE_KEY_BYTES]) { + OPENSSL_memcpy(out_public_key, private_key + 2 * BCM_SLHDSA_SHAKE_256F_N, + BCM_SLHDSA_SHAKE_256F_N * 2); + return bcm_infallible::approved; +} + +bcm_status bssl::BCM_slhdsa_sha2_128s_sign( + uint8_t out_signature[BCM_SLHDSA_SHA2_128S_SIGNATURE_BYTES], + const uint8_t private_key[BCM_SLHDSA_SHA2_128S_PRIVATE_KEY_BYTES], + const uint8_t *msg, size_t msg_len, const uint8_t *context, + size_t context_len) { + if (context_len > MAX_CONTEXT_LENGTH) { + return bcm_status::failure; + } + + // Construct header for M' as specified in Algorithm 22 + uint8_t M_prime_header[2]; + M_prime_header[0] = 0; // domain separator for pure signing + M_prime_header[1] = (uint8_t)context_len; + + uint8_t entropy[BCM_SLHDSA_SHA2_128S_N]; + RAND_bytes(entropy, sizeof(entropy)); + BCM_slhdsa_sha2_128s_sign_internal(out_signature, private_key, M_prime_header, + context, context_len, msg, msg_len, + entropy); + return bcm_status::approved; +} + +bcm_status bssl::BCM_slhdsa_shake_256f_sign( + uint8_t out_signature[BCM_SLHDSA_SHAKE_256F_SIGNATURE_BYTES], + const uint8_t private_key[BCM_SLHDSA_SHAKE_256F_PRIVATE_KEY_BYTES], + const uint8_t *msg, size_t msg_len, const uint8_t *context, + size_t context_len) { + if (context_len > MAX_CONTEXT_LENGTH) { + return bcm_status::failure; + } + + 
uint8_t M_prime_header[2]; + M_prime_header[0] = 0; + M_prime_header[1] = (uint8_t)context_len; + + uint8_t entropy[BCM_SLHDSA_SHAKE_256F_N]; + RAND_bytes(entropy, sizeof(entropy)); + BCM_slhdsa_shake_256f_sign_internal(out_signature, private_key, + M_prime_header, context, context_len, msg, + msg_len, entropy); + return bcm_status::approved; +} + +static int slhdsa_get_context_and_oid(uint8_t *out_context_and_oid, + size_t *out_context_and_oid_len, + size_t max_out_context_and_oid, + const uint8_t *context, + size_t context_len, int hash_nid, + size_t hashed_msg_len) { + const uint8_t *oid; + size_t oid_len; + size_t expected_hash_len; + switch (hash_nid) { + case NID_sha256: + oid = kSHA256OID; + oid_len = sizeof(kSHA256OID); + static_assert(sizeof(kSHA256OID) <= MAX_OID_LENGTH); + expected_hash_len = 32; + break; + + // The SLH-DSA spec only lists SHA-256 and SHA-512. This function also + // supports SHA-384, which is non-standard. + case NID_sha384: + oid = kSHA384OID; + oid_len = sizeof(kSHA384OID); + static_assert(sizeof(kSHA384OID) <= MAX_OID_LENGTH); + expected_hash_len = 48; + break; + + // If adding a hash function with a larger `oid_len`, update the size of + // `context_and_oid` in the callers. 
+ default: + return 0; + } + + if (hashed_msg_len != expected_hash_len) { + return 0; + } + + *out_context_and_oid_len = context_len + oid_len; + if (*out_context_and_oid_len > max_out_context_and_oid) { + return 0; + } + + OPENSSL_memcpy(out_context_and_oid, context, context_len); + OPENSSL_memcpy(out_context_and_oid + context_len, oid, oid_len); + + return 1; +} + +bcm_infallible bssl::BCM_slhdsa_sha2_128s_sign_internal( + uint8_t out_signature[BCM_SLHDSA_SHA2_128S_SIGNATURE_BYTES], + const uint8_t secret_key[BCM_SLHDSA_SHA2_128S_PRIVATE_KEY_BYTES], + const uint8_t header[BCM_SLHDSA_M_PRIME_HEADER_LEN], const uint8_t *context, + size_t context_len, const uint8_t *msg, size_t msg_len, + const uint8_t entropy[BCM_SLHDSA_SHA2_128S_N]) { + fips::ensure_sign_self_test(); + return sign_internal_no_self_test(&kSLHDSAConfigSHA2_128s, out_signature, + secret_key, header, context, context_len, + msg, msg_len, entropy); +} + +bcm_infallible bssl::BCM_slhdsa_shake_256f_sign_internal( + uint8_t out_signature[BCM_SLHDSA_SHAKE_256F_SIGNATURE_BYTES], + const uint8_t secret_key[BCM_SLHDSA_SHAKE_256F_PRIVATE_KEY_BYTES], + const uint8_t header[BCM_SLHDSA_M_PRIME_HEADER_LEN], const uint8_t *context, + size_t context_len, const uint8_t *msg, size_t msg_len, + const uint8_t entropy[BCM_SLHDSA_SHAKE_256F_N]) { + fips::ensure_sign_self_test(); + return sign_internal_no_self_test(&kSLHDSAConfigSHAKE_256f, out_signature, + secret_key, header, context, context_len, + msg, msg_len, entropy); +} + +bcm_status bssl::BCM_slhdsa_sha2_128s_prehash_sign( + uint8_t out_signature[BCM_SLHDSA_SHA2_128S_SIGNATURE_BYTES], + const uint8_t private_key[BCM_SLHDSA_SHA2_128S_PRIVATE_KEY_BYTES], + const uint8_t *hashed_msg, size_t hashed_msg_len, int hash_nid, + const uint8_t *context, size_t context_len) { + if (context_len > MAX_CONTEXT_LENGTH) { + return bcm_status::failure; + } + + uint8_t M_prime_header[2]; + M_prime_header[0] = 1; // domain separator for prehashed signing + M_prime_header[1] = 
(uint8_t)context_len; + + uint8_t context_and_oid[MAX_CONTEXT_LENGTH + MAX_OID_LENGTH]; + size_t context_and_oid_len; + if (!slhdsa_get_context_and_oid(context_and_oid, &context_and_oid_len, + sizeof(context_and_oid), context, context_len, + hash_nid, hashed_msg_len)) { + return bcm_status::failure; + } + + uint8_t entropy[BCM_SLHDSA_SHA2_128S_N]; + RAND_bytes(entropy, sizeof(entropy)); + BCM_slhdsa_sha2_128s_sign_internal(out_signature, private_key, M_prime_header, + context_and_oid, context_and_oid_len, + hashed_msg, hashed_msg_len, entropy); + return bcm_status::approved; +} + +bcm_status bssl::BCM_slhdsa_shake_256f_prehash_sign( + uint8_t out_signature[BCM_SLHDSA_SHAKE_256F_SIGNATURE_BYTES], + const uint8_t private_key[BCM_SLHDSA_SHAKE_256F_PRIVATE_KEY_BYTES], + const uint8_t *hashed_msg, size_t hashed_msg_len, int hash_nid, + const uint8_t *context, size_t context_len) { + if (context_len > MAX_CONTEXT_LENGTH) { + return bcm_status::failure; + } + + uint8_t M_prime_header[2]; + M_prime_header[0] = 1; + M_prime_header[1] = (uint8_t)context_len; + + uint8_t context_and_oid[MAX_CONTEXT_LENGTH + MAX_OID_LENGTH]; + size_t context_and_oid_len; + if (!slhdsa_get_context_and_oid(context_and_oid, &context_and_oid_len, + sizeof(context_and_oid), context, context_len, + hash_nid, hashed_msg_len)) { + return bcm_status::failure; + } + + uint8_t entropy[BCM_SLHDSA_SHAKE_256F_N]; + RAND_bytes(entropy, sizeof(entropy)); + BCM_slhdsa_shake_256f_sign_internal( + out_signature, private_key, M_prime_header, context_and_oid, + context_and_oid_len, hashed_msg, hashed_msg_len, entropy); + return bcm_status::approved; +} + +// Implements Algorithm 24: slh_verify function (Section 10.3, page 41) +bcm_status bssl::BCM_slhdsa_sha2_128s_verify( + const uint8_t *signature, size_t signature_len, + const uint8_t public_key[BCM_SLHDSA_SHA2_128S_PUBLIC_KEY_BYTES], + const uint8_t *msg, size_t msg_len, const uint8_t *context, + size_t context_len) { + if (context_len > MAX_CONTEXT_LENGTH) { + 
return bcm_status::failure; + } + + // Construct header for M' as specified in Algorithm 24 + uint8_t M_prime_header[2]; + M_prime_header[0] = 0; // domain separator for pure verification + M_prime_header[1] = (uint8_t)context_len; + + return BCM_slhdsa_sha2_128s_verify_internal( + signature, signature_len, public_key, M_prime_header, context, + context_len, msg, msg_len); +} + +bcm_status bssl::BCM_slhdsa_shake_256f_verify( + const uint8_t *signature, size_t signature_len, + const uint8_t public_key[BCM_SLHDSA_SHAKE_256F_PUBLIC_KEY_BYTES], + const uint8_t *msg, size_t msg_len, const uint8_t *context, + size_t context_len) { + if (context_len > MAX_CONTEXT_LENGTH) { + return bcm_status::failure; + } + + uint8_t M_prime_header[2]; + M_prime_header[0] = 0; + M_prime_header[1] = (uint8_t)context_len; + + return BCM_slhdsa_shake_256f_verify_internal( + signature, signature_len, public_key, M_prime_header, context, + context_len, msg, msg_len); +} + +bcm_status bssl::BCM_slhdsa_sha2_128s_prehash_verify( + const uint8_t *signature, size_t signature_len, + const uint8_t public_key[BCM_SLHDSA_SHA2_128S_PUBLIC_KEY_BYTES], + const uint8_t *hashed_msg, size_t hashed_msg_len, int hash_nid, + const uint8_t *context, size_t context_len) { + if (context_len > MAX_CONTEXT_LENGTH) { + return bcm_status::failure; + } + + uint8_t M_prime_header[2]; + M_prime_header[0] = 1; // domain separator for prehashed verification + M_prime_header[1] = (uint8_t)context_len; + + uint8_t context_and_oid[MAX_CONTEXT_LENGTH + MAX_OID_LENGTH]; + size_t context_and_oid_len; + if (!slhdsa_get_context_and_oid(context_and_oid, &context_and_oid_len, + sizeof(context_and_oid), context, context_len, + hash_nid, hashed_msg_len)) { + return bcm_status::failure; + } + + return BCM_slhdsa_sha2_128s_verify_internal( + signature, signature_len, public_key, M_prime_header, context_and_oid, + context_and_oid_len, hashed_msg, hashed_msg_len); +} + +bcm_status bssl::BCM_slhdsa_shake_256f_prehash_verify( + const 
uint8_t *signature, size_t signature_len, + const uint8_t public_key[BCM_SLHDSA_SHAKE_256F_PUBLIC_KEY_BYTES], + const uint8_t *hashed_msg, size_t hashed_msg_len, int hash_nid, + const uint8_t *context, size_t context_len) { + if (context_len > MAX_CONTEXT_LENGTH) { + return bcm_status::failure; + } + + uint8_t M_prime_header[2]; + M_prime_header[0] = 1; + M_prime_header[1] = (uint8_t)context_len; + + uint8_t context_and_oid[MAX_CONTEXT_LENGTH + MAX_OID_LENGTH]; + size_t context_and_oid_len; + if (!slhdsa_get_context_and_oid(context_and_oid, &context_and_oid_len, + sizeof(context_and_oid), context, context_len, + hash_nid, hashed_msg_len)) { + return bcm_status::failure; + } + + return BCM_slhdsa_shake_256f_verify_internal( + signature, signature_len, public_key, M_prime_header, context_and_oid, + context_and_oid_len, hashed_msg, hashed_msg_len); +} + +bcm_status bssl::BCM_slhdsa_sha2_128s_verify_internal( + const uint8_t *signature, size_t signature_len, + const uint8_t public_key[BCM_SLHDSA_SHA2_128S_PUBLIC_KEY_BYTES], + const uint8_t header[BCM_SLHDSA_M_PRIME_HEADER_LEN], const uint8_t *context, + size_t context_len, const uint8_t *msg, size_t msg_len) { + fips::ensure_verify_self_test(); + return verify_internal(&kSLHDSAConfigSHA2_128s, signature, signature_len, + public_key, header, context, context_len, msg, + msg_len); +} + +bcm_status bssl::BCM_slhdsa_shake_256f_verify_internal( + const uint8_t *signature, size_t signature_len, + const uint8_t public_key[BCM_SLHDSA_SHAKE_256F_PUBLIC_KEY_BYTES], + const uint8_t header[BCM_SLHDSA_M_PRIME_HEADER_LEN], const uint8_t *context, + size_t context_len, const uint8_t *msg, size_t msg_len) { + fips::ensure_verify_self_test(); + return verify_internal(&kSLHDSAConfigSHAKE_256f, signature, signature_len, + public_key, header, context, context_len, msg, + msg_len); +} + +int bssl::boringssl_self_test_slhdsa() { + return fips::keygen_self_test() && fips::sign_self_test() && + fips::verify_self_test(); +} diff --git 
a/third_party/boringssl/src/crypto/fipsmodule/slhdsa/thash.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/slhdsa/thash.cc.inc new file mode 100644 index 00000000..2cb608fb --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/slhdsa/thash.cc.inc @@ -0,0 +1,326 @@ +// Copyright 2024 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + +#include + +#include "../../internal.h" +#include "../keccak/internal.h" +#include "./thash.h" + + +using namespace bssl; + +// Internal thash function used by F, H, and T_l for SHA-256 based parameter +// sets (Section 11.2, pages 44-46) +static void slhdsa_thash_sha256(const slh_dsa_config *config, uint8_t *output, + const uint8_t *input, size_t input_blocks, + const uint8_t *pk_seed, uint8_t addr[32]) { + BSSL_CHECK(config->hash_type == SLH_DSA_HASH_SHA2_256); + SHA256_CTX sha256; + SHA256_Init(&sha256); + + uint8_t zeros[SLHDSA_MAX_HASH_BLOCK_BYTES] = {0}; + BSSL_CHECK(config->hash_block_bytes <= sizeof(zeros)); + BSSL_CHECK(config->hash_block_bytes >= config->n); + SHA256_Update(&sha256, pk_seed, config->n); + SHA256_Update(&sha256, zeros, config->hash_block_bytes - config->n); + SHA256_Update(&sha256, addr, SLHDSA_ADDR_COMPRESSED_BYTES); + SHA256_Update(&sha256, input, input_blocks * config->n); + + uint8_t hash[SLHDSA_MAX_DIGEST_SIZE]; + SHA256_Final(hash, &sha256); + OPENSSL_memcpy(output, hash, config->n); +} + +// Internal thash function used by F, 
H, and T_l for SHAKE-256 based parameter +// sets (Section 11.1, pages 43-45) +static void slhdsa_thash_shake(const slh_dsa_config *config, uint8_t *output, + const uint8_t *input, size_t input_blocks, + const uint8_t *pk_seed, uint8_t addr[32]) { + BSSL_CHECK(config->hash_type == SLH_DSA_HASH_SHAKE_256); + BSSL_CHECK(!config->compressed_addresses); + + struct BORINGSSL_keccak_st ctx; + BORINGSSL_keccak_init(&ctx, boringssl_shake256); + BORINGSSL_keccak_absorb(&ctx, pk_seed, config->n); + BORINGSSL_keccak_absorb(&ctx, addr, SLHDSA_ADDR_BYTES); + BORINGSSL_keccak_absorb(&ctx, input, input_blocks * config->n); + BORINGSSL_keccak_squeeze(&ctx, output, config->n); +} + +static void slhdsa_thash_dispatch(const slh_dsa_config *config, uint8_t *output, + const uint8_t *input, size_t input_blocks, + const uint8_t *pk_seed, uint8_t addr[32]) { + switch (config->hash_type) { + case SLH_DSA_HASH_SHA2_256: + slhdsa_thash_sha256(config, output, input, input_blocks, pk_seed, addr); + return; + case SLH_DSA_HASH_SHAKE_256: + slhdsa_thash_shake(config, output, input, input_blocks, pk_seed, addr); + return; + } + BSSL_CHECK(false); +} + +// Implements PRF_msg function (Section 4.1, page 11 and Section 11.2, pages +// 44-46) +static void slhdsa_thash_prfmsg_sha256(const slh_dsa_config *config, + uint8_t *output, const uint8_t *sk_prf, + const uint8_t *entropy, + const uint8_t *header, + const uint8_t *ctx, size_t ctx_len, + const uint8_t *msg, size_t msg_len) { + BSSL_CHECK(config->hash_type == SLH_DSA_HASH_SHA2_256); + // Compute HMAC-SHA256(sk_prf, entropy || header || ctx || msg). We inline + // HMAC to avoid an allocation. 
+ uint8_t hmac_key[SHA256_CBLOCK]; + BSSL_CHECK(config->n <= sizeof(hmac_key)); + OPENSSL_memcpy(hmac_key, sk_prf, config->n); + for (size_t i = 0; i < config->n; i++) { + hmac_key[i] ^= 0x36; + } + OPENSSL_memset(hmac_key + config->n, 0x36, sizeof(hmac_key) - config->n); + + SHA256_CTX sha_ctx; + SHA256_Init(&sha_ctx); + SHA256_Update(&sha_ctx, hmac_key, sizeof(hmac_key)); + SHA256_Update(&sha_ctx, entropy, config->n); + if (header) { + SHA256_Update(&sha_ctx, header, BCM_SLHDSA_M_PRIME_HEADER_LEN); + } + if (ctx_len != 0) { + SHA256_Update(&sha_ctx, ctx, ctx_len); + } + if (msg_len != 0) { + SHA256_Update(&sha_ctx, msg, msg_len); + } + uint8_t hash[SHA256_DIGEST_LENGTH]; + SHA256_Final(hash, &sha_ctx); + + for (size_t i = 0; i < config->n; i++) { + hmac_key[i] ^= 0x36 ^ 0x5c; + } + OPENSSL_memset(hmac_key + config->n, 0x5c, sizeof(hmac_key) - config->n); + + SHA256_Init(&sha_ctx); + SHA256_Update(&sha_ctx, hmac_key, sizeof(hmac_key)); + SHA256_Update(&sha_ctx, hash, sizeof(hash)); + SHA256_Final(hash, &sha_ctx); + + // Truncate to n bytes + OPENSSL_memcpy(output, hash, config->n); +} + +static void slhdsa_thash_prfmsg_shake(const slh_dsa_config *config, + uint8_t *output, const uint8_t *sk_prf, + const uint8_t *entropy, + const uint8_t *header, + const uint8_t *ctx, size_t ctx_len, + const uint8_t *msg, size_t msg_len) { + BSSL_CHECK(config->hash_type == SLH_DSA_HASH_SHAKE_256); + + struct BORINGSSL_keccak_st keccak; + BORINGSSL_keccak_init(&keccak, boringssl_shake256); + BORINGSSL_keccak_absorb(&keccak, sk_prf, config->n); + BORINGSSL_keccak_absorb(&keccak, entropy, config->n); + if (header) { + BORINGSSL_keccak_absorb(&keccak, header, BCM_SLHDSA_M_PRIME_HEADER_LEN); + } + if (ctx_len != 0) { + BORINGSSL_keccak_absorb(&keccak, ctx, ctx_len); + } + if (msg_len != 0) { + BORINGSSL_keccak_absorb(&keccak, msg, msg_len); + } + BORINGSSL_keccak_squeeze(&keccak, output, config->n); +} + +void bssl::slhdsa_thash_prfmsg(const slh_dsa_config *config, uint8_t *output, + 
const uint8_t *sk_prf, const uint8_t *entropy, + const uint8_t *header, const uint8_t *ctx, + size_t ctx_len, const uint8_t *msg, + size_t msg_len) { + switch (config->hash_type) { + case SLH_DSA_HASH_SHA2_256: + slhdsa_thash_prfmsg_sha256(config, output, sk_prf, entropy, header, ctx, + ctx_len, msg, msg_len); + return; + case SLH_DSA_HASH_SHAKE_256: + slhdsa_thash_prfmsg_shake(config, output, sk_prf, entropy, header, ctx, + ctx_len, msg, msg_len); + return; + } + BSSL_CHECK(false); +} + +// Implements H_msg function (Section 4.1, page 11 and Section 11.2, pages +// 44-46) +static void slhdsa_thash_hmsg_sha256(const slh_dsa_config *config, + uint8_t *output, const uint8_t *r, + const uint8_t *pk_seed, + const uint8_t *pk_root, + const uint8_t *header, + const uint8_t *ctx, size_t ctx_len, + const uint8_t *msg, size_t msg_len) { + BSSL_CHECK(config->hash_type == SLH_DSA_HASH_SHA2_256); + // MGF1-SHA-256(R || PK.seed || SHA-256(R || PK.seed || PK.root || header || + // ctx || M), m) input_buffer stores R || PK_SEED || SHA256(..) 
|| 4-byte + // index + uint8_t input_buffer[2 * SLHDSA_MAX_N + SLHDSA_MAX_DIGEST_SIZE + 4] = {0}; + OPENSSL_memcpy(input_buffer, r, config->n); + OPENSSL_memcpy(input_buffer + config->n, pk_seed, config->n); + + // Inner hash + SHA256_CTX sha_ctx; + SHA256_Init(&sha_ctx); + SHA256_Update(&sha_ctx, r, config->n); + SHA256_Update(&sha_ctx, pk_seed, config->n); + SHA256_Update(&sha_ctx, pk_root, config->n); + if (header) { + SHA256_Update(&sha_ctx, header, BCM_SLHDSA_M_PRIME_HEADER_LEN); + } + if (ctx_len != 0) { + SHA256_Update(&sha_ctx, ctx, ctx_len); + } + if (msg_len != 0) { + SHA256_Update(&sha_ctx, msg, msg_len); + } + // Write directly into the input buffer + SHA256_Final(input_buffer + 2 * config->n, &sha_ctx); + + // MGF1-SHA-256 + uint8_t hash[32]; + BSSL_CHECK(config->digest_size <= sizeof(hash)); + BSSL_CHECK(config->hash_output_bytes == sizeof(hash)); + const size_t mgf_input_len = + 2 * config->n + config->hash_output_bytes + 4; + SHA256(input_buffer, mgf_input_len, hash); + OPENSSL_memcpy(output, hash, config->digest_size); +} + +static void slhdsa_thash_hmsg_shake(const slh_dsa_config *config, + uint8_t *output, const uint8_t *r, + const uint8_t *pk_seed, + const uint8_t *pk_root, + const uint8_t *header, + const uint8_t *ctx, size_t ctx_len, + const uint8_t *msg, size_t msg_len) { + BSSL_CHECK(config->hash_type == SLH_DSA_HASH_SHAKE_256); + + struct BORINGSSL_keccak_st keccak; + BORINGSSL_keccak_init(&keccak, boringssl_shake256); + BORINGSSL_keccak_absorb(&keccak, r, config->n); + BORINGSSL_keccak_absorb(&keccak, pk_seed, config->n); + BORINGSSL_keccak_absorb(&keccak, pk_root, config->n); + if (header) { + BORINGSSL_keccak_absorb(&keccak, header, BCM_SLHDSA_M_PRIME_HEADER_LEN); + } + if (ctx_len != 0) { + BORINGSSL_keccak_absorb(&keccak, ctx, ctx_len); + } + if (msg_len != 0) { + BORINGSSL_keccak_absorb(&keccak, msg, msg_len); + } + BORINGSSL_keccak_squeeze(&keccak, output, config->digest_size); +} + +void bssl::slhdsa_thash_hmsg(const slh_dsa_config 
*config, uint8_t *output, + const uint8_t *r, const uint8_t *pk_seed, + const uint8_t *pk_root, const uint8_t *header, + const uint8_t *ctx, size_t ctx_len, + const uint8_t *msg, size_t msg_len) { + switch (config->hash_type) { + case SLH_DSA_HASH_SHA2_256: + slhdsa_thash_hmsg_sha256(config, output, r, pk_seed, pk_root, header, ctx, + ctx_len, msg, msg_len); + return; + case SLH_DSA_HASH_SHAKE_256: + slhdsa_thash_hmsg_shake(config, output, r, pk_seed, pk_root, header, ctx, + ctx_len, msg, msg_len); + return; + } + BSSL_CHECK(false); +} + +// Implements PRF function (Section 4.1, page 11 and Section 11.2, pages 44-46) +static void slhdsa_thash_prf_sha256(const slh_dsa_config *config, + uint8_t *output, const uint8_t *pk_seed, + const uint8_t *sk_seed, uint8_t addr[32]) { + slhdsa_thash_sha256(config, output, sk_seed, 1, pk_seed, addr); +} + +static void slhdsa_thash_prf_shake(const slh_dsa_config *config, + uint8_t *output, const uint8_t *pk_seed, + const uint8_t *sk_seed, uint8_t addr[32]) { + BSSL_CHECK(config->hash_type == SLH_DSA_HASH_SHAKE_256); + BSSL_CHECK(!config->compressed_addresses); + + struct BORINGSSL_keccak_st keccak; + BORINGSSL_keccak_init(&keccak, boringssl_shake256); + BORINGSSL_keccak_absorb(&keccak, pk_seed, config->n); + BORINGSSL_keccak_absorb(&keccak, addr, SLHDSA_ADDR_BYTES); + BORINGSSL_keccak_absorb(&keccak, sk_seed, config->n); + BORINGSSL_keccak_squeeze(&keccak, output, config->n); +} + +void bssl::slhdsa_thash_prf(const slh_dsa_config *config, uint8_t *output, + const uint8_t *pk_seed, const uint8_t *sk_seed, + uint8_t addr[32]) { + switch (config->hash_type) { + case SLH_DSA_HASH_SHA2_256: + slhdsa_thash_prf_sha256(config, output, pk_seed, sk_seed, addr); + return; + case SLH_DSA_HASH_SHAKE_256: + slhdsa_thash_prf_shake(config, output, pk_seed, sk_seed, addr); + return; + } + BSSL_CHECK(false); +} + +// Implements T_l function for WOTS+ public key compression (Section 4.1, page +// 11 and Section 11.2, pages 44-46) +void 
bssl::slhdsa_thash_tl(const slh_dsa_config *config, uint8_t *output, + const uint8_t *input, const uint8_t *pk_seed, + uint8_t addr[32]) { + slhdsa_thash_dispatch(config, output, input, slhdsa_wots_len(config), pk_seed, + addr); +} + +// Implements H function (Section 4.1, page 11 and Section 11.2, pages 44-46) +void bssl::slhdsa_thash_h(const slh_dsa_config *config, uint8_t *output, + const uint8_t *input, const uint8_t *pk_seed, + uint8_t addr[32]) { + slhdsa_thash_dispatch(config, output, input, 2, pk_seed, addr); +} + +// Implements F function (Section 4.1, page 11 and Section 11.2, pages 44-46) +void bssl::slhdsa_thash_f(const slh_dsa_config *config, uint8_t *output, + const uint8_t *input, const uint8_t *pk_seed, + uint8_t addr[32]) { + slhdsa_thash_dispatch(config, output, input, 1, pk_seed, addr); +} + +// Implements T_k function for FORS public key compression (Section 4.1, page 11 +// and Section 11.2, pages 44-46) +void bssl::slhdsa_thash_tk(const slh_dsa_config *config, uint8_t *output, + const uint8_t *input, const uint8_t *pk_seed, + uint8_t addr[32]) { + slhdsa_thash_dispatch(config, output, input, config->fors_trees, pk_seed, + addr); +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/slhdsa/thash.h b/third_party/boringssl/src/crypto/fipsmodule/slhdsa/thash.h new file mode 100644 index 00000000..23a42c6b --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/slhdsa/thash.h @@ -0,0 +1,71 @@ +// Copyright 2024 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_CRYPTO_FIPSMODULE_SLHDSA_THASH_H +#define OPENSSL_HEADER_CRYPTO_FIPSMODULE_SLHDSA_THASH_H + +#include "./params.h" + + +BSSL_NAMESPACE_BEGIN + +// Implements PRF_msg: a pseudo-random function that is used to generate the +// randomizer r for the randomized hashing of the message to be signed. +// (Section 4.1, page 11) +void slhdsa_thash_prfmsg(const slh_dsa_config *config, uint8_t *output, + const uint8_t *sk_prf, const uint8_t *opt_rand, + const uint8_t *header, const uint8_t *ctx, + size_t ctx_len, const uint8_t *msg, size_t msg_len); + +// Implements H_msg: a hash function used to generate the digest of the message +// to be signed. (Section 4.1, page 11) +void slhdsa_thash_hmsg(const slh_dsa_config *config, uint8_t *output, + const uint8_t *r, const uint8_t *pk_seed, + const uint8_t *pk_root, const uint8_t *header, + const uint8_t *ctx, size_t ctx_len, const uint8_t *msg, + size_t msg_len); + +// Implements PRF: a pseudo-random function that is used to generate the secret +// values in WOTS+ and FORS private keys. (Section 4.1, page 11) +void slhdsa_thash_prf(const slh_dsa_config *config, uint8_t *output, + const uint8_t *pk_seed, const uint8_t *sk_seed, + uint8_t addr[32]); + +// Implements T_l: a hash function that maps an l*n-byte message to an n-byte +// message. Used for WOTS+ public key compression. (Section 4.1, page 11) +void slhdsa_thash_tl(const slh_dsa_config *config, uint8_t *output, + const uint8_t *input, const uint8_t *pk_seed, + uint8_t addr[32]); + +// Implements H: a hash function that takes a 2*n-byte message as input and +// produces an n-byte output. 
(Section 4.1, page 11) +void slhdsa_thash_h(const slh_dsa_config *config, uint8_t *output, + const uint8_t *input, const uint8_t *pk_seed, + uint8_t addr[32]); + +// Implements F: a hash function that takes an n-byte message as input and +// produces an n-byte output. (Section 4.1, page 11) +void slhdsa_thash_f(const slh_dsa_config *config, uint8_t *output, + const uint8_t *input, const uint8_t *pk_seed, + uint8_t addr[32]); + +// Implements T_k: a hash function that maps a k*n-byte message to an n-byte +// message. Used for FORS public key compression. (Section 4.1, page 11) +void slhdsa_thash_tk(const slh_dsa_config *config, uint8_t *output, + const uint8_t *input, const uint8_t *pk_seed, + uint8_t addr[32]); + +BSSL_NAMESPACE_END + +#endif // OPENSSL_HEADER_CRYPTO_FIPSMODULE_SLHDSA_THASH_H diff --git a/third_party/boringssl/src/crypto/fipsmodule/slhdsa/wots.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/slhdsa/wots.cc.inc new file mode 100644 index 00000000..e8bc9900 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/slhdsa/wots.cc.inc @@ -0,0 +1,172 @@ +// Copyright 2024 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include +#include + +#include "../../internal.h" +#include "./address.h" +#include "./params.h" +#include "./thash.h" +#include "./wots.h" + + +using namespace bssl; + +// Implements Algorithm 5: chain function, page 18 +static void chain(const slh_dsa_config *config, uint8_t *output, + const uint8_t *input, uint32_t start, uint32_t steps, + const uint8_t *pub_seed, uint8_t addr[32]) { + assert(start < config->wots_w); + assert(steps < config->wots_w); + + OPENSSL_memcpy(output, input, config->n); + + for (uint32_t i = start; i < (start + steps) && i < config->wots_w; ++i) { + slhdsa_set_hash_addr(config, addr, i); + slhdsa_thash_f(config, output, output, pub_seed, addr); + } +} + +static void slhdsa_wots_do_chain(const slh_dsa_config *config, uint8_t *out, + uint8_t sk_addr[32], uint8_t addr[32], + uint8_t value, const uint8_t *sk_seed, + const uint8_t *pub_seed, + uint32_t chain_index) { + uint8_t tmp_sk[SLHDSA_MAX_N]; + slhdsa_set_chain_addr(config, sk_addr, chain_index); + slhdsa_thash_prf(config, tmp_sk, pub_seed, sk_seed, sk_addr); + slhdsa_set_chain_addr(config, addr, chain_index); + chain(config, out, tmp_sk, 0, value, pub_seed, addr); +} + +// Implements Algorithm 6: wots_pkGen function, page 18 +void bssl::slhdsa_wots_pk_gen(const slh_dsa_config *config, uint8_t *pk, + const uint8_t *sk_seed, const uint8_t *pub_seed, + uint8_t addr[32]) { + uint8_t wots_pk_addr[32], sk_addr[32]; + OPENSSL_memcpy(wots_pk_addr, addr, sizeof(wots_pk_addr)); + OPENSSL_memcpy(sk_addr, addr, sizeof(sk_addr)); + slhdsa_set_type(config, sk_addr, SLHDSA_ADDR_TYPE_WOTSPRF); + slhdsa_copy_keypair_addr(config, sk_addr, addr); + + uint8_t tmp[SLHDSA_MAX_WOTS_BYTES]; + const uint32_t n = config->n; + const uint32_t wots_len = slhdsa_wots_len(config); + const uint32_t max_chain = config->wots_w - 1; + for (uint32_t i = 0; i < wots_len; ++i) { + slhdsa_wots_do_chain(config, tmp + i * n, sk_addr, addr, max_chain, sk_seed, + pub_seed, i); + } + + // Compress pk + 
slhdsa_set_type(config, wots_pk_addr, SLHDSA_ADDR_TYPE_WOTSPK); + slhdsa_copy_keypair_addr(config, wots_pk_addr, addr); + slhdsa_thash_tl(config, pk, tmp, pub_seed, wots_pk_addr); +} + +// Implements Algorithm 7: wots_sign function, page 20 +void bssl::slhdsa_wots_sign(const slh_dsa_config *config, uint8_t *sig, + const uint8_t *msg, const uint8_t *sk_seed, + const uint8_t *pub_seed, uint8_t addr[32]) { + BSSL_CHECK(config->wots_w == 16); + const size_t n = config->n; + const uint32_t max_chain = config->wots_w - 1; + + uint16_t csum = 0; + for (size_t i = 0; i < n; ++i) { + csum += max_chain - (msg[i] >> 4); + csum += max_chain - (msg[i] & 15); + } + + // Compute chains + uint8_t sk_addr[32]; + OPENSSL_memcpy(sk_addr, addr, sizeof(sk_addr)); + slhdsa_set_type(config, sk_addr, SLHDSA_ADDR_TYPE_WOTSPRF); + slhdsa_copy_keypair_addr(config, sk_addr, addr); + + uint32_t chain_index = 0; + for (size_t i = 0; i < n; ++i) { + slhdsa_wots_do_chain(config, sig, sk_addr, addr, msg[i] >> 4, sk_seed, + pub_seed, chain_index++); + sig += n; + + slhdsa_wots_do_chain(config, sig, sk_addr, addr, msg[i] & 15, sk_seed, + pub_seed, chain_index++); + sig += n; + } + + // Include the WOTS checksum values (len2 = 3 for the parameter sets we + // support). 
+ slhdsa_wots_do_chain(config, sig, sk_addr, addr, (csum >> 8) & 15, sk_seed, + pub_seed, chain_index++); + sig += n; + slhdsa_wots_do_chain(config, sig, sk_addr, addr, (csum >> 4) & 15, sk_seed, + pub_seed, chain_index++); + sig += n; + slhdsa_wots_do_chain(config, sig, sk_addr, addr, csum & 15, sk_seed, pub_seed, + chain_index++); +} + +static void slhdsa_wots_pk_from_sig_do_chain(const slh_dsa_config *config, + uint8_t *out, uint8_t addr[32], + const uint8_t *in, uint8_t value, + const uint8_t *pub_seed, + uint32_t chain_index) { + slhdsa_set_chain_addr(config, addr, chain_index); + chain(config, out + chain_index * config->n, in + chain_index * config->n, + value, (config->wots_w - 1) - value, pub_seed, addr); +} + +// Implements Algorithm 8: wots_pkFromSig function, page 21 +void bssl::slhdsa_wots_pk_from_sig(const slh_dsa_config *config, uint8_t *pk, + const uint8_t *sig, const uint8_t *msg, + const uint8_t *pub_seed, uint8_t addr[32]) { + BSSL_CHECK(config->wots_w == 16); + const size_t n = config->n; + const uint32_t max_chain = config->wots_w - 1; + + uint16_t csum = 0; + for (size_t i = 0; i < n; ++i) { + csum += max_chain - (msg[i] >> 4); + csum += max_chain - (msg[i] & 15); + } + + uint8_t tmp[SLHDSA_MAX_WOTS_BYTES]; + uint8_t wots_pk_addr[32]; + OPENSSL_memcpy(wots_pk_addr, addr, sizeof(wots_pk_addr)); + + uint32_t chain_index = 0; + for (size_t i = 0; i < n; ++i) { + slhdsa_wots_pk_from_sig_do_chain(config, tmp, addr, sig, msg[i] >> 4, + pub_seed, chain_index++); + slhdsa_wots_pk_from_sig_do_chain(config, tmp, addr, sig, msg[i] & 15, + pub_seed, chain_index++); + } + + slhdsa_wots_pk_from_sig_do_chain(config, tmp, addr, sig, csum >> 8, pub_seed, + chain_index++); + slhdsa_wots_pk_from_sig_do_chain(config, tmp, addr, sig, (csum >> 4) & 15, + pub_seed, chain_index++); + slhdsa_wots_pk_from_sig_do_chain(config, tmp, addr, sig, csum & 15, pub_seed, + chain_index++); + + // Compress pk + slhdsa_set_type(config, wots_pk_addr, SLHDSA_ADDR_TYPE_WOTSPK); + 
slhdsa_copy_keypair_addr(config, wots_pk_addr, addr); + slhdsa_thash_tl(config, pk, tmp, pub_seed, wots_pk_addr); +} diff --git a/third_party/boringssl/src/crypto/fipsmodule/slhdsa/wots.h b/third_party/boringssl/src/crypto/fipsmodule/slhdsa/wots.h new file mode 100644 index 00000000..28afeb05 --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/slhdsa/wots.h @@ -0,0 +1,40 @@ +// Copyright 2024 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_CRYPTO_FIPSMODULE_SLHDSA_WOTS_H +#define OPENSSL_HEADER_CRYPTO_FIPSMODULE_SLHDSA_WOTS_H + +#include "./params.h" + + +BSSL_NAMESPACE_BEGIN + +// Implements Algorithm 6: wots_pkGen function, page 18 +void slhdsa_wots_pk_gen(const slh_dsa_config *config, uint8_t *pk, + const uint8_t *sk_seed, const uint8_t *pub_seed, + uint8_t addr[32]); + +// Implements Algorithm 7: wots_sign function, page 20 +void slhdsa_wots_sign(const slh_dsa_config *config, uint8_t *sig, + const uint8_t *msg, const uint8_t *sk_seed, + const uint8_t *pub_seed, uint8_t addr[32]); + +// Implements Algorithm 8: wots_pkFromSig function, page 21 +void slhdsa_wots_pk_from_sig(const slh_dsa_config *config, uint8_t *pk, + const uint8_t *sig, const uint8_t *msg, + const uint8_t *pub_seed, uint8_t addr[32]); + +BSSL_NAMESPACE_END + +#endif // OPENSSL_HEADER_CRYPTO_FIPSMODULE_SLHDSA_WOTS_H diff --git a/third_party/boringssl/src/crypto/fipsmodule/tls/internal.h 
b/third_party/boringssl/src/crypto/fipsmodule/tls/internal.h index ef642a6c..e41036f2 100644 --- a/third_party/boringssl/src/crypto/fipsmodule/tls/internal.h +++ b/third_party/boringssl/src/crypto/fipsmodule/tls/internal.h @@ -1,39 +1,33 @@ -/* Copyright (c) 2018, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ +// Copyright 2018 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. #ifndef OPENSSL_HEADER_CRYPTO_FIPSMODULE_TLS_INTERNAL_H #define OPENSSL_HEADER_CRYPTO_FIPSMODULE_TLS_INTERNAL_H #include -#if defined(__cplusplus) -extern "C" { -#endif +BSSL_NAMESPACE_BEGIN -// tls1_prf calculates |out_len| bytes of the TLS PDF, using |digest|, and -// writes them to |out|. It returns one on success and zero on error. 
-OPENSSL_EXPORT int CRYPTO_tls1_prf(const EVP_MD *digest, - uint8_t *out, size_t out_len, - const uint8_t *secret, size_t secret_len, - const char *label, size_t label_len, - const uint8_t *seed1, size_t seed1_len, - const uint8_t *seed2, size_t seed2_len); +// CRYPTO_tls13_hkdf_expand_label computes the TLS 1.3 KDF function of the same +// name. See https://www.rfc-editor.org/rfc/rfc8446#section-7.1. +OPENSSL_EXPORT int CRYPTO_tls13_hkdf_expand_label( + uint8_t *out, size_t out_len, const EVP_MD *digest, // + const uint8_t *secret, size_t secret_len, // + const uint8_t *label, size_t label_len, // + const uint8_t *hash, size_t hash_len); - -#if defined(__cplusplus) -} -#endif +BSSL_NAMESPACE_END #endif // OPENSSL_HEADER_CRYPTO_FIPSMODULE_TLS_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/fipsmodule/tls/kdf.c b/third_party/boringssl/src/crypto/fipsmodule/tls/kdf.c deleted file mode 100644 index 6f2b68bd..00000000 --- a/third_party/boringssl/src/crypto/fipsmodule/tls/kdf.c +++ /dev/null @@ -1,177 +0,0 @@ -/* ==================================================================== - * Copyright (c) 1998-2007 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. 
The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). */ - -#include - -#include -#include -#include - -#include "internal.h" -#include "../../internal.h" -#include "../service_indicator/internal.h" - - -// tls1_P_hash computes the TLS P_ function as described in RFC 5246, -// section 5. 
It XORs |out_len| bytes to |out|, using |md| as the hash and -// |secret| as the secret. |label|, |seed1|, and |seed2| are concatenated to -// form the seed parameter. It returns true on success and false on failure. -static int tls1_P_hash(uint8_t *out, size_t out_len, - const EVP_MD *md, - const uint8_t *secret, size_t secret_len, - const char *label, size_t label_len, - const uint8_t *seed1, size_t seed1_len, - const uint8_t *seed2, size_t seed2_len) { - HMAC_CTX ctx, ctx_tmp, ctx_init; - uint8_t A1[EVP_MAX_MD_SIZE]; - unsigned A1_len; - int ret = 0; - - const size_t chunk = EVP_MD_size(md); - HMAC_CTX_init(&ctx); - HMAC_CTX_init(&ctx_tmp); - HMAC_CTX_init(&ctx_init); - - if (!HMAC_Init_ex(&ctx_init, secret, secret_len, md, NULL) || - !HMAC_CTX_copy_ex(&ctx, &ctx_init) || - !HMAC_Update(&ctx, (const uint8_t *) label, label_len) || - !HMAC_Update(&ctx, seed1, seed1_len) || - !HMAC_Update(&ctx, seed2, seed2_len) || - !HMAC_Final(&ctx, A1, &A1_len)) { - goto err; - } - - for (;;) { - unsigned len; - uint8_t hmac[EVP_MAX_MD_SIZE]; - if (!HMAC_CTX_copy_ex(&ctx, &ctx_init) || - !HMAC_Update(&ctx, A1, A1_len) || - // Save a copy of |ctx| to compute the next A1 value below. - (out_len > chunk && !HMAC_CTX_copy_ex(&ctx_tmp, &ctx)) || - !HMAC_Update(&ctx, (const uint8_t *) label, label_len) || - !HMAC_Update(&ctx, seed1, seed1_len) || - !HMAC_Update(&ctx, seed2, seed2_len) || - !HMAC_Final(&ctx, hmac, &len)) { - goto err; - } - assert(len == chunk); - - // XOR the result into |out|. - if (len > out_len) { - len = out_len; - } - for (unsigned i = 0; i < len; i++) { - out[i] ^= hmac[i]; - } - out += len; - out_len -= len; - - if (out_len == 0) { - break; - } - - // Calculate the next A1 value. 
- if (!HMAC_Final(&ctx_tmp, A1, &A1_len)) { - goto err; - } - } - - ret = 1; - -err: - OPENSSL_cleanse(A1, sizeof(A1)); - HMAC_CTX_cleanup(&ctx); - HMAC_CTX_cleanup(&ctx_tmp); - HMAC_CTX_cleanup(&ctx_init); - return ret; -} - -int CRYPTO_tls1_prf(const EVP_MD *digest, - uint8_t *out, size_t out_len, - const uint8_t *secret, size_t secret_len, - const char *label, size_t label_len, - const uint8_t *seed1, size_t seed1_len, - const uint8_t *seed2, size_t seed2_len) { - if (out_len == 0) { - return 1; - } - - OPENSSL_memset(out, 0, out_len); - - const EVP_MD *const original_digest = digest; - FIPS_service_indicator_lock_state(); - int ret = 0; - - if (digest == EVP_md5_sha1()) { - // If using the MD5/SHA1 PRF, |secret| is partitioned between MD5 and SHA-1. - size_t secret_half = secret_len - (secret_len / 2); - if (!tls1_P_hash(out, out_len, EVP_md5(), secret, secret_half, label, - label_len, seed1, seed1_len, seed2, seed2_len)) { - goto end; - } - - // Note that, if |secret_len| is odd, the two halves share a byte. - secret += secret_len - secret_half; - secret_len = secret_half; - digest = EVP_sha1(); - } - - ret = tls1_P_hash(out, out_len, digest, secret, secret_len, label, label_len, - seed1, seed1_len, seed2, seed2_len); - -end: - FIPS_service_indicator_unlock_state(); - if (ret) { - TLSKDF_verify_service_indicator(original_digest); - } - return ret; -} diff --git a/third_party/boringssl/src/crypto/fipsmodule/tls/kdf.cc.inc b/third_party/boringssl/src/crypto/fipsmodule/tls/kdf.cc.inc new file mode 100644 index 00000000..2446a36b --- /dev/null +++ b/third_party/boringssl/src/crypto/fipsmodule/tls/kdf.cc.inc @@ -0,0 +1,180 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include +#include +#include +#include + +#include "../../internal.h" +#include "../service_indicator/internal.h" +#include "internal.h" + + +using namespace bssl; + +// tls1_P_hash computes the TLS P_ function as described in RFC 5246, +// section 5. It XORs |out_len| bytes to |out|, using |md| as the hash and +// |secret| as the secret. |label|, |seed1|, and |seed2| are concatenated to +// form the seed parameter. It returns true on success and false on failure. +static int tls1_P_hash(uint8_t *out, size_t out_len, const EVP_MD *md, + const uint8_t *secret, size_t secret_len, + const uint8_t *label, size_t label_len, + const uint8_t *seed1, size_t seed1_len, + const uint8_t *seed2, size_t seed2_len) { + HMAC_CTX ctx, ctx_tmp, ctx_init; + uint8_t A1[EVP_MAX_MD_SIZE]; + unsigned A1_len; + int ret = 0; + + const size_t chunk = EVP_MD_size(md); + HMAC_CTX_init(&ctx); + HMAC_CTX_init(&ctx_tmp); + HMAC_CTX_init(&ctx_init); + + if (!HMAC_Init_ex(&ctx_init, secret, secret_len, md, nullptr) || + !HMAC_CTX_copy_ex(&ctx, &ctx_init) || + !HMAC_Update(&ctx, label, label_len) || + !HMAC_Update(&ctx, seed1, seed1_len) || + !HMAC_Update(&ctx, seed2, seed2_len) || // + !HMAC_Final(&ctx, A1, &A1_len)) { + goto err; + } + + for (;;) { + unsigned len_u; + uint8_t hmac[EVP_MAX_MD_SIZE]; + if (!HMAC_CTX_copy_ex(&ctx, &ctx_init) || !HMAC_Update(&ctx, A1, A1_len) || + // Save a copy of |ctx| to compute the next A1 value below. 
+ (out_len > chunk && !HMAC_CTX_copy_ex(&ctx_tmp, &ctx)) || + !HMAC_Update(&ctx, (const uint8_t *)label, label_len) || + !HMAC_Update(&ctx, seed1, seed1_len) || + !HMAC_Update(&ctx, seed2, seed2_len) || + !HMAC_Final(&ctx, hmac, &len_u)) { + goto err; + } + size_t len = len_u; + assert(len == chunk); + + // XOR the result into |out|. + if (len > out_len) { + len = out_len; + } + for (size_t i = 0; i < len; i++) { + out[i] ^= hmac[i]; + } + out += len; + out_len -= len; + + if (out_len == 0) { + break; + } + + // Calculate the next A1 value. + if (!HMAC_Final(&ctx_tmp, A1, &A1_len)) { + goto err; + } + } + + ret = 1; + +err: + OPENSSL_cleanse(A1, sizeof(A1)); + HMAC_CTX_cleanup(&ctx); + HMAC_CTX_cleanup(&ctx_tmp); + HMAC_CTX_cleanup(&ctx_init); + return ret; +} + +int CRYPTO_tls1_prf(const EVP_MD *digest, uint8_t *out, size_t out_len, + const uint8_t *secret, size_t secret_len, + const uint8_t *label, size_t label_len, + const uint8_t *seed1, size_t seed1_len, + const uint8_t *seed2, size_t seed2_len) { + if (out_len == 0) { + return 1; + } + + OPENSSL_memset(out, 0, out_len); + + const EVP_MD *const original_digest = digest; + FIPS_service_indicator_lock_state(); + int ret = 0; + + if (digest == EVP_md5_sha1()) { + // If using the MD5/SHA1 PRF, |secret| is partitioned between MD5 and SHA-1. + size_t secret_half = secret_len - (secret_len / 2); + if (!tls1_P_hash(out, out_len, EVP_md5(), secret, secret_half, label, + label_len, seed1, seed1_len, seed2, seed2_len)) { + goto end; + } + + // Note that, if |secret_len| is odd, the two halves share a byte. 
+ secret += secret_len - secret_half; + secret_len = secret_half; + digest = EVP_sha1(); + } + + ret = tls1_P_hash(out, out_len, digest, secret, secret_len, label, label_len, + seed1, seed1_len, seed2, seed2_len); + +end: + FIPS_service_indicator_unlock_state(); + if (ret) { + TLSKDF_verify_service_indicator(original_digest); + } + return ret; +} + +int bssl::CRYPTO_tls13_hkdf_expand_label(uint8_t *out, size_t out_len, + const EVP_MD *digest, // + const uint8_t *secret, + size_t secret_len, + const uint8_t *label, size_t label_len, + const uint8_t *hash, size_t hash_len) { + static const uint8_t kProtocolLabel[] = "tls13 "; + CBB cbb, child; + uint8_t *hkdf_label = nullptr; + size_t hkdf_label_len; + + FIPS_service_indicator_lock_state(); + CBB_zero(&cbb); + if (!CBB_init(&cbb, 2 + 1 + sizeof(kProtocolLabel) - 1 + label_len + 1 + + hash_len) || + !CBB_add_u16(&cbb, out_len) || + !CBB_add_u8_length_prefixed(&cbb, &child) || + !CBB_add_bytes(&child, kProtocolLabel, sizeof(kProtocolLabel) - 1) || + !CBB_add_bytes(&child, label, label_len) || + !CBB_add_u8_length_prefixed(&cbb, &child) || + !CBB_add_bytes(&child, hash, hash_len) || + !CBB_finish(&cbb, &hkdf_label, &hkdf_label_len)) { + CBB_cleanup(&cbb); + FIPS_service_indicator_unlock_state(); + return 0; + } + + const int ret = HKDF_expand(out, out_len, digest, secret, secret_len, + hkdf_label, hkdf_label_len); + OPENSSL_free(hkdf_label); + + FIPS_service_indicator_unlock_state(); + if (ret) { + TLSKDF_verify_service_indicator(digest); + } + return ret; +} diff --git a/third_party/boringssl/src/crypto/fuzzer_mode.cc b/third_party/boringssl/src/crypto/fuzzer_mode.cc new file mode 100644 index 00000000..a002a424 --- /dev/null +++ b/third_party/boringssl/src/crypto/fuzzer_mode.cc @@ -0,0 +1,30 @@ +// Copyright 2025 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "internal.h" + + +using namespace bssl; + +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) +static bssl::Atomic fuzzer_mode_enabled = 0; + +int bssl::CRYPTO_fuzzer_mode_enabled() { return fuzzer_mode_enabled.load(); } + +void CRYPTO_set_fuzzer_mode(int enabled) { + fuzzer_mode_enabled.store(!!enabled); +} +#endif // FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION diff --git a/third_party/boringssl/src/crypto/hkdf/hkdf.c b/third_party/boringssl/src/crypto/hkdf/hkdf.c deleted file mode 100644 index 23b60afe..00000000 --- a/third_party/boringssl/src/crypto/hkdf/hkdf.c +++ /dev/null @@ -1,112 +0,0 @@ -/* Copyright (c) 2014, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ - -#include - -#include -#include - -#include -#include - -#include "../internal.h" - - -int HKDF(uint8_t *out_key, size_t out_len, const EVP_MD *digest, - const uint8_t *secret, size_t secret_len, const uint8_t *salt, - size_t salt_len, const uint8_t *info, size_t info_len) { - // https://tools.ietf.org/html/rfc5869#section-2 - uint8_t prk[EVP_MAX_MD_SIZE]; - size_t prk_len; - - if (!HKDF_extract(prk, &prk_len, digest, secret, secret_len, salt, - salt_len) || - !HKDF_expand(out_key, out_len, digest, prk, prk_len, info, info_len)) { - return 0; - } - - return 1; -} - -int HKDF_extract(uint8_t *out_key, size_t *out_len, const EVP_MD *digest, - const uint8_t *secret, size_t secret_len, const uint8_t *salt, - size_t salt_len) { - // https://tools.ietf.org/html/rfc5869#section-2.2 - - // If salt is not given, HashLength zeros are used. However, HMAC does that - // internally already so we can ignore it. - unsigned len; - if (HMAC(digest, salt, salt_len, secret, secret_len, out_key, &len) == NULL) { - OPENSSL_PUT_ERROR(HKDF, ERR_R_HMAC_LIB); - return 0; - } - *out_len = len; - assert(*out_len == EVP_MD_size(digest)); - return 1; -} - -int HKDF_expand(uint8_t *out_key, size_t out_len, const EVP_MD *digest, - const uint8_t *prk, size_t prk_len, const uint8_t *info, - size_t info_len) { - // https://tools.ietf.org/html/rfc5869#section-2.3 - const size_t digest_len = EVP_MD_size(digest); - uint8_t previous[EVP_MAX_MD_SIZE]; - size_t n, done = 0; - unsigned i; - int ret = 0; - HMAC_CTX hmac; - - // Expand key material to desired length. 
- n = (out_len + digest_len - 1) / digest_len; - if (out_len + digest_len < out_len || n > 255) { - OPENSSL_PUT_ERROR(HKDF, HKDF_R_OUTPUT_TOO_LARGE); - return 0; - } - - HMAC_CTX_init(&hmac); - if (!HMAC_Init_ex(&hmac, prk, prk_len, digest, NULL)) { - goto out; - } - - for (i = 0; i < n; i++) { - uint8_t ctr = i + 1; - size_t todo; - - if (i != 0 && (!HMAC_Init_ex(&hmac, NULL, 0, NULL, NULL) || - !HMAC_Update(&hmac, previous, digest_len))) { - goto out; - } - if (!HMAC_Update(&hmac, info, info_len) || - !HMAC_Update(&hmac, &ctr, 1) || - !HMAC_Final(&hmac, previous, NULL)) { - goto out; - } - - todo = digest_len; - if (done + todo > out_len) { - todo = out_len - done; - } - OPENSSL_memcpy(out_key + done, previous, todo); - done += todo; - } - - ret = 1; - -out: - HMAC_CTX_cleanup(&hmac); - if (ret != 1) { - OPENSSL_PUT_ERROR(HKDF, ERR_R_HMAC_LIB); - } - return ret; -} diff --git a/third_party/boringssl/src/crypto/hpke/hpke.c b/third_party/boringssl/src/crypto/hpke/hpke.c deleted file mode 100644 index 827ffaab..00000000 --- a/third_party/boringssl/src/crypto/hpke/hpke.c +++ /dev/null @@ -1,618 +0,0 @@ -/* Copyright (c) 2020, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ - -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../internal.h" - - -// This file implements RFC 9180. - -#define MAX_SEED_LEN X25519_PRIVATE_KEY_LEN -#define MAX_SHARED_SECRET_LEN SHA256_DIGEST_LENGTH - -struct evp_hpke_kem_st { - uint16_t id; - size_t public_key_len; - size_t private_key_len; - size_t seed_len; - int (*init_key)(EVP_HPKE_KEY *key, const uint8_t *priv_key, - size_t priv_key_len); - int (*generate_key)(EVP_HPKE_KEY *key); - int (*encap_with_seed)(const EVP_HPKE_KEM *kem, uint8_t *out_shared_secret, - size_t *out_shared_secret_len, uint8_t *out_enc, - size_t *out_enc_len, size_t max_enc, - const uint8_t *peer_public_key, - size_t peer_public_key_len, const uint8_t *seed, - size_t seed_len); - int (*decap)(const EVP_HPKE_KEY *key, uint8_t *out_shared_secret, - size_t *out_shared_secret_len, const uint8_t *enc, - size_t enc_len); -}; - -struct evp_hpke_kdf_st { - uint16_t id; - // We only support HKDF-based KDFs. - const EVP_MD *(*hkdf_md_func)(void); -}; - -struct evp_hpke_aead_st { - uint16_t id; - const EVP_AEAD *(*aead_func)(void); -}; - - -// Low-level labeled KDF functions. 
- -static const char kHpkeVersionId[] = "HPKE-v1"; - -static int add_label_string(CBB *cbb, const char *label) { - return CBB_add_bytes(cbb, (const uint8_t *)label, strlen(label)); -} - -static int hpke_labeled_extract(const EVP_MD *hkdf_md, uint8_t *out_key, - size_t *out_len, const uint8_t *salt, - size_t salt_len, const uint8_t *suite_id, - size_t suite_id_len, const char *label, - const uint8_t *ikm, size_t ikm_len) { - // labeledIKM = concat("HPKE-v1", suite_id, label, IKM) - CBB labeled_ikm; - int ok = CBB_init(&labeled_ikm, 0) && - add_label_string(&labeled_ikm, kHpkeVersionId) && - CBB_add_bytes(&labeled_ikm, suite_id, suite_id_len) && - add_label_string(&labeled_ikm, label) && - CBB_add_bytes(&labeled_ikm, ikm, ikm_len) && - HKDF_extract(out_key, out_len, hkdf_md, CBB_data(&labeled_ikm), - CBB_len(&labeled_ikm), salt, salt_len); - CBB_cleanup(&labeled_ikm); - return ok; -} - -static int hpke_labeled_expand(const EVP_MD *hkdf_md, uint8_t *out_key, - size_t out_len, const uint8_t *prk, - size_t prk_len, const uint8_t *suite_id, - size_t suite_id_len, const char *label, - const uint8_t *info, size_t info_len) { - // labeledInfo = concat(I2OSP(L, 2), "HPKE-v1", suite_id, label, info) - CBB labeled_info; - int ok = CBB_init(&labeled_info, 0) && - CBB_add_u16(&labeled_info, out_len) && - add_label_string(&labeled_info, kHpkeVersionId) && - CBB_add_bytes(&labeled_info, suite_id, suite_id_len) && - add_label_string(&labeled_info, label) && - CBB_add_bytes(&labeled_info, info, info_len) && - HKDF_expand(out_key, out_len, hkdf_md, prk, prk_len, - CBB_data(&labeled_info), CBB_len(&labeled_info)); - CBB_cleanup(&labeled_info); - return ok; -} - - -// KEM implementations. - -// dhkem_extract_and_expand implements the ExtractAndExpand operation in the -// DHKEM construction. See section 4.1 of RFC 9180. 
-static int dhkem_extract_and_expand(uint16_t kem_id, const EVP_MD *hkdf_md, - uint8_t *out_key, size_t out_len, - const uint8_t *dh, size_t dh_len, - const uint8_t *kem_context, - size_t kem_context_len) { - // concat("KEM", I2OSP(kem_id, 2)) - uint8_t suite_id[5] = {'K', 'E', 'M', kem_id >> 8, kem_id & 0xff}; - uint8_t prk[EVP_MAX_MD_SIZE]; - size_t prk_len; - return hpke_labeled_extract(hkdf_md, prk, &prk_len, NULL, 0, suite_id, - sizeof(suite_id), "eae_prk", dh, dh_len) && - hpke_labeled_expand(hkdf_md, out_key, out_len, prk, prk_len, suite_id, - sizeof(suite_id), "shared_secret", kem_context, - kem_context_len); -} - -static int x25519_init_key(EVP_HPKE_KEY *key, const uint8_t *priv_key, - size_t priv_key_len) { - if (priv_key_len != X25519_PRIVATE_KEY_LEN) { - OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); - return 0; - } - - OPENSSL_memcpy(key->private_key, priv_key, priv_key_len); - X25519_public_from_private(key->public_key, priv_key); - return 1; -} - -static int x25519_generate_key(EVP_HPKE_KEY *key) { - X25519_keypair(key->public_key, key->private_key); - return 1; -} - -static int x25519_encap_with_seed( - const EVP_HPKE_KEM *kem, uint8_t *out_shared_secret, - size_t *out_shared_secret_len, uint8_t *out_enc, size_t *out_enc_len, - size_t max_enc, const uint8_t *peer_public_key, size_t peer_public_key_len, - const uint8_t *seed, size_t seed_len) { - if (max_enc < X25519_PUBLIC_VALUE_LEN) { - OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_BUFFER_SIZE); - return 0; - } - if (seed_len != X25519_PRIVATE_KEY_LEN) { - OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); - return 0; - } - X25519_public_from_private(out_enc, seed); - - uint8_t dh[X25519_SHARED_KEY_LEN]; - if (peer_public_key_len != X25519_PUBLIC_VALUE_LEN || - !X25519(dh, seed, peer_public_key)) { - OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_PEER_KEY); - return 0; - } - - uint8_t kem_context[2 * X25519_PUBLIC_VALUE_LEN]; - OPENSSL_memcpy(kem_context, out_enc, X25519_PUBLIC_VALUE_LEN); - OPENSSL_memcpy(kem_context + 
X25519_PUBLIC_VALUE_LEN, peer_public_key, - X25519_PUBLIC_VALUE_LEN); - if (!dhkem_extract_and_expand(kem->id, EVP_sha256(), out_shared_secret, - SHA256_DIGEST_LENGTH, dh, sizeof(dh), - kem_context, sizeof(kem_context))) { - return 0; - } - - *out_enc_len = X25519_PUBLIC_VALUE_LEN; - *out_shared_secret_len = SHA256_DIGEST_LENGTH; - return 1; -} - -static int x25519_decap(const EVP_HPKE_KEY *key, uint8_t *out_shared_secret, - size_t *out_shared_secret_len, const uint8_t *enc, - size_t enc_len) { - uint8_t dh[X25519_SHARED_KEY_LEN]; - if (enc_len != X25519_PUBLIC_VALUE_LEN || - !X25519(dh, key->private_key, enc)) { - OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_PEER_KEY); - return 0; - } - - uint8_t kem_context[2 * X25519_PUBLIC_VALUE_LEN]; - OPENSSL_memcpy(kem_context, enc, X25519_PUBLIC_VALUE_LEN); - OPENSSL_memcpy(kem_context + X25519_PUBLIC_VALUE_LEN, key->public_key, - X25519_PUBLIC_VALUE_LEN); - if (!dhkem_extract_and_expand(key->kem->id, EVP_sha256(), out_shared_secret, - SHA256_DIGEST_LENGTH, dh, sizeof(dh), - kem_context, sizeof(kem_context))) { - return 0; - } - - *out_shared_secret_len = SHA256_DIGEST_LENGTH; - return 1; -} - -const EVP_HPKE_KEM *EVP_hpke_x25519_hkdf_sha256(void) { - static const EVP_HPKE_KEM kKEM = { - /*id=*/EVP_HPKE_DHKEM_X25519_HKDF_SHA256, - /*public_key_len=*/X25519_PUBLIC_VALUE_LEN, - /*private_key_len=*/X25519_PRIVATE_KEY_LEN, - /*seed_len=*/X25519_PRIVATE_KEY_LEN, - x25519_init_key, - x25519_generate_key, - x25519_encap_with_seed, - x25519_decap, - }; - return &kKEM; -} - -uint16_t EVP_HPKE_KEM_id(const EVP_HPKE_KEM *kem) { return kem->id; } - -void EVP_HPKE_KEY_zero(EVP_HPKE_KEY *key) { - OPENSSL_memset(key, 0, sizeof(EVP_HPKE_KEY)); -} - -void EVP_HPKE_KEY_cleanup(EVP_HPKE_KEY *key) { - // Nothing to clean up for now, but we may introduce a cleanup process in the - // future. 
-} - -EVP_HPKE_KEY *EVP_HPKE_KEY_new(void) { - EVP_HPKE_KEY *key = OPENSSL_malloc(sizeof(EVP_HPKE_KEY)); - if (key == NULL) { - OPENSSL_PUT_ERROR(EVP, ERR_R_MALLOC_FAILURE); - return NULL; - } - EVP_HPKE_KEY_zero(key); - return key; -} - -void EVP_HPKE_KEY_free(EVP_HPKE_KEY *key) { - if (key != NULL) { - EVP_HPKE_KEY_cleanup(key); - OPENSSL_free(key); - } -} - -int EVP_HPKE_KEY_copy(EVP_HPKE_KEY *dst, const EVP_HPKE_KEY *src) { - // For now, |EVP_HPKE_KEY| is trivially copyable. - OPENSSL_memcpy(dst, src, sizeof(EVP_HPKE_KEY)); - return 1; -} - -int EVP_HPKE_KEY_init(EVP_HPKE_KEY *key, const EVP_HPKE_KEM *kem, - const uint8_t *priv_key, size_t priv_key_len) { - EVP_HPKE_KEY_zero(key); - key->kem = kem; - if (!kem->init_key(key, priv_key, priv_key_len)) { - key->kem = NULL; - return 0; - } - return 1; -} - -int EVP_HPKE_KEY_generate(EVP_HPKE_KEY *key, const EVP_HPKE_KEM *kem) { - EVP_HPKE_KEY_zero(key); - key->kem = kem; - if (!kem->generate_key(key)) { - key->kem = NULL; - return 0; - } - return 1; -} - -const EVP_HPKE_KEM *EVP_HPKE_KEY_kem(const EVP_HPKE_KEY *key) { - return key->kem; -} - -int EVP_HPKE_KEY_public_key(const EVP_HPKE_KEY *key, uint8_t *out, - size_t *out_len, size_t max_out) { - if (max_out < key->kem->public_key_len) { - OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_BUFFER_SIZE); - return 0; - } - OPENSSL_memcpy(out, key->public_key, key->kem->public_key_len); - *out_len = key->kem->public_key_len; - return 1; -} - -int EVP_HPKE_KEY_private_key(const EVP_HPKE_KEY *key, uint8_t *out, - size_t *out_len, size_t max_out) { - if (max_out < key->kem->private_key_len) { - OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_BUFFER_SIZE); - return 0; - } - OPENSSL_memcpy(out, key->private_key, key->kem->private_key_len); - *out_len = key->kem->private_key_len; - return 1; -} - - -// Supported KDFs and AEADs. 
- -const EVP_HPKE_KDF *EVP_hpke_hkdf_sha256(void) { - static const EVP_HPKE_KDF kKDF = {EVP_HPKE_HKDF_SHA256, &EVP_sha256}; - return &kKDF; -} - -uint16_t EVP_HPKE_KDF_id(const EVP_HPKE_KDF *kdf) { return kdf->id; } - -const EVP_HPKE_AEAD *EVP_hpke_aes_128_gcm(void) { - static const EVP_HPKE_AEAD kAEAD = {EVP_HPKE_AES_128_GCM, - &EVP_aead_aes_128_gcm}; - return &kAEAD; -} - -const EVP_HPKE_AEAD *EVP_hpke_aes_256_gcm(void) { - static const EVP_HPKE_AEAD kAEAD = {EVP_HPKE_AES_256_GCM, - &EVP_aead_aes_256_gcm}; - return &kAEAD; -} - -const EVP_HPKE_AEAD *EVP_hpke_chacha20_poly1305(void) { - static const EVP_HPKE_AEAD kAEAD = {EVP_HPKE_CHACHA20_POLY1305, - &EVP_aead_chacha20_poly1305}; - return &kAEAD; -} - -uint16_t EVP_HPKE_AEAD_id(const EVP_HPKE_AEAD *aead) { return aead->id; } - -const EVP_AEAD *EVP_HPKE_AEAD_aead(const EVP_HPKE_AEAD *aead) { - return aead->aead_func(); -} - - -// HPKE implementation. - -// This is strlen("HPKE") + 3 * sizeof(uint16_t). -#define HPKE_SUITE_ID_LEN 10 - -// The suite_id for non-KEM pieces of HPKE is defined as concat("HPKE", -// I2OSP(kem_id, 2), I2OSP(kdf_id, 2), I2OSP(aead_id, 2)). -static int hpke_build_suite_id(const EVP_HPKE_CTX *ctx, - uint8_t out[HPKE_SUITE_ID_LEN]) { - CBB cbb; - int ret = CBB_init_fixed(&cbb, out, HPKE_SUITE_ID_LEN) && - add_label_string(&cbb, "HPKE") && - CBB_add_u16(&cbb, EVP_HPKE_DHKEM_X25519_HKDF_SHA256) && - CBB_add_u16(&cbb, ctx->kdf->id) && - CBB_add_u16(&cbb, ctx->aead->id); - CBB_cleanup(&cbb); - return ret; -} - -#define HPKE_MODE_BASE 0 - -static int hpke_key_schedule(EVP_HPKE_CTX *ctx, const uint8_t *shared_secret, - size_t shared_secret_len, const uint8_t *info, - size_t info_len) { - uint8_t suite_id[HPKE_SUITE_ID_LEN]; - if (!hpke_build_suite_id(ctx, suite_id)) { - return 0; - } - - // psk_id_hash = LabeledExtract("", "psk_id_hash", psk_id) - // TODO(davidben): Precompute this value and store it with the EVP_HPKE_KDF. 
- const EVP_MD *hkdf_md = ctx->kdf->hkdf_md_func(); - uint8_t psk_id_hash[EVP_MAX_MD_SIZE]; - size_t psk_id_hash_len; - if (!hpke_labeled_extract(hkdf_md, psk_id_hash, &psk_id_hash_len, NULL, 0, - suite_id, sizeof(suite_id), "psk_id_hash", NULL, - 0)) { - return 0; - } - - // info_hash = LabeledExtract("", "info_hash", info) - uint8_t info_hash[EVP_MAX_MD_SIZE]; - size_t info_hash_len; - if (!hpke_labeled_extract(hkdf_md, info_hash, &info_hash_len, NULL, 0, - suite_id, sizeof(suite_id), "info_hash", info, - info_len)) { - return 0; - } - - // key_schedule_context = concat(mode, psk_id_hash, info_hash) - uint8_t context[sizeof(uint8_t) + 2 * EVP_MAX_MD_SIZE]; - size_t context_len; - CBB context_cbb; - if (!CBB_init_fixed(&context_cbb, context, sizeof(context)) || - !CBB_add_u8(&context_cbb, HPKE_MODE_BASE) || - !CBB_add_bytes(&context_cbb, psk_id_hash, psk_id_hash_len) || - !CBB_add_bytes(&context_cbb, info_hash, info_hash_len) || - !CBB_finish(&context_cbb, NULL, &context_len)) { - return 0; - } - - // secret = LabeledExtract(shared_secret, "secret", psk) - uint8_t secret[EVP_MAX_MD_SIZE]; - size_t secret_len; - if (!hpke_labeled_extract(hkdf_md, secret, &secret_len, shared_secret, - shared_secret_len, suite_id, sizeof(suite_id), - "secret", NULL, 0)) { - return 0; - } - - // key = LabeledExpand(secret, "key", key_schedule_context, Nk) - const EVP_AEAD *aead = EVP_HPKE_AEAD_aead(ctx->aead); - uint8_t key[EVP_AEAD_MAX_KEY_LENGTH]; - const size_t kKeyLen = EVP_AEAD_key_length(aead); - if (!hpke_labeled_expand(hkdf_md, key, kKeyLen, secret, secret_len, suite_id, - sizeof(suite_id), "key", context, context_len) || - !EVP_AEAD_CTX_init(&ctx->aead_ctx, aead, key, kKeyLen, - EVP_AEAD_DEFAULT_TAG_LENGTH, NULL)) { - return 0; - } - - // base_nonce = LabeledExpand(secret, "base_nonce", key_schedule_context, Nn) - if (!hpke_labeled_expand(hkdf_md, ctx->base_nonce, - EVP_AEAD_nonce_length(aead), secret, secret_len, - suite_id, sizeof(suite_id), "base_nonce", context, - 
context_len)) { - return 0; - } - - // exporter_secret = LabeledExpand(secret, "exp", key_schedule_context, Nh) - if (!hpke_labeled_expand(hkdf_md, ctx->exporter_secret, EVP_MD_size(hkdf_md), - secret, secret_len, suite_id, sizeof(suite_id), - "exp", context, context_len)) { - return 0; - } - - return 1; -} - -void EVP_HPKE_CTX_zero(EVP_HPKE_CTX *ctx) { - OPENSSL_memset(ctx, 0, sizeof(EVP_HPKE_CTX)); - EVP_AEAD_CTX_zero(&ctx->aead_ctx); -} - -void EVP_HPKE_CTX_cleanup(EVP_HPKE_CTX *ctx) { - EVP_AEAD_CTX_cleanup(&ctx->aead_ctx); -} - -EVP_HPKE_CTX *EVP_HPKE_CTX_new(void) { - EVP_HPKE_CTX *ctx = OPENSSL_malloc(sizeof(EVP_HPKE_CTX)); - if (ctx == NULL) { - OPENSSL_PUT_ERROR(EVP, ERR_R_MALLOC_FAILURE); - return NULL; - } - EVP_HPKE_CTX_zero(ctx); - return ctx; -} - -void EVP_HPKE_CTX_free(EVP_HPKE_CTX *ctx) { - if (ctx != NULL) { - EVP_HPKE_CTX_cleanup(ctx); - OPENSSL_free(ctx); - } -} - -int EVP_HPKE_CTX_setup_sender(EVP_HPKE_CTX *ctx, uint8_t *out_enc, - size_t *out_enc_len, size_t max_enc, - const EVP_HPKE_KEM *kem, const EVP_HPKE_KDF *kdf, - const EVP_HPKE_AEAD *aead, - const uint8_t *peer_public_key, - size_t peer_public_key_len, const uint8_t *info, - size_t info_len) { - uint8_t seed[MAX_SEED_LEN]; - RAND_bytes(seed, kem->seed_len); - return EVP_HPKE_CTX_setup_sender_with_seed_for_testing( - ctx, out_enc, out_enc_len, max_enc, kem, kdf, aead, peer_public_key, - peer_public_key_len, info, info_len, seed, kem->seed_len); -} - -int EVP_HPKE_CTX_setup_sender_with_seed_for_testing( - EVP_HPKE_CTX *ctx, uint8_t *out_enc, size_t *out_enc_len, size_t max_enc, - const EVP_HPKE_KEM *kem, const EVP_HPKE_KDF *kdf, const EVP_HPKE_AEAD *aead, - const uint8_t *peer_public_key, size_t peer_public_key_len, - const uint8_t *info, size_t info_len, const uint8_t *seed, - size_t seed_len) { - EVP_HPKE_CTX_zero(ctx); - ctx->is_sender = 1; - ctx->kdf = kdf; - ctx->aead = aead; - uint8_t shared_secret[MAX_SHARED_SECRET_LEN]; - size_t shared_secret_len; - if (!kem->encap_with_seed(kem, 
shared_secret, &shared_secret_len, out_enc, - out_enc_len, max_enc, peer_public_key, - peer_public_key_len, seed, seed_len) || - !hpke_key_schedule(ctx, shared_secret, shared_secret_len, info, - info_len)) { - EVP_HPKE_CTX_cleanup(ctx); - return 0; - } - return 1; -} - -int EVP_HPKE_CTX_setup_recipient(EVP_HPKE_CTX *ctx, const EVP_HPKE_KEY *key, - const EVP_HPKE_KDF *kdf, - const EVP_HPKE_AEAD *aead, const uint8_t *enc, - size_t enc_len, const uint8_t *info, - size_t info_len) { - EVP_HPKE_CTX_zero(ctx); - ctx->is_sender = 0; - ctx->kdf = kdf; - ctx->aead = aead; - uint8_t shared_secret[MAX_SHARED_SECRET_LEN]; - size_t shared_secret_len; - if (!key->kem->decap(key, shared_secret, &shared_secret_len, enc, enc_len) || - !hpke_key_schedule(ctx, shared_secret, sizeof(shared_secret), info, - info_len)) { - EVP_HPKE_CTX_cleanup(ctx); - return 0; - } - return 1; -} - -static void hpke_nonce(const EVP_HPKE_CTX *ctx, uint8_t *out_nonce, - size_t nonce_len) { - assert(nonce_len >= 8); - - // Write padded big-endian bytes of |ctx->seq| to |out_nonce|. - OPENSSL_memset(out_nonce, 0, nonce_len); - uint64_t seq_copy = ctx->seq; - for (size_t i = 0; i < 8; i++) { - out_nonce[nonce_len - i - 1] = seq_copy & 0xff; - seq_copy >>= 8; - } - - // XOR the encoded sequence with the |ctx->base_nonce|. 
- for (size_t i = 0; i < nonce_len; i++) { - out_nonce[i] ^= ctx->base_nonce[i]; - } -} - -int EVP_HPKE_CTX_open(EVP_HPKE_CTX *ctx, uint8_t *out, size_t *out_len, - size_t max_out_len, const uint8_t *in, size_t in_len, - const uint8_t *ad, size_t ad_len) { - if (ctx->is_sender) { - OPENSSL_PUT_ERROR(EVP, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); - return 0; - } - if (ctx->seq == UINT64_MAX) { - OPENSSL_PUT_ERROR(EVP, ERR_R_OVERFLOW); - return 0; - } - - uint8_t nonce[EVP_AEAD_MAX_NONCE_LENGTH]; - const size_t nonce_len = EVP_AEAD_nonce_length(ctx->aead_ctx.aead); - hpke_nonce(ctx, nonce, nonce_len); - - if (!EVP_AEAD_CTX_open(&ctx->aead_ctx, out, out_len, max_out_len, nonce, - nonce_len, in, in_len, ad, ad_len)) { - return 0; - } - ctx->seq++; - return 1; -} - -int EVP_HPKE_CTX_seal(EVP_HPKE_CTX *ctx, uint8_t *out, size_t *out_len, - size_t max_out_len, const uint8_t *in, size_t in_len, - const uint8_t *ad, size_t ad_len) { - if (!ctx->is_sender) { - OPENSSL_PUT_ERROR(EVP, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); - return 0; - } - if (ctx->seq == UINT64_MAX) { - OPENSSL_PUT_ERROR(EVP, ERR_R_OVERFLOW); - return 0; - } - - uint8_t nonce[EVP_AEAD_MAX_NONCE_LENGTH]; - const size_t nonce_len = EVP_AEAD_nonce_length(ctx->aead_ctx.aead); - hpke_nonce(ctx, nonce, nonce_len); - - if (!EVP_AEAD_CTX_seal(&ctx->aead_ctx, out, out_len, max_out_len, nonce, - nonce_len, in, in_len, ad, ad_len)) { - return 0; - } - ctx->seq++; - return 1; -} - -int EVP_HPKE_CTX_export(const EVP_HPKE_CTX *ctx, uint8_t *out, - size_t secret_len, const uint8_t *context, - size_t context_len) { - uint8_t suite_id[HPKE_SUITE_ID_LEN]; - if (!hpke_build_suite_id(ctx, suite_id)) { - return 0; - } - const EVP_MD *hkdf_md = ctx->kdf->hkdf_md_func(); - if (!hpke_labeled_expand(hkdf_md, out, secret_len, ctx->exporter_secret, - EVP_MD_size(hkdf_md), suite_id, sizeof(suite_id), - "sec", context, context_len)) { - return 0; - } - return 1; -} - -size_t EVP_HPKE_CTX_max_overhead(const EVP_HPKE_CTX *ctx) { - 
assert(ctx->is_sender); - return EVP_AEAD_max_overhead(EVP_AEAD_CTX_aead(&ctx->aead_ctx)); -} - -const EVP_HPKE_AEAD *EVP_HPKE_CTX_aead(const EVP_HPKE_CTX *ctx) { - return ctx->aead; -} - -const EVP_HPKE_KDF *EVP_HPKE_CTX_kdf(const EVP_HPKE_CTX *ctx) { - return ctx->kdf; -} diff --git a/third_party/boringssl/src/crypto/hpke/hpke.cc b/third_party/boringssl/src/crypto/hpke/hpke.cc new file mode 100644 index 00000000..2365a53b --- /dev/null +++ b/third_party/boringssl/src/crypto/hpke/hpke.cc @@ -0,0 +1,1459 @@ +// Copyright 2020 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../fipsmodule/bcm_interface.h" +#include "../fipsmodule/ec/internal.h" +#include "../fipsmodule/keccak/internal.h" +#include "../internal.h" +#include "../mem_internal.h" + + +using namespace bssl; + +// This file implements RFC 9180. + +#define MAX_SEED_LEN XWING_SEED_LEN +#define MAX_SHARED_SECRET_LEN SHA256_DIGEST_LENGTH + +// TODO(chlily): Fold this into `EVP_KEM`. 
+struct evp_hpke_kem_st { + uint16_t id; + size_t public_key_len; + size_t private_key_len; + size_t seed_len; + size_t enc_len; + int (*init_key)(EVP_HPKE_KEY *key, const uint8_t *priv_key, + size_t priv_key_len); + int (*generate_key)(EVP_HPKE_KEY *key); + int (*derive_key)(EVP_HPKE_KEY *key, Span ikm); + int (*encap_with_seed)(const EVP_HPKE_KEM *kem, uint8_t *out_shared_secret, + size_t *out_shared_secret_len, uint8_t *out_enc, + size_t *out_enc_len, size_t max_enc, + const uint8_t *peer_public_key, + size_t peer_public_key_len, const uint8_t *seed, + size_t seed_len); + int (*decap)(const EVP_HPKE_KEY *key, uint8_t *out_shared_secret, + size_t *out_shared_secret_len, const uint8_t *enc, + size_t enc_len); + int (*auth_encap_with_seed)(const EVP_HPKE_KEY *key, + uint8_t *out_shared_secret, + size_t *out_shared_secret_len, uint8_t *out_enc, + size_t *out_enc_len, size_t max_enc, + const uint8_t *peer_public_key, + size_t peer_public_key_len, const uint8_t *seed, + size_t seed_len); + int (*auth_decap)(const EVP_HPKE_KEY *key, uint8_t *out_shared_secret, + size_t *out_shared_secret_len, const uint8_t *enc, + size_t enc_len, const uint8_t *peer_public_key, + size_t peer_public_key_len); +}; + +struct evp_hpke_kdf_st { + uint16_t id; + // We only support HKDF-based KDFs. + const EVP_MD *(*hkdf_md_func)(); +}; + +struct evp_hpke_aead_st { + uint16_t id; + const EVP_AEAD *(*aead_func)(); +}; + + +// Low-level labeled KDF functions. 
+ +static const char kHpkeVersionId[] = "HPKE-v1"; + +static int add_label_string(CBB *cbb, const char *label) { + return CBB_add_bytes(cbb, (const uint8_t *)label, strlen(label)); +} + +static int hpke_labeled_extract(const EVP_MD *hkdf_md, uint8_t *out_key, + size_t *out_len, const uint8_t *salt, + size_t salt_len, const uint8_t *suite_id, + size_t suite_id_len, const char *label, + const uint8_t *ikm, size_t ikm_len) { + // labeledIKM = concat("HPKE-v1", suite_id, label, IKM) + CBB labeled_ikm; + int ok = CBB_init(&labeled_ikm, 0) && + add_label_string(&labeled_ikm, kHpkeVersionId) && + CBB_add_bytes(&labeled_ikm, suite_id, suite_id_len) && + add_label_string(&labeled_ikm, label) && + CBB_add_bytes(&labeled_ikm, ikm, ikm_len) && + HKDF_extract(out_key, out_len, hkdf_md, CBB_data(&labeled_ikm), + CBB_len(&labeled_ikm), salt, salt_len); + CBB_cleanup(&labeled_ikm); + return ok; +} + +static int hpke_labeled_expand(const EVP_MD *hkdf_md, uint8_t *out_key, + size_t out_len, const uint8_t *prk, + size_t prk_len, const uint8_t *suite_id, + size_t suite_id_len, const char *label, + const uint8_t *info, size_t info_len) { + // labeledInfo = concat(I2OSP(L, 2), "HPKE-v1", suite_id, label, info) + CBB labeled_info; + int ok = CBB_init(&labeled_info, 0) && // + CBB_add_u16(&labeled_info, out_len) && + add_label_string(&labeled_info, kHpkeVersionId) && + CBB_add_bytes(&labeled_info, suite_id, suite_id_len) && + add_label_string(&labeled_info, label) && + CBB_add_bytes(&labeled_info, info, info_len) && + HKDF_expand(out_key, out_len, hkdf_md, prk, prk_len, + CBB_data(&labeled_info), CBB_len(&labeled_info)); + CBB_cleanup(&labeled_info); + return ok; +} + +static void absorb_u16(BORINGSSL_keccak_st *ctx, uint16_t n) { + uint8_t bytes[2]; + CRYPTO_store_u16_be(bytes, n); + BORINGSSL_keccak_absorb(ctx, bytes, sizeof(bytes)); +} + +static void hpke_shake256_labeled_derive(Span out, + Span ikm, + Span suite_id, + std::string_view label, + Span context) { + // 
https://www.ietf.org/archive/id/draft-ietf-hpke-hpke-03.html#name-labeled-derivation-function + // https://www.ietf.org/archive/id/draft-ietf-hpke-pq-04.html#name-single-stage-kdfs + BORINGSSL_keccak_st ctx; + BORINGSSL_keccak_init(&ctx, boringssl_shake256); + BORINGSSL_keccak_absorb(&ctx, ikm.data(), ikm.size()); + auto hpke_version_id = StringAsBytes(kHpkeVersionId); + BORINGSSL_keccak_absorb(&ctx, hpke_version_id.data(), hpke_version_id.size()); + BORINGSSL_keccak_absorb(&ctx, suite_id.data(), suite_id.size()); + auto label_bytes = StringAsBytes(label); + assert(label_bytes.size() < 0xffff); + absorb_u16(&ctx, static_cast(label_bytes.size())); + BORINGSSL_keccak_absorb(&ctx, label_bytes.data(), label_bytes.size()); + assert(out.size() <= 0xffff); + absorb_u16(&ctx, static_cast(out.size())); + BORINGSSL_keccak_absorb(&ctx, context.data(), context.size()); + BORINGSSL_keccak_squeeze(&ctx, out.data(), out.size()); +} + + +// KEM implementations. + +static std::array hpke_kem_suite_id(uint16_t kem_id) { + // concat("KEM", I2OSP(kem_id, 2)) + return {'K', 'E', 'M', static_cast(kem_id >> 8), + static_cast(kem_id)}; +} + +// dhkem_extract_and_expand implements the ExtractAndExpand operation in the +// DHKEM construction. See section 4.1 of RFC 9180. 
+static int dhkem_extract_and_expand(uint16_t kem_id, const EVP_MD *hkdf_md, + uint8_t *out_key, size_t out_len, + const uint8_t *dh, size_t dh_len, + const uint8_t *kem_context, + size_t kem_context_len) { + auto suite_id = hpke_kem_suite_id(kem_id); + uint8_t prk[EVP_MAX_MD_SIZE]; + size_t prk_len; + return hpke_labeled_extract(hkdf_md, prk, &prk_len, nullptr, 0, + suite_id.data(), suite_id.size(), "eae_prk", dh, + dh_len) && + hpke_labeled_expand(hkdf_md, out_key, out_len, prk, prk_len, + suite_id.data(), suite_id.size(), "shared_secret", + kem_context, kem_context_len); +} + +static int x25519_init_key(EVP_HPKE_KEY *key, const uint8_t *priv_key, + size_t priv_key_len) { + if (priv_key_len != X25519_PRIVATE_KEY_LEN) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return 0; + } + + OPENSSL_memcpy(key->private_key, priv_key, priv_key_len); + X25519_public_from_private(key->public_key, priv_key); + return 1; +} + +static int x25519_generate_key(EVP_HPKE_KEY *key) { + X25519_keypair(key->public_key, key->private_key); + return 1; +} + +static int x25519_derive_key(EVP_HPKE_KEY *key, Span ikm) { + // https://www.rfc-editor.org/rfc/rfc9180.html#name-derivekeypair + auto suite_id = hpke_kem_suite_id(EVP_HPKE_DHKEM_X25519_HKDF_SHA256); + uint8_t dkp_prk[SHA256_DIGEST_LENGTH]; + size_t dkp_prk_len; + uint8_t sk[32]; + if (!hpke_labeled_extract(EVP_sha256(), dkp_prk, &dkp_prk_len, nullptr, 0, + suite_id.data(), suite_id.size(), "dkp_prk", + ikm.data(), ikm.size()) || + !hpke_labeled_expand(EVP_sha256(), sk, sizeof(sk), dkp_prk, dkp_prk_len, + suite_id.data(), suite_id.size(), "sk", + /*info=*/nullptr, /*info_len=*/0)) { + return 0; + } + return x25519_init_key(key, sk, sizeof(sk)); +} + +static int x25519_encap_with_seed( + const EVP_HPKE_KEM *kem, uint8_t *out_shared_secret, + size_t *out_shared_secret_len, uint8_t *out_enc, size_t *out_enc_len, + size_t max_enc, const uint8_t *peer_public_key, size_t peer_public_key_len, + const uint8_t *seed, size_t seed_len) { + if 
(max_enc < X25519_PUBLIC_VALUE_LEN) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_BUFFER_SIZE); + return 0; + } + if (seed_len != X25519_PRIVATE_KEY_LEN) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return 0; + } + X25519_public_from_private(out_enc, seed); + + uint8_t dh[X25519_SHARED_KEY_LEN]; + if (peer_public_key_len != X25519_PUBLIC_VALUE_LEN || + !X25519(dh, seed, peer_public_key)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_PEER_KEY); + return 0; + } + + uint8_t kem_context[2 * X25519_PUBLIC_VALUE_LEN]; + OPENSSL_memcpy(kem_context, out_enc, X25519_PUBLIC_VALUE_LEN); + OPENSSL_memcpy(kem_context + X25519_PUBLIC_VALUE_LEN, peer_public_key, + X25519_PUBLIC_VALUE_LEN); + if (!dhkem_extract_and_expand(kem->id, EVP_sha256(), out_shared_secret, + SHA256_DIGEST_LENGTH, dh, sizeof(dh), + kem_context, sizeof(kem_context))) { + return 0; + } + + *out_enc_len = X25519_PUBLIC_VALUE_LEN; + *out_shared_secret_len = SHA256_DIGEST_LENGTH; + return 1; +} + +static int x25519_decap(const EVP_HPKE_KEY *key, uint8_t *out_shared_secret, + size_t *out_shared_secret_len, const uint8_t *enc, + size_t enc_len) { + uint8_t dh[X25519_SHARED_KEY_LEN]; + if (enc_len != X25519_PUBLIC_VALUE_LEN || + !X25519(dh, key->private_key, enc)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_PEER_KEY); + return 0; + } + + uint8_t kem_context[2 * X25519_PUBLIC_VALUE_LEN]; + OPENSSL_memcpy(kem_context, enc, X25519_PUBLIC_VALUE_LEN); + OPENSSL_memcpy(kem_context + X25519_PUBLIC_VALUE_LEN, key->public_key, + X25519_PUBLIC_VALUE_LEN); + if (!dhkem_extract_and_expand(key->kem->id, EVP_sha256(), out_shared_secret, + SHA256_DIGEST_LENGTH, dh, sizeof(dh), + kem_context, sizeof(kem_context))) { + return 0; + } + + *out_shared_secret_len = SHA256_DIGEST_LENGTH; + return 1; +} + +static int x25519_auth_encap_with_seed( + const EVP_HPKE_KEY *key, uint8_t *out_shared_secret, + size_t *out_shared_secret_len, uint8_t *out_enc, size_t *out_enc_len, + size_t max_enc, const uint8_t *peer_public_key, size_t peer_public_key_len, 
+ const uint8_t *seed, size_t seed_len) { + if (max_enc < X25519_PUBLIC_VALUE_LEN) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_BUFFER_SIZE); + return 0; + } + if (seed_len != X25519_PRIVATE_KEY_LEN) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return 0; + } + X25519_public_from_private(out_enc, seed); + + uint8_t dh[2 * X25519_SHARED_KEY_LEN]; + if (peer_public_key_len != X25519_PUBLIC_VALUE_LEN || + !X25519(dh, seed, peer_public_key) || + !X25519(dh + X25519_SHARED_KEY_LEN, key->private_key, peer_public_key)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_PEER_KEY); + return 0; + } + + uint8_t kem_context[3 * X25519_PUBLIC_VALUE_LEN]; + OPENSSL_memcpy(kem_context, out_enc, X25519_PUBLIC_VALUE_LEN); + OPENSSL_memcpy(kem_context + X25519_PUBLIC_VALUE_LEN, peer_public_key, + X25519_PUBLIC_VALUE_LEN); + OPENSSL_memcpy(kem_context + 2 * X25519_PUBLIC_VALUE_LEN, key->public_key, + X25519_PUBLIC_VALUE_LEN); + if (!dhkem_extract_and_expand(key->kem->id, EVP_sha256(), out_shared_secret, + SHA256_DIGEST_LENGTH, dh, sizeof(dh), + kem_context, sizeof(kem_context))) { + return 0; + } + + *out_enc_len = X25519_PUBLIC_VALUE_LEN; + *out_shared_secret_len = SHA256_DIGEST_LENGTH; + return 1; +} + +static int x25519_auth_decap(const EVP_HPKE_KEY *key, + uint8_t *out_shared_secret, + size_t *out_shared_secret_len, const uint8_t *enc, + size_t enc_len, const uint8_t *peer_public_key, + size_t peer_public_key_len) { + uint8_t dh[2 * X25519_SHARED_KEY_LEN]; + if (enc_len != X25519_PUBLIC_VALUE_LEN || + peer_public_key_len != X25519_PUBLIC_VALUE_LEN || + !X25519(dh, key->private_key, enc) || + !X25519(dh + X25519_SHARED_KEY_LEN, key->private_key, peer_public_key)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_PEER_KEY); + return 0; + } + + uint8_t kem_context[3 * X25519_PUBLIC_VALUE_LEN]; + OPENSSL_memcpy(kem_context, enc, X25519_PUBLIC_VALUE_LEN); + OPENSSL_memcpy(kem_context + X25519_PUBLIC_VALUE_LEN, key->public_key, + X25519_PUBLIC_VALUE_LEN); + OPENSSL_memcpy(kem_context + 2 * 
X25519_PUBLIC_VALUE_LEN, peer_public_key, + X25519_PUBLIC_VALUE_LEN); + if (!dhkem_extract_and_expand(key->kem->id, EVP_sha256(), out_shared_secret, + SHA256_DIGEST_LENGTH, dh, sizeof(dh), + kem_context, sizeof(kem_context))) { + return 0; + } + + *out_shared_secret_len = SHA256_DIGEST_LENGTH; + return 1; +} + +const EVP_HPKE_KEM *EVP_hpke_x25519_hkdf_sha256() { + static const EVP_HPKE_KEM kKEM = { + /*id=*/EVP_HPKE_DHKEM_X25519_HKDF_SHA256, + /*public_key_len=*/X25519_PUBLIC_VALUE_LEN, + /*private_key_len=*/X25519_PRIVATE_KEY_LEN, + /*seed_len=*/X25519_PRIVATE_KEY_LEN, + /*enc_len=*/X25519_PUBLIC_VALUE_LEN, + x25519_init_key, + x25519_generate_key, + x25519_derive_key, + x25519_encap_with_seed, + x25519_decap, + x25519_auth_encap_with_seed, + x25519_auth_decap, + }; + return &kKEM; +} + +#define P256_PRIVATE_KEY_LEN 32 +#define P256_PUBLIC_KEY_LEN 65 +#define P256_PUBLIC_VALUE_LEN 65 +#define P256_SEED_LEN 32 +#define P256_SHARED_KEY_LEN 32 + +static int p256_public_from_private(uint8_t out_pub[P256_PUBLIC_VALUE_LEN], + const uint8_t priv[P256_PRIVATE_KEY_LEN]) { + const EC_GROUP *const group = EC_group_p256(); + const uint8_t kAllZeros[P256_PRIVATE_KEY_LEN] = {0}; + EC_SCALAR private_scalar; + EC_JACOBIAN public_point; + EC_AFFINE public_point_affine; + + if (CRYPTO_memcmp(kAllZeros, priv, sizeof(kAllZeros)) == 0) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return 0; + } + + if (!ec_scalar_from_bytes(group, &private_scalar, priv, + P256_PRIVATE_KEY_LEN) || + !ec_point_mul_scalar_base(group, &public_point, &private_scalar) || + !ec_jacobian_to_affine(group, &public_point_affine, &public_point)) { + return 0; + } + + size_t out_len_x, out_len_y; + out_pub[0] = POINT_CONVERSION_UNCOMPRESSED; + ec_felem_to_bytes(group, &out_pub[1], &out_len_x, &public_point_affine.X); + ec_felem_to_bytes(group, &out_pub[33], &out_len_y, &public_point_affine.Y); + return 1; +} + +static int p256_init_key(EVP_HPKE_KEY *key, const uint8_t *priv_key, + size_t priv_key_len) { + if 
(priv_key_len != P256_PRIVATE_KEY_LEN) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return 0; + } + + if (!p256_public_from_private(key->public_key, priv_key)) { + return 0; + } + + OPENSSL_memcpy(key->private_key, priv_key, priv_key_len); + return 1; +} + +static int p256_private_key_from_seed(uint8_t out_priv[P256_PRIVATE_KEY_LEN], + const uint8_t *seed, size_t seed_len) { + // https://www.rfc-editor.org/rfc/rfc9180.html#name-derivekeypair + auto suite_id = hpke_kem_suite_id(EVP_HPKE_DHKEM_P256_HKDF_SHA256); + uint8_t dkp_prk[32]; + size_t dkp_prk_len; + if (!hpke_labeled_extract(EVP_sha256(), dkp_prk, &dkp_prk_len, nullptr, 0, + suite_id.data(), suite_id.size(), "dkp_prk", seed, + seed_len)) { + return 0; + } + assert(dkp_prk_len == sizeof(dkp_prk)); + + const EC_GROUP *const group = EC_group_p256(); + EC_SCALAR private_scalar; + + for (unsigned counter = 0; counter < 256; counter++) { + const uint8_t counter_byte = counter & 0xff; + if (!hpke_labeled_expand(EVP_sha256(), out_priv, P256_PRIVATE_KEY_LEN, + dkp_prk, sizeof(dkp_prk), suite_id.data(), + suite_id.size(), "candidate", &counter_byte, + sizeof(counter_byte))) { + return 0; + } + + // |ec_scalar_from_bytes| checks that the scalar is less than the order. + if (ec_scalar_from_bytes(group, &private_scalar, out_priv, + P256_PRIVATE_KEY_LEN) && + !ec_scalar_is_zero(group, &private_scalar)) { + return 1; + } + } + + // This happens with probability of 2^-(32*256). 
+ OPENSSL_PUT_ERROR(EVP, ERR_R_INTERNAL_ERROR); + return 0; +} + +static int p256_derive_key(EVP_HPKE_KEY *key, Span ikm) { + if (!p256_private_key_from_seed(key->private_key, ikm.data(), ikm.size()) || + !p256_public_from_private(key->public_key, key->private_key)) { + return 0; + } + return 1; +} + +static int p256_generate_key(EVP_HPKE_KEY *key) { + uint8_t seed[P256_SEED_LEN]; + RAND_bytes(seed, sizeof(seed)); + return p256_derive_key(key, seed); +} + +static int p256(uint8_t out_dh[P256_SHARED_KEY_LEN], + const uint8_t my_private[P256_PRIVATE_KEY_LEN], + const uint8_t their_public[P256_PUBLIC_VALUE_LEN]) { + const EC_GROUP *const group = EC_group_p256(); + EC_SCALAR private_scalar; + EC_FELEM x, y; + EC_JACOBIAN shared_point, their_point; + EC_AFFINE their_point_affine, shared_point_affine; + + if (their_public[0] != POINT_CONVERSION_UNCOMPRESSED || + !ec_felem_from_bytes(group, &x, &their_public[1], 32) || + !ec_felem_from_bytes(group, &y, &their_public[33], 32) || + !ec_point_set_affine_coordinates(group, &their_point_affine, &x, &y) || + !ec_scalar_from_bytes(group, &private_scalar, my_private, + P256_PRIVATE_KEY_LEN)) { + OPENSSL_PUT_ERROR(EVP, ERR_R_INTERNAL_ERROR); + return 0; + } + + ec_affine_to_jacobian(group, &their_point, &their_point_affine); + if (!ec_point_mul_scalar(group, &shared_point, &their_point, + &private_scalar) || + !ec_jacobian_to_affine(group, &shared_point_affine, &shared_point)) { + OPENSSL_PUT_ERROR(EVP, ERR_R_INTERNAL_ERROR); + return 0; + } + + size_t out_len; + ec_felem_to_bytes(group, out_dh, &out_len, &shared_point_affine.X); + assert(out_len == P256_SHARED_KEY_LEN); + return 1; +} + +static int p256_encap_with_seed(const EVP_HPKE_KEM *kem, + uint8_t *out_shared_secret, + size_t *out_shared_secret_len, uint8_t *out_enc, + size_t *out_enc_len, size_t max_enc, + const uint8_t *peer_public_key, + size_t peer_public_key_len, const uint8_t *seed, + size_t seed_len) { + if (max_enc < P256_PUBLIC_VALUE_LEN) { + OPENSSL_PUT_ERROR(EVP, 
EVP_R_INVALID_BUFFER_SIZE); + return 0; + } + if (seed_len != P256_SEED_LEN) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return 0; + } + uint8_t private_key[P256_PRIVATE_KEY_LEN]; + if (!p256_private_key_from_seed(private_key, seed, seed_len) || + !p256_public_from_private(out_enc, private_key)) { + return 0; + } + + uint8_t dh[P256_SHARED_KEY_LEN]; + if (peer_public_key_len != P256_PUBLIC_VALUE_LEN || + !p256(dh, private_key, peer_public_key)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_PEER_KEY); + return 0; + } + + uint8_t kem_context[2 * P256_PUBLIC_VALUE_LEN]; + OPENSSL_memcpy(kem_context, out_enc, P256_PUBLIC_VALUE_LEN); + OPENSSL_memcpy(kem_context + P256_PUBLIC_VALUE_LEN, peer_public_key, + P256_PUBLIC_VALUE_LEN); + if (!dhkem_extract_and_expand(kem->id, EVP_sha256(), out_shared_secret, + SHA256_DIGEST_LENGTH, dh, sizeof(dh), + kem_context, sizeof(kem_context))) { + return 0; + } + + *out_enc_len = P256_PUBLIC_VALUE_LEN; + *out_shared_secret_len = SHA256_DIGEST_LENGTH; + return 1; +} + +static int p256_decap(const EVP_HPKE_KEY *key, uint8_t *out_shared_secret, + size_t *out_shared_secret_len, const uint8_t *enc, + size_t enc_len) { + uint8_t dh[P256_SHARED_KEY_LEN]; + if (enc_len != P256_PUBLIC_VALUE_LEN || // + !p256(dh, key->private_key, enc)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_PEER_KEY); + return 0; + } + + uint8_t kem_context[2 * P256_PUBLIC_VALUE_LEN]; + OPENSSL_memcpy(kem_context, enc, P256_PUBLIC_VALUE_LEN); + OPENSSL_memcpy(kem_context + P256_PUBLIC_VALUE_LEN, key->public_key, + P256_PUBLIC_VALUE_LEN); + if (!dhkem_extract_and_expand(key->kem->id, EVP_sha256(), out_shared_secret, + SHA256_DIGEST_LENGTH, dh, sizeof(dh), + kem_context, sizeof(kem_context))) { + return 0; + } + + *out_shared_secret_len = SHA256_DIGEST_LENGTH; + return 1; +} + +static int p256_auth_encap_with_seed( + const EVP_HPKE_KEY *key, uint8_t *out_shared_secret, + size_t *out_shared_secret_len, uint8_t *out_enc, size_t *out_enc_len, + size_t max_enc, const uint8_t 
*peer_public_key, size_t peer_public_key_len, + const uint8_t *seed, size_t seed_len) { + if (max_enc < P256_PUBLIC_VALUE_LEN) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_BUFFER_SIZE); + return 0; + } + if (seed_len != P256_SEED_LEN) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return 0; + } + uint8_t private_key[P256_PRIVATE_KEY_LEN]; + if (!p256_private_key_from_seed(private_key, seed, seed_len) || + !p256_public_from_private(out_enc, private_key)) { + return 0; + } + + uint8_t dh[2 * P256_SHARED_KEY_LEN]; + if (peer_public_key_len != P256_PUBLIC_VALUE_LEN || + !p256(dh, private_key, peer_public_key) || + !p256(dh + P256_SHARED_KEY_LEN, key->private_key, peer_public_key)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_PEER_KEY); + return 0; + } + + uint8_t kem_context[3 * P256_PUBLIC_VALUE_LEN]; + OPENSSL_memcpy(kem_context, out_enc, P256_PUBLIC_VALUE_LEN); + OPENSSL_memcpy(kem_context + P256_PUBLIC_VALUE_LEN, peer_public_key, + P256_PUBLIC_VALUE_LEN); + OPENSSL_memcpy(kem_context + 2 * P256_PUBLIC_VALUE_LEN, key->public_key, + P256_PUBLIC_VALUE_LEN); + if (!dhkem_extract_and_expand(key->kem->id, EVP_sha256(), out_shared_secret, + SHA256_DIGEST_LENGTH, dh, sizeof(dh), + kem_context, sizeof(kem_context))) { + return 0; + } + + *out_enc_len = P256_PUBLIC_VALUE_LEN; + *out_shared_secret_len = SHA256_DIGEST_LENGTH; + return 1; +} + +static int p256_auth_decap(const EVP_HPKE_KEY *key, uint8_t *out_shared_secret, + size_t *out_shared_secret_len, const uint8_t *enc, + size_t enc_len, const uint8_t *peer_public_key, + size_t peer_public_key_len) { + uint8_t dh[2 * P256_SHARED_KEY_LEN]; + if (enc_len != P256_PUBLIC_VALUE_LEN || + peer_public_key_len != P256_PUBLIC_VALUE_LEN || + !p256(dh, key->private_key, enc) || + !p256(dh + P256_SHARED_KEY_LEN, key->private_key, peer_public_key)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_PEER_KEY); + return 0; + } + + uint8_t kem_context[3 * P256_PUBLIC_VALUE_LEN]; + OPENSSL_memcpy(kem_context, enc, P256_PUBLIC_VALUE_LEN); + 
OPENSSL_memcpy(kem_context + P256_PUBLIC_VALUE_LEN, key->public_key, + P256_PUBLIC_VALUE_LEN); + OPENSSL_memcpy(kem_context + 2 * P256_PUBLIC_VALUE_LEN, peer_public_key, + P256_PUBLIC_VALUE_LEN); + if (!dhkem_extract_and_expand(key->kem->id, EVP_sha256(), out_shared_secret, + SHA256_DIGEST_LENGTH, dh, sizeof(dh), + kem_context, sizeof(kem_context))) { + return 0; + } + + *out_shared_secret_len = SHA256_DIGEST_LENGTH; + return 1; +} + +const EVP_HPKE_KEM *EVP_hpke_p256_hkdf_sha256() { + static const EVP_HPKE_KEM kKEM = { + /*id=*/EVP_HPKE_DHKEM_P256_HKDF_SHA256, + /*public_key_len=*/P256_PUBLIC_KEY_LEN, + /*private_key_len=*/P256_PRIVATE_KEY_LEN, + /*seed_len=*/P256_SEED_LEN, + /*enc_len=*/P256_PUBLIC_VALUE_LEN, + p256_init_key, + p256_generate_key, + p256_derive_key, + p256_encap_with_seed, + p256_decap, + p256_auth_encap_with_seed, + p256_auth_decap, + }; + return &kKEM; +} + +#define XWING_PRIVATE_KEY_LEN XWING_PRIVATE_KEY_BYTES +#define XWING_PUBLIC_KEY_LEN XWING_PUBLIC_KEY_BYTES +#define XWING_PUBLIC_VALUE_LEN XWING_CIPHERTEXT_BYTES +#define XWING_SEED_LEN 64 +#define XWING_SHARED_KEY_LEN XWING_SHARED_SECRET_BYTES + +static int xwing_init_key(EVP_HPKE_KEY *key, const uint8_t *priv_key, + size_t priv_key_len) { + CBS cbs; + CBS_init(&cbs, priv_key, priv_key_len); + XWING_private_key private_key; + if (!XWING_parse_private_key(&private_key, &cbs) || CBS_len(&cbs) != 0) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return 0; + } + + if (!XWING_public_from_private(key->public_key, &private_key)) { + return 0; + } + + if (priv_key_len > sizeof(key->private_key)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return 0; + } + OPENSSL_memcpy(key->private_key, priv_key, priv_key_len); + return 1; +} + +static int xwing_generate_key(EVP_HPKE_KEY *key) { + XWING_private_key private_key; + if (!XWING_generate_key(key->public_key, &private_key)) { + return 0; + } + + CBB cbb; + CBB_init_fixed(&cbb, key->private_key, XWING_PRIVATE_KEY_LEN); + if 
(!XWING_marshal_private_key(&cbb, &private_key) || + CBB_len(&cbb) != XWING_PRIVATE_KEY_LEN) { + return 0; + } + + return 1; +} + +static int xwing_derive_key(EVP_HPKE_KEY *key, Span ikm) { + uint8_t seed[32]; + hpke_shake256_labeled_derive(seed, ikm, hpke_kem_suite_id(EVP_HPKE_XWING), + "DeriveKeyPair", /*context=*/{}); + return xwing_init_key(key, seed, sizeof(seed)); +} + +static int xwing_encap_with_seed(const EVP_HPKE_KEM *kem, + uint8_t *out_shared_secret, + size_t *out_shared_secret_len, + uint8_t *out_enc, size_t *out_enc_len, + size_t max_enc, const uint8_t *peer_public_key, + size_t peer_public_key_len, + const uint8_t *seed, size_t seed_len) { + if (max_enc < XWING_PUBLIC_VALUE_LEN) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_BUFFER_SIZE); + return 0; + } + if (peer_public_key_len != XWING_PUBLIC_KEY_LEN || + seed_len != XWING_SEED_LEN) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return 0; + } + + if (!XWING_encap_external_entropy(out_enc, out_shared_secret, peer_public_key, + seed)) { + OPENSSL_PUT_ERROR(EVP, ERR_R_INTERNAL_ERROR); + return 0; + } + + *out_enc_len = XWING_PUBLIC_VALUE_LEN; + *out_shared_secret_len = XWING_SHARED_KEY_LEN; + return 1; +} + +static int xwing_decap(const EVP_HPKE_KEY *key, uint8_t *out_shared_secret, + size_t *out_shared_secret_len, const uint8_t *enc, + size_t enc_len) { + if (enc_len != XWING_PUBLIC_VALUE_LEN) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return 0; + } + + CBS cbs; + CBS_init(&cbs, key->private_key, XWING_PRIVATE_KEY_LEN); + XWING_private_key private_key; + if (!XWING_parse_private_key(&private_key, &cbs)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return 0; + } + + if (!XWING_decap(out_shared_secret, enc, &private_key)) { + OPENSSL_PUT_ERROR(EVP, ERR_R_INTERNAL_ERROR); + return 0; + } + + *out_shared_secret_len = XWING_SHARED_KEY_LEN; + return 1; +} + +const EVP_HPKE_KEM *EVP_hpke_xwing() { + static const EVP_HPKE_KEM kKEM = { + /*id=*/EVP_HPKE_XWING, + 
/*public_key_len=*/XWING_PUBLIC_KEY_LEN, + /*private_key_len=*/XWING_PRIVATE_KEY_LEN, + /*seed_len=*/XWING_SEED_LEN, + /*enc_len=*/XWING_PUBLIC_VALUE_LEN, + xwing_init_key, + xwing_generate_key, + xwing_derive_key, + xwing_encap_with_seed, + xwing_decap, + // X-Wing doesn't support authenticated encapsulation/decapsulation: + // https://datatracker.ietf.org/doc/html/draft-connolly-cfrg-xwing-kem-08#name-use-in-hpke + /* auth_encap_with_seed= */ nullptr, + /* auth_decap= */ nullptr, + }; + return &kKEM; +} + +namespace { + +template +struct MLKEMHPKE { + // These sizes are common across both ML-KEM-768 and ML-KEM-1024. + static constexpr size_t PRIVATE_KEY_LEN = MLKEM_SEED_BYTES; + static constexpr size_t SHARED_KEY_LEN = MLKEM_SHARED_SECRET_BYTES; + + static constexpr uint16_t ID = KEM_ID; + static constexpr size_t PUBLIC_KEY_LEN = PUBLIC_KEY_BYTES; + static constexpr size_t SEED_LEN = ENCAP_ENTROPY_BYTES; + static constexpr size_t ENC_LEN = CIPHERTEXT_BYTES; + + static int InitKey(EVP_HPKE_KEY *key, const uint8_t *priv_key, + size_t priv_key_len) { + PrivateKey expanded_private_key; + if (!PrivateKeyFromSeed(&expanded_private_key, priv_key, priv_key_len)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return 0; + } + PublicKey public_key; + PublicFromPrivate(&public_key, &expanded_private_key); + CBB cbb; + static_assert(sizeof(key->public_key) >= PUBLIC_KEY_BYTES, + "EVP_HPKE_KEY public_key is too small for ML-KEM."); + if (!CBB_init_fixed(&cbb, key->public_key, PUBLIC_KEY_BYTES) || + !MarshalPublicKey(&cbb, &public_key)) { + return 0; + } + + static_assert(sizeof(key->private_key) >= PRIVATE_KEY_LEN, + "EVP_HPKE_KEY private_key is too small for ML-KEM"); + OPENSSL_memcpy(key->private_key, priv_key, priv_key_len); + return 1; + } + + static int HpkeDeriveKey(EVP_HPKE_KEY *key, Span ikm) { + uint8_t seed[64]; + hpke_shake256_labeled_derive(seed, ikm, hpke_kem_suite_id(ID), + "DeriveKeyPair", + /*context=*/{}); + return InitKey(key, seed, sizeof(seed)); + } + + 
static int HpkeGenerateKey(EVP_HPKE_KEY *key) { + static_assert(sizeof(key->public_key) >= PUBLIC_KEY_BYTES, + "EVP_HPKE_KEY public_key is too small for ML-KEM."); + static_assert(sizeof(key->private_key) >= PRIVATE_KEY_LEN, + "EVP_HPKE_KEY private_key is too small for ML-KEM"); + PrivateKey expanded_private_key; + GenerateKey(key->public_key, key->private_key, &expanded_private_key); + + return 1; + } + + static int EncapWithSeed(const EVP_HPKE_KEM *kem, uint8_t *out_shared_secret, + size_t *out_shared_secret_len, uint8_t *out_enc, + size_t *out_enc_len, size_t max_enc, + const uint8_t *peer_public_key, + size_t peer_public_key_len, const uint8_t *seed, + size_t seed_len) { + if (max_enc < CIPHERTEXT_BYTES) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_BUFFER_SIZE); + return 0; + } + if (peer_public_key_len != PUBLIC_KEY_BYTES || + seed_len != ENCAP_ENTROPY_BYTES) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return 0; + } + + CBS cbs; + CBS_init(&cbs, peer_public_key, peer_public_key_len); + PublicKey public_key; + if (!ParsePublicKey(&public_key, &cbs)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return 0; + } + // The public ML-KEM interface doesn't support providing the encap entropy + // so the BCM function is used here. 
+ BCMEncapExternalEntropy(out_enc, out_shared_secret, &public_key, seed); + + *out_enc_len = CIPHERTEXT_BYTES; + *out_shared_secret_len = SHARED_KEY_LEN; + return 1; + } + + static int HpkeDecap(const EVP_HPKE_KEY *key, uint8_t *out_shared_secret, + size_t *out_shared_secret_len, const uint8_t *enc, + size_t enc_len) { + PrivateKey private_key; + if (!PrivateKeyFromSeed(&private_key, key->private_key, PRIVATE_KEY_LEN)) { + OPENSSL_PUT_ERROR(EVP, EVP_R_DECODE_ERROR); + return 0; + } + + if (!Decap(out_shared_secret, enc, enc_len, &private_key)) { + OPENSSL_PUT_ERROR(EVP, ERR_R_INTERNAL_ERROR); + return 0; + } + + *out_shared_secret_len = SHARED_KEY_LEN; + return 1; + } +}; + +using MLKEM768HPKE = + MLKEMHPKE; + +using MLKEM1024HPKE = + MLKEMHPKE; + +template +static const EVP_HPKE_KEM kMLKEM = { + /*id=*/MLKEM::ID, + /*public_key_len=*/MLKEM::PUBLIC_KEY_LEN, + /*private_key_len=*/MLKEM::PRIVATE_KEY_LEN, + /*seed_len=*/MLKEM::SEED_LEN, + /*enc_len=*/MLKEM::ENC_LEN, + MLKEM::InitKey, + MLKEM::HpkeGenerateKey, + MLKEM::HpkeDeriveKey, + MLKEM::EncapWithSeed, + MLKEM::HpkeDecap, + // ML-KEM doesn't support authenticated encapsulation/decapsulation: + // https://datatracker.ietf.org/doc/draft-ietf-hpke-pq/01/ + /*auth_encap_with_seed=*/nullptr, + /*auth_decap=*/nullptr, +}; + +} // namespace + +const EVP_HPKE_KEM *EVP_hpke_mlkem768() { return &kMLKEM; } +const EVP_HPKE_KEM *EVP_hpke_mlkem1024() { return &kMLKEM; } + +uint16_t EVP_HPKE_KEM_id(const EVP_HPKE_KEM *kem) { return kem->id; } + +size_t EVP_HPKE_KEM_public_key_len(const EVP_HPKE_KEM *kem) { + return kem->public_key_len; +} + +size_t EVP_HPKE_KEM_private_key_len(const EVP_HPKE_KEM *kem) { + return kem->private_key_len; +} + +size_t EVP_HPKE_KEM_enc_len(const EVP_HPKE_KEM *kem) { return kem->enc_len; } + +void EVP_HPKE_KEY_zero(EVP_HPKE_KEY *key) { + OPENSSL_memset(key, 0, sizeof(EVP_HPKE_KEY)); +} + +void EVP_HPKE_KEY_cleanup(EVP_HPKE_KEY *key) { + // Nothing to clean up for now, but we may introduce a cleanup 
process in the + // future. +} + +EVP_HPKE_KEY *EVP_HPKE_KEY_new() { + EVP_HPKE_KEY *key = New(); + if (key == nullptr) { + return nullptr; + } + EVP_HPKE_KEY_zero(key); + return key; +} + +void EVP_HPKE_KEY_free(EVP_HPKE_KEY *key) { + if (key != nullptr) { + EVP_HPKE_KEY_cleanup(key); + Delete(key); + } +} + +int EVP_HPKE_KEY_copy(EVP_HPKE_KEY *dst, const EVP_HPKE_KEY *src) { + // For now, |EVP_HPKE_KEY| is trivially copyable. + OPENSSL_memcpy(dst, src, sizeof(EVP_HPKE_KEY)); + return 1; +} + +void EVP_HPKE_KEY_move(EVP_HPKE_KEY *out, EVP_HPKE_KEY *in) { + EVP_HPKE_KEY_cleanup(out); + // For now, |EVP_HPKE_KEY| is trivially movable. + // Note that Rust may move this structure. See + // bssl-crypto/src/scoped.rs:EvpHpkeKey. + OPENSSL_memcpy(out, in, sizeof(EVP_HPKE_KEY)); + EVP_HPKE_KEY_zero(in); +} + +int EVP_HPKE_KEY_init(EVP_HPKE_KEY *key, const EVP_HPKE_KEM *kem, + const uint8_t *priv_key, size_t priv_key_len) { + EVP_HPKE_KEY_zero(key); + key->kem = kem; + if (!kem->init_key(key, priv_key, priv_key_len)) { + key->kem = nullptr; + return 0; + } + return 1; +} + +int EVP_HPKE_KEY_generate(EVP_HPKE_KEY *key, const EVP_HPKE_KEM *kem) { + EVP_HPKE_KEY_zero(key); + key->kem = kem; + if (!kem->generate_key(key)) { + key->kem = nullptr; + return 0; + } + return 1; +} + +int EVP_HPKE_KEY_derive(EVP_HPKE_KEY *key, const EVP_HPKE_KEM *kem, + const uint8_t *ikm, size_t ikm_len) { + EVP_HPKE_KEY_zero(key); + key->kem = kem; + if (!kem->derive_key(key, Span(ikm, ikm_len))) { + key->kem = nullptr; + return 0; + } + return 1; +} + +const EVP_HPKE_KEM *EVP_HPKE_KEY_kem(const EVP_HPKE_KEY *key) { + return key->kem; +} + +int EVP_HPKE_KEY_public_key(const EVP_HPKE_KEY *key, uint8_t *out, + size_t *out_len, size_t max_out) { + if (max_out < key->kem->public_key_len) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_BUFFER_SIZE); + return 0; + } + OPENSSL_memcpy(out, key->public_key, key->kem->public_key_len); + *out_len = key->kem->public_key_len; + return 1; +} + +int 
EVP_HPKE_KEY_private_key(const EVP_HPKE_KEY *key, uint8_t *out, + size_t *out_len, size_t max_out) { + if (max_out < key->kem->private_key_len) { + OPENSSL_PUT_ERROR(EVP, EVP_R_INVALID_BUFFER_SIZE); + return 0; + } + OPENSSL_memcpy(out, key->private_key, key->kem->private_key_len); + *out_len = key->kem->private_key_len; + return 1; +} + + +// Supported KDFs and AEADs. + +const EVP_HPKE_KDF *EVP_hpke_hkdf_sha256() { + static const EVP_HPKE_KDF kKDF = {EVP_HPKE_HKDF_SHA256, &EVP_sha256}; + return &kKDF; +} + +const EVP_HPKE_KDF *EVP_hpke_hkdf_sha384() { + static const EVP_HPKE_KDF kKDF = {EVP_HPKE_HKDF_SHA384, &EVP_sha384}; + return &kKDF; +} + +uint16_t EVP_HPKE_KDF_id(const EVP_HPKE_KDF *kdf) { return kdf->id; } + +const EVP_MD *EVP_HPKE_KDF_hkdf_md(const EVP_HPKE_KDF *kdf) { + return kdf->hkdf_md_func(); +} + +const EVP_HPKE_AEAD *EVP_hpke_aes_128_gcm() { + static const EVP_HPKE_AEAD kAEAD = {EVP_HPKE_AES_128_GCM, + &EVP_aead_aes_128_gcm}; + return &kAEAD; +} + +const EVP_HPKE_AEAD *EVP_hpke_aes_256_gcm() { + static const EVP_HPKE_AEAD kAEAD = {EVP_HPKE_AES_256_GCM, + &EVP_aead_aes_256_gcm}; + return &kAEAD; +} + +const EVP_HPKE_AEAD *EVP_hpke_chacha20_poly1305() { + static const EVP_HPKE_AEAD kAEAD = {EVP_HPKE_CHACHA20_POLY1305, + &EVP_aead_chacha20_poly1305}; + return &kAEAD; +} + +uint16_t EVP_HPKE_AEAD_id(const EVP_HPKE_AEAD *aead) { return aead->id; } + +const EVP_AEAD *EVP_HPKE_AEAD_aead(const EVP_HPKE_AEAD *aead) { + return aead->aead_func(); +} + + +// HPKE implementation. 
+ +static std::array hpke_full_suite_id(const EVP_HPKE_CTX *ctx) { + return { + 'H', + 'P', + 'K', + 'E', + static_cast(ctx->kem->id >> 8), + static_cast(ctx->kem->id), + static_cast(ctx->kdf->id >> 8), + static_cast(ctx->kdf->id), + static_cast(ctx->aead->id >> 8), + static_cast(ctx->aead->id), + }; +} + +#define HPKE_MODE_BASE 0 +#define HPKE_MODE_AUTH 2 + +static int hpke_key_schedule(EVP_HPKE_CTX *ctx, uint8_t mode, + const uint8_t *shared_secret, + size_t shared_secret_len, const uint8_t *info, + size_t info_len) { + auto suite_id = hpke_full_suite_id(ctx); + + // psk_id_hash = LabeledExtract("", "psk_id_hash", psk_id) + // TODO(davidben): Precompute this value and store it with the EVP_HPKE_KDF. + const EVP_MD *hkdf_md = ctx->kdf->hkdf_md_func(); + uint8_t psk_id_hash[EVP_MAX_MD_SIZE]; + size_t psk_id_hash_len; + if (!hpke_labeled_extract(hkdf_md, psk_id_hash, &psk_id_hash_len, nullptr, 0, + suite_id.data(), suite_id.size(), "psk_id_hash", + nullptr, 0)) { + return 0; + } + + // info_hash = LabeledExtract("", "info_hash", info) + uint8_t info_hash[EVP_MAX_MD_SIZE]; + size_t info_hash_len; + if (!hpke_labeled_extract(hkdf_md, info_hash, &info_hash_len, nullptr, 0, + suite_id.data(), suite_id.size(), "info_hash", info, + info_len)) { + return 0; + } + + // key_schedule_context = concat(mode, psk_id_hash, info_hash) + uint8_t context[sizeof(uint8_t) + 2 * EVP_MAX_MD_SIZE]; + size_t context_len; + CBB context_cbb; + CBB_init_fixed(&context_cbb, context, sizeof(context)); + if (!CBB_add_u8(&context_cbb, mode) || + !CBB_add_bytes(&context_cbb, psk_id_hash, psk_id_hash_len) || + !CBB_add_bytes(&context_cbb, info_hash, info_hash_len) || + !CBB_finish(&context_cbb, nullptr, &context_len)) { + return 0; + } + + // secret = LabeledExtract(shared_secret, "secret", psk) + uint8_t secret[EVP_MAX_MD_SIZE]; + size_t secret_len; + if (!hpke_labeled_extract(hkdf_md, secret, &secret_len, shared_secret, + shared_secret_len, suite_id.data(), suite_id.size(), + "secret", nullptr, 
0)) { + return 0; + } + + // key = LabeledExpand(secret, "key", key_schedule_context, Nk) + const EVP_AEAD *aead = EVP_HPKE_AEAD_aead(ctx->aead); + uint8_t key[EVP_AEAD_MAX_KEY_LENGTH]; + const size_t kKeyLen = EVP_AEAD_key_length(aead); + if (!hpke_labeled_expand(hkdf_md, key, kKeyLen, secret, secret_len, + suite_id.data(), suite_id.size(), "key", context, + context_len) || + !EVP_AEAD_CTX_init(&ctx->aead_ctx, aead, key, kKeyLen, + EVP_AEAD_DEFAULT_TAG_LENGTH, nullptr)) { + return 0; + } + + // base_nonce = LabeledExpand(secret, "base_nonce", key_schedule_context, Nn) + if (!hpke_labeled_expand(hkdf_md, ctx->base_nonce, + EVP_AEAD_nonce_length(aead), secret, secret_len, + suite_id.data(), suite_id.size(), "base_nonce", + context, context_len)) { + return 0; + } + + // exporter_secret = LabeledExpand(secret, "exp", key_schedule_context, Nh) + if (!hpke_labeled_expand(hkdf_md, ctx->exporter_secret, EVP_MD_size(hkdf_md), + secret, secret_len, suite_id.data(), suite_id.size(), + "exp", context, context_len)) { + return 0; + } + + return 1; +} + +void EVP_HPKE_CTX_zero(EVP_HPKE_CTX *ctx) { + OPENSSL_memset(ctx, 0, sizeof(EVP_HPKE_CTX)); + EVP_AEAD_CTX_zero(&ctx->aead_ctx); +} + +void EVP_HPKE_CTX_cleanup(EVP_HPKE_CTX *ctx) { + EVP_AEAD_CTX_cleanup(&ctx->aead_ctx); +} + +EVP_HPKE_CTX *EVP_HPKE_CTX_new() { + EVP_HPKE_CTX *ctx = New(); + if (ctx == nullptr) { + return nullptr; + } + EVP_HPKE_CTX_zero(ctx); + return ctx; +} + +void EVP_HPKE_CTX_free(EVP_HPKE_CTX *ctx) { + if (ctx != nullptr) { + EVP_HPKE_CTX_cleanup(ctx); + Delete(ctx); + } +} + +int EVP_HPKE_CTX_setup_sender(EVP_HPKE_CTX *ctx, uint8_t *out_enc, + size_t *out_enc_len, size_t max_enc, + const EVP_HPKE_KEM *kem, const EVP_HPKE_KDF *kdf, + const EVP_HPKE_AEAD *aead, + const uint8_t *peer_public_key, + size_t peer_public_key_len, const uint8_t *info, + size_t info_len) { + uint8_t seed[MAX_SEED_LEN]; + RAND_bytes(seed, kem->seed_len); + return EVP_HPKE_CTX_setup_sender_with_seed_for_testing( + ctx, out_enc, 
out_enc_len, max_enc, kem, kdf, aead, peer_public_key, + peer_public_key_len, info, info_len, seed, kem->seed_len); +} + +int EVP_HPKE_CTX_setup_sender_with_seed_for_testing( + EVP_HPKE_CTX *ctx, uint8_t *out_enc, size_t *out_enc_len, size_t max_enc, + const EVP_HPKE_KEM *kem, const EVP_HPKE_KDF *kdf, const EVP_HPKE_AEAD *aead, + const uint8_t *peer_public_key, size_t peer_public_key_len, + const uint8_t *info, size_t info_len, const uint8_t *seed, + size_t seed_len) { + EVP_HPKE_CTX_zero(ctx); + ctx->is_sender = 1; + ctx->kem = kem; + ctx->kdf = kdf; + ctx->aead = aead; + uint8_t shared_secret[MAX_SHARED_SECRET_LEN]; + size_t shared_secret_len; + if (!kem->encap_with_seed(kem, shared_secret, &shared_secret_len, out_enc, + out_enc_len, max_enc, peer_public_key, + peer_public_key_len, seed, seed_len) || + !hpke_key_schedule(ctx, HPKE_MODE_BASE, shared_secret, shared_secret_len, + info, info_len)) { + EVP_HPKE_CTX_cleanup(ctx); + return 0; + } + return 1; +} + +int EVP_HPKE_CTX_setup_recipient(EVP_HPKE_CTX *ctx, const EVP_HPKE_KEY *key, + const EVP_HPKE_KDF *kdf, + const EVP_HPKE_AEAD *aead, const uint8_t *enc, + size_t enc_len, const uint8_t *info, + size_t info_len) { + EVP_HPKE_CTX_zero(ctx); + ctx->is_sender = 0; + ctx->kem = key->kem; + ctx->kdf = kdf; + ctx->aead = aead; + uint8_t shared_secret[MAX_SHARED_SECRET_LEN]; + size_t shared_secret_len; + if (!key->kem->decap(key, shared_secret, &shared_secret_len, enc, enc_len) || + !hpke_key_schedule(ctx, HPKE_MODE_BASE, shared_secret, shared_secret_len, + info, info_len)) { + EVP_HPKE_CTX_cleanup(ctx); + return 0; + } + return 1; +} + + +int EVP_HPKE_CTX_setup_auth_sender( + EVP_HPKE_CTX *ctx, uint8_t *out_enc, size_t *out_enc_len, size_t max_enc, + const EVP_HPKE_KEY *key, const EVP_HPKE_KDF *kdf, const EVP_HPKE_AEAD *aead, + const uint8_t *peer_public_key, size_t peer_public_key_len, + const uint8_t *info, size_t info_len) { + uint8_t seed[MAX_SEED_LEN]; + RAND_bytes(seed, key->kem->seed_len); + return 
EVP_HPKE_CTX_setup_auth_sender_with_seed_for_testing( + ctx, out_enc, out_enc_len, max_enc, key, kdf, aead, peer_public_key, + peer_public_key_len, info, info_len, seed, key->kem->seed_len); +} + +int EVP_HPKE_CTX_setup_auth_sender_with_seed_for_testing( + EVP_HPKE_CTX *ctx, uint8_t *out_enc, size_t *out_enc_len, size_t max_enc, + const EVP_HPKE_KEY *key, const EVP_HPKE_KDF *kdf, const EVP_HPKE_AEAD *aead, + const uint8_t *peer_public_key, size_t peer_public_key_len, + const uint8_t *info, size_t info_len, const uint8_t *seed, + size_t seed_len) { + if (key->kem->auth_encap_with_seed == nullptr) { + // Not all HPKE KEMs support AuthEncap. + OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); + return 0; + } + + EVP_HPKE_CTX_zero(ctx); + ctx->is_sender = 1; + ctx->kem = key->kem; + ctx->kdf = kdf; + ctx->aead = aead; + uint8_t shared_secret[MAX_SHARED_SECRET_LEN]; + size_t shared_secret_len; + if (!key->kem->auth_encap_with_seed( + key, shared_secret, &shared_secret_len, out_enc, out_enc_len, max_enc, + peer_public_key, peer_public_key_len, seed, seed_len) || + !hpke_key_schedule(ctx, HPKE_MODE_AUTH, shared_secret, shared_secret_len, + info, info_len)) { + EVP_HPKE_CTX_cleanup(ctx); + return 0; + } + return 1; +} + +int EVP_HPKE_CTX_setup_auth_recipient( + EVP_HPKE_CTX *ctx, const EVP_HPKE_KEY *key, const EVP_HPKE_KDF *kdf, + const EVP_HPKE_AEAD *aead, const uint8_t *enc, size_t enc_len, + const uint8_t *info, size_t info_len, const uint8_t *peer_public_key, + size_t peer_public_key_len) { + if (key->kem->auth_decap == nullptr) { + // Not all HPKE KEMs support AuthDecap. 
+ OPENSSL_PUT_ERROR(EVP, EVP_R_OPERATION_NOT_SUPPORTED_FOR_THIS_KEYTYPE); + return 0; + } + + EVP_HPKE_CTX_zero(ctx); + ctx->is_sender = 0; + ctx->kem = key->kem; + ctx->kdf = kdf; + ctx->aead = aead; + uint8_t shared_secret[MAX_SHARED_SECRET_LEN]; + size_t shared_secret_len; + if (!key->kem->auth_decap(key, shared_secret, &shared_secret_len, enc, + enc_len, peer_public_key, peer_public_key_len) || + !hpke_key_schedule(ctx, HPKE_MODE_AUTH, shared_secret, shared_secret_len, + info, info_len)) { + EVP_HPKE_CTX_cleanup(ctx); + return 0; + } + return 1; +} + +static void hpke_nonce(const EVP_HPKE_CTX *ctx, uint8_t *out_nonce, + size_t nonce_len) { + assert(nonce_len >= 8); + + // Write padded big-endian bytes of |ctx->seq| to |out_nonce|. + OPENSSL_memset(out_nonce, 0, nonce_len); + uint64_t seq_copy = ctx->seq; + for (size_t i = 0; i < 8; i++) { + out_nonce[nonce_len - i - 1] = seq_copy & 0xff; + seq_copy >>= 8; + } + + // XOR the encoded sequence with the |ctx->base_nonce|. + for (size_t i = 0; i < nonce_len; i++) { + out_nonce[i] ^= ctx->base_nonce[i]; + } +} + +int EVP_HPKE_CTX_open(EVP_HPKE_CTX *ctx, uint8_t *out, size_t *out_len, + size_t max_out_len, const uint8_t *in, size_t in_len, + const uint8_t *ad, size_t ad_len) { + if (ctx->is_sender) { + OPENSSL_PUT_ERROR(EVP, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); + return 0; + } + if (ctx->seq == UINT64_MAX) { + OPENSSL_PUT_ERROR(EVP, ERR_R_OVERFLOW); + return 0; + } + + uint8_t nonce[EVP_AEAD_MAX_NONCE_LENGTH]; + const size_t nonce_len = EVP_AEAD_nonce_length(ctx->aead_ctx.aead); + hpke_nonce(ctx, nonce, nonce_len); + + if (!EVP_AEAD_CTX_open(&ctx->aead_ctx, out, out_len, max_out_len, nonce, + nonce_len, in, in_len, ad, ad_len)) { + return 0; + } + ctx->seq++; + return 1; +} + +int EVP_HPKE_CTX_seal(EVP_HPKE_CTX *ctx, uint8_t *out, size_t *out_len, + size_t max_out_len, const uint8_t *in, size_t in_len, + const uint8_t *ad, size_t ad_len) { + if (!ctx->is_sender) { + OPENSSL_PUT_ERROR(EVP, 
ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); + return 0; + } + if (ctx->seq == UINT64_MAX) { + OPENSSL_PUT_ERROR(EVP, ERR_R_OVERFLOW); + return 0; + } + + uint8_t nonce[EVP_AEAD_MAX_NONCE_LENGTH]; + const size_t nonce_len = EVP_AEAD_nonce_length(ctx->aead_ctx.aead); + hpke_nonce(ctx, nonce, nonce_len); + + if (!EVP_AEAD_CTX_seal(&ctx->aead_ctx, out, out_len, max_out_len, nonce, + nonce_len, in, in_len, ad, ad_len)) { + return 0; + } + ctx->seq++; + return 1; +} + +int EVP_HPKE_CTX_export(const EVP_HPKE_CTX *ctx, uint8_t *out, + size_t secret_len, const uint8_t *context, + size_t context_len) { + auto suite_id = hpke_full_suite_id(ctx); + const EVP_MD *hkdf_md = ctx->kdf->hkdf_md_func(); + if (!hpke_labeled_expand(hkdf_md, out, secret_len, ctx->exporter_secret, + EVP_MD_size(hkdf_md), suite_id.data(), + suite_id.size(), "sec", context, context_len)) { + return 0; + } + return 1; +} + +size_t EVP_HPKE_CTX_max_overhead(const EVP_HPKE_CTX *ctx) { + assert(ctx->is_sender); + return EVP_AEAD_max_overhead(EVP_AEAD_CTX_aead(&ctx->aead_ctx)); +} + +const EVP_HPKE_KEM *EVP_HPKE_CTX_kem(const EVP_HPKE_CTX *ctx) { + return ctx->kem; +} + +const EVP_HPKE_AEAD *EVP_HPKE_CTX_aead(const EVP_HPKE_CTX *ctx) { + return ctx->aead; +} + +const EVP_HPKE_KDF *EVP_HPKE_CTX_kdf(const EVP_HPKE_CTX *ctx) { + return ctx->kdf; +} diff --git a/third_party/boringssl/src/crypto/hrss/hrss.c b/third_party/boringssl/src/crypto/hrss/hrss.c deleted file mode 100644 index 572e9817..00000000 --- a/third_party/boringssl/src/crypto/hrss/hrss.c +++ /dev/null @@ -1,2234 +0,0 @@ -/* Copyright (c) 2018, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include - -#include -#include -#include - -#include -#include -#include -#include -#include - -#if defined(_MSC_VER) -#define RESTRICT -#else -#define RESTRICT restrict -#endif - -#include "../internal.h" -#include "internal.h" - -#if defined(OPENSSL_SSE2) -#include -#endif - -#if (defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) && defined(__ARM_NEON) -#include -#endif - -// This is an implementation of [HRSS], but with a KEM transformation based on -// [SXY]. The primary references are: - -// HRSS: https://eprint.iacr.org/2017/667.pdf -// HRSSNIST: -// https://csrc.nist.gov/CSRC/media/Projects/Post-Quantum-Cryptography/documents/round-1/submissions/NTRU_HRSS_KEM.zip -// SXY: https://eprint.iacr.org/2017/1005.pdf -// NTRUTN14: -// https://assets.onboardsecurity.com/static/downloads/NTRU/resources/NTRUTech014.pdf -// NTRUCOMP: https://eprint.iacr.org/2018/1174 -// SAFEGCD: https://gcd.cr.yp.to/papers.html#safegcd - - -// Vector operations. -// -// A couple of functions in this file can use vector operations to meaningful -// effect. If we're building for a target that has a supported vector unit, -// |HRSS_HAVE_VECTOR_UNIT| will be defined and |vec_t| will be typedefed to a -// 128-bit vector. The following functions abstract over the differences between -// NEON and SSE2 for implementing some vector operations. - -// TODO: MSVC can likely also be made to work with vector operations, but ^ must -// be replaced with _mm_xor_si128, etc. 
-#if defined(OPENSSL_SSE2) && (defined(__clang__) || !defined(_MSC_VER)) - -#define HRSS_HAVE_VECTOR_UNIT -typedef __m128i vec_t; - -// vec_capable returns one iff the current platform supports SSE2. -static int vec_capable(void) { return 1; } - -// vec_add performs a pair-wise addition of four uint16s from |a| and |b|. -static inline vec_t vec_add(vec_t a, vec_t b) { return _mm_add_epi16(a, b); } - -// vec_sub performs a pair-wise subtraction of four uint16s from |a| and |b|. -static inline vec_t vec_sub(vec_t a, vec_t b) { return _mm_sub_epi16(a, b); } - -// vec_mul multiplies each uint16_t in |a| by |b| and returns the resulting -// vector. -static inline vec_t vec_mul(vec_t a, uint16_t b) { - return _mm_mullo_epi16(a, _mm_set1_epi16(b)); -} - -// vec_fma multiplies each uint16_t in |b| by |c|, adds the result to |a|, and -// returns the resulting vector. -static inline vec_t vec_fma(vec_t a, vec_t b, uint16_t c) { - return _mm_add_epi16(a, _mm_mullo_epi16(b, _mm_set1_epi16(c))); -} - -// vec3_rshift_word right-shifts the 24 uint16_t's in |v| by one uint16. -static inline void vec3_rshift_word(vec_t v[3]) { - // Intel's left and right shifting is backwards compared to the order in - // memory because they're based on little-endian order of words (and not just - // bytes). So the shifts in this function will be backwards from what one - // might expect. - const __m128i carry0 = _mm_srli_si128(v[0], 14); - v[0] = _mm_slli_si128(v[0], 2); - - const __m128i carry1 = _mm_srli_si128(v[1], 14); - v[1] = _mm_slli_si128(v[1], 2); - v[1] |= carry0; - - v[2] = _mm_slli_si128(v[2], 2); - v[2] |= carry1; -} - -// vec4_rshift_word right-shifts the 32 uint16_t's in |v| by one uint16. -static inline void vec4_rshift_word(vec_t v[4]) { - // Intel's left and right shifting is backwards compared to the order in - // memory because they're based on little-endian order of words (and not just - // bytes). 
So the shifts in this function will be backwards from what one - // might expect. - const __m128i carry0 = _mm_srli_si128(v[0], 14); - v[0] = _mm_slli_si128(v[0], 2); - - const __m128i carry1 = _mm_srli_si128(v[1], 14); - v[1] = _mm_slli_si128(v[1], 2); - v[1] |= carry0; - - const __m128i carry2 = _mm_srli_si128(v[2], 14); - v[2] = _mm_slli_si128(v[2], 2); - v[2] |= carry1; - - v[3] = _mm_slli_si128(v[3], 2); - v[3] |= carry2; -} - -// vec_merge_3_5 takes the final three uint16_t's from |left|, appends the first -// five from |right|, and returns the resulting vector. -static inline vec_t vec_merge_3_5(vec_t left, vec_t right) { - return _mm_srli_si128(left, 10) | _mm_slli_si128(right, 6); -} - -// poly3_vec_lshift1 left-shifts the 768 bits in |a_s|, and in |a_a|, by one -// bit. -static inline void poly3_vec_lshift1(vec_t a_s[6], vec_t a_a[6]) { - vec_t carry_s = {0}; - vec_t carry_a = {0}; - - for (int i = 0; i < 6; i++) { - vec_t next_carry_s = _mm_srli_epi64(a_s[i], 63); - a_s[i] = _mm_slli_epi64(a_s[i], 1); - a_s[i] |= _mm_slli_si128(next_carry_s, 8); - a_s[i] |= carry_s; - carry_s = _mm_srli_si128(next_carry_s, 8); - - vec_t next_carry_a = _mm_srli_epi64(a_a[i], 63); - a_a[i] = _mm_slli_epi64(a_a[i], 1); - a_a[i] |= _mm_slli_si128(next_carry_a, 8); - a_a[i] |= carry_a; - carry_a = _mm_srli_si128(next_carry_a, 8); - } -} - -// poly3_vec_rshift1 right-shifts the 768 bits in |a_s|, and in |a_a|, by one -// bit. 
-static inline void poly3_vec_rshift1(vec_t a_s[6], vec_t a_a[6]) { - vec_t carry_s = {0}; - vec_t carry_a = {0}; - - for (int i = 5; i >= 0; i--) { - const vec_t next_carry_s = _mm_slli_epi64(a_s[i], 63); - a_s[i] = _mm_srli_epi64(a_s[i], 1); - a_s[i] |= _mm_srli_si128(next_carry_s, 8); - a_s[i] |= carry_s; - carry_s = _mm_slli_si128(next_carry_s, 8); - - const vec_t next_carry_a = _mm_slli_epi64(a_a[i], 63); - a_a[i] = _mm_srli_epi64(a_a[i], 1); - a_a[i] |= _mm_srli_si128(next_carry_a, 8); - a_a[i] |= carry_a; - carry_a = _mm_slli_si128(next_carry_a, 8); - } -} - -// vec_broadcast_bit duplicates the least-significant bit in |a| to all bits in -// a vector and returns the result. -static inline vec_t vec_broadcast_bit(vec_t a) { - return _mm_shuffle_epi32(_mm_srai_epi32(_mm_slli_epi64(a, 63), 31), - 0b01010101); -} - -// vec_get_word returns the |i|th uint16_t in |v|. (This is a macro because the -// compiler requires that |i| be a compile-time constant.) -#define vec_get_word(v, i) _mm_extract_epi16(v, i) - -#elif (defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) && defined(__ARM_NEON) - -#define HRSS_HAVE_VECTOR_UNIT -typedef uint16x8_t vec_t; - -// These functions perform the same actions as the SSE2 function of the same -// name, above. 
- -static int vec_capable(void) { return CRYPTO_is_NEON_capable(); } - -static inline vec_t vec_add(vec_t a, vec_t b) { return a + b; } - -static inline vec_t vec_sub(vec_t a, vec_t b) { return a - b; } - -static inline vec_t vec_mul(vec_t a, uint16_t b) { return vmulq_n_u16(a, b); } - -static inline vec_t vec_fma(vec_t a, vec_t b, uint16_t c) { - return vmlaq_n_u16(a, b, c); -} - -static inline void vec3_rshift_word(vec_t v[3]) { - const uint16x8_t kZero = {0}; - v[2] = vextq_u16(v[1], v[2], 7); - v[1] = vextq_u16(v[0], v[1], 7); - v[0] = vextq_u16(kZero, v[0], 7); -} - -static inline void vec4_rshift_word(vec_t v[4]) { - const uint16x8_t kZero = {0}; - v[3] = vextq_u16(v[2], v[3], 7); - v[2] = vextq_u16(v[1], v[2], 7); - v[1] = vextq_u16(v[0], v[1], 7); - v[0] = vextq_u16(kZero, v[0], 7); -} - -static inline vec_t vec_merge_3_5(vec_t left, vec_t right) { - return vextq_u16(left, right, 5); -} - -static inline uint16_t vec_get_word(vec_t v, unsigned i) { - return v[i]; -} - -#if !defined(OPENSSL_AARCH64) - -static inline vec_t vec_broadcast_bit(vec_t a) { - a = (vec_t)vshrq_n_s16(((int16x8_t)a) << 15, 15); - return vdupq_lane_u16(vget_low_u16(a), 0); -} - -static inline void poly3_vec_lshift1(vec_t a_s[6], vec_t a_a[6]) { - vec_t carry_s = {0}; - vec_t carry_a = {0}; - const vec_t kZero = {0}; - - for (int i = 0; i < 6; i++) { - vec_t next_carry_s = a_s[i] >> 15; - a_s[i] <<= 1; - a_s[i] |= vextq_u16(kZero, next_carry_s, 7); - a_s[i] |= carry_s; - carry_s = vextq_u16(next_carry_s, kZero, 7); - - vec_t next_carry_a = a_a[i] >> 15; - a_a[i] <<= 1; - a_a[i] |= vextq_u16(kZero, next_carry_a, 7); - a_a[i] |= carry_a; - carry_a = vextq_u16(next_carry_a, kZero, 7); - } -} - -static inline void poly3_vec_rshift1(vec_t a_s[6], vec_t a_a[6]) { - vec_t carry_s = {0}; - vec_t carry_a = {0}; - const vec_t kZero = {0}; - - for (int i = 5; i >= 0; i--) { - vec_t next_carry_s = a_s[i] << 15; - a_s[i] >>= 1; - a_s[i] |= vextq_u16(next_carry_s, kZero, 1); - a_s[i] |= carry_s; - 
carry_s = vextq_u16(kZero, next_carry_s, 1); - - vec_t next_carry_a = a_a[i] << 15; - a_a[i] >>= 1; - a_a[i] |= vextq_u16(next_carry_a, kZero, 1); - a_a[i] |= carry_a; - carry_a = vextq_u16(kZero, next_carry_a, 1); - } -} - -#endif // !OPENSSL_AARCH64 - -#endif // (ARM || AARCH64) && NEON - -// Polynomials in this scheme have N terms. -// #define N 701 - -// Underlying data types and arithmetic operations. -// ------------------------------------------------ - -// Binary polynomials. - -// poly2 represents a degree-N polynomial over GF(2). The words are in little- -// endian order, i.e. the coefficient of x^0 is the LSB of the first word. The -// final word is only partially used since N is not a multiple of the word size. - -// Defined in internal.h: -// struct poly2 { -// crypto_word_t v[WORDS_PER_POLY]; -// }; - -OPENSSL_UNUSED static void hexdump(const void *void_in, size_t len) { - const uint8_t *in = (const uint8_t *)void_in; - for (size_t i = 0; i < len; i++) { - printf("%02x", in[i]); - } - printf("\n"); -} - -static void poly2_zero(struct poly2 *p) { - OPENSSL_memset(&p->v[0], 0, sizeof(crypto_word_t) * WORDS_PER_POLY); -} - -// word_reverse returns |in| with the bits in reverse order. -static crypto_word_t word_reverse(crypto_word_t in) { -#if defined(OPENSSL_64_BIT) - static const crypto_word_t kMasks[6] = { - UINT64_C(0x5555555555555555), - UINT64_C(0x3333333333333333), - UINT64_C(0x0f0f0f0f0f0f0f0f), - UINT64_C(0x00ff00ff00ff00ff), - UINT64_C(0x0000ffff0000ffff), - UINT64_C(0x00000000ffffffff), - }; -#else - static const crypto_word_t kMasks[5] = { - 0x55555555, - 0x33333333, - 0x0f0f0f0f, - 0x00ff00ff, - 0x0000ffff, - }; -#endif - - for (size_t i = 0; i < OPENSSL_ARRAY_SIZE(kMasks); i++) { - in = ((in >> (1 << i)) & kMasks[i]) | ((in & kMasks[i]) << (1 << i)); - } - - return in; -} - -// lsb_to_all replicates the least-significant bit of |v| to all bits of the -// word. This is used in bit-slicing operations to make a vector from a fixed -// value. 
-static crypto_word_t lsb_to_all(crypto_word_t v) { return 0u - (v & 1); } - -// poly2_mod_phiN reduces |p| by Φ(N). -static void poly2_mod_phiN(struct poly2 *p) { - // m is the term at x^700, replicated to every bit. - const crypto_word_t m = - lsb_to_all(p->v[WORDS_PER_POLY - 1] >> (BITS_IN_LAST_WORD - 1)); - for (size_t i = 0; i < WORDS_PER_POLY; i++) { - p->v[i] ^= m; - } - p->v[WORDS_PER_POLY - 1] &= (UINT64_C(1) << (BITS_IN_LAST_WORD - 1)) - 1; -} - -// poly2_reverse_700 reverses the order of the first 700 bits of |in| and writes -// the result to |out|. -static void poly2_reverse_700(struct poly2 *out, const struct poly2 *in) { - struct poly2 t; - for (size_t i = 0; i < WORDS_PER_POLY; i++) { - t.v[i] = word_reverse(in->v[i]); - } - - static const size_t shift = BITS_PER_WORD - ((N-1) % BITS_PER_WORD); - for (size_t i = 0; i < WORDS_PER_POLY-1; i++) { - out->v[i] = t.v[WORDS_PER_POLY-1-i] >> shift; - out->v[i] |= t.v[WORDS_PER_POLY-2-i] << (BITS_PER_WORD - shift); - } - out->v[WORDS_PER_POLY-1] = t.v[0] >> shift; -} - -// poly2_cswap exchanges the values of |a| and |b| if |swap| is all ones. -static void poly2_cswap(struct poly2 *a, struct poly2 *b, crypto_word_t swap) { - for (size_t i = 0; i < WORDS_PER_POLY; i++) { - const crypto_word_t sum = swap & (a->v[i] ^ b->v[i]); - a->v[i] ^= sum; - b->v[i] ^= sum; - } -} - -// poly2_fmadd sets |out| to |out| + |in| * m, where m is either -// |CONSTTIME_TRUE_W| or |CONSTTIME_FALSE_W|. -static void poly2_fmadd(struct poly2 *out, const struct poly2 *in, - crypto_word_t m) { - for (size_t i = 0; i < WORDS_PER_POLY; i++) { - out->v[i] ^= in->v[i] & m; - } -} - -// poly2_lshift1 left-shifts |p| by one bit. -static void poly2_lshift1(struct poly2 *p) { - crypto_word_t carry = 0; - for (size_t i = 0; i < WORDS_PER_POLY; i++) { - const crypto_word_t next_carry = p->v[i] >> (BITS_PER_WORD - 1); - p->v[i] <<= 1; - p->v[i] |= carry; - carry = next_carry; - } -} - -// poly2_rshift1 right-shifts |p| by one bit. 
-static void poly2_rshift1(struct poly2 *p) { - crypto_word_t carry = 0; - for (size_t i = WORDS_PER_POLY - 1; i < WORDS_PER_POLY; i--) { - const crypto_word_t next_carry = p->v[i] & 1; - p->v[i] >>= 1; - p->v[i] |= carry << (BITS_PER_WORD - 1); - carry = next_carry; - } -} - -// poly2_clear_top_bits clears the bits in the final word that are only for -// alignment. -static void poly2_clear_top_bits(struct poly2 *p) { - p->v[WORDS_PER_POLY - 1] &= (UINT64_C(1) << BITS_IN_LAST_WORD) - 1; -} - -// poly2_top_bits_are_clear returns one iff the extra bits in the final words of -// |p| are zero. -static int poly2_top_bits_are_clear(const struct poly2 *p) { - return (p->v[WORDS_PER_POLY - 1] & - ~((UINT64_C(1) << BITS_IN_LAST_WORD) - 1)) == 0; -} - -// Ternary polynomials. - -// poly3 represents a degree-N polynomial over GF(3). Each coefficient is -// bitsliced across the |s| and |a| arrays, like this: -// -// s | a | value -// ----------------- -// 0 | 0 | 0 -// 0 | 1 | 1 -// 1 | 1 | -1 (aka 2) -// 1 | 0 | -// -// ('s' is for sign, and 'a' is the absolute value.) -// -// Once bitsliced as such, the following circuits can be used to implement -// addition and multiplication mod 3: -// -// (s3, a3) = (s1, a1) × (s2, a2) -// a3 = a1 ∧ a2 -// s3 = (s1 ⊕ s2) ∧ a3 -// -// (s3, a3) = (s1, a1) + (s2, a2) -// t = s1 ⊕ a2 -// s3 = t ∧ (s2 ⊕ a1) -// a3 = (a1 ⊕ a2) ∨ (t ⊕ s2) -// -// (s3, a3) = (s1, a1) - (s2, a2) -// t = a1 ⊕ a2 -// s3 = (s1 ⊕ a2) ∧ (t ⊕ s2) -// a3 = t ∨ (s1 ⊕ s2) -// -// Negating a value just involves XORing s by a. 
-// -// struct poly3 { -// struct poly2 s, a; -// }; - -OPENSSL_UNUSED static void poly3_print(const struct poly3 *in) { - struct poly3 p; - OPENSSL_memcpy(&p, in, sizeof(p)); - p.s.v[WORDS_PER_POLY - 1] &= ((crypto_word_t)1 << BITS_IN_LAST_WORD) - 1; - p.a.v[WORDS_PER_POLY - 1] &= ((crypto_word_t)1 << BITS_IN_LAST_WORD) - 1; - - printf("{["); - for (unsigned i = 0; i < WORDS_PER_POLY; i++) { - if (i) { - printf(" "); - } - printf(BN_HEX_FMT2, p.s.v[i]); - } - printf("] ["); - for (unsigned i = 0; i < WORDS_PER_POLY; i++) { - if (i) { - printf(" "); - } - printf(BN_HEX_FMT2, p.a.v[i]); - } - printf("]}\n"); -} - -static void poly3_zero(struct poly3 *p) { - poly2_zero(&p->s); - poly2_zero(&p->a); -} - -// poly3_reverse_700 reverses the order of the first 700 terms of |in| and -// writes them to |out|. -static void poly3_reverse_700(struct poly3 *out, const struct poly3 *in) { - poly2_reverse_700(&out->a, &in->a); - poly2_reverse_700(&out->s, &in->s); -} - -// poly3_word_mul sets (|out_s|, |out_a|) to (|s1|, |a1|) × (|s2|, |a2|). -static void poly3_word_mul(crypto_word_t *out_s, crypto_word_t *out_a, - const crypto_word_t s1, const crypto_word_t a1, - const crypto_word_t s2, const crypto_word_t a2) { - *out_a = a1 & a2; - *out_s = (s1 ^ s2) & *out_a; -} - -// poly3_word_add sets (|out_s|, |out_a|) to (|s1|, |a1|) + (|s2|, |a2|). -static void poly3_word_add(crypto_word_t *out_s, crypto_word_t *out_a, - const crypto_word_t s1, const crypto_word_t a1, - const crypto_word_t s2, const crypto_word_t a2) { - const crypto_word_t t = s1 ^ a2; - *out_s = t & (s2 ^ a1); - *out_a = (a1 ^ a2) | (t ^ s2); -} - -// poly3_word_sub sets (|out_s|, |out_a|) to (|s1|, |a1|) - (|s2|, |a2|). 
-static void poly3_word_sub(crypto_word_t *out_s, crypto_word_t *out_a, - const crypto_word_t s1, const crypto_word_t a1, - const crypto_word_t s2, const crypto_word_t a2) { - const crypto_word_t t = a1 ^ a2; - *out_s = (s1 ^ a2) & (t ^ s2); - *out_a = t | (s1 ^ s2); -} - -// poly3_mul_const sets |p| to |p|×m, where m = (ms, ma). -static void poly3_mul_const(struct poly3 *p, crypto_word_t ms, - crypto_word_t ma) { - ms = lsb_to_all(ms); - ma = lsb_to_all(ma); - - for (size_t i = 0; i < WORDS_PER_POLY; i++) { - poly3_word_mul(&p->s.v[i], &p->a.v[i], p->s.v[i], p->a.v[i], ms, ma); - } -} - -// poly3_fmadd sets |out| to |out| - |in|×m, where m is (ms, ma). -static void poly3_fmsub(struct poly3 *RESTRICT out, - const struct poly3 *RESTRICT in, crypto_word_t ms, - crypto_word_t ma) { - crypto_word_t product_s, product_a; - for (size_t i = 0; i < WORDS_PER_POLY; i++) { - poly3_word_mul(&product_s, &product_a, in->s.v[i], in->a.v[i], ms, ma); - poly3_word_sub(&out->s.v[i], &out->a.v[i], out->s.v[i], out->a.v[i], - product_s, product_a); - } -} - -// final_bit_to_all replicates the bit in the final position of the last word to -// all the bits in the word. -static crypto_word_t final_bit_to_all(crypto_word_t v) { - return lsb_to_all(v >> (BITS_IN_LAST_WORD - 1)); -} - -// poly3_top_bits_are_clear returns one iff the extra bits in the final words of -// |p| are zero. -OPENSSL_UNUSED static int poly3_top_bits_are_clear(const struct poly3 *p) { - return poly2_top_bits_are_clear(&p->s) && poly2_top_bits_are_clear(&p->a); -} - -// poly3_mod_phiN reduces |p| by Φ(N). -static void poly3_mod_phiN(struct poly3 *p) { - // In order to reduce by Φ(N) we subtract by the value of the greatest - // coefficient. 
- const crypto_word_t factor_s = final_bit_to_all(p->s.v[WORDS_PER_POLY - 1]); - const crypto_word_t factor_a = final_bit_to_all(p->a.v[WORDS_PER_POLY - 1]); - - for (size_t i = 0; i < WORDS_PER_POLY; i++) { - poly3_word_sub(&p->s.v[i], &p->a.v[i], p->s.v[i], p->a.v[i], factor_s, - factor_a); - } - - poly2_clear_top_bits(&p->s); - poly2_clear_top_bits(&p->a); -} - -static void poly3_cswap(struct poly3 *a, struct poly3 *b, crypto_word_t swap) { - poly2_cswap(&a->s, &b->s, swap); - poly2_cswap(&a->a, &b->a, swap); -} - -static void poly3_lshift1(struct poly3 *p) { - poly2_lshift1(&p->s); - poly2_lshift1(&p->a); -} - -static void poly3_rshift1(struct poly3 *p) { - poly2_rshift1(&p->s); - poly2_rshift1(&p->a); -} - -// poly3_span represents a pointer into a poly3. -struct poly3_span { - crypto_word_t *s; - crypto_word_t *a; -}; - -// poly3_span_add adds |n| words of values from |a| and |b| and writes the -// result to |out|. -static void poly3_span_add(const struct poly3_span *out, - const struct poly3_span *a, - const struct poly3_span *b, size_t n) { - for (size_t i = 0; i < n; i++) { - poly3_word_add(&out->s[i], &out->a[i], a->s[i], a->a[i], b->s[i], b->a[i]); - } -} - -// poly3_span_sub subtracts |n| words of |b| from |n| words of |a|. -static void poly3_span_sub(const struct poly3_span *a, - const struct poly3_span *b, size_t n) { - for (size_t i = 0; i < n; i++) { - poly3_word_sub(&a->s[i], &a->a[i], a->s[i], a->a[i], b->s[i], b->a[i]); - } -} - -// poly3_mul_aux is a recursive function that multiplies |n| words from |a| and -// |b| and writes 2×|n| words to |out|. Each call uses 2*ceil(n/2) elements of -// |scratch| and the function recurses, except if |n| == 1, when |scratch| isn't -// used and the recursion stops. For |n| in {11, 22}, the transitive total -// amount of |scratch| needed happens to be 2n+2. 
-static void poly3_mul_aux(const struct poly3_span *out, - const struct poly3_span *scratch, - const struct poly3_span *a, - const struct poly3_span *b, size_t n) { - if (n == 1) { - crypto_word_t r_s_low = 0, r_s_high = 0, r_a_low = 0, r_a_high = 0; - crypto_word_t b_s = b->s[0], b_a = b->a[0]; - const crypto_word_t a_s = a->s[0], a_a = a->a[0]; - - for (size_t i = 0; i < BITS_PER_WORD; i++) { - // Multiply (s, a) by the next value from (b_s, b_a). - crypto_word_t m_s, m_a; - poly3_word_mul(&m_s, &m_a, a_s, a_a, lsb_to_all(b_s), lsb_to_all(b_a)); - b_s >>= 1; - b_a >>= 1; - - if (i == 0) { - // Special case otherwise the code tries to shift by BITS_PER_WORD - // below, which is undefined. - r_s_low = m_s; - r_a_low = m_a; - continue; - } - - // Shift the multiplication result to the correct position. - const crypto_word_t m_s_low = m_s << i; - const crypto_word_t m_s_high = m_s >> (BITS_PER_WORD - i); - const crypto_word_t m_a_low = m_a << i; - const crypto_word_t m_a_high = m_a >> (BITS_PER_WORD - i); - - // Add into the result. - poly3_word_add(&r_s_low, &r_a_low, r_s_low, r_a_low, m_s_low, m_a_low); - poly3_word_add(&r_s_high, &r_a_high, r_s_high, r_a_high, m_s_high, - m_a_high); - } - - out->s[0] = r_s_low; - out->s[1] = r_s_high; - out->a[0] = r_a_low; - out->a[1] = r_a_high; - return; - } - - // Karatsuba multiplication. - // https://en.wikipedia.org/wiki/Karatsuba_algorithm - - // When |n| is odd, the two "halves" will have different lengths. The first - // is always the smaller. - const size_t low_len = n / 2; - const size_t high_len = n - low_len; - const struct poly3_span a_high = {&a->s[low_len], &a->a[low_len]}; - const struct poly3_span b_high = {&b->s[low_len], &b->a[low_len]}; - - // Store a_1 + a_0 in the first half of |out| and b_1 + b_0 in the second - // half. 
- const struct poly3_span a_cross_sum = *out; - const struct poly3_span b_cross_sum = {&out->s[high_len], &out->a[high_len]}; - poly3_span_add(&a_cross_sum, a, &a_high, low_len); - poly3_span_add(&b_cross_sum, b, &b_high, low_len); - if (high_len != low_len) { - a_cross_sum.s[low_len] = a_high.s[low_len]; - a_cross_sum.a[low_len] = a_high.a[low_len]; - b_cross_sum.s[low_len] = b_high.s[low_len]; - b_cross_sum.a[low_len] = b_high.a[low_len]; - } - - const struct poly3_span child_scratch = {&scratch->s[2 * high_len], - &scratch->a[2 * high_len]}; - const struct poly3_span out_mid = {&out->s[low_len], &out->a[low_len]}; - const struct poly3_span out_high = {&out->s[2 * low_len], - &out->a[2 * low_len]}; - - // Calculate (a_1 + a_0) × (b_1 + b_0) and write to scratch buffer. - poly3_mul_aux(scratch, &child_scratch, &a_cross_sum, &b_cross_sum, high_len); - // Calculate a_1 × b_1. - poly3_mul_aux(&out_high, &child_scratch, &a_high, &b_high, high_len); - // Calculate a_0 × b_0. - poly3_mul_aux(out, &child_scratch, a, b, low_len); - - // Subtract those last two products from the first. - poly3_span_sub(scratch, out, low_len * 2); - poly3_span_sub(scratch, &out_high, high_len * 2); - - // Add the middle product into the output. - poly3_span_add(&out_mid, &out_mid, scratch, high_len * 2); -} - -// HRSS_poly3_mul sets |*out| to |x|×|y| mod Φ(N). 
-void HRSS_poly3_mul(struct poly3 *out, const struct poly3 *x, - const struct poly3 *y) { - crypto_word_t prod_s[WORDS_PER_POLY * 2]; - crypto_word_t prod_a[WORDS_PER_POLY * 2]; - crypto_word_t scratch_s[WORDS_PER_POLY * 2 + 2]; - crypto_word_t scratch_a[WORDS_PER_POLY * 2 + 2]; - const struct poly3_span prod_span = {prod_s, prod_a}; - const struct poly3_span scratch_span = {scratch_s, scratch_a}; - const struct poly3_span x_span = {(crypto_word_t *)x->s.v, - (crypto_word_t *)x->a.v}; - const struct poly3_span y_span = {(crypto_word_t *)y->s.v, - (crypto_word_t *)y->a.v}; - - poly3_mul_aux(&prod_span, &scratch_span, &x_span, &y_span, WORDS_PER_POLY); - - // |prod| needs to be reduced mod (𝑥^n - 1), which just involves adding the - // upper-half to the lower-half. However, N is 701, which isn't a multiple of - // BITS_PER_WORD, so the upper-half vectors all have to be shifted before - // being added to the lower-half. - for (size_t i = 0; i < WORDS_PER_POLY; i++) { - crypto_word_t v_s = prod_s[WORDS_PER_POLY + i - 1] >> BITS_IN_LAST_WORD; - v_s |= prod_s[WORDS_PER_POLY + i] << (BITS_PER_WORD - BITS_IN_LAST_WORD); - crypto_word_t v_a = prod_a[WORDS_PER_POLY + i - 1] >> BITS_IN_LAST_WORD; - v_a |= prod_a[WORDS_PER_POLY + i] << (BITS_PER_WORD - BITS_IN_LAST_WORD); - - poly3_word_add(&out->s.v[i], &out->a.v[i], prod_s[i], prod_a[i], v_s, v_a); - } - - poly3_mod_phiN(out); -} - -#if defined(HRSS_HAVE_VECTOR_UNIT) && !defined(OPENSSL_AARCH64) - -// poly3_vec_cswap swaps (|a_s|, |a_a|) and (|b_s|, |b_a|) if |swap| is -// |0xff..ff|. Otherwise, |swap| must be zero. -static inline void poly3_vec_cswap(vec_t a_s[6], vec_t a_a[6], vec_t b_s[6], - vec_t b_a[6], const vec_t swap) { - for (int i = 0; i < 6; i++) { - const vec_t sum_s = swap & (a_s[i] ^ b_s[i]); - a_s[i] ^= sum_s; - b_s[i] ^= sum_s; - - const vec_t sum_a = swap & (a_a[i] ^ b_a[i]); - a_a[i] ^= sum_a; - b_a[i] ^= sum_a; - } -} - -// poly3_vec_fmsub subtracts (|ms|, |ma|) × (|b_s|, |b_a|) from (|a_s|, |a_a|). 
-static inline void poly3_vec_fmsub(vec_t a_s[6], vec_t a_a[6], vec_t b_s[6], - vec_t b_a[6], const vec_t ms, - const vec_t ma) { - for (int i = 0; i < 6; i++) { - // See the bitslice formula, above. - const vec_t s = b_s[i]; - const vec_t a = b_a[i]; - const vec_t product_a = a & ma; - const vec_t product_s = (s ^ ms) & product_a; - - const vec_t out_s = a_s[i]; - const vec_t out_a = a_a[i]; - const vec_t t = out_a ^ product_a; - a_s[i] = (out_s ^ product_a) & (t ^ product_s); - a_a[i] = t | (out_s ^ product_s); - } -} - -// poly3_invert_vec sets |*out| to |in|^-1, i.e. such that |out|×|in| == 1 mod -// Φ(N). -static void poly3_invert_vec(struct poly3 *out, const struct poly3 *in) { - // This algorithm is taken from section 7.1 of [SAFEGCD]. - const vec_t kZero = {0}; - const vec_t kOne = {1}; - static const uint8_t kBottomSixtyOne[sizeof(vec_t)] = { - 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x1f}; - - vec_t v_s[6], v_a[6], r_s[6], r_a[6], f_s[6], f_a[6], g_s[6], g_a[6]; - // v = 0 - memset(&v_s, 0, sizeof(v_s)); - memset(&v_a, 0, sizeof(v_a)); - // r = 1 - memset(&r_s, 0, sizeof(r_s)); - memset(&r_a, 0, sizeof(r_a)); - r_a[0] = kOne; - // f = all ones. - memset(f_s, 0, sizeof(f_s)); - memset(f_a, 0xff, 5 * sizeof(vec_t)); - memcpy(&f_a[5], kBottomSixtyOne, sizeof(kBottomSixtyOne)); - // g is the reversal of |in|. 
- struct poly3 in_reversed; - poly3_reverse_700(&in_reversed, in); - g_s[5] = kZero; - memcpy(&g_s, &in_reversed.s.v, WORDS_PER_POLY * sizeof(crypto_word_t)); - g_a[5] = kZero; - memcpy(&g_a, &in_reversed.a.v, WORDS_PER_POLY * sizeof(crypto_word_t)); - - int delta = 1; - - for (size_t i = 0; i < (2*(N-1)) - 1; i++) { - poly3_vec_lshift1(v_s, v_a); - - const crypto_word_t delta_sign_bit = (delta >> (sizeof(delta) * 8 - 1)) & 1; - const crypto_word_t delta_is_non_negative = delta_sign_bit - 1; - const crypto_word_t delta_is_non_zero = ~constant_time_is_zero_w(delta); - const vec_t g_has_constant_term = vec_broadcast_bit(g_a[0]); - const vec_t mask_w = - {delta_is_non_negative & delta_is_non_zero}; - const vec_t mask = vec_broadcast_bit(mask_w) & g_has_constant_term; - - const vec_t c_a = vec_broadcast_bit(f_a[0] & g_a[0]); - const vec_t c_s = vec_broadcast_bit((f_s[0] ^ g_s[0]) & c_a); - - delta = constant_time_select_int(lsb_to_all(mask[0]), -delta, delta); - delta++; - - poly3_vec_cswap(f_s, f_a, g_s, g_a, mask); - poly3_vec_fmsub(g_s, g_a, f_s, f_a, c_s, c_a); - poly3_vec_rshift1(g_s, g_a); - - poly3_vec_cswap(v_s, v_a, r_s, r_a, mask); - poly3_vec_fmsub(r_s, r_a, v_s, v_a, c_s, c_a); - } - - assert(delta == 0); - memcpy(out->s.v, v_s, WORDS_PER_POLY * sizeof(crypto_word_t)); - memcpy(out->a.v, v_a, WORDS_PER_POLY * sizeof(crypto_word_t)); - poly3_mul_const(out, vec_get_word(f_s[0], 0), vec_get_word(f_a[0], 0)); - poly3_reverse_700(out, out); -} - -#endif // HRSS_HAVE_VECTOR_UNIT - -// HRSS_poly3_invert sets |*out| to |in|^-1, i.e. such that |out|×|in| == 1 mod -// Φ(N). -void HRSS_poly3_invert(struct poly3 *out, const struct poly3 *in) { - // The vector version of this function seems slightly slower on AArch64, but - // is useful on ARMv7 and x86-64. -#if defined(HRSS_HAVE_VECTOR_UNIT) && !defined(OPENSSL_AARCH64) - if (vec_capable()) { - poly3_invert_vec(out, in); - return; - } -#endif - - // This algorithm is taken from section 7.1 of [SAFEGCD]. 
- struct poly3 v, r, f, g; - // v = 0 - poly3_zero(&v); - // r = 1 - poly3_zero(&r); - r.a.v[0] = 1; - // f = all ones. - OPENSSL_memset(&f.s, 0, sizeof(struct poly2)); - OPENSSL_memset(&f.a, 0xff, sizeof(struct poly2)); - f.a.v[WORDS_PER_POLY - 1] >>= BITS_PER_WORD - BITS_IN_LAST_WORD; - // g is the reversal of |in|. - poly3_reverse_700(&g, in); - int delta = 1; - - for (size_t i = 0; i < (2*(N-1)) - 1; i++) { - poly3_lshift1(&v); - - const crypto_word_t delta_sign_bit = (delta >> (sizeof(delta) * 8 - 1)) & 1; - const crypto_word_t delta_is_non_negative = delta_sign_bit - 1; - const crypto_word_t delta_is_non_zero = ~constant_time_is_zero_w(delta); - const crypto_word_t g_has_constant_term = lsb_to_all(g.a.v[0]); - const crypto_word_t mask = - g_has_constant_term & delta_is_non_negative & delta_is_non_zero; - - crypto_word_t c_s, c_a; - poly3_word_mul(&c_s, &c_a, f.s.v[0], f.a.v[0], g.s.v[0], g.a.v[0]); - c_s = lsb_to_all(c_s); - c_a = lsb_to_all(c_a); - - delta = constant_time_select_int(mask, -delta, delta); - delta++; - - poly3_cswap(&f, &g, mask); - poly3_fmsub(&g, &f, c_s, c_a); - poly3_rshift1(&g); - - poly3_cswap(&v, &r, mask); - poly3_fmsub(&r, &v, c_s, c_a); - } - - assert(delta == 0); - poly3_mul_const(&v, f.s.v[0], f.a.v[0]); - poly3_reverse_700(out, &v); -} - -// Polynomials in Q. - -// Coefficients are reduced mod Q. (Q is clearly not prime, therefore the -// coefficients do not form a field.) -#define Q 8192 - -// VECS_PER_POLY is the number of 128-bit vectors needed to represent a -// polynomial. -#define COEFFICIENTS_PER_VEC (sizeof(vec_t) / sizeof(uint16_t)) -#define VECS_PER_POLY ((N + COEFFICIENTS_PER_VEC - 1) / COEFFICIENTS_PER_VEC) - -// poly represents a polynomial with coefficients mod Q. Note that, while Q is a -// power of two, this does not operate in GF(Q). That would be a binary field -// but this is simply mod Q. Thus the coefficients are not a field. 
-// -// Coefficients are ordered little-endian, thus the coefficient of x^0 is the -// first element of the array. -struct poly { -#if defined(HRSS_HAVE_VECTOR_UNIT) - union { - // N + 3 = 704, which is a multiple of 64 and thus aligns things, esp for - // the vector code. - uint16_t v[N + 3]; - vec_t vectors[VECS_PER_POLY]; - }; -#else - // Even if !HRSS_HAVE_VECTOR_UNIT, external assembly may be called that - // requires alignment. - alignas(16) uint16_t v[N + 3]; -#endif -}; - -// poly_normalize zeros out the excess elements of |x| which are included only -// for alignment. -static void poly_normalize(struct poly *x) { - OPENSSL_memset(&x->v[N], 0, 3 * sizeof(uint16_t)); -} - -// poly_assert_normalized asserts that the excess elements of |x| are zeroed out -// for the cases that case. (E.g. |poly_mul_vec|.) -static void poly_assert_normalized(const struct poly *x) { - assert(x->v[N] == 0); - assert(x->v[N + 1] == 0); - assert(x->v[N + 2] == 0); -} - -OPENSSL_UNUSED static void poly_print(const struct poly *p) { - printf("["); - for (unsigned i = 0; i < N; i++) { - if (i) { - printf(" "); - } - printf("%d", p->v[i]); - } - printf("]\n"); -} - -// POLY_MUL_SCRATCH contains space for the working variables needed by -// |poly_mul|. The contents afterwards may be discarded, but the object may also -// be reused with future |poly_mul| calls to save heap allocations. -// -// This object must have 32-byte alignment. -struct POLY_MUL_SCRATCH { - union { - // This is used by |poly_mul_novec|. - struct { - uint16_t prod[2 * N]; - uint16_t scratch[1318]; - } novec; - -#if defined(HRSS_HAVE_VECTOR_UNIT) - // This is used by |poly_mul_vec|. - struct { - vec_t prod[VECS_PER_POLY * 2]; - vec_t scratch[172]; - } vec; -#endif - -#if defined(POLY_RQ_MUL_ASM) - // This is the space used by |poly_Rq_mul|. 
- uint8_t rq[POLY_MUL_RQ_SCRATCH_SPACE]; -#endif - } u; -}; - -#if defined(HRSS_HAVE_VECTOR_UNIT) - -// poly_mul_vec_aux is a recursive function that multiplies |n| words from |a| -// and |b| and writes 2×|n| words to |out|. Each call uses 2*ceil(n/2) elements -// of |scratch| and the function recurses, except if |n| < 3, when |scratch| -// isn't used and the recursion stops. If |n| == |VECS_PER_POLY| then |scratch| -// needs 172 elements. -static void poly_mul_vec_aux(vec_t *restrict out, vec_t *restrict scratch, - const vec_t *restrict a, const vec_t *restrict b, - const size_t n) { - // In [HRSS], the technique they used for polynomial multiplication is - // described: they start with Toom-4 at the top level and then two layers of - // Karatsuba. Karatsuba is a specific instance of the general Toom–Cook - // decomposition, which splits an input n-ways and produces 2n-1 - // multiplications of those parts. So, starting with 704 coefficients (rounded - // up from 701 to have more factors of two), Toom-4 gives seven - // multiplications of degree-174 polynomials. Each round of Karatsuba (which - // is Toom-2) increases the number of multiplications by a factor of three - // while halving the size of the values being multiplied. So two rounds gives - // 63 multiplications of degree-44 polynomials. Then they (I think) form - // vectors by gathering all 63 coefficients of each power together, for each - // input, and doing more rounds of Karatsuba on the vectors until they bottom- - // out somewhere with schoolbook multiplication. - // - // I tried something like that for NEON. NEON vectors are 128 bits so hold - // eight coefficients. I wrote a function that did Karatsuba on eight - // multiplications at the same time, using such vectors, and a Go script that - // decomposed from degree-704, with Karatsuba in non-transposed form, until it - // reached multiplications of degree-44. 
It batched up those 81 - // multiplications into lots of eight with a single one left over (which was - // handled directly). - // - // It worked, but it was significantly slower than the dumb algorithm used - // below. Potentially that was because I misunderstood how [HRSS] did it, or - // because Clang is bad at generating good code from NEON intrinsics on ARMv7. - // (Which is true: the code generated by Clang for the below is pretty crap.) - // - // This algorithm is much simpler. It just does Karatsuba decomposition all - // the way down and never transposes. When it gets down to degree-16 or - // degree-24 values, they are multiplied using schoolbook multiplication and - // vector intrinsics. The vector operations form each of the eight phase- - // shifts of one of the inputs, point-wise multiply, and then add into the - // result at the correct place. This means that 33% (degree-16) or 25% - // (degree-24) of the multiplies and adds are wasted, but it does ok. - if (n == 2) { - vec_t result[4]; - vec_t vec_a[3]; - static const vec_t kZero = {0}; - vec_a[0] = a[0]; - vec_a[1] = a[1]; - vec_a[2] = kZero; - - result[0] = vec_mul(vec_a[0], vec_get_word(b[0], 0)); - result[1] = vec_mul(vec_a[1], vec_get_word(b[0], 0)); - - result[1] = vec_fma(result[1], vec_a[0], vec_get_word(b[1], 0)); - result[2] = vec_mul(vec_a[1], vec_get_word(b[1], 0)); - result[3] = kZero; - - vec3_rshift_word(vec_a); - -#define BLOCK(x, y) \ - do { \ - result[x + 0] = \ - vec_fma(result[x + 0], vec_a[0], vec_get_word(b[y / 8], y % 8)); \ - result[x + 1] = \ - vec_fma(result[x + 1], vec_a[1], vec_get_word(b[y / 8], y % 8)); \ - result[x + 2] = \ - vec_fma(result[x + 2], vec_a[2], vec_get_word(b[y / 8], y % 8)); \ - } while (0) - - BLOCK(0, 1); - BLOCK(1, 9); - - vec3_rshift_word(vec_a); - - BLOCK(0, 2); - BLOCK(1, 10); - - vec3_rshift_word(vec_a); - - BLOCK(0, 3); - BLOCK(1, 11); - - vec3_rshift_word(vec_a); - - BLOCK(0, 4); - BLOCK(1, 12); - - vec3_rshift_word(vec_a); - - BLOCK(0, 5); - 
BLOCK(1, 13); - - vec3_rshift_word(vec_a); - - BLOCK(0, 6); - BLOCK(1, 14); - - vec3_rshift_word(vec_a); - - BLOCK(0, 7); - BLOCK(1, 15); - -#undef BLOCK - - memcpy(out, result, sizeof(result)); - return; - } - - if (n == 3) { - vec_t result[6]; - vec_t vec_a[4]; - static const vec_t kZero = {0}; - vec_a[0] = a[0]; - vec_a[1] = a[1]; - vec_a[2] = a[2]; - vec_a[3] = kZero; - - result[0] = vec_mul(a[0], vec_get_word(b[0], 0)); - result[1] = vec_mul(a[1], vec_get_word(b[0], 0)); - result[2] = vec_mul(a[2], vec_get_word(b[0], 0)); - -#define BLOCK_PRE(x, y) \ - do { \ - result[x + 0] = \ - vec_fma(result[x + 0], vec_a[0], vec_get_word(b[y / 8], y % 8)); \ - result[x + 1] = \ - vec_fma(result[x + 1], vec_a[1], vec_get_word(b[y / 8], y % 8)); \ - result[x + 2] = vec_mul(vec_a[2], vec_get_word(b[y / 8], y % 8)); \ - } while (0) - - BLOCK_PRE(1, 8); - BLOCK_PRE(2, 16); - - result[5] = kZero; - - vec4_rshift_word(vec_a); - -#define BLOCK(x, y) \ - do { \ - result[x + 0] = \ - vec_fma(result[x + 0], vec_a[0], vec_get_word(b[y / 8], y % 8)); \ - result[x + 1] = \ - vec_fma(result[x + 1], vec_a[1], vec_get_word(b[y / 8], y % 8)); \ - result[x + 2] = \ - vec_fma(result[x + 2], vec_a[2], vec_get_word(b[y / 8], y % 8)); \ - result[x + 3] = \ - vec_fma(result[x + 3], vec_a[3], vec_get_word(b[y / 8], y % 8)); \ - } while (0) - - BLOCK(0, 1); - BLOCK(1, 9); - BLOCK(2, 17); - - vec4_rshift_word(vec_a); - - BLOCK(0, 2); - BLOCK(1, 10); - BLOCK(2, 18); - - vec4_rshift_word(vec_a); - - BLOCK(0, 3); - BLOCK(1, 11); - BLOCK(2, 19); - - vec4_rshift_word(vec_a); - - BLOCK(0, 4); - BLOCK(1, 12); - BLOCK(2, 20); - - vec4_rshift_word(vec_a); - - BLOCK(0, 5); - BLOCK(1, 13); - BLOCK(2, 21); - - vec4_rshift_word(vec_a); - - BLOCK(0, 6); - BLOCK(1, 14); - BLOCK(2, 22); - - vec4_rshift_word(vec_a); - - BLOCK(0, 7); - BLOCK(1, 15); - BLOCK(2, 23); - -#undef BLOCK -#undef BLOCK_PRE - - memcpy(out, result, sizeof(result)); - - return; - } - - // Karatsuba multiplication. 
- // https://en.wikipedia.org/wiki/Karatsuba_algorithm - - // When |n| is odd, the two "halves" will have different lengths. The first is - // always the smaller. - const size_t low_len = n / 2; - const size_t high_len = n - low_len; - const vec_t *a_high = &a[low_len]; - const vec_t *b_high = &b[low_len]; - - // Store a_1 + a_0 in the first half of |out| and b_1 + b_0 in the second - // half. - for (size_t i = 0; i < low_len; i++) { - out[i] = vec_add(a_high[i], a[i]); - out[high_len + i] = vec_add(b_high[i], b[i]); - } - if (high_len != low_len) { - out[low_len] = a_high[low_len]; - out[high_len + low_len] = b_high[low_len]; - } - - vec_t *const child_scratch = &scratch[2 * high_len]; - // Calculate (a_1 + a_0) × (b_1 + b_0) and write to scratch buffer. - poly_mul_vec_aux(scratch, child_scratch, out, &out[high_len], high_len); - // Calculate a_1 × b_1. - poly_mul_vec_aux(&out[low_len * 2], child_scratch, a_high, b_high, high_len); - // Calculate a_0 × b_0. - poly_mul_vec_aux(out, child_scratch, a, b, low_len); - - // Subtract those last two products from the first. - for (size_t i = 0; i < low_len * 2; i++) { - scratch[i] = vec_sub(scratch[i], vec_add(out[i], out[low_len * 2 + i])); - } - if (low_len != high_len) { - scratch[low_len * 2] = vec_sub(scratch[low_len * 2], out[low_len * 4]); - scratch[low_len * 2 + 1] = - vec_sub(scratch[low_len * 2 + 1], out[low_len * 4 + 1]); - } - - // Add the middle product into the output. - for (size_t i = 0; i < high_len * 2; i++) { - out[low_len + i] = vec_add(out[low_len + i], scratch[i]); - } -} - -// poly_mul_vec sets |*out| to |x|×|y| mod (𝑥^n - 1). 
-static void poly_mul_vec(struct POLY_MUL_SCRATCH *scratch, struct poly *out, - const struct poly *x, const struct poly *y) { - static_assert(sizeof(out->v) == sizeof(vec_t) * VECS_PER_POLY, - "struct poly is the wrong size"); - static_assert(alignof(struct poly) == alignof(vec_t), - "struct poly has incorrect alignment"); - poly_assert_normalized(x); - poly_assert_normalized(y); - - vec_t *const prod = scratch->u.vec.prod; - vec_t *const aux_scratch = scratch->u.vec.scratch; - poly_mul_vec_aux(prod, aux_scratch, x->vectors, y->vectors, VECS_PER_POLY); - - // |prod| needs to be reduced mod (𝑥^n - 1), which just involves adding the - // upper-half to the lower-half. However, N is 701, which isn't a multiple of - // the vector size, so the upper-half vectors all have to be shifted before - // being added to the lower-half. - vec_t *out_vecs = (vec_t *)out->v; - - for (size_t i = 0; i < VECS_PER_POLY; i++) { - const vec_t prev = prod[VECS_PER_POLY - 1 + i]; - const vec_t this = prod[VECS_PER_POLY + i]; - out_vecs[i] = vec_add(prod[i], vec_merge_3_5(prev, this)); - } - - OPENSSL_memset(&out->v[N], 0, 3 * sizeof(uint16_t)); -} - -#endif // HRSS_HAVE_VECTOR_UNIT - -// poly_mul_novec_aux writes the product of |a| and |b| to |out|, using -// |scratch| as scratch space. It'll use Karatsuba if the inputs are large -// enough to warrant it. Each call uses 2*ceil(n/2) elements of |scratch| and -// the function recurses, except if |n| < 64, when |scratch| isn't used and the -// recursion stops. If |n| == |N| then |scratch| needs 1318 elements. -static void poly_mul_novec_aux(uint16_t *out, uint16_t *scratch, - const uint16_t *a, const uint16_t *b, size_t n) { - static const size_t kSchoolbookLimit = 64; - if (n < kSchoolbookLimit) { - OPENSSL_memset(out, 0, sizeof(uint16_t) * n * 2); - for (size_t i = 0; i < n; i++) { - for (size_t j = 0; j < n; j++) { - out[i + j] += (unsigned) a[i] * b[j]; - } - } - - return; - } - - // Karatsuba multiplication. 
- // https://en.wikipedia.org/wiki/Karatsuba_algorithm - - // When |n| is odd, the two "halves" will have different lengths. The - // first is always the smaller. - const size_t low_len = n / 2; - const size_t high_len = n - low_len; - const uint16_t *const a_high = &a[low_len]; - const uint16_t *const b_high = &b[low_len]; - - for (size_t i = 0; i < low_len; i++) { - out[i] = a_high[i] + a[i]; - out[high_len + i] = b_high[i] + b[i]; - } - if (high_len != low_len) { - out[low_len] = a_high[low_len]; - out[high_len + low_len] = b_high[low_len]; - } - - uint16_t *const child_scratch = &scratch[2 * high_len]; - poly_mul_novec_aux(scratch, child_scratch, out, &out[high_len], high_len); - poly_mul_novec_aux(&out[low_len * 2], child_scratch, a_high, b_high, - high_len); - poly_mul_novec_aux(out, child_scratch, a, b, low_len); - - for (size_t i = 0; i < low_len * 2; i++) { - scratch[i] -= out[i] + out[low_len * 2 + i]; - } - if (low_len != high_len) { - scratch[low_len * 2] -= out[low_len * 4]; - assert(out[low_len * 4 + 1] == 0); - } - - for (size_t i = 0; i < high_len * 2; i++) { - out[low_len + i] += scratch[i]; - } -} - -// poly_mul_novec sets |*out| to |x|×|y| mod (𝑥^n - 1). 
-static void poly_mul_novec(struct POLY_MUL_SCRATCH *scratch, struct poly *out, - const struct poly *x, const struct poly *y) { - uint16_t *const prod = scratch->u.novec.prod; - uint16_t *const aux_scratch = scratch->u.novec.scratch; - poly_mul_novec_aux(prod, aux_scratch, x->v, y->v, N); - - for (size_t i = 0; i < N; i++) { - out->v[i] = prod[i] + prod[i + N]; - } - OPENSSL_memset(&out->v[N], 0, 3 * sizeof(uint16_t)); -} - -static void poly_mul(struct POLY_MUL_SCRATCH *scratch, struct poly *r, - const struct poly *a, const struct poly *b) { -#if defined(POLY_RQ_MUL_ASM) - if (CRYPTO_is_AVX2_capable()) { - poly_Rq_mul(r->v, a->v, b->v, scratch->u.rq); - poly_normalize(r); - } else -#endif - -#if defined(HRSS_HAVE_VECTOR_UNIT) - if (vec_capable()) { - poly_mul_vec(scratch, r, a, b); - } else -#endif - - // Fallback, non-vector case. - { - poly_mul_novec(scratch, r, a, b); - } - - poly_assert_normalized(r); -} - -// poly_mul_x_minus_1 sets |p| to |p|×(𝑥 - 1) mod (𝑥^n - 1). -static void poly_mul_x_minus_1(struct poly *p) { - // Multiplying by (𝑥 - 1) means negating each coefficient and adding in - // the value of the previous one. - const uint16_t orig_final_coefficient = p->v[N - 1]; - - for (size_t i = N - 1; i > 0; i--) { - p->v[i] = p->v[i - 1] - p->v[i]; - } - p->v[0] = orig_final_coefficient - p->v[0]; -} - -// poly_mod_phiN sets |p| to |p| mod Φ(N). -static void poly_mod_phiN(struct poly *p) { - const uint16_t coeff700 = p->v[N - 1]; - - for (unsigned i = 0; i < N; i++) { - p->v[i] -= coeff700; - } -} - -// poly_clamp reduces each coefficient mod Q. -static void poly_clamp(struct poly *p) { - for (unsigned i = 0; i < N; i++) { - p->v[i] &= Q - 1; - } -} - - -// Conversion functions -// -------------------- - -// poly2_from_poly sets |*out| to |in| mod 2. 
-static void poly2_from_poly(struct poly2 *out, const struct poly *in) { - crypto_word_t *words = out->v; - unsigned shift = 0; - crypto_word_t word = 0; - - for (unsigned i = 0; i < N; i++) { - word >>= 1; - word |= (crypto_word_t)(in->v[i] & 1) << (BITS_PER_WORD - 1); - shift++; - - if (shift == BITS_PER_WORD) { - *words = word; - words++; - word = 0; - shift = 0; - } - } - - word >>= BITS_PER_WORD - shift; - *words = word; -} - -// mod3 treats |a| as a signed number and returns |a| mod 3. -static uint16_t mod3(int16_t a) { - const int16_t q = ((int32_t)a * 21845) >> 16; - int16_t ret = a - 3 * q; - // At this point, |ret| is in {0, 1, 2, 3} and that needs to be mapped to {0, - // 1, 2, 0}. - return ret & ((ret & (ret >> 1)) - 1); -} - -// poly3_from_poly sets |*out| to |in|. -static void poly3_from_poly(struct poly3 *out, const struct poly *in) { - crypto_word_t *words_s = out->s.v; - crypto_word_t *words_a = out->a.v; - crypto_word_t s = 0; - crypto_word_t a = 0; - unsigned shift = 0; - - for (unsigned i = 0; i < N; i++) { - // This duplicates the 13th bit upwards to the top of the uint16, - // essentially treating it as a sign bit and converting into a signed int16. - // The signed value is reduced mod 3, yielding {0, 1, 2}. - const uint16_t v = mod3((int16_t)(in->v[i] << 3) >> 3); - s >>= 1; - const crypto_word_t s_bit = (crypto_word_t)(v & 2) << (BITS_PER_WORD - 2); - s |= s_bit; - a >>= 1; - a |= s_bit | (crypto_word_t)(v & 1) << (BITS_PER_WORD - 1); - shift++; - - if (shift == BITS_PER_WORD) { - *words_s = s; - words_s++; - *words_a = a; - words_a++; - s = a = 0; - shift = 0; - } - } - - s >>= BITS_PER_WORD - shift; - a >>= BITS_PER_WORD - shift; - *words_s = s; - *words_a = a; -} - -// poly3_from_poly_checked sets |*out| to |in|, which has coefficients in {0, 1, -// Q-1}. It returns a mask indicating whether all coefficients were found to be -// in that set. 
-static crypto_word_t poly3_from_poly_checked(struct poly3 *out, - const struct poly *in) { - crypto_word_t *words_s = out->s.v; - crypto_word_t *words_a = out->a.v; - crypto_word_t s = 0; - crypto_word_t a = 0; - unsigned shift = 0; - crypto_word_t ok = CONSTTIME_TRUE_W; - - for (unsigned i = 0; i < N; i++) { - const uint16_t v = in->v[i]; - // Maps {0, 1, Q-1} to {0, 1, 2}. - uint16_t mod3 = v & 3; - mod3 ^= mod3 >> 1; - const uint16_t expected = (uint16_t)((~((mod3 >> 1) - 1)) | mod3) % Q; - ok &= constant_time_eq_w(v, expected); - - s >>= 1; - const crypto_word_t s_bit = (crypto_word_t)(mod3 & 2) - << (BITS_PER_WORD - 2); - s |= s_bit; - a >>= 1; - a |= s_bit | (crypto_word_t)(mod3 & 1) << (BITS_PER_WORD - 1); - shift++; - - if (shift == BITS_PER_WORD) { - *words_s = s; - words_s++; - *words_a = a; - words_a++; - s = a = 0; - shift = 0; - } - } - - s >>= BITS_PER_WORD - shift; - a >>= BITS_PER_WORD - shift; - *words_s = s; - *words_a = a; - - return ok; -} - -static void poly_from_poly2(struct poly *out, const struct poly2 *in) { - const crypto_word_t *words = in->v; - unsigned shift = 0; - crypto_word_t word = *words; - - for (unsigned i = 0; i < N; i++) { - out->v[i] = word & 1; - word >>= 1; - shift++; - - if (shift == BITS_PER_WORD) { - words++; - word = *words; - shift = 0; - } - } - - poly_normalize(out); -} - -static void poly_from_poly3(struct poly *out, const struct poly3 *in) { - const crypto_word_t *words_s = in->s.v; - const crypto_word_t *words_a = in->a.v; - crypto_word_t word_s = ~(*words_s); - crypto_word_t word_a = *words_a; - unsigned shift = 0; - - for (unsigned i = 0; i < N; i++) { - out->v[i] = (uint16_t)(word_s & 1) - 1; - out->v[i] |= word_a & 1; - word_s >>= 1; - word_a >>= 1; - shift++; - - if (shift == BITS_PER_WORD) { - words_s++; - words_a++; - word_s = ~(*words_s); - word_a = *words_a; - shift = 0; - } - } - - poly_normalize(out); -} - -// Polynomial inversion -// -------------------- - -// poly_invert_mod2 sets |*out| to |in^-1| 
(i.e. such that |*out|×|in| = 1 mod -// Φ(N)), all mod 2. This isn't useful in itself, but is part of doing inversion -// mod Q. -static void poly_invert_mod2(struct poly *out, const struct poly *in) { - // This algorithm is taken from section 7.1 of [SAFEGCD]. - struct poly2 v, r, f, g; - - // v = 0 - poly2_zero(&v); - // r = 1 - poly2_zero(&r); - r.v[0] = 1; - // f = all ones. - OPENSSL_memset(&f, 0xff, sizeof(struct poly2)); - f.v[WORDS_PER_POLY - 1] >>= BITS_PER_WORD - BITS_IN_LAST_WORD; - // g is the reversal of |in|. - poly2_from_poly(&g, in); - poly2_mod_phiN(&g); - poly2_reverse_700(&g, &g); - int delta = 1; - - for (size_t i = 0; i < (2*(N-1)) - 1; i++) { - poly2_lshift1(&v); - - const crypto_word_t delta_sign_bit = (delta >> (sizeof(delta) * 8 - 1)) & 1; - const crypto_word_t delta_is_non_negative = delta_sign_bit - 1; - const crypto_word_t delta_is_non_zero = ~constant_time_is_zero_w(delta); - const crypto_word_t g_has_constant_term = lsb_to_all(g.v[0]); - const crypto_word_t mask = - g_has_constant_term & delta_is_non_negative & delta_is_non_zero; - - const crypto_word_t c = lsb_to_all(f.v[0] & g.v[0]); - - delta = constant_time_select_int(mask, -delta, delta); - delta++; - - poly2_cswap(&f, &g, mask); - poly2_fmadd(&g, &f, c); - poly2_rshift1(&g); - - poly2_cswap(&v, &r, mask); - poly2_fmadd(&r, &v, c); - } - - assert(delta == 0); - assert(f.v[0] & 1); - poly2_reverse_700(&v, &v); - poly_from_poly2(out, &v); - poly_assert_normalized(out); -} - -// poly_invert sets |*out| to |in^-1| (i.e. such that |*out|×|in| = 1 mod Φ(N)). -static void poly_invert(struct POLY_MUL_SCRATCH *scratch, struct poly *out, - const struct poly *in) { - // Inversion mod Q, which is done based on the result of inverting mod - // 2. See [NTRUTN14] paper, bottom of page two. - struct poly a, *b, tmp; - - // a = -in. - for (unsigned i = 0; i < N; i++) { - a.v[i] = -in->v[i]; - } - poly_normalize(&a); - - // b = in^-1 mod 2. 
- b = out; - poly_invert_mod2(b, in); - - // We are working mod Q=2**13 and we need to iterate ceil(log_2(13)) - // times, which is four. - for (unsigned i = 0; i < 4; i++) { - poly_mul(scratch, &tmp, &a, b); - tmp.v[0] += 2; - poly_mul(scratch, b, b, &tmp); - } - - poly_assert_normalized(out); -} - -// Marshal and unmarshal functions for various basic types. -// -------------------------------------------------------- - -#define POLY_BYTES 1138 - -// poly_marshal serialises all but the final coefficient of |in| to |out|. -static void poly_marshal(uint8_t out[POLY_BYTES], const struct poly *in) { - const uint16_t *p = in->v; - - for (size_t i = 0; i < N / 8; i++) { - out[0] = p[0]; - out[1] = (0x1f & (p[0] >> 8)) | ((p[1] & 0x07) << 5); - out[2] = p[1] >> 3; - out[3] = (3 & (p[1] >> 11)) | ((p[2] & 0x3f) << 2); - out[4] = (0x7f & (p[2] >> 6)) | ((p[3] & 0x01) << 7); - out[5] = p[3] >> 1; - out[6] = (0xf & (p[3] >> 9)) | ((p[4] & 0x0f) << 4); - out[7] = p[4] >> 4; - out[8] = (1 & (p[4] >> 12)) | ((p[5] & 0x7f) << 1); - out[9] = (0x3f & (p[5] >> 7)) | ((p[6] & 0x03) << 6); - out[10] = p[6] >> 2; - out[11] = (7 & (p[6] >> 10)) | ((p[7] & 0x1f) << 3); - out[12] = p[7] >> 5; - - p += 8; - out += 13; - } - - // There are four remaining values. - out[0] = p[0]; - out[1] = (0x1f & (p[0] >> 8)) | ((p[1] & 0x07) << 5); - out[2] = p[1] >> 3; - out[3] = (3 & (p[1] >> 11)) | ((p[2] & 0x3f) << 2); - out[4] = (0x7f & (p[2] >> 6)) | ((p[3] & 0x01) << 7); - out[5] = p[3] >> 1; - out[6] = 0xf & (p[3] >> 9); -} - -// poly_unmarshal parses the output of |poly_marshal| and sets |out| such that -// all but the final coefficients match, and the final coefficient is calculated -// such that evaluating |out| at one results in zero. It returns one on success -// or zero if |in| is an invalid encoding. 
-static int poly_unmarshal(struct poly *out, const uint8_t in[POLY_BYTES]) { - uint16_t *p = out->v; - - for (size_t i = 0; i < N / 8; i++) { - p[0] = (uint16_t)(in[0]) | (uint16_t)(in[1] & 0x1f) << 8; - p[1] = (uint16_t)(in[1] >> 5) | (uint16_t)(in[2]) << 3 | - (uint16_t)(in[3] & 3) << 11; - p[2] = (uint16_t)(in[3] >> 2) | (uint16_t)(in[4] & 0x7f) << 6; - p[3] = (uint16_t)(in[4] >> 7) | (uint16_t)(in[5]) << 1 | - (uint16_t)(in[6] & 0xf) << 9; - p[4] = (uint16_t)(in[6] >> 4) | (uint16_t)(in[7]) << 4 | - (uint16_t)(in[8] & 1) << 12; - p[5] = (uint16_t)(in[8] >> 1) | (uint16_t)(in[9] & 0x3f) << 7; - p[6] = (uint16_t)(in[9] >> 6) | (uint16_t)(in[10]) << 2 | - (uint16_t)(in[11] & 7) << 10; - p[7] = (uint16_t)(in[11] >> 3) | (uint16_t)(in[12]) << 5; - - p += 8; - in += 13; - } - - // There are four coefficients remaining. - p[0] = (uint16_t)(in[0]) | (uint16_t)(in[1] & 0x1f) << 8; - p[1] = (uint16_t)(in[1] >> 5) | (uint16_t)(in[2]) << 3 | - (uint16_t)(in[3] & 3) << 11; - p[2] = (uint16_t)(in[3] >> 2) | (uint16_t)(in[4] & 0x7f) << 6; - p[3] = (uint16_t)(in[4] >> 7) | (uint16_t)(in[5]) << 1 | - (uint16_t)(in[6] & 0xf) << 9; - - for (unsigned i = 0; i < N - 1; i++) { - out->v[i] = (int16_t)(out->v[i] << 3) >> 3; - } - - // There are four unused bits in the last byte. We require them to be zero. - if ((in[6] & 0xf0) != 0) { - return 0; - } - - // Set the final coefficient as specifed in [HRSSNIST] 1.9.2 step 6. - uint32_t sum = 0; - for (size_t i = 0; i < N - 1; i++) { - sum += out->v[i]; - } - - out->v[N - 1] = (uint16_t)(0u - sum); - poly_normalize(out); - - return 1; -} - -// mod3_from_modQ maps {0, 1, Q-1, 65535} -> {0, 1, 2, 2}. Note that |v| may -// have an invalid value when processing attacker-controlled inputs. -static uint16_t mod3_from_modQ(uint16_t v) { - v &= 3; - return v ^ (v >> 1); -} - -// poly_marshal_mod3 marshals |in| to |out| where the coefficients of |in| are -// all in {0, 1, Q-1, 65535} and |in| is mod Φ(N). 
(Note that coefficients may -// have invalid values when processing attacker-controlled inputs.) -static void poly_marshal_mod3(uint8_t out[HRSS_POLY3_BYTES], - const struct poly *in) { - const uint16_t *coeffs = in->v; - - // Only 700 coefficients are marshaled because in[700] must be zero. - assert(coeffs[N-1] == 0); - - for (size_t i = 0; i < HRSS_POLY3_BYTES; i++) { - const uint16_t coeffs0 = mod3_from_modQ(coeffs[0]); - const uint16_t coeffs1 = mod3_from_modQ(coeffs[1]); - const uint16_t coeffs2 = mod3_from_modQ(coeffs[2]); - const uint16_t coeffs3 = mod3_from_modQ(coeffs[3]); - const uint16_t coeffs4 = mod3_from_modQ(coeffs[4]); - out[i] = coeffs0 + coeffs1 * 3 + coeffs2 * 9 + coeffs3 * 27 + coeffs4 * 81; - coeffs += 5; - } -} - -// HRSS-specific functions -// ----------------------- - -// poly_short_sample samples a vector of values in {0xffff (i.e. -1), 0, 1}. -// This is the same action as the algorithm in [HRSSNIST] section 1.8.1, but -// with HRSS-SXY the sampling algorithm is now a private detail of the -// implementation (previously it had to match between two parties). This -// function uses that freedom to implement a flatter distribution of values. -static void poly_short_sample(struct poly *out, - const uint8_t in[HRSS_SAMPLE_BYTES]) { - static_assert(HRSS_SAMPLE_BYTES == N - 1, "HRSS_SAMPLE_BYTES incorrect"); - for (size_t i = 0; i < N - 1; i++) { - uint16_t v = mod3(in[i]); - // Map {0, 1, 2} -> {0, 1, 0xffff} - v |= ((v >> 1) ^ 1) - 1; - out->v[i] = v; - } - out->v[N - 1] = 0; - poly_normalize(out); -} - -// poly_short_sample_plus performs the T+ sample as defined in [HRSSNIST], -// section 1.8.2. -static void poly_short_sample_plus(struct poly *out, - const uint8_t in[HRSS_SAMPLE_BYTES]) { - poly_short_sample(out, in); - - // sum (and the product in the for loop) will overflow. But that's fine - // because |sum| is bound by +/- (N-2), and N < 2^15 so it works out. 
- uint16_t sum = 0; - for (unsigned i = 0; i < N - 2; i++) { - sum += (unsigned) out->v[i] * out->v[i + 1]; - } - - // If the sum is negative, flip the sign of even-positioned coefficients. (See - // page 8 of [HRSS].) - sum = ((int16_t) sum) >> 15; - const uint16_t scale = sum | (~sum & 1); - for (unsigned i = 0; i < N; i += 2) { - out->v[i] = (unsigned) out->v[i] * scale; - } - poly_assert_normalized(out); -} - -// poly_lift computes the function discussed in [HRSS], appendix B. -static void poly_lift(struct poly *out, const struct poly *a) { - // We wish to calculate a/(𝑥-1) mod Φ(N) over GF(3), where Φ(N) is the - // Nth cyclotomic polynomial, i.e. 1 + 𝑥 + … + 𝑥^700 (since N is prime). - - // 1/(𝑥-1) has a fairly basic structure that we can exploit to speed this up: - // - // R. = PolynomialRing(GF(3)…) - // inv = R.cyclotomic_polynomial(1).inverse_mod(R.cyclotomic_polynomial(n)) - // list(inv)[:15] - // [1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2] - // - // This three-element pattern of coefficients repeats for the whole - // polynomial. - // - // Next define the overbar operator such that z̅ = z[0] + - // reverse(z[1:]). (Index zero of a polynomial here is the coefficient - // of the constant term. So index one is the coefficient of 𝑥 and so - // on.) - // - // A less odd way to define this is to see that z̅ negates the indexes, - // so z̅[0] = z[-0], z̅[1] = z[-1] and so on. - // - // The use of z̅ is that, when working mod (𝑥^701 - 1), vz[0] = , vz[1] = , …. (Where is the inner product: the sum - // of the point-wise products.) Although we calculated the inverse mod - // Φ(N), we can work mod (𝑥^N - 1) and reduce mod Φ(N) at the end. - // (That's because (𝑥^N - 1) is a multiple of Φ(N).) - // - // When working mod (𝑥^N - 1), multiplication by 𝑥 is a right-rotation - // of the list of coefficients. 
- // - // Thus we can consider what the pattern of z̅, 𝑥z̅, 𝑥^2z̅, … looks like: - // - // def reverse(xs): - // suffix = list(xs[1:]) - // suffix.reverse() - // return [xs[0]] + suffix - // - // def rotate(xs): - // return [xs[-1]] + xs[:-1] - // - // zoverbar = reverse(list(inv) + [0]) - // xzoverbar = rotate(reverse(list(inv) + [0])) - // x2zoverbar = rotate(rotate(reverse(list(inv) + [0]))) - // - // zoverbar[:15] - // [1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1] - // xzoverbar[:15] - // [0, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0] - // x2zoverbar[:15] - // [2, 0, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2] - // - // (For a formula for z̅, see lemma two of appendix B.) - // - // After the first three elements have been taken care of, all then have - // a repeating three-element cycle. The next value (𝑥^3z̅) involves - // three rotations of the first pattern, thus the three-element cycle - // lines up. However, the discontinuity in the first three elements - // obviously moves to a different position. Consider the difference - // between 𝑥^3z̅ and z̅: - // - // [x-y for (x,y) in zip(zoverbar, x3zoverbar)][:15] - // [0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] - // - // This pattern of differences is the same for all elements, although it - // obviously moves right with the rotations. - // - // From this, we reach algorithm eight of appendix B. - - // Handle the first three elements of the inner products. - out->v[0] = a->v[0] + a->v[2]; - out->v[1] = a->v[1]; - out->v[2] = -a->v[0] + a->v[2]; - - // s0, s1, s2 are added into out->v[0], out->v[1], and out->v[2], - // respectively. We do not compute s1 because it's just -(s0 + s1). - uint16_t s0 = 0, s2 = 0; - for (size_t i = 3; i < 699; i += 3) { - s0 += -a->v[i] + a->v[i + 2]; - // s1 += a->v[i] - a->v[i + 1]; - s2 += a->v[i + 1] - a->v[i + 2]; - } - - // Handle the fact that the three-element pattern doesn't fill the - // polynomial exactly (since 701 isn't a multiple of three). 
- s0 -= a->v[699]; - // s1 += a->v[699] - a->v[700]; - s2 += a->v[700]; - - // Note that s0 + s1 + s2 = 0. - out->v[0] += s0; - out->v[1] -= (s0 + s2); // = s1 - out->v[2] += s2; - - // Calculate the remaining inner products by taking advantage of the - // fact that the pattern repeats every three cycles and the pattern of - // differences moves with the rotation. - for (size_t i = 3; i < N; i++) { - out->v[i] = (out->v[i - 3] - (a->v[i - 2] + a->v[i - 1] + a->v[i])); - } - - // Reduce mod Φ(N) by subtracting a multiple of out[700] from every - // element and convert to mod Q. (See above about adding twice as - // subtraction.) - const crypto_word_t v = out->v[700]; - for (unsigned i = 0; i < N; i++) { - const uint16_t vi_mod3 = mod3(out->v[i] - v); - // Map {0, 1, 2} to {0, 1, 0xffff}. - out->v[i] = (~((vi_mod3 >> 1) - 1)) | vi_mod3; - } - - poly_mul_x_minus_1(out); - poly_normalize(out); -} - -struct public_key { - struct poly ph; -}; - -struct private_key { - struct poly3 f, f_inverse; - struct poly ph_inverse; - uint8_t hmac_key[32]; -}; - -// public_key_from_external converts an external public key pointer into an -// internal one. Externally the alignment is only specified to be eight bytes -// but we need 16-byte alignment. We could annotate the external struct with -// that alignment but we can only assume that malloced pointers are 8-byte -// aligned in any case. (Even if the underlying malloc returns values with -// 16-byte alignment, |OPENSSL_malloc| will store an 8-byte size prefix and mess -// that up.) -static struct public_key *public_key_from_external( - struct HRSS_public_key *ext) { - static_assert( - sizeof(struct HRSS_public_key) >= sizeof(struct public_key) + 15, - "HRSS public key too small"); - - return align_pointer(ext->opaque, 16); -} - -// private_key_from_external does the same thing as |public_key_from_external|, -// but for private keys. See the comment on that function about alignment -// issues. 
-static struct private_key *private_key_from_external( - struct HRSS_private_key *ext) { - static_assert( - sizeof(struct HRSS_private_key) >= sizeof(struct private_key) + 15, - "HRSS private key too small"); - - return align_pointer(ext->opaque, 16); -} - -// malloc_align32 returns a pointer to |size| bytes of 32-byte-aligned heap and -// sets |*out_ptr| to a value that can be passed to |OPENSSL_free| to release -// it. It returns NULL if out of memory. -static void *malloc_align32(void **out_ptr, size_t size) { - void *ptr = OPENSSL_malloc(size + 31); - if (!ptr) { - *out_ptr = NULL; - return NULL; - } - - *out_ptr = ptr; - return align_pointer(ptr, 32); -} - -int HRSS_generate_key( - struct HRSS_public_key *out_pub, struct HRSS_private_key *out_priv, - const uint8_t in[HRSS_SAMPLE_BYTES + HRSS_SAMPLE_BYTES + 32]) { - struct public_key *pub = public_key_from_external(out_pub); - struct private_key *priv = private_key_from_external(out_priv); - - struct vars { - struct POLY_MUL_SCRATCH scratch; - struct poly f; - struct poly pg_phi1; - struct poly pfg_phi1; - struct poly pfg_phi1_inverse; - }; - - void *malloc_ptr; - struct vars *const vars = malloc_align32(&malloc_ptr, sizeof(struct vars)); - if (!vars) { - // If the caller ignores the return value the output will still be safe. - // The private key output is randomised in case it's later passed to - // |HRSS_encap|. - memset(out_pub, 0, sizeof(struct HRSS_public_key)); - RAND_bytes((uint8_t*) out_priv, sizeof(struct HRSS_private_key)); - return 0; - } - -#if !defined(NDEBUG) - OPENSSL_memset(vars, 0xff, sizeof(struct vars)); -#endif - - OPENSSL_memcpy(priv->hmac_key, in + 2 * HRSS_SAMPLE_BYTES, - sizeof(priv->hmac_key)); - - poly_short_sample_plus(&vars->f, in); - poly3_from_poly(&priv->f, &vars->f); - HRSS_poly3_invert(&priv->f_inverse, &priv->f); - - // pg_phi1 is p (i.e. 3) × g × Φ(1) (i.e. 𝑥-1). 
- poly_short_sample_plus(&vars->pg_phi1, in + HRSS_SAMPLE_BYTES); - for (unsigned i = 0; i < N; i++) { - vars->pg_phi1.v[i] *= 3; - } - poly_mul_x_minus_1(&vars->pg_phi1); - - poly_mul(&vars->scratch, &vars->pfg_phi1, &vars->f, &vars->pg_phi1); - - poly_invert(&vars->scratch, &vars->pfg_phi1_inverse, &vars->pfg_phi1); - - poly_mul(&vars->scratch, &pub->ph, &vars->pfg_phi1_inverse, &vars->pg_phi1); - poly_mul(&vars->scratch, &pub->ph, &pub->ph, &vars->pg_phi1); - poly_clamp(&pub->ph); - - poly_mul(&vars->scratch, &priv->ph_inverse, &vars->pfg_phi1_inverse, - &vars->f); - poly_mul(&vars->scratch, &priv->ph_inverse, &priv->ph_inverse, &vars->f); - poly_clamp(&priv->ph_inverse); - - OPENSSL_free(malloc_ptr); - return 1; -} - -static const char kSharedKey[] = "shared key"; - -int HRSS_encap(uint8_t out_ciphertext[POLY_BYTES], uint8_t out_shared_key[32], - const struct HRSS_public_key *in_pub, - const uint8_t in[HRSS_SAMPLE_BYTES + HRSS_SAMPLE_BYTES]) { - const struct public_key *pub = - public_key_from_external((struct HRSS_public_key *)in_pub); - - struct vars { - struct POLY_MUL_SCRATCH scratch; - struct poly m, r, m_lifted; - struct poly prh_plus_m; - SHA256_CTX hash_ctx; - uint8_t m_bytes[HRSS_POLY3_BYTES]; - uint8_t r_bytes[HRSS_POLY3_BYTES]; - }; - - void *malloc_ptr; - struct vars *const vars = malloc_align32(&malloc_ptr, sizeof(struct vars)); - if (!vars) { - // If the caller ignores the return value the output will still be safe. - // The private key output is randomised in case it's used to encrypt and - // transmit something. 
- memset(out_ciphertext, 0, POLY_BYTES); - RAND_bytes(out_shared_key, 32); - return 0; - } - -#if !defined(NDEBUG) - OPENSSL_memset(vars, 0xff, sizeof(struct vars)); -#endif - - poly_short_sample(&vars->m, in); - poly_short_sample(&vars->r, in + HRSS_SAMPLE_BYTES); - poly_lift(&vars->m_lifted, &vars->m); - - poly_mul(&vars->scratch, &vars->prh_plus_m, &vars->r, &pub->ph); - for (unsigned i = 0; i < N; i++) { - vars->prh_plus_m.v[i] += vars->m_lifted.v[i]; - } - - poly_marshal(out_ciphertext, &vars->prh_plus_m); - - poly_marshal_mod3(vars->m_bytes, &vars->m); - poly_marshal_mod3(vars->r_bytes, &vars->r); - - SHA256_Init(&vars->hash_ctx); - SHA256_Update(&vars->hash_ctx, kSharedKey, sizeof(kSharedKey)); - SHA256_Update(&vars->hash_ctx, vars->m_bytes, sizeof(vars->m_bytes)); - SHA256_Update(&vars->hash_ctx, vars->r_bytes, sizeof(vars->r_bytes)); - SHA256_Update(&vars->hash_ctx, out_ciphertext, POLY_BYTES); - SHA256_Final(out_shared_key, &vars->hash_ctx); - - OPENSSL_free(malloc_ptr); - return 1; -} - -int HRSS_decap(uint8_t out_shared_key[HRSS_KEY_BYTES], - const struct HRSS_private_key *in_priv, - const uint8_t *ciphertext, size_t ciphertext_len) { - const struct private_key *priv = - private_key_from_external((struct HRSS_private_key *)in_priv); - - struct vars { - struct POLY_MUL_SCRATCH scratch; - uint8_t masked_key[SHA256_CBLOCK]; - SHA256_CTX hash_ctx; - struct poly c; - struct poly f, cf; - struct poly3 cf3, m3; - struct poly m, m_lifted; - struct poly r; - struct poly3 r3; - uint8_t expected_ciphertext[HRSS_CIPHERTEXT_BYTES]; - uint8_t m_bytes[HRSS_POLY3_BYTES]; - uint8_t r_bytes[HRSS_POLY3_BYTES]; - uint8_t shared_key[32]; - }; - - void *malloc_ptr; - struct vars *const vars = malloc_align32(&malloc_ptr, sizeof(struct vars)); - if (!vars) { - // If the caller ignores the return value the output will still be safe. - // The private key output is randomised in case it's used to encrypt and - // transmit something. 
- RAND_bytes(out_shared_key, HRSS_KEY_BYTES); - return 0; - } - -#if !defined(NDEBUG) - OPENSSL_memset(vars, 0xff, sizeof(struct vars)); -#endif - - // This is HMAC, expanded inline rather than using the |HMAC| function so that - // we can avoid dealing with possible allocation failures and so keep this - // function infallible. - static_assert(sizeof(priv->hmac_key) <= sizeof(vars->masked_key), - "HRSS HMAC key larger than SHA-256 block size"); - for (size_t i = 0; i < sizeof(priv->hmac_key); i++) { - vars->masked_key[i] = priv->hmac_key[i] ^ 0x36; - } - OPENSSL_memset(vars->masked_key + sizeof(priv->hmac_key), 0x36, - sizeof(vars->masked_key) - sizeof(priv->hmac_key)); - - SHA256_Init(&vars->hash_ctx); - SHA256_Update(&vars->hash_ctx, vars->masked_key, sizeof(vars->masked_key)); - SHA256_Update(&vars->hash_ctx, ciphertext, ciphertext_len); - uint8_t inner_digest[SHA256_DIGEST_LENGTH]; - SHA256_Final(inner_digest, &vars->hash_ctx); - - for (size_t i = 0; i < sizeof(priv->hmac_key); i++) { - vars->masked_key[i] ^= (0x5c ^ 0x36); - } - OPENSSL_memset(vars->masked_key + sizeof(priv->hmac_key), 0x5c, - sizeof(vars->masked_key) - sizeof(priv->hmac_key)); - - SHA256_Init(&vars->hash_ctx); - SHA256_Update(&vars->hash_ctx, vars->masked_key, sizeof(vars->masked_key)); - SHA256_Update(&vars->hash_ctx, inner_digest, sizeof(inner_digest)); - static_assert(HRSS_KEY_BYTES == SHA256_DIGEST_LENGTH, - "HRSS shared key length incorrect"); - SHA256_Final(out_shared_key, &vars->hash_ctx); - - // If the ciphertext is publicly invalid then a random shared key is still - // returned to simply the logic of the caller, but this path is not constant - // time. - if (ciphertext_len != HRSS_CIPHERTEXT_BYTES || - !poly_unmarshal(&vars->c, ciphertext)) { - goto out; - } - - poly_from_poly3(&vars->f, &priv->f); - poly_mul(&vars->scratch, &vars->cf, &vars->c, &vars->f); - poly3_from_poly(&vars->cf3, &vars->cf); - // Note that cf3 is not reduced mod Φ(N). That reduction is deferred. 
- HRSS_poly3_mul(&vars->m3, &vars->cf3, &priv->f_inverse); - - poly_from_poly3(&vars->m, &vars->m3); - poly_lift(&vars->m_lifted, &vars->m); - - for (unsigned i = 0; i < N; i++) { - vars->r.v[i] = vars->c.v[i] - vars->m_lifted.v[i]; - } - poly_normalize(&vars->r); - poly_mul(&vars->scratch, &vars->r, &vars->r, &priv->ph_inverse); - poly_mod_phiN(&vars->r); - poly_clamp(&vars->r); - - crypto_word_t ok = poly3_from_poly_checked(&vars->r3, &vars->r); - - // [NTRUCOMP] section 5.1 includes ReEnc2 and a proof that it's valid. Rather - // than do an expensive |poly_mul|, it rebuilds |c'| from |c - lift(m)| - // (called |b|) with: - // t = (−b(1)/N) mod Q - // c' = b + tΦ(N) + lift(m) mod Q - // - // When polynomials are transmitted, the final coefficient is omitted and - // |poly_unmarshal| sets it such that f(1) == 0. Thus c(1) == 0. Also, - // |poly_lift| multiplies the result by (x-1) and therefore evaluating a - // lifted polynomial at 1 is also zero. Thus lift(m)(1) == 0 and so - // (c - lift(m))(1) == 0. - // - // Although we defer the reduction above, |b| is conceptually reduced mod - // Φ(N). In order to do that reduction one subtracts |c[N-1]| from every - // coefficient. Therefore b(1) = -c[N-1]×N. The value of |t|, above, then is - // just recovering |c[N-1]|, and adding tΦ(N) is simply undoing the reduction. - // Therefore b + tΦ(N) + lift(m) = c by construction and we don't need to - // recover |c| at all so long as we do the checks in - // |poly3_from_poly_checked|. - // - // The |poly_marshal| here then is just confirming that |poly_unmarshal| is - // strict and could be omitted. 
- - static_assert(HRSS_CIPHERTEXT_BYTES == POLY_BYTES, - "ciphertext is the wrong size"); - assert(ciphertext_len == sizeof(vars->expected_ciphertext)); - poly_marshal(vars->expected_ciphertext, &vars->c); - - poly_marshal_mod3(vars->m_bytes, &vars->m); - poly_marshal_mod3(vars->r_bytes, &vars->r); - - ok &= constant_time_is_zero_w( - CRYPTO_memcmp(ciphertext, vars->expected_ciphertext, - sizeof(vars->expected_ciphertext))); - - SHA256_Init(&vars->hash_ctx); - SHA256_Update(&vars->hash_ctx, kSharedKey, sizeof(kSharedKey)); - SHA256_Update(&vars->hash_ctx, vars->m_bytes, sizeof(vars->m_bytes)); - SHA256_Update(&vars->hash_ctx, vars->r_bytes, sizeof(vars->r_bytes)); - SHA256_Update(&vars->hash_ctx, vars->expected_ciphertext, - sizeof(vars->expected_ciphertext)); - SHA256_Final(vars->shared_key, &vars->hash_ctx); - - for (unsigned i = 0; i < sizeof(vars->shared_key); i++) { - out_shared_key[i] = - constant_time_select_8(ok, vars->shared_key[i], out_shared_key[i]); - } - -out: - OPENSSL_free(malloc_ptr); - return 1; -} - -void HRSS_marshal_public_key(uint8_t out[HRSS_PUBLIC_KEY_BYTES], - const struct HRSS_public_key *in_pub) { - const struct public_key *pub = - public_key_from_external((struct HRSS_public_key *)in_pub); - poly_marshal(out, &pub->ph); -} - -int HRSS_parse_public_key(struct HRSS_public_key *out, - const uint8_t in[HRSS_PUBLIC_KEY_BYTES]) { - struct public_key *pub = public_key_from_external(out); - if (!poly_unmarshal(&pub->ph, in)) { - return 0; - } - OPENSSL_memset(&pub->ph.v[N], 0, 3 * sizeof(uint16_t)); - return 1; -} diff --git a/third_party/boringssl/src/crypto/hrss/hrss.cc b/third_party/boringssl/src/crypto/hrss/hrss.cc new file mode 100644 index 00000000..2915c70c --- /dev/null +++ b/third_party/boringssl/src/crypto/hrss/hrss.cc @@ -0,0 +1,2189 @@ +// Copyright 2018 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include + +#include "../internal.h" +#include "internal.h" + +#if defined(OPENSSL_SSE2) +#include +#endif + +#if (defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) && defined(__ARM_NEON) +#include +#endif + + +using namespace bssl; + +// This is an implementation of [HRSS], but with a KEM transformation based on +// [SXY]. The primary references are: + +// HRSS: https://eprint.iacr.org/2017/667.pdf +// HRSSNIST: +// https://csrc.nist.gov/CSRC/media/Projects/Post-Quantum-Cryptography/documents/round-1/submissions/NTRU_HRSS_KEM.zip +// SXY: https://eprint.iacr.org/2017/1005.pdf +// NTRUTN14: +// https://assets.onboardsecurity.com/static/downloads/NTRU/resources/NTRUTech014.pdf +// NTRUCOMP: https://eprint.iacr.org/2018/1174 +// SAFEGCD: https://gcd.cr.yp.to/papers.html#safegcd + +// Vector operations. +// +// A couple of functions in this file can use vector operations to meaningful +// effect. If we're building for a target that has a supported vector unit, +// |HRSS_HAVE_VECTOR_UNIT| will be defined and |vec_t| will be typedefed to a +// 128-bit vector. The following functions abstract over the differences between +// NEON and SSE2 for implementing some vector operations. + +// TODO: MSVC can likely also be made to work with vector operations, but ^ must +// be replaced with _mm_xor_si128, etc. 
+#if defined(OPENSSL_SSE2) && (defined(__clang__) || !defined(_MSC_VER)) + +#define HRSS_HAVE_VECTOR_UNIT + +typedef __m128i vec_t; + +// vec_capable returns one iff the current platform supports SSE2. +static int vec_capable() { return 1; } + +// vec_add performs a pair-wise addition of four uint16s from |a| and |b|. +static vec_t vec_add(vec_t a, vec_t b) { return _mm_add_epi16(a, b); } + +// vec_sub performs a pair-wise subtraction of four uint16s from |a| and |b|. +static vec_t vec_sub(vec_t a, vec_t b) { return _mm_sub_epi16(a, b); } + +// vec_mul multiplies each uint16_t in |a| by |b| and returns the resulting +// vector. +static vec_t vec_mul(vec_t a, uint16_t b) { + return _mm_mullo_epi16(a, _mm_set1_epi16(b)); +} + +// vec_fma multiplies each uint16_t in |b| by |c|, adds the result to |a|, and +// returns the resulting vector. +static vec_t vec_fma(vec_t a, vec_t b, uint16_t c) { + return _mm_add_epi16(a, _mm_mullo_epi16(b, _mm_set1_epi16(c))); +} + +// vec3_rshift_word right-shifts the 24 uint16_t's in |v| by one uint16. +static void vec3_rshift_word(vec_t v[3]) { + // Intel's left and right shifting is backwards compared to the order in + // memory because they're based on little-endian order of words (and not just + // bytes). So the shifts in this function will be backwards from what one + // might expect. + const __m128i carry0 = _mm_srli_si128(v[0], 14); + v[0] = _mm_slli_si128(v[0], 2); + + const __m128i carry1 = _mm_srli_si128(v[1], 14); + v[1] = _mm_slli_si128(v[1], 2); + v[1] |= carry0; + + v[2] = _mm_slli_si128(v[2], 2); + v[2] |= carry1; +} + +// vec4_rshift_word right-shifts the 32 uint16_t's in |v| by one uint16. +static void vec4_rshift_word(vec_t v[4]) { + // Intel's left and right shifting is backwards compared to the order in + // memory because they're based on little-endian order of words (and not just + // bytes). So the shifts in this function will be backwards from what one + // might expect. 
+ const __m128i carry0 = _mm_srli_si128(v[0], 14); + v[0] = _mm_slli_si128(v[0], 2); + + const __m128i carry1 = _mm_srli_si128(v[1], 14); + v[1] = _mm_slli_si128(v[1], 2); + v[1] |= carry0; + + const __m128i carry2 = _mm_srli_si128(v[2], 14); + v[2] = _mm_slli_si128(v[2], 2); + v[2] |= carry1; + + v[3] = _mm_slli_si128(v[3], 2); + v[3] |= carry2; +} + +// vec_merge_3_5 takes the final three uint16_t's from |left|, appends the first +// five from |right|, and returns the resulting vector. +static vec_t vec_merge_3_5(vec_t left, vec_t right) { + return _mm_srli_si128(left, 10) | _mm_slli_si128(right, 6); +} + +// poly3_vec_lshift1 left-shifts the 768 bits in |a_s|, and in |a_a|, by one +// bit. +static void poly3_vec_lshift1(vec_t a_s[6], vec_t a_a[6]) { + vec_t carry_s = {0}; + vec_t carry_a = {0}; + + for (int i = 0; i < 6; i++) { + vec_t next_carry_s = _mm_srli_epi64(a_s[i], 63); + a_s[i] = _mm_slli_epi64(a_s[i], 1); + a_s[i] |= _mm_slli_si128(next_carry_s, 8); + a_s[i] |= carry_s; + carry_s = _mm_srli_si128(next_carry_s, 8); + + vec_t next_carry_a = _mm_srli_epi64(a_a[i], 63); + a_a[i] = _mm_slli_epi64(a_a[i], 1); + a_a[i] |= _mm_slli_si128(next_carry_a, 8); + a_a[i] |= carry_a; + carry_a = _mm_srli_si128(next_carry_a, 8); + } +} + +// poly3_vec_rshift1 right-shifts the 768 bits in |a_s|, and in |a_a|, by one +// bit. 
+static void poly3_vec_rshift1(vec_t a_s[6], vec_t a_a[6]) { + vec_t carry_s = {0}; + vec_t carry_a = {0}; + + for (int i = 5; i >= 0; i--) { + const vec_t next_carry_s = _mm_slli_epi64(a_s[i], 63); + a_s[i] = _mm_srli_epi64(a_s[i], 1); + a_s[i] |= _mm_srli_si128(next_carry_s, 8); + a_s[i] |= carry_s; + carry_s = _mm_slli_si128(next_carry_s, 8); + + const vec_t next_carry_a = _mm_slli_epi64(a_a[i], 63); + a_a[i] = _mm_srli_epi64(a_a[i], 1); + a_a[i] |= _mm_srli_si128(next_carry_a, 8); + a_a[i] |= carry_a; + carry_a = _mm_slli_si128(next_carry_a, 8); + } +} + +// vec_broadcast_bit duplicates the least-significant bit in |a| to all bits in +// a vector and returns the result. +static vec_t vec_broadcast_bit(vec_t a) { + return _mm_shuffle_epi32(_mm_srai_epi32(_mm_slli_epi64(a, 63), 31), + 0b01010101); +} + +// vec_get_word returns the |i|th uint16_t in |v|. (This is a macro because the +// compiler requires that |i| be a compile-time constant.) +#define vec_get_word(v, i) _mm_extract_epi16(v, i) + +#elif (defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) && defined(__ARM_NEON) + +#define HRSS_HAVE_VECTOR_UNIT +typedef uint16x8_t vec_t; + +// These functions perform the same actions as the SSE2 function of the same +// name, above. 
+ +static int vec_capable() { return CRYPTO_is_NEON_capable(); } + +static vec_t vec_add(vec_t a, vec_t b) { return a + b; } + +static vec_t vec_sub(vec_t a, vec_t b) { return a - b; } + +static vec_t vec_mul(vec_t a, uint16_t b) { return vmulq_n_u16(a, b); } + +static vec_t vec_fma(vec_t a, vec_t b, uint16_t c) { + return vmlaq_n_u16(a, b, c); +} + +static void vec3_rshift_word(vec_t v[3]) { + const uint16x8_t kZero = {0}; + v[2] = vextq_u16(v[1], v[2], 7); + v[1] = vextq_u16(v[0], v[1], 7); + v[0] = vextq_u16(kZero, v[0], 7); +} + +static void vec4_rshift_word(vec_t v[4]) { + const uint16x8_t kZero = {0}; + v[3] = vextq_u16(v[2], v[3], 7); + v[2] = vextq_u16(v[1], v[2], 7); + v[1] = vextq_u16(v[0], v[1], 7); + v[0] = vextq_u16(kZero, v[0], 7); +} + +static vec_t vec_merge_3_5(vec_t left, vec_t right) { + return vextq_u16(left, right, 5); +} + +static uint16_t vec_get_word(vec_t v, unsigned i) { return v[i]; } + +#if !defined(OPENSSL_AARCH64) + +static vec_t vec_broadcast_bit(vec_t a) { + a = (vec_t)vshrq_n_s16(((int16x8_t)a) << 15, 15); + return vdupq_lane_u16(vget_low_u16(a), 0); +} + +static void poly3_vec_lshift1(vec_t a_s[6], vec_t a_a[6]) { + vec_t carry_s = {0}; + vec_t carry_a = {0}; + const vec_t kZero = {0}; + + for (int i = 0; i < 6; i++) { + vec_t next_carry_s = a_s[i] >> 15; + a_s[i] <<= 1; + a_s[i] |= vextq_u16(kZero, next_carry_s, 7); + a_s[i] |= carry_s; + carry_s = vextq_u16(next_carry_s, kZero, 7); + + vec_t next_carry_a = a_a[i] >> 15; + a_a[i] <<= 1; + a_a[i] |= vextq_u16(kZero, next_carry_a, 7); + a_a[i] |= carry_a; + carry_a = vextq_u16(next_carry_a, kZero, 7); + } +} + +static void poly3_vec_rshift1(vec_t a_s[6], vec_t a_a[6]) { + vec_t carry_s = {0}; + vec_t carry_a = {0}; + const vec_t kZero = {0}; + + for (int i = 5; i >= 0; i--) { + vec_t next_carry_s = a_s[i] << 15; + a_s[i] >>= 1; + a_s[i] |= vextq_u16(next_carry_s, kZero, 1); + a_s[i] |= carry_s; + carry_s = vextq_u16(kZero, next_carry_s, 1); + + vec_t next_carry_a = a_a[i] << 15; + 
a_a[i] >>= 1; + a_a[i] |= vextq_u16(next_carry_a, kZero, 1); + a_a[i] |= carry_a; + carry_a = vextq_u16(kZero, next_carry_a, 1); + } +} + +#endif // !OPENSSL_AARCH64 + +#endif // (ARM || AARCH64) && NEON + +// Polynomials in this scheme have N terms. +#define N HRSS_N + +// Underlying data types and arithmetic operations. +// ------------------------------------------------ + +// Binary polynomials. + +// poly2 represents a degree-N polynomial over GF(2). The words are in little- +// endian order, i.e. the coefficient of x^0 is the LSB of the first word. The +// final word is only partially used since N is not a multiple of the word size. + +// Defined in internal.h: +// struct poly2 { +// crypto_word_t v[WORDS_PER_POLY]; +// }; + +static void poly2_zero(struct poly2 *p) { + OPENSSL_memset(&p->v[0], 0, sizeof(crypto_word_t) * WORDS_PER_POLY); +} + +// word_reverse returns |in| with the bits in reverse order. +static crypto_word_t word_reverse(crypto_word_t in) { +#if defined(OPENSSL_64_BIT) + static const crypto_word_t kMasks[6] = { + UINT64_C(0x5555555555555555), UINT64_C(0x3333333333333333), + UINT64_C(0x0f0f0f0f0f0f0f0f), UINT64_C(0x00ff00ff00ff00ff), + UINT64_C(0x0000ffff0000ffff), UINT64_C(0x00000000ffffffff), + }; +#else + static const crypto_word_t kMasks[5] = { + 0x55555555, 0x33333333, 0x0f0f0f0f, 0x00ff00ff, 0x0000ffff, + }; +#endif + + for (size_t i = 0; i < std::size(kMasks); i++) { + in = ((in >> (1 << i)) & kMasks[i]) | ((in & kMasks[i]) << (1 << i)); + } + + return in; +} + +// lsb_to_all replicates the least-significant bit of |v| to all bits of the +// word. This is used in bit-slicing operations to make a vector from a fixed +// value. +static crypto_word_t lsb_to_all(crypto_word_t v) { return 0u - (v & 1); } + +// poly2_mod_phiN reduces |p| by Φ(N). +static void poly2_mod_phiN(struct poly2 *p) { + // m is the term at x^700, replicated to every bit. 
+ const crypto_word_t m = + lsb_to_all(p->v[WORDS_PER_POLY - 1] >> (BITS_IN_LAST_WORD - 1)); + for (size_t i = 0; i < WORDS_PER_POLY; i++) { + p->v[i] ^= m; + } + p->v[WORDS_PER_POLY - 1] &= (UINT64_C(1) << (BITS_IN_LAST_WORD - 1)) - 1; +} + +// poly2_reverse_700 reverses the order of the first 700 bits of |in| and writes +// the result to |out|. +static void poly2_reverse_700(struct poly2 *out, const struct poly2 *in) { + struct poly2 t; + for (size_t i = 0; i < WORDS_PER_POLY; i++) { + t.v[i] = word_reverse(in->v[i]); + } + + static const size_t shift = BITS_PER_WORD - ((N - 1) % BITS_PER_WORD); + for (size_t i = 0; i < WORDS_PER_POLY - 1; i++) { + out->v[i] = t.v[WORDS_PER_POLY - 1 - i] >> shift; + out->v[i] |= t.v[WORDS_PER_POLY - 2 - i] << (BITS_PER_WORD - shift); + } + out->v[WORDS_PER_POLY - 1] = t.v[0] >> shift; +} + +// poly2_cswap exchanges the values of |a| and |b| if |swap| is all ones. +static void poly2_cswap(struct poly2 *a, struct poly2 *b, crypto_word_t swap) { + for (size_t i = 0; i < WORDS_PER_POLY; i++) { + const crypto_word_t sum = swap & (a->v[i] ^ b->v[i]); + a->v[i] ^= sum; + b->v[i] ^= sum; + } +} + +// poly2_fmadd sets |out| to |out| + |in| * m, where m is either +// |CONSTTIME_TRUE_W| or |CONSTTIME_FALSE_W|. +static void poly2_fmadd(struct poly2 *out, const struct poly2 *in, + crypto_word_t m) { + for (size_t i = 0; i < WORDS_PER_POLY; i++) { + out->v[i] ^= in->v[i] & m; + } +} + +// poly2_lshift1 left-shifts |p| by one bit. +static void poly2_lshift1(struct poly2 *p) { + crypto_word_t carry = 0; + for (size_t i = 0; i < WORDS_PER_POLY; i++) { + const crypto_word_t next_carry = p->v[i] >> (BITS_PER_WORD - 1); + p->v[i] <<= 1; + p->v[i] |= carry; + carry = next_carry; + } +} + +// poly2_rshift1 right-shifts |p| by one bit. 
+static void poly2_rshift1(struct poly2 *p) { + crypto_word_t carry = 0; + for (size_t i = WORDS_PER_POLY - 1; i < WORDS_PER_POLY; i--) { + const crypto_word_t next_carry = p->v[i] & 1; + p->v[i] >>= 1; + p->v[i] |= carry << (BITS_PER_WORD - 1); + carry = next_carry; + } +} + +// poly2_clear_top_bits clears the bits in the final word that are only for +// alignment. +static void poly2_clear_top_bits(struct poly2 *p) { + p->v[WORDS_PER_POLY - 1] &= (UINT64_C(1) << BITS_IN_LAST_WORD) - 1; +} + +// Ternary polynomials. + +// poly3 represents a degree-N polynomial over GF(3). Each coefficient is +// bitsliced across the |s| and |a| arrays, like this: +// +// s | a | value +// ----------------- +// 0 | 0 | 0 +// 0 | 1 | 1 +// 1 | 1 | -1 (aka 2) +// 1 | 0 | +// +// ('s' is for sign, and 'a' is the absolute value.) +// +// Once bitsliced as such, the following circuits can be used to implement +// addition and multiplication mod 3: +// +// (s3, a3) = (s1, a1) × (s2, a2) +// a3 = a1 ∧ a2 +// s3 = (s1 ⊕ s2) ∧ a3 +// +// (s3, a3) = (s1, a1) + (s2, a2) +// t = s1 ⊕ a2 +// s3 = t ∧ (s2 ⊕ a1) +// a3 = (a1 ⊕ a2) ∨ (t ⊕ s2) +// +// (s3, a3) = (s1, a1) - (s2, a2) +// t = a1 ⊕ a2 +// s3 = (s1 ⊕ a2) ∧ (t ⊕ s2) +// a3 = t ∨ (s1 ⊕ s2) +// +// Negating a value just involves XORing s by a. +// +// struct poly3 { +// struct poly2 s, a; +// }; + +static void poly3_zero(struct poly3 *p) { + poly2_zero(&p->s); + poly2_zero(&p->a); +} + +// poly3_reverse_700 reverses the order of the first 700 terms of |in| and +// writes them to |out|. +static void poly3_reverse_700(struct poly3 *out, const struct poly3 *in) { + poly2_reverse_700(&out->a, &in->a); + poly2_reverse_700(&out->s, &in->s); +} + +// poly3_word_mul sets (|out_s|, |out_a|) to (|s1|, |a1|) × (|s2|, |a2|). 
+static void poly3_word_mul(crypto_word_t *out_s, crypto_word_t *out_a, + const crypto_word_t s1, const crypto_word_t a1, + const crypto_word_t s2, const crypto_word_t a2) { + *out_a = a1 & a2; + *out_s = (s1 ^ s2) & *out_a; +} + +// poly3_word_add sets (|out_s|, |out_a|) to (|s1|, |a1|) + (|s2|, |a2|). +static void poly3_word_add(crypto_word_t *out_s, crypto_word_t *out_a, + const crypto_word_t s1, const crypto_word_t a1, + const crypto_word_t s2, const crypto_word_t a2) { + const crypto_word_t t = s1 ^ a2; + *out_s = t & (s2 ^ a1); + *out_a = (a1 ^ a2) | (t ^ s2); +} + +// poly3_word_sub sets (|out_s|, |out_a|) to (|s1|, |a1|) - (|s2|, |a2|). +static void poly3_word_sub(crypto_word_t *out_s, crypto_word_t *out_a, + const crypto_word_t s1, const crypto_word_t a1, + const crypto_word_t s2, const crypto_word_t a2) { + const crypto_word_t t = a1 ^ a2; + *out_s = (s1 ^ a2) & (t ^ s2); + *out_a = t | (s1 ^ s2); +} + +// poly3_mul_const sets |p| to |p|×m, where m = (ms, ma). +static void poly3_mul_const(struct poly3 *p, crypto_word_t ms, + crypto_word_t ma) { + ms = lsb_to_all(ms); + ma = lsb_to_all(ma); + + for (size_t i = 0; i < WORDS_PER_POLY; i++) { + poly3_word_mul(&p->s.v[i], &p->a.v[i], p->s.v[i], p->a.v[i], ms, ma); + } +} + +// poly3_fmadd sets |out| to |out| - |in|×m, where m is (ms, ma). +static void poly3_fmsub(struct poly3 *out, const struct poly3 *in, + crypto_word_t ms, crypto_word_t ma) { + crypto_word_t product_s, product_a; + for (size_t i = 0; i < WORDS_PER_POLY; i++) { + poly3_word_mul(&product_s, &product_a, in->s.v[i], in->a.v[i], ms, ma); + poly3_word_sub(&out->s.v[i], &out->a.v[i], out->s.v[i], out->a.v[i], + product_s, product_a); + } +} + +// final_bit_to_all replicates the bit in the final position of the last word to +// all the bits in the word. +static crypto_word_t final_bit_to_all(crypto_word_t v) { + return lsb_to_all(v >> (BITS_IN_LAST_WORD - 1)); +} + +// poly3_mod_phiN reduces |p| by Φ(N). 
+static void poly3_mod_phiN(struct poly3 *p) { + // In order to reduce by Φ(N) we subtract by the value of the greatest + // coefficient. + const crypto_word_t factor_s = final_bit_to_all(p->s.v[WORDS_PER_POLY - 1]); + const crypto_word_t factor_a = final_bit_to_all(p->a.v[WORDS_PER_POLY - 1]); + + for (size_t i = 0; i < WORDS_PER_POLY; i++) { + poly3_word_sub(&p->s.v[i], &p->a.v[i], p->s.v[i], p->a.v[i], factor_s, + factor_a); + } + + poly2_clear_top_bits(&p->s); + poly2_clear_top_bits(&p->a); +} + +static void poly3_cswap(struct poly3 *a, struct poly3 *b, crypto_word_t swap) { + poly2_cswap(&a->s, &b->s, swap); + poly2_cswap(&a->a, &b->a, swap); +} + +static void poly3_lshift1(struct poly3 *p) { + poly2_lshift1(&p->s); + poly2_lshift1(&p->a); +} + +static void poly3_rshift1(struct poly3 *p) { + poly2_rshift1(&p->s); + poly2_rshift1(&p->a); +} + +// poly3_span represents a pointer into a poly3. +struct poly3_span { + crypto_word_t *s; + crypto_word_t *a; +}; + +// poly3_span_add adds |n| words of values from |a| and |b| and writes the +// result to |out|. +static void poly3_span_add(const struct poly3_span *out, + const struct poly3_span *a, + const struct poly3_span *b, size_t n) { + for (size_t i = 0; i < n; i++) { + poly3_word_add(&out->s[i], &out->a[i], a->s[i], a->a[i], b->s[i], b->a[i]); + } +} + +// poly3_span_sub subtracts |n| words of |b| from |n| words of |a|. +static void poly3_span_sub(const struct poly3_span *a, + const struct poly3_span *b, size_t n) { + for (size_t i = 0; i < n; i++) { + poly3_word_sub(&a->s[i], &a->a[i], a->s[i], a->a[i], b->s[i], b->a[i]); + } +} + +// poly3_mul_aux is a recursive function that multiplies |n| words from |a| and +// |b| and writes 2×|n| words to |out|. Each call uses 2*ceil(n/2) elements of +// |scratch| and the function recurses, except if |n| == 1, when |scratch| isn't +// used and the recursion stops. For |n| in {11, 22}, the transitive total +// amount of |scratch| needed happens to be 2n+2. 
+static void poly3_mul_aux(const struct poly3_span *out, + const struct poly3_span *scratch, + const struct poly3_span *a, + const struct poly3_span *b, size_t n) { + if (n == 1) { + crypto_word_t r_s_low = 0, r_s_high = 0, r_a_low = 0, r_a_high = 0; + crypto_word_t b_s = b->s[0], b_a = b->a[0]; + const crypto_word_t a_s = a->s[0], a_a = a->a[0]; + + for (size_t i = 0; i < BITS_PER_WORD; i++) { + // Multiply (s, a) by the next value from (b_s, b_a). + crypto_word_t m_s, m_a; + poly3_word_mul(&m_s, &m_a, a_s, a_a, lsb_to_all(b_s), lsb_to_all(b_a)); + b_s >>= 1; + b_a >>= 1; + + if (i == 0) { + // Special case otherwise the code tries to shift by BITS_PER_WORD + // below, which is undefined. + r_s_low = m_s; + r_a_low = m_a; + continue; + } + + // Shift the multiplication result to the correct position. + const crypto_word_t m_s_low = m_s << i; + const crypto_word_t m_s_high = m_s >> (BITS_PER_WORD - i); + const crypto_word_t m_a_low = m_a << i; + const crypto_word_t m_a_high = m_a >> (BITS_PER_WORD - i); + + // Add into the result. + poly3_word_add(&r_s_low, &r_a_low, r_s_low, r_a_low, m_s_low, m_a_low); + poly3_word_add(&r_s_high, &r_a_high, r_s_high, r_a_high, m_s_high, + m_a_high); + } + + out->s[0] = r_s_low; + out->s[1] = r_s_high; + out->a[0] = r_a_low; + out->a[1] = r_a_high; + return; + } + + // Karatsuba multiplication. + // https://en.wikipedia.org/wiki/Karatsuba_algorithm + + // When |n| is odd, the two "halves" will have different lengths. The first + // is always the smaller. + const size_t low_len = n / 2; + const size_t high_len = n - low_len; + const struct poly3_span a_high = {&a->s[low_len], &a->a[low_len]}; + const struct poly3_span b_high = {&b->s[low_len], &b->a[low_len]}; + + // Store a_1 + a_0 in the first half of |out| and b_1 + b_0 in the second + // half. 
+ const struct poly3_span a_cross_sum = *out; + const struct poly3_span b_cross_sum = {&out->s[high_len], &out->a[high_len]}; + poly3_span_add(&a_cross_sum, a, &a_high, low_len); + poly3_span_add(&b_cross_sum, b, &b_high, low_len); + if (high_len != low_len) { + a_cross_sum.s[low_len] = a_high.s[low_len]; + a_cross_sum.a[low_len] = a_high.a[low_len]; + b_cross_sum.s[low_len] = b_high.s[low_len]; + b_cross_sum.a[low_len] = b_high.a[low_len]; + } + + const struct poly3_span child_scratch = {&scratch->s[2 * high_len], + &scratch->a[2 * high_len]}; + const struct poly3_span out_mid = {&out->s[low_len], &out->a[low_len]}; + const struct poly3_span out_high = {&out->s[2 * low_len], + &out->a[2 * low_len]}; + + // Calculate (a_1 + a_0) × (b_1 + b_0) and write to scratch buffer. + poly3_mul_aux(scratch, &child_scratch, &a_cross_sum, &b_cross_sum, high_len); + // Calculate a_1 × b_1. + poly3_mul_aux(&out_high, &child_scratch, &a_high, &b_high, high_len); + // Calculate a_0 × b_0. + poly3_mul_aux(out, &child_scratch, a, b, low_len); + + // Subtract those last two products from the first. + poly3_span_sub(scratch, out, low_len * 2); + poly3_span_sub(scratch, &out_high, high_len * 2); + + // Add the middle product into the output. + poly3_span_add(&out_mid, &out_mid, scratch, high_len * 2); +} + +// HRSS_poly3_mul sets |*out| to |x|×|y| mod Φ(N). 
+void bssl::HRSS_poly3_mul(struct poly3 *out, const struct poly3 *x, + const struct poly3 *y) { + crypto_word_t prod_s[WORDS_PER_POLY * 2]; + crypto_word_t prod_a[WORDS_PER_POLY * 2]; + crypto_word_t scratch_s[WORDS_PER_POLY * 2 + 2]; + crypto_word_t scratch_a[WORDS_PER_POLY * 2 + 2]; + const struct poly3_span prod_span = {prod_s, prod_a}; + const struct poly3_span scratch_span = {scratch_s, scratch_a}; + const struct poly3_span x_span = {(crypto_word_t *)x->s.v, + (crypto_word_t *)x->a.v}; + const struct poly3_span y_span = {(crypto_word_t *)y->s.v, + (crypto_word_t *)y->a.v}; + + poly3_mul_aux(&prod_span, &scratch_span, &x_span, &y_span, WORDS_PER_POLY); + + // |prod| needs to be reduced mod (𝑥^n - 1), which just involves adding the + // upper-half to the lower-half. However, N is 701, which isn't a multiple of + // BITS_PER_WORD, so the upper-half vectors all have to be shifted before + // being added to the lower-half. + for (size_t i = 0; i < WORDS_PER_POLY; i++) { + crypto_word_t v_s = prod_s[WORDS_PER_POLY + i - 1] >> BITS_IN_LAST_WORD; + v_s |= prod_s[WORDS_PER_POLY + i] << (BITS_PER_WORD - BITS_IN_LAST_WORD); + crypto_word_t v_a = prod_a[WORDS_PER_POLY + i - 1] >> BITS_IN_LAST_WORD; + v_a |= prod_a[WORDS_PER_POLY + i] << (BITS_PER_WORD - BITS_IN_LAST_WORD); + + poly3_word_add(&out->s.v[i], &out->a.v[i], prod_s[i], prod_a[i], v_s, v_a); + } + + poly3_mod_phiN(out); +} + +#if defined(HRSS_HAVE_VECTOR_UNIT) && !defined(OPENSSL_AARCH64) + +// poly3_vec_cswap swaps (|a_s|, |a_a|) and (|b_s|, |b_a|) if |swap| is +// |0xff..ff|. Otherwise, |swap| must be zero. +static void poly3_vec_cswap(vec_t a_s[6], vec_t a_a[6], vec_t b_s[6], + vec_t b_a[6], const vec_t swap) { + for (int i = 0; i < 6; i++) { + const vec_t sum_s = swap & (a_s[i] ^ b_s[i]); + a_s[i] ^= sum_s; + b_s[i] ^= sum_s; + + const vec_t sum_a = swap & (a_a[i] ^ b_a[i]); + a_a[i] ^= sum_a; + b_a[i] ^= sum_a; + } +} + +// poly3_vec_fmsub subtracts (|ms|, |ma|) × (|b_s|, |b_a|) from (|a_s|, |a_a|). 
+static void poly3_vec_fmsub(vec_t a_s[6], vec_t a_a[6], vec_t b_s[6], + vec_t b_a[6], const vec_t ms, const vec_t ma) { + for (int i = 0; i < 6; i++) { + // See the bitslice formula, above. + const vec_t s = b_s[i]; + const vec_t a = b_a[i]; + const vec_t product_a = a & ma; + const vec_t product_s = (s ^ ms) & product_a; + + const vec_t out_s = a_s[i]; + const vec_t out_a = a_a[i]; + const vec_t t = out_a ^ product_a; + a_s[i] = (out_s ^ product_a) & (t ^ product_s); + a_a[i] = t | (out_s ^ product_s); + } +} + +// poly3_invert_vec sets |*out| to |in|^-1, i.e. such that |out|×|in| == 1 mod +// Φ(N). +static void poly3_invert_vec(struct poly3 *out, const struct poly3 *in) { + // This algorithm is taken from section 7.1 of [SAFEGCD]. + const vec_t kZero = {0}; + const vec_t kOne = {1}; + static const uint8_t kBottomSixtyOne[sizeof(vec_t)] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x1f}; + + vec_t v_s[6], v_a[6], r_s[6], r_a[6], f_s[6], f_a[6], g_s[6], g_a[6]; + // v = 0 + memset(&v_s, 0, sizeof(v_s)); + memset(&v_a, 0, sizeof(v_a)); + // r = 1 + memset(&r_s, 0, sizeof(r_s)); + memset(&r_a, 0, sizeof(r_a)); + r_a[0] = kOne; + // f = all ones. + memset(f_s, 0, sizeof(f_s)); + memset(f_a, 0xff, 5 * sizeof(vec_t)); + memcpy(&f_a[5], kBottomSixtyOne, sizeof(kBottomSixtyOne)); + // g is the reversal of |in|. 
+ struct poly3 in_reversed; + poly3_reverse_700(&in_reversed, in); + g_s[5] = kZero; + memcpy(&g_s, &in_reversed.s.v, WORDS_PER_POLY * sizeof(crypto_word_t)); + g_a[5] = kZero; + memcpy(&g_a, &in_reversed.a.v, WORDS_PER_POLY * sizeof(crypto_word_t)); + + int delta = 1; + + for (size_t i = 0; i < (2 * (N - 1)) - 1; i++) { + poly3_vec_lshift1(v_s, v_a); + + const crypto_word_t delta_sign_bit = (delta >> (sizeof(delta) * 8 - 1)) & 1; + const crypto_word_t delta_is_non_negative = delta_sign_bit - 1; + const crypto_word_t delta_is_non_zero = ~constant_time_is_zero_w(delta); + const vec_t g_has_constant_term = vec_broadcast_bit(g_a[0]); + const vec_t mask_w = { + static_cast>( + delta_is_non_negative & delta_is_non_zero)}; + const vec_t mask = vec_broadcast_bit(mask_w) & g_has_constant_term; + + const vec_t c_a = vec_broadcast_bit(f_a[0] & g_a[0]); + const vec_t c_s = vec_broadcast_bit((f_s[0] ^ g_s[0]) & c_a); + + delta = constant_time_select_int(lsb_to_all(mask[0]), -delta, delta); + delta++; + + poly3_vec_cswap(f_s, f_a, g_s, g_a, mask); + poly3_vec_fmsub(g_s, g_a, f_s, f_a, c_s, c_a); + poly3_vec_rshift1(g_s, g_a); + + poly3_vec_cswap(v_s, v_a, r_s, r_a, mask); + poly3_vec_fmsub(r_s, r_a, v_s, v_a, c_s, c_a); + } + + assert(delta == 0); + memcpy(out->s.v, v_s, WORDS_PER_POLY * sizeof(crypto_word_t)); + memcpy(out->a.v, v_a, WORDS_PER_POLY * sizeof(crypto_word_t)); + poly3_mul_const(out, vec_get_word(f_s[0], 0), vec_get_word(f_a[0], 0)); + poly3_reverse_700(out, out); +} + +#endif // HRSS_HAVE_VECTOR_UNIT + +// HRSS_poly3_invert sets |*out| to |in|^-1, i.e. such that |out|×|in| == 1 mod +// Φ(N). +void bssl::HRSS_poly3_invert(struct poly3 *out, const struct poly3 *in) { + // The vector version of this function seems slightly slower on AArch64, but + // is useful on ARMv7 and x86-64. 
+#if defined(HRSS_HAVE_VECTOR_UNIT) && !defined(OPENSSL_AARCH64) + if (vec_capable()) { + poly3_invert_vec(out, in); + return; + } +#endif + + // This algorithm is taken from section 7.1 of [SAFEGCD]. + struct poly3 v, r, f, g; + // v = 0 + poly3_zero(&v); + // r = 1 + poly3_zero(&r); + r.a.v[0] = 1; + // f = all ones. + OPENSSL_memset(&f.s, 0, sizeof(struct poly2)); + OPENSSL_memset(&f.a, 0xff, sizeof(struct poly2)); + f.a.v[WORDS_PER_POLY - 1] >>= BITS_PER_WORD - BITS_IN_LAST_WORD; + // g is the reversal of |in|. + poly3_reverse_700(&g, in); + int delta = 1; + + for (size_t i = 0; i < (2 * (N - 1)) - 1; i++) { + poly3_lshift1(&v); + + const crypto_word_t delta_sign_bit = (delta >> (sizeof(delta) * 8 - 1)) & 1; + const crypto_word_t delta_is_non_negative = delta_sign_bit - 1; + const crypto_word_t delta_is_non_zero = ~constant_time_is_zero_w(delta); + const crypto_word_t g_has_constant_term = lsb_to_all(g.a.v[0]); + const crypto_word_t mask = + g_has_constant_term & delta_is_non_negative & delta_is_non_zero; + + crypto_word_t c_s, c_a; + poly3_word_mul(&c_s, &c_a, f.s.v[0], f.a.v[0], g.s.v[0], g.a.v[0]); + c_s = lsb_to_all(c_s); + c_a = lsb_to_all(c_a); + + delta = constant_time_select_int(mask, -delta, delta); + delta++; + + poly3_cswap(&f, &g, mask); + poly3_fmsub(&g, &f, c_s, c_a); + poly3_rshift1(&g); + + poly3_cswap(&v, &r, mask); + poly3_fmsub(&r, &v, c_s, c_a); + } + + assert(delta == 0); + poly3_mul_const(&v, f.s.v[0], f.a.v[0]); + poly3_reverse_700(out, &v); +} + +// Polynomials in Q. + +// Coefficients are reduced mod Q. (Q is clearly not prime, therefore the +// coefficients do not form a field.) +#define Q 8192 + +// VECS_PER_POLY is the number of 128-bit vectors needed to represent a +// polynomial. +#define COEFFICIENTS_PER_VEC (sizeof(vec_t) / sizeof(uint16_t)) +#define VECS_PER_POLY ((N + COEFFICIENTS_PER_VEC - 1) / COEFFICIENTS_PER_VEC) + +namespace { +// poly represents a polynomial with coefficients mod Q. 
Note that, while Q is a +// power of two, this does not operate in GF(Q). That would be a binary field +// but this is simply mod Q. Thus the coefficients are not a field. +// +// Coefficients are ordered little-endian, thus the coefficient of x^0 is the +// first element of the array. +struct poly { +#if defined(HRSS_HAVE_VECTOR_UNIT) + union { + // N + 3 = 704, which is a multiple of 64 and thus aligns things, esp for + // the vector code. + uint16_t v[N + 3]; + vec_t vectors[VECS_PER_POLY]; + }; +#else + // Even if !HRSS_HAVE_VECTOR_UNIT, external assembly may be called that + // requires alignment. + alignas(16) uint16_t v[N + 3]; +#endif +}; +} // namespace + +// poly_normalize zeros out the excess elements of |x| which are included only +// for alignment. +static void poly_normalize(struct poly *x) { + OPENSSL_memset(&x->v[N], 0, 3 * sizeof(uint16_t)); +} + +// poly_assert_normalized asserts that the excess elements of |x| are zeroed out +// for the cases that case. (E.g. |poly_mul_vec|.) +static void poly_assert_normalized(const struct poly *x) { + assert(x->v[N] == 0); + assert(x->v[N + 1] == 0); + assert(x->v[N + 2] == 0); +} + +namespace { +// POLY_MUL_SCRATCH contains space for the working variables needed by +// |poly_mul|. The contents afterwards may be discarded, but the object may also +// be reused with future |poly_mul| calls to save heap allocations. +// +// This object must have 32-byte alignment. +struct POLY_MUL_SCRATCH { + union { + // This is used by |poly_mul_novec|. + struct { + uint16_t prod[2 * N]; + uint16_t scratch[1318]; + } novec; + +#if defined(HRSS_HAVE_VECTOR_UNIT) + // This is used by |poly_mul_vec|. + struct { + vec_t prod[VECS_PER_POLY * 2]; + vec_t scratch[172]; + } vec; +#endif + +#if defined(POLY_RQ_MUL_ASM) + // This is the space used by |poly_Rq_mul|. 
+ uint8_t rq[POLY_MUL_RQ_SCRATCH_SPACE]; +#endif + } u; +}; +} // namespace + +#if defined(HRSS_HAVE_VECTOR_UNIT) + +// poly_mul_vec_aux is a recursive function that multiplies |n| words from |a| +// and |b| and writes 2×|n| words to |out|. Each call uses 2*ceil(n/2) elements +// of |scratch| and the function recurses, except if |n| < 3, when |scratch| +// isn't used and the recursion stops. If |n| == |VECS_PER_POLY| then |scratch| +// needs 172 elements. +static void poly_mul_vec_aux(vec_t *out, vec_t *scratch, const vec_t *a, + const vec_t *b, const size_t n) { + // In [HRSS], the technique they used for polynomial multiplication is + // described: they start with Toom-4 at the top level and then two layers of + // Karatsuba. Karatsuba is a specific instance of the general Toom–Cook + // decomposition, which splits an input n-ways and produces 2n-1 + // multiplications of those parts. So, starting with 704 coefficients (rounded + // up from 701 to have more factors of two), Toom-4 gives seven + // multiplications of degree-174 polynomials. Each round of Karatsuba (which + // is Toom-2) increases the number of multiplications by a factor of three + // while halving the size of the values being multiplied. So two rounds gives + // 63 multiplications of degree-44 polynomials. Then they (I think) form + // vectors by gathering all 63 coefficients of each power together, for each + // input, and doing more rounds of Karatsuba on the vectors until they bottom- + // out somewhere with schoolbook multiplication. + // + // I tried something like that for NEON. NEON vectors are 128 bits so hold + // eight coefficients. I wrote a function that did Karatsuba on eight + // multiplications at the same time, using such vectors, and a Go script that + // decomposed from degree-704, with Karatsuba in non-transposed form, until it + // reached multiplications of degree-44. 
It batched up those 81 + // multiplications into lots of eight with a single one left over (which was + // handled directly). + // + // It worked, but it was significantly slower than the dumb algorithm used + // below. Potentially that was because I misunderstood how [HRSS] did it, or + // because Clang is bad at generating good code from NEON intrinsics on ARMv7. + // (Which is true: the code generated by Clang for the below is pretty crap.) + // + // This algorithm is much simpler. It just does Karatsuba decomposition all + // the way down and never transposes. When it gets down to degree-16 or + // degree-24 values, they are multiplied using schoolbook multiplication and + // vector intrinsics. The vector operations form each of the eight phase- + // shifts of one of the inputs, point-wise multiply, and then add into the + // result at the correct place. This means that 33% (degree-16) or 25% + // (degree-24) of the multiplies and adds are wasted, but it does ok. + if (n == 2) { + vec_t result[4]; + vec_t vec_a[3]; + static const vec_t kZero = {0}; + vec_a[0] = a[0]; + vec_a[1] = a[1]; + vec_a[2] = kZero; + + result[0] = vec_mul(vec_a[0], vec_get_word(b[0], 0)); + result[1] = vec_mul(vec_a[1], vec_get_word(b[0], 0)); + + result[1] = vec_fma(result[1], vec_a[0], vec_get_word(b[1], 0)); + result[2] = vec_mul(vec_a[1], vec_get_word(b[1], 0)); + result[3] = kZero; + + vec3_rshift_word(vec_a); + +#define BLOCK(x, y) \ + do { \ + result[x + 0] = \ + vec_fma(result[x + 0], vec_a[0], vec_get_word(b[y / 8], y % 8)); \ + result[x + 1] = \ + vec_fma(result[x + 1], vec_a[1], vec_get_word(b[y / 8], y % 8)); \ + result[x + 2] = \ + vec_fma(result[x + 2], vec_a[2], vec_get_word(b[y / 8], y % 8)); \ + } while (0) + + BLOCK(0, 1); + BLOCK(1, 9); + + vec3_rshift_word(vec_a); + + BLOCK(0, 2); + BLOCK(1, 10); + + vec3_rshift_word(vec_a); + + BLOCK(0, 3); + BLOCK(1, 11); + + vec3_rshift_word(vec_a); + + BLOCK(0, 4); + BLOCK(1, 12); + + vec3_rshift_word(vec_a); + + BLOCK(0, 5); + 
BLOCK(1, 13); + + vec3_rshift_word(vec_a); + + BLOCK(0, 6); + BLOCK(1, 14); + + vec3_rshift_word(vec_a); + + BLOCK(0, 7); + BLOCK(1, 15); + +#undef BLOCK + + memcpy(out, result, sizeof(result)); + return; + } + + if (n == 3) { + vec_t result[6]; + vec_t vec_a[4]; + static const vec_t kZero = {0}; + vec_a[0] = a[0]; + vec_a[1] = a[1]; + vec_a[2] = a[2]; + vec_a[3] = kZero; + + result[0] = vec_mul(a[0], vec_get_word(b[0], 0)); + result[1] = vec_mul(a[1], vec_get_word(b[0], 0)); + result[2] = vec_mul(a[2], vec_get_word(b[0], 0)); + +#define BLOCK_PRE(x, y) \ + do { \ + result[x + 0] = \ + vec_fma(result[x + 0], vec_a[0], vec_get_word(b[y / 8], y % 8)); \ + result[x + 1] = \ + vec_fma(result[x + 1], vec_a[1], vec_get_word(b[y / 8], y % 8)); \ + result[x + 2] = vec_mul(vec_a[2], vec_get_word(b[y / 8], y % 8)); \ + } while (0) + + BLOCK_PRE(1, 8); + BLOCK_PRE(2, 16); + + result[5] = kZero; + + vec4_rshift_word(vec_a); + +#define BLOCK(x, y) \ + do { \ + result[x + 0] = \ + vec_fma(result[x + 0], vec_a[0], vec_get_word(b[y / 8], y % 8)); \ + result[x + 1] = \ + vec_fma(result[x + 1], vec_a[1], vec_get_word(b[y / 8], y % 8)); \ + result[x + 2] = \ + vec_fma(result[x + 2], vec_a[2], vec_get_word(b[y / 8], y % 8)); \ + result[x + 3] = \ + vec_fma(result[x + 3], vec_a[3], vec_get_word(b[y / 8], y % 8)); \ + } while (0) + + BLOCK(0, 1); + BLOCK(1, 9); + BLOCK(2, 17); + + vec4_rshift_word(vec_a); + + BLOCK(0, 2); + BLOCK(1, 10); + BLOCK(2, 18); + + vec4_rshift_word(vec_a); + + BLOCK(0, 3); + BLOCK(1, 11); + BLOCK(2, 19); + + vec4_rshift_word(vec_a); + + BLOCK(0, 4); + BLOCK(1, 12); + BLOCK(2, 20); + + vec4_rshift_word(vec_a); + + BLOCK(0, 5); + BLOCK(1, 13); + BLOCK(2, 21); + + vec4_rshift_word(vec_a); + + BLOCK(0, 6); + BLOCK(1, 14); + BLOCK(2, 22); + + vec4_rshift_word(vec_a); + + BLOCK(0, 7); + BLOCK(1, 15); + BLOCK(2, 23); + +#undef BLOCK +#undef BLOCK_PRE + + memcpy(out, result, sizeof(result)); + + return; + } + + // Karatsuba multiplication. 
+ // https://en.wikipedia.org/wiki/Karatsuba_algorithm + + // When |n| is odd, the two "halves" will have different lengths. The first is + // always the smaller. + const size_t low_len = n / 2; + const size_t high_len = n - low_len; + const vec_t *a_high = &a[low_len]; + const vec_t *b_high = &b[low_len]; + + // Store a_1 + a_0 in the first half of |out| and b_1 + b_0 in the second + // half. + for (size_t i = 0; i < low_len; i++) { + out[i] = vec_add(a_high[i], a[i]); + out[high_len + i] = vec_add(b_high[i], b[i]); + } + if (high_len != low_len) { + out[low_len] = a_high[low_len]; + out[high_len + low_len] = b_high[low_len]; + } + + vec_t *const child_scratch = &scratch[2 * high_len]; + // Calculate (a_1 + a_0) × (b_1 + b_0) and write to scratch buffer. + poly_mul_vec_aux(scratch, child_scratch, out, &out[high_len], high_len); + // Calculate a_1 × b_1. + poly_mul_vec_aux(&out[low_len * 2], child_scratch, a_high, b_high, high_len); + // Calculate a_0 × b_0. + poly_mul_vec_aux(out, child_scratch, a, b, low_len); + + // Subtract those last two products from the first. + for (size_t i = 0; i < low_len * 2; i++) { + scratch[i] = vec_sub(scratch[i], vec_add(out[i], out[low_len * 2 + i])); + } + if (low_len != high_len) { + scratch[low_len * 2] = vec_sub(scratch[low_len * 2], out[low_len * 4]); + scratch[low_len * 2 + 1] = + vec_sub(scratch[low_len * 2 + 1], out[low_len * 4 + 1]); + } + + // Add the middle product into the output. + for (size_t i = 0; i < high_len * 2; i++) { + out[low_len + i] = vec_add(out[low_len + i], scratch[i]); + } +} + +// poly_mul_vec sets |*out| to |x|×|y| mod (𝑥^n - 1). 
+static void poly_mul_vec(struct POLY_MUL_SCRATCH *scratch, struct poly *out, + const struct poly *x, const struct poly *y) { + static_assert(sizeof(out->v) == sizeof(vec_t) * VECS_PER_POLY, + "struct poly is the wrong size"); + static_assert(alignof(struct poly) == alignof(vec_t), + "struct poly has incorrect alignment"); + poly_assert_normalized(x); + poly_assert_normalized(y); + + vec_t *const prod = scratch->u.vec.prod; + vec_t *const aux_scratch = scratch->u.vec.scratch; + poly_mul_vec_aux(prod, aux_scratch, x->vectors, y->vectors, VECS_PER_POLY); + + // |prod| needs to be reduced mod (𝑥^n - 1), which just involves adding the + // upper-half to the lower-half. However, N is 701, which isn't a multiple of + // the vector size, so the upper-half vectors all have to be shifted before + // being added to the lower-half. + vec_t *out_vecs = (vec_t *)out->v; + + for (size_t i = 0; i < VECS_PER_POLY; i++) { + const vec_t prev = prod[VECS_PER_POLY - 1 + i]; + const vec_t this_vec = prod[VECS_PER_POLY + i]; + out_vecs[i] = vec_add(prod[i], vec_merge_3_5(prev, this_vec)); + } + + OPENSSL_memset(&out->v[N], 0, 3 * sizeof(uint16_t)); +} + +#endif // HRSS_HAVE_VECTOR_UNIT + +// poly_mul_novec_aux writes the product of |a| and |b| to |out|, using +// |scratch| as scratch space. It'll use Karatsuba if the inputs are large +// enough to warrant it. Each call uses 2*ceil(n/2) elements of |scratch| and +// the function recurses, except if |n| < 64, when |scratch| isn't used and the +// recursion stops. If |n| == |N| then |scratch| needs 1318 elements. +static void poly_mul_novec_aux(uint16_t *out, uint16_t *scratch, + const uint16_t *a, const uint16_t *b, size_t n) { + static const size_t kSchoolbookLimit = 64; + if (n < kSchoolbookLimit) { + OPENSSL_memset(out, 0, sizeof(uint16_t) * n * 2); + for (size_t i = 0; i < n; i++) { + for (size_t j = 0; j < n; j++) { + out[i + j] += (unsigned)a[i] * b[j]; + } + } + + return; + } + + // Karatsuba multiplication. 
+ // https://en.wikipedia.org/wiki/Karatsuba_algorithm + + // When |n| is odd, the two "halves" will have different lengths. The + // first is always the smaller. + const size_t low_len = n / 2; + const size_t high_len = n - low_len; + const uint16_t *const a_high = &a[low_len]; + const uint16_t *const b_high = &b[low_len]; + + for (size_t i = 0; i < low_len; i++) { + out[i] = a_high[i] + a[i]; + out[high_len + i] = b_high[i] + b[i]; + } + if (high_len != low_len) { + out[low_len] = a_high[low_len]; + out[high_len + low_len] = b_high[low_len]; + } + + uint16_t *const child_scratch = &scratch[2 * high_len]; + poly_mul_novec_aux(scratch, child_scratch, out, &out[high_len], high_len); + poly_mul_novec_aux(&out[low_len * 2], child_scratch, a_high, b_high, + high_len); + poly_mul_novec_aux(out, child_scratch, a, b, low_len); + + for (size_t i = 0; i < low_len * 2; i++) { + scratch[i] -= out[i] + out[low_len * 2 + i]; + } + if (low_len != high_len) { + scratch[low_len * 2] -= out[low_len * 4]; + assert(out[low_len * 4 + 1] == 0); + } + + for (size_t i = 0; i < high_len * 2; i++) { + out[low_len + i] += scratch[i]; + } +} + +// poly_mul_novec sets |*out| to |x|×|y| mod (𝑥^n - 1). 
+static void poly_mul_novec(struct POLY_MUL_SCRATCH *scratch, struct poly *out, + const struct poly *x, const struct poly *y) { + uint16_t *const prod = scratch->u.novec.prod; + uint16_t *const aux_scratch = scratch->u.novec.scratch; + poly_mul_novec_aux(prod, aux_scratch, x->v, y->v, N); + + for (size_t i = 0; i < N; i++) { + out->v[i] = prod[i] + prod[i + N]; + } + OPENSSL_memset(&out->v[N], 0, 3 * sizeof(uint16_t)); +} + +static void poly_mul(struct POLY_MUL_SCRATCH *scratch, struct poly *r, + const struct poly *a, const struct poly *b) { +#if defined(POLY_RQ_MUL_ASM) + if (CRYPTO_is_AVX2_capable()) { + poly_Rq_mul(r->v, a->v, b->v, scratch->u.rq); + poly_normalize(r); + } else +#endif + +#if defined(HRSS_HAVE_VECTOR_UNIT) + if (vec_capable()) { + poly_mul_vec(scratch, r, a, b); + } else +#endif + + // Fallback, non-vector case. + { + poly_mul_novec(scratch, r, a, b); + } + + poly_assert_normalized(r); +} + +// poly_mul_x_minus_1 sets |p| to |p|×(𝑥 - 1) mod (𝑥^n - 1). +static void poly_mul_x_minus_1(struct poly *p) { + // Multiplying by (𝑥 - 1) means negating each coefficient and adding in + // the value of the previous one. + const uint16_t orig_final_coefficient = p->v[N - 1]; + + for (size_t i = N - 1; i > 0; i--) { + p->v[i] = p->v[i - 1] - p->v[i]; + } + p->v[0] = orig_final_coefficient - p->v[0]; +} + +// poly_mod_phiN sets |p| to |p| mod Φ(N). +static void poly_mod_phiN(struct poly *p) { + const uint16_t coeff700 = p->v[N - 1]; + + for (unsigned i = 0; i < N; i++) { + p->v[i] -= coeff700; + } +} + +// poly_clamp reduces each coefficient mod Q. +static void poly_clamp(struct poly *p) { + for (unsigned i = 0; i < N; i++) { + p->v[i] &= Q - 1; + } +} + + +// Conversion functions +// -------------------- + +// poly2_from_poly sets |*out| to |in| mod 2. 
+static void poly2_from_poly(struct poly2 *out, const struct poly *in) { + crypto_word_t *words = out->v; + unsigned shift = 0; + crypto_word_t word = 0; + + for (unsigned i = 0; i < N; i++) { + word >>= 1; + word |= (crypto_word_t)(in->v[i] & 1) << (BITS_PER_WORD - 1); + shift++; + + if (shift == BITS_PER_WORD) { + *words = word; + words++; + word = 0; + shift = 0; + } + } + + word >>= BITS_PER_WORD - shift; + *words = word; +} + +// mod3 treats |a| as a signed number and returns |a| mod 3. +static uint16_t mod3(int16_t a) { + const int16_t q = ((int32_t)a * 21845) >> 16; + int16_t ret = a - 3 * q; + // At this point, |ret| is in {0, 1, 2, 3} and that needs to be mapped to {0, + // 1, 2, 0}. + return ret & ((ret & (ret >> 1)) - 1); +} + +// poly3_from_poly sets |*out| to |in|. +static void poly3_from_poly(struct poly3 *out, const struct poly *in) { + crypto_word_t *words_s = out->s.v; + crypto_word_t *words_a = out->a.v; + crypto_word_t s = 0; + crypto_word_t a = 0; + unsigned shift = 0; + + for (unsigned i = 0; i < N; i++) { + // This duplicates the 13th bit upwards to the top of the uint16, + // essentially treating it as a sign bit and converting into a signed int16. + // The signed value is reduced mod 3, yielding {0, 1, 2}. + const uint16_t v = mod3((int16_t)(in->v[i] << 3) >> 3); + s >>= 1; + const crypto_word_t s_bit = (crypto_word_t)(v & 2) << (BITS_PER_WORD - 2); + s |= s_bit; + a >>= 1; + a |= s_bit | (crypto_word_t)(v & 1) << (BITS_PER_WORD - 1); + shift++; + + if (shift == BITS_PER_WORD) { + *words_s = s; + words_s++; + *words_a = a; + words_a++; + s = a = 0; + shift = 0; + } + } + + s >>= BITS_PER_WORD - shift; + a >>= BITS_PER_WORD - shift; + *words_s = s; + *words_a = a; +} + +// poly3_from_poly_checked sets |*out| to |in|, which has coefficients in {0, 1, +// Q-1}. It returns a mask indicating whether all coefficients were found to be +// in that set. 
+static crypto_word_t poly3_from_poly_checked(struct poly3 *out, + const struct poly *in) { + crypto_word_t *words_s = out->s.v; + crypto_word_t *words_a = out->a.v; + crypto_word_t s = 0; + crypto_word_t a = 0; + unsigned shift = 0; + crypto_word_t ok = CONSTTIME_TRUE_W; + + for (unsigned i = 0; i < N; i++) { + const uint16_t v = in->v[i]; + // Maps {0, 1, Q-1} to {0, 1, 2}. + uint16_t mod3 = v & 3; + mod3 ^= mod3 >> 1; + const uint16_t expected = (uint16_t)((~((mod3 >> 1) - 1)) | mod3) % Q; + ok &= constant_time_eq_w(v, expected); + + s >>= 1; + const crypto_word_t s_bit = (crypto_word_t)(mod3 & 2) + << (BITS_PER_WORD - 2); + s |= s_bit; + a >>= 1; + a |= s_bit | (crypto_word_t)(mod3 & 1) << (BITS_PER_WORD - 1); + shift++; + + if (shift == BITS_PER_WORD) { + *words_s = s; + words_s++; + *words_a = a; + words_a++; + s = a = 0; + shift = 0; + } + } + + s >>= BITS_PER_WORD - shift; + a >>= BITS_PER_WORD - shift; + *words_s = s; + *words_a = a; + + return ok; +} + +static void poly_from_poly2(struct poly *out, const struct poly2 *in) { + const crypto_word_t *words = in->v; + unsigned shift = 0; + crypto_word_t word = *words; + + for (unsigned i = 0; i < N; i++) { + out->v[i] = word & 1; + word >>= 1; + shift++; + + if (shift == BITS_PER_WORD) { + words++; + word = *words; + shift = 0; + } + } + + poly_normalize(out); +} + +static void poly_from_poly3(struct poly *out, const struct poly3 *in) { + const crypto_word_t *words_s = in->s.v; + const crypto_word_t *words_a = in->a.v; + crypto_word_t word_s = ~(*words_s); + crypto_word_t word_a = *words_a; + unsigned shift = 0; + + for (unsigned i = 0; i < N; i++) { + out->v[i] = (uint16_t)(word_s & 1) - 1; + out->v[i] |= word_a & 1; + word_s >>= 1; + word_a >>= 1; + shift++; + + if (shift == BITS_PER_WORD) { + words_s++; + words_a++; + word_s = ~(*words_s); + word_a = *words_a; + shift = 0; + } + } + + poly_normalize(out); +} + +// Polynomial inversion +// -------------------- + +// poly_invert_mod2 sets |*out| to |in^-1| 
(i.e. such that |*out|×|in| = 1 mod +// Φ(N)), all mod 2. This isn't useful in itself, but is part of doing inversion +// mod Q. +static void poly_invert_mod2(struct poly *out, const struct poly *in) { + // This algorithm is taken from section 7.1 of [SAFEGCD]. + struct poly2 v, r, f, g; + + // v = 0 + poly2_zero(&v); + // r = 1 + poly2_zero(&r); + r.v[0] = 1; + // f = all ones. + OPENSSL_memset(&f, 0xff, sizeof(struct poly2)); + f.v[WORDS_PER_POLY - 1] >>= BITS_PER_WORD - BITS_IN_LAST_WORD; + // g is the reversal of |in|. + poly2_from_poly(&g, in); + poly2_mod_phiN(&g); + poly2_reverse_700(&g, &g); + int delta = 1; + + for (size_t i = 0; i < (2 * (N - 1)) - 1; i++) { + poly2_lshift1(&v); + + const crypto_word_t delta_sign_bit = (delta >> (sizeof(delta) * 8 - 1)) & 1; + const crypto_word_t delta_is_non_negative = delta_sign_bit - 1; + const crypto_word_t delta_is_non_zero = ~constant_time_is_zero_w(delta); + const crypto_word_t g_has_constant_term = lsb_to_all(g.v[0]); + const crypto_word_t mask = + g_has_constant_term & delta_is_non_negative & delta_is_non_zero; + + const crypto_word_t c = lsb_to_all(f.v[0] & g.v[0]); + + delta = constant_time_select_int(mask, -delta, delta); + delta++; + + poly2_cswap(&f, &g, mask); + poly2_fmadd(&g, &f, c); + poly2_rshift1(&g); + + poly2_cswap(&v, &r, mask); + poly2_fmadd(&r, &v, c); + } + + assert(delta == 0); + assert(f.v[0] & 1); + poly2_reverse_700(&v, &v); + poly_from_poly2(out, &v); + poly_assert_normalized(out); +} + +// poly_invert sets |*out| to |in^-1| (i.e. such that |*out|×|in| = 1 mod Φ(N)). +static void poly_invert(struct POLY_MUL_SCRATCH *scratch, struct poly *out, + const struct poly *in) { + // Inversion mod Q, which is done based on the result of inverting mod + // 2. See [NTRUTN14] paper, bottom of page two. + struct poly a, *b, tmp; + + // a = -in. + for (unsigned i = 0; i < N; i++) { + a.v[i] = -in->v[i]; + } + poly_normalize(&a); + + // b = in^-1 mod 2. 
+ b = out; + poly_invert_mod2(b, in); + + // We are working mod Q=2**13 and we need to iterate ceil(log_2(13)) + // times, which is four. + for (unsigned i = 0; i < 4; i++) { + poly_mul(scratch, &tmp, &a, b); + tmp.v[0] += 2; + poly_mul(scratch, b, b, &tmp); + } + + poly_assert_normalized(out); +} + +// Marshal and unmarshal functions for various basic types. +// -------------------------------------------------------- + +#define POLY_BYTES 1138 + +// poly_marshal serialises all but the final coefficient of |in| to |out|. +static void poly_marshal(uint8_t out[POLY_BYTES], const struct poly *in) { + const uint16_t *p = in->v; + + for (size_t i = 0; i < N / 8; i++) { + out[0] = p[0]; + out[1] = (0x1f & (p[0] >> 8)) | ((p[1] & 0x07) << 5); + out[2] = p[1] >> 3; + out[3] = (3 & (p[1] >> 11)) | ((p[2] & 0x3f) << 2); + out[4] = (0x7f & (p[2] >> 6)) | ((p[3] & 0x01) << 7); + out[5] = p[3] >> 1; + out[6] = (0xf & (p[3] >> 9)) | ((p[4] & 0x0f) << 4); + out[7] = p[4] >> 4; + out[8] = (1 & (p[4] >> 12)) | ((p[5] & 0x7f) << 1); + out[9] = (0x3f & (p[5] >> 7)) | ((p[6] & 0x03) << 6); + out[10] = p[6] >> 2; + out[11] = (7 & (p[6] >> 10)) | ((p[7] & 0x1f) << 3); + out[12] = p[7] >> 5; + + p += 8; + out += 13; + } + + // There are four remaining values. + out[0] = p[0]; + out[1] = (0x1f & (p[0] >> 8)) | ((p[1] & 0x07) << 5); + out[2] = p[1] >> 3; + out[3] = (3 & (p[1] >> 11)) | ((p[2] & 0x3f) << 2); + out[4] = (0x7f & (p[2] >> 6)) | ((p[3] & 0x01) << 7); + out[5] = p[3] >> 1; + out[6] = 0xf & (p[3] >> 9); +} + +// poly_unmarshal parses the output of |poly_marshal| and sets |out| such that +// all but the final coefficients match, and the final coefficient is calculated +// such that evaluating |out| at one results in zero. It returns one on success +// or zero if |in| is an invalid encoding. 
+static int poly_unmarshal(struct poly *out, const uint8_t in[POLY_BYTES]) { + uint16_t *p = out->v; + + for (size_t i = 0; i < N / 8; i++) { + p[0] = (uint16_t)(in[0]) | (uint16_t)(in[1] & 0x1f) << 8; + p[1] = (uint16_t)(in[1] >> 5) | (uint16_t)(in[2]) << 3 | + (uint16_t)(in[3] & 3) << 11; + p[2] = (uint16_t)(in[3] >> 2) | (uint16_t)(in[4] & 0x7f) << 6; + p[3] = (uint16_t)(in[4] >> 7) | (uint16_t)(in[5]) << 1 | + (uint16_t)(in[6] & 0xf) << 9; + p[4] = (uint16_t)(in[6] >> 4) | (uint16_t)(in[7]) << 4 | + (uint16_t)(in[8] & 1) << 12; + p[5] = (uint16_t)(in[8] >> 1) | (uint16_t)(in[9] & 0x3f) << 7; + p[6] = (uint16_t)(in[9] >> 6) | (uint16_t)(in[10]) << 2 | + (uint16_t)(in[11] & 7) << 10; + p[7] = (uint16_t)(in[11] >> 3) | (uint16_t)(in[12]) << 5; + + p += 8; + in += 13; + } + + // There are four coefficients remaining. + p[0] = (uint16_t)(in[0]) | (uint16_t)(in[1] & 0x1f) << 8; + p[1] = (uint16_t)(in[1] >> 5) | (uint16_t)(in[2]) << 3 | + (uint16_t)(in[3] & 3) << 11; + p[2] = (uint16_t)(in[3] >> 2) | (uint16_t)(in[4] & 0x7f) << 6; + p[3] = (uint16_t)(in[4] >> 7) | (uint16_t)(in[5]) << 1 | + (uint16_t)(in[6] & 0xf) << 9; + + for (unsigned i = 0; i < N - 1; i++) { + out->v[i] = (int16_t)(out->v[i] << 3) >> 3; + } + + // There are four unused bits in the last byte. We require them to be zero. + if ((in[6] & 0xf0) != 0) { + return 0; + } + + // Set the final coefficient as specified in [HRSSNIST] 1.9.2 step 6. + uint32_t sum = 0; + for (size_t i = 0; i < N - 1; i++) { + sum += out->v[i]; + } + + out->v[N - 1] = (uint16_t)(0u - sum); + poly_normalize(out); + + return 1; +} + +// mod3_from_modQ maps {0, 1, Q-1, 65535} -> {0, 1, 2, 2}. Note that |v| may +// have an invalid value when processing attacker-controlled inputs. +static uint16_t mod3_from_modQ(uint16_t v) { + v &= 3; + return v ^ (v >> 1); +} + +// poly_marshal_mod3 marshals |in| to |out| where the coefficients of |in| are +// all in {0, 1, Q-1, 65535} and |in| is mod Φ(N). 
(Note that coefficients may +// have invalid values when processing attacker-controlled inputs.) +static void poly_marshal_mod3(uint8_t out[HRSS_POLY3_BYTES], + const struct poly *in) { + const uint16_t *coeffs = in->v; + + // Only 700 coefficients are marshaled because in[700] must be zero. + assert(coeffs[N - 1] == 0); + + for (size_t i = 0; i < HRSS_POLY3_BYTES; i++) { + const uint16_t coeffs0 = mod3_from_modQ(coeffs[0]); + const uint16_t coeffs1 = mod3_from_modQ(coeffs[1]); + const uint16_t coeffs2 = mod3_from_modQ(coeffs[2]); + const uint16_t coeffs3 = mod3_from_modQ(coeffs[3]); + const uint16_t coeffs4 = mod3_from_modQ(coeffs[4]); + out[i] = coeffs0 + coeffs1 * 3 + coeffs2 * 9 + coeffs3 * 27 + coeffs4 * 81; + coeffs += 5; + } +} + +// HRSS-specific functions +// ----------------------- + +// poly_short_sample samples a vector of values in {0xffff (i.e. -1), 0, 1}. +// This is the same action as the algorithm in [HRSSNIST] section 1.8.1, but +// with HRSS-SXY the sampling algorithm is now a private detail of the +// implementation (previously it had to match between two parties). This +// function uses that freedom to implement a flatter distribution of values. +static void poly_short_sample(struct poly *out, + const uint8_t in[HRSS_SAMPLE_BYTES]) { + static_assert(HRSS_SAMPLE_BYTES == N - 1, "HRSS_SAMPLE_BYTES incorrect"); + for (size_t i = 0; i < N - 1; i++) { + uint16_t v = mod3(in[i]); + // Map {0, 1, 2} -> {0, 1, 0xffff} + v |= ((v >> 1) ^ 1) - 1; + out->v[i] = v; + } + out->v[N - 1] = 0; + poly_normalize(out); +} + +// poly_short_sample_plus performs the T+ sample as defined in [HRSSNIST], +// section 1.8.2. +static void poly_short_sample_plus(struct poly *out, + const uint8_t in[HRSS_SAMPLE_BYTES]) { + poly_short_sample(out, in); + + // sum (and the product in the for loop) will overflow. But that's fine + // because |sum| is bound by +/- (N-2), and N < 2^15 so it works out. 
+ uint16_t sum = 0; + for (unsigned i = 0; i < N - 2; i++) { + sum += (unsigned)out->v[i] * out->v[i + 1]; + } + + // If the sum is negative, flip the sign of even-positioned coefficients. (See + // page 8 of [HRSS].) + sum = ((int16_t)sum) >> 15; + const uint16_t scale = sum | (~sum & 1); + for (unsigned i = 0; i < N; i += 2) { + out->v[i] = (unsigned)out->v[i] * scale; + } + poly_assert_normalized(out); +} + +// poly_lift computes the function discussed in [HRSS], appendix B. +static void poly_lift(struct poly *out, const struct poly *a) { + // We wish to calculate a/(𝑥-1) mod Φ(N) over GF(3), where Φ(N) is the + // Nth cyclotomic polynomial, i.e. 1 + 𝑥 + … + 𝑥^700 (since N is prime). + + // 1/(𝑥-1) has a fairly basic structure that we can exploit to speed this up: + // + // R. = PolynomialRing(GF(3)…) + // inv = R.cyclotomic_polynomial(1).inverse_mod(R.cyclotomic_polynomial(n)) + // list(inv)[:15] + // [1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2, 1, 0, 2] + // + // This three-element pattern of coefficients repeats for the whole + // polynomial. + // + // Next define the overbar operator such that z̅ = z[0] + + // reverse(z[1:]). (Index zero of a polynomial here is the coefficient + // of the constant term. So index one is the coefficient of 𝑥 and so + // on.) + // + // A less odd way to define this is to see that z̅ negates the indexes, + // so z̅[0] = z[-0], z̅[1] = z[-1] and so on. + // + // The use of z̅ is that, when working mod (𝑥^701 - 1), vz[0] = , vz[1] = , …. (Where is the inner product: the sum + // of the point-wise products.) Although we calculated the inverse mod + // Φ(N), we can work mod (𝑥^N - 1) and reduce mod Φ(N) at the end. + // (That's because (𝑥^N - 1) is a multiple of Φ(N).) + // + // When working mod (𝑥^N - 1), multiplication by 𝑥 is a right-rotation + // of the list of coefficients. 
+ // + // Thus we can consider what the pattern of z̅, 𝑥z̅, 𝑥^2z̅, … looks like: + // + // def reverse(xs): + // suffix = list(xs[1:]) + // suffix.reverse() + // return [xs[0]] + suffix + // + // def rotate(xs): + // return [xs[-1]] + xs[:-1] + // + // zoverbar = reverse(list(inv) + [0]) + // xzoverbar = rotate(reverse(list(inv) + [0])) + // x2zoverbar = rotate(rotate(reverse(list(inv) + [0]))) + // + // zoverbar[:15] + // [1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1] + // xzoverbar[:15] + // [0, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0] + // x2zoverbar[:15] + // [2, 0, 1, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2] + // + // (For a formula for z̅, see lemma two of appendix B.) + // + // After the first three elements have been taken care of, all then have + // a repeating three-element cycle. The next value (𝑥^3z̅) involves + // three rotations of the first pattern, thus the three-element cycle + // lines up. However, the discontinuity in the first three elements + // obviously moves to a different position. Consider the difference + // between 𝑥^3z̅ and z̅: + // + // [x-y for (x,y) in zip(zoverbar, x3zoverbar)][:15] + // [0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] + // + // This pattern of differences is the same for all elements, although it + // obviously moves right with the rotations. + // + // From this, we reach algorithm eight of appendix B. + + // Handle the first three elements of the inner products. + out->v[0] = a->v[0] + a->v[2]; + out->v[1] = a->v[1]; + out->v[2] = -a->v[0] + a->v[2]; + + // s0, s1, s2 are added into out->v[0], out->v[1], and out->v[2], + // respectively. We do not compute s1 because it's just -(s0 + s1). + uint16_t s0 = 0, s2 = 0; + for (size_t i = 3; i < 699; i += 3) { + s0 += -a->v[i] + a->v[i + 2]; + // s1 += a->v[i] - a->v[i + 1]; + s2 += a->v[i + 1] - a->v[i + 2]; + } + + // Handle the fact that the three-element pattern doesn't fill the + // polynomial exactly (since 701 isn't a multiple of three). 
+ s0 -= a->v[699]; + // s1 += a->v[699] - a->v[700]; + s2 += a->v[700]; + + // Note that s0 + s1 + s2 = 0. + out->v[0] += s0; + out->v[1] -= (s0 + s2); // = s1 + out->v[2] += s2; + + // Calculate the remaining inner products by taking advantage of the + // fact that the pattern repeats every three cycles and the pattern of + // differences moves with the rotation. + for (size_t i = 3; i < N; i++) { + out->v[i] = (out->v[i - 3] - (a->v[i - 2] + a->v[i - 1] + a->v[i])); + } + + // Reduce mod Φ(N) by subtracting a multiple of out[700] from every + // element and convert to mod Q. (See above about adding twice as + // subtraction.) + const crypto_word_t v = out->v[700]; + for (unsigned i = 0; i < N; i++) { + const uint16_t vi_mod3 = mod3(out->v[i] - v); + // Map {0, 1, 2} to {0, 1, 0xffff}. + out->v[i] = (~((vi_mod3 >> 1) - 1)) | vi_mod3; + } + + poly_mul_x_minus_1(out); + poly_normalize(out); +} + +namespace { + +struct public_key { + struct poly ph; +}; + +struct private_key { + struct poly3 f, f_inverse; + struct poly ph_inverse; + uint8_t hmac_key[32]; +}; + +} // namespace + +// public_key_from_external converts an external public key pointer into an +// internal one. Externally the alignment is only specified to be eight bytes +// but we need 16-byte alignment. We could annotate the external struct with +// that alignment but we can only assume that malloced pointers are 8-byte +// aligned in any case. (Even if the underlying malloc returns values with +// 16-byte alignment, |OPENSSL_malloc| will store an 8-byte size prefix and mess +// that up.) +static struct public_key *public_key_from_external( + struct HRSS_public_key *ext) { + static_assert( + sizeof(struct HRSS_public_key) >= sizeof(struct public_key) + 15, + "HRSS public key too small"); + + return reinterpret_cast(align_pointer(ext->opaque, 16)); +} + +// private_key_from_external does the same thing as |public_key_from_external|, +// but for private keys. 
See the comment on that function about alignment +// issues. +static struct private_key *private_key_from_external( + struct HRSS_private_key *ext) { + static_assert( + sizeof(struct HRSS_private_key) >= sizeof(struct private_key) + 15, + "HRSS private key too small"); + + return reinterpret_cast(align_pointer(ext->opaque, 16)); +} + +// malloc_align32 returns a pointer to |size| bytes of 32-byte-aligned heap and +// sets |*out_ptr| to a value that can be passed to |OPENSSL_free| to release +// it. It returns NULL if out of memory. +static void *malloc_align32(void **out_ptr, size_t size) { + void *ptr = OPENSSL_malloc(size + 31); + if (!ptr) { + *out_ptr = nullptr; + return nullptr; + } + + *out_ptr = ptr; + return align_pointer(ptr, 32); +} + +int HRSS_generate_key( + struct HRSS_public_key *out_pub, struct HRSS_private_key *out_priv, + const uint8_t in[HRSS_SAMPLE_BYTES + HRSS_SAMPLE_BYTES + 32]) { + struct public_key *pub = public_key_from_external(out_pub); + struct private_key *priv = private_key_from_external(out_priv); + + struct vars { + struct POLY_MUL_SCRATCH scratch; + struct poly f; + struct poly pg_phi1; + struct poly pfg_phi1; + struct poly pfg_phi1_inverse; + }; + + void *malloc_ptr; + struct vars *const vars = reinterpret_cast( + malloc_align32(&malloc_ptr, sizeof(struct vars))); + if (!vars) { + // If the caller ignores the return value the output will still be safe. + // The private key output is randomised in case it's later passed to + // |HRSS_encap|. + memset(out_pub, 0, sizeof(struct HRSS_public_key)); + RAND_bytes((uint8_t *)out_priv, sizeof(struct HRSS_private_key)); + return 0; + } + +#if !defined(NDEBUG) + OPENSSL_memset(vars, 0xff, sizeof(struct vars)); +#endif + + OPENSSL_memcpy(priv->hmac_key, in + 2 * HRSS_SAMPLE_BYTES, + sizeof(priv->hmac_key)); + + poly_short_sample_plus(&vars->f, in); + poly3_from_poly(&priv->f, &vars->f); + HRSS_poly3_invert(&priv->f_inverse, &priv->f); + + // pg_phi1 is p (i.e. 3) × g × Φ(1) (i.e. 𝑥-1). 
+ poly_short_sample_plus(&vars->pg_phi1, in + HRSS_SAMPLE_BYTES); + for (unsigned i = 0; i < N; i++) { + vars->pg_phi1.v[i] *= 3; + } + poly_mul_x_minus_1(&vars->pg_phi1); + + poly_mul(&vars->scratch, &vars->pfg_phi1, &vars->f, &vars->pg_phi1); + + poly_invert(&vars->scratch, &vars->pfg_phi1_inverse, &vars->pfg_phi1); + + poly_mul(&vars->scratch, &pub->ph, &vars->pfg_phi1_inverse, &vars->pg_phi1); + poly_mul(&vars->scratch, &pub->ph, &pub->ph, &vars->pg_phi1); + poly_clamp(&pub->ph); + + poly_mul(&vars->scratch, &priv->ph_inverse, &vars->pfg_phi1_inverse, + &vars->f); + poly_mul(&vars->scratch, &priv->ph_inverse, &priv->ph_inverse, &vars->f); + poly_clamp(&priv->ph_inverse); + + OPENSSL_free(malloc_ptr); + return 1; +} + +static const char kSharedKey[] = "shared key"; + +int HRSS_encap(uint8_t out_ciphertext[POLY_BYTES], uint8_t out_shared_key[32], + const struct HRSS_public_key *in_pub, + const uint8_t in[HRSS_SAMPLE_BYTES + HRSS_SAMPLE_BYTES]) { + const struct public_key *pub = + public_key_from_external((struct HRSS_public_key *)in_pub); + + struct vars { + struct POLY_MUL_SCRATCH scratch; + struct poly m, r, m_lifted; + struct poly prh_plus_m; + SHA256_CTX hash_ctx; + uint8_t m_bytes[HRSS_POLY3_BYTES]; + uint8_t r_bytes[HRSS_POLY3_BYTES]; + }; + + void *malloc_ptr; + struct vars *const vars = reinterpret_cast( + malloc_align32(&malloc_ptr, sizeof(struct vars))); + if (!vars) { + // If the caller ignores the return value the output will still be safe. + // The private key output is randomised in case it's used to encrypt and + // transmit something. 
+ memset(out_ciphertext, 0, POLY_BYTES); + RAND_bytes(out_shared_key, 32); + return 0; + } + +#if !defined(NDEBUG) + OPENSSL_memset(vars, 0xff, sizeof(struct vars)); +#endif + + poly_short_sample(&vars->m, in); + poly_short_sample(&vars->r, in + HRSS_SAMPLE_BYTES); + poly_lift(&vars->m_lifted, &vars->m); + + poly_mul(&vars->scratch, &vars->prh_plus_m, &vars->r, &pub->ph); + for (unsigned i = 0; i < N; i++) { + vars->prh_plus_m.v[i] += vars->m_lifted.v[i]; + } + + poly_marshal(out_ciphertext, &vars->prh_plus_m); + + poly_marshal_mod3(vars->m_bytes, &vars->m); + poly_marshal_mod3(vars->r_bytes, &vars->r); + + SHA256_Init(&vars->hash_ctx); + SHA256_Update(&vars->hash_ctx, kSharedKey, sizeof(kSharedKey)); + SHA256_Update(&vars->hash_ctx, vars->m_bytes, sizeof(vars->m_bytes)); + SHA256_Update(&vars->hash_ctx, vars->r_bytes, sizeof(vars->r_bytes)); + SHA256_Update(&vars->hash_ctx, out_ciphertext, POLY_BYTES); + SHA256_Final(out_shared_key, &vars->hash_ctx); + + OPENSSL_free(malloc_ptr); + return 1; +} + +int HRSS_decap(uint8_t out_shared_key[HRSS_KEY_BYTES], + const struct HRSS_private_key *in_priv, + const uint8_t *ciphertext, size_t ciphertext_len) { + const struct private_key *priv = + private_key_from_external((struct HRSS_private_key *)in_priv); + +#if defined(_MSC_VER) + // MSVC will produce this useless warning: + // warning C4324: structure was padded due to alignment specifier +#pragma warning(push) +#pragma warning(disable : 4324) +#endif + struct vars { + struct POLY_MUL_SCRATCH scratch; + uint8_t masked_key[SHA256_CBLOCK]; + SHA256_CTX hash_ctx; + struct poly c; + struct poly f, cf; + struct poly3 cf3, m3; + struct poly m, m_lifted; + struct poly r; + struct poly3 r3; + uint8_t expected_ciphertext[HRSS_CIPHERTEXT_BYTES]; + uint8_t m_bytes[HRSS_POLY3_BYTES]; + uint8_t r_bytes[HRSS_POLY3_BYTES]; + uint8_t shared_key[32]; + }; +#if defined(_MSC_VER) +#pragma warning(pop) +#endif + + void *malloc_ptr; + struct vars *const vars = reinterpret_cast( + 
malloc_align32(&malloc_ptr, sizeof(struct vars))); + if (!vars) { + // If the caller ignores the return value the output will still be safe. + // The private key output is randomised in case it's used to encrypt and + // transmit something. + RAND_bytes(out_shared_key, HRSS_KEY_BYTES); + return 0; + } + +#if !defined(NDEBUG) + OPENSSL_memset(vars, 0xff, sizeof(struct vars)); +#endif + + // This is HMAC, expanded inline rather than using the |HMAC| function so that + // we can avoid dealing with possible allocation failures and so keep this + // function infallible. + static_assert(sizeof(priv->hmac_key) <= sizeof(vars->masked_key), + "HRSS HMAC key larger than SHA-256 block size"); + for (size_t i = 0; i < sizeof(priv->hmac_key); i++) { + vars->masked_key[i] = priv->hmac_key[i] ^ 0x36; + } + OPENSSL_memset(vars->masked_key + sizeof(priv->hmac_key), 0x36, + sizeof(vars->masked_key) - sizeof(priv->hmac_key)); + + SHA256_Init(&vars->hash_ctx); + SHA256_Update(&vars->hash_ctx, vars->masked_key, sizeof(vars->masked_key)); + SHA256_Update(&vars->hash_ctx, ciphertext, ciphertext_len); + uint8_t inner_digest[SHA256_DIGEST_LENGTH]; + SHA256_Final(inner_digest, &vars->hash_ctx); + + for (size_t i = 0; i < sizeof(priv->hmac_key); i++) { + vars->masked_key[i] ^= (0x5c ^ 0x36); + } + OPENSSL_memset(vars->masked_key + sizeof(priv->hmac_key), 0x5c, + sizeof(vars->masked_key) - sizeof(priv->hmac_key)); + + SHA256_Init(&vars->hash_ctx); + SHA256_Update(&vars->hash_ctx, vars->masked_key, sizeof(vars->masked_key)); + SHA256_Update(&vars->hash_ctx, inner_digest, sizeof(inner_digest)); + static_assert(HRSS_KEY_BYTES == SHA256_DIGEST_LENGTH, + "HRSS shared key length incorrect"); + SHA256_Final(out_shared_key, &vars->hash_ctx); + + // If the ciphertext is publicly invalid then a random shared key is still + // returned to simply the logic of the caller, but this path is not constant + // time. 
+ crypto_word_t ok = 0; + if (ciphertext_len != HRSS_CIPHERTEXT_BYTES || + !poly_unmarshal(&vars->c, ciphertext)) { + goto out; + } + + poly_from_poly3(&vars->f, &priv->f); + poly_mul(&vars->scratch, &vars->cf, &vars->c, &vars->f); + poly3_from_poly(&vars->cf3, &vars->cf); + // Note that cf3 is not reduced mod Φ(N). That reduction is deferred. + HRSS_poly3_mul(&vars->m3, &vars->cf3, &priv->f_inverse); + + poly_from_poly3(&vars->m, &vars->m3); + poly_lift(&vars->m_lifted, &vars->m); + + for (unsigned i = 0; i < N; i++) { + vars->r.v[i] = vars->c.v[i] - vars->m_lifted.v[i]; + } + poly_normalize(&vars->r); + poly_mul(&vars->scratch, &vars->r, &vars->r, &priv->ph_inverse); + poly_mod_phiN(&vars->r); + poly_clamp(&vars->r); + + ok = poly3_from_poly_checked(&vars->r3, &vars->r); + + // [NTRUCOMP] section 5.1 includes ReEnc2 and a proof that it's valid. Rather + // than do an expensive |poly_mul|, it rebuilds |c'| from |c - lift(m)| + // (called |b|) with: + // t = (−b(1)/N) mod Q + // c' = b + tΦ(N) + lift(m) mod Q + // + // When polynomials are transmitted, the final coefficient is omitted and + // |poly_unmarshal| sets it such that f(1) == 0. Thus c(1) == 0. Also, + // |poly_lift| multiplies the result by (x-1) and therefore evaluating a + // lifted polynomial at 1 is also zero. Thus lift(m)(1) == 0 and so + // (c - lift(m))(1) == 0. + // + // Although we defer the reduction above, |b| is conceptually reduced mod + // Φ(N). In order to do that reduction one subtracts |c[N-1]| from every + // coefficient. Therefore b(1) = -c[N-1]×N. The value of |t|, above, then is + // just recovering |c[N-1]|, and adding tΦ(N) is simply undoing the reduction. + // Therefore b + tΦ(N) + lift(m) = c by construction and we don't need to + // recover |c| at all so long as we do the checks in + // |poly3_from_poly_checked|. + // + // The |poly_marshal| here then is just confirming that |poly_unmarshal| is + // strict and could be omitted. 
+ + static_assert(HRSS_CIPHERTEXT_BYTES == POLY_BYTES, + "ciphertext is the wrong size"); + assert(ciphertext_len == sizeof(vars->expected_ciphertext)); + poly_marshal(vars->expected_ciphertext, &vars->c); + + poly_marshal_mod3(vars->m_bytes, &vars->m); + poly_marshal_mod3(vars->r_bytes, &vars->r); + + ok &= constant_time_is_zero_w( + CRYPTO_memcmp(ciphertext, vars->expected_ciphertext, + sizeof(vars->expected_ciphertext))); + + SHA256_Init(&vars->hash_ctx); + SHA256_Update(&vars->hash_ctx, kSharedKey, sizeof(kSharedKey)); + SHA256_Update(&vars->hash_ctx, vars->m_bytes, sizeof(vars->m_bytes)); + SHA256_Update(&vars->hash_ctx, vars->r_bytes, sizeof(vars->r_bytes)); + SHA256_Update(&vars->hash_ctx, vars->expected_ciphertext, + sizeof(vars->expected_ciphertext)); + SHA256_Final(vars->shared_key, &vars->hash_ctx); + + for (unsigned i = 0; i < sizeof(vars->shared_key); i++) { + out_shared_key[i] = + constant_time_select_8(ok, vars->shared_key[i], out_shared_key[i]); + } + +out: + OPENSSL_free(malloc_ptr); + return 1; +} + +void HRSS_marshal_public_key(uint8_t out[HRSS_PUBLIC_KEY_BYTES], + const struct HRSS_public_key *in_pub) { + const struct public_key *pub = + public_key_from_external((struct HRSS_public_key *)in_pub); + poly_marshal(out, &pub->ph); +} + +int HRSS_parse_public_key(struct HRSS_public_key *out, + const uint8_t in[HRSS_PUBLIC_KEY_BYTES]) { + struct public_key *pub = public_key_from_external(out); + if (!poly_unmarshal(&pub->ph, in)) { + return 0; + } + OPENSSL_memset(&pub->ph.v[N], 0, 3 * sizeof(uint16_t)); + return 1; +} diff --git a/third_party/boringssl/src/crypto/hrss/internal.h b/third_party/boringssl/src/crypto/hrss/internal.h index 340b2e0e..78d385b0 100644 --- a/third_party/boringssl/src/crypto/hrss/internal.h +++ b/third_party/boringssl/src/crypto/hrss/internal.h @@ -1,32 +1,29 @@ -/* Copyright (c) 2018, Google Inc. 
- * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ +// Copyright 2018 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
-#ifndef OPENSSL_HEADER_HRSS_INTERNAL_H -#define OPENSSL_HEADER_HRSS_INTERNAL_H +#ifndef OPENSSL_HEADER_CRYPTO_HRSS_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_HRSS_INTERNAL_H #include #include "../internal.h" -#if defined(__cplusplus) -extern "C" { -#endif - +BSSL_NAMESPACE_BEGIN -#define N 701 +#define HRSS_N 701 #define BITS_PER_WORD (sizeof(crypto_word_t) * 8) -#define WORDS_PER_POLY ((N + BITS_PER_WORD - 1) / BITS_PER_WORD) -#define BITS_IN_LAST_WORD (N % BITS_PER_WORD) +#define WORDS_PER_POLY ((HRSS_N + BITS_PER_WORD - 1) / BITS_PER_WORD) +#define BITS_IN_LAST_WORD (HRSS_N % BITS_PER_WORD) struct poly2 { crypto_word_t v[WORDS_PER_POLY]; @@ -53,16 +50,15 @@ OPENSSL_EXPORT void HRSS_poly3_invert(struct poly3 *out, // poly_Rq_mul is defined in assembly. Inputs and outputs must be 16-byte- // aligned. -extern void poly_Rq_mul( - uint16_t r[N + 3], const uint16_t a[N + 3], const uint16_t b[N + 3], +extern "C" void poly_Rq_mul( + uint16_t r[HRSS_N + 3], const uint16_t a[HRSS_N + 3], + const uint16_t b[HRSS_N + 3], // The following should be `scratch[POLY_MUL_RQ_SCRATCH_SPACE]` but // GCC 11.1 has a bug with unions that breaks that. uint8_t scratch[]); #endif +BSSL_NAMESPACE_END -#if defined(__cplusplus) -} // extern "C" -#endif -#endif // !OPENSSL_HEADER_HRSS_INTERNAL_H +#endif // !OPENSSL_HEADER_CRYPTO_HRSS_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/internal.h b/third_party/boringssl/src/crypto/internal.h index 63e6a662..7c4b509a 100644 --- a/third_party/boringssl/src/crypto/internal.h +++ b/third_party/boringssl/src/crypto/internal.h @@ -1,122 +1,32 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. 
The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. 
If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] - */ -/* ==================================================================== - * Copyright (c) 1998-2001 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. 
All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). 
*/ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. #ifndef OPENSSL_HEADER_CRYPTO_INTERNAL_H #define OPENSSL_HEADER_CRYPTO_INTERNAL_H +#include #include #include +#include #include -#include #include +#include #include +#include + #if defined(BORINGSSL_CONSTANT_TIME_VALIDATION) #include #endif @@ -125,27 +35,6 @@ #include #endif -#if !defined(__cplusplus) -#if defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L -#include -#elif defined(_MSC_VER) && !defined(__clang__) -#define alignas(x) __declspec(align(x)) -#define alignof __alignof -#else -// With the exception of MSVC, we require C11 to build the library. C11 is a -// prerequisite for improved refcounting performance. All our supported C -// compilers have long implemented C11 and made it default. The most likely -// cause of pre-C11 modes is stale -std=c99 or -std=gnu99 flags in build -// configuration. Such flags can be removed. -// -// TODO(davidben): In MSVC 2019 16.8 or higher (_MSC_VER >= 1928), -// |__STDC_VERSION__| will be 201112 when passed /std:c11 and unset otherwise. -// C11 alignas and alignof are only implemented in C11 mode. Can we mandate C11 -// mode for those versions? -#error "BoringSSL must be built in C11 mode or higher." 
-#endif -#endif - #if defined(OPENSSL_THREADS) && \ (!defined(OPENSSL_WINDOWS) || defined(__MINGW32__)) #include @@ -155,95 +44,159 @@ #if defined(OPENSSL_THREADS) && !defined(OPENSSL_PTHREADS) && \ defined(OPENSSL_WINDOWS) #define OPENSSL_WINDOWS_THREADS -OPENSSL_MSVC_PRAGMA(warning(push, 3)) +#endif + +#if defined(OPENSSL_THREADS) +#include +#else +#include +#endif + +#if defined(OPENSSL_WINDOWS_THREADS) #include -OPENSSL_MSVC_PRAGMA(warning(pop)) #endif -#if defined(__cplusplus) -extern "C" { +#if defined(_M_X64) || defined(_M_IX86) +#include "intrin.h" #endif +#if defined(BORINGSSL_PREFIX) +#include // IWYU pragma: export +#endif + + +BSSL_NAMESPACE_BEGIN -#if defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || defined(OPENSSL_ARM) || \ - defined(OPENSSL_AARCH64) || defined(OPENSSL_PPC64LE) -// OPENSSL_cpuid_setup initializes the platform-specific feature cache. -void OPENSSL_cpuid_setup(void); +#if !defined(OPENSSL_NO_ASM) && \ + (defined(OPENSSL_X86) || defined(OPENSSL_X86_64) || \ + (!defined(OPENSSL_STATIC_ARMCAP) && \ + (defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)))) +// x86, x86_64, and the ARMs need to record the result of a cpuid/getauxval call +// for the asm to work correctly, unless compiled without asm code. +#define NEED_CPUID + +// OPENSSL_cpuid_setup initializes the platform-specific feature cache. This +// function should not be called directly. Call |OPENSSL_init_cpuid| instead. +void OPENSSL_cpuid_setup(); + +// OPENSSL_init_cpuid initializes the platform-specific feature cache, if +// needed. This function is idempotent and may be called concurrently. +void OPENSSL_init_cpuid(); +#else +inline void OPENSSL_init_cpuid() {} #endif #if (defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64)) && \ !defined(OPENSSL_STATIC_ARMCAP) -// OPENSSL_get_armcap_pointer_for_test returns a pointer to |OPENSSL_armcap_P| -// for unit tests. 
Any modifications to the value must be made after -// |CRYPTO_library_init| but before any other function call in BoringSSL. -OPENSSL_EXPORT uint32_t *OPENSSL_get_armcap_pointer_for_test(void); +// OPENSSL_get_armcap_pointer_for_test returns a pointer to +// |OPENSSL_armcap_P| for unit tests. Any modifications to the value must be +// made before any other function call in BoringSSL. +OPENSSL_EXPORT uint32_t *OPENSSL_get_armcap_pointer_for_test(); #endif +// On non-MSVC 64-bit targets, we expect __uint128_t support. This includes +// clang-cl, which defines both __clang__ and _MSC_VER. #if (!defined(_MSC_VER) || defined(__clang__)) && defined(OPENSSL_64_BIT) #define BORINGSSL_HAS_UINT128 typedef __int128_t int128_t; typedef __uint128_t uint128_t; -// clang-cl supports __uint128_t but modulus and division don't work. -// https://crbug.com/787617. -#if !defined(_MSC_VER) || !defined(__clang__) +// __uint128_t division depends on intrinsics in the compiler runtime. Those +// intrinsics are missing in clang-cl (https://crbug.com/787617) and nanolibc. +// These may be bugs in the toolchain definition, but just disable it for now. +// EDK2's toolchain is missing __udivti3 (b/339380897) so cannot support +// 128-bit division currently. +#if !defined(_MSC_VER) && !defined(OPENSSL_NANOLIBC) && \ + !defined(__EDK2_BORINGSSL__) #define BORINGSSL_CAN_DIVIDE_UINT128 #endif #endif -#define OPENSSL_ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0])) - -// Have a generic fall-through for different versions of C/C++. 
-#if defined(__cplusplus) && __cplusplus >= 201703L -#define OPENSSL_FALLTHROUGH [[fallthrough]] -#elif defined(__cplusplus) && __cplusplus >= 201103L && defined(__clang__) -#define OPENSSL_FALLTHROUGH [[clang::fallthrough]] -#elif defined(__cplusplus) && __cplusplus >= 201103L && defined(__GNUC__) && \ - __GNUC__ >= 7 -#define OPENSSL_FALLTHROUGH [[gnu::fallthrough]] -#elif defined(__GNUC__) && __GNUC__ >= 7 // gcc 7 -#define OPENSSL_FALLTHROUGH __attribute__ ((fallthrough)) -#elif defined(__clang__) -#if __has_attribute(fallthrough) && __clang_major__ >= 5 -// Clang 3.5, at least, complains about "error: declaration does not declare -// anything", possibily because we put a semicolon after this macro in -// practice. Thus limit it to >= Clang 5, which does work. -#define OPENSSL_FALLTHROUGH __attribute__ ((fallthrough)) -#else // clang versions that do not support fallthrough. -#define OPENSSL_FALLTHROUGH -#endif -#else // C++11 on gcc 6, and all other cases -#define OPENSSL_FALLTHROUGH -#endif - -// For convenience in testing 64-bit generic code, we allow disabling SSE2 -// intrinsics via |OPENSSL_NO_SSE2_FOR_TESTING|. x86_64 always has SSE2 -// available, so we would otherwise need to test such code on a non-x86_64 -// platform. -#if defined(__SSE2__) && !defined(OPENSSL_NO_SSE2_FOR_TESTING) +// GCC-like compilers indicate SSE2 with |__SSE2__|. MSVC leaves the caller to +// know that x86_64 has SSE2, and uses _M_IX86_FP to indicate SSE2 on x86. +// https://learn.microsoft.com/en-us/cpp/preprocessor/predefined-macros?view=msvc-170 +#if defined(__SSE2__) || defined(_M_AMD64) || defined(_M_X64) || \ + (defined(_M_IX86_FP) && _M_IX86_FP >= 2) #define OPENSSL_SSE2 #endif +#if defined(OPENSSL_X86) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_SSE2) +#error \ + "x86 assembly requires SSE2. Build with -msse2 (recommended), or disable assembly optimizations with -DOPENSSL_NO_ASM." 
+#endif + +// For convenience in testing the fallback code, we allow disabling SSE2 +// intrinsics via |OPENSSL_NO_SSE2_FOR_TESTING|. We require SSE2 on x86 and +// x86_64, so we would otherwise need to test such code on a non-x86 platform. +// +// This does not remove the above requirement for SSE2 support with assembly +// optimizations. It only disables some intrinsics-based optimizations so that +// we can test the fallback code on CI. +#if defined(OPENSSL_SSE2) && defined(OPENSSL_NO_SSE2_FOR_TESTING) +#undef OPENSSL_SSE2 +#endif + +#if defined(__GNUC__) || defined(__clang__) +#define OPENSSL_ATTR_CONST __attribute__((const)) +#else +#define OPENSSL_ATTR_CONST +#endif + +#if defined(BORINGSSL_MALLOC_FAILURE_TESTING) +// OPENSSL_reset_malloc_counter_for_testing, when malloc testing is enabled, +// resets the internal malloc counter, to simulate further malloc failures. This +// should be called in between independent tests, at a point where failure from +// a previous test will not impact subsequent ones. +OPENSSL_EXPORT void OPENSSL_reset_malloc_counter_for_testing(); + +// OPENSSL_disable_malloc_failures_for_testing, when malloc testing is enabled, +// disables simulated malloc failures. Calls to |OPENSSL_malloc| will not +// increment the malloc counter or synthesize failures. This may be used to skip +// simulating malloc failures in some region of code. +OPENSSL_EXPORT void OPENSSL_disable_malloc_failures_for_testing(); + +// OPENSSL_enable_malloc_failures_for_testing, when malloc testing is enabled, +// re-enables simulated malloc failures. +OPENSSL_EXPORT void OPENSSL_enable_malloc_failures_for_testing(); +#else +inline void OPENSSL_reset_malloc_counter_for_testing() {} +inline void OPENSSL_disable_malloc_failures_for_testing() {} +inline void OPENSSL_enable_malloc_failures_for_testing() {} +#endif + +#if defined(__has_builtin) +#define OPENSSL_HAS_BUILTIN(x) __has_builtin(x) +#else +#define OPENSSL_HAS_BUILTIN(x) 0 +#endif + // Pointer utility functions. 
// buffers_alias returns one if |a| and |b| alias and zero otherwise. -static inline int buffers_alias(const uint8_t *a, size_t a_len, - const uint8_t *b, size_t b_len) { +inline int buffers_alias(const void *a, size_t a_bytes, const void *b, + size_t b_bytes) { // Cast |a| and |b| to integers. In C, pointer comparisons between unrelated // objects are undefined whereas pointer to integer conversions are merely // implementation-defined. We assume the implementation defined it in a sane // way. uintptr_t a_u = (uintptr_t)a; uintptr_t b_u = (uintptr_t)b; - return a_u + a_len > b_u && b_u + b_len > a_u; + return a_u + a_bytes > b_u && b_u + b_bytes > a_u; +} + +// spans_alias returns one if |a| and |b| alias, and zero otherwise. +template +inline int spans_alias(Span a, Span b) { + return buffers_alias(a.data(), a.size() * sizeof(T), b.data(), + b.size() * sizeof(T)); } // align_pointer returns |ptr|, advanced to |alignment|. |alignment| must be a // power of two, and |ptr| must have at least |alignment - 1| bytes of scratch // space. -static inline void *align_pointer(void *ptr, size_t alignment) { +inline void *align_pointer(void *ptr, size_t alignment) { // |alignment| must be a power of two. assert(alignment != 0 && (alignment & (alignment - 1)) == 0); // Instead of aligning |ptr| as a |uintptr_t| and casting back, compute the @@ -301,38 +254,40 @@ typedef uint32_t crypto_word_t; // Note the compiler is aware that |value_barrier_w| has no side effects and // always has the same output for a given input. This allows it to eliminate // dead code, move computations across loops, and vectorize. -static inline crypto_word_t value_barrier_w(crypto_word_t a) { -#if !defined(OPENSSL_NO_ASM) && (defined(__GNUC__) || defined(__clang__)) +inline crypto_word_t value_barrier_w(crypto_word_t a) { +#if defined(__GNUC__) || defined(__clang__) __asm__("" : "+r"(a) : /* no inputs */); #endif return a; } // value_barrier_u32 behaves like |value_barrier_w| but takes a |uint32_t|. 
-static inline uint32_t value_barrier_u32(uint32_t a) { -#if !defined(OPENSSL_NO_ASM) && (defined(__GNUC__) || defined(__clang__)) +inline uint32_t value_barrier_u32(uint32_t a) { +#if defined(__GNUC__) || defined(__clang__) __asm__("" : "+r"(a) : /* no inputs */); #endif return a; } // value_barrier_u64 behaves like |value_barrier_w| but takes a |uint64_t|. -static inline uint64_t value_barrier_u64(uint64_t a) { -#if !defined(OPENSSL_NO_ASM) && (defined(__GNUC__) || defined(__clang__)) +inline uint64_t value_barrier_u64(uint64_t a) { +#if defined(__GNUC__) || defined(__clang__) __asm__("" : "+r"(a) : /* no inputs */); #endif return a; } +// |value_barrier_u8| could be defined as above, but compilers other than +// clang seem to still materialize 0x00..00MM instead of reusing 0x??..??MM. + // constant_time_msb_w returns the given value with the MSB copied to all the // other bits. -static inline crypto_word_t constant_time_msb_w(crypto_word_t a) { +inline crypto_word_t constant_time_msb_w(crypto_word_t a) { return 0u - (a >> (sizeof(a) * 8 - 1)); } // constant_time_lt_w returns 0xff..f if a < b and 0 otherwise. -static inline crypto_word_t constant_time_lt_w(crypto_word_t a, - crypto_word_t b) { +inline crypto_word_t constant_time_lt_w(crypto_word_t a, crypto_word_t b) { // Consider the two cases of the problem: // msb(a) == msb(b): a < b iff the MSB of a - b is set. // msb(a) != msb(b): a < b iff the MSB of b is set. @@ -363,29 +318,28 @@ static inline crypto_word_t constant_time_lt_w(crypto_word_t a, // (assert (not (= (= #x00000001 (bvlshr (lt a b) #x0000001f)) (bvult a b)))) // (check-sat) // (get-model) - return constant_time_msb_w(a^((a^b)|((a-b)^a))); + return constant_time_msb_w(a ^ ((a ^ b) | ((a - b) ^ a))); } // constant_time_lt_8 acts like |constant_time_lt_w| but returns an 8-bit // mask. 
-static inline uint8_t constant_time_lt_8(crypto_word_t a, crypto_word_t b) { +inline uint8_t constant_time_lt_8(crypto_word_t a, crypto_word_t b) { return (uint8_t)(constant_time_lt_w(a, b)); } // constant_time_ge_w returns 0xff..f if a >= b and 0 otherwise. -static inline crypto_word_t constant_time_ge_w(crypto_word_t a, - crypto_word_t b) { +inline crypto_word_t constant_time_ge_w(crypto_word_t a, crypto_word_t b) { return ~constant_time_lt_w(a, b); } // constant_time_ge_8 acts like |constant_time_ge_w| but returns an 8-bit // mask. -static inline uint8_t constant_time_ge_8(crypto_word_t a, crypto_word_t b) { +inline uint8_t constant_time_ge_8(crypto_word_t a, crypto_word_t b) { return (uint8_t)(constant_time_ge_w(a, b)); } // constant_time_is_zero returns 0xff..f if a == 0 and 0 otherwise. -static inline crypto_word_t constant_time_is_zero_w(crypto_word_t a) { +inline crypto_word_t constant_time_is_zero_w(crypto_word_t a) { // Here is an SMT-LIB verification of this formula: // // (define-fun is_zero ((a (_ BitVec 32))) (_ BitVec 32) @@ -394,69 +348,121 @@ static inline crypto_word_t constant_time_is_zero_w(crypto_word_t a) { // // (declare-fun a () (_ BitVec 32)) // - // (assert (not (= (= #x00000001 (bvlshr (is_zero a) #x0000001f)) (= a #x00000000)))) - // (check-sat) - // (get-model) + // (assert (not (= (= #x00000001 (bvlshr (is_zero a) #x0000001f)) (= a + // #x00000000)))) (check-sat) (get-model) return constant_time_msb_w(~a & (a - 1)); } // constant_time_is_zero_8 acts like |constant_time_is_zero_w| but returns an // 8-bit mask. -static inline uint8_t constant_time_is_zero_8(crypto_word_t a) { +inline uint8_t constant_time_is_zero_8(crypto_word_t a) { return (uint8_t)(constant_time_is_zero_w(a)); } // constant_time_eq_w returns 0xff..f if a == b and 0 otherwise. 
-static inline crypto_word_t constant_time_eq_w(crypto_word_t a, - crypto_word_t b) { +inline crypto_word_t constant_time_eq_w(crypto_word_t a, crypto_word_t b) { return constant_time_is_zero_w(a ^ b); } // constant_time_eq_8 acts like |constant_time_eq_w| but returns an 8-bit // mask. -static inline uint8_t constant_time_eq_8(crypto_word_t a, crypto_word_t b) { +inline uint8_t constant_time_eq_8(crypto_word_t a, crypto_word_t b) { return (uint8_t)(constant_time_eq_w(a, b)); } // constant_time_eq_int acts like |constant_time_eq_w| but works on int // values. -static inline crypto_word_t constant_time_eq_int(int a, int b) { +inline crypto_word_t constant_time_eq_int(int a, int b) { return constant_time_eq_w((crypto_word_t)(a), (crypto_word_t)(b)); } // constant_time_eq_int_8 acts like |constant_time_eq_int| but returns an 8-bit // mask. -static inline uint8_t constant_time_eq_int_8(int a, int b) { +inline uint8_t constant_time_eq_int_8(int a, int b) { return constant_time_eq_8((crypto_word_t)(a), (crypto_word_t)(b)); } // constant_time_select_w returns (mask & a) | (~mask & b). When |mask| is all // 1s or all 0s (as returned by the methods above), the select methods return // either |a| (if |mask| is nonzero) or |b| (if |mask| is zero). -static inline crypto_word_t constant_time_select_w(crypto_word_t mask, - crypto_word_t a, - crypto_word_t b) { +inline crypto_word_t constant_time_select_w(crypto_word_t mask, crypto_word_t a, + crypto_word_t b) { // Clang recognizes this pattern as a select. While it usually transforms it // to a cmov, it sometimes further transforms it into a branch, which we do // not want. // - // Adding barriers to both |mask| and |~mask| breaks the relationship between - // the two, which makes the compiler stick with bitmasks. - return (value_barrier_w(mask) & a) | (value_barrier_w(~mask) & b); + // Hiding the value of the mask from the compiler evades this transformation. 
+ mask = value_barrier_w(mask); + return (mask & a) | (~mask & b); } // constant_time_select_8 acts like |constant_time_select| but operates on // 8-bit values. -static inline uint8_t constant_time_select_8(uint8_t mask, uint8_t a, - uint8_t b) { - return (uint8_t)(constant_time_select_w(mask, a, b)); +inline uint8_t constant_time_select_8(crypto_word_t mask, uint8_t a, + uint8_t b) { + // |mask| is a word instead of |uint8_t| to avoid materializing 0x000..0MM + // Making both |mask| and its value barrier |uint8_t| would allow the compiler + // to materialize 0x????..?MM instead, but only clang is that clever. + // However, vectorization of bitwise operations seems to work better on + // |uint8_t| than a mix of |uint64_t| and |uint8_t|, so |m| is cast to + // |uint8_t| after the value barrier but before the bitwise operations. + uint8_t m = value_barrier_w(mask); + return (m & a) | (~m & b); } // constant_time_select_int acts like |constant_time_select| but operates on // ints. -static inline int constant_time_select_int(crypto_word_t mask, int a, int b) { - return (int)(constant_time_select_w(mask, (crypto_word_t)(a), - (crypto_word_t)(b))); +inline int constant_time_select_int(crypto_word_t mask, int a, int b) { + return static_cast(constant_time_select_w( + mask, static_cast(a), static_cast(b))); +} + +// constant_time_select_32 acts like |constant_time_select| but operates on +// 32-bit values. +inline uint32_t constant_time_select_32(crypto_word_t mask, uint32_t a, + uint32_t b) { + return static_cast( + constant_time_select_w(mask, crypto_word_t{a}, crypto_word_t{b})); +} + +// constant_time_conditional_memcpy copies |n| bytes from |src| to |dst| if +// |mask| is 0xff..ff and does nothing if |mask| is 0. The |n|-byte memory +// ranges at |dst| and |src| must not overlap, as when calling |memcpy|. 
+inline void constant_time_conditional_memcpy(void *dst, const void *src, + const size_t n, + const crypto_word_t mask) { + assert(!buffers_alias(dst, n, src, n)); + uint8_t *out = (uint8_t *)dst; + const uint8_t *in = (const uint8_t *)src; + for (size_t i = 0; i < n; i++) { + out[i] = constant_time_select_8(mask, in[i], out[i]); + } +} + +// constant_time_conditional_memxor xors |n| bytes from |src| to |dst| if +// |mask| is 0xff..ff and does nothing if |mask| is 0. The |n|-byte memory +// ranges at |dst| and |src| must not overlap, as when calling |memcpy|. +inline void constant_time_conditional_memxor(void *dst, const void *src, + size_t n, + const crypto_word_t mask) { + assert(!buffers_alias(dst, n, src, n)); + uint8_t *out = (uint8_t *)dst; + const uint8_t *in = (const uint8_t *)src; +#if defined(__GNUC__) && !defined(__clang__) + // gcc 13.2.0 doesn't automatically vectorize this loop regardless of barrier + typedef uint8_t v32u8 __attribute__((vector_size(32), aligned(1), may_alias)); + size_t n_vec = n & ~(size_t)31; + v32u8 masks = ((uint8_t)mask - (v32u8){}); // broadcast + for (size_t i = 0; i < n_vec; i += 32) { + *(v32u8 *)&out[i] ^= masks & *(v32u8 *)&in[i]; + } + in += n_vec; + out += n_vec; + n -= n_vec; +#endif + for (size_t i = 0; i < n; i++) { + out[i] ^= value_barrier_w(mask) & in[i]; + } } #if defined(BORINGSSL_CONSTANT_TIME_VALIDATION) @@ -465,20 +471,59 @@ static inline int constant_time_select_int(crypto_word_t mask, int a, int b) { // of memory as secret. Secret data is tracked as it flows to registers and // other parts of a memory. If secret data is used as a condition for a branch, // or as a memory index, it will trigger warnings in valgrind. -#define CONSTTIME_SECRET(x, y) VALGRIND_MAKE_MEM_UNDEFINED(x, y) +#define CONSTTIME_SECRET(ptr, len) VALGRIND_MAKE_MEM_UNDEFINED(ptr, len) // CONSTTIME_DECLASSIFY takes a pointer and a number of bytes and marks that // region of memory as public. 
Public data is not subject to constant-time // rules. -#define CONSTTIME_DECLASSIFY(x, y) VALGRIND_MAKE_MEM_DEFINED(x, y) +#define CONSTTIME_DECLASSIFY(ptr, len) VALGRIND_MAKE_MEM_DEFINED(ptr, len) #else -#define CONSTTIME_SECRET(x, y) -#define CONSTTIME_DECLASSIFY(x, y) +// Just disable unused warnings for those. +#define CONSTTIME_SECRET(ptr, len) \ + do { \ + (void)(ptr); \ + (void)(len); \ + } while (false) +#define CONSTTIME_DECLASSIFY(ptr, len) \ + do { \ + (void)(ptr); \ + (void)(len); \ + } while (false) #endif // BORINGSSL_CONSTANT_TIME_VALIDATION +inline crypto_word_t constant_time_declassify_w(crypto_word_t v) { + // Return |v| through a value barrier to be safe. Valgrind-based constant-time + // validation is partly to check the compiler has not undone any constant-time + // work. Any place |BORINGSSL_CONSTANT_TIME_VALIDATION| influences + // optimizations, this validation is inaccurate. + // + // However, by sending pointers through valgrind, we likely inhibit escape + // analysis. On local variables, particularly booleans, we likely + // significantly impact optimizations. + // + // Thus, to be safe, stick a value barrier, in hopes of comparably inhibiting + // compiler analysis. + CONSTTIME_DECLASSIFY(&v, sizeof(v)); + return value_barrier_w(v); +} + +inline int constant_time_declassify_int(int v) { + static_assert(sizeof(uint32_t) == sizeof(int), + "int is not the same size as uint32_t"); + // See comment above. + CONSTTIME_DECLASSIFY(&v, sizeof(v)); + return value_barrier_u32(v); +} + +// declassify_assert behaves like |assert| but declassifies the result of +// evaluating |expr|. This allows the assertion to branch on the (presumably +// public) result, but still ensures that values leading up to the computation +// were secret. +#define declassify_assert(expr) assert(constant_time_declassify_int(expr)) + // Thread-safe initialisation. 
@@ -502,21 +547,65 @@ typedef pthread_once_t CRYPTO_once_t; // // The |once| argument must be a |CRYPTO_once_t| that has been initialised with // the value |CRYPTO_ONCE_INIT|. -OPENSSL_EXPORT void CRYPTO_once(CRYPTO_once_t *once, void (*init)(void)); +OPENSSL_EXPORT void CRYPTO_once(CRYPTO_once_t *once, void (*init)()); -// Reference counting. +// Atomics. +// +// This is a thin wrapper over std::atomic because some embedded platforms do +// not support threads and don't provide a trivial std::atomic implementation. +// For now, this does not wrap std::memory_order. If we ever use non-default +// std::memory_order, we will need to wrap these too, or fix the embedded +// platforms to provide a no-op std::atomic. See https://crbug.com/442112336. + +#if defined(OPENSSL_THREADS) +template +using Atomic = std::atomic; +#else +template +class Atomic { + public: + static_assert(std::is_integral_v || std::is_pointer_v); + + Atomic() = default; + constexpr Atomic(T value) : value_(value) {} + Atomic(const Atomic &) = delete; + Atomic &operator=(const Atomic &) = delete; + T operator=(T value) { + value_ = value; + return value_; + } + + T load() const { return value_; } + void store(T desired) { value_ = desired; } -// Automatically enable C11 atomics if implemented. -#if !defined(OPENSSL_C11_ATOMIC) && defined(OPENSSL_THREADS) && \ - !defined(__STDC_NO_ATOMICS__) && defined(__STDC_VERSION__) && \ - __STDC_VERSION__ >= 201112L -#define OPENSSL_C11_ATOMIC + bool compare_exchange_strong(T &expected, T desired) { + if (value_ != expected) { + expected = value_; + return false; + } + value_ = desired; + return true; + } + bool compare_exchange_weak(T &expected, T desired) { + return compare_exchange_strong(expected, desired); + } + + T exchange(T desired) { return std::exchange(value_, desired); } + + private: + T value_; +}; #endif + +// Reference counting. + // CRYPTO_REFCOUNT_MAX is the value at which the reference count saturates. 
#define CRYPTO_REFCOUNT_MAX 0xffffffff +using CRYPTO_refcount_t = Atomic; + // CRYPTO_refcount_inc atomically increments the value at |*count| unless the // value would overflow. It's safe for multiple threads to concurrently call // this or |CRYPTO_refcount_dec_and_test_zero| on the same @@ -535,113 +624,75 @@ OPENSSL_EXPORT int CRYPTO_refcount_dec_and_test_zero(CRYPTO_refcount_t *count); // Locks. -// -// Two types of locks are defined: |CRYPTO_MUTEX|, which can be used in -// structures as normal, and |struct CRYPTO_STATIC_MUTEX|, which can be used as -// a global lock. A global lock must be initialised to the value -// |CRYPTO_STATIC_MUTEX_INIT|. -// -// |CRYPTO_MUTEX| can appear in public structures and so is defined in -// thread.h as a structure large enough to fit the real type. The global lock is -// a different type so it may be initialized with platform initializer macros. +// A Mutex is a read/write lock. It can be constant-initialized, but has a +// destructor. To allocate a global one, use StaticMutex, which skips the +// destructor. +class OPENSSL_EXPORT StaticMutex { + public: + constexpr StaticMutex() = default; + StaticMutex(const StaticMutex &) = delete; + StaticMutex &operator=(const StaticMutex &) = delete; + + // LockRead locks the mutex such that other threads may also have a read lock, + // but none may have a write lock. + void LockRead(); + // UnlockRead releases a read lock. + void UnlockRead(); + + // LockWrite locks the mutex such that no other thread has any type of lock on + // it. + void LockWrite(); + // UnlockWrite releases a write lock. + void UnlockWrite(); + +protected: #if !defined(OPENSSL_THREADS) -struct CRYPTO_STATIC_MUTEX { - char padding; // Empty structs have different sizes in C and C++. -}; -#define CRYPTO_STATIC_MUTEX_INIT { 0 } + // Nothing. 
#elif defined(OPENSSL_WINDOWS_THREADS) -struct CRYPTO_STATIC_MUTEX { - SRWLOCK lock; -}; -#define CRYPTO_STATIC_MUTEX_INIT { SRWLOCK_INIT } + SRWLOCK lock_ = SRWLOCK_INIT; #elif defined(OPENSSL_PTHREADS) -struct CRYPTO_STATIC_MUTEX { - pthread_rwlock_t lock; -}; -#define CRYPTO_STATIC_MUTEX_INIT { PTHREAD_RWLOCK_INITIALIZER } + pthread_rwlock_t lock_ = PTHREAD_RWLOCK_INITIALIZER; #else #error "Unknown threading library" #endif +}; -// CRYPTO_MUTEX_init initialises |lock|. If |lock| is a static variable, use a -// |CRYPTO_STATIC_MUTEX|. -OPENSSL_EXPORT void CRYPTO_MUTEX_init(CRYPTO_MUTEX *lock); - -// CRYPTO_MUTEX_lock_read locks |lock| such that other threads may also have a -// read lock, but none may have a write lock. -OPENSSL_EXPORT void CRYPTO_MUTEX_lock_read(CRYPTO_MUTEX *lock); - -// CRYPTO_MUTEX_lock_write locks |lock| such that no other thread has any type -// of lock on it. -OPENSSL_EXPORT void CRYPTO_MUTEX_lock_write(CRYPTO_MUTEX *lock); - -// CRYPTO_MUTEX_unlock_read unlocks |lock| for reading. -OPENSSL_EXPORT void CRYPTO_MUTEX_unlock_read(CRYPTO_MUTEX *lock); - -// CRYPTO_MUTEX_unlock_write unlocks |lock| for writing. -OPENSSL_EXPORT void CRYPTO_MUTEX_unlock_write(CRYPTO_MUTEX *lock); - -// CRYPTO_MUTEX_cleanup releases all resources held by |lock|. -OPENSSL_EXPORT void CRYPTO_MUTEX_cleanup(CRYPTO_MUTEX *lock); - -// CRYPTO_STATIC_MUTEX_lock_read locks |lock| such that other threads may also -// have a read lock, but none may have a write lock. The |lock| variable does -// not need to be initialised by any function, but must have been statically -// initialised with |CRYPTO_STATIC_MUTEX_INIT|. -OPENSSL_EXPORT void CRYPTO_STATIC_MUTEX_lock_read( - struct CRYPTO_STATIC_MUTEX *lock); - -// CRYPTO_STATIC_MUTEX_lock_write locks |lock| such that no other thread has -// any type of lock on it. The |lock| variable does not need to be initialised -// by any function, but must have been statically initialised with -// |CRYPTO_STATIC_MUTEX_INIT|. 
-OPENSSL_EXPORT void CRYPTO_STATIC_MUTEX_lock_write( - struct CRYPTO_STATIC_MUTEX *lock); - -// CRYPTO_STATIC_MUTEX_unlock_read unlocks |lock| for reading. -OPENSSL_EXPORT void CRYPTO_STATIC_MUTEX_unlock_read( - struct CRYPTO_STATIC_MUTEX *lock); - -// CRYPTO_STATIC_MUTEX_unlock_write unlocks |lock| for writing. -OPENSSL_EXPORT void CRYPTO_STATIC_MUTEX_unlock_write( - struct CRYPTO_STATIC_MUTEX *lock); - -#if defined(__cplusplus) -extern "C++" { - -BSSL_NAMESPACE_BEGIN +class OPENSSL_EXPORT Mutex : public StaticMutex { + public: + constexpr Mutex() = default; + ~Mutex(); +}; namespace internal { -// MutexLockBase is a RAII helper for CRYPTO_MUTEX locking. -template +// MutexLockBase is a RAII helper for Mutex locking. +template class MutexLockBase { public: - explicit MutexLockBase(CRYPTO_MUTEX *mu) : mu_(mu) { + explicit MutexLockBase(StaticMutex *mu) : mu_(mu) { assert(mu_ != nullptr); - LockFunc(mu_); + (mu_->*LockMethod)(); } - ~MutexLockBase() { ReleaseFunc(mu_); } - MutexLockBase(const MutexLockBase &) = delete; - MutexLockBase &operator=(const MutexLockBase &) = - delete; + ~MutexLockBase() { (mu_->*ReleaseMethod)(); } + MutexLockBase(const MutexLockBase &) = delete; + MutexLockBase &operator=(const MutexLockBase &) = delete; private: - CRYPTO_MUTEX *const mu_; + StaticMutex *const mu_; }; } // namespace internal using MutexWriteLock = - internal::MutexLockBase; + internal::MutexLockBase<&StaticMutex::LockWrite, &StaticMutex::UnlockWrite>; using MutexReadLock = - internal::MutexLockBase; - -BSSL_NAMESPACE_END - -} // extern "C++" -#endif // defined(__cplusplus) + internal::MutexLockBase<&StaticMutex::LockRead, &StaticMutex::UnlockRead>; +using MutexWriteUnlock = + internal::MutexLockBase<&StaticMutex::UnlockWrite, &StaticMutex::LockWrite>; +using MutexReadUnlock = + internal::MutexLockBase<&StaticMutex::UnlockRead, &StaticMutex::LockRead>; // Thread local storage. 
@@ -686,33 +737,41 @@ OPENSSL_EXPORT int CRYPTO_set_thread_local( // ex_data -typedef struct crypto_ex_data_func_st CRYPTO_EX_DATA_FUNCS; +BSSL_NAMESPACE_END + +struct crypto_ex_data_st { + STACK_OF(void) *sk; +} /* CRYPTO_EX_DATA */; -DECLARE_STACK_OF(CRYPTO_EX_DATA_FUNCS) +BSSL_NAMESPACE_BEGIN -// CRYPTO_EX_DATA_CLASS tracks the ex_indices registered for a type which +struct ExDataFuncs; + +// ExDataClass tracks the ex_indices registered for a type which // supports ex_data. It should defined as a static global within the module // which defines that type. -typedef struct { - struct CRYPTO_STATIC_MUTEX lock; - STACK_OF(CRYPTO_EX_DATA_FUNCS) *meth; +struct ExDataClass { + explicit constexpr ExDataClass(bool with_app_data = false) + : num_reserved(with_app_data ? 1 : 0) {} + + StaticMutex lock; + // funcs is a linked list of |ExDataFuncs| structures. It may be traversed + // without serialization only up to |num_funcs|. last points to the final + // entry of |funcs|, or nullptr if empty. + ExDataFuncs *funcs = nullptr, *last = nullptr; + // num_funcs is the number of entries in |funcs|. + Atomic num_funcs = 0; // num_reserved is one if the ex_data index zero is reserved for legacy // |TYPE_get_app_data| functions. - uint8_t num_reserved; -} CRYPTO_EX_DATA_CLASS; - -#define CRYPTO_EX_DATA_CLASS_INIT {CRYPTO_STATIC_MUTEX_INIT, NULL, 0} -#define CRYPTO_EX_DATA_CLASS_INIT_WITH_APP_DATA \ - {CRYPTO_STATIC_MUTEX_INIT, NULL, 1} - -// CRYPTO_get_ex_new_index allocates a new index for |ex_data_class| and writes -// it to |*out_index|. Each class of object should provide a wrapper function -// that uses the correct |CRYPTO_EX_DATA_CLASS|. It returns one on success and -// zero otherwise. -OPENSSL_EXPORT int CRYPTO_get_ex_new_index(CRYPTO_EX_DATA_CLASS *ex_data_class, - int *out_index, long argl, - void *argp, - CRYPTO_EX_free *free_func); + uint8_t num_reserved = 0; +}; + +// CRYPTO_get_ex_new_index_ex allocates a new index for |ex_data_class|. 
Each +// class of object should provide a wrapper function that uses the correct +// |ExDataClass|. It returns the new index on success and -1 on error. +OPENSSL_EXPORT int CRYPTO_get_ex_new_index_ex( + ExDataClass *ex_data_class, long argl, void *argp, + CRYPTO_EX_free *free_func); // CRYPTO_set_ex_data sets an extra data pointer on a given object. Each class // of object should provide a wrapper function. @@ -726,54 +785,36 @@ OPENSSL_EXPORT void *CRYPTO_get_ex_data(const CRYPTO_EX_DATA *ad, int index); // CRYPTO_new_ex_data initialises a newly allocated |CRYPTO_EX_DATA|. OPENSSL_EXPORT void CRYPTO_new_ex_data(CRYPTO_EX_DATA *ad); -// CRYPTO_free_ex_data frees |ad|, which is embedded inside |obj|, which is an -// object of the given class. -OPENSSL_EXPORT void CRYPTO_free_ex_data(CRYPTO_EX_DATA_CLASS *ex_data_class, - void *obj, CRYPTO_EX_DATA *ad); +// CRYPTO_free_ex_data frees |ad|, which is an object of the given class. +OPENSSL_EXPORT void CRYPTO_free_ex_data(ExDataClass *ex_data_class, + CRYPTO_EX_DATA *ad); // Endianness conversions. 
#if defined(__GNUC__) && __GNUC__ >= 2 -static inline uint16_t CRYPTO_bswap2(uint16_t x) { - return __builtin_bswap16(x); -} +inline uint16_t CRYPTO_bswap2(uint16_t x) { return __builtin_bswap16(x); } -static inline uint32_t CRYPTO_bswap4(uint32_t x) { - return __builtin_bswap32(x); -} +inline uint32_t CRYPTO_bswap4(uint32_t x) { return __builtin_bswap32(x); } -static inline uint64_t CRYPTO_bswap8(uint64_t x) { - return __builtin_bswap64(x); -} +inline uint64_t CRYPTO_bswap8(uint64_t x) { return __builtin_bswap64(x); } #elif defined(_MSC_VER) -OPENSSL_MSVC_PRAGMA(warning(push, 3)) -#include -OPENSSL_MSVC_PRAGMA(warning(pop)) #pragma intrinsic(_byteswap_uint64, _byteswap_ulong, _byteswap_ushort) -static inline uint16_t CRYPTO_bswap2(uint16_t x) { - return _byteswap_ushort(x); -} +inline uint16_t CRYPTO_bswap2(uint16_t x) { return _byteswap_ushort(x); } -static inline uint32_t CRYPTO_bswap4(uint32_t x) { - return _byteswap_ulong(x); -} +inline uint32_t CRYPTO_bswap4(uint32_t x) { return _byteswap_ulong(x); } -static inline uint64_t CRYPTO_bswap8(uint64_t x) { - return _byteswap_uint64(x); -} +inline uint64_t CRYPTO_bswap8(uint64_t x) { return _byteswap_uint64(x); } #else -static inline uint16_t CRYPTO_bswap2(uint16_t x) { - return (x >> 8) | (x << 8); -} +inline uint16_t CRYPTO_bswap2(uint16_t x) { return (x >> 8) | (x << 8); } -static inline uint32_t CRYPTO_bswap4(uint32_t x) { +inline uint32_t CRYPTO_bswap4(uint32_t x) { x = (x >> 16) | (x << 16); x = ((x & 0xff00ff00) >> 8) | ((x & 0x00ff00ff) << 8); return x; } -static inline uint64_t CRYPTO_bswap8(uint64_t x) { +inline uint64_t CRYPTO_bswap8(uint64_t x) { return CRYPTO_bswap4(x >> 32) | (((uint64_t)CRYPTO_bswap4(x)) << 32); } #endif @@ -793,39 +834,23 @@ static inline uint64_t CRYPTO_bswap8(uint64_t x) { // Note |OPENSSL_memcmp| is a different function from |CRYPTO_memcmp|. // C++ defines |memchr| as a const-correct overload. 
-#if defined(__cplusplus) -extern "C++" { - -static inline const void *OPENSSL_memchr(const void *s, int c, size_t n) { +inline const void *OPENSSL_memchr(const void *s, int c, size_t n) { if (n == 0) { - return NULL; + return nullptr; } return memchr(s, c, n); } -static inline void *OPENSSL_memchr(void *s, int c, size_t n) { +inline void *OPENSSL_memchr(void *s, int c, size_t n) { if (n == 0) { - return NULL; + return nullptr; } return memchr(s, c, n); } -} // extern "C++" -#else // __cplusplus - -static inline void *OPENSSL_memchr(const void *s, int c, size_t n) { - if (n == 0) { - return NULL; - } - - return memchr(s, c, n); -} - -#endif // __cplusplus - -static inline int OPENSSL_memcmp(const void *s1, const void *s2, size_t n) { +inline int OPENSSL_memcmp(const void *s1, const void *s2, size_t n) { if (n == 0) { return 0; } @@ -833,7 +858,7 @@ static inline int OPENSSL_memcmp(const void *s1, const void *s2, size_t n) { return memcmp(s1, s2, n); } -static inline void *OPENSSL_memcpy(void *dst, const void *src, size_t n) { +inline void *OPENSSL_memcpy(void *dst, const void *src, size_t n) { if (n == 0) { return dst; } @@ -841,7 +866,7 @@ static inline void *OPENSSL_memcpy(void *dst, const void *src, size_t n) { return memcpy(dst, src, n); } -static inline void *OPENSSL_memmove(void *dst, const void *src, size_t n) { +inline void *OPENSSL_memmove(void *dst, const void *src, size_t n) { if (n == 0) { return dst; } @@ -849,7 +874,7 @@ static inline void *OPENSSL_memmove(void *dst, const void *src, size_t n) { return memmove(dst, src, n); } -static inline void *OPENSSL_memset(void *dst, int c, size_t n) { +inline void *OPENSSL_memset(void *dst, int c, size_t n) { if (n == 0) { return dst; } @@ -864,59 +889,80 @@ static inline void *OPENSSL_memset(void *dst, int c, size_t n) { // endianness. They use |memcpy|, and so avoid alignment or strict aliasing // requirements on the input and output pointers. 
-static inline uint32_t CRYPTO_load_u32_le(const void *in) { +inline uint16_t CRYPTO_load_u16_le(const void *in) { + uint16_t v; + OPENSSL_memcpy(&v, in, sizeof(v)); + return v; +} + +inline void CRYPTO_store_u16_le(void *out, uint16_t v) { + OPENSSL_memcpy(out, &v, sizeof(v)); +} + +inline uint16_t CRYPTO_load_u16_be(const void *in) { + uint16_t v; + OPENSSL_memcpy(&v, in, sizeof(v)); + return CRYPTO_bswap2(v); +} + +inline void CRYPTO_store_u16_be(void *out, uint16_t v) { + v = CRYPTO_bswap2(v); + OPENSSL_memcpy(out, &v, sizeof(v)); +} + +inline uint32_t CRYPTO_load_u32_le(const void *in) { uint32_t v; OPENSSL_memcpy(&v, in, sizeof(v)); return v; } -static inline void CRYPTO_store_u32_le(void *out, uint32_t v) { +inline void CRYPTO_store_u32_le(void *out, uint32_t v) { OPENSSL_memcpy(out, &v, sizeof(v)); } -static inline uint32_t CRYPTO_load_u32_be(const void *in) { +inline uint32_t CRYPTO_load_u32_be(const void *in) { uint32_t v; OPENSSL_memcpy(&v, in, sizeof(v)); return CRYPTO_bswap4(v); } -static inline void CRYPTO_store_u32_be(void *out, uint32_t v) { +inline void CRYPTO_store_u32_be(void *out, uint32_t v) { v = CRYPTO_bswap4(v); OPENSSL_memcpy(out, &v, sizeof(v)); } -static inline uint64_t CRYPTO_load_u64_le(const void *in) { +inline uint64_t CRYPTO_load_u64_le(const void *in) { uint64_t v; OPENSSL_memcpy(&v, in, sizeof(v)); return v; } -static inline void CRYPTO_store_u64_le(void *out, uint64_t v) { +inline void CRYPTO_store_u64_le(void *out, uint64_t v) { OPENSSL_memcpy(out, &v, sizeof(v)); } -static inline uint64_t CRYPTO_load_u64_be(const void *ptr) { +inline uint64_t CRYPTO_load_u64_be(const void *ptr) { uint64_t ret; OPENSSL_memcpy(&ret, ptr, sizeof(ret)); return CRYPTO_bswap8(ret); } -static inline void CRYPTO_store_u64_be(void *out, uint64_t v) { +inline void CRYPTO_store_u64_be(void *out, uint64_t v) { v = CRYPTO_bswap8(v); OPENSSL_memcpy(out, &v, sizeof(v)); } -static inline crypto_word_t CRYPTO_load_word_le(const void *in) { +inline crypto_word_t 
CRYPTO_load_word_le(const void *in) { crypto_word_t v; OPENSSL_memcpy(&v, in, sizeof(v)); return v; } -static inline void CRYPTO_store_word_le(void *out, crypto_word_t v) { +inline void CRYPTO_store_word_le(void *out, crypto_word_t v) { OPENSSL_memcpy(out, &v, sizeof(v)); } -static inline crypto_word_t CRYPTO_load_word_be(const void *in) { +inline crypto_word_t CRYPTO_load_word_be(const void *in) { crypto_word_t v; OPENSSL_memcpy(&v, in, sizeof(v)); #if defined(OPENSSL_64_BIT) @@ -935,7 +981,7 @@ static inline crypto_word_t CRYPTO_load_word_be(const void *in) { // width is undefined. Both Clang and GCC recognize this pattern as a rotation, // but MSVC does not. Instead, we call MSVC's built-in functions. -static inline uint32_t CRYPTO_rotl_u32(uint32_t value, int shift) { +inline uint32_t CRYPTO_rotl_u32(uint32_t value, int shift) { #if defined(_MSC_VER) return _rotl(value, shift); #else @@ -943,7 +989,7 @@ static inline uint32_t CRYPTO_rotl_u32(uint32_t value, int shift) { #endif } -static inline uint32_t CRYPTO_rotr_u32(uint32_t value, int shift) { +inline uint32_t CRYPTO_rotr_u32(uint32_t value, int shift) { #if defined(_MSC_VER) return _rotr(value, shift); #else @@ -951,7 +997,7 @@ static inline uint32_t CRYPTO_rotr_u32(uint32_t value, int shift) { #endif } -static inline uint64_t CRYPTO_rotl_u64(uint64_t value, int shift) { +inline uint64_t CRYPTO_rotl_u64(uint64_t value, int shift) { #if defined(_MSC_VER) return _rotl64(value, shift); #else @@ -959,7 +1005,7 @@ static inline uint64_t CRYPTO_rotl_u64(uint64_t value, int shift) { #endif } -static inline uint64_t CRYPTO_rotr_u64(uint64_t value, int shift) { +inline uint64_t CRYPTO_rotr_u64(uint64_t value, int shift) { #if defined(_MSC_VER) return _rotr64(value, shift); #else @@ -975,62 +1021,81 @@ static inline uint64_t CRYPTO_rotr_u64(uint64_t value, int shift) { // BORINGSSL_FIPS_abort is called when a FIPS power-on or continuous test // fails. 
It prevents any further cryptographic operations by the current // process. -void BORINGSSL_FIPS_abort(void) __attribute__((noreturn)); +void BORINGSSL_FIPS_abort() __attribute__((noreturn)); // boringssl_self_test_startup runs all startup self tests and returns one on // success or zero on error. Startup self tests do not include lazy tests. // Call |BORINGSSL_self_test| to run every self test. -int boringssl_self_test_startup(void); +int boringssl_self_test_startup(); -// boringssl_ensure_rsa_self_test checks whether the RSA self-test has been run -// in this address space. If not, it runs it and crashes the address space if -// unsuccessful. -void boringssl_ensure_rsa_self_test(void); +// boringssl_ensure_rsa_sign_self_test checks whether the RSA signing self-test +// has been run in this address space. If not, it runs it and crashes the +// address space if unsuccessful. +void boringssl_ensure_rsa_sign_self_test(); + +// boringssl_ensure_rsa_verify_self_test checks whether the RSA verification +// self-test has been run in this address space. If not, it runs it and crashes +// the address space if unsuccessful. +void boringssl_ensure_rsa_verify_self_test(); // boringssl_ensure_ecc_self_test checks whether the ECDSA and ECDH self-test // has been run in this address space. If not, it runs it and crashes the // address space if unsuccessful. -void boringssl_ensure_ecc_self_test(void); +void boringssl_ensure_ecc_self_test(); // boringssl_ensure_ffdh_self_test checks whether the FFDH self-test has been // run in this address space. If not, it runs it and crashes the address space // if unsuccessful. -void boringssl_ensure_ffdh_self_test(void); +void boringssl_ensure_ffdh_self_test(); #else // Outside of FIPS mode, the lazy tests are no-ops. 
-OPENSSL_INLINE void boringssl_ensure_rsa_self_test(void) {} -OPENSSL_INLINE void boringssl_ensure_ecc_self_test(void) {} -OPENSSL_INLINE void boringssl_ensure_ffdh_self_test(void) {} +inline void boringssl_ensure_rsa_sign_self_test() {} +inline void boringssl_ensure_rsa_verify_self_test() {} +inline void boringssl_ensure_ecc_self_test() {} +inline void boringssl_ensure_ffdh_self_test() {} #endif // FIPS +// BORINGSSL_check_test checks that |expected| and |actual| are equal. It +// returns 1 on success and, on failure, it prints an error message that +// includes the hexdumps the two values and returns 0. +int BORINGSSL_check_test(Span expected, + Span actual, const char *name); + // boringssl_self_test_sha256 performs a SHA-256 KAT. -int boringssl_self_test_sha256(void); +int boringssl_self_test_sha256(); // boringssl_self_test_sha512 performs a SHA-512 KAT. -int boringssl_self_test_sha512(void); +int boringssl_self_test_sha512(); // boringssl_self_test_hmac_sha256 performs an HMAC-SHA-256 KAT. -int boringssl_self_test_hmac_sha256(void); +int boringssl_self_test_hmac_sha256(); + +// boringssl_self_test_mlkem performs the ML-KEM KATs. +OPENSSL_EXPORT int boringssl_self_test_mlkem(); + +// boringssl_self_test_mldsa performs the ML-DSA KATs. +OPENSSL_EXPORT int boringssl_self_test_mldsa(); + +// boringssl_self_test_slhdsa performs the SLH-DSA KATs. 
+OPENSSL_EXPORT int boringssl_self_test_slhdsa(); #if defined(BORINGSSL_FIPS_COUNTERS) void boringssl_fips_inc_counter(enum fips_counter_t counter); #else -OPENSSL_INLINE void boringssl_fips_inc_counter(enum fips_counter_t counter) {} +inline void boringssl_fips_inc_counter(enum fips_counter_t counter) {} #endif #if defined(BORINGSSL_FIPS_BREAK_TESTS) -OPENSSL_INLINE int boringssl_fips_break_test(const char *test) { +inline int boringssl_fips_break_test(const char *test) { const char *const value = getenv("BORINGSSL_FIPS_BREAK_TEST"); - return value != NULL && strcmp(value, test) == 0; + return value != nullptr && strcmp(value, test) == 0; } #else -OPENSSL_INLINE int boringssl_fips_break_test(const char *test) { - return 0; -} +inline int boringssl_fips_break_test(const char *test) { return 0; } #endif // BORINGSSL_FIPS_BREAK_TESTS @@ -1042,139 +1107,204 @@ OPENSSL_INLINE int boringssl_fips_break_test(const char *test) { // // Index 0: // EDX for CPUID where EAX = 1 -// Bit 20 is always zero -// Bit 28 is adjusted to reflect whether the data cache is shared between -// multiple logical cores // Bit 30 is used to indicate an Intel CPU // Index 1: // ECX for CPUID where EAX = 1 -// Bit 11 is used to indicate AMD XOP support, not SDBG // Index 2: -// EBX for CPUID where EAX = 7 +// EBX for CPUID where EAX = 7, ECX = 0 +// Bit 14 (for removed feature MPX) is used to indicate a preference for ymm +// registers over zmm even when zmm registers are supported // Index 3: -// ECX for CPUID where EAX = 7 +// ECX for CPUID where EAX = 7, ECX = 0 +// +// Note: the CPUID bits are pre-adjusted for the OSXSAVE bit and the XMM, YMM, +// and AVX512 bits in XCR0, so it is not necessary to check those. (WARNING: See +// caveats in cpu_intel.c.) // -// Note: the CPUID bits are pre-adjusted for the OSXSAVE bit and the YMM and XMM -// bits in XCR0, so it is not necessary to check those. +// This symbol should only be accessed with |OPENSSL_get_ia32cap|. 
extern uint32_t OPENSSL_ia32cap_P[4]; -#if defined(BORINGSSL_FIPS) && !defined(BORINGSSL_SHARED_LIBRARY) -// The FIPS module, as a static library, requires an out-of-line version of -// |OPENSSL_ia32cap_get| so accesses can be rewritten by delocate. Mark the -// function const so multiple accesses can be optimized together. -const uint32_t *OPENSSL_ia32cap_get(void) __attribute__((const)); -#else -OPENSSL_INLINE const uint32_t *OPENSSL_ia32cap_get(void) { - return OPENSSL_ia32cap_P; -} -#endif +// OPENSSL_get_ia32cap initializes the library if needed and returns the |idx|th +// entry of |OPENSSL_ia32cap_P|. It is marked as a const function so duplicate +// calls can be merged by the compiler, at least when indices match. +OPENSSL_ATTR_CONST uint32_t OPENSSL_get_ia32cap(int idx); -// See Intel manual, volume 2A, table 3-11. +// OPENSSL_adjust_ia32cap adjusts |cap|, which should contain +// |OPENSSL_ia32cap_P|, based on the environment variable value in |env|. This +// function is exposed for unit tests. +void OPENSSL_adjust_ia32cap(uint32_t cap[4], const char *env); -OPENSSL_INLINE int CRYPTO_is_FXSR_capable(void) { -#if defined(__FXSR__) - return 1; -#else - return (OPENSSL_ia32cap_get()[0] & (1 << 24)) != 0; -#endif -} +// See Intel manual, volume 2A, table 3-11. -OPENSSL_INLINE int CRYPTO_is_intel_cpu(void) { +inline int CRYPTO_is_intel_cpu() { // The reserved bit 30 is used to indicate an Intel CPU. - return (OPENSSL_ia32cap_get()[0] & (1 << 30)) != 0; + return (OPENSSL_get_ia32cap(0) & (1u << 30)) != 0; } // See Intel manual, volume 2A, table 3-10. 
-OPENSSL_INLINE int CRYPTO_is_PCLMUL_capable(void) { +inline int CRYPTO_is_PCLMUL_capable() { #if defined(__PCLMUL__) return 1; #else - return (OPENSSL_ia32cap_get()[1] & (1 << 1)) != 0; + return (OPENSSL_get_ia32cap(1) & (1u << 1)) != 0; #endif } -OPENSSL_INLINE int CRYPTO_is_SSSE3_capable(void) { +inline int CRYPTO_is_SSSE3_capable() { #if defined(__SSSE3__) return 1; #else - return (OPENSSL_ia32cap_get()[1] & (1 << 9)) != 0; + return (OPENSSL_get_ia32cap(1) & (1u << 9)) != 0; #endif } -OPENSSL_INLINE int CRYPTO_is_SSE4_1_capable(void) { +inline int CRYPTO_is_SSE4_1_capable() { #if defined(__SSE4_1__) return 1; #else - return (OPENSSL_ia32cap_P[1] & (1 << 19)) != 0; + return (OPENSSL_get_ia32cap(1) & (1u << 19)) != 0; #endif } -OPENSSL_INLINE int CRYPTO_is_MOVBE_capable(void) { +inline int CRYPTO_is_MOVBE_capable() { #if defined(__MOVBE__) return 1; #else - return (OPENSSL_ia32cap_get()[1] & (1 << 22)) != 0; + return (OPENSSL_get_ia32cap(1) & (1u << 22)) != 0; #endif } -OPENSSL_INLINE int CRYPTO_is_AESNI_capable(void) { +inline int CRYPTO_is_AESNI_capable() { #if defined(__AES__) return 1; #else - return (OPENSSL_ia32cap_get()[1] & (1 << 25)) != 0; + return (OPENSSL_get_ia32cap(1) & (1u << 25)) != 0; #endif } -OPENSSL_INLINE int CRYPTO_is_AVX_capable(void) { +// We intentionally avoid defining a |CRYPTO_is_XSAVE_capable| function. See +// |CRYPTO_cpu_perf_is_like_silvermont|. + +inline int CRYPTO_is_AVX_capable() { #if defined(__AVX__) return 1; #else - return (OPENSSL_ia32cap_get()[1] & (1 << 28)) != 0; + return (OPENSSL_get_ia32cap(1) & (1u << 28)) != 0; #endif } -OPENSSL_INLINE int CRYPTO_is_RDRAND_capable(void) { - // The GCC/Clang feature name and preprocessor symbol for RDRAND are "rdrnd" - // and |__RDRND__|, respectively. -#if defined(__RDRND__) - return 1; -#else - return (OPENSSL_ia32cap_get()[1] & (1u << 30)) != 0; -#endif +inline int CRYPTO_is_RDRAND_capable() { + // We intentionally do not check |__RDRND__| here. 
On some AMD processors, we + // will act as if the hardware is RDRAND-incapable, even it actually supports + // it. See cpu_intel.c. + return (OPENSSL_get_ia32cap(1) & (1u << 30)) != 0; } // See Intel manual, volume 2A, table 3-8. -OPENSSL_INLINE int CRYPTO_is_BMI1_capable(void) { -#if defined(__BMI1__) +inline int CRYPTO_is_BMI1_capable() { +#if defined(__BMI__) return 1; #else - return (OPENSSL_ia32cap_get()[2] & (1 << 3)) != 0; + return (OPENSSL_get_ia32cap(2) & (1u << 3)) != 0; #endif } -OPENSSL_INLINE int CRYPTO_is_AVX2_capable(void) { +inline int CRYPTO_is_AVX2_capable() { #if defined(__AVX2__) return 1; #else - return (OPENSSL_ia32cap_get()[2] & (1 << 5)) != 0; + return (OPENSSL_get_ia32cap(2) & (1u << 5)) != 0; #endif } -OPENSSL_INLINE int CRYPTO_is_BMI2_capable(void) { +inline int CRYPTO_is_BMI2_capable() { #if defined(__BMI2__) return 1; #else - return (OPENSSL_ia32cap_get()[2] & (1 << 8)) != 0; + return (OPENSSL_get_ia32cap(2) & (1u << 8)) != 0; #endif } -OPENSSL_INLINE int CRYPTO_is_ADX_capable(void) { +inline int CRYPTO_is_ADX_capable() { #if defined(__ADX__) return 1; #else - return (OPENSSL_ia32cap_get()[2] & (1 << 19)) != 0; + return (OPENSSL_get_ia32cap(2) & (1u << 19)) != 0; +#endif +} + +// SHA-1 and SHA-256 are defined as a single extension. +inline int CRYPTO_is_x86_SHA_capable() { +#if defined(__SHA__) + return 1; +#else + return (OPENSSL_get_ia32cap(2) & (1u << 29)) != 0; +#endif +} + +// CRYPTO_cpu_perf_is_like_silvermont returns one if, based on a heuristic, the +// CPU has Silvermont-like performance characteristics. It is often faster to +// run different codepaths on these CPUs than the available instructions would +// otherwise select. See chacha-x86_64.pl. +// +// Bonnell, Silvermont's predecessor in the Atom lineup, will also be matched by +// this. Goldmont (Silvermont's successor in the Atom lineup) added XSAVE so it +// isn't matched by this. 
Various sources indicate AMD first implemented MOVBE +// and XSAVE at the same time in Jaguar, so it seems like AMD chips will not be +// matched by this. That seems to be the case for other x86(-64) CPUs. +inline int CRYPTO_cpu_perf_is_like_silvermont() { + // WARNING: This MUST NOT be used to guard the execution of the XSAVE + // instruction. This is the "hardware supports XSAVE" bit, not the OSXSAVE bit + // that indicates whether we can safely execute XSAVE. This bit may be set + // even when XSAVE is disabled (by the operating system). See how the users of + // this bit use it. + // + // Historically, the XSAVE bit was artificially cleared on Knights Landing + // and Knights Mill chips, but as Intel has removed all support from GCC, + // LLVM, and SDE, we assume they are no longer worth special-casing. + int hardware_supports_xsave = (OPENSSL_get_ia32cap(1) & (1u << 26)) != 0; + return !hardware_supports_xsave && CRYPTO_is_MOVBE_capable(); +} + +inline int CRYPTO_is_AVX512BW_capable() { +#if defined(__AVX512BW__) + return 1; +#else + return (OPENSSL_get_ia32cap(2) & (1u << 30)) != 0; +#endif +} + +inline int CRYPTO_is_AVX512VL_capable() { +#if defined(__AVX512VL__) + return 1; +#else + return (OPENSSL_get_ia32cap(2) & (1u << 31)) != 0; +#endif +} + +// CRYPTO_cpu_avoid_zmm_registers returns 1 if zmm registers (512-bit vectors) +// should not be used even if the CPU supports them. +// +// Note that this reuses the bit for the removed MPX feature. 
+inline int CRYPTO_cpu_avoid_zmm_registers() { + return (OPENSSL_get_ia32cap(2) & (1u << 14)) != 0; +} + +inline int CRYPTO_is_VAES_capable() { +#if defined(__VAES__) + return 1; +#else + return (OPENSSL_get_ia32cap(3) & (1u << 9)) != 0; +#endif +} + +inline int CRYPTO_is_VPCLMULQDQ_capable() { +#if defined(__VPCLMULQDQ__) + return 1; +#else + return (OPENSSL_get_ia32cap(3) & (1u << 10)) != 0; #endif } @@ -1182,11 +1312,40 @@ OPENSSL_INLINE int CRYPTO_is_ADX_capable(void) { #if defined(OPENSSL_ARM) || defined(OPENSSL_AARCH64) -#if defined(OPENSSL_APPLE) && defined(OPENSSL_ARM) -// We do not detect any features at runtime for Apple's 32-bit ARM platforms. On -// 64-bit ARM, we detect some post-ARMv8.0 features. -#define OPENSSL_STATIC_ARMCAP -#endif +// ARMV7_NEON indicates support for NEON. +#define ARMV7_NEON (1 << 0) + +// ARMV8_AES indicates support for hardware AES instructions. +#define ARMV8_AES (1 << 2) + +// ARMV8_SHA1 indicates support for hardware SHA-1 instructions. +#define ARMV8_SHA1 (1 << 3) + +// ARMV8_SHA256 indicates support for hardware SHA-256 instructions. +#define ARMV8_SHA256 (1 << 4) + +// ARMV8_PMULL indicates support for carryless multiplication. +#define ARMV8_PMULL (1 << 5) + +// ARMV8_SHA512 indicates support for hardware SHA-512 instructions. +#define ARMV8_SHA512 (1 << 6) + +// ARMV8_SHA3 indicates support for eor3 instructions. +#define ARMV8_SHA3 (1 << 7) + +#if defined(OPENSSL_STATIC_ARMCAP) +// We assume |CRYPTO_is_*_capable| already checked static capabilities. +inline uint32_t OPENSSL_get_armcap() { return 0; } +#else +// OPENSSL_armcap_P contains ARM CPU capabilities as a bitmask of the above +// constants. This should only be accessed with |OPENSSL_get_armcap|. +extern uint32_t OPENSSL_armcap_P; + +// OPENSSL_get_armcap initializes the library if needed and returns ARM CPU +// capabilities. It is marked as a const function so duplicate calls can be +// merged by the compiler. 
+OPENSSL_ATTR_CONST uint32_t OPENSSL_get_armcap(); +#endif // OPENSSL_STATIC_ARMCAP // Normalize some older feature flags to their modern ACLE values. // https://developer.arm.com/architectures/system-architectures/software-standards/acle @@ -1202,64 +1361,73 @@ OPENSSL_INLINE int CRYPTO_is_ADX_capable(void) { #endif #endif -#if !defined(OPENSSL_STATIC_ARMCAP) -// CRYPTO_is_NEON_capable_at_runtime returns true if the current CPU has a NEON -// unit. Note that |OPENSSL_armcap_P| also exists and contains the same -// information in a form that's easier for assembly to use. -OPENSSL_EXPORT int CRYPTO_is_NEON_capable_at_runtime(void); - -// CRYPTO_is_ARMv8_AES_capable_at_runtime returns true if the current CPU -// supports the ARMv8 AES instruction. -int CRYPTO_is_ARMv8_AES_capable_at_runtime(void); - -// CRYPTO_is_ARMv8_PMULL_capable_at_runtime returns true if the current CPU -// supports the ARMv8 PMULL instruction. -int CRYPTO_is_ARMv8_PMULL_capable_at_runtime(void); -#endif // !OPENSSL_STATIC_ARMCAP - // CRYPTO_is_NEON_capable returns true if the current CPU has a NEON unit. If // this is known statically, it is a constant inline function. 
-OPENSSL_INLINE int CRYPTO_is_NEON_capable(void) { -#if defined(OPENSSL_STATIC_ARMCAP_NEON) || defined(__ARM_NEON) +inline int CRYPTO_is_NEON_capable() { +#if (defined(OPENSSL_STATIC_ARMCAP_NEON) || defined(__ARM_NEON)) && \ + !defined(OPENSSL_NO_STATIC_NEON_FOR_TESTING) return 1; -#elif defined(OPENSSL_STATIC_ARMCAP) - return 0; #else - return CRYPTO_is_NEON_capable_at_runtime(); + return (OPENSSL_get_armcap() & ARMV7_NEON) != 0; #endif } -OPENSSL_INLINE int CRYPTO_is_ARMv8_AES_capable(void) { +inline int CRYPTO_is_ARMv8_AES_capable() { #if defined(OPENSSL_STATIC_ARMCAP_AES) || defined(__ARM_FEATURE_AES) return 1; -#elif defined(OPENSSL_STATIC_ARMCAP) - return 0; #else - return CRYPTO_is_ARMv8_AES_capable_at_runtime(); + return (OPENSSL_get_armcap() & ARMV8_AES) != 0; #endif } -OPENSSL_INLINE int CRYPTO_is_ARMv8_PMULL_capable(void) { +inline int CRYPTO_is_ARMv8_PMULL_capable() { #if defined(OPENSSL_STATIC_ARMCAP_PMULL) || defined(__ARM_FEATURE_AES) return 1; -#elif defined(OPENSSL_STATIC_ARMCAP) - return 0; #else - return CRYPTO_is_ARMv8_PMULL_capable_at_runtime(); + return (OPENSSL_get_armcap() & ARMV8_PMULL) != 0; #endif } -#endif // OPENSSL_ARM || OPENSSL_AARCH64 +inline int CRYPTO_is_ARMv8_SHA1_capable() { + // SHA-1 and SHA-2 (only) share |__ARM_FEATURE_SHA2| but otherwise + // are dealt with independently. +#if defined(OPENSSL_STATIC_ARMCAP_SHA1) || defined(__ARM_FEATURE_SHA2) + return 1; +#else + return (OPENSSL_get_armcap() & ARMV8_SHA1) != 0; +#endif +} + +inline int CRYPTO_is_ARMv8_SHA256_capable() { + // SHA-1 and SHA-2 (only) share |__ARM_FEATURE_SHA2| but otherwise + // are dealt with independently. +#if defined(OPENSSL_STATIC_ARMCAP_SHA256) || defined(__ARM_FEATURE_SHA2) + return 1; +#else + return (OPENSSL_get_armcap() & ARMV8_SHA256) != 0; +#endif +} -#if defined(OPENSSL_PPC64LE) +inline int CRYPTO_is_ARMv8_SHA512_capable() { + // There is no |OPENSSL_STATIC_ARMCAP_SHA512|. 
+#if defined(__ARM_FEATURE_SHA512) + return 1; +#else + return (OPENSSL_get_armcap() & ARMV8_SHA512) != 0; +#endif +} -// CRYPTO_is_PPC64LE_vcrypto_capable returns true iff the current CPU supports -// the Vector.AES category of instructions. -int CRYPTO_is_PPC64LE_vcrypto_capable(void); +inline int CRYPTO_is_ARMv8_SHA3_capable() { + // There is no |OPENSSL_STATIC_ARMCAP_SHA3|. +#if defined(__ARM_FEATURE_SHA3) + return 1; +#else + return (OPENSSL_get_armcap() & ARMV8_SHA3) != 0; +#endif +} -extern unsigned long OPENSSL_ppc64le_hwcap2; +#endif // OPENSSL_ARM || OPENSSL_AARCH64 -#endif // OPENSSL_PPC64LE #if defined(BORINGSSL_DISPATCH_TEST) // Runtime CPU dispatch testing support @@ -1272,12 +1440,263 @@ extern unsigned long OPENSSL_ppc64le_hwcap2; // 3: aes_hw_set_encrypt_key // 4: vpaes_encrypt // 5: vpaes_set_encrypt_key -extern uint8_t BORINGSSL_function_hit[7]; +// 6: aes_gcm_enc_update_vaes_avx2 +// 7: aes_gcm_enc_update_vaes_avx512 +extern "C" uint8_t BORINGSSL_function_hit[8]; #endif // BORINGSSL_DISPATCH_TEST -#if defined(__cplusplus) -} // extern C +// OPENSSL_vasprintf_internal is just like |vasprintf(3)|. If |system_malloc| is +// 0, memory will be allocated with |OPENSSL_malloc| and must be freed with +// |OPENSSL_free|. Otherwise the system |malloc| function is used and the memory +// must be freed with the system |free| function. +OPENSSL_EXPORT int OPENSSL_vasprintf_internal(char **str, const char *format, + va_list args, int system_malloc) + OPENSSL_PRINTF_FORMAT_FUNC(2, 0); + + +// Fuzzer mode. + +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) +// CRYPTO_fuzzer_mode_enabled returns whether fuzzer mode is enabled. See +// |CRYPTO_set_fuzzer_mode|. In non-fuzzer builds, this function statically +// returns zero so the codepaths will be deleted by the optimizer. +int CRYPTO_fuzzer_mode_enabled(); +#else +inline int CRYPTO_fuzzer_mode_enabled() { return 0; } #endif + +// Arithmetic functions. 
+ +// CRYPTO_addc_* returns |x + y + carry|, and sets |*out_carry| to the carry +// bit. |carry| must be zero or one. + +// NOTE: Unoptimized GCC builds may compile these builtins to non-constant-time +// code. For correct constant-time behavior, ensure builds are optimized. +#if OPENSSL_HAS_BUILTIN(__builtin_addc) + +inline unsigned int CRYPTO_addc_impl(unsigned int x, unsigned int y, + unsigned int carry, + unsigned int *out_carry) { + return __builtin_addc(x, y, carry, out_carry); +} + +inline unsigned long CRYPTO_addc_impl(unsigned long x, unsigned long y, + unsigned long carry, + unsigned long *out_carry) { + return __builtin_addcl(x, y, carry, out_carry); +} + +inline unsigned long long CRYPTO_addc_impl(unsigned long long x, + unsigned long long y, + unsigned long long carry, + unsigned long long *out_carry) { + return __builtin_addcll(x, y, carry, out_carry); +} + +inline uint32_t CRYPTO_addc_u32(uint32_t x, uint32_t y, uint32_t carry, + uint32_t *out_carry) { + return CRYPTO_addc_impl(x, y, carry, out_carry); +} + +inline uint64_t CRYPTO_addc_u64(uint64_t x, uint64_t y, uint64_t carry, + uint64_t *out_carry) { + return CRYPTO_addc_impl(x, y, carry, out_carry); +} + +#else + +inline uint32_t CRYPTO_addc_u32(uint32_t x, uint32_t y, uint32_t carry, + uint32_t *out_carry) { + declassify_assert(carry <= 1); +#if defined(_M_IX86) + uint32_t sum = 0; + *out_carry = _addcarry_u32(carry, x, y, &sum); + return sum; +#else + uint64_t ret = carry; + ret += (uint64_t)x + y; + *out_carry = (uint32_t)(ret >> 32); + return (uint32_t)ret; +#endif +} + +inline uint64_t CRYPTO_addc_u64(uint64_t x, uint64_t y, uint64_t carry, + uint64_t *out_carry) { + declassify_assert(carry <= 1); +#if defined(_M_X64) + uint64_t sum = 0; + *out_carry = _addcarry_u64(carry, x, y, &sum); + return sum; +#elif defined(BORINGSSL_HAS_UINT128) + uint128_t ret = carry; + ret += (uint128_t)x + y; + *out_carry = (uint64_t)(ret >> 64); + return (uint64_t)ret; +#else + x += carry; + carry = x < carry; + 
uint64_t ret = x + y; + carry += ret < x; + *out_carry = carry; + return ret; +#endif +} +#endif + + +// CRYPTO_subc_* returns |x - y - borrow|, and sets |*out_borrow| to the borrow +// bit. |borrow| must be zero or one. +#if OPENSSL_HAS_BUILTIN(__builtin_subc) + +inline unsigned int CRYPTO_subc_impl(unsigned int x, unsigned int y, + unsigned int borrow, + unsigned int *out_borrow) { + return __builtin_subc(x, y, borrow, out_borrow); +} + +inline unsigned long CRYPTO_subc_impl(unsigned long x, unsigned long y, + unsigned long borrow, + unsigned long *out_borrow) { + return __builtin_subcl(x, y, borrow, out_borrow); +} + +inline unsigned long long CRYPTO_subc_impl(unsigned long long x, + unsigned long long y, + unsigned long long borrow, + unsigned long long *out_borrow) { + return __builtin_subcll(x, y, borrow, out_borrow); +} + +inline uint32_t CRYPTO_subc_u32(uint32_t x, uint32_t y, uint32_t borrow, + uint32_t *out_borrow) { + return CRYPTO_subc_impl(x, y, borrow, out_borrow); +} + +inline uint64_t CRYPTO_subc_u64(uint64_t x, uint64_t y, uint64_t borrow, + uint64_t *out_borrow) { + return CRYPTO_subc_impl(x, y, borrow, out_borrow); +} + +#else + +inline uint32_t CRYPTO_subc_u32(uint32_t x, uint32_t y, uint32_t borrow, + uint32_t *out_borrow) { + declassify_assert(borrow <= 1); +#if defined(_M_IX86) + uint32_t diff = 0; + *out_borrow = _subborrow_u32(borrow, x, y, &diff); + return diff; +#else + uint32_t ret = x - y - borrow; + *out_borrow = (x < y) | ((x == y) & borrow); + return ret; +#endif +} + +inline uint64_t CRYPTO_subc_u64(uint64_t x, uint64_t y, uint64_t borrow, + uint64_t *out_borrow) { + declassify_assert(borrow <= 1); +#if defined(_M_X64) + uint64_t diff = 0; + *out_borrow = _subborrow_u64(borrow, x, y, &diff); + return diff; +#else + uint64_t ret = x - y - borrow; + *out_borrow = (x < y) | ((x == y) & borrow); + return ret; +#endif +} +#endif + +#if defined(OPENSSL_64_BIT) +#define CRYPTO_addc_w CRYPTO_addc_u64 +#define CRYPTO_subc_w CRYPTO_subc_u64 
+#else +#define CRYPTO_addc_w CRYPTO_addc_u32 +#define CRYPTO_subc_w CRYPTO_subc_u32 +#endif + + +// Cleanup implements a custom scope guard, when the cleanup logic does not fit +// in a destructor. Usage: +// +// bssl::Cleanup cleanup = [&] { SomeCleanupWork(local_var); }; +template <typename F> +class Cleanup { + public: + static_assert(std::is_invocable_v<F>); + static_assert(std::is_same_v<std::invoke_result_t<F>, void>); + + Cleanup(F func) : func_(func) {} + Cleanup(const Cleanup &) = delete; + Cleanup &operator=(const Cleanup &) = delete; + ~Cleanup() { func_(); } + + private: + F func_; +}; +template <typename F> +Cleanup(F func) -> Cleanup<F>; + +// DECLARE_OPAQUE_STRUCT defines a public struct |public_name| with an +// implementation struct |impl_name|. +// +// To prevent accidents, the |public_name| struct will be neither constructable, +// nor copyable/movable, nor deletable. +// +// It must be used from inside the |bssl| namespace; however, |public_name| will +// be defined outside. +// +// Usage: +// +// DECLARE_OPAQUE_STRUCT(public_st, PublicImpl) +// +// BSSL_NAMESPACE_BEGIN +// +// class PublicImpl : public public_st { +// public: +// PublicImpl(); +// ~PublicImpl(); +// void foo(); +// }; +// +// BSSL_NAMESPACE_END +// +// The implementation struct can be converted to the public struct implicitly; +// to convert the public struct to the implementation struct, call +// |FromOpaque| on it. It is explicitly allowed to call |FromOpaque| on a +// |nullptr|. +#define DECLARE_OPAQUE_STRUCT(public_name, impl_name) \ + BSSL_NAMESPACE_BEGIN \ + class impl_name; \ + BSSL_NAMESPACE_END \ + \ + /* This is unnamespaced but assumed to not create linker symbols. 
*/ \ + struct public_name { \ + using ImplType = bssl::impl_name; \ + \ + private: \ + public_name() = default; \ + ~public_name() = default; \ + public_name(const public_name &) = delete; \ + public_name &operator=(const public_name &) = delete; \ + \ + friend class bssl::impl_name; \ + }; + +template <typename Public> +inline typename Public::ImplType *FromOpaque(Public *p) { + return static_cast<typename Public::ImplType *>(p); +} + +template <typename Public> +inline const typename Public::ImplType *FromOpaque(const Public *p) { + return static_cast<const typename Public::ImplType *>(p); +} + +BSSL_NAMESPACE_END + + #endif // OPENSSL_HEADER_CRYPTO_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/kyber/internal.h b/third_party/boringssl/src/crypto/kyber/internal.h new file mode 100644 index 00000000..21ec5afd --- /dev/null +++ b/third_party/boringssl/src/crypto/kyber/internal.h @@ -0,0 +1,165 @@ +// Copyright 2023 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_CRYPTO_KYBER_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_KYBER_INTERNAL_H + +#include <openssl/base.h> + + +BSSL_NAMESPACE_BEGIN + +// Kyber is the pre-standard version of ML-KEM. This was once exported as public +// API, but is now internal and only used by libssl. It will be removed entirely +// in the future. +// +// This implements the round-3 specification of Kyber, defined at +// https://pq-crystals.org/kyber/data/kyber-specification-round3-20210804.pdf + +// KYBER_public_key contains a Kyber768 public key. 
The contents of this +// object should never leave the address space since the format is unstable. +struct KYBER_public_key { + union { + uint8_t bytes[512 * (3 + 9) + 32 + 32]; + uint16_t alignment; + } opaque; +}; + +// KYBER_private_key contains a Kyber768 private key. The contents of this +// object should never leave the address space since the format is unstable. +struct KYBER_private_key { + union { + uint8_t bytes[512 * (3 + 3 + 9) + 32 + 32 + 32]; + uint16_t alignment; + } opaque; +}; + +// KYBER_PUBLIC_KEY_BYTES is the number of bytes in an encoded Kyber768 public +// key. +#define KYBER_PUBLIC_KEY_BYTES 1184 + +// KYBER_SHARED_SECRET_BYTES is the number of bytes in the Kyber768 shared +// secret. Although the round-3 specification has a variable-length output, the +// final ML-KEM construction is expected to use a fixed 32-byte output. To +// simplify the future transition, we apply the same restriction. +#define KYBER_SHARED_SECRET_BYTES 32 + +// KYBER_generate_key generates a random public/private key pair, writes the +// encoded public key to |out_encoded_public_key| and sets |out_private_key| to +// the private key. +OPENSSL_EXPORT void KYBER_generate_key( + uint8_t out_encoded_public_key[KYBER_PUBLIC_KEY_BYTES], + struct KYBER_private_key *out_private_key); + +// KYBER_public_from_private sets |*out_public_key| to the public key that +// corresponds to |private_key|. (This is faster than parsing the output of +// |KYBER_generate_key| if, for some reason, you need to encapsulate to a key +// that was just generated.) +OPENSSL_EXPORT void KYBER_public_from_private( + struct KYBER_public_key *out_public_key, + const struct KYBER_private_key *private_key); + +// KYBER_CIPHERTEXT_BYTES is the number of bytes in the Kyber768 ciphertext. +#define KYBER_CIPHERTEXT_BYTES 1088 + +// KYBER_encap encrypts a random shared secret for |public_key|, writes the +// ciphertext to |out_ciphertext|, and writes the random shared secret to +// |out_shared_secret|. 
+OPENSSL_EXPORT void KYBER_encap( + uint8_t out_ciphertext[KYBER_CIPHERTEXT_BYTES], + uint8_t out_shared_secret[KYBER_SHARED_SECRET_BYTES], + const struct KYBER_public_key *public_key); + +// KYBER_decap decrypts a shared secret from |ciphertext| using |private_key| +// and writes it to |out_shared_secret|. If |ciphertext| is invalid, +// |out_shared_secret| is filled with a key that will always be the same for the +// same |ciphertext| and |private_key|, but which appears to be random unless +// one has access to |private_key|. These alternatives occur in constant time. +// Any subsequent symmetric encryption using |out_shared_secret| must use an +// authenticated encryption scheme in order to discover the decapsulation +// failure. +OPENSSL_EXPORT void KYBER_decap( + uint8_t out_shared_secret[KYBER_SHARED_SECRET_BYTES], + const uint8_t ciphertext[KYBER_CIPHERTEXT_BYTES], + const struct KYBER_private_key *private_key); + + +// Serialisation of keys. + +// KYBER_marshal_public_key serializes |public_key| to |out| in the standard +// format for Kyber public keys. It returns one on success or zero on allocation +// error. +OPENSSL_EXPORT int KYBER_marshal_public_key( + CBB *out, const struct KYBER_public_key *public_key); + +// KYBER_parse_public_key parses a public key, in the format generated by +// |KYBER_marshal_public_key|, from |in| and writes the result to +// |out_public_key|. It returns one on success or zero on parse error or if +// there are trailing bytes in |in|. +OPENSSL_EXPORT int KYBER_parse_public_key( + struct KYBER_public_key *out_public_key, CBS *in); + +// KYBER_marshal_private_key serializes |private_key| to |out| in the standard +// format for Kyber private keys. It returns one on success or zero on +// allocation error. +OPENSSL_EXPORT int KYBER_marshal_private_key( + CBB *out, const struct KYBER_private_key *private_key); + +// KYBER_PRIVATE_KEY_BYTES is the length of the data produced by +// |KYBER_marshal_private_key|. 
+#define KYBER_PRIVATE_KEY_BYTES 2400 + +// KYBER_parse_private_key parses a private key, in the format generated by +// |KYBER_marshal_private_key|, from |in| and writes the result to +// |out_private_key|. It returns one on success or zero on parse error or if +// there are trailing bytes in |in|. +OPENSSL_EXPORT int KYBER_parse_private_key( + struct KYBER_private_key *out_private_key, CBS *in); + + +// Internal symbols. + +// KYBER_ENCAP_ENTROPY is the number of bytes of uniformly random entropy +// necessary to encapsulate a secret. The entropy will be leaked to the +// decapsulating party. +#define KYBER_ENCAP_ENTROPY 32 + +// KYBER_GENERATE_KEY_ENTROPY is the number of bytes of uniformly random entropy +// necessary to generate a key. +#define KYBER_GENERATE_KEY_ENTROPY 64 + +// KYBER_generate_key_external_entropy is a deterministic function to create a +// pair of Kyber768 keys, using the supplied entropy. The entropy needs to be +// generated uniformly at random. This function should only be used for tests; +// regular callers should use the non-deterministic |KYBER_generate_key| +// directly. +OPENSSL_EXPORT void KYBER_generate_key_external_entropy( + uint8_t out_encoded_public_key[KYBER_PUBLIC_KEY_BYTES], + struct KYBER_private_key *out_private_key, + const uint8_t entropy[KYBER_GENERATE_KEY_ENTROPY]); + +// KYBER_encap_external_entropy behaves like |KYBER_encap|, but uses +// |KYBER_ENCAP_ENTROPY| bytes of |entropy| for randomization. The decapsulating +// side will be able to recover |entropy| in full. This function should only be +// used for tests; regular callers should use the non-deterministic +// |KYBER_encap| directly. 
+OPENSSL_EXPORT void KYBER_encap_external_entropy( + uint8_t out_ciphertext[KYBER_CIPHERTEXT_BYTES], + uint8_t out_shared_secret[KYBER_SHARED_SECRET_BYTES], + const struct KYBER_public_key *public_key, + const uint8_t entropy[KYBER_ENCAP_ENTROPY]); + +BSSL_NAMESPACE_END + +#endif // OPENSSL_HEADER_CRYPTO_KYBER_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/kyber/kyber.cc b/third_party/boringssl/src/crypto/kyber/kyber.cc new file mode 100644 index 00000000..f35ca558 --- /dev/null +++ b/third_party/boringssl/src/crypto/kyber/kyber.cc @@ -0,0 +1,870 @@ +// Copyright 2023 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include + +#include +#include + +#include "../fipsmodule/keccak/internal.h" +#include "../internal.h" +#include "./internal.h" + + +// See +// https://pq-crystals.org/kyber/data/kyber-specification-round3-20210804.pdf + +using namespace bssl; + +static void prf(uint8_t *out, size_t out_len, const uint8_t in[33]) { + BORINGSSL_keccak(out, out_len, in, 33, boringssl_shake256); +} + +static void hash_h(uint8_t out[32], const uint8_t *in, size_t len) { + BORINGSSL_keccak(out, 32, in, len, boringssl_sha3_256); +} + +static void hash_g(uint8_t out[64], const uint8_t *in, size_t len) { + BORINGSSL_keccak(out, 64, in, len, boringssl_sha3_512); +} + +static void kdf(uint8_t *out, size_t out_len, const uint8_t *in, size_t len) { + BORINGSSL_keccak(out, out_len, in, len, boringssl_shake256); +} + +#define DEGREE 256 +#define RANK 3 + +static const size_t kBarrettMultiplier = 5039; +static const unsigned kBarrettShift = 24; +static const uint16_t kPrime = 3329; +static const int kLog2Prime = 12; +static const uint16_t kHalfPrime = (/*kPrime=*/3329 - 1) / 2; +static const int kDU = 10; +static const int kDV = 4; +// kInverseDegree is 128^-1 mod 3329; 128 because kPrime does not have a 512th +// root of unity. +static const uint16_t kInverseDegree = 3303; +static const size_t kEncodedVectorSize = + (/*kLog2Prime=*/12 * DEGREE / 8) * RANK; +static const size_t kCompressedVectorSize = /*kDU=*/10 * RANK * DEGREE / 8; + +typedef struct scalar { + // On every function entry and exit, 0 <= c < kPrime. 
+ uint16_t c[DEGREE]; +} scalar; + +typedef struct vector { + scalar v[RANK]; +} vector; + +typedef struct matrix { + scalar v[RANK][RANK]; +} matrix; + +// This bit of Python will be referenced in some of the following comments: +// +// p = 3329 +// +// def bitreverse(i): +// ret = 0 +// for n in range(7): +// bit = i & 1 +// ret <<= 1 +// ret |= bit +// i >>= 1 +// return ret + +// kNTTRoots = [pow(17, bitreverse(i), p) for i in range(128)] +static const uint16_t kNTTRoots[128] = { + 1, 1729, 2580, 3289, 2642, 630, 1897, 848, 1062, 1919, 193, 797, + 2786, 3260, 569, 1746, 296, 2447, 1339, 1476, 3046, 56, 2240, 1333, + 1426, 2094, 535, 2882, 2393, 2879, 1974, 821, 289, 331, 3253, 1756, + 1197, 2304, 2277, 2055, 650, 1977, 2513, 632, 2865, 33, 1320, 1915, + 2319, 1435, 807, 452, 1438, 2868, 1534, 2402, 2647, 2617, 1481, 648, + 2474, 3110, 1227, 910, 17, 2761, 583, 2649, 1637, 723, 2288, 1100, + 1409, 2662, 3281, 233, 756, 2156, 3015, 3050, 1703, 1651, 2789, 1789, + 1847, 952, 1461, 2687, 939, 2308, 2437, 2388, 733, 2337, 268, 641, + 1584, 2298, 2037, 3220, 375, 2549, 2090, 1645, 1063, 319, 2773, 757, + 2099, 561, 2466, 2594, 2804, 1092, 403, 1026, 1143, 2150, 2775, 886, + 1722, 1212, 1874, 1029, 2110, 2935, 885, 2154, +}; + +// kInverseNTTRoots = [pow(17, -bitreverse(i), p) for i in range(128)] +static const uint16_t kInverseNTTRoots[128] = { + 1, 1600, 40, 749, 2481, 1432, 2699, 687, 1583, 2760, 69, 543, + 2532, 3136, 1410, 2267, 2508, 1355, 450, 936, 447, 2794, 1235, 1903, + 1996, 1089, 3273, 283, 1853, 1990, 882, 3033, 2419, 2102, 219, 855, + 2681, 1848, 712, 682, 927, 1795, 461, 1891, 2877, 2522, 1894, 1010, + 1414, 2009, 3296, 464, 2697, 816, 1352, 2679, 1274, 1052, 1025, 2132, + 1573, 76, 2998, 3040, 1175, 2444, 394, 1219, 2300, 1455, 2117, 1607, + 2443, 554, 1179, 2186, 2303, 2926, 2237, 525, 735, 863, 2768, 1230, + 2572, 556, 3010, 2266, 1684, 1239, 780, 2954, 109, 1292, 1031, 1745, + 2688, 3061, 992, 2596, 941, 892, 1021, 2390, 642, 1868, 2377, 1482, + 
1540, 540, 1678, 1626, 279, 314, 1173, 2573, 3096, 48, 667, 1920, + 2229, 1041, 2606, 1692, 680, 2746, 568, 3312, +}; + +// kModRoots = [pow(17, 2*bitreverse(i) + 1, p) for i in range(128)] +static const uint16_t kModRoots[128] = { + 17, 3312, 2761, 568, 583, 2746, 2649, 680, 1637, 1692, 723, 2606, + 2288, 1041, 1100, 2229, 1409, 1920, 2662, 667, 3281, 48, 233, 3096, + 756, 2573, 2156, 1173, 3015, 314, 3050, 279, 1703, 1626, 1651, 1678, + 2789, 540, 1789, 1540, 1847, 1482, 952, 2377, 1461, 1868, 2687, 642, + 939, 2390, 2308, 1021, 2437, 892, 2388, 941, 733, 2596, 2337, 992, + 268, 3061, 641, 2688, 1584, 1745, 2298, 1031, 2037, 1292, 3220, 109, + 375, 2954, 2549, 780, 2090, 1239, 1645, 1684, 1063, 2266, 319, 3010, + 2773, 556, 757, 2572, 2099, 1230, 561, 2768, 2466, 863, 2594, 735, + 2804, 525, 1092, 2237, 403, 2926, 1026, 2303, 1143, 2186, 2150, 1179, + 2775, 554, 886, 2443, 1722, 1607, 1212, 2117, 1874, 1455, 1029, 2300, + 2110, 1219, 2935, 394, 885, 2444, 2154, 1175, +}; + +// reduce_once reduces 0 <= x < 2*kPrime, mod kPrime. +static uint16_t reduce_once(uint16_t x) { + declassify_assert(x < 2 * kPrime); + const uint16_t subtracted = x - kPrime; + uint16_t mask = 0u - (subtracted >> 15); + // Although this is a constant-time select, we omit a value barrier here. + // Value barriers impede auto-vectorization (likely because it forces the + // value to transit through a general-purpose register). On AArch64, this is a + // difference of 2x. + // + // We usually add value barriers to selects because Clang turns consecutive + // selects with the same condition into a branch instead of CMOV/CSEL. This + // condition does not occur in Kyber, so omitting it seems to be safe so far, + // but see |scalar_centered_binomial_distribution_eta_2_with_prf|. + return (mask & x) | (~mask & subtracted); +} + +// constant time reduce x mod kPrime using Barrett reduction. x must be less +// than kPrime + 2×kPrime². 
+static uint16_t reduce(uint32_t x) { + declassify_assert(x < kPrime + 2u * kPrime * kPrime); + uint64_t product = (uint64_t)x * kBarrettMultiplier; + uint32_t quotient = (uint32_t)(product >> kBarrettShift); + uint32_t remainder = x - quotient * kPrime; + return reduce_once(remainder); +} + +static void scalar_zero(scalar *out) { OPENSSL_memset(out, 0, sizeof(*out)); } + +static void vector_zero(vector *out) { OPENSSL_memset(out, 0, sizeof(*out)); } + +// In place number theoretic transform of a given scalar. +// Note that Kyber's kPrime 3329 does not have a 512th root of unity, so this +// transform leaves off the last iteration of the usual FFT code, with the 128 +// relevant roots of unity being stored in |kNTTRoots|. This means the output +// should be seen as 128 elements in GF(3329^2), with the coefficients of the +// elements being consecutive entries in |s->c|. +static void scalar_ntt(scalar *s) { + int offset = DEGREE; + // `int` is used here because using `size_t` throughout caused a ~5% slowdown + // with Clang 14 on AArch64. + for (int step = 1; step < DEGREE / 2; step <<= 1) { + offset >>= 1; + int k = 0; + for (int i = 0; i < step; i++) { + const uint32_t step_root = kNTTRoots[i + step]; + for (int j = k; j < k + offset; j++) { + uint16_t odd = reduce(step_root * s->c[j + offset]); + uint16_t even = s->c[j]; + s->c[j] = reduce_once(odd + even); + s->c[j + offset] = reduce_once(even - odd + kPrime); + } + k += 2 * offset; + } + } +} + +static void vector_ntt(vector *a) { + for (int i = 0; i < RANK; i++) { + scalar_ntt(&a->v[i]); + } +} + +// In place inverse number theoretic transform of a given scalar, with pairs of +// entries of |s->c| being interpreted as elements of GF(3329^2). Just as with the +// number theoretic transform, this leaves off the first step of the normal iFFT +// to account for the fact that 3329 does not have a 512th root of unity, using +// the precomputed 128 roots of unity stored in |kInverseNTTRoots|. 
+static void scalar_inverse_ntt(scalar *s) { + int step = DEGREE / 2; + // `int` is used here because using `size_t` throughout caused a ~5% slowdown + // with Clang 14 on AArch64. + for (int offset = 2; offset < DEGREE; offset <<= 1) { + step >>= 1; + int k = 0; + for (int i = 0; i < step; i++) { + uint32_t step_root = kInverseNTTRoots[i + step]; + for (int j = k; j < k + offset; j++) { + uint16_t odd = s->c[j + offset]; + uint16_t even = s->c[j]; + s->c[j] = reduce_once(odd + even); + s->c[j + offset] = reduce(step_root * (even - odd + kPrime)); + } + k += 2 * offset; + } + } + for (int i = 0; i < DEGREE; i++) { + s->c[i] = reduce(s->c[i] * kInverseDegree); + } +} + +static void vector_inverse_ntt(vector *a) { + for (int i = 0; i < RANK; i++) { + scalar_inverse_ntt(&a->v[i]); + } +} + +static void scalar_add(scalar *lhs, const scalar *rhs) { + for (int i = 0; i < DEGREE; i++) { + lhs->c[i] = reduce_once(lhs->c[i] + rhs->c[i]); + } +} + +static void scalar_sub(scalar *lhs, const scalar *rhs) { + for (int i = 0; i < DEGREE; i++) { + lhs->c[i] = reduce_once(lhs->c[i] - rhs->c[i] + kPrime); + } +} + +// Multiplying two scalars in the number theoretically transformed state. Since +// 3329 does not have a 512th root of unity, this means we have to interpret +// the 2*ith and (2*i+1)th entries of the scalar as elements of GF(3329)[X]/(X^2 +// - 17^(2*bitreverse(i)+1)). The value of 17^(2*bitreverse(i)+1) mod 3329 is +// stored in the precomputed |kModRoots| table. Note that our Barrett transform +// only allows us to multiply two reduced numbers together, so we need some +// intermediate reduction steps, even if a uint64_t could hold 3 multiplied +// numbers. 
+static void scalar_mult(scalar *out, const scalar *lhs, const scalar *rhs) { + for (int i = 0; i < DEGREE / 2; i++) { + uint32_t real_real = (uint32_t)lhs->c[2 * i] * rhs->c[2 * i]; + uint32_t img_img = (uint32_t)lhs->c[2 * i + 1] * rhs->c[2 * i + 1]; + uint32_t real_img = (uint32_t)lhs->c[2 * i] * rhs->c[2 * i + 1]; + uint32_t img_real = (uint32_t)lhs->c[2 * i + 1] * rhs->c[2 * i]; + out->c[2 * i] = + reduce(real_real + (uint32_t)reduce(img_img) * kModRoots[i]); + out->c[2 * i + 1] = reduce(img_real + real_img); + } +} + +static void vector_add(vector *lhs, const vector *rhs) { + for (int i = 0; i < RANK; i++) { + scalar_add(&lhs->v[i], &rhs->v[i]); + } +} + +static void matrix_mult(vector *out, const matrix *m, const vector *a) { + vector_zero(out); + for (int i = 0; i < RANK; i++) { + for (int j = 0; j < RANK; j++) { + scalar product; + scalar_mult(&product, &m->v[i][j], &a->v[j]); + scalar_add(&out->v[i], &product); + } + } +} + +static void matrix_mult_transpose(vector *out, const matrix *m, + const vector *a) { + vector_zero(out); + for (int i = 0; i < RANK; i++) { + for (int j = 0; j < RANK; j++) { + scalar product; + scalar_mult(&product, &m->v[j][i], &a->v[j]); + scalar_add(&out->v[i], &product); + } + } +} + +static void scalar_inner_product(scalar *out, const vector *lhs, + const vector *rhs) { + scalar_zero(out); + for (int i = 0; i < RANK; i++) { + scalar product; + scalar_mult(&product, &lhs->v[i], &rhs->v[i]); + scalar_add(out, &product); + } +} + +// Algorithm 1 of the Kyber spec. Rejection samples a Keccak stream to get +// uniformly distributed elements. This is used for matrix expansion and only +// operates on public inputs. 
+static void scalar_from_keccak_vartime(scalar *out, + struct BORINGSSL_keccak_st *keccak_ctx) { + assert(keccak_ctx->squeeze_offset == 0); + assert(keccak_ctx->rate_bytes == 168); + static_assert(168 % 3 == 0, "block and coefficient boundaries do not align"); + + int done = 0; + while (done < DEGREE) { + uint8_t block[168]; + BORINGSSL_keccak_squeeze(keccak_ctx, block, sizeof(block)); + for (size_t i = 0; i < sizeof(block) && done < DEGREE; i += 3) { + uint16_t d1 = block[i] + 256 * (block[i + 1] % 16); + uint16_t d2 = block[i + 1] / 16 + 16 * block[i + 2]; + if (d1 < kPrime) { + out->c[done++] = d1; + } + if (d2 < kPrime && done < DEGREE) { + out->c[done++] = d2; + } + } + } +} + +// Algorithm 2 of the Kyber spec, with eta fixed to two and the PRF call +// included. Creates binomially distributed elements by sampling 2*|eta| bits, +// and setting the coefficient to the count of the first bits minus the count of +// the second bits, resulting in a centered binomial distribution. Since eta is +// two this gives -2/2 with a probability of 1/16, -1/1 with probability 1/4, +// and 0 with probability 3/8. +static void scalar_centered_binomial_distribution_eta_2_with_prf( + scalar *out, const uint8_t input[33]) { + uint8_t entropy[128]; + static_assert(sizeof(entropy) == 2 * /*kEta=*/2 * DEGREE / 8); + prf(entropy, sizeof(entropy), input); + + for (int i = 0; i < DEGREE; i += 2) { + uint8_t byte = entropy[i / 2]; + + uint16_t value = (byte & 1) + ((byte >> 1) & 1); + value -= ((byte >> 2) & 1) + ((byte >> 3) & 1); + // Add |kPrime| if |value| underflowed. See |reduce_once| for a discussion + // on why the value barrier is omitted. While this could have been written + // reduce_once(value + kPrime), this is one extra addition and the small range + // of |value| tempts some versions of Clang to emit a branch. 
+ uint16_t mask = 0u - (value >> 15); + out->c[i] = value + (kPrime & mask); + + byte >>= 4; + value = (byte & 1) + ((byte >> 1) & 1); + value -= ((byte >> 2) & 1) + ((byte >> 3) & 1); + // See above. + mask = 0u - (value >> 15); + out->c[i + 1] = value + (kPrime & mask); + } +} + +// Generates a secret vector by using +// |scalar_centered_binomial_distribution_eta_2_with_prf| with the given seed, +// appending and incrementing |counter| for each entry of the vector. +static void vector_generate_secret_eta_2(vector *out, uint8_t *counter, + const uint8_t seed[32]) { + uint8_t input[33]; + OPENSSL_memcpy(input, seed, 32); + for (int i = 0; i < RANK; i++) { + input[32] = (*counter)++; + scalar_centered_binomial_distribution_eta_2_with_prf(&out->v[i], input); + } +} + +// Expands the matrix from a seed for key generation and for encaps-CPA. +static void matrix_expand(matrix *out, const uint8_t rho[32]) { + uint8_t input[34]; + OPENSSL_memcpy(input, rho, 32); + for (int i = 0; i < RANK; i++) { + for (int j = 0; j < RANK; j++) { + input[32] = i; + input[33] = j; + struct BORINGSSL_keccak_st keccak_ctx; + BORINGSSL_keccak_init(&keccak_ctx, boringssl_shake128); + BORINGSSL_keccak_absorb(&keccak_ctx, input, sizeof(input)); + scalar_from_keccak_vartime(&out->v[i][j], &keccak_ctx); + } + } +} + +static const uint8_t kMasks[8] = {0x01, 0x03, 0x07, 0x0f, + 0x1f, 0x3f, 0x7f, 0xff}; + +static void scalar_encode(uint8_t *out, const scalar *s, int bits) { + assert(bits <= (int)sizeof(*s->c) * 8 && bits != 1); + + uint8_t out_byte = 0; + int out_byte_bits = 0; + + for (int i = 0; i < DEGREE; i++) { + uint16_t element = s->c[i]; + int element_bits_done = 0; + + while (element_bits_done < bits) { + int chunk_bits = bits - element_bits_done; + int out_bits_remaining = 8 - out_byte_bits; + if (chunk_bits >= out_bits_remaining) { + chunk_bits = out_bits_remaining; + out_byte |= (element & kMasks[chunk_bits - 1]) << out_byte_bits; + *out = out_byte; + out++; + out_byte_bits = 0; + out_byte = 
0; + } else { + out_byte |= (element & kMasks[chunk_bits - 1]) << out_byte_bits; + out_byte_bits += chunk_bits; + } + + element_bits_done += chunk_bits; + element >>= chunk_bits; + } + } + + if (out_byte_bits > 0) { + *out = out_byte; + } +} + +// scalar_encode_1 is |scalar_encode| specialised for |bits| == 1. +static void scalar_encode_1(uint8_t out[32], const scalar *s) { + for (int i = 0; i < DEGREE; i += 8) { + uint8_t out_byte = 0; + for (int j = 0; j < 8; j++) { + out_byte |= (s->c[i + j] & 1) << j; + } + *out = out_byte; + out++; + } +} + +// Encodes an entire vector into 32*|RANK|*|bits| bytes. Note that since 256 +// (DEGREE) is divisible by 8, the individual vector entries will always fill a +// whole number of bytes, so we do not need to worry about bit packing here. +static void vector_encode(uint8_t *out, const vector *a, int bits) { + for (int i = 0; i < RANK; i++) { + scalar_encode(out + i * bits * DEGREE / 8, &a->v[i], bits); + } +} + +// scalar_decode parses |DEGREE * bits| bits from |in| into |DEGREE| values in +// |out|. It returns one on success and zero if any parsed value is >= +// |kPrime|. +static int scalar_decode(scalar *out, const uint8_t *in, int bits) { + assert(bits <= (int)sizeof(*out->c) * 8 && bits != 1); + + uint8_t in_byte = 0; + int in_byte_bits_left = 0; + + for (int i = 0; i < DEGREE; i++) { + uint16_t element = 0; + int element_bits_done = 0; + + while (element_bits_done < bits) { + if (in_byte_bits_left == 0) { + in_byte = *in; + in++; + in_byte_bits_left = 8; + } + + int chunk_bits = bits - element_bits_done; + if (chunk_bits > in_byte_bits_left) { + chunk_bits = in_byte_bits_left; + } + + element |= (in_byte & kMasks[chunk_bits - 1]) << element_bits_done; + in_byte_bits_left -= chunk_bits; + in_byte >>= chunk_bits; + + element_bits_done += chunk_bits; + } + + // An element is only out of range in the case of invalid input, in which + // case it is okay to leak the comparison. 
+ if (constant_time_declassify_int(element >= kPrime)) { + return 0; + } + out->c[i] = element; + } + + return 1; +} + +// scalar_decode_1 is |scalar_decode| specialised for |bits| == 1. +static void scalar_decode_1(scalar *out, const uint8_t in[32]) { + for (int i = 0; i < DEGREE; i += 8) { + uint8_t in_byte = *in; + in++; + for (int j = 0; j < 8; j++) { + out->c[i + j] = in_byte & 1; + in_byte >>= 1; + } + } +} + +// Decodes 32*|RANK|*|bits| bytes from |in| into |out|. It returns one on +// success or zero if any parsed value is >= |kPrime|. +static int vector_decode(vector *out, const uint8_t *in, int bits) { + for (int i = 0; i < RANK; i++) { + if (!scalar_decode(&out->v[i], in + i * bits * DEGREE / 8, bits)) { + return 0; + } + } + return 1; +} + +// Compresses (lossily) an input |x| mod 3329 into |bits| many bits by grouping +// numbers close to each other together. The formula used is +// round(2^|bits|/kPrime*x) mod 2^|bits|. +// Uses Barrett reduction to achieve constant time. Since we need both the +// remainder (for rounding) and the quotient (as the result), we cannot use +// |reduce| here, but need to do the Barrett reduction directly. +static uint16_t compress(uint16_t x, int bits) { + uint32_t shifted = (uint32_t)x << bits; + uint64_t product = (uint64_t)shifted * kBarrettMultiplier; + uint32_t quotient = (uint32_t)(product >> kBarrettShift); + uint32_t remainder = shifted - quotient * kPrime; + + // Adjust the quotient to round correctly: + // 0 <= remainder <= kHalfPrime round to 0 + // kHalfPrime < remainder <= kPrime + kHalfPrime round to 1 + // kPrime + kHalfPrime < remainder < 2 * kPrime round to 2 + declassify_assert(remainder < 2u * kPrime); + quotient += 1 & constant_time_lt_w(kHalfPrime, remainder); + quotient += 1 & constant_time_lt_w(kPrime + kHalfPrime, remainder); + return quotient & ((1 << bits) - 1); +} + +// Decompresses |x| by using an equi-distant representative. The formula is +// round(kPrime/2^|bits|*x). 
Note that 2^|bits| being the divisor allows us to +// implement this logic using only bit operations. +static uint16_t decompress(uint16_t x, int bits) { + uint32_t product = (uint32_t)x * kPrime; + uint32_t power = 1 << bits; + // This is |product| % power, since |power| is a power of 2. + uint32_t remainder = product & (power - 1); + // This is |product| / power, since |power| is a power of 2. + uint32_t lower = product >> bits; + // The rounding logic works since the first half of numbers mod |power| have a + // 0 as first bit, and the second half has a 1 as first bit, since |power| is + // a power of 2. As a 12 bit number, |remainder| is always positive, so we + // will shift in 0s for a right shift. + return lower + (remainder >> (bits - 1)); +} + +static void scalar_compress(scalar *s, int bits) { + for (int i = 0; i < DEGREE; i++) { + s->c[i] = compress(s->c[i], bits); + } +} + +static void scalar_decompress(scalar *s, int bits) { + for (int i = 0; i < DEGREE; i++) { + s->c[i] = decompress(s->c[i], bits); + } +} + +static void vector_compress(vector *a, int bits) { + for (int i = 0; i < RANK; i++) { + scalar_compress(&a->v[i], bits); + } +} + +static void vector_decompress(vector *a, int bits) { + for (int i = 0; i < RANK; i++) { + scalar_decompress(&a->v[i], bits); + } +} + +namespace { + +struct public_key { + vector t; + uint8_t rho[32]; + uint8_t public_key_hash[32]; + matrix m; +}; + +static struct public_key *public_key_from_external( + const struct KYBER_public_key *external) { + static_assert(sizeof(struct KYBER_public_key) >= sizeof(struct public_key), + "Kyber public key is too small"); + static_assert(alignof(struct KYBER_public_key) >= alignof(struct public_key), + "Kyber public key align incorrect"); + return (struct public_key *)external; +} + +struct private_key { + struct public_key pub; + vector s; + uint8_t fo_failure_secret[32]; +}; + +static struct private_key *private_key_from_external( + const struct KYBER_private_key *external) { + 
static_assert(sizeof(struct KYBER_private_key) >= sizeof(struct private_key), + "Kyber private key too small"); + static_assert( + alignof(struct KYBER_private_key) >= alignof(struct private_key), + "Kyber private key align incorrect"); + return (struct private_key *)external; +} + +} // namespace + +// Calls |KYBER_generate_key_external_entropy| with random bytes from +// |RAND_bytes|. +void bssl::KYBER_generate_key( + uint8_t out_encoded_public_key[KYBER_PUBLIC_KEY_BYTES], + struct KYBER_private_key *out_private_key) { + uint8_t entropy[KYBER_GENERATE_KEY_ENTROPY]; + RAND_bytes(entropy, sizeof(entropy)); + CONSTTIME_SECRET(entropy, sizeof(entropy)); + KYBER_generate_key_external_entropy(out_encoded_public_key, out_private_key, + entropy); +} + +static int kyber_marshal_public_key(CBB *out, const struct public_key *pub) { + uint8_t *vector_output; + if (!CBB_add_space(out, &vector_output, kEncodedVectorSize)) { + return 0; + } + vector_encode(vector_output, &pub->t, kLog2Prime); + if (!CBB_add_bytes(out, pub->rho, sizeof(pub->rho))) { + return 0; + } + return 1; +} + +// Algorithms 4 and 7 of the Kyber spec. Algorithms are combined since key +// generation is not part of the FO transform, and the spec uses Algorithm 7 to +// specify the actual key format. +void bssl::KYBER_generate_key_external_entropy( + uint8_t out_encoded_public_key[KYBER_PUBLIC_KEY_BYTES], + struct KYBER_private_key *out_private_key, + const uint8_t entropy[KYBER_GENERATE_KEY_ENTROPY]) { + struct private_key *priv = private_key_from_external(out_private_key); + uint8_t hashed[64]; + hash_g(hashed, entropy, 32); + const uint8_t *const rho = hashed; + const uint8_t *const sigma = hashed + 32; + // rho is public. 
+ CONSTTIME_DECLASSIFY(rho, 32); + OPENSSL_memcpy(priv->pub.rho, hashed, sizeof(priv->pub.rho)); + matrix_expand(&priv->pub.m, rho); + uint8_t counter = 0; + vector_generate_secret_eta_2(&priv->s, &counter, sigma); + vector_ntt(&priv->s); + vector error; + vector_generate_secret_eta_2(&error, &counter, sigma); + vector_ntt(&error); + matrix_mult_transpose(&priv->pub.t, &priv->pub.m, &priv->s); + vector_add(&priv->pub.t, &error); + // t is part of the public key and thus is public. + CONSTTIME_DECLASSIFY(&priv->pub.t, sizeof(priv->pub.t)); + + CBB cbb; + CBB_init_fixed(&cbb, out_encoded_public_key, KYBER_PUBLIC_KEY_BYTES); + if (!kyber_marshal_public_key(&cbb, &priv->pub)) { + abort(); + } + + hash_h(priv->pub.public_key_hash, out_encoded_public_key, + KYBER_PUBLIC_KEY_BYTES); + OPENSSL_memcpy(priv->fo_failure_secret, entropy + 32, 32); +} + +void bssl::KYBER_public_from_private( + struct KYBER_public_key *out_public_key, + const struct KYBER_private_key *private_key) { + struct public_key *const pub = public_key_from_external(out_public_key); + const struct private_key *const priv = private_key_from_external(private_key); + *pub = priv->pub; +} + +// Algorithm 5 of the Kyber spec. Encrypts a message with given randomness to +// the ciphertext in |out|. Without applying the Fujisaki-Okamoto transform this +// would not result in a CCA secure scheme, since lattice schemes are vulnerable +// to decryption failure oracles. 
+static void encrypt_cpa(uint8_t out[KYBER_CIPHERTEXT_BYTES], + const struct public_key *pub, const uint8_t message[32], + const uint8_t randomness[32]) { + uint8_t counter = 0; + vector secret; + vector_generate_secret_eta_2(&secret, &counter, randomness); + vector_ntt(&secret); + vector error; + vector_generate_secret_eta_2(&error, &counter, randomness); + uint8_t input[33]; + OPENSSL_memcpy(input, randomness, 32); + input[32] = counter; + scalar scalar_error; + scalar_centered_binomial_distribution_eta_2_with_prf(&scalar_error, input); + vector u; + matrix_mult(&u, &pub->m, &secret); + vector_inverse_ntt(&u); + vector_add(&u, &error); + scalar v; + scalar_inner_product(&v, &pub->t, &secret); + scalar_inverse_ntt(&v); + scalar_add(&v, &scalar_error); + scalar expanded_message; + scalar_decode_1(&expanded_message, message); + scalar_decompress(&expanded_message, 1); + scalar_add(&v, &expanded_message); + vector_compress(&u, kDU); + vector_encode(out, &u, kDU); + scalar_compress(&v, kDV); + scalar_encode(out + kCompressedVectorSize, &v, kDV); +} + +// Calls KYBER_encap_external_entropy| with random bytes from |RAND_bytes| +void bssl::KYBER_encap(uint8_t out_ciphertext[KYBER_CIPHERTEXT_BYTES], + uint8_t out_shared_secret[KYBER_SHARED_SECRET_BYTES], + const struct KYBER_public_key *public_key) { + uint8_t entropy[KYBER_ENCAP_ENTROPY]; + RAND_bytes(entropy, KYBER_ENCAP_ENTROPY); + CONSTTIME_SECRET(entropy, KYBER_ENCAP_ENTROPY); + KYBER_encap_external_entropy(out_ciphertext, out_shared_secret, public_key, + entropy); +} + +// Algorithm 8 of the Kyber spec, safe for line 2 of the spec. The spec there +// hashes the output of the system's random number generator, since the FO +// transform will reveal it to the decrypting party. There is no reason to do +// this when a secure random number generator is used. When an insecure random +// number generator is used, the caller should switch to a secure one before +// calling this method. 
+void bssl::KYBER_encap_external_entropy( + uint8_t out_ciphertext[KYBER_CIPHERTEXT_BYTES], + uint8_t out_shared_secret[KYBER_SHARED_SECRET_BYTES], + const struct KYBER_public_key *public_key, + const uint8_t entropy[KYBER_ENCAP_ENTROPY]) { + const struct public_key *pub = public_key_from_external(public_key); + uint8_t input[64]; + OPENSSL_memcpy(input, entropy, KYBER_ENCAP_ENTROPY); + OPENSSL_memcpy(input + KYBER_ENCAP_ENTROPY, pub->public_key_hash, + sizeof(input) - KYBER_ENCAP_ENTROPY); + uint8_t prekey_and_randomness[64]; + hash_g(prekey_and_randomness, input, sizeof(input)); + encrypt_cpa(out_ciphertext, pub, entropy, prekey_and_randomness + 32); + // The ciphertext is public. + CONSTTIME_DECLASSIFY(out_ciphertext, KYBER_CIPHERTEXT_BYTES); + hash_h(prekey_and_randomness + 32, out_ciphertext, KYBER_CIPHERTEXT_BYTES); + kdf(out_shared_secret, KYBER_SHARED_SECRET_BYTES, prekey_and_randomness, + sizeof(prekey_and_randomness)); +} + +// Algorithm 6 of the Kyber spec. +static void decrypt_cpa(uint8_t out[32], const struct private_key *priv, + const uint8_t ciphertext[KYBER_CIPHERTEXT_BYTES]) { + vector u; + vector_decode(&u, ciphertext, kDU); + vector_decompress(&u, kDU); + vector_ntt(&u); + scalar v; + scalar_decode(&v, ciphertext + kCompressedVectorSize, kDV); + scalar_decompress(&v, kDV); + scalar mask; + scalar_inner_product(&mask, &priv->s, &u); + scalar_inverse_ntt(&mask); + scalar_sub(&v, &mask); + scalar_compress(&v, 1); + scalar_encode_1(out, &v); +} + +// Algorithm 9 of the Kyber spec, performing the FO transform by running +// encrypt_cpa on the decrypted message. The spec does not allow the decryption +// failure to be passed on to the caller, and instead returns a result that is +// deterministic but unpredictable to anyone without knowledge of the private +// key. 
+void bssl::KYBER_decap(uint8_t out_shared_secret[KYBER_SHARED_SECRET_BYTES], + const uint8_t ciphertext[KYBER_CIPHERTEXT_BYTES], + const struct KYBER_private_key *private_key) { + const struct private_key *priv = private_key_from_external(private_key); + uint8_t decrypted[64]; + decrypt_cpa(decrypted, priv, ciphertext); + OPENSSL_memcpy(decrypted + 32, priv->pub.public_key_hash, + sizeof(decrypted) - 32); + uint8_t prekey_and_randomness[64]; + hash_g(prekey_and_randomness, decrypted, sizeof(decrypted)); + uint8_t expected_ciphertext[KYBER_CIPHERTEXT_BYTES]; + encrypt_cpa(expected_ciphertext, &priv->pub, decrypted, + prekey_and_randomness + 32); + uint8_t mask = + constant_time_eq_int_8(CRYPTO_memcmp(ciphertext, expected_ciphertext, + sizeof(expected_ciphertext)), + 0); + uint8_t input[64]; + for (int i = 0; i < 32; i++) { + input[i] = constant_time_select_8(mask, prekey_and_randomness[i], + priv->fo_failure_secret[i]); + } + hash_h(input + 32, ciphertext, KYBER_CIPHERTEXT_BYTES); + kdf(out_shared_secret, KYBER_SHARED_SECRET_BYTES, input, sizeof(input)); +} + +int bssl::KYBER_marshal_public_key(CBB *out, + const struct KYBER_public_key *public_key) { + return kyber_marshal_public_key(out, public_key_from_external(public_key)); +} + +// kyber_parse_public_key_no_hash parses |in| into |pub| but doesn't calculate +// the value of |pub->public_key_hash|. 
+static int kyber_parse_public_key_no_hash(struct public_key *pub, CBS *in) { + CBS t_bytes; + if (!CBS_get_bytes(in, &t_bytes, kEncodedVectorSize) || + !vector_decode(&pub->t, CBS_data(&t_bytes), kLog2Prime) || + !CBS_copy_bytes(in, pub->rho, sizeof(pub->rho))) { + return 0; + } + matrix_expand(&pub->m, pub->rho); + return 1; +} + +int bssl::KYBER_parse_public_key(struct KYBER_public_key *public_key, CBS *in) { + struct public_key *pub = public_key_from_external(public_key); + CBS orig_in = *in; + if (!kyber_parse_public_key_no_hash(pub, in) || // + CBS_len(in) != 0) { + return 0; + } + hash_h(pub->public_key_hash, CBS_data(&orig_in), CBS_len(&orig_in)); + return 1; +} + +int bssl::KYBER_marshal_private_key( + CBB *out, const struct KYBER_private_key *private_key) { + const struct private_key *const priv = private_key_from_external(private_key); + uint8_t *s_output; + if (!CBB_add_space(out, &s_output, kEncodedVectorSize)) { + return 0; + } + vector_encode(s_output, &priv->s, kLog2Prime); + if (!kyber_marshal_public_key(out, &priv->pub) || + !CBB_add_bytes(out, priv->pub.public_key_hash, + sizeof(priv->pub.public_key_hash)) || + !CBB_add_bytes(out, priv->fo_failure_secret, + sizeof(priv->fo_failure_secret))) { + return 0; + } + return 1; +} + +int bssl::KYBER_parse_private_key(struct KYBER_private_key *out_private_key, + CBS *in) { + struct private_key *const priv = private_key_from_external(out_private_key); + + CBS s_bytes; + if (!CBS_get_bytes(in, &s_bytes, kEncodedVectorSize) || + !vector_decode(&priv->s, CBS_data(&s_bytes), kLog2Prime) || + !kyber_parse_public_key_no_hash(&priv->pub, in) || + !CBS_copy_bytes(in, priv->pub.public_key_hash, + sizeof(priv->pub.public_key_hash)) || + !CBS_copy_bytes(in, priv->fo_failure_secret, + sizeof(priv->fo_failure_secret)) || + CBS_len(in) != 0) { + return 0; + } + return 1; +} diff --git a/third_party/boringssl/src/crypto/lhash/internal.h b/third_party/boringssl/src/crypto/lhash/internal.h index 512f06df..27c77806 100644 
--- a/third_party/boringssl/src/crypto/lhash/internal.h +++ b/third_party/boringssl/src/crypto/lhash/internal.h @@ -1,76 +1,31 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. 
All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. -#ifndef OPENSSL_HEADER_LHASH_INTERNAL_H -#define OPENSSL_HEADER_LHASH_INTERNAL_H +#ifndef OPENSSL_HEADER_CRYPTO_LHASH_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_LHASH_INTERNAL_H -#include +#include -#if defined(__cplusplus) -extern "C" { -#endif +BSSL_NAMESPACE_BEGIN // lhash is a traditional, chaining hash table that automatically expands and // contracts as needed. One should not use the lh_* functions directly, rather // use the type-safe macro wrappers: // // A hash table of a specific type of object has type |LHASH_OF(type)|. This -// can be defined (once) with |DEFINE_LHASH_OF(type)| and declared where needed -// with |DECLARE_LHASH_OF(type)|. For example: +// can be defined (once) with |DEFINE_LHASH_OF(type)|. // // struct foo { // int bar; @@ -82,7 +37,12 @@ extern "C" { // // A macro will be defined for each of the |OPENSSL_lh_*| functions below. For // |LHASH_OF(foo)|, the macros would be |lh_foo_new|, |lh_foo_num_items| etc. +// +// TODO(davidben): Now that this type is completely internal, this can just be a +// C++ template without any macros. 
+ +#define LHASH_OF(type) struct bssl::type##_lhash_st // lhash_cmp_func is a comparison function that returns a value equal, or not // equal, to zero depending on whether |*a| is equal, or not equal to |*b|, @@ -105,7 +65,7 @@ typedef int (*lhash_cmp_func_helper)(lhash_cmp_func func, const void *a, typedef uint32_t (*lhash_hash_func)(const void *a); typedef uint32_t (*lhash_hash_func_helper)(lhash_hash_func func, const void *a); -typedef struct lhash_st _LHASH; +struct _LHASH; // OPENSSL_lh_new returns a new, empty hash table or NULL on error. OPENSSL_EXPORT _LHASH *OPENSSL_lh_new(lhash_hash_func hash, @@ -156,6 +116,8 @@ OPENSSL_EXPORT void OPENSSL_lh_doall_arg(_LHASH *lh, void (*func)(void *, void *), void *arg); +// DEFINE_LHASH_OF creates (inline) definitions of hash table. It must be used +// from within the bssl namespace. #define DEFINE_LHASH_OF(type) \ /* We disable MSVC C4191 in this macro, which warns when pointers are cast \ * to the wrong type. While the cast itself is valid, it is often a bug \ @@ -167,37 +129,37 @@ OPENSSL_EXPORT void OPENSSL_lh_doall_arg(_LHASH *lh, OPENSSL_MSVC_PRAGMA(warning(push)) \ OPENSSL_MSVC_PRAGMA(warning(disable : 4191)) \ \ - DECLARE_LHASH_OF(type) \ + struct type##_lhash_st; \ \ typedef int (*lhash_##type##_cmp_func)(const type *, const type *); \ typedef uint32_t (*lhash_##type##_hash_func)(const type *); \ \ - OPENSSL_INLINE int lh_##type##_call_cmp_func(lhash_cmp_func func, \ - const void *a, const void *b) { \ + inline int lh_##type##_call_cmp_func(lhash_cmp_func func, const void *a, \ + const void *b) { \ return ((lhash_##type##_cmp_func)func)((const type *)a, (const type *)b); \ } \ \ - OPENSSL_INLINE uint32_t lh_##type##_call_hash_func(lhash_hash_func func, \ - const void *a) { \ + inline uint32_t lh_##type##_call_hash_func(lhash_hash_func func, \ + const void *a) { \ return ((lhash_##type##_hash_func)func)((const type *)a); \ } \ \ - OPENSSL_INLINE LHASH_OF(type) *lh_##type##_new( \ - lhash_##type##_hash_func 
hash, lhash_##type##_cmp_func comp) { \ + inline LHASH_OF(type) *lh_##type##_new(lhash_##type##_hash_func hash, \ + lhash_##type##_cmp_func comp) { \ return (LHASH_OF(type) *)OPENSSL_lh_new((lhash_hash_func)hash, \ (lhash_cmp_func)comp); \ } \ \ - OPENSSL_INLINE void lh_##type##_free(LHASH_OF(type) *lh) { \ + inline void lh_##type##_free(LHASH_OF(type) *lh) { \ OPENSSL_lh_free((_LHASH *)lh); \ } \ \ - OPENSSL_INLINE size_t lh_##type##_num_items(const LHASH_OF(type) *lh) { \ + inline size_t lh_##type##_num_items(const LHASH_OF(type) *lh) { \ return OPENSSL_lh_num_items((const _LHASH *)lh); \ } \ \ - OPENSSL_INLINE type *lh_##type##_retrieve(const LHASH_OF(type) *lh, \ - const type *data) { \ + inline type *lh_##type##_retrieve(const LHASH_OF(type) *lh, \ + const type *data) { \ return (type *)OPENSSL_lh_retrieve((const _LHASH *)lh, data, \ lh_##type##_call_hash_func, \ lh_##type##_call_cmp_func); \ @@ -208,13 +170,12 @@ OPENSSL_EXPORT void OPENSSL_lh_doall_arg(_LHASH *lh, const void *key; \ } LHASH_CMP_KEY_##type; \ \ - OPENSSL_INLINE int lh_##type##_call_cmp_key(const void *key, \ - const void *value) { \ + inline int lh_##type##_call_cmp_key(const void *key, const void *value) { \ const LHASH_CMP_KEY_##type *cb = (const LHASH_CMP_KEY_##type *)key; \ return cb->cmp_key(cb->key, (const type *)value); \ } \ \ - OPENSSL_INLINE type *lh_##type##_retrieve_key( \ + inline type *lh_##type##_retrieve_key( \ const LHASH_OF(type) *lh, const void *key, uint32_t key_hash, \ int (*cmp_key)(const void *key, const type *value)) { \ LHASH_CMP_KEY_##type cb = {cmp_key, key}; \ @@ -222,9 +183,9 @@ OPENSSL_EXPORT void OPENSSL_lh_doall_arg(_LHASH *lh, lh_##type##_call_cmp_key); \ } \ \ - OPENSSL_INLINE int lh_##type##_insert(LHASH_OF(type) *lh, type **old_data, \ - type *data) { \ - void *old_data_void = NULL; \ + inline int lh_##type##_insert(LHASH_OF(type) *lh, type **old_data, \ + type *data) { \ + void *old_data_void = nullptr; \ int ret = OPENSSL_lh_insert((_LHASH *)lh, 
&old_data_void, data, \ lh_##type##_call_hash_func, \ lh_##type##_call_cmp_func); \ @@ -232,8 +193,7 @@ OPENSSL_EXPORT void OPENSSL_lh_doall_arg(_LHASH *lh, return ret; \ } \ \ - OPENSSL_INLINE type *lh_##type##_delete(LHASH_OF(type) *lh, \ - const type *data) { \ + inline type *lh_##type##_delete(LHASH_OF(type) *lh, const type *data) { \ return (type *)OPENSSL_lh_delete((_LHASH *)lh, data, \ lh_##type##_call_hash_func, \ lh_##type##_call_cmp_func); \ @@ -244,22 +204,19 @@ OPENSSL_EXPORT void OPENSSL_lh_doall_arg(_LHASH *lh, void *arg; \ } LHASH_DOALL_##type; \ \ - OPENSSL_INLINE void lh_##type##_call_doall_arg(void *value, void *arg) { \ + inline void lh_##type##_call_doall_arg(void *value, void *arg) { \ const LHASH_DOALL_##type *cb = (const LHASH_DOALL_##type *)arg; \ cb->doall_arg((type *)value, cb->arg); \ } \ \ - OPENSSL_INLINE void lh_##type##_doall_arg( \ - LHASH_OF(type) *lh, void (*func)(type *, void *), void *arg) { \ + inline void lh_##type##_doall_arg(LHASH_OF(type) *lh, \ + void (*func)(type *, void *), void *arg) { \ LHASH_DOALL_##type cb = {func, arg}; \ OPENSSL_lh_doall_arg((_LHASH *)lh, lh_##type##_call_doall_arg, &cb); \ } \ \ OPENSSL_MSVC_PRAGMA(warning(pop)) +BSSL_NAMESPACE_END -#if defined(__cplusplus) -} // extern C -#endif - -#endif // OPENSSL_HEADER_LHASH_INTERNAL_H +#endif // OPENSSL_HEADER_CRYPTO_LHASH_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/lhash/lhash.c b/third_party/boringssl/src/crypto/lhash/lhash.c deleted file mode 100644 index 4a95a2e6..00000000 --- a/third_party/boringssl/src/crypto/lhash/lhash.c +++ /dev/null @@ -1,353 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. 
The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. 
If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include -#include -#include - -#include - -#include "internal.h" -#include "../internal.h" - - -// kMinNumBuckets is the minimum size of the buckets array in an |_LHASH|. -static const size_t kMinNumBuckets = 16; - -// kMaxAverageChainLength contains the maximum, average chain length. When the -// average chain length exceeds this value, the hash table will be resized. -static const size_t kMaxAverageChainLength = 2; -static const size_t kMinAverageChainLength = 1; - -// lhash_item_st is an element of a hash chain. It points to the opaque data -// for this element and to the next item in the chain. The linked-list is NULL -// terminated. 
-typedef struct lhash_item_st { - void *data; - struct lhash_item_st *next; - // hash contains the cached, hash value of |data|. - uint32_t hash; -} LHASH_ITEM; - -struct lhash_st { - // num_items contains the total number of items in the hash table. - size_t num_items; - // buckets is an array of |num_buckets| pointers. Each points to the head of - // a chain of LHASH_ITEM objects that have the same hash value, mod - // |num_buckets|. - LHASH_ITEM **buckets; - // num_buckets contains the length of |buckets|. This value is always >= - // kMinNumBuckets. - size_t num_buckets; - // callback_depth contains the current depth of |lh_doall| or |lh_doall_arg| - // calls. If non-zero then this suppresses resizing of the |buckets| array, - // which would otherwise disrupt the iteration. - unsigned callback_depth; - - lhash_cmp_func comp; - lhash_hash_func hash; -}; - -_LHASH *OPENSSL_lh_new(lhash_hash_func hash, lhash_cmp_func comp) { - _LHASH *ret = OPENSSL_malloc(sizeof(_LHASH)); - if (ret == NULL) { - return NULL; - } - OPENSSL_memset(ret, 0, sizeof(_LHASH)); - - ret->num_buckets = kMinNumBuckets; - ret->buckets = OPENSSL_malloc(sizeof(LHASH_ITEM *) * ret->num_buckets); - if (ret->buckets == NULL) { - OPENSSL_free(ret); - return NULL; - } - OPENSSL_memset(ret->buckets, 0, sizeof(LHASH_ITEM *) * ret->num_buckets); - - ret->comp = comp; - ret->hash = hash; - return ret; -} - -void OPENSSL_lh_free(_LHASH *lh) { - if (lh == NULL) { - return; - } - - for (size_t i = 0; i < lh->num_buckets; i++) { - LHASH_ITEM *next; - for (LHASH_ITEM *n = lh->buckets[i]; n != NULL; n = next) { - next = n->next; - OPENSSL_free(n); - } - } - - OPENSSL_free(lh->buckets); - OPENSSL_free(lh); -} - -size_t OPENSSL_lh_num_items(const _LHASH *lh) { return lh->num_items; } - -// get_next_ptr_and_hash returns a pointer to the pointer that points to the -// item equal to |data|. 
In other words, it searches for an item equal to |data| -// and, if it's at the start of a chain, then it returns a pointer to an -// element of |lh->buckets|, otherwise it returns a pointer to the |next| -// element of the previous item in the chain. If an element equal to |data| is -// not found, it returns a pointer that points to a NULL pointer. If |out_hash| -// is not NULL, then it also puts the hash value of |data| in |*out_hash|. -static LHASH_ITEM **get_next_ptr_and_hash(const _LHASH *lh, uint32_t *out_hash, - const void *data, - lhash_hash_func_helper call_hash_func, - lhash_cmp_func_helper call_cmp_func) { - const uint32_t hash = call_hash_func(lh->hash, data); - if (out_hash != NULL) { - *out_hash = hash; - } - - LHASH_ITEM **ret = &lh->buckets[hash % lh->num_buckets]; - for (LHASH_ITEM *cur = *ret; cur != NULL; cur = *ret) { - if (call_cmp_func(lh->comp, cur->data, data) == 0) { - break; - } - ret = &cur->next; - } - - return ret; -} - -// get_next_ptr_by_key behaves like |get_next_ptr_and_hash| but takes a key -// which may be a different type from the values stored in |lh|. -static LHASH_ITEM **get_next_ptr_by_key(const _LHASH *lh, const void *key, - uint32_t key_hash, - int (*cmp_key)(const void *key, - const void *value)) { - LHASH_ITEM **ret = &lh->buckets[key_hash % lh->num_buckets]; - for (LHASH_ITEM *cur = *ret; cur != NULL; cur = *ret) { - if (cmp_key(key, cur->data) == 0) { - break; - } - ret = &cur->next; - } - - return ret; -} - -void *OPENSSL_lh_retrieve(const _LHASH *lh, const void *data, - lhash_hash_func_helper call_hash_func, - lhash_cmp_func_helper call_cmp_func) { - LHASH_ITEM **next_ptr = - get_next_ptr_and_hash(lh, NULL, data, call_hash_func, call_cmp_func); - return *next_ptr == NULL ? 
NULL : (*next_ptr)->data; -} - -void *OPENSSL_lh_retrieve_key(const _LHASH *lh, const void *key, - uint32_t key_hash, - int (*cmp_key)(const void *key, - const void *value)) { - LHASH_ITEM **next_ptr = get_next_ptr_by_key(lh, key, key_hash, cmp_key); - return *next_ptr == NULL ? NULL : (*next_ptr)->data; -} - -// lh_rebucket allocates a new array of |new_num_buckets| pointers and -// redistributes the existing items into it before making it |lh->buckets| and -// freeing the old array. -static void lh_rebucket(_LHASH *lh, const size_t new_num_buckets) { - LHASH_ITEM **new_buckets, *cur, *next; - size_t i, alloc_size; - - alloc_size = sizeof(LHASH_ITEM *) * new_num_buckets; - if (alloc_size / sizeof(LHASH_ITEM*) != new_num_buckets) { - return; - } - - new_buckets = OPENSSL_malloc(alloc_size); - if (new_buckets == NULL) { - return; - } - OPENSSL_memset(new_buckets, 0, alloc_size); - - for (i = 0; i < lh->num_buckets; i++) { - for (cur = lh->buckets[i]; cur != NULL; cur = next) { - const size_t new_bucket = cur->hash % new_num_buckets; - next = cur->next; - cur->next = new_buckets[new_bucket]; - new_buckets[new_bucket] = cur; - } - } - - OPENSSL_free(lh->buckets); - - lh->num_buckets = new_num_buckets; - lh->buckets = new_buckets; -} - -// lh_maybe_resize resizes the |buckets| array if needed. -static void lh_maybe_resize(_LHASH *lh) { - size_t avg_chain_length; - - if (lh->callback_depth > 0) { - // Don't resize the hash if we are currently iterating over it. 
- return; - } - - assert(lh->num_buckets >= kMinNumBuckets); - avg_chain_length = lh->num_items / lh->num_buckets; - - if (avg_chain_length > kMaxAverageChainLength) { - const size_t new_num_buckets = lh->num_buckets * 2; - - if (new_num_buckets > lh->num_buckets) { - lh_rebucket(lh, new_num_buckets); - } - } else if (avg_chain_length < kMinAverageChainLength && - lh->num_buckets > kMinNumBuckets) { - size_t new_num_buckets = lh->num_buckets / 2; - - if (new_num_buckets < kMinNumBuckets) { - new_num_buckets = kMinNumBuckets; - } - - lh_rebucket(lh, new_num_buckets); - } -} - -int OPENSSL_lh_insert(_LHASH *lh, void **old_data, void *data, - lhash_hash_func_helper call_hash_func, - lhash_cmp_func_helper call_cmp_func) { - uint32_t hash; - LHASH_ITEM **next_ptr, *item; - - *old_data = NULL; - next_ptr = - get_next_ptr_and_hash(lh, &hash, data, call_hash_func, call_cmp_func); - - - if (*next_ptr != NULL) { - // An element equal to |data| already exists in the hash table. It will be - // replaced. - *old_data = (*next_ptr)->data; - (*next_ptr)->data = data; - return 1; - } - - // An element equal to |data| doesn't exist in the hash table yet. - item = OPENSSL_malloc(sizeof(LHASH_ITEM)); - if (item == NULL) { - return 0; - } - - item->data = data; - item->hash = hash; - item->next = NULL; - *next_ptr = item; - lh->num_items++; - lh_maybe_resize(lh); - - return 1; -} - -void *OPENSSL_lh_delete(_LHASH *lh, const void *data, - lhash_hash_func_helper call_hash_func, - lhash_cmp_func_helper call_cmp_func) { - LHASH_ITEM **next_ptr, *item, *ret; - - next_ptr = - get_next_ptr_and_hash(lh, NULL, data, call_hash_func, call_cmp_func); - - if (*next_ptr == NULL) { - // No such element. 
- return NULL; - } - - item = *next_ptr; - *next_ptr = item->next; - ret = item->data; - OPENSSL_free(item); - - lh->num_items--; - lh_maybe_resize(lh); - - return ret; -} - -void OPENSSL_lh_doall_arg(_LHASH *lh, void (*func)(void *, void *), void *arg) { - if (lh == NULL) { - return; - } - - if (lh->callback_depth < UINT_MAX) { - // |callback_depth| is a saturating counter. - lh->callback_depth++; - } - - for (size_t i = 0; i < lh->num_buckets; i++) { - LHASH_ITEM *next; - for (LHASH_ITEM *cur = lh->buckets[i]; cur != NULL; cur = next) { - next = cur->next; - func(cur->data, arg); - } - } - - if (lh->callback_depth < UINT_MAX) { - lh->callback_depth--; - } - - // The callback may have added or removed elements and the non-zero value of - // |callback_depth| will have suppressed any resizing. Thus any needed - // resizing is done here. - lh_maybe_resize(lh); -} diff --git a/third_party/boringssl/src/crypto/lhash/lhash.cc b/third_party/boringssl/src/crypto/lhash/lhash.cc new file mode 100644 index 00000000..b074bb86 --- /dev/null +++ b/third_party/boringssl/src/crypto/lhash/lhash.cc @@ -0,0 +1,312 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include + +#include + +#include "../internal.h" +#include "../mem_internal.h" +#include "internal.h" + + +BSSL_NAMESPACE_BEGIN + +// kMinNumBuckets is the minimum size of the buckets array in an |_LHASH|. 
+static const size_t kMinNumBuckets = 16; + +// kMaxAverageChainLength contains the maximum, average chain length. When the +// average chain length exceeds this value, the hash table will be resized. +static const size_t kMaxAverageChainLength = 2; +static const size_t kMinAverageChainLength = 1; + +// LHASH_ITEM is an element of a hash chain. It points to the opaque data +// for this element and to the next item in the chain. The linked-list is NULL +// terminated. +struct LHASH_ITEM { + void *data = nullptr; + LHASH_ITEM *next = nullptr; + // hash contains the cached, hash value of |data|. + uint32_t hash = 0; +}; + +struct _LHASH { + // num_items contains the total number of items in the hash table. + size_t num_items = 0; + // buckets is an array of |num_buckets| pointers. Each points to the head of + // a chain of LHASH_ITEM objects that have the same hash value, mod + // |num_buckets|. + LHASH_ITEM **buckets = nullptr; + // num_buckets contains the length of |buckets|. This value is always >= + // kMinNumBuckets. + size_t num_buckets = 0; + // callback_depth contains the current depth of |lh_doall| or |lh_doall_arg| + // calls. If non-zero then this suppresses resizing of the |buckets| array, + // which would otherwise disrupt the iteration. 
+ unsigned callback_depth = 0; + + lhash_cmp_func comp = nullptr; + lhash_hash_func hash = nullptr; +}; + +_LHASH *OPENSSL_lh_new(lhash_hash_func hash, lhash_cmp_func comp) { + _LHASH *ret = New<_LHASH>(); + if (ret == nullptr) { + return nullptr; + } + + ret->num_buckets = kMinNumBuckets; + ret->buckets = reinterpret_cast( + OPENSSL_calloc(ret->num_buckets, sizeof(LHASH_ITEM *))); + if (ret->buckets == nullptr) { + Delete(ret); + return nullptr; + } + + ret->comp = comp; + ret->hash = hash; + return ret; +} + +void OPENSSL_lh_free(_LHASH *lh) { + if (lh == nullptr) { + return; + } + + for (size_t i = 0; i < lh->num_buckets; i++) { + LHASH_ITEM *next; + for (LHASH_ITEM *n = lh->buckets[i]; n != nullptr; n = next) { + next = n->next; + Delete(n); + } + } + + OPENSSL_free(lh->buckets); + Delete(lh); +} + +size_t OPENSSL_lh_num_items(const _LHASH *lh) { return lh->num_items; } + +// get_next_ptr_and_hash returns a pointer to the pointer that points to the +// item equal to |data|. In other words, it searches for an item equal to |data| +// and, if it's at the start of a chain, then it returns a pointer to an +// element of |lh->buckets|, otherwise it returns a pointer to the |next| +// element of the previous item in the chain. If an element equal to |data| is +// not found, it returns a pointer that points to a NULL pointer. If |out_hash| +// is not NULL, then it also puts the hash value of |data| in |*out_hash|. 
+static LHASH_ITEM **get_next_ptr_and_hash(const _LHASH *lh, uint32_t *out_hash, + const void *data, + lhash_hash_func_helper call_hash_func, + lhash_cmp_func_helper call_cmp_func) { + const uint32_t hash = call_hash_func(lh->hash, data); + if (out_hash != nullptr) { + *out_hash = hash; + } + + LHASH_ITEM **ret = &lh->buckets[hash % lh->num_buckets]; + for (LHASH_ITEM *cur = *ret; cur != nullptr; cur = *ret) { + if (call_cmp_func(lh->comp, cur->data, data) == 0) { + break; + } + ret = &cur->next; + } + + return ret; +} + +// get_next_ptr_by_key behaves like |get_next_ptr_and_hash| but takes a key +// which may be a different type from the values stored in |lh|. +static LHASH_ITEM **get_next_ptr_by_key(const _LHASH *lh, const void *key, + uint32_t key_hash, + int (*cmp_key)(const void *key, + const void *value)) { + LHASH_ITEM **ret = &lh->buckets[key_hash % lh->num_buckets]; + for (LHASH_ITEM *cur = *ret; cur != nullptr; cur = *ret) { + if (cmp_key(key, cur->data) == 0) { + break; + } + ret = &cur->next; + } + + return ret; +} + +void *OPENSSL_lh_retrieve(const _LHASH *lh, const void *data, + lhash_hash_func_helper call_hash_func, + lhash_cmp_func_helper call_cmp_func) { + LHASH_ITEM **next_ptr = + get_next_ptr_and_hash(lh, nullptr, data, call_hash_func, call_cmp_func); + return *next_ptr == nullptr ? nullptr : (*next_ptr)->data; +} + +void *OPENSSL_lh_retrieve_key(const _LHASH *lh, const void *key, + uint32_t key_hash, + int (*cmp_key)(const void *key, + const void *value)) { + LHASH_ITEM **next_ptr = get_next_ptr_by_key(lh, key, key_hash, cmp_key); + return *next_ptr == nullptr ? nullptr : (*next_ptr)->data; +} + +// lh_rebucket allocates a new array of |new_num_buckets| pointers and +// redistributes the existing items into it before making it |lh->buckets| and +// freeing the old array. 
+static void lh_rebucket(_LHASH *lh, const size_t new_num_buckets) { + LHASH_ITEM **new_buckets, *cur, *next; + size_t i, alloc_size; + + alloc_size = sizeof(LHASH_ITEM *) * new_num_buckets; + if (alloc_size / sizeof(LHASH_ITEM *) != new_num_buckets) { + return; + } + + new_buckets = reinterpret_cast(OPENSSL_zalloc(alloc_size)); + if (new_buckets == nullptr) { + return; + } + + for (i = 0; i < lh->num_buckets; i++) { + for (cur = lh->buckets[i]; cur != nullptr; cur = next) { + const size_t new_bucket = cur->hash % new_num_buckets; + next = cur->next; + cur->next = new_buckets[new_bucket]; + new_buckets[new_bucket] = cur; + } + } + + OPENSSL_free(lh->buckets); + + lh->num_buckets = new_num_buckets; + lh->buckets = new_buckets; +} + +// lh_maybe_resize resizes the |buckets| array if needed. +static void lh_maybe_resize(_LHASH *lh) { + size_t avg_chain_length; + + if (lh->callback_depth > 0) { + // Don't resize the hash if we are currently iterating over it. + return; + } + + assert(lh->num_buckets >= kMinNumBuckets); + avg_chain_length = lh->num_items / lh->num_buckets; + + if (avg_chain_length > kMaxAverageChainLength) { + const size_t new_num_buckets = lh->num_buckets * 2; + + if (new_num_buckets > lh->num_buckets) { + lh_rebucket(lh, new_num_buckets); + } + } else if (avg_chain_length < kMinAverageChainLength && + lh->num_buckets > kMinNumBuckets) { + size_t new_num_buckets = lh->num_buckets / 2; + + if (new_num_buckets < kMinNumBuckets) { + new_num_buckets = kMinNumBuckets; + } + + lh_rebucket(lh, new_num_buckets); + } +} + +int OPENSSL_lh_insert(_LHASH *lh, void **old_data, void *data, + lhash_hash_func_helper call_hash_func, + lhash_cmp_func_helper call_cmp_func) { + uint32_t hash; + LHASH_ITEM **next_ptr, *item; + + *old_data = nullptr; + next_ptr = + get_next_ptr_and_hash(lh, &hash, data, call_hash_func, call_cmp_func); + + + if (*next_ptr != nullptr) { + // An element equal to |data| already exists in the hash table. It will be + // replaced. 
+ *old_data = (*next_ptr)->data; + (*next_ptr)->data = data; + return 1; + } + + // An element equal to |data| doesn't exist in the hash table yet. + item = New(); + if (item == nullptr) { + return 0; + } + + item->data = data; + item->hash = hash; + item->next = nullptr; + *next_ptr = item; + lh->num_items++; + lh_maybe_resize(lh); + + return 1; +} + +void *OPENSSL_lh_delete(_LHASH *lh, const void *data, + lhash_hash_func_helper call_hash_func, + lhash_cmp_func_helper call_cmp_func) { + LHASH_ITEM **next_ptr, *item, *ret; + + next_ptr = + get_next_ptr_and_hash(lh, nullptr, data, call_hash_func, call_cmp_func); + + if (*next_ptr == nullptr) { + // No such element. + return nullptr; + } + + item = *next_ptr; + *next_ptr = item->next; + ret = reinterpret_cast(item->data); + Delete(item); + + lh->num_items--; + lh_maybe_resize(lh); + + return ret; +} + +void OPENSSL_lh_doall_arg(_LHASH *lh, void (*func)(void *, void *), void *arg) { + if (lh == nullptr) { + return; + } + + if (lh->callback_depth < UINT_MAX) { + // |callback_depth| is a saturating counter. + lh->callback_depth++; + } + + for (size_t i = 0; i < lh->num_buckets; i++) { + LHASH_ITEM *next; + for (LHASH_ITEM *cur = lh->buckets[i]; cur != nullptr; cur = next) { + next = cur->next; + func(cur->data, arg); + } + } + + if (lh->callback_depth < UINT_MAX) { + lh->callback_depth--; + } + + // The callback may have added or removed elements and the non-zero value of + // |callback_depth| will have suppressed any resizing. Thus any needed + // resizing is done here. + lh_maybe_resize(lh); +} + +BSSL_NAMESPACE_END diff --git a/third_party/boringssl/src/crypto/md4/md4.cc b/third_party/boringssl/src/crypto/md4/md4.cc new file mode 100644 index 00000000..d72c46b0 --- /dev/null +++ b/third_party/boringssl/src/crypto/md4/md4.cc @@ -0,0 +1,207 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + +#include + +#include "../fipsmodule/digest/md32_common.h" +#include "../internal.h" + + +using namespace bssl; + +uint8_t *MD4(const uint8_t *data, size_t len, uint8_t out[MD4_DIGEST_LENGTH]) { + MD4_CTX ctx; + MD4_Init(&ctx); + MD4_Update(&ctx, data, len); + MD4_Final(out, &ctx); + + return out; +} + +// Implemented from RFC 1186 The MD4 Message-Digest Algorithm. + +int MD4_Init(MD4_CTX *md4) { + OPENSSL_memset(md4, 0, sizeof(MD4_CTX)); + md4->h[0] = 0x67452301UL; + md4->h[1] = 0xefcdab89UL; + md4->h[2] = 0x98badcfeUL; + md4->h[3] = 0x10325476UL; + return 1; +} + +static void md4_block_data_order(uint32_t *state, const uint8_t *data, + size_t num); + +void MD4_Transform(MD4_CTX *c, const uint8_t data[MD4_CBLOCK]) { + md4_block_data_order(c->h, data, 1); +} + +namespace { +struct MD4Traits { + using HashContext = MD4_CTX; + static constexpr size_t kBlockSize = MD4_CBLOCK; + static constexpr bool kLengthIsBigEndian = false; + static void HashBlocks(uint32_t *state, const uint8_t *data, + size_t num_blocks) { + md4_block_data_order(state, data, num_blocks); + } +}; +} // namespace + +int MD4_Update(MD4_CTX *c, const void *data, size_t len) { + crypto_md32_update(c, + Span(static_cast(data), len)); + return 1; +} + +int MD4_Final(uint8_t out[MD4_DIGEST_LENGTH], MD4_CTX *c) { + crypto_md32_final(c); + CRYPTO_store_u32_le(out, c->h[0]); + CRYPTO_store_u32_le(out + 4, c->h[1]); 
+ CRYPTO_store_u32_le(out + 8, c->h[2]); + CRYPTO_store_u32_le(out + 12, c->h[3]); + return 1; +} + +// As pointed out by Wei Dai , the above can be +// simplified to the code below. Wei attributes these optimizations +// to Peter Gutmann's SHS code, and he attributes it to Rich Schroeppel. +#define F(b, c, d) ((((c) ^ (d)) & (b)) ^ (d)) +#define G(b, c, d) (((b) & (c)) | ((b) & (d)) | ((c) & (d))) +#define H(b, c, d) ((b) ^ (c) ^ (d)) + +#define R0(a, b, c, d, k, s, t) \ + do { \ + (a) += ((k) + (t) + F((b), (c), (d))); \ + (a) = CRYPTO_rotl_u32(a, s); \ + } while (0) + +#define R1(a, b, c, d, k, s, t) \ + do { \ + (a) += ((k) + (t) + G((b), (c), (d))); \ + (a) = CRYPTO_rotl_u32(a, s); \ + } while (0) + +#define R2(a, b, c, d, k, s, t) \ + do { \ + (a) += ((k) + (t) + H((b), (c), (d))); \ + (a) = CRYPTO_rotl_u32(a, s); \ + } while (0) + +static void md4_block_data_order(uint32_t *state, const uint8_t *data, + size_t num) { + uint32_t A, B, C, D; + uint32_t X0, X1, X2, X3, X4, X5, X6, X7, X8, X9, X10, X11, X12, X13, X14, X15; + + A = state[0]; + B = state[1]; + C = state[2]; + D = state[3]; + + for (; num--;) { + X0 = CRYPTO_load_u32_le(data); + data += 4; + X1 = CRYPTO_load_u32_le(data); + data += 4; + // Round 0 + R0(A, B, C, D, X0, 3, 0); + X2 = CRYPTO_load_u32_le(data); + data += 4; + R0(D, A, B, C, X1, 7, 0); + X3 = CRYPTO_load_u32_le(data); + data += 4; + R0(C, D, A, B, X2, 11, 0); + X4 = CRYPTO_load_u32_le(data); + data += 4; + R0(B, C, D, A, X3, 19, 0); + X5 = CRYPTO_load_u32_le(data); + data += 4; + R0(A, B, C, D, X4, 3, 0); + X6 = CRYPTO_load_u32_le(data); + data += 4; + R0(D, A, B, C, X5, 7, 0); + X7 = CRYPTO_load_u32_le(data); + data += 4; + R0(C, D, A, B, X6, 11, 0); + X8 = CRYPTO_load_u32_le(data); + data += 4; + R0(B, C, D, A, X7, 19, 0); + X9 = CRYPTO_load_u32_le(data); + data += 4; + R0(A, B, C, D, X8, 3, 0); + X10 = CRYPTO_load_u32_le(data); + data += 4; + R0(D, A, B, C, X9, 7, 0); + X11 = CRYPTO_load_u32_le(data); + data += 4; + R0(C, D, A, B, 
X10, 11, 0); + X12 = CRYPTO_load_u32_le(data); + data += 4; + R0(B, C, D, A, X11, 19, 0); + X13 = CRYPTO_load_u32_le(data); + data += 4; + R0(A, B, C, D, X12, 3, 0); + X14 = CRYPTO_load_u32_le(data); + data += 4; + R0(D, A, B, C, X13, 7, 0); + X15 = CRYPTO_load_u32_le(data); + data += 4; + R0(C, D, A, B, X14, 11, 0); + R0(B, C, D, A, X15, 19, 0); + // Round 1 + R1(A, B, C, D, X0, 3, 0x5A827999L); + R1(D, A, B, C, X4, 5, 0x5A827999L); + R1(C, D, A, B, X8, 9, 0x5A827999L); + R1(B, C, D, A, X12, 13, 0x5A827999L); + R1(A, B, C, D, X1, 3, 0x5A827999L); + R1(D, A, B, C, X5, 5, 0x5A827999L); + R1(C, D, A, B, X9, 9, 0x5A827999L); + R1(B, C, D, A, X13, 13, 0x5A827999L); + R1(A, B, C, D, X2, 3, 0x5A827999L); + R1(D, A, B, C, X6, 5, 0x5A827999L); + R1(C, D, A, B, X10, 9, 0x5A827999L); + R1(B, C, D, A, X14, 13, 0x5A827999L); + R1(A, B, C, D, X3, 3, 0x5A827999L); + R1(D, A, B, C, X7, 5, 0x5A827999L); + R1(C, D, A, B, X11, 9, 0x5A827999L); + R1(B, C, D, A, X15, 13, 0x5A827999L); + // Round 2 + R2(A, B, C, D, X0, 3, 0x6ED9EBA1L); + R2(D, A, B, C, X8, 9, 0x6ED9EBA1L); + R2(C, D, A, B, X4, 11, 0x6ED9EBA1L); + R2(B, C, D, A, X12, 15, 0x6ED9EBA1L); + R2(A, B, C, D, X2, 3, 0x6ED9EBA1L); + R2(D, A, B, C, X10, 9, 0x6ED9EBA1L); + R2(C, D, A, B, X6, 11, 0x6ED9EBA1L); + R2(B, C, D, A, X14, 15, 0x6ED9EBA1L); + R2(A, B, C, D, X1, 3, 0x6ED9EBA1L); + R2(D, A, B, C, X9, 9, 0x6ED9EBA1L); + R2(C, D, A, B, X5, 11, 0x6ED9EBA1L); + R2(B, C, D, A, X13, 15, 0x6ED9EBA1L); + R2(A, B, C, D, X3, 3, 0x6ED9EBA1L); + R2(D, A, B, C, X11, 9, 0x6ED9EBA1L); + R2(C, D, A, B, X7, 11, 0x6ED9EBA1L); + R2(B, C, D, A, X15, 15, 0x6ED9EBA1L); + + A = state[0] += A; + B = state[1] += B; + C = state[2] += C; + D = state[3] += D; + } +} diff --git a/third_party/boringssl/src/crypto/md5/internal.h b/third_party/boringssl/src/crypto/md5/internal.h new file mode 100644 index 00000000..428b5a69 --- /dev/null +++ b/third_party/boringssl/src/crypto/md5/internal.h @@ -0,0 +1,34 @@ +// Copyright 2018 The BoringSSL Authors +// +// 
Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_CRYPTO_MD5_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_MD5_INTERNAL_H + +#include + +#include "../internal.h" + + +BSSL_NAMESPACE_BEGIN + +#if !defined(OPENSSL_NO_ASM) && \ + (defined(OPENSSL_X86_64) || defined(OPENSSL_X86)) +#define MD5_ASM +extern "C" void md5_block_asm_data_order(uint32_t *state, const uint8_t *data, + size_t num); +#endif + +BSSL_NAMESPACE_END + +#endif // OPENSSL_HEADER_CRYPTO_MD5_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/md5/md5.cc b/third_party/boringssl/src/crypto/md5/md5.cc new file mode 100644 index 00000000..0604ddc0 --- /dev/null +++ b/third_party/boringssl/src/crypto/md5/md5.cc @@ -0,0 +1,255 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include + +#include +#include + +#include "../fipsmodule/digest/md32_common.h" +#include "../internal.h" +#include "internal.h" + + +using namespace bssl; + +uint8_t *MD5(const uint8_t *data, size_t len, uint8_t out[MD5_DIGEST_LENGTH]) { + MD5_CTX ctx; + MD5_Init(&ctx); + MD5_Update(&ctx, data, len); + MD5_Final(out, &ctx); + + return out; +} + +int MD5_Init(MD5_CTX *md5) { + OPENSSL_memset(md5, 0, sizeof(MD5_CTX)); + md5->h[0] = 0x67452301UL; + md5->h[1] = 0xefcdab89UL; + md5->h[2] = 0x98badcfeUL; + md5->h[3] = 0x10325476UL; + return 1; +} + +#if defined(MD5_ASM) +#define md5_block_data_order md5_block_asm_data_order +#else +static void md5_block_data_order(uint32_t *state, const uint8_t *data, + size_t num); +#endif + +void MD5_Transform(MD5_CTX *c, const uint8_t data[MD5_CBLOCK]) { + md5_block_data_order(c->h, data, 1); +} + +namespace { +struct MD5Traits { + using HashContext = MD5_CTX; + static constexpr size_t kBlockSize = MD5_CBLOCK; + static constexpr bool kLengthIsBigEndian = false; + static void HashBlocks(uint32_t *state, const uint8_t *data, + size_t num_blocks) { + md5_block_data_order(state, data, num_blocks); + } +}; +} // namespace + +int MD5_Update(MD5_CTX *c, const void *data, size_t len) { + crypto_md32_update(c, + Span(static_cast(data), len)); + return 1; +} + +int MD5_Final(uint8_t out[MD5_DIGEST_LENGTH], MD5_CTX *c) { + crypto_md32_final(c); + CRYPTO_store_u32_le(out, c->h[0]); + CRYPTO_store_u32_le(out + 4, c->h[1]); + CRYPTO_store_u32_le(out + 8, c->h[2]); + CRYPTO_store_u32_le(out + 12, c->h[3]); + return 1; +} + +// As pointed out by Wei Dai , the above can be +// simplified to the code below. Wei attributes these optimizations +// to Peter Gutmann's SHS code, and he attributes it to Rich Schroeppel. 
+#define F(b, c, d) ((((c) ^ (d)) & (b)) ^ (d)) +#define G(b, c, d) ((((b) ^ (c)) & (d)) ^ (c)) +#define H(b, c, d) ((b) ^ (c) ^ (d)) +#define I(b, c, d) (((~(d)) | (b)) ^ (c)) + +#define R0(a, b, c, d, k, s, t) \ + do { \ + (a) += ((k) + (t) + F((b), (c), (d))); \ + (a) = CRYPTO_rotl_u32(a, s); \ + (a) += (b); \ + } while (0) + +#define R1(a, b, c, d, k, s, t) \ + do { \ + (a) += ((k) + (t) + G((b), (c), (d))); \ + (a) = CRYPTO_rotl_u32(a, s); \ + (a) += (b); \ + } while (0) + +#define R2(a, b, c, d, k, s, t) \ + do { \ + (a) += ((k) + (t) + H((b), (c), (d))); \ + (a) = CRYPTO_rotl_u32(a, s); \ + (a) += (b); \ + } while (0) + +#define R3(a, b, c, d, k, s, t) \ + do { \ + (a) += ((k) + (t) + I((b), (c), (d))); \ + (a) = CRYPTO_rotl_u32(a, s); \ + (a) += (b); \ + } while (0) + +#ifndef MD5_ASM +#ifdef X +#undef X +#endif +static void md5_block_data_order(uint32_t *state, const uint8_t *data, + size_t num) { + uint32_t A, B, C, D; + uint32_t XX0, XX1, XX2, XX3, XX4, XX5, XX6, XX7, XX8, XX9, XX10, XX11, XX12, + XX13, XX14, XX15; +#define X(i) XX##i + + A = state[0]; + B = state[1]; + C = state[2]; + D = state[3]; + + for (; num--;) { + X(0) = CRYPTO_load_u32_le(data); + data += 4; + X(1) = CRYPTO_load_u32_le(data); + data += 4; + // Round 0 + R0(A, B, C, D, X(0), 7, 0xd76aa478L); + X(2) = CRYPTO_load_u32_le(data); + data += 4; + R0(D, A, B, C, X(1), 12, 0xe8c7b756L); + X(3) = CRYPTO_load_u32_le(data); + data += 4; + R0(C, D, A, B, X(2), 17, 0x242070dbL); + X(4) = CRYPTO_load_u32_le(data); + data += 4; + R0(B, C, D, A, X(3), 22, 0xc1bdceeeL); + X(5) = CRYPTO_load_u32_le(data); + data += 4; + R0(A, B, C, D, X(4), 7, 0xf57c0fafL); + X(6) = CRYPTO_load_u32_le(data); + data += 4; + R0(D, A, B, C, X(5), 12, 0x4787c62aL); + X(7) = CRYPTO_load_u32_le(data); + data += 4; + R0(C, D, A, B, X(6), 17, 0xa8304613L); + X(8) = CRYPTO_load_u32_le(data); + data += 4; + R0(B, C, D, A, X(7), 22, 0xfd469501L); + X(9) = CRYPTO_load_u32_le(data); + data += 4; + R0(A, B, C, D, X(8), 7, 
0x698098d8L); + X(10) = CRYPTO_load_u32_le(data); + data += 4; + R0(D, A, B, C, X(9), 12, 0x8b44f7afL); + X(11) = CRYPTO_load_u32_le(data); + data += 4; + R0(C, D, A, B, X(10), 17, 0xffff5bb1L); + X(12) = CRYPTO_load_u32_le(data); + data += 4; + R0(B, C, D, A, X(11), 22, 0x895cd7beL); + X(13) = CRYPTO_load_u32_le(data); + data += 4; + R0(A, B, C, D, X(12), 7, 0x6b901122L); + X(14) = CRYPTO_load_u32_le(data); + data += 4; + R0(D, A, B, C, X(13), 12, 0xfd987193L); + X(15) = CRYPTO_load_u32_le(data); + data += 4; + R0(C, D, A, B, X(14), 17, 0xa679438eL); + R0(B, C, D, A, X(15), 22, 0x49b40821L); + // Round 1 + R1(A, B, C, D, X(1), 5, 0xf61e2562L); + R1(D, A, B, C, X(6), 9, 0xc040b340L); + R1(C, D, A, B, X(11), 14, 0x265e5a51L); + R1(B, C, D, A, X(0), 20, 0xe9b6c7aaL); + R1(A, B, C, D, X(5), 5, 0xd62f105dL); + R1(D, A, B, C, X(10), 9, 0x02441453L); + R1(C, D, A, B, X(15), 14, 0xd8a1e681L); + R1(B, C, D, A, X(4), 20, 0xe7d3fbc8L); + R1(A, B, C, D, X(9), 5, 0x21e1cde6L); + R1(D, A, B, C, X(14), 9, 0xc33707d6L); + R1(C, D, A, B, X(3), 14, 0xf4d50d87L); + R1(B, C, D, A, X(8), 20, 0x455a14edL); + R1(A, B, C, D, X(13), 5, 0xa9e3e905L); + R1(D, A, B, C, X(2), 9, 0xfcefa3f8L); + R1(C, D, A, B, X(7), 14, 0x676f02d9L); + R1(B, C, D, A, X(12), 20, 0x8d2a4c8aL); + // Round 2 + R2(A, B, C, D, X(5), 4, 0xfffa3942L); + R2(D, A, B, C, X(8), 11, 0x8771f681L); + R2(C, D, A, B, X(11), 16, 0x6d9d6122L); + R2(B, C, D, A, X(14), 23, 0xfde5380cL); + R2(A, B, C, D, X(1), 4, 0xa4beea44L); + R2(D, A, B, C, X(4), 11, 0x4bdecfa9L); + R2(C, D, A, B, X(7), 16, 0xf6bb4b60L); + R2(B, C, D, A, X(10), 23, 0xbebfbc70L); + R2(A, B, C, D, X(13), 4, 0x289b7ec6L); + R2(D, A, B, C, X(0), 11, 0xeaa127faL); + R2(C, D, A, B, X(3), 16, 0xd4ef3085L); + R2(B, C, D, A, X(6), 23, 0x04881d05L); + R2(A, B, C, D, X(9), 4, 0xd9d4d039L); + R2(D, A, B, C, X(12), 11, 0xe6db99e5L); + R2(C, D, A, B, X(15), 16, 0x1fa27cf8L); + R2(B, C, D, A, X(2), 23, 0xc4ac5665L); + // Round 3 + R3(A, B, C, D, X(0), 6, 0xf4292244L); + R3(D, 
A, B, C, X(7), 10, 0x432aff97L); + R3(C, D, A, B, X(14), 15, 0xab9423a7L); + R3(B, C, D, A, X(5), 21, 0xfc93a039L); + R3(A, B, C, D, X(12), 6, 0x655b59c3L); + R3(D, A, B, C, X(3), 10, 0x8f0ccc92L); + R3(C, D, A, B, X(10), 15, 0xffeff47dL); + R3(B, C, D, A, X(1), 21, 0x85845dd1L); + R3(A, B, C, D, X(8), 6, 0x6fa87e4fL); + R3(D, A, B, C, X(15), 10, 0xfe2ce6e0L); + R3(C, D, A, B, X(6), 15, 0xa3014314L); + R3(B, C, D, A, X(13), 21, 0x4e0811a1L); + R3(A, B, C, D, X(4), 6, 0xf7537e82L); + R3(D, A, B, C, X(11), 10, 0xbd3af235L); + R3(C, D, A, B, X(2), 15, 0x2ad7d2bbL); + R3(B, C, D, A, X(9), 21, 0xeb86d391L); + + A = state[0] += A; + B = state[1] += B; + C = state[2] += C; + D = state[3] += D; + } +} +#undef X +#endif + +#undef F +#undef G +#undef H +#undef I +#undef R0 +#undef R1 +#undef R2 +#undef R3 diff --git a/third_party/boringssl/src/crypto/mem.c b/third_party/boringssl/src/crypto/mem.c deleted file mode 100644 index c90bb162..00000000 --- a/third_party/boringssl/src/crypto/mem.c +++ /dev/null @@ -1,416 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. 
- * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include -#include -#include - -#include - -#if defined(OPENSSL_WINDOWS) -OPENSSL_MSVC_PRAGMA(warning(push, 3)) -#include -OPENSSL_MSVC_PRAGMA(warning(pop)) -#endif - -#include "internal.h" - - -#define OPENSSL_MALLOC_PREFIX 8 -static_assert(OPENSSL_MALLOC_PREFIX >= sizeof(size_t), "size_t too large"); - -#if defined(OPENSSL_ASAN) -void __asan_poison_memory_region(const volatile void *addr, size_t size); -void __asan_unpoison_memory_region(const volatile void *addr, size_t size); -#else -static void __asan_poison_memory_region(const void *addr, size_t size) {} -static void __asan_unpoison_memory_region(const void *addr, size_t size) {} -#endif - -// Windows doesn't really support weak symbols as of May 2019, and Clang on -// Windows will emit strong symbols instead. See -// https://bugs.llvm.org/show_bug.cgi?id=37598 -#if defined(__ELF__) && defined(__GNUC__) -#define WEAK_SYMBOL_FUNC(rettype, name, args) \ - rettype name args __attribute__((weak)); -#else -#define WEAK_SYMBOL_FUNC(rettype, name, args) static rettype(*name) args = NULL; -#endif - -// sdallocx is a sized |free| function. By passing the size (which we happen to -// always know in BoringSSL), the malloc implementation can save work. We cannot -// depend on |sdallocx| being available, however, so it's a weak symbol. -// -// This will always be safe, but will only be overridden if the malloc -// implementation is statically linked with BoringSSL. So, if |sdallocx| is -// provided in, say, libc.so, we still won't use it because that's dynamically -// linked. This isn't an ideal result, but its helps in some cases. 
-WEAK_SYMBOL_FUNC(void, sdallocx, (void *ptr, size_t size, int flags)); - -// The following three functions can be defined to override default heap -// allocation and freeing. If defined, it is the responsibility of -// |OPENSSL_memory_free| to zero out the memory before returning it to the -// system. |OPENSSL_memory_free| will not be passed NULL pointers. -// -// WARNING: These functions are called on every allocation and free in -// BoringSSL across the entire process. They may be called by any code in the -// process which calls BoringSSL, including in process initializers and thread -// destructors. When called, BoringSSL may hold pthreads locks. Any other code -// in the process which, directly or indirectly, calls BoringSSL may be on the -// call stack and may itself be using arbitrary synchronization primitives. -// -// As a result, these functions may not have the usual programming environment -// available to most C or C++ code. In particular, they may not call into -// BoringSSL, or any library which depends on BoringSSL. Any synchronization -// primitives used must tolerate every other synchronization primitive linked -// into the process, including pthreads locks. Failing to meet these constraints -// may result in deadlocks, crashes, or memory corruption. -WEAK_SYMBOL_FUNC(void*, OPENSSL_memory_alloc, (size_t size)); -WEAK_SYMBOL_FUNC(void, OPENSSL_memory_free, (void *ptr)); -WEAK_SYMBOL_FUNC(size_t, OPENSSL_memory_get_size, (void *ptr)); - -// kBoringSSLBinaryTag is a distinctive byte sequence to identify binaries that -// are linking in BoringSSL and, roughly, what version they are using. -static const uint8_t kBoringSSLBinaryTag[18] = { - // 16 bytes of magic tag. - 0x8c, 0x62, 0x20, 0x0b, 0xd2, 0xa0, 0x72, 0x58, - 0x44, 0xa8, 0x96, 0x69, 0xad, 0x55, 0x7e, 0xec, - // Current source iteration. Incremented ~monthly. 
- 3, 0, -}; - -void *OPENSSL_malloc(size_t size) { - if (OPENSSL_memory_alloc != NULL) { - assert(OPENSSL_memory_free != NULL); - assert(OPENSSL_memory_get_size != NULL); - return OPENSSL_memory_alloc(size); - } - - if (size + OPENSSL_MALLOC_PREFIX < size) { - // |OPENSSL_malloc| is a central function in BoringSSL thus a reference to - // |kBoringSSLBinaryTag| is created here so that the tag isn't discarded by - // the linker. The following is sufficient to stop GCC, Clang, and MSVC - // optimising away the reference at the time of writing. Since this - // probably results in an actual memory reference, it is put in this very - // rare code path. - uint8_t unused = *(volatile uint8_t *)kBoringSSLBinaryTag; - (void) unused; - return NULL; - } - - void *ptr = malloc(size + OPENSSL_MALLOC_PREFIX); - if (ptr == NULL) { - return NULL; - } - - *(size_t *)ptr = size; - - __asan_poison_memory_region(ptr, OPENSSL_MALLOC_PREFIX); - return ((uint8_t *)ptr) + OPENSSL_MALLOC_PREFIX; -} - -void OPENSSL_free(void *orig_ptr) { - if (orig_ptr == NULL) { - return; - } - - if (OPENSSL_memory_free != NULL) { - OPENSSL_memory_free(orig_ptr); - return; - } - - void *ptr = ((uint8_t *)orig_ptr) - OPENSSL_MALLOC_PREFIX; - __asan_unpoison_memory_region(ptr, OPENSSL_MALLOC_PREFIX); - - size_t size = *(size_t *)ptr; - OPENSSL_cleanse(ptr, size + OPENSSL_MALLOC_PREFIX); - -// ASan knows to intercept malloc and free, but not sdallocx. 
-#if defined(OPENSSL_ASAN) - (void)sdallocx; - free(ptr); -#else - if (sdallocx) { - sdallocx(ptr, size + OPENSSL_MALLOC_PREFIX, 0 /* flags */); - } else { - free(ptr); - } -#endif -} - -void *OPENSSL_realloc(void *orig_ptr, size_t new_size) { - if (orig_ptr == NULL) { - return OPENSSL_malloc(new_size); - } - - size_t old_size; - if (OPENSSL_memory_get_size != NULL) { - old_size = OPENSSL_memory_get_size(orig_ptr); - } else { - void *ptr = ((uint8_t *)orig_ptr) - OPENSSL_MALLOC_PREFIX; - __asan_unpoison_memory_region(ptr, OPENSSL_MALLOC_PREFIX); - old_size = *(size_t *)ptr; - __asan_poison_memory_region(ptr, OPENSSL_MALLOC_PREFIX); - } - - void *ret = OPENSSL_malloc(new_size); - if (ret == NULL) { - return NULL; - } - - size_t to_copy = new_size; - if (old_size < to_copy) { - to_copy = old_size; - } - - memcpy(ret, orig_ptr, to_copy); - OPENSSL_free(orig_ptr); - - return ret; -} - -void OPENSSL_cleanse(void *ptr, size_t len) { -#if defined(OPENSSL_WINDOWS) - SecureZeroMemory(ptr, len); -#else - OPENSSL_memset(ptr, 0, len); - -#if !defined(OPENSSL_NO_ASM) - /* As best as we can tell, this is sufficient to break any optimisations that - might try to eliminate "superfluous" memsets. If there's an easy way to - detect memset_s, it would be better to use that. */ - __asm__ __volatile__("" : : "r"(ptr) : "memory"); -#endif -#endif // !OPENSSL_NO_ASM -} - -void OPENSSL_clear_free(void *ptr, size_t unused) { - OPENSSL_free(ptr); -} - -int CRYPTO_memcmp(const void *in_a, const void *in_b, size_t len) { - const uint8_t *a = in_a; - const uint8_t *b = in_b; - uint8_t x = 0; - - for (size_t i = 0; i < len; i++) { - x |= a[i] ^ b[i]; - } - - return x; -} - -uint32_t OPENSSL_hash32(const void *ptr, size_t len) { - // These are the FNV-1a parameters for 32 bits. 
- static const uint32_t kPrime = 16777619u; - static const uint32_t kOffsetBasis = 2166136261u; - - const uint8_t *in = ptr; - uint32_t h = kOffsetBasis; - - for (size_t i = 0; i < len; i++) { - h ^= in[i]; - h *= kPrime; - } - - return h; -} - -uint32_t OPENSSL_strhash(const char *s) { return OPENSSL_hash32(s, strlen(s)); } - -size_t OPENSSL_strnlen(const char *s, size_t len) { - for (size_t i = 0; i < len; i++) { - if (s[i] == 0) { - return i; - } - } - - return len; -} - -char *OPENSSL_strdup(const char *s) { - if (s == NULL) { - return NULL; - } - const size_t len = strlen(s) + 1; - char *ret = OPENSSL_malloc(len); - if (ret == NULL) { - return NULL; - } - OPENSSL_memcpy(ret, s, len); - return ret; -} - -int OPENSSL_tolower(int c) { - if (c >= 'A' && c <= 'Z') { - return c + ('a' - 'A'); - } - return c; -} - -int OPENSSL_strcasecmp(const char *a, const char *b) { - for (size_t i = 0;; i++) { - const int aa = OPENSSL_tolower(a[i]); - const int bb = OPENSSL_tolower(b[i]); - - if (aa < bb) { - return -1; - } else if (aa > bb) { - return 1; - } else if (aa == 0) { - return 0; - } - } -} - -int OPENSSL_strncasecmp(const char *a, const char *b, size_t n) { - for (size_t i = 0; i < n; i++) { - const int aa = OPENSSL_tolower(a[i]); - const int bb = OPENSSL_tolower(b[i]); - - if (aa < bb) { - return -1; - } else if (aa > bb) { - return 1; - } else if (aa == 0) { - return 0; - } - } - - return 0; -} - -int BIO_snprintf(char *buf, size_t n, const char *format, ...) 
{ - va_list args; - va_start(args, format); - int ret = BIO_vsnprintf(buf, n, format, args); - va_end(args); - return ret; -} - -int BIO_vsnprintf(char *buf, size_t n, const char *format, va_list args) { - return vsnprintf(buf, n, format, args); -} - -char *OPENSSL_strndup(const char *str, size_t size) { - size = OPENSSL_strnlen(str, size); - - size_t alloc_size = size + 1; - if (alloc_size < size) { - // overflow - OPENSSL_PUT_ERROR(CRYPTO, ERR_R_MALLOC_FAILURE); - return NULL; - } - char *ret = OPENSSL_malloc(alloc_size); - if (ret == NULL) { - OPENSSL_PUT_ERROR(CRYPTO, ERR_R_MALLOC_FAILURE); - return NULL; - } - - OPENSSL_memcpy(ret, str, size); - ret[size] = '\0'; - return ret; -} - -size_t OPENSSL_strlcpy(char *dst, const char *src, size_t dst_size) { - size_t l = 0; - - for (; dst_size > 1 && *src; dst_size--) { - *dst++ = *src++; - l++; - } - - if (dst_size) { - *dst = 0; - } - - return l + strlen(src); -} - -size_t OPENSSL_strlcat(char *dst, const char *src, size_t dst_size) { - size_t l = 0; - for (; dst_size > 0 && *dst; dst_size--, dst++) { - l++; - } - return l + OPENSSL_strlcpy(dst, src, dst_size); -} - -void *OPENSSL_memdup(const void *data, size_t size) { - if (size == 0) { - return NULL; - } - - void *ret = OPENSSL_malloc(size); - if (ret == NULL) { - OPENSSL_PUT_ERROR(CRYPTO, ERR_R_MALLOC_FAILURE); - return NULL; - } - - OPENSSL_memcpy(ret, data, size); - return ret; -} - -void *CRYPTO_malloc(size_t size, const char *file, int line) { - return OPENSSL_malloc(size); -} - -void *CRYPTO_realloc(void *ptr, size_t new_size, const char *file, int line) { - return OPENSSL_realloc(ptr, new_size); -} - -void CRYPTO_free(void *ptr, const char *file, int line) { OPENSSL_free(ptr); } diff --git a/third_party/boringssl/src/crypto/mem.cc b/third_party/boringssl/src/crypto/mem.cc new file mode 100644 index 00000000..28ab3deb --- /dev/null +++ b/third_party/boringssl/src/crypto/mem.cc @@ -0,0 +1,577 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include +#include +#include +#include + +#include + +#if defined(OPENSSL_WINDOWS) +#include +#endif + +#if defined(BORINGSSL_MALLOC_FAILURE_TESTING) +#include +#include +#include +#endif + +#include "internal.h" + + +using namespace bssl; + +#define OPENSSL_MALLOC_PREFIX 8 +static_assert(OPENSSL_MALLOC_PREFIX >= sizeof(size_t), "size_t too large"); + +#if defined(OPENSSL_ASAN) +extern "C" { +void __asan_poison_memory_region(const volatile void *addr, size_t size); +void __asan_unpoison_memory_region(const volatile void *addr, size_t size); +} +#else +static void __asan_poison_memory_region(const void *addr, size_t size) {} +static void __asan_unpoison_memory_region(const void *addr, size_t size) {} +#endif + +// Windows doesn't really support weak symbols as of May 2019, and Clang on +// Windows will emit strong symbols instead. See +// https://bugs.llvm.org/show_bug.cgi?id=37598 +// +// EDK2 targets UEFI but builds as ELF and then translates the binary to +// COFF(!). Thus it builds with __ELF__ defined but cannot actually cope with +// weak symbols. 
+#if !defined(__EDK2_BORINGSSL__) && defined(__ELF__) && defined(__GNUC__) +#define WEAK_SYMBOL_FUNC(rettype, name, args) \ + extern "C" { \ + rettype name args __attribute__((weak)); \ + } +#else +#define WEAK_SYMBOL_FUNC(rettype, name, args) \ + static rettype(*const name) args = nullptr; +#endif + +#if defined(BORINGSSL_DETECT_SDALLOCX) +// sdallocx is a sized |free| function. By passing the size (which we happen to +// always know in BoringSSL), the malloc implementation can save work. We cannot +// depend on |sdallocx| being available, however, so it's a weak symbol. +// +// This mechanism is kept opt-in because it assumes that, when |sdallocx| is +// defined, it is part of the same allocator as |malloc|. This is usually true +// but may break if |malloc| does not implement |sdallocx|, but some other +// allocator with |sdallocx| is imported which does. +WEAK_SYMBOL_FUNC(void, sdallocx, (void *ptr, size_t size, int flags)) +#else +static void (*const sdallocx)(void *ptr, size_t size, int flags) = nullptr; +#endif + +// The following three functions can be defined to override default heap +// allocation and freeing. If defined, it is the responsibility of +// |OPENSSL_memory_free| to zero out the memory before returning it to the +// system. |OPENSSL_memory_free| will not be passed NULL pointers. +// +// WARNING: These functions are called on every allocation and free in +// BoringSSL across the entire process. They may be called by any code in the +// process which calls BoringSSL, including in process initializers and thread +// destructors. When called, BoringSSL may hold pthreads locks. Any other code +// in the process which, directly or indirectly, calls BoringSSL may be on the +// call stack and may itself be using arbitrary synchronization primitives. +// +// As a result, these functions may not have the usual programming environment +// available to most C or C++ code. 
In particular, they may not call into +// BoringSSL, or any library which depends on BoringSSL. Any synchronization +// primitives used must tolerate every other synchronization primitive linked +// into the process, including pthreads locks. Failing to meet these constraints +// may result in deadlocks, crashes, or memory corruption. +WEAK_SYMBOL_FUNC(void *, OPENSSL_memory_alloc, (size_t size)) +WEAK_SYMBOL_FUNC(void, OPENSSL_memory_free, (void *ptr)) +WEAK_SYMBOL_FUNC(size_t, OPENSSL_memory_get_size, (void *ptr)) + +#if defined(BORINGSSL_MALLOC_FAILURE_TESTING) +static StaticMutex malloc_failure_lock; +static uint64_t current_malloc_count = 0; +static uint64_t malloc_number_to_fail = 0; +static int malloc_failure_enabled = 0, break_on_malloc_fail = 0, + any_malloc_failed = 0, disable_malloc_failures = 0; + +static void malloc_exit_handler() { + MutexReadLock lock(&malloc_failure_lock); + if (any_malloc_failed) { + // Signal to the test driver that some allocation failed, so it knows to + // increment the counter and continue. + _exit(88); + } +} + +static void init_malloc_failure() { + const char *env = getenv("MALLOC_NUMBER_TO_FAIL"); + if (env != nullptr && env[0] != 0) { + char *endptr; + malloc_number_to_fail = strtoull(env, &endptr, 10); + if (*endptr == 0) { + malloc_failure_enabled = 1; + atexit(malloc_exit_handler); + } + } + break_on_malloc_fail = getenv("MALLOC_BREAK_ON_FAIL") != nullptr; +} + +// should_fail_allocation returns one if the current allocation should fail and +// zero otherwise. +static int should_fail_allocation() { + static CRYPTO_once_t once = CRYPTO_ONCE_INIT; + CRYPTO_once(&once, init_malloc_failure); + if (!malloc_failure_enabled || disable_malloc_failures) { + return 0; + } + + // We lock just so multi-threaded tests are still correct, but we won't test + // every malloc exhaustively. 
+ malloc_failure_lock.LockWrite(); + int should_fail = current_malloc_count == malloc_number_to_fail; + current_malloc_count++; + any_malloc_failed = any_malloc_failed || should_fail; + malloc_failure_lock.UnlockWrite(); + + if (should_fail && break_on_malloc_fail) { + raise(SIGTRAP); + } + if (should_fail) { + errno = ENOMEM; + } + return should_fail; +} + +void bssl::OPENSSL_reset_malloc_counter_for_testing() { + MutexWriteLock lock(&malloc_failure_lock); + current_malloc_count = 0; +} + +void bssl::OPENSSL_disable_malloc_failures_for_testing() { + MutexWriteLock lock(&malloc_failure_lock); + BSSL_CHECK(!disable_malloc_failures); + disable_malloc_failures = 1; +} + +void bssl::OPENSSL_enable_malloc_failures_for_testing() { + MutexWriteLock lock(&malloc_failure_lock); + BSSL_CHECK(disable_malloc_failures); + disable_malloc_failures = 0; +} + +#else +static int should_fail_allocation() { return 0; } +#endif + +void *OPENSSL_malloc(size_t size) { + void *ptr = nullptr; + if (should_fail_allocation()) { + goto err; + } + + if (OPENSSL_memory_alloc != nullptr) { + assert(OPENSSL_memory_free != nullptr); + assert(OPENSSL_memory_get_size != nullptr); + void *ptr2 = OPENSSL_memory_alloc(size); + if (ptr2 == nullptr && size != 0) { + goto err; + } + return ptr2; + } + + if (size + OPENSSL_MALLOC_PREFIX < size) { + goto err; + } + + ptr = malloc(size + OPENSSL_MALLOC_PREFIX); + if (ptr == nullptr) { + goto err; + } + + *(size_t *)ptr = size; + + __asan_poison_memory_region(ptr, OPENSSL_MALLOC_PREFIX); + return ((uint8_t *)ptr) + OPENSSL_MALLOC_PREFIX; + +err: + // This only works because ERR does not call OPENSSL_malloc. 
+ OPENSSL_PUT_ERROR(CRYPTO, ERR_R_MALLOC_FAILURE); + return nullptr; +} + +void *OPENSSL_zalloc(size_t size) { + void *ret = OPENSSL_malloc(size); + if (ret != nullptr) { + OPENSSL_memset(ret, 0, size); + } + return ret; +} + +void *OPENSSL_calloc(size_t num, size_t size) { + if (size != 0 && num > SIZE_MAX / size) { + OPENSSL_PUT_ERROR(CRYPTO, ERR_R_OVERFLOW); + return nullptr; + } + + return OPENSSL_zalloc(num * size); +} + +void OPENSSL_free(void *orig_ptr) { + if (orig_ptr == nullptr) { + return; + } + + if (OPENSSL_memory_free != nullptr) { + OPENSSL_memory_free(orig_ptr); + return; + } + + void *ptr = ((uint8_t *)orig_ptr) - OPENSSL_MALLOC_PREFIX; + __asan_unpoison_memory_region(ptr, OPENSSL_MALLOC_PREFIX); + + size_t size = *(size_t *)ptr; + OPENSSL_cleanse(ptr, size + OPENSSL_MALLOC_PREFIX); + +// ASan knows to intercept malloc and free, but not sdallocx. +#if defined(OPENSSL_ASAN) + (void)sdallocx; + free(ptr); +#else + if (sdallocx) { + sdallocx(ptr, size + OPENSSL_MALLOC_PREFIX, 0 /* flags */); + } else { + free(ptr); + } +#endif +} + +void *OPENSSL_realloc(void *orig_ptr, size_t new_size) { + if (orig_ptr == nullptr) { + return OPENSSL_malloc(new_size); + } + + size_t old_size; + if (OPENSSL_memory_get_size != nullptr) { + old_size = OPENSSL_memory_get_size(orig_ptr); + } else { + void *ptr = ((uint8_t *)orig_ptr) - OPENSSL_MALLOC_PREFIX; + __asan_unpoison_memory_region(ptr, OPENSSL_MALLOC_PREFIX); + old_size = *(size_t *)ptr; + __asan_poison_memory_region(ptr, OPENSSL_MALLOC_PREFIX); + } + + void *ret = OPENSSL_malloc(new_size); + if (ret == nullptr) { + return nullptr; + } + + size_t to_copy = new_size; + if (old_size < to_copy) { + to_copy = old_size; + } + + memcpy(ret, orig_ptr, to_copy); + OPENSSL_free(orig_ptr); + + return ret; +} + +void OPENSSL_cleanse(void *ptr, size_t len) { +#if defined(OPENSSL_WINDOWS) + SecureZeroMemory(ptr, len); +#else + OPENSSL_memset(ptr, 0, len); + // As best as we can tell, this is sufficient to break any 
optimisations that + // might try to eliminate "superfluous" memsets. If there's an easy way to + // detect memset_s, it would be better to use that. + __asm__ __volatile__("" : : "r"(ptr) : "memory"); +#endif +} + +void OPENSSL_clear_free(void *ptr, size_t unused) { OPENSSL_free(ptr); } + +int CRYPTO_secure_malloc_init(size_t size, size_t min_size) { return 0; } + +int CRYPTO_secure_malloc_initialized() { return 0; } + +size_t CRYPTO_secure_used() { return 0; } + +void *OPENSSL_secure_malloc(size_t size) { return OPENSSL_malloc(size); } + +void OPENSSL_secure_clear_free(void *ptr, size_t len) { + OPENSSL_clear_free(ptr, len); +} + +int CRYPTO_memcmp(const void *in_a, const void *in_b, size_t len) { + const uint8_t *a = reinterpret_cast(in_a); + const uint8_t *b = reinterpret_cast(in_b); + uint8_t x = 0; + + for (size_t i = 0; i < len; i++) { + x |= a[i] ^ b[i]; + } + + return x; +} + +uint32_t OPENSSL_hash32(const void *ptr, size_t len) { + // These are the FNV-1a parameters for 32 bits. + static const uint32_t kPrime = 16777619u; + static const uint32_t kOffsetBasis = 2166136261u; + + const uint8_t *in = reinterpret_cast(ptr); + uint32_t h = kOffsetBasis; + + for (size_t i = 0; i < len; i++) { + h ^= in[i]; + h *= kPrime; + } + + return h; +} + +uint32_t OPENSSL_strhash(const char *s) { return OPENSSL_hash32(s, strlen(s)); } + +size_t OPENSSL_strnlen(const char *s, size_t len) { + for (size_t i = 0; i < len; i++) { + if (s[i] == 0) { + return i; + } + } + + return len; +} + +char *OPENSSL_strdup(const char *s) { + if (s == nullptr) { + return nullptr; + } + // Copy the NUL terminator. 
+ return reinterpret_cast(OPENSSL_memdup(s, strlen(s) + 1)); +} + +int OPENSSL_isalpha(int c) { + return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z'); +} + +int OPENSSL_isdigit(int c) { return c >= '0' && c <= '9'; } + +int OPENSSL_isxdigit(int c) { + return OPENSSL_isdigit(c) || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F'); +} + +int OPENSSL_fromxdigit(uint8_t *out, int c) { + if (OPENSSL_isdigit(c)) { + *out = c - '0'; + return 1; + } + if ('a' <= c && c <= 'f') { + *out = c - 'a' + 10; + return 1; + } + if ('A' <= c && c <= 'F') { + *out = c - 'A' + 10; + return 1; + } + return 0; +} + +int OPENSSL_isalnum(int c) { return OPENSSL_isalpha(c) || OPENSSL_isdigit(c); } + +int OPENSSL_tolower(int c) { + if (c >= 'A' && c <= 'Z') { + return c + ('a' - 'A'); + } + return c; +} + +int OPENSSL_isspace(int c) { + return c == '\t' || c == '\n' || c == '\v' || c == '\f' || c == '\r' || + c == ' '; +} + +int OPENSSL_strcasecmp(const char *a, const char *b) { + for (size_t i = 0;; i++) { + const int aa = OPENSSL_tolower(a[i]); + const int bb = OPENSSL_tolower(b[i]); + + if (aa < bb) { + return -1; + } else if (aa > bb) { + return 1; + } else if (aa == 0) { + return 0; + } + } +} + +int OPENSSL_strncasecmp(const char *a, const char *b, size_t n) { + for (size_t i = 0; i < n; i++) { + const int aa = OPENSSL_tolower(a[i]); + const int bb = OPENSSL_tolower(b[i]); + + if (aa < bb) { + return -1; + } else if (aa > bb) { + return 1; + } else if (aa == 0) { + return 0; + } + } + + return 0; +} + +int BIO_snprintf(char *buf, size_t n, const char *format, ...) { + va_list args; + va_start(args, format); + int ret = BIO_vsnprintf(buf, n, format, args); + va_end(args); + return ret; +} + +int BIO_vsnprintf(char *buf, size_t n, const char *format, va_list args) { + return vsnprintf(buf, n, format, args); +} + +int bssl::OPENSSL_vasprintf_internal(char **str, const char *format, + va_list args, int system_malloc) { + void *(*allocate)(size_t) = system_malloc ? 
malloc : OPENSSL_malloc; + void (*deallocate)(void *) = system_malloc ? free : OPENSSL_free; + void *(*reallocate)(void *, size_t) = + system_malloc ? realloc : OPENSSL_realloc; + char *candidate = nullptr; + size_t candidate_len = 64; // TODO(bbe) what's the best initial size? + int ret; + + if ((candidate = reinterpret_cast(allocate(candidate_len))) == + nullptr) { + goto err; + } + va_list args_copy; + va_copy(args_copy, args); + ret = vsnprintf(candidate, candidate_len, format, args_copy); + va_end(args_copy); + if (ret < 0) { + goto err; + } + if ((size_t)ret >= candidate_len) { + // Too big to fit in allocation. + char *tmp; + + candidate_len = (size_t)ret + 1; + if ((tmp = reinterpret_cast( + reallocate(candidate, candidate_len))) == nullptr) { + goto err; + } + candidate = tmp; + ret = vsnprintf(candidate, candidate_len, format, args); + } + // At this point this should not happen unless vsnprintf is insane. + if (ret < 0 || (size_t)ret >= candidate_len) { + goto err; + } + *str = candidate; + return ret; + +err: + deallocate(candidate); + *str = nullptr; + errno = ENOMEM; + return -1; +} + +int OPENSSL_vasprintf(char **str, const char *format, va_list args) { + return OPENSSL_vasprintf_internal(str, format, args, /*system_malloc=*/0); +} + +int OPENSSL_asprintf(char **str, const char *format, ...) 
{ + va_list args; + va_start(args, format); + int ret = OPENSSL_vasprintf(str, format, args); + va_end(args); + return ret; +} + +char *OPENSSL_strndup(const char *str, size_t size) { + size = OPENSSL_strnlen(str, size); + + size_t alloc_size = size + 1; + if (alloc_size < size) { + // overflow + OPENSSL_PUT_ERROR(CRYPTO, ERR_R_MALLOC_FAILURE); + return nullptr; + } + char *ret = reinterpret_cast(OPENSSL_malloc(alloc_size)); + if (ret == nullptr) { + return nullptr; + } + + OPENSSL_memcpy(ret, str, size); + ret[size] = '\0'; + return ret; +} + +size_t OPENSSL_strlcpy(char *dst, const char *src, size_t dst_size) { + size_t l = 0; + + for (; dst_size > 1 && *src; dst_size--) { + *dst++ = *src++; + l++; + } + + if (dst_size) { + *dst = 0; + } + + return l + strlen(src); +} + +size_t OPENSSL_strlcat(char *dst, const char *src, size_t dst_size) { + size_t l = 0; + for (; dst_size > 0 && *dst; dst_size--, dst++) { + l++; + } + return l + OPENSSL_strlcpy(dst, src, dst_size); +} + +void *OPENSSL_memdup(const void *data, size_t size) { + if (size == 0) { + return nullptr; + } + + void *ret = OPENSSL_malloc(size); + if (ret == nullptr) { + return nullptr; + } + + OPENSSL_memcpy(ret, data, size); + return ret; +} + +void *CRYPTO_malloc(size_t size, const char *file, int line) { + return OPENSSL_malloc(size); +} + +void *CRYPTO_realloc(void *ptr, size_t new_size, const char *file, int line) { + return OPENSSL_realloc(ptr, new_size); +} + +void CRYPTO_free(void *ptr, const char *file, int line) { OPENSSL_free(ptr); } diff --git a/third_party/boringssl/src/crypto/mem_internal.h b/third_party/boringssl/src/crypto/mem_internal.h new file mode 100644 index 00000000..78c3348d --- /dev/null +++ b/third_party/boringssl/src/crypto/mem_internal.h @@ -0,0 +1,671 @@ +// Copyright 2025 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_CRYPTO_MEM_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_MEM_INTERNAL_H + +#include + +#include +#include +#include +#include + +#include +#include + +#include "internal.h" + + +BSSL_NAMESPACE_BEGIN + +// Internal allocation-dependent functions. +// +// This header is separate from crypto/internal.h because there are some files +// which must avoid |OPENSSL_malloc|, to avoid a circular dependency, but +// need other support routines in crypto/internal.h. (See +// |_BORINGSSL_PROHIBIT_OPENSSL_MALLOC|.) + + +// Memory allocation. + +// New behaves like |new| but uses |OPENSSL_malloc| for memory allocation. It +// returns nullptr on allocation error. It only implements single-object +// allocation and not new T[n]. +// +// When called with no arguments, it performs value-initialization, not +// default-initialization. This means that, if selects a non-user-provided +// constructor, the object will be zero-initialized. (As in any C++ type, once +// |T| gains a user-provided constructors, it is responsible for initializing +// all fields explicitly.) +// +// Note: unlike |new|, this does not support non-public constructors. +template +T *New(Args &&...args) { + void *t = OPENSSL_malloc(sizeof(T)); + if (t == nullptr) { + return nullptr; + } + return new (t) T(std::forward(args)...); +} + +// Delete behaves like |delete| but uses |OPENSSL_free| to release memory. +// +// Note: unlike |delete| this does not support non-public destructors. 
+template +void Delete(T *t) { + if (t != nullptr) { + t->~T(); + OPENSSL_free(t); + } +} + +namespace internal { + +// All types with kAllowUniquePtr set may be used with UniquePtr. Other types +// may be C structs which require a |BORINGSSL_MAKE_DELETER| registration. Where +// an internal type cannot be annotated (e.g. an alias of std::variant), use +// |BORINGSSL_MAKE_DELETER(T, Delete)|. +template +struct DeleterImpl> { + static void Free(T *t) { Delete(t); } +}; + +// All types with kAllowRefCountedUniquePtr may be used with UniquePtr, which +// then will behave like std::shared_ptr. +template +struct DeleterImpl> { + static void Free(T *t) { t->DecRefInternal(); } +}; + +} // namespace internal + +// All types with kAllowRefCountedUniquePtr types also automatically get an +// UpRef function. Other types may be C structs which require a +// |BORINGSSL_MAKE_UP_REF| registration. +template > +inline UniquePtr UpRef(const T *v) { + if (v != nullptr) { + v->UpRefInternal(); + } + return UniquePtr(const_cast(v)); +} +template > +inline UniquePtr UpRef(const UniquePtr &ptr) { + return UpRef(ptr.get()); +} + +// MakeUnique behaves like |std::make_unique| but returns nullptr on allocation +// error. +template +UniquePtr MakeUnique(Args &&...args) { + return UniquePtr(New(std::forward(args)...)); +} + + +// RefCounted is a common base for ref-counted types. This is an instance of the +// C++ curiously-recurring template pattern, so a type Foo must subclass +// RefCounted. It additionally must friend RefCounted to allow calling +// the destructor. +template +class RefCounted { + public: + static constexpr bool kAllowRefCountedUniquePtr = true; + + RefCounted(const RefCounted &) = delete; + RefCounted &operator=(const RefCounted &) = delete; + + // These methods are intentionally named differently from `bssl::UpRef` to + // avoid a collision. Only the implementations of `FOO_up_ref` and `FOO_free` + // should call these. 
|DecRefInternal| returns true if the object was freed + // and false if there are still references. + void UpRefInternal() const { + // Safety: the folowing call does not mutate anything other than the atomic + // ref-count variable. + CRYPTO_refcount_inc(&references_); + } + bool DecRefInternal() { + if (CRYPTO_refcount_dec_and_test_zero(&references_)) { + Derived *d = static_cast(this); + d->~Derived(); + OPENSSL_free(d); + return true; + } + return false; + } + + protected: + // Ensure that only `Derived`, which must inherit from `RefCounted`, + // can call the constructor. This catches bugs where someone inherited from + // the wrong base. + class CheckSubClass { + private: + friend Derived; + CheckSubClass() = default; + }; + RefCounted(CheckSubClass) { + static_assert(std::is_base_of_v, + "Derived must subclass RefCounted"); + } + + ~RefCounted() { BSSL_CHECK(references_.load() == 0); } + + private: + mutable CRYPTO_refcount_t references_ = 1; +}; + + +// Containers. + +// Array is an owning array of elements of |T|. +template +class Array { + public: + using value_type = std::remove_cv_t; + + // Array's default constructor creates an empty array. 
+ Array() {} + Array(const Array &) = delete; + Array(Array &&other) { *this = std::move(other); } + + ~Array() { Reset(); } + + Array &operator=(const Array &) = delete; + Array &operator=(Array &&other) { + Reset(); + other.Release(&data_, &size_); + return *this; + } + + const T *data() const { return data_; } + T *data() { return data_; } + size_t size() const { return size_; } + bool empty() const { return size_ == 0; } + + const T &operator[](size_t i) const { + BSSL_CHECK(i < size_); + return data_[i]; + } + T &operator[](size_t i) { + BSSL_CHECK(i < size_); + return data_[i]; + } + + T &front() { + BSSL_CHECK(size_ != 0); + return data_[0]; + } + const T &front() const { + BSSL_CHECK(size_ != 0); + return data_[0]; + } + T &back() { + BSSL_CHECK(size_ != 0); + return data_[size_ - 1]; + } + const T &back() const { + BSSL_CHECK(size_ != 0); + return data_[size_ - 1]; + } + + T *begin() { return data_; } + const T *begin() const { return data_; } + T *end() { return data_ + size_; } + const T *end() const { return data_ + size_; } + + void Reset() { Reset(nullptr, 0); } + + // Reset releases the current contents of the array and takes ownership of the + // raw pointer supplied by the caller. + void Reset(T *new_data, size_t new_size) { + std::destroy_n(data_, size_); + OPENSSL_free(data_); + data_ = new_data; + size_ = new_size; + } + + // Release releases ownership of the array to a raw pointer supplied by the + // caller. + void Release(T **out, size_t *out_size) { + *out = data_; + *out_size = size_; + data_ = nullptr; + size_ = 0; + } + + // Init replaces the array with a newly-allocated array of |new_size| + // value-constructed copies of |T|. It returns true on success and false on + // error. If |T| is a primitive type like |uint8_t|, value-construction means + // it will be zero-initialized. 
+ [[nodiscard]] bool Init(size_t new_size) { + if (!InitUninitialized(new_size)) { + return false; + } + std::uninitialized_value_construct_n(data_, size_); + return true; + } + + // InitForOverwrite behaves like |Init| but it default-constructs each element + // instead. This means that, if |T| is a primitive type, the array will be + // uninitialized and thus must be filled in by the caller. + [[nodiscard]] bool InitForOverwrite(size_t new_size) { + if (!InitUninitialized(new_size)) { + return false; + } + std::uninitialized_default_construct_n(data_, size_); + return true; + } + + // CopyFrom replaces the array with a newly-allocated copy of |in|. It returns + // true on success and false on error. + // + // |in| may not alias |this|. + [[nodiscard]] bool CopyFrom(Span in) { + BSSL_CHECK(!spans_alias(MakeConstSpan(*this), in)); + if (!InitUninitialized(in.size())) { + return false; + } + std::uninitialized_copy(in.begin(), in.end(), data_); + return true; + } + + // Shrink shrinks the stored size of the array to |new_size|. It crashes if + // the new size is larger. Note this does not shrink the allocation itself. + void Shrink(size_t new_size) { + if (new_size > size_) { + abort(); + } + std::destroy_n(data_ + new_size, size_ - new_size); + size_ = new_size; + } + + private: + // InitUninitialized replaces the array with a newly-allocated array of + // |new_size| elements, but whose constructor has not yet run. On success, the + // elements must be constructed before returning control to the caller. + bool InitUninitialized(size_t new_size) { + Reset(); + if (new_size == 0) { + return true; + } + + if (new_size > SIZE_MAX / sizeof(T)) { + OPENSSL_PUT_ERROR(CRYPTO, ERR_R_OVERFLOW); + return false; + } + data_ = reinterpret_cast(OPENSSL_malloc(new_size * sizeof(T))); + if (data_ == nullptr) { + return false; + } + size_ = new_size; + return true; + } + + T *data_ = nullptr; + size_t size_ = 0; +}; + +// Vector is a resizable array of elements of |T|. 
+template +class Vector { + public: + Vector() = default; + Vector(const Vector &) = delete; + Vector(Vector &&other) { *this = std::move(other); } + ~Vector() { clear(); } + + Vector &operator=(const Vector &) = delete; + Vector &operator=(Vector &&other) { + clear(); + std::swap(data_, other.data_); + std::swap(size_, other.size_); + std::swap(capacity_, other.capacity_); + return *this; + } + + const T *data() const { return data_; } + T *data() { return data_; } + size_t size() const { return size_; } + bool empty() const { return size_ == 0; } + + const T &operator[](size_t i) const { + BSSL_CHECK(i < size_); + return data_[i]; + } + T &operator[](size_t i) { + BSSL_CHECK(i < size_); + return data_[i]; + } + + T &front() { + BSSL_CHECK(size_ != 0); + return data_[0]; + } + const T &front() const { + BSSL_CHECK(size_ != 0); + return data_[0]; + } + T &back() { + BSSL_CHECK(size_ != 0); + return data_[size_ - 1]; + } + const T &back() const { + BSSL_CHECK(size_ != 0); + return data_[size_ - 1]; + } + + T *begin() { return data_; } + const T *begin() const { return data_; } + T *end() { return data_ + size_; } + const T *end() const { return data_ + size_; } + + void clear() { + std::destroy_n(data_, size_); + OPENSSL_free(data_); + data_ = nullptr; + size_ = 0; + capacity_ = 0; + } + + void pop_back() { + BSSL_CHECK(size_ != 0); + std::destroy_at(&data_[size_ - 1]); + size_--; + } + + // Push adds |elem| at the end of the internal array, growing if necessary. It + // returns false when allocation fails. + [[nodiscard]] bool Push(T elem) { + if (!MaybeGrow(1)) { + return false; + } + new (&data_[size_]) T(std::move(elem)); + size_++; + return true; + } + + // CopyFrom replaces the contents of the array with a copy of |in|. It returns + // true on success and false on allocation error. 
+ [[nodiscard]] bool CopyFrom(Span in) { + Array copy; + if (!copy.CopyFrom(in)) { + return false; + } + + clear(); + copy.Release(&data_, &size_); + capacity_ = size_; + return true; + } + + // Append appends the contents of |in| to the array. It returns true on + // success and false on allocation error. + [[nodiscard]] bool Append(Span in) { + if (!MaybeGrow(in.size())) { + return false; + } + std::uninitialized_copy(in.begin(), in.end(), data_ + size_); + size_ += in.size(); + return true; + } + + // AppendMove moves the contents of |in| and appends them to the array. It + // returns true on success and false on allocation error. + [[nodiscard]] bool AppendMove(Span in) { + if (!MaybeGrow(in.size())) { + return false; + } + std::uninitialized_move(in.begin(), in.end(), data_ + size_); + size_ += in.size(); + return true; + } + + // EraseIf removes all elements that satisfy the predicate |pred|. + template + void EraseIf(Pred pred) { + auto it = std::remove_if(begin(), end(), pred); + std::destroy(it, end()); + size_ = it - begin(); + } + + private: + // If there is no room for |num| elements, creates a new backing array with + // double the size of the old one and copies elements over. + [[nodiscard]] bool MaybeGrow(size_t num) { + constexpr size_t kDefaultSize = 16; + constexpr size_t kMaxCapacity = SIZE_MAX / sizeof(T); + if (num > kMaxCapacity - size_) { + OPENSSL_PUT_ERROR(CRYPTO, ERR_R_OVERFLOW); + return false; + } + size_t new_capacity = size_ + num; + // No need to grow if we have room. + if (capacity_ >= new_capacity) { + return true; + } + // Always grow to at least kDefaultSize to avoid several small mallocs at + // the start. + new_capacity = std::max(new_capacity, std::min(kDefaultSize, kMaxCapacity)); + // At least double the old capacity for linear amortized behavior. 
+ if (capacity_ <= kMaxCapacity / 2) { + new_capacity = std::max(new_capacity, capacity_ * 2); + } + T *new_data = + reinterpret_cast(OPENSSL_malloc(new_capacity * sizeof(T))); + if (new_data == nullptr) { + return false; + } + size_t new_size = size_; + std::uninitialized_move(begin(), end(), new_data); + clear(); + data_ = new_data; + size_ = new_size; + capacity_ = new_capacity; + return true; + } + + // data_ is a pointer to |capacity_| objects of size |T|, the first |size_| of + // which are constructed. + T *data_ = nullptr; + // |size_| is the number of elements stored in this Vector. + size_t size_ = 0; + // |capacity_| is the number of elements allocated in this Vector. + size_t capacity_ = 0; +}; + +// A PackedSize is an integer that can store values from 0 to N, represented as +// a minimal-width integer. +template +using PackedSize = std::conditional_t< + N <= 0xff, uint8_t, + std::conditional_t>>; + +// An InplaceVector is like a Vector, but stores up to N elements inline in the +// object. It is inspired by std::inplace_vector in C++26. 
+template +class InplaceVector { + public: + using value_type = std::remove_cv_t; + + InplaceVector() = default; + InplaceVector(const InplaceVector &other) { *this = other; } + InplaceVector(InplaceVector &&other) { *this = std::move(other); } + ~InplaceVector() { clear(); } + InplaceVector &operator=(const InplaceVector &other) { + if (this != &other) { + CopyFrom(other); + } + return *this; + } + InplaceVector &operator=(InplaceVector &&other) { + clear(); + std::uninitialized_move(other.begin(), other.end(), data()); + size_ = other.size(); + return *this; + } + + const T *data() const { return reinterpret_cast(storage_); } + T *data() { return reinterpret_cast(storage_); } + size_t size() const { return size_; } + static constexpr size_t capacity() { return N; } + bool empty() const { return size_ == 0; } + + const T &operator[](size_t i) const { + BSSL_CHECK(i < size_); + return data()[i]; + } + T &operator[](size_t i) { + BSSL_CHECK(i < size_); + return data()[i]; + } + + T &front() { + BSSL_CHECK(size_ != 0); + return data()[0]; + } + const T &front() const { + BSSL_CHECK(size_ != 0); + return data()[0]; + } + T &back() { + BSSL_CHECK(size_ != 0); + return data()[size_ - 1]; + } + const T &back() const { + BSSL_CHECK(size_ != 0); + return data()[size_ - 1]; + } + + T *begin() { return data(); } + const T *begin() const { return data(); } + T *end() { return data() + size_; } + const T *end() const { return data() + size_; } + + void clear() { Shrink(0); } + + void pop_back() { + BSSL_CHECK(size_ != 0); + Shrink(size_ - 1); + } + + // Shrink resizes the vector to |new_size|, which must not be larger than the + // current size. Unlike |Resize|, this can be called when |T| is not + // default-constructible. 
+ void Shrink(size_t new_size) { + BSSL_CHECK(new_size <= size_); + std::destroy_n(data() + new_size, size_ - new_size); + size_ = static_cast>(new_size); + } + + // TryResize resizes the vector to |new_size| and returns true, or returns + // false if |new_size| is too large. Any newly-added elements are + // value-initialized. + [[nodiscard]] bool TryResize(size_t new_size) { + if (new_size <= size_) { + Shrink(new_size); + return true; + } + if (new_size > capacity()) { + return false; + } + std::uninitialized_value_construct_n(data() + size_, new_size - size_); + size_ = static_cast>(new_size); + return true; + } + + // TryResizeForOverwrite behaves like |TryResize|, but newly-added elements + // are default-initialized, so POD types may contain uninitialized values that + // the caller is responsible for filling in. + [[nodiscard]] bool TryResizeForOverwrite(size_t new_size) { + if (new_size <= size_) { + Shrink(new_size); + return true; + } + if (new_size > capacity()) { + return false; + } + std::uninitialized_default_construct_n(data() + size_, new_size - size_); + size_ = static_cast>(new_size); + return true; + } + + // TryCopyFrom sets the vector to a copy of |in| and returns true, or returns + // false if |in| is too large. + // + // |in| may not alias |this|. + [[nodiscard]] bool TryCopyFrom(Span in) { + BSSL_CHECK(!spans_alias(MakeConstSpan(*this), in)); + if (in.size() > capacity()) { + return false; + } + clear(); + std::uninitialized_copy(in.begin(), in.end(), data()); + size_ = in.size(); + return true; + } + + // TryAppend appends the vector by a copy of |in| and returns true, or + // returns false if |in| is too large. 
+ [[nodiscard]] bool TryAppend(Span in) { + if (in.size() > capacity() - size()) { + return false; + } + std::uninitialized_copy(in.begin(), in.end(), &data()[size_]); + size_ += in.size(); + return true; + } + + // TryPushBack appends |val| to the vector and returns a pointer to the + // newly-inserted value, or nullptr if the vector is at capacity. + [[nodiscard]] T *TryPushBack(T val) { + if (size() >= capacity()) { + return nullptr; + } + T *ret = &data()[size_]; + new (ret) T(std::move(val)); + size_++; + return ret; + } + + // The following methods behave like their |Try*| counterparts, but abort the + // program on failure. + void Resize(size_t size) { BSSL_CHECK(TryResize(size)); } + void ResizeForOverwrite(size_t size) { + BSSL_CHECK(TryResizeForOverwrite(size)); + } + void CopyFrom(Span in) { BSSL_CHECK(TryCopyFrom(in)); } + void Append(Span in) { BSSL_CHECK(TryAppend(in)); } + T &PushBack(T val) { + T *ret = TryPushBack(std::move(val)); + BSSL_CHECK(ret != nullptr); + return *ret; + } + + // EraseIf removes all elements that satisfy the predicate |pred|. + template + void EraseIf(Pred pred) { + auto it = std::remove_if(begin(), end(), pred); + Shrink(it - begin()); + } + + private: + alignas(T) char storage_[sizeof(T[N])]; + PackedSize size_ = 0; +}; + + +BSSL_NAMESPACE_END + +#endif // OPENSSL_HEADER_CRYPTO_MEM_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/mldsa/mldsa.cc b/third_party/boringssl/src/crypto/mldsa/mldsa.cc new file mode 100644 index 00000000..f17ab882 --- /dev/null +++ b/third_party/boringssl/src/crypto/mldsa/mldsa.cc @@ -0,0 +1,296 @@ +// Copyright 2024 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "../fipsmodule/bcm_interface.h" + + +using namespace bssl; + +int MLDSA65_generate_key( + uint8_t out_encoded_public_key[MLDSA65_PUBLIC_KEY_BYTES], + uint8_t out_seed[MLDSA_SEED_BYTES], + struct MLDSA65_private_key *out_private_key) { + return bcm_success(BCM_mldsa65_generate_key(out_encoded_public_key, out_seed, + out_private_key)); +} + +int MLDSA65_private_key_from_seed(struct MLDSA65_private_key *out_private_key, + const uint8_t *seed, size_t seed_len) { + if (seed_len != MLDSA_SEED_BYTES) { + return 0; + } + return bcm_success(BCM_mldsa65_private_key_from_seed(out_private_key, seed)); +} + +int MLDSA65_public_from_private(struct MLDSA65_public_key *out_public_key, + const struct MLDSA65_private_key *private_key) { + return bcm_success( + BCM_mldsa65_public_from_private(out_public_key, private_key)); +} + +int MLDSA65_sign(uint8_t out_encoded_signature[MLDSA65_SIGNATURE_BYTES], + const struct MLDSA65_private_key *private_key, + const uint8_t *msg, size_t msg_len, const uint8_t *context, + size_t context_len) { + if (context_len > 255) { + return 0; + } + return bcm_success(BCM_mldsa65_sign(out_encoded_signature, private_key, msg, + msg_len, context, context_len)); +} + +int MLDSA65_verify(const struct MLDSA65_public_key *public_key, + const uint8_t *signature, size_t signature_len, + const uint8_t *msg, size_t msg_len, const uint8_t *context, + size_t context_len) { + if (context_len > 255 || signature_len != MLDSA65_SIGNATURE_BYTES) { + return 0; + } + return bcm_success(BCM_mldsa65_verify(public_key, signature, msg, 
msg_len, + context, context_len)); +} + +int MLDSA65_prehash_init(struct MLDSA65_prehash *out_state, + const struct MLDSA65_public_key *public_key, + const uint8_t *context, size_t context_len) { + if (context_len > 255) { + return 0; + } + BCM_mldsa65_prehash_init(out_state, public_key, context, context_len); + return 1; +} + +void MLDSA65_prehash_update(struct MLDSA65_prehash *inout_state, + const uint8_t *msg, size_t msg_len) { + BCM_mldsa65_prehash_update(inout_state, msg, msg_len); +} + +void MLDSA65_prehash_finalize(uint8_t out_msg_rep[MLDSA_MU_BYTES], + struct MLDSA65_prehash *inout_state) { + BCM_mldsa65_prehash_finalize(out_msg_rep, inout_state); +} + +int MLDSA65_sign_message_representative( + uint8_t out_encoded_signature[MLDSA65_SIGNATURE_BYTES], + const struct MLDSA65_private_key *private_key, + const uint8_t msg_rep[MLDSA_MU_BYTES]) { + return bcm_success(BCM_mldsa65_sign_message_representative( + out_encoded_signature, private_key, msg_rep)); +} + +int MLDSA65_verify_message_representative( + const struct MLDSA65_public_key *public_key, + const uint8_t *signature, size_t signature_len, + const uint8_t msg_rep[MLDSA_MU_BYTES]) { + if (signature_len != MLDSA65_SIGNATURE_BYTES) { + return 0; + } + return bcm_success(BCM_mldsa65_verify_message_representative( + public_key, signature, msg_rep)); +} + +int MLDSA65_marshal_public_key(CBB *out, + const struct MLDSA65_public_key *public_key) { + return bcm_success(BCM_mldsa65_marshal_public_key(out, public_key)); +} + +int MLDSA65_parse_public_key(struct MLDSA65_public_key *public_key, CBS *in) { + return bcm_success(BCM_mldsa65_parse_public_key(public_key, in)); +} + +int MLDSA87_generate_key( + uint8_t out_encoded_public_key[MLDSA87_PUBLIC_KEY_BYTES], + uint8_t out_seed[MLDSA_SEED_BYTES], + struct MLDSA87_private_key *out_private_key) { + return bcm_success(BCM_mldsa87_generate_key(out_encoded_public_key, out_seed, + out_private_key)); +} + +int MLDSA87_private_key_from_seed(struct MLDSA87_private_key 
*out_private_key, + const uint8_t *seed, size_t seed_len) { + if (seed_len != MLDSA_SEED_BYTES) { + return 0; + } + return bcm_success(BCM_mldsa87_private_key_from_seed(out_private_key, seed)); +} + +int MLDSA87_public_from_private(struct MLDSA87_public_key *out_public_key, + const struct MLDSA87_private_key *private_key) { + return bcm_success( + BCM_mldsa87_public_from_private(out_public_key, private_key)); +} + +int MLDSA87_sign(uint8_t out_encoded_signature[MLDSA87_SIGNATURE_BYTES], + const struct MLDSA87_private_key *private_key, + const uint8_t *msg, size_t msg_len, const uint8_t *context, + size_t context_len) { + if (context_len > 255) { + return 0; + } + return bcm_success(BCM_mldsa87_sign(out_encoded_signature, private_key, msg, + msg_len, context, context_len)); +} + +int MLDSA87_verify(const struct MLDSA87_public_key *public_key, + const uint8_t *signature, size_t signature_len, + const uint8_t *msg, size_t msg_len, const uint8_t *context, + size_t context_len) { + if (context_len > 255 || signature_len != MLDSA87_SIGNATURE_BYTES) { + return 0; + } + return bcm_success(BCM_mldsa87_verify(public_key, signature, msg, msg_len, + context, context_len)); +} + +int MLDSA87_prehash_init(struct MLDSA87_prehash *out_state, + const struct MLDSA87_public_key *public_key, + const uint8_t *context, size_t context_len) { + if (context_len > 255) { + return 0; + } + BCM_mldsa87_prehash_init(out_state, public_key, context, context_len); + return 1; +} + +void MLDSA87_prehash_update(struct MLDSA87_prehash *inout_state, + const uint8_t *msg, size_t msg_len) { + BCM_mldsa87_prehash_update(inout_state, msg, msg_len); +} + +void MLDSA87_prehash_finalize(uint8_t out_msg_rep[MLDSA_MU_BYTES], + struct MLDSA87_prehash *inout_state) { + BCM_mldsa87_prehash_finalize(out_msg_rep, inout_state); +} + +int MLDSA87_sign_message_representative( + uint8_t out_encoded_signature[MLDSA87_SIGNATURE_BYTES], + const struct MLDSA87_private_key *private_key, + const uint8_t 
msg_rep[MLDSA_MU_BYTES]) { + return bcm_success(BCM_mldsa87_sign_message_representative( + out_encoded_signature, private_key, msg_rep)); +} + +int MLDSA87_verify_message_representative( + const struct MLDSA87_public_key *public_key, + const uint8_t *signature, size_t signature_len, + const uint8_t msg_rep[MLDSA_MU_BYTES]) { + if (signature_len != MLDSA87_SIGNATURE_BYTES) { + return 0; + } + return bcm_success(BCM_mldsa87_verify_message_representative( + public_key, signature, msg_rep)); +} + +int MLDSA87_marshal_public_key(CBB *out, + const struct MLDSA87_public_key *public_key) { + return bcm_success(BCM_mldsa87_marshal_public_key(out, public_key)); +} + +int MLDSA87_parse_public_key(struct MLDSA87_public_key *public_key, CBS *in) { + return bcm_success(BCM_mldsa87_parse_public_key(public_key, in)); +} + +int MLDSA44_generate_key( + uint8_t out_encoded_public_key[MLDSA44_PUBLIC_KEY_BYTES], + uint8_t out_seed[MLDSA_SEED_BYTES], + struct MLDSA44_private_key *out_private_key) { + return bcm_success(BCM_mldsa44_generate_key(out_encoded_public_key, out_seed, + out_private_key)); +} + +int MLDSA44_private_key_from_seed(struct MLDSA44_private_key *out_private_key, + const uint8_t *seed, size_t seed_len) { + if (seed_len != MLDSA_SEED_BYTES) { + return 0; + } + return bcm_success(BCM_mldsa44_private_key_from_seed(out_private_key, seed)); +} + +int MLDSA44_public_from_private(struct MLDSA44_public_key *out_public_key, + const struct MLDSA44_private_key *private_key) { + return bcm_success( + BCM_mldsa44_public_from_private(out_public_key, private_key)); +} + +int MLDSA44_sign(uint8_t out_encoded_signature[MLDSA44_SIGNATURE_BYTES], + const struct MLDSA44_private_key *private_key, + const uint8_t *msg, size_t msg_len, const uint8_t *context, + size_t context_len) { + if (context_len > 255) { + return 0; + } + return bcm_success(BCM_mldsa44_sign(out_encoded_signature, private_key, msg, + msg_len, context, context_len)); +} + +int MLDSA44_verify(const struct 
MLDSA44_public_key *public_key, + const uint8_t *signature, size_t signature_len, + const uint8_t *msg, size_t msg_len, const uint8_t *context, + size_t context_len) { + if (context_len > 255 || signature_len != MLDSA44_SIGNATURE_BYTES) { + return 0; + } + return bcm_success(BCM_mldsa44_verify(public_key, signature, msg, msg_len, + context, context_len)); +} + +int MLDSA44_prehash_init(struct MLDSA44_prehash *out_state, + const struct MLDSA44_public_key *public_key, + const uint8_t *context, size_t context_len) { + if (context_len > 255) { + return 0; + } + BCM_mldsa44_prehash_init(out_state, public_key, context, context_len); + return 1; +} + +void MLDSA44_prehash_update(struct MLDSA44_prehash *inout_state, + const uint8_t *msg, size_t msg_len) { + BCM_mldsa44_prehash_update(inout_state, msg, msg_len); +} + +void MLDSA44_prehash_finalize(uint8_t out_msg_rep[MLDSA_MU_BYTES], + struct MLDSA44_prehash *inout_state) { + BCM_mldsa44_prehash_finalize(out_msg_rep, inout_state); +} + +int MLDSA44_sign_message_representative( + uint8_t out_encoded_signature[MLDSA44_SIGNATURE_BYTES], + const struct MLDSA44_private_key *private_key, + const uint8_t msg_rep[MLDSA_MU_BYTES]) { + return bcm_success(BCM_mldsa44_sign_message_representative( + out_encoded_signature, private_key, msg_rep)); +} + +int MLDSA44_verify_message_representative( + const struct MLDSA44_public_key *public_key, + const uint8_t *signature, size_t signature_len, + const uint8_t msg_rep[MLDSA_MU_BYTES]) { + if (signature_len != MLDSA44_SIGNATURE_BYTES) { + return 0; + } + return bcm_success(BCM_mldsa44_verify_message_representative( + public_key, signature, msg_rep)); +} + +int MLDSA44_marshal_public_key(CBB *out, + const struct MLDSA44_public_key *public_key) { + return bcm_success(BCM_mldsa44_marshal_public_key(out, public_key)); +} + +int MLDSA44_parse_public_key(struct MLDSA44_public_key *public_key, CBS *in) { + return bcm_success(BCM_mldsa44_parse_public_key(public_key, in)); +} diff --git 
a/third_party/boringssl/src/crypto/mlkem/mlkem.cc b/third_party/boringssl/src/crypto/mlkem/mlkem.cc new file mode 100644 index 00000000..5728be66 --- /dev/null +++ b/third_party/boringssl/src/crypto/mlkem/mlkem.cc @@ -0,0 +1,108 @@ +// Copyright 2024 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "../fipsmodule/bcm_interface.h" + + +using namespace bssl; + +void MLKEM768_generate_key( + uint8_t out_encoded_public_key[MLKEM768_PUBLIC_KEY_BYTES], + uint8_t optional_out_seed[MLKEM_SEED_BYTES], + struct MLKEM768_private_key *out_private_key) { + BCM_mlkem768_generate_key(out_encoded_public_key, optional_out_seed, + out_private_key); +} + +int MLKEM768_private_key_from_seed(struct MLKEM768_private_key *out_private_key, + const uint8_t *seed, size_t seed_len) { + return bcm_success( + BCM_mlkem768_private_key_from_seed(out_private_key, seed, seed_len)); +} + +void MLKEM768_public_from_private( + struct MLKEM768_public_key *out_public_key, + const struct MLKEM768_private_key *private_key) { + (void)BCM_mlkem768_public_from_private(out_public_key, private_key); +} + +void MLKEM768_encap(uint8_t out_ciphertext[MLKEM768_CIPHERTEXT_BYTES], + uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES], + const struct MLKEM768_public_key *public_key) { + (void)BCM_mlkem768_encap(out_ciphertext, out_shared_secret, public_key); +} + +int MLKEM768_decap(uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES], + const uint8_t 
*ciphertext, size_t ciphertext_len, + const struct MLKEM768_private_key *private_key) { + return bcm_success(BCM_mlkem768_decap(out_shared_secret, ciphertext, + ciphertext_len, private_key)); +} + +int MLKEM768_marshal_public_key(CBB *out, + const struct MLKEM768_public_key *public_key) { + return bcm_success(BCM_mlkem768_marshal_public_key(out, public_key)); +} + +int MLKEM768_parse_public_key(struct MLKEM768_public_key *out_public_key, + CBS *in) { + return bcm_success(BCM_mlkem768_parse_public_key(out_public_key, in)); +} + + +void MLKEM1024_generate_key( + uint8_t out_encoded_public_key[MLKEM1024_PUBLIC_KEY_BYTES], + uint8_t optional_out_seed[MLKEM_SEED_BYTES], + struct MLKEM1024_private_key *out_private_key) { + (void)BCM_mlkem1024_generate_key(out_encoded_public_key, optional_out_seed, + out_private_key); +} + +int MLKEM1024_private_key_from_seed( + struct MLKEM1024_private_key *out_private_key, const uint8_t *seed, + size_t seed_len) { + return bcm_success( + BCM_mlkem1024_private_key_from_seed(out_private_key, seed, seed_len)); +} + +void MLKEM1024_public_from_private( + struct MLKEM1024_public_key *out_public_key, + const struct MLKEM1024_private_key *private_key) { + (void)BCM_mlkem1024_public_from_private(out_public_key, private_key); +} + +void MLKEM1024_encap(uint8_t out_ciphertext[MLKEM1024_CIPHERTEXT_BYTES], + uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES], + const struct MLKEM1024_public_key *public_key) { + (void)BCM_mlkem1024_encap(out_ciphertext, out_shared_secret, public_key); +} + +int MLKEM1024_decap(uint8_t out_shared_secret[MLKEM_SHARED_SECRET_BYTES], + const uint8_t *ciphertext, size_t ciphertext_len, + const struct MLKEM1024_private_key *private_key) { + return bcm_success(BCM_mlkem1024_decap(out_shared_secret, ciphertext, + ciphertext_len, private_key)); +} + +int MLKEM1024_marshal_public_key( + CBB *out, const struct MLKEM1024_public_key *public_key) { + return bcm_success(BCM_mlkem1024_marshal_public_key(out, public_key)); +} + 
+int MLKEM1024_parse_public_key(struct MLKEM1024_public_key *out_public_key, + CBS *in) { + return bcm_success(BCM_mlkem1024_parse_public_key(out_public_key, in)); +} diff --git a/third_party/boringssl/src/crypto/obj/obj.c b/third_party/boringssl/src/crypto/obj/obj.c deleted file mode 100644 index 958625d0..00000000 --- a/third_party/boringssl/src/crypto/obj/obj.c +++ /dev/null @@ -1,553 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. 
All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include "../asn1/internal.h" -#include "../internal.h" -#include "../lhash/internal.h" - -// obj_data.h must be included after the definition of |ASN1_OBJECT|. 
-#include "obj_dat.h" - - -DEFINE_LHASH_OF(ASN1_OBJECT) - -static struct CRYPTO_STATIC_MUTEX global_added_lock = CRYPTO_STATIC_MUTEX_INIT; -// These globals are protected by |global_added_lock|. -static LHASH_OF(ASN1_OBJECT) *global_added_by_data = NULL; -static LHASH_OF(ASN1_OBJECT) *global_added_by_nid = NULL; -static LHASH_OF(ASN1_OBJECT) *global_added_by_short_name = NULL; -static LHASH_OF(ASN1_OBJECT) *global_added_by_long_name = NULL; - -static struct CRYPTO_STATIC_MUTEX global_next_nid_lock = - CRYPTO_STATIC_MUTEX_INIT; -static unsigned global_next_nid = NUM_NID; - -static int obj_next_nid(void) { - int ret; - - CRYPTO_STATIC_MUTEX_lock_write(&global_next_nid_lock); - ret = global_next_nid++; - CRYPTO_STATIC_MUTEX_unlock_write(&global_next_nid_lock); - - return ret; -} - -ASN1_OBJECT *OBJ_dup(const ASN1_OBJECT *o) { - ASN1_OBJECT *r; - unsigned char *data = NULL; - char *sn = NULL, *ln = NULL; - - if (o == NULL) { - return NULL; - } - - if (!(o->flags & ASN1_OBJECT_FLAG_DYNAMIC)) { - // TODO(fork): this is a little dangerous. 
- return (ASN1_OBJECT *)o; - } - - r = ASN1_OBJECT_new(); - if (r == NULL) { - OPENSSL_PUT_ERROR(OBJ, ERR_R_ASN1_LIB); - return NULL; - } - r->ln = r->sn = NULL; - - data = OPENSSL_malloc(o->length); - if (data == NULL) { - goto err; - } - if (o->data != NULL) { - OPENSSL_memcpy(data, o->data, o->length); - } - - // once data is attached to an object, it remains const - r->data = data; - r->length = o->length; - r->nid = o->nid; - - if (o->ln != NULL) { - ln = OPENSSL_strdup(o->ln); - if (ln == NULL) { - goto err; - } - } - - if (o->sn != NULL) { - sn = OPENSSL_strdup(o->sn); - if (sn == NULL) { - goto err; - } - } - - r->sn = sn; - r->ln = ln; - - r->flags = - o->flags | (ASN1_OBJECT_FLAG_DYNAMIC | ASN1_OBJECT_FLAG_DYNAMIC_STRINGS | - ASN1_OBJECT_FLAG_DYNAMIC_DATA); - return r; - -err: - OPENSSL_PUT_ERROR(OBJ, ERR_R_MALLOC_FAILURE); - OPENSSL_free(ln); - OPENSSL_free(sn); - OPENSSL_free(data); - OPENSSL_free(r); - return NULL; -} - -int OBJ_cmp(const ASN1_OBJECT *a, const ASN1_OBJECT *b) { - int ret; - - ret = a->length - b->length; - if (ret) { - return ret; - } - return OPENSSL_memcmp(a->data, b->data, a->length); -} - -const uint8_t *OBJ_get0_data(const ASN1_OBJECT *obj) { - if (obj == NULL) { - return NULL; - } - - return obj->data; -} - -size_t OBJ_length(const ASN1_OBJECT *obj) { - if (obj == NULL || obj->length < 0) { - return 0; - } - - return (size_t)obj->length; -} - -// obj_cmp is called to search the kNIDsInOIDOrder array. The |key| argument is -// an |ASN1_OBJECT|* that we're looking for and |element| is a pointer to an -// unsigned int in the array. 
-static int obj_cmp(const void *key, const void *element) { - uint16_t nid = *((const uint16_t *)element); - const ASN1_OBJECT *a = key; - const ASN1_OBJECT *b = &kObjects[nid]; - - if (a->length < b->length) { - return -1; - } else if (a->length > b->length) { - return 1; - } - return OPENSSL_memcmp(a->data, b->data, a->length); -} - -int OBJ_obj2nid(const ASN1_OBJECT *obj) { - if (obj == NULL) { - return NID_undef; - } - - if (obj->nid != 0) { - return obj->nid; - } - - CRYPTO_STATIC_MUTEX_lock_read(&global_added_lock); - if (global_added_by_data != NULL) { - ASN1_OBJECT *match; - - match = lh_ASN1_OBJECT_retrieve(global_added_by_data, obj); - if (match != NULL) { - CRYPTO_STATIC_MUTEX_unlock_read(&global_added_lock); - return match->nid; - } - } - CRYPTO_STATIC_MUTEX_unlock_read(&global_added_lock); - - const uint16_t *nid_ptr = - bsearch(obj, kNIDsInOIDOrder, OPENSSL_ARRAY_SIZE(kNIDsInOIDOrder), - sizeof(kNIDsInOIDOrder[0]), obj_cmp); - if (nid_ptr == NULL) { - return NID_undef; - } - - return kObjects[*nid_ptr].nid; -} - -int OBJ_cbs2nid(const CBS *cbs) { - if (CBS_len(cbs) > INT_MAX) { - return NID_undef; - } - - ASN1_OBJECT obj; - OPENSSL_memset(&obj, 0, sizeof(obj)); - obj.data = CBS_data(cbs); - obj.length = (int)CBS_len(cbs); - - return OBJ_obj2nid(&obj); -} - -// short_name_cmp is called to search the kNIDsInShortNameOrder array. The -// |key| argument is name that we're looking for and |element| is a pointer to -// an unsigned int in the array. 
-static int short_name_cmp(const void *key, const void *element) { - const char *name = (const char *)key; - uint16_t nid = *((const uint16_t *)element); - - return strcmp(name, kObjects[nid].sn); -} - -int OBJ_sn2nid(const char *short_name) { - CRYPTO_STATIC_MUTEX_lock_read(&global_added_lock); - if (global_added_by_short_name != NULL) { - ASN1_OBJECT *match, template; - - template.sn = short_name; - match = lh_ASN1_OBJECT_retrieve(global_added_by_short_name, &template); - if (match != NULL) { - CRYPTO_STATIC_MUTEX_unlock_read(&global_added_lock); - return match->nid; - } - } - CRYPTO_STATIC_MUTEX_unlock_read(&global_added_lock); - - const uint16_t *nid_ptr = - bsearch(short_name, kNIDsInShortNameOrder, - OPENSSL_ARRAY_SIZE(kNIDsInShortNameOrder), - sizeof(kNIDsInShortNameOrder[0]), short_name_cmp); - if (nid_ptr == NULL) { - return NID_undef; - } - - return kObjects[*nid_ptr].nid; -} - -// long_name_cmp is called to search the kNIDsInLongNameOrder array. The -// |key| argument is name that we're looking for and |element| is a pointer to -// an unsigned int in the array. 
-static int long_name_cmp(const void *key, const void *element) { - const char *name = (const char *)key; - uint16_t nid = *((const uint16_t *)element); - - return strcmp(name, kObjects[nid].ln); -} - -int OBJ_ln2nid(const char *long_name) { - CRYPTO_STATIC_MUTEX_lock_read(&global_added_lock); - if (global_added_by_long_name != NULL) { - ASN1_OBJECT *match, template; - - template.ln = long_name; - match = lh_ASN1_OBJECT_retrieve(global_added_by_long_name, &template); - if (match != NULL) { - CRYPTO_STATIC_MUTEX_unlock_read(&global_added_lock); - return match->nid; - } - } - CRYPTO_STATIC_MUTEX_unlock_read(&global_added_lock); - - const uint16_t *nid_ptr = bsearch( - long_name, kNIDsInLongNameOrder, OPENSSL_ARRAY_SIZE(kNIDsInLongNameOrder), - sizeof(kNIDsInLongNameOrder[0]), long_name_cmp); - if (nid_ptr == NULL) { - return NID_undef; - } - - return kObjects[*nid_ptr].nid; -} - -int OBJ_txt2nid(const char *s) { - ASN1_OBJECT *obj; - int nid; - - obj = OBJ_txt2obj(s, 0 /* search names */); - nid = OBJ_obj2nid(obj); - ASN1_OBJECT_free(obj); - return nid; -} - -OPENSSL_EXPORT int OBJ_nid2cbb(CBB *out, int nid) { - const ASN1_OBJECT *obj = OBJ_nid2obj(nid); - CBB oid; - - if (obj == NULL || - !CBB_add_asn1(out, &oid, CBS_ASN1_OBJECT) || - !CBB_add_bytes(&oid, obj->data, obj->length) || - !CBB_flush(out)) { - return 0; - } - - return 1; -} - -ASN1_OBJECT *OBJ_nid2obj(int nid) { - if (nid >= 0 && nid < NUM_NID) { - if (nid != NID_undef && kObjects[nid].nid == NID_undef) { - goto err; - } - return (ASN1_OBJECT *)&kObjects[nid]; - } - - CRYPTO_STATIC_MUTEX_lock_read(&global_added_lock); - if (global_added_by_nid != NULL) { - ASN1_OBJECT *match, template; - - template.nid = nid; - match = lh_ASN1_OBJECT_retrieve(global_added_by_nid, &template); - if (match != NULL) { - CRYPTO_STATIC_MUTEX_unlock_read(&global_added_lock); - return match; - } - } - CRYPTO_STATIC_MUTEX_unlock_read(&global_added_lock); - -err: - OPENSSL_PUT_ERROR(OBJ, OBJ_R_UNKNOWN_NID); - return NULL; -} - 
-const char *OBJ_nid2sn(int nid) { - const ASN1_OBJECT *obj = OBJ_nid2obj(nid); - if (obj == NULL) { - return NULL; - } - - return obj->sn; -} - -const char *OBJ_nid2ln(int nid) { - const ASN1_OBJECT *obj = OBJ_nid2obj(nid); - if (obj == NULL) { - return NULL; - } - - return obj->ln; -} - -static ASN1_OBJECT *create_object_with_text_oid(int (*get_nid)(void), - const char *oid, - const char *short_name, - const char *long_name) { - uint8_t *buf; - size_t len; - CBB cbb; - if (!CBB_init(&cbb, 32) || - !CBB_add_asn1_oid_from_text(&cbb, oid, strlen(oid)) || - !CBB_finish(&cbb, &buf, &len)) { - OPENSSL_PUT_ERROR(OBJ, OBJ_R_INVALID_OID_STRING); - CBB_cleanup(&cbb); - return NULL; - } - - ASN1_OBJECT *ret = ASN1_OBJECT_create(get_nid ? get_nid() : NID_undef, buf, - len, short_name, long_name); - OPENSSL_free(buf); - return ret; -} - -ASN1_OBJECT *OBJ_txt2obj(const char *s, int dont_search_names) { - if (!dont_search_names) { - int nid = OBJ_sn2nid(s); - if (nid == NID_undef) { - nid = OBJ_ln2nid(s); - } - - if (nid != NID_undef) { - return OBJ_nid2obj(nid); - } - } - - return create_object_with_text_oid(NULL, s, NULL, NULL); -} - -static int strlcpy_int(char *dst, const char *src, int dst_size) { - size_t ret = OPENSSL_strlcpy(dst, src, dst_size < 0 ? 0 : (size_t)dst_size); - if (ret > INT_MAX) { - OPENSSL_PUT_ERROR(OBJ, ERR_R_OVERFLOW); - return -1; - } - return (int)ret; -} - -int OBJ_obj2txt(char *out, int out_len, const ASN1_OBJECT *obj, - int always_return_oid) { - // Python depends on the empty OID successfully encoding as the empty - // string. 
- if (obj == NULL || obj->length == 0) { - return strlcpy_int(out, "", out_len); - } - - if (!always_return_oid) { - int nid = OBJ_obj2nid(obj); - if (nid != NID_undef) { - const char *name = OBJ_nid2ln(nid); - if (name == NULL) { - name = OBJ_nid2sn(nid); - } - if (name != NULL) { - return strlcpy_int(out, name, out_len); - } - } - } - - CBS cbs; - CBS_init(&cbs, obj->data, obj->length); - char *txt = CBS_asn1_oid_to_text(&cbs); - if (txt == NULL) { - if (out_len > 0) { - out[0] = '\0'; - } - return -1; - } - - int ret = strlcpy_int(out, txt, out_len); - OPENSSL_free(txt); - return ret; -} - -static uint32_t hash_nid(const ASN1_OBJECT *obj) { - return obj->nid; -} - -static int cmp_nid(const ASN1_OBJECT *a, const ASN1_OBJECT *b) { - return a->nid - b->nid; -} - -static uint32_t hash_data(const ASN1_OBJECT *obj) { - return OPENSSL_hash32(obj->data, obj->length); -} - -static int cmp_data(const ASN1_OBJECT *a, const ASN1_OBJECT *b) { - int i = a->length - b->length; - if (i) { - return i; - } - return OPENSSL_memcmp(a->data, b->data, a->length); -} - -static uint32_t hash_short_name(const ASN1_OBJECT *obj) { - return OPENSSL_strhash(obj->sn); -} - -static int cmp_short_name(const ASN1_OBJECT *a, const ASN1_OBJECT *b) { - return strcmp(a->sn, b->sn); -} - -static uint32_t hash_long_name(const ASN1_OBJECT *obj) { - return OPENSSL_strhash(obj->ln); -} - -static int cmp_long_name(const ASN1_OBJECT *a, const ASN1_OBJECT *b) { - return strcmp(a->ln, b->ln); -} - -// obj_add_object inserts |obj| into the various global hashes for run-time -// added objects. It returns one on success or zero otherwise. 
-static int obj_add_object(ASN1_OBJECT *obj) { - int ok; - ASN1_OBJECT *old_object; - - obj->flags &= ~(ASN1_OBJECT_FLAG_DYNAMIC | ASN1_OBJECT_FLAG_DYNAMIC_STRINGS | - ASN1_OBJECT_FLAG_DYNAMIC_DATA); - - CRYPTO_STATIC_MUTEX_lock_write(&global_added_lock); - if (global_added_by_nid == NULL) { - global_added_by_nid = lh_ASN1_OBJECT_new(hash_nid, cmp_nid); - global_added_by_data = lh_ASN1_OBJECT_new(hash_data, cmp_data); - global_added_by_short_name = lh_ASN1_OBJECT_new(hash_short_name, cmp_short_name); - global_added_by_long_name = lh_ASN1_OBJECT_new(hash_long_name, cmp_long_name); - } - - // We don't pay attention to |old_object| (which contains any previous object - // that was evicted from the hashes) because we don't have a reference count - // on ASN1_OBJECT values. Also, we should never have duplicates nids and so - // should always have objects in |global_added_by_nid|. - - ok = lh_ASN1_OBJECT_insert(global_added_by_nid, &old_object, obj); - if (obj->length != 0 && obj->data != NULL) { - ok &= lh_ASN1_OBJECT_insert(global_added_by_data, &old_object, obj); - } - if (obj->sn != NULL) { - ok &= lh_ASN1_OBJECT_insert(global_added_by_short_name, &old_object, obj); - } - if (obj->ln != NULL) { - ok &= lh_ASN1_OBJECT_insert(global_added_by_long_name, &old_object, obj); - } - CRYPTO_STATIC_MUTEX_unlock_write(&global_added_lock); - - return ok; -} - -int OBJ_create(const char *oid, const char *short_name, const char *long_name) { - ASN1_OBJECT *op = - create_object_with_text_oid(obj_next_nid, oid, short_name, long_name); - if (op == NULL || - !obj_add_object(op)) { - return NID_undef; - } - return op->nid; -} - -void OBJ_cleanup(void) {} diff --git a/third_party/boringssl/src/crypto/obj/obj.cc b/third_party/boringssl/src/crypto/obj/obj.cc new file mode 100644 index 00000000..a6e6e5a8 --- /dev/null +++ b/third_party/boringssl/src/crypto/obj/obj.cc @@ -0,0 +1,509 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include + +#include + +#include +#include +#include +#include + +#include "../asn1/internal.h" +#include "../internal.h" +#include "../lhash/internal.h" + +// obj_data.h must be included after the definition of |ASN1_OBJECT|. +#include "obj_dat.h" + + +using namespace bssl; + +BSSL_NAMESPACE_BEGIN + +DEFINE_LHASH_OF(ASN1_OBJECT) + +BSSL_NAMESPACE_END + +static StaticMutex global_added_lock; +// These globals are protected by |global_added_lock|. +static LHASH_OF(ASN1_OBJECT) *global_added_by_data = nullptr; +static LHASH_OF(ASN1_OBJECT) *global_added_by_nid = nullptr; +static LHASH_OF(ASN1_OBJECT) *global_added_by_short_name = nullptr; +static LHASH_OF(ASN1_OBJECT) *global_added_by_long_name = nullptr; + +static StaticMutex global_next_nid_lock; +static unsigned global_next_nid = NUM_NID; + +static int obj_next_nid() { + MutexWriteLock lock(&global_next_nid_lock); + return global_next_nid++; +} + +ASN1_OBJECT *OBJ_dup(const ASN1_OBJECT *o) { + ASN1_OBJECT *r; + unsigned char *data = nullptr; + char *sn = nullptr, *ln = nullptr; + + if (o == nullptr) { + return nullptr; + } + + if (!(o->flags & ASN1_OBJECT_FLAG_DYNAMIC)) { + // TODO(fork): this is a little dangerous. 
+ return (ASN1_OBJECT *)o; + } + + r = ASN1_OBJECT_new(); + if (r == nullptr) { + OPENSSL_PUT_ERROR(OBJ, ERR_R_ASN1_LIB); + return nullptr; + } + r->ln = r->sn = nullptr; + + // once data is attached to an object, it remains const + r->data = reinterpret_cast(OPENSSL_memdup(o->data, o->length)); + if (o->length != 0 && r->data == nullptr) { + goto err; + } + + r->length = o->length; + r->nid = o->nid; + + if (o->ln != nullptr) { + ln = OPENSSL_strdup(o->ln); + if (ln == nullptr) { + goto err; + } + } + + if (o->sn != nullptr) { + sn = OPENSSL_strdup(o->sn); + if (sn == nullptr) { + goto err; + } + } + + r->sn = sn; + r->ln = ln; + + r->flags = + o->flags | (ASN1_OBJECT_FLAG_DYNAMIC | ASN1_OBJECT_FLAG_DYNAMIC_STRINGS | + ASN1_OBJECT_FLAG_DYNAMIC_DATA); + return r; + +err: + OPENSSL_free(ln); + OPENSSL_free(sn); + OPENSSL_free(data); + OPENSSL_free(r); + return nullptr; +} + +int OBJ_cmp(const ASN1_OBJECT *a, const ASN1_OBJECT *b) { + if (a->length < b->length) { + return -1; + } else if (a->length > b->length) { + return 1; + } + return OPENSSL_memcmp(a->data, b->data, a->length); +} + +const uint8_t *OBJ_get0_data(const ASN1_OBJECT *obj) { + if (obj == nullptr) { + return nullptr; + } + + return obj->data; +} + +size_t OBJ_length(const ASN1_OBJECT *obj) { + if (obj == nullptr || obj->length < 0) { + return 0; + } + + return (size_t)obj->length; +} + +static const ASN1_OBJECT *get_builtin_object(int nid) { + // |NID_undef| is stored separately, so all the indices are off by one. The + // caller of this function must have a valid built-in, non-undef NID. + BSSL_CHECK(nid > 0 && nid < NUM_NID); + return &kObjects[nid - 1]; +} + +// obj_cmp is called to search the kNIDsInOIDOrder array. The |key| argument is +// an |ASN1_OBJECT|* that we're looking for and |element| is a pointer to an +// unsigned int in the array. 
+static int obj_cmp(const void *key, const void *element) { + uint16_t nid = *((const uint16_t *)element); + return OBJ_cmp(reinterpret_cast(key), + get_builtin_object(nid)); +} + +int OBJ_obj2nid(const ASN1_OBJECT *obj) { + if (obj == nullptr) { + return NID_undef; + } + + if (obj->nid != 0) { + return obj->nid; + } + + { + MutexReadLock lock(&global_added_lock); + if (global_added_by_data != nullptr) { + ASN1_OBJECT *match = lh_ASN1_OBJECT_retrieve(global_added_by_data, obj); + if (match != nullptr) { + return match->nid; + } + } + } + + const uint16_t *nid_ptr = reinterpret_cast( + bsearch(obj, kNIDsInOIDOrder, std::size(kNIDsInOIDOrder), + sizeof(kNIDsInOIDOrder[0]), obj_cmp)); + if (nid_ptr == nullptr) { + return NID_undef; + } + + return get_builtin_object(*nid_ptr)->nid; +} + +int OBJ_cbs2nid(const CBS *cbs) { + if (CBS_len(cbs) > INT_MAX) { + return NID_undef; + } + + ASN1_OBJECT obj; + OPENSSL_memset(&obj, 0, sizeof(obj)); + obj.data = CBS_data(cbs); + obj.length = (int)CBS_len(cbs); + + return OBJ_obj2nid(&obj); +} + +// short_name_cmp is called to search the kNIDsInShortNameOrder array. The +// |key| argument is name that we're looking for and |element| is a pointer to +// an unsigned int in the array. 
+static int short_name_cmp(const void *key, const void *element) { + const char *name = (const char *)key; + uint16_t nid = *((const uint16_t *)element); + + return strcmp(name, get_builtin_object(nid)->sn); +} + +int OBJ_sn2nid(const char *short_name) { + { + MutexReadLock lock(&global_added_lock); + if (global_added_by_short_name != nullptr) { + ASN1_OBJECT templ; + templ.sn = short_name; + ASN1_OBJECT *match = + lh_ASN1_OBJECT_retrieve(global_added_by_short_name, &templ); + if (match != nullptr) { + return match->nid; + } + } + } + + const uint16_t *nid_ptr = reinterpret_cast(bsearch( + short_name, kNIDsInShortNameOrder, std::size(kNIDsInShortNameOrder), + sizeof(kNIDsInShortNameOrder[0]), short_name_cmp)); + if (nid_ptr == nullptr) { + return NID_undef; + } + + return get_builtin_object(*nid_ptr)->nid; +} + +// long_name_cmp is called to search the kNIDsInLongNameOrder array. The +// |key| argument is name that we're looking for and |element| is a pointer to +// an unsigned int in the array. 
+static int long_name_cmp(const void *key, const void *element) { + const char *name = (const char *)key; + uint16_t nid = *((const uint16_t *)element); + + return strcmp(name, get_builtin_object(nid)->ln); +} + +int OBJ_ln2nid(const char *long_name) { + { + MutexReadLock lock(&global_added_lock); + if (global_added_by_long_name != nullptr) { + ASN1_OBJECT templ; + templ.ln = long_name; + ASN1_OBJECT *match = + lh_ASN1_OBJECT_retrieve(global_added_by_long_name, &templ); + if (match != nullptr) { + return match->nid; + } + } + } + + const uint16_t *nid_ptr = reinterpret_cast( + bsearch(long_name, kNIDsInLongNameOrder, std::size(kNIDsInLongNameOrder), + sizeof(kNIDsInLongNameOrder[0]), long_name_cmp)); + if (nid_ptr == nullptr) { + return NID_undef; + } + + return get_builtin_object(*nid_ptr)->nid; +} + +int OBJ_txt2nid(const char *s) { + ASN1_OBJECT *obj; + int nid; + + obj = OBJ_txt2obj(s, 0 /* search names */); + nid = OBJ_obj2nid(obj); + ASN1_OBJECT_free(obj); + return nid; +} + +OPENSSL_EXPORT int OBJ_nid2cbb(CBB *out, int nid) { + const ASN1_OBJECT *obj = OBJ_nid2obj(nid); + return obj != nullptr && + CBB_add_asn1_element(out, CBS_ASN1_OBJECT, obj->data, obj->length); +} + +const ASN1_OBJECT *OBJ_get_undef() { + static const ASN1_OBJECT kUndef = { + /*sn=*/SN_undef, + /*ln=*/LN_undef, + /*nid=*/NID_undef, + /*length=*/0, + /*data=*/nullptr, + /*flags=*/0, + }; + return &kUndef; +} + +ASN1_OBJECT *OBJ_nid2obj(int nid) { + if (nid == NID_undef) { + return (ASN1_OBJECT *)OBJ_get_undef(); + } + + if (nid > 0 && nid < NUM_NID) { + const ASN1_OBJECT *obj = get_builtin_object(nid); + if (nid != NID_undef && obj->nid == NID_undef) { + OPENSSL_PUT_ERROR(OBJ, OBJ_R_UNKNOWN_NID); + return nullptr; + } + return (ASN1_OBJECT *)obj; + } + + { + MutexReadLock lock(&global_added_lock); + if (global_added_by_nid != nullptr) { + ASN1_OBJECT templ; + templ.nid = nid; + ASN1_OBJECT *match = lh_ASN1_OBJECT_retrieve(global_added_by_nid, &templ); + if (match != nullptr) { + return 
match; + } + } + } + + OPENSSL_PUT_ERROR(OBJ, OBJ_R_UNKNOWN_NID); + return nullptr; +} + +const char *OBJ_nid2sn(int nid) { + const ASN1_OBJECT *obj = OBJ_nid2obj(nid); + if (obj == nullptr) { + return nullptr; + } + + return obj->sn; +} + +const char *OBJ_nid2ln(int nid) { + const ASN1_OBJECT *obj = OBJ_nid2obj(nid); + if (obj == nullptr) { + return nullptr; + } + + return obj->ln; +} + +static ASN1_OBJECT *create_object_with_text_oid(int (*get_nid)(), + const char *oid, + const char *short_name, + const char *long_name) { + uint8_t *buf; + size_t len; + CBB cbb; + if (!CBB_init(&cbb, 32) || + !CBB_add_asn1_oid_from_text(&cbb, oid, strlen(oid)) || + !CBB_finish(&cbb, &buf, &len)) { + OPENSSL_PUT_ERROR(OBJ, OBJ_R_INVALID_OID_STRING); + CBB_cleanup(&cbb); + return nullptr; + } + + ASN1_OBJECT *ret = ASN1_OBJECT_create(get_nid ? get_nid() : NID_undef, buf, + len, short_name, long_name); + OPENSSL_free(buf); + return ret; +} + +ASN1_OBJECT *OBJ_txt2obj(const char *s, int dont_search_names) { + if (!dont_search_names) { + int nid = OBJ_sn2nid(s); + if (nid == NID_undef) { + nid = OBJ_ln2nid(s); + } + + if (nid != NID_undef) { + return OBJ_nid2obj(nid); + } + } + + return create_object_with_text_oid(nullptr, s, nullptr, nullptr); +} + +static int strlcpy_int(char *dst, const char *src, int dst_size) { + size_t ret = OPENSSL_strlcpy(dst, src, dst_size < 0 ? 0 : (size_t)dst_size); + if (ret > INT_MAX) { + OPENSSL_PUT_ERROR(OBJ, ERR_R_OVERFLOW); + return -1; + } + return (int)ret; +} + +int OBJ_obj2txt(char *out, int out_len, const ASN1_OBJECT *obj, + int always_return_oid) { + // Python depends on the empty OID successfully encoding as the empty + // string. 
+ if (obj == nullptr || obj->length == 0) { + return strlcpy_int(out, "", out_len); + } + + if (!always_return_oid) { + int nid = OBJ_obj2nid(obj); + if (nid != NID_undef) { + const char *name = OBJ_nid2ln(nid); + if (name == nullptr) { + name = OBJ_nid2sn(nid); + } + if (name != nullptr) { + return strlcpy_int(out, name, out_len); + } + } + } + + CBS cbs; + CBS_init(&cbs, obj->data, obj->length); + char *txt = CBS_asn1_oid_to_text(&cbs); + if (txt == nullptr) { + if (out_len > 0) { + out[0] = '\0'; + } + return -1; + } + + int ret = strlcpy_int(out, txt, out_len); + OPENSSL_free(txt); + return ret; +} + +static uint32_t hash_nid(const ASN1_OBJECT *obj) { return obj->nid; } + +static int cmp_nid(const ASN1_OBJECT *a, const ASN1_OBJECT *b) { + return a->nid - b->nid; +} + +static uint32_t hash_data(const ASN1_OBJECT *obj) { + return OPENSSL_hash32(obj->data, obj->length); +} + +static uint32_t hash_short_name(const ASN1_OBJECT *obj) { + return OPENSSL_strhash(obj->sn); +} + +static int cmp_short_name(const ASN1_OBJECT *a, const ASN1_OBJECT *b) { + return strcmp(a->sn, b->sn); +} + +static uint32_t hash_long_name(const ASN1_OBJECT *obj) { + return OPENSSL_strhash(obj->ln); +} + +static int cmp_long_name(const ASN1_OBJECT *a, const ASN1_OBJECT *b) { + return strcmp(a->ln, b->ln); +} + +// obj_add_object inserts |obj| into the various global hashes for run-time +// added objects. It returns one on success or zero otherwise. 
+static int obj_add_object(ASN1_OBJECT *obj) { + obj->flags &= ~(ASN1_OBJECT_FLAG_DYNAMIC | ASN1_OBJECT_FLAG_DYNAMIC_STRINGS | + ASN1_OBJECT_FLAG_DYNAMIC_DATA); + + MutexWriteLock lock(&global_added_lock); + if (global_added_by_nid == nullptr) { + global_added_by_nid = lh_ASN1_OBJECT_new(hash_nid, cmp_nid); + } + if (global_added_by_data == nullptr) { + global_added_by_data = lh_ASN1_OBJECT_new(hash_data, OBJ_cmp); + } + if (global_added_by_short_name == nullptr) { + global_added_by_short_name = + lh_ASN1_OBJECT_new(hash_short_name, cmp_short_name); + } + if (global_added_by_long_name == nullptr) { + global_added_by_long_name = + lh_ASN1_OBJECT_new(hash_long_name, cmp_long_name); + } + + if (global_added_by_nid == nullptr || // + global_added_by_data == nullptr || // + global_added_by_short_name == nullptr || // + global_added_by_long_name == nullptr) { + return 0; + } + + // We don't pay attention to |old_object| (which contains any previous object + // that was evicted from the hashes) because we don't have a reference count + // on ASN1_OBJECT values. Also, we should never have duplicates nids and so + // should always have objects in |global_added_by_nid|. 
+ ASN1_OBJECT *old_object; + int ok = lh_ASN1_OBJECT_insert(global_added_by_nid, &old_object, obj); + if (obj->length != 0 && obj->data != nullptr) { + ok &= lh_ASN1_OBJECT_insert(global_added_by_data, &old_object, obj); + } + if (obj->sn != nullptr) { + ok &= lh_ASN1_OBJECT_insert(global_added_by_short_name, &old_object, obj); + } + if (obj->ln != nullptr) { + ok &= lh_ASN1_OBJECT_insert(global_added_by_long_name, &old_object, obj); + } + return ok; +} + +int OBJ_create(const char *oid, const char *short_name, const char *long_name) { + ASN1_OBJECT *op = + create_object_with_text_oid(obj_next_nid, oid, short_name, long_name); + if (op == nullptr || !obj_add_object(op)) { + return NID_undef; + } + return op->nid; +} + +void OBJ_cleanup() {} diff --git a/third_party/boringssl/src/crypto/obj/obj_dat.h b/third_party/boringssl/src/crypto/obj/obj_dat.h index cc185f1b..feb8f2d1 100644 --- a/third_party/boringssl/src/crypto/obj/obj_dat.h +++ b/third_party/boringssl/src/crypto/obj/obj_dat.h @@ -1,63 +1,22 @@ -/* Copyright (C) 1995-1997 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. 
- * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. -/* This file is generated by crypto/obj/objects.go. */ +// This file is generated by crypto/obj/objects.go. -#define NUM_NID 964 +BSSL_NAMESPACE_BEGIN +#define NUM_NID 973 static const uint8_t kObjectData[] = { /* NID_rsadsi */ @@ -7137,10 +7096,59 @@ static const uint8_t kObjectData[] = { 0x04, 0x02, 0x06, + /* NID_ML_KEM_1024 */ + 0x60, + 0x86, + 0x48, + 0x01, + 0x65, + 0x03, + 0x04, + 0x04, + 0x03, + /* NID_ML_DSA_44 */ + 0x60, + 0x86, + 0x48, + 0x01, + 0x65, + 0x03, + 0x04, + 0x03, + 0x11, + /* NID_ML_DSA_65 */ + 0x60, + 0x86, + 0x48, + 0x01, + 0x65, + 0x03, + 0x04, + 0x03, + 0x12, + /* NID_ML_DSA_87 */ + 0x60, + 0x86, + 0x48, + 0x01, + 0x65, + 0x03, + 0x04, + 0x03, + 0x13, + /* NID_ML_KEM_768 */ + 0x60, + 0x86, + 0x48, + 0x01, + 0x65, + 0x03, + 0x04, + 0x04, + 0x02, }; static const ASN1_OBJECT kObjects[NUM_NID] = { - {"UNDEF", "undefined", NID_undef, 0, NULL, 0}, {"rsadsi", "RSA Data Security, Inc.", NID_rsadsi, 6, &kObjectData[0], 0}, {"pkcs", "RSA Data Security, Inc. 
PKCS", NID_pkcs, 7, &kObjectData[6], 0}, {"MD2", "md2", NID_md2, 8, &kObjectData[13], 0}, @@ -8777,11 +8785,23 @@ static const ASN1_OBJECT kObjects[NUM_NID] = { {"AuthPSK", "auth-psk", NID_auth_psk, 0, NULL, 0}, {"KxANY", "kx-any", NID_kx_any, 0, NULL, 0}, {"AuthANY", "auth-any", NID_auth_any, 0, NULL, 0}, - {"CECPQ2", "CECPQ2", NID_CECPQ2, 0, NULL, 0}, + {NULL, NULL, NID_undef, 0, NULL, 0}, {"ED448", "ED448", NID_ED448, 3, &kObjectData[6181], 0}, {"X448", "X448", NID_X448, 3, &kObjectData[6184], 0}, {"SHA512-256", "sha512-256", NID_sha512_256, 9, &kObjectData[6187], 0}, {"HKDF", "hkdf", NID_hkdf, 0, NULL, 0}, + {"X25519Kyber768Draft00", "X25519Kyber768Draft00", + NID_X25519Kyber768Draft00, 0, NULL, 0}, + {"X25519MLKEM768", "X25519MLKEM768", NID_X25519MLKEM768, 0, NULL, 0}, + {"id-alg-ml-kem-1024", "ML-KEM-1024", NID_ML_KEM_1024, 9, + &kObjectData[6196], 0}, + {"id-ml-dsa-44", "ML-DSA-44", NID_ML_DSA_44, 9, &kObjectData[6205], 0}, + {"id-ml-dsa-65", "ML-DSA-65", NID_ML_DSA_65, 9, &kObjectData[6214], 0}, + {"id-ml-dsa-87", "ML-DSA-87", NID_ML_DSA_87, 9, &kObjectData[6223], 0}, + {"id-alg-ml-kem-768", "ML-KEM-768", NID_ML_KEM_768, 9, &kObjectData[6232], + 0}, + {NULL, NULL, NID_undef, 0, NULL, 0}, + {"X-Wing", "X-Wing", NID_X_Wing, 0, NULL, 0}, }; static const uint16_t kNIDsInShortNameOrder[] = { @@ -8843,7 +8863,6 @@ static const uint16_t kNIDsInShortNameOrder[] = { 110 /* CAST5-CFB */, 109 /* CAST5-ECB */, 111 /* CAST5-OFB */, - 959 /* CECPQ2 */, 894 /* CMAC */, 13 /* CN */, 141 /* CRLReason */, @@ -8979,8 +8998,10 @@ static const uint16_t kNIDsInShortNameOrder[] = { 16 /* ST */, 143 /* SXNetID */, 458 /* UID */, - 0 /* UNDEF */, + 972 /* X-Wing */, 948 /* X25519 */, + 964 /* X25519Kyber768Draft00 */, + 965 /* X25519MLKEM768 */, 961 /* X448 */, 11 /* X500 */, 378 /* X500algorithms */, @@ -9219,6 +9240,8 @@ static const uint16_t kNIDsInShortNameOrder[] = { 323 /* id-alg-des40 */, 326 /* id-alg-dh-pop */, 325 /* id-alg-dh-sig-hmac-sha1 */, + 966 /* id-alg-ml-kem-1024 
*/, + 970 /* id-alg-ml-kem-768 */, 324 /* id-alg-noSignature */, 907 /* id-camellia128-wrap */, 908 /* id-camellia192-wrap */, @@ -9272,6 +9295,9 @@ static const uint16_t kNIDsInShortNameOrder[] = { 784 /* id-it-suppLangTags */, 304 /* id-it-unsupportedOIDs */, 128 /* id-kp */, + 967 /* id-ml-dsa-44 */, + 968 /* id-ml-dsa-65 */, + 969 /* id-ml-dsa-87 */, 280 /* id-mod-attribute-cert */, 274 /* id-mod-cmc */, 277 /* id-mod-cmp */, @@ -9752,7 +9778,6 @@ static const uint16_t kNIDsInLongNameOrder[] = { 285 /* Biometric Info */, 179 /* CA Issuers */, 785 /* CA Repository */, - 959 /* CECPQ2 */, 131 /* Code Signing */, 783 /* Diffie-Hellman based MAC */, 382 /* Directory */, @@ -9796,6 +9821,11 @@ static const uint16_t kNIDsInLongNameOrder[] = { 647 /* International Organizations */, 142 /* Invalidity Date */, 504 /* MIME MHS */, + 967 /* ML-DSA-44 */, + 968 /* ML-DSA-65 */, + 969 /* ML-DSA-87 */, + 966 /* ML-KEM-1024 */, + 970 /* ML-KEM-768 */, 388 /* Mail */, 383 /* Management */, 417 /* Microsoft CSP Name */, @@ -9851,7 +9881,10 @@ static const uint16_t kNIDsInLongNameOrder[] = { 129 /* TLS Web Server Authentication */, 133 /* Time Stamping */, 375 /* Trust Root */, + 972 /* X-Wing */, 948 /* X25519 */, + 964 /* X25519Kyber768Draft00 */, + 965 /* X25519MLKEM768 */, 961 /* X448 */, 12 /* X509 */, 402 /* X509v3 AC Targeting */, @@ -10668,7 +10701,6 @@ static const uint16_t kNIDsInLongNameOrder[] = { 106 /* title */, 682 /* tpBasis */, 436 /* ucl */, - 0 /* undefined */, 888 /* uniqueMember */, 55 /* unstructuredAddress */, 49 /* unstructuredName */, @@ -11410,6 +11442,11 @@ static const uint16_t kNIDsInOIDOrder[] = { 962 /* 2.16.840.1.101.3.4.2.6 (OBJ_sha512_256) */, 802 /* 2.16.840.1.101.3.4.3.1 (OBJ_dsa_with_SHA224) */, 803 /* 2.16.840.1.101.3.4.3.2 (OBJ_dsa_with_SHA256) */, + 967 /* 2.16.840.1.101.3.4.3.17 (OBJ_ML_DSA_44) */, + 968 /* 2.16.840.1.101.3.4.3.18 (OBJ_ML_DSA_65) */, + 969 /* 2.16.840.1.101.3.4.3.19 (OBJ_ML_DSA_87) */, + 970 /* 2.16.840.1.101.3.4.4.2 
(OBJ_ML_KEM_768) */, + 966 /* 2.16.840.1.101.3.4.4.3 (OBJ_ML_KEM_1024) */, 71 /* 2.16.840.1.113730.1.1 (OBJ_netscape_cert_type) */, 72 /* 2.16.840.1.113730.1.2 (OBJ_netscape_base_url) */, 73 /* 2.16.840.1.113730.1.3 (OBJ_netscape_revocation_url) */, @@ -11586,3 +11623,4 @@ static const uint16_t kNIDsInOIDOrder[] = { 155 /* 1.2.840.113549.1.12.10.1.6 (OBJ_safeContentsBag) */, 34 /* 1.3.6.1.4.1.188.7.1.1.2 (OBJ_idea_cbc) */, }; +BSSL_NAMESPACE_END diff --git a/third_party/boringssl/src/crypto/obj/obj_xref.c b/third_party/boringssl/src/crypto/obj/obj_xref.c deleted file mode 100644 index 21bde279..00000000 --- a/third_party/boringssl/src/crypto/obj/obj_xref.c +++ /dev/null @@ -1,122 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. 
Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
*/ - -#include - -#include "../internal.h" - - -typedef struct { - int sign_nid; - int digest_nid; - int pkey_nid; -} nid_triple; - -static const nid_triple kTriples[] = { - // RSA PKCS#1. - {NID_md4WithRSAEncryption, NID_md4, NID_rsaEncryption}, - {NID_md5WithRSAEncryption, NID_md5, NID_rsaEncryption}, - {NID_sha1WithRSAEncryption, NID_sha1, NID_rsaEncryption}, - {NID_sha224WithRSAEncryption, NID_sha224, NID_rsaEncryption}, - {NID_sha256WithRSAEncryption, NID_sha256, NID_rsaEncryption}, - {NID_sha384WithRSAEncryption, NID_sha384, NID_rsaEncryption}, - {NID_sha512WithRSAEncryption, NID_sha512, NID_rsaEncryption}, - // DSA. - {NID_dsaWithSHA1, NID_sha1, NID_dsa}, - {NID_dsaWithSHA1_2, NID_sha1, NID_dsa_2}, - {NID_dsa_with_SHA224, NID_sha224, NID_dsa}, - {NID_dsa_with_SHA256, NID_sha256, NID_dsa}, - // ECDSA. - {NID_ecdsa_with_SHA1, NID_sha1, NID_X9_62_id_ecPublicKey}, - {NID_ecdsa_with_SHA224, NID_sha224, NID_X9_62_id_ecPublicKey}, - {NID_ecdsa_with_SHA256, NID_sha256, NID_X9_62_id_ecPublicKey}, - {NID_ecdsa_with_SHA384, NID_sha384, NID_X9_62_id_ecPublicKey}, - {NID_ecdsa_with_SHA512, NID_sha512, NID_X9_62_id_ecPublicKey}, - // The following algorithms use more complex (or simpler) parameters. The - // digest "undef" indicates the caller should handle this explicitly. 
- {NID_rsassaPss, NID_undef, NID_rsaEncryption}, - {NID_ED25519, NID_undef, NID_ED25519}, -}; - -int OBJ_find_sigid_algs(int sign_nid, int *out_digest_nid, int *out_pkey_nid) { - for (size_t i = 0; i < OPENSSL_ARRAY_SIZE(kTriples); i++) { - if (kTriples[i].sign_nid == sign_nid) { - if (out_digest_nid != NULL) { - *out_digest_nid = kTriples[i].digest_nid; - } - if (out_pkey_nid != NULL) { - *out_pkey_nid = kTriples[i].pkey_nid; - } - return 1; - } - } - - return 0; -} - -int OBJ_find_sigid_by_algs(int *out_sign_nid, int digest_nid, int pkey_nid) { - for (size_t i = 0; i < OPENSSL_ARRAY_SIZE(kTriples); i++) { - if (kTriples[i].digest_nid == digest_nid && - kTriples[i].pkey_nid == pkey_nid) { - if (out_sign_nid != NULL) { - *out_sign_nid = kTriples[i].sign_nid; - } - return 1; - } - } - - return 0; -} diff --git a/third_party/boringssl/src/crypto/obj/obj_xref.cc b/third_party/boringssl/src/crypto/obj/obj_xref.cc new file mode 100644 index 00000000..52120f68 --- /dev/null +++ b/third_party/boringssl/src/crypto/obj/obj_xref.cc @@ -0,0 +1,83 @@ +// Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "../internal.h" + + +typedef struct { + int sign_nid; + int digest_nid; + int pkey_nid; +} nid_triple; + +static const nid_triple kTriples[] = { + // RSA PKCS#1. 
+ {NID_md4WithRSAEncryption, NID_md4, NID_rsaEncryption}, + {NID_md5WithRSAEncryption, NID_md5, NID_rsaEncryption}, + {NID_sha1WithRSAEncryption, NID_sha1, NID_rsaEncryption}, + {NID_sha224WithRSAEncryption, NID_sha224, NID_rsaEncryption}, + {NID_sha256WithRSAEncryption, NID_sha256, NID_rsaEncryption}, + {NID_sha384WithRSAEncryption, NID_sha384, NID_rsaEncryption}, + {NID_sha512WithRSAEncryption, NID_sha512, NID_rsaEncryption}, + // DSA. + {NID_dsaWithSHA1, NID_sha1, NID_dsa}, + {NID_dsaWithSHA1_2, NID_sha1, NID_dsa_2}, + {NID_dsa_with_SHA224, NID_sha224, NID_dsa}, + {NID_dsa_with_SHA256, NID_sha256, NID_dsa}, + // ECDSA. + {NID_ecdsa_with_SHA1, NID_sha1, NID_X9_62_id_ecPublicKey}, + {NID_ecdsa_with_SHA224, NID_sha224, NID_X9_62_id_ecPublicKey}, + {NID_ecdsa_with_SHA256, NID_sha256, NID_X9_62_id_ecPublicKey}, + {NID_ecdsa_with_SHA384, NID_sha384, NID_X9_62_id_ecPublicKey}, + {NID_ecdsa_with_SHA512, NID_sha512, NID_X9_62_id_ecPublicKey}, + // The following algorithms use more complex (or simpler) parameters. The + // digest "undef" indicates the caller should handle this explicitly. 
+ {NID_rsassaPss, NID_undef, NID_rsaEncryption}, + {NID_ED25519, NID_undef, NID_ED25519}, + {NID_ML_DSA_44, NID_undef, NID_ML_DSA_44}, + {NID_ML_DSA_65, NID_undef, NID_ML_DSA_65}, + {NID_ML_DSA_87, NID_undef, NID_ML_DSA_87}, +}; + +int OBJ_find_sigid_algs(int sign_nid, int *out_digest_nid, int *out_pkey_nid) { + for (const auto &triple : kTriples) { + if (triple.sign_nid == sign_nid) { + if (out_digest_nid != nullptr) { + *out_digest_nid = triple.digest_nid; + } + if (out_pkey_nid != nullptr) { + *out_pkey_nid = triple.pkey_nid; + } + return 1; + } + } + + return 0; +} + +int OBJ_find_sigid_by_algs(int *out_sign_nid, int digest_nid, int pkey_nid) { + for (const auto &triple : kTriples) { + if (triple.digest_nid == digest_nid && + triple.pkey_nid == pkey_nid) { + if (out_sign_nid != nullptr) { + *out_sign_nid = triple.sign_nid; + } + return 1; + } + } + + return 0; +} diff --git a/third_party/boringssl/src/crypto/params_internal.h b/third_party/boringssl/src/crypto/params_internal.h new file mode 100644 index 00000000..7baa2a89 --- /dev/null +++ b/third_party/boringssl/src/crypto/params_internal.h @@ -0,0 +1,30 @@ +// Copyright 2026 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_CRYPTO_PARAMS_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_PARAMS_INTERNAL_H + +#include + + +BSSL_NAMESPACE_BEGIN + +// IsEndParam returns whether `param` is a terminating element. 
+inline bool IsEndParam(const OSSL_PARAM ¶m) { + return param.key == nullptr; +} + +BSSL_NAMESPACE_END + +#endif // OPENSSL_HEADER_CRYPTO_PARAMS_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/pem/internal.h b/third_party/boringssl/src/crypto/pem/internal.h new file mode 100644 index 00000000..656dffba --- /dev/null +++ b/third_party/boringssl/src/crypto/pem/internal.h @@ -0,0 +1,49 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_CRYPTO_PEM_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_PEM_INTERNAL_H + +#include +#include + +#include "../mem_internal.h" + + +BSSL_NAMESPACE_BEGIN + +// PEM_get_EVP_CIPHER_INFO decodes |header| as a PEM header block and writes the +// specified cipher and IV to |cipher|. It returns one on success and zero on +// error. |header| must be a NUL-terminated string. If |header| does not +// specify encryption, this function will return success and set +// |cipher->cipher| to NULL. +int PEM_get_EVP_CIPHER_INFO(const char *header, EVP_CIPHER_INFO *cipher); + +// PEM_do_header decrypts |*len| bytes from |data| in-place according to the +// information in |cipher|. On success, it returns one and sets |*len| to the +// length of the plaintext. Otherwise, it returns zero. If |cipher| specifies +// encryption, the key is derived from a password returned from |callback|. 
+int PEM_do_header(const EVP_CIPHER_INFO *cipher, uint8_t *data, size_t *len, + pem_password_cb *callback, void *u); + +// PEM_read_bio_inner differs from |PEM_read_bio| on the out pointer |len| +// so that it guarantee non-negativeness on this output and it takes in +// owned types. +int PEM_read_bio_inner(BIO *bp, bssl::UniquePtr *name, + bssl::UniquePtr *header, + bssl::Array *data); + +BSSL_NAMESPACE_END + +#endif // OPENSSL_HEADER_CRYPTO_PEM_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/pem/pem_all.c b/third_party/boringssl/src/crypto/pem/pem_all.c deleted file mode 100644 index cade0a25..00000000 --- a/third_party/boringssl/src/crypto/pem/pem_all.c +++ /dev/null @@ -1,243 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. 
Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
- */ -/* ==================================================================== - * Copyright (c) 1998-2002 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.openssl.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * openssl-core@openssl.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.openssl.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -static RSA *pkey_get_rsa(EVP_PKEY *key, RSA **rsa); -static DSA *pkey_get_dsa(EVP_PKEY *key, DSA **dsa); -static EC_KEY *pkey_get_eckey(EVP_PKEY *key, EC_KEY **eckey); - -IMPLEMENT_PEM_rw(X509_REQ, X509_REQ, PEM_STRING_X509_REQ, X509_REQ) - -IMPLEMENT_PEM_write(X509_REQ_NEW, X509_REQ, PEM_STRING_X509_REQ_OLD, X509_REQ) -IMPLEMENT_PEM_rw(X509_CRL, X509_CRL, PEM_STRING_X509_CRL, X509_CRL) -IMPLEMENT_PEM_rw(PKCS7, PKCS7, PEM_STRING_PKCS7, PKCS7) - -// We treat RSA or DSA private keys as a special case. For private keys we -// read in an EVP_PKEY structure with PEM_read_bio_PrivateKey() and extract -// the relevant private key: this means can handle "traditional" and PKCS#8 -// formats transparently. 
-static RSA *pkey_get_rsa(EVP_PKEY *key, RSA **rsa) { - RSA *rtmp; - if (!key) { - return NULL; - } - rtmp = EVP_PKEY_get1_RSA(key); - EVP_PKEY_free(key); - if (!rtmp) { - return NULL; - } - if (rsa) { - RSA_free(*rsa); - *rsa = rtmp; - } - return rtmp; -} - -RSA *PEM_read_bio_RSAPrivateKey(BIO *bp, RSA **rsa, pem_password_cb *cb, - void *u) { - EVP_PKEY *pktmp; - pktmp = PEM_read_bio_PrivateKey(bp, NULL, cb, u); - return pkey_get_rsa(pktmp, rsa); -} - -RSA *PEM_read_RSAPrivateKey(FILE *fp, RSA **rsa, pem_password_cb *cb, void *u) { - EVP_PKEY *pktmp; - pktmp = PEM_read_PrivateKey(fp, NULL, cb, u); - return pkey_get_rsa(pktmp, rsa); -} - -IMPLEMENT_PEM_write_cb_const(RSAPrivateKey, RSA, PEM_STRING_RSA, RSAPrivateKey) - - -IMPLEMENT_PEM_rw_const(RSAPublicKey, RSA, PEM_STRING_RSA_PUBLIC, RSAPublicKey) -IMPLEMENT_PEM_rw(RSA_PUBKEY, RSA, PEM_STRING_PUBLIC, RSA_PUBKEY) -#ifndef OPENSSL_NO_DSA -static DSA *pkey_get_dsa(EVP_PKEY *key, DSA **dsa) { - DSA *dtmp; - if (!key) { - return NULL; - } - dtmp = EVP_PKEY_get1_DSA(key); - EVP_PKEY_free(key); - if (!dtmp) { - return NULL; - } - if (dsa) { - DSA_free(*dsa); - *dsa = dtmp; - } - return dtmp; -} - -DSA *PEM_read_bio_DSAPrivateKey(BIO *bp, DSA **dsa, pem_password_cb *cb, - void *u) { - EVP_PKEY *pktmp; - pktmp = PEM_read_bio_PrivateKey(bp, NULL, cb, u); - return pkey_get_dsa(pktmp, dsa); // will free pktmp -} - -IMPLEMENT_PEM_write_cb_const(DSAPrivateKey, DSA, PEM_STRING_DSA, DSAPrivateKey) - -IMPLEMENT_PEM_rw(DSA_PUBKEY, DSA, PEM_STRING_PUBLIC, DSA_PUBKEY) -DSA *PEM_read_DSAPrivateKey(FILE *fp, DSA **dsa, pem_password_cb *cb, void *u) { - EVP_PKEY *pktmp; - pktmp = PEM_read_PrivateKey(fp, NULL, cb, u); - return pkey_get_dsa(pktmp, dsa); // will free pktmp -} - -IMPLEMENT_PEM_rw_const(DSAparams, DSA, PEM_STRING_DSAPARAMS, DSAparams) -#endif -static EC_KEY *pkey_get_eckey(EVP_PKEY *key, EC_KEY **eckey) { - EC_KEY *dtmp; - if (!key) { - return NULL; - } - dtmp = EVP_PKEY_get1_EC_KEY(key); - EVP_PKEY_free(key); - if (!dtmp) 
{ - return NULL; - } - if (eckey) { - EC_KEY_free(*eckey); - *eckey = dtmp; - } - return dtmp; -} - -EC_KEY *PEM_read_bio_ECPrivateKey(BIO *bp, EC_KEY **key, pem_password_cb *cb, - void *u) { - EVP_PKEY *pktmp; - pktmp = PEM_read_bio_PrivateKey(bp, NULL, cb, u); - return pkey_get_eckey(pktmp, key); // will free pktmp -} - -IMPLEMENT_PEM_write_cb(ECPrivateKey, EC_KEY, PEM_STRING_ECPRIVATEKEY, - ECPrivateKey) - -IMPLEMENT_PEM_rw(EC_PUBKEY, EC_KEY, PEM_STRING_PUBLIC, EC_PUBKEY) -EC_KEY *PEM_read_ECPrivateKey(FILE *fp, EC_KEY **eckey, pem_password_cb *cb, - void *u) { - EVP_PKEY *pktmp; - pktmp = PEM_read_PrivateKey(fp, NULL, cb, u); - return pkey_get_eckey(pktmp, eckey); // will free pktmp -} - - -IMPLEMENT_PEM_rw_const(DHparams, DH, PEM_STRING_DHPARAMS, DHparams) - -IMPLEMENT_PEM_rw(PUBKEY, EVP_PKEY, PEM_STRING_PUBLIC, PUBKEY) diff --git a/third_party/boringssl/src/crypto/pem/pem_all.cc b/third_party/boringssl/src/crypto/pem/pem_all.cc new file mode 100644 index 00000000..f03b6080 --- /dev/null +++ b/third_party/boringssl/src/crypto/pem/pem_all.cc @@ -0,0 +1,150 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +static RSA *pkey_get_rsa(EVP_PKEY *key, RSA **rsa); +static DSA *pkey_get_dsa(EVP_PKEY *key, DSA **dsa); +static EC_KEY *pkey_get_eckey(EVP_PKEY *key, EC_KEY **eckey); + +IMPLEMENT_PEM_rw(X509_REQ, X509_REQ, PEM_STRING_X509_REQ, X509_REQ) + +IMPLEMENT_PEM_write(X509_REQ_NEW, X509_REQ, PEM_STRING_X509_REQ_OLD, X509_REQ) +IMPLEMENT_PEM_rw(X509_CRL, X509_CRL, PEM_STRING_X509_CRL, X509_CRL) +IMPLEMENT_PEM_rw(PKCS7, PKCS7, PEM_STRING_PKCS7, PKCS7) + +// We treat RSA or DSA private keys as a special case. For private keys we +// read in an EVP_PKEY structure with PEM_read_bio_PrivateKey() and extract +// the relevant private key: this means can handle "traditional" and PKCS#8 +// formats transparently. +static RSA *pkey_get_rsa(EVP_PKEY *key, RSA **rsa) { + if (!key) { + return nullptr; + } + if (EVP_PKEY_id(key) != EVP_PKEY_RSA) { + // Don't accept RSA-PSS keys in this function. + OPENSSL_PUT_ERROR(EVP, EVP_R_EXPECTING_AN_RSA_KEY); + return nullptr; + } + RSA *rtmp = EVP_PKEY_get1_RSA(key); + if (!rtmp) { + return nullptr; + } + if (rsa) { + RSA_free(*rsa); + *rsa = rtmp; + } + return rtmp; +} + +RSA *PEM_read_bio_RSAPrivateKey(BIO *bp, RSA **rsa, pem_password_cb *cb, + void *u) { + bssl::UniquePtr pkey(PEM_read_bio_PrivateKey(bp, nullptr, cb, u)); + return pkey_get_rsa(pkey.get(), rsa); +} + +RSA *PEM_read_RSAPrivateKey(FILE *fp, RSA **rsa, pem_password_cb *cb, void *u) { + bssl::UniquePtr pkey(PEM_read_PrivateKey(fp, nullptr, cb, u)); + return pkey_get_rsa(pkey.get(), rsa); +} + +IMPLEMENT_PEM_write_cb_const(RSAPrivateKey, RSA, PEM_STRING_RSA, RSAPrivateKey) + + +IMPLEMENT_PEM_rw_const(RSAPublicKey, RSA, PEM_STRING_RSA_PUBLIC, RSAPublicKey) +IMPLEMENT_PEM_rw(RSA_PUBKEY, RSA, PEM_STRING_PUBLIC, RSA_PUBKEY) +#ifndef OPENSSL_NO_DSA +static DSA *pkey_get_dsa(EVP_PKEY *key, DSA **dsa) { + DSA *dtmp; + if (!key) { + return nullptr; + } + dtmp = EVP_PKEY_get1_DSA(key); + 
EVP_PKEY_free(key); + if (!dtmp) { + return nullptr; + } + if (dsa) { + DSA_free(*dsa); + *dsa = dtmp; + } + return dtmp; +} + +DSA *PEM_read_bio_DSAPrivateKey(BIO *bp, DSA **dsa, pem_password_cb *cb, + void *u) { + EVP_PKEY *pktmp; + pktmp = PEM_read_bio_PrivateKey(bp, nullptr, cb, u); + return pkey_get_dsa(pktmp, dsa); // will free pktmp +} + +IMPLEMENT_PEM_write_cb_const(DSAPrivateKey, DSA, PEM_STRING_DSA, DSAPrivateKey) + +IMPLEMENT_PEM_rw(DSA_PUBKEY, DSA, PEM_STRING_PUBLIC, DSA_PUBKEY) +DSA *PEM_read_DSAPrivateKey(FILE *fp, DSA **dsa, pem_password_cb *cb, void *u) { + EVP_PKEY *pktmp; + pktmp = PEM_read_PrivateKey(fp, nullptr, cb, u); + return pkey_get_dsa(pktmp, dsa); // will free pktmp +} + +IMPLEMENT_PEM_rw_const(DSAparams, DSA, PEM_STRING_DSAPARAMS, DSAparams) +#endif +static EC_KEY *pkey_get_eckey(EVP_PKEY *key, EC_KEY **eckey) { + EC_KEY *dtmp; + if (!key) { + return nullptr; + } + dtmp = EVP_PKEY_get1_EC_KEY(key); + EVP_PKEY_free(key); + if (!dtmp) { + return nullptr; + } + if (eckey) { + EC_KEY_free(*eckey); + *eckey = dtmp; + } + return dtmp; +} + +EC_KEY *PEM_read_bio_ECPrivateKey(BIO *bp, EC_KEY **key, pem_password_cb *cb, + void *u) { + EVP_PKEY *pktmp; + pktmp = PEM_read_bio_PrivateKey(bp, nullptr, cb, u); + return pkey_get_eckey(pktmp, key); // will free pktmp +} + +IMPLEMENT_PEM_write_cb(ECPrivateKey, EC_KEY, PEM_STRING_ECPRIVATEKEY, + ECPrivateKey) + +IMPLEMENT_PEM_rw(EC_PUBKEY, EC_KEY, PEM_STRING_PUBLIC, EC_PUBKEY) +EC_KEY *PEM_read_ECPrivateKey(FILE *fp, EC_KEY **eckey, pem_password_cb *cb, + void *u) { + EVP_PKEY *pktmp; + pktmp = PEM_read_PrivateKey(fp, nullptr, cb, u); + return pkey_get_eckey(pktmp, eckey); // will free pktmp +} + + +IMPLEMENT_PEM_rw_const(DHparams, DH, PEM_STRING_DHPARAMS, DHparams) + +IMPLEMENT_PEM_rw(PUBKEY, EVP_PKEY, PEM_STRING_PUBLIC, PUBKEY) diff --git a/third_party/boringssl/src/crypto/pem/pem_info.c b/third_party/boringssl/src/crypto/pem/pem_info.c deleted file mode 100644 index 04b47437..00000000 --- 
a/third_party/boringssl/src/crypto/pem/pem_info.c +++ /dev/null @@ -1,265 +0,0 @@ -/* crypto/pem/pem_info.c */ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. 
All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
- */ - -#include - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -STACK_OF(X509_INFO) *PEM_X509_INFO_read(FILE *fp, STACK_OF(X509_INFO) *sk, - pem_password_cb *cb, void *u) { - BIO *b = BIO_new_fp(fp, BIO_NOCLOSE); - if (b == NULL) { - OPENSSL_PUT_ERROR(PEM, ERR_R_BUF_LIB); - return 0; - } - STACK_OF(X509_INFO) *ret = PEM_X509_INFO_read_bio(b, sk, cb, u); - BIO_free(b); - return ret; -} - -enum parse_result_t { - parse_ok, - parse_error, - parse_new_entry, -}; - -static enum parse_result_t parse_x509(X509_INFO *info, const uint8_t *data, - size_t len, int key_type) { - if (info->x509 != NULL) { - return parse_new_entry; - } - info->x509 = d2i_X509(NULL, &data, len); - return info->x509 != NULL ? parse_ok : parse_error; -} - -static enum parse_result_t parse_x509_aux(X509_INFO *info, const uint8_t *data, - size_t len, int key_type) { - if (info->x509 != NULL) { - return parse_new_entry; - } - info->x509 = d2i_X509_AUX(NULL, &data, len); - return info->x509 != NULL ? parse_ok : parse_error; -} - -static enum parse_result_t parse_crl(X509_INFO *info, const uint8_t *data, - size_t len, int key_type) { - if (info->crl != NULL) { - return parse_new_entry; - } - info->crl = d2i_X509_CRL(NULL, &data, len); - return info->crl != NULL ? parse_ok : parse_error; -} - -static enum parse_result_t parse_key(X509_INFO *info, const uint8_t *data, - size_t len, int key_type) { - if (info->x_pkey != NULL) { - return parse_new_entry; - } - info->x_pkey = X509_PKEY_new(); - if (info->x_pkey == NULL) { - return parse_error; - } - info->x_pkey->dec_pkey = d2i_PrivateKey(key_type, NULL, &data, len); - return info->x_pkey->dec_pkey != NULL ? 
parse_ok : parse_error; -} - -STACK_OF(X509_INFO) *PEM_X509_INFO_read_bio(BIO *bp, STACK_OF(X509_INFO) *sk, - pem_password_cb *cb, void *u) { - X509_INFO *info = NULL; - char *name = NULL, *header = NULL; - unsigned char *data = NULL; - long len; - int ok = 0; - STACK_OF(X509_INFO) *ret = NULL; - - if (sk == NULL) { - ret = sk_X509_INFO_new_null(); - if (ret == NULL) { - OPENSSL_PUT_ERROR(PEM, ERR_R_MALLOC_FAILURE); - return NULL; - } - } else { - ret = sk; - } - size_t orig_num = sk_X509_INFO_num(ret); - - info = X509_INFO_new(); - if (info == NULL) { - goto err; - } - - for (;;) { - if (!PEM_read_bio(bp, &name, &header, &data, &len)) { - uint32_t error = ERR_peek_last_error(); - if (ERR_GET_LIB(error) == ERR_LIB_PEM && - ERR_GET_REASON(error) == PEM_R_NO_START_LINE) { - ERR_clear_error(); - break; - } - goto err; - } - - enum parse_result_t (*parse_function)(X509_INFO *, const uint8_t *, size_t, - int) = NULL; - int key_type = EVP_PKEY_NONE; - if (strcmp(name, PEM_STRING_X509) == 0 || - strcmp(name, PEM_STRING_X509_OLD) == 0) { - parse_function = parse_x509; - } else if (strcmp(name, PEM_STRING_X509_TRUSTED) == 0) { - parse_function = parse_x509_aux; - } else if (strcmp(name, PEM_STRING_X509_CRL) == 0) { - parse_function = parse_crl; - } else if (strcmp(name, PEM_STRING_RSA) == 0) { - parse_function = parse_key; - key_type = EVP_PKEY_RSA; - } else if (strcmp(name, PEM_STRING_DSA) == 0) { - parse_function = parse_key; - key_type = EVP_PKEY_DSA; - } else if (strcmp(name, PEM_STRING_ECPRIVATEKEY) == 0) { - parse_function = parse_key; - key_type = EVP_PKEY_EC; - } - - // If a private key has a header, assume it is encrypted. - if (key_type != EVP_PKEY_NONE && strlen(header) > 10) { - if (info->x_pkey != NULL) { - if (!sk_X509_INFO_push(ret, info)) { - goto err; - } - info = X509_INFO_new(); - if (info == NULL) { - goto err; - } - } - // Historically, raw entries pushed an empty key. 
- info->x_pkey = X509_PKEY_new(); - if (info->x_pkey == NULL || - !PEM_get_EVP_CIPHER_INFO(header, &info->enc_cipher)) { - goto err; - } - info->enc_data = (char *)data; - info->enc_len = (int)len; - data = NULL; - } else if (parse_function != NULL) { - EVP_CIPHER_INFO cipher; - if (!PEM_get_EVP_CIPHER_INFO(header, &cipher) || - !PEM_do_header(&cipher, data, &len, cb, u)) { - goto err; - } - enum parse_result_t result = parse_function(info, data, len, key_type); - if (result == parse_new_entry) { - if (!sk_X509_INFO_push(ret, info)) { - goto err; - } - info = X509_INFO_new(); - if (info == NULL) { - goto err; - } - result = parse_function(info, data, len, key_type); - } - if (result != parse_ok) { - OPENSSL_PUT_ERROR(PEM, ERR_R_ASN1_LIB); - goto err; - } - } - OPENSSL_free(name); - OPENSSL_free(header); - OPENSSL_free(data); - name = NULL; - header = NULL; - data = NULL; - } - - // Push the last entry on the stack if not empty. - if (info->x509 != NULL || info->crl != NULL || info->x_pkey != NULL || - info->enc_data != NULL) { - if (!sk_X509_INFO_push(ret, info)) { - goto err; - } - info = NULL; - } - - ok = 1; - -err: - X509_INFO_free(info); - if (!ok) { - while (sk_X509_INFO_num(ret) > orig_num) { - X509_INFO_free(sk_X509_INFO_pop(ret)); - } - if (ret != sk) { - sk_X509_INFO_free(ret); - } - ret = NULL; - } - - OPENSSL_free(name); - OPENSSL_free(header); - OPENSSL_free(data); - return ret; -} diff --git a/third_party/boringssl/src/crypto/pem/pem_info.cc b/third_party/boringssl/src/crypto/pem/pem_info.cc new file mode 100644 index 00000000..a75e7e41 --- /dev/null +++ b/third_party/boringssl/src/crypto/pem/pem_info.cc @@ -0,0 +1,250 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "../mem_internal.h" +#include "internal.h" + + +using namespace bssl; + +static X509_PKEY *X509_PKEY_new() { return New(); } + +static void X509_PKEY_free(X509_PKEY *x) { + if (x == nullptr) { + return; + } + + EVP_PKEY_free(x->dec_pkey); + Delete(x); +} + +static X509_INFO *X509_INFO_new() { return New(); } + +void X509_INFO_free(X509_INFO *x) { + if (x == nullptr) { + return; + } + + X509_free(x->x509); + X509_CRL_free(x->crl); + X509_PKEY_free(x->x_pkey); + OPENSSL_free(x->enc_data); + Delete(x); +} + + +STACK_OF(X509_INFO) *PEM_X509_INFO_read(FILE *fp, STACK_OF(X509_INFO) *sk, + pem_password_cb *cb, void *u) { + BIO *b = BIO_new_fp(fp, BIO_NOCLOSE); + if (b == nullptr) { + OPENSSL_PUT_ERROR(PEM, ERR_R_BUF_LIB); + return nullptr; + } + STACK_OF(X509_INFO) *ret = PEM_X509_INFO_read_bio(b, sk, cb, u); + BIO_free(b); + return ret; +} + +enum parse_result_t { + parse_ok, + parse_error, + parse_new_entry, +}; + +static enum parse_result_t parse_x509(X509_INFO *info, const uint8_t *data, + size_t len, int key_type) { + if (info->x509 != nullptr) { + return parse_new_entry; + } + info->x509 = d2i_X509(nullptr, &data, len); + return info->x509 != nullptr ? 
parse_ok : parse_error; +} + +static enum parse_result_t parse_x509_aux(X509_INFO *info, const uint8_t *data, + size_t len, int key_type) { + if (info->x509 != nullptr) { + return parse_new_entry; + } + info->x509 = d2i_X509_AUX(nullptr, &data, len); + return info->x509 != nullptr ? parse_ok : parse_error; +} + +static enum parse_result_t parse_crl(X509_INFO *info, const uint8_t *data, + size_t len, int key_type) { + if (info->crl != nullptr) { + return parse_new_entry; + } + info->crl = d2i_X509_CRL(nullptr, &data, len); + return info->crl != nullptr ? parse_ok : parse_error; +} + +static enum parse_result_t parse_key(X509_INFO *info, const uint8_t *data, + size_t len, int key_type) { + if (info->x_pkey != nullptr) { + return parse_new_entry; + } + info->x_pkey = X509_PKEY_new(); + if (info->x_pkey == nullptr) { + return parse_error; + } + info->x_pkey->dec_pkey = d2i_PrivateKey(key_type, nullptr, &data, len); + return info->x_pkey->dec_pkey != nullptr ? parse_ok : parse_error; +} + +STACK_OF(X509_INFO) *PEM_X509_INFO_read_bio(BIO *bp, STACK_OF(X509_INFO) *sk, + pem_password_cb *cb, void *u) { + X509_INFO *info = nullptr; + UniquePtr name; + UniquePtr header; + Array data; + int ok = 0; + STACK_OF(X509_INFO) *ret = nullptr; + + if (sk == nullptr) { + ret = sk_X509_INFO_new_null(); + if (ret == nullptr) { + return nullptr; + } + } else { + ret = sk; + } + size_t orig_num = sk_X509_INFO_num(ret); + + info = X509_INFO_new(); + if (info == nullptr) { + goto err; + } + + for (;;) { + if (!PEM_read_bio_inner(bp, &name, &header, &data)) { + if (ERR_equals(ERR_peek_last_error(), ERR_LIB_PEM, PEM_R_NO_START_LINE)) { + ERR_clear_error(); + break; + } + goto err; + } + + enum parse_result_t (*parse_function)(X509_INFO *, const uint8_t *, size_t, + int) = nullptr; + int key_type = EVP_PKEY_NONE; + std::string_view name_view = name.get(); + if (name_view == PEM_STRING_X509 || name_view == PEM_STRING_X509_OLD) { + parse_function = parse_x509; + } else if (name_view == 
PEM_STRING_X509_TRUSTED) { + parse_function = parse_x509_aux; + } else if (name_view == PEM_STRING_X509_CRL) { + parse_function = parse_crl; + } else if (name_view == PEM_STRING_RSA) { + parse_function = parse_key; + key_type = EVP_PKEY_RSA; + } else if (name_view == PEM_STRING_DSA) { + parse_function = parse_key; + key_type = EVP_PKEY_DSA; + } else if (name_view == PEM_STRING_ECPRIVATEKEY) { + parse_function = parse_key; + key_type = EVP_PKEY_EC; + } + + // If a private key has a header, assume it is encrypted. This function does + // not decrypt private keys. + if (key_type != EVP_PKEY_NONE && strlen(header.get()) > 10) { + if (data.size() > INT_MAX) { + // We need the data to fit in |info| which forces the size to + // fit in one int type. + goto err; + } + if (info->x_pkey != nullptr) { + if (!sk_X509_INFO_push(ret, info)) { + goto err; + } + info = X509_INFO_new(); + if (info == nullptr) { + goto err; + } + } + // Use an empty key as a placeholder. + info->x_pkey = X509_PKEY_new(); + if (info->x_pkey == nullptr || + !PEM_get_EVP_CIPHER_INFO(header.get(), &info->enc_cipher)) { + goto err; + } + size_t size; + data.Release(reinterpret_cast(&info->enc_data), &size); + // Safety: we checked that |size| <= |INT_MAX|. + info->enc_len = static_cast(size); + } else if (parse_function != nullptr) { + EVP_CIPHER_INFO cipher; + size_t len = data.size(); + if (!PEM_get_EVP_CIPHER_INFO(header.get(), &cipher) || + !PEM_do_header(&cipher, data.data(), &len, cb, u)) { + goto err; + } + enum parse_result_t result = + parse_function(info, data.data(), len, key_type); + if (result == parse_new_entry) { + if (!sk_X509_INFO_push(ret, info)) { + goto err; + } + info = X509_INFO_new(); + if (info == nullptr) { + goto err; + } + result = parse_function(info, data.data(), len, key_type); + } + if (result != parse_ok) { + OPENSSL_PUT_ERROR(PEM, ERR_R_ASN1_LIB); + goto err; + } + } + } + + // Push the last entry on the stack if not empty. 
+ if (info->x509 != nullptr || info->crl != nullptr || + info->x_pkey != nullptr || info->enc_data != nullptr) { + if (!sk_X509_INFO_push(ret, info)) { + goto err; + } + info = nullptr; + } + + ok = 1; + +err: + X509_INFO_free(info); + if (!ok) { + while (sk_X509_INFO_num(ret) > orig_num) { + X509_INFO_free(sk_X509_INFO_pop(ret)); + } + if (ret != sk) { + sk_X509_INFO_free(ret); + } + ret = nullptr; + } + return ret; +} diff --git a/third_party/boringssl/src/crypto/pem/pem_lib.c b/third_party/boringssl/src/crypto/pem/pem_lib.c deleted file mode 100644 index 76622abd..00000000 --- a/third_party/boringssl/src/crypto/pem/pem_lib.c +++ /dev/null @@ -1,800 +0,0 @@ -/* crypto/pem/pem_lib.c */ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. 
Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
*/ - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../internal.h" - - -#define MIN_LENGTH 4 - -static int load_iv(char **fromp, unsigned char *to, int num); -static int check_pem(const char *nm, const char *name); - -void PEM_proc_type(char *buf, int type) { - const char *str; - - if (type == PEM_TYPE_ENCRYPTED) { - str = "ENCRYPTED"; - } else if (type == PEM_TYPE_MIC_CLEAR) { - str = "MIC-CLEAR"; - } else if (type == PEM_TYPE_MIC_ONLY) { - str = "MIC-ONLY"; - } else { - str = "BAD-TYPE"; - } - - OPENSSL_strlcat(buf, "Proc-Type: 4,", PEM_BUFSIZE); - OPENSSL_strlcat(buf, str, PEM_BUFSIZE); - OPENSSL_strlcat(buf, "\n", PEM_BUFSIZE); -} - -void PEM_dek_info(char *buf, const char *type, int len, char *str) { - static const unsigned char map[17] = "0123456789ABCDEF"; - long i; - int j; - - OPENSSL_strlcat(buf, "DEK-Info: ", PEM_BUFSIZE); - OPENSSL_strlcat(buf, type, PEM_BUFSIZE); - OPENSSL_strlcat(buf, ",", PEM_BUFSIZE); - j = strlen(buf); - if (j + (len * 2) + 1 > PEM_BUFSIZE) { - return; - } - for (i = 0; i < len; i++) { - buf[j + i * 2] = map[(str[i] >> 4) & 0x0f]; - buf[j + i * 2 + 1] = map[(str[i]) & 0x0f]; - } - buf[j + i * 2] = '\n'; - buf[j + i * 2 + 1] = '\0'; -} - -void *PEM_ASN1_read(d2i_of_void *d2i, const char *name, FILE *fp, void **x, - pem_password_cb *cb, void *u) { - BIO *b = BIO_new_fp(fp, BIO_NOCLOSE); - if (b == NULL) { - OPENSSL_PUT_ERROR(PEM, ERR_R_BUF_LIB); - return NULL; - } - void *ret = PEM_ASN1_read_bio(d2i, name, b, x, cb, u); - BIO_free(b); - return ret; -} - -static int check_pem(const char *nm, const char *name) { - // Normal matching nm and name - if (!strcmp(nm, name)) { - return 1; - } - - // Make PEM_STRING_EVP_PKEY match any private key - - if (!strcmp(name, PEM_STRING_EVP_PKEY)) { - return !strcmp(nm, PEM_STRING_PKCS8) || !strcmp(nm, PEM_STRING_PKCS8INF) || - !strcmp(nm, PEM_STRING_RSA) || !strcmp(nm, PEM_STRING_EC) || - 
!strcmp(nm, PEM_STRING_DSA); - } - - // Permit older strings - - if (!strcmp(nm, PEM_STRING_X509_OLD) && !strcmp(name, PEM_STRING_X509)) { - return 1; - } - - if (!strcmp(nm, PEM_STRING_X509_REQ_OLD) && - !strcmp(name, PEM_STRING_X509_REQ)) { - return 1; - } - - // Allow normal certs to be read as trusted certs - if (!strcmp(nm, PEM_STRING_X509) && !strcmp(name, PEM_STRING_X509_TRUSTED)) { - return 1; - } - - if (!strcmp(nm, PEM_STRING_X509_OLD) && - !strcmp(name, PEM_STRING_X509_TRUSTED)) { - return 1; - } - - // Some CAs use PKCS#7 with CERTIFICATE headers - if (!strcmp(nm, PEM_STRING_X509) && !strcmp(name, PEM_STRING_PKCS7)) { - return 1; - } - - if (!strcmp(nm, PEM_STRING_PKCS7_SIGNED) && !strcmp(name, PEM_STRING_PKCS7)) { - return 1; - } - -#ifndef OPENSSL_NO_CMS - if (!strcmp(nm, PEM_STRING_X509) && !strcmp(name, PEM_STRING_CMS)) { - return 1; - } - // Allow CMS to be read from PKCS#7 headers - if (!strcmp(nm, PEM_STRING_PKCS7) && !strcmp(name, PEM_STRING_CMS)) { - return 1; - } -#endif - - return 0; -} - -static const EVP_CIPHER *cipher_by_name(const char *name) { - // This is similar to the (deprecated) function |EVP_get_cipherbyname|. Note - // the PEM code assumes that ciphers have at least 8 bytes of IV, at most 20 - // bytes of overhead and generally behave like CBC mode. 
- if (0 == strcmp(name, SN_des_cbc)) { - return EVP_des_cbc(); - } else if (0 == strcmp(name, SN_des_ede3_cbc)) { - return EVP_des_ede3_cbc(); - } else if (0 == strcmp(name, SN_aes_128_cbc)) { - return EVP_aes_128_cbc(); - } else if (0 == strcmp(name, SN_aes_192_cbc)) { - return EVP_aes_192_cbc(); - } else if (0 == strcmp(name, SN_aes_256_cbc)) { - return EVP_aes_256_cbc(); - } else { - return NULL; - } -} - -int PEM_bytes_read_bio(unsigned char **pdata, long *plen, char **pnm, - const char *name, BIO *bp, pem_password_cb *cb, - void *u) { - EVP_CIPHER_INFO cipher; - char *nm = NULL, *header = NULL; - unsigned char *data = NULL; - long len; - int ret = 0; - - for (;;) { - if (!PEM_read_bio(bp, &nm, &header, &data, &len)) { - uint32_t error = ERR_peek_error(); - if (ERR_GET_LIB(error) == ERR_LIB_PEM && - ERR_GET_REASON(error) == PEM_R_NO_START_LINE) { - ERR_add_error_data(2, "Expecting: ", name); - } - return 0; - } - if (check_pem(nm, name)) { - break; - } - OPENSSL_free(nm); - OPENSSL_free(header); - OPENSSL_free(data); - } - if (!PEM_get_EVP_CIPHER_INFO(header, &cipher)) { - goto err; - } - if (!PEM_do_header(&cipher, data, &len, cb, u)) { - goto err; - } - - *pdata = data; - *plen = len; - - if (pnm) { - *pnm = nm; - } - - ret = 1; - -err: - if (!ret || !pnm) { - OPENSSL_free(nm); - } - OPENSSL_free(header); - if (!ret) { - OPENSSL_free(data); - } - return ret; -} - -int PEM_ASN1_write(i2d_of_void *i2d, const char *name, FILE *fp, void *x, - const EVP_CIPHER *enc, unsigned char *kstr, int klen, - pem_password_cb *callback, void *u) { - BIO *b = BIO_new_fp(fp, BIO_NOCLOSE); - if (b == NULL) { - OPENSSL_PUT_ERROR(PEM, ERR_R_BUF_LIB); - return 0; - } - int ret = PEM_ASN1_write_bio(i2d, name, b, x, enc, kstr, klen, callback, u); - BIO_free(b); - return ret; -} - -int PEM_ASN1_write_bio(i2d_of_void *i2d, const char *name, BIO *bp, void *x, - const EVP_CIPHER *enc, unsigned char *kstr, int klen, - pem_password_cb *callback, void *u) { - EVP_CIPHER_CTX ctx; - int dsize 
= 0, i, j, ret = 0; - unsigned char *p, *data = NULL; - const char *objstr = NULL; - char buf[PEM_BUFSIZE]; - unsigned char key[EVP_MAX_KEY_LENGTH]; - unsigned char iv[EVP_MAX_IV_LENGTH]; - - if (enc != NULL) { - objstr = OBJ_nid2sn(EVP_CIPHER_nid(enc)); - if (objstr == NULL || cipher_by_name(objstr) == NULL || - EVP_CIPHER_iv_length(enc) < 8) { - OPENSSL_PUT_ERROR(PEM, PEM_R_UNSUPPORTED_CIPHER); - goto err; - } - } - - if ((dsize = i2d(x, NULL)) < 0) { - OPENSSL_PUT_ERROR(PEM, ERR_R_ASN1_LIB); - dsize = 0; - goto err; - } - // dzise + 8 bytes are needed - // actually it needs the cipher block size extra... - data = (unsigned char *)OPENSSL_malloc((unsigned int)dsize + 20); - if (data == NULL) { - OPENSSL_PUT_ERROR(PEM, ERR_R_MALLOC_FAILURE); - goto err; - } - p = data; - i = i2d(x, &p); - - if (enc != NULL) { - const unsigned iv_len = EVP_CIPHER_iv_length(enc); - - if (kstr == NULL) { - klen = 0; - if (!callback) { - callback = PEM_def_callback; - } - klen = (*callback)(buf, PEM_BUFSIZE, 1, u); - if (klen <= 0) { - OPENSSL_PUT_ERROR(PEM, PEM_R_READ_KEY); - goto err; - } - kstr = (unsigned char *)buf; - } - assert(iv_len <= (int)sizeof(iv)); - if (!RAND_bytes(iv, iv_len)) { // Generate a salt - goto err; - } - // The 'iv' is used as the iv and as a salt. 
It is NOT taken from - // the BytesToKey function - if (!EVP_BytesToKey(enc, EVP_md5(), iv, kstr, klen, 1, key, NULL)) { - goto err; - } - - if (kstr == (unsigned char *)buf) { - OPENSSL_cleanse(buf, PEM_BUFSIZE); - } - - assert(strlen(objstr) + 23 + 2 * iv_len + 13 <= sizeof buf); - - buf[0] = '\0'; - PEM_proc_type(buf, PEM_TYPE_ENCRYPTED); - PEM_dek_info(buf, objstr, iv_len, (char *)iv); - // k=strlen(buf); - - EVP_CIPHER_CTX_init(&ctx); - ret = 1; - if (!EVP_EncryptInit_ex(&ctx, enc, NULL, key, iv) || - !EVP_EncryptUpdate(&ctx, data, &j, data, i) || - !EVP_EncryptFinal_ex(&ctx, &(data[j]), &i)) { - ret = 0; - } else { - i += j; - } - EVP_CIPHER_CTX_cleanup(&ctx); - if (ret == 0) { - goto err; - } - } else { - ret = 1; - buf[0] = '\0'; - } - i = PEM_write_bio(bp, name, buf, data, i); - if (i <= 0) { - ret = 0; - } -err: - OPENSSL_cleanse(key, sizeof(key)); - OPENSSL_cleanse(iv, sizeof(iv)); - OPENSSL_cleanse((char *)&ctx, sizeof(ctx)); - OPENSSL_cleanse(buf, PEM_BUFSIZE); - OPENSSL_free(data); - return ret; -} - -int PEM_do_header(EVP_CIPHER_INFO *cipher, unsigned char *data, long *plen, - pem_password_cb *callback, void *u) { - int i = 0, j, o, klen; - long len; - EVP_CIPHER_CTX ctx; - unsigned char key[EVP_MAX_KEY_LENGTH]; - char buf[PEM_BUFSIZE]; - - len = *plen; - - if (cipher->cipher == NULL) { - return 1; - } - - klen = 0; - if (!callback) { - callback = PEM_def_callback; - } - klen = callback(buf, PEM_BUFSIZE, 0, u); - if (klen <= 0) { - OPENSSL_PUT_ERROR(PEM, PEM_R_BAD_PASSWORD_READ); - return 0; - } - - if (!EVP_BytesToKey(cipher->cipher, EVP_md5(), &(cipher->iv[0]), - (unsigned char *)buf, klen, 1, key, NULL)) { - return 0; - } - - j = (int)len; - EVP_CIPHER_CTX_init(&ctx); - o = EVP_DecryptInit_ex(&ctx, cipher->cipher, NULL, key, &(cipher->iv[0])); - if (o) { - o = EVP_DecryptUpdate(&ctx, data, &i, data, j); - } - if (o) { - o = EVP_DecryptFinal_ex(&ctx, &(data[i]), &j); - } - EVP_CIPHER_CTX_cleanup(&ctx); - OPENSSL_cleanse((char *)buf, sizeof(buf)); - 
OPENSSL_cleanse((char *)key, sizeof(key)); - if (!o) { - OPENSSL_PUT_ERROR(PEM, PEM_R_BAD_DECRYPT); - return 0; - } - j += i; - *plen = j; - return 1; -} - -int PEM_get_EVP_CIPHER_INFO(char *header, EVP_CIPHER_INFO *cipher) { - const EVP_CIPHER *enc = NULL; - char *p, c; - char **header_pp = &header; - - cipher->cipher = NULL; - OPENSSL_memset(cipher->iv, 0, sizeof(cipher->iv)); - if ((header == NULL) || (*header == '\0') || (*header == '\n')) { - return 1; - } - if (strncmp(header, "Proc-Type: ", 11) != 0) { - OPENSSL_PUT_ERROR(PEM, PEM_R_NOT_PROC_TYPE); - return 0; - } - header += 11; - if (*header != '4') { - return 0; - } - header++; - if (*header != ',') { - return 0; - } - header++; - if (strncmp(header, "ENCRYPTED", 9) != 0) { - OPENSSL_PUT_ERROR(PEM, PEM_R_NOT_ENCRYPTED); - return 0; - } - for (; (*header != '\n') && (*header != '\0'); header++) { - ; - } - if (*header == '\0') { - OPENSSL_PUT_ERROR(PEM, PEM_R_SHORT_HEADER); - return 0; - } - header++; - if (strncmp(header, "DEK-Info: ", 10) != 0) { - OPENSSL_PUT_ERROR(PEM, PEM_R_NOT_DEK_INFO); - return 0; - } - header += 10; - - p = header; - for (;;) { - c = *header; - if (!(((c >= 'A') && (c <= 'Z')) || (c == '-') || - ((c >= '0') && (c <= '9')))) { - break; - } - header++; - } - *header = '\0'; - cipher->cipher = enc = cipher_by_name(p); - *header = c; - header++; - - if (enc == NULL) { - OPENSSL_PUT_ERROR(PEM, PEM_R_UNSUPPORTED_ENCRYPTION); - return 0; - } - // The IV parameter must be at least 8 bytes long to be used as the salt in - // the KDF. (This should not happen given |cipher_by_name|.) 
- if (EVP_CIPHER_iv_length(enc) < 8) { - assert(0); - OPENSSL_PUT_ERROR(PEM, PEM_R_UNSUPPORTED_ENCRYPTION); - return 0; - } - if (!load_iv(header_pp, &(cipher->iv[0]), EVP_CIPHER_iv_length(enc))) { - return 0; - } - - return 1; -} - -static int load_iv(char **fromp, unsigned char *to, int num) { - int v, i; - char *from; - - from = *fromp; - for (i = 0; i < num; i++) { - to[i] = 0; - } - num *= 2; - for (i = 0; i < num; i++) { - if ((*from >= '0') && (*from <= '9')) { - v = *from - '0'; - } else if ((*from >= 'A') && (*from <= 'F')) { - v = *from - 'A' + 10; - } else if ((*from >= 'a') && (*from <= 'f')) { - v = *from - 'a' + 10; - } else { - OPENSSL_PUT_ERROR(PEM, PEM_R_BAD_IV_CHARS); - return 0; - } - from++; - to[i / 2] |= v << (long)((!(i & 1)) * 4); - } - - *fromp = from; - return 1; -} - -int PEM_write(FILE *fp, const char *name, const char *header, - const unsigned char *data, long len) { - BIO *b = BIO_new_fp(fp, BIO_NOCLOSE); - if (b == NULL) { - OPENSSL_PUT_ERROR(PEM, ERR_R_BUF_LIB); - return 0; - } - int ret = PEM_write_bio(b, name, header, data, len); - BIO_free(b); - return ret; -} - -int PEM_write_bio(BIO *bp, const char *name, const char *header, - const unsigned char *data, long len) { - int nlen, n, i, j, outl; - unsigned char *buf = NULL; - EVP_ENCODE_CTX ctx; - int reason = ERR_R_BUF_LIB; - - EVP_EncodeInit(&ctx); - nlen = strlen(name); - - if ((BIO_write(bp, "-----BEGIN ", 11) != 11) || - (BIO_write(bp, name, nlen) != nlen) || - (BIO_write(bp, "-----\n", 6) != 6)) { - goto err; - } - - i = strlen(header); - if (i > 0) { - if ((BIO_write(bp, header, i) != i) || (BIO_write(bp, "\n", 1) != 1)) { - goto err; - } - } - - buf = OPENSSL_malloc(PEM_BUFSIZE * 8); - if (buf == NULL) { - reason = ERR_R_MALLOC_FAILURE; - goto err; - } - - i = j = 0; - while (len > 0) { - n = (int)((len > (PEM_BUFSIZE * 5)) ? 
(PEM_BUFSIZE * 5) : len); - EVP_EncodeUpdate(&ctx, buf, &outl, &(data[j]), n); - if ((outl) && (BIO_write(bp, (char *)buf, outl) != outl)) { - goto err; - } - i += outl; - len -= n; - j += n; - } - EVP_EncodeFinal(&ctx, buf, &outl); - if ((outl > 0) && (BIO_write(bp, (char *)buf, outl) != outl)) { - goto err; - } - OPENSSL_free(buf); - buf = NULL; - if ((BIO_write(bp, "-----END ", 9) != 9) || - (BIO_write(bp, name, nlen) != nlen) || - (BIO_write(bp, "-----\n", 6) != 6)) { - goto err; - } - return i + outl; -err: - if (buf) { - OPENSSL_free(buf); - } - OPENSSL_PUT_ERROR(PEM, reason); - return 0; -} - -int PEM_read(FILE *fp, char **name, char **header, unsigned char **data, - long *len) { - BIO *b = BIO_new_fp(fp, BIO_NOCLOSE); - if (b == NULL) { - OPENSSL_PUT_ERROR(PEM, ERR_R_BUF_LIB); - return 0; - } - int ret = PEM_read_bio(b, name, header, data, len); - BIO_free(b); - return ret; -} - -int PEM_read_bio(BIO *bp, char **name, char **header, unsigned char **data, - long *len) { - EVP_ENCODE_CTX ctx; - int end = 0, i, k, bl = 0, hl = 0, nohead = 0; - char buf[256]; - BUF_MEM *nameB; - BUF_MEM *headerB; - BUF_MEM *dataB, *tmpB; - - nameB = BUF_MEM_new(); - headerB = BUF_MEM_new(); - dataB = BUF_MEM_new(); - if ((nameB == NULL) || (headerB == NULL) || (dataB == NULL)) { - BUF_MEM_free(nameB); - BUF_MEM_free(headerB); - BUF_MEM_free(dataB); - OPENSSL_PUT_ERROR(PEM, ERR_R_MALLOC_FAILURE); - return 0; - } - - buf[254] = '\0'; - for (;;) { - i = BIO_gets(bp, buf, 254); - - if (i <= 0) { - OPENSSL_PUT_ERROR(PEM, PEM_R_NO_START_LINE); - goto err; - } - - while ((i >= 0) && (buf[i] <= ' ')) { - i--; - } - buf[++i] = '\n'; - buf[++i] = '\0'; - - if (strncmp(buf, "-----BEGIN ", 11) == 0) { - i = strlen(&(buf[11])); - - if (strncmp(&(buf[11 + i - 6]), "-----\n", 6) != 0) { - continue; - } - if (!BUF_MEM_grow(nameB, i + 9)) { - OPENSSL_PUT_ERROR(PEM, ERR_R_MALLOC_FAILURE); - goto err; - } - OPENSSL_memcpy(nameB->data, &(buf[11]), i - 6); - nameB->data[i - 6] = '\0'; - break; - } 
- } - hl = 0; - if (!BUF_MEM_grow(headerB, 256)) { - OPENSSL_PUT_ERROR(PEM, ERR_R_MALLOC_FAILURE); - goto err; - } - headerB->data[0] = '\0'; - for (;;) { - i = BIO_gets(bp, buf, 254); - if (i <= 0) { - break; - } - - while ((i >= 0) && (buf[i] <= ' ')) { - i--; - } - buf[++i] = '\n'; - buf[++i] = '\0'; - - if (buf[0] == '\n') { - break; - } - if (!BUF_MEM_grow(headerB, hl + i + 9)) { - OPENSSL_PUT_ERROR(PEM, ERR_R_MALLOC_FAILURE); - goto err; - } - if (strncmp(buf, "-----END ", 9) == 0) { - nohead = 1; - break; - } - OPENSSL_memcpy(&(headerB->data[hl]), buf, i); - headerB->data[hl + i] = '\0'; - hl += i; - } - - bl = 0; - if (!BUF_MEM_grow(dataB, 1024)) { - OPENSSL_PUT_ERROR(PEM, ERR_R_MALLOC_FAILURE); - goto err; - } - dataB->data[0] = '\0'; - if (!nohead) { - for (;;) { - i = BIO_gets(bp, buf, 254); - if (i <= 0) { - break; - } - - while ((i >= 0) && (buf[i] <= ' ')) { - i--; - } - buf[++i] = '\n'; - buf[++i] = '\0'; - - if (i != 65) { - end = 1; - } - if (strncmp(buf, "-----END ", 9) == 0) { - break; - } - if (i > 65) { - break; - } - if (!BUF_MEM_grow_clean(dataB, i + bl + 9)) { - OPENSSL_PUT_ERROR(PEM, ERR_R_MALLOC_FAILURE); - goto err; - } - OPENSSL_memcpy(&(dataB->data[bl]), buf, i); - dataB->data[bl + i] = '\0'; - bl += i; - if (end) { - buf[0] = '\0'; - i = BIO_gets(bp, buf, 254); - if (i <= 0) { - break; - } - - while ((i >= 0) && (buf[i] <= ' ')) { - i--; - } - buf[++i] = '\n'; - buf[++i] = '\0'; - - break; - } - } - } else { - tmpB = headerB; - headerB = dataB; - dataB = tmpB; - bl = hl; - } - i = strlen(nameB->data); - if ((strncmp(buf, "-----END ", 9) != 0) || - (strncmp(nameB->data, &(buf[9]), i) != 0) || - (strncmp(&(buf[9 + i]), "-----\n", 6) != 0)) { - OPENSSL_PUT_ERROR(PEM, PEM_R_BAD_END_LINE); - goto err; - } - - EVP_DecodeInit(&ctx); - i = EVP_DecodeUpdate(&ctx, (unsigned char *)dataB->data, &bl, - (unsigned char *)dataB->data, bl); - if (i < 0) { - OPENSSL_PUT_ERROR(PEM, PEM_R_BAD_BASE64_DECODE); - goto err; - } - i = EVP_DecodeFinal(&ctx, 
(unsigned char *)&(dataB->data[bl]), &k); - if (i < 0) { - OPENSSL_PUT_ERROR(PEM, PEM_R_BAD_BASE64_DECODE); - goto err; - } - bl += k; - - if (bl == 0) { - goto err; - } - *name = nameB->data; - *header = headerB->data; - *data = (unsigned char *)dataB->data; - *len = bl; - OPENSSL_free(nameB); - OPENSSL_free(headerB); - OPENSSL_free(dataB); - return 1; -err: - BUF_MEM_free(nameB); - BUF_MEM_free(headerB); - BUF_MEM_free(dataB); - return 0; -} - -int PEM_def_callback(char *buf, int size, int rwflag, void *userdata) { - if (!buf || !userdata || size < 0) { - return 0; - } - size_t len = strlen((char *)userdata); - if (len >= (size_t)size) { - return 0; - } - OPENSSL_strlcpy(buf, userdata, (size_t)size); - return len; -} diff --git a/third_party/boringssl/src/crypto/pem/pem_lib.cc b/third_party/boringssl/src/crypto/pem/pem_lib.cc new file mode 100644 index 00000000..3fb85159 --- /dev/null +++ b/third_party/boringssl/src/crypto/pem/pem_lib.cc @@ -0,0 +1,739 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../internal.h" +#include "internal.h" + + +#define MIN_LENGTH 4 + +using namespace bssl; + +static int load_iv(const char **fromp, unsigned char *to, size_t num); +static int check_pem(const std::string_view nm, const std::string_view name); + +// PEM_proc_type appends a Proc-Type header to |buf|, determined by |type|. +static void PEM_proc_type(char buf[PEM_BUFSIZE], int type) { + const char *str; + + if (type == PEM_TYPE_ENCRYPTED) { + str = "ENCRYPTED"; + } else if (type == PEM_TYPE_MIC_CLEAR) { + str = "MIC-CLEAR"; + } else if (type == PEM_TYPE_MIC_ONLY) { + str = "MIC-ONLY"; + } else { + str = "BAD-TYPE"; + } + + OPENSSL_strlcat(buf, "Proc-Type: 4,", PEM_BUFSIZE); + OPENSSL_strlcat(buf, str, PEM_BUFSIZE); + OPENSSL_strlcat(buf, "\n", PEM_BUFSIZE); +} + +// PEM_dek_info appends a DEK-Info header to |buf|, with an algorithm of |type| +// and a single parameter, specified by hex-encoding |len| bytes from |str|. 
+static void PEM_dek_info(char buf[PEM_BUFSIZE], const char *type, size_t len, + char *str) { + static const unsigned char map[17] = "0123456789ABCDEF"; + + OPENSSL_strlcat(buf, "DEK-Info: ", PEM_BUFSIZE); + OPENSSL_strlcat(buf, type, PEM_BUFSIZE); + OPENSSL_strlcat(buf, ",", PEM_BUFSIZE); + + const size_t used = strlen(buf); + const size_t available = PEM_BUFSIZE - used; + if (len * 2 < len || len * 2 + 2 < len || available < len * 2 + 2) { + return; + } + + for (size_t i = 0; i < len; i++) { + buf[used + i * 2] = map[(str[i] >> 4) & 0x0f]; + buf[used + i * 2 + 1] = map[(str[i]) & 0x0f]; + } + buf[used + len * 2] = '\n'; + buf[used + len * 2 + 1] = '\0'; +} + +void *PEM_ASN1_read(d2i_of_void *d2i, const char *name, FILE *fp, void **x, + pem_password_cb *cb, void *u) { + BIO *b = BIO_new_fp(fp, BIO_NOCLOSE); + if (b == nullptr) { + OPENSSL_PUT_ERROR(PEM, ERR_R_BUF_LIB); + return nullptr; + } + void *ret = PEM_ASN1_read_bio(d2i, name, b, x, cb, u); + BIO_free(b); + return ret; +} + +static int check_pem(const std::string_view nm, const std::string_view name) { + // Normal matching nm and name + if (nm == name) { + return 1; + } + + // Make PEM_STRING_EVP_PKEY match any private key + + if (name == PEM_STRING_EVP_PKEY) { + return nm == PEM_STRING_PKCS8 || nm == PEM_STRING_PKCS8INF || + nm == PEM_STRING_RSA || nm == PEM_STRING_EC || nm == PEM_STRING_DSA; + } + + // Permit older strings + + if (nm == PEM_STRING_X509_OLD && name == PEM_STRING_X509) { + return 1; + } + + if (nm == PEM_STRING_X509_REQ_OLD && name == PEM_STRING_X509_REQ) { + return 1; + } + + // Allow normal certs to be read as trusted certs + if (nm == PEM_STRING_X509 && name == PEM_STRING_X509_TRUSTED) { + return 1; + } + + if (nm == PEM_STRING_X509_OLD && name == PEM_STRING_X509_TRUSTED) { + return 1; + } + + // Some CAs use PKCS#7 with CERTIFICATE headers + if (nm == PEM_STRING_X509 && name == PEM_STRING_PKCS7) { + return 1; + } + + if (nm == PEM_STRING_PKCS7_SIGNED && name == PEM_STRING_PKCS7) { + 
return 1; + } + +#ifndef OPENSSL_NO_CMS + if (nm == PEM_STRING_X509 && name == PEM_STRING_CMS) { + return 1; + } + // Allow CMS to be read from PKCS#7 headers + if (nm == PEM_STRING_PKCS7 && name == PEM_STRING_CMS) { + return 1; + } +#endif + + return 0; +} + +static const EVP_CIPHER *cipher_by_name(const std::string_view name) { + // This is similar to the (deprecated) function |EVP_get_cipherbyname|. Note + // the PEM code assumes that ciphers have at least 8 bytes of IV, at most 20 + // bytes of overhead and generally behave like CBC mode. + if (name == SN_des_cbc) { + return EVP_des_cbc(); + } else if (name == SN_des_ede3_cbc) { + return EVP_des_ede3_cbc(); + } else if (name == SN_aes_128_cbc) { + return EVP_aes_128_cbc(); + } else if (name == SN_aes_192_cbc) { + return EVP_aes_192_cbc(); + } else if (name == SN_aes_256_cbc) { + return EVP_aes_256_cbc(); + } else { + return nullptr; + } +} + +int PEM_bytes_read_bio(unsigned char **pdata, long *plen, char **pnm, + const char *name, BIO *bp, pem_password_cb *cb, + void *u) { + EVP_CIPHER_INFO cipher; + UniquePtr nm; + UniquePtr header; + Array data; + size_t ulen; + size_t unused = 0; + + for (;;) { + if (!PEM_read_bio_inner(bp, &nm, &header, &data)) { + if (ERR_equals(ERR_peek_error(), ERR_LIB_PEM, PEM_R_NO_START_LINE)) { + ERR_add_error_data(2, "Expecting: ", name); + } + return 0; + } + if (data.size() > LONG_MAX) { + OPENSSL_PUT_ERROR(PEM, ERR_R_OVERFLOW); + return 0; + } + if (check_pem(nm.get(), name)) { + break; + } + } + if (!PEM_get_EVP_CIPHER_INFO(header.get(), &cipher)) { + return 0; + } + ulen = data.size(); + if (!PEM_do_header(&cipher, data.data(), &ulen, cb, u)) { + return 0; + } + + // Release the buffer to the caller. + // Note that |PEM_do_header| may have reduced the length after decrypting + // in-place. + // This will not overflow because |data.size()| was checked to fit in |long| + // above. 
+ data.Release(pdata, &unused); + *plen = static_cast(ulen); + + if (pnm) { + *pnm = nm.release(); + } + + return 1; +} + +int PEM_ASN1_write(i2d_of_void *i2d, const char *name, FILE *fp, void *x, + const EVP_CIPHER *enc, const unsigned char *pass, + int pass_len, pem_password_cb *callback, void *u) { + BIO *b = BIO_new_fp(fp, BIO_NOCLOSE); + if (b == nullptr) { + OPENSSL_PUT_ERROR(PEM, ERR_R_BUF_LIB); + return 0; + } + int ret = + PEM_ASN1_write_bio(i2d, name, b, x, enc, pass, pass_len, callback, u); + BIO_free(b); + return ret; +} + +int PEM_ASN1_write_bio(i2d_of_void *i2d, const char *name, BIO *bp, void *x, + const EVP_CIPHER *enc, const unsigned char *pass, + int pass_len, pem_password_cb *callback, void *u) { + ScopedEVP_CIPHER_CTX ctx; + int dsize = 0, ret = 0; + size_t i, j, data_size; + unsigned char *p, *data = nullptr; + const char *objstr = nullptr; + char buf[PEM_BUFSIZE]; + unsigned char key[EVP_MAX_KEY_LENGTH]; + unsigned char iv[EVP_MAX_IV_LENGTH]; + + if (enc != nullptr) { + objstr = OBJ_nid2sn(EVP_CIPHER_nid(enc)); + if (objstr == nullptr || cipher_by_name(objstr) == nullptr || + EVP_CIPHER_iv_length(enc) < 8) { + OPENSSL_PUT_ERROR(PEM, PEM_R_UNSUPPORTED_CIPHER); + goto err; + } + } + + if ((dsize = i2d(x, nullptr)) < 0) { + OPENSSL_PUT_ERROR(PEM, ERR_R_ASN1_LIB); + dsize = 0; + goto err; + } + // dzise + 8 bytes are needed + // actually it needs the cipher block size extra... 
+ data_size = static_cast(dsize) + 20; + data = (unsigned char *)OPENSSL_malloc(data_size); + if (data == nullptr) { + goto err; + } + p = data; + i = i2d(x, &p); + + if (enc != nullptr) { + const unsigned iv_len = EVP_CIPHER_iv_length(enc); + + if (pass == nullptr) { + if (!callback) { + callback = PEM_def_callback; + } + pass_len = (*callback)(buf, PEM_BUFSIZE, 1, u); + if (pass_len < 0) { + OPENSSL_PUT_ERROR(PEM, PEM_R_READ_KEY); + goto err; + } + pass = (const unsigned char *)buf; + } + assert(iv_len <= sizeof(iv)); + if (!RAND_bytes(iv, iv_len)) { // Generate a salt + goto err; + } + // The 'iv' is used as the iv and as a salt. It is NOT taken from + // the BytesToKey function + if (!EVP_BytesToKey(enc, EVP_md5(), iv, pass, pass_len, 1, key, nullptr)) { + goto err; + } + + if (pass == (const unsigned char *)buf) { + OPENSSL_cleanse(buf, PEM_BUFSIZE); + } + + assert(strlen(objstr) + 23 + 2 * iv_len + 13 <= sizeof(buf)); + + buf[0] = '\0'; + PEM_proc_type(buf, PEM_TYPE_ENCRYPTED); + PEM_dek_info(buf, objstr, iv_len, (char *)iv); + // k=strlen(buf); + + ret = 1; + if (!EVP_EncryptInit_ex(ctx.get(), enc, nullptr, key, iv) || + !EVP_EncryptUpdate_ex(ctx.get(), data, &j, data_size, data, i) || + !EVP_EncryptFinal_ex2(ctx.get(), &(data[j]), &i, data_size - j)) { + ret = 0; + } else { + i += j; + } + if (ret == 0) { + goto err; + } + } else { + ret = 1; + buf[0] = '\0'; + } + i = PEM_write_bio(bp, name, buf, data, i); + if (i <= 0) { + ret = 0; + } +err: + OPENSSL_cleanse(key, sizeof(key)); + OPENSSL_cleanse(iv, sizeof(iv)); + OPENSSL_cleanse(buf, PEM_BUFSIZE); + OPENSSL_free(data); + return ret; +} + +int bssl::PEM_do_header(const EVP_CIPHER_INFO *cipher, unsigned char *data, + size_t *len, pem_password_cb *callback, void *u) { + int pass_len; + ScopedEVP_CIPHER_CTX ctx; + unsigned char key[EVP_MAX_KEY_LENGTH]; + char buf[PEM_BUFSIZE]; + const size_t in_len = *len; + + if (cipher->cipher == nullptr) { + return 1; + } + + pass_len = 0; + if (!callback) { + callback = 
PEM_def_callback; + } + pass_len = callback(buf, PEM_BUFSIZE, 0, u); + if (pass_len < 0) { + OPENSSL_PUT_ERROR(PEM, PEM_R_BAD_PASSWORD_READ); + return 0; + } + + if (!EVP_BytesToKey(cipher->cipher, EVP_md5(), cipher->iv, + (unsigned char *)buf, pass_len, 1, key, nullptr)) { + return 0; + } + + // Safety: we have checked |*len| before narrowing so that |EVP_DecryptUpdate| + // can safely work with it. + size_t out_len1 = 0; + size_t out_len2 = 0; + if (!EVP_DecryptInit_ex(ctx.get(), cipher->cipher, nullptr, key, + cipher->iv) || + !EVP_DecryptUpdate_ex(ctx.get(), data, &out_len1, in_len, data, in_len) || + !EVP_DecryptFinal_ex2(ctx.get(), data + out_len1, &out_len2, + in_len - out_len1)) { + OPENSSL_PUT_ERROR(PEM, PEM_R_BAD_DECRYPT); + return 0; + } + *len = out_len1 + out_len2; + return 1; +} + +int bssl::PEM_get_EVP_CIPHER_INFO(const char *header, EVP_CIPHER_INFO *cipher) { + cipher->cipher = nullptr; + OPENSSL_memset(cipher->iv, 0, sizeof(cipher->iv)); + if ((header == nullptr) || (*header == '\0') || (*header == '\n')) { + return 1; + } + if (strncmp(header, "Proc-Type: ", 11) != 0) { + OPENSSL_PUT_ERROR(PEM, PEM_R_NOT_PROC_TYPE); + return 0; + } + header += 11; + if (header[0] != '4' || header[1] != ',') { + OPENSSL_PUT_ERROR(PEM, PEM_R_UNSUPPORTED_PROC_TYPE_VERSION); + return 0; + } + header += 2; + if (strncmp(header, "ENCRYPTED", 9) != 0) { + OPENSSL_PUT_ERROR(PEM, PEM_R_NOT_ENCRYPTED); + return 0; + } + for (; (*header != '\n') && (*header != '\0'); header++) { + ; + } + if (*header == '\0') { + OPENSSL_PUT_ERROR(PEM, PEM_R_SHORT_HEADER); + return 0; + } + header++; + if (strncmp(header, "DEK-Info: ", 10) != 0) { + OPENSSL_PUT_ERROR(PEM, PEM_R_NOT_DEK_INFO); + return 0; + } + header += 10; + + const char *p = header; + for (;;) { + char c = *header; + if (!((c >= 'A' && c <= 'Z') || c == '-' || OPENSSL_isdigit(c))) { + break; + } + header++; + } + cipher->cipher = cipher_by_name(std::string_view(p, header - p)); + header++; + if (cipher->cipher == nullptr) 
{ + OPENSSL_PUT_ERROR(PEM, PEM_R_UNSUPPORTED_ENCRYPTION); + return 0; + } + // The IV parameter must be at least 8 bytes long to be used as the salt in + // the KDF. (This should not happen given |cipher_by_name|.) + if (EVP_CIPHER_iv_length(cipher->cipher) < 8) { + assert(0); + OPENSSL_PUT_ERROR(PEM, PEM_R_UNSUPPORTED_ENCRYPTION); + return 0; + } + const char **header_pp = &header; + if (!load_iv(header_pp, cipher->iv, EVP_CIPHER_iv_length(cipher->cipher))) { + return 0; + } + + return 1; +} + +static int load_iv(const char **fromp, unsigned char *to, size_t num) { + uint8_t v; + const char *from; + + from = *fromp; + for (size_t i = 0; i < num; i++) { + to[i] = 0; + } + num *= 2; + for (size_t i = 0; i < num; i++) { + if (!OPENSSL_fromxdigit(&v, *from)) { + OPENSSL_PUT_ERROR(PEM, PEM_R_BAD_IV_CHARS); + return 0; + } + from++; + to[i / 2] |= v << (!(i & 1)) * 4; + } + + *fromp = from; + return 1; +} + +int PEM_write(FILE *fp, const char *name, const char *header, + const unsigned char *data, long len) { + BIO *b = BIO_new_fp(fp, BIO_NOCLOSE); + if (b == nullptr) { + OPENSSL_PUT_ERROR(PEM, ERR_R_BUF_LIB); + return 0; + } + int ret = PEM_write_bio(b, name, header, data, len); + BIO_free(b); + return ret; +} + +int PEM_write_bio(BIO *bp, const char *name, const char *header, + const unsigned char *data, long len) { + int nlen, n, i, j, outl; + unsigned char *buf = nullptr; + EVP_ENCODE_CTX ctx; + int reason = ERR_R_BUF_LIB; + int retval = 0; + + EVP_EncodeInit(&ctx); + nlen = strlen(name); + + if ((BIO_write(bp, "-----BEGIN ", 11) != 11) || + (BIO_write(bp, name, nlen) != nlen) || + (BIO_write(bp, "-----\n", 6) != 6)) { + goto err; + } + + i = strlen(header); + if (i > 0) { + if ((BIO_write(bp, header, i) != i) || (BIO_write(bp, "\n", 1) != 1)) { + goto err; + } + } + + buf = reinterpret_cast(OPENSSL_malloc(PEM_BUFSIZE * 8)); + if (buf == nullptr) { + goto err; + } + + i = j = 0; + while (len > 0) { + n = (int)((len > (PEM_BUFSIZE * 5)) ? 
(PEM_BUFSIZE * 5) : len); + EVP_EncodeUpdate(&ctx, buf, &outl, &(data[j]), n); + if ((outl) && (BIO_write(bp, (char *)buf, outl) != outl)) { + goto err; + } + i += outl; + len -= n; + j += n; + } + EVP_EncodeFinal(&ctx, buf, &outl); + if ((outl > 0) && (BIO_write(bp, (char *)buf, outl) != outl)) { + goto err; + } + if ((BIO_write(bp, "-----END ", 9) != 9) || + (BIO_write(bp, name, nlen) != nlen) || + (BIO_write(bp, "-----\n", 6) != 6)) { + goto err; + } + retval = i + outl; + +err: + if (retval == 0) { + OPENSSL_PUT_ERROR(PEM, reason); + } + OPENSSL_free(buf); + return retval; +} + +int PEM_read(FILE *fp, char **name, char **header, unsigned char **data, + long *len) { + BIO *b = BIO_new_fp(fp, BIO_NOCLOSE); + if (b == nullptr) { + OPENSSL_PUT_ERROR(PEM, ERR_R_BUF_LIB); + return 0; + } + int ret = PEM_read_bio(b, name, header, data, len); + BIO_free(b); + return ret; +} + +int bssl::PEM_read_bio_inner(BIO *bp, UniquePtr *name, + UniquePtr *header, Array *data) { + bssl::UniquePtr nameB(BUF_MEM_new()); + bssl::UniquePtr headerB(BUF_MEM_new()); + bssl::UniquePtr dataB(BUF_MEM_new()); + if ((nameB == nullptr) || (headerB == nullptr) || (dataB == nullptr)) { + OPENSSL_PUT_ERROR(PEM, ERR_R_MALLOC_FAILURE); + return 0; + } + + char buf[256]; // 254 characters + newline + \0. + buf[254] = '\0'; + // Invariant: buf[254] < ' '. It may get overwritten by '\n' or '\0' only. 
+ for (;;) { + int i = BIO_gets(bp, buf, 254); + if (i <= 0) { + OPENSSL_PUT_ERROR(PEM, PEM_R_NO_START_LINE); + return 0; + } + + while ((i >= 0) && (buf[i] <= ' ')) { + i--; + } + buf[++i] = '\n'; + buf[++i] = '\0'; + + if (strncmp(buf, "-----BEGIN ", 11) == 0) { + i = strlen(&(buf[11])); + + if (strncmp(&(buf[11 + i - 6]), "-----\n", 6) != 0) { + continue; + } + if (!BUF_MEM_grow(nameB.get(), i - 5)) { + OPENSSL_PUT_ERROR(PEM, ERR_R_MALLOC_FAILURE); + return 0; + } + OPENSSL_memcpy(nameB->data, &(buf[11]), i - 6); + nameB->data[i - 6] = '\0'; + break; + } + } + + size_t hl = 0; + if (!BUF_MEM_grow(headerB.get(), 256)) { + OPENSSL_PUT_ERROR(PEM, ERR_R_MALLOC_FAILURE); + return 0; + } + headerB->data[0] = '\0'; + + size_t bl = 0; + if (!BUF_MEM_grow(dataB.get(), 1024)) { + OPENSSL_PUT_ERROR(PEM, ERR_R_MALLOC_FAILURE); + return 0; + } + dataB->data[0] = '\0'; + + bool failed = false; // Set to true (and error put into queue) on failure. + auto read_until_end = [&](BUF_MEM *out, size_t &out_len, + bool stop_at_newline) { + // Invariant: buf[254] < ' '. It may get overwritten by '\n' or '\0' only. + for (;;) { + int i = BIO_gets(bp, buf, 254); + if (i <= 0) { + break; + } + while ((i >= 0) && (buf[i] <= ' ')) { + i--; + } + buf[++i] = '\n'; + buf[++i] = '\0'; + + if (stop_at_newline && buf[0] == '\n') { + // End of header, start of body. + return true; + } + if (strncmp(buf, "-----END ", 9) == 0) { + return false; + } + if (static_cast(i) > SIZE_MAX - hl - 1) { + OPENSSL_PUT_ERROR(PEM, ERR_R_OVERFLOW); + failed = true; + return false; + } + size_t new_len = out_len + i; + if (new_len > INT_MAX / 2) { + // Arbitrarily limit PEM data to INT_MAX / 2 bytes, which "ought to be + // enough for anyone". Hardens against possible integer overflows + // downstream. 
+ OPENSSL_PUT_ERROR(PEM, ERR_R_OVERFLOW); + failed = true; + return false; + } + if (!BUF_MEM_grow(out, new_len + 1)) { + OPENSSL_PUT_ERROR(PEM, ERR_R_MALLOC_FAILURE); + failed = true; + return false; + } + OPENSSL_memcpy(&(out->data[out_len]), buf, i); + out->data[new_len] = '\0'; + out_len = new_len; + } + return false; + }; + + if (read_until_end(headerB.get(), hl, /*stop_at_newline=*/true)) { + read_until_end(dataB.get(), bl, /*stop_at_newline=*/false); + } else { + // Actually we've read the body, as there is no header. + std::swap(hl, bl); + std::swap(headerB, dataB); + } + if (failed) { + return 0; + } + + size_t name_len = strlen(nameB->data); + if ((strncmp(buf, "-----END ", 9) != 0) || + (strncmp(&(buf[9]), nameB->data, name_len) != 0) || + (strncmp(&(buf[9 + name_len]), "-----\n", 6) != 0)) { + OPENSSL_PUT_ERROR(PEM, PEM_R_BAD_END_LINE); + return 0; + } + + EVP_ENCODE_CTX ctx; + EVP_DecodeInit(&ctx); + int decoded_length; + int status = + EVP_DecodeUpdate(&ctx, (unsigned char *)dataB->data, &decoded_length, + (unsigned char *)dataB->data, bl); + if (status < 0) { + OPENSSL_PUT_ERROR(PEM, PEM_R_BAD_BASE64_DECODE); + return 0; + } + int k; + status = EVP_DecodeFinal(&ctx, (unsigned char *)&(dataB->data[bl]), &k); + if (status < 0) { + OPENSSL_PUT_ERROR(PEM, PEM_R_BAD_BASE64_DECODE); + return 0; + } + if (k > INT_MAX - decoded_length) { + OPENSSL_PUT_ERROR(PEM, ERR_R_OVERFLOW); + return 0; + } + decoded_length += k; + + if (decoded_length == 0) { + OPENSSL_PUT_ERROR(PEM, PEM_R_NO_DATA); + return 0; + } + + // Transfer ownership of buffers + name->reset(nameB->data); + nameB->data = nullptr; + header->reset(headerB->data); + headerB->data = nullptr; + data->Reset((uint8_t *)dataB->data, decoded_length); + dataB->data = nullptr; + + return 1; +} + +int PEM_read_bio(BIO *bp, char **name, char **header, unsigned char **data, + long *len) { + UniquePtr owned_name; + UniquePtr owned_header; + Array owned_data; + if (!PEM_read_bio_inner(bp, &owned_name, 
&owned_header, &owned_data)) { + return 0; + } + if (owned_data.size() > LONG_MAX) { + OPENSSL_PUT_ERROR(PEM, ERR_R_OVERFLOW); + return 0; + } + size_t ulen = 0; + *name = owned_name.release(); + *header = owned_header.release(); + owned_data.Release(data, &ulen); + // Safety: we checked that |ulen| <= |LONG_MAX|. + *len = static_cast(ulen); + return 1; +} + +int PEM_def_callback(char *buf, int size, int rwflag, void *userdata) { + if (!buf || !userdata || size < 0) { + return -1; + } + size_t len = strlen((char *)userdata); + if (len >= (size_t)size) { + return -1; + } + OPENSSL_strlcpy(buf, reinterpret_cast(userdata), (size_t)size); + return (int)len; +} diff --git a/third_party/boringssl/src/crypto/pem/pem_oth.c b/third_party/boringssl/src/crypto/pem/pem_oth.c deleted file mode 100644 index 8ea05be0..00000000 --- a/third_party/boringssl/src/crypto/pem/pem_oth.c +++ /dev/null @@ -1,88 +0,0 @@ -/* crypto/pem/pem_oth.c */ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. 
this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include - -#include -#include -#include -#include -#include -#include - -// Handle 'other' PEMs: not private keys - -void *PEM_ASN1_read_bio(d2i_of_void *d2i, const char *name, BIO *bp, void **x, - pem_password_cb *cb, void *u) { - const unsigned char *p = NULL; - unsigned char *data = NULL; - long len; - char *ret = NULL; - - if (!PEM_bytes_read_bio(&data, &len, NULL, name, bp, cb, u)) { - return NULL; - } - p = data; - ret = d2i(x, &p, len); - if (ret == NULL) { - OPENSSL_PUT_ERROR(PEM, ERR_R_ASN1_LIB); - } - OPENSSL_free(data); - return ret; -} diff --git a/third_party/boringssl/src/crypto/pem/pem_oth.cc b/third_party/boringssl/src/crypto/pem/pem_oth.cc new file mode 100644 index 00000000..c19a78e9 --- /dev/null +++ b/third_party/boringssl/src/crypto/pem/pem_oth.cc @@ -0,0 +1,45 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include + +#include +#include +#include +#include +#include +#include + +// Handle 'other' PEMs: not private keys + +void *PEM_ASN1_read_bio(d2i_of_void *d2i, const char *name, BIO *bp, void **x, + pem_password_cb *cb, void *u) { + const unsigned char *p = nullptr; + unsigned char *data = nullptr; + long len; + char *ret = nullptr; + + if (!PEM_bytes_read_bio(&data, &len, nullptr, name, bp, cb, u)) { + return nullptr; + } + p = data; + ret = reinterpret_cast(d2i(x, &p, len)); + if (ret == nullptr) { + OPENSSL_PUT_ERROR(PEM, ERR_R_ASN1_LIB); + } + OPENSSL_free(data); + return ret; +} diff --git a/third_party/boringssl/src/crypto/pem/pem_pk8.c b/third_party/boringssl/src/crypto/pem/pem_pk8.c deleted file mode 100644 index 85196fa9..00000000 --- a/third_party/boringssl/src/crypto/pem/pem_pk8.c +++ /dev/null @@ -1,246 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. 
this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include -#include -#include -#include -#include -#include -#include - -static int do_pk8pkey(BIO *bp, EVP_PKEY *x, int isder, int nid, - const EVP_CIPHER *enc, char *kstr, int klen, - pem_password_cb *cb, void *u); -static int do_pk8pkey_fp(FILE *bp, EVP_PKEY *x, int isder, int nid, - const EVP_CIPHER *enc, char *kstr, int klen, - pem_password_cb *cb, void *u); - -// These functions write a private key in PKCS#8 format: it is a "drop in" -// replacement for PEM_write_bio_PrivateKey() and friends. As usual if 'enc' -// is NULL then it uses the unencrypted private key form. The 'nid' versions -// uses PKCS#5 v1.5 PBE algorithms whereas the others use PKCS#5 v2.0. - -int PEM_write_bio_PKCS8PrivateKey_nid(BIO *bp, EVP_PKEY *x, int nid, char *kstr, - int klen, pem_password_cb *cb, void *u) { - return do_pk8pkey(bp, x, 0, nid, NULL, kstr, klen, cb, u); -} - -int PEM_write_bio_PKCS8PrivateKey(BIO *bp, EVP_PKEY *x, const EVP_CIPHER *enc, - char *kstr, int klen, pem_password_cb *cb, - void *u) { - return do_pk8pkey(bp, x, 0, -1, enc, kstr, klen, cb, u); -} - -int i2d_PKCS8PrivateKey_bio(BIO *bp, EVP_PKEY *x, const EVP_CIPHER *enc, - char *kstr, int klen, pem_password_cb *cb, - void *u) { - return do_pk8pkey(bp, x, 1, -1, enc, kstr, klen, cb, u); -} - -int i2d_PKCS8PrivateKey_nid_bio(BIO *bp, EVP_PKEY *x, int nid, char *kstr, - int klen, pem_password_cb *cb, void *u) { - return do_pk8pkey(bp, x, 1, nid, NULL, kstr, klen, cb, u); -} - -static int do_pk8pkey(BIO *bp, EVP_PKEY *x, int isder, int nid, - const EVP_CIPHER *enc, char *kstr, int klen, - pem_password_cb *cb, void *u) { - X509_SIG *p8; - PKCS8_PRIV_KEY_INFO *p8inf; - char buf[PEM_BUFSIZE]; - int ret; - if (!(p8inf = EVP_PKEY2PKCS8(x))) { - OPENSSL_PUT_ERROR(PEM, PEM_R_ERROR_CONVERTING_PRIVATE_KEY); - return 0; - } - if (enc || (nid != -1)) { - if (!kstr) { - klen = 0; - if (!cb) { 
- cb = PEM_def_callback; - } - klen = cb(buf, PEM_BUFSIZE, 1, u); - if (klen <= 0) { - OPENSSL_PUT_ERROR(PEM, PEM_R_READ_KEY); - PKCS8_PRIV_KEY_INFO_free(p8inf); - return 0; - } - - kstr = buf; - } - p8 = PKCS8_encrypt(nid, enc, kstr, klen, NULL, 0, 0, p8inf); - if (kstr == buf) { - OPENSSL_cleanse(buf, klen); - } - PKCS8_PRIV_KEY_INFO_free(p8inf); - if (isder) { - ret = i2d_PKCS8_bio(bp, p8); - } else { - ret = PEM_write_bio_PKCS8(bp, p8); - } - X509_SIG_free(p8); - return ret; - } else { - if (isder) { - ret = i2d_PKCS8_PRIV_KEY_INFO_bio(bp, p8inf); - } else { - ret = PEM_write_bio_PKCS8_PRIV_KEY_INFO(bp, p8inf); - } - PKCS8_PRIV_KEY_INFO_free(p8inf); - return ret; - } -} - -EVP_PKEY *d2i_PKCS8PrivateKey_bio(BIO *bp, EVP_PKEY **x, pem_password_cb *cb, - void *u) { - PKCS8_PRIV_KEY_INFO *p8inf = NULL; - X509_SIG *p8 = NULL; - int klen; - EVP_PKEY *ret; - char psbuf[PEM_BUFSIZE]; - p8 = d2i_PKCS8_bio(bp, NULL); - if (!p8) { - return NULL; - } - - klen = 0; - if (!cb) { - cb = PEM_def_callback; - } - klen = cb(psbuf, PEM_BUFSIZE, 0, u); - if (klen <= 0) { - OPENSSL_PUT_ERROR(PEM, PEM_R_BAD_PASSWORD_READ); - X509_SIG_free(p8); - return NULL; - } - p8inf = PKCS8_decrypt(p8, psbuf, klen); - X509_SIG_free(p8); - OPENSSL_cleanse(psbuf, klen); - if (!p8inf) { - return NULL; - } - ret = EVP_PKCS82PKEY(p8inf); - PKCS8_PRIV_KEY_INFO_free(p8inf); - if (!ret) { - return NULL; - } - if (x) { - if (*x) { - EVP_PKEY_free(*x); - } - *x = ret; - } - return ret; -} - - -int i2d_PKCS8PrivateKey_fp(FILE *fp, EVP_PKEY *x, const EVP_CIPHER *enc, - char *kstr, int klen, pem_password_cb *cb, void *u) { - return do_pk8pkey_fp(fp, x, 1, -1, enc, kstr, klen, cb, u); -} - -int i2d_PKCS8PrivateKey_nid_fp(FILE *fp, EVP_PKEY *x, int nid, char *kstr, - int klen, pem_password_cb *cb, void *u) { - return do_pk8pkey_fp(fp, x, 1, nid, NULL, kstr, klen, cb, u); -} - -int PEM_write_PKCS8PrivateKey_nid(FILE *fp, EVP_PKEY *x, int nid, char *kstr, - int klen, pem_password_cb *cb, void *u) { - return 
do_pk8pkey_fp(fp, x, 0, nid, NULL, kstr, klen, cb, u); -} - -int PEM_write_PKCS8PrivateKey(FILE *fp, EVP_PKEY *x, const EVP_CIPHER *enc, - char *kstr, int klen, pem_password_cb *cb, - void *u) { - return do_pk8pkey_fp(fp, x, 0, -1, enc, kstr, klen, cb, u); -} - -static int do_pk8pkey_fp(FILE *fp, EVP_PKEY *x, int isder, int nid, - const EVP_CIPHER *enc, char *kstr, int klen, - pem_password_cb *cb, void *u) { - BIO *bp; - int ret; - if (!(bp = BIO_new_fp(fp, BIO_NOCLOSE))) { - OPENSSL_PUT_ERROR(PEM, ERR_R_BUF_LIB); - return 0; - } - ret = do_pk8pkey(bp, x, isder, nid, enc, kstr, klen, cb, u); - BIO_free(bp); - return ret; -} - -EVP_PKEY *d2i_PKCS8PrivateKey_fp(FILE *fp, EVP_PKEY **x, pem_password_cb *cb, - void *u) { - BIO *bp; - EVP_PKEY *ret; - if (!(bp = BIO_new_fp(fp, BIO_NOCLOSE))) { - OPENSSL_PUT_ERROR(PEM, ERR_R_BUF_LIB); - return NULL; - } - ret = d2i_PKCS8PrivateKey_bio(bp, x, cb, u); - BIO_free(bp); - return ret; -} - - -IMPLEMENT_PEM_rw(PKCS8, X509_SIG, PEM_STRING_PKCS8, X509_SIG) - - -IMPLEMENT_PEM_rw(PKCS8_PRIV_KEY_INFO, PKCS8_PRIV_KEY_INFO, PEM_STRING_PKCS8INF, - PKCS8_PRIV_KEY_INFO) diff --git a/third_party/boringssl/src/crypto/pem/pem_pk8.cc b/third_party/boringssl/src/crypto/pem/pem_pk8.cc new file mode 100644 index 00000000..9f477e9f --- /dev/null +++ b/third_party/boringssl/src/crypto/pem/pem_pk8.cc @@ -0,0 +1,208 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include +#include +#include +#include +#include +#include + +static int do_pk8pkey(BIO *bp, const EVP_PKEY *x, int isder, int nid, + const EVP_CIPHER *enc, const char *pass, int pass_len, + pem_password_cb *cb, void *u); +static int do_pk8pkey_fp(FILE *bp, const EVP_PKEY *x, int isder, int nid, + const EVP_CIPHER *enc, const char *pass, int pass_len, + pem_password_cb *cb, void *u); + +// These functions write a private key in PKCS#8 format: it is a "drop in" +// replacement for PEM_write_bio_PrivateKey() and friends. As usual if 'enc' +// is NULL then it uses the unencrypted private key form. The 'nid' versions +// uses PKCS#5 v1.5 PBE algorithms whereas the others use PKCS#5 v2.0. + +int PEM_write_bio_PKCS8PrivateKey_nid(BIO *bp, const EVP_PKEY *x, int nid, + const char *pass, int pass_len, + pem_password_cb *cb, void *u) { + return do_pk8pkey(bp, x, 0, nid, nullptr, pass, pass_len, cb, u); +} + +int PEM_write_bio_PKCS8PrivateKey(BIO *bp, const EVP_PKEY *x, + const EVP_CIPHER *enc, const char *pass, + int pass_len, pem_password_cb *cb, void *u) { + return do_pk8pkey(bp, x, 0, -1, enc, pass, pass_len, cb, u); +} + +int i2d_PKCS8PrivateKey_bio(BIO *bp, const EVP_PKEY *x, const EVP_CIPHER *enc, + const char *pass, int pass_len, pem_password_cb *cb, + void *u) { + return do_pk8pkey(bp, x, 1, -1, enc, pass, pass_len, cb, u); +} + +int i2d_PKCS8PrivateKey_nid_bio(BIO *bp, const EVP_PKEY *x, int nid, + const char *pass, int pass_len, + pem_password_cb *cb, void *u) { + return do_pk8pkey(bp, x, 1, nid, nullptr, pass, pass_len, cb, u); +} + +static int do_pk8pkey(BIO *bp, const EVP_PKEY *x, int isder, int nid, + const EVP_CIPHER *enc, const char *pass, int pass_len, + pem_password_cb *cb, void *u) { + X509_SIG *p8; + PKCS8_PRIV_KEY_INFO *p8inf; + char buf[PEM_BUFSIZE]; + int ret; + if (!(p8inf = EVP_PKEY2PKCS8(x))) { + OPENSSL_PUT_ERROR(PEM, PEM_R_ERROR_CONVERTING_PRIVATE_KEY); + return 0; + } + if (enc || (nid != -1)) { + if (!pass) { + if (!cb) 
{ + cb = PEM_def_callback; + } + pass_len = cb(buf, PEM_BUFSIZE, 1, u); + if (pass_len < 0) { + OPENSSL_PUT_ERROR(PEM, PEM_R_READ_KEY); + PKCS8_PRIV_KEY_INFO_free(p8inf); + return 0; + } + + pass = buf; + } + p8 = PKCS8_encrypt(nid, enc, pass, pass_len, nullptr, 0, 0, p8inf); + if (pass == buf) { + OPENSSL_cleanse(buf, pass_len); + } + PKCS8_PRIV_KEY_INFO_free(p8inf); + if (isder) { + ret = i2d_PKCS8_bio(bp, p8); + } else { + ret = PEM_write_bio_PKCS8(bp, p8); + } + X509_SIG_free(p8); + return ret; + } else { + if (isder) { + ret = i2d_PKCS8_PRIV_KEY_INFO_bio(bp, p8inf); + } else { + ret = PEM_write_bio_PKCS8_PRIV_KEY_INFO(bp, p8inf); + } + PKCS8_PRIV_KEY_INFO_free(p8inf); + return ret; + } +} + +EVP_PKEY *d2i_PKCS8PrivateKey_bio(BIO *bp, EVP_PKEY **x, pem_password_cb *cb, + void *u) { + PKCS8_PRIV_KEY_INFO *p8inf = nullptr; + X509_SIG *p8 = nullptr; + int pass_len; + EVP_PKEY *ret; + char psbuf[PEM_BUFSIZE]; + p8 = d2i_PKCS8_bio(bp, nullptr); + if (!p8) { + return nullptr; + } + + pass_len = 0; + if (!cb) { + cb = PEM_def_callback; + } + pass_len = cb(psbuf, PEM_BUFSIZE, 0, u); + if (pass_len < 0) { + OPENSSL_PUT_ERROR(PEM, PEM_R_BAD_PASSWORD_READ); + X509_SIG_free(p8); + return nullptr; + } + p8inf = PKCS8_decrypt(p8, psbuf, pass_len); + X509_SIG_free(p8); + OPENSSL_cleanse(psbuf, pass_len); + if (!p8inf) { + return nullptr; + } + ret = EVP_PKCS82PKEY(p8inf); + PKCS8_PRIV_KEY_INFO_free(p8inf); + if (!ret) { + return nullptr; + } + if (x) { + if (*x) { + EVP_PKEY_free(*x); + } + *x = ret; + } + return ret; +} + + +int i2d_PKCS8PrivateKey_fp(FILE *fp, const EVP_PKEY *x, const EVP_CIPHER *enc, + const char *pass, int pass_len, pem_password_cb *cb, + void *u) { + return do_pk8pkey_fp(fp, x, 1, -1, enc, pass, pass_len, cb, u); +} + +int i2d_PKCS8PrivateKey_nid_fp(FILE *fp, const EVP_PKEY *x, int nid, + const char *pass, int pass_len, + pem_password_cb *cb, void *u) { + return do_pk8pkey_fp(fp, x, 1, nid, nullptr, pass, pass_len, cb, u); +} + +int 
PEM_write_PKCS8PrivateKey_nid(FILE *fp, const EVP_PKEY *x, int nid, + const char *pass, int pass_len, + pem_password_cb *cb, void *u) { + return do_pk8pkey_fp(fp, x, 0, nid, nullptr, pass, pass_len, cb, u); +} + +int PEM_write_PKCS8PrivateKey(FILE *fp, const EVP_PKEY *x, + const EVP_CIPHER *enc, const char *pass, + int pass_len, pem_password_cb *cb, void *u) { + return do_pk8pkey_fp(fp, x, 0, -1, enc, pass, pass_len, cb, u); +} + +static int do_pk8pkey_fp(FILE *fp, const EVP_PKEY *x, int isder, int nid, + const EVP_CIPHER *enc, const char *pass, int pass_len, + pem_password_cb *cb, void *u) { + BIO *bp; + int ret; + if (!(bp = BIO_new_fp(fp, BIO_NOCLOSE))) { + OPENSSL_PUT_ERROR(PEM, ERR_R_BUF_LIB); + return 0; + } + ret = do_pk8pkey(bp, x, isder, nid, enc, pass, pass_len, cb, u); + BIO_free(bp); + return ret; +} + +EVP_PKEY *d2i_PKCS8PrivateKey_fp(FILE *fp, EVP_PKEY **x, pem_password_cb *cb, + void *u) { + BIO *bp; + EVP_PKEY *ret; + if (!(bp = BIO_new_fp(fp, BIO_NOCLOSE))) { + OPENSSL_PUT_ERROR(PEM, ERR_R_BUF_LIB); + return nullptr; + } + ret = d2i_PKCS8PrivateKey_bio(bp, x, cb, u); + BIO_free(bp); + return ret; +} + + +IMPLEMENT_PEM_rw(PKCS8, X509_SIG, PEM_STRING_PKCS8, X509_SIG) + + +IMPLEMENT_PEM_rw(PKCS8_PRIV_KEY_INFO, PKCS8_PRIV_KEY_INFO, PEM_STRING_PKCS8INF, + PKCS8_PRIV_KEY_INFO) diff --git a/third_party/boringssl/src/crypto/pem/pem_pkey.c b/third_party/boringssl/src/crypto/pem/pem_pkey.c deleted file mode 100644 index 2d28d6c0..00000000 --- a/third_party/boringssl/src/crypto/pem/pem_pkey.c +++ /dev/null @@ -1,182 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. 
The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. 
If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
*/ - -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -EVP_PKEY *PEM_read_bio_PrivateKey(BIO *bp, EVP_PKEY **x, pem_password_cb *cb, - void *u) { - char *nm = NULL; - const unsigned char *p = NULL; - unsigned char *data = NULL; - long len; - EVP_PKEY *ret = NULL; - - if (!PEM_bytes_read_bio(&data, &len, &nm, PEM_STRING_EVP_PKEY, bp, cb, u)) { - return NULL; - } - p = data; - - if (strcmp(nm, PEM_STRING_PKCS8INF) == 0) { - PKCS8_PRIV_KEY_INFO *p8inf; - p8inf = d2i_PKCS8_PRIV_KEY_INFO(NULL, &p, len); - if (!p8inf) { - goto p8err; - } - ret = EVP_PKCS82PKEY(p8inf); - if (x) { - if (*x) { - EVP_PKEY_free((EVP_PKEY *)*x); - } - *x = ret; - } - PKCS8_PRIV_KEY_INFO_free(p8inf); - } else if (strcmp(nm, PEM_STRING_PKCS8) == 0) { - PKCS8_PRIV_KEY_INFO *p8inf; - X509_SIG *p8; - int klen; - char psbuf[PEM_BUFSIZE]; - p8 = d2i_X509_SIG(NULL, &p, len); - if (!p8) { - goto p8err; - } - - klen = 0; - if (!cb) { - cb = PEM_def_callback; - } - klen = cb(psbuf, PEM_BUFSIZE, 0, u); - if (klen <= 0) { - OPENSSL_PUT_ERROR(PEM, PEM_R_BAD_PASSWORD_READ); - X509_SIG_free(p8); - goto err; - } - p8inf = PKCS8_decrypt(p8, psbuf, klen); - X509_SIG_free(p8); - OPENSSL_cleanse(psbuf, klen); - if (!p8inf) { - goto p8err; - } - ret = EVP_PKCS82PKEY(p8inf); - if (x) { - if (*x) { - EVP_PKEY_free((EVP_PKEY *)*x); - } - *x = ret; - } - PKCS8_PRIV_KEY_INFO_free(p8inf); - } else if (strcmp(nm, PEM_STRING_RSA) == 0) { - // TODO(davidben): d2i_PrivateKey parses PKCS#8 along with the - // standalone format. This and the cases below probably should not - // accept PKCS#8. 
- ret = d2i_PrivateKey(EVP_PKEY_RSA, x, &p, len); - } else if (strcmp(nm, PEM_STRING_EC) == 0) { - ret = d2i_PrivateKey(EVP_PKEY_EC, x, &p, len); - } else if (strcmp(nm, PEM_STRING_DSA) == 0) { - ret = d2i_PrivateKey(EVP_PKEY_DSA, x, &p, len); - } -p8err: - if (ret == NULL) { - OPENSSL_PUT_ERROR(PEM, ERR_R_ASN1_LIB); - } - -err: - OPENSSL_free(nm); - OPENSSL_free(data); - return ret; -} - -int PEM_write_bio_PrivateKey(BIO *bp, EVP_PKEY *x, const EVP_CIPHER *enc, - unsigned char *kstr, int klen, pem_password_cb *cb, - void *u) { - return PEM_write_bio_PKCS8PrivateKey(bp, x, enc, (char *)kstr, klen, cb, u); -} - -EVP_PKEY *PEM_read_PrivateKey(FILE *fp, EVP_PKEY **x, pem_password_cb *cb, - void *u) { - BIO *b = BIO_new_fp(fp, BIO_NOCLOSE); - if (b == NULL) { - OPENSSL_PUT_ERROR(PEM, ERR_R_BUF_LIB); - return NULL; - } - EVP_PKEY *ret = PEM_read_bio_PrivateKey(b, x, cb, u); - BIO_free(b); - return ret; -} - -int PEM_write_PrivateKey(FILE *fp, EVP_PKEY *x, const EVP_CIPHER *enc, - unsigned char *kstr, int klen, pem_password_cb *cb, - void *u) { - BIO *b = BIO_new_fp(fp, BIO_NOCLOSE); - if (b == NULL) { - OPENSSL_PUT_ERROR(PEM, ERR_R_BUF_LIB); - return 0; - } - int ret = PEM_write_bio_PrivateKey(b, x, enc, kstr, klen, cb, u); - BIO_free(b); - return ret; -} diff --git a/third_party/boringssl/src/crypto/pem/pem_pkey.cc b/third_party/boringssl/src/crypto/pem/pem_pkey.cc new file mode 100644 index 00000000..433215e0 --- /dev/null +++ b/third_party/boringssl/src/crypto/pem/pem_pkey.cc @@ -0,0 +1,141 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +EVP_PKEY *PEM_read_bio_PrivateKey(BIO *bp, EVP_PKEY **x, pem_password_cb *cb, + void *u) { + char *nm = nullptr; + const unsigned char *p = nullptr; + unsigned char *data = nullptr; + long len; + EVP_PKEY *ret = nullptr; + + if (!PEM_bytes_read_bio(&data, &len, &nm, PEM_STRING_EVP_PKEY, bp, cb, u)) { + return nullptr; + } + p = data; + + if (strcmp(nm, PEM_STRING_PKCS8INF) == 0) { + PKCS8_PRIV_KEY_INFO *p8inf; + p8inf = d2i_PKCS8_PRIV_KEY_INFO(nullptr, &p, len); + if (!p8inf) { + goto p8err; + } + ret = EVP_PKCS82PKEY(p8inf); + if (x) { + if (*x) { + EVP_PKEY_free((EVP_PKEY *)*x); + } + *x = ret; + } + PKCS8_PRIV_KEY_INFO_free(p8inf); + } else if (strcmp(nm, PEM_STRING_PKCS8) == 0) { + PKCS8_PRIV_KEY_INFO *p8inf; + X509_SIG *p8; + int pass_len; + char psbuf[PEM_BUFSIZE]; + p8 = d2i_X509_SIG(nullptr, &p, len); + if (!p8) { + goto p8err; + } + + pass_len = 0; + if (!cb) { + cb = PEM_def_callback; + } + pass_len = cb(psbuf, PEM_BUFSIZE, 0, u); + if (pass_len < 0) { + OPENSSL_PUT_ERROR(PEM, PEM_R_BAD_PASSWORD_READ); + X509_SIG_free(p8); + goto err; + } + p8inf = PKCS8_decrypt(p8, psbuf, pass_len); + X509_SIG_free(p8); + OPENSSL_cleanse(psbuf, pass_len); + if (!p8inf) { + goto p8err; + } + ret = EVP_PKCS82PKEY(p8inf); + if (x) { + if (*x) { + EVP_PKEY_free((EVP_PKEY *)*x); + } + *x = ret; + } + PKCS8_PRIV_KEY_INFO_free(p8inf); + } else if (strcmp(nm, PEM_STRING_RSA) == 0) { + // TODO(davidben): d2i_PrivateKey parses PKCS#8 along with 
the + // standalone format. This and the cases below probably should not + // accept PKCS#8. + ret = d2i_PrivateKey(EVP_PKEY_RSA, x, &p, len); + } else if (strcmp(nm, PEM_STRING_EC) == 0) { + ret = d2i_PrivateKey(EVP_PKEY_EC, x, &p, len); + } else if (strcmp(nm, PEM_STRING_DSA) == 0) { + ret = d2i_PrivateKey(EVP_PKEY_DSA, x, &p, len); + } +p8err: + if (ret == nullptr) { + OPENSSL_PUT_ERROR(PEM, ERR_R_ASN1_LIB); + } + +err: + OPENSSL_free(nm); + OPENSSL_free(data); + return ret; +} + +int PEM_write_bio_PrivateKey(BIO *bp, EVP_PKEY *x, const EVP_CIPHER *enc, + const unsigned char *pass, int pass_len, + pem_password_cb *cb, void *u) { + return PEM_write_bio_PKCS8PrivateKey(bp, x, enc, (const char *)pass, pass_len, + cb, u); +} + +EVP_PKEY *PEM_read_PrivateKey(FILE *fp, EVP_PKEY **x, pem_password_cb *cb, + void *u) { + BIO *b = BIO_new_fp(fp, BIO_NOCLOSE); + if (b == nullptr) { + OPENSSL_PUT_ERROR(PEM, ERR_R_BUF_LIB); + return nullptr; + } + EVP_PKEY *ret = PEM_read_bio_PrivateKey(b, x, cb, u); + BIO_free(b); + return ret; +} + +int PEM_write_PrivateKey(FILE *fp, EVP_PKEY *x, const EVP_CIPHER *enc, + const unsigned char *pass, int pass_len, + pem_password_cb *cb, void *u) { + BIO *b = BIO_new_fp(fp, BIO_NOCLOSE); + if (b == nullptr) { + OPENSSL_PUT_ERROR(PEM, ERR_R_BUF_LIB); + return 0; + } + int ret = PEM_write_bio_PrivateKey(b, x, enc, pass, pass_len, cb, u); + BIO_free(b); + return ret; +} diff --git a/third_party/boringssl/src/crypto/pem/pem_x509.c b/third_party/boringssl/src/crypto/pem/pem_x509.c deleted file mode 100644 index 97f814db..00000000 --- a/third_party/boringssl/src/crypto/pem/pem_x509.c +++ /dev/null @@ -1,65 +0,0 @@ -/* pem_x509.c */ -/* - * Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL project - * 2001. - */ -/* ==================================================================== - * Copyright (c) 2001 The OpenSSL Project. All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * licensing@OpenSSL.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). */ - -#include - -#include -#include -#include -#include - -IMPLEMENT_PEM_rw(X509, X509, PEM_STRING_X509, X509) diff --git a/third_party/boringssl/src/crypto/pem/pem_x509.cc b/third_party/boringssl/src/crypto/pem/pem_x509.cc new file mode 100644 index 00000000..e3399144 --- /dev/null +++ b/third_party/boringssl/src/crypto/pem/pem_x509.cc @@ -0,0 +1,22 @@ +// Copyright 2001-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include +#include +#include + +IMPLEMENT_PEM_rw(X509, X509, PEM_STRING_X509, X509) diff --git a/third_party/boringssl/src/crypto/pem/pem_xaux.c b/third_party/boringssl/src/crypto/pem/pem_xaux.c deleted file mode 100644 index b0cceca3..00000000 --- a/third_party/boringssl/src/crypto/pem/pem_xaux.c +++ /dev/null @@ -1,65 +0,0 @@ -/* pem_xaux.c */ -/* - * Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL project - * 2001. - */ -/* ==================================================================== - * Copyright (c) 2001 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * licensing@OpenSSL.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. 
Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). */ - -#include - -#include -#include -#include -#include - -IMPLEMENT_PEM_rw(X509_AUX, X509, PEM_STRING_X509_TRUSTED, X509_AUX) diff --git a/third_party/boringssl/src/crypto/pem/pem_xaux.cc b/third_party/boringssl/src/crypto/pem/pem_xaux.cc new file mode 100644 index 00000000..56a2b1e5 --- /dev/null +++ b/third_party/boringssl/src/crypto/pem/pem_xaux.cc @@ -0,0 +1,22 @@ +// Copyright 2001-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include +#include + +IMPLEMENT_PEM_rw(X509_AUX, X509, PEM_STRING_X509_TRUSTED, X509_AUX) diff --git a/third_party/boringssl/src/crypto/pkcs7/internal.h b/third_party/boringssl/src/crypto/pkcs7/internal.h index 5ee8e8a1..b85f2be7 100644 --- a/third_party/boringssl/src/crypto/pkcs7/internal.h +++ b/third_party/boringssl/src/crypto/pkcs7/internal.h @@ -1,26 +1,24 @@ -/* Copyright (c) 2017, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#ifndef OPENSSL_HEADER_PKCS7_INTERNAL_H -#define OPENSSL_HEADER_PKCS7_INTERNAL_H +// Copyright 2017 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_CRYPTO_PKCS7_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_PKCS7_INTERNAL_H #include -#if defined(__cplusplus) -extern "C" { -#endif +BSSL_NAMESPACE_BEGIN // pkcs7_parse_header reads the non-certificate/non-CRL prefix of a PKCS#7 // SignedData blob from |cbs| and sets |*out| to point to the rest of the @@ -36,6 +34,8 @@ int pkcs7_parse_header(uint8_t **der_bytes, CBS *out, CBS *cbs); // doing so it makes callbacks to let the caller fill in parts of the structure. // All callbacks are ignored if NULL and return one on success or zero on error. // +// signed_data_version: version number of the SignedData structure. In PKCS#7, +// it is always 1. In CMS, it depends on the features used. // digest_algos_cb: may write AlgorithmIdentifiers into the given CBB, which // is a SET of digest algorithms. // cert_crl_cb: may write the |certificates| or |crls| fields. @@ -44,15 +44,24 @@ int pkcs7_parse_header(uint8_t **der_bytes, CBS *out, CBS *cbs); // (See https://datatracker.ietf.org/doc/html/rfc2315#section-9.1) // // pkcs7_add_signed_data returns one on success or zero on error. 
-int pkcs7_add_signed_data(CBB *out, - int (*digest_algos_cb)(CBB *out, const void *arg), - int (*cert_crl_cb)(CBB *out, const void *arg), - int (*signer_infos_cb)(CBB *out, const void *arg), - const void *arg); +int pkcs7_add_signed_data(CBB *out, uint64_t signed_data_version, + int (*digest_algos_cb)(CBB *out, void *arg), + int (*cert_crl_cb)(CBB *out, void *arg), + int (*signer_infos_cb)(CBB *out, void *arg), + void *arg); +// pkcs7_add_external_signature writes a PKCS#7 or CMS SignedData structure to +// |out|, containing an external (i.e. the contents are not included) signature, +// using |sign_cert| and |key| to sign the contents of |data| with |md|. If +// |use_key_id| is true (CMS-only), the SignerInfo specifies the signer with key +// identifier. Otherwise, it uses issuer and serial number (PKCS#7 or CMS v1). +// The SignedData will have no embedded certificates and no attributes. +// +// Note: CMS v1 and PKCS#7 v1.5 are not completely compatible, but they overlap +// in all cases implemented by this function. +int pkcs7_add_external_signature(CBB *out, X509 *sign_cert, EVP_PKEY *key, + const EVP_MD *md, BIO *data, bool use_key_id); -#if defined(__cplusplus) -} // extern C -#endif +BSSL_NAMESPACE_END -#endif // OPENSSL_HEADER_PKCS7_INTERNAL_H +#endif // OPENSSL_HEADER_CRYPTO_PKCS7_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/pkcs7/pkcs7.c b/third_party/boringssl/src/crypto/pkcs7/pkcs7.c deleted file mode 100644 index 8232af62..00000000 --- a/third_party/boringssl/src/crypto/pkcs7/pkcs7.c +++ /dev/null @@ -1,193 +0,0 @@ -/* Copyright (c) 2014, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. 
- * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include - -#include -#include -#include -#include -#include - -#include "internal.h" -#include "../bytestring/internal.h" - - -// 1.2.840.113549.1.7.1 -static const uint8_t kPKCS7Data[] = {0x2a, 0x86, 0x48, 0x86, 0xf7, - 0x0d, 0x01, 0x07, 0x01}; - -// 1.2.840.113549.1.7.2 -static const uint8_t kPKCS7SignedData[] = {0x2a, 0x86, 0x48, 0x86, 0xf7, - 0x0d, 0x01, 0x07, 0x02}; - -// pkcs7_parse_header reads the non-certificate/non-CRL prefix of a PKCS#7 -// SignedData blob from |cbs| and sets |*out| to point to the rest of the -// input. If the input is in BER format, then |*der_bytes| will be set to a -// pointer that needs to be freed by the caller once they have finished -// processing |*out| (which will be pointing into |*der_bytes|). -// -// It returns one on success or zero on error. On error, |*der_bytes| is -// NULL. -int pkcs7_parse_header(uint8_t **der_bytes, CBS *out, CBS *cbs) { - CBS in, content_info, content_type, wrapped_signed_data, signed_data; - uint64_t version; - - // The input may be in BER format. 
- *der_bytes = NULL; - if (!CBS_asn1_ber_to_der(cbs, &in, der_bytes) || - // See https://tools.ietf.org/html/rfc2315#section-7 - !CBS_get_asn1(&in, &content_info, CBS_ASN1_SEQUENCE) || - !CBS_get_asn1(&content_info, &content_type, CBS_ASN1_OBJECT)) { - goto err; - } - - if (!CBS_mem_equal(&content_type, kPKCS7SignedData, - sizeof(kPKCS7SignedData))) { - OPENSSL_PUT_ERROR(PKCS7, PKCS7_R_NOT_PKCS7_SIGNED_DATA); - goto err; - } - - // See https://tools.ietf.org/html/rfc2315#section-9.1 - if (!CBS_get_asn1(&content_info, &wrapped_signed_data, - CBS_ASN1_CONTEXT_SPECIFIC | CBS_ASN1_CONSTRUCTED | 0) || - !CBS_get_asn1(&wrapped_signed_data, &signed_data, CBS_ASN1_SEQUENCE) || - !CBS_get_asn1_uint64(&signed_data, &version) || - !CBS_get_asn1(&signed_data, NULL /* digests */, CBS_ASN1_SET) || - !CBS_get_asn1(&signed_data, NULL /* content */, CBS_ASN1_SEQUENCE)) { - goto err; - } - - if (version < 1) { - OPENSSL_PUT_ERROR(PKCS7, PKCS7_R_BAD_PKCS7_VERSION); - goto err; - } - - CBS_init(out, CBS_data(&signed_data), CBS_len(&signed_data)); - return 1; - -err: - OPENSSL_free(*der_bytes); - *der_bytes = NULL; - return 0; -} - -int PKCS7_get_raw_certificates(STACK_OF(CRYPTO_BUFFER) *out_certs, CBS *cbs, - CRYPTO_BUFFER_POOL *pool) { - CBS signed_data, certificates; - uint8_t *der_bytes = NULL; - int ret = 0, has_certificates; - const size_t initial_certs_len = sk_CRYPTO_BUFFER_num(out_certs); - - // See https://tools.ietf.org/html/rfc2315#section-9.1 - if (!pkcs7_parse_header(&der_bytes, &signed_data, cbs) || - !CBS_get_optional_asn1( - &signed_data, &certificates, &has_certificates, - CBS_ASN1_CONTEXT_SPECIFIC | CBS_ASN1_CONSTRUCTED | 0)) { - goto err; - } - - if (!has_certificates) { - CBS_init(&certificates, NULL, 0); - } - - while (CBS_len(&certificates) > 0) { - CBS cert; - if (!CBS_get_asn1_element(&certificates, &cert, CBS_ASN1_SEQUENCE)) { - goto err; - } - - CRYPTO_BUFFER *buf = CRYPTO_BUFFER_new_from_CBS(&cert, pool); - if (buf == NULL || - 
!sk_CRYPTO_BUFFER_push(out_certs, buf)) { - CRYPTO_BUFFER_free(buf); - goto err; - } - } - - ret = 1; - -err: - OPENSSL_free(der_bytes); - - if (!ret) { - while (sk_CRYPTO_BUFFER_num(out_certs) != initial_certs_len) { - CRYPTO_BUFFER *buf = sk_CRYPTO_BUFFER_pop(out_certs); - CRYPTO_BUFFER_free(buf); - } - } - - return ret; -} - -static int pkcs7_bundle_raw_certificates_cb(CBB *out, const void *arg) { - const STACK_OF(CRYPTO_BUFFER) *certs = arg; - CBB certificates; - - // See https://tools.ietf.org/html/rfc2315#section-9.1 - if (!CBB_add_asn1(out, &certificates, - CBS_ASN1_CONTEXT_SPECIFIC | CBS_ASN1_CONSTRUCTED | 0)) { - return 0; - } - - for (size_t i = 0; i < sk_CRYPTO_BUFFER_num(certs); i++) { - CRYPTO_BUFFER *cert = sk_CRYPTO_BUFFER_value(certs, i); - if (!CBB_add_bytes(&certificates, CRYPTO_BUFFER_data(cert), - CRYPTO_BUFFER_len(cert))) { - return 0; - } - } - - // |certificates| is a implicitly-tagged SET OF. - return CBB_flush_asn1_set_of(&certificates) && CBB_flush(out); -} - -int PKCS7_bundle_raw_certificates(CBB *out, - const STACK_OF(CRYPTO_BUFFER) *certs) { - return pkcs7_add_signed_data(out, /*digest_algos_cb=*/NULL, - pkcs7_bundle_raw_certificates_cb, - /*signer_infos_cb=*/NULL, certs); -} - -int pkcs7_add_signed_data(CBB *out, - int (*digest_algos_cb)(CBB *out, const void *arg), - int (*cert_crl_cb)(CBB *out, const void *arg), - int (*signer_infos_cb)(CBB *out, const void *arg), - const void *arg) { - CBB outer_seq, oid, wrapped_seq, seq, version_bytes, digest_algos_set, - content_info, signer_infos; - - // See https://tools.ietf.org/html/rfc2315#section-7 - if (!CBB_add_asn1(out, &outer_seq, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1(&outer_seq, &oid, CBS_ASN1_OBJECT) || - !CBB_add_bytes(&oid, kPKCS7SignedData, sizeof(kPKCS7SignedData)) || - !CBB_add_asn1(&outer_seq, &wrapped_seq, - CBS_ASN1_CONTEXT_SPECIFIC | CBS_ASN1_CONSTRUCTED | 0) || - // See https://tools.ietf.org/html/rfc2315#section-9.1 - !CBB_add_asn1(&wrapped_seq, &seq, CBS_ASN1_SEQUENCE) || - 
!CBB_add_asn1(&seq, &version_bytes, CBS_ASN1_INTEGER) || - !CBB_add_u8(&version_bytes, 1) || - !CBB_add_asn1(&seq, &digest_algos_set, CBS_ASN1_SET) || - (digest_algos_cb != NULL && !digest_algos_cb(&digest_algos_set, arg)) || - !CBB_add_asn1(&seq, &content_info, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1(&content_info, &oid, CBS_ASN1_OBJECT) || - !CBB_add_bytes(&oid, kPKCS7Data, sizeof(kPKCS7Data)) || - (cert_crl_cb != NULL && !cert_crl_cb(&seq, arg)) || - !CBB_add_asn1(&seq, &signer_infos, CBS_ASN1_SET) || - (signer_infos_cb != NULL && !signer_infos_cb(&signer_infos, arg))) { - return 0; - } - - return CBB_flush(out); -} diff --git a/third_party/boringssl/src/crypto/pkcs7/pkcs7.cc b/third_party/boringssl/src/crypto/pkcs7/pkcs7.cc new file mode 100644 index 00000000..71e9ab84 --- /dev/null +++ b/third_party/boringssl/src/crypto/pkcs7/pkcs7.cc @@ -0,0 +1,198 @@ +// Copyright 2014 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include +#include +#include +#include + +#include "../bytestring/internal.h" +#include "internal.h" + + +using namespace bssl; + +// 1.2.840.113549.1.7.1 +static const uint8_t kPKCS7Data[] = {0x2a, 0x86, 0x48, 0x86, 0xf7, + 0x0d, 0x01, 0x07, 0x01}; + +// 1.2.840.113549.1.7.2 +static const uint8_t kPKCS7SignedData[] = {0x2a, 0x86, 0x48, 0x86, 0xf7, + 0x0d, 0x01, 0x07, 0x02}; + +// pkcs7_parse_header reads the non-certificate/non-CRL prefix of a PKCS#7 +// SignedData blob from |cbs| and sets |*out| to point to the rest of the +// input. If the input is in BER format, then |*der_bytes| will be set to a +// pointer that needs to be freed by the caller once they have finished +// processing |*out| (which will be pointing into |*der_bytes|). +// +// It returns one on success or zero on error. On error, |*der_bytes| is +// NULL. +int bssl::pkcs7_parse_header(uint8_t **der_bytes, CBS *out, CBS *cbs) { + CBS in, content_info, content_type, wrapped_signed_data, signed_data; + uint64_t version; + + // The input may be in BER format. 
+ *der_bytes = nullptr; + if (!CBS_asn1_ber_to_der(cbs, &in, der_bytes) || + // See https://tools.ietf.org/html/rfc2315#section-7 + !CBS_get_asn1(&in, &content_info, CBS_ASN1_SEQUENCE) || + !CBS_get_asn1(&content_info, &content_type, CBS_ASN1_OBJECT)) { + goto err; + } + + if (!CBS_mem_equal(&content_type, kPKCS7SignedData, + sizeof(kPKCS7SignedData))) { + OPENSSL_PUT_ERROR(PKCS7, PKCS7_R_NOT_PKCS7_SIGNED_DATA); + goto err; + } + + // See https://tools.ietf.org/html/rfc2315#section-9.1 + if (!CBS_get_asn1(&content_info, &wrapped_signed_data, + CBS_ASN1_CONTEXT_SPECIFIC | CBS_ASN1_CONSTRUCTED | 0) || + !CBS_get_asn1(&wrapped_signed_data, &signed_data, CBS_ASN1_SEQUENCE) || + !CBS_get_asn1_uint64(&signed_data, &version) || + !CBS_get_asn1(&signed_data, nullptr /* digests */, CBS_ASN1_SET) || + !CBS_get_asn1(&signed_data, nullptr /* content */, CBS_ASN1_SEQUENCE)) { + goto err; + } + + if (version < 1) { + OPENSSL_PUT_ERROR(PKCS7, PKCS7_R_BAD_PKCS7_VERSION); + goto err; + } + + CBS_init(out, CBS_data(&signed_data), CBS_len(&signed_data)); + return 1; + +err: + OPENSSL_free(*der_bytes); + *der_bytes = nullptr; + return 0; +} + +int PKCS7_get_raw_certificates(STACK_OF(CRYPTO_BUFFER) *out_certs, CBS *cbs, + CRYPTO_BUFFER_POOL *pool) { + CBS signed_data, certificates; + uint8_t *der_bytes = nullptr; + int ret = 0, has_certificates; + const size_t initial_certs_len = sk_CRYPTO_BUFFER_num(out_certs); + + // See https://tools.ietf.org/html/rfc2315#section-9.1 + if (!pkcs7_parse_header(&der_bytes, &signed_data, cbs) || + !CBS_get_optional_asn1( + &signed_data, &certificates, &has_certificates, + CBS_ASN1_CONTEXT_SPECIFIC | CBS_ASN1_CONSTRUCTED | 0)) { + goto err; + } + + if (!has_certificates) { + CBS_init(&certificates, nullptr, 0); + } + + while (CBS_len(&certificates) > 0) { + CBS cert; + if (!CBS_get_asn1_element(&certificates, &cert, CBS_ASN1_SEQUENCE)) { + goto err; + } + + CRYPTO_BUFFER *buf = CRYPTO_BUFFER_new_from_CBS(&cert, pool); + if (buf == nullptr || 
!sk_CRYPTO_BUFFER_push(out_certs, buf)) { + CRYPTO_BUFFER_free(buf); + goto err; + } + } + + ret = 1; + +err: + OPENSSL_free(der_bytes); + + if (!ret) { + while (sk_CRYPTO_BUFFER_num(out_certs) != initial_certs_len) { + CRYPTO_BUFFER *buf = sk_CRYPTO_BUFFER_pop(out_certs); + CRYPTO_BUFFER_free(buf); + } + } + + return ret; +} + +static int pkcs7_bundle_raw_certificates_cb(CBB *out, void *arg) { + const STACK_OF(CRYPTO_BUFFER) *certs = + reinterpret_cast(arg); + CBB certificates; + + // See https://tools.ietf.org/html/rfc2315#section-9.1 + if (!CBB_add_asn1(out, &certificates, + CBS_ASN1_CONTEXT_SPECIFIC | CBS_ASN1_CONSTRUCTED | 0)) { + return 0; + } + + for (size_t i = 0; i < sk_CRYPTO_BUFFER_num(certs); i++) { + CRYPTO_BUFFER *cert = sk_CRYPTO_BUFFER_value(certs, i); + if (!CBB_add_bytes(&certificates, CRYPTO_BUFFER_data(cert), + CRYPTO_BUFFER_len(cert))) { + return 0; + } + } + + // |certificates| is a implicitly-tagged SET OF. + return CBB_flush_asn1_set_of(&certificates) && CBB_flush(out); +} + +int PKCS7_bundle_raw_certificates(CBB *out, + const STACK_OF(CRYPTO_BUFFER) *certs) { + return pkcs7_add_signed_data(out, /*signed_data_version=*/1, + /*digest_algos_cb=*/nullptr, + pkcs7_bundle_raw_certificates_cb, + /*signer_infos_cb=*/nullptr, + const_cast(certs)); +} + +int bssl::pkcs7_add_signed_data(CBB *out, uint64_t signed_data_version, + int (*digest_algos_cb)(CBB *out, void *arg), + int (*cert_crl_cb)(CBB *out, void *arg), + int (*signer_infos_cb)(CBB *out, void *arg), + void *arg) { + CBB outer_seq, wrapped_seq, seq, digest_algos_set, content_info, signer_infos; + + // See https://tools.ietf.org/html/rfc2315#section-7 + if (!CBB_add_asn1(out, &outer_seq, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_element(&outer_seq, CBS_ASN1_OBJECT, kPKCS7SignedData, + sizeof(kPKCS7SignedData)) || + !CBB_add_asn1(&outer_seq, &wrapped_seq, + CBS_ASN1_CONTEXT_SPECIFIC | CBS_ASN1_CONSTRUCTED | 0) || + // See https://tools.ietf.org/html/rfc2315#section-9.1 + 
!CBB_add_asn1(&wrapped_seq, &seq, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_uint64(&seq, signed_data_version) || + !CBB_add_asn1(&seq, &digest_algos_set, CBS_ASN1_SET) || + (digest_algos_cb != nullptr && + !digest_algos_cb(&digest_algos_set, arg)) || + !CBB_flush_asn1_set_of(&digest_algos_set) || + !CBB_add_asn1(&seq, &content_info, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_element(&content_info, CBS_ASN1_OBJECT, kPKCS7Data, + sizeof(kPKCS7Data)) || + (cert_crl_cb != nullptr && !cert_crl_cb(&seq, arg)) || + !CBB_add_asn1(&seq, &signer_infos, CBS_ASN1_SET) || + (signer_infos_cb != nullptr && !signer_infos_cb(&signer_infos, arg)) || + !CBB_flush_asn1_set_of(&signer_infos)) { + return 0; + } + + return CBB_flush(out); +} diff --git a/third_party/boringssl/src/crypto/pkcs7/pkcs7_x509.c b/third_party/boringssl/src/crypto/pkcs7/pkcs7_x509.c deleted file mode 100644 index 773c5923..00000000 --- a/third_party/boringssl/src/crypto/pkcs7/pkcs7_x509.c +++ /dev/null @@ -1,526 +0,0 @@ -/* Copyright (c) 2017, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ - -#include - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "internal.h" -#include "../internal.h" - - -int PKCS7_get_certificates(STACK_OF(X509) *out_certs, CBS *cbs) { - int ret = 0; - const size_t initial_certs_len = sk_X509_num(out_certs); - STACK_OF(CRYPTO_BUFFER) *raw = sk_CRYPTO_BUFFER_new_null(); - if (raw == NULL || - !PKCS7_get_raw_certificates(raw, cbs, NULL)) { - goto err; - } - - for (size_t i = 0; i < sk_CRYPTO_BUFFER_num(raw); i++) { - CRYPTO_BUFFER *buf = sk_CRYPTO_BUFFER_value(raw, i); - X509 *x509 = X509_parse_from_buffer(buf); - if (x509 == NULL || - !sk_X509_push(out_certs, x509)) { - X509_free(x509); - goto err; - } - } - - ret = 1; - -err: - sk_CRYPTO_BUFFER_pop_free(raw, CRYPTO_BUFFER_free); - if (!ret) { - while (sk_X509_num(out_certs) != initial_certs_len) { - X509 *x509 = sk_X509_pop(out_certs); - X509_free(x509); - } - } - - return ret; -} - -int PKCS7_get_CRLs(STACK_OF(X509_CRL) *out_crls, CBS *cbs) { - CBS signed_data, crls; - uint8_t *der_bytes = NULL; - int ret = 0, has_crls; - const size_t initial_crls_len = sk_X509_CRL_num(out_crls); - - // See https://tools.ietf.org/html/rfc2315#section-9.1 - if (!pkcs7_parse_header(&der_bytes, &signed_data, cbs) || - // Even if only CRLs are included, there may be an empty certificates - // block. OpenSSL does this, for example. 
- !CBS_get_optional_asn1( - &signed_data, NULL, NULL, - CBS_ASN1_CONTEXT_SPECIFIC | CBS_ASN1_CONSTRUCTED | 0) || - !CBS_get_optional_asn1( - &signed_data, &crls, &has_crls, - CBS_ASN1_CONTEXT_SPECIFIC | CBS_ASN1_CONSTRUCTED | 1)) { - goto err; - } - - if (!has_crls) { - CBS_init(&crls, NULL, 0); - } - - while (CBS_len(&crls) > 0) { - CBS crl_data; - X509_CRL *crl; - const uint8_t *inp; - - if (!CBS_get_asn1_element(&crls, &crl_data, CBS_ASN1_SEQUENCE)) { - goto err; - } - - if (CBS_len(&crl_data) > LONG_MAX) { - goto err; - } - inp = CBS_data(&crl_data); - crl = d2i_X509_CRL(NULL, &inp, (long)CBS_len(&crl_data)); - if (!crl) { - goto err; - } - - assert(inp == CBS_data(&crl_data) + CBS_len(&crl_data)); - - if (sk_X509_CRL_push(out_crls, crl) == 0) { - X509_CRL_free(crl); - goto err; - } - } - - ret = 1; - -err: - OPENSSL_free(der_bytes); - - if (!ret) { - while (sk_X509_CRL_num(out_crls) != initial_crls_len) { - X509_CRL_free(sk_X509_CRL_pop(out_crls)); - } - } - - return ret; -} - -int PKCS7_get_PEM_certificates(STACK_OF(X509) *out_certs, BIO *pem_bio) { - uint8_t *data; - long len; - int ret; - - // Even though we pass PEM_STRING_PKCS7 as the expected PEM type here, PEM - // internally will actually allow several other values too, including - // "CERTIFICATE". - if (!PEM_bytes_read_bio(&data, &len, NULL /* PEM type output */, - PEM_STRING_PKCS7, pem_bio, - NULL /* password callback */, - NULL /* password callback argument */)) { - return 0; - } - - CBS cbs; - CBS_init(&cbs, data, len); - ret = PKCS7_get_certificates(out_certs, &cbs); - OPENSSL_free(data); - return ret; -} - -int PKCS7_get_PEM_CRLs(STACK_OF(X509_CRL) *out_crls, BIO *pem_bio) { - uint8_t *data; - long len; - int ret; - - // Even though we pass PEM_STRING_PKCS7 as the expected PEM type here, PEM - // internally will actually allow several other values too, including - // "CERTIFICATE". 
- if (!PEM_bytes_read_bio(&data, &len, NULL /* PEM type output */, - PEM_STRING_PKCS7, pem_bio, - NULL /* password callback */, - NULL /* password callback argument */)) { - return 0; - } - - CBS cbs; - CBS_init(&cbs, data, len); - ret = PKCS7_get_CRLs(out_crls, &cbs); - OPENSSL_free(data); - return ret; -} - -static int pkcs7_bundle_certificates_cb(CBB *out, const void *arg) { - const STACK_OF(X509) *certs = arg; - size_t i; - CBB certificates; - - // See https://tools.ietf.org/html/rfc2315#section-9.1 - if (!CBB_add_asn1(out, &certificates, - CBS_ASN1_CONTEXT_SPECIFIC | CBS_ASN1_CONSTRUCTED | 0)) { - return 0; - } - - for (i = 0; i < sk_X509_num(certs); i++) { - X509 *x509 = sk_X509_value(certs, i); - uint8_t *buf; - int len = i2d_X509(x509, NULL); - - if (len < 0 || - !CBB_add_space(&certificates, &buf, len) || - i2d_X509(x509, &buf) < 0) { - return 0; - } - } - - // |certificates| is a implicitly-tagged SET OF. - return CBB_flush_asn1_set_of(&certificates) && CBB_flush(out); -} - -int PKCS7_bundle_certificates(CBB *out, const STACK_OF(X509) *certs) { - return pkcs7_add_signed_data(out, /*digest_algos_cb=*/NULL, - pkcs7_bundle_certificates_cb, - /*signer_infos_cb=*/NULL, certs); -} - -static int pkcs7_bundle_crls_cb(CBB *out, const void *arg) { - const STACK_OF(X509_CRL) *crls = arg; - size_t i; - CBB crl_data; - - // See https://tools.ietf.org/html/rfc2315#section-9.1 - if (!CBB_add_asn1(out, &crl_data, - CBS_ASN1_CONTEXT_SPECIFIC | CBS_ASN1_CONSTRUCTED | 1)) { - return 0; - } - - for (i = 0; i < sk_X509_CRL_num(crls); i++) { - X509_CRL *crl = sk_X509_CRL_value(crls, i); - uint8_t *buf; - int len = i2d_X509_CRL(crl, NULL); - - if (len < 0 || - !CBB_add_space(&crl_data, &buf, len) || - i2d_X509_CRL(crl, &buf) < 0) { - return 0; - } - } - - // |crl_data| is a implicitly-tagged SET OF. 
- return CBB_flush_asn1_set_of(&crl_data) && CBB_flush(out); -} - -int PKCS7_bundle_CRLs(CBB *out, const STACK_OF(X509_CRL) *crls) { - return pkcs7_add_signed_data(out, /*digest_algos_cb=*/NULL, - pkcs7_bundle_crls_cb, - /*signer_infos_cb=*/NULL, crls); -} - -static PKCS7 *pkcs7_new(CBS *cbs) { - PKCS7 *ret = OPENSSL_malloc(sizeof(PKCS7)); - if (ret == NULL) { - return NULL; - } - OPENSSL_memset(ret, 0, sizeof(PKCS7)); - ret->type = OBJ_nid2obj(NID_pkcs7_signed); - ret->d.sign = OPENSSL_malloc(sizeof(PKCS7_SIGNED)); - if (ret->d.sign == NULL) { - goto err; - } - ret->d.sign->cert = sk_X509_new_null(); - ret->d.sign->crl = sk_X509_CRL_new_null(); - CBS copy = *cbs, copy2 = *cbs; - if (ret->d.sign->cert == NULL || ret->d.sign->crl == NULL || - !PKCS7_get_certificates(ret->d.sign->cert, ©) || - !PKCS7_get_CRLs(ret->d.sign->crl, cbs)) { - goto err; - } - - if (sk_X509_num(ret->d.sign->cert) == 0) { - sk_X509_free(ret->d.sign->cert); - ret->d.sign->cert = NULL; - } - - if (sk_X509_CRL_num(ret->d.sign->crl) == 0) { - sk_X509_CRL_free(ret->d.sign->crl); - ret->d.sign->crl = NULL; - } - - ret->ber_len = CBS_len(©2) - CBS_len(cbs); - ret->ber_bytes = OPENSSL_memdup(CBS_data(©2), ret->ber_len); - if (ret->ber_bytes == NULL) { - goto err; - } - - return ret; - -err: - PKCS7_free(ret); - return NULL; -} - -PKCS7 *d2i_PKCS7(PKCS7 **out, const uint8_t **inp, - size_t len) { - CBS cbs; - CBS_init(&cbs, *inp, len); - PKCS7 *ret = pkcs7_new(&cbs); - if (ret == NULL) { - return NULL; - } - *inp = CBS_data(&cbs); - if (out != NULL) { - PKCS7_free(*out); - *out = ret; - } - return ret; -} - -PKCS7 *d2i_PKCS7_bio(BIO *bio, PKCS7 **out) { - // Use a generous bound, to allow for PKCS#7 files containing large root sets. 
- static const size_t kMaxSize = 4 * 1024 * 1024; - uint8_t *data; - size_t len; - if (!BIO_read_asn1(bio, &data, &len, kMaxSize)) { - return NULL; - } - - CBS cbs; - CBS_init(&cbs, data, len); - PKCS7 *ret = pkcs7_new(&cbs); - OPENSSL_free(data); - if (out != NULL && ret != NULL) { - PKCS7_free(*out); - *out = ret; - } - return ret; -} - -int i2d_PKCS7(const PKCS7 *p7, uint8_t **out) { - if (p7->ber_len > INT_MAX) { - OPENSSL_PUT_ERROR(PKCS8, ERR_R_OVERFLOW); - return -1; - } - - if (out == NULL) { - return (int)p7->ber_len; - } - - if (*out == NULL) { - *out = OPENSSL_malloc(p7->ber_len); - if (*out == NULL) { - OPENSSL_PUT_ERROR(PKCS8, ERR_R_MALLOC_FAILURE); - return -1; - } - OPENSSL_memcpy(*out, p7->ber_bytes, p7->ber_len); - } else { - OPENSSL_memcpy(*out, p7->ber_bytes, p7->ber_len); - *out += p7->ber_len; - } - return (int)p7->ber_len; -} - -int i2d_PKCS7_bio(BIO *bio, const PKCS7 *p7) { - return BIO_write_all(bio, p7->ber_bytes, p7->ber_len); -} - -void PKCS7_free(PKCS7 *p7) { - if (p7 == NULL) { - return; - } - - OPENSSL_free(p7->ber_bytes); - ASN1_OBJECT_free(p7->type); - // We only supported signed data. - if (p7->d.sign != NULL) { - sk_X509_pop_free(p7->d.sign->cert, X509_free); - sk_X509_CRL_pop_free(p7->d.sign->crl, X509_CRL_free); - OPENSSL_free(p7->d.sign); - } - OPENSSL_free(p7); -} - -// We only support signed data, so these getters are no-ops. -int PKCS7_type_is_data(const PKCS7 *p7) { return 0; } -int PKCS7_type_is_digest(const PKCS7 *p7) { return 0; } -int PKCS7_type_is_encrypted(const PKCS7 *p7) { return 0; } -int PKCS7_type_is_enveloped(const PKCS7 *p7) { return 0; } -int PKCS7_type_is_signed(const PKCS7 *p7) { return 1; } -int PKCS7_type_is_signedAndEnveloped(const PKCS7 *p7) { return 0; } - -// write_sha256_ai writes an AlgorithmIdentifier for SHA-256 to -// |digest_algos_set|. 
-static int write_sha256_ai(CBB *digest_algos_set, const void *arg) { - CBB seq; - return CBB_add_asn1(digest_algos_set, &seq, CBS_ASN1_SEQUENCE) && - OBJ_nid2cbb(&seq, NID_sha256) && // - // https://datatracker.ietf.org/doc/html/rfc5754#section-2 - // "Implementations MUST generate SHA2 AlgorithmIdentifiers with absent - // parameters." - CBB_flush(digest_algos_set); -} - -// sign_sha256 writes at most |max_out_sig| bytes of the signature of |data| by -// |pkey| to |out_sig| and sets |*out_sig_len| to the number of bytes written. -// It returns one on success or zero on error. -static int sign_sha256(uint8_t *out_sig, size_t *out_sig_len, - size_t max_out_sig, EVP_PKEY *pkey, BIO *data) { - static const size_t kBufSize = 4096; - uint8_t *buffer = OPENSSL_malloc(kBufSize); - if (!buffer) { - return 0; - } - - EVP_MD_CTX ctx; - EVP_MD_CTX_init(&ctx); - - int ret = 0; - if (!EVP_DigestSignInit(&ctx, NULL, EVP_sha256(), NULL, pkey)) { - goto out; - } - - for (;;) { - const int n = BIO_read(data, buffer, kBufSize); - if (n == 0) { - break; - } else if (n < 0 || !EVP_DigestSignUpdate(&ctx, buffer, n)) { - goto out; - } - } - - *out_sig_len = max_out_sig; - if (!EVP_DigestSignFinal(&ctx, out_sig, out_sig_len)) { - goto out; - } - - ret = 1; - -out: - EVP_MD_CTX_cleanup(&ctx); - OPENSSL_free(buffer); - return ret; -} - -struct signer_info_data { - const X509 *sign_cert; - uint8_t *signature; - size_t signature_len; -}; - -// write_signer_info writes the SignerInfo structure from -// https://datatracker.ietf.org/doc/html/rfc2315#section-9.2 to |out|. It -// returns one on success or zero on error. 
-static int write_signer_info(CBB *out, const void *arg) { - const struct signer_info_data *const si_data = arg; - - int ret = 0; - uint8_t *subject_bytes = NULL; - uint8_t *serial_bytes = NULL; - - const int subject_len = - i2d_X509_NAME(X509_get_subject_name(si_data->sign_cert), &subject_bytes); - const int serial_len = i2d_ASN1_INTEGER( - (ASN1_INTEGER *)X509_get0_serialNumber(si_data->sign_cert), - &serial_bytes); - - CBB seq, issuer_and_serial, signing_algo, null, signature; - if (subject_len < 0 || - serial_len < 0 || - !CBB_add_asn1(out, &seq, CBS_ASN1_SEQUENCE) || - // version - !CBB_add_asn1_uint64(&seq, 1) || - !CBB_add_asn1(&seq, &issuer_and_serial, CBS_ASN1_SEQUENCE) || - !CBB_add_bytes(&issuer_and_serial, subject_bytes, subject_len) || - !CBB_add_bytes(&issuer_and_serial, serial_bytes, serial_len) || - !write_sha256_ai(&seq, NULL) || - !CBB_add_asn1(&seq, &signing_algo, CBS_ASN1_SEQUENCE) || - !OBJ_nid2cbb(&signing_algo, NID_rsaEncryption) || - !CBB_add_asn1(&signing_algo, &null, CBS_ASN1_NULL) || - !CBB_add_asn1(&seq, &signature, CBS_ASN1_OCTETSTRING) || - !CBB_add_bytes(&signature, si_data->signature, si_data->signature_len) || - !CBB_flush(out)) { - goto out; - } - - ret = 1; - -out: - OPENSSL_free(subject_bytes); - OPENSSL_free(serial_bytes); - return ret; -} - -PKCS7 *PKCS7_sign(X509 *sign_cert, EVP_PKEY *pkey, STACK_OF(X509) *certs, - BIO *data, int flags) { - CBB cbb; - if (!CBB_init(&cbb, 2048)) { - return NULL; - } - - uint8_t *der = NULL; - size_t len; - PKCS7 *ret = NULL; - - if (sign_cert == NULL && pkey == NULL && flags == PKCS7_DETACHED) { - // Caller just wants to bundle certificates. - if (!PKCS7_bundle_certificates(&cbb, certs)) { - goto out; - } - } else if (sign_cert != NULL && pkey != NULL && certs == NULL && - data != NULL && - flags == (PKCS7_NOATTR | PKCS7_BINARY | PKCS7_NOCERTS | - PKCS7_DETACHED) && - EVP_PKEY_id(pkey) == NID_rsaEncryption) { - // sign-file.c from the Linux kernel. 
- const size_t signature_max_len = EVP_PKEY_size(pkey); - struct signer_info_data si_data = { - .sign_cert = sign_cert, - .signature = OPENSSL_malloc(signature_max_len), - }; - - if (!si_data.signature || - !sign_sha256(si_data.signature, &si_data.signature_len, - signature_max_len, pkey, data) || - !pkcs7_add_signed_data(&cbb, write_sha256_ai, /*cert_crl_cb=*/NULL, - write_signer_info, &si_data)) { - OPENSSL_free(si_data.signature); - goto out; - } - OPENSSL_free(si_data.signature); - } else { - OPENSSL_PUT_ERROR(PKCS7, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); - goto out; - } - - if (!CBB_finish(&cbb, &der, &len)) { - goto out; - } - - CBS cbs; - CBS_init(&cbs, der, len); - ret = pkcs7_new(&cbs); - -out: - CBB_cleanup(&cbb); - OPENSSL_free(der); - return ret; -} diff --git a/third_party/boringssl/src/crypto/pkcs7/pkcs7_x509.cc b/third_party/boringssl/src/crypto/pkcs7/pkcs7_x509.cc new file mode 100644 index 00000000..07975b2c --- /dev/null +++ b/third_party/boringssl/src/crypto/pkcs7/pkcs7_x509.cc @@ -0,0 +1,527 @@ +// Copyright 2017 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../asn1/internal.h" +#include "../internal.h" +#include "../mem_internal.h" +#include "../x509/internal.h" +#include "internal.h" + + +using namespace bssl; + +int PKCS7_get_certificates(STACK_OF(X509) *out_certs, CBS *cbs) { + int ret = 0; + const size_t initial_certs_len = sk_X509_num(out_certs); + STACK_OF(CRYPTO_BUFFER) *raw = sk_CRYPTO_BUFFER_new_null(); + if (raw == nullptr || !PKCS7_get_raw_certificates(raw, cbs, nullptr)) { + goto err; + } + + for (size_t i = 0; i < sk_CRYPTO_BUFFER_num(raw); i++) { + CRYPTO_BUFFER *buf = sk_CRYPTO_BUFFER_value(raw, i); + X509 *x509 = X509_parse_from_buffer(buf); + if (x509 == nullptr || !sk_X509_push(out_certs, x509)) { + X509_free(x509); + goto err; + } + } + + ret = 1; + +err: + sk_CRYPTO_BUFFER_pop_free(raw, CRYPTO_BUFFER_free); + if (!ret) { + while (sk_X509_num(out_certs) != initial_certs_len) { + X509 *x509 = sk_X509_pop(out_certs); + X509_free(x509); + } + } + + return ret; +} + +int PKCS7_get_CRLs(STACK_OF(X509_CRL) *out_crls, CBS *cbs) { + CBS signed_data, crls; + uint8_t *der_bytes = nullptr; + int ret = 0, has_crls; + const size_t initial_crls_len = sk_X509_CRL_num(out_crls); + + // See https://tools.ietf.org/html/rfc2315#section-9.1 + if (!pkcs7_parse_header(&der_bytes, &signed_data, cbs) || + // Even if only CRLs are included, there may be an empty certificates + // block. OpenSSL does this, for example. 
+ !CBS_get_optional_asn1( + &signed_data, nullptr, nullptr, + CBS_ASN1_CONTEXT_SPECIFIC | CBS_ASN1_CONSTRUCTED | 0) || + !CBS_get_optional_asn1( + &signed_data, &crls, &has_crls, + CBS_ASN1_CONTEXT_SPECIFIC | CBS_ASN1_CONSTRUCTED | 1)) { + goto err; + } + + if (!has_crls) { + CBS_init(&crls, nullptr, 0); + } + + while (CBS_len(&crls) > 0) { + CBS crl_data; + X509_CRL *crl; + const uint8_t *inp; + + if (!CBS_get_asn1_element(&crls, &crl_data, CBS_ASN1_SEQUENCE)) { + goto err; + } + + if (CBS_len(&crl_data) > LONG_MAX) { + goto err; + } + inp = CBS_data(&crl_data); + crl = d2i_X509_CRL(nullptr, &inp, (long)CBS_len(&crl_data)); + if (!crl) { + goto err; + } + + assert(inp == CBS_data(&crl_data) + CBS_len(&crl_data)); + + if (sk_X509_CRL_push(out_crls, crl) == 0) { + X509_CRL_free(crl); + goto err; + } + } + + ret = 1; + +err: + OPENSSL_free(der_bytes); + + if (!ret) { + while (sk_X509_CRL_num(out_crls) != initial_crls_len) { + X509_CRL_free(sk_X509_CRL_pop(out_crls)); + } + } + + return ret; +} + +int PKCS7_get_PEM_certificates(STACK_OF(X509) *out_certs, BIO *pem_bio) { + uint8_t *data; + long len; + int ret; + + // Even though we pass PEM_STRING_PKCS7 as the expected PEM type here, PEM + // internally will actually allow several other values too, including + // "CERTIFICATE". + if (!PEM_bytes_read_bio(&data, &len, nullptr /* PEM type output */, + PEM_STRING_PKCS7, pem_bio, + nullptr /* password callback */, + nullptr /* password callback argument */)) { + return 0; + } + + CBS cbs; + CBS_init(&cbs, data, len); + ret = PKCS7_get_certificates(out_certs, &cbs); + OPENSSL_free(data); + return ret; +} + +int PKCS7_get_PEM_CRLs(STACK_OF(X509_CRL) *out_crls, BIO *pem_bio) { + uint8_t *data; + long len; + int ret; + + // Even though we pass PEM_STRING_PKCS7 as the expected PEM type here, PEM + // internally will actually allow several other values too, including + // "CERTIFICATE". 
+ if (!PEM_bytes_read_bio(&data, &len, nullptr /* PEM type output */, + PEM_STRING_PKCS7, pem_bio, + nullptr /* password callback */, + nullptr /* password callback argument */)) { + return 0; + } + + CBS cbs; + CBS_init(&cbs, data, len); + ret = PKCS7_get_CRLs(out_crls, &cbs); + OPENSSL_free(data); + return ret; +} + +static int pkcs7_bundle_certificates_cb(CBB *out, void *arg) { + auto *certs = static_cast(arg); + size_t i; + CBB certificates; + + // See https://tools.ietf.org/html/rfc2315#section-9.1 + if (!CBB_add_asn1(out, &certificates, + CBS_ASN1_CONTEXT_SPECIFIC | CBS_ASN1_CONSTRUCTED | 0)) { + return 0; + } + + for (i = 0; i < sk_X509_num(certs); i++) { + X509 *x509 = sk_X509_value(certs, i); + uint8_t *buf; + int len = i2d_X509(x509, nullptr); + + if (len < 0 || !CBB_add_space(&certificates, &buf, len) || + i2d_X509(x509, &buf) < 0) { + return 0; + } + } + + // |certificates| is a implicitly-tagged SET OF. + return CBB_flush_asn1_set_of(&certificates) && CBB_flush(out); +} + +int PKCS7_bundle_certificates(CBB *out, const STACK_OF(X509) *certs) { + return pkcs7_add_signed_data( + out, /*signed_data_version=*/1, + /*digest_algos_cb=*/nullptr, pkcs7_bundle_certificates_cb, + /*signer_infos_cb=*/nullptr, const_cast(certs)); +} + +static int pkcs7_bundle_crls_cb(CBB *out, void *arg) { + auto *crls = static_cast(arg); + size_t i; + CBB crl_data; + + // See https://tools.ietf.org/html/rfc2315#section-9.1 + if (!CBB_add_asn1(out, &crl_data, + CBS_ASN1_CONTEXT_SPECIFIC | CBS_ASN1_CONSTRUCTED | 1)) { + return 0; + } + + for (i = 0; i < sk_X509_CRL_num(crls); i++) { + X509_CRL *crl = sk_X509_CRL_value(crls, i); + uint8_t *buf; + int len = i2d_X509_CRL(crl, nullptr); + + if (len < 0 || !CBB_add_space(&crl_data, &buf, len) || + i2d_X509_CRL(crl, &buf) < 0) { + return 0; + } + } + + // |crl_data| is a implicitly-tagged SET OF. 
+ return CBB_flush_asn1_set_of(&crl_data) && CBB_flush(out); +} + +int PKCS7_bundle_CRLs(CBB *out, const STACK_OF(X509_CRL) *crls) { + return pkcs7_add_signed_data( + out, /*signed_data_version=*/1, + /*digest_algos_cb=*/nullptr, pkcs7_bundle_crls_cb, + /*signer_infos_cb=*/nullptr, const_cast(crls)); +} + +static PKCS7 *pkcs7_new(CBS *cbs) { + CBS copy = *cbs, copy2 = *cbs; + PKCS7 *ret = New(); + if (ret == nullptr) { + return nullptr; + } + ret->type = OBJ_nid2obj(NID_pkcs7_signed); + ret->d.sign = New(); + if (ret->d.sign == nullptr) { + goto err; + } + ret->d.sign->cert = sk_X509_new_null(); + ret->d.sign->crl = sk_X509_CRL_new_null(); + if (ret->d.sign->cert == nullptr || ret->d.sign->crl == nullptr || + !PKCS7_get_certificates(ret->d.sign->cert, ©) || + !PKCS7_get_CRLs(ret->d.sign->crl, cbs)) { + goto err; + } + + if (sk_X509_num(ret->d.sign->cert) == 0) { + sk_X509_free(ret->d.sign->cert); + ret->d.sign->cert = nullptr; + } + + if (sk_X509_CRL_num(ret->d.sign->crl) == 0) { + sk_X509_CRL_free(ret->d.sign->crl); + ret->d.sign->crl = nullptr; + } + + ret->ber_len = CBS_len(©2) - CBS_len(cbs); + ret->ber_bytes = reinterpret_cast( + OPENSSL_memdup(CBS_data(©2), ret->ber_len)); + if (ret->ber_bytes == nullptr) { + goto err; + } + + return ret; + +err: + PKCS7_free(ret); + return nullptr; +} + +PKCS7 *d2i_PKCS7(PKCS7 **out, const uint8_t **inp, size_t len) { + CBS cbs; + CBS_init(&cbs, *inp, len); + PKCS7 *ret = pkcs7_new(&cbs); + if (ret == nullptr) { + return nullptr; + } + *inp = CBS_data(&cbs); + if (out != nullptr) { + PKCS7_free(*out); + *out = ret; + } + return ret; +} + +PKCS7 *d2i_PKCS7_bio(BIO *bio, PKCS7 **out) { + // Use a generous bound, to allow for PKCS#7 files containing large root sets. 
+ static const size_t kMaxSize = 4 * 1024 * 1024; + uint8_t *data; + size_t len; + if (!BIO_read_asn1(bio, &data, &len, kMaxSize)) { + return nullptr; + } + + CBS cbs; + CBS_init(&cbs, data, len); + PKCS7 *ret = pkcs7_new(&cbs); + OPENSSL_free(data); + if (out != nullptr && ret != nullptr) { + PKCS7_free(*out); + *out = ret; + } + return ret; +} + +int i2d_PKCS7(const PKCS7 *p7, uint8_t **out) { + if (p7->ber_len > INT_MAX) { + OPENSSL_PUT_ERROR(PKCS8, ERR_R_OVERFLOW); + return -1; + } + + if (out == nullptr) { + return (int)p7->ber_len; + } + + if (*out == nullptr) { + *out = + reinterpret_cast(OPENSSL_memdup(p7->ber_bytes, p7->ber_len)); + if (*out == nullptr) { + return -1; + } + } else { + OPENSSL_memcpy(*out, p7->ber_bytes, p7->ber_len); + *out += p7->ber_len; + } + return (int)p7->ber_len; +} + +int i2d_PKCS7_bio(BIO *bio, const PKCS7 *p7) { + return BIO_write_all(bio, p7->ber_bytes, p7->ber_len); +} + +void PKCS7_free(PKCS7 *p7) { + if (p7 == nullptr) { + return; + } + + OPENSSL_free(p7->ber_bytes); + ASN1_OBJECT_free(p7->type); + // We only supported signed data. + if (p7->d.sign != nullptr) { + sk_X509_pop_free(p7->d.sign->cert, X509_free); + sk_X509_CRL_pop_free(p7->d.sign->crl, X509_CRL_free); + Delete(p7->d.sign); + } + Delete(p7); +} + +// We only support signed data, so these getters are no-ops. 
+int PKCS7_type_is_data(const PKCS7 *p7) { return 0; } +int PKCS7_type_is_digest(const PKCS7 *p7) { return 0; } +int PKCS7_type_is_encrypted(const PKCS7 *p7) { return 0; } +int PKCS7_type_is_enveloped(const PKCS7 *p7) { return 0; } +int PKCS7_type_is_signed(const PKCS7 *p7) { return 1; } +int PKCS7_type_is_signedAndEnveloped(const PKCS7 *p7) { return 0; } + +static bool digest_sign_update(EVP_MD_CTX *ctx, BIO *data) { + for (;;) { + uint8_t buf[4096]; + const int n = BIO_read(data, buf, sizeof(buf)); + if (n == 0) { + return true; + } else if (n < 0 || !EVP_DigestSignUpdate(ctx, buf, n)) { + return false; + } + } +} + +namespace { +struct signer_info_data { + X509 *sign_cert = nullptr; + ScopedEVP_MD_CTX sign_ctx; + bool use_key_id = false; +}; +} // namespace + +static int write_signer_digest_algos(CBB *digest_algos_set, void *arg) { + auto *si_data = static_cast(arg); + // https://www.rfc-editor.org/rfc/rfc5754.html#section-2 + // "Implementations MUST generate SHA2 AlgorithmIdentifiers with absent + // parameters." + return EVP_marshal_digest_algorithm_no_params( + digest_algos_set, EVP_MD_CTX_get0_md(si_data->sign_ctx.get())); +} + +// write_signer_info writes the SignerInfo structure from +// https://www.rfc-editor.org/rfc/rfc2315.html#section-9.2 and +// https://www.rfc-editor.org/rfc/rfc5652.html#section-5.3 to |out|. It returns +// one on success or zero on error. +static int write_signer_info(CBB *out, void *arg) { + auto *si_data = static_cast(arg); + + uint64_t version = si_data->use_key_id ? 3u : 1u; + CBB seq, child, signing_algo, null, signature; + if (!CBB_add_asn1(out, &seq, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_uint64(&seq, version)) { + return 0; + } + + // Output the SignerIdentifier. + if (si_data->use_key_id) { + const ASN1_OCTET_STRING *skid = + X509_get0_subject_key_id(si_data->sign_cert); + if (skid == nullptr) { + OPENSSL_PUT_ERROR(CMS, CMS_R_CERTIFICATE_HAS_NO_KEYID); + return 0; + } + // subjectKeyIdentifier is implicitly-tagged. 
+ if (!CBB_add_asn1_element(&seq, CBS_ASN1_CONTEXT_SPECIFIC | 0, + ASN1_STRING_get0_data(skid), + ASN1_STRING_length(skid))) { + return 0; + } + } else { + if (!CBB_add_asn1(&seq, &child, CBS_ASN1_SEQUENCE) || + !x509_marshal_name(&child, X509_get_issuer_name(si_data->sign_cert)) || + !asn1_marshal_integer(&child, + X509_get0_serialNumber(si_data->sign_cert), + /*tag=*/0)) { + return 0; + } + } + + // Output the digest and signature algorithm. This cannot use X.509 signature + // algorithms because CMS incorrectly decomposes signature algorithms into a + // combination of digesting and "encrypting" the digest, then uses the plain + // rsaEncryption OID instead of the hash-specific RSA OIDs. For now, we only + // support RSA. + EVP_PKEY *pkey = EVP_PKEY_CTX_get0_pkey(si_data->sign_ctx->pctx); + if (EVP_PKEY_id(pkey) != EVP_PKEY_RSA) { + OPENSSL_PUT_ERROR(PKCS7, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); + return 0; + } + if (!EVP_marshal_digest_algorithm_no_params( + &seq, EVP_MD_CTX_get0_md(si_data->sign_ctx.get())) || + !CBB_add_asn1(&seq, &signing_algo, CBS_ASN1_SEQUENCE) || + !OBJ_nid2cbb(&signing_algo, NID_rsaEncryption) || + !CBB_add_asn1(&signing_algo, &null, CBS_ASN1_NULL)) { + return 0; + } + + // Output the signature. + uint8_t *ptr; + size_t sig_len; + if (!EVP_DigestSignFinal(si_data->sign_ctx.get(), nullptr, &sig_len) || + !CBB_add_asn1(&seq, &signature, CBS_ASN1_OCTETSTRING) || + !CBB_reserve(&signature, &ptr, sig_len) || + !EVP_DigestSignFinal(si_data->sign_ctx.get(), ptr, &sig_len) || + !CBB_did_write(&signature, sig_len) || // + !CBB_flush(out)) { + return 0; + } + + return 1; +} + +int bssl::pkcs7_add_external_signature(CBB *out, X509 *sign_cert, EVP_PKEY *key, + const EVP_MD *md, BIO *data, + bool use_key_id) { + signer_info_data si_data; + si_data.use_key_id = use_key_id; + si_data.sign_cert = sign_cert; + + // Set up the signature. 
+ if (!EVP_DigestSignInit(si_data.sign_ctx.get(), nullptr, md, nullptr, key) || + !digest_sign_update(si_data.sign_ctx.get(), data)) { + return 0; + } + + // See RFC 5652, Section 5.1. When no certificates are present, the version + // comes from the highest SignerInfo version, which will be 3 (CMS) for a key + // ID, and 1 (CMS or PKCS#7) for issuer and serial. + uint64_t signed_data_version = use_key_id ? 3u : 1u; + return pkcs7_add_signed_data( + out, signed_data_version, write_signer_digest_algos, + /*cert_crl_cb=*/nullptr, write_signer_info, &si_data); +} + +PKCS7 *PKCS7_sign(X509 *sign_cert, EVP_PKEY *pkey, STACK_OF(X509) *certs, + BIO *data, int flags) { + ScopedCBB cbb; + if (!CBB_init(cbb.get(), 2048)) { + return nullptr; + } + + if (sign_cert == nullptr && pkey == nullptr && flags == PKCS7_DETACHED) { + // Caller just wants to bundle certificates. + if (!PKCS7_bundle_certificates(cbb.get(), certs)) { + return nullptr; + } + } else if (sign_cert != nullptr && pkey != nullptr && certs == nullptr && + data != nullptr && + flags == (PKCS7_NOATTR | PKCS7_BINARY | PKCS7_NOCERTS | + PKCS7_DETACHED)) { + // In OpenSSL, this API signs with some default hash. That default has been + // SHA-256 since 2015. + if (!pkcs7_add_external_signature(cbb.get(), sign_cert, pkey, EVP_sha256(), + data, /*use_key_id=*/false)) { + return nullptr; + } + } else { + OPENSSL_PUT_ERROR(PKCS7, ERR_R_SHOULD_NOT_HAVE_BEEN_CALLED); + return nullptr; + } + + CBS cbs; + CBS_init(&cbs, CBB_data(cbb.get()), CBB_len(cbb.get())); + return pkcs7_new(&cbs); +} diff --git a/third_party/boringssl/src/crypto/pkcs8/internal.h b/third_party/boringssl/src/crypto/pkcs8/internal.h index ab84c82b..89cdf9cd 100644 --- a/third_party/boringssl/src/crypto/pkcs8/internal.h +++ b/third_party/boringssl/src/crypto/pkcs8/internal.h @@ -1,66 +1,22 @@ -/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL - * project 1999. 
- */ -/* ==================================================================== - * Copyright (c) 1999 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * licensing@OpenSSL.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). */ - -#ifndef OPENSSL_HEADER_PKCS8_INTERNAL_H -#define OPENSSL_HEADER_PKCS8_INTERNAL_H +// Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_CRYPTO_PKCS8_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_PKCS8_INTERNAL_H #include - -#if defined(__cplusplus) -extern "C" { -#endif +#include struct pkcs8_priv_key_info_st { @@ -70,6 +26,8 @@ struct pkcs8_priv_key_info_st { STACK_OF(X509_ATTRIBUTE) *attributes; }; +BSSL_NAMESPACE_BEGIN + // pkcs8_pbe_decrypt decrypts |in| using the PBE scheme described by // |algorithm|, which should be a serialized AlgorithmIdentifier structure. 
On // success, it sets |*out| to a newly-allocated buffer containing the decrypted @@ -87,22 +45,24 @@ int pkcs8_pbe_decrypt(uint8_t **out, size_t *out_len, CBS *algorithm, // key material to |out| and returns one. Otherwise, it returns zero. |id| // should be one of the |PKCS12_*_ID| values. int pkcs12_key_gen(const char *pass, size_t pass_len, const uint8_t *salt, - size_t salt_len, uint8_t id, unsigned iterations, + size_t salt_len, uint8_t id, uint32_t iterations, size_t out_len, uint8_t *out, const EVP_MD *md); // pkcs12_pbe_encrypt_init configures |ctx| for encrypting with a PBES1 scheme -// defined in PKCS#12. It writes the corresponding AlgorithmIdentifier to |out|. -int pkcs12_pbe_encrypt_init(CBB *out, EVP_CIPHER_CTX *ctx, int alg, - unsigned iterations, const char *pass, - size_t pass_len, const uint8_t *salt, - size_t salt_len); +// defined in PKCS#12, or a PBES2 scheme defined in PKCS#5. The algorithm is +// determined as in |PKCS8_encrypt|. It writes the corresponding +// AlgorithmIdentifier to |out|. +int pkcs12_pbe_encrypt_init(CBB *out, EVP_CIPHER_CTX *ctx, int alg_nid, + const EVP_CIPHER *alg_cipher, uint32_t iterations, + const char *pass, size_t pass_len, + const uint8_t *salt, size_t salt_len); struct pbe_suite { int pbe_nid; uint8_t oid[10]; uint8_t oid_len; - const EVP_CIPHER *(*cipher_func)(void); - const EVP_MD *(*md_func)(void); + const EVP_CIPHER *(*cipher_func)(); + const EVP_MD *(*md_func)(); // decrypt_init initialize |ctx| for decrypting. The password is specified by // |pass| and |pass_len|. |param| contains the serialized parameters field of // the AlgorithmIdentifier. @@ -112,7 +72,11 @@ struct pbe_suite { const char *pass, size_t pass_len, CBS *param); }; -#define PKCS5_SALT_LEN 8 +#define PKCS5_SALT_LEN 16 + +// pkcs5_pbe2_nid_to_cipher returns the |EVP_CIPHER| for |nid| if |nid| is +// supported with PKCS#5 PBES2, and nullptr otherwise. 
+const EVP_CIPHER *pkcs5_pbe2_nid_to_cipher(int nid); int PKCS5_pbe2_decrypt_init(const struct pbe_suite *suite, EVP_CIPHER_CTX *ctx, const char *pass, size_t pass_len, CBS *param); @@ -121,7 +85,7 @@ int PKCS5_pbe2_decrypt_init(const struct pbe_suite *suite, EVP_CIPHER_CTX *ctx, // as defined in RFC 2998, with the specified parameters. It writes the // corresponding AlgorithmIdentifier to |out|. int PKCS5_pbe2_encrypt_init(CBB *out, EVP_CIPHER_CTX *ctx, - const EVP_CIPHER *cipher, unsigned iterations, + const EVP_CIPHER *cipher, uint32_t iterations, const char *pass, size_t pass_len, const uint8_t *salt, size_t salt_len); @@ -129,9 +93,6 @@ int PKCS5_pbe2_encrypt_init(CBB *out, EVP_CIPHER_CTX *ctx, // number of PBKDF2 iterations and zero otherwise. int pkcs12_iterations_acceptable(uint64_t iterations); +BSSL_NAMESPACE_END -#if defined(__cplusplus) -} // extern C -#endif - -#endif // OPENSSL_HEADER_PKCS8_INTERNAL_H +#endif // OPENSSL_HEADER_CRYPTO_PKCS8_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/pkcs8/p5_pbev2.c b/third_party/boringssl/src/crypto/pkcs8/p5_pbev2.c deleted file mode 100644 index e58cf444..00000000 --- a/third_party/boringssl/src/crypto/pkcs8/p5_pbev2.c +++ /dev/null @@ -1,316 +0,0 @@ -/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL - * project 1999-2004. - */ -/* ==================================================================== - * Copyright (c) 1999 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. 
- * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * licensing@OpenSSL.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). 
*/ - -#include - -#include -#include - -#include -#include -#include -#include -#include -#include - -#include "internal.h" -#include "../internal.h" - - -// 1.2.840.113549.1.5.12 -static const uint8_t kPBKDF2[] = {0x2a, 0x86, 0x48, 0x86, 0xf7, - 0x0d, 0x01, 0x05, 0x0c}; - -// 1.2.840.113549.1.5.13 -static const uint8_t kPBES2[] = {0x2a, 0x86, 0x48, 0x86, 0xf7, - 0x0d, 0x01, 0x05, 0x0d}; - -// 1.2.840.113549.2.7 -static const uint8_t kHMACWithSHA1[] = {0x2a, 0x86, 0x48, 0x86, - 0xf7, 0x0d, 0x02, 0x07}; - -// 1.2.840.113549.2.9 -static const uint8_t kHMACWithSHA256[] = {0x2a, 0x86, 0x48, 0x86, - 0xf7, 0x0d, 0x02, 0x09}; - -static const struct { - uint8_t oid[9]; - uint8_t oid_len; - int nid; - const EVP_CIPHER *(*cipher_func)(void); -} kCipherOIDs[] = { - // 1.2.840.113549.3.2 - {{0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x03, 0x02}, - 8, - NID_rc2_cbc, - &EVP_rc2_cbc}, - // 1.2.840.113549.3.7 - {{0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x03, 0x07}, - 8, - NID_des_ede3_cbc, - &EVP_des_ede3_cbc}, - // 2.16.840.1.101.3.4.1.2 - {{0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x01, 0x02}, - 9, - NID_aes_128_cbc, - &EVP_aes_128_cbc}, - // 2.16.840.1.101.3.4.1.22 - {{0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x01, 0x16}, - 9, - NID_aes_192_cbc, - &EVP_aes_192_cbc}, - // 2.16.840.1.101.3.4.1.42 - {{0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x01, 0x2a}, - 9, - NID_aes_256_cbc, - &EVP_aes_256_cbc}, -}; - -static const EVP_CIPHER *cbs_to_cipher(const CBS *cbs) { - for (size_t i = 0; i < OPENSSL_ARRAY_SIZE(kCipherOIDs); i++) { - if (CBS_mem_equal(cbs, kCipherOIDs[i].oid, kCipherOIDs[i].oid_len)) { - return kCipherOIDs[i].cipher_func(); - } - } - - return NULL; -} - -static int add_cipher_oid(CBB *out, int nid) { - for (size_t i = 0; i < OPENSSL_ARRAY_SIZE(kCipherOIDs); i++) { - if (kCipherOIDs[i].nid == nid) { - CBB child; - return CBB_add_asn1(out, &child, CBS_ASN1_OBJECT) && - CBB_add_bytes(&child, kCipherOIDs[i].oid, - kCipherOIDs[i].oid_len) && - CBB_flush(out); - } - } - - 
OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_UNSUPPORTED_CIPHER); - return 0; -} - -static int pkcs5_pbe2_cipher_init(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *cipher, - const EVP_MD *pbkdf2_md, unsigned iterations, - const char *pass, size_t pass_len, - const uint8_t *salt, size_t salt_len, - const uint8_t *iv, size_t iv_len, int enc) { - if (iv_len != EVP_CIPHER_iv_length(cipher)) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_ERROR_SETTING_CIPHER_PARAMS); - return 0; - } - - uint8_t key[EVP_MAX_KEY_LENGTH]; - int ret = PKCS5_PBKDF2_HMAC(pass, pass_len, salt, salt_len, iterations, - pbkdf2_md, EVP_CIPHER_key_length(cipher), key) && - EVP_CipherInit_ex(ctx, cipher, NULL /* engine */, key, iv, enc); - OPENSSL_cleanse(key, EVP_MAX_KEY_LENGTH); - return ret; -} - -int PKCS5_pbe2_encrypt_init(CBB *out, EVP_CIPHER_CTX *ctx, - const EVP_CIPHER *cipher, unsigned iterations, - const char *pass, size_t pass_len, - const uint8_t *salt, size_t salt_len) { - int cipher_nid = EVP_CIPHER_nid(cipher); - if (cipher_nid == NID_undef) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_CIPHER_HAS_NO_OBJECT_IDENTIFIER); - return 0; - } - - // Generate a random IV. - uint8_t iv[EVP_MAX_IV_LENGTH]; - if (!RAND_bytes(iv, EVP_CIPHER_iv_length(cipher))) { - return 0; - } - - // See RFC 2898, appendix A. - CBB algorithm, oid, param, kdf, kdf_oid, kdf_param, salt_cbb, cipher_cbb, - iv_cbb; - if (!CBB_add_asn1(out, &algorithm, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1(&algorithm, &oid, CBS_ASN1_OBJECT) || - !CBB_add_bytes(&oid, kPBES2, sizeof(kPBES2)) || - !CBB_add_asn1(&algorithm, ¶m, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1(¶m, &kdf, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1(&kdf, &kdf_oid, CBS_ASN1_OBJECT) || - !CBB_add_bytes(&kdf_oid, kPBKDF2, sizeof(kPBKDF2)) || - !CBB_add_asn1(&kdf, &kdf_param, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1(&kdf_param, &salt_cbb, CBS_ASN1_OCTETSTRING) || - !CBB_add_bytes(&salt_cbb, salt, salt_len) || - !CBB_add_asn1_uint64(&kdf_param, iterations) || - // Specify a key length for RC2. 
- (cipher_nid == NID_rc2_cbc && - !CBB_add_asn1_uint64(&kdf_param, EVP_CIPHER_key_length(cipher))) || - // Omit the PRF. We use the default hmacWithSHA1. - !CBB_add_asn1(¶m, &cipher_cbb, CBS_ASN1_SEQUENCE) || - !add_cipher_oid(&cipher_cbb, cipher_nid) || - // RFC 2898 says RC2-CBC and RC5-CBC-Pad use a SEQUENCE with version and - // IV, but OpenSSL always uses an OCTET STRING IV, so we do the same. - !CBB_add_asn1(&cipher_cbb, &iv_cbb, CBS_ASN1_OCTETSTRING) || - !CBB_add_bytes(&iv_cbb, iv, EVP_CIPHER_iv_length(cipher)) || - !CBB_flush(out)) { - return 0; - } - - return pkcs5_pbe2_cipher_init(ctx, cipher, EVP_sha1(), iterations, pass, - pass_len, salt, salt_len, iv, - EVP_CIPHER_iv_length(cipher), 1 /* encrypt */); -} - -int PKCS5_pbe2_decrypt_init(const struct pbe_suite *suite, EVP_CIPHER_CTX *ctx, - const char *pass, size_t pass_len, CBS *param) { - CBS pbe_param, kdf, kdf_obj, enc_scheme, enc_obj; - if (!CBS_get_asn1(param, &pbe_param, CBS_ASN1_SEQUENCE) || - CBS_len(param) != 0 || - !CBS_get_asn1(&pbe_param, &kdf, CBS_ASN1_SEQUENCE) || - !CBS_get_asn1(&pbe_param, &enc_scheme, CBS_ASN1_SEQUENCE) || - CBS_len(&pbe_param) != 0 || - !CBS_get_asn1(&kdf, &kdf_obj, CBS_ASN1_OBJECT) || - !CBS_get_asn1(&enc_scheme, &enc_obj, CBS_ASN1_OBJECT)) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_DECODE_ERROR); - return 0; - } - - // Only PBKDF2 is supported. - if (!CBS_mem_equal(&kdf_obj, kPBKDF2, sizeof(kPBKDF2))) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_UNSUPPORTED_KEY_DERIVATION_FUNCTION); - return 0; - } - - // See if we recognise the encryption algorithm. - const EVP_CIPHER *cipher = cbs_to_cipher(&enc_obj); - if (cipher == NULL) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_UNSUPPORTED_CIPHER); - return 0; - } - - // Parse the KDF parameters. See RFC 8018, appendix A.2. 
- CBS pbkdf2_params, salt; - uint64_t iterations; - if (!CBS_get_asn1(&kdf, &pbkdf2_params, CBS_ASN1_SEQUENCE) || - CBS_len(&kdf) != 0 || - !CBS_get_asn1(&pbkdf2_params, &salt, CBS_ASN1_OCTETSTRING) || - !CBS_get_asn1_uint64(&pbkdf2_params, &iterations)) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_DECODE_ERROR); - return 0; - } - - if (!pkcs12_iterations_acceptable(iterations)) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_ITERATION_COUNT); - return 0; - } - - // The optional keyLength parameter, if present, must match the key length of - // the cipher. - if (CBS_peek_asn1_tag(&pbkdf2_params, CBS_ASN1_INTEGER)) { - uint64_t key_len; - if (!CBS_get_asn1_uint64(&pbkdf2_params, &key_len)) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_DECODE_ERROR); - return 0; - } - - if (key_len != EVP_CIPHER_key_length(cipher)) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_UNSUPPORTED_KEYLENGTH); - return 0; - } - } - - const EVP_MD *md = EVP_sha1(); - if (CBS_len(&pbkdf2_params) != 0) { - CBS alg_id, prf; - if (!CBS_get_asn1(&pbkdf2_params, &alg_id, CBS_ASN1_SEQUENCE) || - !CBS_get_asn1(&alg_id, &prf, CBS_ASN1_OBJECT) || - CBS_len(&pbkdf2_params) != 0) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_DECODE_ERROR); - return 0; - } - - if (CBS_mem_equal(&prf, kHMACWithSHA1, sizeof(kHMACWithSHA1))) { - // hmacWithSHA1 is the DEFAULT, so DER requires it be omitted, but we - // match OpenSSL in tolerating it being present. - md = EVP_sha1(); - } else if (CBS_mem_equal(&prf, kHMACWithSHA256, sizeof(kHMACWithSHA256))) { - md = EVP_sha256(); - } else { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_UNSUPPORTED_PRF); - return 0; - } - - // All supported PRFs use a NULL parameter. - CBS null; - if (!CBS_get_asn1(&alg_id, &null, CBS_ASN1_NULL) || - CBS_len(&null) != 0 || - CBS_len(&alg_id) != 0) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_DECODE_ERROR); - return 0; - } - } - - // Parse the encryption scheme parameters. Note OpenSSL does not match the - // specification. Per RFC 2898, this should depend on the encryption scheme. 
- // In particular, RC2-CBC uses a SEQUENCE with version and IV. We align with - // OpenSSL. - CBS iv; - if (!CBS_get_asn1(&enc_scheme, &iv, CBS_ASN1_OCTETSTRING) || - CBS_len(&enc_scheme) != 0) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_UNSUPPORTED_PRF); - return 0; - } - - return pkcs5_pbe2_cipher_init(ctx, cipher, md, (unsigned)iterations, pass, - pass_len, CBS_data(&salt), CBS_len(&salt), - CBS_data(&iv), CBS_len(&iv), 0 /* decrypt */); -} diff --git a/third_party/boringssl/src/crypto/pkcs8/p5_pbev2.cc b/third_party/boringssl/src/crypto/pkcs8/p5_pbev2.cc new file mode 100644 index 00000000..e1307bd9 --- /dev/null +++ b/third_party/boringssl/src/crypto/pkcs8/p5_pbev2.cc @@ -0,0 +1,286 @@ +// Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "../internal.h" +#include "internal.h" + + +using namespace bssl; + +// 1.2.840.113549.1.5.12 +static const uint8_t kPBKDF2[] = {0x2a, 0x86, 0x48, 0x86, 0xf7, + 0x0d, 0x01, 0x05, 0x0c}; + +// 1.2.840.113549.1.5.13 +static const uint8_t kPBES2[] = {0x2a, 0x86, 0x48, 0x86, 0xf7, + 0x0d, 0x01, 0x05, 0x0d}; + +// 1.2.840.113549.2.7 +static const uint8_t kHMACWithSHA1[] = {0x2a, 0x86, 0x48, 0x86, + 0xf7, 0x0d, 0x02, 0x07}; + +// 1.2.840.113549.2.9 +static const uint8_t kHMACWithSHA256[] = {0x2a, 0x86, 0x48, 0x86, + 0xf7, 0x0d, 0x02, 0x09}; + +static const struct { + uint8_t oid[9]; + uint8_t oid_len; + int nid; + const EVP_CIPHER *(*cipher_func)(); +} kCipherOIDs[] = { + // 1.2.840.113549.3.2 + {{0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x03, 0x02}, + 8, + NID_rc2_cbc, + &EVP_rc2_cbc}, + // 1.2.840.113549.3.7 + {{0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x03, 0x07}, + 8, + NID_des_ede3_cbc, + &EVP_des_ede3_cbc}, + // 2.16.840.1.101.3.4.1.2 + {{0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x01, 0x02}, + 9, + NID_aes_128_cbc, + &EVP_aes_128_cbc}, + // 2.16.840.1.101.3.4.1.22 + {{0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x01, 0x16}, + 9, + NID_aes_192_cbc, + &EVP_aes_192_cbc}, + // 2.16.840.1.101.3.4.1.42 + {{0x60, 0x86, 0x48, 0x01, 0x65, 0x03, 0x04, 0x01, 0x2a}, + 9, + NID_aes_256_cbc, + &EVP_aes_256_cbc}, +}; + +static const EVP_CIPHER *cbs_to_cipher(const CBS *cbs) { + for (const auto &cipher : kCipherOIDs) { + if (CBS_mem_equal(cbs, cipher.oid, cipher.oid_len)) { + return cipher.cipher_func(); + } + } + + return nullptr; +} + +static int add_cipher_oid(CBB *out, int nid) { + for (const auto &cipher : kCipherOIDs) { + if (cipher.nid == nid) { + return CBB_add_asn1_element(out, CBS_ASN1_OBJECT, cipher.oid, + cipher.oid_len); + } + } + + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_UNSUPPORTED_CIPHER); + return 0; +} + +const EVP_CIPHER *bssl::pkcs5_pbe2_nid_to_cipher(int nid) { 
+ for (const auto &cipher : kCipherOIDs) { + if (cipher.nid == nid) { + return cipher.cipher_func(); + } + } + return nullptr; +} + +static int pkcs5_pbe2_cipher_init(EVP_CIPHER_CTX *ctx, const EVP_CIPHER *cipher, + const EVP_MD *pbkdf2_md, uint32_t iterations, + const char *pass, size_t pass_len, + const uint8_t *salt, size_t salt_len, + const uint8_t *iv, size_t iv_len, int enc) { + if (iv_len != EVP_CIPHER_iv_length(cipher)) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_ERROR_SETTING_CIPHER_PARAMS); + return 0; + } + + uint8_t key[EVP_MAX_KEY_LENGTH]; + int ret = PKCS5_PBKDF2_HMAC(pass, pass_len, salt, salt_len, iterations, + pbkdf2_md, EVP_CIPHER_key_length(cipher), key) && + EVP_CipherInit_ex(ctx, cipher, nullptr /* engine */, key, iv, enc); + OPENSSL_cleanse(key, EVP_MAX_KEY_LENGTH); + return ret; +} + +int bssl::PKCS5_pbe2_encrypt_init(CBB *out, EVP_CIPHER_CTX *ctx, + const EVP_CIPHER *cipher, uint32_t iterations, + const char *pass, size_t pass_len, + const uint8_t *salt, size_t salt_len) { + int cipher_nid = EVP_CIPHER_nid(cipher); + if (cipher_nid == NID_undef) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_CIPHER_HAS_NO_OBJECT_IDENTIFIER); + return 0; + } + + // Generate a random IV. + uint8_t iv[EVP_MAX_IV_LENGTH]; + if (!RAND_bytes(iv, EVP_CIPHER_iv_length(cipher))) { + return 0; + } + + // See RFC 8018, appendix A. + CBB algorithm, param, kdf, kdf_param, prf, cipher_cbb; + if (!CBB_add_asn1(out, &algorithm, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_element(&algorithm, CBS_ASN1_OBJECT, kPBES2, + sizeof(kPBES2)) || + !CBB_add_asn1(&algorithm, ¶m, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1(¶m, &kdf, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_element(&kdf, CBS_ASN1_OBJECT, kPBKDF2, sizeof(kPBKDF2)) || + !CBB_add_asn1(&kdf, &kdf_param, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_octet_string(&kdf_param, salt, salt_len) || + !CBB_add_asn1_uint64(&kdf_param, iterations) || + // Specify a key length for RC2. 
+ (cipher_nid == NID_rc2_cbc && + !CBB_add_asn1_uint64(&kdf_param, EVP_CIPHER_key_length(cipher))) || + // Use hmacWithSHA256 for the PRF. + !CBB_add_asn1(&kdf_param, &prf, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_element(&prf, CBS_ASN1_OBJECT, kHMACWithSHA256, + sizeof(kHMACWithSHA256)) || + !CBB_add_asn1_element(&prf, CBS_ASN1_NULL, nullptr, 0) || + !CBB_add_asn1(¶m, &cipher_cbb, CBS_ASN1_SEQUENCE) || + !add_cipher_oid(&cipher_cbb, cipher_nid) || + // RFC 8018 says RC2-CBC and RC5-CBC-Pad use a SEQUENCE with version and + // IV, but OpenSSL always uses an OCTET STRING IV, so we do the same. + !CBB_add_asn1_octet_string(&cipher_cbb, iv, + EVP_CIPHER_iv_length(cipher)) || + !CBB_flush(out)) { + return 0; + } + + return pkcs5_pbe2_cipher_init(ctx, cipher, EVP_sha256(), iterations, pass, + pass_len, salt, salt_len, iv, + EVP_CIPHER_iv_length(cipher), 1 /* encrypt */); +} + +int bssl::PKCS5_pbe2_decrypt_init(const struct pbe_suite *suite, + EVP_CIPHER_CTX *ctx, const char *pass, + size_t pass_len, CBS *param) { + CBS pbe_param, kdf, kdf_obj, enc_scheme, enc_obj; + if (!CBS_get_asn1(param, &pbe_param, CBS_ASN1_SEQUENCE) || + CBS_len(param) != 0 || + !CBS_get_asn1(&pbe_param, &kdf, CBS_ASN1_SEQUENCE) || + !CBS_get_asn1(&pbe_param, &enc_scheme, CBS_ASN1_SEQUENCE) || + CBS_len(&pbe_param) != 0 || + !CBS_get_asn1(&kdf, &kdf_obj, CBS_ASN1_OBJECT) || + !CBS_get_asn1(&enc_scheme, &enc_obj, CBS_ASN1_OBJECT)) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_DECODE_ERROR); + return 0; + } + + // Only PBKDF2 is supported. + if (!CBS_mem_equal(&kdf_obj, kPBKDF2, sizeof(kPBKDF2))) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_UNSUPPORTED_KEY_DERIVATION_FUNCTION); + return 0; + } + + // See if we recognise the encryption algorithm. + const EVP_CIPHER *cipher = cbs_to_cipher(&enc_obj); + if (cipher == nullptr) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_UNSUPPORTED_CIPHER); + return 0; + } + + // Parse the KDF parameters. See RFC 8018, appendix A.2. 
+ CBS pbkdf2_params, salt; + uint64_t iterations; + if (!CBS_get_asn1(&kdf, &pbkdf2_params, CBS_ASN1_SEQUENCE) || + CBS_len(&kdf) != 0 || + !CBS_get_asn1(&pbkdf2_params, &salt, CBS_ASN1_OCTETSTRING) || + !CBS_get_asn1_uint64(&pbkdf2_params, &iterations)) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_DECODE_ERROR); + return 0; + } + + if (!pkcs12_iterations_acceptable(iterations)) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_ITERATION_COUNT); + return 0; + } + + // The optional keyLength parameter, if present, must match the key length of + // the cipher. + if (CBS_peek_asn1_tag(&pbkdf2_params, CBS_ASN1_INTEGER)) { + uint64_t key_len; + if (!CBS_get_asn1_uint64(&pbkdf2_params, &key_len)) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_DECODE_ERROR); + return 0; + } + + if (key_len != EVP_CIPHER_key_length(cipher)) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_UNSUPPORTED_KEYLENGTH); + return 0; + } + } + + const EVP_MD *md = EVP_sha1(); + if (CBS_len(&pbkdf2_params) != 0) { + CBS alg_id, prf; + if (!CBS_get_asn1(&pbkdf2_params, &alg_id, CBS_ASN1_SEQUENCE) || + !CBS_get_asn1(&alg_id, &prf, CBS_ASN1_OBJECT) || + CBS_len(&pbkdf2_params) != 0) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_DECODE_ERROR); + return 0; + } + + if (CBS_mem_equal(&prf, kHMACWithSHA1, sizeof(kHMACWithSHA1))) { + // hmacWithSHA1 is the DEFAULT, so DER requires it be omitted, but we + // match OpenSSL in tolerating it being present. + md = EVP_sha1(); + } else if (CBS_mem_equal(&prf, kHMACWithSHA256, sizeof(kHMACWithSHA256))) { + md = EVP_sha256(); + } else { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_UNSUPPORTED_PRF); + return 0; + } + + // All supported PRFs use a NULL parameter. + CBS null; + if (!CBS_get_asn1(&alg_id, &null, CBS_ASN1_NULL) || + CBS_len(&null) != 0 || + CBS_len(&alg_id) != 0) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_DECODE_ERROR); + return 0; + } + } + + // Parse the encryption scheme parameters. Note OpenSSL does not match the + // specification. Per RFC 8018, this should depend on the encryption scheme. 
+ // In particular, RC2-CBC uses a SEQUENCE with version and IV. We align with + // OpenSSL. + CBS iv; + if (!CBS_get_asn1(&enc_scheme, &iv, CBS_ASN1_OCTETSTRING) || + CBS_len(&enc_scheme) != 0) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_UNSUPPORTED_PRF); + return 0; + } + + return pkcs5_pbe2_cipher_init(ctx, cipher, md, (uint32_t)iterations, pass, + pass_len, CBS_data(&salt), CBS_len(&salt), + CBS_data(&iv), CBS_len(&iv), 0 /* decrypt */); +} diff --git a/third_party/boringssl/src/crypto/pkcs8/pkcs8.c b/third_party/boringssl/src/crypto/pkcs8/pkcs8.c deleted file mode 100644 index 84b7b127..00000000 --- a/third_party/boringssl/src/crypto/pkcs8/pkcs8.c +++ /dev/null @@ -1,530 +0,0 @@ -/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL - * project 1999. - */ -/* ==================================================================== - * Copyright (c) 1999 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * licensing@OpenSSL.org. - * - * 5. 
Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). */ - -#include - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include "internal.h" -#include "../bytestring/internal.h" -#include "../internal.h" - - -static int pkcs12_encode_password(const char *in, size_t in_len, uint8_t **out, - size_t *out_len) { - CBB cbb; - if (!CBB_init(&cbb, in_len * 2)) { - OPENSSL_PUT_ERROR(PKCS8, ERR_R_MALLOC_FAILURE); - return 0; - } - - // Convert the password to BMPString, or UCS-2. See - // https://tools.ietf.org/html/rfc7292#appendix-B.1. 
- CBS cbs; - CBS_init(&cbs, (const uint8_t *)in, in_len); - while (CBS_len(&cbs) != 0) { - uint32_t c; - if (!cbs_get_utf8(&cbs, &c) || - !cbb_add_ucs2_be(&cbb, c)) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_INVALID_CHARACTERS); - goto err; - } - } - - // Terminate the result with a UCS-2 NUL. - if (!cbb_add_ucs2_be(&cbb, 0) || - !CBB_finish(&cbb, out, out_len)) { - goto err; - } - - return 1; - -err: - CBB_cleanup(&cbb); - return 0; -} - -int pkcs12_key_gen(const char *pass, size_t pass_len, const uint8_t *salt, - size_t salt_len, uint8_t id, unsigned iterations, - size_t out_len, uint8_t *out, const EVP_MD *md) { - // See https://tools.ietf.org/html/rfc7292#appendix-B. Quoted parts of the - // specification have errata applied and other typos fixed. - - if (iterations < 1) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_ITERATION_COUNT); - return 0; - } - - int ret = 0; - EVP_MD_CTX ctx; - EVP_MD_CTX_init(&ctx); - uint8_t *pass_raw = NULL, *I = NULL; - size_t pass_raw_len = 0, I_len = 0; - // If |pass| is NULL, we use the empty string rather than {0, 0} as the raw - // password. - if (pass != NULL && - !pkcs12_encode_password(pass, pass_len, &pass_raw, &pass_raw_len)) { - goto err; - } - - // In the spec, |block_size| is called "v", but measured in bits. - size_t block_size = EVP_MD_block_size(md); - - // 1. Construct a string, D (the "diversifier"), by concatenating v/8 copies - // of ID. - uint8_t D[EVP_MAX_MD_BLOCK_SIZE]; - OPENSSL_memset(D, id, block_size); - - // 2. Concatenate copies of the salt together to create a string S of length - // v(ceiling(s/v)) bits (the final copy of the salt may be truncated to - // create S). Note that if the salt is the empty string, then so is S. - // - // 3. Concatenate copies of the password together to create a string P of - // length v(ceiling(p/v)) bits (the final copy of the password may be - // truncated to create P). Note that if the password is the empty string, - // then so is P. - // - // 4. 
Set I=S||P to be the concatenation of S and P. - if (salt_len + block_size - 1 < salt_len || - pass_raw_len + block_size - 1 < pass_raw_len) { - OPENSSL_PUT_ERROR(PKCS8, ERR_R_OVERFLOW); - goto err; - } - size_t S_len = block_size * ((salt_len + block_size - 1) / block_size); - size_t P_len = block_size * ((pass_raw_len + block_size - 1) / block_size); - I_len = S_len + P_len; - if (I_len < S_len) { - OPENSSL_PUT_ERROR(PKCS8, ERR_R_OVERFLOW); - goto err; - } - - I = OPENSSL_malloc(I_len); - if (I_len != 0 && I == NULL) { - OPENSSL_PUT_ERROR(PKCS8, ERR_R_MALLOC_FAILURE); - goto err; - } - - for (size_t i = 0; i < S_len; i++) { - I[i] = salt[i % salt_len]; - } - for (size_t i = 0; i < P_len; i++) { - I[i + S_len] = pass_raw[i % pass_raw_len]; - } - - while (out_len != 0) { - // A. Set A_i=H^r(D||I). (i.e., the r-th hash of D||I, - // H(H(H(... H(D||I)))) - uint8_t A[EVP_MAX_MD_SIZE]; - unsigned A_len; - if (!EVP_DigestInit_ex(&ctx, md, NULL) || - !EVP_DigestUpdate(&ctx, D, block_size) || - !EVP_DigestUpdate(&ctx, I, I_len) || - !EVP_DigestFinal_ex(&ctx, A, &A_len)) { - goto err; - } - for (unsigned iter = 1; iter < iterations; iter++) { - if (!EVP_DigestInit_ex(&ctx, md, NULL) || - !EVP_DigestUpdate(&ctx, A, A_len) || - !EVP_DigestFinal_ex(&ctx, A, &A_len)) { - goto err; - } - } - - size_t todo = out_len < A_len ? out_len : A_len; - OPENSSL_memcpy(out, A, todo); - out += todo; - out_len -= todo; - if (out_len == 0) { - break; - } - - // B. Concatenate copies of A_i to create a string B of length v bits (the - // final copy of A_i may be truncated to create B). - uint8_t B[EVP_MAX_MD_BLOCK_SIZE]; - for (size_t i = 0; i < block_size; i++) { - B[i] = A[i % A_len]; - } - - // C. Treating I as a concatenation I_0, I_1, ..., I_(k-1) of v-bit blocks, - // where k=ceiling(s/v)+ceiling(p/v), modify I by setting I_j=(I_j+B+1) mod - // 2^v for each j. 
- assert(I_len % block_size == 0); - for (size_t i = 0; i < I_len; i += block_size) { - unsigned carry = 1; - for (size_t j = block_size - 1; j < block_size; j--) { - carry += I[i + j] + B[j]; - I[i + j] = (uint8_t)carry; - carry >>= 8; - } - } - } - - ret = 1; - -err: - OPENSSL_free(I); - OPENSSL_free(pass_raw); - EVP_MD_CTX_cleanup(&ctx); - return ret; -} - -static int pkcs12_pbe_cipher_init(const struct pbe_suite *suite, - EVP_CIPHER_CTX *ctx, unsigned iterations, - const char *pass, size_t pass_len, - const uint8_t *salt, size_t salt_len, - int is_encrypt) { - const EVP_CIPHER *cipher = suite->cipher_func(); - const EVP_MD *md = suite->md_func(); - - uint8_t key[EVP_MAX_KEY_LENGTH]; - uint8_t iv[EVP_MAX_IV_LENGTH]; - if (!pkcs12_key_gen(pass, pass_len, salt, salt_len, PKCS12_KEY_ID, iterations, - EVP_CIPHER_key_length(cipher), key, md) || - !pkcs12_key_gen(pass, pass_len, salt, salt_len, PKCS12_IV_ID, iterations, - EVP_CIPHER_iv_length(cipher), iv, md)) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_KEY_GEN_ERROR); - return 0; - } - - int ret = EVP_CipherInit_ex(ctx, cipher, NULL, key, iv, is_encrypt); - OPENSSL_cleanse(key, EVP_MAX_KEY_LENGTH); - OPENSSL_cleanse(iv, EVP_MAX_IV_LENGTH); - return ret; -} - -static int pkcs12_pbe_decrypt_init(const struct pbe_suite *suite, - EVP_CIPHER_CTX *ctx, const char *pass, - size_t pass_len, CBS *param) { - CBS pbe_param, salt; - uint64_t iterations; - if (!CBS_get_asn1(param, &pbe_param, CBS_ASN1_SEQUENCE) || - !CBS_get_asn1(&pbe_param, &salt, CBS_ASN1_OCTETSTRING) || - !CBS_get_asn1_uint64(&pbe_param, &iterations) || - CBS_len(&pbe_param) != 0 || - CBS_len(param) != 0) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_DECODE_ERROR); - return 0; - } - - if (!pkcs12_iterations_acceptable(iterations)) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_ITERATION_COUNT); - return 0; - } - - return pkcs12_pbe_cipher_init(suite, ctx, (unsigned)iterations, pass, - pass_len, CBS_data(&salt), CBS_len(&salt), - 0 /* decrypt */); -} - -static const struct pbe_suite 
kBuiltinPBE[] = { - { - NID_pbe_WithSHA1And40BitRC2_CBC, - // 1.2.840.113549.1.12.1.6 - {0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x01, 0x0c, 0x01, 0x06}, - 10, - EVP_rc2_40_cbc, - EVP_sha1, - pkcs12_pbe_decrypt_init, - }, - { - NID_pbe_WithSHA1And128BitRC4, - // 1.2.840.113549.1.12.1.1 - {0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x01, 0x0c, 0x01, 0x01}, - 10, - EVP_rc4, - EVP_sha1, - pkcs12_pbe_decrypt_init, - }, - { - NID_pbe_WithSHA1And3_Key_TripleDES_CBC, - // 1.2.840.113549.1.12.1.3 - {0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x01, 0x0c, 0x01, 0x03}, - 10, - EVP_des_ede3_cbc, - EVP_sha1, - pkcs12_pbe_decrypt_init, - }, - { - NID_pbes2, - // 1.2.840.113549.1.5.13 - {0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x01, 0x05, 0x0d}, - 9, - NULL, - NULL, - PKCS5_pbe2_decrypt_init, - }, -}; - -static const struct pbe_suite *get_pkcs12_pbe_suite(int pbe_nid) { - for (unsigned i = 0; i < OPENSSL_ARRAY_SIZE(kBuiltinPBE); i++) { - if (kBuiltinPBE[i].pbe_nid == pbe_nid && - // If |cipher_func| or |md_func| are missing, this is a PBES2 scheme. - kBuiltinPBE[i].cipher_func != NULL && - kBuiltinPBE[i].md_func != NULL) { - return &kBuiltinPBE[i]; - } - } - - return NULL; -} - -int pkcs12_pbe_encrypt_init(CBB *out, EVP_CIPHER_CTX *ctx, int alg, - unsigned iterations, const char *pass, - size_t pass_len, const uint8_t *salt, - size_t salt_len) { - const struct pbe_suite *suite = get_pkcs12_pbe_suite(alg); - if (suite == NULL) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_UNKNOWN_ALGORITHM); - return 0; - } - - // See RFC 2898, appendix A.3. 
- CBB algorithm, oid, param, salt_cbb; - if (!CBB_add_asn1(out, &algorithm, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1(&algorithm, &oid, CBS_ASN1_OBJECT) || - !CBB_add_bytes(&oid, suite->oid, suite->oid_len) || - !CBB_add_asn1(&algorithm, ¶m, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1(¶m, &salt_cbb, CBS_ASN1_OCTETSTRING) || - !CBB_add_bytes(&salt_cbb, salt, salt_len) || - !CBB_add_asn1_uint64(¶m, iterations) || - !CBB_flush(out)) { - return 0; - } - - return pkcs12_pbe_cipher_init(suite, ctx, iterations, pass, pass_len, salt, - salt_len, 1 /* encrypt */); -} - -int pkcs8_pbe_decrypt(uint8_t **out, size_t *out_len, CBS *algorithm, - const char *pass, size_t pass_len, const uint8_t *in, - size_t in_len) { - int ret = 0; - uint8_t *buf = NULL;; - EVP_CIPHER_CTX ctx; - EVP_CIPHER_CTX_init(&ctx); - - CBS obj; - if (!CBS_get_asn1(algorithm, &obj, CBS_ASN1_OBJECT)) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_DECODE_ERROR); - goto err; - } - - const struct pbe_suite *suite = NULL; - for (unsigned i = 0; i < OPENSSL_ARRAY_SIZE(kBuiltinPBE); i++) { - if (CBS_mem_equal(&obj, kBuiltinPBE[i].oid, kBuiltinPBE[i].oid_len)) { - suite = &kBuiltinPBE[i]; - break; - } - } - if (suite == NULL) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_UNKNOWN_ALGORITHM); - goto err; - } - - if (!suite->decrypt_init(suite, &ctx, pass, pass_len, algorithm)) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_KEYGEN_FAILURE); - goto err; - } - - buf = OPENSSL_malloc(in_len); - if (buf == NULL) { - OPENSSL_PUT_ERROR(PKCS8, ERR_R_MALLOC_FAILURE); - goto err; - } - - if (in_len > INT_MAX) { - OPENSSL_PUT_ERROR(PKCS8, ERR_R_OVERFLOW); - goto err; - } - - int n1, n2; - if (!EVP_DecryptUpdate(&ctx, buf, &n1, in, (int)in_len) || - !EVP_DecryptFinal_ex(&ctx, buf + n1, &n2)) { - goto err; - } - - *out = buf; - *out_len = n1 + n2; - ret = 1; - buf = NULL; - -err: - OPENSSL_free(buf); - EVP_CIPHER_CTX_cleanup(&ctx); - return ret; -} - -EVP_PKEY *PKCS8_parse_encrypted_private_key(CBS *cbs, const char *pass, - size_t pass_len) { - // See RFC 5208, 
section 6. - CBS epki, algorithm, ciphertext; - if (!CBS_get_asn1(cbs, &epki, CBS_ASN1_SEQUENCE) || - !CBS_get_asn1(&epki, &algorithm, CBS_ASN1_SEQUENCE) || - !CBS_get_asn1(&epki, &ciphertext, CBS_ASN1_OCTETSTRING) || - CBS_len(&epki) != 0) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_DECODE_ERROR); - return 0; - } - - uint8_t *out; - size_t out_len; - if (!pkcs8_pbe_decrypt(&out, &out_len, &algorithm, pass, pass_len, - CBS_data(&ciphertext), CBS_len(&ciphertext))) { - return 0; - } - - CBS pki; - CBS_init(&pki, out, out_len); - EVP_PKEY *ret = EVP_parse_private_key(&pki); - OPENSSL_free(out); - return ret; -} - -int PKCS8_marshal_encrypted_private_key(CBB *out, int pbe_nid, - const EVP_CIPHER *cipher, - const char *pass, size_t pass_len, - const uint8_t *salt, size_t salt_len, - int iterations, const EVP_PKEY *pkey) { - int ret = 0; - uint8_t *plaintext = NULL, *salt_buf = NULL; - size_t plaintext_len = 0; - EVP_CIPHER_CTX ctx; - EVP_CIPHER_CTX_init(&ctx); - - // Generate a random salt if necessary. - if (salt == NULL) { - if (salt_len == 0) { - salt_len = PKCS5_SALT_LEN; - } - - salt_buf = OPENSSL_malloc(salt_len); - if (salt_buf == NULL || - !RAND_bytes(salt_buf, salt_len)) { - goto err; - } - - salt = salt_buf; - } - - if (iterations <= 0) { - iterations = PKCS12_DEFAULT_ITER; - } - - // Serialize the input key. - CBB plaintext_cbb; - if (!CBB_init(&plaintext_cbb, 128) || - !EVP_marshal_private_key(&plaintext_cbb, pkey) || - !CBB_finish(&plaintext_cbb, &plaintext, &plaintext_len)) { - CBB_cleanup(&plaintext_cbb); - goto err; - } - - CBB epki; - if (!CBB_add_asn1(out, &epki, CBS_ASN1_SEQUENCE)) { - goto err; - } - - // TODO(davidben): OpenSSL has since extended |pbe_nid| to control either the - // PBES1 scheme or the PBES2 PRF. E.g. passing |NID_hmacWithSHA256| will - // select PBES2 with HMAC-SHA256 as the PRF. Implement this if anything uses - // it. See 5693a30813a031d3921a016a870420e7eb93ec90 in OpenSSL. 
- int alg_ok; - if (pbe_nid == -1) { - alg_ok = PKCS5_pbe2_encrypt_init(&epki, &ctx, cipher, (unsigned)iterations, - pass, pass_len, salt, salt_len); - } else { - alg_ok = pkcs12_pbe_encrypt_init(&epki, &ctx, pbe_nid, (unsigned)iterations, - pass, pass_len, salt, salt_len); - } - if (!alg_ok) { - goto err; - } - - size_t max_out = plaintext_len + EVP_CIPHER_CTX_block_size(&ctx); - if (max_out < plaintext_len) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_TOO_LONG); - goto err; - } - - CBB ciphertext; - uint8_t *ptr; - int n1, n2; - if (!CBB_add_asn1(&epki, &ciphertext, CBS_ASN1_OCTETSTRING) || - !CBB_reserve(&ciphertext, &ptr, max_out) || - !EVP_CipherUpdate(&ctx, ptr, &n1, plaintext, plaintext_len) || - !EVP_CipherFinal_ex(&ctx, ptr + n1, &n2) || - !CBB_did_write(&ciphertext, n1 + n2) || - !CBB_flush(out)) { - goto err; - } - - ret = 1; - -err: - OPENSSL_free(plaintext); - OPENSSL_free(salt_buf); - EVP_CIPHER_CTX_cleanup(&ctx); - return ret; -} diff --git a/third_party/boringssl/src/crypto/pkcs8/pkcs8.cc b/third_party/boringssl/src/crypto/pkcs8/pkcs8.cc new file mode 100644 index 00000000..e7f388aa --- /dev/null +++ b/third_party/boringssl/src/crypto/pkcs8/pkcs8.cc @@ -0,0 +1,473 @@ +// Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "../bytestring/internal.h" +#include "../internal.h" +#include "internal.h" + + +using namespace bssl; + +static int pkcs12_encode_password(const char *in, size_t in_len, uint8_t **out, + size_t *out_len) { + ScopedCBB cbb; + if (!CBB_init(cbb.get(), in_len * 2)) { + return 0; + } + + // Convert the password to BMPString, or UCS-2. See + // https://tools.ietf.org/html/rfc7292#appendix-B.1. + CBS cbs; + CBS_init(&cbs, (const uint8_t *)in, in_len); + while (CBS_len(&cbs) != 0) { + uint32_t c; + if (!CBS_get_utf8(&cbs, &c) || !CBB_add_ucs2_be(cbb.get(), c)) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_INVALID_CHARACTERS); + return 0; + } + } + + // Terminate the result with a UCS-2 NUL. + if (!CBB_add_ucs2_be(cbb.get(), 0) || !CBB_finish(cbb.get(), out, out_len)) { + return 0; + } + + return 1; +} + +int bssl::pkcs12_key_gen(const char *pass, size_t pass_len, const uint8_t *salt, + size_t salt_len, uint8_t id, uint32_t iterations, + size_t out_len, uint8_t *out, const EVP_MD *md) { + // See https://tools.ietf.org/html/rfc7292#appendix-B. Quoted parts of the + // specification have errata applied and other typos fixed. + + if (iterations < 1) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_ITERATION_COUNT); + return 0; + } + + int ret = 0; + EVP_MD_CTX ctx; + EVP_MD_CTX_init(&ctx); + uint8_t *pass_raw = nullptr, *I = nullptr; + size_t pass_raw_len = 0, I_len = 0; + + { + // If |pass| is NULL, we use the empty string rather than {0, 0} as the raw + // password. + if (pass != nullptr && + !pkcs12_encode_password(pass, pass_len, &pass_raw, &pass_raw_len)) { + goto err; + } + + // In the spec, |block_size| is called "v", but measured in bits. + size_t block_size = EVP_MD_block_size(md); + + // 1. Construct a string, D (the "diversifier"), by concatenating v/8 copies + // of ID. + uint8_t D[EVP_MAX_MD_BLOCK_SIZE]; + OPENSSL_memset(D, id, block_size); + + // 2. 
Concatenate copies of the salt together to create a string S of length + // v(ceiling(s/v)) bits (the final copy of the salt may be truncated to + // create S). Note that if the salt is the empty string, then so is S. + // + // 3. Concatenate copies of the password together to create a string P of + // length v(ceiling(p/v)) bits (the final copy of the password may be + // truncated to create P). Note that if the password is the empty string, + // then so is P. + // + // 4. Set I=S||P to be the concatenation of S and P. + if (salt_len + block_size - 1 < salt_len || + pass_raw_len + block_size - 1 < pass_raw_len) { + OPENSSL_PUT_ERROR(PKCS8, ERR_R_OVERFLOW); + goto err; + } + size_t S_len = block_size * ((salt_len + block_size - 1) / block_size); + size_t P_len = block_size * ((pass_raw_len + block_size - 1) / block_size); + I_len = S_len + P_len; + if (I_len < S_len) { + OPENSSL_PUT_ERROR(PKCS8, ERR_R_OVERFLOW); + goto err; + } + + I = reinterpret_cast(OPENSSL_malloc(I_len)); + if (I_len != 0 && I == nullptr) { + goto err; + } + + for (size_t i = 0; i < S_len; i++) { + I[i] = salt[i % salt_len]; + } + for (size_t i = 0; i < P_len; i++) { + I[i + S_len] = pass_raw[i % pass_raw_len]; + } + + while (out_len != 0) { + // A. Set A_i=H^r(D||I). (i.e., the r-th hash of D||I, + // H(H(H(... H(D||I)))) + uint8_t A[EVP_MAX_MD_SIZE]; + unsigned A_len; + if (!EVP_DigestInit_ex(&ctx, md, nullptr) || + !EVP_DigestUpdate(&ctx, D, block_size) || + !EVP_DigestUpdate(&ctx, I, I_len) || + !EVP_DigestFinal_ex(&ctx, A, &A_len)) { + goto err; + } + for (uint32_t iter = 1; iter < iterations; iter++) { + if (!EVP_DigestInit_ex(&ctx, md, nullptr) || + !EVP_DigestUpdate(&ctx, A, A_len) || + !EVP_DigestFinal_ex(&ctx, A, &A_len)) { + goto err; + } + } + + size_t todo = out_len < A_len ? out_len : A_len; + OPENSSL_memcpy(out, A, todo); + out += todo; + out_len -= todo; + if (out_len == 0) { + break; + } + + // B. 
Concatenate copies of A_i to create a string B of length v bits (the + // final copy of A_i may be truncated to create B). + uint8_t B[EVP_MAX_MD_BLOCK_SIZE]; + for (size_t i = 0; i < block_size; i++) { + B[i] = A[i % A_len]; + } + + // C. Treating I as a concatenation I_0, I_1, ..., I_(k-1) of v-bit + // blocks, where k=ceiling(s/v)+ceiling(p/v), modify I by setting + // I_j=(I_j+B+1) mod 2^v for each j. + assert(I_len % block_size == 0); + for (size_t i = 0; i < I_len; i += block_size) { + unsigned carry = 1; + for (size_t j = block_size - 1; j < block_size; j--) { + carry += I[i + j] + B[j]; + I[i + j] = (uint8_t)carry; + carry >>= 8; + } + } + } + + ret = 1; + } + +err: + OPENSSL_free(I); + OPENSSL_free(pass_raw); + EVP_MD_CTX_cleanup(&ctx); + return ret; +} + +static int pkcs12_pbe_cipher_init(const struct pbe_suite *suite, + EVP_CIPHER_CTX *ctx, uint32_t iterations, + const char *pass, size_t pass_len, + const uint8_t *salt, size_t salt_len, + int is_encrypt) { + const EVP_CIPHER *cipher = suite->cipher_func(); + const EVP_MD *md = suite->md_func(); + + uint8_t key[EVP_MAX_KEY_LENGTH]; + uint8_t iv[EVP_MAX_IV_LENGTH]; + if (!pkcs12_key_gen(pass, pass_len, salt, salt_len, PKCS12_KEY_ID, iterations, + EVP_CIPHER_key_length(cipher), key, md) || + !pkcs12_key_gen(pass, pass_len, salt, salt_len, PKCS12_IV_ID, iterations, + EVP_CIPHER_iv_length(cipher), iv, md)) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_KEY_GEN_ERROR); + return 0; + } + + int ret = EVP_CipherInit_ex(ctx, cipher, nullptr, key, iv, is_encrypt); + OPENSSL_cleanse(key, EVP_MAX_KEY_LENGTH); + OPENSSL_cleanse(iv, EVP_MAX_IV_LENGTH); + return ret; +} + +static int pkcs12_pbe_decrypt_init(const struct pbe_suite *suite, + EVP_CIPHER_CTX *ctx, const char *pass, + size_t pass_len, CBS *param) { + CBS pbe_param, salt; + uint64_t iterations; + if (!CBS_get_asn1(param, &pbe_param, CBS_ASN1_SEQUENCE) || + !CBS_get_asn1(&pbe_param, &salt, CBS_ASN1_OCTETSTRING) || + !CBS_get_asn1_uint64(&pbe_param, &iterations) || + 
CBS_len(&pbe_param) != 0 || CBS_len(param) != 0) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_DECODE_ERROR); + return 0; + } + + if (!pkcs12_iterations_acceptable(iterations)) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_ITERATION_COUNT); + return 0; + } + + return pkcs12_pbe_cipher_init(suite, ctx, (uint32_t)iterations, pass, + pass_len, CBS_data(&salt), CBS_len(&salt), + 0 /* decrypt */); +} + +static const struct bssl::pbe_suite kBuiltinPBE[] = { + { + NID_pbe_WithSHA1And40BitRC2_CBC, + // 1.2.840.113549.1.12.1.6 + {0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x01, 0x0c, 0x01, 0x06}, + 10, + EVP_rc2_40_cbc, + EVP_sha1, + pkcs12_pbe_decrypt_init, + }, + { + NID_pbe_WithSHA1And128BitRC4, + // 1.2.840.113549.1.12.1.1 + {0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x01, 0x0c, 0x01, 0x01}, + 10, + EVP_rc4, + EVP_sha1, + pkcs12_pbe_decrypt_init, + }, + { + NID_pbe_WithSHA1And3_Key_TripleDES_CBC, + // 1.2.840.113549.1.12.1.3 + {0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x01, 0x0c, 0x01, 0x03}, + 10, + EVP_des_ede3_cbc, + EVP_sha1, + pkcs12_pbe_decrypt_init, + }, + { + NID_pbes2, + // 1.2.840.113549.1.5.13 + {0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x01, 0x05, 0x0d}, + 9, + nullptr, + nullptr, + PKCS5_pbe2_decrypt_init, + }, +}; + +static const struct bssl::pbe_suite *get_pkcs12_pbe_suite(int pbe_nid) { + for (const auto &pbe : kBuiltinPBE) { + if (pbe.pbe_nid == pbe_nid && + // If |cipher_func| or |md_func| are missing, this is a PBES2 scheme. + pbe.cipher_func != nullptr && pbe.md_func != nullptr) { + return &pbe; + } + } + + return nullptr; +} + +int bssl::pkcs12_pbe_encrypt_init(CBB *out, EVP_CIPHER_CTX *ctx, int alg_nid, + const EVP_CIPHER *alg_cipher, + uint32_t iterations, const char *pass, + size_t pass_len, const uint8_t *salt, + size_t salt_len) { + // TODO(davidben): OpenSSL has since extended |pbe_nid| to control either + // the PBES1 scheme or the PBES2 PRF. E.g. passing |NID_hmacWithSHA256| will + // select PBES2 with HMAC-SHA256 as the PRF. Implement this if anything uses + // it. 
See 5693a30813a031d3921a016a870420e7eb93ec90 in OpenSSL. + if (alg_nid == -1) { + return PKCS5_pbe2_encrypt_init(out, ctx, alg_cipher, iterations, pass, + pass_len, salt, salt_len); + } + + const struct pbe_suite *suite = get_pkcs12_pbe_suite(alg_nid); + if (suite == nullptr) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_UNKNOWN_ALGORITHM); + return 0; + } + + // See RFC 7292, appendix C. All our supported "PBES1" schemes are the PKCS#12 + // schemes, which use a different KDF. The true PBES1 schemes in RFC 8018 use + // PBKDF1, which use a very similar PBEParameter structure, but require the + // salt be exactly 8 bytes. + CBB algorithm, param; + if (!CBB_add_asn1(out, &algorithm, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_element(&algorithm, CBS_ASN1_OBJECT, suite->oid, + suite->oid_len) || + !CBB_add_asn1(&algorithm, ¶m, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_octet_string(¶m, salt, salt_len) || + !CBB_add_asn1_uint64(¶m, iterations) || !CBB_flush(out)) { + return 0; + } + + return pkcs12_pbe_cipher_init(suite, ctx, iterations, pass, pass_len, salt, + salt_len, 1 /* encrypt */); +} + +int bssl::pkcs8_pbe_decrypt(uint8_t **out, size_t *out_len, CBS *algorithm, + const char *pass, size_t pass_len, + const uint8_t *in, size_t in_len) { + int ret = 0; + uint8_t *buf = nullptr; + ScopedEVP_CIPHER_CTX ctx; + + CBS obj; + const struct pbe_suite *suite = nullptr; + if (!CBS_get_asn1(algorithm, &obj, CBS_ASN1_OBJECT)) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_DECODE_ERROR); + goto err; + } + + for (const auto &pbe : kBuiltinPBE) { + if (CBS_mem_equal(&obj, pbe.oid, pbe.oid_len)) { + suite = &pbe; + break; + } + } + if (suite == nullptr) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_UNKNOWN_ALGORITHM); + goto err; + } + + if (!suite->decrypt_init(suite, ctx.get(), pass, pass_len, algorithm)) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_KEYGEN_FAILURE); + goto err; + } + + buf = reinterpret_cast(OPENSSL_malloc(in_len)); + if (buf == nullptr) { + goto err; + } + + size_t n1, n2; + if 
(!EVP_DecryptUpdate_ex(ctx.get(), buf, &n1, in_len, in, in_len) || + !EVP_DecryptFinal_ex2(ctx.get(), buf + n1, &n2, in_len - n1)) { + goto err; + } + + *out = buf; + *out_len = n1 + n2; + ret = 1; + buf = nullptr; + +err: + OPENSSL_free(buf); + return ret; +} + +EVP_PKEY *PKCS8_parse_encrypted_private_key(CBS *cbs, const char *pass, + size_t pass_len) { + // See RFC 5208, section 6. + CBS epki, algorithm, ciphertext; + if (!CBS_get_asn1(cbs, &epki, CBS_ASN1_SEQUENCE) || + !CBS_get_asn1(&epki, &algorithm, CBS_ASN1_SEQUENCE) || + !CBS_get_asn1(&epki, &ciphertext, CBS_ASN1_OCTETSTRING) || + CBS_len(&epki) != 0) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_DECODE_ERROR); + return nullptr; + } + + uint8_t *out; + size_t out_len; + if (!pkcs8_pbe_decrypt(&out, &out_len, &algorithm, pass, pass_len, + CBS_data(&ciphertext), CBS_len(&ciphertext))) { + return nullptr; + } + + CBS pki; + CBS_init(&pki, out, out_len); + EVP_PKEY *ret = EVP_parse_private_key(&pki); + OPENSSL_free(out); + return ret; +} + +int PKCS8_marshal_encrypted_private_key(CBB *out, int pbe_nid, + const EVP_CIPHER *cipher, + const char *pass, size_t pass_len, + const uint8_t *salt, size_t salt_len, + int iterations, const EVP_PKEY *pkey) { + int ret = 0; + uint8_t *plaintext = nullptr, *salt_buf = nullptr; + size_t plaintext_len = 0; + ScopedEVP_CIPHER_CTX ctx; + + { + // Generate a random salt if necessary. + if (salt == nullptr) { + if (salt_len == 0) { + salt_len = PKCS5_SALT_LEN; + } + + salt_buf = reinterpret_cast(OPENSSL_malloc(salt_len)); + if (salt_buf == nullptr || !RAND_bytes(salt_buf, salt_len)) { + goto err; + } + + salt = salt_buf; + } + + if (iterations <= 0) { + iterations = PKCS12_DEFAULT_ITER; + } + + // Serialize the input key. 
+ CBB plaintext_cbb; + if (!CBB_init(&plaintext_cbb, 128) || + !EVP_marshal_private_key(&plaintext_cbb, pkey) || + !CBB_finish(&plaintext_cbb, &plaintext, &plaintext_len)) { + CBB_cleanup(&plaintext_cbb); + goto err; + } + + CBB epki; + if (!CBB_add_asn1(out, &epki, CBS_ASN1_SEQUENCE) || + !pkcs12_pbe_encrypt_init(&epki, ctx.get(), pbe_nid, cipher, + (uint32_t)iterations, pass, pass_len, salt, + salt_len)) { + goto err; + } + + size_t max_out = plaintext_len + EVP_CIPHER_CTX_block_size(ctx.get()); + if (max_out < plaintext_len) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_TOO_LONG); + goto err; + } + + CBB ciphertext; + uint8_t *ptr; + size_t n1, n2; + if (!CBB_add_asn1(&epki, &ciphertext, CBS_ASN1_OCTETSTRING) || + !CBB_reserve(&ciphertext, &ptr, max_out) || + !EVP_CipherUpdate_ex(ctx.get(), ptr, &n1, max_out, plaintext, + plaintext_len) || + !EVP_CipherFinal_ex2(ctx.get(), ptr + n1, &n2, max_out - n1) || + !CBB_did_write(&ciphertext, n1 + n2) || !CBB_flush(out)) { + goto err; + } + + ret = 1; + } + +err: + OPENSSL_free(plaintext); + OPENSSL_free(salt_buf); + return ret; +} diff --git a/third_party/boringssl/src/crypto/pkcs8/pkcs8_x509.c b/third_party/boringssl/src/crypto/pkcs8/pkcs8_x509.c deleted file mode 100644 index f7b37e9e..00000000 --- a/third_party/boringssl/src/crypto/pkcs8/pkcs8_x509.c +++ /dev/null @@ -1,1355 +0,0 @@ -/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL - * project 1999. - */ -/* ==================================================================== - * Copyright (c) 1999 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" - * - * 4. The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * licensing@OpenSSL.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. 
- * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). */ - -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "internal.h" -#include "../bytestring/internal.h" -#include "../internal.h" - - -int pkcs12_iterations_acceptable(uint64_t iterations) { -#if defined(BORINGSSL_UNSAFE_FUZZER_MODE) - static const uint64_t kIterationsLimit = 2048; -#else - // Windows imposes a limit of 600K. Mozilla say: “so them increasing - // maximum to something like 100M or 1G (to have few decades of breathing - // room) would be very welcome”[1]. So here we set the limit to 100M. - // - // [1] https://bugzilla.mozilla.org/show_bug.cgi?id=1436873#c14 - static const uint64_t kIterationsLimit = 100 * 1000000; -#endif - - return 0 < iterations && iterations <= kIterationsLimit; -} - -ASN1_SEQUENCE(PKCS8_PRIV_KEY_INFO) = { - ASN1_SIMPLE(PKCS8_PRIV_KEY_INFO, version, ASN1_INTEGER), - ASN1_SIMPLE(PKCS8_PRIV_KEY_INFO, pkeyalg, X509_ALGOR), - ASN1_SIMPLE(PKCS8_PRIV_KEY_INFO, pkey, ASN1_OCTET_STRING), - ASN1_IMP_SET_OF_OPT(PKCS8_PRIV_KEY_INFO, attributes, X509_ATTRIBUTE, 0), -} ASN1_SEQUENCE_END(PKCS8_PRIV_KEY_INFO) - -IMPLEMENT_ASN1_FUNCTIONS_const(PKCS8_PRIV_KEY_INFO) - -EVP_PKEY *EVP_PKCS82PKEY(const PKCS8_PRIV_KEY_INFO *p8) { - uint8_t *der = NULL; - int der_len = i2d_PKCS8_PRIV_KEY_INFO(p8, &der); - if (der_len < 0) { - return NULL; - } - - CBS cbs; - CBS_init(&cbs, der, (size_t)der_len); - EVP_PKEY *ret = EVP_parse_private_key(&cbs); - if (ret == NULL || CBS_len(&cbs) != 0) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_DECODE_ERROR); - EVP_PKEY_free(ret); - OPENSSL_free(der); - return NULL; - } - - OPENSSL_free(der); - return ret; -} - -PKCS8_PRIV_KEY_INFO *EVP_PKEY2PKCS8(const EVP_PKEY *pkey) { - CBB 
cbb; - uint8_t *der = NULL; - size_t der_len; - if (!CBB_init(&cbb, 0) || - !EVP_marshal_private_key(&cbb, pkey) || - !CBB_finish(&cbb, &der, &der_len) || - der_len > LONG_MAX) { - CBB_cleanup(&cbb); - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_ENCODE_ERROR); - goto err; - } - - const uint8_t *p = der; - PKCS8_PRIV_KEY_INFO *p8 = d2i_PKCS8_PRIV_KEY_INFO(NULL, &p, (long)der_len); - if (p8 == NULL || p != der + der_len) { - PKCS8_PRIV_KEY_INFO_free(p8); - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_DECODE_ERROR); - goto err; - } - - OPENSSL_free(der); - return p8; - -err: - OPENSSL_free(der); - return NULL; -} - -PKCS8_PRIV_KEY_INFO *PKCS8_decrypt(X509_SIG *pkcs8, const char *pass, - int pass_len_in) { - size_t pass_len; - if (pass_len_in == -1 && pass != NULL) { - pass_len = strlen(pass); - } else { - pass_len = (size_t)pass_len_in; - } - - PKCS8_PRIV_KEY_INFO *ret = NULL; - EVP_PKEY *pkey = NULL; - uint8_t *in = NULL; - - // Convert the legacy ASN.1 object to a byte string. - int in_len = i2d_X509_SIG(pkcs8, &in); - if (in_len < 0) { - goto err; - } - - CBS cbs; - CBS_init(&cbs, in, in_len); - pkey = PKCS8_parse_encrypted_private_key(&cbs, pass, pass_len); - if (pkey == NULL || CBS_len(&cbs) != 0) { - goto err; - } - - ret = EVP_PKEY2PKCS8(pkey); - -err: - OPENSSL_free(in); - EVP_PKEY_free(pkey); - return ret; -} - -X509_SIG *PKCS8_encrypt(int pbe_nid, const EVP_CIPHER *cipher, const char *pass, - int pass_len_in, const uint8_t *salt, size_t salt_len, - int iterations, PKCS8_PRIV_KEY_INFO *p8inf) { - size_t pass_len; - if (pass_len_in == -1 && pass != NULL) { - pass_len = strlen(pass); - } else { - pass_len = (size_t)pass_len_in; - } - - // Parse out the private key. 
- EVP_PKEY *pkey = EVP_PKCS82PKEY(p8inf); - if (pkey == NULL) { - return NULL; - } - - X509_SIG *ret = NULL; - uint8_t *der = NULL; - size_t der_len; - CBB cbb; - if (!CBB_init(&cbb, 128) || - !PKCS8_marshal_encrypted_private_key(&cbb, pbe_nid, cipher, pass, - pass_len, salt, salt_len, iterations, - pkey) || - !CBB_finish(&cbb, &der, &der_len)) { - CBB_cleanup(&cbb); - goto err; - } - - // Convert back to legacy ASN.1 objects. - const uint8_t *ptr = der; - ret = d2i_X509_SIG(NULL, &ptr, der_len); - if (ret == NULL || ptr != der + der_len) { - OPENSSL_PUT_ERROR(PKCS8, ERR_R_INTERNAL_ERROR); - X509_SIG_free(ret); - ret = NULL; - } - -err: - OPENSSL_free(der); - EVP_PKEY_free(pkey); - return ret; -} - -struct pkcs12_context { - EVP_PKEY **out_key; - STACK_OF(X509) *out_certs; - const char *password; - size_t password_len; -}; - -// PKCS12_handle_sequence parses a BER-encoded SEQUENCE of elements in a PKCS#12 -// structure. -static int PKCS12_handle_sequence( - CBS *sequence, struct pkcs12_context *ctx, - int (*handle_element)(CBS *cbs, struct pkcs12_context *ctx)) { - uint8_t *storage = NULL; - CBS in; - int ret = 0; - - // Although a BER->DER conversion is done at the beginning of |PKCS12_parse|, - // the ASN.1 data gets wrapped in OCTETSTRINGs and/or encrypted and the - // conversion cannot see through those wrappings. So each time we step - // through one we need to convert to DER again. 
- if (!CBS_asn1_ber_to_der(sequence, &in, &storage)) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); - return 0; - } - - CBS child; - if (!CBS_get_asn1(&in, &child, CBS_ASN1_SEQUENCE) || - CBS_len(&in) != 0) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); - goto err; - } - - while (CBS_len(&child) > 0) { - CBS element; - if (!CBS_get_asn1(&child, &element, CBS_ASN1_SEQUENCE)) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); - goto err; - } - - if (!handle_element(&element, ctx)) { - goto err; - } - } - - ret = 1; - -err: - OPENSSL_free(storage); - return ret; -} - -// 1.2.840.113549.1.12.10.1.1 -static const uint8_t kKeyBag[] = {0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, - 0x01, 0x0c, 0x0a, 0x01, 0x01}; - -// 1.2.840.113549.1.12.10.1.2 -static const uint8_t kPKCS8ShroudedKeyBag[] = { - 0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x01, 0x0c, 0x0a, 0x01, 0x02}; - -// 1.2.840.113549.1.12.10.1.3 -static const uint8_t kCertBag[] = {0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, - 0x01, 0x0c, 0x0a, 0x01, 0x03}; - -// 1.2.840.113549.1.9.20 -static const uint8_t kFriendlyName[] = {0x2a, 0x86, 0x48, 0x86, 0xf7, - 0x0d, 0x01, 0x09, 0x14}; - -// 1.2.840.113549.1.9.21 -static const uint8_t kLocalKeyID[] = {0x2a, 0x86, 0x48, 0x86, 0xf7, - 0x0d, 0x01, 0x09, 0x15}; - -// 1.2.840.113549.1.9.22.1 -static const uint8_t kX509Certificate[] = {0x2a, 0x86, 0x48, 0x86, 0xf7, - 0x0d, 0x01, 0x09, 0x16, 0x01}; - -// parse_bag_attributes parses the bagAttributes field of a SafeBag structure. -// It sets |*out_friendly_name| to a newly-allocated copy of the friendly name, -// encoded as a UTF-8 string, or NULL if there is none. It returns one on -// success and zero on error. -static int parse_bag_attributes(CBS *attrs, uint8_t **out_friendly_name, - size_t *out_friendly_name_len) { - *out_friendly_name = NULL; - *out_friendly_name_len = 0; - - // See https://tools.ietf.org/html/rfc7292#section-4.2. 
- while (CBS_len(attrs) != 0) { - CBS attr, oid, values; - if (!CBS_get_asn1(attrs, &attr, CBS_ASN1_SEQUENCE) || - !CBS_get_asn1(&attr, &oid, CBS_ASN1_OBJECT) || - !CBS_get_asn1(&attr, &values, CBS_ASN1_SET) || - CBS_len(&attr) != 0) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); - goto err; - } - if (CBS_mem_equal(&oid, kFriendlyName, sizeof(kFriendlyName))) { - // See https://tools.ietf.org/html/rfc2985, section 5.5.1. - CBS value; - if (*out_friendly_name != NULL || - !CBS_get_asn1(&values, &value, CBS_ASN1_BMPSTRING) || - CBS_len(&values) != 0 || - CBS_len(&value) == 0) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); - goto err; - } - // Convert the friendly name to UTF-8. - CBB cbb; - if (!CBB_init(&cbb, CBS_len(&value))) { - OPENSSL_PUT_ERROR(PKCS8, ERR_R_MALLOC_FAILURE); - goto err; - } - while (CBS_len(&value) != 0) { - uint32_t c; - if (!cbs_get_ucs2_be(&value, &c) || - !cbb_add_utf8(&cbb, c)) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_INVALID_CHARACTERS); - CBB_cleanup(&cbb); - goto err; - } - } - if (!CBB_finish(&cbb, out_friendly_name, out_friendly_name_len)) { - OPENSSL_PUT_ERROR(PKCS8, ERR_R_MALLOC_FAILURE); - CBB_cleanup(&cbb); - goto err; - } - } - } - - return 1; - -err: - OPENSSL_free(*out_friendly_name); - *out_friendly_name = NULL; - *out_friendly_name_len = 0; - return 0; -} - -// PKCS12_handle_safe_bag parses a single SafeBag element in a PKCS#12 -// structure. 
-static int PKCS12_handle_safe_bag(CBS *safe_bag, struct pkcs12_context *ctx) { - CBS bag_id, wrapped_value, bag_attrs; - if (!CBS_get_asn1(safe_bag, &bag_id, CBS_ASN1_OBJECT) || - !CBS_get_asn1(safe_bag, &wrapped_value, - CBS_ASN1_CONTEXT_SPECIFIC | CBS_ASN1_CONSTRUCTED | 0)) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); - return 0; - } - if (CBS_len(safe_bag) == 0) { - CBS_init(&bag_attrs, NULL, 0); - } else if (!CBS_get_asn1(safe_bag, &bag_attrs, CBS_ASN1_SET) || - CBS_len(safe_bag) != 0) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); - return 0; - } - - const int is_key_bag = CBS_mem_equal(&bag_id, kKeyBag, sizeof(kKeyBag)); - const int is_shrouded_key_bag = CBS_mem_equal(&bag_id, kPKCS8ShroudedKeyBag, - sizeof(kPKCS8ShroudedKeyBag)); - if (is_key_bag || is_shrouded_key_bag) { - // See RFC 7292, section 4.2.1 and 4.2.2. - if (*ctx->out_key) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_MULTIPLE_PRIVATE_KEYS_IN_PKCS12); - return 0; - } - - EVP_PKEY *pkey = - is_key_bag ? EVP_parse_private_key(&wrapped_value) - : PKCS8_parse_encrypted_private_key( - &wrapped_value, ctx->password, ctx->password_len); - if (pkey == NULL) { - return 0; - } - - if (CBS_len(&wrapped_value) != 0) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); - EVP_PKEY_free(pkey); - return 0; - } - - *ctx->out_key = pkey; - return 1; - } - - if (CBS_mem_equal(&bag_id, kCertBag, sizeof(kCertBag))) { - // See RFC 7292, section 4.2.3. - CBS cert_bag, cert_type, wrapped_cert, cert; - if (!CBS_get_asn1(&wrapped_value, &cert_bag, CBS_ASN1_SEQUENCE) || - !CBS_get_asn1(&cert_bag, &cert_type, CBS_ASN1_OBJECT) || - !CBS_get_asn1(&cert_bag, &wrapped_cert, - CBS_ASN1_CONTEXT_SPECIFIC | CBS_ASN1_CONSTRUCTED | 0) || - !CBS_get_asn1(&wrapped_cert, &cert, CBS_ASN1_OCTETSTRING)) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); - return 0; - } - - // Skip unknown certificate types. 
- if (!CBS_mem_equal(&cert_type, kX509Certificate, - sizeof(kX509Certificate))) { - return 1; - } - - if (CBS_len(&cert) > LONG_MAX) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); - return 0; - } - - const uint8_t *inp = CBS_data(&cert); - X509 *x509 = d2i_X509(NULL, &inp, (long)CBS_len(&cert)); - if (!x509) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); - return 0; - } - - if (inp != CBS_data(&cert) + CBS_len(&cert)) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); - X509_free(x509); - return 0; - } - - uint8_t *friendly_name; - size_t friendly_name_len; - if (!parse_bag_attributes(&bag_attrs, &friendly_name, &friendly_name_len)) { - X509_free(x509); - return 0; - } - int ok = friendly_name_len == 0 || - X509_alias_set1(x509, friendly_name, friendly_name_len); - OPENSSL_free(friendly_name); - if (!ok || - 0 == sk_X509_push(ctx->out_certs, x509)) { - X509_free(x509); - return 0; - } - - return 1; - } - - // Unknown element type - ignore it. - return 1; -} - -// 1.2.840.113549.1.7.1 -static const uint8_t kPKCS7Data[] = {0x2a, 0x86, 0x48, 0x86, 0xf7, - 0x0d, 0x01, 0x07, 0x01}; - -// 1.2.840.113549.1.7.6 -static const uint8_t kPKCS7EncryptedData[] = {0x2a, 0x86, 0x48, 0x86, 0xf7, - 0x0d, 0x01, 0x07, 0x06}; - -// PKCS12_handle_content_info parses a single PKCS#7 ContentInfo element in a -// PKCS#12 structure. -static int PKCS12_handle_content_info(CBS *content_info, - struct pkcs12_context *ctx) { - CBS content_type, wrapped_contents, contents; - int ret = 0; - uint8_t *storage = NULL; - - if (!CBS_get_asn1(content_info, &content_type, CBS_ASN1_OBJECT) || - !CBS_get_asn1(content_info, &wrapped_contents, - CBS_ASN1_CONTEXT_SPECIFIC | CBS_ASN1_CONSTRUCTED | 0) || - CBS_len(content_info) != 0) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); - goto err; - } - - if (CBS_mem_equal(&content_type, kPKCS7EncryptedData, - sizeof(kPKCS7EncryptedData))) { - // See https://tools.ietf.org/html/rfc2315#section-13. 
- // - // PKCS#7 encrypted data inside a PKCS#12 structure is generally an - // encrypted certificate bag and it's generally encrypted with 40-bit - // RC2-CBC. - CBS version_bytes, eci, contents_type, ai, encrypted_contents; - uint8_t *out; - size_t out_len; - - if (!CBS_get_asn1(&wrapped_contents, &contents, CBS_ASN1_SEQUENCE) || - !CBS_get_asn1(&contents, &version_bytes, CBS_ASN1_INTEGER) || - // EncryptedContentInfo, see - // https://tools.ietf.org/html/rfc2315#section-10.1 - !CBS_get_asn1(&contents, &eci, CBS_ASN1_SEQUENCE) || - !CBS_get_asn1(&eci, &contents_type, CBS_ASN1_OBJECT) || - // AlgorithmIdentifier, see - // https://tools.ietf.org/html/rfc5280#section-4.1.1.2 - !CBS_get_asn1(&eci, &ai, CBS_ASN1_SEQUENCE) || - !CBS_get_asn1_implicit_string( - &eci, &encrypted_contents, &storage, - CBS_ASN1_CONTEXT_SPECIFIC | 0, CBS_ASN1_OCTETSTRING)) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); - goto err; - } - - if (!CBS_mem_equal(&contents_type, kPKCS7Data, sizeof(kPKCS7Data))) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); - goto err; - } - - if (!pkcs8_pbe_decrypt(&out, &out_len, &ai, ctx->password, - ctx->password_len, CBS_data(&encrypted_contents), - CBS_len(&encrypted_contents))) { - goto err; - } - - CBS safe_contents; - CBS_init(&safe_contents, out, out_len); - ret = PKCS12_handle_sequence(&safe_contents, ctx, PKCS12_handle_safe_bag); - OPENSSL_free(out); - } else if (CBS_mem_equal(&content_type, kPKCS7Data, sizeof(kPKCS7Data))) { - CBS octet_string_contents; - - if (!CBS_get_asn1(&wrapped_contents, &octet_string_contents, - CBS_ASN1_OCTETSTRING)) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); - goto err; - } - - ret = PKCS12_handle_sequence(&octet_string_contents, ctx, - PKCS12_handle_safe_bag); - } else { - // Unknown element type - ignore it. 
- ret = 1; - } - -err: - OPENSSL_free(storage); - return ret; -} - -static int pkcs12_check_mac(int *out_mac_ok, const char *password, - size_t password_len, const CBS *salt, - unsigned iterations, const EVP_MD *md, - const CBS *authsafes, const CBS *expected_mac) { - int ret = 0; - uint8_t hmac_key[EVP_MAX_MD_SIZE]; - if (!pkcs12_key_gen(password, password_len, CBS_data(salt), CBS_len(salt), - PKCS12_MAC_ID, iterations, EVP_MD_size(md), hmac_key, - md)) { - goto err; - } - - uint8_t hmac[EVP_MAX_MD_SIZE]; - unsigned hmac_len; - if (NULL == HMAC(md, hmac_key, EVP_MD_size(md), CBS_data(authsafes), - CBS_len(authsafes), hmac, &hmac_len)) { - goto err; - } - - *out_mac_ok = CBS_mem_equal(expected_mac, hmac, hmac_len); -#if defined(BORINGSSL_UNSAFE_FUZZER_MODE) - *out_mac_ok = 1; -#endif - ret = 1; - -err: - OPENSSL_cleanse(hmac_key, sizeof(hmac_key)); - return ret; -} - - -int PKCS12_get_key_and_certs(EVP_PKEY **out_key, STACK_OF(X509) *out_certs, - CBS *ber_in, const char *password) { - uint8_t *storage = NULL; - CBS in, pfx, mac_data, authsafe, content_type, wrapped_authsafes, authsafes; - uint64_t version; - int ret = 0; - struct pkcs12_context ctx; - const size_t original_out_certs_len = sk_X509_num(out_certs); - - // The input may be in BER format. - if (!CBS_asn1_ber_to_der(ber_in, &in, &storage)) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); - return 0; - } - - *out_key = NULL; - OPENSSL_memset(&ctx, 0, sizeof(ctx)); - - // See ftp://ftp.rsasecurity.com/pub/pkcs/pkcs-12/pkcs-12v1.pdf, section - // four. 
- if (!CBS_get_asn1(&in, &pfx, CBS_ASN1_SEQUENCE) || - CBS_len(&in) != 0 || - !CBS_get_asn1_uint64(&pfx, &version)) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); - goto err; - } - - if (version < 3) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_VERSION); - goto err; - } - - if (!CBS_get_asn1(&pfx, &authsafe, CBS_ASN1_SEQUENCE)) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); - goto err; - } - - if (CBS_len(&pfx) == 0) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_MISSING_MAC); - goto err; - } - - if (!CBS_get_asn1(&pfx, &mac_data, CBS_ASN1_SEQUENCE)) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); - goto err; - } - - // authsafe is a PKCS#7 ContentInfo. See - // https://tools.ietf.org/html/rfc2315#section-7. - if (!CBS_get_asn1(&authsafe, &content_type, CBS_ASN1_OBJECT) || - !CBS_get_asn1(&authsafe, &wrapped_authsafes, - CBS_ASN1_CONTEXT_SPECIFIC | CBS_ASN1_CONSTRUCTED | 0)) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); - goto err; - } - - // The content type can either be data or signedData. The latter indicates - // that it's signed by a public key, which isn't supported. - if (!CBS_mem_equal(&content_type, kPKCS7Data, sizeof(kPKCS7Data))) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_PKCS12_PUBLIC_KEY_INTEGRITY_NOT_SUPPORTED); - goto err; - } - - if (!CBS_get_asn1(&wrapped_authsafes, &authsafes, CBS_ASN1_OCTETSTRING)) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); - goto err; - } - - ctx.out_key = out_key; - ctx.out_certs = out_certs; - ctx.password = password; - ctx.password_len = password != NULL ? strlen(password) : 0; - - // Verify the MAC. 
- { - CBS mac, salt, expected_mac; - if (!CBS_get_asn1(&mac_data, &mac, CBS_ASN1_SEQUENCE)) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); - goto err; - } - - const EVP_MD *md = EVP_parse_digest_algorithm(&mac); - if (md == NULL) { - goto err; - } - - if (!CBS_get_asn1(&mac, &expected_mac, CBS_ASN1_OCTETSTRING) || - !CBS_get_asn1(&mac_data, &salt, CBS_ASN1_OCTETSTRING)) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); - goto err; - } - - // The iteration count is optional and the default is one. - uint64_t iterations = 1; - if (CBS_len(&mac_data) > 0) { - if (!CBS_get_asn1_uint64(&mac_data, &iterations) || - !pkcs12_iterations_acceptable(iterations)) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); - goto err; - } - } - - int mac_ok; - if (!pkcs12_check_mac(&mac_ok, ctx.password, ctx.password_len, &salt, - iterations, md, &authsafes, &expected_mac)) { - goto err; - } - if (!mac_ok && ctx.password_len == 0) { - // PKCS#12 encodes passwords as NUL-terminated UCS-2, so the empty - // password is encoded as {0, 0}. Some implementations use the empty byte - // array for "no password". OpenSSL considers a non-NULL password as {0, - // 0} and a NULL password as {}. It then, in high-level PKCS#12 parsing - // code, tries both options. We match this behavior. - ctx.password = ctx.password != NULL ? NULL : ""; - if (!pkcs12_check_mac(&mac_ok, ctx.password, ctx.password_len, &salt, - iterations, md, &authsafes, &expected_mac)) { - goto err; - } - } - if (!mac_ok) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_INCORRECT_PASSWORD); - goto err; - } - } - - // authsafes contains a series of PKCS#7 ContentInfos. 
- if (!PKCS12_handle_sequence(&authsafes, &ctx, PKCS12_handle_content_info)) { - goto err; - } - - ret = 1; - -err: - OPENSSL_free(storage); - if (!ret) { - EVP_PKEY_free(*out_key); - *out_key = NULL; - while (sk_X509_num(out_certs) > original_out_certs_len) { - X509 *x509 = sk_X509_pop(out_certs); - X509_free(x509); - } - } - - return ret; -} - -void PKCS12_PBE_add(void) {} - -struct pkcs12_st { - uint8_t *ber_bytes; - size_t ber_len; -}; - -PKCS12 *d2i_PKCS12(PKCS12 **out_p12, const uint8_t **ber_bytes, - size_t ber_len) { - PKCS12 *p12; - - p12 = OPENSSL_malloc(sizeof(PKCS12)); - if (!p12) { - return NULL; - } - - p12->ber_bytes = OPENSSL_malloc(ber_len); - if (!p12->ber_bytes) { - OPENSSL_free(p12); - return NULL; - } - - OPENSSL_memcpy(p12->ber_bytes, *ber_bytes, ber_len); - p12->ber_len = ber_len; - *ber_bytes += ber_len; - - if (out_p12) { - PKCS12_free(*out_p12); - - *out_p12 = p12; - } - - return p12; -} - -PKCS12* d2i_PKCS12_bio(BIO *bio, PKCS12 **out_p12) { - size_t used = 0; - BUF_MEM *buf; - const uint8_t *dummy; - static const size_t kMaxSize = 256 * 1024; - PKCS12 *ret = NULL; - - buf = BUF_MEM_new(); - if (buf == NULL) { - return NULL; - } - if (BUF_MEM_grow(buf, 8192) == 0) { - goto out; - } - - for (;;) { - int n = BIO_read(bio, &buf->data[used], buf->length - used); - if (n < 0) { - if (used == 0) { - goto out; - } - // Workaround a bug in node.js. It uses a memory BIO for this in the wrong - // mode. 
- n = 0; - } - - if (n == 0) { - break; - } - used += n; - - if (used < buf->length) { - continue; - } - - if (buf->length > kMaxSize || - BUF_MEM_grow(buf, buf->length * 2) == 0) { - goto out; - } - } - - dummy = (uint8_t*) buf->data; - ret = d2i_PKCS12(out_p12, &dummy, used); - -out: - BUF_MEM_free(buf); - return ret; -} - -PKCS12* d2i_PKCS12_fp(FILE *fp, PKCS12 **out_p12) { - BIO *bio; - PKCS12 *ret; - - bio = BIO_new_fp(fp, 0 /* don't take ownership */); - if (!bio) { - return NULL; - } - - ret = d2i_PKCS12_bio(bio, out_p12); - BIO_free(bio); - return ret; -} - -int i2d_PKCS12(const PKCS12 *p12, uint8_t **out) { - if (p12->ber_len > INT_MAX) { - OPENSSL_PUT_ERROR(PKCS8, ERR_R_OVERFLOW); - return -1; - } - - if (out == NULL) { - return (int)p12->ber_len; - } - - if (*out == NULL) { - *out = OPENSSL_malloc(p12->ber_len); - if (*out == NULL) { - OPENSSL_PUT_ERROR(PKCS8, ERR_R_MALLOC_FAILURE); - return -1; - } - OPENSSL_memcpy(*out, p12->ber_bytes, p12->ber_len); - } else { - OPENSSL_memcpy(*out, p12->ber_bytes, p12->ber_len); - *out += p12->ber_len; - } - return (int)p12->ber_len; -} - -int i2d_PKCS12_bio(BIO *bio, const PKCS12 *p12) { - return BIO_write_all(bio, p12->ber_bytes, p12->ber_len); -} - -int i2d_PKCS12_fp(FILE *fp, const PKCS12 *p12) { - BIO *bio = BIO_new_fp(fp, 0 /* don't take ownership */); - if (bio == NULL) { - return 0; - } - - int ret = i2d_PKCS12_bio(bio, p12); - BIO_free(bio); - return ret; -} - -int PKCS12_parse(const PKCS12 *p12, const char *password, EVP_PKEY **out_pkey, - X509 **out_cert, STACK_OF(X509) **out_ca_certs) { - CBS ber_bytes; - STACK_OF(X509) *ca_certs = NULL; - char ca_certs_alloced = 0; - - if (out_ca_certs != NULL && *out_ca_certs != NULL) { - ca_certs = *out_ca_certs; - } - - if (!ca_certs) { - ca_certs = sk_X509_new_null(); - if (ca_certs == NULL) { - OPENSSL_PUT_ERROR(PKCS8, ERR_R_MALLOC_FAILURE); - return 0; - } - ca_certs_alloced = 1; - } - - CBS_init(&ber_bytes, p12->ber_bytes, p12->ber_len); - if 
(!PKCS12_get_key_and_certs(out_pkey, ca_certs, &ber_bytes, password)) { - if (ca_certs_alloced) { - sk_X509_free(ca_certs); - } - return 0; - } - - // OpenSSL selects the last certificate which matches the private key as - // |out_cert|. - *out_cert = NULL; - size_t num_certs = sk_X509_num(ca_certs); - if (*out_pkey != NULL && num_certs > 0) { - for (size_t i = num_certs - 1; i < num_certs; i--) { - X509 *cert = sk_X509_value(ca_certs, i); - if (X509_check_private_key(cert, *out_pkey)) { - *out_cert = cert; - sk_X509_delete(ca_certs, i); - break; - } - ERR_clear_error(); - } - } - - if (out_ca_certs) { - *out_ca_certs = ca_certs; - } else { - sk_X509_pop_free(ca_certs, X509_free); - } - - return 1; -} - -int PKCS12_verify_mac(const PKCS12 *p12, const char *password, - int password_len) { - if (password == NULL) { - if (password_len != 0) { - return 0; - } - } else if (password_len != -1 && - (password[password_len] != 0 || - OPENSSL_memchr(password, 0, password_len) != NULL)) { - return 0; - } - - EVP_PKEY *pkey = NULL; - X509 *cert = NULL; - if (!PKCS12_parse(p12, password, &pkey, &cert, NULL)) { - ERR_clear_error(); - return 0; - } - - EVP_PKEY_free(pkey); - X509_free(cert); - - return 1; -} - -// add_bag_attributes adds the bagAttributes field of a SafeBag structure, -// containing the specified friendlyName and localKeyId attributes. -static int add_bag_attributes(CBB *bag, const char *name, size_t name_len, - const uint8_t *key_id, size_t key_id_len) { - if (name == NULL && key_id_len == 0) { - return 1; // Omit the OPTIONAL SET. - } - // See https://tools.ietf.org/html/rfc7292#section-4.2. - CBB attrs, attr, oid, values, value; - if (!CBB_add_asn1(bag, &attrs, CBS_ASN1_SET)) { - return 0; - } - if (name_len != 0) { - // See https://tools.ietf.org/html/rfc2985, section 5.5.1. 
- if (!CBB_add_asn1(&attrs, &attr, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1(&attr, &oid, CBS_ASN1_OBJECT) || - !CBB_add_bytes(&oid, kFriendlyName, sizeof(kFriendlyName)) || - !CBB_add_asn1(&attr, &values, CBS_ASN1_SET) || - !CBB_add_asn1(&values, &value, CBS_ASN1_BMPSTRING)) { - return 0; - } - // Convert the friendly name to a BMPString. - CBS name_cbs; - CBS_init(&name_cbs, (const uint8_t *)name, name_len); - while (CBS_len(&name_cbs) != 0) { - uint32_t c; - if (!cbs_get_utf8(&name_cbs, &c) || - !cbb_add_ucs2_be(&value, c)) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_INVALID_CHARACTERS); - return 0; - } - } - } - if (key_id_len != 0) { - // See https://tools.ietf.org/html/rfc2985, section 5.5.2. - if (!CBB_add_asn1(&attrs, &attr, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1(&attr, &oid, CBS_ASN1_OBJECT) || - !CBB_add_bytes(&oid, kLocalKeyID, sizeof(kLocalKeyID)) || - !CBB_add_asn1(&attr, &values, CBS_ASN1_SET) || - !CBB_add_asn1(&values, &value, CBS_ASN1_OCTETSTRING) || - !CBB_add_bytes(&value, key_id, key_id_len)) { - return 0; - } - } - return CBB_flush_asn1_set_of(&attrs) && - CBB_flush(bag); -} - -static int add_cert_bag(CBB *cbb, X509 *cert, const char *name, - const uint8_t *key_id, size_t key_id_len) { - CBB bag, bag_oid, bag_contents, cert_bag, cert_type, wrapped_cert, cert_value; - if (// See https://tools.ietf.org/html/rfc7292#section-4.2. - !CBB_add_asn1(cbb, &bag, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1(&bag, &bag_oid, CBS_ASN1_OBJECT) || - !CBB_add_bytes(&bag_oid, kCertBag, sizeof(kCertBag)) || - !CBB_add_asn1(&bag, &bag_contents, - CBS_ASN1_CONSTRUCTED | CBS_ASN1_CONTEXT_SPECIFIC | 0) || - // See https://tools.ietf.org/html/rfc7292#section-4.2.3. 
- !CBB_add_asn1(&bag_contents, &cert_bag, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1(&cert_bag, &cert_type, CBS_ASN1_OBJECT) || - !CBB_add_bytes(&cert_type, kX509Certificate, sizeof(kX509Certificate)) || - !CBB_add_asn1(&cert_bag, &wrapped_cert, - CBS_ASN1_CONSTRUCTED | CBS_ASN1_CONTEXT_SPECIFIC | 0) || - !CBB_add_asn1(&wrapped_cert, &cert_value, CBS_ASN1_OCTETSTRING)) { - return 0; - } - uint8_t *buf; - int len = i2d_X509(cert, NULL); - - int int_name_len = 0; - const char *cert_name = (const char *)X509_alias_get0(cert, &int_name_len); - size_t name_len = int_name_len; - if (name) { - if (name_len != 0) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_AMBIGUOUS_FRIENDLY_NAME); - return 0; - } - name_len = strlen(name); - } else { - name = cert_name; - } - - if (len < 0 || - !CBB_add_space(&cert_value, &buf, (size_t)len) || - i2d_X509(cert, &buf) < 0 || - !add_bag_attributes(&bag, name, name_len, key_id, key_id_len) || - !CBB_flush(cbb)) { - return 0; - } - return 1; -} - -static int add_cert_safe_contents(CBB *cbb, X509 *cert, - const STACK_OF(X509) *chain, const char *name, - const uint8_t *key_id, size_t key_id_len) { - CBB safe_contents; - if (!CBB_add_asn1(cbb, &safe_contents, CBS_ASN1_SEQUENCE) || - (cert != NULL && - !add_cert_bag(&safe_contents, cert, name, key_id, key_id_len))) { - return 0; - } - - for (size_t i = 0; i < sk_X509_num(chain); i++) { - // Only the leaf certificate gets attributes. 
- if (!add_cert_bag(&safe_contents, sk_X509_value(chain, i), NULL, NULL, 0)) { - return 0; - } - } - - return CBB_flush(cbb); -} - -static int add_encrypted_data(CBB *out, int pbe_nid, const char *password, - size_t password_len, unsigned iterations, - const uint8_t *in, size_t in_len) { - uint8_t salt[PKCS5_SALT_LEN]; - if (!RAND_bytes(salt, sizeof(salt))) { - return 0; - } - - int ret = 0; - EVP_CIPHER_CTX ctx; - EVP_CIPHER_CTX_init(&ctx); - CBB content_info, type, wrapper, encrypted_data, encrypted_content_info, - inner_type, encrypted_content; - if (// Add the ContentInfo wrapping. - !CBB_add_asn1(out, &content_info, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1(&content_info, &type, CBS_ASN1_OBJECT) || - !CBB_add_bytes(&type, kPKCS7EncryptedData, sizeof(kPKCS7EncryptedData)) || - !CBB_add_asn1(&content_info, &wrapper, - CBS_ASN1_CONSTRUCTED | CBS_ASN1_CONTEXT_SPECIFIC | 0) || - // See https://tools.ietf.org/html/rfc2315#section-13. - !CBB_add_asn1(&wrapper, &encrypted_data, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1_uint64(&encrypted_data, 0 /* version */) || - // See https://tools.ietf.org/html/rfc2315#section-10.1. - !CBB_add_asn1(&encrypted_data, &encrypted_content_info, - CBS_ASN1_SEQUENCE) || - !CBB_add_asn1(&encrypted_content_info, &inner_type, CBS_ASN1_OBJECT) || - !CBB_add_bytes(&inner_type, kPKCS7Data, sizeof(kPKCS7Data)) || - // Set up encryption and fill in contentEncryptionAlgorithm. - !pkcs12_pbe_encrypt_init(&encrypted_content_info, &ctx, pbe_nid, - iterations, password, password_len, salt, - sizeof(salt)) || - // Note this tag is primitive. It is an implicitly-tagged OCTET_STRING, so - // it inherits the inner tag's constructed bit. 
- !CBB_add_asn1(&encrypted_content_info, &encrypted_content, - CBS_ASN1_CONTEXT_SPECIFIC | 0)) { - goto err; - } - - size_t max_out = in_len + EVP_CIPHER_CTX_block_size(&ctx); - if (max_out < in_len) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_TOO_LONG); - goto err; - } - - uint8_t *ptr; - int n1, n2; - if (!CBB_reserve(&encrypted_content, &ptr, max_out) || - !EVP_CipherUpdate(&ctx, ptr, &n1, in, in_len) || - !EVP_CipherFinal_ex(&ctx, ptr + n1, &n2) || - !CBB_did_write(&encrypted_content, n1 + n2) || - !CBB_flush(out)) { - goto err; - } - - ret = 1; - -err: - EVP_CIPHER_CTX_cleanup(&ctx); - return ret; -} - -PKCS12 *PKCS12_create(const char *password, const char *name, - const EVP_PKEY *pkey, X509 *cert, - const STACK_OF(X509)* chain, int key_nid, int cert_nid, - int iterations, int mac_iterations, int key_type) { - if (key_nid == 0) { - key_nid = NID_pbe_WithSHA1And3_Key_TripleDES_CBC; - } - if (cert_nid == 0) { - cert_nid = NID_pbe_WithSHA1And40BitRC2_CBC; - } - if (iterations == 0) { - iterations = PKCS12_DEFAULT_ITER; - } - if (mac_iterations == 0) { - mac_iterations = 1; - } - if (// In OpenSSL, this specifies a non-standard Microsoft key usage extension - // which we do not currently support. - key_type != 0 || - // In OpenSSL, -1 here means to omit the MAC, which we do not - // currently support. Omitting it is also invalid for a password-based - // PKCS#12 file. - mac_iterations < 0 || - // Don't encode empty objects. - (pkey == NULL && cert == NULL && sk_X509_num(chain) == 0)) { - OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_UNSUPPORTED_OPTIONS); - return 0; - } - - // PKCS#12 is a very confusing recursive data format, built out of another - // recursive data format. Section 5.1 of RFC 7292 describes the encoding - // algorithm, but there is no clear overview. A quick summary: - // - // PKCS#7 defines a ContentInfo structure, which is a overgeneralized typed - // combinator structure for applying cryptography. We care about two types. 
A - // data ContentInfo contains an OCTET STRING and is a leaf node of the - // combinator tree. An encrypted-data ContentInfo contains encryption - // parameters (key derivation and encryption) and wraps another ContentInfo, - // usually data. - // - // A PKCS#12 file is a PFX structure (section 4), which contains a single data - // ContentInfo and a MAC over it. This root ContentInfo is the - // AuthenticatedSafe and its payload is a SEQUENCE of other ContentInfos, so - // that different parts of the PKCS#12 file can by differently protected. - // - // Each ContentInfo in the AuthenticatedSafe, after undoing all the PKCS#7 - // combinators, has SafeContents payload. A SafeContents is a SEQUENCE of - // SafeBag. SafeBag is PKCS#12's typed structure, with subtypes such as KeyBag - // and CertBag. Confusingly, there is a SafeContents bag type which itself - // recursively contains more SafeBags, but we do not implement this. Bags also - // can have attributes. - // - // The grouping of SafeBags into intermediate ContentInfos does not appear to - // be significant, except that all SafeBags sharing a ContentInfo have the - // same level of protection. Additionally, while keys may be encrypted by - // placing a KeyBag in an encrypted-data ContentInfo, PKCS#12 also defines a - // key-specific encryption container, PKCS8ShroudedKeyBag, which is used - // instead. - - // Note that |password| may be NULL to specify no password, rather than the - // empty string. They are encoded differently in PKCS#12. (One is the empty - // byte array and the other is NUL-terminated UCS-2.) - size_t password_len = password != NULL ? strlen(password) : 0; - - uint8_t key_id[EVP_MAX_MD_SIZE]; - unsigned key_id_len = 0; - if (cert != NULL && pkey != NULL) { - if (!X509_check_private_key(cert, pkey) || - // Matching OpenSSL, use the SHA-1 hash of the certificate as the local - // key ID. Some PKCS#12 consumers require one to connect the private key - // and certificate. 
- !X509_digest(cert, EVP_sha1(), key_id, &key_id_len)) { - return 0; - } - } - - // See https://tools.ietf.org/html/rfc7292#section-4. - PKCS12 *ret = NULL; - CBB cbb, pfx, auth_safe, auth_safe_oid, auth_safe_wrapper, auth_safe_data, - content_infos; - uint8_t mac_key[EVP_MAX_MD_SIZE]; - if (!CBB_init(&cbb, 0) || - !CBB_add_asn1(&cbb, &pfx, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1_uint64(&pfx, 3) || - // auth_safe is a data ContentInfo. - !CBB_add_asn1(&pfx, &auth_safe, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1(&auth_safe, &auth_safe_oid, CBS_ASN1_OBJECT) || - !CBB_add_bytes(&auth_safe_oid, kPKCS7Data, sizeof(kPKCS7Data)) || - !CBB_add_asn1(&auth_safe, &auth_safe_wrapper, - CBS_ASN1_CONSTRUCTED | CBS_ASN1_CONTEXT_SPECIFIC | 0) || - !CBB_add_asn1(&auth_safe_wrapper, &auth_safe_data, - CBS_ASN1_OCTETSTRING) || - // See https://tools.ietf.org/html/rfc7292#section-4.1. |auth_safe|'s - // contains a SEQUENCE of ContentInfos. - !CBB_add_asn1(&auth_safe_data, &content_infos, CBS_ASN1_SEQUENCE)) { - goto err; - } - - // If there are any certificates, place them in CertBags wrapped in a single - // encrypted ContentInfo. - if (cert != NULL || sk_X509_num(chain) > 0) { - if (cert_nid < 0) { - // Place the certificates in an unencrypted ContentInfo. This could be - // more compactly-encoded by reusing the same ContentInfo as the key, but - // OpenSSL does not do this. We keep them separate for consistency. (Keys, - // even when encrypted, are always placed in unencrypted ContentInfos. - // PKCS#12 defines bag-level encryption for keys.) 
- CBB content_info, oid, wrapper, data; - if (!CBB_add_asn1(&content_infos, &content_info, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1(&content_info, &oid, CBS_ASN1_OBJECT) || - !CBB_add_bytes(&oid, kPKCS7Data, sizeof(kPKCS7Data)) || - !CBB_add_asn1(&content_info, &wrapper, - CBS_ASN1_CONSTRUCTED | CBS_ASN1_CONTEXT_SPECIFIC | 0) || - !CBB_add_asn1(&wrapper, &data, CBS_ASN1_OCTETSTRING) || - !add_cert_safe_contents(&data, cert, chain, name, key_id, - key_id_len) || - !CBB_flush(&content_infos)) { - goto err; - } - } else { - CBB plaintext_cbb; - int ok = CBB_init(&plaintext_cbb, 0) && - add_cert_safe_contents(&plaintext_cbb, cert, chain, name, key_id, - key_id_len) && - add_encrypted_data( - &content_infos, cert_nid, password, password_len, iterations, - CBB_data(&plaintext_cbb), CBB_len(&plaintext_cbb)); - CBB_cleanup(&plaintext_cbb); - if (!ok) { - goto err; - } - } - } - - // If there is a key, place it in a single KeyBag or PKCS8ShroudedKeyBag - // wrapped in an unencrypted ContentInfo. (One could also place it in a KeyBag - // inside an encrypted ContentInfo, but OpenSSL does not do this and some - // PKCS#12 consumers do not support KeyBags.) - if (pkey != NULL) { - CBB content_info, oid, wrapper, data, safe_contents, bag, bag_oid, - bag_contents; - if (// Add another data ContentInfo. - !CBB_add_asn1(&content_infos, &content_info, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1(&content_info, &oid, CBS_ASN1_OBJECT) || - !CBB_add_bytes(&oid, kPKCS7Data, sizeof(kPKCS7Data)) || - !CBB_add_asn1(&content_info, &wrapper, - CBS_ASN1_CONSTRUCTED | CBS_ASN1_CONTEXT_SPECIFIC | 0) || - !CBB_add_asn1(&wrapper, &data, CBS_ASN1_OCTETSTRING) || - !CBB_add_asn1(&data, &safe_contents, CBS_ASN1_SEQUENCE) || - // Add a SafeBag containing a PKCS8ShroudedKeyBag. 
- !CBB_add_asn1(&safe_contents, &bag, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1(&bag, &bag_oid, CBS_ASN1_OBJECT)) { - goto err; - } - if (key_nid < 0) { - if (!CBB_add_bytes(&bag_oid, kKeyBag, sizeof(kKeyBag)) || - !CBB_add_asn1(&bag, &bag_contents, - CBS_ASN1_CONSTRUCTED | CBS_ASN1_CONTEXT_SPECIFIC | 0) || - !EVP_marshal_private_key(&bag_contents, pkey)) { - goto err; - } - } else { - if (!CBB_add_bytes(&bag_oid, kPKCS8ShroudedKeyBag, - sizeof(kPKCS8ShroudedKeyBag)) || - !CBB_add_asn1(&bag, &bag_contents, - CBS_ASN1_CONSTRUCTED | CBS_ASN1_CONTEXT_SPECIFIC | 0) || - !PKCS8_marshal_encrypted_private_key( - &bag_contents, key_nid, NULL, password, password_len, - NULL /* generate a random salt */, - 0 /* use default salt length */, iterations, pkey)) { - goto err; - } - } - size_t name_len = 0; - if (name) { - name_len = strlen(name); - } - if (!add_bag_attributes(&bag, name, name_len, key_id, key_id_len) || - !CBB_flush(&content_infos)) { - goto err; - } - } - - // Compute the MAC. Match OpenSSL in using SHA-1 as the hash function. The MAC - // covers |auth_safe_data|. 
- const EVP_MD *mac_md = EVP_sha1(); - uint8_t mac_salt[PKCS5_SALT_LEN]; - uint8_t mac[EVP_MAX_MD_SIZE]; - unsigned mac_len; - if (!CBB_flush(&auth_safe_data) || - !RAND_bytes(mac_salt, sizeof(mac_salt)) || - !pkcs12_key_gen(password, password_len, mac_salt, sizeof(mac_salt), - PKCS12_MAC_ID, mac_iterations, EVP_MD_size(mac_md), - mac_key, mac_md) || - !HMAC(mac_md, mac_key, EVP_MD_size(mac_md), CBB_data(&auth_safe_data), - CBB_len(&auth_safe_data), mac, &mac_len)) { - goto err; - } - - CBB mac_data, digest_info, mac_cbb, mac_salt_cbb; - if (!CBB_add_asn1(&pfx, &mac_data, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1(&mac_data, &digest_info, CBS_ASN1_SEQUENCE) || - !EVP_marshal_digest_algorithm(&digest_info, mac_md) || - !CBB_add_asn1(&digest_info, &mac_cbb, CBS_ASN1_OCTETSTRING) || - !CBB_add_bytes(&mac_cbb, mac, mac_len) || - !CBB_add_asn1(&mac_data, &mac_salt_cbb, CBS_ASN1_OCTETSTRING) || - !CBB_add_bytes(&mac_salt_cbb, mac_salt, sizeof(mac_salt)) || - // The iteration count has a DEFAULT of 1, but RFC 7292 says "The default - // is for historical reasons and its use is deprecated." Thus we - // explicitly encode the iteration count, though it is not valid DER. - !CBB_add_asn1_uint64(&mac_data, mac_iterations)) { - goto err; - } - - ret = OPENSSL_malloc(sizeof(PKCS12)); - if (ret == NULL || - !CBB_finish(&cbb, &ret->ber_bytes, &ret->ber_len)) { - OPENSSL_free(ret); - ret = NULL; - goto err; - } - -err: - OPENSSL_cleanse(mac_key, sizeof(mac_key)); - CBB_cleanup(&cbb); - return ret; -} - -void PKCS12_free(PKCS12 *p12) { - if (p12 == NULL) { - return; - } - OPENSSL_free(p12->ber_bytes); - OPENSSL_free(p12); -} diff --git a/third_party/boringssl/src/crypto/pkcs8/pkcs8_x509.cc b/third_party/boringssl/src/crypto/pkcs8/pkcs8_x509.cc new file mode 100644 index 00000000..3f18013d --- /dev/null +++ b/third_party/boringssl/src/crypto/pkcs8/pkcs8_x509.cc @@ -0,0 +1,1328 @@ +// Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../bytestring/internal.h" +#include "../internal.h" +#include "../mem_internal.h" +#include "../x509/internal.h" +#include "internal.h" + + +using namespace bssl; + +int bssl::pkcs12_iterations_acceptable(uint64_t iterations) { +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) + static const uint64_t kIterationsLimit = 2048; +#else + // Windows imposes a limit of 600K. Mozilla say: “so them increasing + // maximum to something like 100M or 1G (to have few decades of breathing + // room) would be very welcome”[1]. So here we set the limit to 100M. 
+ // + // [1] https://bugzilla.mozilla.org/show_bug.cgi?id=1436873#c14 + static const uint64_t kIterationsLimit = 100 * 1000000; +#endif + + assert(kIterationsLimit <= UINT32_MAX); + return 0 < iterations && iterations <= kIterationsLimit; +} + +ASN1_SEQUENCE(PKCS8_PRIV_KEY_INFO) = { + ASN1_SIMPLE(PKCS8_PRIV_KEY_INFO, version, ASN1_INTEGER), + ASN1_SIMPLE(PKCS8_PRIV_KEY_INFO, pkeyalg, X509_ALGOR), + ASN1_SIMPLE(PKCS8_PRIV_KEY_INFO, pkey, ASN1_OCTET_STRING), + ASN1_IMP_SET_OF_OPT(PKCS8_PRIV_KEY_INFO, attributes, bssl::X509_ATTRIBUTE, + 0), +} ASN1_SEQUENCE_END(PKCS8_PRIV_KEY_INFO) + +IMPLEMENT_ASN1_FUNCTIONS_const(PKCS8_PRIV_KEY_INFO) + +EVP_PKEY *EVP_PKCS82PKEY(const PKCS8_PRIV_KEY_INFO *p8) { + uint8_t *der = nullptr; + int der_len = i2d_PKCS8_PRIV_KEY_INFO(p8, &der); + if (der_len < 0) { + return nullptr; + } + + CBS cbs; + CBS_init(&cbs, der, (size_t)der_len); + EVP_PKEY *ret = EVP_parse_private_key(&cbs); + if (ret == nullptr || CBS_len(&cbs) != 0) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_DECODE_ERROR); + EVP_PKEY_free(ret); + OPENSSL_free(der); + return nullptr; + } + + OPENSSL_free(der); + return ret; +} + +PKCS8_PRIV_KEY_INFO *EVP_PKEY2PKCS8(const EVP_PKEY *pkey) { + CBB cbb; + uint8_t *der = nullptr; + size_t der_len; + if (!CBB_init(&cbb, 0) || !EVP_marshal_private_key(&cbb, pkey) || + !CBB_finish(&cbb, &der, &der_len) || der_len > LONG_MAX) { + CBB_cleanup(&cbb); + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_ENCODE_ERROR); + OPENSSL_free(der); + return nullptr; + } + + const uint8_t *p = der; + PKCS8_PRIV_KEY_INFO *p8 = d2i_PKCS8_PRIV_KEY_INFO(nullptr, &p, (long)der_len); + if (p8 == nullptr || p != der + der_len) { + PKCS8_PRIV_KEY_INFO_free(p8); + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_DECODE_ERROR); + goto err; + } + + OPENSSL_free(der); + return p8; + +err: + OPENSSL_free(der); + return nullptr; +} + +PKCS8_PRIV_KEY_INFO *PKCS8_decrypt(X509_SIG *pkcs8, const char *pass, + int pass_len_in) { + size_t pass_len; + if (pass_len_in == -1 && pass != nullptr) { + pass_len = 
strlen(pass); + } else { + pass_len = (size_t)pass_len_in; + } + + PKCS8_PRIV_KEY_INFO *ret = nullptr; + EVP_PKEY *pkey = nullptr; + uint8_t *in = nullptr; + + // Convert the legacy ASN.1 object to a byte string. + int in_len = i2d_X509_SIG(pkcs8, &in); + if (in_len < 0) { + goto err; + } + + CBS cbs; + CBS_init(&cbs, in, in_len); + pkey = PKCS8_parse_encrypted_private_key(&cbs, pass, pass_len); + if (pkey == nullptr || CBS_len(&cbs) != 0) { + goto err; + } + + ret = EVP_PKEY2PKCS8(pkey); + +err: + OPENSSL_free(in); + EVP_PKEY_free(pkey); + return ret; +} + +X509_SIG *PKCS8_encrypt(int pbe_nid, const EVP_CIPHER *cipher, const char *pass, + int pass_len_in, const uint8_t *salt, size_t salt_len, + int iterations, PKCS8_PRIV_KEY_INFO *p8inf) { + size_t pass_len; + if (pass_len_in == -1 && pass != nullptr) { + pass_len = strlen(pass); + } else { + pass_len = (size_t)pass_len_in; + } + + // Parse out the private key. + EVP_PKEY *pkey = EVP_PKCS82PKEY(p8inf); + if (pkey == nullptr) { + return nullptr; + } + + X509_SIG *ret = nullptr; + uint8_t *der = nullptr; + const uint8_t *ptr; + size_t der_len; + CBB cbb; + if (!CBB_init(&cbb, 128) || + !PKCS8_marshal_encrypted_private_key(&cbb, pbe_nid, cipher, pass, + pass_len, salt, salt_len, iterations, + pkey) || + !CBB_finish(&cbb, &der, &der_len)) { + CBB_cleanup(&cbb); + goto err; + } + + // Convert back to legacy ASN.1 objects. + ptr = der; + ret = d2i_X509_SIG(nullptr, &ptr, der_len); + if (ret == nullptr || ptr != der + der_len) { + OPENSSL_PUT_ERROR(PKCS8, ERR_R_INTERNAL_ERROR); + X509_SIG_free(ret); + ret = nullptr; + } + +err: + OPENSSL_free(der); + EVP_PKEY_free(pkey); + return ret; +} + +struct pkcs12_context { + EVP_PKEY **out_key; + STACK_OF(X509) *out_certs; + const char *password; + size_t password_len; +}; + +// PKCS12_handle_sequence parses a BER-encoded SEQUENCE of elements in a PKCS#12 +// structure. 
+static int PKCS12_handle_sequence( + CBS *sequence, struct pkcs12_context *ctx, + int (*handle_element)(CBS *cbs, struct pkcs12_context *ctx)) { + uint8_t *storage = nullptr; + CBS in; + int ret = 0; + + // Although a BER->DER conversion is done at the beginning of |PKCS12_parse|, + // the ASN.1 data gets wrapped in OCTETSTRINGs and/or encrypted and the + // conversion cannot see through those wrappings. So each time we step + // through one we need to convert to DER again. + if (!CBS_asn1_ber_to_der(sequence, &in, &storage)) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); + return 0; + } + + CBS child; + if (!CBS_get_asn1(&in, &child, CBS_ASN1_SEQUENCE) || CBS_len(&in) != 0) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); + goto err; + } + + while (CBS_len(&child) > 0) { + CBS element; + if (!CBS_get_asn1(&child, &element, CBS_ASN1_SEQUENCE)) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); + goto err; + } + + if (!handle_element(&element, ctx)) { + goto err; + } + } + + ret = 1; + +err: + OPENSSL_free(storage); + return ret; +} + +// 1.2.840.113549.1.12.10.1.1 +static const uint8_t kKeyBag[] = {0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, + 0x01, 0x0c, 0x0a, 0x01, 0x01}; + +// 1.2.840.113549.1.12.10.1.2 +static const uint8_t kPKCS8ShroudedKeyBag[] = { + 0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x01, 0x0c, 0x0a, 0x01, 0x02}; + +// 1.2.840.113549.1.12.10.1.3 +static const uint8_t kCertBag[] = {0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, + 0x01, 0x0c, 0x0a, 0x01, 0x03}; + +// 1.2.840.113549.1.9.20 +static const uint8_t kFriendlyName[] = {0x2a, 0x86, 0x48, 0x86, 0xf7, + 0x0d, 0x01, 0x09, 0x14}; + +// 1.2.840.113549.1.9.21 +static const uint8_t kLocalKeyID[] = {0x2a, 0x86, 0x48, 0x86, 0xf7, + 0x0d, 0x01, 0x09, 0x15}; + +// 1.2.840.113549.1.9.22.1 +static const uint8_t kX509Certificate[] = {0x2a, 0x86, 0x48, 0x86, 0xf7, + 0x0d, 0x01, 0x09, 0x16, 0x01}; + +// parse_bag_attributes parses the bagAttributes field of a SafeBag structure. 
+// It sets |*out_friendly_name| to a newly-allocated copy of the friendly name, +// encoded as a UTF-8 string, or NULL if there is none. It returns one on +// success and zero on error. +static int parse_bag_attributes(CBS *attrs, uint8_t **out_friendly_name, + size_t *out_friendly_name_len) { + *out_friendly_name = nullptr; + *out_friendly_name_len = 0; + + // See https://tools.ietf.org/html/rfc7292#section-4.2. + while (CBS_len(attrs) != 0) { + CBS attr, oid, values; + if (!CBS_get_asn1(attrs, &attr, CBS_ASN1_SEQUENCE) || + !CBS_get_asn1(&attr, &oid, CBS_ASN1_OBJECT) || + !CBS_get_asn1(&attr, &values, CBS_ASN1_SET) || CBS_len(&attr) != 0) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); + goto err; + } + if (CBS_mem_equal(&oid, kFriendlyName, sizeof(kFriendlyName))) { + // See https://tools.ietf.org/html/rfc2985, section 5.5.1. + CBS value; + if (*out_friendly_name != nullptr || + !CBS_get_asn1(&values, &value, CBS_ASN1_BMPSTRING) || + CBS_len(&values) != 0 || CBS_len(&value) == 0) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); + goto err; + } + // Convert the friendly name to UTF-8. + CBB cbb; + if (!CBB_init(&cbb, CBS_len(&value))) { + goto err; + } + while (CBS_len(&value) != 0) { + uint32_t c; + if (!CBS_get_ucs2_be(&value, &c) || !CBB_add_utf8(&cbb, c)) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_INVALID_CHARACTERS); + CBB_cleanup(&cbb); + goto err; + } + } + if (!CBB_finish(&cbb, out_friendly_name, out_friendly_name_len)) { + CBB_cleanup(&cbb); + goto err; + } + } + } + + return 1; + +err: + OPENSSL_free(*out_friendly_name); + *out_friendly_name = nullptr; + *out_friendly_name_len = 0; + return 0; +} + +// PKCS12_handle_safe_bag parses a single SafeBag element in a PKCS#12 +// structure. 
+static int PKCS12_handle_safe_bag(CBS *safe_bag, struct pkcs12_context *ctx) { + CBS bag_id, wrapped_value, bag_attrs; + if (!CBS_get_asn1(safe_bag, &bag_id, CBS_ASN1_OBJECT) || + !CBS_get_asn1(safe_bag, &wrapped_value, + CBS_ASN1_CONTEXT_SPECIFIC | CBS_ASN1_CONSTRUCTED | 0)) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); + return 0; + } + if (CBS_len(safe_bag) == 0) { + CBS_init(&bag_attrs, nullptr, 0); + } else if (!CBS_get_asn1(safe_bag, &bag_attrs, CBS_ASN1_SET) || + CBS_len(safe_bag) != 0) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); + return 0; + } + + const int is_key_bag = CBS_mem_equal(&bag_id, kKeyBag, sizeof(kKeyBag)); + const int is_shrouded_key_bag = CBS_mem_equal(&bag_id, kPKCS8ShroudedKeyBag, + sizeof(kPKCS8ShroudedKeyBag)); + if (is_key_bag || is_shrouded_key_bag) { + // See RFC 7292, section 4.2.1 and 4.2.2. + if (*ctx->out_key) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_MULTIPLE_PRIVATE_KEYS_IN_PKCS12); + return 0; + } + + EVP_PKEY *pkey = + is_key_bag ? EVP_parse_private_key(&wrapped_value) + : PKCS8_parse_encrypted_private_key( + &wrapped_value, ctx->password, ctx->password_len); + if (pkey == nullptr) { + return 0; + } + + if (CBS_len(&wrapped_value) != 0) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); + EVP_PKEY_free(pkey); + return 0; + } + + *ctx->out_key = pkey; + return 1; + } + + if (CBS_mem_equal(&bag_id, kCertBag, sizeof(kCertBag))) { + // See RFC 7292, section 4.2.3. + CBS cert_bag, cert_type, wrapped_cert, cert; + if (!CBS_get_asn1(&wrapped_value, &cert_bag, CBS_ASN1_SEQUENCE) || + !CBS_get_asn1(&cert_bag, &cert_type, CBS_ASN1_OBJECT) || + !CBS_get_asn1(&cert_bag, &wrapped_cert, + CBS_ASN1_CONTEXT_SPECIFIC | CBS_ASN1_CONSTRUCTED | 0) || + !CBS_get_asn1(&wrapped_cert, &cert, CBS_ASN1_OCTETSTRING)) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); + return 0; + } + + // Skip unknown certificate types. 
+ if (!CBS_mem_equal(&cert_type, kX509Certificate, + sizeof(kX509Certificate))) { + return 1; + } + + if (CBS_len(&cert) > LONG_MAX) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); + return 0; + } + + const uint8_t *inp = CBS_data(&cert); + X509 *x509 = d2i_X509(nullptr, &inp, (long)CBS_len(&cert)); + if (!x509) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); + return 0; + } + + if (inp != CBS_data(&cert) + CBS_len(&cert)) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); + X509_free(x509); + return 0; + } + + uint8_t *friendly_name; + size_t friendly_name_len; + if (!parse_bag_attributes(&bag_attrs, &friendly_name, &friendly_name_len)) { + X509_free(x509); + return 0; + } + int ok = friendly_name_len == 0 || + X509_alias_set1(x509, friendly_name, friendly_name_len); + OPENSSL_free(friendly_name); + if (!ok || 0 == sk_X509_push(ctx->out_certs, x509)) { + X509_free(x509); + return 0; + } + + return 1; + } + + // Unknown element type - ignore it. + return 1; +} + +// 1.2.840.113549.1.7.1 +static const uint8_t kPKCS7Data[] = {0x2a, 0x86, 0x48, 0x86, 0xf7, + 0x0d, 0x01, 0x07, 0x01}; + +// 1.2.840.113549.1.7.6 +static const uint8_t kPKCS7EncryptedData[] = {0x2a, 0x86, 0x48, 0x86, 0xf7, + 0x0d, 0x01, 0x07, 0x06}; + +// PKCS12_handle_content_info parses a single PKCS#7 ContentInfo element in a +// PKCS#12 structure. +static int PKCS12_handle_content_info(CBS *content_info, + struct pkcs12_context *ctx) { + CBS content_type, wrapped_contents, contents; + int ret = 0; + uint8_t *storage = nullptr; + + if (!CBS_get_asn1(content_info, &content_type, CBS_ASN1_OBJECT) || + !CBS_get_asn1(content_info, &wrapped_contents, + CBS_ASN1_CONTEXT_SPECIFIC | CBS_ASN1_CONSTRUCTED | 0) || + CBS_len(content_info) != 0) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); + goto err; + } + + if (CBS_mem_equal(&content_type, kPKCS7EncryptedData, + sizeof(kPKCS7EncryptedData))) { + // See https://tools.ietf.org/html/rfc2315#section-13. 
+ // + // PKCS#7 encrypted data inside a PKCS#12 structure is generally an + // encrypted certificate bag and it's generally encrypted with 40-bit + // RC2-CBC. + CBS version_bytes, eci, contents_type, ai, encrypted_contents; + uint8_t *out; + size_t out_len; + + if (!CBS_get_asn1(&wrapped_contents, &contents, CBS_ASN1_SEQUENCE) || + !CBS_get_asn1(&contents, &version_bytes, CBS_ASN1_INTEGER) || + // EncryptedContentInfo, see + // https://tools.ietf.org/html/rfc2315#section-10.1 + !CBS_get_asn1(&contents, &eci, CBS_ASN1_SEQUENCE) || + !CBS_get_asn1(&eci, &contents_type, CBS_ASN1_OBJECT) || + // AlgorithmIdentifier, see + // https://tools.ietf.org/html/rfc5280#section-4.1.1.2 + !CBS_get_asn1(&eci, &ai, CBS_ASN1_SEQUENCE) || + !CBS_get_asn1_implicit_string(&eci, &encrypted_contents, &storage, + CBS_ASN1_CONTEXT_SPECIFIC | 0, + CBS_ASN1_OCTETSTRING)) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); + goto err; + } + + if (!CBS_mem_equal(&contents_type, kPKCS7Data, sizeof(kPKCS7Data))) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); + goto err; + } + + if (!pkcs8_pbe_decrypt(&out, &out_len, &ai, ctx->password, + ctx->password_len, CBS_data(&encrypted_contents), + CBS_len(&encrypted_contents))) { + goto err; + } + + CBS safe_contents; + CBS_init(&safe_contents, out, out_len); + ret = PKCS12_handle_sequence(&safe_contents, ctx, PKCS12_handle_safe_bag); + OPENSSL_free(out); + } else if (CBS_mem_equal(&content_type, kPKCS7Data, sizeof(kPKCS7Data))) { + CBS octet_string_contents; + + if (!CBS_get_asn1(&wrapped_contents, &octet_string_contents, + CBS_ASN1_OCTETSTRING)) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); + goto err; + } + + ret = PKCS12_handle_sequence(&octet_string_contents, ctx, + PKCS12_handle_safe_bag); + } else { + // Unknown element type - ignore it. 
+ ret = 1; + } + +err: + OPENSSL_free(storage); + return ret; +} + +static int pkcs12_check_mac(int *out_mac_ok, const char *password, + size_t password_len, const CBS *salt, + uint32_t iterations, const EVP_MD *md, + const CBS *authsafes, const CBS *expected_mac) { + int ret = 0; + uint8_t hmac_key[EVP_MAX_MD_SIZE]; + if (!pkcs12_key_gen(password, password_len, CBS_data(salt), CBS_len(salt), + PKCS12_MAC_ID, iterations, EVP_MD_size(md), hmac_key, + md)) { + goto err; + } + + uint8_t hmac[EVP_MAX_MD_SIZE]; + unsigned hmac_len; + if (nullptr == HMAC(md, hmac_key, EVP_MD_size(md), CBS_data(authsafes), + CBS_len(authsafes), hmac, &hmac_len)) { + goto err; + } + + *out_mac_ok = CBS_mem_equal(expected_mac, hmac, hmac_len); + if (CRYPTO_fuzzer_mode_enabled()) { + *out_mac_ok = 1; + } + ret = 1; + +err: + OPENSSL_cleanse(hmac_key, sizeof(hmac_key)); + return ret; +} + + +int PKCS12_get_key_and_certs(EVP_PKEY **out_key, STACK_OF(X509) *out_certs, + CBS *ber_in, const char *password) { + uint8_t *storage = nullptr; + CBS in, pfx, mac_data, authsafe, content_type, wrapped_authsafes, authsafes; + uint64_t version; + int ret = 0; + struct pkcs12_context ctx; + const size_t original_out_certs_len = sk_X509_num(out_certs); + + // The input may be in BER format. + if (!CBS_asn1_ber_to_der(ber_in, &in, &storage)) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); + return 0; + } + + *out_key = nullptr; + OPENSSL_memset(&ctx, 0, sizeof(ctx)); + + // See ftp://ftp.rsasecurity.com/pub/pkcs/pkcs-12/pkcs-12v1.pdf, section + // four. 
+ if (!CBS_get_asn1(&in, &pfx, CBS_ASN1_SEQUENCE) || CBS_len(&in) != 0 || + !CBS_get_asn1_uint64(&pfx, &version)) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); + goto err; + } + + if (version < 3) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_VERSION); + goto err; + } + + if (!CBS_get_asn1(&pfx, &authsafe, CBS_ASN1_SEQUENCE)) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); + goto err; + } + + if (CBS_len(&pfx) == 0) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_MISSING_MAC); + goto err; + } + + if (!CBS_get_asn1(&pfx, &mac_data, CBS_ASN1_SEQUENCE)) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); + goto err; + } + + // authsafe is a PKCS#7 ContentInfo. See + // https://tools.ietf.org/html/rfc2315#section-7. + if (!CBS_get_asn1(&authsafe, &content_type, CBS_ASN1_OBJECT) || + !CBS_get_asn1(&authsafe, &wrapped_authsafes, + CBS_ASN1_CONTEXT_SPECIFIC | CBS_ASN1_CONSTRUCTED | 0)) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); + goto err; + } + + // The content type can either be data or signedData. The latter indicates + // that it's signed by a public key, which isn't supported. + if (!CBS_mem_equal(&content_type, kPKCS7Data, sizeof(kPKCS7Data))) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_PKCS12_PUBLIC_KEY_INTEGRITY_NOT_SUPPORTED); + goto err; + } + + if (!CBS_get_asn1(&wrapped_authsafes, &authsafes, CBS_ASN1_OCTETSTRING)) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); + goto err; + } + + ctx.out_key = out_key; + ctx.out_certs = out_certs; + ctx.password = password; + ctx.password_len = password != nullptr ? strlen(password) : 0; + + // Verify the MAC. 
+ { + CBS mac, salt, expected_mac; + if (!CBS_get_asn1(&mac_data, &mac, CBS_ASN1_SEQUENCE)) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); + goto err; + } + + const EVP_MD *md = EVP_parse_digest_algorithm(&mac); + if (md == nullptr) { + goto err; + } + + if (!CBS_get_asn1(&mac, &expected_mac, CBS_ASN1_OCTETSTRING) || + !CBS_get_asn1(&mac_data, &salt, CBS_ASN1_OCTETSTRING)) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); + goto err; + } + + // The iteration count is optional and the default is one. + uint32_t iterations = 1; + if (CBS_len(&mac_data) > 0) { + uint64_t iterations_u64; + if (!CBS_get_asn1_uint64(&mac_data, &iterations_u64) || + !pkcs12_iterations_acceptable(iterations_u64)) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_BAD_PKCS12_DATA); + goto err; + } + iterations = (uint32_t)iterations_u64; + } + + int mac_ok; + if (!pkcs12_check_mac(&mac_ok, ctx.password, ctx.password_len, &salt, + iterations, md, &authsafes, &expected_mac)) { + goto err; + } + if (!mac_ok && ctx.password_len == 0) { + // PKCS#12 encodes passwords as NUL-terminated UCS-2, so the empty + // password is encoded as {0, 0}. Some implementations use the empty byte + // array for "no password". OpenSSL considers a non-NULL password as {0, + // 0} and a NULL password as {}. It then, in high-level PKCS#12 parsing + // code, tries both options. We match this behavior. + ctx.password = ctx.password != nullptr ? nullptr : ""; + if (!pkcs12_check_mac(&mac_ok, ctx.password, ctx.password_len, &salt, + iterations, md, &authsafes, &expected_mac)) { + goto err; + } + } + if (!mac_ok) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_INCORRECT_PASSWORD); + goto err; + } + } + + // authsafes contains a series of PKCS#7 ContentInfos. 
+ if (!PKCS12_handle_sequence(&authsafes, &ctx, PKCS12_handle_content_info)) { + goto err; + } + + ret = 1; + +err: + OPENSSL_free(storage); + if (!ret) { + EVP_PKEY_free(*out_key); + *out_key = nullptr; + while (sk_X509_num(out_certs) > original_out_certs_len) { + X509 *x509 = sk_X509_pop(out_certs); + X509_free(x509); + } + } + + return ret; +} + +void PKCS12_PBE_add() {} + +struct pkcs12_st { + uint8_t *ber_bytes; + size_t ber_len; +}; + +PKCS12 *d2i_PKCS12(PKCS12 **out_p12, const uint8_t **ber_bytes, + size_t ber_len) { + PKCS12 *p12 = New(); + if (!p12) { + return nullptr; + } + + p12->ber_bytes = + reinterpret_cast(OPENSSL_memdup(*ber_bytes, ber_len)); + if (!p12->ber_bytes) { + Delete(p12); + return nullptr; + } + + p12->ber_len = ber_len; + *ber_bytes += ber_len; + + if (out_p12) { + PKCS12_free(*out_p12); + *out_p12 = p12; + } + + return p12; +} + +PKCS12 *d2i_PKCS12_bio(BIO *bio, PKCS12 **out_p12) { + size_t used = 0; + BUF_MEM *buf; + const uint8_t *dummy; + static const size_t kMaxSize = 256 * 1024; + PKCS12 *ret = nullptr; + + buf = BUF_MEM_new(); + if (buf == nullptr) { + return nullptr; + } + if (BUF_MEM_grow(buf, 8192) == 0) { + goto out; + } + + for (;;) { + size_t max_read = buf->length - used; + int n = BIO_read(bio, &buf->data[used], + max_read > INT_MAX ? INT_MAX : (int)max_read); + if (n < 0) { + if (used == 0) { + goto out; + } + // Workaround a bug in node.js. It uses a memory BIO for this in the wrong + // mode. 
+ n = 0; + } + + if (n == 0) { + break; + } + used += n; + + if (used < buf->length) { + continue; + } + + if (buf->length > kMaxSize || BUF_MEM_grow(buf, buf->length * 2) == 0) { + goto out; + } + } + + dummy = (uint8_t *)buf->data; + ret = d2i_PKCS12(out_p12, &dummy, used); + +out: + BUF_MEM_free(buf); + return ret; +} + +PKCS12 *d2i_PKCS12_fp(FILE *fp, PKCS12 **out_p12) { + BIO *bio; + PKCS12 *ret; + + bio = BIO_new_fp(fp, 0 /* don't take ownership */); + if (!bio) { + return nullptr; + } + + ret = d2i_PKCS12_bio(bio, out_p12); + BIO_free(bio); + return ret; +} + +int i2d_PKCS12(const PKCS12 *p12, uint8_t **out) { + if (p12->ber_len > INT_MAX) { + OPENSSL_PUT_ERROR(PKCS8, ERR_R_OVERFLOW); + return -1; + } + + if (out == nullptr) { + return (int)p12->ber_len; + } + + if (*out == nullptr) { + *out = reinterpret_cast( + OPENSSL_memdup(p12->ber_bytes, p12->ber_len)); + if (*out == nullptr) { + return -1; + } + } else { + OPENSSL_memcpy(*out, p12->ber_bytes, p12->ber_len); + *out += p12->ber_len; + } + return (int)p12->ber_len; +} + +int i2d_PKCS12_bio(BIO *bio, const PKCS12 *p12) { + return BIO_write_all(bio, p12->ber_bytes, p12->ber_len); +} + +int i2d_PKCS12_fp(FILE *fp, const PKCS12 *p12) { + BIO *bio = BIO_new_fp(fp, 0 /* don't take ownership */); + if (bio == nullptr) { + return 0; + } + + int ret = i2d_PKCS12_bio(bio, p12); + BIO_free(bio); + return ret; +} + +int PKCS12_parse(const PKCS12 *p12, const char *password, EVP_PKEY **out_pkey, + X509 **out_cert, STACK_OF(X509) **out_ca_certs) { + CBS ber_bytes; + STACK_OF(X509) *ca_certs = nullptr; + char ca_certs_alloced = 0; + + if (out_ca_certs != nullptr && *out_ca_certs != nullptr) { + ca_certs = *out_ca_certs; + } + + if (!ca_certs) { + ca_certs = sk_X509_new_null(); + if (ca_certs == nullptr) { + return 0; + } + ca_certs_alloced = 1; + } + + CBS_init(&ber_bytes, p12->ber_bytes, p12->ber_len); + if (!PKCS12_get_key_and_certs(out_pkey, ca_certs, &ber_bytes, password)) { + if (ca_certs_alloced) { + 
sk_X509_free(ca_certs); + } + return 0; + } + + // OpenSSL selects the last certificate which matches the private key as + // |out_cert|. + *out_cert = nullptr; + size_t num_certs = sk_X509_num(ca_certs); + if (*out_pkey != nullptr && num_certs > 0) { + for (size_t i = num_certs - 1; i < num_certs; i--) { + X509 *cert = sk_X509_value(ca_certs, i); + if (X509_check_private_key(cert, *out_pkey)) { + *out_cert = cert; + sk_X509_delete(ca_certs, i); + break; + } + ERR_clear_error(); + } + } + + if (out_ca_certs) { + *out_ca_certs = ca_certs; + } else { + sk_X509_pop_free(ca_certs, X509_free); + } + + return 1; +} + +int PKCS12_verify_mac(const PKCS12 *p12, const char *password, + int password_len) { + if (password == nullptr) { + if (password_len != 0) { + return 0; + } + } else if (password_len != -1 && + (password[password_len] != 0 || + OPENSSL_memchr(password, 0, password_len) != nullptr)) { + return 0; + } + + EVP_PKEY *pkey = nullptr; + X509 *cert = nullptr; + if (!PKCS12_parse(p12, password, &pkey, &cert, nullptr)) { + ERR_clear_error(); + return 0; + } + + EVP_PKEY_free(pkey); + X509_free(cert); + + return 1; +} + +// add_bag_attributes adds the bagAttributes field of a SafeBag structure, +// containing the specified friendlyName and localKeyId attributes. +static int add_bag_attributes(CBB *bag, const char *name, size_t name_len, + const uint8_t *key_id, size_t key_id_len) { + if (name == nullptr && key_id_len == 0) { + return 1; // Omit the OPTIONAL SET. + } + // See https://tools.ietf.org/html/rfc7292#section-4.2. + CBB attrs, attr, values, value; + if (!CBB_add_asn1(bag, &attrs, CBS_ASN1_SET)) { + return 0; + } + if (name_len != 0) { + // See https://tools.ietf.org/html/rfc2985, section 5.5.1. 
+ if (!CBB_add_asn1(&attrs, &attr, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_element(&attr, CBS_ASN1_OBJECT, kFriendlyName, + sizeof(kFriendlyName)) || + !CBB_add_asn1(&attr, &values, CBS_ASN1_SET) || + !CBB_add_asn1(&values, &value, CBS_ASN1_BMPSTRING)) { + return 0; + } + // Convert the friendly name to a BMPString. + CBS name_cbs; + CBS_init(&name_cbs, (const uint8_t *)name, name_len); + while (CBS_len(&name_cbs) != 0) { + uint32_t c; + if (!CBS_get_utf8(&name_cbs, &c) || !CBB_add_ucs2_be(&value, c)) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_INVALID_CHARACTERS); + return 0; + } + } + } + if (key_id_len != 0) { + // See https://tools.ietf.org/html/rfc2985, section 5.5.2. + if (!CBB_add_asn1(&attrs, &attr, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_element(&attr, CBS_ASN1_OBJECT, kLocalKeyID, + sizeof(kLocalKeyID)) || + !CBB_add_asn1(&attr, &values, CBS_ASN1_SET) || + !CBB_add_asn1_octet_string(&values, key_id, key_id_len)) { + return 0; + } + } + return CBB_flush_asn1_set_of(&attrs) && CBB_flush(bag); +} + +static int add_cert_bag(CBB *cbb, X509 *cert, const char *name, + const uint8_t *key_id, size_t key_id_len) { + CBB bag, bag_contents, cert_bag, wrapped_cert, cert_value; + if ( // See https://tools.ietf.org/html/rfc7292#section-4.2. + !CBB_add_asn1(cbb, &bag, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_element(&bag, CBS_ASN1_OBJECT, kCertBag, + sizeof(kCertBag)) || + !CBB_add_asn1(&bag, &bag_contents, + CBS_ASN1_CONSTRUCTED | CBS_ASN1_CONTEXT_SPECIFIC | 0) || + // See https://tools.ietf.org/html/rfc7292#section-4.2.3. 
+ !CBB_add_asn1(&bag_contents, &cert_bag, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_element(&cert_bag, CBS_ASN1_OBJECT, kX509Certificate, + sizeof(kX509Certificate)) || + !CBB_add_asn1(&cert_bag, &wrapped_cert, + CBS_ASN1_CONSTRUCTED | CBS_ASN1_CONTEXT_SPECIFIC | 0) || + !CBB_add_asn1(&wrapped_cert, &cert_value, CBS_ASN1_OCTETSTRING)) { + return 0; + } + uint8_t *buf; + int len = i2d_X509(cert, nullptr); + + int int_name_len = 0; + const char *cert_name = (const char *)X509_alias_get0(cert, &int_name_len); + size_t name_len = int_name_len; + if (name) { + if (name_len != 0) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_AMBIGUOUS_FRIENDLY_NAME); + return 0; + } + name_len = strlen(name); + } else { + name = cert_name; + } + + if (len < 0 || !CBB_add_space(&cert_value, &buf, (size_t)len) || + i2d_X509(cert, &buf) < 0 || + !add_bag_attributes(&bag, name, name_len, key_id, key_id_len) || + !CBB_flush(cbb)) { + return 0; + } + return 1; +} + +static int add_cert_safe_contents(CBB *cbb, X509 *cert, + const STACK_OF(X509) *chain, const char *name, + const uint8_t *key_id, size_t key_id_len) { + CBB safe_contents; + if (!CBB_add_asn1(cbb, &safe_contents, CBS_ASN1_SEQUENCE) || + (cert != nullptr && + !add_cert_bag(&safe_contents, cert, name, key_id, key_id_len))) { + return 0; + } + + for (size_t i = 0; i < sk_X509_num(chain); i++) { + // Only the leaf certificate gets attributes. + if (!add_cert_bag(&safe_contents, sk_X509_value(chain, i), nullptr, nullptr, + 0)) { + return 0; + } + } + + return CBB_flush(cbb); +} + +// add_encrypted_data encrypts |in| with |pbe_nid| and |pbe_cipher|, writing the +// result to |out|. It returns one on success and zero on error. |pbe_nid| and +// |pbe_cipher| are interpreted as in |PKCS8_encrypt|. 
+static int add_encrypted_data(CBB *out, int pbe_nid, + const EVP_CIPHER *pbe_cipher, + const char *password, size_t password_len, + uint32_t iterations, const uint8_t *in, + size_t in_len) { + uint8_t salt[PKCS5_SALT_LEN]; + if (!RAND_bytes(salt, sizeof(salt))) { + return 0; + } + + ScopedEVP_CIPHER_CTX ctx; + CBB content_info, wrapper, encrypted_data, encrypted_content_info, + encrypted_content; + if ( // Add the ContentInfo wrapping. + !CBB_add_asn1(out, &content_info, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_element(&content_info, CBS_ASN1_OBJECT, kPKCS7EncryptedData, + sizeof(kPKCS7EncryptedData)) || + !CBB_add_asn1(&content_info, &wrapper, + CBS_ASN1_CONSTRUCTED | CBS_ASN1_CONTEXT_SPECIFIC | 0) || + // See https://tools.ietf.org/html/rfc2315#section-13. + !CBB_add_asn1(&wrapper, &encrypted_data, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_uint64(&encrypted_data, 0 /* version */) || + // See https://tools.ietf.org/html/rfc2315#section-10.1. + !CBB_add_asn1(&encrypted_data, &encrypted_content_info, + CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_element(&encrypted_content_info, CBS_ASN1_OBJECT, + kPKCS7Data, sizeof(kPKCS7Data)) || + // Set up encryption and fill in contentEncryptionAlgorithm. + !pkcs12_pbe_encrypt_init(&encrypted_content_info, ctx.get(), pbe_nid, + pbe_cipher, iterations, password, password_len, + salt, sizeof(salt)) || + // Note this tag is primitive. It is an implicitly-tagged OCTET_STRING, so + // it inherits the inner tag's constructed bit. 
+ !CBB_add_asn1(&encrypted_content_info, &encrypted_content, + CBS_ASN1_CONTEXT_SPECIFIC | 0)) { + return 0; + } + + size_t max_out = in_len + EVP_CIPHER_CTX_block_size(ctx.get()); + if (max_out < in_len) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_TOO_LONG); + return 0; + } + + uint8_t *ptr; + size_t n1, n2; + if (!CBB_reserve(&encrypted_content, &ptr, max_out) || + !EVP_CipherUpdate_ex(ctx.get(), ptr, &n1, max_out, in, in_len) || + !EVP_CipherFinal_ex2(ctx.get(), ptr + n1, &n2, max_out - n1) || + !CBB_did_write(&encrypted_content, n1 + n2) || !CBB_flush(out)) { + return 0; + } + + return 1; +} + +PKCS12 *PKCS12_create(const char *password, const char *name, + const EVP_PKEY *pkey, X509 *cert, + const STACK_OF(X509) *chain, int key_nid, int cert_nid, + int iterations, int mac_iterations, int key_type) { + if (key_nid == 0) { + key_nid = NID_aes_256_cbc; + } + if (cert_nid == 0) { + cert_nid = NID_aes_256_cbc; + } + if (iterations == 0) { + iterations = PKCS12_DEFAULT_ITER; + } + if (mac_iterations == 0) { + mac_iterations = PKCS12_DEFAULT_ITER; + } + if ( // In OpenSSL, this specifies a non-standard Microsoft key usage + // extension which we do not currently support. + key_type != 0 || + // In OpenSSL, -1 here means to omit the MAC, which we do not + // currently support. Omitting it is also invalid for a password-based + // PKCS#12 file. + mac_iterations < 0 || + // Don't encode empty objects. + (pkey == nullptr && cert == nullptr && sk_X509_num(chain) == 0)) { + OPENSSL_PUT_ERROR(PKCS8, PKCS8_R_UNSUPPORTED_OPTIONS); + return nullptr; + } + + // PKCS#12 is a very confusing recursive data format, built out of another + // recursive data format. Section 5.1 of RFC 7292 describes the encoding + // algorithm, but there is no clear overview. A quick summary: + // + // PKCS#7 defines a ContentInfo structure, which is a overgeneralized typed + // combinator structure for applying cryptography. We care about two types. 
A + // data ContentInfo contains an OCTET STRING and is a leaf node of the + // combinator tree. An encrypted-data ContentInfo contains encryption + // parameters (key derivation and encryption) and wraps another ContentInfo, + // usually data. + // + // A PKCS#12 file is a PFX structure (section 4), which contains a single data + // ContentInfo and a MAC over it. This root ContentInfo is the + // AuthenticatedSafe and its payload is a SEQUENCE of other ContentInfos, so + // that different parts of the PKCS#12 file can by differently protected. + // + // Each ContentInfo in the AuthenticatedSafe, after undoing all the PKCS#7 + // combinators, has SafeContents payload. A SafeContents is a SEQUENCE of + // SafeBag. SafeBag is PKCS#12's typed structure, with subtypes such as KeyBag + // and CertBag. Confusingly, there is a SafeContents bag type which itself + // recursively contains more SafeBags, but we do not implement this. Bags also + // can have attributes. + // + // The grouping of SafeBags into intermediate ContentInfos does not appear to + // be significant, except that all SafeBags sharing a ContentInfo have the + // same level of protection. Additionally, while keys may be encrypted by + // placing a KeyBag in an encrypted-data ContentInfo, PKCS#12 also defines a + // key-specific encryption container, PKCS8ShroudedKeyBag, which is used + // instead. + + // Note that |password| may be NULL to specify no password, rather than the + // empty string. They are encoded differently in PKCS#12. (One is the empty + // byte array and the other is NUL-terminated UCS-2.) + size_t password_len = password != nullptr ? strlen(password) : 0; + + uint8_t key_id[EVP_MAX_MD_SIZE]; + unsigned key_id_len = 0; + if (cert != nullptr && pkey != nullptr) { + if (!X509_check_private_key(cert, pkey) || + // Matching OpenSSL, use the SHA-1 hash of the certificate as the local + // key ID. Some PKCS#12 consumers require one to connect the private key + // and certificate. 
+ !X509_digest(cert, EVP_sha1(), key_id, &key_id_len)) { + return nullptr; + } + } + + // See https://tools.ietf.org/html/rfc7292#section-4. + PKCS12 *ret = nullptr; + CBB cbb, pfx, auth_safe, auth_safe_wrapper, auth_safe_data, content_infos; + uint8_t mac_key[EVP_MAX_MD_SIZE]; + if (!CBB_init(&cbb, 0) || !CBB_add_asn1(&cbb, &pfx, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_uint64(&pfx, 3) || + // auth_safe is a data ContentInfo. + !CBB_add_asn1(&pfx, &auth_safe, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_element(&auth_safe, CBS_ASN1_OBJECT, kPKCS7Data, + sizeof(kPKCS7Data)) || + !CBB_add_asn1(&auth_safe, &auth_safe_wrapper, + CBS_ASN1_CONSTRUCTED | CBS_ASN1_CONTEXT_SPECIFIC | 0) || + !CBB_add_asn1(&auth_safe_wrapper, &auth_safe_data, + CBS_ASN1_OCTETSTRING) || + // See https://tools.ietf.org/html/rfc7292#section-4.1. |auth_safe|'s + // contains a SEQUENCE of ContentInfos. + !CBB_add_asn1(&auth_safe_data, &content_infos, CBS_ASN1_SEQUENCE)) { + goto err; + } + + // If there are any certificates, place them in CertBags wrapped in a single + // encrypted ContentInfo. + if (cert != nullptr || sk_X509_num(chain) > 0) { + if (cert_nid < 0) { + // Place the certificates in an unencrypted ContentInfo. This could be + // more compactly-encoded by reusing the same ContentInfo as the key, but + // OpenSSL does not do this. We keep them separate for consistency. (Keys, + // even when encrypted, are always placed in unencrypted ContentInfos. + // PKCS#12 defines bag-level encryption for keys.) 
+ CBB content_info, wrapper, data; + if (!CBB_add_asn1(&content_infos, &content_info, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_element(&content_info, CBS_ASN1_OBJECT, kPKCS7Data, + sizeof(kPKCS7Data)) || + !CBB_add_asn1(&content_info, &wrapper, + CBS_ASN1_CONSTRUCTED | CBS_ASN1_CONTEXT_SPECIFIC | 0) || + !CBB_add_asn1(&wrapper, &data, CBS_ASN1_OCTETSTRING) || + !add_cert_safe_contents(&data, cert, chain, name, key_id, + key_id_len) || + !CBB_flush(&content_infos)) { + goto err; + } + } else { + // This function differs from other OpenSSL functions in how PBES1 and + // PBES2 schemes are selected. If the NID matches a cipher, treat this as + // PBES2 instead. Convert to the other convention. + const EVP_CIPHER *cipher = pkcs5_pbe2_nid_to_cipher(cert_nid); + if (cipher != nullptr) { + cert_nid = -1; + } + CBB plaintext_cbb; + int ok = + CBB_init(&plaintext_cbb, 0) && + add_cert_safe_contents(&plaintext_cbb, cert, chain, name, key_id, + key_id_len) && + add_encrypted_data(&content_infos, cert_nid, cipher, password, + password_len, iterations, CBB_data(&plaintext_cbb), + CBB_len(&plaintext_cbb)); + CBB_cleanup(&plaintext_cbb); + if (!ok) { + goto err; + } + } + } + + // If there is a key, place it in a single KeyBag or PKCS8ShroudedKeyBag + // wrapped in an unencrypted ContentInfo. (One could also place it in a KeyBag + // inside an encrypted ContentInfo, but OpenSSL does not do this and some + // PKCS#12 consumers do not support KeyBags.) + if (pkey != nullptr) { + CBB content_info, wrapper, data, safe_contents, bag, bag_contents; + if ( // Add another data ContentInfo. 
+ !CBB_add_asn1(&content_infos, &content_info, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_element(&content_info, CBS_ASN1_OBJECT, kPKCS7Data, + sizeof(kPKCS7Data)) || + !CBB_add_asn1(&content_info, &wrapper, + CBS_ASN1_CONSTRUCTED | CBS_ASN1_CONTEXT_SPECIFIC | 0) || + !CBB_add_asn1(&wrapper, &data, CBS_ASN1_OCTETSTRING) || + !CBB_add_asn1(&data, &safe_contents, CBS_ASN1_SEQUENCE) || + // Add a SafeBag containing a PKCS8ShroudedKeyBag. + !CBB_add_asn1(&safe_contents, &bag, CBS_ASN1_SEQUENCE)) { + goto err; + } + if (key_nid < 0) { + if (!CBB_add_asn1_element(&bag, CBS_ASN1_OBJECT, kKeyBag, + sizeof(kKeyBag)) || + !CBB_add_asn1(&bag, &bag_contents, + CBS_ASN1_CONSTRUCTED | CBS_ASN1_CONTEXT_SPECIFIC | 0) || + !EVP_marshal_private_key(&bag_contents, pkey)) { + goto err; + } + } else { + // This function differs from other OpenSSL functions in how PBES1 and + // PBES2 schemes are selected. If the NID matches a cipher, treat this as + // PBES2 instead. Convert to the other convention. + const EVP_CIPHER *cipher = pkcs5_pbe2_nid_to_cipher(key_nid); + if (cipher != nullptr) { + key_nid = -1; + } + if (!CBB_add_asn1_element(&bag, CBS_ASN1_OBJECT, kPKCS8ShroudedKeyBag, + sizeof(kPKCS8ShroudedKeyBag)) || + !CBB_add_asn1(&bag, &bag_contents, + CBS_ASN1_CONSTRUCTED | CBS_ASN1_CONTEXT_SPECIFIC | 0) || + !PKCS8_marshal_encrypted_private_key( + &bag_contents, key_nid, cipher, password, password_len, + nullptr /* generate a random salt */, + 0 /* use default salt length */, iterations, pkey)) { + goto err; + } + } + size_t name_len = 0; + if (name) { + name_len = strlen(name); + } + if (!add_bag_attributes(&bag, name, name_len, key_id, key_id_len) || + !CBB_flush(&content_infos)) { + goto err; + } + } + + { + // Compute the MAC. Match OpenSSL in using SHA-1 as the hash function. The + // MAC covers |auth_safe_data|. 
+ const EVP_MD *mac_md = EVP_sha1(); + uint8_t mac_salt[PKCS5_SALT_LEN]; + uint8_t mac[EVP_MAX_MD_SIZE]; + unsigned mac_len; + if (!CBB_flush(&auth_safe_data) || + !RAND_bytes(mac_salt, sizeof(mac_salt)) || + !pkcs12_key_gen(password, password_len, mac_salt, sizeof(mac_salt), + PKCS12_MAC_ID, mac_iterations, EVP_MD_size(mac_md), + mac_key, mac_md) || + !HMAC(mac_md, mac_key, EVP_MD_size(mac_md), CBB_data(&auth_safe_data), + CBB_len(&auth_safe_data), mac, &mac_len)) { + goto err; + } + + CBB mac_data, digest_info; + if (!CBB_add_asn1(&pfx, &mac_data, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1(&mac_data, &digest_info, CBS_ASN1_SEQUENCE) || + // OpenSSL and NSS always include a NULL parameter with the digest + // algorithm. Windows does not. RFC 7292 imports DigestInfo from PKCS + // #7. PKCS #7 does not actually use DigestInfo. It just describes + // RSASSA-PKCS1-v1_5 signing as encoding a DigestInfo and then + // "encrypting" it with the private key. In that context, NULL should be + // included. Confusingly, there is also a digestAlgorithm field in + // SignerInfo. There, RFC 5754 says to omit the NULL. But that field + // does not use DigestInfo per se. + // + // We match OpenSSL, NSS, and RSASSA-PKCS1-v1_5 in including the NULL. + !EVP_marshal_digest_algorithm(&digest_info, mac_md) || + !CBB_add_asn1_octet_string(&digest_info, mac, mac_len) || + !CBB_add_asn1_octet_string(&mac_data, mac_salt, sizeof(mac_salt)) || + // The iteration count has a DEFAULT of 1, but RFC 7292 says "The + // default is for historical reasons and its use is deprecated." Thus we + // explicitly encode the iteration count, though it is not valid DER. 
+ !CBB_add_asn1_uint64(&mac_data, mac_iterations)) { + goto err; + } + + ret = New(); + if (ret == nullptr || !CBB_finish(&cbb, &ret->ber_bytes, &ret->ber_len)) { + Delete(ret); + ret = nullptr; + goto err; + } + } + +err: + OPENSSL_cleanse(mac_key, sizeof(mac_key)); + CBB_cleanup(&cbb); + return ret; +} + +void PKCS12_free(PKCS12 *p12) { + if (p12 == nullptr) { + return; + } + OPENSSL_free(p12->ber_bytes); + Delete(p12); +} diff --git a/third_party/boringssl/src/crypto/poly1305/internal.h b/third_party/boringssl/src/crypto/poly1305/internal.h index 251b1f4f..39094906 100644 --- a/third_party/boringssl/src/crypto/poly1305/internal.h +++ b/third_party/boringssl/src/crypto/poly1305/internal.h @@ -1,26 +1,25 @@ -/* Copyright (c) 2016, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#ifndef OPENSSL_HEADER_POLY1305_INTERNAL_H -#define OPENSSL_HEADER_POLY1305_INTERNAL_H +// Copyright 2016 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_CRYPTO_POLY1305_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_POLY1305_INTERNAL_H #include #include -#if defined(__cplusplus) -extern "C" { -#endif + +BSSL_NAMESPACE_BEGIN #if defined(OPENSSL_ARM) && !defined(OPENSSL_NO_ASM) && !defined(OPENSSL_APPLE) #define OPENSSL_POLY1305_NEON @@ -33,9 +32,6 @@ void CRYPTO_poly1305_update_neon(poly1305_state *state, const uint8_t *in, void CRYPTO_poly1305_finish_neon(poly1305_state *state, uint8_t mac[16]); #endif +BSSL_NAMESPACE_END -#if defined(__cplusplus) -} // extern C -#endif - -#endif // OPENSSL_HEADER_POLY1305_INTERNAL_H +#endif // OPENSSL_HEADER_CRYPTO_POLY1305_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/poly1305/poly1305.c b/third_party/boringssl/src/crypto/poly1305/poly1305.c deleted file mode 100644 index 3017e327..00000000 --- a/third_party/boringssl/src/crypto/poly1305/poly1305.c +++ /dev/null @@ -1,325 +0,0 @@ -/* Copyright (c) 2014, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -// This implementation of poly1305 is by Andrew Moon -// (https://github.com/floodyberry/poly1305-donna) and released as public -// domain. - -#include - -#include -#include - -#include "internal.h" -#include "../internal.h" - - -#if !defined(BORINGSSL_HAS_UINT128) || !defined(OPENSSL_X86_64) - -// We can assume little-endian. -static uint32_t U8TO32_LE(const uint8_t *m) { - uint32_t r; - OPENSSL_memcpy(&r, m, sizeof(r)); - return r; -} - -static void U32TO8_LE(uint8_t *m, uint32_t v) { - OPENSSL_memcpy(m, &v, sizeof(v)); -} - -static uint64_t mul32x32_64(uint32_t a, uint32_t b) { return (uint64_t)a * b; } - -struct poly1305_state_st { - uint32_t r0, r1, r2, r3, r4; - uint32_t s1, s2, s3, s4; - uint32_t h0, h1, h2, h3, h4; - uint8_t buf[16]; - size_t buf_used; - uint8_t key[16]; -}; - -static_assert( - sizeof(struct poly1305_state_st) + 63 <= sizeof(poly1305_state), - "poly1305_state isn't large enough to hold aligned poly1305_state_st"); - -static inline struct poly1305_state_st *poly1305_aligned_state( - poly1305_state *state) { - return align_pointer(state, 64); -} - -// poly1305_blocks updates |state| given some amount of input data. This -// function may only be called with a |len| that is not a multiple of 16 at the -// end of the data. Otherwise the input must be buffered into 16 byte blocks. 
-static void poly1305_update(struct poly1305_state_st *state, const uint8_t *in, - size_t len) { - uint32_t t0, t1, t2, t3; - uint64_t t[5]; - uint32_t b; - uint64_t c; - size_t j; - uint8_t mp[16]; - - if (len < 16) { - goto poly1305_donna_atmost15bytes; - } - -poly1305_donna_16bytes: - t0 = U8TO32_LE(in); - t1 = U8TO32_LE(in + 4); - t2 = U8TO32_LE(in + 8); - t3 = U8TO32_LE(in + 12); - - in += 16; - len -= 16; - - state->h0 += t0 & 0x3ffffff; - state->h1 += ((((uint64_t)t1 << 32) | t0) >> 26) & 0x3ffffff; - state->h2 += ((((uint64_t)t2 << 32) | t1) >> 20) & 0x3ffffff; - state->h3 += ((((uint64_t)t3 << 32) | t2) >> 14) & 0x3ffffff; - state->h4 += (t3 >> 8) | (1 << 24); - -poly1305_donna_mul: - t[0] = mul32x32_64(state->h0, state->r0) + mul32x32_64(state->h1, state->s4) + - mul32x32_64(state->h2, state->s3) + mul32x32_64(state->h3, state->s2) + - mul32x32_64(state->h4, state->s1); - t[1] = mul32x32_64(state->h0, state->r1) + mul32x32_64(state->h1, state->r0) + - mul32x32_64(state->h2, state->s4) + mul32x32_64(state->h3, state->s3) + - mul32x32_64(state->h4, state->s2); - t[2] = mul32x32_64(state->h0, state->r2) + mul32x32_64(state->h1, state->r1) + - mul32x32_64(state->h2, state->r0) + mul32x32_64(state->h3, state->s4) + - mul32x32_64(state->h4, state->s3); - t[3] = mul32x32_64(state->h0, state->r3) + mul32x32_64(state->h1, state->r2) + - mul32x32_64(state->h2, state->r1) + mul32x32_64(state->h3, state->r0) + - mul32x32_64(state->h4, state->s4); - t[4] = mul32x32_64(state->h0, state->r4) + mul32x32_64(state->h1, state->r3) + - mul32x32_64(state->h2, state->r2) + mul32x32_64(state->h3, state->r1) + - mul32x32_64(state->h4, state->r0); - - state->h0 = (uint32_t)t[0] & 0x3ffffff; - c = (t[0] >> 26); - t[1] += c; - state->h1 = (uint32_t)t[1] & 0x3ffffff; - b = (uint32_t)(t[1] >> 26); - t[2] += b; - state->h2 = (uint32_t)t[2] & 0x3ffffff; - b = (uint32_t)(t[2] >> 26); - t[3] += b; - state->h3 = (uint32_t)t[3] & 0x3ffffff; - b = (uint32_t)(t[3] >> 26); - t[4] += b; - 
state->h4 = (uint32_t)t[4] & 0x3ffffff; - b = (uint32_t)(t[4] >> 26); - state->h0 += b * 5; - - if (len >= 16) { - goto poly1305_donna_16bytes; - } - -// final bytes -poly1305_donna_atmost15bytes: - if (!len) { - return; - } - - for (j = 0; j < len; j++) { - mp[j] = in[j]; - } - mp[j++] = 1; - for (; j < 16; j++) { - mp[j] = 0; - } - len = 0; - - t0 = U8TO32_LE(mp + 0); - t1 = U8TO32_LE(mp + 4); - t2 = U8TO32_LE(mp + 8); - t3 = U8TO32_LE(mp + 12); - - state->h0 += t0 & 0x3ffffff; - state->h1 += ((((uint64_t)t1 << 32) | t0) >> 26) & 0x3ffffff; - state->h2 += ((((uint64_t)t2 << 32) | t1) >> 20) & 0x3ffffff; - state->h3 += ((((uint64_t)t3 << 32) | t2) >> 14) & 0x3ffffff; - state->h4 += (t3 >> 8); - - goto poly1305_donna_mul; -} - -void CRYPTO_poly1305_init(poly1305_state *statep, const uint8_t key[32]) { - struct poly1305_state_st *state = poly1305_aligned_state(statep); - uint32_t t0, t1, t2, t3; - -#if defined(OPENSSL_POLY1305_NEON) - if (CRYPTO_is_NEON_capable()) { - CRYPTO_poly1305_init_neon(statep, key); - return; - } -#endif - - t0 = U8TO32_LE(key + 0); - t1 = U8TO32_LE(key + 4); - t2 = U8TO32_LE(key + 8); - t3 = U8TO32_LE(key + 12); - - // precompute multipliers - state->r0 = t0 & 0x3ffffff; - t0 >>= 26; - t0 |= t1 << 6; - state->r1 = t0 & 0x3ffff03; - t1 >>= 20; - t1 |= t2 << 12; - state->r2 = t1 & 0x3ffc0ff; - t2 >>= 14; - t2 |= t3 << 18; - state->r3 = t2 & 0x3f03fff; - t3 >>= 8; - state->r4 = t3 & 0x00fffff; - - state->s1 = state->r1 * 5; - state->s2 = state->r2 * 5; - state->s3 = state->r3 * 5; - state->s4 = state->r4 * 5; - - // init state - state->h0 = 0; - state->h1 = 0; - state->h2 = 0; - state->h3 = 0; - state->h4 = 0; - - state->buf_used = 0; - OPENSSL_memcpy(state->key, key + 16, sizeof(state->key)); -} - -void CRYPTO_poly1305_update(poly1305_state *statep, const uint8_t *in, - size_t in_len) { - struct poly1305_state_st *state = poly1305_aligned_state(statep); - - // Work around a C language bug. See https://crbug.com/1019588. 
- if (in_len == 0) { - return; - } - -#if defined(OPENSSL_POLY1305_NEON) - if (CRYPTO_is_NEON_capable()) { - CRYPTO_poly1305_update_neon(statep, in, in_len); - return; - } -#endif - - if (state->buf_used) { - size_t todo = 16 - state->buf_used; - if (todo > in_len) { - todo = in_len; - } - for (size_t i = 0; i < todo; i++) { - state->buf[state->buf_used + i] = in[i]; - } - state->buf_used += todo; - in_len -= todo; - in += todo; - - if (state->buf_used == 16) { - poly1305_update(state, state->buf, 16); - state->buf_used = 0; - } - } - - if (in_len >= 16) { - size_t todo = in_len & ~0xf; - poly1305_update(state, in, todo); - in += todo; - in_len &= 0xf; - } - - if (in_len) { - for (size_t i = 0; i < in_len; i++) { - state->buf[i] = in[i]; - } - state->buf_used = in_len; - } -} - -void CRYPTO_poly1305_finish(poly1305_state *statep, uint8_t mac[16]) { - struct poly1305_state_st *state = poly1305_aligned_state(statep); - uint64_t f0, f1, f2, f3; - uint32_t g0, g1, g2, g3, g4; - uint32_t b, nb; - -#if defined(OPENSSL_POLY1305_NEON) - if (CRYPTO_is_NEON_capable()) { - CRYPTO_poly1305_finish_neon(statep, mac); - return; - } -#endif - - if (state->buf_used) { - poly1305_update(state, state->buf, state->buf_used); - } - - b = state->h0 >> 26; - state->h0 = state->h0 & 0x3ffffff; - state->h1 += b; - b = state->h1 >> 26; - state->h1 = state->h1 & 0x3ffffff; - state->h2 += b; - b = state->h2 >> 26; - state->h2 = state->h2 & 0x3ffffff; - state->h3 += b; - b = state->h3 >> 26; - state->h3 = state->h3 & 0x3ffffff; - state->h4 += b; - b = state->h4 >> 26; - state->h4 = state->h4 & 0x3ffffff; - state->h0 += b * 5; - - g0 = state->h0 + 5; - b = g0 >> 26; - g0 &= 0x3ffffff; - g1 = state->h1 + b; - b = g1 >> 26; - g1 &= 0x3ffffff; - g2 = state->h2 + b; - b = g2 >> 26; - g2 &= 0x3ffffff; - g3 = state->h3 + b; - b = g3 >> 26; - g3 &= 0x3ffffff; - g4 = state->h4 + b - (1 << 26); - - b = (g4 >> 31) - 1; - nb = ~b; - state->h0 = (state->h0 & nb) | (g0 & b); - state->h1 = (state->h1 & nb) | 
(g1 & b); - state->h2 = (state->h2 & nb) | (g2 & b); - state->h3 = (state->h3 & nb) | (g3 & b); - state->h4 = (state->h4 & nb) | (g4 & b); - - f0 = ((state->h0) | (state->h1 << 26)) + (uint64_t)U8TO32_LE(&state->key[0]); - f1 = ((state->h1 >> 6) | (state->h2 << 20)) + - (uint64_t)U8TO32_LE(&state->key[4]); - f2 = ((state->h2 >> 12) | (state->h3 << 14)) + - (uint64_t)U8TO32_LE(&state->key[8]); - f3 = ((state->h3 >> 18) | (state->h4 << 8)) + - (uint64_t)U8TO32_LE(&state->key[12]); - - U32TO8_LE(&mac[0], f0); - f1 += (f0 >> 32); - U32TO8_LE(&mac[4], f1); - f2 += (f1 >> 32); - U32TO8_LE(&mac[8], f2); - f3 += (f2 >> 32); - U32TO8_LE(&mac[12], f3); -} - -#endif // !BORINGSSL_HAS_UINT128 || !OPENSSL_X86_64 diff --git a/third_party/boringssl/src/crypto/poly1305/poly1305.cc b/third_party/boringssl/src/crypto/poly1305/poly1305.cc new file mode 100644 index 00000000..2d71558f --- /dev/null +++ b/third_party/boringssl/src/crypto/poly1305/poly1305.cc @@ -0,0 +1,315 @@ +// Copyright 2014 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This implementation of poly1305 is by Andrew Moon +// (https://github.com/floodyberry/poly1305-donna) and released as public +// domain. 
+ +#include + +#include +#include + +#include "../internal.h" +#include "internal.h" + + +using namespace bssl; + +#if !defined(BORINGSSL_HAS_UINT128) || !defined(OPENSSL_X86_64) + +static uint64_t mul32x32_64(uint32_t a, uint32_t b) { return (uint64_t)a * b; } + +struct poly1305_state_st { + uint32_t r0, r1, r2, r3, r4; + uint32_t s1, s2, s3, s4; + uint32_t h0, h1, h2, h3, h4; + uint8_t buf[16]; + size_t buf_used; + uint8_t key[16]; +}; + +static_assert( + sizeof(struct poly1305_state_st) + 63 <= sizeof(poly1305_state), + "poly1305_state isn't large enough to hold aligned poly1305_state_st"); + +static struct poly1305_state_st *poly1305_aligned_state(poly1305_state *state) { + return reinterpret_cast(align_pointer(state, 64)); +} + +// poly1305_blocks updates |state| given some amount of input data. This +// function may only be called with a |len| that is not a multiple of 16 at the +// end of the data. Otherwise the input must be buffered into 16 byte blocks. +static void poly1305_update(struct poly1305_state_st *state, const uint8_t *in, + size_t len) { + uint32_t t0, t1, t2, t3; + uint64_t t[5]; + uint32_t b; + uint64_t c; + size_t j; + uint8_t mp[16]; + + if (len < 16) { + goto poly1305_donna_atmost15bytes; + } + +poly1305_donna_16bytes: + t0 = CRYPTO_load_u32_le(in); + t1 = CRYPTO_load_u32_le(in + 4); + t2 = CRYPTO_load_u32_le(in + 8); + t3 = CRYPTO_load_u32_le(in + 12); + + in += 16; + len -= 16; + + state->h0 += t0 & 0x3ffffff; + state->h1 += ((((uint64_t)t1 << 32) | t0) >> 26) & 0x3ffffff; + state->h2 += ((((uint64_t)t2 << 32) | t1) >> 20) & 0x3ffffff; + state->h3 += ((((uint64_t)t3 << 32) | t2) >> 14) & 0x3ffffff; + state->h4 += (t3 >> 8) | (1 << 24); + +poly1305_donna_mul: + t[0] = mul32x32_64(state->h0, state->r0) + mul32x32_64(state->h1, state->s4) + + mul32x32_64(state->h2, state->s3) + mul32x32_64(state->h3, state->s2) + + mul32x32_64(state->h4, state->s1); + t[1] = mul32x32_64(state->h0, state->r1) + mul32x32_64(state->h1, state->r0) + + 
mul32x32_64(state->h2, state->s4) + mul32x32_64(state->h3, state->s3) + + mul32x32_64(state->h4, state->s2); + t[2] = mul32x32_64(state->h0, state->r2) + mul32x32_64(state->h1, state->r1) + + mul32x32_64(state->h2, state->r0) + mul32x32_64(state->h3, state->s4) + + mul32x32_64(state->h4, state->s3); + t[3] = mul32x32_64(state->h0, state->r3) + mul32x32_64(state->h1, state->r2) + + mul32x32_64(state->h2, state->r1) + mul32x32_64(state->h3, state->r0) + + mul32x32_64(state->h4, state->s4); + t[4] = mul32x32_64(state->h0, state->r4) + mul32x32_64(state->h1, state->r3) + + mul32x32_64(state->h2, state->r2) + mul32x32_64(state->h3, state->r1) + + mul32x32_64(state->h4, state->r0); + + state->h0 = (uint32_t)t[0] & 0x3ffffff; + c = (t[0] >> 26); + t[1] += c; + state->h1 = (uint32_t)t[1] & 0x3ffffff; + b = (uint32_t)(t[1] >> 26); + t[2] += b; + state->h2 = (uint32_t)t[2] & 0x3ffffff; + b = (uint32_t)(t[2] >> 26); + t[3] += b; + state->h3 = (uint32_t)t[3] & 0x3ffffff; + b = (uint32_t)(t[3] >> 26); + t[4] += b; + state->h4 = (uint32_t)t[4] & 0x3ffffff; + b = (uint32_t)(t[4] >> 26); + state->h0 += b * 5; + + if (len >= 16) { + goto poly1305_donna_16bytes; + } + +// final bytes +poly1305_donna_atmost15bytes: + if (!len) { + return; + } + + for (j = 0; j < len; j++) { + mp[j] = in[j]; + } + mp[j++] = 1; + for (; j < 16; j++) { + mp[j] = 0; + } + len = 0; + + t0 = CRYPTO_load_u32_le(mp + 0); + t1 = CRYPTO_load_u32_le(mp + 4); + t2 = CRYPTO_load_u32_le(mp + 8); + t3 = CRYPTO_load_u32_le(mp + 12); + + state->h0 += t0 & 0x3ffffff; + state->h1 += ((((uint64_t)t1 << 32) | t0) >> 26) & 0x3ffffff; + state->h2 += ((((uint64_t)t2 << 32) | t1) >> 20) & 0x3ffffff; + state->h3 += ((((uint64_t)t3 << 32) | t2) >> 14) & 0x3ffffff; + state->h4 += (t3 >> 8); + + goto poly1305_donna_mul; +} + +void CRYPTO_poly1305_init(poly1305_state *statep, const uint8_t key[32]) { + struct poly1305_state_st *state = poly1305_aligned_state(statep); + uint32_t t0, t1, t2, t3; + +#if 
defined(OPENSSL_POLY1305_NEON) + if (CRYPTO_is_NEON_capable()) { + CRYPTO_poly1305_init_neon(statep, key); + return; + } +#endif + + t0 = CRYPTO_load_u32_le(key + 0); + t1 = CRYPTO_load_u32_le(key + 4); + t2 = CRYPTO_load_u32_le(key + 8); + t3 = CRYPTO_load_u32_le(key + 12); + + // precompute multipliers + state->r0 = t0 & 0x3ffffff; + t0 >>= 26; + t0 |= t1 << 6; + state->r1 = t0 & 0x3ffff03; + t1 >>= 20; + t1 |= t2 << 12; + state->r2 = t1 & 0x3ffc0ff; + t2 >>= 14; + t2 |= t3 << 18; + state->r3 = t2 & 0x3f03fff; + t3 >>= 8; + state->r4 = t3 & 0x00fffff; + + state->s1 = state->r1 * 5; + state->s2 = state->r2 * 5; + state->s3 = state->r3 * 5; + state->s4 = state->r4 * 5; + + // init state + state->h0 = 0; + state->h1 = 0; + state->h2 = 0; + state->h3 = 0; + state->h4 = 0; + + state->buf_used = 0; + OPENSSL_memcpy(state->key, key + 16, sizeof(state->key)); +} + +void CRYPTO_poly1305_update(poly1305_state *statep, const uint8_t *in, + size_t in_len) { + struct poly1305_state_st *state = poly1305_aligned_state(statep); + + // Work around a C language bug. See https://crbug.com/1019588. 
+ if (in_len == 0) { + return; + } + +#if defined(OPENSSL_POLY1305_NEON) + if (CRYPTO_is_NEON_capable()) { + CRYPTO_poly1305_update_neon(statep, in, in_len); + return; + } +#endif + + if (state->buf_used) { + size_t todo = 16 - state->buf_used; + if (todo > in_len) { + todo = in_len; + } + for (size_t i = 0; i < todo; i++) { + state->buf[state->buf_used + i] = in[i]; + } + state->buf_used += todo; + in_len -= todo; + in += todo; + + if (state->buf_used == 16) { + poly1305_update(state, state->buf, 16); + state->buf_used = 0; + } + } + + if (in_len >= 16) { + size_t todo = in_len & ~0xf; + poly1305_update(state, in, todo); + in += todo; + in_len &= 0xf; + } + + if (in_len) { + for (size_t i = 0; i < in_len; i++) { + state->buf[i] = in[i]; + } + state->buf_used = in_len; + } +} + +void CRYPTO_poly1305_finish(poly1305_state *statep, uint8_t mac[16]) { + struct poly1305_state_st *state = poly1305_aligned_state(statep); + uint32_t g0, g1, g2, g3, g4; + uint32_t b, nb; + +#if defined(OPENSSL_POLY1305_NEON) + if (CRYPTO_is_NEON_capable()) { + CRYPTO_poly1305_finish_neon(statep, mac); + return; + } +#endif + + if (state->buf_used) { + poly1305_update(state, state->buf, state->buf_used); + } + + b = state->h0 >> 26; + state->h0 = state->h0 & 0x3ffffff; + state->h1 += b; + b = state->h1 >> 26; + state->h1 = state->h1 & 0x3ffffff; + state->h2 += b; + b = state->h2 >> 26; + state->h2 = state->h2 & 0x3ffffff; + state->h3 += b; + b = state->h3 >> 26; + state->h3 = state->h3 & 0x3ffffff; + state->h4 += b; + b = state->h4 >> 26; + state->h4 = state->h4 & 0x3ffffff; + state->h0 += b * 5; + + g0 = state->h0 + 5; + b = g0 >> 26; + g0 &= 0x3ffffff; + g1 = state->h1 + b; + b = g1 >> 26; + g1 &= 0x3ffffff; + g2 = state->h2 + b; + b = g2 >> 26; + g2 &= 0x3ffffff; + g3 = state->h3 + b; + b = g3 >> 26; + g3 &= 0x3ffffff; + g4 = state->h4 + b - (1 << 26); + + b = (g4 >> 31) - 1; + nb = ~b; + state->h0 = (state->h0 & nb) | (g0 & b); + state->h1 = (state->h1 & nb) | (g1 & b); + state->h2 = 
(state->h2 & nb) | (g2 & b); + state->h3 = (state->h3 & nb) | (g3 & b); + state->h4 = (state->h4 & nb) | (g4 & b); + + uint64_t f0 = ((state->h0) | (state->h1 << 26)) + + (uint64_t)CRYPTO_load_u32_le(&state->key[0]); + uint64_t f1 = ((state->h1 >> 6) | (state->h2 << 20)) + + (uint64_t)CRYPTO_load_u32_le(&state->key[4]); + uint64_t f2 = ((state->h2 >> 12) | (state->h3 << 14)) + + (uint64_t)CRYPTO_load_u32_le(&state->key[8]); + uint64_t f3 = ((state->h3 >> 18) | (state->h4 << 8)) + + (uint64_t)CRYPTO_load_u32_le(&state->key[12]); + + CRYPTO_store_u32_le(&mac[0], (uint32_t)f0); + f1 += (f0 >> 32); + CRYPTO_store_u32_le(&mac[4], (uint32_t)f1); + f2 += (f1 >> 32); + CRYPTO_store_u32_le(&mac[8], (uint32_t)f2); + f3 += (f2 >> 32); + CRYPTO_store_u32_le(&mac[12], (uint32_t)f3); +} + +#endif // !BORINGSSL_HAS_UINT128 || !OPENSSL_X86_64 diff --git a/third_party/boringssl/src/crypto/poly1305/poly1305_arm.c b/third_party/boringssl/src/crypto/poly1305/poly1305_arm.c deleted file mode 100644 index d01e0b73..00000000 --- a/third_party/boringssl/src/crypto/poly1305/poly1305_arm.c +++ /dev/null @@ -1,308 +0,0 @@ -/* Copyright (c) 2014, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -// This implementation was taken from the public domain, neon2 version in -// SUPERCOP by D. J. 
Bernstein and Peter Schwabe. - -#include - -#include -#include - -#include "../internal.h" -#include "internal.h" - - -#if defined(OPENSSL_POLY1305_NEON) - -typedef struct { - uint32_t v[12]; // for alignment; only using 10 -} fe1305x2; - -#define addmulmod openssl_poly1305_neon2_addmulmod -#define blocks openssl_poly1305_neon2_blocks - -extern void addmulmod(fe1305x2 *r, const fe1305x2 *x, const fe1305x2 *y, - const fe1305x2 *c); - -extern int blocks(fe1305x2 *h, const fe1305x2 *precomp, const uint8_t *in, - size_t inlen); - -static void freeze(fe1305x2 *r) { - int i; - - uint32_t x0 = r->v[0]; - uint32_t x1 = r->v[2]; - uint32_t x2 = r->v[4]; - uint32_t x3 = r->v[6]; - uint32_t x4 = r->v[8]; - uint32_t y0; - uint32_t y1; - uint32_t y2; - uint32_t y3; - uint32_t y4; - uint32_t swap; - - for (i = 0; i < 3; ++i) { - x1 += x0 >> 26; - x0 &= 0x3ffffff; - x2 += x1 >> 26; - x1 &= 0x3ffffff; - x3 += x2 >> 26; - x2 &= 0x3ffffff; - x4 += x3 >> 26; - x3 &= 0x3ffffff; - x0 += 5 * (x4 >> 26); - x4 &= 0x3ffffff; - } - - y0 = x0 + 5; - y1 = x1 + (y0 >> 26); - y0 &= 0x3ffffff; - y2 = x2 + (y1 >> 26); - y1 &= 0x3ffffff; - y3 = x3 + (y2 >> 26); - y2 &= 0x3ffffff; - y4 = x4 + (y3 >> 26); - y3 &= 0x3ffffff; - swap = -(y4 >> 26); - y4 &= 0x3ffffff; - - y0 ^= x0; - y1 ^= x1; - y2 ^= x2; - y3 ^= x3; - y4 ^= x4; - - y0 &= swap; - y1 &= swap; - y2 &= swap; - y3 &= swap; - y4 &= swap; - - y0 ^= x0; - y1 ^= x1; - y2 ^= x2; - y3 ^= x3; - y4 ^= x4; - - r->v[0] = y0; - r->v[2] = y1; - r->v[4] = y2; - r->v[6] = y3; - r->v[8] = y4; -} - -static void store32(uint8_t out[4], uint32_t v) { OPENSSL_memcpy(out, &v, 4); } - -// load32 exists to avoid breaking strict aliasing rules in -// fe1305x2_frombytearray. 
-static uint32_t load32(const uint8_t t[4]) { - uint32_t tmp; - OPENSSL_memcpy(&tmp, t, sizeof(tmp)); - return tmp; -} - -static void fe1305x2_tobytearray(uint8_t r[16], fe1305x2 *x) { - uint32_t x0 = x->v[0]; - uint32_t x1 = x->v[2]; - uint32_t x2 = x->v[4]; - uint32_t x3 = x->v[6]; - uint32_t x4 = x->v[8]; - - x1 += x0 >> 26; - x0 &= 0x3ffffff; - x2 += x1 >> 26; - x1 &= 0x3ffffff; - x3 += x2 >> 26; - x2 &= 0x3ffffff; - x4 += x3 >> 26; - x3 &= 0x3ffffff; - - store32(r, x0 + (x1 << 26)); - store32(r + 4, (x1 >> 6) + (x2 << 20)); - store32(r + 8, (x2 >> 12) + (x3 << 14)); - store32(r + 12, (x3 >> 18) + (x4 << 8)); -} - -static void fe1305x2_frombytearray(fe1305x2 *r, const uint8_t *x, size_t xlen) { - size_t i; - uint8_t t[17]; - - for (i = 0; (i < 16) && (i < xlen); i++) { - t[i] = x[i]; - } - xlen -= i; - x += i; - t[i++] = 1; - for (; i < 17; i++) { - t[i] = 0; - } - - r->v[0] = 0x3ffffff & load32(t); - r->v[2] = 0x3ffffff & (load32(t + 3) >> 2); - r->v[4] = 0x3ffffff & (load32(t + 6) >> 4); - r->v[6] = 0x3ffffff & (load32(t + 9) >> 6); - r->v[8] = load32(t + 13); - - if (xlen) { - for (i = 0; (i < 16) && (i < xlen); i++) { - t[i] = x[i]; - } - t[i++] = 1; - for (; i < 17; i++) { - t[i] = 0; - } - - r->v[1] = 0x3ffffff & load32(t); - r->v[3] = 0x3ffffff & (load32(t + 3) >> 2); - r->v[5] = 0x3ffffff & (load32(t + 6) >> 4); - r->v[7] = 0x3ffffff & (load32(t + 9) >> 6); - r->v[9] = load32(t + 13); - } else { - r->v[1] = r->v[3] = r->v[5] = r->v[7] = r->v[9] = 0; - } -} - -static const alignas(16) fe1305x2 zero; - -struct poly1305_state_st { - uint8_t data[sizeof(fe1305x2[5]) + 128]; - uint8_t buf[32]; - size_t buf_used; - uint8_t key[16]; -}; - -static_assert( - sizeof(struct poly1305_state_st) + 63 <= sizeof(poly1305_state), - "poly1305_state isn't large enough to hold aligned poly1305_state_st."); - -void CRYPTO_poly1305_init_neon(poly1305_state *state, const uint8_t key[32]) { - struct poly1305_state_st *st = (struct poly1305_state_st *)(state); - fe1305x2 *const 
r = (fe1305x2 *)(st->data + (15 & (-(int)st->data))); - fe1305x2 *const h = r + 1; - fe1305x2 *const c = h + 1; - fe1305x2 *const precomp = c + 1; - - r->v[1] = r->v[0] = 0x3ffffff & load32(key); - r->v[3] = r->v[2] = 0x3ffff03 & (load32(key + 3) >> 2); - r->v[5] = r->v[4] = 0x3ffc0ff & (load32(key + 6) >> 4); - r->v[7] = r->v[6] = 0x3f03fff & (load32(key + 9) >> 6); - r->v[9] = r->v[8] = 0x00fffff & (load32(key + 12) >> 8); - - for (size_t j = 0; j < 10; j++) { - h->v[j] = 0; // XXX: should fast-forward a bit - } - - addmulmod(precomp, r, r, &zero); // precompute r^2 - addmulmod(precomp + 1, precomp, precomp, &zero); // precompute r^4 - - OPENSSL_memcpy(st->key, key + 16, 16); - st->buf_used = 0; -} - -void CRYPTO_poly1305_update_neon(poly1305_state *state, const uint8_t *in, - size_t in_len) { - struct poly1305_state_st *st = (struct poly1305_state_st *)(state); - fe1305x2 *const r = (fe1305x2 *)(st->data + (15 & (-(int)st->data))); - fe1305x2 *const h = r + 1; - fe1305x2 *const c = h + 1; - fe1305x2 *const precomp = c + 1; - - if (st->buf_used) { - size_t todo = 32 - st->buf_used; - if (todo > in_len) { - todo = in_len; - } - for (size_t i = 0; i < todo; i++) { - st->buf[st->buf_used + i] = in[i]; - } - st->buf_used += todo; - in_len -= todo; - in += todo; - - if (st->buf_used == sizeof(st->buf) && in_len) { - addmulmod(h, h, precomp, &zero); - fe1305x2_frombytearray(c, st->buf, sizeof(st->buf)); - for (size_t i = 0; i < 10; i++) { - h->v[i] += c->v[i]; - } - st->buf_used = 0; - } - } - - while (in_len > 32) { - size_t tlen = 1048576; - if (in_len < tlen) { - tlen = in_len; - } - tlen -= blocks(h, precomp, in, tlen); - in_len -= tlen; - in += tlen; - } - - if (in_len) { - for (size_t i = 0; i < in_len; i++) { - st->buf[i] = in[i]; - } - st->buf_used = in_len; - } -} - -void CRYPTO_poly1305_finish_neon(poly1305_state *state, uint8_t mac[16]) { - struct poly1305_state_st *st = (struct poly1305_state_st *)(state); - fe1305x2 *const r = (fe1305x2 *)(st->data + (15 & 
(-(int)st->data))); - fe1305x2 *const h = r + 1; - fe1305x2 *const c = h + 1; - fe1305x2 *const precomp = c + 1; - - addmulmod(h, h, precomp, &zero); - - if (st->buf_used > 16) { - fe1305x2_frombytearray(c, st->buf, st->buf_used); - precomp->v[1] = r->v[1]; - precomp->v[3] = r->v[3]; - precomp->v[5] = r->v[5]; - precomp->v[7] = r->v[7]; - precomp->v[9] = r->v[9]; - addmulmod(h, h, precomp, c); - } else if (st->buf_used > 0) { - fe1305x2_frombytearray(c, st->buf, st->buf_used); - r->v[1] = 1; - r->v[3] = 0; - r->v[5] = 0; - r->v[7] = 0; - r->v[9] = 0; - addmulmod(h, h, r, c); - } - - h->v[0] += h->v[1]; - h->v[2] += h->v[3]; - h->v[4] += h->v[5]; - h->v[6] += h->v[7]; - h->v[8] += h->v[9]; - freeze(h); - - fe1305x2_frombytearray(c, st->key, 16); - c->v[8] ^= (1 << 24); - - h->v[0] += c->v[0]; - h->v[2] += c->v[2]; - h->v[4] += c->v[4]; - h->v[6] += c->v[6]; - h->v[8] += c->v[8]; - fe1305x2_tobytearray(mac, h); -} - -#endif // OPENSSL_POLY1305_NEON diff --git a/third_party/boringssl/src/crypto/poly1305/poly1305_arm.cc b/third_party/boringssl/src/crypto/poly1305/poly1305_arm.cc new file mode 100644 index 00000000..6620d70d --- /dev/null +++ b/third_party/boringssl/src/crypto/poly1305/poly1305_arm.cc @@ -0,0 +1,306 @@ +// Copyright 2014 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This implementation was taken from the public domain, neon2 version in +// SUPERCOP by D. J. Bernstein and Peter Schwabe. 
+ +#include + +#include +#include + +#include "../internal.h" +#include "internal.h" + + +BSSL_NAMESPACE_BEGIN + +#if defined(OPENSSL_POLY1305_NEON) + +typedef struct { + uint32_t v[12]; // for alignment; only using 10 +} fe1305x2; + +#define addmulmod openssl_poly1305_neon2_addmulmod +#define blocks openssl_poly1305_neon2_blocks + +extern "C" { +extern void addmulmod(fe1305x2 *r, const fe1305x2 *x, const fe1305x2 *y, + const fe1305x2 *c); + +extern int blocks(fe1305x2 *h, const fe1305x2 precomp[2], const uint8_t *in, + size_t inlen); +} + +static void freeze(fe1305x2 *r) { + int i; + + uint32_t x0 = r->v[0]; + uint32_t x1 = r->v[2]; + uint32_t x2 = r->v[4]; + uint32_t x3 = r->v[6]; + uint32_t x4 = r->v[8]; + uint32_t y0; + uint32_t y1; + uint32_t y2; + uint32_t y3; + uint32_t y4; + uint32_t swap; + + for (i = 0; i < 3; ++i) { + x1 += x0 >> 26; + x0 &= 0x3ffffff; + x2 += x1 >> 26; + x1 &= 0x3ffffff; + x3 += x2 >> 26; + x2 &= 0x3ffffff; + x4 += x3 >> 26; + x3 &= 0x3ffffff; + x0 += 5 * (x4 >> 26); + x4 &= 0x3ffffff; + } + + y0 = x0 + 5; + y1 = x1 + (y0 >> 26); + y0 &= 0x3ffffff; + y2 = x2 + (y1 >> 26); + y1 &= 0x3ffffff; + y3 = x3 + (y2 >> 26); + y2 &= 0x3ffffff; + y4 = x4 + (y3 >> 26); + y3 &= 0x3ffffff; + swap = -(y4 >> 26); + y4 &= 0x3ffffff; + + y0 ^= x0; + y1 ^= x1; + y2 ^= x2; + y3 ^= x3; + y4 ^= x4; + + y0 &= swap; + y1 &= swap; + y2 &= swap; + y3 &= swap; + y4 &= swap; + + y0 ^= x0; + y1 ^= x1; + y2 ^= x2; + y3 ^= x3; + y4 ^= x4; + + r->v[0] = y0; + r->v[2] = y1; + r->v[4] = y2; + r->v[6] = y3; + r->v[8] = y4; +} + +static void fe1305x2_tobytearray(uint8_t r[16], fe1305x2 *x) { + uint32_t x0 = x->v[0]; + uint32_t x1 = x->v[2]; + uint32_t x2 = x->v[4]; + uint32_t x3 = x->v[6]; + uint32_t x4 = x->v[8]; + + x1 += x0 >> 26; + x0 &= 0x3ffffff; + x2 += x1 >> 26; + x1 &= 0x3ffffff; + x3 += x2 >> 26; + x2 &= 0x3ffffff; + x4 += x3 >> 26; + x3 &= 0x3ffffff; + + CRYPTO_store_u32_le(r, x0 + (x1 << 26)); + CRYPTO_store_u32_le(r + 4, (x1 >> 6) + (x2 << 20)); + 
CRYPTO_store_u32_le(r + 8, (x2 >> 12) + (x3 << 14)); + CRYPTO_store_u32_le(r + 12, (x3 >> 18) + (x4 << 8)); +} + +static void fe1305x2_frombytearray(fe1305x2 *r, const uint8_t *x, size_t xlen) { + size_t i; + uint8_t t[17]; + + for (i = 0; (i < 16) && (i < xlen); i++) { + t[i] = x[i]; + } + xlen -= i; + x += i; + t[i++] = 1; + for (; i < 17; i++) { + t[i] = 0; + } + + r->v[0] = 0x3ffffff & CRYPTO_load_u32_le(t); + r->v[2] = 0x3ffffff & (CRYPTO_load_u32_le(t + 3) >> 2); + r->v[4] = 0x3ffffff & (CRYPTO_load_u32_le(t + 6) >> 4); + r->v[6] = 0x3ffffff & (CRYPTO_load_u32_le(t + 9) >> 6); + r->v[8] = CRYPTO_load_u32_le(t + 13); + + if (xlen) { + for (i = 0; (i < 16) && (i < xlen); i++) { + t[i] = x[i]; + } + t[i++] = 1; + for (; i < 17; i++) { + t[i] = 0; + } + + r->v[1] = 0x3ffffff & CRYPTO_load_u32_le(t); + r->v[3] = 0x3ffffff & (CRYPTO_load_u32_le(t + 3) >> 2); + r->v[5] = 0x3ffffff & (CRYPTO_load_u32_le(t + 6) >> 4); + r->v[7] = 0x3ffffff & (CRYPTO_load_u32_le(t + 9) >> 6); + r->v[9] = CRYPTO_load_u32_le(t + 13); + } else { + r->v[1] = r->v[3] = r->v[5] = r->v[7] = r->v[9] = 0; + } +} + +static const fe1305x2 zero alignas(16) = {0}; + +struct poly1305_state_st { + fe1305x2 r, h, c, precomp[2]; + uint8_t buf[32]; + size_t buf_used; + uint8_t key[16]; +}; + +static_assert( + sizeof(struct poly1305_state_st) + 63 <= sizeof(poly1305_state), + "poly1305_state isn't large enough to hold aligned poly1305_state_st."); + +static poly1305_state_st *poly1305_aligned_state(poly1305_state *state) { + return reinterpret_cast(align_pointer(state, 64)); +} + +void CRYPTO_poly1305_init_neon(poly1305_state *state, const uint8_t key[32]) { + poly1305_state_st *st = poly1305_aligned_state(state); + fe1305x2 *const r = &st->r; + fe1305x2 *const h = &st->h; + fe1305x2 *const precomp = st->precomp; + + r->v[1] = r->v[0] = 0x3ffffff & CRYPTO_load_u32_le(key); + r->v[3] = r->v[2] = 0x3ffff03 & (CRYPTO_load_u32_le(key + 3) >> 2); + r->v[5] = r->v[4] = 0x3ffc0ff & (CRYPTO_load_u32_le(key + 6) 
>> 4); + r->v[7] = r->v[6] = 0x3f03fff & (CRYPTO_load_u32_le(key + 9) >> 6); + r->v[9] = r->v[8] = 0x00fffff & (CRYPTO_load_u32_le(key + 12) >> 8); + + for (size_t j = 0; j < 10; j++) { + h->v[j] = 0; // XXX: should fast-forward a bit + } + + addmulmod(precomp, r, r, &zero); // precompute r^2 + addmulmod(precomp + 1, precomp, precomp, &zero); // precompute r^4 + + OPENSSL_memcpy(st->key, key + 16, 16); + st->buf_used = 0; +} + +void CRYPTO_poly1305_update_neon(poly1305_state *state, const uint8_t *in, + size_t in_len) { + poly1305_state_st *st = poly1305_aligned_state(state); + fe1305x2 *const h = &st->h; + fe1305x2 *const c = &st->c; + fe1305x2 *const precomp = st->precomp; + + if (st->buf_used) { + size_t todo = 32 - st->buf_used; + if (todo > in_len) { + todo = in_len; + } + for (size_t i = 0; i < todo; i++) { + st->buf[st->buf_used + i] = in[i]; + } + st->buf_used += todo; + in_len -= todo; + in += todo; + + if (st->buf_used == sizeof(st->buf) && in_len) { + addmulmod(h, h, precomp, &zero); + fe1305x2_frombytearray(c, st->buf, sizeof(st->buf)); + for (size_t i = 0; i < 10; i++) { + h->v[i] += c->v[i]; + } + st->buf_used = 0; + } + } + + while (in_len > 32) { + size_t tlen = 1048576; + if (in_len < tlen) { + tlen = in_len; + } + tlen -= blocks(h, precomp, in, tlen); + in_len -= tlen; + in += tlen; + } + + if (in_len) { + for (size_t i = 0; i < in_len; i++) { + st->buf[i] = in[i]; + } + st->buf_used = in_len; + } +} + +void CRYPTO_poly1305_finish_neon(poly1305_state *state, uint8_t mac[16]) { + poly1305_state_st *st = poly1305_aligned_state(state); + fe1305x2 *const r = &st->r; + fe1305x2 *const h = &st->h; + fe1305x2 *const c = &st->c; + fe1305x2 *const precomp = st->precomp; + + addmulmod(h, h, precomp, &zero); + + if (st->buf_used > 16) { + fe1305x2_frombytearray(c, st->buf, st->buf_used); + precomp->v[1] = r->v[1]; + precomp->v[3] = r->v[3]; + precomp->v[5] = r->v[5]; + precomp->v[7] = r->v[7]; + precomp->v[9] = r->v[9]; + addmulmod(h, h, precomp, c); + } 
else if (st->buf_used > 0) { + fe1305x2_frombytearray(c, st->buf, st->buf_used); + r->v[1] = 1; + r->v[3] = 0; + r->v[5] = 0; + r->v[7] = 0; + r->v[9] = 0; + addmulmod(h, h, r, c); + } + + h->v[0] += h->v[1]; + h->v[2] += h->v[3]; + h->v[4] += h->v[5]; + h->v[6] += h->v[7]; + h->v[8] += h->v[9]; + freeze(h); + + fe1305x2_frombytearray(c, st->key, 16); + c->v[8] ^= (1 << 24); + + h->v[0] += c->v[0]; + h->v[2] += c->v[2]; + h->v[4] += c->v[4]; + h->v[6] += c->v[6]; + h->v[8] += c->v[8]; + fe1305x2_tobytearray(mac, h); +} + +#endif // OPENSSL_POLY1305_NEON + +BSSL_NAMESPACE_END diff --git a/third_party/boringssl/src/crypto/poly1305/poly1305_vec.c b/third_party/boringssl/src/crypto/poly1305/poly1305_vec.c deleted file mode 100644 index 209b4033..00000000 --- a/third_party/boringssl/src/crypto/poly1305/poly1305_vec.c +++ /dev/null @@ -1,847 +0,0 @@ -/* Copyright (c) 2014, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -// This implementation of poly1305 is by Andrew Moon -// (https://github.com/floodyberry/poly1305-donna) and released as public -// domain. It implements SIMD vectorization based on the algorithm described in -// http://cr.yp.to/papers.html#neoncrypto. Unrolled to 2 powers, i.e. 
64 byte -// block size - -#include - -#include - -#include "../internal.h" - - -#if defined(BORINGSSL_HAS_UINT128) && defined(OPENSSL_X86_64) - -#include - -typedef __m128i xmmi; - -static const alignas(16) uint32_t poly1305_x64_sse2_message_mask[4] = { - (1 << 26) - 1, 0, (1 << 26) - 1, 0}; -static const alignas(16) uint32_t poly1305_x64_sse2_5[4] = {5, 0, 5, 0}; -static const alignas(16) uint32_t poly1305_x64_sse2_1shl128[4] = { - (1 << 24), 0, (1 << 24), 0}; - -static inline uint128_t add128(uint128_t a, uint128_t b) { return a + b; } - -static inline uint128_t add128_64(uint128_t a, uint64_t b) { return a + b; } - -static inline uint128_t mul64x64_128(uint64_t a, uint64_t b) { - return (uint128_t)a * b; -} - -static inline uint64_t lo128(uint128_t a) { return (uint64_t)a; } - -static inline uint64_t shr128(uint128_t v, const int shift) { - return (uint64_t)(v >> shift); -} - -static inline uint64_t shr128_pair(uint64_t hi, uint64_t lo, const int shift) { - return (uint64_t)((((uint128_t)hi << 64) | lo) >> shift); -} - -typedef struct poly1305_power_t { - union { - xmmi v; - uint64_t u[2]; - uint32_t d[4]; - } R20, R21, R22, R23, R24, S21, S22, S23, S24; -} poly1305_power; - -typedef struct poly1305_state_internal_t { - poly1305_power P[2]; /* 288 bytes, top 32 bit halves unused = 144 - bytes of free storage */ - union { - xmmi H[5]; // 80 bytes - uint64_t HH[10]; - }; - // uint64_t r0,r1,r2; [24 bytes] - // uint64_t pad0,pad1; [16 bytes] - uint64_t started; // 8 bytes - uint64_t leftover; // 8 bytes - uint8_t buffer[64]; // 64 bytes -} poly1305_state_internal; /* 448 bytes total + 63 bytes for - alignment = 511 bytes raw */ - -static_assert(sizeof(struct poly1305_state_internal_t) + 63 <= - sizeof(poly1305_state), - "poly1305_state isn't large enough to hold aligned " - "poly1305_state_internal_t"); - -static inline poly1305_state_internal *poly1305_aligned_state( - poly1305_state *state) { - return (poly1305_state_internal *)(((uint64_t)state + 63) & ~63); -} 
- -static inline size_t poly1305_min(size_t a, size_t b) { - return (a < b) ? a : b; -} - -void CRYPTO_poly1305_init(poly1305_state *state, const uint8_t key[32]) { - poly1305_state_internal *st = poly1305_aligned_state(state); - poly1305_power *p; - uint64_t r0, r1, r2; - uint64_t t0, t1; - - // clamp key - t0 = CRYPTO_load_u64_le(key + 0); - t1 = CRYPTO_load_u64_le(key + 8); - r0 = t0 & 0xffc0fffffff; - t0 >>= 44; - t0 |= t1 << 20; - r1 = t0 & 0xfffffc0ffff; - t1 >>= 24; - r2 = t1 & 0x00ffffffc0f; - - // store r in un-used space of st->P[1] - p = &st->P[1]; - p->R20.d[1] = (uint32_t)(r0); - p->R20.d[3] = (uint32_t)(r0 >> 32); - p->R21.d[1] = (uint32_t)(r1); - p->R21.d[3] = (uint32_t)(r1 >> 32); - p->R22.d[1] = (uint32_t)(r2); - p->R22.d[3] = (uint32_t)(r2 >> 32); - - // store pad - p->R23.d[1] = CRYPTO_load_u32_le(key + 16); - p->R23.d[3] = CRYPTO_load_u32_le(key + 20); - p->R24.d[1] = CRYPTO_load_u32_le(key + 24); - p->R24.d[3] = CRYPTO_load_u32_le(key + 28); - - // H = 0 - st->H[0] = _mm_setzero_si128(); - st->H[1] = _mm_setzero_si128(); - st->H[2] = _mm_setzero_si128(); - st->H[3] = _mm_setzero_si128(); - st->H[4] = _mm_setzero_si128(); - - st->started = 0; - st->leftover = 0; -} - -static void poly1305_first_block(poly1305_state_internal *st, - const uint8_t *m) { - const xmmi MMASK = _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask); - const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5); - const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128); - xmmi T5, T6; - poly1305_power *p; - uint128_t d[3]; - uint64_t r0, r1, r2; - uint64_t r20, r21, r22, s22; - uint64_t pad0, pad1; - uint64_t c; - uint64_t i; - - // pull out stored info - p = &st->P[1]; - - r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1]; - r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1]; - r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1]; - pad0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1]; - pad1 = 
((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1]; - - // compute powers r^2,r^4 - r20 = r0; - r21 = r1; - r22 = r2; - for (i = 0; i < 2; i++) { - s22 = r22 * (5 << 2); - - d[0] = add128(mul64x64_128(r20, r20), mul64x64_128(r21 * 2, s22)); - d[1] = add128(mul64x64_128(r22, s22), mul64x64_128(r20 * 2, r21)); - d[2] = add128(mul64x64_128(r21, r21), mul64x64_128(r22 * 2, r20)); - - r20 = lo128(d[0]) & 0xfffffffffff; - c = shr128(d[0], 44); - d[1] = add128_64(d[1], c); - r21 = lo128(d[1]) & 0xfffffffffff; - c = shr128(d[1], 44); - d[2] = add128_64(d[2], c); - r22 = lo128(d[2]) & 0x3ffffffffff; - c = shr128(d[2], 42); - r20 += c * 5; - c = (r20 >> 44); - r20 = r20 & 0xfffffffffff; - r21 += c; - - p->R20.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)(r20)&0x3ffffff), - _MM_SHUFFLE(1, 0, 1, 0)); - p->R21.v = _mm_shuffle_epi32( - _mm_cvtsi32_si128((uint32_t)((r20 >> 26) | (r21 << 18)) & 0x3ffffff), - _MM_SHUFFLE(1, 0, 1, 0)); - p->R22.v = - _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r21 >> 8)) & 0x3ffffff), - _MM_SHUFFLE(1, 0, 1, 0)); - p->R23.v = _mm_shuffle_epi32( - _mm_cvtsi32_si128((uint32_t)((r21 >> 34) | (r22 << 10)) & 0x3ffffff), - _MM_SHUFFLE(1, 0, 1, 0)); - p->R24.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r22 >> 16))), - _MM_SHUFFLE(1, 0, 1, 0)); - p->S21.v = _mm_mul_epu32(p->R21.v, FIVE); - p->S22.v = _mm_mul_epu32(p->R22.v, FIVE); - p->S23.v = _mm_mul_epu32(p->R23.v, FIVE); - p->S24.v = _mm_mul_epu32(p->R24.v, FIVE); - p--; - } - - // put saved info back - p = &st->P[1]; - p->R20.d[1] = (uint32_t)(r0); - p->R20.d[3] = (uint32_t)(r0 >> 32); - p->R21.d[1] = (uint32_t)(r1); - p->R21.d[3] = (uint32_t)(r1 >> 32); - p->R22.d[1] = (uint32_t)(r2); - p->R22.d[3] = (uint32_t)(r2 >> 32); - p->R23.d[1] = (uint32_t)(pad0); - p->R23.d[3] = (uint32_t)(pad0 >> 32); - p->R24.d[1] = (uint32_t)(pad1); - p->R24.d[3] = (uint32_t)(pad1 >> 32); - - // H = [Mx,My] - T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)), - _mm_loadl_epi64((const xmmi *)(m 
+ 16))); - T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)), - _mm_loadl_epi64((const xmmi *)(m + 24))); - st->H[0] = _mm_and_si128(MMASK, T5); - st->H[1] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); - T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12)); - st->H[2] = _mm_and_si128(MMASK, T5); - st->H[3] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); - st->H[4] = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT); -} - -static void poly1305_blocks(poly1305_state_internal *st, const uint8_t *m, - size_t bytes) { - const xmmi MMASK = _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask); - const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5); - const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128); - - poly1305_power *p; - xmmi H0, H1, H2, H3, H4; - xmmi T0, T1, T2, T3, T4, T5, T6; - xmmi M0, M1, M2, M3, M4; - xmmi C1, C2; - - H0 = st->H[0]; - H1 = st->H[1]; - H2 = st->H[2]; - H3 = st->H[3]; - H4 = st->H[4]; - - while (bytes >= 64) { - // H *= [r^4,r^4] - p = &st->P[0]; - T0 = _mm_mul_epu32(H0, p->R20.v); - T1 = _mm_mul_epu32(H0, p->R21.v); - T2 = _mm_mul_epu32(H0, p->R22.v); - T3 = _mm_mul_epu32(H0, p->R23.v); - T4 = _mm_mul_epu32(H0, p->R24.v); - T5 = _mm_mul_epu32(H1, p->S24.v); - T6 = _mm_mul_epu32(H1, p->R20.v); - T0 = _mm_add_epi64(T0, T5); - T1 = _mm_add_epi64(T1, T6); - T5 = _mm_mul_epu32(H2, p->S23.v); - T6 = _mm_mul_epu32(H2, p->S24.v); - T0 = _mm_add_epi64(T0, T5); - T1 = _mm_add_epi64(T1, T6); - T5 = _mm_mul_epu32(H3, p->S22.v); - T6 = _mm_mul_epu32(H3, p->S23.v); - T0 = _mm_add_epi64(T0, T5); - T1 = _mm_add_epi64(T1, T6); - T5 = _mm_mul_epu32(H4, p->S21.v); - T6 = _mm_mul_epu32(H4, p->S22.v); - T0 = _mm_add_epi64(T0, T5); - T1 = _mm_add_epi64(T1, T6); - T5 = _mm_mul_epu32(H1, p->R21.v); - T6 = _mm_mul_epu32(H1, p->R22.v); - T2 = _mm_add_epi64(T2, T5); - T3 = _mm_add_epi64(T3, T6); - T5 = _mm_mul_epu32(H2, p->R20.v); - T6 = _mm_mul_epu32(H2, p->R21.v); - T2 = _mm_add_epi64(T2, T5); - T3 = 
_mm_add_epi64(T3, T6); - T5 = _mm_mul_epu32(H3, p->S24.v); - T6 = _mm_mul_epu32(H3, p->R20.v); - T2 = _mm_add_epi64(T2, T5); - T3 = _mm_add_epi64(T3, T6); - T5 = _mm_mul_epu32(H4, p->S23.v); - T6 = _mm_mul_epu32(H4, p->S24.v); - T2 = _mm_add_epi64(T2, T5); - T3 = _mm_add_epi64(T3, T6); - T5 = _mm_mul_epu32(H1, p->R23.v); - T4 = _mm_add_epi64(T4, T5); - T5 = _mm_mul_epu32(H2, p->R22.v); - T4 = _mm_add_epi64(T4, T5); - T5 = _mm_mul_epu32(H3, p->R21.v); - T4 = _mm_add_epi64(T4, T5); - T5 = _mm_mul_epu32(H4, p->R20.v); - T4 = _mm_add_epi64(T4, T5); - - // H += [Mx,My]*[r^2,r^2] - T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)), - _mm_loadl_epi64((const xmmi *)(m + 16))); - T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)), - _mm_loadl_epi64((const xmmi *)(m + 24))); - M0 = _mm_and_si128(MMASK, T5); - M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); - T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12)); - M2 = _mm_and_si128(MMASK, T5); - M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); - M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT); - - p = &st->P[1]; - T5 = _mm_mul_epu32(M0, p->R20.v); - T6 = _mm_mul_epu32(M0, p->R21.v); - T0 = _mm_add_epi64(T0, T5); - T1 = _mm_add_epi64(T1, T6); - T5 = _mm_mul_epu32(M1, p->S24.v); - T6 = _mm_mul_epu32(M1, p->R20.v); - T0 = _mm_add_epi64(T0, T5); - T1 = _mm_add_epi64(T1, T6); - T5 = _mm_mul_epu32(M2, p->S23.v); - T6 = _mm_mul_epu32(M2, p->S24.v); - T0 = _mm_add_epi64(T0, T5); - T1 = _mm_add_epi64(T1, T6); - T5 = _mm_mul_epu32(M3, p->S22.v); - T6 = _mm_mul_epu32(M3, p->S23.v); - T0 = _mm_add_epi64(T0, T5); - T1 = _mm_add_epi64(T1, T6); - T5 = _mm_mul_epu32(M4, p->S21.v); - T6 = _mm_mul_epu32(M4, p->S22.v); - T0 = _mm_add_epi64(T0, T5); - T1 = _mm_add_epi64(T1, T6); - T5 = _mm_mul_epu32(M0, p->R22.v); - T6 = _mm_mul_epu32(M0, p->R23.v); - T2 = _mm_add_epi64(T2, T5); - T3 = _mm_add_epi64(T3, T6); - T5 = _mm_mul_epu32(M1, p->R21.v); - T6 = _mm_mul_epu32(M1, p->R22.v); - T2 = 
_mm_add_epi64(T2, T5); - T3 = _mm_add_epi64(T3, T6); - T5 = _mm_mul_epu32(M2, p->R20.v); - T6 = _mm_mul_epu32(M2, p->R21.v); - T2 = _mm_add_epi64(T2, T5); - T3 = _mm_add_epi64(T3, T6); - T5 = _mm_mul_epu32(M3, p->S24.v); - T6 = _mm_mul_epu32(M3, p->R20.v); - T2 = _mm_add_epi64(T2, T5); - T3 = _mm_add_epi64(T3, T6); - T5 = _mm_mul_epu32(M4, p->S23.v); - T6 = _mm_mul_epu32(M4, p->S24.v); - T2 = _mm_add_epi64(T2, T5); - T3 = _mm_add_epi64(T3, T6); - T5 = _mm_mul_epu32(M0, p->R24.v); - T4 = _mm_add_epi64(T4, T5); - T5 = _mm_mul_epu32(M1, p->R23.v); - T4 = _mm_add_epi64(T4, T5); - T5 = _mm_mul_epu32(M2, p->R22.v); - T4 = _mm_add_epi64(T4, T5); - T5 = _mm_mul_epu32(M3, p->R21.v); - T4 = _mm_add_epi64(T4, T5); - T5 = _mm_mul_epu32(M4, p->R20.v); - T4 = _mm_add_epi64(T4, T5); - - // H += [Mx,My] - T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 32)), - _mm_loadl_epi64((const xmmi *)(m + 48))); - T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 40)), - _mm_loadl_epi64((const xmmi *)(m + 56))); - M0 = _mm_and_si128(MMASK, T5); - M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); - T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12)); - M2 = _mm_and_si128(MMASK, T5); - M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); - M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT); - - T0 = _mm_add_epi64(T0, M0); - T1 = _mm_add_epi64(T1, M1); - T2 = _mm_add_epi64(T2, M2); - T3 = _mm_add_epi64(T3, M3); - T4 = _mm_add_epi64(T4, M4); - - // reduce - C1 = _mm_srli_epi64(T0, 26); - C2 = _mm_srli_epi64(T3, 26); - T0 = _mm_and_si128(T0, MMASK); - T3 = _mm_and_si128(T3, MMASK); - T1 = _mm_add_epi64(T1, C1); - T4 = _mm_add_epi64(T4, C2); - C1 = _mm_srli_epi64(T1, 26); - C2 = _mm_srli_epi64(T4, 26); - T1 = _mm_and_si128(T1, MMASK); - T4 = _mm_and_si128(T4, MMASK); - T2 = _mm_add_epi64(T2, C1); - T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE)); - C1 = _mm_srli_epi64(T2, 26); - C2 = _mm_srli_epi64(T0, 26); - T2 = _mm_and_si128(T2, MMASK); - T0 = _mm_and_si128(T0, 
MMASK); - T3 = _mm_add_epi64(T3, C1); - T1 = _mm_add_epi64(T1, C2); - C1 = _mm_srli_epi64(T3, 26); - T3 = _mm_and_si128(T3, MMASK); - T4 = _mm_add_epi64(T4, C1); - - // H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx,My]) - H0 = T0; - H1 = T1; - H2 = T2; - H3 = T3; - H4 = T4; - - m += 64; - bytes -= 64; - } - - st->H[0] = H0; - st->H[1] = H1; - st->H[2] = H2; - st->H[3] = H3; - st->H[4] = H4; -} - -static size_t poly1305_combine(poly1305_state_internal *st, const uint8_t *m, - size_t bytes) { - const xmmi MMASK = _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask); - const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128); - const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5); - - poly1305_power *p; - xmmi H0, H1, H2, H3, H4; - xmmi M0, M1, M2, M3, M4; - xmmi T0, T1, T2, T3, T4, T5, T6; - xmmi C1, C2; - - uint64_t r0, r1, r2; - uint64_t t0, t1, t2, t3, t4; - uint64_t c; - size_t consumed = 0; - - H0 = st->H[0]; - H1 = st->H[1]; - H2 = st->H[2]; - H3 = st->H[3]; - H4 = st->H[4]; - - // p = [r^2,r^2] - p = &st->P[1]; - - if (bytes >= 32) { - // H *= [r^2,r^2] - T0 = _mm_mul_epu32(H0, p->R20.v); - T1 = _mm_mul_epu32(H0, p->R21.v); - T2 = _mm_mul_epu32(H0, p->R22.v); - T3 = _mm_mul_epu32(H0, p->R23.v); - T4 = _mm_mul_epu32(H0, p->R24.v); - T5 = _mm_mul_epu32(H1, p->S24.v); - T6 = _mm_mul_epu32(H1, p->R20.v); - T0 = _mm_add_epi64(T0, T5); - T1 = _mm_add_epi64(T1, T6); - T5 = _mm_mul_epu32(H2, p->S23.v); - T6 = _mm_mul_epu32(H2, p->S24.v); - T0 = _mm_add_epi64(T0, T5); - T1 = _mm_add_epi64(T1, T6); - T5 = _mm_mul_epu32(H3, p->S22.v); - T6 = _mm_mul_epu32(H3, p->S23.v); - T0 = _mm_add_epi64(T0, T5); - T1 = _mm_add_epi64(T1, T6); - T5 = _mm_mul_epu32(H4, p->S21.v); - T6 = _mm_mul_epu32(H4, p->S22.v); - T0 = _mm_add_epi64(T0, T5); - T1 = _mm_add_epi64(T1, T6); - T5 = _mm_mul_epu32(H1, p->R21.v); - T6 = _mm_mul_epu32(H1, p->R22.v); - T2 = _mm_add_epi64(T2, T5); - T3 = _mm_add_epi64(T3, T6); - T5 = _mm_mul_epu32(H2, p->R20.v); - T6 = 
_mm_mul_epu32(H2, p->R21.v); - T2 = _mm_add_epi64(T2, T5); - T3 = _mm_add_epi64(T3, T6); - T5 = _mm_mul_epu32(H3, p->S24.v); - T6 = _mm_mul_epu32(H3, p->R20.v); - T2 = _mm_add_epi64(T2, T5); - T3 = _mm_add_epi64(T3, T6); - T5 = _mm_mul_epu32(H4, p->S23.v); - T6 = _mm_mul_epu32(H4, p->S24.v); - T2 = _mm_add_epi64(T2, T5); - T3 = _mm_add_epi64(T3, T6); - T5 = _mm_mul_epu32(H1, p->R23.v); - T4 = _mm_add_epi64(T4, T5); - T5 = _mm_mul_epu32(H2, p->R22.v); - T4 = _mm_add_epi64(T4, T5); - T5 = _mm_mul_epu32(H3, p->R21.v); - T4 = _mm_add_epi64(T4, T5); - T5 = _mm_mul_epu32(H4, p->R20.v); - T4 = _mm_add_epi64(T4, T5); - - // H += [Mx,My] - T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)), - _mm_loadl_epi64((const xmmi *)(m + 16))); - T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)), - _mm_loadl_epi64((const xmmi *)(m + 24))); - M0 = _mm_and_si128(MMASK, T5); - M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); - T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12)); - M2 = _mm_and_si128(MMASK, T5); - M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); - M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT); - - T0 = _mm_add_epi64(T0, M0); - T1 = _mm_add_epi64(T1, M1); - T2 = _mm_add_epi64(T2, M2); - T3 = _mm_add_epi64(T3, M3); - T4 = _mm_add_epi64(T4, M4); - - // reduce - C1 = _mm_srli_epi64(T0, 26); - C2 = _mm_srli_epi64(T3, 26); - T0 = _mm_and_si128(T0, MMASK); - T3 = _mm_and_si128(T3, MMASK); - T1 = _mm_add_epi64(T1, C1); - T4 = _mm_add_epi64(T4, C2); - C1 = _mm_srli_epi64(T1, 26); - C2 = _mm_srli_epi64(T4, 26); - T1 = _mm_and_si128(T1, MMASK); - T4 = _mm_and_si128(T4, MMASK); - T2 = _mm_add_epi64(T2, C1); - T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE)); - C1 = _mm_srli_epi64(T2, 26); - C2 = _mm_srli_epi64(T0, 26); - T2 = _mm_and_si128(T2, MMASK); - T0 = _mm_and_si128(T0, MMASK); - T3 = _mm_add_epi64(T3, C1); - T1 = _mm_add_epi64(T1, C2); - C1 = _mm_srli_epi64(T3, 26); - T3 = _mm_and_si128(T3, MMASK); - T4 = _mm_add_epi64(T4, C1); - - 
// H = (H*[r^2,r^2] + [Mx,My]) - H0 = T0; - H1 = T1; - H2 = T2; - H3 = T3; - H4 = T4; - - consumed = 32; - } - - // finalize, H *= [r^2,r] - r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1]; - r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1]; - r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1]; - - p->R20.d[2] = (uint32_t)(r0)&0x3ffffff; - p->R21.d[2] = (uint32_t)((r0 >> 26) | (r1 << 18)) & 0x3ffffff; - p->R22.d[2] = (uint32_t)((r1 >> 8)) & 0x3ffffff; - p->R23.d[2] = (uint32_t)((r1 >> 34) | (r2 << 10)) & 0x3ffffff; - p->R24.d[2] = (uint32_t)((r2 >> 16)); - p->S21.d[2] = p->R21.d[2] * 5; - p->S22.d[2] = p->R22.d[2] * 5; - p->S23.d[2] = p->R23.d[2] * 5; - p->S24.d[2] = p->R24.d[2] * 5; - - // H *= [r^2,r] - T0 = _mm_mul_epu32(H0, p->R20.v); - T1 = _mm_mul_epu32(H0, p->R21.v); - T2 = _mm_mul_epu32(H0, p->R22.v); - T3 = _mm_mul_epu32(H0, p->R23.v); - T4 = _mm_mul_epu32(H0, p->R24.v); - T5 = _mm_mul_epu32(H1, p->S24.v); - T6 = _mm_mul_epu32(H1, p->R20.v); - T0 = _mm_add_epi64(T0, T5); - T1 = _mm_add_epi64(T1, T6); - T5 = _mm_mul_epu32(H2, p->S23.v); - T6 = _mm_mul_epu32(H2, p->S24.v); - T0 = _mm_add_epi64(T0, T5); - T1 = _mm_add_epi64(T1, T6); - T5 = _mm_mul_epu32(H3, p->S22.v); - T6 = _mm_mul_epu32(H3, p->S23.v); - T0 = _mm_add_epi64(T0, T5); - T1 = _mm_add_epi64(T1, T6); - T5 = _mm_mul_epu32(H4, p->S21.v); - T6 = _mm_mul_epu32(H4, p->S22.v); - T0 = _mm_add_epi64(T0, T5); - T1 = _mm_add_epi64(T1, T6); - T5 = _mm_mul_epu32(H1, p->R21.v); - T6 = _mm_mul_epu32(H1, p->R22.v); - T2 = _mm_add_epi64(T2, T5); - T3 = _mm_add_epi64(T3, T6); - T5 = _mm_mul_epu32(H2, p->R20.v); - T6 = _mm_mul_epu32(H2, p->R21.v); - T2 = _mm_add_epi64(T2, T5); - T3 = _mm_add_epi64(T3, T6); - T5 = _mm_mul_epu32(H3, p->S24.v); - T6 = _mm_mul_epu32(H3, p->R20.v); - T2 = _mm_add_epi64(T2, T5); - T3 = _mm_add_epi64(T3, T6); - T5 = _mm_mul_epu32(H4, p->S23.v); - T6 = _mm_mul_epu32(H4, p->S24.v); - T2 = _mm_add_epi64(T2, T5); - T3 = _mm_add_epi64(T3, T6); - T5 = 
_mm_mul_epu32(H1, p->R23.v); - T4 = _mm_add_epi64(T4, T5); - T5 = _mm_mul_epu32(H2, p->R22.v); - T4 = _mm_add_epi64(T4, T5); - T5 = _mm_mul_epu32(H3, p->R21.v); - T4 = _mm_add_epi64(T4, T5); - T5 = _mm_mul_epu32(H4, p->R20.v); - T4 = _mm_add_epi64(T4, T5); - - C1 = _mm_srli_epi64(T0, 26); - C2 = _mm_srli_epi64(T3, 26); - T0 = _mm_and_si128(T0, MMASK); - T3 = _mm_and_si128(T3, MMASK); - T1 = _mm_add_epi64(T1, C1); - T4 = _mm_add_epi64(T4, C2); - C1 = _mm_srli_epi64(T1, 26); - C2 = _mm_srli_epi64(T4, 26); - T1 = _mm_and_si128(T1, MMASK); - T4 = _mm_and_si128(T4, MMASK); - T2 = _mm_add_epi64(T2, C1); - T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE)); - C1 = _mm_srli_epi64(T2, 26); - C2 = _mm_srli_epi64(T0, 26); - T2 = _mm_and_si128(T2, MMASK); - T0 = _mm_and_si128(T0, MMASK); - T3 = _mm_add_epi64(T3, C1); - T1 = _mm_add_epi64(T1, C2); - C1 = _mm_srli_epi64(T3, 26); - T3 = _mm_and_si128(T3, MMASK); - T4 = _mm_add_epi64(T4, C1); - - // H = H[0]+H[1] - H0 = _mm_add_epi64(T0, _mm_srli_si128(T0, 8)); - H1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8)); - H2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8)); - H3 = _mm_add_epi64(T3, _mm_srli_si128(T3, 8)); - H4 = _mm_add_epi64(T4, _mm_srli_si128(T4, 8)); - - t0 = _mm_cvtsi128_si32(H0); - c = (t0 >> 26); - t0 &= 0x3ffffff; - t1 = _mm_cvtsi128_si32(H1) + c; - c = (t1 >> 26); - t1 &= 0x3ffffff; - t2 = _mm_cvtsi128_si32(H2) + c; - c = (t2 >> 26); - t2 &= 0x3ffffff; - t3 = _mm_cvtsi128_si32(H3) + c; - c = (t3 >> 26); - t3 &= 0x3ffffff; - t4 = _mm_cvtsi128_si32(H4) + c; - c = (t4 >> 26); - t4 &= 0x3ffffff; - t0 = t0 + (c * 5); - c = (t0 >> 26); - t0 &= 0x3ffffff; - t1 = t1 + c; - - st->HH[0] = ((t0) | (t1 << 26)) & UINT64_C(0xfffffffffff); - st->HH[1] = ((t1 >> 18) | (t2 << 8) | (t3 << 34)) & UINT64_C(0xfffffffffff); - st->HH[2] = ((t3 >> 10) | (t4 << 16)) & UINT64_C(0x3ffffffffff); - - return consumed; -} - -void CRYPTO_poly1305_update(poly1305_state *state, const uint8_t *m, - size_t bytes) { - poly1305_state_internal *st = 
poly1305_aligned_state(state); - size_t want; - - // Work around a C language bug. See https://crbug.com/1019588. - if (bytes == 0) { - return; - } - - // need at least 32 initial bytes to start the accelerated branch - if (!st->started) { - if ((st->leftover == 0) && (bytes > 32)) { - poly1305_first_block(st, m); - m += 32; - bytes -= 32; - } else { - want = poly1305_min(32 - st->leftover, bytes); - OPENSSL_memcpy(st->buffer + st->leftover, m, want); - bytes -= want; - m += want; - st->leftover += want; - if ((st->leftover < 32) || (bytes == 0)) { - return; - } - poly1305_first_block(st, st->buffer); - st->leftover = 0; - } - st->started = 1; - } - - // handle leftover - if (st->leftover) { - want = poly1305_min(64 - st->leftover, bytes); - OPENSSL_memcpy(st->buffer + st->leftover, m, want); - bytes -= want; - m += want; - st->leftover += want; - if (st->leftover < 64) { - return; - } - poly1305_blocks(st, st->buffer, 64); - st->leftover = 0; - } - - // process 64 byte blocks - if (bytes >= 64) { - want = (bytes & ~63); - poly1305_blocks(st, m, want); - m += want; - bytes -= want; - } - - if (bytes) { - OPENSSL_memcpy(st->buffer + st->leftover, m, bytes); - st->leftover += bytes; - } -} - -void CRYPTO_poly1305_finish(poly1305_state *state, uint8_t mac[16]) { - poly1305_state_internal *st = poly1305_aligned_state(state); - size_t leftover = st->leftover; - uint8_t *m = st->buffer; - uint128_t d[3]; - uint64_t h0, h1, h2; - uint64_t t0, t1; - uint64_t g0, g1, g2, c, nc; - uint64_t r0, r1, r2, s1, s2; - poly1305_power *p; - - if (st->started) { - size_t consumed = poly1305_combine(st, m, leftover); - leftover -= consumed; - m += consumed; - } - - // st->HH will either be 0 or have the combined result - h0 = st->HH[0]; - h1 = st->HH[1]; - h2 = st->HH[2]; - - p = &st->P[1]; - r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1]; - r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1]; - r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1]; - s1 = r1 * 
(5 << 2); - s2 = r2 * (5 << 2); - - if (leftover < 16) { - goto poly1305_donna_atmost15bytes; - } - -poly1305_donna_atleast16bytes: - t0 = CRYPTO_load_u64_le(m + 0); - t1 = CRYPTO_load_u64_le(m + 8); - h0 += t0 & 0xfffffffffff; - t0 = shr128_pair(t1, t0, 44); - h1 += t0 & 0xfffffffffff; - h2 += (t1 >> 24) | ((uint64_t)1 << 40); - -poly1305_donna_mul: - d[0] = add128(add128(mul64x64_128(h0, r0), mul64x64_128(h1, s2)), - mul64x64_128(h2, s1)); - d[1] = add128(add128(mul64x64_128(h0, r1), mul64x64_128(h1, r0)), - mul64x64_128(h2, s2)); - d[2] = add128(add128(mul64x64_128(h0, r2), mul64x64_128(h1, r1)), - mul64x64_128(h2, r0)); - h0 = lo128(d[0]) & 0xfffffffffff; - c = shr128(d[0], 44); - d[1] = add128_64(d[1], c); - h1 = lo128(d[1]) & 0xfffffffffff; - c = shr128(d[1], 44); - d[2] = add128_64(d[2], c); - h2 = lo128(d[2]) & 0x3ffffffffff; - c = shr128(d[2], 42); - h0 += c * 5; - - m += 16; - leftover -= 16; - if (leftover >= 16) { - goto poly1305_donna_atleast16bytes; - } - -// final bytes -poly1305_donna_atmost15bytes: - if (!leftover) { - goto poly1305_donna_finish; - } - - m[leftover++] = 1; - OPENSSL_memset(m + leftover, 0, 16 - leftover); - leftover = 16; - - t0 = CRYPTO_load_u64_le(m + 0); - t1 = CRYPTO_load_u64_le(m + 8); - h0 += t0 & 0xfffffffffff; - t0 = shr128_pair(t1, t0, 44); - h1 += t0 & 0xfffffffffff; - h2 += (t1 >> 24); - - goto poly1305_donna_mul; - -poly1305_donna_finish: - c = (h0 >> 44); - h0 &= 0xfffffffffff; - h1 += c; - c = (h1 >> 44); - h1 &= 0xfffffffffff; - h2 += c; - c = (h2 >> 42); - h2 &= 0x3ffffffffff; - h0 += c * 5; - - g0 = h0 + 5; - c = (g0 >> 44); - g0 &= 0xfffffffffff; - g1 = h1 + c; - c = (g1 >> 44); - g1 &= 0xfffffffffff; - g2 = h2 + c - ((uint64_t)1 << 42); - - c = (g2 >> 63) - 1; - nc = ~c; - h0 = (h0 & nc) | (g0 & c); - h1 = (h1 & nc) | (g1 & c); - h2 = (h2 & nc) | (g2 & c); - - // pad - t0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1]; - t1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1]; - h0 += (t0 & 
0xfffffffffff); - c = (h0 >> 44); - h0 &= 0xfffffffffff; - t0 = shr128_pair(t1, t0, 44); - h1 += (t0 & 0xfffffffffff) + c; - c = (h1 >> 44); - h1 &= 0xfffffffffff; - t1 = (t1 >> 24); - h2 += (t1)+c; - - CRYPTO_store_u64_le(mac + 0, ((h0) | (h1 << 44))); - CRYPTO_store_u64_le(mac + 8, ((h1 >> 20) | (h2 << 24))); -} - -#endif // BORINGSSL_HAS_UINT128 && OPENSSL_X86_64 diff --git a/third_party/boringssl/src/crypto/poly1305/poly1305_vec.cc b/third_party/boringssl/src/crypto/poly1305/poly1305_vec.cc new file mode 100644 index 00000000..20f77f2d --- /dev/null +++ b/third_party/boringssl/src/crypto/poly1305/poly1305_vec.cc @@ -0,0 +1,853 @@ +// Copyright 2014 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// This implementation of poly1305 is by Andrew Moon +// (https://github.com/floodyberry/poly1305-donna) and released as public +// domain. It implements SIMD vectorization based on the algorithm described in +// http://cr.yp.to/papers.html#neoncrypto. Unrolled to 2 powers, i.e. 
64 byte +// block size + +#include + +#include + +#include "../internal.h" + + +#if defined(BORINGSSL_HAS_UINT128) && defined(OPENSSL_X86_64) + +#include + + +using namespace bssl; + +namespace { + +typedef __m128i xmmi; + +alignas(16) const uint32_t poly1305_x64_sse2_message_mask[4] = { + (1 << 26) - 1, 0, (1 << 26) - 1, 0}; +alignas(16) const uint32_t poly1305_x64_sse2_5[4] = {5, 0, 5, 0}; +alignas(16) const uint32_t poly1305_x64_sse2_1shl128[4] = {(1 << 24), 0, + (1 << 24), 0}; + +uint128_t add128(uint128_t a, uint128_t b) { return a + b; } + +uint128_t add128_64(uint128_t a, uint64_t b) { return a + b; } + +uint128_t mul64x64_128(uint64_t a, uint64_t b) { return (uint128_t)a * b; } + +uint64_t lo128(uint128_t a) { return (uint64_t)a; } + +uint64_t shr128(uint128_t v, const int shift) { return (uint64_t)(v >> shift); } + +uint64_t shr128_pair(uint64_t hi, uint64_t lo, const int shift) { + return (uint64_t)((((uint128_t)hi << 64) | lo) >> shift); +} + +typedef struct poly1305_power_t { + union { + xmmi v; + uint64_t u[2]; + uint32_t d[4]; + } R20, R21, R22, R23, R24, S21, S22, S23, S24; +} poly1305_power; + +typedef struct poly1305_state_internal_t { + poly1305_power P[2]; /* 288 bytes, top 32 bit halves unused = 144 + bytes of free storage */ + union { + xmmi H[5]; // 80 bytes + uint64_t HH[10]; + }; + // uint64_t r0,r1,r2; [24 bytes] + // uint64_t pad0,pad1; [16 bytes] + uint64_t started; // 8 bytes + uint64_t leftover; // 8 bytes + uint8_t buffer[64]; // 64 bytes +} poly1305_state_internal; /* 448 bytes total + 63 bytes for + alignment = 511 bytes raw */ + +static_assert(sizeof(struct poly1305_state_internal_t) + 63 <= + sizeof(poly1305_state), + "poly1305_state isn't large enough to hold aligned " + "poly1305_state_internal_t"); + +poly1305_state_internal *poly1305_aligned_state(poly1305_state *state) { + return reinterpret_cast(align_pointer(state, 64)); +} + +size_t poly1305_min(size_t a, size_t b) { return (a < b) ? 
a : b; } + +} // namespace + +void CRYPTO_poly1305_init(poly1305_state *state, const uint8_t key[32]) { + poly1305_state_internal *st = poly1305_aligned_state(state); + poly1305_power *p; + uint64_t r0, r1, r2; + uint64_t t0, t1; + + // clamp key + t0 = CRYPTO_load_u64_le(key + 0); + t1 = CRYPTO_load_u64_le(key + 8); + r0 = t0 & 0xffc0fffffff; + t0 >>= 44; + t0 |= t1 << 20; + r1 = t0 & 0xfffffc0ffff; + t1 >>= 24; + r2 = t1 & 0x00ffffffc0f; + + // store r in un-used space of st->P[1] + p = &st->P[1]; + p->R20.d[1] = (uint32_t)(r0); + p->R20.d[3] = (uint32_t)(r0 >> 32); + p->R21.d[1] = (uint32_t)(r1); + p->R21.d[3] = (uint32_t)(r1 >> 32); + p->R22.d[1] = (uint32_t)(r2); + p->R22.d[3] = (uint32_t)(r2 >> 32); + + // store pad + p->R23.d[1] = CRYPTO_load_u32_le(key + 16); + p->R23.d[3] = CRYPTO_load_u32_le(key + 20); + p->R24.d[1] = CRYPTO_load_u32_le(key + 24); + p->R24.d[3] = CRYPTO_load_u32_le(key + 28); + + // H = 0 + st->H[0] = _mm_setzero_si128(); + st->H[1] = _mm_setzero_si128(); + st->H[2] = _mm_setzero_si128(); + st->H[3] = _mm_setzero_si128(); + st->H[4] = _mm_setzero_si128(); + + st->started = 0; + st->leftover = 0; +} + +namespace { + +void poly1305_first_block(poly1305_state_internal *st, const uint8_t *m) { + const xmmi MMASK = + _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask); + const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5); + const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128); + xmmi T5, T6; + poly1305_power *p; + uint128_t d[3]; + uint64_t r0, r1, r2; + uint64_t r20, r21, r22, s22; + uint64_t pad0, pad1; + uint64_t c; + uint64_t i; + + // pull out stored info + p = &st->P[1]; + + r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1]; + r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1]; + r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1]; + pad0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1]; + pad1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1]; + + // 
compute powers r^2,r^4 + r20 = r0; + r21 = r1; + r22 = r2; + for (i = 0; i < 2; i++) { + s22 = r22 * (5 << 2); + + d[0] = add128(mul64x64_128(r20, r20), mul64x64_128(r21 * 2, s22)); + d[1] = add128(mul64x64_128(r22, s22), mul64x64_128(r20 * 2, r21)); + d[2] = add128(mul64x64_128(r21, r21), mul64x64_128(r22 * 2, r20)); + + r20 = lo128(d[0]) & 0xfffffffffff; + c = shr128(d[0], 44); + d[1] = add128_64(d[1], c); + r21 = lo128(d[1]) & 0xfffffffffff; + c = shr128(d[1], 44); + d[2] = add128_64(d[2], c); + r22 = lo128(d[2]) & 0x3ffffffffff; + c = shr128(d[2], 42); + r20 += c * 5; + c = (r20 >> 44); + r20 = r20 & 0xfffffffffff; + r21 += c; + + p->R20.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)(r20) & 0x3ffffff), + _MM_SHUFFLE(1, 0, 1, 0)); + p->R21.v = _mm_shuffle_epi32( + _mm_cvtsi32_si128((uint32_t)((r20 >> 26) | (r21 << 18)) & 0x3ffffff), + _MM_SHUFFLE(1, 0, 1, 0)); + p->R22.v = + _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r21 >> 8)) & 0x3ffffff), + _MM_SHUFFLE(1, 0, 1, 0)); + p->R23.v = _mm_shuffle_epi32( + _mm_cvtsi32_si128((uint32_t)((r21 >> 34) | (r22 << 10)) & 0x3ffffff), + _MM_SHUFFLE(1, 0, 1, 0)); + p->R24.v = _mm_shuffle_epi32(_mm_cvtsi32_si128((uint32_t)((r22 >> 16))), + _MM_SHUFFLE(1, 0, 1, 0)); + p->S21.v = _mm_mul_epu32(p->R21.v, FIVE); + p->S22.v = _mm_mul_epu32(p->R22.v, FIVE); + p->S23.v = _mm_mul_epu32(p->R23.v, FIVE); + p->S24.v = _mm_mul_epu32(p->R24.v, FIVE); + p--; + } + + // put saved info back + p = &st->P[1]; + p->R20.d[1] = (uint32_t)(r0); + p->R20.d[3] = (uint32_t)(r0 >> 32); + p->R21.d[1] = (uint32_t)(r1); + p->R21.d[3] = (uint32_t)(r1 >> 32); + p->R22.d[1] = (uint32_t)(r2); + p->R22.d[3] = (uint32_t)(r2 >> 32); + p->R23.d[1] = (uint32_t)(pad0); + p->R23.d[3] = (uint32_t)(pad0 >> 32); + p->R24.d[1] = (uint32_t)(pad1); + p->R24.d[3] = (uint32_t)(pad1 >> 32); + + // H = [Mx,My] + T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)), + _mm_loadl_epi64((const xmmi *)(m + 16))); + T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const 
xmmi *)(m + 8)), + _mm_loadl_epi64((const xmmi *)(m + 24))); + st->H[0] = _mm_and_si128(MMASK, T5); + st->H[1] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); + T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12)); + st->H[2] = _mm_and_si128(MMASK, T5); + st->H[3] = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); + st->H[4] = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT); +} + +void poly1305_blocks(poly1305_state_internal *st, const uint8_t *m, + size_t bytes) { + const xmmi MMASK = + _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask); + const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5); + const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128); + + poly1305_power *p; + xmmi H0, H1, H2, H3, H4; + xmmi T0, T1, T2, T3, T4, T5, T6; + xmmi M0, M1, M2, M3, M4; + xmmi C1, C2; + + H0 = st->H[0]; + H1 = st->H[1]; + H2 = st->H[2]; + H3 = st->H[3]; + H4 = st->H[4]; + + while (bytes >= 64) { + // H *= [r^4,r^4] + p = &st->P[0]; + T0 = _mm_mul_epu32(H0, p->R20.v); + T1 = _mm_mul_epu32(H0, p->R21.v); + T2 = _mm_mul_epu32(H0, p->R22.v); + T3 = _mm_mul_epu32(H0, p->R23.v); + T4 = _mm_mul_epu32(H0, p->R24.v); + T5 = _mm_mul_epu32(H1, p->S24.v); + T6 = _mm_mul_epu32(H1, p->R20.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H2, p->S23.v); + T6 = _mm_mul_epu32(H2, p->S24.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H3, p->S22.v); + T6 = _mm_mul_epu32(H3, p->S23.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H4, p->S21.v); + T6 = _mm_mul_epu32(H4, p->S22.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H1, p->R21.v); + T6 = _mm_mul_epu32(H1, p->R22.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H2, p->R20.v); + T6 = _mm_mul_epu32(H2, p->R21.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H3, p->S24.v); + T6 
= _mm_mul_epu32(H3, p->R20.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H4, p->S23.v); + T6 = _mm_mul_epu32(H4, p->S24.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H1, p->R23.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(H2, p->R22.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(H3, p->R21.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(H4, p->R20.v); + T4 = _mm_add_epi64(T4, T5); + + // H += [Mx,My]*[r^2,r^2] + T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)), + _mm_loadl_epi64((const xmmi *)(m + 16))); + T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)), + _mm_loadl_epi64((const xmmi *)(m + 24))); + M0 = _mm_and_si128(MMASK, T5); + M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); + T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12)); + M2 = _mm_and_si128(MMASK, T5); + M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); + M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT); + + p = &st->P[1]; + T5 = _mm_mul_epu32(M0, p->R20.v); + T6 = _mm_mul_epu32(M0, p->R21.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(M1, p->S24.v); + T6 = _mm_mul_epu32(M1, p->R20.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(M2, p->S23.v); + T6 = _mm_mul_epu32(M2, p->S24.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(M3, p->S22.v); + T6 = _mm_mul_epu32(M3, p->S23.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(M4, p->S21.v); + T6 = _mm_mul_epu32(M4, p->S22.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(M0, p->R22.v); + T6 = _mm_mul_epu32(M0, p->R23.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(M1, p->R21.v); + T6 = _mm_mul_epu32(M1, p->R22.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(M2, 
p->R20.v); + T6 = _mm_mul_epu32(M2, p->R21.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(M3, p->S24.v); + T6 = _mm_mul_epu32(M3, p->R20.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(M4, p->S23.v); + T6 = _mm_mul_epu32(M4, p->S24.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(M0, p->R24.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(M1, p->R23.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(M2, p->R22.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(M3, p->R21.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(M4, p->R20.v); + T4 = _mm_add_epi64(T4, T5); + + // H += [Mx,My] + T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 32)), + _mm_loadl_epi64((const xmmi *)(m + 48))); + T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 40)), + _mm_loadl_epi64((const xmmi *)(m + 56))); + M0 = _mm_and_si128(MMASK, T5); + M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); + T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12)); + M2 = _mm_and_si128(MMASK, T5); + M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); + M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT); + + T0 = _mm_add_epi64(T0, M0); + T1 = _mm_add_epi64(T1, M1); + T2 = _mm_add_epi64(T2, M2); + T3 = _mm_add_epi64(T3, M3); + T4 = _mm_add_epi64(T4, M4); + + // reduce + C1 = _mm_srli_epi64(T0, 26); + C2 = _mm_srli_epi64(T3, 26); + T0 = _mm_and_si128(T0, MMASK); + T3 = _mm_and_si128(T3, MMASK); + T1 = _mm_add_epi64(T1, C1); + T4 = _mm_add_epi64(T4, C2); + C1 = _mm_srli_epi64(T1, 26); + C2 = _mm_srli_epi64(T4, 26); + T1 = _mm_and_si128(T1, MMASK); + T4 = _mm_and_si128(T4, MMASK); + T2 = _mm_add_epi64(T2, C1); + T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE)); + C1 = _mm_srli_epi64(T2, 26); + C2 = _mm_srli_epi64(T0, 26); + T2 = _mm_and_si128(T2, MMASK); + T0 = _mm_and_si128(T0, MMASK); + T3 = _mm_add_epi64(T3, C1); + T1 = _mm_add_epi64(T1, C2); + C1 = 
_mm_srli_epi64(T3, 26); + T3 = _mm_and_si128(T3, MMASK); + T4 = _mm_add_epi64(T4, C1); + + // H = (H*[r^4,r^4] + [Mx,My]*[r^2,r^2] + [Mx,My]) + H0 = T0; + H1 = T1; + H2 = T2; + H3 = T3; + H4 = T4; + + m += 64; + bytes -= 64; + } + + st->H[0] = H0; + st->H[1] = H1; + st->H[2] = H2; + st->H[3] = H3; + st->H[4] = H4; +} + +size_t poly1305_combine(poly1305_state_internal *st, const uint8_t *m, + size_t bytes) { + const xmmi MMASK = + _mm_load_si128((const xmmi *)poly1305_x64_sse2_message_mask); + const xmmi HIBIT = _mm_load_si128((const xmmi *)poly1305_x64_sse2_1shl128); + const xmmi FIVE = _mm_load_si128((const xmmi *)poly1305_x64_sse2_5); + + poly1305_power *p; + xmmi H0, H1, H2, H3, H4; + xmmi M0, M1, M2, M3, M4; + xmmi T0, T1, T2, T3, T4, T5, T6; + xmmi C1, C2; + + uint64_t r0, r1, r2; + uint64_t t0, t1, t2, t3, t4; + uint64_t c; + size_t consumed = 0; + + H0 = st->H[0]; + H1 = st->H[1]; + H2 = st->H[2]; + H3 = st->H[3]; + H4 = st->H[4]; + + // p = [r^2,r^2] + p = &st->P[1]; + + if (bytes >= 32) { + // H *= [r^2,r^2] + T0 = _mm_mul_epu32(H0, p->R20.v); + T1 = _mm_mul_epu32(H0, p->R21.v); + T2 = _mm_mul_epu32(H0, p->R22.v); + T3 = _mm_mul_epu32(H0, p->R23.v); + T4 = _mm_mul_epu32(H0, p->R24.v); + T5 = _mm_mul_epu32(H1, p->S24.v); + T6 = _mm_mul_epu32(H1, p->R20.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H2, p->S23.v); + T6 = _mm_mul_epu32(H2, p->S24.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H3, p->S22.v); + T6 = _mm_mul_epu32(H3, p->S23.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H4, p->S21.v); + T6 = _mm_mul_epu32(H4, p->S22.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H1, p->R21.v); + T6 = _mm_mul_epu32(H1, p->R22.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H2, p->R20.v); + T6 = _mm_mul_epu32(H2, p->R21.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, 
T6); + T5 = _mm_mul_epu32(H3, p->S24.v); + T6 = _mm_mul_epu32(H3, p->R20.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H4, p->S23.v); + T6 = _mm_mul_epu32(H4, p->S24.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H1, p->R23.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(H2, p->R22.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(H3, p->R21.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(H4, p->R20.v); + T4 = _mm_add_epi64(T4, T5); + + // H += [Mx,My] + T5 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 0)), + _mm_loadl_epi64((const xmmi *)(m + 16))); + T6 = _mm_unpacklo_epi64(_mm_loadl_epi64((const xmmi *)(m + 8)), + _mm_loadl_epi64((const xmmi *)(m + 24))); + M0 = _mm_and_si128(MMASK, T5); + M1 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); + T5 = _mm_or_si128(_mm_srli_epi64(T5, 52), _mm_slli_epi64(T6, 12)); + M2 = _mm_and_si128(MMASK, T5); + M3 = _mm_and_si128(MMASK, _mm_srli_epi64(T5, 26)); + M4 = _mm_or_si128(_mm_srli_epi64(T6, 40), HIBIT); + + T0 = _mm_add_epi64(T0, M0); + T1 = _mm_add_epi64(T1, M1); + T2 = _mm_add_epi64(T2, M2); + T3 = _mm_add_epi64(T3, M3); + T4 = _mm_add_epi64(T4, M4); + + // reduce + C1 = _mm_srli_epi64(T0, 26); + C2 = _mm_srli_epi64(T3, 26); + T0 = _mm_and_si128(T0, MMASK); + T3 = _mm_and_si128(T3, MMASK); + T1 = _mm_add_epi64(T1, C1); + T4 = _mm_add_epi64(T4, C2); + C1 = _mm_srli_epi64(T1, 26); + C2 = _mm_srli_epi64(T4, 26); + T1 = _mm_and_si128(T1, MMASK); + T4 = _mm_and_si128(T4, MMASK); + T2 = _mm_add_epi64(T2, C1); + T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE)); + C1 = _mm_srli_epi64(T2, 26); + C2 = _mm_srli_epi64(T0, 26); + T2 = _mm_and_si128(T2, MMASK); + T0 = _mm_and_si128(T0, MMASK); + T3 = _mm_add_epi64(T3, C1); + T1 = _mm_add_epi64(T1, C2); + C1 = _mm_srli_epi64(T3, 26); + T3 = _mm_and_si128(T3, MMASK); + T4 = _mm_add_epi64(T4, C1); + + // H = (H*[r^2,r^2] + [Mx,My]) + H0 = T0; + H1 = T1; + H2 = T2; + H3 = T3; + H4 = 
T4; + + consumed = 32; + } + + // finalize, H *= [r^2,r] + r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1]; + r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1]; + r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1]; + + p->R20.d[2] = (uint32_t)(r0) & 0x3ffffff; + p->R21.d[2] = (uint32_t)((r0 >> 26) | (r1 << 18)) & 0x3ffffff; + p->R22.d[2] = (uint32_t)((r1 >> 8)) & 0x3ffffff; + p->R23.d[2] = (uint32_t)((r1 >> 34) | (r2 << 10)) & 0x3ffffff; + p->R24.d[2] = (uint32_t)((r2 >> 16)); + p->S21.d[2] = p->R21.d[2] * 5; + p->S22.d[2] = p->R22.d[2] * 5; + p->S23.d[2] = p->R23.d[2] * 5; + p->S24.d[2] = p->R24.d[2] * 5; + + // H *= [r^2,r] + T0 = _mm_mul_epu32(H0, p->R20.v); + T1 = _mm_mul_epu32(H0, p->R21.v); + T2 = _mm_mul_epu32(H0, p->R22.v); + T3 = _mm_mul_epu32(H0, p->R23.v); + T4 = _mm_mul_epu32(H0, p->R24.v); + T5 = _mm_mul_epu32(H1, p->S24.v); + T6 = _mm_mul_epu32(H1, p->R20.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H2, p->S23.v); + T6 = _mm_mul_epu32(H2, p->S24.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H3, p->S22.v); + T6 = _mm_mul_epu32(H3, p->S23.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H4, p->S21.v); + T6 = _mm_mul_epu32(H4, p->S22.v); + T0 = _mm_add_epi64(T0, T5); + T1 = _mm_add_epi64(T1, T6); + T5 = _mm_mul_epu32(H1, p->R21.v); + T6 = _mm_mul_epu32(H1, p->R22.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H2, p->R20.v); + T6 = _mm_mul_epu32(H2, p->R21.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H3, p->S24.v); + T6 = _mm_mul_epu32(H3, p->R20.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H4, p->S23.v); + T6 = _mm_mul_epu32(H4, p->S24.v); + T2 = _mm_add_epi64(T2, T5); + T3 = _mm_add_epi64(T3, T6); + T5 = _mm_mul_epu32(H1, p->R23.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(H2, p->R22.v); 
+ T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(H3, p->R21.v); + T4 = _mm_add_epi64(T4, T5); + T5 = _mm_mul_epu32(H4, p->R20.v); + T4 = _mm_add_epi64(T4, T5); + + C1 = _mm_srli_epi64(T0, 26); + C2 = _mm_srli_epi64(T3, 26); + T0 = _mm_and_si128(T0, MMASK); + T3 = _mm_and_si128(T3, MMASK); + T1 = _mm_add_epi64(T1, C1); + T4 = _mm_add_epi64(T4, C2); + C1 = _mm_srli_epi64(T1, 26); + C2 = _mm_srli_epi64(T4, 26); + T1 = _mm_and_si128(T1, MMASK); + T4 = _mm_and_si128(T4, MMASK); + T2 = _mm_add_epi64(T2, C1); + T0 = _mm_add_epi64(T0, _mm_mul_epu32(C2, FIVE)); + C1 = _mm_srli_epi64(T2, 26); + C2 = _mm_srli_epi64(T0, 26); + T2 = _mm_and_si128(T2, MMASK); + T0 = _mm_and_si128(T0, MMASK); + T3 = _mm_add_epi64(T3, C1); + T1 = _mm_add_epi64(T1, C2); + C1 = _mm_srli_epi64(T3, 26); + T3 = _mm_and_si128(T3, MMASK); + T4 = _mm_add_epi64(T4, C1); + + // H = H[0]+H[1] + H0 = _mm_add_epi64(T0, _mm_srli_si128(T0, 8)); + H1 = _mm_add_epi64(T1, _mm_srli_si128(T1, 8)); + H2 = _mm_add_epi64(T2, _mm_srli_si128(T2, 8)); + H3 = _mm_add_epi64(T3, _mm_srli_si128(T3, 8)); + H4 = _mm_add_epi64(T4, _mm_srli_si128(T4, 8)); + + t0 = _mm_cvtsi128_si32(H0); + c = (t0 >> 26); + t0 &= 0x3ffffff; + t1 = _mm_cvtsi128_si32(H1) + c; + c = (t1 >> 26); + t1 &= 0x3ffffff; + t2 = _mm_cvtsi128_si32(H2) + c; + c = (t2 >> 26); + t2 &= 0x3ffffff; + t3 = _mm_cvtsi128_si32(H3) + c; + c = (t3 >> 26); + t3 &= 0x3ffffff; + t4 = _mm_cvtsi128_si32(H4) + c; + c = (t4 >> 26); + t4 &= 0x3ffffff; + t0 = t0 + (c * 5); + c = (t0 >> 26); + t0 &= 0x3ffffff; + t1 = t1 + c; + + st->HH[0] = ((t0) | (t1 << 26)) & UINT64_C(0xfffffffffff); + st->HH[1] = ((t1 >> 18) | (t2 << 8) | (t3 << 34)) & UINT64_C(0xfffffffffff); + st->HH[2] = ((t3 >> 10) | (t4 << 16)) & UINT64_C(0x3ffffffffff); + + return consumed; +} + +} // namespace + +void CRYPTO_poly1305_update(poly1305_state *state, const uint8_t *m, + size_t bytes) { + poly1305_state_internal *st = poly1305_aligned_state(state); + size_t want; + + // Work around a C language bug. 
See https://crbug.com/1019588. + if (bytes == 0) { + return; + } + + // need at least 32 initial bytes to start the accelerated branch + if (!st->started) { + if ((st->leftover == 0) && (bytes > 32)) { + poly1305_first_block(st, m); + m += 32; + bytes -= 32; + } else { + want = poly1305_min(32 - st->leftover, bytes); + OPENSSL_memcpy(st->buffer + st->leftover, m, want); + bytes -= want; + m += want; + st->leftover += want; + if ((st->leftover < 32) || (bytes == 0)) { + return; + } + poly1305_first_block(st, st->buffer); + st->leftover = 0; + } + st->started = 1; + } + + // handle leftover + if (st->leftover) { + want = poly1305_min(64 - st->leftover, bytes); + OPENSSL_memcpy(st->buffer + st->leftover, m, want); + bytes -= want; + m += want; + st->leftover += want; + if (st->leftover < 64) { + return; + } + poly1305_blocks(st, st->buffer, 64); + st->leftover = 0; + } + + // process 64 byte blocks + if (bytes >= 64) { + want = (bytes & ~63); + poly1305_blocks(st, m, want); + m += want; + bytes -= want; + } + + if (bytes) { + OPENSSL_memcpy(st->buffer + st->leftover, m, bytes); + st->leftover += bytes; + } +} + +void CRYPTO_poly1305_finish(poly1305_state *state, uint8_t mac[16]) { + poly1305_state_internal *st = poly1305_aligned_state(state); + size_t leftover = st->leftover; + uint8_t *m = st->buffer; + uint128_t d[3]; + uint64_t h0, h1, h2; + uint64_t t0, t1; + uint64_t g0, g1, g2, c, nc; + uint64_t r0, r1, r2, s1, s2; + poly1305_power *p; + + if (st->started) { + size_t consumed = poly1305_combine(st, m, leftover); + leftover -= consumed; + m += consumed; + } + + // st->HH will either be 0 or have the combined result + h0 = st->HH[0]; + h1 = st->HH[1]; + h2 = st->HH[2]; + + p = &st->P[1]; + r0 = ((uint64_t)p->R20.d[3] << 32) | (uint64_t)p->R20.d[1]; + r1 = ((uint64_t)p->R21.d[3] << 32) | (uint64_t)p->R21.d[1]; + r2 = ((uint64_t)p->R22.d[3] << 32) | (uint64_t)p->R22.d[1]; + s1 = r1 * (5 << 2); + s2 = r2 * (5 << 2); + + if (leftover < 16) { + goto 
poly1305_donna_atmost15bytes; + } + +poly1305_donna_atleast16bytes: + t0 = CRYPTO_load_u64_le(m + 0); + t1 = CRYPTO_load_u64_le(m + 8); + h0 += t0 & 0xfffffffffff; + t0 = shr128_pair(t1, t0, 44); + h1 += t0 & 0xfffffffffff; + h2 += (t1 >> 24) | ((uint64_t)1 << 40); + +poly1305_donna_mul: + d[0] = add128(add128(mul64x64_128(h0, r0), mul64x64_128(h1, s2)), + mul64x64_128(h2, s1)); + d[1] = add128(add128(mul64x64_128(h0, r1), mul64x64_128(h1, r0)), + mul64x64_128(h2, s2)); + d[2] = add128(add128(mul64x64_128(h0, r2), mul64x64_128(h1, r1)), + mul64x64_128(h2, r0)); + h0 = lo128(d[0]) & 0xfffffffffff; + c = shr128(d[0], 44); + d[1] = add128_64(d[1], c); + h1 = lo128(d[1]) & 0xfffffffffff; + c = shr128(d[1], 44); + d[2] = add128_64(d[2], c); + h2 = lo128(d[2]) & 0x3ffffffffff; + c = shr128(d[2], 42); + h0 += c * 5; + + m += 16; + leftover -= 16; + if (leftover >= 16) { + goto poly1305_donna_atleast16bytes; + } + +// final bytes +poly1305_donna_atmost15bytes: + if (!leftover) { + goto poly1305_donna_finish; + } + + m[leftover++] = 1; + OPENSSL_memset(m + leftover, 0, 16 - leftover); + leftover = 16; + + t0 = CRYPTO_load_u64_le(m + 0); + t1 = CRYPTO_load_u64_le(m + 8); + h0 += t0 & 0xfffffffffff; + t0 = shr128_pair(t1, t0, 44); + h1 += t0 & 0xfffffffffff; + h2 += (t1 >> 24); + + goto poly1305_donna_mul; + +poly1305_donna_finish: + c = (h0 >> 44); + h0 &= 0xfffffffffff; + h1 += c; + c = (h1 >> 44); + h1 &= 0xfffffffffff; + h2 += c; + c = (h2 >> 42); + h2 &= 0x3ffffffffff; + h0 += c * 5; + + g0 = h0 + 5; + c = (g0 >> 44); + g0 &= 0xfffffffffff; + g1 = h1 + c; + c = (g1 >> 44); + g1 &= 0xfffffffffff; + g2 = h2 + c - ((uint64_t)1 << 42); + + c = (g2 >> 63) - 1; + nc = ~c; + h0 = (h0 & nc) | (g0 & c); + h1 = (h1 & nc) | (g1 & c); + h2 = (h2 & nc) | (g2 & c); + + // pad + t0 = ((uint64_t)p->R23.d[3] << 32) | (uint64_t)p->R23.d[1]; + t1 = ((uint64_t)p->R24.d[3] << 32) | (uint64_t)p->R24.d[1]; + h0 += (t0 & 0xfffffffffff); + c = (h0 >> 44); + h0 &= 0xfffffffffff; + t0 = 
shr128_pair(t1, t0, 44); + h1 += (t0 & 0xfffffffffff) + c; + c = (h1 >> 44); + h1 &= 0xfffffffffff; + t1 = (t1 >> 24); + h2 += (t1) + c; + + CRYPTO_store_u64_le(mac + 0, ((h0) | (h1 << 44))); + CRYPTO_store_u64_le(mac + 8, ((h1 >> 20) | (h2 << 24))); +} + +#endif // BORINGSSL_HAS_UINT128 && OPENSSL_X86_64 diff --git a/third_party/boringssl/src/crypto/pool/internal.h b/third_party/boringssl/src/crypto/pool/internal.h index f9f4838b..6f8165ff 100644 --- a/third_party/boringssl/src/crypto/pool/internal.h +++ b/third_party/boringssl/src/crypto/pool/internal.h @@ -1,50 +1,95 @@ -/* Copyright (c) 2016, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#ifndef OPENSSL_HEADER_POOL_INTERNAL_H -#define OPENSSL_HEADER_POOL_INTERNAL_H - -#include -#include +// Copyright 2016 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +#ifndef OPENSSL_HEADER_CRYPTO_POOL_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_POOL_INTERNAL_H + +#include "../internal.h" #include "../lhash/internal.h" +#include "../mem_internal.h" + +DECLARE_OPAQUE_STRUCT(crypto_buffer_st, CryptoBuffer) +DECLARE_OPAQUE_STRUCT(crypto_buffer_pool_st, CryptoBufferPool) -#if defined(__cplusplus) -extern "C" { -#endif +BSSL_NAMESPACE_BEGIN +// A CryptoBufferPoolHandle is the portion of the pool that lasts as long as any +// live buffer or pool. This allows buffers to outlive the pool. (The pool is +// only needed as long as callers wish to create new buffers.) +class CryptoBufferPoolHandle : public RefCounted { + public: + explicit CryptoBufferPoolHandle(CryptoBufferPool *pool) + : RefCounted(CheckSubClass()), pool_(pool) {} -DEFINE_LHASH_OF(CRYPTO_BUFFER) + // pool_ is protected by lock_. + Mutex lock_; + CryptoBufferPool *pool_ = nullptr; -struct crypto_buffer_st { - CRYPTO_BUFFER_POOL *pool; - uint8_t *data; - size_t len; - CRYPTO_refcount_t references; - int data_is_static; + private: + friend RefCounted; + ~CryptoBufferPoolHandle() = default; }; -struct crypto_buffer_pool_st { - LHASH_OF(CRYPTO_BUFFER) *bufs; - CRYPTO_MUTEX lock; - const uint64_t hash_key[2]; +class CryptoBuffer : public crypto_buffer_st { + public: + CryptoBuffer() = default; + CryptoBuffer(const CryptoBuffer &) = delete; + CryptoBuffer &operator=(const CryptoBuffer &) = delete; + + Span span() const { return Span(data_, len_); } + + // Instead of subclassing RefCounted, implement refcounting by hand. + // CryptoBuffer's refcounting must synchronize with CryptoBufferPool. 
+ static constexpr bool kAllowRefCountedUniquePtr = true; + void UpRefInternal() const; + void DecRefInternal(); + + UniquePtr pool_handle_; + uint8_t *data_ = nullptr; + size_t len_ = 0; + mutable CRYPTO_refcount_t references_ = 1; + bool data_is_static_ = false; + + private: + ~CryptoBuffer(); }; +DEFINE_LHASH_OF(CryptoBuffer) + +class CryptoBufferPool : public crypto_buffer_pool_st, + public RefCounted { + public: + CryptoBufferPool(); + + // Hash returns the hash of |data|. + uint32_t Hash(Span data) const; + + // FindBufferLocked looks for a buffer with hash |hash| and contents |data|. + // It returns it if found and nullptr otherwise. |handle_->lock_| must be + // locked for reading or writing before calling this. + CryptoBuffer *FindBufferLocked(uint32_t hash, Span data); + + UniquePtr handle_; + LHASH_OF(CryptoBuffer) *bufs_ = nullptr; + uint64_t hash_key_[2]; + + private: + friend RefCounted; + ~CryptoBufferPool(); +}; -#if defined(__cplusplus) -} // extern C -#endif +BSSL_NAMESPACE_END -#endif // OPENSSL_HEADER_POOL_INTERNAL_H +#endif // OPENSSL_HEADER_CRYPTO_POOL_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/pool/pool.c b/third_party/boringssl/src/crypto/pool/pool.c deleted file mode 100644 index e889f521..00000000 --- a/third_party/boringssl/src/crypto/pool/pool.c +++ /dev/null @@ -1,264 +0,0 @@ -/* Copyright (c) 2016, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include - -#include -#include - -#include -#include -#include -#include -#include - -#include "../internal.h" -#include "internal.h" - - -static uint32_t CRYPTO_BUFFER_hash(const CRYPTO_BUFFER *buf) { - return (uint32_t)SIPHASH_24(buf->pool->hash_key, buf->data, buf->len); -} - -static int CRYPTO_BUFFER_cmp(const CRYPTO_BUFFER *a, const CRYPTO_BUFFER *b) { - // Only |CRYPTO_BUFFER|s from the same pool have compatible hashes. - assert(a->pool != NULL); - assert(a->pool == b->pool); - if (a->len != b->len) { - return 1; - } - return OPENSSL_memcmp(a->data, b->data, a->len); -} - -CRYPTO_BUFFER_POOL* CRYPTO_BUFFER_POOL_new(void) { - CRYPTO_BUFFER_POOL *pool = OPENSSL_malloc(sizeof(CRYPTO_BUFFER_POOL)); - if (pool == NULL) { - return NULL; - } - - OPENSSL_memset(pool, 0, sizeof(CRYPTO_BUFFER_POOL)); - pool->bufs = lh_CRYPTO_BUFFER_new(CRYPTO_BUFFER_hash, CRYPTO_BUFFER_cmp); - if (pool->bufs == NULL) { - OPENSSL_free(pool); - return NULL; - } - - CRYPTO_MUTEX_init(&pool->lock); - RAND_bytes((uint8_t *)&pool->hash_key, sizeof(pool->hash_key)); - - return pool; -} - -void CRYPTO_BUFFER_POOL_free(CRYPTO_BUFFER_POOL *pool) { - if (pool == NULL) { - return; - } - -#if !defined(NDEBUG) - CRYPTO_MUTEX_lock_write(&pool->lock); - assert(lh_CRYPTO_BUFFER_num_items(pool->bufs) == 0); - CRYPTO_MUTEX_unlock_write(&pool->lock); -#endif - - lh_CRYPTO_BUFFER_free(pool->bufs); - CRYPTO_MUTEX_cleanup(&pool->lock); - OPENSSL_free(pool); -} - -static void crypto_buffer_free_object(CRYPTO_BUFFER *buf) { - if (!buf->data_is_static) { - OPENSSL_free(buf->data); - } - OPENSSL_free(buf); -} - -static CRYPTO_BUFFER *crypto_buffer_new(const uint8_t *data, 
size_t len, - int data_is_static, - CRYPTO_BUFFER_POOL *pool) { - if (pool != NULL) { - CRYPTO_BUFFER tmp; - tmp.data = (uint8_t *) data; - tmp.len = len; - tmp.pool = pool; - - CRYPTO_MUTEX_lock_read(&pool->lock); - CRYPTO_BUFFER *duplicate = lh_CRYPTO_BUFFER_retrieve(pool->bufs, &tmp); - if (data_is_static && duplicate != NULL && !duplicate->data_is_static) { - // If the new |CRYPTO_BUFFER| would have static data, but the duplicate - // does not, we replace the old one with the new static version. - duplicate = NULL; - } - if (duplicate != NULL) { - CRYPTO_refcount_inc(&duplicate->references); - } - CRYPTO_MUTEX_unlock_read(&pool->lock); - - if (duplicate != NULL) { - return duplicate; - } - } - - CRYPTO_BUFFER *const buf = OPENSSL_malloc(sizeof(CRYPTO_BUFFER)); - if (buf == NULL) { - return NULL; - } - OPENSSL_memset(buf, 0, sizeof(CRYPTO_BUFFER)); - - if (data_is_static) { - buf->data = (uint8_t *)data; - buf->data_is_static = 1; - } else { - buf->data = OPENSSL_memdup(data, len); - if (len != 0 && buf->data == NULL) { - OPENSSL_free(buf); - return NULL; - } - } - - buf->len = len; - buf->references = 1; - - if (pool == NULL) { - return buf; - } - - buf->pool = pool; - - CRYPTO_MUTEX_lock_write(&pool->lock); - CRYPTO_BUFFER *duplicate = lh_CRYPTO_BUFFER_retrieve(pool->bufs, buf); - if (data_is_static && duplicate != NULL && !duplicate->data_is_static) { - // If the new |CRYPTO_BUFFER| would have static data, but the duplicate does - // not, we replace the old one with the new static version. - duplicate = NULL; - } - int inserted = 0; - if (duplicate == NULL) { - CRYPTO_BUFFER *old = NULL; - inserted = lh_CRYPTO_BUFFER_insert(pool->bufs, &old, buf); - // |old| may be non-NULL if a match was found but ignored. |pool->bufs| does - // not increment refcounts, so there is no need to clean up after the - // replacement. 
- } else { - CRYPTO_refcount_inc(&duplicate->references); - } - CRYPTO_MUTEX_unlock_write(&pool->lock); - - if (!inserted) { - // We raced to insert |buf| into the pool and lost, or else there was an - // error inserting. - crypto_buffer_free_object(buf); - return duplicate; - } - - return buf; -} - -CRYPTO_BUFFER *CRYPTO_BUFFER_new(const uint8_t *data, size_t len, - CRYPTO_BUFFER_POOL *pool) { - return crypto_buffer_new(data, len, /*data_is_static=*/0, pool); -} - -CRYPTO_BUFFER *CRYPTO_BUFFER_alloc(uint8_t **out_data, size_t len) { - CRYPTO_BUFFER *const buf = OPENSSL_malloc(sizeof(CRYPTO_BUFFER)); - if (buf == NULL) { - return NULL; - } - OPENSSL_memset(buf, 0, sizeof(CRYPTO_BUFFER)); - - buf->data = OPENSSL_malloc(len); - if (len != 0 && buf->data == NULL) { - OPENSSL_free(buf); - return NULL; - } - buf->len = len; - buf->references = 1; - - *out_data = buf->data; - return buf; -} - -CRYPTO_BUFFER *CRYPTO_BUFFER_new_from_CBS(const CBS *cbs, - CRYPTO_BUFFER_POOL *pool) { - return CRYPTO_BUFFER_new(CBS_data(cbs), CBS_len(cbs), pool); -} - -CRYPTO_BUFFER *CRYPTO_BUFFER_new_from_static_data_unsafe( - const uint8_t *data, size_t len, CRYPTO_BUFFER_POOL *pool) { - return crypto_buffer_new(data, len, /*data_is_static=*/1, pool); -} - -void CRYPTO_BUFFER_free(CRYPTO_BUFFER *buf) { - if (buf == NULL) { - return; - } - - CRYPTO_BUFFER_POOL *const pool = buf->pool; - if (pool == NULL) { - if (CRYPTO_refcount_dec_and_test_zero(&buf->references)) { - // If a reference count of zero is observed, there cannot be a reference - // from any pool to this buffer and thus we are able to free this - // buffer. - crypto_buffer_free_object(buf); - } - - return; - } - - CRYPTO_MUTEX_lock_write(&pool->lock); - if (!CRYPTO_refcount_dec_and_test_zero(&buf->references)) { - CRYPTO_MUTEX_unlock_write(&buf->pool->lock); - return; - } - - // We have an exclusive lock on the pool, therefore no concurrent lookups can - // find this buffer and increment the reference count. 
Thus, if the count is - // zero there are and can never be any more references and thus we can free - // this buffer. - // - // Note it is possible |buf| is no longer in the pool, if it was replaced by a - // static version. If that static version was since removed, it is even - // possible for |found| to be NULL. - CRYPTO_BUFFER *found = lh_CRYPTO_BUFFER_retrieve(pool->bufs, buf); - if (found == buf) { - found = lh_CRYPTO_BUFFER_delete(pool->bufs, buf); - assert(found == buf); - (void)found; - } - - CRYPTO_MUTEX_unlock_write(&buf->pool->lock); - crypto_buffer_free_object(buf); -} - -int CRYPTO_BUFFER_up_ref(CRYPTO_BUFFER *buf) { - // This is safe in the case that |buf->pool| is NULL because it's just - // standard reference counting in that case. - // - // This is also safe if |buf->pool| is non-NULL because, if it were racing - // with |CRYPTO_BUFFER_free| then the two callers must have independent - // references already and so the reference count will never hit zero. - CRYPTO_refcount_inc(&buf->references); - return 1; -} - -const uint8_t *CRYPTO_BUFFER_data(const CRYPTO_BUFFER *buf) { - return buf->data; -} - -size_t CRYPTO_BUFFER_len(const CRYPTO_BUFFER *buf) { - return buf->len; -} - -void CRYPTO_BUFFER_init_CBS(const CRYPTO_BUFFER *buf, CBS *out) { - CBS_init(out, buf->data, buf->len); -} diff --git a/third_party/boringssl/src/crypto/pool/pool.cc b/third_party/boringssl/src/crypto/pool/pool.cc new file mode 100644 index 00000000..c9f7bc80 --- /dev/null +++ b/third_party/boringssl/src/crypto/pool/pool.cc @@ -0,0 +1,282 @@ +// Copyright 2016 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + +#include +#include +#include +#include + +#include "../internal.h" +#include "../mem_internal.h" +#include "internal.h" + + +using namespace bssl; + +static uint32_t CRYPTO_BUFFER_hash(const CryptoBuffer *buf) { + // This function must be called while there is a read or write lock on the + // pool, so it is safe to read |pool_|. + return buf->pool_handle_->pool_->Hash(buf->span()); +} + +static int CRYPTO_BUFFER_cmp(const CryptoBuffer *a, const CryptoBuffer *b) { + // Only |CRYPTO_BUFFER|s from the same pool have compatible hashes. + assert(a->pool_handle_ != nullptr); + assert(a->pool_handle_ == b->pool_handle_); + return a->span() == b->span() ? 0 : 1; +} + +CryptoBufferPool::CryptoBufferPool() : RefCounted(CheckSubClass()) { + RAND_bytes(reinterpret_cast(&hash_key_), sizeof(hash_key_)); +} + +CryptoBufferPool::~CryptoBufferPool() { + if (handle_) { + MutexWriteLock lock(&handle_->lock_); + handle_->pool_ = nullptr; + } + lh_CryptoBuffer_free(bufs_); +} + +uint32_t CryptoBufferPool::Hash(Span data) const { + return static_cast(SIPHASH_24(hash_key_, data.data(), data.size())); +} + +CryptoBuffer *CryptoBufferPool::FindBufferLocked(uint32_t hash, + Span data) { + return lh_CryptoBuffer_retrieve_key( + bufs_, &data, hash, + [](const void *key_v, const CryptoBuffer *buf) -> int { + Span key = + *static_cast *>(key_v); + return key == buf->span() ? 
0 : 1; + }); +} + +CRYPTO_BUFFER_POOL *CRYPTO_BUFFER_POOL_new() { + auto pool = MakeUnique(); + if (pool == nullptr) { + return nullptr; + } + + pool->bufs_ = lh_CryptoBuffer_new(CRYPTO_BUFFER_hash, CRYPTO_BUFFER_cmp); + pool->handle_ = MakeUnique(pool.get()); + if (pool->bufs_ == nullptr || pool->handle_ == nullptr) { + return nullptr; + } + + return pool.release(); +} + +void CRYPTO_BUFFER_POOL_free(CRYPTO_BUFFER_POOL *pool) { + if (pool != nullptr) { + FromOpaque(pool)->DecRefInternal(); + } +} + +int CRYPTO_BUFFER_POOL_up_ref(CRYPTO_BUFFER_POOL *pool) { + FromOpaque(pool)->UpRefInternal(); + return 1; +} + +void CryptoBuffer::UpRefInternal() const { + // This is safe in the case that |buf->pool| is NULL because it's just + // standard reference counting in that case. + // + // This is also safe if |buf->pool| is non-NULL because, if it were racing + // with |CRYPTO_BUFFER_free| then the two callers must have independent + // references already and so the reference count will never hit zero. + CRYPTO_refcount_inc(&references_); +} + +void CryptoBuffer::DecRefInternal() { + // If there is a pool, decrementing the refcount must synchronize with it. + if (pool_handle_ == nullptr) { + if (!CRYPTO_refcount_dec_and_test_zero(&references_)) { + return; + } + } else { + MutexWriteLock lock(&pool_handle_->lock_); + if (!CRYPTO_refcount_dec_and_test_zero(&references_)) { + return; + } + + // We have an exclusive lock on the pool handle, therefore no concurrent + // lookups can find this buffer and increment the reference count. Thus, if + // the count is zero there are and can never be any more references and thus + // we can free this buffer. It is possible the pool was already destroyed, + // but it cannot be destroyed concurrently. + // + // Note it is possible |buf| is no longer in the pool, if it was replaced by + // a static version. If that static version was since removed, it is even + // possible for |found| to be NULL. 
+ if (CryptoBufferPool *pool = pool_handle_->pool_; pool != nullptr) { + CryptoBuffer *found = lh_CryptoBuffer_retrieve(pool->bufs_, this); + if (found == this) { + found = lh_CryptoBuffer_delete(pool->bufs_, this); + assert(found == this); + (void)found; + } + } + } + + this->~CryptoBuffer(); + OPENSSL_free(this); +} + +CryptoBuffer::~CryptoBuffer() { + if (!data_is_static_) { + OPENSSL_free(data_); + } +} + +static UniquePtr crypto_buffer_new(Span data, + bool data_is_static) { + UniquePtr buf = MakeUnique(); + if (buf == nullptr) { + return nullptr; + } + + if (data_is_static) { + buf->data_ = const_cast(data.data()); + buf->data_is_static_ = true; + } else { + buf->data_ = + static_cast(OPENSSL_memdup(data.data(), data.size())); + if (!data.empty() && buf->data_ == nullptr) { + return nullptr; + } + } + + buf->len_ = data.size(); + return buf; +} + +static UniquePtr crypto_buffer_new_with_pool( + Span data, bool data_is_static, CryptoBufferPool *pool) { + if (pool == nullptr) { + return crypto_buffer_new(data, data_is_static); + } + + const uint32_t hash = pool->Hash(data); + { + // Look for a matching buffer in the pool. + MutexReadLock lock(&pool->handle_->lock_); + CryptoBuffer *duplicate = pool->FindBufferLocked(hash, data); + if (data_is_static && duplicate != nullptr && !duplicate->data_is_static_) { + // If the new |CRYPTO_BUFFER| would have static data, but the duplicate + // does not, we replace the old one with the new static version. 
+ duplicate = nullptr; + } + if (duplicate != nullptr) { + return UpRef(duplicate); + } + } + + UniquePtr buf = crypto_buffer_new(data, data_is_static); + if (buf == nullptr) { + return nullptr; + } + + MutexWriteLock lock(&pool->handle_->lock_); + CryptoBuffer *duplicate = pool->FindBufferLocked(hash, data); + if (data_is_static && duplicate != nullptr && !duplicate->data_is_static_) { + // If the new |CRYPTO_BUFFER| would have static data, but the duplicate does + // not, we replace the old one with the new static version. + duplicate = nullptr; + } + if (duplicate != nullptr) { + return UpRef(duplicate); + } + + // Insert |buf| into the pool. Note |old| may be non-NULL if a match was found + // but ignored. |pool->bufs_| does not increment refcounts, so there is no + // need to clean up after the replacement. + buf->pool_handle_ = UpRef(pool->handle_); + CryptoBuffer *old = nullptr; + if (!lh_CryptoBuffer_insert(pool->bufs_, &old, buf.get())) { + buf->pool_handle_ = nullptr; // No need to synchronize with the pool. 
+ return nullptr; + } + return buf; +} + +CRYPTO_BUFFER *CRYPTO_BUFFER_new(const uint8_t *data, size_t len, + CRYPTO_BUFFER_POOL *pool) { + return crypto_buffer_new_with_pool(Span(data, len), /*data_is_static=*/false, + FromOpaque(pool)) + .release(); +} + +CRYPTO_BUFFER *CRYPTO_BUFFER_alloc(uint8_t **out_data, size_t len) { + auto buf = MakeUnique(); + if (buf == nullptr) { + return nullptr; + } + + buf->data_ = reinterpret_cast(OPENSSL_malloc(len)); + if (len != 0 && buf->data_ == nullptr) { + return nullptr; + } + buf->len_ = len; + + *out_data = buf->data_; + return buf.release(); +} + +CRYPTO_BUFFER *CRYPTO_BUFFER_new_from_CBS(const CBS *cbs, + CRYPTO_BUFFER_POOL *pool) { + return CRYPTO_BUFFER_new(CBS_data(cbs), CBS_len(cbs), pool); +} + +CRYPTO_BUFFER *CRYPTO_BUFFER_new_from_static_data_unsafe( + const uint8_t *data, size_t len, CRYPTO_BUFFER_POOL *pool) { + return crypto_buffer_new_with_pool(Span(data, len), /*data_is_static=*/true, + FromOpaque(pool)) + .release(); +} + +void CRYPTO_BUFFER_free(CRYPTO_BUFFER *buf) { + if (buf != nullptr) { + FromOpaque(buf)->DecRefInternal(); + } +} + +int CRYPTO_BUFFER_up_ref(CRYPTO_BUFFER *buf) { + FromOpaque(buf)->UpRefInternal(); + return 1; +} + +CRYPTO_BUFFER *CRYPTO_BUFFER_dup_ref(const CRYPTO_BUFFER *buf) { + auto *buf_ = const_cast(buf); + FromOpaque(buf_)->UpRefInternal(); + return buf_; +} + +const uint8_t *CRYPTO_BUFFER_data(const CRYPTO_BUFFER *buf) { + return FromOpaque(buf)->data_; +} + +size_t CRYPTO_BUFFER_len(const CRYPTO_BUFFER *buf) { + return FromOpaque(buf)->len_; +} + +void CRYPTO_BUFFER_init_CBS(const CRYPTO_BUFFER *buf, CBS *out) { + *out = FromOpaque(buf)->span(); +} diff --git a/third_party/boringssl/src/crypto/rand/deterministic.cc b/third_party/boringssl/src/crypto/rand/deterministic.cc new file mode 100644 index 00000000..d2fcceee --- /dev/null +++ b/third_party/boringssl/src/crypto/rand/deterministic.cc @@ -0,0 +1,58 @@ +// Copyright 2016 The BoringSSL Authors +// +// Licensed under the 
Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "../bcm_support.h" +#include "internal.h" + +#if defined(OPENSSL_RAND_DETERMINISTIC) + +#include + +#include + +#include "../internal.h" + + +using namespace bssl; + +// g_num_calls is the number of calls to |CRYPTO_sysrand| that have occurred. +// +// Increments in |CRYPTO_sysrand| take |g_num_calls_lock|, but which thread +// observes which count is not deterministic. If the fuzzer mode is ever used +// in a multi-threaded program, replace this with a thread-local. +static uint64_t g_num_calls = 0; +static StaticMutex g_num_calls_lock; + +void RAND_reset_for_fuzzing() { g_num_calls = 0; } + +void bssl::CRYPTO_init_sysrand() {} + +void bssl::CRYPTO_sysrand(uint8_t *out, size_t requested) { + static const uint8_t kZeroKey[32] = {0}; + + g_num_calls_lock.LockWrite(); + uint64_t num_calls = g_num_calls++; + g_num_calls_lock.UnlockWrite(); + + uint8_t nonce[12]; + OPENSSL_memset(nonce, 0, sizeof(nonce)); + OPENSSL_memcpy(nonce, &num_calls, sizeof(num_calls)); + + OPENSSL_memset(out, 0, requested); + CRYPTO_chacha_20(out, out, requested, kZeroKey, nonce, 0); +} + +#endif // OPENSSL_RAND_DETERMINISTIC diff --git a/third_party/boringssl/src/crypto/rand/fork_detect.cc b/third_party/boringssl/src/crypto/rand/fork_detect.cc new file mode 100644 index 00000000..ecee4361 --- /dev/null +++ b/third_party/boringssl/src/crypto/rand/fork_detect.cc @@ -0,0 +1,222 @@ +// Copyright 2020 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the 
"License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#if !defined(_GNU_SOURCE) +#define _GNU_SOURCE // needed for madvise() and MAP_ANONYMOUS on Linux. +#endif + +#include "../bcm_support.h" +#include "../internal.h" +#include "internal.h" + +#if defined(OPENSSL_FORK_DETECTION_WIPEONFORK) + +#if defined(OPENSSL_LINUX) + +#include +#include +#include +#include +#if defined(MADV_WIPEONFORK) +static_assert(MADV_WIPEONFORK == 18); +#else +#define MADV_WIPEONFORK 18 +#endif + +#else + +// Otherwise assume a BSD style API. +#include +#include +#include + +#endif + +#elif defined(OPENSSL_FORK_DETECTION_PTHREAD_ATFORK) + +#include +#include +#include + +#endif // OPENSSL_FORK_DETECTION_PTHREAD_ATFORK + + +using namespace bssl; + +#if defined(OPENSSL_FORK_DETECTION_WIPEONFORK) + +static bool wipeonfork(void *addr, size_t page_size) { +#if defined(OPENSSL_LINUX) + // Linux flavor, >=4.14. + // Some versions of qemu (up to at least 5.0.0-rc4, see linux-user/syscall.c) + // ignore |madvise| calls and just return zero (i.e. success). But we need to + // know whether MADV_WIPEONFORK actually took effect. Therefore try an invalid + // call to check that the implementation of |madvise| is actually rejecting + // unknown |advice| values. + return madvise(addr, page_size, -1) != 0 && + madvise(addr, page_size, MADV_WIPEONFORK) == 0; +#elif defined(MAP_INHERIT_ZERO) + // OpenBSD flavor, >=5.6. + return minherit(addr, page_size, MAP_INHERIT_ZERO) == 0; +#else + // FreeBSD flavor, >=12.0. 
+ return minherit(addr, page_size, INHERIT_ZERO) == 0; +#endif +} + +static int g_force_madv_wipeonfork; +static int g_force_madv_wipeonfork_enabled; +static CRYPTO_once_t g_fork_detect_once = CRYPTO_ONCE_INIT; +static StaticMutex g_fork_detect_lock; +static Atomic *g_fork_detect_addr; +static uint64_t g_fork_generation; + +static void init_fork_detect() { + if (g_force_madv_wipeonfork) { + return; + } + + long page_size = sysconf(_SC_PAGESIZE); + if (page_size <= 0) { + return; + } + + void *addr = mmap(nullptr, static_cast(page_size), + PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + if (addr == MAP_FAILED) { + return; + } + + if (!wipeonfork(addr, static_cast(page_size))) { + munmap(addr, static_cast(page_size)); + return; + } + + g_fork_detect_addr = new (addr) Atomic(1); + g_fork_generation = 1; +} + +uint64_t bssl::CRYPTO_get_fork_generation() { + CRYPTO_once(&g_fork_detect_once, init_fork_detect); + + // In a single-threaded process, there are obviously no races because there's + // only a single mutator in the address space. + // + // In a multi-threaded environment, |CRYPTO_once| ensures that the flag byte + // is initialised atomically, even if multiple threads enter this function + // concurrently. + // + // Additionally, while the kernel will only clear WIPEONFORK at a point when a + // child process is single-threaded, the child may become multi-threaded + // before it observes this. Therefore, we must synchronize the logic below. + + Atomic *const flag_ptr = g_fork_detect_addr; + if (flag_ptr == nullptr) { + // Our kernel is too old to support |MADV_WIPEONFORK| or + // |g_force_madv_wipeonfork| is set. + if (g_force_madv_wipeonfork && g_force_madv_wipeonfork_enabled) { + // A constant generation number to simulate support, even if the kernel + // doesn't support it. + return 42; + } + // With Linux and clone(), we do not believe that pthread_atfork() is + // sufficient for detecting all forms of address space duplication. 
At this + // point we have a kernel that does not support MADV_WIPEONFORK. We could + // return the generation number from pthread_atfork() here and it would + // probably be safe in almost any situation, but to ensure safety we return + // 0 and force an entropy draw on every call. + return 0; + } + + // In the common case, try to observe the flag without taking a lock. This + // avoids cacheline contention in the PRNG. + uint64_t *const generation_ptr = &g_fork_generation; + if (flag_ptr->load() != 0) { + // If we observe a non-zero flag, it is safe to read |generation_ptr| + // without a lock. The flag and generation number are fixed for this copy of + // the address space. + return *generation_ptr; + } + + // The flag was zero. The generation number must be incremented, but other + // threads may have concurrently observed the zero, so take a lock before + // incrementing. + MutexWriteLock lock(&g_fork_detect_lock); + uint64_t current_generation = *generation_ptr; + if (flag_ptr->load() == 0) { + // A fork has occurred. + current_generation++; + if (current_generation == 0) { + // Zero means fork detection isn't supported, so skip that value. + current_generation = 1; + } + + // We must update |generation_ptr| before |flag_ptr|. Other threads may + // observe |flag_ptr| without taking a lock. + *generation_ptr = current_generation; + flag_ptr->store(1); + } + + return current_generation; +} + +void bssl::CRYPTO_fork_detect_force_madv_wipeonfork_for_testing(int on) { + g_force_madv_wipeonfork = 1; + g_force_madv_wipeonfork_enabled = on; +} + +#elif defined(OPENSSL_FORK_DETECTION_PTHREAD_ATFORK) + +static CRYPTO_once_t g_pthread_fork_detection_once = CRYPTO_ONCE_INIT; +static uint64_t g_atfork_fork_generation; + +static void we_are_forked() { + // Immediately after a fork, the process must be single-threaded. 
+ uint64_t value = g_atfork_fork_generation + 1; + if (value == 0) { + value = 1; + } + g_atfork_fork_generation = value; +} + +static void init_pthread_fork_detection() { + if (pthread_atfork(nullptr, nullptr, we_are_forked) != 0) { + abort(); + } + g_atfork_fork_generation = 1; +} + +uint64_t bssl::CRYPTO_get_fork_generation() { + CRYPTO_once(&g_pthread_fork_detection_once, init_pthread_fork_detection); + + return g_atfork_fork_generation; +} + +#elif defined(OPENSSL_DOES_NOT_FORK) + +// These platforms are guaranteed not to fork, and therefore do not require +// fork detection support. Returning a constant non-zero value makes BoringSSL +// assume address space duplication is not a concern and adding entropy to +// every RAND_bytes call is not needed. +uint64_t bssl::CRYPTO_get_fork_generation() { return 0xc0ffee; } + +#else + +// These platforms may fork, but we do not have a mitigation mechanism in +// place. Returning a constant zero value makes BoringSSL assume that address +// space duplication could have occurred on any call, so entropy must be added +// to every RAND_bytes call. +uint64_t bssl::CRYPTO_get_fork_generation() { return 0; } + +#endif diff --git a/third_party/boringssl/src/crypto/rand/forkunsafe.cc b/third_party/boringssl/src/crypto/rand/forkunsafe.cc new file mode 100644 index 00000000..32760b27 --- /dev/null +++ b/third_party/boringssl/src/crypto/rand/forkunsafe.cc @@ -0,0 +1,44 @@ +// Copyright 2017 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include "../fipsmodule/rand/internal.h" +#include "../internal.h" + + +using namespace bssl; + +// g_buffering_enabled is one if fork-unsafe buffering has been enabled and zero +// otherwise. +static bssl::Atomic g_buffering_enabled; + +#if !defined(OPENSSL_WINDOWS) +void RAND_enable_fork_unsafe_buffering(int fd) { + // We no longer support setting the file-descriptor with this function. + if (fd != -1) { + abort(); + } + + g_buffering_enabled.store(1); +} + +void RAND_disable_fork_unsafe_buffering() { g_buffering_enabled.store(0); } +#endif + +int bssl::rand_fork_unsafe_buffering_enabled() { + return g_buffering_enabled.load() != 0; +} diff --git a/third_party/boringssl/src/crypto/rand/getentropy.cc b/third_party/boringssl/src/crypto/rand/getentropy.cc new file mode 100644 index 00000000..7e85d8ab --- /dev/null +++ b/third_party/boringssl/src/crypto/rand/getentropy.cc @@ -0,0 +1,53 @@ +// Copyright 2023 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#if !defined(_DEFAULT_SOURCE) +#define _DEFAULT_SOURCE // Needed for getentropy on musl and glibc +#endif + +#include + +#include "../bcm_support.h" +#include "internal.h" + +#if defined(OPENSSL_RAND_GETENTROPY) + +#include +#include +#include + +#if defined(OPENSSL_MACOS) || defined(OPENSSL_FUCHSIA) +#include +#endif + +using namespace bssl; + +void bssl::CRYPTO_init_sysrand() {} + +// CRYPTO_sysrand puts |requested| random bytes into |out|. +void bssl::CRYPTO_sysrand(uint8_t *out, size_t requested) { + while (requested > 0) { + // |getentropy| can only request 256 bytes at a time. + size_t todo = requested <= 256 ? requested : 256; + if (getentropy(out, todo) != 0) { + perror("getentropy() failed"); + abort(); + } + + out += todo; + requested -= todo; + } +} + +#endif // OPENSSL_RAND_GETENTROPY diff --git a/third_party/boringssl/src/crypto/rand/internal.h b/third_party/boringssl/src/crypto/rand/internal.h new file mode 100644 index 00000000..50dbf0b0 --- /dev/null +++ b/third_party/boringssl/src/crypto/rand/internal.h @@ -0,0 +1,67 @@ +// Copyright 2024 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef OPENSSL_HEADER_CRYPTO_RAND_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_RAND_INTERNAL_H + +#include + + +#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) +#define OPENSSL_RAND_DETERMINISTIC +#elif defined(OPENSSL_TRUSTY) +#define OPENSSL_RAND_TRUSTY +#elif defined(OPENSSL_WINDOWS) +#define OPENSSL_RAND_WINDOWS +#elif defined(OPENSSL_LINUX) +#define OPENSSL_RAND_URANDOM +#elif defined(OPENSSL_APPLE) && !defined(OPENSSL_MACOS) +// Unlike macOS, iOS and similar hide away getentropy(). +#define OPENSSL_RAND_IOS +#else +// By default if you are integrating BoringSSL we expect you to +// provide getentropy from the header file. +#define OPENSSL_RAND_GETENTROPY +#endif + +#if defined(OPENSSL_LINUX) || defined(OPENSSL_FREEBSD) || \ + defined(OPENSSL_OPENBSD) + +// On linux we use MADVISE instead of pthread_atfork(), due +// to concerns about clone() being used for address space +// duplication. +#define OPENSSL_FORK_DETECTION +#define OPENSSL_FORK_DETECTION_WIPEONFORK + +#elif defined(OPENSSL_MACOS) || defined(OPENSSL_IOS) + +// These platforms may detect address space duplication with pthread_atfork. +// iOS doesn't normally allow fork in apps, but it's there. +#define OPENSSL_FORK_DETECTION +#define OPENSSL_FORK_DETECTION_PTHREAD_ATFORK + +#elif defined(OPENSSL_WINDOWS) || defined(OPENSSL_TRUSTY) || \ + defined(__ZEPHYR__) || defined(CROS_EC) + +// These platforms do not fork. +#define OPENSSL_DOES_NOT_FORK + +#else + +// Other platforms may fork, but BoringSSL cannot reliably detect it happening. +// So instead, new entropy will be drawn on every RNG call. 
+ +#endif + +#endif // OPENSSL_HEADER_CRYPTO_RAND_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/rand/ios.cc b/third_party/boringssl/src/crypto/rand/ios.cc new file mode 100644 index 00000000..ddd6659e --- /dev/null +++ b/third_party/boringssl/src/crypto/rand/ios.cc @@ -0,0 +1,35 @@ +// Copyright 2023 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "../bcm_support.h" +#include "internal.h" + +#if defined(OPENSSL_RAND_IOS) +#include + +#include + +using namespace bssl; + +void bssl::CRYPTO_init_sysrand() {} + +void bssl::CRYPTO_sysrand(uint8_t *out, size_t requested) { + if (CCRandomGenerateBytes(out, requested) != kCCSuccess) { + abort(); + } +} + +#endif // OPENSSL_RAND_IOS diff --git a/third_party/boringssl/src/crypto/rand/passive.cc b/third_party/boringssl/src/crypto/rand/passive.cc new file mode 100644 index 00000000..a791ec0c --- /dev/null +++ b/third_party/boringssl/src/crypto/rand/passive.cc @@ -0,0 +1,180 @@ +// Copyright 2020 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "../bcm_support.h" +#include "../fipsmodule/bcm_interface.h" +#include "../internal.h" + +#if defined(BORINGSSL_FIPS) + +#include + +using namespace bssl; + +// passive_get_seed_entropy writes |out_entropy_len| bytes of entropy, suitable +// for seeding a DRBG, to |out_entropy|. It sets |*out_want_additional_input| +// to one if the entropy came directly from the CPU and zero if it came from +// the OS. It actively obtains entropy from the CPU/OS. +static void passive_get_seed_entropy(uint8_t *out_entropy, + size_t out_entropy_len, + int *out_want_additional_input) { + *out_want_additional_input = 0; + if (bcm_success(BCM_rand_bytes_hwrng(out_entropy, out_entropy_len))) { + *out_want_additional_input = 1; + } else { + CRYPTO_sysrand(out_entropy, out_entropy_len); + } +} + +#define ENTROPY_READ_LEN \ + (/* last_block size */ 16 + CTR_DRBG_SEED_LEN * BORINGSSL_FIPS_OVERREAD) + +#if defined(OPENSSL_ANDROID) + +#include +#include +#include +#include +#include + +// socket_history_t enumerates whether the entropy daemon should be contacted +// for a given entropy request. Values other than socket_not_yet_attempted are +// sticky so if the first attempt to read from the daemon fails it's assumed +// that the daemon is not present and no more attempts will be made. If the +// first attempt is successful then attempts will be made forever more. +enum class socket_history_t { + // initial value, no connections to the entropy daemon have been made yet. 
+ socket_not_yet_attempted = 0, + // reading from the entropy daemon was successful + socket_success, + // reading from the entropy daemon failed. + socket_failed, +}; + +static std::atomic g_socket_history{ + socket_history_t::socket_not_yet_attempted}; + +// DAEMON_RESPONSE_LEN is the number of bytes that the entropy daemon replies +// with. +#define DAEMON_RESPONSE_LEN 496 + +static_assert(ENTROPY_READ_LEN == DAEMON_RESPONSE_LEN, + "entropy daemon response length mismatch"); + +static int get_seed_from_daemon(uint8_t *out_entropy, size_t out_entropy_len) { + // |RAND_need_entropy| should never call this function for more than + // |DAEMON_RESPONSE_LEN| bytes. + if (out_entropy_len > DAEMON_RESPONSE_LEN) { + abort(); + } + + const socket_history_t socket_history = + g_socket_history.load(std::memory_order_acquire); + if (socket_history == socket_history_t::socket_failed) { + return 0; + } + + int ret = 0; + static const char kSocketPath[] = "/dev/socket/prng_seeder"; + struct sockaddr_un sun; + uint8_t buffer[DAEMON_RESPONSE_LEN]; + size_t done = 0; + const int sock = socket(AF_UNIX, SOCK_STREAM, 0); + if (sock < 0) { + goto out; + } + + memset(&sun, 0, sizeof(sun)); + sun.sun_family = AF_UNIX; + static_assert(sizeof(kSocketPath) <= UNIX_PATH_MAX, "kSocketPath too long"); + OPENSSL_memcpy(sun.sun_path, kSocketPath, sizeof(kSocketPath)); + + if (connect(sock, (struct sockaddr *)&sun, sizeof(sun))) { + goto out; + } + + while (done < sizeof(buffer)) { + ssize_t n; + do { + n = read(sock, buffer + done, sizeof(buffer) - done); + } while (n == -1 && errno == EINTR); + + if (n < 1) { + goto out; + } + done += n; + } + + if (done != DAEMON_RESPONSE_LEN) { + // The daemon should always write |DAEMON_RESPONSE_LEN| bytes on every + // connection. 
+ goto out; + } + + assert(out_entropy_len <= DAEMON_RESPONSE_LEN); + OPENSSL_memcpy(out_entropy, buffer, out_entropy_len); + ret = 1; + +out: + if (socket_history == socket_history_t::socket_not_yet_attempted) { + socket_history_t expected = socket_history_t::socket_not_yet_attempted; + // If another thread has already updated |g_socket_history| then we defer + // to their value. + g_socket_history.compare_exchange_strong( + expected, + (ret == 0) ? socket_history_t::socket_failed + : socket_history_t::socket_success, + std::memory_order_release, std::memory_order_relaxed); + } + + close(sock); + return ret; +} + +#else + +static int get_seed_from_daemon(uint8_t *out_entropy, size_t out_entropy_len) { + return 0; +} + +#endif // OPENSSL_ANDROID + +// RAND_need_entropy is called by the FIPS module when it has blocked because of +// a lack of entropy. This signal is used as an indication to feed it more. +void bssl::RAND_need_entropy(size_t bytes_needed) { + uint8_t buf[ENTROPY_READ_LEN]; + size_t todo = sizeof(buf); + if (todo > bytes_needed) { + todo = bytes_needed; + } + + int want_additional_input; + if (get_seed_from_daemon(buf, todo)) { + want_additional_input = 1; + } else { + passive_get_seed_entropy(buf, todo, &want_additional_input); + } + + if (boringssl_fips_break_test("CRNG")) { + // This breaks the "continuous random number generator test" defined in FIPS + // 140-2, section 4.9.2, and implemented in |rand_get_seed|. + OPENSSL_memset(buf, 0, todo); + } + + BCM_rand_load_entropy(buf, todo, want_additional_input); +} + +#endif // FIPS diff --git a/third_party/boringssl/src/crypto/rand/rand.cc b/third_party/boringssl/src/crypto/rand/rand.cc new file mode 100644 index 00000000..2aa30270 --- /dev/null +++ b/third_party/boringssl/src/crypto/rand/rand.cc @@ -0,0 +1,79 @@ +// Copyright 2017 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include "../bcm_support.h" +#include "../fipsmodule/bcm_interface.h" + + +using namespace bssl; + +int RAND_bytes(uint8_t *buf, size_t len) { + BCM_rand_bytes(buf, len); + return 1; +} + +int RAND_pseudo_bytes(uint8_t *buf, size_t len) { return RAND_bytes(buf, len); } + +void RAND_seed(const void *buf, int num) { + // OpenSSH calls |RAND_seed| before jailing on the assumption that any needed + // file descriptors etc will be opened. + uint8_t unused; + RAND_bytes(&unused, sizeof(unused)); +} + +int RAND_load_file(const char *path, long num) { + if (num < 0) { // read the "whole file" + return 1; + } else if (num <= INT_MAX) { + return (int)num; + } else { + return INT_MAX; + } +} + +const char *RAND_file_name(char *buf, size_t num) { return nullptr; } + +void RAND_add(const void *buf, int num, double entropy) {} + +int RAND_egd(const char *path) { return 255; } + +int RAND_poll() { return 1; } + +int RAND_status() { return 1; } + +static const struct rand_meth_st kSSLeayMethod = { + RAND_seed, RAND_bytes, RAND_cleanup, + RAND_add, RAND_pseudo_bytes, RAND_status, +}; + +RAND_METHOD *RAND_SSLeay() { return (RAND_METHOD *)&kSSLeayMethod; } + +RAND_METHOD *RAND_OpenSSL() { return RAND_SSLeay(); } + +const RAND_METHOD *RAND_get_rand_method() { return RAND_SSLeay(); } + +int RAND_set_rand_method(const RAND_METHOD *method) { return 1; } + +void RAND_cleanup() {} + +void RAND_get_system_entropy_for_custom_prng(uint8_t *buf, size_t len) { + if (len > 256) { + abort(); + } + CRYPTO_sysrand(buf, len); +} diff --git 
a/third_party/boringssl/src/crypto/rand/trusty.cc b/third_party/boringssl/src/crypto/rand/trusty.cc new file mode 100644 index 00000000..e2a5b5bf --- /dev/null +++ b/third_party/boringssl/src/crypto/rand/trusty.cc @@ -0,0 +1,39 @@ +// Copyright 2023 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "../bcm_support.h" +#include "internal.h" + +#if defined(OPENSSL_RAND_TRUSTY) +#include +#include + +#include +#include + +#include + +using namespace bssl; + +void bssl::CRYPTO_init_sysrand() {} + +void bssl::CRYPTO_sysrand(uint8_t *out, size_t requested) { + if (trusty_rng_hw_rand(out, requested) != NO_ERROR) { + abort(); + } +} + +#endif // OPENSSL_RAND_TRUSTY diff --git a/third_party/boringssl/src/crypto/rand/urandom.cc b/third_party/boringssl/src/crypto/rand/urandom.cc new file mode 100644 index 00000000..d06ade71 --- /dev/null +++ b/third_party/boringssl/src/crypto/rand/urandom.cc @@ -0,0 +1,149 @@ +// Copyright 2014 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#if !defined(_GNU_SOURCE) +#define _GNU_SOURCE // needed for syscall() on Linux. +#endif + +#include + +#include "../bcm_support.h" +#include "internal.h" + +#if defined(OPENSSL_RAND_URANDOM) + +#include +#include +#include +#include +#include +#include + +#include "../internal.h" + + +using namespace bssl; + +#if defined(OPENSSL_MSAN) +extern "C" { +void __msan_unpoison(void *, size_t); +} +#endif + +static ssize_t boringssl_getrandom(void *buf, size_t buf_len, unsigned flags) { + ssize_t ret; + do { + ret = syscall(__NR_getrandom, buf, buf_len, flags); + } while (ret == -1 && errno == EINTR); + +#if defined(OPENSSL_MSAN) + if (ret > 0) { + // MSAN doesn't recognise |syscall| and thus doesn't notice that we have + // initialised the output buffer. + __msan_unpoison(buf, ret); + } +#endif // OPENSSL_MSAN + + return ret; +} + +// kHaveGetrandom in |urandom_fd| signals that |getrandom| or |getentropy| is +// available and should be used instead. +static const int kHaveGetrandom = -3; + +// urandom_fd is a file descriptor to /dev/urandom. It's protected by |once|. +static int urandom_fd; + +static CRYPTO_once_t rand_once = CRYPTO_ONCE_INIT; + +// init_once initializes the state of this module to values previously +// requested. This is the only function that modifies |urandom_fd|, which may be +// read safely after calling the once. +static void init_once() { + int have_getrandom; + uint8_t dummy; + ssize_t getrandom_ret = + boringssl_getrandom(&dummy, sizeof(dummy), GRND_NONBLOCK); + if (getrandom_ret == 1) { + have_getrandom = 1; + } else if (getrandom_ret == -1 && errno == EAGAIN) { + // We have getrandom, but the entropy pool has not been initialized yet. + have_getrandom = 1; + } else if (getrandom_ret == -1 && errno == ENOSYS) { + // Fallthrough to using /dev/urandom, below. + have_getrandom = 0; + } else { + // Other errors are fatal. 
+ perror("getrandom"); + abort(); + } + + if (have_getrandom) { + urandom_fd = kHaveGetrandom; + return; + } + + // FIPS builds must support getrandom. +#if defined(BORINGSSL_FIPS) + perror("getrandom not found"); + abort(); +#endif + + int fd; + do { + fd = open("/dev/urandom", O_RDONLY | O_CLOEXEC); + } while (fd == -1 && errno == EINTR); + + if (fd < 0) { + perror("failed to open /dev/urandom"); + abort(); + } + + urandom_fd = fd; +} + +void bssl::CRYPTO_init_sysrand() { CRYPTO_once(&rand_once, init_once); } + +// CRYPTO_sysrand writes |len| bytes of entropy into |out|. +void bssl::CRYPTO_sysrand(uint8_t *out, size_t len) { + if (len == 0) { + return; + } + + CRYPTO_init_sysrand(); + + // Clear |errno| so it has defined value if |read| or |getrandom| + // "successfully" returns zero. + errno = 0; + while (len > 0) { + ssize_t r; + + if (urandom_fd == kHaveGetrandom) { + r = boringssl_getrandom(out, len, 0); + } else { + do { + r = read(urandom_fd, out, len); + } while (r == -1 && errno == EINTR); + } + + if (r <= 0) { + perror("entropy fill failed"); + abort(); + } + out += r; + len -= r; + } +} + +#endif // OPENSSL_RAND_URANDOM diff --git a/third_party/boringssl/src/crypto/rand/windows.cc b/third_party/boringssl/src/crypto/rand/windows.cc new file mode 100644 index 00000000..ebd0d4af --- /dev/null +++ b/third_party/boringssl/src/crypto/rand/windows.cc @@ -0,0 +1,91 @@ +// Copyright 2014 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include "../bcm_support.h" +#include "../internal.h" +#include "internal.h" + +#if defined(OPENSSL_RAND_WINDOWS) + +#include +#include + +#include + +using namespace bssl; + +#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) && \ + !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) +#include +OPENSSL_MSVC_PRAGMA(comment(lib, "bcrypt.lib")) +#endif // WINAPI_PARTITION_APP && !WINAPI_PARTITION_DESKTOP + +#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) && \ + !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) + +void bssl::CRYPTO_init_sysrand() {} + +void bssl::CRYPTO_sysrand(uint8_t *out, size_t requested) { + while (requested > 0) { + ULONG output_bytes_this_pass = ULONG_MAX; + if (requested < output_bytes_this_pass) { + output_bytes_this_pass = (ULONG)requested; + } + if (!BCRYPT_SUCCESS(BCryptGenRandom( + /*hAlgorithm=*/nullptr, out, output_bytes_this_pass, + BCRYPT_USE_SYSTEM_PREFERRED_RNG))) { + abort(); + } + requested -= output_bytes_this_pass; + out += output_bytes_this_pass; + } +} + +#else + +// See: https://learn.microsoft.com/en-us/windows/win32/seccng/processprng +typedef BOOL(WINAPI *ProcessPrngFunction)(PBYTE pbData, SIZE_T cbData); +static ProcessPrngFunction g_processprng_fn = nullptr; + +static void init_processprng() { + HMODULE hmod = LoadLibraryW(L"bcryptprimitives"); + if (hmod == nullptr) { + abort(); + } + g_processprng_fn = (ProcessPrngFunction)GetProcAddress(hmod, "ProcessPrng"); + if (g_processprng_fn == nullptr) { + abort(); + } +} + +void bssl::CRYPTO_init_sysrand() { + static CRYPTO_once_t once = CRYPTO_ONCE_INIT; + CRYPTO_once(&once, init_processprng); +} + +void bssl::CRYPTO_sysrand(uint8_t *out, size_t requested) { + CRYPTO_init_sysrand(); + // On non-UWP configurations, use ProcessPrng instead of BCryptGenRandom + // to avoid accessing resources that may be unavailable inside the + // Chromium sandbox. 
See https://crbug.com/74242 + if (!g_processprng_fn(out, requested)) { + abort(); + } +} + +#endif // WINAPI_PARTITION_APP && !WINAPI_PARTITION_DESKTOP + +#endif // OPENSSL_RAND_WINDOWS diff --git a/third_party/boringssl/src/crypto/rand_extra/deterministic.c b/third_party/boringssl/src/crypto/rand_extra/deterministic.c deleted file mode 100644 index 435f0633..00000000 --- a/third_party/boringssl/src/crypto/rand_extra/deterministic.c +++ /dev/null @@ -1,56 +0,0 @@ -/* Copyright (c) 2016, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include - -#if defined(BORINGSSL_UNSAFE_DETERMINISTIC_MODE) - -#include - -#include - -#include "../internal.h" -#include "../fipsmodule/rand/internal.h" - - -// g_num_calls is the number of calls to |CRYPTO_sysrand| that have occurred. -// -// This is intentionally not thread-safe. If the fuzzer mode is ever used in a -// multi-threaded program, replace this with a thread-local. (A mutex would not -// be deterministic.) 
-static uint64_t g_num_calls = 0; -static struct CRYPTO_STATIC_MUTEX g_num_calls_lock = CRYPTO_STATIC_MUTEX_INIT; - -void RAND_reset_for_fuzzing(void) { g_num_calls = 0; } - -void CRYPTO_sysrand(uint8_t *out, size_t requested) { - static const uint8_t kZeroKey[32]; - - CRYPTO_STATIC_MUTEX_lock_write(&g_num_calls_lock); - uint64_t num_calls = g_num_calls++; - CRYPTO_STATIC_MUTEX_unlock_write(&g_num_calls_lock); - - uint8_t nonce[12]; - OPENSSL_memset(nonce, 0, sizeof(nonce)); - OPENSSL_memcpy(nonce, &num_calls, sizeof(num_calls)); - - OPENSSL_memset(out, 0, requested); - CRYPTO_chacha_20(out, out, requested, kZeroKey, nonce, 0); -} - -void CRYPTO_sysrand_for_seed(uint8_t *out, size_t requested) { - CRYPTO_sysrand(out, requested); -} - -#endif // BORINGSSL_UNSAFE_DETERMINISTIC_MODE diff --git a/third_party/boringssl/src/crypto/rand_extra/forkunsafe.c b/third_party/boringssl/src/crypto/rand_extra/forkunsafe.c deleted file mode 100644 index 0f1ececc..00000000 --- a/third_party/boringssl/src/crypto/rand_extra/forkunsafe.c +++ /dev/null @@ -1,46 +0,0 @@ -/* Copyright (c) 2017, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include - -#include - -#include "../fipsmodule/rand/internal.h" - - -// g_buffering_enabled is true if fork-unsafe buffering has been enabled. 
-static int g_buffering_enabled = 0; - -// g_lock protects |g_buffering_enabled|. -static struct CRYPTO_STATIC_MUTEX g_lock = CRYPTO_STATIC_MUTEX_INIT; - -#if !defined(OPENSSL_WINDOWS) -void RAND_enable_fork_unsafe_buffering(int fd) { - // We no longer support setting the file-descriptor with this function. - if (fd != -1) { - abort(); - } - - CRYPTO_STATIC_MUTEX_lock_write(&g_lock); - g_buffering_enabled = 1; - CRYPTO_STATIC_MUTEX_unlock_write(&g_lock); -} -#endif - -int rand_fork_unsafe_buffering_enabled(void) { - CRYPTO_STATIC_MUTEX_lock_read(&g_lock); - const int ret = g_buffering_enabled; - CRYPTO_STATIC_MUTEX_unlock_read(&g_lock); - return ret; -} diff --git a/third_party/boringssl/src/crypto/rand_extra/fuchsia.c b/third_party/boringssl/src/crypto/rand_extra/fuchsia.c deleted file mode 100644 index ee6cfdba..00000000 --- a/third_party/boringssl/src/crypto/rand_extra/fuchsia.c +++ /dev/null @@ -1,34 +0,0 @@ -/* Copyright (c) 2017, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ - -#include - -#if defined(OPENSSL_FUCHSIA) && !defined(BORINGSSL_UNSAFE_DETERMINISTIC_MODE) - -#include -#include - -#include - -#include "../fipsmodule/rand/internal.h" - -void CRYPTO_sysrand(uint8_t *out, size_t requested) { - zx_cprng_draw(out, requested); -} - -void CRYPTO_sysrand_for_seed(uint8_t *out, size_t requested) { - CRYPTO_sysrand(out, requested); -} - -#endif // OPENSSL_FUCHSIA && !BORINGSSL_UNSAFE_DETERMINISTIC_MODE diff --git a/third_party/boringssl/src/crypto/rand_extra/passive.c b/third_party/boringssl/src/crypto/rand_extra/passive.c deleted file mode 100644 index f27803b6..00000000 --- a/third_party/boringssl/src/crypto/rand_extra/passive.c +++ /dev/null @@ -1,36 +0,0 @@ -/* Copyright (c) 2020, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include - -#include "../fipsmodule/rand/internal.h" - -#if defined(BORINGSSL_FIPS) - -// RAND_need_entropy is called by the FIPS module when it has blocked because of -// a lack of entropy. This signal is used as an indication to feed it more. 
-void RAND_need_entropy(size_t bytes_needed) { - uint8_t buf[/* last_block size */ 16 + - CTR_DRBG_ENTROPY_LEN * BORINGSSL_FIPS_OVERREAD]; - size_t todo = sizeof(buf); - if (todo > bytes_needed) { - todo = bytes_needed; - } - - int want_additional_input; - CRYPTO_get_seed_entropy(buf, todo, &want_additional_input); - RAND_load_entropy(buf, todo, want_additional_input); -} - -#endif // FIPS diff --git a/third_party/boringssl/src/crypto/rand_extra/rand_extra.c b/third_party/boringssl/src/crypto/rand_extra/rand_extra.c deleted file mode 100644 index e73b99e3..00000000 --- a/third_party/boringssl/src/crypto/rand_extra/rand_extra.c +++ /dev/null @@ -1,74 +0,0 @@ -/* Copyright (c) 2017, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include - -#include - - -void RAND_seed(const void *buf, int num) { - // OpenSSH calls |RAND_seed| before jailing on the assumption that any needed - // file descriptors etc will be opened. 
- uint8_t unused; - RAND_bytes(&unused, sizeof(unused)); -} - -int RAND_load_file(const char *path, long num) { - if (num < 0) { // read the "whole file" - return 1; - } else if (num <= INT_MAX) { - return (int) num; - } else { - return INT_MAX; - } -} - -const char *RAND_file_name(char *buf, size_t num) { return NULL; } - -void RAND_add(const void *buf, int num, double entropy) {} - -int RAND_egd(const char *path) { - return 255; -} - -int RAND_poll(void) { - return 1; -} - -int RAND_status(void) { - return 1; -} - -static const struct rand_meth_st kSSLeayMethod = { - RAND_seed, - RAND_bytes, - RAND_cleanup, - RAND_add, - RAND_pseudo_bytes, - RAND_status, -}; - -RAND_METHOD *RAND_SSLeay(void) { - return (RAND_METHOD*) &kSSLeayMethod; -} - -RAND_METHOD *RAND_OpenSSL(void) { - return RAND_SSLeay(); -} - -const RAND_METHOD *RAND_get_rand_method(void) { return RAND_SSLeay(); } - -int RAND_set_rand_method(const RAND_METHOD *method) { return 1; } - -void RAND_cleanup(void) {} diff --git a/third_party/boringssl/src/crypto/rand_extra/windows.c b/third_party/boringssl/src/crypto/rand_extra/windows.c deleted file mode 100644 index 8ade6896..00000000 --- a/third_party/boringssl/src/crypto/rand_extra/windows.c +++ /dev/null @@ -1,73 +0,0 @@ -/* Copyright (c) 2014, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include - -#if defined(OPENSSL_WINDOWS) && !defined(BORINGSSL_UNSAFE_DETERMINISTIC_MODE) - -#include -#include - -OPENSSL_MSVC_PRAGMA(warning(push, 3)) - -#include - -#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) && \ - !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) -#include -OPENSSL_MSVC_PRAGMA(comment(lib, "bcrypt.lib")) -#else -// #define needed to link in RtlGenRandom(), a.k.a. SystemFunction036. See the -// "Community Additions" comment on MSDN here: -// http://msdn.microsoft.com/en-us/library/windows/desktop/aa387694.aspx -#define SystemFunction036 NTAPI SystemFunction036 -#include -#undef SystemFunction036 -#endif // WINAPI_PARTITION_APP && !WINAPI_PARTITION_DESKTOP - -OPENSSL_MSVC_PRAGMA(warning(pop)) - -#include "../fipsmodule/rand/internal.h" - - -void CRYPTO_sysrand(uint8_t *out, size_t requested) { - while (requested > 0) { - ULONG output_bytes_this_pass = ULONG_MAX; - if (requested < output_bytes_this_pass) { - output_bytes_this_pass = (ULONG)requested; - } - // On non-UWP configurations, use RtlGenRandom instead of BCryptGenRandom - // to avoid accessing resources that may be unavailable inside the - // Chromium sandbox. 
See https://crbug.com/boringssl/307 -#if WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_APP) && \ - !WINAPI_FAMILY_PARTITION(WINAPI_PARTITION_DESKTOP) - if (!BCRYPT_SUCCESS(BCryptGenRandom( - /*hAlgorithm=*/NULL, out, output_bytes_this_pass, - BCRYPT_USE_SYSTEM_PREFERRED_RNG))) { -#else - if (RtlGenRandom(out, output_bytes_this_pass) == FALSE) { -#endif // WINAPI_PARTITION_APP && !WINAPI_PARTITION_DESKTOP - abort(); - } - requested -= output_bytes_this_pass; - out += output_bytes_this_pass; - } - return; -} - -void CRYPTO_sysrand_for_seed(uint8_t *out, size_t requested) { - CRYPTO_sysrand(out, requested); -} - -#endif // OPENSSL_WINDOWS && !BORINGSSL_UNSAFE_DETERMINISTIC_MODE diff --git a/third_party/boringssl/src/crypto/rc4/rc4.c b/third_party/boringssl/src/crypto/rc4/rc4.c deleted file mode 100644 index a27a657f..00000000 --- a/third_party/boringssl/src/crypto/rc4/rc4.c +++ /dev/null @@ -1,98 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. 
this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - - -void RC4(RC4_KEY *key, size_t len, const uint8_t *in, uint8_t *out) { - uint32_t x = key->x; - uint32_t y = key->y; - uint32_t *d = key->data; - - for (size_t i = 0; i < len; i++) { - x = (x + 1) & 0xff; - uint32_t tx = d[x]; - y = (tx + y) & 0xff; - uint32_t ty = d[y]; - d[x] = ty; - d[y] = tx; - out[i] = d[(tx + ty) & 0xff] ^ in[i]; - } - - key->x = x; - key->y = y; -} - -void RC4_set_key(RC4_KEY *rc4key, unsigned len, const uint8_t *key) { - uint32_t *d = &rc4key->data[0]; - rc4key->x = 0; - rc4key->y = 0; - - for (unsigned i = 0; i < 256; i++) { - d[i] = i; - } - - unsigned id1 = 0, id2 = 0; - for (unsigned i = 0; i < 256; i++) { - uint32_t tmp = d[i]; - id2 = (key[id1] + tmp + id2) & 0xff; - if (++id1 == len) { - id1 = 0; - } - d[i] = d[id2]; - d[id2] = tmp; - } -} diff --git a/third_party/boringssl/src/crypto/rc4/rc4.cc b/third_party/boringssl/src/crypto/rc4/rc4.cc new file mode 100644 index 00000000..1c8971fc --- /dev/null +++ b/third_party/boringssl/src/crypto/rc4/rc4.cc @@ -0,0 +1,56 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + + +void RC4(RC4_KEY *key, size_t len, const uint8_t *in, uint8_t *out) { + uint32_t x = key->x; + uint32_t y = key->y; + uint32_t *d = key->data; + + for (size_t i = 0; i < len; i++) { + x = (x + 1) & 0xff; + uint32_t tx = d[x]; + y = (tx + y) & 0xff; + uint32_t ty = d[y]; + d[x] = ty; + d[y] = tx; + out[i] = d[(tx + ty) & 0xff] ^ in[i]; + } + + key->x = x; + key->y = y; +} + +void RC4_set_key(RC4_KEY *rc4key, unsigned len, const uint8_t *key) { + uint32_t *d = &rc4key->data[0]; + rc4key->x = 0; + rc4key->y = 0; + + for (unsigned i = 0; i < 256; i++) { + d[i] = i; + } + + unsigned id1 = 0, id2 = 0; + for (unsigned i = 0; i < 256; i++) { + uint32_t tmp = d[i]; + id2 = (key[id1] + tmp + id2) & 0xff; + if (++id1 == len) { + id1 = 0; + } + d[i] = d[id2]; + d[id2] = tmp; + } +} diff --git a/third_party/boringssl/src/crypto/refcount.cc b/third_party/boringssl/src/crypto/refcount.cc new file mode 100644 index 00000000..d5589db0 --- /dev/null +++ b/third_party/boringssl/src/crypto/refcount.cc @@ -0,0 +1,49 @@ +// Copyright 2015 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "internal.h" + +#include +#include + + +using namespace bssl; + +void bssl::CRYPTO_refcount_inc(CRYPTO_refcount_t *count) { + uint32_t expected = count->load(); + + while (expected != CRYPTO_REFCOUNT_MAX) { + uint32_t new_value = expected + 1; + if (count->compare_exchange_weak(expected, new_value)) { + break; + } + } +} + +int bssl::CRYPTO_refcount_dec_and_test_zero(CRYPTO_refcount_t *count) { + uint32_t expected = count->load(); + + for (;;) { + if (expected == 0) { + abort(); + } else if (expected == CRYPTO_REFCOUNT_MAX) { + return 0; + } else { + const uint32_t new_value = expected - 1; + if (count->compare_exchange_weak(expected, new_value)) { + return new_value == 0; + } + } + } +} diff --git a/third_party/boringssl/src/crypto/refcount_c11.c b/third_party/boringssl/src/crypto/refcount_c11.c deleted file mode 100644 index a1781c66..00000000 --- a/third_party/boringssl/src/crypto/refcount_c11.c +++ /dev/null @@ -1,65 +0,0 @@ -/* Copyright (c) 2015, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include "internal.h" - - -#if defined(OPENSSL_C11_ATOMIC) - -#include -#include -#include -#include - - -// See comment above the typedef of CRYPTO_refcount_t about these tests. 
-static_assert(alignof(CRYPTO_refcount_t) == alignof(_Atomic CRYPTO_refcount_t), - "_Atomic alters the needed alignment of a reference count"); -static_assert(sizeof(CRYPTO_refcount_t) == sizeof(_Atomic CRYPTO_refcount_t), - "_Atomic alters the size of a reference count"); - -static_assert((CRYPTO_refcount_t)-1 == CRYPTO_REFCOUNT_MAX, - "CRYPTO_REFCOUNT_MAX is incorrect"); - -void CRYPTO_refcount_inc(CRYPTO_refcount_t *in_count) { - _Atomic CRYPTO_refcount_t *count = (_Atomic CRYPTO_refcount_t *) in_count; - uint32_t expected = atomic_load(count); - - while (expected != CRYPTO_REFCOUNT_MAX) { - uint32_t new_value = expected + 1; - if (atomic_compare_exchange_weak(count, &expected, new_value)) { - break; - } - } -} - -int CRYPTO_refcount_dec_and_test_zero(CRYPTO_refcount_t *in_count) { - _Atomic CRYPTO_refcount_t *count = (_Atomic CRYPTO_refcount_t *)in_count; - uint32_t expected = atomic_load(count); - - for (;;) { - if (expected == 0) { - abort(); - } else if (expected == CRYPTO_REFCOUNT_MAX) { - return 0; - } else { - const uint32_t new_value = expected - 1; - if (atomic_compare_exchange_weak(count, &expected, new_value)) { - return new_value == 0; - } - } - } -} - -#endif // OPENSSL_C11_ATOMIC diff --git a/third_party/boringssl/src/crypto/refcount_lock.c b/third_party/boringssl/src/crypto/refcount_lock.c deleted file mode 100644 index 173267e3..00000000 --- a/third_party/boringssl/src/crypto/refcount_lock.c +++ /dev/null @@ -1,52 +0,0 @@ -/* Copyright (c) 2015, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include "internal.h" - -#include -#include - - -#if !defined(OPENSSL_C11_ATOMIC) - -static_assert((CRYPTO_refcount_t)-1 == CRYPTO_REFCOUNT_MAX, - "CRYPTO_REFCOUNT_MAX is incorrect"); - -static struct CRYPTO_STATIC_MUTEX g_refcount_lock = CRYPTO_STATIC_MUTEX_INIT; - -void CRYPTO_refcount_inc(CRYPTO_refcount_t *count) { - CRYPTO_STATIC_MUTEX_lock_write(&g_refcount_lock); - if (*count < CRYPTO_REFCOUNT_MAX) { - (*count)++; - } - CRYPTO_STATIC_MUTEX_unlock_write(&g_refcount_lock); -} - -int CRYPTO_refcount_dec_and_test_zero(CRYPTO_refcount_t *count) { - int ret; - - CRYPTO_STATIC_MUTEX_lock_write(&g_refcount_lock); - if (*count == 0) { - abort(); - } - if (*count < CRYPTO_REFCOUNT_MAX) { - (*count)--; - } - ret = (*count == 0); - CRYPTO_STATIC_MUTEX_unlock_write(&g_refcount_lock); - - return ret; -} - -#endif // OPENSSL_C11_ATOMIC diff --git a/third_party/boringssl/src/crypto/rsa/internal.h b/third_party/boringssl/src/crypto/rsa/internal.h new file mode 100644 index 00000000..fa56db7b --- /dev/null +++ b/third_party/boringssl/src/crypto/rsa/internal.h @@ -0,0 +1,51 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_CRYPTO_RSA_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_RSA_INTERNAL_H + +#include + +#include "../fipsmodule/rsa/internal.h" + + +BSSL_NAMESPACE_BEGIN + +int RSA_padding_check_PKCS1_OAEP_mgf1(uint8_t *out, size_t *out_len, + size_t max_out, const uint8_t *from, + size_t from_len, const uint8_t *param, + size_t param_len, const EVP_MD *md, + const EVP_MD *mgf1md); + +// rsa_pss_params_get_md returns the hash function used with |params|. This also +// specifies the MGF-1 hash and the salt length because we do not support other +// configurations. +const EVP_MD *rsa_pss_params_get_md(rsa_pss_params_t params); + +// rsa_marshal_pss_params marshals |params| as a DER-encoded RSASSA-PSS-params +// (RFC 4055). It returns one on success and zero on error. If |params| is +// |rsa_pss_params_none|, this function gives an error. +int rsa_marshal_pss_params(CBB *cbb, rsa_pss_params_t params); + +// rsa_marshal_pss_params decodes a DER-encoded RSASSA-PSS-params +// (RFC 4055). It returns one on success and zero on error. On success, it sets +// |*out| to the result. If |allow_explicit_trailer| is non-zero, an explicit +// encoding of the trailerField is allowed, although it is not valid DER. This +// function never outputs |rsa_pss_params_none|. +int rsa_parse_pss_params(CBS *cbs, rsa_pss_params_t *out, + int allow_explicit_trailer); + +BSSL_NAMESPACE_END + +#endif // OPENSSL_HEADER_CRYPTO_RSA_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/rsa/rsa_asn1.cc b/third_party/boringssl/src/crypto/rsa/rsa_asn1.cc new file mode 100644 index 00000000..3869de62 --- /dev/null +++ b/third_party/boringssl/src/crypto/rsa/rsa_asn1.cc @@ -0,0 +1,404 @@ +// Copyright 2000-2016 The OpenSSL Project Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../bytestring/internal.h" +#include "../fipsmodule/rsa/internal.h" +#include "../internal.h" +#include "internal.h" + + +using namespace bssl; + +static int parse_integer(CBS *cbs, UniquePtr *out) { + assert(*out == nullptr); + out->reset(BN_new()); + if (*out == nullptr) { + return 0; + } + return BN_parse_asn1_unsigned(cbs, out->get()); +} + +static int marshal_integer(CBB *cbb, const BIGNUM *bn) { + if (bn == nullptr) { + // An RSA object may be missing some components. 
+ OPENSSL_PUT_ERROR(RSA, RSA_R_VALUE_MISSING); + return 0; + } + return BN_marshal_asn1(cbb, bn); +} + +RSA *RSA_parse_public_key(CBS *cbs) { + RSAImpl *ret = FromOpaque(RSA_new()); + if (ret == nullptr) { + return nullptr; + } + CBS child; + if (!CBS_get_asn1(cbs, &child, CBS_ASN1_SEQUENCE) || + !parse_integer(&child, &ret->n) || + !parse_integer(&child, &ret->e) || + CBS_len(&child) != 0) { + OPENSSL_PUT_ERROR(RSA, RSA_R_BAD_ENCODING); + RSA_free(ret); + return nullptr; + } + + if (!RSA_check_key(ret)) { + OPENSSL_PUT_ERROR(RSA, RSA_R_BAD_RSA_PARAMETERS); + RSA_free(ret); + return nullptr; + } + + return ret; +} + +RSA *RSA_public_key_from_bytes(const uint8_t *in, size_t in_len) { + CBS cbs; + CBS_init(&cbs, in, in_len); + RSA *ret = RSA_parse_public_key(&cbs); + if (ret == nullptr || CBS_len(&cbs) != 0) { + OPENSSL_PUT_ERROR(RSA, RSA_R_BAD_ENCODING); + RSA_free(ret); + return nullptr; + } + return ret; +} + +int RSA_marshal_public_key(CBB *cbb, const RSA *rsa) { + CBB child; + const RSAImpl *impl = FromOpaque(rsa); + if (!CBB_add_asn1(cbb, &child, CBS_ASN1_SEQUENCE) || + !marshal_integer(&child, impl->n.get()) || + !marshal_integer(&child, impl->e.get()) || // + !CBB_flush(cbb)) { + OPENSSL_PUT_ERROR(RSA, RSA_R_ENCODE_ERROR); + return 0; + } + return 1; +} + +int RSA_public_key_to_bytes(uint8_t **out_bytes, size_t *out_len, + const RSA *rsa) { + CBB cbb; + CBB_zero(&cbb); + if (!CBB_init(&cbb, 0) || + !RSA_marshal_public_key(&cbb, rsa) || + !CBB_finish(&cbb, out_bytes, out_len)) { + OPENSSL_PUT_ERROR(RSA, RSA_R_ENCODE_ERROR); + CBB_cleanup(&cbb); + return 0; + } + return 1; +} + +// kVersionTwoPrime is the value of the version field for a two-prime +// RSAPrivateKey structure (RFC 8017). 
+static const uint64_t kVersionTwoPrime = 0; + +RSA *RSA_parse_private_key(CBS *cbs) { + RSAImpl *ret = FromOpaque(RSA_new()); + if (ret == nullptr) { + return nullptr; + } + + CBS child; + uint64_t version; + if (!CBS_get_asn1(cbs, &child, CBS_ASN1_SEQUENCE) || + !CBS_get_asn1_uint64(&child, &version)) { + OPENSSL_PUT_ERROR(RSA, RSA_R_BAD_ENCODING); + goto err; + } + + if (version != kVersionTwoPrime) { + OPENSSL_PUT_ERROR(RSA, RSA_R_BAD_VERSION); + goto err; + } + + if (!parse_integer(&child, &ret->n) || + !parse_integer(&child, &ret->e) || + !parse_integer(&child, &ret->d) || + !parse_integer(&child, &ret->p) || + !parse_integer(&child, &ret->q) || + !parse_integer(&child, &ret->dmp1) || + !parse_integer(&child, &ret->dmq1) || + !parse_integer(&child, &ret->iqmp)) { + goto err; + } + + if (CBS_len(&child) != 0) { + OPENSSL_PUT_ERROR(RSA, RSA_R_BAD_ENCODING); + goto err; + } + + if (!RSA_check_key(ret)) { + OPENSSL_PUT_ERROR(RSA, RSA_R_BAD_RSA_PARAMETERS); + goto err; + } + + return ret; + +err: + RSA_free(ret); + return nullptr; +} + +RSA *RSA_private_key_from_bytes(const uint8_t *in, size_t in_len) { + CBS cbs; + CBS_init(&cbs, in, in_len); + RSA *ret = RSA_parse_private_key(&cbs); + if (ret == nullptr || CBS_len(&cbs) != 0) { + OPENSSL_PUT_ERROR(RSA, RSA_R_BAD_ENCODING); + RSA_free(ret); + return nullptr; + } + return ret; +} + +int RSA_marshal_private_key(CBB *cbb, const RSA *rsa) { + const RSAImpl *impl = FromOpaque(rsa); + CBB child; + if (!CBB_add_asn1(cbb, &child, CBS_ASN1_SEQUENCE) || + !CBB_add_asn1_uint64(&child, kVersionTwoPrime) || + !marshal_integer(&child, impl->n.get()) || + !marshal_integer(&child, impl->e.get()) || + !marshal_integer(&child, impl->d.get()) || + !marshal_integer(&child, impl->p.get()) || + !marshal_integer(&child, impl->q.get()) || + !marshal_integer(&child, impl->dmp1.get()) || + !marshal_integer(&child, impl->dmq1.get()) || + !marshal_integer(&child, impl->iqmp.get()) || // + !CBB_flush(cbb)) { + OPENSSL_PUT_ERROR(RSA, 
RSA_R_ENCODE_ERROR); + return 0; + } + return 1; +} + +int RSA_private_key_to_bytes(uint8_t **out_bytes, size_t *out_len, + const RSA *rsa) { + CBB cbb; + CBB_zero(&cbb); + if (!CBB_init(&cbb, 0) || + !RSA_marshal_private_key(&cbb, rsa) || + !CBB_finish(&cbb, out_bytes, out_len)) { + OPENSSL_PUT_ERROR(RSA, RSA_R_ENCODE_ERROR); + CBB_cleanup(&cbb); + return 0; + } + return 1; +} + +RSA *d2i_RSAPublicKey(RSA **out, const uint8_t **inp, long len) { + return D2IFromCBS(out, inp, len, RSA_parse_public_key); +} + +int i2d_RSAPublicKey(const RSA *in, uint8_t **outp) { + return I2DFromCBB( + /*initial_capacity=*/256, outp, + [&](CBB *cbb) -> bool { return RSA_marshal_public_key(cbb, in); }); +} + +RSA *d2i_RSAPrivateKey(RSA **out, const uint8_t **inp, long len) { + return D2IFromCBS(out, inp, len, RSA_parse_private_key); +} + +int i2d_RSAPrivateKey(const RSA *in, uint8_t **outp) { + return I2DFromCBB( + /*initial_capacity=*/512, outp, + [&](CBB *cbb) -> bool { return RSA_marshal_private_key(cbb, in); }); +} + +RSA *RSAPublicKey_dup(const RSA *rsa) { + uint8_t *der; + size_t der_len; + if (!RSA_public_key_to_bytes(&der, &der_len, rsa)) { + return nullptr; + } + RSA *ret = RSA_public_key_from_bytes(der, der_len); + OPENSSL_free(der); + return ret; +} + +RSA *RSAPrivateKey_dup(const RSA *rsa) { + uint8_t *der; + size_t der_len; + if (!RSA_private_key_to_bytes(&der, &der_len, rsa)) { + return nullptr; + } + RSA *ret = RSA_private_key_from_bytes(der, der_len); + OPENSSL_free(der); + return ret; +} + +static const uint8_t kPSSParamsSHA256[] = { + 0x30, 0x34, 0xa0, 0x0f, 0x30, 0x0d, 0x06, 0x09, 0x60, 0x86, 0x48, + 0x01, 0x65, 0x03, 0x04, 0x02, 0x01, 0x05, 0x00, 0xa1, 0x1c, 0x30, + 0x1a, 0x06, 0x09, 0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x01, 0x01, + 0x08, 0x30, 0x0d, 0x06, 0x09, 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, + 0x04, 0x02, 0x01, 0x05, 0x00, 0xa2, 0x03, 0x02, 0x01, 0x20}; + +static const uint8_t kPSSParamsSHA384[] = { + 0x30, 0x34, 0xa0, 0x0f, 0x30, 0x0d, 0x06, 0x09, 0x60, 
0x86, 0x48, + 0x01, 0x65, 0x03, 0x04, 0x02, 0x02, 0x05, 0x00, 0xa1, 0x1c, 0x30, + 0x1a, 0x06, 0x09, 0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x01, 0x01, + 0x08, 0x30, 0x0d, 0x06, 0x09, 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, + 0x04, 0x02, 0x02, 0x05, 0x00, 0xa2, 0x03, 0x02, 0x01, 0x30}; + +static const uint8_t kPSSParamsSHA512[] = { + 0x30, 0x34, 0xa0, 0x0f, 0x30, 0x0d, 0x06, 0x09, 0x60, 0x86, 0x48, + 0x01, 0x65, 0x03, 0x04, 0x02, 0x03, 0x05, 0x00, 0xa1, 0x1c, 0x30, + 0x1a, 0x06, 0x09, 0x2a, 0x86, 0x48, 0x86, 0xf7, 0x0d, 0x01, 0x01, + 0x08, 0x30, 0x0d, 0x06, 0x09, 0x60, 0x86, 0x48, 0x01, 0x65, 0x03, + 0x04, 0x02, 0x03, 0x05, 0x00, 0xa2, 0x03, 0x02, 0x01, 0x40}; + +const EVP_MD *bssl::rsa_pss_params_get_md(rsa_pss_params_t params) { + switch (params) { + case rsa_pss_none: + return nullptr; + case rsa_pss_sha256: + return EVP_sha256(); + case rsa_pss_sha384: + return EVP_sha384(); + case rsa_pss_sha512: + return EVP_sha512(); + } + abort(); +} + +int bssl::rsa_marshal_pss_params(CBB *cbb, rsa_pss_params_t params) { + Span bytes; + switch (params) { + case rsa_pss_none: + OPENSSL_PUT_ERROR(RSA, ERR_R_INTERNAL_ERROR); + return 0; + case rsa_pss_sha256: + bytes = kPSSParamsSHA256; + break; + case rsa_pss_sha384: + bytes = kPSSParamsSHA384; + break; + case rsa_pss_sha512: + bytes = kPSSParamsSHA512; + break; + } + + return CBB_add_bytes(cbb, bytes.data(), bytes.size()); +} + +// 1.2.840.113549.1.1.8 +static const uint8_t kMGF1OID[] = {0x2a, 0x86, 0x48, 0x86, 0xf7, + 0x0d, 0x01, 0x01, 0x08}; + +int bssl::rsa_parse_pss_params(CBS *cbs, rsa_pss_params_t *out, + int allow_explicit_trailer) { + // See RFC 4055, section 3.1. + // + // hashAlgorithm, maskGenAlgorithm, and saltLength all have DEFAULTs + // corresponding to SHA-1. We do not support SHA-1 with PSS, so we do not + // bother recognizing the omitted versions. 
+ CBS params, hash_wrapper, mask_wrapper, mask_alg, mask_oid, salt_wrapper; + uint64_t salt_len; + if (!CBS_get_asn1(cbs, ¶ms, CBS_ASN1_SEQUENCE) || + !CBS_get_asn1(¶ms, &hash_wrapper, + CBS_ASN1_CONSTRUCTED | CBS_ASN1_CONTEXT_SPECIFIC | 0) || + // |hash_wrapper| will be parsed below. + !CBS_get_asn1(¶ms, &mask_wrapper, + CBS_ASN1_CONSTRUCTED | CBS_ASN1_CONTEXT_SPECIFIC | 1) || + !CBS_get_asn1(&mask_wrapper, &mask_alg, CBS_ASN1_SEQUENCE) || + !CBS_get_asn1(&mask_alg, &mask_oid, CBS_ASN1_OBJECT) || + // We only support MGF-1. + Span(mask_oid) != kMGF1OID || + // The remainder of |mask_alg| will be parsed below. + CBS_len(&mask_wrapper) != 0 || + !CBS_get_asn1(¶ms, &salt_wrapper, + CBS_ASN1_CONSTRUCTED | CBS_ASN1_CONTEXT_SPECIFIC | 2) || + !CBS_get_asn1_uint64(&salt_wrapper, &salt_len) || + CBS_len(&salt_wrapper) != 0) { + OPENSSL_PUT_ERROR(RSA, RSA_R_BAD_ENCODING); + return 0; + } + + // The trailer field must be 1 (0xbc). This value is DEFAULT, so the structure + // is required to omit it in DER. + if (CBS_len(¶ms) != 0 && allow_explicit_trailer) { + CBS trailer_wrapper; + uint64_t trailer; + if (!CBS_get_asn1(¶ms, &trailer_wrapper, + CBS_ASN1_CONSTRUCTED | CBS_ASN1_CONTEXT_SPECIFIC | 3) || + !CBS_get_asn1_uint64(&trailer_wrapper, &trailer) || // + trailer != 1) { + OPENSSL_PUT_ERROR(RSA, RSA_R_BAD_ENCODING); + return 0; + } + } + if (CBS_len(¶ms) != 0) { + OPENSSL_PUT_ERROR(RSA, RSA_R_BAD_ENCODING); + return 0; + } + + int hash_nid = EVP_parse_digest_algorithm_nid(&hash_wrapper); + if (hash_nid == NID_undef || CBS_len(&hash_wrapper) != 0) { + OPENSSL_PUT_ERROR(RSA, RSA_R_BAD_ENCODING); + return 0; + } + + // We only support combinations where the MGF-1 hash matches the overall hash. + int mgf1_hash_nid = EVP_parse_digest_algorithm_nid(&mask_alg); + if (mgf1_hash_nid != hash_nid || CBS_len(&mask_alg) != 0) { + OPENSSL_PUT_ERROR(RSA, RSA_R_BAD_ENCODING); + return 0; + } + + // We only support salt lengths that match the hash length. 
+ rsa_pss_params_t ret; + uint64_t hash_len; + switch (hash_nid) { + case NID_sha256: + ret = rsa_pss_sha256; + hash_len = 32; + break; + case NID_sha384: + ret = rsa_pss_sha384; + hash_len = 48; + break; + case NID_sha512: + ret = rsa_pss_sha512; + hash_len = 64; + break; + default: + OPENSSL_PUT_ERROR(RSA, RSA_R_BAD_ENCODING); + return 0; + } + if (salt_len != hash_len) { + OPENSSL_PUT_ERROR(RSA, RSA_R_BAD_ENCODING); + return 0; + } + + *out = ret; + return 1; +} diff --git a/third_party/boringssl/src/crypto/rsa/rsa_crypt.cc b/third_party/boringssl/src/crypto/rsa/rsa_crypt.cc new file mode 100644 index 00000000..16085c53 --- /dev/null +++ b/third_party/boringssl/src/crypto/rsa/rsa_crypt.cc @@ -0,0 +1,530 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include "../fipsmodule/bn/internal.h" +#include "../fipsmodule/rsa/internal.h" +#include "../internal.h" +#include "internal.h" + + +using namespace bssl; + +static void rand_nonzero(uint8_t *out, size_t len) { + RAND_bytes(out, len); + + for (size_t i = 0; i < len; i++) { + // Zero values are replaced, and the distribution of zero and non-zero bytes + // is public, so leaking this is safe. 
+ while (constant_time_declassify_int(out[i] == 0)) { + RAND_bytes(out + i, 1); + } + } +} + +int RSA_padding_add_PKCS1_OAEP_mgf1(uint8_t *to, size_t to_len, + const uint8_t *from, size_t from_len, + const uint8_t *param, size_t param_len, + const EVP_MD *md, const EVP_MD *mgf1md) { + if (md == nullptr) { + md = EVP_sha1(); + } + if (mgf1md == nullptr) { + mgf1md = md; + } + + size_t mdlen = EVP_MD_size(md); + + if (to_len < 2 * mdlen + 2) { + OPENSSL_PUT_ERROR(RSA, RSA_R_KEY_SIZE_TOO_SMALL); + return 0; + } + + size_t emlen = to_len - 1; + if (from_len > emlen - 2 * mdlen - 1) { + OPENSSL_PUT_ERROR(RSA, RSA_R_DATA_TOO_LARGE_FOR_KEY_SIZE); + return 0; + } + + if (emlen < 2 * mdlen + 1) { + OPENSSL_PUT_ERROR(RSA, RSA_R_KEY_SIZE_TOO_SMALL); + return 0; + } + + to[0] = 0; + uint8_t *seed = to + 1; + uint8_t *db = to + mdlen + 1; + + uint8_t *dbmask = nullptr; + int ret = 0; + if (!EVP_Digest(param, param_len, db, nullptr, md, nullptr)) { + goto out; + } + OPENSSL_memset(db + mdlen, 0, emlen - from_len - 2 * mdlen - 1); + db[emlen - from_len - mdlen - 1] = 0x01; + OPENSSL_memcpy(db + emlen - from_len - mdlen, from, from_len); + if (!RAND_bytes(seed, mdlen)) { + goto out; + } + + dbmask = reinterpret_cast(OPENSSL_malloc(emlen - mdlen)); + if (dbmask == nullptr) { + goto out; + } + + if (!PKCS1_MGF1(dbmask, emlen - mdlen, seed, mdlen, mgf1md)) { + goto out; + } + for (size_t i = 0; i < emlen - mdlen; i++) { + db[i] ^= dbmask[i]; + } + + uint8_t seedmask[EVP_MAX_MD_SIZE]; + if (!PKCS1_MGF1(seedmask, mdlen, db, emlen - mdlen, mgf1md)) { + goto out; + } + for (size_t i = 0; i < mdlen; i++) { + seed[i] ^= seedmask[i]; + } + ret = 1; + +out: + OPENSSL_free(dbmask); + return ret; +} + +int bssl::RSA_padding_check_PKCS1_OAEP_mgf1(uint8_t *out, size_t *out_len, + size_t max_out, const uint8_t *from, + size_t from_len, + const uint8_t *param, + size_t param_len, const EVP_MD *md, + const EVP_MD *mgf1md) { + uint8_t *db = nullptr; + + { + if (md == nullptr) { + md = EVP_sha1(); + 
} + if (mgf1md == nullptr) { + mgf1md = md; + } + + size_t mdlen = EVP_MD_size(md); + + // The encoded message is one byte smaller than the modulus to ensure that + // it doesn't end up greater than the modulus. Thus there's an extra "+1" + // here compared to https://tools.ietf.org/html/rfc2437#section-9.1.1.2. + if (from_len < 1 + 2 * mdlen + 1) { + // 'from_len' is the length of the modulus, i.e. does not depend on the + // particular ciphertext. + goto decoding_err; + } + + size_t dblen = from_len - mdlen - 1; + db = reinterpret_cast(OPENSSL_malloc(dblen)); + if (db == nullptr) { + goto err; + } + + const uint8_t *maskedseed = from + 1; + const uint8_t *maskeddb = from + 1 + mdlen; + + uint8_t seed[EVP_MAX_MD_SIZE]; + if (!PKCS1_MGF1(seed, mdlen, maskeddb, dblen, mgf1md)) { + goto err; + } + for (size_t i = 0; i < mdlen; i++) { + seed[i] ^= maskedseed[i]; + } + + if (!PKCS1_MGF1(db, dblen, seed, mdlen, mgf1md)) { + goto err; + } + for (size_t i = 0; i < dblen; i++) { + db[i] ^= maskeddb[i]; + } + + uint8_t phash[EVP_MAX_MD_SIZE]; + if (!EVP_Digest(param, param_len, phash, nullptr, md, nullptr)) { + goto err; + } + + crypto_word_t bad = + ~constant_time_is_zero_w(CRYPTO_memcmp(db, phash, mdlen)); + bad |= ~constant_time_is_zero_w(from[0]); + + crypto_word_t looking_for_one_byte = CONSTTIME_TRUE_W; + size_t one_index = 0; + for (size_t i = mdlen; i < dblen; i++) { + crypto_word_t equals1 = constant_time_eq_w(db[i], 1); + crypto_word_t equals0 = constant_time_eq_w(db[i], 0); + one_index = + constant_time_select_w(looking_for_one_byte & equals1, i, one_index); + looking_for_one_byte = + constant_time_select_w(equals1, 0, looking_for_one_byte); + bad |= looking_for_one_byte & ~equals0; + } + + bad |= looking_for_one_byte; + + // Whether the overall padding was valid or not in OAEP is public. + if (constant_time_declassify_w(bad)) { + goto decoding_err; + } + + // Once the padding is known to be valid, the output length is also public. 
+ static_assert(sizeof(size_t) <= sizeof(crypto_word_t), + "size_t does not fit in crypto_word_t"); + one_index = constant_time_declassify_w(one_index); + + one_index++; + size_t mlen = dblen - one_index; + if (max_out < mlen) { + OPENSSL_PUT_ERROR(RSA, RSA_R_DATA_TOO_LARGE); + goto err; + } + + OPENSSL_memcpy(out, db + one_index, mlen); + *out_len = mlen; + OPENSSL_free(db); + return 1; + } + +decoding_err: + // To avoid chosen ciphertext attacks, the error message should not reveal + // which kind of decoding error happened. + OPENSSL_PUT_ERROR(RSA, RSA_R_OAEP_DECODING_ERROR); +err: + OPENSSL_free(db); + return 0; +} + +static int rsa_padding_add_PKCS1_type_2(uint8_t *to, size_t to_len, + const uint8_t *from, size_t from_len) { + // See RFC 8017, section 7.2.1. + if (to_len < RSA_PKCS1_PADDING_SIZE) { + OPENSSL_PUT_ERROR(RSA, RSA_R_KEY_SIZE_TOO_SMALL); + return 0; + } + + if (from_len > to_len - RSA_PKCS1_PADDING_SIZE) { + OPENSSL_PUT_ERROR(RSA, RSA_R_DATA_TOO_LARGE_FOR_KEY_SIZE); + return 0; + } + + to[0] = 0; + to[1] = 2; + + size_t padding_len = to_len - 3 - from_len; + rand_nonzero(to + 2, padding_len); + to[2 + padding_len] = 0; + OPENSSL_memcpy(to + to_len - from_len, from, from_len); + return 1; +} + +static int rsa_padding_check_PKCS1_type_2(uint8_t *out, size_t *out_len, + size_t max_out, const uint8_t *from, + size_t from_len) { + if (from_len == 0) { + OPENSSL_PUT_ERROR(RSA, RSA_R_EMPTY_PUBLIC_KEY); + return 0; + } + + // PKCS#1 v1.5 decryption. See "PKCS #1 v2.2: RSA Cryptography + // Standard", section 7.2.2. + if (from_len < RSA_PKCS1_PADDING_SIZE) { + // |from| is zero-padded to the size of the RSA modulus, a public value, so + // this can be rejected in non-constant time. 
+ OPENSSL_PUT_ERROR(RSA, RSA_R_KEY_SIZE_TOO_SMALL); + return 0; + } + + crypto_word_t first_byte_is_zero = constant_time_eq_w(from[0], 0); + crypto_word_t second_byte_is_two = constant_time_eq_w(from[1], 2); + + crypto_word_t zero_index = 0, looking_for_index = CONSTTIME_TRUE_W; + for (size_t i = 2; i < from_len; i++) { + crypto_word_t equals0 = constant_time_is_zero_w(from[i]); + zero_index = + constant_time_select_w(looking_for_index & equals0, i, zero_index); + looking_for_index = constant_time_select_w(equals0, 0, looking_for_index); + } + + // The input must begin with 00 02. + crypto_word_t valid_index = first_byte_is_zero; + valid_index &= second_byte_is_two; + + // We must have found the end of PS. + valid_index &= ~looking_for_index; + + // PS must be at least 8 bytes long, and it starts two bytes into |from|. + valid_index &= constant_time_ge_w(zero_index, 2 + 8); + + // Skip the zero byte. + zero_index++; + + // NOTE: Although this logic attempts to be constant time, the API contracts + // of this function and |RSA_decrypt| with |RSA_PKCS1_PADDING| make it + // impossible to completely avoid Bleichenbacher's attack. Consumers should + // use |RSA_PADDING_NONE| and perform the padding check in constant-time + // combined with a swap to a random session key or other mitigation. + CONSTTIME_DECLASSIFY(&valid_index, sizeof(valid_index)); + CONSTTIME_DECLASSIFY(&zero_index, sizeof(zero_index)); + + if (!valid_index) { + OPENSSL_PUT_ERROR(RSA, RSA_R_PKCS_DECODING_ERROR); + return 0; + } + + const size_t msg_len = from_len - zero_index; + if (msg_len > max_out) { + // This shouldn't happen because this function is always called with + // |max_out| as the key size and |from_len| is bounded by the key size. 
+ OPENSSL_PUT_ERROR(RSA, RSA_R_PKCS_DECODING_ERROR); + return 0; + } + + OPENSSL_memcpy(out, &from[zero_index], msg_len); + *out_len = msg_len; + return 1; +} + +int RSA_public_encrypt(size_t flen, const uint8_t *from, uint8_t *to, RSA *rsa, + int padding) { + size_t out_len; + + if (!RSA_encrypt(rsa, &out_len, to, RSA_size(rsa), from, flen, padding)) { + return -1; + } + + if (out_len > INT_MAX) { + OPENSSL_PUT_ERROR(RSA, ERR_R_OVERFLOW); + return -1; + } + return (int)out_len; +} + +int RSA_private_encrypt(size_t flen, const uint8_t *from, uint8_t *to, RSA *rsa, + int padding) { + size_t out_len; + + if (!RSA_sign_raw(rsa, &out_len, to, RSA_size(rsa), from, flen, padding)) { + return -1; + } + + if (out_len > INT_MAX) { + OPENSSL_PUT_ERROR(RSA, ERR_R_OVERFLOW); + return -1; + } + return (int)out_len; +} + +int RSA_encrypt(RSA *rsa, size_t *out_len, uint8_t *out, size_t max_out, + const uint8_t *in, size_t in_len, int padding) { + auto *impl = FromOpaque(rsa); + + if (impl->n == nullptr || impl->e == nullptr) { + OPENSSL_PUT_ERROR(RSA, RSA_R_VALUE_MISSING); + return 0; + } + + if (!rsa_check_public_key(rsa)) { + return 0; + } + + const unsigned rsa_size = RSA_size(rsa); + if (max_out < rsa_size) { + OPENSSL_PUT_ERROR(RSA, RSA_R_OUTPUT_BUFFER_TOO_SMALL); + return 0; + } + + UniquePtr ctx(BN_CTX_new()); + if (ctx == nullptr) { + return 0; + } + + BN_CTXScope scope(ctx.get()); + BIGNUM *f = BN_CTX_get(ctx.get()); + BIGNUM *result = BN_CTX_get(ctx.get()); + uint8_t *buf = reinterpret_cast(OPENSSL_malloc(rsa_size)); + int i, ret = 0; + if (!f || !result || !buf) { + goto err; + } + + switch (padding) { + case RSA_PKCS1_PADDING: + i = rsa_padding_add_PKCS1_type_2(buf, rsa_size, in, in_len); + break; + case RSA_PKCS1_OAEP_PADDING: + // Use the default parameters: SHA-1 for both hashes and no label. 
+ i = RSA_padding_add_PKCS1_OAEP_mgf1(buf, rsa_size, in, in_len, nullptr, 0, + nullptr, nullptr); + break; + case RSA_NO_PADDING: + i = RSA_padding_add_none(buf, rsa_size, in, in_len); + break; + default: + OPENSSL_PUT_ERROR(RSA, RSA_R_UNKNOWN_PADDING_TYPE); + goto err; + } + + if (i <= 0) { + goto err; + } + + if (BN_bin2bn(buf, rsa_size, f) == nullptr) { + goto err; + } + + if (BN_ucmp(f, impl->n.get()) >= 0) { + // usually the padding functions would catch this + OPENSSL_PUT_ERROR(RSA, RSA_R_DATA_TOO_LARGE_FOR_MODULUS); + goto err; + } + + if (!BN_MONT_CTX_set_locked(&impl->mont_n, &impl->lock, impl->n.get(), + ctx.get()) || + !BN_mod_exp_mont(result, f, impl->e.get(), &impl->mont_n->N, ctx.get(), + impl->mont_n.get())) { + goto err; + } + + // put in leading 0 bytes if the number is less than the length of the + // modulus + if (!BN_bn2bin_padded(out, rsa_size, result)) { + OPENSSL_PUT_ERROR(RSA, ERR_R_INTERNAL_ERROR); + goto err; + } + + *out_len = rsa_size; + ret = 1; + +err: + OPENSSL_free(buf); + return ret; +} + +static int rsa_default_decrypt(RSA *rsa, size_t *out_len, uint8_t *out, + size_t max_out, const uint8_t *in, size_t in_len, + int padding) { + const unsigned rsa_size = RSA_size(rsa); + uint8_t *buf = nullptr; + int ret = 0; + + if (max_out < rsa_size) { + OPENSSL_PUT_ERROR(RSA, RSA_R_OUTPUT_BUFFER_TOO_SMALL); + return 0; + } + + if (padding == RSA_NO_PADDING) { + buf = out; + } else { + // Allocate a temporary buffer to hold the padded plaintext. 
+ buf = reinterpret_cast(OPENSSL_malloc(rsa_size)); + if (buf == nullptr) { + goto err; + } + } + + if (in_len != rsa_size) { + OPENSSL_PUT_ERROR(RSA, RSA_R_DATA_LEN_NOT_EQUAL_TO_MOD_LEN); + goto err; + } + + if (!rsa_private_transform(rsa, buf, in, rsa_size)) { + goto err; + } + + switch (padding) { + case RSA_PKCS1_PADDING: + ret = + rsa_padding_check_PKCS1_type_2(out, out_len, rsa_size, buf, rsa_size); + break; + case RSA_PKCS1_OAEP_PADDING: + // Use the default parameters: SHA-1 for both hashes and no label. + ret = RSA_padding_check_PKCS1_OAEP_mgf1( + out, out_len, rsa_size, buf, rsa_size, nullptr, 0, nullptr, nullptr); + break; + case RSA_NO_PADDING: + *out_len = rsa_size; + ret = 1; + break; + default: + OPENSSL_PUT_ERROR(RSA, RSA_R_UNKNOWN_PADDING_TYPE); + goto err; + } + + CONSTTIME_DECLASSIFY(&ret, sizeof(ret)); + if (!ret) { + OPENSSL_PUT_ERROR(RSA, RSA_R_PADDING_CHECK_FAILED); + } else { + CONSTTIME_DECLASSIFY(out, *out_len); + } + +err: + if (padding != RSA_NO_PADDING) { + OPENSSL_free(buf); + } + + return ret; +} + +int RSA_decrypt(RSA *rsa, size_t *out_len, uint8_t *out, size_t max_out, + const uint8_t *in, size_t in_len, int padding) { + auto *impl = FromOpaque(rsa); + if (impl->meth->decrypt) { + return impl->meth->decrypt(rsa, out_len, out, max_out, in, in_len, padding); + } + + return rsa_default_decrypt(rsa, out_len, out, max_out, in, in_len, padding); +} + +int RSA_private_decrypt(size_t flen, const uint8_t *from, uint8_t *to, RSA *rsa, + int padding) { + size_t out_len; + if (!RSA_decrypt(rsa, &out_len, to, RSA_size(rsa), from, flen, padding)) { + return -1; + } + + if (out_len > INT_MAX) { + OPENSSL_PUT_ERROR(RSA, ERR_R_OVERFLOW); + return -1; + } + return (int)out_len; +} + +int RSA_public_decrypt(size_t flen, const uint8_t *from, uint8_t *to, RSA *rsa, + int padding) { + size_t out_len; + if (!RSA_verify_raw(rsa, &out_len, to, RSA_size(rsa), from, flen, padding)) { + return -1; + } + + if (out_len > INT_MAX) { + OPENSSL_PUT_ERROR(RSA, 
ERR_R_OVERFLOW); + return -1; + } + return (int)out_len; +} diff --git a/third_party/boringssl/src/crypto/rsa/rsa_extra.cc b/third_party/boringssl/src/crypto/rsa/rsa_extra.cc new file mode 100644 index 00000000..58015bd4 --- /dev/null +++ b/third_party/boringssl/src/crypto/rsa/rsa_extra.cc @@ -0,0 +1,30 @@ +// Copyright 2024 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +int RSA_blinding_on(RSA *rsa, BN_CTX *ctx) { return 1; } + +void RSA_blinding_off(RSA *rsa) {} + +const RSA_PSS_PARAMS *RSA_get0_pss_params(const RSA *rsa) { + // We do not currently implement this function. By default, we will not parse + // |EVP_PKEY_RSA_PSS|. Callers that opt in with a BoringSSL-specific API are + // currently assumed to not need this function. Callers that need that opt-in + // and this functionality should contact the BoringSSL team. + // + // If we do add support later, the |maskHash| field should be filled in for + // OpenSSL compatibility. + return nullptr; +} diff --git a/third_party/boringssl/src/crypto/rsa/rsa_print.cc b/third_party/boringssl/src/crypto/rsa/rsa_print.cc new file mode 100644 index 00000000..52d77321 --- /dev/null +++ b/third_party/boringssl/src/crypto/rsa/rsa_print.cc @@ -0,0 +1,25 @@ +// Copyright 2006-2017 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + + +int RSA_print(BIO *bio, const RSA *rsa, int indent) { + bssl::UniquePtr pkey(EVP_PKEY_new()); + return pkey != nullptr && + EVP_PKEY_set1_RSA(pkey.get(), const_cast(rsa)) && + EVP_PKEY_print_private(bio, pkey.get(), indent, nullptr); +} diff --git a/third_party/boringssl/src/crypto/rsa_extra/rsa_asn1.c b/third_party/boringssl/src/crypto/rsa_extra/rsa_asn1.c deleted file mode 100644 index 58fd69a8..00000000 --- a/third_party/boringssl/src/crypto/rsa_extra/rsa_asn1.c +++ /dev/null @@ -1,324 +0,0 @@ -/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL - * project 2000. - */ -/* ==================================================================== - * Copyright (c) 2000-2005 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" - * - * 4. 
The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * licensing@OpenSSL.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). 
*/ - -#include - -#include -#include -#include - -#include -#include -#include -#include - -#include "../fipsmodule/rsa/internal.h" -#include "../bytestring/internal.h" -#include "../internal.h" - - -static int parse_integer(CBS *cbs, BIGNUM **out) { - assert(*out == NULL); - *out = BN_new(); - if (*out == NULL) { - return 0; - } - return BN_parse_asn1_unsigned(cbs, *out); -} - -static int marshal_integer(CBB *cbb, BIGNUM *bn) { - if (bn == NULL) { - // An RSA object may be missing some components. - OPENSSL_PUT_ERROR(RSA, RSA_R_VALUE_MISSING); - return 0; - } - return BN_marshal_asn1(cbb, bn); -} - -RSA *RSA_parse_public_key(CBS *cbs) { - RSA *ret = RSA_new(); - if (ret == NULL) { - return NULL; - } - CBS child; - if (!CBS_get_asn1(cbs, &child, CBS_ASN1_SEQUENCE) || - !parse_integer(&child, &ret->n) || - !parse_integer(&child, &ret->e) || - CBS_len(&child) != 0) { - OPENSSL_PUT_ERROR(RSA, RSA_R_BAD_ENCODING); - RSA_free(ret); - return NULL; - } - - if (!RSA_check_key(ret)) { - OPENSSL_PUT_ERROR(RSA, RSA_R_BAD_RSA_PARAMETERS); - RSA_free(ret); - return NULL; - } - - return ret; -} - -RSA *RSA_public_key_from_bytes(const uint8_t *in, size_t in_len) { - CBS cbs; - CBS_init(&cbs, in, in_len); - RSA *ret = RSA_parse_public_key(&cbs); - if (ret == NULL || CBS_len(&cbs) != 0) { - OPENSSL_PUT_ERROR(RSA, RSA_R_BAD_ENCODING); - RSA_free(ret); - return NULL; - } - return ret; -} - -int RSA_marshal_public_key(CBB *cbb, const RSA *rsa) { - CBB child; - if (!CBB_add_asn1(cbb, &child, CBS_ASN1_SEQUENCE) || - !marshal_integer(&child, rsa->n) || - !marshal_integer(&child, rsa->e) || - !CBB_flush(cbb)) { - OPENSSL_PUT_ERROR(RSA, RSA_R_ENCODE_ERROR); - return 0; - } - return 1; -} - -int RSA_public_key_to_bytes(uint8_t **out_bytes, size_t *out_len, - const RSA *rsa) { - CBB cbb; - CBB_zero(&cbb); - if (!CBB_init(&cbb, 0) || - !RSA_marshal_public_key(&cbb, rsa) || - !CBB_finish(&cbb, out_bytes, out_len)) { - OPENSSL_PUT_ERROR(RSA, RSA_R_ENCODE_ERROR); - CBB_cleanup(&cbb); - return 0; 
- } - return 1; -} - -// kVersionTwoPrime is the value of the version field for a two-prime -// RSAPrivateKey structure (RFC 3447). -static const uint64_t kVersionTwoPrime = 0; - -RSA *RSA_parse_private_key(CBS *cbs) { - RSA *ret = RSA_new(); - if (ret == NULL) { - return NULL; - } - - CBS child; - uint64_t version; - if (!CBS_get_asn1(cbs, &child, CBS_ASN1_SEQUENCE) || - !CBS_get_asn1_uint64(&child, &version)) { - OPENSSL_PUT_ERROR(RSA, RSA_R_BAD_ENCODING); - goto err; - } - - if (version != kVersionTwoPrime) { - OPENSSL_PUT_ERROR(RSA, RSA_R_BAD_VERSION); - goto err; - } - - if (!parse_integer(&child, &ret->n) || - !parse_integer(&child, &ret->e) || - !parse_integer(&child, &ret->d) || - !parse_integer(&child, &ret->p) || - !parse_integer(&child, &ret->q) || - !parse_integer(&child, &ret->dmp1) || - !parse_integer(&child, &ret->dmq1) || - !parse_integer(&child, &ret->iqmp)) { - goto err; - } - - if (CBS_len(&child) != 0) { - OPENSSL_PUT_ERROR(RSA, RSA_R_BAD_ENCODING); - goto err; - } - - if (!RSA_check_key(ret)) { - OPENSSL_PUT_ERROR(RSA, RSA_R_BAD_RSA_PARAMETERS); - goto err; - } - - return ret; - -err: - RSA_free(ret); - return NULL; -} - -RSA *RSA_private_key_from_bytes(const uint8_t *in, size_t in_len) { - CBS cbs; - CBS_init(&cbs, in, in_len); - RSA *ret = RSA_parse_private_key(&cbs); - if (ret == NULL || CBS_len(&cbs) != 0) { - OPENSSL_PUT_ERROR(RSA, RSA_R_BAD_ENCODING); - RSA_free(ret); - return NULL; - } - return ret; -} - -int RSA_marshal_private_key(CBB *cbb, const RSA *rsa) { - CBB child; - if (!CBB_add_asn1(cbb, &child, CBS_ASN1_SEQUENCE) || - !CBB_add_asn1_uint64(&child, kVersionTwoPrime) || - !marshal_integer(&child, rsa->n) || - !marshal_integer(&child, rsa->e) || - !marshal_integer(&child, rsa->d) || - !marshal_integer(&child, rsa->p) || - !marshal_integer(&child, rsa->q) || - !marshal_integer(&child, rsa->dmp1) || - !marshal_integer(&child, rsa->dmq1) || - !marshal_integer(&child, rsa->iqmp) || - !CBB_flush(cbb)) { - OPENSSL_PUT_ERROR(RSA, 
RSA_R_ENCODE_ERROR); - return 0; - } - return 1; -} - -int RSA_private_key_to_bytes(uint8_t **out_bytes, size_t *out_len, - const RSA *rsa) { - CBB cbb; - CBB_zero(&cbb); - if (!CBB_init(&cbb, 0) || - !RSA_marshal_private_key(&cbb, rsa) || - !CBB_finish(&cbb, out_bytes, out_len)) { - OPENSSL_PUT_ERROR(RSA, RSA_R_ENCODE_ERROR); - CBB_cleanup(&cbb); - return 0; - } - return 1; -} - -RSA *d2i_RSAPublicKey(RSA **out, const uint8_t **inp, long len) { - if (len < 0) { - return NULL; - } - CBS cbs; - CBS_init(&cbs, *inp, (size_t)len); - RSA *ret = RSA_parse_public_key(&cbs); - if (ret == NULL) { - return NULL; - } - if (out != NULL) { - RSA_free(*out); - *out = ret; - } - *inp = CBS_data(&cbs); - return ret; -} - -int i2d_RSAPublicKey(const RSA *in, uint8_t **outp) { - CBB cbb; - if (!CBB_init(&cbb, 0) || - !RSA_marshal_public_key(&cbb, in)) { - CBB_cleanup(&cbb); - return -1; - } - return CBB_finish_i2d(&cbb, outp); -} - -RSA *d2i_RSAPrivateKey(RSA **out, const uint8_t **inp, long len) { - if (len < 0) { - return NULL; - } - CBS cbs; - CBS_init(&cbs, *inp, (size_t)len); - RSA *ret = RSA_parse_private_key(&cbs); - if (ret == NULL) { - return NULL; - } - if (out != NULL) { - RSA_free(*out); - *out = ret; - } - *inp = CBS_data(&cbs); - return ret; -} - -int i2d_RSAPrivateKey(const RSA *in, uint8_t **outp) { - CBB cbb; - if (!CBB_init(&cbb, 0) || - !RSA_marshal_private_key(&cbb, in)) { - CBB_cleanup(&cbb); - return -1; - } - return CBB_finish_i2d(&cbb, outp); -} - -RSA *RSAPublicKey_dup(const RSA *rsa) { - uint8_t *der; - size_t der_len; - if (!RSA_public_key_to_bytes(&der, &der_len, rsa)) { - return NULL; - } - RSA *ret = RSA_public_key_from_bytes(der, der_len); - OPENSSL_free(der); - return ret; -} - -RSA *RSAPrivateKey_dup(const RSA *rsa) { - uint8_t *der; - size_t der_len; - if (!RSA_private_key_to_bytes(&der, &der_len, rsa)) { - return NULL; - } - RSA *ret = RSA_private_key_from_bytes(der, der_len); - OPENSSL_free(der); - return ret; -} diff --git 
a/third_party/boringssl/src/crypto/rsa_extra/rsa_print.c b/third_party/boringssl/src/crypto/rsa_extra/rsa_print.c deleted file mode 100644 index 71970b8e..00000000 --- a/third_party/boringssl/src/crypto/rsa_extra/rsa_print.c +++ /dev/null @@ -1,22 +0,0 @@ -/* - * Copyright 2006-2017 The OpenSSL Project Authors. All Rights Reserved. - * - * Licensed under the OpenSSL license (the "License"). You may not use - * this file except in compliance with the License. You can obtain a copy - * in the file LICENSE in the source distribution or at - * https://www.openssl.org/source/license.html - */ - -#include - -#include - - -int RSA_print(BIO *bio, const RSA *rsa, int indent) { - EVP_PKEY *pkey = EVP_PKEY_new(); - int ret = pkey != NULL && - EVP_PKEY_set1_RSA(pkey, (RSA *)rsa) && - EVP_PKEY_print_private(bio, pkey, indent, NULL); - EVP_PKEY_free(pkey); - return ret; -} diff --git a/third_party/boringssl/src/crypto/sha/sha1.cc b/third_party/boringssl/src/crypto/sha/sha1.cc new file mode 100644 index 00000000..6b06961b --- /dev/null +++ b/third_party/boringssl/src/crypto/sha/sha1.cc @@ -0,0 +1,55 @@ +// Copyright 2024 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include + +#include "../fipsmodule/bcm_interface.h" + + +using namespace bssl; + +int SHA1_Init(SHA_CTX *sha) { + BCM_sha1_init(sha); + return 1; +} + +int SHA1_Update(SHA_CTX *sha, const void *data, size_t len) { + BCM_sha1_update(sha, data, len); + return 1; +} + +int SHA1_Final(uint8_t out[SHA_DIGEST_LENGTH], SHA_CTX *sha) { + BCM_sha1_final(out, sha); + return 1; +} + +uint8_t *SHA1(const uint8_t *data, size_t len, uint8_t out[SHA_DIGEST_LENGTH]) { + SHA_CTX ctx; + BCM_sha1_init(&ctx); + BCM_sha1_update(&ctx, data, len); + BCM_sha1_final(out, &ctx); + OPENSSL_cleanse(&ctx, sizeof(ctx)); + return out; +} + +void SHA1_Transform(SHA_CTX *sha, const uint8_t block[SHA_CBLOCK]) { + BCM_sha1_transform(sha, block); +} + +void CRYPTO_fips_186_2_prf(uint8_t *out, size_t out_len, + const uint8_t xkey[SHA_DIGEST_LENGTH]) { + BCM_fips_186_2_prf(out, out_len, xkey); +} diff --git a/third_party/boringssl/src/crypto/sha/sha256.cc b/third_party/boringssl/src/crypto/sha/sha256.cc new file mode 100644 index 00000000..3ef66bce --- /dev/null +++ b/third_party/boringssl/src/crypto/sha/sha256.cc @@ -0,0 +1,89 @@ +// Copyright 2024 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include + +#include "../fipsmodule/bcm_interface.h" + + +using namespace bssl; + +int SHA224_Init(SHA256_CTX *sha) { + BCM_sha224_init(sha); + return 1; +} + +int SHA224_Update(SHA256_CTX *sha, const void *data, size_t len) { + BCM_sha224_update(sha, data, len); + return 1; +} + +int SHA224_Final(uint8_t out[SHA224_DIGEST_LENGTH], SHA256_CTX *sha) { + BCM_sha224_final(out, sha); + return 1; +} + +uint8_t *SHA224(const uint8_t *data, size_t len, + uint8_t out[SHA224_DIGEST_LENGTH]) { + SHA256_CTX ctx; + BCM_sha224_init(&ctx); + BCM_sha224_update(&ctx, data, len); + BCM_sha224_final(out, &ctx); + OPENSSL_cleanse(&ctx, sizeof(ctx)); + return out; +} + +int SHA256_Init(SHA256_CTX *sha) { + BCM_sha256_init(sha); + return 1; +} + +int SHA256_Update(SHA256_CTX *sha, const void *data, size_t len) { + BCM_sha256_update(sha, data, len); + return 1; +} + +int SHA256_Final(uint8_t out[SHA256_DIGEST_LENGTH], SHA256_CTX *sha) { + // TODO(bbe): This overflow check one of the few places a low-level hash + // 'final' function can fail. SHA-512 does not have a corresponding check. + // The BCM function is infallible and will abort if this is done incorrectly. + // we should verify nothing crashes with this removed and eliminate the 0 + // return. 
+ if (sha->md_len > SHA256_DIGEST_LENGTH) { + return 0; + } + BCM_sha256_final(out, sha); + return 1; +} + +uint8_t *SHA256(const uint8_t *data, size_t len, + uint8_t out[SHA256_DIGEST_LENGTH]) { + SHA256_CTX ctx; + BCM_sha256_init(&ctx); + BCM_sha256_update(&ctx, data, len); + BCM_sha256_final(out, &ctx); + OPENSSL_cleanse(&ctx, sizeof(ctx)); + return out; +} + +void SHA256_Transform(SHA256_CTX *sha, const uint8_t block[SHA256_CBLOCK]) { + BCM_sha256_transform(sha, block); +} + +void SHA256_TransformBlocks(uint32_t state[8], const uint8_t *data, + size_t num_blocks) { + BCM_sha256_transform_blocks(state, data, num_blocks); +} diff --git a/third_party/boringssl/src/crypto/sha/sha512.cc b/third_party/boringssl/src/crypto/sha/sha512.cc new file mode 100644 index 00000000..afb5516c --- /dev/null +++ b/third_party/boringssl/src/crypto/sha/sha512.cc @@ -0,0 +1,106 @@ +// Copyright 2024 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include + +#include "../fipsmodule/bcm_interface.h" + + +using namespace bssl; + +int SHA384_Init(SHA512_CTX *sha) { + BCM_sha384_init(sha); + return 1; +} + +int SHA384_Update(SHA512_CTX *sha, const void *data, size_t len) { + BCM_sha384_update(sha, data, len); + return 1; +} + +int SHA384_Final(uint8_t out[SHA384_DIGEST_LENGTH], SHA512_CTX *sha) { + BCM_sha384_final(out, sha); + return 1; +} + +uint8_t *SHA384(const uint8_t *data, size_t len, + uint8_t out[SHA384_DIGEST_LENGTH]) { + SHA512_CTX ctx; + BCM_sha384_init(&ctx); + BCM_sha384_update(&ctx, data, len); + BCM_sha384_final(out, &ctx); + OPENSSL_cleanse(&ctx, sizeof(ctx)); + return out; +} + +int SHA512_256_Init(SHA512_CTX *sha) { + BCM_sha512_256_init(sha); + return 1; +} + +int SHA512_256_Update(SHA512_CTX *sha, const void *data, size_t len) { + BCM_sha512_256_update(sha, data, len); + return 1; +} + +int SHA512_256_Final(uint8_t out[SHA512_256_DIGEST_LENGTH], SHA512_CTX *sha) { + BCM_sha512_256_final(out, sha); + return 1; +} + +uint8_t *SHA512_256(const uint8_t *data, size_t len, + uint8_t out[SHA512_256_DIGEST_LENGTH]) { + SHA512_CTX ctx; + BCM_sha512_256_init(&ctx); + BCM_sha512_256_update(&ctx, data, len); + BCM_sha512_256_final(out, &ctx); + OPENSSL_cleanse(&ctx, sizeof(ctx)); + return out; +} + +int SHA512_Init(SHA512_CTX *sha) { + BCM_sha512_init(sha); + return 1; +} + +int SHA512_Update(SHA512_CTX *sha, const void *data, size_t len) { + BCM_sha512_update(sha, data, len); + return 1; +} + +int SHA512_Final(uint8_t out[SHA512_DIGEST_LENGTH], SHA512_CTX *sha) { + // Historically this function returned failure if passed NULL, even though + // other final functions do not. 
+ if (out == nullptr) { + return 0; + } + BCM_sha512_final(out, sha); + return 1; +} + +uint8_t *SHA512(const uint8_t *data, size_t len, + uint8_t out[SHA512_DIGEST_LENGTH]) { + SHA512_CTX ctx; + BCM_sha512_init(&ctx); + BCM_sha512_update(&ctx, data, len); + BCM_sha512_final(out, &ctx); + OPENSSL_cleanse(&ctx, sizeof(ctx)); + return out; +} + +void SHA512_Transform(SHA512_CTX *sha, const uint8_t block[SHA512_CBLOCK]) { + BCM_sha512_transform(sha, block); +} diff --git a/third_party/boringssl/src/crypto/siphash/siphash.c b/third_party/boringssl/src/crypto/siphash/siphash.c deleted file mode 100644 index 0921eac2..00000000 --- a/third_party/boringssl/src/crypto/siphash/siphash.c +++ /dev/null @@ -1,79 +0,0 @@ -/* Copyright (c) 2019, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ - -#include -#include - -#include - -#include "../internal.h" - - -static void siphash_round(uint64_t v[4]) { - v[0] += v[1]; - v[2] += v[3]; - v[1] = CRYPTO_rotl_u64(v[1], 13); - v[3] = CRYPTO_rotl_u64(v[3], 16); - v[1] ^= v[0]; - v[3] ^= v[2]; - v[0] = CRYPTO_rotl_u64(v[0], 32); - v[2] += v[1]; - v[0] += v[3]; - v[1] = CRYPTO_rotl_u64(v[1], 17); - v[3] = CRYPTO_rotl_u64(v[3], 21); - v[1] ^= v[2]; - v[3] ^= v[0]; - v[2] = CRYPTO_rotl_u64(v[2], 32); -} - -uint64_t SIPHASH_24(const uint64_t key[2], const uint8_t *input, - size_t input_len) { - const size_t orig_input_len = input_len; - - uint64_t v[4]; - v[0] = key[0] ^ UINT64_C(0x736f6d6570736575); - v[1] = key[1] ^ UINT64_C(0x646f72616e646f6d); - v[2] = key[0] ^ UINT64_C(0x6c7967656e657261); - v[3] = key[1] ^ UINT64_C(0x7465646279746573); - - while (input_len >= sizeof(uint64_t)) { - uint64_t m = CRYPTO_load_u64_le(input); - v[3] ^= m; - siphash_round(v); - siphash_round(v); - v[0] ^= m; - - input += sizeof(uint64_t); - input_len -= sizeof(uint64_t); - } - - uint8_t last_block[8]; - OPENSSL_memset(last_block, 0, sizeof(last_block)); - OPENSSL_memcpy(last_block, input, input_len); - last_block[7] = orig_input_len & 0xff; - - uint64_t last_block_word = CRYPTO_load_u64_le(last_block); - v[3] ^= last_block_word; - siphash_round(v); - siphash_round(v); - v[0] ^= last_block_word; - - v[2] ^= 0xff; - siphash_round(v); - siphash_round(v); - siphash_round(v); - siphash_round(v); - - return v[0] ^ v[1] ^ v[2] ^ v[3]; -} diff --git a/third_party/boringssl/src/crypto/siphash/siphash.cc b/third_party/boringssl/src/crypto/siphash/siphash.cc new file mode 100644 index 00000000..551f0375 --- /dev/null +++ b/third_party/boringssl/src/crypto/siphash/siphash.cc @@ -0,0 +1,81 @@ +// Copyright 2019 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include + +#include "../internal.h" + + +using namespace bssl; + +static void siphash_round(uint64_t v[4]) { + v[0] += v[1]; + v[2] += v[3]; + v[1] = CRYPTO_rotl_u64(v[1], 13); + v[3] = CRYPTO_rotl_u64(v[3], 16); + v[1] ^= v[0]; + v[3] ^= v[2]; + v[0] = CRYPTO_rotl_u64(v[0], 32); + v[2] += v[1]; + v[0] += v[3]; + v[1] = CRYPTO_rotl_u64(v[1], 17); + v[3] = CRYPTO_rotl_u64(v[3], 21); + v[1] ^= v[2]; + v[3] ^= v[0]; + v[2] = CRYPTO_rotl_u64(v[2], 32); +} + +uint64_t SIPHASH_24(const uint64_t key[2], const uint8_t *input, + size_t input_len) { + const size_t orig_input_len = input_len; + + uint64_t v[4]; + v[0] = key[0] ^ UINT64_C(0x736f6d6570736575); + v[1] = key[1] ^ UINT64_C(0x646f72616e646f6d); + v[2] = key[0] ^ UINT64_C(0x6c7967656e657261); + v[3] = key[1] ^ UINT64_C(0x7465646279746573); + + while (input_len >= sizeof(uint64_t)) { + uint64_t m = CRYPTO_load_u64_le(input); + v[3] ^= m; + siphash_round(v); + siphash_round(v); + v[0] ^= m; + + input += sizeof(uint64_t); + input_len -= sizeof(uint64_t); + } + + uint8_t last_block[8]; + OPENSSL_memset(last_block, 0, sizeof(last_block)); + OPENSSL_memcpy(last_block, input, input_len); + last_block[7] = orig_input_len & 0xff; + + uint64_t last_block_word = CRYPTO_load_u64_le(last_block); + v[3] ^= last_block_word; + siphash_round(v); + siphash_round(v); + v[0] ^= last_block_word; + + v[2] ^= 0xff; + siphash_round(v); + siphash_round(v); + siphash_round(v); + siphash_round(v); + + return v[0] ^ v[1] ^ v[2] ^ v[3]; +} diff --git a/third_party/boringssl/src/crypto/slhdsa/slhdsa.cc 
b/third_party/boringssl/src/crypto/slhdsa/slhdsa.cc new file mode 100644 index 00000000..ff470735 --- /dev/null +++ b/third_party/boringssl/src/crypto/slhdsa/slhdsa.cc @@ -0,0 +1,150 @@ +// Copyright 2024 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include "../fipsmodule/bcm_interface.h" + + +using namespace bssl; + +static_assert(SLHDSA_SHA2_128S_PUBLIC_KEY_BYTES == + BCM_SLHDSA_SHA2_128S_PUBLIC_KEY_BYTES); +static_assert(SLHDSA_SHA2_128S_PRIVATE_KEY_BYTES == + BCM_SLHDSA_SHA2_128S_PRIVATE_KEY_BYTES); +static_assert(SLHDSA_SHA2_128S_SIGNATURE_BYTES == + BCM_SLHDSA_SHA2_128S_SIGNATURE_BYTES); +static_assert(SLHDSA_SHAKE_256F_PUBLIC_KEY_BYTES == + BCM_SLHDSA_SHAKE_256F_PUBLIC_KEY_BYTES); +static_assert(SLHDSA_SHAKE_256F_PRIVATE_KEY_BYTES == + BCM_SLHDSA_SHAKE_256F_PRIVATE_KEY_BYTES); +static_assert(SLHDSA_SHAKE_256F_SIGNATURE_BYTES == + BCM_SLHDSA_SHAKE_256F_SIGNATURE_BYTES); + +void SLHDSA_SHA2_128S_generate_key( + uint8_t out_public_key[SLHDSA_SHA2_128S_PUBLIC_KEY_BYTES], + uint8_t out_private_key[SLHDSA_SHA2_128S_PRIVATE_KEY_BYTES]) { + BCM_slhdsa_sha2_128s_generate_key(out_public_key, out_private_key); +} + +void SLHDSA_SHAKE_256F_generate_key( + uint8_t out_public_key[SLHDSA_SHAKE_256F_PUBLIC_KEY_BYTES], + uint8_t out_private_key[SLHDSA_SHAKE_256F_PRIVATE_KEY_BYTES]) { + BCM_slhdsa_shake_256f_generate_key(out_public_key, out_private_key); +} + +void SLHDSA_SHA2_128S_public_from_private( + uint8_t 
out_public_key[SLHDSA_SHA2_128S_PUBLIC_KEY_BYTES], + const uint8_t private_key[SLHDSA_SHA2_128S_PRIVATE_KEY_BYTES]) { + BCM_slhdsa_sha2_128s_public_from_private(out_public_key, private_key); +} + +void SLHDSA_SHAKE_256F_public_from_private( + uint8_t out_public_key[SLHDSA_SHAKE_256F_PUBLIC_KEY_BYTES], + const uint8_t private_key[SLHDSA_SHAKE_256F_PRIVATE_KEY_BYTES]) { + BCM_slhdsa_shake_256f_public_from_private(out_public_key, private_key); +} + +int SLHDSA_SHA2_128S_sign( + uint8_t out_signature[SLHDSA_SHA2_128S_SIGNATURE_BYTES], + const uint8_t private_key[SLHDSA_SHA2_128S_PRIVATE_KEY_BYTES], + const uint8_t *msg, size_t msg_len, const uint8_t *context, + size_t context_len) { + return bcm_success(BCM_slhdsa_sha2_128s_sign(out_signature, private_key, msg, + msg_len, context, context_len)); +} + +int SLHDSA_SHAKE_256F_sign( + uint8_t out_signature[SLHDSA_SHAKE_256F_SIGNATURE_BYTES], + const uint8_t private_key[SLHDSA_SHAKE_256F_PRIVATE_KEY_BYTES], + const uint8_t *msg, size_t msg_len, const uint8_t *context, + size_t context_len) { + return bcm_success(BCM_slhdsa_shake_256f_sign(out_signature, private_key, msg, + msg_len, context, + context_len)); +} + +int SLHDSA_SHA2_128S_verify( + const uint8_t *signature, size_t signature_len, + const uint8_t public_key[SLHDSA_SHA2_128S_PUBLIC_KEY_BYTES], + const uint8_t *msg, size_t msg_len, const uint8_t *context, + size_t context_len) { + return bcm_success(BCM_slhdsa_sha2_128s_verify(signature, signature_len, + public_key, msg, msg_len, + context, context_len)); +} + +int SLHDSA_SHAKE_256F_verify( + const uint8_t *signature, size_t signature_len, + const uint8_t public_key[SLHDSA_SHAKE_256F_PUBLIC_KEY_BYTES], + const uint8_t *msg, size_t msg_len, const uint8_t *context, + size_t context_len) { + return bcm_success(BCM_slhdsa_shake_256f_verify(signature, signature_len, + public_key, msg, msg_len, + context, context_len)); +} + +int SLHDSA_SHA2_128S_prehash_sign( + uint8_t out_signature[SLHDSA_SHA2_128S_SIGNATURE_BYTES], + 
const uint8_t private_key[SLHDSA_SHA2_128S_PRIVATE_KEY_BYTES], + const uint8_t *hashed_msg, size_t hashed_msg_len, int hash_nid, + const uint8_t *context, size_t context_len) { + if (hash_nid != NID_sha256) { + return 0; + } + return bcm_success(BCM_slhdsa_sha2_128s_prehash_sign( + out_signature, private_key, hashed_msg, hashed_msg_len, hash_nid, context, + context_len)); +} + +int SLHDSA_SHA2_128S_prehash_verify( + const uint8_t *signature, size_t signature_len, + const uint8_t public_key[SLHDSA_SHA2_128S_PUBLIC_KEY_BYTES], + const uint8_t *hashed_msg, size_t hashed_msg_len, int hash_nid, + const uint8_t *context, size_t context_len) { + if (hash_nid != NID_sha256) { + return 0; + } + return bcm_success(BCM_slhdsa_sha2_128s_prehash_verify( + signature, signature_len, public_key, hashed_msg, hashed_msg_len, + hash_nid, context, context_len)); +} + +int SLHDSA_SHA2_128S_prehash_warning_nonstandard_sign( + uint8_t out_signature[SLHDSA_SHA2_128S_SIGNATURE_BYTES], + const uint8_t private_key[SLHDSA_SHA2_128S_PRIVATE_KEY_BYTES], + const uint8_t *hashed_msg, size_t hashed_msg_len, int hash_nid, + const uint8_t *context, size_t context_len) { + if (hash_nid != NID_sha384) { + return 0; + } + return bcm_success(BCM_slhdsa_sha2_128s_prehash_sign( + out_signature, private_key, hashed_msg, hashed_msg_len, hash_nid, context, + context_len)); +} + +int SLHDSA_SHA2_128S_prehash_warning_nonstandard_verify( + const uint8_t *signature, size_t signature_len, + const uint8_t public_key[SLHDSA_SHA2_128S_PUBLIC_KEY_BYTES], + const uint8_t *hashed_msg, size_t hashed_msg_len, int hash_nid, + const uint8_t *context, size_t context_len) { + if (hash_nid != NID_sha384) { + return 0; + } + return bcm_success(BCM_slhdsa_sha2_128s_prehash_verify( + signature, signature_len, public_key, hashed_msg, hashed_msg_len, + hash_nid, context, context_len)); +} diff --git a/third_party/boringssl/src/crypto/spake2plus/internal.h b/third_party/boringssl/src/crypto/spake2plus/internal.h new file mode 
100644 index 00000000..7344c57e --- /dev/null +++ b/third_party/boringssl/src/crypto/spake2plus/internal.h @@ -0,0 +1,204 @@ +// Copyright 2024 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_CRYPTO_SPAKE2PLUS_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_SPAKE2PLUS_INTERNAL_H + +#include + +#include + +#include +#include + +#include "../fipsmodule/ec/internal.h" + + +BSSL_NAMESPACE_BEGIN + +// SPAKE2+. +// +// SPAKE2+ is an augmented password-authenticated key-exchange. It allows +// two parties, a prover and verifier, to derive a strong shared key with no +// risk of disclosing the password, known only to the prover, to the verifier. +// (But note that the verifier can still attempt an offline, brute-force attack +// to recover the password.) +// +// This is an implementation of SPAKE2+ using P-256 as the group, SHA-256 as +// the hash function, HKDF-SHA256 as the key derivation function, and +// HMAC-SHA256 as the message authentication code. +// +// See https://www.rfc-editor.org/rfc/rfc9383.html + +namespace spake2plus { + +// kShareSize is the size of a SPAKE2+ key share. +constexpr size_t kShareSize = 65; + +// kConfirmSize is the size of a SPAKE2+ key confirmation message. +constexpr size_t kConfirmSize = 32; + +// kVerifierSize is the size of the w0 and w1 values in the SPAKE2+ protocol. 
+constexpr size_t kVerifierSize = 32; + +// kRegistrationRecordSize is the number of bytes in a registration record, +// which is provided to the verifier. +constexpr size_t kRegistrationRecordSize = 65; + +// kSecretSize is the number of bytes of shared secret that the SPAKE2+ protocol +// generates. +constexpr size_t kSecretSize = 32; + +// Register computes the values needed in the offline registration +// step of the SPAKE2+ protocol. See the following for more details: +// https://www.rfc-editor.org/rfc/rfc9383.html#section-3.2 +// +// The |password| argument is the mandatory prover password. The |out_w0|, +// |out_w1|, and |out_registration_record| arguments are where the password +// verifiers (w0 and w1) and registration record (L) are stored, respectively. +// The prover is given |out_w0| and |out_w1| while the verifier is given +// |out_w0| and |out_registration_record|. +// +// To ensure success, |out_w0| and |out_w1| must be of length |kVerifierSize|, +// and |out_registration_record| of size |kRegistrationRecordSize|. +[[nodiscard]] OPENSSL_EXPORT bool Register( + Span out_w0, Span out_w1, + Span out_registration_record, Span password, + Span id_prover, Span id_verifier); + +class OPENSSL_EXPORT Prover { + public: + static constexpr bool kAllowUniquePtr = true; + + Prover(); + ~Prover(); + + // Init creates a new prover, which can only be used for a single execution of + // the protocol. + // + // The |context| argument is an application-specific value meant to constrain + // the protocol execution. The |w0| and |w1| arguments are password verifier + // values computed during the offline registration phase of the protocol. The + // |id_prover| and |id_verifier| arguments allow optional, opaque names to be + // bound into the protocol. 
See the following for more information about how + // these identities may be chosen: + // https://www.rfc-editor.org/rfc/rfc9383.html#name-definition-of-spake2 + [[nodiscard]] bool Init(Span context, + Span id_prover, + Span id_verifier, + Span w0, Span w1, + Span x = Span()); + + // GenerateShare computes a SPAKE2+ share and writes it to |out_share|. + // + // This function can only be called once for a given |Prover|. To ensure + // success, |out_share| must be |kShareSize| bytes. + [[nodiscard]] bool GenerateShare(Span out_share); + + // ComputeConfirmation computes a SPAKE2+ key confirmation + // message and writes it to |out_confirm|. It also computes the shared secret + // and writes it to |out_secret|. + // + // This function can only be called once for a given |Prover|. + // + // To ensure success, |out_confirm| must be |kConfirmSize| bytes + // and |out_secret| must be |kSecretSize| bytes. + [[nodiscard]] bool ComputeConfirmation(Span out_confirm, + Span out_secret, + Span peer_share, + Span peer_confirm); + + private: + enum class State { + kInit, + kShareGenerated, + kConfirmGenerated, + kDone, + }; + + State state_ = State::kInit; + SHA256_CTX transcript_hash_; + EC_SCALAR w0_; + EC_SCALAR w1_; + EC_SCALAR x_; + EC_AFFINE X_; + uint8_t share_[kShareSize]; +}; + +class OPENSSL_EXPORT Verifier { + public: + static constexpr bool kAllowUniquePtr = true; + + Verifier(); + ~Verifier(); + + // Init creates a new verifier, which can only be used for a single execution + // of the protocol. + // + // The |context| argument is an application-specific value meant to constrain + // the protocol execution. The |w0| and |registration_record| arguments are + // required, and are computed by the prover via |Register|. Only the prover + // can produce |w0| and |registration_record|, as they require + // knowledge of the password. The prover must securely transmit this to the + // verifier out-of-band. 
The |id_prover| and |id_verifier| arguments allow + // optional, opaque names to be bound into the protocol. See the following for + // more information about how these identities may be chosen: + // https://www.rfc-editor.org/rfc/rfc9383.html#name-definition-of-spake2 + [[nodiscard]] bool Init(Span context, + Span id_prover, + Span id_verifier, + Span w0, + Span registration_record, + Span y = Span()); + + // ProcessProverShare computes a SPAKE2+ share from an input share, + // |prover_share|, and writes it to |out_share|. It also computes the key + // confirmation message and writes it to |out_confirm|. Finally, it computes + // the shared secret and writes it to |out_secret|. + // + // This function can only be called once for a given |Verifier|. + // + // To ensure success, |out_share| must be |kShareSize| bytes, |out_confirm| + // must be |kConfirmSize| bytes, and |out_secret| must be |kSecretSize| bytes. + [[nodiscard]] bool ProcessProverShare(Span out_share, + Span out_confirm, + Span out_secret, + Span prover_share); + + // VerifyProverConfirmation verifies a SPAKE2+ key confirmation message, + // |prover_confirm|. + // + // This function can only be called once for a given |Verifier|. 
+ [[nodiscard]] bool VerifyProverConfirmation(Span peer_confirm); + + private: + enum class State { + kInit, + kProverShareSeen, + kDone, + }; + + State state_ = State::kInit; + SHA256_CTX transcript_hash_; + EC_SCALAR w0_; + EC_AFFINE L_; + EC_SCALAR y_; + uint8_t confirm_[kConfirmSize]; +}; + +} // namespace spake2plus + +BSSL_NAMESPACE_END + +#endif // OPENSSL_HEADER_CRYPTO_SPAKE2PLUS_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/spake2plus/spake2plus.cc b/third_party/boringssl/src/crypto/spake2plus/spake2plus.cc new file mode 100644 index 00000000..edf26294 --- /dev/null +++ b/third_party/boringssl/src/crypto/spake2plus/spake2plus.cc @@ -0,0 +1,501 @@ +// Copyright 2024 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../fipsmodule/bn/internal.h" +#include "../fipsmodule/ec/internal.h" +#include "../internal.h" +#include "./internal.h" + +BSSL_NAMESPACE_BEGIN +namespace spake2plus { +namespace { + +const uint8_t kDefaultAdditionalData[32] = {0}; + +// https://www.rfc-editor.org/rfc/rfc9383.html#appendix-B +// seed: 1.2.840.10045.3.1.7 point generation seed (M) +// M = +// 02886e2f97ace46e55ba9dd7242579f2993b64e16ef3dcab95afd497333d8fa12f +// +// `M` is interpreted as a X9.62-format compressed point. 
This is then the +// uncompressed form: +const uint8_t kM_bytes[] = { + 0x04, 0x88, 0x6e, 0x2f, 0x97, 0xac, 0xe4, 0x6e, 0x55, 0xba, 0x9d, + 0xd7, 0x24, 0x25, 0x79, 0xf2, 0x99, 0x3b, 0x64, 0xe1, 0x6e, 0xf3, + 0xdc, 0xab, 0x95, 0xaf, 0xd4, 0x97, 0x33, 0x3d, 0x8f, 0xa1, 0x2f, + 0x5f, 0xf3, 0x55, 0x16, 0x3e, 0x43, 0xce, 0x22, 0x4e, 0x0b, 0x0e, + 0x65, 0xff, 0x02, 0xac, 0x8e, 0x5c, 0x7b, 0xe0, 0x94, 0x19, 0xc7, + 0x85, 0xe0, 0xca, 0x54, 0x7d, 0x55, 0xa1, 0x2e, 0x2d, 0x20}; + +// https://www.rfc-editor.org/rfc/rfc9383.html#appendix-B +// seed: 1.2.840.10045.3.1.7 point generation seed (N) +// N = +// 03d8bbd6c639c62937b04d997f38c3770719c629d7014d49a24b4f98baa1292b49 +// +// `N` is interpreted as a X9.62-format compressed point. This is then the +// uncompressed form: +const uint8_t kN_bytes[] = { + 0x04, 0xd8, 0xbb, 0xd6, 0xc6, 0x39, 0xc6, 0x29, 0x37, 0xb0, 0x4d, + 0x99, 0x7f, 0x38, 0xc3, 0x77, 0x07, 0x19, 0xc6, 0x29, 0xd7, 0x01, + 0x4d, 0x49, 0xa2, 0x4b, 0x4f, 0x98, 0xba, 0xa1, 0x29, 0x2b, 0x49, + 0x07, 0xd6, 0x0a, 0xa6, 0xbf, 0xad, 0xe4, 0x50, 0x08, 0xa6, 0x36, + 0x33, 0x7f, 0x51, 0x68, 0xc6, 0x4d, 0x9b, 0xd3, 0x60, 0x34, 0x80, + 0x8c, 0xd5, 0x64, 0x49, 0x0b, 0x1e, 0x65, 0x6e, 0xdb, 0xe7}; + +void UpdateWithLengthPrefix(SHA256_CTX *sha, Span data) { + uint8_t len_le[8]; + CRYPTO_store_u64_le(len_le, data.size()); + SHA256_Update(sha, len_le, sizeof(len_le)); + SHA256_Update(sha, data.data(), data.size()); +} + +void ConstantToJacobian(const EC_GROUP *group, EC_JACOBIAN *out, + bssl::Span in) { + EC_AFFINE point; + BSSL_CHECK(ec_point_from_uncompressed(group, &point, in.data(), in.size())); + ec_affine_to_jacobian(group, out, &point); +} + +void ScalarToSizedBuffer(const EC_GROUP *group, const EC_SCALAR *s, + Span out_buf) { + size_t out_bytes; + ec_scalar_to_bytes(group, out_buf.data(), &out_bytes, s); + BSSL_CHECK(out_bytes == out_buf.size()); +} + +bool AddLengthPrefixed(CBB *cbb, Span bytes) { + return CBB_add_u64le(cbb, bytes.size()) && + CBB_add_bytes(cbb, 
bytes.data(), bytes.size()); +} + +void InitTranscriptHash(SHA256_CTX *sha, Span context, + Span id_prover, + Span id_verifier) { + SHA256_Init(sha); + UpdateWithLengthPrefix(sha, context); + UpdateWithLengthPrefix(sha, id_prover); + UpdateWithLengthPrefix(sha, id_verifier); + UpdateWithLengthPrefix(sha, kM_bytes); + UpdateWithLengthPrefix(sha, kN_bytes); +} + +bool ComputeTranscript(uint8_t out_prover_confirm[kConfirmSize], + uint8_t out_verifier_confirm[kConfirmSize], + uint8_t out_secret[kSecretSize], + const uint8_t prover_share[kShareSize], + const uint8_t verifier_share[kShareSize], + SHA256_CTX *sha, const EC_AFFINE *Z, const EC_AFFINE *V, + const EC_SCALAR *w0) { + const EC_GROUP *group = EC_group_p256(); + + uint8_t Z_enc[kShareSize]; + size_t Z_enc_len = ec_point_to_bytes(group, Z, POINT_CONVERSION_UNCOMPRESSED, + Z_enc, sizeof(Z_enc)); + BSSL_CHECK(Z_enc_len == sizeof(Z_enc)); + + uint8_t V_enc[kShareSize]; + size_t V_enc_len = ec_point_to_bytes(group, V, POINT_CONVERSION_UNCOMPRESSED, + V_enc, sizeof(V_enc)); + BSSL_CHECK(V_enc_len == sizeof(V_enc)); + + uint8_t w0_enc[kVerifierSize]; + ScalarToSizedBuffer(group, w0, w0_enc); + + uint8_t K_main[SHA256_DIGEST_LENGTH]; + UpdateWithLengthPrefix(sha, Span(prover_share, kShareSize)); + UpdateWithLengthPrefix(sha, Span(verifier_share, kShareSize)); + UpdateWithLengthPrefix(sha, Z_enc); + UpdateWithLengthPrefix(sha, V_enc); + UpdateWithLengthPrefix(sha, w0_enc); + SHA256_Final(K_main, sha); + + auto confirmation_str = StringAsBytes("ConfirmationKeys"); + uint8_t keys[kSecretSize * 2]; + if (!HKDF(keys, sizeof(keys), EVP_sha256(), K_main, sizeof(K_main), nullptr, + 0, confirmation_str.data(), confirmation_str.size())) { + return false; + } + + auto secret_info_str = StringAsBytes("SharedKey"); + if (!HKDF(out_secret, kSecretSize, EVP_sha256(), K_main, sizeof(K_main), + nullptr, 0, secret_info_str.data(), secret_info_str.size())) { + return false; + } + + unsigned prover_confirm_len; + if (HMAC(EVP_sha256(), 
keys, kSecretSize, verifier_share, kShareSize, + out_prover_confirm, &prover_confirm_len) == nullptr) { + return false; + } + BSSL_CHECK(prover_confirm_len == kConfirmSize); + + unsigned verifier_confirm_len; + if (HMAC(EVP_sha256(), keys + kSecretSize, kSecretSize, prover_share, + kShareSize, out_verifier_confirm, + &verifier_confirm_len) == nullptr) { + return false; + } + BSSL_CHECK(verifier_confirm_len == kConfirmSize); + + return true; +} + +} // namespace + +bool Register(Span out_w0, Span out_w1, + Span out_registration_record, + Span password, Span id_prover, + Span id_verifier) { + if (out_w0.size() != kVerifierSize || out_w1.size() != kVerifierSize || + out_registration_record.size() != kRegistrationRecordSize) { + OPENSSL_PUT_ERROR(CRYPTO, ERR_R_INTERNAL_ERROR); + return false; + } + + // Offline registration format from: + // https://www.rfc-editor.org/rfc/rfc9383.html#section-3.2 + ScopedCBB mhf_input; + if (!CBB_init(mhf_input.get(), password.size() + id_prover.size() + + id_verifier.size() + + 3 * sizeof(uint64_t)) || // + !AddLengthPrefixed(mhf_input.get(), password) || + !AddLengthPrefixed(mhf_input.get(), id_prover) || + !AddLengthPrefixed(mhf_input.get(), id_verifier) || + !CBB_flush(mhf_input.get())) { + OPENSSL_PUT_ERROR(CRYPTO, ERR_R_INTERNAL_ERROR); + return false; + } + + // https://neuromancer.sk/std/nist/P-256 + // sage: p = + // 0xffffffff00000001000000000000000000000000ffffffffffffffffffffffff + // ....: K = GF(p) + // ....: a = + // K(0xffffffff00000001000000000000000000000000fffffffffffffffffffffffc) + // ....: b = + // K(0x5ac635d8aa3a93e7b3ebbd55769886bc651d06b0cc53b0f63bce3c3e27d2604b) + // ....: E = EllipticCurve(K, (a, b)) + // ....: G = + // E(0x6b17d1f2e12c4247f8bce6e563a440f277037d812deb33a0f4a13945d898c296, + // ....: 0x4fe342e2fe1a7f9b8ee7eb4a7c0f9e162bce33576b315ececbb6406837bf51f5) + // ....: + // E.set_order(0xffffffff00000000ffffffffffffffffbce6faada7179e84f3b9cac2fc63 + // ....: 2551 * 0x1) + // sage: k = 64 + // sage: L 
= (2 * (ceil(log(p)/log(2)) + k)) / 8 + + // RFC 9383 Section 3.2 + constexpr size_t kKDFOutputSize = 80; + constexpr size_t kKDFOutputWords = kKDFOutputSize / BN_BYTES; + + uint8_t key[kKDFOutputSize]; + if (!EVP_PBE_scrypt((const char *)CBB_data(mhf_input.get()), + CBB_len(mhf_input.get()), nullptr, 0, + /*N=*/32768, /*r=*/8, /*p=*/1, + /*max_mem=*/1024 * 1024 * 33, key, kKDFOutputSize)) { + OPENSSL_PUT_ERROR(CRYPTO, ERR_R_INTERNAL_ERROR); + return false; + } + + const EC_GROUP *group = EC_group_p256(); + BN_ULONG w0_words[kKDFOutputWords / 2]; + bn_big_endian_to_words(w0_words, kKDFOutputWords / 2, key, + kKDFOutputSize / 2); + EC_SCALAR w0; + ec_scalar_reduce(group, &w0, w0_words, kKDFOutputWords / 2); + ScalarToSizedBuffer(group, &w0, out_w0); + + BN_ULONG w1_words[kKDFOutputWords / 2]; + bn_big_endian_to_words(w1_words, kKDFOutputWords / 2, + key + kKDFOutputSize / 2, kKDFOutputSize / 2); + EC_SCALAR w1; + ec_scalar_reduce(group, &w1, w1_words, kKDFOutputWords / 2); + ScalarToSizedBuffer(group, &w1, out_w1); + + EC_JACOBIAN L_j; + EC_AFFINE L; + if (!ec_point_mul_scalar_base(group, &L_j, &w1) || // + !ec_jacobian_to_affine(group, &L, &L_j) || // + !ec_point_to_bytes(group, &L, POINT_CONVERSION_UNCOMPRESSED, + out_registration_record.data(), + kRegistrationRecordSize)) { + OPENSSL_PUT_ERROR(CRYPTO, ERR_R_INTERNAL_ERROR); + return false; + } + + return true; +} + +Prover::Prover() = default; +Prover::~Prover() = default; + +bool Prover::Init(Span context, Span id_prover, + Span id_verifier, Span w0, + Span w1, Span x) { + const EC_GROUP *group = EC_group_p256(); + + if (!ec_scalar_from_bytes(group, &w0_, w0.data(), w0.size()) || + !ec_scalar_from_bytes(group, &w1_, w1.data(), w1.size()) || + (!x.empty() && + !ec_scalar_from_bytes(group, &x_, x.data(), x.size())) || // + (x.empty() && !ec_random_scalar(group, &x_, kDefaultAdditionalData))) { + OPENSSL_PUT_ERROR(CRYPTO, ERR_R_INTERNAL_ERROR); + return false; + } + + InitTranscriptHash(&transcript_hash_, context, 
id_prover, id_verifier); + + return true; +} + +bool Prover::GenerateShare(Span out_share) { + if (state_ != State::kInit || out_share.size() != kShareSize) { + OPENSSL_PUT_ERROR(CRYPTO, ERR_R_INTERNAL_ERROR); + return false; + } + + // Compute X = x×P + w0×M. + // TODO(crbug.com/383778231): This could be sped up with a constant-time, + // two-point multiplication. + const EC_GROUP *group = EC_group_p256(); + EC_JACOBIAN l; + if (!ec_point_mul_scalar_base(group, &l, &x_)) { + OPENSSL_PUT_ERROR(CRYPTO, ERR_R_INTERNAL_ERROR); + return false; + } + + EC_JACOBIAN M_j; + ConstantToJacobian(group, &M_j, kM_bytes); + + EC_JACOBIAN r; + if (!ec_point_mul_scalar(group, &r, &M_j, &w0_)) { + OPENSSL_PUT_ERROR(CRYPTO, ERR_R_INTERNAL_ERROR); + return false; + } + + EC_JACOBIAN X_j; + group->meth->add(group, &X_j, &l, &r); + if (!ec_jacobian_to_affine(group, &X_, &X_j)) { + OPENSSL_PUT_ERROR(CRYPTO, ERR_R_INTERNAL_ERROR); + return false; + } + + size_t written = ec_point_to_bytes(group, &X_, POINT_CONVERSION_UNCOMPRESSED, + out_share.data(), kShareSize); + BSSL_CHECK(written == kShareSize); + + memcpy(share_, out_share.data(), kShareSize); + state_ = State::kShareGenerated; + return true; +} + +bool Prover::ComputeConfirmation(Span out_confirm, + Span out_secret, + Span peer_share, + Span peer_confirm) { + if (state_ != State::kShareGenerated || out_confirm.size() != kConfirmSize || + out_secret.size() != kSecretSize || peer_share.size() != kShareSize || + peer_confirm.size() != kConfirmSize) { + OPENSSL_PUT_ERROR(CRYPTO, ERR_R_INTERNAL_ERROR); + return false; + } + + const EC_GROUP *group = EC_group_p256(); + EC_AFFINE Y; + if (!ec_point_from_uncompressed(group, &Y, peer_share.data(), + peer_share.size())) { + OPENSSL_PUT_ERROR(CRYPTO, ERR_R_INTERNAL_ERROR); + return false; + } + + EC_JACOBIAN N_j; + ConstantToJacobian(group, &N_j, kN_bytes); + + EC_JACOBIAN r; + if (!ec_point_mul_scalar(group, &r, &N_j, &w0_)) { + OPENSSL_PUT_ERROR(CRYPTO, ERR_R_INTERNAL_ERROR); + return 
false; + } + + ec_felem_neg(group, &r.Y, &r.Y); + + EC_JACOBIAN Y_j; + ec_affine_to_jacobian(group, &Y_j, &Y); + + EC_JACOBIAN t; + group->meth->add(group, &t, &Y_j, &r); + + EC_JACOBIAN tmp; + EC_AFFINE Z, V; + // TODO(crbug.com/383778231): The two affine conversions could be batched + // together. + if (!ec_point_mul_scalar(group, &tmp, &t, &x_) || // + !ec_jacobian_to_affine(group, &Z, &tmp) || // + !ec_point_mul_scalar(group, &tmp, &t, &w1_) || // + !ec_jacobian_to_affine(group, &V, &tmp)) { + return 0; + } + + uint8_t verifier_confirm[kConfirmSize]; + if (!ComputeTranscript(out_confirm.data(), verifier_confirm, + out_secret.data(), share_, peer_share.data(), + &transcript_hash_, &Z, &V, &w0_) || + CRYPTO_memcmp(verifier_confirm, peer_confirm.data(), + sizeof(verifier_confirm)) != 0) { + return 0; + } + + state_ = State::kDone; + return true; +} + +Verifier::Verifier() = default; +Verifier::~Verifier() = default; + +bool Verifier::Init(Span context, Span id_prover, + Span id_verifier, Span w0, + Span registration_record, + Span y) { + const EC_GROUP *group = EC_group_p256(); + + if (!ec_scalar_from_bytes(group, &w0_, w0.data(), w0.size()) || + !ec_point_from_uncompressed(group, &L_, registration_record.data(), + registration_record.size()) || // + (!y.empty() && + !ec_scalar_from_bytes(group, &y_, y.data(), y.size())) || // + (y.empty() && !ec_random_scalar(group, &y_, kDefaultAdditionalData))) { + OPENSSL_PUT_ERROR(CRYPTO, ERR_R_INTERNAL_ERROR); + return false; + } + + InitTranscriptHash(&transcript_hash_, context, id_prover, id_verifier); + + return true; +} + + +bool Verifier::ProcessProverShare(Span out_share, + Span out_confirm, + Span out_secret, + Span prover_share) { + if (state_ != State::kInit || // + out_share.size() != kShareSize || out_confirm.size() != kConfirmSize || + out_secret.size() != kSecretSize || prover_share.size() != kShareSize) { + OPENSSL_PUT_ERROR(CRYPTO, ERR_R_INTERNAL_ERROR); + return false; + } + + const EC_GROUP *group = 
EC_group_p256(); + EC_JACOBIAN l, r, M_j, N_j; + ConstantToJacobian(group, &M_j, kM_bytes); + ConstantToJacobian(group, &N_j, kN_bytes); + + // Compute Y = y×P + w0×M. + // TODO(crbug.com/383778231): This could be sped up with a constant-time, + // two-point multiplication. + if (!ec_point_mul_scalar_base(group, &l, &y_) || + !ec_point_mul_scalar(group, &r, &N_j, &w0_)) { + OPENSSL_PUT_ERROR(CRYPTO, ERR_R_INTERNAL_ERROR); + return false; + } + + EC_JACOBIAN Y_j; + EC_AFFINE Y; + group->meth->add(group, &Y_j, &l, &r); + if (!ec_jacobian_to_affine(group, &Y, &Y_j)) { + OPENSSL_PUT_ERROR(CRYPTO, ERR_R_INTERNAL_ERROR); + return false; + } + + const size_t written = ec_point_to_bytes( + group, &Y, POINT_CONVERSION_UNCOMPRESSED, out_share.data(), kShareSize); + BSSL_CHECK(written == kShareSize); + + EC_JACOBIAN r2; + EC_AFFINE X; + if (!ec_point_from_uncompressed(group, &X, prover_share.data(), + prover_share.size()) || + !ec_point_mul_scalar(group, &r2, &M_j, &w0_)) { + OPENSSL_PUT_ERROR(CRYPTO, ERR_R_INTERNAL_ERROR); + return false; + } + + ec_felem_neg(group, &r2.Y, &r2.Y); + + EC_JACOBIAN X_j, T; + ec_affine_to_jacobian(group, &X_j, &X); + group->meth->add(group, &T, &X_j, &r2); + + // TODO(crbug.com/383778231): The two affine conversions could be batched + // together. 
+ EC_JACOBIAN tmp; + EC_AFFINE Z; + if (!ec_point_mul_scalar(group, &tmp, &T, &y_) || // + !ec_jacobian_to_affine(group, &Z, &tmp)) { + OPENSSL_PUT_ERROR(CRYPTO, ERR_R_INTERNAL_ERROR); + return false; + } + + EC_JACOBIAN L_j; + EC_AFFINE V; + ec_affine_to_jacobian(group, &L_j, &L_); + if (!ec_point_mul_scalar(group, &tmp, &L_j, &y_) || // + !ec_jacobian_to_affine(group, &V, &tmp)) { + OPENSSL_PUT_ERROR(CRYPTO, ERR_R_INTERNAL_ERROR); + return false; + } + + if (!ComputeTranscript(confirm_, out_confirm.data(), out_secret.data(), + prover_share.data(), out_share.data(), + &transcript_hash_, &Z, &V, &w0_)) { + OPENSSL_PUT_ERROR(CRYPTO, ERR_R_INTERNAL_ERROR); + return false; + } + + state_ = State::kProverShareSeen; + return true; +} + +bool Verifier::VerifyProverConfirmation(Span peer_confirm) { + if (state_ != State::kProverShareSeen || // + peer_confirm.size() != kConfirmSize || // + CRYPTO_memcmp(confirm_, peer_confirm.data(), sizeof(confirm_)) != 0) { + OPENSSL_PUT_ERROR(CRYPTO, ERR_R_INTERNAL_ERROR); + return false; + } + + state_ = State::kDone; + return true; +} + +} // namespace spake2plus + +BSSL_NAMESPACE_END diff --git a/third_party/boringssl/src/crypto/stack/stack.c b/third_party/boringssl/src/crypto/stack/stack.c deleted file mode 100644 index fe1b5134..00000000 --- a/third_party/boringssl/src/crypto/stack/stack.c +++ /dev/null @@ -1,442 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. 
The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include - -#include - -#include "../internal.h" - - -// kMinSize is the number of pointers that will be initially allocated in a new -// stack. -static const size_t kMinSize = 4; - -_STACK *sk_new(OPENSSL_sk_cmp_func comp) { - _STACK *ret = OPENSSL_malloc(sizeof(_STACK)); - if (ret == NULL) { - return NULL; - } - OPENSSL_memset(ret, 0, sizeof(_STACK)); - - ret->data = OPENSSL_malloc(sizeof(void *) * kMinSize); - if (ret->data == NULL) { - goto err; - } - - OPENSSL_memset(ret->data, 0, sizeof(void *) * kMinSize); - - ret->comp = comp; - ret->num_alloc = kMinSize; - - return ret; - -err: - OPENSSL_free(ret); - return NULL; -} - -_STACK *sk_new_null(void) { return sk_new(NULL); } - -size_t sk_num(const _STACK *sk) { - if (sk == NULL) { - return 0; - } - return sk->num; -} - -void sk_zero(_STACK *sk) { - if (sk == NULL || sk->num == 0) { - return; - } - OPENSSL_memset(sk->data, 0, sizeof(void*) * sk->num); - sk->num = 0; - sk->sorted = 0; -} - -void *sk_value(const _STACK *sk, size_t i) { - if (!sk || i >= sk->num) { - return NULL; - } - return sk->data[i]; -} - -void *sk_set(_STACK *sk, size_t i, void *value) { - if (!sk || i >= sk->num) { - return NULL; - } - return 
sk->data[i] = value; -} - -void sk_free(_STACK *sk) { - if (sk == NULL) { - return; - } - OPENSSL_free(sk->data); - OPENSSL_free(sk); -} - -void sk_pop_free_ex(_STACK *sk, OPENSSL_sk_call_free_func call_free_func, - OPENSSL_sk_free_func free_func) { - if (sk == NULL) { - return; - } - - for (size_t i = 0; i < sk->num; i++) { - if (sk->data[i] != NULL) { - call_free_func(free_func, sk->data[i]); - } - } - sk_free(sk); -} - -// Historically, |sk_pop_free| called the function as |OPENSSL_sk_free_func| -// directly. This is undefined in C. Some callers called |sk_pop_free| directly, -// so we must maintain a compatibility version for now. -static void call_free_func_legacy(OPENSSL_sk_free_func func, void *ptr) { - func(ptr); -} - -void sk_pop_free(_STACK *sk, OPENSSL_sk_free_func free_func) { - sk_pop_free_ex(sk, call_free_func_legacy, free_func); -} - -size_t sk_insert(_STACK *sk, void *p, size_t where) { - if (sk == NULL) { - return 0; - } - - if (sk->num_alloc <= sk->num + 1) { - // Attempt to double the size of the array. - size_t new_alloc = sk->num_alloc << 1; - size_t alloc_size = new_alloc * sizeof(void *); - void **data; - - // If the doubling overflowed, try to increment. - if (new_alloc < sk->num_alloc || alloc_size / sizeof(void *) != new_alloc) { - new_alloc = sk->num_alloc + 1; - alloc_size = new_alloc * sizeof(void *); - } - - // If the increment also overflowed, fail. 
- if (new_alloc < sk->num_alloc || alloc_size / sizeof(void *) != new_alloc) { - return 0; - } - - data = OPENSSL_realloc(sk->data, alloc_size); - if (data == NULL) { - return 0; - } - - sk->data = data; - sk->num_alloc = new_alloc; - } - - if (where >= sk->num) { - sk->data[sk->num] = p; - } else { - OPENSSL_memmove(&sk->data[where + 1], &sk->data[where], - sizeof(void *) * (sk->num - where)); - sk->data[where] = p; - } - - sk->num++; - sk->sorted = 0; - - return sk->num; -} - -void *sk_delete(_STACK *sk, size_t where) { - void *ret; - - if (!sk || where >= sk->num) { - return NULL; - } - - ret = sk->data[where]; - - if (where != sk->num - 1) { - OPENSSL_memmove(&sk->data[where], &sk->data[where + 1], - sizeof(void *) * (sk->num - where - 1)); - } - - sk->num--; - return ret; -} - -void *sk_delete_ptr(_STACK *sk, const void *p) { - if (sk == NULL) { - return NULL; - } - - for (size_t i = 0; i < sk->num; i++) { - if (sk->data[i] == p) { - return sk_delete(sk, i); - } - } - - return NULL; -} - -int sk_find(const _STACK *sk, size_t *out_index, const void *p, - OPENSSL_sk_call_cmp_func call_cmp_func) { - if (sk == NULL) { - return 0; - } - - if (sk->comp == NULL) { - // Use pointer equality when no comparison function has been set. - for (size_t i = 0; i < sk->num; i++) { - if (sk->data[i] == p) { - if (out_index) { - *out_index = i; - } - return 1; - } - } - return 0; - } - - if (p == NULL) { - return 0; - } - - if (!sk_is_sorted(sk)) { - for (size_t i = 0; i < sk->num; i++) { - const void *elem = sk->data[i]; - if (call_cmp_func(sk->comp, &p, &elem) == 0) { - if (out_index) { - *out_index = i; - } - return 1; - } - } - return 0; - } - - // The stack is sorted, so binary search to find the element. - // - // |lo| and |hi| maintain a half-open interval of where the answer may be. All - // indices such that |lo <= idx < hi| are candidates. - size_t lo = 0, hi = sk->num; - while (lo < hi) { - // Bias |mid| towards |lo|. See the |r == 0| case below. 
- size_t mid = lo + (hi - lo - 1) / 2; - assert(lo <= mid && mid < hi); - const void *elem = sk->data[mid]; - int r = call_cmp_func(sk->comp, &p, &elem); - if (r > 0) { - lo = mid + 1; // |mid| is too low. - } else if (r < 0) { - hi = mid; // |mid| is too high. - } else { - // |mid| matches. However, this function returns the earliest match, so we - // can only return if the range has size one. - if (hi - lo == 1) { - if (out_index != NULL) { - *out_index = mid; - } - return 1; - } - // The sample is biased towards |lo|. |mid| can only be |hi - 1| if - // |hi - lo| was one, so this makes forward progress. - assert(mid + 1 < hi); - hi = mid + 1; - } - } - - assert(lo == hi); - return 0; // Not found. -} - -void *sk_shift(_STACK *sk) { - if (sk == NULL) { - return NULL; - } - if (sk->num == 0) { - return NULL; - } - return sk_delete(sk, 0); -} - -size_t sk_push(_STACK *sk, void *p) { return (sk_insert(sk, p, sk->num)); } - -void *sk_pop(_STACK *sk) { - if (sk == NULL) { - return NULL; - } - if (sk->num == 0) { - return NULL; - } - return sk_delete(sk, sk->num - 1); -} - -_STACK *sk_dup(const _STACK *sk) { - if (sk == NULL) { - return NULL; - } - - _STACK *ret = OPENSSL_malloc(sizeof(_STACK)); - if (ret == NULL) { - return NULL; - } - OPENSSL_memset(ret, 0, sizeof(_STACK)); - - ret->data = OPENSSL_malloc(sizeof(void *) * sk->num_alloc); - if (ret->data == NULL) { - goto err; - } - - ret->num = sk->num; - OPENSSL_memcpy(ret->data, sk->data, sizeof(void *) * sk->num); - ret->sorted = sk->sorted; - ret->num_alloc = sk->num_alloc; - ret->comp = sk->comp; - return ret; - -err: - sk_free(ret); - return NULL; -} - -#if defined(_MSC_VER) -struct sort_compare_ctx { - OPENSSL_sk_call_cmp_func call_cmp_func; - OPENSSL_sk_cmp_func cmp_func; -}; - -static int sort_compare(void *ctx_v, const void *a, const void *b) { - struct sort_compare_ctx *ctx = ctx_v; - return ctx->call_cmp_func(ctx->cmp_func, a, b); -} -#endif - -void sk_sort(_STACK *sk, OPENSSL_sk_call_cmp_func 
call_cmp_func) { - if (sk == NULL || sk->comp == NULL || sk->sorted) { - return; - } - - if (sk->num >= 2) { -#if defined(_MSC_VER) - // MSVC's |qsort_s| is different from the C11 one. - // https://docs.microsoft.com/en-us/cpp/c-runtime-library/reference/qsort-s?view=msvc-170 - struct sort_compare_ctx ctx = {call_cmp_func, sk->comp}; - qsort_s(sk->data, sk->num, sizeof(void *), sort_compare, &ctx); -#else - // sk->comp is a function that takes pointers to pointers to elements, but - // qsort take a comparison function that just takes pointers to elements. - // However, since we're passing an array of pointers to qsort, we can just - // cast the comparison function and everything works. - // - // TODO(davidben): This is undefined behavior, but the call is in libc so, - // e.g., CFI does not notice. |qsort| is missing a void* parameter in its - // callback, while no one defines |qsort_r| or |qsort_s| consistently. See - // https://stackoverflow.com/a/39561369 - int (*comp_func)(const void *, const void *) = - (int (*)(const void *, const void *))(sk->comp); - qsort(sk->data, sk->num, sizeof(void *), comp_func); -#endif - } - sk->sorted = 1; -} - -int sk_is_sorted(const _STACK *sk) { - if (!sk) { - return 1; - } - return sk->sorted; -} - -OPENSSL_sk_cmp_func sk_set_cmp_func(_STACK *sk, OPENSSL_sk_cmp_func comp) { - OPENSSL_sk_cmp_func old = sk->comp; - - if (sk->comp != comp) { - sk->sorted = 0; - } - sk->comp = comp; - - return old; -} - -_STACK *sk_deep_copy(const _STACK *sk, OPENSSL_sk_call_copy_func call_copy_func, - OPENSSL_sk_copy_func copy_func, - OPENSSL_sk_call_free_func call_free_func, - OPENSSL_sk_free_func free_func) { - _STACK *ret = sk_dup(sk); - if (ret == NULL) { - return NULL; - } - - for (size_t i = 0; i < ret->num; i++) { - if (ret->data[i] == NULL) { - continue; - } - ret->data[i] = call_copy_func(copy_func, ret->data[i]); - if (ret->data[i] == NULL) { - for (size_t j = 0; j < i; j++) { - if (ret->data[j] != NULL) { - call_free_func(free_func, 
ret->data[j]); - } - } - sk_free(ret); - return NULL; - } - } - - return ret; -} diff --git a/third_party/boringssl/src/crypto/stack/stack.cc b/third_party/boringssl/src/crypto/stack/stack.cc new file mode 100644 index 00000000..89b4f954 --- /dev/null +++ b/third_party/boringssl/src/crypto/stack/stack.cc @@ -0,0 +1,455 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + +#include + +#include +#include + +#include "../internal.h" +#include "../mem_internal.h" + + +using namespace bssl; + +struct stack_st { + // num contains the number of valid pointers in |data|. + size_t num; + void **data; + // sorted is non-zero if the values pointed to by |data| are in ascending + // order, based on |comp|. + int sorted; + // num_alloc contains the number of pointers allocated in the buffer pointed + // to by |data|, which may be larger than |num|. + size_t num_alloc; + // comp is an optional comparison function. + OPENSSL_sk_cmp_func comp; +}; + +// kMinSize is the number of pointers that will be initially allocated in a new +// stack. 
+static const size_t kMinSize = 4; + +OPENSSL_STACK *OPENSSL_sk_new(OPENSSL_sk_cmp_func comp) { + OPENSSL_STACK *ret = New(); + if (ret == nullptr) { + return nullptr; + } + + ret->data = + reinterpret_cast(OPENSSL_calloc(kMinSize, sizeof(void *))); + if (ret->data == nullptr) { + goto err; + } + + ret->comp = comp; + ret->num_alloc = kMinSize; + + return ret; + +err: + Delete(ret); + return nullptr; +} + +OPENSSL_STACK *OPENSSL_sk_new_null() { return OPENSSL_sk_new(nullptr); } + +size_t OPENSSL_sk_num(const OPENSSL_STACK *sk) { + if (sk == nullptr) { + return 0; + } + return sk->num; +} + +void OPENSSL_sk_zero(OPENSSL_STACK *sk) { + if (sk == nullptr || sk->num == 0) { + return; + } + OPENSSL_memset(sk->data, 0, sizeof(void *) * sk->num); + sk->num = 0; + sk->sorted = 0; +} + +void *OPENSSL_sk_value(const OPENSSL_STACK *sk, size_t i) { + if (!sk || i >= sk->num) { + return nullptr; + } + return sk->data[i]; +} + +void *OPENSSL_sk_set(OPENSSL_STACK *sk, size_t i, void *value) { + if (!sk || i >= sk->num) { + return nullptr; + } + sk->sorted = 0; + return sk->data[i] = value; +} + +void OPENSSL_sk_free(OPENSSL_STACK *sk) { + if (sk == nullptr) { + return; + } + OPENSSL_free(sk->data); + Delete(sk); +} + +void OPENSSL_sk_pop_free_ex(OPENSSL_STACK *sk, + OPENSSL_sk_call_free_func call_free_func, + OPENSSL_sk_free_func free_func) { + if (sk == nullptr) { + return; + } + + for (size_t i = 0; i < sk->num; i++) { + if (sk->data[i] != nullptr) { + call_free_func(free_func, sk->data[i]); + } + } + OPENSSL_sk_free(sk); +} + +// Historically, |sk_pop_free| called the function as |OPENSSL_sk_free_func| +// directly. This is undefined in C. Some callers called |sk_pop_free| directly, +// so we must maintain a compatibility version for now. 
+static void call_free_func_legacy(OPENSSL_sk_free_func func, void *ptr) { + func(ptr); +} + +void sk_pop_free(OPENSSL_STACK *sk, OPENSSL_sk_free_func free_func) { + OPENSSL_sk_pop_free_ex(sk, call_free_func_legacy, free_func); +} + +size_t OPENSSL_sk_insert(OPENSSL_STACK *sk, void *p, size_t where) { + if (sk == nullptr) { + return 0; + } + + if (sk->num >= INT_MAX) { + OPENSSL_PUT_ERROR(CRYPTO, ERR_R_OVERFLOW); + return 0; + } + + if (sk->num_alloc <= sk->num + 1) { + // Attempt to double the size of the array. + size_t new_alloc = sk->num_alloc << 1; + size_t alloc_size = new_alloc * sizeof(void *); + void **data; + + // If the doubling overflowed, try to increment. + if (new_alloc < sk->num_alloc || alloc_size / sizeof(void *) != new_alloc) { + new_alloc = sk->num_alloc + 1; + alloc_size = new_alloc * sizeof(void *); + } + + // If the increment also overflowed, fail. + if (new_alloc < sk->num_alloc || alloc_size / sizeof(void *) != new_alloc) { + return 0; + } + + data = reinterpret_cast(OPENSSL_realloc(sk->data, alloc_size)); + if (data == nullptr) { + return 0; + } + + sk->data = data; + sk->num_alloc = new_alloc; + } + + if (where >= sk->num) { + sk->data[sk->num] = p; + } else { + OPENSSL_memmove(&sk->data[where + 1], &sk->data[where], + sizeof(void *) * (sk->num - where)); + sk->data[where] = p; + } + + sk->num++; + sk->sorted = 0; + + return sk->num; +} + +void *OPENSSL_sk_delete(OPENSSL_STACK *sk, size_t where) { + void *ret; + + if (!sk || where >= sk->num) { + return nullptr; + } + + ret = sk->data[where]; + + if (where != sk->num - 1) { + OPENSSL_memmove(&sk->data[where], &sk->data[where + 1], + sizeof(void *) * (sk->num - where - 1)); + } + + sk->num--; + return ret; +} + +void *OPENSSL_sk_delete_ptr(OPENSSL_STACK *sk, const void *p) { + if (sk == nullptr) { + return nullptr; + } + + for (size_t i = 0; i < sk->num; i++) { + if (sk->data[i] == p) { + return OPENSSL_sk_delete(sk, i); + } + } + + return nullptr; +} + +void 
OPENSSL_sk_delete_if(OPENSSL_STACK *sk, + OPENSSL_sk_call_delete_if_func call_func, + OPENSSL_sk_delete_if_func func, void *data) { + if (sk == nullptr) { + return; + } + + size_t new_num = 0; + for (size_t i = 0; i < sk->num; i++) { + if (!call_func(func, sk->data[i], data)) { + sk->data[new_num] = sk->data[i]; + new_num++; + } + } + sk->num = new_num; +} + +int OPENSSL_sk_find(const OPENSSL_STACK *sk, size_t *out_index, const void *p, + OPENSSL_sk_call_cmp_func call_cmp_func) { + if (sk == nullptr) { + return 0; + } + + if (sk->comp == nullptr) { + // Use pointer equality when no comparison function has been set. + for (size_t i = 0; i < sk->num; i++) { + if (sk->data[i] == p) { + if (out_index) { + *out_index = i; + } + return 1; + } + } + return 0; + } + + if (p == nullptr) { + return 0; + } + + if (!OPENSSL_sk_is_sorted(sk)) { + for (size_t i = 0; i < sk->num; i++) { + if (call_cmp_func(sk->comp, p, sk->data[i]) == 0) { + if (out_index) { + *out_index = i; + } + return 1; + } + } + return 0; + } + + // The stack is sorted, so binary search to find the element. + // + // |lo| and |hi| maintain a half-open interval of where the answer may be. All + // indices such that |lo <= idx < hi| are candidates. + size_t lo = 0, hi = sk->num; + while (lo < hi) { + // Bias |mid| towards |lo|. See the |r == 0| case below. + size_t mid = lo + (hi - lo - 1) / 2; + assert(lo <= mid && mid < hi); + int r = call_cmp_func(sk->comp, p, sk->data[mid]); + if (r > 0) { + lo = mid + 1; // |mid| is too low. + } else if (r < 0) { + hi = mid; // |mid| is too high. + } else { + // |mid| matches. However, this function returns the earliest match, so we + // can only return if the range has size one. + if (hi - lo == 1) { + if (out_index != nullptr) { + *out_index = mid; + } + return 1; + } + // The sample is biased towards |lo|. |mid| can only be |hi - 1| if + // |hi - lo| was one, so this makes forward progress. 
+ assert(mid + 1 < hi); + hi = mid + 1; + } + } + + assert(lo == hi); + return 0; // Not found. +} + +void *OPENSSL_sk_shift(OPENSSL_STACK *sk) { + if (sk == nullptr) { + return nullptr; + } + if (sk->num == 0) { + return nullptr; + } + return OPENSSL_sk_delete(sk, 0); +} + +size_t OPENSSL_sk_push(OPENSSL_STACK *sk, void *p) { + return OPENSSL_sk_insert(sk, p, sk->num); +} + +void *OPENSSL_sk_pop(OPENSSL_STACK *sk) { + if (sk == nullptr) { + return nullptr; + } + if (sk->num == 0) { + return nullptr; + } + return OPENSSL_sk_delete(sk, sk->num - 1); +} + +OPENSSL_STACK *OPENSSL_sk_dup(const OPENSSL_STACK *sk) { + if (sk == nullptr) { + return nullptr; + } + + OPENSSL_STACK *ret = New(); + if (ret == nullptr) { + return nullptr; + } + + ret->data = reinterpret_cast( + OPENSSL_memdup(sk->data, sizeof(void *) * sk->num_alloc)); + if (ret->data == nullptr) { + goto err; + } + + ret->num = sk->num; + ret->sorted = sk->sorted; + ret->num_alloc = sk->num_alloc; + ret->comp = sk->comp; + return ret; + +err: + OPENSSL_sk_free(ret); + return nullptr; +} + +void OPENSSL_sk_sort(OPENSSL_STACK *sk, + OPENSSL_sk_call_cmp_func call_cmp_func) { + if (sk == nullptr || sk->comp == nullptr || sk->sorted) { + return; + } + + std::sort(sk->data, sk->data + sk->num, [&](void *a, void *b) { + return call_cmp_func(sk->comp, a, b) < 0; + }); + sk->sorted = 1; +} + +void OPENSSL_sk_sort_and_dedup( + OPENSSL_STACK *sk, OPENSSL_sk_call_cmp_func call_cmp_func, + OPENSSL_sk_call_free_func call_free_func, OPENSSL_sk_free_func free_func) { + OPENSSL_sk_sort(sk, call_cmp_func); + if (sk == nullptr || sk->comp == nullptr || sk->num <= 1) { + return; + } + + size_t new_num = 1; + for (size_t i = 1; i < sk->num; i++) { + if (call_cmp_func(sk->comp, sk->data[i], sk->data[new_num - 1]) != 0) { + sk->data[new_num] = sk->data[i]; + new_num++; + } else if (free_func != nullptr) { + call_free_func(free_func, sk->data[i]); + } + } + sk->num = new_num; +} + +int OPENSSL_sk_is_sorted(const OPENSSL_STACK *sk) { 
+ if (!sk) { + return 1; + } + // Zero- and one-element lists are always sorted. + return sk->sorted || (sk->comp != nullptr && sk->num < 2); +} + +OPENSSL_sk_cmp_func OPENSSL_sk_set_cmp_func(OPENSSL_STACK *sk, + OPENSSL_sk_cmp_func comp) { + OPENSSL_sk_cmp_func old = sk->comp; + + if (sk->comp != comp) { + sk->sorted = 0; + } + sk->comp = comp; + + return old; +} + +OPENSSL_STACK *OPENSSL_sk_deep_copy(const OPENSSL_STACK *sk, + OPENSSL_sk_call_copy_func call_copy_func, + OPENSSL_sk_copy_func copy_func, + OPENSSL_sk_call_free_func call_free_func, + OPENSSL_sk_free_func free_func) { + OPENSSL_STACK *ret = OPENSSL_sk_dup(sk); + if (ret == nullptr) { + return nullptr; + } + + for (size_t i = 0; i < ret->num; i++) { + if (ret->data[i] == nullptr) { + continue; + } + ret->data[i] = call_copy_func(copy_func, ret->data[i]); + if (ret->data[i] == nullptr) { + for (size_t j = 0; j < i; j++) { + if (ret->data[j] != nullptr) { + call_free_func(free_func, ret->data[j]); + } + } + OPENSSL_sk_free(ret); + return nullptr; + } + } + + return ret; +} + +OPENSSL_STACK *sk_new_null() { return OPENSSL_sk_new_null(); } + +size_t sk_num(const OPENSSL_STACK *sk) { return OPENSSL_sk_num(sk); } + +void *sk_value(const OPENSSL_STACK *sk, size_t i) { + return OPENSSL_sk_value(sk, i); +} + +void sk_free(OPENSSL_STACK *sk) { OPENSSL_sk_free(sk); } + +size_t sk_push(OPENSSL_STACK *sk, void *p) { return OPENSSL_sk_push(sk, p); } + +void *sk_pop(OPENSSL_STACK *sk) { return OPENSSL_sk_pop(sk); } + +void sk_pop_free_ex(OPENSSL_STACK *sk, OPENSSL_sk_call_free_func call_free_func, + OPENSSL_sk_free_func free_func) { + OPENSSL_sk_pop_free_ex(sk, call_free_func, free_func); +} diff --git a/third_party/boringssl/src/crypto/thread.c b/third_party/boringssl/src/crypto/thread.c deleted file mode 100644 index 25acce1b..00000000 --- a/third_party/boringssl/src/crypto/thread.c +++ /dev/null @@ -1,110 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. 
- * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. 
If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
*/ - -#include - - -int CRYPTO_num_locks(void) { return 1; } - -void CRYPTO_set_locking_callback(void (*func)(int mode, int lock_num, - const char *file, int line)) {} - -void (*CRYPTO_get_locking_callback(void))(int mode, int lock_num, - const char *file, int line) { - return NULL; -} - -void CRYPTO_set_add_lock_callback(int (*func)(int *num, int mount, int lock_num, - const char *file, int line)) {} - -const char *CRYPTO_get_lock_name(int lock_num) { - return "No old-style OpenSSL locks anymore"; -} - -int CRYPTO_THREADID_set_callback(void (*func)(CRYPTO_THREADID *)) { return 1; } - -void CRYPTO_THREADID_set_numeric(CRYPTO_THREADID *id, unsigned long val) {} - -void CRYPTO_THREADID_set_pointer(CRYPTO_THREADID *id, void *ptr) {} - -void CRYPTO_THREADID_current(CRYPTO_THREADID *id) {} - -void CRYPTO_set_id_callback(unsigned long (*func)(void)) {} - -void CRYPTO_set_dynlock_create_callback(struct CRYPTO_dynlock_value *( - *dyn_create_function)(const char *file, int line)) {} - -void CRYPTO_set_dynlock_lock_callback(void (*dyn_lock_function)( - int mode, struct CRYPTO_dynlock_value *l, const char *file, int line)) {} - -void CRYPTO_set_dynlock_destroy_callback(void (*dyn_destroy_function)( - struct CRYPTO_dynlock_value *l, const char *file, int line)) {} - -struct CRYPTO_dynlock_value *(*CRYPTO_get_dynlock_create_callback(void))( - const char *file, int line) { - return NULL; -} - -void (*CRYPTO_get_dynlock_lock_callback(void))(int mode, - struct CRYPTO_dynlock_value *l, - const char *file, int line) { - return NULL; -} - -void (*CRYPTO_get_dynlock_destroy_callback(void))( - struct CRYPTO_dynlock_value *l, const char *file, int line) { - return NULL; -} diff --git a/third_party/boringssl/src/crypto/thread.cc b/third_party/boringssl/src/crypto/thread.cc new file mode 100644 index 00000000..cdc4744a --- /dev/null +++ b/third_party/boringssl/src/crypto/thread.cc @@ -0,0 +1,68 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + + +int CRYPTO_num_locks() { return 1; } + +void CRYPTO_set_locking_callback(void (*func)(int mode, int lock_num, + const char *file, int line)) {} + +void (*CRYPTO_get_locking_callback())(int mode, int lock_num, const char *file, + int line) { + return nullptr; +} + +void CRYPTO_set_add_lock_callback(int (*func)(int *num, int mount, int lock_num, + const char *file, int line)) {} + +const char *CRYPTO_get_lock_name(int lock_num) { + return "No old-style OpenSSL locks anymore"; +} + +int CRYPTO_THREADID_set_callback(void (*func)(CRYPTO_THREADID *)) { return 1; } + +void CRYPTO_THREADID_set_numeric(CRYPTO_THREADID *id, unsigned long val) {} + +void CRYPTO_THREADID_set_pointer(CRYPTO_THREADID *id, void *ptr) {} + +void CRYPTO_THREADID_current(CRYPTO_THREADID *id) {} + +void CRYPTO_set_id_callback(unsigned long (*func)()) {} + +void CRYPTO_set_dynlock_create_callback(struct CRYPTO_dynlock_value *( + *dyn_create_function)(const char *file, int line)) {} + +void CRYPTO_set_dynlock_lock_callback(void (*dyn_lock_function)( + int mode, struct CRYPTO_dynlock_value *l, const char *file, int line)) {} + +void CRYPTO_set_dynlock_destroy_callback(void (*dyn_destroy_function)( + struct CRYPTO_dynlock_value *l, const char *file, int line)) {} + +struct CRYPTO_dynlock_value *(*CRYPTO_get_dynlock_create_callback())( + const char *file, int line) { + return nullptr; +} + +void 
(*CRYPTO_get_dynlock_lock_callback())(int mode, + struct CRYPTO_dynlock_value *l, + const char *file, int line) { + return nullptr; +} + +void (*CRYPTO_get_dynlock_destroy_callback())(struct CRYPTO_dynlock_value *l, + const char *file, int line) { + return nullptr; +} diff --git a/third_party/boringssl/src/crypto/thread_none.c b/third_party/boringssl/src/crypto/thread_none.c deleted file mode 100644 index 4f07b9d9..00000000 --- a/third_party/boringssl/src/crypto/thread_none.c +++ /dev/null @@ -1,59 +0,0 @@ -/* Copyright (c) 2015, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ - -#include "internal.h" - -#if !defined(OPENSSL_THREADS) - -void CRYPTO_MUTEX_init(CRYPTO_MUTEX *lock) {} - -void CRYPTO_MUTEX_lock_read(CRYPTO_MUTEX *lock) {} - -void CRYPTO_MUTEX_lock_write(CRYPTO_MUTEX *lock) {} - -void CRYPTO_MUTEX_unlock_read(CRYPTO_MUTEX *lock) {} - -void CRYPTO_MUTEX_unlock_write(CRYPTO_MUTEX *lock) {} - -void CRYPTO_MUTEX_cleanup(CRYPTO_MUTEX *lock) {} - -void CRYPTO_STATIC_MUTEX_lock_read(struct CRYPTO_STATIC_MUTEX *lock) {} - -void CRYPTO_STATIC_MUTEX_lock_write(struct CRYPTO_STATIC_MUTEX *lock) {} - -void CRYPTO_STATIC_MUTEX_unlock_read(struct CRYPTO_STATIC_MUTEX *lock) {} - -void CRYPTO_STATIC_MUTEX_unlock_write(struct CRYPTO_STATIC_MUTEX *lock) {} - -void CRYPTO_once(CRYPTO_once_t *once, void (*init)(void)) { - if (*once) { - return; - } - *once = 1; - init(); -} - -static void *g_thread_locals[NUM_OPENSSL_THREAD_LOCALS]; - -void *CRYPTO_get_thread_local(thread_local_data_t index) { - return g_thread_locals[index]; -} - -int CRYPTO_set_thread_local(thread_local_data_t index, void *value, - thread_local_destructor_t destructor) { - g_thread_locals[index] = value; - return 1; -} - -#endif // !OPENSSL_THREADS diff --git a/third_party/boringssl/src/crypto/thread_none.cc b/third_party/boringssl/src/crypto/thread_none.cc new file mode 100644 index 00000000..432e4cf1 --- /dev/null +++ b/third_party/boringssl/src/crypto/thread_none.cc @@ -0,0 +1,48 @@ +// Copyright 2015 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "internal.h" + +#if !defined(OPENSSL_THREADS) + + +using namespace bssl; + +void StaticMutex::LockRead() {} +void StaticMutex::UnlockRead() {} +void StaticMutex::LockWrite() {} +void StaticMutex::UnlockWrite() {} +Mutex::~Mutex() {} + +void bssl::CRYPTO_once(CRYPTO_once_t *once, void (*init)()) { + if (*once) { + return; + } + *once = 1; + init(); +} + +static void *g_thread_locals[NUM_OPENSSL_THREAD_LOCALS]; + +void *bssl::CRYPTO_get_thread_local(thread_local_data_t index) { + return g_thread_locals[index]; +} + +int bssl::CRYPTO_set_thread_local(thread_local_data_t index, void *value, + thread_local_destructor_t destructor) { + g_thread_locals[index] = value; + return 1; +} + +#endif // !OPENSSL_THREADS diff --git a/third_party/boringssl/src/crypto/thread_pthread.c b/third_party/boringssl/src/crypto/thread_pthread.c deleted file mode 100644 index 08bdd5a7..00000000 --- a/third_party/boringssl/src/crypto/thread_pthread.c +++ /dev/null @@ -1,180 +0,0 @@ -/* Copyright (c) 2015, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ - -#include "internal.h" - -#if defined(OPENSSL_PTHREADS) - -#include -#include -#include -#include - -#include - - -static_assert(sizeof(CRYPTO_MUTEX) >= sizeof(pthread_rwlock_t), - "CRYPTO_MUTEX is too small"); -static_assert(alignof(CRYPTO_MUTEX) >= alignof(pthread_rwlock_t), - "CRYPTO_MUTEX has insufficient alignment"); - -void CRYPTO_MUTEX_init(CRYPTO_MUTEX *lock) { - if (pthread_rwlock_init((pthread_rwlock_t *) lock, NULL) != 0) { - abort(); - } -} - -void CRYPTO_MUTEX_lock_read(CRYPTO_MUTEX *lock) { - if (pthread_rwlock_rdlock((pthread_rwlock_t *) lock) != 0) { - abort(); - } -} - -void CRYPTO_MUTEX_lock_write(CRYPTO_MUTEX *lock) { - if (pthread_rwlock_wrlock((pthread_rwlock_t *) lock) != 0) { - abort(); - } -} - -void CRYPTO_MUTEX_unlock_read(CRYPTO_MUTEX *lock) { - if (pthread_rwlock_unlock((pthread_rwlock_t *) lock) != 0) { - abort(); - } -} - -void CRYPTO_MUTEX_unlock_write(CRYPTO_MUTEX *lock) { - if (pthread_rwlock_unlock((pthread_rwlock_t *) lock) != 0) { - abort(); - } -} - -void CRYPTO_MUTEX_cleanup(CRYPTO_MUTEX *lock) { - pthread_rwlock_destroy((pthread_rwlock_t *) lock); -} - -void CRYPTO_STATIC_MUTEX_lock_read(struct CRYPTO_STATIC_MUTEX *lock) { - if (pthread_rwlock_rdlock(&lock->lock) != 0) { - abort(); - } -} - -void CRYPTO_STATIC_MUTEX_lock_write(struct CRYPTO_STATIC_MUTEX *lock) { - if (pthread_rwlock_wrlock(&lock->lock) != 0) { - abort(); - } -} - -void CRYPTO_STATIC_MUTEX_unlock_read(struct CRYPTO_STATIC_MUTEX *lock) { - if (pthread_rwlock_unlock(&lock->lock) != 0) { - abort(); - } -} - -void CRYPTO_STATIC_MUTEX_unlock_write(struct CRYPTO_STATIC_MUTEX *lock) { - if (pthread_rwlock_unlock(&lock->lock) != 0) { - abort(); - } -} - -void CRYPTO_once(CRYPTO_once_t *once, void (*init)(void)) { - if (pthread_once(once, init) != 0) { - abort(); - } -} - -static pthread_mutex_t g_destructors_lock = PTHREAD_MUTEX_INITIALIZER; -static thread_local_destructor_t g_destructors[NUM_OPENSSL_THREAD_LOCALS]; - -// thread_local_destructor is called when a 
thread exits. It releases thread -// local data for that thread only. -static void thread_local_destructor(void *arg) { - if (arg == NULL) { - return; - } - - thread_local_destructor_t destructors[NUM_OPENSSL_THREAD_LOCALS]; - if (pthread_mutex_lock(&g_destructors_lock) != 0) { - return; - } - OPENSSL_memcpy(destructors, g_destructors, sizeof(destructors)); - pthread_mutex_unlock(&g_destructors_lock); - - unsigned i; - void **pointers = arg; - for (i = 0; i < NUM_OPENSSL_THREAD_LOCALS; i++) { - if (destructors[i] != NULL) { - destructors[i](pointers[i]); - } - } - - OPENSSL_free(pointers); -} - -static pthread_once_t g_thread_local_init_once = PTHREAD_ONCE_INIT; -static pthread_key_t g_thread_local_key; -static int g_thread_local_key_created = 0; - -static void thread_local_init(void) { - g_thread_local_key_created = - pthread_key_create(&g_thread_local_key, thread_local_destructor) == 0; -} - -void *CRYPTO_get_thread_local(thread_local_data_t index) { - CRYPTO_once(&g_thread_local_init_once, thread_local_init); - if (!g_thread_local_key_created) { - return NULL; - } - - void **pointers = pthread_getspecific(g_thread_local_key); - if (pointers == NULL) { - return NULL; - } - return pointers[index]; -} - -int CRYPTO_set_thread_local(thread_local_data_t index, void *value, - thread_local_destructor_t destructor) { - CRYPTO_once(&g_thread_local_init_once, thread_local_init); - if (!g_thread_local_key_created) { - destructor(value); - return 0; - } - - void **pointers = pthread_getspecific(g_thread_local_key); - if (pointers == NULL) { - pointers = OPENSSL_malloc(sizeof(void *) * NUM_OPENSSL_THREAD_LOCALS); - if (pointers == NULL) { - destructor(value); - return 0; - } - OPENSSL_memset(pointers, 0, sizeof(void *) * NUM_OPENSSL_THREAD_LOCALS); - if (pthread_setspecific(g_thread_local_key, pointers) != 0) { - OPENSSL_free(pointers); - destructor(value); - return 0; - } - } - - if (pthread_mutex_lock(&g_destructors_lock) != 0) { - destructor(value); - return 0; - } - 
g_destructors[index] = destructor; - pthread_mutex_unlock(&g_destructors_lock); - - pointers[index] = value; - return 1; -} - -#endif // OPENSSL_PTHREADS diff --git a/third_party/boringssl/src/crypto/thread_pthread.cc b/third_party/boringssl/src/crypto/thread_pthread.cc new file mode 100644 index 00000000..4e132fb7 --- /dev/null +++ b/third_party/boringssl/src/crypto/thread_pthread.cc @@ -0,0 +1,138 @@ +// Copyright 2015 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Ensure we can't call OPENSSL_malloc circularly. +#define _BORINGSSL_PROHIBIT_OPENSSL_MALLOC +#include "internal.h" + +#if defined(OPENSSL_PTHREADS) + +#include +#include +#include +#include + + +BSSL_NAMESPACE_BEGIN + +void StaticMutex::LockRead() { BSSL_CHECK(pthread_rwlock_rdlock(&lock_) == 0); } + +void StaticMutex::UnlockRead() { + BSSL_CHECK(pthread_rwlock_unlock(&lock_) == 0); +} + +void StaticMutex::LockWrite() { + BSSL_CHECK(pthread_rwlock_wrlock(&lock_) == 0); +} + +void StaticMutex::UnlockWrite() { + BSSL_CHECK(pthread_rwlock_unlock(&lock_) == 0); +} + +Mutex::~Mutex() { pthread_rwlock_destroy(&lock_); } + +void CRYPTO_once(CRYPTO_once_t *once, void (*init)()) { + BSSL_CHECK(pthread_once(once, init) == 0); +} + +static pthread_mutex_t g_destructors_lock = PTHREAD_MUTEX_INITIALIZER; +static thread_local_destructor_t g_destructors[NUM_OPENSSL_THREAD_LOCALS]; + +// thread_local_destructor is called when a thread exits. 
It releases thread +// local data for that thread only. +static void thread_local_destructor(void *arg) { + if (arg == nullptr) { + return; + } + + thread_local_destructor_t destructors[NUM_OPENSSL_THREAD_LOCALS]; + if (pthread_mutex_lock(&g_destructors_lock) != 0) { + return; + } + OPENSSL_memcpy(destructors, g_destructors, sizeof(destructors)); + pthread_mutex_unlock(&g_destructors_lock); + + unsigned i; + void **pointers = reinterpret_cast(arg); + for (i = 0; i < NUM_OPENSSL_THREAD_LOCALS; i++) { + if (destructors[i] != nullptr) { + destructors[i](pointers[i]); + } + } + + free(pointers); +} + +static pthread_once_t g_thread_local_init_once = PTHREAD_ONCE_INIT; +static pthread_key_t g_thread_local_key; +static int g_thread_local_key_created = 0; + +static void thread_local_init() { + g_thread_local_key_created = + pthread_key_create(&g_thread_local_key, thread_local_destructor) == 0; +} + +void *CRYPTO_get_thread_local(thread_local_data_t index) { + CRYPTO_once(&g_thread_local_init_once, thread_local_init); + if (!g_thread_local_key_created) { + return nullptr; + } + + void **pointers = + reinterpret_cast(pthread_getspecific(g_thread_local_key)); + if (pointers == nullptr) { + return nullptr; + } + return pointers[index]; +} + +int CRYPTO_set_thread_local(thread_local_data_t index, void *value, + thread_local_destructor_t destructor) { + CRYPTO_once(&g_thread_local_init_once, thread_local_init); + if (!g_thread_local_key_created) { + destructor(value); + return 0; + } + + void **pointers = + reinterpret_cast(pthread_getspecific(g_thread_local_key)); + if (pointers == nullptr) { + pointers = reinterpret_cast( + malloc(sizeof(void *) * NUM_OPENSSL_THREAD_LOCALS)); + if (pointers == nullptr) { + destructor(value); + return 0; + } + OPENSSL_memset(pointers, 0, sizeof(void *) * NUM_OPENSSL_THREAD_LOCALS); + if (pthread_setspecific(g_thread_local_key, pointers) != 0) { + free(pointers); + destructor(value); + return 0; + } + } + + if 
(pthread_mutex_lock(&g_destructors_lock) != 0) { + destructor(value); + return 0; + } + g_destructors[index] = destructor; + pthread_mutex_unlock(&g_destructors_lock); + + pointers[index] = value; + return 1; +} + +BSSL_NAMESPACE_END + +#endif // OPENSSL_PTHREADS diff --git a/third_party/boringssl/src/crypto/thread_win.c b/third_party/boringssl/src/crypto/thread_win.c deleted file mode 100644 index 3b61bfcf..00000000 --- a/third_party/boringssl/src/crypto/thread_win.c +++ /dev/null @@ -1,258 +0,0 @@ -/* Copyright (c) 2015, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ - -#include "internal.h" - -#if defined(OPENSSL_WINDOWS_THREADS) - -OPENSSL_MSVC_PRAGMA(warning(push, 3)) -#include -OPENSSL_MSVC_PRAGMA(warning(pop)) - -#include -#include -#include - -#include - - -static_assert(sizeof(CRYPTO_MUTEX) >= sizeof(SRWLOCK), - "CRYPTO_MUTEX is too small"); -static_assert(alignof(CRYPTO_MUTEX) >= alignof(SRWLOCK), - "CRYPTO_MUTEX has insufficient alignment"); - -static BOOL CALLBACK call_once_init(INIT_ONCE *once, void *arg, void **out) { - void (**init)(void) = (void (**)(void))arg; - (**init)(); - return TRUE; -} - -void CRYPTO_once(CRYPTO_once_t *once, void (*init)(void)) { - if (!InitOnceExecuteOnce(once, call_once_init, &init, NULL)) { - abort(); - } -} - -void CRYPTO_MUTEX_init(CRYPTO_MUTEX *lock) { - InitializeSRWLock((SRWLOCK *) lock); -} - -void CRYPTO_MUTEX_lock_read(CRYPTO_MUTEX *lock) { - AcquireSRWLockShared((SRWLOCK *) lock); -} - -void CRYPTO_MUTEX_lock_write(CRYPTO_MUTEX *lock) { - AcquireSRWLockExclusive((SRWLOCK *) lock); -} - -void CRYPTO_MUTEX_unlock_read(CRYPTO_MUTEX *lock) { - ReleaseSRWLockShared((SRWLOCK *) lock); -} - -void CRYPTO_MUTEX_unlock_write(CRYPTO_MUTEX *lock) { - ReleaseSRWLockExclusive((SRWLOCK *) lock); -} - -void CRYPTO_MUTEX_cleanup(CRYPTO_MUTEX *lock) { - // SRWLOCKs require no cleanup. 
-} - -void CRYPTO_STATIC_MUTEX_lock_read(struct CRYPTO_STATIC_MUTEX *lock) { - AcquireSRWLockShared(&lock->lock); -} - -void CRYPTO_STATIC_MUTEX_lock_write(struct CRYPTO_STATIC_MUTEX *lock) { - AcquireSRWLockExclusive(&lock->lock); -} - -void CRYPTO_STATIC_MUTEX_unlock_read(struct CRYPTO_STATIC_MUTEX *lock) { - ReleaseSRWLockShared(&lock->lock); -} - -void CRYPTO_STATIC_MUTEX_unlock_write(struct CRYPTO_STATIC_MUTEX *lock) { - ReleaseSRWLockExclusive(&lock->lock); -} - -static SRWLOCK g_destructors_lock = SRWLOCK_INIT; -static thread_local_destructor_t g_destructors[NUM_OPENSSL_THREAD_LOCALS]; - -static CRYPTO_once_t g_thread_local_init_once = CRYPTO_ONCE_INIT; -static DWORD g_thread_local_key; -static int g_thread_local_failed; - -static void thread_local_init(void) { - g_thread_local_key = TlsAlloc(); - g_thread_local_failed = (g_thread_local_key == TLS_OUT_OF_INDEXES); -} - -static void NTAPI thread_local_destructor(PVOID module, DWORD reason, - PVOID reserved) { - // Only free memory on |DLL_THREAD_DETACH|, not |DLL_PROCESS_DETACH|. In - // VS2015's debug runtime, the C runtime has been unloaded by the time - // |DLL_PROCESS_DETACH| runs. See https://crbug.com/575795. This is consistent - // with |pthread_key_create| which does not call destructors on process exit, - // only thread exit. 
- if (reason != DLL_THREAD_DETACH) { - return; - } - - CRYPTO_once(&g_thread_local_init_once, thread_local_init); - if (g_thread_local_failed) { - return; - } - - void **pointers = (void**) TlsGetValue(g_thread_local_key); - if (pointers == NULL) { - return; - } - - thread_local_destructor_t destructors[NUM_OPENSSL_THREAD_LOCALS]; - - AcquireSRWLockExclusive(&g_destructors_lock); - OPENSSL_memcpy(destructors, g_destructors, sizeof(destructors)); - ReleaseSRWLockExclusive(&g_destructors_lock); - - for (unsigned i = 0; i < NUM_OPENSSL_THREAD_LOCALS; i++) { - if (destructors[i] != NULL) { - destructors[i](pointers[i]); - } - } - - OPENSSL_free(pointers); -} - -// Thread Termination Callbacks. -// -// Windows doesn't support a per-thread destructor with its TLS primitives. -// So, we build it manually by inserting a function to be called on each -// thread's exit. This magic is from http://www.codeproject.com/threads/tls.asp -// and it works for VC++ 7.0 and later. -// -// Force a reference to _tls_used to make the linker create the TLS directory -// if it's not already there. (E.g. if __declspec(thread) is not used). Force -// a reference to p_thread_callback_boringssl to prevent whole program -// optimization from discarding the variable. -// -// Note, in the prefixed build, |p_thread_callback_boringssl| may be a macro. -#define STRINGIFY(x) #x -#define EXPAND_AND_STRINGIFY(x) STRINGIFY(x) -#ifdef _WIN64 -__pragma(comment(linker, "/INCLUDE:_tls_used")) -__pragma(comment( - linker, "/INCLUDE:" EXPAND_AND_STRINGIFY(p_thread_callback_boringssl))) -#else -__pragma(comment(linker, "/INCLUDE:__tls_used")) -__pragma(comment( - linker, "/INCLUDE:_" EXPAND_AND_STRINGIFY(p_thread_callback_boringssl))) -#endif - -// .CRT$XLA to .CRT$XLZ is an array of PIMAGE_TLS_CALLBACK pointers that are -// called automatically by the OS loader code (not the CRT) when the module is -// loaded and on thread creation. 
They are NOT called if the module has been -// loaded by a LoadLibrary() call. It must have implicitly been loaded at -// process startup. -// -// By implicitly loaded, I mean that it is directly referenced by the main EXE -// or by one of its dependent DLLs. Delay-loaded DLL doesn't count as being -// implicitly loaded. -// -// See VC\crt\src\tlssup.c for reference. - -// The linker must not discard p_thread_callback_boringssl. (We force a -// reference to this variable with a linker /INCLUDE:symbol pragma to ensure -// that.) If this variable is discarded, the OnThreadExit function will never -// be called. -#ifdef _WIN64 - -// .CRT section is merged with .rdata on x64 so it must be constant data. -#pragma const_seg(".CRT$XLC") -// When defining a const variable, it must have external linkage to be sure the -// linker doesn't discard it. -extern const PIMAGE_TLS_CALLBACK p_thread_callback_boringssl; -const PIMAGE_TLS_CALLBACK p_thread_callback_boringssl = thread_local_destructor; -// Reset the default section. -#pragma const_seg() - -#else - -#pragma data_seg(".CRT$XLC") -PIMAGE_TLS_CALLBACK p_thread_callback_boringssl = thread_local_destructor; -// Reset the default section. -#pragma data_seg() - -#endif // _WIN64 - -static void **get_thread_locals(void) { - // |TlsGetValue| clears the last error even on success, so that callers may - // distinguish it successfully returning NULL or failing. It is documented to - // never fail if the argument is a valid index from |TlsAlloc|, so we do not - // need to handle this. - // - // However, this error-mangling behavior interferes with the caller's use of - // |GetLastError|. In particular |SSL_get_error| queries the error queue to - // determine whether the caller should look at the OS's errors. To avoid - // destroying state, save and restore the Windows error. 
- // - // https://msdn.microsoft.com/en-us/library/windows/desktop/ms686812(v=vs.85).aspx - DWORD last_error = GetLastError(); - void **ret = TlsGetValue(g_thread_local_key); - SetLastError(last_error); - return ret; -} - -void *CRYPTO_get_thread_local(thread_local_data_t index) { - CRYPTO_once(&g_thread_local_init_once, thread_local_init); - if (g_thread_local_failed) { - return NULL; - } - - void **pointers = get_thread_locals(); - if (pointers == NULL) { - return NULL; - } - return pointers[index]; -} - -int CRYPTO_set_thread_local(thread_local_data_t index, void *value, - thread_local_destructor_t destructor) { - CRYPTO_once(&g_thread_local_init_once, thread_local_init); - if (g_thread_local_failed) { - destructor(value); - return 0; - } - - void **pointers = get_thread_locals(); - if (pointers == NULL) { - pointers = OPENSSL_malloc(sizeof(void *) * NUM_OPENSSL_THREAD_LOCALS); - if (pointers == NULL) { - destructor(value); - return 0; - } - OPENSSL_memset(pointers, 0, sizeof(void *) * NUM_OPENSSL_THREAD_LOCALS); - if (TlsSetValue(g_thread_local_key, pointers) == 0) { - OPENSSL_free(pointers); - destructor(value); - return 0; - } - } - - AcquireSRWLockExclusive(&g_destructors_lock); - g_destructors[index] = destructor; - ReleaseSRWLockExclusive(&g_destructors_lock); - - pointers[index] = value; - return 1; -} - -#endif // OPENSSL_WINDOWS_THREADS diff --git a/third_party/boringssl/src/crypto/thread_win.cc b/third_party/boringssl/src/crypto/thread_win.cc new file mode 100644 index 00000000..5fce1bcc --- /dev/null +++ b/third_party/boringssl/src/crypto/thread_win.cc @@ -0,0 +1,227 @@ +// Copyright 2015 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Ensure we can't call OPENSSL_malloc circularly. +#define _BORINGSSL_PROHIBIT_OPENSSL_MALLOC +#include "internal.h" + +#if defined(OPENSSL_WINDOWS_THREADS) + +#include + +#include +#include +#include + + +BSSL_NAMESPACE_BEGIN + +static BOOL CALLBACK call_once_init(INIT_ONCE *once, void *arg, void **out) { + void (**init)() = (void (**)())arg; + (**init)(); + return TRUE; +} + +void CRYPTO_once(CRYPTO_once_t *once, void (*init)()) { + BSSL_CHECK(InitOnceExecuteOnce(once, call_once_init, &init, nullptr)); +} + +void StaticMutex::LockRead() { AcquireSRWLockShared(&lock_); } +void StaticMutex::UnlockRead() { ReleaseSRWLockShared(&lock_); } +void StaticMutex::LockWrite() { AcquireSRWLockExclusive(&lock_); } +void StaticMutex::UnlockWrite() { ReleaseSRWLockExclusive(&lock_); } +Mutex::~Mutex() { /* SRWLOCKs require no cleanup. */ } + +static SRWLOCK g_destructors_lock = SRWLOCK_INIT; +static thread_local_destructor_t g_destructors[NUM_OPENSSL_THREAD_LOCALS]; + +static CRYPTO_once_t g_thread_local_init_once = CRYPTO_ONCE_INIT; +static DWORD g_thread_local_key; +static int g_thread_local_failed; + +static void thread_local_init() { + g_thread_local_key = TlsAlloc(); + g_thread_local_failed = (g_thread_local_key == TLS_OUT_OF_INDEXES); +} + +static void NTAPI thread_local_destructor(PVOID module, DWORD reason, + PVOID reserved) { + // Only free memory on |DLL_THREAD_DETACH|, not |DLL_PROCESS_DETACH|. In + // VS2015's debug runtime, the C runtime has been unloaded by the time + // |DLL_PROCESS_DETACH| runs. See https://crbug.com/575795. 
This is consistent + // with |pthread_key_create| which does not call destructors on process exit, + // only thread exit. + if (reason != DLL_THREAD_DETACH) { + return; + } + + CRYPTO_once(&g_thread_local_init_once, thread_local_init); + if (g_thread_local_failed) { + return; + } + + void **pointers = (void **)TlsGetValue(g_thread_local_key); + if (pointers == nullptr) { + return; + } + + thread_local_destructor_t destructors[NUM_OPENSSL_THREAD_LOCALS]; + + AcquireSRWLockExclusive(&g_destructors_lock); + OPENSSL_memcpy(destructors, g_destructors, sizeof(destructors)); + ReleaseSRWLockExclusive(&g_destructors_lock); + + for (unsigned i = 0; i < NUM_OPENSSL_THREAD_LOCALS; i++) { + if (destructors[i] != nullptr) { + destructors[i](pointers[i]); + } + } + + free(pointers); +} + +// Thread Termination Callbacks. +// +// Windows doesn't support a per-thread destructor with its TLS primitives. +// So, we build it manually by inserting a function to be called on each +// thread's exit. This magic is from http://www.codeproject.com/threads/tls.asp +// and it works for VC++ 7.0 and later. +// +// Force a reference to _tls_used to make the linker create the TLS directory +// if it's not already there. (E.g. if __declspec(thread) is not used). Force +// a reference to p_thread_callback_boringssl to prevent whole program +// optimization from discarding the variable. +// +// Note, in the prefixed build, |p_thread_callback_boringssl| may be a macro. 
+#define STRINGIFY(x) #x +#define EXPAND_AND_STRINGIFY(x) STRINGIFY(x) +#ifdef _WIN64 +__pragma(comment(linker, "/INCLUDE:_tls_used")) __pragma(comment( + linker, "/INCLUDE:" EXPAND_AND_STRINGIFY(p_thread_callback_boringssl))) +#else +__pragma(comment(linker, "/INCLUDE:__tls_used")) __pragma(comment( + linker, "/INCLUDE:_" EXPAND_AND_STRINGIFY(p_thread_callback_boringssl))) +#endif + +// .CRT$XLA to .CRT$XLZ is an array of PIMAGE_TLS_CALLBACK pointers that are +// called automatically by the OS loader code (not the CRT) when the module is +// loaded and on thread creation. They are NOT called if the module has been +// loaded by a LoadLibrary() call. It must have implicitly been loaded at +// process startup. +// +// By implicitly loaded, I mean that it is directly referenced by the main EXE +// or by one of its dependent DLLs. Delay-loaded DLL doesn't count as being +// implicitly loaded. +// +// See VC\crt\src\tlssup.c for reference. + +// The linker must not discard p_thread_callback_boringssl. (We force a +// reference to this variable with a linker /INCLUDE:symbol pragma to ensure +// that.) If this variable is discarded, the OnThreadExit function will never +// be called. +#ifdef _WIN64 + +// .CRT section is merged with .rdata on x64 so it must be constant data. +#pragma const_seg(".CRT$XLC") + // clang-format off + // When defining a const variable, it must have external linkage to be sure + // the linker doesn't discard it. +extern "C" { + extern const PIMAGE_TLS_CALLBACK p_thread_callback_boringssl; +} +// clang-format on +const PIMAGE_TLS_CALLBACK p_thread_callback_boringssl = thread_local_destructor; +// Reset the default section. +#pragma const_seg() + +#else + +#pragma data_seg(".CRT$XLC") + // clang-format off +extern "C" { + extern PIMAGE_TLS_CALLBACK p_thread_callback_boringssl; +} +// clang-format on +PIMAGE_TLS_CALLBACK p_thread_callback_boringssl = thread_local_destructor; +// Reset the default section. 
+#pragma data_seg() + +#endif // _WIN64 + +static void **get_thread_locals() { + // |TlsGetValue| clears the last error even on success, so that callers may + // distinguish it successfully returning NULL or failing. It is documented to + // never fail if the argument is a valid index from |TlsAlloc|, so we do not + // need to handle this. + // + // However, this error-mangling behavior interferes with the caller's use of + // |GetLastError|. In particular |SSL_get_error| queries the error queue to + // determine whether the caller should look at the OS's errors. To avoid + // destroying state, save and restore the Windows error. + // + // https://msdn.microsoft.com/en-us/library/windows/desktop/ms686812(v=vs.85).aspx + DWORD last_error = GetLastError(); + void **ret = reinterpret_cast(TlsGetValue(g_thread_local_key)); + SetLastError(last_error); + return ret; +} + +void *CRYPTO_get_thread_local(thread_local_data_t index) { + CRYPTO_once(&g_thread_local_init_once, thread_local_init); + if (g_thread_local_failed) { + return nullptr; + } + + void **pointers = get_thread_locals(); + if (pointers == nullptr) { + return nullptr; + } + return pointers[index]; +} + +int CRYPTO_set_thread_local(thread_local_data_t index, void *value, + thread_local_destructor_t destructor) { + CRYPTO_once(&g_thread_local_init_once, thread_local_init); + if (g_thread_local_failed) { + destructor(value); + return 0; + } + + void **pointers = get_thread_locals(); + if (pointers == nullptr) { + pointers = reinterpret_cast( + malloc(sizeof(void *) * NUM_OPENSSL_THREAD_LOCALS)); + if (pointers == nullptr) { + destructor(value); + return 0; + } + OPENSSL_memset(pointers, 0, sizeof(void *) * NUM_OPENSSL_THREAD_LOCALS); + if (TlsSetValue(g_thread_local_key, pointers) == 0) { + free(pointers); + destructor(value); + return 0; + } + } + + AcquireSRWLockExclusive(&g_destructors_lock); + g_destructors[index] = destructor; + ReleaseSRWLockExclusive(&g_destructors_lock); + + pointers[index] = value; + 
return 1; +} + +BSSL_NAMESPACE_END + +#endif // OPENSSL_WINDOWS_THREADS diff --git a/third_party/boringssl/src/crypto/trust_token/internal.h b/third_party/boringssl/src/crypto/trust_token/internal.h index 31ecc49b..d3afc172 100644 --- a/third_party/boringssl/src/crypto/trust_token/internal.h +++ b/third_party/boringssl/src/crypto/trust_token/internal.h @@ -1,19 +1,19 @@ -/* Copyright (c) 2019, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#ifndef OPENSSL_HEADER_TRUST_TOKEN_INTERNAL_H -#define OPENSSL_HEADER_TRUST_TOKEN_INTERNAL_H +// Copyright 2019 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#ifndef OPENSSL_HEADER_CRYPTO_TRUST_TOKEN_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_TRUST_TOKEN_INTERNAL_H #include #include @@ -25,10 +25,7 @@ #include -#if defined(__cplusplus) -extern "C" { -#endif - +BSSL_NAMESPACE_BEGIN // For the following cryptographic schemes, we use P-384 instead of our usual // choice of P-256. See Appendix I of @@ -46,40 +43,42 @@ extern "C" { #define TRUST_TOKEN_NONCE_SIZE 64 typedef struct { - // TODO(https://crbug.com/boringssl/334): These should store |EC_PRECOMP| so - // that |TRUST_TOKEN_finish_issuance| can use |ec_point_mul_scalar_precomp|. - EC_AFFINE pub0; - EC_AFFINE pub1; - EC_AFFINE pubs; + // TODO(https://crbug.com/boringssl/334): These should store + // |bssl::EC_PRECOMP| so that |TRUST_TOKEN_finish_issuance| can use + // |ec_point_mul_scalar_precomp|. + bssl::EC_AFFINE pub0; + bssl::EC_AFFINE pub1; + bssl::EC_AFFINE pubs; } TRUST_TOKEN_CLIENT_KEY; typedef struct { - EC_SCALAR x0; - EC_SCALAR y0; - EC_SCALAR x1; - EC_SCALAR y1; - EC_SCALAR xs; - EC_SCALAR ys; - EC_AFFINE pub0; - EC_PRECOMP pub0_precomp; - EC_AFFINE pub1; - EC_PRECOMP pub1_precomp; - EC_AFFINE pubs; - EC_PRECOMP pubs_precomp; + bssl::EC_SCALAR x0; + bssl::EC_SCALAR y0; + bssl::EC_SCALAR x1; + bssl::EC_SCALAR y1; + bssl::EC_SCALAR xs; + bssl::EC_SCALAR ys; + bssl::EC_AFFINE pub0; + bssl::EC_PRECOMP pub0_precomp; + bssl::EC_AFFINE pub1; + bssl::EC_PRECOMP pub1_precomp; + bssl::EC_AFFINE pubs; + bssl::EC_PRECOMP pubs_precomp; } TRUST_TOKEN_ISSUER_KEY; // TRUST_TOKEN_PRETOKEN represents the intermediate state a client keeps during // a Trust_Token issuance operation. typedef struct pmb_pretoken_st { + uint8_t salt[TRUST_TOKEN_NONCE_SIZE]; uint8_t t[TRUST_TOKEN_NONCE_SIZE]; - EC_SCALAR r; - EC_AFFINE Tp; + bssl::EC_SCALAR r; + bssl::EC_AFFINE Tp; } TRUST_TOKEN_PRETOKEN; // TRUST_TOKEN_PRETOKEN_free releases the memory associated with |token|. 
OPENSSL_EXPORT void TRUST_TOKEN_PRETOKEN_free(TRUST_TOKEN_PRETOKEN *token); -DEFINE_STACK_OF(TRUST_TOKEN_PRETOKEN) +DEFINE_NAMESPACED_STACK_OF(TRUST_TOKEN_PRETOKEN) // PMBTokens. @@ -100,18 +99,22 @@ int pmbtoken_exp1_client_key_from_bytes(TRUST_TOKEN_CLIENT_KEY *key, const uint8_t *in, size_t len); int pmbtoken_exp1_issuer_key_from_bytes(TRUST_TOKEN_ISSUER_KEY *key, const uint8_t *in, size_t len); -STACK_OF(TRUST_TOKEN_PRETOKEN) * pmbtoken_exp1_blind(CBB *cbb, size_t count); +STACK_OF(TRUST_TOKEN_PRETOKEN) *pmbtoken_exp1_blind(CBB *cbb, size_t count, + int include_message, + const uint8_t *msg, + size_t msg_len); int pmbtoken_exp1_sign(const TRUST_TOKEN_ISSUER_KEY *key, CBB *cbb, CBS *cbs, size_t num_requested, size_t num_to_issue, uint8_t private_metadata); -STACK_OF(TRUST_TOKEN) * - pmbtoken_exp1_unblind(const TRUST_TOKEN_CLIENT_KEY *key, - const STACK_OF(TRUST_TOKEN_PRETOKEN) * pretokens, - CBS *cbs, size_t count, uint32_t key_id); +STACK_OF(TRUST_TOKEN) *pmbtoken_exp1_unblind( + const TRUST_TOKEN_CLIENT_KEY *key, + const STACK_OF(TRUST_TOKEN_PRETOKEN) *pretokens, CBS *cbs, size_t count, + uint32_t key_id); int pmbtoken_exp1_read(const TRUST_TOKEN_ISSUER_KEY *key, uint8_t out_nonce[TRUST_TOKEN_NONCE_SIZE], uint8_t *out_private_metadata, const uint8_t *token, - size_t token_len); + size_t token_len, int include_message, + const uint8_t *msg, size_t msg_len); // pmbtoken_exp1_get_h_for_testing returns H in uncompressed coordinates. This // function is used to confirm H was computed as expected. 
@@ -128,23 +131,59 @@ int pmbtoken_exp2_client_key_from_bytes(TRUST_TOKEN_CLIENT_KEY *key, const uint8_t *in, size_t len); int pmbtoken_exp2_issuer_key_from_bytes(TRUST_TOKEN_ISSUER_KEY *key, const uint8_t *in, size_t len); -STACK_OF(TRUST_TOKEN_PRETOKEN) * pmbtoken_exp2_blind(CBB *cbb, size_t count); +STACK_OF(TRUST_TOKEN_PRETOKEN) *pmbtoken_exp2_blind(CBB *cbb, size_t count, + int include_message, + const uint8_t *msg, + size_t msg_len); int pmbtoken_exp2_sign(const TRUST_TOKEN_ISSUER_KEY *key, CBB *cbb, CBS *cbs, size_t num_requested, size_t num_to_issue, uint8_t private_metadata); -STACK_OF(TRUST_TOKEN) * - pmbtoken_exp2_unblind(const TRUST_TOKEN_CLIENT_KEY *key, - const STACK_OF(TRUST_TOKEN_PRETOKEN) * pretokens, - CBS *cbs, size_t count, uint32_t key_id); +STACK_OF(TRUST_TOKEN) *pmbtoken_exp2_unblind( + const TRUST_TOKEN_CLIENT_KEY *key, + const STACK_OF(TRUST_TOKEN_PRETOKEN) *pretokens, CBS *cbs, size_t count, + uint32_t key_id); int pmbtoken_exp2_read(const TRUST_TOKEN_ISSUER_KEY *key, uint8_t out_nonce[TRUST_TOKEN_NONCE_SIZE], uint8_t *out_private_metadata, const uint8_t *token, - size_t token_len); + size_t token_len, int include_message, + const uint8_t *msg, size_t msg_len); // pmbtoken_exp2_get_h_for_testing returns H in uncompressed coordinates. This // function is used to confirm H was computed as expected. OPENSSL_EXPORT int pmbtoken_exp2_get_h_for_testing(uint8_t out[97]); +// The following functions implement the corresponding |TRUST_TOKENS_METHOD| +// functions for |TRUST_TOKENS_pst_v1|'s PMBTokens construction which uses +// P-384. 
+int pmbtoken_pst1_generate_key(CBB *out_private, CBB *out_public); +int pmbtoken_pst1_derive_key_from_secret(CBB *out_private, CBB *out_public, + const uint8_t *secret, + size_t secret_len); +int pmbtoken_pst1_client_key_from_bytes(TRUST_TOKEN_CLIENT_KEY *key, + const uint8_t *in, size_t len); +int pmbtoken_pst1_issuer_key_from_bytes(TRUST_TOKEN_ISSUER_KEY *key, + const uint8_t *in, size_t len); +STACK_OF(TRUST_TOKEN_PRETOKEN) *pmbtoken_pst1_blind(CBB *cbb, size_t count, + int include_message, + const uint8_t *msg, + size_t msg_len); +int pmbtoken_pst1_sign(const TRUST_TOKEN_ISSUER_KEY *key, CBB *cbb, CBS *cbs, + size_t num_requested, size_t num_to_issue, + uint8_t private_metadata); +STACK_OF(TRUST_TOKEN) *pmbtoken_pst1_unblind( + const TRUST_TOKEN_CLIENT_KEY *key, + const STACK_OF(TRUST_TOKEN_PRETOKEN) *pretokens, CBS *cbs, size_t count, + uint32_t key_id); +int pmbtoken_pst1_read(const TRUST_TOKEN_ISSUER_KEY *key, + uint8_t out_nonce[TRUST_TOKEN_NONCE_SIZE], + uint8_t *out_private_metadata, const uint8_t *token, + size_t token_len, int include_message, + const uint8_t *msg, size_t msg_len); + +// pmbtoken_pst1_get_h_for_testing returns H in uncompressed coordinates. This +// function is used to confirm H was computed as expected. +OPENSSL_EXPORT int pmbtoken_pst1_get_h_for_testing(uint8_t out[97]); + // VOPRF. 
// @@ -165,19 +204,56 @@ int voprf_exp2_client_key_from_bytes(TRUST_TOKEN_CLIENT_KEY *key, const uint8_t *in, size_t len); int voprf_exp2_issuer_key_from_bytes(TRUST_TOKEN_ISSUER_KEY *key, const uint8_t *in, size_t len); -STACK_OF(TRUST_TOKEN_PRETOKEN) * voprf_exp2_blind(CBB *cbb, size_t count); +STACK_OF(TRUST_TOKEN_PRETOKEN) *voprf_exp2_blind(CBB *cbb, size_t count, + int include_message, + const uint8_t *msg, + size_t msg_len); int voprf_exp2_sign(const TRUST_TOKEN_ISSUER_KEY *key, CBB *cbb, CBS *cbs, size_t num_requested, size_t num_to_issue, uint8_t private_metadata); -STACK_OF(TRUST_TOKEN) * - voprf_exp2_unblind(const TRUST_TOKEN_CLIENT_KEY *key, - const STACK_OF(TRUST_TOKEN_PRETOKEN) * pretokens, - CBS *cbs, size_t count, uint32_t key_id); +STACK_OF(TRUST_TOKEN) *voprf_exp2_unblind( + const TRUST_TOKEN_CLIENT_KEY *key, + const STACK_OF(TRUST_TOKEN_PRETOKEN) *pretokens, CBS *cbs, size_t count, + uint32_t key_id); int voprf_exp2_read(const TRUST_TOKEN_ISSUER_KEY *key, uint8_t out_nonce[TRUST_TOKEN_NONCE_SIZE], uint8_t *out_private_metadata, const uint8_t *token, - size_t token_len); + size_t token_len, int include_message, const uint8_t *msg, + size_t msg_len); +// The following functions implement the corresponding |TRUST_TOKENS_METHOD| +// functions for |TRUST_TOKENS_pst_v1|'s VOPRF construction which uses P-384. 
+int voprf_pst1_generate_key(CBB *out_private, CBB *out_public); +int voprf_pst1_derive_key_from_secret(CBB *out_private, CBB *out_public, + const uint8_t *secret, size_t secret_len); +int voprf_pst1_client_key_from_bytes(TRUST_TOKEN_CLIENT_KEY *key, + const uint8_t *in, size_t len); +int voprf_pst1_issuer_key_from_bytes(TRUST_TOKEN_ISSUER_KEY *key, + const uint8_t *in, size_t len); +STACK_OF(TRUST_TOKEN_PRETOKEN) *voprf_pst1_blind(CBB *cbb, size_t count, + int include_message, + const uint8_t *msg, + size_t msg_len); +int voprf_pst1_sign(const TRUST_TOKEN_ISSUER_KEY *key, CBB *cbb, CBS *cbs, + size_t num_requested, size_t num_to_issue, + uint8_t private_metadata); +OPENSSL_EXPORT int voprf_pst1_sign_with_proof_scalar_for_testing( + const TRUST_TOKEN_ISSUER_KEY *key, CBB *cbb, CBS *cbs, size_t num_requested, + size_t num_to_issue, uint8_t private_metadata, + const uint8_t *proof_scalar_buf, size_t proof_scalar_len); +STACK_OF(TRUST_TOKEN) *voprf_pst1_unblind( + const TRUST_TOKEN_CLIENT_KEY *key, + const STACK_OF(TRUST_TOKEN_PRETOKEN) *pretokens, CBS *cbs, size_t count, + uint32_t key_id); +int voprf_pst1_read(const TRUST_TOKEN_ISSUER_KEY *key, + uint8_t out_nonce[TRUST_TOKEN_NONCE_SIZE], + uint8_t *out_private_metadata, const uint8_t *token, + size_t token_len, int include_message, const uint8_t *msg, + size_t msg_len); + +using StackOfTrustTokenPretoken = STACK_OF(TRUST_TOKEN_PRETOKEN); + +BSSL_NAMESPACE_END // Trust Tokens internals. @@ -191,28 +267,31 @@ struct trust_token_method_st { // |secret| and writes their serialized forms into |out_private| and // |out_public|. It returns one on success and zero on failure. int (*derive_key_from_secret)(CBB *out_private, CBB *out_public, - const uint8_t *secret, size_t secret_len); + const uint8_t *secret, size_t secret_len); // client_key_from_bytes decodes a client key from |in| and sets |key| // to the resulting key. It returns one on success and zero // on failure. 
- int (*client_key_from_bytes)(TRUST_TOKEN_CLIENT_KEY *key, const uint8_t *in, - size_t len); + int (*client_key_from_bytes)(bssl::TRUST_TOKEN_CLIENT_KEY *key, + const uint8_t *in, size_t len); // issuer_key_from_bytes decodes a issuer key from |in| and sets |key| // to the resulting key. It returns one on success and zero // on failure. - int (*issuer_key_from_bytes)(TRUST_TOKEN_ISSUER_KEY *key, const uint8_t *in, - size_t len); + int (*issuer_key_from_bytes)(bssl::TRUST_TOKEN_ISSUER_KEY *key, + const uint8_t *in, size_t len); - // blind generates a new issuance request for |count| tokens. On + // blind generates a new issuance request for |count| tokens. If + // |include_message| is set, then |msg| is used to derive the token nonces. On // success, it returns a newly-allocated |STACK_OF(TRUST_TOKEN_PRETOKEN)| and // writes a request to the issuer to |cbb|. On failure, it returns NULL. The - // |STACK_OF(TRUST_TOKEN_PRETOKEN)|s should be passed to |pmbtoken_unblind| when - // the server responds. + // |STACK_OF(TRUST_TOKEN_PRETOKEN)|s should be passed to |pmbtoken_unblind| + // when the server responds. // // This function implements the AT.Usr0 operation. - STACK_OF(TRUST_TOKEN_PRETOKEN) * (*blind)(CBB *cbb, size_t count); + bssl::StackOfTrustTokenPretoken *(*blind)(CBB *cbb, size_t count, + int include_message, + const uint8_t *msg, size_t msg_len); // sign parses a request for |num_requested| tokens from |cbs| and // issues |num_to_issue| tokens with |key| and a private metadata value of @@ -220,7 +299,7 @@ struct trust_token_method_st { // success and zero on failure. // // This function implements the AT.Sig operation. - int (*sign)(const TRUST_TOKEN_ISSUER_KEY *key, CBB *cbb, CBS *cbs, + int (*sign)(const bssl::TRUST_TOKEN_ISSUER_KEY *key, CBB *cbb, CBS *cbs, size_t num_requested, size_t num_to_issue, uint8_t private_metadata); @@ -232,20 +311,22 @@ struct trust_token_method_st { // returns NULL. // // This function implements the AT.Usr1 operation. 
- STACK_OF(TRUST_TOKEN) * - (*unblind)(const TRUST_TOKEN_CLIENT_KEY *key, - const STACK_OF(TRUST_TOKEN_PRETOKEN) * pretokens, CBS *cbs, - size_t count, uint32_t key_id); - - // read parses a PMBToken from |token| and verifies it using |key|. On - // success, it returns one and stores the nonce and private metadata bit in - // |out_nonce| and |*out_private_metadata|. Otherwise, it returns zero. Note - // that, unlike the output of |unblind|, |token| does not have a - // four-byte key ID prepended. - int (*read)(const TRUST_TOKEN_ISSUER_KEY *key, + STACK_OF(TRUST_TOKEN) *(*unblind)( + const bssl::TRUST_TOKEN_CLIENT_KEY *key, + const bssl::StackOfTrustTokenPretoken *pretokens, CBS *cbs, size_t count, + uint32_t key_id); + + // read parses a token from |token| and verifies it using |key|. If + // |include_message| is set, then the nonce is derived from |msg| and the salt + // in the token. On success, it returns one and stores the nonce and private + // metadata bit in |out_nonce| and |*out_private_metadata|. Otherwise, it + // returns zero. Note that, unlike the output of |unblind|, |token| does not + // have a four-byte key ID prepended. + int (*read)(const bssl::TRUST_TOKEN_ISSUER_KEY *key, uint8_t out_nonce[TRUST_TOKEN_NONCE_SIZE], uint8_t *out_private_metadata, const uint8_t *token, - size_t token_len); + size_t token_len, int include_message, const uint8_t *msg, + size_t msg_len); // whether the construction supports private metadata. int has_private_metadata; @@ -257,6 +338,8 @@ struct trust_token_method_st { int has_srr; }; +BSSL_NAMESPACE_BEGIN + // Structure representing a single Trust Token public key with the specified ID. 
struct trust_token_client_key_st { uint32_t id; @@ -270,6 +353,8 @@ struct trust_token_issuer_key_st { TRUST_TOKEN_ISSUER_KEY key; }; +BSSL_NAMESPACE_END + struct trust_token_client_st { const TRUST_TOKEN_METHOD *method; @@ -278,13 +363,15 @@ struct trust_token_client_st { // keys is the set of public keys that are supported by the client for // issuance/redemptions. - struct trust_token_client_key_st keys[6]; + // TODO(crbug.com/42290036): Replace this and |num_keys| with an + // InplaceVector. + struct bssl::trust_token_client_key_st keys[6]; // num_keys is the number of keys currently configured. size_t num_keys; // pretokens is the intermediate state during an active issuance. - STACK_OF(TRUST_TOKEN_PRETOKEN)* pretokens; + bssl::StackOfTrustTokenPretoken *pretokens; // srr_key is the public key used to verify the signature of the SRR. EVP_PKEY *srr_key; @@ -300,33 +387,19 @@ struct trust_token_issuer_st { // keys is the set of private keys that are supported by the issuer for // issuance/redemptions. The public metadata is an index into this list of // keys. - struct trust_token_issuer_key_st keys[6]; + struct bssl::trust_token_issuer_key_st keys[6]; // num_keys is the number of keys currently configured. size_t num_keys; // srr_key is the private key used to sign the SRR. EVP_PKEY *srr_key; - - // metadata_key is the secret material used to encode the private metadata bit - // in the SRR. 
- uint8_t *metadata_key; - size_t metadata_key_len; }; - -#if defined(__cplusplus) -} // extern C - -extern "C++" { - BSSL_NAMESPACE_BEGIN BORINGSSL_MAKE_DELETER(TRUST_TOKEN_PRETOKEN, TRUST_TOKEN_PRETOKEN_free) BSSL_NAMESPACE_END -} // extern C++ -#endif - -#endif // OPENSSL_HEADER_TRUST_TOKEN_INTERNAL_H +#endif // OPENSSL_HEADER_CRYPTO_TRUST_TOKEN_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/trust_token/pmbtoken.c b/third_party/boringssl/src/crypto/trust_token/pmbtoken.c deleted file mode 100644 index 68d8909b..00000000 --- a/third_party/boringssl/src/crypto/trust_token/pmbtoken.c +++ /dev/null @@ -1,1496 +0,0 @@ -/* Copyright (c) 2020, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "../ec_extra/internal.h" -#include "../fipsmodule/bn/internal.h" -#include "../fipsmodule/ec/internal.h" - -#include "internal.h" - - -typedef int (*hash_t_func_t)(const EC_GROUP *group, EC_RAW_POINT *out, - const uint8_t t[TRUST_TOKEN_NONCE_SIZE]); -typedef int (*hash_s_func_t)(const EC_GROUP *group, EC_RAW_POINT *out, - const EC_AFFINE *t, - const uint8_t s[TRUST_TOKEN_NONCE_SIZE]); -typedef int (*hash_c_func_t)(const EC_GROUP *group, EC_SCALAR *out, - uint8_t *buf, size_t len); -typedef int (*hash_to_scalar_func_t)(const EC_GROUP *group, EC_SCALAR *out, - uint8_t *buf, size_t len); - -typedef struct { - const EC_GROUP *group; - EC_PRECOMP g_precomp; - EC_PRECOMP h_precomp; - EC_RAW_POINT h; - // hash_t implements the H_t operation in PMBTokens. It returns one on success - // and zero on error. - hash_t_func_t hash_t; - // hash_s implements the H_s operation in PMBTokens. It returns one on success - // and zero on error. - hash_s_func_t hash_s; - // hash_c implements the H_c operation in PMBTokens. It returns one on success - // and zero on error. - hash_c_func_t hash_c; - // hash_to_scalar implements the HashToScalar operation for PMBTokens. It - // returns one on success and zero on error. 
- hash_to_scalar_func_t hash_to_scalar; - int prefix_point : 1; -} PMBTOKEN_METHOD; - -static const uint8_t kDefaultAdditionalData[32] = {0}; - -static int pmbtoken_init_method(PMBTOKEN_METHOD *method, int curve_nid, - const uint8_t *h_bytes, size_t h_len, - hash_t_func_t hash_t, hash_s_func_t hash_s, - hash_c_func_t hash_c, - hash_to_scalar_func_t hash_to_scalar, - int prefix_point) { - method->group = EC_GROUP_new_by_curve_name(curve_nid); - if (method->group == NULL) { - return 0; - } - - method->hash_t = hash_t; - method->hash_s = hash_s; - method->hash_c = hash_c; - method->hash_to_scalar = hash_to_scalar; - method->prefix_point = prefix_point; - - EC_AFFINE h; - if (!ec_point_from_uncompressed(method->group, &h, h_bytes, h_len)) { - return 0; - } - ec_affine_to_jacobian(method->group, &method->h, &h); - - if (!ec_init_precomp(method->group, &method->g_precomp, - &method->group->generator->raw) || - !ec_init_precomp(method->group, &method->h_precomp, &method->h)) { - return 0; - } - return 1; -} - -static int derive_scalar_from_secret(const PMBTOKEN_METHOD *method, - EC_SCALAR *out, const uint8_t *secret, - size_t secret_len, uint8_t scalar_id) { - static const uint8_t kKeygenLabel[] = "TrustTokenPMBTokenKeyGen"; - - int ok = 0; - CBB cbb; - CBB_zero(&cbb); - uint8_t *buf = NULL; - size_t len; - if (!CBB_init(&cbb, 0) || - !CBB_add_bytes(&cbb, kKeygenLabel, sizeof(kKeygenLabel)) || - !CBB_add_u8(&cbb, scalar_id) || - !CBB_add_bytes(&cbb, secret, secret_len) || - !CBB_finish(&cbb, &buf, &len) || - !method->hash_to_scalar(method->group, out, buf, len)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_KEYGEN_FAILURE); - goto err; - } - - ok = 1; - -err: - CBB_cleanup(&cbb); - OPENSSL_free(buf); - return ok; -} - -static int point_to_cbb(CBB *out, const EC_GROUP *group, - const EC_AFFINE *point) { - size_t len = - ec_point_to_bytes(group, point, POINT_CONVERSION_UNCOMPRESSED, NULL, 0); - if (len == 0) { - return 0; - } - uint8_t *p; - return CBB_add_space(out, &p, 
len) && - ec_point_to_bytes(group, point, POINT_CONVERSION_UNCOMPRESSED, p, - len) == len; -} - -static int cbb_add_prefixed_point(CBB *out, const EC_GROUP *group, - const EC_AFFINE *point, int prefix_point) { - if (prefix_point) { - CBB child; - if (!CBB_add_u16_length_prefixed(out, &child) || - !point_to_cbb(&child, group, point) || - !CBB_flush(out)) { - return 0; - } - } else { - if (!point_to_cbb(out, group, point) || - !CBB_flush(out)) { - return 0; - } - } - - return 1; -} - -static int cbs_get_prefixed_point(CBS *cbs, const EC_GROUP *group, - EC_AFFINE *out, int prefix_point) { - CBS child; - if (prefix_point) { - if (!CBS_get_u16_length_prefixed(cbs, &child)) { - return 0; - } - } else { - size_t plen = 1 + 2 * BN_num_bytes(&group->field); - if (!CBS_get_bytes(cbs, &child, plen)) { - return 0; - } - } - - if (!ec_point_from_uncompressed(group, out, CBS_data(&child), - CBS_len(&child))) { - return 0; - } - return 1; -} - -static int mul_public_3(const EC_GROUP *group, EC_RAW_POINT *out, - const EC_RAW_POINT *p0, const EC_SCALAR *scalar0, - const EC_RAW_POINT *p1, const EC_SCALAR *scalar1, - const EC_RAW_POINT *p2, const EC_SCALAR *scalar2) { - EC_RAW_POINT points[3] = {*p0, *p1, *p2}; - EC_SCALAR scalars[3] = {*scalar0, *scalar1, *scalar2}; - return ec_point_mul_scalar_public_batch(group, out, /*g_scalar=*/NULL, points, - scalars, 3); -} - -static int pmbtoken_compute_keys(const PMBTOKEN_METHOD *method, - CBB *out_private, CBB *out_public, - const EC_SCALAR *x0, const EC_SCALAR *y0, - const EC_SCALAR *x1, const EC_SCALAR *y1, - const EC_SCALAR *xs, const EC_SCALAR *ys) { - const EC_GROUP *group = method->group; - EC_RAW_POINT pub[3]; - if (!ec_point_mul_scalar_precomp(group, &pub[0], &method->g_precomp, - x0, &method->h_precomp, y0, NULL, NULL) || - !ec_point_mul_scalar_precomp(group, &pub[1], &method->g_precomp, - x1, &method->h_precomp, y1, NULL, NULL) || - !ec_point_mul_scalar_precomp(method->group, &pub[2], &method->g_precomp, - xs, &method->h_precomp, 
ys, NULL, NULL)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_KEYGEN_FAILURE); - return 0; - } - - const EC_SCALAR *scalars[] = {x0, y0, x1, y1, xs, ys}; - size_t scalar_len = BN_num_bytes(&group->order); - for (size_t i = 0; i < OPENSSL_ARRAY_SIZE(scalars); i++) { - uint8_t *buf; - if (!CBB_add_space(out_private, &buf, scalar_len)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_BUFFER_TOO_SMALL); - return 0; - } - ec_scalar_to_bytes(group, buf, &scalar_len, scalars[i]); - } - - EC_AFFINE pub_affine[3]; - if (!ec_jacobian_to_affine_batch(group, pub_affine, pub, 3)) { - return 0; - } - - if (!cbb_add_prefixed_point(out_public, group, &pub_affine[0], - method->prefix_point) || - !cbb_add_prefixed_point(out_public, group, &pub_affine[1], - method->prefix_point) || - !cbb_add_prefixed_point(out_public, group, &pub_affine[2], - method->prefix_point)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_BUFFER_TOO_SMALL); - return 0; - } - - return 1; -} - -static int pmbtoken_generate_key(const PMBTOKEN_METHOD *method, - CBB *out_private, CBB *out_public) { - EC_SCALAR x0, y0, x1, y1, xs, ys; - if (!ec_random_nonzero_scalar(method->group, &x0, kDefaultAdditionalData) || - !ec_random_nonzero_scalar(method->group, &y0, kDefaultAdditionalData) || - !ec_random_nonzero_scalar(method->group, &x1, kDefaultAdditionalData) || - !ec_random_nonzero_scalar(method->group, &y1, kDefaultAdditionalData) || - !ec_random_nonzero_scalar(method->group, &xs, kDefaultAdditionalData) || - !ec_random_nonzero_scalar(method->group, &ys, kDefaultAdditionalData)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_KEYGEN_FAILURE); - return 0; - } - - return pmbtoken_compute_keys(method, out_private, out_public, &x0, &y0, &x1, - &y1, &xs, &ys); -} - -static int pmbtoken_derive_key_from_secret(const PMBTOKEN_METHOD *method, - CBB *out_private, CBB *out_public, - const uint8_t *secret, - size_t secret_len) { - EC_SCALAR x0, y0, x1, y1, xs, ys; - if (!derive_scalar_from_secret(method, &x0, secret, 
secret_len, 0) || - !derive_scalar_from_secret(method, &y0, secret, secret_len, 1) || - !derive_scalar_from_secret(method, &x1, secret, secret_len, 2) || - !derive_scalar_from_secret(method, &y1, secret, secret_len, 3) || - !derive_scalar_from_secret(method, &xs, secret, secret_len, 4) || - !derive_scalar_from_secret(method, &ys, secret, secret_len, 5)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_KEYGEN_FAILURE); - return 0; - } - - return pmbtoken_compute_keys(method, out_private, out_public, &x0, &y0, &x1, - &y1, &xs, &ys); -} - -static int pmbtoken_client_key_from_bytes(const PMBTOKEN_METHOD *method, - TRUST_TOKEN_CLIENT_KEY *key, - const uint8_t *in, size_t len) { - CBS cbs; - CBS_init(&cbs, in, len); - if (!cbs_get_prefixed_point(&cbs, method->group, &key->pub0, - method->prefix_point) || - !cbs_get_prefixed_point(&cbs, method->group, &key->pub1, - method->prefix_point) || - !cbs_get_prefixed_point(&cbs, method->group, &key->pubs, - method->prefix_point) || - CBS_len(&cbs) != 0) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); - return 0; - } - - return 1; -} - -static int pmbtoken_issuer_key_from_bytes(const PMBTOKEN_METHOD *method, - TRUST_TOKEN_ISSUER_KEY *key, - const uint8_t *in, size_t len) { - const EC_GROUP *group = method->group; - CBS cbs, tmp; - CBS_init(&cbs, in, len); - size_t scalar_len = BN_num_bytes(&group->order); - EC_SCALAR *scalars[] = {&key->x0, &key->y0, &key->x1, - &key->y1, &key->xs, &key->ys}; - for (size_t i = 0; i < OPENSSL_ARRAY_SIZE(scalars); i++) { - if (!CBS_get_bytes(&cbs, &tmp, scalar_len) || - !ec_scalar_from_bytes(group, scalars[i], CBS_data(&tmp), - CBS_len(&tmp))) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); - return 0; - } - } - - // Recompute the public key. 
- EC_RAW_POINT pub[3]; - EC_AFFINE pub_affine[3]; - if (!ec_point_mul_scalar_precomp(group, &pub[0], &method->g_precomp, &key->x0, - &method->h_precomp, &key->y0, NULL, NULL) || - !ec_init_precomp(group, &key->pub0_precomp, &pub[0]) || - !ec_point_mul_scalar_precomp(group, &pub[1], &method->g_precomp, &key->x1, - &method->h_precomp, &key->y1, NULL, NULL) || - !ec_init_precomp(group, &key->pub1_precomp, &pub[1]) || - !ec_point_mul_scalar_precomp(group, &pub[2], &method->g_precomp, &key->xs, - &method->h_precomp, &key->ys, NULL, NULL) || - !ec_init_precomp(group, &key->pubs_precomp, &pub[2]) || - !ec_jacobian_to_affine_batch(group, pub_affine, pub, 3)) { - return 0; - } - - key->pub0 = pub_affine[0]; - key->pub1 = pub_affine[1]; - key->pubs = pub_affine[2]; - return 1; -} - -static STACK_OF(TRUST_TOKEN_PRETOKEN) * - pmbtoken_blind(const PMBTOKEN_METHOD *method, CBB *cbb, size_t count) { - const EC_GROUP *group = method->group; - STACK_OF(TRUST_TOKEN_PRETOKEN) *pretokens = sk_TRUST_TOKEN_PRETOKEN_new_null(); - if (pretokens == NULL) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - goto err; - } - - for (size_t i = 0; i < count; i++) { - // Insert |pretoken| into |pretokens| early to simplify error-handling. - TRUST_TOKEN_PRETOKEN *pretoken = OPENSSL_malloc(sizeof(TRUST_TOKEN_PRETOKEN)); - if (pretoken == NULL || - !sk_TRUST_TOKEN_PRETOKEN_push(pretokens, pretoken)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - TRUST_TOKEN_PRETOKEN_free(pretoken); - goto err; - } - - RAND_bytes(pretoken->t, sizeof(pretoken->t)); - - // We sample |pretoken->r| in Montgomery form to simplify inverting. - if (!ec_random_nonzero_scalar(group, &pretoken->r, - kDefaultAdditionalData)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - goto err; - } - - EC_SCALAR rinv; - ec_scalar_inv0_montgomery(group, &rinv, &pretoken->r); - // Convert both out of Montgomery form. 
- ec_scalar_from_montgomery(group, &pretoken->r, &pretoken->r); - ec_scalar_from_montgomery(group, &rinv, &rinv); - - EC_RAW_POINT T, Tp; - if (!method->hash_t(group, &T, pretoken->t) || - !ec_point_mul_scalar(group, &Tp, &T, &rinv) || - !ec_jacobian_to_affine(group, &pretoken->Tp, &Tp)) { - goto err; - } - - if (!cbb_add_prefixed_point(cbb, group, &pretoken->Tp, - method->prefix_point)) { - goto err; - } - } - - return pretokens; - -err: - sk_TRUST_TOKEN_PRETOKEN_pop_free(pretokens, TRUST_TOKEN_PRETOKEN_free); - return NULL; -} - -static int scalar_to_cbb(CBB *out, const EC_GROUP *group, - const EC_SCALAR *scalar) { - uint8_t *buf; - size_t scalar_len = BN_num_bytes(&group->order); - if (!CBB_add_space(out, &buf, scalar_len)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - return 0; - } - ec_scalar_to_bytes(group, buf, &scalar_len, scalar); - return 1; -} - -static int scalar_from_cbs(CBS *cbs, const EC_GROUP *group, EC_SCALAR *out) { - size_t scalar_len = BN_num_bytes(&group->order); - CBS tmp; - if (!CBS_get_bytes(cbs, &tmp, scalar_len)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); - return 0; - } - - ec_scalar_from_bytes(group, out, CBS_data(&tmp), CBS_len(&tmp)); - return 1; -} - -static int hash_c_dleq(const PMBTOKEN_METHOD *method, EC_SCALAR *out, - const EC_AFFINE *X, const EC_AFFINE *T, - const EC_AFFINE *S, const EC_AFFINE *W, - const EC_AFFINE *K0, const EC_AFFINE *K1) { - static const uint8_t kDLEQ2Label[] = "DLEQ2"; - - int ok = 0; - CBB cbb; - CBB_zero(&cbb); - uint8_t *buf = NULL; - size_t len; - if (!CBB_init(&cbb, 0) || - !CBB_add_bytes(&cbb, kDLEQ2Label, sizeof(kDLEQ2Label)) || - !point_to_cbb(&cbb, method->group, X) || - !point_to_cbb(&cbb, method->group, T) || - !point_to_cbb(&cbb, method->group, S) || - !point_to_cbb(&cbb, method->group, W) || - !point_to_cbb(&cbb, method->group, K0) || - !point_to_cbb(&cbb, method->group, K1) || - !CBB_finish(&cbb, &buf, &len) || - !method->hash_c(method->group, out, buf, len)) 
{ - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - goto err; - } - - ok = 1; - -err: - CBB_cleanup(&cbb); - OPENSSL_free(buf); - return ok; -} - -static int hash_c_dleqor(const PMBTOKEN_METHOD *method, EC_SCALAR *out, - const EC_AFFINE *X0, const EC_AFFINE *X1, - const EC_AFFINE *T, const EC_AFFINE *S, - const EC_AFFINE *W, const EC_AFFINE *K00, - const EC_AFFINE *K01, const EC_AFFINE *K10, - const EC_AFFINE *K11) { - static const uint8_t kDLEQOR2Label[] = "DLEQOR2"; - - int ok = 0; - CBB cbb; - CBB_zero(&cbb); - uint8_t *buf = NULL; - size_t len; - if (!CBB_init(&cbb, 0) || - !CBB_add_bytes(&cbb, kDLEQOR2Label, sizeof(kDLEQOR2Label)) || - !point_to_cbb(&cbb, method->group, X0) || - !point_to_cbb(&cbb, method->group, X1) || - !point_to_cbb(&cbb, method->group, T) || - !point_to_cbb(&cbb, method->group, S) || - !point_to_cbb(&cbb, method->group, W) || - !point_to_cbb(&cbb, method->group, K00) || - !point_to_cbb(&cbb, method->group, K01) || - !point_to_cbb(&cbb, method->group, K10) || - !point_to_cbb(&cbb, method->group, K11) || - !CBB_finish(&cbb, &buf, &len) || - !method->hash_c(method->group, out, buf, len)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - goto err; - } - - ok = 1; - -err: - CBB_cleanup(&cbb); - OPENSSL_free(buf); - return ok; -} - -static int hash_c_batch(const PMBTOKEN_METHOD *method, EC_SCALAR *out, - const CBB *points, size_t index) { - static const uint8_t kDLEQBatchLabel[] = "DLEQ BATCH"; - if (index > 0xffff) { - // The protocol supports only two-byte batches. 
- OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_OVERFLOW); - return 0; - } - - int ok = 0; - CBB cbb; - CBB_zero(&cbb); - uint8_t *buf = NULL; - size_t len; - if (!CBB_init(&cbb, 0) || - !CBB_add_bytes(&cbb, kDLEQBatchLabel, sizeof(kDLEQBatchLabel)) || - !CBB_add_bytes(&cbb, CBB_data(points), CBB_len(points)) || - !CBB_add_u16(&cbb, (uint16_t)index) || - !CBB_finish(&cbb, &buf, &len) || - !method->hash_c(method->group, out, buf, len)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - goto err; - } - - ok = 1; - -err: - CBB_cleanup(&cbb); - OPENSSL_free(buf); - return ok; -} - -// The DLEQ2 and DLEQOR2 constructions are described in appendix B of -// https://eprint.iacr.org/2020/072/20200324:214215. DLEQ2 is an instance of -// DLEQOR2 with only one value (n=1). - -static int dleq_generate(const PMBTOKEN_METHOD *method, CBB *cbb, - const TRUST_TOKEN_ISSUER_KEY *priv, - const EC_RAW_POINT *T, const EC_RAW_POINT *S, - const EC_RAW_POINT *W, const EC_RAW_POINT *Ws, - uint8_t private_metadata) { - const EC_GROUP *group = method->group; - - // We generate a DLEQ proof for the validity token and a DLEQOR2 proof for the - // private metadata token. To allow amortizing Jacobian-to-affine conversions, - // we compute Ki for both proofs first. - enum { - idx_T, - idx_S, - idx_W, - idx_Ws, - idx_Ks0, - idx_Ks1, - idx_Kb0, - idx_Kb1, - idx_Ko0, - idx_Ko1, - num_idx, - }; - EC_RAW_POINT jacobians[num_idx]; - - // Setup the DLEQ proof. - EC_SCALAR ks0, ks1; - if (// ks0, ks1 <- Zp - !ec_random_nonzero_scalar(group, &ks0, kDefaultAdditionalData) || - !ec_random_nonzero_scalar(group, &ks1, kDefaultAdditionalData) || - // Ks = ks0*(G;T) + ks1*(H;S) - !ec_point_mul_scalar_precomp(group, &jacobians[idx_Ks0], - &method->g_precomp, &ks0, &method->h_precomp, - &ks1, NULL, NULL) || - !ec_point_mul_scalar_batch(group, &jacobians[idx_Ks1], T, &ks0, S, &ks1, - NULL, NULL)) { - return 0; - } - - // Setup the DLEQOR proof. 
First, select values of xb, yb (keys corresponding - // to the private metadata value) and pubo (public key corresponding to the - // other value) in constant time. - BN_ULONG mask = ((BN_ULONG)0) - (private_metadata & 1); - EC_PRECOMP pubo_precomp; - EC_SCALAR xb, yb; - ec_scalar_select(group, &xb, mask, &priv->x1, &priv->x0); - ec_scalar_select(group, &yb, mask, &priv->y1, &priv->y0); - ec_precomp_select(group, &pubo_precomp, mask, &priv->pub0_precomp, - &priv->pub1_precomp); - - EC_SCALAR k0, k1, minus_co, uo, vo; - if (// k0, k1 <- Zp - !ec_random_nonzero_scalar(group, &k0, kDefaultAdditionalData) || - !ec_random_nonzero_scalar(group, &k1, kDefaultAdditionalData) || - // Kb = k0*(G;T) + k1*(H;S) - !ec_point_mul_scalar_precomp(group, &jacobians[idx_Kb0], - &method->g_precomp, &k0, &method->h_precomp, - &k1, NULL, NULL) || - !ec_point_mul_scalar_batch(group, &jacobians[idx_Kb1], T, &k0, S, &k1, - NULL, NULL) || - // co, uo, vo <- Zp - !ec_random_nonzero_scalar(group, &minus_co, kDefaultAdditionalData) || - !ec_random_nonzero_scalar(group, &uo, kDefaultAdditionalData) || - !ec_random_nonzero_scalar(group, &vo, kDefaultAdditionalData) || - // Ko = uo*(G;T) + vo*(H;S) - co*(pubo;W) - !ec_point_mul_scalar_precomp(group, &jacobians[idx_Ko0], - &method->g_precomp, &uo, &method->h_precomp, - &vo, &pubo_precomp, &minus_co) || - !ec_point_mul_scalar_batch(group, &jacobians[idx_Ko1], T, &uo, S, &vo, W, - &minus_co)) { - return 0; - } - - EC_AFFINE affines[num_idx]; - jacobians[idx_T] = *T; - jacobians[idx_S] = *S; - jacobians[idx_W] = *W; - jacobians[idx_Ws] = *Ws; - if (!ec_jacobian_to_affine_batch(group, affines, jacobians, num_idx)) { - return 0; - } - - // Select the K corresponding to K0 and K1 in constant-time. 
- EC_AFFINE K00, K01, K10, K11; - ec_affine_select(group, &K00, mask, &affines[idx_Ko0], &affines[idx_Kb0]); - ec_affine_select(group, &K01, mask, &affines[idx_Ko1], &affines[idx_Kb1]); - ec_affine_select(group, &K10, mask, &affines[idx_Kb0], &affines[idx_Ko0]); - ec_affine_select(group, &K11, mask, &affines[idx_Kb1], &affines[idx_Ko1]); - - // Compute c = Hc(...) for the two proofs. - EC_SCALAR cs, c; - if (!hash_c_dleq(method, &cs, &priv->pubs, &affines[idx_T], &affines[idx_S], - &affines[idx_Ws], &affines[idx_Ks0], &affines[idx_Ks1]) || - !hash_c_dleqor(method, &c, &priv->pub0, &priv->pub1, &affines[idx_T], - &affines[idx_S], &affines[idx_W], &K00, &K01, &K10, - &K11)) { - return 0; - } - - // Compute cb, ub, and ub for the two proofs. In each of these products, only - // one operand is in Montgomery form, so the product does not need to be - // converted. - - EC_SCALAR cs_mont; - ec_scalar_to_montgomery(group, &cs_mont, &cs); - - // us = ks0 + cs*xs - EC_SCALAR us, vs; - ec_scalar_mul_montgomery(group, &us, &priv->xs, &cs_mont); - ec_scalar_add(group, &us, &ks0, &us); - - // vs = ks1 + cs*ys - ec_scalar_mul_montgomery(group, &vs, &priv->ys, &cs_mont); - ec_scalar_add(group, &vs, &ks1, &vs); - - // Store DLEQ2 proof in transcript. - if (!scalar_to_cbb(cbb, group, &cs) || - !scalar_to_cbb(cbb, group, &us) || - !scalar_to_cbb(cbb, group, &vs)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - return 0; - } - - // cb = c - co - EC_SCALAR cb, ub, vb; - ec_scalar_add(group, &cb, &c, &minus_co); - - EC_SCALAR cb_mont; - ec_scalar_to_montgomery(group, &cb_mont, &cb); - - // ub = k0 + cb*xb - ec_scalar_mul_montgomery(group, &ub, &xb, &cb_mont); - ec_scalar_add(group, &ub, &k0, &ub); - - // vb = k1 + cb*yb - ec_scalar_mul_montgomery(group, &vb, &yb, &cb_mont); - ec_scalar_add(group, &vb, &k1, &vb); - - // Select c, u, v in constant-time. 
- EC_SCALAR co, c0, c1, u0, u1, v0, v1; - ec_scalar_neg(group, &co, &minus_co); - ec_scalar_select(group, &c0, mask, &co, &cb); - ec_scalar_select(group, &u0, mask, &uo, &ub); - ec_scalar_select(group, &v0, mask, &vo, &vb); - ec_scalar_select(group, &c1, mask, &cb, &co); - ec_scalar_select(group, &u1, mask, &ub, &uo); - ec_scalar_select(group, &v1, mask, &vb, &vo); - - // Store DLEQOR2 proof in transcript. - if (!scalar_to_cbb(cbb, group, &c0) || - !scalar_to_cbb(cbb, group, &c1) || - !scalar_to_cbb(cbb, group, &u0) || - !scalar_to_cbb(cbb, group, &u1) || - !scalar_to_cbb(cbb, group, &v0) || - !scalar_to_cbb(cbb, group, &v1)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - return 0; - } - - return 1; -} - -static int dleq_verify(const PMBTOKEN_METHOD *method, CBS *cbs, - const TRUST_TOKEN_CLIENT_KEY *pub, const EC_RAW_POINT *T, - const EC_RAW_POINT *S, const EC_RAW_POINT *W, - const EC_RAW_POINT *Ws) { - const EC_GROUP *group = method->group; - const EC_RAW_POINT *g = &group->generator->raw; - - // We verify a DLEQ proof for the validity token and a DLEQOR2 proof for the - // private metadata token. To allow amortizing Jacobian-to-affine conversions, - // we compute Ki for both proofs first. Additionally, all inputs to this - // function are public, so we can use the faster variable-time - // multiplications. - enum { - idx_T, - idx_S, - idx_W, - idx_Ws, - idx_Ks0, - idx_Ks1, - idx_K00, - idx_K01, - idx_K10, - idx_K11, - num_idx, - }; - EC_RAW_POINT jacobians[num_idx]; - - // Decode the DLEQ proof. 
- EC_SCALAR cs, us, vs; - if (!scalar_from_cbs(cbs, group, &cs) || - !scalar_from_cbs(cbs, group, &us) || - !scalar_from_cbs(cbs, group, &vs)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); - return 0; - } - - // Ks = us*(G;T) + vs*(H;S) - cs*(pubs;Ws) - EC_RAW_POINT pubs; - ec_affine_to_jacobian(group, &pubs, &pub->pubs); - EC_SCALAR minus_cs; - ec_scalar_neg(group, &minus_cs, &cs); - if (!mul_public_3(group, &jacobians[idx_Ks0], g, &us, &method->h, &vs, &pubs, - &minus_cs) || - !mul_public_3(group, &jacobians[idx_Ks1], T, &us, S, &vs, Ws, - &minus_cs)) { - return 0; - } - - // Decode the DLEQOR proof. - EC_SCALAR c0, c1, u0, u1, v0, v1; - if (!scalar_from_cbs(cbs, group, &c0) || - !scalar_from_cbs(cbs, group, &c1) || - !scalar_from_cbs(cbs, group, &u0) || - !scalar_from_cbs(cbs, group, &u1) || - !scalar_from_cbs(cbs, group, &v0) || - !scalar_from_cbs(cbs, group, &v1)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); - return 0; - } - - EC_RAW_POINT pub0, pub1; - ec_affine_to_jacobian(group, &pub0, &pub->pub0); - ec_affine_to_jacobian(group, &pub1, &pub->pub1); - EC_SCALAR minus_c0, minus_c1; - ec_scalar_neg(group, &minus_c0, &c0); - ec_scalar_neg(group, &minus_c1, &c1); - if (// K0 = u0*(G;T) + v0*(H;S) - c0*(pub0;W) - !mul_public_3(group, &jacobians[idx_K00], g, &u0, &method->h, &v0, &pub0, - &minus_c0) || - !mul_public_3(group, &jacobians[idx_K01], T, &u0, S, &v0, W, &minus_c0) || - // K1 = u1*(G;T) + v1*(H;S) - c1*(pub1;W) - !mul_public_3(group, &jacobians[idx_K10], g, &u1, &method->h, &v1, &pub1, - &minus_c1) || - !mul_public_3(group, &jacobians[idx_K11], T, &u1, S, &v1, W, &minus_c1)) { - return 0; - } - - EC_AFFINE affines[num_idx]; - jacobians[idx_T] = *T; - jacobians[idx_S] = *S; - jacobians[idx_W] = *W; - jacobians[idx_Ws] = *Ws; - if (!ec_jacobian_to_affine_batch(group, affines, jacobians, num_idx)) { - return 0; - } - - // Check the DLEQ proof. 
- EC_SCALAR calculated; - if (!hash_c_dleq(method, &calculated, &pub->pubs, &affines[idx_T], - &affines[idx_S], &affines[idx_Ws], &affines[idx_Ks0], - &affines[idx_Ks1])) { - return 0; - } - - // cs == calculated - if (!ec_scalar_equal_vartime(group, &cs, &calculated)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_INVALID_PROOF); - return 0; - } - - // Check the DLEQOR proof. - if (!hash_c_dleqor(method, &calculated, &pub->pub0, &pub->pub1, - &affines[idx_T], &affines[idx_S], &affines[idx_W], - &affines[idx_K00], &affines[idx_K01], &affines[idx_K10], - &affines[idx_K11])) { - return 0; - } - - // c0 + c1 == calculated - EC_SCALAR c; - ec_scalar_add(group, &c, &c0, &c1); - if (!ec_scalar_equal_vartime(group, &c, &calculated)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_INVALID_PROOF); - return 0; - } - - return 1; -} - -static int pmbtoken_sign(const PMBTOKEN_METHOD *method, - const TRUST_TOKEN_ISSUER_KEY *key, CBB *cbb, CBS *cbs, - size_t num_requested, size_t num_to_issue, - uint8_t private_metadata) { - const EC_GROUP *group = method->group; - if (num_requested < num_to_issue) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_INTERNAL_ERROR); - return 0; - } - - if (num_to_issue > ((size_t)-1) / sizeof(EC_RAW_POINT) || - num_to_issue > ((size_t)-1) / sizeof(EC_SCALAR)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_OVERFLOW); - return 0; - } - - int ret = 0; - EC_RAW_POINT *Tps = OPENSSL_malloc(num_to_issue * sizeof(EC_RAW_POINT)); - EC_RAW_POINT *Sps = OPENSSL_malloc(num_to_issue * sizeof(EC_RAW_POINT)); - EC_RAW_POINT *Wps = OPENSSL_malloc(num_to_issue * sizeof(EC_RAW_POINT)); - EC_RAW_POINT *Wsps = OPENSSL_malloc(num_to_issue * sizeof(EC_RAW_POINT)); - EC_SCALAR *es = OPENSSL_malloc(num_to_issue * sizeof(EC_SCALAR)); - CBB batch_cbb; - CBB_zero(&batch_cbb); - if (!Tps || - !Sps || - !Wps || - !Wsps || - !es || - !CBB_init(&batch_cbb, 0) || - !point_to_cbb(&batch_cbb, method->group, &key->pubs) || - !point_to_cbb(&batch_cbb, method->group, &key->pub0) || - 
!point_to_cbb(&batch_cbb, method->group, &key->pub1)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - goto err; - } - - for (size_t i = 0; i < num_to_issue; i++) { - EC_AFFINE Tp_affine; - EC_RAW_POINT Tp; - if (!cbs_get_prefixed_point(cbs, group, &Tp_affine, method->prefix_point)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); - goto err; - } - ec_affine_to_jacobian(group, &Tp, &Tp_affine); - - EC_SCALAR xb, yb; - BN_ULONG mask = ((BN_ULONG)0) - (private_metadata & 1); - ec_scalar_select(group, &xb, mask, &key->x1, &key->x0); - ec_scalar_select(group, &yb, mask, &key->y1, &key->y0); - - uint8_t s[TRUST_TOKEN_NONCE_SIZE]; - RAND_bytes(s, TRUST_TOKEN_NONCE_SIZE); - // The |jacobians| and |affines| contain Sp, Wp, and Wsp. - EC_RAW_POINT jacobians[3]; - EC_AFFINE affines[3]; - if (!method->hash_s(group, &jacobians[0], &Tp_affine, s) || - !ec_point_mul_scalar_batch(group, &jacobians[1], &Tp, &xb, - &jacobians[0], &yb, NULL, NULL) || - !ec_point_mul_scalar_batch(group, &jacobians[2], &Tp, &key->xs, - &jacobians[0], &key->ys, NULL, NULL) || - !ec_jacobian_to_affine_batch(group, affines, jacobians, 3) || - !CBB_add_bytes(cbb, s, TRUST_TOKEN_NONCE_SIZE) || - !cbb_add_prefixed_point(cbb, group, &affines[1], - method->prefix_point) || - !cbb_add_prefixed_point(cbb, group, &affines[2], - method->prefix_point)) { - goto err; - } - - if (!point_to_cbb(&batch_cbb, group, &Tp_affine) || - !point_to_cbb(&batch_cbb, group, &affines[0]) || - !point_to_cbb(&batch_cbb, group, &affines[1]) || - !point_to_cbb(&batch_cbb, group, &affines[2])) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - goto err; - } - Tps[i] = Tp; - Sps[i] = jacobians[0]; - Wps[i] = jacobians[1]; - Wsps[i] = jacobians[2]; - - if (!CBB_flush(cbb)) { - goto err; - } - } - - // The DLEQ batching construction is described in appendix B of - // https://eprint.iacr.org/2020/072/20200324:214215. Note the additional - // computations all act on public inputs. 
- for (size_t i = 0; i < num_to_issue; i++) { - if (!hash_c_batch(method, &es[i], &batch_cbb, i)) { - goto err; - } - } - - EC_RAW_POINT Tp_batch, Sp_batch, Wp_batch, Wsp_batch; - if (!ec_point_mul_scalar_public_batch(group, &Tp_batch, - /*g_scalar=*/NULL, Tps, es, - num_to_issue) || - !ec_point_mul_scalar_public_batch(group, &Sp_batch, - /*g_scalar=*/NULL, Sps, es, - num_to_issue) || - !ec_point_mul_scalar_public_batch(group, &Wp_batch, - /*g_scalar=*/NULL, Wps, es, - num_to_issue) || - !ec_point_mul_scalar_public_batch(group, &Wsp_batch, - /*g_scalar=*/NULL, Wsps, es, - num_to_issue)) { - goto err; - } - - CBB proof; - if (!CBB_add_u16_length_prefixed(cbb, &proof) || - !dleq_generate(method, &proof, key, &Tp_batch, &Sp_batch, &Wp_batch, - &Wsp_batch, private_metadata) || - !CBB_flush(cbb)) { - goto err; - } - - // Skip over any unused requests. - size_t point_len = 1 + 2 * BN_num_bytes(&group->field); - size_t token_len = point_len; - if (method->prefix_point) { - token_len += 2; - } - if (!CBS_skip(cbs, token_len * (num_requested - num_to_issue))) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); - goto err; - } - - ret = 1; - -err: - OPENSSL_free(Tps); - OPENSSL_free(Sps); - OPENSSL_free(Wps); - OPENSSL_free(Wsps); - OPENSSL_free(es); - CBB_cleanup(&batch_cbb); - return ret; -} - -static STACK_OF(TRUST_TOKEN) * - pmbtoken_unblind(const PMBTOKEN_METHOD *method, - const TRUST_TOKEN_CLIENT_KEY *key, - const STACK_OF(TRUST_TOKEN_PRETOKEN) * pretokens, CBS *cbs, - size_t count, uint32_t key_id) { - const EC_GROUP *group = method->group; - if (count > sk_TRUST_TOKEN_PRETOKEN_num(pretokens)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); - return NULL; - } - - int ok = 0; - STACK_OF(TRUST_TOKEN) *ret = sk_TRUST_TOKEN_new_null(); - if (ret == NULL) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - return NULL; - } - - if (count > ((size_t)-1) / sizeof(EC_RAW_POINT) || - count > ((size_t)-1) / sizeof(EC_SCALAR)) { - 
OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_OVERFLOW); - return 0; - } - EC_RAW_POINT *Tps = OPENSSL_malloc(count * sizeof(EC_RAW_POINT)); - EC_RAW_POINT *Sps = OPENSSL_malloc(count * sizeof(EC_RAW_POINT)); - EC_RAW_POINT *Wps = OPENSSL_malloc(count * sizeof(EC_RAW_POINT)); - EC_RAW_POINT *Wsps = OPENSSL_malloc(count * sizeof(EC_RAW_POINT)); - EC_SCALAR *es = OPENSSL_malloc(count * sizeof(EC_SCALAR)); - CBB batch_cbb; - CBB_zero(&batch_cbb); - if (!Tps || - !Sps || - !Wps || - !Wsps || - !es || - !CBB_init(&batch_cbb, 0) || - !point_to_cbb(&batch_cbb, method->group, &key->pubs) || - !point_to_cbb(&batch_cbb, method->group, &key->pub0) || - !point_to_cbb(&batch_cbb, method->group, &key->pub1)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - goto err; - } - - for (size_t i = 0; i < count; i++) { - const TRUST_TOKEN_PRETOKEN *pretoken = - sk_TRUST_TOKEN_PRETOKEN_value(pretokens, i); - - uint8_t s[TRUST_TOKEN_NONCE_SIZE]; - EC_AFFINE Wp_affine, Wsp_affine; - if (!CBS_copy_bytes(cbs, s, TRUST_TOKEN_NONCE_SIZE) || - !cbs_get_prefixed_point(cbs, group, &Wp_affine, method->prefix_point) || - !cbs_get_prefixed_point(cbs, group, &Wsp_affine, - method->prefix_point)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); - goto err; - } - - ec_affine_to_jacobian(group, &Tps[i], &pretoken->Tp); - ec_affine_to_jacobian(group, &Wps[i], &Wp_affine); - ec_affine_to_jacobian(group, &Wsps[i], &Wsp_affine); - if (!method->hash_s(group, &Sps[i], &pretoken->Tp, s)) { - goto err; - } - - EC_AFFINE Sp_affine; - if (!point_to_cbb(&batch_cbb, group, &pretoken->Tp) || - !ec_jacobian_to_affine(group, &Sp_affine, &Sps[i]) || - !point_to_cbb(&batch_cbb, group, &Sp_affine) || - !point_to_cbb(&batch_cbb, group, &Wp_affine) || - !point_to_cbb(&batch_cbb, group, &Wsp_affine)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - goto err; - } - - // Unblind the token. 
- EC_RAW_POINT jacobians[3]; - EC_AFFINE affines[3]; - if (!ec_point_mul_scalar(group, &jacobians[0], &Sps[i], &pretoken->r) || - !ec_point_mul_scalar(group, &jacobians[1], &Wps[i], &pretoken->r) || - !ec_point_mul_scalar(group, &jacobians[2], &Wsps[i], &pretoken->r) || - !ec_jacobian_to_affine_batch(group, affines, jacobians, 3)) { - goto err; - } - - // Serialize the token. Include |key_id| to avoid an extra copy in the layer - // above. - CBB token_cbb; - size_t point_len = 1 + 2 * BN_num_bytes(&group->field); - if (!CBB_init(&token_cbb, - 4 + TRUST_TOKEN_NONCE_SIZE + 3 * (2 + point_len)) || - !CBB_add_u32(&token_cbb, key_id) || - !CBB_add_bytes(&token_cbb, pretoken->t, TRUST_TOKEN_NONCE_SIZE) || - !cbb_add_prefixed_point(&token_cbb, group, &affines[0], - method->prefix_point) || - !cbb_add_prefixed_point(&token_cbb, group, &affines[1], - method->prefix_point) || - !cbb_add_prefixed_point(&token_cbb, group, &affines[2], - method->prefix_point) || - !CBB_flush(&token_cbb)) { - CBB_cleanup(&token_cbb); - goto err; - } - - TRUST_TOKEN *token = - TRUST_TOKEN_new(CBB_data(&token_cbb), CBB_len(&token_cbb)); - CBB_cleanup(&token_cbb); - if (token == NULL || - !sk_TRUST_TOKEN_push(ret, token)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - TRUST_TOKEN_free(token); - goto err; - } - } - - // The DLEQ batching construction is described in appendix B of - // https://eprint.iacr.org/2020/072/20200324:214215. Note the additional - // computations all act on public inputs. 
- for (size_t i = 0; i < count; i++) { - if (!hash_c_batch(method, &es[i], &batch_cbb, i)) { - goto err; - } - } - - EC_RAW_POINT Tp_batch, Sp_batch, Wp_batch, Wsp_batch; - if (!ec_point_mul_scalar_public_batch(group, &Tp_batch, - /*g_scalar=*/NULL, Tps, es, count) || - !ec_point_mul_scalar_public_batch(group, &Sp_batch, - /*g_scalar=*/NULL, Sps, es, count) || - !ec_point_mul_scalar_public_batch(group, &Wp_batch, - /*g_scalar=*/NULL, Wps, es, count) || - !ec_point_mul_scalar_public_batch(group, &Wsp_batch, - /*g_scalar=*/NULL, Wsps, es, count)) { - goto err; - } - - CBS proof; - if (!CBS_get_u16_length_prefixed(cbs, &proof) || - !dleq_verify(method, &proof, key, &Tp_batch, &Sp_batch, &Wp_batch, - &Wsp_batch) || - CBS_len(&proof) != 0) { - goto err; - } - - ok = 1; - -err: - OPENSSL_free(Tps); - OPENSSL_free(Sps); - OPENSSL_free(Wps); - OPENSSL_free(Wsps); - OPENSSL_free(es); - CBB_cleanup(&batch_cbb); - if (!ok) { - sk_TRUST_TOKEN_pop_free(ret, TRUST_TOKEN_free); - ret = NULL; - } - return ret; -} - -static int pmbtoken_read(const PMBTOKEN_METHOD *method, - const TRUST_TOKEN_ISSUER_KEY *key, - uint8_t out_nonce[TRUST_TOKEN_NONCE_SIZE], - uint8_t *out_private_metadata, const uint8_t *token, - size_t token_len) { - const EC_GROUP *group = method->group; - CBS cbs; - CBS_init(&cbs, token, token_len); - EC_AFFINE S, W, Ws; - if (!CBS_copy_bytes(&cbs, out_nonce, TRUST_TOKEN_NONCE_SIZE) || - !cbs_get_prefixed_point(&cbs, group, &S, method->prefix_point) || - !cbs_get_prefixed_point(&cbs, group, &W, method->prefix_point) || - !cbs_get_prefixed_point(&cbs, group, &Ws, method->prefix_point) || - CBS_len(&cbs) != 0) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_INVALID_TOKEN); - return 0; - } - - - EC_RAW_POINT T; - if (!method->hash_t(group, &T, out_nonce)) { - return 0; - } - - // We perform three multiplications with S and T. This is enough that it is - // worth using |ec_point_mul_scalar_precomp|. 
- EC_RAW_POINT S_jacobian; - EC_PRECOMP S_precomp, T_precomp; - ec_affine_to_jacobian(group, &S_jacobian, &S); - if (!ec_init_precomp(group, &S_precomp, &S_jacobian) || - !ec_init_precomp(group, &T_precomp, &T)) { - return 0; - } - - EC_RAW_POINT Ws_calculated; - // Check the validity of the token. - if (!ec_point_mul_scalar_precomp(group, &Ws_calculated, &T_precomp, &key->xs, - &S_precomp, &key->ys, NULL, NULL) || - !ec_affine_jacobian_equal(group, &Ws, &Ws_calculated)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_BAD_VALIDITY_CHECK); - return 0; - } - - EC_RAW_POINT W0, W1; - if (!ec_point_mul_scalar_precomp(group, &W0, &T_precomp, &key->x0, &S_precomp, - &key->y0, NULL, NULL) || - !ec_point_mul_scalar_precomp(group, &W1, &T_precomp, &key->x1, &S_precomp, - &key->y1, NULL, NULL)) { - return 0; - } - - const int is_W0 = ec_affine_jacobian_equal(group, &W, &W0); - const int is_W1 = ec_affine_jacobian_equal(group, &W, &W1); - const int is_valid = is_W0 ^ is_W1; - if (!is_valid) { - // Invalid tokens will fail the validity check above. - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_INTERNAL_ERROR); - return 0; - } - - *out_private_metadata = is_W1; - return 1; -} - - -// PMBTokens experiment v1. 
- -static int pmbtoken_exp1_hash_t(const EC_GROUP *group, EC_RAW_POINT *out, - const uint8_t t[TRUST_TOKEN_NONCE_SIZE]) { - const uint8_t kHashTLabel[] = "PMBTokens Experiment V1 HashT"; - return ec_hash_to_curve_p384_xmd_sha512_sswu_draft07( - group, out, kHashTLabel, sizeof(kHashTLabel), t, TRUST_TOKEN_NONCE_SIZE); -} - -static int pmbtoken_exp1_hash_s(const EC_GROUP *group, EC_RAW_POINT *out, - const EC_AFFINE *t, - const uint8_t s[TRUST_TOKEN_NONCE_SIZE]) { - const uint8_t kHashSLabel[] = "PMBTokens Experiment V1 HashS"; - int ret = 0; - CBB cbb; - uint8_t *buf = NULL; - size_t len; - if (!CBB_init(&cbb, 0) || - !point_to_cbb(&cbb, group, t) || - !CBB_add_bytes(&cbb, s, TRUST_TOKEN_NONCE_SIZE) || - !CBB_finish(&cbb, &buf, &len) || - !ec_hash_to_curve_p384_xmd_sha512_sswu_draft07( - group, out, kHashSLabel, sizeof(kHashSLabel), buf, len)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - goto err; - } - - ret = 1; - -err: - OPENSSL_free(buf); - CBB_cleanup(&cbb); - return ret; -} - -static int pmbtoken_exp1_hash_c(const EC_GROUP *group, EC_SCALAR *out, - uint8_t *buf, size_t len) { - const uint8_t kHashCLabel[] = "PMBTokens Experiment V1 HashC"; - return ec_hash_to_scalar_p384_xmd_sha512_draft07( - group, out, kHashCLabel, sizeof(kHashCLabel), buf, len); -} - -static int pmbtoken_exp1_hash_to_scalar(const EC_GROUP *group, EC_SCALAR *out, - uint8_t *buf, size_t len) { - const uint8_t kHashLabel[] = "PMBTokens Experiment V1 HashToScalar"; - return ec_hash_to_scalar_p384_xmd_sha512_draft07( - group, out, kHashLabel, sizeof(kHashLabel), buf, len); -} - -static int pmbtoken_exp1_ok = 0; -static PMBTOKEN_METHOD pmbtoken_exp1_method; -static CRYPTO_once_t pmbtoken_exp1_method_once = CRYPTO_ONCE_INIT; - -static void pmbtoken_exp1_init_method_impl(void) { - // This is the output of |ec_hash_to_scalar_p384_xmd_sha512_draft07| with DST - // "PMBTokens Experiment V1 HashH" and message "generator". 
- static const uint8_t kH[] = { - 0x04, 0x82, 0xd5, 0x68, 0xf5, 0x39, 0xf6, 0x08, 0x19, 0xa1, 0x75, - 0x9f, 0x98, 0xb5, 0x10, 0xf5, 0x0b, 0x9d, 0x2b, 0xe1, 0x64, 0x4d, - 0x02, 0x76, 0x18, 0x11, 0xf8, 0x2f, 0xd3, 0x33, 0x25, 0x1f, 0x2c, - 0xb8, 0xf6, 0xf1, 0x9e, 0x93, 0x85, 0x79, 0xb3, 0xb7, 0x81, 0xa3, - 0xe6, 0x23, 0xc3, 0x1c, 0xff, 0x03, 0xd9, 0x40, 0x6c, 0xec, 0xe0, - 0x4d, 0xea, 0xdf, 0x9d, 0x94, 0xd1, 0x87, 0xab, 0x27, 0xf7, 0x4f, - 0x53, 0xea, 0xa3, 0x18, 0x72, 0xb9, 0xd1, 0x56, 0xa0, 0x4e, 0x81, - 0xaa, 0xeb, 0x1c, 0x22, 0x6d, 0x39, 0x1c, 0x5e, 0xb1, 0x27, 0xfc, - 0x87, 0xc3, 0x95, 0xd0, 0x13, 0xb7, 0x0b, 0x5c, 0xc7, - }; - - pmbtoken_exp1_ok = pmbtoken_init_method( - &pmbtoken_exp1_method, NID_secp384r1, kH, sizeof(kH), - pmbtoken_exp1_hash_t, pmbtoken_exp1_hash_s, pmbtoken_exp1_hash_c, - pmbtoken_exp1_hash_to_scalar, 1); -} - -static int pmbtoken_exp1_init_method(void) { - CRYPTO_once(&pmbtoken_exp1_method_once, pmbtoken_exp1_init_method_impl); - if (!pmbtoken_exp1_ok) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_INTERNAL_ERROR); - return 0; - } - return 1; -} - -int pmbtoken_exp1_generate_key(CBB *out_private, CBB *out_public) { - if (!pmbtoken_exp1_init_method()) { - return 0; - } - - return pmbtoken_generate_key(&pmbtoken_exp1_method, out_private, out_public); -} - -int pmbtoken_exp1_derive_key_from_secret(CBB *out_private, CBB *out_public, - const uint8_t *secret, - size_t secret_len) { - if (!pmbtoken_exp1_init_method()) { - return 0; - } - - return pmbtoken_derive_key_from_secret(&pmbtoken_exp1_method, out_private, - out_public, secret, secret_len); -} - -int pmbtoken_exp1_client_key_from_bytes(TRUST_TOKEN_CLIENT_KEY *key, - const uint8_t *in, size_t len) { - if (!pmbtoken_exp1_init_method()) { - return 0; - } - return pmbtoken_client_key_from_bytes(&pmbtoken_exp1_method, key, in, len); -} - -int pmbtoken_exp1_issuer_key_from_bytes(TRUST_TOKEN_ISSUER_KEY *key, - const uint8_t *in, size_t len) { - if (!pmbtoken_exp1_init_method()) { - return 0; - } - 
return pmbtoken_issuer_key_from_bytes(&pmbtoken_exp1_method, key, in, len); -} - -STACK_OF(TRUST_TOKEN_PRETOKEN) * pmbtoken_exp1_blind(CBB *cbb, size_t count) { - if (!pmbtoken_exp1_init_method()) { - return NULL; - } - return pmbtoken_blind(&pmbtoken_exp1_method, cbb, count); -} - -int pmbtoken_exp1_sign(const TRUST_TOKEN_ISSUER_KEY *key, CBB *cbb, CBS *cbs, - size_t num_requested, size_t num_to_issue, - uint8_t private_metadata) { - if (!pmbtoken_exp1_init_method()) { - return 0; - } - return pmbtoken_sign(&pmbtoken_exp1_method, key, cbb, cbs, num_requested, - num_to_issue, private_metadata); -} - -STACK_OF(TRUST_TOKEN) * - pmbtoken_exp1_unblind(const TRUST_TOKEN_CLIENT_KEY *key, - const STACK_OF(TRUST_TOKEN_PRETOKEN) * pretokens, - CBS *cbs, size_t count, uint32_t key_id) { - if (!pmbtoken_exp1_init_method()) { - return NULL; - } - return pmbtoken_unblind(&pmbtoken_exp1_method, key, pretokens, cbs, count, - key_id); -} - -int pmbtoken_exp1_read(const TRUST_TOKEN_ISSUER_KEY *key, - uint8_t out_nonce[TRUST_TOKEN_NONCE_SIZE], - uint8_t *out_private_metadata, const uint8_t *token, - size_t token_len) { - if (!pmbtoken_exp1_init_method()) { - return 0; - } - return pmbtoken_read(&pmbtoken_exp1_method, key, out_nonce, - out_private_metadata, token, token_len); -} - -int pmbtoken_exp1_get_h_for_testing(uint8_t out[97]) { - if (!pmbtoken_exp1_init_method()) { - return 0; - } - EC_AFFINE h; - return ec_jacobian_to_affine(pmbtoken_exp1_method.group, &h, - &pmbtoken_exp1_method.h) && - ec_point_to_bytes(pmbtoken_exp1_method.group, &h, - POINT_CONVERSION_UNCOMPRESSED, out, 97) == 97; -} - -// PMBTokens experiment v2. 
- -static int pmbtoken_exp2_hash_t(const EC_GROUP *group, EC_RAW_POINT *out, - const uint8_t t[TRUST_TOKEN_NONCE_SIZE]) { - const uint8_t kHashTLabel[] = "PMBTokens Experiment V2 HashT"; - return ec_hash_to_curve_p384_xmd_sha512_sswu_draft07( - group, out, kHashTLabel, sizeof(kHashTLabel), t, TRUST_TOKEN_NONCE_SIZE); -} - -static int pmbtoken_exp2_hash_s(const EC_GROUP *group, EC_RAW_POINT *out, - const EC_AFFINE *t, - const uint8_t s[TRUST_TOKEN_NONCE_SIZE]) { - const uint8_t kHashSLabel[] = "PMBTokens Experiment V2 HashS"; - int ret = 0; - CBB cbb; - uint8_t *buf = NULL; - size_t len; - if (!CBB_init(&cbb, 0) || - !point_to_cbb(&cbb, group, t) || - !CBB_add_bytes(&cbb, s, TRUST_TOKEN_NONCE_SIZE) || - !CBB_finish(&cbb, &buf, &len) || - !ec_hash_to_curve_p384_xmd_sha512_sswu_draft07( - group, out, kHashSLabel, sizeof(kHashSLabel), buf, len)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - goto err; - } - - ret = 1; - -err: - OPENSSL_free(buf); - CBB_cleanup(&cbb); - return ret; -} - -static int pmbtoken_exp2_hash_c(const EC_GROUP *group, EC_SCALAR *out, - uint8_t *buf, size_t len) { - const uint8_t kHashCLabel[] = "PMBTokens Experiment V2 HashC"; - return ec_hash_to_scalar_p384_xmd_sha512_draft07( - group, out, kHashCLabel, sizeof(kHashCLabel), buf, len); -} - -static int pmbtoken_exp2_hash_to_scalar(const EC_GROUP *group, EC_SCALAR *out, - uint8_t *buf, size_t len) { - const uint8_t kHashLabel[] = "PMBTokens Experiment V2 HashToScalar"; - return ec_hash_to_scalar_p384_xmd_sha512_draft07( - group, out, kHashLabel, sizeof(kHashLabel), buf, len); -} - -static int pmbtoken_exp2_ok = 0; -static PMBTOKEN_METHOD pmbtoken_exp2_method; -static CRYPTO_once_t pmbtoken_exp2_method_once = CRYPTO_ONCE_INIT; - -static void pmbtoken_exp2_init_method_impl(void) { - // This is the output of |ec_hash_to_scalar_p384_xmd_sha512_draft07| with DST - // "PMBTokens Experiment V2 HashH" and message "generator". 
- static const uint8_t kH[] = { - 0x04, 0xbc, 0x27, 0x24, 0x99, 0xfa, 0xc9, 0xa4, 0x74, 0x6f, 0xf9, - 0x07, 0x81, 0x55, 0xf8, 0x1f, 0x6f, 0xda, 0x09, 0xe7, 0x8c, 0x5d, - 0x9e, 0x4e, 0x14, 0x7c, 0x53, 0x14, 0xbc, 0x7e, 0x29, 0x57, 0x92, - 0x17, 0x94, 0x6e, 0xd2, 0xdf, 0xa5, 0x31, 0x1b, 0x4e, 0xb7, 0xfc, - 0x93, 0xe3, 0x6e, 0x14, 0x1f, 0x4f, 0x14, 0xf3, 0xe5, 0x47, 0x61, - 0x1c, 0x2c, 0x72, 0x25, 0xf0, 0x4a, 0x45, 0x23, 0x2d, 0x57, 0x93, - 0x0e, 0xb2, 0x55, 0xb8, 0x57, 0x25, 0x4c, 0x1e, 0xdb, 0xfd, 0x58, - 0x70, 0x17, 0x9a, 0xbb, 0x9e, 0x5e, 0x93, 0x9e, 0x92, 0xd3, 0xe8, - 0x25, 0x62, 0xbf, 0x59, 0xb2, 0xd2, 0x3d, 0x71, 0xff - }; - - pmbtoken_exp2_ok = pmbtoken_init_method( - &pmbtoken_exp2_method, NID_secp384r1, kH, sizeof(kH), - pmbtoken_exp2_hash_t, pmbtoken_exp2_hash_s, pmbtoken_exp2_hash_c, - pmbtoken_exp2_hash_to_scalar, 0); -} - -static int pmbtoken_exp2_init_method(void) { - CRYPTO_once(&pmbtoken_exp2_method_once, pmbtoken_exp2_init_method_impl); - if (!pmbtoken_exp2_ok) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_INTERNAL_ERROR); - return 0; - } - return 1; -} - -int pmbtoken_exp2_generate_key(CBB *out_private, CBB *out_public) { - if (!pmbtoken_exp2_init_method()) { - return 0; - } - - return pmbtoken_generate_key(&pmbtoken_exp2_method, out_private, out_public); -} - - -int pmbtoken_exp2_derive_key_from_secret(CBB *out_private, CBB *out_public, - const uint8_t *secret, - size_t secret_len) { - if (!pmbtoken_exp2_init_method()) { - return 0; - } - - return pmbtoken_derive_key_from_secret(&pmbtoken_exp2_method, out_private, - out_public, secret, secret_len); -} - -int pmbtoken_exp2_client_key_from_bytes(TRUST_TOKEN_CLIENT_KEY *key, - const uint8_t *in, size_t len) { - if (!pmbtoken_exp2_init_method()) { - return 0; - } - return pmbtoken_client_key_from_bytes(&pmbtoken_exp2_method, key, in, len); -} - -int pmbtoken_exp2_issuer_key_from_bytes(TRUST_TOKEN_ISSUER_KEY *key, - const uint8_t *in, size_t len) { - if (!pmbtoken_exp2_init_method()) { - return 0; - } - 
return pmbtoken_issuer_key_from_bytes(&pmbtoken_exp2_method, key, in, len); -} - -STACK_OF(TRUST_TOKEN_PRETOKEN) * pmbtoken_exp2_blind(CBB *cbb, size_t count) { - if (!pmbtoken_exp2_init_method()) { - return NULL; - } - return pmbtoken_blind(&pmbtoken_exp2_method, cbb, count); -} - -int pmbtoken_exp2_sign(const TRUST_TOKEN_ISSUER_KEY *key, CBB *cbb, CBS *cbs, - size_t num_requested, size_t num_to_issue, - uint8_t private_metadata) { - if (!pmbtoken_exp2_init_method()) { - return 0; - } - return pmbtoken_sign(&pmbtoken_exp2_method, key, cbb, cbs, num_requested, - num_to_issue, private_metadata); -} - -STACK_OF(TRUST_TOKEN) * - pmbtoken_exp2_unblind(const TRUST_TOKEN_CLIENT_KEY *key, - const STACK_OF(TRUST_TOKEN_PRETOKEN) * pretokens, - CBS *cbs, size_t count, uint32_t key_id) { - if (!pmbtoken_exp2_init_method()) { - return NULL; - } - return pmbtoken_unblind(&pmbtoken_exp2_method, key, pretokens, cbs, count, - key_id); -} - -int pmbtoken_exp2_read(const TRUST_TOKEN_ISSUER_KEY *key, - uint8_t out_nonce[TRUST_TOKEN_NONCE_SIZE], - uint8_t *out_private_metadata, const uint8_t *token, - size_t token_len) { - if (!pmbtoken_exp2_init_method()) { - return 0; - } - return pmbtoken_read(&pmbtoken_exp2_method, key, out_nonce, - out_private_metadata, token, token_len); -} - -int pmbtoken_exp2_get_h_for_testing(uint8_t out[97]) { - if (!pmbtoken_exp2_init_method()) { - return 0; - } - EC_AFFINE h; - return ec_jacobian_to_affine(pmbtoken_exp2_method.group, &h, - &pmbtoken_exp2_method.h) && - ec_point_to_bytes(pmbtoken_exp2_method.group, &h, - POINT_CONVERSION_UNCOMPRESSED, out, 97) == 97; -} diff --git a/third_party/boringssl/src/crypto/trust_token/pmbtoken.cc b/third_party/boringssl/src/crypto/trust_token/pmbtoken.cc new file mode 100644 index 00000000..95f1fd3d --- /dev/null +++ b/third_party/boringssl/src/crypto/trust_token/pmbtoken.cc @@ -0,0 +1,1672 @@ +// Copyright 2020 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you 
may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../ec/internal.h" +#include "../fipsmodule/bn/internal.h" +#include "../fipsmodule/ec/internal.h" +#include "../mem_internal.h" + +#include "internal.h" + + +using namespace bssl; + +typedef int (*hash_t_func_t)(const EC_GROUP *group, EC_JACOBIAN *out, + const uint8_t t[TRUST_TOKEN_NONCE_SIZE]); +typedef int (*hash_s_func_t)(const EC_GROUP *group, EC_JACOBIAN *out, + const EC_AFFINE *t, + const uint8_t s[TRUST_TOKEN_NONCE_SIZE]); +typedef int (*hash_c_func_t)(const EC_GROUP *group, EC_SCALAR *out, + uint8_t *buf, size_t len); +typedef int (*hash_to_scalar_func_t)(const EC_GROUP *group, EC_SCALAR *out, + uint8_t *buf, size_t len); + +typedef struct { + const EC_GROUP *group; + EC_PRECOMP g_precomp; + EC_PRECOMP h_precomp; + EC_JACOBIAN h; + // hash_t implements the H_t operation in PMBTokens. It returns one on success + // and zero on error. + hash_t_func_t hash_t; + // hash_s implements the H_s operation in PMBTokens. It returns one on success + // and zero on error. + hash_s_func_t hash_s; + // hash_c implements the H_c operation in PMBTokens. It returns one on success + // and zero on error. + hash_c_func_t hash_c; + // hash_to_scalar implements the HashToScalar operation for PMBTokens. It + // returns one on success and zero on error. 
+ hash_to_scalar_func_t hash_to_scalar; + int prefix_point : 1; +} PMBTOKEN_METHOD; + +static const uint8_t kDefaultAdditionalData[32] = {0}; + +static int pmbtoken_init_method(PMBTOKEN_METHOD *method, const EC_GROUP *group, + const uint8_t *h_bytes, size_t h_len, + hash_t_func_t hash_t, hash_s_func_t hash_s, + hash_c_func_t hash_c, + hash_to_scalar_func_t hash_to_scalar, + int prefix_point) { + method->group = group; + method->hash_t = hash_t; + method->hash_s = hash_s; + method->hash_c = hash_c; + method->hash_to_scalar = hash_to_scalar; + method->prefix_point = prefix_point; + + EC_AFFINE h; + if (!ec_point_from_uncompressed(method->group, &h, h_bytes, h_len)) { + return 0; + } + ec_affine_to_jacobian(method->group, &method->h, &h); + + if (!ec_init_precomp(method->group, &method->g_precomp, + &method->group->generator.raw) || + !ec_init_precomp(method->group, &method->h_precomp, &method->h)) { + return 0; + } + return 1; +} + +static int derive_scalar_from_secret(const PMBTOKEN_METHOD *method, + EC_SCALAR *out, const uint8_t *secret, + size_t secret_len, uint8_t scalar_id) { + static const uint8_t kKeygenLabel[] = "TrustTokenPMBTokenKeyGen"; + + int ok = 0; + CBB cbb; + CBB_zero(&cbb); + uint8_t *buf = nullptr; + size_t len; + if (!CBB_init(&cbb, 0) || + !CBB_add_bytes(&cbb, kKeygenLabel, sizeof(kKeygenLabel)) || + !CBB_add_u8(&cbb, scalar_id) || + !CBB_add_bytes(&cbb, secret, secret_len) || + !CBB_finish(&cbb, &buf, &len) || + !method->hash_to_scalar(method->group, out, buf, len)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_KEYGEN_FAILURE); + goto err; + } + + ok = 1; + +err: + CBB_cleanup(&cbb); + OPENSSL_free(buf); + return ok; +} + +static int point_to_cbb(CBB *out, const EC_GROUP *group, + const EC_AFFINE *point) { + size_t len = ec_point_byte_len(group, POINT_CONVERSION_UNCOMPRESSED); + if (len == 0) { + return 0; + } + uint8_t *p; + return CBB_add_space(out, &p, len) && + ec_point_to_bytes(group, point, POINT_CONVERSION_UNCOMPRESSED, p, + len) == 
len; +} + +static int cbb_add_prefixed_point(CBB *out, const EC_GROUP *group, + const EC_AFFINE *point, int prefix_point) { + if (prefix_point) { + CBB child; + if (!CBB_add_u16_length_prefixed(out, &child) || + !point_to_cbb(&child, group, point) || !CBB_flush(out)) { + return 0; + } + } else { + if (!point_to_cbb(out, group, point) || !CBB_flush(out)) { + return 0; + } + } + + return 1; +} + +static int cbs_get_prefixed_point(CBS *cbs, const EC_GROUP *group, + EC_AFFINE *out, int prefix_point) { + CBS child; + if (prefix_point) { + if (!CBS_get_u16_length_prefixed(cbs, &child)) { + return 0; + } + } else { + size_t plen = ec_point_byte_len(group, POINT_CONVERSION_UNCOMPRESSED); + if (!CBS_get_bytes(cbs, &child, plen)) { + return 0; + } + } + + if (!ec_point_from_uncompressed(group, out, CBS_data(&child), + CBS_len(&child))) { + return 0; + } + return 1; +} + +static int mul_public_3(const EC_GROUP *group, EC_JACOBIAN *out, + const EC_JACOBIAN *p0, const EC_SCALAR *scalar0, + const EC_JACOBIAN *p1, const EC_SCALAR *scalar1, + const EC_JACOBIAN *p2, const EC_SCALAR *scalar2) { + EC_JACOBIAN points[3] = {*p0, *p1, *p2}; + EC_SCALAR scalars[3] = {*scalar0, *scalar1, *scalar2}; + return ec_point_mul_scalar_public_batch(group, out, /*g_scalar=*/nullptr, + points, scalars, 3); +} + +static int pmbtoken_compute_keys(const PMBTOKEN_METHOD *method, + CBB *out_private, CBB *out_public, + const EC_SCALAR *x0, const EC_SCALAR *y0, + const EC_SCALAR *x1, const EC_SCALAR *y1, + const EC_SCALAR *xs, const EC_SCALAR *ys) { + const EC_GROUP *group = method->group; + EC_JACOBIAN pub[3]; + if (!ec_point_mul_scalar_precomp(group, &pub[0], &method->g_precomp, x0, + &method->h_precomp, y0, nullptr, nullptr) || + !ec_point_mul_scalar_precomp(group, &pub[1], &method->g_precomp, x1, + &method->h_precomp, y1, nullptr, nullptr) || + !ec_point_mul_scalar_precomp(method->group, &pub[2], &method->g_precomp, + xs, &method->h_precomp, ys, nullptr, + nullptr)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, 
TRUST_TOKEN_R_KEYGEN_FAILURE); + return 0; + } + + const EC_SCALAR *scalars[] = {x0, y0, x1, y1, xs, ys}; + size_t scalar_len = BN_num_bytes(EC_GROUP_get0_order(group)); + for (const EC_SCALAR *scalar : scalars) { + uint8_t *buf; + if (!CBB_add_space(out_private, &buf, scalar_len)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_BUFFER_TOO_SMALL); + return 0; + } + ec_scalar_to_bytes(group, buf, &scalar_len, scalar); + } + + EC_AFFINE pub_affine[3]; + if (!ec_jacobian_to_affine_batch(group, pub_affine, pub, 3)) { + return 0; + } + + if (!cbb_add_prefixed_point(out_public, group, &pub_affine[0], + method->prefix_point) || + !cbb_add_prefixed_point(out_public, group, &pub_affine[1], + method->prefix_point) || + !cbb_add_prefixed_point(out_public, group, &pub_affine[2], + method->prefix_point)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_BUFFER_TOO_SMALL); + return 0; + } + + return 1; +} + +static int pmbtoken_generate_key(const PMBTOKEN_METHOD *method, + CBB *out_private, CBB *out_public) { + EC_SCALAR x0, y0, x1, y1, xs, ys; + if (!ec_random_nonzero_scalar(method->group, &x0, kDefaultAdditionalData) || + !ec_random_nonzero_scalar(method->group, &y0, kDefaultAdditionalData) || + !ec_random_nonzero_scalar(method->group, &x1, kDefaultAdditionalData) || + !ec_random_nonzero_scalar(method->group, &y1, kDefaultAdditionalData) || + !ec_random_nonzero_scalar(method->group, &xs, kDefaultAdditionalData) || + !ec_random_nonzero_scalar(method->group, &ys, kDefaultAdditionalData)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_KEYGEN_FAILURE); + return 0; + } + + return pmbtoken_compute_keys(method, out_private, out_public, &x0, &y0, &x1, + &y1, &xs, &ys); +} + +static int pmbtoken_derive_key_from_secret(const PMBTOKEN_METHOD *method, + CBB *out_private, CBB *out_public, + const uint8_t *secret, + size_t secret_len) { + EC_SCALAR x0, y0, x1, y1, xs, ys; + if (!derive_scalar_from_secret(method, &x0, secret, secret_len, 0) || + !derive_scalar_from_secret(method, &y0, 
secret, secret_len, 1) || + !derive_scalar_from_secret(method, &x1, secret, secret_len, 2) || + !derive_scalar_from_secret(method, &y1, secret, secret_len, 3) || + !derive_scalar_from_secret(method, &xs, secret, secret_len, 4) || + !derive_scalar_from_secret(method, &ys, secret, secret_len, 5)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_KEYGEN_FAILURE); + return 0; + } + + return pmbtoken_compute_keys(method, out_private, out_public, &x0, &y0, &x1, + &y1, &xs, &ys); +} + +static int pmbtoken_client_key_from_bytes(const PMBTOKEN_METHOD *method, + TRUST_TOKEN_CLIENT_KEY *key, + const uint8_t *in, size_t len) { + CBS cbs; + CBS_init(&cbs, in, len); + if (!cbs_get_prefixed_point(&cbs, method->group, &key->pub0, + method->prefix_point) || + !cbs_get_prefixed_point(&cbs, method->group, &key->pub1, + method->prefix_point) || + !cbs_get_prefixed_point(&cbs, method->group, &key->pubs, + method->prefix_point) || + CBS_len(&cbs) != 0) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); + return 0; + } + + return 1; +} + +static int pmbtoken_issuer_key_from_bytes(const PMBTOKEN_METHOD *method, + TRUST_TOKEN_ISSUER_KEY *key, + const uint8_t *in, size_t len) { + const EC_GROUP *group = method->group; + CBS cbs, tmp; + CBS_init(&cbs, in, len); + size_t scalar_len = BN_num_bytes(EC_GROUP_get0_order(group)); + EC_SCALAR *scalars[] = {&key->x0, &key->y0, &key->x1, + &key->y1, &key->xs, &key->ys}; + for (EC_SCALAR *scalar : scalars) { + if (!CBS_get_bytes(&cbs, &tmp, scalar_len) || + !ec_scalar_from_bytes(group, scalar, CBS_data(&tmp), CBS_len(&tmp))) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); + return 0; + } + } + + // Recompute the public key. 
+ EC_JACOBIAN pub[3]; + EC_AFFINE pub_affine[3]; + if (!ec_point_mul_scalar_precomp(group, &pub[0], &method->g_precomp, &key->x0, + &method->h_precomp, &key->y0, nullptr, + nullptr) || + !ec_init_precomp(group, &key->pub0_precomp, &pub[0]) || + !ec_point_mul_scalar_precomp(group, &pub[1], &method->g_precomp, &key->x1, + &method->h_precomp, &key->y1, nullptr, + nullptr) || + !ec_init_precomp(group, &key->pub1_precomp, &pub[1]) || + !ec_point_mul_scalar_precomp(group, &pub[2], &method->g_precomp, &key->xs, + &method->h_precomp, &key->ys, nullptr, + nullptr) || + !ec_init_precomp(group, &key->pubs_precomp, &pub[2]) || + !ec_jacobian_to_affine_batch(group, pub_affine, pub, 3)) { + return 0; + } + + key->pub0 = pub_affine[0]; + key->pub1 = pub_affine[1]; + key->pubs = pub_affine[2]; + return 1; +} + +static STACK_OF(TRUST_TOKEN_PRETOKEN) *pmbtoken_blind( + const PMBTOKEN_METHOD *method, CBB *cbb, size_t count, int include_message, + const uint8_t *msg, size_t msg_len) { + SHA512_CTX hash_ctx; + + const EC_GROUP *group = method->group; + STACK_OF(TRUST_TOKEN_PRETOKEN) *pretokens = + sk_TRUST_TOKEN_PRETOKEN_new_null(); + if (pretokens == nullptr) { + goto err; + } + + for (size_t i = 0; i < count; i++) { + // Insert |pretoken| into |pretokens| early to simplify error-handling. + TRUST_TOKEN_PRETOKEN *pretoken = New(); + if (pretoken == nullptr || + !sk_TRUST_TOKEN_PRETOKEN_push(pretokens, pretoken)) { + TRUST_TOKEN_PRETOKEN_free(pretoken); + goto err; + } + + RAND_bytes(pretoken->salt, sizeof(pretoken->salt)); + if (include_message) { + assert(SHA512_DIGEST_LENGTH == TRUST_TOKEN_NONCE_SIZE); + SHA512_Init(&hash_ctx); + SHA512_Update(&hash_ctx, pretoken->salt, sizeof(pretoken->salt)); + SHA512_Update(&hash_ctx, msg, msg_len); + SHA512_Final(pretoken->t, &hash_ctx); + } else { + OPENSSL_memcpy(pretoken->t, pretoken->salt, TRUST_TOKEN_NONCE_SIZE); + } + + // We sample |pretoken->r| in Montgomery form to simplify inverting. 
+ if (!ec_random_nonzero_scalar(group, &pretoken->r, + kDefaultAdditionalData)) { + goto err; + } + + EC_SCALAR rinv; + ec_scalar_inv0_montgomery(group, &rinv, &pretoken->r); + // Convert both out of Montgomery form. + ec_scalar_from_montgomery(group, &pretoken->r, &pretoken->r); + ec_scalar_from_montgomery(group, &rinv, &rinv); + + EC_JACOBIAN T, Tp; + if (!method->hash_t(group, &T, pretoken->t) || + !ec_point_mul_scalar(group, &Tp, &T, &rinv) || + !ec_jacobian_to_affine(group, &pretoken->Tp, &Tp)) { + goto err; + } + + if (!cbb_add_prefixed_point(cbb, group, &pretoken->Tp, + method->prefix_point)) { + goto err; + } + } + + return pretokens; + +err: + sk_TRUST_TOKEN_PRETOKEN_pop_free(pretokens, TRUST_TOKEN_PRETOKEN_free); + return nullptr; +} + +static int scalar_to_cbb(CBB *out, const EC_GROUP *group, + const EC_SCALAR *scalar) { + uint8_t *buf; + size_t scalar_len = BN_num_bytes(EC_GROUP_get0_order(group)); + if (!CBB_add_space(out, &buf, scalar_len)) { + return 0; + } + ec_scalar_to_bytes(group, buf, &scalar_len, scalar); + return 1; +} + +static int scalar_from_cbs(CBS *cbs, const EC_GROUP *group, EC_SCALAR *out) { + size_t scalar_len = BN_num_bytes(EC_GROUP_get0_order(group)); + CBS tmp; + if (!CBS_get_bytes(cbs, &tmp, scalar_len)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); + return 0; + } + + ec_scalar_from_bytes(group, out, CBS_data(&tmp), CBS_len(&tmp)); + return 1; +} + +static int hash_c_dleq(const PMBTOKEN_METHOD *method, EC_SCALAR *out, + const EC_AFFINE *X, const EC_AFFINE *T, + const EC_AFFINE *S, const EC_AFFINE *W, + const EC_AFFINE *K0, const EC_AFFINE *K1) { + static const uint8_t kDLEQ2Label[] = "DLEQ2"; + + int ok = 0; + CBB cbb; + CBB_zero(&cbb); + uint8_t *buf = nullptr; + size_t len; + if (!CBB_init(&cbb, 0) || + !CBB_add_bytes(&cbb, kDLEQ2Label, sizeof(kDLEQ2Label)) || + !point_to_cbb(&cbb, method->group, X) || + !point_to_cbb(&cbb, method->group, T) || + !point_to_cbb(&cbb, method->group, S) || + !point_to_cbb(&cbb, 
method->group, W) || + !point_to_cbb(&cbb, method->group, K0) || + !point_to_cbb(&cbb, method->group, K1) || !CBB_finish(&cbb, &buf, &len) || + !method->hash_c(method->group, out, buf, len)) { + goto err; + } + + ok = 1; + +err: + CBB_cleanup(&cbb); + OPENSSL_free(buf); + return ok; +} + +static int hash_c_dleqor(const PMBTOKEN_METHOD *method, EC_SCALAR *out, + const EC_AFFINE *X0, const EC_AFFINE *X1, + const EC_AFFINE *T, const EC_AFFINE *S, + const EC_AFFINE *W, const EC_AFFINE *K00, + const EC_AFFINE *K01, const EC_AFFINE *K10, + const EC_AFFINE *K11) { + static const uint8_t kDLEQOR2Label[] = "DLEQOR2"; + + int ok = 0; + CBB cbb; + CBB_zero(&cbb); + uint8_t *buf = nullptr; + size_t len; + if (!CBB_init(&cbb, 0) || + !CBB_add_bytes(&cbb, kDLEQOR2Label, sizeof(kDLEQOR2Label)) || + !point_to_cbb(&cbb, method->group, X0) || + !point_to_cbb(&cbb, method->group, X1) || + !point_to_cbb(&cbb, method->group, T) || + !point_to_cbb(&cbb, method->group, S) || + !point_to_cbb(&cbb, method->group, W) || + !point_to_cbb(&cbb, method->group, K00) || + !point_to_cbb(&cbb, method->group, K01) || + !point_to_cbb(&cbb, method->group, K10) || + !point_to_cbb(&cbb, method->group, K11) || + !CBB_finish(&cbb, &buf, &len) || + !method->hash_c(method->group, out, buf, len)) { + goto err; + } + + ok = 1; + +err: + CBB_cleanup(&cbb); + OPENSSL_free(buf); + return ok; +} + +static int hash_c_batch(const PMBTOKEN_METHOD *method, EC_SCALAR *out, + const CBB *points, size_t index) { + static const uint8_t kDLEQBatchLabel[] = "DLEQ BATCH"; + if (index > 0xffff) { + // The protocol supports only two-byte batches. 
+ OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_OVERFLOW); + return 0; + } + + int ok = 0; + CBB cbb; + CBB_zero(&cbb); + uint8_t *buf = nullptr; + size_t len; + if (!CBB_init(&cbb, 0) || + !CBB_add_bytes(&cbb, kDLEQBatchLabel, sizeof(kDLEQBatchLabel)) || + !CBB_add_bytes(&cbb, CBB_data(points), CBB_len(points)) || + !CBB_add_u16(&cbb, (uint16_t)index) || !CBB_finish(&cbb, &buf, &len) || + !method->hash_c(method->group, out, buf, len)) { + goto err; + } + + ok = 1; + +err: + CBB_cleanup(&cbb); + OPENSSL_free(buf); + return ok; +} + +// The DLEQ2 and DLEQOR2 constructions are described in appendix B of +// https://eprint.iacr.org/2020/072/20200324:214215. DLEQ2 is an instance of +// DLEQOR2 with only one value (n=1). + +static int dleq_generate(const PMBTOKEN_METHOD *method, CBB *cbb, + const TRUST_TOKEN_ISSUER_KEY *priv, + const EC_JACOBIAN *T, const EC_JACOBIAN *S, + const EC_JACOBIAN *W, const EC_JACOBIAN *Ws, + uint8_t private_metadata) { + const EC_GROUP *group = method->group; + + // We generate a DLEQ proof for the validity token and a DLEQOR2 proof for the + // private metadata token. To allow amortizing Jacobian-to-affine conversions, + // we compute Ki for both proofs first. + enum { + idx_T, + idx_S, + idx_W, + idx_Ws, + idx_Ks0, + idx_Ks1, + idx_Kb0, + idx_Kb1, + idx_Ko0, + idx_Ko1, + num_idx, + }; + EC_JACOBIAN jacobians[num_idx]; + + // Setup the DLEQ proof. + EC_SCALAR ks0, ks1; + if ( // ks0, ks1 <- Zp + !ec_random_nonzero_scalar(group, &ks0, kDefaultAdditionalData) || + !ec_random_nonzero_scalar(group, &ks1, kDefaultAdditionalData) || + // Ks = ks0*(G;T) + ks1*(H;S) + !ec_point_mul_scalar_precomp(group, &jacobians[idx_Ks0], + &method->g_precomp, &ks0, &method->h_precomp, + &ks1, nullptr, nullptr) || + !ec_point_mul_scalar_batch(group, &jacobians[idx_Ks1], T, &ks0, S, &ks1, + nullptr, nullptr)) { + return 0; + } + + // Setup the DLEQOR proof. 
First, select values of xb, yb (keys corresponding + // to the private metadata value) and pubo (public key corresponding to the + // other value) in constant time. + BN_ULONG mask = ((BN_ULONG)0) - (private_metadata & 1); + EC_PRECOMP pubo_precomp; + EC_SCALAR xb, yb; + ec_scalar_select(group, &xb, mask, &priv->x1, &priv->x0); + ec_scalar_select(group, &yb, mask, &priv->y1, &priv->y0); + ec_precomp_select(group, &pubo_precomp, mask, &priv->pub0_precomp, + &priv->pub1_precomp); + + EC_SCALAR k0, k1, minus_co, uo, vo; + if ( // k0, k1 <- Zp + !ec_random_nonzero_scalar(group, &k0, kDefaultAdditionalData) || + !ec_random_nonzero_scalar(group, &k1, kDefaultAdditionalData) || + // Kb = k0*(G;T) + k1*(H;S) + !ec_point_mul_scalar_precomp(group, &jacobians[idx_Kb0], + &method->g_precomp, &k0, &method->h_precomp, + &k1, nullptr, nullptr) || + !ec_point_mul_scalar_batch(group, &jacobians[idx_Kb1], T, &k0, S, &k1, + nullptr, nullptr) || + // co, uo, vo <- Zp + !ec_random_nonzero_scalar(group, &minus_co, kDefaultAdditionalData) || + !ec_random_nonzero_scalar(group, &uo, kDefaultAdditionalData) || + !ec_random_nonzero_scalar(group, &vo, kDefaultAdditionalData) || + // Ko = uo*(G;T) + vo*(H;S) - co*(pubo;W) + !ec_point_mul_scalar_precomp(group, &jacobians[idx_Ko0], + &method->g_precomp, &uo, &method->h_precomp, + &vo, &pubo_precomp, &minus_co) || + !ec_point_mul_scalar_batch(group, &jacobians[idx_Ko1], T, &uo, S, &vo, W, + &minus_co)) { + return 0; + } + + EC_AFFINE affines[num_idx]; + jacobians[idx_T] = *T; + jacobians[idx_S] = *S; + jacobians[idx_W] = *W; + jacobians[idx_Ws] = *Ws; + if (!ec_jacobian_to_affine_batch(group, affines, jacobians, num_idx)) { + return 0; + } + + // Select the K corresponding to K0 and K1 in constant-time. 
+ EC_AFFINE K00, K01, K10, K11; + ec_affine_select(group, &K00, mask, &affines[idx_Ko0], &affines[idx_Kb0]); + ec_affine_select(group, &K01, mask, &affines[idx_Ko1], &affines[idx_Kb1]); + ec_affine_select(group, &K10, mask, &affines[idx_Kb0], &affines[idx_Ko0]); + ec_affine_select(group, &K11, mask, &affines[idx_Kb1], &affines[idx_Ko1]); + + // Compute c = Hc(...) for the two proofs. + EC_SCALAR cs, c; + if (!hash_c_dleq(method, &cs, &priv->pubs, &affines[idx_T], &affines[idx_S], + &affines[idx_Ws], &affines[idx_Ks0], &affines[idx_Ks1]) || + !hash_c_dleqor(method, &c, &priv->pub0, &priv->pub1, &affines[idx_T], + &affines[idx_S], &affines[idx_W], &K00, &K01, &K10, + &K11)) { + return 0; + } + + // Compute cb, ub, and ub for the two proofs. In each of these products, only + // one operand is in Montgomery form, so the product does not need to be + // converted. + + EC_SCALAR cs_mont; + ec_scalar_to_montgomery(group, &cs_mont, &cs); + + // us = ks0 + cs*xs + EC_SCALAR us, vs; + ec_scalar_mul_montgomery(group, &us, &priv->xs, &cs_mont); + ec_scalar_add(group, &us, &ks0, &us); + + // vs = ks1 + cs*ys + ec_scalar_mul_montgomery(group, &vs, &priv->ys, &cs_mont); + ec_scalar_add(group, &vs, &ks1, &vs); + + // Store DLEQ2 proof in transcript. + if (!scalar_to_cbb(cbb, group, &cs) || !scalar_to_cbb(cbb, group, &us) || + !scalar_to_cbb(cbb, group, &vs)) { + return 0; + } + + // cb = c - co + EC_SCALAR cb, ub, vb; + ec_scalar_add(group, &cb, &c, &minus_co); + + EC_SCALAR cb_mont; + ec_scalar_to_montgomery(group, &cb_mont, &cb); + + // ub = k0 + cb*xb + ec_scalar_mul_montgomery(group, &ub, &xb, &cb_mont); + ec_scalar_add(group, &ub, &k0, &ub); + + // vb = k1 + cb*yb + ec_scalar_mul_montgomery(group, &vb, &yb, &cb_mont); + ec_scalar_add(group, &vb, &k1, &vb); + + // Select c, u, v in constant-time. 
+ EC_SCALAR co, c0, c1, u0, u1, v0, v1; + ec_scalar_neg(group, &co, &minus_co); + ec_scalar_select(group, &c0, mask, &co, &cb); + ec_scalar_select(group, &u0, mask, &uo, &ub); + ec_scalar_select(group, &v0, mask, &vo, &vb); + ec_scalar_select(group, &c1, mask, &cb, &co); + ec_scalar_select(group, &u1, mask, &ub, &uo); + ec_scalar_select(group, &v1, mask, &vb, &vo); + + // Store DLEQOR2 proof in transcript. + if (!scalar_to_cbb(cbb, group, &c0) || !scalar_to_cbb(cbb, group, &c1) || + !scalar_to_cbb(cbb, group, &u0) || !scalar_to_cbb(cbb, group, &u1) || + !scalar_to_cbb(cbb, group, &v0) || !scalar_to_cbb(cbb, group, &v1)) { + return 0; + } + + return 1; +} + +static int dleq_verify(const PMBTOKEN_METHOD *method, CBS *cbs, + const TRUST_TOKEN_CLIENT_KEY *pub, const EC_JACOBIAN *T, + const EC_JACOBIAN *S, const EC_JACOBIAN *W, + const EC_JACOBIAN *Ws) { + const EC_GROUP *group = method->group; + const EC_JACOBIAN *g = &group->generator.raw; + + // We verify a DLEQ proof for the validity token and a DLEQOR2 proof for the + // private metadata token. To allow amortizing Jacobian-to-affine conversions, + // we compute Ki for both proofs first. Additionally, all inputs to this + // function are public, so we can use the faster variable-time + // multiplications. + enum { + idx_T, + idx_S, + idx_W, + idx_Ws, + idx_Ks0, + idx_Ks1, + idx_K00, + idx_K01, + idx_K10, + idx_K11, + num_idx, + }; + EC_JACOBIAN jacobians[num_idx]; + + // Decode the DLEQ proof. 
+ EC_SCALAR cs, us, vs; + if (!scalar_from_cbs(cbs, group, &cs) || !scalar_from_cbs(cbs, group, &us) || + !scalar_from_cbs(cbs, group, &vs)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); + return 0; + } + + // Ks = us*(G;T) + vs*(H;S) - cs*(pubs;Ws) + EC_JACOBIAN pubs; + ec_affine_to_jacobian(group, &pubs, &pub->pubs); + EC_SCALAR minus_cs; + ec_scalar_neg(group, &minus_cs, &cs); + if (!mul_public_3(group, &jacobians[idx_Ks0], g, &us, &method->h, &vs, &pubs, + &minus_cs) || + !mul_public_3(group, &jacobians[idx_Ks1], T, &us, S, &vs, Ws, + &minus_cs)) { + return 0; + } + + // Decode the DLEQOR proof. + EC_SCALAR c0, c1, u0, u1, v0, v1; + if (!scalar_from_cbs(cbs, group, &c0) || !scalar_from_cbs(cbs, group, &c1) || + !scalar_from_cbs(cbs, group, &u0) || !scalar_from_cbs(cbs, group, &u1) || + !scalar_from_cbs(cbs, group, &v0) || !scalar_from_cbs(cbs, group, &v1)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); + return 0; + } + + EC_JACOBIAN pub0, pub1; + ec_affine_to_jacobian(group, &pub0, &pub->pub0); + ec_affine_to_jacobian(group, &pub1, &pub->pub1); + EC_SCALAR minus_c0, minus_c1; + ec_scalar_neg(group, &minus_c0, &c0); + ec_scalar_neg(group, &minus_c1, &c1); + if ( // K0 = u0*(G;T) + v0*(H;S) - c0*(pub0;W) + !mul_public_3(group, &jacobians[idx_K00], g, &u0, &method->h, &v0, &pub0, + &minus_c0) || + !mul_public_3(group, &jacobians[idx_K01], T, &u0, S, &v0, W, &minus_c0) || + // K1 = u1*(G;T) + v1*(H;S) - c1*(pub1;W) + !mul_public_3(group, &jacobians[idx_K10], g, &u1, &method->h, &v1, &pub1, + &minus_c1) || + !mul_public_3(group, &jacobians[idx_K11], T, &u1, S, &v1, W, &minus_c1)) { + return 0; + } + + EC_AFFINE affines[num_idx]; + jacobians[idx_T] = *T; + jacobians[idx_S] = *S; + jacobians[idx_W] = *W; + jacobians[idx_Ws] = *Ws; + if (!ec_jacobian_to_affine_batch(group, affines, jacobians, num_idx)) { + return 0; + } + + // Check the DLEQ proof. 
+ EC_SCALAR calculated; + if (!hash_c_dleq(method, &calculated, &pub->pubs, &affines[idx_T], + &affines[idx_S], &affines[idx_Ws], &affines[idx_Ks0], + &affines[idx_Ks1])) { + return 0; + } + + // cs == calculated + if (!ec_scalar_equal_vartime(group, &cs, &calculated)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_INVALID_PROOF); + return 0; + } + + // Check the DLEQOR proof. + if (!hash_c_dleqor(method, &calculated, &pub->pub0, &pub->pub1, + &affines[idx_T], &affines[idx_S], &affines[idx_W], + &affines[idx_K00], &affines[idx_K01], &affines[idx_K10], + &affines[idx_K11])) { + return 0; + } + + // c0 + c1 == calculated + EC_SCALAR c; + ec_scalar_add(group, &c, &c0, &c1); + if (!ec_scalar_equal_vartime(group, &c, &calculated)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_INVALID_PROOF); + return 0; + } + + return 1; +} + +static int pmbtoken_sign(const PMBTOKEN_METHOD *method, + const TRUST_TOKEN_ISSUER_KEY *key, CBB *cbb, CBS *cbs, + size_t num_requested, size_t num_to_issue, + uint8_t private_metadata) { + const EC_GROUP *group = method->group; + if (num_requested < num_to_issue) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_INTERNAL_ERROR); + return 0; + } + + int ret = 0; + EC_JACOBIAN *Tps = reinterpret_cast( + OPENSSL_calloc(num_to_issue, sizeof(EC_JACOBIAN))); + EC_JACOBIAN *Sps = reinterpret_cast( + OPENSSL_calloc(num_to_issue, sizeof(EC_JACOBIAN))); + EC_JACOBIAN *Wps = reinterpret_cast( + OPENSSL_calloc(num_to_issue, sizeof(EC_JACOBIAN))); + EC_JACOBIAN *Wsps = reinterpret_cast( + OPENSSL_calloc(num_to_issue, sizeof(EC_JACOBIAN))); + EC_SCALAR *es = reinterpret_cast( + OPENSSL_calloc(num_to_issue, sizeof(EC_SCALAR))); + CBB batch_cbb; + CBB_zero(&batch_cbb); + + { + if (!Tps || !Sps || !Wps || !Wsps || !es || !CBB_init(&batch_cbb, 0) || + !point_to_cbb(&batch_cbb, method->group, &key->pubs) || + !point_to_cbb(&batch_cbb, method->group, &key->pub0) || + !point_to_cbb(&batch_cbb, method->group, &key->pub1)) { + goto err; + } + + for (size_t i = 0; i < 
num_to_issue; i++) { + EC_AFFINE Tp_affine; + EC_JACOBIAN Tp; + if (!cbs_get_prefixed_point(cbs, group, &Tp_affine, + method->prefix_point)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); + goto err; + } + ec_affine_to_jacobian(group, &Tp, &Tp_affine); + + EC_SCALAR xb, yb; + BN_ULONG mask = ((BN_ULONG)0) - (private_metadata & 1); + ec_scalar_select(group, &xb, mask, &key->x1, &key->x0); + ec_scalar_select(group, &yb, mask, &key->y1, &key->y0); + + uint8_t s[TRUST_TOKEN_NONCE_SIZE]; + RAND_bytes(s, TRUST_TOKEN_NONCE_SIZE); + // The |jacobians| and |affines| contain Sp, Wp, and Wsp. + EC_JACOBIAN jacobians[3]; + EC_AFFINE affines[3]; + if (!method->hash_s(group, &jacobians[0], &Tp_affine, s) || + !ec_point_mul_scalar_batch(group, &jacobians[1], &Tp, &xb, + &jacobians[0], &yb, nullptr, nullptr) || + !ec_point_mul_scalar_batch(group, &jacobians[2], &Tp, &key->xs, + &jacobians[0], &key->ys, nullptr, + nullptr) || + !ec_jacobian_to_affine_batch(group, affines, jacobians, 3) || + !CBB_add_bytes(cbb, s, TRUST_TOKEN_NONCE_SIZE) || + !cbb_add_prefixed_point(cbb, group, &affines[1], + method->prefix_point) || + !cbb_add_prefixed_point(cbb, group, &affines[2], + method->prefix_point)) { + goto err; + } + + if (!point_to_cbb(&batch_cbb, group, &Tp_affine) || + !point_to_cbb(&batch_cbb, group, &affines[0]) || + !point_to_cbb(&batch_cbb, group, &affines[1]) || + !point_to_cbb(&batch_cbb, group, &affines[2])) { + goto err; + } + Tps[i] = Tp; + Sps[i] = jacobians[0]; + Wps[i] = jacobians[1]; + Wsps[i] = jacobians[2]; + + if (!CBB_flush(cbb)) { + goto err; + } + } + + // The DLEQ batching construction is described in appendix B of + // https://eprint.iacr.org/2020/072/20200324:214215. Note the additional + // computations all act on public inputs. 
+ for (size_t i = 0; i < num_to_issue; i++) { + if (!hash_c_batch(method, &es[i], &batch_cbb, i)) { + goto err; + } + } + + EC_JACOBIAN Tp_batch, Sp_batch, Wp_batch, Wsp_batch; + if (!ec_point_mul_scalar_public_batch(group, &Tp_batch, + /*g_scalar=*/nullptr, Tps, es, + num_to_issue) || + !ec_point_mul_scalar_public_batch(group, &Sp_batch, + /*g_scalar=*/nullptr, Sps, es, + num_to_issue) || + !ec_point_mul_scalar_public_batch(group, &Wp_batch, + /*g_scalar=*/nullptr, Wps, es, + num_to_issue) || + !ec_point_mul_scalar_public_batch(group, &Wsp_batch, + /*g_scalar=*/nullptr, Wsps, es, + num_to_issue)) { + goto err; + } + + CBB proof; + if (!CBB_add_u16_length_prefixed(cbb, &proof) || + !dleq_generate(method, &proof, key, &Tp_batch, &Sp_batch, &Wp_batch, + &Wsp_batch, private_metadata) || + !CBB_flush(cbb)) { + goto err; + } + + // Skip over any unused requests. + size_t point_len = ec_point_byte_len(group, POINT_CONVERSION_UNCOMPRESSED); + size_t token_len = point_len; + if (method->prefix_point) { + token_len += 2; + } + if (!CBS_skip(cbs, token_len * (num_requested - num_to_issue))) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); + goto err; + } + + ret = 1; + } + +err: + Delete(Tps); + Delete(Sps); + Delete(Wps); + Delete(Wsps); + Delete(es); + CBB_cleanup(&batch_cbb); + return ret; +} + +static STACK_OF(TRUST_TOKEN) *pmbtoken_unblind( + const PMBTOKEN_METHOD *method, const TRUST_TOKEN_CLIENT_KEY *key, + const STACK_OF(TRUST_TOKEN_PRETOKEN) *pretokens, CBS *cbs, size_t count, + uint32_t key_id) { + const EC_GROUP *group = method->group; + if (count > sk_TRUST_TOKEN_PRETOKEN_num(pretokens)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); + return nullptr; + } + + int ok = 0; + STACK_OF(TRUST_TOKEN) *ret = sk_TRUST_TOKEN_new_null(); + EC_JACOBIAN *Tps = reinterpret_cast( + OPENSSL_calloc(count, sizeof(EC_JACOBIAN))); + EC_JACOBIAN *Sps = reinterpret_cast( + OPENSSL_calloc(count, sizeof(EC_JACOBIAN))); + EC_JACOBIAN *Wps = 
reinterpret_cast( + OPENSSL_calloc(count, sizeof(EC_JACOBIAN))); + EC_JACOBIAN *Wsps = reinterpret_cast( + OPENSSL_calloc(count, sizeof(EC_JACOBIAN))); + EC_SCALAR *es = + reinterpret_cast(OPENSSL_calloc(count, sizeof(EC_SCALAR))); + CBB batch_cbb; + CBB_zero(&batch_cbb); + if (ret == nullptr || Tps == nullptr || Sps == nullptr || Wps == nullptr || + Wsps == nullptr || es == nullptr || !CBB_init(&batch_cbb, 0) || + !point_to_cbb(&batch_cbb, method->group, &key->pubs) || + !point_to_cbb(&batch_cbb, method->group, &key->pub0) || + !point_to_cbb(&batch_cbb, method->group, &key->pub1)) { + goto err; + } + + for (size_t i = 0; i < count; i++) { + const TRUST_TOKEN_PRETOKEN *pretoken = + sk_TRUST_TOKEN_PRETOKEN_value(pretokens, i); + + uint8_t s[TRUST_TOKEN_NONCE_SIZE]; + EC_AFFINE Wp_affine, Wsp_affine; + if (!CBS_copy_bytes(cbs, s, TRUST_TOKEN_NONCE_SIZE) || + !cbs_get_prefixed_point(cbs, group, &Wp_affine, method->prefix_point) || + !cbs_get_prefixed_point(cbs, group, &Wsp_affine, + method->prefix_point)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); + goto err; + } + + ec_affine_to_jacobian(group, &Tps[i], &pretoken->Tp); + ec_affine_to_jacobian(group, &Wps[i], &Wp_affine); + ec_affine_to_jacobian(group, &Wsps[i], &Wsp_affine); + if (!method->hash_s(group, &Sps[i], &pretoken->Tp, s)) { + goto err; + } + + EC_AFFINE Sp_affine; + if (!point_to_cbb(&batch_cbb, group, &pretoken->Tp) || + !ec_jacobian_to_affine(group, &Sp_affine, &Sps[i]) || + !point_to_cbb(&batch_cbb, group, &Sp_affine) || + !point_to_cbb(&batch_cbb, group, &Wp_affine) || + !point_to_cbb(&batch_cbb, group, &Wsp_affine)) { + goto err; + } + + // Unblind the token. 
+ EC_JACOBIAN jacobians[3]; + EC_AFFINE affines[3]; + if (!ec_point_mul_scalar(group, &jacobians[0], &Sps[i], &pretoken->r) || + !ec_point_mul_scalar(group, &jacobians[1], &Wps[i], &pretoken->r) || + !ec_point_mul_scalar(group, &jacobians[2], &Wsps[i], &pretoken->r) || + !ec_jacobian_to_affine_batch(group, affines, jacobians, 3)) { + goto err; + } + + // Serialize the token. Include |key_id| to avoid an extra copy in the layer + // above. + CBB token_cbb; + size_t point_len = ec_point_byte_len(group, POINT_CONVERSION_UNCOMPRESSED); + if (!CBB_init(&token_cbb, + 4 + TRUST_TOKEN_NONCE_SIZE + 3 * (2 + point_len)) || + !CBB_add_u32(&token_cbb, key_id) || + !CBB_add_bytes(&token_cbb, pretoken->salt, TRUST_TOKEN_NONCE_SIZE) || + !cbb_add_prefixed_point(&token_cbb, group, &affines[0], + method->prefix_point) || + !cbb_add_prefixed_point(&token_cbb, group, &affines[1], + method->prefix_point) || + !cbb_add_prefixed_point(&token_cbb, group, &affines[2], + method->prefix_point) || + !CBB_flush(&token_cbb)) { + CBB_cleanup(&token_cbb); + goto err; + } + + TRUST_TOKEN *token = + TRUST_TOKEN_new(CBB_data(&token_cbb), CBB_len(&token_cbb)); + CBB_cleanup(&token_cbb); + if (token == nullptr || !sk_TRUST_TOKEN_push(ret, token)) { + TRUST_TOKEN_free(token); + goto err; + } + } + + // The DLEQ batching construction is described in appendix B of + // https://eprint.iacr.org/2020/072/20200324:214215. Note the additional + // computations all act on public inputs. 
+ for (size_t i = 0; i < count; i++) { + if (!hash_c_batch(method, &es[i], &batch_cbb, i)) { + goto err; + } + } + + EC_JACOBIAN Tp_batch, Sp_batch, Wp_batch, Wsp_batch; + if (!ec_point_mul_scalar_public_batch(group, &Tp_batch, + /*g_scalar=*/nullptr, Tps, es, count) || + !ec_point_mul_scalar_public_batch(group, &Sp_batch, + /*g_scalar=*/nullptr, Sps, es, count) || + !ec_point_mul_scalar_public_batch(group, &Wp_batch, + /*g_scalar=*/nullptr, Wps, es, count) || + !ec_point_mul_scalar_public_batch(group, &Wsp_batch, + /*g_scalar=*/nullptr, Wsps, es, + count)) { + goto err; + } + + CBS proof; + if (!CBS_get_u16_length_prefixed(cbs, &proof) || + !dleq_verify(method, &proof, key, &Tp_batch, &Sp_batch, &Wp_batch, + &Wsp_batch) || + CBS_len(&proof) != 0) { + goto err; + } + + ok = 1; + +err: + Delete(Tps); + Delete(Sps); + Delete(Wps); + Delete(Wsps); + Delete(es); + CBB_cleanup(&batch_cbb); + if (!ok) { + sk_TRUST_TOKEN_pop_free(ret, TRUST_TOKEN_free); + ret = nullptr; + } + return ret; +} + +static int pmbtoken_read(const PMBTOKEN_METHOD *method, + const TRUST_TOKEN_ISSUER_KEY *key, + uint8_t out_nonce[TRUST_TOKEN_NONCE_SIZE], + uint8_t *out_private_metadata, const uint8_t *token, + size_t token_len, int include_message, + const uint8_t *msg, size_t msg_len) { + const EC_GROUP *group = method->group; + CBS cbs, salt; + CBS_init(&cbs, token, token_len); + EC_AFFINE S, W, Ws; + if (!CBS_get_bytes(&cbs, &salt, TRUST_TOKEN_NONCE_SIZE) || + !cbs_get_prefixed_point(&cbs, group, &S, method->prefix_point) || + !cbs_get_prefixed_point(&cbs, group, &W, method->prefix_point) || + !cbs_get_prefixed_point(&cbs, group, &Ws, method->prefix_point) || + CBS_len(&cbs) != 0) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_INVALID_TOKEN); + return 0; + } + + if (include_message) { + SHA512_CTX hash_ctx; + assert(SHA512_DIGEST_LENGTH == TRUST_TOKEN_NONCE_SIZE); + SHA512_Init(&hash_ctx); + SHA512_Update(&hash_ctx, CBS_data(&salt), CBS_len(&salt)); + SHA512_Update(&hash_ctx, msg, msg_len); + 
SHA512_Final(out_nonce, &hash_ctx); + } else { + OPENSSL_memcpy(out_nonce, CBS_data(&salt), CBS_len(&salt)); + } + + EC_JACOBIAN T; + if (!method->hash_t(group, &T, out_nonce)) { + return 0; + } + + // We perform three multiplications with S and T. This is enough that it is + // worth using |ec_point_mul_scalar_precomp|. + EC_JACOBIAN S_jacobian; + EC_PRECOMP S_precomp, T_precomp; + ec_affine_to_jacobian(group, &S_jacobian, &S); + if (!ec_init_precomp(group, &S_precomp, &S_jacobian) || + !ec_init_precomp(group, &T_precomp, &T)) { + return 0; + } + + EC_JACOBIAN Ws_calculated; + // Check the validity of the token. + if (!ec_point_mul_scalar_precomp(group, &Ws_calculated, &T_precomp, &key->xs, + &S_precomp, &key->ys, nullptr, nullptr) || + !ec_affine_jacobian_equal(group, &Ws, &Ws_calculated)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_BAD_VALIDITY_CHECK); + return 0; + } + + EC_JACOBIAN W0, W1; + if (!ec_point_mul_scalar_precomp(group, &W0, &T_precomp, &key->x0, &S_precomp, + &key->y0, nullptr, nullptr) || + !ec_point_mul_scalar_precomp(group, &W1, &T_precomp, &key->x1, &S_precomp, + &key->y1, nullptr, nullptr)) { + return 0; + } + + const int is_W0 = ec_affine_jacobian_equal(group, &W, &W0); + const int is_W1 = ec_affine_jacobian_equal(group, &W, &W1); + const int is_valid = is_W0 ^ is_W1; + if (!is_valid) { + // Invalid tokens will fail the validity check above. + OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_INTERNAL_ERROR); + return 0; + } + + *out_private_metadata = is_W1; + return 1; +} + + +// PMBTokens experiment v1. 
+ +static int pmbtoken_exp1_hash_t(const EC_GROUP *group, EC_JACOBIAN *out, + const uint8_t t[TRUST_TOKEN_NONCE_SIZE]) { + const uint8_t kHashTLabel[] = "PMBTokens Experiment V1 HashT"; + return ec_hash_to_curve_p384_xmd_sha512_sswu_draft07( + group, out, kHashTLabel, sizeof(kHashTLabel), t, TRUST_TOKEN_NONCE_SIZE); +} + +static int pmbtoken_exp1_hash_s(const EC_GROUP *group, EC_JACOBIAN *out, + const EC_AFFINE *t, + const uint8_t s[TRUST_TOKEN_NONCE_SIZE]) { + const uint8_t kHashSLabel[] = "PMBTokens Experiment V1 HashS"; + int ret = 0; + CBB cbb; + uint8_t *buf = nullptr; + size_t len; + if (!CBB_init(&cbb, 0) || !point_to_cbb(&cbb, group, t) || + !CBB_add_bytes(&cbb, s, TRUST_TOKEN_NONCE_SIZE) || + !CBB_finish(&cbb, &buf, &len) || + !ec_hash_to_curve_p384_xmd_sha512_sswu_draft07( + group, out, kHashSLabel, sizeof(kHashSLabel), buf, len)) { + goto err; + } + + ret = 1; + +err: + OPENSSL_free(buf); + CBB_cleanup(&cbb); + return ret; +} + +static int pmbtoken_exp1_hash_c(const EC_GROUP *group, EC_SCALAR *out, + uint8_t *buf, size_t len) { + const uint8_t kHashCLabel[] = "PMBTokens Experiment V1 HashC"; + return ec_hash_to_scalar_p384_xmd_sha512_draft07( + group, out, kHashCLabel, sizeof(kHashCLabel), buf, len); +} + +static int pmbtoken_exp1_hash_to_scalar(const EC_GROUP *group, EC_SCALAR *out, + uint8_t *buf, size_t len) { + const uint8_t kHashLabel[] = "PMBTokens Experiment V1 HashToScalar"; + return ec_hash_to_scalar_p384_xmd_sha512_draft07( + group, out, kHashLabel, sizeof(kHashLabel), buf, len); +} + +static int pmbtoken_exp1_ok = 0; +static PMBTOKEN_METHOD pmbtoken_exp1_method; +static CRYPTO_once_t pmbtoken_exp1_method_once = CRYPTO_ONCE_INIT; + +static void pmbtoken_exp1_init_method_impl() { + // This is the output of |ec_hash_to_scalar_p384_xmd_sha512_draft07| with DST + // "PMBTokens Experiment V1 HashH" and message "generator". 
+ static const uint8_t kH[] = { + 0x04, 0x82, 0xd5, 0x68, 0xf5, 0x39, 0xf6, 0x08, 0x19, 0xa1, 0x75, + 0x9f, 0x98, 0xb5, 0x10, 0xf5, 0x0b, 0x9d, 0x2b, 0xe1, 0x64, 0x4d, + 0x02, 0x76, 0x18, 0x11, 0xf8, 0x2f, 0xd3, 0x33, 0x25, 0x1f, 0x2c, + 0xb8, 0xf6, 0xf1, 0x9e, 0x93, 0x85, 0x79, 0xb3, 0xb7, 0x81, 0xa3, + 0xe6, 0x23, 0xc3, 0x1c, 0xff, 0x03, 0xd9, 0x40, 0x6c, 0xec, 0xe0, + 0x4d, 0xea, 0xdf, 0x9d, 0x94, 0xd1, 0x87, 0xab, 0x27, 0xf7, 0x4f, + 0x53, 0xea, 0xa3, 0x18, 0x72, 0xb9, 0xd1, 0x56, 0xa0, 0x4e, 0x81, + 0xaa, 0xeb, 0x1c, 0x22, 0x6d, 0x39, 0x1c, 0x5e, 0xb1, 0x27, 0xfc, + 0x87, 0xc3, 0x95, 0xd0, 0x13, 0xb7, 0x0b, 0x5c, 0xc7, + }; + + pmbtoken_exp1_ok = pmbtoken_init_method( + &pmbtoken_exp1_method, EC_group_p384(), kH, sizeof(kH), + pmbtoken_exp1_hash_t, pmbtoken_exp1_hash_s, pmbtoken_exp1_hash_c, + pmbtoken_exp1_hash_to_scalar, 1); +} + +static int pmbtoken_exp1_init_method() { + CRYPTO_once(&pmbtoken_exp1_method_once, pmbtoken_exp1_init_method_impl); + if (!pmbtoken_exp1_ok) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_INTERNAL_ERROR); + return 0; + } + return 1; +} + +int bssl::pmbtoken_exp1_generate_key(CBB *out_private, CBB *out_public) { + if (!pmbtoken_exp1_init_method()) { + return 0; + } + + return pmbtoken_generate_key(&pmbtoken_exp1_method, out_private, out_public); +} + +int bssl::pmbtoken_exp1_derive_key_from_secret(CBB *out_private, + CBB *out_public, + const uint8_t *secret, + size_t secret_len) { + if (!pmbtoken_exp1_init_method()) { + return 0; + } + + return pmbtoken_derive_key_from_secret(&pmbtoken_exp1_method, out_private, + out_public, secret, secret_len); +} + +int bssl::pmbtoken_exp1_client_key_from_bytes(TRUST_TOKEN_CLIENT_KEY *key, + const uint8_t *in, size_t len) { + if (!pmbtoken_exp1_init_method()) { + return 0; + } + return pmbtoken_client_key_from_bytes(&pmbtoken_exp1_method, key, in, len); +} + +int bssl::pmbtoken_exp1_issuer_key_from_bytes(TRUST_TOKEN_ISSUER_KEY *key, + const uint8_t *in, size_t len) { + if (!pmbtoken_exp1_init_method()) 
{ + return 0; + } + return pmbtoken_issuer_key_from_bytes(&pmbtoken_exp1_method, key, in, len); +} + +STACK_OF(TRUST_TOKEN_PRETOKEN) *bssl::pmbtoken_exp1_blind(CBB *cbb, + size_t count, + int include_message, + const uint8_t *msg, + size_t msg_len) { + if (!pmbtoken_exp1_init_method()) { + return nullptr; + } + return pmbtoken_blind(&pmbtoken_exp1_method, cbb, count, include_message, msg, + msg_len); +} + +int bssl::pmbtoken_exp1_sign(const TRUST_TOKEN_ISSUER_KEY *key, CBB *cbb, + CBS *cbs, size_t num_requested, + size_t num_to_issue, uint8_t private_metadata) { + if (!pmbtoken_exp1_init_method()) { + return 0; + } + return pmbtoken_sign(&pmbtoken_exp1_method, key, cbb, cbs, num_requested, + num_to_issue, private_metadata); +} + +STACK_OF(TRUST_TOKEN) *bssl::pmbtoken_exp1_unblind( + const TRUST_TOKEN_CLIENT_KEY *key, + const STACK_OF(TRUST_TOKEN_PRETOKEN) *pretokens, CBS *cbs, size_t count, + uint32_t key_id) { + if (!pmbtoken_exp1_init_method()) { + return nullptr; + } + return pmbtoken_unblind(&pmbtoken_exp1_method, key, pretokens, cbs, count, + key_id); +} + +int bssl::pmbtoken_exp1_read(const TRUST_TOKEN_ISSUER_KEY *key, + uint8_t out_nonce[TRUST_TOKEN_NONCE_SIZE], + uint8_t *out_private_metadata, + const uint8_t *token, size_t token_len, + int include_message, const uint8_t *msg, + size_t msg_len) { + if (!pmbtoken_exp1_init_method()) { + return 0; + } + return pmbtoken_read(&pmbtoken_exp1_method, key, out_nonce, + out_private_metadata, token, token_len, include_message, + msg, msg_len); +} + +int bssl::pmbtoken_exp1_get_h_for_testing(uint8_t out[97]) { + if (!pmbtoken_exp1_init_method()) { + return 0; + } + EC_AFFINE h; + return ec_jacobian_to_affine(pmbtoken_exp1_method.group, &h, + &pmbtoken_exp1_method.h) && + ec_point_to_bytes(pmbtoken_exp1_method.group, &h, + POINT_CONVERSION_UNCOMPRESSED, out, 97) == 97; +} + +// PMBTokens experiment v2. 
+ +static int pmbtoken_exp2_hash_t(const EC_GROUP *group, EC_JACOBIAN *out, + const uint8_t t[TRUST_TOKEN_NONCE_SIZE]) { + const uint8_t kHashTLabel[] = "PMBTokens Experiment V2 HashT"; + return ec_hash_to_curve_p384_xmd_sha512_sswu_draft07( + group, out, kHashTLabel, sizeof(kHashTLabel), t, TRUST_TOKEN_NONCE_SIZE); +} + +static int pmbtoken_exp2_hash_s(const EC_GROUP *group, EC_JACOBIAN *out, + const EC_AFFINE *t, + const uint8_t s[TRUST_TOKEN_NONCE_SIZE]) { + const uint8_t kHashSLabel[] = "PMBTokens Experiment V2 HashS"; + int ret = 0; + CBB cbb; + uint8_t *buf = nullptr; + size_t len; + if (!CBB_init(&cbb, 0) || !point_to_cbb(&cbb, group, t) || + !CBB_add_bytes(&cbb, s, TRUST_TOKEN_NONCE_SIZE) || + !CBB_finish(&cbb, &buf, &len) || + !ec_hash_to_curve_p384_xmd_sha512_sswu_draft07( + group, out, kHashSLabel, sizeof(kHashSLabel), buf, len)) { + goto err; + } + + ret = 1; + +err: + OPENSSL_free(buf); + CBB_cleanup(&cbb); + return ret; +} + +static int pmbtoken_exp2_hash_c(const EC_GROUP *group, EC_SCALAR *out, + uint8_t *buf, size_t len) { + const uint8_t kHashCLabel[] = "PMBTokens Experiment V2 HashC"; + return ec_hash_to_scalar_p384_xmd_sha512_draft07( + group, out, kHashCLabel, sizeof(kHashCLabel), buf, len); +} + +static int pmbtoken_exp2_hash_to_scalar(const EC_GROUP *group, EC_SCALAR *out, + uint8_t *buf, size_t len) { + const uint8_t kHashLabel[] = "PMBTokens Experiment V2 HashToScalar"; + return ec_hash_to_scalar_p384_xmd_sha512_draft07( + group, out, kHashLabel, sizeof(kHashLabel), buf, len); +} + +static int pmbtoken_exp2_ok = 0; +static PMBTOKEN_METHOD pmbtoken_exp2_method; +static CRYPTO_once_t pmbtoken_exp2_method_once = CRYPTO_ONCE_INIT; + +static void pmbtoken_exp2_init_method_impl() { + // This is the output of |ec_hash_to_scalar_p384_xmd_sha512_draft07| with DST + // "PMBTokens Experiment V2 HashH" and message "generator". 
+ static const uint8_t kH[] = { + 0x04, 0xbc, 0x27, 0x24, 0x99, 0xfa, 0xc9, 0xa4, 0x74, 0x6f, 0xf9, + 0x07, 0x81, 0x55, 0xf8, 0x1f, 0x6f, 0xda, 0x09, 0xe7, 0x8c, 0x5d, + 0x9e, 0x4e, 0x14, 0x7c, 0x53, 0x14, 0xbc, 0x7e, 0x29, 0x57, 0x92, + 0x17, 0x94, 0x6e, 0xd2, 0xdf, 0xa5, 0x31, 0x1b, 0x4e, 0xb7, 0xfc, + 0x93, 0xe3, 0x6e, 0x14, 0x1f, 0x4f, 0x14, 0xf3, 0xe5, 0x47, 0x61, + 0x1c, 0x2c, 0x72, 0x25, 0xf0, 0x4a, 0x45, 0x23, 0x2d, 0x57, 0x93, + 0x0e, 0xb2, 0x55, 0xb8, 0x57, 0x25, 0x4c, 0x1e, 0xdb, 0xfd, 0x58, + 0x70, 0x17, 0x9a, 0xbb, 0x9e, 0x5e, 0x93, 0x9e, 0x92, 0xd3, 0xe8, + 0x25, 0x62, 0xbf, 0x59, 0xb2, 0xd2, 0x3d, 0x71, 0xff}; + + pmbtoken_exp2_ok = pmbtoken_init_method( + &pmbtoken_exp2_method, EC_group_p384(), kH, sizeof(kH), + pmbtoken_exp2_hash_t, pmbtoken_exp2_hash_s, pmbtoken_exp2_hash_c, + pmbtoken_exp2_hash_to_scalar, 0); +} + +static int pmbtoken_exp2_init_method() { + CRYPTO_once(&pmbtoken_exp2_method_once, pmbtoken_exp2_init_method_impl); + if (!pmbtoken_exp2_ok) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_INTERNAL_ERROR); + return 0; + } + return 1; +} + +int bssl::pmbtoken_exp2_generate_key(CBB *out_private, CBB *out_public) { + if (!pmbtoken_exp2_init_method()) { + return 0; + } + + return pmbtoken_generate_key(&pmbtoken_exp2_method, out_private, out_public); +} + + +int bssl::pmbtoken_exp2_derive_key_from_secret(CBB *out_private, + CBB *out_public, + const uint8_t *secret, + size_t secret_len) { + if (!pmbtoken_exp2_init_method()) { + return 0; + } + + return pmbtoken_derive_key_from_secret(&pmbtoken_exp2_method, out_private, + out_public, secret, secret_len); +} + +int bssl::pmbtoken_exp2_client_key_from_bytes(TRUST_TOKEN_CLIENT_KEY *key, + const uint8_t *in, size_t len) { + if (!pmbtoken_exp2_init_method()) { + return 0; + } + return pmbtoken_client_key_from_bytes(&pmbtoken_exp2_method, key, in, len); +} + +int bssl::pmbtoken_exp2_issuer_key_from_bytes(TRUST_TOKEN_ISSUER_KEY *key, + const uint8_t *in, size_t len) { + if (!pmbtoken_exp2_init_method()) { 
+ return 0; + } + return pmbtoken_issuer_key_from_bytes(&pmbtoken_exp2_method, key, in, len); +} + +STACK_OF(TRUST_TOKEN_PRETOKEN) *bssl::pmbtoken_exp2_blind(CBB *cbb, + size_t count, + int include_message, + const uint8_t *msg, + size_t msg_len) { + if (!pmbtoken_exp2_init_method()) { + return nullptr; + } + return pmbtoken_blind(&pmbtoken_exp2_method, cbb, count, include_message, msg, + msg_len); +} + +int bssl::pmbtoken_exp2_sign(const TRUST_TOKEN_ISSUER_KEY *key, CBB *cbb, + CBS *cbs, size_t num_requested, + size_t num_to_issue, uint8_t private_metadata) { + if (!pmbtoken_exp2_init_method()) { + return 0; + } + return pmbtoken_sign(&pmbtoken_exp2_method, key, cbb, cbs, num_requested, + num_to_issue, private_metadata); +} + +STACK_OF(TRUST_TOKEN) *bssl::pmbtoken_exp2_unblind( + const TRUST_TOKEN_CLIENT_KEY *key, + const STACK_OF(TRUST_TOKEN_PRETOKEN) *pretokens, CBS *cbs, size_t count, + uint32_t key_id) { + if (!pmbtoken_exp2_init_method()) { + return nullptr; + } + return pmbtoken_unblind(&pmbtoken_exp2_method, key, pretokens, cbs, count, + key_id); +} + +int bssl::pmbtoken_exp2_read(const TRUST_TOKEN_ISSUER_KEY *key, + uint8_t out_nonce[TRUST_TOKEN_NONCE_SIZE], + uint8_t *out_private_metadata, + const uint8_t *token, size_t token_len, + int include_message, const uint8_t *msg, + size_t msg_len) { + if (!pmbtoken_exp2_init_method()) { + return 0; + } + return pmbtoken_read(&pmbtoken_exp2_method, key, out_nonce, + out_private_metadata, token, token_len, include_message, + msg, msg_len); +} + +int bssl::pmbtoken_exp2_get_h_for_testing(uint8_t out[97]) { + if (!pmbtoken_exp2_init_method()) { + return 0; + } + EC_AFFINE h; + return ec_jacobian_to_affine(pmbtoken_exp2_method.group, &h, + &pmbtoken_exp2_method.h) && + ec_point_to_bytes(pmbtoken_exp2_method.group, &h, + POINT_CONVERSION_UNCOMPRESSED, out, 97) == 97; +} + +// PMBTokens PST v1. 
+ +static int pmbtoken_pst1_hash_t(const EC_GROUP *group, EC_JACOBIAN *out, + const uint8_t t[TRUST_TOKEN_NONCE_SIZE]) { + const uint8_t kHashTLabel[] = "PMBTokens PST V1 HashT"; + return ec_hash_to_curve_p384_xmd_sha384_sswu( + group, out, kHashTLabel, sizeof(kHashTLabel), t, TRUST_TOKEN_NONCE_SIZE); +} + +static int pmbtoken_pst1_hash_s(const EC_GROUP *group, EC_JACOBIAN *out, + const EC_AFFINE *t, + const uint8_t s[TRUST_TOKEN_NONCE_SIZE]) { + const uint8_t kHashSLabel[] = "PMBTokens PST V1 HashS"; + int ret = 0; + CBB cbb; + uint8_t *buf = nullptr; + size_t len; + if (!CBB_init(&cbb, 0) || !point_to_cbb(&cbb, group, t) || + !CBB_add_bytes(&cbb, s, TRUST_TOKEN_NONCE_SIZE) || + !CBB_finish(&cbb, &buf, &len) || + !ec_hash_to_curve_p384_xmd_sha384_sswu(group, out, kHashSLabel, + sizeof(kHashSLabel), buf, len)) { + goto err; + } + + ret = 1; + +err: + OPENSSL_free(buf); + CBB_cleanup(&cbb); + return ret; +} + +static int pmbtoken_pst1_hash_c(const EC_GROUP *group, EC_SCALAR *out, + uint8_t *buf, size_t len) { + const uint8_t kHashCLabel[] = "PMBTokens PST V1 HashC"; + return ec_hash_to_scalar_p384_xmd_sha384(group, out, kHashCLabel, + sizeof(kHashCLabel), buf, len); +} + +static int pmbtoken_pst1_hash_to_scalar(const EC_GROUP *group, EC_SCALAR *out, + uint8_t *buf, size_t len) { + const uint8_t kHashLabel[] = "PMBTokens PST V1 HashToScalar"; + return ec_hash_to_scalar_p384_xmd_sha384(group, out, kHashLabel, + sizeof(kHashLabel), buf, len); +} + +static int pmbtoken_pst1_ok = 0; +static PMBTOKEN_METHOD pmbtoken_pst1_method; +static CRYPTO_once_t pmbtoken_pst1_method_once = CRYPTO_ONCE_INIT; + +static void pmbtoken_pst1_init_method_impl() { + // This is the output of |ec_hash_to_scalar_p384_xmd_sha384| with DST + // "PMBTokens PST V1 HashH" and message "generator". 
+ static const uint8_t kH[] = { + 0x04, 0x4c, 0xfa, 0xd4, 0x33, 0x6d, 0x8c, 0x4e, 0x18, 0xce, 0x1a, + 0x82, 0x7b, 0x53, 0x8c, 0xf8, 0x63, 0x18, 0xe5, 0xa3, 0x96, 0x0d, + 0x05, 0xde, 0xf4, 0x83, 0xa7, 0xd8, 0xde, 0x9c, 0x50, 0x81, 0x38, + 0xc9, 0x38, 0x25, 0xa3, 0x70, 0x97, 0xc1, 0x1c, 0x33, 0x2e, 0x83, + 0x68, 0x64, 0x9c, 0x53, 0x73, 0xc3, 0x03, 0xc1, 0xa9, 0xd8, 0x92, + 0xa2, 0x32, 0xf4, 0x22, 0x40, 0x07, 0x2d, 0x9b, 0x6f, 0xab, 0xff, + 0x2a, 0x92, 0x03, 0xb1, 0x73, 0x09, 0x1a, 0x6a, 0x4a, 0xc2, 0x4c, + 0xac, 0x13, 0x59, 0xf4, 0x28, 0x0e, 0x78, 0x69, 0xa5, 0xdf, 0x0d, + 0x74, 0xeb, 0x14, 0xca, 0x8a, 0x32, 0xbb, 0xd3, 0x91}; + + pmbtoken_pst1_ok = pmbtoken_init_method( + &pmbtoken_pst1_method, EC_group_p384(), kH, sizeof(kH), + pmbtoken_pst1_hash_t, pmbtoken_pst1_hash_s, pmbtoken_pst1_hash_c, + pmbtoken_pst1_hash_to_scalar, 0); +} + +static int pmbtoken_pst1_init_method() { + CRYPTO_once(&pmbtoken_pst1_method_once, pmbtoken_pst1_init_method_impl); + if (!pmbtoken_pst1_ok) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_INTERNAL_ERROR); + return 0; + } + return 1; +} + +int bssl::pmbtoken_pst1_generate_key(CBB *out_private, CBB *out_public) { + if (!pmbtoken_pst1_init_method()) { + return 0; + } + + return pmbtoken_generate_key(&pmbtoken_pst1_method, out_private, out_public); +} + + +int bssl::pmbtoken_pst1_derive_key_from_secret(CBB *out_private, + CBB *out_public, + const uint8_t *secret, + size_t secret_len) { + if (!pmbtoken_pst1_init_method()) { + return 0; + } + + return pmbtoken_derive_key_from_secret(&pmbtoken_pst1_method, out_private, + out_public, secret, secret_len); +} + +int bssl::pmbtoken_pst1_client_key_from_bytes(TRUST_TOKEN_CLIENT_KEY *key, + const uint8_t *in, size_t len) { + if (!pmbtoken_pst1_init_method()) { + return 0; + } + return pmbtoken_client_key_from_bytes(&pmbtoken_pst1_method, key, in, len); +} + +int bssl::pmbtoken_pst1_issuer_key_from_bytes(TRUST_TOKEN_ISSUER_KEY *key, + const uint8_t *in, size_t len) { + if (!pmbtoken_pst1_init_method()) { 
+ return 0; + } + return pmbtoken_issuer_key_from_bytes(&pmbtoken_pst1_method, key, in, len); +} + +STACK_OF(TRUST_TOKEN_PRETOKEN) *bssl::pmbtoken_pst1_blind(CBB *cbb, + size_t count, + int include_message, + const uint8_t *msg, + size_t msg_len) { + if (!pmbtoken_pst1_init_method()) { + return nullptr; + } + return pmbtoken_blind(&pmbtoken_pst1_method, cbb, count, include_message, msg, + msg_len); +} + +int bssl::pmbtoken_pst1_sign(const TRUST_TOKEN_ISSUER_KEY *key, CBB *cbb, + CBS *cbs, size_t num_requested, + size_t num_to_issue, uint8_t private_metadata) { + if (!pmbtoken_pst1_init_method()) { + return 0; + } + return pmbtoken_sign(&pmbtoken_pst1_method, key, cbb, cbs, num_requested, + num_to_issue, private_metadata); +} + +STACK_OF(TRUST_TOKEN) *bssl::pmbtoken_pst1_unblind( + const TRUST_TOKEN_CLIENT_KEY *key, + const STACK_OF(TRUST_TOKEN_PRETOKEN) *pretokens, CBS *cbs, size_t count, + uint32_t key_id) { + if (!pmbtoken_pst1_init_method()) { + return nullptr; + } + return pmbtoken_unblind(&pmbtoken_pst1_method, key, pretokens, cbs, count, + key_id); +} + +int bssl::pmbtoken_pst1_read(const TRUST_TOKEN_ISSUER_KEY *key, + uint8_t out_nonce[TRUST_TOKEN_NONCE_SIZE], + uint8_t *out_private_metadata, + const uint8_t *token, size_t token_len, + int include_message, const uint8_t *msg, + size_t msg_len) { + if (!pmbtoken_pst1_init_method()) { + return 0; + } + return pmbtoken_read(&pmbtoken_pst1_method, key, out_nonce, + out_private_metadata, token, token_len, include_message, + msg, msg_len); +} + +int bssl::pmbtoken_pst1_get_h_for_testing(uint8_t out[97]) { + if (!pmbtoken_pst1_init_method()) { + return 0; + } + EC_AFFINE h; + return ec_jacobian_to_affine(pmbtoken_pst1_method.group, &h, + &pmbtoken_pst1_method.h) && + ec_point_to_bytes(pmbtoken_pst1_method.group, &h, + POINT_CONVERSION_UNCOMPRESSED, out, 97) == 97; +} diff --git a/third_party/boringssl/src/crypto/trust_token/trust_token.c b/third_party/boringssl/src/crypto/trust_token/trust_token.c deleted file mode 
100644 index 5afb487c..00000000 --- a/third_party/boringssl/src/crypto/trust_token/trust_token.c +++ /dev/null @@ -1,898 +0,0 @@ -/* Copyright (c) 2019, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ - -#include -#include -#include -#include -#include -#include - -#include "internal.h" - - -// The Trust Token API is described in -// https://github.com/WICG/trust-token-api/blob/master/README.md and provides a -// protocol for issuing and redeeming tokens built on top of the PMBTokens -// construction. 
- -const TRUST_TOKEN_METHOD *TRUST_TOKEN_experiment_v1(void) { - static const TRUST_TOKEN_METHOD kMethod = { - pmbtoken_exp1_generate_key, - pmbtoken_exp1_derive_key_from_secret, - pmbtoken_exp1_client_key_from_bytes, - pmbtoken_exp1_issuer_key_from_bytes, - pmbtoken_exp1_blind, - pmbtoken_exp1_sign, - pmbtoken_exp1_unblind, - pmbtoken_exp1_read, - 1, /* has_private_metadata */ - 3, /* max_keys */ - 1, /* has_srr */ - }; - return &kMethod; -} - -const TRUST_TOKEN_METHOD *TRUST_TOKEN_experiment_v2_voprf(void) { - static const TRUST_TOKEN_METHOD kMethod = { - voprf_exp2_generate_key, - voprf_exp2_derive_key_from_secret, - voprf_exp2_client_key_from_bytes, - voprf_exp2_issuer_key_from_bytes, - voprf_exp2_blind, - voprf_exp2_sign, - voprf_exp2_unblind, - voprf_exp2_read, - 0, /* has_private_metadata */ - 6, /* max_keys */ - 0, /* has_srr */ - }; - return &kMethod; -} - -const TRUST_TOKEN_METHOD *TRUST_TOKEN_experiment_v2_pmb(void) { - static const TRUST_TOKEN_METHOD kMethod = { - pmbtoken_exp2_generate_key, - pmbtoken_exp2_derive_key_from_secret, - pmbtoken_exp2_client_key_from_bytes, - pmbtoken_exp2_issuer_key_from_bytes, - pmbtoken_exp2_blind, - pmbtoken_exp2_sign, - pmbtoken_exp2_unblind, - pmbtoken_exp2_read, - 1, /* has_private_metadata */ - 3, /* max_keys */ - 0, /* has_srr */ - }; - return &kMethod; -} - -void TRUST_TOKEN_PRETOKEN_free(TRUST_TOKEN_PRETOKEN *pretoken) { - OPENSSL_free(pretoken); -} - -TRUST_TOKEN *TRUST_TOKEN_new(const uint8_t *data, size_t len) { - TRUST_TOKEN *ret = OPENSSL_malloc(sizeof(TRUST_TOKEN)); - if (ret == NULL) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - return NULL; - } - OPENSSL_memset(ret, 0, sizeof(TRUST_TOKEN)); - ret->data = OPENSSL_memdup(data, len); - if (len != 0 && ret->data == NULL) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - OPENSSL_free(ret); - return NULL; - } - ret->len = len; - return ret; -} - -void TRUST_TOKEN_free(TRUST_TOKEN *token) { - if (token == NULL) { - return; - } - 
OPENSSL_free(token->data); - OPENSSL_free(token); -} - -int TRUST_TOKEN_generate_key(const TRUST_TOKEN_METHOD *method, - uint8_t *out_priv_key, size_t *out_priv_key_len, - size_t max_priv_key_len, uint8_t *out_pub_key, - size_t *out_pub_key_len, size_t max_pub_key_len, - uint32_t id) { - // Prepend the key ID in front of the PMBTokens format. - int ret = 0; - CBB priv_cbb, pub_cbb; - CBB_zero(&priv_cbb); - CBB_zero(&pub_cbb); - if (!CBB_init_fixed(&priv_cbb, out_priv_key, max_priv_key_len) || - !CBB_init_fixed(&pub_cbb, out_pub_key, max_pub_key_len) || - !CBB_add_u32(&priv_cbb, id) || - !CBB_add_u32(&pub_cbb, id)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_BUFFER_TOO_SMALL); - goto err; - } - - if (!method->generate_key(&priv_cbb, &pub_cbb)) { - goto err; - } - - if (!CBB_finish(&priv_cbb, NULL, out_priv_key_len) || - !CBB_finish(&pub_cbb, NULL, out_pub_key_len)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_BUFFER_TOO_SMALL); - goto err; - } - - ret = 1; - -err: - CBB_cleanup(&priv_cbb); - CBB_cleanup(&pub_cbb); - return ret; -} - -int TRUST_TOKEN_derive_key_from_secret( - const TRUST_TOKEN_METHOD *method, uint8_t *out_priv_key, - size_t *out_priv_key_len, size_t max_priv_key_len, uint8_t *out_pub_key, - size_t *out_pub_key_len, size_t max_pub_key_len, uint32_t id, - const uint8_t *secret, size_t secret_len) { - // Prepend the key ID in front of the PMBTokens format. 
- int ret = 0; - CBB priv_cbb, pub_cbb; - CBB_zero(&priv_cbb); - CBB_zero(&pub_cbb); - if (!CBB_init_fixed(&priv_cbb, out_priv_key, max_priv_key_len) || - !CBB_init_fixed(&pub_cbb, out_pub_key, max_pub_key_len) || - !CBB_add_u32(&priv_cbb, id) || - !CBB_add_u32(&pub_cbb, id)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_BUFFER_TOO_SMALL); - goto err; - } - - if (!method->derive_key_from_secret(&priv_cbb, &pub_cbb, secret, - secret_len)) { - goto err; - } - - if (!CBB_finish(&priv_cbb, NULL, out_priv_key_len) || - !CBB_finish(&pub_cbb, NULL, out_pub_key_len)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_BUFFER_TOO_SMALL); - goto err; - } - - ret = 1; - -err: - CBB_cleanup(&priv_cbb); - CBB_cleanup(&pub_cbb); - return ret; -} - -TRUST_TOKEN_CLIENT *TRUST_TOKEN_CLIENT_new(const TRUST_TOKEN_METHOD *method, - size_t max_batchsize) { - if (max_batchsize > 0xffff) { - // The protocol supports only two-byte token counts. - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_OVERFLOW); - return NULL; - } - - TRUST_TOKEN_CLIENT *ret = OPENSSL_malloc(sizeof(TRUST_TOKEN_CLIENT)); - if (ret == NULL) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - return NULL; - } - OPENSSL_memset(ret, 0, sizeof(TRUST_TOKEN_CLIENT)); - ret->method = method; - ret->max_batchsize = (uint16_t)max_batchsize; - return ret; -} - -void TRUST_TOKEN_CLIENT_free(TRUST_TOKEN_CLIENT *ctx) { - if (ctx == NULL) { - return; - } - EVP_PKEY_free(ctx->srr_key); - sk_TRUST_TOKEN_PRETOKEN_pop_free(ctx->pretokens, TRUST_TOKEN_PRETOKEN_free); - OPENSSL_free(ctx); -} - -int TRUST_TOKEN_CLIENT_add_key(TRUST_TOKEN_CLIENT *ctx, size_t *out_key_index, - const uint8_t *key, size_t key_len) { - if (ctx->num_keys == OPENSSL_ARRAY_SIZE(ctx->keys) || - ctx->num_keys >= ctx->method->max_keys) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_TOO_MANY_KEYS); - return 0; - } - - struct trust_token_client_key_st *key_s = &ctx->keys[ctx->num_keys]; - CBS cbs; - CBS_init(&cbs, key, key_len); - uint32_t key_id; - if 
(!CBS_get_u32(&cbs, &key_id) || - !ctx->method->client_key_from_bytes(&key_s->key, CBS_data(&cbs), - CBS_len(&cbs))) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); - return 0; - } - key_s->id = key_id; - *out_key_index = ctx->num_keys; - ctx->num_keys += 1; - return 1; -} - -int TRUST_TOKEN_CLIENT_set_srr_key(TRUST_TOKEN_CLIENT *ctx, EVP_PKEY *key) { - if (!ctx->method->has_srr) { - return 1; - } - EVP_PKEY_free(ctx->srr_key); - EVP_PKEY_up_ref(key); - ctx->srr_key = key; - return 1; -} - -int TRUST_TOKEN_CLIENT_begin_issuance(TRUST_TOKEN_CLIENT *ctx, uint8_t **out, - size_t *out_len, size_t count) { - if (count > ctx->max_batchsize) { - count = ctx->max_batchsize; - } - - int ret = 0; - CBB request; - STACK_OF(TRUST_TOKEN_PRETOKEN) *pretokens = NULL; - if (!CBB_init(&request, 0) || - !CBB_add_u16(&request, count)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - goto err; - } - - pretokens = ctx->method->blind(&request, count); - if (pretokens == NULL) { - goto err; - } - - if (!CBB_finish(&request, out, out_len)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - goto err; - } - - sk_TRUST_TOKEN_PRETOKEN_pop_free(ctx->pretokens, TRUST_TOKEN_PRETOKEN_free); - ctx->pretokens = pretokens; - pretokens = NULL; - ret = 1; - -err: - CBB_cleanup(&request); - sk_TRUST_TOKEN_PRETOKEN_pop_free(pretokens, TRUST_TOKEN_PRETOKEN_free); - return ret; -} - -STACK_OF(TRUST_TOKEN) * - TRUST_TOKEN_CLIENT_finish_issuance(TRUST_TOKEN_CLIENT *ctx, - size_t *out_key_index, - const uint8_t *response, - size_t response_len) { - CBS in; - CBS_init(&in, response, response_len); - uint16_t count; - uint32_t key_id; - if (!CBS_get_u16(&in, &count) || - !CBS_get_u32(&in, &key_id)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); - return NULL; - } - - size_t key_index = 0; - const struct trust_token_client_key_st *key = NULL; - for (size_t i = 0; i < ctx->num_keys; i++) { - if (ctx->keys[i].id == key_id) { - key_index = i; - key = 
&ctx->keys[i]; - break; - } - } - - if (key == NULL) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_INVALID_KEY_ID); - return NULL; - } - - if (count > sk_TRUST_TOKEN_PRETOKEN_num(ctx->pretokens)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); - return NULL; - } - - STACK_OF(TRUST_TOKEN) *tokens = - ctx->method->unblind(&key->key, ctx->pretokens, &in, count, key_id); - if (tokens == NULL) { - return NULL; - } - - if (CBS_len(&in) != 0) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); - sk_TRUST_TOKEN_pop_free(tokens, TRUST_TOKEN_free); - return NULL; - } - - sk_TRUST_TOKEN_PRETOKEN_pop_free(ctx->pretokens, TRUST_TOKEN_PRETOKEN_free); - ctx->pretokens = NULL; - - *out_key_index = key_index; - return tokens; -} - -int TRUST_TOKEN_CLIENT_begin_redemption(TRUST_TOKEN_CLIENT *ctx, uint8_t **out, - size_t *out_len, - const TRUST_TOKEN *token, - const uint8_t *data, size_t data_len, - uint64_t time) { - CBB request, token_inner, inner; - if (!CBB_init(&request, 0) || - !CBB_add_u16_length_prefixed(&request, &token_inner) || - !CBB_add_bytes(&token_inner, token->data, token->len) || - !CBB_add_u16_length_prefixed(&request, &inner) || - !CBB_add_bytes(&inner, data, data_len) || - (ctx->method->has_srr && !CBB_add_u64(&request, time)) || - !CBB_finish(&request, out, out_len)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - CBB_cleanup(&request); - return 0; - } - return 1; -} - -int TRUST_TOKEN_CLIENT_finish_redemption(TRUST_TOKEN_CLIENT *ctx, - uint8_t **out_rr, size_t *out_rr_len, - uint8_t **out_sig, size_t *out_sig_len, - const uint8_t *response, - size_t response_len) { - CBS in, srr, sig; - CBS_init(&in, response, response_len); - if (!ctx->method->has_srr) { - if (!CBS_stow(&in, out_rr, out_rr_len)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - return 0; - } - - *out_sig = NULL; - *out_sig_len = 0; - return 1; - } - - if (!CBS_get_u16_length_prefixed(&in, &srr) || - !CBS_get_u16_length_prefixed(&in, &sig) 
|| - CBS_len(&in) != 0) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_ERROR); - return 0; - } - - if (ctx->srr_key == NULL) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_NO_SRR_KEY_CONFIGURED); - return 0; - } - - EVP_MD_CTX md_ctx; - EVP_MD_CTX_init(&md_ctx); - int sig_ok = EVP_DigestVerifyInit(&md_ctx, NULL, NULL, NULL, ctx->srr_key) && - EVP_DigestVerify(&md_ctx, CBS_data(&sig), CBS_len(&sig), - CBS_data(&srr), CBS_len(&srr)); - EVP_MD_CTX_cleanup(&md_ctx); - - if (!sig_ok) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_SRR_SIGNATURE_ERROR); - return 0; - } - - uint8_t *srr_buf = NULL, *sig_buf = NULL; - size_t srr_len, sig_len; - if (!CBS_stow(&srr, &srr_buf, &srr_len) || - !CBS_stow(&sig, &sig_buf, &sig_len)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - OPENSSL_free(srr_buf); - OPENSSL_free(sig_buf); - return 0; - } - - *out_rr = srr_buf; - *out_rr_len = srr_len; - *out_sig = sig_buf; - *out_sig_len = sig_len; - return 1; -} - -TRUST_TOKEN_ISSUER *TRUST_TOKEN_ISSUER_new(const TRUST_TOKEN_METHOD *method, - size_t max_batchsize) { - if (max_batchsize > 0xffff) { - // The protocol supports only two-byte token counts. 
- OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_OVERFLOW); - return NULL; - } - - TRUST_TOKEN_ISSUER *ret = OPENSSL_malloc(sizeof(TRUST_TOKEN_ISSUER)); - if (ret == NULL) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - return NULL; - } - OPENSSL_memset(ret, 0, sizeof(TRUST_TOKEN_ISSUER)); - ret->method = method; - ret->max_batchsize = (uint16_t)max_batchsize; - return ret; -} - -void TRUST_TOKEN_ISSUER_free(TRUST_TOKEN_ISSUER *ctx) { - if (ctx == NULL) { - return; - } - EVP_PKEY_free(ctx->srr_key); - OPENSSL_free(ctx->metadata_key); - OPENSSL_free(ctx); -} - -int TRUST_TOKEN_ISSUER_add_key(TRUST_TOKEN_ISSUER *ctx, const uint8_t *key, - size_t key_len) { - if (ctx->num_keys == OPENSSL_ARRAY_SIZE(ctx->keys) || - ctx->num_keys >= ctx->method->max_keys) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_TOO_MANY_KEYS); - return 0; - } - - struct trust_token_issuer_key_st *key_s = &ctx->keys[ctx->num_keys]; - CBS cbs; - CBS_init(&cbs, key, key_len); - uint32_t key_id; - if (!CBS_get_u32(&cbs, &key_id) || - !ctx->method->issuer_key_from_bytes(&key_s->key, CBS_data(&cbs), - CBS_len(&cbs))) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); - return 0; - } - - key_s->id = key_id; - ctx->num_keys += 1; - return 1; -} - -int TRUST_TOKEN_ISSUER_set_srr_key(TRUST_TOKEN_ISSUER *ctx, EVP_PKEY *key) { - EVP_PKEY_free(ctx->srr_key); - EVP_PKEY_up_ref(key); - ctx->srr_key = key; - return 1; -} - -int TRUST_TOKEN_ISSUER_set_metadata_key(TRUST_TOKEN_ISSUER *ctx, - const uint8_t *key, size_t len) { - if (len < 32) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_INVALID_METADATA_KEY); - } - OPENSSL_free(ctx->metadata_key); - ctx->metadata_key_len = 0; - ctx->metadata_key = OPENSSL_memdup(key, len); - if (ctx->metadata_key == NULL) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - return 0; - } - ctx->metadata_key_len = len; - return 1; -} - -static const struct trust_token_issuer_key_st *trust_token_issuer_get_key( - const TRUST_TOKEN_ISSUER *ctx, uint32_t 
key_id) { - for (size_t i = 0; i < ctx->num_keys; i++) { - if (ctx->keys[i].id == key_id) { - return &ctx->keys[i]; - } - } - return NULL; -} - -int TRUST_TOKEN_ISSUER_issue(const TRUST_TOKEN_ISSUER *ctx, uint8_t **out, - size_t *out_len, size_t *out_tokens_issued, - const uint8_t *request, size_t request_len, - uint32_t public_metadata, uint8_t private_metadata, - size_t max_issuance) { - if (max_issuance > ctx->max_batchsize) { - max_issuance = ctx->max_batchsize; - } - - const struct trust_token_issuer_key_st *key = - trust_token_issuer_get_key(ctx, public_metadata); - if (key == NULL || private_metadata > 1 || - (!ctx->method->has_private_metadata && private_metadata != 0)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_INVALID_METADATA); - return 0; - } - - CBS in; - uint16_t num_requested; - CBS_init(&in, request, request_len); - if (!CBS_get_u16(&in, &num_requested)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); - return 0; - } - - size_t num_to_issue = num_requested; - if (num_to_issue > max_issuance) { - num_to_issue = max_issuance; - } - - int ret = 0; - CBB response; - if (!CBB_init(&response, 0) || - !CBB_add_u16(&response, num_to_issue) || - !CBB_add_u32(&response, public_metadata)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - goto err; - } - - if (!ctx->method->sign(&key->key, &response, &in, num_requested, num_to_issue, - private_metadata)) { - goto err; - } - - if (CBS_len(&in) != 0) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); - goto err; - } - - if (!CBB_finish(&response, out, out_len)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - goto err; - } - - *out_tokens_issued = num_to_issue; - ret = 1; - -err: - CBB_cleanup(&response); - return ret; -} - - -int TRUST_TOKEN_ISSUER_redeem_raw(const TRUST_TOKEN_ISSUER *ctx, - uint32_t *out_public, uint8_t *out_private, - TRUST_TOKEN **out_token, - uint8_t **out_client_data, - size_t *out_client_data_len, - const uint8_t *request, size_t 
request_len) { - CBS request_cbs, token_cbs; - CBS_init(&request_cbs, request, request_len); - if (!CBS_get_u16_length_prefixed(&request_cbs, &token_cbs)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_ERROR); - return 0; - } - - uint32_t public_metadata = 0; - uint8_t private_metadata = 0; - - // Parse the token. If there is an error, treat it as an invalid token. - if (!CBS_get_u32(&token_cbs, &public_metadata)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_INVALID_TOKEN); - return 0; - } - - const struct trust_token_issuer_key_st *key = - trust_token_issuer_get_key(ctx, public_metadata); - uint8_t nonce[TRUST_TOKEN_NONCE_SIZE]; - if (key == NULL || - !ctx->method->read(&key->key, nonce, &private_metadata, - CBS_data(&token_cbs), CBS_len(&token_cbs))) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_INVALID_TOKEN); - return 0; - } - - CBS client_data; - if (!CBS_get_u16_length_prefixed(&request_cbs, &client_data) || - (ctx->method->has_srr && !CBS_skip(&request_cbs, 8)) || - CBS_len(&request_cbs) != 0) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_ERROR); - return 0; - } - - uint8_t *client_data_buf = NULL; - size_t client_data_len = 0; - if (!CBS_stow(&client_data, &client_data_buf, &client_data_len)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - goto err; - } - - TRUST_TOKEN *token = TRUST_TOKEN_new(nonce, TRUST_TOKEN_NONCE_SIZE); - if (token == NULL) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - goto err; - } - *out_public = public_metadata; - *out_private = private_metadata; - *out_token = token; - *out_client_data = client_data_buf; - *out_client_data_len = client_data_len; - - return 1; - -err: - OPENSSL_free(client_data_buf); - return 0; -} - -// https://tools.ietf.org/html/rfc7049#section-2.1 -static int add_cbor_int_with_type(CBB *cbb, uint8_t major_type, - uint64_t value) { - if (value <= 23) { - return CBB_add_u8(cbb, value | major_type); - } - if (value <= 0xff) { - return CBB_add_u8(cbb, 0x18 | 
major_type) && CBB_add_u8(cbb, value); - } - if (value <= 0xffff) { - return CBB_add_u8(cbb, 0x19 | major_type) && CBB_add_u16(cbb, value); - } - if (value <= 0xffffffff) { - return CBB_add_u8(cbb, 0x1a | major_type) && CBB_add_u32(cbb, value); - } - if (value <= 0xffffffffffffffff) { - return CBB_add_u8(cbb, 0x1b | major_type) && CBB_add_u64(cbb, value); - } - - return 0; -} - -// https://tools.ietf.org/html/rfc7049#section-2.1 -static int add_cbor_int(CBB *cbb, uint64_t value) { - return add_cbor_int_with_type(cbb, 0, value); -} - -// https://tools.ietf.org/html/rfc7049#section-2.1 -static int add_cbor_bytes(CBB *cbb, const uint8_t *data, size_t len) { - return add_cbor_int_with_type(cbb, 0x40, len) && - CBB_add_bytes(cbb, data, len); -} - -// https://tools.ietf.org/html/rfc7049#section-2.1 -static int add_cbor_text(CBB *cbb, const char *data, size_t len) { - return add_cbor_int_with_type(cbb, 0x60, len) && - CBB_add_bytes(cbb, (const uint8_t *)data, len); -} - -// https://tools.ietf.org/html/rfc7049#section-2.1 -static int add_cbor_map(CBB *cbb, uint8_t size) { - return add_cbor_int_with_type(cbb, 0xa0, size); -} - -static uint8_t get_metadata_obfuscator(const uint8_t *key, size_t key_len, - const uint8_t *client_data, - size_t client_data_len) { - uint8_t metadata_obfuscator[SHA256_DIGEST_LENGTH]; - SHA256_CTX sha_ctx; - SHA256_Init(&sha_ctx); - SHA256_Update(&sha_ctx, key, key_len); - SHA256_Update(&sha_ctx, client_data, client_data_len); - SHA256_Final(metadata_obfuscator, &sha_ctx); - return metadata_obfuscator[0] >> 7; -} - -int TRUST_TOKEN_ISSUER_redeem(const TRUST_TOKEN_ISSUER *ctx, uint8_t **out, - size_t *out_len, TRUST_TOKEN **out_token, - uint8_t **out_client_data, - size_t *out_client_data_len, - uint64_t *out_redemption_time, - const uint8_t *request, size_t request_len, - uint64_t lifetime) { - CBS request_cbs, token_cbs; - CBS_init(&request_cbs, request, request_len); - if (!CBS_get_u16_length_prefixed(&request_cbs, &token_cbs)) { - 
OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_ERROR); - return 0; - } - - uint32_t public_metadata = 0; - uint8_t private_metadata = 0; - - CBS token_copy = token_cbs; - - // Parse the token. If there is an error, treat it as an invalid token. - if (!CBS_get_u32(&token_cbs, &public_metadata)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_INVALID_TOKEN); - return 0; - } - - const struct trust_token_issuer_key_st *key = - trust_token_issuer_get_key(ctx, public_metadata); - uint8_t nonce[TRUST_TOKEN_NONCE_SIZE]; - if (key == NULL || - !ctx->method->read(&key->key, nonce, &private_metadata, - CBS_data(&token_cbs), CBS_len(&token_cbs))) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_INVALID_TOKEN); - return 0; - } - - int ok = 0; - CBB response, srr; - uint8_t *srr_buf = NULL, *sig_buf = NULL, *client_data_buf = NULL; - size_t srr_len = 0, sig_len = 0, client_data_len = 0; - EVP_MD_CTX md_ctx; - EVP_MD_CTX_init(&md_ctx); - CBB_zero(&srr); - if (!CBB_init(&response, 0)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - goto err; - } - - CBS client_data; - uint64_t redemption_time = 0; - if (!CBS_get_u16_length_prefixed(&request_cbs, &client_data) || - (ctx->method->has_srr && !CBS_get_u64(&request_cbs, &redemption_time))) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_ERROR); - goto err; - } - - const uint8_t kTokenHashDSTLabel[] = "TrustTokenV0 TokenHash"; - uint8_t token_hash[SHA256_DIGEST_LENGTH]; - SHA256_CTX sha_ctx; - SHA256_Init(&sha_ctx); - SHA256_Update(&sha_ctx, kTokenHashDSTLabel, sizeof(kTokenHashDSTLabel)); - SHA256_Update(&sha_ctx, CBS_data(&token_copy), CBS_len(&token_copy)); - SHA256_Final(token_hash, &sha_ctx); - - uint8_t metadata_obfuscator = get_metadata_obfuscator( - ctx->metadata_key, ctx->metadata_key_len, token_hash, sizeof(token_hash)); - - // The SRR is constructed as per the format described in - // https://docs.google.com/document/d/1TNnya6B8pyomDK2F1R9CL3dY10OAmqWlnCxsWyOBDVQ/edit#heading=h.7mkzvhpqb8l5 - - // 
The V2 protocol is intended to be used with - // |TRUST_TOKEN_ISSUER_redeem_raw|. However, we temporarily support it with - // |TRUST_TOKEN_ISSUER_redeem| to ease the transition for existing issuer - // callers. Those callers' consumers currently expect an expiry-timestamp - // field, so we fill in a placeholder value. - // - // TODO(svaldez): After the existing issues have migrated to - // |TRUST_TOKEN_ISSUER_redeem_raw| remove this logic. - uint64_t expiry_time = 0; - if (ctx->method->has_srr) { - expiry_time = redemption_time + lifetime; - } - - static const char kClientDataLabel[] = "client-data"; - static const char kExpiryTimestampLabel[] = "expiry-timestamp"; - static const char kMetadataLabel[] = "metadata"; - static const char kPrivateLabel[] = "private"; - static const char kPublicLabel[] = "public"; - static const char kTokenHashLabel[] = "token-hash"; - - // CBOR requires map keys to be sorted by length then sorted lexically. - // https://tools.ietf.org/html/rfc7049#section-3.9 - assert(strlen(kMetadataLabel) < strlen(kTokenHashLabel)); - assert(strlen(kTokenHashLabel) < strlen(kClientDataLabel)); - assert(strlen(kClientDataLabel) < strlen(kExpiryTimestampLabel)); - assert(strlen(kPublicLabel) < strlen(kPrivateLabel)); - - size_t map_entries = 4; - - if (!CBB_init(&srr, 0) || - !add_cbor_map(&srr, map_entries) || // SRR map - !add_cbor_text(&srr, kMetadataLabel, strlen(kMetadataLabel)) || - !add_cbor_map(&srr, 2) || // Metadata map - !add_cbor_text(&srr, kPublicLabel, strlen(kPublicLabel)) || - !add_cbor_int(&srr, public_metadata) || - !add_cbor_text(&srr, kPrivateLabel, strlen(kPrivateLabel)) || - !add_cbor_int(&srr, private_metadata ^ metadata_obfuscator) || - !add_cbor_text(&srr, kTokenHashLabel, strlen(kTokenHashLabel)) || - !add_cbor_bytes(&srr, token_hash, sizeof(token_hash)) || - !add_cbor_text(&srr, kClientDataLabel, strlen(kClientDataLabel)) || - !CBB_add_bytes(&srr, CBS_data(&client_data), CBS_len(&client_data)) || - !add_cbor_text(&srr, 
kExpiryTimestampLabel, - strlen(kExpiryTimestampLabel)) || - !add_cbor_int(&srr, expiry_time) || - !CBB_finish(&srr, &srr_buf, &srr_len)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - goto err; - } - - if (!EVP_DigestSignInit(&md_ctx, NULL, NULL, NULL, ctx->srr_key) || - !EVP_DigestSign(&md_ctx, NULL, &sig_len, srr_buf, srr_len)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_SRR_SIGNATURE_ERROR); - goto err; - } - - // Merge SRR and Signature into single string. - // TODO(svaldez): Expose API to construct this from the caller. - if (!ctx->method->has_srr) { - static const char kSRRHeader[] = "body=:"; - static const char kSRRSplit[] = ":, signature=:"; - static const char kSRREnd[] = ":"; - - size_t srr_b64_len, sig_b64_len; - if (!EVP_EncodedLength(&srr_b64_len, srr_len) || - !EVP_EncodedLength(&sig_b64_len, sig_len)) { - goto err; - } - - sig_buf = OPENSSL_malloc(sig_len); - uint8_t *srr_b64_buf = OPENSSL_malloc(srr_b64_len); - uint8_t *sig_b64_buf = OPENSSL_malloc(sig_b64_len); - if (!sig_buf || - !srr_b64_buf || - !sig_b64_buf || - !EVP_DigestSign(&md_ctx, sig_buf, &sig_len, srr_buf, srr_len) || - !CBB_add_bytes(&response, (const uint8_t *)kSRRHeader, - strlen(kSRRHeader)) || - !CBB_add_bytes(&response, srr_b64_buf, - EVP_EncodeBlock(srr_b64_buf, srr_buf, srr_len)) || - !CBB_add_bytes(&response, (const uint8_t *)kSRRSplit, - strlen(kSRRSplit)) || - !CBB_add_bytes(&response, sig_b64_buf, - EVP_EncodeBlock(sig_b64_buf, sig_buf, sig_len)) || - !CBB_add_bytes(&response, (const uint8_t *)kSRREnd, strlen(kSRREnd))) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - OPENSSL_free(srr_b64_buf); - OPENSSL_free(sig_b64_buf); - goto err; - } - - OPENSSL_free(srr_b64_buf); - OPENSSL_free(sig_b64_buf); - } else { - CBB child; - uint8_t *ptr; - if (!CBB_add_u16_length_prefixed(&response, &child) || - !CBB_add_bytes(&child, srr_buf, srr_len) || - !CBB_add_u16_length_prefixed(&response, &child) || - !CBB_reserve(&child, &ptr, sig_len) || - 
!EVP_DigestSign(&md_ctx, ptr, &sig_len, srr_buf, srr_len) || - !CBB_did_write(&child, sig_len) || - !CBB_flush(&response)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - goto err; - } - } - - if (!CBS_stow(&client_data, &client_data_buf, &client_data_len) || - !CBB_finish(&response, out, out_len)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - goto err; - } - - TRUST_TOKEN *token = TRUST_TOKEN_new(nonce, TRUST_TOKEN_NONCE_SIZE); - if (token == NULL) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - goto err; - } - *out_token = token; - *out_client_data = client_data_buf; - *out_client_data_len = client_data_len; - *out_redemption_time = redemption_time; - - ok = 1; - -err: - CBB_cleanup(&response); - CBB_cleanup(&srr); - OPENSSL_free(srr_buf); - OPENSSL_free(sig_buf); - EVP_MD_CTX_cleanup(&md_ctx); - if (!ok) { - OPENSSL_free(client_data_buf); - } - return ok; -} - -int TRUST_TOKEN_decode_private_metadata(const TRUST_TOKEN_METHOD *method, - uint8_t *out_value, const uint8_t *key, - size_t key_len, const uint8_t *nonce, - size_t nonce_len, - uint8_t encrypted_bit) { - uint8_t metadata_obfuscator = - get_metadata_obfuscator(key, key_len, nonce, nonce_len); - *out_value = encrypted_bit ^ metadata_obfuscator; - return 1; -} diff --git a/third_party/boringssl/src/crypto/trust_token/trust_token.cc b/third_party/boringssl/src/crypto/trust_token/trust_token.cc new file mode 100644 index 00000000..d0e5c243 --- /dev/null +++ b/third_party/boringssl/src/crypto/trust_token/trust_token.cc @@ -0,0 +1,701 @@ +// Copyright 2019 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include + +#include +#include +#include +#include +#include + +#include "../mem_internal.h" +#include "internal.h" + + +// The Trust Token API is described in +// https://github.com/WICG/trust-token-api/blob/main/README.md and provides a +// protocol for issuing and redeeming tokens built on top of the PMBTokens +// construction. + +using namespace bssl; + +const TRUST_TOKEN_METHOD *TRUST_TOKEN_experiment_v1() { + static const TRUST_TOKEN_METHOD kMethod = { + pmbtoken_exp1_generate_key, + pmbtoken_exp1_derive_key_from_secret, + pmbtoken_exp1_client_key_from_bytes, + pmbtoken_exp1_issuer_key_from_bytes, + pmbtoken_exp1_blind, + pmbtoken_exp1_sign, + pmbtoken_exp1_unblind, + pmbtoken_exp1_read, + 1, /* has_private_metadata */ + 3, /* max_keys */ + 1, /* has_srr */ + }; + return &kMethod; +} + +const TRUST_TOKEN_METHOD *TRUST_TOKEN_experiment_v2_voprf() { + static const TRUST_TOKEN_METHOD kMethod = { + voprf_exp2_generate_key, + voprf_exp2_derive_key_from_secret, + voprf_exp2_client_key_from_bytes, + voprf_exp2_issuer_key_from_bytes, + voprf_exp2_blind, + voprf_exp2_sign, + voprf_exp2_unblind, + voprf_exp2_read, + 0, /* has_private_metadata */ + 6, /* max_keys */ + 0, /* has_srr */ + }; + return &kMethod; +} + +const TRUST_TOKEN_METHOD *TRUST_TOKEN_experiment_v2_pmb() { + static const TRUST_TOKEN_METHOD kMethod = { + pmbtoken_exp2_generate_key, + pmbtoken_exp2_derive_key_from_secret, + pmbtoken_exp2_client_key_from_bytes, + pmbtoken_exp2_issuer_key_from_bytes, + pmbtoken_exp2_blind, + pmbtoken_exp2_sign, + pmbtoken_exp2_unblind, + 
pmbtoken_exp2_read, + 1, /* has_private_metadata */ + 3, /* max_keys */ + 0, /* has_srr */ + }; + return &kMethod; +} + +const TRUST_TOKEN_METHOD *TRUST_TOKEN_pst_v1_voprf() { + static const TRUST_TOKEN_METHOD kMethod = { + voprf_pst1_generate_key, + voprf_pst1_derive_key_from_secret, + voprf_pst1_client_key_from_bytes, + voprf_pst1_issuer_key_from_bytes, + voprf_pst1_blind, + voprf_pst1_sign, + voprf_pst1_unblind, + voprf_pst1_read, + 0, /* has_private_metadata */ + 6, /* max_keys */ + 0, /* has_srr */ + }; + return &kMethod; +} + +const TRUST_TOKEN_METHOD *TRUST_TOKEN_pst_v1_pmb() { + static const TRUST_TOKEN_METHOD kMethod = { + pmbtoken_pst1_generate_key, + pmbtoken_pst1_derive_key_from_secret, + pmbtoken_pst1_client_key_from_bytes, + pmbtoken_pst1_issuer_key_from_bytes, + pmbtoken_pst1_blind, + pmbtoken_pst1_sign, + pmbtoken_pst1_unblind, + pmbtoken_pst1_read, + 1, /* has_private_metadata */ + 3, /* max_keys */ + 0, /* has_srr */ + }; + return &kMethod; +} + + +void bssl::TRUST_TOKEN_PRETOKEN_free(TRUST_TOKEN_PRETOKEN *pretoken) { + Delete(pretoken); +} + +TRUST_TOKEN *TRUST_TOKEN_new(const uint8_t *data, size_t len) { + TRUST_TOKEN *ret = New(); + if (ret == nullptr) { + return nullptr; + } + ret->data = reinterpret_cast(OPENSSL_memdup(data, len)); + if (len != 0 && ret->data == nullptr) { + Delete(ret); + return nullptr; + } + ret->len = len; + return ret; +} + +void TRUST_TOKEN_free(TRUST_TOKEN *token) { + if (token == nullptr) { + return; + } + OPENSSL_free(token->data); + Delete(token); +} + +int TRUST_TOKEN_generate_key(const TRUST_TOKEN_METHOD *method, + uint8_t *out_priv_key, size_t *out_priv_key_len, + size_t max_priv_key_len, uint8_t *out_pub_key, + size_t *out_pub_key_len, size_t max_pub_key_len, + uint32_t id) { + // Prepend the key ID in front of the PMBTokens format. 
+ CBB priv_cbb, pub_cbb; + CBB_init_fixed(&priv_cbb, out_priv_key, max_priv_key_len); + CBB_init_fixed(&pub_cbb, out_pub_key, max_pub_key_len); + if (!CBB_add_u32(&priv_cbb, id) || // + !CBB_add_u32(&pub_cbb, id)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_BUFFER_TOO_SMALL); + return 0; + } + + if (!method->generate_key(&priv_cbb, &pub_cbb)) { + return 0; + } + + if (!CBB_finish(&priv_cbb, nullptr, out_priv_key_len) || + !CBB_finish(&pub_cbb, nullptr, out_pub_key_len)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_BUFFER_TOO_SMALL); + return 0; + } + + return 1; +} + +int TRUST_TOKEN_derive_key_from_secret( + const TRUST_TOKEN_METHOD *method, uint8_t *out_priv_key, + size_t *out_priv_key_len, size_t max_priv_key_len, uint8_t *out_pub_key, + size_t *out_pub_key_len, size_t max_pub_key_len, uint32_t id, + const uint8_t *secret, size_t secret_len) { + // Prepend the key ID in front of the PMBTokens format. + CBB priv_cbb, pub_cbb; + CBB_init_fixed(&priv_cbb, out_priv_key, max_priv_key_len); + CBB_init_fixed(&pub_cbb, out_pub_key, max_pub_key_len); + if (!CBB_add_u32(&priv_cbb, id) || // + !CBB_add_u32(&pub_cbb, id)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_BUFFER_TOO_SMALL); + return 0; + } + + if (!method->derive_key_from_secret(&priv_cbb, &pub_cbb, secret, + secret_len)) { + return 0; + } + + if (!CBB_finish(&priv_cbb, nullptr, out_priv_key_len) || + !CBB_finish(&pub_cbb, nullptr, out_pub_key_len)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_BUFFER_TOO_SMALL); + return 0; + } + + return 1; +} + +TRUST_TOKEN_CLIENT *TRUST_TOKEN_CLIENT_new(const TRUST_TOKEN_METHOD *method, + size_t max_batchsize) { + if (max_batchsize > 0xffff) { + // The protocol supports only two-byte token counts. 
+ OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_OVERFLOW); + return nullptr; + } + + TRUST_TOKEN_CLIENT *ret = New(); + if (ret == nullptr) { + return nullptr; + } + ret->method = method; + ret->max_batchsize = (uint16_t)max_batchsize; + return ret; +} + +void TRUST_TOKEN_CLIENT_free(TRUST_TOKEN_CLIENT *ctx) { + if (ctx == nullptr) { + return; + } + EVP_PKEY_free(ctx->srr_key); + sk_TRUST_TOKEN_PRETOKEN_pop_free(ctx->pretokens, TRUST_TOKEN_PRETOKEN_free); + Delete(ctx); +} + +static TRUST_TOKEN_PRETOKEN *dup_pretoken(const TRUST_TOKEN_PRETOKEN *in) { + return static_cast( + OPENSSL_memdup(in, sizeof(TRUST_TOKEN_PRETOKEN))); +} + +TRUST_TOKEN_CLIENT *TRUST_TOKEN_CLIENT_dup_for_testing( + const TRUST_TOKEN_CLIENT *ctx) { + bssl::UniquePtr ret( + TRUST_TOKEN_CLIENT_new(ctx->method, ctx->max_batchsize)); + if (ret == nullptr) { + return nullptr; + } + for (size_t i = 0; i < std::size(ret->keys); i++) { + ret->keys[i] = ctx->keys[i]; + } + ret->num_keys = ctx->num_keys; + if (ctx->pretokens != nullptr) { + ret->pretokens = sk_TRUST_TOKEN_PRETOKEN_deep_copy( + ctx->pretokens, dup_pretoken, TRUST_TOKEN_PRETOKEN_free); + if (ret->pretokens == nullptr) { + return nullptr; + } + } + ret->srr_key = bssl::UpRef(ctx->srr_key).release(); + return ret.release(); +} + +int TRUST_TOKEN_CLIENT_add_key(TRUST_TOKEN_CLIENT *ctx, size_t *out_key_index, + const uint8_t *key, size_t key_len) { + if (ctx->num_keys == std::size(ctx->keys) || + ctx->num_keys >= ctx->method->max_keys) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_TOO_MANY_KEYS); + return 0; + } + + struct trust_token_client_key_st *key_s = &ctx->keys[ctx->num_keys]; + CBS cbs; + CBS_init(&cbs, key, key_len); + uint32_t key_id; + if (!CBS_get_u32(&cbs, &key_id) || + !ctx->method->client_key_from_bytes(&key_s->key, CBS_data(&cbs), + CBS_len(&cbs))) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); + return 0; + } + key_s->id = key_id; + *out_key_index = ctx->num_keys; + ctx->num_keys += 1; + return 1; +} + +int 
TRUST_TOKEN_CLIENT_set_srr_key(TRUST_TOKEN_CLIENT *ctx, EVP_PKEY *key) { + if (!ctx->method->has_srr) { + return 1; + } + EVP_PKEY_free(ctx->srr_key); + EVP_PKEY_up_ref(key); + ctx->srr_key = key; + return 1; +} + +static int trust_token_client_begin_issuance_impl( + TRUST_TOKEN_CLIENT *ctx, uint8_t **out, size_t *out_len, size_t count, + int include_message, const uint8_t *msg, size_t msg_len) { + if (count > ctx->max_batchsize) { + count = ctx->max_batchsize; + } + + int ret = 0; + CBB request; + STACK_OF(TRUST_TOKEN_PRETOKEN) *pretokens = nullptr; + if (!CBB_init(&request, 0) || !CBB_add_u16(&request, count)) { + goto err; + } + + pretokens = + ctx->method->blind(&request, count, include_message, msg, msg_len); + if (pretokens == nullptr) { + goto err; + } + + if (!CBB_finish(&request, out, out_len)) { + goto err; + } + + sk_TRUST_TOKEN_PRETOKEN_pop_free(ctx->pretokens, TRUST_TOKEN_PRETOKEN_free); + ctx->pretokens = pretokens; + pretokens = nullptr; + ret = 1; + +err: + CBB_cleanup(&request); + sk_TRUST_TOKEN_PRETOKEN_pop_free(pretokens, TRUST_TOKEN_PRETOKEN_free); + return ret; +} + +int TRUST_TOKEN_CLIENT_begin_issuance(TRUST_TOKEN_CLIENT *ctx, uint8_t **out, + size_t *out_len, size_t count) { + return trust_token_client_begin_issuance_impl(ctx, out, out_len, count, + /*include_message=*/0, nullptr, + 0); +} + +int TRUST_TOKEN_CLIENT_begin_issuance_over_message( + TRUST_TOKEN_CLIENT *ctx, uint8_t **out, size_t *out_len, size_t count, + const uint8_t *msg, size_t msg_len) { + return trust_token_client_begin_issuance_impl( + ctx, out, out_len, count, /*include_message=*/1, msg, msg_len); +} + + +STACK_OF(TRUST_TOKEN) *TRUST_TOKEN_CLIENT_finish_issuance( + TRUST_TOKEN_CLIENT *ctx, size_t *out_key_index, const uint8_t *response, + size_t response_len) { + CBS in; + CBS_init(&in, response, response_len); + uint16_t count; + uint32_t key_id; + if (!CBS_get_u16(&in, &count) || !CBS_get_u32(&in, &key_id)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, 
TRUST_TOKEN_R_DECODE_FAILURE); + return nullptr; + } + + size_t key_index = 0; + const struct trust_token_client_key_st *key = nullptr; + for (size_t i = 0; i < ctx->num_keys; i++) { + if (ctx->keys[i].id == key_id) { + key_index = i; + key = &ctx->keys[i]; + break; + } + } + + if (key == nullptr) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_INVALID_KEY_ID); + return nullptr; + } + + if (count > sk_TRUST_TOKEN_PRETOKEN_num(ctx->pretokens)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); + return nullptr; + } + + STACK_OF(TRUST_TOKEN) *tokens = + ctx->method->unblind(&key->key, ctx->pretokens, &in, count, key_id); + if (tokens == nullptr) { + return nullptr; + } + + if (CBS_len(&in) != 0) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); + sk_TRUST_TOKEN_pop_free(tokens, TRUST_TOKEN_free); + return nullptr; + } + + sk_TRUST_TOKEN_PRETOKEN_pop_free(ctx->pretokens, TRUST_TOKEN_PRETOKEN_free); + ctx->pretokens = nullptr; + + *out_key_index = key_index; + return tokens; +} + +int TRUST_TOKEN_CLIENT_begin_redemption(TRUST_TOKEN_CLIENT *ctx, uint8_t **out, + size_t *out_len, + const TRUST_TOKEN *token, + const uint8_t *data, size_t data_len, + uint64_t time) { + CBB request, token_inner, inner; + if (!CBB_init(&request, 0) || + !CBB_add_u16_length_prefixed(&request, &token_inner) || + !CBB_add_bytes(&token_inner, token->data, token->len) || + !CBB_add_u16_length_prefixed(&request, &inner) || + !CBB_add_bytes(&inner, data, data_len) || + (ctx->method->has_srr && !CBB_add_u64(&request, time)) || + !CBB_finish(&request, out, out_len)) { + CBB_cleanup(&request); + return 0; + } + return 1; +} + +int TRUST_TOKEN_CLIENT_finish_redemption(TRUST_TOKEN_CLIENT *ctx, + uint8_t **out_rr, size_t *out_rr_len, + uint8_t **out_sig, size_t *out_sig_len, + const uint8_t *response, + size_t response_len) { + CBS in, srr, sig; + CBS_init(&in, response, response_len); + if (!ctx->method->has_srr) { + if (!CBS_stow(&in, out_rr, out_rr_len)) { + return 0; + 
} + + *out_sig = nullptr; + *out_sig_len = 0; + return 1; + } + + if (!CBS_get_u16_length_prefixed(&in, &srr) || + !CBS_get_u16_length_prefixed(&in, &sig) || CBS_len(&in) != 0) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_ERROR); + return 0; + } + + if (ctx->srr_key == nullptr) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_NO_SRR_KEY_CONFIGURED); + return 0; + } + + EVP_MD_CTX md_ctx; + EVP_MD_CTX_init(&md_ctx); + int sig_ok = + EVP_DigestVerifyInit(&md_ctx, nullptr, nullptr, nullptr, ctx->srr_key) && + EVP_DigestVerify(&md_ctx, CBS_data(&sig), CBS_len(&sig), CBS_data(&srr), + CBS_len(&srr)); + EVP_MD_CTX_cleanup(&md_ctx); + + if (!sig_ok) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_SRR_SIGNATURE_ERROR); + return 0; + } + + uint8_t *srr_buf = nullptr, *sig_buf = nullptr; + size_t srr_len, sig_len; + if (!CBS_stow(&srr, &srr_buf, &srr_len) || + !CBS_stow(&sig, &sig_buf, &sig_len)) { + OPENSSL_free(srr_buf); + OPENSSL_free(sig_buf); + return 0; + } + + *out_rr = srr_buf; + *out_rr_len = srr_len; + *out_sig = sig_buf; + *out_sig_len = sig_len; + return 1; +} + +TRUST_TOKEN_ISSUER *TRUST_TOKEN_ISSUER_new(const TRUST_TOKEN_METHOD *method, + size_t max_batchsize) { + if (max_batchsize > 0xffff) { + // The protocol supports only two-byte token counts. 
+ OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_OVERFLOW); + return nullptr; + } + + TRUST_TOKEN_ISSUER *ret = New(); + if (ret == nullptr) { + return nullptr; + } + ret->method = method; + ret->max_batchsize = (uint16_t)max_batchsize; + return ret; +} + +void TRUST_TOKEN_ISSUER_free(TRUST_TOKEN_ISSUER *ctx) { + if (ctx == nullptr) { + return; + } + EVP_PKEY_free(ctx->srr_key); + Delete(ctx); +} + +int TRUST_TOKEN_ISSUER_add_key(TRUST_TOKEN_ISSUER *ctx, const uint8_t *key, + size_t key_len) { + if (ctx->num_keys == std::size(ctx->keys) || + ctx->num_keys >= ctx->method->max_keys) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_TOO_MANY_KEYS); + return 0; + } + + struct trust_token_issuer_key_st *key_s = &ctx->keys[ctx->num_keys]; + CBS cbs; + CBS_init(&cbs, key, key_len); + uint32_t key_id; + if (!CBS_get_u32(&cbs, &key_id) || + !ctx->method->issuer_key_from_bytes(&key_s->key, CBS_data(&cbs), + CBS_len(&cbs))) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); + return 0; + } + + key_s->id = key_id; + ctx->num_keys += 1; + return 1; +} + +int TRUST_TOKEN_ISSUER_set_srr_key(TRUST_TOKEN_ISSUER *ctx, EVP_PKEY *key) { + EVP_PKEY_free(ctx->srr_key); + EVP_PKEY_up_ref(key); + ctx->srr_key = key; + return 1; +} + +static const struct bssl::trust_token_issuer_key_st *trust_token_issuer_get_key( + const TRUST_TOKEN_ISSUER *ctx, uint32_t key_id) { + for (size_t i = 0; i < ctx->num_keys; i++) { + if (ctx->keys[i].id == key_id) { + return &ctx->keys[i]; + } + } + return nullptr; +} + +int TRUST_TOKEN_ISSUER_issue(const TRUST_TOKEN_ISSUER *ctx, uint8_t **out, + size_t *out_len, size_t *out_tokens_issued, + const uint8_t *request, size_t request_len, + uint32_t public_metadata, uint8_t private_metadata, + size_t max_issuance) { + if (max_issuance > ctx->max_batchsize) { + max_issuance = ctx->max_batchsize; + } + + const struct trust_token_issuer_key_st *key = + trust_token_issuer_get_key(ctx, public_metadata); + if (key == nullptr || private_metadata > 1 || + 
(!ctx->method->has_private_metadata && private_metadata != 0)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_INVALID_METADATA); + return 0; + } + + CBS in; + uint16_t num_requested; + CBS_init(&in, request, request_len); + if (!CBS_get_u16(&in, &num_requested)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); + return 0; + } + + size_t num_to_issue = num_requested; + if (num_to_issue > max_issuance) { + num_to_issue = max_issuance; + } + + int ret = 0; + CBB response; + if (!CBB_init(&response, 0) || !CBB_add_u16(&response, num_to_issue) || + !CBB_add_u32(&response, public_metadata)) { + goto err; + } + + if (!ctx->method->sign(&key->key, &response, &in, num_requested, num_to_issue, + private_metadata)) { + goto err; + } + + if (CBS_len(&in) != 0) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); + goto err; + } + + if (!CBB_finish(&response, out, out_len)) { + goto err; + } + + *out_tokens_issued = num_to_issue; + ret = 1; + +err: + CBB_cleanup(&response); + return ret; +} + +static int trust_token_issuer_redeem_impl( + const TRUST_TOKEN_ISSUER *ctx, uint32_t *out_public, uint8_t *out_private, + TRUST_TOKEN **out_token, uint8_t **out_client_data, + size_t *out_client_data_len, const uint8_t *request, size_t request_len, + int include_message, const uint8_t *msg, size_t msg_len) { + CBS request_cbs, token_cbs; + CBS_init(&request_cbs, request, request_len); + if (!CBS_get_u16_length_prefixed(&request_cbs, &token_cbs)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_ERROR); + return 0; + } + + uint32_t public_metadata = 0; + uint8_t private_metadata = 0; + + // Parse the token. If there is an error, treat it as an invalid token. 
+ if (!CBS_get_u32(&token_cbs, &public_metadata)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_INVALID_TOKEN); + return 0; + } + + const struct trust_token_issuer_key_st *key = + trust_token_issuer_get_key(ctx, public_metadata); + uint8_t nonce[TRUST_TOKEN_NONCE_SIZE]; + if (key == nullptr || + !ctx->method->read(&key->key, nonce, &private_metadata, + CBS_data(&token_cbs), CBS_len(&token_cbs), + include_message, msg, msg_len)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_INVALID_TOKEN); + return 0; + } + + CBS client_data; + if (!CBS_get_u16_length_prefixed(&request_cbs, &client_data) || + (ctx->method->has_srr && !CBS_skip(&request_cbs, 8)) || + CBS_len(&request_cbs) != 0) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_ERROR); + return 0; + } + + uint8_t *client_data_buf = nullptr; + size_t client_data_len = 0; + TRUST_TOKEN *token; + if (!CBS_stow(&client_data, &client_data_buf, &client_data_len)) { + goto err; + } + + token = TRUST_TOKEN_new(nonce, TRUST_TOKEN_NONCE_SIZE); + if (token == nullptr) { + goto err; + } + *out_public = public_metadata; + *out_private = private_metadata; + *out_token = token; + *out_client_data = client_data_buf; + *out_client_data_len = client_data_len; + + return 1; + +err: + OPENSSL_free(client_data_buf); + return 0; +} + + +int TRUST_TOKEN_ISSUER_redeem(const TRUST_TOKEN_ISSUER *ctx, + uint32_t *out_public, uint8_t *out_private, + TRUST_TOKEN **out_token, + uint8_t **out_client_data, + size_t *out_client_data_len, + const uint8_t *request, size_t request_len) { + return trust_token_issuer_redeem_impl(ctx, out_public, out_private, out_token, + out_client_data, out_client_data_len, + request, request_len, 0, nullptr, 0); +} + +int TRUST_TOKEN_ISSUER_redeem_over_message( + const TRUST_TOKEN_ISSUER *ctx, uint32_t *out_public, uint8_t *out_private, + TRUST_TOKEN **out_token, uint8_t **out_client_data, + size_t *out_client_data_len, const uint8_t *request, size_t request_len, + const uint8_t *msg, size_t msg_len) { + 
return trust_token_issuer_redeem_impl(ctx, out_public, out_private, out_token, + out_client_data, out_client_data_len, + request, request_len, 1, msg, msg_len); +} + +static uint8_t get_metadata_obfuscator(const uint8_t *key, size_t key_len, + const uint8_t *client_data, + size_t client_data_len) { + uint8_t metadata_obfuscator[SHA256_DIGEST_LENGTH]; + SHA256_CTX sha_ctx; + SHA256_Init(&sha_ctx); + SHA256_Update(&sha_ctx, key, key_len); + SHA256_Update(&sha_ctx, client_data, client_data_len); + SHA256_Final(metadata_obfuscator, &sha_ctx); + return metadata_obfuscator[0] >> 7; +} + +int TRUST_TOKEN_decode_private_metadata(const TRUST_TOKEN_METHOD *method, + uint8_t *out_value, const uint8_t *key, + size_t key_len, const uint8_t *nonce, + size_t nonce_len, + uint8_t encrypted_bit) { + uint8_t metadata_obfuscator = + get_metadata_obfuscator(key, key_len, nonce, nonce_len); + *out_value = encrypted_bit ^ metadata_obfuscator; + return 1; +} diff --git a/third_party/boringssl/src/crypto/trust_token/voprf.c b/third_party/boringssl/src/crypto/trust_token/voprf.c deleted file mode 100644 index cedee1e8..00000000 --- a/third_party/boringssl/src/crypto/trust_token/voprf.c +++ /dev/null @@ -1,815 +0,0 @@ -/* Copyright (c) 2020, Google Inc. - * - * Permission to use, copy, modify, and/or distribute this software for any - * purpose with or without fee is hereby granted, provided that the above - * copyright notice and this permission notice appear in all copies. - * - * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES - * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF - * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY - * SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES - * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION - * OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN - * CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
*/ - -#include - -#include -#include -#include -#include -#include -#include -#include - -#include "../ec_extra/internal.h" -#include "../fipsmodule/ec/internal.h" - -#include "internal.h" - - -typedef int (*hash_to_group_func_t)(const EC_GROUP *group, EC_RAW_POINT *out, - const uint8_t t[TRUST_TOKEN_NONCE_SIZE]); -typedef int (*hash_to_scalar_func_t)(const EC_GROUP *group, EC_SCALAR *out, - uint8_t *buf, size_t len); - -typedef struct { - const EC_GROUP *group; - - // hash_to_group implements the HashToGroup operation for VOPRFs. It returns - // one on success and zero on error. - hash_to_group_func_t hash_to_group; - // hash_to_scalar implements the HashToScalar operation for VOPRFs. It returns - // one on success and zero on error. - hash_to_scalar_func_t hash_to_scalar; -} VOPRF_METHOD; - -static const uint8_t kDefaultAdditionalData[32] = {0}; - -static int voprf_init_method(VOPRF_METHOD *method, int curve_nid, - hash_to_group_func_t hash_to_group, - hash_to_scalar_func_t hash_to_scalar) { - method->group = EC_GROUP_new_by_curve_name(curve_nid); - if (method->group == NULL) { - return 0; - } - - method->hash_to_group = hash_to_group; - method->hash_to_scalar = hash_to_scalar; - - return 1; -} - -static int cbb_add_point(CBB *out, const EC_GROUP *group, - const EC_AFFINE *point) { - size_t len = - ec_point_to_bytes(group, point, POINT_CONVERSION_UNCOMPRESSED, NULL, 0); - if (len == 0) { - return 0; - } - - uint8_t *p; - return CBB_add_space(out, &p, len) && - ec_point_to_bytes(group, point, POINT_CONVERSION_UNCOMPRESSED, p, - len) == len && - CBB_flush(out); -} - -static int cbs_get_point(CBS *cbs, const EC_GROUP *group, EC_AFFINE *out) { - CBS child; - size_t plen = 1 + 2 * BN_num_bytes(&group->field); - if (!CBS_get_bytes(cbs, &child, plen) || - !ec_point_from_uncompressed(group, out, CBS_data(&child), - CBS_len(&child))) { - return 0; - } - return 1; -} - -static int scalar_to_cbb(CBB *out, const EC_GROUP *group, - const EC_SCALAR *scalar) { - uint8_t *buf; - 
size_t scalar_len = BN_num_bytes(&group->order); - if (!CBB_add_space(out, &buf, scalar_len)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - return 0; - } - ec_scalar_to_bytes(group, buf, &scalar_len, scalar); - return 1; -} - -static int scalar_from_cbs(CBS *cbs, const EC_GROUP *group, EC_SCALAR *out) { - size_t scalar_len = BN_num_bytes(&group->order); - CBS tmp; - if (!CBS_get_bytes(cbs, &tmp, scalar_len)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); - return 0; - } - - ec_scalar_from_bytes(group, out, CBS_data(&tmp), CBS_len(&tmp)); - return 1; -} - -static int voprf_calculate_key(const VOPRF_METHOD *method, CBB *out_private, - CBB *out_public, const EC_SCALAR *priv) { - const EC_GROUP *group = method->group; - EC_RAW_POINT pub; - EC_AFFINE pub_affine; - if (!ec_point_mul_scalar_base(group, &pub, priv) || - !ec_jacobian_to_affine(group, &pub_affine, &pub)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_KEYGEN_FAILURE); - return 0; - } - - if (!scalar_to_cbb(out_private, group, priv) || - !cbb_add_point(out_public, group, &pub_affine)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_BUFFER_TOO_SMALL); - return 0; - } - - return 1; -} - - -static int voprf_generate_key(const VOPRF_METHOD *method, CBB *out_private, - CBB *out_public) { - EC_SCALAR priv; - if (!ec_random_nonzero_scalar(method->group, &priv, kDefaultAdditionalData)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_KEYGEN_FAILURE); - return 0; - } - return voprf_calculate_key(method, out_private, out_public, &priv); -} - -static int voprf_derive_key_from_secret(const VOPRF_METHOD *method, - CBB *out_private, CBB *out_public, - const uint8_t *secret, - size_t secret_len) { - static const uint8_t kKeygenLabel[] = "TrustTokenVOPRFKeyGen"; - - EC_SCALAR priv; - int ok = 0; - CBB cbb; - CBB_zero(&cbb); - uint8_t *buf = NULL; - size_t len; - if (!CBB_init(&cbb, 0) || - !CBB_add_bytes(&cbb, kKeygenLabel, sizeof(kKeygenLabel)) || - !CBB_add_bytes(&cbb, secret, secret_len) 
|| - !CBB_finish(&cbb, &buf, &len) || - !method->hash_to_scalar(method->group, &priv, buf, len)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_KEYGEN_FAILURE); - goto err; - } - - ok = voprf_calculate_key(method, out_private, out_public, &priv); - -err: - CBB_cleanup(&cbb); - OPENSSL_free(buf); - return ok; -} - -static int voprf_client_key_from_bytes(const VOPRF_METHOD *method, - TRUST_TOKEN_CLIENT_KEY *key, - const uint8_t *in, size_t len) { - const EC_GROUP *group = method->group; - if (!ec_point_from_uncompressed(group, &key->pubs, in, len)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); - return 0; - } - - return 1; -} - -static int voprf_issuer_key_from_bytes(const VOPRF_METHOD *method, - TRUST_TOKEN_ISSUER_KEY *key, - const uint8_t *in, size_t len) { - const EC_GROUP *group = method->group; - if (!ec_scalar_from_bytes(group, &key->xs, in, len)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); - return 0; - } - - // Recompute the public key. - EC_RAW_POINT pub; - if (!ec_point_mul_scalar_base(group, &pub, &key->xs) || - !ec_jacobian_to_affine(group, &key->pubs, &pub)) { - return 0; - } - - return 1; -} - -static STACK_OF(TRUST_TOKEN_PRETOKEN) * - voprf_blind(const VOPRF_METHOD *method, CBB *cbb, size_t count) { - const EC_GROUP *group = method->group; - STACK_OF(TRUST_TOKEN_PRETOKEN) *pretokens = - sk_TRUST_TOKEN_PRETOKEN_new_null(); - if (pretokens == NULL) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - goto err; - } - - for (size_t i = 0; i < count; i++) { - // Insert |pretoken| into |pretokens| early to simplify error-handling. 
- TRUST_TOKEN_PRETOKEN *pretoken = - OPENSSL_malloc(sizeof(TRUST_TOKEN_PRETOKEN)); - if (pretoken == NULL || - !sk_TRUST_TOKEN_PRETOKEN_push(pretokens, pretoken)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - TRUST_TOKEN_PRETOKEN_free(pretoken); - goto err; - } - - RAND_bytes(pretoken->t, sizeof(pretoken->t)); - - // We sample r in Montgomery form to simplify inverting. - EC_SCALAR r; - if (!ec_random_nonzero_scalar(group, &r, - kDefaultAdditionalData)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - goto err; - } - - // pretoken->r is rinv. - ec_scalar_inv0_montgomery(group, &pretoken->r, &r); - // Convert both out of Montgomery form. - ec_scalar_from_montgomery(group, &r, &r); - ec_scalar_from_montgomery(group, &pretoken->r, &pretoken->r); - - // Tp is the blinded token in the VOPRF protocol. - EC_RAW_POINT P, Tp; - if (!method->hash_to_group(group, &P, pretoken->t) || - !ec_point_mul_scalar(group, &Tp, &P, &r) || - !ec_jacobian_to_affine(group, &pretoken->Tp, &Tp)) { - goto err; - } - - if (!cbb_add_point(cbb, group, &pretoken->Tp)) { - goto err; - } - } - - return pretokens; - -err: - sk_TRUST_TOKEN_PRETOKEN_pop_free(pretokens, TRUST_TOKEN_PRETOKEN_free); - return NULL; -} - -static int hash_to_scalar_dleq(const VOPRF_METHOD *method, EC_SCALAR *out, - const EC_AFFINE *X, const EC_AFFINE *T, - const EC_AFFINE *W, const EC_AFFINE *K0, - const EC_AFFINE *K1) { - static const uint8_t kDLEQLabel[] = "DLEQ"; - - int ok = 0; - CBB cbb; - CBB_zero(&cbb); - uint8_t *buf = NULL; - size_t len; - if (!CBB_init(&cbb, 0) || - !CBB_add_bytes(&cbb, kDLEQLabel, sizeof(kDLEQLabel)) || - !cbb_add_point(&cbb, method->group, X) || - !cbb_add_point(&cbb, method->group, T) || - !cbb_add_point(&cbb, method->group, W) || - !cbb_add_point(&cbb, method->group, K0) || - !cbb_add_point(&cbb, method->group, K1) || - !CBB_finish(&cbb, &buf, &len) || - !method->hash_to_scalar(method->group, out, buf, len)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - 
goto err; - } - - ok = 1; - -err: - CBB_cleanup(&cbb); - OPENSSL_free(buf); - return ok; -} - -static int hash_to_scalar_batch(const VOPRF_METHOD *method, EC_SCALAR *out, - const CBB *points, size_t index) { - static const uint8_t kDLEQBatchLabel[] = "DLEQ BATCH"; - if (index > 0xffff) { - // The protocol supports only two-byte batches. - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_OVERFLOW); - return 0; - } - - int ok = 0; - CBB cbb; - CBB_zero(&cbb); - uint8_t *buf = NULL; - size_t len; - if (!CBB_init(&cbb, 0) || - !CBB_add_bytes(&cbb, kDLEQBatchLabel, sizeof(kDLEQBatchLabel)) || - !CBB_add_bytes(&cbb, CBB_data(points), CBB_len(points)) || - !CBB_add_u16(&cbb, (uint16_t)index) || - !CBB_finish(&cbb, &buf, &len) || - !method->hash_to_scalar(method->group, out, buf, len)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - goto err; - } - - ok = 1; - -err: - CBB_cleanup(&cbb); - OPENSSL_free(buf); - return ok; -} - -static int dleq_generate(const VOPRF_METHOD *method, CBB *cbb, - const TRUST_TOKEN_ISSUER_KEY *priv, - const EC_RAW_POINT *T, const EC_RAW_POINT *W) { - const EC_GROUP *group = method->group; - - enum { - idx_T, - idx_W, - idx_k0, - idx_k1, - num_idx, - }; - EC_RAW_POINT jacobians[num_idx]; - - // Setup the DLEQ proof. - EC_SCALAR r; - if (// r <- Zp - !ec_random_nonzero_scalar(group, &r, kDefaultAdditionalData) || - // k0;k1 = r*(G;T) - !ec_point_mul_scalar_base(group, &jacobians[idx_k0], &r) || - !ec_point_mul_scalar(group, &jacobians[idx_k1], T, &r)) { - return 0; - } - - EC_AFFINE affines[num_idx]; - jacobians[idx_T] = *T; - jacobians[idx_W] = *W; - if (!ec_jacobian_to_affine_batch(group, affines, jacobians, num_idx)) { - return 0; - } - - // Compute c = Hc(...). 
- EC_SCALAR c; - if (!hash_to_scalar_dleq(method, &c, &priv->pubs, &affines[idx_T], - &affines[idx_W], &affines[idx_k0], - &affines[idx_k1])) { - return 0; - } - - - EC_SCALAR c_mont; - ec_scalar_to_montgomery(group, &c_mont, &c); - - // u = r + c*xs - EC_SCALAR u; - ec_scalar_mul_montgomery(group, &u, &priv->xs, &c_mont); - ec_scalar_add(group, &u, &r, &u); - - // Store DLEQ proof in transcript. - if (!scalar_to_cbb(cbb, group, &c) || - !scalar_to_cbb(cbb, group, &u)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - return 0; - } - - return 1; -} - -static int mul_public_2(const EC_GROUP *group, EC_RAW_POINT *out, - const EC_RAW_POINT *p0, const EC_SCALAR *scalar0, - const EC_RAW_POINT *p1, const EC_SCALAR *scalar1) { - EC_RAW_POINT points[2] = {*p0, *p1}; - EC_SCALAR scalars[2] = {*scalar0, *scalar1}; - return ec_point_mul_scalar_public_batch(group, out, /*g_scalar=*/NULL, points, - scalars, 2); -} - -static int dleq_verify(const VOPRF_METHOD *method, CBS *cbs, - const TRUST_TOKEN_CLIENT_KEY *pub, const EC_RAW_POINT *T, - const EC_RAW_POINT *W) { - const EC_GROUP *group = method->group; - - - enum { - idx_T, - idx_W, - idx_k0, - idx_k1, - num_idx, - }; - EC_RAW_POINT jacobians[num_idx]; - - // Decode the DLEQ proof. - EC_SCALAR c, u; - if (!scalar_from_cbs(cbs, group, &c) || - !scalar_from_cbs(cbs, group, &u)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); - return 0; - } - - // k0;k1 = u*(G;T) - c*(pub;W) - EC_RAW_POINT pubs; - ec_affine_to_jacobian(group, &pubs, &pub->pubs); - EC_SCALAR minus_c; - ec_scalar_neg(group, &minus_c, &c); - if (!ec_point_mul_scalar_public(group, &jacobians[idx_k0], &u, &pubs, - &minus_c) || - !mul_public_2(group, &jacobians[idx_k1], T, &u, W, &minus_c)) { - return 0; - } - - // Check the DLEQ proof. - EC_AFFINE affines[num_idx]; - jacobians[idx_T] = *T; - jacobians[idx_W] = *W; - if (!ec_jacobian_to_affine_batch(group, affines, jacobians, num_idx)) { - return 0; - } - - // Compute c = Hc(...). 
- EC_SCALAR calculated; - if (!hash_to_scalar_dleq(method, &calculated, &pub->pubs, &affines[idx_T], - &affines[idx_W], &affines[idx_k0], - &affines[idx_k1])) { - return 0; - } - - // c == calculated - if (!ec_scalar_equal_vartime(group, &c, &calculated)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_INVALID_PROOF); - return 0; - } - - return 1; -} - -static int voprf_sign(const VOPRF_METHOD *method, - const TRUST_TOKEN_ISSUER_KEY *key, CBB *cbb, CBS *cbs, - size_t num_requested, size_t num_to_issue) { - const EC_GROUP *group = method->group; - if (num_requested < num_to_issue) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_INTERNAL_ERROR); - return 0; - } - - if (num_to_issue > ((size_t)-1) / sizeof(EC_RAW_POINT) || - num_to_issue > ((size_t)-1) / sizeof(EC_SCALAR)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_OVERFLOW); - return 0; - } - - int ret = 0; - EC_RAW_POINT *BTs = OPENSSL_malloc(num_to_issue * sizeof(EC_RAW_POINT)); - EC_RAW_POINT *Zs = OPENSSL_malloc(num_to_issue * sizeof(EC_RAW_POINT)); - EC_SCALAR *es = OPENSSL_malloc(num_to_issue * sizeof(EC_SCALAR)); - CBB batch_cbb; - CBB_zero(&batch_cbb); - if (!BTs || - !Zs || - !es || - !CBB_init(&batch_cbb, 0) || - !cbb_add_point(&batch_cbb, method->group, &key->pubs)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - goto err; - } - - for (size_t i = 0; i < num_to_issue; i++) { - EC_AFFINE BT_affine, Z_affine; - EC_RAW_POINT BT, Z; - if (!cbs_get_point(cbs, group, &BT_affine)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); - goto err; - } - ec_affine_to_jacobian(group, &BT, &BT_affine); - if (!ec_point_mul_scalar(group, &Z, &BT, &key->xs) || - !ec_jacobian_to_affine(group, &Z_affine, &Z) || - !cbb_add_point(cbb, group, &Z_affine)) { - goto err; - } - - if (!cbb_add_point(&batch_cbb, group, &BT_affine) || - !cbb_add_point(&batch_cbb, group, &Z_affine)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - goto err; - } - BTs[i] = BT; - Zs[i] = Z; - - if (!CBB_flush(cbb)) { - goto 
err; - } - } - - // The DLEQ batching construction is described in appendix B of - // https://eprint.iacr.org/2020/072/20200324:214215. Note the additional - // computations all act on public inputs. - for (size_t i = 0; i < num_to_issue; i++) { - if (!hash_to_scalar_batch(method, &es[i], &batch_cbb, i)) { - goto err; - } - } - - EC_RAW_POINT BT_batch, Z_batch; - if (!ec_point_mul_scalar_public_batch(group, &BT_batch, - /*g_scalar=*/NULL, BTs, es, - num_to_issue) || - !ec_point_mul_scalar_public_batch(group, &Z_batch, - /*g_scalar=*/NULL, Zs, es, - num_to_issue)) { - goto err; - } - - CBB proof; - if (!CBB_add_u16_length_prefixed(cbb, &proof) || - !dleq_generate(method, &proof, key, &BT_batch, &Z_batch) || - !CBB_flush(cbb)) { - goto err; - } - - // Skip over any unused requests. - size_t point_len = 1 + 2 * BN_num_bytes(&group->field); - if (!CBS_skip(cbs, point_len * (num_requested - num_to_issue))) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); - goto err; - } - - ret = 1; - -err: - OPENSSL_free(BTs); - OPENSSL_free(Zs); - OPENSSL_free(es); - CBB_cleanup(&batch_cbb); - return ret; -} - -static STACK_OF(TRUST_TOKEN) * - voprf_unblind(const VOPRF_METHOD *method, const TRUST_TOKEN_CLIENT_KEY *key, - const STACK_OF(TRUST_TOKEN_PRETOKEN) * pretokens, CBS *cbs, - size_t count, uint32_t key_id) { - const EC_GROUP *group = method->group; - if (count > sk_TRUST_TOKEN_PRETOKEN_num(pretokens)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); - return NULL; - } - - int ok = 0; - STACK_OF(TRUST_TOKEN) *ret = sk_TRUST_TOKEN_new_null(); - if (ret == NULL) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - return NULL; - } - - if (count > ((size_t)-1) / sizeof(EC_RAW_POINT) || - count > ((size_t)-1) / sizeof(EC_SCALAR)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_OVERFLOW); - return 0; - } - EC_RAW_POINT *BTs = OPENSSL_malloc(count * sizeof(EC_RAW_POINT)); - EC_RAW_POINT *Zs = OPENSSL_malloc(count * sizeof(EC_RAW_POINT)); - EC_SCALAR 
*es = OPENSSL_malloc(count * sizeof(EC_SCALAR)); - CBB batch_cbb; - CBB_zero(&batch_cbb); - if (!BTs || - !Zs || - !es || - !CBB_init(&batch_cbb, 0) || - !cbb_add_point(&batch_cbb, method->group, &key->pubs)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - goto err; - } - - for (size_t i = 0; i < count; i++) { - const TRUST_TOKEN_PRETOKEN *pretoken = - sk_TRUST_TOKEN_PRETOKEN_value(pretokens, i); - - EC_AFFINE Z_affine; - if (!cbs_get_point(cbs, group, &Z_affine)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); - goto err; - } - - ec_affine_to_jacobian(group, &BTs[i], &pretoken->Tp); - ec_affine_to_jacobian(group, &Zs[i], &Z_affine); - - if (!cbb_add_point(&batch_cbb, group, &pretoken->Tp) || - !cbb_add_point(&batch_cbb, group, &Z_affine)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - goto err; - } - - // Unblind the token. - // pretoken->r is rinv. - EC_RAW_POINT N; - EC_AFFINE N_affine; - if (!ec_point_mul_scalar(group, &N, &Zs[i], &pretoken->r) || - !ec_jacobian_to_affine(group, &N_affine, &N)) { - goto err; - } - - // Serialize the token. Include |key_id| to avoid an extra copy in the layer - // above. - CBB token_cbb; - size_t point_len = 1 + 2 * BN_num_bytes(&group->field); - if (!CBB_init(&token_cbb, 4 + TRUST_TOKEN_NONCE_SIZE + (2 + point_len)) || - !CBB_add_u32(&token_cbb, key_id) || - !CBB_add_bytes(&token_cbb, pretoken->t, TRUST_TOKEN_NONCE_SIZE) || - !cbb_add_point(&token_cbb, group, &N_affine) || - !CBB_flush(&token_cbb)) { - CBB_cleanup(&token_cbb); - goto err; - } - - TRUST_TOKEN *token = - TRUST_TOKEN_new(CBB_data(&token_cbb), CBB_len(&token_cbb)); - CBB_cleanup(&token_cbb); - if (token == NULL || - !sk_TRUST_TOKEN_push(ret, token)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_MALLOC_FAILURE); - TRUST_TOKEN_free(token); - goto err; - } - } - - // The DLEQ batching construction is described in appendix B of - // https://eprint.iacr.org/2020/072/20200324:214215. 
Note the additional - // computations all act on public inputs. - for (size_t i = 0; i < count; i++) { - if (!hash_to_scalar_batch(method, &es[i], &batch_cbb, i)) { - goto err; - } - } - - EC_RAW_POINT BT_batch, Z_batch; - if (!ec_point_mul_scalar_public_batch(group, &BT_batch, - /*g_scalar=*/NULL, BTs, es, count) || - !ec_point_mul_scalar_public_batch(group, &Z_batch, - /*g_scalar=*/NULL, Zs, es, count)) { - goto err; - } - - CBS proof; - if (!CBS_get_u16_length_prefixed(cbs, &proof) || - !dleq_verify(method, &proof, key, &BT_batch, &Z_batch) || - CBS_len(&proof) != 0) { - goto err; - } - - ok = 1; - -err: - OPENSSL_free(BTs); - OPENSSL_free(Zs); - OPENSSL_free(es); - CBB_cleanup(&batch_cbb); - if (!ok) { - sk_TRUST_TOKEN_pop_free(ret, TRUST_TOKEN_free); - ret = NULL; - } - return ret; -} - -static int voprf_read(const VOPRF_METHOD *method, - const TRUST_TOKEN_ISSUER_KEY *key, - uint8_t out_nonce[TRUST_TOKEN_NONCE_SIZE], - const uint8_t *token, size_t token_len) { - const EC_GROUP *group = method->group; - CBS cbs; - CBS_init(&cbs, token, token_len); - EC_AFFINE Ws; - if (!CBS_copy_bytes(&cbs, out_nonce, TRUST_TOKEN_NONCE_SIZE) || - !cbs_get_point(&cbs, group, &Ws) || - CBS_len(&cbs) != 0) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_INVALID_TOKEN); - return 0; - } - - - EC_RAW_POINT T; - if (!method->hash_to_group(group, &T, out_nonce)) { - return 0; - } - - EC_RAW_POINT Ws_calculated; - if (!ec_point_mul_scalar(group, &Ws_calculated, &T, &key->xs) || - !ec_affine_jacobian_equal(group, &Ws, &Ws_calculated)) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_BAD_VALIDITY_CHECK); - return 0; - } - - return 1; -} - - -// VOPRF experiment v2. 
- -static int voprf_exp2_hash_to_group(const EC_GROUP *group, EC_RAW_POINT *out, - const uint8_t t[TRUST_TOKEN_NONCE_SIZE]) { - const uint8_t kHashTLabel[] = "TrustToken VOPRF Experiment V2 HashToGroup"; - return ec_hash_to_curve_p384_xmd_sha512_sswu_draft07( - group, out, kHashTLabel, sizeof(kHashTLabel), t, TRUST_TOKEN_NONCE_SIZE); -} - -static int voprf_exp2_hash_to_scalar(const EC_GROUP *group, EC_SCALAR *out, - uint8_t *buf, size_t len) { - const uint8_t kHashCLabel[] = "TrustToken VOPRF Experiment V2 HashToScalar"; - return ec_hash_to_scalar_p384_xmd_sha512_draft07( - group, out, kHashCLabel, sizeof(kHashCLabel), buf, len); -} - -static int voprf_exp2_ok = 0; -static VOPRF_METHOD voprf_exp2_method; -static CRYPTO_once_t voprf_exp2_method_once = CRYPTO_ONCE_INIT; - -static void voprf_exp2_init_method_impl(void) { - voprf_exp2_ok = - voprf_init_method(&voprf_exp2_method, NID_secp384r1, - voprf_exp2_hash_to_group, voprf_exp2_hash_to_scalar); -} - -static int voprf_exp2_init_method(void) { - CRYPTO_once(&voprf_exp2_method_once, voprf_exp2_init_method_impl); - if (!voprf_exp2_ok) { - OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_INTERNAL_ERROR); - return 0; - } - return 1; -} - -int voprf_exp2_generate_key(CBB *out_private, CBB *out_public) { - if (!voprf_exp2_init_method()) { - return 0; - } - - return voprf_generate_key(&voprf_exp2_method, out_private, out_public); -} - -int voprf_exp2_derive_key_from_secret(CBB *out_private, CBB *out_public, - const uint8_t *secret, - size_t secret_len) { - if (!voprf_exp2_init_method()) { - return 0; - } - - return voprf_derive_key_from_secret(&voprf_exp2_method, out_private, - out_public, secret, secret_len); -} - -int voprf_exp2_client_key_from_bytes(TRUST_TOKEN_CLIENT_KEY *key, - const uint8_t *in, size_t len) { - if (!voprf_exp2_init_method()) { - return 0; - } - return voprf_client_key_from_bytes(&voprf_exp2_method, key, in, len); -} - -int voprf_exp2_issuer_key_from_bytes(TRUST_TOKEN_ISSUER_KEY *key, - const uint8_t *in, size_t 
len) { - if (!voprf_exp2_init_method()) { - return 0; - } - return voprf_issuer_key_from_bytes(&voprf_exp2_method, key, in, len); -} - -STACK_OF(TRUST_TOKEN_PRETOKEN) * voprf_exp2_blind(CBB *cbb, size_t count) { - if (!voprf_exp2_init_method()) { - return NULL; - } - return voprf_blind(&voprf_exp2_method, cbb, count); -} - -int voprf_exp2_sign(const TRUST_TOKEN_ISSUER_KEY *key, CBB *cbb, CBS *cbs, - size_t num_requested, size_t num_to_issue, - uint8_t private_metadata) { - if (!voprf_exp2_init_method() || private_metadata != 0) { - return 0; - } - return voprf_sign(&voprf_exp2_method, key, cbb, cbs, num_requested, - num_to_issue); -} - -STACK_OF(TRUST_TOKEN) * - voprf_exp2_unblind(const TRUST_TOKEN_CLIENT_KEY *key, - const STACK_OF(TRUST_TOKEN_PRETOKEN) * pretokens, - CBS *cbs, size_t count, uint32_t key_id) { - if (!voprf_exp2_init_method()) { - return NULL; - } - return voprf_unblind(&voprf_exp2_method, key, pretokens, cbs, count, - key_id); -} - -int voprf_exp2_read(const TRUST_TOKEN_ISSUER_KEY *key, - uint8_t out_nonce[TRUST_TOKEN_NONCE_SIZE], - uint8_t *out_private_metadata, const uint8_t *token, - size_t token_len) { - if (!voprf_exp2_init_method()) { - return 0; - } - return voprf_read(&voprf_exp2_method, key, out_nonce, token, token_len); -} diff --git a/third_party/boringssl/src/crypto/trust_token/voprf.cc b/third_party/boringssl/src/crypto/trust_token/voprf.cc new file mode 100644 index 00000000..1a302b23 --- /dev/null +++ b/third_party/boringssl/src/crypto/trust_token/voprf.cc @@ -0,0 +1,1266 @@ +// Copyright 2020 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../ec/internal.h" +#include "../fipsmodule/ec/internal.h" +#include "../mem_internal.h" + +#include "internal.h" + + +using namespace bssl; + +typedef int (*hash_to_group_func_t)(const EC_GROUP *group, EC_JACOBIAN *out, + const uint8_t t[TRUST_TOKEN_NONCE_SIZE]); +typedef int (*hash_to_scalar_func_t)(const EC_GROUP *group, EC_SCALAR *out, + uint8_t *buf, size_t len); + +typedef struct { + const EC_GROUP *(*group_func)(); + + // hash_to_group implements the HashToGroup operation for VOPRFs. It returns + // one on success and zero on error. + hash_to_group_func_t hash_to_group; + // hash_to_scalar implements the HashToScalar operation for VOPRFs. It returns + // one on success and zero on error. 
+ hash_to_scalar_func_t hash_to_scalar; +} VOPRF_METHOD; + +static const uint8_t kDefaultAdditionalData[32] = {0}; + +static int cbb_add_point(CBB *out, const EC_GROUP *group, + const EC_AFFINE *point) { + uint8_t *p; + size_t len = ec_point_byte_len(group, POINT_CONVERSION_UNCOMPRESSED); + return CBB_add_space(out, &p, len) && + ec_point_to_bytes(group, point, POINT_CONVERSION_UNCOMPRESSED, p, + len) == len && + CBB_flush(out); +} + +static int cbb_serialize_point(CBB *out, const EC_GROUP *group, + const EC_AFFINE *point) { + uint8_t *p; + size_t len = ec_point_byte_len(group, POINT_CONVERSION_COMPRESSED); + return CBB_add_u16(out, len) && CBB_add_space(out, &p, len) && + ec_point_to_bytes(group, point, POINT_CONVERSION_COMPRESSED, p, len) == + len && + CBB_flush(out); +} + +static int cbs_get_point(CBS *cbs, const EC_GROUP *group, EC_AFFINE *out) { + CBS child; + size_t plen = ec_point_byte_len(group, POINT_CONVERSION_UNCOMPRESSED); + if (!CBS_get_bytes(cbs, &child, plen) || + !ec_point_from_uncompressed(group, out, CBS_data(&child), + CBS_len(&child))) { + return 0; + } + return 1; +} + +static int scalar_to_cbb(CBB *out, const EC_GROUP *group, + const EC_SCALAR *scalar) { + uint8_t *buf; + size_t scalar_len = BN_num_bytes(EC_GROUP_get0_order(group)); + if (!CBB_add_space(out, &buf, scalar_len)) { + return 0; + } + ec_scalar_to_bytes(group, buf, &scalar_len, scalar); + return 1; +} + +static int scalar_from_cbs(CBS *cbs, const EC_GROUP *group, EC_SCALAR *out) { + size_t scalar_len = BN_num_bytes(EC_GROUP_get0_order(group)); + CBS tmp; + if (!CBS_get_bytes(cbs, &tmp, scalar_len)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); + return 0; + } + + ec_scalar_from_bytes(group, out, CBS_data(&tmp), CBS_len(&tmp)); + return 1; +} + +static int voprf_calculate_key(const VOPRF_METHOD *method, CBB *out_private, + CBB *out_public, const EC_SCALAR *priv) { + const EC_GROUP *group = method->group_func(); + EC_JACOBIAN pub; + EC_AFFINE pub_affine; + if 
(!ec_point_mul_scalar_base(group, &pub, priv) || + !ec_jacobian_to_affine(group, &pub_affine, &pub)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_KEYGEN_FAILURE); + return 0; + } + + if (!scalar_to_cbb(out_private, group, priv) || + !cbb_add_point(out_public, group, &pub_affine)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_BUFFER_TOO_SMALL); + return 0; + } + + return 1; +} + + +static int voprf_generate_key(const VOPRF_METHOD *method, CBB *out_private, + CBB *out_public) { + EC_SCALAR priv; + if (!ec_random_nonzero_scalar(method->group_func(), &priv, + kDefaultAdditionalData)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_KEYGEN_FAILURE); + return 0; + } + return voprf_calculate_key(method, out_private, out_public, &priv); +} + +static int voprf_derive_key_from_secret(const VOPRF_METHOD *method, + CBB *out_private, CBB *out_public, + const uint8_t *secret, + size_t secret_len) { + static const uint8_t kKeygenLabel[] = "TrustTokenVOPRFKeyGen"; + + EC_SCALAR priv; + int ok = 0; + CBB cbb; + CBB_zero(&cbb); + uint8_t *buf = nullptr; + size_t len; + if (!CBB_init(&cbb, 0) || + !CBB_add_bytes(&cbb, kKeygenLabel, sizeof(kKeygenLabel)) || + !CBB_add_bytes(&cbb, secret, secret_len) || + !CBB_finish(&cbb, &buf, &len) || + !method->hash_to_scalar(method->group_func(), &priv, buf, len)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_KEYGEN_FAILURE); + goto err; + } + + ok = voprf_calculate_key(method, out_private, out_public, &priv); + +err: + CBB_cleanup(&cbb); + OPENSSL_free(buf); + return ok; +} + +static int voprf_client_key_from_bytes(const VOPRF_METHOD *method, + TRUST_TOKEN_CLIENT_KEY *key, + const uint8_t *in, size_t len) { + const EC_GROUP *group = method->group_func(); + if (!ec_point_from_uncompressed(group, &key->pubs, in, len)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); + return 0; + } + + return 1; +} + +static int voprf_issuer_key_from_bytes(const VOPRF_METHOD *method, + TRUST_TOKEN_ISSUER_KEY *key, + const uint8_t *in, 
size_t len) { + const EC_GROUP *group = method->group_func(); + if (!ec_scalar_from_bytes(group, &key->xs, in, len)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); + return 0; + } + + // Recompute the public key. + EC_JACOBIAN pub; + if (!ec_point_mul_scalar_base(group, &pub, &key->xs) || + !ec_jacobian_to_affine(group, &key->pubs, &pub)) { + return 0; + } + + return 1; +} + +static STACK_OF(TRUST_TOKEN_PRETOKEN) *voprf_blind(const VOPRF_METHOD *method, + CBB *cbb, size_t count, + int include_message, + const uint8_t *msg, + size_t msg_len) { + SHA512_CTX hash_ctx; + + const EC_GROUP *group = method->group_func(); + STACK_OF(TRUST_TOKEN_PRETOKEN) *pretokens = + sk_TRUST_TOKEN_PRETOKEN_new_null(); + if (pretokens == nullptr) { + goto err; + } + + for (size_t i = 0; i < count; i++) { + // Insert |pretoken| into |pretokens| early to simplify error-handling. + TRUST_TOKEN_PRETOKEN *pretoken = New(); + if (pretoken == nullptr || + !sk_TRUST_TOKEN_PRETOKEN_push(pretokens, pretoken)) { + TRUST_TOKEN_PRETOKEN_free(pretoken); + goto err; + } + + RAND_bytes(pretoken->salt, sizeof(pretoken->salt)); + if (include_message) { + assert(SHA512_DIGEST_LENGTH == TRUST_TOKEN_NONCE_SIZE); + SHA512_Init(&hash_ctx); + SHA512_Update(&hash_ctx, pretoken->salt, sizeof(pretoken->salt)); + SHA512_Update(&hash_ctx, msg, msg_len); + SHA512_Final(pretoken->t, &hash_ctx); + } else { + OPENSSL_memcpy(pretoken->t, pretoken->salt, TRUST_TOKEN_NONCE_SIZE); + } + + // We sample r in Montgomery form to simplify inverting. + EC_SCALAR r; + if (!ec_random_nonzero_scalar(group, &r, kDefaultAdditionalData)) { + goto err; + } + + // pretoken->r is rinv. + ec_scalar_inv0_montgomery(group, &pretoken->r, &r); + // Convert both out of Montgomery form. + ec_scalar_from_montgomery(group, &r, &r); + ec_scalar_from_montgomery(group, &pretoken->r, &pretoken->r); + + // Tp is the blinded token in the VOPRF protocol. 
+ EC_JACOBIAN P, Tp; + if (!method->hash_to_group(group, &P, pretoken->t) || + !ec_point_mul_scalar(group, &Tp, &P, &r) || + !ec_jacobian_to_affine(group, &pretoken->Tp, &Tp)) { + goto err; + } + + if (!cbb_add_point(cbb, group, &pretoken->Tp)) { + goto err; + } + } + + return pretokens; + +err: + sk_TRUST_TOKEN_PRETOKEN_pop_free(pretokens, TRUST_TOKEN_PRETOKEN_free); + return nullptr; +} + +static int hash_to_scalar_dleq(const VOPRF_METHOD *method, EC_SCALAR *out, + const EC_AFFINE *X, const EC_AFFINE *T, + const EC_AFFINE *W, const EC_AFFINE *K0, + const EC_AFFINE *K1) { + static const uint8_t kDLEQLabel[] = "DLEQ"; + + const EC_GROUP *group = method->group_func(); + int ok = 0; + CBB cbb; + CBB_zero(&cbb); + uint8_t *buf = nullptr; + size_t len; + if (!CBB_init(&cbb, 0) || + !CBB_add_bytes(&cbb, kDLEQLabel, sizeof(kDLEQLabel)) || + !cbb_add_point(&cbb, group, X) || !cbb_add_point(&cbb, group, T) || + !cbb_add_point(&cbb, group, W) || !cbb_add_point(&cbb, group, K0) || + !cbb_add_point(&cbb, group, K1) || !CBB_finish(&cbb, &buf, &len) || + !method->hash_to_scalar(group, out, buf, len)) { + goto err; + } + + ok = 1; + +err: + CBB_cleanup(&cbb); + OPENSSL_free(buf); + return ok; +} + +static int hash_to_scalar_challenge(const VOPRF_METHOD *method, EC_SCALAR *out, + const EC_AFFINE *Bm, const EC_AFFINE *a0, + const EC_AFFINE *a1, const EC_AFFINE *a2, + const EC_AFFINE *a3) { + static const uint8_t kChallengeLabel[] = "Challenge"; + + const EC_GROUP *group = method->group_func(); + CBB cbb; + uint8_t transcript[5 * EC_MAX_COMPRESSED + 2 + sizeof(kChallengeLabel) - 1]; + size_t len; + if (!CBB_init_fixed(&cbb, transcript, sizeof(transcript)) || + !cbb_serialize_point(&cbb, group, Bm) || + !cbb_serialize_point(&cbb, group, a0) || + !cbb_serialize_point(&cbb, group, a1) || + !cbb_serialize_point(&cbb, group, a2) || + !cbb_serialize_point(&cbb, group, a3) || + !CBB_add_bytes(&cbb, kChallengeLabel, sizeof(kChallengeLabel) - 1) || + !CBB_finish(&cbb, nullptr, &len) || + 
!method->hash_to_scalar(group, out, transcript, len)) { + return 0; + } + + return 1; +} + +static int hash_to_scalar_batch(const VOPRF_METHOD *method, EC_SCALAR *out, + const CBB *points, size_t index) { + static const uint8_t kDLEQBatchLabel[] = "DLEQ BATCH"; + if (index > 0xffff) { + // The protocol supports only two-byte batches. + OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_OVERFLOW); + return 0; + } + + int ok = 0; + CBB cbb; + CBB_zero(&cbb); + uint8_t *buf = nullptr; + size_t len; + if (!CBB_init(&cbb, 0) || + !CBB_add_bytes(&cbb, kDLEQBatchLabel, sizeof(kDLEQBatchLabel)) || + !CBB_add_bytes(&cbb, CBB_data(points), CBB_len(points)) || + !CBB_add_u16(&cbb, (uint16_t)index) || !CBB_finish(&cbb, &buf, &len) || + !method->hash_to_scalar(method->group_func(), out, buf, len)) { + goto err; + } + + ok = 1; + +err: + CBB_cleanup(&cbb); + OPENSSL_free(buf); + return ok; +} + +static int dleq_generate(const VOPRF_METHOD *method, CBB *cbb, + const TRUST_TOKEN_ISSUER_KEY *priv, + const EC_JACOBIAN *T, const EC_JACOBIAN *W) { + const EC_GROUP *group = method->group_func(); + + enum { + idx_T, + idx_W, + idx_k0, + idx_k1, + num_idx, + }; + EC_JACOBIAN jacobians[num_idx]; + + // Setup the DLEQ proof. + EC_SCALAR r; + if ( // r <- Zp + !ec_random_nonzero_scalar(group, &r, kDefaultAdditionalData) || + // k0;k1 = r*(G;T) + !ec_point_mul_scalar_base(group, &jacobians[idx_k0], &r) || + !ec_point_mul_scalar(group, &jacobians[idx_k1], T, &r)) { + return 0; + } + + EC_AFFINE affines[num_idx]; + jacobians[idx_T] = *T; + jacobians[idx_W] = *W; + if (!ec_jacobian_to_affine_batch(group, affines, jacobians, num_idx)) { + return 0; + } + + // Compute c = Hc(...). 
+ EC_SCALAR c; + if (!hash_to_scalar_dleq(method, &c, &priv->pubs, &affines[idx_T], + &affines[idx_W], &affines[idx_k0], + &affines[idx_k1])) { + return 0; + } + + + EC_SCALAR c_mont; + ec_scalar_to_montgomery(group, &c_mont, &c); + + // u = r + c*xs + EC_SCALAR u; + ec_scalar_mul_montgomery(group, &u, &priv->xs, &c_mont); + ec_scalar_add(group, &u, &r, &u); + + // Store DLEQ proof in transcript. + if (!scalar_to_cbb(cbb, group, &c) || !scalar_to_cbb(cbb, group, &u)) { + return 0; + } + + return 1; +} + +static int mul_public_2(const EC_GROUP *group, EC_JACOBIAN *out, + const EC_JACOBIAN *p0, const EC_SCALAR *scalar0, + const EC_JACOBIAN *p1, const EC_SCALAR *scalar1) { + EC_JACOBIAN points[2] = {*p0, *p1}; + EC_SCALAR scalars[2] = {*scalar0, *scalar1}; + return ec_point_mul_scalar_public_batch(group, out, /*g_scalar=*/nullptr, + points, scalars, 2); +} + +static int dleq_verify(const VOPRF_METHOD *method, CBS *cbs, + const TRUST_TOKEN_CLIENT_KEY *pub, const EC_JACOBIAN *T, + const EC_JACOBIAN *W) { + const EC_GROUP *group = method->group_func(); + + + enum { + idx_T, + idx_W, + idx_k0, + idx_k1, + num_idx, + }; + EC_JACOBIAN jacobians[num_idx]; + + // Decode the DLEQ proof. + EC_SCALAR c, u; + if (!scalar_from_cbs(cbs, group, &c) || !scalar_from_cbs(cbs, group, &u)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); + return 0; + } + + // k0;k1 = u*(G;T) - c*(pub;W) + EC_JACOBIAN pubs; + ec_affine_to_jacobian(group, &pubs, &pub->pubs); + EC_SCALAR minus_c; + ec_scalar_neg(group, &minus_c, &c); + if (!ec_point_mul_scalar_public(group, &jacobians[idx_k0], &u, &pubs, + &minus_c) || + !mul_public_2(group, &jacobians[idx_k1], T, &u, W, &minus_c)) { + return 0; + } + + // Check the DLEQ proof. + EC_AFFINE affines[num_idx]; + jacobians[idx_T] = *T; + jacobians[idx_W] = *W; + if (!ec_jacobian_to_affine_batch(group, affines, jacobians, num_idx)) { + return 0; + } + + // Compute c = Hc(...). 
+ EC_SCALAR calculated; + if (!hash_to_scalar_dleq(method, &calculated, &pub->pubs, &affines[idx_T], + &affines[idx_W], &affines[idx_k0], + &affines[idx_k1])) { + return 0; + } + + // c == calculated + if (!ec_scalar_equal_vartime(group, &c, &calculated)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_INVALID_PROOF); + return 0; + } + + return 1; +} + +static int voprf_sign_tt(const VOPRF_METHOD *method, + const TRUST_TOKEN_ISSUER_KEY *key, CBB *cbb, CBS *cbs, + size_t num_requested, size_t num_to_issue) { + const EC_GROUP *group = method->group_func(); + if (num_requested < num_to_issue) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_INTERNAL_ERROR); + return 0; + } + + int ret = 0; + EC_JACOBIAN *BTs = reinterpret_cast( + OPENSSL_calloc(num_to_issue, sizeof(EC_JACOBIAN))); + EC_JACOBIAN *Zs = reinterpret_cast( + OPENSSL_calloc(num_to_issue, sizeof(EC_JACOBIAN))); + EC_SCALAR *es = reinterpret_cast( + OPENSSL_calloc(num_to_issue, sizeof(EC_SCALAR))); + CBB batch_cbb; + CBB_zero(&batch_cbb); + + { + if (!BTs || !Zs || !es || !CBB_init(&batch_cbb, 0) || + !cbb_add_point(&batch_cbb, group, &key->pubs)) { + goto err; + } + + for (size_t i = 0; i < num_to_issue; i++) { + EC_AFFINE BT_affine, Z_affine; + EC_JACOBIAN BT, Z; + if (!cbs_get_point(cbs, group, &BT_affine)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); + goto err; + } + ec_affine_to_jacobian(group, &BT, &BT_affine); + if (!ec_point_mul_scalar(group, &Z, &BT, &key->xs) || + !ec_jacobian_to_affine(group, &Z_affine, &Z) || + !cbb_add_point(cbb, group, &Z_affine)) { + goto err; + } + + if (!cbb_add_point(&batch_cbb, group, &BT_affine) || + !cbb_add_point(&batch_cbb, group, &Z_affine)) { + goto err; + } + BTs[i] = BT; + Zs[i] = Z; + + if (!CBB_flush(cbb)) { + goto err; + } + } + + // The DLEQ batching construction is described in appendix B of + // https://eprint.iacr.org/2020/072/20200324:214215. Note the additional + // computations all act on public inputs. 
+ for (size_t i = 0; i < num_to_issue; i++) { + if (!hash_to_scalar_batch(method, &es[i], &batch_cbb, i)) { + goto err; + } + } + + EC_JACOBIAN BT_batch, Z_batch; + if (!ec_point_mul_scalar_public_batch(group, &BT_batch, + /*g_scalar=*/nullptr, BTs, es, + num_to_issue) || + !ec_point_mul_scalar_public_batch(group, &Z_batch, + /*g_scalar=*/nullptr, Zs, es, + num_to_issue)) { + goto err; + } + + CBB proof; + if (!CBB_add_u16_length_prefixed(cbb, &proof) || + !dleq_generate(method, &proof, key, &BT_batch, &Z_batch) || + !CBB_flush(cbb)) { + goto err; + } + + // Skip over any unused requests. + size_t point_len = ec_point_byte_len(group, POINT_CONVERSION_UNCOMPRESSED); + if (!CBS_skip(cbs, point_len * (num_requested - num_to_issue))) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); + goto err; + } + + ret = 1; + } + +err: + Delete(BTs); + Delete(Zs); + Delete(es); + CBB_cleanup(&batch_cbb); + return ret; +} + +static STACK_OF(TRUST_TOKEN) *voprf_unblind_tt( + const VOPRF_METHOD *method, const TRUST_TOKEN_CLIENT_KEY *key, + const STACK_OF(TRUST_TOKEN_PRETOKEN) *pretokens, CBS *cbs, size_t count, + uint32_t key_id) { + const EC_GROUP *group = method->group_func(); + if (count > sk_TRUST_TOKEN_PRETOKEN_num(pretokens)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); + return nullptr; + } + + int ok = 0; + STACK_OF(TRUST_TOKEN) *ret = sk_TRUST_TOKEN_new_null(); + EC_JACOBIAN *BTs = reinterpret_cast( + OPENSSL_calloc(count, sizeof(EC_JACOBIAN))); + EC_JACOBIAN *Zs = reinterpret_cast( + OPENSSL_calloc(count, sizeof(EC_JACOBIAN))); + EC_SCALAR *es = + reinterpret_cast(OPENSSL_calloc(count, sizeof(EC_SCALAR))); + CBB batch_cbb; + CBB_zero(&batch_cbb); + if (ret == nullptr || BTs == nullptr || Zs == nullptr || es == nullptr || + !CBB_init(&batch_cbb, 0) || + !cbb_add_point(&batch_cbb, group, &key->pubs)) { + goto err; + } + + for (size_t i = 0; i < count; i++) { + const TRUST_TOKEN_PRETOKEN *pretoken = + sk_TRUST_TOKEN_PRETOKEN_value(pretokens, 
i); + + EC_AFFINE Z_affine; + if (!cbs_get_point(cbs, group, &Z_affine)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); + goto err; + } + + ec_affine_to_jacobian(group, &BTs[i], &pretoken->Tp); + ec_affine_to_jacobian(group, &Zs[i], &Z_affine); + + if (!cbb_add_point(&batch_cbb, group, &pretoken->Tp) || + !cbb_add_point(&batch_cbb, group, &Z_affine)) { + goto err; + } + + // Unblind the token. + // pretoken->r is rinv. + EC_JACOBIAN N; + EC_AFFINE N_affine; + if (!ec_point_mul_scalar(group, &N, &Zs[i], &pretoken->r) || + !ec_jacobian_to_affine(group, &N_affine, &N)) { + goto err; + } + + // Serialize the token. Include |key_id| to avoid an extra copy in the layer + // above. + CBB token_cbb; + size_t point_len = ec_point_byte_len(group, POINT_CONVERSION_UNCOMPRESSED); + if (!CBB_init(&token_cbb, 4 + TRUST_TOKEN_NONCE_SIZE + (2 + point_len)) || + !CBB_add_u32(&token_cbb, key_id) || + !CBB_add_bytes(&token_cbb, pretoken->salt, TRUST_TOKEN_NONCE_SIZE) || + !cbb_add_point(&token_cbb, group, &N_affine) || + !CBB_flush(&token_cbb)) { + CBB_cleanup(&token_cbb); + goto err; + } + + TRUST_TOKEN *token = + TRUST_TOKEN_new(CBB_data(&token_cbb), CBB_len(&token_cbb)); + CBB_cleanup(&token_cbb); + if (token == nullptr || !sk_TRUST_TOKEN_push(ret, token)) { + TRUST_TOKEN_free(token); + goto err; + } + } + + // The DLEQ batching construction is described in appendix B of + // https://eprint.iacr.org/2020/072/20200324:214215. Note the additional + // computations all act on public inputs. 
+ for (size_t i = 0; i < count; i++) { + if (!hash_to_scalar_batch(method, &es[i], &batch_cbb, i)) { + goto err; + } + } + + EC_JACOBIAN BT_batch, Z_batch; + if (!ec_point_mul_scalar_public_batch(group, &BT_batch, + /*g_scalar=*/nullptr, BTs, es, count) || + !ec_point_mul_scalar_public_batch(group, &Z_batch, + /*g_scalar=*/nullptr, Zs, es, count)) { + goto err; + } + + CBS proof; + if (!CBS_get_u16_length_prefixed(cbs, &proof) || + !dleq_verify(method, &proof, key, &BT_batch, &Z_batch) || + CBS_len(&proof) != 0) { + goto err; + } + + ok = 1; + +err: + Delete(BTs); + Delete(Zs); + Delete(es); + CBB_cleanup(&batch_cbb); + if (!ok) { + sk_TRUST_TOKEN_pop_free(ret, TRUST_TOKEN_free); + ret = nullptr; + } + return ret; +} + +static void sha384_update_u16(SHA512_CTX *ctx, uint16_t v) { + uint8_t buf[2] = {static_cast(v >> 8), + static_cast(v & 0xff)}; + SHA384_Update(ctx, buf, 2); +} + +static void sha384_update_point_with_length(SHA512_CTX *ctx, + const EC_GROUP *group, + const EC_AFFINE *point) { + uint8_t buf[EC_MAX_COMPRESSED]; + size_t len = ec_point_to_bytes(group, point, POINT_CONVERSION_COMPRESSED, buf, + sizeof(buf)); + assert(len > 0); + sha384_update_u16(ctx, (uint16_t)len); + SHA384_Update(ctx, buf, len); +} + +static int compute_composite_seed(const VOPRF_METHOD *method, + uint8_t out[SHA384_DIGEST_LENGTH], + const EC_AFFINE *pub) { + const EC_GROUP *group = method->group_func(); + static const uint8_t kSeedDST[] = "Seed-OPRFV1-\x01-P384-SHA384"; + + SHA512_CTX hash_ctx; + SHA384_Init(&hash_ctx); + sha384_update_point_with_length(&hash_ctx, group, pub); + sha384_update_u16(&hash_ctx, sizeof(kSeedDST) - 1); + SHA384_Update(&hash_ctx, kSeedDST, sizeof(kSeedDST) - 1); + SHA384_Final(out, &hash_ctx); + + return 1; +} + +static int compute_composite_element(const VOPRF_METHOD *method, + uint8_t seed[SHA384_DIGEST_LENGTH], + EC_SCALAR *di, size_t index, + const EC_AFFINE *C, const EC_AFFINE *D) { + static const uint8_t kCompositeLabel[] = "Composite"; + const 
EC_GROUP *group = method->group_func(); + + if (index > UINT16_MAX) { + return 0; + } + + CBB cbb; + uint8_t transcript[2 + SHA384_DIGEST_LENGTH + 2 + 2 * EC_MAX_COMPRESSED + + sizeof(kCompositeLabel) - 1]; + size_t len; + if (!CBB_init_fixed(&cbb, transcript, sizeof(transcript)) || + !CBB_add_u16(&cbb, SHA384_DIGEST_LENGTH) || + !CBB_add_bytes(&cbb, seed, SHA384_DIGEST_LENGTH) || + !CBB_add_u16(&cbb, index) || !cbb_serialize_point(&cbb, group, C) || + !cbb_serialize_point(&cbb, group, D) || + !CBB_add_bytes(&cbb, kCompositeLabel, sizeof(kCompositeLabel) - 1) || + !CBB_finish(&cbb, nullptr, &len) || + !method->hash_to_scalar(group, di, transcript, len)) { + return 0; + } + + return 1; +} + +static int generate_proof(const VOPRF_METHOD *method, CBB *cbb, + const TRUST_TOKEN_ISSUER_KEY *priv, + const EC_SCALAR *r, const EC_JACOBIAN *M, + const EC_JACOBIAN *Z) { + const EC_GROUP *group = method->group_func(); + + enum { + idx_M, + idx_Z, + idx_t2, + idx_t3, + num_idx, + }; + EC_JACOBIAN jacobians[num_idx]; + + if (!ec_point_mul_scalar_base(group, &jacobians[idx_t2], r) || + !ec_point_mul_scalar(group, &jacobians[idx_t3], M, r)) { + return 0; + } + + + EC_AFFINE affines[num_idx]; + jacobians[idx_M] = *M; + jacobians[idx_Z] = *Z; + if (!ec_jacobian_to_affine_batch(group, affines, jacobians, num_idx)) { + return 0; + } + + EC_SCALAR c; + if (!hash_to_scalar_challenge(method, &c, &priv->pubs, &affines[idx_M], + &affines[idx_Z], &affines[idx_t2], + &affines[idx_t3])) { + return 0; + } + + EC_SCALAR c_mont; + ec_scalar_to_montgomery(group, &c_mont, &c); + + // s = r - c*xs + EC_SCALAR s; + ec_scalar_mul_montgomery(group, &s, &priv->xs, &c_mont); + ec_scalar_sub(group, &s, r, &s); + + // Store DLEQ proof in transcript. 
+ if (!scalar_to_cbb(cbb, group, &c) || !scalar_to_cbb(cbb, group, &s)) { + return 0; + } + + return 1; +} + +static int verify_proof(const VOPRF_METHOD *method, CBS *cbs, + const TRUST_TOKEN_CLIENT_KEY *pub, const EC_JACOBIAN *M, + const EC_JACOBIAN *Z) { + const EC_GROUP *group = method->group_func(); + + enum { + idx_M, + idx_Z, + idx_t2, + idx_t3, + num_idx, + }; + EC_JACOBIAN jacobians[num_idx]; + + EC_SCALAR c, s; + if (!scalar_from_cbs(cbs, group, &c) || !scalar_from_cbs(cbs, group, &s)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); + return 0; + } + + EC_JACOBIAN pubs; + ec_affine_to_jacobian(group, &pubs, &pub->pubs); + if (!ec_point_mul_scalar_public(group, &jacobians[idx_t2], &s, &pubs, &c) || + !mul_public_2(group, &jacobians[idx_t3], M, &s, Z, &c)) { + return 0; + } + + EC_AFFINE affines[num_idx]; + jacobians[idx_M] = *M; + jacobians[idx_Z] = *Z; + if (!ec_jacobian_to_affine_batch(group, affines, jacobians, num_idx)) { + return 0; + } + + EC_SCALAR expected_c; + if (!hash_to_scalar_challenge(method, &expected_c, &pub->pubs, + &affines[idx_M], &affines[idx_Z], + &affines[idx_t2], &affines[idx_t3])) { + return 0; + } + + // c == expected_c + if (!ec_scalar_equal_vartime(group, &c, &expected_c)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_INVALID_PROOF); + return 0; + } + + return 1; +} + +static int voprf_sign_impl(const VOPRF_METHOD *method, + const TRUST_TOKEN_ISSUER_KEY *key, CBB *cbb, + CBS *cbs, size_t num_requested, size_t num_to_issue, + const EC_SCALAR *proof_scalar) { + const EC_GROUP *group = method->group_func(); + if (num_requested < num_to_issue) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, ERR_R_INTERNAL_ERROR); + return 0; + } + + int ret = 0; + EC_JACOBIAN *BTs = reinterpret_cast( + OPENSSL_calloc(num_to_issue, sizeof(EC_JACOBIAN))); + EC_JACOBIAN *Zs = reinterpret_cast( + OPENSSL_calloc(num_to_issue, sizeof(EC_JACOBIAN))); + EC_SCALAR *dis = reinterpret_cast( + OPENSSL_calloc(num_to_issue, sizeof(EC_SCALAR))); + + { + if 
(!BTs || !Zs || !dis) { + goto err; + } + + uint8_t seed[SHA384_DIGEST_LENGTH]; + if (!compute_composite_seed(method, seed, &key->pubs)) { + goto err; + } + + // This implements the BlindEvaluateBatch as defined in section 4 of + // draft-robert-privacypass-batched-tokens-01, based on the constructions + // in draft-irtf-cfrg-voprf-21. To optimize the computation of the proof, + // the computation of di is done during the token signing and passed into + // the proof generation. + for (size_t i = 0; i < num_to_issue; i++) { + EC_AFFINE BT_affine, Z_affine; + EC_JACOBIAN BT, Z; + if (!cbs_get_point(cbs, group, &BT_affine)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); + goto err; + } + ec_affine_to_jacobian(group, &BT, &BT_affine); + if (!ec_point_mul_scalar(group, &Z, &BT, &key->xs) || + !ec_jacobian_to_affine(group, &Z_affine, &Z) || + !cbb_add_point(cbb, group, &Z_affine)) { + goto err; + } + BTs[i] = BT; + Zs[i] = Z; + if (!compute_composite_element(method, seed, &dis[i], i, &BT_affine, + &Z_affine)) { + goto err; + } + + if (!CBB_flush(cbb)) { + goto err; + } + } + + EC_JACOBIAN M, Z; + if (!ec_point_mul_scalar_public_batch(group, &M, + /*g_scalar=*/nullptr, BTs, dis, + num_to_issue) || + !ec_point_mul_scalar(group, &Z, &M, &key->xs)) { + goto err; + } + + CBB proof; + if (!CBB_add_u16_length_prefixed(cbb, &proof) || + !generate_proof(method, &proof, key, proof_scalar, &M, &Z) || + !CBB_flush(cbb)) { + goto err; + } + + // Skip over any unused requests. 
+ size_t point_len = ec_point_byte_len(group, POINT_CONVERSION_UNCOMPRESSED); + if (!CBS_skip(cbs, point_len * (num_requested - num_to_issue))) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); + goto err; + } + + ret = 1; + } + +err: + Delete(BTs); + Delete(Zs); + Delete(dis); + return ret; +} + +static int voprf_sign(const VOPRF_METHOD *method, + const TRUST_TOKEN_ISSUER_KEY *key, CBB *cbb, CBS *cbs, + size_t num_requested, size_t num_to_issue) { + EC_SCALAR proof_scalar; + if (!ec_random_nonzero_scalar(method->group_func(), &proof_scalar, + kDefaultAdditionalData)) { + return 0; + } + + return voprf_sign_impl(method, key, cbb, cbs, num_requested, num_to_issue, + &proof_scalar); +} + +static int voprf_sign_with_proof_scalar_for_testing( + const VOPRF_METHOD *method, const TRUST_TOKEN_ISSUER_KEY *key, CBB *cbb, + CBS *cbs, size_t num_requested, size_t num_to_issue, + const uint8_t *proof_scalar_buf, size_t proof_scalar_len) { + EC_SCALAR proof_scalar; + if (!ec_scalar_from_bytes(method->group_func(), &proof_scalar, + proof_scalar_buf, proof_scalar_len)) { + return 0; + } + return voprf_sign_impl(method, key, cbb, cbs, num_requested, num_to_issue, + &proof_scalar); +} + +static STACK_OF(TRUST_TOKEN) *voprf_unblind( + const VOPRF_METHOD *method, const TRUST_TOKEN_CLIENT_KEY *key, + const STACK_OF(TRUST_TOKEN_PRETOKEN) *pretokens, CBS *cbs, size_t count, + uint32_t key_id) { + const EC_GROUP *group = method->group_func(); + if (count > sk_TRUST_TOKEN_PRETOKEN_num(pretokens)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); + return nullptr; + } + + int ok = 0; + STACK_OF(TRUST_TOKEN) *ret = sk_TRUST_TOKEN_new_null(); + EC_JACOBIAN *BTs = reinterpret_cast( + OPENSSL_calloc(count, sizeof(EC_JACOBIAN))); + EC_JACOBIAN *Zs = reinterpret_cast( + OPENSSL_calloc(count, sizeof(EC_JACOBIAN))); + EC_SCALAR *dis = + reinterpret_cast(OPENSSL_calloc(count, sizeof(EC_SCALAR))); + if (ret == nullptr || !BTs || !Zs || !dis) { + goto err; + } + + 
uint8_t seed[SHA384_DIGEST_LENGTH]; + if (!compute_composite_seed(method, seed, &key->pubs)) { + goto err; + } + + for (size_t i = 0; i < count; i++) { + const TRUST_TOKEN_PRETOKEN *pretoken = + sk_TRUST_TOKEN_PRETOKEN_value(pretokens, i); + + EC_AFFINE Z_affine; + if (!cbs_get_point(cbs, group, &Z_affine)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_DECODE_FAILURE); + goto err; + } + + ec_affine_to_jacobian(group, &BTs[i], &pretoken->Tp); + ec_affine_to_jacobian(group, &Zs[i], &Z_affine); + if (!compute_composite_element(method, seed, &dis[i], i, &pretoken->Tp, + &Z_affine)) { + goto err; + } + + // Unblind the token. + // pretoken->r is rinv. + EC_JACOBIAN N; + EC_AFFINE N_affine; + if (!ec_point_mul_scalar(group, &N, &Zs[i], &pretoken->r) || + !ec_jacobian_to_affine(group, &N_affine, &N)) { + goto err; + } + + // Serialize the token. Include |key_id| to avoid an extra copy in the layer + // above. + CBB token_cbb; + size_t point_len = ec_point_byte_len(group, POINT_CONVERSION_UNCOMPRESSED); + if (!CBB_init(&token_cbb, 4 + TRUST_TOKEN_NONCE_SIZE + (2 + point_len)) || + !CBB_add_u32(&token_cbb, key_id) || + !CBB_add_bytes(&token_cbb, pretoken->salt, TRUST_TOKEN_NONCE_SIZE) || + !cbb_add_point(&token_cbb, group, &N_affine) || + !CBB_flush(&token_cbb)) { + CBB_cleanup(&token_cbb); + goto err; + } + + TRUST_TOKEN *token = + TRUST_TOKEN_new(CBB_data(&token_cbb), CBB_len(&token_cbb)); + CBB_cleanup(&token_cbb); + if (token == nullptr || !sk_TRUST_TOKEN_push(ret, token)) { + TRUST_TOKEN_free(token); + goto err; + } + } + + EC_JACOBIAN M, Z; + if (!ec_point_mul_scalar_public_batch(group, &M, + /*g_scalar=*/nullptr, BTs, dis, + count) || + !ec_point_mul_scalar_public_batch(group, &Z, + /*g_scalar=*/nullptr, Zs, dis, count)) { + goto err; + } + + CBS proof; + if (!CBS_get_u16_length_prefixed(cbs, &proof) || + !verify_proof(method, &proof, key, &M, &Z) || CBS_len(&proof) != 0) { + goto err; + } + + ok = 1; + +err: + Delete(BTs); + Delete(Zs); + Delete(dis); + if (!ok) 
{ + sk_TRUST_TOKEN_pop_free(ret, TRUST_TOKEN_free); + ret = nullptr; + } + return ret; +} + +static int voprf_read(const VOPRF_METHOD *method, + const TRUST_TOKEN_ISSUER_KEY *key, + uint8_t out_nonce[TRUST_TOKEN_NONCE_SIZE], + const uint8_t *token, size_t token_len, + int include_message, const uint8_t *msg, size_t msg_len) { + const EC_GROUP *group = method->group_func(); + CBS cbs, salt; + CBS_init(&cbs, token, token_len); + EC_AFFINE Ws; + if (!CBS_get_bytes(&cbs, &salt, TRUST_TOKEN_NONCE_SIZE) || + !cbs_get_point(&cbs, group, &Ws) || CBS_len(&cbs) != 0) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_INVALID_TOKEN); + return 0; + } + + if (include_message) { + SHA512_CTX hash_ctx; + assert(SHA512_DIGEST_LENGTH == TRUST_TOKEN_NONCE_SIZE); + SHA512_Init(&hash_ctx); + SHA512_Update(&hash_ctx, CBS_data(&salt), CBS_len(&salt)); + SHA512_Update(&hash_ctx, msg, msg_len); + SHA512_Final(out_nonce, &hash_ctx); + } else { + OPENSSL_memcpy(out_nonce, CBS_data(&salt), CBS_len(&salt)); + } + + + EC_JACOBIAN T; + if (!method->hash_to_group(group, &T, out_nonce)) { + return 0; + } + + EC_JACOBIAN Ws_calculated; + if (!ec_point_mul_scalar(group, &Ws_calculated, &T, &key->xs) || + !ec_affine_jacobian_equal(group, &Ws, &Ws_calculated)) { + OPENSSL_PUT_ERROR(TRUST_TOKEN, TRUST_TOKEN_R_BAD_VALIDITY_CHECK); + return 0; + } + + return 1; +} + + +// VOPRF experiment v2. 
+ +static int voprf_exp2_hash_to_group(const EC_GROUP *group, EC_JACOBIAN *out, + const uint8_t t[TRUST_TOKEN_NONCE_SIZE]) { + const uint8_t kHashTLabel[] = "TrustToken VOPRF Experiment V2 HashToGroup"; + return ec_hash_to_curve_p384_xmd_sha512_sswu_draft07( + group, out, kHashTLabel, sizeof(kHashTLabel), t, TRUST_TOKEN_NONCE_SIZE); +} + +static int voprf_exp2_hash_to_scalar(const EC_GROUP *group, EC_SCALAR *out, + uint8_t *buf, size_t len) { + const uint8_t kHashCLabel[] = "TrustToken VOPRF Experiment V2 HashToScalar"; + return ec_hash_to_scalar_p384_xmd_sha512_draft07( + group, out, kHashCLabel, sizeof(kHashCLabel), buf, len); +} + +static VOPRF_METHOD voprf_exp2_method = { + EC_group_p384, voprf_exp2_hash_to_group, voprf_exp2_hash_to_scalar}; + +int bssl::voprf_exp2_generate_key(CBB *out_private, CBB *out_public) { + return voprf_generate_key(&voprf_exp2_method, out_private, out_public); +} + +int bssl::voprf_exp2_derive_key_from_secret(CBB *out_private, CBB *out_public, + const uint8_t *secret, + size_t secret_len) { + return voprf_derive_key_from_secret(&voprf_exp2_method, out_private, + out_public, secret, secret_len); +} + +int bssl::voprf_exp2_client_key_from_bytes(TRUST_TOKEN_CLIENT_KEY *key, + const uint8_t *in, size_t len) { + return voprf_client_key_from_bytes(&voprf_exp2_method, key, in, len); +} + +int bssl::voprf_exp2_issuer_key_from_bytes(TRUST_TOKEN_ISSUER_KEY *key, + const uint8_t *in, size_t len) { + return voprf_issuer_key_from_bytes(&voprf_exp2_method, key, in, len); +} + +STACK_OF(TRUST_TOKEN_PRETOKEN) *bssl::voprf_exp2_blind(CBB *cbb, size_t count, + int include_message, + const uint8_t *msg, + size_t msg_len) { + return voprf_blind(&voprf_exp2_method, cbb, count, include_message, msg, + msg_len); +} + +int bssl::voprf_exp2_sign(const TRUST_TOKEN_ISSUER_KEY *key, CBB *cbb, CBS *cbs, + size_t num_requested, size_t num_to_issue, + uint8_t private_metadata) { + if (private_metadata != 0) { + return 0; + } + return 
voprf_sign_tt(&voprf_exp2_method, key, cbb, cbs, num_requested, + num_to_issue); +} + +STACK_OF(TRUST_TOKEN) *bssl::voprf_exp2_unblind( + const TRUST_TOKEN_CLIENT_KEY *key, + const STACK_OF(TRUST_TOKEN_PRETOKEN) *pretokens, CBS *cbs, size_t count, + uint32_t key_id) { + return voprf_unblind_tt(&voprf_exp2_method, key, pretokens, cbs, count, + key_id); +} + +int bssl::voprf_exp2_read(const TRUST_TOKEN_ISSUER_KEY *key, + uint8_t out_nonce[TRUST_TOKEN_NONCE_SIZE], + uint8_t *out_private_metadata, const uint8_t *token, + size_t token_len, int include_message, + const uint8_t *msg, size_t msg_len) { + return voprf_read(&voprf_exp2_method, key, out_nonce, token, token_len, + include_message, msg, msg_len); +} + +// VOPRF PST v1. + +static int voprf_pst1_hash_to_group(const EC_GROUP *group, EC_JACOBIAN *out, + const uint8_t t[TRUST_TOKEN_NONCE_SIZE]) { + const uint8_t kHashTLabel[] = "HashToGroup-OPRFV1-\x01-P384-SHA384"; + return ec_hash_to_curve_p384_xmd_sha384_sswu(group, out, kHashTLabel, + sizeof(kHashTLabel) - 1, t, + TRUST_TOKEN_NONCE_SIZE); +} + +static int voprf_pst1_hash_to_scalar(const EC_GROUP *group, EC_SCALAR *out, + uint8_t *buf, size_t len) { + const uint8_t kHashCLabel[] = "HashToScalar-OPRFV1-\x01-P384-SHA384"; + return ec_hash_to_scalar_p384_xmd_sha384(group, out, kHashCLabel, + sizeof(kHashCLabel) - 1, buf, len); +} + +static VOPRF_METHOD voprf_pst1_method = { + EC_group_p384, voprf_pst1_hash_to_group, voprf_pst1_hash_to_scalar}; + +int bssl::voprf_pst1_generate_key(CBB *out_private, CBB *out_public) { + return voprf_generate_key(&voprf_pst1_method, out_private, out_public); +} + +int bssl::voprf_pst1_derive_key_from_secret(CBB *out_private, CBB *out_public, + const uint8_t *secret, + size_t secret_len) { + return voprf_derive_key_from_secret(&voprf_pst1_method, out_private, + out_public, secret, secret_len); +} + +int bssl::voprf_pst1_client_key_from_bytes(TRUST_TOKEN_CLIENT_KEY *key, + const uint8_t *in, size_t len) { + return 
voprf_client_key_from_bytes(&voprf_pst1_method, key, in, len); +} + +int bssl::voprf_pst1_issuer_key_from_bytes(TRUST_TOKEN_ISSUER_KEY *key, + const uint8_t *in, size_t len) { + return voprf_issuer_key_from_bytes(&voprf_pst1_method, key, in, len); +} + +STACK_OF(TRUST_TOKEN_PRETOKEN) *bssl::voprf_pst1_blind(CBB *cbb, size_t count, + int include_message, + const uint8_t *msg, + size_t msg_len) { + return voprf_blind(&voprf_pst1_method, cbb, count, include_message, msg, + msg_len); +} + +int bssl::voprf_pst1_sign(const TRUST_TOKEN_ISSUER_KEY *key, CBB *cbb, CBS *cbs, + size_t num_requested, size_t num_to_issue, + uint8_t private_metadata) { + if (private_metadata != 0) { + return 0; + } + return voprf_sign(&voprf_pst1_method, key, cbb, cbs, num_requested, + num_to_issue); +} + + +int bssl::voprf_pst1_sign_with_proof_scalar_for_testing( + const TRUST_TOKEN_ISSUER_KEY *key, CBB *cbb, CBS *cbs, size_t num_requested, + size_t num_to_issue, uint8_t private_metadata, + const uint8_t *proof_scalar_buf, size_t proof_scalar_len) { + if (private_metadata != 0) { + return 0; + } + return voprf_sign_with_proof_scalar_for_testing( + &voprf_pst1_method, key, cbb, cbs, num_requested, num_to_issue, + proof_scalar_buf, proof_scalar_len); +} + +STACK_OF(TRUST_TOKEN) *bssl::voprf_pst1_unblind( + const TRUST_TOKEN_CLIENT_KEY *key, + const STACK_OF(TRUST_TOKEN_PRETOKEN) *pretokens, CBS *cbs, size_t count, + uint32_t key_id) { + return voprf_unblind(&voprf_pst1_method, key, pretokens, cbs, count, key_id); +} + +int bssl::voprf_pst1_read(const TRUST_TOKEN_ISSUER_KEY *key, + uint8_t out_nonce[TRUST_TOKEN_NONCE_SIZE], + uint8_t *out_private_metadata, const uint8_t *token, + size_t token_len, int include_message, + const uint8_t *msg, size_t msg_len) { + return voprf_read(&voprf_pst1_method, key, out_nonce, token, token_len, + include_message, msg, msg_len); +} diff --git a/third_party/boringssl/src/crypto/x509/a_digest.c b/third_party/boringssl/src/crypto/x509/a_digest.c deleted file mode 
100644 index d7dcecc7..00000000 --- a/third_party/boringssl/src/crypto/x509/a_digest.c +++ /dev/null @@ -1,95 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. 
All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
*/ - -#include - -#include -#include -#include -#include - -int ASN1_digest(i2d_of_void *i2d, const EVP_MD *type, char *data, - unsigned char *md, unsigned int *len) { - int i, ret; - unsigned char *str, *p; - - i = i2d(data, NULL); - if ((str = (unsigned char *)OPENSSL_malloc(i)) == NULL) { - OPENSSL_PUT_ERROR(X509, ERR_R_MALLOC_FAILURE); - return 0; - } - p = str; - i2d(data, &p); - - ret = EVP_Digest(str, i, md, len, type, NULL); - OPENSSL_free(str); - return ret; -} - -int ASN1_item_digest(const ASN1_ITEM *it, const EVP_MD *type, void *asn, - unsigned char *md, unsigned int *len) { - int i, ret; - unsigned char *str = NULL; - - i = ASN1_item_i2d(asn, &str, it); - if (!str) { - return 0; - } - - ret = EVP_Digest(str, i, md, len, type, NULL); - OPENSSL_free(str); - return ret; -} diff --git a/third_party/boringssl/src/crypto/x509/a_digest.cc b/third_party/boringssl/src/crypto/x509/a_digest.cc new file mode 100644 index 00000000..9f5dde96 --- /dev/null +++ b/third_party/boringssl/src/crypto/x509/a_digest.cc @@ -0,0 +1,52 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include +#include +#include + +int ASN1_digest(i2d_of_void *i2d, const EVP_MD *type, char *data, + unsigned char *md, unsigned int *len) { + int i, ret; + unsigned char *str, *p; + + i = i2d(data, nullptr); + if ((str = (unsigned char *)OPENSSL_malloc(i)) == nullptr) { + return 0; + } + p = str; + i2d(data, &p); + + ret = EVP_Digest(str, i, md, len, type, nullptr); + OPENSSL_free(str); + return ret; +} + +int ASN1_item_digest(const ASN1_ITEM *it, const EVP_MD *type, void *asn, + unsigned char *md, unsigned int *len) { + int i, ret; + unsigned char *str = nullptr; + + i = ASN1_item_i2d(reinterpret_cast(asn), &str, it); + if (!str) { + return 0; + } + + ret = EVP_Digest(str, i, md, len, type, nullptr); + OPENSSL_free(str); + return ret; +} diff --git a/third_party/boringssl/src/crypto/x509/a_sign.c b/third_party/boringssl/src/crypto/x509/a_sign.c deleted file mode 100644 index ed9e79bb..00000000 --- a/third_party/boringssl/src/crypto/x509/a_sign.c +++ /dev/null @@ -1,119 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. 
- * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include -#include -#include -#include -#include -#include -#include - -#include "internal.h" - -int ASN1_item_sign(const ASN1_ITEM *it, X509_ALGOR *algor1, X509_ALGOR *algor2, - ASN1_BIT_STRING *signature, void *asn, EVP_PKEY *pkey, - const EVP_MD *type) { - EVP_MD_CTX ctx; - EVP_MD_CTX_init(&ctx); - if (!EVP_DigestSignInit(&ctx, NULL, type, NULL, pkey)) { - EVP_MD_CTX_cleanup(&ctx); - return 0; - } - return ASN1_item_sign_ctx(it, algor1, algor2, signature, asn, &ctx); -} - -int ASN1_item_sign_ctx(const ASN1_ITEM *it, X509_ALGOR *algor1, - X509_ALGOR *algor2, ASN1_BIT_STRING *signature, - void *asn, EVP_MD_CTX *ctx) { - EVP_PKEY *pkey; - unsigned char *buf_in = NULL, *buf_out = NULL; - size_t inl = 0, outl = 0; - - pkey = EVP_PKEY_CTX_get0_pkey(ctx->pctx); - - // Write out the requested copies of the AlgorithmIdentifier. 
- if (algor1 && !x509_digest_sign_algorithm(ctx, algor1)) { - goto err; - } - if (algor2 && !x509_digest_sign_algorithm(ctx, algor2)) { - goto err; - } - - inl = ASN1_item_i2d(asn, &buf_in, it); - outl = EVP_PKEY_size(pkey); - buf_out = OPENSSL_malloc((unsigned int)outl); - if ((buf_in == NULL) || (buf_out == NULL)) { - outl = 0; - OPENSSL_PUT_ERROR(X509, ERR_R_MALLOC_FAILURE); - goto err; - } - - if (!EVP_DigestSign(ctx, buf_out, &outl, buf_in, inl)) { - outl = 0; - OPENSSL_PUT_ERROR(X509, ERR_R_EVP_LIB); - goto err; - } - ASN1_STRING_set0(signature, buf_out, outl); - buf_out = NULL; - signature->flags &= ~(ASN1_STRING_FLAG_BITS_LEFT | 0x07); - signature->flags |= ASN1_STRING_FLAG_BITS_LEFT; -err: - EVP_MD_CTX_cleanup(ctx); - OPENSSL_free(buf_in); - OPENSSL_free(buf_out); - return outl; -} diff --git a/third_party/boringssl/src/crypto/x509/a_sign.cc b/third_party/boringssl/src/crypto/x509/a_sign.cc new file mode 100644 index 00000000..a430bff3 --- /dev/null +++ b/third_party/boringssl/src/crypto/x509/a_sign.cc @@ -0,0 +1,100 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include "../internal.h" +#include "../mem_internal.h" +#include "internal.h" + + +using namespace bssl; + +int ASN1_item_sign(const ASN1_ITEM *it, X509_ALGOR *algor1, X509_ALGOR *algor2, + ASN1_BIT_STRING *signature, void *asn, EVP_PKEY *pkey, + const EVP_MD *type) { + if (signature->type != V_ASN1_BIT_STRING) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_WRONG_TYPE); + return 0; + } + ScopedEVP_MD_CTX ctx; + if (!EVP_DigestSignInit(ctx.get(), nullptr, type, nullptr, pkey)) { + return 0; + } + return ASN1_item_sign_ctx(it, algor1, algor2, signature, asn, ctx.get()); +} + +int ASN1_item_sign_ctx(const ASN1_ITEM *it, X509_ALGOR *algor1, + X509_ALGOR *algor2, ASN1_BIT_STRING *signature, + void *asn, EVP_MD_CTX *ctx) { + // Historically, this function called |EVP_MD_CTX_cleanup| on return. Some + // callers rely on this to avoid memory leaks. + Cleanup cleanup = [&] { EVP_MD_CTX_cleanup(ctx); }; + + // Write out the requested copies of the AlgorithmIdentifier. This may modify + // |asn|, so we must do it first. + if ((algor1 != nullptr && !x509_digest_sign_algorithm(ctx, algor1)) || + (algor2 != nullptr && !x509_digest_sign_algorithm(ctx, algor2))) { + return 0; + } + + uint8_t *in = nullptr; + int in_len = ASN1_item_i2d(reinterpret_cast(asn), &in, it); + if (in_len < 0) { + return 0; + } + UniquePtr free_in(in); + + return x509_sign_to_bit_string(ctx, signature, Span(in, in_len)); +} + +int bssl::x509_sign_to_bit_string(EVP_MD_CTX *ctx, ASN1_BIT_STRING *out, + Span in) { + if (out->type != V_ASN1_BIT_STRING) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_WRONG_TYPE); + return 0; + } + + EVP_PKEY *pkey = EVP_PKEY_CTX_get0_pkey(ctx->pctx); + size_t sig_len = EVP_PKEY_size(pkey); + if (sig_len > INT_MAX) { + // Ensure the signature will fit in |out|. 
+ OPENSSL_PUT_ERROR(X509, ERR_R_OVERFLOW); + return 0; + } + Array sig; + if (!sig.Init(sig_len)) { + return 0; + } + + if (!EVP_DigestSign(ctx, sig.data(), &sig_len, in.data(), in.size())) { + OPENSSL_PUT_ERROR(X509, ERR_R_EVP_LIB); + return 0; + } + sig.Shrink(sig_len); + + uint8_t *sig_data; + sig.Release(&sig_data, &sig_len); + ASN1_STRING_set0(out, sig_data, static_cast(sig_len)); + return static_cast(sig_len); +} diff --git a/third_party/boringssl/src/crypto/x509/a_verify.c b/third_party/boringssl/src/crypto/x509/a_verify.c deleted file mode 100644 index af2c9145..00000000 --- a/third_party/boringssl/src/crypto/x509/a_verify.c +++ /dev/null @@ -1,117 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. 
Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
*/ - -#include - -#include -#include - -#include -#include -#include -#include -#include -#include - -#include "internal.h" - -int ASN1_item_verify(const ASN1_ITEM *it, const X509_ALGOR *a, - const ASN1_BIT_STRING *signature, void *asn, - EVP_PKEY *pkey) { - if (!pkey) { - OPENSSL_PUT_ERROR(X509, ERR_R_PASSED_NULL_PARAMETER); - return 0; - } - - size_t sig_len; - if (signature->type == V_ASN1_BIT_STRING) { - if (!ASN1_BIT_STRING_num_bytes(signature, &sig_len)) { - OPENSSL_PUT_ERROR(X509, X509_R_INVALID_BIT_STRING_BITS_LEFT); - return 0; - } - } else { - sig_len = (size_t)ASN1_STRING_length(signature); - } - - EVP_MD_CTX ctx; - uint8_t *buf_in = NULL; - int ret = 0, inl = 0; - EVP_MD_CTX_init(&ctx); - - if (!x509_digest_verify_init(&ctx, a, pkey)) { - goto err; - } - - inl = ASN1_item_i2d(asn, &buf_in, it); - - if (buf_in == NULL) { - OPENSSL_PUT_ERROR(X509, ERR_R_MALLOC_FAILURE); - goto err; - } - - if (!EVP_DigestVerify(&ctx, ASN1_STRING_get0_data(signature), sig_len, buf_in, - inl)) { - OPENSSL_PUT_ERROR(X509, ERR_R_EVP_LIB); - goto err; - } - - ret = 1; - -err: - OPENSSL_free(buf_in); - EVP_MD_CTX_cleanup(&ctx); - return ret; -} diff --git a/third_party/boringssl/src/crypto/x509/a_verify.cc b/third_party/boringssl/src/crypto/x509/a_verify.cc new file mode 100644 index 00000000..052c5213 --- /dev/null +++ b/third_party/boringssl/src/crypto/x509/a_verify.cc @@ -0,0 +1,70 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + + +using namespace bssl; + +int bssl::x509_verify_signature(const X509_ALGOR *sigalg, + const ASN1_BIT_STRING *signature, + Span in, EVP_PKEY *pkey) { + if (!pkey) { + OPENSSL_PUT_ERROR(X509, ERR_R_PASSED_NULL_PARAMETER); + return 0; + } + + if (signature->type == V_ASN1_BIT_STRING && + ASN1_BIT_STRING_unused_bits(signature) != 0) { + OPENSSL_PUT_ERROR(X509, X509_R_INVALID_BIT_STRING_BITS_LEFT); + return 0; + } + + ScopedEVP_MD_CTX ctx; + if (!x509_digest_verify_init(ctx.get(), sigalg, pkey)) { + return 0; + } + if (!EVP_DigestVerify(ctx.get(), ASN1_STRING_get0_data(signature), + ASN1_STRING_length(signature), in.data(), in.size())) { + OPENSSL_PUT_ERROR(X509, ERR_R_EVP_LIB); + return 0; + } + return 1; +} + +int ASN1_item_verify(const ASN1_ITEM *it, const X509_ALGOR *sigalg, + const ASN1_BIT_STRING *signature, void *asn, + EVP_PKEY *pkey) { + uint8_t *in = nullptr; + int in_len = ASN1_item_i2d(reinterpret_cast(asn), &in, it); + if (in_len < 0) { + return 0; + } + UniquePtr free_in(in); + return x509_verify_signature(sigalg, signature, Span(in, in_len), pkey); +} diff --git a/third_party/boringssl/src/crypto/x509/algorithm.c b/third_party/boringssl/src/crypto/x509/algorithm.c deleted file mode 100644 index 16235eee..00000000 --- a/third_party/boringssl/src/crypto/x509/algorithm.c +++ /dev/null @@ -1,179 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. 
The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. 
If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include -#include -#include -#include -#include - -#include "internal.h" - -// Restrict the digests that are allowed in X509 certificates -static int x509_digest_nid_ok(const int digest_nid) { - switch (digest_nid) { - case NID_md4: - case NID_md5: - return 0; - } - return 1; -} - -int x509_digest_sign_algorithm(EVP_MD_CTX *ctx, X509_ALGOR *algor) { - EVP_PKEY *pkey = EVP_PKEY_CTX_get0_pkey(ctx->pctx); - if (pkey == NULL) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_CONTEXT_NOT_INITIALISED); - return 0; - } - - if (EVP_PKEY_id(pkey) == EVP_PKEY_RSA) { - int pad_mode; - if (!EVP_PKEY_CTX_get_rsa_padding(ctx->pctx, &pad_mode)) { - return 0; - } - // RSA-PSS has special signature algorithm logic. 
- if (pad_mode == RSA_PKCS1_PSS_PADDING) { - return x509_rsa_ctx_to_pss(ctx, algor); - } - } - - if (EVP_PKEY_id(pkey) == EVP_PKEY_ED25519) { - return X509_ALGOR_set0(algor, OBJ_nid2obj(NID_ED25519), V_ASN1_UNDEF, NULL); - } - - // Default behavior: look up the OID for the algorithm/hash pair and encode - // that. - const EVP_MD *digest = EVP_MD_CTX_md(ctx); - if (digest == NULL) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_CONTEXT_NOT_INITIALISED); - return 0; - } - - const int digest_nid = EVP_MD_type(digest); - int sign_nid; - if (!x509_digest_nid_ok(digest_nid) || - !OBJ_find_sigid_by_algs(&sign_nid, digest_nid, EVP_PKEY_id(pkey))) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_DIGEST_AND_KEY_TYPE_NOT_SUPPORTED); - return 0; - } - - // RSA signature algorithms include an explicit NULL parameter. Others omit - // it. - int paramtype = - (EVP_PKEY_id(pkey) == EVP_PKEY_RSA) ? V_ASN1_NULL : V_ASN1_UNDEF; - X509_ALGOR_set0(algor, OBJ_nid2obj(sign_nid), paramtype, NULL); - return 1; -} - -int x509_digest_verify_init(EVP_MD_CTX *ctx, const X509_ALGOR *sigalg, - EVP_PKEY *pkey) { - // Convert the signature OID into digest and public key OIDs. - int sigalg_nid = OBJ_obj2nid(sigalg->algorithm); - int digest_nid, pkey_nid; - if (!OBJ_find_sigid_algs(sigalg_nid, &digest_nid, &pkey_nid)) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_UNKNOWN_SIGNATURE_ALGORITHM); - return 0; - } - - // Check the public key OID matches the public key type. - if (pkey_nid != EVP_PKEY_id(pkey)) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_WRONG_PUBLIC_KEY_TYPE); - return 0; - } - - // Check for permitted digest algorithms - if (!x509_digest_nid_ok(digest_nid)) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_DIGEST_AND_KEY_TYPE_NOT_SUPPORTED); - return 0; - } - - // NID_undef signals that there are custom parameters to set. 
- if (digest_nid == NID_undef) { - if (sigalg_nid == NID_rsassaPss) { - return x509_rsa_pss_to_ctx(ctx, sigalg, pkey); - } - if (sigalg_nid == NID_ED25519) { - if (sigalg->parameter != NULL) { - OPENSSL_PUT_ERROR(X509, X509_R_INVALID_PARAMETER); - return 0; - } - return EVP_DigestVerifyInit(ctx, NULL, NULL, NULL, pkey); - } - OPENSSL_PUT_ERROR(ASN1, ASN1_R_UNKNOWN_SIGNATURE_ALGORITHM); - return 0; - } - - // The parameter should be an explicit NULL for RSA and omitted for ECDSA. For - // compatibility, we allow either for both algorithms. See b/167375496. - // - // TODO(davidben): Chromium's verifier allows both forms for RSA, but enforces - // ECDSA more strictly. Align with Chromium and add a flag for b/167375496. - if (sigalg->parameter != NULL && sigalg->parameter->type != V_ASN1_NULL) { - OPENSSL_PUT_ERROR(X509, X509_R_INVALID_PARAMETER); - return 0; - } - - // Otherwise, initialize with the digest from the OID. - const EVP_MD *digest = EVP_get_digestbynid(digest_nid); - if (digest == NULL) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_UNKNOWN_MESSAGE_DIGEST_ALGORITHM); - return 0; - } - - return EVP_DigestVerifyInit(ctx, NULL, digest, NULL, pkey); -} diff --git a/third_party/boringssl/src/crypto/x509/algorithm.cc b/third_party/boringssl/src/crypto/x509/algorithm.cc new file mode 100644 index 00000000..4330fb4d --- /dev/null +++ b/third_party/boringssl/src/crypto/x509/algorithm.cc @@ -0,0 +1,167 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include +#include +#include + +#include "internal.h" + + +using namespace bssl; + +// TODO(crbug.com/42290422): Rewrite this logic to recognize signature +// algorithms without pulling in the OID table. We can enumerate every supported +// signature algorithm into a small enum and convert them to/from |EVP_PKEY_CTX| +// and |X509_ALGOR|. + +// Restrict the digests that are allowed in X509 certificates +static int x509_digest_nid_ok(const int digest_nid) { + switch (digest_nid) { + case NID_md4: + case NID_md5: + return 0; + } + return 1; +} + +int bssl::x509_digest_sign_algorithm(EVP_MD_CTX *ctx, X509_ALGOR *algor) { + EVP_PKEY *pkey = EVP_PKEY_CTX_get0_pkey(ctx->pctx); + if (pkey == nullptr) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_CONTEXT_NOT_INITIALISED); + return 0; + } + + if (EVP_PKEY_id(pkey) == EVP_PKEY_RSA || + EVP_PKEY_id(pkey) == EVP_PKEY_RSA_PSS) { + int pad_mode; + if (!EVP_PKEY_CTX_get_rsa_padding(ctx->pctx, &pad_mode)) { + return 0; + } + // RSA-PSS has special signature algorithm logic. + if (pad_mode == RSA_PKCS1_PSS_PADDING) { + return x509_rsa_ctx_to_pss(ctx, algor); + } + } + + // Check for signing algorithms with an internal hash. + ASN1_OBJECT *algo_obj = nullptr; + switch (EVP_PKEY_id(pkey)) { + case EVP_PKEY_ED25519: + algo_obj = OBJ_nid2obj(NID_ED25519); + break; + case EVP_PKEY_ML_DSA_44: + algo_obj = OBJ_nid2obj(NID_ML_DSA_44); + break; + case EVP_PKEY_ML_DSA_65: + algo_obj = OBJ_nid2obj(NID_ML_DSA_65); + break; + case EVP_PKEY_ML_DSA_87: + algo_obj = OBJ_nid2obj(NID_ML_DSA_87); + break; + } + if (algo_obj != nullptr) { + return X509_ALGOR_set0(algor, algo_obj, V_ASN1_UNDEF, nullptr); + } + + // Default behavior: look up the OID for the algorithm/hash pair and encode + // that. 
+ const EVP_MD *digest = EVP_MD_CTX_get0_md(ctx); + if (digest == nullptr) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_CONTEXT_NOT_INITIALISED); + return 0; + } + + const int digest_nid = EVP_MD_type(digest); + int sign_nid; + if (!x509_digest_nid_ok(digest_nid) || + !OBJ_find_sigid_by_algs(&sign_nid, digest_nid, EVP_PKEY_id(pkey))) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_DIGEST_AND_KEY_TYPE_NOT_SUPPORTED); + return 0; + } + + // RSA signature algorithms include an explicit NULL parameter. Others omit + // it. + int paramtype = + (EVP_PKEY_id(pkey) == EVP_PKEY_RSA) ? V_ASN1_NULL : V_ASN1_UNDEF; + return X509_ALGOR_set0(algor, OBJ_nid2obj(sign_nid), paramtype, nullptr); +} + +int bssl::x509_digest_verify_init(EVP_MD_CTX *ctx, const X509_ALGOR *sigalg, + EVP_PKEY *pkey) { + // Convert the signature OID into digest and public key OIDs. + int sigalg_nid = OBJ_obj2nid(sigalg->algorithm); + int digest_nid, pkey_nid; + if (!OBJ_find_sigid_algs(sigalg_nid, &digest_nid, &pkey_nid)) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_UNKNOWN_SIGNATURE_ALGORITHM); + return 0; + } + + // Check the public key OID matches the public key type. + const bool pkey_matches = + pkey_nid == EVP_PKEY_id(pkey) || + (sigalg_nid == NID_rsassaPss && EVP_PKEY_id(pkey) == EVP_PKEY_RSA_PSS); + if (!pkey_matches) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_WRONG_PUBLIC_KEY_TYPE); + return 0; + } + + // Check for permitted digest algorithms + if (!x509_digest_nid_ok(digest_nid)) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_DIGEST_AND_KEY_TYPE_NOT_SUPPORTED); + return 0; + } + + // NID_undef signals that there are custom parameters to set. + if (digest_nid == NID_undef) { + if (sigalg_nid == NID_rsassaPss) { + return x509_rsa_pss_to_ctx(ctx, sigalg, pkey); + } + if (sigalg_nid == NID_ED25519 || sigalg_nid == NID_ML_DSA_44 || + sigalg_nid == NID_ML_DSA_65 || sigalg_nid == NID_ML_DSA_87) { + // These algorithms require that parameters be absent (Ed25519: RFC 8410 + // section 3, ML-DSA: RFC 9881 section 2). 
+ if (sigalg->parameter != nullptr) { + OPENSSL_PUT_ERROR(X509, X509_R_INVALID_PARAMETER); + return 0; + } + return EVP_DigestVerifyInit(ctx, nullptr, nullptr, nullptr, pkey); + } + OPENSSL_PUT_ERROR(ASN1, ASN1_R_UNKNOWN_SIGNATURE_ALGORITHM); + return 0; + } + + // The parameter should be an explicit NULL for RSA and omitted for ECDSA. For + // compatibility, we allow either for both algorithms. See b/167375496. + // + // TODO(davidben): Chromium's verifier allows both forms for RSA, but enforces + // ECDSA more strictly. Align with Chromium and add a flag for b/167375496. + if (sigalg->parameter != nullptr && sigalg->parameter->type != V_ASN1_NULL) { + OPENSSL_PUT_ERROR(X509, X509_R_INVALID_PARAMETER); + return 0; + } + + // Otherwise, initialize with the digest from the OID. + const EVP_MD *digest = EVP_get_digestbynid(digest_nid); + if (digest == nullptr) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_UNKNOWN_MESSAGE_DIGEST_ALGORITHM); + return 0; + } + + return EVP_DigestVerifyInit(ctx, nullptr, digest, nullptr, pkey); +} diff --git a/third_party/boringssl/src/crypto/x509/asn1_gen.c b/third_party/boringssl/src/crypto/x509/asn1_gen.c deleted file mode 100644 index eb319700..00000000 --- a/third_party/boringssl/src/crypto/x509/asn1_gen.c +++ /dev/null @@ -1,818 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). 
- * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include - -#include -#include -#include -#include -#include - -#include "../internal.h" -#include "../x509v3/internal.h" -#include "internal.h" - -// Although this file is in crypto/x509 for layering purposes, it emits -// errors from the ASN.1 module for OpenSSL compatibility. 
- -#define ASN1_GEN_FLAG 0x10000 -#define ASN1_GEN_FLAG_IMP (ASN1_GEN_FLAG | 1) -#define ASN1_GEN_FLAG_EXP (ASN1_GEN_FLAG | 2) -#define ASN1_GEN_FLAG_TAG (ASN1_GEN_FLAG | 3) -#define ASN1_GEN_FLAG_BITWRAP (ASN1_GEN_FLAG | 4) -#define ASN1_GEN_FLAG_OCTWRAP (ASN1_GEN_FLAG | 5) -#define ASN1_GEN_FLAG_SEQWRAP (ASN1_GEN_FLAG | 6) -#define ASN1_GEN_FLAG_SETWRAP (ASN1_GEN_FLAG | 7) -#define ASN1_GEN_FLAG_FORMAT (ASN1_GEN_FLAG | 8) - -#define ASN1_GEN_STR(str, val) \ - { str, sizeof(str) - 1, val } - -#define ASN1_FLAG_EXP_MAX 20 -// Maximum number of nested sequences -#define ASN1_GEN_SEQ_MAX_DEPTH 50 - -// Input formats - -// ASCII: default -#define ASN1_GEN_FORMAT_ASCII 1 -// UTF8 -#define ASN1_GEN_FORMAT_UTF8 2 -// Hex -#define ASN1_GEN_FORMAT_HEX 3 -// List of bits -#define ASN1_GEN_FORMAT_BITLIST 4 - -struct tag_name_st { - const char *strnam; - int len; - int tag; -}; - -typedef struct { - int exp_tag; - int exp_class; - int exp_constructed; - int exp_pad; - long exp_len; -} tag_exp_type; - -typedef struct { - int imp_tag; - int imp_class; - int utype; - int format; - const char *str; - tag_exp_type exp_list[ASN1_FLAG_EXP_MAX]; - int exp_count; -} tag_exp_arg; - -static ASN1_TYPE *generate_v3(const char *str, X509V3_CTX *cnf, int depth, - int *perr); -static int bitstr_cb(const char *elem, int len, void *bitstr); -static int asn1_cb(const char *elem, int len, void *bitstr); -static int append_exp(tag_exp_arg *arg, int exp_tag, int exp_class, - int exp_constructed, int exp_pad, int imp_ok); -static int parse_tagging(const char *vstart, int vlen, int *ptag, int *pclass); -static ASN1_TYPE *asn1_multi(int utype, const char *section, X509V3_CTX *cnf, - int depth, int *perr); -static ASN1_TYPE *asn1_str2type(const char *str, int format, int utype); -static int asn1_str2tag(const char *tagstr, int len); - -ASN1_TYPE *ASN1_generate_v3(const char *str, X509V3_CTX *cnf) { - int err = 0; - ASN1_TYPE *ret = generate_v3(str, cnf, 0, &err); - if (err) { - OPENSSL_PUT_ERROR(ASN1, 
err); - } - return ret; -} - -static ASN1_TYPE *generate_v3(const char *str, X509V3_CTX *cnf, int depth, - int *perr) { - ASN1_TYPE *ret; - tag_exp_arg asn1_tags; - tag_exp_type *etmp; - - int i, len; - - unsigned char *orig_der = NULL, *new_der = NULL; - const unsigned char *cpy_start; - unsigned char *p; - const unsigned char *cp; - int cpy_len; - long hdr_len = 0; - int hdr_constructed = 0, hdr_tag, hdr_class; - int r; - - asn1_tags.imp_tag = -1; - asn1_tags.imp_class = -1; - asn1_tags.format = ASN1_GEN_FORMAT_ASCII; - asn1_tags.exp_count = 0; - if (CONF_parse_list(str, ',', 1, asn1_cb, &asn1_tags) != 0) { - *perr = ASN1_R_UNKNOWN_TAG; - return NULL; - } - - if ((asn1_tags.utype == V_ASN1_SEQUENCE) || (asn1_tags.utype == V_ASN1_SET)) { - if (!cnf) { - *perr = ASN1_R_SEQUENCE_OR_SET_NEEDS_CONFIG; - return NULL; - } - if (depth >= ASN1_GEN_SEQ_MAX_DEPTH) { - *perr = ASN1_R_ILLEGAL_NESTED_TAGGING; - return NULL; - } - ret = asn1_multi(asn1_tags.utype, asn1_tags.str, cnf, depth, perr); - } else { - ret = asn1_str2type(asn1_tags.str, asn1_tags.format, asn1_tags.utype); - } - - if (!ret) { - return NULL; - } - - // If no tagging return base type - if ((asn1_tags.imp_tag == -1) && (asn1_tags.exp_count == 0)) { - return ret; - } - - // Generate the encoding - cpy_len = i2d_ASN1_TYPE(ret, &orig_der); - ASN1_TYPE_free(ret); - ret = NULL; - // Set point to start copying for modified encoding - cpy_start = orig_der; - - // Do we need IMPLICIT tagging? - if (asn1_tags.imp_tag != -1) { - // If IMPLICIT we will replace the underlying tag - // Skip existing tag+len - r = ASN1_get_object(&cpy_start, &hdr_len, &hdr_tag, &hdr_class, cpy_len); - if (r & 0x80) { - goto err; - } - // Update copy length - cpy_len -= cpy_start - orig_der; - // For IMPLICIT tagging the length should match the original length - // and constructed flag should be consistent. 
- hdr_constructed = r & V_ASN1_CONSTRUCTED; - // Work out new length with IMPLICIT tag: ignore constructed because - // it will mess up if indefinite length - len = ASN1_object_size(0, hdr_len, asn1_tags.imp_tag); - } else { - len = cpy_len; - } - - // Work out length in any EXPLICIT, starting from end - - for (i = 0, etmp = asn1_tags.exp_list + asn1_tags.exp_count - 1; - i < asn1_tags.exp_count; i++, etmp--) { - // Content length: number of content octets + any padding - len += etmp->exp_pad; - etmp->exp_len = len; - // Total object length: length including new header - len = ASN1_object_size(0, len, etmp->exp_tag); - } - - // Allocate buffer for new encoding - - new_der = OPENSSL_malloc(len); - if (!new_der) { - goto err; - } - - // Generate tagged encoding - - p = new_der; - - // Output explicit tags first - - for (i = 0, etmp = asn1_tags.exp_list; i < asn1_tags.exp_count; i++, etmp++) { - ASN1_put_object(&p, etmp->exp_constructed, etmp->exp_len, etmp->exp_tag, - etmp->exp_class); - if (etmp->exp_pad) { - *p++ = 0; - } - } - - // If IMPLICIT, output tag - - if (asn1_tags.imp_tag != -1) { - if (asn1_tags.imp_class == V_ASN1_UNIVERSAL && - (asn1_tags.imp_tag == V_ASN1_SEQUENCE || - asn1_tags.imp_tag == V_ASN1_SET)) { - hdr_constructed = V_ASN1_CONSTRUCTED; - } - ASN1_put_object(&p, hdr_constructed, hdr_len, asn1_tags.imp_tag, - asn1_tags.imp_class); - } - - // Copy across original encoding - OPENSSL_memcpy(p, cpy_start, cpy_len); - - cp = new_der; - - // Obtain new ASN1_TYPE structure - ret = d2i_ASN1_TYPE(NULL, &cp, len); - -err: - OPENSSL_free(orig_der); - OPENSSL_free(new_der); - return ret; -} - -static int asn1_cb(const char *elem, int len, void *bitstr) { - tag_exp_arg *arg = bitstr; - int i; - int utype; - int vlen = 0; - const char *p, *vstart = NULL; - - int tmp_tag, tmp_class; - - if (elem == NULL) { - return -1; - } - - for (i = 0, p = elem; i < len; p++, i++) { - // Look for the ':' in name value pairs - if (*p == ':') { - vstart = p + 1; - vlen = len 
- (vstart - elem); - len = p - elem; - break; - } - } - - utype = asn1_str2tag(elem, len); - - if (utype == -1) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_UNKNOWN_TAG); - ERR_add_error_data(2, "tag=", elem); - return -1; - } - - // If this is not a modifier mark end of string and exit - if (!(utype & ASN1_GEN_FLAG)) { - arg->utype = utype; - arg->str = vstart; - // If no value and not end of string, error - if (!vstart && elem[len]) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_MISSING_VALUE); - return -1; - } - return 0; - } - - switch (utype) { - case ASN1_GEN_FLAG_IMP: - // Check for illegal multiple IMPLICIT tagging - if (arg->imp_tag != -1) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_ILLEGAL_NESTED_TAGGING); - return -1; - } - if (!parse_tagging(vstart, vlen, &arg->imp_tag, &arg->imp_class)) { - return -1; - } - break; - - case ASN1_GEN_FLAG_EXP: - - if (!parse_tagging(vstart, vlen, &tmp_tag, &tmp_class)) { - return -1; - } - if (!append_exp(arg, tmp_tag, tmp_class, 1, 0, 0)) { - return -1; - } - break; - - case ASN1_GEN_FLAG_SEQWRAP: - if (!append_exp(arg, V_ASN1_SEQUENCE, V_ASN1_UNIVERSAL, 1, 0, 1)) { - return -1; - } - break; - - case ASN1_GEN_FLAG_SETWRAP: - if (!append_exp(arg, V_ASN1_SET, V_ASN1_UNIVERSAL, 1, 0, 1)) { - return -1; - } - break; - - case ASN1_GEN_FLAG_BITWRAP: - if (!append_exp(arg, V_ASN1_BIT_STRING, V_ASN1_UNIVERSAL, 0, 1, 1)) { - return -1; - } - break; - - case ASN1_GEN_FLAG_OCTWRAP: - if (!append_exp(arg, V_ASN1_OCTET_STRING, V_ASN1_UNIVERSAL, 0, 0, 1)) { - return -1; - } - break; - - case ASN1_GEN_FLAG_FORMAT: - if (!vstart) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_UNKNOWN_FORMAT); - return -1; - } - if (!strncmp(vstart, "ASCII", 5)) { - arg->format = ASN1_GEN_FORMAT_ASCII; - } else if (!strncmp(vstart, "UTF8", 4)) { - arg->format = ASN1_GEN_FORMAT_UTF8; - } else if (!strncmp(vstart, "HEX", 3)) { - arg->format = ASN1_GEN_FORMAT_HEX; - } else if (!strncmp(vstart, "BITLIST", 7)) { - arg->format = ASN1_GEN_FORMAT_BITLIST; - } else { - OPENSSL_PUT_ERROR(ASN1, 
ASN1_R_UNKNOWN_FORMAT); - return -1; - } - break; - } - - return 1; -} - -static int parse_tagging(const char *vstart, int vlen, int *ptag, int *pclass) { - char erch[2]; - long tag_num; - char *eptr; - if (!vstart) { - return 0; - } - tag_num = strtoul(vstart, &eptr, 10); - // Check we haven't gone past max length: should be impossible - if (eptr && *eptr && (eptr > vstart + vlen)) { - return 0; - } - if (tag_num < 0) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_NUMBER); - return 0; - } - *ptag = tag_num; - // If we have non numeric characters, parse them - if (eptr) { - vlen -= eptr - vstart; - } else { - vlen = 0; - } - if (vlen) { - switch (*eptr) { - case 'U': - *pclass = V_ASN1_UNIVERSAL; - break; - - case 'A': - *pclass = V_ASN1_APPLICATION; - break; - - case 'P': - *pclass = V_ASN1_PRIVATE; - break; - - case 'C': - *pclass = V_ASN1_CONTEXT_SPECIFIC; - break; - - default: - erch[0] = *eptr; - erch[1] = 0; - OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_MODIFIER); - ERR_add_error_data(2, "Char=", erch); - return 0; - break; - } - } else { - *pclass = V_ASN1_CONTEXT_SPECIFIC; - } - - return 1; -} - -// Handle multiple types: SET and SEQUENCE - -static ASN1_TYPE *asn1_multi(int utype, const char *section, X509V3_CTX *cnf, - int depth, int *perr) { - ASN1_TYPE *ret = NULL; - STACK_OF(ASN1_TYPE) *sk = NULL; - STACK_OF(CONF_VALUE) *sect = NULL; - unsigned char *der = NULL; - int derlen; - size_t i; - sk = sk_ASN1_TYPE_new_null(); - if (!sk) { - goto bad; - } - if (section) { - if (!cnf) { - goto bad; - } - sect = X509V3_get_section(cnf, (char *)section); - if (!sect) { - goto bad; - } - for (i = 0; i < sk_CONF_VALUE_num(sect); i++) { - ASN1_TYPE *typ = generate_v3(sk_CONF_VALUE_value(sect, i)->value, cnf, - depth + 1, perr); - if (!typ) { - goto bad; - } - if (!sk_ASN1_TYPE_push(sk, typ)) { - goto bad; - } - } - } - - // Now we has a STACK of the components, convert to the correct form - - if (utype == V_ASN1_SET) { - derlen = i2d_ASN1_SET_ANY(sk, &der); - } else { - 
derlen = i2d_ASN1_SEQUENCE_ANY(sk, &der); - } - - if (derlen < 0) { - goto bad; - } - - if (!(ret = ASN1_TYPE_new())) { - goto bad; - } - - if (!(ret->value.asn1_string = ASN1_STRING_type_new(utype))) { - goto bad; - } - - ret->type = utype; - - ret->value.asn1_string->data = der; - ret->value.asn1_string->length = derlen; - - der = NULL; - -bad: - OPENSSL_free(der); - sk_ASN1_TYPE_pop_free(sk, ASN1_TYPE_free); - X509V3_section_free(cnf, sect); - return ret; -} - -static int append_exp(tag_exp_arg *arg, int exp_tag, int exp_class, - int exp_constructed, int exp_pad, int imp_ok) { - tag_exp_type *exp_tmp; - // Can only have IMPLICIT if permitted - if ((arg->imp_tag != -1) && !imp_ok) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_ILLEGAL_IMPLICIT_TAG); - return 0; - } - - if (arg->exp_count == ASN1_FLAG_EXP_MAX) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_DEPTH_EXCEEDED); - return 0; - } - - exp_tmp = &arg->exp_list[arg->exp_count++]; - - // If IMPLICIT set tag to implicit value then reset implicit tag since it - // has been used. 
- if (arg->imp_tag != -1) { - exp_tmp->exp_tag = arg->imp_tag; - exp_tmp->exp_class = arg->imp_class; - arg->imp_tag = -1; - arg->imp_class = -1; - } else { - exp_tmp->exp_tag = exp_tag; - exp_tmp->exp_class = exp_class; - } - exp_tmp->exp_constructed = exp_constructed; - exp_tmp->exp_pad = exp_pad; - - return 1; -} - -static int asn1_str2tag(const char *tagstr, int len) { - unsigned int i; - static const struct tag_name_st *tntmp, - tnst[] = { - ASN1_GEN_STR("BOOL", V_ASN1_BOOLEAN), - ASN1_GEN_STR("BOOLEAN", V_ASN1_BOOLEAN), - ASN1_GEN_STR("NULL", V_ASN1_NULL), - ASN1_GEN_STR("INT", V_ASN1_INTEGER), - ASN1_GEN_STR("INTEGER", V_ASN1_INTEGER), - ASN1_GEN_STR("ENUM", V_ASN1_ENUMERATED), - ASN1_GEN_STR("ENUMERATED", V_ASN1_ENUMERATED), - ASN1_GEN_STR("OID", V_ASN1_OBJECT), - ASN1_GEN_STR("OBJECT", V_ASN1_OBJECT), - ASN1_GEN_STR("UTCTIME", V_ASN1_UTCTIME), - ASN1_GEN_STR("UTC", V_ASN1_UTCTIME), - ASN1_GEN_STR("GENERALIZEDTIME", V_ASN1_GENERALIZEDTIME), - ASN1_GEN_STR("GENTIME", V_ASN1_GENERALIZEDTIME), - ASN1_GEN_STR("OCT", V_ASN1_OCTET_STRING), - ASN1_GEN_STR("OCTETSTRING", V_ASN1_OCTET_STRING), - ASN1_GEN_STR("BITSTR", V_ASN1_BIT_STRING), - ASN1_GEN_STR("BITSTRING", V_ASN1_BIT_STRING), - ASN1_GEN_STR("UNIVERSALSTRING", V_ASN1_UNIVERSALSTRING), - ASN1_GEN_STR("UNIV", V_ASN1_UNIVERSALSTRING), - ASN1_GEN_STR("IA5", V_ASN1_IA5STRING), - ASN1_GEN_STR("IA5STRING", V_ASN1_IA5STRING), - ASN1_GEN_STR("UTF8", V_ASN1_UTF8STRING), - ASN1_GEN_STR("UTF8String", V_ASN1_UTF8STRING), - ASN1_GEN_STR("BMP", V_ASN1_BMPSTRING), - ASN1_GEN_STR("BMPSTRING", V_ASN1_BMPSTRING), - ASN1_GEN_STR("VISIBLESTRING", V_ASN1_VISIBLESTRING), - ASN1_GEN_STR("VISIBLE", V_ASN1_VISIBLESTRING), - ASN1_GEN_STR("PRINTABLESTRING", V_ASN1_PRINTABLESTRING), - ASN1_GEN_STR("PRINTABLE", V_ASN1_PRINTABLESTRING), - ASN1_GEN_STR("T61", V_ASN1_T61STRING), - ASN1_GEN_STR("T61STRING", V_ASN1_T61STRING), - ASN1_GEN_STR("TELETEXSTRING", V_ASN1_T61STRING), - ASN1_GEN_STR("GeneralString", V_ASN1_GENERALSTRING), - 
ASN1_GEN_STR("GENSTR", V_ASN1_GENERALSTRING), - ASN1_GEN_STR("NUMERIC", V_ASN1_NUMERICSTRING), - ASN1_GEN_STR("NUMERICSTRING", V_ASN1_NUMERICSTRING), - - // Special cases - ASN1_GEN_STR("SEQUENCE", V_ASN1_SEQUENCE), - ASN1_GEN_STR("SEQ", V_ASN1_SEQUENCE), - ASN1_GEN_STR("SET", V_ASN1_SET), - // type modifiers - // Explicit tag - ASN1_GEN_STR("EXP", ASN1_GEN_FLAG_EXP), - ASN1_GEN_STR("EXPLICIT", ASN1_GEN_FLAG_EXP), - // Implicit tag - ASN1_GEN_STR("IMP", ASN1_GEN_FLAG_IMP), - ASN1_GEN_STR("IMPLICIT", ASN1_GEN_FLAG_IMP), - // OCTET STRING wrapper - ASN1_GEN_STR("OCTWRAP", ASN1_GEN_FLAG_OCTWRAP), - // SEQUENCE wrapper - ASN1_GEN_STR("SEQWRAP", ASN1_GEN_FLAG_SEQWRAP), - // SET wrapper - ASN1_GEN_STR("SETWRAP", ASN1_GEN_FLAG_SETWRAP), - // BIT STRING wrapper - ASN1_GEN_STR("BITWRAP", ASN1_GEN_FLAG_BITWRAP), - ASN1_GEN_STR("FORM", ASN1_GEN_FLAG_FORMAT), - ASN1_GEN_STR("FORMAT", ASN1_GEN_FLAG_FORMAT), - }; - - if (len == -1) { - len = strlen(tagstr); - } - - tntmp = tnst; - for (i = 0; i < sizeof(tnst) / sizeof(struct tag_name_st); i++, tntmp++) { - if ((len == tntmp->len) && !strncmp(tntmp->strnam, tagstr, len)) { - return tntmp->tag; - } - } - - return -1; -} - -static ASN1_TYPE *asn1_str2type(const char *str, int format, int utype) { - ASN1_TYPE *atmp = NULL; - - CONF_VALUE vtmp; - - unsigned char *rdata; - long rdlen; - - int no_unused = 1; - - if (!(atmp = ASN1_TYPE_new())) { - OPENSSL_PUT_ERROR(ASN1, ERR_R_MALLOC_FAILURE); - return NULL; - } - - if (!str) { - str = ""; - } - - switch (utype) { - case V_ASN1_NULL: - if (str && *str) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_ILLEGAL_NULL_VALUE); - goto bad_form; - } - break; - - case V_ASN1_BOOLEAN: - if (format != ASN1_GEN_FORMAT_ASCII) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_NOT_ASCII_FORMAT); - goto bad_form; - } - vtmp.name = NULL; - vtmp.section = NULL; - vtmp.value = (char *)str; - if (!X509V3_get_value_bool(&vtmp, &atmp->value.boolean)) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_ILLEGAL_BOOLEAN); - goto bad_str; - } - break; - - 
case V_ASN1_INTEGER: - case V_ASN1_ENUMERATED: - if (format != ASN1_GEN_FORMAT_ASCII) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_INTEGER_NOT_ASCII_FORMAT); - goto bad_form; - } - if (!(atmp->value.integer = s2i_ASN1_INTEGER(NULL, (char *)str))) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_ILLEGAL_INTEGER); - goto bad_str; - } - break; - - case V_ASN1_OBJECT: - if (format != ASN1_GEN_FORMAT_ASCII) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_OBJECT_NOT_ASCII_FORMAT); - goto bad_form; - } - if (!(atmp->value.object = OBJ_txt2obj(str, 0))) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_ILLEGAL_OBJECT); - goto bad_str; - } - break; - - case V_ASN1_UTCTIME: - case V_ASN1_GENERALIZEDTIME: - if (format != ASN1_GEN_FORMAT_ASCII) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_TIME_NOT_ASCII_FORMAT); - goto bad_form; - } - if (!(atmp->value.asn1_string = ASN1_STRING_new())) { - OPENSSL_PUT_ERROR(ASN1, ERR_R_MALLOC_FAILURE); - goto bad_str; - } - if (!ASN1_STRING_set(atmp->value.asn1_string, str, -1)) { - OPENSSL_PUT_ERROR(ASN1, ERR_R_MALLOC_FAILURE); - goto bad_str; - } - atmp->value.asn1_string->type = utype; - if (!ASN1_TIME_check(atmp->value.asn1_string)) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_ILLEGAL_TIME_VALUE); - goto bad_str; - } - - break; - - case V_ASN1_BMPSTRING: - case V_ASN1_PRINTABLESTRING: - case V_ASN1_IA5STRING: - case V_ASN1_T61STRING: - case V_ASN1_UTF8STRING: - case V_ASN1_VISIBLESTRING: - case V_ASN1_UNIVERSALSTRING: - case V_ASN1_GENERALSTRING: - case V_ASN1_NUMERICSTRING: - - if (format == ASN1_GEN_FORMAT_ASCII) { - format = MBSTRING_ASC; - } else if (format == ASN1_GEN_FORMAT_UTF8) { - format = MBSTRING_UTF8; - } else { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_ILLEGAL_FORMAT); - goto bad_form; - } - - if (ASN1_mbstring_copy(&atmp->value.asn1_string, (unsigned char *)str, -1, - format, ASN1_tag2bit(utype)) <= 0) { - OPENSSL_PUT_ERROR(ASN1, ERR_R_MALLOC_FAILURE); - goto bad_str; - } - - break; - - case V_ASN1_BIT_STRING: - - case V_ASN1_OCTET_STRING: - - if (!(atmp->value.asn1_string = ASN1_STRING_new())) { - 
OPENSSL_PUT_ERROR(ASN1, ERR_R_MALLOC_FAILURE); - goto bad_form; - } - - if (format == ASN1_GEN_FORMAT_HEX) { - if (!(rdata = x509v3_hex_to_bytes((char *)str, &rdlen))) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_ILLEGAL_HEX); - goto bad_str; - } - - atmp->value.asn1_string->data = rdata; - atmp->value.asn1_string->length = rdlen; - atmp->value.asn1_string->type = utype; - - } else if (format == ASN1_GEN_FORMAT_ASCII) { - ASN1_STRING_set(atmp->value.asn1_string, str, -1); - } else if ((format == ASN1_GEN_FORMAT_BITLIST) && - (utype == V_ASN1_BIT_STRING)) { - if (!CONF_parse_list(str, ',', 1, bitstr_cb, atmp->value.bit_string)) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_LIST_ERROR); - goto bad_str; - } - no_unused = 0; - - } else { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_ILLEGAL_BITSTRING_FORMAT); - goto bad_form; - } - - if ((utype == V_ASN1_BIT_STRING) && no_unused) { - atmp->value.asn1_string->flags &= ~(ASN1_STRING_FLAG_BITS_LEFT | 0x07); - atmp->value.asn1_string->flags |= ASN1_STRING_FLAG_BITS_LEFT; - } - - break; - - default: - OPENSSL_PUT_ERROR(ASN1, ASN1_R_UNSUPPORTED_TYPE); - goto bad_str; - break; - } - - atmp->type = utype; - return atmp; - -bad_str: - ERR_add_error_data(2, "string=", str); -bad_form: - - ASN1_TYPE_free(atmp); - return NULL; -} - -static int bitstr_cb(const char *elem, int len, void *bitstr) { - long bitnum; - char *eptr; - if (!elem) { - return 0; - } - bitnum = strtoul(elem, &eptr, 10); - if (eptr && *eptr && (eptr != elem + len)) { - return 0; - } - if (bitnum < 0) { - OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_NUMBER); - return 0; - } - if (!ASN1_BIT_STRING_set_bit(bitstr, bitnum, 1)) { - OPENSSL_PUT_ERROR(ASN1, ERR_R_MALLOC_FAILURE); - return 0; - } - return 1; -} diff --git a/third_party/boringssl/src/crypto/x509/asn1_gen.cc b/third_party/boringssl/src/crypto/x509/asn1_gen.cc new file mode 100644 index 00000000..27b84465 --- /dev/null +++ b/third_party/boringssl/src/crypto/x509/asn1_gen.cc @@ -0,0 +1,540 @@ +// Copyright 2002-2016 The OpenSSL Project Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "../conf/internal.h" +#include "../internal.h" +#include "internal.h" + + +using namespace bssl; + +// Although this file is in crypto/x509 for layering purposes, it emits +// errors from the ASN.1 module for OpenSSL compatibility. + +// ASN1_GEN_MAX_DEPTH is the maximum number of nested TLVs allowed. +#define ASN1_GEN_MAX_DEPTH 50 + +// ASN1_GEN_MAX_OUTPUT is the maximum output, in bytes, allowed. This limit is +// necessary because the SEQUENCE and SET section reference mechanism allows the +// output length to grow super-linearly with the input length. +#define ASN1_GEN_MAX_OUTPUT (64 * 1024) + +// ASN1_GEN_FORMAT_* are the values for the format modifiers. +#define ASN1_GEN_FORMAT_ASCII 1 +#define ASN1_GEN_FORMAT_UTF8 2 +#define ASN1_GEN_FORMAT_HEX 3 +#define ASN1_GEN_FORMAT_BITLIST 4 + +// generate_v3 converts |str| into an ASN.1 structure and writes the result to +// |cbb|. It returns one on success and zero on error. |depth| bounds recursion, +// and |format| specifies the current format modifier. +// +// If |tag| is non-zero, the structure is implicitly tagged with |tag|. |tag| +// must not have the constructed bit set. 
+static int generate_v3(CBB *cbb, const char *str, const X509V3_CTX *cnf, + CBS_ASN1_TAG tag, int format, int depth); + +static int bitstr_cb(const char *elem, size_t len, void *bitstr); + +ASN1_TYPE *bssl::ASN1_generate_v3(const char *str, const X509V3_CTX *cnf) { + ScopedCBB cbb; + if (!CBB_init(cbb.get(), 0) || // + !generate_v3(cbb.get(), str, cnf, /*tag=*/0, ASN1_GEN_FORMAT_ASCII, + /*depth=*/0)) { + return nullptr; + } + + // While not strictly necessary to avoid a DoS (we rely on any super-linear + // checks being performed internally), cap the overall output to + // |ASN1_GEN_MAX_OUTPUT| so the externally-visible behavior is consistent. + if (CBB_len(cbb.get()) > ASN1_GEN_MAX_OUTPUT) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_TOO_LONG); + return nullptr; + } + + const uint8_t *der = CBB_data(cbb.get()); + return d2i_ASN1_TYPE(nullptr, &der, CBB_len(cbb.get())); +} + +static int cbs_str_equal(const CBS *cbs, const char *str) { + return CBS_len(cbs) == strlen(str) && + OPENSSL_memcmp(CBS_data(cbs), str, strlen(str)) == 0; +} + +// parse_tag decodes a tag specifier in |cbs|. It returns the tag on success or +// zero on error. +static CBS_ASN1_TAG parse_tag(const CBS *cbs) { + CBS copy = *cbs; + uint64_t num; + if (!CBS_get_u64_decimal(©, &num) || num > CBS_ASN1_TAG_NUMBER_MASK) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_NUMBER); + return 0; + } + + CBS_ASN1_TAG tag_class = CBS_ASN1_CONTEXT_SPECIFIC; + // The tag may be suffixed by a class. 
+ uint8_t c; + if (CBS_get_u8(©, &c)) { + switch (c) { + case 'U': + tag_class = CBS_ASN1_UNIVERSAL; + break; + case 'A': + tag_class = CBS_ASN1_APPLICATION; + break; + case 'P': + tag_class = CBS_ASN1_PRIVATE; + break; + case 'C': + tag_class = CBS_ASN1_CONTEXT_SPECIFIC; + break; + default: { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_MODIFIER); + return 0; + } + } + if (CBS_len(©) != 0) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_MODIFIER); + return 0; + } + } + + // Tag [UNIVERSAL 0] is reserved for indefinite-length end-of-contents. We + // also use zero in this file to indicator no explicit tagging. + if (tag_class == CBS_ASN1_UNIVERSAL && num == 0) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_NUMBER); + return 0; + } + + return tag_class | (CBS_ASN1_TAG)num; +} + +static int generate_wrapped(CBB *cbb, const char *str, const X509V3_CTX *cnf, + CBS_ASN1_TAG tag, int padding, int format, + int depth) { + CBB child; + return CBB_add_asn1(cbb, &child, tag) && + (!padding || CBB_add_u8(&child, 0)) && + generate_v3(&child, str, cnf, /*tag=*/0, format, depth + 1) && + CBB_flush(cbb); +} + +static int generate_v3(CBB *cbb, const char *str, const X509V3_CTX *cnf, + CBS_ASN1_TAG tag, int format, int depth) { + assert((tag & CBS_ASN1_CONSTRUCTED) == 0); + if (depth > ASN1_GEN_MAX_DEPTH) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_ILLEGAL_NESTED_TAGGING); + return 0; + } + + // Process modifiers. This function uses a mix of NUL-terminated strings and + // |CBS|. Several functions only work with NUL-terminated strings, so we need + // to keep track of when a slice spans the whole buffer. + for (;;) { + // Skip whitespace. + while (*str != '\0' && OPENSSL_isspace((unsigned char)*str)) { + str++; + } + + // Modifiers end at commas. + const char *comma = strchr(str, ','); + if (comma == nullptr) { + break; + } + + // Remove trailing whitespace. 
+ CBS modifier; + CBS_init(&modifier, (const uint8_t *)str, comma - str); + for (;;) { + uint8_t v; + CBS copy = modifier; + if (!CBS_get_last_u8(©, &v) || !OPENSSL_isspace(v)) { + break; + } + modifier = copy; + } + + // Advance the string past the modifier, but save the original value. We + // will need to rewind if this is not a recognized modifier. + const char *str_old = str; + str = comma + 1; + + // Each modifier is either NAME:VALUE or NAME. + CBS name; + int has_value = CBS_get_until_first(&modifier, &name, ':'); + if (has_value) { + CBS_skip(&modifier, 1); // Skip the colon. + } else { + name = modifier; + CBS_init(&modifier, nullptr, 0); + } + + if (cbs_str_equal(&name, "FORMAT") || cbs_str_equal(&name, "FORM")) { + if (cbs_str_equal(&modifier, "ASCII")) { + format = ASN1_GEN_FORMAT_ASCII; + } else if (cbs_str_equal(&modifier, "UTF8")) { + format = ASN1_GEN_FORMAT_UTF8; + } else if (cbs_str_equal(&modifier, "HEX")) { + format = ASN1_GEN_FORMAT_HEX; + } else if (cbs_str_equal(&modifier, "BITLIST")) { + format = ASN1_GEN_FORMAT_BITLIST; + } else { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_UNKNOWN_FORMAT); + return 0; + } + } else if (cbs_str_equal(&name, "IMP") || + cbs_str_equal(&name, "IMPLICIT")) { + if (tag != 0) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_ILLEGAL_NESTED_TAGGING); + return 0; + } + tag = parse_tag(&modifier); + if (tag == 0) { + return 0; + } + } else if (cbs_str_equal(&name, "EXP") || + cbs_str_equal(&name, "EXPLICIT")) { + // It would actually be supportable, but OpenSSL does not allow wrapping + // an explicit tag in an implicit tag. + if (tag != 0) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_ILLEGAL_NESTED_TAGGING); + return 0; + } + tag = parse_tag(&modifier); + return tag != 0 && + generate_wrapped(cbb, str, cnf, tag | CBS_ASN1_CONSTRUCTED, + /*padding=*/0, format, depth); + } else if (cbs_str_equal(&name, "OCTWRAP")) { + tag = tag == 0 ? 
CBS_ASN1_OCTETSTRING : tag; + return generate_wrapped(cbb, str, cnf, tag, /*padding=*/0, format, depth); + } else if (cbs_str_equal(&name, "BITWRAP")) { + tag = tag == 0 ? CBS_ASN1_BITSTRING : tag; + return generate_wrapped(cbb, str, cnf, tag, /*padding=*/1, format, depth); + } else if (cbs_str_equal(&name, "SEQWRAP")) { + tag = tag == 0 ? CBS_ASN1_SEQUENCE : (tag | CBS_ASN1_CONSTRUCTED); + tag |= CBS_ASN1_CONSTRUCTED; + return generate_wrapped(cbb, str, cnf, tag, /*padding=*/0, format, depth); + } else if (cbs_str_equal(&name, "SETWRAP")) { + tag = tag == 0 ? CBS_ASN1_SET : (tag | CBS_ASN1_CONSTRUCTED); + return generate_wrapped(cbb, str, cnf, tag, /*padding=*/0, format, depth); + } else { + // If this was not a recognized modifier, rewind |str| to before splitting + // on the comma. The type itself consumes all remaining input. + str = str_old; + break; + } + } + + // The final element is, like modifiers, NAME:VALUE or NAME, but VALUE spans + // the length of the string, including any commas. + const char *colon = strchr(str, ':'); + CBS name; + const char *value; + int has_value = colon != nullptr; + if (has_value) { + CBS_init(&name, (const uint8_t *)str, colon - str); + value = colon + 1; + } else { + CBS_init(&name, (const uint8_t *)str, strlen(str)); + value = ""; // Most types treat missing and empty value equivalently. 
+ } + + static const struct { + const char *name; + CBS_ASN1_TAG type; + } kTypes[] = { + {"BOOL", CBS_ASN1_BOOLEAN}, + {"BOOLEAN", CBS_ASN1_BOOLEAN}, + {"NULL", CBS_ASN1_NULL}, + {"INT", CBS_ASN1_INTEGER}, + {"INTEGER", CBS_ASN1_INTEGER}, + {"ENUM", CBS_ASN1_ENUMERATED}, + {"ENUMERATED", CBS_ASN1_ENUMERATED}, + {"OID", CBS_ASN1_OBJECT}, + {"OBJECT", CBS_ASN1_OBJECT}, + {"UTCTIME", CBS_ASN1_UTCTIME}, + {"UTC", CBS_ASN1_UTCTIME}, + {"GENERALIZEDTIME", CBS_ASN1_GENERALIZEDTIME}, + {"GENTIME", CBS_ASN1_GENERALIZEDTIME}, + {"OCT", CBS_ASN1_OCTETSTRING}, + {"OCTETSTRING", CBS_ASN1_OCTETSTRING}, + {"BITSTR", CBS_ASN1_BITSTRING}, + {"BITSTRING", CBS_ASN1_BITSTRING}, + {"UNIVERSALSTRING", CBS_ASN1_UNIVERSALSTRING}, + {"UNIV", CBS_ASN1_UNIVERSALSTRING}, + {"IA5", CBS_ASN1_IA5STRING}, + {"IA5STRING", CBS_ASN1_IA5STRING}, + {"UTF8", CBS_ASN1_UTF8STRING}, + {"UTF8String", CBS_ASN1_UTF8STRING}, + {"BMP", CBS_ASN1_BMPSTRING}, + {"BMPSTRING", CBS_ASN1_BMPSTRING}, + {"PRINTABLESTRING", CBS_ASN1_PRINTABLESTRING}, + {"PRINTABLE", CBS_ASN1_PRINTABLESTRING}, + {"T61", CBS_ASN1_T61STRING}, + {"T61STRING", CBS_ASN1_T61STRING}, + {"TELETEXSTRING", CBS_ASN1_T61STRING}, + {"SEQUENCE", CBS_ASN1_SEQUENCE}, + {"SEQ", CBS_ASN1_SEQUENCE}, + {"SET", CBS_ASN1_SET}, + }; + CBS_ASN1_TAG type = 0; + for (const auto &t : kTypes) { + if (cbs_str_equal(&name, t.name)) { + type = t.type; + break; + } + } + if (type == 0) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_UNKNOWN_TAG); + return 0; + } + + // If there is an implicit tag, use the constructed bit from the base type. + tag = tag == 0 ? 
type : (tag | (type & CBS_ASN1_CONSTRUCTED)); + CBB child; + if (!CBB_add_asn1(cbb, &child, tag)) { + return 0; + } + + switch (type) { + case CBS_ASN1_NULL: + if (*value != '\0') { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_ILLEGAL_NULL_VALUE); + return 0; + } + return CBB_flush(cbb); + + case CBS_ASN1_BOOLEAN: { + if (format != ASN1_GEN_FORMAT_ASCII) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_NOT_ASCII_FORMAT); + return 0; + } + ASN1_BOOLEAN boolean; + if (!X509V3_bool_from_string(value, &boolean)) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_ILLEGAL_BOOLEAN); + return 0; + } + return CBB_add_u8(&child, boolean ? 0xff : 0x00) && CBB_flush(cbb); + } + + case CBS_ASN1_INTEGER: + case CBS_ASN1_ENUMERATED: { + if (format != ASN1_GEN_FORMAT_ASCII) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_INTEGER_NOT_ASCII_FORMAT); + return 0; + } + ASN1_INTEGER *obj = s2i_ASN1_INTEGER(nullptr, value); + if (obj == nullptr) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_ILLEGAL_INTEGER); + return 0; + } + int len = i2c_ASN1_INTEGER(obj, nullptr); + uint8_t *out; + int ok = len > 0 && // + CBB_add_space(&child, &out, len) && + i2c_ASN1_INTEGER(obj, &out) == len && CBB_flush(cbb); + ASN1_INTEGER_free(obj); + return ok; + } + + case CBS_ASN1_OBJECT: { + if (format != ASN1_GEN_FORMAT_ASCII) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_OBJECT_NOT_ASCII_FORMAT); + return 0; + } + ASN1_OBJECT *obj = OBJ_txt2obj(value, /*dont_search_names=*/0); + if (obj == nullptr || obj->length == 0) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_ILLEGAL_OBJECT); + return 0; + } + int ok = CBB_add_bytes(&child, obj->data, obj->length) && CBB_flush(cbb); + ASN1_OBJECT_free(obj); + return ok; + } + + case CBS_ASN1_UTCTIME: + case CBS_ASN1_GENERALIZEDTIME: { + if (format != ASN1_GEN_FORMAT_ASCII) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_TIME_NOT_ASCII_FORMAT); + return 0; + } + CBS value_cbs; + CBS_init(&value_cbs, (const uint8_t *)value, strlen(value)); + int ok = type == CBS_ASN1_UTCTIME + ? 
CBS_parse_utc_time(&value_cbs, nullptr, + /*allow_timezone_offset=*/0) + : CBS_parse_generalized_time(&value_cbs, nullptr, + /*allow_timezone_offset=*/0); + if (!ok) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_ILLEGAL_TIME_VALUE); + return 0; + } + return CBB_add_bytes(&child, (const uint8_t *)value, strlen(value)) && + CBB_flush(cbb); + } + + case CBS_ASN1_UNIVERSALSTRING: + case CBS_ASN1_IA5STRING: + case CBS_ASN1_UTF8STRING: + case CBS_ASN1_BMPSTRING: + case CBS_ASN1_PRINTABLESTRING: + case CBS_ASN1_T61STRING: { + int encoding; + if (format == ASN1_GEN_FORMAT_ASCII) { + encoding = MBSTRING_ASC; + } else if (format == ASN1_GEN_FORMAT_UTF8) { + encoding = MBSTRING_UTF8; + } else { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_ILLEGAL_FORMAT); + return 0; + } + + // |maxsize| is measured in code points, rather than bytes, but pass it in + // as a loose cap so fuzzers can exit from excessively long inputs + // earlier. This limit is not load-bearing because |ASN1_mbstring_ncopy|'s + // output is already linear in the input. + ASN1_STRING *obj = nullptr; + if (ASN1_mbstring_ncopy(&obj, (const uint8_t *)value, -1, encoding, + ASN1_tag2bit(type), /*minsize=*/0, + /*maxsize=*/ASN1_GEN_MAX_OUTPUT) <= 0) { + return 0; + } + int ok = CBB_add_bytes(&child, obj->data, obj->length) && CBB_flush(cbb); + ASN1_STRING_free(obj); + return ok; + } + + case CBS_ASN1_BITSTRING: + if (format == ASN1_GEN_FORMAT_BITLIST) { + ASN1_BIT_STRING *obj = ASN1_BIT_STRING_new(); + if (obj == nullptr) { + return 0; + } + if (!CONF_parse_list(value, ',', 1, bitstr_cb, obj)) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_LIST_ERROR); + ASN1_BIT_STRING_free(obj); + return 0; + } + int len = i2c_ASN1_BIT_STRING(obj, nullptr); + uint8_t *out; + int ok = len > 0 && // + CBB_add_space(&child, &out, len) && + i2c_ASN1_BIT_STRING(obj, &out) == len && // + CBB_flush(cbb); + ASN1_BIT_STRING_free(obj); + return ok; + } + + // The other formats are the same as OCTET STRING, but with the leading + // zero bytes. 
+ if (!CBB_add_u8(&child, 0)) { + return 0; + } + [[fallthrough]]; + + case CBS_ASN1_OCTETSTRING: + if (format == ASN1_GEN_FORMAT_ASCII) { + return CBB_add_bytes(&child, (const uint8_t *)value, strlen(value)) && + CBB_flush(cbb); + } + if (format == ASN1_GEN_FORMAT_HEX) { + size_t len; + uint8_t *data = x509v3_hex_to_bytes(value, &len); + if (data == nullptr) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_ILLEGAL_HEX); + return 0; + } + int ok = CBB_add_bytes(&child, data, len) && CBB_flush(cbb); + OPENSSL_free(data); + return ok; + } + + OPENSSL_PUT_ERROR(ASN1, ASN1_R_ILLEGAL_BITSTRING_FORMAT); + return 0; + + case CBS_ASN1_SEQUENCE: + case CBS_ASN1_SET: + if (has_value) { + if (cnf == nullptr) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_SEQUENCE_OR_SET_NEEDS_CONFIG); + return 0; + } + const STACK_OF(CONF_VALUE) *section = X509V3_get_section(cnf, value); + if (section == nullptr) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_SEQUENCE_OR_SET_NEEDS_CONFIG); + return 0; + } + for (size_t i = 0; i < sk_CONF_VALUE_num(section); i++) { + const CONF_VALUE *conf = sk_CONF_VALUE_value(section, i); + if (!generate_v3(&child, conf->value, cnf, /*tag=*/0, + ASN1_GEN_FORMAT_ASCII, depth + 1)) { + return 0; + } + // This recursive call, by referencing |section|, is the one place + // where |generate_v3|'s output can be super-linear in the input. + // Check bounds here. + if (CBB_len(&child) > ASN1_GEN_MAX_OUTPUT) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_TOO_LONG); + return 0; + } + } + } + if (type == CBS_ASN1_SET) { + // The SET type here is a SET OF and must be sorted. 
+ return CBB_flush_asn1_set_of(&child) && CBB_flush(cbb); + } + return CBB_flush(cbb); + + default: + OPENSSL_PUT_ERROR(ASN1, ERR_R_INTERNAL_ERROR); + return 0; + } +} + +static int bitstr_cb(const char *elem, size_t len, void *bitstr) { + CBS cbs; + CBS_init(&cbs, (const uint8_t *)elem, len); + uint64_t bitnum; + if (!CBS_get_u64_decimal(&cbs, &bitnum) || CBS_len(&cbs) != 0 || + // Cap the highest allowed bit so this mechanism cannot be used to create + // extremely large allocations with short inputs. The highest named bit in + // RFC 5280 is 8, so 256 should give comfortable margin but still only + // allow a 32-byte allocation. + // + // We do not consider this function to be safe with untrusted inputs (even + // without bugs, it is prone to string injection vulnerabilities), so DoS + // is not truly a concern, but the limit is necessary to keep fuzzing + // effective. + bitnum > 256) { + OPENSSL_PUT_ERROR(ASN1, ASN1_R_INVALID_NUMBER); + return 0; + } + if (!ASN1_BIT_STRING_set_bit(reinterpret_cast(bitstr), + (int)bitnum, 1)) { + return 0; + } + return 1; +} diff --git a/third_party/boringssl/src/crypto/x509/by_dir.c b/third_party/boringssl/src/crypto/x509/by_dir.c deleted file mode 100644 index 9e3290dd..00000000 --- a/third_party/boringssl/src/crypto/x509/by_dir.c +++ /dev/null @@ -1,439 +0,0 @@ -/* crypto/x509/by_dir.c */ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. 
The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include -#include -#include - -#include -#include -#include -#include -#include - -#if !defined(OPENSSL_TRUSTY) - -#include "../internal.h" -#include "internal.h" - -typedef struct lookup_dir_hashes_st { - unsigned long hash; - int suffix; -} BY_DIR_HASH; - -typedef struct lookup_dir_entry_st { - char *dir; - int dir_type; - STACK_OF(BY_DIR_HASH) *hashes; -} BY_DIR_ENTRY; - -typedef struct lookup_dir_st { - BUF_MEM *buffer; - STACK_OF(BY_DIR_ENTRY) *dirs; -} BY_DIR; - -DEFINE_STACK_OF(BY_DIR_HASH) -DEFINE_STACK_OF(BY_DIR_ENTRY) - -static int dir_ctrl(X509_LOOKUP *ctx, int cmd, const char *argp, long argl, - char **ret); -static int new_dir(X509_LOOKUP *lu); -static void free_dir(X509_LOOKUP *lu); -static int add_cert_dir(BY_DIR *ctx, const char *dir, int type); -static int get_cert_by_subject(X509_LOOKUP *xl, int type, X509_NAME *name, - X509_OBJECT *ret); -static X509_LOOKUP_METHOD x509_dir_lookup = { - "Load certs from files in a directory", - new_dir, // new - free_dir, // free - NULL, // init - NULL, // shutdown - dir_ctrl, // ctrl - get_cert_by_subject, // get_by_subject - NULL, // get_by_issuer_serial - NULL, // get_by_fingerprint - NULL, // get_by_alias -}; - 
-X509_LOOKUP_METHOD *X509_LOOKUP_hash_dir(void) { return &x509_dir_lookup; } - -static int dir_ctrl(X509_LOOKUP *ctx, int cmd, const char *argp, long argl, - char **retp) { - int ret = 0; - char *dir = NULL; - - BY_DIR *ld = ctx->method_data; - - switch (cmd) { - case X509_L_ADD_DIR: - if (argl == X509_FILETYPE_DEFAULT) { - dir = (char *)getenv(X509_get_default_cert_dir_env()); - if (dir) { - ret = add_cert_dir(ld, dir, X509_FILETYPE_PEM); - } else { - ret = - add_cert_dir(ld, X509_get_default_cert_dir(), X509_FILETYPE_PEM); - } - if (!ret) { - OPENSSL_PUT_ERROR(X509, X509_R_LOADING_CERT_DIR); - } - } else { - ret = add_cert_dir(ld, argp, (int)argl); - } - break; - } - return ret; -} - -static int new_dir(X509_LOOKUP *lu) { - BY_DIR *a; - - if ((a = (BY_DIR *)OPENSSL_malloc(sizeof(BY_DIR))) == NULL) { - return 0; - } - if ((a->buffer = BUF_MEM_new()) == NULL) { - OPENSSL_free(a); - return 0; - } - a->dirs = NULL; - lu->method_data = a; - return 1; -} - -static void by_dir_hash_free(BY_DIR_HASH *hash) { OPENSSL_free(hash); } - -static int by_dir_hash_cmp(const BY_DIR_HASH **a, const BY_DIR_HASH **b) { - if ((*a)->hash > (*b)->hash) { - return 1; - } - if ((*a)->hash < (*b)->hash) { - return -1; - } - return 0; -} - -static void by_dir_entry_free(BY_DIR_ENTRY *ent) { - if (ent != NULL) { - OPENSSL_free(ent->dir); - sk_BY_DIR_HASH_pop_free(ent->hashes, by_dir_hash_free); - OPENSSL_free(ent); - } -} - -static void free_dir(X509_LOOKUP *lu) { - BY_DIR *a = lu->method_data; - if (a != NULL) { - sk_BY_DIR_ENTRY_pop_free(a->dirs, by_dir_entry_free); - BUF_MEM_free(a->buffer); - OPENSSL_free(a); - } -} - -static int add_cert_dir(BY_DIR *ctx, const char *dir, int type) { - size_t j, len; - const char *s, *ss, *p; - - if (dir == NULL || !*dir) { - OPENSSL_PUT_ERROR(X509, X509_R_INVALID_DIRECTORY); - return 0; - } - - s = dir; - p = s; - do { - if ((*p == ':') || (*p == '\0')) { - BY_DIR_ENTRY *ent; - ss = s; - s = p + 1; - len = p - ss; - if (len == 0) { - continue; - } - for 
(j = 0; j < sk_BY_DIR_ENTRY_num(ctx->dirs); j++) { - ent = sk_BY_DIR_ENTRY_value(ctx->dirs, j); - if (strlen(ent->dir) == len && strncmp(ent->dir, ss, len) == 0) { - break; - } - } - if (j < sk_BY_DIR_ENTRY_num(ctx->dirs)) { - continue; - } - if (ctx->dirs == NULL) { - ctx->dirs = sk_BY_DIR_ENTRY_new_null(); - if (!ctx->dirs) { - OPENSSL_PUT_ERROR(X509, ERR_R_MALLOC_FAILURE); - return 0; - } - } - ent = OPENSSL_malloc(sizeof(BY_DIR_ENTRY)); - if (!ent) { - return 0; - } - ent->dir_type = type; - ent->hashes = sk_BY_DIR_HASH_new(by_dir_hash_cmp); - ent->dir = OPENSSL_malloc(len + 1); - if (!ent->dir || !ent->hashes) { - by_dir_entry_free(ent); - return 0; - } - OPENSSL_strlcpy(ent->dir, ss, len + 1); - if (!sk_BY_DIR_ENTRY_push(ctx->dirs, ent)) { - by_dir_entry_free(ent); - return 0; - } - } - } while (*p++ != '\0'); - return 1; -} - -// g_ent_hashes_lock protects the |hashes| member of all |BY_DIR_ENTRY| -// objects. -static struct CRYPTO_STATIC_MUTEX g_ent_hashes_lock = CRYPTO_STATIC_MUTEX_INIT; - -static int get_cert_by_subject(X509_LOOKUP *xl, int type, X509_NAME *name, - X509_OBJECT *ret) { - union { - struct { - X509 st_x509; - X509_CINF st_x509_cinf; - } x509; - struct { - X509_CRL st_crl; - X509_CRL_INFO st_crl_info; - } crl; - } data; - int ok = 0; - size_t i; - int j, k; - unsigned long h; - unsigned long hash_array[2]; - int hash_index; - BUF_MEM *b = NULL; - X509_OBJECT stmp, *tmp; - const char *postfix = ""; - - if (name == NULL) { - return 0; - } - - stmp.type = type; - if (type == X509_LU_X509) { - data.x509.st_x509.cert_info = &data.x509.st_x509_cinf; - data.x509.st_x509_cinf.subject = name; - stmp.data.x509 = &data.x509.st_x509; - postfix = ""; - } else if (type == X509_LU_CRL) { - data.crl.st_crl.crl = &data.crl.st_crl_info; - data.crl.st_crl_info.issuer = name; - stmp.data.crl = &data.crl.st_crl; - postfix = "r"; - } else { - OPENSSL_PUT_ERROR(X509, X509_R_WRONG_LOOKUP_TYPE); - goto finish; - } - - if ((b = BUF_MEM_new()) == NULL) { - 
OPENSSL_PUT_ERROR(X509, ERR_R_BUF_LIB); - goto finish; - } - - BY_DIR *ctx = xl->method_data; - - hash_array[0] = X509_NAME_hash(name); - hash_array[1] = X509_NAME_hash_old(name); - for (hash_index = 0; hash_index < 2; ++hash_index) { - h = hash_array[hash_index]; - for (i = 0; i < sk_BY_DIR_ENTRY_num(ctx->dirs); i++) { - BY_DIR_ENTRY *ent; - size_t idx; - BY_DIR_HASH htmp, *hent; - ent = sk_BY_DIR_ENTRY_value(ctx->dirs, i); - j = strlen(ent->dir) + 1 + 8 + 6 + 1 + 1; - if (!BUF_MEM_grow(b, j)) { - OPENSSL_PUT_ERROR(X509, ERR_R_MALLOC_FAILURE); - goto finish; - } - if (type == X509_LU_CRL && ent->hashes) { - htmp.hash = h; - CRYPTO_STATIC_MUTEX_lock_read(&g_ent_hashes_lock); - if (sk_BY_DIR_HASH_find(ent->hashes, &idx, &htmp)) { - hent = sk_BY_DIR_HASH_value(ent->hashes, idx); - k = hent->suffix; - } else { - hent = NULL; - k = 0; - } - CRYPTO_STATIC_MUTEX_unlock_read(&g_ent_hashes_lock); - } else { - k = 0; - hent = NULL; - } - for (;;) { - char c = '/'; -#ifdef OPENSSL_SYS_VMS - c = ent->dir[strlen(ent->dir) - 1]; - if (c != ':' && c != '>' && c != ']') { - // If no separator is present, we assume the directory - // specifier is a logical name, and add a colon. We - // really should use better VMS routines for merging - // things like this, but this will do for now... -- - // Richard Levitte - c = ':'; - } else { - c = '\0'; - } -#endif - if (c == '\0') { - // This is special. When c == '\0', no directory - // separator should be added. - BIO_snprintf(b->data, b->max, "%s%08lx.%s%d", ent->dir, h, postfix, - k); - } else { - BIO_snprintf(b->data, b->max, "%s%c%08lx.%s%d", ent->dir, c, h, - postfix, k); - } -#ifndef OPENSSL_NO_POSIX_IO -#if defined(_WIN32) && !defined(stat) -#define stat _stat -#endif - { - struct stat st; - if (stat(b->data, &st) < 0) { - break; - } - } -#endif - // found one. 
- if (type == X509_LU_X509) { - if ((X509_load_cert_file(xl, b->data, ent->dir_type)) == 0) { - break; - } - } else if (type == X509_LU_CRL) { - if ((X509_load_crl_file(xl, b->data, ent->dir_type)) == 0) { - break; - } - } - // else case will caught higher up - k++; - } - - // we have added it to the cache so now pull it out again - CRYPTO_MUTEX_lock_write(&xl->store_ctx->objs_lock); - tmp = NULL; - sk_X509_OBJECT_sort(xl->store_ctx->objs); - if (sk_X509_OBJECT_find(xl->store_ctx->objs, &idx, &stmp)) { - tmp = sk_X509_OBJECT_value(xl->store_ctx->objs, idx); - } - CRYPTO_MUTEX_unlock_write(&xl->store_ctx->objs_lock); - - // If a CRL, update the last file suffix added for this - - if (type == X509_LU_CRL) { - CRYPTO_STATIC_MUTEX_lock_write(&g_ent_hashes_lock); - // Look for entry again in case another thread added an entry - // first. - if (!hent) { - htmp.hash = h; - sk_BY_DIR_HASH_sort(ent->hashes); - if (sk_BY_DIR_HASH_find(ent->hashes, &idx, &htmp)) { - hent = sk_BY_DIR_HASH_value(ent->hashes, idx); - } - } - if (!hent) { - hent = OPENSSL_malloc(sizeof(BY_DIR_HASH)); - if (hent == NULL) { - CRYPTO_STATIC_MUTEX_unlock_write(&g_ent_hashes_lock); - ok = 0; - goto finish; - } - hent->hash = h; - hent->suffix = k; - if (!sk_BY_DIR_HASH_push(ent->hashes, hent)) { - CRYPTO_STATIC_MUTEX_unlock_write(&g_ent_hashes_lock); - OPENSSL_free(hent); - ok = 0; - goto finish; - } - sk_BY_DIR_HASH_sort(ent->hashes); - } else if (hent->suffix < k) { - hent->suffix = k; - } - - CRYPTO_STATIC_MUTEX_unlock_write(&g_ent_hashes_lock); - } - - if (tmp != NULL) { - ok = 1; - ret->type = tmp->type; - OPENSSL_memcpy(&ret->data, &tmp->data, sizeof(ret->data)); - - // Clear any errors that might have been raised processing empty - // or malformed files. 
- ERR_clear_error(); - - // If we were going to up the reference count, we would need - // to do it on a perl 'type' basis - goto finish; - } - } - } -finish: - BUF_MEM_free(b); - return ok; -} - -#endif // OPENSSL_TRUSTY diff --git a/third_party/boringssl/src/crypto/x509/by_dir.cc b/third_party/boringssl/src/crypto/x509/by_dir.cc new file mode 100644 index 00000000..545ffd95 --- /dev/null +++ b/third_party/boringssl/src/crypto/x509/by_dir.cc @@ -0,0 +1,354 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include + +#include +#include +#include +#include + +#include "../internal.h" +#include "../mem_internal.h" +#include "internal.h" + + +using namespace bssl; + +BSSL_NAMESPACE_BEGIN + +typedef struct lookup_dir_hashes_st { + uint32_t hash; + int suffix; +} BY_DIR_HASH; + +typedef struct lookup_dir_entry_st { + Mutex lock; + char *dir; + int dir_type; + STACK_OF(BY_DIR_HASH) *hashes; +} BY_DIR_ENTRY; + +typedef struct lookup_dir_st { + STACK_OF(BY_DIR_ENTRY) *dirs; +} BY_DIR; + +DEFINE_NAMESPACED_STACK_OF(BY_DIR_HASH) +DEFINE_NAMESPACED_STACK_OF(BY_DIR_ENTRY) + +BSSL_NAMESPACE_END + +static int dir_ctrl(X509_LOOKUP *ctx, int cmd, const char *argp, long argl, + char **ret); +static int new_dir(X509_LOOKUP *lu); +static void free_dir(X509_LOOKUP *lu); +static int add_cert_dir(BY_DIR *ctx, const char *dir, int type); +static int get_cert_by_subject(X509_LOOKUP *xl, int type, const X509_NAME *name, + X509_OBJECT *ret); +static const X509_LOOKUP_METHOD x509_dir_lookup = { + new_dir, // new + free_dir, // free + dir_ctrl, // ctrl + get_cert_by_subject, // get_by_subject +}; + +const X509_LOOKUP_METHOD *X509_LOOKUP_hash_dir() { return &x509_dir_lookup; } + +static int dir_ctrl(X509_LOOKUP *ctx, int cmd, const char *argp, long argl, + char **retp) { + int ret = 0; + char *dir = nullptr; + + BY_DIR *ld = reinterpret_cast(ctx->method_data); + + switch (cmd) { + case X509_L_ADD_DIR: + if (argl == X509_FILETYPE_DEFAULT) { + dir = (char *)getenv(X509_get_default_cert_dir_env()); + if (dir) { + ret = add_cert_dir(ld, dir, X509_FILETYPE_PEM); + } else { + ret = + add_cert_dir(ld, X509_get_default_cert_dir(), X509_FILETYPE_PEM); + } + if (!ret) { + OPENSSL_PUT_ERROR(X509, X509_R_LOADING_CERT_DIR); + } + } else { + ret = add_cert_dir(ld, argp, (int)argl); + } + break; + } + return ret; +} + +static int new_dir(X509_LOOKUP *lu) { + BY_DIR *a; + + if ((a = New()) == nullptr) { + return 0; + } + a->dirs = nullptr; + lu->method_data = a; + return 1; +} + +static void 
by_dir_hash_free(BY_DIR_HASH *hash) { Delete(hash); } + +static int by_dir_hash_cmp(const BY_DIR_HASH *const *a, + const BY_DIR_HASH *const *b) { + if ((*a)->hash > (*b)->hash) { + return 1; + } + if ((*a)->hash < (*b)->hash) { + return -1; + } + return 0; +} + +static void by_dir_entry_free(BY_DIR_ENTRY *ent) { + if (ent != nullptr) { + Delete(ent->dir); + sk_BY_DIR_HASH_pop_free(ent->hashes, by_dir_hash_free); + Delete(ent); + } +} + +static void free_dir(X509_LOOKUP *lu) { + BY_DIR *a = reinterpret_cast(lu->method_data); + if (a != nullptr) { + sk_BY_DIR_ENTRY_pop_free(a->dirs, by_dir_entry_free); + Delete(a); + } +} + +#if defined(OPENSSL_WINDOWS) +#define DIR_HASH_SEPARATOR ';' +#else +#define DIR_HASH_SEPARATOR ':' +#endif + +static int add_cert_dir(BY_DIR *ctx, const char *dir, int type) { + size_t j, len; + const char *s, *ss, *p; + + if (dir == nullptr || !*dir) { + OPENSSL_PUT_ERROR(X509, X509_R_INVALID_DIRECTORY); + return 0; + } + + s = dir; + p = s; + do { + if (*p == DIR_HASH_SEPARATOR || *p == '\0') { + BY_DIR_ENTRY *ent; + ss = s; + s = p + 1; + len = p - ss; + if (len == 0) { + continue; + } + for (j = 0; j < sk_BY_DIR_ENTRY_num(ctx->dirs); j++) { + ent = sk_BY_DIR_ENTRY_value(ctx->dirs, j); + if (strlen(ent->dir) == len && strncmp(ent->dir, ss, len) == 0) { + break; + } + } + if (j < sk_BY_DIR_ENTRY_num(ctx->dirs)) { + continue; + } + if (ctx->dirs == nullptr) { + ctx->dirs = sk_BY_DIR_ENTRY_new_null(); + if (!ctx->dirs) { + return 0; + } + } + ent = New(); + if (!ent) { + return 0; + } + ent->dir_type = type; + ent->hashes = sk_BY_DIR_HASH_new(by_dir_hash_cmp); + ent->dir = OPENSSL_strndup(ss, len); + if (ent->dir == nullptr || ent->hashes == nullptr || + !sk_BY_DIR_ENTRY_push(ctx->dirs, ent)) { + by_dir_entry_free(ent); + return 0; + } + } + } while (*p++ != '\0'); + return 1; +} + +static int get_cert_by_subject(X509_LOOKUP *xl, int type, const X509_NAME *name, + X509_OBJECT *ret) { + UniquePtr lookup_cert; + UniquePtr lookup_crl; + int ok = 0; 
+ size_t i; + int k; + uint32_t h; + uint32_t hash_array[2]; + int hash_index; + char *b = nullptr; + X509_OBJECT stmp, *tmp; + const char *postfix = ""; + + if (name == nullptr) { + return 0; + } + + stmp.type = type; + BY_DIR *ctx = reinterpret_cast(xl->method_data); + if (type == X509_LU_X509) { + lookup_cert.reset(X509_new()); + if (lookup_cert == nullptr || + !X509_set_subject_name(lookup_cert.get(), name)) { + return 0; + } + stmp.data.x509 = lookup_cert.get(); + postfix = ""; + } else if (type == X509_LU_CRL) { + lookup_crl.reset(X509_CRL_new()); + if (lookup_crl == nullptr || + !X509_CRL_set_issuer_name(lookup_crl.get(), name)) { + return 0; + } + stmp.data.crl = lookup_crl.get(); + postfix = "r"; + } else { + OPENSSL_PUT_ERROR(X509, X509_R_WRONG_LOOKUP_TYPE); + goto finish; + } + + hash_array[0] = X509_NAME_hash(name); + hash_array[1] = X509_NAME_hash_old(name); + for (hash_index = 0; hash_index < 2; ++hash_index) { + h = hash_array[hash_index]; + for (i = 0; i < sk_BY_DIR_ENTRY_num(ctx->dirs); i++) { + BY_DIR_ENTRY *ent; + size_t idx; + BY_DIR_HASH htmp, *hent; + ent = sk_BY_DIR_ENTRY_value(ctx->dirs, i); + if (type == X509_LU_CRL && ent->hashes) { + htmp.hash = h; + MutexReadLock lock(&ent->lock); + if (sk_BY_DIR_HASH_find(ent->hashes, &idx, &htmp)) { + hent = sk_BY_DIR_HASH_value(ent->hashes, idx); + k = hent->suffix; + } else { + hent = nullptr; + k = 0; + } + } else { + k = 0; + hent = nullptr; + } + for (;;) { + OPENSSL_free(b); + if (OPENSSL_asprintf(&b, "%s/%08" PRIx32 ".%s%d", ent->dir, h, postfix, + k) == -1) { + OPENSSL_PUT_ERROR(X509, ERR_R_BUF_LIB); + b = nullptr; + goto finish; + } + if (type == X509_LU_X509) { + if ((X509_load_cert_file(xl, b, ent->dir_type)) == 0) { + // Don't expose the lower level error, All of these boil + // down to "we could not find a CA". 
+ ERR_clear_error(); + break; + } + } else if (type == X509_LU_CRL) { + if ((X509_load_crl_file(xl, b, ent->dir_type)) == 0) { + // Don't expose the lower level error, All of these boil + // down to "we could not find a CRL". + ERR_clear_error(); + break; + } + } + // The lack of a CA or CRL will be caught higher up + k++; + } + + // we have added it to the cache so now pull it out again + auto *store_impl = FromOpaque(xl->store_ctx); + store_impl->objs_lock.LockWrite(); + tmp = nullptr; + sk_X509_OBJECT_sort(store_impl->objs.get()); + if (sk_X509_OBJECT_find(store_impl->objs.get(), &idx, &stmp)) { + tmp = sk_X509_OBJECT_value(store_impl->objs.get(), idx); + } + store_impl->objs_lock.UnlockWrite(); + + // If a CRL, update the last file suffix added for this + + if (type == X509_LU_CRL) { + ent->lock.LockWrite(); + // Look for entry again in case another thread added an entry + // first. + if (!hent) { + htmp.hash = h; + sk_BY_DIR_HASH_sort(ent->hashes); + if (sk_BY_DIR_HASH_find(ent->hashes, &idx, &htmp)) { + hent = sk_BY_DIR_HASH_value(ent->hashes, idx); + } + } + if (!hent) { + hent = New(); + if (hent == nullptr) { + ent->lock.UnlockWrite(); + ok = 0; + goto finish; + } + hent->hash = h; + hent->suffix = k; + if (!sk_BY_DIR_HASH_push(ent->hashes, hent)) { + ent->lock.UnlockWrite(); + Delete(hent); + ok = 0; + goto finish; + } + sk_BY_DIR_HASH_sort(ent->hashes); + } else if (hent->suffix < k) { + hent->suffix = k; + } + + ent->lock.UnlockWrite(); + } + + if (tmp != nullptr) { + ok = 1; + ret->type = tmp->type; + OPENSSL_memcpy(&ret->data, &tmp->data, sizeof(ret->data)); + + // Clear any errors that might have been raised processing empty + // or malformed files. 
+ ERR_clear_error(); + + // If we were going to up the reference count, we would need + // to do it on a perl 'type' basis + goto finish; + } + } + } +finish: + OPENSSL_free(b); + return ok; +} + +int X509_LOOKUP_add_dir(X509_LOOKUP *lookup, const char *name, int type) { + return X509_LOOKUP_ctrl(lookup, X509_L_ADD_DIR, name, type, nullptr); +} diff --git a/third_party/boringssl/src/crypto/x509/by_file.c b/third_party/boringssl/src/crypto/x509/by_file.c deleted file mode 100644 index 3435fc2a..00000000 --- a/third_party/boringssl/src/crypto/x509/by_file.c +++ /dev/null @@ -1,286 +0,0 @@ -/* crypto/x509/by_file.c */ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. 
Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
*/ - -#include - -#include -#include -#include - -#include "internal.h" - -#ifndef OPENSSL_NO_STDIO - -static int by_file_ctrl(X509_LOOKUP *ctx, int cmd, const char *argc, long argl, - char **ret); -static X509_LOOKUP_METHOD x509_file_lookup = { - "Load file into cache", - NULL, // new - NULL, // free - NULL, // init - NULL, // shutdown - by_file_ctrl, // ctrl - NULL, // get_by_subject - NULL, // get_by_issuer_serial - NULL, // get_by_fingerprint - NULL, // get_by_alias -}; - -X509_LOOKUP_METHOD *X509_LOOKUP_file(void) { return &x509_file_lookup; } - -static int by_file_ctrl(X509_LOOKUP *ctx, int cmd, const char *argp, long argl, - char **ret) { - int ok = 0; - const char *file; - - switch (cmd) { - case X509_L_FILE_LOAD: - if (argl == X509_FILETYPE_DEFAULT) { - file = getenv(X509_get_default_cert_file_env()); - if (file) { - ok = (X509_load_cert_crl_file(ctx, file, X509_FILETYPE_PEM) != 0); - } - - else { - ok = (X509_load_cert_crl_file(ctx, X509_get_default_cert_file(), - X509_FILETYPE_PEM) != 0); - } - - if (!ok) { - OPENSSL_PUT_ERROR(X509, X509_R_LOADING_DEFAULTS); - } - } else { - if (argl == X509_FILETYPE_PEM) { - ok = (X509_load_cert_crl_file(ctx, argp, X509_FILETYPE_PEM) != 0); - } else { - ok = (X509_load_cert_file(ctx, argp, (int)argl) != 0); - } - } - break; - } - return ok; -} - -int X509_load_cert_file(X509_LOOKUP *ctx, const char *file, int type) { - int ret = 0; - BIO *in = NULL; - int i, count = 0; - X509 *x = NULL; - - in = BIO_new(BIO_s_file()); - - if ((in == NULL) || (BIO_read_filename(in, file) <= 0)) { - OPENSSL_PUT_ERROR(X509, ERR_R_SYS_LIB); - goto err; - } - - if (type == X509_FILETYPE_PEM) { - for (;;) { - x = PEM_read_bio_X509_AUX(in, NULL, NULL, NULL); - if (x == NULL) { - uint32_t error = ERR_peek_last_error(); - if (ERR_GET_LIB(error) == ERR_LIB_PEM && - ERR_GET_REASON(error) == PEM_R_NO_START_LINE && count > 0) { - ERR_clear_error(); - break; - } - OPENSSL_PUT_ERROR(X509, ERR_R_PEM_LIB); - goto err; - } - i = 
X509_STORE_add_cert(ctx->store_ctx, x); - if (!i) { - goto err; - } - count++; - X509_free(x); - x = NULL; - } - ret = count; - } else if (type == X509_FILETYPE_ASN1) { - x = d2i_X509_bio(in, NULL); - if (x == NULL) { - OPENSSL_PUT_ERROR(X509, ERR_R_ASN1_LIB); - goto err; - } - i = X509_STORE_add_cert(ctx->store_ctx, x); - if (!i) { - goto err; - } - ret = i; - } else { - OPENSSL_PUT_ERROR(X509, X509_R_BAD_X509_FILETYPE); - goto err; - } - - if (ret == 0) { - OPENSSL_PUT_ERROR(X509, X509_R_NO_CERTIFICATE_FOUND); - } - -err: - X509_free(x); - BIO_free(in); - return ret; -} - -int X509_load_crl_file(X509_LOOKUP *ctx, const char *file, int type) { - int ret = 0; - BIO *in = NULL; - int i, count = 0; - X509_CRL *x = NULL; - - in = BIO_new(BIO_s_file()); - - if ((in == NULL) || (BIO_read_filename(in, file) <= 0)) { - OPENSSL_PUT_ERROR(X509, ERR_R_SYS_LIB); - goto err; - } - - if (type == X509_FILETYPE_PEM) { - for (;;) { - x = PEM_read_bio_X509_CRL(in, NULL, NULL, NULL); - if (x == NULL) { - uint32_t error = ERR_peek_last_error(); - if (ERR_GET_LIB(error) == ERR_LIB_PEM && - ERR_GET_REASON(error) == PEM_R_NO_START_LINE && count > 0) { - ERR_clear_error(); - break; - } - OPENSSL_PUT_ERROR(X509, ERR_R_PEM_LIB); - goto err; - } - i = X509_STORE_add_crl(ctx->store_ctx, x); - if (!i) { - goto err; - } - count++; - X509_CRL_free(x); - x = NULL; - } - ret = count; - } else if (type == X509_FILETYPE_ASN1) { - x = d2i_X509_CRL_bio(in, NULL); - if (x == NULL) { - OPENSSL_PUT_ERROR(X509, ERR_R_ASN1_LIB); - goto err; - } - i = X509_STORE_add_crl(ctx->store_ctx, x); - if (!i) { - goto err; - } - ret = i; - } else { - OPENSSL_PUT_ERROR(X509, X509_R_BAD_X509_FILETYPE); - goto err; - } - - if (ret == 0) { - OPENSSL_PUT_ERROR(X509, X509_R_NO_CRL_FOUND); - } - -err: - X509_CRL_free(x); - BIO_free(in); - return ret; -} - -int X509_load_cert_crl_file(X509_LOOKUP *ctx, const char *file, int type) { - STACK_OF(X509_INFO) *inf; - X509_INFO *itmp; - BIO *in; - size_t i; - int count = 0; - - if 
(type != X509_FILETYPE_PEM) { - return X509_load_cert_file(ctx, file, type); - } - in = BIO_new_file(file, "r"); - if (!in) { - OPENSSL_PUT_ERROR(X509, ERR_R_SYS_LIB); - return 0; - } - inf = PEM_X509_INFO_read_bio(in, NULL, NULL, NULL); - BIO_free(in); - if (!inf) { - OPENSSL_PUT_ERROR(X509, ERR_R_PEM_LIB); - return 0; - } - for (i = 0; i < sk_X509_INFO_num(inf); i++) { - itmp = sk_X509_INFO_value(inf, i); - if (itmp->x509) { - if (!X509_STORE_add_cert(ctx->store_ctx, itmp->x509)) { - goto err; - } - count++; - } - if (itmp->crl) { - if (!X509_STORE_add_crl(ctx->store_ctx, itmp->crl)) { - goto err; - } - count++; - } - } - - if (count == 0) { - OPENSSL_PUT_ERROR(X509, X509_R_NO_CERTIFICATE_OR_CRL_FOUND); - } - -err: - sk_X509_INFO_pop_free(inf, X509_INFO_free); - return count; -} - -#endif // OPENSSL_NO_STDIO diff --git a/third_party/boringssl/src/crypto/x509/by_file.cc b/third_party/boringssl/src/crypto/x509/by_file.cc new file mode 100644 index 00000000..f7d124a7 --- /dev/null +++ b/third_party/boringssl/src/crypto/x509/by_file.cc @@ -0,0 +1,226 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include + +#include "internal.h" + + +static int by_file_ctrl(X509_LOOKUP *ctx, int cmd, const char *argc, long argl, + char **ret); +static const X509_LOOKUP_METHOD x509_file_lookup = { + nullptr, // new + nullptr, // free + by_file_ctrl, // ctrl + nullptr, // get_by_subject +}; + +const X509_LOOKUP_METHOD *X509_LOOKUP_file() { return &x509_file_lookup; } + +static int by_file_ctrl(X509_LOOKUP *ctx, int cmd, const char *argp, long argl, + char **ret) { + if (cmd != X509_L_FILE_LOAD) { + return 0; + } + const char *file = argp; + int type = argl; + if (argl == X509_FILETYPE_DEFAULT) { + if ((file = getenv(X509_get_default_cert_file_env())) == nullptr) { + file = X509_get_default_cert_file(); + } + type = X509_FILETYPE_PEM; + } + if (X509_load_cert_crl_file(ctx, file, type) != 0) { + return 1; + } + if (argl == X509_FILETYPE_DEFAULT) { + OPENSSL_PUT_ERROR(X509, X509_R_LOADING_DEFAULTS); + } + return 0; +} + +int X509_load_cert_file(X509_LOOKUP *ctx, const char *file, int type) { + int ret = 0; + BIO *in = nullptr; + int i, count = 0; + X509 *x = nullptr; + + in = BIO_new(BIO_s_file()); + + if ((in == nullptr) || (BIO_read_filename(in, file) <= 0)) { + OPENSSL_PUT_ERROR(X509, ERR_R_SYS_LIB); + goto err; + } + + if (type == X509_FILETYPE_PEM) { + for (;;) { + x = PEM_read_bio_X509_AUX(in, nullptr, nullptr, nullptr); + if (x == nullptr) { + if (ERR_equals(ERR_peek_last_error(), ERR_LIB_PEM, + PEM_R_NO_START_LINE) && + count > 0) { + ERR_clear_error(); + break; + } + OPENSSL_PUT_ERROR(X509, ERR_R_PEM_LIB); + goto err; + } + i = X509_STORE_add_cert(ctx->store_ctx, x); + if (!i) { + goto err; + } + count++; + X509_free(x); + x = nullptr; + } + ret = count; + } else if (type == X509_FILETYPE_ASN1) { + x = d2i_X509_bio(in, nullptr); + if (x == nullptr) { + OPENSSL_PUT_ERROR(X509, ERR_R_ASN1_LIB); + goto err; + } + i = X509_STORE_add_cert(ctx->store_ctx, x); + if (!i) { + goto err; + } + ret = i; + } else { + OPENSSL_PUT_ERROR(X509, 
X509_R_BAD_X509_FILETYPE); + goto err; + } + + if (ret == 0) { + OPENSSL_PUT_ERROR(X509, X509_R_NO_CERTIFICATE_FOUND); + } + +err: + X509_free(x); + BIO_free(in); + return ret; +} + +int X509_load_crl_file(X509_LOOKUP *ctx, const char *file, int type) { + int ret = 0; + BIO *in = nullptr; + int i, count = 0; + X509_CRL *x = nullptr; + + in = BIO_new(BIO_s_file()); + + if ((in == nullptr) || (BIO_read_filename(in, file) <= 0)) { + OPENSSL_PUT_ERROR(X509, ERR_R_SYS_LIB); + goto err; + } + + if (type == X509_FILETYPE_PEM) { + for (;;) { + x = PEM_read_bio_X509_CRL(in, nullptr, nullptr, nullptr); + if (x == nullptr) { + if (ERR_equals(ERR_peek_last_error(), ERR_LIB_PEM, + PEM_R_NO_START_LINE) && + count > 0) { + ERR_clear_error(); + break; + } + OPENSSL_PUT_ERROR(X509, ERR_R_PEM_LIB); + goto err; + } + i = X509_STORE_add_crl(ctx->store_ctx, x); + if (!i) { + goto err; + } + count++; + X509_CRL_free(x); + x = nullptr; + } + ret = count; + } else if (type == X509_FILETYPE_ASN1) { + x = d2i_X509_CRL_bio(in, nullptr); + if (x == nullptr) { + OPENSSL_PUT_ERROR(X509, ERR_R_ASN1_LIB); + goto err; + } + i = X509_STORE_add_crl(ctx->store_ctx, x); + if (!i) { + goto err; + } + ret = i; + } else { + OPENSSL_PUT_ERROR(X509, X509_R_BAD_X509_FILETYPE); + goto err; + } + + if (ret == 0) { + OPENSSL_PUT_ERROR(X509, X509_R_NO_CRL_FOUND); + } + +err: + X509_CRL_free(x); + BIO_free(in); + return ret; +} + +int X509_load_cert_crl_file(X509_LOOKUP *ctx, const char *file, int type) { + STACK_OF(X509_INFO) *inf; + X509_INFO *itmp; + BIO *in; + size_t i; + int count = 0; + + if (type != X509_FILETYPE_PEM) { + return X509_load_cert_file(ctx, file, type); + } + in = BIO_new_file(file, "rb"); + if (!in) { + OPENSSL_PUT_ERROR(X509, ERR_R_SYS_LIB); + return 0; + } + inf = PEM_X509_INFO_read_bio(in, nullptr, nullptr, nullptr); + BIO_free(in); + if (!inf) { + OPENSSL_PUT_ERROR(X509, ERR_R_PEM_LIB); + return 0; + } + for (i = 0; i < sk_X509_INFO_num(inf); i++) { + itmp = sk_X509_INFO_value(inf, i); + 
if (itmp->x509) { + if (!X509_STORE_add_cert(ctx->store_ctx, itmp->x509)) { + goto err; + } + count++; + } + if (itmp->crl) { + if (!X509_STORE_add_crl(ctx->store_ctx, itmp->crl)) { + goto err; + } + count++; + } + } + + if (count == 0) { + OPENSSL_PUT_ERROR(X509, X509_R_NO_CERTIFICATE_OR_CRL_FOUND); + } + +err: + sk_X509_INFO_pop_free(inf, X509_INFO_free); + return count; +} + +int X509_LOOKUP_load_file(X509_LOOKUP *lookup, const char *name, int type) { + return X509_LOOKUP_ctrl(lookup, X509_L_FILE_LOAD, name, type, nullptr); +} diff --git a/third_party/boringssl/src/crypto/x509/i2d_pr.c b/third_party/boringssl/src/crypto/x509/i2d_pr.c deleted file mode 100644 index 3c8887ff..00000000 --- a/third_party/boringssl/src/crypto/x509/i2d_pr.c +++ /dev/null @@ -1,80 +0,0 @@ -/* crypto/asn1/i2d_pr.c */ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. 
this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include -#include -#include -#include -#include -#include - - -int i2d_PrivateKey(const EVP_PKEY *a, uint8_t **pp) { - switch (EVP_PKEY_id(a)) { - case EVP_PKEY_RSA: - return i2d_RSAPrivateKey(a->pkey.rsa, pp); - case EVP_PKEY_EC: - return i2d_ECPrivateKey(a->pkey.ec, pp); - case EVP_PKEY_DSA: - return i2d_DSAPrivateKey(a->pkey.dsa, pp); - default: - // Although this file is in crypto/x509 for layering reasons, it emits - // an error code from ASN1 for OpenSSL compatibility. - OPENSSL_PUT_ERROR(ASN1, ASN1_R_UNSUPPORTED_PUBLIC_KEY_TYPE); - return -1; - } -} diff --git a/third_party/boringssl/src/crypto/x509/i2d_pr.cc b/third_party/boringssl/src/crypto/x509/i2d_pr.cc new file mode 100644 index 00000000..c8ec6216 --- /dev/null +++ b/third_party/boringssl/src/crypto/x509/i2d_pr.cc @@ -0,0 +1,37 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include +#include +#include +#include +#include + + +int i2d_PrivateKey(const EVP_PKEY *a, uint8_t **pp) { + switch (EVP_PKEY_id(a)) { + case EVP_PKEY_RSA: + return i2d_RSAPrivateKey(EVP_PKEY_get0_RSA(a), pp); + case EVP_PKEY_EC: + return i2d_ECPrivateKey(EVP_PKEY_get0_EC_KEY(a), pp); + case EVP_PKEY_DSA: + return i2d_DSAPrivateKey(EVP_PKEY_get0_DSA(a), pp); + default: + // Although this file is in crypto/x509 for layering reasons, it emits + // an error code from ASN1 for OpenSSL compatibility. + OPENSSL_PUT_ERROR(ASN1, ASN1_R_UNSUPPORTED_PUBLIC_KEY_TYPE); + return -1; + } +} diff --git a/third_party/boringssl/src/crypto/x509/internal.h b/third_party/boringssl/src/crypto/x509/internal.h index 79043563..7257fcf4 100644 --- a/third_party/boringssl/src/crypto/x509/internal.h +++ b/third_party/boringssl/src/crypto/x509/internal.h @@ -1,114 +1,99 @@ -/* - * Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL project - * 2013. - */ -/* ==================================================================== - * Copyright (c) 2013 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" - * - * 4. 
The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * licensing@OpenSSL.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). - * - */ - -#ifndef OPENSSL_HEADER_X509_INTERNAL_H -#define OPENSSL_HEADER_X509_INTERNAL_H +// Copyright 2013-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef OPENSSL_HEADER_CRYPTO_X509_INTERNAL_H +#define OPENSSL_HEADER_CRYPTO_X509_INTERNAL_H #include #include +#include #include #include "../asn1/internal.h" - -#if defined(__cplusplus) -extern "C" { -#endif - +#include "../mem_internal.h" +#include "../internal.h" // Internal structures. -typedef struct X509_POLICY_CACHE_st X509_POLICY_CACHE; -typedef struct X509_POLICY_TREE_st X509_POLICY_TREE; - -typedef struct X509_val_st { - ASN1_TIME *notBefore; - ASN1_TIME *notAfter; -} X509_VAL; - -DECLARE_ASN1_FUNCTIONS_const(X509_VAL) +DECLARE_OPAQUE_STRUCT(x509_st, X509Impl) +DECLARE_OPAQUE_STRUCT(x509_store_st, X509Store) +DECLARE_OPAQUE_STRUCT(X509_name_st, X509Name) struct X509_pubkey_st { - X509_ALGOR *algor; - ASN1_BIT_STRING *public_key; + X509_ALGOR algor; + ASN1_BIT_STRING public_key; EVP_PKEY *pkey; } /* X509_PUBKEY */; +BSSL_NAMESPACE_BEGIN + +void x509_pubkey_init(X509_PUBKEY *key); +void x509_pubkey_cleanup(X509_PUBKEY *key); + +int x509_parse_public_key(CBS *cbs, X509_PUBKEY *out, + Span algs); +int x509_marshal_public_key(CBB *cbb, const X509_PUBKEY *in); +int x509_pubkey_set1(X509_PUBKEY *key, EVP_PKEY *pkey); + +// X509_PUBKEY is an |ASN1_ITEM| whose ASN.1 type is SubjectPublicKeyInfo and C +// type is |X509_PUBKEY*|. +// TODO(crbug.com/42290417): Remove this when |X509| and |X509_REQ| no longer +// depend on the tables. 
+DECLARE_ASN1_ITEM(X509_PUBKEY) + +BSSL_NAMESPACE_END + struct X509_name_entry_st { ASN1_OBJECT *object; - ASN1_STRING *value; + ASN1_STRING value; int set; } /* X509_NAME_ENTRY */; -// we always keep X509_NAMEs in 2 forms. -struct X509_name_st { - STACK_OF(X509_NAME_ENTRY) *entries; - int modified; // true if 'bytes' needs to be built - BUF_MEM *bytes; - // unsigned long hash; Keep the hash around for lookups - unsigned char *canon_enc; - int canon_enclen; +BSSL_NAMESPACE_BEGIN + +// X509_NAME_ENTRY is an |ASN1_ITEM| whose ASN.1 type is AttributeTypeAndValue +// (RFC 5280) and C type is |X509_NAME_ENTRY*|. +DECLARE_ASN1_ITEM(X509_NAME_ENTRY) + +struct X509_NAME_CACHE { + static constexpr bool kAllowUniquePtr = true; + // canon contains the DER-encoded canonicalized X.509 Name, not including the + // outermost TLV. + Array canon; + // der contains the DER-encoded X.509 Name, including the outermost TLV. + Array der; +}; + +class X509Name : public X509_name_st { + public: + STACK_OF(X509_NAME_ENTRY) *entries = nullptr; + mutable bssl::Atomic cache; } /* X509_NAME */; +BSSL_NAMESPACE_END + struct x509_attributes_st { ASN1_OBJECT *object; STACK_OF(ASN1_TYPE) *set; } /* X509_ATTRIBUTE */; +BSSL_NAMESPACE_BEGIN + +// X509_ATTRIBUTE is an |ASN1_ITEM| whose ASN.1 type is Attribute (RFC 2986) and +// C type is |X509_ATTRIBUTE*|. 
+DECLARE_ASN1_ITEM(X509_ATTRIBUTE) + typedef struct x509_cert_aux_st { STACK_OF(ASN1_OBJECT) *trust; // trusted uses STACK_OF(ASN1_OBJECT) *reject; // rejected uses @@ -118,57 +103,76 @@ typedef struct x509_cert_aux_st { DECLARE_ASN1_FUNCTIONS_const(X509_CERT_AUX) +BSSL_NAMESPACE_END + struct X509_extension_st { ASN1_OBJECT *object; ASN1_BOOLEAN critical; ASN1_OCTET_STRING *value; } /* X509_EXTENSION */; -typedef struct { - ASN1_INTEGER *version; // [ 0 ] default of v1 - ASN1_INTEGER *serialNumber; - X509_ALGOR *signature; - X509_NAME *issuer; - X509_VAL *validity; - X509_NAME *subject; - X509_PUBKEY *key; - ASN1_BIT_STRING *issuerUID; // [ 1 ] optional in v2 - ASN1_BIT_STRING *subjectUID; // [ 2 ] optional in v2 - STACK_OF(X509_EXTENSION) *extensions; // [ 3 ] optional in v3 - ASN1_ENCODING enc; -} X509_CINF; - -// TODO(https://crbug.com/boringssl/407): This is not const because it contains -// an |X509_NAME|. -DECLARE_ASN1_FUNCTIONS(X509_CINF) - -struct x509_st { - X509_CINF *cert_info; - X509_ALGOR *sig_alg; - ASN1_BIT_STRING *signature; - CRYPTO_refcount_t references; +// X509_EXTENSION is an |ASN1_ITEM| whose ASN.1 type is X.509 Extension (RFC +// 5280) and C type is |X509_EXTENSION*|. +DECLARE_ASN1_ITEM(X509_EXTENSION) + +BSSL_NAMESPACE_BEGIN + +// X509_EXTENSIONS is an |ASN1_ITEM| whose ASN.1 type is SEQUENCE of Extension +// (RFC 5280) and C type is |STACK_OF(X509_EXTENSION)*|. +DECLARE_ASN1_ITEM(X509_EXTENSIONS) + +class X509Impl : public x509_st, public RefCounted { + public: + X509Impl(); + + // TBSCertificate fields: + uint8_t version = X509_VERSION_1; // One of the |X509_VERSION_*| constants. 
+ ASN1_INTEGER serialNumber; + X509_ALGOR tbs_sig_alg; + X509Name issuer; + ASN1_TIME notBefore; + ASN1_TIME notAfter; + X509Name subject; + X509_PUBKEY key; + ASN1_BIT_STRING *issuerUID = nullptr; // [ 1 ] optional in v2 + ASN1_BIT_STRING *subjectUID = nullptr; // [ 2 ] optional in v2 + STACK_OF(X509_EXTENSION) *extensions = nullptr; // [ 3 ] optional in v3 + // Certificate fields: + X509_ALGOR sig_alg; + ASN1_BIT_STRING signature; + // Other state: + // buf, if not nullptr, contains a copy of the serialized Certificate. + // TODO(davidben): Now every parsed |X509| has an underlying |CRYPTO_BUFFER|, + // but |X509|s created peacemeal do not. Can we make this more uniform? + CRYPTO_BUFFER *buf = nullptr; CRYPTO_EX_DATA ex_data; // These contain copies of various extension values - long ex_pathlen; - long ex_pcpathlen; - unsigned long ex_flags; - unsigned long ex_kusage; - unsigned long ex_xkusage; - unsigned long ex_nscert; - ASN1_OCTET_STRING *skid; - AUTHORITY_KEYID *akid; - X509_POLICY_CACHE *policy_cache; - STACK_OF(DIST_POINT) *crldp; - STACK_OF(GENERAL_NAME) *altname; - NAME_CONSTRAINTS *nc; - unsigned char cert_hash[SHA256_DIGEST_LENGTH]; - X509_CERT_AUX *aux; - CRYPTO_BUFFER *buf; - CRYPTO_MUTEX lock; + long ex_pathlen = -1; + uint32_t ex_flags = 0; + uint32_t ex_kusage = 0; + uint32_t ex_xkusage = 0; + ASN1_OCTET_STRING *skid = nullptr; + AUTHORITY_KEYID *akid = nullptr; + STACK_OF(DIST_POINT) *crldp = nullptr; + STACK_OF(GENERAL_NAME) *altname = nullptr; + NAME_CONSTRAINTS *nc = nullptr; + unsigned char cert_hash[SHA256_DIGEST_LENGTH] = {}; + bssl::X509_CERT_AUX *aux = nullptr; + Mutex lock; + + private: + friend RefCounted; + ~X509Impl(); } /* X509 */; +int x509_marshal_tbs_cert(CBB *cbb, const X509 *x509); + +// X509 is an |ASN1_ITEM| whose ASN.1 type is X.509 Certificate (RFC 5280) and C +// type is |X509*|. 
+DECLARE_ASN1_ITEM(X509) + typedef struct { - ASN1_ENCODING enc; + bssl::ASN1_ENCODING enc; ASN1_INTEGER *version; X509_NAME *subject; X509_PUBKEY *pubkey; @@ -176,26 +180,39 @@ typedef struct { STACK_OF(X509_ATTRIBUTE) *attributes; // [ 0 ] } X509_REQ_INFO; -// TODO(https://crbug.com/boringssl/407): This is not const because it contains -// an |X509_NAME|. -DECLARE_ASN1_FUNCTIONS(X509_REQ_INFO) +DECLARE_ASN1_FUNCTIONS_const(X509_REQ_INFO) + +BSSL_NAMESPACE_END struct X509_req_st { - X509_REQ_INFO *req_info; + bssl::X509_REQ_INFO *req_info; X509_ALGOR *sig_alg; ASN1_BIT_STRING *signature; } /* X509_REQ */; +BSSL_NAMESPACE_BEGIN + +// X509_REQ is an |ASN1_ITEM| whose ASN.1 type is CertificateRequest (RFC 2986) +// and C type is |X509_REQ*|. +DECLARE_ASN1_ITEM(X509_REQ) + +BSSL_NAMESPACE_END + struct x509_revoked_st { ASN1_INTEGER *serialNumber; ASN1_TIME *revocationDate; STACK_OF(X509_EXTENSION) /* optional */ *extensions; - // Set up if indirect CRL - STACK_OF(GENERAL_NAME) *issuer; // Revocation reason int reason; } /* X509_REVOKED */; +BSSL_NAMESPACE_BEGIN + +// X509_REVOKED is an |ASN1_ITEM| whose ASN.1 type is an element of the +// revokedCertificates field of TBSCertList (RFC 5280) and C type is +// |X509_REVOKED*|. +DECLARE_ASN1_ITEM(X509_REVOKED) + typedef struct { ASN1_INTEGER *version; X509_ALGOR *sig_alg; @@ -204,37 +221,62 @@ typedef struct { ASN1_TIME *nextUpdate; STACK_OF(X509_REVOKED) *revoked; STACK_OF(X509_EXTENSION) /* [0] */ *extensions; - ASN1_ENCODING enc; + bssl::ASN1_ENCODING enc; } X509_CRL_INFO; -// TODO(https://crbug.com/boringssl/407): This is not const because it contains -// an |X509_NAME|. 
-DECLARE_ASN1_FUNCTIONS(X509_CRL_INFO) +DECLARE_ASN1_FUNCTIONS_const(X509_CRL_INFO) + +BSSL_NAMESPACE_END + +// Values in idp_flags field +// IDP present +#define IDP_PRESENT 0x1 +// IDP values inconsistent +#define IDP_INVALID 0x2 +// onlyuser true +#define IDP_ONLYUSER 0x4 +// onlyCA true +#define IDP_ONLYCA 0x8 +// onlyattr true +#define IDP_ONLYATTR 0x10 +// indirectCRL true +#define IDP_INDIRECT 0x20 +// onlysomereasons present +#define IDP_REASONS 0x40 struct X509_crl_st { // actual signature - X509_CRL_INFO *crl; + bssl::X509_CRL_INFO *crl; X509_ALGOR *sig_alg; ASN1_BIT_STRING *signature; - CRYPTO_refcount_t references; + bssl::CRYPTO_refcount_t references; int flags; // Copies of various extensions AUTHORITY_KEYID *akid; ISSUING_DIST_POINT *idp; // Convenient breakdown of IDP int idp_flags; - int idp_reasons; - // CRL and base CRL numbers for delta processing - ASN1_INTEGER *crl_number; - ASN1_INTEGER *base_crl_number; unsigned char crl_hash[SHA256_DIGEST_LENGTH]; - STACK_OF(GENERAL_NAMES) *issuers; } /* X509_CRL */; +BSSL_NAMESPACE_BEGIN + +// X509_CRL is an |ASN1_ITEM| whose ASN.1 type is X.509 CertificateList (RFC +// 5280) and C type is |X509_CRL*|. +DECLARE_ASN1_ITEM(X509_CRL) + +// GENERAL_NAME is an |ASN1_ITEM| whose ASN.1 type is GeneralName and C type is +// |GENERAL_NAME*|. +DECLARE_ASN1_ITEM(GENERAL_NAME) + +// GENERAL_NAMES is an |ASN1_ITEM| whose ASN.1 type is SEQUENCE OF GeneralName +// and C type is |GENERAL_NAMES*|, aka |STACK_OF(GENERAL_NAME)*|. +DECLARE_ASN1_ITEM(GENERAL_NAMES) + +BSSL_NAMESPACE_END + struct X509_VERIFY_PARAM_st { - char *name; - time_t check_time; // Time to use - unsigned long inh_flags; // Inheritance flags + int64_t check_time; // POSIX time to use unsigned long flags; // Various verify flags int purpose; // purpose to check untrusted certificates int trust; // trust setting to check @@ -243,7 +285,6 @@ struct X509_VERIFY_PARAM_st { // The following fields specify acceptable peer identities. 
STACK_OF(OPENSSL_STRING) *hosts; // Set of acceptable names unsigned int hostflags; // Flags to control matching features - char *peername; // Matching hostname in peer certificate char *email; // If not NULL email address to match size_t emaillen; unsigned char *ip; // If not NULL IP address to match @@ -262,62 +303,59 @@ struct x509_object_st { } data; } /* X509_OBJECT */; +BSSL_NAMESPACE_BEGIN + +// NETSCAPE_SPKI is an |ASN1_ITEM| whose ASN.1 type is +// SignedPublicKeyAndChallenge and C type is |NETSCAPE_SPKI*|. +DECLARE_ASN1_ITEM(NETSCAPE_SPKI) + +// NETSCAPE_SPKAC is an |ASN1_ITEM| whose ASN.1 type is PublicKeyAndChallenge +// and C type is |NETSCAPE_SPKAC*|. +DECLARE_ASN1_ITEM(NETSCAPE_SPKAC) + +BSSL_NAMESPACE_END + // This is a static that defines the function interface struct x509_lookup_method_st { - const char *name; int (*new_item)(X509_LOOKUP *ctx); void (*free)(X509_LOOKUP *ctx); - int (*init)(X509_LOOKUP *ctx); - int (*shutdown)(X509_LOOKUP *ctx); int (*ctrl)(X509_LOOKUP *ctx, int cmd, const char *argc, long argl, char **ret); - int (*get_by_subject)(X509_LOOKUP *ctx, int type, X509_NAME *name, + int (*get_by_subject)(X509_LOOKUP *ctx, int type, const X509_NAME *name, X509_OBJECT *ret); - int (*get_by_issuer_serial)(X509_LOOKUP *ctx, int type, X509_NAME *name, - ASN1_INTEGER *serial, X509_OBJECT *ret); - int (*get_by_fingerprint)(X509_LOOKUP *ctx, int type, unsigned char *bytes, - int len, X509_OBJECT *ret); - int (*get_by_alias)(X509_LOOKUP *ctx, int type, char *str, int len, - X509_OBJECT *ret); } /* X509_LOOKUP_METHOD */; +BSSL_NAMESPACE_BEGIN + // This is used to hold everything. It is used for all certificate // validation. Once we have a certificate chain, the 'verify' // function is then called to actually check the cert chain. 
-struct x509_store_st { +class X509Store : public x509_store_st, public RefCounted { + public: + X509Store(); + // The following is a cache of trusted certs - int cache; // if true, stash any hits - STACK_OF(X509_OBJECT) *objs; // Cache of all objects - CRYPTO_MUTEX objs_lock; + UniquePtr objs; // Cache of all objects + Mutex objs_lock; // These are external lookup methods - STACK_OF(X509_LOOKUP) *get_cert_methods; + Vector> get_cert_methods; - X509_VERIFY_PARAM *param; + UniquePtr param; // Callbacks for various operations - X509_STORE_CTX_verify_fn verify; // called to verify a certificate - X509_STORE_CTX_verify_cb verify_cb; // error callback - X509_STORE_CTX_get_issuer_fn get_issuer; // get issuers cert from ctx - X509_STORE_CTX_check_issued_fn check_issued; // check issued - X509_STORE_CTX_check_revocation_fn - check_revocation; // Check revocation status of chain - X509_STORE_CTX_get_crl_fn get_crl; // retrieve CRL - X509_STORE_CTX_check_crl_fn check_crl; // Check CRL validity - X509_STORE_CTX_cert_crl_fn cert_crl; // Check certificate against CRL - X509_STORE_CTX_lookup_certs_fn lookup_certs; - X509_STORE_CTX_lookup_crls_fn lookup_crls; - X509_STORE_CTX_cleanup_fn cleanup; - - CRYPTO_refcount_t references; + X509_STORE_CTX_verify_cb verify_cb = nullptr; // error callback + + private: + friend RefCounted; + ~X509Store() = default; } /* X509_STORE */; +BSSL_NAMESPACE_END // This is the functions plus an instance of the local variables. struct x509_lookup_st { - int init; // have we been started - int skip; // don't use us. 
- X509_LOOKUP_METHOD *method; // the functions + const X509_LOOKUP_METHOD *method; // the functions void *method_data; // method data X509_STORE *store_ctx; // who owns us @@ -335,47 +373,33 @@ struct x509_store_ctx_st { STACK_OF(X509_CRL) *crls; // set of CRLs passed in X509_VERIFY_PARAM *param; - void *other_ctx; // Other info for use with get_issuer() + + // trusted_stack, if non-NULL, is a set of trusted certificates to consider + // instead of those from |X509_STORE|. + STACK_OF(X509) *trusted_stack; // Callbacks for various operations - X509_STORE_CTX_verify_fn verify; // called to verify a certificate X509_STORE_CTX_verify_cb verify_cb; // error callback - X509_STORE_CTX_get_issuer_fn get_issuer; // get issuers cert from ctx - X509_STORE_CTX_check_issued_fn check_issued; // check issued - X509_STORE_CTX_check_revocation_fn - check_revocation; // Check revocation status of chain - X509_STORE_CTX_get_crl_fn get_crl; // retrieve CRL - X509_STORE_CTX_check_crl_fn check_crl; // Check CRL validity - X509_STORE_CTX_cert_crl_fn cert_crl; // Check certificate against CRL - X509_STORE_CTX_check_policy_fn check_policy; - X509_STORE_CTX_lookup_certs_fn lookup_certs; - X509_STORE_CTX_lookup_crls_fn lookup_crls; - X509_STORE_CTX_cleanup_fn cleanup; // The following is built up - int valid; // if 0, rebuild chain - int last_untrusted; // index of last untrusted cert - STACK_OF(X509) *chain; // chain of X509s - built up and trusted - X509_POLICY_TREE *tree; // Valid policy tree - - int explicit_policy; // Require explicit policy value + int last_untrusted; // index of last untrusted cert + STACK_OF(X509) *chain; // chain of X509s - built up and trusted // When something goes wrong, this is why int error_depth; int error; X509 *current_cert; - X509 *current_issuer; // cert currently being tested as valid issuer X509_CRL *current_crl; // current CRL - int current_crl_score; // score of current CRL - unsigned int current_reasons; // Reason mask - - X509_STORE_CTX *parent; // 
For CRL path validation: parent context + X509 *current_crl_issuer; // issuer of current CRL + int current_crl_score; // score of current CRL CRYPTO_EX_DATA ex_data; } /* X509_STORE_CTX */; -ASN1_TYPE *ASN1_generate_v3(const char *str, X509V3_CTX *cnf); +BSSL_NAMESPACE_BEGIN + +ASN1_TYPE *ASN1_generate_v3(const char *str, const X509V3_CTX *cnf); int X509_CERT_AUX_print(BIO *bp, X509_CERT_AUX *x, int indent); @@ -404,7 +428,7 @@ int x509_print_rsa_pss_params(BIO *bp, const X509_ALGOR *sigalg, int indent, // Signature algorithm functions. // x509_digest_sign_algorithm encodes the signing parameters of |ctx| as an -// AlgorithmIdentifer and saves the result in |algor|. It returns one on +// AlgorithmIdentifier and saves the result in |algor|. It returns one on // success, or zero on error. int x509_digest_sign_algorithm(EVP_MD_CTX *ctx, X509_ALGOR *algor); @@ -415,9 +439,225 @@ int x509_digest_sign_algorithm(EVP_MD_CTX *ctx, X509_ALGOR *algor); int x509_digest_verify_init(EVP_MD_CTX *ctx, const X509_ALGOR *sigalg, EVP_PKEY *pkey); +// x509_verify_signature verifies a |signature| using |sigalg| and |pkey| over +// |in|. It returns one if the signature is valid and zero on error. +int x509_verify_signature(const X509_ALGOR *sigalg, + const ASN1_BIT_STRING *signature, + Span in, EVP_PKEY *pkey); + +// x509_sign_to_bit_string signs |in| using |ctx| and saves the result in |out|. +// It returns the length of the signature on success and zero on error. +int x509_sign_to_bit_string(EVP_MD_CTX *ctx, ASN1_BIT_STRING *out, + Span in); + + +// Path-building functions. + +// X509_policy_check checks certificate policies in |certs|. |user_policies| is +// the user-initial-policy-set. If |user_policies| is NULL or empty, it is +// interpreted as anyPolicy. |flags| is a set of |X509_V_FLAG_*| values to +// apply. It returns |X509_V_OK| on success and |X509_V_ERR_*| on error. It +// additionally sets |*out_current_cert| to the certificate where the error +// occurred. 
If the function succeeded, or the error applies to the entire +// chain, it sets |*out_current_cert| to NULL. +int X509_policy_check(const STACK_OF(X509) *certs, + const STACK_OF(ASN1_OBJECT) *user_policies, + unsigned long flags, X509 **out_current_cert); + +// x509_check_issued_with_callback calls |X509_check_issued|, but allows the +// verify callback to override the result. It returns one on success and zero on +// error. +// +// TODO(davidben): Reduce the scope of the verify callback and remove this. The +// callback only runs with |X509_V_FLAG_CB_ISSUER_CHECK|, which is only used by +// one internal project and rust-openssl, who use it by mistake. +int x509_check_issued_with_callback(X509_STORE_CTX *ctx, const X509 *x, + const X509 *issuer); + +// x509v3_bytes_to_hex encodes |len| bytes from |in| to hex and returns a +// newly-allocated NUL-terminated string containing the result, or NULL on +// allocation error. +// +// This function was historically named |hex_to_string| in OpenSSL. Despite the +// name, |hex_to_string| converted to hex. +OPENSSL_EXPORT char *x509v3_bytes_to_hex(const uint8_t *in, size_t len); + +// x509v3_hex_string_to_bytes decodes |str| in hex and returns a newly-allocated +// array containing the result, or NULL on error. On success, it sets |*len| to +// the length of the result. Colon separators between bytes in the input are +// allowed and ignored. +// +// This function was historically named |string_to_hex| in OpenSSL. Despite the +// name, |string_to_hex| converted from hex. +unsigned char *x509v3_hex_to_bytes(const char *str, size_t *len); + +// x509v3_conf_name_matches returns one if |name| is equal to |cmp| or begins +// with |cmp| followed by '.', and zero otherwise. +int x509v3_conf_name_matches(const char *name, const char *cmp); + +// x509v3_looks_like_dns_name returns one if |in| looks like a DNS name and zero +// otherwise. 
+OPENSSL_EXPORT int x509v3_looks_like_dns_name(const unsigned char *in, + size_t len); + +// x509v3_cache_extensions fills in a number of fields relating to X.509 +// extensions in |x|. It returns one on success and zero if some extensions were +// invalid. +OPENSSL_EXPORT int x509v3_cache_extensions(X509 *x); + +// x509v3_a2i_ipadd decodes |ipasc| as an IPv4 or IPv6 address. IPv6 addresses +// use colon-separated syntax while IPv4 addresses use dotted decimal syntax. If +// it decodes an IPv4 address, it writes the result to the first four bytes of +// |ipout| and returns four. If it decodes an IPv6 address, it writes the result +// to all 16 bytes of |ipout| and returns 16. Otherwise, it returns zero. +int x509v3_a2i_ipadd(unsigned char ipout[16], const char *ipasc); + +// A |BIT_STRING_BITNAME| is used to contain a list of bit names. +typedef struct { + int bitnum; + const char *lname; + const char *sname; +} BIT_STRING_BITNAME; + +// x509V3_add_value_asn1_string appends a |CONF_VALUE| with the specified name +// and value to |*extlist|. if |*extlist| is NULL, it sets |*extlist| to a +// newly-allocated |STACK_OF(CONF_VALUE)| first. It returns one on success and +// zero on error. +int x509V3_add_value_asn1_string(const char *name, const ASN1_STRING *value, + STACK_OF(CONF_VALUE) **extlist); + +// X509V3_NAME_from_section adds attributes to |nm| by interpreting the +// key/value pairs in |dn_sk|. It returns one on success and zero on error. +// |chtype|, which should be one of |MBSTRING_*| constants, determines the +// character encoding used to interpret values. +int X509V3_NAME_from_section(X509_NAME *nm, const STACK_OF(CONF_VALUE) *dn_sk, + int chtype); + +// X509V3_bool_from_string decodes |str| as a boolean. On success, it returns +// one and sets |*out_bool| to resulting value. Otherwise, it returns zero. +int X509V3_bool_from_string(const char *str, ASN1_BOOLEAN *out_bool); + +// X509V3_get_value_bool decodes |value| as a boolean. 
On success, it returns +// one and sets |*out_bool| to the resulting value. Otherwise, it returns zero. +int X509V3_get_value_bool(const CONF_VALUE *value, ASN1_BOOLEAN *out_bool); + +// X509V3_get_value_int decodes |value| as an integer. On success, it returns +// one and sets |*aint| to the resulting value. Otherwise, it returns zero. If +// |*aint| was non-NULL at the start of the function, it frees the previous +// value before writing a new one. +int X509V3_get_value_int(const CONF_VALUE *value, ASN1_INTEGER **aint); + +// X509V3_get_section behaves like |NCONF_get_section| but queries |ctx|'s +// config database. +const STACK_OF(CONF_VALUE) *X509V3_get_section(const X509V3_CTX *ctx, + const char *section); + +// X509V3_add_value appends a |CONF_VALUE| containing |name| and |value| to +// |*extlist|. It returns one on success and zero on error. If |*extlist| is +// NULL, it sets |*extlist| to a newly-allocated |STACK_OF(CONF_VALUE)| +// containing the result. Either |name| or |value| may be NULL to omit the +// field. +// +// On failure, if |*extlist| was NULL, |*extlist| will remain NULL when the +// function returns. +int X509V3_add_value(const char *name, const char *value, + STACK_OF(CONF_VALUE) **extlist); + +// X509V3_add_value_bool behaves like |X509V3_add_value| but stores the value +// "TRUE" if |asn1_bool| is non-zero and "FALSE" otherwise. +int X509V3_add_value_bool(const char *name, int asn1_bool, + STACK_OF(CONF_VALUE) **extlist); + +// X509V3_add_value_bool behaves like |X509V3_add_value| but stores a string +// representation of |aint|. Note this string representation may be decimal or +// hexadecimal, depending on the size of |aint|. 
+int X509V3_add_value_int(const char *name, const ASN1_INTEGER *aint, + STACK_OF(CONF_VALUE) **extlist); + +STACK_OF(CONF_VALUE) *X509V3_parse_list(const char *line); + +#define X509V3_conf_err(val) \ + ERR_add_error_data(6, "section:", (val)->section, ",name:", (val)->name, \ + ",value:", (val)->value); + +// GENERAL_NAME_cmp returns zero if |a| and |b| are equal and a non-zero +// value otherwise. Note this function does not provide a comparison suitable +// for sorting. +// +// This function is exported for testing. +OPENSSL_EXPORT int GENERAL_NAME_cmp(const GENERAL_NAME *a, + const GENERAL_NAME *b); + +// X509_VERIFY_PARAM_lookup returns a pre-defined |X509_VERIFY_PARAM| named by +// |name|, or NULL if no such name is defined. +const X509_VERIFY_PARAM *X509_VERIFY_PARAM_lookup(const char *name); + +GENERAL_NAME *v2i_GENERAL_NAME(const X509V3_EXT_METHOD *method, + const X509V3_CTX *ctx, const CONF_VALUE *cnf); +GENERAL_NAME *v2i_GENERAL_NAME_ex(GENERAL_NAME *out, + const X509V3_EXT_METHOD *method, + const X509V3_CTX *ctx, const CONF_VALUE *cnf, + int is_nc); +GENERAL_NAMES *v2i_GENERAL_NAMES(const X509V3_EXT_METHOD *method, + const X509V3_CTX *ctx, + const STACK_OF(CONF_VALUE) *nval); + +int X509_check_akid(const X509 *issuer, const AUTHORITY_KEYID *akid); + +int X509_is_valid_trust_id(int trust); + +int X509_PURPOSE_get_trust(const X509_PURPOSE *xp); + +// TODO(https://crbug.com/boringssl/695): Remove this. +int DIST_POINT_set_dpname(DIST_POINT_NAME *dpn, X509_NAME *iname); + +void x509_name_init(X509_NAME *name); +void x509_name_cleanup(X509_NAME *name); + +// x509_parse_name parses a DER-encoded, X.509 Name from |cbs| and writes the +// result to |*out|. It returns one on success and zero on error. +int x509_parse_name(CBS *cbs, X509_NAME *out); + +// x509_marshal_name marshals |in| as a DER-encoded, X.509 Name and writes the +// result to |out|. It returns one on success and zero on error. 
+int x509_marshal_name(CBB *out, const X509_NAME *in); + +const X509_NAME_CACHE *x509_name_get_cache(const X509_NAME *name); +void x509_name_invalidate_cache(X509_NAME *name); + +int x509_name_copy(X509_NAME *dst, const X509_NAME *src); + +void x509_algor_init(X509_ALGOR *alg); +void x509_algor_cleanup(X509_ALGOR *alg); + +// x509_parse_algorithm parses a DER-encoded, AlgorithmIdentifier from |cbs| and +// writes the result to |*out|. It returns one on success and zero on error. +int x509_parse_algorithm(CBS *cbs, X509_ALGOR *out); + +// x509_marshal_algorithm marshals |in| as a DER-encoded, AlgorithmIdentifier +// and writes the result to |out|. It returns one on success and zero on error. +int x509_marshal_algorithm(CBB *out, const X509_ALGOR *in); + + +// Standard extensions. + +extern const X509V3_EXT_METHOD v3_bcons, v3_nscert, v3_key_usage, v3_ext_ku; +extern const X509V3_EXT_METHOD v3_info, v3_sinfo; +extern const X509V3_EXT_METHOD v3_skey_id, v3_akey_id; +extern const X509V3_EXT_METHOD v3_subject_alt_name, v3_issuer_alt_name, + v3_certificate_issuer; +extern const X509V3_EXT_METHOD v3_netscape_base_url, v3_netscape_revocation_url, + v3_netscape_ca_revocation_url, v3_netscape_renewal_url, + v3_netscape_ca_policy_url, v3_netscape_ssl_server_name, v3_netscape_comment; +extern const X509V3_EXT_METHOD v3_crl_num, v3_crl_reason, v3_crl_invdate; +extern const X509V3_EXT_METHOD v3_delta_crl, v3_cpols, v3_crld, v3_freshest_crl; +extern const X509V3_EXT_METHOD v3_ocsp_nocheck; +extern const X509V3_EXT_METHOD v3_crl_hold; +extern const X509V3_EXT_METHOD v3_policy_mappings, v3_policy_constraints; +extern const X509V3_EXT_METHOD v3_name_constraints, v3_inhibit_anyp, v3_idp; +extern const X509V3_EXT_METHOD v3_addr, v3_asid; + +BSSL_NAMESPACE_END -#if defined(__cplusplus) -} // extern C -#endif -#endif // OPENSSL_HEADER_X509_INTERNAL_H +#endif // OPENSSL_HEADER_CRYPTO_X509_INTERNAL_H diff --git a/third_party/boringssl/src/crypto/x509/name_print.c 
b/third_party/boringssl/src/crypto/x509/name_print.c deleted file mode 100644 index 29207ccb..00000000 --- a/third_party/boringssl/src/crypto/x509/name_print.c +++ /dev/null @@ -1,249 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. 
All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include -#include - -#include -#include -#include - - -static int maybe_write(BIO *out, const void *buf, int len) { - // If |out| is NULL, ignore the output but report the length. - return out == NULL || BIO_write(out, buf, len) == len; -} - -// do_indent prints |indent| spaces to |out|. 
-static int do_indent(BIO *out, int indent) { - for (int i = 0; i < indent; i++) { - if (!maybe_write(out, " ", 1)) { - return 0; - } - } - return 1; -} - -#define FN_WIDTH_LN 25 -#define FN_WIDTH_SN 10 - -static int do_name_ex(BIO *out, const X509_NAME *n, int indent, - unsigned long flags) { - int i, prev = -1, orflags, cnt; - int fn_opt, fn_nid; - char objtmp[80]; - const char *objbuf; - int outlen, len; - const char *sep_dn, *sep_mv, *sep_eq; - int sep_dn_len, sep_mv_len, sep_eq_len; - if (indent < 0) { - indent = 0; - } - outlen = indent; - if (!do_indent(out, indent)) { - return -1; - } - switch (flags & XN_FLAG_SEP_MASK) { - case XN_FLAG_SEP_MULTILINE: - sep_dn = "\n"; - sep_dn_len = 1; - sep_mv = " + "; - sep_mv_len = 3; - break; - - case XN_FLAG_SEP_COMMA_PLUS: - sep_dn = ","; - sep_dn_len = 1; - sep_mv = "+"; - sep_mv_len = 1; - indent = 0; - break; - - case XN_FLAG_SEP_CPLUS_SPC: - sep_dn = ", "; - sep_dn_len = 2; - sep_mv = " + "; - sep_mv_len = 3; - indent = 0; - break; - - case XN_FLAG_SEP_SPLUS_SPC: - sep_dn = "; "; - sep_dn_len = 2; - sep_mv = " + "; - sep_mv_len = 3; - indent = 0; - break; - - default: - return -1; - } - - if (flags & XN_FLAG_SPC_EQ) { - sep_eq = " = "; - sep_eq_len = 3; - } else { - sep_eq = "="; - sep_eq_len = 1; - } - - fn_opt = flags & XN_FLAG_FN_MASK; - - cnt = X509_NAME_entry_count(n); - for (i = 0; i < cnt; i++) { - const X509_NAME_ENTRY *ent; - if (flags & XN_FLAG_DN_REV) { - ent = X509_NAME_get_entry(n, cnt - i - 1); - } else { - ent = X509_NAME_get_entry(n, i); - } - if (prev != -1) { - if (prev == X509_NAME_ENTRY_set(ent)) { - if (!maybe_write(out, sep_mv, sep_mv_len)) { - return -1; - } - outlen += sep_mv_len; - } else { - if (!maybe_write(out, sep_dn, sep_dn_len)) { - return -1; - } - outlen += sep_dn_len; - if (!do_indent(out, indent)) { - return -1; - } - outlen += indent; - } - } - prev = X509_NAME_ENTRY_set(ent); - const ASN1_OBJECT *fn = X509_NAME_ENTRY_get_object(ent); - const ASN1_STRING *val = 
X509_NAME_ENTRY_get_data(ent); - fn_nid = OBJ_obj2nid(fn); - if (fn_opt != XN_FLAG_FN_NONE) { - int objlen, fld_len; - if ((fn_opt == XN_FLAG_FN_OID) || (fn_nid == NID_undef)) { - OBJ_obj2txt(objtmp, sizeof objtmp, fn, 1); - fld_len = 0; // XXX: what should this be? - objbuf = objtmp; - } else { - if (fn_opt == XN_FLAG_FN_SN) { - fld_len = FN_WIDTH_SN; - objbuf = OBJ_nid2sn(fn_nid); - } else if (fn_opt == XN_FLAG_FN_LN) { - fld_len = FN_WIDTH_LN; - objbuf = OBJ_nid2ln(fn_nid); - } else { - fld_len = 0; // XXX: what should this be? - objbuf = ""; - } - } - objlen = strlen(objbuf); - if (!maybe_write(out, objbuf, objlen)) { - return -1; - } - if ((objlen < fld_len) && (flags & XN_FLAG_FN_ALIGN)) { - if (!do_indent(out, fld_len - objlen)) { - return -1; - } - outlen += fld_len - objlen; - } - if (!maybe_write(out, sep_eq, sep_eq_len)) { - return -1; - } - outlen += objlen + sep_eq_len; - } - // If the field name is unknown then fix up the DER dump flag. We - // might want to limit this further so it will DER dump on anything - // other than a few 'standard' fields. - if ((fn_nid == NID_undef) && (flags & XN_FLAG_DUMP_UNKNOWN_FIELDS)) { - orflags = ASN1_STRFLGS_DUMP_ALL; - } else { - orflags = 0; - } - - len = ASN1_STRING_print_ex(out, val, flags | orflags); - if (len < 0) { - return -1; - } - outlen += len; - } - return outlen; -} - -int X509_NAME_print_ex(BIO *out, const X509_NAME *nm, int indent, - unsigned long flags) { - if (flags == XN_FLAG_COMPAT) { - return X509_NAME_print(out, nm, indent); - } - return do_name_ex(out, nm, indent, flags); -} - -int X509_NAME_print_ex_fp(FILE *fp, const X509_NAME *nm, int indent, - unsigned long flags) { - BIO *bio = NULL; - if (fp != NULL) { - // If |fp| is NULL, this function returns the number of bytes without - // writing. 
- bio = BIO_new_fp(fp, BIO_NOCLOSE); - if (bio == NULL) { - return -1; - } - } - int ret = X509_NAME_print_ex(bio, nm, indent, flags); - BIO_free(bio); - return ret; -} diff --git a/third_party/boringssl/src/crypto/x509/name_print.cc b/third_party/boringssl/src/crypto/x509/name_print.cc new file mode 100644 index 00000000..e4db871e --- /dev/null +++ b/third_party/boringssl/src/crypto/x509/name_print.cc @@ -0,0 +1,187 @@ +// Copyright 2000-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include + +#include +#include +#include + + +static int maybe_write(BIO *out, const void *buf, int len) { + // If |out| is NULL, ignore the output but report the length. + return out == nullptr || BIO_write(out, buf, len) == len; +} + +// do_indent prints |indent| spaces to |out|. 
+static int do_indent(BIO *out, int indent) { + for (int i = 0; i < indent; i++) { + if (!maybe_write(out, " ", 1)) { + return 0; + } + } + return 1; +} + +#define FN_WIDTH_LN 25 +#define FN_WIDTH_SN 10 + +static int do_name_ex(BIO *out, const X509_NAME *n, int indent, + unsigned long flags) { + int prev = -1, orflags; + int outlen, len; + const char *sep_dn, *sep_mv, *sep_eq; + int sep_dn_len, sep_mv_len, sep_eq_len; + if (indent < 0) { + indent = 0; + } + outlen = indent; + if (!do_indent(out, indent)) { + return -1; + } + switch (flags & XN_FLAG_SEP_MASK) { + case XN_FLAG_SEP_MULTILINE: + sep_dn = "\n"; + sep_dn_len = 1; + sep_mv = " + "; + sep_mv_len = 3; + break; + + case XN_FLAG_SEP_COMMA_PLUS: + sep_dn = ","; + sep_dn_len = 1; + sep_mv = "+"; + sep_mv_len = 1; + indent = 0; + break; + + case XN_FLAG_SEP_CPLUS_SPC: + sep_dn = ", "; + sep_dn_len = 2; + sep_mv = " + "; + sep_mv_len = 3; + indent = 0; + break; + + case XN_FLAG_SEP_SPLUS_SPC: + sep_dn = "; "; + sep_dn_len = 2; + sep_mv = " + "; + sep_mv_len = 3; + indent = 0; + break; + + default: + return -1; + } + + if (flags & XN_FLAG_SPC_EQ) { + sep_eq = " = "; + sep_eq_len = 3; + } else { + sep_eq = "="; + sep_eq_len = 1; + } + + int cnt = X509_NAME_entry_count(n); + for (int i = 0; i < cnt; i++) { + const X509_NAME_ENTRY *ent; + if (flags & XN_FLAG_DN_REV) { + ent = X509_NAME_get_entry(n, cnt - i - 1); + } else { + ent = X509_NAME_get_entry(n, i); + } + if (prev != -1) { + if (prev == X509_NAME_ENTRY_set(ent)) { + if (!maybe_write(out, sep_mv, sep_mv_len)) { + return -1; + } + outlen += sep_mv_len; + } else { + if (!maybe_write(out, sep_dn, sep_dn_len)) { + return -1; + } + outlen += sep_dn_len; + if (!do_indent(out, indent)) { + return -1; + } + outlen += indent; + } + } + prev = X509_NAME_ENTRY_set(ent); + const ASN1_OBJECT *fn = X509_NAME_ENTRY_get_object(ent); + const ASN1_STRING *val = X509_NAME_ENTRY_get_data(ent); + assert((flags & XN_FLAG_FN_MASK) == XN_FLAG_FN_SN); + // Print the short name if 
available, othewise serialize the OID. + char objtmp[80]; + const char *objbuf = nullptr; + int fn_nid = OBJ_obj2nid(fn); + if (fn_nid != NID_undef) { + objbuf = OBJ_nid2sn(fn_nid); + } + if (objbuf == nullptr) { + OBJ_obj2txt(objtmp, sizeof(objtmp), fn, /*always_return_oid=*/1); + objbuf = objtmp; + } + int objlen = strlen(objbuf); + if (!maybe_write(out, objbuf, objlen) || + !maybe_write(out, sep_eq, sep_eq_len)) { + return -1; + } + outlen += objlen + sep_eq_len; + // If the field name is unknown then fix up the DER dump flag. We + // might want to limit this further so it will DER dump on anything + // other than a few 'standard' fields. + if ((fn_nid == NID_undef) && (flags & XN_FLAG_DUMP_UNKNOWN_FIELDS)) { + orflags = ASN1_STRFLGS_DUMP_ALL; + } else { + orflags = 0; + } + + len = ASN1_STRING_print_ex(out, val, flags | orflags); + if (len < 0) { + return -1; + } + outlen += len; + } + return outlen; +} + +int X509_NAME_print_ex(BIO *out, const X509_NAME *nm, int indent, + unsigned long flags) { + if (flags == XN_FLAG_COMPAT) { + return X509_NAME_print(out, nm, indent); + } + return do_name_ex(out, nm, indent, flags); +} + +int X509_NAME_print_ex_fp(FILE *fp, const X509_NAME *nm, int indent, + unsigned long flags) { + BIO *bio = nullptr; + if (fp != nullptr) { + // If |fp| is NULL, this function returns the number of bytes without + // writing. + bio = BIO_new_fp(fp, BIO_NOCLOSE); + if (bio == nullptr) { + return -1; + } + } + int ret = X509_NAME_print_ex(bio, nm, indent, flags); + BIO_free(bio); + return ret; +} diff --git a/third_party/boringssl/src/crypto/x509/policy.cc b/third_party/boringssl/src/crypto/x509/policy.cc new file mode 100644 index 00000000..0ab37547 --- /dev/null +++ b/third_party/boringssl/src/crypto/x509/policy.cc @@ -0,0 +1,686 @@ +// Copyright 2022 The BoringSSL Authors +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include + +#include +#include +#include + +#include +#include +#include +#include + +#include "../internal.h" +#include "../mem_internal.h" +#include "internal.h" + + +BSSL_NAMESPACE_BEGIN +namespace { + +// This file computes the X.509 policy graph, as described in RFC 9618. +// Implementation notes: +// +// (1) It does not track "qualifier_set". This is not needed as it is not +// output by this implementation. +// +// (2) "expected_policy_set" is not tracked explicitly and built temporarily +// as part of building the graph. +// +// (3) anyPolicy nodes are not tracked explicitly. +// +// (4) Some pruning steps are deferred to when policies are evaluated, as a +// reachability pass. + +bool is_any_policy(const ASN1_OBJECT *obj) { + return OBJ_obj2nid(obj) == NID_any_policy; +} + +// An X509PolicyNode is a node in the policy graph. It corresponds to a node +// from RFC 5280, section 6.1.2, step (a), but we store some fields differently. +struct X509PolicyNode { + static std::optional Create(const ASN1_OBJECT *policy) { + assert(!is_any_policy(policy)); + X509PolicyNode node; + node.policy.reset(OBJ_dup(policy)); + if (node.policy == nullptr) { + return std::nullopt; + } + return node; + } + + bool operator<(const X509PolicyNode &other) const { + return OBJ_cmp(policy.get(), other.policy.get()) < 0; + } + + bool parent_is_any_policy() const { return parent_policies.empty(); } + + // policy is the "valid_policy" field from RFC 5280. 
+ UniquePtr policy; + + // parent_policies, if non-empty, is the list of "valid_policy" values for all + // nodes which are a parent of this node. In this case, no entry in this list + // will be anyPolicy. This list is in no particular order and may contain + // duplicates if the corresponding certificate had duplicate mappings. + // + // If empty, this node has a single parent, anyPolicy. The node is then a root + // policy, and is in authorities-constrained-policy-set if it has a path to a + // leaf node. + // + // Note it is not possible for a policy to have both anyPolicy and a + // concrete policy as a parent. Section 6.1.3, step (d.1.ii) only runs if + // there was no match in step (d.1.i). We do not need to represent a parent + // list of, say, {anyPolicy, OID1, OID2}. + Vector> parent_policies; + + // mapped is whether this node matches a policy mapping in the certificate. + bool mapped = false; + + // reachable is whether this node is reachable from some valid policy in the + // end-entity certificate. It is computed during |has_explicit_policy|. + bool reachable = false; +}; + +// An X509PolicyLevel is the collection of nodes at the same depth in the +// policy graph. This structure can also be used to represent a level's +// "expected_policy_set" values. See |process_policy_mappings|. +class X509PolicyLevel { + public: + bool has_any_policy() const { return has_any_policy_; } + bool is_empty() const { return !has_any_policy_ && nodes_.empty(); } + Span nodes() const { return nodes_; } + + void set_has_any_policy(bool v) { has_any_policy_ = v; } + + // Although this is mutable, callers may not modify the node's policy. + Span nodes() { return Span(nodes_); } + + void Clear() { + has_any_policy_ = false; + nodes_.clear(); + } + + // Find returns the node corresponding to |policy|, or nullptr if none exists. + X509PolicyNode *Find(const ASN1_OBJECT *policy) { + // The list is sorted, so we can binary search. 
+ auto it = std::lower_bound( + nodes_.begin(), nodes_.end(), policy, + [](const X509PolicyNode &node, const ASN1_OBJECT *obj) { + return OBJ_cmp(node.policy.get(), obj) < 0; + }); + if (it == nodes_.end() || OBJ_cmp(it->policy.get(), policy) != 0) { + return nullptr; + } + return &*it; + } + + // AddNodes adds the nodes in |nodes|. It returns true on success and false on + // error. No policy in |nodes| may already be present. This method leaves + // the objects in |nodes| in a moved-from state. + // + // This method re-sorts the nodes, so it runs in time proportional to the + // total size of the level. However, each level is only added to three times + // in the course of policy validation. + bool AddNodes(Span nodes) { + if (!nodes_.AppendMove(nodes)) { + return false; + } + std::sort(nodes_.begin(), nodes_.end()); +#if !defined(NDEBUG) + // There should be no duplicate nodes. + for (size_t i = 1; i < nodes_.size(); i++) { + assert(OBJ_cmp(nodes_[i - 1].policy.get(), nodes_[i].policy.get()) != 0); + } +#endif + return true; + } + + // EraseNodesIf removes all nodes that satisfy the predicate |pred|. + template + void EraseNodesIf(Pred pred) { + nodes_.EraseIf(pred); + } + + private: + // nodes is the list of nodes at this depth, except for the anyPolicy node, if + // any. This list is sorted by policy OID for efficient lookup. + Vector nodes_; + + // has_any_policy is whether there is an anyPolicy node at this depth. + bool has_any_policy_ = false; +}; + +int policyinfo_cmp(const POLICYINFO *const *a, const POLICYINFO *const *b) { + return OBJ_cmp((*a)->policyid, (*b)->policyid); +} + +// process_certificate_policies updates |level| to incorporate |x509|'s +// certificate policies extension. This implements steps (d) and (e) of RFC +// 5280, section 6.1.3. |level| must contain the previous level's +// "expected_policy_set" information. For all but the top-most level, this is +// the output of |process_policy_mappings|. 
|any_policy_allowed| specifies +// whether anyPolicy is allowed or inhibited, taking into account the exception +// for self-issued certificates. +bool process_certificate_policies(const X509 *x509, X509PolicyLevel *level, + int any_policy_allowed) { + int critical; + UniquePtr policies( + reinterpret_cast(X509_get_ext_d2i( + x509, NID_certificate_policies, &critical, nullptr))); + if (policies == nullptr) { + if (critical != -1) { + return false; // Syntax error in the extension. + } + + // RFC 5280, section 6.1.3, step (e). + level->Clear(); + return true; + } + + // certificatePolicies may not be empty. See RFC 5280, section 4.2.1.4. + // TODO(https://crbug.com/boringssl/443): Move this check into the parser. + if (sk_POLICYINFO_num(policies.get()) == 0) { + OPENSSL_PUT_ERROR(X509, X509_R_INVALID_POLICY_EXTENSION); + return false; + } + + sk_POLICYINFO_set_cmp_func(policies.get(), policyinfo_cmp); + sk_POLICYINFO_sort(policies.get()); + bool cert_has_any_policy = false; + for (size_t i = 0; i < sk_POLICYINFO_num(policies.get()); i++) { + const POLICYINFO *policy = sk_POLICYINFO_value(policies.get(), i); + if (is_any_policy(policy->policyid)) { + cert_has_any_policy = true; + } + if (i > 0 && OBJ_cmp(sk_POLICYINFO_value(policies.get(), i - 1)->policyid, + policy->policyid) == 0) { + // Per RFC 5280, section 4.2.1.4, |policies| may not have duplicates. + OPENSSL_PUT_ERROR(X509, X509_R_INVALID_POLICY_EXTENSION); + return false; + } + } + + // This does the same thing as RFC 5280, section 6.1.3, step (d), though in + // a slightly different order. |level| currently contains + // "expected_policy_set" values of the previous level. See + // |process_policy_mappings| for details. + const bool previous_level_has_any_policy = level->has_any_policy(); + + // First, we handle steps (d.1.i) and (d.2). The net effect of these two + // steps is to intersect |level| with |policies|, ignoring anyPolicy if it + // is inhibited. 
+ if (!cert_has_any_policy || !any_policy_allowed) { + level->EraseNodesIf([&](const X509PolicyNode &node) { + assert(sk_POLICYINFO_is_sorted(policies.get())); + // Erase the node if it not present in the current certificate. + POLICYINFO info; + info.policyid = node.policy.get(); + return !sk_POLICYINFO_find(policies.get(), nullptr, &info); + }); + level->set_has_any_policy(false); + } + + // Step (d.1.ii) may attach new nodes to the previous level's anyPolicy + // node. + if (previous_level_has_any_policy) { + Vector new_nodes; + for (const POLICYINFO *policy : policies.get()) { + // Though we've reordered the steps slightly, |policy| is in |level| if + // and only if it would have been a match in step (d.1.ii). + if (!is_any_policy(policy->policyid) && + level->Find(policy->policyid) == nullptr) { + auto node = X509PolicyNode::Create(policy->policyid); + if (!node.has_value() || !new_nodes.Push(*std::move(node))) { + return false; + } + } + } + if (!level->AddNodes(Span(new_nodes))) { + return false; + } + } + + return true; +} + +int compare_issuer_policy(const POLICY_MAPPING *const *a, + const POLICY_MAPPING *const *b) { + return OBJ_cmp((*a)->issuerDomainPolicy, (*b)->issuerDomainPolicy); +} + +int compare_subject_policy(const POLICY_MAPPING *const *a, + const POLICY_MAPPING *const *b) { + return OBJ_cmp((*a)->subjectDomainPolicy, (*b)->subjectDomainPolicy); +} + +// process_policy_mappings processes the policy mappings extension of |cert|, +// whose corresponding graph level is |level|. |mapping_allowed| specifies +// whether policy mapping is inhibited at this point. On success, it returns an +// |X509PolicyLevel| containing the "expected_policy_set" for |level|. On error, +// it returns std::nullopt. This implements steps (a) and (b) of RFC 5280, +// section 6.1.4. +// +// We represent the "expected_policy_set" as an |X509PolicyLevel|. +// |has_any_policy| indicates whether there is an anyPolicy node with +// "expected_policy_set" of {anyPolicy}. 
If a node with policy oid P1 contains +// P2 in its "expected_policy_set", the level will contain a node of policy P2 +// with P1 in |parent_policies|. +// +// This is equivalent to the |X509PolicyLevel| that would result if the next +// certificates contained anyPolicy. |process_certificate_policies| will filter +// this result down to compute the actual level. +std::optional process_policy_mappings(const X509 *cert, + X509PolicyLevel *level, + bool mapping_allowed) { + int critical; + UniquePtr mappings(reinterpret_cast( + X509_get_ext_d2i(cert, NID_policy_mappings, &critical, nullptr))); + if (mappings == nullptr && critical != -1) { + // Syntax error in the policy mappings extension. + return std::nullopt; + } + + if (mappings != nullptr) { + // PolicyMappings may not be empty. See RFC 5280, section 4.2.1.5. + // TODO(https://crbug.com/boringssl/443): Move this check into the parser. + if (sk_POLICY_MAPPING_num(mappings.get()) == 0) { + OPENSSL_PUT_ERROR(X509, X509_R_INVALID_POLICY_EXTENSION); + return std::nullopt; + } + + // RFC 5280, section 6.1.4, step (a). + for (const POLICY_MAPPING *mapping : mappings.get()) { + if (is_any_policy(mapping->issuerDomainPolicy) || + is_any_policy(mapping->subjectDomainPolicy)) { + return std::nullopt; + } + } + + // Sort to group by issuerDomainPolicy. + sk_POLICY_MAPPING_set_cmp_func(mappings.get(), compare_issuer_policy); + sk_POLICY_MAPPING_sort(mappings.get()); + + if (mapping_allowed) { + // Mark nodes as mapped, and add any nodes to |level| which may be + // needed as part of RFC 5280, section 6.1.4, step (b.1). + Vector new_nodes; + const ASN1_OBJECT *last_policy = nullptr; + for (const POLICY_MAPPING *mapping : mappings.get()) { + // There may be multiple mappings with the same |issuerDomainPolicy|. 
+ if (last_policy != nullptr && + OBJ_cmp(mapping->issuerDomainPolicy, last_policy) == 0) { + continue; + } + last_policy = mapping->issuerDomainPolicy; + + X509PolicyNode *node = level->Find(mapping->issuerDomainPolicy); + if (node != nullptr) { + node->mapped = true; + } else { + if (!level->has_any_policy()) { + continue; + } + auto new_node = X509PolicyNode::Create(mapping->issuerDomainPolicy); + if (!new_node.has_value()) { + return std::nullopt; + } + new_node->mapped = true; + if (!new_nodes.Push(*std::move(new_node))) { + return std::nullopt; + } + } + } + if (!level->AddNodes(Span(new_nodes))) { + return std::nullopt; + } + } else { + // RFC 5280, section 6.1.4, step (b.2). If mapping is inhibited, delete + // all mapped nodes. + level->EraseNodesIf([&](const X509PolicyNode &node) { + // |mappings| must have been sorted by |compare_issuer_policy|. + assert(sk_POLICY_MAPPING_is_sorted(mappings.get())); + // Check if the node was mapped. + POLICY_MAPPING mapping; + mapping.issuerDomainPolicy = node.policy.get(); + return sk_POLICY_MAPPING_find(mappings.get(), /*out_index=*/nullptr, + &mapping); + }); + // Dropping the mappings. + mappings = nullptr; + } + } + + // If a node was not mapped, it retains the original "explicit_policy_set" + // value, itself. Add those to |mappings|. + if (mappings == nullptr) { + mappings.reset(sk_POLICY_MAPPING_new_null()); + if (mappings == nullptr) { + return std::nullopt; + } + } + for (const X509PolicyNode &node : level->nodes()) { + if (!node.mapped) { + UniquePtr mapping(POLICY_MAPPING_new()); + if (mapping == nullptr) { + return std::nullopt; + } + mapping->issuerDomainPolicy = OBJ_dup(node.policy.get()); + mapping->subjectDomainPolicy = OBJ_dup(node.policy.get()); + if (mapping->issuerDomainPolicy == nullptr || + mapping->subjectDomainPolicy == nullptr || + !PushToStack(mappings.get(), std::move(mapping))) { + return std::nullopt; + } + } + } + + // Sort to group by subjectDomainPolicy. 
+ sk_POLICY_MAPPING_set_cmp_func(mappings.get(), compare_subject_policy); + sk_POLICY_MAPPING_sort(mappings.get()); + + // Convert |mappings| to our "expected_policy_set" representation. + Vector next_nodes; + for (POLICY_MAPPING *mapping : mappings.get()) { + // Skip mappings where |issuerDomainPolicy| does not appear in the graph. + if (!level->has_any_policy() && + level->Find(mapping->issuerDomainPolicy) == nullptr) { + continue; + } + + if (next_nodes.empty() || OBJ_cmp(next_nodes.back().policy.get(), + mapping->subjectDomainPolicy) != 0) { + auto new_node = X509PolicyNode::Create(mapping->subjectDomainPolicy); + if (!new_node.has_value() || !next_nodes.Push(*std::move(new_node))) { + return std::nullopt; + } + } + + // |mapping| is going to be destroyed, so steal its policy object. + UniquePtr policy( + std::exchange(mapping->issuerDomainPolicy, nullptr)); + if (!next_nodes.back().parent_policies.Push(std::move(policy))) { + return std::nullopt; + } + } + + X509PolicyLevel next; + next.set_has_any_policy(level->has_any_policy()); + if (!next.AddNodes(Span(next_nodes))) { + return std::nullopt; + } + return next; +} + +// apply_skip_certs, if |skip_certs| is non-NULL, sets |*value| to the minimum +// of its current value and |skip_certs|. It returns true on success and false +// if |skip_certs| is negative. +bool apply_skip_certs(const ASN1_INTEGER *skip_certs, size_t *value) { + if (skip_certs == nullptr) { + return true; + } + + // TODO(https://crbug.com/boringssl/443): Move this check into the parser. + if (skip_certs->type & V_ASN1_NEG) { + OPENSSL_PUT_ERROR(X509, X509_R_INVALID_POLICY_EXTENSION); + return false; + } + + // If |skip_certs| does not fit in |uint64_t|, it must exceed |*value|. 
+ uint64_t u64; + if (ASN1_INTEGER_get_uint64(&u64, skip_certs) && u64 < *value) { + *value = (size_t)u64; + } + ERR_clear_error(); + return true; +} + +// process_policy_constraints updates |*explicit_policy|, |*policy_mapping|, and +// |*inhibit_any_policy| according to |x509|'s policy constraints and inhibit +// anyPolicy extensions. It returns one on success and zero on error. This +// implements steps (i) and (j) of RFC 5280, section 6.1.4. +bool process_policy_constraints(const X509 *x509, size_t *explicit_policy, + size_t *policy_mapping, + size_t *inhibit_any_policy) { + int critical; + UniquePtr constraints( + reinterpret_cast( + X509_get_ext_d2i(x509, NID_policy_constraints, &critical, nullptr))); + if (constraints == nullptr && critical != -1) { + return false; + } + if (constraints != nullptr) { + if (constraints->requireExplicitPolicy == nullptr && + constraints->inhibitPolicyMapping == nullptr) { + // Per RFC 5280, section 4.2.1.11, at least one of the fields must be + // present. + OPENSSL_PUT_ERROR(X509, X509_R_INVALID_POLICY_EXTENSION); + return false; + } + if (!apply_skip_certs(constraints->requireExplicitPolicy, + explicit_policy) || + !apply_skip_certs(constraints->inhibitPolicyMapping, policy_mapping)) { + return false; + } + } + + UniquePtr inhibit_any_policy_ext( + reinterpret_cast( + X509_get_ext_d2i(x509, NID_inhibit_any_policy, &critical, nullptr))); + if (inhibit_any_policy_ext == nullptr && critical != -1) { + return false; + } + return apply_skip_certs(inhibit_any_policy_ext.get(), inhibit_any_policy); +} + +// has_explicit_policy returns true if the set of authority-space policy OIDs +// |levels| has some non-empty intersection with |user_policies|, and false +// otherwise. This mirrors the logic in RFC 5280, section 6.1.5, step (g). This +// function modifies |levels| and should only be called at the end of policy +// evaluation. 
+bool has_explicit_policy(Span levels, + const STACK_OF(ASN1_OBJECT) *user_policies) { + assert(user_policies == nullptr || sk_ASN1_OBJECT_is_sorted(user_policies)); + + // Step (g.i). If the policy graph is empty, the intersection is empty. + if (levels.empty() || levels.back().is_empty()) { + return false; + } + + // Step (g.ii). If the policy graph is not empty and the user set contains + // anyPolicy, the intersection is the entire (non-empty) graph. + // + // If |user_policies| is empty, we interpret it as having a single anyPolicy + // value. The caller may also have supplied anyPolicy explicitly. + if (sk_ASN1_OBJECT_num(user_policies) == 0) { + return true; + } + for (const ASN1_OBJECT *user_policy : user_policies) { + if (is_any_policy(user_policy)) { + return true; + } + } + + // Step (g.iii) does not delete anyPolicy nodes, so if the graph has + // anyPolicy, some explicit policy will survive. The actual intersection may + // synthesize some nodes in step (g.iii.3), but we do not return the policy + // list itself, so we skip actually computing this. + if (levels.back().has_any_policy()) { + return true; + } + + // We defer pruning the tree, so as we look for nodes with parent anyPolicy, + // step (g.iii.1), we must limit to nodes reachable from the bottommost level. + // Start by marking each of those nodes as reachable. + for (X509PolicyNode &node : levels.back().nodes()) { + node.reachable = true; + } + + const size_t num_levels = levels.size(); + for (size_t i = num_levels - 1; i < num_levels; i--) { + X509PolicyLevel &level = levels[i]; + for (X509PolicyNode &node : level.nodes()) { + if (!node.reachable) { + continue; + } + if (node.parent_is_any_policy()) { + // |node|'s parent is anyPolicy and is part of "valid_policy_node_set". + // If it exists in |user_policies|, the intersection is non-empty and we + // can return immediately. 
+ if (sk_ASN1_OBJECT_find(user_policies, /*out_index=*/nullptr, + node.policy.get())) { + return true; + } + } else if (i > 0) { + // |node|'s parents are concrete policies. Mark the parents reachable, + // to be inspected by the next loop iteration. + X509PolicyLevel &prev = levels[i - 1]; + for (const auto &parent_policy : node.parent_policies) { + X509PolicyNode *parent_node = prev.Find(parent_policy.get()); + if (parent_node != nullptr) { + parent_node->reachable = true; + } + } + } + } + } + + return false; +} + +int asn1_object_cmp(const ASN1_OBJECT *const *a, const ASN1_OBJECT *const *b) { + return OBJ_cmp(*a, *b); +} + +} // namespace + +int X509_policy_check(const STACK_OF(X509) *certs, + const STACK_OF(ASN1_OBJECT) *user_policies, + unsigned long flags, X509 **out_current_cert) { + *out_current_cert = nullptr; + + // Skip policy checking if the chain is just the trust anchor. + const size_t num_certs = sk_X509_num(certs); + if (num_certs <= 1) { + return X509_V_OK; + } + + // See RFC 5280, section 6.1.2, steps (d) through (f). + size_t explicit_policy = + (flags & X509_V_FLAG_EXPLICIT_POLICY) ? 0 : num_certs + 1; + size_t inhibit_any_policy = + (flags & X509_V_FLAG_INHIBIT_ANY) ? 0 : num_certs + 1; + size_t policy_mapping = (flags & X509_V_FLAG_INHIBIT_MAP) ? 0 : num_certs + 1; + + Vector levels; + std::optional level; + for (size_t i = num_certs - 2; i < num_certs; i--) { + X509 *cert = sk_X509_value(certs, i); + uint32_t ex_flags = X509_get_extension_flags(cert); + if (ex_flags & EXFLAG_INVALID) { + return X509_V_ERR_OUT_OF_MEM; + } + const bool is_self_issued = (ex_flags & EXFLAG_SI) != 0; + + // In all but the first iteration, the previous iteration will have prepared + // "expected_policy_set" for us as a staging level. + if (!level.has_value()) { + assert(i == num_certs - 2); + level.emplace(); + level->set_has_any_policy(true); + } + + // RFC 5280, section 6.1.3, steps (d) and (e). |any_policy_allowed| is + // computed as in step (d.2). 
+ const int any_policy_allowed = + inhibit_any_policy > 0 || (i > 0 && is_self_issued); + if (!process_certificate_policies(cert, &*level, any_policy_allowed)) { + *out_current_cert = cert; + return X509_V_ERR_INVALID_POLICY_EXTENSION; + } + + // RFC 5280, section 6.1.3, step (f). + if (explicit_policy == 0 && level->is_empty()) { + return X509_V_ERR_NO_EXPLICIT_POLICY; + } + + // Insert the completed level into the list. + if (!levels.Push(*std::exchange(level, std::nullopt))) { + return X509_V_ERR_OUT_OF_MEM; + } + level = std::nullopt; + + // If this is not the leaf certificate, we go to section 6.1.4. If it + // is the leaf certificate, we go to section 6.1.5 instead. + if (i != 0) { + // RFC 5280, section 6.1.4, steps (a) and (b). + level = process_policy_mappings(cert, &levels.back(), policy_mapping > 0); + if (!level.has_value()) { + *out_current_cert = cert; + return X509_V_ERR_INVALID_POLICY_EXTENSION; + } + } + + // RFC 5280, section 6.1.4, step (h-j) for non-leaves, and section 6.1.5, + // step (a-b) for leaves. In the leaf case, RFC 5280 says only to update + // |explicit_policy|, but |policy_mapping| and |inhibit_any_policy| are no + // longer read at this point, so we use the same process. + if (i == 0 || !is_self_issued) { + if (explicit_policy > 0) { + explicit_policy--; + } + if (policy_mapping > 0) { + policy_mapping--; + } + if (inhibit_any_policy > 0) { + inhibit_any_policy--; + } + } + if (!process_policy_constraints(cert, &explicit_policy, &policy_mapping, + &inhibit_any_policy)) { + *out_current_cert = cert; + return X509_V_ERR_INVALID_POLICY_EXTENSION; + } + } + + // RFC 5280, section 6.1.5, step (g). We do not output the policy set, so it + // is only necessary to check if the user-constrained-policy-set is not empty. + if (explicit_policy == 0) { + // Build a sorted copy of |user_policies| for more efficient lookup. 
+ STACK_OF(ASN1_OBJECT) *user_policies_sorted = nullptr; + // |user_policies_sorted|'s contents are owned by |user_policies|, so we do + // not use |sk_ASN1_OBJECT_pop_free|. + Cleanup cleanup = [&] { sk_ASN1_OBJECT_free(user_policies_sorted); }; + if (user_policies != nullptr) { + user_policies_sorted = sk_ASN1_OBJECT_dup(user_policies); + if (user_policies_sorted == nullptr) { + return X509_V_ERR_OUT_OF_MEM; + } + sk_ASN1_OBJECT_set_cmp_func(user_policies_sorted, asn1_object_cmp); + sk_ASN1_OBJECT_sort(user_policies_sorted); + } + + if (!has_explicit_policy(Span(levels), user_policies_sorted)) { + return X509_V_ERR_NO_EXPLICIT_POLICY; + } + } + + return X509_V_OK; +} + +BSSL_NAMESPACE_END diff --git a/third_party/boringssl/src/crypto/x509/rsa_pss.c b/third_party/boringssl/src/crypto/x509/rsa_pss.c deleted file mode 100644 index 42b4f21e..00000000 --- a/third_party/boringssl/src/crypto/x509/rsa_pss.c +++ /dev/null @@ -1,393 +0,0 @@ -/* Written by Dr Stephen N Henson (steve@openssl.org) for the OpenSSL - * project 2006. - */ -/* ==================================================================== - * Copyright (c) 2006 The OpenSSL Project. All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in - * the documentation and/or other materials provided with the - * distribution. - * - * 3. All advertising materials mentioning features or use of this - * software must display the following acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit. (http://www.OpenSSL.org/)" - * - * 4. 
The names "OpenSSL Toolkit" and "OpenSSL Project" must not be used to - * endorse or promote products derived from this software without - * prior written permission. For written permission, please contact - * licensing@OpenSSL.org. - * - * 5. Products derived from this software may not be called "OpenSSL" - * nor may "OpenSSL" appear in their names without prior written - * permission of the OpenSSL Project. - * - * 6. Redistributions of any form whatsoever must retain the following - * acknowledgment: - * "This product includes software developed by the OpenSSL Project - * for use in the OpenSSL Toolkit (http://www.OpenSSL.org/)" - * - * THIS SOFTWARE IS PROVIDED BY THE OpenSSL PROJECT ``AS IS'' AND ANY - * EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR - * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE OpenSSL PROJECT OR - * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT - * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; - * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) - * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED - * OF THE POSSIBILITY OF SUCH DAMAGE. - * ==================================================================== - * - * This product includes cryptographic software written by Eric Young - * (eay@cryptsoft.com). This product includes software written by Tim - * Hudson (tjh@cryptsoft.com). 
*/ - -#include - -#include -#include - -#include -#include -#include -#include -#include -#include - -#include "internal.h" - - -static int rsa_pss_cb(int operation, ASN1_VALUE **pval, const ASN1_ITEM *it, - void *exarg) { - if (operation == ASN1_OP_FREE_PRE) { - RSA_PSS_PARAMS *pss = (RSA_PSS_PARAMS *)*pval; - X509_ALGOR_free(pss->maskHash); - } - return 1; -} - -ASN1_SEQUENCE_cb(RSA_PSS_PARAMS, rsa_pss_cb) = { - ASN1_EXP_OPT(RSA_PSS_PARAMS, hashAlgorithm, X509_ALGOR, 0), - ASN1_EXP_OPT(RSA_PSS_PARAMS, maskGenAlgorithm, X509_ALGOR, 1), - ASN1_EXP_OPT(RSA_PSS_PARAMS, saltLength, ASN1_INTEGER, 2), - ASN1_EXP_OPT(RSA_PSS_PARAMS, trailerField, ASN1_INTEGER, 3), -} ASN1_SEQUENCE_END_cb(RSA_PSS_PARAMS, RSA_PSS_PARAMS) - -IMPLEMENT_ASN1_FUNCTIONS_const(RSA_PSS_PARAMS) - - -// Given an MGF1 Algorithm ID decode to an Algorithm Identifier -static X509_ALGOR *rsa_mgf1_decode(const X509_ALGOR *alg) { - if (OBJ_obj2nid(alg->algorithm) != NID_mgf1 || - alg->parameter == NULL || - alg->parameter->type != V_ASN1_SEQUENCE) { - return NULL; - } - - const uint8_t *p = alg->parameter->value.sequence->data; - int plen = alg->parameter->value.sequence->length; - return d2i_X509_ALGOR(NULL, &p, plen); -} - -static RSA_PSS_PARAMS *rsa_pss_decode(const X509_ALGOR *alg) { - if (alg->parameter == NULL || alg->parameter->type != V_ASN1_SEQUENCE) { - return NULL; - } - - const uint8_t *p = alg->parameter->value.sequence->data; - int plen = alg->parameter->value.sequence->length; - return d2i_RSA_PSS_PARAMS(NULL, &p, plen); -} - -static int is_allowed_pss_md(const EVP_MD *md) { - int md_type = EVP_MD_type(md); - return md_type == NID_sha256 || md_type == NID_sha384 || - md_type == NID_sha512; -} - -// rsa_md_to_algor sets |*palg| to an |X509_ALGOR| describing the digest |md|, -// which must be an allowed PSS digest. -static int rsa_md_to_algor(X509_ALGOR **palg, const EVP_MD *md) { - // SHA-1 should be omitted (DEFAULT), but we do not allow SHA-1. 
- assert(is_allowed_pss_md(md)); - *palg = X509_ALGOR_new(); - if (*palg == NULL) { - return 0; - } - X509_ALGOR_set_md(*palg, md); - return 1; -} - -// rsa_md_to_mgf1 sets |*palg| to an |X509_ALGOR| describing MGF-1 with the -// digest |mgf1md|, which must be an allowed PSS digest. -static int rsa_md_to_mgf1(X509_ALGOR **palg, const EVP_MD *mgf1md) { - // SHA-1 should be omitted (DEFAULT), but we do not allow SHA-1. - assert(is_allowed_pss_md(mgf1md)); - X509_ALGOR *algtmp = NULL; - ASN1_STRING *stmp = NULL; - // need to embed algorithm ID inside another - if (!rsa_md_to_algor(&algtmp, mgf1md) || - !ASN1_item_pack(algtmp, ASN1_ITEM_rptr(X509_ALGOR), &stmp)) { - goto err; - } - *palg = X509_ALGOR_new(); - if (!*palg) { - goto err; - } - X509_ALGOR_set0(*palg, OBJ_nid2obj(NID_mgf1), V_ASN1_SEQUENCE, stmp); - stmp = NULL; - -err: - ASN1_STRING_free(stmp); - X509_ALGOR_free(algtmp); - if (*palg) { - return 1; - } - - return 0; -} - -static const EVP_MD *rsa_algor_to_md(const X509_ALGOR *alg) { - if (!alg) { - // If omitted, PSS defaults to SHA-1, which we do not allow. - OPENSSL_PUT_ERROR(X509, X509_R_INVALID_PSS_PARAMETERS); - return NULL; - } - const EVP_MD *md = EVP_get_digestbyobj(alg->algorithm); - if (md == NULL || !is_allowed_pss_md(md)) { - OPENSSL_PUT_ERROR(X509, X509_R_INVALID_PSS_PARAMETERS); - return NULL; - } - return md; -} - -static const EVP_MD *rsa_mgf1_to_md(const X509_ALGOR *alg) { - if (!alg) { - // If omitted, PSS defaults to MGF-1 with SHA-1, which we do not allow. - OPENSSL_PUT_ERROR(X509, X509_R_INVALID_PSS_PARAMETERS); - return NULL; - } - // Check mask and lookup mask hash algorithm. 
- X509_ALGOR *maskHash = rsa_mgf1_decode(alg); - if (maskHash == NULL) { - OPENSSL_PUT_ERROR(X509, X509_R_INVALID_PSS_PARAMETERS); - return NULL; - } - const EVP_MD *ret = rsa_algor_to_md(maskHash); - X509_ALGOR_free(maskHash); - return ret; -} - -int x509_rsa_ctx_to_pss(EVP_MD_CTX *ctx, X509_ALGOR *algor) { - const EVP_MD *sigmd, *mgf1md; - int saltlen; - if (!EVP_PKEY_CTX_get_signature_md(ctx->pctx, &sigmd) || - !EVP_PKEY_CTX_get_rsa_mgf1_md(ctx->pctx, &mgf1md) || - !EVP_PKEY_CTX_get_rsa_pss_saltlen(ctx->pctx, &saltlen)) { - return 0; - } - - if (sigmd != mgf1md || !is_allowed_pss_md(sigmd)) { - OPENSSL_PUT_ERROR(X509, X509_R_INVALID_PSS_PARAMETERS); - return 0; - } - int md_len = EVP_MD_size(sigmd); - if (saltlen == -1) { - saltlen = md_len; - } else if (saltlen != md_len) { - OPENSSL_PUT_ERROR(X509, X509_R_INVALID_PSS_PARAMETERS); - return 0; - } - - int ret = 0; - ASN1_STRING *os = NULL; - RSA_PSS_PARAMS *pss = RSA_PSS_PARAMS_new(); - if (!pss) { - goto err; - } - - // The DEFAULT value is 20, but this does not match any supported digest. - assert(saltlen != 20); - pss->saltLength = ASN1_INTEGER_new(); - if (!pss->saltLength || // - !ASN1_INTEGER_set(pss->saltLength, saltlen)) { - goto err; - } - - if (!rsa_md_to_algor(&pss->hashAlgorithm, sigmd) || - !rsa_md_to_mgf1(&pss->maskGenAlgorithm, mgf1md)) { - goto err; - } - - // Finally create string with pss parameter encoding. 
- if (!ASN1_item_pack(pss, ASN1_ITEM_rptr(RSA_PSS_PARAMS), &os)) { - goto err; - } - - X509_ALGOR_set0(algor, OBJ_nid2obj(NID_rsassaPss), V_ASN1_SEQUENCE, os); - os = NULL; - ret = 1; - -err: - RSA_PSS_PARAMS_free(pss); - ASN1_STRING_free(os); - return ret; -} - -int x509_rsa_pss_to_ctx(EVP_MD_CTX *ctx, const X509_ALGOR *sigalg, - EVP_PKEY *pkey) { - assert(OBJ_obj2nid(sigalg->algorithm) == NID_rsassaPss); - - // Decode PSS parameters - int ret = 0; - RSA_PSS_PARAMS *pss = rsa_pss_decode(sigalg); - if (pss == NULL) { - OPENSSL_PUT_ERROR(X509, X509_R_INVALID_PSS_PARAMETERS); - goto err; - } - - const EVP_MD *mgf1md = rsa_mgf1_to_md(pss->maskGenAlgorithm); - const EVP_MD *md = rsa_algor_to_md(pss->hashAlgorithm); - if (mgf1md == NULL || md == NULL) { - goto err; - } - - // We require the MGF-1 and signing hashes to match. - if (mgf1md != md) { - OPENSSL_PUT_ERROR(X509, X509_R_INVALID_PSS_PARAMETERS); - goto err; - } - - // We require the salt length be the hash length. The DEFAULT value is 20, but - // this does not match any supported salt length. - uint64_t salt_len = 0; - if (pss->saltLength == NULL || - !ASN1_INTEGER_get_uint64(&salt_len, pss->saltLength) || - salt_len != EVP_MD_size(md)) { - OPENSSL_PUT_ERROR(X509, X509_R_INVALID_PSS_PARAMETERS); - goto err; - } - assert(salt_len <= INT_MAX); - - // The trailer field must be 1 (0xbc). This value is DEFAULT, so the structure - // is required to omit it in DER. Although a syntax error, we also tolerate an - // explicitly-encoded value. See the certificates in cl/362617931. 
- if (pss->trailerField != NULL && ASN1_INTEGER_get(pss->trailerField) != 1) { - OPENSSL_PUT_ERROR(X509, X509_R_INVALID_PSS_PARAMETERS); - goto err; - } - - EVP_PKEY_CTX *pctx; - if (!EVP_DigestVerifyInit(ctx, &pctx, md, NULL, pkey) || - !EVP_PKEY_CTX_set_rsa_padding(pctx, RSA_PKCS1_PSS_PADDING) || - !EVP_PKEY_CTX_set_rsa_pss_saltlen(pctx, (int)salt_len) || - !EVP_PKEY_CTX_set_rsa_mgf1_md(pctx, mgf1md)) { - goto err; - } - - ret = 1; - -err: - RSA_PSS_PARAMS_free(pss); - return ret; -} - -int x509_print_rsa_pss_params(BIO *bp, const X509_ALGOR *sigalg, int indent, - ASN1_PCTX *pctx) { - assert(OBJ_obj2nid(sigalg->algorithm) == NID_rsassaPss); - - int rv = 0; - X509_ALGOR *maskHash = NULL; - RSA_PSS_PARAMS *pss = rsa_pss_decode(sigalg); - if (!pss) { - if (BIO_puts(bp, " (INVALID PSS PARAMETERS)\n") <= 0) { - goto err; - } - rv = 1; - goto err; - } - - if (BIO_puts(bp, "\n") <= 0 || // - !BIO_indent(bp, indent, 128) || // - BIO_puts(bp, "Hash Algorithm: ") <= 0) { - goto err; - } - - if (pss->hashAlgorithm) { - if (i2a_ASN1_OBJECT(bp, pss->hashAlgorithm->algorithm) <= 0) { - goto err; - } - } else if (BIO_puts(bp, "sha1 (default)") <= 0) { - goto err; - } - - if (BIO_puts(bp, "\n") <= 0 || // - !BIO_indent(bp, indent, 128) || // - BIO_puts(bp, "Mask Algorithm: ") <= 0) { - goto err; - } - - if (pss->maskGenAlgorithm) { - maskHash = rsa_mgf1_decode(pss->maskGenAlgorithm); - if (maskHash == NULL) { - if (BIO_puts(bp, "INVALID") <= 0) { - goto err; - } - } else { - if (i2a_ASN1_OBJECT(bp, pss->maskGenAlgorithm->algorithm) <= 0 || - BIO_puts(bp, " with ") <= 0 || - i2a_ASN1_OBJECT(bp, maskHash->algorithm) <= 0) { - goto err; - } - } - } else if (BIO_puts(bp, "mgf1 with sha1 (default)") <= 0) { - goto err; - } - BIO_puts(bp, "\n"); - - if (!BIO_indent(bp, indent, 128) || // - BIO_puts(bp, "Salt Length: 0x") <= 0) { - goto err; - } - - if (pss->saltLength) { - if (i2a_ASN1_INTEGER(bp, pss->saltLength) <= 0) { - goto err; - } - } else if (BIO_puts(bp, "14 (default)") <= 0) 
{ - goto err; - } - BIO_puts(bp, "\n"); - - if (!BIO_indent(bp, indent, 128) || // - BIO_puts(bp, "Trailer Field: 0x") <= 0) { - goto err; - } - - if (pss->trailerField) { - if (i2a_ASN1_INTEGER(bp, pss->trailerField) <= 0) { - goto err; - } - } else if (BIO_puts(bp, "BC (default)") <= 0) { - goto err; - } - BIO_puts(bp, "\n"); - - rv = 1; - -err: - RSA_PSS_PARAMS_free(pss); - X509_ALGOR_free(maskHash); - return rv; -} diff --git a/third_party/boringssl/src/crypto/x509/rsa_pss.cc b/third_party/boringssl/src/crypto/x509/rsa_pss.cc new file mode 100644 index 00000000..4aa21235 --- /dev/null +++ b/third_party/boringssl/src/crypto/x509/rsa_pss.cc @@ -0,0 +1,187 @@ +// Copyright 2006-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../rsa/internal.h" +#include "internal.h" + + +using namespace bssl; + +static int rsa_pss_cb(int operation, ASN1_VALUE **pval, const ASN1_ITEM *it, + void *exarg) { + if (operation == ASN1_OP_FREE_PRE) { + RSA_PSS_PARAMS *pss = (RSA_PSS_PARAMS *)*pval; + X509_ALGOR_free(pss->maskHash); + } + return 1; +} + +ASN1_SEQUENCE_cb(RSA_PSS_PARAMS, rsa_pss_cb) = { + ASN1_EXP_OPT(RSA_PSS_PARAMS, hashAlgorithm, X509_ALGOR, 0), + ASN1_EXP_OPT(RSA_PSS_PARAMS, maskGenAlgorithm, X509_ALGOR, 1), + ASN1_EXP_OPT(RSA_PSS_PARAMS, saltLength, ASN1_INTEGER, 2), + ASN1_EXP_OPT(RSA_PSS_PARAMS, trailerField, ASN1_INTEGER, 3), +} ASN1_SEQUENCE_END_cb(RSA_PSS_PARAMS, RSA_PSS_PARAMS) + +IMPLEMENT_ASN1_FUNCTIONS_const(RSA_PSS_PARAMS) + + +static int rsa_pss_decode(const X509_ALGOR *alg, rsa_pss_params_t *out) { + if (alg->parameter == nullptr || alg->parameter->type != V_ASN1_SEQUENCE) { + return 0; + } + + // Although a syntax error in DER, we tolerate an explicitly-encoded trailer. + // See the certificates in cl/362617931. 
+ CBS cbs; + CBS_init(&cbs, alg->parameter->value.sequence->data, + alg->parameter->value.sequence->length); + return rsa_parse_pss_params(&cbs, out, /*allow_explicit_trailer=*/true) && + CBS_len(&cbs) == 0; +} + +int bssl::x509_rsa_ctx_to_pss(EVP_MD_CTX *ctx, X509_ALGOR *algor) { + const EVP_MD *sigmd, *mgf1md; + int saltlen; + if (!EVP_PKEY_CTX_get_signature_md(ctx->pctx, &sigmd) || + !EVP_PKEY_CTX_get_rsa_mgf1_md(ctx->pctx, &mgf1md) || + !EVP_PKEY_CTX_get_rsa_pss_saltlen(ctx->pctx, &saltlen)) { + return 0; + } + + if (sigmd != mgf1md) { + OPENSSL_PUT_ERROR(X509, X509_R_INVALID_PSS_PARAMETERS); + return 0; + } + int md_len = (int)EVP_MD_size(sigmd); + if (saltlen != RSA_PSS_SALTLEN_DIGEST && saltlen != md_len) { + OPENSSL_PUT_ERROR(X509, X509_R_INVALID_PSS_PARAMETERS); + return 0; + } + + rsa_pss_params_t params; + switch (EVP_MD_type(sigmd)) { + case NID_sha256: + params = rsa_pss_sha256; + break; + case NID_sha384: + params = rsa_pss_sha384; + break; + case NID_sha512: + params = rsa_pss_sha512; + break; + default: + OPENSSL_PUT_ERROR(X509, X509_R_INVALID_PSS_PARAMETERS); + return 0; + } + + // Encode |params| to an |ASN1_STRING|. + uint8_t buf[128]; // The largest param fits comfortably in 128 bytes. + CBB cbb; + CBB_init_fixed(&cbb, buf, sizeof(buf)); + if (!rsa_marshal_pss_params(&cbb, params)) { + return 0; + } + UniquePtr params_str(ASN1_STRING_type_new(V_ASN1_SEQUENCE)); + if (params_str == nullptr || + !ASN1_STRING_set(params_str.get(), CBB_data(&cbb), CBB_len(&cbb))) { + return 0; + } + + if (!X509_ALGOR_set0(algor, OBJ_nid2obj(NID_rsassaPss), V_ASN1_SEQUENCE, + params_str.get())) { + return 0; + } + params_str.release(); // |X509_ALGOR_set0| took ownership. 
+ return 1; +} + +int bssl::x509_rsa_pss_to_ctx(EVP_MD_CTX *ctx, const X509_ALGOR *sigalg, + EVP_PKEY *pkey) { + assert(OBJ_obj2nid(sigalg->algorithm) == NID_rsassaPss); + rsa_pss_params_t params; + if (!rsa_pss_decode(sigalg, ¶ms)) { + OPENSSL_PUT_ERROR(X509, X509_R_INVALID_PSS_PARAMETERS); + return 0; + } + + const EVP_MD *md = rsa_pss_params_get_md(params); + EVP_PKEY_CTX *pctx; + if (!EVP_DigestVerifyInit(ctx, &pctx, md, nullptr, pkey) || + !EVP_PKEY_CTX_set_rsa_padding(pctx, RSA_PKCS1_PSS_PADDING) || + !EVP_PKEY_CTX_set_rsa_pss_saltlen(pctx, RSA_PSS_SALTLEN_DIGEST) || + !EVP_PKEY_CTX_set_rsa_mgf1_md(pctx, md)) { + return 0; + } + + return 1; +} + +int bssl::x509_print_rsa_pss_params(BIO *bp, const X509_ALGOR *sigalg, + int indent, ASN1_PCTX *pctx) { + assert(OBJ_obj2nid(sigalg->algorithm) == NID_rsassaPss); + rsa_pss_params_t params; + if (!rsa_pss_decode(sigalg, ¶ms)) { + return BIO_puts(bp, " (INVALID PSS PARAMETERS)\n") > 0; + } + + const char *hash_str = nullptr; + uint32_t salt_len = 0; + switch (params) { + case rsa_pss_none: + // |rsa_pss_decode| will never return this. 
+ OPENSSL_PUT_ERROR(X509, ERR_R_INTERNAL_ERROR); + return 0; + case rsa_pss_sha256: + hash_str = "sha256"; + salt_len = 32; + break; + case rsa_pss_sha384: + hash_str = "sha384"; + salt_len = 48; + break; + case rsa_pss_sha512: + hash_str = "sha512"; + salt_len = 64; + break; + } + + if (BIO_puts(bp, "\n") <= 0 || // + !BIO_indent(bp, indent, 128) || // + BIO_printf(bp, "Hash Algorithm: %s\n", hash_str) <= 0 || + !BIO_indent(bp, indent, 128) || // + BIO_printf(bp, "Mask Algorithm: mgf1 with %s\n", hash_str) <= 0 || + !BIO_indent(bp, indent, 128) || // + BIO_printf(bp, "Salt Length: 0x%x\n", salt_len) <= 0 || + !BIO_indent(bp, indent, 128) || // + BIO_puts(bp, "Trailer Field: 0xBC (default)\n") <= 0) { + return 0; + } + + return 1; +} diff --git a/third_party/boringssl/src/crypto/x509/t_crl.c b/third_party/boringssl/src/crypto/x509/t_crl.c deleted file mode 100644 index 1957e317..00000000 --- a/third_party/boringssl/src/crypto/x509/t_crl.c +++ /dev/null @@ -1,145 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. 
- * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. 
- * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include -#include -#include -#include -#include -#include - -int X509_CRL_print_fp(FILE *fp, X509_CRL *x) { - BIO *b = BIO_new_fp(fp, BIO_NOCLOSE); - if (b == NULL) { - OPENSSL_PUT_ERROR(X509, ERR_R_BUF_LIB); - return 0; - } - int ret = X509_CRL_print(b, x); - BIO_free(b); - return ret; -} - -int X509_CRL_print(BIO *out, X509_CRL *x) { - long version = X509_CRL_get_version(x); - assert(X509_CRL_VERSION_1 <= version && version <= X509_CRL_VERSION_2); - const X509_ALGOR *sig_alg; - const ASN1_BIT_STRING *signature; - X509_CRL_get0_signature(x, &signature, &sig_alg); - if (BIO_printf(out, "Certificate Revocation List (CRL):\n") <= 0 || - BIO_printf(out, "%8sVersion %ld (0x%lx)\n", "", version + 1, - (unsigned long)version) <= 0 || - // Note this and the other |X509_signature_print| call both print the - // outer signature algorithm, rather than printing the inner and outer - // ones separately. This matches OpenSSL, though it was probably a bug. 
- !X509_signature_print(out, sig_alg, NULL)) { - return 0; - } - - char *issuer = X509_NAME_oneline(X509_CRL_get_issuer(x), NULL, 0); - int ok = issuer != NULL && BIO_printf(out, "%8sIssuer: %s\n", "", issuer) > 0; - OPENSSL_free(issuer); - if (!ok) { - return 0; - } - - if (BIO_printf(out, "%8sLast Update: ", "") <= 0 || - !ASN1_TIME_print(out, X509_CRL_get0_lastUpdate(x)) || - BIO_printf(out, "\n%8sNext Update: ", "") <= 0) { - return 0; - } - if (X509_CRL_get0_nextUpdate(x)) { - if (!ASN1_TIME_print(out, X509_CRL_get0_nextUpdate(x))) { - return 0; - } - } else { - if (BIO_printf(out, "NONE") <= 0) { - return 0; - } - } - - if (BIO_printf(out, "\n") <= 0 || - !X509V3_extensions_print(out, "CRL extensions", - X509_CRL_get0_extensions(x), 0, 8)) { - return 0; - } - - const STACK_OF(X509_REVOKED) *rev = X509_CRL_get_REVOKED(x); - if (sk_X509_REVOKED_num(rev) > 0) { - if (BIO_printf(out, "Revoked Certificates:\n") <= 0) { - return 0; - } - } else { - if (BIO_printf(out, "No Revoked Certificates.\n") <= 0) { - return 0; - } - } - - for (size_t i = 0; i < sk_X509_REVOKED_num(rev); i++) { - const X509_REVOKED *r = sk_X509_REVOKED_value(rev, i); - if (BIO_printf(out, " Serial Number: ") <= 0 || - i2a_ASN1_INTEGER(out, X509_REVOKED_get0_serialNumber(r)) <= 0 || - BIO_printf(out, "\n Revocation Date: ") <= 0 || - !ASN1_TIME_print(out, X509_REVOKED_get0_revocationDate(r)) || - BIO_printf(out, "\n") <= 0 || - !X509V3_extensions_print(out, "CRL entry extensions", - X509_REVOKED_get0_extensions(r), 0, 8)) { - } - } - - return X509_signature_print(out, sig_alg, signature); -} diff --git a/third_party/boringssl/src/crypto/x509/t_crl.cc b/third_party/boringssl/src/crypto/x509/t_crl.cc new file mode 100644 index 00000000..5ef99ab3 --- /dev/null +++ b/third_party/boringssl/src/crypto/x509/t_crl.cc @@ -0,0 +1,106 @@ +// Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include +#include +#include + + +int X509_CRL_print_fp(FILE *fp, const X509_CRL *x) { + BIO *b = BIO_new_fp(fp, BIO_NOCLOSE); + if (b == nullptr) { + OPENSSL_PUT_ERROR(X509, ERR_R_BUF_LIB); + return 0; + } + int ret = X509_CRL_print(b, x); + BIO_free(b); + return ret; +} + +int X509_CRL_print(BIO *out, const X509_CRL *x) { + long version = X509_CRL_get_version(x); + assert(X509_CRL_VERSION_1 <= version && version <= X509_CRL_VERSION_2); + const X509_ALGOR *sig_alg; + const ASN1_BIT_STRING *signature; + X509_CRL_get0_signature(x, &signature, &sig_alg); + if (BIO_printf(out, "Certificate Revocation List (CRL):\n") <= 0 || + BIO_printf(out, "%8sVersion %ld (0x%lx)\n", "", version + 1, + (unsigned long)version) <= 0 || + // Note this and the other |X509_signature_print| call both print the + // outer signature algorithm, rather than printing the inner and outer + // ones separately. This matches OpenSSL, though it was probably a bug. 
+ !X509_signature_print(out, sig_alg, nullptr)) { + return 0; + } + + char *issuer = X509_NAME_oneline(X509_CRL_get_issuer(x), nullptr, 0); + int ok = + issuer != nullptr && BIO_printf(out, "%8sIssuer: %s\n", "", issuer) > 0; + OPENSSL_free(issuer); + if (!ok) { + return 0; + } + + if (BIO_printf(out, "%8sLast Update: ", "") <= 0 || + !ASN1_TIME_print(out, X509_CRL_get0_lastUpdate(x)) || + BIO_printf(out, "\n%8sNext Update: ", "") <= 0) { + return 0; + } + if (X509_CRL_get0_nextUpdate(x)) { + if (!ASN1_TIME_print(out, X509_CRL_get0_nextUpdate(x))) { + return 0; + } + } else { + if (BIO_printf(out, "NONE") <= 0) { + return 0; + } + } + + if (BIO_printf(out, "\n") <= 0 || + !X509V3_extensions_print(out, "CRL extensions", + X509_CRL_get0_extensions(x), 0, 8)) { + return 0; + } + + // TODO(crbug.com/442860745): |X509_CRL_get_REVOKED| is not const-correct. + const STACK_OF(X509_REVOKED) *rev = + X509_CRL_get_REVOKED(const_cast(x)); + if (sk_X509_REVOKED_num(rev) > 0) { + if (BIO_printf(out, "Revoked Certificates:\n") <= 0) { + return 0; + } + } else { + if (BIO_printf(out, "No Revoked Certificates.\n") <= 0) { + return 0; + } + } + + for (size_t i = 0; i < sk_X509_REVOKED_num(rev); i++) { + const X509_REVOKED *r = sk_X509_REVOKED_value(rev, i); + if (BIO_printf(out, " Serial Number: ") <= 0 || + i2a_ASN1_INTEGER(out, X509_REVOKED_get0_serialNumber(r)) <= 0 || + BIO_printf(out, "\n Revocation Date: ") <= 0 || + !ASN1_TIME_print(out, X509_REVOKED_get0_revocationDate(r)) || + BIO_printf(out, "\n") <= 0 || + !X509V3_extensions_print(out, "CRL entry extensions", + X509_REVOKED_get0_extensions(r), 0, 8)) { + } + } + + return X509_signature_print(out, sig_alg, signature); +} diff --git a/third_party/boringssl/src/crypto/x509/t_req.c b/third_party/boringssl/src/crypto/x509/t_req.c deleted file mode 100644 index e9287d5f..00000000 --- a/third_party/boringssl/src/crypto/x509/t_req.c +++ /dev/null @@ -1,248 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All 
rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. 
If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
*/ - -#include -#include - -#include -#include -#include -#include -#include -#include - -#include "internal.h" - - -int X509_REQ_print_fp(FILE *fp, X509_REQ *x) { - BIO *bio = BIO_new_fp(fp, BIO_NOCLOSE); - if (bio == NULL) { - OPENSSL_PUT_ERROR(X509, ERR_R_BUF_LIB); - return 0; - } - int ret = X509_REQ_print(bio, x); - BIO_free(bio); - return ret; -} - -int X509_REQ_print_ex(BIO *bio, X509_REQ *x, unsigned long nmflags, - unsigned long cflag) { - long l; - EVP_PKEY *pkey; - STACK_OF(X509_ATTRIBUTE) *sk; - char mlch = ' '; - - int nmindent = 0; - - if ((nmflags & XN_FLAG_SEP_MASK) == XN_FLAG_SEP_MULTILINE) { - mlch = '\n'; - nmindent = 12; - } - - if (nmflags == X509_FLAG_COMPAT) { - nmindent = 16; - } - - X509_REQ_INFO *ri = x->req_info; - if (!(cflag & X509_FLAG_NO_HEADER)) { - if (BIO_write(bio, "Certificate Request:\n", 21) <= 0 || - BIO_write(bio, " Data:\n", 10) <= 0) { - goto err; - } - } - if (!(cflag & X509_FLAG_NO_VERSION)) { - l = X509_REQ_get_version(x); - // Only zero, |X509_REQ_VERSION_1|, is valid but our parser accepts some - // invalid values for compatibility. 
- assert(0 <= l && l <= 2); - if (BIO_printf(bio, "%8sVersion: %ld (0x%lx)\n", "", l + 1, - (unsigned long)l) <= 0) { - goto err; - } - } - if (!(cflag & X509_FLAG_NO_SUBJECT)) { - if (BIO_printf(bio, " Subject:%c", mlch) <= 0 || - X509_NAME_print_ex(bio, ri->subject, nmindent, nmflags) < 0 || - BIO_write(bio, "\n", 1) <= 0) { - goto err; - } - } - if (!(cflag & X509_FLAG_NO_PUBKEY)) { - if (BIO_write(bio, " Subject Public Key Info:\n", 33) <= 0 || - BIO_printf(bio, "%12sPublic Key Algorithm: ", "") <= 0 || - i2a_ASN1_OBJECT(bio, ri->pubkey->algor->algorithm) <= 0 || - BIO_puts(bio, "\n") <= 0) { - goto err; - } - - pkey = X509_REQ_get_pubkey(x); - if (pkey == NULL) { - BIO_printf(bio, "%12sUnable to load Public Key\n", ""); - ERR_print_errors(bio); - } else { - EVP_PKEY_print_public(bio, pkey, 16, NULL); - EVP_PKEY_free(pkey); - } - } - - if (!(cflag & X509_FLAG_NO_ATTRIBUTES)) { - if (BIO_printf(bio, "%8sAttributes:\n", "") <= 0) { - goto err; - } - - sk = x->req_info->attributes; - if (sk_X509_ATTRIBUTE_num(sk) == 0) { - if (BIO_printf(bio, "%12sa0:00\n", "") <= 0) { - goto err; - } - } else { - size_t i; - for (i = 0; i < sk_X509_ATTRIBUTE_num(sk); i++) { - X509_ATTRIBUTE *a = sk_X509_ATTRIBUTE_value(sk, i); - ASN1_OBJECT *aobj = X509_ATTRIBUTE_get0_object(a); - - if (X509_REQ_extension_nid(OBJ_obj2nid(aobj))) { - continue; - } - - if (BIO_printf(bio, "%12s", "") <= 0) { - goto err; - } - - const int num_attrs = X509_ATTRIBUTE_count(a); - const int obj_str_len = i2a_ASN1_OBJECT(bio, aobj); - if (obj_str_len <= 0) { - if (BIO_puts(bio, "(Unable to print attribute ID.)\n") < 0) { - goto err; - } else { - continue; - } - } - - int j; - for (j = 0; j < num_attrs; j++) { - const ASN1_TYPE *at = X509_ATTRIBUTE_get0_type(a, j); - const int type = at->type; - ASN1_BIT_STRING *bs = at->value.asn1_string; - - int k; - for (k = 25 - obj_str_len; k > 0; k--) { - if (BIO_write(bio, " ", 1) != 1) { - goto err; - } - } - - if (BIO_puts(bio, ":") <= 0) { - goto err; - } - - if 
(type == V_ASN1_PRINTABLESTRING || type == V_ASN1_UTF8STRING || - type == V_ASN1_IA5STRING || type == V_ASN1_T61STRING) { - if (BIO_write(bio, (char *)bs->data, bs->length) != bs->length) { - goto err; - } - BIO_puts(bio, "\n"); - } else { - BIO_puts(bio, "unable to print attribute\n"); - } - } - } - } - } - - if (!(cflag & X509_FLAG_NO_EXTENSIONS)) { - STACK_OF(X509_EXTENSION) *exts = X509_REQ_get_extensions(x); - if (exts) { - BIO_printf(bio, "%8sRequested Extensions:\n", ""); - - for (size_t i = 0; i < sk_X509_EXTENSION_num(exts); i++) { - const X509_EXTENSION *ex = sk_X509_EXTENSION_value(exts, i); - if (BIO_printf(bio, "%12s", "") <= 0) { - goto err; - } - const ASN1_OBJECT *obj = X509_EXTENSION_get_object(ex); - i2a_ASN1_OBJECT(bio, obj); - const int is_critical = X509_EXTENSION_get_critical(ex); - if (BIO_printf(bio, ": %s\n", is_critical ? "critical" : "") <= 0) { - goto err; - } - if (!X509V3_EXT_print(bio, ex, cflag, 16)) { - BIO_printf(bio, "%16s", ""); - ASN1_STRING_print(bio, X509_EXTENSION_get_data(ex)); - } - if (BIO_write(bio, "\n", 1) <= 0) { - goto err; - } - } - sk_X509_EXTENSION_pop_free(exts, X509_EXTENSION_free); - } - } - - if (!(cflag & X509_FLAG_NO_SIGDUMP) && - !X509_signature_print(bio, x->sig_alg, x->signature)) { - goto err; - } - - return 1; - -err: - OPENSSL_PUT_ERROR(X509, ERR_R_BUF_LIB); - return 0; -} - -int X509_REQ_print(BIO *bio, X509_REQ *req) { - return X509_REQ_print_ex(bio, req, XN_FLAG_COMPAT, X509_FLAG_COMPAT); -} diff --git a/third_party/boringssl/src/crypto/x509/t_req.cc b/third_party/boringssl/src/crypto/x509/t_req.cc new file mode 100644 index 00000000..8eeac078 --- /dev/null +++ b/third_party/boringssl/src/crypto/x509/t_req.cc @@ -0,0 +1,206 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "internal.h" + + +using namespace bssl; + +int X509_REQ_print_fp(FILE *fp, const X509_REQ *x) { + BIO *bio = BIO_new_fp(fp, BIO_NOCLOSE); + if (bio == nullptr) { + OPENSSL_PUT_ERROR(X509, ERR_R_BUF_LIB); + return 0; + } + int ret = X509_REQ_print(bio, x); + BIO_free(bio); + return ret; +} + +int X509_REQ_print_ex(BIO *bio, const X509_REQ *x, unsigned long nmflags, + unsigned long cflag) { + long l; + STACK_OF(X509_ATTRIBUTE) *sk; + char mlch = ' '; + + int nmindent = 0; + + if ((nmflags & XN_FLAG_SEP_MASK) == XN_FLAG_SEP_MULTILINE) { + mlch = '\n'; + nmindent = 12; + } + + if (nmflags == X509_FLAG_COMPAT) { + nmindent = 16; + } + + const X509_REQ_INFO *ri = x->req_info; + if (!(cflag & X509_FLAG_NO_HEADER)) { + if (BIO_write(bio, "Certificate Request:\n", 21) <= 0 || + BIO_write(bio, " Data:\n", 10) <= 0) { + goto err; + } + } + if (!(cflag & X509_FLAG_NO_VERSION)) { + l = X509_REQ_get_version(x); + // Only zero, |X509_REQ_VERSION_1|, is valid but our parser accepts some + // invalid values for compatibility. 
+ assert(0 <= l && l <= 2); + if (BIO_printf(bio, "%8sVersion: %ld (0x%lx)\n", "", l + 1, + (unsigned long)l) <= 0) { + goto err; + } + } + if (!(cflag & X509_FLAG_NO_SUBJECT)) { + if (BIO_printf(bio, " Subject:%c", mlch) <= 0 || + X509_NAME_print_ex(bio, ri->subject, nmindent, nmflags) < 0 || + BIO_write(bio, "\n", 1) <= 0) { + goto err; + } + } + if (!(cflag & X509_FLAG_NO_PUBKEY)) { + if (BIO_write(bio, " Subject Public Key Info:\n", 33) <= 0 || + BIO_printf(bio, "%12sPublic Key Algorithm: ", "") <= 0 || + i2a_ASN1_OBJECT(bio, ri->pubkey->algor.algorithm) <= 0 || + BIO_puts(bio, "\n") <= 0) { + goto err; + } + + const EVP_PKEY *pkey = X509_REQ_get0_pubkey(x); + if (pkey == nullptr) { + BIO_printf(bio, "%12sUnable to load Public Key\n", ""); + ERR_print_errors(bio); + } else { + EVP_PKEY_print_public(bio, pkey, 16, nullptr); + } + } + + if (!(cflag & X509_FLAG_NO_ATTRIBUTES)) { + if (BIO_printf(bio, "%8sAttributes:\n", "") <= 0) { + goto err; + } + + sk = x->req_info->attributes; + if (sk_X509_ATTRIBUTE_num(sk) == 0) { + if (BIO_printf(bio, "%12sa0:00\n", "") <= 0) { + goto err; + } + } else { + for (size_t i = 0; i < sk_X509_ATTRIBUTE_num(sk); i++) { + // TODO(crbug.com/442860745): |X509_ATTRIBUTE| accessors are not + // const-correct. 
+ X509_ATTRIBUTE *a = sk_X509_ATTRIBUTE_value(sk, i); + const ASN1_OBJECT *aobj = X509_ATTRIBUTE_get0_object(a); + + if (X509_REQ_extension_nid(OBJ_obj2nid(aobj))) { + continue; + } + + if (BIO_printf(bio, "%12s", "") <= 0) { + goto err; + } + + const int num_attrs = X509_ATTRIBUTE_count(a); + const int obj_str_len = i2a_ASN1_OBJECT(bio, aobj); + if (obj_str_len <= 0) { + if (BIO_puts(bio, "(Unable to print attribute ID.)\n") < 0) { + goto err; + } else { + continue; + } + } + + for (int j = 0; j < num_attrs; j++) { + for (int k = 25 - obj_str_len; k > 0; k--) { + if (BIO_write(bio, " ", 1) != 1) { + goto err; + } + } + + if (BIO_puts(bio, ":") <= 0) { + goto err; + } + + const ASN1_TYPE *at = X509_ATTRIBUTE_get0_type(a, j); + if (at->type == V_ASN1_PRINTABLESTRING || + at->type == V_ASN1_UTF8STRING || at->type == V_ASN1_IA5STRING || + at->type == V_ASN1_T61STRING) { + const ASN1_STRING *str = at->value.asn1_string; + int str_len = ASN1_STRING_length(str); + if (BIO_write(bio, ASN1_STRING_get0_data(str), str_len) != + str_len) { + goto err; + } + BIO_puts(bio, "\n"); + } else { + BIO_puts(bio, "unable to print attribute\n"); + } + } + } + } + } + + if (!(cflag & X509_FLAG_NO_EXTENSIONS)) { + STACK_OF(X509_EXTENSION) *exts = X509_REQ_get_extensions(x); + if (exts) { + BIO_printf(bio, "%8sRequested Extensions:\n", ""); + + for (size_t i = 0; i < sk_X509_EXTENSION_num(exts); i++) { + const X509_EXTENSION *ex = sk_X509_EXTENSION_value(exts, i); + if (BIO_printf(bio, "%12s", "") <= 0) { + goto err; + } + const ASN1_OBJECT *obj = X509_EXTENSION_get_object(ex); + i2a_ASN1_OBJECT(bio, obj); + const int is_critical = X509_EXTENSION_get_critical(ex); + if (BIO_printf(bio, ": %s\n", is_critical ? 
"critical" : "") <= 0) { + goto err; + } + if (!X509V3_EXT_print(bio, ex, cflag, 16)) { + BIO_printf(bio, "%16s", ""); + ASN1_STRING_print(bio, X509_EXTENSION_get_data(ex)); + } + if (BIO_write(bio, "\n", 1) <= 0) { + goto err; + } + } + sk_X509_EXTENSION_pop_free(exts, X509_EXTENSION_free); + } + } + + if (!(cflag & X509_FLAG_NO_SIGDUMP) && + !X509_signature_print(bio, x->sig_alg, x->signature)) { + goto err; + } + + return 1; + +err: + OPENSSL_PUT_ERROR(X509, ERR_R_BUF_LIB); + return 0; +} + +int X509_REQ_print(BIO *bio, const X509_REQ *req) { + return X509_REQ_print_ex(bio, req, XN_FLAG_COMPAT, X509_FLAG_COMPAT); +} diff --git a/third_party/boringssl/src/crypto/x509/t_x509.c b/third_party/boringssl/src/crypto/x509/t_x509.c deleted file mode 100644 index 6694e3d1..00000000 --- a/third_party/boringssl/src/crypto/x509/t_x509.c +++ /dev/null @@ -1,340 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. 
this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] */ - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "internal.h" - - -int X509_print_ex_fp(FILE *fp, X509 *x, unsigned long nmflag, - unsigned long cflag) { - BIO *b = BIO_new_fp(fp, BIO_NOCLOSE); - if (b == NULL) { - OPENSSL_PUT_ERROR(X509, ERR_R_BUF_LIB); - return 0; - } - int ret = X509_print_ex(b, x, nmflag, cflag); - BIO_free(b); - return ret; -} - -int X509_print_fp(FILE *fp, X509 *x) { - return X509_print_ex_fp(fp, x, XN_FLAG_COMPAT, X509_FLAG_COMPAT); -} - -int X509_print(BIO *bp, X509 *x) { - return X509_print_ex(bp, x, XN_FLAG_COMPAT, X509_FLAG_COMPAT); -} - -int X509_print_ex(BIO *bp, X509 *x, unsigned long nmflags, - unsigned long cflag) { - long l; - int ret = 0, i; - char *m = NULL, mlch = ' '; - int nmindent = 0; - X509_CINF *ci; - EVP_PKEY *pkey = NULL; - const char *neg; - - if ((nmflags & XN_FLAG_SEP_MASK) == XN_FLAG_SEP_MULTILINE) { - mlch = '\n'; - nmindent = 12; - } - - if (nmflags == X509_FLAG_COMPAT) { - nmindent = 16; - } - - ci = x->cert_info; - if (!(cflag & X509_FLAG_NO_HEADER)) { - if (BIO_write(bp, "Certificate:\n", 13) <= 0) { - goto err; - } - if (BIO_write(bp, " Data:\n", 10) <= 0) { - goto err; - } - } - if (!(cflag & X509_FLAG_NO_VERSION)) { - l = X509_get_version(x); - assert(X509_VERSION_1 <= l && l <= X509_VERSION_3); - if (BIO_printf(bp, "%8sVersion: %ld (0x%lx)\n", "", l + 1, - (unsigned long)l) <= 0) { - goto err; - } - } - if (!(cflag & X509_FLAG_NO_SERIAL)) { - if (BIO_write(bp, " Serial Number:", 22) <= 0) { - goto err; - } - - const ASN1_INTEGER *serial = X509_get0_serialNumber(x); - uint64_t serial_u64; - if (ASN1_INTEGER_get_uint64(&serial_u64, serial)) { - assert(serial->type != V_ASN1_NEG_INTEGER); - if (BIO_printf(bp, " %" PRIu64 " (0x%" PRIx64 ")\n", serial_u64, - serial_u64) <= 0) { - goto err; - } - } else { - ERR_clear_error(); 
// Clear |ASN1_INTEGER_get_uint64|'s error. - neg = (serial->type == V_ASN1_NEG_INTEGER) ? " (Negative)" : ""; - if (BIO_printf(bp, "\n%12s%s", "", neg) <= 0) { - goto err; - } - - for (i = 0; i < serial->length; i++) { - if (BIO_printf(bp, "%02x%c", serial->data[i], - ((i + 1 == serial->length) ? '\n' : ':')) <= 0) { - goto err; - } - } - } - } - - if (!(cflag & X509_FLAG_NO_SIGNAME)) { - if (X509_signature_print(bp, ci->signature, NULL) <= 0) { - goto err; - } - } - - if (!(cflag & X509_FLAG_NO_ISSUER)) { - if (BIO_printf(bp, " Issuer:%c", mlch) <= 0) { - goto err; - } - if (X509_NAME_print_ex(bp, X509_get_issuer_name(x), nmindent, nmflags) < - 0) { - goto err; - } - if (BIO_write(bp, "\n", 1) <= 0) { - goto err; - } - } - if (!(cflag & X509_FLAG_NO_VALIDITY)) { - if (BIO_write(bp, " Validity\n", 17) <= 0) { - goto err; - } - if (BIO_write(bp, " Not Before: ", 24) <= 0) { - goto err; - } - if (!ASN1_TIME_print(bp, X509_get_notBefore(x))) { - goto err; - } - if (BIO_write(bp, "\n Not After : ", 25) <= 0) { - goto err; - } - if (!ASN1_TIME_print(bp, X509_get_notAfter(x))) { - goto err; - } - if (BIO_write(bp, "\n", 1) <= 0) { - goto err; - } - } - if (!(cflag & X509_FLAG_NO_SUBJECT)) { - if (BIO_printf(bp, " Subject:%c", mlch) <= 0) { - goto err; - } - if (X509_NAME_print_ex(bp, X509_get_subject_name(x), nmindent, nmflags) < - 0) { - goto err; - } - if (BIO_write(bp, "\n", 1) <= 0) { - goto err; - } - } - if (!(cflag & X509_FLAG_NO_PUBKEY)) { - if (BIO_write(bp, " Subject Public Key Info:\n", 33) <= 0) { - goto err; - } - if (BIO_printf(bp, "%12sPublic Key Algorithm: ", "") <= 0) { - goto err; - } - if (i2a_ASN1_OBJECT(bp, ci->key->algor->algorithm) <= 0) { - goto err; - } - if (BIO_puts(bp, "\n") <= 0) { - goto err; - } - - pkey = X509_get_pubkey(x); - if (pkey == NULL) { - BIO_printf(bp, "%12sUnable to load Public Key\n", ""); - ERR_print_errors(bp); - } else { - EVP_PKEY_print_public(bp, pkey, 16, NULL); - EVP_PKEY_free(pkey); - } - } - - if (!(cflag & 
X509_FLAG_NO_IDS)) { - if (ci->issuerUID) { - if (BIO_printf(bp, "%8sIssuer Unique ID: ", "") <= 0) { - goto err; - } - if (!X509_signature_dump(bp, ci->issuerUID, 12)) { - goto err; - } - } - if (ci->subjectUID) { - if (BIO_printf(bp, "%8sSubject Unique ID: ", "") <= 0) { - goto err; - } - if (!X509_signature_dump(bp, ci->subjectUID, 12)) { - goto err; - } - } - } - - if (!(cflag & X509_FLAG_NO_EXTENSIONS)) { - X509V3_extensions_print(bp, "X509v3 extensions", ci->extensions, cflag, 8); - } - - if (!(cflag & X509_FLAG_NO_SIGDUMP)) { - if (X509_signature_print(bp, x->sig_alg, x->signature) <= 0) { - goto err; - } - } - if (!(cflag & X509_FLAG_NO_AUX)) { - if (!X509_CERT_AUX_print(bp, x->aux, 0)) { - goto err; - } - } - ret = 1; -err: - if (m != NULL) { - OPENSSL_free(m); - } - return ret; -} - -int X509_signature_print(BIO *bp, const X509_ALGOR *sigalg, - const ASN1_STRING *sig) { - if (BIO_puts(bp, " Signature Algorithm: ") <= 0) { - return 0; - } - if (i2a_ASN1_OBJECT(bp, sigalg->algorithm) <= 0) { - return 0; - } - - // RSA-PSS signatures have parameters to print. 
- int sig_nid = OBJ_obj2nid(sigalg->algorithm); - if (sig_nid == NID_rsassaPss && - !x509_print_rsa_pss_params(bp, sigalg, 9, 0)) { - return 0; - } - - if (sig) { - return X509_signature_dump(bp, sig, 9); - } else if (BIO_puts(bp, "\n") <= 0) { - return 0; - } - return 1; -} - -int X509_NAME_print(BIO *bp, const X509_NAME *name, int obase) { - char *s, *c, *b; - int ret = 0, i; - - b = X509_NAME_oneline(name, NULL, 0); - if (!b) { - return 0; - } - if (!*b) { - OPENSSL_free(b); - return 1; - } - s = b + 1; // skip the first slash - - c = s; - for (;;) { - if (((*s == '/') && ((s[1] >= 'A') && (s[1] <= 'Z') && - ((s[2] == '=') || ((s[2] >= 'A') && (s[2] <= 'Z') && - (s[3] == '='))))) || - (*s == '\0')) { - i = s - c; - if (BIO_write(bp, c, i) != i) { - goto err; - } - c = s + 1; // skip following slash - if (*s != '\0') { - if (BIO_write(bp, ", ", 2) != 2) { - goto err; - } - } - } - if (*s == '\0') { - break; - } - s++; - } - - ret = 1; - if (0) { - err: - OPENSSL_PUT_ERROR(X509, ERR_R_BUF_LIB); - } - OPENSSL_free(b); - return ret; -} diff --git a/third_party/boringssl/src/crypto/x509/t_x509.cc b/third_party/boringssl/src/crypto/x509/t_x509.cc new file mode 100644 index 00000000..69c8d859 --- /dev/null +++ b/third_party/boringssl/src/crypto/x509/t_x509.cc @@ -0,0 +1,291 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../internal.h" +#include "internal.h" + + +using namespace bssl; + +int X509_print_ex_fp(FILE *fp, const X509 *x, unsigned long nmflag, + unsigned long cflag) { + BIO *b = BIO_new_fp(fp, BIO_NOCLOSE); + if (b == nullptr) { + OPENSSL_PUT_ERROR(X509, ERR_R_BUF_LIB); + return 0; + } + int ret = X509_print_ex(b, x, nmflag, cflag); + BIO_free(b); + return ret; +} + +int X509_print_fp(FILE *fp, const X509 *x) { + return X509_print_ex_fp(fp, x, XN_FLAG_COMPAT, X509_FLAG_COMPAT); +} + +int X509_print(BIO *bp, const X509 *x) { + return X509_print_ex(bp, x, XN_FLAG_COMPAT, X509_FLAG_COMPAT); +} + +int X509_print_ex(BIO *bp, const X509 *x, unsigned long nmflags, + unsigned long cflag) { + auto *impl = FromOpaque(x); + char mlch = ' '; + int nmindent = 0; + if ((nmflags & XN_FLAG_SEP_MASK) == XN_FLAG_SEP_MULTILINE) { + mlch = '\n'; + nmindent = 12; + } + + if (nmflags == X509_FLAG_COMPAT) { + nmindent = 16; + } + + if (!(cflag & X509_FLAG_NO_HEADER)) { + if (BIO_write(bp, "Certificate:\n", 13) <= 0) { + return 0; + } + if (BIO_write(bp, " Data:\n", 10) <= 0) { + return 0; + } + } + if (!(cflag & X509_FLAG_NO_VERSION)) { + long l = X509_get_version(x); + assert(X509_VERSION_1 <= l && l <= X509_VERSION_3); + if (BIO_printf(bp, "%8sVersion: %ld (0x%lx)\n", "", l + 1, + (unsigned long)l) <= 0) { + return 0; + } + } + if (!(cflag & X509_FLAG_NO_SERIAL)) { + if (BIO_write(bp, " Serial Number:", 22) <= 0) { + return 0; + } + + const ASN1_INTEGER *serial = X509_get0_serialNumber(x); + uint64_t serial_u64; + if (ASN1_INTEGER_get_uint64(&serial_u64, serial)) { + assert(serial->type != V_ASN1_NEG_INTEGER); + if (BIO_printf(bp, " %" PRIu64 " (0x%" PRIx64 ")\n", serial_u64, + serial_u64) <= 0) { + return 0; + } + } else { + ERR_clear_error(); // Clear |ASN1_INTEGER_get_uint64|'s error. + const char *neg = + (serial->type == V_ASN1_NEG_INTEGER) ? 
" (Negative)" : ""; + if (BIO_printf(bp, "\n%12s%s", "", neg) <= 0) { + return 0; + } + + for (int i = 0; i < serial->length; i++) { + if (BIO_printf(bp, "%02x%c", serial->data[i], + ((i + 1 == serial->length) ? '\n' : ':')) <= 0) { + return 0; + } + } + } + } + + if (!(cflag & X509_FLAG_NO_SIGNAME)) { + if (X509_signature_print(bp, &impl->tbs_sig_alg, nullptr) <= 0) { + return 0; + } + } + + if (!(cflag & X509_FLAG_NO_ISSUER)) { + if (BIO_printf(bp, " Issuer:%c", mlch) <= 0) { + return 0; + } + if (X509_NAME_print_ex(bp, X509_get_issuer_name(x), nmindent, nmflags) < + 0) { + return 0; + } + if (BIO_write(bp, "\n", 1) <= 0) { + return 0; + } + } + if (!(cflag & X509_FLAG_NO_VALIDITY)) { + if (BIO_write(bp, " Validity\n", 17) <= 0) { + return 0; + } + if (BIO_write(bp, " Not Before: ", 24) <= 0) { + return 0; + } + if (!ASN1_TIME_print(bp, X509_get0_notBefore(x))) { + return 0; + } + if (BIO_write(bp, "\n Not After : ", 25) <= 0) { + return 0; + } + if (!ASN1_TIME_print(bp, X509_get0_notAfter(x))) { + return 0; + } + if (BIO_write(bp, "\n", 1) <= 0) { + return 0; + } + } + if (!(cflag & X509_FLAG_NO_SUBJECT)) { + if (BIO_printf(bp, " Subject:%c", mlch) <= 0) { + return 0; + } + if (X509_NAME_print_ex(bp, X509_get_subject_name(x), nmindent, nmflags) < + 0) { + return 0; + } + if (BIO_write(bp, "\n", 1) <= 0) { + return 0; + } + } + if (!(cflag & X509_FLAG_NO_PUBKEY)) { + if (BIO_write(bp, " Subject Public Key Info:\n", 33) <= 0) { + return 0; + } + if (BIO_printf(bp, "%12sPublic Key Algorithm: ", "") <= 0) { + return 0; + } + if (i2a_ASN1_OBJECT(bp, impl->key.algor.algorithm) <= 0) { + return 0; + } + if (BIO_puts(bp, "\n") <= 0) { + return 0; + } + + const EVP_PKEY *pkey = X509_get0_pubkey(x); + if (pkey == nullptr) { + BIO_printf(bp, "%12sUnable to load Public Key\n", ""); + ERR_print_errors(bp); + } else { + EVP_PKEY_print_public(bp, pkey, 16, nullptr); + } + } + + if (!(cflag & X509_FLAG_NO_IDS)) { + if (impl->issuerUID) { + if (BIO_printf(bp, "%8sIssuer Unique 
ID: ", "") <= 0) { + return 0; + } + if (!X509_signature_dump(bp, impl->issuerUID, 12)) { + return 0; + } + } + if (impl->subjectUID) { + if (BIO_printf(bp, "%8sSubject Unique ID: ", "") <= 0) { + return 0; + } + if (!X509_signature_dump(bp, impl->subjectUID, 12)) { + return 0; + } + } + } + + if (!(cflag & X509_FLAG_NO_EXTENSIONS)) { + X509V3_extensions_print(bp, "X509v3 extensions", impl->extensions, cflag, + 8); + } + + if (!(cflag & X509_FLAG_NO_SIGDUMP)) { + if (X509_signature_print(bp, &impl->sig_alg, &impl->signature) <= 0) { + return 0; + } + } + if (!(cflag & X509_FLAG_NO_AUX)) { + if (!X509_CERT_AUX_print(bp, impl->aux, 0)) { + return 0; + } + } + + return 1; +} + +int X509_signature_print(BIO *bp, const X509_ALGOR *sigalg, + const ASN1_STRING *sig) { + if (BIO_puts(bp, " Signature Algorithm: ") <= 0) { + return 0; + } + if (i2a_ASN1_OBJECT(bp, sigalg->algorithm) <= 0) { + return 0; + } + + // RSA-PSS signatures have parameters to print. + int sig_nid = OBJ_obj2nid(sigalg->algorithm); + if (sig_nid == NID_rsassaPss && + !x509_print_rsa_pss_params(bp, sigalg, 9, nullptr)) { + return 0; + } + + if (sig) { + return X509_signature_dump(bp, sig, 9); + } else if (BIO_puts(bp, "\n") <= 0) { + return 0; + } + return 1; +} + +int X509_NAME_print(BIO *bp, const X509_NAME *name, int obase) { + char *s, *c, *b; + int ret = 0, i; + + b = X509_NAME_oneline(name, nullptr, 0); + if (!b) { + return 0; + } + if (!*b) { + OPENSSL_free(b); + return 1; + } + s = b + 1; // skip the first slash + + c = s; + for (;;) { + if (((*s == '/') && ((s[1] >= 'A') && (s[1] <= 'Z') && + ((s[2] == '=') || ((s[2] >= 'A') && (s[2] <= 'Z') && + (s[3] == '='))))) || + (*s == '\0')) { + i = s - c; + if (BIO_write(bp, c, i) != i) { + goto err; + } + c = s + 1; // skip following slash + if (*s != '\0') { + if (BIO_write(bp, ", ", 2) != 2) { + goto err; + } + } + } + if (*s == '\0') { + break; + } + s++; + } + + ret = 1; + if (0) { + err: + OPENSSL_PUT_ERROR(X509, ERR_R_BUF_LIB); + } + 
OPENSSL_free(b); + return ret; +} diff --git a/third_party/boringssl/src/crypto/x509/t_x509a.c b/third_party/boringssl/src/crypto/x509/t_x509a.c deleted file mode 100644 index 956b9a03..00000000 --- a/third_party/boringssl/src/crypto/x509/t_x509a.c +++ /dev/null @@ -1,121 +0,0 @@ -/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com) - * All rights reserved. - * - * This package is an SSL implementation written - * by Eric Young (eay@cryptsoft.com). - * The implementation was written so as to conform with Netscapes SSL. - * - * This library is free for commercial and non-commercial use as long as - * the following conditions are aheared to. The following conditions - * apply to all code found in this distribution, be it the RC4, RSA, - * lhash, DES, etc., code; not just the SSL code. The SSL documentation - * included with this distribution is covered by the same copyright terms - * except that the holder is Tim Hudson (tjh@cryptsoft.com). - * - * Copyright remains Eric Young's, and as such any Copyright notices in - * the code are not to be removed. - * If this package is used in a product, Eric Young should be given attribution - * as the author of the parts of the library used. - * This can be in the form of a textual message at program startup or - * in documentation (online or textual) provided with the package. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. 
All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * "This product includes cryptographic software written by - * Eric Young (eay@cryptsoft.com)" - * The word 'cryptographic' can be left out if the rouines from the library - * being used are not cryptographic related :-). - * 4. If you include any Windows specific code (or a derivative thereof) from - * the apps directory (application code) you must include an acknowledgement: - * "This product includes software written by Tim Hudson (tjh@cryptsoft.com)" - * - * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * The licence and distribution terms for any publically available version or - * derivative of this code cannot be changed. i.e. this code cannot simply be - * copied and put under another distribution licence - * [including the GNU Public Licence.] 
*/ - -#include -#include -#include -#include -#include - -#include "internal.h" - - -// X509_CERT_AUX and string set routines - -int X509_CERT_AUX_print(BIO *out, X509_CERT_AUX *aux, int indent) { - char oidstr[80], first; - size_t i; - int j; - if (!aux) { - return 1; - } - if (aux->trust) { - first = 1; - BIO_printf(out, "%*sTrusted Uses:\n%*s", indent, "", indent + 2, ""); - for (i = 0; i < sk_ASN1_OBJECT_num(aux->trust); i++) { - if (!first) { - BIO_puts(out, ", "); - } else { - first = 0; - } - OBJ_obj2txt(oidstr, sizeof oidstr, sk_ASN1_OBJECT_value(aux->trust, i), - 0); - BIO_puts(out, oidstr); - } - BIO_puts(out, "\n"); - } else { - BIO_printf(out, "%*sNo Trusted Uses.\n", indent, ""); - } - if (aux->reject) { - first = 1; - BIO_printf(out, "%*sRejected Uses:\n%*s", indent, "", indent + 2, ""); - for (i = 0; i < sk_ASN1_OBJECT_num(aux->reject); i++) { - if (!first) { - BIO_puts(out, ", "); - } else { - first = 0; - } - OBJ_obj2txt(oidstr, sizeof oidstr, sk_ASN1_OBJECT_value(aux->reject, i), - 0); - BIO_puts(out, oidstr); - } - BIO_puts(out, "\n"); - } else { - BIO_printf(out, "%*sNo Rejected Uses.\n", indent, ""); - } - if (aux->alias) { - BIO_printf(out, "%*sAlias: %.*s\n", indent, "", aux->alias->length, - aux->alias->data); - } - if (aux->keyid) { - BIO_printf(out, "%*sKey Id: ", indent, ""); - for (j = 0; j < aux->keyid->length; j++) { - BIO_printf(out, "%s%02X", j ? ":" : "", aux->keyid->data[j]); - } - BIO_write(out, "\n", 1); - } - return 1; -} diff --git a/third_party/boringssl/src/crypto/x509/t_x509a.cc b/third_party/boringssl/src/crypto/x509/t_x509a.cc new file mode 100644 index 00000000..385ea022 --- /dev/null +++ b/third_party/boringssl/src/crypto/x509/t_x509a.cc @@ -0,0 +1,81 @@ +// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include +#include + +#include "internal.h" + + +using namespace bssl; + +// X509_CERT_AUX and string set routines + +int bssl::X509_CERT_AUX_print(BIO *out, X509_CERT_AUX *aux, int indent) { + char oidstr[80], first; + size_t i; + int j; + if (!aux) { + return 1; + } + if (aux->trust) { + first = 1; + BIO_printf(out, "%*sTrusted Uses:\n%*s", indent, "", indent + 2, ""); + for (i = 0; i < sk_ASN1_OBJECT_num(aux->trust); i++) { + if (!first) { + BIO_puts(out, ", "); + } else { + first = 0; + } + OBJ_obj2txt(oidstr, sizeof oidstr, sk_ASN1_OBJECT_value(aux->trust, i), + 0); + BIO_puts(out, oidstr); + } + BIO_puts(out, "\n"); + } else { + BIO_printf(out, "%*sNo Trusted Uses.\n", indent, ""); + } + if (aux->reject) { + first = 1; + BIO_printf(out, "%*sRejected Uses:\n%*s", indent, "", indent + 2, ""); + for (i = 0; i < sk_ASN1_OBJECT_num(aux->reject); i++) { + if (!first) { + BIO_puts(out, ", "); + } else { + first = 0; + } + OBJ_obj2txt(oidstr, sizeof oidstr, sk_ASN1_OBJECT_value(aux->reject, i), + 0); + BIO_puts(out, oidstr); + } + BIO_puts(out, "\n"); + } else { + BIO_printf(out, "%*sNo Rejected Uses.\n", indent, ""); + } + if (aux->alias) { + BIO_printf(out, "%*sAlias: %.*s\n", indent, "", aux->alias->length, + aux->alias->data); + } + if (aux->keyid) { + BIO_printf(out, "%*sKey Id: ", indent, ""); + for (j = 0; j < aux->keyid->length; j++) { + BIO_printf(out, "%s%02X", j ? 
":" : "", aux->keyid->data[j]); + } + BIO_write(out, "\n", 1); + } + return 1; +} diff --git a/third_party/boringssl/src/crypto/x509/v3_akey.cc b/third_party/boringssl/src/crypto/x509/v3_akey.cc new file mode 100644 index 00000000..9f5df55f --- /dev/null +++ b/third_party/boringssl/src/crypto/x509/v3_akey.cc @@ -0,0 +1,182 @@ +// Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + + +using namespace bssl; + +static STACK_OF(CONF_VALUE) *i2v_AUTHORITY_KEYID( + const X509V3_EXT_METHOD *method, void *ext, STACK_OF(CONF_VALUE) *extlist); +static void *v2i_AUTHORITY_KEYID(const X509V3_EXT_METHOD *method, + const X509V3_CTX *ctx, + const STACK_OF(CONF_VALUE) *values); + +const X509V3_EXT_METHOD bssl::v3_akey_id = { + NID_authority_key_identifier, + X509V3_EXT_MULTILINE, + ASN1_ITEM_ref(AUTHORITY_KEYID), + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + i2v_AUTHORITY_KEYID, + v2i_AUTHORITY_KEYID, + nullptr, + nullptr, + nullptr, +}; + +static STACK_OF(CONF_VALUE) *i2v_AUTHORITY_KEYID( + const X509V3_EXT_METHOD *method, void *ext, STACK_OF(CONF_VALUE) *extlist) { + const AUTHORITY_KEYID *akeyid = + reinterpret_cast(ext); + int extlist_was_null = extlist == nullptr; + if (akeyid->keyid) { + char *tmp = x509v3_bytes_to_hex(akeyid->keyid->data, 
akeyid->keyid->length); + int ok = tmp != nullptr && X509V3_add_value("keyid", tmp, &extlist); + OPENSSL_free(tmp); + if (!ok) { + goto err; + } + } + if (akeyid->issuer) { + STACK_OF(CONF_VALUE) *tmpextlist = + i2v_GENERAL_NAMES(nullptr, akeyid->issuer, extlist); + if (tmpextlist == nullptr) { + goto err; + } + extlist = tmpextlist; + } + if (akeyid->serial) { + if (!X509V3_add_value_int("serial", akeyid->serial, &extlist)) { + goto err; + } + } + return extlist; + +err: + if (extlist_was_null) { + sk_CONF_VALUE_pop_free(extlist, X509V3_conf_free); + } + return nullptr; +} + +// Currently two options: keyid: use the issuers subject keyid, the value +// 'always' means its is an error if the issuer certificate doesn't have a +// key id. issuer: use the issuers cert issuer and serial number. The default +// is to only use this if keyid is not present. With the option 'always' this +// is always included. + +static void *v2i_AUTHORITY_KEYID(const X509V3_EXT_METHOD *method, + const X509V3_CTX *ctx, + const STACK_OF(CONF_VALUE) *values) { + char keyid = 0, issuer = 0; + int j; + ASN1_OCTET_STRING *ikeyid = nullptr; + X509_NAME *isname = nullptr; + GENERAL_NAMES *gens = nullptr; + GENERAL_NAME *gen = nullptr; + ASN1_INTEGER *serial = nullptr; + const X509 *cert; + AUTHORITY_KEYID *akeyid; + + for (size_t i = 0; i < sk_CONF_VALUE_num(values); i++) { + const CONF_VALUE *cnf = sk_CONF_VALUE_value(values, i); + if (!strcmp(cnf->name, "keyid")) { + keyid = 1; + if (cnf->value && !strcmp(cnf->value, "always")) { + keyid = 2; + } + } else if (!strcmp(cnf->name, "issuer")) { + issuer = 1; + if (cnf->value && !strcmp(cnf->value, "always")) { + issuer = 2; + } + } else { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_UNKNOWN_OPTION); + ERR_add_error_data(2, "name=", cnf->name); + return nullptr; + } + } + + if (!ctx || !ctx->issuer_cert) { + if (ctx && (ctx->flags == X509V3_CTX_TEST)) { + return AUTHORITY_KEYID_new(); + } + OPENSSL_PUT_ERROR(X509V3, X509V3_R_NO_ISSUER_CERTIFICATE); + return 
nullptr; + } + + cert = ctx->issuer_cert; + + if (keyid) { + j = X509_get_ext_by_NID(cert, NID_subject_key_identifier, -1); + const X509_EXTENSION *ext; + if ((j >= 0) && (ext = X509_get_ext(cert, j))) { + ikeyid = reinterpret_cast(X509V3_EXT_d2i(ext)); + } + if (keyid == 2 && !ikeyid) { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_UNABLE_TO_GET_ISSUER_KEYID); + return nullptr; + } + } + + if ((issuer && !ikeyid) || (issuer == 2)) { + isname = X509_NAME_dup(X509_get_issuer_name(cert)); + serial = ASN1_INTEGER_dup(X509_get0_serialNumber(cert)); + if (!isname || !serial) { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_UNABLE_TO_GET_ISSUER_DETAILS); + goto err; + } + } + + if (!(akeyid = AUTHORITY_KEYID_new())) { + goto err; + } + + if (isname) { + if (!(gens = sk_GENERAL_NAME_new_null()) || !(gen = GENERAL_NAME_new()) || + !sk_GENERAL_NAME_push(gens, gen)) { + goto err; + } + gen->type = GEN_DIRNAME; + gen->d.dirn = isname; + } + + akeyid->issuer = gens; + akeyid->serial = serial; + akeyid->keyid = ikeyid; + + return akeyid; + +err: + X509_NAME_free(isname); + ASN1_INTEGER_free(serial); + ASN1_OCTET_STRING_free(ikeyid); + return nullptr; +} diff --git a/third_party/boringssl/src/crypto/x509/v3_akeya.cc b/third_party/boringssl/src/crypto/x509/v3_akeya.cc new file mode 100644 index 00000000..4dd885a8 --- /dev/null +++ b/third_party/boringssl/src/crypto/x509/v3_akeya.cc @@ -0,0 +1,31 @@ +// Copyright 2001-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include +#include + +#include "internal.h" + + +ASN1_SEQUENCE(AUTHORITY_KEYID) = { + ASN1_IMP_OPT(AUTHORITY_KEYID, keyid, ASN1_OCTET_STRING, 0), + ASN1_IMP_SEQUENCE_OF_OPT(AUTHORITY_KEYID, issuer, bssl::GENERAL_NAME, 1), + ASN1_IMP_OPT(AUTHORITY_KEYID, serial, ASN1_INTEGER, 2), +} ASN1_SEQUENCE_END(AUTHORITY_KEYID) + +IMPLEMENT_ASN1_FUNCTIONS_const(AUTHORITY_KEYID) diff --git a/third_party/boringssl/src/crypto/x509/v3_alt.cc b/third_party/boringssl/src/crypto/x509/v3_alt.cc new file mode 100644 index 00000000..8aa2da8f --- /dev/null +++ b/third_party/boringssl/src/crypto/x509/v3_alt.cc @@ -0,0 +1,636 @@ +// Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "internal.h" + + +using namespace bssl; + +static void *v2i_subject_alt(const X509V3_EXT_METHOD *method, + const X509V3_CTX *ctx, + const STACK_OF(CONF_VALUE) *nval); +static void *v2i_issuer_alt(const X509V3_EXT_METHOD *method, + const X509V3_CTX *ctx, + const STACK_OF(CONF_VALUE) *nval); +static int copy_email(const X509V3_CTX *ctx, GENERAL_NAMES *gens, int move_p); +static int copy_issuer(const X509V3_CTX *ctx, GENERAL_NAMES *gens); +static int do_othername(GENERAL_NAME *gen, const char *value, + const X509V3_CTX *ctx); +static int do_dirname(GENERAL_NAME *gen, const char *value, + const X509V3_CTX *ctx); + +static STACK_OF(CONF_VALUE) *i2v_GENERAL_NAMES_cb( + const X509V3_EXT_METHOD *method, void *ext, STACK_OF(CONF_VALUE) *ret) { + return i2v_GENERAL_NAMES(method, reinterpret_cast(ext), ret); +} + +const X509V3_EXT_METHOD bssl::v3_subject_alt_name = { + NID_subject_alt_name, + 0, + ASN1_ITEM_ref(GENERAL_NAMES), + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + i2v_GENERAL_NAMES_cb, + v2i_subject_alt, + nullptr, + nullptr, + nullptr, +}; + +const X509V3_EXT_METHOD bssl::v3_issuer_alt_name = { + NID_issuer_alt_name, + 0, + ASN1_ITEM_ref(GENERAL_NAMES), + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + i2v_GENERAL_NAMES_cb, + v2i_issuer_alt, + nullptr, + nullptr, + nullptr, +}; + +const X509V3_EXT_METHOD bssl::v3_certificate_issuer = { + NID_certificate_issuer, + 0, + ASN1_ITEM_ref(GENERAL_NAMES), + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + i2v_GENERAL_NAMES_cb, + nullptr, + nullptr, + nullptr, + nullptr, +}; + +STACK_OF(CONF_VALUE) *i2v_GENERAL_NAMES(const X509V3_EXT_METHOD *method, + const GENERAL_NAMES *gens, + STACK_OF(CONF_VALUE) *ret) { + int ret_was_null = ret == nullptr; + for (size_t i = 0; i < sk_GENERAL_NAME_num(gens); i++) { + const GENERAL_NAME *gen = sk_GENERAL_NAME_value(gens, i); + 
STACK_OF(CONF_VALUE) *tmp = i2v_GENERAL_NAME(method, gen, ret); + if (tmp == nullptr) { + if (ret_was_null) { + sk_CONF_VALUE_pop_free(ret, X509V3_conf_free); + } + return nullptr; + } + ret = tmp; + } + if (!ret) { + return sk_CONF_VALUE_new_null(); + } + return ret; +} + +STACK_OF(CONF_VALUE) *i2v_GENERAL_NAME(const X509V3_EXT_METHOD *method, + const GENERAL_NAME *gen, + STACK_OF(CONF_VALUE) *ret) { + // Note the error-handling for this function relies on there being at most + // one |X509V3_add_value| call. If there were two and the second failed, we + // would need to sometimes free the first call's result. + unsigned char *p; + char oline[256], htmp[5]; + int i; + switch (gen->type) { + case GEN_OTHERNAME: + if (!X509V3_add_value("othername", "", &ret)) { + return nullptr; + } + break; + + case GEN_X400: + if (!X509V3_add_value("X400Name", "", &ret)) { + return nullptr; + } + break; + + case GEN_EDIPARTY: + if (!X509V3_add_value("EdiPartyName", "", &ret)) { + return nullptr; + } + break; + + case GEN_EMAIL: + if (!x509V3_add_value_asn1_string("email", gen->d.ia5, &ret)) { + return nullptr; + } + break; + + case GEN_DNS: + if (!x509V3_add_value_asn1_string("DNS", gen->d.ia5, &ret)) { + return nullptr; + } + break; + + case GEN_URI: + if (!x509V3_add_value_asn1_string("URI", gen->d.ia5, &ret)) { + return nullptr; + } + break; + + case GEN_DIRNAME: + if (X509_NAME_oneline(gen->d.dirn, oline, 256) == nullptr || + !X509V3_add_value("DirName", oline, &ret)) { + return nullptr; + } + break; + + case GEN_IPADD: + p = gen->d.ip->data; + if (gen->d.ip->length == 4) { + snprintf(oline, sizeof(oline), "%d.%d.%d.%d", p[0], p[1], p[2], p[3]); + } else if (gen->d.ip->length == 16) { + oline[0] = 0; + for (i = 0; i < 8; i++) { + uint16_t v = ((uint16_t)p[0] << 8) | p[1]; + snprintf(htmp, sizeof(htmp), "%X", v); + p += 2; + OPENSSL_strlcat(oline, htmp, sizeof(oline)); + if (i != 7) { + OPENSSL_strlcat(oline, ":", sizeof(oline)); + } + } + } else { + if (!X509V3_add_value("IP 
Address", "", &ret)) { + return nullptr; + } + break; + } + if (!X509V3_add_value("IP Address", oline, &ret)) { + return nullptr; + } + break; + + case GEN_RID: + i2t_ASN1_OBJECT(oline, 256, gen->d.rid); + if (!X509V3_add_value("Registered ID", oline, &ret)) { + return nullptr; + } + break; + } + return ret; +} + +int GENERAL_NAME_print(BIO *out, const GENERAL_NAME *gen) { + switch (gen->type) { + case GEN_OTHERNAME: + BIO_printf(out, "othername:"); + break; + + case GEN_X400: + BIO_printf(out, "X400Name:"); + break; + + case GEN_EDIPARTY: + // Maybe fix this: it is supported now + BIO_printf(out, "EdiPartyName:"); + break; + + case GEN_EMAIL: + BIO_printf(out, "email:"); + ASN1_STRING_print(out, gen->d.ia5); + break; + + case GEN_DNS: + BIO_printf(out, "DNS:"); + ASN1_STRING_print(out, gen->d.ia5); + break; + + case GEN_URI: + BIO_printf(out, "URI:"); + ASN1_STRING_print(out, gen->d.ia5); + break; + + case GEN_DIRNAME: + BIO_printf(out, "DirName: "); + X509_NAME_print_ex(out, gen->d.dirn, 0, XN_FLAG_ONELINE); + break; + + case GEN_IPADD: { + const unsigned char *p = gen->d.ip->data; + if (gen->d.ip->length == 4) { + BIO_printf(out, "IP Address:%d.%d.%d.%d", p[0], p[1], p[2], p[3]); + } else if (gen->d.ip->length == 16) { + BIO_printf(out, "IP Address"); + for (int i = 0; i < 8; i++) { + uint16_t v = ((uint16_t)p[0] << 8) | p[1]; + BIO_printf(out, ":%X", v); + p += 2; + } + BIO_puts(out, "\n"); + } else { + BIO_printf(out, "IP Address:"); + break; + } + break; + } + + case GEN_RID: + BIO_printf(out, "Registered ID"); + i2a_ASN1_OBJECT(out, gen->d.rid); + break; + } + return 1; +} + +static void *v2i_issuer_alt(const X509V3_EXT_METHOD *method, + const X509V3_CTX *ctx, + const STACK_OF(CONF_VALUE) *nval) { + GENERAL_NAMES *gens = sk_GENERAL_NAME_new_null(); + if (gens == nullptr) { + return nullptr; + } + for (size_t i = 0; i < sk_CONF_VALUE_num(nval); i++) { + const CONF_VALUE *cnf = sk_CONF_VALUE_value(nval, i); + if (x509v3_conf_name_matches(cnf->name, "issuer") 
&& cnf->value && + !strcmp(cnf->value, "copy")) { + if (!copy_issuer(ctx, gens)) { + goto err; + } + } else { + GENERAL_NAME *gen = v2i_GENERAL_NAME(method, ctx, cnf); + if (gen == nullptr || !sk_GENERAL_NAME_push(gens, gen)) { + GENERAL_NAME_free(gen); + goto err; + } + } + } + return gens; +err: + sk_GENERAL_NAME_pop_free(gens, GENERAL_NAME_free); + return nullptr; +} + +// Append subject altname of issuer to issuer alt name of subject + +static int copy_issuer(const X509V3_CTX *ctx, GENERAL_NAMES *gens) { + if (ctx && (ctx->flags == X509V3_CTX_TEST)) { + return 1; + } + if (!ctx || !ctx->issuer_cert) { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_NO_ISSUER_DETAILS); + return 0; + } + int i = X509_get_ext_by_NID(ctx->issuer_cert, NID_subject_alt_name, -1); + if (i < 0) { + return 1; + } + + int ret = 0; + GENERAL_NAMES *ialt = nullptr; + X509_EXTENSION *ext; + if (!(ext = X509_get_ext(ctx->issuer_cert, i)) || + !(ialt = reinterpret_cast(X509V3_EXT_d2i(ext)))) { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_ISSUER_DECODE_ERROR); + goto err; + } + + for (size_t j = 0; j < sk_GENERAL_NAME_num(ialt); j++) { + GENERAL_NAME *gen = sk_GENERAL_NAME_value(ialt, j); + if (!sk_GENERAL_NAME_push(gens, gen)) { + goto err; + } + // Ownership of |gen| has moved from |ialt| to |gens|. 
+ sk_GENERAL_NAME_set(ialt, j, nullptr); + } + + ret = 1; + +err: + GENERAL_NAMES_free(ialt); + return ret; +} + +static void *v2i_subject_alt(const X509V3_EXT_METHOD *method, + const X509V3_CTX *ctx, + const STACK_OF(CONF_VALUE) *nval) { + GENERAL_NAMES *gens = sk_GENERAL_NAME_new_null(); + if (gens == nullptr) { + return nullptr; + } + for (size_t i = 0; i < sk_CONF_VALUE_num(nval); i++) { + const CONF_VALUE *cnf = sk_CONF_VALUE_value(nval, i); + if (x509v3_conf_name_matches(cnf->name, "email") && cnf->value && + !strcmp(cnf->value, "copy")) { + if (!copy_email(ctx, gens, 0)) { + goto err; + } + } else if (x509v3_conf_name_matches(cnf->name, "email") && cnf->value && + !strcmp(cnf->value, "move")) { + if (!copy_email(ctx, gens, 1)) { + goto err; + } + } else { + GENERAL_NAME *gen = v2i_GENERAL_NAME(method, ctx, cnf); + if (gen == nullptr || !sk_GENERAL_NAME_push(gens, gen)) { + GENERAL_NAME_free(gen); + goto err; + } + } + } + return gens; +err: + sk_GENERAL_NAME_pop_free(gens, GENERAL_NAME_free); + return nullptr; +} + +// Copy any email addresses in a certificate or request to GENERAL_NAMES + +static int copy_email(const X509V3_CTX *ctx, GENERAL_NAMES *gens, int move_p) { + X509_NAME *nm; + ASN1_IA5STRING *email = nullptr; + X509_NAME_ENTRY *ne; + GENERAL_NAME *gen = nullptr; + int i; + if (ctx != nullptr && ctx->flags == X509V3_CTX_TEST) { + return 1; + } + if (!ctx || (!ctx->subject_cert && !ctx->subject_req)) { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_NO_SUBJECT_DETAILS); + goto err; + } + // Find the subject name + if (ctx->subject_cert) { + nm = X509_get_subject_name(ctx->subject_cert); + } else { + nm = X509_REQ_get_subject_name(ctx->subject_req); + } + + // Now add any email address(es) to STACK + i = -1; + while ((i = X509_NAME_get_index_by_NID(nm, NID_pkcs9_emailAddress, i)) >= 0) { + ne = X509_NAME_get_entry(nm, i); + email = ASN1_STRING_dup(X509_NAME_ENTRY_get_data(ne)); + if (move_p) { + X509_NAME_delete_entry(nm, i); + X509_NAME_ENTRY_free(ne); + i--; 
+ } + if (!email || !(gen = GENERAL_NAME_new())) { + goto err; + } + gen->d.ia5 = email; + email = nullptr; + gen->type = GEN_EMAIL; + if (!sk_GENERAL_NAME_push(gens, gen)) { + goto err; + } + gen = nullptr; + } + + return 1; + +err: + GENERAL_NAME_free(gen); + ASN1_IA5STRING_free(email); + return 0; +} + +GENERAL_NAMES *bssl::v2i_GENERAL_NAMES(const X509V3_EXT_METHOD *method, + const X509V3_CTX *ctx, + const STACK_OF(CONF_VALUE) *nval) { + GENERAL_NAMES *gens = sk_GENERAL_NAME_new_null(); + if (gens == nullptr) { + return nullptr; + } + for (size_t i = 0; i < sk_CONF_VALUE_num(nval); i++) { + const CONF_VALUE *cnf = sk_CONF_VALUE_value(nval, i); + GENERAL_NAME *gen = v2i_GENERAL_NAME(method, ctx, cnf); + if (gen == nullptr || !sk_GENERAL_NAME_push(gens, gen)) { + GENERAL_NAME_free(gen); + goto err; + } + } + return gens; +err: + sk_GENERAL_NAME_pop_free(gens, GENERAL_NAME_free); + return nullptr; +} + +GENERAL_NAME *bssl::v2i_GENERAL_NAME(const X509V3_EXT_METHOD *method, + const X509V3_CTX *ctx, + const CONF_VALUE *cnf) { + return v2i_GENERAL_NAME_ex(nullptr, method, ctx, cnf, 0); +} + +static GENERAL_NAME *a2i_GENERAL_NAME(GENERAL_NAME *out, + const X509V3_EXT_METHOD *method, + const X509V3_CTX *ctx, int gen_type, + const char *value, int is_nc) { + if (!value) { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_MISSING_VALUE); + return nullptr; + } + + GENERAL_NAME *gen = nullptr; + if (out) { + gen = out; + } else { + gen = GENERAL_NAME_new(); + if (gen == nullptr) { + return nullptr; + } + } + + switch (gen_type) { + case GEN_URI: + case GEN_EMAIL: + case GEN_DNS: { + ASN1_IA5STRING *str = ASN1_IA5STRING_new(); + if (str == nullptr || !ASN1_STRING_set(str, value, strlen(value))) { + ASN1_STRING_free(str); + goto err; + } + gen->type = gen_type; + gen->d.ia5 = str; + break; + } + + case GEN_RID: { + ASN1_OBJECT *obj; + if (!(obj = OBJ_txt2obj(value, 0))) { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_BAD_OBJECT); + ERR_add_error_data(2, "value=", value); + goto err; + } + gen->type 
= GEN_RID; + gen->d.rid = obj; + break; + } + + case GEN_IPADD: + gen->type = GEN_IPADD; + if (is_nc) { + gen->d.ip = a2i_IPADDRESS_NC(value); + } else { + gen->d.ip = a2i_IPADDRESS(value); + } + if (gen->d.ip == nullptr) { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_BAD_IP_ADDRESS); + ERR_add_error_data(2, "value=", value); + goto err; + } + break; + + case GEN_DIRNAME: + if (!do_dirname(gen, value, ctx)) { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_DIRNAME_ERROR); + goto err; + } + break; + + case GEN_OTHERNAME: + if (!do_othername(gen, value, ctx)) { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_OTHERNAME_ERROR); + goto err; + } + break; + default: + OPENSSL_PUT_ERROR(X509V3, X509V3_R_UNSUPPORTED_TYPE); + goto err; + } + + return gen; + +err: + if (!out) { + GENERAL_NAME_free(gen); + } + return nullptr; +} + +GENERAL_NAME *bssl::v2i_GENERAL_NAME_ex(GENERAL_NAME *out, + const X509V3_EXT_METHOD *method, + const X509V3_CTX *ctx, + const CONF_VALUE *cnf, int is_nc) { + const char *name = cnf->name; + const char *value = cnf->value; + if (!value) { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_MISSING_VALUE); + return nullptr; + } + + int type; + if (x509v3_conf_name_matches(name, "email")) { + type = GEN_EMAIL; + } else if (x509v3_conf_name_matches(name, "URI")) { + type = GEN_URI; + } else if (x509v3_conf_name_matches(name, "DNS")) { + type = GEN_DNS; + } else if (x509v3_conf_name_matches(name, "RID")) { + type = GEN_RID; + } else if (x509v3_conf_name_matches(name, "IP")) { + type = GEN_IPADD; + } else if (x509v3_conf_name_matches(name, "dirName")) { + type = GEN_DIRNAME; + } else if (x509v3_conf_name_matches(name, "otherName")) { + type = GEN_OTHERNAME; + } else { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_UNSUPPORTED_OPTION); + ERR_add_error_data(2, "name=", name); + return nullptr; + } + + return a2i_GENERAL_NAME(out, method, ctx, type, value, is_nc); +} + +static int do_othername(GENERAL_NAME *gen, const char *value, + const X509V3_CTX *ctx) { + const char *semicolon = strchr(value, ';'); + if 
(semicolon == nullptr) { + return 0; + } + + OTHERNAME *name = OTHERNAME_new(); + if (name == nullptr) { + return 0; + } + + char *objtmp = OPENSSL_strndup(value, semicolon - value); + if (objtmp == nullptr) { + goto err; + } + ASN1_OBJECT_free(name->type_id); + name->type_id = OBJ_txt2obj(objtmp, /*dont_search_names=*/0); + OPENSSL_free(objtmp); + if (name->type_id == nullptr) { + goto err; + } + + ASN1_TYPE_free(name->value); + name->value = ASN1_generate_v3(semicolon + 1, ctx); + if (name->value == nullptr) { + goto err; + } + + gen->type = GEN_OTHERNAME; + gen->d.otherName = name; + return 1; + +err: + OTHERNAME_free(name); + return 0; +} + +static int do_dirname(GENERAL_NAME *gen, const char *value, + const X509V3_CTX *ctx) { + int ret = 0; + const STACK_OF(CONF_VALUE) *sk = X509V3_get_section(ctx, value); + X509_NAME *nm = X509_NAME_new(); + if (nm == nullptr) { + goto err; + } + if (sk == nullptr) { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_SECTION_NOT_FOUND); + ERR_add_error_data(2, "section=", value); + goto err; + } + // FIXME: should allow other character types... + if (!X509V3_NAME_from_section(nm, sk, MBSTRING_ASC)) { + goto err; + } + gen->type = GEN_DIRNAME; + gen->d.dirn = nm; + ret = 1; + +err: + if (!ret) { + X509_NAME_free(nm); + } + return ret; +} diff --git a/third_party/boringssl/src/crypto/x509/v3_bcons.cc b/third_party/boringssl/src/crypto/x509/v3_bcons.cc new file mode 100644 index 00000000..61194001 --- /dev/null +++ b/third_party/boringssl/src/crypto/x509/v3_bcons.cc @@ -0,0 +1,96 @@ +// Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "internal.h" + + +using namespace bssl; + +static STACK_OF(CONF_VALUE) *i2v_BASIC_CONSTRAINTS( + const X509V3_EXT_METHOD *method, void *ext, STACK_OF(CONF_VALUE) *extlist); +static void *v2i_BASIC_CONSTRAINTS(const X509V3_EXT_METHOD *method, + const X509V3_CTX *ctx, + const STACK_OF(CONF_VALUE) *values); + +const X509V3_EXT_METHOD bssl::v3_bcons = { + NID_basic_constraints, + 0, + ASN1_ITEM_ref(BASIC_CONSTRAINTS), + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + i2v_BASIC_CONSTRAINTS, + v2i_BASIC_CONSTRAINTS, + nullptr, + nullptr, + nullptr, +}; + +ASN1_SEQUENCE(BASIC_CONSTRAINTS) = { + ASN1_OPT(BASIC_CONSTRAINTS, ca, ASN1_FBOOLEAN), + ASN1_OPT(BASIC_CONSTRAINTS, pathlen, ASN1_INTEGER), +} ASN1_SEQUENCE_END(BASIC_CONSTRAINTS) + +IMPLEMENT_ASN1_FUNCTIONS_const(BASIC_CONSTRAINTS) + +static STACK_OF(CONF_VALUE) *i2v_BASIC_CONSTRAINTS( + const X509V3_EXT_METHOD *method, void *ext, STACK_OF(CONF_VALUE) *extlist) { + const BASIC_CONSTRAINTS *bcons = + reinterpret_cast(ext); + X509V3_add_value_bool("CA", bcons->ca, &extlist); + X509V3_add_value_int("pathlen", bcons->pathlen, &extlist); + return extlist; +} + +static void *v2i_BASIC_CONSTRAINTS(const X509V3_EXT_METHOD *method, + const X509V3_CTX *ctx, + const STACK_OF(CONF_VALUE) *values) { + BASIC_CONSTRAINTS *bcons = nullptr; + if (!(bcons = BASIC_CONSTRAINTS_new())) { + return nullptr; + } + for (size_t i = 0; i < sk_CONF_VALUE_num(values); i++) { + const CONF_VALUE *val = 
sk_CONF_VALUE_value(values, i); + if (!strcmp(val->name, "CA")) { + if (!X509V3_get_value_bool(val, &bcons->ca)) { + goto err; + } + } else if (!strcmp(val->name, "pathlen")) { + if (!X509V3_get_value_int(val, &bcons->pathlen)) { + goto err; + } + } else { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_INVALID_NAME); + X509V3_conf_err(val); + goto err; + } + } + return bcons; +err: + BASIC_CONSTRAINTS_free(bcons); + return nullptr; +} diff --git a/third_party/boringssl/src/crypto/x509/v3_bitst.cc b/third_party/boringssl/src/crypto/x509/v3_bitst.cc new file mode 100644 index 00000000..9b3acc8f --- /dev/null +++ b/third_party/boringssl/src/crypto/x509/v3_bitst.cc @@ -0,0 +1,100 @@ +// Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include + +#include +#include +#include +#include + +#include "internal.h" + + +using namespace bssl; + +static const BIT_STRING_BITNAME ns_cert_type_table[] = { + {0, "SSL Client", "client"}, {1, "SSL Server", "server"}, + {2, "S/MIME", "email"}, {3, "Object Signing", "objsign"}, + {4, "Unused", "reserved"}, {5, "SSL CA", "sslCA"}, + {6, "S/MIME CA", "emailCA"}, {7, "Object Signing CA", "objCA"}, + {-1, nullptr, nullptr}}; + +static const BIT_STRING_BITNAME key_usage_type_table[] = { + {0, "Digital Signature", "digitalSignature"}, + {1, "Non Repudiation", "nonRepudiation"}, + {2, "Key Encipherment", "keyEncipherment"}, + {3, "Data Encipherment", "dataEncipherment"}, + {4, "Key Agreement", "keyAgreement"}, + {5, "Certificate Sign", "keyCertSign"}, + {6, "CRL Sign", "cRLSign"}, + {7, "Encipher Only", "encipherOnly"}, + {8, "Decipher Only", "decipherOnly"}, + {-1, nullptr, nullptr}}; + +static STACK_OF(CONF_VALUE) *i2v_ASN1_BIT_STRING( + const X509V3_EXT_METHOD *method, void *ext, STACK_OF(CONF_VALUE) *ret) { + const ASN1_BIT_STRING *bits = reinterpret_cast(ext); + const BIT_STRING_BITNAME *bnam; + for (bnam = reinterpret_cast(method->usr_data); + bnam->lname; bnam++) { + if (ASN1_BIT_STRING_get_bit(bits, bnam->bitnum)) { + X509V3_add_value(bnam->lname, nullptr, &ret); + } + } + return ret; +} + +static void *v2i_ASN1_BIT_STRING(const X509V3_EXT_METHOD *method, + const X509V3_CTX *ctx, + const STACK_OF(CONF_VALUE) *nval) { + ASN1_BIT_STRING *bs; + if (!(bs = ASN1_BIT_STRING_new())) { + return nullptr; + } + for (size_t i = 0; i < sk_CONF_VALUE_num(nval); i++) { + const CONF_VALUE *val = sk_CONF_VALUE_value(nval, i); + const BIT_STRING_BITNAME *bnam; + for (bnam = reinterpret_cast(method->usr_data); + bnam->lname; bnam++) { + if (!strcmp(bnam->sname, val->name) || !strcmp(bnam->lname, val->name)) { + if (!ASN1_BIT_STRING_set_bit(bs, bnam->bitnum, 1)) { + ASN1_BIT_STRING_free(bs); + return nullptr; + } + break; + } + } + if (!bnam->lname) { + 
OPENSSL_PUT_ERROR(X509V3, X509V3_R_UNKNOWN_BIT_STRING_ARGUMENT); + X509V3_conf_err(val); + ASN1_BIT_STRING_free(bs); + return nullptr; + } + } + return bs; +} + +#define EXT_BITSTRING(nid, table) \ + { \ + nid, 0, ASN1_ITEM_ref(ASN1_BIT_STRING), 0, 0, 0, 0, 0, 0, \ + i2v_ASN1_BIT_STRING, v2i_ASN1_BIT_STRING, nullptr, nullptr, \ + (void *)(table) \ + } + +const X509V3_EXT_METHOD bssl::v3_nscert = + EXT_BITSTRING(NID_netscape_cert_type, ns_cert_type_table); +const X509V3_EXT_METHOD bssl::v3_key_usage = + EXT_BITSTRING(NID_key_usage, key_usage_type_table); diff --git a/third_party/boringssl/src/crypto/x509/v3_conf.cc b/third_party/boringssl/src/crypto/x509/v3_conf.cc new file mode 100644 index 00000000..d3d8e508 --- /dev/null +++ b/third_party/boringssl/src/crypto/x509/v3_conf.cc @@ -0,0 +1,369 @@ +// Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +// extension creation utilities + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "../internal.h" +#include "internal.h" + + +using namespace bssl; + +static int v3_check_critical(const char **value); +static int v3_check_generic(const char **value); +static X509_EXTENSION *do_ext_nconf(const CONF *conf, const X509V3_CTX *ctx, + int ext_nid, int crit, const char *value); +static X509_EXTENSION *v3_generic_extension(const char *ext, const char *value, + int crit, int type, + const X509V3_CTX *ctx); +static X509_EXTENSION *do_ext_i2d(const X509V3_EXT_METHOD *method, int ext_nid, + int crit, void *ext_struc); +static unsigned char *generic_asn1(const char *value, const X509V3_CTX *ctx, + size_t *ext_len); + +X509_EXTENSION *X509V3_EXT_nconf(const CONF *conf, const X509V3_CTX *ctx, + const char *name, const char *value) { + // If omitted, fill in an empty |X509V3_CTX|. + X509V3_CTX ctx_tmp; + if (ctx == nullptr) { + X509V3_set_ctx(&ctx_tmp, nullptr, nullptr, nullptr, nullptr, 0); + X509V3_set_nconf(&ctx_tmp, conf); + ctx = &ctx_tmp; + } + + int crit = v3_check_critical(&value); + int ext_type = v3_check_generic(&value); + if (ext_type != 0) { + return v3_generic_extension(name, value, crit, ext_type, ctx); + } + X509_EXTENSION *ret = do_ext_nconf(conf, ctx, OBJ_sn2nid(name), crit, value); + if (!ret) { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_ERROR_IN_EXTENSION); + ERR_add_error_data(4, "name=", name, ", value=", value); + } + return ret; +} + +X509_EXTENSION *X509V3_EXT_nconf_nid(const CONF *conf, const X509V3_CTX *ctx, + int ext_nid, const char *value) { + // If omitted, fill in an empty |X509V3_CTX|. 
+ X509V3_CTX ctx_tmp; + if (ctx == nullptr) { + X509V3_set_ctx(&ctx_tmp, nullptr, nullptr, nullptr, nullptr, 0); + X509V3_set_nconf(&ctx_tmp, conf); + ctx = &ctx_tmp; + } + + int crit = v3_check_critical(&value); + int ext_type = v3_check_generic(&value); + if (ext_type != 0) { + return v3_generic_extension(OBJ_nid2sn(ext_nid), value, crit, ext_type, + ctx); + } + return do_ext_nconf(conf, ctx, ext_nid, crit, value); +} + +// CONF *conf: Config file +// char *value: Value +static X509_EXTENSION *do_ext_nconf(const CONF *conf, const X509V3_CTX *ctx, + int ext_nid, int crit, const char *value) { + const X509V3_EXT_METHOD *method; + X509_EXTENSION *ext; + const STACK_OF(CONF_VALUE) *nval; + STACK_OF(CONF_VALUE) *nval_owned = nullptr; + void *ext_struc; + if (ext_nid == NID_undef) { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_UNKNOWN_EXTENSION_NAME); + return nullptr; + } + if (!(method = X509V3_EXT_get_nid(ext_nid))) { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_UNKNOWN_EXTENSION); + return nullptr; + } + // Now get internal extension representation based on type + if (method->v2i) { + if (*value == '@') { + // TODO(davidben): This is the only place where |X509V3_EXT_nconf|'s + // |conf| parameter is used. All other codepaths use the copy inside + // |ctx|. Should this be switched and then the parameter ignored? 
+ if (conf == nullptr) { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_NO_CONFIG_DATABASE); + return nullptr; + } + nval = NCONF_get_section(conf, value + 1); + } else { + nval_owned = X509V3_parse_list(value); + nval = nval_owned; + } + if (nval == nullptr || sk_CONF_VALUE_num(nval) <= 0) { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_INVALID_EXTENSION_STRING); + ERR_add_error_data(4, "name=", OBJ_nid2sn(ext_nid), ",section=", value); + sk_CONF_VALUE_pop_free(nval_owned, X509V3_conf_free); + return nullptr; + } + ext_struc = method->v2i(method, ctx, nval); + sk_CONF_VALUE_pop_free(nval_owned, X509V3_conf_free); + if (!ext_struc) { + return nullptr; + } + } else if (method->s2i) { + if (!(ext_struc = method->s2i(method, ctx, value))) { + return nullptr; + } + } else if (method->r2i) { + // TODO(davidben): Should this check be removed? This matches OpenSSL, but + // r2i-based extensions do not necessarily require a config database. The + // two built-in extensions only use it some of the time, and already handle + // |X509V3_get_section| returning NULL. + if (!ctx->db) { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_NO_CONFIG_DATABASE); + return nullptr; + } + if (!(ext_struc = method->r2i(method, ctx, value))) { + return nullptr; + } + } else { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_EXTENSION_SETTING_NOT_SUPPORTED); + ERR_add_error_data(2, "name=", OBJ_nid2sn(ext_nid)); + return nullptr; + } + + ext = do_ext_i2d(method, ext_nid, crit, ext_struc); + ASN1_item_free(reinterpret_cast(ext_struc), + ASN1_ITEM_ptr(method->it)); + return ext; +} + +static X509_EXTENSION *do_ext_i2d(const X509V3_EXT_METHOD *method, int ext_nid, + int crit, void *ext_struc) { + // Convert the extension's internal representation to DER. 
+ unsigned char *ext_der = nullptr; + int ext_len = ASN1_item_i2d(reinterpret_cast(ext_struc), + &ext_der, ASN1_ITEM_ptr(method->it)); + if (ext_len < 0) { + return nullptr; + } + + ASN1_OCTET_STRING *ext_oct = ASN1_OCTET_STRING_new(); + if (ext_oct == nullptr) { + OPENSSL_free(ext_der); + return nullptr; + } + ASN1_STRING_set0(ext_oct, ext_der, ext_len); + + X509_EXTENSION *ext = + X509_EXTENSION_create_by_NID(nullptr, ext_nid, crit, ext_oct); + ASN1_OCTET_STRING_free(ext_oct); + return ext; +} + +// Given an internal structure, nid and critical flag create an extension + +X509_EXTENSION *X509V3_EXT_i2d(int ext_nid, int crit, void *ext_struc) { + const X509V3_EXT_METHOD *method; + if (!(method = X509V3_EXT_get_nid(ext_nid))) { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_UNKNOWN_EXTENSION); + return nullptr; + } + return do_ext_i2d(method, ext_nid, crit, ext_struc); +} + +// Check the extension string for critical flag +static int v3_check_critical(const char **value) { + const char *p = *value; + if ((strlen(p) < 9) || strncmp(p, "critical,", 9)) { + return 0; + } + p += 9; + while (OPENSSL_isspace((unsigned char)*p)) { + p++; + } + *value = p; + return 1; +} + +// Check extension string for generic extension and return the type +static int v3_check_generic(const char **value) { + int gen_type = 0; + const char *p = *value; + if ((strlen(p) >= 4) && !strncmp(p, "DER:", 4)) { + p += 4; + gen_type = 1; + } else if ((strlen(p) >= 5) && !strncmp(p, "ASN1:", 5)) { + p += 5; + gen_type = 2; + } else { + return 0; + } + + while (OPENSSL_isspace((unsigned char)*p)) { + p++; + } + *value = p; + return gen_type; +} + +// Create a generic extension: for now just handle DER type +static X509_EXTENSION *v3_generic_extension(const char *ext, const char *value, + int crit, int gen_type, + const X509V3_CTX *ctx) { + UniquePtr obj(OBJ_txt2obj(ext, 0)); + if (obj == nullptr) { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_EXTENSION_NAME_ERROR); + ERR_add_error_data(2, "name=", ext); + return 
nullptr; + } + + UniquePtr ext_der; + size_t ext_len = 0; + if (gen_type == 1) { + ext_der.reset(x509v3_hex_to_bytes(value, &ext_len)); + } else if (gen_type == 2) { + ext_der.reset(generic_asn1(value, ctx, &ext_len)); + } + + if (ext_der == nullptr) { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_EXTENSION_VALUE_ERROR); + ERR_add_error_data(2, "value=", value); + return nullptr; + } + + if (ext_len > INT_MAX) { + OPENSSL_PUT_ERROR(X509V3, ERR_R_OVERFLOW); + return nullptr; + } + + UniquePtr oct(ASN1_OCTET_STRING_new()); + if (oct == nullptr) { + return nullptr; + } + + ASN1_STRING_set0(oct.get(), ext_der.get(), (int)ext_len); + ext_der.release(); // ASN1_STRING_set0 took ownership. + + return X509_EXTENSION_create_by_OBJ(nullptr, obj.get(), crit, oct.get()); +} + +static unsigned char *generic_asn1(const char *value, const X509V3_CTX *ctx, + size_t *ext_len) { + ASN1_TYPE *typ = ASN1_generate_v3(value, ctx); + if (typ == nullptr) { + return nullptr; + } + unsigned char *ext_der = nullptr; + int len = i2d_ASN1_TYPE(typ, &ext_der); + ASN1_TYPE_free(typ); + if (len < 0) { + return nullptr; + } + *ext_len = len; + return ext_der; +} + +// This is the main function: add a bunch of extensions based on a config +// file section to an extension STACK. 
+ +int X509V3_EXT_add_nconf_sk(const CONF *conf, const X509V3_CTX *ctx, + const char *section, + STACK_OF(X509_EXTENSION) **sk) { + const STACK_OF(CONF_VALUE) *nval = NCONF_get_section(conf, section); + if (nval == nullptr) { + return 0; + } + for (size_t i = 0; i < sk_CONF_VALUE_num(nval); i++) { + const CONF_VALUE *val = sk_CONF_VALUE_value(nval, i); + X509_EXTENSION *ext = X509V3_EXT_nconf(conf, ctx, val->name, val->value); + int ok = ext != nullptr && // + (sk == nullptr || X509v3_add_ext(sk, ext, -1) != nullptr); + X509_EXTENSION_free(ext); + if (!ok) { + return 0; + } + } + return 1; +} + +// Convenience functions to add extensions to a certificate, CRL and request + +int X509V3_EXT_add_nconf(const CONF *conf, const X509V3_CTX *ctx, + const char *section, X509 *cert) { + STACK_OF(X509_EXTENSION) **sk = nullptr; + if (cert) { + sk = &FromOpaque(cert)->extensions; + } + return X509V3_EXT_add_nconf_sk(conf, ctx, section, sk); +} + +// Same as above but for a CRL + +int X509V3_EXT_CRL_add_nconf(const CONF *conf, const X509V3_CTX *ctx, + const char *section, X509_CRL *crl) { + STACK_OF(X509_EXTENSION) **sk = nullptr; + if (crl) { + sk = &crl->crl->extensions; + } + return X509V3_EXT_add_nconf_sk(conf, ctx, section, sk); +} + +// Add extensions to certificate request + +int X509V3_EXT_REQ_add_nconf(const CONF *conf, const X509V3_CTX *ctx, + const char *section, X509_REQ *req) { + STACK_OF(X509_EXTENSION) *extlist = nullptr, **sk = nullptr; + int i; + if (req) { + sk = &extlist; + } + i = X509V3_EXT_add_nconf_sk(conf, ctx, section, sk); + if (!i || !sk) { + return i; + } + i = X509_REQ_add_extensions(req, extlist); + sk_X509_EXTENSION_pop_free(extlist, X509_EXTENSION_free); + return i; +} + +// Config database functions + +const STACK_OF(CONF_VALUE) *bssl::X509V3_get_section(const X509V3_CTX *ctx, + const char *section) { + if (ctx->db == nullptr) { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_OPERATION_NOT_DEFINED); + return nullptr; + } + return NCONF_get_section(ctx->db, 
section); +} + +void X509V3_set_nconf(X509V3_CTX *ctx, const CONF *conf) { ctx->db = conf; } + +void X509V3_set_ctx(X509V3_CTX *ctx, const X509 *issuer, const X509 *subj, + const X509_REQ *req, const X509_CRL *crl, int flags) { + OPENSSL_memset(ctx, 0, sizeof(*ctx)); + ctx->issuer_cert = issuer; + ctx->subject_cert = subj; + ctx->crl = crl; + ctx->subject_req = req; + ctx->flags = flags; +} diff --git a/third_party/boringssl/src/crypto/x509/v3_cpols.cc b/third_party/boringssl/src/crypto/x509/v3_cpols.cc new file mode 100644 index 00000000..f69c82d6 --- /dev/null +++ b/third_party/boringssl/src/crypto/x509/v3_cpols.cc @@ -0,0 +1,441 @@ +// Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + + +using namespace bssl; + +// Certificate policies extension support: this one is a bit complex... 
+ +static int i2r_certpol(const X509V3_EXT_METHOD *method, void *ext, BIO *out, + int indent); +static void *r2i_certpol(const X509V3_EXT_METHOD *method, const X509V3_CTX *ctx, + const char *value); +static void print_qualifiers(BIO *out, const STACK_OF(POLICYQUALINFO) *quals, + int indent); +static void print_notice(BIO *out, const USERNOTICE *notice, int indent); +static POLICYINFO *policy_section(const X509V3_CTX *ctx, + const STACK_OF(CONF_VALUE) *polstrs, + int ia5org); +static POLICYQUALINFO *notice_section(const X509V3_CTX *ctx, + const STACK_OF(CONF_VALUE) *unot, + int ia5org); +static int nref_nos(STACK_OF(ASN1_INTEGER) *nnums, + const STACK_OF(CONF_VALUE) *nos); + +const X509V3_EXT_METHOD bssl::v3_cpols = { + NID_certificate_policies, + 0, + ASN1_ITEM_ref(CERTIFICATEPOLICIES), + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + i2r_certpol, + r2i_certpol, + nullptr, +}; + +ASN1_SEQUENCE(NOTICEREF) = { + ASN1_SIMPLE(NOTICEREF, organization, DISPLAYTEXT), + ASN1_SEQUENCE_OF(NOTICEREF, noticenos, ASN1_INTEGER), +} ASN1_SEQUENCE_END(NOTICEREF) + +IMPLEMENT_ASN1_ALLOC_FUNCTIONS(NOTICEREF) + +ASN1_SEQUENCE(USERNOTICE) = { + ASN1_OPT(USERNOTICE, noticeref, NOTICEREF), + ASN1_OPT(USERNOTICE, exptext, DISPLAYTEXT), +} ASN1_SEQUENCE_END(USERNOTICE) + +IMPLEMENT_ASN1_ALLOC_FUNCTIONS(USERNOTICE) + +ASN1_ADB_TEMPLATE(policydefault) = ASN1_SIMPLE(POLICYQUALINFO, d.other, + ASN1_ANY); + +ASN1_ADB(POLICYQUALINFO) = { + ADB_ENTRY(NID_id_qt_cps, + ASN1_SIMPLE(POLICYQUALINFO, d.cpsuri, ASN1_IA5STRING)), + ADB_ENTRY(NID_id_qt_unotice, + ASN1_SIMPLE(POLICYQUALINFO, d.usernotice, USERNOTICE)), +} ASN1_ADB_END(POLICYQUALINFO, 0, pqualid, 0, &policydefault_tt, NULL); + +ASN1_SEQUENCE(POLICYQUALINFO) = { + ASN1_SIMPLE(POLICYQUALINFO, pqualid, ASN1_OBJECT), + ASN1_ADB_OBJECT(POLICYQUALINFO), +} ASN1_SEQUENCE_END(POLICYQUALINFO) + +IMPLEMENT_ASN1_ALLOC_FUNCTIONS(POLICYQUALINFO) + +ASN1_SEQUENCE(POLICYINFO) = { + ASN1_SIMPLE(POLICYINFO, 
policyid, ASN1_OBJECT), + ASN1_SEQUENCE_OF_OPT(POLICYINFO, qualifiers, POLICYQUALINFO), +} ASN1_SEQUENCE_END(POLICYINFO) + +IMPLEMENT_ASN1_ALLOC_FUNCTIONS(POLICYINFO) + +ASN1_ITEM_TEMPLATE(CERTIFICATEPOLICIES) = ASN1_EX_TEMPLATE_TYPE( + ASN1_TFLG_SEQUENCE_OF, 0, CERTIFICATEPOLICIES, POLICYINFO) +ASN1_ITEM_TEMPLATE_END(CERTIFICATEPOLICIES) + +IMPLEMENT_ASN1_FUNCTIONS_const(CERTIFICATEPOLICIES) + +static void *r2i_certpol(const X509V3_EXT_METHOD *method, const X509V3_CTX *ctx, + const char *value) { + STACK_OF(POLICYINFO) *pols = sk_POLICYINFO_new_null(); + if (pols == nullptr) { + return nullptr; + } + STACK_OF(CONF_VALUE) *vals = X509V3_parse_list(value); + + { + if (vals == nullptr) { + OPENSSL_PUT_ERROR(X509V3, ERR_R_X509V3_LIB); + goto err; + } + int ia5org = 0; + for (size_t i = 0; i < sk_CONF_VALUE_num(vals); i++) { + const CONF_VALUE *cnf = sk_CONF_VALUE_value(vals, i); + if (cnf->value || !cnf->name) { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_INVALID_POLICY_IDENTIFIER); + X509V3_conf_err(cnf); + goto err; + } + POLICYINFO *pol; + const char *pstr = cnf->name; + if (!strcmp(pstr, "ia5org")) { + ia5org = 1; + continue; + } else if (*pstr == '@') { + const STACK_OF(CONF_VALUE) *polsect = X509V3_get_section(ctx, pstr + 1); + if (!polsect) { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_INVALID_SECTION); + + X509V3_conf_err(cnf); + goto err; + } + pol = policy_section(ctx, polsect, ia5org); + if (!pol) { + goto err; + } + } else { + ASN1_OBJECT *pobj = OBJ_txt2obj(cnf->name, 0); + if (pobj == nullptr) { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_INVALID_OBJECT_IDENTIFIER); + X509V3_conf_err(cnf); + goto err; + } + pol = POLICYINFO_new(); + if (pol == nullptr) { + ASN1_OBJECT_free(pobj); + goto err; + } + pol->policyid = pobj; + } + if (!sk_POLICYINFO_push(pols, pol)) { + POLICYINFO_free(pol); + goto err; + } + } + sk_CONF_VALUE_pop_free(vals, X509V3_conf_free); + return pols; + } + +err: + sk_CONF_VALUE_pop_free(vals, X509V3_conf_free); + sk_POLICYINFO_pop_free(pols, 
POLICYINFO_free); + return nullptr; +} + +static POLICYINFO *policy_section(const X509V3_CTX *ctx, + const STACK_OF(CONF_VALUE) *polstrs, + int ia5org) { + POLICYINFO *pol; + POLICYQUALINFO *qual; + if (!(pol = POLICYINFO_new())) { + goto err; + } + for (size_t i = 0; i < sk_CONF_VALUE_num(polstrs); i++) { + const CONF_VALUE *cnf = sk_CONF_VALUE_value(polstrs, i); + if (!strcmp(cnf->name, "policyIdentifier")) { + ASN1_OBJECT *pobj; + if (!(pobj = OBJ_txt2obj(cnf->value, 0))) { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_INVALID_OBJECT_IDENTIFIER); + X509V3_conf_err(cnf); + goto err; + } + pol->policyid = pobj; + + } else if (x509v3_conf_name_matches(cnf->name, "CPS")) { + if (!pol->qualifiers) { + pol->qualifiers = sk_POLICYQUALINFO_new_null(); + } + if (!(qual = POLICYQUALINFO_new())) { + goto err; + } + if (!sk_POLICYQUALINFO_push(pol->qualifiers, qual)) { + goto err; + } + qual->pqualid = OBJ_nid2obj(NID_id_qt_cps); + if (qual->pqualid == nullptr) { + OPENSSL_PUT_ERROR(X509V3, ERR_R_INTERNAL_ERROR); + goto err; + } + qual->d.cpsuri = ASN1_IA5STRING_new(); + if (qual->d.cpsuri == nullptr) { + goto err; + } + if (!ASN1_STRING_set(qual->d.cpsuri, cnf->value, strlen(cnf->value))) { + goto err; + } + } else if (x509v3_conf_name_matches(cnf->name, "userNotice")) { + if (*cnf->value != '@') { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_EXPECTED_A_SECTION_NAME); + X509V3_conf_err(cnf); + goto err; + } + const STACK_OF(CONF_VALUE) *unot = + X509V3_get_section(ctx, cnf->value + 1); + if (!unot) { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_INVALID_SECTION); + X509V3_conf_err(cnf); + goto err; + } + qual = notice_section(ctx, unot, ia5org); + if (!qual) { + goto err; + } + if (!pol->qualifiers) { + pol->qualifiers = sk_POLICYQUALINFO_new_null(); + } + if (!sk_POLICYQUALINFO_push(pol->qualifiers, qual)) { + goto err; + } + } else { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_INVALID_OPTION); + + X509V3_conf_err(cnf); + goto err; + } + } + if (!pol->policyid) { + OPENSSL_PUT_ERROR(X509V3, 
X509V3_R_NO_POLICY_IDENTIFIER); + goto err; + } + + return pol; + +err: + POLICYINFO_free(pol); + return nullptr; +} + +static POLICYQUALINFO *notice_section(const X509V3_CTX *ctx, + const STACK_OF(CONF_VALUE) *unot, + int ia5org) { + USERNOTICE *notice; + POLICYQUALINFO *qual; + if (!(qual = POLICYQUALINFO_new())) { + goto err; + } + qual->pqualid = OBJ_nid2obj(NID_id_qt_unotice); + if (qual->pqualid == nullptr) { + OPENSSL_PUT_ERROR(X509V3, ERR_R_INTERNAL_ERROR); + goto err; + } + if (!(notice = USERNOTICE_new())) { + goto err; + } + qual->d.usernotice = notice; + for (size_t i = 0; i < sk_CONF_VALUE_num(unot); i++) { + const CONF_VALUE *cnf = sk_CONF_VALUE_value(unot, i); + if (!strcmp(cnf->name, "explicitText")) { + notice->exptext = ASN1_VISIBLESTRING_new(); + if (notice->exptext == nullptr) { + goto err; + } + if (!ASN1_STRING_set(notice->exptext, cnf->value, strlen(cnf->value))) { + goto err; + } + } else if (!strcmp(cnf->name, "organization")) { + NOTICEREF *nref; + if (!notice->noticeref) { + if (!(nref = NOTICEREF_new())) { + goto err; + } + notice->noticeref = nref; + } else { + nref = notice->noticeref; + } + if (ia5org) { + nref->organization->type = V_ASN1_IA5STRING; + } else { + nref->organization->type = V_ASN1_VISIBLESTRING; + } + if (!ASN1_STRING_set(nref->organization, cnf->value, + strlen(cnf->value))) { + goto err; + } + } else if (!strcmp(cnf->name, "noticeNumbers")) { + NOTICEREF *nref; + STACK_OF(CONF_VALUE) *nos; + if (!notice->noticeref) { + if (!(nref = NOTICEREF_new())) { + goto err; + } + notice->noticeref = nref; + } else { + nref = notice->noticeref; + } + nos = X509V3_parse_list(cnf->value); + if (!nos || !sk_CONF_VALUE_num(nos)) { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_INVALID_NUMBERS); + X509V3_conf_err(cnf); + sk_CONF_VALUE_pop_free(nos, X509V3_conf_free); + goto err; + } + int ret = nref_nos(nref->noticenos, nos); + sk_CONF_VALUE_pop_free(nos, X509V3_conf_free); + if (!ret) { + goto err; + } + } else { + OPENSSL_PUT_ERROR(X509V3, 
X509V3_R_INVALID_OPTION); + X509V3_conf_err(cnf); + goto err; + } + } + + if (notice->noticeref && + (!notice->noticeref->noticenos || !notice->noticeref->organization)) { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_NEED_ORGANIZATION_AND_NUMBERS); + goto err; + } + + return qual; + +err: + POLICYQUALINFO_free(qual); + return nullptr; +} + +static int nref_nos(STACK_OF(ASN1_INTEGER) *nnums, + const STACK_OF(CONF_VALUE) *nos) { + for (size_t i = 0; i < sk_CONF_VALUE_num(nos); i++) { + const CONF_VALUE *cnf = sk_CONF_VALUE_value(nos, i); + ASN1_INTEGER *aint = s2i_ASN1_INTEGER(nullptr, cnf->name); + if (aint == nullptr) { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_INVALID_NUMBER); + return 0; + } + if (!sk_ASN1_INTEGER_push(nnums, aint)) { + ASN1_INTEGER_free(aint); + return 0; + } + } + return 1; +} + +static int i2r_certpol(const X509V3_EXT_METHOD *method, void *ext, BIO *out, + int indent) { + const STACK_OF(POLICYINFO) *pol = + reinterpret_cast(ext); + // First print out the policy OIDs + for (size_t i = 0; i < sk_POLICYINFO_num(pol); i++) { + const POLICYINFO *pinfo = sk_POLICYINFO_value(pol, i); + BIO_printf(out, "%*sPolicy: ", indent, ""); + i2a_ASN1_OBJECT(out, pinfo->policyid); + BIO_puts(out, "\n"); + if (pinfo->qualifiers) { + print_qualifiers(out, pinfo->qualifiers, indent + 2); + } + } + return 1; +} + +static void print_qualifiers(BIO *out, const STACK_OF(POLICYQUALINFO) *quals, + int indent) { + for (size_t i = 0; i < sk_POLICYQUALINFO_num(quals); i++) { + const POLICYQUALINFO *qualinfo = sk_POLICYQUALINFO_value(quals, i); + switch (OBJ_obj2nid(qualinfo->pqualid)) { + case NID_id_qt_cps: + BIO_printf(out, "%*sCPS: %.*s\n", indent, "", + qualinfo->d.cpsuri->length, qualinfo->d.cpsuri->data); + break; + + case NID_id_qt_unotice: + BIO_printf(out, "%*sUser Notice:\n", indent, ""); + print_notice(out, qualinfo->d.usernotice, indent + 2); + break; + + default: + BIO_printf(out, "%*sUnknown Qualifier: ", indent + 2, ""); + + i2a_ASN1_OBJECT(out, qualinfo->pqualid); + 
BIO_puts(out, "\n"); + break; + } + } +} + +static void print_notice(BIO *out, const USERNOTICE *notice, int indent) { + if (notice->noticeref) { + NOTICEREF *ref; + ref = notice->noticeref; + BIO_printf(out, "%*sOrganization: %.*s\n", indent, "", + ref->organization->length, ref->organization->data); + BIO_printf(out, "%*sNumber%s: ", indent, "", + sk_ASN1_INTEGER_num(ref->noticenos) > 1 ? "s" : ""); + for (size_t i = 0; i < sk_ASN1_INTEGER_num(ref->noticenos); i++) { + ASN1_INTEGER *num; + char *tmp; + num = sk_ASN1_INTEGER_value(ref->noticenos, i); + if (i) { + BIO_puts(out, ", "); + } + if (num == nullptr) { + BIO_puts(out, "(null)"); + } else { + tmp = i2s_ASN1_INTEGER(nullptr, num); + if (tmp == nullptr) { + return; + } + BIO_puts(out, tmp); + OPENSSL_free(tmp); + } + } + BIO_puts(out, "\n"); + } + if (notice->exptext) { + BIO_printf(out, "%*sExplicit Text: %.*s\n", indent, "", + notice->exptext->length, notice->exptext->data); + } +} diff --git a/third_party/boringssl/src/crypto/x509/v3_crld.cc b/third_party/boringssl/src/crypto/x509/v3_crld.cc new file mode 100644 index 00000000..61225ad4 --- /dev/null +++ b/third_party/boringssl/src/crypto/x509/v3_crld.cc @@ -0,0 +1,551 @@ +// Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "../internal.h" +#include "internal.h" + + +using namespace bssl; + +static void *v2i_crld(const X509V3_EXT_METHOD *method, const X509V3_CTX *ctx, + const STACK_OF(CONF_VALUE) *nval); +static int i2r_crldp(const X509V3_EXT_METHOD *method, void *pcrldp, BIO *out, + int indent); + +const X509V3_EXT_METHOD bssl::v3_crld = { + NID_crl_distribution_points, + 0, + ASN1_ITEM_ref(CRL_DIST_POINTS), + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + v2i_crld, + i2r_crldp, + nullptr, + nullptr, +}; + +const X509V3_EXT_METHOD bssl::v3_freshest_crl = { + NID_freshest_crl, 0, ASN1_ITEM_ref(CRL_DIST_POINTS), + nullptr, nullptr, nullptr, + nullptr, nullptr, nullptr, + nullptr, v2i_crld, i2r_crldp, + nullptr, nullptr, +}; + +static STACK_OF(GENERAL_NAME) *gnames_from_sectname(const X509V3_CTX *ctx, + char *sect) { + const STACK_OF(CONF_VALUE) *gnsect; + STACK_OF(CONF_VALUE) *gnsect_owned = nullptr; + if (*sect == '@') { + gnsect = X509V3_get_section(ctx, sect + 1); + } else { + gnsect_owned = X509V3_parse_list(sect); + gnsect = gnsect_owned; + } + if (!gnsect) { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_SECTION_NOT_FOUND); + return nullptr; + } + STACK_OF(GENERAL_NAME) *gens = v2i_GENERAL_NAMES(nullptr, ctx, gnsect); + sk_CONF_VALUE_pop_free(gnsect_owned, X509V3_conf_free); + return gens; +} + +// set_dist_point_name decodes a DistributionPointName from |cnf| and writes the +// result in |*pdp|. It returns 1 on success, -1 on error, and 0 if |cnf| used +// an unrecognized input type. The zero return can be used by callers to support +// additional syntax. 
+static int set_dist_point_name(DIST_POINT_NAME **pdp, const X509V3_CTX *ctx, + const CONF_VALUE *cnf) { + STACK_OF(GENERAL_NAME) *fnm = nullptr; + STACK_OF(X509_NAME_ENTRY) *rnm = nullptr; + if (!strcmp(cnf->name, "fullname")) { + // If |cnf| comes from |X509V3_parse_list|, which is possible for a v2i + // function, |cnf->value| may be NULL. + if (cnf->value == nullptr) { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_MISSING_VALUE); + return -1; + } + fnm = gnames_from_sectname(ctx, cnf->value); + if (!fnm) { + goto err; + } + } else if (!strcmp(cnf->name, "relativename")) { + // If |cnf| comes from |X509V3_parse_list|, which is possible for a v2i + // function, |cnf->value| may be NULL. + if (cnf->value == nullptr) { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_MISSING_VALUE); + return -1; + } + const STACK_OF(CONF_VALUE) *dnsect = X509V3_get_section(ctx, cnf->value); + if (!dnsect) { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_SECTION_NOT_FOUND); + return -1; + } + X509_NAME *nm = X509_NAME_new(); + if (!nm) { + return -1; + } + auto *impl = FromOpaque(nm); + int ret = X509V3_NAME_from_section(impl, dnsect, MBSTRING_ASC); + rnm = impl->entries; + impl->entries = nullptr; + X509_NAME_free(nm); + if (!ret || sk_X509_NAME_ENTRY_num(rnm) <= 0) { + goto err; + } + // There can only be one RDN in nameRelativeToCRLIssuer. 
+ if (sk_X509_NAME_ENTRY_value(rnm, sk_X509_NAME_ENTRY_num(rnm) - 1)->set) { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_INVALID_MULTIPLE_RDNS); + goto err; + } + } else { + return 0; + } + + if (*pdp) { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_DISTPOINT_ALREADY_SET); + goto err; + } + + *pdp = DIST_POINT_NAME_new(); + if (!*pdp) { + goto err; + } + if (fnm) { + (*pdp)->type = 0; + (*pdp)->name.fullname = fnm; + } else { + (*pdp)->type = 1; + (*pdp)->name.relativename = rnm; + } + + return 1; + +err: + sk_GENERAL_NAME_pop_free(fnm, GENERAL_NAME_free); + sk_X509_NAME_ENTRY_pop_free(rnm, X509_NAME_ENTRY_free); + return -1; +} + +static const BIT_STRING_BITNAME reason_flags[] = { + {0, "Unused", "unused"}, + {1, "Key Compromise", "keyCompromise"}, + {2, "CA Compromise", "CACompromise"}, + {3, "Affiliation Changed", "affiliationChanged"}, + {4, "Superseded", "superseded"}, + {5, "Cessation Of Operation", "cessationOfOperation"}, + {6, "Certificate Hold", "certificateHold"}, + {7, "Privilege Withdrawn", "privilegeWithdrawn"}, + {8, "AA Compromise", "AACompromise"}, + {-1, nullptr, nullptr}}; + +static int set_reasons(ASN1_BIT_STRING **preas, const char *value) { + if (*preas) { + // Duplicate "reasons" or "onlysomereasons" key. 
+ OPENSSL_PUT_ERROR(X509V3, X509V3_R_INVALID_VALUE); + return 0; + } + int ret = 0; + STACK_OF(CONF_VALUE) *rsk = X509V3_parse_list(value); + if (!rsk) { + return 0; + } + for (size_t i = 0; i < sk_CONF_VALUE_num(rsk); i++) { + const char *bnam = sk_CONF_VALUE_value(rsk, i)->name; + if (!*preas) { + *preas = ASN1_BIT_STRING_new(); + if (!*preas) { + goto err; + } + } + const BIT_STRING_BITNAME *pbn; + for (pbn = reason_flags; pbn->lname; pbn++) { + if (!strcmp(pbn->sname, bnam)) { + if (!ASN1_BIT_STRING_set_bit(*preas, pbn->bitnum, 1)) { + goto err; + } + break; + } + } + if (!pbn->lname) { + goto err; + } + } + ret = 1; + +err: + sk_CONF_VALUE_pop_free(rsk, X509V3_conf_free); + return ret; +} + +static int print_reasons(BIO *out, const char *rname, ASN1_BIT_STRING *rflags, + int indent) { + int first = 1; + const BIT_STRING_BITNAME *pbn; + BIO_printf(out, "%*s%s:\n%*s", indent, "", rname, indent + 2, ""); + for (pbn = reason_flags; pbn->lname; pbn++) { + if (ASN1_BIT_STRING_get_bit(rflags, pbn->bitnum)) { + if (first) { + first = 0; + } else { + BIO_puts(out, ", "); + } + BIO_puts(out, pbn->lname); + } + } + if (first) { + BIO_puts(out, "\n"); + } else { + BIO_puts(out, "\n"); + } + return 1; +} + +static DIST_POINT *crldp_from_section(const X509V3_CTX *ctx, + const STACK_OF(CONF_VALUE) *nval) { + DIST_POINT *point = nullptr; + point = DIST_POINT_new(); + if (!point) { + goto err; + } + for (size_t i = 0; i < sk_CONF_VALUE_num(nval); i++) { + const CONF_VALUE *cnf = sk_CONF_VALUE_value(nval, i); + int ret = set_dist_point_name(&point->distpoint, ctx, cnf); + if (ret > 0) { + continue; + } + if (ret < 0) { + goto err; + } + if (!strcmp(cnf->name, "reasons")) { + if (!set_reasons(&point->reasons, cnf->value)) { + goto err; + } + } else if (!strcmp(cnf->name, "CRLissuer")) { + GENERAL_NAMES_free(point->CRLissuer); + point->CRLissuer = gnames_from_sectname(ctx, cnf->value); + if (!point->CRLissuer) { + goto err; + } + } + } + + return point; + +err: + 
DIST_POINT_free(point); + return nullptr; +} + +static void *v2i_crld(const X509V3_EXT_METHOD *method, const X509V3_CTX *ctx, + const STACK_OF(CONF_VALUE) *nval) { + STACK_OF(DIST_POINT) *crld = nullptr; + GENERAL_NAMES *gens = nullptr; + GENERAL_NAME *gen = nullptr; + if (!(crld = sk_DIST_POINT_new_null())) { + goto err; + } + for (size_t i = 0; i < sk_CONF_VALUE_num(nval); i++) { + DIST_POINT *point; + const CONF_VALUE *cnf = sk_CONF_VALUE_value(nval, i); + if (!cnf->value) { + const STACK_OF(CONF_VALUE) *dpsect = X509V3_get_section(ctx, cnf->name); + if (!dpsect) { + goto err; + } + point = crldp_from_section(ctx, dpsect); + if (!point) { + goto err; + } + if (!sk_DIST_POINT_push(crld, point)) { + DIST_POINT_free(point); + goto err; + } + } else { + if (!(gen = v2i_GENERAL_NAME(method, ctx, cnf))) { + goto err; + } + if (!(gens = GENERAL_NAMES_new())) { + goto err; + } + if (!sk_GENERAL_NAME_push(gens, gen)) { + goto err; + } + gen = nullptr; + if (!(point = DIST_POINT_new())) { + goto err; + } + if (!sk_DIST_POINT_push(crld, point)) { + DIST_POINT_free(point); + goto err; + } + if (!(point->distpoint = DIST_POINT_NAME_new())) { + goto err; + } + point->distpoint->name.fullname = gens; + point->distpoint->type = 0; + gens = nullptr; + } + } + return crld; + +err: + GENERAL_NAME_free(gen); + GENERAL_NAMES_free(gens); + sk_DIST_POINT_pop_free(crld, DIST_POINT_free); + return nullptr; +} + +static int dpn_cb(int operation, ASN1_VALUE **pval, const ASN1_ITEM *it, + void *exarg) { + DIST_POINT_NAME *dpn = (DIST_POINT_NAME *)*pval; + + switch (operation) { + case ASN1_OP_NEW_POST: + dpn->dpname = nullptr; + break; + + case ASN1_OP_FREE_POST: + X509_NAME_free(dpn->dpname); + break; + } + return 1; +} + + +ASN1_CHOICE_cb(DIST_POINT_NAME, dpn_cb) = { + ASN1_IMP_SEQUENCE_OF(DIST_POINT_NAME, name.fullname, GENERAL_NAME, 0), + ASN1_IMP_SET_OF(DIST_POINT_NAME, name.relativename, X509_NAME_ENTRY, 1), +} ASN1_CHOICE_END_cb(DIST_POINT_NAME, DIST_POINT_NAME, type) + 
+IMPLEMENT_ASN1_ALLOC_FUNCTIONS(DIST_POINT_NAME) + +ASN1_SEQUENCE(DIST_POINT) = { + ASN1_EXP_OPT(DIST_POINT, distpoint, DIST_POINT_NAME, 0), + ASN1_IMP_OPT(DIST_POINT, reasons, ASN1_BIT_STRING, 1), + ASN1_IMP_SEQUENCE_OF_OPT(DIST_POINT, CRLissuer, GENERAL_NAME, 2), +} ASN1_SEQUENCE_END(DIST_POINT) + +IMPLEMENT_ASN1_ALLOC_FUNCTIONS(DIST_POINT) + +ASN1_ITEM_TEMPLATE(CRL_DIST_POINTS) = ASN1_EX_TEMPLATE_TYPE( + ASN1_TFLG_SEQUENCE_OF, 0, CRLDistributionPoints, DIST_POINT) +ASN1_ITEM_TEMPLATE_END(CRL_DIST_POINTS) + +IMPLEMENT_ASN1_FUNCTIONS_const(CRL_DIST_POINTS) + +ASN1_SEQUENCE(ISSUING_DIST_POINT) = { + ASN1_EXP_OPT(ISSUING_DIST_POINT, distpoint, DIST_POINT_NAME, 0), + ASN1_IMP_OPT(ISSUING_DIST_POINT, onlyuser, ASN1_FBOOLEAN, 1), + ASN1_IMP_OPT(ISSUING_DIST_POINT, onlyCA, ASN1_FBOOLEAN, 2), + ASN1_IMP_OPT(ISSUING_DIST_POINT, onlysomereasons, ASN1_BIT_STRING, 3), + ASN1_IMP_OPT(ISSUING_DIST_POINT, indirectCRL, ASN1_FBOOLEAN, 4), + ASN1_IMP_OPT(ISSUING_DIST_POINT, onlyattr, ASN1_FBOOLEAN, 5), +} ASN1_SEQUENCE_END(ISSUING_DIST_POINT) + +IMPLEMENT_ASN1_FUNCTIONS_const(ISSUING_DIST_POINT) + +static int i2r_idp(const X509V3_EXT_METHOD *method, void *pidp, BIO *out, + int indent); +static void *v2i_idp(const X509V3_EXT_METHOD *method, const X509V3_CTX *ctx, + const STACK_OF(CONF_VALUE) *nval); + +const X509V3_EXT_METHOD bssl::v3_idp = { + NID_issuing_distribution_point, + X509V3_EXT_MULTILINE, + ASN1_ITEM_ref(ISSUING_DIST_POINT), + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + v2i_idp, + i2r_idp, + nullptr, + nullptr, +}; + +static void *v2i_idp(const X509V3_EXT_METHOD *method, const X509V3_CTX *ctx, + const STACK_OF(CONF_VALUE) *nval) { + ISSUING_DIST_POINT *idp = ISSUING_DIST_POINT_new(); + if (!idp) { + goto err; + } + for (size_t i = 0; i < sk_CONF_VALUE_num(nval); i++) { + const CONF_VALUE *cnf = sk_CONF_VALUE_value(nval, i); + const char *name = cnf->name; + const char *val = cnf->value; + int ret = set_dist_point_name(&idp->distpoint, 
ctx, cnf); + if (ret > 0) { + continue; + } + if (ret < 0) { + goto err; + } + if (!strcmp(name, "onlyuser")) { + if (!X509V3_get_value_bool(cnf, &idp->onlyuser)) { + goto err; + } + } else if (!strcmp(name, "onlyCA")) { + if (!X509V3_get_value_bool(cnf, &idp->onlyCA)) { + goto err; + } + } else if (!strcmp(name, "onlyAA")) { + if (!X509V3_get_value_bool(cnf, &idp->onlyattr)) { + goto err; + } + } else if (!strcmp(name, "indirectCRL")) { + if (!X509V3_get_value_bool(cnf, &idp->indirectCRL)) { + goto err; + } + } else if (!strcmp(name, "onlysomereasons")) { + if (!set_reasons(&idp->onlysomereasons, val)) { + goto err; + } + } else { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_INVALID_NAME); + X509V3_conf_err(cnf); + goto err; + } + } + return idp; + +err: + ISSUING_DIST_POINT_free(idp); + return nullptr; +} + +static int print_gens(BIO *out, STACK_OF(GENERAL_NAME) *gens, int indent) { + size_t i; + for (i = 0; i < sk_GENERAL_NAME_num(gens); i++) { + BIO_printf(out, "%*s", indent + 2, ""); + GENERAL_NAME_print(out, sk_GENERAL_NAME_value(gens, i)); + BIO_puts(out, "\n"); + } + return 1; +} + +static int print_distpoint(BIO *out, DIST_POINT_NAME *dpn, int indent) { + if (dpn->type == 0) { + BIO_printf(out, "%*sFull Name:\n", indent, ""); + print_gens(out, dpn->name.fullname, indent); + } else { + X509Name ntmp; + ntmp.entries = dpn->name.relativename; + BIO_printf(out, "%*sRelative Name:\n%*s", indent, "", indent + 2, ""); + X509_NAME_print_ex(out, &ntmp, 0, XN_FLAG_ONELINE); + BIO_puts(out, "\n"); + } + return 1; +} + +static int i2r_idp(const X509V3_EXT_METHOD *method, void *pidp, BIO *out, + int indent) { + ISSUING_DIST_POINT *idp = reinterpret_cast(pidp); + if (idp->distpoint) { + print_distpoint(out, idp->distpoint, indent); + } + if (idp->onlyuser > 0) { + BIO_printf(out, "%*sOnly User Certificates\n", indent, ""); + } + if (idp->onlyCA > 0) { + BIO_printf(out, "%*sOnly CA Certificates\n", indent, ""); + } + if (idp->indirectCRL > 0) { + BIO_printf(out, "%*sIndirect 
CRL\n", indent, ""); + } + if (idp->onlysomereasons) { + print_reasons(out, "Only Some Reasons", idp->onlysomereasons, indent); + } + if (idp->onlyattr > 0) { + BIO_printf(out, "%*sOnly Attribute Certificates\n", indent, ""); + } + if (!idp->distpoint && (idp->onlyuser <= 0) && (idp->onlyCA <= 0) && + (idp->indirectCRL <= 0) && !idp->onlysomereasons && + (idp->onlyattr <= 0)) { + BIO_printf(out, "%*s\n", indent, ""); + } + + return 1; +} + +static int i2r_crldp(const X509V3_EXT_METHOD *method, void *pcrldp, BIO *out, + int indent) { + STACK_OF(DIST_POINT) *crld = reinterpret_cast(pcrldp); + DIST_POINT *point; + size_t i; + for (i = 0; i < sk_DIST_POINT_num(crld); i++) { + BIO_puts(out, "\n"); + point = sk_DIST_POINT_value(crld, i); + if (point->distpoint) { + print_distpoint(out, point->distpoint, indent); + } + if (point->reasons) { + print_reasons(out, "Reasons", point->reasons, indent); + } + if (point->CRLissuer) { + BIO_printf(out, "%*sCRL Issuer:\n", indent, ""); + print_gens(out, point->CRLissuer, indent); + } + } + return 1; +} + +int bssl::DIST_POINT_set_dpname(DIST_POINT_NAME *dpn, X509_NAME *iname) { + size_t i; + STACK_OF(X509_NAME_ENTRY) *frag; + X509_NAME_ENTRY *ne; + if (!dpn || (dpn->type != 1)) { + return 1; + } + frag = dpn->name.relativename; + dpn->dpname = X509_NAME_dup(iname); + if (!dpn->dpname) { + return 0; + } + for (i = 0; i < sk_X509_NAME_ENTRY_num(frag); i++) { + ne = sk_X509_NAME_ENTRY_value(frag, i); + if (!X509_NAME_add_entry(dpn->dpname, ne, -1, i ? 
0 : 1)) { + X509_NAME_free(dpn->dpname); + dpn->dpname = nullptr; + return 0; + } + } + // generate cached encoding of name + if (i2d_X509_NAME(dpn->dpname, nullptr) < 0) { + X509_NAME_free(dpn->dpname); + dpn->dpname = nullptr; + return 0; + } + return 1; +} diff --git a/third_party/boringssl/src/crypto/x509/v3_enum.cc b/third_party/boringssl/src/crypto/x509/v3_enum.cc new file mode 100644 index 00000000..41440fe7 --- /dev/null +++ b/third_party/boringssl/src/crypto/x509/v3_enum.cc @@ -0,0 +1,74 @@ +// Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include +#include +#include + +#include "internal.h" + + +using namespace bssl; + +typedef BIT_STRING_BITNAME ENUMERATED_NAMES; + +static const ENUMERATED_NAMES crl_reasons[] = { + {CRL_REASON_UNSPECIFIED, "Unspecified", "unspecified"}, + {CRL_REASON_KEY_COMPROMISE, "Key Compromise", "keyCompromise"}, + {CRL_REASON_CA_COMPROMISE, "CA Compromise", "CACompromise"}, + {CRL_REASON_AFFILIATION_CHANGED, "Affiliation Changed", + "affiliationChanged"}, + {CRL_REASON_SUPERSEDED, "Superseded", "superseded"}, + {CRL_REASON_CESSATION_OF_OPERATION, "Cessation Of Operation", + "cessationOfOperation"}, + {CRL_REASON_CERTIFICATE_HOLD, "Certificate Hold", "certificateHold"}, + {CRL_REASON_REMOVE_FROM_CRL, "Remove From CRL", "removeFromCRL"}, + {CRL_REASON_PRIVILEGE_WITHDRAWN, "Privilege Withdrawn", + "privilegeWithdrawn"}, + {CRL_REASON_AA_COMPROMISE, "AA Compromise", "AACompromise"}, + {-1, nullptr, nullptr}}; + +static char *i2s_ASN1_ENUMERATED_TABLE(const X509V3_EXT_METHOD *method, + void *ext) { + const ASN1_ENUMERATED *e = reinterpret_cast(ext); + long strval = ASN1_ENUMERATED_get(e); + for (const ENUMERATED_NAMES *enam = + reinterpret_cast(method->usr_data); + enam->lname; enam++) { + if (strval == enam->bitnum) { + return OPENSSL_strdup(enam->lname); + } + } + return i2s_ASN1_ENUMERATED(method, e); +} + +const X509V3_EXT_METHOD bssl::v3_crl_reason = { + NID_crl_reason, + 0, + ASN1_ITEM_ref(ASN1_ENUMERATED), + nullptr, + nullptr, + nullptr, + nullptr, + i2s_ASN1_ENUMERATED_TABLE, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + (void *)crl_reasons, +}; diff --git a/third_party/boringssl/src/crypto/x509/v3_extku.cc b/third_party/boringssl/src/crypto/x509/v3_extku.cc new file mode 100644 index 00000000..60c62382 --- /dev/null +++ b/third_party/boringssl/src/crypto/x509/v3_extku.cc @@ -0,0 +1,97 @@ +// Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include +#include +#include +#include + +#include "internal.h" + + +using namespace bssl; + +static void *v2i_EXTENDED_KEY_USAGE(const X509V3_EXT_METHOD *method, + const X509V3_CTX *ctx, + const STACK_OF(CONF_VALUE) *nval); +static STACK_OF(CONF_VALUE) *i2v_EXTENDED_KEY_USAGE( + const X509V3_EXT_METHOD *method, void *eku, STACK_OF(CONF_VALUE) *extlist); + +const X509V3_EXT_METHOD bssl::v3_ext_ku = { + NID_ext_key_usage, + 0, + ASN1_ITEM_ref(EXTENDED_KEY_USAGE), + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + i2v_EXTENDED_KEY_USAGE, + v2i_EXTENDED_KEY_USAGE, + nullptr, + nullptr, + nullptr, +}; + +ASN1_ITEM_TEMPLATE(EXTENDED_KEY_USAGE) = ASN1_EX_TEMPLATE_TYPE( + ASN1_TFLG_SEQUENCE_OF, 0, EXTENDED_KEY_USAGE, ASN1_OBJECT) +ASN1_ITEM_TEMPLATE_END(EXTENDED_KEY_USAGE) + +IMPLEMENT_ASN1_FUNCTIONS_const(EXTENDED_KEY_USAGE) + +static STACK_OF(CONF_VALUE) *i2v_EXTENDED_KEY_USAGE( + const X509V3_EXT_METHOD *method, void *a, STACK_OF(CONF_VALUE) *ext_list) { + const EXTENDED_KEY_USAGE *eku = + reinterpret_cast(a); + for (size_t i = 0; i < sk_ASN1_OBJECT_num(eku); i++) { + const ASN1_OBJECT *obj = sk_ASN1_OBJECT_value(eku, i); + char obj_tmp[80]; + i2t_ASN1_OBJECT(obj_tmp, 80, obj); + X509V3_add_value(nullptr, obj_tmp, &ext_list); + } + return ext_list; +} + +static void *v2i_EXTENDED_KEY_USAGE(const X509V3_EXT_METHOD *method, + const X509V3_CTX *ctx, + const 
STACK_OF(CONF_VALUE) *nval) { + EXTENDED_KEY_USAGE *extku = sk_ASN1_OBJECT_new_null(); + if (extku == nullptr) { + return nullptr; + } + + for (size_t i = 0; i < sk_CONF_VALUE_num(nval); i++) { + const CONF_VALUE *val = sk_CONF_VALUE_value(nval, i); + const char *extval; + if (val->value) { + extval = val->value; + } else { + extval = val->name; + } + ASN1_OBJECT *obj = OBJ_txt2obj(extval, 0); + if (obj == nullptr || !sk_ASN1_OBJECT_push(extku, obj)) { + ASN1_OBJECT_free(obj); + sk_ASN1_OBJECT_pop_free(extku, ASN1_OBJECT_free); + OPENSSL_PUT_ERROR(X509V3, X509V3_R_INVALID_OBJECT_IDENTIFIER); + X509V3_conf_err(val); + return nullptr; + } + } + + return extku; +} diff --git a/third_party/boringssl/src/crypto/x509/v3_genn.cc b/third_party/boringssl/src/crypto/x509/v3_genn.cc new file mode 100644 index 00000000..67a0743b --- /dev/null +++ b/third_party/boringssl/src/crypto/x509/v3_genn.cc @@ -0,0 +1,238 @@ +// Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include +#include +#include + +#include "internal.h" + + +using namespace bssl; + +ASN1_SEQUENCE(OTHERNAME) = { + ASN1_SIMPLE(OTHERNAME, type_id, ASN1_OBJECT), + // Maybe have a true ANY DEFINED BY later + ASN1_EXP(OTHERNAME, value, ASN1_ANY, 0), +} ASN1_SEQUENCE_END(OTHERNAME) + +IMPLEMENT_ASN1_ALLOC_FUNCTIONS(OTHERNAME) + +ASN1_SEQUENCE(EDIPARTYNAME) = { + // DirectoryString is a CHOICE type, so use explicit tagging. + ASN1_EXP_OPT(EDIPARTYNAME, nameAssigner, DIRECTORYSTRING, 0), + ASN1_EXP(EDIPARTYNAME, partyName, DIRECTORYSTRING, 1), +} ASN1_SEQUENCE_END(EDIPARTYNAME) + +IMPLEMENT_ASN1_ALLOC_FUNCTIONS(EDIPARTYNAME) + +BSSL_NAMESPACE_BEGIN + +ASN1_CHOICE(GENERAL_NAME) = { + ASN1_IMP(GENERAL_NAME, d.otherName, OTHERNAME, GEN_OTHERNAME), + ASN1_IMP(GENERAL_NAME, d.rfc822Name, ASN1_IA5STRING, GEN_EMAIL), + ASN1_IMP(GENERAL_NAME, d.dNSName, ASN1_IA5STRING, GEN_DNS), + // Don't decode this + ASN1_IMP(GENERAL_NAME, d.x400Address, ASN1_SEQUENCE, GEN_X400), + // X509_NAME is a CHOICE type so use EXPLICIT + ASN1_EXP(GENERAL_NAME, d.directoryName, X509_NAME, GEN_DIRNAME), + ASN1_IMP(GENERAL_NAME, d.ediPartyName, EDIPARTYNAME, GEN_EDIPARTY), + ASN1_IMP(GENERAL_NAME, d.uniformResourceIdentifier, ASN1_IA5STRING, + GEN_URI), + ASN1_IMP(GENERAL_NAME, d.iPAddress, ASN1_OCTET_STRING, GEN_IPADD), + ASN1_IMP(GENERAL_NAME, d.registeredID, ASN1_OBJECT, GEN_RID), +} ASN1_CHOICE_END(GENERAL_NAME) + +BSSL_NAMESPACE_END + +IMPLEMENT_ASN1_FUNCTIONS_const(GENERAL_NAME) + +BSSL_NAMESPACE_BEGIN + +ASN1_ITEM_TEMPLATE(GENERAL_NAMES) = ASN1_EX_TEMPLATE_TYPE(ASN1_TFLG_SEQUENCE_OF, + 0, GeneralNames, + GENERAL_NAME) +ASN1_ITEM_TEMPLATE_END(GENERAL_NAMES) + +BSSL_NAMESPACE_END + +IMPLEMENT_ASN1_FUNCTIONS_const(GENERAL_NAMES) + +IMPLEMENT_ASN1_DUP_FUNCTION_const(GENERAL_NAME) + +static int edipartyname_cmp(const EDIPARTYNAME *a, const EDIPARTYNAME *b) { + // nameAssigner is optional and may be NULL. 
+ if (a->nameAssigner == nullptr) { + if (b->nameAssigner != nullptr) { + return -1; + } + } else { + if (b->nameAssigner == nullptr || + ASN1_STRING_cmp(a->nameAssigner, b->nameAssigner) != 0) { + return -1; + } + } + + // partyName may not be NULL. + return ASN1_STRING_cmp(a->partyName, b->partyName); +} + +// Returns 0 if they are equal, != 0 otherwise. +static int othername_cmp(const OTHERNAME *a, const OTHERNAME *b) { + int result = -1; + + if (!a || !b) { + return -1; + } + // Check their type first. + if ((result = OBJ_cmp(a->type_id, b->type_id)) != 0) { + return result; + } + // Check the value. + result = ASN1_TYPE_cmp(a->value, b->value); + return result; +} + +// Returns 0 if they are equal, != 0 otherwise. +int bssl::GENERAL_NAME_cmp(const GENERAL_NAME *a, const GENERAL_NAME *b) { + if (!a || !b || a->type != b->type) { + return -1; + } + + switch (a->type) { + case GEN_X400: + return ASN1_STRING_cmp(a->d.x400Address, b->d.x400Address); + + case GEN_EDIPARTY: + return edipartyname_cmp(a->d.ediPartyName, b->d.ediPartyName); + + case GEN_OTHERNAME: + return othername_cmp(a->d.otherName, b->d.otherName); + + case GEN_EMAIL: + case GEN_DNS: + case GEN_URI: + return ASN1_STRING_cmp(a->d.ia5, b->d.ia5); + + case GEN_DIRNAME: + return X509_NAME_cmp(a->d.dirn, b->d.dirn); + + case GEN_IPADD: + return ASN1_OCTET_STRING_cmp(a->d.ip, b->d.ip); + + case GEN_RID: + return OBJ_cmp(a->d.rid, b->d.rid); + } + + return -1; +} + +void GENERAL_NAME_set0_value(GENERAL_NAME *a, int type, void *value) { + switch (type) { + case GEN_X400: + a->d.x400Address = reinterpret_cast(value); + break; + + case GEN_EDIPARTY: + a->d.ediPartyName = reinterpret_cast(value); + break; + + case GEN_OTHERNAME: + a->d.otherName = reinterpret_cast(value); + break; + + case GEN_EMAIL: + case GEN_DNS: + case GEN_URI: + a->d.ia5 = reinterpret_cast(value); + break; + + case GEN_DIRNAME: + a->d.dirn = reinterpret_cast(value); + break; + + case GEN_IPADD: + a->d.ip = reinterpret_cast(value); + 
break; + + case GEN_RID: + a->d.rid = reinterpret_cast(value); + break; + } + a->type = type; +} + +void *GENERAL_NAME_get0_value(const GENERAL_NAME *a, int *out_type) { + if (out_type) { + *out_type = a->type; + } + switch (a->type) { + case GEN_X400: + return a->d.x400Address; + + case GEN_EDIPARTY: + return a->d.ediPartyName; + + case GEN_OTHERNAME: + return a->d.otherName; + + case GEN_EMAIL: + case GEN_DNS: + case GEN_URI: + return a->d.ia5; + + case GEN_DIRNAME: + return a->d.dirn; + + case GEN_IPADD: + return a->d.ip; + + case GEN_RID: + return a->d.rid; + + default: + return nullptr; + } +} + +int GENERAL_NAME_set0_othername(GENERAL_NAME *gen, ASN1_OBJECT *oid, + ASN1_TYPE *value) { + OTHERNAME *oth; + oth = OTHERNAME_new(); + if (!oth) { + return 0; + } + ASN1_TYPE_free(oth->value); + oth->type_id = oid; + oth->value = value; + GENERAL_NAME_set0_value(gen, GEN_OTHERNAME, oth); + return 1; +} + +int GENERAL_NAME_get0_otherName(const GENERAL_NAME *gen, ASN1_OBJECT **out_oid, + ASN1_TYPE **out_value) { + if (gen->type != GEN_OTHERNAME) { + return 0; + } + if (out_oid != nullptr) { + *out_oid = gen->d.otherName->type_id; + } + if (out_value != nullptr) { + *out_value = gen->d.otherName->value; + } + return 1; +} diff --git a/third_party/boringssl/src/crypto/x509/v3_ia5.cc b/third_party/boringssl/src/crypto/x509/v3_ia5.cc new file mode 100644 index 00000000..a6b8aed9 --- /dev/null +++ b/third_party/boringssl/src/crypto/x509/v3_ia5.cc @@ -0,0 +1,83 @@ +// Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include "../internal.h" +#include "internal.h" + + +using namespace bssl; + +static char *i2s_ASN1_IA5STRING(const X509V3_EXT_METHOD *method, void *ext) { + const ASN1_IA5STRING *ia5 = reinterpret_cast(ext); + char *tmp; + if (!ia5 || !ia5->length) { + return nullptr; + } + if (!(tmp = reinterpret_cast(OPENSSL_malloc(ia5->length + 1)))) { + return nullptr; + } + OPENSSL_memcpy(tmp, ia5->data, ia5->length); + tmp[ia5->length] = 0; + return tmp; +} + +static void *s2i_ASN1_IA5STRING(const X509V3_EXT_METHOD *method, + const X509V3_CTX *ctx, const char *str) { + ASN1_IA5STRING *ia5; + if (!str) { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_INVALID_NULL_ARGUMENT); + return nullptr; + } + if (!(ia5 = ASN1_IA5STRING_new())) { + goto err; + } + if (!ASN1_STRING_set(ia5, str, strlen(str))) { + ASN1_IA5STRING_free(ia5); + goto err; + } + return ia5; +err: + return nullptr; +} + +#define EXT_IA5STRING(nid) \ + { \ + nid, 0, ASN1_ITEM_ref(ASN1_IA5STRING), 0, 0, 0, 0, i2s_ASN1_IA5STRING, \ + s2i_ASN1_IA5STRING, 0, 0, 0, 0, nullptr \ + } + +const X509V3_EXT_METHOD bssl::v3_netscape_base_url = + EXT_IA5STRING(NID_netscape_base_url); +const X509V3_EXT_METHOD bssl::v3_netscape_revocation_url = + EXT_IA5STRING(NID_netscape_revocation_url); +const X509V3_EXT_METHOD bssl::v3_netscape_ca_revocation_url = + EXT_IA5STRING(NID_netscape_ca_revocation_url); +const X509V3_EXT_METHOD bssl::v3_netscape_renewal_url = + EXT_IA5STRING(NID_netscape_renewal_url); +const X509V3_EXT_METHOD 
bssl::v3_netscape_ca_policy_url = + EXT_IA5STRING(NID_netscape_ca_policy_url); +const X509V3_EXT_METHOD bssl::v3_netscape_ssl_server_name = + EXT_IA5STRING(NID_netscape_ssl_server_name); +const X509V3_EXT_METHOD bssl::v3_netscape_comment = + EXT_IA5STRING(NID_netscape_comment); diff --git a/third_party/boringssl/src/crypto/x509/v3_info.cc b/third_party/boringssl/src/crypto/x509/v3_info.cc new file mode 100644 index 00000000..21d46387 --- /dev/null +++ b/third_party/boringssl/src/crypto/x509/v3_info.cc @@ -0,0 +1,162 @@ +// Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "internal.h" + + +using namespace bssl; + +static STACK_OF(CONF_VALUE) *i2v_AUTHORITY_INFO_ACCESS( + const X509V3_EXT_METHOD *method, void *ext, STACK_OF(CONF_VALUE) *ret); +static void *v2i_AUTHORITY_INFO_ACCESS(const X509V3_EXT_METHOD *method, + const X509V3_CTX *ctx, + const STACK_OF(CONF_VALUE) *nval); + +const X509V3_EXT_METHOD bssl::v3_info = { + NID_info_access, + X509V3_EXT_MULTILINE, + ASN1_ITEM_ref(AUTHORITY_INFO_ACCESS), + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + i2v_AUTHORITY_INFO_ACCESS, + v2i_AUTHORITY_INFO_ACCESS, + nullptr, + nullptr, + nullptr, +}; + +const X509V3_EXT_METHOD bssl::v3_sinfo = { + NID_sinfo_access, + X509V3_EXT_MULTILINE, + ASN1_ITEM_ref(AUTHORITY_INFO_ACCESS), + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + i2v_AUTHORITY_INFO_ACCESS, + v2i_AUTHORITY_INFO_ACCESS, + nullptr, + nullptr, + nullptr, +}; + +ASN1_SEQUENCE(ACCESS_DESCRIPTION) = { + ASN1_SIMPLE(ACCESS_DESCRIPTION, method, ASN1_OBJECT), + ASN1_SIMPLE(ACCESS_DESCRIPTION, location, GENERAL_NAME), +} ASN1_SEQUENCE_END(ACCESS_DESCRIPTION) + +IMPLEMENT_ASN1_ALLOC_FUNCTIONS(ACCESS_DESCRIPTION) + +ASN1_ITEM_TEMPLATE(AUTHORITY_INFO_ACCESS) = ASN1_EX_TEMPLATE_TYPE( + ASN1_TFLG_SEQUENCE_OF, 0, GeneralNames, ACCESS_DESCRIPTION) +ASN1_ITEM_TEMPLATE_END(AUTHORITY_INFO_ACCESS) + +IMPLEMENT_ASN1_FUNCTIONS_const(AUTHORITY_INFO_ACCESS) + +static STACK_OF(CONF_VALUE) *i2v_AUTHORITY_INFO_ACCESS( + const X509V3_EXT_METHOD *method, void *ext, STACK_OF(CONF_VALUE) *ret) { + const AUTHORITY_INFO_ACCESS *ainfo = + reinterpret_cast(ext); + ACCESS_DESCRIPTION *desc; + char objtmp[80], *name; + CONF_VALUE *vtmp; + STACK_OF(CONF_VALUE) *tret = ret; + + for (size_t i = 0; i < sk_ACCESS_DESCRIPTION_num(ainfo); i++) { + STACK_OF(CONF_VALUE) *tmp; + + desc = sk_ACCESS_DESCRIPTION_value(ainfo, i); + tmp = i2v_GENERAL_NAME(method, desc->location, tret); + if 
(tmp == nullptr) { + goto err; + } + tret = tmp; + vtmp = sk_CONF_VALUE_value(tret, i); + i2t_ASN1_OBJECT(objtmp, sizeof objtmp, desc->method); + + if (OPENSSL_asprintf(&name, "%s - %s", objtmp, vtmp->name) == -1) { + goto err; + } + OPENSSL_free(vtmp->name); + vtmp->name = name; + } + if (ret == nullptr && tret == nullptr) { + return sk_CONF_VALUE_new_null(); + } + + return tret; +err: + if (ret == nullptr && tret != nullptr) { + sk_CONF_VALUE_pop_free(tret, X509V3_conf_free); + } + return nullptr; +} + +static void *v2i_AUTHORITY_INFO_ACCESS(const X509V3_EXT_METHOD *method, + const X509V3_CTX *ctx, + const STACK_OF(CONF_VALUE) *nval) { + UniquePtr ainfo(sk_ACCESS_DESCRIPTION_new_null()); + if (ainfo == nullptr) { + return nullptr; + } + for (size_t i = 0; i < sk_CONF_VALUE_num(nval); i++) { + const CONF_VALUE *cnf = sk_CONF_VALUE_value(nval, i); + UniquePtr acc(ACCESS_DESCRIPTION_new()); + if (acc == nullptr) { + return nullptr; + } + char *ptmp = strchr(cnf->name, ';'); + if (!ptmp) { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_INVALID_SYNTAX); + return nullptr; + } + CONF_VALUE ctmp; + ctmp.name = ptmp + 1; + ctmp.value = cnf->value; + if (!v2i_GENERAL_NAME_ex(acc->location, method, ctx, &ctmp, 0)) { + return nullptr; + } + UniquePtr objtmp(OPENSSL_strndup(cnf->name, ptmp - cnf->name)); + if (objtmp == nullptr) { + return nullptr; + } + acc->method = OBJ_txt2obj(objtmp.get(), 0); + if (!acc->method) { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_BAD_OBJECT); + ERR_add_error_data(2, "value=", objtmp.get()); + return nullptr; + } + if (!PushToStack(ainfo.get(), std::move(acc))) { + return nullptr; + } + } + return ainfo.release(); +} diff --git a/third_party/boringssl/src/crypto/x509/v3_int.cc b/third_party/boringssl/src/crypto/x509/v3_int.cc new file mode 100644 index 00000000..7d072d16 --- /dev/null +++ b/third_party/boringssl/src/crypto/x509/v3_int.cc @@ -0,0 +1,83 @@ +// Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include + +#include +#include + +#include "internal.h" + + +using namespace bssl; + +static char *i2s_ASN1_INTEGER_cb(const X509V3_EXT_METHOD *method, void *ext) { + return i2s_ASN1_INTEGER(method, reinterpret_cast(ext)); +} + +static void *s2i_asn1_int(const X509V3_EXT_METHOD *meth, const X509V3_CTX *ctx, + const char *value) { + return s2i_ASN1_INTEGER(meth, value); +} + +const X509V3_EXT_METHOD bssl::v3_crl_num = { + NID_crl_number, + 0, + ASN1_ITEM_ref(ASN1_INTEGER), + nullptr, + nullptr, + nullptr, + nullptr, + i2s_ASN1_INTEGER_cb, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, +}; + +const X509V3_EXT_METHOD bssl::v3_delta_crl = { + NID_delta_crl, + 0, + ASN1_ITEM_ref(ASN1_INTEGER), + nullptr, + nullptr, + nullptr, + nullptr, + i2s_ASN1_INTEGER_cb, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, +}; + +const X509V3_EXT_METHOD bssl::v3_inhibit_anyp = { + NID_inhibit_any_policy, + 0, + ASN1_ITEM_ref(ASN1_INTEGER), + nullptr, + nullptr, + nullptr, + nullptr, + i2s_ASN1_INTEGER_cb, + s2i_asn1_int, + nullptr, + nullptr, + nullptr, + nullptr, + nullptr, +}; diff --git a/third_party/boringssl/src/crypto/x509/v3_lib.cc b/third_party/boringssl/src/crypto/x509/v3_lib.cc new file mode 100644 index 00000000..2289afd3 --- /dev/null +++ b/third_party/boringssl/src/crypto/x509/v3_lib.cc @@ -0,0 +1,349 @@ +// Copyright 1999-2016 The OpenSSL Project Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +/* X509 v3 extension utilities */ + +#include +#include + +#include +#include +#include +#include +#include + +#include "../mem_internal.h" +#include "internal.h" + + +using namespace bssl; + +// This is indirected though a pointer to avoid a global destructor. If we ever +// add bssl::NoDestructor, we can avoid this, but this API is already +// problematic and not thread-safe. +static bssl::Vector *ext_list = nullptr; + +int X509V3_EXT_add(X509V3_EXT_METHOD *ext) { + // We only support |ASN1_ITEM|-based extensions. + assert(ext->it != nullptr); + + // TODO(crbug.com/42290461): This API is not locked and doesn't check for + // duplicates. Remove it altogether as, even if those issues were fixed, it + // would not be possible to use safely anyway. 
+ if (ext_list == nullptr) { + ext_list = bssl::New>(); + } + if (ext_list == nullptr || !ext_list->Push(ext)) { + return 0; + } + return 1; +} + +const X509V3_EXT_METHOD *X509V3_EXT_get_nid(int nid) { + if (nid < 0) { + return nullptr; + } + + switch (nid) { + case NID_netscape_cert_type: + return &v3_nscert; + case NID_netscape_base_url: + return &v3_netscape_base_url; + case NID_netscape_revocation_url: + return &v3_netscape_revocation_url; + case NID_netscape_ca_revocation_url: + return &v3_netscape_ca_revocation_url; + case NID_netscape_renewal_url: + return &v3_netscape_renewal_url; + case NID_netscape_ca_policy_url: + return &v3_netscape_ca_policy_url; + case NID_netscape_ssl_server_name: + return &v3_netscape_ssl_server_name; + case NID_netscape_comment: + return &v3_netscape_comment; + case NID_subject_key_identifier: + return &v3_skey_id; + case NID_key_usage: + return &v3_key_usage; + case NID_subject_alt_name: + return &v3_subject_alt_name; + case NID_issuer_alt_name: + return &v3_issuer_alt_name; + case NID_certificate_issuer: + return &v3_certificate_issuer; + case NID_basic_constraints: + return &v3_bcons; + case NID_crl_number: + return &v3_crl_num; + case NID_certificate_policies: + return &v3_cpols; + case NID_authority_key_identifier: + return &v3_akey_id; + case NID_crl_distribution_points: + return &v3_crld; + case NID_ext_key_usage: + return &v3_ext_ku; + case NID_delta_crl: + return &v3_delta_crl; + case NID_crl_reason: + return &v3_crl_reason; + case NID_invalidity_date: + return &v3_crl_invdate; + case NID_info_access: + return &v3_info; + case NID_id_pkix_OCSP_noCheck: + return &v3_ocsp_nocheck; + case NID_sinfo_access: + return &v3_sinfo; + case NID_policy_constraints: + return &v3_policy_constraints; + case NID_name_constraints: + return &v3_name_constraints; + case NID_policy_mappings: + return &v3_policy_mappings; + case NID_inhibit_any_policy: + return &v3_inhibit_anyp; + case NID_issuing_distribution_point: + return &v3_idp; + case 
NID_freshest_crl: + return &v3_freshest_crl; + } + + if (ext_list != nullptr) { + for (const X509V3_EXT_METHOD *ext : *ext_list) { + if (ext->ext_nid == nid) { + return ext; + } + } + } + return nullptr; +} + +const X509V3_EXT_METHOD *X509V3_EXT_get(const X509_EXTENSION *ext) { + int nid; + if ((nid = OBJ_obj2nid(ext->object)) == NID_undef) { + return nullptr; + } + return X509V3_EXT_get_nid(nid); +} + +int X509V3_EXT_free(int nid, void *ext_data) { + const X509V3_EXT_METHOD *ext_method = X509V3_EXT_get_nid(nid); + if (ext_method == nullptr) { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_CANNOT_FIND_FREE_FUNCTION); + return 0; + } + + ASN1_item_free(reinterpret_cast(ext_data), + ASN1_ITEM_ptr(ext_method->it)); + return 1; +} + +int X509V3_EXT_add_alias(int nid_to, int nid_from) { + OPENSSL_BEGIN_ALLOW_DEPRECATED + const X509V3_EXT_METHOD *ext; + X509V3_EXT_METHOD *tmpext; + + if (!(ext = X509V3_EXT_get_nid(nid_from))) { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_EXTENSION_NOT_FOUND); + return 0; + } + if (!(tmpext = New())) { + return 0; + } + *tmpext = *ext; + tmpext->ext_nid = nid_to; + if (!X509V3_EXT_add(tmpext)) { + Delete(tmpext); + return 0; + } + return 1; + OPENSSL_END_ALLOW_DEPRECATED +} + +int X509V3_add_standard_extensions() { return 1; } + +// Return an extension internal structure + +void *X509V3_EXT_d2i(const X509_EXTENSION *ext) { + const X509V3_EXT_METHOD *method; + const unsigned char *p; + + if (!(method = X509V3_EXT_get(ext))) { + return nullptr; + } + p = ext->value->data; + void *ret = + ASN1_item_d2i(nullptr, &p, ext->value->length, ASN1_ITEM_ptr(method->it)); + if (ret == nullptr) { + return nullptr; + } + // Check for trailing data. 
+ if (p != ext->value->data + ext->value->length) { + ASN1_item_free(reinterpret_cast(ret), + ASN1_ITEM_ptr(method->it)); + OPENSSL_PUT_ERROR(X509V3, X509V3_R_TRAILING_DATA_IN_EXTENSION); + return nullptr; + } + return ret; +} + +void *X509V3_get_d2i(const STACK_OF(X509_EXTENSION) *extensions, int nid, + int *out_critical, int *out_idx) { + int lastpos; + X509_EXTENSION *ex, *found_ex = nullptr; + if (!extensions) { + if (out_idx) { + *out_idx = -1; + } + if (out_critical) { + *out_critical = -1; + } + return nullptr; + } + if (out_idx) { + lastpos = *out_idx + 1; + } else { + lastpos = 0; + } + if (lastpos < 0) { + lastpos = 0; + } + for (size_t i = lastpos; i < sk_X509_EXTENSION_num(extensions); i++) { + ex = sk_X509_EXTENSION_value(extensions, i); + if (OBJ_obj2nid(ex->object) == nid) { + if (out_idx) { + // TODO(https://crbug.com/boringssl/379): Consistently reject + // duplicate extensions. + *out_idx = (int)i; + found_ex = ex; + break; + } else if (found_ex) { + // Found more than one + if (out_critical) { + *out_critical = -2; + } + return nullptr; + } + found_ex = ex; + } + } + if (found_ex) { + // Found it + if (out_critical) { + *out_critical = X509_EXTENSION_get_critical(found_ex); + } + return X509V3_EXT_d2i(found_ex); + } + + // Extension not found + if (out_idx) { + *out_idx = -1; + } + if (out_critical) { + *out_critical = -1; + } + return nullptr; +} + +// This function is a general extension append, replace and delete utility. +// The precise operation is governed by the 'flags' value. The 'crit' and +// 'value' arguments (if relevant) are the extensions internal structure. + +int X509V3_add1_i2d(STACK_OF(X509_EXTENSION) **x, int nid, void *value, + int crit, unsigned long flags) { + int errcode, extidx = -1; + X509_EXTENSION *ext = nullptr, *extmp; + STACK_OF(X509_EXTENSION) *ret = nullptr; + unsigned long ext_op = flags & X509V3_ADD_OP_MASK; + + // If appending we don't care if it exists, otherwise look for existing + // extension. 
+ if (ext_op != X509V3_ADD_APPEND) { + extidx = X509v3_get_ext_by_NID(*x, nid, -1); + } + + // See if extension exists + if (extidx >= 0) { + // If keep existing, nothing to do + if (ext_op == X509V3_ADD_KEEP_EXISTING) { + return 1; + } + // If default then its an error + if (ext_op == X509V3_ADD_DEFAULT) { + errcode = X509V3_R_EXTENSION_EXISTS; + goto err; + } + // If delete, just delete it + if (ext_op == X509V3_ADD_DELETE) { + X509_EXTENSION *prev_ext = sk_X509_EXTENSION_delete(*x, extidx); + if (prev_ext == nullptr) { + return -1; + } + X509_EXTENSION_free(prev_ext); + return 1; + } + } else { + // If replace existing or delete, error since extension must exist + if ((ext_op == X509V3_ADD_REPLACE_EXISTING) || + (ext_op == X509V3_ADD_DELETE)) { + errcode = X509V3_R_EXTENSION_NOT_FOUND; + goto err; + } + } + + // If we get this far then we have to create an extension: could have + // some flags for alternative encoding schemes... + + ext = X509V3_EXT_i2d(nid, crit, value); + + if (!ext) { + OPENSSL_PUT_ERROR(X509V3, X509V3_R_ERROR_CREATING_EXTENSION); + return 0; + } + + // If extension exists replace it.. 
+ if (extidx >= 0) { + extmp = sk_X509_EXTENSION_value(*x, extidx); + X509_EXTENSION_free(extmp); + if (!sk_X509_EXTENSION_set(*x, extidx, ext)) { + return -1; + } + return 1; + } + + if ((ret = *x) == nullptr && + (ret = sk_X509_EXTENSION_new_null()) == nullptr) { + goto m_fail; + } + if (!sk_X509_EXTENSION_push(ret, ext)) { + goto m_fail; + } + + *x = ret; + return 1; + +m_fail: + if (ret != *x) { + sk_X509_EXTENSION_free(ret); + } + X509_EXTENSION_free(ext); + return -1; + +err: + if (!(flags & X509V3_ADD_SILENT)) { + OPENSSL_PUT_ERROR(X509V3, errcode); + } + return 0; +} diff --git a/third_party/boringssl/src/crypto/x509/v3_ncons.cc b/third_party/boringssl/src/crypto/x509/v3_ncons.cc new file mode 100644 index 00000000..58c15960 --- /dev/null +++ b/third_party/boringssl/src/crypto/x509/v3_ncons.cc @@ -0,0 +1,798 @@ +// Copyright 2003-2016 The OpenSSL Project Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// https://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../internal.h" +#include "internal.h" + + +BSSL_NAMESPACE_BEGIN +namespace { + +int starts_with(const CBS *cbs, uint8_t c) { + return CBS_len(cbs) > 0 && CBS_data(cbs)[0] == c; +} + +int starts_with_str(const CBS *cbs, std::string_view str) { + return CBS_len(cbs) >= str.size() && + !OPENSSL_memcmp(CBS_data(cbs), str.data(), str.size()); +} + +int ends_with(const CBS *cbs, uint8_t c) { + return CBS_len(cbs) > 0 && CBS_data(cbs)[CBS_len(cbs) - 1] == c; +} + +int equal_case(const CBS *a, const CBS *b) { + if (CBS_len(a) != CBS_len(b)) { + return 0; + } + // Note we cannot use |OPENSSL_strncasecmp| because that would stop + // iterating at NUL. + const uint8_t *a_data = CBS_data(a), *b_data = CBS_data(b); + for (size_t i = 0; i < CBS_len(a); i++) { + if (OPENSSL_tolower(a_data[i]) != OPENSSL_tolower(b_data[i])) { + return 0; + } + } + return 1; +} + +int has_suffix_case(const CBS *a, const CBS *b) { + if (CBS_len(a) < CBS_len(b)) { + return 0; + } + CBS copy = *a; + CBS_skip(©, CBS_len(a) - CBS_len(b)); + return equal_case(©, b); +} + +bool is_allowed_rfc822_local_part(const CBS *cbs) { + if (CBS_len(cbs) == 0) { + return false; + } + for (size_t i = 0; i < CBS_len(cbs); i++) { + uint8_t c = CBS_data(cbs)[i]; + if (!(OPENSSL_isalnum(c) || c == '!' || c == '#' || c == '$' || c == '%' || + c == '&' || c == '\'' || c == '*' || c == '+' || c == '-' || + c == '/' || c == '=' || c == '?' 
|| c == '^' || c == '_' || + c == '`' || c == '{' || c == '|' || c == '}' || c == '~' || + c == '.')) { + return false; + } + } + return true; +} + +bool is_allowed_rfc822_domain(const CBS *cbs) { + if (CBS_len(cbs) == 0) { + return false; + } + for (size_t i = 0; i < CBS_len(cbs); i++) { + uint8_t c = CBS_data(cbs)[i]; + if (!(OPENSSL_isalnum(c) || c == '-' || c == '.')) { + return false; + } + } + return true; +} + +// Removes the port part of a URI authority string, if present, leaving the +// host. Returns true if the port was syntactically valid (contained only +// digits), or was not present (vacuously valid). +bool nc_uri_remove_port(CBS *in_out) { + CBS host, unused; + if (CBS_get_until_first(in_out, &host, ':')) { + if (!CBS_skip(in_out, 1) || + CBS_get_until_first_not_of(in_out, &unused, "0123456789")) { + return false; + } + *in_out = host; + return true; + } + // There was no port. + return true; +} + +// Strips a single trailing dot from `host` if present. +// Per RFC 9499, we allow names to be written relative to the root (e.g. +// "www.example.com" is allowed even though technically the FQDN is +// "www.example.com." with a trailing dot representing the common root), and +// normalize names into this relative form for consistency. +void nc_uri_remove_trailing_dot(CBS *host) { + if (ends_with(host, '.')) { + uint8_t unused_byte; + BSSL_CHECK(CBS_get_last_u8(host, &unused_byte)); + } +} + +// Returns whether the authority portion of a URI contains a syntactically valid +// fully qualified domain name (FQDN). +// +// RFC 3986, section 3.2: +// authority = [ userinfo "@" ] host [ ":" port ] +// +// RFC 5280, section 4.2.1.10: if a URI name constraint applies, applications +// MUST reject a certificate with a subjectAltName with a URI that "does not +// include an authority component with a host name specified as a fully +// qualified domain name". 
+// +// Therefore we reject IP addresses in this function, and reject authority +// components containing a userinfo component. The caller is responsible for +// having removed any colon and port component, and normalizing to DNS relative +// form by removing any trailing dot. +bool nc_uri_is_fqdn(const CBS *uri_authority) { + if (CBS_len(uri_authority) == 0) { + return false; + } + CBS host_to_parse = *uri_authority; + + // Reject userinfo, and IPv6 addresses delimited by square brackets. IPv4 + // addresses are rejected later. + CBS unused; + if (CBS_get_until_first_of(&host_to_parse, &unused, "@[]")) { + return false; + } + + // Validate that the host is identified by a registered name. Following RFC + // 3986, section 3.2.2: we refer to the preferred name syntax rules for DNS + // registered names found in RFC 1034, section 3.5, modified by RFC 1123: + // + // ::= | " " + // ::=